diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
index dd53e9508cc3..2b856bea249d 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -15,12 +15,12 @@ body:
       Please provide the following system information to help us diagnose the bug. For example:
 
       // example for c++ user
-      OpenCV version: 4.6.0
+      OpenCV version: 4.8.0
       Operating System / Platform: Ubuntu 20.04
       Compiler & compiler version: GCC 9.3.0
 
       // example for python user
-      OpenCV python version: 4.6.0.66
+      OpenCV python version: 4.8.0.74
       Operating System / Platform: Ubuntu 20.04
       Python version: 3.9.6
   validations:
diff --git a/.github/ISSUE_TEMPLATE/documentation.yml b/.github/ISSUE_TEMPLATE/documentation.yml
index d33c030972a0..8d4d3e440bef 100644
--- a/.github/ISSUE_TEMPLATE/documentation.yml
+++ b/.github/ISSUE_TEMPLATE/documentation.yml
@@ -12,7 +12,7 @@ body:
   attributes:
     label: Describe the doc issue
     description: >
-      Please provide a clear and concise description of what content in https://docs.opencv.org/ is an issue. Note that there are multiple active branches, such as 3.4, 4.x and 5.x, so please specify the branch with the problem.
+      Please provide a clear and concise description of what content in https://docs.opencv.org/ is an issue. Note that there are multiple active branches, such as 4.x and 5.x, so please specify the branch with the problem.
     placeholder: |
       A clear and concise description of what content in https://docs.opencv.org/ is an issue.
 
diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml
index 551bade42232..58858fe495e8 100644
--- a/.github/workflows/PR-4.x.yaml
+++ b/.github/workflows/PR-4.x.yaml
@@ -15,6 +15,15 @@ jobs:
   Ubuntu2004-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-U20.yaml@main
 
+  Ubuntu2004-x64-OpenVINO:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-U20-OpenVINO.yaml@main
+
+  Ubuntu2204-x64:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-U22.yaml@main
+
+  Ubuntu2404-x64:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-U24.yaml@main
+
   Ubuntu2004-x64-CUDA:
     if: "${{ contains(github.event.pull_request.labels.*.name, 'category: dnn') }} || ${{ contains(github.event.pull_request.labels.*.name, 'category: dnn (onnx)') }}"
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-U20-Cuda.yaml@main
@@ -22,6 +31,9 @@ jobs:
   Windows10-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10.yaml@main
 
+  Windows10-ARM64:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10-ARM64.yaml@main
+
   Windows10-x64-Vulkan:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-W10-Vulkan.yaml@main
 
@@ -37,11 +49,17 @@ jobs:
   iOS:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-iOS.yaml@main
 
-  Android:
-    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-Android.yaml@main
+  Android-SDK:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-4.x-Android-SDK.yaml@main
+
+  Android-Test:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-Android-Test.yaml@main
 
   TIM-VX:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-timvx-backend-tests-4.x.yml@main
 
   docs:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-docs.yaml@main
+
+  Linux-RISC-V-Clang:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-4.x-RISCV.yaml@main
diff --git a/3rdparty/carotene/CMakeLists.txt b/3rdparty/carotene/CMakeLists.txt
index ebcdf1a9f6c1..aa95956e7f0b 100644
--- a/3rdparty/carotene/CMakeLists.txt
+++ b/3rdparty/carotene/CMakeLists.txt
@@ -42,6 +42,14 @@ endif()
 
 if(WITH_NEON)
     target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
+    if(NOT DEFINED CAROTENE_NEON_ARCH )
+    elseif(CAROTENE_NEON_ARCH EQUAL 8)
+	    target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=8")
+    elseif(CAROTENE_NEON_ARCH EQUAL 7)
+	    target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=7")
+    else()
+	    target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=0")
+    endif()
 endif()
 
 # we add dummy file to fix XCode build
diff --git a/3rdparty/carotene/src/add_weighted.cpp b/3rdparty/carotene/src/add_weighted.cpp
index 6559b9fe5363..7e2945e88cf4 100644
--- a/3rdparty/carotene/src/add_weighted.cpp
+++ b/3rdparty/carotene/src/add_weighted.cpp
@@ -39,6 +39,7 @@
 
 #include "common.hpp"
 #include "vtransform.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -106,7 +107,7 @@ template <> struct wAdd<s32>
     {
         valpha = vdupq_n_f32(_alpha);
         vbeta = vdupq_n_f32(_beta);
-        vgamma = vdupq_n_f32(_gamma + 0.5);
+        vgamma = vdupq_n_f32(_gamma);
     }
 
     void operator() (const VecTraits<s32>::vec128 & v_src0,
@@ -118,7 +119,7 @@ template <> struct wAdd<s32>
 
         vs1 = vmlaq_f32(vgamma, vs1, valpha);
         vs1 = vmlaq_f32(vs1, vs2, vbeta);
-        v_dst = vcvtq_s32_f32(vs1);
+        v_dst = vroundq_s32_f32(vs1);
     }
 
     void operator() (const VecTraits<s32>::vec64 & v_src0,
@@ -130,7 +131,7 @@ template <> struct wAdd<s32>
 
         vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
         vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
-        v_dst = vcvt_s32_f32(vs1);
+        v_dst = vround_s32_f32(vs1);
     }
 
     void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
@@ -150,7 +151,7 @@ template <> struct wAdd<u32>
     {
         valpha = vdupq_n_f32(_alpha);
         vbeta = vdupq_n_f32(_beta);
-        vgamma = vdupq_n_f32(_gamma + 0.5);
+        vgamma = vdupq_n_f32(_gamma);
     }
 
     void operator() (const VecTraits<u32>::vec128 & v_src0,
@@ -162,7 +163,7 @@ template <> struct wAdd<u32>
 
         vs1 = vmlaq_f32(vgamma, vs1, valpha);
         vs1 = vmlaq_f32(vs1, vs2, vbeta);
-        v_dst = vcvtq_u32_f32(vs1);
+        v_dst = vroundq_u32_f32(vs1);
     }
 
     void operator() (const VecTraits<u32>::vec64 & v_src0,
@@ -174,7 +175,7 @@ template <> struct wAdd<u32>
 
         vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
         vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
-        v_dst = vcvt_u32_f32(vs1);
+        v_dst = vround_u32_f32(vs1);
     }
 
     void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
diff --git a/3rdparty/carotene/src/blur.cpp b/3rdparty/carotene/src/blur.cpp
index 21689a2bd3fb..30c1f8a7293e 100644
--- a/3rdparty/carotene/src/blur.cpp
+++ b/3rdparty/carotene/src/blur.cpp
@@ -41,6 +41,7 @@
 
 #include "common.hpp"
 #include "saturate_cast.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -198,7 +199,6 @@ void blur3x3(const Size2D &size, s32 cn,
 //#define FLOAT_VARIANT_1_9
 #ifdef FLOAT_VARIANT_1_9
     float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
-    float32x4_t v0_5 = vdupq_n_f32 (.5);
 #else
     const int16x8_t vScale = vmovq_n_s16(3640);
 #endif
@@ -283,8 +283,8 @@ void blur3x3(const Size2D &size, s32 cn,
                 uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
                 float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
                 float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
-                tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
-                tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
+                tres1 = internal::vroundq_u32_f32(vf1);
+                tres2 = internal::vroundq_u32_f32(vf2);
                 t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
                 vst1_u8(drow + x - 8, vmovn_u16(t0));
 #else
@@ -445,8 +445,8 @@ void blur3x3(const Size2D &size, s32 cn,
                 uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
                 float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
                 float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
-                tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
-                tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
+                tres1 = internal::vroundq_u32_f32(vf1);
+                tres2 = internal::vroundq_u32_f32(vf2);
                 t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
                 vst1_u8(drow + x - 8, vmovn_u16(t0));
 #else
@@ -508,7 +508,6 @@ void blur5x5(const Size2D &size, s32 cn,
 #define FLOAT_VARIANT_1_25
 #ifdef FLOAT_VARIANT_1_25
     float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
-    float32x4_t v0_5 = vdupq_n_f32 (.5f);
 #else
     const int16x8_t vScale = vmovq_n_s16(1310);
 #endif
@@ -752,8 +751,8 @@ void blur5x5(const Size2D &size, s32 cn,
             uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
             float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
             float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
-            tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
-            tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
+            tres1 = internal::vroundq_u32_f32(vf1);
+            tres2 = internal::vroundq_u32_f32(vf2);
             t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
             vst1_u8(drow + x - 8, vmovn_u16(t0));
 #else
diff --git a/3rdparty/carotene/src/colorconvert.cpp b/3rdparty/carotene/src/colorconvert.cpp
index a8aef9b722c0..752c65146ae3 100644
--- a/3rdparty/carotene/src/colorconvert.cpp
+++ b/3rdparty/carotene/src/colorconvert.cpp
@@ -40,6 +40,7 @@
 #include "common.hpp"
 
 #include "saturate_cast.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -1166,17 +1167,10 @@ inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const ui
     vSt3 = vmulq_f32(vHF1, vDivTab);
     vSt4 = vmulq_f32(vHF2, vDivTab);
 
-    float32x4_t bias = vdupq_n_f32(0.5f);
-
-    vSt1 = vaddq_f32(vSt1, bias);
-    vSt2 = vaddq_f32(vSt2, bias);
-    vSt3 = vaddq_f32(vSt3, bias);
-    vSt4 = vaddq_f32(vSt4, bias);
-
-    uint32x4_t vRes1 = vcvtq_u32_f32(vSt1);
-    uint32x4_t vRes2 = vcvtq_u32_f32(vSt2);
-    uint32x4_t vRes3 = vcvtq_u32_f32(vSt3);
-    uint32x4_t vRes4 = vcvtq_u32_f32(vSt4);
+    uint32x4_t vRes1 = internal::vroundq_u32_f32(vSt1);
+    uint32x4_t vRes2 = internal::vroundq_u32_f32(vSt2);
+    uint32x4_t vRes3 = internal::vroundq_u32_f32(vSt3);
+    uint32x4_t vRes4 = internal::vroundq_u32_f32(vSt4);
 
     int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
     int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));
diff --git a/3rdparty/carotene/src/common.hpp b/3rdparty/carotene/src/common.hpp
index 823ddf1ccf06..b9de371a6afe 100644
--- a/3rdparty/carotene/src/common.hpp
+++ b/3rdparty/carotene/src/common.hpp
@@ -58,6 +58,17 @@
 
 namespace CAROTENE_NS { namespace internal {
 
+#ifndef CAROTENE_NEON_ARCH
+#    if defined(__aarch64__) || defined(__aarch32__)
+#        define CAROTENE_NEON_ARCH 8
+#    else
+#        define CAROTENE_NEON_ARCH 7
+#    endif
+#endif
+#if ( !defined(__aarch64__) && !defined(__aarch32__) ) && (CAROTENE_NEON_ARCH == 8 )
+#    error("ARMv7 doen't support A32/A64 Neon instructions")
+#endif
+
 inline void prefetch(const void *ptr, size_t offset = 32*10)
 {
 #if defined __GNUC__
diff --git a/3rdparty/carotene/src/convert_scale.cpp b/3rdparty/carotene/src/convert_scale.cpp
index d599d24c1e28..f88dbea1823a 100644
--- a/3rdparty/carotene/src/convert_scale.cpp
+++ b/3rdparty/carotene/src/convert_scale.cpp
@@ -38,6 +38,7 @@
  */
 
 #include "common.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -185,7 +186,7 @@ CVTS_FUNC1(u8, 16,
 #else
 CVTS_FUNC1(u8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -209,10 +210,10 @@ CVTS_FUNC1(u8, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
         uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
         vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
@@ -270,7 +271,7 @@ CVTS_FUNC(u8, s8, 16,
 #else
 CVTS_FUNC(u8, s8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -294,10 +295,10 @@ CVTS_FUNC(u8, s8, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
         int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
         vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));
@@ -355,7 +356,7 @@ CVTS_FUNC(u8, u16, 16,
 #else
 CVTS_FUNC(u8, u16, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -379,10 +380,10 @@ CVTS_FUNC(u8, u16, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
         vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
     }
@@ -439,7 +440,7 @@ CVTS_FUNC(u8, s16, 16,
 #else
 CVTS_FUNC(u8, s16, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -463,10 +464,10 @@ CVTS_FUNC(u8, s16, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));
         vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));
     }
@@ -526,7 +527,7 @@ CVTS_FUNC(u8, s32, 16,
 #else
 CVTS_FUNC(u8, s32, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -550,10 +551,10 @@ CVTS_FUNC(u8, s32, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         vst1q_s32(_dst + i + 0,  vline1_s32);
         vst1q_s32(_dst + i + 4,  vline2_s32);
         vst1q_s32(_dst + i + 8,  vline3_s32);
@@ -693,7 +694,7 @@ CVTS_FUNC(s8, u8, 16,
 #else
 CVTS_FUNC(s8, u8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -717,10 +718,10 @@ CVTS_FUNC(s8, u8, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
         uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
         vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
@@ -778,7 +779,7 @@ CVTS_FUNC1(s8, 16,
 #else
 CVTS_FUNC1(s8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -802,10 +803,10 @@ CVTS_FUNC1(s8, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
         int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
         vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
@@ -863,7 +864,7 @@ CVTS_FUNC(s8, u16, 16,
 #else
 CVTS_FUNC(s8, u16, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -887,10 +888,10 @@ CVTS_FUNC(s8, u16, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
         uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
         vst1q_u16(_dst + i + 0, vRes1_u16);
@@ -949,7 +950,7 @@ CVTS_FUNC(s8, s16, 16,
 #else
 CVTS_FUNC(s8, s16, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -973,10 +974,10 @@ CVTS_FUNC(s8, s16, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
         int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
         vst1q_s16(_dst + i + 0, vRes1_s16);
@@ -1038,7 +1039,7 @@ CVTS_FUNC(s8, s32, 16,
 #else
 CVTS_FUNC(s8, s32, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -1062,10 +1063,10 @@ CVTS_FUNC(s8, s32, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         vst1q_s32(_dst + i + 0,  vline1_s32);
         vst1q_s32(_dst + i + 4,  vline2_s32);
         vst1q_s32(_dst + i + 8,  vline3_s32);
@@ -1190,7 +1191,7 @@ CVTS_FUNC(u16, u8, 16,
 #else
 CVTS_FUNC(u16, u8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1204,8 +1205,8 @@ CVTS_FUNC(u16, u8, 16,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
@@ -1249,7 +1250,7 @@ CVTS_FUNC(u16, s8, 16,
 #else
 CVTS_FUNC(u16, s8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1263,8 +1264,8 @@ CVTS_FUNC(u16, s8, 16,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
@@ -1307,7 +1308,7 @@ CVTS_FUNC1(u16, 16,
 #else
 CVTS_FUNC1(u16, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1321,8 +1322,8 @@ CVTS_FUNC1(u16, 16,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
         uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
         vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
@@ -1364,7 +1365,7 @@ CVTS_FUNC(u16, s16, 8,
 #else
 CVTS_FUNC(u16, s16, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1378,8 +1379,8 @@ CVTS_FUNC(u16, s16, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
@@ -1421,7 +1422,7 @@ CVTS_FUNC(u16, s32, 8,
 #else
 CVTS_FUNC(u16, s32, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1435,8 +1436,8 @@ CVTS_FUNC(u16, s32, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         vst1q_s32(_dst + i + 0, vline1_s32);
         vst1q_s32(_dst + i + 4, vline2_s32);
     }
@@ -1530,7 +1531,7 @@ CVTS_FUNC(s16, u8, 16,
 #else
 CVTS_FUNC(s16, u8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1544,8 +1545,8 @@ CVTS_FUNC(s16, u8, 16,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
@@ -1589,7 +1590,7 @@ CVTS_FUNC(s16, s8, 16,
 #else
 CVTS_FUNC(s16, s8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1603,8 +1604,8 @@ CVTS_FUNC(s16, s8, 16,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
@@ -1647,7 +1648,7 @@ CVTS_FUNC(s16, u16, 8,
 #else
 CVTS_FUNC(s16, u16, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1661,8 +1662,8 @@ CVTS_FUNC(s16, u16, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
         uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
         vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
@@ -1704,7 +1705,7 @@ CVTS_FUNC1(s16, 16,
 #else
 CVTS_FUNC1(s16, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1718,8 +1719,8 @@ CVTS_FUNC1(s16, 16,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
@@ -1761,7 +1762,7 @@ CVTS_FUNC(s16, s32, 8,
 #else
 CVTS_FUNC(s16, s32, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1775,8 +1776,8 @@ CVTS_FUNC(s16, s32, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         vst1q_s32(_dst + i + 0, vline1_s32);
         vst1q_s32(_dst + i + 4, vline2_s32);
     }
@@ -1870,7 +1871,7 @@ CVTS_FUNC(s32, u8, 8,
 #else
 CVTS_FUNC(s32, u8, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1883,8 +1884,8 @@ CVTS_FUNC(s32, u8, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
         uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
         uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
@@ -1928,7 +1929,7 @@ CVTS_FUNC(s32, s8, 8,
 #else
 CVTS_FUNC(s32, s8, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1941,8 +1942,8 @@ CVTS_FUNC(s32, s8, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
@@ -1985,7 +1986,7 @@ CVTS_FUNC(s32, u16, 8,
 #else
 CVTS_FUNC(s32, u16, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1998,8 +1999,8 @@ CVTS_FUNC(s32, u16, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
         uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
         vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
@@ -2041,7 +2042,7 @@ CVTS_FUNC(s32, s16, 8,
 #else
 CVTS_FUNC(s32, s16, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -2054,8 +2055,8 @@ CVTS_FUNC(s32, s16, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
@@ -2097,7 +2098,7 @@ CVTS_FUNC1(s32, 8,
 #else
 CVTS_FUNC1(s32, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -2110,8 +2111,8 @@ CVTS_FUNC1(s32, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         vst1q_s32(_dst + i + 0, vline1_s32);
         vst1q_s32(_dst + i + 4, vline2_s32);
     }
@@ -2272,7 +2273,7 @@ CVTS_FUNC(f32, s8, 8,
 #else
 CVTS_FUNC(f32, s8, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -2283,8 +2284,8 @@ CVTS_FUNC(f32, s8, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
@@ -2325,7 +2326,7 @@ CVTS_FUNC(f32, u16, 8,
 #else
 CVTS_FUNC(f32, u16, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -2336,8 +2337,8 @@ CVTS_FUNC(f32, u16, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32);
-        uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32);
+        uint32x4_t vline1_u32 = internal::vroundq_u32_f32(vline1_f32);
+        uint32x4_t vline2_u32 = internal::vroundq_u32_f32(vline2_f32);
         uint16x4_t vRes1 = vqmovn_u32(vline1_u32);
         uint16x4_t vRes2 = vqmovn_u32(vline2_u32);
         vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
@@ -2377,7 +2378,7 @@ CVTS_FUNC(f32, s16, 8,
 #else
 CVTS_FUNC(f32, s16, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -2388,8 +2389,8 @@ CVTS_FUNC(f32, s16, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
@@ -2429,7 +2430,7 @@ CVTS_FUNC(f32, s32, 8,
 #else
 CVTS_FUNC(f32, s32, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -2440,8 +2441,8 @@ CVTS_FUNC(f32, s32, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         vst1q_s32(_dst + i + 0, vline1_s32);
         vst1q_s32(_dst + i + 4, vline2_s32);
     }
diff --git a/3rdparty/carotene/src/div.cpp b/3rdparty/carotene/src/div.cpp
index 38892acab3eb..75502c736b64 100644
--- a/3rdparty/carotene/src/div.cpp
+++ b/3rdparty/carotene/src/div.cpp
@@ -39,6 +39,7 @@
 
 #include "common.hpp"
 #include "vtransform.hpp"
+#include "vround_helper.hpp"
 
 #include <cstring>
 #include <cfloat>
@@ -51,13 +52,6 @@ namespace {
 
 #ifdef CAROTENE_NEON
 
-inline float32x4_t vroundq(const float32x4_t& v)
-{
-    const int32x4_t signMask = vdupq_n_s32(1 << 31), half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
-    float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
-    return vaddq_f32(v, v_addition);
-}
-
 template <typename T>
 inline T divSaturateQ(const T &v1, const T &v2, const float scale)
 {
@@ -69,17 +63,10 @@ inline T divSaturateQ(const T &v1, const T &v2, const float scale)
 }
 template <>
 inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
-{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
+{ return internal::vroundq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
 template <>
 inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
-{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }
-
-inline float32x2_t vround(const float32x2_t& v)
-{
-    const int32x2_t signMask = vdup_n_s32(1 << 31), half = vreinterpret_s32_f32(vdup_n_f32(0.5f));
-    float32x2_t v_addition = vreinterpret_f32_s32(vorr_s32(half, vand_s32(signMask, vreinterpret_s32_f32(v))));
-    return vadd_f32(v, v_addition);
-}
+{ return internal::vroundq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
 
 template <typename T>
 inline T divSaturate(const T &v1, const T &v2, const float scale)
@@ -88,10 +75,10 @@ inline T divSaturate(const T &v1, const T &v2, const float scale)
 }
 template <>
 inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
-{ return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2))))); }
+{ return internal::vround_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
 template <>
 inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
-{ return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2))))); }
+{ return internal::vround_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
 
 
 template <typename T>
@@ -157,8 +144,8 @@ void div(const Size2D &size,
 
     if (scale == 0.0f ||
         (std::numeric_limits<T>::is_integer &&
-         (scale * std::numeric_limits<T>::max()) <  1.0f &&
-         (scale * std::numeric_limits<T>::max()) > -1.0f))
+         (scale * static_cast<float>(std::numeric_limits<T>::max())) <  1.0f &&
+         (scale * static_cast<float>(std::numeric_limits<T>::max())) > -1.0f))
     {
         for (size_t y = 0; y < size.height; ++y)
         {
diff --git a/3rdparty/carotene/src/phase.cpp b/3rdparty/carotene/src/phase.cpp
index 141b1e864ab2..48dea2a860f5 100644
--- a/3rdparty/carotene/src/phase.cpp
+++ b/3rdparty/carotene/src/phase.cpp
@@ -41,6 +41,7 @@
 #include <cmath>
 
 #include "common.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -121,8 +122,6 @@ void phase(const Size2D &size,
     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
 
-    float32x4_t v_05 = vdupq_n_f32(0.5f);
-
     for (size_t i = 0; i < size.height; ++i)
     {
         const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
@@ -149,8 +148,8 @@ void phase(const Size2D &size,
             float32x4_t v_dst32f1;
             FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
 
-            uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
-                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
+            uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
+                                                vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
 
             // 1
             v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
@@ -161,8 +160,8 @@ void phase(const Size2D &size,
             v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
             FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
 
-            uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
-                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
+            uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
+                                                vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
 
             vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
                                           vmovn_u16(v_dst16s1)));
@@ -182,8 +181,8 @@ void phase(const Size2D &size,
             float32x4_t v_dst32f1;
             FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
 
-            uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
-                                            vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
+            uint16x8_t v_dst = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
+                                            vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
 
             vst1_u8(dst + j, vmovn_u16(v_dst));
         }
diff --git a/3rdparty/carotene/src/vround_helper.hpp b/3rdparty/carotene/src/vround_helper.hpp
new file mode 100644
index 000000000000..89a62545106f
--- /dev/null
+++ b/3rdparty/carotene/src/vround_helper.hpp
@@ -0,0 +1,102 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_SRC_VROUND_HELPER_HPP
+#define CAROTENE_SRC_VROUND_HELPER_HPP
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#ifdef CAROTENE_NEON
+
+/**
+ * This helper header is for rounding from float32xN to uin32xN or int32xN to nearest, ties to even.
+ * See https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
+ */
+
+// See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
+#define CAROTENE_ROUND_DELTA (12582912.0f)
+
+namespace CAROTENE_NS { namespace internal {
+
+inline uint32x4_t vroundq_u32_f32(const float32x4_t val)
+{
+#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
+    return vcvtnq_u32_f32(val);
+#else
+    const float32x4_t delta = vdupq_n_f32(CAROTENE_ROUND_DELTA);
+    return vcvtq_u32_f32(vsubq_f32(vaddq_f32(val, delta), delta));
+#endif
+}
+
+inline uint32x2_t vround_u32_f32(const float32x2_t val)
+{
+#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
+    return vcvtn_u32_f32(val);
+#else
+    const float32x2_t delta = vdup_n_f32(CAROTENE_ROUND_DELTA);
+    return vcvt_u32_f32(vsub_f32(vadd_f32(val, delta), delta));
+#endif
+}
+
+inline int32x4_t vroundq_s32_f32(const float32x4_t val)
+{
+#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
+    return vcvtnq_s32_f32(val);
+#else
+    const float32x4_t delta = vdupq_n_f32(CAROTENE_ROUND_DELTA);
+    return vcvtq_s32_f32(vsubq_f32(vaddq_f32(val, delta), delta));
+#endif
+}
+
+inline int32x2_t vround_s32_f32(const float32x2_t val)
+{
+#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
+    return vcvtn_s32_f32(val);
+#else
+    const float32x2_t delta = vdup_n_f32(CAROTENE_ROUND_DELTA);
+    return vcvt_s32_f32(vsub_f32(vadd_f32(val, delta), delta));
+#endif
+}
+
+} }
+
+#endif // CAROTENE_NEON
+
+#endif
diff --git a/3rdparty/ffmpeg/ffmpeg.cmake b/3rdparty/ffmpeg/ffmpeg.cmake
index da75e3d2cae8..f55882e59f14 100644
--- a/3rdparty/ffmpeg/ffmpeg.cmake
+++ b/3rdparty/ffmpeg/ffmpeg.cmake
@@ -1,8 +1,8 @@
-# Binaries branch name: ffmpeg/4.x_20230622
-# Binaries were created for OpenCV: 61d48dd0f8d1cc1a115d26998705a61478f64a3c
-ocv_update(FFMPEG_BINARIES_COMMIT "7da61f0695eabf8972a2c302bf1632a3d99fb0d5")
-ocv_update(FFMPEG_FILE_HASH_BIN32 "4aaef1456e282e5ef665d65555f47f56")
-ocv_update(FFMPEG_FILE_HASH_BIN64 "38a638851e064c591ce812e27ed43f1f")
+# Binaries branch name: ffmpeg/4.x_20240522
+# Binaries were created for OpenCV: 8393885a39dac1e650bf5d0aaff84c04ad8bcdd3
+ocv_update(FFMPEG_BINARIES_COMMIT "394dca6ceb3085c979415e6385996b6570e94153")
+ocv_update(FFMPEG_FILE_HASH_BIN32 "bdfbd1efb295f3e54c07d2cb7a843bf9")
+ocv_update(FFMPEG_FILE_HASH_BIN64 "bfef029900f788480a363d6dc05c4f0e")
 ocv_update(FFMPEG_FILE_HASH_CMAKE "8862c87496e2e8c375965e1277dee1c7")
 
 function(download_win_ffmpeg script_var)
diff --git a/3rdparty/ippicv/ippicv.cmake b/3rdparty/ippicv/ippicv.cmake
index 08cf091db381..744c45882e04 100644
--- a/3rdparty/ippicv/ippicv.cmake
+++ b/3rdparty/ippicv/ippicv.cmake
@@ -2,32 +2,33 @@ function(download_ippicv root_var)
   set(${root_var} "" PARENT_SCOPE)
 
   # Commit SHA in the opencv_3rdparty repo
-  set(IPPICV_COMMIT "1224f78da6684df04397ac0f40c961ed37f79ccb")
+  set(IPPICV_COMMIT "fd27188235d85e552de31425e7ea0f53ba73ba53")
   # Define actual ICV versions
   if(APPLE)
+    set(IPPICV_COMMIT "0cc4aa06bf2bef4b05d237c69a5a96b9cd0cb85a")
     set(OPENCV_ICV_PLATFORM "macosx")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_mac")
-    set(OPENCV_ICV_NAME "ippicv_2021.8_mac_intel64_20230330_general.tgz")
-    set(OPENCV_ICV_HASH "d2b234a86af1b616958619a4560356d9")
+    set(OPENCV_ICV_NAME "ippicv_2021.9.1_mac_intel64_20230919_general.tgz")
+    set(OPENCV_ICV_HASH "14f01c5a4780bfae9dde9b0aaf5e56fc")
   elseif((UNIX AND NOT ANDROID) OR (UNIX AND ANDROID_ABI MATCHES "x86"))
     set(OPENCV_ICV_PLATFORM "linux")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_lnx")
     if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2021.8_lnx_intel64_20230330_general.tgz")
-      set(OPENCV_ICV_HASH "43219bdc7e3805adcbe3a1e2f1f3ef3b")
+      set(OPENCV_ICV_NAME "ippicv_2021.11.0_lnx_intel64_20240201_general.tgz")
+      set(OPENCV_ICV_HASH "0f2745ff705ecae31176dad437608f6f")
     else()
-      set(OPENCV_ICV_NAME "ippicv_2021.8_lnx_ia32_20230330_general.tgz")
-      set(OPENCV_ICV_HASH "165875443d72faa3fd2146869da90d07")
+      set(OPENCV_ICV_NAME "ippicv_2021.11.0_lnx_ia32_20240201_general.tgz")
+      set(OPENCV_ICV_HASH "63e381bf08076ca34fd5264203043a45")
     endif()
   elseif(WIN32 AND NOT ARM)
     set(OPENCV_ICV_PLATFORM "windows")
     set(OPENCV_ICV_PACKAGE_SUBDIR "ippicv_win")
     if(X86_64)
-      set(OPENCV_ICV_NAME "ippicv_2021.8_win_intel64_20230330_general.zip")
-      set(OPENCV_ICV_HASH "71e4f58de939f0348ec7fb58ffb17dbf")
+      set(OPENCV_ICV_NAME "ippicv_2021.11.0_win_intel64_20240201_general.zip")
+      set(OPENCV_ICV_HASH "59d154bf54a1e3eea20d7248f81a2a8e")
     else()
-      set(OPENCV_ICV_NAME "ippicv_2021.8_win_ia32_20230330_general.zip")
-      set(OPENCV_ICV_HASH "57fd4648cfe64eae9e2ad9d50173a553")
+      set(OPENCV_ICV_NAME "ippicv_2021.11.0_win_ia32_20240201_general.zip")
+      set(OPENCV_ICV_HASH "7a6d8ac5825c02fea6cbfc1201b521b5")
     endif()
   else()
     return()
diff --git a/3rdparty/kleidicv/CMakeLists.txt b/3rdparty/kleidicv/CMakeLists.txt
new file mode 100644
index 000000000000..26e485441603
--- /dev/null
+++ b/3rdparty/kleidicv/CMakeLists.txt
@@ -0,0 +1,23 @@
+project(kleidicv_hal)
+
+set(KLEIDICV_SOURCE_PATH "" CACHE PATH "Directory containing KleidiCV sources")
+ocv_update(KLEIDICV_SRC_COMMIT "0.1.0")
+ocv_update(KLEIDICV_SRC_HASH "9388f28cf2fbe3338197b2b57d491468")
+
+if(KLEIDICV_SOURCE_PATH)
+  set(THE_ROOT "${KLEIDICV_SOURCE_PATH}")
+else()
+  ocv_download(FILENAME "kleidicv-${KLEIDICV_SRC_COMMIT}.tar.gz"
+                HASH ${KLEIDICV_SRC_HASH}
+                URL
+                  "${OPENCV_KLEIDICV_URL}"
+                  "$ENV{OPENCV_KLEIDICV_URL}"
+                  "https://gitlab.arm.com/kleidi/kleidicv/-/archive/${KLEIDICV_SRC_COMMIT}/"
+                DESTINATION_DIR "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/"
+                ID KLEIDICV
+                STATUS res
+                UNPACK RELATIVE_URL)
+  set(THE_ROOT "${OpenCV_BINARY_DIR}/3rdparty/kleidicv/kleidicv-${KLEIDICV_SRC_COMMIT}")
+endif()
+
+include("${THE_ROOT}/adapters/opencv/CMakeLists.txt")
diff --git a/3rdparty/libjasper/jas_stream.c b/3rdparty/libjasper/jas_stream.c
index 3ba7a837db87..0a85379b274f 100644
--- a/3rdparty/libjasper/jas_stream.c
+++ b/3rdparty/libjasper/jas_stream.c
@@ -889,7 +889,7 @@ int jas_stream_copy(jas_stream_t *out, jas_stream_t *in, int n)
     while (all || m > 0) {
         if ((c = jas_stream_getc_macro(in)) == EOF) {
             /* The next character of input could not be read. */
-            /* Return with an error if an I/O error occured
+            /* Return with an error if an I/O error occurred
               (not including EOF) or if an explicit copy count
               was specified. */
             return (!all || jas_stream_error(in)) ? (-1) : 0;
diff --git a/3rdparty/libjasper/jpc_bs.h b/3rdparty/libjasper/jpc_bs.h
index c85d4ef5306a..4465d7a41b3c 100644
--- a/3rdparty/libjasper/jpc_bs.h
+++ b/3rdparty/libjasper/jpc_bs.h
@@ -100,7 +100,7 @@
 #define	JPC_BITSTREAM_NOCLOSE	0x01
 /* End of file has been reached while reading. */
 #define	JPC_BITSTREAM_EOF	0x02
-/* An I/O error has occured. */
+/* An I/O error has occurerd. */
 #define	JPC_BITSTREAM_ERR	0x04
 
 /******************************************************************************\
diff --git a/3rdparty/libjpeg-turbo/CMakeLists.txt b/3rdparty/libjpeg-turbo/CMakeLists.txt
index ac0aaf63e1bf..f41665f329e5 100644
--- a/3rdparty/libjpeg-turbo/CMakeLists.txt
+++ b/3rdparty/libjpeg-turbo/CMakeLists.txt
@@ -1,12 +1,43 @@
 project(${JPEG_LIBRARY} C)
 
+macro(boolean_number var)
+  if(${var})
+    set(${var} 1 ${ARGN})
+  else()
+    set(${var} 0 ${ARGN})
+  endif()
+endmacro()
+
 ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter -Wsign-compare -Wshorten-64-to-32 -Wimplicit-fallthrough)
+if(APPLE)
+  ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-variable) # NEON flags are not used on Mac
+endif()
 
-set(VERSION_MAJOR 2)
-set(VERSION_MINOR 1)
-set(VERSION_REVISION 3)
-set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_REVISION})
-set(LIBJPEG_TURBO_VERSION_NUMBER 2001003)
+if(CV_GCC AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13)
+  # src/jchuff.c:1042:22: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
+  ocv_warnings_disable(CMAKE_C_FLAGS -Wstringop-overflow)
+endif()
+
+set(VERSION 3.0.3)
+set(COPYRIGHT_YEAR "1991-2024")
+string(REPLACE "." ";" VERSION_TRIPLET ${VERSION})
+list(GET VERSION_TRIPLET 0 VERSION_MAJOR)
+list(GET VERSION_TRIPLET 1 VERSION_MINOR)
+list(GET VERSION_TRIPLET 2 VERSION_REVISION)
+function(pad_number NUMBER OUTPUT_LEN)
+  string(LENGTH "${${NUMBER}}" INPUT_LEN)
+  if(INPUT_LEN LESS OUTPUT_LEN)
+    math(EXPR ZEROES "${OUTPUT_LEN} - ${INPUT_LEN} - 1")
+    set(NUM ${${NUMBER}})
+    foreach(C RANGE ${ZEROES})
+      set(NUM "0${NUM}")
+    endforeach()
+    set(${NUMBER} ${NUM} PARENT_SCOPE)
+  endif()
+endfunction()
+pad_number(VERSION_MINOR 3)
+pad_number(VERSION_REVISION 3)
+set(LIBJPEG_TURBO_VERSION_NUMBER ${VERSION_MAJOR}${VERSION_MINOR}${VERSION_REVISION})
 
 string(TIMESTAMP BUILD "opencv-${OPENCV_VERSION}-libjpeg-turbo")
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -16,7 +47,7 @@ endif()
 message(STATUS "libjpeg-turbo: VERSION = ${VERSION}, BUILD = ${BUILD}")
 
 math(EXPR BITS "${CMAKE_SIZEOF_VOID_P} * 8")
-string(TOLOWER ${CMAKE_SYSTEM_PROCESSOR} CMAKE_SYSTEM_PROCESSOR_LC)
+string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" CMAKE_SYSTEM_PROCESSOR_LC)
 
 if(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "x86_64" OR
   CMAKE_SYSTEM_PROCESSOR_LC MATCHES "amd64" OR
@@ -56,8 +87,7 @@ if(MSVC_IDE AND CMAKE_GENERATOR_PLATFORM MATCHES "arm64")
   set(CPU_TYPE arm64)
 endif()
 
-OCV_OPTION(ENABLE_LIBJPEG_TURBO_SIMD "Include SIMD extensions for libjpeg-turbo, if available for this platform" (NOT CV_DISABLE_OPTIMIZATION)
-  VISIBLE_IF BUILD_JPEG)
+OCV_OPTION(ENABLE_LIBJPEG_TURBO_SIMD "Include SIMD extensions for libjpeg-turbo, if available for this platform" (NOT CV_DISABLE_OPTIMIZATION))
 option(WITH_ARITH_ENC "Include arithmetic encoding support when emulating the libjpeg v6b API/ABI" TRUE)
 option(WITH_ARITH_DEC "Include arithmetic decoding support when emulating the libjpeg v6b API/ABI" TRUE)
 set(WITH_SIMD 1)
@@ -67,8 +97,8 @@ include(CheckCSourceCompiles)
 include(CheckIncludeFiles)
 include(CheckTypeSize)
 
-check_type_size("size_t" SIZEOF_SIZE_T)
-check_type_size("unsigned long" SIZEOF_UNSIGNED_LONG)
+check_type_size("size_t" SIZE_T)
+check_type_size("unsigned long" UNSIGNED_LONG)
 
 if(SIZEOF_SIZE_T EQUAL SIZEOF_UNSIGNED_LONG)
   check_c_source_compiles("int main(int argc, char **argv) { unsigned long a = argc;  return __builtin_ctzl(a); }"
@@ -104,33 +134,34 @@ if(WITH_ARITH_DEC)
   set(D_ARITH_CODING_SUPPORTED 1)
 endif()
 
-set(JPEG_LIB_VERSION 62)
+set(JPEG_LIB_VERSION 70)
 
 # OpenCV
 set(JPEG_LIB_VERSION "${VERSION}-${JPEG_LIB_VERSION}" PARENT_SCOPE)
 
 set(THREAD_LOCAL "")  # WITH_TURBOJPEG is not used
 
+add_definitions(-DNO_GETENV -DNO_PUTENV)
+
 if(MSVC)
   add_definitions(-W3 -wd4996 -wd4018)
 endif()
 
-if(WIN32)
-  configure_file(jconfig.h.win.in jconfig.h)
-else()
-  configure_file(jconfig.h.in jconfig.h)
-endif()
-configure_file(jconfigint.h.in jconfigint.h)
-
 include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/src)
 
-set(JPEG_SOURCES jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c
-        jcicc.c jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c
-        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c
-        jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdicc.c jdinput.c
-        jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c
-        jdtrans.c jerror.c jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c
-        jidctint.c jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c)
+set(JPEG16_SOURCES jcapistd.c jccolor.c jcdiffct.c jclossls.c jcmainct.c
+    jcprepct.c jcsample.c jdapistd.c jdcolor.c jddiffct.c jdlossls.c jdmainct.c
+    jdpostct.c jdsample.c jutils.c)
+
+set(JPEG12_SOURCES ${JPEG16_SOURCES} jccoefct.c jcdctmgr.c jdcoefct.c
+    jddctmgr.c jdmerge.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c
+    jidctred.c jquant1.c jquant2.c)
+
+set(JPEG_SOURCES ${JPEG12_SOURCES} jcapimin.c jchuff.c jcicc.c jcinit.c
+    jclhuff.c jcmarker.c jcmaster.c jcomapi.c jcparam.c jcphuff.c jctrans.c
+    jdapimin.c jdatadst.c jdatasrc.c jdhuff.c jdicc.c jdinput.c jdlhuff.c
+    jdmarker.c jdmaster.c jdphuff.c jdtrans.c jerror.c jfdctflt.c jmemmgr.c
+    jmemnobs.c jpeg_nbits.c)
 
 if(WITH_ARITH_ENC OR WITH_ARITH_DEC)
   set(JPEG_SOURCES ${JPEG_SOURCES} jaricom.c)
@@ -144,7 +175,7 @@ if(WITH_ARITH_DEC)
   set(JPEG_SOURCES ${JPEG_SOURCES} jdarith.c)
 endif()
 
-if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
+if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID MATCHES "Clang")
   # Use the maximum optimization level for release builds
   foreach(var CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_RELWITHDEBINFO)
     if(${var} MATCHES "-O2")
@@ -167,6 +198,10 @@ if(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
   endif()
 endif()
 
+include(CheckTypeSize)
+check_type_size("size_t" SIZE_T)
+check_type_size("unsigned long" UNSIGNED_LONG)
+
 if(ENABLE_LIBJPEG_TURBO_SIMD)
   add_subdirectory(src/simd)
   if(NEON_INTRINSICS)
@@ -182,19 +217,28 @@ if(WITH_SIMD)
   if(MSVC_IDE OR XCODE)
     set_source_files_properties(${SIMD_OBJS} PROPERTIES GENERATED 1)
   endif()
-else()
-  add_library(jsimd OBJECT src/jsimd_none.c)
-  set_target_properties(jsimd PROPERTIES FOLDER "3rdparty")
-  if(NOT WIN32 AND (CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED))
-    set_target_properties(jsimd PROPERTIES POSITION_INDEPENDENT_CODE 1)
-  endif()
+  set(SIMD_TARGET_OBJECTS $<TARGET_OBJECTS:simd>)
 endif()
 
+configure_file(jversion.h.in jversion.h)
+configure_file(jconfig.h.in jconfig.h)
+configure_file(jconfigint.h.in jconfigint.h)
+
+ocv_list_add_prefix(JPEG16_SOURCES src/)
+ocv_list_add_prefix(JPEG12_SOURCES src/)
 ocv_list_add_prefix(JPEG_SOURCES src/)
 
 set(JPEG_SOURCES ${JPEG_SOURCES} ${SIMD_OBJS})
 
-add_library(${JPEG_LIBRARY} STATIC ${OPENCV_3RDPARTY_EXCLUDE_FROM_ALL} ${JPEG_SOURCES} $<TARGET_OBJECTS:jsimd> ${SIMD_OBJS})
+add_library(jpeg12-static OBJECT ${JPEG12_SOURCES})
+set_property(TARGET jpeg12-static PROPERTY COMPILE_FLAGS
+  "-DBITS_IN_JSAMPLE=12")
+add_library(jpeg16-static OBJECT ${JPEG16_SOURCES})
+set_property(TARGET jpeg16-static PROPERTY COMPILE_FLAGS
+  "-DBITS_IN_JSAMPLE=16")
+add_library(${JPEG_LIBRARY} STATIC ${JPEG_SOURCES} ${SIMD_TARGET_OBJECTS}
+  ${SIMD_OBJS} $<TARGET_OBJECTS:jpeg12-static>
+  $<TARGET_OBJECTS:jpeg16-static>)
 
 set_target_properties(${JPEG_LIBRARY}
   PROPERTIES OUTPUT_NAME ${JPEG_LIBRARY}
@@ -205,7 +249,9 @@ set_target_properties(${JPEG_LIBRARY}
   )
 
 if(ENABLE_SOLUTION_FOLDERS)
-  set_target_properties(${JPEG_LIBRARY} PROPERTIES FOLDER "3rdparty")
+  set_target_properties(${JPEG_LIBRARY} PROPERTIES FOLDER "3rdparty/jpeg")
+  set_target_properties(jpeg12-static PROPERTIES FOLDER "3rdparty/jpeg")
+  set_target_properties(jpeg16-static PROPERTIES FOLDER "3rdparty/jpeg")
 endif()
 
 if(NOT BUILD_SHARED_LIBS)
diff --git a/3rdparty/libjpeg-turbo/LICENSE.md b/3rdparty/libjpeg-turbo/LICENSE.md
index d753e1d76aa0..2204864fa118 100644
--- a/3rdparty/libjpeg-turbo/LICENSE.md
+++ b/3rdparty/libjpeg-turbo/LICENSE.md
@@ -1,30 +1,33 @@
 libjpeg-turbo Licenses
 ======================
 
-libjpeg-turbo is covered by three compatible BSD-style open source licenses:
+libjpeg-turbo is covered by two compatible BSD-style open source licenses:
 
 - The IJG (Independent JPEG Group) License, which is listed in
   [README.ijg](README.ijg)
 
-  This license applies to the libjpeg API library and associated programs
-  (any code inherited from libjpeg, and any modifications to that code.)
+  This license applies to the libjpeg API library and associated programs,
+  including any code inherited from libjpeg and any modifications to that
+  code.  Note that the libjpeg-turbo SIMD source code bears the
+  [zlib License](https://opensource.org/licenses/Zlib), but in the context of
+  the overall libjpeg API library, the terms of the zlib License are subsumed
+  by the terms of the IJG License.
 
 - The Modified (3-clause) BSD License, which is listed below
 
-  This license covers the TurboJPEG API library and associated programs, as
-  well as the build system.
-
-- The [zlib License](https://opensource.org/licenses/Zlib)
-
-  This license is a subset of the other two, and it covers the libjpeg-turbo
-  SIMD extensions.
+  This license applies to the TurboJPEG API library and associated programs, as
+  well as the build system.  Note that the TurboJPEG API library wraps the
+  libjpeg API library, so in the context of the overall TurboJPEG API library,
+  both the terms of the IJG License and the terms of the Modified (3-clause)
+  BSD License apply.
 
 
 Complying with the libjpeg-turbo Licenses
 =========================================
 
 This section provides a roll-up of the libjpeg-turbo licensing terms, to the
-best of our understanding.
+best of our understanding.  This is not a license in and of itself.  It is
+intended solely for clarification.
 
 1.  If you are distributing a modified version of the libjpeg-turbo source,
     then:
@@ -38,7 +41,7 @@ best of our understanding.
         - Clauses 1 and 3 of the zlib License
 
     2.  You must add your own copyright notice to the header of each source
-        file you modified, so others can tell that you modified that file (if
+        file you modified, so others can tell that you modified that file.  (If
         there is not an existing copyright header in that file, then you can
         simply add a notice stating that you modified the file.)
 
@@ -91,7 +94,7 @@ best of our understanding.
 The Modified (3-clause) BSD License
 ===================================
 
-Copyright (C)2009-2022 D. R. Commander.  All Rights Reserved.<br>
+Copyright (C)2009-2023 D. R. Commander.  All Rights Reserved.<br>
 Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -119,8 +122,8 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 
 
-Why Three Licenses?
-===================
+Why Two Licenses?
+=================
 
 The zlib License could have been used instead of the Modified (3-clause) BSD
 License, and since the IJG License effectively subsumes the distribution
diff --git a/3rdparty/libjpeg-turbo/README.ijg b/3rdparty/libjpeg-turbo/README.ijg
index 9453c195010f..8f3768265f68 100644
--- a/3rdparty/libjpeg-turbo/README.ijg
+++ b/3rdparty/libjpeg-turbo/README.ijg
@@ -43,7 +43,7 @@ User documentation:
   change.log        Version-to-version change highlights.
 Programmer and internal documentation:
   libjpeg.txt       How to use the JPEG library in your own programs.
-  example.txt       Sample code for calling the JPEG library.
+  example.c         Sample code for calling the JPEG library.
   structure.txt     Overview of the JPEG library's internal structure.
   coderules.txt     Coding style rules --- please read if you contribute code.
 
@@ -68,17 +68,17 @@ other abrupt features may not compress well with JPEG, and a higher JPEG
 quality may have to be used to avoid visible compression artifacts with such
 images.
 
-JPEG is lossy, meaning that the output pixels are not necessarily identical to
-the input pixels.  However, on photographic content and other "smooth" images,
-very good compression ratios can be obtained with no visible compression
-artifacts, and extremely high compression ratios are possible if you are
-willing to sacrifice image quality (by reducing the "quality" setting in the
-compressor.)
-
-This software implements JPEG baseline, extended-sequential, and progressive
-compression processes.  Provision is made for supporting all variants of these
-processes, although some uncommon parameter settings aren't implemented yet.
-We have made no provision for supporting the hierarchical or lossless
+JPEG is normally lossy, meaning that the output pixels are not necessarily
+identical to the input pixels.  However, on photographic content and other
+"smooth" images, very good compression ratios can be obtained with no visible
+compression artifacts, and extremely high compression ratios are possible if
+you are willing to sacrifice image quality (by reducing the "quality" setting
+in the compressor.)
+
+This software implements JPEG baseline, extended-sequential, progressive, and
+lossless compression processes.  Provision is made for supporting all variants
+of these processes, although some uncommon parameter settings aren't
+implemented yet.  We have made no provision for supporting the hierarchical
 processes defined in the standard.
 
 We provide a set of library routines for reading and writing JPEG image files,
@@ -241,7 +241,7 @@ This software implements ITU T.81 | ISO/IEC 10918 with some extensions from
 ITU T.871 | ISO/IEC 10918-5 (JPEG File Interchange Format-- see REFERENCES).
 Informally, the term "JPEG image" or "JPEG file" most often refers to JFIF or
 a subset thereof, but there are other formats containing the name "JPEG" that
-are incompatible with the DCT-based JPEG standard or with JFIF (for instance,
+are incompatible with the original JPEG standard or with JFIF (for instance,
 JPEG 2000 and JPEG XR).  This software therefore does not support these
 formats.  Indeed, one of the original reasons for developing this free software
 was to help force convergence on a common, interoperable format standard for
diff --git a/3rdparty/libjpeg-turbo/README.md b/3rdparty/libjpeg-turbo/README.md
index 01e391ea7c08..923e61d231c1 100644
--- a/3rdparty/libjpeg-turbo/README.md
+++ b/3rdparty/libjpeg-turbo/README.md
@@ -21,7 +21,26 @@ derivative of libjpeg v6b developed by Miyasaka Masaru.  The TigerVNC and
 VirtualGL projects made numerous enhancements to the codec in 2009, and in
 early 2010, libjpeg-turbo spun off into an independent project, with the goal
 of making high-speed JPEG compression/decompression technology available to a
-broader range of users and developers.
+broader range of users and developers.  libjpeg-turbo is an ISO/IEC and ITU-T
+reference implementation of the JPEG standard.
+
+More information about libjpeg-turbo can be found at
+<https://libjpeg-turbo.org>.
+
+
+Funding
+=======
+
+libjpeg-turbo is an independent open source project, but we rely on patronage
+and funded development in order to maintain that independence.  The easiest way
+to ensure that libjpeg-turbo remains community-focused and free of any one
+organization's agenda is to
+[sponsor our project through GitHub](https://github.com/sponsors/libjpeg-turbo).
+All sponsorship money goes directly toward funding the labor necessary to
+maintain libjpeg-turbo, support the user community, and implement bug fixes and
+strategically important features.
+
+[![Sponsor libjpeg-turbo](https://img.shields.io/github/sponsors/libjpeg-turbo?label=Sponsor&logo=GitHub)](https://github.com/sponsors/libjpeg-turbo)
 
 
 License
@@ -245,16 +264,6 @@ programs that need them, without breaking ABI compatibility for programs that
 don't, and it allows those functions to be provided in the "official"
 libjpeg-turbo binaries.
 
-Those who are concerned about maintaining strict conformance with the libjpeg
-v6b or v7 API can pass an argument of `-DWITH_MEM_SRCDST=0` to `cmake` prior to
-building libjpeg-turbo.  This will restore the pre-1.3 behavior, in which
-`jpeg_mem_src()` and `jpeg_mem_dest()` are only included when emulating the
-libjpeg v8 API/ABI.
-
-On Un*x systems, including the in-memory source/destination managers changes
-the dynamic library version from 62.2.0 to 62.3.0 if using libjpeg v6b API/ABI
-emulation and from 7.2.0 to 7.3.0 if using libjpeg v7 API/ABI emulation.
-
 Note that, on most Un*x systems, the dynamic linker will not look for a
 function in a library until that function is actually used.  Thus, if a program
 is built against libjpeg-turbo 1.3+ and uses `jpeg_mem_src()` or
@@ -274,30 +283,35 @@ Mathematical Compatibility
 ==========================
 
 For the most part, libjpeg-turbo should produce identical output to libjpeg
-v6b.  The one exception to this is when using the floating point DCT/IDCT, in
-which case the outputs of libjpeg v6b and libjpeg-turbo can differ for the
-following reasons:
-
-- The SSE/SSE2 floating point DCT implementation in libjpeg-turbo is ever so
-  slightly more accurate than the implementation in libjpeg v6b, but not by
-  any amount perceptible to human vision (generally in the range of 0.01 to
-  0.08 dB gain in PNSR.)
-
-- When not using the SIMD extensions, libjpeg-turbo uses the more accurate
-  (and slightly faster) floating point IDCT algorithm introduced in libjpeg
-  v8a as opposed to the algorithm used in libjpeg v6b.  It should be noted,
-  however, that this algorithm basically brings the accuracy of the floating
-  point IDCT in line with the accuracy of the accurate integer IDCT.  The
-  floating point DCT/IDCT algorithms are mainly a legacy feature, and they do
-  not produce significantly more accuracy than the accurate integer algorithms
-  (to put numbers on this, the typical difference in PNSR between the two
-  algorithms is less than 0.10 dB, whereas changing the quality level by 1 in
-  the upper range of the quality scale is typically more like a 1.0 dB
-  difference.)
-
-- If the floating point algorithms in libjpeg-turbo are not implemented using
-  SIMD instructions on a particular platform, then the accuracy of the
-  floating point DCT/IDCT can depend on the compiler settings.
+v6b.  There are two exceptions:
+
+1. When decompressing a JPEG image that uses 4:4:0 chrominance subsampling, the
+outputs of libjpeg v6b and libjpeg-turbo can differ because libjpeg-turbo
+implements a "fancy" (smooth) 4:4:0 upsampling algorithm and libjpeg did not.
+
+2. When using the floating point DCT/IDCT, the outputs of libjpeg v6b and
+libjpeg-turbo can differ for the following reasons:
+
+    - The SSE/SSE2 floating point DCT implementation in libjpeg-turbo is ever
+      so slightly more accurate than the implementation in libjpeg v6b, but not
+      by any amount perceptible to human vision (generally in the range of 0.01
+      to 0.08 dB gain in PNSR.)
+
+    - When not using the SIMD extensions, libjpeg-turbo uses the more accurate
+      (and slightly faster) floating point IDCT algorithm introduced in libjpeg
+      v8a as opposed to the algorithm used in libjpeg v6b.  It should be noted,
+      however, that this algorithm basically brings the accuracy of the
+      floating point IDCT in line with the accuracy of the accurate integer
+      IDCT.  The floating point DCT/IDCT algorithms are mainly a legacy
+      feature, and they do not produce significantly more accuracy than the
+      accurate integer algorithms.  (To put numbers on this, the typical
+      difference in PNSR between the two algorithms is less than 0.10 dB,
+      whereas changing the quality level by 1 in the upper range of the quality
+      scale is typically more like a 1.0 dB difference.)
+
+    - If the floating point algorithms in libjpeg-turbo are not implemented
+      using SIMD instructions on a particular platform, then the accuracy of
+      the floating point DCT/IDCT can depend on the compiler settings.
 
 While libjpeg-turbo does emulate the libjpeg v8 API/ABI, under the hood it is
 still using the same algorithms as libjpeg v6b, so there are several specific
diff --git a/3rdparty/libjpeg-turbo/jconfig.h.in b/3rdparty/libjpeg-turbo/jconfig.h.in
index d4284d97b812..6cb82962ffeb 100644
--- a/3rdparty/libjpeg-turbo/jconfig.h.in
+++ b/3rdparty/libjpeg-turbo/jconfig.h.in
@@ -9,60 +9,52 @@
 /* libjpeg-turbo version in integer form */
 #define LIBJPEG_TURBO_VERSION_NUMBER  @LIBJPEG_TURBO_VERSION_NUMBER@
 
-/* Support arithmetic encoding */
+/* Support arithmetic encoding when using 8-bit samples */
 #cmakedefine C_ARITH_CODING_SUPPORTED 1
 
-/* Support arithmetic decoding */
+/* Support arithmetic decoding when using 8-bit samples */
 #cmakedefine D_ARITH_CODING_SUPPORTED 1
 
 /* Support in-memory source/destination managers */
-#cmakedefine MEM_SRCDST_SUPPORTED 1
+#define MEM_SRCDST_SUPPORTED  1
 
-/* Use accelerated SIMD routines. */
+/* Use accelerated SIMD routines when using 8-bit samples */
 #cmakedefine WITH_SIMD 1
 
-/*
- * Define BITS_IN_JSAMPLE as either
- *   8   for 8-bit sample values (the usual setting)
- *   12  for 12-bit sample values
- * Only 8 and 12 are legal data precisions for lossy JPEG according to the
- * JPEG standard, and the IJG code does not support anything else!
- * We do not support run-time selection of data precision, sorry.
+/* This version of libjpeg-turbo supports run-time selection of data precision,
+ * so BITS_IN_JSAMPLE is no longer used to specify the data precision at build
+ * time.  However, some downstream software expects the macro to be defined.
+ * Since 12-bit data precision is an opt-in feature that requires explicitly
+ * calling 12-bit-specific libjpeg API functions and using 12-bit-specific data
+ * types, the unmodified portion of the libjpeg API still behaves as if it were
+ * built for 8-bit precision, and JSAMPLE is still literally an 8-bit data
+ * type.  Thus, it is correct to define BITS_IN_JSAMPLE to 8 here.
  */
+#ifndef BITS_IN_JSAMPLE
+#define BITS_IN_JSAMPLE  8
+#endif
 
-#define BITS_IN_JSAMPLE  @BITS_IN_JSAMPLE@      /* use 8 or 12 */
+#ifdef _WIN32
 
-/* Define to 1 if you have the <locale.h> header file. */
-#cmakedefine HAVE_LOCALE_H 1
+#undef RIGHT_SHIFT_IS_UNSIGNED
 
-/* Define to 1 if you have the <stddef.h> header file. */
-#cmakedefine HAVE_STDDEF_H 1
+/* Define "boolean" as unsigned char, not int, per Windows custom */
+#ifndef __RPCNDR_H__            /* don't conflict if rpcndr.h already read */
+typedef unsigned char boolean;
+#endif
+#define HAVE_BOOLEAN            /* prevent jmorecfg.h from redefining it */
 
-/* Define to 1 if you have the <stdlib.h> header file. */
-#cmakedefine HAVE_STDLIB_H 1
+/* Define "INT32" as int, not long, per Windows custom */
+#if !(defined(_BASETSD_H_) || defined(_BASETSD_H))   /* don't conflict if basetsd.h already read */
+typedef short INT16;
+typedef signed int INT32;
+#endif
+#define XMD_H                   /* prevent jmorecfg.h from redefining it */
 
-/* Define if you need to include <sys/types.h> to get size_t. */
-#cmakedefine NEED_SYS_TYPES_H 1
-
-/* Define if you have BSD-like bzero and bcopy in <strings.h> rather than
-   memset/memcpy in <string.h>. */
-#cmakedefine NEED_BSD_STRINGS 1
-
-/* Define to 1 if the system has the type `unsigned char'. */
-#cmakedefine HAVE_UNSIGNED_CHAR 1
-
-/* Define to 1 if the system has the type `unsigned short'. */
-#cmakedefine HAVE_UNSIGNED_SHORT 1
-
-/* Compiler does not support pointers to undefined structures. */
-#cmakedefine INCOMPLETE_TYPES_BROKEN 1
+#else
 
 /* Define if your (broken) compiler shifts signed values as if they were
    unsigned. */
 #cmakedefine RIGHT_SHIFT_IS_UNSIGNED 1
 
-/* Define to empty if `const' does not conform to ANSI C. */
-/* #undef const */
-
-/* Define to `unsigned int' if <sys/types.h> does not define. */
-/* #undef size_t */
+#endif
diff --git a/3rdparty/libjpeg-turbo/jconfig.h.win.in b/3rdparty/libjpeg-turbo/jconfig.h.win.in
deleted file mode 100644
index 13cceef01d13..000000000000
--- a/3rdparty/libjpeg-turbo/jconfig.h.win.in
+++ /dev/null
@@ -1,33 +0,0 @@
-#define JPEG_LIB_VERSION  @JPEG_LIB_VERSION@
-#define LIBJPEG_TURBO_VERSION  @VERSION@
-#define LIBJPEG_TURBO_VERSION_NUMBER  @LIBJPEG_TURBO_VERSION_NUMBER@
-
-#cmakedefine C_ARITH_CODING_SUPPORTED
-#cmakedefine D_ARITH_CODING_SUPPORTED
-#cmakedefine MEM_SRCDST_SUPPORTED
-#cmakedefine WITH_SIMD
-
-#define BITS_IN_JSAMPLE  @BITS_IN_JSAMPLE@      /* use 8 or 12 */
-
-#define HAVE_STDDEF_H
-#define HAVE_STDLIB_H
-#undef NEED_SYS_TYPES_H
-#undef NEED_BSD_STRINGS
-
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-#undef INCOMPLETE_TYPES_BROKEN
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-/* Define "boolean" as unsigned char, not int, per Windows custom */
-#ifndef __RPCNDR_H__            /* don't conflict if rpcndr.h already read */
-typedef unsigned char boolean;
-#endif
-#define HAVE_BOOLEAN            /* prevent jmorecfg.h from redefining it */
-
-/* Define "INT32" as int, not long, per Windows custom */
-#if !(defined(_BASETSD_H_) || defined(_BASETSD_H))   /* don't conflict if basetsd.h already read */
-typedef short INT16;
-typedef signed int INT32;
-#endif
-#define XMD_H                   /* prevent jmorecfg.h from redefining it */
diff --git a/3rdparty/libjpeg-turbo/jconfigint.h.in b/3rdparty/libjpeg-turbo/jconfigint.h.in
index a46979df1e83..5c14e32a1d15 100644
--- a/3rdparty/libjpeg-turbo/jconfigint.h.in
+++ b/3rdparty/libjpeg-turbo/jconfigint.h.in
@@ -1,19 +1,14 @@
 /* libjpeg-turbo build number */
 #define BUILD  "@BUILD@"
 
+/* How to hide global symbols. */
+#define HIDDEN  @HIDDEN@
+
 /* Compiler's inline keyword */
 #undef inline
 
 /* How to obtain function inlining. */
-#ifndef INLINE
-#if defined(__GNUC__)
-#define INLINE inline __attribute__((always_inline))
-#elif defined(_MSC_VER)
-#define INLINE __forceinline
-#else
-#define INLINE
-#endif
-#endif
+#define INLINE  @INLINE@
 
 /* How to obtain thread-local storage */
 #define THREAD_LOCAL  @THREAD_LOCAL@
@@ -25,7 +20,7 @@
 #define VERSION  "@VERSION@"
 
 /* The size of `size_t', as computed by sizeof. */
-#define SIZEOF_SIZE_T  @SIZEOF_SIZE_T@
+#define SIZEOF_SIZE_T  @SIZE_T@
 
 /* Define if your compiler has __builtin_ctzl() and sizeof(unsigned long) == sizeof(size_t). */
 #cmakedefine HAVE_BUILTIN_CTZL
@@ -50,3 +45,32 @@
 #else
 #define FALLTHROUGH
 #endif
+
+/*
+ * Define BITS_IN_JSAMPLE as either
+ *   8   for 8-bit sample values (the usual setting)
+ *   12  for 12-bit sample values
+ * Only 8 and 12 are legal data precisions for lossy JPEG according to the
+ * JPEG standard, and the IJG code does not support anything else!
+ */
+
+#ifndef BITS_IN_JSAMPLE
+#define BITS_IN_JSAMPLE  8      /* use 8 or 12 */
+#endif
+
+#undef C_ARITH_CODING_SUPPORTED
+#undef D_ARITH_CODING_SUPPORTED
+#undef WITH_SIMD
+
+#if BITS_IN_JSAMPLE == 8
+
+/* Support arithmetic encoding */
+#cmakedefine C_ARITH_CODING_SUPPORTED 1
+
+/* Support arithmetic decoding */
+#cmakedefine D_ARITH_CODING_SUPPORTED 1
+
+/* Use accelerated SIMD routines. */
+#cmakedefine WITH_SIMD 1
+
+#endif
diff --git a/3rdparty/libjpeg-turbo/src/jversion.h b/3rdparty/libjpeg-turbo/jversion.h.in
similarity index 79%
rename from 3rdparty/libjpeg-turbo/src/jversion.h
rename to 3rdparty/libjpeg-turbo/jversion.h.in
index 2ab534af4147..fc0ce3e09e3b 100644
--- a/3rdparty/libjpeg-turbo/src/jversion.h
+++ b/3rdparty/libjpeg-turbo/jversion.h.in
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2012-2021, D. R. Commander.
+ * Copyright (C) 2010, 2012-2024, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -36,19 +36,21 @@
  *   their code
  */
 
-#define JCOPYRIGHT \
-  "Copyright (C) 2009-2021 D. R. Commander\n" \
+#define JCOPYRIGHT1 \
+  "Copyright (C) 2009-2024 D. R. Commander\n" \
   "Copyright (C) 2015, 2020 Google, Inc.\n" \
   "Copyright (C) 2019-2020 Arm Limited\n" \
   "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
   "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
-  "Copyright (C) 2015 Intel Corporation\n" \
+  "Copyright (C) 2015 Intel Corporation\n"
+#define JCOPYRIGHT2 \
   "Copyright (C) 2013-2014 Linaro Limited\n" \
   "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
   "Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \
   "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
   "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
-  "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding"
+  "Copyright (C) 1999 Ken Murchison\n" \
+  "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding\n"
 
 #define JCOPYRIGHT_SHORT \
-  "Copyright (C) 1991-2021 The libjpeg-turbo Project and many others"
+  "Copyright (C) @COPYRIGHT_YEAR@ The libjpeg-turbo Project and many others"
diff --git a/3rdparty/libjpeg-turbo/src/cjpeg.c b/3rdparty/libjpeg-turbo/src/cjpeg.c
new file mode 100644
index 000000000000..44c39bec20da
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/cjpeg.c
@@ -0,0 +1,841 @@
+/*
+ * cjpeg.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2003-2011 by Guido Vollbeding.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010, 2013-2014, 2017, 2019-2022, 2024, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains a command-line user interface for the JPEG compressor.
+ * It should work on any system with Unix- or MS-DOS-style command lines.
+ *
+ * Two different command line styles are permitted, depending on the
+ * compile-time switch TWO_FILE_COMMANDLINE:
+ *      cjpeg [options]  inputfile outputfile
+ *      cjpeg [options]  [inputfile]
+ * In the second style, output is always to standard output, which you'd
+ * normally redirect to a file or pipe to some other program.  Input is
+ * either from a named file or from standard input (typically redirected).
+ * The second style is convenient on Unix but is unhelpful on systems that
+ * don't support pipes.  Also, you MUST use the first style if your system
+ * doesn't do binary I/O to stdin/stdout.
+ * To simplify script writing, the "-outfile" switch is provided.  The syntax
+ *      cjpeg [options]  -outfile outputfile  inputfile
+ * works regardless of which command line style is used.
+ */
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+
+#ifdef CJPEG_FUZZER
+#define JPEG_INTERNALS
+#endif
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include "jversion.h"           /* for version message */
+#include "jconfigint.h"
+
+
+/* Create the add-on message string table. */
+
+#define JMESSAGE(code, string)  string,
+
+static const char * const cdjpeg_message_table[] = {
+#include "cderror.h"
+  NULL
+};
+
+
+/*
+ * This routine determines what format the input file is,
+ * and selects the appropriate input-reading module.
+ *
+ * To determine which family of input formats the file belongs to,
+ * we may look only at the first byte of the file, since C does not
+ * guarantee that more than one character can be pushed back with ungetc.
+ * Looking at additional bytes would require one of these approaches:
+ *     1) assume we can fseek() the input file (fails for piped input);
+ *     2) assume we can push back more than one character (works in
+ *        some C implementations, but unportable);
+ *     3) provide our own buffering (breaks input readers that want to use
+ *        stdio directly);
+ * or  4) don't put back the data, and modify the input_init methods to assume
+ *        they start reading after the start of file.
+ * #1 is attractive for MS-DOS but is untenable on Unix.
+ *
+ * The most portable solution for file types that can't be identified by their
+ * first byte is to make the user tell us what they are.  This is also the
+ * only approach for "raw" file types that contain only arbitrary values.
+ * We presently apply this method for Targa files.  Most of the time Targa
+ * files start with 0x00, so we recognize that case.  Potentially, however,
+ * a Targa file could start with any byte value (byte 0 is the length of the
+ * seldom-used ID field), so we provide a switch to force Targa input mode.
+ */
+
+static boolean is_targa;        /* records user -targa switch */
+
+
+LOCAL(cjpeg_source_ptr)
+select_file_type(j_compress_ptr cinfo, FILE *infile)
+{
+  int c;
+
+  if (is_targa) {
+#ifdef TARGA_SUPPORTED
+    return jinit_read_targa(cinfo);
+#else
+    ERREXIT(cinfo, JERR_TGA_NOTCOMP);
+#endif
+  }
+
+  if ((c = getc(infile)) == EOF)
+    ERREXIT(cinfo, JERR_INPUT_EMPTY);
+  if (ungetc(c, infile) == EOF)
+    ERREXIT(cinfo, JERR_UNGETC_FAILED);
+
+  switch (c) {
+#ifdef BMP_SUPPORTED
+  case 'B':
+    return jinit_read_bmp(cinfo, TRUE);
+#endif
+#ifdef GIF_SUPPORTED
+  case 'G':
+    if (cinfo->data_precision == 16) {
+#ifdef C_LOSSLESS_SUPPORTED
+      return j16init_read_gif(cinfo);
+#else
+      ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+      break;
+#endif
+    } else if (cinfo->data_precision == 12)
+      return j12init_read_gif(cinfo);
+    else
+      return jinit_read_gif(cinfo);
+#endif
+#ifdef PPM_SUPPORTED
+  case 'P':
+    if (cinfo->data_precision == 16) {
+#ifdef C_LOSSLESS_SUPPORTED
+      return j16init_read_ppm(cinfo);
+#else
+      ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+      break;
+#endif
+    } else if (cinfo->data_precision == 12)
+      return j12init_read_ppm(cinfo);
+    else
+      return jinit_read_ppm(cinfo);
+#endif
+#ifdef TARGA_SUPPORTED
+  case 0x00:
+    return jinit_read_targa(cinfo);
+#endif
+  default:
+    ERREXIT(cinfo, JERR_UNKNOWN_FORMAT);
+    break;
+  }
+
+  return NULL;                  /* suppress compiler warnings */
+}
+
+
+/*
+ * Argument-parsing code.
+ * The switch parser is designed to be useful with DOS-style command line
+ * syntax, ie, intermixed switches and file names, where only the switches
+ * to the left of a given file name affect processing of that file.
+ * The main program in this file doesn't actually use this capability...
+ */
+
+
+static const char *progname;    /* program name for error messages */
+static char *icc_filename;      /* for -icc switch */
+static char *outfilename;       /* for -outfile switch */
+static boolean memdst;          /* for -memdst switch */
+static boolean report;          /* for -report switch */
+static boolean strict;          /* for -strict switch */
+
+
+#ifdef CJPEG_FUZZER
+
+#include <setjmp.h>
+
+struct my_error_mgr {
+  struct jpeg_error_mgr pub;
+  jmp_buf setjmp_buffer;
+};
+
+void my_error_exit(j_common_ptr cinfo)
+{
+  struct my_error_mgr *myerr = (struct my_error_mgr *)cinfo->err;
+
+  longjmp(myerr->setjmp_buffer, 1);
+}
+
+static void my_emit_message_fuzzer(j_common_ptr cinfo, int msg_level)
+{
+  if (msg_level < 0)
+    cinfo->err->num_warnings++;
+}
+
+#define HANDLE_ERROR() { \
+  if (cinfo.global_state > CSTATE_START) { \
+    if (memdst && outbuffer) \
+      (*cinfo.dest->term_destination) (&cinfo); \
+    jpeg_abort_compress(&cinfo); \
+  } \
+  jpeg_destroy_compress(&cinfo); \
+  if (input_file != stdin && input_file != NULL) \
+    fclose(input_file); \
+  if (memdst) \
+    free(outbuffer); \
+  return EXIT_FAILURE; \
+}
+
+#endif
+
+
+LOCAL(void)
+usage(void)
+/* complain about bad command line */
+{
+  fprintf(stderr, "usage: %s [switches] ", progname);
+#ifdef TWO_FILE_COMMANDLINE
+  fprintf(stderr, "inputfile outputfile\n");
+#else
+  fprintf(stderr, "[inputfile]\n");
+#endif
+
+  fprintf(stderr, "Switches (names may be abbreviated):\n");
+  fprintf(stderr, "  -quality N[,...]   Compression quality (0..100; 5-95 is most useful range,\n");
+  fprintf(stderr, "                     default is 75)\n");
+  fprintf(stderr, "  -grayscale     Create monochrome JPEG file\n");
+  fprintf(stderr, "  -rgb           Create RGB JPEG file\n");
+#ifdef ENTROPY_OPT_SUPPORTED
+  fprintf(stderr, "  -optimize      Optimize Huffman table (smaller file, but slow compression)\n");
+#endif
+#ifdef C_PROGRESSIVE_SUPPORTED
+  fprintf(stderr, "  -progressive   Create progressive JPEG file\n");
+#endif
+#ifdef TARGA_SUPPORTED
+  fprintf(stderr, "  -targa         Input file is Targa format (usually not needed)\n");
+#endif
+  fprintf(stderr, "Switches for advanced users:\n");
+  fprintf(stderr, "  -precision N   Create JPEG file with N-bit data precision\n");
+#ifdef C_LOSSLESS_SUPPORTED
+  fprintf(stderr, "                 (N is 8, 12, or 16; default is 8; if N is 16, then -lossless\n");
+  fprintf(stderr, "                 must also be specified)\n");
+#else
+  fprintf(stderr, "                 (N is 8 or 12; default is 8)\n");
+#endif
+#ifdef C_LOSSLESS_SUPPORTED
+  fprintf(stderr, "  -lossless psv[,Pt]  Create lossless JPEG file\n");
+#endif
+#ifdef C_ARITH_CODING_SUPPORTED
+  fprintf(stderr, "  -arithmetic    Use arithmetic coding\n");
+#endif
+#ifdef DCT_ISLOW_SUPPORTED
+  fprintf(stderr, "  -dct int       Use accurate integer DCT method%s\n",
+          (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  fprintf(stderr, "  -dct fast      Use less accurate integer DCT method [legacy feature]%s\n",
+          (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  fprintf(stderr, "  -dct float     Use floating-point DCT method [legacy feature]%s\n",
+          (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
+#endif
+  fprintf(stderr, "  -icc FILE      Embed ICC profile contained in FILE\n");
+  fprintf(stderr, "  -restart N     Set restart interval in rows, or in blocks with B\n");
+#ifdef INPUT_SMOOTHING_SUPPORTED
+  fprintf(stderr, "  -smooth N      Smooth dithered input (N=1..100 is strength)\n");
+#endif
+  fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
+  fprintf(stderr, "  -outfile name  Specify name for output file\n");
+  fprintf(stderr, "  -memdst        Compress to memory instead of file (useful for benchmarking)\n");
+  fprintf(stderr, "  -report        Report compression progress\n");
+  fprintf(stderr, "  -strict        Treat all warnings as fatal\n");
+  fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  fprintf(stderr, "  -version       Print version information and exit\n");
+  fprintf(stderr, "Switches for wizards:\n");
+  fprintf(stderr, "  -baseline      Force baseline quantization tables\n");
+  fprintf(stderr, "  -qtables FILE  Use quantization tables given in FILE\n");
+  fprintf(stderr, "  -qslots N[,...]    Set component quantization tables\n");
+  fprintf(stderr, "  -sample HxV[,...]  Set component sampling factors\n");
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+  fprintf(stderr, "  -scans FILE    Create multi-scan JPEG per script FILE\n");
+#endif
+  exit(EXIT_FAILURE);
+}
+
+
+LOCAL(int)
+parse_switches(j_compress_ptr cinfo, int argc, char **argv,
+               int last_file_arg_seen, boolean for_real)
+/* Parse optional switches.
+ * Returns argv[] index of first file-name argument (== argc if none).
+ * Any file names with indexes <= last_file_arg_seen are ignored;
+ * they have presumably been processed in a previous iteration.
+ * (Pass 0 for last_file_arg_seen on the first or only iteration.)
+ * for_real is FALSE on the first (dummy) pass; we may skip any expensive
+ * processing.
+ */
+{
+  int argn;
+  char *arg;
+#ifdef C_LOSSLESS_SUPPORTED
+  int psv, pt = 0;
+#endif
+  boolean force_baseline;
+  boolean simple_progressive;
+  char *qualityarg = NULL;      /* saves -quality parm if any */
+  char *qtablefile = NULL;      /* saves -qtables filename if any */
+  char *qslotsarg = NULL;       /* saves -qslots parm if any */
+  char *samplearg = NULL;       /* saves -sample parm if any */
+  char *scansarg = NULL;        /* saves -scans parm if any */
+
+  /* Set up default JPEG parameters. */
+
+  force_baseline = FALSE;       /* by default, allow 16-bit quantizers */
+  simple_progressive = FALSE;
+  is_targa = FALSE;
+  icc_filename = NULL;
+  outfilename = NULL;
+  memdst = FALSE;
+  report = FALSE;
+  strict = FALSE;
+  cinfo->err->trace_level = 0;
+
+  /* Scan command line options, adjust parameters */
+
+  for (argn = 1; argn < argc; argn++) {
+    arg = argv[argn];
+    if (*arg != '-') {
+      /* Not a switch, must be a file name argument */
+      if (argn <= last_file_arg_seen) {
+        outfilename = NULL;     /* -outfile applies to just one input file */
+        continue;               /* ignore this name if previously processed */
+      }
+      break;                    /* else done parsing switches */
+    }
+    arg++;                      /* advance past switch marker character */
+
+    if (keymatch(arg, "arithmetic", 1)) {
+      /* Use arithmetic coding. */
+#ifdef C_ARITH_CODING_SUPPORTED
+      cinfo->arith_code = TRUE;
+#else
+      fprintf(stderr, "%s: sorry, arithmetic coding not supported\n",
+              progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "baseline", 1)) {
+      /* Force baseline-compatible output (8-bit quantizer values). */
+      force_baseline = TRUE;
+
+    } else if (keymatch(arg, "dct", 2)) {
+      /* Select DCT algorithm. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (keymatch(argv[argn], "int", 1)) {
+        cinfo->dct_method = JDCT_ISLOW;
+      } else if (keymatch(argv[argn], "fast", 2)) {
+        cinfo->dct_method = JDCT_IFAST;
+      } else if (keymatch(argv[argn], "float", 2)) {
+        cinfo->dct_method = JDCT_FLOAT;
+      } else
+        usage();
+
+    } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
+      /* Enable debug printouts. */
+      /* On first -d, print version identification */
+      static boolean printed_version = FALSE;
+
+      if (!printed_version) {
+        fprintf(stderr, "%s version %s (build %s)\n",
+                PACKAGE_NAME, VERSION, BUILD);
+        fprintf(stderr, JCOPYRIGHT1);
+        fprintf(stderr, JCOPYRIGHT2 "\n");
+        fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n",
+                JVERSION);
+        printed_version = TRUE;
+      }
+      cinfo->err->trace_level++;
+
+    } else if (keymatch(arg, "version", 4)) {
+      fprintf(stderr, "%s version %s (build %s)\n",
+              PACKAGE_NAME, VERSION, BUILD);
+      exit(EXIT_SUCCESS);
+
+    } else if (keymatch(arg, "grayscale", 2) ||
+               keymatch(arg, "greyscale", 2)) {
+      /* Force a monochrome JPEG file to be generated. */
+      jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
+
+    } else if (keymatch(arg, "rgb", 3)) {
+      /* Force an RGB JPEG file to be generated. */
+      jpeg_set_colorspace(cinfo, JCS_RGB);
+
+    } else if (keymatch(arg, "icc", 1)) {
+      /* Set ICC filename. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      icc_filename = argv[argn];
+
+    } else if (keymatch(arg, "lossless", 1)) {
+      /* Enable lossless mode. */
+#ifdef C_LOSSLESS_SUPPORTED
+      char ch = ',', *ptr;
+
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%d%c", &psv, &ch) < 1 || ch != ',')
+        usage();
+      ptr = argv[argn];
+      while (*ptr && *ptr++ != ','); /* advance to next segment of arg
+                                        string */
+      if (*ptr)
+        sscanf(ptr, "%d", &pt);
+      jpeg_enable_lossless(cinfo, psv, pt);
+#else
+      fprintf(stderr, "%s: sorry, lossless output was not compiled\n",
+              progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "maxmemory", 3)) {
+      /* Maximum memory in Kb (or Mb with 'm'). */
+      long lval;
+      char ch = 'x';
+
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
+        usage();
+      if (ch == 'm' || ch == 'M')
+        lval *= 1000L;
+      cinfo->mem->max_memory_to_use = lval * 1000L;
+
+    } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
+      /* Enable entropy parm optimization. */
+#ifdef ENTROPY_OPT_SUPPORTED
+      cinfo->optimize_coding = TRUE;
+#else
+      fprintf(stderr, "%s: sorry, entropy optimization was not compiled in\n",
+              progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "outfile", 4)) {
+      /* Set output file name. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      outfilename = argv[argn]; /* save it away for later use */
+
+    } else if (keymatch(arg, "precision", 3)) {
+      /* Set data precision. */
+      int val;
+
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%d", &val) != 1)
+        usage();
+#ifdef C_LOSSLESS_SUPPORTED
+      if (val != 8 && val != 12 && val != 16)
+#else
+      if (val != 8 && val != 12)
+#endif
+        usage();
+      cinfo->data_precision = val;
+
+    } else if (keymatch(arg, "progressive", 3)) {
+      /* Select simple progressive mode. */
+#ifdef C_PROGRESSIVE_SUPPORTED
+      simple_progressive = TRUE;
+      /* We must postpone execution until num_components is known. */
+#else
+      fprintf(stderr, "%s: sorry, progressive output was not compiled in\n",
+              progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "memdst", 2)) {
+      /* Use in-memory destination manager */
+      memdst = TRUE;
+
+    } else if (keymatch(arg, "quality", 1)) {
+      /* Quality ratings (quantization table scaling factors). */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      qualityarg = argv[argn];
+
+    } else if (keymatch(arg, "qslots", 2)) {
+      /* Quantization table slot numbers. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      qslotsarg = argv[argn];
+      /* Must delay setting qslots until after we have processed any
+       * colorspace-determining switches, since jpeg_set_colorspace sets
+       * default quant table numbers.
+       */
+
+    } else if (keymatch(arg, "qtables", 2)) {
+      /* Quantization tables fetched from file. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      qtablefile = argv[argn];
+      /* We postpone actually reading the file in case -quality comes later. */
+
+    } else if (keymatch(arg, "report", 3)) {
+      report = TRUE;
+
+    } else if (keymatch(arg, "restart", 1)) {
+      /* Restart interval in MCU rows (or in MCUs with 'b'). */
+      long lval;
+      char ch = 'x';
+
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
+        usage();
+      if (lval < 0 || lval > 65535L)
+        usage();
+      if (ch == 'b' || ch == 'B') {
+        cinfo->restart_interval = (unsigned int)lval;
+        cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */
+      } else {
+        cinfo->restart_in_rows = (int)lval;
+        /* restart_interval will be computed during startup */
+      }
+
+    } else if (keymatch(arg, "sample", 2)) {
+      /* Set sampling factors. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      samplearg = argv[argn];
+      /* Must delay setting sample factors until after we have processed any
+       * colorspace-determining switches, since jpeg_set_colorspace sets
+       * default sampling factors.
+       */
+
+    } else if (keymatch(arg, "scans", 4)) {
+      /* Set scan script. */
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      scansarg = argv[argn];
+      /* We must postpone reading the file in case -progressive appears. */
+#else
+      fprintf(stderr, "%s: sorry, multi-scan output was not compiled in\n",
+              progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "smooth", 2)) {
+      /* Set input smoothing factor. */
+      int val;
+
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%d", &val) != 1)
+        usage();
+      if (val < 0 || val > 100)
+        usage();
+      cinfo->smoothing_factor = val;
+
+    } else if (keymatch(arg, "strict", 2)) {
+      strict = TRUE;
+
+    } else if (keymatch(arg, "targa", 1)) {
+      /* Input file is Targa format. */
+      is_targa = TRUE;
+
+    } else {
+      usage();                  /* bogus switch */
+    }
+  }
+
+  /* Post-switch-scanning cleanup */
+
+  if (for_real) {
+
+    /* Set quantization tables for selected quality. */
+    /* Some or all may be overridden if -qtables is present. */
+    if (qualityarg != NULL)     /* process -quality if it was present */
+      if (!set_quality_ratings(cinfo, qualityarg, force_baseline))
+        usage();
+
+    if (qtablefile != NULL)     /* process -qtables if it was present */
+      if (!read_quant_tables(cinfo, qtablefile, force_baseline))
+        usage();
+
+    if (qslotsarg != NULL)      /* process -qslots if it was present */
+      if (!set_quant_slots(cinfo, qslotsarg))
+        usage();
+
+    if (samplearg != NULL)      /* process -sample if it was present */
+      if (!set_sample_factors(cinfo, samplearg))
+        usage();
+
+#ifdef C_PROGRESSIVE_SUPPORTED
+    if (simple_progressive)     /* process -progressive; -scans can override */
+      jpeg_simple_progression(cinfo);
+#endif
+
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+    if (scansarg != NULL)       /* process -scans if it was present */
+      if (!read_scan_script(cinfo, scansarg))
+        usage();
+#endif
+  }
+
+  return argn;                  /* return index of next arg (file name) */
+}
+
+
+METHODDEF(void)
+my_emit_message(j_common_ptr cinfo, int msg_level)
+{
+  if (msg_level < 0) {
+    /* Treat warning as fatal */
+    cinfo->err->error_exit(cinfo);
+  } else {
+    if (cinfo->err->trace_level >= msg_level)
+      cinfo->err->output_message(cinfo);
+  }
+}
+
+
+/*
+ * The main program.
+ */
+
+int
+main(int argc, char **argv)
+{
+  struct jpeg_compress_struct cinfo;
+#ifdef CJPEG_FUZZER
+  struct my_error_mgr myerr;
+  struct jpeg_error_mgr &jerr = myerr.pub;
+#else
+  struct jpeg_error_mgr jerr;
+#endif
+  struct cdjpeg_progress_mgr progress;
+  int file_index;
+  cjpeg_source_ptr src_mgr;
+  FILE *input_file = NULL;
+  FILE *icc_file;
+  JOCTET *icc_profile = NULL;
+  long icc_len = 0;
+  FILE *output_file = NULL;
+  unsigned char *outbuffer = NULL;
+  unsigned long outsize = 0;
+  JDIMENSION num_scanlines;
+
+  progname = argv[0];
+  if (progname == NULL || progname[0] == 0)
+    progname = "cjpeg";         /* in case C library doesn't provide it */
+
+  /* Initialize the JPEG compression object with default error handling. */
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_compress(&cinfo);
+  /* Add some application-specific error messages (from cderror.h) */
+  jerr.addon_message_table = cdjpeg_message_table;
+  jerr.first_addon_message = JMSG_FIRSTADDONCODE;
+  jerr.last_addon_message = JMSG_LASTADDONCODE;
+
+  /* Initialize JPEG parameters.
+   * Much of this may be overridden later.
+   * In particular, we don't yet know the input file's color space,
+   * but we need to provide some value for jpeg_set_defaults() to work.
+   */
+
+  cinfo.in_color_space = JCS_RGB; /* arbitrary guess */
+  jpeg_set_defaults(&cinfo);
+
+  /* Scan command line to find file names.
+   * It is convenient to use just one switch-parsing routine, but the switch
+   * values read here are ignored; we will rescan the switches after opening
+   * the input file.
+   */
+
+  file_index = parse_switches(&cinfo, argc, argv, 0, FALSE);
+
+  if (strict)
+    jerr.emit_message = my_emit_message;
+
+#ifdef TWO_FILE_COMMANDLINE
+  if (!memdst) {
+    /* Must have either -outfile switch or explicit output file name */
+    if (outfilename == NULL) {
+      if (file_index != argc - 2) {
+        fprintf(stderr, "%s: must name one input and one output file\n",
+                progname);
+        usage();
+      }
+      outfilename = argv[file_index + 1];
+    } else {
+      if (file_index != argc - 1) {
+        fprintf(stderr, "%s: must name one input and one output file\n",
+                progname);
+        usage();
+      }
+    }
+  }
+#else
+  /* Unix style: expect zero or one file name */
+  if (file_index < argc - 1) {
+    fprintf(stderr, "%s: only one input file\n", progname);
+    usage();
+  }
+#endif /* TWO_FILE_COMMANDLINE */
+
+  /* Open the input file. */
+  if (file_index < argc) {
+    if ((input_file = fopen(argv[file_index], READ_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open %s\n", progname, argv[file_index]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    /* default input file is stdin */
+    input_file = read_stdin();
+  }
+
+  /* Open the output file. */
+  if (outfilename != NULL) {
+    if ((output_file = fopen(outfilename, WRITE_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open %s\n", progname, outfilename);
+      exit(EXIT_FAILURE);
+    }
+  } else if (!memdst) {
+    /* default output file is stdout */
+    output_file = write_stdout();
+  }
+
+  if (icc_filename != NULL) {
+    if ((icc_file = fopen(icc_filename, READ_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open %s\n", progname, icc_filename);
+      exit(EXIT_FAILURE);
+    }
+    if (fseek(icc_file, 0, SEEK_END) < 0 ||
+        (icc_len = ftell(icc_file)) < 1 ||
+        fseek(icc_file, 0, SEEK_SET) < 0) {
+      fprintf(stderr, "%s: can't determine size of %s\n", progname,
+              icc_filename);
+      exit(EXIT_FAILURE);
+    }
+    if ((icc_profile = (JOCTET *)malloc(icc_len)) == NULL) {
+      fprintf(stderr, "%s: can't allocate memory for ICC profile\n", progname);
+      fclose(icc_file);
+      exit(EXIT_FAILURE);
+    }
+    if (fread(icc_profile, icc_len, 1, icc_file) < 1) {
+      fprintf(stderr, "%s: can't read ICC profile from %s\n", progname,
+              icc_filename);
+      free(icc_profile);
+      fclose(icc_file);
+      exit(EXIT_FAILURE);
+    }
+    fclose(icc_file);
+  }
+
+#ifdef CJPEG_FUZZER
+  jerr.error_exit = my_error_exit;
+  jerr.emit_message = my_emit_message_fuzzer;
+  if (setjmp(myerr.setjmp_buffer))
+    HANDLE_ERROR()
+#endif
+
+  if (report) {
+    start_progress_monitor((j_common_ptr)&cinfo, &progress);
+    progress.report = report;
+  }
+
+  /* Figure out the input file format, and set up to read it. */
+  src_mgr = select_file_type(&cinfo, input_file);
+  src_mgr->input_file = input_file;
+#ifdef CJPEG_FUZZER
+  src_mgr->max_pixels = 1048576;
+#endif
+
+  /* Read the input file header to obtain file size & colorspace. */
+  (*src_mgr->start_input) (&cinfo, src_mgr);
+
+  /* Now that we know input colorspace, fix colorspace-dependent defaults */
+  jpeg_default_colorspace(&cinfo);
+
+  /* Adjust default compression parameters by re-parsing the options */
+  file_index = parse_switches(&cinfo, argc, argv, 0, TRUE);
+
+  /* Specify data destination for compression */
+  if (memdst)
+    jpeg_mem_dest(&cinfo, &outbuffer, &outsize);
+  else
+    jpeg_stdio_dest(&cinfo, output_file);
+
+#ifdef CJPEG_FUZZER
+  if (setjmp(myerr.setjmp_buffer))
+    HANDLE_ERROR()
+#endif
+
+  /* Start compressor */
+  jpeg_start_compress(&cinfo, TRUE);
+
+  if (icc_profile != NULL)
+    jpeg_write_icc_profile(&cinfo, icc_profile, (unsigned int)icc_len);
+
+  /* Process data */
+  if (cinfo.data_precision == 16) {
+#ifdef C_LOSSLESS_SUPPORTED
+    while (cinfo.next_scanline < cinfo.image_height) {
+      num_scanlines = (*src_mgr->get_pixel_rows) (&cinfo, src_mgr);
+      (void)jpeg16_write_scanlines(&cinfo, src_mgr->buffer16, num_scanlines);
+    }
+#else
+    ERREXIT1(&cinfo, JERR_BAD_PRECISION, cinfo.data_precision);
+#endif
+  } else if (cinfo.data_precision == 12) {
+    while (cinfo.next_scanline < cinfo.image_height) {
+      num_scanlines = (*src_mgr->get_pixel_rows) (&cinfo, src_mgr);
+      (void)jpeg12_write_scanlines(&cinfo, src_mgr->buffer12, num_scanlines);
+    }
+  } else {
+    while (cinfo.next_scanline < cinfo.image_height) {
+      num_scanlines = (*src_mgr->get_pixel_rows) (&cinfo, src_mgr);
+      (void)jpeg_write_scanlines(&cinfo, src_mgr->buffer, num_scanlines);
+    }
+  }
+
+  /* Finish compression and release memory */
+  (*src_mgr->finish_input) (&cinfo, src_mgr);
+  jpeg_finish_compress(&cinfo);
+  jpeg_destroy_compress(&cinfo);
+
+  /* Close files, if we opened them */
+  if (input_file != stdin)
+    fclose(input_file);
+  if (output_file != stdout && output_file != NULL)
+    fclose(output_file);
+
+  if (report)
+    end_progress_monitor((j_common_ptr)&cinfo);
+
+  if (memdst) {
+#ifndef CJPEG_FUZZER
+    fprintf(stderr, "Compressed size:  %lu bytes\n", outsize);
+#endif
+    free(outbuffer);
+  }
+
+  free(icc_profile);
+
+  /* All done. */
+  return (jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
+}
diff --git a/3rdparty/libjpeg-turbo/src/cmyk.h b/3rdparty/libjpeg-turbo/src/cmyk.h
new file mode 100644
index 000000000000..23891249cfd1
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/cmyk.h
@@ -0,0 +1,61 @@
+/*
+ * cmyk.h
+ *
+ * Copyright (C) 2017-2018, 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains convenience functions for performing quick & dirty
+ * CMYK<->RGB conversion.  This algorithm is suitable for testing purposes
+ * only.  Properly converting between CMYK and RGB requires a color management
+ * system.
+ */
+
+#ifndef CMYK_H
+#define CMYK_H
+
+#include <jinclude.h>
+#define JPEG_INTERNALS
+#include <jpeglib.h>
+#include "jsamplecomp.h"
+
+
+/* Fully reversible */
+
+INLINE
+LOCAL(void)
+rgb_to_cmyk(_JSAMPLE r, _JSAMPLE g, _JSAMPLE b,
+            _JSAMPLE *c, _JSAMPLE *m, _JSAMPLE *y, _JSAMPLE *k)
+{
+  double ctmp = 1.0 - ((double)r / (double)_MAXJSAMPLE);
+  double mtmp = 1.0 - ((double)g / (double)_MAXJSAMPLE);
+  double ytmp = 1.0 - ((double)b / (double)_MAXJSAMPLE);
+  double ktmp = MIN(MIN(ctmp, mtmp), ytmp);
+
+  if (ktmp == 1.0) ctmp = mtmp = ytmp = 0.0;
+  else {
+    ctmp = (ctmp - ktmp) / (1.0 - ktmp);
+    mtmp = (mtmp - ktmp) / (1.0 - ktmp);
+    ytmp = (ytmp - ktmp) / (1.0 - ktmp);
+  }
+  *c = (_JSAMPLE)((double)_MAXJSAMPLE - ctmp * (double)_MAXJSAMPLE + 0.5);
+  *m = (_JSAMPLE)((double)_MAXJSAMPLE - mtmp * (double)_MAXJSAMPLE + 0.5);
+  *y = (_JSAMPLE)((double)_MAXJSAMPLE - ytmp * (double)_MAXJSAMPLE + 0.5);
+  *k = (_JSAMPLE)((double)_MAXJSAMPLE - ktmp * (double)_MAXJSAMPLE + 0.5);
+}
+
+
+/* Fully reversible only for C/M/Y/K values generated with rgb_to_cmyk() */
+
+INLINE
+LOCAL(void)
+cmyk_to_rgb(_JSAMPLE c, _JSAMPLE m, _JSAMPLE y, _JSAMPLE k,
+            _JSAMPLE *r, _JSAMPLE *g, _JSAMPLE *b)
+{
+  *r = (_JSAMPLE)((double)c * (double)k / (double)_MAXJSAMPLE + 0.5);
+  *g = (_JSAMPLE)((double)m * (double)k / (double)_MAXJSAMPLE + 0.5);
+  *b = (_JSAMPLE)((double)y * (double)k / (double)_MAXJSAMPLE + 0.5);
+}
+
+
+#endif /* CMYK_H */
diff --git a/3rdparty/libjpeg-turbo/src/djpeg.c b/3rdparty/libjpeg-turbo/src/djpeg.c
new file mode 100644
index 000000000000..1baedddeff89
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/djpeg.c
@@ -0,0 +1,932 @@
+/*
+ * djpeg.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2013-2019 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010-2011, 2013-2017, 2019-2020, 2022-2024, D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains a command-line user interface for the JPEG decompressor.
+ * It should work on any system with Unix- or MS-DOS-style command lines.
+ *
+ * Two different command line styles are permitted, depending on the
+ * compile-time switch TWO_FILE_COMMANDLINE:
+ *      djpeg [options]  inputfile outputfile
+ *      djpeg [options]  [inputfile]
+ * In the second style, output is always to standard output, which you'd
+ * normally redirect to a file or pipe to some other program.  Input is
+ * either from a named file or from standard input (typically redirected).
+ * The second style is convenient on Unix but is unhelpful on systems that
+ * don't support pipes.  Also, you MUST use the first style if your system
+ * doesn't do binary I/O to stdin/stdout.
+ * To simplify script writing, the "-outfile" switch is provided.  The syntax
+ *      djpeg [options]  -outfile outputfile  inputfile
+ * works regardless of which command line style is used.
+ */
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include "jversion.h"           /* for version message */
+#include "jconfigint.h"
+
+#include <ctype.h>              /* to declare isprint() */
+
+
+/* Create the add-on message string table. */
+
+#define JMESSAGE(code, string)  string,
+
+static const char * const cdjpeg_message_table[] = {
+#include "cderror.h"
+  NULL
+};
+
+
+/*
+ * This list defines the known output image formats
+ * (not all of which need be supported by a given version).
+ * You can change the default output format by defining DEFAULT_FMT;
+ * indeed, you had better do so if you undefine PPM_SUPPORTED.
+ */
+
+typedef enum {
+  FMT_BMP,                      /* BMP format (Windows flavor) */
+  FMT_GIF,                      /* GIF format (LZW-compressed) */
+  FMT_GIF0,                     /* GIF format (uncompressed) */
+  FMT_OS2,                      /* BMP format (OS/2 flavor) */
+  FMT_PPM,                      /* PPM/PGM (PBMPLUS formats) */
+  FMT_TARGA,                    /* Targa format */
+  FMT_TIFF                      /* TIFF format */
+} IMAGE_FORMATS;
+
+#ifndef DEFAULT_FMT             /* so can override from CFLAGS in Makefile */
+#define DEFAULT_FMT     FMT_PPM
+#endif
+
+static IMAGE_FORMATS requested_fmt;
+
+
+/*
+ * Argument-parsing code.
+ * The switch parser is designed to be useful with DOS-style command line
+ * syntax, ie, intermixed switches and file names, where only the switches
+ * to the left of a given file name affect processing of that file.
+ * The main program in this file doesn't actually use this capability...
+ */
+
+
+static const char *progname;    /* program name for error messages */
+static char *icc_filename;      /* for -icc switch */
+static JDIMENSION max_scans;    /* for -maxscans switch */
+static char *outfilename;       /* for -outfile switch */
+static boolean memsrc;          /* for -memsrc switch */
+static boolean report;          /* for -report switch */
+static boolean skip, crop;
+static JDIMENSION skip_start, skip_end;
+static JDIMENSION crop_x, crop_y, crop_width, crop_height;
+static boolean strict;          /* for -strict switch */
+#define INPUT_BUF_SIZE  4096
+
+
+LOCAL(void)
+usage(void)
+/* complain about bad command line */
+{
+  fprintf(stderr, "usage: %s [switches] ", progname);
+#ifdef TWO_FILE_COMMANDLINE
+  fprintf(stderr, "inputfile outputfile\n");
+#else
+  fprintf(stderr, "[inputfile]\n");
+#endif
+
+  fprintf(stderr, "Switches (names may be abbreviated):\n");
+  fprintf(stderr, "  -colors N      Reduce image to no more than N colors\n");
+  fprintf(stderr, "  -fast          Fast, low-quality processing\n");
+  fprintf(stderr, "  -grayscale     Force grayscale output\n");
+  fprintf(stderr, "  -rgb           Force RGB output\n");
+  fprintf(stderr, "  -rgb565        Force RGB565 output\n");
+#ifdef IDCT_SCALING_SUPPORTED
+  fprintf(stderr, "  -scale M/N     Scale output image by fraction M/N, eg, 1/8\n");
+#endif
+#ifdef BMP_SUPPORTED
+  fprintf(stderr, "  -bmp           Select BMP output format (Windows style)%s\n",
+          (DEFAULT_FMT == FMT_BMP ? " (default)" : ""));
+#endif
+#ifdef GIF_SUPPORTED
+  fprintf(stderr, "  -gif           Select GIF output format (LZW-compressed)%s\n",
+          (DEFAULT_FMT == FMT_GIF ? " (default)" : ""));
+  fprintf(stderr, "  -gif0          Select GIF output format (uncompressed)%s\n",
+          (DEFAULT_FMT == FMT_GIF0 ? " (default)" : ""));
+#endif
+#ifdef BMP_SUPPORTED
+  fprintf(stderr, "  -os2           Select BMP output format (OS/2 style)%s\n",
+          (DEFAULT_FMT == FMT_OS2 ? " (default)" : ""));
+#endif
+#ifdef PPM_SUPPORTED
+  fprintf(stderr, "  -pnm           Select PBMPLUS (PPM/PGM) output format%s\n",
+          (DEFAULT_FMT == FMT_PPM ? " (default)" : ""));
+#endif
+#ifdef TARGA_SUPPORTED
+  fprintf(stderr, "  -targa         Select Targa output format%s\n",
+          (DEFAULT_FMT == FMT_TARGA ? " (default)" : ""));
+#endif
+  fprintf(stderr, "Switches for advanced users:\n");
+#ifdef DCT_ISLOW_SUPPORTED
+  fprintf(stderr, "  -dct int       Use accurate integer DCT method%s\n",
+          (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  fprintf(stderr, "  -dct fast      Use less accurate integer DCT method [legacy feature]%s\n",
+          (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  fprintf(stderr, "  -dct float     Use floating-point DCT method [legacy feature]%s\n",
+          (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
+#endif
+  fprintf(stderr, "  -dither fs     Use F-S dithering (default)\n");
+  fprintf(stderr, "  -dither none   Don't use dithering in quantization\n");
+  fprintf(stderr, "  -dither ordered  Use ordered dither (medium speed, quality)\n");
+  fprintf(stderr, "  -icc FILE      Extract ICC profile to FILE\n");
+#ifdef QUANT_2PASS_SUPPORTED
+  fprintf(stderr, "  -map FILE      Map to colors used in named image file\n");
+#endif
+  fprintf(stderr, "  -nosmooth      Don't use high-quality upsampling\n");
+#ifdef QUANT_1PASS_SUPPORTED
+  fprintf(stderr, "  -onepass       Use 1-pass quantization (fast, low quality)\n");
+#endif
+  fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
+  fprintf(stderr, "  -maxscans N    Maximum number of scans to allow in input file\n");
+  fprintf(stderr, "  -outfile name  Specify name for output file\n");
+  fprintf(stderr, "  -memsrc        Load input file into memory before decompressing\n");
+  fprintf(stderr, "  -report        Report decompression progress\n");
+  fprintf(stderr, "  -skip Y0,Y1    Decompress all rows except those between Y0 and Y1 (inclusive)\n");
+  fprintf(stderr, "  -crop WxH+X+Y  Decompress only a rectangular subregion of the image\n");
+  fprintf(stderr, "                 [requires PBMPLUS (PPM/PGM), GIF, or Targa output format]\n");
+  fprintf(stderr, "  -strict        Treat all warnings as fatal\n");
+  fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  fprintf(stderr, "  -version       Print version information and exit\n");
+  exit(EXIT_FAILURE);
+}
+
+
+LOCAL(int)
+parse_switches(j_decompress_ptr cinfo, int argc, char **argv,
+               int last_file_arg_seen, boolean for_real)
+/* Parse optional switches.
+ * Returns argv[] index of first file-name argument (== argc if none).
+ * Any file names with indexes <= last_file_arg_seen are ignored;
+ * they have presumably been processed in a previous iteration.
+ * (Pass 0 for last_file_arg_seen on the first or only iteration.)
+ * for_real is FALSE on the first (dummy) pass; we may skip any expensive
+ * processing.
+ */
+{
+  int argn;
+  char *arg;
+
+  /* Set up default JPEG parameters. */
+  requested_fmt = DEFAULT_FMT;  /* set default output file format */
+  icc_filename = NULL;
+  max_scans = 0;
+  outfilename = NULL;
+  memsrc = FALSE;
+  report = FALSE;
+  skip = FALSE;
+  crop = FALSE;
+  strict = FALSE;
+  cinfo->err->trace_level = 0;
+
+  /* Scan command line options, adjust parameters */
+
+  for (argn = 1; argn < argc; argn++) {
+    arg = argv[argn];
+    if (*arg != '-') {
+      /* Not a switch, must be a file name argument */
+      if (argn <= last_file_arg_seen) {
+        outfilename = NULL;     /* -outfile applies to just one input file */
+        continue;               /* ignore this name if previously processed */
+      }
+      break;                    /* else done parsing switches */
+    }
+    arg++;                      /* advance past switch marker character */
+
+    if (keymatch(arg, "bmp", 1)) {
+      /* BMP output format (Windows flavor). */
+      requested_fmt = FMT_BMP;
+
+    } else if (keymatch(arg, "colors", 1) || keymatch(arg, "colours", 1) ||
+               keymatch(arg, "quantize", 1) || keymatch(arg, "quantise", 1)) {
+      /* Do color quantization. */
+      int val;
+
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%d", &val) != 1)
+        usage();
+      cinfo->desired_number_of_colors = val;
+      cinfo->quantize_colors = TRUE;
+
+    } else if (keymatch(arg, "dct", 2)) {
+      /* Select IDCT algorithm. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (keymatch(argv[argn], "int", 1)) {
+        cinfo->dct_method = JDCT_ISLOW;
+      } else if (keymatch(argv[argn], "fast", 2)) {
+        cinfo->dct_method = JDCT_IFAST;
+      } else if (keymatch(argv[argn], "float", 2)) {
+        cinfo->dct_method = JDCT_FLOAT;
+      } else
+        usage();
+
+    } else if (keymatch(arg, "dither", 2)) {
+      /* Select dithering algorithm. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (keymatch(argv[argn], "fs", 2)) {
+        cinfo->dither_mode = JDITHER_FS;
+      } else if (keymatch(argv[argn], "none", 2)) {
+        cinfo->dither_mode = JDITHER_NONE;
+      } else if (keymatch(argv[argn], "ordered", 2)) {
+        cinfo->dither_mode = JDITHER_ORDERED;
+      } else
+        usage();
+
+    } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
+      /* Enable debug printouts. */
+      /* On first -d, print version identification */
+      static boolean printed_version = FALSE;
+
+      if (!printed_version) {
+        fprintf(stderr, "%s version %s (build %s)\n",
+                PACKAGE_NAME, VERSION, BUILD);
+        fprintf(stderr, JCOPYRIGHT1);
+        fprintf(stderr, JCOPYRIGHT2 "\n");
+        fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n",
+                JVERSION);
+        printed_version = TRUE;
+      }
+      cinfo->err->trace_level++;
+
+    } else if (keymatch(arg, "version", 4)) {
+      fprintf(stderr, "%s version %s (build %s)\n",
+              PACKAGE_NAME, VERSION, BUILD);
+      exit(EXIT_SUCCESS);
+
+    } else if (keymatch(arg, "fast", 1)) {
+      /* Select recommended processing options for quick-and-dirty output. */
+      cinfo->two_pass_quantize = FALSE;
+      cinfo->dither_mode = JDITHER_ORDERED;
+      if (!cinfo->quantize_colors) /* don't override an earlier -colors */
+        cinfo->desired_number_of_colors = 216;
+      cinfo->dct_method = JDCT_FASTEST;
+      cinfo->do_fancy_upsampling = FALSE;
+
+    } else if (keymatch(arg, "gif", 1)) {
+      /* GIF output format (LZW-compressed). */
+      requested_fmt = FMT_GIF;
+
+    } else if (keymatch(arg, "gif0", 4)) {
+      /* GIF output format (uncompressed). */
+      requested_fmt = FMT_GIF0;
+
+    } else if (keymatch(arg, "grayscale", 2) ||
+               keymatch(arg, "greyscale", 2)) {
+      /* Force monochrome output. */
+      cinfo->out_color_space = JCS_GRAYSCALE;
+
+    } else if (keymatch(arg, "rgb", 2)) {
+      /* Force RGB output. */
+      cinfo->out_color_space = JCS_RGB;
+
+    } else if (keymatch(arg, "rgb565", 2)) {
+      /* Force RGB565 output. */
+      cinfo->out_color_space = JCS_RGB565;
+
+    } else if (keymatch(arg, "icc", 1)) {
+      /* Set ICC filename. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      icc_filename = argv[argn];
+#ifdef SAVE_MARKERS_SUPPORTED
+      jpeg_save_markers(cinfo, JPEG_APP0 + 2, 0xFFFF);
+#endif
+
+    } else if (keymatch(arg, "map", 3)) {
+      /* Quantize to a color map taken from an input file. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (for_real) {           /* too expensive to do twice! */
+#ifdef QUANT_2PASS_SUPPORTED    /* otherwise can't quantize to supplied map */
+        FILE *mapfile;
+
+        if ((mapfile = fopen(argv[argn], READ_BINARY)) == NULL) {
+          fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]);
+          exit(EXIT_FAILURE);
+        }
+        if (cinfo->data_precision == 12)
+          read_color_map_12(cinfo, mapfile);
+        else
+          read_color_map(cinfo, mapfile);
+        fclose(mapfile);
+        cinfo->quantize_colors = TRUE;
+#else
+        ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+      }
+
+    } else if (keymatch(arg, "maxmemory", 3)) {
+      /* Maximum memory in Kb (or Mb with 'm'). */
+      long lval;
+      char ch = 'x';
+
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
+        usage();
+      if (ch == 'm' || ch == 'M')
+        lval *= 1000L;
+      cinfo->mem->max_memory_to_use = lval * 1000L;
+
+    } else if (keymatch(arg, "maxscans", 4)) {
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%u", &max_scans) != 1)
+        usage();
+
+    } else if (keymatch(arg, "nosmooth", 3)) {
+      /* Suppress fancy upsampling */
+      cinfo->do_fancy_upsampling = FALSE;
+
+    } else if (keymatch(arg, "onepass", 3)) {
+      /* Use fast one-pass quantization. */
+      cinfo->two_pass_quantize = FALSE;
+
+    } else if (keymatch(arg, "os2", 3)) {
+      /* BMP output format (OS/2 flavor). */
+      requested_fmt = FMT_OS2;
+
+    } else if (keymatch(arg, "outfile", 4)) {
+      /* Set output file name. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      outfilename = argv[argn]; /* save it away for later use */
+
+    } else if (keymatch(arg, "memsrc", 2)) {
+      /* Use in-memory source manager */
+      memsrc = TRUE;
+
+    } else if (keymatch(arg, "pnm", 1) || keymatch(arg, "ppm", 1)) {
+      /* PPM/PGM output format. */
+      requested_fmt = FMT_PPM;
+
+    } else if (keymatch(arg, "report", 2)) {
+      report = TRUE;
+
+    } else if (keymatch(arg, "scale", 2)) {
+      /* Scale the output image by a fraction M/N. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%u/%u",
+                 &cinfo->scale_num, &cinfo->scale_denom) != 2)
+        usage();
+
+    } else if (keymatch(arg, "skip", 2)) {
+      if (++argn >= argc)
+        usage();
+      if (sscanf(argv[argn], "%u,%u", &skip_start, &skip_end) != 2 ||
+          skip_start > skip_end)
+        usage();
+      skip = TRUE;
+
+    } else if (keymatch(arg, "crop", 2)) {
+      char c;
+      if (++argn >= argc)
+        usage();
+      if (sscanf(argv[argn], "%u%c%u+%u+%u", &crop_width, &c, &crop_height,
+                 &crop_x, &crop_y) != 5 ||
+          (c != 'X' && c != 'x') || crop_width < 1 || crop_height < 1)
+        usage();
+      crop = TRUE;
+
+    } else if (keymatch(arg, "strict", 2)) {
+      strict = TRUE;
+
+    } else if (keymatch(arg, "targa", 1)) {
+      /* Targa output format. */
+      requested_fmt = FMT_TARGA;
+
+    } else {
+      usage();                  /* bogus switch */
+    }
+  }
+
+  return argn;                  /* return index of next arg (file name) */
+}
+
+
+/*
+ * Marker processor for COM and interesting APPn markers.
+ * This replaces the library's built-in processor, which just skips the marker.
+ * We want to print out the marker as text, to the extent possible.
+ * Note this code relies on a non-suspending data source.
+ */
+
+LOCAL(unsigned int)
+jpeg_getc(j_decompress_ptr cinfo)
+/* Read next byte */
+{
+  struct jpeg_source_mgr *datasrc = cinfo->src;
+
+  if (datasrc->bytes_in_buffer == 0) {
+    if (!(*datasrc->fill_input_buffer) (cinfo))
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);
+  }
+  datasrc->bytes_in_buffer--;
+  return *datasrc->next_input_byte++;
+}
+
+
+METHODDEF(boolean)
+print_text_marker(j_decompress_ptr cinfo)
+{
+  boolean traceit = (cinfo->err->trace_level >= 1);
+  long length;
+  unsigned int ch;
+  unsigned int lastch = 0;
+
+  length = jpeg_getc(cinfo) << 8;
+  length += jpeg_getc(cinfo);
+  length -= 2;                  /* discount the length word itself */
+
+  if (traceit) {
+    if (cinfo->unread_marker == JPEG_COM)
+      fprintf(stderr, "Comment, length %ld:\n", (long)length);
+    else                        /* assume it is an APPn otherwise */
+      fprintf(stderr, "APP%d, length %ld:\n",
+              cinfo->unread_marker - JPEG_APP0, (long)length);
+  }
+
+  while (--length >= 0) {
+    ch = jpeg_getc(cinfo);
+    if (traceit) {
+      /* Emit the character in a readable form.
+       * Nonprintables are converted to \nnn form,
+       * while \ is converted to \\.
+       * Newlines in CR, CR/LF, or LF form will be printed as one newline.
+       */
+      if (ch == '\r') {
+        fprintf(stderr, "\n");
+      } else if (ch == '\n') {
+        if (lastch != '\r')
+          fprintf(stderr, "\n");
+      } else if (ch == '\\') {
+        fprintf(stderr, "\\\\");
+      } else if (isprint(ch)) {
+        putc(ch, stderr);
+      } else {
+        fprintf(stderr, "\\%03o", ch);
+      }
+      lastch = ch;
+    }
+  }
+
+  if (traceit)
+    fprintf(stderr, "\n");
+
+  return TRUE;
+}
+
+
+METHODDEF(void)
+my_emit_message(j_common_ptr cinfo, int msg_level)
+{
+  if (msg_level < 0) {
+    /* Treat warning as fatal */
+    cinfo->err->error_exit(cinfo);
+  } else {
+    if (cinfo->err->trace_level >= msg_level)
+      cinfo->err->output_message(cinfo);
+  }
+}
+
+
+/*
+ * The main program.
+ */
+
+int
+main(int argc, char **argv)
+{
+  struct jpeg_decompress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  struct cdjpeg_progress_mgr progress;
+  int file_index;
+  djpeg_dest_ptr dest_mgr = NULL;
+  FILE *input_file;
+  FILE *output_file;
+  unsigned char *inbuffer = NULL;
+  unsigned long insize = 0;
+  JDIMENSION num_scanlines;
+
+  progname = argv[0];
+  if (progname == NULL || progname[0] == 0)
+    progname = "djpeg";         /* in case C library doesn't provide it */
+
+  /* Initialize the JPEG decompression object with default error handling. */
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_decompress(&cinfo);
+  /* Add some application-specific error messages (from cderror.h) */
+  jerr.addon_message_table = cdjpeg_message_table;
+  jerr.first_addon_message = JMSG_FIRSTADDONCODE;
+  jerr.last_addon_message = JMSG_LASTADDONCODE;
+
+  /* Insert custom marker processor for COM and APP12.
+   * APP12 is used by some digital camera makers for textual info,
+   * so we provide the ability to display it as text.
+   * If you like, additional APPn marker types can be selected for display,
+   * but don't try to override APP0 or APP14 this way (see libjpeg.txt).
+   */
+  jpeg_set_marker_processor(&cinfo, JPEG_COM, print_text_marker);
+  jpeg_set_marker_processor(&cinfo, JPEG_APP0 + 12, print_text_marker);
+
+  /* Scan command line to find file names. */
+  /* It is convenient to use just one switch-parsing routine, but the switch
+   * values read here are ignored; we will rescan the switches after opening
+   * the input file.
+   * (Exception: tracing level set here controls verbosity for COM markers
+   * found during jpeg_read_header...)
+   */
+
+  file_index = parse_switches(&cinfo, argc, argv, 0, FALSE);
+
+  if (strict)
+    jerr.emit_message = my_emit_message;
+
+#ifdef TWO_FILE_COMMANDLINE
+  /* Must have either -outfile switch or explicit output file name */
+  if (outfilename == NULL) {
+    if (file_index != argc - 2) {
+      fprintf(stderr, "%s: must name one input and one output file\n",
+              progname);
+      usage();
+    }
+    outfilename = argv[file_index + 1];
+  } else {
+    if (file_index != argc - 1) {
+      fprintf(stderr, "%s: must name one input and one output file\n",
+              progname);
+      usage();
+    }
+  }
+#else
+  /* Unix style: expect zero or one file name */
+  if (file_index < argc - 1) {
+    fprintf(stderr, "%s: only one input file\n", progname);
+    usage();
+  }
+#endif /* TWO_FILE_COMMANDLINE */
+
+  /* Open the input file. */
+  if (file_index < argc) {
+    if ((input_file = fopen(argv[file_index], READ_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open %s\n", progname, argv[file_index]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    /* default input file is stdin */
+    input_file = read_stdin();
+  }
+
+  /* Open the output file. */
+  if (outfilename != NULL) {
+    if ((output_file = fopen(outfilename, WRITE_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open %s\n", progname, outfilename);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    /* default output file is stdout */
+    output_file = write_stdout();
+  }
+
+  if (report || max_scans != 0) {
+    start_progress_monitor((j_common_ptr)&cinfo, &progress);
+    progress.report = report;
+    progress.max_scans = max_scans;
+  }
+
+  /* Specify data source for decompression */
+  if (memsrc) {
+    size_t nbytes;
+    do {
+      inbuffer = (unsigned char *)realloc(inbuffer, insize + INPUT_BUF_SIZE);
+      if (inbuffer == NULL) {
+        fprintf(stderr, "%s: memory allocation failure\n", progname);
+        exit(EXIT_FAILURE);
+      }
+      nbytes = fread(&inbuffer[insize], 1, INPUT_BUF_SIZE, input_file);
+      if (nbytes < INPUT_BUF_SIZE && ferror(input_file)) {
+        if (file_index < argc)
+          fprintf(stderr, "%s: can't read from %s\n", progname,
+                  argv[file_index]);
+        else
+          fprintf(stderr, "%s: can't read from stdin\n", progname);
+      }
+      insize += (unsigned long)nbytes;
+    } while (nbytes == INPUT_BUF_SIZE);
+    fprintf(stderr, "Compressed size:  %lu bytes\n", insize);
+    jpeg_mem_src(&cinfo, inbuffer, insize);
+  } else
+    jpeg_stdio_src(&cinfo, input_file);
+
+  /* Read file header, set default decompression parameters */
+  (void)jpeg_read_header(&cinfo, TRUE);
+
+  /* Adjust default decompression parameters by re-parsing the options */
+  file_index = parse_switches(&cinfo, argc, argv, 0, TRUE);
+
+  /* Initialize the output module now to let it override any crucial
+   * option settings (for instance, GIF wants to force color quantization).
+   */
+  switch (requested_fmt) {
+#ifdef BMP_SUPPORTED
+  case FMT_BMP:
+    dest_mgr = jinit_write_bmp(&cinfo, FALSE, TRUE);
+    break;
+  case FMT_OS2:
+    dest_mgr = jinit_write_bmp(&cinfo, TRUE, TRUE);
+    break;
+#endif
+#ifdef GIF_SUPPORTED
+  case FMT_GIF:
+    if (cinfo.data_precision == 16)
+      ERREXIT1(&cinfo, JERR_BAD_PRECISION, cinfo.data_precision);
+    else if (cinfo.data_precision == 12)
+      dest_mgr = j12init_write_gif(&cinfo, TRUE);
+    else
+      dest_mgr = jinit_write_gif(&cinfo, TRUE);
+    break;
+  case FMT_GIF0:
+    dest_mgr = jinit_write_gif(&cinfo, FALSE);
+    break;
+#endif
+#ifdef PPM_SUPPORTED
+  case FMT_PPM:
+    if (cinfo.data_precision == 16)
+#ifdef D_LOSSLESS_SUPPORTED
+      dest_mgr = j16init_write_ppm(&cinfo);
+#else
+      ERREXIT1(&cinfo, JERR_BAD_PRECISION, cinfo.data_precision);
+#endif
+    else if (cinfo.data_precision == 12)
+      dest_mgr = j12init_write_ppm(&cinfo);
+    else
+      dest_mgr = jinit_write_ppm(&cinfo);
+    break;
+#endif
+#ifdef TARGA_SUPPORTED
+  case FMT_TARGA:
+    dest_mgr = jinit_write_targa(&cinfo);
+    break;
+#endif
+  default:
+    ERREXIT(&cinfo, JERR_UNSUPPORTED_FORMAT);
+    break;
+  }
+  dest_mgr->output_file = output_file;
+
+  /* Start decompressor */
+  (void)jpeg_start_decompress(&cinfo);
+
+  /* Skip rows */
+  if (skip) {
+    JDIMENSION tmp;
+
+    /* Check for valid skip_end.  We cannot check this value until after
+     * jpeg_start_decompress() is called.  Note that we have already verified
+     * that skip_start <= skip_end.
+     */
+    if (skip_end > cinfo.output_height - 1) {
+      fprintf(stderr, "%s: skip region exceeds image height %u\n", progname,
+              cinfo.output_height);
+      exit(EXIT_FAILURE);
+    }
+
+    /* Write output file header.  This is a hack to ensure that the destination
+     * manager creates an output image of the proper size.
+     */
+    tmp = cinfo.output_height;
+    cinfo.output_height -= (skip_end - skip_start + 1);
+    (*dest_mgr->start_output) (&cinfo, dest_mgr);
+    cinfo.output_height = tmp;
+
+    if (cinfo.data_precision == 16)
+      ERREXIT(&cinfo, JERR_NOTIMPL);
+    else if (cinfo.data_precision == 12) {
+      /* Process data */
+      while (cinfo.output_scanline < skip_start) {
+        num_scanlines = jpeg12_read_scanlines(&cinfo, dest_mgr->buffer12,
+                                              dest_mgr->buffer_height);
+        (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+      }
+      if ((tmp = jpeg12_skip_scanlines(&cinfo, skip_end - skip_start + 1)) !=
+          skip_end - skip_start + 1) {
+        fprintf(stderr, "%s: jpeg12_skip_scanlines() returned %u rather than %u\n",
+                progname, tmp, skip_end - skip_start + 1);
+        exit(EXIT_FAILURE);
+      }
+      while (cinfo.output_scanline < cinfo.output_height) {
+        num_scanlines = jpeg12_read_scanlines(&cinfo, dest_mgr->buffer12,
+                                              dest_mgr->buffer_height);
+        (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+      }
+    } else {
+      /* Process data */
+      while (cinfo.output_scanline < skip_start) {
+        num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+                                            dest_mgr->buffer_height);
+        (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+      }
+      if ((tmp = jpeg_skip_scanlines(&cinfo, skip_end - skip_start + 1)) !=
+          skip_end - skip_start + 1) {
+        fprintf(stderr, "%s: jpeg_skip_scanlines() returned %u rather than %u\n",
+                progname, tmp, skip_end - skip_start + 1);
+        exit(EXIT_FAILURE);
+      }
+      while (cinfo.output_scanline < cinfo.output_height) {
+        num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+                                            dest_mgr->buffer_height);
+        (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+      }
+    }
+
+  /* Decompress a subregion */
+  } else if (crop) {
+    JDIMENSION tmp;
+
+    /* Check for valid crop dimensions.  We cannot check these values until
+     * after jpeg_start_decompress() is called.
+     */
+    if (crop_x + crop_width > cinfo.output_width ||
+        crop_y + crop_height > cinfo.output_height) {
+      fprintf(stderr, "%s: crop dimensions exceed image dimensions %u x %u\n",
+              progname, cinfo.output_width, cinfo.output_height);
+      exit(EXIT_FAILURE);
+    }
+
+    if (cinfo.data_precision == 16)
+      ERREXIT(&cinfo, JERR_NOTIMPL);
+    else if (cinfo.data_precision == 12)
+      jpeg12_crop_scanline(&cinfo, &crop_x, &crop_width);
+    else
+      jpeg_crop_scanline(&cinfo, &crop_x, &crop_width);
+    if (dest_mgr->calc_buffer_dimensions)
+      (*dest_mgr->calc_buffer_dimensions) (&cinfo, dest_mgr);
+    else
+      ERREXIT(&cinfo, JERR_UNSUPPORTED_FORMAT);
+
+    /* Write output file header.  This is a hack to ensure that the destination
+     * manager creates an output image of the proper size.
+     */
+    tmp = cinfo.output_height;
+    cinfo.output_height = crop_height;
+    (*dest_mgr->start_output) (&cinfo, dest_mgr);
+    cinfo.output_height = tmp;
+
+    if (cinfo.data_precision == 16)
+      ERREXIT(&cinfo, JERR_NOTIMPL);
+    else if (cinfo.data_precision == 12) {
+      /* Process data */
+      if ((tmp = jpeg12_skip_scanlines(&cinfo, crop_y)) != crop_y) {
+        fprintf(stderr, "%s: jpeg12_skip_scanlines() returned %u rather than %u\n",
+                progname, tmp, crop_y);
+        exit(EXIT_FAILURE);
+      }
+      while (cinfo.output_scanline < crop_y + crop_height) {
+        num_scanlines = jpeg12_read_scanlines(&cinfo, dest_mgr->buffer12,
+                                              dest_mgr->buffer_height);
+        (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+      }
+      if ((tmp =
+           jpeg12_skip_scanlines(&cinfo, cinfo.output_height - crop_y -
+                                         crop_height)) !=
+          cinfo.output_height - crop_y - crop_height) {
+        fprintf(stderr, "%s: jpeg12_skip_scanlines() returned %u rather than %u\n",
+                progname, tmp, cinfo.output_height - crop_y - crop_height);
+        exit(EXIT_FAILURE);
+      }
+    } else {
+      /* Process data */
+      if ((tmp = jpeg_skip_scanlines(&cinfo, crop_y)) != crop_y) {
+        fprintf(stderr, "%s: jpeg_skip_scanlines() returned %u rather than %u\n",
+                progname, tmp, crop_y);
+        exit(EXIT_FAILURE);
+      }
+      while (cinfo.output_scanline < crop_y + crop_height) {
+        num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+                                            dest_mgr->buffer_height);
+        (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+      }
+      if ((tmp =
+           jpeg_skip_scanlines(&cinfo,
+                               cinfo.output_height - crop_y - crop_height)) !=
+          cinfo.output_height - crop_y - crop_height) {
+        fprintf(stderr, "%s: jpeg_skip_scanlines() returned %u rather than %u\n",
+                progname, tmp, cinfo.output_height - crop_y - crop_height);
+        exit(EXIT_FAILURE);
+      }
+    }
+
+  /* Normal full-image decompress */
+  } else {
+    /* Write output file header */
+    (*dest_mgr->start_output) (&cinfo, dest_mgr);
+
+    if (cinfo.data_precision == 16) {
+#ifdef D_LOSSLESS_SUPPORTED
+      /* Process data */
+      while (cinfo.output_scanline < cinfo.output_height) {
+        num_scanlines = jpeg16_read_scanlines(&cinfo, dest_mgr->buffer16,
+                                              dest_mgr->buffer_height);
+        (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+      }
+#else
+      ERREXIT1(&cinfo, JERR_BAD_PRECISION, cinfo.data_precision);
+#endif
+    } else if (cinfo.data_precision == 12) {
+      /* Process data */
+      while (cinfo.output_scanline < cinfo.output_height) {
+        num_scanlines = jpeg12_read_scanlines(&cinfo, dest_mgr->buffer12,
+                                              dest_mgr->buffer_height);
+        (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+      }
+    } else {
+      /* Process data */
+      while (cinfo.output_scanline < cinfo.output_height) {
+        num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+                                            dest_mgr->buffer_height);
+        (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+      }
+    }
+  }
+
+  /* Hack: count final pass as done in case finish_output does an extra pass.
+   * The library won't have updated completed_passes.
+   */
+  if (report || max_scans != 0)
+    progress.pub.completed_passes = progress.pub.total_passes;
+
+  if (icc_filename != NULL) {
+    FILE *icc_file;
+    JOCTET *icc_profile;
+    unsigned int icc_len;
+
+    if ((icc_file = fopen(icc_filename, WRITE_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open %s\n", progname, icc_filename);
+      exit(EXIT_FAILURE);
+    }
+    if (jpeg_read_icc_profile(&cinfo, &icc_profile, &icc_len)) {
+      if (fwrite(icc_profile, icc_len, 1, icc_file) < 1) {
+        fprintf(stderr, "%s: can't read ICC profile from %s\n", progname,
+                icc_filename);
+        free(icc_profile);
+        fclose(icc_file);
+        exit(EXIT_FAILURE);
+      }
+      free(icc_profile);
+      fclose(icc_file);
+    } else if (cinfo.err->msg_code != JWRN_BOGUS_ICC)
+      fprintf(stderr, "%s: no ICC profile data in JPEG file\n", progname);
+  }
+
+  /* Finish decompression and release memory.
+   * I must do it in this order because output module has allocated memory
+   * of lifespan JPOOL_IMAGE; it needs to finish before releasing memory.
+   */
+  (*dest_mgr->finish_output) (&cinfo, dest_mgr);
+  (void)jpeg_finish_decompress(&cinfo);
+  jpeg_destroy_decompress(&cinfo);
+
+  /* Close files, if we opened them */
+  if (input_file != stdin)
+    fclose(input_file);
+  if (output_file != stdout)
+    fclose(output_file);
+
+  if (report || max_scans != 0)
+    end_progress_monitor((j_common_ptr)&cinfo);
+
+  if (memsrc)
+    free(inbuffer);
+
+  /* All done. */
+  exit(jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
+  return 0;                     /* suppress no-return-value warnings */
+}
diff --git a/3rdparty/libjpeg-turbo/src/example.c b/3rdparty/libjpeg-turbo/src/example.c
new file mode 100644
index 000000000000..78b658a0ab51
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/example.c
@@ -0,0 +1,643 @@
+/*
+ * example.c
+ *
+ * This file was part of the Independent JPEG Group's software.
+ * Copyright (C) 1992-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2017, 2019, 2022-2023, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file illustrates how to use the IJG code as a subroutine library
+ * to read or write JPEG image files with 8-bit or 12-bit data precision.  You
+ * should look at this code in conjunction with the documentation file
+ * libjpeg.txt.
+ *
+ * We present these routines in the same coding style used in the JPEG code
+ * (ANSI function definitions, etc); but you are of course free to code your
+ * routines in a different style if you prefer.
+ */
+
+/* First-time users of libjpeg-turbo might be better served by looking at
+ * tjexample.c, which uses the more straightforward TurboJPEG API.  Note that
+ * this example, like cjpeg and djpeg, interleaves disk I/O with JPEG
+ * compression/decompression, so it is not suitable for benchmarking purposes.
+ */
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _WIN32
+#define strcasecmp  stricmp
+#define strncasecmp  strnicmp
+#endif
+
+/*
+ * Include file for users of JPEG library.
+ * You will need to have included system headers that define at least
+ * the typedefs FILE and size_t before you can include jpeglib.h.
+ * (stdio.h is sufficient on ANSI-conforming systems.)
+ * You may also wish to include "jerror.h".
+ */
+
+#include "jpeglib.h"
+#include "jerror.h"
+
+/*
+ * <setjmp.h> is used for the optional error recovery mechanism shown in
+ * the second part of the example.
+ */
+
+#include <setjmp.h>
+
+
+
+/******************** JPEG COMPRESSION SAMPLE INTERFACE *******************/
+
+/* This half of the example shows how to feed data into the JPEG compressor.
+ * We present a minimal version that does not worry about refinements such
+ * as error recovery (the JPEG code will just exit() if it gets an error).
+ */
+
+
+/*
+ * IMAGE DATA FORMATS:
+ *
+ * The standard input image format is a rectangular array of pixels, with
+ * each pixel having the same number of "component" values (color channels).
+ * Each pixel row is an array of JSAMPLEs (which typically are unsigned chars)
+ * or J12SAMPLEs (which typically are shorts).  If you are working with color
+ * data, then the color values for each pixel must be adjacent in the row; for
+ * example, R,G,B,R,G,B,R,G,B,... for 24-bit RGB color.
+ *
+ * For this example, we'll assume that this data structure matches the way
+ * our application has stored the image in memory, so we can just pass a
+ * pointer to our image buffer.  In particular, let's say that the image is
+ * RGB color and is described by:
+ */
+
+#define WIDTH  640              /* Number of columns in image */
+#define HEIGHT  480             /* Number of rows in image */
+
+
+/*
+ * Sample routine for JPEG compression.  We assume that the target file name,
+ * a compression quality factor, and a data precision are passed in.
+ */
+
+METHODDEF(void)
+write_JPEG_file(char *filename, int quality, int data_precision)
+{
+  /* This struct contains the JPEG compression parameters and pointers to
+   * working space (which is allocated as needed by the JPEG library).
+   * It is possible to have several such structures, representing multiple
+   * compression/decompression processes, in existence at once.  We refer
+   * to any one struct (and its associated working data) as a "JPEG object".
+   */
+  struct jpeg_compress_struct cinfo;
+  /* This struct represents a JPEG error handler.  It is declared separately
+   * because applications often want to supply a specialized error handler
+   * (see the second half of this file for an example).  But here we just
+   * take the easy way out and use the standard error handler, which will
+   * print a message on stderr and call exit() if compression fails.
+   * Note that this struct must live as long as the main JPEG parameter
+   * struct, to avoid dangling-pointer problems.
+   */
+  struct jpeg_error_mgr jerr;
+  /* More stuff */
+  FILE *outfile;                /* target file */
+  JSAMPARRAY image_buffer = NULL;
+                                /* Points to large array of R,G,B-order data */
+  JSAMPROW row_pointer[1];      /* pointer to JSAMPLE row[s] */
+  J12SAMPARRAY image_buffer12 = NULL;
+                                /* Points to large array of R,G,B-order 12-bit
+                                   data */
+  J12SAMPROW row_pointer12[1];  /* pointer to J12SAMPLE row[s] */
+  int row_stride;               /* physical row width in image buffer */
+  int row, col;
+
+  /* Step 1: allocate and initialize JPEG compression object */
+
+  /* We have to set up the error handler first, in case the initialization
+   * step fails.  (Unlikely, but it could happen if you are out of memory.)
+   * This routine fills in the contents of struct jerr, and returns jerr's
+   * address which we place into the link field in cinfo.
+   */
+  cinfo.err = jpeg_std_error(&jerr);
+  /* Now we can initialize the JPEG compression object. */
+  jpeg_create_compress(&cinfo);
+
+  /* Step 2: specify data destination (eg, a file) */
+  /* Note: steps 2 and 3 can be done in either order. */
+
+  /* Here we use the library-supplied code to send compressed data to a
+   * stdio stream.  You can also write your own code to do something else.
+   * VERY IMPORTANT: use "b" option to fopen() if you are on a machine that
+   * requires it in order to write binary files.
+   */
+  if ((outfile = fopen(filename, "wb")) == NULL)
+    ERREXIT(&cinfo, JERR_FILE_WRITE);
+  jpeg_stdio_dest(&cinfo, outfile);
+
+  /* Step 3: set parameters for compression */
+
+  /* First we supply a description of the input image.
+   * Four fields of the cinfo struct must be filled in:
+   */
+  cinfo.image_width = WIDTH;            /* image width and height, in pixels */
+  cinfo.image_height = HEIGHT;
+  cinfo.input_components = 3;           /* # of color components per pixel */
+  cinfo.in_color_space = JCS_RGB;       /* colorspace of input image */
+  cinfo.data_precision = data_precision; /* data precision of input image */
+  /* Now use the library's routine to set default compression parameters.
+   * (You must set at least cinfo.in_color_space before calling this,
+   * since the defaults depend on the source color space.)
+   */
+  jpeg_set_defaults(&cinfo);
+  /* Now you can set any non-default parameters you wish to.
+   * Here we just illustrate the use of quality (quantization table) scaling:
+   */
+  jpeg_set_quality(&cinfo, quality, TRUE /* limit to baseline-JPEG values */);
+  /* Use 4:4:4 subsampling (default is 4:2:0) */
+  cinfo.comp_info[0].h_samp_factor = cinfo.comp_info[0].v_samp_factor = 1;
+
+  /* Step 4: Start compressor */
+
+  /* TRUE ensures that we will write a complete interchange-JPEG file.
+   * Pass TRUE unless you are very sure of what you're doing.
+   */
+  jpeg_start_compress(&cinfo, TRUE);
+
+  /* Step 5: allocate and initialize image buffer */
+
+  row_stride = WIDTH * 3;       /* J[12]SAMPLEs per row in image_buffer */
+  /* Make a sample array that will go away when done with image.  Note that,
+   * for the purposes of this example, we could also create a one-row-high
+   * sample array and initialize it for each successive scanline written in the
+   * scanline loop below.
+   */
+  if (cinfo.data_precision == 12) {
+    image_buffer12 = (J12SAMPARRAY)(*cinfo.mem->alloc_sarray)
+      ((j_common_ptr)&cinfo, JPOOL_IMAGE, row_stride, HEIGHT);
+
+    /* Initialize image buffer with a repeating pattern */
+    for (row = 0; row < HEIGHT; row++) {
+      for (col = 0; col < WIDTH; col++) {
+        image_buffer12[row][col * 3] =
+          (col * (MAXJ12SAMPLE + 1) / WIDTH) % (MAXJ12SAMPLE + 1);
+        image_buffer12[row][col * 3 + 1] =
+          (row * (MAXJ12SAMPLE + 1) / HEIGHT) % (MAXJ12SAMPLE + 1);
+        image_buffer12[row][col * 3 + 2] =
+          (row * (MAXJ12SAMPLE + 1) / HEIGHT +
+           col * (MAXJ12SAMPLE + 1) / WIDTH) % (MAXJ12SAMPLE + 1);
+      }
+    }
+  } else {
+    image_buffer = (*cinfo.mem->alloc_sarray)
+      ((j_common_ptr)&cinfo, JPOOL_IMAGE, row_stride, HEIGHT);
+
+    for (row = 0; row < HEIGHT; row++) {
+      for (col = 0; col < WIDTH; col++) {
+        image_buffer[row][col * 3] =
+          (col * (MAXJSAMPLE + 1) / WIDTH) % (MAXJSAMPLE + 1);
+        image_buffer[row][col * 3 + 1] =
+          (row * (MAXJSAMPLE + 1) / HEIGHT) % (MAXJSAMPLE + 1);
+        image_buffer[row][col * 3 + 2] =
+          (row * (MAXJSAMPLE + 1) / HEIGHT + col * (MAXJSAMPLE + 1) / WIDTH) %
+          (MAXJSAMPLE + 1);
+      }
+    }
+  }
+
+  /* Step 6: while (scan lines remain to be written) */
+  /*           jpeg_write_scanlines(...); */
+
+  /* Here we use the library's state variable cinfo.next_scanline as the
+   * loop counter, so that we don't have to keep track ourselves.
+   * To keep things simple, we pass one scanline per call; you can pass
+   * more if you wish, though.
+   */
+  if (cinfo.data_precision == 12) {
+    while (cinfo.next_scanline < cinfo.image_height) {
+      /* jpeg12_write_scanlines expects an array of pointers to scanlines.
+       * Here the array is only one element long, but you could pass
+       * more than one scanline at a time if that's more convenient.
+       */
+      row_pointer12[0] = image_buffer12[cinfo.next_scanline];
+      (void)jpeg12_write_scanlines(&cinfo, row_pointer12, 1);
+    }
+  } else {
+    while (cinfo.next_scanline < cinfo.image_height) {
+      /* jpeg_write_scanlines expects an array of pointers to scanlines.
+       * Here the array is only one element long, but you could pass
+       * more than one scanline at a time if that's more convenient.
+       */
+      row_pointer[0] = image_buffer[cinfo.next_scanline];
+      (void)jpeg_write_scanlines(&cinfo, row_pointer, 1);
+    }
+  }
+
+  /* Step 7: Finish compression */
+
+  jpeg_finish_compress(&cinfo);
+  /* After finish_compress, we can close the output file. */
+  fclose(outfile);
+
+  /* Step 8: release JPEG compression object */
+
+  /* This is an important step since it will release a good deal of memory. */
+  jpeg_destroy_compress(&cinfo);
+
+  /* And we're done! */
+}
+
+
+/*
+ * SOME FINE POINTS:
+ *
+ * In the above loop, we ignored the return value of jpeg_write_scanlines,
+ * which is the number of scanlines actually written.  We could get away
+ * with this because we were only relying on the value of cinfo.next_scanline,
+ * which will be incremented correctly.  If you maintain additional loop
+ * variables then you should be careful to increment them properly.
+ * Actually, for output to a stdio stream you needn't worry, because
+ * then jpeg_write_scanlines will write all the lines passed (or else exit
+ * with a fatal error).  Partial writes can only occur if you use a data
+ * destination module that can demand suspension of the compressor.
+ * (If you don't know what that's for, you don't need it.)
+ *
+ * Scanlines MUST be supplied in top-to-bottom order if you want your JPEG
+ * files to be compatible with everyone else's.  If you cannot readily read
+ * your data in that order, you'll need an intermediate array to hold the
+ * image.  See rdtarga.c or rdbmp.c for examples of handling bottom-to-top
+ * source data using the JPEG code's internal virtual-array mechanisms.
+ */
+
+
+
+/******************** JPEG DECOMPRESSION SAMPLE INTERFACE *******************/
+
+/* This half of the example shows how to read data from the JPEG decompressor.
+ * It's a bit more refined than the above, in that we show:
+ *   (a) how to modify the JPEG library's standard error-reporting behavior;
+ *   (b) how to allocate workspace using the library's memory manager.
+ *
+ * Just to make this example a little different from the first one, we'll
+ * assume that we do not intend to put the whole image into an in-memory
+ * buffer, but to send it line-by-line someplace else.  We need a one-
+ * scanline-high JSAMPLE or J12SAMPLE array as a work buffer, and we will let
+ * the JPEG memory manager allocate it for us.  This approach is actually quite
+ * useful because we don't need to remember to deallocate the buffer
+ * separately: it will go away automatically when the JPEG object is cleaned
+ * up.
+ */
+
+
+/*
+ * ERROR HANDLING:
+ *
+ * The JPEG library's standard error handler (jerror.c) is divided into
+ * several "methods" which you can override individually.  This lets you
+ * adjust the behavior without duplicating a lot of code, which you might
+ * have to update with each future release.
+ *
+ * Our example here shows how to override the "error_exit" method so that
+ * control is returned to the library's caller when a fatal error occurs,
+ * rather than calling exit() as the standard error_exit method does.
+ *
+ * We use C's setjmp/longjmp facility to return control.  This means that the
+ * routine which calls the JPEG library must first execute a setjmp() call to
+ * establish the return point.  We want the replacement error_exit to do a
+ * longjmp().  But we need to make the setjmp buffer accessible to the
+ * error_exit routine.  To do this, we make a private extension of the
+ * standard JPEG error handler object.  (If we were using C++, we'd say we
+ * were making a subclass of the regular error handler.)
+ *
+ * Here's the extended error handler struct:
+ */
+
+struct my_error_mgr {
+  struct jpeg_error_mgr pub;    /* "public" fields */
+
+  jmp_buf setjmp_buffer;        /* for return to caller */
+};
+
+typedef struct my_error_mgr *my_error_ptr;
+
+/*
+ * Here's the routine that will replace the standard error_exit method:
+ */
+
+METHODDEF(void)
+my_error_exit(j_common_ptr cinfo)
+{
+  /* cinfo->err really points to a my_error_mgr struct, so coerce pointer */
+  my_error_ptr myerr = (my_error_ptr)cinfo->err;
+
+  /* Always display the message. */
+  /* We could postpone this until after returning, if we chose. */
+  (*cinfo->err->output_message) (cinfo);
+
+  /* Return control to the setjmp point */
+  longjmp(myerr->setjmp_buffer, 1);
+}
+
+
+METHODDEF(int) do_read_JPEG_file(struct jpeg_decompress_struct *cinfo,
+                                 char *infilename, char *outfilename);
+
+/*
+ * Sample routine for JPEG decompression.  We assume that the source file name
+ * is passed in.  We want to return 1 on success, 0 on error.
+ */
+
+METHODDEF(int)
+read_JPEG_file(char *infilename, char *outfilename)
+{
+  /* This struct contains the JPEG decompression parameters and pointers to
+   * working space (which is allocated as needed by the JPEG library).
+   */
+  struct jpeg_decompress_struct cinfo;
+
+  return do_read_JPEG_file(&cinfo, infilename, outfilename);
+}
+
+/*
+ * We call the libjpeg API from within a separate function, because modifying
+ * the local non-volatile jpeg_decompress_struct instance below the setjmp()
+ * return point and then accessing the instance after setjmp() returns would
+ * result in undefined behavior that may potentially overwrite all or part of
+ * the structure.
+ */
+
+METHODDEF(int)
+do_read_JPEG_file(struct jpeg_decompress_struct *cinfo, char *infilename,
+                  char *outfilename)
+{
+  /* We use our private extension JPEG error handler.
+   * Note that this struct must live as long as the main JPEG parameter
+   * struct, to avoid dangling-pointer problems.
+   */
+  struct my_error_mgr jerr;
+  /* More stuff */
+  FILE *infile;                 /* source file */
+  FILE *outfile;                /* output file */
+  JSAMPARRAY buffer = NULL;     /* Output row buffer */
+  J12SAMPARRAY buffer12 = NULL; /* 12-bit output row buffer */
+  int col;
+  int row_stride;               /* physical row width in output buffer */
+  int little_endian = 1;
+
+  /* In this example we want to open the input and output files before doing
+   * anything else, so that the setjmp() error recovery below can assume the
+   * files are open.
+   *
+   * VERY IMPORTANT: use "b" option to fopen() if you are on a machine that
+   * requires it in order to read/write binary files.
+   */
+
+  if ((infile = fopen(infilename, "rb")) == NULL) {
+    fprintf(stderr, "can't open %s\n", infilename);
+    return 0;
+  }
+  if ((outfile = fopen(outfilename, "wb")) == NULL) {
+    fprintf(stderr, "can't open %s\n", outfilename);
+    fclose(infile);
+    return 0;
+  }
+
+  /* Step 1: allocate and initialize JPEG decompression object */
+
+  /* We set up the normal JPEG error routines, then override error_exit. */
+  cinfo->err = jpeg_std_error(&jerr.pub);
+  jerr.pub.error_exit = my_error_exit;
+  /* Establish the setjmp return context for my_error_exit to use. */
+  if (setjmp(jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error.
+     * We need to clean up the JPEG object, close the input file, and return.
+     */
+    jpeg_destroy_decompress(cinfo);
+    fclose(infile);
+    fclose(outfile);
+    return 0;
+  }
+  /* Now we can initialize the JPEG decompression object. */
+  jpeg_create_decompress(cinfo);
+
+  /* Step 2: specify data source (eg, a file) */
+
+  jpeg_stdio_src(cinfo, infile);
+
+  /* Step 3: read file parameters with jpeg_read_header() */
+
+  (void)jpeg_read_header(cinfo, TRUE);
+  /* We can ignore the return value from jpeg_read_header since
+   *   (a) suspension is not possible with the stdio data source, and
+   *   (b) we passed TRUE to reject a tables-only JPEG file as an error.
+   * See libjpeg.txt for more info.
+   */
+
+  /* emit header for raw PPM format */
+  fprintf(outfile, "P6\n%d %d\n%d\n", WIDTH, HEIGHT,
+          cinfo->data_precision == 12 ? MAXJ12SAMPLE : MAXJSAMPLE);
+
+  /* Step 4: set parameters for decompression */
+
+  /* In this example, we don't need to change any of the defaults set by
+   * jpeg_read_header(), so we do nothing here.
+   */
+
+  /* Step 5: Start decompressor */
+
+  (void)jpeg_start_decompress(cinfo);
+  /* We can ignore the return value since suspension is not possible
+   * with the stdio data source.
+   */
+
+  /* We may need to do some setup of our own at this point before reading
+   * the data.  After jpeg_start_decompress() we have the correct scaled
+   * output image dimensions available, as well as the output colormap
+   * if we asked for color quantization.
+   * In this example, we need to make an output work buffer of the right size.
+   */
+  /* Samples per row in output buffer */
+  row_stride = cinfo->output_width * cinfo->output_components;
+  /* Make a one-row-high sample array that will go away when done with image */
+  if (cinfo->data_precision == 12)
+    buffer12 = (J12SAMPARRAY)(*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, row_stride, 1);
+  else
+    buffer = (*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, row_stride, 1);
+
+  /* Step 6: while (scan lines remain to be read) */
+  /*           jpeg_read_scanlines(...); */
+
+  /* Here we use the library's state variable cinfo->output_scanline as the
+   * loop counter, so that we don't have to keep track ourselves.
+   */
+  if (cinfo->data_precision == 12) {
+    while (cinfo->output_scanline < cinfo->output_height) {
+      /* jpeg12_read_scanlines expects an array of pointers to scanlines.
+       * Here the array is only one element long, but you could ask for
+       * more than one scanline at a time if that's more convenient.
+       */
+      (void)jpeg12_read_scanlines(cinfo, buffer12, 1);
+      if (*(char *)&little_endian == 1) {
+        /* Swap MSB and LSB in each sample */
+        for (col = 0; col < row_stride; col++)
+          buffer12[0][col] = ((buffer12[0][col] & 0xFF) << 8) |
+                             ((buffer12[0][col] >> 8) & 0xFF);
+      }
+      fwrite(buffer12[0], 1, row_stride * sizeof(J12SAMPLE), outfile);
+    }
+  } else {
+    while (cinfo->output_scanline < cinfo->output_height) {
+      /* jpeg_read_scanlines expects an array of pointers to scanlines.
+       * Here the array is only one element long, but you could ask for
+       * more than one scanline at a time if that's more convenient.
+       */
+      (void)jpeg_read_scanlines(cinfo, buffer, 1);
+      fwrite(buffer[0], 1, row_stride, outfile);
+    }
+  }
+
+  /* Step 7: Finish decompression */
+
+  (void)jpeg_finish_decompress(cinfo);
+  /* We can ignore the return value since suspension is not possible
+   * with the stdio data source.
+   */
+
+  /* Step 8: Release JPEG decompression object */
+
+  /* This is an important step since it will release a good deal of memory. */
+  jpeg_destroy_decompress(cinfo);
+
+  /* After finish_decompress, we can close the input and output files.
+   * Here we postpone it until after no more JPEG errors are possible,
+   * so as to simplify the setjmp error logic above.  (Actually, I don't
+   * think that jpeg_destroy can do an error exit, but why assume anything...)
+   */
+  fclose(infile);
+  fclose(outfile);
+
+  /* At this point you may want to check to see whether any corrupt-data
+   * warnings occurred (test whether jerr.pub.num_warnings is nonzero).
+   */
+
+  /* And we're done! */
+  return 1;
+}
+
+
+/*
+ * SOME FINE POINTS:
+ *
+ * In the above code, we ignored the return value of jpeg_read_scanlines,
+ * which is the number of scanlines actually read.  We could get away with
+ * this because we asked for only one line at a time and we weren't using
+ * a suspending data source.  See libjpeg.txt for more info.
+ *
+ * We cheated a bit by calling alloc_sarray() after jpeg_start_decompress();
+ * we should have done it beforehand to ensure that the space would be
+ * counted against the JPEG max_memory setting.  In some systems the above
+ * code would risk an out-of-memory error.  However, in general we don't
+ * know the output image dimensions before jpeg_start_decompress(), unless we
+ * call jpeg_calc_output_dimensions().  See libjpeg.txt for more about this.
+ *
+ * Scanlines are returned in the same order as they appear in the JPEG file,
+ * which is standardly top-to-bottom.  If you must emit data bottom-to-top,
+ * you can use one of the virtual arrays provided by the JPEG memory manager
+ * to invert the data.  See wrbmp.c for an example.
+ */
+
+
+LOCAL(void)
+usage(const char *progname)
+{
+  fprintf(stderr, "usage: %s compress [switches] outputfile[.jpg]\n",
+          progname);
+  fprintf(stderr, "       %s decompress inputfile[.jpg] outputfile[.ppm]\n",
+          progname);
+  fprintf(stderr, "Switches (names may be abbreviated):\n");
+  fprintf(stderr, "  -precision N   Create JPEG file with N-bit data precision\n");
+  fprintf(stderr, "                 (N is 8 or 12; default is 8)\n");
+  fprintf(stderr, "  -quality N     Compression quality (0..100; 5-95 is most useful range,\n");
+  fprintf(stderr, "                 default is 75)\n");
+
+  exit(EXIT_FAILURE);
+}
+
+
+typedef enum {
+  COMPRESS,
+  DECOMPRESS
+} EXAMPLE_MODE;
+
+
+int
+main(int argc, char **argv)
+{
+  int argn, quality = 75;
+  int data_precision = 8;
+  EXAMPLE_MODE mode = -1;
+  char *arg, *filename = NULL;
+
+  if (argc < 3)
+    usage(argv[0]);
+
+  if (!strcasecmp(argv[1], "compress"))
+    mode = COMPRESS;
+  else if (!strcasecmp(argv[1], "decompress"))
+    mode = DECOMPRESS;
+  else
+    usage(argv[0]);
+
+  for (argn = 2; argn < argc; argn++) {
+    arg = argv[argn];
+    if (*arg != '-') {
+      filename = arg;
+      /* Not a switch, must be a file name argument */
+      break;                    /* done parsing switches */
+    }
+    arg++;                      /* advance past switch marker character */
+
+    if (!strncasecmp(arg, "p", 1)) {
+      /* Set data precision. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage(argv[0]);
+      if (sscanf(argv[argn], "%d", &data_precision) < 1 ||
+          (data_precision != 8 && data_precision != 12))
+        usage(argv[0]);
+    } else if (!strncasecmp(arg, "q", 1)) {
+      /* Quality rating (quantization table scaling factor). */
+      if (++argn >= argc)       /* advance to next argument */
+        usage(argv[0]);
+      if (sscanf(argv[argn], "%d", &quality) < 1 || quality < 0 ||
+          quality > 100)
+        usage(argv[0]);
+      if (quality < 1)
+        quality = 1;
+    }
+  }
+
+  if (!filename)
+    usage(argv[0]);
+
+  if (mode == COMPRESS)
+    write_JPEG_file(filename, quality, data_precision);
+  else if (mode == DECOMPRESS) {
+    if (argc - argn < 2)
+      usage(argv[0]);
+
+    read_JPEG_file(argv[argn], argv[argn + 1]);
+  }
+
+  return 0;
+}
diff --git a/3rdparty/libjpeg-turbo/src/jcapimin.c b/3rdparty/libjpeg-turbo/src/jcapimin.c
index 84e7ecc9a73e..cbb3d13e1cef 100644
--- a/3rdparty/libjpeg-turbo/src/jcapimin.c
+++ b/3rdparty/libjpeg-turbo/src/jcapimin.c
@@ -23,6 +23,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jcmaster.h"
 
 
 /*
@@ -90,8 +91,18 @@ jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize)
 
   cinfo->input_gamma = 1.0;     /* in case application forgets */
 
+  cinfo->data_precision = BITS_IN_JSAMPLE;
+
   /* OK, I'm ready */
   cinfo->global_state = CSTATE_START;
+
+  /* The master struct is used to store extension parameters, so we allocate it
+   * here.
+   */
+  cinfo->master = (struct jpeg_comp_master *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                  sizeof(my_comp_master));
+  memset(cinfo->master, 0, sizeof(my_comp_master));
 }
 
 
@@ -183,8 +194,20 @@ jpeg_finish_compress(j_compress_ptr cinfo)
       /* We bypass the main controller and invoke coef controller directly;
        * all work is being done from the coefficient buffer.
        */
-      if (!(*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE)NULL))
-        ERREXIT(cinfo, JERR_CANT_SUSPEND);
+      if (cinfo->data_precision == 16) {
+#ifdef C_LOSSLESS_SUPPORTED
+        if (!(*cinfo->coef->compress_data_16) (cinfo, (J16SAMPIMAGE)NULL))
+          ERREXIT(cinfo, JERR_CANT_SUSPEND);
+#else
+        ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+#endif
+      } else if (cinfo->data_precision == 12) {
+        if (!(*cinfo->coef->compress_data_12) (cinfo, (J12SAMPIMAGE)NULL))
+          ERREXIT(cinfo, JERR_CANT_SUSPEND);
+      } else {
+        if (!(*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE)NULL))
+          ERREXIT(cinfo, JERR_CANT_SUSPEND);
+      }
     }
     (*cinfo->master->finish_pass) (cinfo);
   }
diff --git a/3rdparty/libjpeg-turbo/src/jcapistd.c b/3rdparty/libjpeg-turbo/src/jcapistd.c
index aa2aad9f66cd..2053028f2bf0 100644
--- a/3rdparty/libjpeg-turbo/src/jcapistd.c
+++ b/3rdparty/libjpeg-turbo/src/jcapistd.c
@@ -1,8 +1,10 @@
 /*
  * jcapistd.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -18,8 +20,11 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsamplecomp.h"
 
 
+#if BITS_IN_JSAMPLE == 8
+
 /*
  * Compression initialization.
  * Before calling this, all parameters and a data destination must be set up.
@@ -51,13 +56,15 @@ jpeg_start_compress(j_compress_ptr cinfo, boolean write_all_tables)
   jinit_compress_master(cinfo);
   /* Set up for the first pass */
   (*cinfo->master->prepare_for_pass) (cinfo);
-  /* Ready for application to drive first pass through jpeg_write_scanlines
-   * or jpeg_write_raw_data.
+  /* Ready for application to drive first pass through _jpeg_write_scanlines
+   * or _jpeg_write_raw_data.
    */
   cinfo->next_scanline = 0;
   cinfo->global_state = (cinfo->raw_data_in ? CSTATE_RAW_OK : CSTATE_SCANNING);
 }
 
+#endif
+
 
 /*
  * Write some scanlines of data to the JPEG compressor.
@@ -67,7 +74,7 @@ jpeg_start_compress(j_compress_ptr cinfo, boolean write_all_tables)
  * the data destination module has requested suspension of the compressor,
  * or if more than image_height scanlines are passed in.
  *
- * Note: we warn about excess calls to jpeg_write_scanlines() since
+ * Note: we warn about excess calls to _jpeg_write_scanlines() since
  * this likely signals an application programmer error.  However,
  * excess scanlines passed in the last valid call are *silently* ignored,
  * so that the application need not adjust num_lines for end-of-image
@@ -75,11 +82,15 @@ jpeg_start_compress(j_compress_ptr cinfo, boolean write_all_tables)
  */
 
 GLOBAL(JDIMENSION)
-jpeg_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
-                     JDIMENSION num_lines)
+_jpeg_write_scanlines(j_compress_ptr cinfo, _JSAMPARRAY scanlines,
+                      JDIMENSION num_lines)
 {
+#if BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED)
   JDIMENSION row_ctr, rows_left;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   if (cinfo->global_state != CSTATE_SCANNING)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
   if (cinfo->next_scanline >= cinfo->image_height)
@@ -93,9 +104,9 @@ jpeg_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
   }
 
   /* Give master control module another chance if this is first call to
-   * jpeg_write_scanlines.  This lets output of the frame/scan headers be
+   * _jpeg_write_scanlines.  This lets output of the frame/scan headers be
    * delayed so that application can write COM, etc, markers between
-   * jpeg_start_compress and jpeg_write_scanlines.
+   * jpeg_start_compress and _jpeg_write_scanlines.
    */
   if (cinfo->master->call_pass_startup)
     (*cinfo->master->pass_startup) (cinfo);
@@ -106,23 +117,35 @@ jpeg_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
     num_lines = rows_left;
 
   row_ctr = 0;
-  (*cinfo->main->process_data) (cinfo, scanlines, &row_ctr, num_lines);
+  (*cinfo->main->_process_data) (cinfo, scanlines, &row_ctr, num_lines);
   cinfo->next_scanline += row_ctr;
   return row_ctr;
+#else
+  ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+  return 0;
+#endif
 }
 
 
+#if BITS_IN_JSAMPLE != 16
+
 /*
  * Alternate entry point to write raw data.
  * Processes exactly one iMCU row per call, unless suspended.
  */
 
 GLOBAL(JDIMENSION)
-jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
-                    JDIMENSION num_lines)
+_jpeg_write_raw_data(j_compress_ptr cinfo, _JSAMPIMAGE data,
+                     JDIMENSION num_lines)
 {
   JDIMENSION lines_per_iMCU_row;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
+  if (cinfo->master->lossless)
+    ERREXIT(cinfo, JERR_NOTIMPL);
+
   if (cinfo->global_state != CSTATE_RAW_OK)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
   if (cinfo->next_scanline >= cinfo->image_height) {
@@ -138,9 +161,9 @@ jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
   }
 
   /* Give master control module another chance if this is first call to
-   * jpeg_write_raw_data.  This lets output of the frame/scan headers be
+   * _jpeg_write_raw_data.  This lets output of the frame/scan headers be
    * delayed so that application can write COM, etc, markers between
-   * jpeg_start_compress and jpeg_write_raw_data.
+   * jpeg_start_compress and _jpeg_write_raw_data.
    */
   if (cinfo->master->call_pass_startup)
     (*cinfo->master->pass_startup) (cinfo);
@@ -151,7 +174,7 @@ jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
   /* Directly compress the row. */
-  if (!(*cinfo->coef->compress_data) (cinfo, data)) {
+  if (!(*cinfo->coef->_compress_data) (cinfo, data)) {
     /* If compressor did not consume the whole row, suspend processing. */
     return 0;
   }
@@ -160,3 +183,5 @@ jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
   cinfo->next_scanline += lines_per_iMCU_row;
   return lines_per_iMCU_row;
 }
+
+#endif /* BITS_IN_JSAMPLE != 16 */
diff --git a/3rdparty/libjpeg-turbo/src/jccoefct.c b/3rdparty/libjpeg-turbo/src/jccoefct.c
index 068232a527d1..2a5dde2d07e8 100644
--- a/3rdparty/libjpeg-turbo/src/jccoefct.c
+++ b/3rdparty/libjpeg-turbo/src/jccoefct.c
@@ -3,19 +3,20 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code and
- * information relevant to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
  * This file contains the coefficient buffer controller for compression.
- * This controller is the top level of the JPEG compressor proper.
+ * This controller is the top level of the lossy JPEG compressor proper.
  * The coefficient buffer lies between forward-DCT and entropy encoding steps.
  */
 
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsamplecomp.h"
 
 
 /* We use a full-image coefficient buffer when doing Huffman optimization,
@@ -58,11 +59,12 @@ typedef my_coef_controller *my_coef_ptr;
 
 
 /* Forward declarations */
-METHODDEF(boolean) compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_data(j_compress_ptr cinfo, _JSAMPIMAGE input_buf);
 #ifdef FULL_COEF_BUFFER_SUPPORTED
 METHODDEF(boolean) compress_first_pass(j_compress_ptr cinfo,
-                                       JSAMPIMAGE input_buf);
-METHODDEF(boolean) compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+                                       _JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_output(j_compress_ptr cinfo,
+                                   _JSAMPIMAGE input_buf);
 #endif
 
 
@@ -106,18 +108,18 @@ start_pass_coef(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
   case JBUF_PASS_THRU:
     if (coef->whole_image[0] != NULL)
       ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    coef->pub.compress_data = compress_data;
+    coef->pub._compress_data = compress_data;
     break;
 #ifdef FULL_COEF_BUFFER_SUPPORTED
   case JBUF_SAVE_AND_PASS:
     if (coef->whole_image[0] == NULL)
       ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    coef->pub.compress_data = compress_first_pass;
+    coef->pub._compress_data = compress_first_pass;
     break;
   case JBUF_CRANK_DEST:
     if (coef->whole_image[0] == NULL)
       ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    coef->pub.compress_data = compress_output;
+    coef->pub._compress_data = compress_output;
     break;
 #endif
   default:
@@ -138,7 +140,7 @@ start_pass_coef(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
  */
 
 METHODDEF(boolean)
-compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_data(j_compress_ptr cinfo, _JSAMPIMAGE input_buf)
 {
   my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
@@ -172,10 +174,10 @@ compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
         for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
           if (coef->iMCU_row_num < last_iMCU_row ||
               yoffset + yindex < compptr->last_row_height) {
-            (*cinfo->fdct->forward_DCT) (cinfo, compptr,
-                                         input_buf[compptr->component_index],
-                                         coef->MCU_buffer[blkn],
-                                         ypos, xpos, (JDIMENSION)blockcnt);
+            (*cinfo->fdct->_forward_DCT) (cinfo, compptr,
+                                          input_buf[compptr->component_index],
+                                          coef->MCU_buffer[blkn],
+                                          ypos, xpos, (JDIMENSION)blockcnt);
             if (blockcnt < compptr->MCU_width) {
               /* Create some dummy blocks at the right edge of the image. */
               jzero_far((void *)coef->MCU_buffer[blkn + blockcnt],
@@ -242,7 +244,7 @@ compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
  */
 
 METHODDEF(boolean)
-compress_first_pass(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_first_pass(j_compress_ptr cinfo, _JSAMPIMAGE input_buf)
 {
   my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -279,10 +281,10 @@ compress_first_pass(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
      */
     for (block_row = 0; block_row < block_rows; block_row++) {
       thisblockrow = buffer[block_row];
-      (*cinfo->fdct->forward_DCT) (cinfo, compptr,
-                                   input_buf[ci], thisblockrow,
-                                   (JDIMENSION)(block_row * DCTSIZE),
-                                   (JDIMENSION)0, blocks_across);
+      (*cinfo->fdct->_forward_DCT) (cinfo, compptr,
+                                    input_buf[ci], thisblockrow,
+                                    (JDIMENSION)(block_row * DCTSIZE),
+                                    (JDIMENSION)0, blocks_across);
       if (ndummy > 0) {
         /* Create dummy blocks at the right edge of the image. */
         thisblockrow += blocks_across; /* => first dummy block */
@@ -338,7 +340,7 @@ compress_first_pass(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
  */
 
 METHODDEF(boolean)
-compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_output(j_compress_ptr cinfo, _JSAMPIMAGE input_buf)
 {
   my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
@@ -402,10 +404,13 @@ compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
  */
 
 GLOBAL(void)
-jinit_c_coef_controller(j_compress_ptr cinfo, boolean need_full_buffer)
+_jinit_c_coef_controller(j_compress_ptr cinfo, boolean need_full_buffer)
 {
   my_coef_ptr coef;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   coef = (my_coef_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_coef_controller));
diff --git a/3rdparty/libjpeg-turbo/src/jccolext.c b/3rdparty/libjpeg-turbo/src/jccolext.c
index 303b322ce674..8eba36c4dffe 100644
--- a/3rdparty/libjpeg-turbo/src/jccolext.c
+++ b/3rdparty/libjpeg-turbo/src/jccolext.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2012, 2015, D. R. Commander.
+ * Copyright (C) 2009-2012, 2015, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -29,15 +29,16 @@
 
 INLINE
 LOCAL(void)
-rgb_ycc_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
-                         JSAMPIMAGE output_buf, JDIMENSION output_row,
+rgb_ycc_convert_internal(j_compress_ptr cinfo, _JSAMPARRAY input_buf,
+                         _JSAMPIMAGE output_buf, JDIMENSION output_row,
                          int num_rows)
 {
+#if BITS_IN_JSAMPLE != 16
   my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int r, g, b;
   register JLONG *ctab = cconvert->rgb_ycc_tab;
-  register JSAMPROW inptr;
-  register JSAMPROW outptr0, outptr1, outptr2;
+  register _JSAMPROW inptr;
+  register _JSAMPROW outptr0, outptr1, outptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->image_width;
 
@@ -48,26 +49,29 @@ rgb_ycc_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = inptr[RGB_RED];
-      g = inptr[RGB_GREEN];
-      b = inptr[RGB_BLUE];
+      r = RANGE_LIMIT(inptr[RGB_RED]);
+      g = RANGE_LIMIT(inptr[RGB_GREEN]);
+      b = RANGE_LIMIT(inptr[RGB_BLUE]);
       inptr += RGB_PIXELSIZE;
-      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
+      /* If the inputs are 0.._MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
        * Hence the value being shifted is never negative, and we don't
        * need the general RIGHT_SHIFT macro.
        */
       /* Y */
-      outptr0[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
-                                ctab[b + B_Y_OFF]) >> SCALEBITS);
+      outptr0[col] = (_JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+                                 ctab[b + B_Y_OFF]) >> SCALEBITS);
       /* Cb */
-      outptr1[col] = (JSAMPLE)((ctab[r + R_CB_OFF] + ctab[g + G_CB_OFF] +
-                                ctab[b + B_CB_OFF]) >> SCALEBITS);
+      outptr1[col] = (_JSAMPLE)((ctab[r + R_CB_OFF] + ctab[g + G_CB_OFF] +
+                                 ctab[b + B_CB_OFF]) >> SCALEBITS);
       /* Cr */
-      outptr2[col] = (JSAMPLE)((ctab[r + R_CR_OFF] + ctab[g + G_CR_OFF] +
-                                ctab[b + B_CR_OFF]) >> SCALEBITS);
+      outptr2[col] = (_JSAMPLE)((ctab[r + R_CR_OFF] + ctab[g + G_CR_OFF] +
+                                 ctab[b + B_CR_OFF]) >> SCALEBITS);
     }
   }
+#else
+  ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+#endif
 }
 
 
@@ -83,15 +87,16 @@ rgb_ycc_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
 
 INLINE
 LOCAL(void)
-rgb_gray_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
-                          JSAMPIMAGE output_buf, JDIMENSION output_row,
+rgb_gray_convert_internal(j_compress_ptr cinfo, _JSAMPARRAY input_buf,
+                          _JSAMPIMAGE output_buf, JDIMENSION output_row,
                           int num_rows)
 {
+#if BITS_IN_JSAMPLE != 16
   my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int r, g, b;
   register JLONG *ctab = cconvert->rgb_ycc_tab;
-  register JSAMPROW inptr;
-  register JSAMPROW outptr;
+  register _JSAMPROW inptr;
+  register _JSAMPROW outptr;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->image_width;
 
@@ -100,15 +105,18 @@ rgb_gray_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr = output_buf[0][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = inptr[RGB_RED];
-      g = inptr[RGB_GREEN];
-      b = inptr[RGB_BLUE];
+      r = RANGE_LIMIT(inptr[RGB_RED]);
+      g = RANGE_LIMIT(inptr[RGB_GREEN]);
+      b = RANGE_LIMIT(inptr[RGB_BLUE]);
       inptr += RGB_PIXELSIZE;
       /* Y */
-      outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
-                               ctab[b + B_Y_OFF]) >> SCALEBITS);
+      outptr[col] = (_JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+                                ctab[b + B_Y_OFF]) >> SCALEBITS);
     }
   }
+#else
+  ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+#endif
 }
 
 
@@ -119,12 +127,12 @@ rgb_gray_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
 
 INLINE
 LOCAL(void)
-rgb_rgb_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
-                         JSAMPIMAGE output_buf, JDIMENSION output_row,
+rgb_rgb_convert_internal(j_compress_ptr cinfo, _JSAMPARRAY input_buf,
+                         _JSAMPIMAGE output_buf, JDIMENSION output_row,
                          int num_rows)
 {
-  register JSAMPROW inptr;
-  register JSAMPROW outptr0, outptr1, outptr2;
+  register _JSAMPROW inptr;
+  register _JSAMPROW outptr0, outptr1, outptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->image_width;
 
diff --git a/3rdparty/libjpeg-turbo/src/jccolor.c b/3rdparty/libjpeg-turbo/src/jccolor.c
index bdc563c723ca..cd3a6a7a567a 100644
--- a/3rdparty/libjpeg-turbo/src/jccolor.c
+++ b/3rdparty/libjpeg-turbo/src/jccolor.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2012, 2015, D. R. Commander.
+ * Copyright (C) 2009-2012, 2015, 2022, D. R. Commander.
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -17,16 +17,20 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jsimd.h"
-#include "jconfigint.h"
+#include "jsamplecomp.h"
 
 
+#if BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED)
+
 /* Private subobject */
 
 typedef struct {
   struct jpeg_color_converter pub; /* public fields */
 
+#if BITS_IN_JSAMPLE != 16
   /* Private state for RGB->YCC conversion */
   JLONG *rgb_ycc_tab;           /* => table for RGB to YCbCr conversion */
+#endif
 } my_color_converter;
 
 typedef my_color_converter *my_cconvert_ptr;
@@ -36,14 +40,14 @@ typedef my_color_converter *my_cconvert_ptr;
 
 /*
  * YCbCr is defined per CCIR 601-1, except that Cb and Cr are
- * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5.
+ * normalized to the range 0.._MAXJSAMPLE rather than -0.5 .. 0.5.
  * The conversion equations to be implemented are therefore
  *      Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
- *      Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + CENTERJSAMPLE
- *      Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + CENTERJSAMPLE
+ *      Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + _CENTERJSAMPLE
+ *      Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + _CENTERJSAMPLE
  * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.)
- * Note: older versions of the IJG code used a zero offset of MAXJSAMPLE/2,
- * rather than CENTERJSAMPLE, for Cb and Cr.  This gave equal positive and
+ * Note: older versions of the IJG code used a zero offset of _MAXJSAMPLE/2,
+ * rather than _CENTERJSAMPLE, for Cb and Cr.  This gave equal positive and
  * negative swings for Cb/Cr, but meant that grayscale values (Cb=Cr=0)
  * were not represented exactly.  Now we sacrifice exact representation of
  * maximum red and maximum blue in order to get exact grayscales.
@@ -54,16 +58,16 @@ typedef my_color_converter *my_cconvert_ptr;
  *
  * For even more speed, we avoid doing any multiplications in the inner loop
  * by precalculating the constants times R,G,B for all possible values.
- * For 8-bit JSAMPLEs this is very reasonable (only 256 entries per table);
+ * For 8-bit samples this is very reasonable (only 256 entries per table);
  * for 12-bit samples it is still acceptable.  It's not very reasonable for
  * 16-bit samples, but if you want lossless storage you shouldn't be changing
  * colorspace anyway.
- * The CENTERJSAMPLE offsets and the rounding fudge-factor of 0.5 are included
+ * The _CENTERJSAMPLE offsets and the rounding fudge-factor of 0.5 are included
  * in the tables to save adding them separately in the inner loop.
  */
 
 #define SCALEBITS       16      /* speediest right-shift on some machines */
-#define CBCR_OFFSET     ((JLONG)CENTERJSAMPLE << SCALEBITS)
+#define CBCR_OFFSET     ((JLONG)_CENTERJSAMPLE << SCALEBITS)
 #define ONE_HALF        ((JLONG)1 << (SCALEBITS - 1))
 #define FIX(x)          ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
 
@@ -74,15 +78,27 @@ typedef my_color_converter *my_cconvert_ptr;
  */
 
 #define R_Y_OFF         0                       /* offset to R => Y section */
-#define G_Y_OFF         (1 * (MAXJSAMPLE + 1))  /* offset to G => Y section */
-#define B_Y_OFF         (2 * (MAXJSAMPLE + 1))  /* etc. */
-#define R_CB_OFF        (3 * (MAXJSAMPLE + 1))
-#define G_CB_OFF        (4 * (MAXJSAMPLE + 1))
-#define B_CB_OFF        (5 * (MAXJSAMPLE + 1))
+#define G_Y_OFF         (1 * (_MAXJSAMPLE + 1)) /* offset to G => Y section */
+#define B_Y_OFF         (2 * (_MAXJSAMPLE + 1)) /* etc. */
+#define R_CB_OFF        (3 * (_MAXJSAMPLE + 1))
+#define G_CB_OFF        (4 * (_MAXJSAMPLE + 1))
+#define B_CB_OFF        (5 * (_MAXJSAMPLE + 1))
 #define R_CR_OFF        B_CB_OFF                /* B=>Cb, R=>Cr are the same */
-#define G_CR_OFF        (6 * (MAXJSAMPLE + 1))
-#define B_CR_OFF        (7 * (MAXJSAMPLE + 1))
-#define TABLE_SIZE      (8 * (MAXJSAMPLE + 1))
+#define G_CR_OFF        (6 * (_MAXJSAMPLE + 1))
+#define B_CR_OFF        (7 * (_MAXJSAMPLE + 1))
+#define TABLE_SIZE      (8 * (_MAXJSAMPLE + 1))
+
+/* 12-bit samples use a 16-bit data type, so it is possible to pass
+ * out-of-range sample values (< 0 or > 4095) to jpeg_write_scanlines().
+ * Thus, we mask the incoming 12-bit samples to guard against overrunning
+ * or underrunning the conversion tables.
+ */
+
+#if BITS_IN_JSAMPLE == 12
+#define RANGE_LIMIT(value)  ((value) & 0xFFF)
+#else
+#define RANGE_LIMIT(value)  (value)
+#endif
 
 
 /* Include inline routines for colorspace extensions */
@@ -197,6 +213,7 @@ typedef my_color_converter *my_cconvert_ptr;
 METHODDEF(void)
 rgb_ycc_start(j_compress_ptr cinfo)
 {
+#if BITS_IN_JSAMPLE != 16
   my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   JLONG *rgb_ycc_tab;
   JLONG i;
@@ -206,15 +223,15 @@ rgb_ycc_start(j_compress_ptr cinfo)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 (TABLE_SIZE * sizeof(JLONG)));
 
-  for (i = 0; i <= MAXJSAMPLE; i++) {
+  for (i = 0; i <= _MAXJSAMPLE; i++) {
     rgb_ycc_tab[i + R_Y_OFF] = FIX(0.29900) * i;
     rgb_ycc_tab[i + G_Y_OFF] = FIX(0.58700) * i;
     rgb_ycc_tab[i + B_Y_OFF] = FIX(0.11400) * i   + ONE_HALF;
     rgb_ycc_tab[i + R_CB_OFF] = (-FIX(0.16874)) * i;
     rgb_ycc_tab[i + G_CB_OFF] = (-FIX(0.33126)) * i;
     /* We use a rounding fudge-factor of 0.5-epsilon for Cb and Cr.
-     * This ensures that the maximum output will round to MAXJSAMPLE
-     * not MAXJSAMPLE+1, and thus that we don't have to range-limit.
+     * This ensures that the maximum output will round to _MAXJSAMPLE
+     * not _MAXJSAMPLE+1, and thus that we don't have to range-limit.
      */
     rgb_ycc_tab[i + B_CB_OFF] = FIX(0.50000) * i  + CBCR_OFFSET + ONE_HALF - 1;
 /*  B=>Cb and R=>Cr tables are the same
@@ -223,6 +240,9 @@ rgb_ycc_start(j_compress_ptr cinfo)
     rgb_ycc_tab[i + G_CR_OFF] = (-FIX(0.41869)) * i;
     rgb_ycc_tab[i + B_CR_OFF] = (-FIX(0.08131)) * i;
   }
+#else
+  ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+#endif
 }
 
 
@@ -231,8 +251,8 @@ rgb_ycc_start(j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
-                JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
+rgb_ycc_convert(j_compress_ptr cinfo, _JSAMPARRAY input_buf,
+                _JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
@@ -279,8 +299,8 @@ rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
  */
 
 METHODDEF(void)
-rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
-                 JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
+rgb_gray_convert(j_compress_ptr cinfo, _JSAMPARRAY input_buf,
+                 _JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
@@ -324,8 +344,8 @@ rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
  */
 
 METHODDEF(void)
-rgb_rgb_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
-                JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
+rgb_rgb_convert(j_compress_ptr cinfo, _JSAMPARRAY input_buf,
+                _JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
@@ -373,14 +393,15 @@ rgb_rgb_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
  */
 
 METHODDEF(void)
-cmyk_ycck_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
-                  JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
+cmyk_ycck_convert(j_compress_ptr cinfo, _JSAMPARRAY input_buf,
+                  _JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
+#if BITS_IN_JSAMPLE != 16
   my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int r, g, b;
   register JLONG *ctab = cconvert->rgb_ycc_tab;
-  register JSAMPROW inptr;
-  register JSAMPROW outptr0, outptr1, outptr2, outptr3;
+  register _JSAMPROW inptr;
+  register _JSAMPROW outptr0, outptr1, outptr2, outptr3;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->image_width;
 
@@ -392,28 +413,31 @@ cmyk_ycck_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr3 = output_buf[3][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = MAXJSAMPLE - inptr[0];
-      g = MAXJSAMPLE - inptr[1];
-      b = MAXJSAMPLE - inptr[2];
+      r = _MAXJSAMPLE - RANGE_LIMIT(inptr[0]);
+      g = _MAXJSAMPLE - RANGE_LIMIT(inptr[1]);
+      b = _MAXJSAMPLE - RANGE_LIMIT(inptr[2]);
       /* K passes through as-is */
       outptr3[col] = inptr[3];
       inptr += 4;
-      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
+      /* If the inputs are 0.._MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
        * Hence the value being shifted is never negative, and we don't
        * need the general RIGHT_SHIFT macro.
        */
       /* Y */
-      outptr0[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
-                                ctab[b + B_Y_OFF]) >> SCALEBITS);
+      outptr0[col] = (_JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+                                 ctab[b + B_Y_OFF]) >> SCALEBITS);
       /* Cb */
-      outptr1[col] = (JSAMPLE)((ctab[r + R_CB_OFF] + ctab[g + G_CB_OFF] +
-                                ctab[b + B_CB_OFF]) >> SCALEBITS);
+      outptr1[col] = (_JSAMPLE)((ctab[r + R_CB_OFF] + ctab[g + G_CB_OFF] +
+                                 ctab[b + B_CB_OFF]) >> SCALEBITS);
       /* Cr */
-      outptr2[col] = (JSAMPLE)((ctab[r + R_CR_OFF] + ctab[g + G_CR_OFF] +
-                                ctab[b + B_CR_OFF]) >> SCALEBITS);
+      outptr2[col] = (_JSAMPLE)((ctab[r + R_CR_OFF] + ctab[g + G_CR_OFF] +
+                                 ctab[b + B_CR_OFF]) >> SCALEBITS);
     }
   }
+#else
+  ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+#endif
 }
 
 
@@ -424,11 +448,11 @@ cmyk_ycck_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
  */
 
 METHODDEF(void)
-grayscale_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
-                  JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
+grayscale_convert(j_compress_ptr cinfo, _JSAMPARRAY input_buf,
+                  _JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
-  register JSAMPROW inptr;
-  register JSAMPROW outptr;
+  register _JSAMPROW inptr;
+  register _JSAMPROW outptr;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->image_width;
   int instride = cinfo->input_components;
@@ -452,11 +476,11 @@ grayscale_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
  */
 
 METHODDEF(void)
-null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-             JDIMENSION output_row, int num_rows)
+null_convert(j_compress_ptr cinfo, _JSAMPARRAY input_buf,
+             _JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
-  register JSAMPROW inptr;
-  register JSAMPROW outptr, outptr0, outptr1, outptr2, outptr3;
+  register _JSAMPROW inptr;
+  register _JSAMPROW outptr, outptr0, outptr1, outptr2, outptr3;
   register JDIMENSION col;
   register int ci;
   int nc = cinfo->num_components;
@@ -524,10 +548,13 @@ null_method(j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_color_converter(j_compress_ptr cinfo)
+_jinit_color_converter(j_compress_ptr cinfo)
 {
   my_cconvert_ptr cconvert;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   cconvert = (my_cconvert_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_color_converter));
@@ -574,123 +601,116 @@ jinit_color_converter(j_compress_ptr cinfo)
     break;
   }
 
-  /* Check num_components, set conversion method based on requested space */
+  /* Check num_components, set conversion method based on requested space.
+   * NOTE: We do not allow any lossy color conversion algorithms in lossless
+   * mode.
+   */
   switch (cinfo->jpeg_color_space) {
   case JCS_GRAYSCALE:
+    if (cinfo->master->lossless &&
+        cinfo->in_color_space != cinfo->jpeg_color_space)
+      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     if (cinfo->num_components != 1)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
     if (cinfo->in_color_space == JCS_GRAYSCALE)
-      cconvert->pub.color_convert = grayscale_convert;
-    else if (cinfo->in_color_space == JCS_RGB ||
-             cinfo->in_color_space == JCS_EXT_RGB ||
-             cinfo->in_color_space == JCS_EXT_RGBX ||
-             cinfo->in_color_space == JCS_EXT_BGR ||
-             cinfo->in_color_space == JCS_EXT_BGRX ||
-             cinfo->in_color_space == JCS_EXT_XBGR ||
-             cinfo->in_color_space == JCS_EXT_XRGB ||
-             cinfo->in_color_space == JCS_EXT_RGBA ||
-             cinfo->in_color_space == JCS_EXT_BGRA ||
-             cinfo->in_color_space == JCS_EXT_ABGR ||
-             cinfo->in_color_space == JCS_EXT_ARGB) {
+      cconvert->pub._color_convert = grayscale_convert;
+    else if (IsExtRGB(cinfo->in_color_space)) {
+#ifdef WITH_SIMD
       if (jsimd_can_rgb_gray())
-        cconvert->pub.color_convert = jsimd_rgb_gray_convert;
-      else {
+        cconvert->pub._color_convert = jsimd_rgb_gray_convert;
+      else
+#endif
+      {
         cconvert->pub.start_pass = rgb_ycc_start;
-        cconvert->pub.color_convert = rgb_gray_convert;
+        cconvert->pub._color_convert = rgb_gray_convert;
       }
     } else if (cinfo->in_color_space == JCS_YCbCr)
-      cconvert->pub.color_convert = grayscale_convert;
+      cconvert->pub._color_convert = grayscale_convert;
     else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
 
   case JCS_RGB:
+    if (cinfo->master->lossless && !IsExtRGB(cinfo->in_color_space))
+      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     if (cinfo->num_components != 3)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
     if (rgb_red[cinfo->in_color_space] == 0 &&
         rgb_green[cinfo->in_color_space] == 1 &&
         rgb_blue[cinfo->in_color_space] == 2 &&
         rgb_pixelsize[cinfo->in_color_space] == 3) {
-#if defined(__mips__)
+#if defined(WITH_SIMD) && defined(__mips__)
       if (jsimd_c_can_null_convert())
-        cconvert->pub.color_convert = jsimd_c_null_convert;
+        cconvert->pub._color_convert = jsimd_c_null_convert;
       else
 #endif
-        cconvert->pub.color_convert = null_convert;
-    } else if (cinfo->in_color_space == JCS_RGB ||
-               cinfo->in_color_space == JCS_EXT_RGB ||
-               cinfo->in_color_space == JCS_EXT_RGBX ||
-               cinfo->in_color_space == JCS_EXT_BGR ||
-               cinfo->in_color_space == JCS_EXT_BGRX ||
-               cinfo->in_color_space == JCS_EXT_XBGR ||
-               cinfo->in_color_space == JCS_EXT_XRGB ||
-               cinfo->in_color_space == JCS_EXT_RGBA ||
-               cinfo->in_color_space == JCS_EXT_BGRA ||
-               cinfo->in_color_space == JCS_EXT_ABGR ||
-               cinfo->in_color_space == JCS_EXT_ARGB)
-      cconvert->pub.color_convert = rgb_rgb_convert;
+        cconvert->pub._color_convert = null_convert;
+    } else if (IsExtRGB(cinfo->in_color_space))
+      cconvert->pub._color_convert = rgb_rgb_convert;
     else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
 
   case JCS_YCbCr:
+    if (cinfo->master->lossless &&
+        cinfo->in_color_space != cinfo->jpeg_color_space)
+      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     if (cinfo->num_components != 3)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == JCS_RGB ||
-        cinfo->in_color_space == JCS_EXT_RGB ||
-        cinfo->in_color_space == JCS_EXT_RGBX ||
-        cinfo->in_color_space == JCS_EXT_BGR ||
-        cinfo->in_color_space == JCS_EXT_BGRX ||
-        cinfo->in_color_space == JCS_EXT_XBGR ||
-        cinfo->in_color_space == JCS_EXT_XRGB ||
-        cinfo->in_color_space == JCS_EXT_RGBA ||
-        cinfo->in_color_space == JCS_EXT_BGRA ||
-        cinfo->in_color_space == JCS_EXT_ABGR ||
-        cinfo->in_color_space == JCS_EXT_ARGB) {
+    if (IsExtRGB(cinfo->in_color_space)) {
+#ifdef WITH_SIMD
       if (jsimd_can_rgb_ycc())
-        cconvert->pub.color_convert = jsimd_rgb_ycc_convert;
-      else {
+        cconvert->pub._color_convert = jsimd_rgb_ycc_convert;
+      else
+#endif
+      {
         cconvert->pub.start_pass = rgb_ycc_start;
-        cconvert->pub.color_convert = rgb_ycc_convert;
+        cconvert->pub._color_convert = rgb_ycc_convert;
       }
     } else if (cinfo->in_color_space == JCS_YCbCr) {
-#if defined(__mips__)
+#if defined(WITH_SIMD) && defined(__mips__)
       if (jsimd_c_can_null_convert())
-        cconvert->pub.color_convert = jsimd_c_null_convert;
+        cconvert->pub._color_convert = jsimd_c_null_convert;
       else
 #endif
-        cconvert->pub.color_convert = null_convert;
+        cconvert->pub._color_convert = null_convert;
     } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
 
   case JCS_CMYK:
+    if (cinfo->master->lossless &&
+        cinfo->in_color_space != cinfo->jpeg_color_space)
+      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     if (cinfo->num_components != 4)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
     if (cinfo->in_color_space == JCS_CMYK) {
-#if defined(__mips__)
+#if defined(WITH_SIMD) && defined(__mips__)
       if (jsimd_c_can_null_convert())
-        cconvert->pub.color_convert = jsimd_c_null_convert;
+        cconvert->pub._color_convert = jsimd_c_null_convert;
       else
 #endif
-        cconvert->pub.color_convert = null_convert;
+        cconvert->pub._color_convert = null_convert;
     } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
 
   case JCS_YCCK:
+    if (cinfo->master->lossless &&
+        cinfo->in_color_space != cinfo->jpeg_color_space)
+      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     if (cinfo->num_components != 4)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
     if (cinfo->in_color_space == JCS_CMYK) {
       cconvert->pub.start_pass = rgb_ycc_start;
-      cconvert->pub.color_convert = cmyk_ycck_convert;
+      cconvert->pub._color_convert = cmyk_ycck_convert;
     } else if (cinfo->in_color_space == JCS_YCCK) {
-#if defined(__mips__)
+#if defined(WITH_SIMD) && defined(__mips__)
       if (jsimd_c_can_null_convert())
-        cconvert->pub.color_convert = jsimd_c_null_convert;
+        cconvert->pub._color_convert = jsimd_c_null_convert;
       else
 #endif
-        cconvert->pub.color_convert = null_convert;
+        cconvert->pub._color_convert = null_convert;
     } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
@@ -699,12 +719,14 @@ jinit_color_converter(j_compress_ptr cinfo)
     if (cinfo->jpeg_color_space != cinfo->in_color_space ||
         cinfo->num_components != cinfo->input_components)
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
-#if defined(__mips__)
+#if defined(WITH_SIMD) && defined(__mips__)
     if (jsimd_c_can_null_convert())
-      cconvert->pub.color_convert = jsimd_c_null_convert;
+      cconvert->pub._color_convert = jsimd_c_null_convert;
     else
 #endif
-      cconvert->pub.color_convert = null_convert;
+      cconvert->pub._color_convert = null_convert;
     break;
   }
 }
+
+#endif /* BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED) */
diff --git a/3rdparty/libjpeg-turbo/src/jcdctmgr.c b/3rdparty/libjpeg-turbo/src/jcdctmgr.c
index 7dae17a6e149..7191ee73169c 100644
--- a/3rdparty/libjpeg-turbo/src/jcdctmgr.c
+++ b/3rdparty/libjpeg-turbo/src/jcdctmgr.c
@@ -6,7 +6,7 @@
  * libjpeg-turbo Modifications:
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, 2014-2015, D. R. Commander.
+ * Copyright (C) 2011, 2014-2015, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -28,10 +28,10 @@
 typedef void (*forward_DCT_method_ptr) (DCTELEM *data);
 typedef void (*float_DCT_method_ptr) (FAST_FLOAT *data);
 
-typedef void (*convsamp_method_ptr) (JSAMPARRAY sample_data,
+typedef void (*convsamp_method_ptr) (_JSAMPARRAY sample_data,
                                      JDIMENSION start_col,
                                      DCTELEM *workspace);
-typedef void (*float_convsamp_method_ptr) (JSAMPARRAY sample_data,
+typedef void (*float_convsamp_method_ptr) (_JSAMPARRAY sample_data,
                                            JDIMENSION start_col,
                                            FAST_FLOAT *workspace);
 
@@ -265,9 +265,13 @@ start_pass_fdctmgr(j_compress_ptr cinfo)
       dtbl = fdct->divisors[qtblno];
       for (i = 0; i < DCTSIZE2; i++) {
 #if BITS_IN_JSAMPLE == 8
+#ifdef WITH_SIMD
         if (!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) &&
             fdct->quantize == jsimd_quantize)
           fdct->quantize = quantize;
+#else
+        compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
+#endif
 #else
         dtbl[i] = ((DCTELEM)qtbl->quantval[i]) << 3;
 #endif
@@ -305,12 +309,19 @@ start_pass_fdctmgr(j_compress_ptr cinfo)
         dtbl = fdct->divisors[qtblno];
         for (i = 0; i < DCTSIZE2; i++) {
 #if BITS_IN_JSAMPLE == 8
+#ifdef WITH_SIMD
           if (!compute_reciprocal(
                 DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
                                       (JLONG)aanscales[i]),
                         CONST_BITS - 3), &dtbl[i]) &&
               fdct->quantize == jsimd_quantize)
             fdct->quantize = quantize;
+#else
+          compute_reciprocal(
+            DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+                                  (JLONG)aanscales[i]),
+                    CONST_BITS-3), &dtbl[i]);
+#endif
 #else
           dtbl[i] = (DCTELEM)
             DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
@@ -370,10 +381,10 @@ start_pass_fdctmgr(j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
+convsamp(_JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
 {
   register DCTELEM *workspaceptr;
-  register JSAMPROW elemptr;
+  register _JSAMPROW elemptr;
   register int elemr;
 
   workspaceptr = workspace;
@@ -381,19 +392,19 @@ convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
     elemptr = sample_data[elemr] + start_col;
 
 #if DCTSIZE == 8                /* unroll the inner loop */
-    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
 #else
     {
       register int elemc;
       for (elemc = DCTSIZE; elemc > 0; elemc--)
-        *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+        *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
     }
 #endif
   }
@@ -488,7 +499,7 @@ quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
 
 METHODDEF(void)
 forward_DCT(j_compress_ptr cinfo, jpeg_component_info *compptr,
-            JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+            _JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
             JDIMENSION start_row, JDIMENSION start_col, JDIMENSION num_blocks)
 /* This version is used for integer DCT implementations. */
 {
@@ -522,30 +533,30 @@ forward_DCT(j_compress_ptr cinfo, jpeg_component_info *compptr,
 #ifdef DCT_FLOAT_SUPPORTED
 
 METHODDEF(void)
-convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+convsamp_float(_JSAMPARRAY sample_data, JDIMENSION start_col,
                FAST_FLOAT *workspace)
 {
   register FAST_FLOAT *workspaceptr;
-  register JSAMPROW elemptr;
+  register _JSAMPROW elemptr;
   register int elemr;
 
   workspaceptr = workspace;
   for (elemr = 0; elemr < DCTSIZE; elemr++) {
     elemptr = sample_data[elemr] + start_col;
 #if DCTSIZE == 8                /* unroll the inner loop */
-    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
 #else
     {
       register int elemc;
       for (elemc = DCTSIZE; elemc > 0; elemc--)
-        *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+        *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
     }
 #endif
   }
@@ -577,7 +588,7 @@ quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
 
 METHODDEF(void)
 forward_DCT_float(j_compress_ptr cinfo, jpeg_component_info *compptr,
-                  JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+                  _JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
                   JDIMENSION start_row, JDIMENSION start_col,
                   JDIMENSION num_blocks)
 /* This version is used for floating-point DCT implementations. */
@@ -617,11 +628,14 @@ forward_DCT_float(j_compress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jinit_forward_dct(j_compress_ptr cinfo)
+_jinit_forward_dct(j_compress_ptr cinfo)
 {
   my_fdct_ptr fdct;
   int i;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   fdct = (my_fdct_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_fdct_controller));
@@ -632,28 +646,34 @@ jinit_forward_dct(j_compress_ptr cinfo)
   switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
   case JDCT_ISLOW:
-    fdct->pub.forward_DCT = forward_DCT;
+    fdct->pub._forward_DCT = forward_DCT;
+#ifdef WITH_SIMD
     if (jsimd_can_fdct_islow())
       fdct->dct = jsimd_fdct_islow;
     else
-      fdct->dct = jpeg_fdct_islow;
+#endif
+      fdct->dct = _jpeg_fdct_islow;
     break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
   case JDCT_IFAST:
-    fdct->pub.forward_DCT = forward_DCT;
+    fdct->pub._forward_DCT = forward_DCT;
+#ifdef WITH_SIMD
     if (jsimd_can_fdct_ifast())
       fdct->dct = jsimd_fdct_ifast;
     else
-      fdct->dct = jpeg_fdct_ifast;
+#endif
+      fdct->dct = _jpeg_fdct_ifast;
     break;
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
   case JDCT_FLOAT:
-    fdct->pub.forward_DCT = forward_DCT_float;
+    fdct->pub._forward_DCT = forward_DCT_float;
+#ifdef WITH_SIMD
     if (jsimd_can_fdct_float())
       fdct->float_dct = jsimd_fdct_float;
     else
+#endif
       fdct->float_dct = jpeg_fdct_float;
     break;
 #endif
@@ -671,25 +691,33 @@ jinit_forward_dct(j_compress_ptr cinfo)
   case JDCT_IFAST:
 #endif
 #if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
+#ifdef WITH_SIMD
     if (jsimd_can_convsamp())
       fdct->convsamp = jsimd_convsamp;
     else
+#endif
       fdct->convsamp = convsamp;
+#ifdef WITH_SIMD
     if (jsimd_can_quantize())
       fdct->quantize = jsimd_quantize;
     else
+#endif
       fdct->quantize = quantize;
     break;
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
   case JDCT_FLOAT:
+#ifdef WITH_SIMD
     if (jsimd_can_convsamp_float())
       fdct->float_convsamp = jsimd_convsamp_float;
     else
+#endif
       fdct->float_convsamp = convsamp_float;
+#ifdef WITH_SIMD
     if (jsimd_can_quantize_float())
       fdct->float_quantize = jsimd_quantize_float;
     else
+#endif
       fdct->float_quantize = quantize_float;
     break;
 #endif
diff --git a/3rdparty/libjpeg-turbo/src/jcdiffct.c b/3rdparty/libjpeg-turbo/src/jcdiffct.c
new file mode 100644
index 000000000000..0bae0689191e
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jcdiffct.c
@@ -0,0 +1,411 @@
+/*
+ * jcdiffct.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the difference buffer controller for compression.
+ * This controller is the top level of the lossless JPEG compressor proper.
+ * The difference buffer lies between the prediction/differencing and entropy
+ * encoding steps.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jlossls.h"            /* Private declarations for lossless codec */
+
+
+#ifdef C_LOSSLESS_SUPPORTED
+
+/* We use a full-image sample buffer when doing Huffman optimization,
+ * and also for writing multiple-scan JPEG files.  In all cases, the
+ * full-image buffer is filled during the first pass, and the scaling,
+ * prediction and differencing steps are run during subsequent passes.
+ */
+#ifdef ENTROPY_OPT_SUPPORTED
+#define FULL_SAMP_BUFFER_SUPPORTED
+#else
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+#define FULL_SAMP_BUFFER_SUPPORTED
+#endif
+#endif
+
+
+/* Private buffer controller object */
+
+typedef struct {
+  struct jpeg_c_coef_controller pub; /* public fields */
+
+  JDIMENSION iMCU_row_num;      /* iMCU row # within image */
+  JDIMENSION mcu_ctr;           /* counts MCUs processed in current row */
+  int MCU_vert_offset;          /* counts MCU rows within iMCU row */
+  int MCU_rows_per_iMCU_row;    /* number of such rows needed */
+
+  _JSAMPROW cur_row[MAX_COMPONENTS];    /* row of point-transformed samples */
+  _JSAMPROW prev_row[MAX_COMPONENTS];   /* previous row of Pt'd samples */
+  JDIFFARRAY diff_buf[MAX_COMPONENTS];  /* iMCU row of differences */
+
+  /* In multi-pass modes, we need a virtual sample array for each component. */
+  jvirt_sarray_ptr whole_image[MAX_COMPONENTS];
+} my_diff_controller;
+
+typedef my_diff_controller *my_diff_ptr;
+
+
+/* Forward declarations */
+METHODDEF(boolean) compress_data(j_compress_ptr cinfo, _JSAMPIMAGE input_buf);
+#ifdef FULL_SAMP_BUFFER_SUPPORTED
+METHODDEF(boolean) compress_first_pass(j_compress_ptr cinfo,
+                                       _JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_output(j_compress_ptr cinfo,
+                                   _JSAMPIMAGE input_buf);
+#endif
+
+
+LOCAL(void)
+start_iMCU_row(j_compress_ptr cinfo)
+/* Reset within-iMCU-row counters for a new row */
+{
+  my_diff_ptr diff = (my_diff_ptr)cinfo->coef;
+
+  /* In an interleaved scan, an MCU row is the same as an iMCU row.
+   * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
+   * But at the bottom of the image, process only what's left.
+   */
+  if (cinfo->comps_in_scan > 1) {
+    diff->MCU_rows_per_iMCU_row = 1;
+  } else {
+    if (diff->iMCU_row_num < (cinfo->total_iMCU_rows-1))
+      diff->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
+    else
+      diff->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
+  }
+
+  diff->mcu_ctr = 0;
+  diff->MCU_vert_offset = 0;
+}
+
+
+/*
+ * Initialize for a processing pass.
+ */
+
+METHODDEF(void)
+start_pass_diff(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+{
+  my_diff_ptr diff = (my_diff_ptr)cinfo->coef;
+
+  /* Because it is hitching a ride on the jpeg_forward_dct struct,
+   * start_pass_lossless() will be called at the start of the initial pass.
+   * This ensures that it will be called at the start of the Huffman
+   * optimization and output passes as well.
+   */
+  if (pass_mode == JBUF_CRANK_DEST)
+    (*cinfo->fdct->start_pass) (cinfo);
+
+  diff->iMCU_row_num = 0;
+  start_iMCU_row(cinfo);
+
+  switch (pass_mode) {
+  case JBUF_PASS_THRU:
+    if (diff->whole_image[0] != NULL)
+      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+    diff->pub._compress_data = compress_data;
+    break;
+#ifdef FULL_SAMP_BUFFER_SUPPORTED
+  case JBUF_SAVE_AND_PASS:
+    if (diff->whole_image[0] == NULL)
+      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+    diff->pub._compress_data = compress_first_pass;
+    break;
+  case JBUF_CRANK_DEST:
+    if (diff->whole_image[0] == NULL)
+      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+    diff->pub._compress_data = compress_output;
+    break;
+#endif
+  default:
+    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+    break;
+  }
+}
+
+
+#define SWAP_ROWS(rowa, rowb) { \
+  _JSAMPROW temp = rowa; \
+  rowa = rowb;  rowb = temp; \
+}
+
+/*
+ * Process some data in the single-pass case.
+ * We process the equivalent of one fully interleaved MCU row ("iMCU" row)
+ * per call, ie, v_samp_factor rows for each component in the image.
+ * Returns TRUE if the iMCU row is completed, FALSE if suspended.
+ *
+ * NB: input_buf contains a plane for each component in image,
+ * which we index according to the component's SOF position.
+ */
+
+METHODDEF(boolean)
+compress_data(j_compress_ptr cinfo, _JSAMPIMAGE input_buf)
+{
+  my_diff_ptr diff = (my_diff_ptr)cinfo->coef;
+  lossless_comp_ptr losslessc = (lossless_comp_ptr)cinfo->fdct;
+  JDIMENSION MCU_col_num;       /* index of current MCU within row */
+  JDIMENSION MCU_count;         /* number of MCUs encoded */
+  JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
+  int ci, compi, yoffset, samp_row, samp_rows, samps_across;
+  jpeg_component_info *compptr;
+
+  /* Loop to write as much as one whole iMCU row */
+  for (yoffset = diff->MCU_vert_offset; yoffset < diff->MCU_rows_per_iMCU_row;
+       yoffset++) {
+
+    MCU_col_num = diff->mcu_ctr;
+
+    /* Scale and predict each scanline of the MCU row separately.
+     *
+     * Note: We only do this if we are at the start of an MCU row, ie,
+     * we don't want to reprocess a row suspended by the output.
+     */
+    if (MCU_col_num == 0) {
+      for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+        compptr = cinfo->cur_comp_info[ci];
+        compi = compptr->component_index;
+        if (diff->iMCU_row_num < last_iMCU_row)
+          samp_rows = compptr->v_samp_factor;
+        else {
+          /* NB: can't use last_row_height here, since may not be set! */
+          samp_rows =
+            (int)(compptr->height_in_blocks % compptr->v_samp_factor);
+          if (samp_rows == 0) samp_rows = compptr->v_samp_factor;
+          else {
+            /* Fill dummy difference rows at the bottom edge with zeros, which
+             * will encode to the smallest amount of data.
+             */
+            for (samp_row = samp_rows; samp_row < compptr->v_samp_factor;
+                 samp_row++)
+              memset(diff->diff_buf[compi][samp_row], 0,
+                     jround_up((long)compptr->width_in_blocks,
+                               (long)compptr->h_samp_factor) * sizeof(JDIFF));
+          }
+        }
+        samps_across = compptr->width_in_blocks;
+
+        for (samp_row = 0; samp_row < samp_rows; samp_row++) {
+          (*losslessc->scaler_scale) (cinfo,
+                                      input_buf[compi][samp_row],
+                                      diff->cur_row[compi],
+                                      samps_across);
+          (*losslessc->predict_difference[compi])
+            (cinfo, compi, diff->cur_row[compi], diff->prev_row[compi],
+             diff->diff_buf[compi][samp_row], samps_across);
+          SWAP_ROWS(diff->cur_row[compi], diff->prev_row[compi]);
+        }
+      }
+    }
+    /* Try to write the MCU row (or remaining portion of suspended MCU row). */
+    MCU_count =
+      (*cinfo->entropy->encode_mcus) (cinfo,
+                                      diff->diff_buf, yoffset, MCU_col_num,
+                                      cinfo->MCUs_per_row - MCU_col_num);
+    if (MCU_count != cinfo->MCUs_per_row - MCU_col_num) {
+      /* Suspension forced; update state counters and exit */
+      diff->MCU_vert_offset = yoffset;
+      diff->mcu_ctr += MCU_col_num;
+      return FALSE;
+    }
+    /* Completed an MCU row, but perhaps not an iMCU row */
+    diff->mcu_ctr = 0;
+  }
+  /* Completed the iMCU row, advance counters for next one */
+  diff->iMCU_row_num++;
+  start_iMCU_row(cinfo);
+  return TRUE;
+}
+
+
+#ifdef FULL_SAMP_BUFFER_SUPPORTED
+
+/*
+ * Process some data in the first pass of a multi-pass case.
+ * We process the equivalent of one fully interleaved MCU row ("iMCU" row)
+ * per call, ie, v_samp_factor rows for each component in the image.
+ * This amount of data is read from the source buffer and saved into the
+ * virtual arrays.
+ *
+ * We must also emit the data to the compressor.  This is conveniently
+ * done by calling compress_output() after we've loaded the current strip
+ * of the virtual arrays.
+ *
+ * NB: input_buf contains a plane for each component in image.  All components
+ * are loaded into the virtual arrays in this pass.  However, it may be that
+ * only a subset of the components are emitted to the compressor during
+ * this first pass; be careful about looking at the scan-dependent variables
+ * (MCU dimensions, etc).
+ */
+
+METHODDEF(boolean)
+compress_first_pass(j_compress_ptr cinfo, _JSAMPIMAGE input_buf)
+{
+  my_diff_ptr diff = (my_diff_ptr)cinfo->coef;
+  JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
+  JDIMENSION samps_across;
+  int ci, samp_row, samp_rows;
+  _JSAMPARRAY buffer;
+  jpeg_component_info *compptr;
+
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    /* Align the virtual buffer for this component. */
+    buffer = (_JSAMPARRAY)(*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, diff->whole_image[ci],
+       diff->iMCU_row_num * compptr->v_samp_factor,
+       (JDIMENSION)compptr->v_samp_factor, TRUE);
+
+    /* Count non-dummy sample rows in this iMCU row. */
+    if (diff->iMCU_row_num < last_iMCU_row)
+      samp_rows = compptr->v_samp_factor;
+    else {
+      /* NB: can't use last_row_height here, since may not be set! */
+      samp_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
+      if (samp_rows == 0) samp_rows = compptr->v_samp_factor;
+    }
+    samps_across = compptr->width_in_blocks;
+
+    /* Perform point transform scaling and prediction/differencing for all
+     * non-dummy rows in this iMCU row.  Each call on these functions
+     * processes a complete row of samples.
+     */
+    for (samp_row = 0; samp_row < samp_rows; samp_row++) {
+      memcpy(buffer[samp_row], input_buf[ci][samp_row],
+             samps_across * sizeof(_JSAMPLE));
+    }
+  }
+  /* NB: compress_output will increment iMCU_row_num if successful.
+   * A suspension return will result in redoing all the work above next time.
+   */
+
+  /* Emit data to the compressor, sharing code with subsequent passes */
+  return compress_output(cinfo, input_buf);
+}
+
+
+/*
+ * Process some data in subsequent passes of a multi-pass case.
+ * We process the equivalent of one fully interleaved MCU row ("iMCU" row)
+ * per call, ie, v_samp_factor rows for each component in the scan.
+ * The data is obtained from the virtual arrays and fed to the compressor.
+ * Returns TRUE if the iMCU row is completed, FALSE if suspended.
+ *
+ * NB: input_buf is ignored; it is likely to be a NULL pointer.
+ */
+
+METHODDEF(boolean)
+compress_output(j_compress_ptr cinfo, _JSAMPIMAGE input_buf)
+{
+  my_diff_ptr diff = (my_diff_ptr)cinfo->coef;
+  int ci, compi;
+  _JSAMPARRAY buffer[MAX_COMPS_IN_SCAN];
+  jpeg_component_info *compptr;
+
+  /* Align the virtual buffers for the components used in this scan.
+   * NB: during first pass, this is safe only because the buffers will
+   * already be aligned properly, so jmemmgr.c won't need to do any I/O.
+   */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    compi = compptr->component_index;
+    buffer[compi] = (_JSAMPARRAY)(*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, diff->whole_image[compi],
+       diff->iMCU_row_num * compptr->v_samp_factor,
+       (JDIMENSION)compptr->v_samp_factor, FALSE);
+  }
+
+  return compress_data(cinfo, buffer);
+}
+
+#endif /* FULL_SAMP_BUFFER_SUPPORTED */
+
+
+/*
+ * Initialize difference buffer controller.
+ */
+
+GLOBAL(void)
+_jinit_c_diff_controller(j_compress_ptr cinfo, boolean need_full_buffer)
+{
+  my_diff_ptr diff;
+  int ci, row;
+  jpeg_component_info *compptr;
+
+  diff = (my_diff_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(my_diff_controller));
+  cinfo->coef = (struct jpeg_c_coef_controller *)diff;
+  diff->pub.start_pass = start_pass_diff;
+
+  /* Create the prediction row buffers. */
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    diff->cur_row[ci] = *(_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+                             (long)compptr->h_samp_factor),
+       (JDIMENSION)1);
+    diff->prev_row[ci] = *(_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+                             (long)compptr->h_samp_factor),
+       (JDIMENSION)1);
+  }
+
+  /* Create the difference buffer. */
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    diff->diff_buf[ci] =
+      ALLOC_DARRAY(JPOOL_IMAGE,
+                   (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+                                         (long)compptr->h_samp_factor),
+                   (JDIMENSION)compptr->v_samp_factor);
+    /* Prefill difference rows with zeros.  We do this because only actual
+     * data is placed in the buffers during prediction/differencing, leaving
+     * any dummy differences at the right edge as zeros, which will encode
+     * to the smallest amount of data.
+     */
+    for (row = 0; row < compptr->v_samp_factor; row++)
+      memset(diff->diff_buf[ci][row], 0,
+             jround_up((long)compptr->width_in_blocks,
+                       (long)compptr->h_samp_factor) * sizeof(JDIFF));
+  }
+
+  /* Create the sample buffer. */
+  if (need_full_buffer) {
+#ifdef FULL_SAMP_BUFFER_SUPPORTED
+    /* Allocate a full-image virtual array for each component, */
+    /* padded to a multiple of samp_factor differences in each direction. */
+    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+         ci++, compptr++) {
+      diff->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
+        ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+         (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+                               (long)compptr->h_samp_factor),
+         (JDIMENSION)jround_up((long)compptr->height_in_blocks,
+                               (long)compptr->v_samp_factor),
+         (JDIMENSION)compptr->v_samp_factor);
+    }
+#else
+    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
+#endif
+  } else
+    diff->whole_image[0] = NULL; /* flag for no virtual arrays */
+}
+
+#endif /* C_LOSSLESS_SUPPORTED */
diff --git a/3rdparty/libjpeg-turbo/src/jchuff.c b/3rdparty/libjpeg-turbo/src/jchuff.c
index f4dfa1cb5403..488c9b5c3a7f 100644
--- a/3rdparty/libjpeg-turbo/src/jchuff.c
+++ b/3rdparty/libjpeg-turbo/src/jchuff.c
@@ -3,11 +3,14 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2014-2016, 2018-2022, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014-2016, 2018-2024, D. R. Commander.
  * Copyright (C) 2015, Matthieu Darbois.
  * Copyright (C) 2018, Matthias Räncker.
  * Copyright (C) 2020, Arm Limited.
+ * Copyright (C) 2022, Felix Hanau.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -26,44 +29,13 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#ifdef WITH_SIMD
 #include "jsimd.h"
-#include "jconfigint.h"
-#include <limits.h>
-
-/*
- * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
- * used for bit counting rather than the lookup table.  This will reduce the
- * memory footprint by 64k, which is important for some mobile applications
- * that create many isolated instances of libjpeg-turbo (web browsers, for
- * instance.)  This may improve performance on some mobile platforms as well.
- * This feature is enabled by default only on Arm processors, because some x86
- * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
- * shown to have a significant performance impact even on the x86 chips that
- * have a fast implementation of it.  When building for Armv6, you can
- * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
- * flags (this defines __thumb__).
- */
-
-/* NOTE: Both GCC and Clang define __GNUC__ */
-#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
-    defined(_M_ARM) || defined(_M_ARM64)
-#if !defined(__thumb__) || defined(__thumb2__)
-#define USE_CLZ_INTRINSIC
-#endif
-#endif
-
-#ifdef USE_CLZ_INTRINSIC
-#if defined(_MSC_VER) && !defined(__clang__)
-#define JPEG_NBITS_NONZERO(x)  (32 - _CountLeadingZeros(x))
-#else
-#define JPEG_NBITS_NONZERO(x)  (32 - __builtin_clz(x))
-#endif
-#define JPEG_NBITS(x)          (x ? JPEG_NBITS_NONZERO(x) : 0)
 #else
-#include "jpeg_nbits_table.h"
-#define JPEG_NBITS(x)          (jpeg_nbits_table[x])
-#define JPEG_NBITS_NONZERO(x)  JPEG_NBITS(x)
+#include "jchuff.h"             /* Declarations shared with jc*huff.c */
 #endif
+#include <limits.h>
+#include "jpeg_nbits.h"
 
 
 /* Expanded entropy encoder object for Huffman encoding.
@@ -102,7 +74,9 @@ typedef bit_buf_type simd_bit_buf_type;
 typedef struct {
   union {
     bit_buf_type c;
+#ifdef WITH_SIMD
     simd_bit_buf_type simd;
+#endif
   } put_buffer;                         /* current bit accumulation buffer */
   int free_bits;                        /* # of bits available in it */
                                         /* (Neon GAS: # of bits now in it) */
@@ -127,7 +101,9 @@ typedef struct {
   long *ac_count_ptrs[NUM_HUFF_TBLS];
 #endif
 
+#ifdef WITH_SIMD
   int simd;
+#endif
 } huff_entropy_encoder;
 
 typedef huff_entropy_encoder *huff_entropy_ptr;
@@ -141,7 +117,9 @@ typedef struct {
   size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
   savable_state cur;            /* Current bit buffer & DC state */
   j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+#ifdef WITH_SIMD
   int simd;
+#endif
 } working_state;
 
 
@@ -180,7 +158,9 @@ start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
     entropy->pub.finish_pass = finish_pass_huff;
   }
 
+#ifdef WITH_SIMD
   entropy->simd = jsimd_can_huff_encode_one_block();
+#endif
 
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
@@ -220,6 +200,7 @@ start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
   }
 
   /* Initialize bit buffer to empty */
+#ifdef WITH_SIMD
   if (entropy->simd) {
     entropy->saved.put_buffer.simd = 0;
 #if defined(__aarch64__) && !defined(NEON_INTRINSICS)
@@ -227,7 +208,9 @@ start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
 #else
     entropy->saved.free_bits = SIMD_BIT_BUF_SIZE;
 #endif
-  } else {
+  } else
+#endif
+  {
     entropy->saved.put_buffer.c = 0;
     entropy->saved.free_bits = BIT_BUF_SIZE;
   }
@@ -242,7 +225,7 @@ start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
  * Compute the derived values for a Huffman table.
  * This routine also performs some validation checks on the table.
  *
- * Note this is also used by jcphuff.c.
+ * Note this is also used by jcphuff.c and jclhuff.c.
  */
 
 GLOBAL(void)
@@ -318,12 +301,12 @@ jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno,
   memset(dtbl->ehufco, 0, sizeof(dtbl->ehufco));
   memset(dtbl->ehufsi, 0, sizeof(dtbl->ehufsi));
 
-  /* This is also a convenient place to check for out-of-range
-   * and duplicated VAL entries.  We allow 0..255 for AC symbols
-   * but only 0..15 for DC.  (We could constrain them further
-   * based on data depth and mode, but this seems enough.)
+  /* This is also a convenient place to check for out-of-range and duplicated
+   * VAL entries.  We allow 0..255 for AC symbols but only 0..15 for DC in
+   * lossy mode and 0..16 for DC in lossless mode.  (We could constrain them
+   * further based on data depth and mode, but this seems enough.)
    */
-  maxsymbol = isDC ? 15 : 255;
+  maxsymbol = isDC ? (cinfo->master->lossless ? 16 : 15) : 255;
 
   for (p = 0; p < lastp; p++) {
     i = htbl->huffval[p];
@@ -500,6 +483,7 @@ flush_bits(working_state *state)
   simd_bit_buf_type put_buffer;  int put_bits;
   int localbuf = 0;
 
+#ifdef WITH_SIMD
   if (state->simd) {
 #if defined(__aarch64__) && !defined(NEON_INTRINSICS)
     put_bits = state->cur.free_bits;
@@ -507,7 +491,9 @@ flush_bits(working_state *state)
     put_bits = SIMD_BIT_BUF_SIZE - state->cur.free_bits;
 #endif
     put_buffer = state->cur.put_buffer.simd;
-  } else {
+  } else
+#endif
+  {
     put_bits = BIT_BUF_SIZE - state->cur.free_bits;
     put_buffer = state->cur.put_buffer.c;
   }
@@ -525,6 +511,7 @@ flush_bits(working_state *state)
     EMIT_BYTE(temp)
   }
 
+#ifdef WITH_SIMD
   if (state->simd) {                    /* and reset bit buffer to empty */
     state->cur.put_buffer.simd = 0;
 #if defined(__aarch64__) && !defined(NEON_INTRINSICS)
@@ -532,7 +519,9 @@ flush_bits(working_state *state)
 #else
     state->cur.free_bits = SIMD_BIT_BUF_SIZE;
 #endif
-  } else {
+  } else
+#endif
+  {
     state->cur.put_buffer.c = 0;
     state->cur.free_bits = BIT_BUF_SIZE;
   }
@@ -542,6 +531,8 @@ flush_bits(working_state *state)
 }
 
 
+#ifdef WITH_SIMD
+
 /* Encode a single block's worth of coefficients */
 
 LOCAL(boolean)
@@ -561,6 +552,8 @@ encode_one_block_simd(working_state *state, JCOEFPTR block, int last_dc_val,
   return TRUE;
 }
 
+#endif
+
 LOCAL(boolean)
 encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
@@ -569,6 +562,7 @@ encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
   bit_buf_type put_buffer;
   JOCTET _buffer[BUFSIZE], *buffer;
   int localbuf = 0;
+  int max_coef_bits = state->cinfo->data_precision + 2;
 
   free_bits = state->cur.free_bits;
   put_buffer = state->cur.put_buffer.c;
@@ -589,6 +583,11 @@ encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
 
   /* Find the number of bits needed for the magnitude of the coefficient */
   nbits = JPEG_NBITS(nbits);
+  /* Check for out-of-range coefficient values.
+   * Since we're encoding a difference, the range limit is twice as much.
+   */
+  if (nbits > max_coef_bits + 1)
+    ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
 
   /* Emit the Huffman-coded symbol for the number of bits.
    * Emit that number of bits of the value, if positive,
@@ -614,6 +613,9 @@ encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
     temp += nbits; \
     nbits ^= temp; \
     nbits = JPEG_NBITS_NONZERO(nbits); \
+    /* Check for out-of-range coefficient values */ \
+    if (nbits > max_coef_bits) \
+      ERREXIT(state->cinfo, JERR_BAD_DCT_COEF); \
     /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
     while (r >= 16 * 16) { \
       r -= 16 * 16; \
@@ -695,7 +697,9 @@ encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   state.free_in_buffer = cinfo->dest->free_in_buffer;
   state.cur = entropy->saved;
   state.cinfo = cinfo;
+#ifdef WITH_SIMD
   state.simd = entropy->simd;
+#endif
 
   /* Emit restart marker if needed */
   if (cinfo->restart_interval) {
@@ -705,6 +709,7 @@ encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   }
 
   /* Encode the MCU data blocks */
+#ifdef WITH_SIMD
   if (entropy->simd) {
     for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
       ci = cinfo->MCU_membership[blkn];
@@ -717,7 +722,9 @@ encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
       /* Update last_dc_val */
       state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
     }
-  } else {
+  } else
+#endif
+  {
     for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
       ci = cinfo->MCU_membership[blkn];
       compptr = cinfo->cur_comp_info[ci];
@@ -765,7 +772,9 @@ finish_pass_huff(j_compress_ptr cinfo)
   state.free_in_buffer = cinfo->dest->free_in_buffer;
   state.cur = entropy->saved;
   state.cinfo = cinfo;
+#ifdef WITH_SIMD
   state.simd = entropy->simd;
+#endif
 
   /* Flush out the last data */
   if (!flush_bits(&state))
@@ -801,6 +810,7 @@ htest_one_block(j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
   register int temp;
   register int nbits;
   register int k, r;
+  int max_coef_bits = cinfo->data_precision + 2;
 
   /* Encode the DC coefficient difference per section F.1.2.1 */
 
@@ -817,7 +827,7 @@ htest_one_block(j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
   /* Check for out-of-range coefficient values.
    * Since we're encoding a difference, the range limit is twice as much.
    */
-  if (nbits > MAX_COEF_BITS + 1)
+  if (nbits > max_coef_bits + 1)
     ERREXIT(cinfo, JERR_BAD_DCT_COEF);
 
   /* Count the Huffman symbol for the number of bits */
@@ -846,7 +856,7 @@ htest_one_block(j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
       while ((temp >>= 1))
         nbits++;
       /* Check for out-of-range coefficient values */
-      if (nbits > MAX_COEF_BITS)
+      if (nbits > max_coef_bits)
         ERREXIT(cinfo, JERR_BAD_DCT_COEF);
 
       /* Count Huffman symbol for run length / number of bits */
@@ -901,7 +911,7 @@ encode_mcu_gather(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 
 /*
  * Generate the best Huffman code table for the given counts, fill htbl.
- * Note this is also used by jcphuff.c.
+ * Note this is also used by jcphuff.c and jclhuff.c.
  *
  * The JPEG standard requires that no symbol be assigned a codeword of all
  * one bits (so that padding bits added at the end of a compressed segment
@@ -933,11 +943,15 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
 {
 #define MAX_CLEN  32            /* assumed maximum initial code length */
   UINT8 bits[MAX_CLEN + 1];     /* bits[k] = # of symbols with code length k */
+  int bit_pos[MAX_CLEN + 1];    /* # of symbols with smaller code length */
   int codesize[257];            /* codesize[k] = code length of symbol k */
+  int nz_index[257];            /* index of nonzero symbol in the original freq
+                                   array */
   int others[257];              /* next symbol in current branch of tree */
   int c1, c2;
   int p, i, j;
-  long v;
+  int num_nz_symbols;
+  long v, v2;
 
   /* This algorithm is explained in section K.2 of the JPEG standard */
 
@@ -952,28 +966,41 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
    * will be placed last in the largest codeword category.
    */
 
+  /* Group nonzero frequencies together so we can more easily find the
+   * smallest.
+   */
+  num_nz_symbols = 0;
+  for (i = 0; i < 257; i++) {
+    if (freq[i]) {
+      nz_index[num_nz_symbols] = i;
+      freq[num_nz_symbols] = freq[i];
+      num_nz_symbols++;
+    }
+  }
+
   /* Huffman's basic algorithm to assign optimal code lengths to symbols */
 
   for (;;) {
-    /* Find the smallest nonzero frequency, set c1 = its symbol */
-    /* In case of ties, take the larger symbol number */
+    /* Find the two smallest nonzero frequencies; set c1, c2 = their symbols */
+    /* In case of ties, take the larger symbol number.  Since we have grouped
+     * the nonzero symbols together, checking for zero symbols is not
+     * necessary.
+     */
     c1 = -1;
-    v = 1000000000L;
-    for (i = 0; i <= 256; i++) {
-      if (freq[i] && freq[i] <= v) {
-        v = freq[i];
-        c1 = i;
-      }
-    }
-
-    /* Find the next smallest nonzero frequency, set c2 = its symbol */
-    /* In case of ties, take the larger symbol number */
     c2 = -1;
     v = 1000000000L;
-    for (i = 0; i <= 256; i++) {
-      if (freq[i] && freq[i] <= v && i != c1) {
-        v = freq[i];
-        c2 = i;
+    v2 = 1000000000L;
+    for (i = 0; i < num_nz_symbols; i++) {
+      if (freq[i] <= v2) {
+        if (freq[i] <= v) {
+          c2 = c1;
+          v2 = v;
+          v = freq[i];
+          c1 = i;
+        } else {
+          v2 = freq[i];
+          c2 = i;
+        }
       }
     }
 
@@ -983,7 +1010,10 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
 
     /* Else merge the two counts/trees */
     freq[c1] += freq[c2];
-    freq[c2] = 0;
+    /* Set the frequency to a very high value instead of zero, so we don't have
+     * to check for zero values.
+     */
+    freq[c2] = 1000000001L;
 
     /* Increment the codesize of everything in c1's tree branch */
     codesize[c1]++;
@@ -1003,15 +1033,24 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
   }
 
   /* Now count the number of symbols of each code length */
-  for (i = 0; i <= 256; i++) {
-    if (codesize[i]) {
-      /* The JPEG standard seems to think that this can't happen, */
-      /* but I'm paranoid... */
-      if (codesize[i] > MAX_CLEN)
-        ERREXIT(cinfo, JERR_HUFF_CLEN_OVERFLOW);
-
-      bits[codesize[i]]++;
-    }
+  for (i = 0; i < num_nz_symbols; i++) {
+    /* The JPEG standard seems to think that this can't happen, */
+    /* but I'm paranoid... */
+    if (codesize[i] > MAX_CLEN)
+      ERREXIT(cinfo, JERR_HUFF_CLEN_OVERFLOW);
+
+    bits[codesize[i]]++;
+  }
+
+  /* Count the number of symbols with a length smaller than i bits, so we can
+   * construct the symbol table more efficiently.  Note that this includes the
+   * pseudo-symbol 256, but since it is the last symbol, it will not affect the
+   * table.
+   */
+  p = 0;
+  for (i = 1; i <= MAX_CLEN; i++) {
+    bit_pos[i] = p;
+    p += bits[i];
   }
 
   /* JPEG doesn't allow symbols with code lengths over 16 bits, so if the pure
@@ -1051,14 +1090,9 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
    * changes made above, but Rec. ITU-T T.81 | ISO/IEC 10918-1 seems to think
    * this works.
    */
-  p = 0;
-  for (i = 1; i <= MAX_CLEN; i++) {
-    for (j = 0; j <= 255; j++) {
-      if (codesize[j] == i) {
-        htbl->huffval[p] = (UINT8)j;
-        p++;
-      }
-    }
+  for (i = 0; i < num_nz_symbols - 1; i++) {
+    htbl->huffval[bit_pos[codesize[i]]] = (UINT8)nz_index[i];
+    bit_pos[codesize[i]]++;
   }
 
   /* Set sent_table FALSE so updated table will be written to JPEG file. */
diff --git a/3rdparty/libjpeg-turbo/src/jchuff.h b/3rdparty/libjpeg-turbo/src/jchuff.h
index 314a2325c9e5..21f17b89b098 100644
--- a/3rdparty/libjpeg-turbo/src/jchuff.h
+++ b/3rdparty/libjpeg-turbo/src/jchuff.h
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -19,11 +19,13 @@
  * Hence the magnitude should always fit in 10 or 14 bits respectively.
  */
 
-#if BITS_IN_JSAMPLE == 8
-#define MAX_COEF_BITS  10
-#else
-#define MAX_COEF_BITS  14
-#endif
+/* The progressive Huffman encoder uses an unsigned 16-bit data type to store
+ * absolute values of coefficients, because it is possible to inject a
+ * coefficient value of -32768 into the encoder by attempting to transform a
+ * malformed 12-bit JPEG image, and the absolute value of -32768 would overflow
+ * a signed 16-bit integer.
+ */
+typedef unsigned short UJCOEF;
 
 /* Derived data constructed for each Huffman table */
 
diff --git a/3rdparty/libjpeg-turbo/src/jcinit.c b/3rdparty/libjpeg-turbo/src/jcinit.c
index 157353a22e9d..fe8a13a8d98b 100644
--- a/3rdparty/libjpeg-turbo/src/jcinit.c
+++ b/3rdparty/libjpeg-turbo/src/jcinit.c
@@ -3,8 +3,10 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2020, D. R. Commander.
+ * Copyright (C) 2020, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -21,7 +23,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jpegcomp.h"
+#include "jpegapicomp.h"
 
 
 /*
@@ -38,34 +40,101 @@ jinit_compress_master(j_compress_ptr cinfo)
 
   /* Preprocessing */
   if (!cinfo->raw_data_in) {
-    jinit_color_converter(cinfo);
-    jinit_downsampler(cinfo);
-    jinit_c_prep_controller(cinfo, FALSE /* never need full buffer here */);
+    if (cinfo->data_precision == 16) {
+#ifdef C_LOSSLESS_SUPPORTED
+      j16init_color_converter(cinfo);
+      j16init_downsampler(cinfo);
+      j16init_c_prep_controller(cinfo,
+                                FALSE /* never need full buffer here */);
+#else
+      ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+#endif
+    } else if (cinfo->data_precision == 12) {
+      j12init_color_converter(cinfo);
+      j12init_downsampler(cinfo);
+      j12init_c_prep_controller(cinfo,
+                                FALSE /* never need full buffer here */);
+    } else {
+      jinit_color_converter(cinfo);
+      jinit_downsampler(cinfo);
+      jinit_c_prep_controller(cinfo, FALSE /* never need full buffer here */);
+    }
   }
-  /* Forward DCT */
-  jinit_forward_dct(cinfo);
-  /* Entropy encoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-#ifdef C_ARITH_CODING_SUPPORTED
-    jinit_arith_encoder(cinfo);
+
+  if (cinfo->master->lossless) {
+#ifdef C_LOSSLESS_SUPPORTED
+    /* Prediction, sample differencing, and point transform */
+    if (cinfo->data_precision == 16)
+      j16init_lossless_compressor(cinfo);
+    else if (cinfo->data_precision == 12)
+      j12init_lossless_compressor(cinfo);
+    else
+      jinit_lossless_compressor(cinfo);
+    /* Entropy encoding: either Huffman or arithmetic coding. */
+    if (cinfo->arith_code) {
+      ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+    } else {
+      jinit_lhuff_encoder(cinfo);
+    }
+
+    /* Need a full-image difference buffer in any multi-pass mode. */
+    if (cinfo->data_precision == 16)
+      j16init_c_diff_controller(cinfo, (boolean)(cinfo->num_scans > 1 ||
+                                                 cinfo->optimize_coding));
+    else if (cinfo->data_precision == 12)
+      j12init_c_diff_controller(cinfo, (boolean)(cinfo->num_scans > 1 ||
+                                                 cinfo->optimize_coding));
+    else
+      jinit_c_diff_controller(cinfo, (boolean)(cinfo->num_scans > 1 ||
+                                               cinfo->optimize_coding));
 #else
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
   } else {
-    if (cinfo->progressive_mode) {
+    if (cinfo->data_precision == 16)
+      ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+    /* Forward DCT */
+    if (cinfo->data_precision == 12)
+      j12init_forward_dct(cinfo);
+    else
+      jinit_forward_dct(cinfo);
+    /* Entropy encoding: either Huffman or arithmetic coding. */
+    if (cinfo->arith_code) {
+#ifdef C_ARITH_CODING_SUPPORTED
+      jinit_arith_encoder(cinfo);
+#else
+      ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+#endif
+    } else {
+      if (cinfo->progressive_mode) {
 #ifdef C_PROGRESSIVE_SUPPORTED
-      jinit_phuff_encoder(cinfo);
+        jinit_phuff_encoder(cinfo);
 #else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
+        ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
-    } else
-      jinit_huff_encoder(cinfo);
+      } else
+        jinit_huff_encoder(cinfo);
+    }
+
+    /* Need a full-image coefficient buffer in any multi-pass mode. */
+    if (cinfo->data_precision == 12)
+      j12init_c_coef_controller(cinfo, (boolean)(cinfo->num_scans > 1 ||
+                                                 cinfo->optimize_coding));
+    else
+      jinit_c_coef_controller(cinfo, (boolean)(cinfo->num_scans > 1 ||
+                                               cinfo->optimize_coding));
   }
 
-  /* Need a full-image coefficient buffer in any multi-pass mode. */
-  jinit_c_coef_controller(cinfo, (boolean)(cinfo->num_scans > 1 ||
-                                           cinfo->optimize_coding));
-  jinit_c_main_controller(cinfo, FALSE /* never need full buffer here */);
+  if (cinfo->data_precision == 16)
+#ifdef C_LOSSLESS_SUPPORTED
+    j16init_c_main_controller(cinfo, FALSE /* never need full buffer here */);
+#else
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+#endif
+  else if (cinfo->data_precision == 12)
+    j12init_c_main_controller(cinfo, FALSE /* never need full buffer here */);
+  else
+    jinit_c_main_controller(cinfo, FALSE /* never need full buffer here */);
 
   jinit_marker_writer(cinfo);
 
diff --git a/3rdparty/libjpeg-turbo/src/jclhuff.c b/3rdparty/libjpeg-turbo/src/jclhuff.c
new file mode 100644
index 000000000000..ae4154532edf
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jclhuff.c
@@ -0,0 +1,587 @@
+/*
+ * jclhuff.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains Huffman entropy encoding routines for lossless JPEG.
+ *
+ * Much of the complexity here has to do with supporting output suspension.
+ * If the data destination module demands suspension, we want to be able to
+ * back up to the start of the current MCU.  To do this, we copy state
+ * variables into local working storage, and update them back to the
+ * permanent JPEG objects only upon successful completion of an MCU.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jlossls.h"            /* Private declarations for lossless codec */
+#include "jchuff.h"             /* Declarations shared with jc*huff.c */
+
+
+#ifdef C_LOSSLESS_SUPPORTED
+
+/* The legal range of a spatial difference is
+ * -32767 .. +32768.
+ * Hence the magnitude should always fit in 16 bits.
+ */
+
+#define MAX_DIFF_BITS  16
+
+
+/* Expanded entropy encoder object for Huffman encoding in lossless mode.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+typedef struct {
+  size_t put_buffer;            /* current bit-accumulation buffer */
+  int put_bits;                 /* # of bits now in it */
+} savable_state;
+
+
+typedef struct {
+  int ci, yoffset, MCU_width;
+} lhe_input_ptr_info;
+
+
+typedef struct {
+  struct jpeg_entropy_encoder pub; /* public fields */
+
+  savable_state saved;          /* Bit buffer at start of MCU */
+
+  /* These fields are NOT loaded into local working state. */
+  unsigned int restarts_to_go;  /* MCUs left in this restart interval */
+  int next_restart_num;         /* next restart number to write (0-7) */
+
+  /* Pointers to derived tables (these workspaces have image lifespan) */
+  c_derived_tbl *derived_tbls[NUM_HUFF_TBLS];
+
+  /* Pointers to derived tables to be used for each data unit within an MCU */
+  c_derived_tbl *cur_tbls[C_MAX_BLOCKS_IN_MCU];
+
+#ifdef ENTROPY_OPT_SUPPORTED    /* Statistics tables for optimization */
+  long *count_ptrs[NUM_HUFF_TBLS];
+
+  /* Pointers to stats tables to be used for each data unit within an MCU */
+  long *cur_counts[C_MAX_BLOCKS_IN_MCU];
+#endif
+
+  /* Pointers to the proper input difference row for each group of data units
+   * within an MCU.  For each component, there are Vi groups of Hi data units.
+   */
+  JDIFFROW input_ptr[C_MAX_BLOCKS_IN_MCU];
+
+  /* Number of input pointers in use for the current MCU.  This is the sum
+   * of all Vi in the MCU.
+   */
+  int num_input_ptrs;
+
+  /* Information used for positioning the input pointers within the input
+   * difference rows.
+   */
+  lhe_input_ptr_info input_ptr_info[C_MAX_BLOCKS_IN_MCU];
+
+  /* Index of the proper input pointer for each data unit within an MCU */
+  int input_ptr_index[C_MAX_BLOCKS_IN_MCU];
+
+} lhuff_entropy_encoder;
+
+typedef lhuff_entropy_encoder *lhuff_entropy_ptr;
+
+/* Working state while writing an MCU.
+ * This struct contains all the fields that are needed by subroutines.
+ */
+
+typedef struct {
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
+  savable_state cur;            /* Current bit buffer & DC state */
+  j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+} working_state;
+
+
+/* Forward declarations */
+METHODDEF(JDIMENSION) encode_mcus_huff(j_compress_ptr cinfo,
+                                       JDIFFIMAGE diff_buf,
+                                       JDIMENSION MCU_row_num,
+                                       JDIMENSION MCU_col_num,
+                                       JDIMENSION nMCU);
+METHODDEF(void) finish_pass_huff(j_compress_ptr cinfo);
+#ifdef ENTROPY_OPT_SUPPORTED
+METHODDEF(JDIMENSION) encode_mcus_gather(j_compress_ptr cinfo,
+                                         JDIFFIMAGE diff_buf,
+                                         JDIMENSION MCU_row_num,
+                                         JDIMENSION MCU_col_num,
+                                         JDIMENSION nMCU);
+METHODDEF(void) finish_pass_gather(j_compress_ptr cinfo);
+#endif
+
+
+/*
+ * Initialize for a Huffman-compressed scan.
+ * If gather_statistics is TRUE, we do not output anything during the scan,
+ * just count the Huffman symbols used and generate Huffman code tables.
+ */
+
+METHODDEF(void)
+start_pass_lhuff(j_compress_ptr cinfo, boolean gather_statistics)
+{
+  lhuff_entropy_ptr entropy = (lhuff_entropy_ptr)cinfo->entropy;
+  int ci, dctbl, sampn, ptrn, yoffset, xoffset;
+  jpeg_component_info *compptr;
+
+  if (gather_statistics) {
+#ifdef ENTROPY_OPT_SUPPORTED
+    entropy->pub.encode_mcus = encode_mcus_gather;
+    entropy->pub.finish_pass = finish_pass_gather;
+#else
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+  } else {
+    entropy->pub.encode_mcus = encode_mcus_huff;
+    entropy->pub.finish_pass = finish_pass_huff;
+  }
+
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    dctbl = compptr->dc_tbl_no;
+    if (gather_statistics) {
+#ifdef ENTROPY_OPT_SUPPORTED
+      /* Check for invalid table indexes */
+      /* (make_c_derived_tbl does this in the other path) */
+      if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS)
+        ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl);
+      /* Allocate and zero the statistics tables */
+      /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
+      if (entropy->count_ptrs[dctbl] == NULL)
+        entropy->count_ptrs[dctbl] = (long *)
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                      257 * sizeof(long));
+      memset(entropy->count_ptrs[dctbl], 0, 257 * sizeof(long));
+#endif
+    } else {
+      /* Compute derived values for Huffman tables */
+      /* We may do this more than once for a table, but it's not expensive */
+      jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl,
+                              &entropy->derived_tbls[dctbl]);
+    }
+  }
+
+  /* Precalculate encoding info for each sample in an MCU of this scan */
+  for (sampn = 0, ptrn = 0; sampn < cinfo->blocks_in_MCU;) {
+    compptr = cinfo->cur_comp_info[cinfo->MCU_membership[sampn]];
+    ci = compptr->component_index;
+    for (yoffset = 0; yoffset < compptr->MCU_height; yoffset++, ptrn++) {
+      /* Precalculate the setup info for each input pointer */
+      entropy->input_ptr_info[ptrn].ci = ci;
+      entropy->input_ptr_info[ptrn].yoffset = yoffset;
+      entropy->input_ptr_info[ptrn].MCU_width = compptr->MCU_width;
+      for (xoffset = 0; xoffset < compptr->MCU_width; xoffset++, sampn++) {
+        /* Precalculate the input pointer index for each sample */
+        entropy->input_ptr_index[sampn] = ptrn;
+        /* Precalculate which tables to use for each sample */
+        entropy->cur_tbls[sampn] = entropy->derived_tbls[compptr->dc_tbl_no];
+        entropy->cur_counts[sampn] = entropy->count_ptrs[compptr->dc_tbl_no];
+      }
+    }
+  }
+  entropy->num_input_ptrs = ptrn;
+
+  /* Initialize bit buffer to empty */
+  entropy->saved.put_buffer = 0;
+  entropy->saved.put_bits = 0;
+
+  /* Initialize restart stuff */
+  entropy->restarts_to_go = cinfo->restart_interval;
+  entropy->next_restart_num = 0;
+}
+
+
+/* Outputting bytes to the file */
+
+/* Emit a byte, taking 'action' if must suspend. */
+#define emit_byte(state, val, action) { \
+  *(state)->next_output_byte++ = (JOCTET)(val); \
+  if (--(state)->free_in_buffer == 0) \
+    if (!dump_buffer(state)) \
+      { action; } \
+}
+
+
+LOCAL(boolean)
+dump_buffer(working_state *state)
+/* Empty the output buffer; return TRUE if successful, FALSE if must suspend */
+{
+  struct jpeg_destination_mgr *dest = state->cinfo->dest;
+
+  if (!(*dest->empty_output_buffer) (state->cinfo))
+    return FALSE;
+  /* After a successful buffer dump, must reset buffer pointers */
+  state->next_output_byte = dest->next_output_byte;
+  state->free_in_buffer = dest->free_in_buffer;
+  return TRUE;
+}
+
+
+/* Outputting bits to the file */
+
+/* Only the right 24 bits of put_buffer are used; the valid bits are
+ * left-justified in this part.  At most 16 bits can be passed to emit_bits
+ * in one call, and we never retain more than 7 bits in put_buffer
+ * between calls, so 24 bits are sufficient.
+ */
+
+INLINE
+LOCAL(boolean)
+emit_bits(working_state *state, unsigned int code, int size)
+/* Emit some bits; return TRUE if successful, FALSE if must suspend */
+{
+  /* This routine is heavily used, so it's worth coding tightly. */
+  register size_t put_buffer = (size_t)code;
+  register int put_bits = state->cur.put_bits;
+
+  /* if size is 0, caller used an invalid Huffman table entry */
+  if (size == 0)
+    ERREXIT(state->cinfo, JERR_HUFF_MISSING_CODE);
+
+  put_buffer &= (((size_t)1) << size) - 1; /* mask off any extra bits in code */
+
+  put_bits += size;             /* new number of bits in buffer */
+
+  put_buffer <<= 24 - put_bits; /* align incoming bits */
+
+  put_buffer |= state->cur.put_buffer; /* and merge with old buffer contents */
+
+  while (put_bits >= 8) {
+    int c = (int)((put_buffer >> 16) & 0xFF);
+
+    emit_byte(state, c, return FALSE);
+    if (c == 0xFF) {            /* need to stuff a zero byte? */
+      emit_byte(state, 0, return FALSE);
+    }
+    put_buffer <<= 8;
+    put_bits -= 8;
+  }
+
+  state->cur.put_buffer = put_buffer; /* update state variables */
+  state->cur.put_bits = put_bits;
+
+  return TRUE;
+}
+
+
+LOCAL(boolean)
+flush_bits(working_state *state)
+{
+  if (!emit_bits(state, 0x7F, 7)) /* fill any partial byte with ones */
+    return FALSE;
+  state->cur.put_buffer = 0;    /* and reset bit-buffer to empty */
+  state->cur.put_bits = 0;
+  return TRUE;
+}
+
+
+/*
+ * Emit a restart marker & resynchronize predictions.
+ */
+
+LOCAL(boolean)
+emit_restart(working_state *state, int restart_num)
+{
+  if (!flush_bits(state))
+    return FALSE;
+
+  emit_byte(state, 0xFF, return FALSE);
+  emit_byte(state, JPEG_RST0 + restart_num, return FALSE);
+
+  /* The restart counter is not updated until we successfully write the MCU. */
+
+  return TRUE;
+}
+
+
+/*
+ * Encode and output nMCU MCUs' worth of Huffman-compressed differences.
+ */
+
+METHODDEF(JDIMENSION)
+encode_mcus_huff(j_compress_ptr cinfo, JDIFFIMAGE diff_buf,
+                 JDIMENSION MCU_row_num, JDIMENSION MCU_col_num,
+                 JDIMENSION nMCU)
+{
+  lhuff_entropy_ptr entropy = (lhuff_entropy_ptr)cinfo->entropy;
+  working_state state;
+  int sampn, ci, yoffset, MCU_width, ptrn;
+  JDIMENSION mcu_num;
+
+  /* Load up working state */
+  state.next_output_byte = cinfo->dest->next_output_byte;
+  state.free_in_buffer = cinfo->dest->free_in_buffer;
+  state.cur = entropy->saved;
+  state.cinfo = cinfo;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      if (!emit_restart(&state, entropy->next_restart_num))
+        return 0;
+  }
+
+  /* Set input pointer locations based on MCU_col_num */
+  for (ptrn = 0; ptrn < entropy->num_input_ptrs; ptrn++) {
+    ci = entropy->input_ptr_info[ptrn].ci;
+    yoffset = entropy->input_ptr_info[ptrn].yoffset;
+    MCU_width = entropy->input_ptr_info[ptrn].MCU_width;
+    entropy->input_ptr[ptrn] =
+      diff_buf[ci][MCU_row_num + yoffset] + (MCU_col_num * MCU_width);
+  }
+
+  for (mcu_num = 0; mcu_num < nMCU; mcu_num++) {
+
+    /* Inner loop handles the samples in the MCU */
+    for (sampn = 0; sampn < cinfo->blocks_in_MCU; sampn++) {
+      register int temp, temp2;
+      register int nbits;
+      c_derived_tbl *dctbl = entropy->cur_tbls[sampn];
+
+      /* Encode the difference per section H.1.2.2 */
+
+      /* Input the sample difference */
+      temp = *entropy->input_ptr[entropy->input_ptr_index[sampn]]++;
+
+      if (temp & 0x8000) {      /* instead of temp < 0 */
+        temp = (-temp) & 0x7FFF; /* absolute value, mod 2^16 */
+        if (temp == 0)          /* special case: magnitude = 32768 */
+          temp2 = temp = 0x8000;
+        temp2 = ~temp;          /* one's complement of magnitude */
+      } else {
+        temp &= 0x7FFF;         /* abs value mod 2^16 */
+        temp2 = temp;           /* magnitude */
+      }
+
+      /* Find the number of bits needed for the magnitude of the difference */
+      nbits = 0;
+      while (temp) {
+        nbits++;
+        temp >>= 1;
+      }
+      /* Check for out-of-range difference values.
+       */
+      if (nbits > MAX_DIFF_BITS)
+        ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+
+      /* Emit the Huffman-coded symbol for the number of bits */
+      if (!emit_bits(&state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits]))
+        return mcu_num;
+
+      /* Emit that number of bits of the value, if positive, */
+      /* or the complement of its magnitude, if negative. */
+      if (nbits &&              /* emit_bits rejects calls with size 0 */
+          nbits != 16)          /* special case: no bits should be emitted */
+        if (!emit_bits(&state, (unsigned int)temp2, nbits))
+          return mcu_num;
+    }
+
+    /* Completed MCU, so update state */
+    cinfo->dest->next_output_byte = state.next_output_byte;
+    cinfo->dest->free_in_buffer = state.free_in_buffer;
+    entropy->saved = state.cur;
+
+    /* Update restart-interval state too */
+    if (cinfo->restart_interval) {
+      if (entropy->restarts_to_go == 0) {
+        entropy->restarts_to_go = cinfo->restart_interval;
+        entropy->next_restart_num++;
+        entropy->next_restart_num &= 7;
+      }
+      entropy->restarts_to_go--;
+    }
+
+  }
+
+  return nMCU;
+}
+
+
+/*
+ * Finish up at the end of a Huffman-compressed scan.
+ */
+
+METHODDEF(void)
+finish_pass_huff(j_compress_ptr cinfo)
+{
+  lhuff_entropy_ptr entropy = (lhuff_entropy_ptr)cinfo->entropy;
+  working_state state;
+
+  /* Load up working state ... flush_bits needs it */
+  state.next_output_byte = cinfo->dest->next_output_byte;
+  state.free_in_buffer = cinfo->dest->free_in_buffer;
+  state.cur = entropy->saved;
+  state.cinfo = cinfo;
+
+  /* Flush out the last data */
+  if (!flush_bits(&state))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+  /* Update state */
+  cinfo->dest->next_output_byte = state.next_output_byte;
+  cinfo->dest->free_in_buffer = state.free_in_buffer;
+  entropy->saved = state.cur;
+}
+
+
+/*
+ * Huffman coding optimization.
+ *
+ * We first scan the supplied data and count the number of uses of each symbol
+ * that is to be Huffman-coded. (This process MUST agree with the code above.)
+ * Then we build a Huffman coding tree for the observed counts.
+ * Symbols which are not needed at all for the particular image are not
+ * assigned any code, which saves space in the DHT marker as well as in
+ * the compressed data.
+ */
+
+#ifdef ENTROPY_OPT_SUPPORTED
+
+/*
+ * Trial-encode nMCU MCUs' worth of Huffman-compressed differences.
+ * No data is actually output, so no suspension return is possible.
+ */
+
+METHODDEF(JDIMENSION)
+encode_mcus_gather(j_compress_ptr cinfo, JDIFFIMAGE diff_buf,
+                   JDIMENSION MCU_row_num, JDIMENSION MCU_col_num,
+                   JDIMENSION nMCU)
+{
+  lhuff_entropy_ptr entropy = (lhuff_entropy_ptr)cinfo->entropy;
+  int sampn, ci, yoffset, MCU_width, ptrn;
+  JDIMENSION mcu_num;
+
+  /* Take care of restart intervals if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      /* Update restart state */
+      entropy->restarts_to_go = cinfo->restart_interval;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  /* Set input pointer locations based on MCU_col_num */
+  for (ptrn = 0; ptrn < entropy->num_input_ptrs; ptrn++) {
+    ci = entropy->input_ptr_info[ptrn].ci;
+    yoffset = entropy->input_ptr_info[ptrn].yoffset;
+    MCU_width = entropy->input_ptr_info[ptrn].MCU_width;
+    entropy->input_ptr[ptrn] =
+      diff_buf[ci][MCU_row_num + yoffset] + (MCU_col_num * MCU_width);
+  }
+
+  for (mcu_num = 0; mcu_num < nMCU; mcu_num++) {
+
+    /* Inner loop handles the samples in the MCU */
+    for (sampn = 0; sampn < cinfo->blocks_in_MCU; sampn++) {
+      register int temp;
+      register int nbits;
+      long *counts = entropy->cur_counts[sampn];
+
+      /* Encode the difference per section H.1.2.2 */
+
+      /* Input the sample difference */
+      temp = *entropy->input_ptr[entropy->input_ptr_index[sampn]]++;
+
+      if (temp & 0x8000) {      /* instead of temp < 0 */
+        temp = (-temp) & 0x7FFF; /* absolute value, mod 2^16 */
+        if (temp == 0)          /* special case: magnitude = 32768 */
+          temp = 0x8000;
+      } else
+        temp &= 0x7FFF;         /* abs value mod 2^16 */
+
+      /* Find the number of bits needed for the magnitude of the difference */
+      nbits = 0;
+      while (temp) {
+        nbits++;
+        temp >>= 1;
+      }
+      /* Check for out-of-range difference values.
+       */
+      if (nbits > MAX_DIFF_BITS)
+        ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+
+      /* Count the Huffman symbol for the number of bits */
+      counts[nbits]++;
+    }
+  }
+
+  return nMCU;
+}
+
+
+/*
+ * Finish up a statistics-gathering pass and create the new Huffman tables.
+ */
+
+METHODDEF(void)
+finish_pass_gather(j_compress_ptr cinfo)
+{
+  lhuff_entropy_ptr entropy = (lhuff_entropy_ptr)cinfo->entropy;
+  int ci, dctbl;
+  jpeg_component_info *compptr;
+  JHUFF_TBL **htblptr;
+  boolean did_dc[NUM_HUFF_TBLS];
+
+  /* It's important not to apply jpeg_gen_optimal_table more than once
+   * per table, because it clobbers the input frequency counts!
+   */
+  memset(did_dc, 0, sizeof(did_dc));
+
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    dctbl = compptr->dc_tbl_no;
+    if (!did_dc[dctbl]) {
+      htblptr = &cinfo->dc_huff_tbl_ptrs[dctbl];
+      if (*htblptr == NULL)
+        *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
+      jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[dctbl]);
+      did_dc[dctbl] = TRUE;
+    }
+  }
+}
+
+
+#endif /* ENTROPY_OPT_SUPPORTED */
+
+
+/*
+ * Module initialization routine for Huffman entropy encoding.
+ */
+
+GLOBAL(void)
+jinit_lhuff_encoder(j_compress_ptr cinfo)
+{
+  lhuff_entropy_ptr entropy;
+  int i;
+
+  entropy = (lhuff_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(lhuff_entropy_encoder));
+  cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
+  entropy->pub.start_pass = start_pass_lhuff;
+
+  /* Mark tables unallocated */
+  for (i = 0; i < NUM_HUFF_TBLS; i++) {
+    entropy->derived_tbls[i] = NULL;
+#ifdef ENTROPY_OPT_SUPPORTED
+    entropy->count_ptrs[i] = NULL;
+#endif
+  }
+}
+
+#endif /* C_LOSSLESS_SUPPORTED */
diff --git a/3rdparty/libjpeg-turbo/src/jclossls.c b/3rdparty/libjpeg-turbo/src/jclossls.c
new file mode 100644
index 000000000000..e9ba92a7dfea
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jclossls.c
@@ -0,0 +1,319 @@
+/*
+ * jclossls.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1998, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains prediction, sample differencing, and point transform
+ * routines for the lossless JPEG compressor.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jlossls.h"
+
+#ifdef C_LOSSLESS_SUPPORTED
+
+
+/************************** Sample differencing **************************/
+
+/*
+ * In order to avoid a performance penalty for checking which predictor is
+ * being used and which row is being processed for each call of the
+ * undifferencer, and to promote optimization, we have separate differencing
+ * functions for each predictor selection value.
+ *
+ * We are able to avoid duplicating source code by implementing the predictors
+ * and differencers as macros.  Each of the differencing functions is simply a
+ * wrapper around a DIFFERENCE macro with the appropriate PREDICTOR macro
+ * passed as an argument.
+ */
+
+/* Forward declarations */
+LOCAL(void) reset_predictor(j_compress_ptr cinfo, int ci);
+
+
+/* Predictor for the first column of the first row: 2^(P-Pt-1) */
+#define INITIAL_PREDICTORx  (1 << (cinfo->data_precision - cinfo->Al - 1))
+
+/* Predictor for the first column of the remaining rows: Rb */
+#define INITIAL_PREDICTOR2  prev_row[0]
+
+
+/*
+ * 1-Dimensional differencer routine.
+ *
+ * This macro implements the 1-D horizontal predictor (1).  INITIAL_PREDICTOR
+ * is used as the special case predictor for the first column, which must be
+ * either INITIAL_PREDICTOR2 or INITIAL_PREDICTORx.  The remaining samples
+ * use PREDICTOR1.
+ */
+
+#define DIFFERENCE_1D(INITIAL_PREDICTOR) \
+  lossless_comp_ptr losslessc = (lossless_comp_ptr)cinfo->fdct; \
+  boolean restart = FALSE; \
+  int samp, Ra; \
+  \
+  samp = *input_buf++; \
+  *diff_buf++ = samp - INITIAL_PREDICTOR; \
+  \
+  while (--width) { \
+    Ra = samp; \
+    samp = *input_buf++; \
+    *diff_buf++ = samp - PREDICTOR1; \
+  } \
+  \
+  /* Account for restart interval (no-op if not using restarts) */ \
+  if (cinfo->restart_interval) { \
+    if (--(losslessc->restart_rows_to_go[ci]) == 0) { \
+      reset_predictor(cinfo, ci); \
+      restart = TRUE; \
+    } \
+  }
+
+
+/*
+ * 2-Dimensional differencer routine.
+ *
+ * This macro implements the 2-D horizontal predictors (#2-7).  PREDICTOR2 is
+ * used as the special case predictor for the first column.  The remaining
+ * samples use PREDICTOR, which is a function of Ra, Rb, and Rc.
+ *
+ * Because prev_row and output_buf may point to the same storage area (in an
+ * interleaved image with Vi=1, for example), we must take care to buffer Rb/Rc
+ * before writing the current reconstructed sample value into output_buf.
+ */
+
+#define DIFFERENCE_2D(PREDICTOR) \
+  lossless_comp_ptr losslessc = (lossless_comp_ptr)cinfo->fdct; \
+  int samp, Ra, Rb, Rc; \
+  \
+  Rb = *prev_row++; \
+  samp = *input_buf++; \
+  *diff_buf++ = samp - PREDICTOR2; \
+  \
+  while (--width) { \
+    Rc = Rb; \
+    Rb = *prev_row++; \
+    Ra = samp; \
+    samp = *input_buf++; \
+    *diff_buf++ = samp - PREDICTOR; \
+  } \
+  \
+  /* Account for restart interval (no-op if not using restarts) */ \
+  if (cinfo->restart_interval) { \
+    if (--losslessc->restart_rows_to_go[ci] == 0) \
+      reset_predictor(cinfo, ci); \
+  }
+
+
+/*
+ * Differencers for the second and subsequent rows in a scan or restart
+ * interval.  The first sample in the row is differenced using the vertical
+ * predictor (2).  The rest of the samples are differenced using the predictor
+ * specified in the scan header.
+ */
+
+METHODDEF(void)
+jpeg_difference1(j_compress_ptr cinfo, int ci,
+                 _JSAMPROW input_buf, _JSAMPROW prev_row,
+                 JDIFFROW diff_buf, JDIMENSION width)
+{
+  DIFFERENCE_1D(INITIAL_PREDICTOR2);
+  (void)(restart);
+}
+
+METHODDEF(void)
+jpeg_difference2(j_compress_ptr cinfo, int ci,
+                 _JSAMPROW input_buf, _JSAMPROW prev_row,
+                 JDIFFROW diff_buf, JDIMENSION width)
+{
+  DIFFERENCE_2D(PREDICTOR2);
+  (void)(Ra);
+  (void)(Rc);
+}
+
+METHODDEF(void)
+jpeg_difference3(j_compress_ptr cinfo, int ci,
+                 _JSAMPROW input_buf, _JSAMPROW prev_row,
+                 JDIFFROW diff_buf, JDIMENSION width)
+{
+  DIFFERENCE_2D(PREDICTOR3);
+  (void)(Ra);
+}
+
+METHODDEF(void)
+jpeg_difference4(j_compress_ptr cinfo, int ci,
+                 _JSAMPROW input_buf, _JSAMPROW prev_row,
+                 JDIFFROW diff_buf, JDIMENSION width)
+{
+  DIFFERENCE_2D(PREDICTOR4);
+}
+
+METHODDEF(void)
+jpeg_difference5(j_compress_ptr cinfo, int ci,
+                 _JSAMPROW input_buf, _JSAMPROW prev_row,
+                 JDIFFROW diff_buf, JDIMENSION width)
+{
+  DIFFERENCE_2D(PREDICTOR5);
+}
+
+METHODDEF(void)
+jpeg_difference6(j_compress_ptr cinfo, int ci,
+                 _JSAMPROW input_buf, _JSAMPROW prev_row,
+                 JDIFFROW diff_buf, JDIMENSION width)
+{
+  DIFFERENCE_2D(PREDICTOR6);
+}
+
+METHODDEF(void)
+jpeg_difference7(j_compress_ptr cinfo, int ci,
+                 _JSAMPROW input_buf, _JSAMPROW prev_row,
+                 JDIFFROW diff_buf, JDIMENSION width)
+{
+  DIFFERENCE_2D(PREDICTOR7);
+  (void)(Rc);
+}
+
+
+/*
+ * Differencer for the first row in a scan or restart interval.  The first
+ * sample in the row is differenced using the special predictor constant
+ * x = 2 ^ (P-Pt-1).  The rest of the samples are differenced using the
+ * 1-D horizontal predictor (1).
+ */
+
+METHODDEF(void)
+jpeg_difference_first_row(j_compress_ptr cinfo, int ci,
+                          _JSAMPROW input_buf, _JSAMPROW prev_row,
+                          JDIFFROW diff_buf, JDIMENSION width)
+{
+  DIFFERENCE_1D(INITIAL_PREDICTORx);
+
+  /*
+   * Now that we have differenced the first row, we want to use the
+   * differencer that corresponds to the predictor specified in the
+   * scan header.
+   *
+   * Note that we don't do this if we have just reset the predictor
+   * for a new restart interval.
+   */
+  if (!restart) {
+    switch (cinfo->Ss) {
+    case 1:
+      losslessc->predict_difference[ci] = jpeg_difference1;
+      break;
+    case 2:
+      losslessc->predict_difference[ci] = jpeg_difference2;
+      break;
+    case 3:
+      losslessc->predict_difference[ci] = jpeg_difference3;
+      break;
+    case 4:
+      losslessc->predict_difference[ci] = jpeg_difference4;
+      break;
+    case 5:
+      losslessc->predict_difference[ci] = jpeg_difference5;
+      break;
+    case 6:
+      losslessc->predict_difference[ci] = jpeg_difference6;
+      break;
+    case 7:
+      losslessc->predict_difference[ci] = jpeg_difference7;
+      break;
+    }
+  }
+}
+
+/*
+ * Reset predictor at the start of a pass or restart interval.
+ */
+
+LOCAL(void)
+reset_predictor(j_compress_ptr cinfo, int ci)
+{
+  lossless_comp_ptr losslessc = (lossless_comp_ptr)cinfo->fdct;
+
+  /* Initialize restart counter */
+  losslessc->restart_rows_to_go[ci] =
+    cinfo->restart_interval / cinfo->MCUs_per_row;
+
+  /* Set difference function to first row function */
+  losslessc->predict_difference[ci] = jpeg_difference_first_row;
+}
+
+
+/********************** Sample downscaling by 2^Pt ***********************/
+
+METHODDEF(void)
+simple_downscale(j_compress_ptr cinfo,
+                 _JSAMPROW input_buf, _JSAMPROW output_buf, JDIMENSION width)
+{
+  do {
+    *output_buf++ = (_JSAMPLE)RIGHT_SHIFT(*input_buf++, cinfo->Al);
+  } while (--width);
+}
+
+
+METHODDEF(void)
+noscale(j_compress_ptr cinfo,
+        _JSAMPROW input_buf, _JSAMPROW output_buf, JDIMENSION width)
+{
+  memcpy(output_buf, input_buf, width * sizeof(_JSAMPLE));
+}
+
+
+/*
+ * Initialize for a processing pass.
+ */
+
+METHODDEF(void)
+start_pass_lossless(j_compress_ptr cinfo)
+{
+  lossless_comp_ptr losslessc = (lossless_comp_ptr)cinfo->fdct;
+  int ci;
+
+  /* Set scaler function based on Pt */
+  if (cinfo->Al)
+    losslessc->scaler_scale = simple_downscale;
+  else
+    losslessc->scaler_scale = noscale;
+
+  /* Check that the restart interval is an integer multiple of the number
+   * of MCUs in an MCU row.
+   */
+  if (cinfo->restart_interval % cinfo->MCUs_per_row != 0)
+    ERREXIT2(cinfo, JERR_BAD_RESTART,
+             cinfo->restart_interval, cinfo->MCUs_per_row);
+
+  /* Set predictors for start of pass */
+  for (ci = 0; ci < cinfo->num_components; ci++)
+    reset_predictor(cinfo, ci);
+}
+
+
+/*
+ * Initialize the lossless compressor.
+ */
+
+GLOBAL(void)
+_jinit_lossless_compressor(j_compress_ptr cinfo)
+{
+  lossless_comp_ptr losslessc;
+
+  /* Create subobject in permanent pool */
+  losslessc = (lossless_comp_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                sizeof(jpeg_lossless_compressor));
+  cinfo->fdct = (struct jpeg_forward_dct *)losslessc;
+  losslessc->pub.start_pass = start_pass_lossless;
+}
+
+#endif /* C_LOSSLESS_SUPPORTED */
diff --git a/3rdparty/libjpeg-turbo/src/jcmainct.c b/3rdparty/libjpeg-turbo/src/jcmainct.c
index 3f23028c467e..fe8fc0b1acde 100644
--- a/3rdparty/libjpeg-turbo/src/jcmainct.c
+++ b/3rdparty/libjpeg-turbo/src/jcmainct.c
@@ -3,8 +3,10 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -16,8 +18,11 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsamplecomp.h"
 
 
+#if BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED)
+
 /* Private buffer controller object */
 
 typedef struct {
@@ -32,7 +37,7 @@ typedef struct {
    * (we allocate one for each component).  In the full-image case, this
    * points to the currently accessible strips of the virtual arrays.
    */
-  JSAMPARRAY buffer[MAX_COMPONENTS];
+  _JSAMPARRAY buffer[MAX_COMPONENTS];
 } my_main_controller;
 
 typedef my_main_controller *my_main_ptr;
@@ -40,7 +45,7 @@ typedef my_main_controller *my_main_ptr;
 
 /* Forward declarations */
 METHODDEF(void) process_data_simple_main(j_compress_ptr cinfo,
-                                         JSAMPARRAY input_buf,
+                                         _JSAMPARRAY input_buf,
                                          JDIMENSION *in_row_ctr,
                                          JDIMENSION in_rows_avail);
 
@@ -65,7 +70,7 @@ start_pass_main(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
   main_ptr->rowgroup_ctr = 0;
   main_ptr->suspended = FALSE;
   main_ptr->pass_mode = pass_mode;      /* save mode for use by process_data */
-  main_ptr->pub.process_data = process_data_simple_main;
+  main_ptr->pub._process_data = process_data_simple_main;
 }
 
 
@@ -76,28 +81,28 @@ start_pass_main(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
  */
 
 METHODDEF(void)
-process_data_simple_main(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+process_data_simple_main(j_compress_ptr cinfo, _JSAMPARRAY input_buf,
                          JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail)
 {
   my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+  JDIMENSION data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
 
   while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
     /* Read input data if we haven't filled the main buffer yet */
-    if (main_ptr->rowgroup_ctr < DCTSIZE)
-      (*cinfo->prep->pre_process_data) (cinfo, input_buf, in_row_ctr,
-                                        in_rows_avail, main_ptr->buffer,
-                                        &main_ptr->rowgroup_ctr,
-                                        (JDIMENSION)DCTSIZE);
+    if (main_ptr->rowgroup_ctr < data_unit)
+      (*cinfo->prep->_pre_process_data) (cinfo, input_buf, in_row_ctr,
+                                         in_rows_avail, main_ptr->buffer,
+                                         &main_ptr->rowgroup_ctr, data_unit);
 
     /* If we don't have a full iMCU row buffered, return to application for
      * more data.  Note that preprocessor will always pad to fill the iMCU row
      * at the bottom of the image.
      */
-    if (main_ptr->rowgroup_ctr != DCTSIZE)
+    if (main_ptr->rowgroup_ctr != data_unit)
       return;
 
     /* Send the completed row to the compressor */
-    if (!(*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
+    if (!(*cinfo->coef->_compress_data) (cinfo, main_ptr->buffer)) {
       /* If compressor did not consume the whole row, then we must need to
        * suspend processing and return to the application.  In this situation
        * we pretend we didn't yet consume the last input row; otherwise, if
@@ -128,11 +133,15 @@ process_data_simple_main(j_compress_ptr cinfo, JSAMPARRAY input_buf,
  */
 
 GLOBAL(void)
-jinit_c_main_controller(j_compress_ptr cinfo, boolean need_full_buffer)
+_jinit_c_main_controller(j_compress_ptr cinfo, boolean need_full_buffer)
 {
   my_main_ptr main_ptr;
   int ci;
   jpeg_component_info *compptr;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
+
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
 
   main_ptr = (my_main_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
@@ -153,10 +162,12 @@ jinit_c_main_controller(j_compress_ptr cinfo, boolean need_full_buffer)
     /* Allocate a strip buffer for each component */
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
          ci++, compptr++) {
-      main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
+      main_ptr->buffer[ci] = (_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
         ((j_common_ptr)cinfo, JPOOL_IMAGE,
-         compptr->width_in_blocks * DCTSIZE,
-         (JDIMENSION)(compptr->v_samp_factor * DCTSIZE));
+         compptr->width_in_blocks * data_unit,
+         (JDIMENSION)(compptr->v_samp_factor * data_unit));
     }
   }
 }
+
+#endif /* BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED) */
diff --git a/3rdparty/libjpeg-turbo/src/jcmarker.c b/3rdparty/libjpeg-turbo/src/jcmarker.c
index 801fbab4ef01..a064d4dd9e78 100644
--- a/3rdparty/libjpeg-turbo/src/jcmarker.c
+++ b/3rdparty/libjpeg-turbo/src/jcmarker.c
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2003-2010 by Guido Vollbeding.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, D. R. Commander.
+ * Copyright (C) 2010, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -15,7 +17,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jpegcomp.h"
+#include "jpegapicomp.h"
 
 
 typedef enum {                  /* JPEG marker codes */
@@ -497,25 +499,26 @@ write_file_header(j_compress_ptr cinfo)
 METHODDEF(void)
 write_frame_header(j_compress_ptr cinfo)
 {
-  int ci, prec;
+  int ci, prec = 0;
   boolean is_baseline;
   jpeg_component_info *compptr;
 
-  /* Emit DQT for each quantization table.
-   * Note that emit_dqt() suppresses any duplicate tables.
-   */
-  prec = 0;
-  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-       ci++, compptr++) {
-    prec += emit_dqt(cinfo, compptr->quant_tbl_no);
+  if (!cinfo->master->lossless) {
+    /* Emit DQT for each quantization table.
+     * Note that emit_dqt() suppresses any duplicate tables.
+     */
+    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+         ci++, compptr++) {
+      prec += emit_dqt(cinfo, compptr->quant_tbl_no);
+    }
+    /* now prec is nonzero iff there are any 16-bit quant tables. */
   }
-  /* now prec is nonzero iff there are any 16-bit quant tables. */
 
   /* Check for a non-baseline specification.
    * Note we assume that Huffman table numbers won't be changed later.
    */
   if (cinfo->arith_code || cinfo->progressive_mode ||
-      cinfo->data_precision != 8) {
+      cinfo->master->lossless || cinfo->data_precision != 8) {
     is_baseline = FALSE;
   } else {
     is_baseline = TRUE;
@@ -540,6 +543,8 @@ write_frame_header(j_compress_ptr cinfo)
   } else {
     if (cinfo->progressive_mode)
       emit_sof(cinfo, M_SOF2);  /* SOF code for progressive Huffman */
+    else if (cinfo->master->lossless)
+      emit_sof(cinfo, M_SOF3);  /* SOF code for lossless Huffman */
     else if (is_baseline)
       emit_sof(cinfo, M_SOF0);  /* SOF code for baseline implementation */
     else
@@ -574,10 +579,11 @@ write_scan_header(j_compress_ptr cinfo)
     for (i = 0; i < cinfo->comps_in_scan; i++) {
       compptr = cinfo->cur_comp_info[i];
       /* DC needs no table for refinement scan */
-      if (cinfo->Ss == 0 && cinfo->Ah == 0)
+      if ((cinfo->Ss == 0 && cinfo->Ah == 0) || cinfo->master->lossless)
         emit_dht(cinfo, compptr->dc_tbl_no, FALSE);
-      /* AC needs no table when not present */
-      if (cinfo->Se)
+      /* AC needs no table when not present, and lossless mode uses only DC
+         tables. */
+      if (cinfo->Se && !cinfo->master->lossless)
         emit_dht(cinfo, compptr->ac_tbl_no, TRUE);
     }
   }
diff --git a/3rdparty/libjpeg-turbo/src/jcmaster.c b/3rdparty/libjpeg-turbo/src/jcmaster.c
index c2b260003181..161019763d4b 100644
--- a/3rdparty/libjpeg-turbo/src/jcmaster.c
+++ b/3rdparty/libjpeg-turbo/src/jcmaster.c
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2003-2010 by Guido Vollbeding.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2010, 2016, 2018, 2022-2024, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -18,40 +20,8 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jpegcomp.h"
-#include "jconfigint.h"
-
-
-/* Private state */
-
-typedef enum {
-  main_pass,                    /* input data, also do first output step */
-  huff_opt_pass,                /* Huffman code optimization pass */
-  output_pass                   /* data output pass */
-} c_pass_type;
-
-typedef struct {
-  struct jpeg_comp_master pub;  /* public fields */
-
-  c_pass_type pass_type;        /* the type of the current pass */
-
-  int pass_number;              /* # of passes completed */
-  int total_passes;             /* total # of passes needed */
-
-  int scan_number;              /* current index in scan_info[] */
-
-  /*
-   * This is here so we can add libjpeg-turbo version/build information to the
-   * global string table without introducing a new global symbol.  Adding this
-   * information to the global string table allows one to examine a binary
-   * object and determine which version of libjpeg-turbo it was built from or
-   * linked against.
-   */
-  const char *jpeg_version;
-
-} my_comp_master;
-
-typedef my_comp_master *my_master_ptr;
+#include "jpegapicomp.h"
+#include "jcmaster.h"
 
 
 /*
@@ -69,15 +39,124 @@ GLOBAL(void)
 jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo)
 /* Do computations that are needed before master selection phase */
 {
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
+
   /* Hardwire it to "no scaling" */
   cinfo->jpeg_width = cinfo->image_width;
   cinfo->jpeg_height = cinfo->image_height;
-  cinfo->min_DCT_h_scaled_size = DCTSIZE;
-  cinfo->min_DCT_v_scaled_size = DCTSIZE;
+  cinfo->min_DCT_h_scaled_size = data_unit;
+  cinfo->min_DCT_v_scaled_size = data_unit;
 }
 #endif
 
 
+LOCAL(boolean)
+using_std_huff_tables(j_compress_ptr cinfo)
+{
+  int i;
+
+  static const UINT8 bits_dc_luminance[17] = {
+    /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0
+  };
+  static const UINT8 val_dc_luminance[] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+  };
+
+  static const UINT8 bits_dc_chrominance[17] = {
+    /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
+  };
+  static const UINT8 val_dc_chrominance[] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+  };
+
+  static const UINT8 bits_ac_luminance[17] = {
+    /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d
+  };
+  static const UINT8 val_ac_luminance[] = {
+    0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
+    0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+    0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+    0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+    0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
+    0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+    0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+    0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+    0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+    0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+    0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+    0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+    0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+    0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+    0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+    0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+    0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
+    0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+    0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
+    0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+    0xf9, 0xfa
+  };
+
+  static const UINT8 bits_ac_chrominance[17] = {
+    /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77
+  };
+  static const UINT8 val_ac_chrominance[] = {
+    0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
+    0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+    0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+    0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+    0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
+    0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+    0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
+    0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+    0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+    0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+    0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+    0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+    0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
+    0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+    0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+    0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+    0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
+    0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+    0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+    0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+    0xf9, 0xfa
+  };
+
+  if (cinfo->dc_huff_tbl_ptrs[0] == NULL ||
+      cinfo->ac_huff_tbl_ptrs[0] == NULL ||
+      cinfo->dc_huff_tbl_ptrs[1] == NULL ||
+      cinfo->ac_huff_tbl_ptrs[1] == NULL)
+    return FALSE;
+
+  for (i = 2; i < NUM_HUFF_TBLS; i++) {
+    if (cinfo->dc_huff_tbl_ptrs[i] != NULL ||
+        cinfo->ac_huff_tbl_ptrs[i] != NULL)
+      return FALSE;
+  }
+
+  if (memcmp(cinfo->dc_huff_tbl_ptrs[0]->bits, bits_dc_luminance,
+             sizeof(bits_dc_luminance)) ||
+      memcmp(cinfo->dc_huff_tbl_ptrs[0]->huffval, val_dc_luminance,
+             sizeof(val_dc_luminance)) ||
+      memcmp(cinfo->ac_huff_tbl_ptrs[0]->bits, bits_ac_luminance,
+             sizeof(bits_ac_luminance)) ||
+      memcmp(cinfo->ac_huff_tbl_ptrs[0]->huffval, val_ac_luminance,
+             sizeof(val_ac_luminance)) ||
+      memcmp(cinfo->dc_huff_tbl_ptrs[1]->bits, bits_dc_chrominance,
+             sizeof(bits_dc_chrominance)) ||
+      memcmp(cinfo->dc_huff_tbl_ptrs[1]->huffval, val_dc_chrominance,
+             sizeof(val_dc_chrominance)) ||
+      memcmp(cinfo->ac_huff_tbl_ptrs[1]->bits, bits_ac_chrominance,
+             sizeof(bits_ac_chrominance)) ||
+      memcmp(cinfo->ac_huff_tbl_ptrs[1]->huffval, val_ac_chrominance,
+             sizeof(val_ac_chrominance)))
+    return FALSE;
+
+  return TRUE;
+}
+
+
 LOCAL(void)
 initial_setup(j_compress_ptr cinfo, boolean transcode_only)
 /* Do computations that are needed before master selection phase */
@@ -86,6 +165,7 @@ initial_setup(j_compress_ptr cinfo, boolean transcode_only)
   jpeg_component_info *compptr;
   long samplesperrow;
   JDIMENSION jd_samplesperrow;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
 
 #if JPEG_LIB_VERSION >= 70
 #if JPEG_LIB_VERSION >= 80
@@ -110,8 +190,12 @@ initial_setup(j_compress_ptr cinfo, boolean transcode_only)
   if ((long)jd_samplesperrow != samplesperrow)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
 
-  /* For now, precision must match compiled-in value... */
-  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+#ifdef C_LOSSLESS_SUPPORTED
+  if (cinfo->data_precision != 8 && cinfo->data_precision != 12 &&
+      cinfo->data_precision != 16)
+#else
+  if (cinfo->data_precision != 8 && cinfo->data_precision != 12)
+#endif
     ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
 
   /* Check that number of components won't exceed internal array sizes */
@@ -142,17 +226,17 @@ initial_setup(j_compress_ptr cinfo, boolean transcode_only)
     compptr->component_index = ci;
     /* For compression, we never do DCT scaling. */
 #if JPEG_LIB_VERSION >= 70
-    compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = DCTSIZE;
+    compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = data_unit;
 #else
-    compptr->DCT_scaled_size = DCTSIZE;
+    compptr->DCT_scaled_size = data_unit;
 #endif
-    /* Size in DCT blocks */
+    /* Size in data units */
     compptr->width_in_blocks = (JDIMENSION)
       jdiv_round_up((long)cinfo->_jpeg_width * (long)compptr->h_samp_factor,
-                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
+                    (long)(cinfo->max_h_samp_factor * data_unit));
     compptr->height_in_blocks = (JDIMENSION)
       jdiv_round_up((long)cinfo->_jpeg_height * (long)compptr->v_samp_factor,
-                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
+                    (long)(cinfo->max_v_samp_factor * data_unit));
     /* Size in samples */
     compptr->downsampled_width = (JDIMENSION)
       jdiv_round_up((long)cinfo->_jpeg_width * (long)compptr->h_samp_factor,
@@ -165,15 +249,19 @@ initial_setup(j_compress_ptr cinfo, boolean transcode_only)
   }
 
   /* Compute number of fully interleaved MCU rows (number of times that
-   * main controller will call coefficient controller).
+   * main controller will call coefficient or difference controller).
    */
   cinfo->total_iMCU_rows = (JDIMENSION)
     jdiv_round_up((long)cinfo->_jpeg_height,
-                  (long)(cinfo->max_v_samp_factor * DCTSIZE));
+                  (long)(cinfo->max_v_samp_factor * data_unit));
 }
 
 
-#ifdef C_MULTISCAN_FILES_SUPPORTED
+#if defined(C_MULTISCAN_FILES_SUPPORTED) || defined(C_LOSSLESS_SUPPORTED)
+#define NEED_SCAN_SCRIPT
+#endif
+
+#ifdef NEED_SCAN_SCRIPT
 
 LOCAL(void)
 validate_script(j_compress_ptr cinfo)
@@ -194,13 +282,29 @@ validate_script(j_compress_ptr cinfo)
   if (cinfo->num_scans <= 0)
     ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, 0);
 
+#ifndef C_MULTISCAN_FILES_SUPPORTED
+  if (cinfo->num_scans > 1)
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+
+  scanptr = cinfo->scan_info;
+  if (scanptr->Ss != 0 && scanptr->Se == 0) {
+#ifdef C_LOSSLESS_SUPPORTED
+    cinfo->master->lossless = TRUE;
+    cinfo->progressive_mode = FALSE;
+    for (ci = 0; ci < cinfo->num_components; ci++)
+      component_sent[ci] = FALSE;
+#else
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+  }
   /* For sequential JPEG, all scans must have Ss=0, Se=DCTSIZE2-1;
    * for progressive JPEG, no scan can have this.
    */
-  scanptr = cinfo->scan_info;
-  if (scanptr->Ss != 0 || scanptr->Se != DCTSIZE2 - 1) {
+  else if (scanptr->Ss != 0 || scanptr->Se != DCTSIZE2 - 1) {
 #ifdef C_PROGRESSIVE_SUPPORTED
     cinfo->progressive_mode = TRUE;
+    cinfo->master->lossless = FALSE;
     last_bitpos_ptr = &last_bitpos[0][0];
     for (ci = 0; ci < cinfo->num_components; ci++)
       for (coefi = 0; coefi < DCTSIZE2; coefi++)
@@ -209,7 +313,7 @@ validate_script(j_compress_ptr cinfo)
     ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
   } else {
-    cinfo->progressive_mode = FALSE;
+    cinfo->progressive_mode = cinfo->master->lossless = FALSE;
     for (ci = 0; ci < cinfo->num_components; ci++)
       component_sent[ci] = FALSE;
   }
@@ -241,13 +345,10 @@ validate_script(j_compress_ptr cinfo)
        * out-of-range reconstructed DC values during the first DC scan,
        * which might cause problems for some decoders.
        */
-#if BITS_IN_JSAMPLE == 8
-#define MAX_AH_AL  10
-#else
-#define MAX_AH_AL  13
-#endif
+      int max_Ah_Al = cinfo->data_precision == 12 ? 13 : 10;
+
       if (Ss < 0 || Ss >= DCTSIZE2 || Se < Ss || Se >= DCTSIZE2 ||
-          Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL)
+          Ah < 0 || Ah > max_Ah_Al || Al < 0 || Al > max_Ah_Al)
         ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       if (Ss == 0) {
         if (Se != 0)            /* DC and AC together not OK */
@@ -275,9 +376,25 @@ validate_script(j_compress_ptr cinfo)
       }
 #endif
     } else {
-      /* For sequential JPEG, all progression parameters must be these: */
-      if (Ss != 0 || Se != DCTSIZE2 - 1 || Ah != 0 || Al != 0)
-        ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+#ifdef C_LOSSLESS_SUPPORTED
+      if (cinfo->master->lossless) {
+        /* The JPEG spec simply gives the range 0..15 for Al (Pt), but that
+         * seems wrong: the upper bound ought to depend on data precision.
+         * Perhaps they really meant 0..N-1 for N-bit precision, which is what
+         * we allow here.  Values greater than or equal to the data precision
+         * will result in a blank image.
+         */
+        if (Ss < 1 || Ss > 7 ||         /* predictor selection value */
+            Se != 0 || Ah != 0 ||
+            Al < 0 || Al >= cinfo->data_precision) /* point transform */
+          ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+      } else
+#endif
+      {
+        /* For sequential JPEG, all progression parameters must be these: */
+        if (Ss != 0 || Se != DCTSIZE2 - 1 || Ah != 0 || Al != 0)
+          ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
+      }
       /* Make sure components are not sent twice */
       for (ci = 0; ci < ncomps; ci++) {
         thisi = scanptr->component_index[ci];
@@ -309,7 +426,7 @@ validate_script(j_compress_ptr cinfo)
   }
 }
 
-#endif /* C_MULTISCAN_FILES_SUPPORTED */
+#endif /* NEED_SCAN_SCRIPT */
 
 
 LOCAL(void)
@@ -318,7 +435,7 @@ select_scan_parameters(j_compress_ptr cinfo)
 {
   int ci;
 
-#ifdef C_MULTISCAN_FILES_SUPPORTED
+#ifdef NEED_SCAN_SCRIPT
   if (cinfo->scan_info != NULL) {
     /* Prepare for current scan --- the script is already validated */
     my_master_ptr master = (my_master_ptr)cinfo->master;
@@ -344,10 +461,12 @@ select_scan_parameters(j_compress_ptr cinfo)
     for (ci = 0; ci < cinfo->num_components; ci++) {
       cinfo->cur_comp_info[ci] = &cinfo->comp_info[ci];
     }
-    cinfo->Ss = 0;
-    cinfo->Se = DCTSIZE2 - 1;
-    cinfo->Ah = 0;
-    cinfo->Al = 0;
+    if (!cinfo->master->lossless) {
+      cinfo->Ss = 0;
+      cinfo->Se = DCTSIZE2 - 1;
+      cinfo->Ah = 0;
+      cinfo->Al = 0;
+    }
   }
 }
 
@@ -359,6 +478,7 @@ per_scan_setup(j_compress_ptr cinfo)
 {
   int ci, mcublks, tmp;
   jpeg_component_info *compptr;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
 
   if (cinfo->comps_in_scan == 1) {
 
@@ -373,7 +493,7 @@ per_scan_setup(j_compress_ptr cinfo)
     compptr->MCU_width = 1;
     compptr->MCU_height = 1;
     compptr->MCU_blocks = 1;
-    compptr->MCU_sample_width = DCTSIZE;
+    compptr->MCU_sample_width = data_unit;
     compptr->last_col_width = 1;
     /* For noninterleaved scans, it is convenient to define last_row_height
      * as the number of block rows present in the last iMCU row.
@@ -396,10 +516,10 @@ per_scan_setup(j_compress_ptr cinfo)
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
       jdiv_round_up((long)cinfo->_jpeg_width,
-                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
+                    (long)(cinfo->max_h_samp_factor * data_unit));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
       jdiv_round_up((long)cinfo->_jpeg_height,
-                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
+                    (long)(cinfo->max_v_samp_factor * data_unit));
 
     cinfo->blocks_in_MCU = 0;
 
@@ -409,7 +529,7 @@ per_scan_setup(j_compress_ptr cinfo)
       compptr->MCU_width = compptr->h_samp_factor;
       compptr->MCU_height = compptr->v_samp_factor;
       compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
-      compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE;
+      compptr->MCU_sample_width = compptr->MCU_width * data_unit;
       /* Figure number of non-dummy blocks in last MCU column & row */
       tmp = (int)(compptr->width_in_blocks % compptr->MCU_width);
       if (tmp == 0) tmp = compptr->MCU_width;
@@ -481,7 +601,8 @@ prepare_for_pass(j_compress_ptr cinfo)
     /* Do Huffman optimization for a scan after the first one. */
     select_scan_parameters(cinfo);
     per_scan_setup(cinfo);
-    if (cinfo->Ss != 0 || cinfo->Ah == 0 || cinfo->arith_code) {
+    if (cinfo->Ss != 0 || cinfo->Ah == 0 || cinfo->arith_code ||
+        cinfo->master->lossless) {
       (*cinfo->entropy->start_pass) (cinfo, TRUE);
       (*cinfo->coef->start_pass) (cinfo, JBUF_CRANK_DEST);
       master->pub.call_pass_startup = FALSE;
@@ -590,22 +711,17 @@ finish_pass_master(j_compress_ptr cinfo)
 GLOBAL(void)
 jinit_c_master_control(j_compress_ptr cinfo, boolean transcode_only)
 {
-  my_master_ptr master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
+  boolean empty_huff_tables = TRUE;
+  int i;
 
-  master = (my_master_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                sizeof(my_comp_master));
-  cinfo->master = (struct jpeg_comp_master *)master;
   master->pub.prepare_for_pass = prepare_for_pass;
   master->pub.pass_startup = pass_startup;
   master->pub.finish_pass = finish_pass_master;
   master->pub.is_last_pass = FALSE;
 
-  /* Validate parameters, determine derived values */
-  initial_setup(cinfo, transcode_only);
-
   if (cinfo->scan_info != NULL) {
-#ifdef C_MULTISCAN_FILES_SUPPORTED
+#ifdef NEED_SCAN_SCRIPT
     validate_script(cinfo);
 #else
     ERREXIT(cinfo, JERR_NOT_COMPILED);
@@ -615,8 +731,42 @@ jinit_c_master_control(j_compress_ptr cinfo, boolean transcode_only)
     cinfo->num_scans = 1;
   }
 
-  if (cinfo->progressive_mode && !cinfo->arith_code)  /*  TEMPORARY HACK ??? */
-    cinfo->optimize_coding = TRUE; /* assume default tables no good for progressive mode */
+  /* Disable smoothing and subsampling in lossless mode, since those are lossy
+   * algorithms.  Set the JPEG colorspace to the input colorspace.  Disable raw
+   * (downsampled) data input, because it isn't particularly useful without
+   * subsampling and has not been tested in lossless mode.
+   */
+  if (cinfo->master->lossless) {
+    int ci;
+    jpeg_component_info *compptr;
+
+    cinfo->raw_data_in = FALSE;
+    cinfo->smoothing_factor = 0;
+    jpeg_default_colorspace(cinfo);
+    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+         ci++, compptr++)
+      compptr->h_samp_factor = compptr->v_samp_factor = 1;
+  }
+
+  /* Validate parameters, determine derived values */
+  initial_setup(cinfo, transcode_only);
+
+  if (cinfo->master->lossless ||        /*  TEMPORARY HACK ??? */
+      (cinfo->progressive_mode && !cinfo->arith_code))
+    cinfo->optimize_coding = TRUE; /* assume default tables no good for
+                                      progressive mode or lossless mode */
+  for (i = 0; i < NUM_HUFF_TBLS; i++) {
+    if (cinfo->dc_huff_tbl_ptrs[i] != NULL ||
+        cinfo->ac_huff_tbl_ptrs[i] != NULL) {
+      empty_huff_tables = FALSE;
+      break;
+    }
+  }
+  if (cinfo->data_precision == 12 && !cinfo->arith_code &&
+      !cinfo->optimize_coding &&
+      (empty_huff_tables || using_std_huff_tables(cinfo)))
+    cinfo->optimize_coding = TRUE; /* assume default tables no good for 12-bit
+                                      data precision */
 
   /* Initialize my private state */
   if (transcode_only) {
diff --git a/3rdparty/libjpeg-turbo/src/jcmaster.h b/3rdparty/libjpeg-turbo/src/jcmaster.h
new file mode 100644
index 000000000000..3b13289b6915
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jcmaster.h
@@ -0,0 +1,43 @@
+/*
+ * jcmaster.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1995, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2016, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains master control structure for the JPEG compressor.
+ */
+
+/* Private state */
+
+typedef enum {
+  main_pass,                    /* input data, also do first output step */
+  huff_opt_pass,                /* Huffman code optimization pass */
+  output_pass                   /* data output pass */
+} c_pass_type;
+
+typedef struct {
+  struct jpeg_comp_master pub;  /* public fields */
+
+  c_pass_type pass_type;        /* the type of the current pass */
+
+  int pass_number;              /* # of passes completed */
+  int total_passes;             /* total # of passes needed */
+
+  int scan_number;              /* current index in scan_info[] */
+
+  /*
+   * This is here so we can add libjpeg-turbo version/build information to the
+   * global string table without introducing a new global symbol.  Adding this
+   * information to the global string table allows one to examine a binary
+   * object and determine which version of libjpeg-turbo it was built from or
+   * linked against.
+   */
+  const char *jpeg_version;
+
+} my_comp_master;
+
+typedef my_comp_master *my_master_ptr;
diff --git a/3rdparty/libjpeg-turbo/src/jcparam.c b/3rdparty/libjpeg-turbo/src/jcparam.c
index 5bc7174dcb54..d1dee4da3df8 100644
--- a/3rdparty/libjpeg-turbo/src/jcparam.c
+++ b/3rdparty/libjpeg-turbo/src/jcparam.c
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2003-2008 by Guido Vollbeding.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2018, D. R. Commander.
+ * Copyright (C) 2009-2011, 2018, 2023, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -202,7 +204,6 @@ jpeg_set_defaults(j_compress_ptr cinfo)
   cinfo->scale_num = 1;         /* 1:1 scaling */
   cinfo->scale_denom = 1;
 #endif
-  cinfo->data_precision = BITS_IN_JSAMPLE;
   /* Set up two quantization tables using default quality of 75 */
   jpeg_set_quality(cinfo, 75, TRUE);
   /* Set up two Huffman tables */
@@ -232,7 +233,7 @@ jpeg_set_defaults(j_compress_ptr cinfo)
    * tables will be computed.  This test can be removed if default tables
    * are supplied that are valid for the desired precision.
    */
-  if (cinfo->data_precision > 8)
+  if (cinfo->data_precision == 12 && !cinfo->arith_code)
     cinfo->optimize_coding = TRUE;
 
   /* By default, use the simpler non-cosited sampling alignment */
@@ -296,7 +297,10 @@ jpeg_default_colorspace(j_compress_ptr cinfo)
   case JCS_EXT_BGRA:
   case JCS_EXT_ABGR:
   case JCS_EXT_ARGB:
-    jpeg_set_colorspace(cinfo, JCS_YCbCr);
+    if (cinfo->master->lossless)
+      jpeg_set_colorspace(cinfo, JCS_RGB);
+    else
+      jpeg_set_colorspace(cinfo, JCS_YCbCr);
     break;
   case JCS_YCbCr:
     jpeg_set_colorspace(cinfo, JCS_YCbCr);
@@ -475,6 +479,11 @@ jpeg_simple_progression(j_compress_ptr cinfo)
   if (cinfo->global_state != CSTATE_START)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
 
+  if (cinfo->master->lossless) {
+    cinfo->master->lossless = FALSE;
+    jpeg_default_colorspace(cinfo);
+  }
+
   /* Figure space needed for script.  Calculation must match code below! */
   if (ncomps == 3 && cinfo->jpeg_color_space == JCS_YCbCr) {
     /* Custom script for YCbCr color images. */
@@ -539,3 +548,38 @@ jpeg_simple_progression(j_compress_ptr cinfo)
 }
 
 #endif /* C_PROGRESSIVE_SUPPORTED */
+
+
+#ifdef C_LOSSLESS_SUPPORTED
+
+/*
+ * Enable lossless mode.
+ */
+
+GLOBAL(void)
+jpeg_enable_lossless(j_compress_ptr cinfo, int predictor_selection_value,
+                     int point_transform)
+{
+  /* Safety check to ensure start_compress not called yet. */
+  if (cinfo->global_state != CSTATE_START)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  cinfo->master->lossless = TRUE;
+  cinfo->Ss = predictor_selection_value;
+  cinfo->Se = 0;
+  cinfo->Ah = 0;
+  cinfo->Al = point_transform;
+
+  /* The JPEG spec simply gives the range 0..15 for Al (Pt), but that seems
+   * wrong: the upper bound ought to depend on data precision.  Perhaps they
+   * really meant 0..N-1 for N-bit precision, which is what we allow here.
+   * Values greater than or equal to the data precision will result in a blank
+   * image.
+   */
+  if (cinfo->Ss < 1 || cinfo->Ss > 7 ||
+      cinfo->Al < 0 || cinfo->Al >= cinfo->data_precision)
+    ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+             cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+}
+
+#endif /* C_LOSSLESS_SUPPORTED */
diff --git a/3rdparty/libjpeg-turbo/src/jcphuff.c b/3rdparty/libjpeg-turbo/src/jcphuff.c
index 872e570bff0b..484e2d857f05 100644
--- a/3rdparty/libjpeg-turbo/src/jcphuff.c
+++ b/3rdparty/libjpeg-turbo/src/jcphuff.c
@@ -3,9 +3,11 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2015, 2018, 2021-2022, D. R. Commander.
- * Copyright (C) 2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2011, 2015, 2018, 2021-2022, 2024, D. R. Commander.
+ * Copyright (C) 2016, 2018, 2022, Matthieu Darbois.
  * Copyright (C) 2020, Arm Limited.
  * Copyright (C) 2021, Alex Richardson.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -21,8 +23,11 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#ifdef WITH_SIMD
 #include "jsimd.h"
-#include "jconfigint.h"
+#else
+#include "jchuff.h"             /* Declarations shared with jc*huff.c */
+#endif
 #include <limits.h>
 
 #ifdef HAVE_INTRIN_H
@@ -39,40 +44,7 @@
 
 #ifdef C_PROGRESSIVE_SUPPORTED
 
-/*
- * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
- * used for bit counting rather than the lookup table.  This will reduce the
- * memory footprint by 64k, which is important for some mobile applications
- * that create many isolated instances of libjpeg-turbo (web browsers, for
- * instance.)  This may improve performance on some mobile platforms as well.
- * This feature is enabled by default only on Arm processors, because some x86
- * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
- * shown to have a significant performance impact even on the x86 chips that
- * have a fast implementation of it.  When building for Armv6, you can
- * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
- * flags (this defines __thumb__).
- */
-
-/* NOTE: Both GCC and Clang define __GNUC__ */
-#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
-    defined(_M_ARM) || defined(_M_ARM64)
-#if !defined(__thumb__) || defined(__thumb2__)
-#define USE_CLZ_INTRINSIC
-#endif
-#endif
-
-#ifdef USE_CLZ_INTRINSIC
-#if defined(_MSC_VER) && !defined(__clang__)
-#define JPEG_NBITS_NONZERO(x)  (32 - _CountLeadingZeros(x))
-#else
-#define JPEG_NBITS_NONZERO(x)  (32 - __builtin_clz(x))
-#endif
-#define JPEG_NBITS(x)          (x ? JPEG_NBITS_NONZERO(x) : 0)
-#else
-#include "jpeg_nbits_table.h"
-#define JPEG_NBITS(x)          (jpeg_nbits_table[x])
-#define JPEG_NBITS_NONZERO(x)  JPEG_NBITS(x)
-#endif
+#include "jpeg_nbits.h"
 
 
 /* Expanded entropy encoder object for progressive Huffman encoding. */
@@ -83,11 +55,11 @@ typedef struct {
   /* Pointer to routine to prepare data for encode_mcu_AC_first() */
   void (*AC_first_prepare) (const JCOEF *block,
                             const int *jpeg_natural_order_start, int Sl,
-                            int Al, JCOEF *values, size_t *zerobits);
+                            int Al, UJCOEF *values, size_t *zerobits);
   /* Pointer to routine to prepare data for encode_mcu_AC_refine() */
   int (*AC_refine_prepare) (const JCOEF *block,
                             const int *jpeg_natural_order_start, int Sl,
-                            int Al, JCOEF *absvalues, size_t *bits);
+                            int Al, UJCOEF *absvalues, size_t *bits);
 
   /* Mode flag: TRUE for optimization, FALSE for actual data output */
   boolean gather_statistics;
@@ -157,14 +129,14 @@ METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
                                        JBLOCKROW *MCU_data);
 METHODDEF(void) encode_mcu_AC_first_prepare
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *values, size_t *zerobits);
+   UJCOEF *values, size_t *zerobits);
 METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
                                        JBLOCKROW *MCU_data);
 METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
                                         JBLOCKROW *MCU_data);
 METHODDEF(int) encode_mcu_AC_refine_prepare
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *absvalues, size_t *bits);
+   UJCOEF *absvalues, size_t *bits);
 METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo,
                                         JBLOCKROW *MCU_data);
 METHODDEF(void) finish_pass_phuff(j_compress_ptr cinfo);
@@ -224,18 +196,22 @@ start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
       entropy->pub.encode_mcu = encode_mcu_DC_first;
     else
       entropy->pub.encode_mcu = encode_mcu_AC_first;
+#ifdef WITH_SIMD
     if (jsimd_can_encode_mcu_AC_first_prepare())
       entropy->AC_first_prepare = jsimd_encode_mcu_AC_first_prepare;
     else
+#endif
       entropy->AC_first_prepare = encode_mcu_AC_first_prepare;
   } else {
     if (is_DC_band)
       entropy->pub.encode_mcu = encode_mcu_DC_refine;
     else {
       entropy->pub.encode_mcu = encode_mcu_AC_refine;
+#ifdef WITH_SIMD
       if (jsimd_can_encode_mcu_AC_refine_prepare())
         entropy->AC_refine_prepare = jsimd_encode_mcu_AC_refine_prepare;
       else
+#endif
         entropy->AC_refine_prepare = encode_mcu_AC_refine_prepare;
       /* AC refinement needs a correction bit buffer */
       if (entropy->bit_buffer == NULL)
@@ -490,6 +466,7 @@ encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   JBLOCKROW block;
   jpeg_component_info *compptr;
   ISHIFT_TEMPS
+  int max_coef_bits = cinfo->data_precision + 2;
 
   entropy->next_output_byte = cinfo->dest->next_output_byte;
   entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -532,7 +509,7 @@ encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     /* Check for out-of-range coefficient values.
      * Since we're encoding a difference, the range limit is twice as much.
      */
-    if (nbits > MAX_COEF_BITS + 1)
+    if (nbits > max_coef_bits + 1)
       ERREXIT(cinfo, JERR_BAD_DCT_COEF);
 
     /* Count/emit the Huffman-coded symbol for the number of bits */
@@ -584,8 +561,8 @@ encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
       continue; \
     /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
     temp2 ^= temp; \
-    values[k] = (JCOEF)temp; \
-    values[k + DCTSIZE2] = (JCOEF)temp2; \
+    values[k] = (UJCOEF)temp; \
+    values[k + DCTSIZE2] = (UJCOEF)temp2; \
     zerobits |= ((size_t)1U) << k; \
   } \
 }
@@ -593,7 +570,7 @@ encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 METHODDEF(void)
 encode_mcu_AC_first_prepare(const JCOEF *block,
                             const int *jpeg_natural_order_start, int Sl,
-                            int Al, JCOEF *values, size_t *bits)
+                            int Al, UJCOEF *values, size_t *bits)
 {
   register int k, temp, temp2;
   size_t zerobits = 0U;
@@ -643,7 +620,7 @@ label \
     /* Find the number of bits needed for the magnitude of the coefficient */ \
     nbits = JPEG_NBITS_NONZERO(temp);  /* there must be at least one 1 bit */ \
     /* Check for out-of-range coefficient values */ \
-    if (nbits > MAX_COEF_BITS) \
+    if (nbits > max_coef_bits) \
       ERREXIT(cinfo, JERR_BAD_DCT_COEF); \
     \
     /* Count/emit Huffman symbol for run length / number of bits */ \
@@ -666,11 +643,12 @@ encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   register int nbits, r;
   int Sl = cinfo->Se - cinfo->Ss + 1;
   int Al = cinfo->Al;
-  JCOEF values_unaligned[2 * DCTSIZE2 + 15];
-  JCOEF *values;
-  const JCOEF *cvalue;
+  UJCOEF values_unaligned[2 * DCTSIZE2 + 15];
+  UJCOEF *values;
+  const UJCOEF *cvalue;
   size_t zerobits;
   size_t bits[8 / SIZEOF_SIZE_T];
+  int max_coef_bits = cinfo->data_precision + 2;
 
   entropy->next_output_byte = cinfo->dest->next_output_byte;
   entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -681,7 +659,7 @@ encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
       emit_restart(entropy, entropy->next_restart_num);
 
 #ifdef WITH_SIMD
-  cvalue = values = (JCOEF *)PAD((JUINTPTR)values_unaligned, 16);
+  cvalue = values = (UJCOEF *)PAD((JUINTPTR)values_unaligned, 16);
 #else
   /* Not using SIMD, so alignment is not needed */
   cvalue = values = values_unaligned;
@@ -815,7 +793,7 @@ encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
       zerobits |= ((size_t)1U) << k; \
       signbits |= ((size_t)(temp2 + 1)) << k; \
     } \
-    absvalues[k] = (JCOEF)temp; /* save abs value for main pass */ \
+    absvalues[k] = (UJCOEF)temp; /* save abs value for main pass */ \
     if (temp == 1) \
       EOB = k + koffset;        /* EOB = index of last newly-nonzero coef */ \
   } \
@@ -824,7 +802,7 @@ encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 METHODDEF(int)
 encode_mcu_AC_refine_prepare(const JCOEF *block,
                              const int *jpeg_natural_order_start, int Sl,
-                             int Al, JCOEF *absvalues, size_t *bits)
+                             int Al, UJCOEF *absvalues, size_t *bits)
 {
   register int k, temp, temp2;
   int EOB = 0;
@@ -931,9 +909,9 @@ encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   unsigned int BR;
   int Sl = cinfo->Se - cinfo->Ss + 1;
   int Al = cinfo->Al;
-  JCOEF absvalues_unaligned[DCTSIZE2 + 15];
-  JCOEF *absvalues;
-  const JCOEF *cabsvalue, *EOBPTR;
+  UJCOEF absvalues_unaligned[DCTSIZE2 + 15];
+  UJCOEF *absvalues;
+  const UJCOEF *cabsvalue, *EOBPTR;
   size_t zerobits, signbits;
   size_t bits[16 / SIZEOF_SIZE_T];
 
@@ -946,7 +924,7 @@ encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
       emit_restart(entropy, entropy->next_restart_num);
 
 #ifdef WITH_SIMD
-  cabsvalue = absvalues = (JCOEF *)PAD((JUINTPTR)absvalues_unaligned, 16);
+  cabsvalue = absvalues = (UJCOEF *)PAD((JUINTPTR)absvalues_unaligned, 16);
 #else
   /* Not using SIMD, so alignment is not needed */
   cabsvalue = absvalues = absvalues_unaligned;
diff --git a/3rdparty/libjpeg-turbo/src/jcprepct.c b/3rdparty/libjpeg-turbo/src/jcprepct.c
index f27cc345079e..ac2311c1388e 100644
--- a/3rdparty/libjpeg-turbo/src/jcprepct.c
+++ b/3rdparty/libjpeg-turbo/src/jcprepct.c
@@ -1,8 +1,10 @@
 /*
  * jcprepct.c
  *
- * This file is part of the Independent JPEG Group's software:
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -20,8 +22,11 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsamplecomp.h"
 
 
+#if BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED)
+
 /* At present, jcsample.c can request context rows only for smoothing.
  * In the future, we might also need context rows for CCIR601 sampling
  * or other more-complex downsampling procedures.  The code to support
@@ -59,7 +64,7 @@ typedef struct {
   /* Downsampling input buffer.  This buffer holds color-converted data
    * until we have enough to do a downsample step.
    */
-  JSAMPARRAY color_buf[MAX_COMPONENTS];
+  _JSAMPARRAY color_buf[MAX_COMPONENTS];
 
   JDIMENSION rows_to_go;        /* counts rows remaining in source image */
   int next_buf_row;             /* index of next row to store in color_buf */
@@ -106,14 +111,14 @@ start_pass_prep(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
  */
 
 LOCAL(void)
-expand_bottom_edge(JSAMPARRAY image_data, JDIMENSION num_cols, int input_rows,
+expand_bottom_edge(_JSAMPARRAY image_data, JDIMENSION num_cols, int input_rows,
                    int output_rows)
 {
   register int row;
 
   for (row = input_rows; row < output_rows; row++) {
-    jcopy_sample_rows(image_data, input_rows - 1, image_data, row, 1,
-                      num_cols);
+    _jcopy_sample_rows(image_data, input_rows - 1, image_data, row, 1,
+                       num_cols);
   }
 }
 
@@ -128,15 +133,16 @@ expand_bottom_edge(JSAMPARRAY image_data, JDIMENSION num_cols, int input_rows,
  */
 
 METHODDEF(void)
-pre_process_data(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+pre_process_data(j_compress_ptr cinfo, _JSAMPARRAY input_buf,
                  JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
-                 JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+                 _JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
                  JDIMENSION out_row_groups_avail)
 {
   my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
   int numrows, ci;
   JDIMENSION inrows;
   jpeg_component_info *compptr;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
 
   while (*in_row_ctr < in_rows_avail &&
          *out_row_group_ctr < out_row_groups_avail) {
@@ -144,10 +150,10 @@ pre_process_data(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     inrows = in_rows_avail - *in_row_ctr;
     numrows = cinfo->max_v_samp_factor - prep->next_buf_row;
     numrows = (int)MIN((JDIMENSION)numrows, inrows);
-    (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
-                                       prep->color_buf,
-                                       (JDIMENSION)prep->next_buf_row,
-                                       numrows);
+    (*cinfo->cconvert->_color_convert) (cinfo, input_buf + *in_row_ctr,
+                                        prep->color_buf,
+                                        (JDIMENSION)prep->next_buf_row,
+                                        numrows);
     *in_row_ctr += numrows;
     prep->next_buf_row += numrows;
     prep->rows_to_go -= numrows;
@@ -162,9 +168,9 @@ pre_process_data(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     }
     /* If we've filled the conversion buffer, empty it. */
     if (prep->next_buf_row == cinfo->max_v_samp_factor) {
-      (*cinfo->downsample->downsample) (cinfo,
-                                        prep->color_buf, (JDIMENSION)0,
-                                        output_buf, *out_row_group_ctr);
+      (*cinfo->downsample->_downsample) (cinfo,
+                                         prep->color_buf, (JDIMENSION)0,
+                                         output_buf, *out_row_group_ctr);
       prep->next_buf_row = 0;
       (*out_row_group_ctr)++;
     }
@@ -174,7 +180,8 @@ pre_process_data(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     if (prep->rows_to_go == 0 && *out_row_group_ctr < out_row_groups_avail) {
       for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
            ci++, compptr++) {
-        expand_bottom_edge(output_buf[ci], compptr->width_in_blocks * DCTSIZE,
+        expand_bottom_edge(output_buf[ci],
+                           compptr->width_in_blocks * data_unit,
                            (int)(*out_row_group_ctr * compptr->v_samp_factor),
                            (int)(out_row_groups_avail * compptr->v_samp_factor));
       }
@@ -192,9 +199,9 @@ pre_process_data(j_compress_ptr cinfo, JSAMPARRAY input_buf,
  */
 
 METHODDEF(void)
-pre_process_context(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+pre_process_context(j_compress_ptr cinfo, _JSAMPARRAY input_buf,
                     JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
-                    JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+                    _JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
                     JDIMENSION out_row_groups_avail)
 {
   my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
@@ -208,17 +215,17 @@ pre_process_context(j_compress_ptr cinfo, JSAMPARRAY input_buf,
       inrows = in_rows_avail - *in_row_ctr;
       numrows = prep->next_buf_stop - prep->next_buf_row;
       numrows = (int)MIN((JDIMENSION)numrows, inrows);
-      (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
-                                         prep->color_buf,
-                                         (JDIMENSION)prep->next_buf_row,
-                                         numrows);
+      (*cinfo->cconvert->_color_convert) (cinfo, input_buf + *in_row_ctr,
+                                          prep->color_buf,
+                                          (JDIMENSION)prep->next_buf_row,
+                                          numrows);
       /* Pad at top of image, if first time through */
       if (prep->rows_to_go == cinfo->image_height) {
         for (ci = 0; ci < cinfo->num_components; ci++) {
           int row;
           for (row = 1; row <= cinfo->max_v_samp_factor; row++) {
-            jcopy_sample_rows(prep->color_buf[ci], 0, prep->color_buf[ci],
-                              -row, 1, cinfo->image_width);
+            _jcopy_sample_rows(prep->color_buf[ci], 0, prep->color_buf[ci],
+                               -row, 1, cinfo->image_width);
           }
         }
       }
@@ -240,9 +247,9 @@ pre_process_context(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     }
     /* If we've gotten enough data, downsample a row group. */
     if (prep->next_buf_row == prep->next_buf_stop) {
-      (*cinfo->downsample->downsample) (cinfo, prep->color_buf,
-                                        (JDIMENSION)prep->this_row_group,
-                                        output_buf, *out_row_group_ctr);
+      (*cinfo->downsample->_downsample) (cinfo, prep->color_buf,
+                                         (JDIMENSION)prep->this_row_group,
+                                         output_buf, *out_row_group_ctr);
       (*out_row_group_ctr)++;
       /* Advance pointers with wraparound as necessary. */
       prep->this_row_group += cinfo->max_v_samp_factor;
@@ -267,15 +274,16 @@ create_context_buffer(j_compress_ptr cinfo)
   int rgroup_height = cinfo->max_v_samp_factor;
   int ci, i;
   jpeg_component_info *compptr;
-  JSAMPARRAY true_buffer, fake_buffer;
+  _JSAMPARRAY true_buffer, fake_buffer;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
 
   /* Grab enough space for fake row pointers for all the components;
    * we need five row groups' worth of pointers for each component.
    */
-  fake_buffer = (JSAMPARRAY)
+  fake_buffer = (_JSAMPARRAY)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 (cinfo->num_components * 5 * rgroup_height) *
-                                sizeof(JSAMPROW));
+                                sizeof(_JSAMPROW));
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -283,14 +291,14 @@ create_context_buffer(j_compress_ptr cinfo)
      * We make the buffer wide enough to allow the downsampler to edge-expand
      * horizontally within the buffer, if it so chooses.
      */
-    true_buffer = (*cinfo->mem->alloc_sarray)
+    true_buffer = (_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
       ((j_common_ptr)cinfo, JPOOL_IMAGE,
-       (JDIMENSION)(((long)compptr->width_in_blocks * DCTSIZE *
+       (JDIMENSION)(((long)compptr->width_in_blocks * data_unit *
                      cinfo->max_h_samp_factor) / compptr->h_samp_factor),
        (JDIMENSION)(3 * rgroup_height));
     /* Copy true buffer row pointers into the middle of the fake row array */
     memcpy(fake_buffer + rgroup_height, true_buffer,
-           3 * rgroup_height * sizeof(JSAMPROW));
+           3 * rgroup_height * sizeof(_JSAMPROW));
     /* Fill in the above and below wraparound pointers */
     for (i = 0; i < rgroup_height; i++) {
       fake_buffer[i] = true_buffer[2 * rgroup_height + i];
@@ -309,11 +317,15 @@ create_context_buffer(j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_c_prep_controller(j_compress_ptr cinfo, boolean need_full_buffer)
+_jinit_c_prep_controller(j_compress_ptr cinfo, boolean need_full_buffer)
 {
   my_prep_ptr prep;
   int ci;
   jpeg_component_info *compptr;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
+
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
 
   if (need_full_buffer)         /* safety check */
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -331,21 +343,23 @@ jinit_c_prep_controller(j_compress_ptr cinfo, boolean need_full_buffer)
   if (cinfo->downsample->need_context_rows) {
     /* Set up to provide context rows */
 #ifdef CONTEXT_ROWS_SUPPORTED
-    prep->pub.pre_process_data = pre_process_context;
+    prep->pub._pre_process_data = pre_process_context;
     create_context_buffer(cinfo);
 #else
     ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
   } else {
     /* No context, just make it tall enough for one row group */
-    prep->pub.pre_process_data = pre_process_data;
+    prep->pub._pre_process_data = pre_process_data;
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
          ci++, compptr++) {
-      prep->color_buf[ci] = (*cinfo->mem->alloc_sarray)
+      prep->color_buf[ci] = (_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
         ((j_common_ptr)cinfo, JPOOL_IMAGE,
-         (JDIMENSION)(((long)compptr->width_in_blocks * DCTSIZE *
+         (JDIMENSION)(((long)compptr->width_in_blocks * data_unit *
                        cinfo->max_h_samp_factor) / compptr->h_samp_factor),
          (JDIMENSION)cinfo->max_v_samp_factor);
     }
   }
 }
+
+#endif /* BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED) */
diff --git a/3rdparty/libjpeg-turbo/src/jcsample.c b/3rdparty/libjpeg-turbo/src/jcsample.c
index e8515ebf0fce..30e6e54b4058 100644
--- a/3rdparty/libjpeg-turbo/src/jcsample.c
+++ b/3rdparty/libjpeg-turbo/src/jcsample.c
@@ -3,10 +3,12 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, 2019, D. R. Commander.
+ * Copyright (C) 2015, 2019, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -54,13 +56,16 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jsimd.h"
+#include "jsamplecomp.h"
 
 
+#if BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED)
+
 /* Pointer to routine to downsample a single component */
 typedef void (*downsample1_ptr) (j_compress_ptr cinfo,
                                  jpeg_component_info *compptr,
-                                 JSAMPARRAY input_data,
-                                 JSAMPARRAY output_data);
+                                 _JSAMPARRAY input_data,
+                                 _JSAMPARRAY output_data);
 
 /* Private subobject */
 
@@ -91,11 +96,11 @@ start_pass_downsample(j_compress_ptr cinfo)
  */
 
 LOCAL(void)
-expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+expand_right_edge(_JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
                   JDIMENSION output_cols)
 {
-  register JSAMPROW ptr;
-  register JSAMPLE pixval;
+  register _JSAMPROW ptr;
+  register _JSAMPLE pixval;
   register int count;
   int row;
   int numcols = (int)(output_cols - input_cols);
@@ -118,14 +123,14 @@ expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
  */
 
 METHODDEF(void)
-sep_downsample(j_compress_ptr cinfo, JSAMPIMAGE input_buf,
-               JDIMENSION in_row_index, JSAMPIMAGE output_buf,
+sep_downsample(j_compress_ptr cinfo, _JSAMPIMAGE input_buf,
+               JDIMENSION in_row_index, _JSAMPIMAGE output_buf,
                JDIMENSION out_row_group_index)
 {
   my_downsample_ptr downsample = (my_downsample_ptr)cinfo->downsample;
   int ci;
   jpeg_component_info *compptr;
-  JSAMPARRAY in_ptr, out_ptr;
+  _JSAMPARRAY in_ptr, out_ptr;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -145,12 +150,13 @@ sep_downsample(j_compress_ptr cinfo, JSAMPIMAGE input_buf,
 
 METHODDEF(void)
 int_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
-               JSAMPARRAY input_data, JSAMPARRAY output_data)
+               _JSAMPARRAY input_data, _JSAMPARRAY output_data)
 {
   int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v;
   JDIMENSION outcol, outcol_h;  /* outcol_h == outcol*h_expand */
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
-  JSAMPROW inptr, outptr;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * data_unit;
+  _JSAMPROW inptr, outptr;
   JLONG outvalue;
 
   h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor;
@@ -177,7 +183,7 @@ int_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
           outvalue += (JLONG)(*inptr++);
         }
       }
-      *outptr++ = (JSAMPLE)((outvalue + numpix2) / numpix);
+      *outptr++ = (_JSAMPLE)((outvalue + numpix2) / numpix);
     }
     inrow += v_expand;
   }
@@ -192,14 +198,16 @@ int_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
 
 METHODDEF(void)
 fullsize_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
-                    JSAMPARRAY input_data, JSAMPARRAY output_data)
+                    _JSAMPARRAY input_data, _JSAMPARRAY output_data)
 {
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
+
   /* Copy the data */
-  jcopy_sample_rows(input_data, 0, output_data, 0, cinfo->max_v_samp_factor,
-                    cinfo->image_width);
+  _jcopy_sample_rows(input_data, 0, output_data, 0, cinfo->max_v_samp_factor,
+                     cinfo->image_width);
   /* Edge-expand */
   expand_right_edge(output_data, cinfo->max_v_samp_factor, cinfo->image_width,
-                    compptr->width_in_blocks * DCTSIZE);
+                    compptr->width_in_blocks * data_unit);
 }
 
 
@@ -217,12 +225,13 @@ fullsize_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
 
 METHODDEF(void)
 h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
-                JSAMPARRAY input_data, JSAMPARRAY output_data)
+                _JSAMPARRAY input_data, _JSAMPARRAY output_data)
 {
   int outrow;
   JDIMENSION outcol;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
-  register JSAMPROW inptr, outptr;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * data_unit;
+  register _JSAMPROW inptr, outptr;
   register int bias;
 
   /* Expand input data enough to let all the output samples be generated
@@ -237,7 +246,7 @@ h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     inptr = input_data[outrow];
     bias = 0;                   /* bias = 0,1,0,1,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
-      *outptr++ = (JSAMPLE)((inptr[0] + inptr[1] + bias) >> 1);
+      *outptr++ = (_JSAMPLE)((inptr[0] + inptr[1] + bias) >> 1);
       bias ^= 1;                /* 0=>1, 1=>0 */
       inptr += 2;
     }
@@ -253,12 +262,13 @@ h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
 
 METHODDEF(void)
 h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
-                JSAMPARRAY input_data, JSAMPARRAY output_data)
+                _JSAMPARRAY input_data, _JSAMPARRAY output_data)
 {
   int inrow, outrow;
   JDIMENSION outcol;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
-  register JSAMPROW inptr0, inptr1, outptr;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * data_unit;
+  register _JSAMPROW inptr0, inptr1, outptr;
   register int bias;
 
   /* Expand input data enough to let all the output samples be generated
@@ -275,8 +285,8 @@ h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     inptr1 = input_data[inrow + 1];
     bias = 1;                   /* bias = 1,2,1,2,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
-      *outptr++ =
-        (JSAMPLE)((inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1] + bias) >> 2);
+      *outptr++ = (_JSAMPLE)
+        ((inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1] + bias) >> 2);
       bias ^= 3;                /* 1=>2, 2=>1 */
       inptr0 += 2;  inptr1 += 2;
     }
@@ -295,12 +305,13 @@ h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
 
 METHODDEF(void)
 h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+                       _JSAMPARRAY input_data, _JSAMPARRAY output_data)
 {
   int inrow, outrow;
   JDIMENSION colctr;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
-  register JSAMPROW inptr0, inptr1, above_ptr, below_ptr, outptr;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * data_unit;
+  register _JSAMPROW inptr0, inptr1, above_ptr, below_ptr, outptr;
   JLONG membersum, neighsum, memberscale, neighscale;
 
   /* Expand input data enough to let all the output samples be generated
@@ -341,7 +352,7 @@ h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     neighsum += neighsum;
     neighsum += above_ptr[0] + above_ptr[2] + below_ptr[0] + below_ptr[2];
     membersum = membersum * memberscale + neighsum * neighscale;
-    *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+    *outptr++ = (_JSAMPLE)((membersum + 32768) >> 16);
     inptr0 += 2;  inptr1 += 2;  above_ptr += 2;  below_ptr += 2;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
@@ -357,7 +368,7 @@ h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
       /* form final output scaled up by 2^16 */
       membersum = membersum * memberscale + neighsum * neighscale;
       /* round, descale and output it */
-      *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+      *outptr++ = (_JSAMPLE)((membersum + 32768) >> 16);
       inptr0 += 2;  inptr1 += 2;  above_ptr += 2;  below_ptr += 2;
     }
 
@@ -368,7 +379,7 @@ h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     neighsum += neighsum;
     neighsum += above_ptr[-1] + above_ptr[1] + below_ptr[-1] + below_ptr[1];
     membersum = membersum * memberscale + neighsum * neighscale;
-    *outptr = (JSAMPLE)((membersum + 32768) >> 16);
+    *outptr = (_JSAMPLE)((membersum + 32768) >> 16);
 
     inrow += 2;
   }
@@ -383,12 +394,13 @@ h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
 
 METHODDEF(void)
 fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
-                           JSAMPARRAY input_data, JSAMPARRAY output_data)
+                           _JSAMPARRAY input_data, _JSAMPARRAY output_data)
 {
   int outrow;
   JDIMENSION colctr;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
-  register JSAMPROW inptr, above_ptr, below_ptr, outptr;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * data_unit;
+  register _JSAMPROW inptr, above_ptr, below_ptr, outptr;
   JLONG membersum, neighsum, memberscale, neighscale;
   int colsum, lastcolsum, nextcolsum;
 
@@ -420,7 +432,7 @@ fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
     neighsum = colsum + (colsum - membersum) + nextcolsum;
     membersum = membersum * memberscale + neighsum * neighscale;
-    *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+    *outptr++ = (_JSAMPLE)((membersum + 32768) >> 16);
     lastcolsum = colsum;  colsum = nextcolsum;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
@@ -429,7 +441,7 @@ fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
       nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
       neighsum = lastcolsum + (colsum - membersum) + nextcolsum;
       membersum = membersum * memberscale + neighsum * neighscale;
-      *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+      *outptr++ = (_JSAMPLE)((membersum + 32768) >> 16);
       lastcolsum = colsum;  colsum = nextcolsum;
     }
 
@@ -437,7 +449,7 @@ fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     membersum = *inptr;
     neighsum = lastcolsum + (colsum - membersum) + colsum;
     membersum = membersum * memberscale + neighsum * neighscale;
-    *outptr = (JSAMPLE)((membersum + 32768) >> 16);
+    *outptr = (_JSAMPLE)((membersum + 32768) >> 16);
 
   }
 }
@@ -451,19 +463,22 @@ fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jinit_downsampler(j_compress_ptr cinfo)
+_jinit_downsampler(j_compress_ptr cinfo)
 {
   my_downsample_ptr downsample;
   int ci;
   jpeg_component_info *compptr;
   boolean smoothok = TRUE;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   downsample = (my_downsample_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_downsampler));
   cinfo->downsample = (struct jpeg_downsampler *)downsample;
   downsample->pub.start_pass = start_pass_downsample;
-  downsample->pub.downsample = sep_downsample;
+  downsample->pub._downsample = sep_downsample;
   downsample->pub.need_context_rows = FALSE;
 
   if (cinfo->CCIR601_sampling)
@@ -484,15 +499,17 @@ jinit_downsampler(j_compress_ptr cinfo)
     } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
                compptr->v_samp_factor == cinfo->max_v_samp_factor) {
       smoothok = FALSE;
+#ifdef WITH_SIMD
       if (jsimd_can_h2v1_downsample())
         downsample->methods[ci] = jsimd_h2v1_downsample;
       else
+#endif
         downsample->methods[ci] = h2v1_downsample;
     } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
                compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
       if (cinfo->smoothing_factor) {
-#if defined(__mips__)
+#if defined(WITH_SIMD) && defined(__mips__)
         if (jsimd_can_h2v2_smooth_downsample())
           downsample->methods[ci] = jsimd_h2v2_smooth_downsample;
         else
@@ -502,9 +519,11 @@ jinit_downsampler(j_compress_ptr cinfo)
       } else
 #endif
       {
+#ifdef WITH_SIMD
         if (jsimd_can_h2v2_downsample())
           downsample->methods[ci] = jsimd_h2v2_downsample;
         else
+#endif
           downsample->methods[ci] = h2v2_downsample;
       }
     } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
@@ -520,3 +539,5 @@ jinit_downsampler(j_compress_ptr cinfo)
     TRACEMS(cinfo, 0, JTRC_SMOOTH_NOTIMPL);
 #endif
 }
+
+#endif /* BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED) */
diff --git a/3rdparty/libjpeg-turbo/src/jcstest.c b/3rdparty/libjpeg-turbo/src/jcstest.c
new file mode 100644
index 000000000000..8b1fe38082b1
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jcstest.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* This program demonstrates how to check for the colorspace extension
+   capabilities of libjpeg-turbo at both compile time and run time. */
+
+#include <stdio.h>
+#include <jpeglib.h>
+#include <jerror.h>
+#include <setjmp.h>
+
+#ifndef JCS_EXTENSIONS
+#define JCS_EXT_RGB  6
+#endif
+#if !defined(JCS_EXTENSIONS) || !defined(JCS_ALPHA_EXTENSIONS)
+#define JCS_EXT_RGBA  12
+#endif
+
+static char lasterror[JMSG_LENGTH_MAX] = "No error";
+
+typedef struct _error_mgr {
+  struct jpeg_error_mgr pub;
+  jmp_buf jb;
+} error_mgr;
+
+static void my_error_exit(j_common_ptr cinfo)
+{
+  error_mgr *myerr = (error_mgr *)cinfo->err;
+  (*cinfo->err->output_message) (cinfo);
+  longjmp(myerr->jb, 1);
+}
+
+static void my_output_message(j_common_ptr cinfo)
+{
+  (*cinfo->err->format_message) (cinfo, lasterror);
+}
+
+int main(void)
+{
+  int jcs_valid = -1, jcs_alpha_valid = -1;
+  struct jpeg_compress_struct cinfo;
+  error_mgr jerr;
+
+  printf("libjpeg-turbo colorspace extensions:\n");
+#if JCS_EXTENSIONS
+  printf("  Present at compile time\n");
+#else
+  printf("  Not present at compile time\n");
+#endif
+
+  cinfo.err = jpeg_std_error(&jerr.pub);
+  jerr.pub.error_exit = my_error_exit;
+  jerr.pub.output_message = my_output_message;
+
+  if (setjmp(jerr.jb)) {
+    /* this will execute if libjpeg has an error */
+    jcs_valid = 0;
+    goto done;
+  }
+
+  jpeg_create_compress(&cinfo);
+  cinfo.input_components = 3;
+  jpeg_set_defaults(&cinfo);
+  cinfo.in_color_space = JCS_EXT_RGB;
+  jpeg_default_colorspace(&cinfo);
+  jcs_valid = 1;
+
+done:
+  if (jcs_valid)
+    printf("  Working properly\n");
+  else
+    printf("  Not working properly.  Error returned was:\n    %s\n",
+           lasterror);
+
+  printf("libjpeg-turbo alpha colorspace extensions:\n");
+#if JCS_ALPHA_EXTENSIONS
+  printf("  Present at compile time\n");
+#else
+  printf("  Not present at compile time\n");
+#endif
+
+  if (setjmp(jerr.jb)) {
+    /* this will execute if libjpeg has an error */
+    jcs_alpha_valid = 0;
+    goto done2;
+  }
+
+  cinfo.in_color_space = JCS_EXT_RGBA;
+  jpeg_default_colorspace(&cinfo);
+  jcs_alpha_valid = 1;
+
+done2:
+  if (jcs_alpha_valid)
+    printf("  Working properly\n");
+  else
+    printf("  Not working properly.  Error returned was:\n    %s\n",
+           lasterror);
+
+  jpeg_destroy_compress(&cinfo);
+  return 0;
+}
diff --git a/3rdparty/libjpeg-turbo/src/jctrans.c b/3rdparty/libjpeg-turbo/src/jctrans.c
index e121028ec70d..ae52e3989ee0 100644
--- a/3rdparty/libjpeg-turbo/src/jctrans.c
+++ b/3rdparty/libjpeg-turbo/src/jctrans.c
@@ -17,7 +17,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jpegcomp.h"
+#include "jpegapicomp.h"
 
 
 /* Forward declarations */
@@ -42,6 +42,9 @@ LOCAL(void) transencode_coef_controller(j_compress_ptr cinfo,
 GLOBAL(void)
 jpeg_write_coefficients(j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
 {
+  if (cinfo->master->lossless)
+    ERREXIT(cinfo, JERR_NOTIMPL);
+
   if (cinfo->global_state != CSTATE_START)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
   /* Mark all tables to be written */
@@ -72,6 +75,9 @@ jpeg_copy_critical_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo)
   JQUANT_TBL *c_quant, *slot_quant;
   int tblno, ci, coefi;
 
+  if (srcinfo->master->lossless)
+    ERREXIT(dstinfo, JERR_NOTIMPL);
+
   /* Safety check to ensure start_compress not called yet. */
   if (dstinfo->global_state != CSTATE_START)
     ERREXIT1(dstinfo, JERR_BAD_STATE, dstinfo->global_state);
@@ -364,6 +370,13 @@ compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 }
 
 
+METHODDEF(boolean)
+compress_output_12(j_compress_ptr cinfo, J12SAMPIMAGE input_buf)
+{
+  return compress_output(cinfo, (JSAMPIMAGE)input_buf);
+}
+
+
 /*
  * Initialize coefficient buffer controller.
  *
@@ -386,6 +399,7 @@ transencode_coef_controller(j_compress_ptr cinfo,
   cinfo->coef = (struct jpeg_c_coef_controller *)coef;
   coef->pub.start_pass = start_pass_coef;
   coef->pub.compress_data = compress_output;
+  coef->pub.compress_data_12 = compress_output_12;
 
   /* Save pointer to virtual arrays */
   coef->whole_image = coef_arrays;
diff --git a/3rdparty/libjpeg-turbo/src/jdapimin.c b/3rdparty/libjpeg-turbo/src/jdapimin.c
index f50c27edc323..30d92841a8cb 100644
--- a/3rdparty/libjpeg-turbo/src/jdapimin.c
+++ b/3rdparty/libjpeg-turbo/src/jdapimin.c
@@ -3,8 +3,10 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2016, 2022, D. R. Commander.
+ * Copyright (C) 2016, 2022, 2024, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -23,7 +25,6 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdmaster.h"
-#include "jconfigint.h"
 
 
 /*
@@ -83,6 +84,8 @@ jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize)
   /* And initialize the overall input controller. */
   jinit_input_controller(cinfo);
 
+  cinfo->data_precision = BITS_IN_JSAMPLE;
+
   /* OK, I'm ready */
   cinfo->global_state = DSTATE_START;
 
@@ -157,13 +160,19 @@ default_decompress_parms(j_decompress_ptr cinfo)
       int cid1 = cinfo->comp_info[1].component_id;
       int cid2 = cinfo->comp_info[2].component_id;
 
-      if (cid0 == 1 && cid1 == 2 && cid2 == 3)
-        cinfo->jpeg_color_space = JCS_YCbCr; /* assume JFIF w/out marker */
-      else if (cid0 == 82 && cid1 == 71 && cid2 == 66)
+      if (cid0 == 1 && cid1 == 2 && cid2 == 3) {
+        if (cinfo->master->lossless)
+          cinfo->jpeg_color_space = JCS_RGB; /* assume RGB w/out marker */
+        else
+          cinfo->jpeg_color_space = JCS_YCbCr; /* assume JFIF w/out marker */
+      } else if (cid0 == 82 && cid1 == 71 && cid2 == 66)
         cinfo->jpeg_color_space = JCS_RGB; /* ASCII 'R', 'G', 'B' */
       else {
         TRACEMS3(cinfo, 1, JTRC_UNKNOWN_IDS, cid0, cid1, cid2);
-        cinfo->jpeg_color_space = JCS_YCbCr; /* assume it's YCbCr */
+        if (cinfo->master->lossless)
+          cinfo->jpeg_color_space = JCS_RGB; /* assume it's RGB */
+        else
+          cinfo->jpeg_color_space = JCS_YCbCr; /* assume it's YCbCr */
       }
     }
     /* Always guess RGB is proper output colorspace. */
diff --git a/3rdparty/libjpeg-turbo/src/jdapistd.c b/3rdparty/libjpeg-turbo/src/jdapistd.c
index 8827d8abf5c5..1f4492723682 100644
--- a/3rdparty/libjpeg-turbo/src/jdapistd.c
+++ b/3rdparty/libjpeg-turbo/src/jdapistd.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2015-2020, 2022, D. R. Commander.
+ * Copyright (C) 2010, 2015-2020, 2022-2023, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -19,13 +19,20 @@
  */
 
 #include "jinclude.h"
+#if BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED)
 #include "jdmainct.h"
 #include "jdcoefct.h"
+#else
+#define JPEG_INTERNALS
+#include "jpeglib.h"
+#endif
 #include "jdmaster.h"
 #include "jdmerge.h"
 #include "jdsample.h"
 #include "jmemsys.h"
 
+#if BITS_IN_JSAMPLE == 8
+
 /* Forward declarations */
 LOCAL(boolean) output_pass_setup(j_decompress_ptr cinfo);
 
@@ -121,8 +128,20 @@ output_pass_setup(j_decompress_ptr cinfo)
       }
       /* Process some data */
       last_scanline = cinfo->output_scanline;
-      (*cinfo->main->process_data) (cinfo, (JSAMPARRAY)NULL,
-                                    &cinfo->output_scanline, (JDIMENSION)0);
+#ifdef D_LOSSLESS_SUPPORTED
+      if (cinfo->data_precision == 16)
+        (*cinfo->main->process_data_16) (cinfo, (J16SAMPARRAY)NULL,
+                                         &cinfo->output_scanline,
+                                         (JDIMENSION)0);
+      else
+#endif
+      if (cinfo->data_precision == 12)
+        (*cinfo->main->process_data_12) (cinfo, (J12SAMPARRAY)NULL,
+                                         &cinfo->output_scanline,
+                                         (JDIMENSION)0);
+      else
+        (*cinfo->main->process_data) (cinfo, (JSAMPARRAY)NULL,
+                                      &cinfo->output_scanline, (JDIMENSION)0);
       if (cinfo->output_scanline == last_scanline)
         return FALSE;           /* No progress made, must suspend */
     }
@@ -135,33 +154,46 @@ output_pass_setup(j_decompress_ptr cinfo)
 #endif /* QUANT_2PASS_SUPPORTED */
   }
   /* Ready for application to drive output pass through
-   * jpeg_read_scanlines or jpeg_read_raw_data.
+   * _jpeg_read_scanlines or _jpeg_read_raw_data.
    */
   cinfo->global_state = cinfo->raw_data_out ? DSTATE_RAW_OK : DSTATE_SCANNING;
   return TRUE;
 }
 
+#endif /* BITS_IN_JSAMPLE == 8 */
+
+
+#if BITS_IN_JSAMPLE != 16
 
 /*
  * Enable partial scanline decompression
  *
  * Must be called after jpeg_start_decompress() and before any calls to
- * jpeg_read_scanlines() or jpeg_skip_scanlines().
+ * _jpeg_read_scanlines() or _jpeg_skip_scanlines().
  *
  * Refer to libjpeg.txt for more information.
  */
 
 GLOBAL(void)
-jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
-                   JDIMENSION *width)
+_jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                    JDIMENSION *width)
 {
   int ci, align, orig_downsampled_width;
   JDIMENSION input_xoffset;
   boolean reinit_upsampler = FALSE;
   jpeg_component_info *compptr;
+#ifdef UPSAMPLE_MERGING_SUPPORTED
   my_master_ptr master = (my_master_ptr)cinfo->master;
+#endif
+
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
 
-  if (cinfo->global_state != DSTATE_SCANNING || cinfo->output_scanline != 0)
+  if (cinfo->master->lossless)
+    ERREXIT(cinfo, JERR_NOTIMPL);
+
+  if ((cinfo->global_state != DSTATE_SCANNING &&
+       cinfo->global_state != DSTATE_BUFIMAGE) || cinfo->output_scanline != 0)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
 
   if (!xoffset || !width)
@@ -209,11 +241,13 @@ jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
    */
   *width = *width + input_xoffset - *xoffset;
   cinfo->output_width = *width;
+#ifdef UPSAMPLE_MERGING_SUPPORTED
   if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
     my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
     upsample->out_row_width =
       cinfo->output_width * cinfo->out_color_components;
   }
+#endif
 
   /* Set the first and last iMCU columns that we must decompress.  These values
    * will be used in single-scan decompressions.
@@ -231,9 +265,11 @@ jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
     /* Set downsampled_width to the new output width. */
     orig_downsampled_width = compptr->downsampled_width;
     compptr->downsampled_width =
-      (JDIMENSION)jdiv_round_up((long)(cinfo->output_width *
-                                       compptr->h_samp_factor),
-                                (long)cinfo->max_h_samp_factor);
+      (JDIMENSION)jdiv_round_up((long)cinfo->output_width *
+                                (long)(compptr->h_samp_factor *
+                                       compptr->_DCT_scaled_size),
+                                (long)(cinfo->max_h_samp_factor *
+                                       cinfo->_min_DCT_scaled_size));
     if (compptr->downsampled_width < 2 && orig_downsampled_width >= 2)
       reinit_upsampler = TRUE;
 
@@ -249,11 +285,13 @@ jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
 
   if (reinit_upsampler) {
     cinfo->master->jinit_upsampler_no_alloc = TRUE;
-    jinit_upsampler(cinfo);
+    _jinit_upsampler(cinfo);
     cinfo->master->jinit_upsampler_no_alloc = FALSE;
   }
 }
 
+#endif /* BITS_IN_JSAMPLE != 16 */
+
 
 /*
  * Read some scanlines of data from the JPEG decompressor.
@@ -263,17 +301,21 @@ jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
  * including bottom of image, data source suspension, and operating
  * modes that emit multiple scanlines at a time.
  *
- * Note: we warn about excess calls to jpeg_read_scanlines() since
+ * Note: we warn about excess calls to _jpeg_read_scanlines() since
  * this likely signals an application programmer error.  However,
  * an oversize buffer (max_lines > scanlines remaining) is not an error.
  */
 
 GLOBAL(JDIMENSION)
-jpeg_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
-                    JDIMENSION max_lines)
+_jpeg_read_scanlines(j_decompress_ptr cinfo, _JSAMPARRAY scanlines,
+                     JDIMENSION max_lines)
 {
+#if BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED)
   JDIMENSION row_ctr;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   if (cinfo->global_state != DSTATE_SCANNING)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
   if (cinfo->output_scanline >= cinfo->output_height) {
@@ -290,30 +332,36 @@ jpeg_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
 
   /* Process some data */
   row_ctr = 0;
-  (*cinfo->main->process_data) (cinfo, scanlines, &row_ctr, max_lines);
+  (*cinfo->main->_process_data) (cinfo, scanlines, &row_ctr, max_lines);
   cinfo->output_scanline += row_ctr;
   return row_ctr;
+#else
+  ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+  return 0;
+#endif
 }
 
 
-/* Dummy color convert function used by jpeg_skip_scanlines() */
+#if BITS_IN_JSAMPLE != 16
+
+/* Dummy color convert function used by _jpeg_skip_scanlines() */
 LOCAL(void)
-noop_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-             JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+noop_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+             JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
 }
 
 
-/* Dummy quantize function used by jpeg_skip_scanlines() */
+/* Dummy quantize function used by _jpeg_skip_scanlines() */
 LOCAL(void)
-noop_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-              JSAMPARRAY output_buf, int num_rows)
+noop_quantize(j_decompress_ptr cinfo, _JSAMPARRAY input_buf,
+              _JSAMPARRAY output_buf, int num_rows)
 {
 }
 
 
 /*
- * In some cases, it is best to call jpeg_read_scanlines() and discard the
+ * In some cases, it is best to call _jpeg_read_scanlines() and discard the
  * output, rather than skipping the scanlines, because this allows us to
  * maintain the internal state of the context-based upsampler.  In these cases,
  * we set up and tear down a dummy color converter in order to avoid valgrind
@@ -324,49 +372,53 @@ LOCAL(void)
 read_and_discard_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
 {
   JDIMENSION n;
+#ifdef UPSAMPLE_MERGING_SUPPORTED
   my_master_ptr master = (my_master_ptr)cinfo->master;
-  JSAMPLE dummy_sample[1] = { 0 };
-  JSAMPROW dummy_row = dummy_sample;
-  JSAMPARRAY scanlines = NULL;
-  void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                         JDIMENSION input_row, JSAMPARRAY output_buf,
+#endif
+  _JSAMPLE dummy_sample[1] = { 0 };
+  _JSAMPROW dummy_row = dummy_sample;
+  _JSAMPARRAY scanlines = NULL;
+  void (*color_convert) (j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, _JSAMPARRAY output_buf,
                          int num_rows) = NULL;
-  void (*color_quantize) (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                          JSAMPARRAY output_buf, int num_rows) = NULL;
+  void (*color_quantize) (j_decompress_ptr cinfo, _JSAMPARRAY input_buf,
+                          _JSAMPARRAY output_buf, int num_rows) = NULL;
 
-  if (cinfo->cconvert && cinfo->cconvert->color_convert) {
-    color_convert = cinfo->cconvert->color_convert;
-    cinfo->cconvert->color_convert = noop_convert;
+  if (cinfo->cconvert && cinfo->cconvert->_color_convert) {
+    color_convert = cinfo->cconvert->_color_convert;
+    cinfo->cconvert->_color_convert = noop_convert;
     /* This just prevents UBSan from complaining about adding 0 to a NULL
      * pointer.  The pointer isn't actually used.
      */
     scanlines = &dummy_row;
   }
 
-  if (cinfo->cquantize && cinfo->cquantize->color_quantize) {
-    color_quantize = cinfo->cquantize->color_quantize;
-    cinfo->cquantize->color_quantize = noop_quantize;
+  if (cinfo->cquantize && cinfo->cquantize->_color_quantize) {
+    color_quantize = cinfo->cquantize->_color_quantize;
+    cinfo->cquantize->_color_quantize = noop_quantize;
   }
 
+#ifdef UPSAMPLE_MERGING_SUPPORTED
   if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
     my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
     scanlines = &upsample->spare_row;
   }
+#endif
 
   for (n = 0; n < num_lines; n++)
-    jpeg_read_scanlines(cinfo, scanlines, 1);
+    _jpeg_read_scanlines(cinfo, scanlines, 1);
 
   if (color_convert)
-    cinfo->cconvert->color_convert = color_convert;
+    cinfo->cconvert->_color_convert = color_convert;
 
   if (color_quantize)
-    cinfo->cquantize->color_quantize = color_quantize;
+    cinfo->cquantize->_color_quantize = color_quantize;
 }
 
 
 /*
- * Called by jpeg_skip_scanlines().  This partially skips a decompress block by
- * incrementing the rowgroup counter.
+ * Called by _jpeg_skip_scanlines().  This partially skips a decompress block
+ * by incrementing the rowgroup counter.
  */
 
 LOCAL(void)
@@ -405,7 +457,7 @@ increment_simple_rowgroup_ctr(j_decompress_ptr cinfo, JDIMENSION rows)
  */
 
 GLOBAL(JDIMENSION)
-jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
+_jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
 {
   my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
   my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
@@ -416,6 +468,12 @@ jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
   JDIMENSION lines_per_iMCU_row, lines_left_in_iMCU_row, lines_after_iMCU_row;
   JDIMENSION lines_to_skip, lines_to_read;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
+  if (cinfo->master->lossless)
+    ERREXIT(cinfo, JERR_NOTIMPL);
+
   /* Two-pass color quantization is not supported. */
   if (cinfo->quantize_colors && cinfo->two_pass_quantize)
     ERREXIT(cinfo, JERR_NOTIMPL);
@@ -517,7 +575,7 @@ jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
    * all of the entropy decoding occurs in jpeg_start_decompress(), assuming
    * that the input data source is non-suspending.  This makes skipping easy.
    */
-  if (cinfo->inputctl->has_multiple_scans) {
+  if (cinfo->inputctl->has_multiple_scans || cinfo->buffered_image) {
     if (cinfo->upsample->need_context_rows) {
       cinfo->output_scanline += lines_to_skip;
       cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
@@ -588,11 +646,17 @@ jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
  */
 
 GLOBAL(JDIMENSION)
-jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
-                   JDIMENSION max_lines)
+_jpeg_read_raw_data(j_decompress_ptr cinfo, _JSAMPIMAGE data,
+                    JDIMENSION max_lines)
 {
   JDIMENSION lines_per_iMCU_row;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
+  if (cinfo->master->lossless)
+    ERREXIT(cinfo, JERR_NOTIMPL);
+
   if (cinfo->global_state != DSTATE_RAW_OK)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
   if (cinfo->output_scanline >= cinfo->output_height) {
@@ -613,7 +677,7 @@ jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
   /* Decompress directly into user's buffer. */
-  if (!(*cinfo->coef->decompress_data) (cinfo, data))
+  if (!(*cinfo->coef->_decompress_data) (cinfo, data))
     return 0;                   /* suspension forced, can do nothing more */
 
   /* OK, we processed one iMCU row. */
@@ -621,6 +685,10 @@ jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
   return lines_per_iMCU_row;
 }
 
+#endif /* BITS_IN_JSAMPLE != 16 */
+
+
+#if BITS_IN_JSAMPLE == 8
 
 /* Additional entry points for buffered-image mode. */
 
@@ -678,3 +746,5 @@ jpeg_finish_output(j_decompress_ptr cinfo)
 }
 
 #endif /* D_MULTISCAN_FILES_SUPPORTED */
+
+#endif /* BITS_IN_JSAMPLE == 8 */
diff --git a/3rdparty/libjpeg-turbo/src/jdatadst-tj.c b/3rdparty/libjpeg-turbo/src/jdatadst-tj.c
new file mode 100644
index 000000000000..cce263af747a
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jdatadst-tj.c
@@ -0,0 +1,198 @@
+/*
+ * jdatadst-tj.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009-2012 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2011, 2014, 2016, 2019, 2022-2023, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains compression data destination routines for the case of
+ * emitting JPEG data to memory or to a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different destination manager.
+ * IMPORTANT: we assume that fwrite() will correctly transcribe an array of
+ * JOCTETs into 8-bit-wide elements on external storage.  If char is wider
+ * than 8 bits on your machine, you may need to do some tweaking.
+ */
+
+/* this is not a core library module, so it doesn't define JPEG_INTERNALS */
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+void jpeg_mem_dest_tj(j_compress_ptr cinfo, unsigned char **outbuffer,
+                      size_t *outsize, boolean alloc);
+
+
+#define OUTPUT_BUF_SIZE  4096   /* choose an efficiently fwrite'able size */
+
+
+/* Expanded data destination object for memory output */
+
+typedef struct {
+  struct jpeg_destination_mgr pub; /* public fields */
+
+  unsigned char **outbuffer;    /* target buffer */
+  size_t *outsize;
+  unsigned char *newbuffer;     /* newly allocated buffer */
+  JOCTET *buffer;               /* start of buffer */
+  size_t bufsize;
+  boolean alloc;
+} my_mem_destination_mgr;
+
+typedef my_mem_destination_mgr *my_mem_dest_ptr;
+
+
+/*
+ * Initialize destination --- called by jpeg_start_compress
+ * before any data is actually written.
+ */
+
+METHODDEF(void)
+init_mem_destination(j_compress_ptr cinfo)
+{
+  /* no work necessary here */
+}
+
+
+/*
+ * Empty the output buffer --- called whenever buffer fills up.
+ *
+ * In typical applications, this should write the entire output buffer
+ * (ignoring the current state of next_output_byte & free_in_buffer),
+ * reset the pointer & count to the start of the buffer, and return TRUE
+ * indicating that the buffer has been dumped.
+ *
+ * In applications that need to be able to suspend compression due to output
+ * overrun, a FALSE return indicates that the buffer cannot be emptied now.
+ * In this situation, the compressor will return to its caller (possibly with
+ * an indication that it has not accepted all the supplied scanlines).  The
+ * application should resume compression after it has made more room in the
+ * output buffer.  Note that there are substantial restrictions on the use of
+ * suspension --- see the documentation.
+ *
+ * When suspending, the compressor will back up to a convenient restart point
+ * (typically the start of the current MCU). next_output_byte & free_in_buffer
+ * indicate where the restart point will be if the current call returns FALSE.
+ * Data beyond this point will be regenerated after resumption, so do not
+ * write it out when emptying the buffer externally.
+ */
+
+METHODDEF(boolean)
+empty_mem_output_buffer(j_compress_ptr cinfo)
+{
+  size_t nextsize;
+  JOCTET *nextbuffer;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
+
+  if (!dest->alloc) ERREXIT(cinfo, JERR_BUFFER_SIZE);
+
+  /* Try to allocate new buffer with double size */
+  nextsize = dest->bufsize * 2;
+  nextbuffer = (JOCTET *)malloc(nextsize);
+
+  if (nextbuffer == NULL)
+    ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+
+  memcpy(nextbuffer, dest->buffer, dest->bufsize);
+
+  free(dest->newbuffer);
+
+  dest->newbuffer = nextbuffer;
+
+  dest->pub.next_output_byte = nextbuffer + dest->bufsize;
+  dest->pub.free_in_buffer = dest->bufsize;
+
+  dest->buffer = nextbuffer;
+  dest->bufsize = nextsize;
+
+  return TRUE;
+}
+
+
+/*
+ * Terminate destination --- called by jpeg_finish_compress
+ * after all data has been written.  Usually needs to flush buffer.
+ *
+ * NB: *not* called by jpeg_abort or jpeg_destroy; surrounding
+ * application must deal with any cleanup that should happen even
+ * for error exit.
+ */
+
+METHODDEF(void)
+term_mem_destination(j_compress_ptr cinfo)
+{
+  my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
+
+  if (dest->alloc) *dest->outbuffer = dest->buffer;
+  *dest->outsize = dest->bufsize - dest->pub.free_in_buffer;
+}
+
+
+/*
+ * Prepare for output to a memory buffer.
+ * The caller may supply an own initial buffer with appropriate size.
+ * Otherwise, or when the actual data output exceeds the given size,
+ * the library adapts the buffer size as necessary.
+ * The standard library functions malloc/free are used for allocating
+ * larger memory, so the buffer is available to the application after
+ * finishing compression, and then the application is responsible for
+ * freeing the requested memory.
+ */
+
+GLOBAL(void)
+jpeg_mem_dest_tj(j_compress_ptr cinfo, unsigned char **outbuffer,
+                 size_t *outsize, boolean alloc)
+{
+  boolean reused = FALSE;
+  my_mem_dest_ptr dest;
+
+  if (outbuffer == NULL || outsize == NULL)     /* sanity check */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+
+  /* The destination object is made permanent so that multiple JPEG images
+   * can be written to the same buffer without re-executing jpeg_mem_dest.
+   */
+  if (cinfo->dest == NULL) {    /* first time for this JPEG object? */
+    cinfo->dest = (struct jpeg_destination_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                  sizeof(my_mem_destination_mgr));
+    dest = (my_mem_dest_ptr)cinfo->dest;
+    dest->newbuffer = NULL;
+    dest->buffer = NULL;
+  } else if (cinfo->dest->init_destination != init_mem_destination) {
+    /* It is unsafe to reuse the existing destination manager unless it was
+     * created by this function.
+     */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+  }
+
+  dest = (my_mem_dest_ptr)cinfo->dest;
+  dest->pub.init_destination = init_mem_destination;
+  dest->pub.empty_output_buffer = empty_mem_output_buffer;
+  dest->pub.term_destination = term_mem_destination;
+  if (dest->buffer == *outbuffer && *outbuffer != NULL && alloc)
+    reused = TRUE;
+  dest->outbuffer = outbuffer;
+  dest->outsize = outsize;
+  dest->alloc = alloc;
+
+  if (*outbuffer == NULL || *outsize == 0) {
+    if (alloc) {
+      /* Allocate initial buffer */
+      dest->newbuffer = *outbuffer = (unsigned char *)malloc(OUTPUT_BUF_SIZE);
+      if (dest->newbuffer == NULL)
+        ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+      *outsize = OUTPUT_BUF_SIZE;
+    } else
+      ERREXIT(cinfo, JERR_BUFFER_SIZE);
+  }
+
+  dest->pub.next_output_byte = dest->buffer = *outbuffer;
+  if (!reused)
+    dest->bufsize = *outsize;
+  dest->pub.free_in_buffer = dest->bufsize;
+}
diff --git a/3rdparty/libjpeg-turbo/src/jdatadst.c b/3rdparty/libjpeg-turbo/src/jdatadst.c
index 6b4fed233971..529f93b49045 100644
--- a/3rdparty/libjpeg-turbo/src/jdatadst.c
+++ b/3rdparty/libjpeg-turbo/src/jdatadst.c
@@ -38,7 +38,6 @@ typedef my_destination_mgr *my_dest_ptr;
 #define OUTPUT_BUF_SIZE  4096   /* choose an efficiently fwrite'able size */
 
 
-#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 /* Expanded data destination object for memory output */
 
 typedef struct {
@@ -52,7 +51,6 @@ typedef struct {
 } my_mem_destination_mgr;
 
 typedef my_mem_destination_mgr *my_mem_dest_ptr;
-#endif
 
 
 /*
@@ -74,13 +72,11 @@ init_destination(j_compress_ptr cinfo)
   dest->pub.free_in_buffer = OUTPUT_BUF_SIZE;
 }
 
-#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(void)
 init_mem_destination(j_compress_ptr cinfo)
 {
   /* no work necessary here */
 }
-#endif
 
 
 /*
@@ -121,7 +117,6 @@ empty_output_buffer(j_compress_ptr cinfo)
   return TRUE;
 }
 
-#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(boolean)
 empty_mem_output_buffer(j_compress_ptr cinfo)
 {
@@ -150,7 +145,6 @@ empty_mem_output_buffer(j_compress_ptr cinfo)
 
   return TRUE;
 }
-#endif
 
 
 /*
@@ -179,7 +173,6 @@ term_destination(j_compress_ptr cinfo)
     ERREXIT(cinfo, JERR_FILE_WRITE);
 }
 
-#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(void)
 term_mem_destination(j_compress_ptr cinfo)
 {
@@ -188,7 +181,6 @@ term_mem_destination(j_compress_ptr cinfo)
   *dest->outbuffer = dest->buffer;
   *dest->outsize = (unsigned long)(dest->bufsize - dest->pub.free_in_buffer);
 }
-#endif
 
 
 /*
@@ -227,7 +219,6 @@ jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile)
 }
 
 
-#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 /*
  * Prepare for output to a memory buffer.
  * The caller may supply an own initial buffer with appropriate size.
@@ -284,4 +275,3 @@ jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
   dest->pub.next_output_byte = dest->buffer = *outbuffer;
   dest->pub.free_in_buffer = dest->bufsize = *outsize;
 }
-#endif
diff --git a/3rdparty/libjpeg-turbo/src/jdatasrc-tj.c b/3rdparty/libjpeg-turbo/src/jdatasrc-tj.c
new file mode 100644
index 000000000000..a5970b53fe8f
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jdatasrc-tj.c
@@ -0,0 +1,194 @@
+/*
+ * jdatasrc-tj.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009-2011 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2011, 2016, 2019, 2023, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains decompression data source routines for the case of
+ * reading JPEG data from memory or from a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different source manager.
+ * IMPORTANT: we assume that fread() will correctly transcribe an array of
+ * JOCTETs from 8-bit-wide elements on external storage.  If char is wider
+ * than 8 bits on your machine, you may need to do some tweaking.
+ */
+
+/* this is not a core library module, so it doesn't define JPEG_INTERNALS */
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+void jpeg_mem_src_tj(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+                     size_t insize);
+
+
+/*
+ * Initialize source --- called by jpeg_read_header
+ * before any data is actually read.
+ */
+
+METHODDEF(void)
+init_mem_source(j_decompress_ptr cinfo)
+{
+  /* no work necessary here */
+}
+
+
+/*
+ * Fill the input buffer --- called whenever buffer is emptied.
+ *
+ * In typical applications, this should read fresh data into the buffer
+ * (ignoring the current state of next_input_byte & bytes_in_buffer),
+ * reset the pointer & count to the start of the buffer, and return TRUE
+ * indicating that the buffer has been reloaded.  It is not necessary to
+ * fill the buffer entirely, only to obtain at least one more byte.
+ *
+ * There is no such thing as an EOF return.  If the end of the file has been
+ * reached, the routine has a choice of ERREXIT() or inserting fake data into
+ * the buffer.  In most cases, generating a warning message and inserting a
+ * fake EOI marker is the best course of action --- this will allow the
+ * decompressor to output however much of the image is there.  However,
+ * the resulting error message is misleading if the real problem is an empty
+ * input file, so we handle that case specially.
+ *
+ * In applications that need to be able to suspend compression due to input
+ * not being available yet, a FALSE return indicates that no more data can be
+ * obtained right now, but more may be forthcoming later.  In this situation,
+ * the decompressor will return to its caller (with an indication of the
+ * number of scanlines it has read, if any).  The application should resume
+ * decompression after it has loaded more data into the input buffer.  Note
+ * that there are substantial restrictions on the use of suspension --- see
+ * the documentation.
+ *
+ * When suspending, the decompressor will back up to a convenient restart point
+ * (typically the start of the current MCU). next_input_byte & bytes_in_buffer
+ * indicate where the restart point will be if the current call returns FALSE.
+ * Data beyond this point must be rescanned after resumption, so move it to
+ * the front of the buffer rather than discarding it.
+ */
+
+METHODDEF(boolean)
+fill_mem_input_buffer(j_decompress_ptr cinfo)
+{
+  static const JOCTET mybuffer[4] = {
+    (JOCTET)0xFF, (JOCTET)JPEG_EOI, 0, 0
+  };
+
+  /* The whole JPEG data is expected to reside in the supplied memory
+   * buffer, so any request for more data beyond the given buffer size
+   * is treated as an error.
+   */
+  WARNMS(cinfo, JWRN_JPEG_EOF);
+
+  /* Insert a fake EOI marker */
+
+  cinfo->src->next_input_byte = mybuffer;
+  cinfo->src->bytes_in_buffer = 2;
+
+  return TRUE;
+}
+
+
+/*
+ * Skip data --- used to skip over a potentially large amount of
+ * uninteresting data (such as an APPn marker).
+ *
+ * Writers of suspendable-input applications must note that skip_input_data
+ * is not granted the right to give a suspension return.  If the skip extends
+ * beyond the data currently in the buffer, the buffer can be marked empty so
+ * that the next read will cause a fill_input_buffer call that can suspend.
+ * Arranging for additional bytes to be discarded before reloading the input
+ * buffer is the application writer's problem.
+ */
+
+METHODDEF(void)
+skip_input_data(j_decompress_ptr cinfo, long num_bytes)
+{
+  struct jpeg_source_mgr *src = cinfo->src;
+
+  /* Just a dumb implementation for now.  Could use fseek() except
+   * it doesn't work on pipes.  Not clear that being smart is worth
+   * any trouble anyway --- large skips are infrequent.
+   */
+  if (num_bytes > 0) {
+    while (num_bytes > (long)src->bytes_in_buffer) {
+      num_bytes -= (long)src->bytes_in_buffer;
+      (void)(*src->fill_input_buffer) (cinfo);
+      /* note we assume that fill_input_buffer will never return FALSE,
+       * so suspension need not be handled.
+       */
+    }
+    src->next_input_byte += (size_t)num_bytes;
+    src->bytes_in_buffer -= (size_t)num_bytes;
+  }
+}
+
+
+/*
+ * An additional method that can be provided by data source modules is the
+ * resync_to_restart method for error recovery in the presence of RST markers.
+ * For the moment, this source module just uses the default resync method
+ * provided by the JPEG library.  That method assumes that no backtracking
+ * is possible.
+ */
+
+
+/*
+ * Terminate source --- called by jpeg_finish_decompress
+ * after all data has been read.  Often a no-op.
+ *
+ * NB: *not* called by jpeg_abort or jpeg_destroy; surrounding
+ * application must deal with any cleanup that should happen even
+ * for error exit.
+ */
+
+METHODDEF(void)
+term_source(j_decompress_ptr cinfo)
+{
+  /* no work necessary here */
+}
+
+
+/*
+ * Prepare for input from a supplied memory buffer.
+ * The buffer must contain the whole JPEG data.
+ */
+
+GLOBAL(void)
+jpeg_mem_src_tj(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+                size_t insize)
+{
+  struct jpeg_source_mgr *src;
+
+  if (inbuffer == NULL || insize == 0)  /* Treat empty input as fatal error */
+    ERREXIT(cinfo, JERR_INPUT_EMPTY);
+
+  /* The source object is made permanent so that a series of JPEG images
+   * can be read from the same buffer by calling jpeg_mem_src only before
+   * the first one.
+   */
+  if (cinfo->src == NULL) {     /* first time for this JPEG object? */
+    cinfo->src = (struct jpeg_source_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                  sizeof(struct jpeg_source_mgr));
+  } else if (cinfo->src->init_source != init_mem_source) {
+    /* It is unsafe to reuse the existing source manager unless it was created
+     * by this function.
+     */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+  }
+
+  src = cinfo->src;
+  src->init_source = init_mem_source;
+  src->fill_input_buffer = fill_mem_input_buffer;
+  src->skip_input_data = skip_input_data;
+  src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
+  src->term_source = term_source;
+  src->bytes_in_buffer = insize;
+  src->next_input_byte = (const JOCTET *)inbuffer;
+}
diff --git a/3rdparty/libjpeg-turbo/src/jdatasrc.c b/3rdparty/libjpeg-turbo/src/jdatasrc.c
index e36a30d89449..dc135f43a470 100644
--- a/3rdparty/libjpeg-turbo/src/jdatasrc.c
+++ b/3rdparty/libjpeg-turbo/src/jdatasrc.c
@@ -56,13 +56,11 @@ init_source(j_decompress_ptr cinfo)
   src->start_of_file = TRUE;
 }
 
-#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(void)
 init_mem_source(j_decompress_ptr cinfo)
 {
   /* no work necessary here */
 }
-#endif
 
 
 /*
@@ -123,7 +121,6 @@ fill_input_buffer(j_decompress_ptr cinfo)
   return TRUE;
 }
 
-#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(boolean)
 fill_mem_input_buffer(j_decompress_ptr cinfo)
 {
@@ -144,7 +141,6 @@ fill_mem_input_buffer(j_decompress_ptr cinfo)
 
   return TRUE;
 }
-#endif
 
 
 /*
@@ -253,7 +249,6 @@ jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile)
 }
 
 
-#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 /*
  * Prepare for input from a supplied memory buffer.
  * The buffer must contain the whole JPEG data.
@@ -292,4 +287,3 @@ jpeg_mem_src(j_decompress_ptr cinfo, const unsigned char *inbuffer,
   src->bytes_in_buffer = (size_t)insize;
   src->next_input_byte = (const JOCTET *)inbuffer;
 }
-#endif
diff --git a/3rdparty/libjpeg-turbo/src/jdcoefct.c b/3rdparty/libjpeg-turbo/src/jdcoefct.c
index 15e6cded628e..40ce27259ba1 100644
--- a/3rdparty/libjpeg-turbo/src/jdcoefct.c
+++ b/3rdparty/libjpeg-turbo/src/jdcoefct.c
@@ -5,13 +5,13 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015-2016, 2019-2020, D. R. Commander.
+ * Copyright (C) 2010, 2015-2016, 2019-2020, 2022-2023, D. R. Commander.
  * Copyright (C) 2015, 2020, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
  * This file contains the coefficient buffer controller for decompression.
- * This controller is the top level of the JPEG decompressor proper.
+ * This controller is the top level of the lossy JPEG decompressor proper.
  * The coefficient buffer lies between entropy decoding and inverse-DCT steps.
  *
  * In buffered-image mode, this controller is the interface between
@@ -21,19 +21,20 @@
 
 #include "jinclude.h"
 #include "jdcoefct.h"
-#include "jpegcomp.h"
+#include "jpegapicomp.h"
+#include "jsamplecomp.h"
 
 
 /* Forward declarations */
 METHODDEF(int) decompress_onepass(j_decompress_ptr cinfo,
-                                  JSAMPIMAGE output_buf);
+                                  _JSAMPIMAGE output_buf);
 #ifdef D_MULTISCAN_FILES_SUPPORTED
-METHODDEF(int) decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+METHODDEF(int) decompress_data(j_decompress_ptr cinfo, _JSAMPIMAGE output_buf);
 #endif
 #ifdef BLOCK_SMOOTHING_SUPPORTED
 LOCAL(boolean) smoothing_ok(j_decompress_ptr cinfo);
 METHODDEF(int) decompress_smooth_data(j_decompress_ptr cinfo,
-                                      JSAMPIMAGE output_buf);
+                                      _JSAMPIMAGE output_buf);
 #endif
 
 
@@ -62,9 +63,9 @@ start_output_pass(j_decompress_ptr cinfo)
   /* If multipass, check to see whether to use block smoothing on this pass */
   if (coef->pub.coef_arrays != NULL) {
     if (cinfo->do_block_smoothing && smoothing_ok(cinfo))
-      coef->pub.decompress_data = decompress_smooth_data;
+      coef->pub._decompress_data = decompress_smooth_data;
     else
-      coef->pub.decompress_data = decompress_data;
+      coef->pub._decompress_data = decompress_data;
   }
 #endif
   cinfo->output_iMCU_row = 0;
@@ -82,17 +83,17 @@ start_output_pass(j_decompress_ptr cinfo)
  */
 
 METHODDEF(int)
-decompress_onepass(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_onepass(j_decompress_ptr cinfo, _JSAMPIMAGE output_buf)
 {
   my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
   int blkn, ci, xindex, yindex, yoffset, useful_width;
-  JSAMPARRAY output_ptr;
+  _JSAMPARRAY output_ptr;
   JDIMENSION start_col, output_col;
   jpeg_component_info *compptr;
-  inverse_DCT_method_ptr inverse_DCT;
+  _inverse_DCT_method_ptr inverse_DCT;
 
   /* Loop to process as much as one whole iMCU row */
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
@@ -129,7 +130,7 @@ decompress_onepass(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
             blkn += compptr->MCU_blocks;
             continue;
           }
-          inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
+          inverse_DCT = cinfo->idct->_inverse_DCT[compptr->component_index];
           useful_width = (MCU_col_num < last_MCU_col) ?
                          compptr->MCU_width : compptr->last_col_width;
           output_ptr = output_buf[compptr->component_index] +
@@ -262,7 +263,7 @@ consume_data(j_decompress_ptr cinfo)
  */
 
 METHODDEF(int)
-decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_data(j_decompress_ptr cinfo, _JSAMPIMAGE output_buf)
 {
   my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -270,10 +271,10 @@ decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
   int ci, block_row, block_rows;
   JBLOCKARRAY buffer;
   JBLOCKROW buffer_ptr;
-  JSAMPARRAY output_ptr;
+  _JSAMPARRAY output_ptr;
   JDIMENSION output_col;
   jpeg_component_info *compptr;
-  inverse_DCT_method_ptr inverse_DCT;
+  _inverse_DCT_method_ptr inverse_DCT;
 
   /* Force some input to be done if we are getting ahead of the input. */
   while (cinfo->input_scan_number < cinfo->output_scan_number ||
@@ -302,7 +303,7 @@ decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
       block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
       if (block_rows == 0) block_rows = compptr->v_samp_factor;
     }
-    inverse_DCT = cinfo->idct->inverse_DCT[ci];
+    inverse_DCT = cinfo->idct->_inverse_DCT[ci];
     output_ptr = output_buf[ci];
     /* Loop over all DCT blocks to be processed. */
     for (block_row = 0; block_row < block_rows; block_row++) {
@@ -425,19 +426,20 @@ smoothing_ok(j_decompress_ptr cinfo)
  */
 
 METHODDEF(int)
-decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_smooth_data(j_decompress_ptr cinfo, _JSAMPIMAGE output_buf)
 {
   my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
   JDIMENSION block_num, last_block_column;
-  int ci, block_row, block_rows, access_rows;
+  int ci, block_row, block_rows, access_rows, image_block_row,
+    image_block_rows;
   JBLOCKARRAY buffer;
   JBLOCKROW buffer_ptr, prev_prev_block_row, prev_block_row;
   JBLOCKROW next_block_row, next_next_block_row;
-  JSAMPARRAY output_ptr;
+  _JSAMPARRAY output_ptr;
   JDIMENSION output_col;
   jpeg_component_info *compptr;
-  inverse_DCT_method_ptr inverse_DCT;
+  _inverse_DCT_method_ptr inverse_DCT;
   boolean change_dc;
   JCOEF *workspace;
   int *coef_bits;
@@ -475,7 +477,7 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     if (!compptr->component_needed)
       continue;
     /* Count non-dummy DCT block rows in this iMCU row. */
-    if (cinfo->output_iMCU_row < last_iMCU_row - 1) {
+    if (cinfo->output_iMCU_row + 1 < last_iMCU_row) {
       block_rows = compptr->v_samp_factor;
       access_rows = block_rows * 3; /* this and next two iMCU rows */
     } else if (cinfo->output_iMCU_row < last_iMCU_row) {
@@ -496,6 +498,7 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
          (JDIMENSION)access_rows, FALSE);
       buffer += 2 * compptr->v_samp_factor; /* point to current iMCU row */
     } else if (cinfo->output_iMCU_row > 0) {
+      access_rows += compptr->v_samp_factor; /* prior iMCU row too */
       buffer = (*cinfo->mem->access_virt_barray)
         ((j_common_ptr)cinfo, coef->whole_image[ci],
          (cinfo->output_iMCU_row - 1) * compptr->v_samp_factor,
@@ -535,32 +538,33 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
       Q21 = quanttbl->quantval[Q21_POS];
       Q30 = quanttbl->quantval[Q30_POS];
     }
-    inverse_DCT = cinfo->idct->inverse_DCT[ci];
+    inverse_DCT = cinfo->idct->_inverse_DCT[ci];
     output_ptr = output_buf[ci];
     /* Loop over all DCT blocks to be processed. */
+    image_block_rows = block_rows * cinfo->total_iMCU_rows;
     for (block_row = 0; block_row < block_rows; block_row++) {
+      image_block_row = cinfo->output_iMCU_row * block_rows + block_row;
       buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci];
 
-      if (block_row > 0 || cinfo->output_iMCU_row > 0)
+      if (image_block_row > 0)
         prev_block_row =
           buffer[block_row - 1] + cinfo->master->first_MCU_col[ci];
       else
         prev_block_row = buffer_ptr;
 
-      if (block_row > 1 || cinfo->output_iMCU_row > 1)
+      if (image_block_row > 1)
         prev_prev_block_row =
           buffer[block_row - 2] + cinfo->master->first_MCU_col[ci];
       else
         prev_prev_block_row = prev_block_row;
 
-      if (block_row < block_rows - 1 || cinfo->output_iMCU_row < last_iMCU_row)
+      if (image_block_row < image_block_rows - 1)
         next_block_row =
           buffer[block_row + 1] + cinfo->master->first_MCU_col[ci];
       else
         next_block_row = buffer_ptr;
 
-      if (block_row < block_rows - 2 ||
-          cinfo->output_iMCU_row < last_iMCU_row - 1)
+      if (image_block_row < image_block_rows - 2)
         next_next_block_row =
           buffer[block_row + 2] + cinfo->master->first_MCU_col[ci];
       else
@@ -583,11 +587,11 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         /* Update DC values */
         if (block_num == cinfo->master->first_MCU_col[ci] &&
             block_num < last_block_column) {
-          DC04 = (int)prev_prev_block_row[1][0];
-          DC09 = (int)prev_block_row[1][0];
-          DC14 = (int)buffer_ptr[1][0];
-          DC19 = (int)next_block_row[1][0];
-          DC24 = (int)next_next_block_row[1][0];
+          DC04 = DC05 = (int)prev_prev_block_row[1][0];
+          DC09 = DC10 = (int)prev_block_row[1][0];
+          DC14 = DC15 = (int)buffer_ptr[1][0];
+          DC19 = DC20 = (int)next_block_row[1][0];
+          DC24 = DC25 = (int)next_next_block_row[1][0];
         }
         if (block_num + 1 < last_block_column) {
           DC05 = (int)prev_prev_block_row[2][0];
@@ -810,10 +814,13 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
  */
 
 GLOBAL(void)
-jinit_d_coef_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
+_jinit_d_coef_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 {
   my_coef_ptr coef;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   coef = (my_coef_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_coef_controller));
@@ -850,7 +857,7 @@ jinit_d_coef_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
          (JDIMENSION)access_rows);
     }
     coef->pub.consume_data = consume_data;
-    coef->pub.decompress_data = decompress_data;
+    coef->pub._decompress_data = decompress_data;
     coef->pub.coef_arrays = coef->whole_image; /* link to virtual arrays */
 #else
     ERREXIT(cinfo, JERR_NOT_COMPILED);
@@ -867,7 +874,7 @@ jinit_d_coef_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
       coef->MCU_buffer[i] = buffer + i;
     }
     coef->pub.consume_data = dummy_consume_data;
-    coef->pub.decompress_data = decompress_onepass;
+    coef->pub._decompress_data = decompress_onepass;
     coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */
   }
 
diff --git a/3rdparty/libjpeg-turbo/src/jdcoefct.h b/3rdparty/libjpeg-turbo/src/jdcoefct.h
index 9a0e78066364..bbe9e970515c 100644
--- a/3rdparty/libjpeg-turbo/src/jdcoefct.h
+++ b/3rdparty/libjpeg-turbo/src/jdcoefct.h
@@ -6,6 +6,7 @@
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2020, Google, Inc.
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  */
@@ -14,6 +15,8 @@
 #include "jpeglib.h"
 
 
+#if BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED)
+
 /* Block smoothing is only applicable for progressive JPEG, so: */
 #ifndef D_PROGRESSIVE_SUPPORTED
 #undef BLOCK_SMOOTHING_SUPPORTED
@@ -81,3 +84,5 @@ start_iMCU_row(j_decompress_ptr cinfo)
   coef->MCU_ctr = 0;
   coef->MCU_vert_offset = 0;
 }
+
+#endif /* BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED) */
diff --git a/3rdparty/libjpeg-turbo/src/jdcol565.c b/3rdparty/libjpeg-turbo/src/jdcol565.c
index 53c7bd9187d4..2172d98fdaa4 100644
--- a/3rdparty/libjpeg-turbo/src/jdcol565.c
+++ b/3rdparty/libjpeg-turbo/src/jdcol565.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modifications:
  * Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014-2015, D. R. Commander.
+ * Copyright (C) 2014-2015, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -17,18 +17,19 @@
 
 INLINE
 LOCAL(void)
-ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                            JDIMENSION input_row, JSAMPARRAY output_buf,
+ycc_rgb565_convert_internal(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                            JDIMENSION input_row, _JSAMPARRAY output_buf,
                             int num_rows)
 {
+#if BITS_IN_JSAMPLE != 16
   my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int y, cb, cr;
-  register JSAMPROW outptr;
-  register JSAMPROW inptr0, inptr1, inptr2;
+  register _JSAMPROW outptr;
+  register _JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
-  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   register int *Crrtab = cconvert->Cr_r_tab;
   register int *Cbbtab = cconvert->Cb_b_tab;
   register JLONG *Crgtab = cconvert->Cr_g_tab;
@@ -91,23 +92,27 @@ ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       *(INT16 *)outptr = (INT16)rgb;
     }
   }
+#else
+  ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+#endif
 }
 
 
 INLINE
 LOCAL(void)
-ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                             JDIMENSION input_row, JSAMPARRAY output_buf,
+ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                             JDIMENSION input_row, _JSAMPARRAY output_buf,
                              int num_rows)
 {
+#if BITS_IN_JSAMPLE != 16
   my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int y, cb, cr;
-  register JSAMPROW outptr;
-  register JSAMPROW inptr0, inptr1, inptr2;
+  register _JSAMPROW outptr;
+  register _JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
-  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   register int *Crrtab = cconvert->Cr_r_tab;
   register int *Cbbtab = cconvert->Cb_b_tab;
   register JLONG *Crgtab = cconvert->Cr_g_tab;
@@ -177,17 +182,20 @@ ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       *(INT16 *)outptr = (INT16)rgb;
     }
   }
+#else
+  ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+#endif
 }
 
 
 INLINE
 LOCAL(void)
-rgb_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                            JDIMENSION input_row, JSAMPARRAY output_buf,
+rgb_rgb565_convert_internal(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                            JDIMENSION input_row, _JSAMPARRAY output_buf,
                             int num_rows)
 {
-  register JSAMPROW outptr;
-  register JSAMPROW inptr0, inptr1, inptr2;
+  register _JSAMPROW outptr;
+  register _JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   SHIFT_TEMPS
@@ -237,14 +245,14 @@ rgb_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 INLINE
 LOCAL(void)
-rgb_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                             JDIMENSION input_row, JSAMPARRAY output_buf,
+rgb_rgb565D_convert_internal(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                             JDIMENSION input_row, _JSAMPARRAY output_buf,
                              int num_rows)
 {
-  register JSAMPROW outptr;
-  register JSAMPROW inptr0, inptr1, inptr2;
+  register _JSAMPROW outptr;
+  register _JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
-  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   JDIMENSION num_cols = cinfo->output_width;
   JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   SHIFT_TEMPS
@@ -296,11 +304,11 @@ rgb_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 INLINE
 LOCAL(void)
-gray_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                             JDIMENSION input_row, JSAMPARRAY output_buf,
+gray_rgb565_convert_internal(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                             JDIMENSION input_row, _JSAMPARRAY output_buf,
                              int num_rows)
 {
-  register JSAMPROW inptr, outptr;
+  register _JSAMPROW inptr, outptr;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
 
@@ -336,13 +344,13 @@ gray_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 INLINE
 LOCAL(void)
-gray_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                              JDIMENSION input_row, JSAMPARRAY output_buf,
+gray_rgb565D_convert_internal(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                              JDIMENSION input_row, _JSAMPARRAY output_buf,
                               int num_rows)
 {
-  register JSAMPROW inptr, outptr;
+  register _JSAMPROW inptr, outptr;
   register JDIMENSION col;
-  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   JDIMENSION num_cols = cinfo->output_width;
   JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
 
diff --git a/3rdparty/libjpeg-turbo/src/jdcolext.c b/3rdparty/libjpeg-turbo/src/jdcolext.c
index 863c7a2fbc76..f22e29d7224e 100644
--- a/3rdparty/libjpeg-turbo/src/jdcolext.c
+++ b/3rdparty/libjpeg-turbo/src/jdcolext.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2011, 2015, D. R. Commander.
+ * Copyright (C) 2009, 2011, 2015, 2022-2023, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -28,18 +28,19 @@
 
 INLINE
 LOCAL(void)
-ycc_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                         JDIMENSION input_row, JSAMPARRAY output_buf,
+ycc_rgb_convert_internal(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, _JSAMPARRAY output_buf,
                          int num_rows)
 {
+#if BITS_IN_JSAMPLE != 16
   my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int y, cb, cr;
-  register JSAMPROW outptr;
-  register JSAMPROW inptr0, inptr1, inptr2;
+  register _JSAMPROW outptr;
+  register _JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
-  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   register int *Crrtab = cconvert->Cr_r_tab;
   register int *Cbbtab = cconvert->Cb_b_tab;
   register JLONG *Crgtab = cconvert->Cr_g_tab;
@@ -62,14 +63,17 @@ ycc_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                               ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                                 SCALEBITS))];
       outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
-      /* Set unused byte to 0xFF so it can be interpreted as an opaque */
-      /* alpha channel value */
+      /* Set unused byte to _MAXJSAMPLE so it can be interpreted as an */
+      /* opaque alpha channel value */
 #ifdef RGB_ALPHA
-      outptr[RGB_ALPHA] = 0xFF;
+      outptr[RGB_ALPHA] = _MAXJSAMPLE;
 #endif
       outptr += RGB_PIXELSIZE;
     }
   }
+#else
+  ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+#endif
 }
 
 
@@ -81,11 +85,11 @@ ycc_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 INLINE
 LOCAL(void)
-gray_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                          JDIMENSION input_row, JSAMPARRAY output_buf,
+gray_rgb_convert_internal(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                          JDIMENSION input_row, _JSAMPARRAY output_buf,
                           int num_rows)
 {
-  register JSAMPROW inptr, outptr;
+  register _JSAMPROW inptr, outptr;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
 
@@ -94,10 +98,10 @@ gray_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
       outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
-      /* Set unused byte to 0xFF so it can be interpreted as an opaque */
-      /* alpha channel value */
+      /* Set unused byte to _MAXJSAMPLE so it can be interpreted as an */
+      /* opaque alpha channel value */
 #ifdef RGB_ALPHA
-      outptr[RGB_ALPHA] = 0xFF;
+      outptr[RGB_ALPHA] = _MAXJSAMPLE;
 #endif
       outptr += RGB_PIXELSIZE;
     }
@@ -111,12 +115,12 @@ gray_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 INLINE
 LOCAL(void)
-rgb_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                         JDIMENSION input_row, JSAMPARRAY output_buf,
+rgb_rgb_convert_internal(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, _JSAMPARRAY output_buf,
                          int num_rows)
 {
-  register JSAMPROW inptr0, inptr1, inptr2;
-  register JSAMPROW outptr;
+  register _JSAMPROW inptr0, inptr1, inptr2;
+  register _JSAMPROW outptr;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
 
@@ -130,10 +134,10 @@ rgb_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       outptr[RGB_RED] = inptr0[col];
       outptr[RGB_GREEN] = inptr1[col];
       outptr[RGB_BLUE] = inptr2[col];
-      /* Set unused byte to 0xFF so it can be interpreted as an opaque */
-      /* alpha channel value */
+      /* Set unused byte to _MAXJSAMPLE so it can be interpreted as an */
+      /* opaque alpha channel value */
 #ifdef RGB_ALPHA
-      outptr[RGB_ALPHA] = 0xFF;
+      outptr[RGB_ALPHA] = _MAXJSAMPLE;
 #endif
       outptr += RGB_PIXELSIZE;
     }
diff --git a/3rdparty/libjpeg-turbo/src/jdcolor.c b/3rdparty/libjpeg-turbo/src/jdcolor.c
index 8da2b4eaf2e9..e5c7b58ebfad 100644
--- a/3rdparty/libjpeg-turbo/src/jdcolor.c
+++ b/3rdparty/libjpeg-turbo/src/jdcolor.c
@@ -6,7 +6,7 @@
  * Modified 2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009, 2011-2012, 2014-2015, D. R. Commander.
+ * Copyright (C) 2009, 2011-2012, 2014-2015, 2022, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -18,14 +18,17 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jsimd.h"
-#include "jconfigint.h"
+#include "jsamplecomp.h"
 
 
+#if BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED)
+
 /* Private subobject */
 
 typedef struct {
   struct jpeg_color_deconverter pub; /* public fields */
 
+#if BITS_IN_JSAMPLE != 16
   /* Private state for YCC->RGB conversion */
   int *Cr_r_tab;                /* => table for Cr to R conversion */
   int *Cb_b_tab;                /* => table for Cb to B conversion */
@@ -34,6 +37,7 @@ typedef struct {
 
   /* Private state for RGB->Y conversion */
   JLONG *rgb_y_tab;             /* => table for RGB to Y conversion */
+#endif
 } my_color_deconverter;
 
 typedef my_color_deconverter *my_cconvert_ptr;
@@ -44,7 +48,7 @@ typedef my_color_deconverter *my_cconvert_ptr;
 
 /*
  * YCbCr is defined per CCIR 601-1, except that Cb and Cr are
- * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5.
+ * normalized to the range 0.._MAXJSAMPLE rather than -0.5 .. 0.5.
  * The conversion equations to be implemented are therefore
  *
  *      R = Y                + 1.40200 * Cr
@@ -53,7 +57,7 @@ typedef my_color_deconverter *my_cconvert_ptr;
  *
  *      Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
  *
- * where Cb and Cr represent the incoming values less CENTERJSAMPLE.
+ * where Cb and Cr represent the incoming values less _CENTERJSAMPLE.
  * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.)
  *
  * To avoid floating-point arithmetic, we represent the fractional constants
@@ -64,7 +68,7 @@ typedef my_color_deconverter *my_cconvert_ptr;
  *
  * For even more speed, we avoid doing any multiplications in the inner loop
  * by precalculating the constants times Cb and Cr for all possible values.
- * For 8-bit JSAMPLEs this is very reasonable (only 256 entries per table);
+ * For 8-bit samples this is very reasonable (only 256 entries per table);
  * for 12-bit samples it is still acceptable.  It's not very reasonable for
  * 16-bit samples, but if you want lossless storage you shouldn't be changing
  * colorspace anyway.
@@ -85,9 +89,9 @@ typedef my_color_deconverter *my_cconvert_ptr;
  */
 
 #define R_Y_OFF         0                       /* offset to R => Y section */
-#define G_Y_OFF         (1 * (MAXJSAMPLE + 1))  /* offset to G => Y section */
-#define B_Y_OFF         (2 * (MAXJSAMPLE + 1))  /* etc. */
-#define TABLE_SIZE      (3 * (MAXJSAMPLE + 1))
+#define G_Y_OFF         (1 * (_MAXJSAMPLE + 1)) /* offset to G => Y section */
+#define B_Y_OFF         (2 * (_MAXJSAMPLE + 1)) /* etc. */
+#define TABLE_SIZE      (3 * (_MAXJSAMPLE + 1))
 
 
 /* Include inline routines for colorspace extensions */
@@ -210,6 +214,7 @@ typedef my_color_deconverter *my_cconvert_ptr;
 LOCAL(void)
 build_ycc_rgb_table(j_decompress_ptr cinfo)
 {
+#if BITS_IN_JSAMPLE != 16
   my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   int i;
   JLONG x;
@@ -217,20 +222,20 @@ build_ycc_rgb_table(j_decompress_ptr cinfo)
 
   cconvert->Cr_r_tab = (int *)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE + 1) * sizeof(int));
+                                (_MAXJSAMPLE + 1) * sizeof(int));
   cconvert->Cb_b_tab = (int *)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE + 1) * sizeof(int));
+                                (_MAXJSAMPLE + 1) * sizeof(int));
   cconvert->Cr_g_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE + 1) * sizeof(JLONG));
+                                (_MAXJSAMPLE + 1) * sizeof(JLONG));
   cconvert->Cb_g_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE + 1) * sizeof(JLONG));
+                                (_MAXJSAMPLE + 1) * sizeof(JLONG));
 
-  for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
-    /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
-    /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */
+  for (i = 0, x = -_CENTERJSAMPLE; i <= _MAXJSAMPLE; i++, x++) {
+    /* i is the actual input pixel value, in the range 0.._MAXJSAMPLE */
+    /* The Cb or Cr value we are thinking of is x = i - _CENTERJSAMPLE */
     /* Cr=>R value is nearest int to 1.40200 * x */
     cconvert->Cr_r_tab[i] = (int)
                     RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
@@ -243,6 +248,9 @@ build_ycc_rgb_table(j_decompress_ptr cinfo)
     /* We also add in ONE_HALF so that need not do it in inner loop */
     cconvert->Cb_g_tab[i] = (-FIX(0.34414)) * x + ONE_HALF;
   }
+#else
+  ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+#endif
 }
 
 
@@ -251,8 +259,8 @@ build_ycc_rgb_table(j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+ycc_rgb_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
@@ -301,6 +309,7 @@ ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 LOCAL(void)
 build_rgb_y_table(j_decompress_ptr cinfo)
 {
+#if BITS_IN_JSAMPLE != 16
   my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   JLONG *rgb_y_tab;
   JLONG i;
@@ -310,11 +319,14 @@ build_rgb_y_table(j_decompress_ptr cinfo)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 (TABLE_SIZE * sizeof(JLONG)));
 
-  for (i = 0; i <= MAXJSAMPLE; i++) {
+  for (i = 0; i <= _MAXJSAMPLE; i++) {
     rgb_y_tab[i + R_Y_OFF] = FIX(0.29900) * i;
     rgb_y_tab[i + G_Y_OFF] = FIX(0.58700) * i;
     rgb_y_tab[i + B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
   }
+#else
+  ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+#endif
 }
 
 
@@ -323,14 +335,15 @@ build_rgb_y_table(j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-rgb_gray_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                 JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+rgb_gray_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                 JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
+#if BITS_IN_JSAMPLE != 16
   my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int r, g, b;
   register JLONG *ctab = cconvert->rgb_y_tab;
-  register JSAMPROW outptr;
-  register JSAMPROW inptr0, inptr1, inptr2;
+  register _JSAMPROW outptr;
+  register _JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
 
@@ -345,10 +358,13 @@ rgb_gray_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       g = inptr1[col];
       b = inptr2[col];
       /* Y */
-      outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
-                               ctab[b + B_Y_OFF]) >> SCALEBITS);
+      outptr[col] = (_JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+                                ctab[b + B_Y_OFF]) >> SCALEBITS);
     }
   }
+#else
+  ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+#endif
 }
 
 
@@ -358,10 +374,10 @@ rgb_gray_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
  */
 
 METHODDEF(void)
-null_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-             JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+null_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+             JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
-  register JSAMPROW inptr, inptr0, inptr1, inptr2, inptr3, outptr;
+  register _JSAMPROW inptr, inptr0, inptr1, inptr2, inptr3, outptr;
   register JDIMENSION col;
   register int num_components = cinfo->num_components;
   JDIMENSION num_cols = cinfo->output_width;
@@ -419,11 +435,11 @@ null_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
  */
 
 METHODDEF(void)
-grayscale_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                  JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+grayscale_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                  JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
-  jcopy_sample_rows(input_buf[0], (int)input_row, output_buf, 0, num_rows,
-                    cinfo->output_width);
+  _jcopy_sample_rows(input_buf[0], (int)input_row, output_buf, 0, num_rows,
+                     cinfo->output_width);
 }
 
 
@@ -432,8 +448,8 @@ grayscale_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
  */
 
 METHODDEF(void)
-gray_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                 JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+gray_rgb_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                 JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
@@ -477,8 +493,8 @@ gray_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
  */
 
 METHODDEF(void)
-rgb_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+rgb_rgb_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
@@ -525,17 +541,18 @@ rgb_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
  */
 
 METHODDEF(void)
-ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                  JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+ycck_cmyk_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                  JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
+#if BITS_IN_JSAMPLE != 16
   my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int y, cb, cr;
-  register JSAMPROW outptr;
-  register JSAMPROW inptr0, inptr1, inptr2, inptr3;
+  register _JSAMPROW outptr;
+  register _JSAMPROW inptr0, inptr1, inptr2, inptr3;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
-  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   register int *Crrtab = cconvert->Cr_r_tab;
   register int *Cbbtab = cconvert->Cb_b_tab;
   register JLONG *Crgtab = cconvert->Cr_g_tab;
@@ -554,16 +571,19 @@ ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       cb = inptr1[col];
       cr = inptr2[col];
       /* Range-limiting is essential due to noise introduced by DCT losses. */
-      outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];   /* red */
-      outptr[1] = range_limit[MAXJSAMPLE - (y +                 /* green */
+      outptr[0] = range_limit[_MAXJSAMPLE - (y + Crrtab[cr])];  /* red */
+      outptr[1] = range_limit[_MAXJSAMPLE - (y +                /* green */
                               ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                                  SCALEBITS)))];
-      outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];   /* blue */
+      outptr[2] = range_limit[_MAXJSAMPLE - (y + Cbbtab[cb])];  /* blue */
       /* K passes through unchanged */
       outptr[3] = inptr3[col];
       outptr += 4;
     }
   }
+#else
+  ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+#endif
 }
 
 
@@ -653,8 +673,8 @@ static INLINE boolean is_big_endian(void)
 
 
 METHODDEF(void)
-ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                   JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                   JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     ycc_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -664,8 +684,8 @@ ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 
 METHODDEF(void)
-ycc_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                    JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565D_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                    JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     ycc_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -675,8 +695,8 @@ ycc_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 
 METHODDEF(void)
-rgb_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                   JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                   JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     rgb_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -686,8 +706,8 @@ rgb_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 
 METHODDEF(void)
-rgb_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                    JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565D_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                    JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     rgb_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -697,8 +717,8 @@ rgb_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 
 METHODDEF(void)
-gray_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                    JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+gray_rgb565_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                    JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     gray_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -708,8 +728,8 @@ gray_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 
 METHODDEF(void)
-gray_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                     JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+gray_rgb565D_convert(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                     JDIMENSION input_row, _JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     gray_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -734,11 +754,14 @@ start_pass_dcolor(j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_color_deconverter(j_decompress_ptr cinfo)
+_jinit_color_deconverter(j_decompress_ptr cinfo)
 {
   my_cconvert_ptr cconvert;
   int ci;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   cconvert = (my_cconvert_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_color_deconverter));
@@ -773,19 +796,24 @@ jinit_color_deconverter(j_decompress_ptr cinfo)
   /* Set out_color_components and conversion method based on requested space.
    * Also clear the component_needed flags for any unused components,
    * so that earlier pipeline stages can avoid useless computation.
+   * NOTE: We do not allow any lossy color conversion algorithms in lossless
+   * mode.
    */
 
   switch (cinfo->out_color_space) {
   case JCS_GRAYSCALE:
+    if (cinfo->master->lossless &&
+        cinfo->jpeg_color_space != cinfo->out_color_space)
+      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     cinfo->out_color_components = 1;
     if (cinfo->jpeg_color_space == JCS_GRAYSCALE ||
         cinfo->jpeg_color_space == JCS_YCbCr) {
-      cconvert->pub.color_convert = grayscale_convert;
+      cconvert->pub._color_convert = grayscale_convert;
       /* For color->grayscale conversion, only the Y (0) component is needed */
       for (ci = 1; ci < cinfo->num_components; ci++)
         cinfo->comp_info[ci].component_needed = FALSE;
     } else if (cinfo->jpeg_color_space == JCS_RGB) {
-      cconvert->pub.color_convert = rgb_gray_convert;
+      cconvert->pub._color_convert = rgb_gray_convert;
       build_rgb_y_table(cinfo);
     } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
@@ -802,65 +830,78 @@ jinit_color_deconverter(j_decompress_ptr cinfo)
   case JCS_EXT_BGRA:
   case JCS_EXT_ABGR:
   case JCS_EXT_ARGB:
+    if (cinfo->master->lossless && cinfo->jpeg_color_space != JCS_RGB)
+      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space];
     if (cinfo->jpeg_color_space == JCS_YCbCr) {
+#ifdef WITH_SIMD
       if (jsimd_can_ycc_rgb())
-        cconvert->pub.color_convert = jsimd_ycc_rgb_convert;
-      else {
-        cconvert->pub.color_convert = ycc_rgb_convert;
+        cconvert->pub._color_convert = jsimd_ycc_rgb_convert;
+      else
+#endif
+      {
+        cconvert->pub._color_convert = ycc_rgb_convert;
         build_ycc_rgb_table(cinfo);
       }
     } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
-      cconvert->pub.color_convert = gray_rgb_convert;
+      cconvert->pub._color_convert = gray_rgb_convert;
     } else if (cinfo->jpeg_color_space == JCS_RGB) {
       if (rgb_red[cinfo->out_color_space] == 0 &&
           rgb_green[cinfo->out_color_space] == 1 &&
           rgb_blue[cinfo->out_color_space] == 2 &&
           rgb_pixelsize[cinfo->out_color_space] == 3)
-        cconvert->pub.color_convert = null_convert;
+        cconvert->pub._color_convert = null_convert;
       else
-        cconvert->pub.color_convert = rgb_rgb_convert;
+        cconvert->pub._color_convert = rgb_rgb_convert;
     } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
 
   case JCS_RGB565:
+    if (cinfo->master->lossless)
+      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     cinfo->out_color_components = 3;
     if (cinfo->dither_mode == JDITHER_NONE) {
       if (cinfo->jpeg_color_space == JCS_YCbCr) {
+#ifdef WITH_SIMD
         if (jsimd_can_ycc_rgb565())
-          cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
-        else {
-          cconvert->pub.color_convert = ycc_rgb565_convert;
+          cconvert->pub._color_convert = jsimd_ycc_rgb565_convert;
+        else
+#endif
+        {
+          cconvert->pub._color_convert = ycc_rgb565_convert;
           build_ycc_rgb_table(cinfo);
         }
       } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
-        cconvert->pub.color_convert = gray_rgb565_convert;
+        cconvert->pub._color_convert = gray_rgb565_convert;
       } else if (cinfo->jpeg_color_space == JCS_RGB) {
-        cconvert->pub.color_convert = rgb_rgb565_convert;
+        cconvert->pub._color_convert = rgb_rgb565_convert;
       } else
         ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     } else {
       /* only ordered dithering is supported */
       if (cinfo->jpeg_color_space == JCS_YCbCr) {
-        cconvert->pub.color_convert = ycc_rgb565D_convert;
+        cconvert->pub._color_convert = ycc_rgb565D_convert;
         build_ycc_rgb_table(cinfo);
       } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
-        cconvert->pub.color_convert = gray_rgb565D_convert;
+        cconvert->pub._color_convert = gray_rgb565D_convert;
       } else if (cinfo->jpeg_color_space == JCS_RGB) {
-        cconvert->pub.color_convert = rgb_rgb565D_convert;
+        cconvert->pub._color_convert = rgb_rgb565D_convert;
       } else
         ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     }
     break;
 
   case JCS_CMYK:
+    if (cinfo->master->lossless &&
+        cinfo->jpeg_color_space != cinfo->out_color_space)
+      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     cinfo->out_color_components = 4;
     if (cinfo->jpeg_color_space == JCS_YCCK) {
-      cconvert->pub.color_convert = ycck_cmyk_convert;
+      cconvert->pub._color_convert = ycck_cmyk_convert;
       build_ycc_rgb_table(cinfo);
     } else if (cinfo->jpeg_color_space == JCS_CMYK) {
-      cconvert->pub.color_convert = null_convert;
+      cconvert->pub._color_convert = null_convert;
     } else
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
@@ -869,7 +910,7 @@ jinit_color_deconverter(j_decompress_ptr cinfo)
     /* Permit null conversion to same output space */
     if (cinfo->out_color_space == cinfo->jpeg_color_space) {
       cinfo->out_color_components = cinfo->num_components;
-      cconvert->pub.color_convert = null_convert;
+      cconvert->pub._color_convert = null_convert;
     } else                      /* unsupported non-null conversion */
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
     break;
@@ -880,3 +921,5 @@ jinit_color_deconverter(j_decompress_ptr cinfo)
   else
     cinfo->output_components = cinfo->out_color_components;
 }
+
+#endif /* BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED) */
diff --git a/3rdparty/libjpeg-turbo/src/jdct.h b/3rdparty/libjpeg-turbo/src/jdct.h
index 66d1718b770b..0411a79bc0b9 100644
--- a/3rdparty/libjpeg-turbo/src/jdct.h
+++ b/3rdparty/libjpeg-turbo/src/jdct.h
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -15,13 +15,15 @@
  * machine-dependent tuning (e.g., assembly coding).
  */
 
+#include "jsamplecomp.h"
+
 
 /*
  * A forward DCT routine is given a pointer to a work area of type DCTELEM[];
  * the DCT is to be performed in-place in that buffer.  Type DCTELEM is int
  * for 8-bit samples, JLONG for 12-bit samples.  (NOTE: Floating-point DCT
  * implementations use an array of type FAST_FLOAT, instead.)
- * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE).
+ * The DCT inputs are expected to be signed (range +-_CENTERJSAMPLE).
  * The DCT outputs are returned scaled up by a factor of 8; they therefore
  * have a range of +-8K for 8-bit data, +-128K for 12-bit data.  This
  * convention improves accuracy in integer implementations and saves some
@@ -76,78 +78,89 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE;  /* preferred floating type */
 
 /*
  * Each IDCT routine is responsible for range-limiting its results and
- * converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
+ * converting them to unsigned form (0.._MAXJSAMPLE).  The raw outputs could
  * be quite far out of range if the input data is corrupt, so a bulletproof
  * range-limiting step is required.  We use a mask-and-table-lookup method
  * to do the combined operations quickly.  See the comments with
  * prepare_range_limit_table (in jdmaster.c) for more info.
  */
 
-#define IDCT_range_limit(cinfo)  ((cinfo)->sample_range_limit + CENTERJSAMPLE)
+#define IDCT_range_limit(cinfo) \
+  ((_JSAMPLE *)((cinfo)->sample_range_limit) + _CENTERJSAMPLE)
 
-#define RANGE_MASK  (MAXJSAMPLE * 4 + 3) /* 2 bits wider than legal samples */
+#define RANGE_MASK  (_MAXJSAMPLE * 4 + 3) /* 2 bits wider than legal samples */
 
 
 /* Extern declarations for the forward and inverse DCT routines. */
 
-EXTERN(void) jpeg_fdct_islow(DCTELEM *data);
-EXTERN(void) jpeg_fdct_ifast(DCTELEM *data);
+EXTERN(void) _jpeg_fdct_islow(DCTELEM *data);
+EXTERN(void) _jpeg_fdct_ifast(DCTELEM *data);
 EXTERN(void) jpeg_fdct_float(FAST_FLOAT *data);
 
-EXTERN(void) jpeg_idct_islow(j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr, JCOEFPTR coef_block,
-                             JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_ifast(j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr, JCOEFPTR coef_block,
-                             JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_float(j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr, JCOEFPTR coef_block,
-                             JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_7x7(j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr, JCOEFPTR coef_block,
-                           JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_6x6(j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr, JCOEFPTR coef_block,
-                           JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_5x5(j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr, JCOEFPTR coef_block,
-                           JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_4x4(j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr, JCOEFPTR coef_block,
-                           JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_3x3(j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr, JCOEFPTR coef_block,
-                           JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_2x2(j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr, JCOEFPTR coef_block,
-                           JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_1x1(j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr, JCOEFPTR coef_block,
-                           JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_9x9(j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr, JCOEFPTR coef_block,
-                           JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_10x10(j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr, JCOEFPTR coef_block,
-                             JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_11x11(j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr, JCOEFPTR coef_block,
-                             JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_12x12(j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr, JCOEFPTR coef_block,
-                             JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_13x13(j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr, JCOEFPTR coef_block,
-                             JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_14x14(j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr, JCOEFPTR coef_block,
-                             JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_15x15(j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr, JCOEFPTR coef_block,
-                             JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_16x16(j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr, JCOEFPTR coef_block,
-                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_islow(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_ifast(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_float(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_7x7(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            _JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_6x6(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            _JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_5x5(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            _JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_4x4(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            _JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_3x3(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            _JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_2x2(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            _JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_1x1(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            _JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_9x9(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            _JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_10x10(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_11x11(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_12x12(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_13x13(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_14x14(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_15x15(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) _jpeg_idct_16x16(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
 
 
 /*
diff --git a/3rdparty/libjpeg-turbo/src/jddctmgr.c b/3rdparty/libjpeg-turbo/src/jddctmgr.c
index e78d7bebe28a..0bd8c2b591de 100644
--- a/3rdparty/libjpeg-turbo/src/jddctmgr.c
+++ b/3rdparty/libjpeg-turbo/src/jddctmgr.c
@@ -26,7 +26,7 @@
 #include "jpeglib.h"
 #include "jdct.h"               /* Private declarations for DCT subsystem */
 #include "jsimddct.h"
-#include "jpegcomp.h"
+#include "jpegapicomp.h"
 
 
 /*
@@ -100,7 +100,7 @@ start_pass(j_decompress_ptr cinfo)
   int ci, i;
   jpeg_component_info *compptr;
   int method = 0;
-  inverse_DCT_method_ptr method_ptr = NULL;
+  _inverse_DCT_method_ptr method_ptr = NULL;
   JQUANT_TBL *qtbl;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
@@ -109,42 +109,46 @@ start_pass(j_decompress_ptr cinfo)
     switch (compptr->_DCT_scaled_size) {
 #ifdef IDCT_SCALING_SUPPORTED
     case 1:
-      method_ptr = jpeg_idct_1x1;
+      method_ptr = _jpeg_idct_1x1;
       method = JDCT_ISLOW;      /* jidctred uses islow-style table */
       break;
     case 2:
+#ifdef WITH_SIMD
       if (jsimd_can_idct_2x2())
         method_ptr = jsimd_idct_2x2;
       else
-        method_ptr = jpeg_idct_2x2;
+#endif
+        method_ptr = _jpeg_idct_2x2;
       method = JDCT_ISLOW;      /* jidctred uses islow-style table */
       break;
     case 3:
-      method_ptr = jpeg_idct_3x3;
+      method_ptr = _jpeg_idct_3x3;
       method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 4:
+#ifdef WITH_SIMD
       if (jsimd_can_idct_4x4())
         method_ptr = jsimd_idct_4x4;
       else
-        method_ptr = jpeg_idct_4x4;
+#endif
+        method_ptr = _jpeg_idct_4x4;
       method = JDCT_ISLOW;      /* jidctred uses islow-style table */
       break;
     case 5:
-      method_ptr = jpeg_idct_5x5;
+      method_ptr = _jpeg_idct_5x5;
       method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 6:
-#if defined(__mips__)
+#if defined(WITH_SIMD) && defined(__mips__)
       if (jsimd_can_idct_6x6())
         method_ptr = jsimd_idct_6x6;
       else
 #endif
-      method_ptr = jpeg_idct_6x6;
+      method_ptr = _jpeg_idct_6x6;
       method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 7:
-      method_ptr = jpeg_idct_7x7;
+      method_ptr = _jpeg_idct_7x7;
       method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
 #endif
@@ -152,28 +156,34 @@ start_pass(j_decompress_ptr cinfo)
       switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
       case JDCT_ISLOW:
+#ifdef WITH_SIMD
         if (jsimd_can_idct_islow())
           method_ptr = jsimd_idct_islow;
         else
-          method_ptr = jpeg_idct_islow;
+#endif
+          method_ptr = _jpeg_idct_islow;
         method = JDCT_ISLOW;
         break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
       case JDCT_IFAST:
+#ifdef WITH_SIMD
         if (jsimd_can_idct_ifast())
           method_ptr = jsimd_idct_ifast;
         else
-          method_ptr = jpeg_idct_ifast;
+#endif
+          method_ptr = _jpeg_idct_ifast;
         method = JDCT_IFAST;
         break;
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
       case JDCT_FLOAT:
+#ifdef WITH_SIMD
         if (jsimd_can_idct_float())
           method_ptr = jsimd_idct_float;
         else
-          method_ptr = jpeg_idct_float;
+#endif
+          method_ptr = _jpeg_idct_float;
         method = JDCT_FLOAT;
         break;
 #endif
@@ -184,40 +194,40 @@ start_pass(j_decompress_ptr cinfo)
       break;
 #ifdef IDCT_SCALING_SUPPORTED
     case 9:
-      method_ptr = jpeg_idct_9x9;
+      method_ptr = _jpeg_idct_9x9;
       method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 10:
-      method_ptr = jpeg_idct_10x10;
+      method_ptr = _jpeg_idct_10x10;
       method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 11:
-      method_ptr = jpeg_idct_11x11;
+      method_ptr = _jpeg_idct_11x11;
       method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 12:
-#if defined(__mips__)
+#if defined(WITH_SIMD) && defined(__mips__)
       if (jsimd_can_idct_12x12())
         method_ptr = jsimd_idct_12x12;
       else
 #endif
-      method_ptr = jpeg_idct_12x12;
+      method_ptr = _jpeg_idct_12x12;
       method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 13:
-      method_ptr = jpeg_idct_13x13;
+      method_ptr = _jpeg_idct_13x13;
       method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 14:
-      method_ptr = jpeg_idct_14x14;
+      method_ptr = _jpeg_idct_14x14;
       method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 15:
-      method_ptr = jpeg_idct_15x15;
+      method_ptr = _jpeg_idct_15x15;
       method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
     case 16:
-      method_ptr = jpeg_idct_16x16;
+      method_ptr = _jpeg_idct_16x16;
       method = JDCT_ISLOW;      /* jidctint uses islow-style table */
       break;
 #endif
@@ -225,7 +235,7 @@ start_pass(j_decompress_ptr cinfo)
       ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->_DCT_scaled_size);
       break;
     }
-    idct->pub.inverse_DCT[ci] = method_ptr;
+    idct->pub._inverse_DCT[ci] = method_ptr;
     /* Create multiplier table from quant table.
      * However, we can skip this if the component is uninteresting
      * or if we already built the table.  Also, if no quant table
@@ -327,12 +337,15 @@ start_pass(j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_inverse_dct(j_decompress_ptr cinfo)
+_jinit_inverse_dct(j_decompress_ptr cinfo)
 {
   my_idct_ptr idct;
   int ci;
   jpeg_component_info *compptr;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   idct = (my_idct_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_idct_controller));
diff --git a/3rdparty/libjpeg-turbo/src/jddiffct.c b/3rdparty/libjpeg-turbo/src/jddiffct.c
new file mode 100644
index 000000000000..f1d7f61b5209
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jddiffct.c
@@ -0,0 +1,403 @@
+/*
+ * jddiffct.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains the [un]difference buffer controller for decompression.
+ * This controller is the top level of the lossless JPEG decompressor proper.
+ * The difference buffer lies between the entropy decoding and
+ * prediction/undifferencing steps.  The undifference buffer lies between the
+ * prediction/undifferencing and scaling steps.
+ *
+ * In buffered-image mode, this controller is the interface between
+ * input-oriented processing and output-oriented processing.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jlossls.h"            /* Private declarations for lossless codec */
+
+
+#ifdef D_LOSSLESS_SUPPORTED
+
+/* Private buffer controller object */
+
+typedef struct {
+  struct jpeg_d_coef_controller pub; /* public fields */
+
+  /* These variables keep track of the current location of the input side. */
+  /* cinfo->input_iMCU_row is also used for this. */
+  JDIMENSION MCU_ctr;           /* counts MCUs processed in current row */
+  unsigned int restart_rows_to_go;      /* MCU rows left in this restart
+                                           interval */
+  unsigned int MCU_vert_offset;         /* counts MCU rows within iMCU row */
+  unsigned int MCU_rows_per_iMCU_row;   /* number of such rows needed */
+
+  /* The output side's location is represented by cinfo->output_iMCU_row. */
+
+  JDIFFARRAY diff_buf[MAX_COMPONENTS];  /* iMCU row of differences */
+  JDIFFARRAY undiff_buf[MAX_COMPONENTS]; /* iMCU row of undiff'd samples */
+
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+  /* In multi-pass modes, we need a virtual sample array for each component. */
+  jvirt_sarray_ptr whole_image[MAX_COMPONENTS];
+#endif
+} my_diff_controller;
+
+typedef my_diff_controller *my_diff_ptr;
+
+/* Forward declarations */
+METHODDEF(int) decompress_data(j_decompress_ptr cinfo, _JSAMPIMAGE output_buf);
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+METHODDEF(int) output_data(j_decompress_ptr cinfo, _JSAMPIMAGE output_buf);
+#endif
+
+
+LOCAL(void)
+start_iMCU_row(j_decompress_ptr cinfo)
+/* Reset within-iMCU-row counters for a new row (input side) */
+{
+  my_diff_ptr diff = (my_diff_ptr)cinfo->coef;
+
+  /* In an interleaved scan, an MCU row is the same as an iMCU row.
+   * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
+   * But at the bottom of the image, process only what's left.
+   */
+  if (cinfo->comps_in_scan > 1) {
+    diff->MCU_rows_per_iMCU_row = 1;
+  } else {
+    if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows-1))
+      diff->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
+    else
+      diff->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
+  }
+
+  diff->MCU_ctr = 0;
+  diff->MCU_vert_offset = 0;
+}
+
+
+/*
+ * Initialize for an input processing pass.
+ */
+
+METHODDEF(void)
+start_input_pass(j_decompress_ptr cinfo)
+{
+  my_diff_ptr diff = (my_diff_ptr)cinfo->coef;
+
+  /* Because it is hitching a ride on the jpeg_inverse_dct struct,
+   * start_pass_lossless() will be called at the start of the output pass.
+   * This ensures that it will be called at the start of the input pass as
+   * well.
+   */
+  (*cinfo->idct->start_pass) (cinfo);
+
+  /* Check that the restart interval is an integer multiple of the number
+   * of MCUs in an MCU row.
+   */
+  if (cinfo->restart_interval % cinfo->MCUs_per_row != 0)
+    ERREXIT2(cinfo, JERR_BAD_RESTART,
+             cinfo->restart_interval, cinfo->MCUs_per_row);
+
+  /* Initialize restart counter */
+  diff->restart_rows_to_go = cinfo->restart_interval / cinfo->MCUs_per_row;
+
+  cinfo->input_iMCU_row = 0;
+  start_iMCU_row(cinfo);
+}
+
+
+/*
+ * Check for a restart marker & resynchronize decoder, undifferencer.
+ * Returns FALSE if must suspend.
+ */
+
+METHODDEF(boolean)
+process_restart(j_decompress_ptr cinfo)
+{
+  my_diff_ptr diff = (my_diff_ptr)cinfo->coef;
+
+  if (!(*cinfo->entropy->process_restart) (cinfo))
+    return FALSE;
+
+  (*cinfo->idct->start_pass) (cinfo);
+
+  /* Reset restart counter */
+  diff->restart_rows_to_go = cinfo->restart_interval / cinfo->MCUs_per_row;
+
+  return TRUE;
+}
+
+
+/*
+ * Initialize for an output processing pass.
+ */
+
+METHODDEF(void)
+start_output_pass(j_decompress_ptr cinfo)
+{
+  cinfo->output_iMCU_row = 0;
+}
+
+
+/*
+ * Decompress and return some data in the supplied buffer.
+ * Always attempts to emit one fully interleaved MCU row ("iMCU" row).
+ * Input and output must run in lockstep since we have only a one-MCU buffer.
+ * Return value is JPEG_ROW_COMPLETED, JPEG_SCAN_COMPLETED, or JPEG_SUSPENDED.
+ *
+ * NB: output_buf contains a plane for each component in image,
+ * which we index according to the component's SOF position.
+ */
+
+METHODDEF(int)
+decompress_data(j_decompress_ptr cinfo, _JSAMPIMAGE output_buf)
+{
+  my_diff_ptr diff = (my_diff_ptr)cinfo->coef;
+  lossless_decomp_ptr losslessd = (lossless_decomp_ptr)cinfo->idct;
+  JDIMENSION MCU_col_num;       /* index of current MCU within row */
+  JDIMENSION MCU_count;         /* number of MCUs decoded */
+  JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
+  int ci, compi, row, prev_row;
+  unsigned int yoffset;
+  jpeg_component_info *compptr;
+
+  /* Loop to process as much as one whole iMCU row */
+  for (yoffset = diff->MCU_vert_offset; yoffset < diff->MCU_rows_per_iMCU_row;
+       yoffset++) {
+
+    /* Process restart marker if needed; may have to suspend */
+    if (cinfo->restart_interval) {
+      if (diff->restart_rows_to_go == 0)
+        if (!process_restart(cinfo))
+          return JPEG_SUSPENDED;
+    }
+
+    MCU_col_num = diff->MCU_ctr;
+    /* Try to fetch an MCU row (or remaining portion of suspended MCU row). */
+    MCU_count =
+      (*cinfo->entropy->decode_mcus) (cinfo,
+                                      diff->diff_buf, yoffset, MCU_col_num,
+                                      cinfo->MCUs_per_row - MCU_col_num);
+    if (MCU_count != cinfo->MCUs_per_row - MCU_col_num) {
+      /* Suspension forced; update state counters and exit */
+      diff->MCU_vert_offset = yoffset;
+      diff->MCU_ctr += MCU_count;
+      return JPEG_SUSPENDED;
+    }
+
+    /* Account for restart interval (no-op if not using restarts) */
+    if (cinfo->restart_interval)
+      diff->restart_rows_to_go--;
+
+    /* Completed an MCU row, but perhaps not an iMCU row */
+    diff->MCU_ctr = 0;
+  }
+
+  /*
+   * Undifference and scale each scanline of the disassembled MCU row
+   * separately.  We do not process dummy samples at the end of a scanline
+   * or dummy rows at the end of the image.
+   */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    compi = compptr->component_index;
+    for (row = 0, prev_row = compptr->v_samp_factor - 1;
+         row < (cinfo->input_iMCU_row == last_iMCU_row ?
+                compptr->last_row_height : compptr->v_samp_factor);
+         prev_row = row, row++) {
+      (*losslessd->predict_undifference[compi])
+        (cinfo, compi, diff->diff_buf[compi][row],
+          diff->undiff_buf[compi][prev_row], diff->undiff_buf[compi][row],
+          compptr->width_in_blocks);
+      (*losslessd->scaler_scale) (cinfo, diff->undiff_buf[compi][row],
+                                  output_buf[compi][row],
+                                  compptr->width_in_blocks);
+    }
+  }
+
+  /* Completed the iMCU row, advance counters for next one.
+   *
+   * NB: output_data will increment output_iMCU_row.
+   * This counter is not needed for the single-pass case
+   * or the input side of the multi-pass case.
+   */
+  if (++(cinfo->input_iMCU_row) < cinfo->total_iMCU_rows) {
+    start_iMCU_row(cinfo);
+    return JPEG_ROW_COMPLETED;
+  }
+  /* Completed the scan */
+  (*cinfo->inputctl->finish_input_pass) (cinfo);
+  return JPEG_SCAN_COMPLETED;
+}
+
+
+/*
+ * Dummy consume-input routine for single-pass operation.
+ */
+
+METHODDEF(int)
+dummy_consume_data(j_decompress_ptr cinfo)
+{
+  return JPEG_SUSPENDED;        /* Always indicate nothing was done */
+}
+
+
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+
+/*
+ * Consume input data and store it in the full-image sample buffer.
+ * We read as much as one fully interleaved MCU row ("iMCU" row) per call,
+ * ie, v_samp_factor rows for each component in the scan.
+ * Return value is JPEG_ROW_COMPLETED, JPEG_SCAN_COMPLETED, or JPEG_SUSPENDED.
+ */
+
+METHODDEF(int)
+consume_data(j_decompress_ptr cinfo)
+{
+  my_diff_ptr diff = (my_diff_ptr)cinfo->coef;
+  int ci, compi;
+  _JSAMPARRAY buffer[MAX_COMPS_IN_SCAN];
+  jpeg_component_info *compptr;
+
+  /* Align the virtual buffers for the components used in this scan. */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    compi = compptr->component_index;
+    buffer[compi] = (_JSAMPARRAY)(*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, diff->whole_image[compi],
+       cinfo->input_iMCU_row * compptr->v_samp_factor,
+       (JDIMENSION)compptr->v_samp_factor, TRUE);
+  }
+
+  return decompress_data(cinfo, buffer);
+}
+
+
+/*
+ * Output some data from the full-image sample buffer in the multi-pass case.
+ * Always attempts to emit one fully interleaved MCU row ("iMCU" row).
+ * Return value is JPEG_ROW_COMPLETED, JPEG_SCAN_COMPLETED, or JPEG_SUSPENDED.
+ *
+ * NB: output_buf contains a plane for each component in image.
+ */
+
+METHODDEF(int)
+output_data(j_decompress_ptr cinfo, _JSAMPIMAGE output_buf)
+{
+  my_diff_ptr diff = (my_diff_ptr)cinfo->coef;
+  JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
+  int ci, samp_rows, row;
+  _JSAMPARRAY buffer;
+  jpeg_component_info *compptr;
+
+  /* Force some input to be done if we are getting ahead of the input. */
+  while (cinfo->input_scan_number < cinfo->output_scan_number ||
+         (cinfo->input_scan_number == cinfo->output_scan_number &&
+          cinfo->input_iMCU_row <= cinfo->output_iMCU_row)) {
+    if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
+      return JPEG_SUSPENDED;
+  }
+
+  /* OK, output from the virtual arrays. */
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    /* Align the virtual buffer for this component. */
+    buffer = (_JSAMPARRAY)(*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, diff->whole_image[ci],
+       cinfo->output_iMCU_row * compptr->v_samp_factor,
+       (JDIMENSION)compptr->v_samp_factor, FALSE);
+
+    if (cinfo->output_iMCU_row < last_iMCU_row)
+      samp_rows = compptr->v_samp_factor;
+    else {
+      /* NB: can't use last_row_height here; it is input-side-dependent! */
+      samp_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
+      if (samp_rows == 0) samp_rows = compptr->v_samp_factor;
+    }
+
+    for (row = 0; row < samp_rows; row++) {
+      memcpy(output_buf[ci][row], buffer[row],
+             compptr->width_in_blocks * sizeof(_JSAMPLE));
+    }
+  }
+
+  if (++(cinfo->output_iMCU_row) < cinfo->total_iMCU_rows)
+    return JPEG_ROW_COMPLETED;
+  return JPEG_SCAN_COMPLETED;
+}
+
+#endif /* D_MULTISCAN_FILES_SUPPORTED */
+
+
+/*
+ * Initialize difference buffer controller.
+ */
+
+GLOBAL(void)
+_jinit_d_diff_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
+{
+  my_diff_ptr diff;
+  int ci;
+  jpeg_component_info *compptr;
+
+  diff = (my_diff_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(my_diff_controller));
+  cinfo->coef = (struct jpeg_d_coef_controller *)diff;
+  diff->pub.start_input_pass = start_input_pass;
+  diff->pub.start_output_pass = start_output_pass;
+
+  /* Create the [un]difference buffers. */
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    diff->diff_buf[ci] =
+      ALLOC_DARRAY(JPOOL_IMAGE,
+                   (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+                                         (long)compptr->h_samp_factor),
+                   (JDIMENSION)compptr->v_samp_factor);
+    diff->undiff_buf[ci] =
+      ALLOC_DARRAY(JPOOL_IMAGE,
+                   (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+                                         (long)compptr->h_samp_factor),
+                   (JDIMENSION)compptr->v_samp_factor);
+  }
+
+  if (need_full_buffer) {
+#ifdef D_MULTISCAN_FILES_SUPPORTED
+    /* Allocate a full-image virtual array for each component. */
+    int access_rows;
+
+    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+         ci++, compptr++) {
+      access_rows = compptr->v_samp_factor;
+      diff->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
+        ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+         (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+                               (long)compptr->h_samp_factor),
+         (JDIMENSION)jround_up((long)compptr->height_in_blocks,
+                               (long)compptr->v_samp_factor),
+         (JDIMENSION)access_rows);
+    }
+    diff->pub.consume_data = consume_data;
+    diff->pub._decompress_data = output_data;
+#else
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+  } else {
+    diff->pub.consume_data = dummy_consume_data;
+    diff->pub._decompress_data = decompress_data;
+    diff->whole_image[0] = NULL; /* flag for no virtual arrays */
+  }
+}
+
+#endif /* D_LOSSLESS_SUPPORTED */
diff --git a/3rdparty/libjpeg-turbo/src/jdhuff.c b/3rdparty/libjpeg-turbo/src/jdhuff.c
index 679d22168591..cd8c0847a22e 100644
--- a/3rdparty/libjpeg-turbo/src/jdhuff.c
+++ b/3rdparty/libjpeg-turbo/src/jdhuff.c
@@ -3,8 +3,10 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, 2018-2019, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2018-2019, 2022, D. R. Commander.
  * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -24,8 +26,8 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdhuff.h"             /* Declarations shared with jdphuff.c */
-#include "jpegcomp.h"
+#include "jdhuff.h"             /* Declarations shared with jd*huff.c */
+#include "jpegapicomp.h"
 #include "jstdhuff.c"
 
 
@@ -134,7 +136,7 @@ start_pass_huff_decoder(j_decompress_ptr cinfo)
  * Compute the derived values for a Huffman table.
  * This routine also performs some validation checks on the table.
  *
- * Note this is also used by jdphuff.c.
+ * Note this is also used by jdphuff.c and jdlhuff.c.
  */
 
 GLOBAL(void)
@@ -245,14 +247,14 @@ jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC, int tblno,
 
   /* Validate symbols as being reasonable.
    * For AC tables, we make no check, but accept all byte values 0..255.
-   * For DC tables, we require the symbols to be in range 0..15.
-   * (Tighter bounds could be applied depending on the data depth and mode,
-   * but this is sufficient to ensure safe decoding.)
+   * For DC tables, we require the symbols to be in range 0..15 in lossy mode
+   * and 0..16 in lossless mode.  (Tighter bounds could be applied depending on
+   * the data depth and mode, but this is sufficient to ensure safe decoding.)
    */
   if (isDC) {
     for (i = 0; i < numsymbols; i++) {
       int sym = htbl->huffval[i];
-      if (sym < 0 || sym > 15)
+      if (sym < 0 || sym > (cinfo->master->lossless ? 16 : 15))
         ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     }
   }
@@ -260,7 +262,7 @@ jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC, int tblno,
 
 
 /*
- * Out-of-line code for bit fetching (shared with jdphuff.c).
+ * Out-of-line code for bit fetching (shared with jdphuff.c and jdlhuff.c).
  * See jdhuff.h for info about usage.
  * Note: current values of get_buffer and bits_left are passed as parameters,
  * but are returned in the corresponding fields of the state struct.
diff --git a/3rdparty/libjpeg-turbo/src/jdhuff.h b/3rdparty/libjpeg-turbo/src/jdhuff.h
index cfa0b7f55888..3eee002c020a 100644
--- a/3rdparty/libjpeg-turbo/src/jdhuff.h
+++ b/3rdparty/libjpeg-turbo/src/jdhuff.h
@@ -3,6 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010-2011, 2015-2016, 2021, D. R. Commander.
  * Copyright (C) 2018, Matthias Räncker.
@@ -10,8 +12,9 @@
  * file.
  *
  * This file contains declarations for Huffman entropy decoding routines
- * that are shared between the sequential decoder (jdhuff.c) and the
- * progressive decoder (jdphuff.c).  No other modules need to see these.
+ * that are shared between the sequential decoder (jdhuff.c), the progressive
+ * decoder (jdphuff.c), and the lossless decoder (jdlhuff.c).  No other modules
+ * need to see these.
  */
 
 #include "jconfigint.h"
diff --git a/3rdparty/libjpeg-turbo/src/jdinput.c b/3rdparty/libjpeg-turbo/src/jdinput.c
index 1bc5aff1a70a..136bef59d753 100644
--- a/3rdparty/libjpeg-turbo/src/jdinput.c
+++ b/3rdparty/libjpeg-turbo/src/jdinput.c
@@ -3,6 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, 2016, 2018, 2022, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
@@ -11,14 +13,15 @@
  *
  * This file contains input control logic for the JPEG decompressor.
  * These routines are concerned with controlling the decompressor's input
- * processing (marker reading and coefficient decoding).  The actual input
- * reading is done in jdmarker.c, jdhuff.c, and jdphuff.c.
+ * processing (marker reading and coefficient/difference decoding).
+ * The actual input reading is done in jdmarker.c, jdhuff.c, jdphuff.c,
+ * and jdlhuff.c.
  */
 
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jpegcomp.h"
+#include "jpegapicomp.h"
 
 
 /* Private state */
@@ -46,6 +49,7 @@ initial_setup(j_decompress_ptr cinfo)
 {
   int ci;
   jpeg_component_info *compptr;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
 
   /* Make sure image isn't bigger than I can handle */
   if ((long)cinfo->image_height > (long)JPEG_MAX_DIMENSION ||
@@ -53,7 +57,12 @@ initial_setup(j_decompress_ptr cinfo)
     ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)JPEG_MAX_DIMENSION);
 
   /* For now, precision must match compiled-in value... */
-  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+#ifdef D_LOSSLESS_SUPPORTED
+  if (cinfo->data_precision != 8 && cinfo->data_precision != 12 &&
+      cinfo->data_precision != 16)
+#else
+  if (cinfo->data_precision != 8 && cinfo->data_precision != 12)
+#endif
     ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
 
   /* Check that number of components won't exceed internal array sizes */
@@ -78,36 +87,36 @@ initial_setup(j_decompress_ptr cinfo)
   }
 
 #if JPEG_LIB_VERSION >= 80
-  cinfo->block_size = DCTSIZE;
+  cinfo->block_size = data_unit;
   cinfo->natural_order = jpeg_natural_order;
   cinfo->lim_Se = DCTSIZE2 - 1;
 #endif
 
-  /* We initialize DCT_scaled_size and min_DCT_scaled_size to DCTSIZE.
-   * In the full decompressor, this will be overridden by jdmaster.c;
+  /* We initialize DCT_scaled_size and min_DCT_scaled_size to DCTSIZE in lossy
+   * mode.  In the full decompressor, this will be overridden by jdmaster.c;
    * but in the transcoder, jdmaster.c is not used, so we must do it here.
    */
 #if JPEG_LIB_VERSION >= 70
-  cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = DCTSIZE;
+  cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = data_unit;
 #else
-  cinfo->min_DCT_scaled_size = DCTSIZE;
+  cinfo->min_DCT_scaled_size = data_unit;
 #endif
 
   /* Compute dimensions of components */
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
 #if JPEG_LIB_VERSION >= 70
-    compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = DCTSIZE;
+    compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = data_unit;
 #else
-    compptr->DCT_scaled_size = DCTSIZE;
+    compptr->DCT_scaled_size = data_unit;
 #endif
-    /* Size in DCT blocks */
+    /* Size in data units */
     compptr->width_in_blocks = (JDIMENSION)
       jdiv_round_up((long)cinfo->image_width * (long)compptr->h_samp_factor,
-                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
+                    (long)(cinfo->max_h_samp_factor * data_unit));
     compptr->height_in_blocks = (JDIMENSION)
       jdiv_round_up((long)cinfo->image_height * (long)compptr->v_samp_factor,
-                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
+                    (long)(cinfo->max_v_samp_factor * data_unit));
     /* Set the first and last MCU columns to decompress from multi-scan images.
      * By default, decompress all of the MCU columns.
      */
@@ -133,7 +142,7 @@ initial_setup(j_decompress_ptr cinfo)
   /* Compute number of fully interleaved MCU rows. */
   cinfo->total_iMCU_rows = (JDIMENSION)
     jdiv_round_up((long)cinfo->image_height,
-                  (long)(cinfo->max_v_samp_factor * DCTSIZE));
+                  (long)(cinfo->max_v_samp_factor * data_unit));
 
   /* Decide whether file contains multiple scans */
   if (cinfo->comps_in_scan < cinfo->num_components || cinfo->progressive_mode)
@@ -150,6 +159,7 @@ per_scan_setup(j_decompress_ptr cinfo)
 {
   int ci, mcublks, tmp;
   jpeg_component_info *compptr;
+  int data_unit = cinfo->master->lossless ? 1 : DCTSIZE;
 
   if (cinfo->comps_in_scan == 1) {
 
@@ -160,14 +170,14 @@ per_scan_setup(j_decompress_ptr cinfo)
     cinfo->MCUs_per_row = compptr->width_in_blocks;
     cinfo->MCU_rows_in_scan = compptr->height_in_blocks;
 
-    /* For noninterleaved scan, always one block per MCU */
+    /* For noninterleaved scan, always one data unit per MCU */
     compptr->MCU_width = 1;
     compptr->MCU_height = 1;
     compptr->MCU_blocks = 1;
     compptr->MCU_sample_width = compptr->_DCT_scaled_size;
     compptr->last_col_width = 1;
     /* For noninterleaved scans, it is convenient to define last_row_height
-     * as the number of block rows present in the last iMCU row.
+     * as the number of data unit rows present in the last iMCU row.
      */
     tmp = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
     if (tmp == 0) tmp = compptr->v_samp_factor;
@@ -187,22 +197,22 @@ per_scan_setup(j_decompress_ptr cinfo)
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
       jdiv_round_up((long)cinfo->image_width,
-                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
+                    (long)(cinfo->max_h_samp_factor * data_unit));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
       jdiv_round_up((long)cinfo->image_height,
-                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
+                    (long)(cinfo->max_v_samp_factor * data_unit));
 
     cinfo->blocks_in_MCU = 0;
 
     for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
       compptr = cinfo->cur_comp_info[ci];
-      /* Sampling factors give # of blocks of component in each MCU */
+      /* Sampling factors give # of data units of component in each MCU */
       compptr->MCU_width = compptr->h_samp_factor;
       compptr->MCU_height = compptr->v_samp_factor;
       compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
       compptr->MCU_sample_width = compptr->MCU_width *
                                   compptr->_DCT_scaled_size;
-      /* Figure number of non-dummy blocks in last MCU column & row */
+      /* Figure number of non-dummy data units in last MCU column & row */
       tmp = (int)(compptr->width_in_blocks % compptr->MCU_width);
       if (tmp == 0) tmp = compptr->MCU_width;
       compptr->last_col_width = tmp;
@@ -281,7 +291,8 @@ METHODDEF(void)
 start_input_pass(j_decompress_ptr cinfo)
 {
   per_scan_setup(cinfo);
-  latch_quant_tables(cinfo);
+  if (!cinfo->master->lossless)
+    latch_quant_tables(cinfo);
   (*cinfo->entropy->start_pass) (cinfo);
   (*cinfo->coef->start_input_pass) (cinfo);
   cinfo->inputctl->consume_input = cinfo->coef->consume_data;
@@ -290,8 +301,8 @@ start_input_pass(j_decompress_ptr cinfo)
 
 /*
  * Finish up after inputting a compressed-data scan.
- * This is called by the coefficient controller after it's read all
- * the expected data of the scan.
+ * This is called by the coefficient or difference controller after it's read
+ * all the expected data of the scan.
  */
 
 METHODDEF(void)
@@ -307,8 +318,8 @@ finish_input_pass(j_decompress_ptr cinfo)
  * Return value is JPEG_SUSPENDED, JPEG_REACHED_SOS, or JPEG_REACHED_EOI.
  *
  * The consume_input method pointer points either here or to the
- * coefficient controller's consume_data routine, depending on whether
- * we are reading a compressed data segment or inter-segment markers.
+ * coefficient or difference controller's consume_data routine, depending on
+ * whether we are reading a compressed data segment or inter-segment markers.
  */
 
 METHODDEF(int)
diff --git a/3rdparty/libjpeg-turbo/src/jdlhuff.c b/3rdparty/libjpeg-turbo/src/jdlhuff.c
new file mode 100644
index 000000000000..9964830dba07
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jdlhuff.c
@@ -0,0 +1,302 @@
+/*
+ * jdlhuff.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains Huffman entropy decoding routines for lossless JPEG.
+ *
+ * Much of the complexity here has to do with supporting input suspension.
+ * If the data source module demands suspension, we want to be able to back
+ * up to the start of the current MCU.  To do this, we copy state variables
+ * into local working storage, and update them back to the permanent
+ * storage only upon successful completion of an MCU.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jlossls.h"            /* Private declarations for lossless codec */
+#include "jdhuff.h"             /* Declarations shared with jd*huff.c */
+
+
+#ifdef D_LOSSLESS_SUPPORTED
+
+typedef struct {
+  int ci, yoffset, MCU_width;
+} lhd_output_ptr_info;
+
+/*
+ * Expanded entropy decoder object for Huffman decoding in lossless mode.
+ */
+
+typedef struct {
+  struct jpeg_entropy_decoder pub; /* public fields */
+
+  /* These fields are loaded into local variables at start of each MCU.
+   * In case of suspension, we exit WITHOUT updating them.
+   */
+  bitread_perm_state bitstate;  /* Bit buffer at start of MCU */
+
+  /* Pointers to derived tables (these workspaces have image lifespan) */
+  d_derived_tbl *derived_tbls[NUM_HUFF_TBLS];
+
+  /* Precalculated info set up by start_pass for use in decode_mcus: */
+
+  /* Pointers to derived tables to be used for each data unit within an MCU */
+  d_derived_tbl *cur_tbls[D_MAX_BLOCKS_IN_MCU];
+
+  /* Pointers to the proper output difference row for each group of data units
+   * within an MCU.  For each component, there are Vi groups of Hi data units.
+   */
+  JDIFFROW output_ptr[D_MAX_BLOCKS_IN_MCU];
+
+  /* Number of output pointers in use for the current MCU.  This is the sum
+   * of all Vi in the MCU.
+   */
+  int num_output_ptrs;
+
+  /* Information used for positioning the output pointers within the output
+   * difference rows.
+   */
+  lhd_output_ptr_info output_ptr_info[D_MAX_BLOCKS_IN_MCU];
+
+  /* Index of the proper output pointer for each data unit within an MCU */
+  int output_ptr_index[D_MAX_BLOCKS_IN_MCU];
+
+} lhuff_entropy_decoder;
+
+typedef lhuff_entropy_decoder *lhuff_entropy_ptr;
+
+
+/*
+ * Initialize for a Huffman-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass_lhuff_decoder(j_decompress_ptr cinfo)
+{
+  lhuff_entropy_ptr entropy = (lhuff_entropy_ptr)cinfo->entropy;
+  int ci, dctbl, sampn, ptrn, yoffset, xoffset;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    dctbl = compptr->dc_tbl_no;
+    /* Make sure requested tables are present */
+    if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS ||
+        cinfo->dc_huff_tbl_ptrs[dctbl] == NULL)
+      ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl);
+    /* Compute derived values for Huffman tables */
+    /* We may do this more than once for a table, but it's not expensive */
+    jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl,
+                            &entropy->derived_tbls[dctbl]);
+  }
+
+  /* Precalculate decoding info for each sample in an MCU of this scan */
+  for (sampn = 0, ptrn = 0; sampn < cinfo->blocks_in_MCU;) {
+    compptr = cinfo->cur_comp_info[cinfo->MCU_membership[sampn]];
+    ci = compptr->component_index;
+    for (yoffset = 0; yoffset < compptr->MCU_height; yoffset++, ptrn++) {
+      /* Precalculate the setup info for each output pointer */
+      entropy->output_ptr_info[ptrn].ci = ci;
+      entropy->output_ptr_info[ptrn].yoffset = yoffset;
+      entropy->output_ptr_info[ptrn].MCU_width = compptr->MCU_width;
+      for (xoffset = 0; xoffset < compptr->MCU_width; xoffset++, sampn++) {
+        /* Precalculate the output pointer index for each sample */
+        entropy->output_ptr_index[sampn] = ptrn;
+        /* Precalculate which table to use for each sample */
+        entropy->cur_tbls[sampn] = entropy->derived_tbls[compptr->dc_tbl_no];
+      }
+    }
+  }
+  entropy->num_output_ptrs = ptrn;
+
+  /* Initialize bitread state variables */
+  entropy->bitstate.bits_left = 0;
+  entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */
+  entropy->pub.insufficient_data = FALSE;
+}
+
+
+/*
+ * Figure F.12: extend sign bit.
+ * On some machines, a shift and add will be faster than a table lookup.
+ */
+
+#define AVOID_TABLES
+#ifdef AVOID_TABLES
+
+#define NEG_1  ((unsigned int)-1)
+#define HUFF_EXTEND(x, s) \
+  ((x) + ((((x) - (1 << ((s) - 1))) >> 31) & (((NEG_1) << (s)) + 1)))
+
+#else
+
+#define HUFF_EXTEND(x, s) \
+  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
+
+static const int extend_test[16] = {   /* entry n is 2**(n-1) */
+  0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
+  0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000
+};
+
+static const int extend_offset[16] = { /* entry n is (-1 << n) + 1 */
+  0, ((-1) << 1) + 1, ((-1) << 2) + 1, ((-1) << 3) + 1, ((-1) << 4) + 1,
+  ((-1) << 5) + 1, ((-1) << 6) + 1, ((-1) << 7) + 1, ((-1) << 8) + 1,
+  ((-1) << 9) + 1, ((-1) << 10) + 1, ((-1) << 11) + 1, ((-1) << 12) + 1,
+  ((-1) << 13) + 1, ((-1) << 14) + 1, ((-1) << 15) + 1
+};
+
+#endif /* AVOID_TABLES */
+
+
+/*
+ * Check for a restart marker & resynchronize decoder.
+ * Returns FALSE if must suspend.
+ */
+
+LOCAL(boolean)
+process_restart(j_decompress_ptr cinfo)
+{
+  lhuff_entropy_ptr entropy = (lhuff_entropy_ptr)cinfo->entropy;
+
+  /* Throw away any unused bits remaining in bit buffer; */
+  /* include any full bytes in next_marker's count of discarded bytes */
+  cinfo->marker->discarded_bytes += entropy->bitstate.bits_left / 8;
+  entropy->bitstate.bits_left = 0;
+
+  /* Advance past the RSTn marker */
+  if (!(*cinfo->marker->read_restart_marker) (cinfo))
+    return FALSE;
+
+  /* Reset out-of-data flag, unless read_restart_marker left us smack up
+   * against a marker.  In that case we will end up treating the next data
+   * segment as empty, and we can avoid producing bogus output pixels by
+   * leaving the flag set.
+   */
+  if (cinfo->unread_marker == 0)
+    entropy->pub.insufficient_data = FALSE;
+
+  return TRUE;
+}
+
+
+/*
+ * Decode and return nMCU MCUs' worth of Huffman-compressed differences.
+ * Each MCU is also disassembled and placed accordingly in diff_buf.
+ *
+ * MCU_col_num specifies the column of the first MCU being requested within
+ * the MCU row.  This tells us where to position the output row pointers in
+ * diff_buf.
+ *
+ * Returns the number of MCUs decoded.  This may be less than nMCU MCUs if
+ * data source requested suspension.  In that case no changes have been made
+ * to permanent state.  (Exception: some output differences may already have
+ * been assigned.  This is harmless for this module, since we'll just
+ * re-assign them on the next call.)
+ */
+
+METHODDEF(JDIMENSION)
+decode_mcus(j_decompress_ptr cinfo, JDIFFIMAGE diff_buf,
+            JDIMENSION MCU_row_num, JDIMENSION MCU_col_num, JDIMENSION nMCU)
+{
+  lhuff_entropy_ptr entropy = (lhuff_entropy_ptr)cinfo->entropy;
+  int sampn, ci, yoffset, MCU_width, ptrn;
+  JDIMENSION mcu_num;
+  BITREAD_STATE_VARS;
+
+  /* Set output pointer locations based on MCU_col_num */
+  for (ptrn = 0; ptrn < entropy->num_output_ptrs; ptrn++) {
+    ci = entropy->output_ptr_info[ptrn].ci;
+    yoffset = entropy->output_ptr_info[ptrn].yoffset;
+    MCU_width = entropy->output_ptr_info[ptrn].MCU_width;
+    entropy->output_ptr[ptrn] =
+      diff_buf[ci][MCU_row_num + yoffset] + (MCU_col_num * MCU_width);
+  }
+
+  /*
+   * If we've run out of data, zero out the buffers and return.
+   * By resetting the undifferencer, the output samples will be CENTERJSAMPLE.
+   *
+   * NB: We should find a way to do this without interacting with the
+   * undifferencer module directly.
+   */
+  if (entropy->pub.insufficient_data) {
+    for (ptrn = 0; ptrn < entropy->num_output_ptrs; ptrn++)
+      jzero_far((void FAR *)entropy->output_ptr[ptrn],
+                nMCU * entropy->output_ptr_info[ptrn].MCU_width *
+                sizeof(JDIFF));
+
+    (*cinfo->idct->start_pass) (cinfo);
+
+  } else {
+
+    /* Load up working state */
+    BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+
+    /* Outer loop handles the number of MCUs requested */
+
+    for (mcu_num = 0; mcu_num < nMCU; mcu_num++) {
+
+      /* Inner loop handles the samples in the MCU */
+      for (sampn = 0; sampn < cinfo->blocks_in_MCU; sampn++) {
+        d_derived_tbl *dctbl = entropy->cur_tbls[sampn];
+        register int s, r;
+
+        /* Section H.2.2: decode the sample difference */
+        HUFF_DECODE(s, br_state, dctbl, return mcu_num, label1);
+        if (s) {
+          if (s == 16)  /* special case: always output 32768 */
+            s = 32768;
+          else {        /* normal case: fetch subsequent bits */
+            CHECK_BIT_BUFFER(br_state, s, return mcu_num);
+            r = GET_BITS(s);
+            s = HUFF_EXTEND(r, s);
+          }
+        }
+
+        /* Output the sample difference */
+        *entropy->output_ptr[entropy->output_ptr_index[sampn]]++ = (JDIFF)s;
+      }
+
+      /* Completed MCU, so update state */
+      BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+    }
+  }
+
+ return nMCU;
+}
+
+
+/*
+ * Module initialization routine for lossless mode Huffman entropy decoding.
+ */
+
+GLOBAL(void)
+jinit_lhuff_decoder(j_decompress_ptr cinfo)
+{
+  lhuff_entropy_ptr entropy;
+  int i;
+
+  entropy = (lhuff_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(lhuff_entropy_decoder));
+  cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
+  entropy->pub.start_pass = start_pass_lhuff_decoder;
+  entropy->pub.decode_mcus = decode_mcus;
+  entropy->pub.process_restart = process_restart;
+
+  /* Mark tables unallocated */
+  for (i = 0; i < NUM_HUFF_TBLS; i++) {
+    entropy->derived_tbls[i] = NULL;
+  }
+}
+
+#endif /* D_LOSSLESS_SUPPORTED */
diff --git a/3rdparty/libjpeg-turbo/src/jdlossls.c b/3rdparty/libjpeg-turbo/src/jdlossls.c
new file mode 100644
index 000000000000..4d15e6bbaf2b
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jdlossls.c
@@ -0,0 +1,289 @@
+/*
+ * jdlossls.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1998, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains prediction, sample undifferencing, point transform, and
+ * sample scaling routines for the lossless JPEG decompressor.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jlossls.h"
+
+#ifdef D_LOSSLESS_SUPPORTED
+
+
+/**************** Sample undifferencing (reconstruction) *****************/
+
+/*
+ * In order to avoid a performance penalty for checking which predictor is
+ * being used and which row is being processed for each call of the
+ * undifferencer, and to promote optimization, we have separate undifferencing
+ * functions for each predictor selection value.
+ *
+ * We are able to avoid duplicating source code by implementing the predictors
+ * and undifferencers as macros.  Each of the undifferencing functions is
+ * simply a wrapper around an UNDIFFERENCE macro with the appropriate PREDICTOR
+ * macro passed as an argument.
+ */
+
+/* Predictor for the first column of the first row: 2^(P-Pt-1) */
+#define INITIAL_PREDICTORx  (1 << (cinfo->data_precision - cinfo->Al - 1))
+
+/* Predictor for the first column of the remaining rows: Rb */
+#define INITIAL_PREDICTOR2  prev_row[0]
+
+
+/*
+ * 1-Dimensional undifferencer routine.
+ *
+ * This macro implements the 1-D horizontal predictor (1).  INITIAL_PREDICTOR
+ * is used as the special case predictor for the first column, which must be
+ * either INITIAL_PREDICTOR2 or INITIAL_PREDICTORx.  The remaining samples
+ * use PREDICTOR1.
+ *
+ * The reconstructed sample is supposed to be calculated modulo 2^16, so we
+ * logically AND the result with 0xFFFF.
+ */
+
+#define UNDIFFERENCE_1D(INITIAL_PREDICTOR) \
+  int Ra; \
+  \
+  Ra = (*diff_buf++ + INITIAL_PREDICTOR) & 0xFFFF; \
+  *undiff_buf++ = Ra; \
+  \
+  while (--width) { \
+    Ra = (*diff_buf++ + PREDICTOR1) & 0xFFFF; \
+    *undiff_buf++ = Ra; \
+  }
+
+
+/*
+ * 2-Dimensional undifferencer routine.
+ *
+ * This macro implements the 2-D horizontal predictors (#2-7).  PREDICTOR2 is
+ * used as the special case predictor for the first column.  The remaining
+ * samples use PREDICTOR, which is a function of Ra, Rb, and Rc.
+ *
+ * Because prev_row and output_buf may point to the same storage area (in an
+ * interleaved image with Vi=1, for example), we must take care to buffer Rb/Rc
+ * before writing the current reconstructed sample value into output_buf.
+ *
+ * The reconstructed sample is supposed to be calculated modulo 2^16, so we
+ * logically AND the result with 0xFFFF.
+ */
+
+#define UNDIFFERENCE_2D(PREDICTOR) \
+  int Ra, Rb, Rc; \
+  \
+  Rb = *prev_row++; \
+  Ra = (*diff_buf++ + PREDICTOR2) & 0xFFFF; \
+  *undiff_buf++ = Ra; \
+  \
+  while (--width) { \
+    Rc = Rb; \
+    Rb = *prev_row++; \
+    Ra = (*diff_buf++ + PREDICTOR) & 0xFFFF; \
+    *undiff_buf++ = Ra; \
+  }
+
+
+/*
+ * Undifferencers for the second and subsequent rows in a scan or restart
+ * interval.  The first sample in the row is undifferenced using the vertical
+ * predictor (2).  The rest of the samples are undifferenced using the
+ * predictor specified in the scan header.
+ */
+
+METHODDEF(void)
+jpeg_undifference1(j_decompress_ptr cinfo, int comp_index,
+                   JDIFFROW diff_buf, JDIFFROW prev_row,
+                   JDIFFROW undiff_buf, JDIMENSION width)
+{
+  UNDIFFERENCE_1D(INITIAL_PREDICTOR2);
+}
+
+METHODDEF(void)
+jpeg_undifference2(j_decompress_ptr cinfo, int comp_index,
+                   JDIFFROW diff_buf, JDIFFROW prev_row,
+                   JDIFFROW undiff_buf, JDIMENSION width)
+{
+  UNDIFFERENCE_2D(PREDICTOR2);
+  (void)(Rc);
+}
+
+METHODDEF(void)
+jpeg_undifference3(j_decompress_ptr cinfo, int comp_index,
+                   JDIFFROW diff_buf, JDIFFROW prev_row,
+                   JDIFFROW undiff_buf, JDIMENSION width)
+{
+  UNDIFFERENCE_2D(PREDICTOR3);
+}
+
+METHODDEF(void)
+jpeg_undifference4(j_decompress_ptr cinfo, int comp_index,
+                   JDIFFROW diff_buf, JDIFFROW prev_row,
+                   JDIFFROW undiff_buf, JDIMENSION width)
+{
+  UNDIFFERENCE_2D(PREDICTOR4);
+}
+
+METHODDEF(void)
+jpeg_undifference5(j_decompress_ptr cinfo, int comp_index,
+                   JDIFFROW diff_buf, JDIFFROW prev_row,
+                   JDIFFROW undiff_buf, JDIMENSION width)
+{
+  UNDIFFERENCE_2D(PREDICTOR5);
+}
+
+METHODDEF(void)
+jpeg_undifference6(j_decompress_ptr cinfo, int comp_index,
+                   JDIFFROW diff_buf, JDIFFROW prev_row,
+                   JDIFFROW undiff_buf, JDIMENSION width)
+{
+  UNDIFFERENCE_2D(PREDICTOR6);
+}
+
+METHODDEF(void)
+jpeg_undifference7(j_decompress_ptr cinfo, int comp_index,
+                   JDIFFROW diff_buf, JDIFFROW prev_row,
+                   JDIFFROW undiff_buf, JDIMENSION width)
+{
+  UNDIFFERENCE_2D(PREDICTOR7);
+  (void)(Rc);
+}
+
+
+/*
+ * Undifferencer for the first row in a scan or restart interval.  The first
+ * sample in the row is undifferenced using the special predictor constant
+ * x=2^(P-Pt-1).  The rest of the samples are undifferenced using the
+ * 1-D horizontal predictor (1).
+ */
+
+METHODDEF(void)
+jpeg_undifference_first_row(j_decompress_ptr cinfo, int comp_index,
+                            JDIFFROW diff_buf, JDIFFROW prev_row,
+                            JDIFFROW undiff_buf, JDIMENSION width)
+{
+  lossless_decomp_ptr losslessd = (lossless_decomp_ptr)cinfo->idct;
+
+  UNDIFFERENCE_1D(INITIAL_PREDICTORx);
+
+  /*
+   * Now that we have undifferenced the first row, we want to use the
+   * undifferencer that corresponds to the predictor specified in the
+   * scan header.
+   */
+  switch (cinfo->Ss) {
+  case 1:
+    losslessd->predict_undifference[comp_index] = jpeg_undifference1;
+    break;
+  case 2:
+    losslessd->predict_undifference[comp_index] = jpeg_undifference2;
+    break;
+  case 3:
+    losslessd->predict_undifference[comp_index] = jpeg_undifference3;
+    break;
+  case 4:
+    losslessd->predict_undifference[comp_index] = jpeg_undifference4;
+    break;
+  case 5:
+    losslessd->predict_undifference[comp_index] = jpeg_undifference5;
+    break;
+  case 6:
+    losslessd->predict_undifference[comp_index] = jpeg_undifference6;
+    break;
+  case 7:
+    losslessd->predict_undifference[comp_index] = jpeg_undifference7;
+    break;
+  }
+}
+
+
+/*********************** Sample upscaling by 2^Pt ************************/
+
+METHODDEF(void)
+simple_upscale(j_decompress_ptr cinfo,
+               JDIFFROW diff_buf, _JSAMPROW output_buf, JDIMENSION width)
+{
+  do {
+    *output_buf++ = (_JSAMPLE)(*diff_buf++ << cinfo->Al);
+  } while (--width);
+}
+
+METHODDEF(void)
+noscale(j_decompress_ptr cinfo,
+        JDIFFROW diff_buf, _JSAMPROW output_buf, JDIMENSION width)
+{
+  do {
+    *output_buf++ = (_JSAMPLE)(*diff_buf++);
+  } while (--width);
+}
+
+
+/*
+ * Initialize for an input processing pass.
+ */
+
+METHODDEF(void)
+start_pass_lossless(j_decompress_ptr cinfo)
+{
+  lossless_decomp_ptr losslessd = (lossless_decomp_ptr)cinfo->idct;
+  int ci;
+
+  /* Check that the scan parameters Ss, Se, Ah, Al are OK for lossless JPEG.
+   *
+   * Ss is the predictor selection value (psv).  Legal values for sequential
+   * lossless JPEG are: 1 <= psv <= 7.
+   *
+   * Se and Ah are not used and should be zero.
+   *
+   * Al specifies the point transform (Pt).
+   * Legal values are: 0 <= Pt <= (data precision - 1).
+   */
+  if (cinfo->Ss < 1 || cinfo->Ss > 7 ||
+      cinfo->Se != 0 || cinfo->Ah != 0 ||
+      cinfo->Al < 0 || cinfo->Al >= cinfo->data_precision)
+    ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+             cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+
+  /* Set undifference functions to first row function */
+  for (ci = 0; ci < cinfo->num_components; ci++)
+    losslessd->predict_undifference[ci] = jpeg_undifference_first_row;
+
+  /* Set scaler function based on Pt */
+  if (cinfo->Al)
+    losslessd->scaler_scale = simple_upscale;
+  else
+    losslessd->scaler_scale = noscale;
+}
+
+
+/*
+ * Initialize the lossless decompressor.
+ */
+
+GLOBAL(void)
+_jinit_lossless_decompressor(j_decompress_ptr cinfo)
+{
+  lossless_decomp_ptr losslessd;
+
+  /* Create subobject in permanent pool */
+  losslessd = (lossless_decomp_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                sizeof(jpeg_lossless_decompressor));
+  cinfo->idct = (struct jpeg_inverse_dct *)losslessd;
+  losslessd->pub.start_pass = start_pass_lossless;
+}
+
+#endif /* D_LOSSLESS_SUPPORTED */
diff --git a/3rdparty/libjpeg-turbo/src/jdmainct.c b/3rdparty/libjpeg-turbo/src/jdmainct.c
index f466b259f0d8..c672b4baf586 100644
--- a/3rdparty/libjpeg-turbo/src/jdmainct.c
+++ b/3rdparty/libjpeg-turbo/src/jdmainct.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2016, D. R. Commander.
+ * Copyright (C) 2010, 2016, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -18,15 +18,17 @@
 
 #include "jinclude.h"
 #include "jdmainct.h"
-#include "jconfigint.h"
 
 
+#if BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED)
+
 /*
  * In the current system design, the main buffer need never be a full-image
- * buffer; any full-height buffers will be found inside the coefficient or
- * postprocessing controllers.  Nonetheless, the main controller is not
- * trivial.  Its responsibility is to provide context rows for upsampling/
- * rescaling, and doing this in an efficient fashion is a bit tricky.
+ * buffer; any full-height buffers will be found inside the coefficient,
+ * difference, or postprocessing controllers.  Nonetheless, the main controller
+ * is not trivial.  Its responsibility is to provide context rows for
+ * upsampling/rescaling, and doing this in an efficient fashion is a bit
+ * tricky.
  *
  * Postprocessor input data is counted in "row groups".  A row group
  * is defined to be (v_samp_factor * DCT_scaled_size / min_DCT_scaled_size)
@@ -38,20 +40,20 @@
  * row group (times any additional scale factor that the upsampler is
  * applying).
  *
- * The coefficient controller will deliver data to us one iMCU row at a time;
- * each iMCU row contains v_samp_factor * DCT_scaled_size sample rows, or
- * exactly min_DCT_scaled_size row groups.  (This amount of data corresponds
- * to one row of MCUs when the image is fully interleaved.)  Note that the
- * number of sample rows varies across components, but the number of row
- * groups does not.  Some garbage sample rows may be included in the last iMCU
- * row at the bottom of the image.
+ * The coefficient or difference controller will deliver data to us one iMCU
+ * row at a time; each iMCU row contains v_samp_factor * DCT_scaled_size sample
+ * rows, or exactly min_DCT_scaled_size row groups.  (This amount of data
+ * corresponds to one row of MCUs when the image is fully interleaved.)  Note
+ * that the number of sample rows varies across components, but the number of
+ * row groups does not.  Some garbage sample rows may be included in the last
+ * iMCU row at the bottom of the image.
  *
  * Depending on the vertical scaling algorithm used, the upsampler may need
  * access to the sample row(s) above and below its current input row group.
  * The upsampler is required to set need_context_rows TRUE at global selection
  * time if so.  When need_context_rows is FALSE, this controller can simply
- * obtain one iMCU row at a time from the coefficient controller and dole it
- * out as row groups to the postprocessor.
+ * obtain one iMCU row at a time from the coefficient or difference controller
+ * and dole it out as row groups to the postprocessor.
  *
  * When need_context_rows is TRUE, this controller guarantees that the buffer
  * passed to postprocessing contains at least one row group's worth of samples
@@ -114,16 +116,16 @@
 
 /* Forward declarations */
 METHODDEF(void) process_data_simple_main(j_decompress_ptr cinfo,
-                                         JSAMPARRAY output_buf,
+                                         _JSAMPARRAY output_buf,
                                          JDIMENSION *out_row_ctr,
                                          JDIMENSION out_rows_avail);
 METHODDEF(void) process_data_context_main(j_decompress_ptr cinfo,
-                                          JSAMPARRAY output_buf,
+                                          _JSAMPARRAY output_buf,
                                           JDIMENSION *out_row_ctr,
                                           JDIMENSION out_rows_avail);
 #ifdef QUANT_2PASS_SUPPORTED
 METHODDEF(void) process_data_crank_post(j_decompress_ptr cinfo,
-                                        JSAMPARRAY output_buf,
+                                        _JSAMPARRAY output_buf,
                                         JDIMENSION *out_row_ctr,
                                         JDIMENSION out_rows_avail);
 #endif
@@ -139,14 +141,15 @@ alloc_funny_pointers(j_decompress_ptr cinfo)
   int ci, rgroup;
   int M = cinfo->_min_DCT_scaled_size;
   jpeg_component_info *compptr;
-  JSAMPARRAY xbuf;
+  _JSAMPARRAY xbuf;
 
   /* Get top-level space for component array pointers.
    * We alloc both arrays with one call to save a few cycles.
    */
-  main_ptr->xbuffer[0] = (JSAMPIMAGE)
+  main_ptr->xbuffer[0] = (_JSAMPIMAGE)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                cinfo->num_components * 2 * sizeof(JSAMPARRAY));
+                                cinfo->num_components * 2 *
+                                sizeof(_JSAMPARRAY));
   main_ptr->xbuffer[1] = main_ptr->xbuffer[0] + cinfo->num_components;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
@@ -156,9 +159,9 @@ alloc_funny_pointers(j_decompress_ptr cinfo)
     /* Get space for pointer lists --- M+4 row groups in each list.
      * We alloc both pointer lists with one call to save a few cycles.
      */
-    xbuf = (JSAMPARRAY)
+    xbuf = (_JSAMPARRAY)
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                  2 * (rgroup * (M + 4)) * sizeof(JSAMPROW));
+                                  2 * (rgroup * (M + 4)) * sizeof(_JSAMPROW));
     xbuf += rgroup;             /* want one row group at negative offsets */
     main_ptr->xbuffer[0][ci] = xbuf;
     xbuf += rgroup * (M + 4);
@@ -180,7 +183,7 @@ make_funny_pointers(j_decompress_ptr cinfo)
   int ci, i, rgroup;
   int M = cinfo->_min_DCT_scaled_size;
   jpeg_component_info *compptr;
-  JSAMPARRAY buf, xbuf0, xbuf1;
+  _JSAMPARRAY buf, xbuf0, xbuf1;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -220,7 +223,7 @@ set_bottom_pointers(j_decompress_ptr cinfo)
   my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
   int ci, i, rgroup, iMCUheight, rows_left;
   jpeg_component_info *compptr;
-  JSAMPARRAY xbuf;
+  _JSAMPARRAY xbuf;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -259,14 +262,14 @@ start_pass_main(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
   switch (pass_mode) {
   case JBUF_PASS_THRU:
     if (cinfo->upsample->need_context_rows) {
-      main_ptr->pub.process_data = process_data_context_main;
+      main_ptr->pub._process_data = process_data_context_main;
       make_funny_pointers(cinfo); /* Create the xbuffer[] lists */
       main_ptr->whichptr = 0;   /* Read first iMCU row into xbuffer[0] */
       main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
       main_ptr->iMCU_row_ctr = 0;
     } else {
       /* Simple case with no context needed */
-      main_ptr->pub.process_data = process_data_simple_main;
+      main_ptr->pub._process_data = process_data_simple_main;
     }
     main_ptr->buffer_full = FALSE;      /* Mark buffer empty */
     main_ptr->rowgroup_ctr = 0;
@@ -274,7 +277,7 @@ start_pass_main(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
 #ifdef QUANT_2PASS_SUPPORTED
   case JBUF_CRANK_DEST:
     /* For last pass of 2-pass quantization, just crank the postprocessor */
-    main_ptr->pub.process_data = process_data_crank_post;
+    main_ptr->pub._process_data = process_data_crank_post;
     break;
 #endif
   default:
@@ -290,7 +293,7 @@ start_pass_main(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
  */
 
 METHODDEF(void)
-process_data_simple_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+process_data_simple_main(j_decompress_ptr cinfo, _JSAMPARRAY output_buf,
                          JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
   my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
@@ -298,7 +301,7 @@ process_data_simple_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
 
   /* Read input data if we haven't filled the main buffer yet */
   if (!main_ptr->buffer_full) {
-    if (!(*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
+    if (!(*cinfo->coef->_decompress_data) (cinfo, main_ptr->buffer))
       return;                   /* suspension forced, can do nothing more */
     main_ptr->buffer_full = TRUE;       /* OK, we have an iMCU row to work with */
   }
@@ -311,9 +314,9 @@ process_data_simple_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
    */
 
   /* Feed the postprocessor */
-  (*cinfo->post->post_process_data) (cinfo, main_ptr->buffer,
-                                     &main_ptr->rowgroup_ctr, rowgroups_avail,
-                                     output_buf, out_row_ctr, out_rows_avail);
+  (*cinfo->post->_post_process_data) (cinfo, main_ptr->buffer,
+                                      &main_ptr->rowgroup_ctr, rowgroups_avail,
+                                      output_buf, out_row_ctr, out_rows_avail);
 
   /* Has postprocessor consumed all the data yet? If so, mark buffer empty */
   if (main_ptr->rowgroup_ctr >= rowgroups_avail) {
@@ -329,15 +332,15 @@ process_data_simple_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
  */
 
 METHODDEF(void)
-process_data_context_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+process_data_context_main(j_decompress_ptr cinfo, _JSAMPARRAY output_buf,
                           JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
   my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
 
   /* Read input data if we haven't filled the main buffer yet */
   if (!main_ptr->buffer_full) {
-    if (!(*cinfo->coef->decompress_data) (cinfo,
-                                          main_ptr->xbuffer[main_ptr->whichptr]))
+    if (!(*cinfo->coef->_decompress_data) (cinfo,
+                                           main_ptr->xbuffer[main_ptr->whichptr]))
       return;                   /* suspension forced, can do nothing more */
     main_ptr->buffer_full = TRUE;       /* OK, we have an iMCU row to work with */
     main_ptr->iMCU_row_ctr++;   /* count rows received */
@@ -351,11 +354,11 @@ process_data_context_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
   switch (main_ptr->context_state) {
   case CTX_POSTPONED_ROW:
     /* Call postprocessor using previously set pointers for postponed row */
-    (*cinfo->post->post_process_data) (cinfo,
-                                       main_ptr->xbuffer[main_ptr->whichptr],
-                                       &main_ptr->rowgroup_ctr,
-                                       main_ptr->rowgroups_avail, output_buf,
-                                       out_row_ctr, out_rows_avail);
+    (*cinfo->post->_post_process_data) (cinfo,
+                                        main_ptr->xbuffer[main_ptr->whichptr],
+                                        &main_ptr->rowgroup_ctr,
+                                        main_ptr->rowgroups_avail, output_buf,
+                                        out_row_ctr, out_rows_avail);
     if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
       return;                   /* Need to suspend */
     main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
@@ -375,11 +378,11 @@ process_data_context_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
     FALLTHROUGH                 /*FALLTHROUGH*/
   case CTX_PROCESS_IMCU:
     /* Call postprocessor using previously set pointers */
-    (*cinfo->post->post_process_data) (cinfo,
-                                       main_ptr->xbuffer[main_ptr->whichptr],
-                                       &main_ptr->rowgroup_ctr,
-                                       main_ptr->rowgroups_avail, output_buf,
-                                       out_row_ctr, out_rows_avail);
+    (*cinfo->post->_post_process_data) (cinfo,
+                                        main_ptr->xbuffer[main_ptr->whichptr],
+                                        &main_ptr->rowgroup_ctr,
+                                        main_ptr->rowgroups_avail, output_buf,
+                                        out_row_ctr, out_rows_avail);
     if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
       return;                   /* Need to suspend */
     /* After the first iMCU, change wraparound pointers to normal state */
@@ -406,12 +409,12 @@ process_data_context_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
 #ifdef QUANT_2PASS_SUPPORTED
 
 METHODDEF(void)
-process_data_crank_post(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+process_data_crank_post(j_decompress_ptr cinfo, _JSAMPARRAY output_buf,
                         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE)NULL,
-                                     (JDIMENSION *)NULL, (JDIMENSION)0,
-                                     output_buf, out_row_ctr, out_rows_avail);
+  (*cinfo->post->_post_process_data) (cinfo, (_JSAMPIMAGE)NULL,
+                                      (JDIMENSION *)NULL, (JDIMENSION)0,
+                                      output_buf, out_row_ctr, out_rows_avail);
 }
 
 #endif /* QUANT_2PASS_SUPPORTED */
@@ -422,12 +425,15 @@ process_data_crank_post(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
  */
 
 GLOBAL(void)
-jinit_d_main_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
+_jinit_d_main_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 {
   my_main_ptr main_ptr;
   int ci, rgroup, ngroups;
   jpeg_component_info *compptr;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   main_ptr = (my_main_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_main_controller));
@@ -453,9 +459,11 @@ jinit_d_main_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
        ci++, compptr++) {
     rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
       cinfo->_min_DCT_scaled_size; /* height of a row group of component */
-    main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
+    main_ptr->buffer[ci] = (_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
                         ((j_common_ptr)cinfo, JPOOL_IMAGE,
                          compptr->width_in_blocks * compptr->_DCT_scaled_size,
                          (JDIMENSION)(rgroup * ngroups));
   }
 }
+
+#endif /* BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED) */
diff --git a/3rdparty/libjpeg-turbo/src/jdmainct.h b/3rdparty/libjpeg-turbo/src/jdmainct.h
index 37b201ca8826..914ad11f694e 100644
--- a/3rdparty/libjpeg-turbo/src/jdmainct.h
+++ b/3rdparty/libjpeg-turbo/src/jdmainct.h
@@ -3,22 +3,27 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  */
 
 #define JPEG_INTERNALS
 #include "jpeglib.h"
-#include "jpegcomp.h"
+#include "jpegapicomp.h"
+#include "jsamplecomp.h"
 
 
+#if BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED)
+
 /* Private buffer controller object */
 
 typedef struct {
   struct jpeg_d_main_controller pub; /* public fields */
 
   /* Pointer to allocated workspace (M or M+2 row groups). */
-  JSAMPARRAY buffer[MAX_COMPONENTS];
+  _JSAMPARRAY buffer[MAX_COMPONENTS];
 
   boolean buffer_full;          /* Have we gotten an iMCU row from decoder? */
   JDIMENSION rowgroup_ctr;      /* counts row groups output to postprocessor */
@@ -26,7 +31,7 @@ typedef struct {
   /* Remaining fields are only used in the context case. */
 
   /* These are the master pointers to the funny-order pointer lists. */
-  JSAMPIMAGE xbuffer[2];        /* pointers to weird pointer lists */
+  _JSAMPIMAGE xbuffer[2];       /* pointers to weird pointer lists */
 
   int whichptr;                 /* indicates which pointer set is now in use */
   int context_state;            /* process_data state machine status */
@@ -53,7 +58,7 @@ set_wraparound_pointers(j_decompress_ptr cinfo)
   int ci, i, rgroup;
   int M = cinfo->_min_DCT_scaled_size;
   jpeg_component_info *compptr;
-  JSAMPARRAY xbuf0, xbuf1;
+  _JSAMPARRAY xbuf0, xbuf1;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -69,3 +74,5 @@ set_wraparound_pointers(j_decompress_ptr cinfo)
     }
   }
 }
+
+#endif /* BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED) */
diff --git a/3rdparty/libjpeg-turbo/src/jdmarker.c b/3rdparty/libjpeg-turbo/src/jdmarker.c
index f7eba615fd5c..bd19a735dd0a 100644
--- a/3rdparty/libjpeg-turbo/src/jdmarker.c
+++ b/3rdparty/libjpeg-turbo/src/jdmarker.c
@@ -3,6 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2012, 2015, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -237,7 +239,8 @@ get_soi(j_decompress_ptr cinfo)
 
 
 LOCAL(boolean)
-get_sof(j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
+get_sof(j_decompress_ptr cinfo, boolean is_prog, boolean is_lossless,
+        boolean is_arith)
 /* Process a SOFn marker */
 {
   JLONG length;
@@ -246,6 +249,7 @@ get_sof(j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
   INPUT_VARS(cinfo);
 
   cinfo->progressive_mode = is_prog;
+  cinfo->master->lossless = is_lossless;
   cinfo->arith_code = is_arith;
 
   INPUT_2BYTES(cinfo, length, return FALSE);
@@ -920,7 +924,6 @@ next_marker(j_decompress_ptr cinfo)
   }
 
   if (cinfo->marker->discarded_bytes != 0) {
-    WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
     cinfo->marker->discarded_bytes = 0;
   }
 
@@ -990,32 +993,40 @@ read_markers(j_decompress_ptr cinfo)
 
     case M_SOF0:                /* Baseline */
     case M_SOF1:                /* Extended sequential, Huffman */
-      if (!get_sof(cinfo, FALSE, FALSE))
+      if (!get_sof(cinfo, FALSE, FALSE, FALSE))
         return JPEG_SUSPENDED;
       break;
 
     case M_SOF2:                /* Progressive, Huffman */
-      if (!get_sof(cinfo, TRUE, FALSE))
+      if (!get_sof(cinfo, TRUE, FALSE, FALSE))
+        return JPEG_SUSPENDED;
+      break;
+
+    case M_SOF3:                /* Lossless, Huffman */
+      if (!get_sof(cinfo, FALSE, TRUE, FALSE))
         return JPEG_SUSPENDED;
       break;
 
     case M_SOF9:                /* Extended sequential, arithmetic */
-      if (!get_sof(cinfo, FALSE, TRUE))
+      if (!get_sof(cinfo, FALSE, FALSE, TRUE))
         return JPEG_SUSPENDED;
       break;
 
     case M_SOF10:               /* Progressive, arithmetic */
-      if (!get_sof(cinfo, TRUE, TRUE))
+      if (!get_sof(cinfo, TRUE, FALSE, TRUE))
+        return JPEG_SUSPENDED;
+      break;
+
+    case M_SOF11:               /* Lossless, arithmetic */
+      if (!get_sof(cinfo, FALSE, TRUE, TRUE))
         return JPEG_SUSPENDED;
       break;
 
     /* Currently unsupported SOFn types */
-    case M_SOF3:                /* Lossless, Huffman */
     case M_SOF5:                /* Differential sequential, Huffman */
     case M_SOF6:                /* Differential progressive, Huffman */
     case M_SOF7:                /* Differential lossless, Huffman */
     case M_JPG:                 /* Reserved for JPEG extensions */
-    case M_SOF11:               /* Lossless, arithmetic */
     case M_SOF13:               /* Differential sequential, arithmetic */
     case M_SOF14:               /* Differential progressive, arithmetic */
     case M_SOF15:               /* Differential lossless, arithmetic */
diff --git a/3rdparty/libjpeg-turbo/src/jdmaster.c b/3rdparty/libjpeg-turbo/src/jdmaster.c
index a3690bf560ba..80a4842ac114 100644
--- a/3rdparty/libjpeg-turbo/src/jdmaster.c
+++ b/3rdparty/libjpeg-turbo/src/jdmaster.c
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, 2019, 2022, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2019, 2022-2023, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -20,7 +22,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jpegcomp.h"
+#include "jpegapicomp.h"
 #include "jdmaster.h"
 
 
@@ -33,6 +35,9 @@ LOCAL(boolean)
 use_merged_upsample(j_decompress_ptr cinfo)
 {
 #ifdef UPSAMPLE_MERGING_SUPPORTED
+  /* Colorspace conversion is not supported with lossless JPEG images */
+  if (cinfo->master->lossless)
+    return FALSE;
   /* Merging is the equivalent of plain box-filter upsampling */
   if (cinfo->do_fancy_upsampling || cinfo->CCIR601_sampling)
     return FALSE;
@@ -97,154 +102,154 @@ jpeg_core_output_dimensions(j_decompress_ptr cinfo)
   int ci;
   jpeg_component_info *compptr;
 
-  /* Compute actual output image dimensions and DCT scaling choices. */
-  if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom) {
-    /* Provide 1/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 1;
-    cinfo->_min_DCT_v_scaled_size = 1;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 2) {
-    /* Provide 2/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 2L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 2L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 2;
-    cinfo->_min_DCT_v_scaled_size = 2;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 3) {
-    /* Provide 3/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 3L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 3L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 3;
-    cinfo->_min_DCT_v_scaled_size = 3;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 4) {
-    /* Provide 4/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 4L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 4L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 4;
-    cinfo->_min_DCT_v_scaled_size = 4;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 5) {
-    /* Provide 5/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 5L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 5L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 5;
-    cinfo->_min_DCT_v_scaled_size = 5;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 6) {
-    /* Provide 6/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 6L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 6L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 6;
-    cinfo->_min_DCT_v_scaled_size = 6;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 7) {
-    /* Provide 7/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 7L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 7L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 7;
-    cinfo->_min_DCT_v_scaled_size = 7;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 8) {
-    /* Provide 8/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 8L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 8L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 8;
-    cinfo->_min_DCT_v_scaled_size = 8;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 9) {
-    /* Provide 9/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 9L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 9L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 9;
-    cinfo->_min_DCT_v_scaled_size = 9;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 10) {
-    /* Provide 10/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 10L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 10L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 10;
-    cinfo->_min_DCT_v_scaled_size = 10;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 11) {
-    /* Provide 11/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 11L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 11L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 11;
-    cinfo->_min_DCT_v_scaled_size = 11;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 12) {
-    /* Provide 12/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 12L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 12L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 12;
-    cinfo->_min_DCT_v_scaled_size = 12;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 13) {
-    /* Provide 13/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 13L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 13L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 13;
-    cinfo->_min_DCT_v_scaled_size = 13;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 14) {
-    /* Provide 14/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 14L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 14L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 14;
-    cinfo->_min_DCT_v_scaled_size = 14;
-  } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 15) {
-    /* Provide 15/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 15L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 15L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 15;
-    cinfo->_min_DCT_v_scaled_size = 15;
-  } else {
-    /* Provide 16/block_size scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width * 16L, (long)DCTSIZE);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height * 16L, (long)DCTSIZE);
-    cinfo->_min_DCT_h_scaled_size = 16;
-    cinfo->_min_DCT_v_scaled_size = 16;
-  }
+  if (!cinfo->master->lossless) {
+    /* Compute actual output image dimensions and DCT scaling choices. */
+    if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom) {
+      /* Provide 1/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 1;
+      cinfo->_min_DCT_v_scaled_size = 1;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 2) {
+      /* Provide 2/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 2L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 2L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 2;
+      cinfo->_min_DCT_v_scaled_size = 2;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 3) {
+      /* Provide 3/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 3L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 3L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 3;
+      cinfo->_min_DCT_v_scaled_size = 3;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 4) {
+      /* Provide 4/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 4L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 4L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 4;
+      cinfo->_min_DCT_v_scaled_size = 4;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 5) {
+      /* Provide 5/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 5L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 5L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 5;
+      cinfo->_min_DCT_v_scaled_size = 5;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 6) {
+      /* Provide 6/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 6L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 6L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 6;
+      cinfo->_min_DCT_v_scaled_size = 6;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 7) {
+      /* Provide 7/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 7L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 7L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 7;
+      cinfo->_min_DCT_v_scaled_size = 7;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 8) {
+      /* Provide 8/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 8L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 8L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 8;
+      cinfo->_min_DCT_v_scaled_size = 8;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 9) {
+      /* Provide 9/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 9L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 9L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 9;
+      cinfo->_min_DCT_v_scaled_size = 9;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 10) {
+      /* Provide 10/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 10L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 10L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 10;
+      cinfo->_min_DCT_v_scaled_size = 10;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 11) {
+      /* Provide 11/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 11L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 11L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 11;
+      cinfo->_min_DCT_v_scaled_size = 11;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 12) {
+      /* Provide 12/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 12L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 12L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 12;
+      cinfo->_min_DCT_v_scaled_size = 12;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 13) {
+      /* Provide 13/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 13L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 13L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 13;
+      cinfo->_min_DCT_v_scaled_size = 13;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 14) {
+      /* Provide 14/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 14L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 14L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 14;
+      cinfo->_min_DCT_v_scaled_size = 14;
+    } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 15) {
+      /* Provide 15/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 15L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 15L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 15;
+      cinfo->_min_DCT_v_scaled_size = 15;
+    } else {
+      /* Provide 16/block_size scaling */
+      cinfo->output_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width * 16L, (long)DCTSIZE);
+      cinfo->output_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height * 16L, (long)DCTSIZE);
+      cinfo->_min_DCT_h_scaled_size = 16;
+      cinfo->_min_DCT_v_scaled_size = 16;
+    }
 
-  /* Recompute dimensions of components */
-  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-       ci++, compptr++) {
-    compptr->_DCT_h_scaled_size = cinfo->_min_DCT_h_scaled_size;
-    compptr->_DCT_v_scaled_size = cinfo->_min_DCT_v_scaled_size;
+    /* Recompute dimensions of components */
+    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+         ci++, compptr++) {
+      compptr->_DCT_h_scaled_size = cinfo->_min_DCT_h_scaled_size;
+      compptr->_DCT_v_scaled_size = cinfo->_min_DCT_v_scaled_size;
+    }
+  } else
+#endif /* !IDCT_SCALING_SUPPORTED */
+  {
+    /* Hardwire it to "no scaling" */
+    cinfo->output_width = cinfo->image_width;
+    cinfo->output_height = cinfo->image_height;
+    /* jdinput.c has already initialized DCT_scaled_size,
+     * and has computed unscaled downsampled_width and downsampled_height.
+     */
   }
-
-#else /* !IDCT_SCALING_SUPPORTED */
-
-  /* Hardwire it to "no scaling" */
-  cinfo->output_width = cinfo->image_width;
-  cinfo->output_height = cinfo->image_height;
-  /* jdinput.c has already initialized DCT_scaled_size,
-   * and has computed unscaled downsampled_width and downsampled_height.
-   */
-
-#endif /* IDCT_SCALING_SUPPORTED */
 }
 
 
@@ -273,54 +278,56 @@ jpeg_calc_output_dimensions(j_decompress_ptr cinfo)
 
 #ifdef IDCT_SCALING_SUPPORTED
 
-  /* In selecting the actual DCT scaling for each component, we try to
-   * scale up the chroma components via IDCT scaling rather than upsampling.
-   * This saves time if the upsampler gets to use 1:1 scaling.
-   * Note this code adapts subsampling ratios which are powers of 2.
-   */
-  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-       ci++, compptr++) {
-    int ssize = cinfo->_min_DCT_scaled_size;
-    while (ssize < DCTSIZE &&
-           ((cinfo->max_h_samp_factor * cinfo->_min_DCT_scaled_size) %
-            (compptr->h_samp_factor * ssize * 2) == 0) &&
-           ((cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size) %
-            (compptr->v_samp_factor * ssize * 2) == 0)) {
-      ssize = ssize * 2;
-    }
+  if (!cinfo->master->lossless) {
+    /* In selecting the actual DCT scaling for each component, we try to
+     * scale up the chroma components via IDCT scaling rather than upsampling.
+     * This saves time if the upsampler gets to use 1:1 scaling.
+     * Note this code adapts subsampling ratios which are powers of 2.
+     */
+    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+         ci++, compptr++) {
+      int ssize = cinfo->_min_DCT_scaled_size;
+      while (ssize < DCTSIZE &&
+             ((cinfo->max_h_samp_factor * cinfo->_min_DCT_scaled_size) %
+              (compptr->h_samp_factor * ssize * 2) == 0) &&
+             ((cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size) %
+              (compptr->v_samp_factor * ssize * 2) == 0)) {
+        ssize = ssize * 2;
+      }
 #if JPEG_LIB_VERSION >= 70
-    compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = ssize;
+      compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size = ssize;
 #else
-    compptr->DCT_scaled_size = ssize;
+      compptr->DCT_scaled_size = ssize;
 #endif
-  }
-
-  /* Recompute downsampled dimensions of components;
-   * application needs to know these if using raw downsampled data.
-   */
-  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-       ci++, compptr++) {
-    /* Size in samples, after IDCT scaling */
-    compptr->downsampled_width = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_width *
-                    (long)(compptr->h_samp_factor * compptr->_DCT_scaled_size),
-                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
-    compptr->downsampled_height = (JDIMENSION)
-      jdiv_round_up((long)cinfo->image_height *
-                    (long)(compptr->v_samp_factor * compptr->_DCT_scaled_size),
-                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
-  }
-
-#else /* !IDCT_SCALING_SUPPORTED */
-
-  /* Hardwire it to "no scaling" */
-  cinfo->output_width = cinfo->image_width;
-  cinfo->output_height = cinfo->image_height;
-  /* jdinput.c has already initialized DCT_scaled_size to DCTSIZE,
-   * and has computed unscaled downsampled_width and downsampled_height.
-   */
+    }
 
+    /* Recompute downsampled dimensions of components;
+     * application needs to know these if using raw downsampled data.
+     */
+    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+         ci++, compptr++) {
+      /* Size in samples, after IDCT scaling */
+      compptr->downsampled_width = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_width *
+                      (long)(compptr->h_samp_factor *
+                             compptr->_DCT_scaled_size),
+                      (long)(cinfo->max_h_samp_factor * DCTSIZE));
+      compptr->downsampled_height = (JDIMENSION)
+        jdiv_round_up((long)cinfo->image_height *
+                      (long)(compptr->v_samp_factor *
+                             compptr->_DCT_scaled_size),
+                      (long)(cinfo->max_v_samp_factor * DCTSIZE));
+    }
+  } else
 #endif /* IDCT_SCALING_SUPPORTED */
+  {
+    /* Hardwire it to "no scaling" */
+    cinfo->output_width = cinfo->image_width;
+    cinfo->output_height = cinfo->image_height;
+    /* jdinput.c has already initialized DCT_scaled_size to DCTSIZE,
+     * and has computed unscaled downsampled_width and downsampled_height.
+     */
+  }
 
   /* Report number of components in selected colorspace. */
   /* Probably this should be in the color conversion module... */
@@ -409,27 +416,83 @@ prepare_range_limit_table(j_decompress_ptr cinfo)
 /* Allocate and fill in the sample_range_limit table */
 {
   JSAMPLE *table;
+  J12SAMPLE *table12;
+#ifdef D_LOSSLESS_SUPPORTED
+  J16SAMPLE *table16;
+#endif
   int i;
 
-  table = (JSAMPLE *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                (5 * (MAXJSAMPLE + 1) + CENTERJSAMPLE) * sizeof(JSAMPLE));
-  table += (MAXJSAMPLE + 1);    /* allow negative subscripts of simple table */
-  cinfo->sample_range_limit = table;
-  /* First segment of "simple" table: limit[x] = 0 for x < 0 */
-  memset(table - (MAXJSAMPLE + 1), 0, (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
-  /* Main part of "simple" table: limit[x] = x */
-  for (i = 0; i <= MAXJSAMPLE; i++)
-    table[i] = (JSAMPLE)i;
-  table += CENTERJSAMPLE;       /* Point to where post-IDCT table starts */
-  /* End of simple table, rest of first half of post-IDCT table */
-  for (i = CENTERJSAMPLE; i < 2 * (MAXJSAMPLE + 1); i++)
-    table[i] = MAXJSAMPLE;
-  /* Second half of post-IDCT table */
-  memset(table + (2 * (MAXJSAMPLE + 1)), 0,
-         (2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
-  memcpy(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
-         cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
+  if (cinfo->data_precision == 16) {
+#ifdef D_LOSSLESS_SUPPORTED
+    table16 = (J16SAMPLE *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                  (5 * (MAXJ16SAMPLE + 1) + CENTERJ16SAMPLE) *
+                  sizeof(J16SAMPLE));
+    table16 += (MAXJ16SAMPLE + 1);  /* allow negative subscripts of simple
+                                       table */
+    cinfo->sample_range_limit = (JSAMPLE *)table16;
+    /* First segment of "simple" table: limit[x] = 0 for x < 0 */
+    memset(table16 - (MAXJ16SAMPLE + 1), 0,
+           (MAXJ16SAMPLE + 1) * sizeof(J16SAMPLE));
+    /* Main part of "simple" table: limit[x] = x */
+    for (i = 0; i <= MAXJ16SAMPLE; i++)
+      table16[i] = (J16SAMPLE)i;
+    table16 += CENTERJ16SAMPLE; /* Point to where post-IDCT table starts */
+    /* End of simple table, rest of first half of post-IDCT table */
+    for (i = CENTERJ16SAMPLE; i < 2 * (MAXJ16SAMPLE + 1); i++)
+      table16[i] = MAXJ16SAMPLE;
+    /* Second half of post-IDCT table */
+    memset(table16 + (2 * (MAXJ16SAMPLE + 1)), 0,
+           (2 * (MAXJ16SAMPLE + 1) - CENTERJ16SAMPLE) * sizeof(J16SAMPLE));
+    memcpy(table16 + (4 * (MAXJ16SAMPLE + 1) - CENTERJ16SAMPLE),
+           cinfo->sample_range_limit, CENTERJ16SAMPLE * sizeof(J16SAMPLE));
+#else
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+#endif
+  } else if (cinfo->data_precision == 12) {
+    table12 = (J12SAMPLE *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                  (5 * (MAXJ12SAMPLE + 1) + CENTERJ12SAMPLE) *
+                  sizeof(J12SAMPLE));
+    table12 += (MAXJ12SAMPLE + 1);  /* allow negative subscripts of simple
+                                       table */
+    cinfo->sample_range_limit = (JSAMPLE *)table12;
+    /* First segment of "simple" table: limit[x] = 0 for x < 0 */
+    memset(table12 - (MAXJ12SAMPLE + 1), 0,
+           (MAXJ12SAMPLE + 1) * sizeof(J12SAMPLE));
+    /* Main part of "simple" table: limit[x] = x */
+    for (i = 0; i <= MAXJ12SAMPLE; i++)
+      table12[i] = (J12SAMPLE)i;
+    table12 += CENTERJ12SAMPLE; /* Point to where post-IDCT table starts */
+    /* End of simple table, rest of first half of post-IDCT table */
+    for (i = CENTERJ12SAMPLE; i < 2 * (MAXJ12SAMPLE + 1); i++)
+      table12[i] = MAXJ12SAMPLE;
+    /* Second half of post-IDCT table */
+    memset(table12 + (2 * (MAXJ12SAMPLE + 1)), 0,
+           (2 * (MAXJ12SAMPLE + 1) - CENTERJ12SAMPLE) * sizeof(J12SAMPLE));
+    memcpy(table12 + (4 * (MAXJ12SAMPLE + 1) - CENTERJ12SAMPLE),
+           cinfo->sample_range_limit, CENTERJ12SAMPLE * sizeof(J12SAMPLE));
+  } else {
+    table = (JSAMPLE *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                  (5 * (MAXJSAMPLE + 1) + CENTERJSAMPLE) * sizeof(JSAMPLE));
+    table += (MAXJSAMPLE + 1);  /* allow negative subscripts of simple table */
+    cinfo->sample_range_limit = table;
+    /* First segment of "simple" table: limit[x] = 0 for x < 0 */
+    memset(table - (MAXJSAMPLE + 1), 0, (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
+    /* Main part of "simple" table: limit[x] = x */
+    for (i = 0; i <= MAXJSAMPLE; i++)
+      table[i] = (JSAMPLE)i;
+    table += CENTERJSAMPLE;     /* Point to where post-IDCT table starts */
+    /* End of simple table, rest of first half of post-IDCT table */
+    for (i = CENTERJSAMPLE; i < 2 * (MAXJSAMPLE + 1); i++)
+      table[i] = MAXJSAMPLE;
+    /* Second half of post-IDCT table */
+    memset(table + (2 * (MAXJSAMPLE + 1)), 0,
+           (2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
+    memcpy(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
+           cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
+  }
 }
 
 
@@ -452,6 +515,17 @@ master_selection(j_decompress_ptr cinfo)
   long samplesperrow;
   JDIMENSION jd_samplesperrow;
 
+  /* Disable IDCT scaling and raw (downsampled) data output in lossless mode.
+   * IDCT scaling is not useful in lossless mode, and it must be disabled in
+   * order to properly calculate the output dimensions.  Raw data output isn't
+   * particularly useful without subsampling and has not been tested in
+   * lossless mode.
+   */
+  if (cinfo->master->lossless) {
+    cinfo->raw_data_out = FALSE;
+    cinfo->scale_num = cinfo->scale_denom = 1;
+  }
+
   /* Initialize dimensions and other stuff */
   jpeg_calc_output_dimensions(cinfo);
   prepare_range_limit_table(cinfo);
@@ -480,7 +554,8 @@ master_selection(j_decompress_ptr cinfo)
     if (cinfo->raw_data_out)
       ERREXIT(cinfo, JERR_NOTIMPL);
     /* 2-pass quantizer only works in 3-component color space. */
-    if (cinfo->out_color_components != 3) {
+    if (cinfo->out_color_components != 3 ||
+        cinfo->out_color_space == JCS_RGB565) {
       cinfo->enable_1pass_quant = TRUE;
       cinfo->enable_external_quant = FALSE;
       cinfo->enable_2pass_quant = FALSE;
@@ -495,7 +570,12 @@ master_selection(j_decompress_ptr cinfo)
 
     if (cinfo->enable_1pass_quant) {
 #ifdef QUANT_1PASS_SUPPORTED
-      jinit_1pass_quantizer(cinfo);
+      if (cinfo->data_precision == 16)
+        ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+      else if (cinfo->data_precision == 12)
+        j12init_1pass_quantizer(cinfo);
+      else
+        jinit_1pass_quantizer(cinfo);
       master->quantizer_1pass = cinfo->cquantize;
 #else
       ERREXIT(cinfo, JERR_NOT_COMPILED);
@@ -505,7 +585,12 @@ master_selection(j_decompress_ptr cinfo)
     /* We use the 2-pass code to map to external colormaps. */
     if (cinfo->enable_2pass_quant || cinfo->enable_external_quant) {
 #ifdef QUANT_2PASS_SUPPORTED
-      jinit_2pass_quantizer(cinfo);
+      if (cinfo->data_precision == 16)
+        ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+      else if (cinfo->data_precision == 12)
+        j12init_2pass_quantizer(cinfo);
+      else
+        jinit_2pass_quantizer(cinfo);
       master->quantizer_2pass = cinfo->cquantize;
 #else
       ERREXIT(cinfo, JERR_NOT_COMPILED);
@@ -520,42 +605,122 @@ master_selection(j_decompress_ptr cinfo)
   if (!cinfo->raw_data_out) {
     if (master->using_merged_upsample) {
 #ifdef UPSAMPLE_MERGING_SUPPORTED
-      jinit_merged_upsampler(cinfo); /* does color conversion too */
+      if (cinfo->data_precision == 16)
+        ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+      else if (cinfo->data_precision == 12)
+        j12init_merged_upsampler(cinfo); /* does color conversion too */
+      else
+        jinit_merged_upsampler(cinfo); /* does color conversion too */
 #else
       ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
     } else {
-      jinit_color_deconverter(cinfo);
-      jinit_upsampler(cinfo);
+      if (cinfo->data_precision == 16) {
+#ifdef D_LOSSLESS_SUPPORTED
+        j16init_color_deconverter(cinfo);
+        j16init_upsampler(cinfo);
+#else
+        ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+#endif
+      } else if (cinfo->data_precision == 12) {
+        j12init_color_deconverter(cinfo);
+        j12init_upsampler(cinfo);
+      } else {
+        jinit_color_deconverter(cinfo);
+        jinit_upsampler(cinfo);
+      }
     }
-    jinit_d_post_controller(cinfo, cinfo->enable_2pass_quant);
+    if (cinfo->data_precision == 16)
+#ifdef D_LOSSLESS_SUPPORTED
+      j16init_d_post_controller(cinfo, cinfo->enable_2pass_quant);
+#else
+      ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+#endif
+    else if (cinfo->data_precision == 12)
+      j12init_d_post_controller(cinfo, cinfo->enable_2pass_quant);
+    else
+      jinit_d_post_controller(cinfo, cinfo->enable_2pass_quant);
   }
-  /* Inverse DCT */
-  jinit_inverse_dct(cinfo);
-  /* Entropy decoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-#ifdef D_ARITH_CODING_SUPPORTED
-    jinit_arith_decoder(cinfo);
+
+  if (cinfo->master->lossless) {
+#ifdef D_LOSSLESS_SUPPORTED
+    /* Prediction, sample undifferencing, point transform, and sample size
+     * scaling
+     */
+    if (cinfo->data_precision == 16)
+      j16init_lossless_decompressor(cinfo);
+    else if (cinfo->data_precision == 12)
+      j12init_lossless_decompressor(cinfo);
+    else
+      jinit_lossless_decompressor(cinfo);
+    /* Entropy decoding: either Huffman or arithmetic coding. */
+    if (cinfo->arith_code) {
+      ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+    } else {
+      jinit_lhuff_decoder(cinfo);
+    }
+
+    /* Initialize principal buffer controllers. */
+    use_c_buffer = cinfo->inputctl->has_multiple_scans ||
+                   cinfo->buffered_image;
+    if (cinfo->data_precision == 16)
+      j16init_d_diff_controller(cinfo, use_c_buffer);
+    else if (cinfo->data_precision == 12)
+      j12init_d_diff_controller(cinfo, use_c_buffer);
+    else
+      jinit_d_diff_controller(cinfo, use_c_buffer);
 #else
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
   } else {
-    if (cinfo->progressive_mode) {
+    if (cinfo->data_precision == 16)
+      ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+    /* Inverse DCT */
+    if (cinfo->data_precision == 12)
+      j12init_inverse_dct(cinfo);
+    else
+      jinit_inverse_dct(cinfo);
+    /* Entropy decoding: either Huffman or arithmetic coding. */
+    if (cinfo->arith_code) {
+#ifdef D_ARITH_CODING_SUPPORTED
+      jinit_arith_decoder(cinfo);
+#else
+      ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
+#endif
+    } else {
+      if (cinfo->progressive_mode) {
 #ifdef D_PROGRESSIVE_SUPPORTED
-      jinit_phuff_decoder(cinfo);
+        jinit_phuff_decoder(cinfo);
 #else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
+        ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
-    } else
-      jinit_huff_decoder(cinfo);
-  }
+      } else
+        jinit_huff_decoder(cinfo);
+    }
 
-  /* Initialize principal buffer controllers. */
-  use_c_buffer = cinfo->inputctl->has_multiple_scans || cinfo->buffered_image;
-  jinit_d_coef_controller(cinfo, use_c_buffer);
+    /* Initialize principal buffer controllers. */
+    use_c_buffer = cinfo->inputctl->has_multiple_scans ||
+                   cinfo->buffered_image;
+    if (cinfo->data_precision == 12)
+      j12init_d_coef_controller(cinfo, use_c_buffer);
+    else
+      jinit_d_coef_controller(cinfo, use_c_buffer);
+  }
 
-  if (!cinfo->raw_data_out)
-    jinit_d_main_controller(cinfo, FALSE /* never need full buffer here */);
+  if (!cinfo->raw_data_out) {
+    if (cinfo->data_precision == 16)
+#ifdef D_LOSSLESS_SUPPORTED
+      j16init_d_main_controller(cinfo,
+                                FALSE /* never need full buffer here */);
+#else
+      ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+#endif
+    else if (cinfo->data_precision == 12)
+      j12init_d_main_controller(cinfo,
+                                FALSE /* never need full buffer here */);
+    else
+      jinit_d_main_controller(cinfo, FALSE /* never need full buffer here */);
+  }
 
   /* We can now tell the memory manager to allocate virtual arrays. */
   (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
diff --git a/3rdparty/libjpeg-turbo/src/jdmerge.c b/3rdparty/libjpeg-turbo/src/jdmerge.c
index 3a456d658128..49f2006fc02a 100644
--- a/3rdparty/libjpeg-turbo/src/jdmerge.c
+++ b/3rdparty/libjpeg-turbo/src/jdmerge.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009, 2011, 2014-2015, 2020, D. R. Commander.
+ * Copyright (C) 2009, 2011, 2014-2015, 2020, 2022, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -42,7 +42,6 @@
 #include "jpeglib.h"
 #include "jdmerge.h"
 #include "jsimd.h"
-#include "jconfigint.h"
 
 #ifdef UPSAMPLE_MERGING_SUPPORTED
 
@@ -168,20 +167,20 @@ build_ycc_rgb_table(j_decompress_ptr cinfo)
 
   upsample->Cr_r_tab = (int *)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE + 1) * sizeof(int));
+                                (_MAXJSAMPLE + 1) * sizeof(int));
   upsample->Cb_b_tab = (int *)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE + 1) * sizeof(int));
+                                (_MAXJSAMPLE + 1) * sizeof(int));
   upsample->Cr_g_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE + 1) * sizeof(JLONG));
+                                (_MAXJSAMPLE + 1) * sizeof(JLONG));
   upsample->Cb_g_tab = (JLONG *)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE + 1) * sizeof(JLONG));
+                                (_MAXJSAMPLE + 1) * sizeof(JLONG));
 
-  for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
-    /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
-    /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */
+  for (i = 0, x = -_CENTERJSAMPLE; i <= _MAXJSAMPLE; i++, x++) {
+    /* i is the actual input pixel value, in the range 0.._MAXJSAMPLE */
+    /* The Cb or Cr value we are thinking of is x = i - _CENTERJSAMPLE */
     /* Cr=>R value is nearest int to 1.40200 * x */
     upsample->Cr_r_tab[i] = (int)
                     RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
@@ -220,14 +219,14 @@ start_pass_merged_upsample(j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-merged_2v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+merged_2v_upsample(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
                    JDIMENSION *in_row_group_ctr,
-                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION in_row_groups_avail, _JSAMPARRAY output_buf,
                    JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 /* 2:1 vertical sampling case: may need a spare row. */
 {
   my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
-  JSAMPROW work_ptrs[2];
+  _JSAMPROW work_ptrs[2];
   JDIMENSION num_rows;          /* number of rows returned to caller */
 
   if (upsample->spare_full) {
@@ -235,8 +234,8 @@ merged_2v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     JDIMENSION size = upsample->out_row_width;
     if (cinfo->out_color_space == JCS_RGB565)
       size = cinfo->output_width * 2;
-    jcopy_sample_rows(&upsample->spare_row, 0, output_buf + *out_row_ctr, 0, 1,
-                      size);
+    _jcopy_sample_rows(&upsample->spare_row, 0, output_buf + *out_row_ctr, 0,
+                       1, size);
     num_rows = 1;
     upsample->spare_full = FALSE;
   } else {
@@ -271,9 +270,9 @@ merged_2v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 
 METHODDEF(void)
-merged_1v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+merged_1v_upsample(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
                    JDIMENSION *in_row_group_ctr,
-                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION in_row_groups_avail, _JSAMPARRAY output_buf,
                    JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 /* 1:1 vertical sampling case: much easier, never need a spare row. */
 {
@@ -303,8 +302,8 @@ merged_1v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
  */
 
 METHODDEF(void)
-h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                     JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+h2v1_merged_upsample(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                     JDIMENSION in_row_group_ctr, _JSAMPARRAY output_buf)
 {
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
@@ -348,8 +347,8 @@ h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
  */
 
 METHODDEF(void)
-h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                     JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+h2v2_merged_upsample(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                     JDIMENSION in_row_group_ctr, _JSAMPARRAY output_buf)
 {
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
@@ -475,8 +474,8 @@ static INLINE boolean is_big_endian(void)
 
 
 METHODDEF(void)
-h2v1_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+h2v1_merged_upsample_565(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                         JDIMENSION in_row_group_ctr, _JSAMPARRAY output_buf)
 {
   if (is_big_endian())
     h2v1_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
@@ -488,8 +487,8 @@ h2v1_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 
 METHODDEF(void)
-h2v1_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                          JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+h2v1_merged_upsample_565D(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                          JDIMENSION in_row_group_ctr, _JSAMPARRAY output_buf)
 {
   if (is_big_endian())
     h2v1_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
@@ -501,8 +500,8 @@ h2v1_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 
 METHODDEF(void)
-h2v2_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+h2v2_merged_upsample_565(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                         JDIMENSION in_row_group_ctr, _JSAMPARRAY output_buf)
 {
   if (is_big_endian())
     h2v2_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
@@ -514,8 +513,8 @@ h2v2_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 
 METHODDEF(void)
-h2v2_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                          JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+h2v2_merged_upsample_565D(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                          JDIMENSION in_row_group_ctr, _JSAMPARRAY output_buf)
 {
   if (is_big_endian())
     h2v2_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
@@ -535,10 +534,13 @@ h2v2_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
  */
 
 GLOBAL(void)
-jinit_merged_upsampler(j_decompress_ptr cinfo)
+_jinit_merged_upsampler(j_decompress_ptr cinfo)
 {
   my_merged_upsample_ptr upsample;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   upsample = (my_merged_upsample_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_merged_upsampler));
@@ -549,10 +551,12 @@ jinit_merged_upsampler(j_decompress_ptr cinfo)
   upsample->out_row_width = cinfo->output_width * cinfo->out_color_components;
 
   if (cinfo->max_v_samp_factor == 2) {
-    upsample->pub.upsample = merged_2v_upsample;
+    upsample->pub._upsample = merged_2v_upsample;
+#ifdef WITH_SIMD
     if (jsimd_can_h2v2_merged_upsample())
       upsample->upmethod = jsimd_h2v2_merged_upsample;
     else
+#endif
       upsample->upmethod = h2v2_merged_upsample;
     if (cinfo->out_color_space == JCS_RGB565) {
       if (cinfo->dither_mode != JDITHER_NONE) {
@@ -562,14 +566,16 @@ jinit_merged_upsampler(j_decompress_ptr cinfo)
       }
     }
     /* Allocate a spare row buffer */
-    upsample->spare_row = (JSAMPROW)
+    upsample->spare_row = (_JSAMPROW)
       (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                (size_t)(upsample->out_row_width * sizeof(JSAMPLE)));
+                (size_t)(upsample->out_row_width * sizeof(_JSAMPLE)));
   } else {
-    upsample->pub.upsample = merged_1v_upsample;
+    upsample->pub._upsample = merged_1v_upsample;
+#ifdef WITH_SIMD
     if (jsimd_can_h2v1_merged_upsample())
       upsample->upmethod = jsimd_h2v1_merged_upsample;
     else
+#endif
       upsample->upmethod = h2v1_merged_upsample;
     if (cinfo->out_color_space == JCS_RGB565) {
       if (cinfo->dither_mode != JDITHER_NONE) {
diff --git a/3rdparty/libjpeg-turbo/src/jdmerge.h b/3rdparty/libjpeg-turbo/src/jdmerge.h
index b583396b1065..73cbd605495a 100644
--- a/3rdparty/libjpeg-turbo/src/jdmerge.h
+++ b/3rdparty/libjpeg-turbo/src/jdmerge.h
@@ -4,13 +4,14 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2020, D. R. Commander.
+ * Copyright (C) 2020, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  */
 
 #define JPEG_INTERNALS
 #include "jpeglib.h"
+#include "jsamplecomp.h"
 
 #ifdef UPSAMPLE_MERGING_SUPPORTED
 
@@ -21,8 +22,8 @@ typedef struct {
   struct jpeg_upsampler pub;    /* public fields */
 
   /* Pointer to routine to do actual upsampling/conversion of one row group */
-  void (*upmethod) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                    JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  void (*upmethod) (j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
+                    JDIMENSION in_row_group_ctr, _JSAMPARRAY output_buf);
 
   /* Private state for YCC->RGB conversion */
   int *Cr_r_tab;                /* => table for Cr to R conversion */
@@ -35,7 +36,7 @@ typedef struct {
    * application provides just a one-row buffer; we also use the spare
    * to discard the dummy last row if the image height is odd.
    */
-  JSAMPROW spare_row;
+  _JSAMPROW spare_row;
   boolean spare_full;           /* T if spare buffer is occupied */
 
   JDIMENSION out_row_width;     /* samples per output row */
diff --git a/3rdparty/libjpeg-turbo/src/jdmrg565.c b/3rdparty/libjpeg-turbo/src/jdmrg565.c
index 980a4e216e4d..0c719b912ce6 100644
--- a/3rdparty/libjpeg-turbo/src/jdmrg565.c
+++ b/3rdparty/libjpeg-turbo/src/jdmrg565.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander.
+ * Copyright (C) 2014-2015, 2018, 2020, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -15,18 +15,19 @@
 
 INLINE
 LOCAL(void)
-h2v1_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+h2v1_merged_upsample_565_internal(j_decompress_ptr cinfo,
+                                  _JSAMPIMAGE input_buf,
                                   JDIMENSION in_row_group_ctr,
-                                  JSAMPARRAY output_buf)
+                                  _JSAMPARRAY output_buf)
 {
   my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
-  register JSAMPROW outptr;
-  JSAMPROW inptr0, inptr1, inptr2;
+  register _JSAMPROW outptr;
+  _JSAMPROW inptr0, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   int *Crrtab = upsample->Cr_r_tab;
   int *Cbbtab = upsample->Cb_b_tab;
   JLONG *Crgtab = upsample->Cr_g_tab;
@@ -86,18 +87,18 @@ h2v1_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 INLINE
 LOCAL(void)
 h2v1_merged_upsample_565D_internal(j_decompress_ptr cinfo,
-                                   JSAMPIMAGE input_buf,
+                                   _JSAMPIMAGE input_buf,
                                    JDIMENSION in_row_group_ctr,
-                                   JSAMPARRAY output_buf)
+                                   _JSAMPARRAY output_buf)
 {
   my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
-  register JSAMPROW outptr;
-  JSAMPROW inptr0, inptr1, inptr2;
+  register _JSAMPROW outptr;
+  _JSAMPROW inptr0, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   int *Crrtab = upsample->Cr_r_tab;
   int *Cbbtab = upsample->Cb_b_tab;
   JLONG *Crgtab = upsample->Cr_g_tab;
@@ -159,18 +160,18 @@ h2v1_merged_upsample_565D_internal(j_decompress_ptr cinfo,
 
 INLINE
 LOCAL(void)
-h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
                                   JDIMENSION in_row_group_ctr,
-                                  JSAMPARRAY output_buf)
+                                  _JSAMPARRAY output_buf)
 {
   my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
-  register JSAMPROW outptr0, outptr1;
-  JSAMPROW inptr00, inptr01, inptr1, inptr2;
+  register _JSAMPROW outptr0, outptr1;
+  _JSAMPROW inptr00, inptr01, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   int *Crrtab = upsample->Cr_r_tab;
   int *Cbbtab = upsample->Cb_b_tab;
   JLONG *Crgtab = upsample->Cr_g_tab;
@@ -255,18 +256,18 @@ h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 INLINE
 LOCAL(void)
 h2v2_merged_upsample_565D_internal(j_decompress_ptr cinfo,
-                                   JSAMPIMAGE input_buf,
+                                   _JSAMPIMAGE input_buf,
                                    JDIMENSION in_row_group_ctr,
-                                   JSAMPARRAY output_buf)
+                                   _JSAMPARRAY output_buf)
 {
   my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
-  register JSAMPROW outptr0, outptr1;
-  JSAMPROW inptr00, inptr01, inptr1, inptr2;
+  register _JSAMPROW outptr0, outptr1;
+  _JSAMPROW inptr00, inptr01, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   int *Crrtab = upsample->Cr_r_tab;
   int *Cbbtab = upsample->Cb_b_tab;
   JLONG *Crgtab = upsample->Cr_g_tab;
diff --git a/3rdparty/libjpeg-turbo/src/jdmrgext.c b/3rdparty/libjpeg-turbo/src/jdmrgext.c
index 9bf4f1a307f3..8139e0a3ed65 100644
--- a/3rdparty/libjpeg-turbo/src/jdmrgext.c
+++ b/3rdparty/libjpeg-turbo/src/jdmrgext.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2015, 2020, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2020, 2022-2023, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -21,18 +21,18 @@
 
 INLINE
 LOCAL(void)
-h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+h2v1_merged_upsample_internal(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
                               JDIMENSION in_row_group_ctr,
-                              JSAMPARRAY output_buf)
+                              _JSAMPARRAY output_buf)
 {
   my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
-  register JSAMPROW outptr;
-  JSAMPROW inptr0, inptr1, inptr2;
+  register _JSAMPROW outptr;
+  _JSAMPROW inptr0, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   int *Crrtab = upsample->Cr_r_tab;
   int *Cbbtab = upsample->Cb_b_tab;
   JLONG *Crgtab = upsample->Cr_g_tab;
@@ -57,7 +57,7 @@ h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
-    outptr[RGB_ALPHA] = 0xFF;
+    outptr[RGB_ALPHA] = _MAXJSAMPLE;
 #endif
     outptr += RGB_PIXELSIZE;
     y  = *inptr0++;
@@ -65,7 +65,7 @@ h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
-    outptr[RGB_ALPHA] = 0xFF;
+    outptr[RGB_ALPHA] = _MAXJSAMPLE;
 #endif
     outptr += RGB_PIXELSIZE;
   }
@@ -81,7 +81,7 @@ h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
-    outptr[RGB_ALPHA] = 0xFF;
+    outptr[RGB_ALPHA] = _MAXJSAMPLE;
 #endif
   }
 }
@@ -93,18 +93,18 @@ h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 INLINE
 LOCAL(void)
-h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+h2v2_merged_upsample_internal(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
                               JDIMENSION in_row_group_ctr,
-                              JSAMPARRAY output_buf)
+                              _JSAMPARRAY output_buf)
 {
   my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
-  register JSAMPROW outptr0, outptr1;
-  JSAMPROW inptr00, inptr01, inptr1, inptr2;
+  register _JSAMPROW outptr0, outptr1;
+  _JSAMPROW inptr00, inptr01, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   int *Crrtab = upsample->Cr_r_tab;
   int *Cbbtab = upsample->Cb_b_tab;
   JLONG *Crgtab = upsample->Cr_g_tab;
@@ -131,7 +131,7 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
-    outptr0[RGB_ALPHA] = 0xFF;
+    outptr0[RGB_ALPHA] = _MAXJSAMPLE;
 #endif
     outptr0 += RGB_PIXELSIZE;
     y  = *inptr00++;
@@ -139,7 +139,7 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
-    outptr0[RGB_ALPHA] = 0xFF;
+    outptr0[RGB_ALPHA] = _MAXJSAMPLE;
 #endif
     outptr0 += RGB_PIXELSIZE;
     y  = *inptr01++;
@@ -147,7 +147,7 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
-    outptr1[RGB_ALPHA] = 0xFF;
+    outptr1[RGB_ALPHA] = _MAXJSAMPLE;
 #endif
     outptr1 += RGB_PIXELSIZE;
     y  = *inptr01++;
@@ -155,7 +155,7 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
-    outptr1[RGB_ALPHA] = 0xFF;
+    outptr1[RGB_ALPHA] = _MAXJSAMPLE;
 #endif
     outptr1 += RGB_PIXELSIZE;
   }
@@ -171,14 +171,14 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
-    outptr0[RGB_ALPHA] = 0xFF;
+    outptr0[RGB_ALPHA] = _MAXJSAMPLE;
 #endif
     y  = *inptr01;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
-    outptr1[RGB_ALPHA] = 0xFF;
+    outptr1[RGB_ALPHA] = _MAXJSAMPLE;
 #endif
   }
 }
diff --git a/3rdparty/libjpeg-turbo/src/jdphuff.c b/3rdparty/libjpeg-turbo/src/jdphuff.c
index 9680ebcbd06e..bf97333a34c0 100644
--- a/3rdparty/libjpeg-turbo/src/jdphuff.c
+++ b/3rdparty/libjpeg-turbo/src/jdphuff.c
@@ -3,6 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015-2016, 2018-2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -23,7 +25,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jdhuff.h"             /* Declarations shared with jdhuff.c */
+#include "jdhuff.h"             /* Declarations shared with jd*huff.c */
 #include <limits.h>
 
 
diff --git a/3rdparty/libjpeg-turbo/src/jdpostct.c b/3rdparty/libjpeg-turbo/src/jdpostct.c
index 6a2cf5c1b31e..d38495f5f316 100644
--- a/3rdparty/libjpeg-turbo/src/jdpostct.c
+++ b/3rdparty/libjpeg-turbo/src/jdpostct.c
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022-2023, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -22,8 +22,11 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsamplecomp.h"
 
 
+#if BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED)
+
 /* Private buffer controller object */
 
 typedef struct {
@@ -35,7 +38,7 @@ typedef struct {
    * for one-pass operation, a strip buffer is sufficient.
    */
   jvirt_sarray_ptr whole_image; /* virtual array, or NULL if one-pass */
-  JSAMPARRAY buffer;            /* strip buffer, or current strip of virtual */
+  _JSAMPARRAY buffer;           /* strip buffer, or current strip of virtual */
   JDIMENSION strip_height;      /* buffer size in rows */
   /* for two-pass mode only: */
   JDIMENSION starting_row;      /* row # of first row in current strip */
@@ -46,26 +49,28 @@ typedef my_post_controller *my_post_ptr;
 
 
 /* Forward declarations */
+#if BITS_IN_JSAMPLE != 16
 METHODDEF(void) post_process_1pass(j_decompress_ptr cinfo,
-                                   JSAMPIMAGE input_buf,
+                                   _JSAMPIMAGE input_buf,
                                    JDIMENSION *in_row_group_ctr,
                                    JDIMENSION in_row_groups_avail,
-                                   JSAMPARRAY output_buf,
+                                   _JSAMPARRAY output_buf,
                                    JDIMENSION *out_row_ctr,
                                    JDIMENSION out_rows_avail);
-#ifdef QUANT_2PASS_SUPPORTED
+#endif
+#if defined(QUANT_2PASS_SUPPORTED) && BITS_IN_JSAMPLE != 16
 METHODDEF(void) post_process_prepass(j_decompress_ptr cinfo,
-                                     JSAMPIMAGE input_buf,
+                                     _JSAMPIMAGE input_buf,
                                      JDIMENSION *in_row_group_ctr,
                                      JDIMENSION in_row_groups_avail,
-                                     JSAMPARRAY output_buf,
+                                     _JSAMPARRAY output_buf,
                                      JDIMENSION *out_row_ctr,
                                      JDIMENSION out_rows_avail);
 METHODDEF(void) post_process_2pass(j_decompress_ptr cinfo,
-                                   JSAMPIMAGE input_buf,
+                                   _JSAMPIMAGE input_buf,
                                    JDIMENSION *in_row_group_ctr,
                                    JDIMENSION in_row_groups_avail,
-                                   JSAMPARRAY output_buf,
+                                   _JSAMPARRAY output_buf,
                                    JDIMENSION *out_row_ctr,
                                    JDIMENSION out_rows_avail);
 #endif
@@ -82,39 +87,42 @@ start_pass_dpost(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
 
   switch (pass_mode) {
   case JBUF_PASS_THRU:
+#if BITS_IN_JSAMPLE != 16
     if (cinfo->quantize_colors) {
       /* Single-pass processing with color quantization. */
-      post->pub.post_process_data = post_process_1pass;
+      post->pub._post_process_data = post_process_1pass;
       /* We could be doing buffered-image output before starting a 2-pass
        * color quantization; in that case, jinit_d_post_controller did not
        * allocate a strip buffer.  Use the virtual-array buffer as workspace.
        */
       if (post->buffer == NULL) {
-        post->buffer = (*cinfo->mem->access_virt_sarray)
+        post->buffer = (_JSAMPARRAY)(*cinfo->mem->access_virt_sarray)
           ((j_common_ptr)cinfo, post->whole_image,
            (JDIMENSION)0, post->strip_height, TRUE);
       }
-    } else {
+    } else
+#endif
+    {
       /* For single-pass processing without color quantization,
        * I have no work to do; just call the upsampler directly.
        */
-      post->pub.post_process_data = cinfo->upsample->upsample;
+      post->pub._post_process_data = cinfo->upsample->_upsample;
     }
     break;
-#ifdef QUANT_2PASS_SUPPORTED
+#if defined(QUANT_2PASS_SUPPORTED) && BITS_IN_JSAMPLE != 16
   case JBUF_SAVE_AND_PASS:
     /* First pass of 2-pass quantization */
     if (post->whole_image == NULL)
       ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    post->pub.post_process_data = post_process_prepass;
+    post->pub._post_process_data = post_process_prepass;
     break;
   case JBUF_CRANK_DEST:
     /* Second pass of 2-pass quantization */
     if (post->whole_image == NULL)
       ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    post->pub.post_process_data = post_process_2pass;
+    post->pub._post_process_data = post_process_2pass;
     break;
-#endif /* QUANT_2PASS_SUPPORTED */
+#endif /* defined(QUANT_2PASS_SUPPORTED) && BITS_IN_JSAMPLE != 16 */
   default:
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
     break;
@@ -128,10 +136,12 @@ start_pass_dpost(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
  * This is used for color precision reduction as well as one-pass quantization.
  */
 
+#if BITS_IN_JSAMPLE != 16
+
 METHODDEF(void)
-post_process_1pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+post_process_1pass(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
                    JDIMENSION *in_row_group_ctr,
-                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION in_row_groups_avail, _JSAMPARRAY output_buf,
                    JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
   my_post_ptr post = (my_post_ptr)cinfo->post;
@@ -143,27 +153,29 @@ post_process_1pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   if (max_rows > post->strip_height)
     max_rows = post->strip_height;
   num_rows = 0;
-  (*cinfo->upsample->upsample) (cinfo, input_buf, in_row_group_ctr,
-                                in_row_groups_avail, post->buffer, &num_rows,
-                                max_rows);
+  (*cinfo->upsample->_upsample) (cinfo, input_buf, in_row_group_ctr,
+                                 in_row_groups_avail, post->buffer, &num_rows,
+                                 max_rows);
   /* Quantize and emit data. */
-  (*cinfo->cquantize->color_quantize) (cinfo, post->buffer,
-                                       output_buf + *out_row_ctr,
-                                       (int)num_rows);
+  (*cinfo->cquantize->_color_quantize) (cinfo, post->buffer,
+                                        output_buf + *out_row_ctr,
+                                        (int)num_rows);
   *out_row_ctr += num_rows;
 }
 
+#endif
 
-#ifdef QUANT_2PASS_SUPPORTED
+
+#if defined(QUANT_2PASS_SUPPORTED) && BITS_IN_JSAMPLE != 16
 
 /*
  * Process some data in the first pass of 2-pass quantization.
  */
 
 METHODDEF(void)
-post_process_prepass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+post_process_prepass(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
                      JDIMENSION *in_row_group_ctr,
-                     JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                     JDIMENSION in_row_groups_avail, _JSAMPARRAY output_buf,
                      JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
   my_post_ptr post = (my_post_ptr)cinfo->post;
@@ -171,23 +183,23 @@ post_process_prepass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
   /* Reposition virtual buffer if at start of strip. */
   if (post->next_row == 0) {
-    post->buffer = (*cinfo->mem->access_virt_sarray)
+    post->buffer = (_JSAMPARRAY)(*cinfo->mem->access_virt_sarray)
         ((j_common_ptr)cinfo, post->whole_image,
          post->starting_row, post->strip_height, TRUE);
   }
 
   /* Upsample some data (up to a strip height's worth). */
   old_next_row = post->next_row;
-  (*cinfo->upsample->upsample) (cinfo, input_buf, in_row_group_ctr,
-                                in_row_groups_avail, post->buffer,
-                                &post->next_row, post->strip_height);
+  (*cinfo->upsample->_upsample) (cinfo, input_buf, in_row_group_ctr,
+                                 in_row_groups_avail, post->buffer,
+                                 &post->next_row, post->strip_height);
 
   /* Allow quantizer to scan new data.  No data is emitted, */
   /* but we advance out_row_ctr so outer loop can tell when we're done. */
   if (post->next_row > old_next_row) {
     num_rows = post->next_row - old_next_row;
-    (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + old_next_row,
-                                         (JSAMPARRAY)NULL, (int)num_rows);
+    (*cinfo->cquantize->_color_quantize) (cinfo, post->buffer + old_next_row,
+                                          (_JSAMPARRAY)NULL, (int)num_rows);
     *out_row_ctr += num_rows;
   }
 
@@ -204,9 +216,9 @@ post_process_prepass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
  */
 
 METHODDEF(void)
-post_process_2pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+post_process_2pass(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
                    JDIMENSION *in_row_group_ctr,
-                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION in_row_groups_avail, _JSAMPARRAY output_buf,
                    JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
   my_post_ptr post = (my_post_ptr)cinfo->post;
@@ -214,7 +226,7 @@ post_process_2pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
   /* Reposition virtual buffer if at start of strip. */
   if (post->next_row == 0) {
-    post->buffer = (*cinfo->mem->access_virt_sarray)
+    post->buffer = (_JSAMPARRAY)(*cinfo->mem->access_virt_sarray)
         ((j_common_ptr)cinfo, post->whole_image,
          post->starting_row, post->strip_height, FALSE);
   }
@@ -230,9 +242,9 @@ post_process_2pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     num_rows = max_rows;
 
   /* Quantize and emit data. */
-  (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + post->next_row,
-                                       output_buf + *out_row_ctr,
-                                       (int)num_rows);
+  (*cinfo->cquantize->_color_quantize) (cinfo, post->buffer + post->next_row,
+                                        output_buf + *out_row_ctr,
+                                        (int)num_rows);
   *out_row_ctr += num_rows;
 
   /* Advance if we filled the strip. */
@@ -243,7 +255,7 @@ post_process_2pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   }
 }
 
-#endif /* QUANT_2PASS_SUPPORTED */
+#endif /* defined(QUANT_2PASS_SUPPORTED) && BITS_IN_JSAMPLE != 16 */
 
 
 /*
@@ -251,10 +263,13 @@ post_process_2pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
  */
 
 GLOBAL(void)
-jinit_d_post_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
+_jinit_d_post_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 {
   my_post_ptr post;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   post = (my_post_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_post_controller));
@@ -265,6 +280,7 @@ jinit_d_post_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 
   /* Create the quantization buffer, if needed */
   if (cinfo->quantize_colors) {
+#if BITS_IN_JSAMPLE != 16
     /* The buffer strip height is max_v_samp_factor, which is typically
      * an efficient number of rows for upsampling to return.
      * (In the presence of output rescaling, we might want to be smarter?)
@@ -285,10 +301,15 @@ jinit_d_post_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 #endif /* QUANT_2PASS_SUPPORTED */
     } else {
       /* One-pass color quantization: just make a strip buffer. */
-      post->buffer = (*cinfo->mem->alloc_sarray)
+      post->buffer = (_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
         ((j_common_ptr)cinfo, JPOOL_IMAGE,
          cinfo->output_width * cinfo->out_color_components,
          post->strip_height);
     }
+#else
+    ERREXIT(cinfo, JERR_NOTIMPL);
+#endif
   }
 }
+
+#endif /* BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED) */
diff --git a/3rdparty/libjpeg-turbo/src/jdsample.c b/3rdparty/libjpeg-turbo/src/jdsample.c
index eaad72a03089..cc8015c97d72 100644
--- a/3rdparty/libjpeg-turbo/src/jdsample.c
+++ b/3rdparty/libjpeg-turbo/src/jdsample.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010, 2015-2016, 2022, D. R. Commander.
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2015, Google, Inc.
  * Copyright (C) 2019-2020, Arm Limited.
@@ -28,10 +28,12 @@
 #include "jinclude.h"
 #include "jdsample.h"
 #include "jsimd.h"
-#include "jpegcomp.h"
+#include "jpegapicomp.h"
 
 
 
+#if BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED)
+
 /*
  * Initialize for an upsampling pass.
  */
@@ -57,9 +59,9 @@ start_pass_upsample(j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-sep_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+sep_upsample(j_decompress_ptr cinfo, _JSAMPIMAGE input_buf,
              JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
-             JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+             _JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
              JDIMENSION out_rows_avail)
 {
   my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
@@ -95,9 +97,10 @@ sep_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   if (num_rows > out_rows_avail)
     num_rows = out_rows_avail;
 
-  (*cinfo->cconvert->color_convert) (cinfo, upsample->color_buf,
-                                     (JDIMENSION)upsample->next_row_out,
-                                     output_buf + *out_row_ctr, (int)num_rows);
+  (*cinfo->cconvert->_color_convert) (cinfo, upsample->color_buf,
+                                      (JDIMENSION)upsample->next_row_out,
+                                      output_buf + *out_row_ctr,
+                                      (int)num_rows);
 
   /* Adjust counts */
   *out_row_ctr += num_rows;
@@ -124,7 +127,7 @@ sep_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
 METHODDEF(void)
 fullsize_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+                  _JSAMPARRAY input_data, _JSAMPARRAY *output_data_ptr)
 {
   *output_data_ptr = input_data;
 }
@@ -137,7 +140,7 @@ fullsize_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 METHODDEF(void)
 noop_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+              _JSAMPARRAY input_data, _JSAMPARRAY *output_data_ptr)
 {
   *output_data_ptr = NULL;      /* safety check */
 }
@@ -156,14 +159,14 @@ noop_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 METHODDEF(void)
 int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-             JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+             _JSAMPARRAY input_data, _JSAMPARRAY *output_data_ptr)
 {
   my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
-  JSAMPARRAY output_data = *output_data_ptr;
-  register JSAMPROW inptr, outptr;
-  register JSAMPLE invalue;
+  _JSAMPARRAY output_data = *output_data_ptr;
+  register _JSAMPROW inptr, outptr;
+  register _JSAMPLE invalue;
   register int h;
-  JSAMPROW outend;
+  _JSAMPROW outend;
   int h_expand, v_expand;
   int inrow, outrow;
 
@@ -184,8 +187,8 @@ int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     }
     /* Generate any additional output rows by duplicating the first one */
     if (v_expand > 1) {
-      jcopy_sample_rows(output_data, outrow, output_data, outrow + 1,
-                        v_expand - 1, cinfo->output_width);
+      _jcopy_sample_rows(output_data, outrow, output_data, outrow + 1,
+                         v_expand - 1, cinfo->output_width);
     }
     inrow++;
     outrow += v_expand;
@@ -200,12 +203,12 @@ int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 METHODDEF(void)
 h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+              _JSAMPARRAY input_data, _JSAMPARRAY *output_data_ptr)
 {
-  JSAMPARRAY output_data = *output_data_ptr;
-  register JSAMPROW inptr, outptr;
-  register JSAMPLE invalue;
-  JSAMPROW outend;
+  _JSAMPARRAY output_data = *output_data_ptr;
+  register _JSAMPROW inptr, outptr;
+  register _JSAMPLE invalue;
+  _JSAMPROW outend;
   int inrow;
 
   for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
@@ -228,12 +231,12 @@ h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 METHODDEF(void)
 h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+              _JSAMPARRAY input_data, _JSAMPARRAY *output_data_ptr)
 {
-  JSAMPARRAY output_data = *output_data_ptr;
-  register JSAMPROW inptr, outptr;
-  register JSAMPLE invalue;
-  JSAMPROW outend;
+  _JSAMPARRAY output_data = *output_data_ptr;
+  register _JSAMPROW inptr, outptr;
+  register _JSAMPLE invalue;
+  _JSAMPROW outend;
   int inrow, outrow;
 
   inrow = outrow = 0;
@@ -246,8 +249,8 @@ h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
-    jcopy_sample_rows(output_data, outrow, output_data, outrow + 1, 1,
-                      cinfo->output_width);
+    _jcopy_sample_rows(output_data, outrow, output_data, outrow + 1, 1,
+                       cinfo->output_width);
     inrow++;
     outrow += 2;
   }
@@ -271,10 +274,10 @@ h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 METHODDEF(void)
 h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+                    _JSAMPARRAY input_data, _JSAMPARRAY *output_data_ptr)
 {
-  JSAMPARRAY output_data = *output_data_ptr;
-  register JSAMPROW inptr, outptr;
+  _JSAMPARRAY output_data = *output_data_ptr;
+  register _JSAMPROW inptr, outptr;
   register int invalue;
   register JDIMENSION colctr;
   int inrow;
@@ -284,20 +287,20 @@ h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     outptr = output_data[inrow];
     /* Special case for first column */
     invalue = *inptr++;
-    *outptr++ = (JSAMPLE)invalue;
-    *outptr++ = (JSAMPLE)((invalue * 3 + inptr[0] + 2) >> 2);
+    *outptr++ = (_JSAMPLE)invalue;
+    *outptr++ = (_JSAMPLE)((invalue * 3 + inptr[0] + 2) >> 2);
 
     for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
       /* General case: 3/4 * nearer pixel + 1/4 * further pixel */
       invalue = (*inptr++) * 3;
-      *outptr++ = (JSAMPLE)((invalue + inptr[-2] + 1) >> 2);
-      *outptr++ = (JSAMPLE)((invalue + inptr[0] + 2) >> 2);
+      *outptr++ = (_JSAMPLE)((invalue + inptr[-2] + 1) >> 2);
+      *outptr++ = (_JSAMPLE)((invalue + inptr[0] + 2) >> 2);
     }
 
     /* Special case for last column */
     invalue = *inptr;
-    *outptr++ = (JSAMPLE)((invalue * 3 + inptr[-1] + 1) >> 2);
-    *outptr++ = (JSAMPLE)invalue;
+    *outptr++ = (_JSAMPLE)((invalue * 3 + inptr[-1] + 1) >> 2);
+    *outptr++ = (_JSAMPLE)invalue;
   }
 }
 
@@ -311,10 +314,10 @@ h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 METHODDEF(void)
 h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+                    _JSAMPARRAY input_data, _JSAMPARRAY *output_data_ptr)
 {
-  JSAMPARRAY output_data = *output_data_ptr;
-  JSAMPROW inptr0, inptr1, outptr;
+  _JSAMPARRAY output_data = *output_data_ptr;
+  _JSAMPROW inptr0, inptr1, outptr;
 #if BITS_IN_JSAMPLE == 8
   int thiscolsum, bias;
 #else
@@ -339,7 +342,7 @@ h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
       for (colctr = 0; colctr < compptr->downsampled_width; colctr++) {
         thiscolsum = (*inptr0++) * 3 + (*inptr1++);
-        *outptr++ = (JSAMPLE)((thiscolsum + bias) >> 2);
+        *outptr++ = (_JSAMPLE)((thiscolsum + bias) >> 2);
       }
     }
     inrow++;
@@ -357,10 +360,10 @@ h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 METHODDEF(void)
 h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+                    _JSAMPARRAY input_data, _JSAMPARRAY *output_data_ptr)
 {
-  JSAMPARRAY output_data = *output_data_ptr;
-  register JSAMPROW inptr0, inptr1, outptr;
+  _JSAMPARRAY output_data = *output_data_ptr;
+  register _JSAMPROW inptr0, inptr1, outptr;
 #if BITS_IN_JSAMPLE == 8
   register int thiscolsum, lastcolsum, nextcolsum;
 #else
@@ -383,22 +386,22 @@ h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
       /* Special case for first column */
       thiscolsum = (*inptr0++) * 3 + (*inptr1++);
       nextcolsum = (*inptr0++) * 3 + (*inptr1++);
-      *outptr++ = (JSAMPLE)((thiscolsum * 4 + 8) >> 4);
-      *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
+      *outptr++ = (_JSAMPLE)((thiscolsum * 4 + 8) >> 4);
+      *outptr++ = (_JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
       lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
 
       for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
         /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
         /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
         nextcolsum = (*inptr0++) * 3 + (*inptr1++);
-        *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
-        *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
+        *outptr++ = (_JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
+        *outptr++ = (_JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
         lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
       }
 
       /* Special case for last column */
-      *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
-      *outptr++ = (JSAMPLE)((thiscolsum * 4 + 7) >> 4);
+      *outptr++ = (_JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
+      *outptr++ = (_JSAMPLE)((thiscolsum * 4 + 7) >> 4);
     }
     inrow++;
   }
@@ -410,7 +413,7 @@ h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jinit_upsampler(j_decompress_ptr cinfo)
+_jinit_upsampler(j_decompress_ptr cinfo)
 {
   my_upsample_ptr upsample;
   int ci;
@@ -418,13 +421,16 @@ jinit_upsampler(j_decompress_ptr cinfo)
   boolean need_buffer, do_fancy;
   int h_in_group, v_in_group, h_out_group, v_out_group;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   if (!cinfo->master->jinit_upsampler_no_alloc) {
     upsample = (my_upsample_ptr)
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(my_upsampler));
     cinfo->upsample = (struct jpeg_upsampler *)upsample;
     upsample->pub.start_pass = start_pass_upsample;
-    upsample->pub.upsample = sep_upsample;
+    upsample->pub._upsample = sep_upsample;
     upsample->pub.need_context_rows = FALSE; /* until we find out differently */
   } else
     upsample = (my_upsample_ptr)cinfo->upsample;
@@ -464,21 +470,25 @@ jinit_upsampler(j_decompress_ptr cinfo)
     } else if (h_in_group * 2 == h_out_group && v_in_group == v_out_group) {
       /* Special cases for 2h1v upsampling */
       if (do_fancy && compptr->downsampled_width > 2) {
+#ifdef WITH_SIMD
         if (jsimd_can_h2v1_fancy_upsample())
           upsample->methods[ci] = jsimd_h2v1_fancy_upsample;
         else
+#endif
           upsample->methods[ci] = h2v1_fancy_upsample;
       } else {
+#ifdef WITH_SIMD
         if (jsimd_can_h2v1_upsample())
           upsample->methods[ci] = jsimd_h2v1_upsample;
         else
+#endif
           upsample->methods[ci] = h2v1_upsample;
       }
     } else if (h_in_group == h_out_group &&
                v_in_group * 2 == v_out_group && do_fancy) {
       /* Non-fancy upsampling is handled by the generic method */
-#if defined(__arm__) || defined(__aarch64__) || \
-    defined(_M_ARM) || defined(_M_ARM64)
+#if defined(WITH_SIMD) && (defined(__arm__) || defined(__aarch64__) || \
+                           defined(_M_ARM) || defined(_M_ARM64))
       if (jsimd_can_h1v2_fancy_upsample())
         upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
       else
@@ -489,21 +499,25 @@ jinit_upsampler(j_decompress_ptr cinfo)
                v_in_group * 2 == v_out_group) {
       /* Special cases for 2h2v upsampling */
       if (do_fancy && compptr->downsampled_width > 2) {
+#ifdef WITH_SIMD
         if (jsimd_can_h2v2_fancy_upsample())
           upsample->methods[ci] = jsimd_h2v2_fancy_upsample;
         else
+#endif
           upsample->methods[ci] = h2v2_fancy_upsample;
         upsample->pub.need_context_rows = TRUE;
       } else {
+#ifdef WITH_SIMD
         if (jsimd_can_h2v2_upsample())
           upsample->methods[ci] = jsimd_h2v2_upsample;
         else
+#endif
           upsample->methods[ci] = h2v2_upsample;
       }
     } else if ((h_out_group % h_in_group) == 0 &&
                (v_out_group % v_in_group) == 0) {
       /* Generic integral-factors upsampling method */
-#if defined(__mips__)
+#if defined(WITH_SIMD) && defined(__mips__)
       if (jsimd_can_int_upsample())
         upsample->methods[ci] = jsimd_int_upsample;
       else
@@ -514,7 +528,7 @@ jinit_upsampler(j_decompress_ptr cinfo)
     } else
       ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
     if (need_buffer && !cinfo->master->jinit_upsampler_no_alloc) {
-      upsample->color_buf[ci] = (*cinfo->mem->alloc_sarray)
+      upsample->color_buf[ci] = (_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
         ((j_common_ptr)cinfo, JPOOL_IMAGE,
          (JDIMENSION)jround_up((long)cinfo->output_width,
                                (long)cinfo->max_h_samp_factor),
@@ -522,3 +536,5 @@ jinit_upsampler(j_decompress_ptr cinfo)
     }
   }
 }
+
+#endif /* BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED) */
diff --git a/3rdparty/libjpeg-turbo/src/jdsample.h b/3rdparty/libjpeg-turbo/src/jdsample.h
index a6bf08a032ac..a8a929809402 100644
--- a/3rdparty/libjpeg-turbo/src/jdsample.h
+++ b/3rdparty/libjpeg-turbo/src/jdsample.h
@@ -3,19 +3,22 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  */
 
 #define JPEG_INTERNALS
 #include "jpeglib.h"
+#include "jsamplecomp.h"
 
 
 /* Pointer to routine to upsample a single component */
 typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
                                jpeg_component_info *compptr,
-                               JSAMPARRAY input_data,
-                               JSAMPARRAY *output_data_ptr);
+                               _JSAMPARRAY input_data,
+                               _JSAMPARRAY *output_data_ptr);
 
 /* Private subobject */
 
@@ -29,7 +32,7 @@ typedef struct {
    * ie do not need rescaling.  The corresponding entry of color_buf[] is
    * simply set to point to the input data array, thereby avoiding copying.
    */
-  JSAMPARRAY color_buf[MAX_COMPONENTS];
+  _JSAMPARRAY color_buf[MAX_COMPONENTS];
 
   /* Per-component upsampling method pointers */
   upsample1_ptr methods[MAX_COMPONENTS];
diff --git a/3rdparty/libjpeg-turbo/src/jdtrans.c b/3rdparty/libjpeg-turbo/src/jdtrans.c
index d7ec4b83b3a4..719813f67672 100644
--- a/3rdparty/libjpeg-turbo/src/jdtrans.c
+++ b/3rdparty/libjpeg-turbo/src/jdtrans.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2020, D. R. Commander.
+ * Copyright (C) 2020, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -16,7 +16,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jpegcomp.h"
+#include "jpegapicomp.h"
 
 
 /* Forward declarations */
@@ -48,6 +48,9 @@ LOCAL(void) transdecode_master_selection(j_decompress_ptr cinfo);
 GLOBAL(jvirt_barray_ptr *)
 jpeg_read_coefficients(j_decompress_ptr cinfo)
 {
+  if (cinfo->master->lossless)
+    ERREXIT(cinfo, JERR_NOTIMPL);
+
   if (cinfo->global_state == DSTATE_READY) {
     /* First call: initialize active modules */
     transdecode_master_selection(cinfo);
@@ -127,7 +130,10 @@ transdecode_master_selection(j_decompress_ptr cinfo)
   }
 
   /* Always get a full-image coefficient buffer. */
-  jinit_d_coef_controller(cinfo, TRUE);
+  if (cinfo->data_precision == 12)
+    j12init_d_coef_controller(cinfo, TRUE);
+  else
+    jinit_d_coef_controller(cinfo, TRUE);
 
   /* We can now tell the memory manager to allocate virtual arrays. */
   (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
diff --git a/3rdparty/libjpeg-turbo/src/jerror.c b/3rdparty/libjpeg-turbo/src/jerror.c
index d54470293758..3a75fec02c17 100644
--- a/3rdparty/libjpeg-turbo/src/jerror.c
+++ b/3rdparty/libjpeg-turbo/src/jerror.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2022, D. R. Commander.
+ * Copyright (C) 2022, 2024, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -46,7 +46,7 @@
 
 #define JMESSAGE(code, string)  string,
 
-const char * const jpeg_std_message_table[] = {
+static const char * const jpeg_std_message_table[] = {
 #include "jerror.h"
   NULL
 };
@@ -189,9 +189,9 @@ format_message(j_common_ptr cinfo, char *buffer)
 
   /* Format the message into the passed buffer */
   if (isstring)
-    snprintf(buffer, JMSG_LENGTH_MAX, msgtext, err->msg_parm.s);
+    SNPRINTF(buffer, JMSG_LENGTH_MAX, msgtext, err->msg_parm.s);
   else
-    snprintf(buffer, JMSG_LENGTH_MAX, msgtext,
+    SNPRINTF(buffer, JMSG_LENGTH_MAX, msgtext,
              err->msg_parm.i[0], err->msg_parm.i[1],
              err->msg_parm.i[2], err->msg_parm.i[3],
              err->msg_parm.i[4], err->msg_parm.i[5],
@@ -229,23 +229,17 @@ reset_error_mgr(j_common_ptr cinfo)
 GLOBAL(struct jpeg_error_mgr *)
 jpeg_std_error(struct jpeg_error_mgr *err)
 {
+  memset(err, 0, sizeof(struct jpeg_error_mgr));
+
   err->error_exit = error_exit;
   err->emit_message = emit_message;
   err->output_message = output_message;
   err->format_message = format_message;
   err->reset_error_mgr = reset_error_mgr;
 
-  err->trace_level = 0;         /* default = no tracing */
-  err->num_warnings = 0;        /* no warnings emitted yet */
-  err->msg_code = 0;            /* may be useful as a flag for "no error" */
-
   /* Initialize message table pointers */
   err->jpeg_message_table = jpeg_std_message_table;
   err->last_jpeg_message = (int)JMSG_LASTMSGCODE - 1;
 
-  err->addon_message_table = NULL;
-  err->first_addon_message = 0; /* for safety */
-  err->last_addon_message = 0;
-
   return err;
 }
diff --git a/3rdparty/libjpeg-turbo/src/jerror.h b/3rdparty/libjpeg-turbo/src/jerror.h
index eb44a1140a2e..71ba03e2a3ed 100644
--- a/3rdparty/libjpeg-turbo/src/jerror.h
+++ b/3rdparty/libjpeg-turbo/src/jerror.h
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2014, 2017, 2021-2022, D. R. Commander.
+ * Copyright (C) 2014, 2017, 2021-2023, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -53,7 +55,8 @@ JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS")
 #if JPEG_LIB_VERSION >= 70
 JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
 #endif
-JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range")
+JMESSAGE(JERR_BAD_DCT_COEF,
+         "DCT coefficient (lossy) or spatial difference (lossless) out of range")
 JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported")
 #if JPEG_LIB_VERSION >= 70
 JMESSAGE(JERR_BAD_DROP_SAMPLING,
@@ -69,9 +72,9 @@ JMESSAGE(JERR_BAD_MCU_SIZE, "Sampling factors too large for interleaved scan")
 JMESSAGE(JERR_BAD_POOL_ID, "Invalid memory pool code %d")
 JMESSAGE(JERR_BAD_PRECISION, "Unsupported JPEG data precision %d")
 JMESSAGE(JERR_BAD_PROGRESSION,
-         "Invalid progressive parameters Ss=%d Se=%d Ah=%d Al=%d")
+         "Invalid progressive/lossless parameters Ss=%d Se=%d Ah=%d Al=%d")
 JMESSAGE(JERR_BAD_PROG_SCRIPT,
-         "Invalid progressive parameters at scan script entry %d")
+         "Invalid progressive/lossless parameters at scan script entry %d")
 JMESSAGE(JERR_BAD_SAMPLING, "Bogus sampling factors")
 JMESSAGE(JERR_BAD_SCAN_SCRIPT, "Invalid scan script at entry %d")
 JMESSAGE(JERR_BAD_STATE, "Improper call to JPEG library in state %d")
@@ -108,7 +111,7 @@ JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
 #if JPEG_LIB_VERSION >= 70
 JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
 #endif
-JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported")
+JMESSAGE(JERR_NO_BACKING_STORE, "Memory limit exceeded")
 JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined")
 JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image")
 JMESSAGE(JERR_NO_QUANT_TABLE, "Quantization table 0x%02x was not defined")
@@ -180,7 +183,7 @@ JMESSAGE(JTRC_THUMB_PALETTE,
 JMESSAGE(JTRC_THUMB_RGB,
          "JFIF extension marker: RGB thumbnail image, length %u")
 JMESSAGE(JTRC_UNKNOWN_IDS,
-         "Unrecognized component IDs %d %d %d, assuming YCbCr")
+         "Unrecognized component IDs %d %d %d, assuming YCbCr (lossy) or RGB (lossless)")
 JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u")
 JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u")
 JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d")
@@ -211,6 +214,8 @@ JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker")
 JMESSAGE(JERR_BAD_DROP_SAMPLING,
          "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
 #endif
+JMESSAGE(JERR_BAD_RESTART,
+         "Invalid restart interval %d; must be an integer multiple of the number of MCUs in an MCU row (%d)")
 
 #ifdef JMAKE_ENUM_LIST
 
diff --git a/3rdparty/libjpeg-turbo/src/jfdctfst.c b/3rdparty/libjpeg-turbo/src/jfdctfst.c
index 4c9ce0de8faa..26070d19a620 100644
--- a/3rdparty/libjpeg-turbo/src/jfdctfst.c
+++ b/3rdparty/libjpeg-turbo/src/jfdctfst.c
@@ -114,7 +114,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_ifast(DCTELEM *data)
+_jpeg_fdct_ifast(DCTELEM *data)
 {
   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   DCTELEM tmp10, tmp11, tmp12, tmp13;
diff --git a/3rdparty/libjpeg-turbo/src/jfdctint.c b/3rdparty/libjpeg-turbo/src/jfdctint.c
index c95a3a7fb8a6..974013fa409c 100644
--- a/3rdparty/libjpeg-turbo/src/jfdctint.c
+++ b/3rdparty/libjpeg-turbo/src/jfdctint.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, 2020, D. R. Commander.
+ * Copyright (C) 2015, 2020, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -140,7 +140,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_islow(DCTELEM *data)
+_jpeg_fdct_islow(DCTELEM *data)
 {
   JLONG tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   JLONG tmp10, tmp11, tmp12, tmp13;
diff --git a/3rdparty/libjpeg-turbo/src/jidctflt.c b/3rdparty/libjpeg-turbo/src/jidctflt.c
index 5aee74e2321e..ee3a31a61682 100644
--- a/3rdparty/libjpeg-turbo/src/jidctflt.c
+++ b/3rdparty/libjpeg-turbo/src/jidctflt.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * Modified 2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -69,9 +69,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+_jpeg_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
   FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
@@ -79,8 +79,8 @@ jpeg_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JCOEFPTR inptr;
   FLOAT_MULT_TYPE *quantptr;
   FAST_FLOAT *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = cinfo->sample_range_limit;
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   int ctr;
   FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */
 #define _0_125  ((FLOAT_MULT_TYPE)0.125)
@@ -192,7 +192,7 @@ jpeg_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Apply signed->unsigned and prepare float->int conversion */
-    z5 = wsptr[0] + ((FAST_FLOAT)CENTERJSAMPLE + (FAST_FLOAT)0.5);
+    z5 = wsptr[0] + ((FAST_FLOAT)_CENTERJSAMPLE + (FAST_FLOAT)0.5);
     tmp10 = z5 + wsptr[4];
     tmp11 = z5 - wsptr[4];
 
diff --git a/3rdparty/libjpeg-turbo/src/jidctfst.c b/3rdparty/libjpeg-turbo/src/jidctfst.c
index 89a20c937bbe..68119b9942be 100644
--- a/3rdparty/libjpeg-turbo/src/jidctfst.c
+++ b/3rdparty/libjpeg-turbo/src/jidctfst.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -64,10 +64,10 @@
  * The dequantized coefficients are not integers because the AA&N scaling
  * factors have been incorporated.  We represent them scaled up by PASS1_BITS,
  * so that the first and second IDCT rounds have the same input scaling.
- * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
+ * For 8-bit samples, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
  * avoid a descaling shift; this compromises accuracy rather drastically
  * for small quantization table entries, but it saves a lot of shifts.
- * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway,
+ * For 12-bit samples, there's no hope of using 16x16 multiplies anyway,
  * so we use a much larger scaling factor to preserve accuracy.
  *
  * A final compromise is to represent the multiplicative constants to only
@@ -168,9 +168,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+_jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   DCTELEM tmp10, tmp11, tmp12, tmp13;
@@ -178,8 +178,8 @@ jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JCOEFPTR inptr;
   IFAST_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[DCTSIZE2];      /* buffers data between passes */
   SHIFT_TEMPS                   /* for DESCALE */
@@ -296,7 +296,7 @@ jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval =
+      _JSAMPLE dcval =
         range_limit[IDESCALE(wsptr[0], PASS1_BITS + 3) & RANGE_MASK];
 
       outptr[0] = dcval;
diff --git a/3rdparty/libjpeg-turbo/src/jidctint.c b/3rdparty/libjpeg-turbo/src/jidctint.c
index bb0874801920..c58592d626d8 100644
--- a/3rdparty/libjpeg-turbo/src/jidctint.c
+++ b/3rdparty/libjpeg-turbo/src/jidctint.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modification developed 2002-2018 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, 2020, D. R. Commander.
+ * Copyright (C) 2015, 2020, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -170,9 +170,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+_jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp3;
   JLONG tmp10, tmp11, tmp12, tmp13;
@@ -180,8 +180,8 @@ jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[DCTSIZE2];      /* buffers data between passes */
   SHIFT_TEMPS
@@ -314,8 +314,8 @@ jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
-                                               PASS1_BITS + 3) & RANGE_MASK];
+      _JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+                                                PASS1_BITS + 3) & RANGE_MASK];
 
       outptr[0] = dcval;
       outptr[1] = dcval;
@@ -424,17 +424,17 @@ jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_7x7(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JCOEFPTR coef_block, JSAMPARRAY output_buf,
-              JDIMENSION output_col)
+_jpeg_idct_7x7(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+               JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
   JLONG z1, z2, z3;
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[7 * 7];         /* buffers data between passes */
   SHIFT_TEMPS
@@ -573,17 +573,17 @@ jpeg_idct_7x7(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JCOEFPTR coef_block, JSAMPARRAY output_buf,
-              JDIMENSION output_col)
+_jpeg_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+               JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
   JLONG z1, z2, z3;
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[6 * 6];         /* buffers data between passes */
   SHIFT_TEMPS
@@ -694,17 +694,17 @@ jpeg_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_5x5(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JCOEFPTR coef_block, JSAMPARRAY output_buf,
-              JDIMENSION output_col)
+_jpeg_idct_5x5(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+               JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp10, tmp11, tmp12;
   JLONG z1, z2, z3;
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[5 * 5];         /* buffers data between passes */
   SHIFT_TEMPS
@@ -809,16 +809,16 @@ jpeg_idct_5x5(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_3x3(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JCOEFPTR coef_block, JSAMPARRAY output_buf,
-              JDIMENSION output_col)
+_jpeg_idct_3x3(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+               JDIMENSION output_col)
 {
   JLONG tmp0, tmp2, tmp10, tmp12;
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[3 * 3];         /* buffers data between passes */
   SHIFT_TEMPS
@@ -899,17 +899,17 @@ jpeg_idct_3x3(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_9x9(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JCOEFPTR coef_block, JSAMPARRAY output_buf,
-              JDIMENSION output_col)
+_jpeg_idct_9x9(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+               JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
   JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[8 * 9];         /* buffers data between passes */
   SHIFT_TEMPS
@@ -1070,9 +1070,9 @@ jpeg_idct_9x9(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+_jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24;
@@ -1080,8 +1080,8 @@ jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[8 * 10];        /* buffers data between passes */
   SHIFT_TEMPS
@@ -1265,9 +1265,9 @@ jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_11x11(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+_jpeg_idct_11x11(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
@@ -1275,8 +1275,8 @@ jpeg_idct_11x11(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[8 * 11];        /* buffers data between passes */
   SHIFT_TEMPS
@@ -1459,9 +1459,9 @@ jpeg_idct_11x11(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+_jpeg_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
@@ -1469,8 +1469,8 @@ jpeg_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[8 * 12];        /* buffers data between passes */
   SHIFT_TEMPS
@@ -1675,9 +1675,9 @@ jpeg_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_13x13(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+_jpeg_idct_13x13(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
@@ -1685,8 +1685,8 @@ jpeg_idct_13x13(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[8 * 13];        /* buffers data between passes */
   SHIFT_TEMPS
@@ -1903,9 +1903,9 @@ jpeg_idct_13x13(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_14x14(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+_jpeg_idct_14x14(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
@@ -1913,8 +1913,8 @@ jpeg_idct_14x14(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[8 * 14];        /* buffers data between passes */
   SHIFT_TEMPS
@@ -2129,9 +2129,9 @@ jpeg_idct_14x14(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_15x15(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+_jpeg_idct_15x15(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -2139,8 +2139,8 @@ jpeg_idct_15x15(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[8 * 15];        /* buffers data between passes */
   SHIFT_TEMPS
@@ -2371,9 +2371,9 @@ jpeg_idct_15x15(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_16x16(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+_jpeg_idct_16x16(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -2381,8 +2381,8 @@ jpeg_idct_16x16(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[8 * 16];        /* buffers data between passes */
   SHIFT_TEMPS
diff --git a/3rdparty/libjpeg-turbo/src/jidctred.c b/3rdparty/libjpeg-turbo/src/jidctred.c
index 1dd65a94d975..6521e3ebbfc7 100644
--- a/3rdparty/libjpeg-turbo/src/jidctred.c
+++ b/3rdparty/libjpeg-turbo/src/jidctred.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -118,17 +118,17 @@
  */
 
 GLOBAL(void)
-jpeg_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JCOEFPTR coef_block, JSAMPARRAY output_buf,
-              JDIMENSION output_col)
+_jpeg_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+               JDIMENSION output_col)
 {
   JLONG tmp0, tmp2, tmp10, tmp12;
   JLONG z1, z2, z3, z4;
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[DCTSIZE * 4];   /* buffers data between passes */
   SHIFT_TEMPS
@@ -210,8 +210,8 @@ jpeg_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 &&
         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
-                                               PASS1_BITS + 3) & RANGE_MASK];
+      _JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+                                                PASS1_BITS + 3) & RANGE_MASK];
 
       outptr[0] = dcval;
       outptr[1] = dcval;
@@ -276,16 +276,16 @@ jpeg_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JCOEFPTR coef_block, JSAMPARRAY output_buf,
-              JDIMENSION output_col)
+_jpeg_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+               JDIMENSION output_col)
 {
   JLONG tmp0, tmp10, z1;
   JCOEFPTR inptr;
   ISLOW_MULT_TYPE *quantptr;
   int *wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPROW outptr;
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
   int workspace[DCTSIZE * 2];   /* buffers data between passes */
   SHIFT_TEMPS
@@ -345,8 +345,8 @@ jpeg_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 #ifndef NO_ZERO_ROW_TEST
     if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
-                                               PASS1_BITS + 3) & RANGE_MASK];
+      _JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+                                                PASS1_BITS + 3) & RANGE_MASK];
 
       outptr[0] = dcval;
       outptr[1] = dcval;
@@ -387,13 +387,13 @@ jpeg_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_1x1(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JCOEFPTR coef_block, JSAMPARRAY output_buf,
-              JDIMENSION output_col)
+_jpeg_idct_1x1(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
+               JDIMENSION output_col)
 {
   int dcval;
   ISLOW_MULT_TYPE *quantptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   SHIFT_TEMPS
 
   /* We hardly need an inverse DCT routine for this: just take the
diff --git a/3rdparty/libjpeg-turbo/src/jinclude.h b/3rdparty/libjpeg-turbo/src/jinclude.h
index 120614b25cf3..56e7a4b296d2 100644
--- a/3rdparty/libjpeg-turbo/src/jinclude.h
+++ b/3rdparty/libjpeg-turbo/src/jinclude.h
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1994, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2022, D. R. Commander.
+ * Copyright (C) 2022-2023, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -45,6 +45,18 @@
  */
 
 
+#ifdef _MSC_VER
+
+#define SNPRINTF(str, n, format, ...) \
+  _snprintf_s(str, n, _TRUNCATE, format, ##__VA_ARGS__)
+
+#else
+
+#define SNPRINTF  snprintf
+
+#endif
+
+
 #ifndef NO_GETENV
 
 #ifdef _MSC_VER
@@ -111,6 +123,8 @@ static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
 
 #else
 
+#include <errno.h>
+
 /* This provides a similar interface to the Microsoft _putenv_s() function, but
  * other than parameter validation, it has no advantages over setenv().
  */
diff --git a/3rdparty/libjpeg-turbo/src/jlossls.h b/3rdparty/libjpeg-turbo/src/jlossls.h
new file mode 100644
index 000000000000..ce4170413452
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jlossls.h
@@ -0,0 +1,101 @@
+/*
+ * jlossls.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1998, Thomas G. Lane.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This include file contains common declarations for the lossless JPEG
+ * codec modules.
+ */
+
+#ifndef JLOSSLS_H
+#define JLOSSLS_H
+
+#if defined(C_LOSSLESS_SUPPORTED) || defined(D_LOSSLESS_SUPPORTED)
+
+#define JPEG_INTERNALS
+#include "jpeglib.h"
+#include "jsamplecomp.h"
+
+
+#define ALLOC_DARRAY(pool_id, diffsperrow, numrows) \
+  (JDIFFARRAY)(*cinfo->mem->alloc_sarray) \
+    ((j_common_ptr)cinfo, pool_id, \
+     (diffsperrow) * sizeof(JDIFF) / sizeof(_JSAMPLE), numrows)
+
+
+/*
+ * Table H.1: Predictors for lossless coding.
+ */
+
+#define PREDICTOR1  Ra
+#define PREDICTOR2  Rb
+#define PREDICTOR3  Rc
+#define PREDICTOR4  (int)((JLONG)Ra + (JLONG)Rb - (JLONG)Rc)
+#define PREDICTOR5  (int)((JLONG)Ra + RIGHT_SHIFT((JLONG)Rb - (JLONG)Rc, 1))
+#define PREDICTOR6  (int)((JLONG)Rb + RIGHT_SHIFT((JLONG)Ra - (JLONG)Rc, 1))
+#define PREDICTOR7  (int)RIGHT_SHIFT((JLONG)Ra + (JLONG)Rb, 1)
+
+#endif
+
+
+#ifdef C_LOSSLESS_SUPPORTED
+
+typedef void (*predict_difference_method_ptr) (j_compress_ptr cinfo, int ci,
+                                               _JSAMPROW input_buf,
+                                               _JSAMPROW prev_row,
+                                               JDIFFROW diff_buf,
+                                               JDIMENSION width);
+
+/* Lossless compressor */
+typedef struct {
+  struct jpeg_forward_dct pub;  /* public fields */
+
+  /* It is useful to allow each component to have a separate diff method. */
+  predict_difference_method_ptr predict_difference[MAX_COMPONENTS];
+
+  /* MCU rows left in the restart interval for each component */
+  unsigned int restart_rows_to_go[MAX_COMPONENTS];
+
+  /* Sample scaling */
+  void (*scaler_scale) (j_compress_ptr cinfo, _JSAMPROW input_buf,
+                        _JSAMPROW output_buf, JDIMENSION width);
+} jpeg_lossless_compressor;
+
+typedef jpeg_lossless_compressor *lossless_comp_ptr;
+
+#endif /* C_LOSSLESS_SUPPORTED */
+
+
+#ifdef D_LOSSLESS_SUPPORTED
+
+typedef void (*predict_undifference_method_ptr) (j_decompress_ptr cinfo,
+                                                 int comp_index,
+                                                 JDIFFROW diff_buf,
+                                                 JDIFFROW prev_row,
+                                                 JDIFFROW undiff_buf,
+                                                 JDIMENSION width);
+
+/* Lossless decompressor */
+typedef struct {
+  struct jpeg_inverse_dct pub;  /* public fields */
+
+  /* It is useful to allow each component to have a separate undiff method. */
+  predict_undifference_method_ptr predict_undifference[MAX_COMPONENTS];
+
+  /* Sample scaling */
+  void (*scaler_scale) (j_decompress_ptr cinfo, JDIFFROW diff_buf,
+                        _JSAMPROW output_buf, JDIMENSION width);
+} jpeg_lossless_decompressor;
+
+typedef jpeg_lossless_decompressor *lossless_decomp_ptr;
+
+#endif /* D_LOSSLESS_SUPPORTED */
+
+#endif /* JLOSSLS_H */
diff --git a/3rdparty/libjpeg-turbo/src/jmemmgr.c b/3rdparty/libjpeg-turbo/src/jmemmgr.c
index 8f5a4ab1c78b..dca8f5c22ca7 100644
--- a/3rdparty/libjpeg-turbo/src/jmemmgr.c
+++ b/3rdparty/libjpeg-turbo/src/jmemmgr.c
@@ -68,10 +68,13 @@ round_up_pow2(size_t a, size_t b)
  * There isn't any really portable way to determine the worst-case alignment
  * requirement.  This module assumes that the alignment requirement is
  * multiples of ALIGN_SIZE.
- * By default, we define ALIGN_SIZE as sizeof(double).  This is necessary on
- * some workstations (where doubles really do need 8-byte alignment) and will
- * work fine on nearly everything.  If your machine has lesser alignment needs,
- * you can save a few bytes by making ALIGN_SIZE smaller.
+ * By default, we define ALIGN_SIZE as the maximum of sizeof(double) and
+ * sizeof(void *).  This is necessary on some workstations (where doubles
+ * really do need 8-byte alignment) and will work fine on nearly everything.
+ * We use the maximum of sizeof(double) and sizeof(void *) since sizeof(double)
+ * may be insufficient, for example, on CHERI-enabled platforms with 16-byte
+ * pointers and a 16-byte alignment requirement.  If your machine has lesser
+ * alignment needs, you can save a few bytes by making ALIGN_SIZE smaller.
  * The only place I know of where this will NOT work is certain Macintosh
  * 680x0 compilers that define double as a 10-byte IEEE extended float.
  * Doing 10-byte alignment is counterproductive because longwords won't be
@@ -81,7 +84,7 @@ round_up_pow2(size_t a, size_t b)
 
 #ifndef ALIGN_SIZE              /* so can override from jconfig.h */
 #ifndef WITH_SIMD
-#define ALIGN_SIZE  sizeof(double)
+#define ALIGN_SIZE  MAX(sizeof(void *), sizeof(double))
 #else
 #define ALIGN_SIZE  32 /* Most of the SIMD instructions we support require
                           16-byte (128-bit) alignment, but AVX2 requires
@@ -152,7 +155,9 @@ typedef my_memory_mgr *my_mem_ptr;
  */
 
 struct jvirt_sarray_control {
-  JSAMPARRAY mem_buffer;        /* => the in-memory buffer */
+  JSAMPARRAY mem_buffer;        /* => the in-memory buffer (if
+                                   cinfo->data_precision is 12, then this is
+                                   actually a J12SAMPARRAY) */
   JDIMENSION rows_in_array;     /* total virtual array height */
   JDIMENSION samplesperrow;     /* width of array (and of memory buffer) */
   JDIMENSION maxaccess;         /* max rows accessed by access_virt_sarray */
@@ -348,9 +353,10 @@ alloc_small(j_common_ptr cinfo, int pool_id, size_t sizeofobject)
  * request is large enough that it may as well be passed directly to
  * jpeg_get_large; the pool management just links everything together
  * so that we can free it all on demand.
- * Note: the major use of "large" objects is in JSAMPARRAY and JBLOCKARRAY
- * structures.  The routines that create these structures (see below)
- * deliberately bunch rows together to ensure a large request size.
+ * Note: the major use of "large" objects is in
+ * JSAMPARRAY/J12SAMPARRAY/J16SAMPARRAY and JBLOCKARRAY structures.  The
+ * routines that create these structures (see below) deliberately bunch rows
+ * together to ensure a large request size.
  */
 
 METHODDEF(void *)
@@ -434,9 +440,22 @@ alloc_sarray(j_common_ptr cinfo, int pool_id, JDIMENSION samplesperrow,
   JSAMPROW workspace;
   JDIMENSION rowsperchunk, currow, i;
   long ltemp;
+  J12SAMPARRAY result12;
+  J12SAMPROW workspace12;
+#if defined(C_LOSSLESS_SUPPORTED) || defined(D_LOSSLESS_SUPPORTED)
+  J16SAMPARRAY result16;
+  J16SAMPROW workspace16;
+#endif
+  int data_precision = cinfo->is_decompressor ?
+                        ((j_decompress_ptr)cinfo)->data_precision :
+                        ((j_compress_ptr)cinfo)->data_precision;
+  size_t sample_size = data_precision == 16 ?
+                       sizeof(J16SAMPLE) : (data_precision == 12 ?
+                                            sizeof(J12SAMPLE) :
+                                            sizeof(JSAMPLE));
 
   /* Make sure each row is properly aligned */
-  if ((ALIGN_SIZE % sizeof(JSAMPLE)) != 0)
+  if ((ALIGN_SIZE % sample_size) != 0)
     out_of_memory(cinfo, 5);    /* safety check */
 
   if (samplesperrow > MAX_ALLOC_CHUNK) {
@@ -445,11 +464,11 @@ alloc_sarray(j_common_ptr cinfo, int pool_id, JDIMENSION samplesperrow,
     out_of_memory(cinfo, 9);
   }
   samplesperrow = (JDIMENSION)round_up_pow2(samplesperrow, (2 * ALIGN_SIZE) /
-                                                           sizeof(JSAMPLE));
+                                                           sample_size);
 
   /* Calculate max # of rows allowed in one allocation chunk */
   ltemp = (MAX_ALLOC_CHUNK - sizeof(large_pool_hdr)) /
-          ((long)samplesperrow * sizeof(JSAMPLE));
+          ((long)samplesperrow * (long)sample_size);
   if (ltemp <= 0)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
   if (ltemp < (long)numrows)
@@ -458,24 +477,68 @@ alloc_sarray(j_common_ptr cinfo, int pool_id, JDIMENSION samplesperrow,
     rowsperchunk = numrows;
   mem->last_rowsperchunk = rowsperchunk;
 
-  /* Get space for row pointers (small object) */
-  result = (JSAMPARRAY)alloc_small(cinfo, pool_id,
-                                   (size_t)(numrows * sizeof(JSAMPROW)));
+  if (data_precision == 16) {
+#if defined(C_LOSSLESS_SUPPORTED) || defined(D_LOSSLESS_SUPPORTED)
+    /* Get space for row pointers (small object) */
+    result16 = (J16SAMPARRAY)alloc_small(cinfo, pool_id,
+                                         (size_t)(numrows *
+                                                  sizeof(J16SAMPROW)));
+
+    /* Get the rows themselves (large objects) */
+    currow = 0;
+    while (currow < numrows) {
+      rowsperchunk = MIN(rowsperchunk, numrows - currow);
+      workspace16 = (J16SAMPROW)alloc_large(cinfo, pool_id,
+        (size_t)((size_t)rowsperchunk * (size_t)samplesperrow * sample_size));
+      for (i = rowsperchunk; i > 0; i--) {
+        result16[currow++] = workspace16;
+        workspace16 += samplesperrow;
+      }
+    }
 
-  /* Get the rows themselves (large objects) */
-  currow = 0;
-  while (currow < numrows) {
-    rowsperchunk = MIN(rowsperchunk, numrows - currow);
-    workspace = (JSAMPROW)alloc_large(cinfo, pool_id,
-      (size_t)((size_t)rowsperchunk * (size_t)samplesperrow *
-               sizeof(JSAMPLE)));
-    for (i = rowsperchunk; i > 0; i--) {
-      result[currow++] = workspace;
-      workspace += samplesperrow;
+    return (JSAMPARRAY)result16;
+#else
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, data_precision);
+    return NULL;
+#endif
+  } else if (data_precision == 12) {
+    /* Get space for row pointers (small object) */
+    result12 = (J12SAMPARRAY)alloc_small(cinfo, pool_id,
+                                         (size_t)(numrows *
+                                                  sizeof(J12SAMPROW)));
+
+    /* Get the rows themselves (large objects) */
+    currow = 0;
+    while (currow < numrows) {
+      rowsperchunk = MIN(rowsperchunk, numrows - currow);
+      workspace12 = (J12SAMPROW)alloc_large(cinfo, pool_id,
+        (size_t)((size_t)rowsperchunk * (size_t)samplesperrow * sample_size));
+      for (i = rowsperchunk; i > 0; i--) {
+        result12[currow++] = workspace12;
+        workspace12 += samplesperrow;
+      }
     }
-  }
 
-  return result;
+    return (JSAMPARRAY)result12;
+  } else {
+    /* Get space for row pointers (small object) */
+    result = (JSAMPARRAY)alloc_small(cinfo, pool_id,
+                                     (size_t)(numrows * sizeof(JSAMPROW)));
+
+    /* Get the rows themselves (large objects) */
+    currow = 0;
+    while (currow < numrows) {
+      rowsperchunk = MIN(rowsperchunk, numrows - currow);
+      workspace = (JSAMPROW)alloc_large(cinfo, pool_id,
+        (size_t)((size_t)rowsperchunk * (size_t)samplesperrow * sample_size));
+      for (i = rowsperchunk; i > 0; i--) {
+        result[currow++] = workspace;
+        workspace += samplesperrow;
+      }
+    }
+
+    return result;
+  }
 }
 
 
@@ -637,6 +700,13 @@ realize_virt_arrays(j_common_ptr cinfo)
   size_t minheights, max_minheights;
   jvirt_sarray_ptr sptr;
   jvirt_barray_ptr bptr;
+  int data_precision = cinfo->is_decompressor ?
+                        ((j_decompress_ptr)cinfo)->data_precision :
+                        ((j_compress_ptr)cinfo)->data_precision;
+  size_t sample_size = data_precision == 16 ?
+                       sizeof(J16SAMPLE) : (data_precision == 12 ?
+                                            sizeof(J12SAMPLE) :
+                                            sizeof(JSAMPLE));
 
   /* Compute the minimum space needed (maxaccess rows in each buffer)
    * and the maximum space needed (full image height in each buffer).
@@ -647,10 +717,10 @@ realize_virt_arrays(j_common_ptr cinfo)
   for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
     if (sptr->mem_buffer == NULL) { /* if not realized yet */
       size_t new_space = (long)sptr->rows_in_array *
-                         (long)sptr->samplesperrow * sizeof(JSAMPLE);
+                         (long)sptr->samplesperrow * sample_size;
 
       space_per_minheight += (long)sptr->maxaccess *
-                             (long)sptr->samplesperrow * sizeof(JSAMPLE);
+                             (long)sptr->samplesperrow * sample_size;
       if (SIZE_MAX - maximum_space < new_space)
         out_of_memory(cinfo, 10);
       maximum_space += new_space;
@@ -705,7 +775,7 @@ realize_virt_arrays(j_common_ptr cinfo)
         jpeg_open_backing_store(cinfo, &sptr->b_s_info,
                                 (long)sptr->rows_in_array *
                                 (long)sptr->samplesperrow *
-                                (long)sizeof(JSAMPLE));
+                                (long)sample_size);
         sptr->b_s_open = TRUE;
       }
       sptr->mem_buffer = alloc_sarray(cinfo, JPOOL_IMAGE,
@@ -748,8 +818,15 @@ do_sarray_io(j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
 /* Do backing store read or write of a virtual sample array */
 {
   long bytesperrow, file_offset, byte_count, rows, thisrow, i;
-
-  bytesperrow = (long)ptr->samplesperrow * sizeof(JSAMPLE);
+  int data_precision = cinfo->is_decompressor ?
+                        ((j_decompress_ptr)cinfo)->data_precision :
+                        ((j_compress_ptr)cinfo)->data_precision;
+  size_t sample_size = data_precision == 16 ?
+                       sizeof(J16SAMPLE) : (data_precision == 12 ?
+                                            sizeof(J12SAMPLE) :
+                                            sizeof(JSAMPLE));
+
+  bytesperrow = (long)ptr->samplesperrow * (long)sample_size;
   file_offset = ptr->cur_start_row * bytesperrow;
   /* Loop to read or write each allocation chunk in mem_buffer */
   for (i = 0; i < (long)ptr->rows_in_mem; i += ptr->rowsperchunk) {
@@ -763,14 +840,42 @@ do_sarray_io(j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
     if (rows <= 0)              /* this chunk might be past end of file! */
       break;
     byte_count = rows * bytesperrow;
-    if (writing)
-      (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
-                                            (void *)ptr->mem_buffer[i],
-                                            file_offset, byte_count);
-    else
-      (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
-                                           (void *)ptr->mem_buffer[i],
-                                           file_offset, byte_count);
+    if (data_precision == 16) {
+#if defined(C_LOSSLESS_SUPPORTED) || defined(D_LOSSLESS_SUPPORTED)
+      J16SAMPARRAY mem_buffer16 = (J16SAMPARRAY)ptr->mem_buffer;
+
+      if (writing)
+        (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+                                              (void *)mem_buffer16[i],
+                                              file_offset, byte_count);
+      else
+        (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+                                             (void *)mem_buffer16[i],
+                                             file_offset, byte_count);
+#else
+      ERREXIT1(cinfo, JERR_BAD_PRECISION, data_precision);
+#endif
+    } else if (data_precision == 12) {
+      J12SAMPARRAY mem_buffer12 = (J12SAMPARRAY)ptr->mem_buffer;
+
+      if (writing)
+        (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+                                              (void *)mem_buffer12[i],
+                                              file_offset, byte_count);
+      else
+        (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+                                             (void *)mem_buffer12[i],
+                                             file_offset, byte_count);
+    } else {
+      if (writing)
+        (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+                                              (void *)ptr->mem_buffer[i],
+                                              file_offset, byte_count);
+      else
+        (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+                                             (void *)ptr->mem_buffer[i],
+                                             file_offset, byte_count);
+    }
     file_offset += byte_count;
   }
 }
@@ -818,6 +923,13 @@ access_virt_sarray(j_common_ptr cinfo, jvirt_sarray_ptr ptr,
 {
   JDIMENSION end_row = start_row + num_rows;
   JDIMENSION undef_row;
+  int data_precision = cinfo->is_decompressor ?
+                        ((j_decompress_ptr)cinfo)->data_precision :
+                        ((j_compress_ptr)cinfo)->data_precision;
+  size_t sample_size = data_precision == 16 ?
+                       sizeof(J16SAMPLE) : (data_precision == 12 ?
+                                            sizeof(J12SAMPLE) :
+                                            sizeof(JSAMPLE));
 
   /* debugging check */
   if (end_row > ptr->rows_in_array || num_rows > ptr->maxaccess ||
@@ -873,7 +985,7 @@ access_virt_sarray(j_common_ptr cinfo, jvirt_sarray_ptr ptr,
     if (writable)
       ptr->first_undef_row = end_row;
     if (ptr->pre_zero) {
-      size_t bytesperrow = (size_t)ptr->samplesperrow * sizeof(JSAMPLE);
+      size_t bytesperrow = (size_t)ptr->samplesperrow * sample_size;
       undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
       end_row -= ptr->cur_start_row;
       while (undef_row < end_row) {
diff --git a/3rdparty/libjpeg-turbo/src/jmemsys.h b/3rdparty/libjpeg-turbo/src/jmemsys.h
index 9229550afde0..ac09ef4c36d3 100644
--- a/3rdparty/libjpeg-turbo/src/jmemsys.h
+++ b/3rdparty/libjpeg-turbo/src/jmemsys.h
@@ -99,24 +99,6 @@ EXTERN(size_t) jpeg_mem_available(j_common_ptr cinfo, size_t min_bytes_needed,
 #define TEMP_NAME_LENGTH   64   /* max length of a temporary file's name */
 
 
-#ifdef USE_MSDOS_MEMMGR         /* DOS-specific junk */
-
-typedef unsigned short XMSH;    /* type of extended-memory handles */
-typedef unsigned short EMSH;    /* type of expanded-memory handles */
-
-typedef union {
-  short file_handle;            /* DOS file handle if it's a temp file */
-  XMSH xms_handle;              /* handle if it's a chunk of XMS */
-  EMSH ems_handle;              /* handle if it's a chunk of EMS */
-} handle_union;
-
-#endif /* USE_MSDOS_MEMMGR */
-
-#ifdef USE_MAC_MEMMGR           /* Mac-specific junk */
-#include <Files.h>
-#endif /* USE_MAC_MEMMGR */
-
-
 typedef struct backing_store_struct *backing_store_ptr;
 
 typedef struct backing_store_struct {
@@ -130,22 +112,9 @@ typedef struct backing_store_struct {
   void (*close_backing_store) (j_common_ptr cinfo, backing_store_ptr info);
 
   /* Private fields for system-dependent backing-store management */
-#ifdef USE_MSDOS_MEMMGR
-  /* For the MS-DOS manager (jmemdos.c), we need: */
-  handle_union handle;          /* reference to backing-store storage object */
-  char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */
-#else
-#ifdef USE_MAC_MEMMGR
-  /* For the Mac manager (jmemmac.c), we need: */
-  short temp_file;              /* file reference number to temp file */
-  FSSpec tempSpec;              /* the FSSpec for the temp file */
-  char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */
-#else
   /* For a typical implementation with temp files, we need: */
   FILE *temp_file;              /* stdio reference to temp file */
   char temp_name[TEMP_NAME_LENGTH]; /* name of temp file */
-#endif
-#endif
 } backing_store_info;
 
 
diff --git a/3rdparty/libjpeg-turbo/src/jmorecfg.h b/3rdparty/libjpeg-turbo/src/jmorecfg.h
index b33a991914ee..89c7842c8716 100644
--- a/3rdparty/libjpeg-turbo/src/jmorecfg.h
+++ b/3rdparty/libjpeg-turbo/src/jmorecfg.h
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2011, 2014-2015, 2018, 2020, D. R. Commander.
+ * Copyright (C) 2009, 2011, 2014-2015, 2018, 2020, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -41,31 +43,29 @@
  * arrays is very slow on your hardware, you might want to change these.
  */
 
-#if BITS_IN_JSAMPLE == 8
-/* JSAMPLE should be the smallest type that will hold the values 0..255.
- */
+/* JSAMPLE should be the smallest type that will hold the values 0..255. */
 
 typedef unsigned char JSAMPLE;
 #define GETJSAMPLE(value)  ((int)(value))
 
-#define MAXJSAMPLE      255
-#define CENTERJSAMPLE   128
+#define MAXJSAMPLE       255
+#define CENTERJSAMPLE    128
 
-#endif /* BITS_IN_JSAMPLE == 8 */
 
+/* J12SAMPLE should be the smallest type that will hold the values 0..4095. */
 
-#if BITS_IN_JSAMPLE == 12
-/* JSAMPLE should be the smallest type that will hold the values 0..4095.
- * On nearly all machines "short" will do nicely.
- */
+typedef short J12SAMPLE;
 
-typedef short JSAMPLE;
-#define GETJSAMPLE(value)  ((int)(value))
+#define MAXJ12SAMPLE     4095
+#define CENTERJ12SAMPLE  2048
+
+
+/* J16SAMPLE should be the smallest type that will hold the values 0..65535. */
 
-#define MAXJSAMPLE      4095
-#define CENTERJSAMPLE   2048
+typedef unsigned short J16SAMPLE;
 
-#endif /* BITS_IN_JSAMPLE == 12 */
+#define MAXJ16SAMPLE     65535
+#define CENTERJ16SAMPLE  32768
 
 
 /* Representation of a DCT frequency coefficient.
@@ -242,14 +242,16 @@ typedef int boolean;
 
 #define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
 #define C_PROGRESSIVE_SUPPORTED     /* Progressive JPEG? (Requires MULTISCAN)*/
+#define C_LOSSLESS_SUPPORTED        /* Lossless JPEG? */
 #define ENTROPY_OPT_SUPPORTED       /* Optimization of entropy coding parms? */
 /* Note: if you selected 12-bit data precision, it is dangerous to turn off
  * ENTROPY_OPT_SUPPORTED.  The standard Huffman tables are only good for 8-bit
  * precision, so jchuff.c normally uses entropy optimization to compute
  * usable tables for higher precision.  If you don't want to do optimization,
  * you'll have to supply different default Huffman tables.
- * The exact same statements apply for progressive JPEG: the default tables
- * don't work for progressive mode.  (This may get fixed, however.)
+ * The exact same statements apply for progressive and lossless JPEG:
+ * the default tables don't work for progressive mode or lossless mode.
+ * (This may get fixed, however.)
  */
 #define INPUT_SMOOTHING_SUPPORTED   /* Input image smoothing option? */
 
@@ -257,6 +259,7 @@ typedef int boolean;
 
 #define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
 #define D_PROGRESSIVE_SUPPORTED     /* Progressive JPEG? (Requires MULTISCAN)*/
+#define D_LOSSLESS_SUPPORTED        /* Lossless JPEG? */
 #define SAVE_MARKERS_SUPPORTED      /* jpeg_save_markers() needed? */
 #define BLOCK_SMOOTHING_SUPPORTED   /* Block smoothing? (Progressive only) */
 #define IDCT_SCALING_SUPPORTED      /* Output rescaling via IDCT? */
diff --git a/3rdparty/libjpeg-turbo/src/jpeg_nbits_table.h b/3rdparty/libjpeg-turbo/src/jpeg_nbits.c
similarity index 99%
rename from 3rdparty/libjpeg-turbo/src/jpeg_nbits_table.h
rename to 3rdparty/libjpeg-turbo/src/jpeg_nbits.c
index fcf73878c318..c8ee6b056cbf 100644
--- a/3rdparty/libjpeg-turbo/src/jpeg_nbits_table.h
+++ b/3rdparty/libjpeg-turbo/src/jpeg_nbits.c
@@ -1,4 +1,32 @@
-static const unsigned char jpeg_nbits_table[65536] = {
+/*
+ * Copyright (C) 2024, D. R. Commander.
+ *
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#include "jpeg_nbits.h"
+#include "jconfigint.h"
+
+
+#ifndef USE_CLZ_INTRINSIC
+
+#define INCLUDE_JPEG_NBITS_TABLE
+
+/* When building for x86[-64] with the SIMD extensions enabled, the C Huffman
+ * encoders can reuse jpeg_nbits_table from the SSE2 baseline Huffman encoder.
+ */
+#if (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || \
+     defined(_M_X64)) && defined(WITH_SIMD)
+#undef INCLUDE_JPEG_NBITS_TABLE
+#endif
+
+#endif
+
+
+#ifdef INCLUDE_JPEG_NBITS_TABLE
+
+const unsigned char HIDDEN jpeg_nbits_table[65536] = {
    0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
    5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
    6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
@@ -4096,3 +4124,11 @@ static const unsigned char jpeg_nbits_table[65536] = {
   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 };
+
+#else
+
+/* Suppress compiler warnings about empty translation unit. */
+
+typedef int dummy_jpeg_nbits_table;
+
+#endif
diff --git a/3rdparty/libjpeg-turbo/src/jpeg_nbits.h b/3rdparty/libjpeg-turbo/src/jpeg_nbits.h
new file mode 100644
index 000000000000..6481a1228d18
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jpeg_nbits.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2014, 2021, 2024, D. R. Commander.
+ * Copyright (C) 2014, Olle Liljenzin.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/*
+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
+ * used for bit counting rather than the lookup table.  This will reduce the
+ * memory footprint by 64k, which is important for some mobile applications
+ * that create many isolated instances of libjpeg-turbo (web browsers, for
+ * instance.)  This may improve performance on some mobile platforms as well.
+ * This feature is enabled by default only on Arm processors, because some x86
+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
+ * shown to have a significant performance impact even on the x86 chips that
+ * have a fast implementation of it.  When building for Armv6, you can
+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
+ * flags (this defines __thumb__).
+ */
+
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+    defined(_M_ARM) || defined(_M_ARM64)
+#if !defined(__thumb__) || defined(__thumb2__)
+#define USE_CLZ_INTRINSIC
+#endif
+#endif
+
+#ifdef USE_CLZ_INTRINSIC
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x)  (32 - _CountLeadingZeros(x))
+#else
+#define JPEG_NBITS_NONZERO(x)  (32 - __builtin_clz(x))
+#endif
+#define JPEG_NBITS(x)          (x ? JPEG_NBITS_NONZERO(x) : 0)
+#else
+extern const unsigned char jpeg_nbits_table[65536];
+#define JPEG_NBITS(x)          (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x)  JPEG_NBITS(x)
+#endif
diff --git a/3rdparty/libjpeg-turbo/src/jpegcomp.h b/3rdparty/libjpeg-turbo/src/jpegapicomp.h
similarity index 98%
rename from 3rdparty/libjpeg-turbo/src/jpegcomp.h
rename to 3rdparty/libjpeg-turbo/src/jpegapicomp.h
index c4834ac0df9d..bb3912eb2f16 100644
--- a/3rdparty/libjpeg-turbo/src/jpegcomp.h
+++ b/3rdparty/libjpeg-turbo/src/jpegapicomp.h
@@ -1,5 +1,5 @@
 /*
- * jpegcomp.h
+ * jpegapicomp.h
  *
  * Copyright (C) 2010, 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
diff --git a/3rdparty/libjpeg-turbo/src/jpegint.h b/3rdparty/libjpeg-turbo/src/jpegint.h
index 6af9e2a179e8..654142014349 100644
--- a/3rdparty/libjpeg-turbo/src/jpegint.h
+++ b/3rdparty/libjpeg-turbo/src/jpegint.h
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, 2019, 2021, D. R. Commander.
+ * Copyright (C) 2015-2017, 2019, 2021-2022, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * Copyright (C) 2021, Alex Richardson.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -17,6 +19,17 @@
  */
 
 
+/* Representation of a spatial difference value.
+ * This should be a signed value of at least 16 bits; int is usually OK.
+ */
+
+typedef int JDIFF;
+
+typedef JDIFF FAR *JDIFFROW;    /* pointer to one row of difference values */
+typedef JDIFFROW *JDIFFARRAY;   /* ptr to some rows (a 2-D diff array) */
+typedef JDIFFARRAY *JDIFFIMAGE; /* a 3-D diff array: top index is color */
+
+
 /* Declarations for both compression & decompression */
 
 typedef enum {            /* Operating modes for buffer controllers */
@@ -61,6 +74,9 @@ typedef __UINTPTR_TYPE__ JUINTPTR;
 typedef size_t JUINTPTR;
 #endif
 
+#define IsExtRGB(cs) \
+  (cs == JCS_RGB || (cs >= JCS_EXT_RGB && cs <= JCS_EXT_ARGB))
+
 /*
  * Left shift macro that handles a negative operand without causing any
  * sanitizer warnings
@@ -80,6 +96,7 @@ struct jpeg_comp_master {
   /* State variables made visible to other modules */
   boolean call_pass_startup;    /* True if pass_startup must be called */
   boolean is_last_pass;         /* True during last pass */
+  boolean lossless;             /* True if lossless mode is enabled */
 };
 
 /* Main buffer control (downsampled-data buffer) */
@@ -87,6 +104,12 @@ struct jpeg_c_main_controller {
   void (*start_pass) (j_compress_ptr cinfo, J_BUF_MODE pass_mode);
   void (*process_data) (j_compress_ptr cinfo, JSAMPARRAY input_buf,
                         JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail);
+  void (*process_data_12) (j_compress_ptr cinfo, J12SAMPARRAY input_buf,
+                           JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail);
+#ifdef C_LOSSLESS_SUPPORTED
+  void (*process_data_16) (j_compress_ptr cinfo, J16SAMPARRAY input_buf,
+                           JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail);
+#endif
 };
 
 /* Compression preprocessing (downsampling input buffer control) */
@@ -97,12 +120,32 @@ struct jpeg_c_prep_controller {
                             JSAMPIMAGE output_buf,
                             JDIMENSION *out_row_group_ctr,
                             JDIMENSION out_row_groups_avail);
+  void (*pre_process_data_12) (j_compress_ptr cinfo, J12SAMPARRAY input_buf,
+                               JDIMENSION *in_row_ctr,
+                               JDIMENSION in_rows_avail,
+                               J12SAMPIMAGE output_buf,
+                               JDIMENSION *out_row_group_ctr,
+                               JDIMENSION out_row_groups_avail);
+#ifdef C_LOSSLESS_SUPPORTED
+  void (*pre_process_data_16) (j_compress_ptr cinfo, J16SAMPARRAY input_buf,
+                               JDIMENSION *in_row_ctr,
+                               JDIMENSION in_rows_avail,
+                               J16SAMPIMAGE output_buf,
+                               JDIMENSION *out_row_group_ctr,
+                               JDIMENSION out_row_groups_avail);
+#endif
 };
 
-/* Coefficient buffer control */
+/* Lossy mode: Coefficient buffer control
+ * Lossless mode: Difference buffer control
+ */
 struct jpeg_c_coef_controller {
   void (*start_pass) (j_compress_ptr cinfo, J_BUF_MODE pass_mode);
   boolean (*compress_data) (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+  boolean (*compress_data_12) (j_compress_ptr cinfo, J12SAMPIMAGE input_buf);
+#ifdef C_LOSSLESS_SUPPORTED
+  boolean (*compress_data_16) (j_compress_ptr cinfo, J16SAMPIMAGE input_buf);
+#endif
 };
 
 /* Colorspace conversion */
@@ -111,6 +154,14 @@ struct jpeg_color_converter {
   void (*color_convert) (j_compress_ptr cinfo, JSAMPARRAY input_buf,
                          JSAMPIMAGE output_buf, JDIMENSION output_row,
                          int num_rows);
+  void (*color_convert_12) (j_compress_ptr cinfo, J12SAMPARRAY input_buf,
+                            J12SAMPIMAGE output_buf, JDIMENSION output_row,
+                            int num_rows);
+#ifdef C_LOSSLESS_SUPPORTED
+  void (*color_convert_16) (j_compress_ptr cinfo, J16SAMPARRAY input_buf,
+                            J16SAMPIMAGE output_buf, JDIMENSION output_row,
+                            int num_rows);
+#endif
 };
 
 /* Downsampling */
@@ -119,24 +170,47 @@ struct jpeg_downsampler {
   void (*downsample) (j_compress_ptr cinfo, JSAMPIMAGE input_buf,
                       JDIMENSION in_row_index, JSAMPIMAGE output_buf,
                       JDIMENSION out_row_group_index);
+  void (*downsample_12) (j_compress_ptr cinfo, J12SAMPIMAGE input_buf,
+                         JDIMENSION in_row_index, J12SAMPIMAGE output_buf,
+                         JDIMENSION out_row_group_index);
+#ifdef C_LOSSLESS_SUPPORTED
+  void (*downsample_16) (j_compress_ptr cinfo, J16SAMPIMAGE input_buf,
+                         JDIMENSION in_row_index, J16SAMPIMAGE output_buf,
+                         JDIMENSION out_row_group_index);
+#endif
 
   boolean need_context_rows;    /* TRUE if need rows above & below */
 };
 
-/* Forward DCT (also controls coefficient quantization) */
+/* Lossy mode: Forward DCT (also controls coefficient quantization)
+ * Lossless mode: Prediction, sample differencing, and point transform
+ */
 struct jpeg_forward_dct {
   void (*start_pass) (j_compress_ptr cinfo);
+
+  /* Lossy mode */
   /* perhaps this should be an array??? */
   void (*forward_DCT) (j_compress_ptr cinfo, jpeg_component_info *compptr,
                        JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
                        JDIMENSION start_row, JDIMENSION start_col,
                        JDIMENSION num_blocks);
+  void (*forward_DCT_12) (j_compress_ptr cinfo, jpeg_component_info *compptr,
+                          J12SAMPARRAY sample_data, JBLOCKROW coef_blocks,
+                          JDIMENSION start_row, JDIMENSION start_col,
+                          JDIMENSION num_blocks);
 };
 
 /* Entropy encoding */
 struct jpeg_entropy_encoder {
   void (*start_pass) (j_compress_ptr cinfo, boolean gather_statistics);
+
+  /* Lossy mode */
   boolean (*encode_mcu) (j_compress_ptr cinfo, JBLOCKROW *MCU_data);
+  /* Lossless mode */
+  JDIMENSION (*encode_mcus) (j_compress_ptr cinfo, JDIFFIMAGE diff_buf,
+                             JDIMENSION MCU_row_num, JDIMENSION MCU_col_num,
+                             JDIMENSION nMCU);
+
   void (*finish_pass) (j_compress_ptr cinfo);
 };
 
@@ -164,6 +238,7 @@ struct jpeg_decomp_master {
 
   /* State variables made visible to other modules */
   boolean is_dummy_pass;        /* True during 1st pass for 2-pass quant */
+  boolean lossless;             /* True if decompressing a lossless image */
 
   /* Partial decompression variables */
   JDIMENSION first_iMCU_col;
@@ -193,14 +268,36 @@ struct jpeg_d_main_controller {
   void (*start_pass) (j_decompress_ptr cinfo, J_BUF_MODE pass_mode);
   void (*process_data) (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
                         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+  void (*process_data_12) (j_decompress_ptr cinfo, J12SAMPARRAY output_buf,
+                           JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+#ifdef D_LOSSLESS_SUPPORTED
+  void (*process_data_16) (j_decompress_ptr cinfo, J16SAMPARRAY output_buf,
+                           JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+#endif
 };
 
-/* Coefficient buffer control */
+/* Lossy mode: Coefficient buffer control
+ * Lossless mode: Difference buffer control
+ */
 struct jpeg_d_coef_controller {
   void (*start_input_pass) (j_decompress_ptr cinfo);
   int (*consume_data) (j_decompress_ptr cinfo);
   void (*start_output_pass) (j_decompress_ptr cinfo);
   int (*decompress_data) (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+  int (*decompress_data_12) (j_decompress_ptr cinfo, J12SAMPIMAGE output_buf);
+#ifdef D_LOSSLESS_SUPPORTED
+  int (*decompress_data_16) (j_decompress_ptr cinfo, J16SAMPIMAGE output_buf);
+#endif
+
+  /* These variables keep track of the current location of the input side. */
+  /* cinfo->input_iMCU_row is also used for this. */
+  JDIMENSION MCU_ctr;           /* counts MCUs processed in current row */
+  int MCU_vert_offset;          /* counts MCU rows within iMCU row */
+  int MCU_rows_per_iMCU_row;    /* number of such rows needed */
+
+  /* The output side's location is represented by cinfo->output_iMCU_row. */
+
+  /* Lossy mode */
   /* Pointer to array of coefficient virtual arrays, or NULL if none */
   jvirt_barray_ptr *coef_arrays;
 };
@@ -213,6 +310,20 @@ struct jpeg_d_post_controller {
                              JDIMENSION in_row_groups_avail,
                              JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
                              JDIMENSION out_rows_avail);
+  void (*post_process_data_12) (j_decompress_ptr cinfo, J12SAMPIMAGE input_buf,
+                                JDIMENSION *in_row_group_ctr,
+                                JDIMENSION in_row_groups_avail,
+                                J12SAMPARRAY output_buf,
+                                JDIMENSION *out_row_ctr,
+                                JDIMENSION out_rows_avail);
+#ifdef D_LOSSLESS_SUPPORTED
+  void (*post_process_data_16) (j_decompress_ptr cinfo, J16SAMPIMAGE input_buf,
+                                JDIMENSION *in_row_group_ctr,
+                                JDIMENSION in_row_groups_avail,
+                                J16SAMPARRAY output_buf,
+                                JDIMENSION *out_row_ctr,
+                                JDIMENSION out_rows_avail);
+#endif
 };
 
 /* Marker reading & parsing */
@@ -238,24 +349,42 @@ struct jpeg_marker_reader {
 /* Entropy decoding */
 struct jpeg_entropy_decoder {
   void (*start_pass) (j_decompress_ptr cinfo);
+
+  /* Lossy mode */
   boolean (*decode_mcu) (j_decompress_ptr cinfo, JBLOCKROW *MCU_data);
+  /* Lossless mode */
+  JDIMENSION (*decode_mcus) (j_decompress_ptr cinfo, JDIFFIMAGE diff_buf,
+                             JDIMENSION MCU_row_num, JDIMENSION MCU_col_num,
+                             JDIMENSION nMCU);
+  boolean (*process_restart) (j_decompress_ptr cinfo);
 
   /* This is here to share code between baseline and progressive decoders; */
   /* other modules probably should not use it */
   boolean insufficient_data;    /* set TRUE after emitting warning */
 };
 
-/* Inverse DCT (also performs dequantization) */
+/* Lossy mode: Inverse DCT (also performs dequantization)
+ * Lossless mode: Prediction, sample undifferencing, point transform, and
+ * sample size scaling
+ */
 typedef void (*inverse_DCT_method_ptr) (j_decompress_ptr cinfo,
                                         jpeg_component_info *compptr,
                                         JCOEFPTR coef_block,
                                         JSAMPARRAY output_buf,
                                         JDIMENSION output_col);
+typedef void (*inverse_DCT_12_method_ptr) (j_decompress_ptr cinfo,
+                                           jpeg_component_info *compptr,
+                                           JCOEFPTR coef_block,
+                                           J12SAMPARRAY output_buf,
+                                           JDIMENSION output_col);
 
 struct jpeg_inverse_dct {
   void (*start_pass) (j_decompress_ptr cinfo);
+
+  /* Lossy mode */
   /* It is useful to allow each component to have a separate IDCT method. */
   inverse_DCT_method_ptr inverse_DCT[MAX_COMPONENTS];
+  inverse_DCT_12_method_ptr inverse_DCT_12[MAX_COMPONENTS];
 };
 
 /* Upsampling (note that upsampler must also call color converter) */
@@ -265,6 +394,16 @@ struct jpeg_upsampler {
                     JDIMENSION *in_row_group_ctr,
                     JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
                     JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+  void (*upsample_12) (j_decompress_ptr cinfo, J12SAMPIMAGE input_buf,
+                       JDIMENSION *in_row_group_ctr,
+                       JDIMENSION in_row_groups_avail, J12SAMPARRAY output_buf,
+                       JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+#ifdef D_LOSSLESS_SUPPORTED
+  void (*upsample_16) (j_decompress_ptr cinfo, J16SAMPIMAGE input_buf,
+                       JDIMENSION *in_row_group_ctr,
+                       JDIMENSION in_row_groups_avail, J16SAMPARRAY output_buf,
+                       JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+#endif
 
   boolean need_context_rows;    /* TRUE if need rows above & below */
 };
@@ -275,6 +414,14 @@ struct jpeg_color_deconverter {
   void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                          JDIMENSION input_row, JSAMPARRAY output_buf,
                          int num_rows);
+  void (*color_convert_12) (j_decompress_ptr cinfo, J12SAMPIMAGE input_buf,
+                            JDIMENSION input_row, J12SAMPARRAY output_buf,
+                            int num_rows);
+#ifdef D_LOSSLESS_SUPPORTED
+  void (*color_convert_16) (j_decompress_ptr cinfo, J16SAMPIMAGE input_buf,
+                            JDIMENSION input_row, J16SAMPARRAY output_buf,
+                            int num_rows);
+#endif
 };
 
 /* Color quantization or color precision reduction */
@@ -282,6 +429,8 @@ struct jpeg_color_quantizer {
   void (*start_pass) (j_decompress_ptr cinfo, boolean is_pre_scan);
   void (*color_quantize) (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
                           JSAMPARRAY output_buf, int num_rows);
+  void (*color_quantize_12) (j_decompress_ptr cinfo, J12SAMPARRAY input_buf,
+                             J12SAMPARRAY output_buf, int num_rows);
   void (*finish_pass) (j_decompress_ptr cinfo);
   void (*new_color_map) (j_decompress_ptr cinfo);
 };
@@ -323,36 +472,95 @@ EXTERN(void) jinit_c_master_control(j_compress_ptr cinfo,
                                     boolean transcode_only);
 EXTERN(void) jinit_c_main_controller(j_compress_ptr cinfo,
                                      boolean need_full_buffer);
+EXTERN(void) j12init_c_main_controller(j_compress_ptr cinfo,
+                                       boolean need_full_buffer);
 EXTERN(void) jinit_c_prep_controller(j_compress_ptr cinfo,
                                      boolean need_full_buffer);
+EXTERN(void) j12init_c_prep_controller(j_compress_ptr cinfo,
+                                       boolean need_full_buffer);
 EXTERN(void) jinit_c_coef_controller(j_compress_ptr cinfo,
                                      boolean need_full_buffer);
+EXTERN(void) j12init_c_coef_controller(j_compress_ptr cinfo,
+                                       boolean need_full_buffer);
 EXTERN(void) jinit_color_converter(j_compress_ptr cinfo);
+EXTERN(void) j12init_color_converter(j_compress_ptr cinfo);
 EXTERN(void) jinit_downsampler(j_compress_ptr cinfo);
+EXTERN(void) j12init_downsampler(j_compress_ptr cinfo);
 EXTERN(void) jinit_forward_dct(j_compress_ptr cinfo);
+EXTERN(void) j12init_forward_dct(j_compress_ptr cinfo);
 EXTERN(void) jinit_huff_encoder(j_compress_ptr cinfo);
 EXTERN(void) jinit_phuff_encoder(j_compress_ptr cinfo);
 EXTERN(void) jinit_arith_encoder(j_compress_ptr cinfo);
 EXTERN(void) jinit_marker_writer(j_compress_ptr cinfo);
+#ifdef C_LOSSLESS_SUPPORTED
+EXTERN(void) j16init_c_main_controller(j_compress_ptr cinfo,
+                                       boolean need_full_buffer);
+EXTERN(void) j16init_c_prep_controller(j_compress_ptr cinfo,
+                                       boolean need_full_buffer);
+EXTERN(void) j16init_color_converter(j_compress_ptr cinfo);
+EXTERN(void) j16init_downsampler(j_compress_ptr cinfo);
+EXTERN(void) jinit_c_diff_controller(j_compress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) j12init_c_diff_controller(j_compress_ptr cinfo,
+                                       boolean need_full_buffer);
+EXTERN(void) j16init_c_diff_controller(j_compress_ptr cinfo,
+                                       boolean need_full_buffer);
+EXTERN(void) jinit_lhuff_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_lossless_compressor(j_compress_ptr cinfo);
+EXTERN(void) j12init_lossless_compressor(j_compress_ptr cinfo);
+EXTERN(void) j16init_lossless_compressor(j_compress_ptr cinfo);
+#endif
+
 /* Decompression module initialization routines */
 EXTERN(void) jinit_master_decompress(j_decompress_ptr cinfo);
 EXTERN(void) jinit_d_main_controller(j_decompress_ptr cinfo,
                                      boolean need_full_buffer);
+EXTERN(void) j12init_d_main_controller(j_decompress_ptr cinfo,
+                                       boolean need_full_buffer);
 EXTERN(void) jinit_d_coef_controller(j_decompress_ptr cinfo,
                                      boolean need_full_buffer);
+EXTERN(void) j12init_d_coef_controller(j_decompress_ptr cinfo,
+                                       boolean need_full_buffer);
 EXTERN(void) jinit_d_post_controller(j_decompress_ptr cinfo,
                                      boolean need_full_buffer);
+EXTERN(void) j12init_d_post_controller(j_decompress_ptr cinfo,
+                                       boolean need_full_buffer);
 EXTERN(void) jinit_input_controller(j_decompress_ptr cinfo);
 EXTERN(void) jinit_marker_reader(j_decompress_ptr cinfo);
 EXTERN(void) jinit_huff_decoder(j_decompress_ptr cinfo);
 EXTERN(void) jinit_phuff_decoder(j_decompress_ptr cinfo);
 EXTERN(void) jinit_arith_decoder(j_decompress_ptr cinfo);
 EXTERN(void) jinit_inverse_dct(j_decompress_ptr cinfo);
+EXTERN(void) j12init_inverse_dct(j_decompress_ptr cinfo);
 EXTERN(void) jinit_upsampler(j_decompress_ptr cinfo);
+EXTERN(void) j12init_upsampler(j_decompress_ptr cinfo);
 EXTERN(void) jinit_color_deconverter(j_decompress_ptr cinfo);
+EXTERN(void) j12init_color_deconverter(j_decompress_ptr cinfo);
 EXTERN(void) jinit_1pass_quantizer(j_decompress_ptr cinfo);
+EXTERN(void) j12init_1pass_quantizer(j_decompress_ptr cinfo);
 EXTERN(void) jinit_2pass_quantizer(j_decompress_ptr cinfo);
+EXTERN(void) j12init_2pass_quantizer(j_decompress_ptr cinfo);
 EXTERN(void) jinit_merged_upsampler(j_decompress_ptr cinfo);
+EXTERN(void) j12init_merged_upsampler(j_decompress_ptr cinfo);
+#ifdef D_LOSSLESS_SUPPORTED
+EXTERN(void) j16init_d_main_controller(j_decompress_ptr cinfo,
+                                       boolean need_full_buffer);
+EXTERN(void) j16init_d_post_controller(j_decompress_ptr cinfo,
+                                       boolean need_full_buffer);
+EXTERN(void) j16init_upsampler(j_decompress_ptr cinfo);
+EXTERN(void) j16init_color_deconverter(j_decompress_ptr cinfo);
+EXTERN(void) jinit_d_diff_controller(j_decompress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) j12init_d_diff_controller(j_decompress_ptr cinfo,
+                                       boolean need_full_buffer);
+EXTERN(void) j16init_d_diff_controller(j_decompress_ptr cinfo,
+                                       boolean need_full_buffer);
+EXTERN(void) jinit_lhuff_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_lossless_decompressor(j_decompress_ptr cinfo);
+EXTERN(void) j12init_lossless_decompressor(j_decompress_ptr cinfo);
+EXTERN(void) j16init_lossless_decompressor(j_decompress_ptr cinfo);
+#endif
+
 /* Memory manager initialization */
 EXTERN(void) jinit_memory_mgr(j_common_ptr cinfo);
 
@@ -362,6 +570,14 @@ EXTERN(long) jround_up(long a, long b);
 EXTERN(void) jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
                                JSAMPARRAY output_array, int dest_row,
                                int num_rows, JDIMENSION num_cols);
+EXTERN(void) j12copy_sample_rows(J12SAMPARRAY input_array, int source_row,
+                                 J12SAMPARRAY output_array, int dest_row,
+                                 int num_rows, JDIMENSION num_cols);
+#if defined(C_LOSSLESS_SUPPORTED) || defined(D_LOSSLESS_SUPPORTED)
+EXTERN(void) j16copy_sample_rows(J16SAMPARRAY input_array, int source_row,
+                                 J16SAMPARRAY output_array, int dest_row,
+                                 int num_rows, JDIMENSION num_cols);
+#endif
 EXTERN(void) jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
                              JDIMENSION num_blocks);
 EXTERN(void) jzero_far(void *target, size_t bytestozero);
diff --git a/3rdparty/libjpeg-turbo/src/jpeglib.h b/3rdparty/libjpeg-turbo/src/jpeglib.h
index d7664f063092..a59e98c25e58 100644
--- a/3rdparty/libjpeg-turbo/src/jpeglib.h
+++ b/3rdparty/libjpeg-turbo/src/jpeglib.h
@@ -4,8 +4,11 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2013-2014, 2016-2017, 2020, D. R. Commander.
+ * Copyright (C) 2009-2011, 2013-2014, 2016-2017, 2020, 2022-2023,
+             D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -43,6 +46,13 @@ extern "C" {
  * if you want to be compatible.
  */
 
+/* NOTE: In lossless mode, an MCU contains one or more samples rather than one
+ * or more 8x8 DCT blocks, so the term "data unit" is used to generically
+ * describe a sample in lossless mode or an 8x8 DCT block in lossy mode.  To
+ * preserve backward API/ABI compatibility, the field and macro names retain
+ * the "block" terminology.
+ */
+
 #define DCTSIZE             8   /* The basic DCT block is 8x8 samples */
 #define DCTSIZE2            64  /* DCTSIZE squared; # of elements in a block */
 #define NUM_QUANT_TBLS      4   /* Quantization tables are numbered 0..3 */
@@ -57,9 +67,9 @@ extern "C" {
  * we strongly discourage changing C_MAX_BLOCKS_IN_MCU; just because Adobe
  * sometimes emits noncompliant files doesn't mean you should too.
  */
-#define C_MAX_BLOCKS_IN_MCU   10 /* compressor's limit on blocks per MCU */
+#define C_MAX_BLOCKS_IN_MCU   10 /* compressor's limit on data units/MCU */
 #ifndef D_MAX_BLOCKS_IN_MCU
-#define D_MAX_BLOCKS_IN_MCU   10 /* decompressor's limit on blocks per MCU */
+#define D_MAX_BLOCKS_IN_MCU   10 /* decompressor's limit on data units/MCU */
 #endif
 
 
@@ -70,6 +80,20 @@ typedef JSAMPLE *JSAMPROW;      /* ptr to one image row of pixel samples. */
 typedef JSAMPROW *JSAMPARRAY;   /* ptr to some rows (a 2-D sample array) */
 typedef JSAMPARRAY *JSAMPIMAGE; /* a 3-D sample array: top index is color */
 
+typedef J12SAMPLE *J12SAMPROW;      /* ptr to one image row of 12-bit pixel
+                                       samples. */
+typedef J12SAMPROW *J12SAMPARRAY;   /* ptr to some 12-bit sample rows (a 2-D
+                                       12-bit sample array) */
+typedef J12SAMPARRAY *J12SAMPIMAGE; /* a 3-D 12-bit sample array: top index is
+                                       color */
+
+typedef J16SAMPLE *J16SAMPROW;      /* ptr to one image row of 16-bit pixel
+                                       samples. */
+typedef J16SAMPROW *J16SAMPARRAY;   /* ptr to some 16-bit sample rows (a 2-D
+                                       16-bit sample array) */
+typedef J16SAMPARRAY *J16SAMPIMAGE; /* a 3-D 16-bit sample array: top index is
+                                       color */
+
 typedef JCOEF JBLOCK[DCTSIZE2]; /* one block of coefficients */
 typedef JBLOCK *JBLOCKROW;      /* pointer to one row of coefficient blocks */
 typedef JBLOCKROW *JBLOCKARRAY;         /* a 2-D array of coefficient blocks */
@@ -135,17 +159,20 @@ typedef struct {
   /* Remaining fields should be treated as private by applications. */
 
   /* These values are computed during compression or decompression startup: */
-  /* Component's size in DCT blocks.
-   * Any dummy blocks added to complete an MCU are not counted; therefore
-   * these values do not depend on whether a scan is interleaved or not.
+  /* Component's size in data units.
+   * In lossy mode, any dummy blocks added to complete an MCU are not counted;
+   * therefore these values do not depend on whether a scan is interleaved or
+   * not.  In lossless mode, these are always equal to the image width and
+   * height.
    */
   JDIMENSION width_in_blocks;
   JDIMENSION height_in_blocks;
-  /* Size of a DCT block in samples.  Always DCTSIZE for compression.
-   * For decompression this is the size of the output from one DCT block,
+  /* Size of a data unit in samples.  Always DCTSIZE for lossy compression.
+   * For lossy decompression this is the size of the output from one DCT block,
    * reflecting any scaling we choose to apply during the IDCT step.
-   * Values from 1 to 16 are supported.
-   * Note that different components may receive different IDCT scalings.
+   * Values from 1 to 16 are supported.  Note that different components may
+   * receive different IDCT scalings.  In lossless mode, this is always equal
+   * to 1.
    */
 #if JPEG_LIB_VERSION >= 70
   int DCT_h_scaled_size;
@@ -156,8 +183,10 @@ typedef struct {
   /* The downsampled dimensions are the component's actual, unpadded number
    * of samples at the main buffer (preprocessing/compression interface), thus
    * downsampled_width = ceil(image_width * Hi/Hmax)
-   * and similarly for height.  For decompression, IDCT scaling is included, so
+   * and similarly for height.  For lossy decompression, IDCT scaling is
+   * included, so
    * downsampled_width = ceil(image_width * Hi/Hmax * DCT_[h_]scaled_size/DCTSIZE)
+   * In lossless mode, these are always equal to the image width and height.
    */
   JDIMENSION downsampled_width;  /* actual width in samples */
   JDIMENSION downsampled_height; /* actual height in samples */
@@ -169,12 +198,12 @@ typedef struct {
 
   /* These values are computed before starting a scan of the component. */
   /* The decompressor output side may not use these variables. */
-  int MCU_width;                /* number of blocks per MCU, horizontally */
-  int MCU_height;               /* number of blocks per MCU, vertically */
+  int MCU_width;                /* number of data units per MCU, horizontally */
+  int MCU_height;               /* number of data units per MCU, vertically */
   int MCU_blocks;               /* MCU_width * MCU_height */
   int MCU_sample_width;         /* MCU width in samples, MCU_width*DCT_[h_]scaled_size */
-  int last_col_width;           /* # of non-dummy blocks across in last MCU */
-  int last_row_height;          /* # of non-dummy blocks down in last MCU */
+  int last_col_width;           /* # of non-dummy data units across in last MCU */
+  int last_row_height;          /* # of non-dummy data units down in last MCU */
 
   /* Saved quantization table for component; NULL if none yet saved.
    * See jdinput.c comments about the need for this information.
@@ -192,8 +221,12 @@ typedef struct {
 typedef struct {
   int comps_in_scan;            /* number of components encoded in this scan */
   int component_index[MAX_COMPS_IN_SCAN]; /* their SOF/comp_info[] indexes */
-  int Ss, Se;                   /* progressive JPEG spectral selection parms */
-  int Ah, Al;                   /* progressive JPEG successive approx. parms */
+  int Ss, Se;                   /* progressive JPEG spectral selection parms
+                                   (Ss is the predictor selection value in
+                                   lossless mode) */
+  int Ah, Al;                   /* progressive JPEG successive approx. parms
+                                   (Al is the point transform value in lossless
+                                   mode) */
 } jpeg_scan_info;
 
 /* The decompressor can save APPn and COM markers in a list of these: */
@@ -238,7 +271,8 @@ typedef enum {
   JCS_EXT_BGRA,           /* blue/green/red/alpha */
   JCS_EXT_ABGR,           /* alpha/blue/green/red */
   JCS_EXT_ARGB,           /* alpha/red/green/blue */
-  JCS_RGB565              /* 5-bit red/6-bit green/5-bit blue */
+  JCS_RGB565              /* 5-bit red/6-bit green/5-bit blue
+                             [decompression only] */
 } J_COLOR_SPACE;
 
 /* DCT/IDCT algorithm options. */
@@ -419,11 +453,13 @@ struct jpeg_compress_struct {
   int min_DCT_v_scaled_size;    /* smallest DCT_v_scaled_size of any component */
 #endif
 
-  JDIMENSION total_iMCU_rows;   /* # of iMCU rows to be input to coef ctlr */
-  /* The coefficient controller receives data in units of MCU rows as defined
-   * for fully interleaved scans (whether the JPEG file is interleaved or not).
-   * There are v_samp_factor * DCTSIZE sample rows of each component in an
-   * "iMCU" (interleaved MCU) row.
+  JDIMENSION total_iMCU_rows;   /* # of iMCU rows to be input to coefficient or
+                                   difference controller */
+  /* The coefficient or difference controller receives data in units of MCU
+   * rows as defined for fully interleaved scans (whether the JPEG file is
+   * interleaved or not).  In lossy mode, there are v_samp_factor * DCTSIZE
+   * sample rows of each component in an "iMCU" (interleaved MCU) row.  In
+   * lossless mode, total_iMCU_rows is always equal to the image height.
    */
 
   /*
@@ -437,12 +473,13 @@ struct jpeg_compress_struct {
   JDIMENSION MCUs_per_row;      /* # of MCUs across the image */
   JDIMENSION MCU_rows_in_scan;  /* # of MCU rows in the image */
 
-  int blocks_in_MCU;            /* # of DCT blocks per MCU */
+  int blocks_in_MCU;            /* # of data units per MCU */
   int MCU_membership[C_MAX_BLOCKS_IN_MCU];
   /* MCU_membership[i] is index in cur_comp_info of component owning */
-  /* i'th block in an MCU */
+  /* i'th data unit in an MCU */
 
-  int Ss, Se, Ah, Al;           /* progressive JPEG parameters for scan */
+  int Ss, Se, Ah, Al;           /* progressive/lossless JPEG parameters for
+                                   scan */
 
 #if JPEG_LIB_VERSION >= 80
   int block_size;               /* the basic DCT block size: 1..16 */
@@ -537,7 +574,12 @@ struct jpeg_decompress_struct {
    * The map has out_color_components rows and actual_number_of_colors columns.
    */
   int actual_number_of_colors;  /* number of entries in use */
-  JSAMPARRAY colormap;          /* The color map as a 2-D pixel array */
+  JSAMPARRAY colormap;          /* The color map as a 2-D pixel array
+                                   If data_precision is 12 or 16, then this is
+                                   actually a J12SAMPARRAY or a J16SAMPARRAY,
+                                   so callers must type-cast it in order to
+                                   read/write 12-bit or 16-bit samples from/to
+                                   the array. */
 
   /* State variables: these variables indicate the progress of decompression.
    * The application may examine these but must not modify them.
@@ -647,15 +689,21 @@ struct jpeg_decompress_struct {
 #endif
 
   JDIMENSION total_iMCU_rows;   /* # of iMCU rows in image */
-  /* The coefficient controller's input and output progress is measured in
-   * units of "iMCU" (interleaved MCU) rows.  These are the same as MCU rows
-   * in fully interleaved JPEG scans, but are used whether the scan is
-   * interleaved or not.  We define an iMCU row as v_samp_factor DCT block
-   * rows of each component.  Therefore, the IDCT output contains
+  /* The coefficient or difference controller's input and output progress is
+   * measured in units of "iMCU" (interleaved MCU) rows.  These are the same as
+   * MCU rows in fully interleaved JPEG scans, but are used whether the scan is
+   * interleaved or not.  In lossy mode, we define an iMCU row as v_samp_factor
+   * DCT block rows of each component.  Therefore, the IDCT output contains
    * v_samp_factor*DCT_[v_]scaled_size sample rows of a component per iMCU row.
+   * In lossless mode, total_iMCU_rows is always equal to the image height.
    */
 
-  JSAMPLE *sample_range_limit;  /* table for fast range-limiting */
+  JSAMPLE *sample_range_limit;  /* table for fast range-limiting
+                                   If data_precision is 12 or 16, then this is
+                                   actually a J12SAMPLE pointer or a J16SAMPLE
+                                   pointer, so callers must type-cast it in
+                                   order to read 12-bit or 16-bit samples from
+                                   the array. */
 
   /*
    * These fields are valid during any one scan.
@@ -669,12 +717,13 @@ struct jpeg_decompress_struct {
   JDIMENSION MCUs_per_row;      /* # of MCUs across the image */
   JDIMENSION MCU_rows_in_scan;  /* # of MCU rows in the image */
 
-  int blocks_in_MCU;            /* # of DCT blocks per MCU */
+  int blocks_in_MCU;            /* # of data units per MCU */
   int MCU_membership[D_MAX_BLOCKS_IN_MCU];
   /* MCU_membership[i] is index in cur_comp_info of component owning */
-  /* i'th block in an MCU */
+  /* i'th data unit in an MCU */
 
-  int Ss, Se, Ah, Al;           /* progressive JPEG parameters for scan */
+  int Ss, Se, Ah, Al;           /* progressive/lossless JPEG parameters for
+                                   scan */
 
 #if JPEG_LIB_VERSION >= 80
   /* These fields are derived from Se of first SOS marker.
@@ -835,6 +884,11 @@ struct jpeg_memory_mgr {
   void *(*alloc_small) (j_common_ptr cinfo, int pool_id, size_t sizeofobject);
   void *(*alloc_large) (j_common_ptr cinfo, int pool_id,
                         size_t sizeofobject);
+  /* If cinfo->data_precision is 12 or 16, then this method and the
+   * access_virt_sarray method actually return a J12SAMPARRAY or a
+   * J16SAMPARRAY, so callers must type-cast the return value in order to
+   * read/write 12-bit or 16-bit samples from/to the array.
+   */
   JSAMPARRAY (*alloc_sarray) (j_common_ptr cinfo, int pool_id,
                               JDIMENSION samplesperrow, JDIMENSION numrows);
   JBLOCKARRAY (*alloc_barray) (j_common_ptr cinfo, int pool_id,
@@ -916,13 +970,11 @@ EXTERN(void) jpeg_destroy_decompress(j_decompress_ptr cinfo);
 EXTERN(void) jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile);
 EXTERN(void) jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile);
 
-#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 /* Data source and destination managers: memory buffers. */
 EXTERN(void) jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
                            unsigned long *outsize);
 EXTERN(void) jpeg_mem_src(j_decompress_ptr cinfo,
                           const unsigned char *inbuffer, unsigned long insize);
-#endif
 
 /* Default parameter setup for compression */
 EXTERN(void) jpeg_set_defaults(j_compress_ptr cinfo);
@@ -942,6 +994,9 @@ EXTERN(void) jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
                                   const unsigned int *basic_table,
                                   int scale_factor, boolean force_baseline);
 EXTERN(int) jpeg_quality_scaling(int quality);
+EXTERN(void) jpeg_enable_lossless(j_compress_ptr cinfo,
+                                  int predictor_selection_value,
+                                  int point_transform);
 EXTERN(void) jpeg_simple_progression(j_compress_ptr cinfo);
 EXTERN(void) jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress);
 EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table(j_common_ptr cinfo);
@@ -953,6 +1008,12 @@ EXTERN(void) jpeg_start_compress(j_compress_ptr cinfo,
 EXTERN(JDIMENSION) jpeg_write_scanlines(j_compress_ptr cinfo,
                                         JSAMPARRAY scanlines,
                                         JDIMENSION num_lines);
+EXTERN(JDIMENSION) jpeg12_write_scanlines(j_compress_ptr cinfo,
+                                          J12SAMPARRAY scanlines,
+                                          JDIMENSION num_lines);
+EXTERN(JDIMENSION) jpeg16_write_scanlines(j_compress_ptr cinfo,
+                                          J16SAMPARRAY scanlines,
+                                          JDIMENSION num_lines);
 EXTERN(void) jpeg_finish_compress(j_compress_ptr cinfo);
 
 #if JPEG_LIB_VERSION >= 70
@@ -963,6 +1024,9 @@ EXTERN(void) jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo);
 /* Replaces jpeg_write_scanlines when writing raw downsampled data. */
 EXTERN(JDIMENSION) jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
                                        JDIMENSION num_lines);
+EXTERN(JDIMENSION) jpeg12_write_raw_data(j_compress_ptr cinfo,
+                                         J12SAMPIMAGE data,
+                                         JDIMENSION num_lines);
 
 /* Write a special marker.  See libjpeg.txt concerning safe usage. */
 EXTERN(void) jpeg_write_marker(j_compress_ptr cinfo, int marker,
@@ -998,15 +1062,28 @@ EXTERN(boolean) jpeg_start_decompress(j_decompress_ptr cinfo);
 EXTERN(JDIMENSION) jpeg_read_scanlines(j_decompress_ptr cinfo,
                                        JSAMPARRAY scanlines,
                                        JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg12_read_scanlines(j_decompress_ptr cinfo,
+                                         J12SAMPARRAY scanlines,
+                                         JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg16_read_scanlines(j_decompress_ptr cinfo,
+                                         J16SAMPARRAY scanlines,
+                                         JDIMENSION max_lines);
 EXTERN(JDIMENSION) jpeg_skip_scanlines(j_decompress_ptr cinfo,
                                        JDIMENSION num_lines);
+EXTERN(JDIMENSION) jpeg12_skip_scanlines(j_decompress_ptr cinfo,
+                                         JDIMENSION num_lines);
 EXTERN(void) jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
                                 JDIMENSION *width);
+EXTERN(void) jpeg12_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                                  JDIMENSION *width);
 EXTERN(boolean) jpeg_finish_decompress(j_decompress_ptr cinfo);
 
 /* Replaces jpeg_read_scanlines when reading raw downsampled data. */
 EXTERN(JDIMENSION) jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
                                       JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg12_read_raw_data(j_decompress_ptr cinfo,
+                                        J12SAMPIMAGE data,
+                                        JDIMENSION max_lines);
 
 /* Additional entry points for buffered-image mode. */
 EXTERN(boolean) jpeg_has_multiple_scans(j_decompress_ptr cinfo);
diff --git a/3rdparty/libjpeg-turbo/src/jquant1.c b/3rdparty/libjpeg-turbo/src/jquant1.c
index 73b83e16e5cc..2e914b919c63 100644
--- a/3rdparty/libjpeg-turbo/src/jquant1.c
+++ b/3rdparty/libjpeg-turbo/src/jquant1.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2015, D. R. Commander.
+ * Copyright (C) 2009, 2015, 2022-2023, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -16,8 +16,9 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsamplecomp.h"
 
-#ifdef QUANT_1PASS_SUPPORTED
+#if defined(QUANT_1PASS_SUPPORTED) && BITS_IN_JSAMPLE != 16
 
 
 /*
@@ -66,7 +67,7 @@
  * worse, since the dither may be too much or too little at a given point.
  *
  * The normal calculation would be to form pixel value + dither, range-limit
- * this to 0..MAXJSAMPLE, and then index into the colorindex table as usual.
+ * this to 0.._MAXJSAMPLE, and then index into the colorindex table as usual.
  * We can skip the separate range-limiting step by extending the colorindex
  * table in both directions.
  */
@@ -144,13 +145,13 @@ typedef struct {
   struct jpeg_color_quantizer pub; /* public fields */
 
   /* Initially allocated colormap is saved here */
-  JSAMPARRAY sv_colormap;       /* The color map as a 2-D pixel array */
+  _JSAMPARRAY sv_colormap;      /* The color map as a 2-D pixel array */
   int sv_actual;                /* number of entries in use */
 
-  JSAMPARRAY colorindex;        /* Precomputed mapping for speed */
+  _JSAMPARRAY colorindex;       /* Precomputed mapping for speed */
   /* colorindex[i][j] = index of color closest to pixel value j in component i,
    * premultiplied as described above.  Since colormap indexes must fit into
-   * JSAMPLEs, the entries of this array will too.
+   * _JSAMPLEs, the entries of this array will too.
    */
   boolean is_padded;            /* is the colorindex padded for odither? */
 
@@ -248,24 +249,24 @@ select_ncolors(j_decompress_ptr cinfo, int Ncolors[])
 LOCAL(int)
 output_value(j_decompress_ptr cinfo, int ci, int j, int maxj)
 /* Return j'th output value, where j will range from 0 to maxj */
-/* The output values must fall in 0..MAXJSAMPLE in increasing order */
+/* The output values must fall in 0.._MAXJSAMPLE in increasing order */
 {
-  /* We always provide values 0 and MAXJSAMPLE for each component;
+  /* We always provide values 0 and _MAXJSAMPLE for each component;
    * any additional values are equally spaced between these limits.
    * (Forcing the upper and lower values to the limits ensures that
    * dithering can't produce a color outside the selected gamut.)
    */
-  return (int)(((JLONG)j * MAXJSAMPLE + maxj / 2) / maxj);
+  return (int)(((JLONG)j * _MAXJSAMPLE + maxj / 2) / maxj);
 }
 
 
 LOCAL(int)
 largest_input_value(j_decompress_ptr cinfo, int ci, int j, int maxj)
 /* Return largest input value that should map to j'th output value */
-/* Must have largest(j=0) >= 0, and largest(j=maxj) >= MAXJSAMPLE */
+/* Must have largest(j=0) >= 0, and largest(j=maxj) >= _MAXJSAMPLE */
 {
   /* Breakpoints are halfway between values returned by output_value */
-  return (int)(((JLONG)(2 * j + 1) * MAXJSAMPLE + maxj) / (2 * maxj));
+  return (int)(((JLONG)(2 * j + 1) * _MAXJSAMPLE + maxj) / (2 * maxj));
 }
 
 
@@ -277,7 +278,7 @@ LOCAL(void)
 create_colormap(j_decompress_ptr cinfo)
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
-  JSAMPARRAY colormap;          /* Created colormap */
+  _JSAMPARRAY colormap;         /* Created colormap */
   int total_colors;             /* Number of distinct output colors */
   int i, j, k, nci, blksize, blkdist, ptr, val;
 
@@ -296,7 +297,7 @@ create_colormap(j_decompress_ptr cinfo)
   /* The colors are ordered in the map in standard row-major order, */
   /* i.e. rightmost (highest-indexed) color changes most rapidly. */
 
-  colormap = (*cinfo->mem->alloc_sarray)
+  colormap = (_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
     ((j_common_ptr)cinfo, JPOOL_IMAGE,
      (JDIMENSION)total_colors, (JDIMENSION)cinfo->out_color_components);
 
@@ -315,7 +316,7 @@ create_colormap(j_decompress_ptr cinfo)
       for (ptr = j * blksize; ptr < total_colors; ptr += blkdist) {
         /* fill in blksize entries beginning at ptr */
         for (k = 0; k < blksize; k++)
-          colormap[i][ptr + k] = (JSAMPLE)val;
+          colormap[i][ptr + k] = (_JSAMPLE)val;
       }
     }
     blkdist = blksize;          /* blksize of this color is blkdist of next */
@@ -337,25 +338,25 @@ LOCAL(void)
 create_colorindex(j_decompress_ptr cinfo)
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
-  JSAMPROW indexptr;
+  _JSAMPROW indexptr;
   int i, j, k, nci, blksize, val, pad;
 
-  /* For ordered dither, we pad the color index tables by MAXJSAMPLE in
-   * each direction (input index values can be -MAXJSAMPLE .. 2*MAXJSAMPLE).
+  /* For ordered dither, we pad the color index tables by _MAXJSAMPLE in
+   * each direction (input index values can be -_MAXJSAMPLE .. 2*_MAXJSAMPLE).
    * This is not necessary in the other dithering modes.  However, we
    * flag whether it was done in case user changes dithering mode.
    */
   if (cinfo->dither_mode == JDITHER_ORDERED) {
-    pad = MAXJSAMPLE * 2;
+    pad = _MAXJSAMPLE * 2;
     cquantize->is_padded = TRUE;
   } else {
     pad = 0;
     cquantize->is_padded = FALSE;
   }
 
-  cquantize->colorindex = (*cinfo->mem->alloc_sarray)
+  cquantize->colorindex = (_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
     ((j_common_ptr)cinfo, JPOOL_IMAGE,
-     (JDIMENSION)(MAXJSAMPLE + 1 + pad),
+     (JDIMENSION)(_MAXJSAMPLE + 1 + pad),
      (JDIMENSION)cinfo->out_color_components);
 
   /* blksize is number of adjacent repeated entries for a component */
@@ -368,24 +369,24 @@ create_colorindex(j_decompress_ptr cinfo)
 
     /* adjust colorindex pointers to provide padding at negative indexes. */
     if (pad)
-      cquantize->colorindex[i] += MAXJSAMPLE;
+      cquantize->colorindex[i] += _MAXJSAMPLE;
 
     /* in loop, val = index of current output value, */
     /* and k = largest j that maps to current val */
     indexptr = cquantize->colorindex[i];
     val = 0;
     k = largest_input_value(cinfo, i, 0, nci - 1);
-    for (j = 0; j <= MAXJSAMPLE; j++) {
+    for (j = 0; j <= _MAXJSAMPLE; j++) {
       while (j > k)             /* advance val if past boundary */
         k = largest_input_value(cinfo, i, ++val, nci - 1);
       /* premultiply so that no multiplication needed in main processing */
-      indexptr[j] = (JSAMPLE)(val * blksize);
+      indexptr[j] = (_JSAMPLE)(val * blksize);
     }
     /* Pad at both ends if necessary */
     if (pad)
-      for (j = 1; j <= MAXJSAMPLE; j++) {
+      for (j = 1; j <= _MAXJSAMPLE; j++) {
         indexptr[-j] = indexptr[0];
-        indexptr[MAXJSAMPLE + j] = indexptr[MAXJSAMPLE];
+        indexptr[_MAXJSAMPLE + j] = indexptr[_MAXJSAMPLE];
       }
   }
 }
@@ -406,16 +407,16 @@ make_odither_array(j_decompress_ptr cinfo, int ncolors)
   odither = (ODITHER_MATRIX_PTR)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(ODITHER_MATRIX));
-  /* The inter-value distance for this color is MAXJSAMPLE/(ncolors-1).
+  /* The inter-value distance for this color is _MAXJSAMPLE/(ncolors-1).
    * Hence the dither value for the matrix cell with fill order f
-   * (f=0..N-1) should be (N-1-2*f)/(2*N) * MAXJSAMPLE/(ncolors-1).
+   * (f=0..N-1) should be (N-1-2*f)/(2*N) * _MAXJSAMPLE/(ncolors-1).
    * On 16-bit-int machine, be careful to avoid overflow.
    */
   den = 2 * ODITHER_CELLS * ((JLONG)(ncolors - 1));
   for (j = 0; j < ODITHER_SIZE; j++) {
     for (k = 0; k < ODITHER_SIZE; k++) {
       num = ((JLONG)(ODITHER_CELLS - 1 -
-                     2 * ((int)base_dither_matrix[j][k]))) * MAXJSAMPLE;
+                     2 * ((int)base_dither_matrix[j][k]))) * _MAXJSAMPLE;
       /* Ensure round towards zero despite C's lack of consistency
        * about rounding negative values in integer division...
        */
@@ -460,14 +461,14 @@ create_odither_tables(j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-color_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-               JSAMPARRAY output_buf, int num_rows)
+color_quantize(j_decompress_ptr cinfo, _JSAMPARRAY input_buf,
+               _JSAMPARRAY output_buf, int num_rows)
 /* General case, no dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
-  JSAMPARRAY colorindex = cquantize->colorindex;
+  _JSAMPARRAY colorindex = cquantize->colorindex;
   register int pixcode, ci;
-  register JSAMPROW ptrin, ptrout;
+  register _JSAMPROW ptrin, ptrout;
   int row;
   JDIMENSION col;
   JDIMENSION width = cinfo->output_width;
@@ -481,23 +482,23 @@ color_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
       for (ci = 0; ci < nc; ci++) {
         pixcode += colorindex[ci][*ptrin++];
       }
-      *ptrout++ = (JSAMPLE)pixcode;
+      *ptrout++ = (_JSAMPLE)pixcode;
     }
   }
 }
 
 
 METHODDEF(void)
-color_quantize3(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                JSAMPARRAY output_buf, int num_rows)
+color_quantize3(j_decompress_ptr cinfo, _JSAMPARRAY input_buf,
+                _JSAMPARRAY output_buf, int num_rows)
 /* Fast path for out_color_components==3, no dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   register int pixcode;
-  register JSAMPROW ptrin, ptrout;
-  JSAMPROW colorindex0 = cquantize->colorindex[0];
-  JSAMPROW colorindex1 = cquantize->colorindex[1];
-  JSAMPROW colorindex2 = cquantize->colorindex[2];
+  register _JSAMPROW ptrin, ptrout;
+  _JSAMPROW colorindex0 = cquantize->colorindex[0];
+  _JSAMPROW colorindex1 = cquantize->colorindex[1];
+  _JSAMPROW colorindex2 = cquantize->colorindex[2];
   int row;
   JDIMENSION col;
   JDIMENSION width = cinfo->output_width;
@@ -509,21 +510,21 @@ color_quantize3(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
       pixcode  = colorindex0[*ptrin++];
       pixcode += colorindex1[*ptrin++];
       pixcode += colorindex2[*ptrin++];
-      *ptrout++ = (JSAMPLE)pixcode;
+      *ptrout++ = (_JSAMPLE)pixcode;
     }
   }
 }
 
 
 METHODDEF(void)
-quantize_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                    JSAMPARRAY output_buf, int num_rows)
+quantize_ord_dither(j_decompress_ptr cinfo, _JSAMPARRAY input_buf,
+                    _JSAMPARRAY output_buf, int num_rows)
 /* General case, with ordered dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
-  register JSAMPROW input_ptr;
-  register JSAMPROW output_ptr;
-  JSAMPROW colorindex_ci;
+  register _JSAMPROW input_ptr;
+  register _JSAMPROW output_ptr;
+  _JSAMPROW colorindex_ci;
   int *dither;                  /* points to active row of dither matrix */
   int row_index, col_index;     /* current indexes into dither matrix */
   int nc = cinfo->out_color_components;
@@ -534,7 +535,7 @@ quantize_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
 
   for (row = 0; row < num_rows; row++) {
     /* Initialize output values to 0 so can process components separately */
-    jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE)));
+    jzero_far((void *)output_buf[row], (size_t)(width * sizeof(_JSAMPLE)));
     row_index = cquantize->row_index;
     for (ci = 0; ci < nc; ci++) {
       input_ptr = input_buf[row] + ci;
@@ -544,11 +545,11 @@ quantize_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
       col_index = 0;
 
       for (col = width; col > 0; col--) {
-        /* Form pixel value + dither, range-limit to 0..MAXJSAMPLE,
+        /* Form pixel value + dither, range-limit to 0.._MAXJSAMPLE,
          * select output value, accumulate into output code for this pixel.
          * Range-limiting need not be done explicitly, as we have extended
          * the colorindex table to produce the right answers for out-of-range
-         * inputs.  The maximum dither is +- MAXJSAMPLE; this sets the
+         * inputs.  The maximum dither is +- _MAXJSAMPLE; this sets the
          * required amount of padding.
          */
         *output_ptr +=
@@ -566,17 +567,17 @@ quantize_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
 
 
 METHODDEF(void)
-quantize3_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                     JSAMPARRAY output_buf, int num_rows)
+quantize3_ord_dither(j_decompress_ptr cinfo, _JSAMPARRAY input_buf,
+                     _JSAMPARRAY output_buf, int num_rows)
 /* Fast path for out_color_components==3, with ordered dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   register int pixcode;
-  register JSAMPROW input_ptr;
-  register JSAMPROW output_ptr;
-  JSAMPROW colorindex0 = cquantize->colorindex[0];
-  JSAMPROW colorindex1 = cquantize->colorindex[1];
-  JSAMPROW colorindex2 = cquantize->colorindex[2];
+  register _JSAMPROW input_ptr;
+  register _JSAMPROW output_ptr;
+  _JSAMPROW colorindex0 = cquantize->colorindex[0];
+  _JSAMPROW colorindex1 = cquantize->colorindex[1];
+  _JSAMPROW colorindex2 = cquantize->colorindex[2];
   int *dither0;                 /* points to active row of dither matrix */
   int *dither1;
   int *dither2;
@@ -598,7 +599,7 @@ quantize3_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
       pixcode  = colorindex0[(*input_ptr++) + dither0[col_index]];
       pixcode += colorindex1[(*input_ptr++) + dither1[col_index]];
       pixcode += colorindex2[(*input_ptr++) + dither2[col_index]];
-      *output_ptr++ = (JSAMPLE)pixcode;
+      *output_ptr++ = (_JSAMPLE)pixcode;
       col_index = (col_index + 1) & ODITHER_MASK;
     }
     row_index = (row_index + 1) & ODITHER_MASK;
@@ -608,8 +609,8 @@ quantize3_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
 
 
 METHODDEF(void)
-quantize_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                   JSAMPARRAY output_buf, int num_rows)
+quantize_fs_dither(j_decompress_ptr cinfo, _JSAMPARRAY input_buf,
+                   _JSAMPARRAY output_buf, int num_rows)
 /* General case, with Floyd-Steinberg dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
@@ -619,10 +620,10 @@ quantize_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
   LOCFSERROR bnexterr;          /* error for below/next col */
   LOCFSERROR delta;
   register FSERRPTR errorptr;   /* => fserrors[] at column before current */
-  register JSAMPROW input_ptr;
-  register JSAMPROW output_ptr;
-  JSAMPROW colorindex_ci;
-  JSAMPROW colormap_ci;
+  register _JSAMPROW input_ptr;
+  register _JSAMPROW output_ptr;
+  _JSAMPROW colorindex_ci;
+  _JSAMPROW colormap_ci;
   int pixcode;
   int nc = cinfo->out_color_components;
   int dir;                      /* 1 for left-to-right, -1 for right-to-left */
@@ -631,12 +632,12 @@ quantize_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
   int row;
   JDIMENSION col;
   JDIMENSION width = cinfo->output_width;
-  JSAMPLE *range_limit = cinfo->sample_range_limit;
+  _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   SHIFT_TEMPS
 
   for (row = 0; row < num_rows; row++) {
     /* Initialize output values to 0 so can process components separately */
-    jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE)));
+    jzero_far((void *)output_buf[row], (size_t)(width * sizeof(_JSAMPLE)));
     for (ci = 0; ci < nc; ci++) {
       input_ptr = input_buf[row] + ci;
       output_ptr = output_buf[row];
@@ -670,15 +671,15 @@ quantize_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
          * Note: errorptr points to *previous* column's array entry.
          */
         cur = RIGHT_SHIFT(cur + errorptr[dir] + 8, 4);
-        /* Form pixel value + error, and range-limit to 0..MAXJSAMPLE.
-         * The maximum error is +- MAXJSAMPLE; this sets the required size
+        /* Form pixel value + error, and range-limit to 0.._MAXJSAMPLE.
+         * The maximum error is +- _MAXJSAMPLE; this sets the required size
          * of the range_limit array.
          */
         cur += *input_ptr;
         cur = range_limit[cur];
         /* Select output value, accumulate into output code for this pixel */
         pixcode = colorindex_ci[cur];
-        *output_ptr += (JSAMPLE)pixcode;
+        *output_ptr += (_JSAMPLE)pixcode;
         /* Compute actual representation error at this pixel */
         /* Note: we can do this even though we don't have the final */
         /* pixel code, because the colormap is orthogonal. */
@@ -745,22 +746,22 @@ start_pass_1_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
   int i;
 
   /* Install my colormap. */
-  cinfo->colormap = cquantize->sv_colormap;
+  cinfo->colormap = (JSAMPARRAY)cquantize->sv_colormap;
   cinfo->actual_number_of_colors = cquantize->sv_actual;
 
   /* Initialize for desired dithering mode. */
   switch (cinfo->dither_mode) {
   case JDITHER_NONE:
     if (cinfo->out_color_components == 3)
-      cquantize->pub.color_quantize = color_quantize3;
+      cquantize->pub._color_quantize = color_quantize3;
     else
-      cquantize->pub.color_quantize = color_quantize;
+      cquantize->pub._color_quantize = color_quantize;
     break;
   case JDITHER_ORDERED:
     if (cinfo->out_color_components == 3)
-      cquantize->pub.color_quantize = quantize3_ord_dither;
+      cquantize->pub._color_quantize = quantize3_ord_dither;
     else
-      cquantize->pub.color_quantize = quantize_ord_dither;
+      cquantize->pub._color_quantize = quantize_ord_dither;
     cquantize->row_index = 0;   /* initialize state for ordered dither */
     /* If user changed to ordered dither from another mode,
      * we must recreate the color index table with padding.
@@ -773,7 +774,7 @@ start_pass_1_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
       create_odither_tables(cinfo);
     break;
   case JDITHER_FS:
-    cquantize->pub.color_quantize = quantize_fs_dither;
+    cquantize->pub._color_quantize = quantize_fs_dither;
     cquantize->on_odd_row = FALSE; /* initialize state for F-S dither */
     /* Allocate Floyd-Steinberg workspace if didn't already. */
     if (cquantize->fserrors[0] == NULL)
@@ -818,10 +819,17 @@ new_color_map_1_quant(j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_1pass_quantizer(j_decompress_ptr cinfo)
+_jinit_1pass_quantizer(j_decompress_ptr cinfo)
 {
   my_cquantize_ptr cquantize;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
+  /* Color quantization is not supported with lossless JPEG images */
+  if (cinfo->master->lossless)
+    ERREXIT(cinfo, JERR_NOTIMPL);
+
   cquantize = (my_cquantize_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_cquantizer));
@@ -835,9 +843,9 @@ jinit_1pass_quantizer(j_decompress_ptr cinfo)
   /* Make sure my internal arrays won't overflow */
   if (cinfo->out_color_components > MAX_Q_COMPS)
     ERREXIT1(cinfo, JERR_QUANT_COMPONENTS, MAX_Q_COMPS);
-  /* Make sure colormap indexes can be represented by JSAMPLEs */
-  if (cinfo->desired_number_of_colors > (MAXJSAMPLE + 1))
-    ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXJSAMPLE + 1);
+  /* Make sure colormap indexes can be represented by _JSAMPLEs */
+  if (cinfo->desired_number_of_colors > (_MAXJSAMPLE + 1))
+    ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, _MAXJSAMPLE + 1);
 
   /* Create the colormap and color index table. */
   create_colormap(cinfo);
@@ -853,4 +861,4 @@ jinit_1pass_quantizer(j_decompress_ptr cinfo)
     alloc_fs_workspace(cinfo);
 }
 
-#endif /* QUANT_1PASS_SUPPORTED */
+#endif /* defined(QUANT_1PASS_SUPPORTED) && BITS_IN_JSAMPLE != 16 */
diff --git a/3rdparty/libjpeg-turbo/src/jquant2.c b/3rdparty/libjpeg-turbo/src/jquant2.c
index 44efb18cadf1..9ba51fa8872b 100644
--- a/3rdparty/libjpeg-turbo/src/jquant2.c
+++ b/3rdparty/libjpeg-turbo/src/jquant2.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2014-2015, 2020, D. R. Commander.
+ * Copyright (C) 2009, 2014-2015, 2020, 2022-2023, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -23,8 +23,9 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsamplecomp.h"
 
-#ifdef QUANT_2PASS_SUPPORTED
+#if defined(QUANT_2PASS_SUPPORTED) && BITS_IN_JSAMPLE != 16
 
 
 /*
@@ -106,7 +107,7 @@ static const int c_scales[3] = { R_SCALE, G_SCALE, B_SCALE };
  * each 2-D array has 2^6*2^5 = 2048 or 2^6*2^6 = 4096 entries.
  */
 
-#define MAXNUMCOLORS  (MAXJSAMPLE + 1) /* maximum size of colormap */
+#define MAXNUMCOLORS  (_MAXJSAMPLE + 1) /* maximum size of colormap */
 
 /* These will do the right thing for either R,G,B or B,G,R color order,
  * but you may not like the results for other color orders.
@@ -173,7 +174,7 @@ typedef struct {
   struct jpeg_color_quantizer pub; /* public fields */
 
   /* Space for the eventually created colormap is stashed here */
-  JSAMPARRAY sv_colormap;       /* colormap allocated at init time */
+  _JSAMPARRAY sv_colormap;      /* colormap allocated at init time */
   int desired;                  /* desired # of colors = size of colormap */
 
   /* Variables for accumulating image statistics */
@@ -200,11 +201,11 @@ typedef my_cquantizer *my_cquantize_ptr;
  */
 
 METHODDEF(void)
-prescan_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                 JSAMPARRAY output_buf, int num_rows)
+prescan_quantize(j_decompress_ptr cinfo, _JSAMPARRAY input_buf,
+                 _JSAMPARRAY output_buf, int num_rows)
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
-  register JSAMPROW ptr;
+  register _JSAMPROW ptr;
   register histptr histp;
   register hist3d histogram = cquantize->histogram;
   int row;
@@ -377,7 +378,7 @@ update_box(j_decompress_ptr cinfo, boxptr boxp)
    * against making long narrow boxes, and it has the side benefit that
    * a box is splittable iff norm > 0.
    * Since the differences are expressed in histogram-cell units,
-   * we have to shift back to JSAMPLE units to get consistent distances;
+   * we have to shift back to _JSAMPLE units to get consistent distances;
    * after which, we scale according to the selected distance scale factors.
    */
   dist0 = ((c0max - c0min) << C0_SHIFT) * C0_SCALE;
@@ -508,9 +509,12 @@ compute_color(j_decompress_ptr cinfo, boxptr boxp, int icolor)
       }
     }
 
-  cinfo->colormap[0][icolor] = (JSAMPLE)((c0total + (total >> 1)) / total);
-  cinfo->colormap[1][icolor] = (JSAMPLE)((c1total + (total >> 1)) / total);
-  cinfo->colormap[2][icolor] = (JSAMPLE)((c2total + (total >> 1)) / total);
+  ((_JSAMPARRAY)cinfo->colormap)[0][icolor] =
+    (_JSAMPLE)((c0total + (total >> 1)) / total);
+  ((_JSAMPARRAY)cinfo->colormap)[1][icolor] =
+    (_JSAMPLE)((c1total + (total >> 1)) / total);
+  ((_JSAMPARRAY)cinfo->colormap)[2][icolor] =
+    (_JSAMPLE)((c2total + (total >> 1)) / total);
 }
 
 
@@ -528,11 +532,11 @@ select_colors(j_decompress_ptr cinfo, int desired_colors)
   /* Initialize one box containing whole space */
   numboxes = 1;
   boxlist[0].c0min = 0;
-  boxlist[0].c0max = MAXJSAMPLE >> C0_SHIFT;
+  boxlist[0].c0max = _MAXJSAMPLE >> C0_SHIFT;
   boxlist[0].c1min = 0;
-  boxlist[0].c1max = MAXJSAMPLE >> C1_SHIFT;
+  boxlist[0].c1max = _MAXJSAMPLE >> C1_SHIFT;
   boxlist[0].c2min = 0;
-  boxlist[0].c2max = MAXJSAMPLE >> C2_SHIFT;
+  boxlist[0].c2max = _MAXJSAMPLE >> C2_SHIFT;
   /* Shrink it to actually-used volume and set its statistics */
   update_box(cinfo, &boxlist[0]);
   /* Perform median-cut to produce final box list */
@@ -623,7 +627,7 @@ select_colors(j_decompress_ptr cinfo, int desired_colors)
 
 LOCAL(int)
 find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
-                   JSAMPLE colorlist[])
+                   _JSAMPLE colorlist[])
 /* Locate the colormap entries close enough to an update box to be candidates
  * for the nearest entry to some cell(s) in the update box.  The update box
  * is specified by the center coordinates of its first cell.  The number of
@@ -665,7 +669,7 @@ find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
 
   for (i = 0; i < numcolors; i++) {
     /* We compute the squared-c0-distance term, then add in the other two. */
-    x = cinfo->colormap[0][i];
+    x = ((_JSAMPARRAY)cinfo->colormap)[0][i];
     if (x < minc0) {
       tdist = (x - minc0) * C0_SCALE;
       min_dist = tdist * tdist;
@@ -688,7 +692,7 @@ find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
       }
     }
 
-    x = cinfo->colormap[1][i];
+    x = ((_JSAMPARRAY)cinfo->colormap)[1][i];
     if (x < minc1) {
       tdist = (x - minc1) * C1_SCALE;
       min_dist += tdist * tdist;
@@ -710,7 +714,7 @@ find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
       }
     }
 
-    x = cinfo->colormap[2][i];
+    x = ((_JSAMPARRAY)cinfo->colormap)[2][i];
     if (x < minc2) {
       tdist = (x - minc2) * C2_SCALE;
       min_dist += tdist * tdist;
@@ -744,7 +748,7 @@ find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
   ncolors = 0;
   for (i = 0; i < numcolors; i++) {
     if (mindist[i] <= minmaxdist)
-      colorlist[ncolors++] = (JSAMPLE)i;
+      colorlist[ncolors++] = (_JSAMPLE)i;
   }
   return ncolors;
 }
@@ -752,7 +756,7 @@ find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
 
 LOCAL(void)
 find_best_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
-                 int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[])
+                 int numcolors, _JSAMPLE colorlist[], _JSAMPLE bestcolor[])
 /* Find the closest colormap entry for each cell in the update box,
  * given the list of candidate colors prepared by find_nearby_colors.
  * Return the indexes of the closest entries in the bestcolor[] array.
@@ -763,7 +767,7 @@ find_best_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
   int ic0, ic1, ic2;
   int i, icolor;
   register JLONG *bptr;         /* pointer into bestdist[] array */
-  JSAMPLE *cptr;                /* pointer into bestcolor[] array */
+  _JSAMPLE *cptr;               /* pointer into bestcolor[] array */
   JLONG dist0, dist1;           /* initial distance values */
   register JLONG dist2;         /* current distance in inner loop */
   JLONG xx0, xx1;               /* distance increments */
@@ -790,11 +794,11 @@ find_best_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
   for (i = 0; i < numcolors; i++) {
     icolor = colorlist[i];
     /* Compute (square of) distance from minc0/c1/c2 to this color */
-    inc0 = (minc0 - cinfo->colormap[0][icolor]) * C0_SCALE;
+    inc0 = (minc0 - ((_JSAMPARRAY)cinfo->colormap)[0][icolor]) * C0_SCALE;
     dist0 = inc0 * inc0;
-    inc1 = (minc1 - cinfo->colormap[1][icolor]) * C1_SCALE;
+    inc1 = (minc1 - ((_JSAMPARRAY)cinfo->colormap)[1][icolor]) * C1_SCALE;
     dist0 += inc1 * inc1;
-    inc2 = (minc2 - cinfo->colormap[2][icolor]) * C2_SCALE;
+    inc2 = (minc2 - ((_JSAMPARRAY)cinfo->colormap)[2][icolor]) * C2_SCALE;
     dist0 += inc2 * inc2;
     /* Form the initial difference increments */
     inc0 = inc0 * (2 * STEP_C0) + STEP_C0 * STEP_C0;
@@ -813,7 +817,7 @@ find_best_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
         for (ic2 = BOX_C2_ELEMS - 1; ic2 >= 0; ic2--) {
           if (dist2 < *bptr) {
             *bptr = dist2;
-            *cptr = (JSAMPLE)icolor;
+            *cptr = (_JSAMPLE)icolor;
           }
           dist2 += xx2;
           xx2 += 2 * STEP_C2 * STEP_C2;
@@ -840,13 +844,13 @@ fill_inverse_cmap(j_decompress_ptr cinfo, int c0, int c1, int c2)
   hist3d histogram = cquantize->histogram;
   int minc0, minc1, minc2;      /* lower left corner of update box */
   int ic0, ic1, ic2;
-  register JSAMPLE *cptr;       /* pointer into bestcolor[] array */
+  register _JSAMPLE *cptr;      /* pointer into bestcolor[] array */
   register histptr cachep;      /* pointer into main cache array */
   /* This array lists the candidate colormap indexes. */
-  JSAMPLE colorlist[MAXNUMCOLORS];
+  _JSAMPLE colorlist[MAXNUMCOLORS];
   int numcolors;                /* number of candidate colors */
   /* This array holds the actually closest colormap index for each cell. */
-  JSAMPLE bestcolor[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS];
+  _JSAMPLE bestcolor[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS];
 
   /* Convert cell coordinates to update box ID */
   c0 >>= BOX_C0_LOG;
@@ -891,13 +895,13 @@ fill_inverse_cmap(j_decompress_ptr cinfo, int c0, int c1, int c2)
  */
 
 METHODDEF(void)
-pass2_no_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                JSAMPARRAY output_buf, int num_rows)
+pass2_no_dither(j_decompress_ptr cinfo, _JSAMPARRAY input_buf,
+                _JSAMPARRAY output_buf, int num_rows)
 /* This version performs no dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
-  register JSAMPROW inptr, outptr;
+  register _JSAMPROW inptr, outptr;
   register histptr cachep;
   register int c0, c1, c2;
   int row;
@@ -918,15 +922,15 @@ pass2_no_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
       if (*cachep == 0)
         fill_inverse_cmap(cinfo, c0, c1, c2);
       /* Now emit the colormap index for this cell */
-      *outptr++ = (JSAMPLE)(*cachep - 1);
+      *outptr++ = (_JSAMPLE)(*cachep - 1);
     }
   }
 }
 
 
 METHODDEF(void)
-pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                JSAMPARRAY output_buf, int num_rows)
+pass2_fs_dither(j_decompress_ptr cinfo, _JSAMPARRAY input_buf,
+                _JSAMPARRAY output_buf, int num_rows)
 /* This version performs Floyd-Steinberg dithering */
 {
   my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
@@ -935,19 +939,19 @@ pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
   LOCFSERROR belowerr0, belowerr1, belowerr2; /* error for pixel below cur */
   LOCFSERROR bpreverr0, bpreverr1, bpreverr2; /* error for below/prev col */
   register FSERRPTR errorptr;   /* => fserrors[] at column before current */
-  JSAMPROW inptr;               /* => current input pixel */
-  JSAMPROW outptr;              /* => current output pixel */
+  _JSAMPROW inptr;              /* => current input pixel */
+  _JSAMPROW outptr;             /* => current output pixel */
   histptr cachep;
   int dir;                      /* +1 or -1 depending on direction */
   int dir3;                     /* 3*dir, for advancing inptr & errorptr */
   int row;
   JDIMENSION col;
   JDIMENSION width = cinfo->output_width;
-  JSAMPLE *range_limit = cinfo->sample_range_limit;
+  _JSAMPLE *range_limit = (_JSAMPLE *)cinfo->sample_range_limit;
   int *error_limit = cquantize->error_limiter;
-  JSAMPROW colormap0 = cinfo->colormap[0];
-  JSAMPROW colormap1 = cinfo->colormap[1];
-  JSAMPROW colormap2 = cinfo->colormap[2];
+  _JSAMPROW colormap0 = ((_JSAMPARRAY)cinfo->colormap)[0];
+  _JSAMPROW colormap1 = ((_JSAMPARRAY)cinfo->colormap)[1];
+  _JSAMPROW colormap2 = ((_JSAMPARRAY)cinfo->colormap)[2];
   SHIFT_TEMPS
 
   for (row = 0; row < num_rows; row++) {
@@ -992,8 +996,8 @@ pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
       cur0 = error_limit[cur0];
       cur1 = error_limit[cur1];
       cur2 = error_limit[cur2];
-      /* Form pixel value + error, and range-limit to 0..MAXJSAMPLE.
-       * The maximum error is +- MAXJSAMPLE (or less with error limiting);
+      /* Form pixel value + error, and range-limit to 0.._MAXJSAMPLE.
+       * The maximum error is +- _MAXJSAMPLE (or less with error limiting);
        * this sets the required size of the range_limit array.
        */
       cur0 += inptr[0];
@@ -1013,7 +1017,7 @@ pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
       /* Now emit the colormap index for this cell */
       {
         register int pixcode = *cachep - 1;
-        *outptr = (JSAMPLE)pixcode;
+        *outptr = (_JSAMPLE)pixcode;
         /* Compute representation error for this pixel */
         cur0 -= colormap0[pixcode];
         cur1 -= colormap1[pixcode];
@@ -1064,7 +1068,7 @@ pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
 /*
  * Initialize the error-limiting transfer function (lookup table).
  * The raw F-S error computation can potentially compute error values of up to
- * +- MAXJSAMPLE.  But we want the maximum correction applied to a pixel to be
+ * +- _MAXJSAMPLE.  But we want the maximum correction applied to a pixel to be
  * much less, otherwise obviously wrong pixels will be created.  (Typical
  * effects include weird fringes at color-area boundaries, isolated bright
  * pixels in a dark area, etc.)  The standard advice for avoiding this problem
@@ -1073,7 +1077,7 @@ pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
  * error buildup.  However, that only prevents the error from getting
  * completely out of hand; Aaron Giles reports that error limiting improves
  * the results even with corner colors allocated.
- * A simple clamping of the error values to about +- MAXJSAMPLE/8 works pretty
+ * A simple clamping of the error values to about +- _MAXJSAMPLE/8 works pretty
  * well, but the smoother transfer function used below is even better.  Thanks
  * to Aaron Giles for this idea.
  */
@@ -1087,22 +1091,22 @@ init_error_limit(j_decompress_ptr cinfo)
   int in, out;
 
   table = (int *)(*cinfo->mem->alloc_small)
-    ((j_common_ptr)cinfo, JPOOL_IMAGE, (MAXJSAMPLE * 2 + 1) * sizeof(int));
-  table += MAXJSAMPLE;          /* so can index -MAXJSAMPLE .. +MAXJSAMPLE */
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, (_MAXJSAMPLE * 2 + 1) * sizeof(int));
+  table += _MAXJSAMPLE;         /* so can index -_MAXJSAMPLE .. +_MAXJSAMPLE */
   cquantize->error_limiter = table;
 
-#define STEPSIZE  ((MAXJSAMPLE + 1) / 16)
-  /* Map errors 1:1 up to +- MAXJSAMPLE/16 */
+#define STEPSIZE  ((_MAXJSAMPLE + 1) / 16)
+  /* Map errors 1:1 up to +- _MAXJSAMPLE/16 */
   out = 0;
   for (in = 0; in < STEPSIZE; in++, out++) {
     table[in] = out;  table[-in] = -out;
   }
-  /* Map errors 1:2 up to +- 3*MAXJSAMPLE/16 */
+  /* Map errors 1:2 up to +- 3*_MAXJSAMPLE/16 */
   for (; in < STEPSIZE * 3; in++, out += (in & 1) ? 0 : 1) {
     table[in] = out;  table[-in] = -out;
   }
-  /* Clamp the rest to final out value (which is (MAXJSAMPLE+1)/8) */
-  for (; in <= MAXJSAMPLE; in++) {
+  /* Clamp the rest to final out value (which is (_MAXJSAMPLE+1)/8) */
+  for (; in <= _MAXJSAMPLE; in++) {
     table[in] = out;  table[-in] = -out;
   }
 #undef STEPSIZE
@@ -1119,7 +1123,7 @@ finish_pass1(j_decompress_ptr cinfo)
   my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
 
   /* Select the representative colors and fill in cinfo->colormap */
-  cinfo->colormap = cquantize->sv_colormap;
+  cinfo->colormap = (JSAMPARRAY)cquantize->sv_colormap;
   select_colors(cinfo, cquantize->desired);
   /* Force next pass to zero the color index table */
   cquantize->needs_zeroed = TRUE;
@@ -1151,15 +1155,15 @@ start_pass_2_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
 
   if (is_pre_scan) {
     /* Set up method pointers */
-    cquantize->pub.color_quantize = prescan_quantize;
+    cquantize->pub._color_quantize = prescan_quantize;
     cquantize->pub.finish_pass = finish_pass1;
     cquantize->needs_zeroed = TRUE; /* Always zero histogram */
   } else {
     /* Set up method pointers */
     if (cinfo->dither_mode == JDITHER_FS)
-      cquantize->pub.color_quantize = pass2_fs_dither;
+      cquantize->pub._color_quantize = pass2_fs_dither;
     else
-      cquantize->pub.color_quantize = pass2_no_dither;
+      cquantize->pub._color_quantize = pass2_no_dither;
     cquantize->pub.finish_pass = finish_pass2;
 
     /* Make sure color count is acceptable */
@@ -1215,11 +1219,14 @@ new_color_map_2_quant(j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_2pass_quantizer(j_decompress_ptr cinfo)
+_jinit_2pass_quantizer(j_decompress_ptr cinfo)
 {
   my_cquantize_ptr cquantize;
   int i;
 
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
   cquantize = (my_cquantize_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_cquantizer));
@@ -1230,7 +1237,8 @@ jinit_2pass_quantizer(j_decompress_ptr cinfo)
   cquantize->error_limiter = NULL;
 
   /* Make sure jdmaster didn't give me a case I can't handle */
-  if (cinfo->out_color_components != 3)
+  if (cinfo->out_color_components != 3 ||
+      cinfo->out_color_space == JCS_RGB565 || cinfo->master->lossless)
     ERREXIT(cinfo, JERR_NOTIMPL);
 
   /* Allocate the histogram/inverse colormap storage */
@@ -1253,10 +1261,10 @@ jinit_2pass_quantizer(j_decompress_ptr cinfo)
     /* Lower bound on # of colors ... somewhat arbitrary as long as > 0 */
     if (desired < 8)
       ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, 8);
-    /* Make sure colormap indexes can be represented by JSAMPLEs */
+    /* Make sure colormap indexes can be represented by _JSAMPLEs */
     if (desired > MAXNUMCOLORS)
       ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
-    cquantize->sv_colormap = (*cinfo->mem->alloc_sarray)
+    cquantize->sv_colormap = (_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
       ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)desired, (JDIMENSION)3);
     cquantize->desired = desired;
   } else
@@ -1282,4 +1290,4 @@ jinit_2pass_quantizer(j_decompress_ptr cinfo)
   }
 }
 
-#endif /* QUANT_2PASS_SUPPORTED */
+#endif /* defined(QUANT_2PASS_SUPPORTED) && BITS_IN_JSAMPLE != 16 */
diff --git a/3rdparty/libjpeg-turbo/src/jsamplecomp.h b/3rdparty/libjpeg-turbo/src/jsamplecomp.h
new file mode 100644
index 000000000000..f3f275e6e29d
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/jsamplecomp.h
@@ -0,0 +1,336 @@
+/*
+ * jsamplecomp.h
+ *
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* In source files that must be compiled for multiple data precisions, we
+ * prefix all precision-dependent data types, macros, methods, fields, and
+ * function names with an underscore.  Including this file replaces those
+ * precision-independent tokens with their precision-dependent equivalents,
+ * based on the value of BITS_IN_JSAMPLE.
+ */
+
+#ifndef JSAMPLECOMP_H
+#define JSAMPLECOMP_H
+
+#if BITS_IN_JSAMPLE == 16
+
+/* Sample data types and macros (jmorecfg.h) */
+#define _JSAMPLE  J16SAMPLE
+
+#define _MAXJSAMPLE  MAXJ16SAMPLE
+#define _CENTERJSAMPLE   CENTERJ16SAMPLE
+
+#define _JSAMPROW  J16SAMPROW
+#define _JSAMPARRAY  J16SAMPARRAY
+#define _JSAMPIMAGE  J16SAMPIMAGE
+
+/* External functions (jpeglib.h) */
+#define _jpeg_write_scanlines  jpeg16_write_scanlines
+#define _jpeg_read_scanlines  jpeg16_read_scanlines
+
+/* Internal methods (jpegint.h) */
+
+#ifdef C_LOSSLESS_SUPPORTED
+/* Use the 16-bit method in the jpeg_c_main_controller structure. */
+#define _process_data  process_data_16
+/* Use the 16-bit method in the jpeg_c_prep_controller structure. */
+#define _pre_process_data  pre_process_data_16
+/* Use the 16-bit method in the jpeg_c_coef_controller structure. */
+#define _compress_data  compress_data_16
+/* Use the 16-bit method in the jpeg_color_converter structure. */
+#define _color_convert  color_convert_16
+/* Use the 16-bit method in the jpeg_downsampler structure. */
+#define _downsample  downsample_16
+#endif
+#ifdef D_LOSSLESS_SUPPORTED
+/* Use the 16-bit method in the jpeg_d_main_controller structure. */
+#define _process_data  process_data_16
+/* Use the 16-bit method in the jpeg_d_coef_controller structure. */
+#define _decompress_data  decompress_data_16
+/* Use the 16-bit method in the jpeg_d_post_controller structure. */
+#define _post_process_data  post_process_data_16
+/* Use the 16-bit method in the jpeg_upsampler structure. */
+#define _upsample  upsample_16
+/* Use the 16-bit method in the jpeg_color_converter structure. */
+#define _color_convert  color_convert_16
+#endif
+
+/* Global internal functions (jpegint.h) */
+#ifdef C_LOSSLESS_SUPPORTED
+#define _jinit_c_main_controller  j16init_c_main_controller
+#define _jinit_c_prep_controller  j16init_c_prep_controller
+#define _jinit_color_converter  j16init_color_converter
+#define _jinit_downsampler  j16init_downsampler
+#define _jinit_c_diff_controller  j16init_c_diff_controller
+#define _jinit_lossless_compressor  j16init_lossless_compressor
+#endif
+
+#ifdef D_LOSSLESS_SUPPORTED
+#define _jinit_d_main_controller  j16init_d_main_controller
+#define _jinit_d_post_controller  j16init_d_post_controller
+#define _jinit_upsampler  j16init_upsampler
+#define _jinit_color_deconverter  j16init_color_deconverter
+#define _jinit_merged_upsampler  j16init_merged_upsampler
+#define _jinit_d_diff_controller  j16init_d_diff_controller
+#define _jinit_lossless_decompressor  j16init_lossless_decompressor
+#endif
+
+#if defined(C_LOSSLESS_SUPPORTED) || defined(D_LOSSLESS_SUPPORTED)
+#define _jcopy_sample_rows  j16copy_sample_rows
+#endif
+
+/* Internal fields (cdjpeg.h) */
+
+#if defined(C_LOSSLESS_SUPPORTED) || defined(D_LOSSLESS_SUPPORTED)
+/* Use the 16-bit buffer in the cjpeg_source_struct and djpeg_dest_struct
+   structures. */
+#define _buffer  buffer16
+#endif
+
+/* Image I/O functions (cdjpeg.h) */
+#ifdef C_LOSSLESS_SUPPORTED
+#define _jinit_read_gif  j16init_read_gif
+#define _jinit_read_ppm  j16init_read_ppm
+#endif
+
+#ifdef D_LOSSLESS_SUPPORTED
+#define _jinit_write_ppm  j16init_write_ppm
+#endif
+
+#elif BITS_IN_JSAMPLE == 12
+
+/* Sample data types and macros (jmorecfg.h) */
+#define _JSAMPLE  J12SAMPLE
+
+#define _MAXJSAMPLE  MAXJ12SAMPLE
+#define _CENTERJSAMPLE   CENTERJ12SAMPLE
+
+#define _JSAMPROW  J12SAMPROW
+#define _JSAMPARRAY  J12SAMPARRAY
+#define _JSAMPIMAGE  J12SAMPIMAGE
+
+/* External functions (jpeglib.h) */
+#define _jpeg_write_scanlines  jpeg12_write_scanlines
+#define _jpeg_write_raw_data  jpeg12_write_raw_data
+#define _jpeg_read_scanlines  jpeg12_read_scanlines
+#define _jpeg_skip_scanlines  jpeg12_skip_scanlines
+#define _jpeg_crop_scanline  jpeg12_crop_scanline
+#define _jpeg_read_raw_data  jpeg12_read_raw_data
+
+/* Internal methods (jpegint.h) */
+
+/* Use the 12-bit method in the jpeg_c_main_controller structure. */
+#define _process_data  process_data_12
+/* Use the 12-bit method in the jpeg_c_prep_controller structure. */
+#define _pre_process_data  pre_process_data_12
+/* Use the 12-bit method in the jpeg_c_coef_controller structure. */
+#define _compress_data  compress_data_12
+/* Use the 12-bit method in the jpeg_color_converter structure. */
+#define _color_convert  color_convert_12
+/* Use the 12-bit method in the jpeg_downsampler structure. */
+#define _downsample  downsample_12
+/* Use the 12-bit method in the jpeg_forward_dct structure. */
+#define _forward_DCT  forward_DCT_12
+/* Use the 12-bit method in the jpeg_d_main_controller structure. */
+#define _process_data  process_data_12
+/* Use the 12-bit method in the jpeg_d_coef_controller structure. */
+#define _decompress_data  decompress_data_12
+/* Use the 12-bit method in the jpeg_d_post_controller structure. */
+#define _post_process_data  post_process_data_12
+/* Use the 12-bit method in the jpeg_inverse_dct structure. */
+#define _inverse_DCT_method_ptr  inverse_DCT_12_method_ptr
+#define _inverse_DCT  inverse_DCT_12
+/* Use the 12-bit method in the jpeg_upsampler structure. */
+#define _upsample  upsample_12
+/* Use the 12-bit method in the jpeg_color_converter structure. */
+#define _color_convert  color_convert_12
+/* Use the 12-bit method in the jpeg_color_quantizer structure. */
+#define _color_quantize  color_quantize_12
+
+/* Global internal functions (jpegint.h) */
+#define _jinit_c_main_controller  j12init_c_main_controller
+#define _jinit_c_prep_controller  j12init_c_prep_controller
+#define _jinit_c_coef_controller  j12init_c_coef_controller
+#define _jinit_color_converter  j12init_color_converter
+#define _jinit_downsampler  j12init_downsampler
+#define _jinit_forward_dct  j12init_forward_dct
+#ifdef C_LOSSLESS_SUPPORTED
+#define _jinit_c_diff_controller  j12init_c_diff_controller
+#define _jinit_lossless_compressor  j12init_lossless_compressor
+#endif
+
+#define _jinit_d_main_controller  j12init_d_main_controller
+#define _jinit_d_coef_controller  j12init_d_coef_controller
+#define _jinit_d_post_controller  j12init_d_post_controller
+#define _jinit_inverse_dct  j12init_inverse_dct
+#define _jinit_upsampler  j12init_upsampler
+#define _jinit_color_deconverter  j12init_color_deconverter
+#define _jinit_1pass_quantizer  j12init_1pass_quantizer
+#define _jinit_2pass_quantizer  j12init_2pass_quantizer
+#define _jinit_merged_upsampler  j12init_merged_upsampler
+#ifdef D_LOSSLESS_SUPPORTED
+#define _jinit_d_diff_controller  j12init_d_diff_controller
+#define _jinit_lossless_decompressor  j12init_lossless_decompressor
+#endif
+
+#define _jcopy_sample_rows  j12copy_sample_rows
+
+/* Global internal functions (jdct.h) */
+#define _jpeg_fdct_islow  jpeg12_fdct_islow
+#define _jpeg_fdct_ifast  jpeg12_fdct_ifast
+
+#define _jpeg_idct_islow  jpeg12_idct_islow
+#define _jpeg_idct_ifast  jpeg12_idct_ifast
+#define _jpeg_idct_float  jpeg12_idct_float
+#define _jpeg_idct_7x7  jpeg12_idct_7x7
+#define _jpeg_idct_6x6  jpeg12_idct_6x6
+#define _jpeg_idct_5x5  jpeg12_idct_5x5
+#define _jpeg_idct_4x4  jpeg12_idct_4x4
+#define _jpeg_idct_3x3  jpeg12_idct_3x3
+#define _jpeg_idct_2x2  jpeg12_idct_2x2
+#define _jpeg_idct_1x1  jpeg12_idct_1x1
+#define _jpeg_idct_9x9  jpeg12_idct_9x9
+#define _jpeg_idct_10x10  jpeg12_idct_10x10
+#define _jpeg_idct_11x11  jpeg12_idct_11x11
+#define _jpeg_idct_12x12  jpeg12_idct_12x12
+#define _jpeg_idct_13x13  jpeg12_idct_13x13
+#define _jpeg_idct_14x14  jpeg12_idct_14x14
+#define _jpeg_idct_15x15  jpeg12_idct_15x15
+#define _jpeg_idct_16x16  jpeg12_idct_16x16
+
+/* Internal fields (cdjpeg.h) */
+
+/* Use the 12-bit buffer in the cjpeg_source_struct and djpeg_dest_struct
+   structures. */
+#define _buffer  buffer12
+
+/* Image I/O functions (cdjpeg.h) */
+#define _jinit_read_gif  j12init_read_gif
+#define _jinit_write_gif  j12init_write_gif
+#define _jinit_read_ppm  j12init_read_ppm
+#define _jinit_write_ppm  j12init_write_ppm
+
+#define _read_color_map  read_color_map_12
+
+#else /* BITS_IN_JSAMPLE */
+
+/* Sample data types and macros (jmorecfg.h) */
+#define _JSAMPLE  JSAMPLE
+
+#define _MAXJSAMPLE  MAXJSAMPLE
+#define _CENTERJSAMPLE   CENTERJSAMPLE
+
+#define _JSAMPROW  JSAMPROW
+#define _JSAMPARRAY  JSAMPARRAY
+#define _JSAMPIMAGE  JSAMPIMAGE
+
+/* External functions (jpeglib.h) */
+#define _jpeg_write_scanlines  jpeg_write_scanlines
+#define _jpeg_write_raw_data  jpeg_write_raw_data
+#define _jpeg_read_scanlines  jpeg_read_scanlines
+#define _jpeg_skip_scanlines  jpeg_skip_scanlines
+#define _jpeg_crop_scanline  jpeg_crop_scanline
+#define _jpeg_read_raw_data  jpeg_read_raw_data
+
+/* Internal methods (jpegint.h) */
+
+/* Use the 8-bit method in the jpeg_c_main_controller structure. */
+#define _process_data  process_data
+/* Use the 8-bit method in the jpeg_c_prep_controller structure. */
+#define _pre_process_data  pre_process_data
+/* Use the 8-bit method in the jpeg_c_coef_controller structure. */
+#define _compress_data  compress_data
+/* Use the 8-bit method in the jpeg_color_converter structure. */
+#define _color_convert  color_convert
+/* Use the 8-bit method in the jpeg_downsampler structure. */
+#define _downsample  downsample
+/* Use the 8-bit method in the jpeg_forward_dct structure. */
+#define _forward_DCT  forward_DCT
+/* Use the 8-bit method in the jpeg_d_main_controller structure. */
+#define _process_data  process_data
+/* Use the 8-bit method in the jpeg_d_coef_controller structure. */
+#define _decompress_data  decompress_data
+/* Use the 8-bit method in the jpeg_d_post_controller structure. */
+#define _post_process_data  post_process_data
+/* Use the 8-bit method in the jpeg_inverse_dct structure. */
+#define _inverse_DCT_method_ptr  inverse_DCT_method_ptr
+#define _inverse_DCT  inverse_DCT
+/* Use the 8-bit method in the jpeg_upsampler structure. */
+#define _upsample  upsample
+/* Use the 8-bit method in the jpeg_color_converter structure. */
+#define _color_convert  color_convert
+/* Use the 8-bit method in the jpeg_color_quantizer structure. */
+#define _color_quantize  color_quantize
+
+/* Global internal functions (jpegint.h) */
+#define _jinit_c_main_controller  jinit_c_main_controller
+#define _jinit_c_prep_controller  jinit_c_prep_controller
+#define _jinit_c_coef_controller  jinit_c_coef_controller
+#define _jinit_color_converter  jinit_color_converter
+#define _jinit_downsampler  jinit_downsampler
+#define _jinit_forward_dct  jinit_forward_dct
+#ifdef C_LOSSLESS_SUPPORTED
+#define _jinit_c_diff_controller  jinit_c_diff_controller
+#define _jinit_lossless_compressor  jinit_lossless_compressor
+#endif
+
+#define _jinit_d_main_controller  jinit_d_main_controller
+#define _jinit_d_coef_controller  jinit_d_coef_controller
+#define _jinit_d_post_controller  jinit_d_post_controller
+#define _jinit_inverse_dct  jinit_inverse_dct
+#define _jinit_upsampler  jinit_upsampler
+#define _jinit_color_deconverter  jinit_color_deconverter
+#define _jinit_1pass_quantizer  jinit_1pass_quantizer
+#define _jinit_2pass_quantizer  jinit_2pass_quantizer
+#define _jinit_merged_upsampler  jinit_merged_upsampler
+#ifdef D_LOSSLESS_SUPPORTED
+#define _jinit_d_diff_controller  jinit_d_diff_controller
+#define _jinit_lossless_decompressor  jinit_lossless_decompressor
+#endif
+
+#define _jcopy_sample_rows  jcopy_sample_rows
+
+/* Global internal functions (jdct.h) */
+#define _jpeg_fdct_islow  jpeg_fdct_islow
+#define _jpeg_fdct_ifast  jpeg_fdct_ifast
+
+#define _jpeg_idct_islow  jpeg_idct_islow
+#define _jpeg_idct_ifast  jpeg_idct_ifast
+#define _jpeg_idct_float  jpeg_idct_float
+#define _jpeg_idct_7x7  jpeg_idct_7x7
+#define _jpeg_idct_6x6  jpeg_idct_6x6
+#define _jpeg_idct_5x5  jpeg_idct_5x5
+#define _jpeg_idct_4x4  jpeg_idct_4x4
+#define _jpeg_idct_3x3  jpeg_idct_3x3
+#define _jpeg_idct_2x2  jpeg_idct_2x2
+#define _jpeg_idct_1x1  jpeg_idct_1x1
+#define _jpeg_idct_9x9  jpeg_idct_9x9
+#define _jpeg_idct_10x10  jpeg_idct_10x10
+#define _jpeg_idct_11x11  jpeg_idct_11x11
+#define _jpeg_idct_12x12  jpeg_idct_12x12
+#define _jpeg_idct_13x13  jpeg_idct_13x13
+#define _jpeg_idct_14x14  jpeg_idct_14x14
+#define _jpeg_idct_15x15  jpeg_idct_15x15
+#define _jpeg_idct_16x16  jpeg_idct_16x16
+
+/* Internal fields (cdjpeg.h) */
+
+/* Use the 8-bit buffer in the cjpeg_source_struct and djpeg_dest_struct
+   structures. */
+#define _buffer  buffer
+
+/* Image I/O functions (cdjpeg.h) */
+#define _jinit_read_gif  jinit_read_gif
+#define _jinit_write_gif  jinit_write_gif
+#define _jinit_read_ppm  jinit_read_ppm
+#define _jinit_write_ppm  jinit_write_ppm
+
+#define _read_color_map  read_color_map
+
+#endif /* BITS_IN_JSAMPLE */
+
+#endif /* JSAMPLECOMP_H */
diff --git a/3rdparty/libjpeg-turbo/src/jsimd.h b/3rdparty/libjpeg-turbo/src/jsimd.h
index 6c203655ef84..6ae021a651df 100644
--- a/3rdparty/libjpeg-turbo/src/jsimd.h
+++ b/3rdparty/libjpeg-turbo/src/jsimd.h
@@ -2,8 +2,8 @@
  * jsimd.h
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, 2014, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2011, 2014, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
@@ -12,6 +12,8 @@
  *
  */
 
+#ifdef WITH_SIMD
+
 #include "jchuff.h"             /* Declarations shared with jcphuff.c */
 
 EXTERN(int) jsimd_can_rgb_ycc(void);
@@ -114,10 +116,12 @@ EXTERN(int) jsimd_can_encode_mcu_AC_first_prepare(void);
 
 EXTERN(void) jsimd_encode_mcu_AC_first_prepare
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *values, size_t *zerobits);
+   UJCOEF *values, size_t *zerobits);
 
 EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void);
 
 EXTERN(int) jsimd_encode_mcu_AC_refine_prepare
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *absvalues, size_t *bits);
+   UJCOEF *absvalues, size_t *bits);
+
+#endif /* WITH_SIMD */
diff --git a/3rdparty/libjpeg-turbo/src/jsimd_none.c b/3rdparty/libjpeg-turbo/src/jsimd_none.c
deleted file mode 100644
index 5b38a9fb5c99..000000000000
--- a/3rdparty/libjpeg-turbo/src/jsimd_none.c
+++ /dev/null
@@ -1,431 +0,0 @@
-/*
- * jsimd_none.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
- * Copyright (C) 2020, Arm Limited.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains stubs for when there is no SIMD support available.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jsimd.h"
-#include "jdct.h"
-#include "jsimddct.h"
-
-GLOBAL(int)
-jsimd_can_rgb_ycc(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_c_can_null_convert(void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
-                      JSAMPIMAGE output_buf, JDIMENSION output_row,
-                      int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
-                       JSAMPIMAGE output_buf, JDIMENSION output_row,
-                       int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                      JDIMENSION input_row, JSAMPARRAY output_buf,
-                      int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                         JDIMENSION input_row, JSAMPARRAY output_buf,
-                         int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
-                     JSAMPIMAGE output_buf, JDIMENSION output_row,
-                     int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_smooth_downsample(void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
-                      JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
-                             jpeg_component_info *compptr,
-                             JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
-                      JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_int_upsample(void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h1v2_fancy_upsample(void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample(void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(int)
-jsimd_can_convsamp(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float(void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
-               DCTELEM *workspace)
-{
-}
-
-GLOBAL(void)
-jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
-                     FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float(void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow(DCTELEM *data)
-{
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast(DCTELEM *data)
-{
-}
-
-GLOBAL(void)
-jsimd_fdct_float(FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float(void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
-{
-}
-
-GLOBAL(void)
-jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                     FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_6x6(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_12x12(void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block, JSAMPARRAY output_buf,
-               JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block, JSAMPARRAY output_buf,
-               JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block, JSAMPARRAY output_buf,
-               JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                 JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float(void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                 JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                 JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                 JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block(void)
-{
-  return 0;
-}
-
-GLOBAL(JOCTET *)
-jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
-                            int last_dc_val, c_derived_tbl *dctbl,
-                            c_derived_tbl *actbl)
-{
-  return NULL;
-}
-
-GLOBAL(int)
-jsimd_can_encode_mcu_AC_first_prepare(void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
-                                  const int *jpeg_natural_order_start, int Sl,
-                                  int Al, JCOEF *values, size_t *zerobits)
-{
-}
-
-GLOBAL(int)
-jsimd_can_encode_mcu_AC_refine_prepare(void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
-                                   const int *jpeg_natural_order_start, int Sl,
-                                   int Al, JCOEF *absvalues, size_t *bits)
-{
-  return 0;
-}
diff --git a/3rdparty/libjpeg-turbo/src/jutils.c b/3rdparty/libjpeg-turbo/src/jutils.c
index d86271624a66..24caac19021d 100644
--- a/3rdparty/libjpeg-turbo/src/jutils.c
+++ b/3rdparty/libjpeg-turbo/src/jutils.c
@@ -17,8 +17,11 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jsamplecomp.h"
 
 
+#if BITS_IN_JSAMPLE == 8
+
 /*
  * jpeg_zigzag_order[i] is the zigzag-order position of the i'th element
  * of a DCT block read in natural order (left to right, top to bottom).
@@ -89,19 +92,24 @@ jround_up(long a, long b)
   return a - (a % b);
 }
 
+#endif /* BITS_IN_JSAMPLE == 8 */
+
+
+#if BITS_IN_JSAMPLE != 16 || \
+    defined(C_LOSSLESS_SUPPORTED) || defined(D_LOSSLESS_SUPPORTED)
 
 GLOBAL(void)
-jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
-                  JSAMPARRAY output_array, int dest_row, int num_rows,
-                  JDIMENSION num_cols)
+_jcopy_sample_rows(_JSAMPARRAY input_array, int source_row,
+                   _JSAMPARRAY output_array, int dest_row, int num_rows,
+                   JDIMENSION num_cols)
 /* Copy some rows of samples from one place to another.
  * num_rows rows are copied from input_array[source_row++]
  * to output_array[dest_row++]; these areas may overlap for duplication.
  * The source and destination arrays must be at least as wide as num_cols.
  */
 {
-  register JSAMPROW inptr, outptr;
-  register size_t count = (size_t)(num_cols * sizeof(JSAMPLE));
+  register _JSAMPROW inptr, outptr;
+  register size_t count = (size_t)(num_cols * sizeof(_JSAMPLE));
   register int row;
 
   input_array += source_row;
@@ -114,6 +122,11 @@ jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
   }
 }
 
+#endif /* BITS_IN_JSAMPLE != 16 ||
+          defined(C_LOSSLESS_SUPPORTED) || defined(D_LOSSLESS_SUPPORTED) */
+
+
+#if BITS_IN_JSAMPLE == 8
 
 GLOBAL(void)
 jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
@@ -131,3 +144,5 @@ jzero_far(void *target, size_t bytestozero)
 {
   memset(target, 0, bytestozero);
 }
+
+#endif /* BITS_IN_JSAMPLE == 8 */
diff --git a/3rdparty/libjpeg-turbo/src/jversion.h.in b/3rdparty/libjpeg-turbo/src/jversion.h.in
index dca4f08fdb4c..fc0ce3e09e3b 100644
--- a/3rdparty/libjpeg-turbo/src/jversion.h.in
+++ b/3rdparty/libjpeg-turbo/src/jversion.h.in
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2012-2022, D. R. Commander.
+ * Copyright (C) 2010, 2012-2024, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -36,19 +36,21 @@
  *   their code
  */
 
-#define JCOPYRIGHT \
-  "Copyright (C) 2009-2022 D. R. Commander\n" \
+#define JCOPYRIGHT1 \
+  "Copyright (C) 2009-2024 D. R. Commander\n" \
   "Copyright (C) 2015, 2020 Google, Inc.\n" \
   "Copyright (C) 2019-2020 Arm Limited\n" \
   "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
   "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
-  "Copyright (C) 2015 Intel Corporation\n" \
+  "Copyright (C) 2015 Intel Corporation\n"
+#define JCOPYRIGHT2 \
   "Copyright (C) 2013-2014 Linaro Limited\n" \
   "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
   "Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \
   "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
   "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
-  "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding"
+  "Copyright (C) 1999 Ken Murchison\n" \
+  "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding\n"
 
 #define JCOPYRIGHT_SHORT \
   "Copyright (C) @COPYRIGHT_YEAR@ The libjpeg-turbo Project and many others"
diff --git a/3rdparty/libjpeg-turbo/src/libjpeg.map.in b/3rdparty/libjpeg-turbo/src/libjpeg.map.in
new file mode 100644
index 000000000000..b4480d834773
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/libjpeg.map.in
@@ -0,0 +1,11 @@
+LIBJPEGTURBO_@JPEG_LIB_VERSION_DECIMAL@ {
+  @MEM_SRCDST_FUNCTIONS@
+  local:
+    jsimd_*;
+    jconst_*;
+};
+
+LIBJPEG_@JPEG_LIB_VERSION_DECIMAL@ {
+  global:
+    *;
+};
diff --git a/3rdparty/libjpeg-turbo/src/libjpeg.txt b/3rdparty/libjpeg-turbo/src/libjpeg.txt
new file mode 100644
index 000000000000..0fe95bb63c43
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/libjpeg.txt
@@ -0,0 +1,3282 @@
+USING THE IJG JPEG LIBRARY
+
+This file was part of the Independent JPEG Group's software:
+Copyright (C) 1994-2013, Thomas G. Lane, Guido Vollbeding.
+Lossless JPEG Modifications:
+Copyright (C) 1999, Ken Murchison.
+libjpeg-turbo Modifications:
+Copyright (C) 2010, 2014-2018, 2020, 2022-2023, D. R. Commander.
+Copyright (C) 2015, Google, Inc.
+For conditions of distribution and use, see the accompanying README.ijg file.
+
+
+This file describes how to use the IJG JPEG library within an application
+program.  Read it if you want to write a program that uses the library.
+
+The file example.c provides heavily commented code for calling the JPEG
+library.  Also see jpeglib.h (the include file to be used by application
+programs) for full details about data structures and function parameter lists.
+The library source code, of course, is the ultimate reference.
+
+Note that there have been *major* changes from the application interface
+presented by IJG version 4 and earlier versions.  The old design had several
+inherent limitations, and it had accumulated a lot of cruft as we added
+features while trying to minimize application-interface changes.  We have
+sacrificed backward compatibility in the version 5 rewrite, but we think the
+improvements justify this.
+
+
+TABLE OF CONTENTS
+-----------------
+
+Overview:
+        Functions provided by the library
+        12-bit and 16-bit Data Precision
+        Outline of typical usage
+Basic library usage:
+        Data formats
+        Compression details
+        Decompression details
+        Partial image decompression
+        Mechanics of usage: include files, linking, etc
+Advanced features:
+        Compression parameter selection
+        Decompression parameter selection
+        Special color spaces
+        Error handling
+        Compressed data handling (source and destination managers)
+        I/O suspension
+        Progressive JPEG support
+        Buffered-image mode
+        Abbreviated datastreams and multiple images
+        Special markers
+        ICC profiles
+        Raw (downsampled) image data
+        Really raw data: DCT coefficients
+        Progress monitoring
+        Memory management
+        Memory usage
+        Library compile-time options
+        Portability considerations
+
+You should read at least the overview and basic usage sections before trying
+to program with the library.  The sections on advanced features can be read
+if and when you need them.
+
+
+OVERVIEW
+========
+
+Functions provided by the library
+---------------------------------
+
+The IJG JPEG library provides C code to read and write JPEG-compressed image
+files.  The surrounding application program receives or supplies image data a
+scanline at a time, using a straightforward uncompressed image format.  All
+details of color conversion and other preprocessing/postprocessing can be
+handled by the library.
+
+The library includes a substantial amount of code that is not covered by the
+JPEG standard but is necessary for typical applications of JPEG.  These
+functions preprocess the image before JPEG compression or postprocess it after
+decompression.  They include colorspace conversion, downsampling/upsampling,
+and color quantization.  The application indirectly selects use of this code
+by specifying the format in which it wishes to supply or receive image data.
+For example, if colormapped output is requested, then the decompression
+library automatically invokes color quantization.
+
+A wide range of quality vs. speed tradeoffs are possible in JPEG processing,
+and even more so in decompression postprocessing.  The decompression library
+provides multiple implementations that cover most of the useful tradeoffs,
+ranging from very-high-quality down to fast-preview operation.  On the
+compression side we have generally not provided low-quality choices, since
+compression is normally less time-critical.  It should be understood that the
+low-quality modes may not meet the JPEG standard's accuracy requirements;
+nonetheless, they are useful for viewers.
+
+A word about functions *not* provided by the library.  We handle a subset of
+the ISO JPEG standard; most baseline, extended-sequential, and progressive
+JPEG processes are supported.  (Our subset includes all features now in common
+use.)  Unsupported ISO options include:
+        * Hierarchical storage
+        * DNL marker
+        * Nonintegral subsampling ratios
+We support 8-bit (lossy and lossless), 12-bit (lossy and lossless), and 16-bit
+(lossless) data precision.
+
+By itself, the library handles only interchange JPEG datastreams --- in
+particular the widely used JFIF file format.  The library can be used by
+surrounding code to process interchange or abbreviated JPEG datastreams that
+are embedded in more complex file formats.  (For example, this library is
+used by the free LIBTIFF library to support JPEG compression in TIFF.)
+
+
+12-bit and 16-bit Data Precision
+--------------------------------
+
+The JPEG standard provides for baseline 8-bit and 12-bit DCT processes as well
+as 8-bit, 12-bit, and 16-bit lossless (predictive) processes.  This code
+supports 12-bit-per-component lossy or lossless JPEG if you set
+cinfo->data_precision to 12 and 16-bit-per-component lossless JPEG if you set
+cinfo->data_precision to 16.  Note that this causes the sample size to be
+larger than a char, so it affects the surrounding application's image data.
+The sample applications cjpeg and djpeg can support 12-bit mode only for PPM,
+PGM, and GIF file formats and 16-bit mode only for PPM and PGM file formats.
+
+Note that, when 12-bit data precision is enabled, the library always compresses
+in Huffman optimization mode, in order to generate valid Huffman tables.  This
+is necessary because our default Huffman tables only cover 8-bit data.  If you
+need to output 12-bit files in one pass, you'll have to supply suitable default
+Huffman tables.  You may also want to supply your own DCT quantization tables;
+the existing quality-scaling code has been developed for 8-bit use, and
+probably doesn't generate especially good tables for 12-bit.
+
+Functions that are specific to 12-bit data precision have a prefix of "jpeg12_"
+instead of "jpeg_" and use the following data types and macros:
+
+  * J12SAMPLE instead of JSAMPLE
+  * J12SAMPROW instead of JSAMPROW
+  * J12SAMPARRAY instead of JSAMPARRAY
+  * J12SAMPIMAGE instead of JSAMPIMAGE
+  * MAXJ12SAMPLE instead of MAXJSAMPLE
+  * CENTERJ12SAMPLE instead of CENTERJSAMPLE
+
+Functions that are specific to 16-bit data precision have a prefix of "jpeg16_"
+instead of "jpeg_" and use the following data types and macros:
+
+  * J16SAMPLE instead of JSAMPLE
+  * J16SAMPROW instead of JSAMPROW
+  * J16SAMPARRAY instead of JSAMPARRAY
+  * J16SAMPIMAGE instead of JSAMPIMAGE
+  * MAXJ16SAMPLE instead of MAXJSAMPLE
+  * CENTERJ16SAMPLE instead of CENTERJSAMPLE
+
+This allows 8-bit, 12-bit, and 16-bit data precision to be used in a single
+application.  (Refer to example.c).  Arithmetic coding and SIMD acceleration
+are not currently implemented for 12-bit data precision, nor are they
+implemented for lossless mode with any data precision.
+
+Refer to the descriptions of the data_precision compression and decompression
+parameters below for further information.
+
+This documentation uses "J*SAMPLE", "J*SAMPROW", "J*SAMPARRAY", and
+"J*SAMPIMAGE" to generically refer to the 8-bit, 12-bit, or 16-bit data types.
+
+
+Outline of typical usage
+------------------------
+
+The rough outline of a JPEG compression operation is:
+
+        Allocate and initialize a JPEG compression object
+        Specify the destination for the compressed data (eg, a file)
+        Set parameters for compression, including image size & colorspace
+        jpeg_start_compress(...);
+        while (scan lines remain to be written)
+                jpeg_write_scanlines(...);  /* Use jpeg12_write_scanlines() for
+                                               12-bit data precision and
+                                               jpeg16_write_scanlines() for
+                                               16-bit data precision. */
+        jpeg_finish_compress(...);
+        Release the JPEG compression object
+
+A JPEG compression object holds parameters and working state for the JPEG
+library.  We make creation/destruction of the object separate from starting
+or finishing compression of an image; the same object can be re-used for a
+series of image compression operations.  This makes it easy to re-use the
+same parameter settings for a sequence of images.  Re-use of a JPEG object
+also has important implications for processing abbreviated JPEG datastreams,
+as discussed later.
+
+The image data to be compressed is supplied to jpeg*_write_scanlines() from
+in-memory buffers.  If the application is doing file-to-file compression,
+reading image data from the source file is the application's responsibility.
+The library emits compressed data by calling a "data destination manager",
+which typically will write the data into a file; but the application can
+provide its own destination manager to do something else.
+
+Similarly, the rough outline of a JPEG decompression operation is:
+
+        Allocate and initialize a JPEG decompression object
+        Specify the source of the compressed data (eg, a file)
+        Call jpeg_read_header() to obtain image info
+        Set parameters for decompression
+        jpeg_start_decompress(...);
+        while (scan lines remain to be read)
+                jpeg_read_scanlines(...);  /* Use jpeg12_read_scanlines() for
+                                              12-bit data precision and
+                                              jpeg16_read_scanlines() for
+                                              16-bit data precision. */
+        jpeg_finish_decompress(...);
+        Release the JPEG decompression object
+
+This is comparable to the compression outline except that reading the
+datastream header is a separate step.  This is helpful because information
+about the image's size, colorspace, etc is available when the application
+selects decompression parameters.  For example, the application can choose an
+output scaling ratio that will fit the image into the available screen size.
+
+The decompression library obtains compressed data by calling a data source
+manager, which typically will read the data from a file; but other behaviors
+can be obtained with a custom source manager.  Decompressed data is delivered
+into in-memory buffers passed to jpeg*_read_scanlines().
+
+It is possible to abort an incomplete compression or decompression operation
+by calling jpeg_abort(); or, if you do not need to retain the JPEG object,
+simply release it by calling jpeg_destroy().
+
+JPEG compression and decompression objects are two separate struct types.
+However, they share some common fields, and certain routines such as
+jpeg_destroy() can work on either type of object.
+
+The JPEG library has no static variables: all state is in the compression
+or decompression object.  Therefore it is possible to process multiple
+compression and decompression operations concurrently, using multiple JPEG
+objects.
+
+Both compression and decompression can be done in an incremental memory-to-
+memory fashion, if suitable source/destination managers are used.  See the
+section on "I/O suspension" for more details.
+
+
+BASIC LIBRARY USAGE
+===================
+
+Data formats
+------------
+
+Before diving into procedural details, it is helpful to understand the
+image data format that the JPEG library expects or returns.
+
+The standard input image format is a rectangular array of pixels, with each
+pixel having the same number of "component" or "sample" values (color
+channels).  You must specify how many components there are and the colorspace
+interpretation of the components.  Most applications will use RGB data
+(three components per pixel) or grayscale data (one component per pixel).
+PLEASE NOTE THAT RGB DATA IS THREE SAMPLES PER PIXEL, GRAYSCALE ONLY ONE.
+A remarkable number of people manage to miss this, only to find that their
+programs don't work with grayscale JPEG files.
+
+There is no provision for colormapped input.  JPEG files are always full-color
+or full grayscale (or sometimes another colorspace such as CMYK).  You can
+feed in a colormapped image by expanding it to full-color format.  However
+JPEG often doesn't work very well with source data that has been colormapped,
+because of dithering noise.  This is discussed in more detail in the JPEG FAQ
+and the other references mentioned in the README.ijg file.
+
+Pixels are stored by scanlines, with each scanline running from left to
+right.  The component values for each pixel are adjacent in the row; for
+example, R,G,B,R,G,B,R,G,B,... for 24-bit RGB color.  Each scanline is an
+array of data type JSAMPLE or J12SAMPLE --- which is typically "unsigned char"
+or "short" (respectively), unless you've changed jmorecfg.h.  (You can also
+change the RGB pixel layout, say to B,G,R order, by modifying jmorecfg.h.  But
+see the restrictions listed in that file before doing so.)
+
+A 2-D array of pixels is formed by making a list of pointers to the starts of
+scanlines; so the scanlines need not be physically adjacent in memory.  Even
+if you process just one scanline at a time, you must make a one-element
+pointer array to conform to this structure.  Pointers to J*SAMPLE rows are of
+type J*SAMPROW, and the pointer to the pointer array is of type J*SAMPARRAY.
+
+The library accepts or supplies one or more complete scanlines per call.
+It is not possible to process part of a row at a time.  Scanlines are always
+processed top-to-bottom.  You can process an entire image in one call if you
+have it all in memory, but usually it's simplest to process one scanline at
+a time.
+
+For best results, source data values should have the precision specified by
+cinfo->data_precision (normally 8 bits).  For instance, if you choose to
+compress data that's only 6 bits/channel, you should left-justify each value in
+a byte before passing it to the compressor.  If you need to compress data
+that has more than 8 bits/channel, set cinfo->data_precision = 12 or 16.
+
+
+The data format returned by the decompressor is the same in all details,
+except that colormapped output is supported.  (Again, a JPEG file is never
+colormapped.  But you can ask the decompressor to perform on-the-fly color
+quantization to deliver colormapped output.)  If you request colormapped
+output then the returned data array contains a single J*SAMPLE per pixel;
+its value is an index into a color map.  The color map is represented as
+a 2-D J*SAMPARRAY in which each row holds the values of one color component,
+that is, colormap[i][j] is the value of the i'th color component for pixel
+value (map index) j.  Note that since the colormap indexes are stored in
+J*SAMPLEs, the maximum number of colors is limited by the size of J*SAMPLE
+(ie, at most 256 colors for 8-bit data precision, 4096 colors for 12-bit data
+precision, and 65536 colors for 16-bit data precision).
+
+
+Compression details
+-------------------
+
+Here we revisit the JPEG compression outline given in the overview.
+
+1. Allocate and initialize a JPEG compression object.
+
+A JPEG compression object is a "struct jpeg_compress_struct".  (It also has
+a bunch of subsidiary structures which are allocated via malloc(), but the
+application doesn't control those directly.)  This struct can be just a local
+variable in the calling routine, if a single routine is going to execute the
+whole JPEG compression sequence.  Otherwise it can be static or allocated
+from malloc().
+
+You will also need a structure representing a JPEG error handler.  The part
+of this that the library cares about is a "struct jpeg_error_mgr".  If you
+are providing your own error handler, you'll typically want to embed the
+jpeg_error_mgr struct in a larger structure; this is discussed later under
+"Error handling".  For now we'll assume you are just using the default error
+handler.  The default error handler will print JPEG error/warning messages
+on stderr, and it will call exit() if a fatal error occurs.
+
+You must initialize the error handler structure, store a pointer to it into
+the JPEG object's "err" field, and then call jpeg_create_compress() to
+initialize the rest of the JPEG object.
+
+Typical code for this step, if you are using the default error handler, is
+
+        struct jpeg_compress_struct cinfo;
+        struct jpeg_error_mgr jerr;
+        ...
+        cinfo.err = jpeg_std_error(&jerr);
+        jpeg_create_compress(&cinfo);
+
+jpeg_create_compress allocates a small amount of memory, so it could fail
+if you are out of memory.  In that case it will exit via the error handler;
+that's why the error handler must be initialized first.
+
+
+2. Specify the destination for the compressed data (eg, a file).
+
+As previously mentioned, the JPEG library delivers compressed data to a
+"data destination" module.  The library includes one data destination
+module which knows how to write to a stdio stream.  You can use your own
+destination module if you want to do something else, as discussed later.
+
+If you use the standard destination module, you must open the target stdio
+stream beforehand.  Typical code for this step looks like:
+
+        FILE *outfile;
+        ...
+        if ((outfile = fopen(filename, "wb")) == NULL) {
+            fprintf(stderr, "can't open %s\n", filename);
+            exit(1);
+        }
+        jpeg_stdio_dest(&cinfo, outfile);
+
+where the last line invokes the standard destination module.
+
+WARNING: it is critical that the binary compressed data be delivered to the
+output file unchanged.  On non-Unix systems the stdio library may perform
+newline translation or otherwise corrupt binary data.  To suppress this
+behavior, you may need to use a "b" option to fopen (as shown above), or use
+setmode() or another routine to put the stdio stream in binary mode.  See
+cjpeg.c and djpeg.c for code that has been found to work on many systems.
+
+You can select the data destination after setting other parameters (step 3),
+if that's more convenient.  You may not change the destination between
+calling jpeg_start_compress() and jpeg_finish_compress().
+
+
+3. Set parameters for compression, including image size & colorspace.
+
+You must supply information about the source image by setting the following
+fields in the JPEG object (cinfo structure):
+
+        image_width             Width of image, in pixels
+        image_height            Height of image, in pixels
+        input_components        Number of color channels (samples per pixel)
+        in_color_space          Color space of source image
+
+The image dimensions are, hopefully, obvious.  JPEG supports image dimensions
+of 1 to 64K pixels in either direction.  The input color space is typically
+RGB or grayscale, and input_components is 3 or 1 accordingly.  (See "Special
+color spaces", later, for more info.)  The in_color_space field must be
+assigned one of the J_COLOR_SPACE enum constants, typically JCS_RGB or
+JCS_GRAYSCALE.
+
+JPEG has a large number of compression parameters that determine how the
+image is encoded.  Most applications don't need or want to know about all
+these parameters.  You can set all the parameters to reasonable defaults by
+calling jpeg_set_defaults(); then, if there are particular values you want
+to change, you can do so after that.  The "Compression parameter selection"
+section tells about all the parameters.
+
+You must set in_color_space correctly before calling jpeg_set_defaults(),
+because the defaults depend on the source image colorspace.  However the
+other three source image parameters need not be valid until you call
+jpeg_start_compress().  There's no harm in calling jpeg_set_defaults() more
+than once, if that happens to be convenient.
+
+Typical code for a 24-bit RGB source image is
+
+        cinfo.image_width = Width;      /* image width and height, in pixels */
+        cinfo.image_height = Height;
+        cinfo.input_components = 3;     /* # of color components per pixel */
+        cinfo.in_color_space = JCS_RGB; /* colorspace of input image */
+
+        jpeg_set_defaults(&cinfo);
+        /* Make optional parameter settings here */
+
+
+4. jpeg_start_compress(...);
+
+After you have established the data destination and set all the necessary
+source image info and other parameters, call jpeg_start_compress() to begin
+a compression cycle.  This will initialize internal state, allocate working
+storage, and emit the first few bytes of the JPEG datastream header.
+
+Typical code:
+
+        jpeg_start_compress(&cinfo, TRUE);
+
+The "TRUE" parameter ensures that a complete JPEG interchange datastream
+will be written.  This is appropriate in most cases.  If you think you might
+want to use an abbreviated datastream, read the section on abbreviated
+datastreams, below.
+
+Once you have called jpeg_start_compress(), you may not alter any JPEG
+parameters or other fields of the JPEG object until you have completed
+the compression cycle.
+
+
+5. while (scan lines remain to be written)
+        jpeg_write_scanlines(...);  /* Use jpeg12_write_scanlines() for 12-bit
+                                       data precision and
+                                       jpeg16_write_scanlines() for 16-bit data
+                                       precision. */
+
+Now write all the required image data by calling jpeg*_write_scanlines()
+one or more times.  You can pass one or more scanlines in each call, up
+to the total image height.  In most applications it is convenient to pass
+just one or a few scanlines at a time.  The expected format for the passed
+data is discussed under "Data formats", above.
+
+Image data should be written in top-to-bottom scanline order.
+Rec. ITU-T T.81 | ISO/IEC 10918-1 says, "Applications determine which edges of
+a source image are defined as top, bottom, left, and right."  However, if you
+want your files to be compatible with everyone else's, then top-to-bottom order
+must be used.  If the source data must be read in bottom-to-top order, then you
+can use the JPEG library's virtual array mechanism to invert the data
+efficiently.  Examples of this can be found in the sample application cjpeg.
+
+The library maintains a count of the number of scanlines written so far
+in the next_scanline field of the JPEG object.  Usually you can just use
+this variable as the loop counter, so that the loop test looks like
+"while (cinfo.next_scanline < cinfo.image_height)".
+
+Code for this step depends heavily on the way that you store the source data.
+example.c shows the following code for the case of a full-size 2-D source
+array containing 3-byte RGB pixels:
+
+        JSAMPROW row_pointer[1];        /* pointer to a single row
+                                           Use J12SAMPROW for 12-bit data
+                                           precision and J16SAMPROW for 16-bit
+                                           data precision. */
+
+        while (cinfo.next_scanline < cinfo.image_height) {
+            row_pointer[0] = image_buffer[cinfo.next_scanline];
+            jpeg_write_scanlines(&cinfo, row_pointer, 1);
+                                        /* Use jpeg12_write_scanlines() for
+                                           12-bit data precision and
+                                           jpeg16_write_scanlines() for 16-bit
+                                           data precision. */
+        }
+
+jpeg*_write_scanlines() returns the number of scanlines actually written.
+This will normally be equal to the number passed in, so you can usually
+ignore the return value.  It is different in just two cases:
+  * If you try to write more scanlines than the declared image height,
+    the additional scanlines are ignored.
+  * If you use a suspending data destination manager, output buffer overrun
+    will cause the compressor to return before accepting all the passed lines.
+    This feature is discussed under "I/O suspension", below.  The normal
+    stdio destination manager will NOT cause this to happen.
+In any case, the return value is the same as the change in the value of
+next_scanline.
+
+
+6. jpeg_finish_compress(...);
+
+After all the image data has been written, call jpeg_finish_compress() to
+complete the compression cycle.  This step is ESSENTIAL to ensure that the
+last bufferload of data is written to the data destination.
+jpeg_finish_compress() also releases working memory associated with the JPEG
+object.
+
+Typical code:
+
+        jpeg_finish_compress(&cinfo);
+
+If using the stdio destination manager, don't forget to close the output
+stdio stream (if necessary) afterwards.
+
+If you have requested a multi-pass operating mode, such as Huffman code
+optimization, jpeg_finish_compress() will perform the additional passes using
+data buffered by the first pass.  In this case jpeg_finish_compress() may take
+quite a while to complete.  With the default compression parameters, this will
+not happen.
+
+It is an error to call jpeg_finish_compress() before writing the necessary
+total number of scanlines.  If you wish to abort compression, call
+jpeg_abort() as discussed below.
+
+After completing a compression cycle, you may dispose of the JPEG object
+as discussed next, or you may use it to compress another image.  In that case
+return to step 2, 3, or 4 as appropriate.  If you do not change the
+destination manager, the new datastream will be written to the same target.
+If you do not change any JPEG parameters, the new datastream will be written
+with the same parameters as before.  Note that you can change the input image
+dimensions freely between cycles, but if you change the input colorspace, you
+should call jpeg_set_defaults() to adjust for the new colorspace; and then
+you'll need to repeat all of step 3.
+
+
+7. Release the JPEG compression object.
+
+When you are done with a JPEG compression object, destroy it by calling
+jpeg_destroy_compress().  This will free all subsidiary memory (regardless of
+the previous state of the object).  Or you can call jpeg_destroy(), which
+works for either compression or decompression objects --- this may be more
+convenient if you are sharing code between compression and decompression
+cases.  (Actually, these routines are equivalent except for the declared type
+of the passed pointer.  To avoid gripes from ANSI C compilers, jpeg_destroy()
+should be passed a j_common_ptr.)
+
+If you allocated the jpeg_compress_struct structure from malloc(), freeing
+it is your responsibility --- jpeg_destroy() won't.  Ditto for the error
+handler structure.
+
+Typical code:
+
+        jpeg_destroy_compress(&cinfo);
+
+
+8. Aborting.
+
+If you decide to abort a compression cycle before finishing, you can clean up
+in either of two ways:
+
+* If you don't need the JPEG object any more, just call
+  jpeg_destroy_compress() or jpeg_destroy() to release memory.  This is
+  legitimate at any point after calling jpeg_create_compress() --- in fact,
+  it's safe even if jpeg_create_compress() fails.
+
+* If you want to re-use the JPEG object, call jpeg_abort_compress(), or call
+  jpeg_abort() which works on both compression and decompression objects.
+  This will return the object to an idle state, releasing any working memory.
+  jpeg_abort() is allowed at any time after successful object creation.
+
+Note that cleaning up the data destination, if required, is your
+responsibility; neither of these routines will call term_destination().
+(See "Compressed data handling", below, for more about that.)
+
+jpeg_destroy() and jpeg_abort() are the only safe calls to make on a JPEG
+object that has reported an error by calling error_exit (see "Error handling"
+for more info).  The internal state of such an object is likely to be out of
+whack.  Either of these two routines will return the object to a known state.
+
+
+Decompression details
+---------------------
+
+Here we revisit the JPEG decompression outline given in the overview.
+
+1. Allocate and initialize a JPEG decompression object.
+
+This is just like initialization for compression, as discussed above,
+except that the object is a "struct jpeg_decompress_struct" and you
+call jpeg_create_decompress().  Error handling is exactly the same.
+
+Typical code:
+
+        struct jpeg_decompress_struct cinfo;
+        struct jpeg_error_mgr jerr;
+        ...
+        cinfo.err = jpeg_std_error(&jerr);
+        jpeg_create_decompress(&cinfo);
+
+(Both here and in the IJG code, we usually use variable name "cinfo" for
+both compression and decompression objects.)
+
+
+2. Specify the source of the compressed data (eg, a file).
+
+As previously mentioned, the JPEG library reads compressed data from a "data
+source" module.  The library includes one data source module which knows how
+to read from a stdio stream.  You can use your own source module if you want
+to do something else, as discussed later.
+
+If you use the standard source module, you must open the source stdio stream
+beforehand.  Typical code for this step looks like:
+
+        FILE *infile;
+        ...
+        if ((infile = fopen(filename, "rb")) == NULL) {
+            fprintf(stderr, "can't open %s\n", filename);
+            exit(1);
+        }
+        jpeg_stdio_src(&cinfo, infile);
+
+where the last line invokes the standard source module.
+
+WARNING: it is critical that the binary compressed data be read unchanged.
+On non-Unix systems the stdio library may perform newline translation or
+otherwise corrupt binary data.  To suppress this behavior, you may need to use
+a "b" option to fopen (as shown above), or use setmode() or another routine to
+put the stdio stream in binary mode.  See cjpeg.c and djpeg.c for code that
+has been found to work on many systems.
+
+You may not change the data source between calling jpeg_read_header() and
+jpeg_finish_decompress().  If you wish to read a series of JPEG images from
+a single source file, you should repeat the jpeg_read_header() to
+jpeg_finish_decompress() sequence without reinitializing either the JPEG
+object or the data source module; this prevents buffered input data from
+being discarded.
+
+
+3. Call jpeg_read_header() to obtain image info.
+
+Typical code for this step is just
+
+        jpeg_read_header(&cinfo, TRUE);
+
+This will read the source datastream header markers, up to the beginning
+of the compressed data proper.  On return, the image dimensions and other
+info have been stored in the JPEG object.  The application may wish to
+consult this information before selecting decompression parameters.
+
+More complex code is necessary if
+  * A suspending data source is used --- in that case jpeg_read_header()
+    may return before it has read all the header data.  See "I/O suspension",
+    below.  The normal stdio source manager will NOT cause this to happen.
+  * Abbreviated JPEG files are to be processed --- see the section on
+    abbreviated datastreams.  Standard applications that deal only in
+    interchange JPEG files need not be concerned with this case either.
+
+It is permissible to stop at this point if you just wanted to find out the
+image dimensions and other header info for a JPEG file.  In that case,
+call jpeg_destroy() when you are done with the JPEG object, or call
+jpeg_abort() to return it to an idle state before selecting a new data
+source and reading another header.
+
+
+4. Set parameters for decompression.
+
+jpeg_read_header() sets appropriate default decompression parameters based on
+the properties of the image (in particular, its colorspace).  However, you
+may well want to alter these defaults before beginning the decompression.
+For example, the default is to produce full color output from a color file.
+If you want colormapped output you must ask for it.  Other options allow the
+returned image to be scaled and allow various speed/quality tradeoffs to be
+selected.  "Decompression parameter selection", below, gives details.
+
+If the defaults are appropriate, nothing need be done at this step.
+
+Note that all default values are set by each call to jpeg_read_header().
+If you reuse a decompression object, you cannot expect your parameter
+settings to be preserved across cycles, as you can for compression.
+You must set desired parameter values each time.
+
+
+5. jpeg_start_decompress(...);
+
+Once the parameter values are satisfactory, call jpeg_start_decompress() to
+begin decompression.  This will initialize internal state, allocate working
+memory, and prepare for returning data.
+
+Typical code is just
+
+        jpeg_start_decompress(&cinfo);
+
+If you have requested a multi-pass operating mode, such as 2-pass color
+quantization, jpeg_start_decompress() will do everything needed before data
+output can begin.  In this case jpeg_start_decompress() may take quite a while
+to complete.  With a single-scan (non progressive) JPEG file and default
+decompression parameters, this will not happen; jpeg_start_decompress() will
+return quickly.
+
+After this call, the final output image dimensions, including any requested
+scaling, are available in the JPEG object; so is the selected colormap, if
+colormapped output has been requested.  Useful fields include
+
+        output_width            image width and height, as scaled
+        output_height
+        out_color_components    # of color components in out_color_space
+        output_components       # of color components returned per pixel
+        colormap                the selected colormap, if any
+        actual_number_of_colors         number of entries in colormap
+
+output_components is 1 (a colormap index) when quantizing colors; otherwise it
+equals out_color_components.  It is the number of J*SAMPLE values that will be
+emitted per pixel in the output arrays.
+
+Typically you will need to allocate data buffers to hold the incoming image.
+You will need output_width * output_components J*SAMPLEs per scanline in your
+output buffer, and a total of output_height scanlines will be returned.
+
+Note: if you are using the JPEG library's internal memory manager to allocate
+data buffers (as djpeg does), then the manager's protocol requires that you
+request large buffers *before* calling jpeg_start_decompress().  This is a
+little tricky since the output_XXX fields are not normally valid then.  You
+can make them valid by calling jpeg_calc_output_dimensions() after setting the
+relevant parameters (scaling, output color space, and quantization flag).
+
+
+6. while (scan lines remain to be read)
+        jpeg_read_scanlines(...);  /* Use jpeg12_read_scanlines() for 12-bit
+                                      data precision and
+                                      jpeg16_read_scanlines() for 16-bit data
+                                      precision. */
+
+Now you can read the decompressed image data by calling jpeg*_read_scanlines()
+one or more times.  At each call, you pass in the maximum number of scanlines
+to be read (ie, the height of your working buffer); jpeg*_read_scanlines()
+will return up to that many lines.  The return value is the number of lines
+actually read.  The format of the returned data is discussed under "Data
+formats", above.  Don't forget that grayscale and color JPEGs will return
+different data formats!
+
+Image data is returned in top-to-bottom scanline order.  If you must write
+out the image in bottom-to-top order, you can use the JPEG library's virtual
+array mechanism to invert the data efficiently.  Examples of this can be
+found in the sample application djpeg.
+
+The library maintains a count of the number of scanlines returned so far
+in the output_scanline field of the JPEG object.  Usually you can just use
+this variable as the loop counter, so that the loop test looks like
+"while (cinfo.output_scanline < cinfo.output_height)".  (Note that the test
+should NOT be against image_height, unless you never use scaling.  The
+image_height field is the height of the original unscaled image.)
+The return value always equals the change in the value of output_scanline.
+
+If you don't use a suspending data source, it is safe to assume that
+jpeg*_read_scanlines() reads at least one scanline per call, until the
+bottom of the image has been reached.
+
+If you use a buffer larger than one scanline, it is NOT safe to assume that
+jpeg*_read_scanlines() fills it.  (The current implementation returns only a
+few scanlines per call, no matter how large a buffer you pass.)  So you must
+always provide a loop that calls jpeg*_read_scanlines() repeatedly until the
+whole image has been read.
+
+
+7. jpeg_finish_decompress(...);
+
+After all the image data has been read, call jpeg_finish_decompress() to
+complete the decompression cycle.  This causes working memory associated
+with the JPEG object to be released.
+
+Typical code:
+
+        jpeg_finish_decompress(&cinfo);
+
+If using the stdio source manager, don't forget to close the source stdio
+stream if necessary.
+
+It is an error to call jpeg_finish_decompress() before reading the correct
+total number of scanlines.  If you wish to abort decompression, call
+jpeg_abort() as discussed below.
+
+After completing a decompression cycle, you may dispose of the JPEG object as
+discussed next, or you may use it to decompress another image.  In that case
+return to step 2 or 3 as appropriate.  If you do not change the source
+manager, the next image will be read from the same source.
+
+
+8. Release the JPEG decompression object.
+
+When you are done with a JPEG decompression object, destroy it by calling
+jpeg_destroy_decompress() or jpeg_destroy().  The previous discussion of
+destroying compression objects applies here too.
+
+Typical code:
+
+        jpeg_destroy_decompress(&cinfo);
+
+
+9. Aborting.
+
+You can abort a decompression cycle by calling jpeg_destroy_decompress() or
+jpeg_destroy() if you don't need the JPEG object any more, or
+jpeg_abort_decompress() or jpeg_abort() if you want to reuse the object.
+The previous discussion of aborting compression cycles applies here too.
+
+
+Partial image decompression
+---------------------------
+
+Partial image decompression is convenient for performance-critical applications
+that wish to view only a portion of a large JPEG image without decompressing
+the whole thing.  It it also useful in memory-constrained environments (such as
+on mobile devices.)  This library provides the following functions to support
+partial image decompression:
+
+1. Skipping rows when decompressing
+
+        jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines);
+                /* Use jpeg12_skip_scanlines() for 12-bit data precision. */
+
+This function provides application programmers with the ability to skip over
+multiple rows in the JPEG image.
+
+Suspending data sources are not supported by this function.  Calling
+jpeg*_skip_scanlines() with a suspending data source will result in undefined
+behavior.  Two-pass color quantization is also not supported by this function.
+Calling jpeg*_skip_scanlines() with two-pass color quantization enabled will
+result in an error.
+
+jpeg*_skip_scanlines() will not allow skipping past the bottom of the image.
+If the value of num_lines is large enough to skip past the bottom of the image,
+then the function will skip to the end of the image instead.
+
+If the value of num_lines is valid, then jpeg*_skip_scanlines() will always
+skip all of the input rows requested.  There is no need to inspect the return
+value of the function in that case.
+
+Best results will be achieved by calling jpeg*_skip_scanlines() for large
+chunks of rows.  The function should be viewed as a way to quickly jump to a
+particular vertical offset in the JPEG image in order to decode a subset of the
+image.  Used in this manner, it will provide significant performance
+improvements.
+
+Calling jpeg*_skip_scanlines() for small values of num_lines has several
+potential drawbacks:
+    1) JPEG decompression occurs in blocks, so if jpeg*_skip_scanlines() is
+       called from the middle of a decompression block, then it is likely that
+       much of the decompression work has already been done for the first
+       couple of rows that need to be skipped.
+    2) When this function returns, it must leave the decompressor in a state
+       such that it is ready to read the next line.  This may involve
+       decompressing a block that must be partially skipped.
+These issues are especially tricky for cases in which upsampling requires
+context rows.  In the worst case, jpeg*_skip_scanlines() will perform similarly
+to jpeg*_read_scanlines() (since it will actually call jpeg*_read_scanlines().)
+
+2. Decompressing partial scanlines
+
+        jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                            JDIMENSION *width)
+                /* Use jpeg12_crop_scanline() for 12-bit data precision. */
+
+This function provides application programmers with the ability to decompress
+only a portion of each row in the JPEG image.  It must be called after
+jpeg_start_decompress() and before any calls to jpeg*_read_scanlines() or
+jpeg*_skip_scanlines().
+
+If xoffset and width do not form a valid subset of the image row, then this
+function will generate an error.  Note that if the output image is scaled, then
+xoffset and width are relative to the scaled image dimensions.
+
+xoffset and width are passed by reference because xoffset must fall on an iMCU
+boundary.  If it doesn't, then it will be moved left to the nearest iMCU
+boundary, and width will be increased accordingly.  If the calling program does
+not like the adjusted values of xoffset and width, then it can call
+jpeg*_crop_scanline() again with new values (for instance, if it wants to move
+xoffset to the nearest iMCU boundary to the right instead of to the left.)
+
+After calling this function, cinfo->output_width will be set to the adjusted
+width.  This value should be used when allocating an output buffer to pass to
+jpeg*_read_scanlines().
+
+The output image from a partial-width decompression will be identical to the
+corresponding image region from a full decode, with one exception:  The "fancy"
+(smooth) h2v2 (4:2:0) and h2v1 (4:2:2) upsampling algorithms fill in the
+missing chroma components by averaging the chroma components from neighboring
+pixels, except on the right and left edges of the image (where there are no
+neighboring pixels.)  When performing a partial-width decompression, these
+"fancy" upsampling algorithms may treat the left and right edges of the partial
+image region as if they are the left and right edges of the image, meaning that
+the upsampling algorithm may be simplified.  The result is that the pixels on
+the left or right edge of the partial image may not be exactly identical to the
+corresponding pixels in the original image.
+
+
+Mechanics of usage: include files, linking, etc
+-----------------------------------------------
+
+Applications using the JPEG library should include the header file jpeglib.h
+to obtain declarations of data types and routines.  Before including
+jpeglib.h, include system headers that define at least the typedefs FILE and
+size_t.  On ANSI-conforming systems, including <stdio.h> is sufficient; on
+older Unix systems, you may need <sys/types.h> to define size_t.
+
+If the application needs to refer to individual JPEG library error codes, also
+include jerror.h to define those symbols.
+
+jpeglib.h indirectly includes the files jconfig.h and jmorecfg.h.  If you are
+installing the JPEG header files in a system directory, you will want to
+install all four files: jpeglib.h, jerror.h, jconfig.h, jmorecfg.h.
+
+The most convenient way to include the JPEG code into your executable program
+is to prepare a library file ("libjpeg.a", or a corresponding name on non-Unix
+machines) and reference it at your link step.  If you use only half of the
+library (only compression or only decompression), only that much code will be
+included from the library, unless your linker is hopelessly brain-damaged.
+The supplied build system builds libjpeg.a automatically.
+
+It may be worth pointing out that the core JPEG library does not actually
+require the stdio library: only the default source/destination managers and
+error handler need it.  You can use the library in a stdio-less environment
+if you replace those modules and use jmemnobs.c (or another memory manager of
+your own devising).  More info about the minimum system library requirements
+may be found in jinclude.h.
+
+
+ADVANCED FEATURES
+=================
+
+Compression parameter selection
+-------------------------------
+
+This section describes all the optional parameters you can set for JPEG
+compression, as well as the "helper" routines provided to assist in this
+task.  Proper setting of some parameters requires detailed understanding
+of the JPEG standard; if you don't know what a parameter is for, it's best
+not to mess with it!  See REFERENCES in the README.ijg file for pointers to
+more info about JPEG.
+
+It's a good idea to call jpeg_set_defaults() first, even if you plan to set
+all the parameters; that way your code is more likely to work with future JPEG
+libraries that have additional parameters.  For the same reason, we recommend
+you use a helper routine where one is provided, in preference to twiddling
+cinfo fields directly.
+
+The helper routines are:
+
+jpeg_set_defaults (j_compress_ptr cinfo)
+        This routine sets all JPEG parameters to reasonable defaults, using
+        only the input image's color space (field in_color_space, which must
+        already be set in cinfo).  Many applications will only need to use
+        this routine and perhaps jpeg_set_quality().
+
+jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
+        Sets the JPEG file's colorspace (field jpeg_color_space) as specified,
+        and sets other color-space-dependent parameters appropriately.  See
+        "Special color spaces", below, before using this.  A large number of
+        parameters, including all per-component parameters, are set by this
+        routine; if you want to twiddle individual parameters you should call
+        jpeg_set_colorspace() before rather than after.
+
+jpeg_default_colorspace (j_compress_ptr cinfo)
+        Selects an appropriate JPEG colorspace based on cinfo->in_color_space,
+        and calls jpeg_set_colorspace().  This is actually a subroutine of
+        jpeg_set_defaults().  It's broken out in case you want to change
+        just the colorspace-dependent JPEG parameters.
+
+jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
+        Constructs JPEG quantization tables appropriate for the indicated
+        quality setting.  The quality value is expressed on the 0..100 scale
+        recommended by IJG (cjpeg's "-quality" switch uses this routine).
+        Note that the exact mapping from quality values to tables may change
+        in future IJG releases as more is learned about DCT quantization.
+        If the force_baseline parameter is TRUE, then the quantization table
+        entries are constrained to the range 1..255 for full JPEG baseline
+        compatibility.  In the current implementation, this only makes a
+        difference for quality settings below 25, and it effectively prevents
+        very small/low quality files from being generated.  The IJG decoder
+        is capable of reading the non-baseline files generated at low quality
+        settings when force_baseline is FALSE, but other decoders may not be.
+
+jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
+                         boolean force_baseline)
+        Same as jpeg_set_quality() except that the generated tables are the
+        sample tables given in Annex K (Clause K.1) of
+        Rec. ITU-T T.81 (1992) | ISO/IEC 10918-1:1994, multiplied by the
+        specified scale factor (which is expressed as a percentage; thus
+        scale_factor = 100 reproduces the spec's tables).  Note that larger
+        scale factors give lower quality.  This entry point is useful for
+        conforming to the Adobe PostScript DCT conventions, but we do not
+        recommend linear scaling as a user-visible quality scale otherwise.
+        force_baseline again constrains the computed table entries to 1..255.
+
+int jpeg_quality_scaling (int quality)
+        Converts a value on the IJG-recommended quality scale to a linear
+        scaling percentage.  Note that this routine may change or go away
+        in future releases --- IJG may choose to adopt a scaling method that
+        can't be expressed as a simple scalar multiplier, in which case the
+        premise of this routine collapses.  Caveat user.
+
+jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+        [libjpeg v7+ API/ABI emulation only]
+        Set default quantization tables with linear q_scale_factor[] values
+        (see below).
+
+jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
+                      const unsigned int *basic_table,
+                      int scale_factor, boolean force_baseline)
+        Allows an arbitrary quantization table to be created.  which_tbl
+        indicates which table slot to fill.  basic_table points to an array
+        of 64 unsigned ints given in normal array order.  These values are
+        multiplied by scale_factor/100 and then clamped to the range 1..65535
+        (or to 1..255 if force_baseline is TRUE).
+        CAUTION: prior to library version 6a, jpeg_add_quant_table expected
+        the basic table to be given in JPEG zigzag order.  If you need to
+        write code that works with either older or newer versions of this
+        routine, you must check the library version number.  Something like
+        "#if JPEG_LIB_VERSION >= 61" is the right test.
+
+jpeg_simple_progression (j_compress_ptr cinfo)
+        Generates a default scan script for writing a progressive-JPEG file.
+        This is the recommended method of creating a progressive file,
+        unless you want to make a custom scan sequence.  You must ensure that
+        the JPEG color space is set correctly before calling this routine.
+
+jpeg_enable_lossless (j_compress_ptr cinfo, int predictor_selection_value,
+                      int point_transform)
+        Enables lossless mode with the specified predictor selection value
+        (1 - 7) and optional point transform (0 - {precision}-1, where
+        {precision} is the JPEG data precision).  A point transform value of 0
+        is necessary in order to create a fully lossless JPEG image.  (A
+        non-zero point transform value right-shifts the input samples by the
+        specified number of bits, which is effectively a form of lossy color
+        quantization.)  In most cases, lossless mode is considerably slower
+        than, and does not compress as effectively as, lossy mode.  Thus, it is
+        typically used only for applications that require mathematically
+        lossless compression.  Note that the following features will be
+        unavailable when compressing or decompressing lossless JPEG images:
+          * Partial image decompression
+          * Quality/quantization table selection
+          * DCT/IDCT algorithm selection
+          * Smoothing
+          * Downsampling/upsampling
+          * Color space conversion (the JPEG image will use the same color
+            space as the input image)
+          * Color quantization
+          * IDCT scaling
+          * Raw (downsampled) data input/output
+          * Transcoding of DCT coefficients
+        Any parameters used to enable or configure those features will be
+        ignored.
+
+        Lossless mode shares no algorithms with lossy mode.  Instead, it uses
+        differential pulse-code modulation (DPCM), an algorithm whereby each
+        sample is encoded as the difference between the sample's value and a
+        "predictor", which is based on the values of neighboring samples.  If
+        Ra is the sample immediately to the left of the current sample, Rb is
+        the sample immediately above the current sample, and Rc is the sample
+        diagonally to the left and above the current sample, then the
+        relationship between the predictor selection value and the predictor is
+        as follows:
+
+        PSV  Predictor
+        --------------
+        1    Ra
+        2    Rb
+        3    Rc
+        4    Ra + Rb – Rc
+        5    Ra + (Rb – Rc) / 2
+        6    Rb + (Ra – Rc) / 2
+        7    (Ra + Rb) / 2
+
+        Predictors 1-3 are 1-dimensional predictors, whereas Predictors 4-7 are
+        2-dimensional predictors.  The best predictor for a particular image
+        depends on the image.
+
+
+Compression parameters (cinfo fields) include:
+
+boolean arith_code
+        If TRUE, use arithmetic coding.
+        If FALSE, use Huffman coding.
+
+int data_precision
+        To create a 12-bit-per-component JPEG file, set data_precision to 12
+        prior to calling jpeg_start_compress() or using the memory manager,
+        then use jpeg12_write_scanlines() or jpeg12_write_raw_data() instead of
+        jpeg_write_scanlines() or jpeg_write_raw_data().  To create a
+        16-bit-per-component lossless JPEG file, set data_precision to 16 prior
+        to calling jpeg_start_compress() or using the memory manager, then use
+        jpeg16_write_scanlines() instead of jpeg_write_scanlines().  Note that
+        16-bit data precision requires lossless mode.  (See
+        jpeg_enable_lossless().)
+
+J_DCT_METHOD dct_method
+        Selects the algorithm used for the DCT step.  Choices are:
+                JDCT_ISLOW: accurate integer method
+                JDCT_IFAST: less accurate integer method [legacy feature]
+                JDCT_FLOAT: floating-point method [legacy feature]
+                JDCT_DEFAULT: default method (normally JDCT_ISLOW)
+                JDCT_FASTEST: fastest method (normally JDCT_IFAST)
+        When the Independent JPEG Group's software was first released in 1991,
+        the compression time for a 1-megapixel JPEG image on a mainstream PC
+        was measured in minutes.  Thus, JDCT_IFAST provided noticeable
+        performance benefits.  On modern CPUs running libjpeg-turbo, however,
+        the compression time for a 1-megapixel JPEG image is measured in
+        milliseconds, and thus the performance benefits of JDCT_IFAST are much
+        less noticeable.  On modern x86/x86-64 CPUs that support AVX2
+        instructions, JDCT_IFAST and JDCT_ISLOW have similar performance.  On
+        other types of CPUs, JDCT_IFAST is generally about 5-15% faster than
+        JDCT_ISLOW.
+
+        For quality levels of 90 and below, there should be little or no
+        perceptible quality difference between the two algorithms.  For quality
+        levels above 90, however, the difference between JDCT_IFAST and
+        JDCT_ISLOW becomes more pronounced.  With quality=97, for instance,
+        JDCT_IFAST incurs generally about a 1-3 dB loss in PSNR relative to
+        JDCT_ISLOW, but this can be larger for some images.  Do not use
+        JDCT_IFAST with quality levels above 97.  The algorithm often
+        degenerates at quality=98 and above and can actually produce a more
+        lossy image than if lower quality levels had been used.  Also, in
+        libjpeg-turbo, JDCT_IFAST is not fully accelerated for quality levels
+        above 97, so it will be slower than JDCT_ISLOW.
+
+        JDCT_FLOAT does not produce significantly more accurate results than
+        JDCT_ISLOW, and it is much slower.  JDCT_FLOAT may also give different
+        results on different machines due to varying roundoff behavior, whereas
+        the integer methods should give the same results on all machines.
+
+J_COLOR_SPACE jpeg_color_space
+int num_components
+        The JPEG color space and corresponding number of components; see
+        "Special color spaces", below, for more info.  We recommend using
+        jpeg_set_color_space() if you want to change these.
+
+boolean optimize_coding
+        TRUE causes the compressor to compute optimal Huffman coding tables
+        for the image.  This requires an extra pass over the data and
+        therefore costs a good deal of space and time.  The default is
+        FALSE, which tells the compressor to use the supplied or default
+        Huffman tables.  In most cases optimal tables save only a few percent
+        of file size compared to the default tables.  Note that when this is
+        TRUE, you need not supply Huffman tables at all, and any you do
+        supply will be overwritten.
+
+unsigned int restart_interval
+int restart_in_rows
+        To emit restart markers in the JPEG file, set one of these nonzero.
+        Set restart_interval to specify the exact interval in MCU blocks
+        (samples in lossless mode).  Set restart_in_rows to specify the
+        interval in MCU rows.  (If restart_in_rows is not 0, then
+        restart_interval is set after the image width in MCUs is computed.)
+        Defaults are zero (no restarts).  One restart marker per MCU row is
+        often a good choice.  NOTE: the overhead of restart markers is higher
+        in grayscale JPEG files than in color files, and MUCH higher in
+        progressive JPEGs.  If you use restarts, you may want to use larger
+        intervals in those cases.
+
+const jpeg_scan_info *scan_info
+int num_scans
+        By default, scan_info is NULL; this causes the compressor to write a
+        single-scan sequential JPEG file.  If not NULL, scan_info points to
+        an array of scan definition records of length num_scans.  The
+        compressor will then write a JPEG file having one scan for each scan
+        definition record.  This is used to generate noninterleaved or
+        progressive JPEG files.  The library checks that the scan array
+        defines a valid JPEG scan sequence.  (jpeg_simple_progression creates
+        a suitable scan definition array for progressive JPEG.)  This is
+        discussed further under "Progressive JPEG support".
+
+int smoothing_factor
+        If non-zero, the input image is smoothed; the value should be 1 for
+        minimal smoothing to 100 for maximum smoothing.  Consult jcsample.c
+        for details of the smoothing algorithm.  The default is zero.
+
+boolean write_JFIF_header
+        If TRUE, a JFIF APP0 marker is emitted.  jpeg_set_defaults() and
+        jpeg_set_colorspace() set this TRUE if a JFIF-legal JPEG color space
+        (ie, YCbCr or grayscale) is selected, otherwise FALSE.
+
+UINT8 JFIF_major_version
+UINT8 JFIF_minor_version
+        The version number to be written into the JFIF marker.
+        jpeg_set_defaults() initializes the version to 1.01 (major=minor=1).
+        You should set it to 1.02 (major=1, minor=2) if you plan to write
+        any JFIF 1.02 extension markers.
+
+UINT8 density_unit
+UINT16 X_density
+UINT16 Y_density
+        The resolution information to be written into the JFIF marker;
+        not used otherwise.  density_unit may be 0 for unknown,
+        1 for dots/inch, or 2 for dots/cm.  The default values are 0,1,1
+        indicating square pixels of unknown size.
+
+boolean write_Adobe_marker
+        If TRUE, an Adobe APP14 marker is emitted.  jpeg_set_defaults() and
+        jpeg_set_colorspace() set this TRUE if JPEG color space RGB, CMYK,
+        or YCCK is selected, otherwise FALSE.  It is generally a bad idea
+        to set both write_JFIF_header and write_Adobe_marker.  In fact,
+        you probably shouldn't change the default settings at all --- the
+        default behavior ensures that the JPEG file's color space can be
+        recognized by the decoder.
+
+JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS]
+        Pointers to coefficient quantization tables, one per table slot,
+        or NULL if no table is defined for a slot.  Usually these should
+        be set via one of the above helper routines; jpeg_add_quant_table()
+        is general enough to define any quantization table.  The other
+        routines will set up table slot 0 for luminance quality and table
+        slot 1 for chrominance.
+
+int q_scale_factor[NUM_QUANT_TBLS]
+        [libjpeg v7+ API/ABI emulation only]
+        Linear quantization scaling factors (0-100, default 100)
+        for use with jpeg_default_qtables().
+        See rdswitch.c and cjpeg.c for an example of usage.
+        Note that the q_scale_factor[] values use "linear" scales, so JPEG
+        quality levels chosen by the user must be converted to these scales
+        using jpeg_quality_scaling().  Here is an example that corresponds to
+        cjpeg -quality 90,70:
+
+                jpeg_set_defaults(cinfo);
+
+                /* Set luminance quality 90. */
+                cinfo->q_scale_factor[0] = jpeg_quality_scaling(90);
+                /* Set chrominance quality 70. */
+                cinfo->q_scale_factor[1] = jpeg_quality_scaling(70);
+
+                jpeg_default_qtables(cinfo, force_baseline);
+
+        CAUTION: Setting separate quality levels for chrominance and luminance
+        is mainly only useful if chrominance subsampling is disabled.  2x2
+        chrominance subsampling (AKA "4:2:0") is the default, but you can
+        explicitly disable subsampling as follows:
+
+                cinfo->comp_info[0].v_samp_factor = 1;
+                cinfo->comp_info[0].h_samp_factor = 1;
+
+JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS]
+JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS]
+        Pointers to Huffman coding tables, one per table slot, or NULL if
+        no table is defined for a slot.  Slots 0 and 1 are filled with the
+        JPEG sample tables by jpeg_set_defaults().  If you need to allocate
+        more table structures, jpeg_alloc_huff_table() may be used.
+        Note that optimal Huffman tables can be computed for an image
+        by setting optimize_coding, as discussed above; there's seldom
+        any need to mess with providing your own Huffman tables.
+
+
+[libjpeg v7+ API/ABI emulation only]
+The actual dimensions of the JPEG image that will be written to the file are
+given by the following fields.  These are computed from the input image
+dimensions and the compression parameters by jpeg_start_compress().  You can
+also call jpeg_calc_jpeg_dimensions() to obtain the values that will result
+from the current parameter settings.  This can be useful if you are trying
+to pick a scaling ratio that will get close to a desired target size.
+
+JDIMENSION jpeg_width           Actual dimensions of output image.
+JDIMENSION jpeg_height
+
+
+Per-component parameters are stored in the struct cinfo.comp_info[i] for
+component number i.  Note that components here refer to components of the
+JPEG color space, *not* the source image color space.  A suitably large
+comp_info[] array is allocated by jpeg_set_defaults(); if you choose not
+to use that routine, it's up to you to allocate the array.
+
+int component_id
+        The one-byte identifier code to be recorded in the JPEG file for
+        this component.  For the standard color spaces, we recommend you
+        leave the default values alone.
+
+int h_samp_factor
+int v_samp_factor
+        Horizontal and vertical sampling factors for the component; must
+        be 1..4 according to the JPEG standard.  Note that larger sampling
+        factors indicate a higher-resolution component; many people find
+        this behavior quite unintuitive.  The default values are 2,2 for
+        luminance components and 1,1 for chrominance components, except
+        for grayscale where 1,1 is used.
+
+int quant_tbl_no
+        Quantization table number for component.  The default value is
+        0 for luminance components and 1 for chrominance components.
+
+int dc_tbl_no
+int ac_tbl_no
+        DC and AC entropy coding table numbers.  The default values are
+        0 for luminance components and 1 for chrominance components.
+
+int component_index
+        Must equal the component's index in comp_info[].  (Beginning in
+        release v6, the compressor library will fill this in automatically;
+        you don't have to.)
+
+
+Decompression parameter selection
+---------------------------------
+
+Decompression parameter selection is somewhat simpler than compression
+parameter selection, since all of the JPEG internal parameters are
+recorded in the source file and need not be supplied by the application.
+(Unless you are working with abbreviated files, in which case see
+"Abbreviated datastreams", below.)  Decompression parameters control
+the postprocessing done on the image to deliver it in a format suitable
+for the application's use.  Many of the parameters control speed/quality
+tradeoffs, in which faster decompression may be obtained at the price of
+a poorer-quality image.  The defaults select the highest quality (slowest)
+processing.
+
+The following fields in the JPEG object are set by jpeg_read_header() and
+may be useful to the application in choosing decompression parameters:
+
+int data_precision                      Data precision (bits per component)
+        If data_precision is 12, then use jpeg12_read_scanlines(),
+        jpeg12_skip_scanlines(), jpeg12_crop_scanline(), and/or
+        jpeg12_read_raw_data() instead of jpeg_read_scanlines(),
+        jpeg_skip_scanlines(), jpeg_crop_scanline(), and/or
+        jpeg_read_raw_data().  If data_precision is 16, then use
+        jpeg16_read_scanlines() instead of jpeg_read_scanlines().
+
+JDIMENSION image_width                  Width and height of image
+JDIMENSION image_height
+int num_components                      Number of color components
+J_COLOR_SPACE jpeg_color_space          Colorspace of image
+boolean saw_JFIF_marker                 TRUE if a JFIF APP0 marker was seen
+  UINT8 JFIF_major_version              Version information from JFIF marker
+  UINT8 JFIF_minor_version
+  UINT8 density_unit                    Resolution data from JFIF marker
+  UINT16 X_density
+  UINT16 Y_density
+boolean saw_Adobe_marker                TRUE if an Adobe APP14 marker was seen
+  UINT8 Adobe_transform                 Color transform code from Adobe marker
+
+The JPEG color space, unfortunately, is something of a guess since the JPEG
+standard proper does not provide a way to record it.  In practice most files
+adhere to the JFIF or Adobe conventions, and the decoder will recognize these
+correctly.  See "Special color spaces", below, for more info.
+
+
+The decompression parameters that determine the basic properties of the
+returned image are:
+
+J_COLOR_SPACE out_color_space
+        Output color space.  jpeg_read_header() sets an appropriate default
+        based on jpeg_color_space; typically it will be RGB or grayscale.
+        The application can change this field to request output in a different
+        colorspace.  For example, set it to JCS_GRAYSCALE to get grayscale
+        output from a color file.  (This is useful for previewing: grayscale
+        output is faster than full color since the color components need not
+        be processed.)  Note that not all possible color space transforms are
+        currently implemented; you may need to extend jdcolor.c if you want an
+        unusual conversion.
+
+unsigned int scale_num, scale_denom
+        Scale the image by the fraction scale_num/scale_denom.  Default is
+        1/1, or no scaling.  Currently, the only supported scaling ratios
+        are M/8 with all M from 1 to 16, or any reduced fraction thereof (such
+        as 1/2, 3/4, etc.)  (The library design allows for arbitrary
+        scaling ratios but this is not likely to be implemented any time soon.)
+        Smaller scaling ratios permit significantly faster decoding since
+        fewer pixels need be processed and a simpler IDCT method can be used.
+
+boolean quantize_colors
+        If set TRUE, colormapped output will be delivered.  Default is FALSE,
+        meaning that full-color output will be delivered.
+
+The next three parameters are relevant only if quantize_colors is TRUE.
+
+int desired_number_of_colors
+        Maximum number of colors to use in generating a library-supplied color
+        map (the actual number of colors is returned in a different field).
+        Default 256.  Ignored when the application supplies its own color map.
+
+boolean two_pass_quantize
+        If TRUE, an extra pass over the image is made to select a custom color
+        map for the image.  This usually looks a lot better than the one-size-
+        fits-all colormap that is used otherwise.  Default is TRUE.  Ignored
+        when the application supplies its own color map.
+
+J_DITHER_MODE dither_mode
+        Selects color dithering method.  Supported values are:
+                JDITHER_NONE    no dithering: fast, very low quality
+                JDITHER_ORDERED ordered dither: moderate speed and quality
+                JDITHER_FS      Floyd-Steinberg dither: slow, high quality
+        Default is JDITHER_FS.  (At present, ordered dither is implemented
+        only in the single-pass, standard-colormap case.  If you ask for
+        ordered dither when two_pass_quantize is TRUE or when you supply
+        an external color map, you'll get F-S dithering.)
+
+When quantize_colors is TRUE, the target color map is described by the next
+two fields.  colormap is set to NULL by jpeg_read_header().  The application
+can supply a color map by setting colormap non-NULL and setting
+actual_number_of_colors to the map size.  Otherwise, jpeg_start_decompress()
+selects a suitable color map and sets these two fields itself.
+[Implementation restriction: at present, an externally supplied colormap is
+only accepted for 3-component output color spaces.]
+
+JSAMPARRAY colormap
+        The color map, represented as a 2-D pixel array of out_color_components
+        rows and actual_number_of_colors columns.  Ignored if not quantizing.
+        CAUTION: if the JPEG library creates its own colormap, the storage
+        pointed to by this field is released by jpeg_finish_decompress().
+        Copy the colormap somewhere else first, if you want to save it.
+        CAUTION: if data_precision is 12 or 16, then this is actually a
+        J12SAMPARRAY or a J16SAMPARRAY, so it must be type-cast in order to
+        read/write 12-bit or 16-bit samples from/to the array.
+
+int actual_number_of_colors
+        The number of colors in the color map.
+
+Additional decompression parameters that the application may set include:
+
+J_DCT_METHOD dct_method
+        Selects the algorithm used for the DCT step.  Choices are:
+                JDCT_ISLOW: accurate integer method
+                JDCT_IFAST: less accurate integer method [legacy feature]
+                JDCT_FLOAT: floating-point method [legacy feature]
+                JDCT_DEFAULT: default method (normally JDCT_ISLOW)
+                JDCT_FASTEST: fastest method (normally JDCT_IFAST)
+        When the Independent JPEG Group's software was first released in 1991,
+        the decompression time for a 1-megapixel JPEG image on a mainstream PC
+        was measured in minutes.  Thus, JDCT_IFAST provided noticeable
+        performance benefits.  On modern CPUs running libjpeg-turbo, however,
+        the decompression time for a 1-megapixel JPEG image is measured in
+        milliseconds, and thus the performance benefits of JDCT_IFAST are much
+        less noticeable.  On modern x86/x86-64 CPUs that support AVX2
+        instructions, JDCT_IFAST and JDCT_ISLOW have similar performance.  On
+        other types of CPUs, JDCT_IFAST is generally about 5-15% faster than
+        JDCT_ISLOW.
+
+        If the JPEG image was compressed using a quality level of 85 or below,
+        then there should be little or no perceptible quality difference
+        between the two algorithms.  When decompressing images that were
+        compressed using quality levels above 85, however, the difference
+        between JDCT_IFAST and JDCT_ISLOW becomes more pronounced.  With images
+        compressed using quality=97, for instance, JDCT_IFAST incurs generally
+        about a 4-6 dB loss in PSNR relative to JDCT_ISLOW, but this can be
+        larger for some images.  If you can avoid it, do not use JDCT_IFAST
+        when decompressing images that were compressed using quality levels
+        above 97.  The algorithm often degenerates for such images and can
+        actually produce a more lossy output image than if the JPEG image had
+        been compressed using lower quality levels.
+
+        JDCT_FLOAT does not produce significantly more accurate results than
+        JDCT_ISLOW, and it is much slower.  JDCT_FLOAT may also give different
+        results on different machines due to varying roundoff behavior, whereas
+        the integer methods should give the same results on all machines.
+
+boolean do_fancy_upsampling
+        If TRUE, do careful upsampling of chroma components.  If FALSE,
+        a faster but sloppier method is used.  Default is TRUE.  The visual
+        impact of the sloppier method is often very small.
+
+boolean do_block_smoothing
+        If TRUE, interblock smoothing is applied in early stages of decoding
+        progressive JPEG files; if FALSE, not.  Default is TRUE.  Early
+        progression stages look "fuzzy" with smoothing, "blocky" without.
+        In any case, block smoothing ceases to be applied after the first few
+        AC coefficients are known to full accuracy, so it is relevant only
+        when using buffered-image mode for progressive images.
+
+boolean enable_1pass_quant
+boolean enable_external_quant
+boolean enable_2pass_quant
+        These are significant only in buffered-image mode, which is
+        described in its own section below.
+
+
+The output image dimensions are given by the following fields.  These are
+computed from the source image dimensions and the decompression parameters
+by jpeg_start_decompress().  You can also call jpeg_calc_output_dimensions()
+to obtain the values that will result from the current parameter settings.
+This can be useful if you are trying to pick a scaling ratio that will get
+close to a desired target size.  It's also important if you are using the
+JPEG library's memory manager to allocate output buffer space, because you
+are supposed to request such buffers *before* jpeg_start_decompress().
+
+JDIMENSION output_width         Actual dimensions of output image.
+JDIMENSION output_height
+int out_color_components        Number of color components in out_color_space.
+int output_components           Number of color components returned.
+int rec_outbuf_height           Recommended height of scanline buffer.
+
+When quantizing colors, output_components is 1, indicating a single color map
+index per pixel.  Otherwise it equals out_color_components.  The output arrays
+are required to be output_width * output_components J*SAMPLEs wide.
+
+rec_outbuf_height is the recommended minimum height (in scanlines) of the
+buffer passed to jpeg*_read_scanlines().  If the buffer is smaller, the
+library will still work, but time will be wasted due to unnecessary data
+copying.  In high-quality modes, rec_outbuf_height is always 1, but some
+faster, lower-quality modes set it to larger values (typically 2 to 4).
+If you are going to ask for a high-speed processing mode, you may as well
+go to the trouble of honoring rec_outbuf_height so as to avoid data copying.
+(An output buffer larger than rec_outbuf_height lines is OK, but won't
+provide any material speed improvement over that height.)
+
+
+Special color spaces
+--------------------
+
+The JPEG standard itself is "color blind" and doesn't specify any particular
+color space.  It is customary to convert color data to a luminance/chrominance
+color space before compressing, since this permits greater compression.  The
+existing de-facto JPEG file format standards specify YCbCr or grayscale data
+(JFIF), or grayscale, RGB, YCbCr, CMYK, or YCCK (Adobe).  For special
+applications such as multispectral images, other color spaces can be used,
+but it must be understood that such files will be unportable.
+
+The JPEG library can handle the most common colorspace conversions (namely
+RGB <=> YCbCr and CMYK <=> YCCK).  It can also deal with data of an unknown
+color space, passing it through without conversion.  If you deal extensively
+with an unusual color space, you can easily extend the library to understand
+additional color spaces and perform appropriate conversions.
+
+For compression, the source data's color space is specified by field
+in_color_space.  This is transformed to the JPEG file's color space given
+by jpeg_color_space.  jpeg_set_defaults() chooses a reasonable JPEG color
+space depending on in_color_space, but you can override this by calling
+jpeg_set_colorspace().  Of course you must select a supported transformation.
+jccolor.c currently supports the following transformations:
+        RGB => YCbCr
+        RGB => GRAYSCALE
+        YCbCr => GRAYSCALE
+        CMYK => YCCK
+plus the null transforms: GRAYSCALE => GRAYSCALE, RGB => RGB,
+YCbCr => YCbCr, CMYK => CMYK, YCCK => YCCK, and UNKNOWN => UNKNOWN.
+
+The de-facto file format standards (JFIF and Adobe) specify APPn markers that
+indicate the color space of the JPEG file.  It is important to ensure that
+these are written correctly, or omitted if the JPEG file's color space is not
+one of the ones supported by the de-facto standards.  jpeg_set_colorspace()
+will set the compression parameters to include or omit the APPn markers
+properly, so long as it is told the truth about the JPEG color space.
+For example, if you are writing some random 3-component color space without
+conversion, don't try to fake out the library by setting in_color_space and
+jpeg_color_space to JCS_YCbCr; use JCS_UNKNOWN.  You may want to write an
+APPn marker of your own devising to identify the colorspace --- see "Special
+markers", below.
+
+When told that the color space is UNKNOWN, the library will default to using
+luminance-quality compression parameters for all color components.  You may
+well want to change these parameters.  See the source code for
+jpeg_set_colorspace(), in jcparam.c, for details.
+
+For decompression, the JPEG file's color space is given in jpeg_color_space,
+and this is transformed to the output color space out_color_space.
+jpeg_read_header's setting of jpeg_color_space can be relied on if the file
+conforms to JFIF or Adobe conventions, but otherwise it is no better than a
+guess.  If you know the JPEG file's color space for certain, you can override
+jpeg_read_header's guess by setting jpeg_color_space.  jpeg_read_header also
+selects a default output color space based on (its guess of) jpeg_color_space;
+set out_color_space to override this.  Again, you must select a supported
+transformation.  jdcolor.c currently supports
+        YCbCr => RGB
+        YCbCr => GRAYSCALE
+        RGB => GRAYSCALE
+        GRAYSCALE => RGB
+        YCCK => CMYK
+as well as the null transforms.  (Since GRAYSCALE=>RGB is provided, an
+application can force grayscale JPEGs to look like color JPEGs if it only
+wants to handle one case.)
+
+The two-pass color quantizer, jquant2.c, is specialized to handle RGB data
+(it weights distances appropriately for RGB colors).  You'll need to modify
+the code if you want to use it for non-RGB output color spaces.  Note that
+jquant2.c is used to map to an application-supplied colormap as well as for
+the normal two-pass colormap selection process.
+
+CAUTION: it appears that Adobe Photoshop writes inverted data in CMYK JPEG
+files: 0 represents 100% ink coverage, rather than 0% ink as you'd expect.
+This is arguably a bug in Photoshop, but if you need to work with Photoshop
+CMYK files, you will have to deal with it in your application.  We cannot
+"fix" this in the library by inverting the data during the CMYK<=>YCCK
+transform, because that would break other applications, notably Ghostscript.
+Photoshop versions prior to 3.0 write EPS files containing JPEG-encoded CMYK
+data in the same inverted-YCCK representation used in bare JPEG files, but
+the surrounding PostScript code performs an inversion using the PS image
+operator.  I am told that Photoshop 3.0 will write uninverted YCCK in
+EPS/JPEG files, and will omit the PS-level inversion.  (But the data
+polarity used in bare JPEG files will not change in 3.0.)  In either case,
+the JPEG library must not invert the data itself, or else Ghostscript would
+read these EPS files incorrectly.
+
+
+Error handling
+--------------
+
+When the default error handler is used, any error detected inside the JPEG
+routines will cause a message to be printed on stderr, followed by exit().
+You can supply your own error handling routines to override this behavior
+and to control the treatment of nonfatal warnings and trace/debug messages.
+The file example.c illustrates the most common case, which is to have the
+application regain control after an error rather than exiting.
+
+The JPEG library never writes any message directly; it always goes through
+the error handling routines.  Three classes of messages are recognized:
+  * Fatal errors: the library cannot continue.
+  * Warnings: the library can continue, but the data is corrupt, and a
+    damaged output image is likely to result.
+  * Trace/informational messages.  These come with a trace level indicating
+    the importance of the message; you can control the verbosity of the
+    program by adjusting the maximum trace level that will be displayed.
+
+You may, if you wish, simply replace the entire JPEG error handling module
+(jerror.c) with your own code.  However, you can avoid code duplication by
+only replacing some of the routines depending on the behavior you need.
+This is accomplished by calling jpeg_std_error() as usual, but then overriding
+some of the method pointers in the jpeg_error_mgr struct, as illustrated by
+example.c.
+
+All of the error handling routines will receive a pointer to the JPEG object
+(a j_common_ptr which points to either a jpeg_compress_struct or a
+jpeg_decompress_struct; if you need to tell which, test the is_decompressor
+field).  This struct includes a pointer to the error manager struct in its
+"err" field.  Frequently, custom error handler routines will need to access
+additional data which is not known to the JPEG library or the standard error
+handler.  The most convenient way to do this is to embed either the JPEG
+object or the jpeg_error_mgr struct in a larger structure that contains
+additional fields; then casting the passed pointer provides access to the
+additional fields.  Again, see example.c for one way to do it.  (Beginning
+with IJG version 6b, there is also a void pointer "client_data" in each
+JPEG object, which the application can also use to find related data.
+The library does not touch client_data at all.)
+
+The individual methods that you might wish to override are:
+
+error_exit (j_common_ptr cinfo)
+        Receives control for a fatal error.  Information sufficient to
+        generate the error message has been stored in cinfo->err; call
+        output_message to display it.  Control must NOT return to the caller;
+        generally this routine will exit() or longjmp() somewhere.
+        Typically you would override this routine to get rid of the exit()
+        default behavior.  Note that if you continue processing, you should
+        clean up the JPEG object with jpeg_abort() or jpeg_destroy().
+
+output_message (j_common_ptr cinfo)
+        Actual output of any JPEG message.  Override this to send messages
+        somewhere other than stderr.  Note that this method does not know
+        how to generate a message, only where to send it.
+
+format_message (j_common_ptr cinfo, char *buffer)
+        Constructs a readable error message string based on the error info
+        stored in cinfo->err.  This method is called by output_message.  Few
+        applications should need to override this method.  One possible
+        reason for doing so is to implement dynamic switching of error message
+        language.
+
+emit_message (j_common_ptr cinfo, int msg_level)
+        Decide whether or not to emit a warning or trace message; if so,
+        calls output_message.  The main reason for overriding this method
+        would be to abort on warnings.  msg_level is -1 for warnings,
+        0 and up for trace messages.
+
+Only error_exit() and emit_message() are called from the rest of the JPEG
+library; the other two are internal to the error handler.
+
+The actual message texts are stored in an array of strings which is pointed to
+by the field err->jpeg_message_table.  The messages are numbered from 0 to
+err->last_jpeg_message, and it is these code numbers that are used in the
+JPEG library code.  You could replace the message texts (for instance, with
+messages in French or German) by changing the message table pointer.  See
+jerror.h for the default texts.  CAUTION: this table will almost certainly
+change or grow from one library version to the next.
+
+It may be useful for an application to add its own message texts that are
+handled by the same mechanism.  The error handler supports a second "add-on"
+message table for this purpose.  To define an addon table, set the pointer
+err->addon_message_table and the message numbers err->first_addon_message and
+err->last_addon_message.  If you number the addon messages beginning at 1000
+or so, you won't have to worry about conflicts with the library's built-in
+messages.  See the sample applications cjpeg/djpeg for an example of using
+addon messages (the addon messages are defined in cderror.h).
+
+Actual invocation of the error handler is done via macros defined in jerror.h:
+        ERREXITn(...)   for fatal errors
+        WARNMSn(...)    for corrupt-data warnings
+        TRACEMSn(...)   for trace and informational messages.
+These macros store the message code and any additional parameters into the
+error handler struct, then invoke the error_exit() or emit_message() method.
+The variants of each macro are for varying numbers of additional parameters.
+The additional parameters are inserted into the generated message using
+standard printf() format codes.
+
+See jerror.h and jerror.c for further details.
+
+
+Compressed data handling (source and destination managers)
+----------------------------------------------------------
+
+The JPEG compression library sends its compressed data to a "destination
+manager" module.  The default destination manager just writes the data to a
+memory buffer or to a stdio stream, but you can provide your own manager to
+do something else.  Similarly, the decompression library calls a "source
+manager" to obtain the compressed data; you can provide your own source
+manager if you want the data to come from somewhere other than a memory
+buffer or a stdio stream.
+
+In both cases, compressed data is processed a bufferload at a time: the
+destination or source manager provides a work buffer, and the library invokes
+the manager only when the buffer is filled or emptied.  (You could define a
+one-character buffer to force the manager to be invoked for each byte, but
+that would be rather inefficient.)  The buffer's size and location are
+controlled by the manager, not by the library.  For example, the memory
+source manager just makes the buffer pointer and length point to the original
+data in memory.  In this case the buffer-reload procedure will be invoked
+only if the decompressor ran off the end of the datastream, which would
+indicate an erroneous datastream.
+
+The work buffer is defined as an array of datatype JOCTET, which is generally
+"char" or "unsigned char".  On a machine where char is not exactly 8 bits
+wide, you must define JOCTET as a wider data type and then modify the data
+source and destination modules to transcribe the work arrays into 8-bit units
+on external storage.
+
+A data destination manager struct contains a pointer and count defining the
+next byte to write in the work buffer and the remaining free space:
+
+        JOCTET *next_output_byte;   /* => next byte to write in buffer */
+        size_t free_in_buffer;      /* # of byte spaces remaining in buffer */
+
+The library increments the pointer and decrements the count until the buffer
+is filled.  The manager's empty_output_buffer method must reset the pointer
+and count.  The manager is expected to remember the buffer's starting address
+and total size in private fields not visible to the library.
+
+A data destination manager provides three methods:
+
+init_destination (j_compress_ptr cinfo)
+        Initialize destination.  This is called by jpeg_start_compress()
+        before any data is actually written.  It must initialize
+        next_output_byte and free_in_buffer.  free_in_buffer must be
+        initialized to a positive value.
+
+empty_output_buffer (j_compress_ptr cinfo)
+        This is called whenever the buffer has filled (free_in_buffer
+        reaches zero).  In typical applications, it should write out the
+        *entire* buffer (use the saved start address and buffer length;
+        ignore the current state of next_output_byte and free_in_buffer).
+        Then reset the pointer & count to the start of the buffer, and
+        return TRUE indicating that the buffer has been dumped.
+        free_in_buffer must be set to a positive value when TRUE is
+        returned.  A FALSE return should only be used when I/O suspension is
+        desired (this operating mode is discussed in the next section).
+
+term_destination (j_compress_ptr cinfo)
+        Terminate destination --- called by jpeg_finish_compress() after all
+        data has been written.  In most applications, this must flush any
+        data remaining in the buffer.  Use either next_output_byte or
+        free_in_buffer to determine how much data is in the buffer.
+
+term_destination() is NOT called by jpeg_abort() or jpeg_destroy().  If you
+want the destination manager to be cleaned up during an abort, you must do it
+yourself.
+
+You will also need code to create a jpeg_destination_mgr struct, fill in its
+method pointers, and insert a pointer to the struct into the "dest" field of
+the JPEG compression object.  This can be done in-line in your setup code if
+you like, but it's probably cleaner to provide a separate routine similar to
+the jpeg_stdio_dest() or jpeg_mem_dest() routines of the supplied destination
+managers.
+
+Decompression source managers follow a parallel design, but with some
+additional frammishes.  The source manager struct contains a pointer and count
+defining the next byte to read from the work buffer and the number of bytes
+remaining:
+
+        const JOCTET *next_input_byte;  /* => next byte to read from buffer */
+        size_t bytes_in_buffer;         /* # of bytes remaining in buffer */
+
+The library increments the pointer and decrements the count until the buffer
+is emptied.  The manager's fill_input_buffer method must reset the pointer and
+count.  In most applications, the manager must remember the buffer's starting
+address and total size in private fields not visible to the library.
+
+A data source manager provides five methods:
+
+init_source (j_decompress_ptr cinfo)
+        Initialize source.  This is called by jpeg_read_header() before any
+        data is actually read.  Unlike init_destination(), it may leave
+        bytes_in_buffer set to 0 (in which case a fill_input_buffer() call
+        will occur immediately).
+
+fill_input_buffer (j_decompress_ptr cinfo)
+        This is called whenever bytes_in_buffer has reached zero and more
+        data is wanted.  In typical applications, it should read fresh data
+        into the buffer (ignoring the current state of next_input_byte and
+        bytes_in_buffer), reset the pointer & count to the start of the
+        buffer, and return TRUE indicating that the buffer has been reloaded.
+        It is not necessary to fill the buffer entirely, only to obtain at
+        least one more byte.  bytes_in_buffer MUST be set to a positive value
+        if TRUE is returned.  A FALSE return should only be used when I/O
+        suspension is desired (this mode is discussed in the next section).
+
+skip_input_data (j_decompress_ptr cinfo, long num_bytes)
+        Skip num_bytes worth of data.  The buffer pointer and count should
+        be advanced over num_bytes input bytes, refilling the buffer as
+        needed.  This is used to skip over a potentially large amount of
+        uninteresting data (such as an APPn marker).  In some applications
+        it may be possible to optimize away the reading of the skipped data,
+        but it's not clear that being smart is worth much trouble; large
+        skips are uncommon.  bytes_in_buffer may be zero on return.
+        A zero or negative skip count should be treated as a no-op.
+
+resync_to_restart (j_decompress_ptr cinfo, int desired)
+        This routine is called only when the decompressor has failed to find
+        a restart (RSTn) marker where one is expected.  Its mission is to
+        find a suitable point for resuming decompression.  For most
+        applications, we recommend that you just use the default resync
+        procedure, jpeg_resync_to_restart().  However, if you are able to back
+        up in the input data stream, or if you have a-priori knowledge about
+        the likely location of restart markers, you may be able to do better.
+        Read the read_restart_marker() and jpeg_resync_to_restart() routines
+        in jdmarker.c if you think you'd like to implement your own resync
+        procedure.
+
+term_source (j_decompress_ptr cinfo)
+        Terminate source --- called by jpeg_finish_decompress() after all
+        data has been read.  Often a no-op.
+
+For both fill_input_buffer() and skip_input_data(), there is no such thing
+as an EOF return.  If the end of the file has been reached, the routine has
+a choice of exiting via ERREXIT() or inserting fake data into the buffer.
+In most cases, generating a warning message and inserting a fake EOI marker
+is the best course of action --- this will allow the decompressor to output
+however much of the image is there.  In pathological cases, the decompressor
+may swallow the EOI and again demand data ... just keep feeding it fake EOIs.
+jdatasrc.c illustrates the recommended error recovery behavior.
+
+term_source() is NOT called by jpeg_abort() or jpeg_destroy().  If you want
+the source manager to be cleaned up during an abort, you must do it yourself.
+
+You will also need code to create a jpeg_source_mgr struct, fill in its method
+pointers, and insert a pointer to the struct into the "src" field of the JPEG
+decompression object.  This can be done in-line in your setup code if you
+like, but it's probably cleaner to provide a separate routine similar to the
+jpeg_stdio_src() or jpeg_mem_src() routines of the supplied source managers.
+
+For more information, consult the memory and stdio source and destination
+managers in jdatasrc.c and jdatadst.c.
+
+
+I/O suspension
+--------------
+
+Some applications need to use the JPEG library as an incremental memory-to-
+memory filter: when the compressed data buffer is filled or emptied, they want
+control to return to the outer loop, rather than expecting that the buffer can
+be emptied or reloaded within the data source/destination manager subroutine.
+The library supports this need by providing an "I/O suspension" mode, which we
+describe in this section.
+
+The I/O suspension mode is not a panacea: nothing is guaranteed about the
+maximum amount of time spent in any one call to the library, so it will not
+eliminate response-time problems in single-threaded applications.  If you
+need guaranteed response time, we suggest you "bite the bullet" and implement
+a real multi-tasking capability.
+
+To use I/O suspension, cooperation is needed between the calling application
+and the data source or destination manager; you will always need a custom
+source/destination manager.  (Please read the previous section if you haven't
+already.)  The basic idea is that the empty_output_buffer() or
+fill_input_buffer() routine is a no-op, merely returning FALSE to indicate
+that it has done nothing.  Upon seeing this, the JPEG library suspends
+operation and returns to its caller.  The surrounding application is
+responsible for emptying or refilling the work buffer before calling the
+JPEG library again.
+
+Compression suspension:
+
+For compression suspension, use an empty_output_buffer() routine that returns
+FALSE; typically it will not do anything else.  This will cause the
+compressor to return to the caller of jpeg*_write_scanlines(), with the return
+value indicating that not all the supplied scanlines have been accepted.
+The application must make more room in the output buffer, adjust the output
+buffer pointer/count appropriately, and then call jpeg*_write_scanlines()
+again, pointing to the first unconsumed scanline.
+
+When forced to suspend, the compressor will backtrack to a convenient stopping
+point (usually the start of the current MCU); it will regenerate some output
+data when restarted.  Therefore, although empty_output_buffer() is only
+called when the buffer is filled, you should NOT write out the entire buffer
+after a suspension.  Write only the data up to the current position of
+next_output_byte/free_in_buffer.  The data beyond that point will be
+regenerated after resumption.
+
+Because of the backtracking behavior, a good-size output buffer is essential
+for efficiency; you don't want the compressor to suspend often.  (In fact, an
+overly small buffer could lead to infinite looping, if a single MCU required
+more data than would fit in the buffer.)  We recommend a buffer of at least
+several Kbytes.  You may want to insert explicit code to ensure that you don't
+call jpeg*_write_scanlines() unless there is a reasonable amount of space in
+the output buffer; in other words, flush the buffer before trying to compress
+more data.
+
+The compressor does not allow suspension while it is trying to write JPEG
+markers at the beginning and end of the file.  This means that:
+  * At the beginning of a compression operation, there must be enough free
+    space in the output buffer to hold the header markers (typically 600 or
+    so bytes).  The recommended buffer size is bigger than this anyway, so
+    this is not a problem as long as you start with an empty buffer.  However,
+    this restriction might catch you if you insert large special markers, such
+    as a JFIF thumbnail image, without flushing the buffer afterwards.
+  * When you call jpeg_finish_compress(), there must be enough space in the
+    output buffer to emit any buffered data and the final EOI marker.  In the
+    current implementation, half a dozen bytes should suffice for this, but
+    for safety's sake we recommend ensuring that at least 100 bytes are free
+    before calling jpeg_finish_compress().
+
+A more significant restriction is that jpeg_finish_compress() cannot suspend.
+This means you cannot use suspension with multi-pass operating modes, namely
+Huffman code optimization and multiple-scan output.  Those modes write the
+whole file during jpeg_finish_compress(), which will certainly result in
+buffer overrun.  (Note that this restriction applies only to compression,
+not decompression.  The decompressor supports input suspension in all of its
+operating modes.)
+
+Decompression suspension:
+
+For decompression suspension, use a fill_input_buffer() routine that simply
+returns FALSE (except perhaps during error recovery, as discussed below).
+This will cause the decompressor to return to its caller with an indication
+that suspension has occurred.  This can happen at four places:
+  * jpeg_read_header(): will return JPEG_SUSPENDED.
+  * jpeg_start_decompress(): will return FALSE, rather than its usual TRUE.
+  * jpeg*_read_scanlines(): will return the number of scanlines already
+        completed (possibly 0).
+  * jpeg_finish_decompress(): will return FALSE, rather than its usual TRUE.
+The surrounding application must recognize these cases, load more data into
+the input buffer, and repeat the call.  In the case of jpeg*_read_scanlines(),
+increment the passed pointers past any scanlines successfully read.
+
+Just as with compression, the decompressor will typically backtrack to a
+convenient restart point before suspending.  When fill_input_buffer() is
+called, next_input_byte/bytes_in_buffer point to the current restart point,
+which is where the decompressor will backtrack to if FALSE is returned.
+The data beyond that position must NOT be discarded if you suspend; it needs
+to be re-read upon resumption.  In most implementations, you'll need to shift
+this data down to the start of your work buffer and then load more data after
+it.  Again, this behavior means that a several-Kbyte work buffer is essential
+for decent performance; furthermore, you should load a reasonable amount of
+new data before resuming decompression.  (If you loaded, say, only one new
+byte each time around, you could waste a LOT of cycles.)
+
+The skip_input_data() source manager routine requires special care in a
+suspension scenario.  This routine is NOT granted the ability to suspend the
+decompressor; it can decrement bytes_in_buffer to zero, but no more.  If the
+requested skip distance exceeds the amount of data currently in the input
+buffer, then skip_input_data() must set bytes_in_buffer to zero and record the
+additional skip distance somewhere else.  The decompressor will immediately
+call fill_input_buffer(), which should return FALSE, which will cause a
+suspension return.  The surrounding application must then arrange to discard
+the recorded number of bytes before it resumes loading the input buffer.
+(Yes, this design is rather baroque, but it avoids complexity in the far more
+common case where a non-suspending source manager is used.)
+
+If the input data has been exhausted, we recommend that you emit a warning
+and insert dummy EOI markers just as a non-suspending data source manager
+would do.  This can be handled either in the surrounding application logic or
+within fill_input_buffer(); the latter is probably more efficient.  If
+fill_input_buffer() knows that no more data is available, it can set the
+pointer/count to point to a dummy EOI marker and then return TRUE just as
+though it had read more data in a non-suspending situation.
+
+The decompressor does not attempt to suspend within standard JPEG markers;
+instead it will backtrack to the start of the marker and reprocess the whole
+marker next time.  Hence the input buffer must be large enough to hold the
+longest standard marker in the file.  Standard JPEG markers should normally
+not exceed a few hundred bytes each (DHT tables are typically the longest).
+We recommend at least a 2K buffer for performance reasons, which is much
+larger than any correct marker is likely to be.  For robustness against
+damaged marker length counts, you may wish to insert a test in your
+application for the case that the input buffer is completely full and yet
+the decoder has suspended without consuming any data --- otherwise, if this
+situation did occur, it would lead to an endless loop.  (The library can't
+provide this test since it has no idea whether "the buffer is full", or
+even whether there is a fixed-size input buffer.)
+
+The input buffer would need to be 64K to allow for arbitrary COM or APPn
+markers, but these are handled specially: they are either saved into allocated
+memory, or skipped over by calling skip_input_data().  In the former case,
+suspension is handled correctly, and in the latter case, the problem of
+buffer overrun is placed on skip_input_data's shoulders, as explained above.
+Note that if you provide your own marker handling routine for large markers,
+you should consider how to deal with buffer overflow.
+
+Multiple-buffer management:
+
+In some applications it is desirable to store the compressed data in a linked
+list of buffer areas, so as to avoid data copying.  This can be handled by
+having empty_output_buffer() or fill_input_buffer() set the pointer and count
+to reference the next available buffer; FALSE is returned only if no more
+buffers are available.  Although seemingly straightforward, there is a
+pitfall in this approach: the backtrack that occurs when FALSE is returned
+could back up into an earlier buffer.  For example, when fill_input_buffer()
+is called, the current pointer & count indicate the backtrack restart point.
+Since fill_input_buffer() will set the pointer and count to refer to a new
+buffer, the restart position must be saved somewhere else.  Suppose a second
+call to fill_input_buffer() occurs in the same library call, and no
+additional input data is available, so fill_input_buffer must return FALSE.
+If the JPEG library has not moved the pointer/count forward in the current
+buffer, then *the correct restart point is the saved position in the prior
+buffer*.  Prior buffers may be discarded only after the library establishes
+a restart point within a later buffer.  Similar remarks apply for output into
+a chain of buffers.
+
+The library will never attempt to backtrack over a skip_input_data() call,
+so any skipped data can be permanently discarded.  You still have to deal
+with the case of skipping not-yet-received data, however.
+
+It's much simpler to use only a single buffer; when fill_input_buffer() is
+called, move any unconsumed data (beyond the current pointer/count) down to
+the beginning of this buffer and then load new data into the remaining buffer
+space.  This approach requires a little more data copying but is far easier
+to get right.
+
+
+Progressive JPEG support
+------------------------
+
+Progressive JPEG rearranges the stored data into a series of scans of
+increasing quality.  In situations where a JPEG file is transmitted across a
+slow communications link, a decoder can generate a low-quality image very
+quickly from the first scan, then gradually improve the displayed quality as
+more scans are received.  The final image after all scans are complete is
+identical to that of a regular (sequential) JPEG file of the same quality
+setting.  Progressive JPEG files are often slightly smaller than equivalent
+sequential JPEG files, but the possibility of incremental display is the main
+reason for using progressive JPEG.
+
+The IJG encoder library generates progressive JPEG files when given a
+suitable "scan script" defining how to divide the data into scans.
+Creation of progressive JPEG files is otherwise transparent to the encoder.
+Progressive JPEG files can also be read transparently by the decoder library.
+If the decoding application simply uses the library as defined above, it
+will receive a final decoded image without any indication that the file was
+progressive.  Of course, this approach does not allow incremental display.
+To perform incremental display, an application needs to use the decoder
+library's "buffered-image" mode, in which it receives a decoded image
+multiple times.
+
+Each displayed scan requires about as much work to decode as a full JPEG
+image of the same size, so the decoder must be fairly fast in relation to the
+data transmission rate in order to make incremental display useful.  However,
+it is possible to skip displaying the image and simply add the incoming bits
+to the decoder's coefficient buffer.  This is fast because only Huffman
+decoding need be done, not IDCT, upsampling, colorspace conversion, etc.
+The IJG decoder library allows the application to switch dynamically between
+displaying the image and simply absorbing the incoming bits.  A properly
+coded application can automatically adapt the number of display passes to
+suit the time available as the image is received.  Also, a final
+higher-quality display cycle can be performed from the buffered data after
+the end of the file is reached.
+
+Progressive compression:
+
+To create a progressive JPEG file (or a multiple-scan sequential JPEG file),
+set the scan_info cinfo field to point to an array of scan descriptors, and
+perform compression as usual.  Instead of constructing your own scan list,
+you can call the jpeg_simple_progression() helper routine to create a
+recommended progression sequence; this method should be used by all
+applications that don't want to get involved in the nitty-gritty of
+progressive scan sequence design.  (If you want to provide user control of
+scan sequences, you may wish to borrow the scan script reading code found
+in rdswitch.c, so that you can read scan script files just like cjpeg's.)
+When scan_info is not NULL, the compression library will store DCT'd data
+into a buffer array as jpeg*_write_scanlines() is called, and will emit all
+the requested scans during jpeg_finish_compress().  This implies that
+multiple-scan output cannot be created with a suspending data destination
+manager, since jpeg_finish_compress() does not support suspension.  We
+should also note that the compressor currently forces Huffman optimization
+mode when creating a progressive JPEG file, because the default Huffman
+tables are unsuitable for progressive files.
+
+Progressive decompression:
+
+When buffered-image mode is not used, the decoder library will read all of
+a multi-scan file during jpeg_start_decompress(), so that it can provide a
+final decoded image.  (Here "multi-scan" means either progressive or
+multi-scan sequential.)  This makes multi-scan files transparent to the
+decoding application.  However, existing applications that used suspending
+input with version 5 of the IJG library will need to be modified to check
+for a suspension return from jpeg_start_decompress().
+
+To perform incremental display, an application must use the library's
+buffered-image mode.  This is described in the next section.
+
+
+Buffered-image mode
+-------------------
+
+In buffered-image mode, the library stores the partially decoded image in a
+coefficient buffer, from which it can be read out as many times as desired.
+This mode is typically used for incremental display of progressive JPEG files,
+but it can be used with any JPEG file.  Each scan of a progressive JPEG file
+adds more data (more detail) to the buffered image.  The application can
+display in lockstep with the source file (one display pass per input scan),
+or it can allow input processing to outrun display processing.  By making
+input and display processing run independently, it is possible for the
+application to adapt progressive display to a wide range of data transmission
+rates.
+
+The basic control flow for buffered-image decoding is
+
+        jpeg_create_decompress()
+        set data source
+        jpeg_read_header()
+        set overall decompression parameters
+        cinfo.buffered_image = TRUE;    /* select buffered-image mode */
+        jpeg_start_decompress()
+        for (each output pass) {
+            adjust output decompression parameters if required
+            jpeg_start_output()         /* start a new output pass */
+            for (all scanlines in image) {
+                jpeg_read_scanlines()   /* Use jpeg12_read_scanlines() for
+                                           12-bit data precision and
+                                           jpeg16_read_scanlines() for 16-bit
+                                           data precision. */
+                display scanlines
+            }
+            jpeg_finish_output()        /* terminate output pass */
+        }
+        jpeg_finish_decompress()
+        jpeg_destroy_decompress()
+
+This differs from ordinary unbuffered decoding in that there is an additional
+level of looping.  The application can choose how many output passes to make
+and how to display each pass.
+
+The simplest approach to displaying progressive images is to do one display
+pass for each scan appearing in the input file.  In this case the outer loop
+condition is typically
+        while (!jpeg_input_complete(&cinfo))
+and the start-output call should read
+        jpeg_start_output(&cinfo, cinfo.input_scan_number);
+The second parameter to jpeg_start_output() indicates which scan of the input
+file is to be displayed; the scans are numbered starting at 1 for this
+purpose.  (You can use a loop counter starting at 1 if you like, but using
+the library's input scan counter is easier.)  The library automatically reads
+data as necessary to complete each requested scan, and jpeg_finish_output()
+advances to the next scan or end-of-image marker (hence input_scan_number
+will be incremented by the time control arrives back at jpeg_start_output()).
+With this technique, data is read from the input file only as needed, and
+input and output processing run in lockstep.
+
+After reading the final scan and reaching the end of the input file, the
+buffered image remains available; it can be read additional times by
+repeating the jpeg_start_output()/jpeg*_read_scanlines()/jpeg_finish_output()
+sequence.  For example, a useful technique is to use fast one-pass color
+quantization for display passes made while the image is arriving, followed by
+a final display pass using two-pass quantization for highest quality.  This
+is done by changing the library parameters before the final output pass.
+Changing parameters between passes is discussed in detail below.
+
+In general the last scan of a progressive file cannot be recognized as such
+until after it is read, so a post-input display pass is the best approach if
+you want special processing in the final pass.
+
+When done with the image, be sure to call jpeg_finish_decompress() to release
+the buffered image (or just use jpeg_destroy_decompress()).
+
+If input data arrives faster than it can be displayed, the application can
+cause the library to decode input data in advance of what's needed to produce
+output.  This is done by calling the routine jpeg_consume_input().
+The return value is one of the following:
+        JPEG_REACHED_SOS:    reached an SOS marker (the start of a new scan)
+        JPEG_REACHED_EOI:    reached the EOI marker (end of image)
+        JPEG_ROW_COMPLETED:  completed reading one MCU row of compressed data
+        JPEG_SCAN_COMPLETED: completed reading last MCU row of current scan
+        JPEG_SUSPENDED:      suspended before completing any of the above
+(JPEG_SUSPENDED can occur only if a suspending data source is used.)  This
+routine can be called at any time after initializing the JPEG object.  It
+reads some additional data and returns when one of the indicated significant
+events occurs.  (If called after the EOI marker is reached, it will
+immediately return JPEG_REACHED_EOI without attempting to read more data.)
+
+The library's output processing will automatically call jpeg_consume_input()
+whenever the output processing overtakes the input; thus, simple lockstep
+display requires no direct calls to jpeg_consume_input().  But by adding
+calls to jpeg_consume_input(), you can absorb data in advance of what is
+being displayed.  This has two benefits:
+  * You can limit buildup of unprocessed data in your input buffer.
+  * You can eliminate extra display passes by paying attention to the
+    state of the library's input processing.
+
+The first of these benefits only requires interspersing calls to
+jpeg_consume_input() with your display operations and any other processing
+you may be doing.  To avoid wasting cycles due to backtracking, it's best to
+call jpeg_consume_input() only after a hundred or so new bytes have arrived.
+This is discussed further under "I/O suspension", above.  (Note: the JPEG
+library currently is not thread-safe.  You must not call jpeg_consume_input()
+from one thread of control if a different library routine is working on the
+same JPEG object in another thread.)
+
+When input arrives fast enough that more than one new scan is available
+before you start a new output pass, you may as well skip the output pass
+corresponding to the completed scan.  This occurs for free if you pass
+cinfo.input_scan_number as the target scan number to jpeg_start_output().
+The input_scan_number field is simply the index of the scan currently being
+consumed by the input processor.  You can ensure that this is up-to-date by
+emptying the input buffer just before calling jpeg_start_output(): call
+jpeg_consume_input() repeatedly until it returns JPEG_SUSPENDED or
+JPEG_REACHED_EOI.
+
+The target scan number passed to jpeg_start_output() is saved in the
+cinfo.output_scan_number field.  The library's output processing calls
+jpeg_consume_input() whenever the current input scan number and row within
+that scan is less than or equal to the current output scan number and row.
+Thus, input processing can "get ahead" of the output processing but is not
+allowed to "fall behind".  You can achieve several different effects by
+manipulating this interlock rule.  For example, if you pass a target scan
+number greater than the current input scan number, the output processor will
+wait until that scan starts to arrive before producing any output.  (To avoid
+an infinite loop, the target scan number is automatically reset to the last
+scan number when the end of image is reached.  Thus, if you specify a large
+target scan number, the library will just absorb the entire input file and
+then perform an output pass.  This is effectively the same as what
+jpeg_start_decompress() does when you don't select buffered-image mode.)
+When you pass a target scan number equal to the current input scan number,
+the image is displayed no faster than the current input scan arrives.  The
+final possibility is to pass a target scan number less than the current input
+scan number; this disables the input/output interlock and causes the output
+processor to simply display whatever it finds in the image buffer, without
+waiting for input.  (However, the library will not accept a target scan
+number less than one, so you can't avoid waiting for the first scan.)
+
+When data is arriving faster than the output display processing can advance
+through the image, jpeg_consume_input() will store data into the buffered
+image beyond the point at which the output processing is reading data out
+again.  If the input arrives fast enough, it may "wrap around" the buffer to
+the point where the input is more than one whole scan ahead of the output.
+If the output processing simply proceeds through its display pass without
+paying attention to the input, the effect seen on-screen is that the lower
+part of the image is one or more scans better in quality than the upper part.
+Then, when the next output scan is started, you have a choice of what target
+scan number to use.  The recommended choice is to use the current input scan
+number at that time, which implies that you've skipped the output scans
+corresponding to the input scans that were completed while you processed the
+previous output scan.  In this way, the decoder automatically adapts its
+speed to the arriving data, by skipping output scans as necessary to keep up
+with the arriving data.
+
+When using this strategy, you'll want to be sure that you perform a final
+output pass after receiving all the data; otherwise your last display may not
+be full quality across the whole screen.  So the right outer loop logic is
+something like this:
+        do {
+            absorb any waiting input by calling jpeg_consume_input()
+            final_pass = jpeg_input_complete(&cinfo);
+            adjust output decompression parameters if required
+            jpeg_start_output(&cinfo, cinfo.input_scan_number);
+            ...
+            jpeg_finish_output()
+        } while (!final_pass);
+rather than quitting as soon as jpeg_input_complete() returns TRUE.  This
+arrangement makes it simple to use higher-quality decoding parameters
+for the final pass.  But if you don't want to use special parameters for
+the final pass, the right loop logic is like this:
+        for (;;) {
+            absorb any waiting input by calling jpeg_consume_input()
+            jpeg_start_output(&cinfo, cinfo.input_scan_number);
+            ...
+            jpeg_finish_output()
+            if (jpeg_input_complete(&cinfo) &&
+                cinfo.input_scan_number == cinfo.output_scan_number)
+              break;
+        }
+In this case you don't need to know in advance whether an output pass is to
+be the last one, so it's not necessary to have reached EOF before starting
+the final output pass; rather, what you want to test is whether the output
+pass was performed in sync with the final input scan.  This form of the loop
+will avoid an extra output pass whenever the decoder is able (or nearly able)
+to keep up with the incoming data.
+
+When the data transmission speed is high, you might begin a display pass,
+then find that much or all of the file has arrived before you can complete
+the pass.  (You can detect this by noting the JPEG_REACHED_EOI return code
+from jpeg_consume_input(), or equivalently by testing jpeg_input_complete().)
+In this situation you may wish to abort the current display pass and start a
+new one using the newly arrived information.  To do so, just call
+jpeg_finish_output() and then start a new pass with jpeg_start_output().
+
+A variant strategy is to abort and restart display if more than one complete
+scan arrives during an output pass; this can be detected by noting
+JPEG_REACHED_SOS returns and/or examining cinfo.input_scan_number.  This
+idea should be employed with caution, however, since the display process
+might never get to the bottom of the image before being aborted, resulting
+in the lower part of the screen being several passes worse than the upper.
+In most cases it's probably best to abort an output pass only if the whole
+file has arrived and you want to begin the final output pass immediately.
+
+When receiving data across a communication link, we recommend always using
+the current input scan number for the output target scan number; if a
+higher-quality final pass is to be done, it should be started (aborting any
+incomplete output pass) as soon as the end of file is received.  However,
+many other strategies are possible.  For example, the application can examine
+the parameters of the current input scan and decide whether to display it or
+not.  If the scan contains only chroma data, one might choose not to use it
+as the target scan, expecting that the scan will be small and will arrive
+quickly.  To skip to the next scan, call jpeg_consume_input() until it
+returns JPEG_REACHED_SOS or JPEG_REACHED_EOI.  Or just use the next higher
+number as the target scan for jpeg_start_output(); but that method doesn't
+let you inspect the next scan's parameters before deciding to display it.
+
+
+In buffered-image mode, jpeg_start_decompress() never performs input and
+thus never suspends.  An application that uses input suspension with
+buffered-image mode must be prepared for suspension returns from these
+routines:
+* jpeg_start_output() performs input only if you request 2-pass quantization
+  and the target scan isn't fully read yet.  (This is discussed below.)
+* jpeg*_read_scanlines(), as always, returns the number of scanlines that it
+  was able to produce before suspending.
+* jpeg_finish_output() will read any markers following the target scan,
+  up to the end of the file or the SOS marker that begins another scan.
+  (But it reads no input if jpeg_consume_input() has already reached the
+  end of the file or a SOS marker beyond the target output scan.)
+* jpeg_finish_decompress() will read until the end of file, and thus can
+  suspend if the end hasn't already been reached (as can be tested by
+  calling jpeg_input_complete()).
+jpeg_start_output(), jpeg_finish_output(), and jpeg_finish_decompress()
+all return TRUE if they completed their tasks, FALSE if they had to suspend.
+In the event of a FALSE return, the application must load more input data
+and repeat the call.  Applications that use non-suspending data sources need
+not check the return values of these three routines.
+
+
+It is possible to change decoding parameters between output passes in the
+buffered-image mode.  The decoder library currently supports only very
+limited changes of parameters.  ONLY THE FOLLOWING parameter changes are
+allowed after jpeg_start_decompress() is called:
+* dct_method can be changed before each call to jpeg_start_output().
+  For example, one could use a fast DCT method for early scans, changing
+  to a higher quality method for the final scan.
+* dither_mode can be changed before each call to jpeg_start_output();
+  of course this has no impact if not using color quantization.  Typically
+  one would use ordered dither for initial passes, then switch to
+  Floyd-Steinberg dither for the final pass.  Caution: changing dither mode
+  can cause more memory to be allocated by the library.  Although the amount
+  of memory involved is not large (a scanline or so), it may cause the
+  initial max_memory_to_use specification to be exceeded, which in the worst
+  case would result in an out-of-memory failure.
+* do_block_smoothing can be changed before each call to jpeg_start_output().
+  This setting is relevant only when decoding a progressive JPEG image.
+  During the first DC-only scan, block smoothing provides a very "fuzzy" look
+  instead of the very "blocky" look seen without it; which is better seems a
+  matter of personal taste.  But block smoothing is nearly always a win
+  during later stages, especially when decoding a successive-approximation
+  image: smoothing helps to hide the slight blockiness that otherwise shows
+  up on smooth gradients until the lowest coefficient bits are sent.
+* Color quantization mode can be changed under the rules described below.
+  You *cannot* change between full-color and quantized output (because that
+  would alter the required I/O buffer sizes), but you can change which
+  quantization method is used.
+
+When generating color-quantized output, changing quantization method is a
+very useful way of switching between high-speed and high-quality display.
+The library allows you to change among its three quantization methods:
+1. Single-pass quantization to a fixed color cube.
+   Selected by cinfo.two_pass_quantize = FALSE and cinfo.colormap = NULL.
+2. Single-pass quantization to an application-supplied colormap.
+   Selected by setting cinfo.colormap to point to the colormap (the value of
+   two_pass_quantize is ignored); also set cinfo.actual_number_of_colors.
+3. Two-pass quantization to a colormap chosen specifically for the image.
+   Selected by cinfo.two_pass_quantize = TRUE and cinfo.colormap = NULL.
+   (This is the default setting selected by jpeg_read_header, but it is
+   probably NOT what you want for the first pass of progressive display!)
+These methods offer successively better quality and lesser speed.  However,
+only the first method is available for quantizing in non-RGB color spaces.
+
+IMPORTANT: because the different quantizer methods have very different
+working-storage requirements, the library requires you to indicate which
+one(s) you intend to use before you call jpeg_start_decompress().  (If we did
+not require this, the max_memory_to_use setting would be a complete fiction.)
+You do this by setting one or more of these three cinfo fields to TRUE:
+        enable_1pass_quant              Fixed color cube colormap
+        enable_external_quant           Externally-supplied colormap
+        enable_2pass_quant              Two-pass custom colormap
+All three are initialized FALSE by jpeg_read_header().  But
+jpeg_start_decompress() automatically sets TRUE the one selected by the
+current two_pass_quantize and colormap settings, so you only need to set the
+enable flags for any other quantization methods you plan to change to later.
+
+After setting the enable flags correctly at jpeg_start_decompress() time, you
+can change to any enabled quantization method by setting two_pass_quantize
+and colormap properly just before calling jpeg_start_output().  The following
+special rules apply:
+1. You must explicitly set cinfo.colormap to NULL when switching to 1-pass
+   or 2-pass mode from a different mode, or when you want the 2-pass
+   quantizer to be re-run to generate a new colormap.
+2. To switch to an external colormap, or to change to a different external
+   colormap than was used on the prior pass, you must call
+   jpeg_new_colormap() after setting cinfo.colormap.
+NOTE: if you want to use the same colormap as was used in the prior pass,
+you should not do either of these things.  This will save some nontrivial
+switchover costs.
+(These requirements exist because cinfo.colormap will always be non-NULL
+after completing a prior output pass, since both the 1-pass and 2-pass
+quantizers set it to point to their output colormaps.  Thus you have to
+do one of these two things to notify the library that something has changed.
+Yup, it's a bit klugy, but it's necessary to do it this way for backwards
+compatibility.)
+
+Note that in buffered-image mode, the library generates any requested colormap
+during jpeg_start_output(), not during jpeg_start_decompress().
+
+When using two-pass quantization, jpeg_start_output() makes a pass over the
+buffered image to determine the optimum color map; it therefore may take a
+significant amount of time, whereas ordinarily it does little work.  The
+progress monitor hook is called during this pass, if defined.  It is also
+important to realize that if the specified target scan number is greater than
+or equal to the current input scan number, jpeg_start_output() will attempt
+to consume input as it makes this pass.  If you use a suspending data source,
+you need to check for a FALSE return from jpeg_start_output() under these
+conditions.  The combination of 2-pass quantization and a not-yet-fully-read
+target scan is the only case in which jpeg_start_output() will consume input.
+
+
+Application authors who support buffered-image mode may be tempted to use it
+for all JPEG images, even single-scan ones.  This will work, but it is
+inefficient: there is no need to create an image-sized coefficient buffer for
+single-scan images.  Requesting buffered-image mode for such an image wastes
+memory.  Worse, it can cost time on large images, since the buffered data has
+to be swapped out or written to a temporary file.  If you are concerned about
+maximum performance on baseline JPEG files, you should use buffered-image
+mode only when the incoming file actually has multiple scans.  This can be
+tested by calling jpeg_has_multiple_scans(), which will return a correct
+result at any time after jpeg_read_header() completes.
+
+It is also worth noting that when you use jpeg_consume_input() to let input
+processing get ahead of output processing, the resulting pattern of access to
+the coefficient buffer is quite nonsequential.  It's best to use the memory
+manager jmemnobs.c if you can (ie, if you have enough real or virtual main
+memory).  If not, at least make sure that max_memory_to_use is set as high as
+possible.  If the JPEG memory manager has to use a temporary file, you will
+probably see a lot of disk traffic and poor performance.  (This could be
+improved with additional work on the memory manager, but we haven't gotten
+around to it yet.)
+
+In some applications it may be convenient to use jpeg_consume_input() for all
+input processing, including reading the initial markers; that is, you may
+wish to call jpeg_consume_input() instead of jpeg_read_header() during
+startup.  This works, but note that you must check for JPEG_REACHED_SOS and
+JPEG_REACHED_EOI return codes as the equivalent of jpeg_read_header's codes.
+Once the first SOS marker has been reached, you must call
+jpeg_start_decompress() before jpeg_consume_input() will consume more input;
+it'll just keep returning JPEG_REACHED_SOS until you do.  If you read a
+tables-only file this way, jpeg_consume_input() will return JPEG_REACHED_EOI
+without ever returning JPEG_REACHED_SOS; be sure to check for this case.
+If this happens, the decompressor will not read any more input until you call
+jpeg_abort() to reset it.  It is OK to call jpeg_consume_input() even when not
+using buffered-image mode, but in that case it's basically a no-op after the
+initial markers have been read: it will just return JPEG_SUSPENDED.
+
+
+Abbreviated datastreams and multiple images
+-------------------------------------------
+
+A JPEG compression or decompression object can be reused to process multiple
+images.  This saves a small amount of time per image by eliminating the
+"create" and "destroy" operations, but that isn't the real purpose of the
+feature.  Rather, reuse of an object provides support for abbreviated JPEG
+datastreams.  Object reuse can also simplify processing a series of images in
+a single input or output file.  This section explains these features.
+
+A JPEG file normally contains several hundred bytes worth of quantization
+and Huffman tables.  In a situation where many images will be stored or
+transmitted with identical tables, this may represent an annoying overhead.
+The JPEG standard therefore permits tables to be omitted.  The standard
+defines three classes of JPEG datastreams:
+  * "Interchange" datastreams contain an image and all tables needed to decode
+     the image.  These are the usual kind of JPEG file.
+  * "Abbreviated image" datastreams contain an image, but are missing some or
+    all of the tables needed to decode that image.
+  * "Abbreviated table specification" (henceforth "tables-only") datastreams
+    contain only table specifications.
+To decode an abbreviated image, it is necessary to load the missing table(s)
+into the decoder beforehand.  This can be accomplished by reading a separate
+tables-only file.  A variant scheme uses a series of images in which the first
+image is an interchange (complete) datastream, while subsequent ones are
+abbreviated and rely on the tables loaded by the first image.  It is assumed
+that once the decoder has read a table, it will remember that table until a
+new definition for the same table number is encountered.
+
+It is the application designer's responsibility to figure out how to associate
+the correct tables with an abbreviated image.  While abbreviated datastreams
+can be useful in a closed environment, their use is strongly discouraged in
+any situation where data exchange with other applications might be needed.
+Caveat designer.
+
+The JPEG library provides support for reading and writing any combination of
+tables-only datastreams and abbreviated images.  In both compression and
+decompression objects, a quantization or Huffman table will be retained for
+the lifetime of the object, unless it is overwritten by a new table definition.
+
+
+To create abbreviated image datastreams, it is only necessary to tell the
+compressor not to emit some or all of the tables it is using.  Each
+quantization and Huffman table struct contains a boolean field "sent_table",
+which normally is initialized to FALSE.  For each table used by the image, the
+header-writing process emits the table and sets sent_table = TRUE unless it is
+already TRUE.  (In normal usage, this prevents outputting the same table
+definition multiple times, as would otherwise occur because the chroma
+components typically share tables.)  Thus, setting this field to TRUE before
+calling jpeg_start_compress() will prevent the table from being written at
+all.
+
+If you want to create a "pure" abbreviated image file containing no tables,
+just call "jpeg_suppress_tables(&cinfo, TRUE)" after constructing all the
+tables.  If you want to emit some but not all tables, you'll need to set the
+individual sent_table fields directly.
+
+To create an abbreviated image, you must also call jpeg_start_compress()
+with a second parameter of FALSE, not TRUE.  Otherwise jpeg_start_compress()
+will force all the sent_table fields to FALSE.  (This is a safety feature to
+prevent abbreviated images from being created accidentally.)
+
+To create a tables-only file, perform the same parameter setup that you
+normally would, but instead of calling jpeg_start_compress() and so on, call
+jpeg_write_tables(&cinfo).  This will write an abbreviated datastream
+containing only SOI, DQT and/or DHT markers, and EOI.  All the quantization
+and Huffman tables that are currently defined in the compression object will
+be emitted unless their sent_tables flag is already TRUE, and then all the
+sent_tables flags will be set TRUE.
+
+A sure-fire way to create matching tables-only and abbreviated image files
+is to proceed as follows:
+
+        create JPEG compression object
+        set JPEG parameters
+        set destination to tables-only file
+        jpeg_write_tables(&cinfo);
+        set destination to image file
+        jpeg_start_compress(&cinfo, FALSE);
+        write data...
+        jpeg_finish_compress(&cinfo);
+
+Since the JPEG parameters are not altered between writing the table file and
+the abbreviated image file, the same tables are sure to be used.  Of course,
+you can repeat the jpeg_start_compress() ... jpeg_finish_compress() sequence
+many times to produce many abbreviated image files matching the table file.
+
+You cannot suppress output of the computed Huffman tables when Huffman
+optimization is selected.  (If you could, there'd be no way to decode the
+image...)  Generally, you don't want to set optimize_coding = TRUE when
+you are trying to produce abbreviated files.
+
+In some cases you might want to compress an image using tables which are
+not stored in the application, but are defined in an interchange or
+tables-only file readable by the application.  This can be done by setting up
+a JPEG decompression object to read the specification file, then copying the
+tables into your compression object.  See jpeg_copy_critical_parameters()
+for an example of copying quantization tables.
+
+
+To read abbreviated image files, you simply need to load the proper tables
+into the decompression object before trying to read the abbreviated image.
+If the proper tables are stored in the application program, you can just
+allocate the table structs and fill in their contents directly.  For example,
+to load a fixed quantization table into table slot "n":
+
+    if (cinfo.quant_tbl_ptrs[n] == NULL)
+      cinfo.quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) &cinfo);
+    quant_ptr = cinfo.quant_tbl_ptrs[n];        /* quant_ptr is JQUANT_TBL* */
+    for (i = 0; i < 64; i++) {
+      /* Qtable[] is desired quantization table, in natural array order */
+      quant_ptr->quantval[i] = Qtable[i];
+    }
+
+Code to load a fixed Huffman table is typically (for AC table "n"):
+
+    if (cinfo.ac_huff_tbl_ptrs[n] == NULL)
+      cinfo.ac_huff_tbl_ptrs[n] = jpeg_alloc_huff_table((j_common_ptr) &cinfo);
+    huff_ptr = cinfo.ac_huff_tbl_ptrs[n];       /* huff_ptr is JHUFF_TBL* */
+    for (i = 1; i <= 16; i++) {
+      /* counts[i] is number of Huffman codes of length i bits, i=1..16 */
+      huff_ptr->bits[i] = counts[i];
+    }
+    for (i = 0; i < 256; i++) {
+      /* symbols[] is the list of Huffman symbols, in code-length order */
+      huff_ptr->huffval[i] = symbols[i];
+    }
+
+(Note that trying to set cinfo.quant_tbl_ptrs[n] to point directly at a
+constant JQUANT_TBL object is not safe.  If the incoming file happened to
+contain a quantization table definition, your master table would get
+overwritten!  Instead allocate a working table copy and copy the master table
+into it, as illustrated above.  Ditto for Huffman tables, of course.)
+
+You might want to read the tables from a tables-only file, rather than
+hard-wiring them into your application.  The jpeg_read_header() call is
+sufficient to read a tables-only file.  You must pass a second parameter of
+FALSE to indicate that you do not require an image to be present.  Thus, the
+typical scenario is
+
+        create JPEG decompression object
+        set source to tables-only file
+        jpeg_read_header(&cinfo, FALSE);
+        set source to abbreviated image file
+        jpeg_read_header(&cinfo, TRUE);
+        set decompression parameters
+        jpeg_start_decompress(&cinfo);
+        read data...
+        jpeg_finish_decompress(&cinfo);
+
+In some cases, you may want to read a file without knowing whether it contains
+an image or just tables.  In that case, pass FALSE and check the return value
+from jpeg_read_header(): it will be JPEG_HEADER_OK if an image was found,
+JPEG_HEADER_TABLES_ONLY if only tables were found.  (A third return value,
+JPEG_SUSPENDED, is possible when using a suspending data source manager.)
+Note that jpeg_read_header() will not complain if you read an abbreviated
+image for which you haven't loaded the missing tables; the missing-table check
+occurs later, in jpeg_start_decompress().
+
+
+It is possible to read a series of images from a single source file by
+repeating the jpeg_read_header() ... jpeg_finish_decompress() sequence,
+without releasing/recreating the JPEG object or the data source module.
+(If you did reinitialize, any partial bufferload left in the data source
+buffer at the end of one image would be discarded, causing you to lose the
+start of the next image.)  When you use this method, stored tables are
+automatically carried forward, so some of the images can be abbreviated images
+that depend on tables from earlier images.
+
+If you intend to write a series of images into a single destination file,
+you might want to make a specialized data destination module that doesn't
+flush the output buffer at term_destination() time.  This would speed things
+up by some trifling amount.  Of course, you'd need to remember to flush the
+buffer after the last image.  You can make the later images be abbreviated
+ones by passing FALSE to jpeg_start_compress().
+
+
+Special markers
+---------------
+
+Some applications may need to insert or extract special data in the JPEG
+datastream.  The JPEG standard provides marker types "COM" (comment) and
+"APP0" through "APP15" (application) to hold application-specific data.
+Unfortunately, the use of these markers is not specified by the standard.
+COM markers are fairly widely used to hold user-supplied text.  The JFIF file
+format spec uses APP0 markers with specified initial strings to hold certain
+data.  Adobe applications use APP14 markers beginning with the string "Adobe"
+for miscellaneous data.  Other APPn markers are rarely seen, but might
+contain almost anything.
+
+If you wish to store user-supplied text, we recommend you use COM markers
+and place readable 7-bit ASCII text in them.  Newline conventions are not
+standardized --- expect to find LF (Unix style), CR/LF (DOS style), or CR
+(Mac style).  A robust COM reader should be able to cope with random binary
+garbage, including nulls, since some applications generate COM markers
+containing non-ASCII junk.  (But yours should not be one of them.)
+
+For program-supplied data, use an APPn marker, and be sure to begin it with an
+identifying string so that you can tell whether the marker is actually yours.
+It's probably best to avoid using APP0 or APP14 for any private markers.
+(NOTE: the upcoming SPIFF standard will use APP8 markers; we recommend you
+not use APP8 markers for any private purposes, either.)
+
+Keep in mind that at most 65533 bytes can be put into one marker, but you
+can have as many markers as you like.
+
+By default, the IJG compression library will write a JFIF APP0 marker if the
+selected JPEG colorspace is grayscale or YCbCr, or an Adobe APP14 marker if
+the selected colorspace is RGB, CMYK, or YCCK.  You can disable this, but
+we don't recommend it.  The decompression library will recognize JFIF and
+Adobe markers and will set the JPEG colorspace properly when one is found.
+
+
+You can write special markers immediately following the datastream header by
+calling jpeg_write_marker() after jpeg_start_compress() and before the first
+call to jpeg*_write_scanlines().  When you do this, the markers appear after
+the SOI and the JFIF APP0 and Adobe APP14 markers (if written), but before
+all else.  Specify the marker type parameter as "JPEG_COM" for COM or
+"JPEG_APP0 + n" for APPn.  (Actually, jpeg_write_marker will let you write
+any marker type, but we don't recommend writing any other kinds of marker.)
+For example, to write a user comment string pointed to by comment_text:
+        jpeg_write_marker(cinfo, JPEG_COM, comment_text, strlen(comment_text));
+
+If it's not convenient to store all the marker data in memory at once,
+you can instead call jpeg_write_m_header() followed by multiple calls to
+jpeg_write_m_byte().  If you do it this way, it's your responsibility to
+call jpeg_write_m_byte() exactly the number of times given in the length
+parameter to jpeg_write_m_header().  (This method lets you empty the
+output buffer partway through a marker, which might be important when
+using a suspending data destination module.  In any case, if you are using
+a suspending destination, you should flush its buffer after inserting
+any special markers.  See "I/O suspension".)
+
+Or, if you prefer to synthesize the marker byte sequence yourself,
+you can just cram it straight into the data destination module.
+
+If you are writing JFIF 1.02 extension markers (thumbnail images), don't
+forget to set cinfo.JFIF_minor_version = 2 so that the encoder will write the
+correct JFIF version number in the JFIF header marker.  The library's default
+is to write version 1.01, but that's wrong if you insert any 1.02 extension
+markers.  (We could probably get away with just defaulting to 1.02, but there
+used to be broken decoders that would complain about unknown minor version
+numbers.  To reduce compatibility risks it's safest not to write 1.02 unless
+you are actually using 1.02 extensions.)
+
+
+When reading, two methods of handling special markers are available:
+1. You can ask the library to save the contents of COM and/or APPn markers
+into memory, and then examine them at your leisure afterwards.
+2. You can supply your own routine to process COM and/or APPn markers
+on-the-fly as they are read.
+The first method is simpler to use, especially if you are using a suspending
+data source; writing a marker processor that copes with input suspension is
+not easy (consider what happens if the marker is longer than your available
+input buffer).  However, the second method conserves memory since the marker
+data need not be kept around after it's been processed.
+
+For either method, you'd normally set up marker handling after creating a
+decompression object and before calling jpeg_read_header(), because the
+markers of interest will typically be near the head of the file and so will
+be scanned by jpeg_read_header.  Once you've established a marker handling
+method, it will be used for the life of that decompression object
+(potentially many datastreams), unless you change it.  Marker handling is
+determined separately for COM markers and for each APPn marker code.
+
+
+To save the contents of special markers in memory, call
+        jpeg_save_markers(cinfo, marker_code, length_limit)
+where marker_code is the marker type to save, JPEG_COM or JPEG_APP0+n.
+(To arrange to save all the special marker types, you need to call this
+routine 17 times, for COM and APP0-APP15.)  If the incoming marker is longer
+than length_limit data bytes, only length_limit bytes will be saved; this
+parameter allows you to avoid chewing up memory when you only need to see the
+first few bytes of a potentially large marker.  If you want to save all the
+data, set length_limit to 0xFFFF; that is enough since marker lengths are only
+16 bits.  As a special case, setting length_limit to 0 prevents that marker
+type from being saved at all.  (That is the default behavior, in fact.)
+
+After jpeg_read_header() completes, you can examine the special markers by
+following the cinfo->marker_list pointer chain.  All the special markers in
+the file appear in this list, in order of their occurrence in the file (but
+omitting any markers of types you didn't ask for).  Both the original data
+length and the saved data length are recorded for each list entry; the latter
+will not exceed length_limit for the particular marker type.  Note that these
+lengths exclude the marker length word, whereas the stored representation
+within the JPEG file includes it.  (Hence the maximum data length is really
+only 65533.)
+
+It is possible that additional special markers appear in the file beyond the
+SOS marker at which jpeg_read_header stops; if so, the marker list will be
+extended during reading of the rest of the file.  This is not expected to be
+common, however.  If you are short on memory you may want to reset the length
+limit to zero for all marker types after finishing jpeg_read_header, to
+ensure that the max_memory_to_use setting cannot be exceeded due to addition
+of later markers.
+
+The marker list remains stored until you call jpeg_finish_decompress or
+jpeg_abort, at which point the memory is freed and the list is set to empty.
+(jpeg_destroy also releases the storage, of course.)
+
+Note that the library is internally interested in APP0 and APP14 markers;
+if you try to set a small nonzero length limit on these types, the library
+will silently force the length up to the minimum it wants.  (But you can set
+a zero length limit to prevent them from being saved at all.)  Also, in a
+16-bit environment, the maximum length limit may be constrained to less than
+65533 by malloc() limitations.  It is therefore best not to assume that the
+effective length limit is exactly what you set it to be.
+
+
+If you want to supply your own marker-reading routine, you do it by calling
+jpeg_set_marker_processor().  A marker processor routine must have the
+signature
+        boolean jpeg_marker_parser_method (j_decompress_ptr cinfo)
+Although the marker code is not explicitly passed, the routine can find it
+in cinfo->unread_marker.  At the time of call, the marker proper has been
+read from the data source module.  The processor routine is responsible for
+reading the marker length word and the remaining parameter bytes, if any.
+Return TRUE to indicate success.  (FALSE should be returned only if you are
+using a suspending data source and it tells you to suspend.  See the standard
+marker processors in jdmarker.c for appropriate coding methods if you need to
+use a suspending data source.)
+
+If you override the default APP0 or APP14 processors, it is up to you to
+recognize JFIF and Adobe markers if you want colorspace recognition to occur
+properly.  We recommend copying and extending the default processors if you
+want to do that.  (A better idea is to save these marker types for later
+examination by calling jpeg_save_markers(); that method doesn't interfere
+with the library's own processing of these markers.)
+
+jpeg_set_marker_processor() and jpeg_save_markers() are mutually exclusive
+--- if you call one it overrides any previous call to the other, for the
+particular marker type specified.
+
+A simple example of an external COM processor can be found in djpeg.c.
+Also, see jpegtran.c for an example of using jpeg_save_markers.
+
+
+ICC profiles
+------------
+
+Two functions are provided for writing and reading International Color
+Consortium (ICC) device profiles embedded in JFIF JPEG image files:
+
+        void jpeg_write_icc_profile (j_compress_ptr cinfo,
+                                     const JOCTET *icc_data_ptr,
+                                     unsigned int icc_data_len);
+        boolean jpeg_read_icc_profile (j_decompress_ptr cinfo,
+                                       JOCTET **icc_data_ptr,
+                                       unsigned int *icc_data_len);
+
+The ICC has defined a standard for including such data in JPEG "APP2" markers.
+The aforementioned functions do not know anything about the internal structure
+of the ICC profile data; they just know how to embed the profile data into a
+JPEG file while writing it, or to extract the profile data from a JPEG file
+while reading it.
+
+jpeg_write_icc_profile() must be called after calling jpeg_start_compress() and
+before the first call to jpeg*_write_scanlines() or jpeg*_write_raw_data().
+This ordering ensures that the APP2 marker(s) will appear after the SOI and
+JFIF or Adobe markers, but before all other data.
+
+jpeg_read_icc_profile() returns TRUE if an ICC profile was found and FALSE
+otherwise.  If an ICC profile was found, then the function will allocate a
+memory region containing the profile and will return a pointer to that memory
+region in *icc_data_ptr, as well as the length of the region in *icc_data_len.
+This memory region is allocated by the library using malloc() and must be freed
+by the caller using free() when the memory region is no longer needed.  Callers
+wishing to use jpeg_read_icc_profile() must call
+
+        jpeg_save_markers(cinfo, JPEG_APP0 + 2, 0xFFFF);
+
+prior to calling jpeg_read_header().  jpeg_read_icc_profile() can be called at
+any point between jpeg_read_header() and jpeg_finish_decompress().
+
+
+Raw (downsampled) image data
+----------------------------
+
+Some applications need to supply already-downsampled image data to the JPEG
+compressor, or to receive raw downsampled data from the decompressor.  The
+library supports this requirement by allowing the application to write or
+read raw data, bypassing the normal preprocessing or postprocessing steps.
+The interface is different from the standard one and is somewhat harder to
+use.  If your interest is merely in bypassing color conversion, we recommend
+that you use the standard interface and simply set jpeg_color_space =
+in_color_space (or jpeg_color_space = out_color_space for decompression).
+The mechanism described in this section is necessary only to supply or
+receive downsampled image data, in which not all components have the same
+dimensions.
+
+
+To compress raw data, you must supply the data in the colorspace to be used
+in the JPEG file (please read the earlier section on Special color spaces)
+and downsampled to the sampling factors specified in the JPEG parameters.
+You must supply the data in the format used internally by the JPEG library,
+namely a J*SAMPIMAGE array.  This is an array of pointers to two-dimensional
+arrays, each of type J*SAMPARRAY.  Each 2-D array holds the values for one
+color component.  This structure is necessary since the components are of
+different sizes.  If the image dimensions are not a multiple of the MCU size,
+you must also pad the data correctly (usually, this is done by replicating
+the last column and/or row).  The data must be padded to a multiple of a DCT
+block in each component: that is, each downsampled row must contain a
+multiple of 8 valid samples, and there must be a multiple of 8 sample rows
+for each component.  (For applications such as conversion of digital TV
+images, the standard image size is usually a multiple of the DCT block size,
+so that no padding need actually be done.)
+
+The procedure for compression of raw data is basically the same as normal
+compression, except that you call jpeg_write_raw_data() or
+jpeg12_write_raw_data() in place of jpeg_write_scanlines() or
+jpeg12_write_scanlines().  Before calling jpeg_start_compress(), you must do
+the following:
+  * Set cinfo->raw_data_in to TRUE.  (It is set FALSE by jpeg_set_defaults().)
+    This notifies the library that you will be supplying raw data.
+  * Ensure jpeg_color_space is correct --- an explicit jpeg_set_colorspace()
+    call is a good idea.  Note that since color conversion is bypassed,
+    in_color_space is ignored, except that jpeg_set_defaults() uses it to
+    choose the default jpeg_color_space setting.
+  * Ensure the sampling factors, cinfo->comp_info[i].h_samp_factor and
+    cinfo->comp_info[i].v_samp_factor, are correct.  Since these indicate the
+    dimensions of the data you are supplying, it's wise to set them
+    explicitly, rather than assuming the library's defaults are what you want.
+
+To pass raw data to the library, call jpeg*_write_raw_data() in place of
+jpeg*_write_scanlines().  The routines work similarly except that
+jpeg*_write_raw_data takes a J*SAMPIMAGE data array rather than J*SAMPARRAY.
+The scanlines count passed to and returned from jpeg*_write_raw_data is
+measured in terms of the component with the largest v_samp_factor.
+
+jpeg*_write_raw_data() processes one MCU row per call, which is to say
+v_samp_factor*DCTSIZE sample rows of each component.  The passed num_lines
+value must be at least max_v_samp_factor*DCTSIZE, and the return value will
+be exactly that amount (or possibly some multiple of that amount, in future
+library versions).  This is true even on the last call at the bottom of the
+image; don't forget to pad your data as necessary.
+
+The required dimensions of the supplied data can be computed for each
+component as
+        cinfo->comp_info[i].width_in_blocks*DCTSIZE  samples per row
+        cinfo->comp_info[i].height_in_blocks*DCTSIZE rows in image
+after jpeg_start_compress() has initialized those fields.  If the valid data
+is smaller than this, it must be padded appropriately.  For some sampling
+factors and image sizes, additional dummy DCT blocks are inserted to make
+the image a multiple of the MCU dimensions.  The library creates such dummy
+blocks itself; it does not read them from your supplied data.  Therefore you
+need never pad by more than DCTSIZE samples.  An example may help here.
+Assume 2h2v downsampling of YCbCr data, that is
+        cinfo->comp_info[0].h_samp_factor = 2           for Y
+        cinfo->comp_info[0].v_samp_factor = 2
+        cinfo->comp_info[1].h_samp_factor = 1           for Cb
+        cinfo->comp_info[1].v_samp_factor = 1
+        cinfo->comp_info[2].h_samp_factor = 1           for Cr
+        cinfo->comp_info[2].v_samp_factor = 1
+and suppose that the nominal image dimensions (cinfo->image_width and
+cinfo->image_height) are 101x101 pixels.  Then jpeg_start_compress() will
+compute downsampled_width = 101 and width_in_blocks = 13 for Y,
+downsampled_width = 51 and width_in_blocks = 7 for Cb and Cr (and the same
+for the height fields).  You must pad the Y data to at least 13*8 = 104
+columns and rows, the Cb/Cr data to at least 7*8 = 56 columns and rows.  The
+MCU height is max_v_samp_factor = 2 DCT rows so you must pass at least 16
+scanlines on each call to jpeg*_write_raw_data(), which is to say 16 actual
+sample rows of Y and 8 each of Cb and Cr.  A total of 7 MCU rows are needed,
+so you must pass a total of 7*16 = 112 "scanlines".  The last DCT block row
+of Y data is dummy, so it doesn't matter what you pass for it in the data
+arrays, but the scanlines count must total up to 112 so that all of the Cb
+and Cr data gets passed.
+
+Output suspension is supported with raw-data compression: if the data
+destination module suspends, jpeg*_write_raw_data() will return 0.
+In this case the same data rows must be passed again on the next call.
+
+
+Decompression with raw data output implies bypassing all postprocessing:
+you cannot ask for rescaling or color quantization, for instance.  More
+seriously, you must deal with the color space and sampling factors present in
+the incoming file.  If your application only handles, say, 2h1v YCbCr data,
+you must check for and fail on other color spaces or other sampling factors.
+The library will not convert to a different color space for you.
+
+To obtain raw data output, set cinfo->raw_data_out = TRUE before
+jpeg_start_decompress() (it is set FALSE by jpeg_read_header()).  Be sure to
+verify that the color space and sampling factors are ones you can handle.
+Then call jpeg_read_raw_data() or jpeg12_read_raw_data() in place of
+jpeg_read_scanlines() or jpeg12_read_scanlines().  The decompression process is
+otherwise the same as usual.
+
+jpeg*_read_raw_data() returns one MCU row per call, and thus you must pass a
+buffer of at least max_v_samp_factor*DCTSIZE scanlines (scanline counting is
+the same as for raw-data compression).  The buffer you pass must be large
+enough to hold the actual data plus padding to DCT-block boundaries.  As with
+compression, any entirely dummy DCT blocks are not processed so you need not
+allocate space for them, but the total scanline count includes them.  The
+above example of computing buffer dimensions for raw-data compression is
+equally valid for decompression.
+
+Input suspension is supported with raw-data decompression: if the data source
+module suspends, jpeg*_read_raw_data() will return 0.  You can also use
+buffered-image mode to read raw data in multiple passes.
+
+
+Really raw data: DCT coefficients
+---------------------------------
+
+It is possible to read or write the contents of a JPEG file as raw DCT
+coefficients.  This facility is mainly intended for use in lossless
+transcoding between different JPEG file formats.  Other possible applications
+include lossless cropping of a JPEG image, lossless reassembly of a
+multi-strip or multi-tile TIFF/JPEG file into a single JPEG datastream, etc.
+
+To read the contents of a JPEG file as DCT coefficients, open the file and do
+jpeg_read_header() as usual.  But instead of calling jpeg_start_decompress()
+and jpeg*_read_scanlines(), call jpeg_read_coefficients().  This will read the
+entire image into a set of virtual coefficient-block arrays, one array per
+component.  The return value is a pointer to an array of virtual-array
+descriptors.  Each virtual array can be accessed directly using the JPEG
+memory manager's access_virt_barray method (see Memory management, below,
+and also read structure.txt's discussion of virtual array handling).  Or,
+for simple transcoding to a different JPEG file format, the array list can
+just be handed directly to jpeg_write_coefficients().
+
+Each block in the block arrays contains quantized coefficient values in
+normal array order (not JPEG zigzag order).  The block arrays contain only
+DCT blocks containing real data; any entirely-dummy blocks added to fill out
+interleaved MCUs at the right or bottom edges of the image are discarded
+during reading and are not stored in the block arrays.  (The size of each
+block array can be determined from the width_in_blocks and height_in_blocks
+fields of the component's comp_info entry.)  This is also the data format
+expected by jpeg_write_coefficients().
+
+When you are done using the virtual arrays, call jpeg_finish_decompress()
+to release the array storage and return the decompression object to an idle
+state; or just call jpeg_destroy() if you don't need to reuse the object.
+
+If you use a suspending data source, jpeg_read_coefficients() will return
+NULL if it is forced to suspend; a non-NULL return value indicates successful
+completion.  You need not test for a NULL return value when using a
+non-suspending data source.
+
+It is also possible to call jpeg_read_coefficients() to obtain access to the
+decoder's coefficient arrays during a normal decode cycle in buffered-image
+mode.  This frammish might be useful for progressively displaying an incoming
+image and then re-encoding it without loss.  To do this, decode in buffered-
+image mode as discussed previously, then call jpeg_read_coefficients() after
+the last jpeg_finish_output() call.  The arrays will be available for your use
+until you call jpeg_finish_decompress().
+
+
+To write the contents of a JPEG file as DCT coefficients, you must provide
+the DCT coefficients stored in virtual block arrays.  You can either pass
+block arrays read from an input JPEG file by jpeg_read_coefficients(), or
+allocate virtual arrays from the JPEG compression object and fill them
+yourself.  In either case, jpeg_write_coefficients() is substituted for
+jpeg_start_compress() and jpeg*_write_scanlines().  Thus the sequence is
+  * Create compression object
+  * Set all compression parameters as necessary
+  * Request virtual arrays if needed
+  * jpeg_write_coefficients()
+  * jpeg_finish_compress()
+  * Destroy or re-use compression object
+jpeg_write_coefficients() is passed a pointer to an array of virtual block
+array descriptors; the number of arrays is equal to cinfo.num_components.
+
+The virtual arrays need only have been requested, not realized, before
+jpeg_write_coefficients() is called.  A side-effect of
+jpeg_write_coefficients() is to realize any virtual arrays that have been
+requested from the compression object's memory manager.  Thus, when obtaining
+the virtual arrays from the compression object, you should fill the arrays
+after calling jpeg_write_coefficients().  The data is actually written out
+when you call jpeg_finish_compress(); jpeg_write_coefficients() only writes
+the file header.
+
+When writing raw DCT coefficients, it is crucial that the JPEG quantization
+tables and sampling factors match the way the data was encoded, or the
+resulting file will be invalid.  For transcoding from an existing JPEG file,
+we recommend using jpeg_copy_critical_parameters().  This routine initializes
+all the compression parameters to default values (like jpeg_set_defaults()),
+then copies the critical information from a source decompression object.
+The decompression object should have just been used to read the entire
+JPEG input file --- that is, it should be awaiting jpeg_finish_decompress().
+
+jpeg_write_coefficients() marks all tables stored in the compression object
+as needing to be written to the output file (thus, it acts like
+jpeg_start_compress(cinfo, TRUE)).  This is for safety's sake, to avoid
+emitting abbreviated JPEG files by accident.  If you really want to emit an
+abbreviated JPEG file, call jpeg_suppress_tables(), or set the tables'
+individual sent_table flags, between calling jpeg_write_coefficients() and
+jpeg_finish_compress().
+
+
+Progress monitoring
+-------------------
+
+Some applications may need to regain control from the JPEG library every so
+often.  The typical use of this feature is to produce a percent-done bar or
+other progress display.  (For a simple example, see cjpeg.c or djpeg.c.)
+Although you do get control back frequently during the data-transferring pass
+(the jpeg*_read_scanlines or jpeg*_write_scanlines loop), any additional passes
+will occur inside jpeg_finish_compress or jpeg_start_decompress; those
+routines may take a long time to execute, and you don't get control back
+until they are done.
+
+You can define a progress-monitor routine which will be called periodically
+by the library.  No guarantees are made about how often this call will occur,
+so we don't recommend you use it for mouse tracking or anything like that.
+At present, a call will occur once per MCU row, scanline, or sample row
+group, whichever unit is convenient for the current processing mode; so the
+wider the image, the longer the time between calls.  During the data
+transferring pass, only one call occurs per call of jpeg*_read_scanlines or
+jpeg*_write_scanlines, so don't pass a large number of scanlines at once if
+you want fine resolution in the progress count.  (If you really need to use
+the callback mechanism for time-critical tasks like mouse tracking, you could
+insert additional calls inside some of the library's inner loops.)
+
+To establish a progress-monitor callback, create a struct jpeg_progress_mgr,
+fill in its progress_monitor field with a pointer to your callback routine,
+and set cinfo->progress to point to the struct.  The callback will be called
+whenever cinfo->progress is non-NULL.  (This pointer is set to NULL by
+jpeg_create_compress or jpeg_create_decompress; the library will not change
+it thereafter.  So if you allocate dynamic storage for the progress struct,
+make sure it will live as long as the JPEG object does.  Allocating from the
+JPEG memory manager with lifetime JPOOL_PERMANENT will work nicely.)  You
+can use the same callback routine for both compression and decompression.
+
+The jpeg_progress_mgr struct contains four fields which are set by the library:
+        long pass_counter;      /* work units completed in this pass */
+        long pass_limit;        /* total number of work units in this pass */
+        int completed_passes;   /* passes completed so far */
+        int total_passes;       /* total number of passes expected */
+During any one pass, pass_counter increases from 0 up to (not including)
+pass_limit; the step size is usually but not necessarily 1.  The pass_limit
+value may change from one pass to another.  The expected total number of
+passes is in total_passes, and the number of passes already completed is in
+completed_passes.  Thus the fraction of work completed may be estimated as
+                completed_passes + (pass_counter/pass_limit)
+                --------------------------------------------
+                                total_passes
+ignoring the fact that the passes may not be equal amounts of work.
+
+When decompressing, pass_limit can even change within a pass, because it
+depends on the number of scans in the JPEG file, which isn't always known in
+advance.  The computed fraction-of-work-done may jump suddenly (if the library
+discovers it has overestimated the number of scans) or even decrease (in the
+opposite case).  It is not wise to put great faith in the work estimate.
+
+When using the decompressor's buffered-image mode, the progress monitor work
+estimate is likely to be completely unhelpful, because the library has no way
+to know how many output passes will be demanded of it.  Currently, the library
+sets total_passes based on the assumption that there will be one more output
+pass if the input file end hasn't yet been read (jpeg_input_complete() isn't
+TRUE), but no more output passes if the file end has been reached when the
+output pass is started.  This means that total_passes will rise as additional
+output passes are requested.  If you have a way of determining the input file
+size, estimating progress based on the fraction of the file that's been read
+will probably be more useful than using the library's value.
+
+
+Memory management
+-----------------
+
+This section covers some key facts about the JPEG library's built-in memory
+manager.  For more info, please read structure.txt's section about the memory
+manager, and consult the source code if necessary.
+
+All memory and temporary file allocation within the library is done via the
+memory manager.  If necessary, you can replace the "back end" of the memory
+manager to control allocation yourself (for example, if you don't want the
+library to use malloc() and free() for some reason).
+
+Some data is allocated "permanently" and will not be freed until the JPEG
+object is destroyed.  Most data is allocated "per image" and is freed by
+jpeg_finish_compress, jpeg_finish_decompress, or jpeg_abort.  You can call the
+memory manager yourself to allocate structures that will automatically be
+freed at these times.  Typical code for this is
+  ptr = (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, size);
+Use JPOOL_PERMANENT to get storage that lasts as long as the JPEG object.
+Use alloc_large instead of alloc_small for anything bigger than a few Kbytes.
+There are also alloc_sarray and alloc_barray routines that automatically
+build 2-D sample or block arrays.
+
+The library's minimum space requirements to process an image depend on the
+image's width, but not on its height, because the library ordinarily works
+with "strip" buffers that are as wide as the image but just a few rows high.
+Some operating modes (eg, two-pass color quantization) require full-image
+buffers.  Such buffers are treated as "virtual arrays": only the current strip
+need be in memory, and the rest can be swapped out to a temporary file.
+
+When using temporary files, the library will make the in-memory buffers for
+its virtual arrays just big enough to stay within a "maximum memory" setting.
+Your application can set this limit by setting cinfo->mem->max_memory_to_use
+after creating the JPEG object.  (Of course, there is still a minimum size for
+the buffers, so the max-memory setting is effective only if it is bigger than
+the minimum space needed.)  If you allocate any large structures yourself, you
+must allocate them before jpeg_start_compress() or jpeg_start_decompress() in
+order to have them counted against the max memory limit.  Also keep in mind
+that space allocated with alloc_small() is ignored, on the assumption that
+it's too small to be worth worrying about; so a reasonable safety margin
+should be left when setting max_memory_to_use.
+
+NOTE: Unless you develop your own memory manager back end, then temporary files
+will never be used.  The back end provided in libjpeg-turbo (jmemnobs.c) simply
+malloc()s and free()s virtual arrays, and an error occurs if the required
+memory exceeds the limit specified in cinfo->mem->max_memory_to_use.
+
+
+Memory usage
+------------
+
+Working memory requirements while performing compression or decompression
+depend on image dimensions, image characteristics (such as colorspace and
+JPEG process), and operating mode (application-selected options).
+
+As of v6b, the decompressor requires:
+ 1. About 24K in more-or-less-fixed-size data.  This varies a bit depending
+    on operating mode and image characteristics (particularly color vs.
+    grayscale), but it doesn't depend on image dimensions.
+ 2. Strip buffers (of size proportional to the image width) for IDCT and
+    upsampling results.  The worst case for commonly used sampling factors
+    is about 34 bytes * width in pixels for a color image.  A grayscale image
+    only needs about 8 bytes per pixel column.
+ 3. A full-image DCT coefficient buffer is needed to decode a multi-scan JPEG
+    file (including progressive JPEGs), or whenever you select buffered-image
+    mode.  This takes 2 bytes/coefficient.  At typical 2x2 sampling, that's
+    3 bytes per pixel for a color image.  Worst case (1x1 sampling) requires
+    6 bytes/pixel.  For grayscale, figure 2 bytes/pixel.
+ 4. To perform 2-pass color quantization, the decompressor also needs a
+    128K color lookup table and a full-image pixel buffer (3 bytes/pixel).
+This does not count any memory allocated by the application, such as a
+buffer to hold the final output image.
+
+The above figures are valid for 8-bit JPEG data precision and a machine with
+32-bit ints.  For 12-bit and 16-bit JPEG data, double the size of the strip
+buffers and quantization pixel buffer.  The "fixed-size" data will be somewhat
+smaller with 16-bit ints, larger with 64-bit ints.  Also, CMYK or other unusual
+color spaces will require different amounts of space.
+
+The full-image coefficient and pixel buffers, if needed at all, do not
+have to be fully RAM resident; you can have the library use temporary
+files instead when the total memory usage would exceed a limit you set.
+(But if your OS supports virtual memory, it's probably better to just use
+jmemnobs and let the OS do the swapping.)
+
+The compressor's memory requirements are similar, except that it has no need
+for color quantization.  Also, it needs a full-image DCT coefficient buffer
+if Huffman-table optimization is asked for, even if progressive mode is not
+requested.
+
+If you need more detailed information about memory usage in a particular
+situation, you can enable the MEM_STATS code in jmemmgr.c.
+
+
+Library compile-time options
+----------------------------
+
+A number of compile-time options are available by modifying jmorecfg.h.
+
+The maximum number of components (color channels) in the image is determined
+by MAX_COMPONENTS.  The JPEG standard allows up to 255 components, but we
+expect that few applications will need more than four or so.
+
+On machines with unusual data type sizes, you may be able to improve
+performance or reduce memory space by tweaking the various typedefs in
+jmorecfg.h.  In particular, on some RISC CPUs, access to arrays of "short"s
+is quite slow; consider trading memory for speed by making JCOEF, INT16, and
+UINT16 be "int" or "unsigned int".  UINT8 is also a candidate to become int.
+You probably don't want to make J*SAMPLE be int unless you have lots of memory
+to burn.
+
+You can reduce the size of the library by compiling out various optional
+functions.  To do this, undefine xxx_SUPPORTED symbols as necessary.
+
+You can also save a few K by not having text error messages in the library;
+the standard error message table occupies about 5Kb.  This is particularly
+reasonable for embedded applications where there's no good way to display
+a message anyway.  To do this, remove the creation of the message table
+(jpeg_std_message_table[]) from jerror.c, and alter format_message to do
+something reasonable without it.  You could output the numeric value of the
+message code number, for example.  If you do this, you can also save a couple
+more K by modifying the TRACEMSn() macros in jerror.h to expand to nothing;
+you don't need trace capability anyway, right?
+
+
+Portability considerations
+--------------------------
+
+The JPEG library has been written to be extremely portable; the sample
+applications cjpeg and djpeg are slightly less so.  This section summarizes
+the design goals in this area.  (If you encounter any bugs that cause the
+library to be less portable than is claimed here, we'd appreciate hearing
+about them.)
+
+The code works fine on ANSI C and C++ compilers, using any of the popular
+system include file setups, and some not-so-popular ones too.
+
+The code is not dependent on the exact sizes of the C data types.  As
+distributed, we make the assumptions that
+        char    is at least 8 bits wide
+        short   is at least 16 bits wide
+        int     is at least 16 bits wide
+        long    is at least 32 bits wide
+(These are the minimum requirements of the ANSI C standard.)  Wider types will
+work fine, although memory may be used inefficiently if char is much larger
+than 8 bits or short is much bigger than 16 bits.  The code should work
+equally well with 16- or 32-bit ints.
+
+In a system where these assumptions are not met, you may be able to make the
+code work by modifying the typedefs in jmorecfg.h.  However, you will probably
+have difficulty if int is less than 16 bits wide, since references to plain
+int abound in the code.
+
+char can be either signed or unsigned, although the code runs faster if an
+unsigned char type is available.  If char is wider than 8 bits, you will need
+to redefine JOCTET and/or provide custom data source/destination managers so
+that JOCTET represents exactly 8 bits of data on external storage.
+
+The JPEG library proper does not assume ASCII representation of characters.
+But some of the image file I/O modules in cjpeg/djpeg do have ASCII
+dependencies in file-header manipulation; so does cjpeg's select_file_type()
+routine.
+
+The JPEG library does not rely heavily on the C library.  In particular, C
+stdio is used only by the data source/destination modules and the error
+handler, all of which are application-replaceable.  (cjpeg/djpeg are more
+heavily dependent on stdio.)  malloc and free are called only from the memory
+manager "back end" module, so you can use a different memory allocator by
+replacing that one file.
+
+More info about porting the code may be gleaned by reading jconfig.txt,
+jmorecfg.h, and jinclude.h.
diff --git a/3rdparty/libjpeg-turbo/src/rdbmp.c b/3rdparty/libjpeg-turbo/src/rdbmp.c
new file mode 100644
index 000000000000..c2c06fd001cb
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/rdbmp.c
@@ -0,0 +1,689 @@
+/*
+ * rdbmp.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009-2017 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Modified 2011 by Siarhei Siamashka.
+ * Copyright (C) 2015, 2017-2018, 2021-2023, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains routines to read input images in Microsoft "BMP"
+ * format (MS Windows 3.x, OS/2 1.x, and OS/2 2.x flavors).
+ * Currently, only 8-, 24-, and 32-bit images are supported, not 1-bit or
+ * 4-bit (feeding such low-depth images into JPEG would be silly anyway).
+ * Also, we don't support RLE-compressed files.
+ *
+ * These routines may need modification for non-Unix environments or
+ * specialized applications.  As they stand, they assume input from
+ * an ordinary stdio stream.  They further assume that reading begins
+ * at the start of the file; start_input may need work if the
+ * user interface has already read some data (e.g., to determine that
+ * the file is indeed BMP format).
+ *
+ * This code contributed by James Arthur Boucher.
+ */
+
+#include "cmyk.h"
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+
+#ifdef BMP_SUPPORTED
+
+
+/* Macros to deal with unsigned chars as efficiently as compiler allows */
+
+typedef unsigned char U_CHAR;
+#define UCH(x)  ((int)(x))
+
+
+#define ReadOK(file, buffer, len) \
+  (fread(buffer, 1, len, file) == ((size_t)(len)))
+
+static int alpha_index[JPEG_NUMCS] = {
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1
+};
+
+
+/* Private version of data source object */
+
+typedef struct _bmp_source_struct *bmp_source_ptr;
+
+typedef struct _bmp_source_struct {
+  struct cjpeg_source_struct pub; /* public fields */
+
+  j_compress_ptr cinfo;         /* back link saves passing separate parm */
+
+  JSAMPARRAY colormap;          /* BMP colormap (converted to my format) */
+
+  jvirt_sarray_ptr whole_image; /* Needed to reverse row order */
+  JDIMENSION source_row;        /* Current source row number */
+  JDIMENSION row_width;         /* Physical width of scanlines in file */
+
+  int bits_per_pixel;           /* remembers 8-, 24-, or 32-bit format */
+  int cmap_length;              /* colormap length */
+
+  boolean use_inversion_array;  /* TRUE = preload the whole image, which is
+                                   stored in bottom-up order, and feed it to
+                                   the calling program in top-down order
+
+                                   FALSE = the calling program will maintain
+                                   its own image buffer and read the rows in
+                                   bottom-up order */
+
+  U_CHAR *iobuffer;             /* I/O buffer (used to buffer a single row from
+                                   disk if use_inversion_array == FALSE) */
+} bmp_source_struct;
+
+
+LOCAL(int)
+read_byte(bmp_source_ptr sinfo)
+/* Read next byte from BMP file */
+{
+  register FILE *infile = sinfo->pub.input_file;
+  register int c;
+
+  if ((c = getc(infile)) == EOF)
+    ERREXIT(sinfo->cinfo, JERR_INPUT_EOF);
+  return c;
+}
+
+
+LOCAL(void)
+read_colormap(bmp_source_ptr sinfo, int cmaplen, int mapentrysize)
+/* Read the colormap from a BMP file */
+{
+  int i, gray = 1;
+
+  switch (mapentrysize) {
+  case 3:
+    /* BGR format (occurs in OS/2 files) */
+    for (i = 0; i < cmaplen; i++) {
+      sinfo->colormap[2][i] = (JSAMPLE)read_byte(sinfo);
+      sinfo->colormap[1][i] = (JSAMPLE)read_byte(sinfo);
+      sinfo->colormap[0][i] = (JSAMPLE)read_byte(sinfo);
+      if (sinfo->colormap[2][i] != sinfo->colormap[1][i] ||
+          sinfo->colormap[1][i] != sinfo->colormap[0][i])
+        gray = 0;
+    }
+    break;
+  case 4:
+    /* BGR0 format (occurs in MS Windows files) */
+    for (i = 0; i < cmaplen; i++) {
+      sinfo->colormap[2][i] = (JSAMPLE)read_byte(sinfo);
+      sinfo->colormap[1][i] = (JSAMPLE)read_byte(sinfo);
+      sinfo->colormap[0][i] = (JSAMPLE)read_byte(sinfo);
+      (void)read_byte(sinfo);
+      if (sinfo->colormap[2][i] != sinfo->colormap[1][i] ||
+          sinfo->colormap[1][i] != sinfo->colormap[0][i])
+        gray = 0;
+    }
+    break;
+  default:
+    ERREXIT(sinfo->cinfo, JERR_BMP_BADCMAP);
+    break;
+  }
+
+  if ((sinfo->cinfo->in_color_space == JCS_UNKNOWN ||
+       sinfo->cinfo->in_color_space == JCS_RGB) && gray)
+    sinfo->cinfo->in_color_space = JCS_GRAYSCALE;
+
+  if (sinfo->cinfo->in_color_space == JCS_GRAYSCALE && !gray)
+    ERREXIT(sinfo->cinfo, JERR_BAD_IN_COLORSPACE);
+}
+
+
+/*
+ * Read one row of pixels.
+ * The image has been read into the whole_image array, but is otherwise
+ * unprocessed.  We must read it out in top-to-bottom row order, and if
+ * it is an 8-bit image, we must expand colormapped pixels to 24bit format.
+ */
+
+METHODDEF(JDIMENSION)
+get_8bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading 8-bit colormap indexes */
+{
+  bmp_source_ptr source = (bmp_source_ptr)sinfo;
+  register JSAMPARRAY colormap = source->colormap;
+  int cmaplen = source->cmap_length;
+  JSAMPARRAY image_ptr;
+  register int t;
+  register JSAMPROW inptr, outptr;
+  register JDIMENSION col;
+
+  if (source->use_inversion_array) {
+    /* Fetch next row from virtual array */
+    source->source_row--;
+    image_ptr = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, source->whole_image,
+       source->source_row, (JDIMENSION)1, FALSE);
+    inptr = image_ptr[0];
+  } else {
+    if (!ReadOK(source->pub.input_file, source->iobuffer, source->row_width))
+      ERREXIT(cinfo, JERR_INPUT_EOF);
+    inptr = source->iobuffer;
+  }
+
+  /* Expand the colormap indexes to real data */
+  outptr = source->pub.buffer[0];
+  if (cinfo->in_color_space == JCS_GRAYSCALE) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      t = *inptr++;
+      if (t >= cmaplen)
+        ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
+      *outptr++ = colormap[0][t];
+    }
+  } else if (cinfo->in_color_space == JCS_CMYK) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      t = *inptr++;
+      if (t >= cmaplen)
+        ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
+      rgb_to_cmyk(colormap[0][t], colormap[1][t], colormap[2][t], outptr,
+                  outptr + 1, outptr + 2, outptr + 3);
+      outptr += 4;
+    }
+  } else {
+    register int rindex = rgb_red[cinfo->in_color_space];
+    register int gindex = rgb_green[cinfo->in_color_space];
+    register int bindex = rgb_blue[cinfo->in_color_space];
+    register int aindex = alpha_index[cinfo->in_color_space];
+    register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+    if (aindex >= 0) {
+      for (col = cinfo->image_width; col > 0; col--) {
+        t = *inptr++;
+        if (t >= cmaplen)
+          ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
+        outptr[rindex] = colormap[0][t];
+        outptr[gindex] = colormap[1][t];
+        outptr[bindex] = colormap[2][t];
+        outptr[aindex] = 0xFF;
+        outptr += ps;
+      }
+    } else {
+      for (col = cinfo->image_width; col > 0; col--) {
+        t = *inptr++;
+        if (t >= cmaplen)
+          ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
+        outptr[rindex] = colormap[0][t];
+        outptr[gindex] = colormap[1][t];
+        outptr[bindex] = colormap[2][t];
+        outptr += ps;
+      }
+    }
+  }
+
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_24bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading 24-bit pixels */
+{
+  bmp_source_ptr source = (bmp_source_ptr)sinfo;
+  JSAMPARRAY image_ptr;
+  register JSAMPROW inptr, outptr;
+  register JDIMENSION col;
+
+  if (source->use_inversion_array) {
+    /* Fetch next row from virtual array */
+    source->source_row--;
+    image_ptr = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, source->whole_image,
+       source->source_row, (JDIMENSION)1, FALSE);
+    inptr = image_ptr[0];
+  } else {
+    if (!ReadOK(source->pub.input_file, source->iobuffer, source->row_width))
+      ERREXIT(cinfo, JERR_INPUT_EOF);
+    inptr = source->iobuffer;
+  }
+
+  /* Transfer data.  Note source values are in BGR order
+   * (even though Microsoft's own documents say the opposite).
+   */
+  outptr = source->pub.buffer[0];
+  if (cinfo->in_color_space == JCS_EXT_BGR) {
+    memcpy(outptr, inptr, source->row_width);
+  } else if (cinfo->in_color_space == JCS_CMYK) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      JSAMPLE b = *inptr++, g = *inptr++, r = *inptr++;
+      rgb_to_cmyk(r, g, b, outptr, outptr + 1, outptr + 2, outptr + 3);
+      outptr += 4;
+    }
+  } else {
+    register int rindex = rgb_red[cinfo->in_color_space];
+    register int gindex = rgb_green[cinfo->in_color_space];
+    register int bindex = rgb_blue[cinfo->in_color_space];
+    register int aindex = alpha_index[cinfo->in_color_space];
+    register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+    if (aindex >= 0) {
+      for (col = cinfo->image_width; col > 0; col--) {
+        outptr[bindex] = *inptr++;
+        outptr[gindex] = *inptr++;
+        outptr[rindex] = *inptr++;
+        outptr[aindex] = 0xFF;
+        outptr += ps;
+      }
+    } else {
+      for (col = cinfo->image_width; col > 0; col--) {
+        outptr[bindex] = *inptr++;
+        outptr[gindex] = *inptr++;
+        outptr[rindex] = *inptr++;
+        outptr += ps;
+      }
+    }
+  }
+
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_32bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading 32-bit pixels */
+{
+  bmp_source_ptr source = (bmp_source_ptr)sinfo;
+  JSAMPARRAY image_ptr;
+  register JSAMPROW inptr, outptr;
+  register JDIMENSION col;
+
+  if (source->use_inversion_array) {
+    /* Fetch next row from virtual array */
+    source->source_row--;
+    image_ptr = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, source->whole_image,
+       source->source_row, (JDIMENSION)1, FALSE);
+    inptr = image_ptr[0];
+  } else {
+    if (!ReadOK(source->pub.input_file, source->iobuffer, source->row_width))
+      ERREXIT(cinfo, JERR_INPUT_EOF);
+    inptr = source->iobuffer;
+  }
+
+  /* Transfer data.  Note source values are in BGR order
+   * (even though Microsoft's own documents say the opposite).
+   */
+  outptr = source->pub.buffer[0];
+  if (cinfo->in_color_space == JCS_EXT_BGRX ||
+      cinfo->in_color_space == JCS_EXT_BGRA) {
+    memcpy(outptr, inptr, source->row_width);
+  } else if (cinfo->in_color_space == JCS_CMYK) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      JSAMPLE b = *inptr++, g = *inptr++, r = *inptr++;
+      rgb_to_cmyk(r, g, b, outptr, outptr + 1, outptr + 2, outptr + 3);
+      inptr++;                          /* skip the 4th byte (Alpha channel) */
+      outptr += 4;
+    }
+  } else {
+    register int rindex = rgb_red[cinfo->in_color_space];
+    register int gindex = rgb_green[cinfo->in_color_space];
+    register int bindex = rgb_blue[cinfo->in_color_space];
+    register int aindex = alpha_index[cinfo->in_color_space];
+    register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+    if (aindex >= 0) {
+      for (col = cinfo->image_width; col > 0; col--) {
+        outptr[bindex] = *inptr++;
+        outptr[gindex] = *inptr++;
+        outptr[rindex] = *inptr++;
+        outptr[aindex] = *inptr++;
+        outptr += ps;
+      }
+    } else {
+      for (col = cinfo->image_width; col > 0; col--) {
+        outptr[bindex] = *inptr++;
+        outptr[gindex] = *inptr++;
+        outptr[rindex] = *inptr++;
+        inptr++;                        /* skip the 4th byte (Alpha channel) */
+        outptr += ps;
+      }
+    }
+  }
+
+  return 1;
+}
+
+
+/*
+ * This method loads the image into whole_image during the first call on
+ * get_pixel_rows.  The get_pixel_rows pointer is then adjusted to call
+ * get_8bit_row, get_24bit_row, or get_32bit_row on subsequent calls.
+ */
+
+METHODDEF(JDIMENSION)
+preload_image(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  bmp_source_ptr source = (bmp_source_ptr)sinfo;
+  register FILE *infile = source->pub.input_file;
+  register JSAMPROW out_ptr;
+  JSAMPARRAY image_ptr;
+  JDIMENSION row;
+  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
+
+  /* Read the data into a virtual array in input-file row order. */
+  for (row = 0; row < cinfo->image_height; row++) {
+    if (progress != NULL) {
+      progress->pub.pass_counter = (long)row;
+      progress->pub.pass_limit = (long)cinfo->image_height;
+      (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
+    }
+    image_ptr = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, source->whole_image, row, (JDIMENSION)1, TRUE);
+    out_ptr = image_ptr[0];
+    if (fread(out_ptr, 1, source->row_width, infile) != source->row_width) {
+      if (feof(infile))
+        ERREXIT(cinfo, JERR_INPUT_EOF);
+      else
+        ERREXIT(cinfo, JERR_FILE_READ);
+    }
+  }
+  if (progress != NULL)
+    progress->completed_extra_passes++;
+
+  /* Set up to read from the virtual array in top-to-bottom order */
+  switch (source->bits_per_pixel) {
+  case 8:
+    source->pub.get_pixel_rows = get_8bit_row;
+    break;
+  case 24:
+    source->pub.get_pixel_rows = get_24bit_row;
+    break;
+  case 32:
+    source->pub.get_pixel_rows = get_32bit_row;
+    break;
+  default:
+    ERREXIT(cinfo, JERR_BMP_BADDEPTH);
+  }
+  source->source_row = cinfo->image_height;
+
+  /* And read the first row */
+  return (*source->pub.get_pixel_rows) (cinfo, sinfo);
+}
+
+
+/*
+ * Read the file header; return image size and component count.
+ */
+
+METHODDEF(void)
+start_input_bmp(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  bmp_source_ptr source = (bmp_source_ptr)sinfo;
+  U_CHAR bmpfileheader[14];
+  U_CHAR bmpinfoheader[64];
+
+#define GET_2B(array, offset) \
+  ((unsigned short)UCH(array[offset]) + \
+   (((unsigned short)UCH(array[offset + 1])) << 8))
+#define GET_4B(array, offset) \
+  ((unsigned int)UCH(array[offset]) + \
+   (((unsigned int)UCH(array[offset + 1])) << 8) + \
+   (((unsigned int)UCH(array[offset + 2])) << 16) + \
+   (((unsigned int)UCH(array[offset + 3])) << 24))
+
+  int bfOffBits;
+  int headerSize;
+  int biWidth;
+  int biHeight;
+  unsigned short biPlanes;
+  unsigned int biCompression;
+  int biXPelsPerMeter, biYPelsPerMeter;
+  int biClrUsed = 0;
+  int mapentrysize = 0;         /* 0 indicates no colormap */
+  int bPad;
+  JDIMENSION row_width = 0;
+
+  /* Read and verify the bitmap file header */
+  if (!ReadOK(source->pub.input_file, bmpfileheader, 14))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  if (GET_2B(bmpfileheader, 0) != 0x4D42) /* 'BM' */
+    ERREXIT(cinfo, JERR_BMP_NOT);
+  bfOffBits = GET_4B(bmpfileheader, 10);
+  /* We ignore the remaining fileheader fields */
+
+  /* The infoheader might be 12 bytes (OS/2 1.x), 40 bytes (Windows),
+   * or 64 bytes (OS/2 2.x).  Check the first 4 bytes to find out which.
+   */
+  if (!ReadOK(source->pub.input_file, bmpinfoheader, 4))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  headerSize = GET_4B(bmpinfoheader, 0);
+  if (headerSize < 12 || headerSize > 64 || (headerSize + 14) > bfOffBits)
+    ERREXIT(cinfo, JERR_BMP_BADHEADER);
+  if (!ReadOK(source->pub.input_file, bmpinfoheader + 4, headerSize - 4))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+
+  switch (headerSize) {
+  case 12:
+    /* Decode OS/2 1.x header (Microsoft calls this a BITMAPCOREHEADER) */
+    biWidth = (int)GET_2B(bmpinfoheader, 4);
+    biHeight = (int)GET_2B(bmpinfoheader, 6);
+    biPlanes = GET_2B(bmpinfoheader, 8);
+    source->bits_per_pixel = (int)GET_2B(bmpinfoheader, 10);
+
+    switch (source->bits_per_pixel) {
+    case 8:                     /* colormapped image */
+      mapentrysize = 3;         /* OS/2 uses RGBTRIPLE colormap */
+      TRACEMS2(cinfo, 1, JTRC_BMP_OS2_MAPPED, biWidth, biHeight);
+      break;
+    case 24:                    /* RGB image */
+    case 32:                    /* RGB image + Alpha channel */
+      TRACEMS3(cinfo, 1, JTRC_BMP_OS2, biWidth, biHeight,
+               source->bits_per_pixel);
+      break;
+    default:
+      ERREXIT(cinfo, JERR_BMP_BADDEPTH);
+      break;
+    }
+    break;
+  case 40:
+  case 64:
+    /* Decode Windows 3.x header (Microsoft calls this a BITMAPINFOHEADER) */
+    /* or OS/2 2.x header, which has additional fields that we ignore */
+    biWidth = (int)GET_4B(bmpinfoheader, 4);
+    biHeight = (int)GET_4B(bmpinfoheader, 8);
+    biPlanes = GET_2B(bmpinfoheader, 12);
+    source->bits_per_pixel = (int)GET_2B(bmpinfoheader, 14);
+    biCompression = GET_4B(bmpinfoheader, 16);
+    biXPelsPerMeter = (int)GET_4B(bmpinfoheader, 24);
+    biYPelsPerMeter = (int)GET_4B(bmpinfoheader, 28);
+    biClrUsed = GET_4B(bmpinfoheader, 32);
+    /* biSizeImage, biClrImportant fields are ignored */
+
+    switch (source->bits_per_pixel) {
+    case 8:                     /* colormapped image */
+      mapentrysize = 4;         /* Windows uses RGBQUAD colormap */
+      TRACEMS2(cinfo, 1, JTRC_BMP_MAPPED, biWidth, biHeight);
+      break;
+    case 24:                    /* RGB image */
+    case 32:                    /* RGB image + Alpha channel */
+      TRACEMS3(cinfo, 1, JTRC_BMP, biWidth, biHeight, source->bits_per_pixel);
+      break;
+    default:
+      ERREXIT(cinfo, JERR_BMP_BADDEPTH);
+      break;
+    }
+    if (biCompression != 0)
+      ERREXIT(cinfo, JERR_BMP_COMPRESSED);
+
+    if (biXPelsPerMeter > 0 && biYPelsPerMeter > 0) {
+      /* Set JFIF density parameters from the BMP data */
+      cinfo->X_density = (UINT16)(biXPelsPerMeter / 100); /* 100 cm per meter */
+      cinfo->Y_density = (UINT16)(biYPelsPerMeter / 100);
+      cinfo->density_unit = 2;  /* dots/cm */
+    }
+    break;
+  default:
+    ERREXIT(cinfo, JERR_BMP_BADHEADER);
+    return;
+  }
+
+  if (biWidth <= 0 || biHeight <= 0)
+    ERREXIT(cinfo, JERR_BMP_EMPTY);
+  if (sinfo->max_pixels &&
+      (unsigned long long)biWidth * biHeight > sinfo->max_pixels)
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, sinfo->max_pixels);
+  if (biPlanes != 1)
+    ERREXIT(cinfo, JERR_BMP_BADPLANES);
+
+  /* Compute distance to bitmap data --- will adjust for colormap below */
+  bPad = bfOffBits - (headerSize + 14);
+
+  /* Read the colormap, if any */
+  if (mapentrysize > 0) {
+    if (biClrUsed <= 0)
+      biClrUsed = 256;          /* assume it's 256 */
+    else if (biClrUsed > 256)
+      ERREXIT(cinfo, JERR_BMP_BADCMAP);
+    /* Allocate space to store the colormap */
+    source->colormap = (*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)biClrUsed, (JDIMENSION)3);
+    source->cmap_length = (int)biClrUsed;
+    /* and read it from the file */
+    read_colormap(source, (int)biClrUsed, mapentrysize);
+    /* account for size of colormap */
+    bPad -= biClrUsed * mapentrysize;
+  }
+
+  /* Skip any remaining pad bytes */
+  if (bPad < 0)                 /* incorrect bfOffBits value? */
+    ERREXIT(cinfo, JERR_BMP_BADHEADER);
+  while (--bPad >= 0) {
+    (void)read_byte(source);
+  }
+
+  /* Compute row width in file, including padding to 4-byte boundary */
+  switch (source->bits_per_pixel) {
+  case 8:
+    if (cinfo->in_color_space == JCS_UNKNOWN)
+      cinfo->in_color_space = JCS_EXT_RGB;
+    if (IsExtRGB(cinfo->in_color_space))
+      cinfo->input_components = rgb_pixelsize[cinfo->in_color_space];
+    else if (cinfo->in_color_space == JCS_GRAYSCALE)
+      cinfo->input_components = 1;
+    else if (cinfo->in_color_space == JCS_CMYK)
+      cinfo->input_components = 4;
+    else
+      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+    row_width = (JDIMENSION)biWidth;
+    break;
+  case 24:
+    if (cinfo->in_color_space == JCS_UNKNOWN)
+      cinfo->in_color_space = JCS_EXT_BGR;
+    if (IsExtRGB(cinfo->in_color_space))
+      cinfo->input_components = rgb_pixelsize[cinfo->in_color_space];
+    else if (cinfo->in_color_space == JCS_CMYK)
+      cinfo->input_components = 4;
+    else
+      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+    if ((unsigned long long)biWidth * 3ULL > 0xFFFFFFFFULL)
+      ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+    row_width = (JDIMENSION)biWidth * 3;
+    break;
+  case 32:
+    if (cinfo->in_color_space == JCS_UNKNOWN)
+      cinfo->in_color_space = JCS_EXT_BGRA;
+    if (IsExtRGB(cinfo->in_color_space))
+      cinfo->input_components = rgb_pixelsize[cinfo->in_color_space];
+    else if (cinfo->in_color_space == JCS_CMYK)
+      cinfo->input_components = 4;
+    else
+      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+    if ((unsigned long long)biWidth * 4ULL > 0xFFFFFFFFULL)
+      ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+    row_width = (JDIMENSION)biWidth * 4;
+    break;
+  default:
+    ERREXIT(cinfo, JERR_BMP_BADDEPTH);
+  }
+  while ((row_width & 3) != 0) row_width++;
+  source->row_width = row_width;
+
+  if (source->use_inversion_array) {
+    /* Allocate space for inversion array, prepare for preload pass */
+    source->whole_image = (*cinfo->mem->request_virt_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+       row_width, (JDIMENSION)biHeight, (JDIMENSION)1);
+    source->pub.get_pixel_rows = preload_image;
+    if (cinfo->progress != NULL) {
+      cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
+      progress->total_extra_passes++; /* count file input as separate pass */
+    }
+  } else {
+    source->iobuffer = (U_CHAR *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, row_width);
+    switch (source->bits_per_pixel) {
+    case 8:
+      source->pub.get_pixel_rows = get_8bit_row;
+      break;
+    case 24:
+      source->pub.get_pixel_rows = get_24bit_row;
+      break;
+    case 32:
+      source->pub.get_pixel_rows = get_32bit_row;
+      break;
+    default:
+      ERREXIT(cinfo, JERR_BMP_BADDEPTH);
+    }
+  }
+
+  /* Ensure that biWidth * cinfo->input_components doesn't exceed the maximum
+     value of the JDIMENSION type.  This is only a danger with BMP files, since
+     their width and height fields are 32-bit integers. */
+  if ((unsigned long long)biWidth *
+      (unsigned long long)cinfo->input_components > 0xFFFFFFFFULL)
+    ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+  /* Allocate one-row buffer for returned data */
+  source->pub.buffer = (*cinfo->mem->alloc_sarray)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     (JDIMENSION)biWidth * (JDIMENSION)cinfo->input_components, (JDIMENSION)1);
+  source->pub.buffer_height = 1;
+
+  cinfo->data_precision = 8;
+  cinfo->image_width = (JDIMENSION)biWidth;
+  cinfo->image_height = (JDIMENSION)biHeight;
+}
+
+
+/*
+ * Finish up at the end of the file.
+ */
+
+METHODDEF(void)
+finish_input_bmp(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  /* no work */
+}
+
+
+/*
+ * The module selection routine for BMP format input.
+ */
+
+GLOBAL(cjpeg_source_ptr)
+jinit_read_bmp(j_compress_ptr cinfo, boolean use_inversion_array)
+{
+  bmp_source_ptr source;
+
+  if (cinfo->data_precision != 8)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
+  /* Create module interface object */
+  source = (bmp_source_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(bmp_source_struct));
+  source->cinfo = cinfo;        /* make back link for subroutines */
+  /* Fill in method ptrs, except get_pixel_rows which start_input sets */
+  source->pub.start_input = start_input_bmp;
+  source->pub.finish_input = finish_input_bmp;
+  source->pub.max_pixels = 0;
+
+  source->use_inversion_array = use_inversion_array;
+
+  return (cjpeg_source_ptr)source;
+}
+
+#endif /* BMP_SUPPORTED */
diff --git a/3rdparty/libjpeg-turbo/src/rdcolmap.c b/3rdparty/libjpeg-turbo/src/rdcolmap.c
new file mode 100644
index 000000000000..836685e1b80e
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/rdcolmap.c
@@ -0,0 +1,261 @@
+/*
+ * rdcolmap.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file implements djpeg's "-map file" switch.  It reads a source image
+ * and constructs a colormap to be supplied to the JPEG decompressor.
+ *
+ * Currently, these file formats are supported for the map file:
+ *   GIF: the contents of the GIF's global colormap are used.
+ *   PPM (either text or raw flavor): the entire file is read and
+ *      each unique pixel value is entered in the map.
+ * Note that reading a large PPM file will be horrendously slow.
+ * Typically, a PPM-format map file should contain just one pixel
+ * of each desired color.  Such a file can be extracted from an
+ * ordinary image PPM file with ppmtomap(1).
+ *
+ * Rescaling a PPM that has a maxval unequal to _MAXJSAMPLE is not
+ * currently implemented.
+ */
+
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include "jsamplecomp.h"
+
+#ifdef QUANT_2PASS_SUPPORTED    /* otherwise can't quantize to supplied map */
+#if BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED)
+
+/* Portions of this code are based on the PBMPLUS library, which is:
+**
+** Copyright (C) 1988 by Jef Poskanzer.
+**
+** Permission to use, copy, modify, and distribute this software and its
+** documentation for any purpose and without fee is hereby granted, provided
+** that the above copyright notice appear in all copies and that both that
+** copyright notice and this permission notice appear in supporting
+** documentation.  This software is provided "as is" without express or
+** implied warranty.
+*/
+
+
+/*
+ * Add a (potentially) new color to the color map.
+ */
+
+LOCAL(void)
+add_map_entry(j_decompress_ptr cinfo, int R, int G, int B)
+{
+  _JSAMPROW colormap0 = ((_JSAMPARRAY)cinfo->colormap)[0];
+  _JSAMPROW colormap1 = ((_JSAMPARRAY)cinfo->colormap)[1];
+  _JSAMPROW colormap2 = ((_JSAMPARRAY)cinfo->colormap)[2];
+  int ncolors = cinfo->actual_number_of_colors;
+  int index;
+
+  /* Check for duplicate color. */
+  for (index = 0; index < ncolors; index++) {
+    if (colormap0[index] == R && colormap1[index] == G &&
+        colormap2[index] == B)
+      return;                   /* color is already in map */
+  }
+
+  /* Check for map overflow. */
+  if (ncolors >= (_MAXJSAMPLE + 1))
+    ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, (_MAXJSAMPLE + 1));
+
+  /* OK, add color to map. */
+  colormap0[ncolors] = (_JSAMPLE)R;
+  colormap1[ncolors] = (_JSAMPLE)G;
+  colormap2[ncolors] = (_JSAMPLE)B;
+  cinfo->actual_number_of_colors++;
+}
+
+
+/*
+ * Extract color map from a GIF file.
+ */
+
+LOCAL(void)
+read_gif_map(j_decompress_ptr cinfo, FILE *infile)
+{
+  int header[13];
+  int i, colormaplen;
+  int R, G, B;
+
+  /* Initial 'G' has already been read by read_color_map */
+  /* Read the rest of the GIF header and logical screen descriptor */
+  for (i = 1; i < 13; i++) {
+    if ((header[i] = getc(infile)) == EOF)
+      ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
+  }
+
+  /* Verify GIF Header */
+  if (header[1] != 'I' || header[2] != 'F')
+    ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
+
+  /* There must be a global color map. */
+  if ((header[10] & 0x80) == 0)
+    ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
+
+  /* OK, fetch it. */
+  colormaplen = 2 << (header[10] & 0x07);
+
+  for (i = 0; i < colormaplen; i++) {
+    R = getc(infile);
+    G = getc(infile);
+    B = getc(infile);
+    if (R == EOF || G == EOF || B == EOF)
+      ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
+    add_map_entry(cinfo,
+                  R << (BITS_IN_JSAMPLE - 8),
+                  G << (BITS_IN_JSAMPLE - 8),
+                  B << (BITS_IN_JSAMPLE - 8));
+  }
+}
+
+
+/* Support routines for reading PPM */
+
+
+LOCAL(int)
+pbm_getc(FILE *infile)
+/* Read next char, skipping over any comments */
+/* A comment/newline sequence is returned as a newline */
+{
+  register int ch;
+
+  ch = getc(infile);
+  if (ch == '#') {
+    do {
+      ch = getc(infile);
+    } while (ch != '\n' && ch != EOF);
+  }
+  return ch;
+}
+
+
+LOCAL(unsigned int)
+read_pbm_integer(j_decompress_ptr cinfo, FILE *infile)
+/* Read an unsigned decimal integer from the PPM file */
+/* Swallows one trailing character after the integer */
+/* Note that on a 16-bit-int machine, only values up to 64k can be read. */
+/* This should not be a problem in practice. */
+{
+  register int ch;
+  register unsigned int val;
+
+  /* Skip any leading whitespace */
+  do {
+    ch = pbm_getc(infile);
+    if (ch == EOF)
+      ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
+  } while (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
+
+  if (ch < '0' || ch > '9')
+    ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
+
+  val = ch - '0';
+  while ((ch = pbm_getc(infile)) >= '0' && ch <= '9') {
+    val *= 10;
+    val += ch - '0';
+  }
+  return val;
+}
+
+
+/*
+ * Extract color map from a PPM file.
+ */
+
+LOCAL(void)
+read_ppm_map(j_decompress_ptr cinfo, FILE *infile)
+{
+  int c;
+  unsigned int w, h, maxval, row, col;
+  int R, G, B;
+
+  /* Initial 'P' has already been read by read_color_map */
+  c = getc(infile);             /* save format discriminator for a sec */
+
+  /* while we fetch the remaining header info */
+  w = read_pbm_integer(cinfo, infile);
+  h = read_pbm_integer(cinfo, infile);
+  maxval = read_pbm_integer(cinfo, infile);
+
+  if (w <= 0 || h <= 0 || maxval <= 0) /* error check */
+    ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
+
+  /* For now, we don't support rescaling from an unusual maxval. */
+  if (maxval != (unsigned int)_MAXJSAMPLE)
+    ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
+
+  switch (c) {
+  case '3':                     /* it's a text-format PPM file */
+    for (row = 0; row < h; row++) {
+      for (col = 0; col < w; col++) {
+        R = read_pbm_integer(cinfo, infile);
+        G = read_pbm_integer(cinfo, infile);
+        B = read_pbm_integer(cinfo, infile);
+        add_map_entry(cinfo, R, G, B);
+      }
+    }
+    break;
+
+  case '6':                     /* it's a raw-format PPM file */
+    for (row = 0; row < h; row++) {
+      for (col = 0; col < w; col++) {
+        R = getc(infile);
+        G = getc(infile);
+        B = getc(infile);
+        if (R == EOF || G == EOF || B == EOF)
+          ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
+        add_map_entry(cinfo, R, G, B);
+      }
+    }
+    break;
+
+  default:
+    ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
+    break;
+  }
+}
+
+
+/*
+ * Main entry point from djpeg.c.
+ *  Input: opened input file (from file name argument on command line).
+ *  Output: colormap and actual_number_of_colors fields are set in cinfo.
+ */
+
+GLOBAL(void)
+_read_color_map(j_decompress_ptr cinfo, FILE *infile)
+{
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
+  /* Allocate space for a color map of maximum supported size. */
+  cinfo->colormap = (*cinfo->mem->alloc_sarray)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     (JDIMENSION)(_MAXJSAMPLE + 1), (JDIMENSION)3);
+  cinfo->actual_number_of_colors = 0; /* initialize map to empty */
+
+  /* Read first byte to determine file format */
+  switch (getc(infile)) {
+  case 'G':
+    read_gif_map(cinfo, infile);
+    break;
+  case 'P':
+    read_ppm_map(cinfo, infile);
+    break;
+  default:
+    ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
+    break;
+  }
+}
+
+#endif /* BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED) */
+#endif /* QUANT_2PASS_SUPPORTED */
diff --git a/3rdparty/libjpeg-turbo/src/rdgif.c b/3rdparty/libjpeg-turbo/src/rdgif.c
new file mode 100644
index 000000000000..23e8b9e128b0
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/rdgif.c
@@ -0,0 +1,720 @@
+/*
+ * rdgif.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2019 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2021-2023, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains routines to read input images in GIF format.
+ *
+ * These routines may need modification for non-Unix environments or
+ * specialized applications.  As they stand, they assume input from
+ * an ordinary stdio stream.  They further assume that reading begins
+ * at the start of the file; start_input may need work if the
+ * user interface has already read some data (e.g., to determine that
+ * the file is indeed GIF format).
+ */
+
+/*
+ * This code is loosely based on giftoppm from the PBMPLUS distribution
+ * of Feb. 1991.  That file contains the following copyright notice:
+ * +-------------------------------------------------------------------+
+ * | Copyright 1990, David Koblas.                                     |
+ * |   Permission to use, copy, modify, and distribute this software   |
+ * |   and its documentation for any purpose and without fee is hereby |
+ * |   granted, provided that the above copyright notice appear in all |
+ * |   copies and that both that copyright notice and this permission  |
+ * |   notice appear in supporting documentation.  This software is    |
+ * |   provided "as is" without express or implied warranty.           |
+ * +-------------------------------------------------------------------+
+ */
+
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include "jsamplecomp.h"
+
+#if defined(GIF_SUPPORTED) && \
+    (BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED))
+
+
+/* Macros to deal with unsigned chars as efficiently as compiler allows */
+
+typedef unsigned char U_CHAR;
+#define UCH(x)  ((int)(x))
+
+
+#define ReadOK(file, buffer, len) \
+  (fread(buffer, 1, len, file) == ((size_t)(len)))
+
+
+#define MAXCOLORMAPSIZE  256    /* max # of colors in a GIF colormap */
+#define NUMCOLORS        3      /* # of colors */
+#define CM_RED           0      /* color component numbers */
+#define CM_GREEN         1
+#define CM_BLUE          2
+
+#define MAX_LZW_BITS     12     /* maximum LZW code size */
+#define LZW_TABLE_SIZE   (1 << MAX_LZW_BITS) /* # of possible LZW symbols */
+
+/* Macros for extracting header data --- note we assume chars may be signed */
+
+#define LM_to_uint(array, offset) \
+  ((unsigned int)UCH(array[offset]) + \
+   (((unsigned int)UCH(array[offset + 1])) << 8))
+
+#define BitSet(byte, bit)       ((byte) & (bit))
+#define INTERLACE       0x40    /* mask for bit signifying interlaced image */
+#define COLORMAPFLAG    0x80    /* mask for bit signifying colormap presence */
+
+
+/*
+ * LZW decompression tables look like this:
+ *   symbol_head[K] = prefix symbol of any LZW symbol K (0..LZW_TABLE_SIZE-1)
+ *   symbol_tail[K] = suffix byte   of any LZW symbol K (0..LZW_TABLE_SIZE-1)
+ * Note that entries 0..end_code of the above tables are not used,
+ * since those symbols represent raw bytes or special codes.
+ *
+ * The stack represents the not-yet-used expansion of the last LZW symbol.
+ * In the worst case, a symbol could expand to as many bytes as there are
+ * LZW symbols, so we allocate LZW_TABLE_SIZE bytes for the stack.
+ * (This is conservative since that number includes the raw-byte symbols.)
+ */
+
+
+/* Private version of data source object */
+
+typedef struct {
+  struct cjpeg_source_struct pub; /* public fields */
+
+  j_compress_ptr cinfo;         /* back link saves passing separate parm */
+
+  _JSAMPARRAY colormap;         /* GIF colormap (converted to my format) */
+
+  /* State for GetCode and LZWReadByte */
+  U_CHAR code_buf[256 + 4];     /* current input data block */
+  int last_byte;                /* # of bytes in code_buf */
+  int last_bit;                 /* # of bits in code_buf */
+  int cur_bit;                  /* next bit index to read */
+  boolean first_time;           /* flags first call to GetCode */
+  boolean out_of_blocks;        /* TRUE if hit terminator data block */
+
+  int input_code_size;          /* codesize given in GIF file */
+  int clear_code, end_code;     /* values for Clear and End codes */
+
+  int code_size;                /* current actual code size */
+  int limit_code;               /* 2^code_size */
+  int max_code;                 /* first unused code value */
+
+  /* Private state for LZWReadByte */
+  int oldcode;                  /* previous LZW symbol */
+  int firstcode;                /* first byte of oldcode's expansion */
+
+  /* LZW symbol table and expansion stack */
+  UINT16 *symbol_head;          /* => table of prefix symbols */
+  UINT8  *symbol_tail;          /* => table of suffix bytes */
+  UINT8  *symbol_stack;         /* => stack for symbol expansions */
+  UINT8  *sp;                   /* stack pointer */
+
+  /* State for interlaced image processing */
+  boolean is_interlaced;        /* TRUE if have interlaced image */
+  jvirt_sarray_ptr interlaced_image; /* full image in interlaced order */
+  JDIMENSION cur_row_number;    /* need to know actual row number */
+  JDIMENSION pass2_offset;      /* # of pixel rows in pass 1 */
+  JDIMENSION pass3_offset;      /* # of pixel rows in passes 1&2 */
+  JDIMENSION pass4_offset;      /* # of pixel rows in passes 1,2,3 */
+} gif_source_struct;
+
+typedef gif_source_struct *gif_source_ptr;
+
+
+/* Forward declarations */
+METHODDEF(JDIMENSION) get_pixel_rows(j_compress_ptr cinfo,
+                                     cjpeg_source_ptr sinfo);
+METHODDEF(JDIMENSION) load_interlaced_image(j_compress_ptr cinfo,
+                                            cjpeg_source_ptr sinfo);
+METHODDEF(JDIMENSION) get_interlaced_row(j_compress_ptr cinfo,
+                                         cjpeg_source_ptr sinfo);
+
+
+LOCAL(int)
+ReadByte(gif_source_ptr sinfo)
+/* Read next byte from GIF file */
+{
+  register FILE *infile = sinfo->pub.input_file;
+  register int c;
+
+  if ((c = getc(infile)) == EOF)
+    ERREXIT(sinfo->cinfo, JERR_INPUT_EOF);
+  return c;
+}
+
+
+LOCAL(int)
+GetDataBlock(gif_source_ptr sinfo, U_CHAR *buf)
+/* Read a GIF data block, which has a leading count byte */
+/* A zero-length block marks the end of a data block sequence */
+{
+  int count;
+
+  count = ReadByte(sinfo);
+  if (count > 0) {
+    if (!ReadOK(sinfo->pub.input_file, buf, count))
+      ERREXIT(sinfo->cinfo, JERR_INPUT_EOF);
+  }
+  return count;
+}
+
+
+LOCAL(void)
+SkipDataBlocks(gif_source_ptr sinfo)
+/* Skip a series of data blocks, until a block terminator is found */
+{
+  U_CHAR buf[256];
+
+  while (GetDataBlock(sinfo, buf) > 0)
+    /* skip */;
+}
+
+
+LOCAL(void)
+ReInitLZW(gif_source_ptr sinfo)
+/* (Re)initialize LZW state; shared code for startup and Clear processing */
+{
+  sinfo->code_size = sinfo->input_code_size + 1;
+  sinfo->limit_code = sinfo->clear_code << 1;   /* 2^code_size */
+  sinfo->max_code = sinfo->clear_code + 2;      /* first unused code value */
+  sinfo->sp = sinfo->symbol_stack;              /* init stack to empty */
+}
+
+
+LOCAL(void)
+InitLZWCode(gif_source_ptr sinfo)
+/* Initialize for a series of LZWReadByte (and hence GetCode) calls */
+{
+  /* GetCode initialization */
+  sinfo->last_byte = 2;         /* make safe to "recopy last two bytes" */
+  sinfo->code_buf[0] = 0;
+  sinfo->code_buf[1] = 0;
+  sinfo->last_bit = 0;          /* nothing in the buffer */
+  sinfo->cur_bit = 0;           /* force buffer load on first call */
+  sinfo->first_time = TRUE;
+  sinfo->out_of_blocks = FALSE;
+
+  /* LZWReadByte initialization: */
+  /* compute special code values (note that these do not change later) */
+  sinfo->clear_code = 1 << sinfo->input_code_size;
+  sinfo->end_code = sinfo->clear_code + 1;
+  ReInitLZW(sinfo);
+}
+
+
+LOCAL(int)
+GetCode(gif_source_ptr sinfo)
+/* Fetch the next code_size bits from the GIF data */
+/* We assume code_size is less than 16 */
+{
+  register int accum;
+  int offs, count;
+
+  while (sinfo->cur_bit + sinfo->code_size > sinfo->last_bit) {
+    /* Time to reload the buffer */
+    /* First time, share code with Clear case */
+    if (sinfo->first_time) {
+      sinfo->first_time = FALSE;
+      return sinfo->clear_code;
+    }
+    if (sinfo->out_of_blocks) {
+      WARNMS(sinfo->cinfo, JWRN_GIF_NOMOREDATA);
+      return sinfo->end_code;   /* fake something useful */
+    }
+    /* preserve last two bytes of what we have -- assume code_size <= 16 */
+    sinfo->code_buf[0] = sinfo->code_buf[sinfo->last_byte-2];
+    sinfo->code_buf[1] = sinfo->code_buf[sinfo->last_byte-1];
+    /* Load more bytes; set flag if we reach the terminator block */
+    if ((count = GetDataBlock(sinfo, &sinfo->code_buf[2])) == 0) {
+      sinfo->out_of_blocks = TRUE;
+      WARNMS(sinfo->cinfo, JWRN_GIF_NOMOREDATA);
+      return sinfo->end_code;   /* fake something useful */
+    }
+    /* Reset counters */
+    sinfo->cur_bit = (sinfo->cur_bit - sinfo->last_bit) + 16;
+    sinfo->last_byte = 2 + count;
+    sinfo->last_bit = sinfo->last_byte * 8;
+  }
+
+  /* Form up next 24 bits in accum */
+  offs = sinfo->cur_bit >> 3;   /* byte containing cur_bit */
+  accum = UCH(sinfo->code_buf[offs + 2]);
+  accum <<= 8;
+  accum |= UCH(sinfo->code_buf[offs + 1]);
+  accum <<= 8;
+  accum |= UCH(sinfo->code_buf[offs]);
+
+  /* Right-align cur_bit in accum, then mask off desired number of bits */
+  accum >>= (sinfo->cur_bit & 7);
+  sinfo->cur_bit += sinfo->code_size;
+  return accum & ((1 << sinfo->code_size) - 1);
+}
+
+
+LOCAL(int)
+LZWReadByte(gif_source_ptr sinfo)
+/* Read an LZW-compressed byte */
+{
+  register int code;            /* current working code */
+  int incode;                   /* saves actual input code */
+
+  /* If any codes are stacked from a previously read symbol, return them */
+  if (sinfo->sp > sinfo->symbol_stack)
+    return (int)(*(--sinfo->sp));
+
+  /* Time to read a new symbol */
+  code = GetCode(sinfo);
+
+  if (code == sinfo->clear_code) {
+    /* Reinit state, swallow any extra Clear codes, and */
+    /* return next code, which is expected to be a raw byte. */
+    ReInitLZW(sinfo);
+    do {
+      code = GetCode(sinfo);
+    } while (code == sinfo->clear_code);
+    if (code > sinfo->clear_code) { /* make sure it is a raw byte */
+      WARNMS(sinfo->cinfo, JWRN_GIF_BADDATA);
+      code = 0;                 /* use something valid */
+    }
+    /* make firstcode, oldcode valid! */
+    sinfo->firstcode = sinfo->oldcode = code;
+    return code;
+  }
+
+  if (code == sinfo->end_code) {
+    /* Skip the rest of the image, unless GetCode already read terminator */
+    if (!sinfo->out_of_blocks) {
+      SkipDataBlocks(sinfo);
+      sinfo->out_of_blocks = TRUE;
+    }
+    /* Complain that there's not enough data */
+    WARNMS(sinfo->cinfo, JWRN_GIF_ENDCODE);
+    /* Pad data with 0's */
+    return 0;                   /* fake something usable */
+  }
+
+  /* Got normal raw byte or LZW symbol */
+  incode = code;                /* save for a moment */
+
+  if (code >= sinfo->max_code) { /* special case for not-yet-defined symbol */
+    /* code == max_code is OK; anything bigger is bad data */
+    if (code > sinfo->max_code) {
+      WARNMS(sinfo->cinfo, JWRN_GIF_BADDATA);
+      incode = 0;               /* prevent creation of loops in symbol table */
+    }
+    /* this symbol will be defined as oldcode/firstcode */
+    *(sinfo->sp++) = (UINT8)sinfo->firstcode;
+    code = sinfo->oldcode;
+  }
+
+  /* If it's a symbol, expand it into the stack */
+  while (code >= sinfo->clear_code) {
+    *(sinfo->sp++) = sinfo->symbol_tail[code]; /* tail is a byte value */
+    code = sinfo->symbol_head[code]; /* head is another LZW symbol */
+  }
+  /* At this point code just represents a raw byte */
+  sinfo->firstcode = code;      /* save for possible future use */
+
+  /* If there's room in table... */
+  if ((code = sinfo->max_code) < LZW_TABLE_SIZE) {
+    /* Define a new symbol = prev sym + head of this sym's expansion */
+    sinfo->symbol_head[code] = (UINT16)sinfo->oldcode;
+    sinfo->symbol_tail[code] = (UINT8)sinfo->firstcode;
+    sinfo->max_code++;
+    /* Is it time to increase code_size? */
+    if (sinfo->max_code >= sinfo->limit_code &&
+        sinfo->code_size < MAX_LZW_BITS) {
+      sinfo->code_size++;
+      sinfo->limit_code <<= 1;  /* keep equal to 2^code_size */
+    }
+  }
+
+  sinfo->oldcode = incode;      /* save last input symbol for future use */
+  return sinfo->firstcode;      /* return first byte of symbol's expansion */
+}
+
+
+LOCAL(void)
+ReadColorMap(gif_source_ptr sinfo, int cmaplen, _JSAMPARRAY cmap)
+/* Read a GIF colormap */
+{
+  int i, gray = 1;
+
+  for (i = 0; i < cmaplen; i++) {
+#if BITS_IN_JSAMPLE == 8
+#define UPSCALE(x)  (x)
+#else
+#define UPSCALE(x)  ((x) << (BITS_IN_JSAMPLE - 8))
+#endif
+    cmap[CM_RED][i]   = (_JSAMPLE)UPSCALE(ReadByte(sinfo));
+    cmap[CM_GREEN][i] = (_JSAMPLE)UPSCALE(ReadByte(sinfo));
+    cmap[CM_BLUE][i]  = (_JSAMPLE)UPSCALE(ReadByte(sinfo));
+    if (cmap[CM_RED][i] != cmap[CM_GREEN][i] ||
+        cmap[CM_GREEN][i] != cmap[CM_BLUE][i])
+      gray = 0;
+  }
+
+  if (sinfo->cinfo->in_color_space == JCS_RGB && gray) {
+    sinfo->cinfo->in_color_space = JCS_GRAYSCALE;
+    sinfo->cinfo->input_components = 1;
+  }
+}
+
+
+LOCAL(void)
+DoExtension(gif_source_ptr sinfo)
+/* Process an extension block */
+/* Currently we ignore 'em all */
+{
+  int extlabel;
+
+  /* Read extension label byte */
+  extlabel = ReadByte(sinfo);
+  TRACEMS1(sinfo->cinfo, 1, JTRC_GIF_EXTENSION, extlabel);
+  /* Skip the data block(s) associated with the extension */
+  SkipDataBlocks(sinfo);
+}
+
+
+/*
+ * Read the file header; return image size and component count.
+ */
+
+METHODDEF(void)
+start_input_gif(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr)sinfo;
+  U_CHAR hdrbuf[10];            /* workspace for reading control blocks */
+  unsigned int width, height;   /* image dimensions */
+  int colormaplen, aspectRatio;
+  int c;
+
+  /* Read and verify GIF Header */
+  if (!ReadOK(source->pub.input_file, hdrbuf, 6))
+    ERREXIT(cinfo, JERR_GIF_NOT);
+  if (hdrbuf[0] != 'G' || hdrbuf[1] != 'I' || hdrbuf[2] != 'F')
+    ERREXIT(cinfo, JERR_GIF_NOT);
+  /* Check for expected version numbers.
+   * If unknown version, give warning and try to process anyway;
+   * this is per recommendation in GIF89a standard.
+   */
+  if ((hdrbuf[3] != '8' || hdrbuf[4] != '7' || hdrbuf[5] != 'a') &&
+      (hdrbuf[3] != '8' || hdrbuf[4] != '9' || hdrbuf[5] != 'a'))
+    TRACEMS3(cinfo, 1, JTRC_GIF_BADVERSION, hdrbuf[3], hdrbuf[4], hdrbuf[5]);
+
+  /* Read and decipher Logical Screen Descriptor */
+  if (!ReadOK(source->pub.input_file, hdrbuf, 7))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  width = LM_to_uint(hdrbuf, 0);
+  height = LM_to_uint(hdrbuf, 2);
+  if (width == 0 || height == 0)
+    ERREXIT(cinfo, JERR_GIF_EMPTY);
+  if (sinfo->max_pixels &&
+      (unsigned long long)width * height > sinfo->max_pixels)
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, sinfo->max_pixels);
+  /* we ignore the color resolution, sort flag, and background color index */
+  aspectRatio = UCH(hdrbuf[6]);
+  if (aspectRatio != 0 && aspectRatio != 49)
+    TRACEMS(cinfo, 1, JTRC_GIF_NONSQUARE);
+
+  /* Allocate space to store the colormap */
+  source->colormap = (_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)MAXCOLORMAPSIZE,
+     (JDIMENSION)NUMCOLORS);
+  colormaplen = 0;              /* indicate initialization */
+
+  /* Read global colormap if header indicates it is present */
+  if (BitSet(hdrbuf[4], COLORMAPFLAG)) {
+    colormaplen = 2 << (hdrbuf[4] & 0x07);
+    ReadColorMap(source, colormaplen, source->colormap);
+  }
+
+  /* Scan until we reach start of desired image.
+   * We don't currently support skipping images, but could add it easily.
+   */
+  for (;;) {
+    c = ReadByte(source);
+
+    if (c == ';')               /* GIF terminator?? */
+      ERREXIT(cinfo, JERR_GIF_IMAGENOTFOUND);
+
+    if (c == '!') {             /* Extension */
+      DoExtension(source);
+      continue;
+    }
+
+    if (c != ',') {             /* Not an image separator? */
+      WARNMS1(cinfo, JWRN_GIF_CHAR, c);
+      continue;
+    }
+
+    /* Read and decipher Local Image Descriptor */
+    if (!ReadOK(source->pub.input_file, hdrbuf, 9))
+      ERREXIT(cinfo, JERR_INPUT_EOF);
+    /* we ignore top/left position info, also sort flag */
+    width = LM_to_uint(hdrbuf, 4);
+    height = LM_to_uint(hdrbuf, 6);
+    if (width == 0 || height == 0)
+      ERREXIT(cinfo, JERR_GIF_EMPTY);
+    if (sinfo->max_pixels &&
+        (unsigned long long)width * height > sinfo->max_pixels)
+      ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, sinfo->max_pixels);
+    source->is_interlaced = (BitSet(hdrbuf[8], INTERLACE) != 0);
+
+    /* Read local colormap if header indicates it is present */
+    /* Note: if we wanted to support skipping images, */
+    /* we'd need to skip rather than read colormap for ignored images */
+    if (BitSet(hdrbuf[8], COLORMAPFLAG)) {
+      colormaplen = 2 << (hdrbuf[8] & 0x07);
+      ReadColorMap(source, colormaplen, source->colormap);
+    }
+
+    source->input_code_size = ReadByte(source); /* get min-code-size byte */
+    if (source->input_code_size < 2 || source->input_code_size > 8)
+      ERREXIT1(cinfo, JERR_GIF_CODESIZE, source->input_code_size);
+
+    /* Reached desired image, so break out of loop */
+    /* If we wanted to skip this image, */
+    /* we'd call SkipDataBlocks and then continue the loop */
+    break;
+  }
+
+  /* Prepare to read selected image: first initialize LZW decompressor */
+  source->symbol_head = (UINT16 *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                LZW_TABLE_SIZE * sizeof(UINT16));
+  source->symbol_tail = (UINT8 *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                LZW_TABLE_SIZE * sizeof(UINT8));
+  source->symbol_stack = (UINT8 *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                LZW_TABLE_SIZE * sizeof(UINT8));
+  InitLZWCode(source);
+
+  /*
+   * If image is interlaced, we read it into a full-size sample array,
+   * decompressing as we go; then get_interlaced_row selects rows from the
+   * sample array in the proper order.
+   */
+  if (source->is_interlaced) {
+    /* We request the virtual array now, but can't access it until virtual
+     * arrays have been allocated.  Hence, the actual work of reading the
+     * image is postponed until the first call to get_pixel_rows.
+     */
+    source->interlaced_image = (*cinfo->mem->request_virt_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+       (JDIMENSION)width, (JDIMENSION)height, (JDIMENSION)1);
+    if (cinfo->progress != NULL) {
+      cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
+      progress->total_extra_passes++; /* count file input as separate pass */
+    }
+    source->pub.get_pixel_rows = load_interlaced_image;
+  } else {
+    source->pub.get_pixel_rows = get_pixel_rows;
+  }
+
+  if (cinfo->in_color_space != JCS_GRAYSCALE) {
+    cinfo->in_color_space = JCS_RGB;
+    cinfo->input_components = NUMCOLORS;
+  }
+
+  /* Create compressor input buffer. */
+  source->pub._buffer = (_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     (JDIMENSION)width * cinfo->input_components, (JDIMENSION)1);
+  source->pub.buffer_height = 1;
+
+  /* Pad colormap for safety. */
+  for (c = colormaplen; c < source->clear_code; c++) {
+    source->colormap[CM_RED][c]   =
+    source->colormap[CM_GREEN][c] =
+    source->colormap[CM_BLUE][c]  = _CENTERJSAMPLE;
+  }
+
+  /* Return info about the image. */
+  cinfo->data_precision = BITS_IN_JSAMPLE; /* we always rescale data to this */
+  cinfo->image_width = width;
+  cinfo->image_height = height;
+
+  TRACEMS3(cinfo, 1, JTRC_GIF, width, height, colormaplen);
+}
+
+
+/*
+ * Read one row of pixels.
+ * This version is used for noninterlaced GIF images:
+ * we read directly from the GIF file.
+ */
+
+METHODDEF(JDIMENSION)
+get_pixel_rows(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr)sinfo;
+  register int c;
+  register _JSAMPROW ptr;
+  register JDIMENSION col;
+  register _JSAMPARRAY colormap = source->colormap;
+
+  ptr = source->pub._buffer[0];
+  if (cinfo->in_color_space == JCS_GRAYSCALE) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      c = LZWReadByte(source);
+      *ptr++ = colormap[CM_RED][c];
+    }
+  } else {
+    for (col = cinfo->image_width; col > 0; col--) {
+      c = LZWReadByte(source);
+      *ptr++ = colormap[CM_RED][c];
+      *ptr++ = colormap[CM_GREEN][c];
+      *ptr++ = colormap[CM_BLUE][c];
+    }
+  }
+  return 1;
+}
+
+
+/*
+ * Read one row of pixels.
+ * This version is used for the first call on get_pixel_rows when
+ * reading an interlaced GIF file: we read the whole image into memory.
+ */
+
+METHODDEF(JDIMENSION)
+load_interlaced_image(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr)sinfo;
+  register _JSAMPROW sptr;
+  register JDIMENSION col;
+  JDIMENSION row;
+  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
+
+  /* Read the interlaced image into the virtual array we've created. */
+  for (row = 0; row < cinfo->image_height; row++) {
+    if (progress != NULL) {
+      progress->pub.pass_counter = (long)row;
+      progress->pub.pass_limit = (long)cinfo->image_height;
+      (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
+    }
+    sptr = *(_JSAMPARRAY)(*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, source->interlaced_image, row, (JDIMENSION)1,
+       TRUE);
+    for (col = cinfo->image_width; col > 0; col--) {
+      *sptr++ = (_JSAMPLE)LZWReadByte(source);
+    }
+  }
+  if (progress != NULL)
+    progress->completed_extra_passes++;
+
+  /* Replace method pointer so subsequent calls don't come here. */
+  source->pub.get_pixel_rows = get_interlaced_row;
+  /* Initialize for get_interlaced_row, and perform first call on it. */
+  source->cur_row_number = 0;
+  source->pass2_offset = (cinfo->image_height + 7) / 8;
+  source->pass3_offset = source->pass2_offset + (cinfo->image_height + 3) / 8;
+  source->pass4_offset = source->pass3_offset + (cinfo->image_height + 1) / 4;
+
+  return get_interlaced_row(cinfo, sinfo);
+}
+
+
+/*
+ * Read one row of pixels.
+ * This version is used for interlaced GIF images:
+ * we read from the virtual array.
+ */
+
+METHODDEF(JDIMENSION)
+get_interlaced_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr)sinfo;
+  register int c;
+  register _JSAMPROW sptr, ptr;
+  register JDIMENSION col;
+  register _JSAMPARRAY colormap = source->colormap;
+  JDIMENSION irow;
+
+  /* Figure out which row of interlaced image is needed, and access it. */
+  switch ((int)(source->cur_row_number & 7)) {
+  case 0:                       /* first-pass row */
+    irow = source->cur_row_number >> 3;
+    break;
+  case 4:                       /* second-pass row */
+    irow = (source->cur_row_number >> 3) + source->pass2_offset;
+    break;
+  case 2:                       /* third-pass row */
+  case 6:
+    irow = (source->cur_row_number >> 2) + source->pass3_offset;
+    break;
+  default:                      /* fourth-pass row */
+    irow = (source->cur_row_number >> 1) + source->pass4_offset;
+  }
+  sptr = *(_JSAMPARRAY)(*cinfo->mem->access_virt_sarray)
+    ((j_common_ptr)cinfo, source->interlaced_image, irow, (JDIMENSION)1,
+     FALSE);
+  /* Scan the row, expand colormap, and output */
+  ptr = source->pub._buffer[0];
+  if (cinfo->in_color_space == JCS_GRAYSCALE) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      c = *sptr++;
+      *ptr++ = colormap[CM_RED][c];
+    }
+  } else {
+    for (col = cinfo->image_width; col > 0; col--) {
+      c = *sptr++;
+      *ptr++ = colormap[CM_RED][c];
+      *ptr++ = colormap[CM_GREEN][c];
+      *ptr++ = colormap[CM_BLUE][c];
+    }
+  }
+  source->cur_row_number++;     /* for next time */
+  return 1;
+}
+
+
+/*
+ * Finish up at the end of the file.
+ */
+
+METHODDEF(void)
+finish_input_gif(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  /* no work */
+}
+
+
+/*
+ * The module selection routine for GIF format input.
+ */
+
+GLOBAL(cjpeg_source_ptr)
+_jinit_read_gif(j_compress_ptr cinfo)
+{
+  gif_source_ptr source;
+
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
+  /* Create module interface object */
+  source = (gif_source_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(gif_source_struct));
+  source->cinfo = cinfo;        /* make back link for subroutines */
+  /* Fill in method ptrs, except get_pixel_rows which start_input sets */
+  source->pub.start_input = start_input_gif;
+  source->pub.finish_input = finish_input_gif;
+  source->pub.max_pixels = 0;
+
+  return (cjpeg_source_ptr)source;
+}
+
+#endif /* defined(GIF_SUPPORTED) &&
+          (BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED)) */
diff --git a/3rdparty/libjpeg-turbo/src/rdjpgcom.c b/3rdparty/libjpeg-turbo/src/rdjpgcom.c
new file mode 100644
index 000000000000..d9a6f85a38c9
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/rdjpgcom.c
@@ -0,0 +1,493 @@
+/*
+ * rdjpgcom.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 2009 by Bill Allombert, Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains a very simple stand-alone application that displays
+ * the text in COM (comment) markers in a JFIF file.
+ * This may be useful as an example of the minimum logic needed to parse
+ * JPEG markers.
+ */
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+
+#define JPEG_CJPEG_DJPEG        /* to get the command-line config symbols */
+#include "jinclude.h"           /* get auto-config symbols, <stdio.h> */
+
+#include <locale.h>             /* Bill Allombert: use locale for isprint */
+#include <ctype.h>              /* to declare isupper(), tolower() */
+#ifdef USE_SETMODE
+#include <fcntl.h>              /* to declare setmode()'s parameter macros */
+/* If you have setmode() but not <io.h>, just delete this line: */
+#include <io.h>                 /* to declare setmode() */
+#endif
+
+#ifdef DONT_USE_B_MODE          /* define mode parameters for fopen() */
+#define READ_BINARY     "r"
+#else
+#define READ_BINARY     "rb"
+#endif
+
+#ifndef EXIT_FAILURE            /* define exit() codes if not provided */
+#define EXIT_FAILURE  1
+#endif
+#ifndef EXIT_SUCCESS
+#define EXIT_SUCCESS  0
+#endif
+
+
+/*
+ * These macros are used to read the input file.
+ * To reuse this code in another application, you might need to change these.
+ */
+
+static FILE *infile;            /* input JPEG file */
+
+/* Return next input byte, or EOF if no more */
+#define NEXTBYTE()  getc(infile)
+
+
+/* Error exit handler */
+#define ERREXIT(msg)  (fprintf(stderr, "%s\n", msg), exit(EXIT_FAILURE))
+
+
+/* Read one byte, testing for EOF */
+static int
+read_1_byte(void)
+{
+  int c;
+
+  c = NEXTBYTE();
+  if (c == EOF)
+    ERREXIT("Premature EOF in JPEG file");
+  return c;
+}
+
+/* Read 2 bytes, convert to unsigned int */
+/* All 2-byte quantities in JPEG markers are MSB first */
+static unsigned int
+read_2_bytes(void)
+{
+  int c1, c2;
+
+  c1 = NEXTBYTE();
+  if (c1 == EOF)
+    ERREXIT("Premature EOF in JPEG file");
+  c2 = NEXTBYTE();
+  if (c2 == EOF)
+    ERREXIT("Premature EOF in JPEG file");
+  return (((unsigned int)c1) << 8) + ((unsigned int)c2);
+}
+
+
+/*
+ * JPEG markers consist of one or more 0xFF bytes, followed by a marker
+ * code byte (which is not an FF).  Here are the marker codes of interest
+ * in this program.  (See jdmarker.c for a more complete list.)
+ */
+
+#define M_SOF0   0xC0           /* Start Of Frame N */
+#define M_SOF1   0xC1           /* N indicates which compression process */
+#define M_SOF2   0xC2           /* Only SOF0-SOF2 are now in common use */
+#define M_SOF3   0xC3
+#define M_SOF5   0xC5           /* NB: codes C4 and CC are NOT SOF markers */
+#define M_SOF6   0xC6
+#define M_SOF7   0xC7
+#define M_SOF9   0xC9
+#define M_SOF10  0xCA
+#define M_SOF11  0xCB
+#define M_SOF13  0xCD
+#define M_SOF14  0xCE
+#define M_SOF15  0xCF
+#define M_SOI    0xD8           /* Start Of Image (beginning of datastream) */
+#define M_EOI    0xD9           /* End Of Image (end of datastream) */
+#define M_SOS    0xDA           /* Start Of Scan (begins compressed data) */
+#define M_APP12  0xEC           /* (we don't bother to list all 16 APPn's) */
+#define M_COM    0xFE           /* COMment */
+
+
+/*
+ * Find the next JPEG marker and return its marker code.
+ * We expect at least one FF byte, possibly more if the compressor used FFs
+ * to pad the file.
+ * There could also be non-FF garbage between markers.  The treatment of such
+ * garbage is unspecified; we choose to skip over it but emit a warning msg.
+ * NB: this routine must not be used after seeing SOS marker, since it will
+ * not deal correctly with FF/00 sequences in the compressed image data...
+ */
+
+static int
+next_marker(void)
+{
+  int c;
+  int discarded_bytes = 0;
+
+  /* Find 0xFF byte; count and skip any non-FFs. */
+  c = read_1_byte();
+  while (c != 0xFF) {
+    discarded_bytes++;
+    c = read_1_byte();
+  }
+  /* Get marker code byte, swallowing any duplicate FF bytes.  Extra FFs
+   * are legal as pad bytes, so don't count them in discarded_bytes.
+   */
+  do {
+    c = read_1_byte();
+  } while (c == 0xFF);
+
+  if (discarded_bytes != 0) {
+    fprintf(stderr, "Warning: garbage data found in JPEG file\n");
+  }
+
+  return c;
+}
+
+
+/*
+ * Read the initial marker, which should be SOI.
+ * For a JFIF file, the first two bytes of the file should be literally
+ * 0xFF M_SOI.  To be more general, we could use next_marker, but if the
+ * input file weren't actually JPEG at all, next_marker might read the whole
+ * file and then return a misleading error message...
+ */
+
+static int
+first_marker(void)
+{
+  int c1, c2;
+
+  c1 = NEXTBYTE();
+  c2 = NEXTBYTE();
+  if (c1 != 0xFF || c2 != M_SOI)
+    ERREXIT("Not a JPEG file");
+  return c2;
+}
+
+
+/*
+ * Most types of marker are followed by a variable-length parameter segment.
+ * This routine skips over the parameters for any marker we don't otherwise
+ * want to process.
+ * Note that we MUST skip the parameter segment explicitly in order not to
+ * be fooled by 0xFF bytes that might appear within the parameter segment;
+ * such bytes do NOT introduce new markers.
+ */
+
+static void
+skip_variable(void)
+/* Skip over an unknown or uninteresting variable-length marker */
+{
+  unsigned int length;
+
+  /* Get the marker parameter length count */
+  length = read_2_bytes();
+  /* Length includes itself, so must be at least 2 */
+  if (length < 2)
+    ERREXIT("Erroneous JPEG marker length");
+  length -= 2;
+  /* Skip over the remaining bytes */
+  while (length > 0) {
+    (void)read_1_byte();
+    length--;
+  }
+}
+
+
+/*
+ * Process a COM marker.
+ * We want to print out the marker contents as legible text;
+ * we must guard against non-text junk and varying newline representations.
+ */
+
+static void
+process_COM(int raw)
+{
+  unsigned int length;
+  int ch;
+  int lastch = 0;
+
+  /* Bill Allombert: set locale properly for isprint */
+  setlocale(LC_CTYPE, "");
+
+  /* Get the marker parameter length count */
+  length = read_2_bytes();
+  /* Length includes itself, so must be at least 2 */
+  if (length < 2)
+    ERREXIT("Erroneous JPEG marker length");
+  length -= 2;
+
+  while (length > 0) {
+    ch = read_1_byte();
+    if (raw) {
+      putc(ch, stdout);
+    /* Emit the character in a readable form.
+     * Nonprintables are converted to \nnn form,
+     * while \ is converted to \\.
+     * Newlines in CR, CR/LF, or LF form will be printed as one newline.
+     */
+    } else if (ch == '\r') {
+      printf("\n");
+    } else if (ch == '\n') {
+      if (lastch != '\r')
+        printf("\n");
+    } else if (ch == '\\') {
+      printf("\\\\");
+    } else if (isprint(ch)) {
+      putc(ch, stdout);
+    } else {
+      printf("\\%03o", (unsigned int)ch);
+    }
+    lastch = ch;
+    length--;
+  }
+  printf("\n");
+
+  /* Bill Allombert: revert to C locale */
+  setlocale(LC_CTYPE, "C");
+}
+
+
+/*
+ * Process a SOFn marker.
+ * This code is only needed if you want to know the image dimensions...
+ */
+
+static void
+process_SOFn(int marker)
+{
+  unsigned int length;
+  unsigned int image_height, image_width;
+  int data_precision, num_components;
+  const char *process;
+  int ci;
+
+  length = read_2_bytes();      /* usual parameter length count */
+
+  data_precision = read_1_byte();
+  image_height = read_2_bytes();
+  image_width = read_2_bytes();
+  num_components = read_1_byte();
+
+  switch (marker) {
+  case M_SOF0:  process = "Baseline";  break;
+  case M_SOF1:  process = "Extended sequential";  break;
+  case M_SOF2:  process = "Progressive";  break;
+  case M_SOF3:  process = "Lossless";  break;
+  case M_SOF5:  process = "Differential sequential";  break;
+  case M_SOF6:  process = "Differential progressive";  break;
+  case M_SOF7:  process = "Differential lossless";  break;
+  case M_SOF9:  process = "Extended sequential, arithmetic coding";  break;
+  case M_SOF10: process = "Progressive, arithmetic coding";  break;
+  case M_SOF11: process = "Lossless, arithmetic coding";  break;
+  case M_SOF13: process = "Differential sequential, arithmetic coding";  break;
+  case M_SOF14:
+    process = "Differential progressive, arithmetic coding";  break;
+  case M_SOF15: process = "Differential lossless, arithmetic coding";  break;
+  default:      process = "Unknown";  break;
+  }
+
+  printf("JPEG image is %uw * %uh, %d color components, %d bits per sample\n",
+         image_width, image_height, num_components, data_precision);
+  printf("JPEG process: %s\n", process);
+
+  if (length != (unsigned int)(8 + num_components * 3))
+    ERREXIT("Bogus SOF marker length");
+
+  for (ci = 0; ci < num_components; ci++) {
+    (void)read_1_byte();        /* Component ID code */
+    (void)read_1_byte();        /* H, V sampling factors */
+    (void)read_1_byte();        /* Quantization table number */
+  }
+}
+
+
+/*
+ * Parse the marker stream until SOS or EOI is seen;
+ * display any COM markers.
+ * While the companion program wrjpgcom will always insert COM markers before
+ * SOFn, other implementations might not, so we scan to SOS before stopping.
+ * If we were only interested in the image dimensions, we would stop at SOFn.
+ * (Conversely, if we only cared about COM markers, there would be no need
+ * for special code to handle SOFn; we could treat it like other markers.)
+ */
+
+static int
+scan_JPEG_header(int verbose, int raw)
+{
+  int marker;
+
+  /* Expect SOI at start of file */
+  if (first_marker() != M_SOI)
+    ERREXIT("Expected SOI marker first");
+
+  /* Scan miscellaneous markers until we reach SOS. */
+  for (;;) {
+    marker = next_marker();
+    switch (marker) {
+      /* Note that marker codes 0xC4, 0xC8, 0xCC are not, and must not be,
+       * treated as SOFn.  C4 in particular is actually DHT.
+       */
+    case M_SOF0:                /* Baseline */
+    case M_SOF1:                /* Extended sequential, Huffman */
+    case M_SOF2:                /* Progressive, Huffman */
+    case M_SOF3:                /* Lossless, Huffman */
+    case M_SOF5:                /* Differential sequential, Huffman */
+    case M_SOF6:                /* Differential progressive, Huffman */
+    case M_SOF7:                /* Differential lossless, Huffman */
+    case M_SOF9:                /* Extended sequential, arithmetic */
+    case M_SOF10:               /* Progressive, arithmetic */
+    case M_SOF11:               /* Lossless, arithmetic */
+    case M_SOF13:               /* Differential sequential, arithmetic */
+    case M_SOF14:               /* Differential progressive, arithmetic */
+    case M_SOF15:               /* Differential lossless, arithmetic */
+      if (verbose)
+        process_SOFn(marker);
+      else
+        skip_variable();
+      break;
+
+    case M_SOS:                 /* stop before hitting compressed data */
+      return marker;
+
+    case M_EOI:                 /* in case it's a tables-only JPEG stream */
+      return marker;
+
+    case M_COM:
+      process_COM(raw);
+      break;
+
+    case M_APP12:
+      /* Some digital camera makers put useful textual information into
+       * APP12 markers, so we print those out too when in -verbose mode.
+       */
+      if (verbose) {
+        printf("APP12 contains:\n");
+        process_COM(raw);
+      } else
+        skip_variable();
+      break;
+
+    default:                    /* Anything else just gets skipped */
+      skip_variable();          /* we assume it has a parameter count... */
+      break;
+    }
+  } /* end loop */
+}
+
+
+/* Command line parsing code */
+
+static const char *progname;    /* program name for error messages */
+
+
+static void
+usage(void)
+/* complain about bad command line */
+{
+  fprintf(stderr, "rdjpgcom displays any textual comments in a JPEG file.\n");
+
+  fprintf(stderr, "Usage: %s [switches] [inputfile]\n", progname);
+
+  fprintf(stderr, "Switches (names may be abbreviated):\n");
+  fprintf(stderr, "  -raw        Display non-printable characters in comments (unsafe)\n");
+  fprintf(stderr, "  -verbose    Also display dimensions of JPEG image\n");
+
+  exit(EXIT_FAILURE);
+}
+
+
+static int
+keymatch(char *arg, const char *keyword, int minchars)
+/* Case-insensitive matching of (possibly abbreviated) keyword switches. */
+/* keyword is the constant keyword (must be lower case already), */
+/* minchars is length of minimum legal abbreviation. */
+{
+  register int ca, ck;
+  register int nmatched = 0;
+
+  while ((ca = *arg++) != '\0') {
+    if ((ck = *keyword++) == '\0')
+      return 0;                 /* arg longer than keyword, no good */
+    if (isupper(ca))            /* force arg to lcase (assume ck is already) */
+      ca = tolower(ca);
+    if (ca != ck)
+      return 0;                 /* no good */
+    nmatched++;                 /* count matched characters */
+  }
+  /* reached end of argument; fail if it's too short for unique abbrev */
+  if (nmatched < minchars)
+    return 0;
+  return 1;                     /* A-OK */
+}
+
+
+/*
+ * The main program.
+ */
+
+int
+main(int argc, char **argv)
+{
+  int argn;
+  char *arg;
+  int verbose = 0, raw = 0;
+
+  progname = argv[0];
+  if (progname == NULL || progname[0] == 0)
+    progname = "rdjpgcom";      /* in case C library doesn't provide it */
+
+  /* Parse switches, if any */
+  for (argn = 1; argn < argc; argn++) {
+    arg = argv[argn];
+    if (arg[0] != '-')
+      break;                    /* not switch, must be file name */
+    arg++;                      /* advance over '-' */
+    if (keymatch(arg, "verbose", 1)) {
+      verbose++;
+    } else if (keymatch(arg, "raw", 1)) {
+      raw = 1;
+    } else
+      usage();
+  }
+
+  /* Open the input file. */
+  /* Unix style: expect zero or one file name */
+  if (argn < argc - 1) {
+    fprintf(stderr, "%s: only one input file\n", progname);
+    usage();
+  }
+  if (argn < argc) {
+    if ((infile = fopen(argv[argn], READ_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    /* default input file is stdin */
+#ifdef USE_SETMODE              /* need to hack file mode? */
+    setmode(fileno(stdin), O_BINARY);
+#endif
+#ifdef USE_FDOPEN               /* need to re-open in binary mode? */
+    if ((infile = fdopen(fileno(stdin), READ_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open stdin\n", progname);
+      exit(EXIT_FAILURE);
+    }
+#else
+    infile = stdin;
+#endif
+  }
+
+  /* Scan the JPEG headers. */
+  (void)scan_JPEG_header(verbose, raw);
+
+  /* All done. */
+  exit(EXIT_SUCCESS);
+  return 0;                     /* suppress no-return-value warnings */
+}
diff --git a/3rdparty/libjpeg-turbo/src/rdppm.c b/3rdparty/libjpeg-turbo/src/rdppm.c
new file mode 100644
index 000000000000..84e26f7b3f56
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/rdppm.c
@@ -0,0 +1,890 @@
+/*
+ * rdppm.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2009 by Bill Allombert, Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2015-2017, 2020-2023, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains routines to read input images in PPM/PGM format.
+ * The extended 2-byte-per-sample raw PPM/PGM formats are supported.
+ * The PBMPLUS library is NOT required to compile this software
+ * (but it is highly useful as a set of PPM image manipulation programs).
+ *
+ * These routines may need modification for non-Unix environments or
+ * specialized applications.  As they stand, they assume input from
+ * an ordinary stdio stream.  They further assume that reading begins
+ * at the start of the file; start_input may need work if the
+ * user interface has already read some data (e.g., to determine that
+ * the file is indeed PPM format).
+ */
+
+#include "cmyk.h"
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+
+#if defined(PPM_SUPPORTED) && \
+    (BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED))
+
+
+/* Portions of this code are based on the PBMPLUS library, which is:
+**
+** Copyright (C) 1988 by Jef Poskanzer.
+**
+** Permission to use, copy, modify, and distribute this software and its
+** documentation for any purpose and without fee is hereby granted, provided
+** that the above copyright notice appear in all copies and that both that
+** copyright notice and this permission notice appear in supporting
+** documentation.  This software is provided "as is" without express or
+** implied warranty.
+*/
+
+
+/* Macros to deal with unsigned chars as efficiently as compiler allows */
+
+typedef unsigned char U_CHAR;
+#define UCH(x)  ((int)(x))
+
+
+#define ReadOK(file, buffer, len) \
+  (fread(buffer, 1, len, file) == ((size_t)(len)))
+
+static int alpha_index[JPEG_NUMCS] = {
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1
+};
+
+
+/* Private version of data source object */
+
+typedef struct {
+  struct cjpeg_source_struct pub; /* public fields */
+
+  /* Usually these two pointers point to the same place: */
+  U_CHAR *iobuffer;             /* fread's I/O buffer */
+  _JSAMPROW pixrow;             /* compressor input buffer */
+  size_t buffer_width;          /* width of I/O buffer */
+  _JSAMPLE *rescale;            /* => maxval-remapping array, or NULL */
+  unsigned int maxval;
+} ppm_source_struct;
+
+typedef ppm_source_struct *ppm_source_ptr;
+
+
+LOCAL(int)
+pbm_getc(FILE *infile)
+/* Read next char, skipping over any comments */
+/* A comment/newline sequence is returned as a newline */
+{
+  register int ch;
+
+  ch = getc(infile);
+  if (ch == '#') {
+    do {
+      ch = getc(infile);
+    } while (ch != '\n' && ch != EOF);
+  }
+  return ch;
+}
+
+
+LOCAL(unsigned int)
+read_pbm_integer(j_compress_ptr cinfo, FILE *infile, unsigned int maxval)
+/* Read an unsigned decimal integer from the PPM file */
+/* Swallows one trailing character after the integer */
+/* Note that on a 16-bit-int machine, only values up to 64k can be read. */
+/* This should not be a problem in practice. */
+{
+  register int ch;
+  register unsigned int val;
+
+  /* Skip any leading whitespace */
+  do {
+    ch = pbm_getc(infile);
+    if (ch == EOF)
+      ERREXIT(cinfo, JERR_INPUT_EOF);
+  } while (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
+
+  if (ch < '0' || ch > '9')
+    ERREXIT(cinfo, JERR_PPM_NONNUMERIC);
+
+  val = ch - '0';
+  while ((ch = pbm_getc(infile)) >= '0' && ch <= '9') {
+    val *= 10;
+    val += ch - '0';
+    if (val > maxval)
+      ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
+  }
+
+  return val;
+}
+
+
+/*
+ * Read one row of pixels.
+ *
+ * We provide several different versions depending on input file format.
+ * In all cases, input is scaled to the size of _JSAMPLE.
+ *
+ * A really fast path is provided for reading byte/sample raw files with
+ * maxval = _MAXJSAMPLE, which is the normal case for 8-bit data.
+ */
+
+
+METHODDEF(JDIMENSION)
+get_text_gray_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading text-format PGM files with any maxval */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  FILE *infile = source->pub.input_file;
+  register _JSAMPROW ptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+
+  ptr = source->pub._buffer[0];
+  for (col = cinfo->image_width; col > 0; col--) {
+    *ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
+  }
+  return 1;
+}
+
+
+#define GRAY_RGB_READ_LOOP(read_op, alpha_set_op) { \
+  for (col = cinfo->image_width; col > 0; col--) { \
+    ptr[rindex] = ptr[gindex] = ptr[bindex] = read_op; \
+    alpha_set_op \
+    ptr += ps; \
+  } \
+}
+
+METHODDEF(JDIMENSION)
+get_text_gray_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading text-format PGM files with any maxval and
+   converting to extended RGB */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  FILE *infile = source->pub.input_file;
+  register _JSAMPROW ptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+  register int rindex = rgb_red[cinfo->in_color_space];
+  register int gindex = rgb_green[cinfo->in_color_space];
+  register int bindex = rgb_blue[cinfo->in_color_space];
+  register int aindex = alpha_index[cinfo->in_color_space];
+  register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+  ptr = source->pub._buffer[0];
+  if (maxval == _MAXJSAMPLE) {
+    if (aindex >= 0)
+      GRAY_RGB_READ_LOOP((_JSAMPLE)read_pbm_integer(cinfo, infile, maxval),
+                         ptr[aindex] = _MAXJSAMPLE;)
+    else
+      GRAY_RGB_READ_LOOP((_JSAMPLE)read_pbm_integer(cinfo, infile, maxval), {})
+  } else {
+    if (aindex >= 0)
+      GRAY_RGB_READ_LOOP(rescale[read_pbm_integer(cinfo, infile, maxval)],
+                         ptr[aindex] = _MAXJSAMPLE;)
+    else
+      GRAY_RGB_READ_LOOP(rescale[read_pbm_integer(cinfo, infile, maxval)], {})
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_text_gray_cmyk_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading text-format PGM files with any maxval and
+   converting to CMYK */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  FILE *infile = source->pub.input_file;
+  register _JSAMPROW ptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+
+  ptr = source->pub._buffer[0];
+  if (maxval == _MAXJSAMPLE) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      _JSAMPLE gray = (_JSAMPLE)read_pbm_integer(cinfo, infile, maxval);
+      rgb_to_cmyk(gray, gray, gray, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  } else {
+    for (col = cinfo->image_width; col > 0; col--) {
+      _JSAMPLE gray = rescale[read_pbm_integer(cinfo, infile, maxval)];
+      rgb_to_cmyk(gray, gray, gray, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  }
+  return 1;
+}
+
+
+#define RGB_READ_LOOP(read_op, alpha_set_op) { \
+  for (col = cinfo->image_width; col > 0; col--) { \
+    ptr[rindex] = read_op; \
+    ptr[gindex] = read_op; \
+    ptr[bindex] = read_op; \
+    alpha_set_op \
+    ptr += ps; \
+  } \
+}
+
+METHODDEF(JDIMENSION)
+get_text_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading text-format PPM files with any maxval */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  FILE *infile = source->pub.input_file;
+  register _JSAMPROW ptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+  register int rindex = rgb_red[cinfo->in_color_space];
+  register int gindex = rgb_green[cinfo->in_color_space];
+  register int bindex = rgb_blue[cinfo->in_color_space];
+  register int aindex = alpha_index[cinfo->in_color_space];
+  register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+  ptr = source->pub._buffer[0];
+  if (maxval == _MAXJSAMPLE) {
+    if (aindex >= 0)
+      RGB_READ_LOOP((_JSAMPLE)read_pbm_integer(cinfo, infile, maxval),
+                    ptr[aindex] = _MAXJSAMPLE;)
+    else
+      RGB_READ_LOOP((_JSAMPLE)read_pbm_integer(cinfo, infile, maxval), {})
+  } else {
+    if (aindex >= 0)
+      RGB_READ_LOOP(rescale[read_pbm_integer(cinfo, infile, maxval)],
+                    ptr[aindex] = _MAXJSAMPLE;)
+    else
+      RGB_READ_LOOP(rescale[read_pbm_integer(cinfo, infile, maxval)], {})
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_text_rgb_cmyk_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading text-format PPM files with any maxval and
+   converting to CMYK */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  FILE *infile = source->pub.input_file;
+  register _JSAMPROW ptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+
+  ptr = source->pub._buffer[0];
+  if (maxval == _MAXJSAMPLE) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      _JSAMPLE r = (_JSAMPLE)read_pbm_integer(cinfo, infile, maxval);
+      _JSAMPLE g = (_JSAMPLE)read_pbm_integer(cinfo, infile, maxval);
+      _JSAMPLE b = (_JSAMPLE)read_pbm_integer(cinfo, infile, maxval);
+      rgb_to_cmyk(r, g, b, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  } else {
+    for (col = cinfo->image_width; col > 0; col--) {
+      _JSAMPLE r = rescale[read_pbm_integer(cinfo, infile, maxval)];
+      _JSAMPLE g = rescale[read_pbm_integer(cinfo, infile, maxval)];
+      _JSAMPLE b = rescale[read_pbm_integer(cinfo, infile, maxval)];
+      rgb_to_cmyk(r, g, b, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_scaled_gray_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-byte-format PGM files with any maxval */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register _JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub._buffer[0];
+  bufferptr = source->iobuffer;
+  for (col = cinfo->image_width; col > 0; col--) {
+    *ptr++ = rescale[UCH(*bufferptr++)];
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_gray_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-byte-format PGM files with any maxval
+   and converting to extended RGB */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register _JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+  register int rindex = rgb_red[cinfo->in_color_space];
+  register int gindex = rgb_green[cinfo->in_color_space];
+  register int bindex = rgb_blue[cinfo->in_color_space];
+  register int aindex = alpha_index[cinfo->in_color_space];
+  register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub._buffer[0];
+  bufferptr = source->iobuffer;
+  if (maxval == _MAXJSAMPLE) {
+    if (aindex >= 0)
+      GRAY_RGB_READ_LOOP(*bufferptr++, ptr[aindex] = _MAXJSAMPLE;)
+    else
+      GRAY_RGB_READ_LOOP(*bufferptr++, {})
+  } else {
+    if (aindex >= 0)
+      GRAY_RGB_READ_LOOP(rescale[UCH(*bufferptr++)],
+                         ptr[aindex] = _MAXJSAMPLE;)
+    else
+      GRAY_RGB_READ_LOOP(rescale[UCH(*bufferptr++)], {})
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_gray_cmyk_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-byte-format PGM files with any maxval
+   and converting to CMYK */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register _JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub._buffer[0];
+  bufferptr = source->iobuffer;
+  if (maxval == _MAXJSAMPLE) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      _JSAMPLE gray = *bufferptr++;
+      rgb_to_cmyk(gray, gray, gray, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  } else {
+    for (col = cinfo->image_width; col > 0; col--) {
+      _JSAMPLE gray = rescale[UCH(*bufferptr++)];
+      rgb_to_cmyk(gray, gray, gray, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-byte-format PPM files with any maxval */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register _JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+  register int rindex = rgb_red[cinfo->in_color_space];
+  register int gindex = rgb_green[cinfo->in_color_space];
+  register int bindex = rgb_blue[cinfo->in_color_space];
+  register int aindex = alpha_index[cinfo->in_color_space];
+  register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub._buffer[0];
+  bufferptr = source->iobuffer;
+  if (maxval == _MAXJSAMPLE) {
+    if (aindex >= 0)
+      RGB_READ_LOOP(*bufferptr++, ptr[aindex] = _MAXJSAMPLE;)
+    else
+      RGB_READ_LOOP(*bufferptr++, {})
+  } else {
+    if (aindex >= 0)
+      RGB_READ_LOOP(rescale[UCH(*bufferptr++)], ptr[aindex] = _MAXJSAMPLE;)
+    else
+      RGB_READ_LOOP(rescale[UCH(*bufferptr++)], {})
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_rgb_cmyk_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-byte-format PPM files with any maxval and
+   converting to CMYK */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register _JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub._buffer[0];
+  bufferptr = source->iobuffer;
+  if (maxval == _MAXJSAMPLE) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      _JSAMPLE r = *bufferptr++;
+      _JSAMPLE g = *bufferptr++;
+      _JSAMPLE b = *bufferptr++;
+      rgb_to_cmyk(r, g, b, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  } else {
+    for (col = cinfo->image_width; col > 0; col--) {
+      _JSAMPLE r = rescale[UCH(*bufferptr++)];
+      _JSAMPLE g = rescale[UCH(*bufferptr++)];
+      _JSAMPLE b = rescale[UCH(*bufferptr++)];
+      rgb_to_cmyk(r, g, b, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_raw_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-byte-format files with maxval = _MAXJSAMPLE.
+ * In this case we just read right into the _JSAMPLE buffer!
+ * Note that same code works for PPM and PGM files.
+ */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_word_gray_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-word-format PGM files with any maxval */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register _JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub._buffer[0];
+  bufferptr = source->iobuffer;
+  for (col = cinfo->image_width; col > 0; col--) {
+    register unsigned int temp;
+    temp  = UCH(*bufferptr++) << 8;
+    temp |= UCH(*bufferptr++);
+    if (temp > maxval)
+      ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
+    *ptr++ = rescale[temp];
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_word_gray_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-word-format PGM files with any maxval */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register _JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+  register int rindex = rgb_red[cinfo->in_color_space];
+  register int gindex = rgb_green[cinfo->in_color_space];
+  register int bindex = rgb_blue[cinfo->in_color_space];
+  register int aindex = alpha_index[cinfo->in_color_space];
+  register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub._buffer[0];
+  bufferptr = source->iobuffer;
+  for (col = cinfo->image_width; col > 0; col--) {
+    register unsigned int temp;
+    temp  = UCH(*bufferptr++) << 8;
+    temp |= UCH(*bufferptr++);
+    if (temp > maxval)
+      ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
+    ptr[rindex] = ptr[gindex] = ptr[bindex] = rescale[temp];
+    if (aindex >= 0)
+      ptr[aindex] = _MAXJSAMPLE;
+    ptr += ps;
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_word_gray_cmyk_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-word-format PGM files with any maxval */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register _JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub._buffer[0];
+  bufferptr = source->iobuffer;
+  for (col = cinfo->image_width; col > 0; col--) {
+    register unsigned int gray;
+    gray  = UCH(*bufferptr++) << 8;
+    gray |= UCH(*bufferptr++);
+    if (gray > maxval)
+      ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
+    rgb_to_cmyk(rescale[gray], rescale[gray], rescale[gray], ptr, ptr + 1,
+                ptr + 2, ptr + 3);
+    ptr += 4;
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_word_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-word-format PPM files with any maxval */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register _JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+  register int rindex = rgb_red[cinfo->in_color_space];
+  register int gindex = rgb_green[cinfo->in_color_space];
+  register int bindex = rgb_blue[cinfo->in_color_space];
+  register int aindex = alpha_index[cinfo->in_color_space];
+  register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub._buffer[0];
+  bufferptr = source->iobuffer;
+  for (col = cinfo->image_width; col > 0; col--) {
+    register unsigned int temp;
+    temp  = UCH(*bufferptr++) << 8;
+    temp |= UCH(*bufferptr++);
+    if (temp > maxval)
+      ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
+    ptr[rindex] = rescale[temp];
+    temp  = UCH(*bufferptr++) << 8;
+    temp |= UCH(*bufferptr++);
+    if (temp > maxval)
+      ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
+    ptr[gindex] = rescale[temp];
+    temp  = UCH(*bufferptr++) << 8;
+    temp |= UCH(*bufferptr++);
+    if (temp > maxval)
+      ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
+    ptr[bindex] = rescale[temp];
+    if (aindex >= 0)
+      ptr[aindex] = _MAXJSAMPLE;
+    ptr += ps;
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_word_rgb_cmyk_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-word-format PPM files with any maxval */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register _JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register _JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub._buffer[0];
+  bufferptr = source->iobuffer;
+  for (col = cinfo->image_width; col > 0; col--) {
+    register unsigned int r, g, b;
+    r  = UCH(*bufferptr++) << 8;
+    r |= UCH(*bufferptr++);
+    if (r > maxval)
+      ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
+    g  = UCH(*bufferptr++) << 8;
+    g |= UCH(*bufferptr++);
+    if (g > maxval)
+      ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
+    b  = UCH(*bufferptr++) << 8;
+    b |= UCH(*bufferptr++);
+    if (b > maxval)
+      ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
+    rgb_to_cmyk(rescale[r], rescale[g], rescale[b], ptr, ptr + 1, ptr + 2,
+                ptr + 3);
+    ptr += 4;
+  }
+  return 1;
+}
+
+
+/*
+ * Read the file header; return image size and component count.
+ */
+
+METHODDEF(void)
+start_input_ppm(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  int c;
+  unsigned int w, h, maxval;
+  boolean need_iobuffer, use_raw_buffer, need_rescale;
+
+  if (getc(source->pub.input_file) != 'P')
+    ERREXIT(cinfo, JERR_PPM_NOT);
+
+  c = getc(source->pub.input_file); /* subformat discriminator character */
+
+  /* detect unsupported variants (ie, PBM) before trying to read header */
+  switch (c) {
+  case '2':                     /* it's a text-format PGM file */
+  case '3':                     /* it's a text-format PPM file */
+  case '5':                     /* it's a raw-format PGM file */
+  case '6':                     /* it's a raw-format PPM file */
+    break;
+  default:
+    ERREXIT(cinfo, JERR_PPM_NOT);
+    break;
+  }
+
+  /* fetch the remaining header info */
+  w = read_pbm_integer(cinfo, source->pub.input_file, 65535);
+  h = read_pbm_integer(cinfo, source->pub.input_file, 65535);
+  maxval = read_pbm_integer(cinfo, source->pub.input_file, 65535);
+
+  if (w <= 0 || h <= 0 || maxval <= 0) /* error check */
+    ERREXIT(cinfo, JERR_PPM_NOT);
+  if (sinfo->max_pixels && (unsigned long long)w * h > sinfo->max_pixels)
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, sinfo->max_pixels);
+
+  cinfo->data_precision = BITS_IN_JSAMPLE; /* we always rescale data to this */
+  cinfo->image_width = (JDIMENSION)w;
+  cinfo->image_height = (JDIMENSION)h;
+  source->maxval = maxval;
+
+  /* initialize flags to most common settings */
+  need_iobuffer = TRUE;         /* do we need an I/O buffer? */
+  use_raw_buffer = FALSE;       /* do we map input buffer onto I/O buffer? */
+  need_rescale = TRUE;          /* do we need a rescale array? */
+
+  switch (c) {
+  case '2':                     /* it's a text-format PGM file */
+    if (cinfo->in_color_space == JCS_UNKNOWN ||
+        cinfo->in_color_space == JCS_RGB)
+      cinfo->in_color_space = JCS_GRAYSCALE;
+    TRACEMS2(cinfo, 1, JTRC_PGM_TEXT, w, h);
+    if (cinfo->in_color_space == JCS_GRAYSCALE)
+      source->pub.get_pixel_rows = get_text_gray_row;
+    else if (IsExtRGB(cinfo->in_color_space))
+      source->pub.get_pixel_rows = get_text_gray_rgb_row;
+    else if (cinfo->in_color_space == JCS_CMYK)
+      source->pub.get_pixel_rows = get_text_gray_cmyk_row;
+    else
+      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+    need_iobuffer = FALSE;
+    break;
+
+  case '3':                     /* it's a text-format PPM file */
+    if (cinfo->in_color_space == JCS_UNKNOWN)
+      cinfo->in_color_space = JCS_EXT_RGB;
+    TRACEMS2(cinfo, 1, JTRC_PPM_TEXT, w, h);
+    if (IsExtRGB(cinfo->in_color_space))
+      source->pub.get_pixel_rows = get_text_rgb_row;
+    else if (cinfo->in_color_space == JCS_CMYK)
+      source->pub.get_pixel_rows = get_text_rgb_cmyk_row;
+    else
+      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+    need_iobuffer = FALSE;
+    break;
+
+  case '5':                     /* it's a raw-format PGM file */
+    if (cinfo->in_color_space == JCS_UNKNOWN ||
+        cinfo->in_color_space == JCS_RGB)
+      cinfo->in_color_space = JCS_GRAYSCALE;
+    TRACEMS2(cinfo, 1, JTRC_PGM, w, h);
+    if (maxval > 255) {
+      if (cinfo->in_color_space == JCS_GRAYSCALE)
+        source->pub.get_pixel_rows = get_word_gray_row;
+      else if (IsExtRGB(cinfo->in_color_space))
+        source->pub.get_pixel_rows = get_word_gray_rgb_row;
+      else if (cinfo->in_color_space == JCS_CMYK)
+        source->pub.get_pixel_rows = get_word_gray_cmyk_row;
+      else
+        ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+    } else if (maxval == _MAXJSAMPLE && sizeof(_JSAMPLE) == sizeof(U_CHAR) &&
+               cinfo->in_color_space == JCS_GRAYSCALE) {
+      source->pub.get_pixel_rows = get_raw_row;
+      use_raw_buffer = TRUE;
+      need_rescale = FALSE;
+    } else {
+      if (cinfo->in_color_space == JCS_GRAYSCALE)
+        source->pub.get_pixel_rows = get_scaled_gray_row;
+      else if (IsExtRGB(cinfo->in_color_space))
+        source->pub.get_pixel_rows = get_gray_rgb_row;
+      else if (cinfo->in_color_space == JCS_CMYK)
+        source->pub.get_pixel_rows = get_gray_cmyk_row;
+      else
+        ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+    }
+    break;
+
+  case '6':                     /* it's a raw-format PPM file */
+    if (cinfo->in_color_space == JCS_UNKNOWN)
+      cinfo->in_color_space = JCS_EXT_RGB;
+    TRACEMS2(cinfo, 1, JTRC_PPM, w, h);
+    if (maxval > 255) {
+      if (IsExtRGB(cinfo->in_color_space))
+        source->pub.get_pixel_rows = get_word_rgb_row;
+      else if (cinfo->in_color_space == JCS_CMYK)
+        source->pub.get_pixel_rows = get_word_rgb_cmyk_row;
+      else
+        ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+    } else if (maxval == _MAXJSAMPLE && sizeof(_JSAMPLE) == sizeof(U_CHAR) &&
+#if RGB_RED == 0 && RGB_GREEN == 1 && RGB_BLUE == 2 && RGB_PIXELSIZE == 3
+               (cinfo->in_color_space == JCS_EXT_RGB ||
+                cinfo->in_color_space == JCS_RGB)) {
+#else
+               cinfo->in_color_space == JCS_EXT_RGB) {
+#endif
+      source->pub.get_pixel_rows = get_raw_row;
+      use_raw_buffer = TRUE;
+      need_rescale = FALSE;
+    } else {
+      if (IsExtRGB(cinfo->in_color_space))
+        source->pub.get_pixel_rows = get_rgb_row;
+      else if (cinfo->in_color_space == JCS_CMYK)
+        source->pub.get_pixel_rows = get_rgb_cmyk_row;
+      else
+        ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+    }
+    break;
+  }
+
+  if (IsExtRGB(cinfo->in_color_space))
+    cinfo->input_components = rgb_pixelsize[cinfo->in_color_space];
+  else if (cinfo->in_color_space == JCS_GRAYSCALE)
+    cinfo->input_components = 1;
+  else if (cinfo->in_color_space == JCS_CMYK)
+    cinfo->input_components = 4;
+
+  /* Allocate space for I/O buffer: 1 or 3 bytes or words/pixel. */
+  if (need_iobuffer) {
+    if (c == '6')
+      source->buffer_width = (size_t)w * 3 *
+        ((maxval <= 255) ? sizeof(U_CHAR) : (2 * sizeof(U_CHAR)));
+    else
+      source->buffer_width = (size_t)w *
+        ((maxval <= 255) ? sizeof(U_CHAR) : (2 * sizeof(U_CHAR)));
+    source->iobuffer = (U_CHAR *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  source->buffer_width);
+  }
+
+  /* Create compressor input buffer. */
+  if (use_raw_buffer) {
+    /* For unscaled raw-input case, we can just map it onto the I/O buffer. */
+    /* Synthesize a _JSAMPARRAY pointer structure */
+    source->pixrow = (_JSAMPROW)source->iobuffer;
+    source->pub._buffer = &source->pixrow;
+    source->pub.buffer_height = 1;
+  } else {
+    /* Need to translate anyway, so make a separate sample buffer. */
+    source->pub._buffer = (_JSAMPARRAY)(*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       (JDIMENSION)w * cinfo->input_components, (JDIMENSION)1);
+    source->pub.buffer_height = 1;
+  }
+
+  /* Compute the rescaling array if required. */
+  if (need_rescale) {
+    long val, half_maxval;
+
+    /* On 16-bit-int machines we have to be careful of maxval = 65535 */
+    source->rescale = (_JSAMPLE *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  (size_t)(((long)MAX(maxval, 255) + 1L) *
+                                           sizeof(_JSAMPLE)));
+    memset(source->rescale, 0, (size_t)(((long)MAX(maxval, 255) + 1L) *
+                                        sizeof(_JSAMPLE)));
+    half_maxval = maxval / 2;
+    for (val = 0; val <= (long)maxval; val++) {
+      /* The multiplication here must be done in 32 bits to avoid overflow */
+      source->rescale[val] = (_JSAMPLE)((val * _MAXJSAMPLE + half_maxval) /
+                                        maxval);
+    }
+  }
+}
+
+
+/*
+ * Finish up at the end of the file.
+ */
+
+METHODDEF(void)
+finish_input_ppm(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  /* no work */
+}
+
+
+/*
+ * The module selection routine for PPM format input.
+ */
+
+GLOBAL(cjpeg_source_ptr)
+_jinit_read_ppm(j_compress_ptr cinfo)
+{
+  ppm_source_ptr source;
+
+  if (cinfo->data_precision != BITS_IN_JSAMPLE)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
+  /* Create module interface object */
+  source = (ppm_source_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(ppm_source_struct));
+  /* Fill in method ptrs, except get_pixel_rows which start_input sets */
+  source->pub.start_input = start_input_ppm;
+  source->pub.finish_input = finish_input_ppm;
+  source->pub.max_pixels = 0;
+
+  return (cjpeg_source_ptr)source;
+}
+
+#endif /* defined(PPM_SUPPORTED) &&
+          (BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED)) */
diff --git a/3rdparty/libjpeg-turbo/src/rdswitch.c b/3rdparty/libjpeg-turbo/src/rdswitch.c
new file mode 100644
index 000000000000..33449c86ba6f
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/rdswitch.c
@@ -0,0 +1,428 @@
+/*
+ * rdswitch.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010, 2018, 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains routines to process some of cjpeg's more complicated
+ * command-line switches.  Switches processed here are:
+ *      -qtables file           Read quantization tables from text file
+ *      -scans file             Read scan script from text file
+ *      -quality N[,N,...]      Set quality ratings
+ *      -qslots N[,N,...]       Set component quantization table selectors
+ *      -sample HxV[,HxV,...]   Set component sampling factors
+ */
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+#include <ctype.h>              /* to declare isdigit(), isspace() */
+
+
+LOCAL(int)
+text_getc(FILE *file)
+/* Read next char, skipping over any comments (# to end of line) */
+/* A comment/newline sequence is returned as a newline */
+{
+  register int ch;
+
+  ch = getc(file);
+  if (ch == '#') {
+    do {
+      ch = getc(file);
+    } while (ch != '\n' && ch != EOF);
+  }
+  return ch;
+}
+
+
+LOCAL(boolean)
+read_text_integer(FILE *file, long *result, int *termchar)
+/* Read an unsigned decimal integer from a file, store it in result */
+/* Reads one trailing character after the integer; returns it in termchar */
+{
+  register int ch;
+  register long val;
+
+  /* Skip any leading whitespace, detect EOF */
+  do {
+    ch = text_getc(file);
+    if (ch == EOF) {
+      *termchar = ch;
+      return FALSE;
+    }
+  } while (isspace(ch));
+
+  if (!isdigit(ch)) {
+    *termchar = ch;
+    return FALSE;
+  }
+
+  val = ch - '0';
+  while ((ch = text_getc(file)) != EOF) {
+    if (!isdigit(ch))
+      break;
+    val *= 10;
+    val += ch - '0';
+  }
+  *result = val;
+  *termchar = ch;
+  return TRUE;
+}
+
+
+#if JPEG_LIB_VERSION < 70
+static int q_scale_factor[NUM_QUANT_TBLS] = { 100, 100, 100, 100 };
+#endif
+
+GLOBAL(boolean)
+read_quant_tables(j_compress_ptr cinfo, char *filename, boolean force_baseline)
+/* Read a set of quantization tables from the specified file.
+ * The file is plain ASCII text: decimal numbers with whitespace between.
+ * Comments preceded by '#' may be included in the file.
+ * There may be one to NUM_QUANT_TBLS tables in the file, each of 64 values.
+ * The tables are implicitly numbered 0,1,etc.
+ * NOTE: does not affect the qslots mapping, which will default to selecting
+ * table 0 for luminance (or primary) components, 1 for chrominance components.
+ * You must use -qslots if you want a different component->table mapping.
+ */
+{
+  FILE *fp;
+  int tblno, i, termchar;
+  long val;
+  unsigned int table[DCTSIZE2];
+
+  if ((fp = fopen(filename, "r")) == NULL) {
+    fprintf(stderr, "Can't open table file %s\n", filename);
+    return FALSE;
+  }
+  tblno = 0;
+
+  while (read_text_integer(fp, &val, &termchar)) { /* read 1st element of table */
+    if (tblno >= NUM_QUANT_TBLS) {
+      fprintf(stderr, "Too many tables in file %s\n", filename);
+      fclose(fp);
+      return FALSE;
+    }
+    table[0] = (unsigned int)val;
+    for (i = 1; i < DCTSIZE2; i++) {
+      if (!read_text_integer(fp, &val, &termchar)) {
+        fprintf(stderr, "Invalid table data in file %s\n", filename);
+        fclose(fp);
+        return FALSE;
+      }
+      table[i] = (unsigned int)val;
+    }
+#if JPEG_LIB_VERSION >= 70
+    jpeg_add_quant_table(cinfo, tblno, table, cinfo->q_scale_factor[tblno],
+                         force_baseline);
+#else
+    jpeg_add_quant_table(cinfo, tblno, table, q_scale_factor[tblno],
+                         force_baseline);
+#endif
+    tblno++;
+  }
+
+  if (termchar != EOF) {
+    fprintf(stderr, "Non-numeric data in file %s\n", filename);
+    fclose(fp);
+    return FALSE;
+  }
+
+  fclose(fp);
+  return TRUE;
+}
+
+
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+
+LOCAL(boolean)
+read_scan_integer(FILE *file, long *result, int *termchar)
+/* Variant of read_text_integer that always looks for a non-space termchar;
+ * this simplifies parsing of punctuation in scan scripts.
+ */
+{
+  register int ch;
+
+  if (!read_text_integer(file, result, termchar))
+    return FALSE;
+  ch = *termchar;
+  while (ch != EOF && isspace(ch))
+    ch = text_getc(file);
+  if (isdigit(ch)) {            /* oops, put it back */
+    if (ungetc(ch, file) == EOF)
+      return FALSE;
+    ch = ' ';
+  } else {
+    /* Any separators other than ';' and ':' are ignored;
+     * this allows user to insert commas, etc, if desired.
+     */
+    if (ch != EOF && ch != ';' && ch != ':')
+      ch = ' ';
+  }
+  *termchar = ch;
+  return TRUE;
+}
+
+
+GLOBAL(boolean)
+read_scan_script(j_compress_ptr cinfo, char *filename)
+/* Read a scan script from the specified text file.
+ * Each entry in the file defines one scan to be emitted.
+ * Entries are separated by semicolons ';'.
+ * An entry contains one to four component indexes,
+ * optionally followed by a colon ':' and four progressive-JPEG parameters.
+ * The component indexes denote which component(s) are to be transmitted
+ * in the current scan.  The first component has index 0.
+ * Sequential JPEG is used if the progressive-JPEG parameters are omitted.
+ * The file is free format text: any whitespace may appear between numbers
+ * and the ':' and ';' punctuation marks.  Also, other punctuation (such
+ * as commas or dashes) can be placed between numbers if desired.
+ * Comments preceded by '#' may be included in the file.
+ * Note: we do very little validity checking here;
+ * jcmaster.c will validate the script parameters.
+ */
+{
+  FILE *fp;
+  int scanno, ncomps, termchar;
+  long val;
+  jpeg_scan_info *scanptr;
+#define MAX_SCANS  100          /* quite arbitrary limit */
+  jpeg_scan_info scans[MAX_SCANS];
+
+  if ((fp = fopen(filename, "r")) == NULL) {
+    fprintf(stderr, "Can't open scan definition file %s\n", filename);
+    return FALSE;
+  }
+  scanptr = scans;
+  scanno = 0;
+
+  while (read_scan_integer(fp, &val, &termchar)) {
+    if (scanno >= MAX_SCANS) {
+      fprintf(stderr, "Too many scans defined in file %s\n", filename);
+      fclose(fp);
+      return FALSE;
+    }
+    scanptr->component_index[0] = (int)val;
+    ncomps = 1;
+    while (termchar == ' ') {
+      if (ncomps >= MAX_COMPS_IN_SCAN) {
+        fprintf(stderr, "Too many components in one scan in file %s\n",
+                filename);
+        fclose(fp);
+        return FALSE;
+      }
+      if (!read_scan_integer(fp, &val, &termchar))
+        goto bogus;
+      scanptr->component_index[ncomps] = (int)val;
+      ncomps++;
+    }
+    scanptr->comps_in_scan = ncomps;
+    if (termchar == ':') {
+      if (!read_scan_integer(fp, &val, &termchar) || termchar != ' ')
+        goto bogus;
+      scanptr->Ss = (int)val;
+      if (!read_scan_integer(fp, &val, &termchar) || termchar != ' ')
+        goto bogus;
+      scanptr->Se = (int)val;
+      if (!read_scan_integer(fp, &val, &termchar) || termchar != ' ')
+        goto bogus;
+      scanptr->Ah = (int)val;
+      if (!read_scan_integer(fp, &val, &termchar))
+        goto bogus;
+      scanptr->Al = (int)val;
+    } else {
+      /* set non-progressive parameters */
+      scanptr->Ss = 0;
+      scanptr->Se = DCTSIZE2 - 1;
+      scanptr->Ah = 0;
+      scanptr->Al = 0;
+    }
+    if (termchar != ';' && termchar != EOF) {
+bogus:
+      fprintf(stderr, "Invalid scan entry format in file %s\n", filename);
+      fclose(fp);
+      return FALSE;
+    }
+    scanptr++, scanno++;
+  }
+
+  if (termchar != EOF) {
+    fprintf(stderr, "Non-numeric data in file %s\n", filename);
+    fclose(fp);
+    return FALSE;
+  }
+
+  if (scanno > 0) {
+    /* Stash completed scan list in cinfo structure.
+     * NOTE: for cjpeg's use, JPOOL_IMAGE is the right lifetime for this data,
+     * but if you want to compress multiple images you'd want JPOOL_PERMANENT.
+     */
+    scanptr = (jpeg_scan_info *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  scanno * sizeof(jpeg_scan_info));
+    memcpy(scanptr, scans, scanno * sizeof(jpeg_scan_info));
+    cinfo->scan_info = scanptr;
+    cinfo->num_scans = scanno;
+  }
+
+  fclose(fp);
+  return TRUE;
+}
+
+#endif /* C_MULTISCAN_FILES_SUPPORTED */
+
+
+#if JPEG_LIB_VERSION < 70
+/* These are the sample quantization tables given in Annex K (Clause K.1) of
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ * The spec says that the values given produce "good" quality, and
+ * when divided by 2, "very good" quality.
+ */
+static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
+  16,  11,  10,  16,  24,  40,  51,  61,
+  12,  12,  14,  19,  26,  58,  60,  55,
+  14,  13,  16,  24,  40,  57,  69,  56,
+  14,  17,  22,  29,  51,  87,  80,  62,
+  18,  22,  37,  56,  68, 109, 103,  77,
+  24,  35,  55,  64,  81, 104, 113,  92,
+  49,  64,  78,  87, 103, 121, 120, 101,
+  72,  92,  95,  98, 112, 100, 103,  99
+};
+static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
+  17,  18,  24,  47,  99,  99,  99,  99,
+  18,  21,  26,  66,  99,  99,  99,  99,
+  24,  26,  56,  99,  99,  99,  99,  99,
+  47,  66,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99,
+  99,  99,  99,  99,  99,  99,  99,  99
+};
+
+
+LOCAL(void)
+jpeg_default_qtables(j_compress_ptr cinfo, boolean force_baseline)
+{
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl, q_scale_factor[0],
+                       force_baseline);
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl, q_scale_factor[1],
+                       force_baseline);
+}
+#endif
+
+
+GLOBAL(boolean)
+set_quality_ratings(j_compress_ptr cinfo, char *arg, boolean force_baseline)
+/* Process a quality-ratings parameter string, of the form
+ *     N[,N,...]
+ * If there are more q-table slots than parameters, the last value is replicated.
+ */
+{
+  int val = 75;                 /* default value */
+  int tblno;
+  char ch;
+
+  for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) {
+    if (*arg) {
+      ch = ',';                 /* if not set by sscanf, will be ',' */
+      if (sscanf(arg, "%d%c", &val, &ch) < 1)
+        return FALSE;
+      if (ch != ',')            /* syntax check */
+        return FALSE;
+      /* Convert user 0-100 rating to percentage scaling */
+#if JPEG_LIB_VERSION >= 70
+      cinfo->q_scale_factor[tblno] = jpeg_quality_scaling(val);
+#else
+      q_scale_factor[tblno] = jpeg_quality_scaling(val);
+#endif
+      while (*arg && *arg++ != ','); /* advance to next segment of arg
+                                        string */
+    } else {
+      /* reached end of parameter, set remaining factors to last value */
+#if JPEG_LIB_VERSION >= 70
+      cinfo->q_scale_factor[tblno] = jpeg_quality_scaling(val);
+#else
+      q_scale_factor[tblno] = jpeg_quality_scaling(val);
+#endif
+    }
+  }
+  jpeg_default_qtables(cinfo, force_baseline);
+  return TRUE;
+}
+
+
+GLOBAL(boolean)
+set_quant_slots(j_compress_ptr cinfo, char *arg)
+/* Process a quantization-table-selectors parameter string, of the form
+ *     N[,N,...]
+ * If there are more components than parameters, the last value is replicated.
+ */
+{
+  int val = 0;                  /* default table # */
+  int ci;
+  char ch;
+
+  for (ci = 0; ci < MAX_COMPONENTS; ci++) {
+    if (*arg) {
+      ch = ',';                 /* if not set by sscanf, will be ',' */
+      if (sscanf(arg, "%d%c", &val, &ch) < 1)
+        return FALSE;
+      if (ch != ',')            /* syntax check */
+        return FALSE;
+      if (val < 0 || val >= NUM_QUANT_TBLS) {
+        fprintf(stderr, "JPEG quantization tables are numbered 0..%d\n",
+                NUM_QUANT_TBLS - 1);
+        return FALSE;
+      }
+      cinfo->comp_info[ci].quant_tbl_no = val;
+      while (*arg && *arg++ != ','); /* advance to next segment of arg
+                                        string */
+    } else {
+      /* reached end of parameter, set remaining components to last table */
+      cinfo->comp_info[ci].quant_tbl_no = val;
+    }
+  }
+  return TRUE;
+}
+
+
+GLOBAL(boolean)
+set_sample_factors(j_compress_ptr cinfo, char *arg)
+/* Process a sample-factors parameter string, of the form
+ *     HxV[,HxV,...]
+ * If there are more components than parameters, "1x1" is assumed for the rest.
+ */
+{
+  int ci, val1, val2;
+  char ch1, ch2;
+
+  for (ci = 0; ci < MAX_COMPONENTS; ci++) {
+    if (*arg) {
+      ch2 = ',';                /* if not set by sscanf, will be ',' */
+      if (sscanf(arg, "%d%c%d%c", &val1, &ch1, &val2, &ch2) < 3)
+        return FALSE;
+      if ((ch1 != 'x' && ch1 != 'X') || ch2 != ',') /* syntax check */
+        return FALSE;
+      if (val1 <= 0 || val1 > 4 || val2 <= 0 || val2 > 4) {
+        fprintf(stderr, "JPEG sampling factors must be 1..4\n");
+        return FALSE;
+      }
+      cinfo->comp_info[ci].h_samp_factor = val1;
+      cinfo->comp_info[ci].v_samp_factor = val2;
+      while (*arg && *arg++ != ',');  /* advance to next segment of arg
+                                         string */
+    } else {
+      /* reached end of parameter, set remaining components to 1x1 sampling */
+      cinfo->comp_info[ci].h_samp_factor = 1;
+      cinfo->comp_info[ci].v_samp_factor = 1;
+    }
+  }
+  return TRUE;
+}
diff --git a/3rdparty/libjpeg-turbo/src/rdtarga.c b/3rdparty/libjpeg-turbo/src/rdtarga.c
new file mode 100644
index 000000000000..b78a16539e1c
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/rdtarga.c
@@ -0,0 +1,507 @@
+/*
+ * rdtarga.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2017 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2018, 2021-2023, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains routines to read input images in Targa format.
+ *
+ * These routines may need modification for non-Unix environments or
+ * specialized applications.  As they stand, they assume input from
+ * an ordinary stdio stream.  They further assume that reading begins
+ * at the start of the file; start_input may need work if the
+ * user interface has already read some data (e.g., to determine that
+ * the file is indeed Targa format).
+ *
+ * Based on code contributed by Lee Daniel Crocker.
+ */
+
+#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
+
+#ifdef TARGA_SUPPORTED
+
+
+/* Macros to deal with unsigned chars as efficiently as compiler allows */
+
+typedef unsigned char U_CHAR;
+#define UCH(x)  ((int)(x))
+
+
+#define ReadOK(file, buffer, len) \
+  (fread(buffer, 1, len, file) == ((size_t)(len)))
+
+
+/* Private version of data source object */
+
+typedef struct _tga_source_struct *tga_source_ptr;
+
+typedef struct _tga_source_struct {
+  struct cjpeg_source_struct pub; /* public fields */
+
+  j_compress_ptr cinfo;         /* back link saves passing separate parm */
+
+  JSAMPARRAY colormap;          /* Targa colormap (converted to my format) */
+
+  jvirt_sarray_ptr whole_image; /* Needed if funny input row order */
+  JDIMENSION current_row;       /* Current logical row number to read */
+
+  /* Pointer to routine to extract next Targa pixel from input file */
+  void (*read_pixel) (tga_source_ptr sinfo);
+
+  /* Result of read_pixel is delivered here: */
+  U_CHAR tga_pixel[4];
+
+  int pixel_size;               /* Bytes per Targa pixel (1 to 4) */
+  int cmap_length;              /* colormap length */
+
+  /* State info for reading RLE-coded pixels; both counts must be init to 0 */
+  int block_count;              /* # of pixels remaining in RLE block */
+  int dup_pixel_count;          /* # of times to duplicate previous pixel */
+
+  /* This saves the correct pixel-row-expansion method for preload_image */
+  JDIMENSION (*get_pixel_rows) (j_compress_ptr cinfo, cjpeg_source_ptr sinfo);
+} tga_source_struct;
+
+
+/* For expanding 5-bit pixel values to 8-bit with best rounding */
+
+static const UINT8 c5to8bits[32] = {
+    0,   8,  16,  25,  33,  41,  49,  58,
+   66,  74,  82,  90,  99, 107, 115, 123,
+  132, 140, 148, 156, 165, 173, 181, 189,
+  197, 206, 214, 222, 230, 239, 247, 255
+};
+
+
+
+LOCAL(int)
+read_byte(tga_source_ptr sinfo)
+/* Read next byte from Targa file */
+{
+  register FILE *infile = sinfo->pub.input_file;
+  register int c;
+
+  if ((c = getc(infile)) == EOF)
+    ERREXIT(sinfo->cinfo, JERR_INPUT_EOF);
+  return c;
+}
+
+
+LOCAL(void)
+read_colormap(tga_source_ptr sinfo, int cmaplen, int mapentrysize)
+/* Read the colormap from a Targa file */
+{
+  int i;
+
+  /* Presently only handles 24-bit BGR format */
+  if (mapentrysize != 24)
+    ERREXIT(sinfo->cinfo, JERR_TGA_BADCMAP);
+
+  for (i = 0; i < cmaplen; i++) {
+    sinfo->colormap[2][i] = (JSAMPLE)read_byte(sinfo);
+    sinfo->colormap[1][i] = (JSAMPLE)read_byte(sinfo);
+    sinfo->colormap[0][i] = (JSAMPLE)read_byte(sinfo);
+  }
+}
+
+
+/*
+ * read_pixel methods: get a single pixel from Targa file into tga_pixel[]
+ */
+
+METHODDEF(void)
+read_non_rle_pixel(tga_source_ptr sinfo)
+/* Read one Targa pixel from the input file; no RLE expansion */
+{
+  register int i;
+
+  for (i = 0; i < sinfo->pixel_size; i++) {
+    sinfo->tga_pixel[i] = (U_CHAR)read_byte(sinfo);
+  }
+}
+
+
+METHODDEF(void)
+read_rle_pixel(tga_source_ptr sinfo)
+/* Read one Targa pixel from the input file, expanding RLE data as needed */
+{
+  register int i;
+
+  /* Duplicate previously read pixel? */
+  if (sinfo->dup_pixel_count > 0) {
+    sinfo->dup_pixel_count--;
+    return;
+  }
+
+  /* Time to read RLE block header? */
+  if (--sinfo->block_count < 0) { /* decrement pixels remaining in block */
+    i = read_byte(sinfo);
+    if (i & 0x80) {             /* Start of duplicate-pixel block? */
+      sinfo->dup_pixel_count = i & 0x7F; /* number of dups after this one */
+      sinfo->block_count = 0;   /* then read new block header */
+    } else {
+      sinfo->block_count = i & 0x7F; /* number of pixels after this one */
+    }
+  }
+
+  /* Read next pixel */
+  for (i = 0; i < sinfo->pixel_size; i++) {
+    sinfo->tga_pixel[i] = (U_CHAR)read_byte(sinfo);
+  }
+}
+
+
+/*
+ * Read one row of pixels.
+ *
+ * We provide several different versions depending on input file format.
+ */
+
+
+METHODDEF(JDIMENSION)
+get_8bit_gray_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading 8-bit grayscale pixels */
+{
+  tga_source_ptr source = (tga_source_ptr)sinfo;
+  register JSAMPROW ptr;
+  register JDIMENSION col;
+
+  ptr = source->pub.buffer[0];
+  for (col = cinfo->image_width; col > 0; col--) {
+    (*source->read_pixel) (source); /* Load next pixel into tga_pixel */
+    *ptr++ = (JSAMPLE)UCH(source->tga_pixel[0]);
+  }
+  return 1;
+}
+
+METHODDEF(JDIMENSION)
+get_8bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading 8-bit colormap indexes */
+{
+  tga_source_ptr source = (tga_source_ptr)sinfo;
+  register int t;
+  register JSAMPROW ptr;
+  register JDIMENSION col;
+  register JSAMPARRAY colormap = source->colormap;
+  int cmaplen = source->cmap_length;
+
+  ptr = source->pub.buffer[0];
+  for (col = cinfo->image_width; col > 0; col--) {
+    (*source->read_pixel) (source); /* Load next pixel into tga_pixel */
+    t = UCH(source->tga_pixel[0]);
+    if (t >= cmaplen)
+      ERREXIT(cinfo, JERR_TGA_BADPARMS);
+    *ptr++ = colormap[0][t];
+    *ptr++ = colormap[1][t];
+    *ptr++ = colormap[2][t];
+  }
+  return 1;
+}
+
+METHODDEF(JDIMENSION)
+get_16bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading 16-bit pixels */
+{
+  tga_source_ptr source = (tga_source_ptr)sinfo;
+  register int t;
+  register JSAMPROW ptr;
+  register JDIMENSION col;
+
+  ptr = source->pub.buffer[0];
+  for (col = cinfo->image_width; col > 0; col--) {
+    (*source->read_pixel) (source); /* Load next pixel into tga_pixel */
+    t = UCH(source->tga_pixel[0]);
+    t += UCH(source->tga_pixel[1]) << 8;
+    /* We expand 5 bit data to 8 bit sample width.
+     * The format of the 16-bit (LSB first) input word is
+     *     xRRRRRGGGGGBBBBB
+     */
+    ptr[2] = (JSAMPLE)c5to8bits[t & 0x1F];
+    t >>= 5;
+    ptr[1] = (JSAMPLE)c5to8bits[t & 0x1F];
+    t >>= 5;
+    ptr[0] = (JSAMPLE)c5to8bits[t & 0x1F];
+    ptr += 3;
+  }
+  return 1;
+}
+
+METHODDEF(JDIMENSION)
+get_24bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading 24-bit pixels */
+{
+  tga_source_ptr source = (tga_source_ptr)sinfo;
+  register JSAMPROW ptr;
+  register JDIMENSION col;
+
+  ptr = source->pub.buffer[0];
+  for (col = cinfo->image_width; col > 0; col--) {
+    (*source->read_pixel) (source); /* Load next pixel into tga_pixel */
+    *ptr++ = (JSAMPLE)UCH(source->tga_pixel[2]); /* change BGR to RGB order */
+    *ptr++ = (JSAMPLE)UCH(source->tga_pixel[1]);
+    *ptr++ = (JSAMPLE)UCH(source->tga_pixel[0]);
+  }
+  return 1;
+}
+
+/*
+ * Targa also defines a 32-bit pixel format with order B,G,R,A.
+ * We presently ignore the attribute byte, so the code for reading
+ * these pixels is identical to the 24-bit routine above.
+ * This works because the actual pixel length is only known to read_pixel.
+ */
+
+#define get_32bit_row  get_24bit_row
+
+
+/*
+ * This method is for re-reading the input data in standard top-down
+ * row order.  The entire image has already been read into whole_image
+ * with proper conversion of pixel format, but it's in a funny row order.
+ */
+
+METHODDEF(JDIMENSION)
+get_memory_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  tga_source_ptr source = (tga_source_ptr)sinfo;
+  JDIMENSION source_row;
+
+  /* Compute row of source that maps to current_row of normal order */
+  /* For now, assume image is bottom-up and not interlaced. */
+  /* NEEDS WORK to support interlaced images! */
+  source_row = cinfo->image_height - source->current_row - 1;
+
+  /* Fetch that row from virtual array */
+  source->pub.buffer = (*cinfo->mem->access_virt_sarray)
+    ((j_common_ptr)cinfo, source->whole_image,
+     source_row, (JDIMENSION)1, FALSE);
+
+  source->current_row++;
+  return 1;
+}
+
+
+/*
+ * This method loads the image into whole_image during the first call on
+ * get_pixel_rows.  The get_pixel_rows pointer is then adjusted to call
+ * get_memory_row on subsequent calls.
+ */
+
+METHODDEF(JDIMENSION)
+preload_image(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  tga_source_ptr source = (tga_source_ptr)sinfo;
+  JDIMENSION row;
+  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
+
+  /* Read the data into a virtual array in input-file row order. */
+  for (row = 0; row < cinfo->image_height; row++) {
+    if (progress != NULL) {
+      progress->pub.pass_counter = (long)row;
+      progress->pub.pass_limit = (long)cinfo->image_height;
+      (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
+    }
+    source->pub.buffer = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, source->whole_image, row, (JDIMENSION)1, TRUE);
+    (*source->get_pixel_rows) (cinfo, sinfo);
+  }
+  if (progress != NULL)
+    progress->completed_extra_passes++;
+
+  /* Set up to read from the virtual array in unscrambled order */
+  source->pub.get_pixel_rows = get_memory_row;
+  source->current_row = 0;
+  /* And read the first row */
+  return get_memory_row(cinfo, sinfo);
+}
+
+
+/*
+ * Read the file header; return image size and component count.
+ */
+
+METHODDEF(void)
+start_input_tga(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  tga_source_ptr source = (tga_source_ptr)sinfo;
+  U_CHAR targaheader[18];
+  int idlen, cmaptype, subtype, flags, interlace_type, components;
+  unsigned int width, height, maplen;
+  boolean is_bottom_up;
+
+#define GET_2B(offset) \
+  ((unsigned int)UCH(targaheader[offset]) + \
+   (((unsigned int)UCH(targaheader[offset + 1])) << 8))
+
+  if (!ReadOK(source->pub.input_file, targaheader, 18))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+
+  /* Pretend "15-bit" pixels are 16-bit --- we ignore attribute bit anyway */
+  if (targaheader[16] == 15)
+    targaheader[16] = 16;
+
+  idlen = UCH(targaheader[0]);
+  cmaptype = UCH(targaheader[1]);
+  subtype = UCH(targaheader[2]);
+  maplen = GET_2B(5);
+  width = GET_2B(12);
+  height = GET_2B(14);
+  source->pixel_size = UCH(targaheader[16]) >> 3;
+  flags = UCH(targaheader[17]); /* Image Descriptor byte */
+
+  is_bottom_up = ((flags & 0x20) == 0); /* bit 5 set => top-down */
+  interlace_type = flags >> 6;  /* bits 6/7 are interlace code */
+
+  if (cmaptype > 1 ||           /* cmaptype must be 0 or 1 */
+      source->pixel_size < 1 || source->pixel_size > 4 ||
+      (UCH(targaheader[16]) & 7) != 0 || /* bits/pixel must be multiple of 8 */
+      interlace_type != 0 ||      /* currently don't allow interlaced image */
+      width == 0 || height == 0)  /* image width/height must be non-zero */
+    ERREXIT(cinfo, JERR_TGA_BADPARMS);
+  if (sinfo->max_pixels &&
+      (unsigned long long)width * height > sinfo->max_pixels)
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, sinfo->max_pixels);
+
+  if (subtype > 8) {
+    /* It's an RLE-coded file */
+    source->read_pixel = read_rle_pixel;
+    source->block_count = source->dup_pixel_count = 0;
+    subtype -= 8;
+  } else {
+    /* Non-RLE file */
+    source->read_pixel = read_non_rle_pixel;
+  }
+
+  /* Now should have subtype 1, 2, or 3 */
+  components = 3;               /* until proven different */
+  cinfo->in_color_space = JCS_RGB;
+
+  switch (subtype) {
+  case 1:                       /* Colormapped image */
+    if (source->pixel_size == 1 && cmaptype == 1)
+      source->get_pixel_rows = get_8bit_row;
+    else
+      ERREXIT(cinfo, JERR_TGA_BADPARMS);
+    TRACEMS2(cinfo, 1, JTRC_TGA_MAPPED, width, height);
+    break;
+  case 2:                       /* RGB image */
+    switch (source->pixel_size) {
+    case 2:
+      source->get_pixel_rows = get_16bit_row;
+      break;
+    case 3:
+      source->get_pixel_rows = get_24bit_row;
+      break;
+    case 4:
+      source->get_pixel_rows = get_32bit_row;
+      break;
+    default:
+      ERREXIT(cinfo, JERR_TGA_BADPARMS);
+      break;
+    }
+    TRACEMS2(cinfo, 1, JTRC_TGA, width, height);
+    break;
+  case 3:                       /* Grayscale image */
+    components = 1;
+    cinfo->in_color_space = JCS_GRAYSCALE;
+    if (source->pixel_size == 1)
+      source->get_pixel_rows = get_8bit_gray_row;
+    else
+      ERREXIT(cinfo, JERR_TGA_BADPARMS);
+    TRACEMS2(cinfo, 1, JTRC_TGA_GRAY, width, height);
+    break;
+  default:
+    ERREXIT(cinfo, JERR_TGA_BADPARMS);
+    break;
+  }
+
+  if (is_bottom_up) {
+    /* Create a virtual array to buffer the upside-down image. */
+    source->whole_image = (*cinfo->mem->request_virt_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+       (JDIMENSION)width * components, (JDIMENSION)height, (JDIMENSION)1);
+    if (cinfo->progress != NULL) {
+      cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
+      progress->total_extra_passes++; /* count file input as separate pass */
+    }
+    /* source->pub.buffer will point to the virtual array. */
+    source->pub.buffer_height = 1; /* in case anyone looks at it */
+    source->pub.get_pixel_rows = preload_image;
+  } else {
+    /* Don't need a virtual array, but do need a one-row input buffer. */
+    source->whole_image = NULL;
+    source->pub.buffer = (*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       (JDIMENSION)width * components, (JDIMENSION)1);
+    source->pub.buffer_height = 1;
+    source->pub.get_pixel_rows = source->get_pixel_rows;
+  }
+
+  while (idlen--)               /* Throw away ID field */
+    (void)read_byte(source);
+
+  if (maplen > 0) {
+    if (maplen > 256 || GET_2B(3) != 0)
+      ERREXIT(cinfo, JERR_TGA_BADCMAP);
+    /* Allocate space to store the colormap */
+    source->colormap = (*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)maplen, (JDIMENSION)3);
+    source->cmap_length = (int)maplen;
+    /* and read it from the file */
+    read_colormap(source, (int)maplen, UCH(targaheader[7]));
+  } else {
+    if (cmaptype)               /* but you promised a cmap! */
+      ERREXIT(cinfo, JERR_TGA_BADPARMS);
+    source->colormap = NULL;
+    source->cmap_length = 0;
+  }
+
+  cinfo->input_components = components;
+  cinfo->data_precision = 8;
+  cinfo->image_width = width;
+  cinfo->image_height = height;
+}
+
+
+/*
+ * Finish up at the end of the file.
+ */
+
+METHODDEF(void)
+finish_input_tga(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  /* no work */
+}
+
+
+/*
+ * The module selection routine for Targa format input.
+ */
+
+GLOBAL(cjpeg_source_ptr)
+jinit_read_targa(j_compress_ptr cinfo)
+{
+  tga_source_ptr source;
+
+  if (cinfo->data_precision != 8)
+    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
+
+  /* Create module interface object */
+  source = (tga_source_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(tga_source_struct));
+  source->cinfo = cinfo;        /* make back link for subroutines */
+  /* Fill in method ptrs, except get_pixel_rows which start_input sets */
+  source->pub.start_input = start_input_tga;
+  source->pub.finish_input = finish_input_tga;
+  source->pub.max_pixels = 0;
+
+  return (cjpeg_source_ptr)source;
+}
+
+#endif /* TARGA_SUPPORTED */
diff --git a/3rdparty/libjpeg-turbo/src/simd/CMakeLists.txt b/3rdparty/libjpeg-turbo/src/simd/CMakeLists.txt
index 50553020041e..48944c6bfe95 100644
--- a/3rdparty/libjpeg-turbo/src/simd/CMakeLists.txt
+++ b/3rdparty/libjpeg-turbo/src/simd/CMakeLists.txt
@@ -1,13 +1,9 @@
 macro(simd_fail message)
-    message(STATUS "libjpeg-turbo(SIMD): ${message}.  Performance will suffer.")
-    set(WITH_SIMD 0 PARENT_SCOPE)
-endmacro()
-
-macro(boolean_number var)
-  if(${var})
-    set(${var} 1 ${ARGN})
+  if(REQUIRE_SIMD)
+    message(FATAL_ERROR "${message}.")
   else()
-    set(${var} 0 ${ARGN})
+    message(STATUS "${message}.  Performance will suffer.")
+    set(WITH_SIMD 0 PARENT_SCOPE)
   endif()
 endmacro()
 
@@ -45,14 +41,14 @@ elseif(CPU_TYPE STREQUAL "i386")
   endif()
 endif()
 
-
-include(CheckLanguage)
-check_language(ASM_NASM)
-if(NOT CMAKE_ASM_NASM_COMPILER)
-  simd_fail("SIMD extensions disabled: could not find NASM compiler")
-  return()
+if(NOT REQUIRE_SIMD)
+  include(CheckLanguage)
+  check_language(ASM_NASM)
+  if(NOT CMAKE_ASM_NASM_COMPILER)
+    simd_fail("SIMD extensions disabled: could not find NASM compiler")
+    return()
+  endif()
 endif()
-
 enable_language(ASM_NASM)
 message(STATUS "CMAKE_ASM_NASM_COMPILER = ${CMAKE_ASM_NASM_COMPILER}")
 
@@ -75,6 +71,8 @@ elseif(CPU_TYPE STREQUAL "i386")
   endif()
 endif()
 
+message(STATUS "CMAKE_ASM_NASM_OBJECT_FORMAT = ${CMAKE_ASM_NASM_OBJECT_FORMAT}")
+
 if(NOT CMAKE_ASM_NASM_OBJECT_FORMAT)
   simd_fail("SIMD extensions disabled: could not determine NASM object format")
   return()
@@ -98,8 +96,21 @@ if(NOT WIN32 AND (CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED))
   set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DPIC")
 endif()
 
+if(CPU_TYPE STREQUAL "x86_64" AND CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "^elf")
+  check_c_source_compiles("
+    #if (__CET__ & 3) == 0
+    #error \"CET not enabled\"
+    #endif
+    int main(void) { return 0; }" HAVE_CET)
+
+  if(HAVE_CET)
+    set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -D__CET__")
+  endif()
+endif()
+
 string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
 set(EFFECTIVE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} ${CMAKE_ASM_NASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
+message(STATUS "CMAKE_ASM_NASM_FLAGS = ${EFFECTIVE_ASM_NASM_FLAGS}")
 
 set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -I\"${CMAKE_CURRENT_SOURCE_DIR}/nasm/\" -I\"${CMAKE_CURRENT_SOURCE_DIR}/${CPU_TYPE}/\"")
 
@@ -113,7 +124,6 @@ add_custom_target(jsimdcfg COMMAND
     ${CMAKE_CURRENT_SOURCE_DIR}/nasm/jsimdcfg.inc.h |
   ${GREP} -E '^[\;%]|^\ %' | sed 's%_cpp_protection_%%' |
   sed 's@% define@%define@g' >${CMAKE_CURRENT_SOURCE_DIR}/nasm/jsimdcfg.inc)
-set_target_properties(jsimdcfg PROPERTIES FOLDER "3rdparty")
 
 if(CPU_TYPE STREQUAL "x86_64")
   set(SIMD_SOURCES x86_64/jsimdcpu.asm x86_64/jfdctflt-sse.asm
@@ -198,16 +208,17 @@ endforeach()
 
 if(MSVC_IDE OR XCODE)
   set(SIMD_OBJS ${SIMD_OBJS} PARENT_SCOPE)
-  add_library(jsimd OBJECT ${CPU_TYPE}/jsimd.c)
-  add_custom_target(jsimd-objs DEPENDS ${SIMD_OBJS})
-  add_dependencies(jsimd jsimd-objs)
-  set_target_properties(jsimd PROPERTIES FOLDER "3rdparty")
-  set_target_properties(jsimd-objs PROPERTIES FOLDER "3rdparty")
+  add_library(simd OBJECT ${CPU_TYPE}/jsimd.c)
+  add_custom_target(simd-objs DEPENDS ${SIMD_OBJS})
+  add_dependencies(simd simd-objs)
+  set_target_properties(simd PROPERTIES FOLDER "3rdparty/jpeg")
+  set_target_properties(simd-objs PROPERTIES FOLDER "3rdparty/jpeg")
+  set_target_properties(jsimdcfg PROPERTIES FOLDER "3rdparty/jpeg")
 else()
-  add_library(jsimd OBJECT ${SIMD_SOURCES} ${CPU_TYPE}/jsimd.c)
+  add_library(simd OBJECT ${SIMD_SOURCES} ${CPU_TYPE}/jsimd.c)
 endif()
 if(NOT WIN32 AND (CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED))
-  set_target_properties(jsimd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
 endif()
 
 
@@ -228,7 +239,6 @@ elseif(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
 # following test determines whether -mfloat-abi=softfp should be explicitly
 # added to the compile flags for the intrinsics implementation of the Neon SIMD
 # extensions.
-
 if(BITS EQUAL 32)
   check_c_source_compiles("
     #if defined(__ARM_NEON__) || (!defined(__linux__) && !defined(ANDROID) && !defined(__ANDROID__))
@@ -361,7 +371,7 @@ if(NOT NEON_INTRINSICS)
       -x assembler-with-cpp -c ${CMAKE_CURRENT_BINARY_DIR}/gastest.S
     RESULT_VARIABLE RESULT OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE ERROR)
   if(NOT RESULT EQUAL 0)
-    message(STATUS "libjpeg-turbo(SIMD): GAS appears to be broken.  Using the full Neon SIMD intrinsics implementation.")
+    message(WARNING "GAS appears to be broken.  Using the full Neon SIMD intrinsics implementation.")
     set(NEON_INTRINSICS 1 CACHE INTERNAL "" FORCE)
   endif()
 endif()
@@ -397,10 +407,10 @@ if(NOT NEON_INTRINSICS)
   set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jsimd_neon.S)
 endif()
 
-add_library(jsimd OBJECT ${SIMD_SOURCES} arm/aarch${BITS}/jsimd.c)
+add_library(simd OBJECT ${SIMD_SOURCES} arm/aarch${BITS}/jsimd.c)
 
 if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
-  set_target_properties(jsimd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
 endif()
 
 
@@ -439,10 +449,10 @@ if(NOT HAVE_DSPR2)
   return()
 endif()
 
-add_library(jsimd OBJECT mips/jsimd_dspr2.S mips/jsimd.c)
+add_library(simd OBJECT mips/jsimd_dspr2.S mips/jsimd.c)
 
 if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
-  set_target_properties(jsimd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
 endif()
 
 ###############################################################################
@@ -454,6 +464,9 @@ elseif(CPU_TYPE STREQUAL "loongson" OR CPU_TYPE MATCHES "^mips64")
 set(CMAKE_REQUIRED_FLAGS -Wa,-mloongson-mmi,-mloongson-ext)
 
 check_c_source_compiles("
+  #if !(defined(__mips__) && __mips_isa_rev < 6)
+  #error Loongson MMI can't work with MIPS Release 6+
+  #endif
   int main(void) {
     int c = 0, a = 0, b = 0;
     asm (
@@ -487,10 +500,10 @@ foreach(file ${SIMD_SOURCES})
     " -Wa,-mloongson-mmi,-mloongson-ext")
 endforeach()
 
-add_library(jsimd OBJECT ${SIMD_SOURCES} mips64/jsimd.c)
+add_library(simd OBJECT ${SIMD_SOURCES} mips64/jsimd.c)
 
 if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
-  set_target_properties(jsimd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
 endif()
 
 ###############################################################################
@@ -527,10 +540,10 @@ set(SIMD_SOURCES powerpc/jccolor-altivec.c powerpc/jcgray-altivec.c
 set_source_files_properties(${SIMD_SOURCES} PROPERTIES
   COMPILE_FLAGS -maltivec)
 
-add_library(jsimd OBJECT ${SIMD_SOURCES} powerpc/jsimd.c)
+add_library(simd OBJECT ${SIMD_SOURCES} powerpc/jsimd.c)
 
 if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
-  set_target_properties(jsimd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
 endif()
 
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd.c
index e3adf23d5013..04d64526fb23 100644
--- a/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd.c
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch32/jsimd.c
@@ -4,7 +4,7 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
  * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  * Copyright (C) 2019, Google LLC.
  * Copyright (C) 2020, Arm Limited.
  *
@@ -25,12 +25,10 @@
 #include "../../../jsimddct.h"
 #include "../../jsimd.h"
 
-#include <stdio.h>
-#include <string.h>
 #include <ctype.h>
 
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_huffman = 1;
 
 #if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
 
@@ -98,8 +96,6 @@ parse_proc_cpuinfo(int bufsize)
 
 /*
  * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
  */
 LOCAL(void)
 init_simd(void)
@@ -947,7 +943,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
 GLOBAL(void)
 jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
-                                  int Al, JCOEF *values, size_t *zerobits)
+                                  int Al, UJCOEF *values, size_t *zerobits)
 {
   jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
                                          Sl, Al, values, zerobits);
@@ -972,7 +968,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
 GLOBAL(int)
 jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
-                                   int Al, JCOEF *absvalues, size_t *bits)
+                                   int Al, UJCOEF *absvalues, size_t *bits)
 {
   return jsimd_encode_mcu_AC_refine_prepare_neon(block,
                                                  jpeg_natural_order_start, Sl,
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd.c
index 604d5472f6a6..358e1597b165 100644
--- a/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd.c
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/aarch64/jsimd.c
@@ -4,7 +4,7 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
  * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
@@ -23,20 +23,17 @@
 #include "../../../jdct.h"
 #include "../../../jsimddct.h"
 #include "../../jsimd.h"
-#include "jconfigint.h"
 
-#include <stdio.h>
-#include <string.h>
 #include <ctype.h>
 
 #define JSIMD_FASTLD3  1
 #define JSIMD_FASTST3  2
 #define JSIMD_FASTTBL  4
 
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
-                                    JSIMD_FASTTBL;
+static THREAD_LOCAL unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_features = JSIMD_FASTLD3 |
+                                                 JSIMD_FASTST3 | JSIMD_FASTTBL;
 
 #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
 
@@ -111,8 +108,6 @@ parse_proc_cpuinfo(int bufsize)
 
 /*
  * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
  */
 
 /*
@@ -1023,7 +1018,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
 GLOBAL(void)
 jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
-                                  int Al, JCOEF *values, size_t *zerobits)
+                                  int Al, UJCOEF *values, size_t *zerobits)
 {
   jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
                                          Sl, Al, values, zerobits);
@@ -1050,7 +1045,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
 GLOBAL(int)
 jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
-                                   int Al, JCOEF *absvalues, size_t *bits)
+                                   int Al, UJCOEF *absvalues, size_t *bits)
 {
   return jsimd_encode_mcu_AC_refine_prepare_neon(block,
                                                  jpeg_natural_order_start,
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jcphuff-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jcphuff-neon.c
index b91c5db478a1..51db3c5f3939 100644
--- a/3rdparty/libjpeg-turbo/src/simd/arm/jcphuff-neon.c
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jcphuff-neon.c
@@ -2,6 +2,8 @@
  * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
  *
  * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2022, Matthieu Darbois.  All Rights Reserved.
+ * Copyright (C) 2022, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -21,7 +23,6 @@
  */
 
 #define JPEG_INTERNALS
-#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
@@ -41,10 +42,10 @@
 
 void jsimd_encode_mcu_AC_first_prepare_neon
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *values, size_t *zerobits)
+   UJCOEF *values, size_t *zerobits)
 {
-  JCOEF *values_ptr = values;
-  JCOEF *diff_values_ptr = values + DCTSIZE2;
+  UJCOEF *values_ptr = values;
+  UJCOEF *diff_values_ptr = values + DCTSIZE2;
 
   /* Rows of coefficients to zero (since they haven't been processed) */
   int i, rows_to_zero = 8;
@@ -68,23 +69,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon
     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
 
     /* Isolate sign of coefficients. */
-    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
-    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
+    uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
     /* Compute absolute value of coefficients and apply point transform Al. */
-    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
-    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
-    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
-    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
 
     /* Compute diff values. */
-    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
-    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+    uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
+    uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
 
     /* Store transformed coefficients and diff values. */
-    vst1q_s16(values_ptr, coefs1);
-    vst1q_s16(values_ptr + DCTSIZE, coefs2);
-    vst1q_s16(diff_values_ptr, diff1);
-    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    vst1q_u16(values_ptr, abs_coefs1);
+    vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
+    vst1q_u16(diff_values_ptr, diff1);
+    vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
     values_ptr += 16;
     diff_values_ptr += 16;
     jpeg_natural_order_start += 16;
@@ -130,23 +131,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon
     }
 
     /* Isolate sign of coefficients. */
-    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
-    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
+    uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
     /* Compute absolute value of coefficients and apply point transform Al. */
-    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
-    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
-    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
-    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
 
     /* Compute diff values. */
-    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
-    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+    uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
+    uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
 
     /* Store transformed coefficients and diff values. */
-    vst1q_s16(values_ptr, coefs1);
-    vst1q_s16(values_ptr + DCTSIZE, coefs2);
-    vst1q_s16(diff_values_ptr, diff1);
-    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    vst1q_u16(values_ptr, abs_coefs1);
+    vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
+    vst1q_u16(diff_values_ptr, diff1);
+    vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
     values_ptr += 16;
     diff_values_ptr += 16;
     rows_to_zero -= 2;
@@ -184,17 +185,17 @@ void jsimd_encode_mcu_AC_first_prepare_neon
     }
 
     /* Isolate sign of coefficients. */
-    int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+    uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
     /* Compute absolute value of coefficients and apply point transform Al. */
-    int16x8_t abs_coefs = vabsq_s16(coefs);
-    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+    uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
+    abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
 
     /* Compute diff values. */
-    int16x8_t diff = veorq_s16(coefs, sign_coefs);
+    uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);
 
     /* Store transformed coefficients and diff values. */
-    vst1q_s16(values_ptr, coefs);
-    vst1q_s16(diff_values_ptr, diff);
+    vst1q_u16(values_ptr, abs_coefs);
+    vst1q_u16(diff_values_ptr, diff);
     values_ptr += 8;
     diff_values_ptr += 8;
     rows_to_zero--;
@@ -202,8 +203,8 @@ void jsimd_encode_mcu_AC_first_prepare_neon
 
   /* Zero remaining memory in the values and diff_values blocks. */
   for (i = 0; i < rows_to_zero; i++) {
-    vst1q_s16(values_ptr, vdupq_n_s16(0));
-    vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+    vst1q_u16(values_ptr, vdupq_n_u16(0));
+    vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
     values_ptr += 8;
     diff_values_ptr += 8;
   }
@@ -211,23 +212,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon
   /* Construct zerobits bitmap.  A set bit means that the corresponding
    * coefficient != 0.
    */
-  int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
-  int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
-  int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
-  int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
-  int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
-  int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
-  int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
-  int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
-
-  uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
-  uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
-  uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
-  uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
-  uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
-  uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
-  uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
-  uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+  uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE);
+  uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
+  uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
+  uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
+  uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
+  uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
+  uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
+  uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);
+
+  uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0)));
+  uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
+  uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
+  uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
+  uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
+  uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
+  uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
+  uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));
 
   /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
   const uint8x8_t bitmap_mask =
@@ -274,7 +275,7 @@ void jsimd_encode_mcu_AC_first_prepare_neon
 
 int jsimd_encode_mcu_AC_refine_prepare_neon
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *absvalues, size_t *bits)
+   UJCOEF *absvalues, size_t *bits)
 {
   /* Temporary storage buffers for data used to compute the signbits bitmap and
    * the end-of-block (EOB) position
@@ -282,7 +283,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
   uint8_t coef_sign_bits[64];
   uint8_t coef_eq1_bits[64];
 
-  JCOEF *absvalues_ptr = absvalues;
+  UJCOEF *absvalues_ptr = absvalues;
   uint8_t *coef_sign_bits_ptr = coef_sign_bits;
   uint8_t *eq1_bits_ptr = coef_eq1_bits;
 
@@ -316,18 +317,18 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
     vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
 
     /* Compute absolute value of coefficients and apply point transform Al. */
-    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
-    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
-    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
-    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
-    vst1q_s16(absvalues_ptr, coefs1);
-    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_u16(absvalues_ptr, abs_coefs1);
+    vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
 
     /* Test whether transformed coefficient values == 1 (used to find EOB
      * position.)
      */
-    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
-    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
     vst1_u8(eq1_bits_ptr, coefs_eq11);
     vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
 
@@ -385,18 +386,18 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
     vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
 
     /* Compute absolute value of coefficients and apply point transform Al. */
-    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
-    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
-    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
-    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
-    vst1q_s16(absvalues_ptr, coefs1);
-    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_u16(absvalues_ptr, abs_coefs1);
+    vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
 
     /* Test whether transformed coefficient values == 1 (used to find EOB
      * position.)
      */
-    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
-    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
     vst1_u8(eq1_bits_ptr, coefs_eq11);
     vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
 
@@ -444,14 +445,14 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
     vst1_u8(coef_sign_bits_ptr, sign_coefs);
 
     /* Compute absolute value of coefficients and apply point transform Al. */
-    int16x8_t abs_coefs = vabsq_s16(coefs);
-    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
-    vst1q_s16(absvalues_ptr, coefs);
+    uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
+    abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
+    vst1q_u16(absvalues_ptr, abs_coefs);
 
     /* Test whether transformed coefficient values == 1 (used to find EOB
      * position.)
      */
-    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
     vst1_u8(eq1_bits_ptr, coefs_eq1);
 
     absvalues_ptr += 8;
@@ -462,7 +463,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
 
   /* Zero remaining memory in blocks. */
   for (i = 0; i < rows_to_zero; i++) {
-    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+    vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
     vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
     vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
     absvalues_ptr += 8;
@@ -471,23 +472,23 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
   }
 
   /* Construct zerobits bitmap. */
-  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
-  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
-  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
-  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
-  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
-  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
-  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
-  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
-
-  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
-  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
-  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
-  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
-  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
-  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
-  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
-  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+  uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE);
+  uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
+  uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
+  uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
+  uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
+  uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
+  uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
+  uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);
+
+  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0)));
+  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
+  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
+  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
+  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
+  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
+  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
+  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));
 
   /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
   const uint8x8_t bitmap_mask =
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jdcolor-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jdcolor-neon.c
index ea4668f1d308..28dbc57243ce 100644
--- a/3rdparty/libjpeg-turbo/src/simd/arm/jdcolor-neon.c
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jdcolor-neon.c
@@ -21,7 +21,6 @@
  */
 
 #define JPEG_INTERNALS
-#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jdmerge-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jdmerge-neon.c
index e4f91fdc0ef7..18fb9d8a55ab 100644
--- a/3rdparty/libjpeg-turbo/src/simd/arm/jdmerge-neon.c
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jdmerge-neon.c
@@ -21,7 +21,6 @@
  */
 
 #define JPEG_INTERNALS
-#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
diff --git a/3rdparty/libjpeg-turbo/src/simd/arm/jidctint-neon.c b/3rdparty/libjpeg-turbo/src/simd/arm/jidctint-neon.c
index 043b652e6c55..d25112ef7fd2 100644
--- a/3rdparty/libjpeg-turbo/src/simd/arm/jidctint-neon.c
+++ b/3rdparty/libjpeg-turbo/src/simd/arm/jidctint-neon.c
@@ -22,7 +22,6 @@
  */
 
 #define JPEG_INTERNALS
-#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-avx2.asm
index c46d684436dd..af6418f0a673 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-avx2.asm
@@ -2,7 +2,7 @@
 ; jccolext.asm - colorspace conversion (AVX2)
 ;
 ; Copyright (C) 2015, Intel Corporation.
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,15 +49,15 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         ecx, JDIMENSION [img_width(eax)]
     test        ecx, ecx
@@ -80,9 +80,9 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
     mov         eax, INT [num_rows(eax)]
     test        eax, eax
     jle         near .return
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
-    pushpic     eax
+    PUSHPIC     eax
     push        edx
     push        ebx
     push        edi
@@ -93,11 +93,11 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
     mov         edi, JSAMPROW [edi]     ; outptr0
     mov         ebx, JSAMPROW [ebx]     ; outptr1
     mov         edx, JSAMPROW [edx]     ; outptr2
-    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
 
     cmp         ecx, byte SIZEOF_YMMWORD
     jae         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 %if RGB_PIXELSIZE == 3  ; ---------------
 
@@ -154,7 +154,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
     vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
     vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
     jmp         short .rgb_ycc_cnv
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
@@ -278,7 +278,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
     vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
     vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
     jmp         short .rgb_ycc_cnv
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
@@ -552,7 +552,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
     pop         edi
     pop         ebx
     pop         edx
-    poppic      eax
+    POPPIC      eax
 
     add         esi, byte SIZEOF_JSAMPROW  ; input_buf
     add         edi, byte SIZEOF_JSAMPROW
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-mmx.asm
index 6357a42b2cf4..dbec80e787ba 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-mmx.asm
@@ -2,7 +2,7 @@
 ; jccolext.asm - colorspace conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,15 +49,15 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
     mov         [esp], eax
     mov         ebp, esp                    ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
     test        ecx, ecx
@@ -80,9 +80,9 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
     mov         eax, INT [num_rows(eax)]
     test        eax, eax
     jle         near .return
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
-    pushpic     eax
+    PUSHPIC     eax
     push        edx
     push        ebx
     push        edi
@@ -93,11 +93,11 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
     mov         edi, JSAMPROW [edi]     ; outptr0
     mov         ebx, JSAMPROW [ebx]     ; outptr1
     mov         edx, JSAMPROW [edx]     ; outptr2
-    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
 
     cmp         ecx, byte SIZEOF_MMWORD
     jae         short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 %if RGB_PIXELSIZE == 3  ; ---------------
 
@@ -143,7 +143,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
     movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
     movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
     jmp         short .rgb_ycc_cnv
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -211,7 +211,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
     movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
     movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
     jmp         short .rgb_ycc_cnv
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -449,7 +449,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
     pop         edi
     pop         ebx
     pop         edx
-    poppic      eax
+    POPPIC      eax
 
     add         esi, byte SIZEOF_JSAMPROW  ; input_buf
     add         edi, byte SIZEOF_JSAMPROW
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-sse2.asm
index c6c80852ac5b..8d411451788b 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolext-sse2.asm
@@ -1,7 +1,7 @@
 ;
 ; jccolext.asm - colorspace conversion (SSE2)
 ;
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -48,15 +48,15 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         ecx, JDIMENSION [img_width(eax)]
     test        ecx, ecx
@@ -79,9 +79,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
     mov         eax, INT [num_rows(eax)]
     test        eax, eax
     jle         near .return
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
-    pushpic     eax
+    PUSHPIC     eax
     push        edx
     push        ebx
     push        edi
@@ -92,11 +92,11 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
     mov         edi, JSAMPROW [edi]     ; outptr0
     mov         ebx, JSAMPROW [ebx]     ; outptr1
     mov         edx, JSAMPROW [edx]     ; outptr2
-    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
 
     cmp         ecx, byte SIZEOF_XMMWORD
     jae         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 %if RGB_PIXELSIZE == 3  ; ---------------
 
@@ -147,7 +147,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
     movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
     movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
     jmp         short .rgb_ycc_cnv
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -232,7 +232,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
     movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
     movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
     jmp         short .rgb_ycc_cnv
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -478,7 +478,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
     pop         edi
     pop         ebx
     pop         edx
-    poppic      eax
+    POPPIC      eax
 
     add         esi, byte SIZEOF_JSAMPROW  ; input_buf
     add         edi, byte SIZEOF_JSAMPROW
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-avx2.asm
index 14944e952f19..3d6dfa6f8cbb 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-avx2.asm
@@ -1,7 +1,7 @@
 ;
 ; jccolor.asm - colorspace conversion (AVX2)
 ;
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -33,7 +33,7 @@ F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
 
 EXTN(jconst_rgb_ycc_convert_avx2):
@@ -46,7 +46,7 @@ PD_ONEHALFM1_CJ times 8 dd  (1 << (SCALEBITS - 1)) - 1 + \
                             (CENTERJSAMPLE << SCALEBITS)
 PD_ONEHALF      times 8 dd  (1 << (SCALEBITS - 1))
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-mmx.asm
index 8cb399bdc43f..0527488500f5 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-mmx.asm
@@ -2,7 +2,7 @@
 ; jccolor.asm - colorspace conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,7 +33,7 @@ F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_rgb_ycc_convert_mmx)
 
 EXTN(jconst_rgb_ycc_convert_mmx):
@@ -46,7 +46,7 @@ PD_ONEHALFM1_CJ times 2 dd  (1 << (SCALEBITS - 1)) - 1 + \
                             (CENTERJSAMPLE << SCALEBITS)
 PD_ONEHALF      times 2 dd  (1 << (SCALEBITS - 1))
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-sse2.asm
index 686d222ff700..ff6a2ecd13af 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jccolor-sse2.asm
@@ -1,7 +1,7 @@
 ;
 ; jccolor.asm - colorspace conversion (SSE2)
 ;
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -32,7 +32,7 @@ F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
 
 EXTN(jconst_rgb_ycc_convert_sse2):
@@ -45,7 +45,7 @@ PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS - 1)) - 1 + \
                             (CENTERJSAMPLE << SCALEBITS)
 PD_ONEHALF      times 4 dd  (1 << (SCALEBITS - 1))
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-avx2.asm
index 560ee0c71e2b..564974f84951 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-avx2.asm
@@ -1,7 +1,7 @@
 ;
 ; jcgray.asm - grayscale colorspace conversion (AVX2)
 ;
-; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2011, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -29,7 +29,7 @@ F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
 
 EXTN(jconst_rgb_gray_convert_avx2):
@@ -38,7 +38,7 @@ PW_F0299_F0337 times 8 dw F_0_299, F_0_337
 PW_F0114_F0250 times 8 dw F_0_114, F_0_250
 PD_ONEHALF     times 8 dd (1 << (SCALEBITS - 1))
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-mmx.asm
index 79fdf082a848..e791ea4aa61b 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-mmx.asm
@@ -2,7 +2,7 @@
 ; jcgray.asm - grayscale colorspace conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2011, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -29,7 +29,7 @@ F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_rgb_gray_convert_mmx)
 
 EXTN(jconst_rgb_gray_convert_mmx):
@@ -38,7 +38,7 @@ PW_F0299_F0337 times 2 dw F_0_299, F_0_337
 PW_F0114_F0250 times 2 dw F_0_114, F_0_250
 PD_ONEHALF     times 2 dd (1 << (SCALEBITS - 1))
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-sse2.asm
index cb4b28e8f495..70c0177db318 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgray-sse2.asm
@@ -1,7 +1,7 @@
 ;
 ; jcgray.asm - grayscale colorspace conversion (SSE2)
 ;
-; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2011, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -28,7 +28,7 @@ F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
 
 EXTN(jconst_rgb_gray_convert_sse2):
@@ -37,7 +37,7 @@ PW_F0299_F0337 times 4 dw F_0_299, F_0_337
 PW_F0114_F0250 times 4 dw F_0_114, F_0_250
 PD_ONEHALF     times 4 dd (1 << (SCALEBITS - 1))
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-avx2.asm
index 3fa7973d72b4..0fb284aaf919 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-avx2.asm
@@ -1,7 +1,7 @@
 ;
 ; jcgryext.asm - grayscale colorspace conversion (AVX2)
 ;
-; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2011, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -49,15 +49,15 @@ EXTN(jsimd_rgb_gray_convert_avx2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         ecx, JDIMENSION [img_width(eax)]
     test        ecx, ecx
@@ -76,20 +76,20 @@ EXTN(jsimd_rgb_gray_convert_avx2):
     mov         eax, INT [num_rows(eax)]
     test        eax, eax
     jle         near .return
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
-    pushpic     eax
+    PUSHPIC     eax
     push        edi
     push        esi
     push        ecx                     ; col
 
     mov         esi, JSAMPROW [esi]     ; inptr
     mov         edi, JSAMPROW [edi]     ; outptr0
-    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
 
     cmp         ecx, byte SIZEOF_YMMWORD
     jae         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 %if RGB_PIXELSIZE == 3  ; ---------------
 
@@ -146,7 +146,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
     vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
     vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
     jmp         short .rgb_gray_cnv
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
@@ -270,7 +270,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
     vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
     vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
     jmp         short .rgb_gray_cnv
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
@@ -433,7 +433,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
     pop         ecx                     ; col
     pop         esi
     pop         edi
-    poppic      eax
+    POPPIC      eax
 
     add         esi, byte SIZEOF_JSAMPROW  ; input_buf
     add         edi, byte SIZEOF_JSAMPROW
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-mmx.asm
index 8af42e5a3322..1c69d3829167 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-mmx.asm
@@ -2,7 +2,7 @@
 ; jcgryext.asm - grayscale colorspace conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2011, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,15 +49,15 @@ EXTN(jsimd_rgb_gray_convert_mmx):
     mov         [esp], eax
     mov         ebp, esp                    ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
     test        ecx, ecx
@@ -76,20 +76,20 @@ EXTN(jsimd_rgb_gray_convert_mmx):
     mov         eax, INT [num_rows(eax)]
     test        eax, eax
     jle         near .return
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
-    pushpic     eax
+    PUSHPIC     eax
     push        edi
     push        esi
     push        ecx                     ; col
 
     mov         esi, JSAMPROW [esi]     ; inptr
     mov         edi, JSAMPROW [edi]     ; outptr0
-    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
 
     cmp         ecx, byte SIZEOF_MMWORD
     jae         short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 %if RGB_PIXELSIZE == 3  ; ---------------
 
@@ -135,7 +135,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
     movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
     movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
     jmp         short .rgb_gray_cnv
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -203,7 +203,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
     movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
     movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
     jmp         short .rgb_gray_cnv
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -330,7 +330,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
     pop         ecx                     ; col
     pop         esi
     pop         edi
-    poppic      eax
+    POPPIC      eax
 
     add         esi, byte SIZEOF_JSAMPROW  ; input_buf
     add         edi, byte SIZEOF_JSAMPROW
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-sse2.asm
index c9d6ff1e351c..f710816a443d 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcgryext-sse2.asm
@@ -1,7 +1,7 @@
 ;
 ; jcgryext.asm - grayscale colorspace conversion (SSE2)
 ;
-; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2011, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -48,15 +48,15 @@ EXTN(jsimd_rgb_gray_convert_sse2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         ecx, JDIMENSION [img_width(eax)]
     test        ecx, ecx
@@ -75,20 +75,20 @@ EXTN(jsimd_rgb_gray_convert_sse2):
     mov         eax, INT [num_rows(eax)]
     test        eax, eax
     jle         near .return
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
-    pushpic     eax
+    PUSHPIC     eax
     push        edi
     push        esi
     push        ecx                     ; col
 
     mov         esi, JSAMPROW [esi]     ; inptr
     mov         edi, JSAMPROW [edi]     ; outptr0
-    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
 
     cmp         ecx, byte SIZEOF_XMMWORD
     jae         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 %if RGB_PIXELSIZE == 3  ; ---------------
 
@@ -139,7 +139,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
     movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
     movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
     jmp         short .rgb_gray_cnv
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -224,7 +224,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
     movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
     movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
     jmp         short .rgb_gray_cnv
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -359,7 +359,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
     pop         ecx                     ; col
     pop         esi
     pop         edi
-    poppic      eax
+    POPPIC      eax
 
     add         esi, byte SIZEOF_JSAMPROW  ; input_buf
     add         edi, byte SIZEOF_JSAMPROW
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jchuff-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jchuff-sse2.asm
index 278cf5e83af3..4adf5eb51448 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jchuff-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jchuff-sse2.asm
@@ -1,7 +1,7 @@
 ;
 ; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
 ;
-; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
+; Copyright (C) 2009-2011, 2014-2017, 2019, 2024, D. R. Commander.
 ; Copyright (C) 2015, Matthieu Darbois.
 ; Copyright (C) 2018, Matthias Räncker.
 ;
@@ -42,7 +42,7 @@ endstruc
 
 EXTN(jconst_huff_encode_one_block):
 
-    alignz      32
+    ALIGNZ      32
 
 jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
                dq 0x000f, 0x001f, 0x003f, 0x007f
@@ -65,7 +65,8 @@ times 1 <<  2 db  3
 times 1 <<  1 db  2
 times 1 <<  0 db  1
 times 1       db  0
-jpeg_nbits_table:
+GLOBAL_DATA(jpeg_nbits_table)
+EXTN(jpeg_nbits_table):
 times 1       db  0
 times 1 <<  0 db  1
 times 1 <<  1 db  2
@@ -83,14 +84,14 @@ times 1 << 12 db 13
 times 1 << 13 db 14
 times 1 << 14 db 15
 
-    alignz      32
+    ALIGNZ      32
 
 %ifdef PIC
 %define NBITS(x)      nbits_base + x
 %else
-%define NBITS(x)      jpeg_nbits_table + x
+%define NBITS(x)      EXTN(jpeg_nbits_table) + x
 %endif
-%define MASK_BITS(x)  NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
+%define MASK_BITS(x)  NBITS((x) * 8) + (jpeg_mask_bits - EXTN(jpeg_nbits_table))
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -235,7 +236,7 @@ times 1 << 14 db 15
 
 ; If PIC is defined, load the address of a symbol defined in this file into a
 ; register.  Equivalent to
-;   get_GOT     %1
+;   GET_GOT     %1
 ;   lea         %1, [GOTOFF(%1, %2)]
 ; without using the GOT.
 ;
@@ -469,7 +470,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     pcmpeqw     mm_all_0xff, mm_all_0xff                  ;Z:     all_0xff[i] = 0xFF;
 %endmacro
 
-    GET_SYM     nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
+    GET_SYM     nbits_base, EXTN(jpeg_nbits_table), GET_SYM_BEFORE, GET_SYM_AFTER
 
     psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
     shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-avx2.asm
index 0a20802dd890..3d40f1d9fb6a 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-avx2.asm
@@ -3,7 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2015, Intel Corporation.
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -70,7 +70,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
 
     cld
     mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .expandloop:
     push        eax
     push        ecx
@@ -106,7 +106,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
 
     mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
     mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        ecx
     push        edi
@@ -117,7 +117,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
 
     cmp         ecx, byte SIZEOF_YMMWORD
     jae         short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop_r24:
     ; ecx can possibly be 8, 16, 24
@@ -141,7 +141,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
     vpxor       ymm1, ymm1, ymm1
     mov         ecx, SIZEOF_YMMWORD
     jmp         short .downsample
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
@@ -243,7 +243,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
 
     cld
     mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .expandloop:
     push        eax
     push        ecx
@@ -279,7 +279,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
 
     mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
     mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        ecx
     push        edi
@@ -291,7 +291,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
 
     cmp         ecx, byte SIZEOF_YMMWORD
     jae         short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop_r24:
     cmp         ecx, 24
@@ -320,7 +320,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
     vpxor       ymm3, ymm3, ymm3
     mov         ecx, SIZEOF_YMMWORD
     jmp         short .downsample
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-mmx.asm
index 2c223eebe816..38d5b322b65f 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-mmx.asm
@@ -2,7 +2,7 @@
 ; jcsample.asm - downsampling (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -69,7 +69,7 @@ EXTN(jsimd_h2v1_downsample_mmx):
 
     cld
     mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .expandloop:
     push        eax
     push        ecx
@@ -104,7 +104,7 @@ EXTN(jsimd_h2v1_downsample_mmx):
 
     mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
     mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        ecx
     push        edi
@@ -112,7 +112,7 @@ EXTN(jsimd_h2v1_downsample_mmx):
 
     mov         esi, JSAMPROW [esi]     ; inptr
     mov         edi, JSAMPROW [edi]     ; outptr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 
     movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -212,7 +212,7 @@ EXTN(jsimd_h2v2_downsample_mmx):
 
     cld
     mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .expandloop:
     push        eax
     push        ecx
@@ -247,7 +247,7 @@ EXTN(jsimd_h2v2_downsample_mmx):
 
     mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
     mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        ecx
     push        edi
@@ -256,7 +256,7 @@ EXTN(jsimd_h2v2_downsample_mmx):
     mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
     mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
     mov         edi, JSAMPROW [edi]                    ; outptr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 
     movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-sse2.asm
index 4fea60d2e210..26c5d7407e1c 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jcsample-sse2.asm
@@ -2,7 +2,7 @@
 ; jcsample.asm - downsampling (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -69,7 +69,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
 
     cld
     mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .expandloop:
     push        eax
     push        ecx
@@ -104,7 +104,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
 
     mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
     mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        ecx
     push        edi
@@ -115,14 +115,14 @@ EXTN(jsimd_h2v1_downsample_sse2):
 
     cmp         ecx, byte SIZEOF_XMMWORD
     jae         short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop_r8:
     movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
     pxor        xmm1, xmm1
     mov         ecx, SIZEOF_XMMWORD
     jmp         short .downsample
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -225,7 +225,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
 
     cld
     mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .expandloop:
     push        eax
     push        ecx
@@ -260,7 +260,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
 
     mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
     mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        ecx
     push        edi
@@ -272,7 +272,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
 
     cmp         ecx, byte SIZEOF_XMMWORD
     jae         short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop_r8:
     movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
@@ -281,7 +281,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
     pxor        xmm3, xmm3
     mov         ecx, SIZEOF_XMMWORD
     jmp         short .downsample
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-avx2.asm
index 015be0416c5c..53ea3128fc14 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-avx2.asm
@@ -2,7 +2,7 @@
 ; jdcolext.asm - colorspace conversion (AVX2)
 ;
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2012, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -50,15 +50,15 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
     test        ecx, ecx
@@ -81,7 +81,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     mov         eax, INT [num_rows(eax)]
     test        eax, eax
     jle         near .return
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        eax
     push        edi
@@ -94,8 +94,8 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     mov         ebx, JSAMPROW [ebx]     ; inptr1
     mov         edx, JSAMPROW [edx]     ; inptr2
     mov         edi, JSAMPROW [edi]     ; outptr
-    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
-    alignx      16, 7
+    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
+    ALIGNX      16, 7
 .columnloop:
 
     vmovdqu     ymm5, YMMWORD [ebx]     ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
@@ -295,7 +295,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     add         ebx, byte SIZEOF_YMMWORD  ; inptr1
     add         edx, byte SIZEOF_YMMWORD  ; inptr2
     jmp         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .column_st64:
     lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
@@ -436,7 +436,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     add         ebx, byte SIZEOF_YMMWORD  ; inptr1
     add         edx, byte SIZEOF_YMMWORD  ; inptr2
     jmp         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .column_st64:
     cmp         ecx, byte SIZEOF_YMMWORD/2
@@ -479,7 +479,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
 
 %endif  ; RGB_PIXELSIZE ; ---------------
 
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .nextrow:
     pop         ecx
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-mmx.asm
index 5813cfcb66f5..d97faee004af 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-mmx.asm
@@ -2,7 +2,7 @@
 ; jdcolext.asm - colorspace conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,15 +49,15 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
     mov         [esp], eax
     mov         ebp, esp                    ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
     test        ecx, ecx
@@ -80,7 +80,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
     mov         eax, INT [num_rows(eax)]
     test        eax, eax
     jle         near .return
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        eax
     push        edi
@@ -93,8 +93,8 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
     mov         ebx, JSAMPROW [ebx]     ; inptr1
     mov         edx, JSAMPROW [edx]     ; inptr2
     mov         edi, JSAMPROW [edi]     ; outptr
-    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
-    alignx      16, 7
+    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
+    ALIGNX      16, 7
 .columnloop:
 
     movq        mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
@@ -255,7 +255,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
     add         edx, byte SIZEOF_MMWORD                ; inptr2
     add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
     jmp         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .column_st16:
     lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
@@ -344,7 +344,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
     add         edx, byte SIZEOF_MMWORD                ; inptr2
     add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
     jmp         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .column_st16:
     cmp         ecx, byte SIZEOF_MMWORD/2
@@ -369,7 +369,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
 
 %endif  ; RGB_PIXELSIZE ; ---------------
 
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .nextrow:
     pop         ecx
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-sse2.asm
index d5572b32946c..682efc730fc0 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolext-sse2.asm
@@ -2,7 +2,7 @@
 ; jdcolext.asm - colorspace conversion (SSE2)
 ;
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2012, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,15 +49,15 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
     test        ecx, ecx
@@ -80,7 +80,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     mov         eax, INT [num_rows(eax)]
     test        eax, eax
     jle         near .return
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        eax
     push        edi
@@ -93,8 +93,8 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     mov         ebx, JSAMPROW [ebx]     ; inptr1
     mov         edx, JSAMPROW [edx]     ; inptr2
     mov         edi, JSAMPROW [edi]     ; outptr
-    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
-    alignx      16, 7
+    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
+    ALIGNX      16, 7
 .columnloop:
 
     movdqa      xmm5, XMMWORD [ebx]     ; xmm5=Cb(0123456789ABCDEF)
@@ -275,7 +275,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     add         ebx, byte SIZEOF_XMMWORD  ; inptr1
     add         edx, byte SIZEOF_XMMWORD  ; inptr2
     jmp         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .column_st32:
     lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
@@ -387,7 +387,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     add         ebx, byte SIZEOF_XMMWORD  ; inptr1
     add         edx, byte SIZEOF_XMMWORD  ; inptr2
     jmp         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .column_st32:
     cmp         ecx, byte SIZEOF_XMMWORD/2
@@ -423,7 +423,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 
 %endif  ; RGB_PIXELSIZE ; ---------------
 
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .nextrow:
     pop         ecx
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-avx2.asm
index e05b60d00179..0f9baf840c58 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-avx2.asm
@@ -3,7 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2015, Intel Corporation.
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -32,7 +32,7 @@ F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
 
 EXTN(jconst_ycc_rgb_convert_avx2):
@@ -43,7 +43,7 @@ PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
 PW_ONE          times 16 dw  1
 PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-mmx.asm
index fb7e7bcce4b0..21e833292c0c 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-mmx.asm
@@ -2,7 +2,7 @@
 ; jdcolor.asm - colorspace conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -31,7 +31,7 @@ F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_ycc_rgb_convert_mmx)
 
 EXTN(jconst_ycc_rgb_convert_mmx):
@@ -42,7 +42,7 @@ PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
 PW_ONE          times 4 dw  1
 PD_ONEHALF      times 2 dd  1 << (SCALEBITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-sse2.asm
index b736255317e3..481d0e4c9578 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdcolor-sse2.asm
@@ -2,7 +2,7 @@
 ; jdcolor.asm - colorspace conversion (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -31,7 +31,7 @@ F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
 
 EXTN(jconst_ycc_rgb_convert_sse2):
@@ -42,7 +42,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
 PW_ONE          times 8 dw  1
 PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-avx2.asm
index 711e6792d0f5..00201dc419da 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-avx2.asm
@@ -2,7 +2,7 @@
 ; jdmerge.asm - merged upsampling/color conversion (AVX2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -32,7 +32,7 @@ F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_merged_upsample_avx2)
 
 EXTN(jconst_merged_upsample_avx2):
@@ -43,7 +43,7 @@ PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
 PW_ONE          times 16 dw  1
 PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-mmx.asm
index 6e8311d40816..be28c63f539c 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-mmx.asm
@@ -2,7 +2,7 @@
 ; jdmerge.asm - merged upsampling/color conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -31,7 +31,7 @@ F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_merged_upsample_mmx)
 
 EXTN(jconst_merged_upsample_mmx):
@@ -42,7 +42,7 @@ PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
 PW_ONE          times 4 dw  1
 PD_ONEHALF      times 2 dd  1 << (SCALEBITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-sse2.asm
index e32f90aa1778..9b40a67dbd67 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmerge-sse2.asm
@@ -2,7 +2,7 @@
 ; jdmerge.asm - merged upsampling/color conversion (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -31,7 +31,7 @@ F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_merged_upsample_sse2)
 
 EXTN(jconst_merged_upsample_sse2):
@@ -42,7 +42,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
 PW_ONE          times 8 dw  1
 PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-avx2.asm
index e35f7282bc41..97988eb60272 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-avx2.asm
@@ -2,7 +2,7 @@
 ; jdmrgext.asm - merged upsampling/color conversion (AVX2)
 ;
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2012, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -50,15 +50,15 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         ecx, JDIMENSION [output_width(eax)]  ; col
     test        ecx, ecx
@@ -79,9 +79,9 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
 
     pop         ecx                     ; col
 
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
-    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
 
     vmovdqu     ymm6, YMMWORD [ebx]     ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
     vmovdqu     ymm7, YMMWORD [edx]     ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
@@ -168,13 +168,13 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
 
     mov         al, 2                   ; Yctr
     jmp         short .Yloop_1st
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .Yloop_2nd:
     vmovdqa     ymm0, YMMWORD [wk(1)]   ; ymm0=(R-Y)H
     vmovdqa     ymm2, YMMWORD [wk(2)]   ; ymm2=(G-Y)H
     vmovdqa     ymm4, YMMWORD [wk(0)]   ; ymm4=(B-Y)H
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .Yloop_1st:
     vmovdqu     ymm7, YMMWORD [esi]     ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
@@ -301,7 +301,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     add         ebx, byte SIZEOF_YMMWORD  ; inptr1
     add         edx, byte SIZEOF_YMMWORD  ; inptr2
     jmp         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .column_st64:
     lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
@@ -445,7 +445,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     add         ebx, byte SIZEOF_YMMWORD  ; inptr1
     add         edx, byte SIZEOF_YMMWORD  ; inptr2
     jmp         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .column_st64:
     cmp         ecx, byte SIZEOF_YMMWORD/2
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-mmx.asm
index eb3e36b4759b..79cee73dbdec 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-mmx.asm
@@ -2,7 +2,7 @@
 ; jdmrgext.asm - merged upsampling/color conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -47,15 +47,15 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
     mov         [esp], eax
     mov         ebp, esp                    ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         ecx, JDIMENSION [output_width(eax)]  ; col
     test        ecx, ecx
@@ -76,9 +76,9 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
 
     pop         ecx                     ; col
 
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
-    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
 
     movq        mm6, MMWORD [ebx]       ; mm6=Cb(01234567)
     movq        mm7, MMWORD [edx]       ; mm7=Cr(01234567)
@@ -171,13 +171,13 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
 
     mov         al, 2                   ; Yctr
     jmp         short .Yloop_1st
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .Yloop_2nd:
     movq        mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
     movq        mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
     movq        mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .Yloop_1st:
     movq        mm7, MMWORD [esi]       ; mm7=Y(01234567)
@@ -258,7 +258,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
     add         ebx, byte SIZEOF_MMWORD                ; inptr1
     add         edx, byte SIZEOF_MMWORD                ; inptr2
     jmp         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .column_st16:
     lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
@@ -350,7 +350,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
     add         ebx, byte SIZEOF_MMWORD                ; inptr1
     add         edx, byte SIZEOF_MMWORD                ; inptr2
     jmp         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .column_st16:
     cmp         ecx, byte SIZEOF_MMWORD/2
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-sse2.asm
index c113dc4d27ed..331344358b19 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdmrgext-sse2.asm
@@ -2,7 +2,7 @@
 ; jdmrgext.asm - merged upsampling/color conversion (SSE2)
 ;
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2012, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,15 +49,15 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         ecx, JDIMENSION [output_width(eax)]  ; col
     test        ecx, ecx
@@ -78,9 +78,9 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 
     pop         ecx                     ; col
 
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
-    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
 
     movdqa      xmm6, XMMWORD [ebx]     ; xmm6=Cb(0123456789ABCDEF)
     movdqa      xmm7, XMMWORD [edx]     ; xmm7=Cr(0123456789ABCDEF)
@@ -173,13 +173,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 
     mov         al, 2                   ; Yctr
     jmp         short .Yloop_1st
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .Yloop_2nd:
     movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
     movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
     movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .Yloop_1st:
     movdqa      xmm7, XMMWORD [esi]     ; xmm7=Y(0123456789ABCDEF)
@@ -280,7 +280,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
     add         ebx, byte SIZEOF_XMMWORD  ; inptr1
     add         edx, byte SIZEOF_XMMWORD  ; inptr2
     jmp         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .column_st32:
     lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
@@ -395,7 +395,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
     add         ebx, byte SIZEOF_XMMWORD  ; inptr1
     add         edx, byte SIZEOF_XMMWORD  ; inptr2
     jmp         near .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .column_st32:
     cmp         ecx, byte SIZEOF_XMMWORD/2
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-avx2.asm
index a800c35e0835..b0507aa5d610 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-avx2.asm
@@ -3,7 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2015, Intel Corporation.
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,7 +20,7 @@
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fancy_upsample_avx2)
 
 EXTN(jconst_fancy_upsample_avx2):
@@ -31,7 +31,7 @@ PW_THREE times 16 dw 3
 PW_SEVEN times 16 dw 7
 PW_EIGHT times 16 dw 8
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -62,13 +62,13 @@ PW_EIGHT times 16 dw 8
 EXTN(jsimd_h2v1_fancy_upsample_avx2):
     push        ebp
     mov         ebp, esp
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
     test        eax, eax
@@ -81,7 +81,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
     mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
     mov         edi, POINTER [output_data_ptr(ebp)]
     mov         edi, JSAMPARRAY [edi]                ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        eax                     ; colctr
     push        edi
@@ -104,7 +104,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
     and         eax, byte -SIZEOF_YMMWORD
     cmp         eax, byte SIZEOF_YMMWORD
     ja          short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop_last:
     vpcmpeqb    xmm6, xmm6, xmm6
@@ -112,7 +112,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
     vperm2i128  ymm6, ymm6, ymm6, 1             ; (---- ---- ... ---- ---- ff) MSB is ff
     vpand       ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
     jmp         short .upsample
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     vmovdqu     ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
@@ -196,7 +196,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
     pop         esi
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; need not be preserved
-    poppic      ebx
+    POPPIC      ebx
     pop         ebp
     ret
 
@@ -234,15 +234,15 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         edx, eax                ; edx = original ebp
     mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
@@ -256,7 +256,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
     mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
     mov         edi, POINTER [output_data_ptr(edx)]
     mov         edi, JSAMPARRAY [edi]                ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        eax                     ; colctr
     push        ecx
@@ -286,8 +286,8 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
     vmovdqu     ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
     vmovdqu     ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
 
-    pushpic     ebx
-    movpic      ebx, POINTER [gotptr]   ; load GOT address
+    PUSHPIC     ebx
+    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address
 
     vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
 
@@ -328,19 +328,19 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
     vmovdqa     YMMWORD [wk(0)], ymm1
     vmovdqa     YMMWORD [wk(1)], ymm2
 
-    poppic      ebx
+    POPPIC      ebx
 
     add         eax, byte SIZEOF_YMMWORD-1
     and         eax, byte -SIZEOF_YMMWORD
     cmp         eax, byte SIZEOF_YMMWORD
     ja          short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop_last:
     ; -- process the last column block
 
-    pushpic     ebx
-    movpic      ebx, POINTER [gotptr]   ; load GOT address
+    PUSHPIC     ebx
+    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address
 
     vpcmpeqb    xmm1, xmm1, xmm1
     vpslldq     xmm1, xmm1, (SIZEOF_XMMWORD-2)
@@ -353,7 +353,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
     vmovdqa     YMMWORD [wk(3)], ymm2          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
 
     jmp         near .upsample
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     ; -- process the next column block
@@ -362,8 +362,8 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
     vmovdqu     ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
     vmovdqu     ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
 
-    pushpic     ebx
-    movpic      ebx, POINTER [gotptr]   ; load GOT address
+    PUSHPIC     ebx
+    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address
 
     vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
 
@@ -516,7 +516,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
     vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
     vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
 
-    poppic      ebx
+    POPPIC      ebx
 
     sub         eax, byte SIZEOF_YMMWORD
     add         ecx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
@@ -590,7 +590,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
     mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
     mov         edi, POINTER [output_data_ptr(ebp)]
     mov         edi, JSAMPARRAY [edi]                ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        edi
     push        esi
@@ -598,7 +598,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
     mov         esi, JSAMPROW [esi]     ; inptr
     mov         edi, JSAMPROW [edi]     ; outptr
     mov         eax, edx                ; colctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 
     cmp         eax, byte SIZEOF_YMMWORD
@@ -629,7 +629,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
     add         esi, byte SIZEOF_YMMWORD    ; inptr
     add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
     jmp         short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .nextrow:
     pop         esi
@@ -689,7 +689,7 @@ EXTN(jsimd_h2v2_upsample_avx2):
     mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
     mov         edi, POINTER [output_data_ptr(ebp)]
     mov         edi, JSAMPARRAY [edi]                ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        edi
     push        esi
@@ -698,7 +698,7 @@ EXTN(jsimd_h2v2_upsample_avx2):
     mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
     mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
     mov         eax, edx                               ; colctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 
     cmp         eax, byte SIZEOF_YMMWORD
@@ -734,7 +734,7 @@ EXTN(jsimd_h2v2_upsample_avx2):
     add         ebx, 2*SIZEOF_YMMWORD     ; outptr0
     add         edi, 2*SIZEOF_YMMWORD     ; outptr1
     jmp         short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .nextrow:
     pop         esi
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-mmx.asm
index 12c49f0eab57..6f70499c9777 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-mmx.asm
@@ -2,7 +2,7 @@
 ; jdsample.asm - upsampling (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -19,7 +19,7 @@
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fancy_upsample_mmx)
 
 EXTN(jconst_fancy_upsample_mmx):
@@ -30,7 +30,7 @@ PW_THREE times 4 dw 3
 PW_SEVEN times 4 dw 7
 PW_EIGHT times 4 dw 8
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -61,13 +61,13 @@ PW_EIGHT times 4 dw 8
 EXTN(jsimd_h2v1_fancy_upsample_mmx):
     push        ebp
     mov         ebp, esp
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
     test        eax, eax
@@ -80,7 +80,7 @@ EXTN(jsimd_h2v1_fancy_upsample_mmx):
     mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
     mov         edi, POINTER [output_data_ptr(ebp)]
     mov         edi, JSAMPARRAY [edi]                ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        eax                     ; colctr
     push        edi
@@ -103,14 +103,14 @@ EXTN(jsimd_h2v1_fancy_upsample_mmx):
     and         eax, byte -SIZEOF_MMWORD
     cmp         eax, byte SIZEOF_MMWORD
     ja          short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop_last:
     pcmpeqb     mm6, mm6
     psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
     pand        mm6, MMWORD [esi+0*SIZEOF_MMWORD]
     jmp         short .upsample
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     movq        mm6, MMWORD [esi+1*SIZEOF_MMWORD]
@@ -187,7 +187,7 @@ EXTN(jsimd_h2v1_fancy_upsample_mmx):
     pop         esi
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; need not be preserved
-    poppic      ebx
+    POPPIC      ebx
     pop         ebp
     ret
 
@@ -224,15 +224,15 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
     mov         [esp], eax
     mov         ebp, esp                    ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         edx, eax                ; edx = original ebp
     mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
@@ -246,7 +246,7 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
     mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
     mov         edi, POINTER [output_data_ptr(edx)]
     mov         edi, JSAMPARRAY [edi]                ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        eax                     ; colctr
     push        ecx
@@ -276,8 +276,8 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
     movq        mm1, MMWORD [ecx+0*SIZEOF_MMWORD]  ; mm1=row[-1][0]
     movq        mm2, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm2=row[+1][0]
 
-    pushpic     ebx
-    movpic      ebx, POINTER [gotptr]   ; load GOT address
+    PUSHPIC     ebx
+    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address
 
     pxor        mm3, mm3                ; mm3=(all 0's)
     movq        mm4, mm0
@@ -312,19 +312,19 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
     movq        MMWORD [wk(0)], mm1
     movq        MMWORD [wk(1)], mm2
 
-    poppic      ebx
+    POPPIC      ebx
 
     add         eax, byte SIZEOF_MMWORD-1
     and         eax, byte -SIZEOF_MMWORD
     cmp         eax, byte SIZEOF_MMWORD
     ja          short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop_last:
     ; -- process the last column block
 
-    pushpic     ebx
-    movpic      ebx, POINTER [gotptr]   ; load GOT address
+    PUSHPIC     ebx
+    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address
 
     pcmpeqb     mm1, mm1
     psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
@@ -337,7 +337,7 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
     movq        MMWORD [wk(3)], mm2
 
     jmp         short .upsample
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     ; -- process the next column block
@@ -346,8 +346,8 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
     movq        mm1, MMWORD [ecx+1*SIZEOF_MMWORD]  ; mm1=row[-1][1]
     movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]  ; mm2=row[+1][1]
 
-    pushpic     ebx
-    movpic      ebx, POINTER [gotptr]   ; load GOT address
+    PUSHPIC     ebx
+    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address
 
     pxor        mm3, mm3                ; mm3=(all 0's)
     movq        mm4, mm0
@@ -486,7 +486,7 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx):
     movq        MMWORD [edi+0*SIZEOF_MMWORD], mm1
     movq        MMWORD [edi+1*SIZEOF_MMWORD], mm0
 
-    poppic      ebx
+    POPPIC      ebx
 
     sub         eax, byte SIZEOF_MMWORD
     add         ecx, byte 1*SIZEOF_MMWORD  ; inptr1(above)
@@ -561,7 +561,7 @@ EXTN(jsimd_h2v1_upsample_mmx):
     mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
     mov         edi, POINTER [output_data_ptr(ebp)]
     mov         edi, JSAMPARRAY [edi]                ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        edi
     push        esi
@@ -569,7 +569,7 @@ EXTN(jsimd_h2v1_upsample_mmx):
     mov         esi, JSAMPROW [esi]     ; inptr
     mov         edi, JSAMPROW [edi]     ; outptr
     mov         eax, edx                ; colctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 
     movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -599,7 +599,7 @@ EXTN(jsimd_h2v1_upsample_mmx):
     add         esi, byte 2*SIZEOF_MMWORD  ; inptr
     add         edi, byte 4*SIZEOF_MMWORD  ; outptr
     jmp         short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .nextrow:
     pop         esi
@@ -660,7 +660,7 @@ EXTN(jsimd_h2v2_upsample_mmx):
     mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
     mov         edi, POINTER [output_data_ptr(ebp)]
     mov         edi, JSAMPARRAY [edi]                ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        edi
     push        esi
@@ -669,7 +669,7 @@ EXTN(jsimd_h2v2_upsample_mmx):
     mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
     mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
     mov         eax, edx                               ; colctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 
     movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
@@ -704,7 +704,7 @@ EXTN(jsimd_h2v2_upsample_mmx):
     add         ebx, byte 4*SIZEOF_MMWORD  ; outptr0
     add         edi, byte 4*SIZEOF_MMWORD  ; outptr1
     jmp         short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .nextrow:
     pop         esi
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-sse2.asm
index 4e28d2f4b802..f68c5ea54500 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jdsample-sse2.asm
@@ -2,7 +2,7 @@
 ; jdsample.asm - upsampling (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -19,7 +19,7 @@
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fancy_upsample_sse2)
 
 EXTN(jconst_fancy_upsample_sse2):
@@ -30,7 +30,7 @@ PW_THREE times 8 dw 3
 PW_SEVEN times 8 dw 7
 PW_EIGHT times 8 dw 8
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -61,13 +61,13 @@ PW_EIGHT times 8 dw 8
 EXTN(jsimd_h2v1_fancy_upsample_sse2):
     push        ebp
     mov         ebp, esp
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
     test        eax, eax
@@ -80,7 +80,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
     mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
     mov         edi, POINTER [output_data_ptr(ebp)]
     mov         edi, JSAMPARRAY [edi]                ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        eax                     ; colctr
     push        edi
@@ -103,14 +103,14 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
     and         eax, byte -SIZEOF_XMMWORD
     cmp         eax, byte SIZEOF_XMMWORD
     ja          short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop_last:
     pcmpeqb     xmm6, xmm6
     pslldq      xmm6, (SIZEOF_XMMWORD-1)
     pand        xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
     jmp         short .upsample
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     movdqa      xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
@@ -185,7 +185,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
     pop         esi
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; need not be preserved
-    poppic      ebx
+    POPPIC      ebx
     pop         ebp
     ret
 
@@ -223,15 +223,15 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     eax                     ; make a room for GOT address
+    PUSHPIC     eax                     ; make a room for GOT address
     push        ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
-    movpic      POINTER [gotptr], ebx   ; save GOT address
+    GET_GOT     ebx                     ; get GOT address
+    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
 
     mov         edx, eax                ; edx = original ebp
     mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
@@ -245,7 +245,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
     mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
     mov         edi, POINTER [output_data_ptr(edx)]
     mov         edi, JSAMPARRAY [edi]                ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        eax                     ; colctr
     push        ecx
@@ -275,8 +275,8 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
     movdqa      xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
     movdqa      xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
 
-    pushpic     ebx
-    movpic      ebx, POINTER [gotptr]   ; load GOT address
+    PUSHPIC     ebx
+    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address
 
     pxor        xmm3, xmm3              ; xmm3=(all 0's)
     movdqa      xmm4, xmm0
@@ -311,19 +311,19 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
     movdqa      XMMWORD [wk(0)], xmm1
     movdqa      XMMWORD [wk(1)], xmm2
 
-    poppic      ebx
+    POPPIC      ebx
 
     add         eax, byte SIZEOF_XMMWORD-1
     and         eax, byte -SIZEOF_XMMWORD
     cmp         eax, byte SIZEOF_XMMWORD
     ja          short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop_last:
     ; -- process the last column block
 
-    pushpic     ebx
-    movpic      ebx, POINTER [gotptr]   ; load GOT address
+    PUSHPIC     ebx
+    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address
 
     pcmpeqb     xmm1, xmm1
     pslldq      xmm1, (SIZEOF_XMMWORD-2)
@@ -336,7 +336,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
     movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
 
     jmp         near .upsample
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .columnloop:
     ; -- process the next column block
@@ -345,8 +345,8 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
     movdqa      xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
     movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
 
-    pushpic     ebx
-    movpic      ebx, POINTER [gotptr]   ; load GOT address
+    PUSHPIC     ebx
+    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address
 
     pxor        xmm3, xmm3              ; xmm3=(all 0's)
     movdqa      xmm4, xmm0
@@ -485,7 +485,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
     movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
     movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
 
-    poppic      ebx
+    POPPIC      ebx
 
     sub         eax, byte SIZEOF_XMMWORD
     add         ecx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
@@ -558,7 +558,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
     mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
     mov         edi, POINTER [output_data_ptr(ebp)]
     mov         edi, JSAMPARRAY [edi]                ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        edi
     push        esi
@@ -566,7 +566,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
     mov         esi, JSAMPROW [esi]     ; inptr
     mov         edi, JSAMPROW [edi]     ; outptr
     mov         eax, edx                ; colctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 
     movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -596,7 +596,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
     add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
     add         edi, byte 4*SIZEOF_XMMWORD  ; outptr
     jmp         short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .nextrow:
     pop         esi
@@ -655,7 +655,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
     mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
     mov         edi, POINTER [output_data_ptr(ebp)]
     mov         edi, JSAMPARRAY [edi]                ; output_data
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
     push        edi
     push        esi
@@ -664,7 +664,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
     mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
     mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
     mov         eax, edx                               ; colctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 
     movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
@@ -699,7 +699,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
     add         ebx, byte 4*SIZEOF_XMMWORD  ; outptr0
     add         edi, byte 4*SIZEOF_XMMWORD  ; outptr1
     jmp         short .columnloop
-    alignx      16, 7
+    ALIGNX      16, 7
 
 .nextrow:
     pop         esi
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-3dn.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-3dn.asm
index 322ab1632526..34af2bf0ba0f 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-3dn.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-3dn.asm
@@ -2,7 +2,7 @@
 ; jfdctflt.asm - floating-point FDCT (3DNow!)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -24,7 +24,7 @@
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fdct_float_3dnow)
 
 EXTN(jconst_fdct_float_3dnow):
@@ -34,7 +34,7 @@ PD_0_707 times 2 dd 0.707106781186547524400844
 PD_0_541 times 2 dd 0.541196100146196984399723
 PD_1_306 times 2 dd 1.306562964876376527856643
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -63,19 +63,19 @@ EXTN(jsimd_fdct_float_3dnow):
     mov         [esp], eax
     mov         ebp, esp                    ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
 ;   push        esi                     ; unused
 ;   push        edi                     ; unused
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process rows.
 
     mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
     mov         ecx, DCTSIZE/2
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
 
     movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
@@ -190,7 +190,7 @@ EXTN(jsimd_fdct_float_3dnow):
 
     mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
     mov         ecx, DCTSIZE/2
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 
     movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
@@ -307,7 +307,7 @@ EXTN(jsimd_fdct_float_3dnow):
 ;   pop         esi                     ; unused
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; need not be preserved
-    poppic      ebx
+    POPPIC      ebx
     mov         esp, ebp                ; esp <- aligned ebp
     pop         esp                     ; esp <- original ebp
     pop         ebp
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-sse.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-sse.asm
index 86952c6499cf..d247094b648b 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-sse.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctflt-sse.asm
@@ -2,7 +2,7 @@
 ; jfdctflt.asm - floating-point FDCT (SSE)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -34,7 +34,7 @@
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fdct_float_sse)
 
 EXTN(jconst_fdct_float_sse):
@@ -44,7 +44,7 @@ PD_0_707 times 4 dd 0.707106781186547524400844
 PD_0_541 times 4 dd 0.541196100146196984399723
 PD_1_306 times 4 dd 1.306562964876376527856643
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -74,19 +74,19 @@ EXTN(jsimd_fdct_float_sse):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
 ;   push        esi                     ; unused
 ;   push        edi                     ; unused
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process rows.
 
     mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
     mov         ecx, DCTSIZE/4
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
 
     movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
@@ -222,7 +222,7 @@ EXTN(jsimd_fdct_float_sse):
 
     mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
     mov         ecx, DCTSIZE/4
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 
     movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
@@ -358,7 +358,7 @@ EXTN(jsimd_fdct_float_sse):
 ;   pop         esi                     ; unused
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; need not be preserved
-    poppic      ebx
+    POPPIC      ebx
     mov         esp, ebp                ; esp <- aligned ebp
     pop         esp                     ; esp <- original ebp
     pop         ebp
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-mmx.asm
index 80645a50d7e7..8c55a9876dc4 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-mmx.asm
@@ -2,7 +2,7 @@
 ; jfdctfst.asm - fast integer FDCT (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,7 +49,7 @@ F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
 %define PRE_MULTIPLY_SCALE_BITS  2
 %define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fdct_ifast_mmx)
 
 EXTN(jconst_fdct_ifast_mmx):
@@ -59,7 +59,7 @@ PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
 PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
 PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -88,19 +88,19 @@ EXTN(jsimd_fdct_ifast_mmx):
     mov         [esp], eax
     mov         ebp, esp                    ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
 ;   push        esi                     ; unused
 ;   push        edi                     ; unused
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process rows.
 
     mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
     mov         ecx, DCTSIZE/4
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
 
     movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
@@ -241,7 +241,7 @@ EXTN(jsimd_fdct_ifast_mmx):
 
     mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
     mov         ecx, DCTSIZE/4
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 
     movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
@@ -384,7 +384,7 @@ EXTN(jsimd_fdct_ifast_mmx):
 ;   pop         esi                     ; unused
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; need not be preserved
-    poppic      ebx
+    POPPIC      ebx
     mov         esp, ebp                ; esp <- aligned ebp
     pop         esp                     ; esp <- original ebp
     pop         ebp
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-sse2.asm
index 446fa7a68f78..c1ba533d6d88 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctfst-sse2.asm
@@ -2,7 +2,7 @@
 ; jfdctfst.asm - fast integer FDCT (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,7 +49,7 @@ F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
 %define PRE_MULTIPLY_SCALE_BITS  2
 %define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fdct_ifast_sse2)
 
 EXTN(jconst_fdct_ifast_sse2):
@@ -59,7 +59,7 @@ PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
 PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
 PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -89,13 +89,13 @@ EXTN(jsimd_fdct_ifast_sse2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; unused
 ;   push        edx                     ; need not be preserved
 ;   push        esi                     ; unused
 ;   push        edi                     ; unused
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process rows.
 
@@ -392,7 +392,7 @@ EXTN(jsimd_fdct_ifast_sse2):
 ;   pop         esi                     ; unused
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; unused
-    poppic      ebx
+    POPPIC      ebx
     mov         esp, ebp                ; esp <- aligned ebp
     pop         esp                     ; esp <- original ebp
     pop         ebp
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-avx2.asm
index 23cf733135bb..21c3d5b22375 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-avx2.asm
@@ -2,7 +2,7 @@
 ; jfdctint.asm - accurate integer FDCT (AVX2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2009, 2016, 2018, 2020, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -65,7 +65,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; %1-%4: Input/output registers
 ; %5-%8: Temp registers
 
-%macro dotranspose 8
+%macro DOTRANSPOSE 8
     ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
     ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
     ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
@@ -108,7 +108,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; %5-%8: Temp registers
 ; %9:    Pass (1 or 2)
 
-%macro dodct 9
+%macro DODCT 9
     vpsubw      %5, %1, %4              ; %5=data1_0-data6_7=tmp6_7
     vpaddw      %6, %1, %4              ; %6=data1_0+data6_7=tmp1_0
     vpaddw      %7, %2, %3              ; %7=data3_2+data4_5=tmp3_2
@@ -223,7 +223,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fdct_islow_avx2)
 
 EXTN(jconst_fdct_islow_avx2):
@@ -242,7 +242,7 @@ PW_DESCALE_P2X             times 16 dw  1 << (PASS1_BITS - 1)
 PW_1_NEG1                  times 8  dw  1
                            times 8  dw -1
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -262,13 +262,13 @@ PW_1_NEG1                  times 8  dw  1
 EXTN(jsimd_fdct_islow_avx2):
     push        ebp
     mov         ebp, esp
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; unused
 ;   push        edx                     ; need not be preserved
 ;   push        esi                     ; unused
 ;   push        edi                     ; unused
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process rows.
 
@@ -292,9 +292,9 @@ EXTN(jsimd_fdct_islow_avx2):
     ; ymm2=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
     ; ymm3=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
 
-    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    DOTRANSPOSE ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
 
-    dodct       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+    DODCT       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
     ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
 
     ; ---- Pass 2: process columns.
@@ -302,9 +302,9 @@ EXTN(jsimd_fdct_islow_avx2):
     vperm2i128  ymm4, ymm1, ymm3, 0x20  ; ymm4=data3_7
     vperm2i128  ymm1, ymm1, ymm3, 0x31  ; ymm1=data1_5
 
-    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+    DOTRANSPOSE ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
 
-    dodct       ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+    DODCT       ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
     ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
 
     vperm2i128 ymm3, ymm0, ymm1, 0x30   ; ymm3=data0_1
@@ -322,7 +322,7 @@ EXTN(jsimd_fdct_islow_avx2):
 ;   pop         esi                     ; unused
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; unused
-    poppic      ebx
+    POPPIC      ebx
     pop         ebp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-mmx.asm
index 34a43b9e5ef6..c2f308ed3b62 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-mmx.asm
@@ -2,7 +2,7 @@
 ; jfdctint.asm - accurate integer FDCT (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, 2020, D. R. Commander.
+; Copyright (C) 2016, 2020, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -63,7 +63,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fdct_islow_mmx)
 
 EXTN(jconst_fdct_islow_mmx):
@@ -80,7 +80,7 @@ PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)
 PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)
 PW_DESCALE_P2X times 4 dw  1 << (PASS1_BITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -109,19 +109,19 @@ EXTN(jsimd_fdct_islow_mmx):
     mov         [esp], eax
     mov         ebp, esp                    ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
 ;   push        esi                     ; unused
 ;   push        edi                     ; unused
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process rows.
 
     mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
     mov         ecx, DCTSIZE/4
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
 
     movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
@@ -363,7 +363,7 @@ EXTN(jsimd_fdct_islow_mmx):
 
     mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
     mov         ecx, DCTSIZE/4
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 
     movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
@@ -609,7 +609,7 @@ EXTN(jsimd_fdct_islow_mmx):
 ;   pop         esi                     ; unused
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; need not be preserved
-    poppic      ebx
+    POPPIC      ebx
     mov         esp, ebp                ; esp <- aligned ebp
     pop         esp                     ; esp <- original ebp
     pop         ebp
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-sse2.asm
index 6f8e18cb9d05..b6e679918d50 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jfdctint-sse2.asm
@@ -2,7 +2,7 @@
 ; jfdctint.asm - accurate integer FDCT (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, 2020, D. R. Commander.
+; Copyright (C) 2016, 2020, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -63,7 +63,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fdct_islow_sse2)
 
 EXTN(jconst_fdct_islow_sse2):
@@ -80,7 +80,7 @@ PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1 - 1)
 PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2 - 1)
 PW_DESCALE_P2X times 8 dw  1 << (PASS1_BITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -110,13 +110,13 @@ EXTN(jsimd_fdct_islow_sse2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; unused
 ;   push        edx                     ; need not be preserved
 ;   push        esi                     ; unused
 ;   push        edi                     ; unused
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process rows.
 
@@ -622,7 +622,7 @@ EXTN(jsimd_fdct_islow_sse2):
 ;   pop         esi                     ; unused
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; unused
-    poppic      ebx
+    POPPIC      ebx
     mov         esp, ebp                ; esp <- aligned ebp
     pop         esp                     ; esp <- original ebp
     pop         ebp
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-3dn.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-3dn.asm
index 87951910d8e5..1f696cb59bcc 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-3dn.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-3dn.asm
@@ -2,7 +2,7 @@
 ; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -24,7 +24,7 @@
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_float_3dnow)
 
 EXTN(jconst_idct_float_3dnow):
@@ -36,7 +36,7 @@ PD_2_613        times 2 dd 2.613125929752753055713286
 PD_RNDINT_MAGIC times 2 dd 100663296.0  ; (float)(0x00C00000 << 3)
 PB_CENTERJSAMP  times 8 db CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -78,7 +78,7 @@ EXTN(jsimd_idct_float_3dnow):
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process columns from input, store into work array.
 
@@ -87,21 +87,21 @@ EXTN(jsimd_idct_float_3dnow):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
     lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
     mov         ecx, DCTSIZE/2                   ; ctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
     mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
     or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
-    pushpic     ebx                     ; save GOT address
+    PUSHPIC     ebx                     ; save GOT address
     mov         ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
     mov         eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
     or          ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
     or          eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
     or          ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
     or          eax, ebx
-    poppic      ebx                     ; restore GOT address
+    POPPIC      ebx                     ; restore GOT address
     jnz         short .columnDCT
 
     ; -- AC terms all zero
@@ -127,7 +127,7 @@ EXTN(jsimd_idct_float_3dnow):
     movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
     movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
     jmp         near .nextcolumn
-    alignx      16, 7
+    ALIGNX      16, 7
 %endif
 .columnDCT:
 
@@ -293,7 +293,7 @@ EXTN(jsimd_idct_float_3dnow):
     mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
     mov         eax, JDIMENSION [output_col(eax)]
     mov         ecx, DCTSIZE/2                     ; ctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
 
     ; -- Even part
@@ -420,14 +420,14 @@ EXTN(jsimd_idct_float_3dnow):
     punpckldq   mm6, mm4                ; mm6=(00 01 02 03 04 05 06 07)
     punpckhdq   mm7, mm4                ; mm7=(10 11 12 13 14 15 16 17)
 
-    pushpic     ebx                     ; save GOT address
+    PUSHPIC     ebx                     ; save GOT address
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
     movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
     movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
 
-    poppic      ebx                     ; restore GOT address
+    POPPIC      ebx                     ; restore GOT address
 
     add         esi, byte 2*SIZEOF_FAST_FLOAT  ; wsptr
     add         edi, byte 2*SIZEOF_JSAMPROW
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse.asm
index b27ecfdf46a0..daeef22afc94 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse.asm
@@ -2,7 +2,7 @@
 ; jidctflt.asm - floating-point IDCT (SSE & MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -23,18 +23,18 @@
 
 ; --------------------------------------------------------------------------
 
-%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+%macro UNPCKLPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
     shufps      %1, %2, 0x44
 %endmacro
 
-%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+%macro UNPCKHPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
     shufps      %1, %2, 0xEE
 %endmacro
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_float_sse)
 
 EXTN(jconst_idct_float_sse):
@@ -46,7 +46,7 @@ PD_M2_613      times 4 dd -2.613125929752753055713286
 PD_0_125       times 4 dd  0.125        ; 1/8
 PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -88,7 +88,7 @@ EXTN(jsimd_idct_float_sse):
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process columns from input, store into work array.
 
@@ -97,7 +97,7 @@ EXTN(jsimd_idct_float_sse):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
     lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
     mov         ecx, DCTSIZE/4                   ; ctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
     mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -149,7 +149,7 @@ EXTN(jsimd_idct_float_sse):
     movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
     movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
     jmp         near .nextcolumn
-    alignx      16, 7
+    ALIGNX      16, 7
 %endif
 .columnDCT:
 
@@ -325,11 +325,11 @@ EXTN(jsimd_idct_float_sse):
     unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
 
     movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
-    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
-    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    UNPCKLPS2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    UNPCKHPS2   xmm3, xmm7              ; xmm3=(01 11 21 31)
     movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
-    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
-    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+    UNPCKLPS2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    UNPCKHPS2   xmm0, xmm2              ; xmm0=(03 13 23 33)
 
     movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
     movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
@@ -340,11 +340,11 @@ EXTN(jsimd_idct_float_sse):
     movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
 
     movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
-    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
-    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    UNPCKLPS2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    UNPCKHPS2   xmm6, xmm7              ; xmm6=(41 51 61 71)
     movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
-    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
-    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+    UNPCKLPS2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    UNPCKHPS2   xmm3, xmm2              ; xmm3=(43 53 63 73)
 
     movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
     movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
@@ -372,7 +372,7 @@ EXTN(jsimd_idct_float_sse):
     mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
     mov         eax, JDIMENSION [output_col(eax)]
     mov         ecx, DCTSIZE/4                     ; ctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
 
     ; -- Even part
@@ -536,7 +536,7 @@ EXTN(jsimd_idct_float_sse):
     punpckldq   mm5, mm6                ; mm5=(20 21 22 23 24 25 26 27)
     punpckhdq   mm4, mm6                ; mm4=(30 31 32 33 34 35 36 37)
 
-    pushpic     ebx                     ; save GOT address
+    PUSHPIC     ebx                     ; save GOT address
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
@@ -547,7 +547,7 @@ EXTN(jsimd_idct_float_sse):
     movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
     movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
 
-    poppic      ebx                     ; restore GOT address
+    POPPIC      ebx                     ; restore GOT address
 
     add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
     add         edi, byte 4*SIZEOF_JSAMPROW
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse2.asm
index c646eaef76ef..c39ffbe71b25 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctflt-sse2.asm
@@ -2,7 +2,7 @@
 ; jidctflt.asm - floating-point IDCT (SSE & SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -23,18 +23,18 @@
 
 ; --------------------------------------------------------------------------
 
-%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+%macro UNPCKLPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
     shufps      %1, %2, 0x44
 %endmacro
 
-%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+%macro UNPCKHPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
     shufps      %1, %2, 0xEE
 %endmacro
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_float_sse2)
 
 EXTN(jconst_idct_float_sse2):
@@ -46,7 +46,7 @@ PD_M2_613       times 4  dd -2.613125929752753055713286
 PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
 PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -88,7 +88,7 @@ EXTN(jsimd_idct_float_sse2):
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process columns from input, store into work array.
 
@@ -97,7 +97,7 @@ EXTN(jsimd_idct_float_sse2):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
     lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
     mov         ecx, DCTSIZE/4                   ; ctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
     mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -150,7 +150,7 @@ EXTN(jsimd_idct_float_sse2):
     movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
     movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
     jmp         near .nextcolumn
-    alignx      16, 7
+    ALIGNX      16, 7
 %endif
 .columnDCT:
 
@@ -287,11 +287,11 @@ EXTN(jsimd_idct_float_sse2):
     unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
 
     movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
-    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
-    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    UNPCKLPS2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    UNPCKHPS2   xmm3, xmm7              ; xmm3=(01 11 21 31)
     movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
-    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
-    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+    UNPCKLPS2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    UNPCKHPS2   xmm0, xmm2              ; xmm0=(03 13 23 33)
 
     movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
     movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
@@ -302,11 +302,11 @@ EXTN(jsimd_idct_float_sse2):
     movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
 
     movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
-    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
-    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    UNPCKLPS2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    UNPCKHPS2   xmm6, xmm7              ; xmm6=(41 51 61 71)
     movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
-    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
-    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+    UNPCKLPS2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    UNPCKHPS2   xmm3, xmm2              ; xmm3=(43 53 63 73)
 
     movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
     movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
@@ -334,7 +334,7 @@ EXTN(jsimd_idct_float_sse2):
     mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
     mov         eax, JDIMENSION [output_col(eax)]
     mov         ecx, DCTSIZE/4                     ; ctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
 
     ; -- Even part
@@ -464,7 +464,7 @@ EXTN(jsimd_idct_float_sse2):
     pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
     pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
 
-    pushpic     ebx                     ; save GOT address
+    PUSHPIC     ebx                     ; save GOT address
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
@@ -475,7 +475,7 @@ EXTN(jsimd_idct_float_sse2):
     movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
     movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
 
-    poppic      ebx                     ; restore GOT address
+    POPPIC      ebx                     ; restore GOT address
 
     add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
     add         edi, byte 4*SIZEOF_JSAMPROW
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-mmx.asm
index 24622d43693f..19de457f7898 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-mmx.asm
@@ -2,7 +2,7 @@
 ; jidctfst.asm - fast integer IDCT (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -56,7 +56,7 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS))       ; FIX(2.613125930) - FIX(1)
 %define PRE_MULTIPLY_SCALE_BITS  2
 %define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_ifast_mmx)
 
 EXTN(jconst_idct_ifast_mmx):
@@ -67,7 +67,7 @@ PW_MF1613      times 4 dw -F_1_613 << CONST_SHIFT
 PW_F1082       times 4 dw  F_1_082 << CONST_SHIFT
 PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -109,7 +109,7 @@ EXTN(jsimd_idct_ifast_mmx):
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process columns from input, store into work array.
 
@@ -118,7 +118,7 @@ EXTN(jsimd_idct_ifast_mmx):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
     lea         edi, [workspace]                 ; JCOEF *wsptr
     mov         ecx, DCTSIZE/4                   ; ctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
     mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -163,7 +163,7 @@ EXTN(jsimd_idct_ifast_mmx):
     movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
     movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
     jmp         near .nextcolumn
-    alignx      16, 7
+    ALIGNX      16, 7
 %endif
 .columnDCT:
 
@@ -326,7 +326,7 @@ EXTN(jsimd_idct_ifast_mmx):
     mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
     mov         eax, JDIMENSION [output_col(eax)]
     mov         ecx, DCTSIZE/4                     ; ctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
 
     ; -- Even part
@@ -464,7 +464,7 @@ EXTN(jsimd_idct_ifast_mmx):
     punpckldq   mm5, mm4                ; mm5=(20 21 22 23 24 25 26 27)
     punpckhdq   mm1, mm4                ; mm1=(30 31 32 33 34 35 36 37)
 
-    pushpic     ebx                     ; save GOT address
+    PUSHPIC     ebx                     ; save GOT address
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
@@ -475,7 +475,7 @@ EXTN(jsimd_idct_ifast_mmx):
     movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
     movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
 
-    poppic      ebx                     ; restore GOT address
+    POPPIC      ebx                     ; restore GOT address
 
     add         esi, byte 4*SIZEOF_JCOEF     ; wsptr
     add         edi, byte 4*SIZEOF_JSAMPROW
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-sse2.asm
index 19704ffa48f3..966311eda764 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctfst-sse2.asm
@@ -2,7 +2,7 @@
 ; jidctfst.asm - fast integer IDCT (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -56,7 +56,7 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS))       ; FIX(2.613125930) - FIX(1)
 %define PRE_MULTIPLY_SCALE_BITS  2
 %define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_ifast_sse2)
 
 EXTN(jconst_idct_ifast_sse2):
@@ -67,7 +67,7 @@ PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT
 PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT
 PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -101,13 +101,13 @@ EXTN(jsimd_idct_ifast_sse2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; unused
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process columns from input.
 
@@ -155,7 +155,7 @@ EXTN(jsimd_idct_ifast_sse2):
     movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
     movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
     jmp         near .column_end
-    alignx      16, 7
+    ALIGNX      16, 7
 %endif
 .columnDCT:
 
@@ -490,7 +490,7 @@ EXTN(jsimd_idct_ifast_sse2):
     pop         esi
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; unused
-    poppic      ebx
+    POPPIC      ebx
     mov         esp, ebp                ; esp <- aligned ebp
     pop         esp                     ; esp <- original ebp
     pop         ebp
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-avx2.asm
index 199c7df3b69c..dd4a3d5e8c83 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-avx2.asm
@@ -2,7 +2,7 @@
 ; jidctint.asm - accurate integer IDCT (AVX2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2009, 2016, 2018, 2020, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -65,7 +65,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; %1-%4: Input/output registers
 ; %5-%8: Temp registers
 
-%macro dotranspose 8
+%macro DOTRANSPOSE 8
     ; %5=(00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71)
     ; %6=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
     ; %7=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
@@ -118,7 +118,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; %5-%12: Temp registers
 ; %9:     Pass (1 or 2)
 
-%macro dodct 13
+%macro DODCT 13
     ; -- Even part
 
     ; (Original)
@@ -250,7 +250,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_islow_avx2)
 
 EXTN(jconst_idct_islow_avx2):
@@ -269,7 +269,7 @@ PB_CENTERJSAMP             times 32 db  CENTERJSAMPLE
 PW_1_NEG1                  times 8  dw  1
                            times 8  dw -1
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -303,13 +303,13 @@ EXTN(jsimd_idct_islow_avx2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; unused
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process columns.
 
@@ -353,7 +353,7 @@ EXTN(jsimd_idct_islow_avx2):
     vpshufd     ymm3, ymm4, 0xFF        ; ymm3=col3_7=(03 03 03 03 03 03 03 03  07 07 07 07 07 07 07 07)
 
     jmp         near .column_end
-    alignx      16, 7
+    ALIGNX      16, 7
 %endif
 .columnDCT:
 
@@ -371,10 +371,10 @@ EXTN(jsimd_idct_islow_avx2):
     vperm2i128  ymm2, ymm5, ymm7, 0x20  ; ymm2=in2_6
     vperm2i128  ymm3, ymm7, ymm6, 0x31  ; ymm3=in7_5
 
-    dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1
+    DODCT ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1
     ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
 
-    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    DOTRANSPOSE ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
     ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
 
 .column_end:
@@ -395,10 +395,10 @@ EXTN(jsimd_idct_islow_avx2):
     vperm2i128  ymm4, ymm3, ymm1, 0x31  ; ymm3=in7_5
     vperm2i128  ymm1, ymm3, ymm1, 0x20  ; ymm1=in3_1
 
-    dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2
+    DODCT ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2
     ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
 
-    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+    DOTRANSPOSE ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
     ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
 
     vpacksswb   ymm0, ymm0, ymm1        ; ymm0=data01_45
@@ -442,7 +442,7 @@ EXTN(jsimd_idct_islow_avx2):
     pop         esi
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; unused
-    poppic      ebx
+    POPPIC      ebx
     mov         esp, ebp                ; esp <- aligned ebp
     pop         esp                     ; esp <- original ebp
     pop         ebp
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-mmx.asm
index f15c8d34bcb3..e2e1b3ff7985 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-mmx.asm
@@ -2,7 +2,7 @@
 ; jidctint.asm - accurate integer IDCT (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, 2020, D. R. Commander.
+; Copyright (C) 2016, 2020, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -63,7 +63,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_islow_mmx)
 
 EXTN(jconst_idct_islow_mmx):
@@ -80,7 +80,7 @@ PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)
 PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)
 PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -122,7 +122,7 @@ EXTN(jsimd_idct_islow_mmx):
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process columns from input, store into work array.
 
@@ -131,7 +131,7 @@ EXTN(jsimd_idct_islow_mmx):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
     lea         edi, [workspace]                 ; JCOEF *wsptr
     mov         ecx, DCTSIZE/4                   ; ctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
     mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -178,7 +178,7 @@ EXTN(jsimd_idct_islow_mmx):
     movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
     movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
     jmp         near .nextcolumn
-    alignx      16, 7
+    ALIGNX      16, 7
 %endif
 .columnDCT:
 
@@ -513,7 +513,7 @@ EXTN(jsimd_idct_islow_mmx):
     mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
     mov         eax, JDIMENSION [output_col(eax)]
     mov         ecx, DCTSIZE/4                     ; ctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .rowloop:
 
     ; -- Even part
@@ -816,7 +816,7 @@ EXTN(jsimd_idct_islow_mmx):
     punpckldq   mm7, mm5                ; mm7=(20 21 22 23 24 25 26 27)
     punpckhdq   mm4, mm5                ; mm4=(30 31 32 33 34 35 36 37)
 
-    pushpic     ebx                     ; save GOT address
+    PUSHPIC     ebx                     ; save GOT address
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
@@ -827,7 +827,7 @@ EXTN(jsimd_idct_islow_mmx):
     movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
     movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
 
-    poppic      ebx                     ; restore GOT address
+    POPPIC      ebx                     ; restore GOT address
 
     add         esi, byte 4*SIZEOF_JCOEF     ; wsptr
     add         edi, byte 4*SIZEOF_JSAMPROW
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-sse2.asm
index 43e320189b49..42be940d7239 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctint-sse2.asm
@@ -2,7 +2,7 @@
 ; jidctint.asm - accurate integer IDCT (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, 2020, D. R. Commander.
+; Copyright (C) 2016, 2020, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -63,7 +63,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_islow_sse2)
 
 EXTN(jconst_idct_islow_sse2):
@@ -80,7 +80,7 @@ PD_DESCALE_P1  times 4  dd  1 << (DESCALE_P1 - 1)
 PD_DESCALE_P2  times 4  dd  1 << (DESCALE_P2 - 1)
 PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -114,13 +114,13 @@ EXTN(jsimd_idct_islow_sse2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; unused
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process columns from input.
 
@@ -172,7 +172,7 @@ EXTN(jsimd_idct_islow_sse2):
     movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
     movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
     jmp         near .column_end
-    alignx      16, 7
+    ALIGNX      16, 7
 %endif
 .columnDCT:
 
@@ -847,7 +847,7 @@ EXTN(jsimd_idct_islow_sse2):
     pop         esi
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; unused
-    poppic      ebx
+    POPPIC      ebx
     mov         esp, ebp                ; esp <- aligned ebp
     pop         esp                     ; esp <- original ebp
     pop         ebp
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-mmx.asm
index e2307e1cb6c6..920dad90bdca 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-mmx.asm
@@ -2,7 +2,7 @@
 ; jidctred.asm - reduced-size IDCT (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -69,7 +69,7 @@ F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_red_mmx)
 
 EXTN(jconst_idct_red_mmx):
@@ -87,7 +87,7 @@ PD_DESCALE_P1_2 times 2 dd  1 << (DESCALE_P1_2 - 1)
 PD_DESCALE_P2_2 times 2 dd  1 << (DESCALE_P2_2 - 1)
 PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -124,13 +124,13 @@ EXTN(jsimd_idct_4x4_mmx):
     mov         [esp], eax
     mov         ebp, esp                    ; ebp = aligned ebp
     lea         esp, [workspace]
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; need not be preserved
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process columns from input, store into work array.
 
@@ -139,7 +139,7 @@ EXTN(jsimd_idct_4x4_mmx):
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
     lea         edi, [workspace]                 ; JCOEF *wsptr
     mov         ecx, DCTSIZE/4                   ; ctr
-    alignx      16, 7
+    ALIGNX      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
     mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -181,7 +181,7 @@ EXTN(jsimd_idct_4x4_mmx):
     movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
     movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
     jmp         near .nextcolumn
-    alignx      16, 7
+    ALIGNX      16, 7
 %endif
 .columnDCT:
 
@@ -479,7 +479,7 @@ EXTN(jsimd_idct_4x4_mmx):
     pop         esi
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; need not be preserved
-    poppic      ebx
+    POPPIC      ebx
     mov         esp, ebp                ; esp <- aligned ebp
     pop         esp                     ; esp <- original ebp
     pop         ebp
@@ -512,7 +512,7 @@ EXTN(jsimd_idct_2x2_mmx):
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process columns from input.
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-sse2.asm
index 6e56494e9751..9a6f9946e795 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jidctred-sse2.asm
@@ -2,7 +2,7 @@
 ; jidctred.asm - reduced-size IDCT (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -69,7 +69,7 @@ F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_red_sse2)
 
 EXTN(jconst_idct_red_sse2):
@@ -87,7 +87,7 @@ PD_DESCALE_P1_2 times 4  dd  1 << (DESCALE_P1_2 - 1)
 PD_DESCALE_P2_2 times 4  dd  1 << (DESCALE_P2_2 - 1)
 PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -122,13 +122,13 @@ EXTN(jsimd_idct_4x4_sse2):
     mov         [esp], eax
     mov         ebp, esp                     ; ebp = aligned ebp
     lea         esp, [wk(0)]
-    pushpic     ebx
+    PUSHPIC     ebx
 ;   push        ecx                     ; unused
 ;   push        edx                     ; need not be preserved
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process columns from input.
 
@@ -171,7 +171,7 @@ EXTN(jsimd_idct_4x4_sse2):
     pshufd      xmm3, xmm3, 0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
 
     jmp         near .column_end
-    alignx      16, 7
+    ALIGNX      16, 7
 %endif
 .columnDCT:
 
@@ -400,7 +400,7 @@ EXTN(jsimd_idct_4x4_sse2):
     pop         esi
 ;   pop         edx                     ; need not be preserved
 ;   pop         ecx                     ; unused
-    poppic      ebx
+    POPPIC      ebx
     mov         esp, ebp                ; esp <- aligned ebp
     pop         esp                     ; esp <- original ebp
     pop         ebp
@@ -433,7 +433,7 @@ EXTN(jsimd_idct_2x2_sse2):
     push        esi
     push        edi
 
-    get_GOT     ebx                     ; get GOT address
+    GET_GOT     ebx                     ; get GOT address
 
     ; ---- Pass 1: process columns from input.
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquant-3dn.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-3dn.asm
index 5cb60caa947a..6436bad1ec9b 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jquant-3dn.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-3dn.asm
@@ -2,7 +2,7 @@
 ; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -52,7 +52,7 @@ EXTN(jsimd_convsamp_float_3dnow):
     mov         eax, JDIMENSION [start_col]
     mov         edi, POINTER [workspace]       ; (DCTELEM *)
     mov         ecx, DCTSIZE/2
-    alignx      16, 7
+    ALIGNX      16, 7
 .convloop:
     mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
@@ -154,7 +154,7 @@ EXTN(jsimd_quantize_float_3dnow):
     mov         edx, POINTER [divisors]
     mov         edi, JCOEFPTR [coef_block]
     mov         eax, DCTSIZE2/16
-    alignx      16, 7
+    ALIGNX      16, 7
 .quantloop:
     movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
     movq        mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquant-mmx.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-mmx.asm
index 61305c625de8..e525ba9e7b1e 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jquant-mmx.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-mmx.asm
@@ -2,7 +2,7 @@
 ; jquant.asm - sample data conversion and quantization (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -52,7 +52,7 @@ EXTN(jsimd_convsamp_mmx):
     mov         eax, JDIMENSION [start_col]
     mov         edi, POINTER [workspace]       ; (DCTELEM *)
     mov         ecx, DCTSIZE/4
-    alignx      16, 7
+    ALIGNX      16, 7
 .convloop:
     mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
@@ -157,10 +157,10 @@ EXTN(jsimd_quantize_mmx):
     mov         edx, POINTER [divisors]
     mov         edi, JCOEFPTR [coef_block]
     mov         ah, 2
-    alignx      16, 7
+    ALIGNX      16, 7
 .quantloop1:
     mov         al, DCTSIZE2/8/2
-    alignx      16, 7
+    ALIGNX      16, 7
 .quantloop2:
     movq        mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
     movq        mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquant-sse.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-sse.asm
index 218adc976f3c..1cf2cc0ce5a5 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jquant-sse.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquant-sse.asm
@@ -2,7 +2,7 @@
 ; jquant.asm - sample data conversion and quantization (SSE & MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -52,7 +52,7 @@ EXTN(jsimd_convsamp_float_sse):
     mov         eax, JDIMENSION [start_col]
     mov         edi, POINTER [workspace]       ; (DCTELEM *)
     mov         ecx, DCTSIZE/2
-    alignx      16, 7
+    ALIGNX      16, 7
 .convloop:
     mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
@@ -150,7 +150,7 @@ EXTN(jsimd_quantize_float_sse):
     mov         edx, POINTER [divisors]
     mov         edi, JCOEFPTR [coef_block]
     mov         eax, DCTSIZE2/16
-    alignx      16, 7
+    ALIGNX      16, 7
 .quantloop:
     movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
     movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquantf-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquantf-sse2.asm
index a881ab50f924..66efd3eecac8 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jquantf-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquantf-sse2.asm
@@ -2,7 +2,7 @@
 ; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -52,7 +52,7 @@ EXTN(jsimd_convsamp_float_sse2):
     mov         eax, JDIMENSION [start_col]
     mov         edi, POINTER [workspace]       ; (DCTELEM *)
     mov         ecx, DCTSIZE/2
-    alignx      16, 7
+    ALIGNX      16, 7
 .convloop:
     mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
@@ -127,7 +127,7 @@ EXTN(jsimd_quantize_float_sse2):
     mov         edx, POINTER [divisors]
     mov         edi, JCOEFPTR [coef_block]
     mov         eax, DCTSIZE2/16
-    alignx      16, 7
+    ALIGNX      16, 7
 .quantloop:
     movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
     movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-sse2.asm
index 0a509408aa13..2a69af9c95ff 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jquanti-sse2.asm
@@ -2,7 +2,7 @@
 ; jquanti.asm - sample data conversion and quantization (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -52,7 +52,7 @@ EXTN(jsimd_convsamp_sse2):
     mov         eax, JDIMENSION [start_col]
     mov         edi, POINTER [workspace]       ; (DCTELEM *)
     mov         ecx, DCTSIZE/4
-    alignx      16, 7
+    ALIGNX      16, 7
 .convloop:
     mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
@@ -133,7 +133,7 @@ EXTN(jsimd_quantize_sse2):
     mov         edx, POINTER [divisors]
     mov         edi, JCOEFPTR [coef_block]
     mov         eax, DCTSIZE2/32
-    alignx      16, 7
+    ALIGNX      16, 7
 .quantloop:
     movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
     movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
diff --git a/3rdparty/libjpeg-turbo/src/simd/i386/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/i386/jsimd.c
index 80bc821ff4e7..b429b0a53208 100644
--- a/3rdparty/libjpeg-turbo/src/simd/i386/jsimd.c
+++ b/3rdparty/libjpeg-turbo/src/simd/i386/jsimd.c
@@ -2,8 +2,8 @@
  * jsimd_i386.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022-2023, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -21,7 +21,6 @@
 #include "../../jdct.h"
 #include "../../jsimddct.h"
 #include "../jsimd.h"
-#include "jconfigint.h"
 
 /*
  * In the PIC cases, we have no guarantee that constants will keep
@@ -32,13 +31,11 @@
 #define IS_ALIGNED_SSE(ptr)  (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
 #define IS_ALIGNED_AVX(ptr)  (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
 
-static unsigned int simd_support = (unsigned int)(~0);
-static unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0);
+static THREAD_LOCAL unsigned int simd_huffman = 1;
 
 /*
  * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
  */
 LOCAL(void)
 init_simd(void)
@@ -161,6 +158,9 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
   void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
   void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_extrgb_ycc_convert_avx2;
@@ -220,6 +220,9 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
   void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
   void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_extrgb_gray_convert_avx2;
@@ -279,6 +282,9 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
   void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_ycc_extrgb_convert_avx2;
@@ -382,6 +388,9 @@ GLOBAL(void)
 jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
                                compptr->v_samp_factor,
@@ -402,6 +411,9 @@ GLOBAL(void)
 jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
                                compptr->v_samp_factor,
@@ -464,6 +476,9 @@ GLOBAL(void)
 jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
                              input_data, output_data_ptr);
@@ -479,6 +494,9 @@ GLOBAL(void)
 jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
                              input_data, output_data_ptr);
@@ -540,6 +558,9 @@ GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
                                    compptr->downsampled_width, input_data,
@@ -558,6 +579,9 @@ GLOBAL(void)
 jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
                                    compptr->downsampled_width, input_data,
@@ -626,6 +650,9 @@ jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
   void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
@@ -684,6 +711,9 @@ jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
   void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
@@ -788,6 +818,9 @@ GLOBAL(void)
 jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
                DCTELEM *workspace)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_convsamp_avx2(sample_data, start_col, workspace);
   else if (simd_support & JSIMD_SSE2)
@@ -800,6 +833,9 @@ GLOBAL(void)
 jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
                      FAST_FLOAT *workspace)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_SSE2)
     jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
   else if (simd_support & JSIMD_SSE)
@@ -870,6 +906,9 @@ jsimd_can_fdct_float(void)
 GLOBAL(void)
 jsimd_fdct_islow(DCTELEM *data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_fdct_islow_avx2(data);
   else if (simd_support & JSIMD_SSE2)
@@ -881,6 +920,9 @@ jsimd_fdct_islow(DCTELEM *data)
 GLOBAL(void)
 jsimd_fdct_ifast(DCTELEM *data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
     jsimd_fdct_ifast_sse2(data);
   else
@@ -890,6 +932,9 @@ jsimd_fdct_ifast(DCTELEM *data)
 GLOBAL(void)
 jsimd_fdct_float(FAST_FLOAT *data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
     jsimd_fdct_float_sse(data);
   else if (simd_support & JSIMD_3DNOW)
@@ -945,6 +990,9 @@ jsimd_can_quantize_float(void)
 GLOBAL(void)
 jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_quantize_avx2(coef_block, divisors, workspace);
   else if (simd_support & JSIMD_SSE2)
@@ -957,6 +1005,9 @@ GLOBAL(void)
 jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
                      FAST_FLOAT *workspace)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_SSE2)
     jsimd_quantize_float_sse2(coef_block, divisors, workspace);
   else if (simd_support & JSIMD_SSE)
@@ -1020,6 +1071,9 @@ jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                JDIMENSION output_col)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
     jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
                         output_col);
@@ -1032,6 +1086,9 @@ jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                JDIMENSION output_col)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
     jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
                         output_col);
@@ -1126,6 +1183,9 @@ jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
                           output_col);
@@ -1142,6 +1202,9 @@ jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
     jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
                           output_col);
@@ -1155,6 +1218,9 @@ jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
     jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
                           output_col);
@@ -1212,7 +1278,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
 GLOBAL(void)
 jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
-                                  int Al, JCOEF *values, size_t *zerobits)
+                                  int Al, UJCOEF *values, size_t *zerobits)
 {
   jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
                                          Sl, Al, values, zerobits);
@@ -1238,7 +1304,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
 GLOBAL(int)
 jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
-                                   int Al, JCOEF *absvalues, size_t *bits)
+                                   int Al, UJCOEF *absvalues, size_t *bits)
 {
   return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
                                                  jpeg_natural_order_start,
diff --git a/3rdparty/libjpeg-turbo/src/simd/jsimd.h b/3rdparty/libjpeg-turbo/src/simd/jsimd.h
index 64747c6360c1..a28754adb9d0 100644
--- a/3rdparty/libjpeg-turbo/src/simd/jsimd.h
+++ b/3rdparty/libjpeg-turbo/src/simd/jsimd.h
@@ -2,10 +2,10 @@
  * simd/jsimd.h
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, 2014-2016, 2018, 2020, D. R. Commander.
+ * Copyright (C) 2011, 2014-2016, 2018, 2020, 2022, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2014, Linaro Limited.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  * Copyright (C) 2020, Arm Limited.
  *
@@ -1243,16 +1243,16 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
 /* Progressive Huffman encoding */
 EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *values, size_t *zerobits);
+   UJCOEF *values, size_t *zerobits);
 
 EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *values, size_t *zerobits);
+   UJCOEF *values, size_t *zerobits);
 
 EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *absvalues, size_t *bits);
+   UJCOEF *absvalues, size_t *bits);
 
 EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *absvalues, size_t *bits);
+   UJCOEF *absvalues, size_t *bits);
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd.c
index d2546eed3289..c6e789aa2f2d 100644
--- a/3rdparty/libjpeg-turbo/src/simd/mips/jsimd.c
+++ b/3rdparty/libjpeg-turbo/src/simd/mips/jsimd.c
@@ -2,9 +2,9 @@
  * jsimd_mips.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, 2018, 2020, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2020, 2022, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -23,11 +23,9 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 
-#include <stdio.h>
-#include <string.h>
 #include <ctype.h>
 
-static unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_support = ~0;
 
 #if !(defined(__mips_dsp) && (__mips_dsp_rev >= 2)) && defined(__linux__)
 
@@ -57,8 +55,6 @@ parse_proc_cpuinfo(const char *search_string)
 
 /*
  * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
  */
 LOCAL(void)
 init_simd(void)
@@ -1128,7 +1124,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
 GLOBAL(void)
 jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
-                                  int Al, JCOEF *values, size_t *zerobits)
+                                  int Al, UJCOEF *values, size_t *zerobits)
 {
 }
 
@@ -1141,7 +1137,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
 GLOBAL(int)
 jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
-                                   int Al, JCOEF *absvalues, size_t *bits)
+                                   int Al, UJCOEF *absvalues, size_t *bits)
 {
   return 0;
 }
diff --git a/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd.c
index e8f1af562bab..917440b43bf8 100644
--- a/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd.c
+++ b/3rdparty/libjpeg-turbo/src/simd/mips64/jsimd.c
@@ -2,9 +2,9 @@
  * jsimd_mips64.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, 2018, Matthieu Darbois.
+ * Copyright (C) 2015, 2018, 2022, Matthieu Darbois.
  * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
@@ -24,11 +24,9 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 
-#include <stdio.h>
-#include <string.h>
 #include <ctype.h>
 
-static unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_support = ~0;
 
 #if defined(__linux__)
 
@@ -96,8 +94,6 @@ parse_proc_cpuinfo(int bufsize)
 
 /*
  * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
  */
 LOCAL(void)
 init_simd(void)
@@ -851,7 +847,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
 GLOBAL(void)
 jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
-                                  int Al, JCOEF *values, size_t *zerobits)
+                                  int Al, UJCOEF *values, size_t *zerobits)
 {
 }
 
@@ -864,7 +860,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
 GLOBAL(int)
 jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
-                                   int Al, JCOEF *absvalues, size_t *bits)
+                                   int Al, UJCOEF *absvalues, size_t *bits)
 {
   return 0;
 }
diff --git a/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdext.inc b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdext.inc
index e8d50b034973..b5341ed27586 100644
--- a/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdext.inc
+++ b/3rdparty/libjpeg-turbo/src/simd/nasm/jsimdext.inc
@@ -2,9 +2,10 @@
 ; jsimdext.inc - common declarations
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
+; Copyright (C) 2010, 2016, 2018-2019, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthieu Darbois.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
 ;
@@ -75,6 +76,14 @@
 ; mark stack as non-executable
 section .note.GNU-stack noalloc noexec nowrite progbits
 
+%ifdef __CET__
+%ifdef __x86_64__
+section .note.gnu.property note alloc noexec align=8
+    dd 0x00000004, 0x00000010, 0x00000005, 0x00554e47
+    dd 0xc0000002, 0x00000004, 0x00000003, 0x00000000
+%endif
+%endif
+
 ; -- segment definition --
 ;
 %ifdef __x86_64__
@@ -271,7 +280,7 @@ const_base:
 
 %define GOTOFF(got, sym)  (got) + (sym) - const_base
 
-%imacro get_GOT 1
+%imacro GET_GOT 1
     ; NOTE: this macro destroys ecx resister.
     call        %%geteip
     add         ecx, byte (%%ref - $)
@@ -303,7 +312,7 @@ const_base:
 
 %define GOTOFF(got, sym)  (got) + (sym) wrt ..gotoff
 
-%imacro get_GOT 1
+%imacro GET_GOT 1
     extern      GOT_SYMBOL
     call        %%geteip
     add         %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
@@ -316,13 +325,13 @@ const_base:
 
 %endif    ; GOT_SYMBOL == _MACHO_PIC_ ----------------
 
-%imacro pushpic 1.nolist
+%imacro PUSHPIC 1.nolist
     push        %1
 %endmacro
-%imacro poppic  1.nolist
+%imacro POPPIC  1.nolist
     pop         %1
 %endmacro
-%imacro movpic  2.nolist
+%imacro MOVPIC  2.nolist
     mov         %1, %2
 %endmacro
 
@@ -330,13 +339,13 @@ const_base:
 
 %define GOTOFF(got, sym)  (sym)
 
-%imacro get_GOT 1.nolist
+%imacro GET_GOT 1.nolist
 %endmacro
-%imacro pushpic 1.nolist
+%imacro PUSHPIC 1.nolist
 %endmacro
-%imacro poppic  1.nolist
+%imacro POPPIC  1.nolist
 %endmacro
-%imacro movpic  2.nolist
+%imacro MOVPIC  2.nolist
 %endmacro
 
 %endif   ;  PIC -----------------------------------------
@@ -348,7 +357,7 @@ const_base:
 %define MSKLE(x, y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
 %define FILLB(b, n)  (($$-(b)) & ((n)-1))
 
-%imacro alignx 1-2.nolist 0xFFFF
+%imacro ALIGNX 1-2.nolist 0xFFFF
 %%bs: \
   times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
         db 0x90                                      ; nop
@@ -370,7 +379,7 @@ const_base:
 
 ; Align the next data on {2,4,8,16,..}-byte boundary.
 ;
-%imacro alignz 1.nolist
+%imacro ALIGNZ 1.nolist
     align       %1, db 0                ; filling zeros
 %endmacro
 
@@ -378,7 +387,7 @@ const_base:
 
 %ifdef WIN64
 
-%imacro collect_args 1
+%imacro COLLECT_ARGS 1
     sub         rsp, SIZEOF_XMMWORD
     movaps      XMMWORD [rsp], xmm6
     sub         rsp, SIZEOF_XMMWORD
@@ -397,17 +406,17 @@ const_base:
 %endif
 %if %1 > 4
     push        r14
-    mov         r14, [rax+48]
+    mov         r14, [rbp+48]
 %endif
 %if %1 > 5
     push        r15
-    mov         r15, [rax+56]
+    mov         r15, [rbp+56]
 %endif
     push        rsi
     push        rdi
 %endmacro
 
-%imacro uncollect_args 1
+%imacro UNCOLLECT_ARGS 1
     pop         rdi
     pop         rsi
 %if %1 > 5
@@ -428,7 +437,7 @@ const_base:
     add         rsp, SIZEOF_XMMWORD
 %endmacro
 
-%imacro push_xmm 1
+%imacro PUSH_XMM 1
     sub         rsp, %1 * SIZEOF_XMMWORD
     movaps      XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
 %if %1 > 1
@@ -442,7 +451,7 @@ const_base:
 %endif
 %endmacro
 
-%imacro pop_xmm 1
+%imacro POP_XMM 1
     movaps      xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
 %if %1 > 1
     movaps      xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
@@ -458,7 +467,7 @@ const_base:
 
 %else
 
-%imacro collect_args 1
+%imacro COLLECT_ARGS 1
     push        r10
     mov         r10, rdi
 %if %1 > 1
@@ -483,7 +492,7 @@ const_base:
 %endif
 %endmacro
 
-%imacro uncollect_args 1
+%imacro UNCOLLECT_ARGS 1
 %if %1 > 5
     pop         r15
 %endif
@@ -502,16 +511,29 @@ const_base:
     pop         r10
 %endmacro
 
-%imacro push_xmm 1
+%imacro PUSH_XMM 1
 %endmacro
 
-%imacro pop_xmm 1
+%imacro POP_XMM 1
 %endmacro
 
 %endif
 
 %endif
 
+%ifdef __CET__
+
+%imacro ENDBR64 0
+    dd 0xfa1e0ff3
+%endmacro
+
+%else
+
+%imacro ENDBR64 0
+%endmacro
+
+%endif
+
 ; --------------------------------------------------------------------------
 ;  Defines picked up from the C headers
 ;
diff --git a/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd.c
index b9e86dcfac26..461f603633aa 100644
--- a/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd.c
+++ b/3rdparty/libjpeg-turbo/src/simd/powerpc/jsimd.c
@@ -2,8 +2,8 @@
  * jsimd_powerpc.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2009-2011, 2014-2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -27,11 +27,12 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 
-#include <stdio.h>
-#include <string.h>
 #include <ctype.h>
 
-#if defined(__OpenBSD__)
+#if defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#elif defined(__OpenBSD__)
 #include <sys/param.h>
 #include <sys/sysctl.h>
 #include <machine/cpu.h>
@@ -40,7 +41,7 @@
 #include <sys/auxv.h>
 #endif
 
-static unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_support = ~0;
 
 #if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
 
@@ -108,8 +109,6 @@ parse_proc_cpuinfo(int bufsize)
 
 /*
  * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
  */
 LOCAL(void)
 init_simd(void)
@@ -121,6 +120,10 @@ init_simd(void)
   int bufsize = 1024; /* an initial guess for the line buffer size limit */
 #elif defined(__amigaos4__)
   uint32 altivec = 0;
+#elif defined(__APPLE__)
+  int mib[2] = { CTL_HW, HW_VECTORUNIT };
+  int altivec;
+  size_t len = sizeof(altivec);
 #elif defined(__OpenBSD__)
   int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
   int altivec;
@@ -134,7 +137,7 @@ init_simd(void)
 
   simd_support = 0;
 
-#if defined(__ALTIVEC__) || defined(__APPLE__)
+#if defined(__ALTIVEC__)
   simd_support |= JSIMD_ALTIVEC;
 #elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
   while (!parse_proc_cpuinfo(bufsize)) {
@@ -146,7 +149,7 @@ init_simd(void)
   IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
   if (altivec == VECTORTYPE_ALTIVEC)
     simd_support |= JSIMD_ALTIVEC;
-#elif defined(__OpenBSD__)
+#elif defined(__APPLE__) || defined(__OpenBSD__)
   if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
     simd_support |= JSIMD_ALTIVEC;
 #elif defined(__FreeBSD__)
@@ -862,7 +865,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
 GLOBAL(void)
 jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
-                                  int Al, JCOEF *values, size_t *zerobits)
+                                  int Al, UJCOEF *values, size_t *zerobits)
 {
 }
 
@@ -875,7 +878,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
 GLOBAL(int)
 jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
-                                   int Al, JCOEF *absvalues, size_t *bits)
+                                   int Al, UJCOEF *absvalues, size_t *bits)
 {
   return 0;
 }
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-avx2.asm
index ffb527db00e1..39e6f207ca24 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-avx2.asm
@@ -1,9 +1,10 @@
 ;
 ; jccolext.asm - colorspace conversion (64-bit AVX2)
 ;
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,21 +34,22 @@
 ; r13d = JDIMENSION output_row
 ; r14d = int num_rows
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
 %define WK_NUM  8
 
     align       32
     GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
 
 EXTN(jsimd_rgb_ycc_convert_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 5
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, (SIZEOF_YMMWORD * WK_NUM)
+    COLLECT_ARGS 5
     push        rbx
 
     mov         ecx, r10d
@@ -548,9 +550,9 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
 .return:
     pop         rbx
     vzeroupper
-    uncollect_args 5
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 5
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-sse2.asm
index af70ed6010f6..2073988d33c2 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolext-sse2.asm
@@ -1,8 +1,9 @@
 ;
 ; jccolext.asm - colorspace conversion (64-bit SSE2)
 ;
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -32,21 +33,22 @@
 ; r13d = JDIMENSION output_row
 ; r14d = int num_rows
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
 %define WK_NUM  8
 
     align       32
     GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
 
 EXTN(jsimd_rgb_ycc_convert_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 5
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, (SIZEOF_XMMWORD * WK_NUM)
+    COLLECT_ARGS 5
     push        rbx
 
     mov         ecx, r10d
@@ -473,9 +475,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
 
 .return:
     pop         rbx
-    uncollect_args 5
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 5
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-avx2.asm
index 16b78298dc4f..1f069caad99f 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-avx2.asm
@@ -1,7 +1,7 @@
 ;
 ; jccolor.asm - colorspace conversion (64-bit AVX2)
 ;
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -33,7 +33,7 @@ F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
 
 EXTN(jconst_rgb_ycc_convert_avx2):
@@ -46,7 +46,7 @@ PD_ONEHALFM1_CJ times 8 dd  (1 << (SCALEBITS - 1)) - 1 + \
                             (CENTERJSAMPLE << SCALEBITS)
 PD_ONEHALF      times 8 dd  (1 << (SCALEBITS - 1))
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-sse2.asm
index e2955c213404..c0c1526d8c4d 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jccolor-sse2.asm
@@ -1,7 +1,7 @@
 ;
 ; jccolor.asm - colorspace conversion (64-bit SSE2)
 ;
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -32,7 +32,7 @@ F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
 
 EXTN(jconst_rgb_ycc_convert_sse2):
@@ -45,7 +45,7 @@ PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS - 1)) - 1 + \
                             (CENTERJSAMPLE << SCALEBITS)
 PD_ONEHALF      times 4 dd  (1 << (SCALEBITS - 1))
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-avx2.asm
index 591255bb1122..354683ca42f4 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-avx2.asm
@@ -1,7 +1,7 @@
 ;
 ; jcgray.asm - grayscale colorspace conversion (64-bit AVX2)
 ;
-; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2011, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -29,7 +29,7 @@ F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
 
 EXTN(jconst_rgb_gray_convert_avx2):
@@ -38,7 +38,7 @@ PW_F0299_F0337 times 8 dw F_0_299, F_0_337
 PW_F0114_F0250 times 8 dw F_0_114, F_0_250
 PD_ONEHALF     times 8 dd (1 << (SCALEBITS - 1))
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-sse2.asm
index e389904f2f85..d27c4b9a82eb 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgray-sse2.asm
@@ -1,7 +1,7 @@
 ;
 ; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
 ;
-; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2011, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -28,7 +28,7 @@ F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
 
 EXTN(jconst_rgb_gray_convert_sse2):
@@ -37,7 +37,7 @@ PW_F0299_F0337 times 4 dw F_0_299, F_0_337
 PW_F0114_F0250 times 4 dw F_0_114, F_0_250
 PD_ONEHALF     times 4 dd (1 << (SCALEBITS - 1))
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-avx2.asm
index ddcc2c0a2fe4..d2ae6d63a419 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-avx2.asm
@@ -1,9 +1,10 @@
 ;
 ; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
 ;
-; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2011, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,21 +34,22 @@
 ; r13d = JDIMENSION output_row
 ; r14d = int num_rows
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
 %define WK_NUM  2
 
     align       32
     GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
 
 EXTN(jsimd_rgb_gray_convert_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 5
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, byte (SIZEOF_YMMWORD * WK_NUM)
+    COLLECT_ARGS 5
     push        rbx
 
     mov         ecx, r10d
@@ -427,9 +429,9 @@ EXTN(jsimd_rgb_gray_convert_avx2):
 .return:
     pop         rbx
     vzeroupper
-    uncollect_args 5
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 5
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-sse2.asm
index f1d399a63b85..3c2834e96463 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcgryext-sse2.asm
@@ -1,8 +1,9 @@
 ;
 ; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
 ;
-; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2011, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -32,21 +33,22 @@
 ; r13d = JDIMENSION output_row
 ; r14d = int num_rows
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
 %define WK_NUM  2
 
     align       32
     GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
 
 EXTN(jsimd_rgb_gray_convert_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 5
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
+    COLLECT_ARGS 5
     push        rbx
 
     mov         ecx, r10d
@@ -352,9 +354,9 @@ EXTN(jsimd_rgb_gray_convert_sse2):
 
 .return:
     pop         rbx
-    uncollect_args 5
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 5
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jchuff-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jchuff-sse2.asm
index 9ea6df946ef9..39aa24650c19 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jchuff-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jchuff-sse2.asm
@@ -1,9 +1,10 @@
 ;
 ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
 ;
-; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
+; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, 2023-2024, D. R. Commander.
 ; Copyright (C) 2015, Matthieu Darbois.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -38,7 +39,7 @@ endstruc
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_huff_encode_one_block)
 
 EXTN(jconst_huff_encode_one_block):
@@ -48,7 +49,7 @@ jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
                dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
                dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
 
-    alignz      32
+    ALIGNZ      32
 
 times 1 << 14 db 15
 times 1 << 13 db 14
@@ -66,7 +67,8 @@ times 1 <<  2 db  3
 times 1 <<  1 db  2
 times 1 <<  0 db  1
 times 1       db  0
-jpeg_nbits_table:
+GLOBAL_DATA(jpeg_nbits_table)
+EXTN(jpeg_nbits_table):
 times 1       db  0
 times 1 <<  0 db  1
 times 1 <<  1 db  2
@@ -85,10 +87,10 @@ times 1 << 13 db 14
 times 1 << 14 db 15
 times 1 << 15 db 16
 
-    alignz      32
+    ALIGNZ      32
 
 %define NBITS(x)      nbits_base + x
-%define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
+%define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - EXTN(jpeg_nbits_table))
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -208,15 +210,15 @@ times 1 << 15 db 16
 ; rax - buffer
 ; rbx - temp
 ; rcx - nbits
-; rdx - block --> free_bits
+; rdx - code
 ; rsi - nbits_base
 ; rdi - t
-; rbp - code
 ; r8  - dctbl --> code_temp
 ; r9  - actbl
 ; r10 - state
 ; r11 - index
 ; r12 - put_buffer
+; r15 - block --> free_bits
 
 %define buffer       rax
 %ifdef WIN64
@@ -231,12 +233,11 @@ times 1 << 15 db 16
 %define nbitsq       rcx
 %define nbits        ecx
 %define nbitsb       cl
-%define block        rdx
+%define codeq        rdx
+%define code         edx
 %define nbits_base   rsi
 %define t            rdi
 %define td           edi
-%define codeq        rbp
-%define code         ebp
 %define dctbl        r8
 %define actbl        r9
 %define state        r10
@@ -244,6 +245,7 @@ times 1 << 15 db 16
 %define indexd       r11d
 %define put_buffer   r12
 %define put_bufferd  r12d
+%define block        r15
 
 ; Step 1: Re-arrange input data according to jpeg_natural_order
 ; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
@@ -259,6 +261,9 @@ times 1 << 15 db 16
     GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
 
 EXTN(jsimd_huff_encode_one_block_sse2):
+    ENDBR64
+    push        rbp
+    mov         rbp, rsp
 
 %ifdef WIN64
 
@@ -266,15 +271,15 @@ EXTN(jsimd_huff_encode_one_block_sse2):
 ; rdx = JOCTET *buffer
 ; r8 = JCOEFPTR block
 ; r9 = int last_dc_val
-; [rax+48] = c_derived_tbl *dctbl
-; [rax+56] = c_derived_tbl *actbl
+; [rbp+48] = c_derived_tbl *dctbl
+; [rbp+56] = c_derived_tbl *actbl
 
                                                           ;X: X = code stream
     mov         buffer, rdx
+    push        r15
     mov         block, r8
     movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
     push        rbx
-    push        rbp
     movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
     push        rsi
     push        rdi
@@ -284,12 +289,10 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     movsx       code, word [block]                        ;Z:     code = block[0];
     pxor        xmm4, xmm4                                ;A: w4[i] = 0;
     sub         code, r9d                                 ;Z:     code -= last_dc_val;
-    mov         dctbl, POINTER [rsp+6*8+4*8]
-    mov         actbl, POINTER [rsp+6*8+5*8]
+    mov         dctbl, POINTER [rbp+48]
+    mov         actbl, POINTER [rbp+56]
     punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
-    lea         nbits_base, [rel jpeg_nbits_table]
-    add         rsp, -DCTSIZE2 * SIZEOF_WORD
-    mov         t, rsp
+    lea         nbits_base, [rel EXTN(jpeg_nbits_table)]
 
 %else
 
@@ -301,23 +304,27 @@ EXTN(jsimd_huff_encode_one_block_sse2):
 ; r9 = c_derived_tbl *actbl
 
                                                           ;X: X = code stream
+    push        r15
+    mov         block, rdx
     movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
     push        rbx
-    push        rbp
     movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
     push        r12
     mov         state, rdi
     mov         buffer, rsi
     movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
     movsx       codeq, word [block]                       ;Z:     code = block[0];
-    lea         nbits_base, [rel jpeg_nbits_table]
+    lea         nbits_base, [rel EXTN(jpeg_nbits_table)]
     pxor        xmm4, xmm4                                ;A: w4[i] = 0;
     sub         codeq, rcx                                ;Z:     code -= last_dc_val;
     punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
-    lea         t, [rsp - DCTSIZE2 * SIZEOF_WORD]         ;   use red zone for t_
 
 %endif
 
+    ; Allocate stack space for t array, and realign stack.
+    add         rsp, -DCTSIZE2 * SIZEOF_WORD - 8
+    mov         t, rsp
+
     pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
     pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
     punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
@@ -443,9 +450,9 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
                                                           ;        (Row 4, offset 1)
 %undef block
-%define free_bitsq  rdx
-%define free_bitsd  edx
-%define free_bitsb  dl
+%define free_bitsq  r15
+%define free_bitsd  r15d
+%define free_bitsb  r15b
     pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
     shl         tempq, 48                                 ;Z:     temp <<= 48;
     pxor        xmm2, xmm2                                ;E: w2[i] = 0;
@@ -534,12 +541,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
     test        index, index
     jnz         .BLOOP                                    ;   } while (index != 0);
 .ELOOP:                                                   ; }  /* index != 0 */
-    sub         td, esp                                   ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
-%ifdef WIN64
+    sub         td, esp                                   ; t -= &t_[0];
     cmp         td, (DCTSIZE2 - 2) * SIZEOF_WORD          ; if (t != 62)
-%else
-    cmp         td, -2 * SIZEOF_WORD                      ; if (t != -2)
-%endif
     je          .EFN                                      ; {
     movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
                                                           ;   nbits = actbl->ehufsi[0];
@@ -556,18 +559,17 @@ EXTN(jsimd_huff_encode_one_block_sse2):
                                                           ; state->cur.put_buffer.simd = put_buffer;
     mov         byte [state + working_state.cur.free_bits], free_bitsb
                                                           ; state->cur.free_bits = free_bits;
-%ifdef WIN64
-    sub         rsp, -DCTSIZE2 * SIZEOF_WORD
+    sub         rsp, -DCTSIZE2 * SIZEOF_WORD - 8
     pop         r12
+%ifdef WIN64
     pop         rdi
     pop         rsi
-    pop         rbp
     pop         rbx
 %else
-    pop         r12
-    pop         rbp
     pop         rbx
 %endif
+    pop         r15
+    pop         rbp
     ret
 
 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcphuff-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcphuff-sse2.asm
index 01b5c0235faf..0e2740462e69 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcphuff-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcphuff-sse2.asm
@@ -3,6 +3,8 @@
 ; (64-bit SSE2)
 ;
 ; Copyright (C) 2016, 2018, Matthieu Darbois
+; Copyright (C) 2023, Aliaksiej Kandracienka.
+; Copyright (C) 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -281,16 +283,13 @@
     GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
 
 EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [rbp - 16]
-    collect_args 6
-
-    movdqa      XMMWORD [rbp - 16], ZERO
+    sub         rsp, SIZEOF_XMMWORD
+    movdqa      XMMWORD [rsp], ZERO
+    COLLECT_ARGS 6
 
     movd        AL, r13d
     pxor        ZERO, ZERO
@@ -384,10 +383,9 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
 
     REDUCE0
 
-    movdqa      ZERO, XMMWORD [rbp - 16]
-    uncollect_args 6
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 6
+    movdqa      ZERO, XMMWORD [rsp]
+    mov         rsp, rbp
     pop         rbp
     ret
 
@@ -449,16 +447,13 @@ EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
     GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
 
 EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [rbp - 16]
-    collect_args 6
-
-    movdqa      XMMWORD [rbp - 16], ZERO
+    sub         rsp, SIZEOF_XMMWORD
+    movdqa      XMMWORD [rsp], ZERO
+    COLLECT_ARGS 6
 
     xor         SIGN, SIGN
     xor         EOB, EOB
@@ -606,10 +601,9 @@ EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
     REDUCE0
 
     mov         eax, EOB
-    movdqa      ZERO, XMMWORD [rbp - 16]
-    uncollect_args 6
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 6
+    movdqa      ZERO, XMMWORD [rsp]
+    mov         rsp, rbp
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-avx2.asm
index b32527aebeaa..fede6b38b47b 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-avx2.asm
@@ -2,7 +2,7 @@
 ; jcsample.asm - downsampling (64-bit AVX2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ; Copyright (C) 2018, Matthias Räncker.
 ;
@@ -44,10 +44,10 @@
     GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
 
 EXTN(jsimd_h2v1_downsample_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 6
+    COLLECT_ARGS 6
 
     mov         ecx, r13d
     shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
@@ -178,7 +178,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
 
 .return:
     vzeroupper
-    uncollect_args 6
+    UNCOLLECT_ARGS 6
     pop         rbp
     ret
 
@@ -206,10 +206,10 @@ EXTN(jsimd_h2v1_downsample_avx2):
     GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
 
 EXTN(jsimd_h2v2_downsample_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 6
+    COLLECT_ARGS 6
 
     mov         ecx, r13d
     shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
@@ -358,7 +358,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
 
 .return:
     vzeroupper
-    uncollect_args 6
+    UNCOLLECT_ARGS 6
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-sse2.asm
index 2fcfe4567ab9..0a0ee65e5a1a 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jcsample-sse2.asm
@@ -2,7 +2,7 @@
 ; jcsample.asm - downsampling (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -43,10 +43,10 @@
     GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
 
 EXTN(jsimd_h2v1_downsample_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 6
+    COLLECT_ARGS 6
 
     mov         ecx, r13d
     shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
@@ -160,7 +160,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
     jg          near .rowloop
 
 .return:
-    uncollect_args 6
+    UNCOLLECT_ARGS 6
     pop         rbp
     ret
 
@@ -188,10 +188,10 @@ EXTN(jsimd_h2v1_downsample_sse2):
     GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
 
 EXTN(jsimd_h2v2_downsample_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 6
+    COLLECT_ARGS 6
 
     mov         ecx, r13d
     shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
@@ -321,7 +321,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
     jg          near .rowloop
 
 .return:
-    uncollect_args 6
+    UNCOLLECT_ARGS 6
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-avx2.asm
index 2370fda64249..a8384cb5602b 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-avx2.asm
@@ -2,9 +2,10 @@
 ; jdcolext.asm - colorspace conversion (64-bit AVX2)
 ;
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -34,21 +35,22 @@
 ; r13 = JSAMPARRAY output_buf
 ; r14d = int num_rows
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
 %define WK_NUM  2
 
     align       32
     GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
 
 EXTN(jsimd_ycc_rgb_convert_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 5
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, byte (WK_NUM * SIZEOF_YMMWORD)
+    COLLECT_ARGS 5
     push        rbx
 
     mov         ecx, r10d               ; num_cols
@@ -485,9 +487,9 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
 .return:
     pop         rbx
     vzeroupper
-    uncollect_args 5
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 5
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-sse2.asm
index e07c8d75188c..bfb59abf1232 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolext-sse2.asm
@@ -2,8 +2,9 @@
 ; jdcolext.asm - colorspace conversion (64-bit SSE2)
 ;
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,21 +34,22 @@
 ; r13 = JSAMPARRAY output_buf
 ; r14d = int num_rows
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
 %define WK_NUM  2
 
     align       32
     GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
 
 EXTN(jsimd_ycc_rgb_convert_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 5
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
+    COLLECT_ARGS 5
     push        rbx
 
     mov         ecx, r10d               ; num_cols
@@ -428,9 +430,9 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 
 .return:
     pop         rbx
-    uncollect_args 5
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 5
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-avx2.asm
index 43de9db04dc6..4d52a0f16a11 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-avx2.asm
@@ -2,7 +2,7 @@
 ; jdcolor.asm - colorspace conversion (64-bit AVX2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -32,7 +32,7 @@ F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
 
 EXTN(jconst_ycc_rgb_convert_avx2):
@@ -43,7 +43,7 @@ PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
 PW_ONE          times 16 dw  1
 PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-sse2.asm
index b3f1fec07eb5..93d3c8dd4a07 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdcolor-sse2.asm
@@ -2,7 +2,7 @@
 ; jdcolor.asm - colorspace conversion (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -31,7 +31,7 @@ F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
 
 EXTN(jconst_ycc_rgb_convert_sse2):
@@ -42,7 +42,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
 PW_ONE          times 8 dw  1
 PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-avx2.asm
index 9515a17013d3..4be435624e4f 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-avx2.asm
@@ -2,7 +2,7 @@
 ; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -32,7 +32,7 @@ F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_merged_upsample_avx2)
 
 EXTN(jconst_merged_upsample_avx2):
@@ -43,7 +43,7 @@ PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
 PW_ONE          times 16 dw  1
 PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-sse2.asm
index aedccc20f6c0..a22f6ac733c5 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmerge-sse2.asm
@@ -2,7 +2,7 @@
 ; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -31,7 +31,7 @@ F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_merged_upsample_sse2)
 
 EXTN(jconst_merged_upsample_sse2):
@@ -42,7 +42,7 @@ PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
 PW_ONE          times 8 dw  1
 PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-avx2.asm
index 8b264b4f039f..3392f3a38344 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-avx2.asm
@@ -2,9 +2,10 @@
 ; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2)
 ;
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -34,21 +35,22 @@
 ; r12d = JDIMENSION in_row_group_ctr
 ; r13 = JSAMPARRAY output_buf
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
 %define WK_NUM  3
 
     align       32
     GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
 
 EXTN(jsimd_h2v1_merged_upsample_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 4
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, SIZEOF_YMMWORD * WK_NUM
+    COLLECT_ARGS 4
     push        rbx
 
     mov         ecx, r10d               ; col
@@ -479,9 +481,9 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
 .return:
     pop         rbx
     vzeroupper
-    uncollect_args 4
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 4
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
@@ -505,10 +507,10 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
 
 EXTN(jsimd_h2v2_merged_upsample_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 4
+    COLLECT_ARGS 4
     push        rbx
 
     mov         eax, r10d
@@ -587,7 +589,7 @@ EXTN(jsimd_h2v2_merged_upsample_avx2):
     add         rsp, SIZEOF_JSAMPARRAY*4
 
     pop         rbx
-    uncollect_args 4
+    UNCOLLECT_ARGS 4
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-sse2.asm
index eb3ab9dbd945..901db984f963 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdmrgext-sse2.asm
@@ -2,8 +2,9 @@
 ; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
 ;
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2009, 2012, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -33,21 +34,22 @@
 ; r12d = JDIMENSION in_row_group_ctr
 ; r13 = JSAMPARRAY output_buf
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
 %define WK_NUM  3
 
     align       32
     GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
 
 EXTN(jsimd_h2v1_merged_upsample_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 4
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
+    COLLECT_ARGS 4
     push        rbx
 
     mov         ecx, r10d               ; col
@@ -421,9 +423,9 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 
 .return:
     pop         rbx
-    uncollect_args 4
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 4
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
@@ -447,10 +449,10 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
     GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
 
 EXTN(jsimd_h2v2_merged_upsample_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 4
+    COLLECT_ARGS 4
     push        rbx
 
     mov         eax, r10d
@@ -529,7 +531,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
     add         rsp, SIZEOF_JSAMPARRAY*4
 
     pop         rbx
-    uncollect_args 4
+    UNCOLLECT_ARGS 4
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-avx2.asm
index 1e4979f933e4..017427a15890 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-avx2.asm
@@ -2,9 +2,10 @@
 ; jdsample.asm - upsampling (64-bit AVX2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -21,7 +22,7 @@
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fancy_upsample_avx2)
 
 EXTN(jconst_fancy_upsample_avx2):
@@ -32,7 +33,7 @@ PW_THREE times 16 dw 3
 PW_SEVEN times 16 dw 7
 PW_EIGHT times 16 dw 8
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -61,11 +62,11 @@ PW_EIGHT times 16 dw 8
     GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
 
 EXTN(jsimd_h2v1_fancy_upsample_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    push_xmm    3
-    collect_args 4
+    PUSH_XMM    3
+    COLLECT_ARGS 4
 
     mov         eax, r11d               ; colctr
     test        rax, rax
@@ -186,8 +187,8 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
 
 .return:
     vzeroupper
-    uncollect_args 4
-    pop_xmm     3
+    UNCOLLECT_ARGS 4
+    POP_XMM     3
     pop         rbp
     ret
 
@@ -208,22 +209,23 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
 ; r12 = JSAMPARRAY input_data
 ; r13 = JSAMPARRAY *output_data_ptr
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
 %define WK_NUM  4
 
     align       32
     GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
 
 EXTN(jsimd_h2v2_fancy_upsample_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
-    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    push_xmm    3
-    collect_args 4
+    mov         rbp, rsp
+    push        r15
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 128 bits
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, (SIZEOF_YMMWORD * WK_NUM)
+    PUSH_XMM    3
+    COLLECT_ARGS 4
     push        rbx
 
     mov         eax, r11d               ; colctr
@@ -498,10 +500,10 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
 .return:
     pop         rbx
     vzeroupper
-    uncollect_args 4
-    pop_xmm     3
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 4
+    POP_XMM     3
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
@@ -524,10 +526,10 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
     GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
 
 EXTN(jsimd_h2v1_upsample_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 4
+    COLLECT_ARGS 4
 
     mov         edx, r11d
     add         rdx, byte (SIZEOF_YMMWORD-1)
@@ -590,7 +592,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
 
 .return:
     vzeroupper
-    uncollect_args 4
+    UNCOLLECT_ARGS 4
     pop         rbp
     ret
 
@@ -613,10 +615,10 @@ EXTN(jsimd_h2v1_upsample_avx2):
     GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
 
 EXTN(jsimd_h2v2_upsample_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 4
+    COLLECT_ARGS 4
     push        rbx
 
     mov         edx, r11d
@@ -687,7 +689,7 @@ EXTN(jsimd_h2v2_upsample_avx2):
 .return:
     pop         rbx
     vzeroupper
-    uncollect_args 4
+    UNCOLLECT_ARGS 4
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-sse2.asm
index 38dbceec269d..95c4d4c9eded 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jdsample-sse2.asm
@@ -2,8 +2,9 @@
 ; jdsample.asm - upsampling (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,7 +21,7 @@
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fancy_upsample_sse2)
 
 EXTN(jconst_fancy_upsample_sse2):
@@ -31,7 +32,7 @@ PW_THREE times 8 dw 3
 PW_SEVEN times 8 dw 7
 PW_EIGHT times 8 dw 8
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -60,10 +61,10 @@ PW_EIGHT times 8 dw 8
     GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
 
 EXTN(jsimd_h2v1_fancy_upsample_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 4
+    COLLECT_ARGS 4
 
     mov         eax, r11d               ; colctr
     test        rax, rax
@@ -174,7 +175,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
     jg          near .rowloop
 
 .return:
-    uncollect_args 4
+    UNCOLLECT_ARGS 4
     pop         rbp
     ret
 
@@ -195,21 +196,22 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
 ; r12 = JSAMPARRAY input_data
 ; r13 = JSAMPARRAY *output_data_ptr
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
 %define WK_NUM  4
 
     align       32
     GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
 
 EXTN(jsimd_h2v2_fancy_upsample_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 4
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
+    COLLECT_ARGS 4
     push        rbx
 
     mov         eax, r11d               ; colctr
@@ -472,9 +474,9 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
 
 .return:
     pop         rbx
-    uncollect_args 4
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 4
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
@@ -497,10 +499,10 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
     GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
 
 EXTN(jsimd_h2v1_upsample_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 4
+    COLLECT_ARGS 4
 
     mov         edx, r11d
     add         rdx, byte (2*SIZEOF_XMMWORD)-1
@@ -561,7 +563,7 @@ EXTN(jsimd_h2v1_upsample_sse2):
     jg          short .rowloop
 
 .return:
-    uncollect_args 4
+    UNCOLLECT_ARGS 4
     pop         rbp
     ret
 
@@ -584,10 +586,10 @@ EXTN(jsimd_h2v1_upsample_sse2):
     GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
 
 EXTN(jsimd_h2v2_upsample_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 4
+    COLLECT_ARGS 4
     push        rbx
 
     mov         edx, r11d
@@ -656,7 +658,7 @@ EXTN(jsimd_h2v2_upsample_sse2):
 
 .return:
     pop         rbx
-    uncollect_args 4
+    UNCOLLECT_ARGS 4
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctflt-sse.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctflt-sse.asm
index ef2796649bc6..cf46d93d6157 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctflt-sse.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctflt-sse.asm
@@ -2,7 +2,8 @@
 ; jfdctflt.asm - floating-point FDCT (64-bit SSE)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -34,7 +35,7 @@
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fdct_float_sse)
 
 EXTN(jconst_fdct_float_sse):
@@ -44,7 +45,7 @@ PD_0_707 times 4 dd 0.707106781186547524400844
 PD_0_541 times 4 dd 0.541196100146196984399723
 PD_1_306 times 4 dd 1.306562964876376527856643
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -58,21 +59,22 @@ PD_1_306 times 4 dd 1.306562964876376527856643
 
 ; r10 = FAST_FLOAT *data
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
 %define WK_NUM  2
 
     align       32
     GLOBAL_FUNCTION(jsimd_fdct_float_sse)
 
 EXTN(jsimd_fdct_float_sse):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 1
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
+    COLLECT_ARGS 1
 
     ; ---- Pass 1: process rows.
 
@@ -344,9 +346,9 @@ EXTN(jsimd_fdct_float_sse):
     dec         rcx
     jnz         near .columnloop
 
-    uncollect_args 1
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 1
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctfst-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctfst-sse2.asm
index 2e1bfe6e8c2f..cdc62365857e 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctfst-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctfst-sse2.asm
@@ -2,7 +2,8 @@
 ; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -49,7 +50,7 @@ F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
 %define PRE_MULTIPLY_SCALE_BITS  2
 %define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fdct_ifast_sse2)
 
 EXTN(jconst_fdct_ifast_sse2):
@@ -59,7 +60,7 @@ PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
 PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
 PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -73,21 +74,22 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
 
 ; r10 = DCTELEM *data
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
 %define WK_NUM  2
 
     align       32
     GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
 
 EXTN(jsimd_fdct_ifast_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 1
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
+    COLLECT_ARGS 1
 
     ; ---- Pass 1: process rows.
 
@@ -378,9 +380,9 @@ EXTN(jsimd_fdct_ifast_sse2):
     movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
     movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
 
-    uncollect_args 1
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 1
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-avx2.asm
index e56258b48aa3..b6b4c73a509c 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-avx2.asm
@@ -2,7 +2,7 @@
 ; jfdctint.asm - accurate integer FDCT (64-bit AVX2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2009, 2016, 2018, 2020, 2024, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -65,7 +65,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; %1-%4: Input/output registers
 ; %5-%8: Temp registers
 
-%macro dotranspose 8
+%macro DOTRANSPOSE 8
     ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
     ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
     ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
@@ -108,7 +108,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; %5-%8: Temp registers
 ; %9:    Pass (1 or 2)
 
-%macro dodct 9
+%macro DODCT 9
     vpsubw      %5, %1, %4              ; %5=data1_0-data6_7=tmp6_7
     vpaddw      %6, %1, %4              ; %6=data1_0+data6_7=tmp1_0
     vpaddw      %7, %2, %3              ; %7=data3_2+data4_5=tmp3_2
@@ -223,7 +223,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fdct_islow_avx2)
 
 EXTN(jconst_fdct_islow_avx2):
@@ -242,7 +242,7 @@ PW_DESCALE_P2X             times 16 dw  1 << (PASS1_BITS - 1)
 PW_1_NEG1                  times 8  dw  1
                            times 8  dw -1
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -260,10 +260,10 @@ PW_1_NEG1                  times 8  dw  1
     GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
 
 EXTN(jsimd_fdct_islow_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 1
+    COLLECT_ARGS 1
 
     ; ---- Pass 1: process rows.
 
@@ -285,9 +285,9 @@ EXTN(jsimd_fdct_islow_avx2):
     ; ymm2=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
     ; ymm3=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
 
-    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    DOTRANSPOSE ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
 
-    dodct       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+    DODCT       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
     ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
 
     ; ---- Pass 2: process columns.
@@ -295,9 +295,9 @@ EXTN(jsimd_fdct_islow_avx2):
     vperm2i128  ymm4, ymm1, ymm3, 0x20  ; ymm4=data3_7
     vperm2i128  ymm1, ymm1, ymm3, 0x31  ; ymm1=data1_5
 
-    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+    DOTRANSPOSE ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
 
-    dodct       ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+    DODCT       ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
     ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
 
     vperm2i128 ymm3, ymm0, ymm1, 0x30   ; ymm3=data0_1
@@ -311,7 +311,7 @@ EXTN(jsimd_fdct_islow_avx2):
     vmovdqu     YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7
 
     vzeroupper
-    uncollect_args 1
+    UNCOLLECT_ARGS 1
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-sse2.asm
index ec1f383ccb73..44e7cd05546b 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jfdctint-sse2.asm
@@ -2,7 +2,8 @@
 ; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+; Copyright (C) 2009, 2016, 2020, 2024, D. R. Commander.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -63,7 +64,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_fdct_islow_sse2)
 
 EXTN(jconst_fdct_islow_sse2):
@@ -80,7 +81,7 @@ PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1 - 1)
 PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2 - 1)
 PW_DESCALE_P2X times 8 dw  1 << (PASS1_BITS - 1)
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -94,21 +95,22 @@ PW_DESCALE_P2X times 8 dw  1 << (PASS1_BITS - 1)
 
 ; r10 = DCTELEM *data
 
-%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
 %define WK_NUM  6
 
     align       32
     GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
 
 EXTN(jsimd_fdct_islow_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 1
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
+    COLLECT_ARGS 1
 
     ; ---- Pass 1: process rows.
 
@@ -608,9 +610,9 @@ EXTN(jsimd_fdct_islow_sse2):
     movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
     movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
 
-    uncollect_args 1
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 1
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctflt-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctflt-sse2.asm
index 60bf96189613..c7cb39a0729a 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctflt-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctflt-sse2.asm
@@ -2,8 +2,9 @@
 ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -24,18 +25,18 @@
 
 ; --------------------------------------------------------------------------
 
-%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+%macro UNPCKLPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
     shufps      %1, %2, 0x44
 %endmacro
 
-%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+%macro UNPCKHPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
     shufps      %1, %2, 0xEE
 %endmacro
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_float_sse2)
 
 EXTN(jconst_idct_float_sse2):
@@ -47,7 +48,7 @@ PD_M2_613       times 4  dd -2.613125929752753055713286
 PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
 PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -65,8 +66,7 @@ PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
 ; r12 = JSAMPARRAY output_buf
 ; r13d = JDIMENSION output_col
 
-%define original_rbp  rbp + 0
-%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+%define wk(i)         r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                         ; xmmword wk[WK_NUM]
 %define WK_NUM        2
 %define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
@@ -76,14 +76,15 @@ PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
     GLOBAL_FUNCTION(jsimd_idct_float_sse2)
 
 EXTN(jsimd_idct_float_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
     lea         rsp, [workspace]
-    collect_args 4
+    COLLECT_ARGS 4
     push        rbx
 
     ; ---- Pass 1: process columns from input, store into work array.
@@ -280,11 +281,11 @@ EXTN(jsimd_idct_float_sse2):
     unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
 
     movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
-    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
-    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    UNPCKLPS2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    UNPCKHPS2   xmm3, xmm7              ; xmm3=(01 11 21 31)
     movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
-    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
-    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+    UNPCKLPS2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    UNPCKHPS2   xmm0, xmm2              ; xmm0=(03 13 23 33)
 
     movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
     movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
@@ -295,11 +296,11 @@ EXTN(jsimd_idct_float_sse2):
     movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
 
     movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
-    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
-    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    UNPCKLPS2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    UNPCKHPS2   xmm6, xmm7              ; xmm6=(41 51 61 71)
     movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
-    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
-    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+    UNPCKLPS2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    UNPCKHPS2   xmm3, xmm2              ; xmm3=(43 53 63 73)
 
     movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
     movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
@@ -322,7 +323,6 @@ EXTN(jsimd_idct_float_sse2):
 
     ; ---- Pass 2: process rows from work array, store into output array.
 
-    mov         rax, [original_rbp]
     lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
     mov         rdi, r12                ; (JSAMPROW *)
     mov         eax, r13d
@@ -471,9 +471,9 @@ EXTN(jsimd_idct_float_sse2):
     jnz         near .rowloop
 
     pop         rbx
-    uncollect_args 4
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 4
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctfst-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctfst-sse2.asm
index cb97fdfbb246..fd3bc32c1687 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctfst-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctfst-sse2.asm
@@ -2,8 +2,9 @@
 ; jidctfst.asm - fast integer IDCT (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,7 +58,7 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS))         ; FIX(2.613125930) - FIX(1)
 %define PRE_MULTIPLY_SCALE_BITS  2
 %define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_ifast_sse2)
 
 EXTN(jconst_idct_ifast_sse2):
@@ -68,7 +69,7 @@ PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT
 PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT
 PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -86,8 +87,7 @@ PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
 ; r12 = JSAMPARRAY output_buf
 ; r13d = JDIMENSION output_col
 
-%define original_rbp  rbp + 0
-%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+%define wk(i)         r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                         ; xmmword wk[WK_NUM]
 %define WK_NUM        2
 
@@ -95,14 +95,15 @@ PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
     GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
 
 EXTN(jsimd_idct_ifast_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 4
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
+    COLLECT_ARGS 4
 
     ; ---- Pass 1: process columns from input.
 
@@ -320,7 +321,6 @@ EXTN(jsimd_idct_ifast_sse2):
 
     ; ---- Pass 2: process rows from work array, store into output array.
 
-    mov         rax, [original_rbp]
     mov         rdi, r12                ; (JSAMPROW *)
     mov         eax, r13d
 
@@ -479,9 +479,9 @@ EXTN(jsimd_idct_ifast_sse2):
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
 
-    uncollect_args 4
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 4
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
     ret
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-avx2.asm
index ca7e317f6e1b..84d125bd4353 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-avx2.asm
@@ -2,7 +2,7 @@
 ; jidctint.asm - accurate integer IDCT (64-bit AVX2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2009, 2016, 2018, 2020, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -66,7 +66,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; %1-%4: Input/output registers
 ; %5-%8: Temp registers
 
-%macro dotranspose 8
+%macro DOTRANSPOSE 8
     ; %5=(00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71)
     ; %6=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
     ; %7=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
@@ -119,7 +119,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; %5-%12: Temp registers
 ; %9:     Pass (1 or 2)
 
-%macro dodct 13
+%macro DODCT 13
     ; -- Even part
 
     ; (Original)
@@ -241,7 +241,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_islow_avx2)
 
 EXTN(jconst_idct_islow_avx2):
@@ -260,7 +260,7 @@ PB_CENTERJSAMP             times 32 db  CENTERJSAMPLE
 PW_1_NEG1                  times 8  dw  1
                            times 8  dw -1
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -282,11 +282,11 @@ PW_1_NEG1                  times 8  dw  1
     GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
 
 EXTN(jsimd_idct_islow_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
     mov         rbp, rsp                     ; rbp = aligned rbp
-    push_xmm    4
-    collect_args 4
+    PUSH_XMM    4
+    COLLECT_ARGS 4
 
     ; ---- Pass 1: process columns.
 
@@ -343,10 +343,10 @@ EXTN(jsimd_idct_islow_avx2):
     vperm2i128  ymm2, ymm5, ymm7, 0x20  ; ymm2=in2_6
     vperm2i128  ymm3, ymm7, ymm6, 0x31  ; ymm3=in7_5
 
-    dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1
+    DODCT ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1
     ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
 
-    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    DOTRANSPOSE ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
     ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
 
 .column_end:
@@ -363,10 +363,10 @@ EXTN(jsimd_idct_islow_avx2):
     vperm2i128  ymm4, ymm3, ymm1, 0x31  ; ymm3=in7_5
     vperm2i128  ymm1, ymm3, ymm1, 0x20  ; ymm1=in3_1
 
-    dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2
+    DODCT ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2
     ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
 
-    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+    DOTRANSPOSE ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
     ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
 
     vpacksswb   ymm0, ymm0, ymm1        ; ymm0=data01_45
@@ -408,8 +408,8 @@ EXTN(jsimd_idct_islow_avx2):
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
 
-    uncollect_args 4
-    pop_xmm     4
+    UNCOLLECT_ARGS 4
+    POP_XMM     4
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-sse2.asm
index 7aa869bc0b51..3f098b2c50e8 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctint-sse2.asm
@@ -2,8 +2,9 @@
 ; jidctint.asm - accurate integer IDCT (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+; Copyright (C) 2009, 2016, 2020, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -64,7 +65,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_islow_sse2)
 
 EXTN(jconst_idct_islow_sse2):
@@ -81,7 +82,7 @@ PD_DESCALE_P1  times 4  dd  1 << (DESCALE_P1 - 1)
 PD_DESCALE_P2  times 4  dd  1 << (DESCALE_P2 - 1)
 PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -99,8 +100,7 @@ PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
 ; r12 = JSAMPARRAY output_buf
 ; r13d = JDIMENSION output_col
 
-%define original_rbp  rbp + 0
-%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+%define wk(i)         r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                         ; xmmword wk[WK_NUM]
 %define WK_NUM        12
 
@@ -108,14 +108,15 @@ PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
     GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
 
 EXTN(jsimd_idct_islow_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 4
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, (SIZEOF_XMMWORD * WK_NUM)
+    COLLECT_ARGS 4
 
     ; ---- Pass 1: process columns from input.
 
@@ -512,7 +513,6 @@ EXTN(jsimd_idct_islow_sse2):
 
     ; ---- Pass 2: process rows from work array, store into output array.
 
-    mov         rax, [original_rbp]
     mov         rdi, r12                ; (JSAMPROW *)
     mov         eax, r13d
 
@@ -836,9 +836,9 @@ EXTN(jsimd_idct_islow_sse2):
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
 
-    uncollect_args 4
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 4
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctred-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctred-sse2.asm
index 4ece9d891cbd..2657cf3cb135 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctred-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jidctred-sse2.asm
@@ -2,8 +2,9 @@
 ; jidctred.asm - reduced-size IDCT (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -70,7 +71,7 @@ F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
+    ALIGNZ      32
     GLOBAL_DATA(jconst_idct_red_sse2)
 
 EXTN(jconst_idct_red_sse2):
@@ -88,7 +89,7 @@ PD_DESCALE_P1_2 times 4  dd  1 << (DESCALE_P1_2 - 1)
 PD_DESCALE_P2_2 times 4  dd  1 << (DESCALE_P2_2 - 1)
 PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
 
-    alignz      32
+    ALIGNZ      32
 
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
@@ -107,8 +108,7 @@ PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
 ; r12 = JSAMPARRAY output_buf
 ; r13d = JDIMENSION output_col
 
-%define original_rbp  rbp + 0
-%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+%define wk(i)         r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                         ; xmmword wk[WK_NUM]
 %define WK_NUM        2
 
@@ -116,14 +116,15 @@ PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
     GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
 
 EXTN(jsimd_idct_4x4_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
+    mov         rbp, rsp
+    push        r15
     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [wk(0)]
-    collect_args 4
+    ; Allocate stack space for wk array.  r15 is used to access it.
+    mov         r15, rsp
+    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
+    COLLECT_ARGS 4
 
     ; ---- Pass 1: process columns from input.
 
@@ -309,7 +310,6 @@ EXTN(jsimd_idct_4x4_sse2):
 
     ; ---- Pass 2: process rows, store into output array.
 
-    mov         rax, [original_rbp]
     mov         rdi, r12                ; (JSAMPROW *)
     mov         eax, r13d
 
@@ -389,9 +389,9 @@ EXTN(jsimd_idct_4x4_sse2):
     movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
     movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
 
-    uncollect_args 4
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    UNCOLLECT_ARGS 4
+    lea         rsp, [rbp-8]
+    pop         r15
     pop         rbp
     ret
 
@@ -414,10 +414,10 @@ EXTN(jsimd_idct_4x4_sse2):
     GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
 
 EXTN(jsimd_idct_2x2_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 4
+    COLLECT_ARGS 4
     push        rbx
 
     ; ---- Pass 1: process columns from input.
@@ -565,7 +565,7 @@ EXTN(jsimd_idct_2x2_sse2):
     mov         word [rsi+rax*SIZEOF_JSAMPLE], cx
 
     pop         rbx
-    uncollect_args 4
+    UNCOLLECT_ARGS 4
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jquantf-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquantf-sse2.asm
index ab2e3954f633..8bd79662e605 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jquantf-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquantf-sse2.asm
@@ -2,7 +2,7 @@
 ; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -37,10 +37,10 @@
     GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
 
 EXTN(jsimd_convsamp_float_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 3
+    COLLECT_ARGS 3
     push        rbx
 
     pcmpeqw     xmm7, xmm7
@@ -89,7 +89,7 @@ EXTN(jsimd_convsamp_float_sse2):
     jnz         short .convloop
 
     pop         rbx
-    uncollect_args 3
+    UNCOLLECT_ARGS 3
     pop         rbp
     ret
 
@@ -110,10 +110,10 @@ EXTN(jsimd_convsamp_float_sse2):
     GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
 
 EXTN(jsimd_quantize_float_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 3
+    COLLECT_ARGS 3
 
     mov         rsi, r12
     mov         rdx, r11
@@ -146,7 +146,7 @@ EXTN(jsimd_quantize_float_sse2):
     dec         rax
     jnz         short .quantloop
 
-    uncollect_args 3
+    UNCOLLECT_ARGS 3
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-avx2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-avx2.asm
index 70fe81139cc2..c8ebd7966b69 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-avx2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-avx2.asm
@@ -2,7 +2,7 @@
 ; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+; Copyright (C) 2009, 2016, 2018, 2024, D. R. Commander.
 ; Copyright (C) 2016, Matthieu Darbois.
 ; Copyright (C) 2018, Matthias Räncker.
 ;
@@ -38,10 +38,10 @@
     GLOBAL_FUNCTION(jsimd_convsamp_avx2)
 
 EXTN(jsimd_convsamp_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 3
+    COLLECT_ARGS 3
 
     mov         eax, r11d
 
@@ -84,7 +84,7 @@ EXTN(jsimd_convsamp_avx2):
     vmovdqu     YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3
 
     vzeroupper
-    uncollect_args 3
+    UNCOLLECT_ARGS 3
     pop         rbp
     ret
 
@@ -116,10 +116,10 @@ EXTN(jsimd_convsamp_avx2):
     GLOBAL_FUNCTION(jsimd_quantize_avx2)
 
 EXTN(jsimd_quantize_avx2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 3
+    COLLECT_ARGS 3
 
     vmovdqu     ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]
     vmovdqu     ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
@@ -154,7 +154,7 @@ EXTN(jsimd_quantize_avx2):
     vmovdqu     [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
 
     vzeroupper
-    uncollect_args 3
+    UNCOLLECT_ARGS 3
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-sse2.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-sse2.asm
index 3ee442027a5a..352d74055c62 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-sse2.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jquanti-sse2.asm
@@ -2,7 +2,7 @@
 ; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2024, D. R. Commander.
 ; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
@@ -37,10 +37,10 @@
     GLOBAL_FUNCTION(jsimd_convsamp_sse2)
 
 EXTN(jsimd_convsamp_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 3
+    COLLECT_ARGS 3
     push        rbx
 
     pxor        xmm6, xmm6              ; xmm6=(all 0's)
@@ -84,7 +84,7 @@ EXTN(jsimd_convsamp_sse2):
     jnz         short .convloop
 
     pop         rbx
-    uncollect_args 3
+    UNCOLLECT_ARGS 3
     pop         rbp
     ret
 
@@ -116,10 +116,10 @@ EXTN(jsimd_convsamp_sse2):
     GLOBAL_FUNCTION(jsimd_quantize_sse2)
 
 EXTN(jsimd_quantize_sse2):
+    ENDBR64
     push        rbp
-    mov         rax, rsp
     mov         rbp, rsp
-    collect_args 3
+    COLLECT_ARGS 3
 
     mov         rsi, r12
     mov         rdx, r11
@@ -179,7 +179,7 @@ EXTN(jsimd_quantize_sse2):
     dec         rax
     jnz         near .quantloop
 
-    uncollect_args 3
+    UNCOLLECT_ARGS 3
     pop         rbp
     ret
 
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimd.c b/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimd.c
index 584a010ad348..3f5ee77eb99b 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimd.c
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimd.c
@@ -2,8 +2,8 @@
  * jsimd_x86_64.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022-2023, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -21,7 +21,6 @@
 #include "../../jdct.h"
 #include "../../jsimddct.h"
 #include "../jsimd.h"
-#include "jconfigint.h"
 
 /*
  * In the PIC cases, we have no guarantee that constants will keep
@@ -32,13 +31,11 @@
 #define IS_ALIGNED_SSE(ptr)  (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
 #define IS_ALIGNED_AVX(ptr)  (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
 
-static unsigned int simd_support = (unsigned int)(~0);
-static unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0);
+static THREAD_LOCAL unsigned int simd_huffman = 1;
 
 /*
  * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
  */
 LOCAL(void)
 init_simd(void)
@@ -148,6 +145,9 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
   void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
   void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_extrgb_ycc_convert_avx2;
@@ -197,6 +197,9 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
   void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
   void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_extrgb_gray_convert_avx2;
@@ -246,6 +249,9 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
   void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_ycc_extrgb_convert_avx2;
@@ -336,6 +342,9 @@ GLOBAL(void)
 jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
                                compptr->v_samp_factor,
@@ -352,6 +361,9 @@ GLOBAL(void)
 jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
                                compptr->v_samp_factor,
@@ -406,6 +418,9 @@ GLOBAL(void)
 jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
                              input_data, output_data_ptr);
@@ -418,6 +433,9 @@ GLOBAL(void)
 jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
                              input_data, output_data_ptr);
@@ -472,6 +490,9 @@ GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
                                    compptr->downsampled_width, input_data,
@@ -486,6 +507,9 @@ GLOBAL(void)
 jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
                                    compptr->downsampled_width, input_data,
@@ -545,6 +569,9 @@ jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
   void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
@@ -593,6 +620,9 @@ jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
   void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
@@ -682,6 +712,9 @@ GLOBAL(void)
 jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
                DCTELEM *workspace)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_convsamp_avx2(sample_data, start_col, workspace);
   else
@@ -751,6 +784,9 @@ jsimd_can_fdct_float(void)
 GLOBAL(void)
 jsimd_fdct_islow(DCTELEM *data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_fdct_islow_avx2(data);
   else
@@ -812,6 +848,9 @@ jsimd_can_quantize_float(void)
 GLOBAL(void)
 jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_quantize_avx2(coef_block, divisors, workspace);
   else
@@ -966,6 +1005,9 @@ jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
                           output_col);
@@ -1036,7 +1078,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
 GLOBAL(void)
 jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
-                                  int Al, JCOEF *values, size_t *zerobits)
+                                  int Al, UJCOEF *values, size_t *zerobits)
 {
   jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
                                          Sl, Al, values, zerobits);
@@ -1060,7 +1102,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
 GLOBAL(int)
 jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
-                                   int Al, JCOEF *absvalues, size_t *bits)
+                                   int Al, UJCOEF *absvalues, size_t *bits)
 {
   return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
                                                  jpeg_natural_order_start,
diff --git a/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimdcpu.asm b/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimdcpu.asm
index 705f813d7da6..251bc4cdae29 100644
--- a/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimdcpu.asm
+++ b/3rdparty/libjpeg-turbo/src/simd/x86_64/jsimdcpu.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2023, Aliaksiej Kandracienka.
 ;
 ; Based on
 ; x86 SIMD extension for IJG JPEG library
@@ -31,6 +32,8 @@
     GLOBAL_FUNCTION(jpeg_simd_cpu_support)
 
 EXTN(jpeg_simd_cpu_support):
+    push        rbp
+    mov         rbp, rsp
     push        rbx
     push        rdi
 
@@ -79,6 +82,7 @@ EXTN(jpeg_simd_cpu_support):
 
     pop         rdi
     pop         rbx
+    pop         rbp
     ret
 
 ; For some reason, the OS X linker does not honor the request to align the
diff --git a/3rdparty/libjpeg-turbo/src/structure.txt b/3rdparty/libjpeg-turbo/src/structure.txt
new file mode 100644
index 000000000000..030b8e8801bd
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/structure.txt
@@ -0,0 +1,981 @@
+IJG JPEG LIBRARY:  SYSTEM ARCHITECTURE
+
+This file was part of the Independent JPEG Group's software:
+Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
+Lossless JPEG Modifications:
+Copyright (C) 1999, Ken Murchison.
+libjpeg-turbo Modifications:
+Copyright (C) 2022-2023, D. R. Commander.
+For conditions of distribution and use, see the accompanying README.ijg file.
+
+
+This file provides an overview of the architecture of the IJG JPEG software;
+that is, the functions of the various modules in the system and the interfaces
+between modules.  For more precise details about any data structure or calling
+convention, see the include files and comments in the source code.
+
+We assume that the reader is already somewhat familiar with the JPEG standard.
+The README.ijg file includes references for learning about JPEG.  The file
+libjpeg.txt describes the library from the viewpoint of an application
+programmer using the library; it's best to read that file before this one.
+Also, the file coderules.txt describes the coding style conventions we use.
+
+In this document, JPEG-specific terminology follows the JPEG standard:
+  A "component" means a color channel, e.g., Red or Luminance.
+  A "sample" is a single component value (i.e., one number in the image data).
+  A "coefficient" is a frequency coefficient (a DCT transform output number).
+  A "block" is an 8x8 group of samples or coefficients.
+  A "data unit" is an abstract data type that is either a block for lossy
+        (DCT-based) codecs or a sample for lossless (predictive) codecs.
+  An "MCU" (minimum coded unit) is an interleaved set of data units of size
+        determined by the sampling factors, or a single data unit in a
+        noninterleaved scan.
+We do not use the terms "pixel" and "sample" interchangeably.  When we say
+pixel, we mean an element of the full-size image, while a sample is an element
+of the downsampled image.  Thus the number of samples may vary across
+components while the number of pixels does not.  (This terminology is not used
+rigorously throughout the code, but it is used in places where confusion would
+otherwise result.)
+
+
+*** System features ***
+
+The IJG distribution contains two parts:
+  * A subroutine library for JPEG compression and decompression.
+  * cjpeg/djpeg, two sample applications that use the library to transform
+    JFIF JPEG files to and from several other image formats.
+cjpeg/djpeg are of no great intellectual complexity: they merely add a simple
+command-line user interface and I/O routines for several uncompressed image
+formats.  This document concentrates on the library itself.
+
+We desire the library to be capable of supporting all JPEG baseline, extended
+sequential, progressive DCT, and lossless (spatial) processes.  Hierarchical
+processes are not supported.
+
+Within these limits, any set of compression parameters allowed by the JPEG
+spec should be readable for decompression.  (We can be more restrictive about
+what formats we can generate.)  Although the system design allows for all
+parameter values, some uncommon settings are not yet implemented and may
+never be; nonintegral sampling ratios are the prime example.
+
+By itself, the library handles only interchange JPEG datastreams --- in
+particular the widely used JFIF file format.  The library can be used by
+surrounding code to process interchange or abbreviated JPEG datastreams that
+are embedded in more complex file formats.  (For example, libtiff uses this
+library to implement JPEG compression within the TIFF file format.)
+
+The library includes a substantial amount of code that is not covered by the
+JPEG standard but is necessary for typical applications of JPEG.  These
+functions preprocess the image before JPEG compression or postprocess it after
+decompression.  They include colorspace conversion, downsampling/upsampling,
+and color quantization.  This code can be omitted if not needed.
+
+A wide range of quality vs. speed tradeoffs are possible in JPEG processing,
+and even more so in decompression postprocessing.  The decompression library
+provides multiple implementations that cover most of the useful tradeoffs,
+ranging from very-high-quality down to fast-preview operation.  On the
+compression side we have generally not provided low-quality choices, since
+compression is normally less time-critical.  It should be understood that the
+low-quality modes may not meet the JPEG standard's accuracy requirements;
+nonetheless, they are useful for viewers.
+
+
+*** System overview ***
+
+The compressor and decompressor are each divided into two main sections:
+the JPEG compressor or decompressor proper, and the preprocessing or
+postprocessing functions.  The interface between these two sections is the
+image data that Rec. ITU-T T.81 | ISO/IEC 10918-1 regards as its input or
+output: this data is in the colorspace to be used for compression, and it is
+downsampled to the sampling factors to be used.  The preprocessing and
+postprocessing steps are responsible for converting a normal image
+representation to or from this form.  (Those few applications that want to deal
+with YCbCr downsampled data can skip the preprocessing or postprocessing step.)
+
+Looking more closely, the compressor library contains the following main
+elements:
+
+  Preprocessing:
+    * Color space conversion (e.g., RGB to YCbCr).
+    * Edge expansion and downsampling.  Optionally, this step can do simple
+      smoothing --- this is often helpful for low-quality source data.
+  Lossy JPEG proper:
+    * MCU assembly, DCT, quantization.
+    * Entropy coding (sequential or progressive, Huffman or arithmetic).
+  Lossless JPEG proper:
+    * Point transform.
+    * Prediction, differencing.
+    * Entropy coding (Huffman or arithmetic)
+
+In addition to these modules we need overall control, marker generation,
+and support code (memory management & error handling).  There is also a
+module responsible for physically writing the output data --- typically
+this is just an interface to fwrite(), but some applications may need to
+do something else with the data.
+
+The decompressor library contains the following main elements:
+
+  Lossy JPEG proper:
+    * Entropy decoding (sequential or progressive, Huffman or arithmetic).
+    * Dequantization, inverse DCT, MCU disassembly.
+  Lossless JPEG proper:
+    * Entropy decoding (Huffman or arithmetic).
+    * Prediction, undifferencing.
+    * Point transform, sample size scaling.
+  Postprocessing:
+    * Upsampling.  Optionally, this step may be able to do more general
+      rescaling of the image.
+    * Color space conversion (e.g., YCbCr to RGB).  This step may also
+      provide gamma adjustment [ currently it does not ].
+    * Optional color quantization (e.g., reduction to 256 colors).
+    * Optional color precision reduction (e.g., 24-bit to 15-bit color).
+      [This feature is not currently implemented.]
+
+We also need overall control, marker parsing, and a data source module.
+The support code (memory management & error handling) can be shared with
+the compression half of the library.
+
+There may be several implementations of each of these elements, particularly
+in the decompressor, where a wide range of speed/quality tradeoffs is very
+useful.  It must be understood that some of the best speedups involve
+merging adjacent steps in the pipeline.  For example, upsampling, color space
+conversion, and color quantization might all be done at once when using a
+low-quality ordered-dither technique.  The system architecture is designed to
+allow such merging where appropriate.
+
+
+Note: it is convenient to regard edge expansion (padding to block boundaries)
+as a preprocessing/postprocessing function, even though
+Rec. ITU-T T.81 | ISO/IEC 10918-1 includes it in compression/decompression.  We
+do this because downsampling/upsampling can be simplified a little if they work
+on padded data: it's not necessary to have special cases at the right and
+bottom edges.  Therefore the interface buffer is always an integral number of
+blocks wide and high, and we expect compression preprocessing to pad the source
+data properly.  Padding will occur only to the next block (8-sample) boundary.
+In an interleaved-scan situation, additional dummy blocks may be used to fill
+out MCUs, but the MCU assembly and disassembly logic will create or discard
+these blocks internally.  (This is advantageous for speed reasons, since we
+avoid DCTing the dummy blocks.  It also permits a small reduction in file size,
+because the compressor can choose dummy block contents so as to minimize their
+size in compressed form.  Finally, it makes the interface buffer specification
+independent of whether the file is actually interleaved or not.)  Applications
+that wish to deal directly with the downsampled data must provide similar
+buffering and padding for odd-sized images.
+
+
+*** Poor man's object-oriented programming ***
+
+It should be clear by now that we have a lot of quasi-independent processing
+steps, many of which have several possible behaviors.  To avoid cluttering the
+code with lots of switch statements, we use a simple form of object-style
+programming to separate out the different possibilities.
+
+For example, two different color quantization algorithms could be implemented
+as two separate modules that present the same external interface; at runtime,
+the calling code will access the proper module indirectly through an "object".
+
+We can get the limited features we need while staying within portable C.
+The basic tool is a function pointer.  An "object" is just a struct
+containing one or more function pointer fields, each of which corresponds to
+a method name in real object-oriented languages.  During initialization we
+fill in the function pointers with references to whichever module we have
+determined we need to use in this run.  Then invocation of the module is done
+by indirecting through a function pointer; on most machines this is no more
+expensive than a switch statement, which would be the only other way of
+making the required run-time choice.  The really significant benefit, of
+course, is keeping the source code clean and well structured.
+
+We can also arrange to have private storage that varies between different
+implementations of the same kind of object.  We do this by making all the
+module-specific object structs be separately allocated entities, which will
+be accessed via pointers in the master compression or decompression struct.
+The "public" fields or methods for a given kind of object are specified by
+a commonly known struct.  But a module's initialization code can allocate
+a larger struct that contains the common struct as its first member, plus
+additional private fields.  With appropriate pointer casting, the module's
+internal functions can access these private fields.  (For a simple example,
+see jdatadst.c, which implements the external interface specified by struct
+jpeg_destination_mgr, but adds extra fields.)
+
+(Of course this would all be a lot easier if we were using C++, but we are
+not yet prepared to assume that everyone has a C++ compiler.)
+
+An important benefit of this scheme is that it is easy to provide multiple
+versions of any method, each tuned to a particular case.  While a lot of
+precalculation might be done to select an optimal implementation of a method,
+the cost per invocation is constant.  For example, the upsampling step might
+have a "generic" method, plus one or more "hardwired" methods for the most
+popular sampling factors; the hardwired methods would be faster because they'd
+use straight-line code instead of for-loops.  The cost to determine which
+method to use is paid only once, at startup, and the selection criteria are
+hidden from the callers of the method.
+
+This plan differs a little bit from usual object-oriented structures, in that
+only one instance of each object class will exist during execution.  The
+reason for having the class structure is that on different runs we may create
+different instances (choose to execute different modules).  You can think of
+the term "method" as denoting the common interface presented by a particular
+set of interchangeable functions, and "object" as denoting a group of related
+methods, or the total shared interface behavior of a group of modules.
+
+
+*** Overall control structure ***
+
+We previously mentioned the need for overall control logic in the compression
+and decompression libraries.  In IJG implementations prior to v5, overall
+control was mostly provided by "pipeline control" modules, which proved to be
+large, unwieldy, and hard to understand.  To improve the situation, the
+control logic has been subdivided into multiple modules.  The control modules
+consist of:
+
+1. Master control for module selection and initialization.  This has two
+responsibilities:
+
+   1A.  Startup initialization at the beginning of image processing.
+        The individual processing modules to be used in this run are selected
+        and given initialization calls.
+
+   1B.  Per-pass control.  This determines how many passes will be performed
+        and calls each active processing module to configure itself
+        appropriately at the beginning of each pass.  End-of-pass processing,
+        where necessary, is also invoked from the master control module.
+
+   Method selection is partially distributed, in that a particular processing
+   module may contain several possible implementations of a particular method,
+   which it will select among when given its initialization call.  The master
+   control code need only be concerned with decisions that affect more than
+   one module.
+
+2. Data buffering control.  A separate control module exists for each
+   inter-processing-step data buffer.  This module is responsible for
+   invoking the processing steps that write or read that data buffer.
+
+Each buffer controller sees the world as follows:
+
+input data => processing step A => buffer => processing step B => output data
+                      |              |               |
+              ------------------ controller ------------------
+
+The controller knows the dataflow requirements of steps A and B: how much data
+they want to accept in one chunk and how much they output in one chunk.  Its
+function is to manage its buffer and call A and B at the proper times.
+
+A data buffer control module may itself be viewed as a processing step by a
+higher-level control module; thus the control modules form a binary tree with
+elementary processing steps at the leaves of the tree.
+
+The control modules are objects.  A considerable amount of flexibility can
+be had by replacing implementations of a control module.  For example:
+* Merging of adjacent steps in the pipeline is done by replacing a control
+  module and its pair of processing-step modules with a single processing-
+  step module.  (Hence the possible merges are determined by the tree of
+  control modules.)
+* In some processing modes, a given interstep buffer need only be a "strip"
+  buffer large enough to accommodate the desired data chunk sizes.  In other
+  modes, a full-image buffer is needed and several passes are required.
+  The control module determines which kind of buffer is used and manipulates
+  virtual array buffers as needed.  One or both processing steps may be
+  unaware of the multi-pass behavior.
+
+In theory, we might be able to make all of the data buffer controllers
+interchangeable and provide just one set of implementations for all.  In
+practice, each one contains considerable special-case processing for its
+particular job.  The buffer controller concept should be regarded as an
+overall system structuring principle, not as a complete description of the
+task performed by any one controller.
+
+
+*** Compression object structure ***
+
+Here is a sketch of the logical structure of the JPEG compression library in
+lossy mode:
+
+                                                 |-- Colorspace conversion
+                  |-- Preprocessing controller --|
+                  |                              |-- Downsampling
+Main controller --|
+                  |                            |-- Forward DCT, quantize
+                  |-- Coefficient controller --|
+                                               |-- Entropy encoding
+
+... and in lossless mode:
+
+                                                 |-- Colorspace conversion
+                  |-- Preprocessing controller --|
+                  |                              |-- Downsampling
+Main controller --|
+                  |                           |-- Point transform
+                  |                           |
+                  |-- Difference controller --|-- Prediction, differencing
+                                              |
+                                              |-- Lossless mode entropy
+                                                  encoding
+
+This sketch also describes the flow of control (subroutine calls) during
+typical image data processing.  Each of the components shown in the diagram is
+an "object" which may have several different implementations available.  One
+or more source code files contain the actual implementation(s) of each object.
+
+The objects shown above are:
+
+* Main controller: buffer controller for the subsampled-data buffer, which
+  holds the preprocessed input data.  This controller invokes preprocessing to
+  fill the subsampled-data buffer, and JPEG compression to empty it.  There is
+  usually no need for a full-image buffer here; a strip buffer is adequate.
+
+* Preprocessing controller: buffer controller for the downsampling input data
+  buffer, which lies between colorspace conversion and downsampling.  Note
+  that a unified conversion/downsampling module would probably replace this
+  controller entirely.
+
+* Colorspace conversion: converts application image data into the desired
+  JPEG color space; also changes the data from pixel-interleaved layout to
+  separate component planes.  Processes one pixel row at a time.
+
+* Downsampling: performs reduction of chroma components as required.
+  Optionally may perform pixel-level smoothing as well.  Processes a "row
+  group" at a time, where a row group is defined as Vmax pixel rows of each
+  component before downsampling, and Vk sample rows afterwards (remember Vk
+  differs across components).  Some downsampling or smoothing algorithms may
+  require context rows above and below the current row group; the
+  preprocessing controller is responsible for supplying these rows via proper
+  buffering.  The downsampler is responsible for edge expansion at the right
+  edge (i.e., extending each sample row to a multiple of 8 samples); but the
+  preprocessing controller is responsible for vertical edge expansion (i.e.,
+  duplicating the bottom sample row as needed to make a multiple of 8 rows).
+
+* Coefficient controller: buffer controller for the DCT-coefficient data.
+  This controller handles MCU assembly, including insertion of dummy DCT
+  blocks when needed at the right or bottom edge.  When performing
+  Huffman-code optimization or emitting a multiscan JPEG file, this
+  controller is responsible for buffering the full image.  The equivalent of
+  one fully interleaved MCU row of subsampled data is processed per call,
+  even when the JPEG file is noninterleaved.
+
+* Forward DCT and quantization: Perform DCT, quantize, and emit coefficients.
+  Works on one or more DCT blocks at a time.  (Note: the coefficients are now
+  emitted in normal array order, which the entropy encoder is expected to
+  convert to zigzag order as necessary.  Prior versions of the IJG code did
+  the conversion to zigzag order within the quantization step.)
+
+* Entropy encoding: Perform Huffman or arithmetic entropy coding and emit the
+  coded data to the data destination module.  Works on one MCU per call.
+  For progressive JPEG, the same DCT blocks are fed to the entropy coder
+  during each pass, and the coder must emit the appropriate subset of
+  coefficients.
+
+* Difference controller: buffer controller for the spatial difference data.
+  When emitting a multiscan JPEG file, this controller is responsible for
+  buffering the full image.  The equivalent of one fully interleaved MCU row
+  of subsampled data is processed per call, even when the JPEG file is
+  noninterleaved.
+
+* Point transform: Downscale the data by the point transform value.
+
+* Prediction and differencing: Calculate the predictor and subtract it
+  from the input.  Works on one scanline per call.  The difference
+  controller supplies the prior scanline, which is used for prediction.
+
+* Lossless mode entropy encoding: Perform Huffman or arithmetic entropy coding
+  and emit the coded data to the data destination module.  This module handles
+  MCU assembly.  Works on one MCU row per call.
+
+In addition to the above objects, the compression library includes these
+objects:
+
+* Master control: determines the number of passes required, controls overall
+  and per-pass initialization of the other modules.
+
+* Marker writing: generates JPEG markers (except for RSTn, which is emitted
+  by the entropy encoder when needed).
+
+* Data destination manager: writes the output JPEG datastream to its final
+  destination (e.g., a file).  The destination manager supplied with the
+  library knows how to write to a stdio stream or to a memory buffer;
+  for other behaviors, the surrounding application may provide its own
+  destination manager.
+
+* Memory manager: allocates and releases memory, controls virtual arrays
+  (with backing store management, where required).
+
+* Error handler: performs formatting and output of error and trace messages;
+  determines handling of nonfatal errors.  The surrounding application may
+  override some or all of this object's methods to change error handling.
+
+* Progress monitor: supports output of "percent-done" progress reports.
+  This object represents an optional callback to the surrounding application:
+  if wanted, it must be supplied by the application.
+
+The error handler, destination manager, and progress monitor objects are
+defined as separate objects in order to simplify application-specific
+customization of the JPEG library.  A surrounding application may override
+individual methods or supply its own all-new implementation of one of these
+objects.  The object interfaces for these objects are therefore treated as
+part of the application interface of the library, whereas the other objects
+are internal to the library.
+
+The error handler and memory manager are shared by JPEG compression and
+decompression; the progress monitor, if used, may be shared as well.
+
+
+*** Decompression object structure ***
+
+Here is a sketch of the logical structure of the JPEG decompression library in
+lossy mode:
+
+                                               |-- Entropy decoding
+                  |-- Coefficient controller --|
+                  |                            |-- Dequantize, Inverse DCT
+Main controller --|
+                  |                               |-- Upsampling
+                  |-- Postprocessing controller --|   |-- Colorspace conversion
+                                                  |-- Color quantization
+                                                  |-- Color precision reduction
+
+... and in lossless mode:
+
+                                              |-- Lossless mode entropy
+                                              |   decoding
+                                              |
+                  |-- Difference controller --|-- Prediction, undifferencing
+                  |                           |
+                  |                           |-- Point transform, sample size
+                  |                               scaling
+Main controller --|
+                  |                               |-- Upsampling
+                  |-- Postprocessing controller --|
+                                                  |-- Color precision reduction
+
+As before, this diagram also represents typical control flow.  The objects
+shown are:
+
+* Main controller: buffer controller for the subsampled-data buffer, which
+  holds the output of JPEG decompression proper.  This controller's primary
+  task is to feed the postprocessing procedure.  Some upsampling algorithms
+  may require context rows above and below the current row group; when this
+  is true, the main controller is responsible for managing its buffer so as
+  to make context rows available.  In the current design, the main buffer is
+  always a strip buffer; a full-image buffer is never required.
+
+* Coefficient controller: buffer controller for the DCT-coefficient data.
+  This controller handles MCU disassembly, including deletion of any dummy
+  DCT blocks at the right or bottom edge.  When reading a multiscan JPEG
+  file, this controller is responsible for buffering the full image.
+  (Buffering DCT coefficients, rather than samples, is necessary to support
+  progressive JPEG.)  The equivalent of one fully interleaved MCU row of
+  subsampled data is processed per call, even when the source JPEG file is
+  noninterleaved.
+
+* Entropy decoding: Read coded data from the data source module and perform
+  Huffman or arithmetic entropy decoding.  Works on one MCU per call.
+  For progressive JPEG decoding, the coefficient controller supplies the prior
+  coefficients of each MCU (initially all zeroes), which the entropy decoder
+  modifies in each scan.
+
+* Dequantization and inverse DCT: like it says.  Note that the coefficients
+  buffered by the coefficient controller have NOT been dequantized; we
+  merge dequantization and inverse DCT into a single step for speed reasons.
+  When scaled-down output is asked for, simplified DCT algorithms may be used
+  that emit fewer samples per DCT block, not the full 8x8.  Works on one DCT
+  block at a time.
+
+* Difference controller: buffer controller for the spatial difference data.
+  When reading a multiscan JPEG file, this controller is responsible for
+  buffering the full image. The equivalent of one fully interleaved MCU row
+  is processed per call, even when the source JPEG file is noninterleaved.
+
+* Lossless mode entropy decoding: Read coded data from the data source module
+  and perform Huffman or arithmetic entropy decoding.  Works on one MCU row per
+  call.
+
+* Prediction and undifferencing: Calculate the predictor and add it to the
+  decoded difference.  Works on one scanline per call.  The difference
+  controller supplies the prior scanline, which is used for prediction.
+
+* Point transform and sample size scaling: Upscale the data by the point
+  transform value and downscale it to fit into the compiled-in sample size.
+
+* Postprocessing controller: buffer controller for the color quantization
+  input buffer, when quantization is in use.  (Without quantization, this
+  controller just calls the upsampler.)  For two-pass quantization, this
+  controller is responsible for buffering the full-image data.
+
+* Upsampling: restores chroma components to full size.  (May support more
+  general output rescaling, too.  Note that if undersized DCT outputs have
+  been emitted by the DCT module, this module must adjust so that properly
+  sized outputs are created.)  Works on one row group at a time.  This module
+  also calls the color conversion module, so its top level is effectively a
+  buffer controller for the upsampling->color conversion buffer.  However, in
+  all but the highest-quality operating modes, upsampling and color
+  conversion are likely to be merged into a single step.
+
+* Colorspace conversion: convert from JPEG color space to output color space,
+  and change data layout from separate component planes to pixel-interleaved.
+  Works on one pixel row at a time.
+
+* Color quantization: reduce the data to colormapped form, using either an
+  externally specified colormap or an internally generated one.  This module
+  is not used for full-color output.  Works on one pixel row at a time; may
+  require two passes to generate a color map.  Note that the output will
+  always be a single component representing colormap indexes.  In the current
+  design, the output values are JSAMPLEs, J12SAMPLEs, or J16SAMPLEs, so the
+  library cannot quantize to more than 256 colors when using 8-bit data
+  precision.  This is unlikely to be a problem in practice.
+
+* Color reduction: this module handles color precision reduction, e.g.,
+  generating 15-bit color (5 bits/primary) from JPEG's 24-bit output.
+  Not quite clear yet how this should be handled... should we merge it with
+  colorspace conversion???
+
+Note that some high-speed operating modes might condense the entire
+postprocessing sequence to a single module (upsample, color convert, and
+quantize in one step).
+
+In addition to the above objects, the decompression library includes these
+objects:
+
+* Master control: determines the number of passes required, controls overall
+  and per-pass initialization of the other modules.  This is subdivided into
+  input and output control: jdinput.c controls only input-side processing,
+  while jdmaster.c handles overall initialization and output-side control.
+
+* Marker reading: decodes JPEG markers (except for RSTn).
+
+* Data source manager: supplies the input JPEG datastream.  The source
+  manager supplied with the library knows how to read from a stdio stream
+  or from a memory buffer;  for other behaviors, the surrounding application
+  may provide its own source manager.
+
+* Memory manager: same as for compression library.
+
+* Error handler: same as for compression library.
+
+* Progress monitor: same as for compression library.
+
+As with compression, the data source manager, error handler, and progress
+monitor are candidates for replacement by a surrounding application.
+
+
+*** Decompression input and output separation ***
+
+To support efficient incremental display of progressive JPEG files, the
+decompressor is divided into two sections that can run independently:
+
+1. Data input includes marker parsing, entropy decoding, and input into the
+   coefficient controller's DCT coefficient buffer.  Note that this
+   processing is relatively cheap and fast.
+
+2. Data output reads from the DCT coefficient buffer and performs the IDCT
+   and all postprocessing steps.
+
+For a progressive JPEG file, the data input processing is allowed to get
+arbitrarily far ahead of the data output processing.  (This occurs only
+if the application calls jpeg_consume_input(); otherwise input and output
+run in lockstep, since the input section is called only when the output
+section needs more data.)  In this way the application can avoid making
+extra display passes when data is arriving faster than the display pass
+can run.  Furthermore, it is possible to abort an output pass without
+losing anything, since the coefficient buffer is read-only as far as the
+output section is concerned.  See libjpeg.txt for more detail.
+
+A full-image coefficient array is only created if the JPEG file has multiple
+scans (or if the application specifies buffered-image mode anyway).  When
+reading a single-scan file, the coefficient controller normally creates only
+a one-MCU buffer, so input and output processing must run in lockstep in this
+case.  jpeg_consume_input() is effectively a no-op in this situation.
+
+The main impact of dividing the decompressor in this fashion is that we must
+be very careful with shared variables in the cinfo data structure.  Each
+variable that can change during the course of decompression must be
+classified as belonging to data input or data output, and each section must
+look only at its own variables.  For example, the data output section may not
+depend on any of the variables that describe the current scan in the JPEG
+file, because these may change as the data input section advances into a new
+scan.
+
+The progress monitor is (somewhat arbitrarily) defined to treat input of the
+file as one pass when buffered-image mode is not used, and to ignore data
+input work completely when buffered-image mode is used.  Note that the
+library has no reliable way to predict the number of passes when dealing
+with a progressive JPEG file, nor can it predict the number of output passes
+in buffered-image mode.  So the work estimate is inherently bogus anyway.
+
+No comparable division is currently made in the compression library, because
+there isn't any real need for it.
+
+
+*** Data formats ***
+
+Arrays of 8-bit pixel sample values use the following data structure:
+
+    typedef something JSAMPLE;          a pixel component value, 0..MAXJSAMPLE
+    typedef JSAMPLE *JSAMPROW;          ptr to a row of samples
+    typedef JSAMPROW *JSAMPARRAY;       ptr to a list of rows
+    typedef JSAMPARRAY *JSAMPIMAGE;     ptr to a list of color-component arrays
+
+Arrays of 12-bit pixel sample values use the following data structure:
+
+    typedef something J12SAMPLE;        a pixel component value, 0..MAXJ12SAMPLE
+    typedef J12SAMPLE *J12SAMPROW;      ptr to a row of samples
+    typedef J12SAMPROW *J12SAMPARRAY;   ptr to a list of rows
+    typedef J12SAMPARRAY *J12SAMPIMAGE; ptr to a list of color-component arrays
+
+Arrays of 16-bit pixel sample values use the following data structure:
+
+    typedef something J16SAMPLE;        a pixel component value, 0..MAXJ16SAMPLE
+    typedef J16SAMPLE *J16SAMPROW;      ptr to a row of samples
+    typedef J16SAMPROW *J16SAMPARRAY;   ptr to a list of rows
+    typedef J16SAMPARRAY *J16SAMPIMAGE; ptr to a list of color-component arrays
+
+The basic element type JSAMPLE (8-bit sample) will be unsigned char, the basic
+element type J12SAMPLE (12-bit sample) will be short, and the basic element
+type J16SAMPLE (16-bit sample) will be unsigned short.
+
+With these conventions, J*SAMPLE values can be assumed to be >= 0.  This helps
+simplify correct rounding during downsampling, etc.  The JPEG standard's
+specification that 8-bit sample values run from -128..127 is accommodated by
+subtracting 128 from the sample value in the DCT step.  Similarly, during
+decompression the output of the IDCT step will be immediately shifted back to
+0..255.  (NOTE: different values are required when 12-bit samples are in use.
+When 8-bit samples are in use, the code uses MAXJSAMPLE and CENTERJSAMPLE,
+which are defined as 255 and 128 respectively.  When 12-bit samples are in use,
+the code uses MAXJ12SAMPLE and CENTERJ12SAMPLE, which are defined as 4095 and
+2048 respectively.  When 16-bit samples are in use, the code uses MAXJ16SAMPLE
+and CENTERJ16SAMPLE, which are defined as 65535 and 32768 respectively.)
+
+We use a pointer per row, rather than a two-dimensional J*SAMPLE array.  This
+choice costs only a small amount of memory and has several benefits:
+* Code using the data structure doesn't need to know the allocated width of
+  the rows.  This simplifies edge expansion/compression, since we can work
+  in an array that's wider than the logical picture width.
+* Indexing doesn't require multiplication; this is a performance win on many
+  machines.
+* Arrays with more than 64K total elements can be supported even on machines
+  where malloc() cannot allocate chunks larger than 64K.
+* The rows forming a component array may be allocated at different times
+  without extra copying.  This trick allows some speedups in smoothing steps
+  that need access to the previous and next rows.
+
+Note that each color component is stored in a separate array; we don't use the
+traditional layout in which the components of a pixel are stored together.
+This simplifies coding of modules that work on each component independently,
+because they don't need to know how many components there are.  Furthermore,
+we can read or write each component to a temporary file independently, which
+is helpful when dealing with noninterleaved JPEG files.
+
+In general, a specific sample value is accessed by code such as
+        image[colorcomponent][row][col]
+where col is measured from the image left edge, but row is measured from the
+first sample row currently in memory.  Either of the first two indexings can
+be precomputed by copying the relevant pointer.
+
+
+Since most image-processing applications prefer to work on images in which
+the components of a pixel are stored together, the data passed to or from the
+surrounding application uses the traditional convention: a single pixel is
+represented by N consecutive J*SAMPLE values, and an image row is an array of
+(# of color components)*(image width) J*SAMPLEs.  One or more rows of data can
+be represented by a pointer of type J*SAMPARRAY in this scheme.  This scheme is
+converted to component-wise storage inside the JPEG library.  (Applications
+that want to skip JPEG preprocessing or postprocessing will have to contend
+with component-wise storage.)
+
+
+Arrays of DCT-coefficient values use the following data structure:
+
+    typedef short JCOEF;                a 16-bit signed integer
+    typedef JCOEF JBLOCK[DCTSIZE2];     an 8x8 block of coefficients
+    typedef JBLOCK *JBLOCKROW;          ptr to one horizontal row of 8x8 blocks
+    typedef JBLOCKROW *JBLOCKARRAY;     ptr to a list of such rows
+    typedef JBLOCKARRAY *JBLOCKIMAGE;   ptr to a list of color component arrays
+
+The underlying type is at least a 16-bit signed integer; while "short" is big
+enough on all machines of interest, on some machines it is preferable to use
+"int" for speed reasons, despite the storage cost.  Coefficients are grouped
+into 8x8 blocks (but we always use #defines DCTSIZE and DCTSIZE2 rather than
+"8" and "64").
+
+The contents of a coefficient block may be in either "natural" or zigzagged
+order, and may be true values or divided by the quantization coefficients,
+depending on where the block is in the processing pipeline.  In the current
+library, coefficient blocks are kept in natural order everywhere; the entropy
+codecs zigzag or dezigzag the data as it is written or read.  The blocks
+contain quantized coefficients everywhere outside the DCT/IDCT subsystems.
+(This latter decision may need to be revisited to support variable
+quantization a la JPEG Part 3.)
+
+Notice that the allocation unit is now a row of 8x8 blocks, corresponding to
+eight rows of samples.  Otherwise the structure is much the same as for
+samples, and for the same reasons.
+
+
+*** Suspendable processing ***
+
+In some applications it is desirable to use the JPEG library as an
+incremental, memory-to-memory filter.  In this situation the data source or
+destination may be a limited-size buffer, and we can't rely on being able to
+empty or refill the buffer at arbitrary times.  Instead the application would
+like to have control return from the library at buffer overflow/underrun, and
+then resume compression or decompression at a later time.
+
+This scenario is supported for simple cases.  (For anything more complex, we
+recommend that the application "bite the bullet" and develop real multitasking
+capability.)  The libjpeg.txt file goes into more detail about the usage and
+limitations of this capability; here we address the implications for library
+structure.
+
+The essence of the problem is that the entropy codec (coder or decoder) must
+be prepared to stop at arbitrary times.  In turn, the controllers that call
+the entropy codec must be able to stop before having produced or consumed all
+the data that they normally would handle in one call.  That part is reasonably
+straightforward: we make the controller call interfaces include "progress
+counters" which indicate the number of data chunks successfully processed, and
+we require callers to test the counter rather than just assume all of the data
+was processed.
+
+Rather than trying to restart at an arbitrary point, the current Huffman
+codecs are designed to restart at the beginning of the current MCU after a
+suspension due to buffer overflow/underrun.  At the start of each call, the
+codec's internal state is loaded from permanent storage (in the JPEG object
+structures) into local variables.  On successful completion of the MCU, the
+permanent state is updated.  (This copying is not very expensive, and may even
+lead to *improved* performance if the local variables can be registerized.)
+If a suspension occurs, the codec simply returns without updating the state,
+thus effectively reverting to the start of the MCU.  Note that this implies
+leaving some data unprocessed in the source/destination buffer (ie, the
+compressed partial MCU).  The data source/destination module interfaces are
+specified so as to make this possible.  This also implies that the data buffer
+must be large enough to hold a worst-case compressed MCU; a couple thousand
+bytes should be enough.
+
+In a successive-approximation AC refinement scan, the progressive Huffman
+decoder has to be able to undo assignments of newly nonzero coefficients if it
+suspends before the MCU is complete, since decoding requires distinguishing
+previously-zero and previously-nonzero coefficients.  This is a bit tedious
+but probably won't have much effect on performance.  Other variants of Huffman
+decoding need not worry about this, since they will just store the same values
+again if forced to repeat the MCU.
+
+This approach would probably not work for an arithmetic codec, since its
+modifiable state is quite large and couldn't be copied cheaply.  Instead it
+would have to suspend and resume exactly at the point of the buffer end.
+
+The JPEG marker reader is designed to cope with suspension at an arbitrary
+point.  It does so by backing up to the start of the marker parameter segment,
+so the data buffer must be big enough to hold the largest marker of interest.
+Again, a couple KB should be adequate.  (A special "skip" convention is used
+to bypass COM and APPn markers, so these can be larger than the buffer size
+without causing problems; otherwise a 64K buffer would be needed in the worst
+case.)
+
+The JPEG marker writer currently does *not* cope with suspension.
+We feel that this is not necessary; it is much easier simply to require
+the application to ensure there is enough buffer space before starting.  (An
+empty 2K buffer is more than sufficient for the header markers; and ensuring
+there are a dozen or two bytes available before calling jpeg_finish_compress()
+will suffice for the trailer.)  This would not work for writing multi-scan
+JPEG files, but we simply do not intend to support that capability with
+suspension.
+
+
+*** Memory manager services ***
+
+The JPEG library's memory manager controls allocation and deallocation of
+memory, and it manages large "virtual" data arrays on machines where the
+operating system does not provide virtual memory.  Note that the same
+memory manager serves both compression and decompression operations.
+
+In all cases, allocated objects are tied to a particular compression or
+decompression master record, and they will be released when that master
+record is destroyed.
+
+The memory manager does not provide explicit deallocation of objects.
+Instead, objects are created in "pools" of free storage, and a whole pool
+can be freed at once.  This approach helps prevent storage-leak bugs, and
+it speeds up operations whenever malloc/free are slow (as they often are).
+The pools can be regarded as lifetime identifiers for objects.  Two
+pools/lifetimes are defined:
+  * JPOOL_PERMANENT     lasts until master record is destroyed
+  * JPOOL_IMAGE         lasts until done with image (JPEG datastream)
+Permanent lifetime is used for parameters and tables that should be carried
+across from one datastream to another; this includes all application-visible
+parameters.  Image lifetime is used for everything else.  (A third lifetime,
+JPOOL_PASS = one processing pass, was originally planned.  However it was
+dropped as not being worthwhile.  The actual usage patterns are such that the
+peak memory usage would be about the same anyway; and having per-pass storage
+substantially complicates the virtual memory allocation rules --- see below.)
+
+The memory manager deals with three kinds of object:
+1. "Small" objects.  Typically these require no more than 10K-20K total.
+2. "Large" objects.  These may require tens to hundreds of K depending on
+   image size.  Semantically they behave the same as small objects, but we
+   distinguish them because pool allocation heuristics may differ for large and
+   small objects (historically, large objects were also referenced by far
+   pointers on MS-DOS machines.)  Note that individual "large" objects cannot
+   exceed the size allowed by type size_t, which may be 64K or less on some
+   machines.
+3. "Virtual" objects.  These are large 2-D arrays of J*SAMPLEs or JBLOCKs
+   (typically large enough for the entire image being processed).  The
+   memory manager provides stripwise access to these arrays.  On machines
+   without virtual memory, the rest of the array may be swapped out to a
+   temporary file.
+
+(Note: J*SAMPARRAY and JBLOCKARRAY data structures are a combination of large
+objects for the data proper and small objects for the row pointers.  For
+convenience and speed, the memory manager provides single routines to create
+these structures.  Similarly, virtual arrays include a small control block
+and a J*SAMPARRAY or JBLOCKARRAY working buffer, all created with one call.)
+
+In the present implementation, virtual arrays are only permitted to have image
+lifespan.  (Permanent lifespan would not be reasonable, and pass lifespan is
+not very useful since a virtual array's raison d'etre is to store data for
+multiple passes through the image.)  We also expect that only "small" objects
+will be given permanent lifespan, though this restriction is not required by
+the memory manager.
+
+In a non-virtual-memory machine, some performance benefit can be gained by
+making the in-memory buffers for virtual arrays be as large as possible.
+(For small images, the buffers might fit entirely in memory, so blind
+swapping would be very wasteful.)  The memory manager will adjust the height
+of the buffers to fit within a prespecified maximum memory usage.  In order
+to do this in a reasonably optimal fashion, the manager needs to allocate all
+of the virtual arrays at once.  Therefore, there isn't a one-step allocation
+routine for virtual arrays; instead, there is a "request" routine that simply
+allocates the control block, and a "realize" routine (called just once) that
+determines space allocation and creates all of the actual buffers.  The
+realize routine must allow for space occupied by non-virtual large objects.
+(We don't bother to factor in the space needed for small objects, on the
+grounds that it isn't worth the trouble.)
+
+To support all this, we establish the following protocol for doing business
+with the memory manager:
+  1. Modules must request virtual arrays (which may have only image lifespan)
+     during the initial setup phase, i.e., in their jinit_xxx routines.
+  2. All "large" objects (including J*SAMPARRAYs and JBLOCKARRAYs) must also be
+     allocated during initial setup.
+  3. realize_virt_arrays will be called at the completion of initial setup.
+     The above conventions ensure that sufficient information is available
+     for it to choose a good size for virtual array buffers.
+Small objects of any lifespan may be allocated at any time.  We expect that
+the total space used for small objects will be small enough to be negligible
+in the realize_virt_arrays computation.
+
+In a virtual-memory machine, we simply pretend that the available space is
+infinite, thus causing realize_virt_arrays to decide that it can allocate all
+the virtual arrays as full-size in-memory buffers.  The overhead of the
+virtual-array access protocol is very small when no swapping occurs.
+
+A virtual array can be specified to be "pre-zeroed"; when this flag is set,
+never-yet-written sections of the array are set to zero before being made
+available to the caller.  If this flag is not set, never-written sections
+of the array contain garbage.  (This feature exists primarily because the
+equivalent logic would otherwise be needed in jdcoefct.c for progressive
+JPEG mode; we may as well make it available for possible other uses.)
+
+The first write pass on a virtual array is required to occur in top-to-bottom
+order; read passes, as well as any write passes after the first one, may
+access the array in any order.  This restriction exists partly to simplify
+the virtual array control logic, and partly because some file systems may not
+support seeking beyond the current end-of-file in a temporary file.  The main
+implication of this restriction is that rearrangement of rows (such as
+converting top-to-bottom data order to bottom-to-top) must be handled while
+reading data out of the virtual array, not while putting it in.
+
+
+*** Memory manager internal structure ***
+
+To isolate system dependencies as much as possible, we have broken the
+memory manager into two parts.  There is a reasonably system-independent
+"front end" (jmemmgr.c) and a "back end" that contains only the code
+likely to change across systems.  All of the memory management methods
+outlined above are implemented by the front end.  The back end provides
+the following routines for use by the front end (none of these routines
+are known to the rest of the JPEG code):
+
+jpeg_mem_init, jpeg_mem_term    system-dependent initialization/shutdown
+
+jpeg_get_small, jpeg_free_small interface to malloc and free library routines
+                                (or their equivalents)
+
+jpeg_get_large, jpeg_free_large historically was used to interface with
+                                FAR malloc/free on MS-DOS machines;  now the
+                                same as jpeg_get_small/jpeg_free_small
+
+jpeg_mem_available              estimate available memory
+
+jpeg_open_backing_store         create a backing-store object
+
+read_backing_store,             manipulate a backing-store object
+write_backing_store,
+close_backing_store
+
+On some systems there will be more than one type of backing-store object.
+jpeg_open_backing_store is responsible for choosing how to implement a given
+object.  The read/write/close routines are method pointers in the structure
+that describes a given object; this lets them be different for different object
+types.
+
+It may be necessary to ensure that backing store objects are explicitly
+released upon abnormal program termination.  To support this, we will expect
+the main program or surrounding application to arrange to call self_destruct
+(typically via jpeg_destroy) upon abnormal termination.  This may require a
+SIGINT signal handler or equivalent.  We don't want to have the back end module
+install its own signal handler, because that would pre-empt the surrounding
+application's ability to control signal handling.
+
+The IJG distribution includes several memory manager back end implementations.
+Usually the same back end should be suitable for all applications on a given
+system, but it is possible for an application to supply its own back end at
+need.
+
+
+*** Implications of DNL marker ***
+
+Some JPEG files may use a DNL marker to postpone definition of the image
+height (this would be useful for a fax-like scanner's output, for instance).
+In these files the SOF marker claims the image height is 0, and you only
+find out the true image height at the end of the first scan.
+
+We could read these files as follows:
+1. Upon seeing zero image height, replace it by 65535 (the maximum allowed).
+2. When the DNL is found, update the image height in the global image
+   descriptor.
+This implies that control modules must avoid making copies of the image
+height, and must re-test for termination after each MCU row.  This would
+be easy enough to do.
+
+In cases where image-size data structures are allocated, this approach will
+result in very inefficient use of virtual memory or much-larger-than-necessary
+temporary files.  This seems acceptable for something that probably won't be a
+mainstream usage.  People might have to forgo use of memory-hogging options
+(such as two-pass color quantization or noninterleaved JPEG files) if they
+want efficient conversion of such files.  (One could improve efficiency by
+demanding a user-supplied upper bound for the height, less than 65536; in most
+cases it could be much less.)
+
+The standard also permits the SOF marker to overestimate the image height,
+with a DNL to give the true, smaller height at the end of the first scan.
+This would solve the space problems if the overestimate wasn't too great.
+However, it implies that you don't even know whether DNL will be used.
+
+This leads to a couple of very serious objections:
+1. Testing for a DNL marker must occur in the inner loop of the decompressor's
+   Huffman decoder; this implies a speed penalty whether the feature is used
+   or not.
+2. There is no way to hide the last-minute change in image height from an
+   application using the decoder.  Thus *every* application using the IJG
+   library would suffer a complexity penalty whether it cared about DNL or
+   not.
+We currently do not support DNL because of these problems.
+
+A different approach is to insist that DNL-using files be preprocessed by a
+separate program that reads ahead to the DNL, then goes back and fixes the SOF
+marker.  This is a much simpler solution and is probably far more efficient.
+Even if one wants piped input, buffering the first scan of the JPEG file needs
+a lot smaller temp file than is implied by the maximum-height method.  For
+this approach we'd simply treat DNL as a no-op in the decompressor (at most,
+check that it matches the SOF image height).
+
+We will not worry about making the compressor capable of outputting DNL.
+Something similar to the first scheme above could be applied if anyone ever
+wants to make that work.
diff --git a/3rdparty/libjpeg-turbo/src/tjbench.c b/3rdparty/libjpeg-turbo/src/tjbench.c
new file mode 100644
index 000000000000..9dc6427880bb
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/tjbench.c
@@ -0,0 +1,1323 @@
+/*
+ * Copyright (C)2009-2019, 2021-2024 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+#include <errno.h>
+#include <limits.h>
+#if !defined(_MSC_VER) || _MSC_VER > 1600
+#include <stdint.h>
+#endif
+#include <cdjpeg.h>
+#include "./tjutil.h"
+#include "./turbojpeg.h"
+
+
+#define THROW(op, err) { \
+  printf("ERROR in line %d while %s:\n%s\n", __LINE__, op, err); \
+  retval = -1;  goto bailout; \
+}
+#define THROW_UNIX(m)  THROW(m, strerror(errno))
+
+static char tjErrorStr[JMSG_LENGTH_MAX] = "\0";
+static int tjErrorLine = -1, tjErrorCode = -1;
+
+#define THROW_TJG() { \
+  printf("ERROR in line %d\n%s\n", __LINE__, tj3GetErrorStr(NULL)); \
+  retval = -1;  goto bailout; \
+}
+
+#define THROW_TJ() { \
+  int _tjErrorCode = tj3GetErrorCode(handle); \
+  char *_tjErrorStr = tj3GetErrorStr(handle); \
+  \
+  if (!tj3Get(handle, TJPARAM_STOPONWARNING) && \
+      _tjErrorCode == TJERR_WARNING) { \
+    if (strncmp(tjErrorStr, _tjErrorStr, JMSG_LENGTH_MAX) || \
+        tjErrorCode != _tjErrorCode || tjErrorLine != __LINE__) { \
+      strncpy(tjErrorStr, _tjErrorStr, JMSG_LENGTH_MAX); \
+      tjErrorStr[JMSG_LENGTH_MAX - 1] = '\0'; \
+      tjErrorCode = _tjErrorCode; \
+      tjErrorLine = __LINE__; \
+      printf("WARNING in line %d:\n%s\n", __LINE__, _tjErrorStr); \
+    } \
+  } else { \
+    printf("%s in line %d:\n%s\n", \
+           _tjErrorCode == TJERR_WARNING ? "WARNING" : "ERROR", __LINE__, \
+           _tjErrorStr); \
+    retval = -1;  goto bailout; \
+  } \
+}
+
+#define IS_CROPPED(cr)  (cr.x != 0 || cr.y != 0 || cr.w != 0 || cr.h != 0)
+
+#define CROPPED_WIDTH(width) \
+  (IS_CROPPED(cr) ? (cr.w != 0 ? cr.w : TJSCALED(width, sf) - cr.x) : \
+                    TJSCALED(width, sf))
+
+#define CROPPED_HEIGHT(height) \
+  (IS_CROPPED(cr) ? (cr.h != 0 ? cr.h : TJSCALED(height, sf) - cr.y) : \
+                    TJSCALED(height, sf))
+
+static int stopOnWarning = 0, bottomUp = 0, noRealloc = 1, fastUpsample = 0,
+  fastDCT = 0, optimize = 0, progressive = 0, limitScans = 0, maxMemory = 0,
+  maxPixels = 0, arithmetic = 0, lossless = 0, restartIntervalBlocks = 0,
+  restartIntervalRows = 0;
+static int precision = 8, sampleSize, compOnly = 0, decompOnly = 0, doYUV = 0,
+  quiet = 0, doTile = 0, pf = TJPF_BGR, yuvAlign = 1, doWrite = 1;
+static char *ext = "ppm";
+static const char *pixFormatStr[TJ_NUMPF] = {
+  "RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "GRAY", "", "", "", "", "CMYK"
+};
+static const char *subNameLong[TJ_NUMSAMP] = {
+  "4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1", "4:4:1"
+};
+static const char *csName[TJ_NUMCS] = {
+  "RGB", "YCbCr", "GRAY", "CMYK", "YCCK"
+};
+static const char *subName[TJ_NUMSAMP] = {
+  "444", "422", "420", "GRAY", "440", "411", "441"
+};
+static tjscalingfactor *scalingFactors = NULL, sf = { 1, 1 };
+static tjregion cr = { 0, 0, 0, 0 };
+static int nsf = 0, xformOp = TJXOP_NONE, xformOpt = 0;
+static int (*customFilter) (short *, tjregion, tjregion, int, int,
+                            tjtransform *);
+static double benchTime = 5.0, warmup = 1.0;
+
+
+static char *formatName(int subsamp, int cs, char *buf)
+{
+  if (quiet == 1) {
+    if (lossless)
+      SNPRINTF(buf, 80, "%-2d/LOSSLESS   ", precision);
+    else if (subsamp == TJSAMP_UNKNOWN)
+      SNPRINTF(buf, 80, "%-2d/%-5s      ", precision, csName[cs]);
+    else
+      SNPRINTF(buf, 80, "%-2d/%-5s/%-5s", precision, csName[cs],
+               subNameLong[subsamp]);
+    return buf;
+  } else {
+    if (lossless)
+      return (char *)"Lossless";
+    else if (subsamp == TJSAMP_UNKNOWN)
+      return (char *)csName[cs];
+    else {
+      SNPRINTF(buf, 80, "%s %s", csName[cs], subNameLong[subsamp]);
+      return buf;
+    }
+  }
+}
+
+
+static char *sigfig(double val, int figs, char *buf, int len)
+{
+  char format[80];
+  int digitsAfterDecimal = figs - (int)ceil(log10(fabs(val)));
+
+  if (digitsAfterDecimal < 1)
+    SNPRINTF(format, 80, "%%.0f");
+  else
+    SNPRINTF(format, 80, "%%.%df", digitsAfterDecimal);
+  SNPRINTF(buf, len, format, val);
+  return buf;
+}
+
+
+/* Custom DCT filter which produces a negative of the image */
+static int dummyDCTFilter(short *coeffs, tjregion arrayRegion,
+                          tjregion planeRegion, int componentIndex,
+                          int transformIndex, tjtransform *transform)
+{
+  int i;
+
+  for (i = 0; i < arrayRegion.w * arrayRegion.h; i++)
+    coeffs[i] = -coeffs[i];
+  return 0;
+}
+
+
+/* Decompression test */
+static int decomp(unsigned char **jpegBufs, size_t *jpegSizes, void *dstBuf,
+                  int w, int h, int subsamp, int jpegQual, char *fileName,
+                  int tilew, int tileh)
+{
+  char tempStr[1024], sizeStr[24] = "\0", qualStr[16] = "\0";
+  FILE *file = NULL;
+  tjhandle handle = NULL;
+  int i, row, col, iter = 0, dstBufAlloc = 0, retval = 0;
+  double elapsed, elapsedDecode;
+  int ps = tjPixelSize[pf];
+  int scaledw, scaledh, pitch;
+  int ntilesw = (w + tilew - 1) / tilew, ntilesh = (h + tileh - 1) / tileh;
+  unsigned char *dstPtr, *dstPtr2, *yuvBuf = NULL;
+
+  if (lossless) sf = TJUNSCALED;
+
+  scaledw = TJSCALED(w, sf);
+  scaledh = TJSCALED(h, sf);
+
+  if (jpegQual > 0) {
+    SNPRINTF(qualStr, 16, "_%s%d", lossless ? "PSV" : "Q", jpegQual);
+    qualStr[15] = 0;
+  }
+
+  if ((handle = tj3Init(TJINIT_DECOMPRESS)) == NULL)
+    THROW_TJG();
+  if (tj3Set(handle, TJPARAM_STOPONWARNING, stopOnWarning) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_BOTTOMUP, bottomUp) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_FASTUPSAMPLE, fastUpsample) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_FASTDCT, fastDCT) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_SCANLIMIT, limitScans ? 500 : 0) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_MAXMEMORY, maxMemory) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_MAXPIXELS, maxPixels) == -1)
+    THROW_TJ();
+
+  if (IS_CROPPED(cr)) {
+    if (tj3DecompressHeader(handle, jpegBufs[0], jpegSizes[0]) == -1)
+      THROW_TJ();
+  }
+  if (tj3SetScalingFactor(handle, sf) == -1)
+    THROW_TJ();
+  if (tj3SetCroppingRegion(handle, cr) == -1)
+    THROW_TJ();
+  if (IS_CROPPED(cr)) {
+    scaledw = cr.w ? cr.w : scaledw - cr.x;
+    scaledh = cr.h ? cr.h : scaledh - cr.y;
+  }
+  pitch = scaledw * ps;
+
+  if (dstBuf == NULL) {
+#if ULLONG_MAX > SIZE_MAX
+    if ((unsigned long long)pitch * (unsigned long long)scaledh *
+        (unsigned long long)sampleSize > (unsigned long long)((size_t)-1))
+      THROW("allocating destination buffer", "Image is too large");
+#endif
+    if ((dstBuf = malloc((size_t)pitch * scaledh * sampleSize)) == NULL)
+      THROW_UNIX("allocating destination buffer");
+    dstBufAlloc = 1;
+  }
+
+  /* Set the destination buffer to gray so we know whether the decompressor
+     attempted to write to it */
+  if (precision == 8)
+    memset((unsigned char *)dstBuf, 127, (size_t)pitch * scaledh);
+  else if (precision == 12) {
+    for (i = 0; i < pitch * scaledh; i++)
+      ((short *)dstBuf)[i] = (short)2047;
+  } else {
+    for (i = 0; i < pitch * scaledh; i++)
+      ((unsigned short *)dstBuf)[i] = (unsigned short)32767;
+  }
+
+  if (doYUV) {
+    int width = doTile ? tilew : scaledw;
+    int height = doTile ? tileh : scaledh;
+    size_t yuvSize = tj3YUVBufSize(width, yuvAlign, height, subsamp);
+
+    if (yuvSize == 0)
+      THROW_TJG();
+    if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
+      THROW_UNIX("allocating YUV buffer");
+    memset(yuvBuf, 127, yuvSize);
+  }
+
+  /* Benchmark */
+  iter = -1;
+  elapsed = elapsedDecode = 0.;
+  while (1) {
+    int tile = 0;
+    double start = getTime();
+
+    for (row = 0, dstPtr = dstBuf; row < ntilesh;
+         row++, dstPtr += (size_t)pitch * tileh * sampleSize) {
+      for (col = 0, dstPtr2 = dstPtr; col < ntilesw;
+           col++, tile++, dstPtr2 += ps * tilew * sampleSize) {
+        int width = doTile ? min(tilew, w - col * tilew) : scaledw;
+        int height = doTile ? min(tileh, h - row * tileh) : scaledh;
+
+        if (doYUV) {
+          double startDecode;
+
+          if (tj3DecompressToYUV8(handle, jpegBufs[tile], jpegSizes[tile],
+                                  yuvBuf, yuvAlign) == -1)
+            THROW_TJ();
+          startDecode = getTime();
+          if (tj3DecodeYUV8(handle, yuvBuf, yuvAlign, dstPtr2, width, pitch,
+                            height, pf) == -1)
+            THROW_TJ();
+          if (iter >= 0) elapsedDecode += getTime() - startDecode;
+        } else {
+          if (precision == 8) {
+            if (tj3Decompress8(handle, jpegBufs[tile], jpegSizes[tile],
+                               dstPtr2, pitch, pf) == -1)
+              THROW_TJ();
+          } else if (precision == 12) {
+            if (tj3Decompress12(handle, jpegBufs[tile], jpegSizes[tile],
+                                (short *)dstPtr2, pitch, pf) == -1)
+              THROW_TJ();
+          } else {
+            if (tj3Decompress16(handle, jpegBufs[tile], jpegSizes[tile],
+                                (unsigned short *)dstPtr2, pitch, pf) == -1)
+              THROW_TJ();
+          }
+        }
+      }
+    }
+    elapsed += getTime() - start;
+    if (iter >= 0) {
+      iter++;
+      if (elapsed >= benchTime) break;
+    } else if (elapsed >= warmup) {
+      iter = 0;
+      elapsed = elapsedDecode = 0.;
+    }
+  }
+  if (doYUV) elapsed -= elapsedDecode;
+
+  if (quiet) {
+    printf("%-6s%s",
+           sigfig((double)(w * h) / 1000000. * (double)iter / elapsed, 4,
+                  tempStr, 1024),
+           quiet == 2 ? "\n" : "  ");
+    if (doYUV)
+      printf("%s\n",
+             sigfig((double)(w * h) / 1000000. * (double)iter / elapsedDecode,
+                    4, tempStr, 1024));
+    else if (quiet != 2) printf("\n");
+  } else {
+    printf("%s --> Frame rate:         %f fps\n",
+           doYUV ? "Decomp to YUV" : "Decompress   ", (double)iter / elapsed);
+    printf("                  Throughput:         %f Megapixels/sec\n",
+           (double)(w * h) / 1000000. * (double)iter / elapsed);
+    if (doYUV) {
+      printf("YUV Decode    --> Frame rate:         %f fps\n",
+             (double)iter / elapsedDecode);
+      printf("                  Throughput:         %f Megapixels/sec\n",
+             (double)(w * h) / 1000000. * (double)iter / elapsedDecode);
+    }
+  }
+
+  if (!doWrite) goto bailout;
+
+  if (sf.num != 1 || sf.denom != 1)
+    SNPRINTF(sizeStr, 24, "%d_%d", sf.num, sf.denom);
+  else if (tilew != w || tileh != h)
+    SNPRINTF(sizeStr, 24, "%dx%d", tilew, tileh);
+  else SNPRINTF(sizeStr, 24, "full");
+  if (decompOnly)
+    SNPRINTF(tempStr, 1024, "%s_%s.%s", fileName, sizeStr, ext);
+  else
+    SNPRINTF(tempStr, 1024, "%s_%s%s_%s.%s", fileName,
+             lossless ? "LOSSLS" : subName[subsamp], qualStr, sizeStr, ext);
+
+  if (precision == 8) {
+    if (tj3SaveImage8(handle, tempStr, (unsigned char *)dstBuf, scaledw, 0,
+                      scaledh, pf) == -1)
+      THROW_TJ();
+  } else if (precision == 12) {
+    if (tj3SaveImage12(handle, tempStr, (short *)dstBuf, scaledw, 0, scaledh,
+                       pf) == -1)
+      THROW_TJ();
+  } else {
+    if (tj3SaveImage16(handle, tempStr, (unsigned short *)dstBuf, scaledw, 0,
+                      scaledh, pf) == -1)
+      THROW_TJ();
+  }
+
+bailout:
+  if (file) fclose(file);
+  tj3Destroy(handle);
+  if (dstBufAlloc) free(dstBuf);
+  free(yuvBuf);
+  return retval;
+}
+
+
+static int fullTest(tjhandle handle, void *srcBuf, int w, int h, int subsamp,
+                    int jpegQual, char *fileName)
+{
+  char tempStr[1024], tempStr2[80];
+  FILE *file = NULL;
+  unsigned char **jpegBufs = NULL, *yuvBuf = NULL, *srcPtr, *srcPtr2;
+  void *tmpBuf = NULL;
+  double start, elapsed, elapsedEncode;
+  int row, col, i, tilew = w, tileh = h, retval = 0;
+  int iter;
+  size_t totalJpegSize = 0, *jpegSizes = NULL, yuvSize = 0;
+  int ps = tjPixelSize[pf];
+  int ntilesw = 1, ntilesh = 1, pitch = w * ps;
+  const char *pfStr = pixFormatStr[pf];
+
+#if ULLONG_MAX > SIZE_MAX
+  if ((unsigned long long)pitch * (unsigned long long)h *
+      (unsigned long long)sampleSize > (unsigned long long)((size_t)-1))
+    THROW("allocating temporary image buffer", "Image is too large");
+#endif
+  if ((tmpBuf = malloc((size_t)pitch * h * sampleSize)) == NULL)
+    THROW_UNIX("allocating temporary image buffer");
+
+  if (!quiet)
+    printf(">>>>>  %s (%s) <--> %d-bit JPEG (%s %s%d)  <<<<<\n", pfStr,
+           bottomUp ? "Bottom-up" : "Top-down", precision,
+           lossless ? "Lossless" : subNameLong[subsamp],
+           lossless ? "PSV" : "Q", jpegQual);
+
+  for (tilew = doTile ? 8 : w, tileh = doTile ? 8 : h; ;
+       tilew *= 2, tileh *= 2) {
+    if (tilew > w) tilew = w;
+    if (tileh > h) tileh = h;
+    ntilesw = (w + tilew - 1) / tilew;
+    ntilesh = (h + tileh - 1) / tileh;
+
+    if ((jpegBufs = (unsigned char **)malloc(sizeof(unsigned char *) *
+                                             ntilesw * ntilesh)) == NULL)
+      THROW_UNIX("allocating JPEG tile array");
+    memset(jpegBufs, 0, sizeof(unsigned char *) * ntilesw * ntilesh);
+    if ((jpegSizes = (size_t *)malloc(sizeof(size_t) * ntilesw *
+                                      ntilesh)) == NULL)
+      THROW_UNIX("allocating JPEG size array");
+    memset(jpegSizes, 0, sizeof(size_t) * ntilesw * ntilesh);
+
+    if (noRealloc) {
+      for (i = 0; i < ntilesw * ntilesh; i++) {
+        size_t jpegBufSize = tj3JPEGBufSize(tilew, tileh, subsamp);
+
+        if (jpegBufSize == 0)
+          THROW_TJG();
+        if ((jpegBufs[i] = tj3Alloc(jpegBufSize)) == NULL)
+          THROW_UNIX("allocating JPEG tiles");
+      }
+    }
+
+    /* Compression test */
+    if (quiet == 1)
+      printf("%-4s(%s)  %-2d/%-6s %-3d   ", pfStr, bottomUp ? "BU" : "TD",
+             precision, lossless ? "LOSSLS" : subNameLong[subsamp], jpegQual);
+    if (precision == 8) {
+      for (i = 0; i < h; i++)
+        memcpy(&((unsigned char *)tmpBuf)[pitch * i],
+               &((unsigned char *)srcBuf)[w * ps * i], w * ps);
+    } else {
+      for (i = 0; i < h; i++)
+        memcpy(&((unsigned short *)tmpBuf)[pitch * i],
+               &((unsigned short *)srcBuf)[w * ps * i], w * ps * sampleSize);
+    }
+
+    if (tj3Set(handle, TJPARAM_NOREALLOC, noRealloc) == -1)
+      THROW_TJ();
+    if (tj3Set(handle, TJPARAM_SUBSAMP, subsamp) == -1)
+      THROW_TJ();
+    if (tj3Set(handle, TJPARAM_FASTDCT, fastDCT) == -1)
+      THROW_TJ();
+    if (tj3Set(handle, TJPARAM_OPTIMIZE, optimize) == -1)
+      THROW_TJ();
+    if (tj3Set(handle, TJPARAM_PROGRESSIVE, progressive) == -1)
+      THROW_TJ();
+    if (tj3Set(handle, TJPARAM_ARITHMETIC, arithmetic) == -1)
+      THROW_TJ();
+    if (tj3Set(handle, TJPARAM_LOSSLESS, lossless) == -1)
+      THROW_TJ();
+    if (lossless) {
+      if (tj3Set(handle, TJPARAM_LOSSLESSPSV, jpegQual) == -1)
+        THROW_TJ();
+    } else {
+      if (tj3Set(handle, TJPARAM_QUALITY, jpegQual) == -1)
+        THROW_TJ();
+    }
+    if (tj3Set(handle, TJPARAM_RESTARTBLOCKS, restartIntervalBlocks) == -1)
+      THROW_TJ();
+    if (tj3Set(handle, TJPARAM_RESTARTROWS, restartIntervalRows) == -1)
+      THROW_TJ();
+    if (tj3Set(handle, TJPARAM_MAXMEMORY, maxMemory) == -1)
+      THROW_TJ();
+
+    if (doYUV) {
+      yuvSize = tj3YUVBufSize(tilew, yuvAlign, tileh, subsamp);
+      if (yuvSize == 0)
+        THROW_TJG();
+      if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
+        THROW_UNIX("allocating YUV buffer");
+      memset(yuvBuf, 127, yuvSize);
+    }
+
+    /* Benchmark */
+    iter = -1;
+    elapsed = elapsedEncode = 0.;
+    while (1) {
+      int tile = 0;
+
+      totalJpegSize = 0;
+      start = getTime();
+      for (row = 0, srcPtr = srcBuf; row < ntilesh;
+           row++, srcPtr += pitch * tileh * sampleSize) {
+        for (col = 0, srcPtr2 = srcPtr; col < ntilesw;
+             col++, tile++, srcPtr2 += ps * tilew * sampleSize) {
+          int width = min(tilew, w - col * tilew);
+          int height = min(tileh, h - row * tileh);
+
+          if (doYUV) {
+            double startEncode = getTime();
+
+            if (tj3EncodeYUV8(handle, srcPtr2, width, pitch, height, pf,
+                              yuvBuf, yuvAlign) == -1)
+              THROW_TJ();
+            if (iter >= 0) elapsedEncode += getTime() - startEncode;
+            if (tj3CompressFromYUV8(handle, yuvBuf, width, yuvAlign, height,
+                                    &jpegBufs[tile], &jpegSizes[tile]) == -1)
+              THROW_TJ();
+          } else {
+            if (precision == 8) {
+              if (tj3Compress8(handle, srcPtr2, width, pitch, height, pf,
+                               &jpegBufs[tile], &jpegSizes[tile]) == -1)
+                THROW_TJ();
+            } else if (precision == 12) {
+              if (tj3Compress12(handle, (short *)srcPtr2, width, pitch, height,
+                                pf, &jpegBufs[tile], &jpegSizes[tile]) == -1)
+                THROW_TJ();
+            } else {
+              if (tj3Compress16(handle, (unsigned short *)srcPtr2, width,
+                                pitch, height, pf, &jpegBufs[tile],
+                                &jpegSizes[tile]) == -1)
+                THROW_TJ();
+            }
+          }
+          totalJpegSize += jpegSizes[tile];
+        }
+      }
+      elapsed += getTime() - start;
+      if (iter >= 0) {
+        iter++;
+        if (elapsed >= benchTime) break;
+      } else if (elapsed >= warmup) {
+        iter = 0;
+        elapsed = elapsedEncode = 0.;
+      }
+    }
+    if (doYUV) elapsed -= elapsedEncode;
+
+    if (quiet == 1) printf("%-5d  %-5d   ", tilew, tileh);
+    if (quiet) {
+      if (doYUV)
+        printf("%-6s%s",
+               sigfig((double)(w * h) / 1000000. *
+                      (double)iter / elapsedEncode, 4, tempStr, 1024),
+               quiet == 2 ? "\n" : "  ");
+      printf("%-6s%s",
+             sigfig((double)(w * h) / 1000000. * (double)iter / elapsed, 4,
+                    tempStr, 1024),
+             quiet == 2 ? "\n" : "  ");
+      printf("%-6s%s",
+             sigfig((double)(w * h * ps) / (double)totalJpegSize, 4, tempStr2,
+                    80),
+             quiet == 2 ? "\n" : "  ");
+    } else {
+      printf("\n%s size: %d x %d\n", doTile ? "Tile" : "Image", tilew, tileh);
+      if (doYUV) {
+        printf("Encode YUV    --> Frame rate:         %f fps\n",
+               (double)iter / elapsedEncode);
+        printf("                  Output image size:  %lu bytes\n",
+               (unsigned long)yuvSize);
+        printf("                  Compression ratio:  %f:1\n",
+               (double)(w * h * ps) / (double)yuvSize);
+        printf("                  Throughput:         %f Megapixels/sec\n",
+               (double)(w * h) / 1000000. * (double)iter / elapsedEncode);
+        printf("                  Output bit stream:  %f Megabits/sec\n",
+               (double)yuvSize * 8. / 1000000. * (double)iter / elapsedEncode);
+      }
+      printf("%s --> Frame rate:         %f fps\n",
+             doYUV ? "Comp from YUV" : "Compress     ",
+             (double)iter / elapsed);
+      printf("                  Output image size:  %lu bytes\n",
+             (unsigned long)totalJpegSize);
+      printf("                  Compression ratio:  %f:1\n",
+             (double)(w * h * ps) / (double)totalJpegSize);
+      printf("                  Throughput:         %f Megapixels/sec\n",
+             (double)(w * h) / 1000000. * (double)iter / elapsed);
+      printf("                  Output bit stream:  %f Megabits/sec\n",
+             (double)totalJpegSize * 8. / 1000000. * (double)iter / elapsed);
+    }
+    if (tilew == w && tileh == h && doWrite) {
+     SNPRINTF(tempStr, 1024, "%s_%s_%s%d.jpg", fileName,
+              lossless ? "LOSSLS" : subName[subsamp],
+              lossless ? "PSV" : "Q", jpegQual);
+      if ((file = fopen(tempStr, "wb")) == NULL)
+        THROW_UNIX("opening reference image");
+      if (fwrite(jpegBufs[0], jpegSizes[0], 1, file) != 1)
+        THROW_UNIX("writing reference image");
+      fclose(file);  file = NULL;
+      if (!quiet) printf("Reference image written to %s\n", tempStr);
+    }
+
+    /* Decompression test */
+    if (!compOnly) {
+      if (decomp(jpegBufs, jpegSizes, tmpBuf, w, h, subsamp, jpegQual,
+                 fileName, tilew, tileh) == -1)
+        goto bailout;
+    } else if (quiet == 1) printf("N/A\n");
+
+    for (i = 0; i < ntilesw * ntilesh; i++) {
+      tj3Free(jpegBufs[i]);
+      jpegBufs[i] = NULL;
+    }
+    free(jpegBufs);  jpegBufs = NULL;
+    free(jpegSizes);  jpegSizes = NULL;
+    if (doYUV) {
+      free(yuvBuf);  yuvBuf = NULL;
+    }
+
+    if (tilew == w && tileh == h) break;
+  }
+
+bailout:
+  if (file) fclose(file);
+  if (jpegBufs) {
+    for (i = 0; i < ntilesw * ntilesh; i++)
+      tj3Free(jpegBufs[i]);
+  }
+  free(jpegBufs);
+  free(yuvBuf);
+  free(jpegSizes);
+  free(tmpBuf);
+  return retval;
+}
+
+
+static int decompTest(char *fileName)
+{
+  FILE *file = NULL;
+  tjhandle handle = NULL;
+  unsigned char **jpegBufs = NULL, *srcBuf = NULL;
+  size_t *jpegSizes = NULL, srcSize, totalJpegSize;
+  tjtransform *t = NULL;
+  double start, elapsed;
+  int ps = tjPixelSize[pf], tile, row, col, i, iter, retval = 0, decompsrc = 0;
+  char *temp = NULL, tempStr[80], tempStr2[80];
+  /* Original image */
+  int w = 0, h = 0, minTile = 16, tilew, tileh, ntilesw = 1, ntilesh = 1,
+    subsamp = -1, cs = -1;
+  /* Transformed image */
+  int tw, th, ttilew, ttileh, tntilesw, tntilesh, tsubsamp;
+
+  if ((file = fopen(fileName, "rb")) == NULL)
+    THROW_UNIX("opening file");
+  if (fseek(file, 0, SEEK_END) < 0 ||
+      (srcSize = ftell(file)) == (size_t)-1)
+    THROW_UNIX("determining file size");
+  if ((srcBuf = (unsigned char *)malloc(srcSize)) == NULL)
+    THROW_UNIX("allocating memory");
+  if (fseek(file, 0, SEEK_SET) < 0)
+    THROW_UNIX("setting file position");
+  if (fread(srcBuf, srcSize, 1, file) < 1)
+    THROW_UNIX("reading JPEG data");
+  fclose(file);  file = NULL;
+
+  temp = strrchr(fileName, '.');
+  if (temp != NULL) *temp = '\0';
+
+  if ((handle = tj3Init(TJINIT_TRANSFORM)) == NULL)
+    THROW_TJG();
+  if (tj3Set(handle, TJPARAM_STOPONWARNING, stopOnWarning) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_BOTTOMUP, bottomUp) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_NOREALLOC, noRealloc) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_FASTUPSAMPLE, fastUpsample) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_FASTDCT, fastDCT) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_SCANLIMIT, limitScans ? 500 : 0) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_MAXMEMORY, maxMemory) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_MAXPIXELS, maxPixels) == -1)
+    THROW_TJ();
+
+  if (tj3DecompressHeader(handle, srcBuf, srcSize) == -1)
+    THROW_TJ();
+  w = tj3Get(handle, TJPARAM_JPEGWIDTH);
+  h = tj3Get(handle, TJPARAM_JPEGHEIGHT);
+  subsamp = tj3Get(handle, TJPARAM_SUBSAMP);
+  precision = tj3Get(handle, TJPARAM_PRECISION);
+  if (tj3Get(handle, TJPARAM_PROGRESSIVE) == 1)
+    printf("JPEG image uses progressive entropy coding\n\n");
+  if (tj3Get(handle, TJPARAM_ARITHMETIC) == 1)
+    printf("JPEG image uses arithmetic entropy coding\n\n");
+  if (tj3Set(handle, TJPARAM_PROGRESSIVE, progressive) == -1)
+    THROW_TJ();
+  if (tj3Set(handle, TJPARAM_ARITHMETIC, arithmetic) == -1)
+    THROW_TJ();
+
+  lossless = tj3Get(handle, TJPARAM_LOSSLESS);
+  sampleSize = (precision == 8 ? sizeof(unsigned char) : sizeof(short));
+  cs = tj3Get(handle, TJPARAM_COLORSPACE);
+  if (w < 1 || h < 1)
+    THROW("reading JPEG header", "Invalid image dimensions");
+  if (cs == TJCS_YCCK || cs == TJCS_CMYK) {
+    pf = TJPF_CMYK;  ps = tjPixelSize[pf];
+  }
+  if (lossless) sf = TJUNSCALED;
+
+  if (tj3SetScalingFactor(handle, sf) == -1)
+    THROW_TJ();
+  if (tj3SetCroppingRegion(handle, cr) == -1)
+    THROW_TJ();
+
+  if (quiet == 1) {
+    printf("All performance values in Mpixels/sec\n\n");
+    printf("Pixel     JPEG             %s  %s   Xform   Comp    Decomp  ",
+           doTile ? "Tile " : "Image", doTile ? "Tile " : "Image");
+    if (doYUV) printf("Decode");
+    printf("\n");
+    printf("Format    Format           Width  Height  Perf    Ratio   Perf    ");
+    if (doYUV) printf("Perf");
+    printf("\n\n");
+  } else if (!quiet)
+    printf(">>>>>  %d-bit JPEG (%s) --> %s (%s)  <<<<<\n", precision,
+           formatName(subsamp, cs, tempStr), pixFormatStr[pf],
+           bottomUp ? "Bottom-up" : "Top-down");
+
+  if (doTile) {
+    if (subsamp == TJSAMP_UNKNOWN)
+      THROW("transforming",
+            "Could not determine subsampling level of JPEG image");
+    minTile = max(tjMCUWidth[subsamp], tjMCUHeight[subsamp]);
+  }
+  for (tilew = doTile ? minTile : w, tileh = doTile ? minTile : h; ;
+       tilew *= 2, tileh *= 2) {
+    if (tilew > w) tilew = w;
+    if (tileh > h) tileh = h;
+    ntilesw = (w + tilew - 1) / tilew;
+    ntilesh = (h + tileh - 1) / tileh;
+
+    if ((jpegBufs = (unsigned char **)malloc(sizeof(unsigned char *) *
+                                             ntilesw * ntilesh)) == NULL)
+      THROW_UNIX("allocating JPEG tile array");
+    memset(jpegBufs, 0, sizeof(unsigned char *) * ntilesw * ntilesh);
+    if ((jpegSizes = (size_t *)malloc(sizeof(size_t) * ntilesw *
+                                      ntilesh)) == NULL)
+      THROW_UNIX("allocating JPEG size array");
+    memset(jpegSizes, 0, sizeof(size_t) * ntilesw * ntilesh);
+
+    if (noRealloc &&
+        (doTile || xformOp != TJXOP_NONE || xformOpt != 0 || customFilter)) {
+      for (i = 0; i < ntilesw * ntilesh; i++) {
+        size_t jpegBufSize;
+
+        if (xformOp == TJXOP_TRANSPOSE || xformOp == TJXOP_TRANSVERSE ||
+            xformOp == TJXOP_ROT90 || xformOp == TJXOP_ROT270)
+          jpegBufSize = tj3JPEGBufSize(tileh, tilew, subsamp);
+        else
+          jpegBufSize = tj3JPEGBufSize(tilew, tileh, subsamp);
+        if (jpegBufSize == 0)
+          THROW_TJG();
+        if ((jpegBufs[i] = tj3Alloc(jpegBufSize)) == NULL)
+          THROW_UNIX("allocating JPEG tiles");
+      }
+    }
+
+    tw = w;  th = h;  ttilew = tilew;  ttileh = tileh;
+    if (!quiet) {
+      printf("\n%s size: %d x %d", doTile ? "Tile" : "Image", ttilew, ttileh);
+      if (sf.num != 1 || sf.denom != 1 || IS_CROPPED(cr))
+        printf(" --> %d x %d", CROPPED_WIDTH(tw), CROPPED_HEIGHT(th));
+      printf("\n");
+    } else if (quiet == 1) {
+      printf("%-4s(%s)  %-14s   ", pixFormatStr[pf],
+             bottomUp ? "BU" : "TD", formatName(subsamp, cs, tempStr));
+      printf("%-5d  %-5d   ", CROPPED_WIDTH(tilew), CROPPED_HEIGHT(tileh));
+    }
+
+    tsubsamp = subsamp;
+    if (doTile || xformOp != TJXOP_NONE || xformOpt != 0 || customFilter) {
+      if ((t = (tjtransform *)malloc(sizeof(tjtransform) * ntilesw *
+                                     ntilesh)) == NULL)
+        THROW_UNIX("allocating image transform array");
+
+      if (xformOp == TJXOP_TRANSPOSE || xformOp == TJXOP_TRANSVERSE ||
+          xformOp == TJXOP_ROT90 || xformOp == TJXOP_ROT270) {
+        tw = h;  th = w;  ttilew = tileh;  ttileh = tilew;
+      }
+
+      if (xformOp != TJXOP_NONE && xformOp != TJXOP_TRANSPOSE &&
+          subsamp == TJSAMP_UNKNOWN)
+        THROW("transforming",
+              "Could not determine subsampling level of JPEG image");
+      if (xformOpt & TJXOPT_GRAY) tsubsamp = TJSAMP_GRAY;
+      if (xformOp == TJXOP_HFLIP || xformOp == TJXOP_ROT180)
+        tw = tw - (tw % tjMCUWidth[tsubsamp]);
+      if (xformOp == TJXOP_VFLIP || xformOp == TJXOP_ROT180)
+        th = th - (th % tjMCUHeight[tsubsamp]);
+      if (xformOp == TJXOP_TRANSVERSE || xformOp == TJXOP_ROT90)
+        tw = tw - (tw % tjMCUHeight[tsubsamp]);
+      if (xformOp == TJXOP_TRANSVERSE || xformOp == TJXOP_ROT270)
+        th = th - (th % tjMCUWidth[tsubsamp]);
+      tntilesw = (tw + ttilew - 1) / ttilew;
+      tntilesh = (th + ttileh - 1) / ttileh;
+
+      if (xformOp == TJXOP_TRANSPOSE || xformOp == TJXOP_TRANSVERSE ||
+          xformOp == TJXOP_ROT90 || xformOp == TJXOP_ROT270) {
+        if (tsubsamp == TJSAMP_422) tsubsamp = TJSAMP_440;
+        else if (tsubsamp == TJSAMP_440) tsubsamp = TJSAMP_422;
+        else if (tsubsamp == TJSAMP_411) tsubsamp = TJSAMP_441;
+        else if (tsubsamp == TJSAMP_441) tsubsamp = TJSAMP_411;
+      }
+
+      for (row = 0, tile = 0; row < tntilesh; row++) {
+        for (col = 0; col < tntilesw; col++, tile++) {
+          t[tile].r.w = min(ttilew, tw - col * ttilew);
+          t[tile].r.h = min(ttileh, th - row * ttileh);
+          t[tile].r.x = col * ttilew;
+          t[tile].r.y = row * ttileh;
+          t[tile].op = xformOp;
+          t[tile].options = xformOpt | TJXOPT_TRIM;
+          t[tile].customFilter = customFilter;
+          if (t[tile].options & TJXOPT_NOOUTPUT && jpegBufs[tile]) {
+            tj3Free(jpegBufs[tile]);  jpegBufs[tile] = NULL;
+          }
+        }
+      }
+
+      iter = -1;
+      elapsed = 0.;
+      while (1) {
+        start = getTime();
+        if (tj3Transform(handle, srcBuf, srcSize, tntilesw * tntilesh,
+                         jpegBufs, jpegSizes, t) == -1)
+          THROW_TJ();
+        elapsed += getTime() - start;
+        if (iter >= 0) {
+          iter++;
+          if (elapsed >= benchTime) break;
+        } else if (elapsed >= warmup) {
+          iter = 0;
+          elapsed = 0.;
+        }
+      }
+
+      free(t);  t = NULL;
+
+      for (tile = 0, totalJpegSize = 0; tile < tntilesw * tntilesh; tile++)
+        totalJpegSize += jpegSizes[tile];
+
+      if (quiet) {
+        printf("%-6s%s%-6s%s",
+               sigfig((double)(w * h) / 1000000. / elapsed, 4, tempStr, 80),
+               quiet == 2 ? "\n" : "  ",
+               sigfig((double)(w * h * ps) / (double)totalJpegSize, 4,
+                      tempStr2, 80),
+               quiet == 2 ? "\n" : "  ");
+      } else {
+        printf("Transform     --> Frame rate:         %f fps\n",
+               1.0 / elapsed);
+        printf("                  Output image size:  %lu bytes\n",
+               (unsigned long)totalJpegSize);
+        printf("                  Compression ratio:  %f:1\n",
+               (double)(w * h * ps) / (double)totalJpegSize);
+        printf("                  Throughput:         %f Megapixels/sec\n",
+               (double)(w * h) / 1000000. / elapsed);
+        printf("                  Output bit stream:  %f Megabits/sec\n",
+               (double)totalJpegSize * 8. / 1000000. / elapsed);
+      }
+    } else {
+      if (quiet == 1) printf("N/A     N/A     ");
+      tj3Free(jpegBufs[0]);
+      jpegBufs[0] = NULL;
+      decompsrc = 1;
+    }
+
+    if (w == tilew) ttilew = tw;
+    if (h == tileh) ttileh = th;
+    if (!(xformOpt & TJXOPT_NOOUTPUT)) {
+      if (decomp(decompsrc ? &srcBuf : jpegBufs,
+                 decompsrc ? &srcSize : jpegSizes, NULL, tw, th, tsubsamp, 0,
+                 fileName, ttilew, ttileh) == -1)
+        goto bailout;
+    } else if (quiet == 1) printf("N/A\n");
+
+    for (i = 0; i < ntilesw * ntilesh; i++) {
+      tj3Free(jpegBufs[i]);
+      jpegBufs[i] = NULL;
+    }
+    free(jpegBufs);  jpegBufs = NULL;
+    free(jpegSizes);  jpegSizes = NULL;
+
+    if (tilew == w && tileh == h) break;
+  }
+
+bailout:
+  if (file) fclose(file);
+  if (jpegBufs) {
+    for (i = 0; i < ntilesw * ntilesh; i++)
+      tj3Free(jpegBufs[i]);
+  }
+  free(jpegBufs);
+  free(jpegSizes);
+  free(srcBuf);
+  free(t);
+  tj3Destroy(handle);
+  return retval;
+}
+
+
+static void usage(char *progName)
+{
+  int i;
+
+  printf("USAGE: %s\n", progName);
+  printf("       <Inputimage (BMP|PPM)> <Quality or PSV> [options]\n\n");
+  printf("       %s\n", progName);
+  printf("       <Inputimage (JPG)> [options]\n");
+
+  printf("\nGENERAL OPTIONS\n");
+  printf("---------------\n");
+  printf("-alloc = Dynamically allocate JPEG buffers\n");
+  printf("-benchtime T = Run each benchmark for at least T seconds [default = 5.0]\n");
+  printf("-bmp = Use Windows Bitmap format for output images [default = PPM]\n");
+  printf("     ** 8-bit data precision only **\n");
+  printf("-bottomup = Use bottom-up row order for packed-pixel source/destination buffers\n");
+  printf("-componly = Stop after running compression tests.  Do not test decompression.\n");
+  printf("-lossless = Generate lossless JPEG images when compressing (implies\n");
+  printf("     -subsamp 444).  PSV is the predictor selection value (1-7).\n");
+  printf("-maxmemory = Memory limit (in megabytes) for intermediate buffers used with\n");
+  printf("     progressive JPEG compression and decompression, optimized baseline entropy\n");
+  printf("     coding, lossless JPEG compression, and lossless transformation\n");
+  printf("     [default = no limit]\n");
+  printf("-maxpixels = Input image size limit (in pixels) [default = no limit]\n");
+  printf("-nowrite = Do not write reference or output images (improves consistency of\n");
+  printf("     benchmark results)\n");
+  printf("-rgb, -bgr, -rgbx, -bgrx, -xbgr, -xrgb =\n");
+  printf("     Use the specified pixel format for packed-pixel source/destination buffers\n");
+  printf("     [default = BGR]\n");
+  printf("-cmyk = Indirectly test YCCK JPEG compression/decompression\n");
+  printf("     (use the CMYK pixel format for packed-pixel source/destination buffers)\n");
+  printf("-precision N = Use N-bit data precision when compressing [N is 8, 12, or 16;\n");
+  printf("     default = 8; if N is 16, then -lossless must also be specified]\n");
+  printf("     (-precision 12 implies -optimize unless -arithmetic is also specified)\n");
+  printf("-quiet = Output results in tabular rather than verbose format\n");
+  printf("-restart N = When compressing, add a restart marker every N MCU rows (lossy) or\n");
+  printf("     N sample rows (lossless) [default = 0 (no restart markers)].  Append 'B'\n");
+  printf("     to specify the restart marker interval in MCU blocks (lossy) or samples\n");
+  printf("     (lossless).\n");
+  printf("-stoponwarning = Immediately discontinue the current\n");
+  printf("     compression/decompression/transform operation if a warning (non-fatal\n");
+  printf("     error) occurs\n");
+  printf("-tile = Compress/transform the input image into separate JPEG tiles of varying\n");
+  printf("     sizes (useful for measuring JPEG overhead)\n");
+  printf("-warmup T = Run each benchmark for T seconds [default = 1.0] prior to starting\n");
+  printf("     the timer, in order to prime the caches and thus improve the consistency\n");
+  printf("     of the benchmark results\n");
+
+  printf("\nLOSSY JPEG OPTIONS\n");
+  printf("------------------\n");
+  printf("-arithmetic = Use arithmetic entropy coding in JPEG images generated by\n");
+  printf("     compression and transform operations (can be combined with -progressive)\n");
+  printf("-crop WxH+X+Y = Decompress only the specified region of the JPEG image, where W\n");
+  printf("     and H are the width and height of the region (0 = maximum possible width\n");
+  printf("     or height) and X and Y are the left and upper boundary of the region, all\n");
+  printf("     specified relative to the scaled image dimensions.  X must be divible by\n");
+  printf("     the scaled MCU width.\n");
+  printf("-fastdct = Use the fastest DCT/IDCT algorithm available\n");
+  printf("-fastupsample = Use the fastest chrominance upsampling algorithm available\n");
+  printf("-optimize = Use optimized baseline entropy coding in JPEG images generated by\n");
+  printf("     compession and transform operations\n");
+  printf("-progressive = Use progressive entropy coding in JPEG images generated by\n");
+  printf("     compression and transform operations (can be combined with -arithmetic;\n");
+  printf("     implies -optimize unless -arithmetic is also specified)\n");
+  printf("-limitscans = Refuse to decompress or transform progressive JPEG images that\n");
+  printf("     have an unreasonably large number of scans\n");
+  printf("-scale M/N = When decompressing, scale the width/height of the JPEG image by a\n");
+  printf("     factor of M/N (M/N = ");
+  for (i = 0; i < nsf; i++) {
+    printf("%d/%d", scalingFactors[i].num, scalingFactors[i].denom);
+    if (nsf == 2 && i != nsf - 1) printf(" or ");
+    else if (nsf > 2) {
+      if (i != nsf - 1) printf(", ");
+      if (i == nsf - 2) printf("or ");
+    }
+    if (i % 8 == 0 && i != 0) printf("\n     ");
+  }
+  printf(")\n");
+  printf("-subsamp S = When compressing, use the specified level of chrominance\n");
+  printf("     subsampling (S = 444, 422, 440, 420, 411, 441, or GRAY) [default = test\n");
+  printf("     Grayscale, 4:2:0, 4:2:2, and 4:4:4 in sequence]\n");
+  printf("-hflip, -vflip, -transpose, -transverse, -rot90, -rot180, -rot270 =\n");
+  printf("     Perform the specified lossless transform operation on the input image\n");
+  printf("     prior to decompression (these operations are mutually exclusive)\n");
+  printf("-grayscale = Transform the input image into a grayscale JPEG image prior to\n");
+  printf("     decompression (can be combined with the other transform operations above)\n");
+  printf("-copynone = Do not copy any extra markers (including EXIF and ICC profile data)\n");
+  printf("     when transforming the input image\n");
+  printf("-yuv = Compress from/decompress to intermediate planar YUV images\n");
+  printf("     ** 8-bit data precision only **\n");
+  printf("-yuvpad N = The number of bytes by which each row in each plane of an\n");
+  printf("     intermediate YUV image is evenly divisible (N must be a power of 2)\n");
+  printf("     [default = 1]\n");
+
+  printf("\nNOTE:  If the quality/PSV is specified as a range (e.g. 90-100 or 1-4), a\n");
+  printf("separate test will be performed for all values in the range.\n\n");
+  exit(1);
+}
+
+
+int main(int argc, char *argv[])
+{
+  void *srcBuf = NULL;
+  int w = 0, h = 0, i, j, minQual = -1, maxQual = -1;
+  char *temp;
+  int minArg = 2, retval = 0, subsamp = -1;
+  tjhandle handle = NULL;
+
+  if ((scalingFactors = tj3GetScalingFactors(&nsf)) == NULL || nsf == 0)
+    THROW("executing tj3GetScalingFactors()", tj3GetErrorStr(NULL));
+
+  if (argc < minArg) usage(argv[0]);
+
+  temp = strrchr(argv[1], '.');
+  if (temp != NULL) {
+    if (!strcasecmp(temp, ".bmp")) ext = "bmp";
+    if (!strcasecmp(temp, ".jpg") || !strcasecmp(temp, ".jpeg"))
+      decompOnly = 1;
+  }
+
+  printf("\n");
+
+  if (!decompOnly) {
+    minArg = 3;
+    if (argc < minArg) usage(argv[0]);
+    minQual = atoi(argv[2]);
+    if ((temp = strchr(argv[2], '-')) != NULL && strlen(temp) > 1 &&
+        sscanf(&temp[1], "%d", &maxQual) == 1 && maxQual > minQual) {}
+    else maxQual = minQual;
+  }
+
+  if (argc > minArg) {
+    for (i = minArg; i < argc; i++) {
+      if (!strcasecmp(argv[i], "-tile")) {
+        doTile = 1;  xformOpt |= TJXOPT_CROP;
+      } else if (!strcasecmp(argv[i], "-precision") && i < argc - 1) {
+        int tempi = atoi(argv[++i]);
+
+        if (tempi != 8 && tempi != 12 && tempi != 16)
+          usage(argv[0]);
+        precision = tempi;
+      } else if (!strcasecmp(argv[i], "-fastupsample")) {
+        printf("Using fastest upsampling algorithm\n\n");
+        fastUpsample = 1;
+      } else if (!strcasecmp(argv[i], "-fastdct")) {
+        printf("Using fastest DCT/IDCT algorithm\n\n");
+        fastDCT = 1;
+      } else if (!strcasecmp(argv[i], "-optimize")) {
+        printf("Using optimized baseline entropy coding\n\n");
+        optimize = 1;
+        xformOpt |= TJXOPT_OPTIMIZE;
+      } else if (!strcasecmp(argv[i], "-progressive")) {
+        printf("Using progressive entropy coding\n\n");
+        progressive = 1;
+        xformOpt |= TJXOPT_PROGRESSIVE;
+      } else if (!strcasecmp(argv[i], "-arithmetic")) {
+        printf("Using arithmetic entropy coding\n\n");
+        arithmetic = 1;
+        xformOpt |= TJXOPT_ARITHMETIC;
+      } else if (!strcasecmp(argv[i], "-lossless")) {
+        lossless = 1;
+        subsamp = TJSAMP_444;
+      } else if (!strcasecmp(argv[i], "-rgb"))
+        pf = TJPF_RGB;
+      else if (!strcasecmp(argv[i], "-rgbx"))
+        pf = TJPF_RGBX;
+      else if (!strcasecmp(argv[i], "-bgr"))
+        pf = TJPF_BGR;
+      else if (!strcasecmp(argv[i], "-bgrx"))
+        pf = TJPF_BGRX;
+      else if (!strcasecmp(argv[i], "-xbgr"))
+        pf = TJPF_XBGR;
+      else if (!strcasecmp(argv[i], "-xrgb"))
+        pf = TJPF_XRGB;
+      else if (!strcasecmp(argv[i], "-cmyk"))
+        pf = TJPF_CMYK;
+      else if (!strcasecmp(argv[i], "-bottomup"))
+        bottomUp = 1;
+      else if (!strcasecmp(argv[i], "-quiet"))
+        quiet = 1;
+      else if (!strcasecmp(argv[i], "-qq"))
+        quiet = 2;
+      else if (!strcasecmp(argv[i], "-scale") && i < argc - 1) {
+        int temp1 = 0, temp2 = 0, match = 0;
+
+        if (sscanf(argv[++i], "%d/%d", &temp1, &temp2) == 2) {
+          for (j = 0; j < nsf; j++) {
+            if (temp1 == scalingFactors[j].num &&
+                temp2 == scalingFactors[j].denom) {
+              sf = scalingFactors[j];
+              match = 1;  break;
+            }
+          }
+          if (!match) usage(argv[0]);
+        } else usage(argv[0]);
+      } else if (!strcasecmp(argv[i], "-crop") && i < argc - 1) {
+        int temp1 = -1, temp2 = -1, temp3 = -1, temp4 = -1;
+
+        if (sscanf(argv[++i], "%dx%d+%d+%d", &temp1, &temp2, &temp3,
+                   &temp4) == 4 && temp1 >= 0 && temp2 >= 0 && temp3 >= 0 &&
+                   temp4 >= 0) {
+          cr.w = temp1;  cr.h = temp2;  cr.x = temp3;  cr.y = temp4;
+        } else usage(argv[0]);
+      } else if (!strcasecmp(argv[i], "-hflip"))
+        xformOp = TJXOP_HFLIP;
+      else if (!strcasecmp(argv[i], "-vflip"))
+        xformOp = TJXOP_VFLIP;
+      else if (!strcasecmp(argv[i], "-transpose"))
+        xformOp = TJXOP_TRANSPOSE;
+      else if (!strcasecmp(argv[i], "-transverse"))
+        xformOp = TJXOP_TRANSVERSE;
+      else if (!strcasecmp(argv[i], "-rot90"))
+        xformOp = TJXOP_ROT90;
+      else if (!strcasecmp(argv[i], "-rot180"))
+        xformOp = TJXOP_ROT180;
+      else if (!strcasecmp(argv[i], "-rot270"))
+        xformOp = TJXOP_ROT270;
+      else if (!strcasecmp(argv[i], "-grayscale"))
+        xformOpt |= TJXOPT_GRAY;
+      else if (!strcasecmp(argv[i], "-custom"))
+        customFilter = dummyDCTFilter;
+      else if (!strcasecmp(argv[i], "-nooutput"))
+        xformOpt |= TJXOPT_NOOUTPUT;
+      else if (!strcasecmp(argv[i], "-copynone"))
+        xformOpt |= TJXOPT_COPYNONE;
+      else if (!strcasecmp(argv[i], "-benchtime") && i < argc - 1) {
+        double tempd = atof(argv[++i]);
+
+        if (tempd > 0.0) benchTime = tempd;
+        else usage(argv[0]);
+      } else if (!strcasecmp(argv[i], "-warmup") && i < argc - 1) {
+        double tempd = atof(argv[++i]);
+
+        if (tempd >= 0.0) warmup = tempd;
+        else usage(argv[0]);
+        printf("Warmup time = %.1f seconds\n\n", warmup);
+      } else if (!strcasecmp(argv[i], "-alloc"))
+        noRealloc = 0;
+      else if (!strcasecmp(argv[i], "-bmp"))
+        ext = "bmp";
+      else if (!strcasecmp(argv[i], "-yuv")) {
+        printf("Testing planar YUV encoding/decoding\n\n");
+        doYUV = 1;
+      } else if (!strcasecmp(argv[i], "-yuvpad") && i < argc - 1) {
+        int tempi = atoi(argv[++i]);
+
+        if (tempi >= 1 && (tempi & (tempi - 1)) == 0) yuvAlign = tempi;
+        else usage(argv[0]);
+      } else if (!strcasecmp(argv[i], "-subsamp") && i < argc - 1) {
+        i++;
+        if (toupper(argv[i][0]) == 'G') subsamp = TJSAMP_GRAY;
+        else {
+          int tempi = atoi(argv[i]);
+
+          switch (tempi) {
+          case 444:  subsamp = TJSAMP_444;  break;
+          case 422:  subsamp = TJSAMP_422;  break;
+          case 440:  subsamp = TJSAMP_440;  break;
+          case 420:  subsamp = TJSAMP_420;  break;
+          case 411:  subsamp = TJSAMP_411;  break;
+          case 441:  subsamp = TJSAMP_441;  break;
+          default:  usage(argv[0]);
+          }
+        }
+      } else if (!strcasecmp(argv[i], "-componly"))
+        compOnly = 1;
+      else if (!strcasecmp(argv[i], "-nowrite"))
+        doWrite = 0;
+      else if (!strcasecmp(argv[i], "-limitscans"))
+        limitScans = 1;
+      else if (!strcasecmp(argv[i], "-maxmemory") && i < argc - 1) {
+        int tempi = atoi(argv[++i]);
+
+        if (tempi < 0) usage(argv[0]);
+        maxMemory = tempi;
+      } else if (!strcasecmp(argv[i], "-maxpixels") && i < argc - 1) {
+        int tempi = atoi(argv[++i]);
+
+        if (tempi < 0) usage(argv[0]);
+        maxPixels = tempi;
+      } else if (!strcasecmp(argv[i], "-restart") && i < argc - 1) {
+        int tempi = -1, nscan;  char tempc = 0;
+
+        if ((nscan = sscanf(argv[++i], "%d%c", &tempi, &tempc)) < 1 ||
+            tempi < 0 || tempi > 65535 ||
+            (nscan == 2 && tempc != 'B' && tempc != 'b'))
+          usage(argv[0]);
+
+        if (tempc == 'B' || tempc == 'b')
+          restartIntervalBlocks = tempi;
+        else
+          restartIntervalRows = tempi;
+      } else if (!strcasecmp(argv[i], "-stoponwarning"))
+        stopOnWarning = 1;
+      else usage(argv[0]);
+    }
+  }
+
+  if (precision == 16 && !lossless) {
+    printf("ERROR: -lossless must be specified along with -precision 16\n");
+    retval = -1;  goto bailout;
+  }
+  if (precision != 8 && doYUV) {
+    printf("ERROR: -yuv requires 8-bit data precision\n");
+    retval = -1;  goto bailout;
+  }
+  if (lossless && doYUV) {
+    printf("ERROR: -lossless and -yuv are incompatible\n");
+    retval = -1;  goto bailout;
+  }
+  sampleSize = (precision == 8 ? sizeof(unsigned char) : sizeof(short));
+
+  if ((sf.num != 1 || sf.denom != 1) && doTile) {
+    printf("Disabling tiled compression/decompression tests, because those tests do not\n");
+    printf("work when scaled decompression is enabled.\n\n");
+    doTile = 0;  xformOpt &= (~TJXOPT_CROP);
+  }
+
+  if (IS_CROPPED(cr)) {
+    if (!decompOnly) {
+      printf("ERROR: Partial image decompression can only be enabled for JPEG input images\n");
+      retval = -1;  goto bailout;
+    }
+    if (doTile) {
+      printf("Disabling tiled compression/decompression tests, because those tests do not\n");
+      printf("work when partial image decompression is enabled.\n\n");
+      doTile = 0;  xformOpt &= (~TJXOPT_CROP);
+    }
+    if (doYUV) {
+      printf("ERROR: -crop and -yuv are incompatible\n");
+      retval = -1;  goto bailout;
+    }
+  }
+
+  if (!noRealloc && doTile) {
+    printf("Disabling tiled compression/decompression tests, because those tests do not\n");
+    printf("work when dynamic JPEG buffer allocation is enabled.\n\n");
+    doTile = 0;  xformOpt &= (~TJXOPT_CROP);
+  }
+
+  if (!decompOnly) {
+    if ((handle = tj3Init(TJINIT_COMPRESS)) == NULL)
+      THROW_TJG();
+    if (tj3Set(handle, TJPARAM_STOPONWARNING, stopOnWarning) == -1)
+      THROW_TJ();
+    if (tj3Set(handle, TJPARAM_BOTTOMUP, bottomUp) == -1)
+      THROW_TJ();
+    if (tj3Set(handle, TJPARAM_MAXPIXELS, maxPixels) == -1)
+      THROW_TJ();
+
+    if (precision == 8) {
+      if ((srcBuf = tj3LoadImage8(handle, argv[1], &w, 1, &h, &pf)) == NULL)
+        THROW_TJ();
+    } else if (precision == 12) {
+      if ((srcBuf = tj3LoadImage12(handle, argv[1], &w, 1, &h, &pf)) == NULL)
+        THROW_TJ();
+    } else {
+      if ((srcBuf = tj3LoadImage16(handle, argv[1], &w, 1, &h, &pf)) == NULL)
+        THROW_TJ();
+    }
+    temp = strrchr(argv[1], '.');
+    if (temp != NULL) *temp = '\0';
+  }
+
+  if (quiet == 1 && !decompOnly) {
+    printf("All performance values in Mpixels/sec\n\n");
+    printf("Pixel     JPEG      JPEG  %s  %s   ",
+           doTile ? "Tile " : "Image", doTile ? "Tile " : "Image");
+    if (doYUV) printf("Encode  ");
+    printf("Comp    Comp    Decomp  ");
+    if (doYUV) printf("Decode");
+    printf("\n");
+    printf("Format    Format    %s  Width  Height  ",
+           lossless ? "PSV " : "Qual");
+    if (doYUV) printf("Perf    ");
+    printf("Perf    Ratio   Perf    ");
+    if (doYUV) printf("Perf");
+    printf("\n\n");
+  }
+
+  if (decompOnly) {
+    decompTest(argv[1]);
+    printf("\n");
+    goto bailout;
+  }
+  if (lossless) {
+    if (minQual < 1 || minQual > 7 || maxQual < 1 || maxQual > 7) {
+      puts("ERROR: PSV must be between 1 and 7.");
+      exit(1);
+    }
+  } else {
+    if (minQual < 1 || minQual > 100 || maxQual < 1 || maxQual > 100) {
+      puts("ERROR: Quality must be between 1 and 100.");
+      exit(1);
+    }
+  }
+  if (subsamp >= 0 && subsamp < TJ_NUMSAMP) {
+    for (i = maxQual; i >= minQual; i--)
+      fullTest(handle, srcBuf, w, h, subsamp, i, argv[1]);
+    printf("\n");
+  } else {
+    if (pf != TJPF_CMYK) {
+      for (i = maxQual; i >= minQual; i--)
+        fullTest(handle, srcBuf, w, h, TJSAMP_GRAY, i, argv[1]);
+      printf("\n");
+    }
+    for (i = maxQual; i >= minQual; i--)
+      fullTest(handle, srcBuf, w, h, TJSAMP_420, i, argv[1]);
+    printf("\n");
+    for (i = maxQual; i >= minQual; i--)
+      fullTest(handle, srcBuf, w, h, TJSAMP_422, i, argv[1]);
+    printf("\n");
+    for (i = maxQual; i >= minQual; i--)
+      fullTest(handle, srcBuf, w, h, TJSAMP_444, i, argv[1]);
+    printf("\n");
+  }
+
+bailout:
+  tj3Destroy(handle);
+  tj3Free(srcBuf);
+  return retval;
+}
diff --git a/3rdparty/libjpeg-turbo/src/tjutil.c b/3rdparty/libjpeg-turbo/src/tjutil.c
new file mode 100644
index 000000000000..2018160b161f
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/tjutil.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C)2011, 2019 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef _WIN32
+
+#include <windows.h>
+#include "tjutil.h"
+
+static double getFreq(void)
+{
+  LARGE_INTEGER freq;
+
+  if (!QueryPerformanceFrequency(&freq)) return 0.0;
+  return (double)freq.QuadPart;
+}
+
+static double f = -1.0;
+
+double getTime(void)
+{
+  LARGE_INTEGER t;
+
+  if (f < 0.0) f = getFreq();
+  if (f == 0.0) return (double)GetTickCount() / 1000.;
+  else {
+    QueryPerformanceCounter(&t);
+    return (double)t.QuadPart / f;
+  }
+}
+
+#else
+
+#include <stdlib.h>
+#include <sys/time.h>
+#include "tjutil.h"
+
+double getTime(void)
+{
+  struct timeval tv;
+
+  if (gettimeofday(&tv, NULL) < 0) return 0.0;
+  else return (double)tv.tv_sec + ((double)tv.tv_usec / 1000000.);
+}
+
+#endif
diff --git a/3rdparty/libjpeg-turbo/src/tjutil.h b/3rdparty/libjpeg-turbo/src/tjutil.h
new file mode 100644
index 000000000000..10272e98867e
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/tjutil.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C)2011, 2022 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef _WIN32
+#ifndef strcasecmp
+#define strcasecmp  stricmp
+#endif
+#ifndef strncasecmp
+#define strncasecmp  strnicmp
+#endif
+#endif
+
+#ifdef _MSC_VER
+#define SNPRINTF(str, n, format, ...) \
+  _snprintf_s(str, n, _TRUNCATE, format, ##__VA_ARGS__)
+#else
+#define SNPRINTF  snprintf
+#endif
+
+#ifndef min
+#define min(a, b)  ((a) < (b) ? (a) : (b))
+#endif
+
+#ifndef max
+#define max(a, b)  ((a) > (b) ? (a) : (b))
+#endif
+
+extern double getTime(void);
diff --git a/3rdparty/libjpeg-turbo/src/transupp.c b/3rdparty/libjpeg-turbo/src/transupp.c
new file mode 100644
index 000000000000..62587d3865f8
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/transupp.c
@@ -0,0 +1,2377 @@
+/*
+ * transupp.c
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1997-2019, Thomas G. Lane, Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2010, 2017, 2021-2022, 2024, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains image transformation routines and other utility code
+ * used by the jpegtran sample application.  These are NOT part of the core
+ * JPEG library.  But we keep these routines separate from jpegtran.c to
+ * ease the task of maintaining jpegtran-like programs that have other user
+ * interfaces.
+ */
+
+/* Although this file really shouldn't have access to the library internals,
+ * it's helpful to let it call jround_up() and jcopy_block_row().
+ */
+#define JPEG_INTERNALS
+
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "transupp.h"           /* My own external interface */
+#include "jpegapicomp.h"
+#include <ctype.h>              /* to declare isdigit() */
+
+
+#if JPEG_LIB_VERSION >= 70
+#define dstinfo_min_DCT_h_scaled_size  dstinfo->min_DCT_h_scaled_size
+#define dstinfo_min_DCT_v_scaled_size  dstinfo->min_DCT_v_scaled_size
+#else
+#define dstinfo_min_DCT_h_scaled_size  DCTSIZE
+#define dstinfo_min_DCT_v_scaled_size  DCTSIZE
+#endif
+
+
+#if TRANSFORMS_SUPPORTED
+
+/*
+ * Lossless image transformation routines.  These routines work on DCT
+ * coefficient arrays and thus do not require any lossy decompression
+ * or recompression of the image.
+ * Thanks to Guido Vollbeding for the initial design and code of this feature,
+ * and to Ben Jackson for introducing the cropping feature.
+ *
+ * Horizontal flipping is done in-place, using a single top-to-bottom
+ * pass through the virtual source array.  It will thus be much the
+ * fastest option for images larger than main memory.
+ *
+ * The other routines require a set of destination virtual arrays, so they
+ * need twice as much memory as jpegtran normally does.  The destination
+ * arrays are always written in normal scan order (top to bottom) because
+ * the virtual array manager expects this.  The source arrays will be scanned
+ * in the corresponding order, which means multiple passes through the source
+ * arrays for most of the transforms.  That could result in much thrashing
+ * if the image is larger than main memory.
+ *
+ * If cropping or trimming is involved, the destination arrays may be smaller
+ * than the source arrays.  Note it is not possible to do horizontal flip
+ * in-place when a nonzero Y crop offset is specified, since we'd have to move
+ * data from one block row to another but the virtual array manager doesn't
+ * guarantee we can touch more than one row at a time.  So in that case,
+ * we have to use a separate destination array.
+ *
+ * Some notes about the operating environment of the individual transform
+ * routines:
+ * 1. Both the source and destination virtual arrays are allocated from the
+ *    source JPEG object, and therefore should be manipulated by calling the
+ *    source's memory manager.
+ * 2. The destination's component count should be used.  It may be smaller
+ *    than the source's when forcing to grayscale.
+ * 3. Likewise the destination's sampling factors should be used.  When
+ *    forcing to grayscale the destination's sampling factors will be all 1,
+ *    and we may as well take that as the effective iMCU size.
+ * 4. When "trim" is in effect, the destination's dimensions will be the
+ *    trimmed values but the source's will be untrimmed.
+ * 5. When "crop" is in effect, the destination's dimensions will be the
+ *    cropped values but the source's will be uncropped.  Each transform
+ *    routine is responsible for picking up source data starting at the
+ *    correct X and Y offset for the crop region.  (The X and Y offsets
+ *    passed to the transform routines are measured in iMCU blocks of the
+ *    destination.)
+ * 6. All the routines assume that the source and destination buffers are
+ *    padded out to a full iMCU boundary.  This is true, although for the
+ *    source buffer it is an undocumented property of jdcoefct.c.
+ */
+
+
+LOCAL(void)
+dequant_comp(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+             jvirt_barray_ptr coef_array, JQUANT_TBL *qtblptr1)
+{
+  JDIMENSION blk_x, blk_y;
+  int offset_y, k;
+  JQUANT_TBL *qtblptr;
+  JBLOCKARRAY buffer;
+  JBLOCKROW block;
+  JCOEFPTR ptr;
+
+  qtblptr = compptr->quant_table;
+  for (blk_y = 0; blk_y < compptr->height_in_blocks;
+       blk_y += compptr->v_samp_factor) {
+    buffer = (*cinfo->mem->access_virt_barray)
+      ((j_common_ptr)cinfo, coef_array, blk_y,
+       (JDIMENSION)compptr->v_samp_factor, TRUE);
+    for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+      block = buffer[offset_y];
+      for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) {
+        ptr = block[blk_x];
+        for (k = 0; k < DCTSIZE2; k++)
+          if (qtblptr->quantval[k] != qtblptr1->quantval[k])
+            ptr[k] *= qtblptr->quantval[k] / qtblptr1->quantval[k];
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+requant_comp(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+             jvirt_barray_ptr coef_array, JQUANT_TBL *qtblptr1)
+{
+  JDIMENSION blk_x, blk_y;
+  int offset_y, k;
+  JQUANT_TBL *qtblptr;
+  JBLOCKARRAY buffer;
+  JBLOCKROW block;
+  JCOEFPTR ptr;
+  JCOEF temp, qval;
+
+  qtblptr = compptr->quant_table;
+  for (blk_y = 0; blk_y < compptr->height_in_blocks;
+       blk_y += compptr->v_samp_factor) {
+    buffer = (*cinfo->mem->access_virt_barray)
+      ((j_common_ptr)cinfo, coef_array, blk_y,
+       (JDIMENSION)compptr->v_samp_factor, TRUE);
+    for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+      block = buffer[offset_y];
+      for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) {
+        ptr = block[blk_x];
+        for (k = 0; k < DCTSIZE2; k++) {
+          temp = qtblptr->quantval[k];
+          qval = qtblptr1->quantval[k];
+          if (temp != qval && qval != 0) {
+            temp *= ptr[k];
+            /* The following quantization code is copied from jcdctmgr.c */
+#ifdef FAST_DIVIDE
+#define DIVIDE_BY(a, b)  a /= b
+#else
+#define DIVIDE_BY(a, b)  if (a >= b) a /= b;  else a = 0
+#endif
+            if (temp < 0) {
+              temp = -temp;
+              temp += qval >> 1; /* for rounding */
+              DIVIDE_BY(temp, qval);
+              temp = -temp;
+            } else {
+              temp += qval >> 1; /* for rounding */
+              DIVIDE_BY(temp, qval);
+            }
+            ptr[k] = temp;
+          }
+        }
+      }
+    }
+  }
+}
+
+
+/*
+ * Calculate largest common denominator using Euclid's algorithm.
+ */
+LOCAL(JCOEF)
+largest_common_denominator(JCOEF a, JCOEF b)
+{
+  JCOEF c;
+
+  do {
+    c = a % b;
+    a = b;
+    b = c;
+  } while (c);
+
+  return a;
+}
+
+
+LOCAL(void)
+adjust_quant(j_decompress_ptr srcinfo, jvirt_barray_ptr *src_coef_arrays,
+             j_decompress_ptr dropinfo, jvirt_barray_ptr *drop_coef_arrays,
+             boolean trim, j_compress_ptr dstinfo)
+{
+  jpeg_component_info *compptr1, *compptr2;
+  JQUANT_TBL *qtblptr1, *qtblptr2, *qtblptr3;
+  int ci, k;
+
+  for (ci = 0; ci < dstinfo->num_components && ci < dropinfo->num_components;
+       ci++) {
+    compptr1 = srcinfo->comp_info + ci;
+    compptr2 = dropinfo->comp_info + ci;
+    qtblptr1 = compptr1->quant_table;
+    if (qtblptr1 == NULL)
+      ERREXIT1(srcinfo, JERR_NO_QUANT_TABLE, compptr1->quant_tbl_no);
+    qtblptr2 = compptr2->quant_table;
+    if (qtblptr2 == NULL)
+      ERREXIT1(dropinfo, JERR_NO_QUANT_TABLE, compptr2->quant_tbl_no);
+    for (k = 0; k < DCTSIZE2; k++) {
+      if (qtblptr1->quantval[k] != qtblptr2->quantval[k]) {
+        if (trim)
+          requant_comp(dropinfo, compptr2, drop_coef_arrays[ci], qtblptr1);
+        else {
+          qtblptr3 = dstinfo->quant_tbl_ptrs[compptr1->quant_tbl_no];
+          for (k = 0; k < DCTSIZE2; k++)
+            if (qtblptr1->quantval[k] != qtblptr2->quantval[k])
+              qtblptr3->quantval[k] =
+                largest_common_denominator(qtblptr1->quantval[k],
+                                           qtblptr2->quantval[k]);
+          dequant_comp(srcinfo, compptr1, src_coef_arrays[ci], qtblptr3);
+          dequant_comp(dropinfo, compptr2, drop_coef_arrays[ci], qtblptr3);
+        }
+        break;
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_drop(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+        JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+        jvirt_barray_ptr *src_coef_arrays,
+        j_decompress_ptr dropinfo, jvirt_barray_ptr *drop_coef_arrays,
+        JDIMENSION drop_width, JDIMENSION drop_height)
+/* Drop (insert) the contents of another image into the source image.  If the
+ * number of components in the drop image is smaller than the number of
+ * components in the destination image, then we fill in the remaining
+ * components with zero.  This allows for dropping the contents of grayscale
+ * images into (arbitrarily sampled) color images.
+ */
+{
+  JDIMENSION comp_width, comp_height;
+  JDIMENSION blk_y, x_drop_blocks, y_drop_blocks;
+  int ci, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = drop_width * compptr->h_samp_factor;
+    comp_height = drop_height * compptr->v_samp_factor;
+    x_drop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_drop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (blk_y = 0; blk_y < comp_height; blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], blk_y + y_drop_blocks,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (ci < dropinfo->num_components) {
+        src_buffer = (*dropinfo->mem->access_virt_barray)
+          ((j_common_ptr)dropinfo, drop_coef_arrays[ci], blk_y,
+           (JDIMENSION)compptr->v_samp_factor, FALSE);
+        for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+          jcopy_block_row(src_buffer[offset_y],
+                          dst_buffer[offset_y] + x_drop_blocks, comp_width);
+        }
+      } else {
+        for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+          memset(dst_buffer[offset_y] + x_drop_blocks, 0,
+                 comp_width * sizeof(JBLOCK));
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_crop(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+        JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+        jvirt_barray_ptr *src_coef_arrays,
+        jvirt_barray_ptr *dst_coef_arrays)
+/* Crop.  This is only used when no rotate/flip is requested with the crop. */
+{
+  JDIMENSION dst_blk_y, x_crop_blocks, y_crop_blocks;
+  int ci, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  jpeg_component_info *compptr;
+
+  /* We simply have to copy the right amount of data (the destination's
+   * image size) starting at the given X and Y offsets in the source.
+   */
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      src_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], dst_blk_y + y_crop_blocks,
+         (JDIMENSION)compptr->v_samp_factor, FALSE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
+                        dst_buffer[offset_y], compptr->width_in_blocks);
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_crop_ext_zero(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                 JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+                 jvirt_barray_ptr *src_coef_arrays,
+                 jvirt_barray_ptr *dst_coef_arrays)
+/* Crop.  This is only used when no rotate/flip is requested with the crop.
+ * Extension: If the destination size is larger than the source, we fill in the
+ * expanded region with zero (neutral gray).  Note that we also have to zero
+ * partial iMCUs at the right and bottom edge of the source image area in this
+ * case.
+ */
+{
+  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height;
+  JDIMENSION dst_blk_y, x_crop_blocks, y_crop_blocks;
+  int ci, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  jpeg_component_info *compptr;
+
+  MCU_cols = srcinfo->output_width /
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+  MCU_rows = srcinfo->output_height /
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (dstinfo->_jpeg_height > srcinfo->output_height) {
+        if (dst_blk_y < y_crop_blocks ||
+            dst_blk_y >= y_crop_blocks + comp_height) {
+          for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+            memset(dst_buffer[offset_y], 0,
+                   compptr->width_in_blocks * sizeof(JBLOCK));
+          }
+          continue;
+        }
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y - y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      } else {
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      }
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        if (dstinfo->_jpeg_width > srcinfo->output_width) {
+          if (x_crop_blocks > 0) {
+            memset(dst_buffer[offset_y], 0, x_crop_blocks * sizeof(JBLOCK));
+          }
+          jcopy_block_row(src_buffer[offset_y],
+                          dst_buffer[offset_y] + x_crop_blocks, comp_width);
+          if (compptr->width_in_blocks > x_crop_blocks + comp_width) {
+            memset(dst_buffer[offset_y] + x_crop_blocks + comp_width, 0,
+                   (compptr->width_in_blocks - x_crop_blocks - comp_width) *
+                   sizeof(JBLOCK));
+          }
+        } else {
+          jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
+                          dst_buffer[offset_y], compptr->width_in_blocks);
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_crop_ext_flat(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                 JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+                 jvirt_barray_ptr *src_coef_arrays,
+                 jvirt_barray_ptr *dst_coef_arrays)
+/* Crop.  This is only used when no rotate/flip is requested with the crop.
+ * Extension: The destination width is larger than the source, and we fill in
+ * the expanded region with the DC coefficient of the adjacent block.  Note
+ * that we also have to fill partial iMCUs at the right and bottom edge of the
+ * source image area in this case.
+ */
+{
+  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height;
+  JDIMENSION dst_blk_x, dst_blk_y, x_crop_blocks, y_crop_blocks;
+  int ci, offset_y;
+  JCOEF dc;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  jpeg_component_info *compptr;
+
+  MCU_cols = srcinfo->output_width /
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+  MCU_rows = srcinfo->output_height /
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (dstinfo->_jpeg_height > srcinfo->output_height) {
+        if (dst_blk_y < y_crop_blocks ||
+            dst_blk_y >= y_crop_blocks + comp_height) {
+          for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+            memset(dst_buffer[offset_y], 0,
+                   compptr->width_in_blocks * sizeof(JBLOCK));
+          }
+          continue;
+        }
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y - y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      } else {
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+          FALSE);
+      }
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        if (x_crop_blocks > 0) {
+          memset(dst_buffer[offset_y], 0, x_crop_blocks * sizeof(JBLOCK));
+          dc = src_buffer[offset_y][0][0];
+          for (dst_blk_x = 0; dst_blk_x < x_crop_blocks; dst_blk_x++) {
+            dst_buffer[offset_y][dst_blk_x][0] = dc;
+          }
+        }
+        jcopy_block_row(src_buffer[offset_y],
+                        dst_buffer[offset_y] + x_crop_blocks, comp_width);
+        if (compptr->width_in_blocks > x_crop_blocks + comp_width) {
+          memset(dst_buffer[offset_y] + x_crop_blocks + comp_width, 0,
+                 (compptr->width_in_blocks - x_crop_blocks - comp_width) *
+                 sizeof(JBLOCK));
+          dc = src_buffer[offset_y][comp_width - 1][0];
+          for (dst_blk_x = x_crop_blocks + comp_width;
+               dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
+            dst_buffer[offset_y][dst_blk_x][0] = dc;
+          }
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_crop_ext_reflect(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                    JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+                    jvirt_barray_ptr *src_coef_arrays,
+                    jvirt_barray_ptr *dst_coef_arrays)
+/* Crop.  This is only used when no rotate/flip is requested with the crop.
+ * Extension: The destination width is larger than the source, and we fill in
+ * the expanded region with repeated reflections of the source image.  Note
+ * that we also have to fill partial iMCUs at the right and bottom edge of the
+ * source image area in this case.
+ */
+{
+  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height, src_blk_x;
+  JDIMENSION dst_blk_x, dst_blk_y, x_crop_blocks, y_crop_blocks;
+  int ci, k, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  JBLOCKROW src_row_ptr, dst_row_ptr;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  MCU_cols = srcinfo->output_width /
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+  MCU_rows = srcinfo->output_height /
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (dstinfo->_jpeg_height > srcinfo->output_height) {
+        if (dst_blk_y < y_crop_blocks ||
+            dst_blk_y >= y_crop_blocks + comp_height) {
+          for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+            memset(dst_buffer[offset_y], 0,
+                   compptr->width_in_blocks * sizeof(JBLOCK));
+          }
+          continue;
+        }
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y - y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      } else {
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      }
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        /* Copy source region */
+        jcopy_block_row(src_buffer[offset_y],
+                        dst_buffer[offset_y] + x_crop_blocks, comp_width);
+        if (x_crop_blocks > 0) {
+          /* Reflect to left */
+          dst_row_ptr = dst_buffer[offset_y] + x_crop_blocks;
+          for (dst_blk_x = x_crop_blocks; dst_blk_x > 0;) {
+            src_row_ptr = dst_row_ptr;      /* (re)set axis of reflection */
+            for (src_blk_x = comp_width; src_blk_x > 0 && dst_blk_x > 0;
+                 src_blk_x--, dst_blk_x--) {
+              dst_ptr = *(--dst_row_ptr);   /* destination goes left */
+              src_ptr = *src_row_ptr++;     /* source goes right */
+              /* This unrolled loop doesn't need to know which row it's on. */
+              for (k = 0; k < DCTSIZE2; k += 2) {
+                *dst_ptr++ = *src_ptr++;    /* copy even column */
+                *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign
+                                               change */
+              }
+            }
+          }
+        }
+        if (compptr->width_in_blocks > x_crop_blocks + comp_width) {
+          /* Reflect to right */
+          dst_row_ptr = dst_buffer[offset_y] + x_crop_blocks + comp_width;
+          for (dst_blk_x = compptr->width_in_blocks - x_crop_blocks - comp_width;
+               dst_blk_x > 0;) {
+            src_row_ptr = dst_row_ptr;      /* (re)set axis of reflection */
+            for (src_blk_x = comp_width; src_blk_x > 0 && dst_blk_x > 0;
+                 src_blk_x--, dst_blk_x--) {
+              dst_ptr = *dst_row_ptr++;     /* destination goes right */
+              src_ptr = *(--src_row_ptr);   /* source goes left */
+              /* This unrolled loop doesn't need to know which row it's on. */
+              for (k = 0; k < DCTSIZE2; k += 2) {
+                *dst_ptr++ = *src_ptr++;    /* copy even column */
+                *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign
+                                               change */
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_wipe(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+        JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+        jvirt_barray_ptr *src_coef_arrays,
+        JDIMENSION drop_width, JDIMENSION drop_height)
+/* Wipe - discard image contents of specified region and fill with zero
+ * (neutral gray)
+ */
+{
+  JDIMENSION x_wipe_blocks, wipe_width;
+  JDIMENSION y_wipe_blocks, wipe_bottom;
+  int ci, offset_y;
+  JBLOCKARRAY buffer;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    x_wipe_blocks = x_crop_offset * compptr->h_samp_factor;
+    wipe_width = drop_width * compptr->h_samp_factor;
+    y_wipe_blocks = y_crop_offset * compptr->v_samp_factor;
+    wipe_bottom = drop_height * compptr->v_samp_factor + y_wipe_blocks;
+    for (; y_wipe_blocks < wipe_bottom;
+         y_wipe_blocks += compptr->v_samp_factor) {
+      buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], y_wipe_blocks,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        memset(buffer[offset_y] + x_wipe_blocks, 0,
+               wipe_width * sizeof(JBLOCK));
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_flatten(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+           jvirt_barray_ptr *src_coef_arrays,
+           JDIMENSION drop_width, JDIMENSION drop_height)
+/* Flatten - discard image contents of specified region, similarly to wipe,
+ * but fill with the average of adjacent blocks instead of zero.
+ */
+{
+  JDIMENSION x_wipe_blocks, wipe_width, wipe_right;
+  JDIMENSION y_wipe_blocks, wipe_bottom, blk_x;
+  int ci, offset_y, dc_left_value, dc_right_value, average;
+  JBLOCKARRAY buffer;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    x_wipe_blocks = x_crop_offset * compptr->h_samp_factor;
+    wipe_width = drop_width * compptr->h_samp_factor;
+    wipe_right = wipe_width + x_wipe_blocks;
+    y_wipe_blocks = y_crop_offset * compptr->v_samp_factor;
+    wipe_bottom = drop_height * compptr->v_samp_factor + y_wipe_blocks;
+    for (; y_wipe_blocks < wipe_bottom;
+         y_wipe_blocks += compptr->v_samp_factor) {
+      buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], y_wipe_blocks,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        memset(buffer[offset_y] + x_wipe_blocks, 0,
+               wipe_width * sizeof(JBLOCK));
+        if (x_wipe_blocks > 0) {
+          dc_left_value = buffer[offset_y][x_wipe_blocks - 1][0];
+          if (wipe_right < compptr->width_in_blocks) {
+            dc_right_value = buffer[offset_y][wipe_right][0];
+            average = (dc_left_value + dc_right_value) >> 1;
+          } else {
+            average = dc_left_value;
+          }
+        } else if (wipe_right < compptr->width_in_blocks) {
+          average = buffer[offset_y][wipe_right][0];
+        } else continue;
+        for (blk_x = x_wipe_blocks; blk_x < wipe_right; blk_x++) {
+          buffer[offset_y][blk_x][0] = (JCOEF)average;
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_reflect(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+           JDIMENSION x_crop_offset, jvirt_barray_ptr *src_coef_arrays,
+           JDIMENSION drop_width, JDIMENSION drop_height)
+/* Reflect - discard image contents of specified region, similarly to wipe,
+ * but fill with repeated reflections of the outside region instead of zero.
+ * NB: y_crop_offset is assumed to be zero.
+ */
+{
+  JDIMENSION x_wipe_blocks, wipe_width;
+  JDIMENSION y_wipe_blocks, wipe_bottom;
+  JDIMENSION src_blk_x, dst_blk_x;
+  int ci, k, offset_y;
+  JBLOCKARRAY buffer;
+  JBLOCKROW src_row_ptr, dst_row_ptr;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    x_wipe_blocks = x_crop_offset * compptr->h_samp_factor;
+    wipe_width = drop_width * compptr->h_samp_factor;
+    wipe_bottom = drop_height * compptr->v_samp_factor;
+    for (y_wipe_blocks = 0; y_wipe_blocks < wipe_bottom;
+         y_wipe_blocks += compptr->v_samp_factor) {
+      buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], y_wipe_blocks,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        if (x_wipe_blocks > 0) {
+          /* Reflect from left */
+          dst_row_ptr = buffer[offset_y] + x_wipe_blocks;
+          for (dst_blk_x = wipe_width; dst_blk_x > 0;) {
+            src_row_ptr = dst_row_ptr;     /* (re)set axis of reflection */
+            for (src_blk_x = x_wipe_blocks;
+                 src_blk_x > 0 && dst_blk_x > 0; src_blk_x--, dst_blk_x--) {
+              dst_ptr = *dst_row_ptr++;    /* destination goes right */
+              src_ptr = *(--src_row_ptr);  /* source goes left */
+              /* this unrolled loop doesn't need to know which row it's on... */
+              for (k = 0; k < DCTSIZE2; k += 2) {
+                *dst_ptr++ = *src_ptr++;   /* copy even column */
+                *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign change */
+              }
+            }
+          }
+        } else if (compptr->width_in_blocks > x_wipe_blocks + wipe_width) {
+          /* Reflect from right */
+          dst_row_ptr = buffer[offset_y] + x_wipe_blocks + wipe_width;
+          for (dst_blk_x = wipe_width; dst_blk_x > 0;) {
+            src_row_ptr = dst_row_ptr;     /* (re)set axis of reflection */
+            src_blk_x = compptr->width_in_blocks - x_wipe_blocks - wipe_width;
+            for (; src_blk_x > 0 && dst_blk_x > 0; src_blk_x--, dst_blk_x--) {
+              dst_ptr = *(--dst_row_ptr);  /* destination goes left */
+              src_ptr = *src_row_ptr++;    /* source goes right */
+              /* this unrolled loop doesn't need to know which row it's on... */
+              for (k = 0; k < DCTSIZE2; k += 2) {
+                *dst_ptr++ = *src_ptr++;   /* copy even column */
+                *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign change */
+              }
+            }
+          }
+        } else {
+          memset(buffer[offset_y] + x_wipe_blocks, 0,
+                 wipe_width * sizeof(JBLOCK));
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_flip_h_no_crop(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                  JDIMENSION x_crop_offset, jvirt_barray_ptr *src_coef_arrays)
+/* Horizontal flip; done in-place, so no separate dest array is required.
+ * NB: this only works when y_crop_offset is zero.
+ */
+{
+  JDIMENSION MCU_cols, comp_width, blk_x, blk_y, x_crop_blocks;
+  int ci, k, offset_y;
+  JBLOCKARRAY buffer;
+  JCOEFPTR ptr1, ptr2;
+  JCOEF temp1, temp2;
+  jpeg_component_info *compptr;
+
+  /* Horizontal mirroring of DCT blocks is accomplished by swapping
+   * pairs of blocks in-place.  Within a DCT block, we perform horizontal
+   * mirroring by changing the signs of odd-numbered columns.
+   * Partial iMCUs at the right edge are left untouched.
+   */
+  MCU_cols = srcinfo->output_width /
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    for (blk_y = 0; blk_y < compptr->height_in_blocks;
+         blk_y += compptr->v_samp_factor) {
+      buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        /* Do the mirroring */
+        for (blk_x = 0; blk_x * 2 < comp_width; blk_x++) {
+          ptr1 = buffer[offset_y][blk_x];
+          ptr2 = buffer[offset_y][comp_width - blk_x - 1];
+          /* this unrolled loop doesn't need to know which row it's on... */
+          for (k = 0; k < DCTSIZE2; k += 2) {
+            temp1 = *ptr1;      /* swap even column */
+            temp2 = *ptr2;
+            *ptr1++ = temp2;
+            *ptr2++ = temp1;
+            temp1 = *ptr1;      /* swap odd column with sign change */
+            temp2 = *ptr2;
+            *ptr1++ = -temp2;
+            *ptr2++ = -temp1;
+          }
+        }
+        if (x_crop_blocks > 0) {
+          /* Now left-justify the portion of the data to be kept.
+           * We can't use a single jcopy_block_row() call because that routine
+           * depends on memcpy(), whose behavior is unspecified for overlapping
+           * source and destination areas.  Sigh.
+           */
+          for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) {
+            jcopy_block_row(buffer[offset_y] + blk_x + x_crop_blocks,
+                            buffer[offset_y] + blk_x, (JDIMENSION)1);
+          }
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_flip_h(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+          JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+          jvirt_barray_ptr *src_coef_arrays,
+          jvirt_barray_ptr *dst_coef_arrays)
+/* Horizontal flip in general cropping case */
+{
+  JDIMENSION MCU_cols, comp_width, dst_blk_x, dst_blk_y;
+  JDIMENSION x_crop_blocks, y_crop_blocks;
+  int ci, k, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  JBLOCKROW src_row_ptr, dst_row_ptr;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  /* Here we must output into a separate array because we can't touch
+   * different rows of a single virtual array simultaneously.  Otherwise,
+   * this is essentially the same as the routine above.
+   */
+  MCU_cols = srcinfo->output_width /
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      src_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], dst_blk_y + y_crop_blocks,
+         (JDIMENSION)compptr->v_samp_factor, FALSE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        dst_row_ptr = dst_buffer[offset_y];
+        src_row_ptr = src_buffer[offset_y];
+        for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+             dst_blk_x++) {
+          if (x_crop_blocks + dst_blk_x < comp_width) {
+            /* Do the mirrorable blocks */
+            dst_ptr = dst_row_ptr[dst_blk_x];
+            src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
+            /* this unrolled loop doesn't need to know which row it's on... */
+            for (k = 0; k < DCTSIZE2; k += 2) {
+              *dst_ptr++ = *src_ptr++;    /* copy even column */
+              *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign
+                                             change */
+            }
+          } else {
+            /* Copy last partial block(s) verbatim */
+            jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks,
+                            dst_row_ptr + dst_blk_x, (JDIMENSION)1);
+          }
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_flip_v(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+          JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+          jvirt_barray_ptr *src_coef_arrays,
+          jvirt_barray_ptr *dst_coef_arrays)
+/* Vertical flip */
+{
+  JDIMENSION MCU_rows, comp_height, dst_blk_x, dst_blk_y;
+  JDIMENSION x_crop_blocks, y_crop_blocks;
+  int ci, i, j, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  JBLOCKROW src_row_ptr, dst_row_ptr;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  /* We output into a separate array because we can't touch different
+   * rows of the source virtual array simultaneously.  Otherwise, this
+   * is a pretty straightforward analog of horizontal flip.
+   * Within a DCT block, vertical mirroring is done by changing the signs
+   * of odd-numbered rows.
+   * Partial iMCUs at the bottom edge are copied verbatim.
+   */
+  MCU_rows = srcinfo->output_height /
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (y_crop_blocks + dst_blk_y < comp_height) {
+        /* Row is within the mirrorable area. */
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           comp_height - y_crop_blocks - dst_blk_y -
+           (JDIMENSION)compptr->v_samp_factor,
+           (JDIMENSION)compptr->v_samp_factor, FALSE);
+      } else {
+        /* Bottom-edge blocks will be copied verbatim. */
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks,
+           (JDIMENSION)compptr->v_samp_factor, FALSE);
+      }
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        if (y_crop_blocks + dst_blk_y < comp_height) {
+          /* Row is within the mirrorable area. */
+          dst_row_ptr = dst_buffer[offset_y];
+          src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1];
+          src_row_ptr += x_crop_blocks;
+          for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+               dst_blk_x++) {
+            dst_ptr = dst_row_ptr[dst_blk_x];
+            src_ptr = src_row_ptr[dst_blk_x];
+            for (i = 0; i < DCTSIZE; i += 2) {
+              /* copy even row */
+              for (j = 0; j < DCTSIZE; j++)
+                *dst_ptr++ = *src_ptr++;
+              /* copy odd row with sign change */
+              for (j = 0; j < DCTSIZE; j++)
+                *dst_ptr++ = -(*src_ptr++);
+            }
+          }
+        } else {
+          /* Just copy row verbatim. */
+          jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
+                          dst_buffer[offset_y], compptr->width_in_blocks);
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_transpose(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+             JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+             jvirt_barray_ptr *src_coef_arrays,
+             jvirt_barray_ptr *dst_coef_arrays)
+/* Transpose source into destination */
+{
+  JDIMENSION dst_blk_x, dst_blk_y, x_crop_blocks, y_crop_blocks;
+  int ci, i, j, offset_x, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  /* Transposing pixels within a block just requires transposing the
+   * DCT coefficients.
+   * Partial iMCUs at the edges require no special treatment; we simply
+   * process all the available DCT blocks for every component.
+   */
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+             dst_blk_x += compptr->h_samp_factor) {
+          src_buffer = (*srcinfo->mem->access_virt_barray)
+            ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+             dst_blk_x + x_crop_blocks,
+             (JDIMENSION)compptr->h_samp_factor, FALSE);
+          for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
+            dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
+            src_ptr =
+              src_buffer[offset_x][dst_blk_y + offset_y + y_crop_blocks];
+            for (i = 0; i < DCTSIZE; i++)
+              for (j = 0; j < DCTSIZE; j++)
+                dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
+          }
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_rot_90(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+          JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+          jvirt_barray_ptr *src_coef_arrays,
+          jvirt_barray_ptr *dst_coef_arrays)
+/* 90 degree rotation is equivalent to
+ *   1. Transposing the image;
+ *   2. Horizontal mirroring.
+ * These two steps are merged into a single processing routine.
+ */
+{
+  JDIMENSION MCU_cols, comp_width, dst_blk_x, dst_blk_y;
+  JDIMENSION x_crop_blocks, y_crop_blocks;
+  int ci, i, j, offset_x, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  /* Because of the horizontal mirror step, we can't process partial iMCUs
+   * at the (output) right edge properly.  They just get transposed and
+   * not mirrored.
+   */
+  MCU_cols = srcinfo->output_height /
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+             dst_blk_x += compptr->h_samp_factor) {
+          if (x_crop_blocks + dst_blk_x < comp_width) {
+            /* Block is within the mirrorable area. */
+            src_buffer = (*srcinfo->mem->access_virt_barray)
+              ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+               comp_width - x_crop_blocks - dst_blk_x -
+               (JDIMENSION)compptr->h_samp_factor,
+               (JDIMENSION)compptr->h_samp_factor, FALSE);
+          } else {
+            /* Edge blocks are transposed but not mirrored. */
+            src_buffer = (*srcinfo->mem->access_virt_barray)
+              ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+               dst_blk_x + x_crop_blocks,
+               (JDIMENSION)compptr->h_samp_factor, FALSE);
+          }
+          for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
+            dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
+            if (x_crop_blocks + dst_blk_x < comp_width) {
+              /* Block is within the mirrorable area. */
+              src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1]
+                [dst_blk_y + offset_y + y_crop_blocks];
+              for (i = 0; i < DCTSIZE; i++) {
+                for (j = 0; j < DCTSIZE; j++)
+                  dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
+                i++;
+                for (j = 0; j < DCTSIZE; j++)
+                  dst_ptr[j * DCTSIZE + i] = -src_ptr[i * DCTSIZE + j];
+              }
+            } else {
+              /* Edge blocks are transposed but not mirrored. */
+              src_ptr = src_buffer[offset_x]
+                [dst_blk_y + offset_y + y_crop_blocks];
+              for (i = 0; i < DCTSIZE; i++)
+                for (j = 0; j < DCTSIZE; j++)
+                  dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_rot_270(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+           jvirt_barray_ptr *src_coef_arrays,
+           jvirt_barray_ptr *dst_coef_arrays)
+/* 270 degree rotation is equivalent to
+ *   1. Horizontal mirroring;
+ *   2. Transposing the image.
+ * These two steps are merged into a single processing routine.
+ */
+{
+  JDIMENSION MCU_rows, comp_height, dst_blk_x, dst_blk_y;
+  JDIMENSION x_crop_blocks, y_crop_blocks;
+  int ci, i, j, offset_x, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  /* Because of the horizontal mirror step, we can't process partial iMCUs
+   * at the (output) bottom edge properly.  They just get transposed and
+   * not mirrored.
+   */
+  MCU_rows = srcinfo->output_width /
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+             dst_blk_x += compptr->h_samp_factor) {
+          src_buffer = (*srcinfo->mem->access_virt_barray)
+            ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+             dst_blk_x + x_crop_blocks,
+             (JDIMENSION)compptr->h_samp_factor, FALSE);
+          for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
+            dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
+            if (y_crop_blocks + dst_blk_y < comp_height) {
+              /* Block is within the mirrorable area. */
+              src_ptr = src_buffer[offset_x]
+                [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
+              for (i = 0; i < DCTSIZE; i++) {
+                for (j = 0; j < DCTSIZE; j++) {
+                  dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
+                  j++;
+                  dst_ptr[j * DCTSIZE + i] = -src_ptr[i * DCTSIZE + j];
+                }
+              }
+            } else {
+              /* Edge blocks are transposed but not mirrored. */
+              src_ptr = src_buffer[offset_x]
+                [dst_blk_y + offset_y + y_crop_blocks];
+              for (i = 0; i < DCTSIZE; i++)
+                for (j = 0; j < DCTSIZE; j++)
+                  dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_rot_180(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+           jvirt_barray_ptr *src_coef_arrays,
+           jvirt_barray_ptr *dst_coef_arrays)
+/* 180 degree rotation is equivalent to
+ *   1. Vertical mirroring;
+ *   2. Horizontal mirroring.
+ * These two steps are merged into a single processing routine.
+ */
+{
+  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height, dst_blk_x, dst_blk_y;
+  JDIMENSION x_crop_blocks, y_crop_blocks;
+  int ci, i, j, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  JBLOCKROW src_row_ptr, dst_row_ptr;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  MCU_cols = srcinfo->output_width /
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+  MCU_rows = srcinfo->output_height /
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (y_crop_blocks + dst_blk_y < comp_height) {
+        /* Row is within the vertically mirrorable area. */
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           comp_height - y_crop_blocks - dst_blk_y -
+           (JDIMENSION)compptr->v_samp_factor,
+           (JDIMENSION)compptr->v_samp_factor, FALSE);
+      } else {
+        /* Bottom-edge rows are only mirrored horizontally. */
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks,
+           (JDIMENSION)compptr->v_samp_factor, FALSE);
+      }
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        dst_row_ptr = dst_buffer[offset_y];
+        if (y_crop_blocks + dst_blk_y < comp_height) {
+          /* Row is within the mirrorable area. */
+          src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1];
+          for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+               dst_blk_x++) {
+            dst_ptr = dst_row_ptr[dst_blk_x];
+            if (x_crop_blocks + dst_blk_x < comp_width) {
+              /* Process the blocks that can be mirrored both ways. */
+              src_ptr =
+                src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
+              for (i = 0; i < DCTSIZE; i += 2) {
+                /* For even row, negate every odd column. */
+                for (j = 0; j < DCTSIZE; j += 2) {
+                  *dst_ptr++ = *src_ptr++;
+                  *dst_ptr++ = -(*src_ptr++);
+                }
+                /* For odd row, negate every even column. */
+                for (j = 0; j < DCTSIZE; j += 2) {
+                  *dst_ptr++ = -(*src_ptr++);
+                  *dst_ptr++ = *src_ptr++;
+                }
+              }
+            } else {
+              /* Any remaining right-edge blocks are only mirrored vertically. */
+              src_ptr = src_row_ptr[x_crop_blocks + dst_blk_x];
+              for (i = 0; i < DCTSIZE; i += 2) {
+                for (j = 0; j < DCTSIZE; j++)
+                  *dst_ptr++ = *src_ptr++;
+                for (j = 0; j < DCTSIZE; j++)
+                  *dst_ptr++ = -(*src_ptr++);
+              }
+            }
+          }
+        } else {
+          /* Remaining rows are just mirrored horizontally. */
+          src_row_ptr = src_buffer[offset_y];
+          for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+               dst_blk_x++) {
+            if (x_crop_blocks + dst_blk_x < comp_width) {
+              /* Process the blocks that can be mirrored. */
+              dst_ptr = dst_row_ptr[dst_blk_x];
+              src_ptr =
+                src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
+              for (i = 0; i < DCTSIZE2; i += 2) {
+                *dst_ptr++ = *src_ptr++;
+                *dst_ptr++ = -(*src_ptr++);
+              }
+            } else {
+              /* Any remaining right-edge blocks are only copied. */
+              jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks,
+                              dst_row_ptr + dst_blk_x, (JDIMENSION)1);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_transverse(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+              JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+              jvirt_barray_ptr *src_coef_arrays,
+              jvirt_barray_ptr *dst_coef_arrays)
+/* Transverse transpose is equivalent to
+ *   1. 180 degree rotation;
+ *   2. Transposition;
+ * or
+ *   1. Horizontal mirroring;
+ *   2. Transposition;
+ *   3. Horizontal mirroring.
+ * These steps are merged into a single processing routine.
+ */
+{
+  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height, dst_blk_x, dst_blk_y;
+  JDIMENSION x_crop_blocks, y_crop_blocks;
+  int ci, i, j, offset_x, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  MCU_cols = srcinfo->output_height /
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+  MCU_rows = srcinfo->output_width /
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+             dst_blk_x += compptr->h_samp_factor) {
+          if (x_crop_blocks + dst_blk_x < comp_width) {
+            /* Block is within the mirrorable area. */
+            src_buffer = (*srcinfo->mem->access_virt_barray)
+              ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+               comp_width - x_crop_blocks - dst_blk_x -
+               (JDIMENSION)compptr->h_samp_factor,
+               (JDIMENSION)compptr->h_samp_factor, FALSE);
+          } else {
+            src_buffer = (*srcinfo->mem->access_virt_barray)
+              ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+               dst_blk_x + x_crop_blocks,
+               (JDIMENSION)compptr->h_samp_factor, FALSE);
+          }
+          for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
+            dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
+            if (y_crop_blocks + dst_blk_y < comp_height) {
+              if (x_crop_blocks + dst_blk_x < comp_width) {
+                /* Block is within the mirrorable area. */
+                src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1]
+                  [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
+                for (i = 0; i < DCTSIZE; i++) {
+                  for (j = 0; j < DCTSIZE; j++) {
+                    dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
+                    j++;
+                    dst_ptr[j * DCTSIZE + i] = -src_ptr[i * DCTSIZE + j];
+                  }
+                  i++;
+                  for (j = 0; j < DCTSIZE; j++) {
+                    dst_ptr[j * DCTSIZE + i] = -src_ptr[i * DCTSIZE + j];
+                    j++;
+                    dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
+                  }
+                }
+              } else {
+                /* Right-edge blocks are mirrored in y only */
+                src_ptr = src_buffer[offset_x]
+                  [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
+                for (i = 0; i < DCTSIZE; i++) {
+                  for (j = 0; j < DCTSIZE; j++) {
+                    dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
+                    j++;
+                    dst_ptr[j * DCTSIZE + i] = -src_ptr[i * DCTSIZE + j];
+                  }
+                }
+              }
+            } else {
+              if (x_crop_blocks + dst_blk_x < comp_width) {
+                /* Bottom-edge blocks are mirrored in x only */
+                src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1]
+                  [dst_blk_y + offset_y + y_crop_blocks];
+                for (i = 0; i < DCTSIZE; i++) {
+                  for (j = 0; j < DCTSIZE; j++)
+                    dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
+                  i++;
+                  for (j = 0; j < DCTSIZE; j++)
+                    dst_ptr[j * DCTSIZE + i] = -src_ptr[i * DCTSIZE + j];
+                }
+              } else {
+                /* At lower right corner, just transpose, no mirroring */
+                src_ptr = src_buffer[offset_x]
+                  [dst_blk_y + offset_y + y_crop_blocks];
+                for (i = 0; i < DCTSIZE; i++)
+                  for (j = 0; j < DCTSIZE; j++)
+                    dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+/* Parse an unsigned integer: subroutine for jtransform_parse_crop_spec.
+ * Returns TRUE if valid integer found, FALSE if not.
+ * *strptr is advanced over the digit string, and *result is set to its value.
+ */
+
+LOCAL(boolean)
+jt_read_integer(const char **strptr, JDIMENSION *result)
+{
+  const char *ptr = *strptr;
+  JDIMENSION val = 0;
+
+  for (; isdigit(*ptr); ptr++) {
+    val = val * 10 + (JDIMENSION)(*ptr - '0');
+  }
+  *result = val;
+  if (ptr == *strptr)
+    return FALSE;               /* oops, no digits */
+  *strptr = ptr;
+  return TRUE;
+}
+
+
+/* Parse a crop specification (written in X11 geometry style).
+ * The routine returns TRUE if the spec string is valid, FALSE if not.
+ *
+ * The crop spec string should have the format
+ *      <width>[{fr}]x<height>[{fr}]{+-}<xoffset>{+-}<yoffset>
+ * where width, height, xoffset, and yoffset are unsigned integers.
+ * Each of the elements can be omitted to indicate a default value.
+ * (A weakness of this style is that it is not possible to omit xoffset
+ * while specifying yoffset, since they look alike.)
+ *
+ * This code is loosely based on XParseGeometry from the X11 distribution.
+ */
+
+GLOBAL(boolean)
+jtransform_parse_crop_spec(jpeg_transform_info *info, const char *spec)
+{
+  info->crop = FALSE;
+  info->crop_width_set = JCROP_UNSET;
+  info->crop_height_set = JCROP_UNSET;
+  info->crop_xoffset_set = JCROP_UNSET;
+  info->crop_yoffset_set = JCROP_UNSET;
+
+  if (isdigit(*spec)) {
+    /* fetch width */
+    if (!jt_read_integer(&spec, &info->crop_width))
+      return FALSE;
+    if (*spec == 'f' || *spec == 'F') {
+      spec++;
+      info->crop_width_set = JCROP_FORCE;
+    } else if (*spec == 'r' || *spec == 'R') {
+      spec++;
+      info->crop_width_set = JCROP_REFLECT;
+    } else
+      info->crop_width_set = JCROP_POS;
+  }
+  if (*spec == 'x' || *spec == 'X') {
+    /* fetch height */
+    spec++;
+    if (!jt_read_integer(&spec, &info->crop_height))
+      return FALSE;
+    if (*spec == 'f' || *spec == 'F') {
+      spec++;
+      info->crop_height_set = JCROP_FORCE;
+    } else if (*spec == 'r' || *spec == 'R') {
+      spec++;
+      info->crop_height_set = JCROP_REFLECT;
+    } else
+      info->crop_height_set = JCROP_POS;
+  }
+  if (*spec == '+' || *spec == '-') {
+    /* fetch xoffset */
+    info->crop_xoffset_set = (*spec == '-') ? JCROP_NEG : JCROP_POS;
+    spec++;
+    if (!jt_read_integer(&spec, &info->crop_xoffset))
+      return FALSE;
+  }
+  if (*spec == '+' || *spec == '-') {
+    /* fetch yoffset */
+    info->crop_yoffset_set = (*spec == '-') ? JCROP_NEG : JCROP_POS;
+    spec++;
+    if (!jt_read_integer(&spec, &info->crop_yoffset))
+      return FALSE;
+  }
+  /* We had better have gotten to the end of the string. */
+  if (*spec != '\0')
+    return FALSE;
+  info->crop = TRUE;
+  return TRUE;
+}
+
+
+/* Trim off any partial iMCUs on the indicated destination edge */
+
+LOCAL(void)
+trim_right_edge(jpeg_transform_info *info, JDIMENSION full_width)
+{
+  JDIMENSION MCU_cols;
+
+  MCU_cols = info->output_width / info->iMCU_sample_width;
+  if (MCU_cols > 0 && info->x_crop_offset + MCU_cols ==
+      full_width / info->iMCU_sample_width)
+    info->output_width = MCU_cols * info->iMCU_sample_width;
+}
+
+LOCAL(void)
+trim_bottom_edge(jpeg_transform_info *info, JDIMENSION full_height)
+{
+  JDIMENSION MCU_rows;
+
+  MCU_rows = info->output_height / info->iMCU_sample_height;
+  if (MCU_rows > 0 && info->y_crop_offset + MCU_rows ==
+      full_height / info->iMCU_sample_height)
+    info->output_height = MCU_rows * info->iMCU_sample_height;
+}
+
+
+/* Request any required workspace.
+ *
+ * This routine figures out the size that the output image will be
+ * (which implies that all the transform parameters must be set before
+ * it is called).
+ *
+ * We allocate the workspace virtual arrays from the source decompression
+ * object, so that all the arrays (both the original data and the workspace)
+ * will be taken into account while making memory management decisions.
+ * Hence, this routine must be called after jpeg_read_header (which reads
+ * the image dimensions) and before jpeg_read_coefficients (which realizes
+ * the source's virtual arrays).
+ *
+ * This function returns FALSE right away if -perfect is given
+ * and transformation is not perfect.  Otherwise returns TRUE.
+ */
+
+GLOBAL(boolean)
+jtransform_request_workspace(j_decompress_ptr srcinfo,
+                             jpeg_transform_info *info)
+{
+  jvirt_barray_ptr *coef_arrays;
+  boolean need_workspace, transpose_it;
+  jpeg_component_info *compptr;
+  JDIMENSION xoffset, yoffset, dtemp;
+  JDIMENSION width_in_iMCUs, height_in_iMCUs;
+  JDIMENSION width_in_blocks, height_in_blocks;
+  int itemp, ci, h_samp_factor, v_samp_factor;
+
+  /* Determine number of components in output image */
+  if (info->force_grayscale &&
+      srcinfo->jpeg_color_space == JCS_YCbCr &&
+      srcinfo->num_components == 3)
+    /* We'll only process the first component */
+    info->num_components = 1;
+  else
+    /* Process all the components */
+    info->num_components = srcinfo->num_components;
+
+  /* Compute output image dimensions and related values. */
+#if JPEG_LIB_VERSION >= 80
+  jpeg_core_output_dimensions(srcinfo);
+#else
+  srcinfo->output_width = srcinfo->image_width;
+  srcinfo->output_height = srcinfo->image_height;
+#endif
+
+  /* Return right away if -perfect is given and transformation is not perfect.
+   */
+  if (info->perfect) {
+    if (info->num_components == 1) {
+      if (!jtransform_perfect_transform(srcinfo->output_width,
+          srcinfo->output_height,
+          srcinfo->_min_DCT_h_scaled_size,
+          srcinfo->_min_DCT_v_scaled_size,
+          info->transform))
+        return FALSE;
+    } else {
+      if (!jtransform_perfect_transform(srcinfo->output_width,
+          srcinfo->output_height,
+          srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size,
+          srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size,
+          info->transform))
+        return FALSE;
+    }
+  }
+
+  /* If there is only one output component, force the iMCU size to be 1;
+   * else use the source iMCU size.  (This allows us to do the right thing
+   * when reducing color to grayscale, and also provides a handy way of
+   * cleaning up "funny" grayscale images whose sampling factors are not 1x1.)
+   */
+  switch (info->transform) {
+  case JXFORM_TRANSPOSE:
+  case JXFORM_TRANSVERSE:
+  case JXFORM_ROT_90:
+  case JXFORM_ROT_270:
+    info->output_width = srcinfo->output_height;
+    info->output_height = srcinfo->output_width;
+    if (info->num_components == 1) {
+      info->iMCU_sample_width = srcinfo->_min_DCT_v_scaled_size;
+      info->iMCU_sample_height = srcinfo->_min_DCT_h_scaled_size;
+    } else {
+      info->iMCU_sample_width =
+        srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size;
+      info->iMCU_sample_height =
+        srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size;
+    }
+    break;
+  default:
+    info->output_width = srcinfo->output_width;
+    info->output_height = srcinfo->output_height;
+    if (info->num_components == 1) {
+      info->iMCU_sample_width = srcinfo->_min_DCT_h_scaled_size;
+      info->iMCU_sample_height = srcinfo->_min_DCT_v_scaled_size;
+    } else {
+      info->iMCU_sample_width =
+        srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size;
+      info->iMCU_sample_height =
+        srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size;
+    }
+    break;
+  }
+
+  /* If cropping has been requested, compute the crop area's position and
+   * dimensions, ensuring that its upper left corner falls at an iMCU boundary.
+   */
+  if (info->crop) {
+    /* Insert default values for unset crop parameters */
+    if (info->crop_xoffset_set == JCROP_UNSET)
+      info->crop_xoffset = 0;   /* default to +0 */
+    if (info->crop_yoffset_set == JCROP_UNSET)
+      info->crop_yoffset = 0;   /* default to +0 */
+    if (info->crop_width_set == JCROP_UNSET) {
+      if (info->crop_xoffset >= info->output_width)
+        ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      info->crop_width = info->output_width - info->crop_xoffset;
+    } else {
+      /* Check for crop extension */
+      if (info->crop_width > info->output_width) {
+        /* Crop extension does not work when transforming! */
+        if (info->transform != JXFORM_NONE ||
+            info->crop_xoffset >= info->crop_width ||
+            info->crop_xoffset > info->crop_width - info->output_width)
+          ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      } else {
+        if (info->crop_xoffset >= info->output_width ||
+            info->crop_width <= 0 ||
+            info->crop_xoffset > info->output_width - info->crop_width)
+          ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      }
+    }
+    if (info->crop_height_set == JCROP_UNSET) {
+      if (info->crop_yoffset >= info->output_height)
+        ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      info->crop_height = info->output_height - info->crop_yoffset;
+    } else {
+      /* Check for crop extension */
+      if (info->crop_height > info->output_height) {
+        /* Crop extension does not work when transforming! */
+        if (info->transform != JXFORM_NONE ||
+            info->crop_yoffset >= info->crop_height ||
+            info->crop_yoffset > info->crop_height - info->output_height)
+          ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      } else {
+        if (info->crop_yoffset >= info->output_height ||
+            info->crop_height <= 0 ||
+            info->crop_yoffset > info->output_height - info->crop_height)
+          ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      }
+    }
+    /* Convert negative crop offsets into regular offsets */
+    if (info->crop_xoffset_set != JCROP_NEG)
+      xoffset = info->crop_xoffset;
+    else if (info->crop_width > info->output_width) /* crop extension */
+      xoffset = info->crop_width - info->output_width - info->crop_xoffset;
+    else
+      xoffset = info->output_width - info->crop_width - info->crop_xoffset;
+    if (info->crop_yoffset_set != JCROP_NEG)
+      yoffset = info->crop_yoffset;
+    else if (info->crop_height > info->output_height) /* crop extension */
+      yoffset = info->crop_height - info->output_height - info->crop_yoffset;
+    else
+      yoffset = info->output_height - info->crop_height - info->crop_yoffset;
+    /* Now adjust so that upper left corner falls at an iMCU boundary */
+    switch (info->transform) {
+    case JXFORM_DROP:
+      /* Ensure the effective drop region will not exceed the requested */
+      itemp = info->iMCU_sample_width;
+      dtemp = itemp - 1 - ((xoffset + itemp - 1) % itemp);
+      xoffset += dtemp;
+      if (info->crop_width <= dtemp)
+        info->drop_width = 0;
+      else if (xoffset + info->crop_width - dtemp == info->output_width)
+        /* Matching right edge: include partial iMCU */
+        info->drop_width = (info->crop_width - dtemp + itemp - 1) / itemp;
+      else
+        info->drop_width = (info->crop_width - dtemp) / itemp;
+      itemp = info->iMCU_sample_height;
+      dtemp = itemp - 1 - ((yoffset + itemp - 1) % itemp);
+      yoffset += dtemp;
+      if (info->crop_height <= dtemp)
+        info->drop_height = 0;
+      else if (yoffset + info->crop_height - dtemp == info->output_height)
+        /* Matching bottom edge: include partial iMCU */
+        info->drop_height = (info->crop_height - dtemp + itemp - 1) / itemp;
+      else
+        info->drop_height = (info->crop_height - dtemp) / itemp;
+      /* Check if sampling factors match for dropping */
+      if (info->drop_width != 0 && info->drop_height != 0)
+        for (ci = 0; ci < info->num_components &&
+                     ci < info->drop_ptr->num_components; ci++) {
+          if (info->drop_ptr->comp_info[ci].h_samp_factor *
+              srcinfo->max_h_samp_factor !=
+              srcinfo->comp_info[ci].h_samp_factor *
+              info->drop_ptr->max_h_samp_factor)
+            ERREXIT6(srcinfo, JERR_BAD_DROP_SAMPLING, ci,
+              info->drop_ptr->comp_info[ci].h_samp_factor,
+              info->drop_ptr->max_h_samp_factor,
+              srcinfo->comp_info[ci].h_samp_factor,
+              srcinfo->max_h_samp_factor, 'h');
+          if (info->drop_ptr->comp_info[ci].v_samp_factor *
+              srcinfo->max_v_samp_factor !=
+              srcinfo->comp_info[ci].v_samp_factor *
+              info->drop_ptr->max_v_samp_factor)
+            ERREXIT6(srcinfo, JERR_BAD_DROP_SAMPLING, ci,
+              info->drop_ptr->comp_info[ci].v_samp_factor,
+              info->drop_ptr->max_v_samp_factor,
+              srcinfo->comp_info[ci].v_samp_factor,
+              srcinfo->max_v_samp_factor, 'v');
+        }
+      break;
+    case JXFORM_WIPE:
+      /* Ensure the effective wipe region will cover the requested */
+      info->drop_width = (JDIMENSION)jdiv_round_up
+        ((long)(info->crop_width + (xoffset % info->iMCU_sample_width)),
+         (long)info->iMCU_sample_width);
+      info->drop_height = (JDIMENSION)jdiv_round_up
+        ((long)(info->crop_height + (yoffset % info->iMCU_sample_height)),
+         (long)info->iMCU_sample_height);
+      break;
+    default:
+      /* Ensure the effective crop region will cover the requested */
+      if (info->crop_width_set == JCROP_FORCE ||
+          info->crop_width > info->output_width)
+        info->output_width = info->crop_width;
+      else
+        info->output_width =
+          info->crop_width + (xoffset % info->iMCU_sample_width);
+      if (info->crop_height_set == JCROP_FORCE ||
+          info->crop_height > info->output_height)
+        info->output_height = info->crop_height;
+      else
+        info->output_height =
+          info->crop_height + (yoffset % info->iMCU_sample_height);
+    }
+    /* Save x/y offsets measured in iMCUs */
+    info->x_crop_offset = xoffset / info->iMCU_sample_width;
+    info->y_crop_offset = yoffset / info->iMCU_sample_height;
+  } else {
+    info->x_crop_offset = 0;
+    info->y_crop_offset = 0;
+  }
+
+  /* Figure out whether we need workspace arrays,
+   * and if so whether they are transposed relative to the source.
+   */
+  need_workspace = FALSE;
+  transpose_it = FALSE;
+  switch (info->transform) {
+  case JXFORM_NONE:
+    if (info->x_crop_offset != 0 || info->y_crop_offset != 0 ||
+        info->output_width > srcinfo->output_width ||
+        info->output_height > srcinfo->output_height)
+      need_workspace = TRUE;
+    /* No workspace needed if neither cropping nor transforming */
+    break;
+  case JXFORM_FLIP_H:
+    if (info->trim)
+      trim_right_edge(info, srcinfo->output_width);
+    if (info->y_crop_offset != 0 || info->slow_hflip)
+      need_workspace = TRUE;
+    /* do_flip_h_no_crop doesn't need a workspace array */
+    break;
+  case JXFORM_FLIP_V:
+    if (info->trim)
+      trim_bottom_edge(info, srcinfo->output_height);
+    /* Need workspace arrays having same dimensions as source image. */
+    need_workspace = TRUE;
+    break;
+  case JXFORM_TRANSPOSE:
+    /* transpose does NOT have to trim anything */
+    /* Need workspace arrays having transposed dimensions. */
+    need_workspace = TRUE;
+    transpose_it = TRUE;
+    break;
+  case JXFORM_TRANSVERSE:
+    if (info->trim) {
+      trim_right_edge(info, srcinfo->output_height);
+      trim_bottom_edge(info, srcinfo->output_width);
+    }
+    /* Need workspace arrays having transposed dimensions. */
+    need_workspace = TRUE;
+    transpose_it = TRUE;
+    break;
+  case JXFORM_ROT_90:
+    if (info->trim)
+      trim_right_edge(info, srcinfo->output_height);
+    /* Need workspace arrays having transposed dimensions. */
+    need_workspace = TRUE;
+    transpose_it = TRUE;
+    break;
+  case JXFORM_ROT_180:
+    if (info->trim) {
+      trim_right_edge(info, srcinfo->output_width);
+      trim_bottom_edge(info, srcinfo->output_height);
+    }
+    /* Need workspace arrays having same dimensions as source image. */
+    need_workspace = TRUE;
+    break;
+  case JXFORM_ROT_270:
+    if (info->trim)
+      trim_bottom_edge(info, srcinfo->output_width);
+    /* Need workspace arrays having transposed dimensions. */
+    need_workspace = TRUE;
+    transpose_it = TRUE;
+    break;
+  case JXFORM_WIPE:
+    break;
+  case JXFORM_DROP:
+    break;
+  }
+
+  /* Allocate workspace if needed.
+   * Note that we allocate arrays padded out to the next iMCU boundary,
+   * so that transform routines need not worry about missing edge blocks.
+   */
+  if (need_workspace) {
+    coef_arrays = (jvirt_barray_ptr *)
+      (*srcinfo->mem->alloc_small) ((j_common_ptr)srcinfo, JPOOL_IMAGE,
+                sizeof(jvirt_barray_ptr) * info->num_components);
+    width_in_iMCUs = (JDIMENSION)
+      jdiv_round_up((long)info->output_width, (long)info->iMCU_sample_width);
+    height_in_iMCUs = (JDIMENSION)
+      jdiv_round_up((long)info->output_height, (long)info->iMCU_sample_height);
+    for (ci = 0; ci < info->num_components; ci++) {
+      compptr = srcinfo->comp_info + ci;
+      if (info->num_components == 1) {
+        /* we're going to force samp factors to 1x1 in this case */
+        h_samp_factor = v_samp_factor = 1;
+      } else if (transpose_it) {
+        h_samp_factor = compptr->v_samp_factor;
+        v_samp_factor = compptr->h_samp_factor;
+      } else {
+        h_samp_factor = compptr->h_samp_factor;
+        v_samp_factor = compptr->v_samp_factor;
+      }
+      width_in_blocks = width_in_iMCUs * h_samp_factor;
+      height_in_blocks = height_in_iMCUs * v_samp_factor;
+      coef_arrays[ci] = (*srcinfo->mem->request_virt_barray)
+        ((j_common_ptr)srcinfo, JPOOL_IMAGE, FALSE,
+         width_in_blocks, height_in_blocks, (JDIMENSION)v_samp_factor);
+    }
+    info->workspace_coef_arrays = coef_arrays;
+  } else
+    info->workspace_coef_arrays = NULL;
+
+  return TRUE;
+}
+
+
+/* Transpose destination image parameters */
+
+LOCAL(void)
+transpose_critical_parameters(j_compress_ptr dstinfo)
+{
+  int tblno, i, j, ci, itemp;
+  jpeg_component_info *compptr;
+  JQUANT_TBL *qtblptr;
+  JDIMENSION jtemp;
+  UINT16 qtemp;
+
+  /* Transpose image dimensions */
+  jtemp = dstinfo->image_width;
+  dstinfo->image_width = dstinfo->image_height;
+  dstinfo->image_height = jtemp;
+#if JPEG_LIB_VERSION >= 70
+  itemp = dstinfo->min_DCT_h_scaled_size;
+  dstinfo->min_DCT_h_scaled_size = dstinfo->min_DCT_v_scaled_size;
+  dstinfo->min_DCT_v_scaled_size = itemp;
+#endif
+
+  /* Transpose sampling factors */
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    itemp = compptr->h_samp_factor;
+    compptr->h_samp_factor = compptr->v_samp_factor;
+    compptr->v_samp_factor = itemp;
+  }
+
+  /* Transpose quantization tables */
+  for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) {
+    qtblptr = dstinfo->quant_tbl_ptrs[tblno];
+    if (qtblptr != NULL) {
+      for (i = 0; i < DCTSIZE; i++) {
+        for (j = 0; j < i; j++) {
+          qtemp = qtblptr->quantval[i * DCTSIZE + j];
+          qtblptr->quantval[i * DCTSIZE + j] =
+            qtblptr->quantval[j * DCTSIZE + i];
+          qtblptr->quantval[j * DCTSIZE + i] = qtemp;
+        }
+      }
+    }
+  }
+}
+
+
+/* Adjust Exif image parameters.
+ *
+ * We try to adjust the Tags ExifImageWidth and ExifImageHeight if possible.
+ */
+
+LOCAL(void)
+adjust_exif_parameters(JOCTET *data, unsigned int length, JDIMENSION new_width,
+                       JDIMENSION new_height)
+{
+  boolean is_motorola; /* Flag for byte order */
+  unsigned int number_of_tags, tagnum;
+  unsigned int firstoffset, offset;
+  JDIMENSION new_value;
+
+  if (length < 12) return; /* Length of an IFD entry */
+
+  /* Discover byte order */
+  if (data[0] == 0x49 && data[1] == 0x49)
+    is_motorola = FALSE;
+  else if (data[0] == 0x4D && data[1] == 0x4D)
+    is_motorola = TRUE;
+  else
+    return;
+
+  /* Check Tag Mark */
+  if (is_motorola) {
+    if (data[2] != 0) return;
+    if (data[3] != 0x2A) return;
+  } else {
+    if (data[3] != 0) return;
+    if (data[2] != 0x2A) return;
+  }
+
+  /* Get first IFD offset (offset to IFD0) */
+  if (is_motorola) {
+    if (data[4] != 0) return;
+    if (data[5] != 0) return;
+    firstoffset = data[6];
+    firstoffset <<= 8;
+    firstoffset += data[7];
+  } else {
+    if (data[7] != 0) return;
+    if (data[6] != 0) return;
+    firstoffset = data[5];
+    firstoffset <<= 8;
+    firstoffset += data[4];
+  }
+  if (firstoffset > length - 2) return; /* check end of data segment */
+
+  /* Get the number of directory entries contained in this IFD */
+  if (is_motorola) {
+    number_of_tags = data[firstoffset];
+    number_of_tags <<= 8;
+    number_of_tags += data[firstoffset + 1];
+  } else {
+    number_of_tags = data[firstoffset + 1];
+    number_of_tags <<= 8;
+    number_of_tags += data[firstoffset];
+  }
+  if (number_of_tags == 0) return;
+  firstoffset += 2;
+
+  /* Search for ExifSubIFD offset Tag in IFD0 */
+  for (;;) {
+    if (firstoffset > length - 12) return; /* check end of data segment */
+    /* Get Tag number */
+    if (is_motorola) {
+      tagnum = data[firstoffset];
+      tagnum <<= 8;
+      tagnum += data[firstoffset + 1];
+    } else {
+      tagnum = data[firstoffset + 1];
+      tagnum <<= 8;
+      tagnum += data[firstoffset];
+    }
+    if (tagnum == 0x8769) break; /* found ExifSubIFD offset Tag */
+    if (--number_of_tags == 0) return;
+    firstoffset += 12;
+  }
+
+  /* Get the ExifSubIFD offset */
+  if (is_motorola) {
+    if (data[firstoffset + 8] != 0) return;
+    if (data[firstoffset + 9] != 0) return;
+    offset = data[firstoffset + 10];
+    offset <<= 8;
+    offset += data[firstoffset + 11];
+  } else {
+    if (data[firstoffset + 11] != 0) return;
+    if (data[firstoffset + 10] != 0) return;
+    offset = data[firstoffset + 9];
+    offset <<= 8;
+    offset += data[firstoffset + 8];
+  }
+  if (offset > length - 2) return; /* check end of data segment */
+
+  /* Get the number of directory entries contained in this SubIFD */
+  if (is_motorola) {
+    number_of_tags = data[offset];
+    number_of_tags <<= 8;
+    number_of_tags += data[offset + 1];
+  } else {
+    number_of_tags = data[offset + 1];
+    number_of_tags <<= 8;
+    number_of_tags += data[offset];
+  }
+  if (number_of_tags < 2) return;
+  offset += 2;
+
+  /* Search for ExifImageWidth and ExifImageHeight Tags in this SubIFD */
+  do {
+    if (offset > length - 12) return; /* check end of data segment */
+    /* Get Tag number */
+    if (is_motorola) {
+      tagnum = data[offset];
+      tagnum <<= 8;
+      tagnum += data[offset + 1];
+    } else {
+      tagnum = data[offset + 1];
+      tagnum <<= 8;
+      tagnum += data[offset];
+    }
+    if (tagnum == 0xA002 || tagnum == 0xA003) {
+      if (tagnum == 0xA002)
+        new_value = new_width; /* ExifImageWidth Tag */
+      else
+        new_value = new_height; /* ExifImageHeight Tag */
+      if (is_motorola) {
+        data[offset + 2] = 0; /* Format = unsigned long (4 octets) */
+        data[offset + 3] = 4;
+        data[offset + 4] = 0; /* Number Of Components = 1 */
+        data[offset + 5] = 0;
+        data[offset + 6] = 0;
+        data[offset + 7] = 1;
+        data[offset + 8] = 0;
+        data[offset + 9] = 0;
+        data[offset + 10] = (JOCTET)((new_value >> 8) & 0xFF);
+        data[offset + 11] = (JOCTET)(new_value & 0xFF);
+      } else {
+        data[offset + 2] = 4; /* Format = unsigned long (4 octets) */
+        data[offset + 3] = 0;
+        data[offset + 4] = 1; /* Number Of Components = 1 */
+        data[offset + 5] = 0;
+        data[offset + 6] = 0;
+        data[offset + 7] = 0;
+        data[offset + 8] = (JOCTET)(new_value & 0xFF);
+        data[offset + 9] = (JOCTET)((new_value >> 8) & 0xFF);
+        data[offset + 10] = 0;
+        data[offset + 11] = 0;
+      }
+    }
+    offset += 12;
+  } while (--number_of_tags);
+}
+
+
+/* Adjust output image parameters as needed.
+ *
+ * This must be called after jpeg_copy_critical_parameters()
+ * and before jpeg_write_coefficients().
+ *
+ * The return value is the set of virtual coefficient arrays to be written
+ * (either the ones allocated by jtransform_request_workspace, or the
+ * original source data arrays).  The caller will need to pass this value
+ * to jpeg_write_coefficients().
+ */
+
+GLOBAL(jvirt_barray_ptr *)
+jtransform_adjust_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                             jvirt_barray_ptr *src_coef_arrays,
+                             jpeg_transform_info *info)
+{
+  /* If force-to-grayscale is requested, adjust destination parameters */
+  if (info->force_grayscale) {
+    /* First, ensure we have YCbCr or grayscale data, and that the source's
+     * Y channel is full resolution.  (No reasonable person would make Y
+     * be less than full resolution, so actually coping with that case
+     * isn't worth extra code space.  But we check it to avoid crashing.)
+     */
+    if (((dstinfo->jpeg_color_space == JCS_YCbCr &&
+          dstinfo->num_components == 3) ||
+         (dstinfo->jpeg_color_space == JCS_GRAYSCALE &&
+          dstinfo->num_components == 1)) &&
+        srcinfo->comp_info[0].h_samp_factor == srcinfo->max_h_samp_factor &&
+        srcinfo->comp_info[0].v_samp_factor == srcinfo->max_v_samp_factor) {
+      /* We use jpeg_set_colorspace to make sure subsidiary settings get fixed
+       * properly.  Among other things, it sets the target h_samp_factor &
+       * v_samp_factor to 1, which typically won't match the source.
+       * We have to preserve the source's quantization table number, however.
+       */
+      int sv_quant_tbl_no = dstinfo->comp_info[0].quant_tbl_no;
+      jpeg_set_colorspace(dstinfo, JCS_GRAYSCALE);
+      dstinfo->comp_info[0].quant_tbl_no = sv_quant_tbl_no;
+    } else {
+      /* Sorry, can't do it */
+      ERREXIT(dstinfo, JERR_CONVERSION_NOTIMPL);
+    }
+  } else if (info->num_components == 1) {
+    /* For a single-component source, we force the destination sampling factors
+     * to 1x1, with or without force_grayscale.  This is useful because some
+     * decoders choke on grayscale images with other sampling factors.
+     */
+    dstinfo->comp_info[0].h_samp_factor = 1;
+    dstinfo->comp_info[0].v_samp_factor = 1;
+  }
+
+  /* Correct the destination's image dimensions as necessary
+   * for rotate/flip, resize, and crop operations.
+   */
+#if JPEG_LIB_VERSION >= 80
+  dstinfo->jpeg_width = info->output_width;
+  dstinfo->jpeg_height = info->output_height;
+#endif
+
+  /* Transpose destination image parameters, adjust quantization */
+  switch (info->transform) {
+  case JXFORM_TRANSPOSE:
+  case JXFORM_TRANSVERSE:
+  case JXFORM_ROT_90:
+  case JXFORM_ROT_270:
+#if JPEG_LIB_VERSION < 80
+    dstinfo->image_width = info->output_height;
+    dstinfo->image_height = info->output_width;
+#endif
+    transpose_critical_parameters(dstinfo);
+    break;
+  case JXFORM_DROP:
+    if (info->drop_width != 0 && info->drop_height != 0)
+      adjust_quant(srcinfo, src_coef_arrays,
+                   info->drop_ptr, info->drop_coef_arrays,
+                   info->trim, dstinfo);
+    break;
+  default:
+#if JPEG_LIB_VERSION < 80
+    dstinfo->image_width = info->output_width;
+    dstinfo->image_height = info->output_height;
+#endif
+    break;
+  }
+
+  /* Adjust Exif properties */
+  if (srcinfo->marker_list != NULL &&
+      srcinfo->marker_list->marker == JPEG_APP0 + 1 &&
+      srcinfo->marker_list->data_length >= 6 &&
+      srcinfo->marker_list->data[0] == 0x45 &&
+      srcinfo->marker_list->data[1] == 0x78 &&
+      srcinfo->marker_list->data[2] == 0x69 &&
+      srcinfo->marker_list->data[3] == 0x66 &&
+      srcinfo->marker_list->data[4] == 0 &&
+      srcinfo->marker_list->data[5] == 0) {
+    /* Suppress output of JFIF marker */
+    dstinfo->write_JFIF_header = FALSE;
+    /* Adjust Exif image parameters */
+#if JPEG_LIB_VERSION >= 80
+    if (dstinfo->jpeg_width != srcinfo->image_width ||
+        dstinfo->jpeg_height != srcinfo->image_height)
+      /* Align data segment to start of TIFF structure for parsing */
+      adjust_exif_parameters(srcinfo->marker_list->data + 6,
+                             srcinfo->marker_list->data_length - 6,
+                             dstinfo->jpeg_width, dstinfo->jpeg_height);
+#else
+    if (dstinfo->image_width != srcinfo->image_width ||
+        dstinfo->image_height != srcinfo->image_height)
+      /* Align data segment to start of TIFF structure for parsing */
+      adjust_exif_parameters(srcinfo->marker_list->data + 6,
+                             srcinfo->marker_list->data_length - 6,
+                             dstinfo->image_width, dstinfo->image_height);
+#endif
+  }
+
+  /* Return the appropriate output data set */
+  if (info->workspace_coef_arrays != NULL)
+    return info->workspace_coef_arrays;
+  return src_coef_arrays;
+}
+
+
+/* Execute the actual transformation, if any.
+ *
+ * This must be called *after* jpeg_write_coefficients, because it depends
+ * on jpeg_write_coefficients to have computed subsidiary values such as
+ * the per-component width and height fields in the destination object.
+ *
+ * Note that some transformations will modify the source data arrays!
+ */
+
+GLOBAL(void)
+jtransform_execute_transform(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                             jvirt_barray_ptr *src_coef_arrays,
+                             jpeg_transform_info *info)
+{
+  jvirt_barray_ptr *dst_coef_arrays = info->workspace_coef_arrays;
+
+  /* Note: conditions tested here should match those in switch statement
+   * in jtransform_request_workspace()
+   */
+  switch (info->transform) {
+  case JXFORM_NONE:
+    if (info->output_width > srcinfo->output_width ||
+        info->output_height > srcinfo->output_height) {
+      if (info->output_width > srcinfo->output_width &&
+          info->crop_width_set == JCROP_REFLECT)
+        do_crop_ext_reflect(srcinfo, dstinfo,
+                            info->x_crop_offset, info->y_crop_offset,
+                            src_coef_arrays, dst_coef_arrays);
+      else if (info->output_width > srcinfo->output_width &&
+               info->crop_width_set == JCROP_FORCE)
+        do_crop_ext_flat(srcinfo, dstinfo,
+                         info->x_crop_offset, info->y_crop_offset,
+                         src_coef_arrays, dst_coef_arrays);
+      else
+        do_crop_ext_zero(srcinfo, dstinfo,
+                         info->x_crop_offset, info->y_crop_offset,
+                         src_coef_arrays, dst_coef_arrays);
+    } else if (info->x_crop_offset != 0 || info->y_crop_offset != 0)
+      do_crop(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+              src_coef_arrays, dst_coef_arrays);
+    break;
+  case JXFORM_FLIP_H:
+    if (info->y_crop_offset != 0 || info->slow_hflip)
+      do_flip_h(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+                src_coef_arrays, dst_coef_arrays);
+    else
+      do_flip_h_no_crop(srcinfo, dstinfo, info->x_crop_offset,
+                        src_coef_arrays);
+    break;
+  case JXFORM_FLIP_V:
+    do_flip_v(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+              src_coef_arrays, dst_coef_arrays);
+    break;
+  case JXFORM_TRANSPOSE:
+    do_transpose(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+                 src_coef_arrays, dst_coef_arrays);
+    break;
+  case JXFORM_TRANSVERSE:
+    do_transverse(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+                  src_coef_arrays, dst_coef_arrays);
+    break;
+  case JXFORM_ROT_90:
+    do_rot_90(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+              src_coef_arrays, dst_coef_arrays);
+    break;
+  case JXFORM_ROT_180:
+    do_rot_180(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+               src_coef_arrays, dst_coef_arrays);
+    break;
+  case JXFORM_ROT_270:
+    do_rot_270(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+               src_coef_arrays, dst_coef_arrays);
+    break;
+  case JXFORM_WIPE:
+    if (info->crop_width_set == JCROP_REFLECT &&
+        info->y_crop_offset == 0 && info->drop_height ==
+        (JDIMENSION)jdiv_round_up
+          ((long)info->output_height, (long)info->iMCU_sample_height) &&
+        (info->x_crop_offset == 0 ||
+         info->x_crop_offset + info->drop_width ==
+         (JDIMENSION)jdiv_round_up
+           ((long)info->output_width, (long)info->iMCU_sample_width)))
+      do_reflect(srcinfo, dstinfo, info->x_crop_offset,
+                 src_coef_arrays, info->drop_width, info->drop_height);
+    else if (info->crop_width_set == JCROP_FORCE)
+      do_flatten(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+                 src_coef_arrays, info->drop_width, info->drop_height);
+    else
+      do_wipe(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+              src_coef_arrays, info->drop_width, info->drop_height);
+    break;
+  case JXFORM_DROP:
+    if (info->drop_width != 0 && info->drop_height != 0)
+      do_drop(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+              src_coef_arrays, info->drop_ptr, info->drop_coef_arrays,
+              info->drop_width, info->drop_height);
+    break;
+  }
+}
+
+/* jtransform_perfect_transform
+ *
+ * Determine whether lossless transformation is perfectly
+ * possible for a specified image and transformation.
+ *
+ * Inputs:
+ *   image_width, image_height: source image dimensions.
+ *   MCU_width, MCU_height: pixel dimensions of MCU.
+ *   transform: transformation identifier.
+ * Parameter sources from initialized jpeg_struct
+ * (after reading source header):
+ *   image_width = cinfo.image_width
+ *   image_height = cinfo.image_height
+ *   MCU_width = cinfo.max_h_samp_factor * cinfo.block_size
+ *   MCU_height = cinfo.max_v_samp_factor * cinfo.block_size
+ * Result:
+ *   TRUE = perfect transformation possible
+ *   FALSE = perfect transformation not possible
+ *           (may use custom action then)
+ */
+
+GLOBAL(boolean)
+jtransform_perfect_transform(JDIMENSION image_width, JDIMENSION image_height,
+                             int MCU_width, int MCU_height,
+                             JXFORM_CODE transform)
+{
+  boolean result = TRUE; /* initialize TRUE */
+
+  switch (transform) {
+  case JXFORM_FLIP_H:
+  case JXFORM_ROT_270:
+    if (image_width % (JDIMENSION)MCU_width)
+      result = FALSE;
+    break;
+  case JXFORM_FLIP_V:
+  case JXFORM_ROT_90:
+    if (image_height % (JDIMENSION)MCU_height)
+      result = FALSE;
+    break;
+  case JXFORM_TRANSVERSE:
+  case JXFORM_ROT_180:
+    if (image_width % (JDIMENSION)MCU_width)
+      result = FALSE;
+    if (image_height % (JDIMENSION)MCU_height)
+      result = FALSE;
+    break;
+  default:
+    break;
+  }
+
+  return result;
+}
+
+#endif /* TRANSFORMS_SUPPORTED */
+
+
+/* Setup decompression object to save desired markers in memory.
+ * This must be called before jpeg_read_header() to have the desired effect.
+ */
+
+GLOBAL(void)
+jcopy_markers_setup(j_decompress_ptr srcinfo, JCOPY_OPTION option)
+{
+#ifdef SAVE_MARKERS_SUPPORTED
+  int m;
+
+  /* Save comments except under NONE option */
+  if (option != JCOPYOPT_NONE && option != JCOPYOPT_ICC) {
+    jpeg_save_markers(srcinfo, JPEG_COM, 0xFFFF);
+  }
+  /* Save all types of APPn markers iff ALL option */
+  if (option == JCOPYOPT_ALL || option == JCOPYOPT_ALL_EXCEPT_ICC) {
+    for (m = 0; m < 16; m++) {
+      if (option == JCOPYOPT_ALL_EXCEPT_ICC && m == 2)
+        continue;
+      jpeg_save_markers(srcinfo, JPEG_APP0 + m, 0xFFFF);
+    }
+  }
+  /* Save only APP2 markers if ICC option selected */
+  if (option == JCOPYOPT_ICC) {
+    jpeg_save_markers(srcinfo, JPEG_APP0 + 2, 0xFFFF);
+  }
+#endif /* SAVE_MARKERS_SUPPORTED */
+}
+
+/* Copy markers saved in the given source object to the destination object.
+ * This should be called just after jpeg_start_compress() or
+ * jpeg_write_coefficients().
+ * Note that those routines will have written the SOI, and also the
+ * JFIF APP0 or Adobe APP14 markers if selected.
+ */
+
+GLOBAL(void)
+jcopy_markers_execute(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                      JCOPY_OPTION option)
+{
+  jpeg_saved_marker_ptr marker;
+
+  /* In the current implementation, we don't actually need to examine the
+   * option flag here; we just copy everything that got saved.
+   * But to avoid confusion, we do not output JFIF and Adobe APP14 markers
+   * if the encoder library already wrote one.
+   */
+  for (marker = srcinfo->marker_list; marker != NULL; marker = marker->next) {
+    if (dstinfo->write_JFIF_header &&
+        marker->marker == JPEG_APP0 &&
+        marker->data_length >= 5 &&
+        marker->data[0] == 0x4A &&
+        marker->data[1] == 0x46 &&
+        marker->data[2] == 0x49 &&
+        marker->data[3] == 0x46 &&
+        marker->data[4] == 0)
+      continue;                 /* reject duplicate JFIF */
+    if (dstinfo->write_Adobe_marker &&
+        marker->marker == JPEG_APP0 + 14 &&
+        marker->data_length >= 5 &&
+        marker->data[0] == 0x41 &&
+        marker->data[1] == 0x64 &&
+        marker->data[2] == 0x6F &&
+        marker->data[3] == 0x62 &&
+        marker->data[4] == 0x65)
+      continue;                 /* reject duplicate Adobe */
+    jpeg_write_marker(dstinfo, marker->marker,
+                      marker->data, marker->data_length);
+  }
+}
diff --git a/3rdparty/libjpeg-turbo/src/transupp.h b/3rdparty/libjpeg-turbo/src/transupp.h
new file mode 100644
index 000000000000..cea1f409214a
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/transupp.h
@@ -0,0 +1,231 @@
+/*
+ * transupp.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1997-2019, Thomas G. Lane, Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2017, 2021, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains declarations for image transformation routines and
+ * other utility code used by the jpegtran sample application.  These are
+ * NOT part of the core JPEG library.  But we keep these routines separate
+ * from jpegtran.c to ease the task of maintaining jpegtran-like programs
+ * that have other user interfaces.
+ *
+ * NOTE: all the routines declared here have very specific requirements
+ * about when they are to be executed during the reading and writing of the
+ * source and destination files.  See the comments in transupp.c, or see
+ * jpegtran.c for an example of correct usage.
+ */
+
+/* If you happen not to want the image transform support, disable it here */
+#ifndef TRANSFORMS_SUPPORTED
+#define TRANSFORMS_SUPPORTED  1         /* 0 disables transform code */
+#endif
+
+/*
+ * Although rotating and flipping data expressed as DCT coefficients is not
+ * hard, there is an asymmetry in the JPEG format specification for images
+ * whose dimensions aren't multiples of the iMCU size.  The right and bottom
+ * image edges are padded out to the next iMCU boundary with junk data; but
+ * no padding is possible at the top and left edges.  If we were to flip
+ * the whole image including the pad data, then pad garbage would become
+ * visible at the top and/or left, and real pixels would disappear into the
+ * pad margins --- perhaps permanently, since encoders & decoders may not
+ * bother to preserve DCT blocks that appear to be completely outside the
+ * nominal image area.  So, we have to exclude any partial iMCUs from the
+ * basic transformation.
+ *
+ * Transpose is the only transformation that can handle partial iMCUs at the
+ * right and bottom edges completely cleanly.  flip_h can flip partial iMCUs
+ * at the bottom, but leaves any partial iMCUs at the right edge untouched.
+ * Similarly flip_v leaves any partial iMCUs at the bottom edge untouched.
+ * The other transforms are defined as combinations of these basic transforms
+ * and process edge blocks in a way that preserves the equivalence.
+ *
+ * The "trim" option causes untransformable partial iMCUs to be dropped;
+ * this is not strictly lossless, but it usually gives the best-looking
+ * result for odd-size images.  Note that when this option is active,
+ * the expected mathematical equivalences between the transforms may not hold.
+ * (For example, -rot 270 -trim trims only the bottom edge, but -rot 90 -trim
+ * followed by -rot 180 -trim trims both edges.)
+ *
+ * We also offer a lossless-crop option, which discards data outside a given
+ * image region but losslessly preserves what is inside.  Like the rotate and
+ * flip transforms, lossless crop is restricted by the JPEG format: the upper
+ * left corner of the selected region must fall on an iMCU boundary.  If this
+ * does not hold for the given crop parameters, we silently move the upper left
+ * corner up and/or left to make it so, simultaneously increasing the region
+ * dimensions to keep the lower right crop corner unchanged.  (Thus, the
+ * output image covers at least the requested region, but may cover more.)
+ * The adjustment of the region dimensions may be optionally disabled.
+ *
+ * A complementary lossless wipe option is provided to discard (gray out) data
+ * inside a given image region while losslessly preserving what is outside.
+ * A lossless drop option is also provided, which allows another JPEG image to
+ * be inserted ("dropped") into the source image data at a given position,
+ * replacing the existing image data at that position.  Both the source image
+ * and the drop image must have the same subsampling level.  It is best if they
+ * also have the same quantization (quality.)  Otherwise, the quantization of
+ * the output image will be adapted to accommodate the higher of the source
+ * image quality and the drop image quality.  The trim option can be used with
+ * the drop option to requantize the drop image to match the source image.
+ *
+ * We also provide a lossless-resize option, which is kind of a lossless-crop
+ * operation in the DCT coefficient block domain - it discards higher-order
+ * coefficients and losslessly preserves lower-order coefficients of a
+ * sub-block.
+ *
+ * Rotate/flip transform, resize, and crop can be requested together in a
+ * single invocation.  The crop is applied last --- that is, the crop region
+ * is specified in terms of the destination image after transform/resize.
+ *
+ * We also offer a "force to grayscale" option, which simply discards the
+ * chrominance channels of a YCbCr image.  This is lossless in the sense that
+ * the luminance channel is preserved exactly.  It's not the same kind of
+ * thing as the rotate/flip transformations, but it's convenient to handle it
+ * as part of this package, mainly because the transformation routines have to
+ * be aware of the option to know how many components to work on.
+ */
+
+
+/*
+ * Codes for supported types of image transformations.
+ */
+
+typedef enum {
+  JXFORM_NONE,            /* no transformation */
+  JXFORM_FLIP_H,          /* horizontal flip */
+  JXFORM_FLIP_V,          /* vertical flip */
+  JXFORM_TRANSPOSE,       /* transpose across UL-to-LR axis */
+  JXFORM_TRANSVERSE,      /* transpose across UR-to-LL axis */
+  JXFORM_ROT_90,          /* 90-degree clockwise rotation */
+  JXFORM_ROT_180,         /* 180-degree rotation */
+  JXFORM_ROT_270,         /* 270-degree clockwise (or 90 ccw) */
+  JXFORM_WIPE,            /* wipe */
+  JXFORM_DROP             /* drop */
+} JXFORM_CODE;
+
+/*
+ * Codes for crop parameters, which can individually be unspecified,
+ * positive or negative for xoffset or yoffset,
+ * positive or force or reflect for width or height.
+ */
+
+typedef enum {
+  JCROP_UNSET,
+  JCROP_POS,
+  JCROP_NEG,
+  JCROP_FORCE,
+  JCROP_REFLECT
+} JCROP_CODE;
+
+/*
+ * Transform parameters struct.
+ * NB: application must not change any elements of this struct after
+ * calling jtransform_request_workspace.
+ */
+
+typedef struct {
+  /* Options: set by caller */
+  JXFORM_CODE transform;        /* image transform operator */
+  boolean perfect;              /* if TRUE, fail if partial MCUs are requested */
+  boolean trim;                 /* if TRUE, trim partial MCUs as needed */
+  boolean force_grayscale;      /* if TRUE, convert color image to grayscale */
+  boolean crop;                 /* if TRUE, crop or wipe source image, or drop */
+  boolean slow_hflip;  /* For best performance, the JXFORM_FLIP_H transform
+                          normally modifies the source coefficients in place.
+                          Setting this to TRUE will instead use a slower,
+                          double-buffered algorithm, which leaves the source
+                          coefficients in tact (necessary if other transformed
+                          images must be generated from the same set of
+                          coefficients. */
+
+  /* Crop parameters: application need not set these unless crop is TRUE.
+   * These can be filled in by jtransform_parse_crop_spec().
+   */
+  JDIMENSION crop_width;        /* Width of selected region */
+  JCROP_CODE crop_width_set;    /* (force-disables adjustment) */
+  JDIMENSION crop_height;       /* Height of selected region */
+  JCROP_CODE crop_height_set;   /* (force-disables adjustment) */
+  JDIMENSION crop_xoffset;      /* X offset of selected region */
+  JCROP_CODE crop_xoffset_set;  /* (negative measures from right edge) */
+  JDIMENSION crop_yoffset;      /* Y offset of selected region */
+  JCROP_CODE crop_yoffset_set;  /* (negative measures from bottom edge) */
+
+  /* Drop parameters: set by caller for drop request */
+  j_decompress_ptr drop_ptr;
+  jvirt_barray_ptr *drop_coef_arrays;
+
+  /* Internal workspace: caller should not touch these */
+  int num_components;           /* # of components in workspace */
+  jvirt_barray_ptr *workspace_coef_arrays; /* workspace for transformations */
+  JDIMENSION output_width;      /* cropped destination dimensions */
+  JDIMENSION output_height;
+  JDIMENSION x_crop_offset;     /* destination crop offsets measured in iMCUs */
+  JDIMENSION y_crop_offset;
+  JDIMENSION drop_width;        /* drop/wipe dimensions measured in iMCUs */
+  JDIMENSION drop_height;
+  int iMCU_sample_width;        /* destination iMCU size */
+  int iMCU_sample_height;
+} jpeg_transform_info;
+
+
+#if TRANSFORMS_SUPPORTED
+
+/* Parse a crop specification (written in X11 geometry style) */
+EXTERN(boolean) jtransform_parse_crop_spec(jpeg_transform_info *info,
+                                           const char *spec);
+/* Request any required workspace */
+EXTERN(boolean) jtransform_request_workspace(j_decompress_ptr srcinfo,
+                                             jpeg_transform_info *info);
+/* Adjust output image parameters */
+EXTERN(jvirt_barray_ptr *) jtransform_adjust_parameters
+  (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+   jvirt_barray_ptr *src_coef_arrays, jpeg_transform_info *info);
+/* Execute the actual transformation, if any */
+EXTERN(void) jtransform_execute_transform(j_decompress_ptr srcinfo,
+                                          j_compress_ptr dstinfo,
+                                          jvirt_barray_ptr *src_coef_arrays,
+                                          jpeg_transform_info *info);
+/* Determine whether lossless transformation is perfectly
+ * possible for a specified image and transformation.
+ */
+EXTERN(boolean) jtransform_perfect_transform(JDIMENSION image_width,
+                                             JDIMENSION image_height,
+                                             int MCU_width, int MCU_height,
+                                             JXFORM_CODE transform);
+
+/* jtransform_execute_transform used to be called
+ * jtransform_execute_transformation, but some compilers complain about
+ * routine names that long.  This macro is here to avoid breaking any
+ * old source code that uses the original name...
+ */
+#define jtransform_execute_transformation       jtransform_execute_transform
+
+#endif /* TRANSFORMS_SUPPORTED */
+
+
+/*
+ * Support for copying optional markers from source to destination file.
+ */
+
+typedef enum {
+  JCOPYOPT_NONE,           /* copy no optional markers */
+  JCOPYOPT_COMMENTS,       /* copy only comment (COM) markers */
+  JCOPYOPT_ALL,            /* copy all optional markers */
+  JCOPYOPT_ALL_EXCEPT_ICC, /* copy all optional markers except APP2 */
+  JCOPYOPT_ICC             /* copy only ICC profile (APP2) markers */
+} JCOPY_OPTION;
+
+#define JCOPYOPT_DEFAULT  JCOPYOPT_COMMENTS     /* recommended default */
+
+/* Setup decompression object to save desired markers in memory */
+EXTERN(void) jcopy_markers_setup(j_decompress_ptr srcinfo,
+                                 JCOPY_OPTION option);
+/* Copy markers saved in the given source object to the destination object */
+EXTERN(void) jcopy_markers_execute(j_decompress_ptr srcinfo,
+                                   j_compress_ptr dstinfo,
+                                   JCOPY_OPTION option);
diff --git a/3rdparty/libjpeg-turbo/src/turbojpeg-jni.c b/3rdparty/libjpeg-turbo/src/turbojpeg-jni.c
new file mode 100644
index 000000000000..32186f3fa0ad
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/turbojpeg-jni.c
@@ -0,0 +1,1400 @@
+/*
+ * Copyright (C)2011-2023 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <limits.h>
+#include "turbojpeg.h"
+#include "jinclude.h"
+#include <jni.h>
+#include "java/org_libjpegturbo_turbojpeg_TJCompressor.h"
+#include "java/org_libjpegturbo_turbojpeg_TJDecompressor.h"
+#include "java/org_libjpegturbo_turbojpeg_TJTransformer.h"
+#include "java/org_libjpegturbo_turbojpeg_TJ.h"
+
+#define BAILIF0(f) { \
+  if (!(f) || (*env)->ExceptionCheck(env)) { \
+    goto bailout; \
+  } \
+}
+
+#define BAILIF0NOEC(f) { \
+  if (!(f)) { \
+    goto bailout; \
+  } \
+}
+
+#define THROW(msg, exceptionClass) { \
+  jclass _exccls = (*env)->FindClass(env, exceptionClass); \
+  \
+  BAILIF0(_exccls); \
+  (*env)->ThrowNew(env, _exccls, msg); \
+  goto bailout; \
+}
+
+#define THROW_TJ() { \
+  jclass _exccls; \
+  jmethodID _excid; \
+  jobject _excobj; \
+  jstring _errstr; \
+  \
+  BAILIF0(_errstr = (*env)->NewStringUTF(env, tj3GetErrorStr(handle))); \
+  BAILIF0(_exccls = (*env)->FindClass(env, \
+    "org/libjpegturbo/turbojpeg/TJException")); \
+  BAILIF0(_excid = (*env)->GetMethodID(env, _exccls, "<init>", \
+                                       "(Ljava/lang/String;I)V")); \
+  BAILIF0(_excobj = (*env)->NewObject(env, _exccls, _excid, _errstr, \
+                                      tj3GetErrorCode(handle))); \
+  (*env)->Throw(env, _excobj); \
+  goto bailout; \
+}
+
+#define THROW_ARG(msg)  THROW(msg, "java/lang/IllegalArgumentException")
+
+#define THROW_MEM() \
+  THROW("Memory allocation failure", "java/lang/OutOfMemoryError");
+
+#define GET_HANDLE() \
+  jclass _cls = (*env)->GetObjectClass(env, obj); \
+  jfieldID _fid; \
+  \
+  BAILIF0(_cls); \
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "handle", "J")); \
+  handle = (tjhandle)(size_t)(*env)->GetLongField(env, obj, _fid);
+
+#define SAFE_RELEASE(javaArray, cArray) { \
+  if (javaArray && cArray) \
+    (*env)->ReleasePrimitiveArrayCritical(env, javaArray, (void *)cArray, 0); \
+  cArray = NULL; \
+}
+
+/* TurboJPEG 1.2.x: TJ::bufSize() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSize
+  (JNIEnv *env, jclass cls, jint width, jint height, jint jpegSubsamp)
+{
+  size_t retval = tj3JPEGBufSize(width, height, jpegSubsamp);
+
+  if (retval == 0) THROW_ARG(tj3GetErrorStr(NULL));
+  if (retval > (size_t)INT_MAX)
+    THROW_ARG("Image is too large");
+
+bailout:
+  return (jint)retval;
+}
+
+/* TurboJPEG 1.4.x: TJ::bufSizeYUV() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII
+  (JNIEnv *env, jclass cls, jint width, jint align, jint height, jint subsamp)
+{
+  size_t retval = tj3YUVBufSize(width, align, height, subsamp);
+
+  if (retval == 0) THROW_ARG(tj3GetErrorStr(NULL));
+  if (retval > (size_t)INT_MAX)
+    THROW_ARG("Image is too large");
+
+bailout:
+  return (jint)retval;
+}
+
+/* TurboJPEG 1.4.x: TJ::planeSizeYUV() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeSizeYUV__IIIII
+  (JNIEnv *env, jclass cls, jint componentID, jint width, jint stride,
+   jint height, jint subsamp)
+{
+  size_t retval = tj3YUVPlaneSize(componentID, width, stride, height, subsamp);
+
+  if (retval == 0) THROW_ARG(tj3GetErrorStr(NULL));
+  if (retval > (size_t)INT_MAX)
+    THROW_ARG("Image is too large");
+
+bailout:
+  return (jint)retval;
+}
+
+/* TurboJPEG 1.4.x: TJ::planeWidth() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeWidth__III
+  (JNIEnv *env, jclass cls, jint componentID, jint width, jint subsamp)
+{
+  jint retval = (jint)tj3YUVPlaneWidth(componentID, width, subsamp);
+
+  if (retval == 0) THROW_ARG(tj3GetErrorStr(NULL));
+
+bailout:
+  return retval;
+}
+
+/* TurboJPEG 1.4.x: TJ::planeHeight() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeHeight__III
+  (JNIEnv *env, jclass cls, jint componentID, jint height, jint subsamp)
+{
+  jint retval = (jint)tj3YUVPlaneHeight(componentID, height, subsamp);
+
+  if (retval == 0) THROW_ARG(tj3GetErrorStr(NULL));
+
+bailout:
+  return retval;
+}
+
+/* TurboJPEG 1.2.x: TJCompressor::init() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_init
+  (JNIEnv *env, jobject obj)
+{
+  jclass cls;
+  jfieldID fid;
+  tjhandle handle;
+
+  if ((handle = tj3Init(TJINIT_COMPRESS)) == NULL)
+    THROW(tj3GetErrorStr(NULL), "org/libjpegturbo/turbojpeg/TJException");
+
+  BAILIF0(cls = (*env)->GetObjectClass(env, obj));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  (*env)->SetLongField(env, obj, fid, (size_t)handle);
+
+bailout:
+  return;
+}
+
+/* TurboJPEG 3: TJCompressor::set() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_set
+  (JNIEnv *env, jobject obj, jint param, jint value)
+{
+  tjhandle handle = 0;
+
+  GET_HANDLE();
+
+  if (tj3Set(handle, param, value) == -1)
+    THROW_TJ();
+
+bailout:
+  return;
+}
+
+/* TurboJPEG 3: TJCompressor::get() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_get
+  (JNIEnv *env, jobject obj, jint param)
+{
+  tjhandle handle = 0;
+
+  GET_HANDLE();
+
+  return tj3Get(handle, param);
+
+bailout:
+  return -1;
+}
+
+static jint TJCompressor_compress
+  (JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint precision,
+   jint x, jint y, jint width, jint pitch, jint height, jint pf,
+   jbyteArray dst)
+{
+  tjhandle handle = 0;
+  size_t jpegSize = 0;
+  jsize arraySize = 0, actualPitch;
+  void *srcBuf = NULL;
+  unsigned char *jpegBuf = NULL;
+  int jpegSubsamp;
+
+  GET_HANDLE();
+
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
+      height < 1 || pitch < 0)
+    THROW_ARG("Invalid argument in compress*()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF)
+    THROW_ARG("Mismatch between Java and C API");
+
+  actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
+  arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
+  if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
+    THROW_ARG("Source buffer is not large enough");
+  jpegSubsamp = tj3Get(handle, TJPARAM_SUBSAMP);
+  if (tj3Get(handle, TJPARAM_LOSSLESS) && jpegSubsamp != TJSAMP_GRAY)
+    jpegSubsamp = TJSAMP_444;
+  else if (jpegSubsamp == TJSAMP_UNKNOWN)
+    THROW_ARG("TJPARAM_SUBSAMP must be specified");
+  jpegSize = tj3JPEGBufSize(width, height, jpegSubsamp);
+  if ((*env)->GetArrayLength(env, dst) < (jsize)jpegSize)
+    THROW_ARG("Destination buffer is not large enough");
+
+  if (tj3Set(handle, TJPARAM_NOREALLOC, 1) == -1)
+    THROW_TJ();
+
+  BAILIF0NOEC(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0NOEC(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+  if (precision == 8) {
+    if (tj3Compress8(handle, &((unsigned char *)srcBuf)[y * actualPitch +
+                                                        x * tjPixelSize[pf]],
+                     width, pitch, height, pf, &jpegBuf, &jpegSize) == -1) {
+      SAFE_RELEASE(dst, jpegBuf);
+      SAFE_RELEASE(src, srcBuf);
+      THROW_TJ();
+    }
+  } else if (precision == 12) {
+    if (tj3Compress12(handle, &((short *)srcBuf)[y * actualPitch +
+                                                 x * tjPixelSize[pf]],
+                      width, pitch, height, pf, &jpegBuf, &jpegSize) == -1) {
+      SAFE_RELEASE(dst, jpegBuf);
+      SAFE_RELEASE(src, srcBuf);
+      THROW_TJ();
+    }
+  } else {
+    if (tj3Compress16(handle, &((unsigned short *)srcBuf)[y * actualPitch +
+                                                          x * tjPixelSize[pf]],
+                      width, pitch, height, pf, &jpegBuf, &jpegSize) == -1) {
+      SAFE_RELEASE(dst, jpegBuf);
+      SAFE_RELEASE(src, srcBuf);
+      THROW_TJ();
+    }
+  }
+
+bailout:
+  SAFE_RELEASE(dst, jpegBuf);
+  SAFE_RELEASE(src, srcBuf);
+  return (jint)jpegSize;
+}
+
+/* TurboJPEG 3: TJCompressor::compress8() byte source */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress8___3BIIIIII_3B
+  (JNIEnv *env, jobject obj, jbyteArray src, jint x, jint y, jint width,
+   jint pitch, jint height, jint pf, jbyteArray dst)
+{
+  return TJCompressor_compress(env, obj, src, 1, 8, x, y, width, pitch, height,
+                               pf, dst);
+}
+
+/* TurboJPEG 3: TJCompressor::compress12() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress12
+  (JNIEnv *env, jobject obj, jshortArray src, jint x, jint y, jint width,
+   jint pitch, jint height, jint pf, jbyteArray dst)
+{
+  return TJCompressor_compress(env, obj, src, 1, 12, x, y, width, pitch,
+                               height, pf, dst);
+}
+
+/* TurboJPEG 3: TJCompressor::compress16() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress16
+  (JNIEnv *env, jobject obj, jshortArray src, jint x, jint y, jint width,
+   jint pitch, jint height, jint pf, jbyteArray dst)
+{
+  return TJCompressor_compress(env, obj, src, 1, 16, x, y, width, pitch,
+                               height, pf, dst);
+}
+
+/* TurboJPEG 3: TJCompressor::compress8() int source */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress8___3IIIIIII_3B
+  (JNIEnv *env, jobject obj, jintArray src, jint x, jint y, jint width,
+   jint stride, jint height, jint pf, jbyteArray dst)
+{
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in compress8()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    THROW_ARG("Pixel format must be 32-bit when compressing from an integer buffer.");
+
+  return TJCompressor_compress(env, obj, src, sizeof(jint), 8, x, y, width,
+                               stride * sizeof(jint), height, pf, dst);
+
+bailout:
+  return 0;
+}
+
+/* TurboJPEG 3: TJCompressor::compressFromYUV8() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV8
+  (JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+   jint width, jintArray jSrcStrides, jint height, jbyteArray dst)
+{
+  tjhandle handle = 0;
+  size_t jpegSize = 0;
+  jbyteArray jSrcPlanes[3] = { NULL, NULL, NULL };
+  const unsigned char *srcPlanesTmp[3] = { NULL, NULL, NULL };
+  const unsigned char *srcPlanes[3] = { NULL, NULL, NULL };
+  jint srcOffsetsTmp[3] = { 0, 0, 0 }, srcStridesTmp[3] = { 0, 0, 0 };
+  int srcOffsets[3] = { 0, 0, 0 }, srcStrides[3] = { 0, 0, 0 };
+  unsigned char *jpegBuf = NULL;
+  int nc = 0, i, subsamp;
+
+  GET_HANDLE();
+
+  if (org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
+    THROW_ARG("Mismatch between Java and C API");
+
+  if ((subsamp = tj3Get(handle, TJPARAM_SUBSAMP)) == TJSAMP_UNKNOWN)
+    THROW_ARG("TJPARAM_SUBSAMP must be specified");
+  nc = subsamp == TJSAMP_GRAY ? 1 : 3;
+  if ((*env)->GetArrayLength(env, srcobjs) < nc)
+    THROW_ARG("Planes array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jSrcOffsets) < nc)
+    THROW_ARG("Offsets array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jSrcStrides) < nc)
+    THROW_ARG("Strides array is too small for the subsampling type");
+
+  jpegSize = tj3JPEGBufSize(width, height, subsamp);
+  if ((*env)->GetArrayLength(env, dst) < (jsize)jpegSize)
+    THROW_ARG("Destination buffer is not large enough");
+
+  if (tj3Set(handle, TJPARAM_NOREALLOC, 1) == -1)
+    THROW_TJ();
+
+  (*env)->GetIntArrayRegion(env, jSrcOffsets, 0, nc, srcOffsetsTmp);
+  if ((*env)->ExceptionCheck(env)) goto bailout;
+  for (i = 0; i < 3; i++)
+    srcOffsets[i] = srcOffsetsTmp[i];
+
+  (*env)->GetIntArrayRegion(env, jSrcStrides, 0, nc, srcStridesTmp);
+  if ((*env)->ExceptionCheck(env)) goto bailout;
+  for (i = 0; i < 3; i++)
+    srcStrides[i] = srcStridesTmp[i];
+
+  for (i = 0; i < nc; i++) {
+    size_t planeSize = tj3YUVPlaneSize(i, width, srcStrides[i], height,
+                                       subsamp);
+    int pw = tj3YUVPlaneWidth(i, width, subsamp);
+
+    if (planeSize == 0 || pw == 0)
+      THROW_ARG(tj3GetErrorStr(NULL));
+
+    if (planeSize > (size_t)INT_MAX)
+      THROW_ARG("Source plane is too large");
+    if (srcOffsets[i] < 0)
+      THROW_ARG("Invalid argument in compressFromYUV8()");
+    if (srcStrides[i] < 0 && srcOffsets[i] - (int)planeSize + pw < 0)
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+
+    BAILIF0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
+    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) <
+        srcOffsets[i] + (int)planeSize)
+      THROW_ARG("Source plane is not large enough");
+  }
+  for (i = 0; i < nc; i++) {
+    BAILIF0NOEC(srcPlanesTmp[i] =
+                (*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i], 0));
+    srcPlanes[i] = &srcPlanesTmp[i][srcOffsets[i]];
+  }
+  BAILIF0NOEC(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+  if (tj3CompressFromYUVPlanes8(handle, srcPlanes, width, srcStrides, height,
+                                &jpegBuf, &jpegSize) == -1) {
+    SAFE_RELEASE(dst, jpegBuf);
+    for (i = 0; i < nc; i++)
+      SAFE_RELEASE(jSrcPlanes[i], srcPlanesTmp[i]);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(dst, jpegBuf);
+  for (i = 0; i < nc; i++)
+    SAFE_RELEASE(jSrcPlanes[i], srcPlanesTmp[i]);
+  return (jint)jpegSize;
+}
+
+static void TJCompressor_encodeYUV8
+  (JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint x, jint y,
+   jint width, jint pitch, jint height, jint pf, jobjectArray dstobjs,
+   jintArray jDstOffsets, jintArray jDstStrides)
+{
+  tjhandle handle = 0;
+  jsize arraySize = 0, actualPitch;
+  unsigned char *srcBuf = NULL;
+  jbyteArray jDstPlanes[3] = { NULL, NULL, NULL };
+  unsigned char *dstPlanesTmp[3] = { NULL, NULL, NULL };
+  unsigned char *dstPlanes[3] = { NULL, NULL, NULL };
+  jint dstOffsetsTmp[3] = { 0, 0, 0 }, dstStridesTmp[3] = { 0, 0, 0 };
+  int dstOffsets[3] = { 0, 0, 0 }, dstStrides[3] = { 0, 0, 0 };
+  int nc = 0, i, subsamp;
+
+  GET_HANDLE();
+
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
+      height < 1 || pitch < 0)
+    THROW_ARG("Invalid argument in encodeYUV8()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF ||
+      org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
+    THROW_ARG("Mismatch between Java and C API");
+
+  if ((subsamp = tj3Get(handle, TJPARAM_SUBSAMP)) == TJSAMP_UNKNOWN)
+    THROW_ARG("TJPARAM_SUBSAMP must be specified");
+  nc = subsamp == TJSAMP_GRAY ? 1 : 3;
+  if ((*env)->GetArrayLength(env, dstobjs) < nc)
+    THROW_ARG("Planes array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jDstOffsets) < nc)
+    THROW_ARG("Offsets array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jDstStrides) < nc)
+    THROW_ARG("Strides array is too small for the subsampling type");
+
+  actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
+  arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
+  if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
+    THROW_ARG("Source buffer is not large enough");
+
+  (*env)->GetIntArrayRegion(env, jDstOffsets, 0, nc, dstOffsetsTmp);
+  if ((*env)->ExceptionCheck(env)) goto bailout;
+  for (i = 0; i < 3; i++)
+    dstOffsets[i] = dstOffsetsTmp[i];
+
+  (*env)->GetIntArrayRegion(env, jDstStrides, 0, nc, dstStridesTmp);
+  if ((*env)->ExceptionCheck(env)) goto bailout;
+  for (i = 0; i < 3; i++)
+    dstStrides[i] = dstStridesTmp[i];
+
+  for (i = 0; i < nc; i++) {
+    size_t planeSize = tj3YUVPlaneSize(i, width, dstStrides[i], height,
+                                       subsamp);
+    int pw = tj3YUVPlaneWidth(i, width, subsamp);
+
+    if (planeSize == 0 || pw == 0)
+      THROW_ARG(tj3GetErrorStr(NULL));
+
+    if (planeSize > (size_t)INT_MAX)
+      THROW_ARG("Destination plane is too large");
+    if (dstOffsets[i] < 0)
+      THROW_ARG("Invalid argument in encodeYUV8()");
+    if (dstStrides[i] < 0 && dstOffsets[i] - (int)planeSize + pw < 0)
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+
+    BAILIF0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((*env)->GetArrayLength(env, jDstPlanes[i]) <
+        dstOffsets[i] + (int)planeSize)
+      THROW_ARG("Destination plane is not large enough");
+  }
+  for (i = 0; i < nc; i++) {
+    BAILIF0NOEC(dstPlanesTmp[i] =
+                (*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i], 0));
+    dstPlanes[i] = &dstPlanesTmp[i][dstOffsets[i]];
+  }
+  BAILIF0NOEC(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+
+  if (tj3EncodeYUVPlanes8(handle,
+                          &srcBuf[y * actualPitch + x * tjPixelSize[pf]],
+                          width, pitch, height, pf, dstPlanes,
+                          dstStrides) == -1) {
+    SAFE_RELEASE(src, srcBuf);
+    for (i = 0; i < nc; i++)
+      SAFE_RELEASE(jDstPlanes[i], dstPlanesTmp[i]);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(src, srcBuf);
+  for (i = 0; i < nc; i++)
+    SAFE_RELEASE(jDstPlanes[i], dstPlanesTmp[i]);
+}
+
+/* TurboJPEG 3: TJCompressor::encodeYUV8() byte source */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV8___3BIIIIII_3_3B_3I_3I
+  (JNIEnv *env, jobject obj, jbyteArray src, jint x, jint y, jint width,
+   jint pitch, jint height, jint pf, jobjectArray dstobjs,
+   jintArray jDstOffsets, jintArray jDstStrides)
+{
+  TJCompressor_encodeYUV8(env, obj, src, 1, x, y, width, pitch, height, pf,
+                          dstobjs, jDstOffsets, jDstStrides);
+}
+
+/* TurboJPEG 3: TJCompressor::encodeYUV8() int source */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV8___3IIIIIII_3_3B_3I_3I
+  (JNIEnv *env, jobject obj, jintArray src, jint x, jint y, jint width,
+   jint stride, jint height, jint pf, jobjectArray dstobjs,
+   jintArray jDstOffsets, jintArray jDstStrides)
+{
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in encodeYUV8()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    THROW_ARG("Pixel format must be 32-bit when encoding from an integer buffer.");
+
+  TJCompressor_encodeYUV8(env, obj, src, sizeof(jint), x, y, width,
+                          stride * sizeof(jint), height, pf, dstobjs,
+                          jDstOffsets, jDstStrides);
+
+bailout:
+  return;
+}
+
+/* TurboJPEG 1.2.x: TJCompressor::destroy() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy
+  (JNIEnv *env, jobject obj)
+{
+  tjhandle handle = 0;
+
+  GET_HANDLE();
+
+  tj3Destroy(handle);
+  (*env)->SetLongField(env, obj, _fid, 0);
+
+bailout:
+  return;
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::init() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_init
+  (JNIEnv *env, jobject obj)
+{
+  jclass cls;
+  jfieldID fid;
+  tjhandle handle;
+
+  if ((handle = tj3Init(TJINIT_DECOMPRESS)) == NULL)
+    THROW(tj3GetErrorStr(NULL), "org/libjpegturbo/turbojpeg/TJException");
+
+  BAILIF0(cls = (*env)->GetObjectClass(env, obj));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  (*env)->SetLongField(env, obj, fid, (size_t)handle);
+
+bailout:
+  return;
+}
+
+/* TurboJPEG 3: TJDecompressor::set() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_set
+  (JNIEnv *env, jobject obj, jint param, jint value)
+{
+  Java_org_libjpegturbo_turbojpeg_TJCompressor_set(env, obj, param, value);
+}
+
+/* TurboJPEG 3: TJDecompressor::get() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_get
+  (JNIEnv *env, jobject obj, jint param)
+{
+  return Java_org_libjpegturbo_turbojpeg_TJCompressor_get(env, obj, param);
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::getScalingFactors() */
+JNIEXPORT jobjectArray JNICALL Java_org_libjpegturbo_turbojpeg_TJ_getScalingFactors
+  (JNIEnv *env, jclass cls)
+{
+  jclass sfcls = NULL;
+  jfieldID fid = 0;
+  tjscalingfactor *sf = NULL;
+  int n = 0, i;
+  jobject sfobj = NULL;
+  jobjectArray sfjava = NULL;
+
+  if ((sf = tj3GetScalingFactors(&n)) == NULL || n == 0)
+    THROW_ARG(tj3GetErrorStr(NULL));
+
+  BAILIF0(sfcls = (*env)->FindClass(env,
+    "org/libjpegturbo/turbojpeg/TJScalingFactor"));
+  BAILIF0(sfjava = (jobjectArray)(*env)->NewObjectArray(env, n, sfcls, 0));
+
+  for (i = 0; i < n; i++) {
+    BAILIF0(sfobj = (*env)->AllocObject(env, sfcls));
+    BAILIF0(fid = (*env)->GetFieldID(env, sfcls, "num", "I"));
+    (*env)->SetIntField(env, sfobj, fid, sf[i].num);
+    BAILIF0(fid = (*env)->GetFieldID(env, sfcls, "denom", "I"));
+    (*env)->SetIntField(env, sfobj, fid, sf[i].denom);
+    (*env)->SetObjectArrayElement(env, sfjava, i, sfobj);
+  }
+
+bailout:
+  return sfjava;
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::decompressHeader() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize)
+{
+  tjhandle handle = 0;
+  unsigned char *jpegBuf = NULL;
+
+  GET_HANDLE();
+
+  if ((*env)->GetArrayLength(env, src) < jpegSize)
+    THROW_ARG("Source buffer is not large enough");
+
+  BAILIF0NOEC(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+
+  if (tj3DecompressHeader(handle, jpegBuf, (size_t)jpegSize) == -1) {
+    SAFE_RELEASE(src, jpegBuf);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(src, jpegBuf);
+}
+
+/* TurboJPEG 3: TJDecompressor::setCroppingRegion() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_setCroppingRegion
+  (JNIEnv *env, jobject obj)
+{
+  tjhandle handle = 0;
+  jclass sfcls, crcls;
+  jobject sfobj, crobj;
+  tjregion croppingRegion;
+  tjscalingfactor scalingFactor;
+
+  GET_HANDLE();
+
+  BAILIF0(sfcls = (*env)->FindClass(env,
+    "org/libjpegturbo/turbojpeg/TJScalingFactor"));
+  BAILIF0(_fid =
+          (*env)->GetFieldID(env, _cls, "scalingFactor",
+                             "Lorg/libjpegturbo/turbojpeg/TJScalingFactor;"));
+  BAILIF0(sfobj = (*env)->GetObjectField(env, obj, _fid));
+  BAILIF0(_fid = (*env)->GetFieldID(env, sfcls, "num", "I"));
+  scalingFactor.num = (*env)->GetIntField(env, sfobj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, sfcls, "denom", "I"));
+  scalingFactor.denom = (*env)->GetIntField(env, sfobj, _fid);
+
+  if (tj3SetScalingFactor(handle, scalingFactor) == -1)
+    THROW_TJ();
+
+  BAILIF0(crcls = (*env)->FindClass(env, "java/awt/Rectangle"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "croppingRegion",
+                                    "Ljava/awt/Rectangle;"));
+  BAILIF0(crobj = (*env)->GetObjectField(env, obj, _fid));
+  BAILIF0(_fid = (*env)->GetFieldID(env, crcls, "x", "I"));
+  croppingRegion.x = (*env)->GetIntField(env, crobj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, crcls, "y", "I"));
+  croppingRegion.y = (*env)->GetIntField(env, crobj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, crcls, "width", "I"));
+  croppingRegion.w = (*env)->GetIntField(env, crobj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, crcls, "height", "I"));
+  croppingRegion.h = (*env)->GetIntField(env, crobj, _fid);
+
+  if (tj3SetCroppingRegion(handle, croppingRegion) == -1)
+    THROW_TJ();
+
+bailout:
+  return;
+}
+
+static void TJDecompressor_decompress
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jarray dst,
+   jint dstElementSize, int precision, jint x, jint y, jint pitch, jint pf)
+{
+  tjhandle handle = 0;
+  jsize arraySize = 0, actualPitch;
+  unsigned char *jpegBuf = NULL;
+  void *dstBuf = NULL;
+  jclass sfcls, crcls;
+  jobject sfobj, crobj;
+  tjscalingfactor scalingFactor;
+  tjregion cr;
+  int jpegWidth, jpegHeight, scaledWidth, scaledHeight;
+
+  GET_HANDLE();
+
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in decompress*()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF)
+    THROW_ARG("Mismatch between Java and C API");
+
+  if ((*env)->GetArrayLength(env, src) < jpegSize)
+    THROW_ARG("Source buffer is not large enough");
+  if ((jpegWidth = tj3Get(handle, TJPARAM_JPEGWIDTH)) == -1)
+    THROW_ARG("JPEG header has not yet been read");
+  if ((jpegHeight = tj3Get(handle, TJPARAM_JPEGHEIGHT)) == -1)
+    THROW_ARG("JPEG header has not yet been read");
+
+  BAILIF0(sfcls = (*env)->FindClass(env,
+    "org/libjpegturbo/turbojpeg/TJScalingFactor"));
+  BAILIF0(_fid =
+          (*env)->GetFieldID(env, _cls, "scalingFactor",
+                             "Lorg/libjpegturbo/turbojpeg/TJScalingFactor;"));
+  BAILIF0(sfobj = (*env)->GetObjectField(env, obj, _fid));
+  BAILIF0(_fid = (*env)->GetFieldID(env, sfcls, "num", "I"));
+  scalingFactor.num = (*env)->GetIntField(env, sfobj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, sfcls, "denom", "I"));
+  scalingFactor.denom = (*env)->GetIntField(env, sfobj, _fid);
+
+  if (tj3SetScalingFactor(handle, scalingFactor) == -1)
+    THROW_TJ();
+  scaledWidth = TJSCALED(jpegWidth, scalingFactor);
+  scaledHeight = TJSCALED(jpegHeight, scalingFactor);
+
+  BAILIF0(crcls = (*env)->FindClass(env, "java/awt/Rectangle"));
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "croppingRegion",
+                                    "Ljava/awt/Rectangle;"));
+  BAILIF0(crobj = (*env)->GetObjectField(env, obj, _fid));
+  BAILIF0(_fid = (*env)->GetFieldID(env, crcls, "x", "I"));
+  cr.x = (*env)->GetIntField(env, crobj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, crcls, "y", "I"));
+  cr.y = (*env)->GetIntField(env, crobj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, crcls, "width", "I"));
+  cr.w = (*env)->GetIntField(env, crobj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, crcls, "height", "I"));
+  cr.h = (*env)->GetIntField(env, crobj, _fid);
+  if (cr.x != 0 || cr.y != 0 || cr.w != 0 || cr.h != 0) {
+    scaledWidth = cr.w ? cr.w : scaledWidth - cr.x;
+    scaledHeight = cr.h ? cr.h : scaledHeight - cr.y;
+  }
+
+  actualPitch = (pitch == 0) ? scaledWidth * tjPixelSize[pf] : pitch;
+  arraySize = (y + scaledHeight - 1) * actualPitch +
+              (x + scaledWidth) * tjPixelSize[pf];
+  if ((*env)->GetArrayLength(env, dst) * dstElementSize < arraySize)
+    THROW_ARG("Destination buffer is not large enough");
+
+  BAILIF0NOEC(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0NOEC(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+  if (precision == 8) {
+    if (tj3Decompress8(handle, jpegBuf, (size_t)jpegSize,
+                       &((unsigned char *)dstBuf)[y * actualPitch +
+                                                  x * tjPixelSize[pf]],
+                       pitch, pf) == -1) {
+      SAFE_RELEASE(dst, dstBuf);
+      SAFE_RELEASE(src, jpegBuf);
+      THROW_TJ();
+    }
+  } else if (precision == 12) {
+    if (tj3Decompress12(handle, jpegBuf, (size_t)jpegSize,
+                        &((short *)dstBuf)[y * actualPitch +
+                                           x * tjPixelSize[pf]],
+                        pitch, pf) == -1) {
+      SAFE_RELEASE(dst, dstBuf);
+      SAFE_RELEASE(src, jpegBuf);
+      THROW_TJ();
+    }
+  } else {
+    if (tj3Decompress16(handle, jpegBuf, (size_t)jpegSize,
+                        &((unsigned short *)dstBuf)[y * actualPitch +
+                                                    x * tjPixelSize[pf]],
+                        pitch, pf) == -1) {
+      SAFE_RELEASE(dst, dstBuf);
+      SAFE_RELEASE(src, jpegBuf);
+      THROW_TJ();
+    }
+  }
+
+bailout:
+  SAFE_RELEASE(dst, dstBuf);
+  SAFE_RELEASE(src, jpegBuf);
+}
+
+/* TurboJPEG 3: TJDecompressor::decompress8() byte destination */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress8___3BI_3BIIII
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
+   jint x, jint y, jint pitch, jint pf)
+{
+  TJDecompressor_decompress(env, obj, src, jpegSize, dst, 1, 8, x, y, pitch,
+                            pf);
+}
+
+/* TurboJPEG 3: TJDecompressor::decompress12() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress12
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jshortArray dst,
+   jint x, jint y, jint pitch, jint pf)
+{
+  TJDecompressor_decompress(env, obj, src, jpegSize, dst, 1, 12, x, y, pitch,
+                            pf);
+}
+
+/* TurboJPEG 3: TJDecompressor::decompress16() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress16
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jshortArray dst,
+   jint x, jint y, jint pitch, jint pf)
+{
+  TJDecompressor_decompress(env, obj, src, jpegSize, dst, 1, 16, x, y, pitch,
+                            pf);
+}
+
+/* TurboJPEG 3: TJDecompressor::decompress8() int destination */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress8___3BI_3IIIII
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jintArray dst,
+   jint x, jint y, jint stride, jint pf)
+{
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in decompress8()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    THROW_ARG("Pixel format must be 32-bit when decompressing to an integer buffer.");
+
+  TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), 8, x,
+                            y, stride * sizeof(jint), pf);
+
+bailout:
+  return;
+}
+
+/* TurboJPEG 3: TJDecompressor::decompressToYUV8() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV8
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize,
+   jobjectArray dstobjs, jintArray jDstOffsets, jintArray jDstStrides)
+{
+  tjhandle handle = 0;
+  unsigned char *jpegBuf = NULL;
+  jbyteArray jDstPlanes[3] = { NULL, NULL, NULL };
+  unsigned char *dstPlanesTmp[3] = { NULL, NULL, NULL };
+  unsigned char *dstPlanes[3] = { NULL, NULL, NULL };
+  jint dstOffsetsTmp[3] = { 0, 0, 0 }, dstStridesTmp[3] = { 0, 0, 0 };
+  int dstOffsets[3] = { 0, 0, 0 }, dstStrides[3] = { 0, 0, 0 };
+  jclass sfcls;
+  jobject sfobj;
+  int jpegSubsamp, jpegWidth = 0, jpegHeight = 0;
+  int nc = 0, i, scaledWidth, scaledHeight;
+  tjscalingfactor scalingFactor;
+
+  GET_HANDLE();
+
+  if ((*env)->GetArrayLength(env, src) < jpegSize)
+    THROW_ARG("Source buffer is not large enough");
+  if ((jpegWidth = tj3Get(handle, TJPARAM_JPEGWIDTH)) == -1)
+    THROW_ARG("JPEG header has not yet been read");
+  if ((jpegHeight = tj3Get(handle, TJPARAM_JPEGHEIGHT)) == -1)
+    THROW_ARG("JPEG header has not yet been read");
+
+  BAILIF0(sfcls = (*env)->FindClass(env,
+    "org/libjpegturbo/turbojpeg/TJScalingFactor"));
+  BAILIF0(_fid =
+          (*env)->GetFieldID(env, _cls, "scalingFactor",
+                             "Lorg/libjpegturbo/turbojpeg/TJScalingFactor;"));
+  BAILIF0(sfobj = (*env)->GetObjectField(env, obj, _fid));
+  BAILIF0(_fid = (*env)->GetFieldID(env, sfcls, "num", "I"));
+  scalingFactor.num = (*env)->GetIntField(env, sfobj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, sfcls, "denom", "I"));
+  scalingFactor.denom = (*env)->GetIntField(env, sfobj, _fid);
+
+  if (tj3SetScalingFactor(handle, scalingFactor) == -1)
+    THROW_TJ();
+  scaledWidth = TJSCALED(jpegWidth, scalingFactor);
+  scaledHeight = TJSCALED(jpegHeight, scalingFactor);
+
+  if ((jpegSubsamp = tj3Get(handle, TJPARAM_SUBSAMP)) == TJSAMP_UNKNOWN)
+    THROW_ARG("TJPARAM_SUBSAMP must be specified");
+  nc = jpegSubsamp == TJSAMP_GRAY ? 1 : 3;
+
+  (*env)->GetIntArrayRegion(env, jDstOffsets, 0, nc, dstOffsetsTmp);
+  if ((*env)->ExceptionCheck(env)) goto bailout;
+  for (i = 0; i < 3; i++)
+    dstOffsets[i] = dstOffsetsTmp[i];
+
+  (*env)->GetIntArrayRegion(env, jDstStrides, 0, nc, dstStridesTmp);
+  if ((*env)->ExceptionCheck(env)) goto bailout;
+  for (i = 0; i < 3; i++)
+    dstStrides[i] = dstStridesTmp[i];
+
+  for (i = 0; i < nc; i++) {
+    size_t planeSize = tj3YUVPlaneSize(i, scaledWidth, dstStrides[i],
+                                       scaledHeight, jpegSubsamp);
+    int pw = tj3YUVPlaneWidth(i, scaledWidth, jpegSubsamp);
+
+    if (planeSize == 0 || pw == 0)
+      THROW_ARG(tj3GetErrorStr(NULL));
+
+    if (planeSize > (size_t)INT_MAX)
+      THROW_ARG("Destination plane is too large");
+    if (dstOffsets[i] < 0)
+      THROW_ARG("Invalid argument in decompressToYUV8()");
+    if (dstStrides[i] < 0 && dstOffsets[i] - (int)planeSize + pw < 0)
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+
+    BAILIF0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((*env)->GetArrayLength(env, jDstPlanes[i]) <
+        dstOffsets[i] + (int)planeSize)
+      THROW_ARG("Destination plane is not large enough");
+  }
+  for (i = 0; i < nc; i++) {
+    BAILIF0NOEC(dstPlanesTmp[i] =
+                (*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i], 0));
+    dstPlanes[i] = &dstPlanesTmp[i][dstOffsets[i]];
+  }
+  BAILIF0NOEC(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+
+  if (tj3DecompressToYUVPlanes8(handle, jpegBuf, (size_t)jpegSize, dstPlanes,
+                                dstStrides) == -1) {
+    SAFE_RELEASE(src, jpegBuf);
+    for (i = 0; i < nc; i++)
+      SAFE_RELEASE(jDstPlanes[i], dstPlanesTmp[i]);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(src, jpegBuf);
+  for (i = 0; i < nc; i++)
+    SAFE_RELEASE(jDstPlanes[i], dstPlanesTmp[i]);
+}
+
+static void TJDecompressor_decodeYUV8
+  (JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+   jintArray jSrcStrides, jarray dst, jint dstElementSize, jint x, jint y,
+   jint width, jint pitch, jint height, jint pf)
+{
+  tjhandle handle = 0;
+  jsize arraySize = 0, actualPitch;
+  jbyteArray jSrcPlanes[3] = { NULL, NULL, NULL };
+  const unsigned char *srcPlanesTmp[3] = { NULL, NULL, NULL };
+  const unsigned char *srcPlanes[3] = { NULL, NULL, NULL };
+  jint srcOffsetsTmp[3] = { 0, 0, 0 }, srcStridesTmp[3] = { 0, 0, 0 };
+  int srcOffsets[3] = { 0, 0, 0 }, srcStrides[3] = { 0, 0, 0 };
+  unsigned char *dstBuf = NULL;
+  int nc = 0, i, subsamp;
+
+  GET_HANDLE();
+
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in decodeYUV8()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF ||
+      org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
+    THROW_ARG("Mismatch between Java and C API");
+
+  if ((subsamp = tj3Get(handle, TJPARAM_SUBSAMP)) == TJSAMP_UNKNOWN)
+    THROW_ARG("TJPARAM_SUBSAMP must be specified");
+  nc = subsamp == TJSAMP_GRAY ? 1 : 3;
+  if ((*env)->GetArrayLength(env, srcobjs) < nc)
+    THROW_ARG("Planes array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jSrcOffsets) < nc)
+    THROW_ARG("Offsets array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jSrcStrides) < nc)
+    THROW_ARG("Strides array is too small for the subsampling type");
+
+  actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
+  arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
+  if ((*env)->GetArrayLength(env, dst) * dstElementSize < arraySize)
+    THROW_ARG("Destination buffer is not large enough");
+
+  (*env)->GetIntArrayRegion(env, jSrcOffsets, 0, nc, srcOffsetsTmp);
+  if ((*env)->ExceptionCheck(env)) goto bailout;
+  for (i = 0; i < 3; i++)
+    srcOffsets[i] = srcOffsetsTmp[i];
+
+  (*env)->GetIntArrayRegion(env, jSrcStrides, 0, nc, srcStridesTmp);
+  if ((*env)->ExceptionCheck(env)) goto bailout;
+  for (i = 0; i < 3; i++)
+    srcStrides[i] = srcStridesTmp[i];
+
+  for (i = 0; i < nc; i++) {
+    size_t planeSize = tj3YUVPlaneSize(i, width, srcStrides[i], height,
+                                       subsamp);
+    int pw = tj3YUVPlaneWidth(i, width, subsamp);
+
+    if (planeSize == 0 || pw == 0)
+      THROW_ARG(tj3GetErrorStr(NULL));
+
+    if (planeSize > (size_t)INT_MAX)
+      THROW_ARG("Source plane is too large");
+    if (srcOffsets[i] < 0)
+      THROW_ARG("Invalid argument in decodeYUV8()");
+    if (srcStrides[i] < 0 && srcOffsets[i] - (int)planeSize + pw < 0)
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+
+    BAILIF0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
+    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) <
+        srcOffsets[i] + (int)planeSize)
+      THROW_ARG("Source plane is not large enough");
+  }
+  for (i = 0; i < nc; i++) {
+    BAILIF0NOEC(srcPlanesTmp[i] =
+                (*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i], 0));
+    srcPlanes[i] = &srcPlanesTmp[i][srcOffsets[i]];
+  }
+  BAILIF0NOEC(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+  if (tj3DecodeYUVPlanes8(handle, srcPlanes, srcStrides,
+                          &dstBuf[y * actualPitch + x * tjPixelSize[pf]],
+                          width, pitch, height, pf) == -1) {
+    SAFE_RELEASE(dst, dstBuf);
+    for (i = 0; i < nc; i++)
+      SAFE_RELEASE(jSrcPlanes[i], srcPlanesTmp[i]);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(dst, dstBuf);
+  for (i = 0; i < nc; i++)
+    SAFE_RELEASE(jSrcPlanes[i], srcPlanesTmp[i]);
+}
+
+/* TurboJPEG 3: TJDecompressor::decodeYUV8() byte destination */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV8___3_3B_3I_3I_3BIIIIII
+  (JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+   jintArray jSrcStrides, jbyteArray dst, jint x, jint y, jint width,
+   jint pitch, jint height, jint pf)
+{
+  TJDecompressor_decodeYUV8(env, obj, srcobjs, jSrcOffsets, jSrcStrides, dst,
+                            1, x, y, width, pitch, height, pf);
+}
+
+/* TurboJPEG 3: TJDecompressor::decodeYUV8() int destination */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV8___3_3B_3I_3I_3IIIIIII
+  (JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+   jintArray jSrcStrides, jintArray dst, jint x, jint y, jint width,
+   jint stride, jint height, jint pf)
+{
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in decodeYUV8()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    THROW_ARG("Pixel format must be 32-bit when decoding to an integer buffer.");
+
+  TJDecompressor_decodeYUV8(env, obj, srcobjs, jSrcOffsets, jSrcStrides, dst,
+                            sizeof(jint), x, y, width, stride * sizeof(jint),
+                            height, pf);
+
+bailout:
+  return;
+}
+
+/* TurboJPEG 1.2.x: TJTransformer::init() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_init
+  (JNIEnv *env, jobject obj)
+{
+  jclass cls;
+  jfieldID fid;
+  tjhandle handle;
+
+  if ((handle = tj3Init(TJINIT_TRANSFORM)) == NULL)
+    THROW(tj3GetErrorStr(NULL), "org/libjpegturbo/turbojpeg/TJException");
+
+  BAILIF0(cls = (*env)->GetObjectClass(env, obj));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  (*env)->SetLongField(env, obj, fid, (size_t)handle);
+
+bailout:
+  return;
+}
+
+typedef struct _JNICustomFilterParams {
+  JNIEnv *env;
+  jobject tobj;
+  jobject cfobj;
+} JNICustomFilterParams;
+
+static int JNICustomFilter(short *coeffs, tjregion arrayRegion,
+                           tjregion planeRegion, int componentIndex,
+                           int transformIndex, tjtransform *transform)
+{
+  JNICustomFilterParams *params = (JNICustomFilterParams *)transform->data;
+  JNIEnv *env = params->env;
+  jobject tobj = params->tobj, cfobj = params->cfobj;
+  jobject arrayRegionObj, planeRegionObj, bufobj, borobj;
+  jclass cls;
+  jmethodID mid;
+  jfieldID fid;
+
+  BAILIF0(bufobj = (*env)->NewDirectByteBuffer(env, coeffs,
+    sizeof(short) * arrayRegion.w * arrayRegion.h));
+  BAILIF0(cls = (*env)->FindClass(env, "java/nio/ByteOrder"));
+  BAILIF0(mid = (*env)->GetStaticMethodID(env, cls, "nativeOrder",
+                                          "()Ljava/nio/ByteOrder;"));
+  BAILIF0(borobj = (*env)->CallStaticObjectMethod(env, cls, mid));
+  BAILIF0(cls = (*env)->GetObjectClass(env, bufobj));
+  BAILIF0(mid = (*env)->GetMethodID(env, cls, "order",
+    "(Ljava/nio/ByteOrder;)Ljava/nio/ByteBuffer;"));
+  (*env)->CallObjectMethod(env, bufobj, mid, borobj);
+  BAILIF0(mid = (*env)->GetMethodID(env, cls, "asShortBuffer",
+                                    "()Ljava/nio/ShortBuffer;"));
+  BAILIF0(bufobj = (*env)->CallObjectMethod(env, bufobj, mid));
+
+  BAILIF0(cls = (*env)->FindClass(env, "java/awt/Rectangle"));
+  BAILIF0(arrayRegionObj = (*env)->AllocObject(env, cls));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
+  (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.x);
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
+  (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.y);
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
+  (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.w);
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
+  (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.h);
+
+  BAILIF0(planeRegionObj = (*env)->AllocObject(env, cls));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
+  (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.x);
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
+  (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.y);
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
+  (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.w);
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
+  (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.h);
+
+  BAILIF0(cls = (*env)->GetObjectClass(env, cfobj));
+  BAILIF0(mid = (*env)->GetMethodID(env, cls, "customFilter",
+    "(Ljava/nio/ShortBuffer;Ljava/awt/Rectangle;Ljava/awt/Rectangle;IILorg/libjpegturbo/turbojpeg/TJTransform;)V"));
+  (*env)->CallVoidMethod(env, cfobj, mid, bufobj, arrayRegionObj,
+                         planeRegionObj, componentIndex, transformIndex, tobj);
+
+  return 0;
+
+bailout:
+  return -1;
+}
+
+/* TurboJPEG 1.2.x: TJTransformer::transform() */
+JNIEXPORT jintArray JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_transform
+  (JNIEnv *env, jobject obj, jbyteArray jsrcBuf, jint jpegSize,
+   jobjectArray dstobjs, jobjectArray tobjs)
+{
+  tjhandle handle = 0;
+  unsigned char *jpegBuf = NULL, **dstBufs = NULL;
+  jsize n = 0;
+  size_t *dstSizes = NULL;
+  tjtransform *t = NULL;
+  jbyteArray *jdstBufs = NULL;
+  int i, jpegWidth = 0, jpegHeight = 0, jpegSubsamp;
+  jintArray jdstSizes = 0;
+  jint *dstSizesi = NULL;
+  JNICustomFilterParams *params = NULL;
+
+  GET_HANDLE();
+
+  if ((*env)->GetArrayLength(env, jsrcBuf) < jpegSize)
+    THROW_ARG("Source buffer is not large enough");
+  if ((jpegWidth = tj3Get(handle, TJPARAM_JPEGWIDTH)) == -1)
+    THROW_ARG("JPEG header has not yet been read");
+  if ((jpegHeight = tj3Get(handle, TJPARAM_JPEGHEIGHT)) == -1)
+    THROW_ARG("JPEG header has not yet been read");
+  if ((jpegSubsamp = tj3Get(handle, TJPARAM_SUBSAMP)) == TJSAMP_UNKNOWN)
+    THROW_ARG("TJPARAM_SUBSAMP must be specified");
+
+  n = (*env)->GetArrayLength(env, dstobjs);
+  if (n != (*env)->GetArrayLength(env, tobjs))
+    THROW_ARG("Mismatch between size of transforms array and destination buffers array");
+
+  if ((dstBufs =
+       (unsigned char **)malloc(sizeof(unsigned char *) * n)) == NULL)
+    THROW_MEM();
+  if ((jdstBufs = (jbyteArray *)malloc(sizeof(jbyteArray) * n)) == NULL)
+    THROW_MEM();
+  if ((dstSizes = (size_t *)malloc(sizeof(size_t) * n)) == NULL)
+    THROW_MEM();
+  if ((t = (tjtransform *)malloc(sizeof(tjtransform) * n)) == NULL)
+    THROW_MEM();
+  if ((params = (JNICustomFilterParams *)malloc(sizeof(JNICustomFilterParams) *
+                                                n)) == NULL)
+    THROW_MEM();
+  for (i = 0; i < n; i++) {
+    dstBufs[i] = NULL;  jdstBufs[i] = NULL;  dstSizes[i] = 0;
+    memset(&t[i], 0, sizeof(tjtransform));
+    memset(&params[i], 0, sizeof(JNICustomFilterParams));
+  }
+
+  for (i = 0; i < n; i++) {
+    jobject tobj, cfobj;
+
+    BAILIF0(tobj = (*env)->GetObjectArrayElement(env, tobjs, i));
+    BAILIF0(_cls = (*env)->GetObjectClass(env, tobj));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "op", "I"));
+    t[i].op = (*env)->GetIntField(env, tobj, _fid);
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "options", "I"));
+    t[i].options = (*env)->GetIntField(env, tobj, _fid);
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "x", "I"));
+    t[i].r.x = (*env)->GetIntField(env, tobj, _fid);
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "y", "I"));
+    t[i].r.y = (*env)->GetIntField(env, tobj, _fid);
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "width", "I"));
+    t[i].r.w = (*env)->GetIntField(env, tobj, _fid);
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "height", "I"));
+    t[i].r.h = (*env)->GetIntField(env, tobj, _fid);
+
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "cf",
+      "Lorg/libjpegturbo/turbojpeg/TJCustomFilter;"));
+    cfobj = (*env)->GetObjectField(env, tobj, _fid);
+    if (cfobj) {
+      params[i].env = env;
+      params[i].tobj = tobj;
+      params[i].cfobj = cfobj;
+      t[i].customFilter = JNICustomFilter;
+      t[i].data = (void *)&params[i];
+    }
+  }
+
+  if (tj3Set(handle, TJPARAM_NOREALLOC, 1) == -1)
+    THROW_TJ();
+
+  for (i = 0; i < n; i++) {
+    int w = jpegWidth, h = jpegHeight;
+
+    if (t[i].op == TJXOP_TRANSPOSE || t[i].op == TJXOP_TRANSVERSE ||
+        t[i].op == TJXOP_ROT90 || t[i].op == TJXOP_ROT270) {
+      w = jpegHeight;  h = jpegWidth;
+    }
+    if (t[i].r.w != 0) w = t[i].r.w;
+    if (t[i].r.h != 0) h = t[i].r.h;
+    BAILIF0(jdstBufs[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((size_t)(*env)->GetArrayLength(env, jdstBufs[i]) <
+        tj3JPEGBufSize(w, h, jpegSubsamp))
+      THROW_ARG("Destination buffer is not large enough");
+  }
+  BAILIF0NOEC(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, jsrcBuf, 0));
+  for (i = 0; i < n; i++)
+    BAILIF0NOEC(dstBufs[i] =
+                (*env)->GetPrimitiveArrayCritical(env, jdstBufs[i], 0));
+
+  if (tj3Transform(handle, jpegBuf, jpegSize, n, dstBufs, dstSizes, t) == -1) {
+    for (i = 0; i < n; i++)
+      SAFE_RELEASE(jdstBufs[i], dstBufs[i]);
+    SAFE_RELEASE(jsrcBuf, jpegBuf);
+    THROW_TJ();
+  }
+
+  for (i = 0; i < n; i++)
+    SAFE_RELEASE(jdstBufs[i], dstBufs[i]);
+  SAFE_RELEASE(jsrcBuf, jpegBuf);
+
+  jdstSizes = (*env)->NewIntArray(env, n);
+  BAILIF0(dstSizesi = (*env)->GetIntArrayElements(env, jdstSizes, 0));
+  for (i = 0; i < n; i++) dstSizesi[i] = (int)dstSizes[i];
+
+bailout:
+  if (dstSizesi) (*env)->ReleaseIntArrayElements(env, jdstSizes, dstSizesi, 0);
+  if (dstBufs) {
+    for (i = 0; i < n; i++) {
+      if (dstBufs[i] && jdstBufs && jdstBufs[i])
+        (*env)->ReleasePrimitiveArrayCritical(env, jdstBufs[i], dstBufs[i], 0);
+    }
+    free(dstBufs);
+  }
+  SAFE_RELEASE(jsrcBuf, jpegBuf);
+  free(jdstBufs);
+  free(dstSizes);
+  free(t);
+  return jdstSizes;
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::destroy() */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy
+  (JNIEnv *env, jobject obj)
+{
+  Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy(env, obj);
+}
+
+/* Private image I/O routines (used only by TJBench) */
+JNIEXPORT jobject JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_loadImage
+  (JNIEnv *env, jobject obj, jint precision, jstring jfilename,
+   jintArray jwidth, jint align, jintArray jheight, jintArray jpixelFormat)
+{
+  tjhandle handle = NULL;
+  void *dstBuf = NULL, *jdstPtr;
+  int width, *warr, height, *harr, pixelFormat, *pfarr, n;
+  const char *filename = NULL;
+  jboolean isCopy;
+  jobject jdstBuf = NULL;
+
+  GET_HANDLE();
+
+  if ((precision != 8 && precision != 12 && precision != 16) ||
+      jfilename == NULL || jwidth == NULL ||
+      (*env)->GetArrayLength(env, jwidth) < 1 || jheight == NULL ||
+      (*env)->GetArrayLength(env, jheight) < 1 || jpixelFormat == NULL ||
+      (*env)->GetArrayLength(env, jpixelFormat) < 1)
+    THROW_ARG("Invalid argument in loadImage()");
+
+  BAILIF0NOEC(warr = (*env)->GetPrimitiveArrayCritical(env, jwidth, 0));
+  width = warr[0];
+  (*env)->ReleasePrimitiveArrayCritical(env, jwidth, warr, 0);
+  BAILIF0NOEC(harr = (*env)->GetPrimitiveArrayCritical(env, jheight, 0));
+  height = harr[0];
+  (*env)->ReleasePrimitiveArrayCritical(env, jheight, harr, 0);
+  BAILIF0NOEC(pfarr = (*env)->GetPrimitiveArrayCritical(env, jpixelFormat, 0));
+  pixelFormat = pfarr[0];
+  (*env)->ReleasePrimitiveArrayCritical(env, jpixelFormat, pfarr, 0);
+  BAILIF0(filename = (*env)->GetStringUTFChars(env, jfilename, &isCopy));
+
+  if (precision == 8) {
+    if ((dstBuf = tj3LoadImage8(handle, filename, &width, align, &height,
+                                &pixelFormat)) == NULL)
+      THROW_TJ();
+  } else if (precision == 12) {
+    if ((dstBuf = tj3LoadImage12(handle, filename, &width, align, &height,
+                                 &pixelFormat)) == NULL)
+      THROW_TJ();
+  } else {
+    if ((dstBuf = tj3LoadImage16(handle, filename, &width, align, &height,
+                                 &pixelFormat)) == NULL)
+      THROW_TJ();
+  }
+
+  (*env)->ReleaseStringUTFChars(env, jfilename, filename);
+  filename = NULL;
+
+  if ((unsigned long long)width * (unsigned long long)height *
+      (unsigned long long)tjPixelSize[pixelFormat] >
+      (unsigned long long)((unsigned int)-1))
+    THROW_ARG("Image is too large");
+
+  BAILIF0NOEC(warr = (*env)->GetPrimitiveArrayCritical(env, jwidth, 0));
+  warr[0] = width;
+  (*env)->ReleasePrimitiveArrayCritical(env, jwidth, warr, 0);
+  BAILIF0NOEC(harr = (*env)->GetPrimitiveArrayCritical(env, jheight, 0));
+  harr[0] = height;
+  (*env)->ReleasePrimitiveArrayCritical(env, jheight, harr, 0);
+  BAILIF0NOEC(pfarr = (*env)->GetPrimitiveArrayCritical(env, jpixelFormat, 0));
+  pfarr[0] = pixelFormat;
+  (*env)->ReleasePrimitiveArrayCritical(env, jpixelFormat, pfarr, 0);
+
+  n = width * height * tjPixelSize[pixelFormat];
+  if (precision == 8)
+    jdstBuf = (*env)->NewByteArray(env, n);
+  else
+    jdstBuf = (*env)->NewShortArray(env, n);
+  BAILIF0NOEC(jdstPtr = (*env)->GetPrimitiveArrayCritical(env, jdstBuf, 0));
+  memcpy(jdstPtr, dstBuf, n * (precision > 8 ? 2 : 1));
+  (*env)->ReleasePrimitiveArrayCritical(env, jdstBuf, jdstPtr, 0);
+
+bailout:
+  if (filename) (*env)->ReleaseStringUTFChars(env, jfilename, filename);
+  tj3Free(dstBuf);
+  return jdstBuf;
+}
+
+
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_saveImage
+  (JNIEnv *env, jobject obj, jint precision, jstring jfilename,
+   jobject jsrcBuf, jint width, jint pitch, jint height, jint pixelFormat)
+{
+  tjhandle handle = NULL;
+  void *srcBuf = NULL, *jsrcPtr;
+  const char *filename = NULL;
+  int n;
+  jboolean isCopy;
+
+  GET_HANDLE();
+
+  if ((precision != 8 && precision != 12 && precision != 16) ||
+      jfilename == NULL || jsrcBuf == NULL || width < 1 || height < 1 ||
+      pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
+    THROW_ARG("Invalid argument in saveImage()");
+
+  if ((unsigned long long)width * (unsigned long long)height *
+      (unsigned long long)tjPixelSize[pixelFormat] >
+      (unsigned long long)((unsigned int)-1))
+    THROW_ARG("Image is too large");
+  n = width * height * tjPixelSize[pixelFormat];
+  if ((*env)->GetArrayLength(env, jsrcBuf) < n)
+    THROW_ARG("Source buffer is not large enough");
+
+  if ((srcBuf = malloc(n * (precision > 8 ? 2 : 1))) == NULL)
+    THROW_MEM();
+
+  BAILIF0NOEC(jsrcPtr = (*env)->GetPrimitiveArrayCritical(env, jsrcBuf, 0));
+  memcpy(srcBuf, jsrcPtr, n * (precision > 8 ? 2 : 1));
+  (*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jsrcPtr, 0);
+  BAILIF0(filename = (*env)->GetStringUTFChars(env, jfilename, &isCopy));
+
+  if (precision == 8) {
+    if (tj3SaveImage8(handle, filename, srcBuf, width, pitch, height,
+                      pixelFormat) == -1)
+      THROW_TJ();
+  } else if (precision == 12) {
+    if (tj3SaveImage12(handle, filename, srcBuf, width, pitch, height,
+                       pixelFormat) == -1)
+      THROW_TJ();
+  } else {
+    if (tj3SaveImage16(handle, filename, srcBuf, width, pitch, height,
+                       pixelFormat) == -1)
+      THROW_TJ();
+  }
+
+bailout:
+  if (filename) (*env)->ReleaseStringUTFChars(env, jfilename, filename);
+  free(srcBuf);
+}
diff --git a/3rdparty/libjpeg-turbo/src/turbojpeg-mapfile b/3rdparty/libjpeg-turbo/src/turbojpeg-mapfile
new file mode 100644
index 000000000000..6aab87132adc
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/turbojpeg-mapfile
@@ -0,0 +1,108 @@
+TURBOJPEG_1.0
+{
+  global:
+    TJBUFSIZE;
+    tjCompress;
+    tjDecompress;
+    tjDecompressHeader;
+    tjDestroy;
+    tjGetErrorStr;
+    tjInitCompress;
+    tjInitDecompress;
+  local:
+    *;
+};
+
+TURBOJPEG_1.1
+{
+  global:
+    TJBUFSIZEYUV;
+    tjDecompressHeader2;
+    tjDecompressToYUV;
+    tjEncodeYUV;
+} TURBOJPEG_1.0;
+
+TURBOJPEG_1.2
+{
+  global:
+    tjAlloc;
+    tjBufSize;
+    tjBufSizeYUV;
+    tjCompress2;
+    tjDecompress2;
+    tjEncodeYUV2;
+    tjFree;
+    tjGetScalingFactors;
+    tjInitTransform;
+    tjTransform;
+} TURBOJPEG_1.1;
+
+TURBOJPEG_1.4
+{
+  global:
+    tjBufSizeYUV2;
+    tjCompressFromYUV;
+    tjCompressFromYUVPlanes;
+    tjDecodeYUV;
+    tjDecodeYUVPlanes;
+    tjDecompressHeader3;
+    tjDecompressToYUV2;
+    tjDecompressToYUVPlanes;
+    tjEncodeYUV3;
+    tjEncodeYUVPlanes;
+    tjPlaneHeight;
+    tjPlaneSizeYUV;
+    tjPlaneWidth;
+} TURBOJPEG_1.2;
+
+TURBOJPEG_2.0
+{
+  global:
+    tjGetErrorCode;
+    tjGetErrorStr2;
+    tjLoadImage;
+    tjSaveImage;
+} TURBOJPEG_1.4;
+
+TURBOJPEG_3
+{
+  global:
+    tj3Alloc;
+    tj3Compress8;
+    tj3Compress12;
+    tj3Compress16;
+    tj3CompressFromYUV8;
+    tj3CompressFromYUVPlanes8;
+    tj3DecodeYUV8;
+    tj3DecodeYUVPlanes8;
+    tj3Decompress8;
+    tj3Decompress12;
+    tj3Decompress16;
+    tj3DecompressHeader;
+    tj3DecompressToYUV8;
+    tj3DecompressToYUVPlanes8;
+    tj3Destroy;
+    tj3EncodeYUV8;
+    tj3EncodeYUVPlanes8;
+    tj3Free;
+    tj3Get;
+    tj3GetErrorCode;
+    tj3GetErrorStr;
+    tj3GetScalingFactors;
+    tj3Init;
+    tj3JPEGBufSize;
+    tj3LoadImage8;
+    tj3LoadImage12;
+    tj3LoadImage16;
+    tj3SaveImage8;
+    tj3SaveImage12;
+    tj3SaveImage16;
+    tj3Set;
+    tj3SetCroppingRegion;
+    tj3SetScalingFactor;
+    tj3Transform;
+    tj3YUVBufSize;
+    tj3YUVPlaneHeight;
+    tj3YUVPlaneSize;
+    tj3YUVPlaneWidth;
+} TURBOJPEG_2.0;
diff --git a/3rdparty/libjpeg-turbo/src/turbojpeg-mapfile.jni b/3rdparty/libjpeg-turbo/src/turbojpeg-mapfile.jni
new file mode 100644
index 000000000000..31be75085876
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/turbojpeg-mapfile.jni
@@ -0,0 +1,142 @@
+TURBOJPEG_1.0
+{
+  global:
+    TJBUFSIZE;
+    tjCompress;
+    tjDecompress;
+    tjDecompressHeader;
+    tjDestroy;
+    tjGetErrorStr;
+    tjInitCompress;
+    tjInitDecompress;
+  local:
+    *;
+};
+
+TURBOJPEG_1.1
+{
+  global:
+    TJBUFSIZEYUV;
+    tjDecompressHeader2;
+    tjDecompressToYUV;
+    tjEncodeYUV;
+} TURBOJPEG_1.0;
+
+TURBOJPEG_1.2
+{
+  global:
+    tjAlloc;
+    tjBufSize;
+    tjBufSizeYUV;
+    tjCompress2;
+    tjDecompress2;
+    tjEncodeYUV2;
+    tjFree;
+    tjGetScalingFactors;
+    tjInitTransform;
+    tjTransform;
+    Java_org_libjpegturbo_turbojpeg_TJ_bufSize;
+    Java_org_libjpegturbo_turbojpeg_TJ_getScalingFactors;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_init;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_init;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy;
+    Java_org_libjpegturbo_turbojpeg_TJTransformer_init;
+    Java_org_libjpegturbo_turbojpeg_TJTransformer_transform;
+} TURBOJPEG_1.1;
+
+TURBOJPEG_1.4
+{
+  global:
+    tjBufSizeYUV2;
+    tjCompressFromYUV;
+    tjCompressFromYUVPlanes;
+    tjDecodeYUV;
+    tjDecodeYUVPlanes;
+    tjDecompressHeader3;
+    tjDecompressToYUV2;
+    tjDecompressToYUVPlanes;
+    tjEncodeYUV3;
+    tjEncodeYUVPlanes;
+    tjPlaneHeight;
+    tjPlaneSizeYUV;
+    tjPlaneWidth;
+    Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII;
+    Java_org_libjpegturbo_turbojpeg_TJ_planeHeight__III;
+    Java_org_libjpegturbo_turbojpeg_TJ_planeSizeYUV__IIIII;
+    Java_org_libjpegturbo_turbojpeg_TJ_planeWidth__III;
+} TURBOJPEG_1.2;
+
+TURBOJPEG_2.0
+{
+  global:
+    tjGetErrorCode;
+    tjGetErrorStr2;
+    tjLoadImage;
+    tjSaveImage;
+} TURBOJPEG_1.4;
+
+TURBOJPEG_3
+{
+  global:
+    tj3Alloc;
+    tj3Compress8;
+    tj3Compress12;
+    tj3Compress16;
+    tj3CompressFromYUV8;
+    tj3CompressFromYUVPlanes8;
+    tj3DecodeYUV8;
+    tj3DecodeYUVPlanes8;
+    tj3Decompress8;
+    tj3Decompress12;
+    tj3Decompress16;
+    tj3DecompressHeader;
+    tj3DecompressToYUV8;
+    tj3DecompressToYUVPlanes8;
+    tj3Destroy;
+    tj3EncodeYUV8;
+    tj3EncodeYUVPlanes8;
+    tj3Free;
+    tj3Get;
+    tj3GetErrorCode;
+    tj3GetErrorStr;
+    tj3GetScalingFactors;
+    tj3Init;
+    tj3JPEGBufSize;
+    tj3LoadImage8;
+    tj3LoadImage12;
+    tj3LoadImage16;
+    tj3SaveImage8;
+    tj3SaveImage12;
+    tj3SaveImage16;
+    tj3Set;
+    tj3SetCroppingRegion;
+    tj3SetScalingFactor;
+    tj3Transform;
+    tj3YUVBufSize;
+    tj3YUVPlaneHeight;
+    tj3YUVPlaneSize;
+    tj3YUVPlaneWidth;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_compress8___3BIIIIII_3B;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_compress8___3IIIIIII_3B;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_compress12;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_compress16;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV8;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV8___3BIIIIII_3_3B_3I_3I;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV8___3IIIIIII_3_3B_3I_3I;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_get;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_loadImage;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_set;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV8___3_3B_3I_3I_3BIIIIII;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV8___3_3B_3I_3I_3IIIIIII;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress8___3BI_3BIIII;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress8___3BI_3IIIII;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress12;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress16;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV8;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_get;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_saveImage;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_set;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_setCroppingRegion;
+} TURBOJPEG_2.0;
diff --git a/3rdparty/libjpeg-turbo/src/turbojpeg-mp.c b/3rdparty/libjpeg-turbo/src/turbojpeg-mp.c
new file mode 100644
index 000000000000..d4b3c74c3976
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/turbojpeg-mp.c
@@ -0,0 +1,541 @@
+/*
+ * Copyright (C)2009-2024 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* TurboJPEG API functions that must be compiled for multiple data
+   precisions */
+
+#if BITS_IN_JSAMPLE == 8
+#define _JSAMPLE  JSAMPLE
+#define _JSAMPROW  JSAMPROW
+#define _buffer  buffer
+#define _jinit_read_ppm  jinit_read_ppm
+#define _jinit_write_ppm  jinit_write_ppm
+#define _jpeg_crop_scanline  jpeg_crop_scanline
+#define _jpeg_read_scanlines  jpeg_read_scanlines
+#define _jpeg_skip_scanlines  jpeg_skip_scanlines
+#define _jpeg_write_scanlines  jpeg_write_scanlines
+#elif BITS_IN_JSAMPLE == 12
+#define _JSAMPLE  J12SAMPLE
+#define _JSAMPROW  J12SAMPROW
+#define _buffer  buffer12
+#define _jinit_read_ppm  j12init_read_ppm
+#define _jinit_write_ppm  j12init_write_ppm
+#define _jpeg_crop_scanline  jpeg12_crop_scanline
+#define _jpeg_read_scanlines  jpeg12_read_scanlines
+#define _jpeg_skip_scanlines  jpeg12_skip_scanlines
+#define _jpeg_write_scanlines  jpeg12_write_scanlines
+#elif BITS_IN_JSAMPLE == 16
+#define _JSAMPLE  J16SAMPLE
+#define _JSAMPROW  J16SAMPROW
+#define _buffer  buffer16
+#define _jinit_read_ppm  j16init_read_ppm
+#define _jinit_write_ppm  j16init_write_ppm
+#define _jpeg_read_scanlines  jpeg16_read_scanlines
+#define _jpeg_write_scanlines  jpeg16_write_scanlines
+#endif
+
+#define _GET_NAME(name, suffix)  name##suffix
+#define GET_NAME(name, suffix)  _GET_NAME(name, suffix)
+#define _GET_STRING(name, suffix)  #name #suffix
+#define GET_STRING(name, suffix)  _GET_STRING(name, suffix)
+
+
+/******************************** Compressor *********************************/
+
+/* TurboJPEG 3+ */
+DLLEXPORT int GET_NAME(tj3Compress, BITS_IN_JSAMPLE)
+  (tjhandle handle, const _JSAMPLE *srcBuf, int width, int pitch, int height,
+   int pixelFormat, unsigned char **jpegBuf, size_t *jpegSize)
+{
+  static const char FUNCTION_NAME[] = GET_STRING(tj3Compress, BITS_IN_JSAMPLE);
+  int i, retval = 0;
+  boolean alloc = TRUE;
+  _JSAMPROW *row_pointer = NULL;
+
+  GET_CINSTANCE(handle)
+  if ((this->init & COMPRESS) == 0)
+    THROW("Instance has not been initialized for compression");
+
+  if (srcBuf == NULL || width <= 0 || pitch < 0 || height <= 0 ||
+      pixelFormat < 0 || pixelFormat >= TJ_NUMPF || jpegBuf == NULL ||
+      jpegSize == NULL)
+    THROW("Invalid argument");
+
+  if (!this->lossless && this->quality == -1)
+    THROW("TJPARAM_QUALITY must be specified");
+  if (!this->lossless && this->subsamp == TJSAMP_UNKNOWN)
+    THROW("TJPARAM_SUBSAMP must be specified");
+
+  if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
+
+  if ((row_pointer = (_JSAMPROW *)malloc(sizeof(_JSAMPROW) * height)) == NULL)
+    THROW("Memory allocation failure");
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  cinfo->image_width = width;
+  cinfo->image_height = height;
+  cinfo->data_precision = BITS_IN_JSAMPLE;
+
+  setCompDefaults(this, pixelFormat);
+  if (this->noRealloc) {
+    alloc = FALSE;
+    *jpegSize = tj3JPEGBufSize(width, height, this->subsamp);
+  }
+  jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
+
+  jpeg_start_compress(cinfo, TRUE);
+  for (i = 0; i < height; i++) {
+    if (this->bottomUp)
+      row_pointer[i] = (_JSAMPROW)&srcBuf[(height - i - 1) * (size_t)pitch];
+    else
+      row_pointer[i] = (_JSAMPROW)&srcBuf[i * (size_t)pitch];
+  }
+  while (cinfo->next_scanline < cinfo->image_height)
+    _jpeg_write_scanlines(cinfo, &row_pointer[cinfo->next_scanline],
+                          cinfo->image_height - cinfo->next_scanline);
+  jpeg_finish_compress(cinfo);
+
+bailout:
+  if (cinfo->global_state > CSTATE_START && alloc)
+    (*cinfo->dest->term_destination) (cinfo);
+  if (cinfo->global_state > CSTATE_START || retval == -1)
+    jpeg_abort_compress(cinfo);
+  free(row_pointer);
+  if (this->jerr.warning) retval = -1;
+  return retval;
+}
+
+
+/******************************* Decompressor ********************************/
+
+/* TurboJPEG 3+ */
+DLLEXPORT int GET_NAME(tj3Decompress, BITS_IN_JSAMPLE)
+  (tjhandle handle, const unsigned char *jpegBuf, size_t jpegSize,
+   _JSAMPLE *dstBuf, int pitch, int pixelFormat)
+{
+  static const char FUNCTION_NAME[] =
+    GET_STRING(tj3Decompress, BITS_IN_JSAMPLE);
+  _JSAMPROW *row_pointer = NULL;
+  int croppedHeight, i, retval = 0;
+#if BITS_IN_JSAMPLE != 16
+  int scaledWidth;
+#endif
+  struct my_progress_mgr progress;
+
+  GET_DINSTANCE(handle);
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("Instance has not been initialized for decompression");
+
+  if (jpegBuf == NULL || jpegSize <= 0 || dstBuf == NULL || pitch < 0 ||
+      pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
+    THROW("Invalid argument");
+
+  if (this->scanLimit) {
+    memset(&progress, 0, sizeof(struct my_progress_mgr));
+    progress.pub.progress_monitor = my_progress_monitor;
+    progress.this = this;
+    dinfo->progress = &progress.pub;
+  } else
+    dinfo->progress = NULL;
+
+  dinfo->mem->max_memory_to_use = (long)this->maxMemory * 1048576L;
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  if (dinfo->global_state <= DSTATE_INHEADER) {
+    jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+    jpeg_read_header(dinfo, TRUE);
+  }
+  setDecompParameters(this);
+  if (this->maxPixels &&
+      (unsigned long long)this->jpegWidth * this->jpegHeight >
+      (unsigned long long)this->maxPixels)
+    THROW("Image is too large");
+  this->dinfo.out_color_space = pf2cs[pixelFormat];
+#if BITS_IN_JSAMPLE != 16
+  scaledWidth = TJSCALED(dinfo->image_width, this->scalingFactor);
+#endif
+  dinfo->do_fancy_upsampling = !this->fastUpsample;
+  this->dinfo.dct_method = this->fastDCT ? JDCT_FASTEST : JDCT_ISLOW;
+
+  dinfo->scale_num = this->scalingFactor.num;
+  dinfo->scale_denom = this->scalingFactor.denom;
+
+  jpeg_start_decompress(dinfo);
+
+#if BITS_IN_JSAMPLE != 16
+  if (this->croppingRegion.x != 0 ||
+      (this->croppingRegion.w != 0 && this->croppingRegion.w != scaledWidth)) {
+    JDIMENSION crop_x = this->croppingRegion.x;
+    JDIMENSION crop_w = this->croppingRegion.w;
+
+    _jpeg_crop_scanline(dinfo, &crop_x, &crop_w);
+    if ((int)crop_x != this->croppingRegion.x)
+      THROWI("Unexplained mismatch between specified (%d) and\n"
+             "actual (%d) cropping region left boundary",
+             this->croppingRegion.x, (int)crop_x);
+    if ((int)crop_w != this->croppingRegion.w)
+      THROWI("Unexplained mismatch between specified (%d) and\n"
+             "actual (%d) cropping region width",
+             this->croppingRegion.w, (int)crop_w);
+  }
+#endif
+
+  if (pitch == 0) pitch = dinfo->output_width * tjPixelSize[pixelFormat];
+
+  croppedHeight = dinfo->output_height;
+#if BITS_IN_JSAMPLE != 16
+  if (this->croppingRegion.y != 0 || this->croppingRegion.h != 0)
+    croppedHeight = this->croppingRegion.h;
+#endif
+  if ((row_pointer =
+       (_JSAMPROW *)malloc(sizeof(_JSAMPROW) * croppedHeight)) == NULL)
+    THROW("Memory allocation failure");
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+  for (i = 0; i < (int)croppedHeight; i++) {
+    if (this->bottomUp)
+      row_pointer[i] = &dstBuf[(croppedHeight - i - 1) * (size_t)pitch];
+    else
+      row_pointer[i] = &dstBuf[i * (size_t)pitch];
+  }
+
+#if BITS_IN_JSAMPLE != 16
+  if (this->croppingRegion.y != 0 || this->croppingRegion.h != 0) {
+    if (this->croppingRegion.y != 0) {
+      JDIMENSION lines = _jpeg_skip_scanlines(dinfo, this->croppingRegion.y);
+
+      if ((int)lines != this->croppingRegion.y)
+        THROWI("Unexplained mismatch between specified (%d) and\n"
+               "actual (%d) cropping region upper boundary",
+               this->croppingRegion.y, (int)lines);
+    }
+    while ((int)dinfo->output_scanline <
+           this->croppingRegion.y + this->croppingRegion.h)
+      _jpeg_read_scanlines(dinfo, &row_pointer[dinfo->output_scanline -
+                                               this->croppingRegion.y],
+                           this->croppingRegion.y + this->croppingRegion.h -
+                           dinfo->output_scanline);
+    if (this->croppingRegion.y + this->croppingRegion.h !=
+        (int)dinfo->output_height) {
+      JDIMENSION lines = _jpeg_skip_scanlines(dinfo, dinfo->output_height -
+                                                     this->croppingRegion.y -
+                                                     this->croppingRegion.h);
+
+      if (lines != dinfo->output_height - this->croppingRegion.y -
+                   this->croppingRegion.h)
+        THROWI("Unexplained mismatch between specified (%d) and\n"
+               "actual (%d) cropping region lower boundary",
+               this->croppingRegion.y + this->croppingRegion.h,
+               (int)(dinfo->output_height - lines));
+    }
+  } else
+#endif
+  {
+    while (dinfo->output_scanline < dinfo->output_height)
+      _jpeg_read_scanlines(dinfo, &row_pointer[dinfo->output_scanline],
+                           dinfo->output_height - dinfo->output_scanline);
+  }
+  jpeg_finish_decompress(dinfo);
+
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  free(row_pointer);
+  if (this->jerr.warning) retval = -1;
+  return retval;
+}
+
+
+/*************************** Packed-Pixel Image I/O **************************/
+
+/* TurboJPEG 3+ */
+DLLEXPORT _JSAMPLE *GET_NAME(tj3LoadImage, BITS_IN_JSAMPLE)
+  (tjhandle handle, const char *filename, int *width, int align, int *height,
+   int *pixelFormat)
+{
+  static const char FUNCTION_NAME[] =
+    GET_STRING(tj3LoadImage, BITS_IN_JSAMPLE);
+
+#if BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED)
+
+  int retval = 0, tempc;
+  size_t pitch;
+  tjhandle handle2 = NULL;
+  tjinstance *this2;
+  j_compress_ptr cinfo = NULL;
+  cjpeg_source_ptr src;
+  _JSAMPLE *dstBuf = NULL;
+  FILE *file = NULL;
+  boolean invert;
+
+  GET_TJINSTANCE(handle, NULL)
+
+  if (!filename || !width || align < 1 || !height || !pixelFormat ||
+      *pixelFormat < TJPF_UNKNOWN || *pixelFormat >= TJ_NUMPF)
+    THROW("Invalid argument");
+  if ((align & (align - 1)) != 0)
+    THROW("Alignment must be a power of 2");
+
+  /* The instance handle passed to this function is used only for parameter
+     retrieval.  Create a new temporary instance to avoid interfering with the
+     libjpeg state of the primary instance. */
+  if ((handle2 = tj3Init(TJINIT_COMPRESS)) == NULL) return NULL;
+  this2 = (tjinstance *)handle2;
+  cinfo = &this2->cinfo;
+
+#ifdef _MSC_VER
+  if (fopen_s(&file, filename, "rb") || file == NULL)
+#else
+  if ((file = fopen(filename, "rb")) == NULL)
+#endif
+    THROW_UNIX("Cannot open input file");
+
+  if ((tempc = getc(file)) < 0 || ungetc(tempc, file) == EOF)
+    THROW_UNIX("Could not read input file")
+  else if (tempc == EOF)
+    THROW("Input file contains no data");
+
+  if (setjmp(this2->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  cinfo->data_precision = BITS_IN_JSAMPLE;
+  if (*pixelFormat == TJPF_UNKNOWN) cinfo->in_color_space = JCS_UNKNOWN;
+  else cinfo->in_color_space = pf2cs[*pixelFormat];
+  if (tempc == 'B') {
+    if ((src = jinit_read_bmp(cinfo, FALSE)) == NULL)
+      THROW("Could not initialize bitmap loader");
+    invert = !this->bottomUp;
+  } else if (tempc == 'P') {
+    if ((src = _jinit_read_ppm(cinfo)) == NULL)
+      THROW("Could not initialize PPM loader");
+    invert = this->bottomUp;
+  } else
+    THROW("Unsupported file type");
+
+  cinfo->mem->max_memory_to_use = (long)this->maxMemory * 1048576L;
+
+  src->input_file = file;
+  /* Refuse to load images larger than the specified size. */
+  src->max_pixels = this->maxPixels;
+  (*src->start_input) (cinfo, src);
+  if (tempc == 'B') {
+    if (cinfo->X_density && cinfo->Y_density) {
+      this->xDensity = cinfo->X_density;
+      this->yDensity = cinfo->Y_density;
+      this->densityUnits = cinfo->density_unit;
+    }
+  }
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
+
+  *width = cinfo->image_width;  *height = cinfo->image_height;
+  *pixelFormat = cs2pf[cinfo->in_color_space];
+
+  pitch = PAD((*width) * tjPixelSize[*pixelFormat], align);
+  if (
+#if ULLONG_MAX > SIZE_MAX
+      (unsigned long long)pitch * (unsigned long long)(*height) >
+      (unsigned long long)((size_t)-1) ||
+#endif
+      (dstBuf = (_JSAMPLE *)malloc(pitch * (*height) *
+                                   sizeof(_JSAMPLE))) == NULL)
+    THROW("Memory allocation failure");
+
+  if (setjmp(this2->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  while (cinfo->next_scanline < cinfo->image_height) {
+    int i, nlines = (*src->get_pixel_rows) (cinfo, src);
+
+    for (i = 0; i < nlines; i++) {
+      _JSAMPLE *dstptr;
+      int row;
+
+      row = cinfo->next_scanline + i;
+      if (invert) dstptr = &dstBuf[((*height) - row - 1) * pitch];
+      else dstptr = &dstBuf[row * pitch];
+      memcpy(dstptr, src->_buffer[i],
+             (*width) * tjPixelSize[*pixelFormat] * sizeof(_JSAMPLE));
+    }
+    cinfo->next_scanline += nlines;
+  }
+
+  (*src->finish_input) (cinfo, src);
+
+bailout:
+  tj3Destroy(handle2);
+  if (file) fclose(file);
+  if (retval < 0) { free(dstBuf);  dstBuf = NULL; }
+  return dstBuf;
+
+#else /* BITS_IN_JSAMPLE != 16 || defined(C_LOSSLESS_SUPPORTED) */
+
+  static const char ERROR_MSG[] =
+    "16-bit data precision requires lossless JPEG,\n"
+    "which was disabled at build time.";
+  _JSAMPLE *retval = NULL;
+
+  GET_TJINSTANCE(handle, NULL)
+  SNPRINTF(this->errStr, JMSG_LENGTH_MAX, "%s(): %s", FUNCTION_NAME,
+           ERROR_MSG);
+  this->isInstanceError = TRUE;  THROWG(ERROR_MSG, NULL)
+
+bailout:
+  return retval;
+
+#endif
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int GET_NAME(tj3SaveImage, BITS_IN_JSAMPLE)
+  (tjhandle handle, const char *filename, const _JSAMPLE *buffer, int width,
+   int pitch, int height, int pixelFormat)
+{
+  static const char FUNCTION_NAME[] =
+    GET_STRING(tj3SaveImage, BITS_IN_JSAMPLE);
+  int retval = 0;
+
+#if BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED)
+
+  tjhandle handle2 = NULL;
+  tjinstance *this2;
+  j_decompress_ptr dinfo = NULL;
+  djpeg_dest_ptr dst;
+  FILE *file = NULL;
+  char *ptr = NULL;
+  boolean invert;
+
+  GET_TJINSTANCE(handle, -1)
+
+  if (!filename || !buffer || width < 1 || pitch < 0 || height < 1 ||
+      pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
+    THROW("Invalid argument");
+
+  /* The instance handle passed to this function is used only for parameter
+     retrieval.  Create a new temporary instance to avoid interfering with the
+     libjpeg state of the primary instance. */
+  if ((handle2 = tj3Init(TJINIT_DECOMPRESS)) == NULL)
+    return -1;
+  this2 = (tjinstance *)handle2;
+  dinfo = &this2->dinfo;
+
+#ifdef _MSC_VER
+  if (fopen_s(&file, filename, "wb") || file == NULL)
+#else
+  if ((file = fopen(filename, "wb")) == NULL)
+#endif
+    THROW_UNIX("Cannot open output file");
+
+  if (setjmp(this2->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  this2->dinfo.out_color_space = pf2cs[pixelFormat];
+  dinfo->image_width = width;  dinfo->image_height = height;
+  dinfo->global_state = DSTATE_READY;
+  dinfo->scale_num = dinfo->scale_denom = 1;
+  dinfo->data_precision = BITS_IN_JSAMPLE;
+
+  ptr = strrchr(filename, '.');
+  if (ptr && !strcasecmp(ptr, ".bmp")) {
+    if ((dst = jinit_write_bmp(dinfo, FALSE, FALSE)) == NULL)
+      THROW("Could not initialize bitmap writer");
+    invert = !this->bottomUp;
+    dinfo->X_density = (UINT16)this->xDensity;
+    dinfo->Y_density = (UINT16)this->yDensity;
+    dinfo->density_unit = (UINT8)this->densityUnits;
+  } else {
+    if ((dst = _jinit_write_ppm(dinfo)) == NULL)
+      THROW("Could not initialize PPM writer");
+    invert = this->bottomUp;
+  }
+
+  dinfo->mem->max_memory_to_use = (long)this->maxMemory * 1048576L;
+
+  dst->output_file = file;
+  (*dst->start_output) (dinfo, dst);
+  (*dinfo->mem->realize_virt_arrays) ((j_common_ptr)dinfo);
+
+  if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
+
+  while (dinfo->output_scanline < dinfo->output_height) {
+    _JSAMPLE *rowptr;
+
+    if (invert)
+      rowptr =
+        (_JSAMPLE *)&buffer[(height - dinfo->output_scanline - 1) * pitch];
+    else
+      rowptr = (_JSAMPLE *)&buffer[dinfo->output_scanline * pitch];
+    memcpy(dst->_buffer[0], rowptr,
+           width * tjPixelSize[pixelFormat] * sizeof(_JSAMPLE));
+    (*dst->put_pixel_rows) (dinfo, dst, 1);
+    dinfo->output_scanline++;
+  }
+
+  (*dst->finish_output) (dinfo, dst);
+
+bailout:
+  tj3Destroy(handle2);
+  if (file) fclose(file);
+  return retval;
+
+#else /* BITS_IN_JSAMPLE != 16 || defined(D_LOSSLESS_SUPPORTED) */
+
+  GET_TJINSTANCE(handle, -1)
+  THROW("16-bit data precision requires lossless JPEG,\n"
+        "which was disabled at build time.")
+bailout:
+  return retval;
+
+#endif
+}
+
+
+#undef _JSAMPLE
+#undef _JSAMPROW
+#undef _buffer
+#undef _jinit_read_ppm
+#undef _jinit_write_ppm
+#undef _jpeg_crop_scanline
+#undef _jpeg_read_scanlines
+#undef _jpeg_skip_scanlines
+#undef _jpeg_write_scanlines
diff --git a/3rdparty/libjpeg-turbo/src/turbojpeg.c b/3rdparty/libjpeg-turbo/src/turbojpeg.c
new file mode 100644
index 000000000000..3c936160dfbe
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/turbojpeg.c
@@ -0,0 +1,2921 @@
+/*
+ * Copyright (C)2009-2024 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2021 Alex Richardson.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* TurboJPEG/LJT:  this implements the TurboJPEG API using libjpeg or
+   libjpeg-turbo */
+
+#include <ctype.h>
+#include <limits.h>
+#if !defined(_MSC_VER) || _MSC_VER > 1600
+#include <stdint.h>
+#endif
+#include <jinclude.h>
+#define JPEG_INTERNALS
+#include <jpeglib.h>
+#include <jerror.h>
+#include <setjmp.h>
+#include <errno.h>
+#include "./turbojpeg.h"
+#include "./tjutil.h"
+#include "transupp.h"
+#include "./jpegapicomp.h"
+#include "./cdjpeg.h"
+
+extern void jpeg_mem_dest_tj(j_compress_ptr, unsigned char **, size_t *,
+                             boolean);
+extern void jpeg_mem_src_tj(j_decompress_ptr, const unsigned char *, size_t);
+
+#define PAD(v, p)  ((v + (p) - 1) & (~((p) - 1)))
+#define IS_POW2(x)  (((x) & (x - 1)) == 0)
+
+
+/* Error handling (based on example in example.c) */
+
+static THREAD_LOCAL char errStr[JMSG_LENGTH_MAX] = "No error";
+
+struct my_error_mgr {
+  struct jpeg_error_mgr pub;
+  jmp_buf setjmp_buffer;
+  void (*emit_message) (j_common_ptr, int);
+  boolean warning, stopOnWarning;
+};
+typedef struct my_error_mgr *my_error_ptr;
+
+#define JMESSAGE(code, string)  string,
+static const char *turbojpeg_message_table[] = {
+#include "cderror.h"
+  NULL
+};
+
+static void my_error_exit(j_common_ptr cinfo)
+{
+  my_error_ptr myerr = (my_error_ptr)cinfo->err;
+
+  (*cinfo->err->output_message) (cinfo);
+  longjmp(myerr->setjmp_buffer, 1);
+}
+
+/* Based on output_message() in jerror.c */
+
+static void my_output_message(j_common_ptr cinfo)
+{
+  (*cinfo->err->format_message) (cinfo, errStr);
+}
+
+static void my_emit_message(j_common_ptr cinfo, int msg_level)
+{
+  my_error_ptr myerr = (my_error_ptr)cinfo->err;
+
+  myerr->emit_message(cinfo, msg_level);
+  if (msg_level < 0) {
+    myerr->warning = TRUE;
+    if (myerr->stopOnWarning) longjmp(myerr->setjmp_buffer, 1);
+  }
+}
+
+
+/********************** Global structures, macros, etc. **********************/
+
+enum { COMPRESS = 1, DECOMPRESS = 2 };
+
+typedef struct _tjinstance {
+  struct jpeg_compress_struct cinfo;
+  struct jpeg_decompress_struct dinfo;
+  struct my_error_mgr jerr;
+  int init;
+  char errStr[JMSG_LENGTH_MAX];
+  boolean isInstanceError;
+  /* Parameters */
+  boolean bottomUp;
+  boolean noRealloc;
+  int quality;
+  int subsamp;
+  int jpegWidth;
+  int jpegHeight;
+  int precision;
+  int colorspace;
+  boolean fastUpsample;
+  boolean fastDCT;
+  boolean optimize;
+  boolean progressive;
+  int scanLimit;
+  boolean arithmetic;
+  boolean lossless;
+  int losslessPSV;
+  int losslessPt;
+  int restartIntervalBlocks;
+  int restartIntervalRows;
+  int xDensity;
+  int yDensity;
+  int densityUnits;
+  tjscalingfactor scalingFactor;
+  tjregion croppingRegion;
+  int maxMemory;
+  int maxPixels;
+} tjinstance;
+
+static tjhandle _tjInitCompress(tjinstance *this);
+static tjhandle _tjInitDecompress(tjinstance *this);
+
+struct my_progress_mgr {
+  struct jpeg_progress_mgr pub;
+  tjinstance *this;
+};
+typedef struct my_progress_mgr *my_progress_ptr;
+
+static void my_progress_monitor(j_common_ptr dinfo)
+{
+  my_error_ptr myerr = (my_error_ptr)dinfo->err;
+  my_progress_ptr myprog = (my_progress_ptr)dinfo->progress;
+
+  if (dinfo->is_decompressor) {
+    int scan_no = ((j_decompress_ptr)dinfo)->input_scan_number;
+
+    if (scan_no > myprog->this->scanLimit) {
+      SNPRINTF(myprog->this->errStr, JMSG_LENGTH_MAX,
+               "Progressive JPEG image has more than %d scans",
+               myprog->this->scanLimit);
+      SNPRINTF(errStr, JMSG_LENGTH_MAX,
+               "Progressive JPEG image has more than %d scans",
+               myprog->this->scanLimit);
+      myprog->this->isInstanceError = TRUE;
+      myerr->warning = FALSE;
+      longjmp(myerr->setjmp_buffer, 1);
+    }
+  }
+}
+
+static const JXFORM_CODE xformtypes[TJ_NUMXOP] = {
+  JXFORM_NONE, JXFORM_FLIP_H, JXFORM_FLIP_V, JXFORM_TRANSPOSE,
+  JXFORM_TRANSVERSE, JXFORM_ROT_90, JXFORM_ROT_180, JXFORM_ROT_270
+};
+
+#define NUMSF  16
+static const tjscalingfactor sf[NUMSF] = {
+  { 2, 1 },
+  { 15, 8 },
+  { 7, 4 },
+  { 13, 8 },
+  { 3, 2 },
+  { 11, 8 },
+  { 5, 4 },
+  { 9, 8 },
+  { 1, 1 },
+  { 7, 8 },
+  { 3, 4 },
+  { 5, 8 },
+  { 1, 2 },
+  { 3, 8 },
+  { 1, 4 },
+  { 1, 8 }
+};
+
+static J_COLOR_SPACE pf2cs[TJ_NUMPF] = {
+  JCS_EXT_RGB, JCS_EXT_BGR, JCS_EXT_RGBX, JCS_EXT_BGRX, JCS_EXT_XBGR,
+  JCS_EXT_XRGB, JCS_GRAYSCALE, JCS_EXT_RGBA, JCS_EXT_BGRA, JCS_EXT_ABGR,
+  JCS_EXT_ARGB, JCS_CMYK
+};
+
+static int cs2pf[JPEG_NUMCS] = {
+  TJPF_UNKNOWN, TJPF_GRAY,
+#if RGB_RED == 0 && RGB_GREEN == 1 && RGB_BLUE == 2 && RGB_PIXELSIZE == 3
+  TJPF_RGB,
+#elif RGB_RED == 2 && RGB_GREEN == 1 && RGB_BLUE == 0 && RGB_PIXELSIZE == 3
+  TJPF_BGR,
+#elif RGB_RED == 0 && RGB_GREEN == 1 && RGB_BLUE == 2 && RGB_PIXELSIZE == 4
+  TJPF_RGBX,
+#elif RGB_RED == 2 && RGB_GREEN == 1 && RGB_BLUE == 0 && RGB_PIXELSIZE == 4
+  TJPF_BGRX,
+#elif RGB_RED == 3 && RGB_GREEN == 2 && RGB_BLUE == 1 && RGB_PIXELSIZE == 4
+  TJPF_XBGR,
+#elif RGB_RED == 1 && RGB_GREEN == 2 && RGB_BLUE == 3 && RGB_PIXELSIZE == 4
+  TJPF_XRGB,
+#endif
+  TJPF_UNKNOWN, TJPF_CMYK, TJPF_UNKNOWN, TJPF_RGB, TJPF_RGBX, TJPF_BGR,
+  TJPF_BGRX, TJPF_XBGR, TJPF_XRGB, TJPF_RGBA, TJPF_BGRA, TJPF_ABGR, TJPF_ARGB,
+  TJPF_UNKNOWN
+};
+
+#define THROWG(m, rv) { \
+  SNPRINTF(errStr, JMSG_LENGTH_MAX, "%s(): %s", FUNCTION_NAME, m); \
+  retval = rv;  goto bailout; \
+}
+#ifdef _MSC_VER
+#define THROW_UNIX(m) { \
+  char strerrorBuf[80] = { 0 }; \
+  strerror_s(strerrorBuf, 80, errno); \
+  SNPRINTF(this->errStr, JMSG_LENGTH_MAX, "%s(): %s\n%s", FUNCTION_NAME, m, \
+           strerrorBuf); \
+  this->isInstanceError = TRUE; \
+  SNPRINTF(errStr, JMSG_LENGTH_MAX, "%s(): %s\n%s", FUNCTION_NAME, m, \
+           strerrorBuf); \
+  retval = -1;  goto bailout; \
+}
+#else
+#define THROW_UNIX(m) { \
+  SNPRINTF(this->errStr, JMSG_LENGTH_MAX, "%s(): %s\n%s", FUNCTION_NAME, m, \
+           strerror(errno)); \
+  this->isInstanceError = TRUE; \
+  SNPRINTF(errStr, JMSG_LENGTH_MAX, "%s(): %s\n%s", FUNCTION_NAME, m, \
+           strerror(errno)); \
+  retval = -1;  goto bailout; \
+}
+#endif
+#define THROW(m) { \
+  SNPRINTF(this->errStr, JMSG_LENGTH_MAX, "%s(): %s", FUNCTION_NAME, m); \
+  this->isInstanceError = TRUE;  THROWG(m, -1) \
+}
+#define THROWI(format, val1, val2) { \
+  SNPRINTF(this->errStr, JMSG_LENGTH_MAX, "%s(): " format, FUNCTION_NAME, \
+           val1, val2); \
+  this->isInstanceError = TRUE; \
+  SNPRINTF(errStr, JMSG_LENGTH_MAX, "%s(): " format, FUNCTION_NAME, val1, \
+           val2); \
+  retval = -1;  goto bailout; \
+}
+
+#define GET_INSTANCE(handle) \
+  tjinstance *this = (tjinstance *)handle; \
+  j_compress_ptr cinfo = NULL; \
+  j_decompress_ptr dinfo = NULL; \
+  \
+  if (!this) { \
+    SNPRINTF(errStr, JMSG_LENGTH_MAX, "%s(): Invalid handle", FUNCTION_NAME); \
+    return -1; \
+  } \
+  cinfo = &this->cinfo;  dinfo = &this->dinfo; \
+  this->jerr.warning = FALSE; \
+  this->isInstanceError = FALSE;
+
+#define GET_CINSTANCE(handle) \
+  tjinstance *this = (tjinstance *)handle; \
+  j_compress_ptr cinfo = NULL; \
+  \
+  if (!this) { \
+    SNPRINTF(errStr, JMSG_LENGTH_MAX, "%s(): Invalid handle", FUNCTION_NAME); \
+    return -1; \
+  } \
+  cinfo = &this->cinfo; \
+  this->jerr.warning = FALSE; \
+  this->isInstanceError = FALSE;
+
+#define GET_DINSTANCE(handle) \
+  tjinstance *this = (tjinstance *)handle; \
+  j_decompress_ptr dinfo = NULL; \
+  \
+  if (!this) { \
+    SNPRINTF(errStr, JMSG_LENGTH_MAX, "%s(): Invalid handle", FUNCTION_NAME); \
+    return -1; \
+  } \
+  dinfo = &this->dinfo; \
+  this->jerr.warning = FALSE; \
+  this->isInstanceError = FALSE;
+
+#define GET_TJINSTANCE(handle, errorReturn) \
+  tjinstance *this = (tjinstance *)handle; \
+  \
+  if (!this) { \
+    SNPRINTF(errStr, JMSG_LENGTH_MAX, "%s(): Invalid handle", FUNCTION_NAME); \
+    return errorReturn; \
+  } \
+  this->jerr.warning = FALSE; \
+  this->isInstanceError = FALSE;
+
+static int getPixelFormat(int pixelSize, int flags)
+{
+  if (pixelSize == 1) return TJPF_GRAY;
+  if (pixelSize == 3) {
+    if (flags & TJ_BGR) return TJPF_BGR;
+    else return TJPF_RGB;
+  }
+  if (pixelSize == 4) {
+    if (flags & TJ_ALPHAFIRST) {
+      if (flags & TJ_BGR) return TJPF_XBGR;
+      else return TJPF_XRGB;
+    } else {
+      if (flags & TJ_BGR) return TJPF_BGRX;
+      else return TJPF_RGBX;
+    }
+  }
+  return -1;
+}
+
+static void setCompDefaults(tjinstance *this, int pixelFormat)
+{
+  this->cinfo.in_color_space = pf2cs[pixelFormat];
+  this->cinfo.input_components = tjPixelSize[pixelFormat];
+  jpeg_set_defaults(&this->cinfo);
+
+  this->cinfo.restart_interval = this->restartIntervalBlocks;
+  this->cinfo.restart_in_rows = this->restartIntervalRows;
+  this->cinfo.X_density = (UINT16)this->xDensity;
+  this->cinfo.Y_density = (UINT16)this->yDensity;
+  this->cinfo.density_unit = (UINT8)this->densityUnits;
+  this->cinfo.mem->max_memory_to_use = (long)this->maxMemory * 1048576L;
+
+  if (this->lossless) {
+#ifdef C_LOSSLESS_SUPPORTED
+    jpeg_enable_lossless(&this->cinfo, this->losslessPSV, this->losslessPt);
+#endif
+    if (pixelFormat == TJPF_GRAY)
+      this->subsamp = TJSAMP_GRAY;
+    else if (this->subsamp != TJSAMP_GRAY)
+      this->subsamp = TJSAMP_444;
+    return;
+  }
+
+  jpeg_set_quality(&this->cinfo, this->quality, TRUE);
+  this->cinfo.dct_method = this->fastDCT ? JDCT_FASTEST : JDCT_ISLOW;
+
+  switch (this->colorspace) {
+  case TJCS_RGB:
+    jpeg_set_colorspace(&this->cinfo, JCS_RGB);  break;
+  case TJCS_YCbCr:
+    jpeg_set_colorspace(&this->cinfo, JCS_YCbCr);  break;
+  case TJCS_GRAY:
+    jpeg_set_colorspace(&this->cinfo, JCS_GRAYSCALE);  break;
+  case TJCS_CMYK:
+    jpeg_set_colorspace(&this->cinfo, JCS_CMYK);  break;
+  case TJCS_YCCK:
+    jpeg_set_colorspace(&this->cinfo, JCS_YCCK);  break;
+  default:
+    if (this->subsamp == TJSAMP_GRAY)
+      jpeg_set_colorspace(&this->cinfo, JCS_GRAYSCALE);
+    else if (pixelFormat == TJPF_CMYK)
+      jpeg_set_colorspace(&this->cinfo, JCS_YCCK);
+    else
+      jpeg_set_colorspace(&this->cinfo, JCS_YCbCr);
+  }
+
+  if (this->cinfo.data_precision == 8)
+    this->cinfo.optimize_coding = this->optimize;
+#ifdef C_PROGRESSIVE_SUPPORTED
+  if (this->progressive) jpeg_simple_progression(&this->cinfo);
+#endif
+  this->cinfo.arith_code = this->arithmetic;
+
+  this->cinfo.comp_info[0].h_samp_factor = tjMCUWidth[this->subsamp] / 8;
+  this->cinfo.comp_info[1].h_samp_factor = 1;
+  this->cinfo.comp_info[2].h_samp_factor = 1;
+  if (this->cinfo.num_components > 3)
+    this->cinfo.comp_info[3].h_samp_factor = tjMCUWidth[this->subsamp] / 8;
+  this->cinfo.comp_info[0].v_samp_factor = tjMCUHeight[this->subsamp] / 8;
+  this->cinfo.comp_info[1].v_samp_factor = 1;
+  this->cinfo.comp_info[2].v_samp_factor = 1;
+  if (this->cinfo.num_components > 3)
+    this->cinfo.comp_info[3].v_samp_factor = tjMCUHeight[this->subsamp] / 8;
+}
+
+
+static int getSubsamp(j_decompress_ptr dinfo)
+{
+  int retval = TJSAMP_UNKNOWN, i, k;
+
+  /* The sampling factors actually have no meaning with grayscale JPEG files,
+     and in fact it's possible to generate grayscale JPEGs with sampling
+     factors > 1 (even though those sampling factors are ignored by the
+     decompressor.)  Thus, we need to treat grayscale as a special case. */
+  if (dinfo->num_components == 1 && dinfo->jpeg_color_space == JCS_GRAYSCALE)
+    return TJSAMP_GRAY;
+
+  for (i = 0; i < TJ_NUMSAMP; i++) {
+    if (i == TJSAMP_GRAY) continue;
+
+    if (dinfo->num_components == 3 ||
+        ((dinfo->jpeg_color_space == JCS_YCCK ||
+          dinfo->jpeg_color_space == JCS_CMYK) &&
+         dinfo->num_components == 4)) {
+      if (dinfo->comp_info[0].h_samp_factor == tjMCUWidth[i] / 8 &&
+          dinfo->comp_info[0].v_samp_factor == tjMCUHeight[i] / 8) {
+        int match = 0;
+
+        for (k = 1; k < dinfo->num_components; k++) {
+          int href = 1, vref = 1;
+
+          if ((dinfo->jpeg_color_space == JCS_YCCK ||
+               dinfo->jpeg_color_space == JCS_CMYK) && k == 3) {
+            href = tjMCUWidth[i] / 8;  vref = tjMCUHeight[i] / 8;
+          }
+          if (dinfo->comp_info[k].h_samp_factor == href &&
+              dinfo->comp_info[k].v_samp_factor == vref)
+            match++;
+        }
+        if (match == dinfo->num_components - 1) {
+          retval = i;  break;
+        }
+      }
+      /* Handle 4:2:2 and 4:4:0 images whose sampling factors are specified
+         in non-standard ways. */
+      if (dinfo->comp_info[0].h_samp_factor == 2 &&
+          dinfo->comp_info[0].v_samp_factor == 2 &&
+          (i == TJSAMP_422 || i == TJSAMP_440)) {
+        int match = 0;
+
+        for (k = 1; k < dinfo->num_components; k++) {
+          int href = tjMCUHeight[i] / 8, vref = tjMCUWidth[i] / 8;
+
+          if ((dinfo->jpeg_color_space == JCS_YCCK ||
+               dinfo->jpeg_color_space == JCS_CMYK) && k == 3) {
+            href = vref = 2;
+          }
+          if (dinfo->comp_info[k].h_samp_factor == href &&
+              dinfo->comp_info[k].v_samp_factor == vref)
+            match++;
+        }
+        if (match == dinfo->num_components - 1) {
+          retval = i;  break;
+        }
+      }
+      /* Handle 4:4:4 images whose sampling factors are specified in
+         non-standard ways. */
+      if (dinfo->comp_info[0].h_samp_factor *
+          dinfo->comp_info[0].v_samp_factor <=
+          D_MAX_BLOCKS_IN_MCU / 3 && i == TJSAMP_444) {
+        int match = 0;
+        for (k = 1; k < dinfo->num_components; k++) {
+          if (dinfo->comp_info[k].h_samp_factor ==
+              dinfo->comp_info[0].h_samp_factor &&
+              dinfo->comp_info[k].v_samp_factor ==
+              dinfo->comp_info[0].v_samp_factor)
+            match++;
+          if (match == dinfo->num_components - 1) {
+            retval = i;  break;
+          }
+        }
+      }
+    }
+  }
+  return retval;
+}
+
+
+static void setDecompParameters(tjinstance *this)
+{
+  this->subsamp = getSubsamp(&this->dinfo);
+  this->jpegWidth = this->dinfo.image_width;
+  this->jpegHeight = this->dinfo.image_height;
+  this->precision = this->dinfo.data_precision;
+  switch (this->dinfo.jpeg_color_space) {
+  case JCS_GRAYSCALE:  this->colorspace = TJCS_GRAY;  break;
+  case JCS_RGB:        this->colorspace = TJCS_RGB;  break;
+  case JCS_YCbCr:      this->colorspace = TJCS_YCbCr;  break;
+  case JCS_CMYK:       this->colorspace = TJCS_CMYK;  break;
+  case JCS_YCCK:       this->colorspace = TJCS_YCCK;  break;
+  default:             this->colorspace = -1;  break;
+  }
+  this->progressive = this->dinfo.progressive_mode;
+  this->arithmetic = this->dinfo.arith_code;
+  this->lossless = this->dinfo.master->lossless;
+  this->losslessPSV = this->dinfo.Ss;
+  this->losslessPt = this->dinfo.Al;
+  this->xDensity = this->dinfo.X_density;
+  this->yDensity = this->dinfo.Y_density;
+  this->densityUnits = this->dinfo.density_unit;
+}
+
+
+static void processFlags(tjhandle handle, int flags, int operation)
+{
+  tjinstance *this = (tjinstance *)handle;
+
+  this->bottomUp = !!(flags & TJFLAG_BOTTOMUP);
+
+#ifndef NO_PUTENV
+  if (flags & TJFLAG_FORCEMMX) PUTENV_S("JSIMD_FORCEMMX", "1");
+  else if (flags & TJFLAG_FORCESSE) PUTENV_S("JSIMD_FORCESSE", "1");
+  else if (flags & TJFLAG_FORCESSE2) PUTENV_S("JSIMD_FORCESSE2", "1");
+#endif
+
+  this->fastUpsample = !!(flags & TJFLAG_FASTUPSAMPLE);
+  this->noRealloc = !!(flags & TJFLAG_NOREALLOC);
+
+  if (operation == COMPRESS) {
+    if (this->quality >= 96 || flags & TJFLAG_ACCURATEDCT)
+      this->fastDCT = FALSE;
+    else
+      this->fastDCT = TRUE;
+  } else
+    this->fastDCT = !!(flags & TJFLAG_FASTDCT);
+
+  this->jerr.stopOnWarning = !!(flags & TJFLAG_STOPONWARNING);
+  this->progressive = !!(flags & TJFLAG_PROGRESSIVE);
+
+  if (flags & TJFLAG_LIMITSCANS) this->scanLimit = 500;
+}
+
+
+/*************************** General API functions ***************************/
+
+/* TurboJPEG 3+ */
+DLLEXPORT tjhandle tj3Init(int initType)
+{
+  static const char FUNCTION_NAME[] = "tj3Init";
+  tjinstance *this = NULL;
+  tjhandle retval = NULL;
+
+  if (initType < 0 || initType >= TJ_NUMINIT)
+    THROWG("Invalid argument", NULL);
+
+  if ((this = (tjinstance *)malloc(sizeof(tjinstance))) == NULL)
+    THROWG("Memory allocation failure", NULL);
+  memset(this, 0, sizeof(tjinstance));
+  SNPRINTF(this->errStr, JMSG_LENGTH_MAX, "No error");
+
+  this->quality = -1;
+  this->subsamp = TJSAMP_UNKNOWN;
+  this->jpegWidth = -1;
+  this->jpegHeight = -1;
+  this->precision = 8;
+  this->colorspace = -1;
+  this->losslessPSV = 1;
+  this->xDensity = 1;
+  this->yDensity = 1;
+  this->scalingFactor = TJUNSCALED;
+
+  switch (initType) {
+  case TJINIT_COMPRESS:  return _tjInitCompress(this);
+  case TJINIT_DECOMPRESS:  return _tjInitDecompress(this);
+  case TJINIT_TRANSFORM:
+    retval = _tjInitCompress(this);
+    if (!retval) return NULL;
+    retval = _tjInitDecompress(this);
+    return retval;
+  }
+
+bailout:
+  return retval;
+}
+
+
+#define SET_PARAM(field, minValue, maxValue) { \
+  if (value < minValue || (maxValue > 0 && value > maxValue)) \
+    THROW("Parameter value out of range"); \
+  this->field = value; \
+}
+
+#define SET_BOOL_PARAM(field) { \
+  if (value < 0 || value > 1) \
+    THROW("Parameter value out of range"); \
+  this->field = (boolean)value; \
+}
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3Set(tjhandle handle, int param, int value)
+{
+  static const char FUNCTION_NAME[] = "tj3Set";
+  int retval = 0;
+
+  GET_TJINSTANCE(handle, -1);
+
+  switch (param) {
+  case TJPARAM_STOPONWARNING:
+    SET_BOOL_PARAM(jerr.stopOnWarning);
+    break;
+  case TJPARAM_BOTTOMUP:
+    SET_BOOL_PARAM(bottomUp);
+    break;
+  case TJPARAM_NOREALLOC:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_NOREALLOC is not applicable to decompression instances.");
+    SET_BOOL_PARAM(noRealloc);
+    break;
+  case TJPARAM_QUALITY:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_QUALITY is not applicable to decompression instances.");
+    SET_PARAM(quality, 1, 100);
+    break;
+  case TJPARAM_SUBSAMP:
+    SET_PARAM(subsamp, 0, TJ_NUMSAMP - 1);
+    break;
+  case TJPARAM_JPEGWIDTH:
+    if (!(this->init & DECOMPRESS))
+      THROW("TJPARAM_JPEGWIDTH is not applicable to compression instances.");
+    THROW("TJPARAM_JPEGWIDTH is read-only in decompression instances.");
+    break;
+  case TJPARAM_JPEGHEIGHT:
+    if (!(this->init & DECOMPRESS))
+      THROW("TJPARAM_JPEGHEIGHT is not applicable to compression instances.");
+    THROW("TJPARAM_JPEGHEIGHT is read-only in decompression instances.");
+    break;
+  case TJPARAM_PRECISION:
+    if (!(this->init & DECOMPRESS))
+      THROW("TJPARAM_PRECISION is not applicable to compression instances.");
+    THROW("TJPARAM_PRECISION is read-only in decompression instances.");
+    break;
+  case TJPARAM_COLORSPACE:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_COLORSPACE is read-only in decompression instances.");
+    SET_PARAM(colorspace, 0, TJ_NUMCS - 1);
+    break;
+  case TJPARAM_FASTUPSAMPLE:
+    if (!(this->init & DECOMPRESS))
+      THROW("TJPARAM_FASTUPSAMPLE is not applicable to compression instances.");
+    SET_BOOL_PARAM(fastUpsample);
+    break;
+  case TJPARAM_FASTDCT:
+    SET_BOOL_PARAM(fastDCT);
+    break;
+  case TJPARAM_OPTIMIZE:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_OPTIMIZE is not applicable to decompression instances.");
+    SET_BOOL_PARAM(optimize);
+    break;
+  case TJPARAM_PROGRESSIVE:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_PROGRESSIVE is read-only in decompression instances.");
+    SET_BOOL_PARAM(progressive);
+    break;
+  case TJPARAM_SCANLIMIT:
+    if (!(this->init & DECOMPRESS))
+      THROW("TJPARAM_SCANLIMIT is not applicable to compression instances.");
+    SET_PARAM(scanLimit, 0, -1);
+    break;
+  case TJPARAM_ARITHMETIC:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_ARITHMETIC is read-only in decompression instances.");
+    SET_BOOL_PARAM(arithmetic);
+    break;
+  case TJPARAM_LOSSLESS:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_LOSSLESS is read-only in decompression instances.");
+    SET_BOOL_PARAM(lossless);
+    break;
+  case TJPARAM_LOSSLESSPSV:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_LOSSLESSPSV is read-only in decompression instances.");
+    SET_PARAM(losslessPSV, 1, 7);
+    break;
+  case TJPARAM_LOSSLESSPT:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_LOSSLESSPT is read-only in decompression instances.");
+    SET_PARAM(losslessPt, 0, this->precision - 1);
+    break;
+  case TJPARAM_RESTARTBLOCKS:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_RESTARTBLOCKS is not applicable to decompression instances.");
+    SET_PARAM(restartIntervalBlocks, 0, 65535);
+    if (value != 0) this->restartIntervalRows = 0;
+    break;
+  case TJPARAM_RESTARTROWS:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_RESTARTROWS is not applicable to decompression instances.");
+    SET_PARAM(restartIntervalRows, 0, 65535);
+    if (value != 0) this->restartIntervalBlocks = 0;
+    break;
+  case TJPARAM_XDENSITY:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_XDENSITY is read-only in decompression instances.");
+    SET_PARAM(xDensity, 1, 65535);
+    break;
+  case TJPARAM_YDENSITY:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_YDENSITY is read-only in decompression instances.");
+    SET_PARAM(yDensity, 1, 65535);
+    break;
+  case TJPARAM_DENSITYUNITS:
+    if (!(this->init & COMPRESS))
+      THROW("TJPARAM_DENSITYUNITS is read-only in decompression instances.");
+    SET_PARAM(densityUnits, 0, 2);
+    break;
+  case TJPARAM_MAXMEMORY:
+    SET_PARAM(maxMemory, 0, (int)(min(LONG_MAX / 1048576L, (long)INT_MAX)));
+    break;
+  case TJPARAM_MAXPIXELS:
+    SET_PARAM(maxPixels, 0, -1);
+    break;
+  default:
+    THROW("Invalid parameter");
+  }
+
+bailout:
+  return retval;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3Get(tjhandle handle, int param)
+{
+  tjinstance *this = (tjinstance *)handle;
+  if (!this) return -1;
+
+  switch (param) {
+  case TJPARAM_STOPONWARNING:
+    return this->jerr.stopOnWarning;
+  case TJPARAM_BOTTOMUP:
+    return this->bottomUp;
+  case TJPARAM_NOREALLOC:
+    return this->noRealloc;
+  case TJPARAM_QUALITY:
+    return this->quality;
+  case TJPARAM_SUBSAMP:
+    return this->subsamp;
+  case TJPARAM_JPEGWIDTH:
+    return this->jpegWidth;
+  case TJPARAM_JPEGHEIGHT:
+    return this->jpegHeight;
+  case TJPARAM_PRECISION:
+    return this->precision;
+  case TJPARAM_COLORSPACE:
+    return this->colorspace;
+  case TJPARAM_FASTUPSAMPLE:
+    return this->fastUpsample;
+  case TJPARAM_FASTDCT:
+    return this->fastDCT;
+  case TJPARAM_OPTIMIZE:
+    return this->optimize;
+  case TJPARAM_PROGRESSIVE:
+    return this->progressive;
+  case TJPARAM_SCANLIMIT:
+    return this->scanLimit;
+  case TJPARAM_ARITHMETIC:
+    return this->arithmetic;
+  case TJPARAM_LOSSLESS:
+    return this->lossless;
+  case TJPARAM_LOSSLESSPSV:
+    return this->losslessPSV;
+  case TJPARAM_LOSSLESSPT:
+    return this->losslessPt;
+  case TJPARAM_RESTARTBLOCKS:
+    return this->restartIntervalBlocks;
+  case TJPARAM_RESTARTROWS:
+    return this->restartIntervalRows;
+  case TJPARAM_XDENSITY:
+    return this->xDensity;
+  case TJPARAM_YDENSITY:
+    return this->yDensity;
+  case TJPARAM_DENSITYUNITS:
+    return this->densityUnits;
+  case TJPARAM_MAXMEMORY:
+    return this->maxMemory;
+  case TJPARAM_MAXPIXELS:
+    return this->maxPixels;
+  }
+
+  return -1;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT char *tj3GetErrorStr(tjhandle handle)
+{
+  tjinstance *this = (tjinstance *)handle;
+
+  if (this && this->isInstanceError) {
+    this->isInstanceError = FALSE;
+    return this->errStr;
+  } else
+    return errStr;
+}
+
+/* TurboJPEG 2.0+ */
+DLLEXPORT char *tjGetErrorStr2(tjhandle handle)
+{
+  return tj3GetErrorStr(handle);
+}
+
+/* TurboJPEG 1.0+ */
+DLLEXPORT char *tjGetErrorStr(void)
+{
+  return errStr;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3GetErrorCode(tjhandle handle)
+{
+  tjinstance *this = (tjinstance *)handle;
+
+  if (this && this->jerr.warning) return TJERR_WARNING;
+  else return TJERR_FATAL;
+}
+
+/* TurboJPEG 2.0+ */
+DLLEXPORT int tjGetErrorCode(tjhandle handle)
+{
+  return tj3GetErrorCode(handle);
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT void tj3Destroy(tjhandle handle)
+{
+  tjinstance *this = (tjinstance *)handle;
+  j_compress_ptr cinfo = NULL;
+  j_decompress_ptr dinfo = NULL;
+
+  if (!this) return;
+
+  cinfo = &this->cinfo;  dinfo = &this->dinfo;
+  this->jerr.warning = FALSE;
+  this->isInstanceError = FALSE;
+
+  if (setjmp(this->jerr.setjmp_buffer)) return;
+  if (this->init & COMPRESS) jpeg_destroy_compress(cinfo);
+  if (this->init & DECOMPRESS) jpeg_destroy_decompress(dinfo);
+  free(this);
+}
+
+/* TurboJPEG 1.0+ */
+DLLEXPORT int tjDestroy(tjhandle handle)
+{
+  static const char FUNCTION_NAME[] = "tjDestroy";
+  int retval = 0;
+
+  if (!handle) THROWG("Invalid handle", -1);
+
+  SNPRINTF(errStr, JMSG_LENGTH_MAX, "No error");
+  tj3Destroy(handle);
+  if (strcmp(errStr, "No error")) retval = -1;
+
+bailout:
+  return retval;
+}
+
+
+/* These are exposed mainly because Windows can't malloc() and free() across
+   DLL boundaries except when the CRT DLL is used, and we don't use the CRT DLL
+   with turbojpeg.dll for compatibility reasons.  However, these functions
+   can potentially be used for other purposes by different implementations. */
+
+/* TurboJPEG 3+ */
+DLLEXPORT void tj3Free(void *buf)
+{
+  free(buf);
+}
+
+/* TurboJPEG 1.2+ */
+DLLEXPORT void tjFree(unsigned char *buf)
+{
+  tj3Free(buf);
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT void *tj3Alloc(size_t bytes)
+{
+  return malloc(bytes);
+}
+
+/* TurboJPEG 1.2+ */
+DLLEXPORT unsigned char *tjAlloc(int bytes)
+{
+  return (unsigned char *)tj3Alloc((size_t)bytes);
+}
+
+
+/******************************** Compressor *********************************/
+
+static tjhandle _tjInitCompress(tjinstance *this)
+{
+  static unsigned char buffer[1];
+  unsigned char *buf = buffer;
+  size_t size = 1;
+
+  /* This is also straight out of example.c */
+  this->cinfo.err = jpeg_std_error(&this->jerr.pub);
+  this->jerr.pub.error_exit = my_error_exit;
+  this->jerr.pub.output_message = my_output_message;
+  this->jerr.emit_message = this->jerr.pub.emit_message;
+  this->jerr.pub.emit_message = my_emit_message;
+  this->jerr.pub.addon_message_table = turbojpeg_message_table;
+  this->jerr.pub.first_addon_message = JMSG_FIRSTADDONCODE;
+  this->jerr.pub.last_addon_message = JMSG_LASTADDONCODE;
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    free(this);
+    return NULL;
+  }
+
+  jpeg_create_compress(&this->cinfo);
+  /* Make an initial call so it will create the destination manager */
+  jpeg_mem_dest_tj(&this->cinfo, &buf, &size, 0);
+
+  this->init |= COMPRESS;
+  return (tjhandle)this;
+}
+
+/* TurboJPEG 1.0+ */
+DLLEXPORT tjhandle tjInitCompress(void)
+{
+  return tj3Init(TJINIT_COMPRESS);
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT size_t tj3JPEGBufSize(int width, int height, int jpegSubsamp)
+{
+  static const char FUNCTION_NAME[] = "tj3JPEGBufSize";
+  unsigned long long retval = 0;
+  int mcuw, mcuh, chromasf;
+
+  if (width < 1 || height < 1 || jpegSubsamp < TJSAMP_UNKNOWN ||
+      jpegSubsamp >= TJ_NUMSAMP)
+    THROWG("Invalid argument", 0);
+
+  if (jpegSubsamp == TJSAMP_UNKNOWN)
+    jpegSubsamp = TJSAMP_444;
+
+  /* This allows for rare corner cases in which a JPEG image can actually be
+     larger than the uncompressed input (we wouldn't mention it if it hadn't
+     happened before.) */
+  mcuw = tjMCUWidth[jpegSubsamp];
+  mcuh = tjMCUHeight[jpegSubsamp];
+  chromasf = jpegSubsamp == TJSAMP_GRAY ? 0 : 4 * 64 / (mcuw * mcuh);
+  retval = PAD(width, mcuw) * PAD(height, mcuh) * (2ULL + chromasf) + 2048ULL;
+#if ULLONG_MAX > ULONG_MAX
+  if (retval > (unsigned long long)((unsigned long)-1))
+    THROWG("Image is too large", 0);
+#endif
+
+bailout:
+  return (size_t)retval;
+}
+
+/* TurboJPEG 1.2+ */
+DLLEXPORT unsigned long tjBufSize(int width, int height, int jpegSubsamp)
+{
+  static const char FUNCTION_NAME[] = "tjBufSize";
+  size_t retval;
+
+  if (jpegSubsamp < 0)
+    THROWG("Invalid argument", 0);
+
+  retval = tj3JPEGBufSize(width, height, jpegSubsamp);
+
+bailout:
+  return (retval == 0) ? (unsigned long)-1 : (unsigned long)retval;
+}
+
+/* TurboJPEG 1.0+ */
+DLLEXPORT unsigned long TJBUFSIZE(int width, int height)
+{
+  static const char FUNCTION_NAME[] = "TJBUFSIZE";
+  unsigned long long retval = 0;
+
+  if (width < 1 || height < 1)
+    THROWG("Invalid argument", (unsigned long)-1);
+
+  /* This allows for rare corner cases in which a JPEG image can actually be
+     larger than the uncompressed input (we wouldn't mention it if it hadn't
+     happened before.) */
+  retval = PAD(width, 16) * PAD(height, 16) * 6ULL + 2048ULL;
+#if ULLONG_MAX > ULONG_MAX
+  if (retval > (unsigned long long)((unsigned long)-1))
+    THROWG("Image is too large", (unsigned long)-1);
+#endif
+
+bailout:
+  return (unsigned long)retval;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT size_t tj3YUVBufSize(int width, int align, int height, int subsamp)
+{
+  static const char FUNCTION_NAME[] = "tj3YUVBufSize";
+  unsigned long long retval = 0;
+  int nc, i;
+
+  if (align < 1 || !IS_POW2(align) || subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    THROWG("Invalid argument", 0);
+
+  nc = (subsamp == TJSAMP_GRAY ? 1 : 3);
+  for (i = 0; i < nc; i++) {
+    int pw = tj3YUVPlaneWidth(i, width, subsamp);
+    int stride = PAD(pw, align);
+    int ph = tj3YUVPlaneHeight(i, height, subsamp);
+
+    if (pw == 0 || ph == 0) return 0;
+    else retval += (unsigned long long)stride * ph;
+  }
+#if ULLONG_MAX > ULONG_MAX
+  if (retval > (unsigned long long)((unsigned long)-1))
+    THROWG("Image is too large", 0);
+#endif
+
+bailout:
+  return (size_t)retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT unsigned long tjBufSizeYUV2(int width, int align, int height,
+                                      int subsamp)
+{
+  size_t retval = tj3YUVBufSize(width, align, height, subsamp);
+  return (retval == 0) ? (unsigned long)-1 : (unsigned long)retval;
+}
+
+/* TurboJPEG 1.2+ */
+DLLEXPORT unsigned long tjBufSizeYUV(int width, int height, int subsamp)
+{
+  return tjBufSizeYUV2(width, 4, height, subsamp);
+}
+
+/* TurboJPEG 1.1+ */
+DLLEXPORT unsigned long TJBUFSIZEYUV(int width, int height, int subsamp)
+{
+  return tjBufSizeYUV(width, height, subsamp);
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3YUVPlaneWidth(int componentID, int width, int subsamp)
+{
+  static const char FUNCTION_NAME[] = "tj3YUVPlaneWidth";
+  unsigned long long pw, retval = 0;
+  int nc;
+
+  if (width < 1 || subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    THROWG("Invalid argument", 0);
+  nc = (subsamp == TJSAMP_GRAY ? 1 : 3);
+  if (componentID < 0 || componentID >= nc)
+    THROWG("Invalid argument", 0);
+
+  pw = PAD((unsigned long long)width, tjMCUWidth[subsamp] / 8);
+  if (componentID == 0)
+    retval = pw;
+  else
+    retval = pw * 8 / tjMCUWidth[subsamp];
+
+  if (retval > (unsigned long long)INT_MAX)
+    THROWG("Width is too large", 0);
+
+bailout:
+  return (int)retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT int tjPlaneWidth(int componentID, int width, int subsamp)
+{
+  int retval = tj3YUVPlaneWidth(componentID, width, subsamp);
+  return (retval == 0) ? -1 : retval;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3YUVPlaneHeight(int componentID, int height, int subsamp)
+{
+  static const char FUNCTION_NAME[] = "tj3YUVPlaneHeight";
+  unsigned long long ph, retval = 0;
+  int nc;
+
+  if (height < 1 || subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    THROWG("Invalid argument", 0);
+  nc = (subsamp == TJSAMP_GRAY ? 1 : 3);
+  if (componentID < 0 || componentID >= nc)
+    THROWG("Invalid argument", 0);
+
+  ph = PAD((unsigned long long)height, tjMCUHeight[subsamp] / 8);
+  if (componentID == 0)
+    retval = ph;
+  else
+    retval = ph * 8 / tjMCUHeight[subsamp];
+
+  if (retval > (unsigned long long)INT_MAX)
+    THROWG("Height is too large", 0);
+
+bailout:
+  return (int)retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT int tjPlaneHeight(int componentID, int height, int subsamp)
+{
+  int retval = tj3YUVPlaneHeight(componentID, height, subsamp);
+  return (retval == 0) ? -1 : retval;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT size_t tj3YUVPlaneSize(int componentID, int width, int stride,
+                                 int height, int subsamp)
+{
+  static const char FUNCTION_NAME[] = "tj3YUVPlaneSize";
+  unsigned long long retval = 0;
+  int pw, ph;
+
+  if (width < 1 || height < 1 || subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    THROWG("Invalid argument", 0);
+
+  pw = tj3YUVPlaneWidth(componentID, width, subsamp);
+  ph = tj3YUVPlaneHeight(componentID, height, subsamp);
+  if (pw == 0 || ph == 0) return 0;
+
+  if (stride == 0) stride = pw;
+  else stride = abs(stride);
+
+  retval = (unsigned long long)stride * (ph - 1) + pw;
+#if ULLONG_MAX > ULONG_MAX
+  if (retval > (unsigned long long)((unsigned long)-1))
+    THROWG("Image is too large", 0);
+#endif
+
+bailout:
+  return (size_t)retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT unsigned long tjPlaneSizeYUV(int componentID, int width, int stride,
+                                       int height, int subsamp)
+{
+  size_t retval = tj3YUVPlaneSize(componentID, width, stride, height, subsamp);
+  return (retval == 0) ? -1 : (unsigned long)retval;
+}
+
+
+/* tj3Compress*() is implemented in turbojpeg-mp.c */
+#define BITS_IN_JSAMPLE  8
+#include "turbojpeg-mp.c"
+#undef BITS_IN_JSAMPLE
+#define BITS_IN_JSAMPLE  12
+#include "turbojpeg-mp.c"
+#undef BITS_IN_JSAMPLE
+#define BITS_IN_JSAMPLE  16
+#include "turbojpeg-mp.c"
+#undef BITS_IN_JSAMPLE
+
+/* TurboJPEG 1.2+ */
+DLLEXPORT int tjCompress2(tjhandle handle, const unsigned char *srcBuf,
+                          int width, int pitch, int height, int pixelFormat,
+                          unsigned char **jpegBuf, unsigned long *jpegSize,
+                          int jpegSubsamp, int jpegQual, int flags)
+{
+  static const char FUNCTION_NAME[] = "tjCompress2";
+  int retval = 0;
+  size_t size;
+
+  GET_TJINSTANCE(handle, -1);
+
+  if (jpegSize == NULL || jpegSubsamp < 0 || jpegSubsamp >= TJ_NUMSAMP ||
+      jpegQual < 0 || jpegQual > 100)
+    THROW("Invalid argument");
+
+  this->quality = jpegQual;
+  this->subsamp = jpegSubsamp;
+  processFlags(handle, flags, COMPRESS);
+
+  size = (size_t)(*jpegSize);
+  retval = tj3Compress8(handle, srcBuf, width, pitch, height, pixelFormat,
+                        jpegBuf, &size);
+  *jpegSize = (unsigned long)size;
+
+bailout:
+  return retval;
+}
+
+/* TurboJPEG 1.0+ */
+DLLEXPORT int tjCompress(tjhandle handle, unsigned char *srcBuf, int width,
+                         int pitch, int height, int pixelSize,
+                         unsigned char *jpegBuf, unsigned long *jpegSize,
+                         int jpegSubsamp, int jpegQual, int flags)
+{
+  int retval = 0;
+  unsigned long size = jpegSize ? *jpegSize : 0;
+
+  if (flags & TJ_YUV) {
+    size = tjBufSizeYUV(width, height, jpegSubsamp);
+    retval = tjEncodeYUV2(handle, srcBuf, width, pitch, height,
+                          getPixelFormat(pixelSize, flags), jpegBuf,
+                          jpegSubsamp, flags);
+  } else {
+    retval = tjCompress2(handle, srcBuf, width, pitch, height,
+                         getPixelFormat(pixelSize, flags), &jpegBuf, &size,
+                         jpegSubsamp, jpegQual, flags | TJFLAG_NOREALLOC);
+  }
+  *jpegSize = size;
+  return retval;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3EncodeYUVPlanes8(tjhandle handle, const unsigned char *srcBuf,
+                                  int width, int pitch, int height,
+                                  int pixelFormat, unsigned char **dstPlanes,
+                                  int *strides)
+{
+  static const char FUNCTION_NAME[] = "tj3EncodeYUVPlanes8";
+  JSAMPROW *row_pointer = NULL;
+  JSAMPLE *_tmpbuf[MAX_COMPONENTS], *_tmpbuf2[MAX_COMPONENTS];
+  JSAMPROW *tmpbuf[MAX_COMPONENTS], *tmpbuf2[MAX_COMPONENTS];
+  JSAMPROW *outbuf[MAX_COMPONENTS];
+  int i, retval = 0, row, pw0, ph0, pw[MAX_COMPONENTS], ph[MAX_COMPONENTS];
+  JSAMPLE *ptr;
+  jpeg_component_info *compptr;
+
+  GET_CINSTANCE(handle)
+
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    tmpbuf[i] = NULL;  _tmpbuf[i] = NULL;
+    tmpbuf2[i] = NULL;  _tmpbuf2[i] = NULL;  outbuf[i] = NULL;
+  }
+
+  if ((this->init & COMPRESS) == 0)
+    THROW("Instance has not been initialized for compression");
+
+  if (srcBuf == NULL || width <= 0 || pitch < 0 || height <= 0 ||
+      pixelFormat < 0 || pixelFormat >= TJ_NUMPF || !dstPlanes ||
+      !dstPlanes[0])
+    THROW("Invalid argument");
+  if (this->subsamp != TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
+    THROW("Invalid argument");
+
+  if (this->subsamp == TJSAMP_UNKNOWN)
+    THROW("TJPARAM_SUBSAMP must be specified");
+  if (pixelFormat == TJPF_CMYK)
+    THROW("Cannot generate YUV images from packed-pixel CMYK images");
+
+  if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  cinfo->image_width = width;
+  cinfo->image_height = height;
+  cinfo->data_precision = 8;
+
+  setCompDefaults(this, pixelFormat);
+
+  /* Execute only the parts of jpeg_start_compress() that we need.  If we
+     were to call the whole jpeg_start_compress() function, then it would try
+     to write the file headers, which could overflow the output buffer if the
+     YUV image were very small. */
+  if (cinfo->global_state != CSTATE_START)
+    THROW("libjpeg API is in the wrong state");
+  (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
+  jinit_c_master_control(cinfo, FALSE);
+  jinit_color_converter(cinfo);
+  jinit_downsampler(cinfo);
+  (*cinfo->cconvert->start_pass) (cinfo);
+
+  pw0 = PAD(width, cinfo->max_h_samp_factor);
+  ph0 = PAD(height, cinfo->max_v_samp_factor);
+
+  if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph0)) == NULL)
+    THROW("Memory allocation failure");
+  for (i = 0; i < height; i++) {
+    if (this->bottomUp)
+      row_pointer[i] = (JSAMPROW)&srcBuf[(height - i - 1) * (size_t)pitch];
+    else
+      row_pointer[i] = (JSAMPROW)&srcBuf[i * (size_t)pitch];
+  }
+  if (height < ph0)
+    for (i = height; i < ph0; i++) row_pointer[i] = row_pointer[height - 1];
+
+  for (i = 0; i < cinfo->num_components; i++) {
+    compptr = &cinfo->comp_info[i];
+    _tmpbuf[i] = (JSAMPLE *)malloc(
+      PAD((compptr->width_in_blocks * cinfo->max_h_samp_factor * DCTSIZE) /
+          compptr->h_samp_factor, 32) *
+      cinfo->max_v_samp_factor + 32);
+    if (!_tmpbuf[i])
+      THROW("Memory allocation failure");
+    tmpbuf[i] =
+      (JSAMPROW *)malloc(sizeof(JSAMPROW) * cinfo->max_v_samp_factor);
+    if (!tmpbuf[i])
+      THROW("Memory allocation failure");
+    for (row = 0; row < cinfo->max_v_samp_factor; row++) {
+      unsigned char *_tmpbuf_aligned =
+        (unsigned char *)PAD((JUINTPTR)_tmpbuf[i], 32);
+
+      tmpbuf[i][row] = &_tmpbuf_aligned[
+        PAD((compptr->width_in_blocks * cinfo->max_h_samp_factor * DCTSIZE) /
+            compptr->h_samp_factor, 32) * row];
+    }
+    _tmpbuf2[i] =
+      (JSAMPLE *)malloc(PAD(compptr->width_in_blocks * DCTSIZE, 32) *
+                        compptr->v_samp_factor + 32);
+    if (!_tmpbuf2[i])
+      THROW("Memory allocation failure");
+    tmpbuf2[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * compptr->v_samp_factor);
+    if (!tmpbuf2[i])
+      THROW("Memory allocation failure");
+    for (row = 0; row < compptr->v_samp_factor; row++) {
+      unsigned char *_tmpbuf2_aligned =
+        (unsigned char *)PAD((JUINTPTR)_tmpbuf2[i], 32);
+
+      tmpbuf2[i][row] =
+        &_tmpbuf2_aligned[PAD(compptr->width_in_blocks * DCTSIZE, 32) * row];
+    }
+    pw[i] = pw0 * compptr->h_samp_factor / cinfo->max_h_samp_factor;
+    ph[i] = ph0 * compptr->v_samp_factor / cinfo->max_v_samp_factor;
+    outbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i]);
+    if (!outbuf[i])
+      THROW("Memory allocation failure");
+    ptr = dstPlanes[i];
+    for (row = 0; row < ph[i]; row++) {
+      outbuf[i][row] = ptr;
+      ptr += (strides && strides[i] != 0) ? strides[i] : pw[i];
+    }
+  }
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  for (row = 0; row < ph0; row += cinfo->max_v_samp_factor) {
+    (*cinfo->cconvert->color_convert) (cinfo, &row_pointer[row], tmpbuf, 0,
+                                       cinfo->max_v_samp_factor);
+    (cinfo->downsample->downsample) (cinfo, tmpbuf, 0, tmpbuf2, 0);
+    for (i = 0, compptr = cinfo->comp_info; i < cinfo->num_components;
+         i++, compptr++)
+      jcopy_sample_rows(tmpbuf2[i], 0, outbuf[i],
+        row * compptr->v_samp_factor / cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, pw[i]);
+  }
+  cinfo->next_scanline += height;
+  jpeg_abort_compress(cinfo);
+
+bailout:
+  if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
+  free(row_pointer);
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    free(tmpbuf[i]);
+    free(_tmpbuf[i]);
+    free(tmpbuf2[i]);
+    free(_tmpbuf2[i]);
+    free(outbuf[i]);
+  }
+  if (this->jerr.warning) retval = -1;
+  return retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
+                                int width, int pitch, int height,
+                                int pixelFormat, unsigned char **dstPlanes,
+                                int *strides, int subsamp, int flags)
+{
+  static const char FUNCTION_NAME[] = "tjEncodeYUVPlanes";
+  int retval = 0;
+
+  GET_TJINSTANCE(handle, -1);
+
+  if (subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    THROW("Invalid argument");
+
+  this->subsamp = subsamp;
+  processFlags(handle, flags, COMPRESS);
+
+  return tj3EncodeYUVPlanes8(handle, srcBuf, width, pitch, height, pixelFormat,
+                             dstPlanes, strides);
+
+bailout:
+  return retval;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3EncodeYUV8(tjhandle handle, const unsigned char *srcBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            unsigned char *dstBuf, int align)
+{
+  static const char FUNCTION_NAME[] = "tj3EncodeYUV8";
+  unsigned char *dstPlanes[3];
+  int pw0, ph0, strides[3], retval = -1;
+
+  GET_TJINSTANCE(handle, -1);
+
+  if (width <= 0 || height <= 0 || dstBuf == NULL || align < 1 ||
+      !IS_POW2(align))
+    THROW("Invalid argument");
+
+  if (this->subsamp == TJSAMP_UNKNOWN)
+    THROW("TJPARAM_SUBSAMP must be specified");
+
+  pw0 = tj3YUVPlaneWidth(0, width, this->subsamp);
+  ph0 = tj3YUVPlaneHeight(0, height, this->subsamp);
+  dstPlanes[0] = dstBuf;
+  strides[0] = PAD(pw0, align);
+  if (this->subsamp == TJSAMP_GRAY) {
+    strides[1] = strides[2] = 0;
+    dstPlanes[1] = dstPlanes[2] = NULL;
+  } else {
+    int pw1 = tj3YUVPlaneWidth(1, width, this->subsamp);
+    int ph1 = tj3YUVPlaneHeight(1, height, this->subsamp);
+
+    strides[1] = strides[2] = PAD(pw1, align);
+    if ((unsigned long long)strides[0] * (unsigned long long)ph0 >
+        (unsigned long long)INT_MAX ||
+        (unsigned long long)strides[1] * (unsigned long long)ph1 >
+        (unsigned long long)INT_MAX)
+      THROW("Image or row alignment is too large");
+    dstPlanes[1] = dstPlanes[0] + strides[0] * ph0;
+    dstPlanes[2] = dstPlanes[1] + strides[1] * ph1;
+  }
+
+  return tj3EncodeYUVPlanes8(handle, srcBuf, width, pitch, height, pixelFormat,
+                             dstPlanes, strides);
+
+bailout:
+  return retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT int tjEncodeYUV3(tjhandle handle, const unsigned char *srcBuf,
+                           int width, int pitch, int height, int pixelFormat,
+                           unsigned char *dstBuf, int align, int subsamp,
+                           int flags)
+{
+  static const char FUNCTION_NAME[] = "tjEncodeYUV3";
+  int retval = 0;
+
+  GET_TJINSTANCE(handle, -1);
+
+  if (subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    THROW("Invalid argument");
+
+  this->subsamp = subsamp;
+  processFlags(handle, flags, COMPRESS);
+
+  return tj3EncodeYUV8(handle, srcBuf, width, pitch, height, pixelFormat,
+                       dstBuf, align);
+
+bailout:
+  return retval;
+}
+
+/* TurboJPEG 1.2+ */
+DLLEXPORT int tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf, int width,
+                           int pitch, int height, int pixelFormat,
+                           unsigned char *dstBuf, int subsamp, int flags)
+{
+  return tjEncodeYUV3(handle, srcBuf, width, pitch, height, pixelFormat,
+                      dstBuf, 4, subsamp, flags);
+}
+
+/* TurboJPEG 1.1+ */
+DLLEXPORT int tjEncodeYUV(tjhandle handle, unsigned char *srcBuf, int width,
+                          int pitch, int height, int pixelSize,
+                          unsigned char *dstBuf, int subsamp, int flags)
+{
+  return tjEncodeYUV2(handle, srcBuf, width, pitch, height,
+                      getPixelFormat(pixelSize, flags), dstBuf, subsamp,
+                      flags);
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3CompressFromYUVPlanes8(tjhandle handle,
+                                        const unsigned char * const *srcPlanes,
+                                        int width, const int *strides,
+                                        int height, unsigned char **jpegBuf,
+                                        size_t *jpegSize)
+{
+  static const char FUNCTION_NAME[] = "tj3CompressFromYUVPlanes8";
+  int i, row, retval = 0;
+  boolean alloc = TRUE;
+  int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
+    tmpbufsize = 0, usetmpbuf = 0, th[MAX_COMPONENTS];
+  JSAMPLE *_tmpbuf = NULL, *ptr;
+  JSAMPROW *inbuf[MAX_COMPONENTS], *tmpbuf[MAX_COMPONENTS];
+
+  GET_CINSTANCE(handle)
+
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    tmpbuf[i] = NULL;  inbuf[i] = NULL;
+  }
+
+  if ((this->init & COMPRESS) == 0)
+    THROW("Instance has not been initialized for compression");
+
+  if (!srcPlanes || !srcPlanes[0] || width <= 0 || height <= 0 ||
+      jpegBuf == NULL || jpegSize == NULL)
+    THROW("Invalid argument");
+  if (this->subsamp != TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
+    THROW("Invalid argument");
+
+  if (this->quality == -1)
+    THROW("TJPARAM_QUALITY must be specified");
+  if (this->subsamp == TJSAMP_UNKNOWN)
+    THROW("TJPARAM_SUBSAMP must be specified");
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  cinfo->image_width = width;
+  cinfo->image_height = height;
+  cinfo->data_precision = 8;
+
+  if (this->noRealloc) {
+    alloc = FALSE;  *jpegSize = tj3JPEGBufSize(width, height, this->subsamp);
+  }
+  jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
+  setCompDefaults(this, TJPF_RGB);
+  cinfo->raw_data_in = TRUE;
+
+  jpeg_start_compress(cinfo, TRUE);
+  for (i = 0; i < cinfo->num_components; i++) {
+    jpeg_component_info *compptr = &cinfo->comp_info[i];
+    int ih;
+
+    iw[i] = compptr->width_in_blocks * DCTSIZE;
+    ih = compptr->height_in_blocks * DCTSIZE;
+    pw[i] = PAD(cinfo->image_width, cinfo->max_h_samp_factor) *
+            compptr->h_samp_factor / cinfo->max_h_samp_factor;
+    ph[i] = PAD(cinfo->image_height, cinfo->max_v_samp_factor) *
+            compptr->v_samp_factor / cinfo->max_v_samp_factor;
+    if (iw[i] != pw[i] || ih != ph[i]) usetmpbuf = 1;
+    th[i] = compptr->v_samp_factor * DCTSIZE;
+    tmpbufsize += iw[i] * th[i];
+    if ((inbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i])) == NULL)
+      THROW("Memory allocation failure");
+    ptr = (JSAMPLE *)srcPlanes[i];
+    for (row = 0; row < ph[i]; row++) {
+      inbuf[i][row] = ptr;
+      ptr += (strides && strides[i] != 0) ? strides[i] : pw[i];
+    }
+  }
+  if (usetmpbuf) {
+    if ((_tmpbuf = (JSAMPLE *)malloc(sizeof(JSAMPLE) * tmpbufsize)) == NULL)
+      THROW("Memory allocation failure");
+    ptr = _tmpbuf;
+    for (i = 0; i < cinfo->num_components; i++) {
+      if ((tmpbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * th[i])) == NULL)
+        THROW("Memory allocation failure");
+      for (row = 0; row < th[i]; row++) {
+        tmpbuf[i][row] = ptr;
+        ptr += iw[i];
+      }
+    }
+  }
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  for (row = 0; row < (int)cinfo->image_height;
+       row += cinfo->max_v_samp_factor * DCTSIZE) {
+    JSAMPARRAY yuvptr[MAX_COMPONENTS];
+    int crow[MAX_COMPONENTS];
+
+    for (i = 0; i < cinfo->num_components; i++) {
+      jpeg_component_info *compptr = &cinfo->comp_info[i];
+
+      crow[i] = row * compptr->v_samp_factor / cinfo->max_v_samp_factor;
+      if (usetmpbuf) {
+        int j, k;
+
+        for (j = 0; j < MIN(th[i], ph[i] - crow[i]); j++) {
+          memcpy(tmpbuf[i][j], inbuf[i][crow[i] + j], pw[i]);
+          /* Duplicate last sample in row to fill out MCU */
+          for (k = pw[i]; k < iw[i]; k++)
+            tmpbuf[i][j][k] = tmpbuf[i][j][pw[i] - 1];
+        }
+        /* Duplicate last row to fill out MCU */
+        for (j = ph[i] - crow[i]; j < th[i]; j++)
+          memcpy(tmpbuf[i][j], tmpbuf[i][ph[i] - crow[i] - 1], iw[i]);
+        yuvptr[i] = tmpbuf[i];
+      } else
+        yuvptr[i] = &inbuf[i][crow[i]];
+    }
+    jpeg_write_raw_data(cinfo, yuvptr, cinfo->max_v_samp_factor * DCTSIZE);
+  }
+  jpeg_finish_compress(cinfo);
+
+bailout:
+  if (cinfo->global_state > CSTATE_START && alloc)
+    (*cinfo->dest->term_destination) (cinfo);
+  if (cinfo->global_state > CSTATE_START || retval == -1)
+    jpeg_abort_compress(cinfo);
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    free(tmpbuf[i]);
+    free(inbuf[i]);
+  }
+  free(_tmpbuf);
+  if (this->jerr.warning) retval = -1;
+  return retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT int tjCompressFromYUVPlanes(tjhandle handle,
+                                      const unsigned char **srcPlanes,
+                                      int width, const int *strides,
+                                      int height, int subsamp,
+                                      unsigned char **jpegBuf,
+                                      unsigned long *jpegSize, int jpegQual,
+                                      int flags)
+{
+  static const char FUNCTION_NAME[] = "tjCompressFromYUVPlanes";
+  int retval = 0;
+  size_t size;
+
+  GET_TJINSTANCE(handle, -1);
+
+  if (subsamp < 0 || subsamp >= TJ_NUMSAMP || jpegSize == NULL ||
+      jpegQual < 0 || jpegQual > 100)
+    THROW("Invalid argument");
+
+  this->quality = jpegQual;
+  this->subsamp = subsamp;
+  processFlags(handle, flags, COMPRESS);
+
+  size = (size_t)(*jpegSize);
+  retval = tj3CompressFromYUVPlanes8(handle, srcPlanes, width, strides, height,
+                                     jpegBuf, &size);
+  *jpegSize = (unsigned long)size;
+
+bailout:
+  return retval;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3CompressFromYUV8(tjhandle handle,
+                                  const unsigned char *srcBuf, int width,
+                                  int align, int height,
+                                  unsigned char **jpegBuf, size_t *jpegSize)
+{
+  static const char FUNCTION_NAME[] = "tj3CompressFromYUV8";
+  const unsigned char *srcPlanes[3];
+  int pw0, ph0, strides[3], retval = -1;
+
+  GET_TJINSTANCE(handle, -1);
+
+  if (srcBuf == NULL || width <= 0 || align < 1 || !IS_POW2(align) ||
+      height <= 0)
+    THROW("Invalid argument");
+
+  if (this->subsamp == TJSAMP_UNKNOWN)
+    THROW("TJPARAM_SUBSAMP must be specified");
+
+  pw0 = tj3YUVPlaneWidth(0, width, this->subsamp);
+  ph0 = tj3YUVPlaneHeight(0, height, this->subsamp);
+  srcPlanes[0] = srcBuf;
+  strides[0] = PAD(pw0, align);
+  if (this->subsamp == TJSAMP_GRAY) {
+    strides[1] = strides[2] = 0;
+    srcPlanes[1] = srcPlanes[2] = NULL;
+  } else {
+    int pw1 = tjPlaneWidth(1, width, this->subsamp);
+    int ph1 = tjPlaneHeight(1, height, this->subsamp);
+
+    strides[1] = strides[2] = PAD(pw1, align);
+    if ((unsigned long long)strides[0] * (unsigned long long)ph0 >
+        (unsigned long long)INT_MAX ||
+        (unsigned long long)strides[1] * (unsigned long long)ph1 >
+        (unsigned long long)INT_MAX)
+      THROW("Image or row alignment is too large");
+    srcPlanes[1] = srcPlanes[0] + strides[0] * ph0;
+    srcPlanes[2] = srcPlanes[1] + strides[1] * ph1;
+  }
+
+  return tj3CompressFromYUVPlanes8(handle, srcPlanes, width, strides, height,
+                                   jpegBuf, jpegSize);
+
+bailout:
+  return retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT int tjCompressFromYUV(tjhandle handle, const unsigned char *srcBuf,
+                                int width, int align, int height, int subsamp,
+                                unsigned char **jpegBuf,
+                                unsigned long *jpegSize, int jpegQual,
+                                int flags)
+{
+  static const char FUNCTION_NAME[] = "tjCompressFromYUV";
+  int retval = -1;
+  size_t size;
+
+  GET_TJINSTANCE(handle, -1);
+
+  if (subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    THROW("Invalid argument");
+
+  this->quality = jpegQual;
+  this->subsamp = subsamp;
+  processFlags(handle, flags, COMPRESS);
+
+  size = (size_t)(*jpegSize);
+  retval = tj3CompressFromYUV8(handle, srcBuf, width, align, height, jpegBuf,
+                               &size);
+  *jpegSize = (unsigned long)size;
+
+bailout:
+  return retval;
+}
+
+
+/******************************* Decompressor ********************************/
+
+static tjhandle _tjInitDecompress(tjinstance *this)
+{
+  static unsigned char buffer[1];
+
+  /* This is also straight out of example.c */
+  this->dinfo.err = jpeg_std_error(&this->jerr.pub);
+  this->jerr.pub.error_exit = my_error_exit;
+  this->jerr.pub.output_message = my_output_message;
+  this->jerr.emit_message = this->jerr.pub.emit_message;
+  this->jerr.pub.emit_message = my_emit_message;
+  this->jerr.pub.addon_message_table = turbojpeg_message_table;
+  this->jerr.pub.first_addon_message = JMSG_FIRSTADDONCODE;
+  this->jerr.pub.last_addon_message = JMSG_LASTADDONCODE;
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    free(this);
+    return NULL;
+  }
+
+  jpeg_create_decompress(&this->dinfo);
+  /* Make an initial call so it will create the source manager */
+  jpeg_mem_src_tj(&this->dinfo, buffer, 1);
+
+  this->init |= DECOMPRESS;
+  return (tjhandle)this;
+}
+
+/* TurboJPEG 1.0+ */
+DLLEXPORT tjhandle tjInitDecompress(void)
+{
+  return tj3Init(TJINIT_DECOMPRESS);
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3DecompressHeader(tjhandle handle,
+                                  const unsigned char *jpegBuf,
+                                  size_t jpegSize)
+{
+  static const char FUNCTION_NAME[] = "tj3DecompressHeader";
+  int retval = 0;
+
+  GET_DINSTANCE(handle);
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("Instance has not been initialized for decompression");
+
+  if (jpegBuf == NULL || jpegSize <= 0)
+    THROW("Invalid argument");
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    return -1;
+  }
+
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+
+  /* jpeg_read_header() calls jpeg_abort() and returns JPEG_HEADER_TABLES_ONLY
+     if the datastream is a tables-only datastream.  Since we aren't using a
+     suspending data source, the only other value it can return is
+     JPEG_HEADER_OK. */
+  if (jpeg_read_header(dinfo, FALSE) == JPEG_HEADER_TABLES_ONLY)
+    return 0;
+
+  setDecompParameters(this);
+
+  jpeg_abort_decompress(dinfo);
+
+  if (this->colorspace < 0)
+    THROW("Could not determine colorspace of JPEG image");
+  if (this->jpegWidth < 1 || this->jpegHeight < 1)
+    THROW("Invalid data returned in header");
+
+bailout:
+  if (this->jerr.warning) retval = -1;
+  return retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT int tjDecompressHeader3(tjhandle handle,
+                                  const unsigned char *jpegBuf,
+                                  unsigned long jpegSize, int *width,
+                                  int *height, int *jpegSubsamp,
+                                  int *jpegColorspace)
+{
+  static const char FUNCTION_NAME[] = "tjDecompressHeader3";
+  int retval = 0;
+
+  GET_TJINSTANCE(handle, -1);
+
+  if (width == NULL || height == NULL || jpegSubsamp == NULL ||
+      jpegColorspace == NULL)
+    THROW("Invalid argument");
+
+  retval = tj3DecompressHeader(handle, jpegBuf, jpegSize);
+
+  *width = tj3Get(handle, TJPARAM_JPEGWIDTH);
+  *height = tj3Get(handle, TJPARAM_JPEGHEIGHT);
+  *jpegSubsamp = tj3Get(handle, TJPARAM_SUBSAMP);
+  if (*jpegSubsamp == TJSAMP_UNKNOWN)
+    THROW("Could not determine subsampling level of JPEG image");
+  *jpegColorspace = tj3Get(handle, TJPARAM_COLORSPACE);
+
+bailout:
+  return retval;
+}
+
+/* TurboJPEG 1.1+ */
+DLLEXPORT int tjDecompressHeader2(tjhandle handle, unsigned char *jpegBuf,
+                                  unsigned long jpegSize, int *width,
+                                  int *height, int *jpegSubsamp)
+{
+  int jpegColorspace;
+
+  return tjDecompressHeader3(handle, jpegBuf, jpegSize, width, height,
+                             jpegSubsamp, &jpegColorspace);
+}
+
+/* TurboJPEG 1.0+ */
+DLLEXPORT int tjDecompressHeader(tjhandle handle, unsigned char *jpegBuf,
+                                 unsigned long jpegSize, int *width,
+                                 int *height)
+{
+  int jpegSubsamp;
+
+  return tjDecompressHeader2(handle, jpegBuf, jpegSize, width, height,
+                             &jpegSubsamp);
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT tjscalingfactor *tj3GetScalingFactors(int *numScalingFactors)
+{
+  static const char FUNCTION_NAME[] = "tj3GetScalingFactors";
+  tjscalingfactor *retval = (tjscalingfactor *)sf;
+
+  if (numScalingFactors == NULL)
+    THROWG("Invalid argument", NULL);
+
+  *numScalingFactors = NUMSF;
+
+bailout:
+  return retval;
+}
+
+/* TurboJPEG 1.2+ */
+DLLEXPORT tjscalingfactor *tjGetScalingFactors(int *numScalingFactors)
+{
+  return tj3GetScalingFactors(numScalingFactors);
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3SetScalingFactor(tjhandle handle,
+                                  tjscalingfactor scalingFactor)
+{
+  static const char FUNCTION_NAME[] = "tj3SetScalingFactor";
+  int i, retval = 0;
+
+  GET_TJINSTANCE(handle, -1);
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("Instance has not been initialized for decompression");
+
+  for (i = 0; i < NUMSF; i++) {
+    if (scalingFactor.num == sf[i].num && scalingFactor.denom == sf[i].denom)
+      break;
+  }
+  if (i >= NUMSF)
+    THROW("Unsupported scaling factor");
+
+  this->scalingFactor = scalingFactor;
+
+bailout:
+  return retval;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3SetCroppingRegion(tjhandle handle, tjregion croppingRegion)
+{
+  static const char FUNCTION_NAME[] = "tj3SetCroppingRegion";
+  int retval = 0, scaledWidth, scaledHeight;
+
+  GET_TJINSTANCE(handle, -1);
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("Instance has not been initialized for decompression");
+
+  if (croppingRegion.x == 0 && croppingRegion.y == 0 &&
+      croppingRegion.w == 0 && croppingRegion.h == 0) {
+    this->croppingRegion = croppingRegion;
+    return 0;
+  }
+
+  if (croppingRegion.x < 0 || croppingRegion.y < 0 || croppingRegion.w < 0 ||
+      croppingRegion.h < 0)
+    THROW("Invalid cropping region");
+  if (this->jpegWidth < 0 || this->jpegHeight < 0)
+    THROW("JPEG header has not yet been read");
+  if (this->precision == 16 || this->lossless)
+    THROW("Cannot partially decompress lossless JPEG images");
+  if (this->subsamp == TJSAMP_UNKNOWN)
+    THROW("Could not determine subsampling level of JPEG image");
+
+  scaledWidth = TJSCALED(this->jpegWidth, this->scalingFactor);
+  scaledHeight = TJSCALED(this->jpegHeight, this->scalingFactor);
+
+  if (croppingRegion.x %
+      TJSCALED(tjMCUWidth[this->subsamp], this->scalingFactor) != 0)
+    THROWI("The left boundary of the cropping region (%d) is not\n"
+           "divisible by the scaled MCU width (%d)",
+           croppingRegion.x,
+           TJSCALED(tjMCUWidth[this->subsamp], this->scalingFactor));
+  if (croppingRegion.w == 0)
+    croppingRegion.w = scaledWidth - croppingRegion.x;
+  if (croppingRegion.h == 0)
+    croppingRegion.h = scaledHeight - croppingRegion.y;
+  if (croppingRegion.w < 0 || croppingRegion.h < 0 ||
+      croppingRegion.x + croppingRegion.w > scaledWidth ||
+      croppingRegion.y + croppingRegion.h > scaledHeight)
+    THROW("The cropping region exceeds the scaled image dimensions");
+
+  this->croppingRegion = croppingRegion;
+
+bailout:
+  return retval;
+}
+
+
+/* tj3Decompress*() is implemented in turbojpeg-mp.c */
+
+/* TurboJPEG 1.2+ */
+DLLEXPORT int tjDecompress2(tjhandle handle, const unsigned char *jpegBuf,
+                            unsigned long jpegSize, unsigned char *dstBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            int flags)
+{
+  static const char FUNCTION_NAME[] = "tjDecompress2";
+  int i, retval = 0, jpegwidth, jpegheight, scaledw, scaledh;
+
+  GET_DINSTANCE(handle);
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("Instance has not been initialized for decompression");
+
+  if (jpegBuf == NULL || jpegSize <= 0 || width < 0 || height < 0)
+    THROW("Invalid argument");
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+  jpeg_read_header(dinfo, TRUE);
+  jpegwidth = dinfo->image_width;  jpegheight = dinfo->image_height;
+  if (width == 0) width = jpegwidth;
+  if (height == 0) height = jpegheight;
+  for (i = 0; i < NUMSF; i++) {
+    scaledw = TJSCALED(jpegwidth, sf[i]);
+    scaledh = TJSCALED(jpegheight, sf[i]);
+    if (scaledw <= width && scaledh <= height)
+      break;
+  }
+  if (i >= NUMSF)
+    THROW("Could not scale down to desired image dimensions");
+
+  processFlags(handle, flags, DECOMPRESS);
+
+  if (tj3SetScalingFactor(handle, sf[i]) == -1)
+    return -1;
+  if (tj3SetCroppingRegion(handle, TJUNCROPPED) == -1)
+    return -1;
+  return tj3Decompress8(handle, jpegBuf, jpegSize, dstBuf, pitch, pixelFormat);
+
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  if (this->jerr.warning) retval = -1;
+  return retval;
+}
+
+/* TurboJPEG 1.0+ */
+DLLEXPORT int tjDecompress(tjhandle handle, unsigned char *jpegBuf,
+                           unsigned long jpegSize, unsigned char *dstBuf,
+                           int width, int pitch, int height, int pixelSize,
+                           int flags)
+{
+  if (flags & TJ_YUV)
+    return tjDecompressToYUV(handle, jpegBuf, jpegSize, dstBuf, flags);
+  else
+    return tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, width, pitch,
+                         height, getPixelFormat(pixelSize, flags), flags);
+}
+
+
+static void setDecodeDefaults(tjinstance *this, int pixelFormat)
+{
+  int i;
+
+  this->dinfo.scale_num = this->dinfo.scale_denom = 1;
+
+  if (this->subsamp == TJSAMP_GRAY) {
+    this->dinfo.num_components = this->dinfo.comps_in_scan = 1;
+    this->dinfo.jpeg_color_space = JCS_GRAYSCALE;
+  } else {
+    this->dinfo.num_components = this->dinfo.comps_in_scan = 3;
+    this->dinfo.jpeg_color_space = JCS_YCbCr;
+  }
+
+  this->dinfo.comp_info = (jpeg_component_info *)
+    (*this->dinfo.mem->alloc_small) ((j_common_ptr)&this->dinfo, JPOOL_IMAGE,
+                                     this->dinfo.num_components *
+                                     sizeof(jpeg_component_info));
+
+  for (i = 0; i < this->dinfo.num_components; i++) {
+    jpeg_component_info *compptr = &this->dinfo.comp_info[i];
+
+    compptr->h_samp_factor = (i == 0) ? tjMCUWidth[this->subsamp] / 8 : 1;
+    compptr->v_samp_factor = (i == 0) ? tjMCUHeight[this->subsamp] / 8 : 1;
+    compptr->component_index = i;
+    compptr->component_id = i + 1;
+    compptr->quant_tbl_no = compptr->dc_tbl_no =
+      compptr->ac_tbl_no = (i == 0) ? 0 : 1;
+    this->dinfo.cur_comp_info[i] = compptr;
+  }
+  this->dinfo.data_precision = 8;
+  for (i = 0; i < 2; i++) {
+    if (this->dinfo.quant_tbl_ptrs[i] == NULL)
+      this->dinfo.quant_tbl_ptrs[i] =
+        jpeg_alloc_quant_table((j_common_ptr)&this->dinfo);
+  }
+
+  this->dinfo.mem->max_memory_to_use = (long)this->maxMemory * 1048576L;
+}
+
+
+static int my_read_markers(j_decompress_ptr dinfo)
+{
+  return JPEG_REACHED_SOS;
+}
+
+static void my_reset_marker_reader(j_decompress_ptr dinfo)
+{
+}
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3DecodeYUVPlanes8(tjhandle handle,
+                                  const unsigned char * const *srcPlanes,
+                                  const int *strides, unsigned char *dstBuf,
+                                  int width, int pitch, int height,
+                                  int pixelFormat)
+{
+  static const char FUNCTION_NAME[] = "tj3DecodeYUVPlanes8";
+  JSAMPROW *row_pointer = NULL;
+  JSAMPLE *_tmpbuf[MAX_COMPONENTS];
+  JSAMPROW *tmpbuf[MAX_COMPONENTS], *inbuf[MAX_COMPONENTS];
+  int i, retval = 0, row, pw0, ph0, pw[MAX_COMPONENTS], ph[MAX_COMPONENTS];
+  JSAMPLE *ptr;
+  jpeg_component_info *compptr;
+  int (*old_read_markers) (j_decompress_ptr);
+  void (*old_reset_marker_reader) (j_decompress_ptr);
+
+  GET_DINSTANCE(handle);
+
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    tmpbuf[i] = NULL;  _tmpbuf[i] = NULL;  inbuf[i] = NULL;
+  }
+
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("Instance has not been initialized for decompression");
+
+  if (!srcPlanes || !srcPlanes[0] || dstBuf == NULL || width <= 0 ||
+      pitch < 0 || height <= 0 || pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
+    THROW("Invalid argument");
+  if (this->subsamp != TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
+    THROW("Invalid argument");
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  if (this->subsamp == TJSAMP_UNKNOWN)
+    THROW("TJPARAM_SUBSAMP must be specified");
+  if (pixelFormat == TJPF_CMYK)
+    THROW("Cannot decode YUV images into packed-pixel CMYK images.");
+
+  if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
+  dinfo->image_width = width;
+  dinfo->image_height = height;
+
+  dinfo->progressive_mode = dinfo->inputctl->has_multiple_scans = FALSE;
+  dinfo->Ss = dinfo->Ah = dinfo->Al = 0;
+  dinfo->Se = DCTSIZE2 - 1;
+  setDecodeDefaults(this, pixelFormat);
+  old_read_markers = dinfo->marker->read_markers;
+  dinfo->marker->read_markers = my_read_markers;
+  old_reset_marker_reader = dinfo->marker->reset_marker_reader;
+  dinfo->marker->reset_marker_reader = my_reset_marker_reader;
+  jpeg_read_header(dinfo, TRUE);
+  dinfo->marker->read_markers = old_read_markers;
+  dinfo->marker->reset_marker_reader = old_reset_marker_reader;
+
+  this->dinfo.out_color_space = pf2cs[pixelFormat];
+  this->dinfo.dct_method = this->fastDCT ? JDCT_FASTEST : JDCT_ISLOW;
+  dinfo->do_fancy_upsampling = FALSE;
+  dinfo->Se = DCTSIZE2 - 1;
+  jinit_master_decompress(dinfo);
+  (*dinfo->upsample->start_pass) (dinfo);
+
+  pw0 = PAD(width, dinfo->max_h_samp_factor);
+  ph0 = PAD(height, dinfo->max_v_samp_factor);
+
+  if (pitch == 0) pitch = dinfo->output_width * tjPixelSize[pixelFormat];
+
+  if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph0)) == NULL)
+    THROW("Memory allocation failure");
+  for (i = 0; i < height; i++) {
+    if (this->bottomUp)
+      row_pointer[i] = &dstBuf[(height - i - 1) * (size_t)pitch];
+    else
+      row_pointer[i] = &dstBuf[i * (size_t)pitch];
+  }
+  if (height < ph0)
+    for (i = height; i < ph0; i++) row_pointer[i] = row_pointer[height - 1];
+
+  for (i = 0; i < dinfo->num_components; i++) {
+    compptr = &dinfo->comp_info[i];
+    _tmpbuf[i] =
+      (JSAMPLE *)malloc(PAD(compptr->width_in_blocks * DCTSIZE, 32) *
+                        compptr->v_samp_factor + 32);
+    if (!_tmpbuf[i])
+      THROW("Memory allocation failure");
+    tmpbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * compptr->v_samp_factor);
+    if (!tmpbuf[i])
+      THROW("Memory allocation failure");
+    for (row = 0; row < compptr->v_samp_factor; row++) {
+      unsigned char *_tmpbuf_aligned =
+        (unsigned char *)PAD((JUINTPTR)_tmpbuf[i], 32);
+
+      tmpbuf[i][row] =
+        &_tmpbuf_aligned[PAD(compptr->width_in_blocks * DCTSIZE, 32) * row];
+    }
+    pw[i] = pw0 * compptr->h_samp_factor / dinfo->max_h_samp_factor;
+    ph[i] = ph0 * compptr->v_samp_factor / dinfo->max_v_samp_factor;
+    inbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i]);
+    if (!inbuf[i])
+      THROW("Memory allocation failure");
+    ptr = (JSAMPLE *)srcPlanes[i];
+    for (row = 0; row < ph[i]; row++) {
+      inbuf[i][row] = ptr;
+      ptr += (strides && strides[i] != 0) ? strides[i] : pw[i];
+    }
+  }
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  for (row = 0; row < ph0; row += dinfo->max_v_samp_factor) {
+    JDIMENSION inrow = 0, outrow = 0;
+
+    for (i = 0, compptr = dinfo->comp_info; i < dinfo->num_components;
+         i++, compptr++)
+      jcopy_sample_rows(inbuf[i],
+        row * compptr->v_samp_factor / dinfo->max_v_samp_factor, tmpbuf[i], 0,
+        compptr->v_samp_factor, pw[i]);
+    (dinfo->upsample->upsample) (dinfo, tmpbuf, &inrow,
+                                 dinfo->max_v_samp_factor, &row_pointer[row],
+                                 &outrow, dinfo->max_v_samp_factor);
+  }
+  jpeg_abort_decompress(dinfo);
+
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  free(row_pointer);
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    free(tmpbuf[i]);
+    free(_tmpbuf[i]);
+    free(inbuf[i]);
+  }
+  if (this->jerr.warning) retval = -1;
+  return retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
+                                const unsigned char **srcPlanes,
+                                const int *strides, int subsamp,
+                                unsigned char *dstBuf, int width, int pitch,
+                                int height, int pixelFormat, int flags)
+{
+  static const char FUNCTION_NAME[] = "tjDecodeYUVPlanes";
+  int retval = 0;
+
+  GET_TJINSTANCE(handle, -1);
+
+  if (subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    THROW("Invalid argument");
+
+  this->subsamp = subsamp;
+  processFlags(handle, flags, DECOMPRESS);
+
+  return tj3DecodeYUVPlanes8(handle, srcPlanes, strides, dstBuf, width, pitch,
+                             height, pixelFormat);
+
+bailout:
+  return retval;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3DecodeYUV8(tjhandle handle, const unsigned char *srcBuf,
+                            int align, unsigned char *dstBuf, int width,
+                            int pitch, int height, int pixelFormat)
+{
+  static const char FUNCTION_NAME[] = "tj3DecodeYUV8";
+  const unsigned char *srcPlanes[3];
+  int pw0, ph0, strides[3], retval = -1;
+
+  GET_TJINSTANCE(handle, -1);
+
+  if (srcBuf == NULL || align < 1 || !IS_POW2(align) || width <= 0 ||
+      height <= 0)
+    THROW("Invalid argument");
+
+  if (this->subsamp == TJSAMP_UNKNOWN)
+    THROW("TJPARAM_SUBSAMP must be specified");
+
+  pw0 = tj3YUVPlaneWidth(0, width, this->subsamp);
+  ph0 = tj3YUVPlaneHeight(0, height, this->subsamp);
+  srcPlanes[0] = srcBuf;
+  strides[0] = PAD(pw0, align);
+  if (this->subsamp == TJSAMP_GRAY) {
+    strides[1] = strides[2] = 0;
+    srcPlanes[1] = srcPlanes[2] = NULL;
+  } else {
+    int pw1 = tj3YUVPlaneWidth(1, width, this->subsamp);
+    int ph1 = tj3YUVPlaneHeight(1, height, this->subsamp);
+
+    strides[1] = strides[2] = PAD(pw1, align);
+    if ((unsigned long long)strides[0] * (unsigned long long)ph0 >
+        (unsigned long long)INT_MAX ||
+        (unsigned long long)strides[1] * (unsigned long long)ph1 >
+        (unsigned long long)INT_MAX)
+      THROW("Image or row alignment is too large");
+    srcPlanes[1] = srcPlanes[0] + strides[0] * ph0;
+    srcPlanes[2] = srcPlanes[1] + strides[1] * ph1;
+  }
+
+  return tj3DecodeYUVPlanes8(handle, srcPlanes, strides, dstBuf, width, pitch,
+                             height, pixelFormat);
+
+bailout:
+  return retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT int tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
+                          int align, int subsamp, unsigned char *dstBuf,
+                          int width, int pitch, int height, int pixelFormat,
+                          int flags)
+{
+  static const char FUNCTION_NAME[] = "tjDecodeYUV";
+  int retval = -1;
+
+  GET_TJINSTANCE(handle, -1);
+
+  if (subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    THROW("Invalid argument");
+
+  this->subsamp = subsamp;
+  processFlags(handle, flags, DECOMPRESS);
+
+  return tj3DecodeYUV8(handle, srcBuf, align, dstBuf, width, pitch, height,
+                       pixelFormat);
+
+bailout:
+  return retval;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3DecompressToYUVPlanes8(tjhandle handle,
+                                        const unsigned char *jpegBuf,
+                                        size_t jpegSize,
+                                        unsigned char **dstPlanes,
+                                        int *strides)
+{
+  static const char FUNCTION_NAME[] = "tj3DecompressToYUVPlanes8";
+  int i, row, retval = 0;
+  int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
+    tmpbufsize = 0, usetmpbuf = 0, th[MAX_COMPONENTS];
+  JSAMPLE *_tmpbuf = NULL, *ptr;
+  JSAMPROW *outbuf[MAX_COMPONENTS], *tmpbuf[MAX_COMPONENTS];
+  int dctsize;
+  struct my_progress_mgr progress;
+
+  GET_DINSTANCE(handle);
+
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    tmpbuf[i] = NULL;  outbuf[i] = NULL;
+  }
+
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("Instance has not been initialized for decompression");
+
+  if (jpegBuf == NULL || jpegSize <= 0 || !dstPlanes || !dstPlanes[0])
+    THROW("Invalid argument");
+
+  if (this->scanLimit) {
+    memset(&progress, 0, sizeof(struct my_progress_mgr));
+    progress.pub.progress_monitor = my_progress_monitor;
+    progress.this = this;
+    dinfo->progress = &progress.pub;
+  } else
+    dinfo->progress = NULL;
+
+  dinfo->mem->max_memory_to_use = (long)this->maxMemory * 1048576L;
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  if (dinfo->global_state <= DSTATE_INHEADER) {
+    jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+    jpeg_read_header(dinfo, TRUE);
+  }
+  setDecompParameters(this);
+  if (this->maxPixels &&
+      (unsigned long long)this->jpegWidth * this->jpegHeight >
+      (unsigned long long)this->maxPixels)
+    THROW("Image is too large");
+  if (this->subsamp == TJSAMP_UNKNOWN)
+    THROW("Could not determine subsampling level of JPEG image");
+
+  if (this->subsamp != TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
+    THROW("Invalid argument");
+
+  if (dinfo->num_components > 3)
+    THROW("JPEG image must have 3 or fewer components");
+
+  dinfo->scale_num = this->scalingFactor.num;
+  dinfo->scale_denom = this->scalingFactor.denom;
+  jpeg_calc_output_dimensions(dinfo);
+
+  dctsize = DCTSIZE * this->scalingFactor.num / this->scalingFactor.denom;
+
+  for (i = 0; i < dinfo->num_components; i++) {
+    jpeg_component_info *compptr = &dinfo->comp_info[i];
+    int ih;
+
+    iw[i] = compptr->width_in_blocks * dctsize;
+    ih = compptr->height_in_blocks * dctsize;
+    pw[i] = tj3YUVPlaneWidth(i, dinfo->output_width, this->subsamp);
+    ph[i] = tj3YUVPlaneHeight(i, dinfo->output_height, this->subsamp);
+    if (iw[i] != pw[i] || ih != ph[i]) usetmpbuf = 1;
+    th[i] = compptr->v_samp_factor * dctsize;
+    tmpbufsize += iw[i] * th[i];
+    if ((outbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i])) == NULL)
+      THROW("Memory allocation failure");
+    ptr = dstPlanes[i];
+    for (row = 0; row < ph[i]; row++) {
+      outbuf[i][row] = ptr;
+      ptr += (strides && strides[i] != 0) ? strides[i] : pw[i];
+    }
+  }
+  if (usetmpbuf) {
+    if ((_tmpbuf = (JSAMPLE *)malloc(sizeof(JSAMPLE) * tmpbufsize)) == NULL)
+      THROW("Memory allocation failure");
+    ptr = _tmpbuf;
+    for (i = 0; i < dinfo->num_components; i++) {
+      if ((tmpbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * th[i])) == NULL)
+        THROW("Memory allocation failure");
+      for (row = 0; row < th[i]; row++) {
+        tmpbuf[i][row] = ptr;
+        ptr += iw[i];
+      }
+    }
+  }
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  dinfo->do_fancy_upsampling = !this->fastUpsample;
+  dinfo->dct_method = this->fastDCT ? JDCT_FASTEST : JDCT_ISLOW;
+  dinfo->raw_data_out = TRUE;
+
+  dinfo->mem->max_memory_to_use = (long)this->maxMemory * 1048576L;
+
+  jpeg_start_decompress(dinfo);
+  for (row = 0; row < (int)dinfo->output_height;
+       row += dinfo->max_v_samp_factor * dinfo->_min_DCT_scaled_size) {
+    JSAMPARRAY yuvptr[MAX_COMPONENTS];
+    int crow[MAX_COMPONENTS];
+
+    for (i = 0; i < dinfo->num_components; i++) {
+      jpeg_component_info *compptr = &dinfo->comp_info[i];
+
+      if (this->subsamp == TJSAMP_420) {
+        /* When 4:2:0 subsampling is used with IDCT scaling, libjpeg will try
+           to be clever and use the IDCT to perform upsampling on the U and V
+           planes.  For instance, if the output image is to be scaled by 1/2
+           relative to the JPEG image, then the scaling factor and upsampling
+           effectively cancel each other, so a normal 8x8 IDCT can be used.
+           However, this is not desirable when using the decompress-to-YUV
+           functionality in TurboJPEG, since we want to output the U and V
+           planes in their subsampled form.  Thus, we have to override some
+           internal libjpeg parameters to force it to use the "scaled" IDCT
+           functions on the U and V planes. */
+        compptr->_DCT_scaled_size = dctsize;
+        compptr->MCU_sample_width = tjMCUWidth[this->subsamp] *
+          this->scalingFactor.num / this->scalingFactor.denom *
+          compptr->v_samp_factor / dinfo->max_v_samp_factor;
+        dinfo->idct->inverse_DCT[i] = dinfo->idct->inverse_DCT[0];
+      }
+      crow[i] = row * compptr->v_samp_factor / dinfo->max_v_samp_factor;
+      if (usetmpbuf) yuvptr[i] = tmpbuf[i];
+      else yuvptr[i] = &outbuf[i][crow[i]];
+    }
+    jpeg_read_raw_data(dinfo, yuvptr,
+                       dinfo->max_v_samp_factor * dinfo->_min_DCT_scaled_size);
+    if (usetmpbuf) {
+      int j;
+
+      for (i = 0; i < dinfo->num_components; i++) {
+        for (j = 0; j < MIN(th[i], ph[i] - crow[i]); j++) {
+          memcpy(outbuf[i][crow[i] + j], tmpbuf[i][j], pw[i]);
+        }
+      }
+    }
+  }
+  jpeg_finish_decompress(dinfo);
+
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    free(tmpbuf[i]);
+    free(outbuf[i]);
+  }
+  free(_tmpbuf);
+  if (this->jerr.warning) retval = -1;
+  return retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
+                                      const unsigned char *jpegBuf,
+                                      unsigned long jpegSize,
+                                      unsigned char **dstPlanes, int width,
+                                      int *strides, int height, int flags)
+{
+  static const char FUNCTION_NAME[] = "tjDecompressToYUVPlanes";
+  int i, retval = 0, jpegwidth, jpegheight, scaledw, scaledh;
+
+  GET_DINSTANCE(handle);
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("Instance has not been initialized for decompression");
+
+  if (jpegBuf == NULL || jpegSize <= 0 || width < 0 || height < 0)
+    THROW("Invalid argument");
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+  jpeg_read_header(dinfo, TRUE);
+  jpegwidth = dinfo->image_width;  jpegheight = dinfo->image_height;
+  if (width == 0) width = jpegwidth;
+  if (height == 0) height = jpegheight;
+  for (i = 0; i < NUMSF; i++) {
+    scaledw = TJSCALED(jpegwidth, sf[i]);
+    scaledh = TJSCALED(jpegheight, sf[i]);
+    if (scaledw <= width && scaledh <= height)
+      break;
+  }
+  if (i >= NUMSF)
+    THROW("Could not scale down to desired image dimensions");
+
+  processFlags(handle, flags, DECOMPRESS);
+
+  if (tj3SetScalingFactor(handle, sf[i]) == -1)
+    return -1;
+  return tj3DecompressToYUVPlanes8(handle, jpegBuf, jpegSize, dstPlanes,
+                                   strides);
+
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  if (this->jerr.warning) retval = -1;
+  return retval;
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3DecompressToYUV8(tjhandle handle,
+                                  const unsigned char *jpegBuf,
+                                  size_t jpegSize,
+                                  unsigned char *dstBuf, int align)
+{
+  static const char FUNCTION_NAME[] = "tj3DecompressToYUV8";
+  unsigned char *dstPlanes[3];
+  int pw0, ph0, strides[3], retval = -1;
+  int width, height;
+
+  GET_DINSTANCE(handle);
+
+  if (jpegBuf == NULL || jpegSize <= 0 || dstBuf == NULL || align < 1 ||
+      !IS_POW2(align))
+    THROW("Invalid argument");
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  if (dinfo->global_state <= DSTATE_INHEADER) {
+    jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+    jpeg_read_header(dinfo, TRUE);
+  }
+  setDecompParameters(this);
+  if (this->subsamp == TJSAMP_UNKNOWN)
+    THROW("Could not determine subsampling level of JPEG image");
+
+  width = TJSCALED(dinfo->image_width, this->scalingFactor);
+  height = TJSCALED(dinfo->image_height, this->scalingFactor);
+
+  pw0 = tj3YUVPlaneWidth(0, width, this->subsamp);
+  ph0 = tj3YUVPlaneHeight(0, height, this->subsamp);
+  dstPlanes[0] = dstBuf;
+  strides[0] = PAD(pw0, align);
+  if (this->subsamp == TJSAMP_GRAY) {
+    strides[1] = strides[2] = 0;
+    dstPlanes[1] = dstPlanes[2] = NULL;
+  } else {
+    int pw1 = tj3YUVPlaneWidth(1, width, this->subsamp);
+    int ph1 = tj3YUVPlaneHeight(1, height, this->subsamp);
+
+    strides[1] = strides[2] = PAD(pw1, align);
+    if ((unsigned long long)strides[0] * (unsigned long long)ph0 >
+        (unsigned long long)INT_MAX ||
+        (unsigned long long)strides[1] * (unsigned long long)ph1 >
+        (unsigned long long)INT_MAX)
+      THROW("Image or row alignment is too large");
+    dstPlanes[1] = dstPlanes[0] + strides[0] * ph0;
+    dstPlanes[2] = dstPlanes[1] + strides[1] * ph1;
+  }
+
+  return tj3DecompressToYUVPlanes8(handle, jpegBuf, jpegSize, dstPlanes,
+                                   strides);
+
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  if (this->jerr.warning) retval = -1;
+  return retval;
+}
+
+/* TurboJPEG 1.4+ */
+DLLEXPORT int tjDecompressToYUV2(tjhandle handle, const unsigned char *jpegBuf,
+                                 unsigned long jpegSize, unsigned char *dstBuf,
+                                 int width, int align, int height, int flags)
+{
+  static const char FUNCTION_NAME[] = "tjDecompressToYUV2";
+  int i, retval = 0, jpegwidth, jpegheight, scaledw, scaledh;
+
+  GET_DINSTANCE(handle);
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("Instance has not been initialized for decompression");
+
+  if (jpegBuf == NULL || jpegSize <= 0 || width < 0 || height < 0)
+    THROW("Invalid argument");
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+  jpeg_read_header(dinfo, TRUE);
+  jpegwidth = dinfo->image_width;  jpegheight = dinfo->image_height;
+  if (width == 0) width = jpegwidth;
+  if (height == 0) height = jpegheight;
+  for (i = 0; i < NUMSF; i++) {
+    scaledw = TJSCALED(jpegwidth, sf[i]);
+    scaledh = TJSCALED(jpegheight, sf[i]);
+    if (scaledw <= width && scaledh <= height)
+      break;
+  }
+  if (i >= NUMSF)
+    THROW("Could not scale down to desired image dimensions");
+
+  width = scaledw;  height = scaledh;
+
+  processFlags(handle, flags, DECOMPRESS);
+
+  if (tj3SetScalingFactor(handle, sf[i]) == -1)
+    return -1;
+  return tj3DecompressToYUV8(handle, jpegBuf, (size_t)jpegSize, dstBuf, align);
+
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  if (this->jerr.warning) retval = -1;
+  return retval;
+}
+
+/* TurboJPEG 1.1+ */
+DLLEXPORT int tjDecompressToYUV(tjhandle handle, unsigned char *jpegBuf,
+                                unsigned long jpegSize, unsigned char *dstBuf,
+                                int flags)
+{
+  return tjDecompressToYUV2(handle, jpegBuf, jpegSize, dstBuf, 0, 4, 0, flags);
+}
+
+
+/******************************** Transformer ********************************/
+
+/* TurboJPEG 1.2+ */
+DLLEXPORT tjhandle tjInitTransform(void)
+{
+  return tj3Init(TJINIT_TRANSFORM);
+}
+
+
+/* TurboJPEG 3+ */
+DLLEXPORT int tj3Transform(tjhandle handle, const unsigned char *jpegBuf,
+                           size_t jpegSize, int n, unsigned char **dstBufs,
+                           size_t *dstSizes, const tjtransform *t)
+{
+  static const char FUNCTION_NAME[] = "tj3Transform";
+  jpeg_transform_info *xinfo = NULL;
+  jvirt_barray_ptr *srccoefs, *dstcoefs;
+  int retval = 0, i, saveMarkers = 0;
+  boolean alloc = TRUE;
+  struct my_progress_mgr progress;
+
+  GET_INSTANCE(handle);
+  if ((this->init & COMPRESS) == 0 || (this->init & DECOMPRESS) == 0)
+    THROW("Instance has not been initialized for transformation");
+
+  if (jpegBuf == NULL || jpegSize <= 0 || n < 1 || dstBufs == NULL ||
+      dstSizes == NULL || t == NULL)
+    THROW("Invalid argument");
+
+  if (this->scanLimit) {
+    memset(&progress, 0, sizeof(struct my_progress_mgr));
+    progress.pub.progress_monitor = my_progress_monitor;
+    progress.this = this;
+    dinfo->progress = &progress.pub;
+  } else
+    dinfo->progress = NULL;
+
+  dinfo->mem->max_memory_to_use = (long)this->maxMemory * 1048576L;
+
+  if ((xinfo =
+       (jpeg_transform_info *)malloc(sizeof(jpeg_transform_info) * n)) == NULL)
+    THROW("Memory allocation failure");
+  memset(xinfo, 0, sizeof(jpeg_transform_info) * n);
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  if (dinfo->global_state <= DSTATE_INHEADER)
+    jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+
+  for (i = 0; i < n; i++) {
+    if (t[i].op < 0 || t[i].op >= TJ_NUMXOP)
+      THROW("Invalid transform operation");
+    xinfo[i].transform = xformtypes[t[i].op];
+    xinfo[i].perfect = (t[i].options & TJXOPT_PERFECT) ? 1 : 0;
+    xinfo[i].trim = (t[i].options & TJXOPT_TRIM) ? 1 : 0;
+    xinfo[i].force_grayscale = (t[i].options & TJXOPT_GRAY) ? 1 : 0;
+    xinfo[i].crop = (t[i].options & TJXOPT_CROP) ? 1 : 0;
+    if (n != 1 && t[i].op == TJXOP_HFLIP) xinfo[i].slow_hflip = 1;
+    else xinfo[i].slow_hflip = 0;
+
+    if (xinfo[i].crop) {
+      xinfo[i].crop_xoffset = t[i].r.x;  xinfo[i].crop_xoffset_set = JCROP_POS;
+      xinfo[i].crop_yoffset = t[i].r.y;  xinfo[i].crop_yoffset_set = JCROP_POS;
+      if (t[i].r.w != 0) {
+        xinfo[i].crop_width = t[i].r.w;  xinfo[i].crop_width_set = JCROP_POS;
+      } else
+        xinfo[i].crop_width = JCROP_UNSET;
+      if (t[i].r.h != 0) {
+        xinfo[i].crop_height = t[i].r.h;  xinfo[i].crop_height_set = JCROP_POS;
+      } else
+        xinfo[i].crop_height = JCROP_UNSET;
+    }
+    if (!(t[i].options & TJXOPT_COPYNONE)) saveMarkers = 1;
+  }
+
+  jcopy_markers_setup(dinfo, saveMarkers ? JCOPYOPT_ALL : JCOPYOPT_NONE);
+  if (dinfo->global_state <= DSTATE_INHEADER)
+    jpeg_read_header(dinfo, TRUE);
+  if (this->maxPixels &&
+      (unsigned long long)dinfo->image_width * dinfo->image_height >
+      (unsigned long long)this->maxPixels)
+    THROW("Image is too large");
+  this->subsamp = getSubsamp(&this->dinfo);
+
+  for (i = 0; i < n; i++) {
+    if (!jtransform_request_workspace(dinfo, &xinfo[i]))
+      THROW("Transform is not perfect");
+
+    if (xinfo[i].crop) {
+      if (this->subsamp == TJSAMP_UNKNOWN)
+        THROW("Could not determine subsampling level of JPEG image");
+      if ((t[i].r.x % tjMCUWidth[this->subsamp]) != 0 ||
+          (t[i].r.y % tjMCUHeight[this->subsamp]) != 0)
+        THROWI("To crop this JPEG image, x must be a multiple of %d\n"
+               "and y must be a multiple of %d.", tjMCUWidth[this->subsamp],
+               tjMCUHeight[this->subsamp]);
+    }
+  }
+
+  srccoefs = jpeg_read_coefficients(dinfo);
+
+  for (i = 0; i < n; i++) {
+    int w, h;
+
+    if (!xinfo[i].crop) {
+      w = dinfo->image_width;  h = dinfo->image_height;
+      if (t[i].op == TJXOP_TRANSPOSE || t[i].op == TJXOP_TRANSVERSE ||
+          t[i].op == TJXOP_ROT90 || t[i].op == TJXOP_ROT270) {
+        w = dinfo->image_height;  h = dinfo->image_width;
+      }
+    } else {
+      w = xinfo[i].crop_width;  h = xinfo[i].crop_height;
+    }
+    if (this->noRealloc) {
+      alloc = FALSE;  dstSizes[i] = tj3JPEGBufSize(w, h, this->subsamp);
+    }
+    if (!(t[i].options & TJXOPT_NOOUTPUT))
+      jpeg_mem_dest_tj(cinfo, &dstBufs[i], &dstSizes[i], alloc);
+    jpeg_copy_critical_parameters(dinfo, cinfo);
+    dstcoefs = jtransform_adjust_parameters(dinfo, cinfo, srccoefs, &xinfo[i]);
+    if (this->optimize || t[i].options & TJXOPT_OPTIMIZE)
+      cinfo->optimize_coding = TRUE;
+#ifdef C_PROGRESSIVE_SUPPORTED
+    if (this->progressive || t[i].options & TJXOPT_PROGRESSIVE)
+      jpeg_simple_progression(cinfo);
+#endif
+    if (this->arithmetic || t[i].options & TJXOPT_ARITHMETIC) {
+      cinfo->arith_code = TRUE;
+      cinfo->optimize_coding = FALSE;
+    }
+    if (!(t[i].options & TJXOPT_NOOUTPUT)) {
+      jpeg_write_coefficients(cinfo, dstcoefs);
+      jcopy_markers_execute(dinfo, cinfo, t[i].options & TJXOPT_COPYNONE ?
+                                          JCOPYOPT_NONE : JCOPYOPT_ALL);
+    } else
+      jinit_c_master_control(cinfo, TRUE);
+    jtransform_execute_transformation(dinfo, cinfo, srccoefs, &xinfo[i]);
+    if (t[i].customFilter) {
+      int ci, y;
+      JDIMENSION by;
+
+      for (ci = 0; ci < cinfo->num_components; ci++) {
+        jpeg_component_info *compptr = &cinfo->comp_info[ci];
+        tjregion arrayRegion = { 0, 0, 0, 0 };
+        tjregion planeRegion = { 0, 0, 0, 0 };
+
+        arrayRegion.w = compptr->width_in_blocks * DCTSIZE;
+        arrayRegion.h = DCTSIZE;
+        planeRegion.w = compptr->width_in_blocks * DCTSIZE;
+        planeRegion.h = compptr->height_in_blocks * DCTSIZE;
+
+        for (by = 0; by < compptr->height_in_blocks;
+             by += compptr->v_samp_factor) {
+          JBLOCKARRAY barray = (dinfo->mem->access_virt_barray)
+            ((j_common_ptr)dinfo, dstcoefs[ci], by, compptr->v_samp_factor,
+             TRUE);
+
+          for (y = 0; y < compptr->v_samp_factor; y++) {
+            if (t[i].customFilter(barray[y][0], arrayRegion, planeRegion, ci,
+                                  i, (tjtransform *)&t[i]) == -1)
+              THROW("Error in custom filter");
+            arrayRegion.y += DCTSIZE;
+          }
+        }
+      }
+    }
+    if (!(t[i].options & TJXOPT_NOOUTPUT)) jpeg_finish_compress(cinfo);
+  }
+
+  jpeg_finish_decompress(dinfo);
+
+bailout:
+  if (cinfo->global_state > CSTATE_START) {
+    if (alloc) (*cinfo->dest->term_destination) (cinfo);
+    jpeg_abort_compress(cinfo);
+  }
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  free(xinfo);
+  if (this->jerr.warning) retval = -1;
+  return retval;
+}
+
+/* TurboJPEG 1.2+ */
+DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
+                          unsigned long jpegSize, int n,
+                          unsigned char **dstBufs, unsigned long *dstSizes,
+                          tjtransform *t, int flags)
+{
+  static const char FUNCTION_NAME[] = "tjTransform";
+  int i, retval = 0;
+  size_t *sizes = NULL;
+
+  GET_DINSTANCE(handle);
+  if ((this->init & DECOMPRESS) == 0)
+    THROW("Instance has not been initialized for decompression");
+
+  if (n < 1 || dstSizes == NULL)
+    THROW("Invalid argument");
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+  jpeg_read_header(dinfo, TRUE);
+  if (getSubsamp(dinfo) == TJSAMP_UNKNOWN)
+    THROW("Could not determine subsampling level of JPEG image");
+  processFlags(handle, flags, COMPRESS);
+
+  if ((sizes = (size_t *)malloc(n * sizeof(size_t))) == NULL)
+    THROW("Memory allocation failure");
+  for (i = 0; i < n; i++)
+    sizes[i] = (size_t)dstSizes[i];
+  retval = tj3Transform(handle, jpegBuf, (size_t)jpegSize, n, dstBufs, sizes,
+                        t);
+  for (i = 0; i < n; i++)
+    dstSizes[i] = (unsigned long)sizes[i];
+
+bailout:
+  free(sizes);
+  return retval;
+}
+
+
+/*************************** Packed-Pixel Image I/O **************************/
+
+/* tj3LoadImage*() is implemented in turbojpeg-mp.c */
+
+/* TurboJPEG 2.0+ */
+DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
+                                     int align, int *height,
+                                     int *pixelFormat, int flags)
+{
+  tjhandle handle = NULL;
+  unsigned char *dstBuf = NULL;
+
+  if ((handle = tj3Init(TJINIT_COMPRESS)) == NULL) return NULL;
+
+  processFlags(handle, flags, COMPRESS);
+
+  dstBuf = tj3LoadImage8(handle, filename, width, align, height, pixelFormat);
+
+  tj3Destroy(handle);
+  return dstBuf;
+}
+
+
+/* tj3SaveImage*() is implemented in turbojpeg-mp.c */
+
+/* TurboJPEG 2.0+ */
+DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
+                          int width, int pitch, int height, int pixelFormat,
+                          int flags)
+{
+  tjhandle handle = NULL;
+  int retval = -1;
+
+  if ((handle = tj3Init(TJINIT_DECOMPRESS)) == NULL) return -1;
+
+  processFlags(handle, flags, DECOMPRESS);
+
+  retval = tj3SaveImage8(handle, filename, buffer, width, pitch, height,
+                         pixelFormat);
+
+  tj3Destroy(handle);
+  return retval;
+}
diff --git a/3rdparty/libjpeg-turbo/src/turbojpeg.h b/3rdparty/libjpeg-turbo/src/turbojpeg.h
new file mode 100644
index 000000000000..68b88a410464
--- /dev/null
+++ b/3rdparty/libjpeg-turbo/src/turbojpeg.h
@@ -0,0 +1,2328 @@
+/*
+ * Copyright (C)2009-2015, 2017, 2020-2023 D. R. Commander.
+ *                                         All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __TURBOJPEG_H__
+#define __TURBOJPEG_H__
+
+#include <stddef.h>
+
+#if defined(_WIN32) && defined(DLLDEFINE)
+#define DLLEXPORT  __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif
+#define DLLCALL
+
+
+/**
+ * @addtogroup TurboJPEG
+ * TurboJPEG API.  This API provides an interface for generating, decoding, and
+ * transforming planar YUV and JPEG images in memory.
+ *
+ * @anchor YUVnotes
+ * YUV Image Format Notes
+ * ----------------------
+ * Technically, the JPEG format uses the YCbCr colorspace (which is technically
+ * not a colorspace but a color transform), but per the convention of the
+ * digital video community, the TurboJPEG API uses "YUV" to refer to an image
+ * format consisting of Y, Cb, and Cr image planes.
+ *
+ * Each plane is simply a 2D array of bytes, each byte representing the value
+ * of one of the components (Y, Cb, or Cr) at a particular location in the
+ * image.  The width and height of each plane are determined by the image
+ * width, height, and level of chrominance subsampling.  The luminance plane
+ * width is the image width padded to the nearest multiple of the horizontal
+ * subsampling factor (1 in the case of 4:4:4, grayscale, 4:4:0, or 4:4:1; 2 in
+ * the case of 4:2:2 or 4:2:0; 4 in the case of 4:1:1.)  Similarly, the
+ * luminance plane height is the image height padded to the nearest multiple of
+ * the vertical subsampling factor (1 in the case of 4:4:4, 4:2:2, grayscale,
+ * or 4:1:1; 2 in the case of 4:2:0 or 4:4:0; 4 in the case of 4:4:1.)  This is
+ * irrespective of any additional padding that may be specified as an argument
+ * to the various YUV functions.  The chrominance plane width is equal to the
+ * luminance plane width divided by the horizontal subsampling factor, and the
+ * chrominance plane height is equal to the luminance plane height divided by
+ * the vertical subsampling factor.
+ *
+ * For example, if the source image is 35 x 35 pixels and 4:2:2 subsampling is
+ * used, then the luminance plane would be 36 x 35 bytes, and each of the
+ * chrominance planes would be 18 x 35 bytes.  If you specify a row alignment
+ * of 4 bytes on top of this, then the luminance plane would be 36 x 35 bytes,
+ * and each of the chrominance planes would be 20 x 35 bytes.
+ *
+ * @{
+ */
+
+
+/**
+ * The number of initialization options
+ */
+#define TJ_NUMINIT  3
+
+/**
+ * Initialization options.
+ */
+enum TJINIT {
+  /**
+   * Initialize the TurboJPEG instance for compression.
+   */
+  TJINIT_COMPRESS,
+  /**
+   * Initialize the TurboJPEG instance for decompression.
+   */
+  TJINIT_DECOMPRESS,
+  /**
+   * Initialize the TurboJPEG instance for lossless transformation (both
+   * compression and decompression.)
+   */
+  TJINIT_TRANSFORM
+};
+
+
+/**
+ * The number of chrominance subsampling options
+ */
+#define TJ_NUMSAMP  7
+
+/**
+ * Chrominance subsampling options.
+ * When pixels are converted from RGB to YCbCr (see #TJCS_YCbCr) or from CMYK
+ * to YCCK (see #TJCS_YCCK) as part of the JPEG compression process, some of
+ * the Cb and Cr (chrominance) components can be discarded or averaged together
+ * to produce a smaller image with little perceptible loss of image clarity.
+ * (The human eye is more sensitive to small changes in brightness than to
+ * small changes in color.)  This is called "chrominance subsampling".
+ */
+enum TJSAMP {
+  /**
+   * 4:4:4 chrominance subsampling (no chrominance subsampling).  The JPEG or
+   * YUV image will contain one chrominance component for every pixel in the
+   * source image.
+   */
+  TJSAMP_444,
+  /**
+   * 4:2:2 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 2x1 block of pixels in the source image.
+   */
+  TJSAMP_422,
+  /**
+   * 4:2:0 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 2x2 block of pixels in the source image.
+   */
+  TJSAMP_420,
+  /**
+   * Grayscale.  The JPEG or YUV image will contain no chrominance components.
+   */
+  TJSAMP_GRAY,
+  /**
+   * 4:4:0 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 1x2 block of pixels in the source image.
+   *
+   * @note 4:4:0 subsampling is not fully accelerated in libjpeg-turbo.
+   */
+  TJSAMP_440,
+  /**
+   * 4:1:1 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 4x1 block of pixels in the source image.
+   * JPEG images compressed with 4:1:1 subsampling will be almost exactly the
+   * same size as those compressed with 4:2:0 subsampling, and in the
+   * aggregate, both subsampling methods produce approximately the same
+   * perceptual quality.  However, 4:1:1 is better able to reproduce sharp
+   * horizontal features.
+   *
+   * @note 4:1:1 subsampling is not fully accelerated in libjpeg-turbo.
+   */
+  TJSAMP_411,
+  /**
+   * 4:4:1 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 1x4 block of pixels in the source image.
+   * JPEG images compressed with 4:4:1 subsampling will be almost exactly the
+   * same size as those compressed with 4:2:0 subsampling, and in the
+   * aggregate, both subsampling methods produce approximately the same
+   * perceptual quality.  However, 4:4:1 is better able to reproduce sharp
+   * vertical features.
+   *
+   * @note 4:4:1 subsampling is not fully accelerated in libjpeg-turbo.
+   */
+  TJSAMP_441,
+  /**
+   * Unknown subsampling.  The JPEG image uses an unusual type of chrominance
+   * subsampling.  Such images can be decompressed into packed-pixel images,
+   * but they cannot be
+   * - decompressed into planar YUV images,
+   * - losslessly transformed if #TJXOPT_CROP is specified, or
+   * - partially decompressed using a cropping region.
+   */
+  TJSAMP_UNKNOWN = -1
+};
+
+/**
+ * MCU block width (in pixels) for a given level of chrominance subsampling.
+ * MCU block sizes:
+ * - 8x8 for no subsampling or grayscale
+ * - 16x8 for 4:2:2
+ * - 8x16 for 4:4:0
+ * - 16x16 for 4:2:0
+ * - 32x8 for 4:1:1
+ * - 8x32 for 4:4:1
+ */
+static const int tjMCUWidth[TJ_NUMSAMP]  = { 8, 16, 16, 8, 8, 32, 8 };
+
+/**
+ * MCU block height (in pixels) for a given level of chrominance subsampling.
+ * MCU block sizes:
+ * - 8x8 for no subsampling or grayscale
+ * - 16x8 for 4:2:2
+ * - 8x16 for 4:4:0
+ * - 16x16 for 4:2:0
+ * - 32x8 for 4:1:1
+ * - 8x32 for 4:4:1
+ */
+static const int tjMCUHeight[TJ_NUMSAMP] = { 8, 8, 16, 8, 16, 8, 32 };
+
+
+/**
+ * The number of pixel formats
+ */
+#define TJ_NUMPF  12
+
+/**
+ * Pixel formats
+ */
+enum TJPF {
+  /**
+   * RGB pixel format.  The red, green, and blue components in the image are
+   * stored in 3-sample pixels in the order R, G, B from lowest to highest
+   * memory address within each pixel.
+   */
+  TJPF_RGB,
+  /**
+   * BGR pixel format.  The red, green, and blue components in the image are
+   * stored in 3-sample pixels in the order B, G, R from lowest to highest
+   * memory address within each pixel.
+   */
+  TJPF_BGR,
+  /**
+   * RGBX pixel format.  The red, green, and blue components in the image are
+   * stored in 4-sample pixels in the order R, G, B from lowest to highest
+   * memory address within each pixel.  The X component is ignored when
+   * compressing and undefined when decompressing.
+   */
+  TJPF_RGBX,
+  /**
+   * BGRX pixel format.  The red, green, and blue components in the image are
+   * stored in 4-sample pixels in the order B, G, R from lowest to highest
+   * memory address within each pixel.  The X component is ignored when
+   * compressing and undefined when decompressing.
+   */
+  TJPF_BGRX,
+  /**
+   * XBGR pixel format.  The red, green, and blue components in the image are
+   * stored in 4-sample pixels in the order R, G, B from highest to lowest
+   * memory address within each pixel.  The X component is ignored when
+   * compressing and undefined when decompressing.
+   */
+  TJPF_XBGR,
+  /**
+   * XRGB pixel format.  The red, green, and blue components in the image are
+   * stored in 4-sample pixels in the order B, G, R from highest to lowest
+   * memory address within each pixel.  The X component is ignored when
+   * compressing and undefined when decompressing.
+   */
+  TJPF_XRGB,
+  /**
+   * Grayscale pixel format.  Each 1-sample pixel represents a luminance
+   * (brightness) level from 0 to the maximum sample value (255 for 8-bit
+   * samples, 4095 for 12-bit samples, and 65535 for 16-bit samples.)
+   */
+  TJPF_GRAY,
+  /**
+   * RGBA pixel format.  This is the same as @ref TJPF_RGBX, except that when
+   * decompressing, the X component is guaranteed to be equal to the maximum
+   * sample value, which can be interpreted as an opaque alpha channel.
+   */
+  TJPF_RGBA,
+  /**
+   * BGRA pixel format.  This is the same as @ref TJPF_BGRX, except that when
+   * decompressing, the X component is guaranteed to be equal to the maximum
+   * sample value, which can be interpreted as an opaque alpha channel.
+   */
+  TJPF_BGRA,
+  /**
+   * ABGR pixel format.  This is the same as @ref TJPF_XBGR, except that when
+   * decompressing, the X component is guaranteed to be equal to the maximum
+   * sample value, which can be interpreted as an opaque alpha channel.
+   */
+  TJPF_ABGR,
+  /**
+   * ARGB pixel format.  This is the same as @ref TJPF_XRGB, except that when
+   * decompressing, the X component is guaranteed to be equal to the maximum
+   * sample value, which can be interpreted as an opaque alpha channel.
+   */
+  TJPF_ARGB,
+  /**
+   * CMYK pixel format.  Unlike RGB, which is an additive color model used
+   * primarily for display, CMYK (Cyan/Magenta/Yellow/Key) is a subtractive
+   * color model used primarily for printing.  In the CMYK color model, the
+   * value of each color component typically corresponds to an amount of cyan,
+   * magenta, yellow, or black ink that is applied to a white background.  In
+   * order to convert between CMYK and RGB, it is necessary to use a color
+   * management system (CMS.)  A CMS will attempt to map colors within the
+   * printer's gamut to perceptually similar colors in the display's gamut and
+   * vice versa, but the mapping is typically not 1:1 or reversible, nor can it
+   * be defined with a simple formula.  Thus, such a conversion is out of scope
+   * for a codec library.  However, the TurboJPEG API allows for compressing
+   * packed-pixel CMYK images into YCCK JPEG images (see #TJCS_YCCK) and
+   * decompressing YCCK JPEG images into packed-pixel CMYK images.
+   */
+  TJPF_CMYK,
+  /**
+   * Unknown pixel format.  Currently this is only used by #tj3LoadImage8(),
+   * #tj3LoadImage12(), and #tj3LoadImage16().
+   */
+  TJPF_UNKNOWN = -1
+};
+
+/**
+ * Red offset (in samples) for a given pixel format.  This specifies the number
+ * of samples that the red component is offset from the start of the pixel.
+ * For instance, if an 8-bit-per-component pixel of format TJPF_BGRX is stored
+ * in `unsigned char pixel[]`, then the red component will be
+ * `pixel[tjRedOffset[TJPF_BGRX]]`.  This will be -1 if the pixel format does
+ * not have a red component.
+ */
+static const int tjRedOffset[TJ_NUMPF] = {
+  0, 2, 0, 2, 3, 1, -1, 0, 2, 3, 1, -1
+};
+/**
+ * Green offset (in samples) for a given pixel format.  This specifies the
+ * number of samples that the green component is offset from the start of the
+ * pixel.  For instance, if an 8-bit-per-component pixel of format TJPF_BGRX is
+ * stored in `unsigned char pixel[]`, then the green component will be
+ * `pixel[tjGreenOffset[TJPF_BGRX]]`.  This will be -1 if the pixel format does
+ * not have a green component.
+ */
+static const int tjGreenOffset[TJ_NUMPF] = {
+  1, 1, 1, 1, 2, 2, -1, 1, 1, 2, 2, -1
+};
+/**
+ * Blue offset (in samples) for a given pixel format.  This specifies the
+ * number of samples that the blue component is offset from the start of the
+ * pixel.  For instance, if an 8-bit-per-component pixel of format TJPF_BGRX is
+ * stored in `unsigned char pixel[]`, then the blue component will be
+ * `pixel[tjBlueOffset[TJPF_BGRX]]`.  This will be -1 if the pixel format does
+ * not have a blue component.
+ */
+static const int tjBlueOffset[TJ_NUMPF] = {
+  2, 0, 2, 0, 1, 3, -1, 2, 0, 1, 3, -1
+};
+/**
+ * Alpha offset (in samples) for a given pixel format.  This specifies the
+ * number of samples that the alpha component is offset from the start of the
+ * pixel.  For instance, if an 8-bit-per-component pixel of format TJPF_BGRA is
+ * stored in `unsigned char pixel[]`, then the alpha component will be
+ * `pixel[tjAlphaOffset[TJPF_BGRA]]`.  This will be -1 if the pixel format does
+ * not have an alpha component.
+ */
+static const int tjAlphaOffset[TJ_NUMPF] = {
+  -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1
+};
+/**
+ * Pixel size (in samples) for a given pixel format
+ */
+static const int tjPixelSize[TJ_NUMPF] = {
+  3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4
+};
+
+
+/**
+ * The number of JPEG colorspaces
+ */
+#define TJ_NUMCS  5
+
+/**
+ * JPEG colorspaces
+ */
+enum TJCS {
+  /**
+   * RGB colorspace.  When compressing the JPEG image, the R, G, and B
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  RGB JPEG images can be
+   * compressed from and decompressed to packed-pixel images with any of the
+   * extended RGB or grayscale pixel formats, but they cannot be compressed
+   * from or decompressed to planar YUV images.
+   */
+  TJCS_RGB,
+  /**
+   * YCbCr colorspace.  YCbCr is not an absolute colorspace but rather a
+   * mathematical transformation of RGB designed solely for storage and
+   * transmission.  YCbCr images must be converted to RGB before they can
+   * actually be displayed.  In the YCbCr colorspace, the Y (luminance)
+   * component represents the black & white portion of the original image, and
+   * the Cb and Cr (chrominance) components represent the color portion of the
+   * original image.  Originally, the analog equivalent of this transformation
+   * allowed the same signal to drive both black & white and color televisions,
+   * but JPEG images use YCbCr primarily because it allows the color data to be
+   * optionally subsampled for the purposes of reducing network or disk usage.
+   * YCbCr is the most common JPEG colorspace, and YCbCr JPEG images can be
+   * compressed from and decompressed to packed-pixel images with any of the
+   * extended RGB or grayscale pixel formats.  YCbCr JPEG images can also be
+   * compressed from and decompressed to planar YUV images.
+   */
+  TJCS_YCbCr,
+  /**
+   * Grayscale colorspace.  The JPEG image retains only the luminance data (Y
+   * component), and any color data from the source image is discarded.
+   * Grayscale JPEG images can be compressed from and decompressed to
+   * packed-pixel images with any of the extended RGB or grayscale pixel
+   * formats, or they can be compressed from and decompressed to planar YUV
+   * images.
+   */
+  TJCS_GRAY,
+  /**
+   * CMYK colorspace.  When compressing the JPEG image, the C, M, Y, and K
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  CMYK JPEG images can
+   * only be compressed from and decompressed to packed-pixel images with the
+   * CMYK pixel format.
+   */
+  TJCS_CMYK,
+  /**
+   * YCCK colorspace.  YCCK (AKA "YCbCrK") is not an absolute colorspace but
+   * rather a mathematical transformation of CMYK designed solely for storage
+   * and transmission.  It is to CMYK as YCbCr is to RGB.  CMYK pixels can be
+   * reversibly transformed into YCCK, and as with YCbCr, the chrominance
+   * components in the YCCK pixels can be subsampled without incurring major
+   * perceptual loss.  YCCK JPEG images can only be compressed from and
+   * decompressed to packed-pixel images with the CMYK pixel format.
+   */
+  TJCS_YCCK
+};
+
+
+/**
+ * Parameters
+ */
+enum TJPARAM {
+  /**
+   * Error handling behavior
+   *
+   * **Value**
+   * - `0` *[default]* Allow the current compression/decompression/transform
+   * operation to complete unless a fatal error is encountered.
+   * - `1` Immediately discontinue the current
+   * compression/decompression/transform operation if a warning (non-fatal
+   * error) occurs.
+   */
+  TJPARAM_STOPONWARNING,
+  /**
+   * Row order in packed-pixel source/destination images
+   *
+   * **Value**
+   * - `0` *[default]* top-down (X11) order
+   * - `1` bottom-up (Windows, OpenGL) order
+   */
+  TJPARAM_BOTTOMUP,
+  /**
+   * JPEG destination buffer (re)allocation [compression, lossless
+   * transformation]
+   *
+   * **Value**
+   * - `0` *[default]* Attempt to allocate or reallocate the JPEG destination
+   * buffer as needed.
+   * - `1` Generate an error if the JPEG destination buffer is invalid or too
+   * small.
+   */
+  TJPARAM_NOREALLOC,
+  /**
+   * Perceptual quality of lossy JPEG images [compression only]
+   *
+   * **Value**
+   * - `1`-`100` (`1` = worst quality but best compression, `100` = best
+   * quality but worst compression) *[no default; must be explicitly
+   * specified]*
+   */
+  TJPARAM_QUALITY,
+  /**
+   * Chrominance subsampling level
+   *
+   * The JPEG or YUV image uses (decompression, decoding) or will use (lossy
+   * compression, encoding) the specified level of chrominance subsampling.
+   *
+   * **Value**
+   * - One of the @ref TJSAMP "chrominance subsampling options" *[no default;
+   * must be explicitly specified for lossy compression, encoding, and
+   * decoding]*
+   */
+  TJPARAM_SUBSAMP,
+  /**
+   * JPEG width (in pixels) [decompression only, read-only]
+   */
+  TJPARAM_JPEGWIDTH,
+  /**
+   * JPEG height (in pixels) [decompression only, read-only]
+   */
+  TJPARAM_JPEGHEIGHT,
+  /**
+   * JPEG data precision (bits per sample) [decompression only, read-only]
+   *
+   * The JPEG image uses the specified number of bits per sample.
+   *
+   * **Value**
+   * - `8`, `12`, or `16`
+   *
+   * 12-bit data precision implies #TJPARAM_OPTIMIZE unless #TJPARAM_ARITHMETIC
+   * is set.
+   */
+  TJPARAM_PRECISION,
+  /**
+   * JPEG colorspace
+   *
+   * The JPEG image uses (decompression) or will use (lossy compression) the
+   * specified colorspace.
+   *
+   * **Value**
+   * - One of the @ref TJCS "JPEG colorspaces" *[default for lossy compression:
+   * automatically selected based on the subsampling level and pixel format]*
+   */
+  TJPARAM_COLORSPACE,
+  /**
+   * Chrominance upsampling algorithm [lossy decompression only]
+   *
+   * **Value**
+   * - `0` *[default]* Use smooth upsampling when decompressing a JPEG image
+   * that was compressed using chrominance subsampling.  This creates a smooth
+   * transition between neighboring chrominance components in order to reduce
+   * upsampling artifacts in the decompressed image.
+   * - `1` Use the fastest chrominance upsampling algorithm available, which
+   * may combine upsampling with color conversion.
+   */
+  TJPARAM_FASTUPSAMPLE,
+  /**
+   * DCT/IDCT algorithm [lossy compression and decompression]
+   *
+   * **Value**
+   * - `0` *[default]* Use the most accurate DCT/IDCT algorithm available.
+   * - `1` Use the fastest DCT/IDCT algorithm available.
+   *
+   * This parameter is provided mainly for backward compatibility with libjpeg,
+   * which historically implemented several different DCT/IDCT algorithms
+   * because of performance limitations with 1990s CPUs.  In the libjpeg-turbo
+   * implementation of the TurboJPEG API:
+   * - The "fast" and "accurate" DCT/IDCT algorithms perform similarly on
+   * modern x86/x86-64 CPUs that support AVX2 instructions.
+   * - The "fast" algorithm is generally only about 5-15% faster than the
+   * "accurate" algorithm on other types of CPUs.
+   * - The difference in accuracy between the "fast" and "accurate" algorithms
+   * is the most pronounced at JPEG quality levels above 90 and tends to be
+   * more pronounced with decompression than with compression.
+   * - The "fast" algorithm degrades and is not fully accelerated for JPEG
+   * quality levels above 97, so it will be slower than the "accurate"
+   * algorithm.
+   */
+  TJPARAM_FASTDCT,
+  /**
+   * Optimized baseline entropy coding [lossy compression only]
+   *
+   * **Value**
+   * - `0` *[default]* The JPEG image will use the default Huffman tables.
+   * - `1` Optimal Huffman tables will be computed for the JPEG image.  For
+   * lossless transformation, this can also be specified using
+   * #TJXOPT_OPTIMIZE.
+   *
+   * Optimized baseline entropy coding will improve compression slightly
+   * (generally 5% or less), but it will reduce compression performance
+   * considerably.
+   */
+  TJPARAM_OPTIMIZE,
+  /**
+   * Progressive entropy coding
+   *
+   * **Value**
+   * - `0` *[default for compression, lossless transformation]* The lossy JPEG
+   * image uses (decompression) or will use (compression, lossless
+   * transformation) baseline entropy coding.
+   * - `1` The lossy JPEG image uses (decompression) or will use (compression,
+   * lossless transformation) progressive entropy coding.  For lossless
+   * transformation, this can also be specified using #TJXOPT_PROGRESSIVE.
+   *
+   * Progressive entropy coding will generally improve compression relative to
+   * baseline entropy coding, but it will reduce compression and decompression
+   * performance considerably.  Can be combined with #TJPARAM_ARITHMETIC.
+   * Implies #TJPARAM_OPTIMIZE unless #TJPARAM_ARITHMETIC is also set.
+   */
+  TJPARAM_PROGRESSIVE,
+  /**
+   * Progressive JPEG scan limit for lossy JPEG images [decompression, lossless
+   * transformation]
+   *
+   * Setting this parameter will cause the decompression and transform
+   * functions to return an error if the number of scans in a progressive JPEG
+   * image exceeds the specified limit.  The primary purpose of this is to
+   * allow security-critical applications to guard against an exploit of the
+   * progressive JPEG format described in
+   * <a href="https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf" target="_blank">this report</a>.
+   *
+   * **Value**
+   * - maximum number of progressive JPEG scans that the decompression and
+   * transform functions will process *[default: `0` (no limit)]*
+   *
+   * @see #TJPARAM_PROGRESSIVE
+   */
+  TJPARAM_SCANLIMIT,
+  /**
+   * Arithmetic entropy coding
+   *
+   * **Value**
+   * - `0` *[default for compression, lossless transformation]* The lossy JPEG
+   * image uses (decompression) or will use (compression, lossless
+   * transformation) Huffman entropy coding.
+   * - `1` The lossy JPEG image uses (decompression) or will use (compression,
+   * lossless transformation) arithmetic entropy coding.  For lossless
+   * transformation, this can also be specified using #TJXOPT_ARITHMETIC.
+   *
+   * Arithmetic entropy coding will generally improve compression relative to
+   * Huffman entropy coding, but it will reduce compression and decompression
+   * performance considerably.  Can be combined with #TJPARAM_PROGRESSIVE.
+   */
+  TJPARAM_ARITHMETIC,
+  /**
+   * Lossless JPEG
+   *
+   * **Value**
+   * - `0` *[default for compression]* The JPEG image is (decompression) or
+   * will be (compression) lossy/DCT-based.
+   * - `1` The JPEG image is (decompression) or will be (compression)
+   * lossless/predictive.
+   *
+   * In most cases, compressing and decompressing lossless JPEG images is
+   * considerably slower than compressing and decompressing lossy JPEG images,
+   * and lossless JPEG images are much larger than lossy JPEG images.  Thus,
+   * lossless JPEG images are typically used only for applications that require
+   * mathematically lossless compression.  Also note that the following
+   * features are not available with lossless JPEG images:
+   * - Colorspace conversion (lossless JPEG images always use #TJCS_RGB,
+   * #TJCS_GRAY, or #TJCS_CMYK, depending on the pixel format of the source
+   * image)
+   * - Chrominance subsampling (lossless JPEG images always use #TJSAMP_444)
+   * - JPEG quality selection
+   * - DCT/IDCT algorithm selection
+   * - Progressive entropy coding
+   * - Arithmetic entropy coding
+   * - Compression from/decompression to planar YUV images
+   * - Decompression scaling
+   * - Lossless transformation
+   *
+   * @see #TJPARAM_LOSSLESSPSV, #TJPARAM_LOSSLESSPT
+   */
+  TJPARAM_LOSSLESS,
+  /**
+   * Lossless JPEG predictor selection value (PSV)
+   *
+   * **Value**
+   * - `1`-`7` *[default for compression: `1`]*
+   *
+   * Lossless JPEG compression shares no algorithms with lossy JPEG
+   * compression.  Instead, it uses differential pulse-code modulation (DPCM),
+   * an algorithm whereby each sample is encoded as the difference between the
+   * sample's value and a "predictor", which is based on the values of
+   * neighboring samples.  If Ra is the sample immediately to the left of the
+   * current sample, Rb is the sample immediately above the current sample, and
+   * Rc is the sample diagonally to the left and above the current sample, then
+   * the relationship between the predictor selection value and the predictor
+   * is as follows:
+   *
+   * PSV | Predictor
+   * ----|----------
+   * 1   | Ra
+   * 2   | Rb
+   * 3   | Rc
+   * 4   | Ra + Rb – Rc
+   * 5   | Ra + (Rb – Rc) / 2
+   * 6   | Rb + (Ra – Rc) / 2
+   * 7   | (Ra + Rb) / 2
+   *
+   * Predictors 1-3 are 1-dimensional predictors, whereas Predictors 4-7 are
+   * 2-dimensional predictors.  The best predictor for a particular image
+   * depends on the image.
+   *
+   * @see #TJPARAM_LOSSLESS
+   */
+  TJPARAM_LOSSLESSPSV,
+  /**
+   * Lossless JPEG point transform (Pt)
+   *
+   * **Value**
+   * - `0` through ***precision*** *- 1*, where ***precision*** is the JPEG
+   * data precision in bits *[default for compression: `0`]*
+   *
+   * A point transform value of `0` is necessary in order to generate a fully
+   * lossless JPEG image.  (A non-zero point transform value right-shifts the
+   * input samples by the specified number of bits, which is effectively a form
+   * of lossy color quantization.)
+   *
+   * @see #TJPARAM_LOSSLESS, #TJPARAM_PRECISION
+   */
+  TJPARAM_LOSSLESSPT,
+  /**
+   * JPEG restart marker interval in MCU blocks (lossy) or samples (lossless)
+   * [compression only]
+   *
+   * The nature of entropy coding is such that a corrupt JPEG image cannot
+   * be decompressed beyond the point of corruption unless it contains restart
+   * markers.  A restart marker stops and restarts the entropy coding algorithm
+   * so that, if a JPEG image is corrupted, decompression can resume at the
+   * next marker.  Thus, adding more restart markers improves the fault
+   * tolerance of the JPEG image, but adding too many restart markers can
+   * adversely affect the compression ratio and performance.
+   *
+   * **Value**
+   * - the number of MCU blocks or samples between each restart marker
+   * *[default: `0` (no restart markers)]*
+   *
+   * Setting this parameter to a non-zero value sets #TJPARAM_RESTARTROWS to 0.
+   */
+  TJPARAM_RESTARTBLOCKS,
+  /**
+   * JPEG restart marker interval in MCU rows (lossy) or sample rows (lossless)
+   * [compression only]
+   *
+   * See #TJPARAM_RESTARTBLOCKS for a description of restart markers.
+   *
+   * **Value**
+   * - the number of MCU rows or sample rows between each restart marker
+   * *[default: `0` (no restart markers)]*
+   *
+   * Setting this parameter to a non-zero value sets #TJPARAM_RESTARTBLOCKS to
+   * 0.
+   */
+  TJPARAM_RESTARTROWS,
+  /**
+   * JPEG horizontal pixel density
+   *
+   * **Value**
+   * - The JPEG image has (decompression) or will have (compression) the
+   * specified horizontal pixel density *[default for compression: `1`]*.
+   *
+   * This value is stored in or read from the JPEG header.  It does not affect
+   * the contents of the JPEG image.  Note that this parameter is set by
+   * #tj3LoadImage8() when loading a Windows BMP file that contains pixel
+   * density information, and the value of this parameter is stored to a
+   * Windows BMP file by #tj3SaveImage8() if the value of #TJPARAM_DENSITYUNITS
+   * is `2`.
+   *
+   * @see TJPARAM_DENSITYUNITS
+   */
+  TJPARAM_XDENSITY,
+  /**
+   * JPEG vertical pixel density
+   *
+   * **Value**
+   * - The JPEG image has (decompression) or will have (compression) the
+   * specified vertical pixel density *[default for compression: `1`]*.
+   *
+   * This value is stored in or read from the JPEG header.  It does not affect
+   * the contents of the JPEG image.  Note that this parameter is set by
+   * #tj3LoadImage8() when loading a Windows BMP file that contains pixel
+   * density information, and the value of this parameter is stored to a
+   * Windows BMP file by #tj3SaveImage8() if the value of #TJPARAM_DENSITYUNITS
+   * is `2`.
+   *
+   * @see TJPARAM_DENSITYUNITS
+   */
+  TJPARAM_YDENSITY,
+  /**
+   * JPEG pixel density units
+   *
+   * **Value**
+   * - `0` *[default for compression]* The pixel density of the JPEG image is
+   * expressed (decompression) or will be expressed (compression) in unknown
+   * units.
+   * - `1` The pixel density of the JPEG image is expressed (decompression) or
+   * will be expressed (compression) in units of pixels/inch.
+   * - `2` The pixel density of the JPEG image is expressed (decompression) or
+   * will be expressed (compression) in units of pixels/cm.
+   *
+   * This value is stored in or read from the JPEG header.  It does not affect
+   * the contents of the JPEG image.  Note that this parameter is set by
+   * #tj3LoadImage8() when loading a Windows BMP file that contains pixel
+   * density information, and the value of this parameter is stored to a
+   * Windows BMP file by #tj3SaveImage8() if the value is `2`.
+   *
+   * @see TJPARAM_XDENSITY, TJPARAM_YDENSITY
+   */
+  TJPARAM_DENSITYUNITS,
+  /**
+   * Memory limit for intermediate buffers
+   *
+   * **Value**
+   * - the maximum amount of memory (in megabytes) that will be allocated for
+   * intermediate buffers, which are used with progressive JPEG compression and
+   * decompression, optimized baseline entropy coding, lossless JPEG
+   * compression, and lossless transformation *[default: `0` (no limit)]*
+   */
+  TJPARAM_MAXMEMORY,
+  /**
+   * Image size limit [decompression, lossless transformation, packed-pixel
+   * image loading]
+   *
+   * Setting this parameter will cause the decompression, transform, and image
+   * loading functions to return an error if the number of pixels in the source
+   * image exceeds the specified limit.  This allows security-critical
+   * applications to guard against excessive memory consumption.
+   *
+   * **Value**
+   * - maximum number of pixels that the decompression, transform, and image
+   * loading functions will process *[default: `0` (no limit)]*
+   */
+  TJPARAM_MAXPIXELS
+};
+
+
+/**
+ * The number of error codes
+ */
+#define TJ_NUMERR  2
+
+/**
+ * Error codes
+ */
+enum TJERR {
+  /**
+   * The error was non-fatal and recoverable, but the destination image may
+   * still be corrupt.
+   */
+  TJERR_WARNING,
+  /**
+   * The error was fatal and non-recoverable.
+   */
+  TJERR_FATAL
+};
+
+
+/**
+ * The number of transform operations
+ */
+#define TJ_NUMXOP  8
+
+/**
+ * Transform operations for #tj3Transform()
+ */
+enum TJXOP {
+  /**
+   * Do not transform the position of the image pixels
+   */
+  TJXOP_NONE,
+  /**
+   * Flip (mirror) image horizontally.  This transform is imperfect if there
+   * are any partial MCU blocks on the right edge (see #TJXOPT_PERFECT.)
+   */
+  TJXOP_HFLIP,
+  /**
+   * Flip (mirror) image vertically.  This transform is imperfect if there are
+   * any partial MCU blocks on the bottom edge (see #TJXOPT_PERFECT.)
+   */
+  TJXOP_VFLIP,
+  /**
+   * Transpose image (flip/mirror along upper left to lower right axis.)  This
+   * transform is always perfect.
+   */
+  TJXOP_TRANSPOSE,
+  /**
+   * Transverse transpose image (flip/mirror along upper right to lower left
+   * axis.)  This transform is imperfect if there are any partial MCU blocks in
+   * the image (see #TJXOPT_PERFECT.)
+   */
+  TJXOP_TRANSVERSE,
+  /**
+   * Rotate image clockwise by 90 degrees.  This transform is imperfect if
+   * there are any partial MCU blocks on the bottom edge (see
+   * #TJXOPT_PERFECT.)
+   */
+  TJXOP_ROT90,
+  /**
+   * Rotate image 180 degrees.  This transform is imperfect if there are any
+   * partial MCU blocks in the image (see #TJXOPT_PERFECT.)
+   */
+  TJXOP_ROT180,
+  /**
+   * Rotate image counter-clockwise by 90 degrees.  This transform is imperfect
+   * if there are any partial MCU blocks on the right edge (see
+   * #TJXOPT_PERFECT.)
+   */
+  TJXOP_ROT270
+};
+
+
+/**
+ * This option will cause #tj3Transform() to return an error if the transform
+ * is not perfect.  Lossless transforms operate on MCU blocks, whose size
+ * depends on the level of chrominance subsampling used (see #tjMCUWidth and
+ * #tjMCUHeight.)  If the image's width or height is not evenly divisible by
+ * the MCU block size, then there will be partial MCU blocks on the right
+ * and/or bottom edges.  It is not possible to move these partial MCU blocks to
+ * the top or left of the image, so any transform that would require that is
+ * "imperfect."  If this option is not specified, then any partial MCU blocks
+ * that cannot be transformed will be left in place, which will create
+ * odd-looking strips on the right or bottom edge of the image.
+ */
+#define TJXOPT_PERFECT  (1 << 0)
+/**
+ * This option will cause #tj3Transform() to discard any partial MCU blocks
+ * that cannot be transformed.
+ */
+#define TJXOPT_TRIM  (1 << 1)
+/**
+ * This option will enable lossless cropping.  See #tj3Transform() for more
+ * information.
+ */
+#define TJXOPT_CROP  (1 << 2)
+/**
+ * This option will discard the color data in the source image and produce a
+ * grayscale destination image.
+ */
+#define TJXOPT_GRAY  (1 << 3)
+/**
+ * This option will prevent #tj3Transform() from outputting a JPEG image for
+ * this particular transform.  (This can be used in conjunction with a custom
+ * filter to capture the transformed DCT coefficients without transcoding
+ * them.)
+ */
+#define TJXOPT_NOOUTPUT  (1 << 4)
+/**
+ * This option will enable progressive entropy coding in the JPEG image
+ * generated by this particular transform.  Progressive entropy coding will
+ * generally improve compression relative to baseline entropy coding (the
+ * default), but it will reduce decompression performance considerably.
+ * Can be combined with #TJXOPT_ARITHMETIC.  Implies #TJXOPT_OPTIMIZE unless
+ * #TJXOPT_ARITHMETIC is also specified.
+ */
+#define TJXOPT_PROGRESSIVE  (1 << 5)
+/**
+ * This option will prevent #tj3Transform() from copying any extra markers
+ * (including EXIF and ICC profile data) from the source image to the
+ * destination image.
+ */
+#define TJXOPT_COPYNONE  (1 << 6)
+/**
+ * This option will enable arithmetic entropy coding in the JPEG image
+ * generated by this particular transform.  Arithmetic entropy coding will
+ * generally improve compression relative to Huffman entropy coding (the
+ * default), but it will reduce decompression performance considerably.  Can be
+ * combined with #TJXOPT_PROGRESSIVE.
+ */
+#define TJXOPT_ARITHMETIC  (1 << 7)
+/**
+ * This option will enable optimized baseline entropy coding in the JPEG image
+ * generated by this particular transform.  Optimized baseline entropy coding
+ * will improve compression slightly (generally 5% or less.)
+ */
+#define TJXOPT_OPTIMIZE  (1 << 8)
+
+
+/**
+ * Scaling factor
+ */
+typedef struct {
+  /**
+   * Numerator
+   */
+  int num;
+  /**
+   * Denominator
+   */
+  int denom;
+} tjscalingfactor;
+
+/**
+ * Cropping region
+ */
+typedef struct {
+  /**
+   * The left boundary of the cropping region.  This must be evenly divisible
+   * by the MCU block width (see #tjMCUWidth.)
+   */
+  int x;
+  /**
+   * The upper boundary of the cropping region.  For lossless transformation,
+   * this must be evenly divisible by the MCU block height (see #tjMCUHeight.)
+   */
+  int y;
+  /**
+   * The width of the cropping region.  Setting this to 0 is the equivalent of
+   * setting it to the width of the source JPEG image - x.
+   */
+  int w;
+  /**
+   * The height of the cropping region.  Setting this to 0 is the equivalent of
+   * setting it to the height of the source JPEG image - y.
+   */
+  int h;
+} tjregion;
+
+/**
+ * A #tjregion structure that specifies no cropping
+ */
+static const tjregion TJUNCROPPED = { 0, 0, 0, 0 };
+
+/**
+ * Lossless transform
+ */
+typedef struct tjtransform {
+  /**
+   * Cropping region
+   */
+  tjregion r;
+  /**
+   * One of the @ref TJXOP "transform operations"
+   */
+  int op;
+  /**
+   * The bitwise OR of one of more of the @ref TJXOPT_ARITHMETIC
+   * "transform options"
+   */
+  int options;
+  /**
+   * Arbitrary data that can be accessed within the body of the callback
+   * function
+   */
+  void *data;
+  /**
+   * A callback function that can be used to modify the DCT coefficients after
+   * they are losslessly transformed but before they are transcoded to a new
+   * JPEG image.  This allows for custom filters or other transformations to be
+   * applied in the frequency domain.
+   *
+   * @param coeffs pointer to an array of transformed DCT coefficients.  (NOTE:
+   * this pointer is not guaranteed to be valid once the callback returns, so
+   * applications wishing to hand off the DCT coefficients to another function
+   * or library should make a copy of them within the body of the callback.)
+   *
+   * @param arrayRegion #tjregion structure containing the width and height of
+   * the array pointed to by `coeffs` as well as its offset relative to the
+   * component plane.  TurboJPEG implementations may choose to split each
+   * component plane into multiple DCT coefficient arrays and call the callback
+   * function once for each array.
+   *
+   * @param planeRegion #tjregion structure containing the width and height of
+   * the component plane to which `coeffs` belongs
+   *
+   * @param componentID ID number of the component plane to which `coeffs`
+   * belongs.  (Y, Cb, and Cr have, respectively, ID's of 0, 1, and 2 in
+   * typical JPEG images.)
+   *
+   * @param transformID ID number of the transformed image to which `coeffs`
+   * belongs.  This is the same as the index of the transform in the
+   * `transforms` array that was passed to #tj3Transform().
+   *
+   * @param transform a pointer to a #tjtransform structure that specifies the
+   * parameters and/or cropping region for this transform
+   *
+   * @return 0 if the callback was successful, or -1 if an error occurred.
+   */
+  int (*customFilter) (short *coeffs, tjregion arrayRegion,
+                       tjregion planeRegion, int componentID, int transformID,
+                       struct tjtransform *transform);
+} tjtransform;
+
+/**
+ * TurboJPEG instance handle
+ */
+typedef void *tjhandle;
+
+
+/**
+ * Compute the scaled value of `dimension` using the given scaling factor.
+ * This macro performs the integer equivalent of `ceil(dimension *
+ * scalingFactor)`.
+ */
+#define TJSCALED(dimension, scalingFactor) \
+  (((dimension) * scalingFactor.num + scalingFactor.denom - 1) / \
+   scalingFactor.denom)
+
+/**
+ * A #tjscalingfactor structure that specifies a scaling factor of 1/1 (no
+ * scaling)
+ */
+static const tjscalingfactor TJUNSCALED = { 1, 1 };
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/**
+ * Create a new TurboJPEG instance.
+ *
+ * @param initType one of the @ref TJINIT "initialization options"
+ *
+ * @return a handle to the newly-created instance, or NULL if an error occurred
+ * (see #tj3GetErrorStr().)
+ */
+DLLEXPORT tjhandle tj3Init(int initType);
+
+
+/**
+ * Set the value of a parameter.
+ *
+ * @param handle handle to a TurboJPEG instance
+ *
+ * @param param one of the @ref TJPARAM "parameters"
+ *
+ * @param value value of the parameter (refer to @ref TJPARAM
+ * "parameter documentation")
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr().)
+ */
+DLLEXPORT int tj3Set(tjhandle handle, int param, int value);
+
+
+/**
+ * Get the value of a parameter.
+ *
+ * @param handle handle to a TurboJPEG instance
+ *
+ * @param param one of the @ref TJPARAM "parameters"
+ *
+ * @return the value of the specified parameter, or -1 if the value is unknown.
+ */
+DLLEXPORT int tj3Get(tjhandle handle, int param);
+
+
+/**
+ * Compress an 8-bit-per-sample packed-pixel RGB, grayscale, or CMYK image into
+ * an 8-bit-per-sample JPEG image.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * compression
+ *
+ * @param srcBuf pointer to a buffer containing a packed-pixel RGB, grayscale,
+ * or CMYK source image to be compressed.  This buffer should normally be
+ * `pitch * height` samples in size.  However, you can also use this parameter
+ * to compress from a specific region of a larger buffer.
+ *
+ * @param width width (in pixels) of the source image
+ *
+ * @param pitch samples per row in the source image.  Normally this should be
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>, if the image is unpadded.
+ * (Setting this parameter to 0 is the equivalent of setting it to
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>.)  However, you can also use this
+ * parameter to specify the row alignment/padding of the source image, to skip
+ * rows, or to compress from a specific region of a larger buffer.
+ *
+ * @param height height (in pixels) of the source image
+ *
+ * @param pixelFormat pixel format of the source image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @param jpegBuf address of a pointer to a byte buffer that will receive the
+ * JPEG image.  TurboJPEG has the ability to reallocate the JPEG buffer to
+ * accommodate the size of the JPEG image.  Thus, you can choose to:
+ * -# pre-allocate the JPEG buffer with an arbitrary size using #tj3Alloc() and
+ * let TurboJPEG grow the buffer as needed,
+ * -# set `*jpegBuf` to NULL to tell TurboJPEG to allocate the buffer for you,
+ * or
+ * -# pre-allocate the buffer to a "worst case" size determined by calling
+ * #tj3JPEGBufSize().  This should ensure that the buffer never has to be
+ * re-allocated.  (Setting #TJPARAM_NOREALLOC guarantees that it won't be.)
+ * .
+ * If you choose option 1, then `*jpegSize` should be set to the size of your
+ * pre-allocated buffer.  In any case, unless you have set #TJPARAM_NOREALLOC,
+ * you should always check `*jpegBuf` upon return from this function, as it may
+ * have changed.
+ *
+ * @param jpegSize pointer to a size_t variable that holds the size of the JPEG
+ * buffer.  If `*jpegBuf` points to a pre-allocated buffer, then `*jpegSize`
+ * should be set to the size of the buffer.  Upon return, `*jpegSize` will
+ * contain the size of the JPEG image (in bytes.)  If `*jpegBuf` points to a
+ * JPEG buffer that is being reused from a previous call to one of the JPEG
+ * compression functions, then `*jpegSize` is ignored.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3Compress8(tjhandle handle, const unsigned char *srcBuf,
+                           int width, int pitch, int height, int pixelFormat,
+                           unsigned char **jpegBuf, size_t *jpegSize);
+
+/**
+ * Compress a 12-bit-per-sample packed-pixel RGB, grayscale, or CMYK image into
+ * a 12-bit-per-sample JPEG image.
+ *
+ * \details \copydetails tj3Compress8()
+ */
+DLLEXPORT int tj3Compress12(tjhandle handle, const short *srcBuf, int width,
+                            int pitch, int height, int pixelFormat,
+                            unsigned char **jpegBuf, size_t *jpegSize);
+
+/**
+ * Compress a 16-bit-per-sample packed-pixel RGB, grayscale, or CMYK image into
+ * a 16-bit-per-sample lossless JPEG image.
+ *
+ * \details \copydetails tj3Compress8()
+ */
+DLLEXPORT int tj3Compress16(tjhandle handle, const unsigned short *srcBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            unsigned char **jpegBuf, size_t *jpegSize);
+
+
+/**
+ * Compress an 8-bit-per-sample unified planar YUV image into an
+ * 8-bit-per-sample JPEG image.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * compression
+ *
+ * @param srcBuf pointer to a buffer containing a unified planar YUV source
+ * image to be compressed.  The size of this buffer should match the value
+ * returned by #tj3YUVBufSize() for the given image width, height, row
+ * alignment, and level of chrominance subsampling (see #TJPARAM_SUBSAMP.)  The
+ * Y, U (Cb), and V (Cr) image planes should be stored sequentially in the
+ * buffer.  (Refer to @ref YUVnotes "YUV Image Format Notes".)
+ *
+ * @param width width (in pixels) of the source image.  If the width is not an
+ * even multiple of the MCU block width (see #tjMCUWidth), then an intermediate
+ * buffer copy will be performed.
+ *
+ * @param align row alignment (in bytes) of the source image (must be a power
+ * of 2.)  Setting this parameter to n indicates that each row in each plane of
+ * the source image is padded to the nearest multiple of n bytes
+ * (1 = unpadded.)
+ *
+ * @param height height (in pixels) of the source image.  If the height is not
+ * an even multiple of the MCU block height (see #tjMCUHeight), then an
+ * intermediate buffer copy will be performed.
+ *
+ * @param jpegBuf address of a pointer to a byte buffer that will receive the
+ * JPEG image.  TurboJPEG has the ability to reallocate the JPEG buffer to
+ * accommodate the size of the JPEG image.  Thus, you can choose to:
+ * -# pre-allocate the JPEG buffer with an arbitrary size using #tj3Alloc() and
+ * let TurboJPEG grow the buffer as needed,
+ * -# set `*jpegBuf` to NULL to tell TurboJPEG to allocate the buffer for you,
+ * or
+ * -# pre-allocate the buffer to a "worst case" size determined by calling
+ * #tj3JPEGBufSize().  This should ensure that the buffer never has to be
+ * re-allocated.  (Setting #TJPARAM_NOREALLOC guarantees that it won't be.)
+ * .
+ * If you choose option 1, then `*jpegSize` should be set to the size of your
+ * pre-allocated buffer.  In any case, unless you have set #TJPARAM_NOREALLOC,
+ * you should always check `*jpegBuf` upon return from this function, as it may
+ * have changed.
+ *
+ * @param jpegSize pointer to a size_t variable that holds the size of the JPEG
+ * buffer.  If `*jpegBuf` points to a pre-allocated buffer, then `*jpegSize`
+ * should be set to the size of the buffer.  Upon return, `*jpegSize` will
+ * contain the size of the JPEG image (in bytes.)  If `*jpegBuf` points to a
+ * JPEG buffer that is being reused from a previous call to one of the JPEG
+ * compression functions, then `*jpegSize` is ignored.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3CompressFromYUV8(tjhandle handle,
+                                  const unsigned char *srcBuf, int width,
+                                  int align, int height,
+                                  unsigned char **jpegBuf, size_t *jpegSize);
+
+
+/**
+ * Compress a set of 8-bit-per-sample Y, U (Cb), and V (Cr) image planes into
+ * an 8-bit-per-sample JPEG image.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * compression
+ *
+ * @param srcPlanes an array of pointers to Y, U (Cb), and V (Cr) image planes
+ * (or just a Y plane, if compressing a grayscale image) that contain a YUV
+ * source image to be compressed.  These planes can be contiguous or
+ * non-contiguous in memory.  The size of each plane should match the value
+ * returned by #tj3YUVPlaneSize() for the given image width, height, strides,
+ * and level of chrominance subsampling (see #TJPARAM_SUBSAMP.)  Refer to
+ * @ref YUVnotes "YUV Image Format Notes" for more details.
+ *
+ * @param width width (in pixels) of the source image.  If the width is not an
+ * even multiple of the MCU block width (see #tjMCUWidth), then an intermediate
+ * buffer copy will be performed.
+ *
+ * @param strides an array of integers, each specifying the number of bytes per
+ * row in the corresponding plane of the YUV source image.  Setting the stride
+ * for any plane to 0 is the same as setting it to the plane width (see
+ * @ref YUVnotes "YUV Image Format Notes".)  If `strides` is NULL, then the
+ * strides for all planes will be set to their respective plane widths.  You
+ * can adjust the strides in order to specify an arbitrary amount of row
+ * padding in each plane or to create a JPEG image from a subregion of a larger
+ * planar YUV image.
+ *
+ * @param height height (in pixels) of the source image.  If the height is not
+ * an even multiple of the MCU block height (see #tjMCUHeight), then an
+ * intermediate buffer copy will be performed.
+ *
+ * @param jpegBuf address of a pointer to a byte buffer that will receive the
+ * JPEG image.  TurboJPEG has the ability to reallocate the JPEG buffer to
+ * accommodate the size of the JPEG image.  Thus, you can choose to:
+ * -# pre-allocate the JPEG buffer with an arbitrary size using #tj3Alloc() and
+ * let TurboJPEG grow the buffer as needed,
+ * -# set `*jpegBuf` to NULL to tell TurboJPEG to allocate the buffer for you,
+ * or
+ * -# pre-allocate the buffer to a "worst case" size determined by calling
+ * #tj3JPEGBufSize().  This should ensure that the buffer never has to be
+ * re-allocated.  (Setting #TJPARAM_NOREALLOC guarantees that it won't be.)
+ * .
+ * If you choose option 1, then `*jpegSize` should be set to the size of your
+ * pre-allocated buffer.  In any case, unless you have set #TJPARAM_NOREALLOC,
+ * you should always check `*jpegBuf` upon return from this function, as it may
+ * have changed.
+ *
+ * @param jpegSize pointer to a size_t variable that holds the size of the JPEG
+ * buffer.  If `*jpegBuf` points to a pre-allocated buffer, then `*jpegSize`
+ * should be set to the size of the buffer.  Upon return, `*jpegSize` will
+ * contain the size of the JPEG image (in bytes.)  If `*jpegBuf` points to a
+ * JPEG buffer that is being reused from a previous call to one of the JPEG
+ * compression functions, then `*jpegSize` is ignored.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3CompressFromYUVPlanes8(tjhandle handle,
+                                        const unsigned char * const *srcPlanes,
+                                        int width, const int *strides,
+                                        int height, unsigned char **jpegBuf,
+                                        size_t *jpegSize);
+
+
+/**
+ * The maximum size of the buffer (in bytes) required to hold a JPEG image with
+ * the given parameters.  The number of bytes returned by this function is
+ * larger than the size of the uncompressed source image.  The reason for this
+ * is that the JPEG format uses 16-bit coefficients, so it is possible for a
+ * very high-quality source image with very high-frequency content to expand
+ * rather than compress when converted to the JPEG format.  Such images
+ * represent very rare corner cases, but since there is no way to predict the
+ * size of a JPEG image prior to compression, the corner cases have to be
+ * handled.
+ *
+ * @param width width (in pixels) of the image
+ *
+ * @param height height (in pixels) of the image
+ *
+ * @param jpegSubsamp the level of chrominance subsampling to be used when
+ * generating the JPEG image (see @ref TJSAMP
+ * "Chrominance subsampling options".)  #TJSAMP_UNKNOWN is treated like
+ * #TJSAMP_444, since a buffer large enough to hold a JPEG image with no
+ * subsampling should also be large enough to hold a JPEG image with an
+ * arbitrary level of subsampling.  Note that lossless JPEG images always
+ * use #TJSAMP_444.
+ *
+ * @return the maximum size of the buffer (in bytes) required to hold the
+ * image, or 0 if the arguments are out of bounds.
+ */
+DLLEXPORT size_t tj3JPEGBufSize(int width, int height, int jpegSubsamp);
+
+
+/**
+ * The size of the buffer (in bytes) required to hold a unified planar YUV
+ * image with the given parameters.
+ *
+ * @param width width (in pixels) of the image
+ *
+ * @param align row alignment (in bytes) of the image (must be a power of 2.)
+ * Setting this parameter to n specifies that each row in each plane of the
+ * image will be padded to the nearest multiple of n bytes (1 = unpadded.)
+ *
+ * @param height height (in pixels) of the image
+ *
+ * @param subsamp level of chrominance subsampling in the image (see
+ * @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return the size of the buffer (in bytes) required to hold the image, or 0
+ * if the arguments are out of bounds.
+ */
+DLLEXPORT size_t tj3YUVBufSize(int width, int align, int height, int subsamp);
+
+
+/**
+ * The size of the buffer (in bytes) required to hold a YUV image plane with
+ * the given parameters.
+ *
+ * @param componentID ID number of the image plane (0 = Y, 1 = U/Cb, 2 = V/Cr)
+ *
+ * @param width width (in pixels) of the YUV image.  NOTE: this is the width of
+ * the whole image, not the plane width.
+ *
+ * @param stride bytes per row in the image plane.  Setting this to 0 is the
+ * equivalent of setting it to the plane width.
+ *
+ * @param height height (in pixels) of the YUV image.  NOTE: this is the height
+ * of the whole image, not the plane height.
+ *
+ * @param subsamp level of chrominance subsampling in the image (see
+ * @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return the size of the buffer (in bytes) required to hold the YUV image
+ * plane, or 0 if the arguments are out of bounds.
+ */
+DLLEXPORT size_t tj3YUVPlaneSize(int componentID, int width, int stride,
+                                 int height, int subsamp);
+
+
+/**
+ * The plane width of a YUV image plane with the given parameters.  Refer to
+ * @ref YUVnotes "YUV Image Format Notes" for a description of plane width.
+ *
+ * @param componentID ID number of the image plane (0 = Y, 1 = U/Cb, 2 = V/Cr)
+ *
+ * @param width width (in pixels) of the YUV image
+ *
+ * @param subsamp level of chrominance subsampling in the image (see
+ * @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return the plane width of a YUV image plane with the given parameters, or 0
+ * if the arguments are out of bounds.
+ */
+DLLEXPORT int tj3YUVPlaneWidth(int componentID, int width, int subsamp);
+
+
+/**
+ * The plane height of a YUV image plane with the given parameters.  Refer to
+ * @ref YUVnotes "YUV Image Format Notes" for a description of plane height.
+ *
+ * @param componentID ID number of the image plane (0 = Y, 1 = U/Cb, 2 = V/Cr)
+ *
+ * @param height height (in pixels) of the YUV image
+ *
+ * @param subsamp level of chrominance subsampling in the image (see
+ * @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return the plane height of a YUV image plane with the given parameters, or
+ * 0 if the arguments are out of bounds.
+ */
+DLLEXPORT int tj3YUVPlaneHeight(int componentID, int height, int subsamp);
+
+
+/**
+ * Encode an 8-bit-per-sample packed-pixel RGB or grayscale image into an
+ * 8-bit-per-sample unified planar YUV image.  This function performs color
+ * conversion (which is accelerated in the libjpeg-turbo implementation) but
+ * does not execute any of the other steps in the JPEG compression process.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * compression
+ *
+ * @param srcBuf pointer to a buffer containing a packed-pixel RGB or grayscale
+ * source image to be encoded.  This buffer should normally be `pitch * height`
+ * bytes in size.  However, you can also use this parameter to encode from a
+ * specific region of a larger buffer.
+ *
+ * @param width width (in pixels) of the source image
+ *
+ * @param pitch bytes per row in the source image.  Normally this should be
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>, if the image is unpadded.
+ * (Setting this parameter to 0 is the equivalent of setting it to
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>.)  However, you can also use this
+ * parameter to specify the row alignment/padding of the source image, to skip
+ * rows, or to encode from a specific region of a larger packed-pixel image.
+ *
+ * @param height height (in pixels) of the source image
+ *
+ * @param pixelFormat pixel format of the source image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @param dstBuf pointer to a buffer that will receive the unified planar YUV
+ * image.  Use #tj3YUVBufSize() to determine the appropriate size for this
+ * buffer based on the image width, height, row alignment, and level of
+ * chrominance subsampling (see #TJPARAM_SUBSAMP.)  The Y, U (Cb), and V (Cr)
+ * image planes will be stored sequentially in the buffer.  (Refer to
+ * @ref YUVnotes "YUV Image Format Notes".)
+ *
+ * @param align row alignment (in bytes) of the YUV image (must be a power of
+ * 2.)  Setting this parameter to n will cause each row in each plane of the
+ * YUV image to be padded to the nearest multiple of n bytes (1 = unpadded.)
+ * To generate images suitable for X Video, `align` should be set to 4.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3EncodeYUV8(tjhandle handle, const unsigned char *srcBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            unsigned char *dstBuf, int align);
+
+
+/**
+ * Encode an 8-bit-per-sample packed-pixel RGB or grayscale image into separate
+ * 8-bit-per-sample Y, U (Cb), and V (Cr) image planes.  This function performs
+ * color conversion (which is accelerated in the libjpeg-turbo implementation)
+ * but does not execute any of the other steps in the JPEG compression process.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * compression
+ *
+ * @param srcBuf pointer to a buffer containing a packed-pixel RGB or grayscale
+ * source image to be encoded.  This buffer should normally be `pitch * height`
+ * bytes in size.  However, you can also use this parameter to encode from a
+ * specific region of a larger buffer.
+ *
+ *
+ * @param width width (in pixels) of the source image
+ *
+ * @param pitch bytes per row in the source image.  Normally this should be
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>, if the image is unpadded.
+ * (Setting this parameter to 0 is the equivalent of setting it to
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>.)  However, you can also use this
+ * parameter to specify the row alignment/padding of the source image, to skip
+ * rows, or to encode from a specific region of a larger packed-pixel image.
+ *
+ * @param height height (in pixels) of the source image
+ *
+ * @param pixelFormat pixel format of the source image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @param dstPlanes an array of pointers to Y, U (Cb), and V (Cr) image planes
+ * (or just a Y plane, if generating a grayscale image) that will receive the
+ * encoded image.  These planes can be contiguous or non-contiguous in memory.
+ * Use #tj3YUVPlaneSize() to determine the appropriate size for each plane
+ * based on the image width, height, strides, and level of chrominance
+ * subsampling (see #TJPARAM_SUBSAMP.)  Refer to @ref YUVnotes
+ * "YUV Image Format Notes" for more details.
+ *
+ * @param strides an array of integers, each specifying the number of bytes per
+ * row in the corresponding plane of the YUV image.  Setting the stride for any
+ * plane to 0 is the same as setting it to the plane width (see @ref YUVnotes
+ * "YUV Image Format Notes".)  If `strides` is NULL, then the strides for all
+ * planes will be set to their respective plane widths.  You can adjust the
+ * strides in order to add an arbitrary amount of row padding to each plane or
+ * to encode an RGB or grayscale image into a subregion of a larger planar YUV
+ * image.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3EncodeYUVPlanes8(tjhandle handle, const unsigned char *srcBuf,
+                                  int width, int pitch, int height,
+                                  int pixelFormat, unsigned char **dstPlanes,
+                                  int *strides);
+
+
+/**
+ * Retrieve information about a JPEG image without decompressing it, or prime
+ * the decompressor with quantization and Huffman tables.  If a JPEG image is
+ * passed to this function, then the @ref TJPARAM "parameters" that describe
+ * the JPEG image will be set when the function returns.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param jpegBuf pointer to a byte buffer containing a JPEG image or an
+ * "abbreviated table specification" (AKA "tables-only") datastream.  Passing a
+ * tables-only datastream to this function primes the decompressor with
+ * quantization and Huffman tables that can be used when decompressing
+ * subsequent "abbreviated image" datastreams.  This is useful, for instance,
+ * when decompressing video streams in which all frames share the same
+ * quantization and Huffman tables.
+ *
+ * @param jpegSize size of the JPEG image or tables-only datastream (in bytes)
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3DecompressHeader(tjhandle handle,
+                                  const unsigned char *jpegBuf,
+                                  size_t jpegSize);
+
+
+/**
+ * Returns a list of fractional scaling factors that the JPEG decompressor
+ * supports.
+ *
+ * @param numScalingFactors pointer to an integer variable that will receive
+ * the number of elements in the list
+ *
+ * @return a pointer to a list of fractional scaling factors, or NULL if an
+ * error is encountered (see #tj3GetErrorStr().)
+ */
+DLLEXPORT tjscalingfactor *tj3GetScalingFactors(int *numScalingFactors);
+
+
+/**
+ * Set the scaling factor for subsequent lossy decompression operations.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param scalingFactor #tjscalingfactor structure that specifies a fractional
+ * scaling factor that the decompressor supports (see #tj3GetScalingFactors()),
+ * or <tt>#TJUNSCALED</tt> for no scaling.  Decompression scaling is a function
+ * of the IDCT algorithm, so scaling factors are generally limited to multiples
+ * of 1/8.  If the entire JPEG image will be decompressed, then the width and
+ * height of the scaled destination image can be determined by calling
+ * #TJSCALED() with the JPEG width and height (see #TJPARAM_JPEGWIDTH and
+ * #TJPARAM_JPEGHEIGHT) and the specified scaling factor.  When decompressing
+ * into a planar YUV image, an intermediate buffer copy will be performed if
+ * the width or height of the scaled destination image is not an even multiple
+ * of the MCU block size (see #tjMCUWidth and #tjMCUHeight.)  Note that
+ * decompression scaling is not available (and the specified scaling factor is
+ * ignored) when decompressing lossless JPEG images (see #TJPARAM_LOSSLESS),
+ * since the IDCT algorithm is not used with those images.  Note also that
+ * #TJPARAM_FASTDCT is ignored when decompression scaling is enabled.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr().)
+ */
+DLLEXPORT int tj3SetScalingFactor(tjhandle handle,
+                                  tjscalingfactor scalingFactor);
+
+
+/**
+ * Set the cropping region for partially decompressing a lossy JPEG image into
+ * a packed-pixel image
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param croppingRegion #tjregion structure that specifies a subregion of the
+ * JPEG image to decompress, or <tt>#TJUNCROPPED</tt> for no cropping.  The
+ * left boundary of the cropping region must be evenly divisible by the scaled
+ * MCU block width (<tt>#TJSCALED(#tjMCUWidth[subsamp], scalingFactor)</tt>,
+ * where `subsamp` is the level of chrominance subsampling in the JPEG image
+ * (see #TJPARAM_SUBSAMP) and `scalingFactor` is the decompression scaling
+ * factor (see #tj3SetScalingFactor().)  The cropping region should be
+ * specified relative to the scaled image dimensions.  Unless `croppingRegion`
+ * is <tt>#TJUNCROPPED</tt>, the JPEG header must be read (see
+ * #tj3DecompressHeader()) prior to calling this function.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr().)
+ */
+DLLEXPORT int tj3SetCroppingRegion(tjhandle handle, tjregion croppingRegion);
+
+
+/**
+ * Decompress an 8-bit-per-sample JPEG image into an 8-bit-per-sample
+ * packed-pixel RGB, grayscale, or CMYK image.  The @ref TJPARAM "parameters"
+ * that describe the JPEG image will be set when this function returns.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param jpegBuf pointer to a byte buffer containing the JPEG image to
+ * decompress
+ *
+ * @param jpegSize size of the JPEG image (in bytes)
+ *
+ * @param dstBuf pointer to a buffer that will receive the packed-pixel
+ * decompressed image.  This buffer should normally be
+ * `pitch * destinationHeight` samples in size.  However, you can also use this
+ * parameter to decompress into a specific region of a larger buffer.  NOTE:
+ * If the JPEG image is lossy, then `destinationHeight` is either the scaled
+ * JPEG height (see #TJSCALED(), #TJPARAM_JPEGHEIGHT, and
+ * #tj3SetScalingFactor()) or the height of the cropping region (see
+ * #tj3SetCroppingRegion().)  If the JPEG image is lossless, then
+ * `destinationHeight` is the JPEG height.
+ *
+ * @param pitch samples per row in the destination image.  Normally this should
+ * be set to <tt>destinationWidth * #tjPixelSize[pixelFormat]</tt>, if the
+ * destination image should be unpadded.  (Setting this parameter to 0 is the
+ * equivalent of setting it to
+ * <tt>destinationWidth * #tjPixelSize[pixelFormat]</tt>.)  However, you can
+ * also use this parameter to specify the row alignment/padding of the
+ * destination image, to skip rows, or to decompress into a specific region of
+ * a larger buffer.  NOTE: If the JPEG image is lossy, then `destinationWidth`
+ * is either the scaled JPEG width (see #TJSCALED(), #TJPARAM_JPEGWIDTH, and
+ * #tj3SetScalingFactor()) or the width of the cropping region (see
+ * #tj3SetCroppingRegion().)  If the JPEG image is lossless, then
+ * `destinationWidth` is the JPEG width.
+ *
+ * @param pixelFormat pixel format of the destination image (see @ref
+ * TJPF "Pixel formats".)
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3Decompress8(tjhandle handle, const unsigned char *jpegBuf,
+                             size_t jpegSize, unsigned char *dstBuf, int pitch,
+                             int pixelFormat);
+
+/**
+ * Decompress a 12-bit-per-sample JPEG image into a 12-bit-per-sample
+ * packed-pixel RGB, grayscale, or CMYK image.
+ *
+ * \details \copydetails tj3Decompress8()
+ */
+DLLEXPORT int tj3Decompress12(tjhandle handle, const unsigned char *jpegBuf,
+                              size_t jpegSize, short *dstBuf, int pitch,
+                              int pixelFormat);
+
+/**
+ * Decompress a 16-bit-per-sample lossless JPEG image into a 16-bit-per-sample
+ * packed-pixel RGB, grayscale, or CMYK image.
+ *
+ * \details \copydetails tj3Decompress8()
+ */
+DLLEXPORT int tj3Decompress16(tjhandle handle, const unsigned char *jpegBuf,
+                              size_t jpegSize, unsigned short *dstBuf,
+                              int pitch, int pixelFormat);
+
+
+/**
+ * Decompress an 8-bit-per-sample JPEG image into an 8-bit-per-sample unified
+ * planar YUV image.  This function performs JPEG decompression but leaves out
+ * the color conversion step, so a planar YUV image is generated instead of a
+ * packed-pixel image.  The @ref TJPARAM "parameters" that describe the JPEG
+ * image will be set when this function returns.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param jpegBuf pointer to a byte buffer containing the JPEG image to
+ * decompress
+ *
+ * @param jpegSize size of the JPEG image (in bytes)
+ *
+ * @param dstBuf pointer to a buffer that will receive the unified planar YUV
+ * decompressed image.  Use #tj3YUVBufSize() to determine the appropriate size
+ * for this buffer based on the scaled JPEG width and height (see #TJSCALED(),
+ * #TJPARAM_JPEGWIDTH, #TJPARAM_JPEGHEIGHT, and #tj3SetScalingFactor()), row
+ * alignment, and level of chrominance subsampling (see #TJPARAM_SUBSAMP.)  The
+ * Y, U (Cb), and V (Cr) image planes will be stored sequentially in the
+ * buffer.  (Refer to @ref YUVnotes "YUV Image Format Notes".)
+ *
+ * @param align row alignment (in bytes) of the YUV image (must be a power of
+ * 2.)  Setting this parameter to n will cause each row in each plane of the
+ * YUV image to be padded to the nearest multiple of n bytes (1 = unpadded.)
+ * To generate images suitable for X Video, `align` should be set to 4.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3DecompressToYUV8(tjhandle handle,
+                                  const unsigned char *jpegBuf,
+                                  size_t jpegSize,
+                                  unsigned char *dstBuf, int align);
+
+
+/**
+ * Decompress an 8-bit-per-sample JPEG image into separate 8-bit-per-sample Y,
+ * U (Cb), and V (Cr) image planes.  This function performs JPEG decompression
+ * but leaves out the color conversion step, so a planar YUV image is generated
+ * instead of a packed-pixel image.  The @ref TJPARAM "parameters" that
+ * describe the JPEG image will be set when this function returns.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param jpegBuf pointer to a byte buffer containing the JPEG image to
+ * decompress
+ *
+ * @param jpegSize size of the JPEG image (in bytes)
+ *
+ * @param dstPlanes an array of pointers to Y, U (Cb), and V (Cr) image planes
+ * (or just a Y plane, if decompressing a grayscale image) that will receive
+ * the decompressed image.  These planes can be contiguous or non-contiguous in
+ * memory.  Use #tj3YUVPlaneSize() to determine the appropriate size for each
+ * plane based on the scaled JPEG width and height (see #TJSCALED(),
+ * #TJPARAM_JPEGWIDTH, #TJPARAM_JPEGHEIGHT, and #tj3SetScalingFactor()),
+ * strides, and level of chrominance subsampling (see #TJPARAM_SUBSAMP.)  Refer
+ * to @ref YUVnotes "YUV Image Format Notes" for more details.
+ *
+ * @param strides an array of integers, each specifying the number of bytes per
+ * row in the corresponding plane of the YUV image.  Setting the stride for any
+ * plane to 0 is the same as setting it to the scaled plane width (see
+ * @ref YUVnotes "YUV Image Format Notes".)  If `strides` is NULL, then the
+ * strides for all planes will be set to their respective scaled plane widths.
+ * You can adjust the strides in order to add an arbitrary amount of row
+ * padding to each plane or to decompress the JPEG image into a subregion of a
+ * larger planar YUV image.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3DecompressToYUVPlanes8(tjhandle handle,
+                                        const unsigned char *jpegBuf,
+                                        size_t jpegSize,
+                                        unsigned char **dstPlanes,
+                                        int *strides);
+
+
+/**
+ * Decode an 8-bit-per-sample unified planar YUV image into an 8-bit-per-sample
+ * packed-pixel RGB or grayscale image.  This function performs color
+ * conversion (which is accelerated in the libjpeg-turbo implementation) but
+ * does not execute any of the other steps in the JPEG decompression process.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param srcBuf pointer to a buffer containing a unified planar YUV source
+ * image to be decoded.  The size of this buffer should match the value
+ * returned by #tj3YUVBufSize() for the given image width, height, row
+ * alignment, and level of chrominance subsampling (see #TJPARAM_SUBSAMP.)  The
+ * Y, U (Cb), and V (Cr) image planes should be stored sequentially in the
+ * source buffer.  (Refer to @ref YUVnotes "YUV Image Format Notes".)
+ *
+ * @param align row alignment (in bytes) of the YUV source image (must be a
+ * power of 2.)  Setting this parameter to n indicates that each row in each
+ * plane of the YUV source image is padded to the nearest multiple of n bytes
+ * (1 = unpadded.)
+ *
+ * @param dstBuf pointer to a buffer that will receive the packed-pixel decoded
+ * image.  This buffer should normally be `pitch * height` bytes in size.
+ * However, you can also use this parameter to decode into a specific region of
+ * a larger buffer.
+ *
+ * @param width width (in pixels) of the source and destination images
+ *
+ * @param pitch bytes per row in the destination image.  Normally this should
+ * be set to <tt>width * #tjPixelSize[pixelFormat]</tt>, if the destination
+ * image should be unpadded.  (Setting this parameter to 0 is the equivalent of
+ * setting it to <tt>width * #tjPixelSize[pixelFormat]</tt>.)  However, you can
+ * also use this parameter to specify the row alignment/padding of the
+ * destination image, to skip rows, or to decode into a specific region of a
+ * larger buffer.
+ *
+ * @param height height (in pixels) of the source and destination images
+ *
+ * @param pixelFormat pixel format of the destination image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3DecodeYUV8(tjhandle handle, const unsigned char *srcBuf,
+                            int align, unsigned char *dstBuf, int width,
+                            int pitch, int height, int pixelFormat);
+
+
+/**
+ * Decode a set of 8-bit-per-sample Y, U (Cb), and V (Cr) image planes into an
+ * 8-bit-per-sample packed-pixel RGB or grayscale image.  This function
+ * performs color conversion (which is accelerated in the libjpeg-turbo
+ * implementation) but does not execute any of the other steps in the JPEG
+ * decompression process.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param srcPlanes an array of pointers to Y, U (Cb), and V (Cr) image planes
+ * (or just a Y plane, if decoding a grayscale image) that contain a YUV image
+ * to be decoded.  These planes can be contiguous or non-contiguous in memory.
+ * The size of each plane should match the value returned by #tj3YUVPlaneSize()
+ * for the given image width, height, strides, and level of chrominance
+ * subsampling (see #TJPARAM_SUBSAMP.)  Refer to @ref YUVnotes
+ * "YUV Image Format Notes" for more details.
+ *
+ * @param strides an array of integers, each specifying the number of bytes per
+ * row in the corresponding plane of the YUV source image.  Setting the stride
+ * for any plane to 0 is the same as setting it to the plane width (see
+ * @ref YUVnotes "YUV Image Format Notes".)  If `strides` is NULL, then the
+ * strides for all planes will be set to their respective plane widths.  You
+ * can adjust the strides in order to specify an arbitrary amount of row
+ * padding in each plane or to decode a subregion of a larger planar YUV image.
+ *
+ * @param dstBuf pointer to a buffer that will receive the packed-pixel decoded
+ * image.  This buffer should normally be `pitch * height` bytes in size.
+ * However, you can also use this parameter to decode into a specific region of
+ * a larger buffer.
+ *
+ * @param width width (in pixels) of the source and destination images
+ *
+ * @param pitch bytes per row in the destination image.  Normally this should
+ * be set to <tt>width * #tjPixelSize[pixelFormat]</tt>, if the destination
+ * image should be unpadded.  (Setting this parameter to 0 is the equivalent of
+ * setting it to <tt>width * #tjPixelSize[pixelFormat]</tt>.)  However, you can
+ * also use this parameter to specify the row alignment/padding of the
+ * destination image, to skip rows, or to decode into a specific region of a
+ * larger buffer.
+ *
+ * @param height height (in pixels) of the source and destination images
+ *
+ * @param pixelFormat pixel format of the destination image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3DecodeYUVPlanes8(tjhandle handle,
+                                  const unsigned char * const *srcPlanes,
+                                  const int *strides, unsigned char *dstBuf,
+                                  int width, int pitch, int height,
+                                  int pixelFormat);
+
+
+/**
+ * Losslessly transform a JPEG image into another JPEG image.  Lossless
+ * transforms work by moving the raw DCT coefficients from one JPEG image
+ * structure to another without altering the values of the coefficients.  While
+ * this is typically faster than decompressing the image, transforming it, and
+ * re-compressing it, lossless transforms are not free.  Each lossless
+ * transform requires reading and performing entropy decoding on all of the
+ * coefficients in the source image, regardless of the size of the destination
+ * image.  Thus, this function provides a means of generating multiple
+ * transformed images from the same source or applying multiple transformations
+ * simultaneously, in order to eliminate the need to read the source
+ * coefficients multiple times.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * lossless transformation
+ *
+ * @param jpegBuf pointer to a byte buffer containing the JPEG source image to
+ * transform
+ *
+ * @param jpegSize size of the JPEG source image (in bytes)
+ *
+ * @param n the number of transformed JPEG images to generate
+ *
+ * @param dstBufs pointer to an array of n byte buffers.  `dstBufs[i]` will
+ * receive a JPEG image that has been transformed using the parameters in
+ * `transforms[i]`.  TurboJPEG has the ability to reallocate the JPEG
+ * destination buffer to accommodate the size of the transformed JPEG image.
+ * Thus, you can choose to:
+ * -# pre-allocate the JPEG destination buffer with an arbitrary size using
+ * #tj3Alloc() and let TurboJPEG grow the buffer as needed,
+ * -# set `dstBufs[i]` to NULL to tell TurboJPEG to allocate the buffer for
+ * you, or
+ * -# pre-allocate the buffer to a "worst case" size determined by calling
+ * #tj3JPEGBufSize() with the transformed or cropped width and height and the
+ * level of subsampling used in the source image.  Under normal circumstances,
+ * this should ensure that the buffer never has to be re-allocated.  (Setting
+ * #TJPARAM_NOREALLOC guarantees that it won't be.)  Note, however, that there
+ * are some rare cases (such as transforming images with a large amount of
+ * embedded EXIF or ICC profile data) in which the transformed JPEG image will
+ * be larger than the worst-case size, and #TJPARAM_NOREALLOC cannot be used in
+ * those cases.
+ * .
+ * If you choose option 1, then `dstSizes[i]` should be set to the size of your
+ * pre-allocated buffer.  In any case, unless you have set #TJPARAM_NOREALLOC,
+ * you should always check `dstBufs[i]` upon return from this function, as it
+ * may have changed.
+ *
+ * @param dstSizes pointer to an array of n size_t variables that will receive
+ * the actual sizes (in bytes) of each transformed JPEG image.  If `dstBufs[i]`
+ * points to a pre-allocated buffer, then `dstSizes[i]` should be set to the
+ * size of the buffer.  Upon return, `dstSizes[i]` will contain the size of the
+ * transformed JPEG image (in bytes.)
+ *
+ * @param transforms pointer to an array of n #tjtransform structures, each of
+ * which specifies the transform parameters and/or cropping region for the
+ * corresponding transformed JPEG image.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3Transform(tjhandle handle, const unsigned char *jpegBuf,
+                           size_t jpegSize, int n, unsigned char **dstBufs,
+                           size_t *dstSizes, const tjtransform *transforms);
+
+
+/**
+ * Destroy a TurboJPEG instance.
+ *
+ * @param handle handle to a TurboJPEG instance.  If the handle is NULL, then
+ * this function has no effect.
+ */
+DLLEXPORT void tj3Destroy(tjhandle handle);
+
+
+/**
+ * Allocate a byte buffer for use with TurboJPEG.  You should always use this
+ * function to allocate the JPEG destination buffer(s) for the compression and
+ * transform functions unless you are disabling automatic buffer (re)allocation
+ * (by setting #TJPARAM_NOREALLOC.)
+ *
+ * @param bytes the number of bytes to allocate
+ *
+ * @return a pointer to a newly-allocated buffer with the specified number of
+ * bytes.
+ *
+ * @see tj3Free()
+ */
+DLLEXPORT void *tj3Alloc(size_t bytes);
+
+
+/**
+ * Load an 8-bit-per-sample packed-pixel image from disk into memory.
+ *
+ * @param handle handle to a TurboJPEG instance
+ *
+ * @param filename name of a file containing a packed-pixel image in Windows
+ * BMP or PBMPLUS (PPM/PGM) format.  Windows BMP files require 8-bit-per-sample
+ * data precision.  If the data precision of the PBMPLUS file does not match
+ * the target data precision, then upconverting or downconverting will be
+ * performed.
+ *
+ * @param width pointer to an integer variable that will receive the width (in
+ * pixels) of the packed-pixel image
+ *
+ * @param align row alignment (in samples) of the packed-pixel buffer to be
+ * returned (must be a power of 2.)  Setting this parameter to n will cause all
+ * rows in the buffer to be padded to the nearest multiple of n samples
+ * (1 = unpadded.)
+ *
+ * @param height pointer to an integer variable that will receive the height
+ * (in pixels) of the packed-pixel image
+ *
+ * @param pixelFormat pointer to an integer variable that specifies or will
+ * receive the pixel format of the packed-pixel buffer.  The behavior of this
+ * function will vary depending on the value of `*pixelFormat` passed to the
+ * function:
+ * - @ref TJPF_UNKNOWN : The packed-pixel buffer returned by this function will
+ * use the most optimal pixel format for the file type, and `*pixelFormat` will
+ * contain the ID of that pixel format upon successful return from this
+ * function.
+ * - @ref TJPF_GRAY : Only PGM files and 8-bit-per-pixel BMP files with a
+ * grayscale colormap can be loaded.
+ * - @ref TJPF_CMYK : The RGB or grayscale pixels stored in the file will be
+ * converted using a quick & dirty algorithm that is suitable only for testing
+ * purposes.  (Proper conversion between CMYK and other formats requires a
+ * color management system.)
+ * - Other @ref TJPF "pixel formats" : The packed-pixel buffer will use the
+ * specified pixel format, and pixel format conversion will be performed if
+ * necessary.
+ *
+ * @return a pointer to a newly-allocated buffer containing the packed-pixel
+ * image, converted to the chosen pixel format and with the chosen row
+ * alignment, or NULL if an error occurred (see #tj3GetErrorStr().)  This
+ * buffer should be freed using #tj3Free().
+ */
+DLLEXPORT unsigned char *tj3LoadImage8(tjhandle handle, const char *filename,
+                                       int *width, int align, int *height,
+                                       int *pixelFormat);
+
+/**
+ * Load a 12-bit-per-sample packed-pixel image from disk into memory.
+ *
+ * \details \copydetails tj3LoadImage8()
+ */
+DLLEXPORT short *tj3LoadImage12(tjhandle handle, const char *filename,
+                                int *width, int align, int *height,
+                                int *pixelFormat);
+
+/**
+ * Load a 16-bit-per-sample packed-pixel image from disk into memory.
+ *
+ * \details \copydetails tj3LoadImage8()
+ */
+DLLEXPORT unsigned short *tj3LoadImage16(tjhandle handle, const char *filename,
+                                         int *width, int align, int *height,
+                                         int *pixelFormat);
+
+
+/**
+ * Save an 8-bit-per-sample packed-pixel image from memory to disk.
+ *
+ * @param handle handle to a TurboJPEG instance
+ *
+ * @param filename name of a file to which to save the packed-pixel image.  The
+ * image will be stored in Windows BMP or PBMPLUS (PPM/PGM) format, depending
+ * on the file extension.  Windows BMP files require 8-bit-per-sample data
+ * precision.
+ *
+ * @param buffer pointer to a buffer containing a packed-pixel RGB, grayscale,
+ * or CMYK image to be saved
+ *
+ * @param width width (in pixels) of the packed-pixel image
+ *
+ * @param pitch samples per row in the packed-pixel image.  Setting this
+ * parameter to 0 is the equivalent of setting it to
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>.
+ *
+ * @param height height (in pixels) of the packed-pixel image
+ *
+ * @param pixelFormat pixel format of the packed-pixel image (see @ref TJPF
+ * "Pixel formats".)  If this parameter is set to @ref TJPF_GRAY, then the
+ * image will be stored in PGM or 8-bit-per-pixel (indexed color) BMP format.
+ * Otherwise, the image will be stored in PPM or 24-bit-per-pixel BMP format.
+ * If this parameter is set to @ref TJPF_CMYK, then the CMYK pixels will be
+ * converted to RGB using a quick & dirty algorithm that is suitable only for
+ * testing purposes.  (Proper conversion between CMYK and other formats
+ * requires a color management system.)
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr().)
+ */
+DLLEXPORT int tj3SaveImage8(tjhandle handle, const char *filename,
+                            const unsigned char *buffer, int width, int pitch,
+                            int height, int pixelFormat);
+
+/**
+ * Save a 12-bit-per-sample packed-pixel image from memory to disk.
+ *
+ * \details \copydetails tj3SaveImage8()
+ */
+DLLEXPORT int tj3SaveImage12(tjhandle handle, const char *filename,
+                             const short *buffer, int width, int pitch,
+                             int height, int pixelFormat);
+
+/**
+ * Save a 16-bit-per-sample packed-pixel image from memory to disk.
+ *
+ * \details \copydetails tj3SaveImage8()
+ */
+DLLEXPORT int tj3SaveImage16(tjhandle handle, const char *filename,
+                             const unsigned short *buffer, int width,
+                             int pitch, int height, int pixelFormat);
+
+
+/**
+ * Free a byte buffer previously allocated by TurboJPEG.  You should always use
+ * this function to free JPEG destination buffer(s) that were automatically
+ * (re)allocated by the compression and transform functions or that were
+ * manually allocated using #tj3Alloc().
+ *
+ * @param buffer address of the buffer to free.  If the address is NULL, then
+ * this function has no effect.
+ *
+ * @see tj3Alloc()
+ */
+DLLEXPORT void tj3Free(void *buffer);
+
+
+/**
+ * Returns a descriptive error message explaining why the last command failed.
+ *
+ * @param handle handle to a TurboJPEG instance, or NULL if the error was
+ * generated by a global function (but note that retrieving the error message
+ * for a global function is thread-safe only on platforms that support
+ * thread-local storage.)
+ *
+ * @return a descriptive error message explaining why the last command failed.
+ */
+DLLEXPORT char *tj3GetErrorStr(tjhandle handle);
+
+
+/**
+ * Returns a code indicating the severity of the last error.  See
+ * @ref TJERR "Error codes".
+ *
+ * @param handle handle to a TurboJPEG instance
+ *
+ * @return a code indicating the severity of the last error.  See
+ * @ref TJERR "Error codes".
+ */
+DLLEXPORT int tj3GetErrorCode(tjhandle handle);
+
+
+/* Backward compatibility functions and macros (nothing to see here) */
+
+/* TurboJPEG 1.0+ */
+
+#define NUMSUBOPT  TJ_NUMSAMP
+#define TJ_444  TJSAMP_444
+#define TJ_422  TJSAMP_422
+#define TJ_420  TJSAMP_420
+#define TJ_411  TJSAMP_420
+#define TJ_GRAYSCALE  TJSAMP_GRAY
+
+#define TJ_BGR  1
+#define TJ_BOTTOMUP  TJFLAG_BOTTOMUP
+#define TJ_FORCEMMX  TJFLAG_FORCEMMX
+#define TJ_FORCESSE  TJFLAG_FORCESSE
+#define TJ_FORCESSE2  TJFLAG_FORCESSE2
+#define TJ_ALPHAFIRST  64
+#define TJ_FORCESSE3  TJFLAG_FORCESSE3
+#define TJ_FASTUPSAMPLE  TJFLAG_FASTUPSAMPLE
+
+#define TJPAD(width)  (((width) + 3) & (~3))
+
+DLLEXPORT unsigned long TJBUFSIZE(int width, int height);
+
+DLLEXPORT int tjCompress(tjhandle handle, unsigned char *srcBuf, int width,
+                         int pitch, int height, int pixelSize,
+                         unsigned char *dstBuf, unsigned long *compressedSize,
+                         int jpegSubsamp, int jpegQual, int flags);
+
+DLLEXPORT int tjDecompress(tjhandle handle, unsigned char *jpegBuf,
+                           unsigned long jpegSize, unsigned char *dstBuf,
+                           int width, int pitch, int height, int pixelSize,
+                           int flags);
+
+DLLEXPORT int tjDecompressHeader(tjhandle handle, unsigned char *jpegBuf,
+                                 unsigned long jpegSize, int *width,
+                                 int *height);
+
+DLLEXPORT int tjDestroy(tjhandle handle);
+
+DLLEXPORT char *tjGetErrorStr(void);
+
+DLLEXPORT tjhandle tjInitCompress(void);
+
+DLLEXPORT tjhandle tjInitDecompress(void);
+
+/* TurboJPEG 1.1+ */
+
+#define TJ_YUV  512
+
+DLLEXPORT unsigned long TJBUFSIZEYUV(int width, int height, int jpegSubsamp);
+
+DLLEXPORT int tjDecompressHeader2(tjhandle handle, unsigned char *jpegBuf,
+                                  unsigned long jpegSize, int *width,
+                                  int *height, int *jpegSubsamp);
+
+DLLEXPORT int tjDecompressToYUV(tjhandle handle, unsigned char *jpegBuf,
+                                unsigned long jpegSize, unsigned char *dstBuf,
+                                int flags);
+
+DLLEXPORT int tjEncodeYUV(tjhandle handle, unsigned char *srcBuf, int width,
+                          int pitch, int height, int pixelSize,
+                          unsigned char *dstBuf, int subsamp, int flags);
+
+/* TurboJPEG 1.2+ */
+
+#define TJFLAG_BOTTOMUP  2
+#define TJFLAG_FORCEMMX  8
+#define TJFLAG_FORCESSE  16
+#define TJFLAG_FORCESSE2  32
+#define TJFLAG_FORCESSE3  128
+#define TJFLAG_FASTUPSAMPLE  256
+#define TJFLAG_NOREALLOC  1024
+
+DLLEXPORT unsigned char *tjAlloc(int bytes);
+
+DLLEXPORT unsigned long tjBufSize(int width, int height, int jpegSubsamp);
+
+DLLEXPORT unsigned long tjBufSizeYUV(int width, int height, int subsamp);
+
+DLLEXPORT int tjCompress2(tjhandle handle, const unsigned char *srcBuf,
+                          int width, int pitch, int height, int pixelFormat,
+                          unsigned char **jpegBuf, unsigned long *jpegSize,
+                          int jpegSubsamp, int jpegQual, int flags);
+
+DLLEXPORT int tjDecompress2(tjhandle handle, const unsigned char *jpegBuf,
+                            unsigned long jpegSize, unsigned char *dstBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            int flags);
+
+DLLEXPORT int tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf, int width,
+                           int pitch, int height, int pixelFormat,
+                           unsigned char *dstBuf, int subsamp, int flags);
+
+DLLEXPORT void tjFree(unsigned char *buffer);
+
+DLLEXPORT tjscalingfactor *tjGetScalingFactors(int *numscalingfactors);
+
+DLLEXPORT tjhandle tjInitTransform(void);
+
+DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
+                            unsigned long jpegSize, int n,
+                            unsigned char **dstBufs, unsigned long *dstSizes,
+                            tjtransform *transforms, int flags);
+
+/* TurboJPEG 1.2.1+ */
+
+#define TJFLAG_FASTDCT  2048
+#define TJFLAG_ACCURATEDCT  4096
+
+/* TurboJPEG 1.4+ */
+
+DLLEXPORT unsigned long tjBufSizeYUV2(int width, int align, int height,
+                                      int subsamp);
+
+DLLEXPORT int tjCompressFromYUV(tjhandle handle, const unsigned char *srcBuf,
+                                int width, int align, int height, int subsamp,
+                                unsigned char **jpegBuf,
+                                unsigned long *jpegSize, int jpegQual,
+                                int flags);
+
+DLLEXPORT int tjCompressFromYUVPlanes(tjhandle handle,
+                                      const unsigned char **srcPlanes,
+                                      int width, const int *strides,
+                                      int height, int subsamp,
+                                      unsigned char **jpegBuf,
+                                      unsigned long *jpegSize, int jpegQual,
+                                      int flags);
+
+DLLEXPORT int tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
+                          int align, int subsamp, unsigned char *dstBuf,
+                          int width, int pitch, int height, int pixelFormat,
+                          int flags);
+
+DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
+                                const unsigned char **srcPlanes,
+                                const int *strides, int subsamp,
+                                unsigned char *dstBuf, int width, int pitch,
+                                int height, int pixelFormat, int flags);
+
+DLLEXPORT int tjDecompressHeader3(tjhandle handle,
+                                  const unsigned char *jpegBuf,
+                                  unsigned long jpegSize, int *width,
+                                  int *height, int *jpegSubsamp,
+                                  int *jpegColorspace);
+
+DLLEXPORT int tjDecompressToYUV2(tjhandle handle, const unsigned char *jpegBuf,
+                                 unsigned long jpegSize, unsigned char *dstBuf,
+                                 int width, int align, int height, int flags);
+
+DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
+                                      const unsigned char *jpegBuf,
+                                      unsigned long jpegSize,
+                                      unsigned char **dstPlanes, int width,
+                                      int *strides, int height, int flags);
+
+DLLEXPORT int tjEncodeYUV3(tjhandle handle, const unsigned char *srcBuf,
+                           int width, int pitch, int height, int pixelFormat,
+                           unsigned char *dstBuf, int align, int subsamp,
+                           int flags);
+
+DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
+                                int width, int pitch, int height,
+                                int pixelFormat, unsigned char **dstPlanes,
+                                int *strides, int subsamp, int flags);
+
+DLLEXPORT int tjPlaneHeight(int componentID, int height, int subsamp);
+
+DLLEXPORT unsigned long tjPlaneSizeYUV(int componentID, int width, int stride,
+                                       int height, int subsamp);
+
+DLLEXPORT int tjPlaneWidth(int componentID, int width, int subsamp);
+
+/* TurboJPEG 2.0+ */
+
+#define TJFLAG_STOPONWARNING  8192
+#define TJFLAG_PROGRESSIVE  16384
+
+DLLEXPORT int tjGetErrorCode(tjhandle handle);
+
+DLLEXPORT char *tjGetErrorStr2(tjhandle handle);
+
+DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
+                                     int align, int *height, int *pixelFormat,
+                                     int flags);
+
+DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
+                          int width, int pitch, int height, int pixelFormat,
+                          int flags);
+
+/* TurboJPEG 2.1+ */
+
+#define TJFLAG_LIMITSCANS  32768
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/3rdparty/libjpeg/jcarith.c b/3rdparty/libjpeg/jcarith.c
index 46ce6c6a3926..1b45089a3a1d 100644
--- a/3rdparty/libjpeg/jcarith.c
+++ b/3rdparty/libjpeg/jcarith.c
@@ -1,7 +1,7 @@
 /*
  * jcarith.c
  *
- * Developed 1997-2019 by Guido Vollbeding.
+ * Developed 1997-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -361,7 +361,7 @@ emit_restart (j_compress_ptr cinfo, int restart_num)
  */
 
 METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   unsigned char *st;
@@ -450,7 +450,7 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   const int * natural_order;
@@ -557,7 +557,7 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   unsigned char *st;
@@ -592,7 +592,7 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   const int * natural_order;
@@ -691,7 +691,7 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu (j_compress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   const int * natural_order;
diff --git a/3rdparty/libjpeg/jccoefct.c b/3rdparty/libjpeg/jccoefct.c
index 924a703dda24..494aa22988ec 100644
--- a/3rdparty/libjpeg/jccoefct.c
+++ b/3rdparty/libjpeg/jccoefct.c
@@ -2,7 +2,7 @@
  * jccoefct.c
  *
  * Copyright (C) 1994-1997, Thomas G. Lane.
- * Modified 2003-2011 by Guido Vollbeding.
+ * Modified 2003-2022 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -36,16 +36,14 @@ typedef struct {
   struct jpeg_c_coef_controller pub; /* public fields */
 
   JDIMENSION iMCU_row_num;	/* iMCU row # within image */
-  JDIMENSION mcu_ctr;		/* counts MCUs processed in current row */
+  JDIMENSION MCU_ctr;		/* counts MCUs processed in current row */
   int MCU_vert_offset;		/* counts MCU rows within iMCU row */
   int MCU_rows_per_iMCU_row;	/* number of such rows needed */
 
   /* For single-pass compression, it's sufficient to buffer just one MCU
-   * (although this may prove a bit slow in practice).  We allocate a
-   * workspace of C_MAX_BLOCKS_IN_MCU coefficient blocks, and reuse it for each
-   * MCU constructed and sent.  (On 80x86, the workspace is FAR even though
-   * it's not really very big; this is to keep the module interfaces unchanged
-   * when a large coefficient buffer is necessary.)
+   * (although this may prove a bit slow in practice).
+   * We append a workspace of C_MAX_BLOCKS_IN_MCU coefficient blocks,
+   * and reuse it for each MCU constructed and sent.
    * In multi-pass modes, this array points to the current MCU's blocks
    * within the virtual arrays.
    */
@@ -53,6 +51,9 @@ typedef struct {
 
   /* In multi-pass modes, we need a virtual block array for each component. */
   jvirt_barray_ptr whole_image[MAX_COMPONENTS];
+
+  /* Workspace for single-pass compression (omitted otherwise). */
+  JBLOCK blk_buffer[C_MAX_BLOCKS_IN_MCU];
 } my_coef_controller;
 
 typedef my_coef_controller * my_coef_ptr;
@@ -88,7 +89,7 @@ start_iMCU_row (j_compress_ptr cinfo)
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
   }
 
-  coef->mcu_ctr = 0;
+  coef->MCU_ctr = 0;
   coef->MCU_vert_offset = 0;
 }
 
@@ -125,7 +126,6 @@ start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
 #endif
   default:
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    break;
   }
 }
 
@@ -147,59 +147,56 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
   JDIMENSION MCU_col_num;	/* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
-  int blkn, bi, ci, yindex, yoffset, blockcnt;
-  JDIMENSION ypos, xpos;
+  int ci, xindex, yindex, yoffset, blockcnt;
+  JBLOCKROW blkp;
+  JSAMPARRAY input_ptr;
+  JDIMENSION xpos;
   jpeg_component_info *compptr;
   forward_DCT_ptr forward_DCT;
 
   /* Loop to write as much as one whole iMCU row */
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
-    for (MCU_col_num = coef->mcu_ctr; MCU_col_num <= last_MCU_col;
+    for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
 	 MCU_col_num++) {
       /* Determine where data comes from in input_buf and do the DCT thing.
-       * Each call on forward_DCT processes a horizontal row of DCT blocks
-       * as wide as an MCU; we rely on having allocated the MCU_buffer[] blocks
-       * sequentially.  Dummy blocks at the right or bottom edge are filled in
+       * Each call on forward_DCT processes a horizontal row of DCT blocks as
+       * wide as an MCU.  Dummy blocks at the right or bottom edge are filled in
        * specially.  The data in them does not matter for image reconstruction,
        * so we fill them with values that will encode to the smallest amount of
        * data, viz: all zeroes in the AC entries, DC entries equal to previous
        * block's DC value.  (Thanks to Thomas Kinsman for this idea.)
        */
-      blkn = 0;
+      blkp = coef->blk_buffer;	/* pointer to current DCT block within MCU */
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
 	compptr = cinfo->cur_comp_info[ci];
 	forward_DCT = cinfo->fdct->forward_DCT[compptr->component_index];
+	input_ptr = input_buf[compptr->component_index] +
+	  yoffset * compptr->DCT_v_scaled_size;
+	/* ypos == (yoffset + yindex) * compptr->DCT_v_scaled_size */
 	blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
 						: compptr->last_col_width;
 	xpos = MCU_col_num * compptr->MCU_sample_width;
-	ypos = yoffset * compptr->DCT_v_scaled_size;
-	/* ypos == (yoffset+yindex) * DCTSIZE */
 	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
 	  if (coef->iMCU_row_num < last_iMCU_row ||
-	      yoffset+yindex < compptr->last_row_height) {
-	    (*forward_DCT) (cinfo, compptr,
-			    input_buf[compptr->component_index],
-			    coef->MCU_buffer[blkn],
-			    ypos, xpos, (JDIMENSION) blockcnt);
-	    if (blockcnt < compptr->MCU_width) {
-	      /* Create some dummy blocks at the right edge of the image. */
-	      FMEMZERO((void FAR *) coef->MCU_buffer[blkn + blockcnt],
-		       (compptr->MCU_width - blockcnt) * SIZEOF(JBLOCK));
-	      for (bi = blockcnt; bi < compptr->MCU_width; bi++) {
-		coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn+bi-1][0][0];
-	      }
-	    }
+	      yoffset + yindex < compptr->last_row_height) {
+	    (*forward_DCT) (cinfo, compptr, input_ptr, blkp,
+			    xpos, (JDIMENSION) blockcnt);
+	    input_ptr += compptr->DCT_v_scaled_size;
+	    blkp += blockcnt;
+	    /* Dummy blocks at right edge */
+	    if ((xindex = compptr->MCU_width - blockcnt) == 0)
+	      continue;
 	  } else {
-	    /* Create a row of dummy blocks at the bottom of the image. */
-	    FMEMZERO((void FAR *) coef->MCU_buffer[blkn],
-		     compptr->MCU_width * SIZEOF(JBLOCK));
-	    for (bi = 0; bi < compptr->MCU_width; bi++) {
-	      coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn-1][0][0];
-	    }
+	    /* At bottom of image, need a whole row of dummy blocks */
+	    xindex = compptr->MCU_width;
 	  }
-	  blkn += compptr->MCU_width;
-	  ypos += compptr->DCT_v_scaled_size;
+	  /* Fill in any dummy blocks needed in this row */
+	  MEMZERO(blkp, xindex * SIZEOF(JBLOCK));
+	  do {
+	    blkp[0][0] = blkp[-1][0];
+	    blkp++;
+	  } while (--xindex);
 	}
       }
       /* Try to write the MCU.  In event of a suspension failure, we will
@@ -208,12 +205,12 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
       if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
 	/* Suspension forced; update state counters and exit */
 	coef->MCU_vert_offset = yoffset;
-	coef->mcu_ctr = MCU_col_num;
+	coef->MCU_ctr = MCU_col_num;
 	return FALSE;
       }
     }
     /* Completed an MCU row, but perhaps not an iMCU row */
-    coef->mcu_ctr = 0;
+    coef->MCU_ctr = 0;
   }
   /* Completed the iMCU row, advance counters for next one */
   coef->iMCU_row_num++;
@@ -256,6 +253,7 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
   jpeg_component_info *compptr;
   JBLOCKARRAY buffer;
   JBLOCKROW thisblockrow, lastblockrow;
+  JSAMPARRAY input_ptr;
   forward_DCT_ptr forward_DCT;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
@@ -280,14 +278,15 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
     if (ndummy > 0)
       ndummy = h_samp_factor - ndummy;
     forward_DCT = cinfo->fdct->forward_DCT[ci];
+    input_ptr = input_buf[ci];
     /* Perform DCT for all non-dummy blocks in this iMCU row.  Each call
      * on forward_DCT processes a complete horizontal row of DCT blocks.
      */
     for (block_row = 0; block_row < block_rows; block_row++) {
       thisblockrow = buffer[block_row];
-      (*forward_DCT) (cinfo, compptr, input_buf[ci], thisblockrow,
-		      (JDIMENSION) (block_row * compptr->DCT_v_scaled_size),
+      (*forward_DCT) (cinfo, compptr, input_ptr, thisblockrow,
 		      (JDIMENSION) 0, blocks_across);
+      input_ptr += compptr->DCT_v_scaled_size;
       if (ndummy > 0) {
 	/* Create dummy blocks at the right edge of the image. */
 	thisblockrow += blocks_across; /* => first dummy block */
@@ -303,15 +302,14 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
      * of the dummy blocks to match the last real block's DC value.
      * This squeezes a few more bytes out of the resulting file...
      */
-    if (coef->iMCU_row_num == last_iMCU_row) {
+    if (block_row < compptr->v_samp_factor) {
       blocks_across += ndummy;	/* include lower right corner */
       MCUs_across = blocks_across / h_samp_factor;
-      for (block_row = block_rows; block_row < compptr->v_samp_factor;
-	   block_row++) {
+      do {
 	thisblockrow = buffer[block_row];
 	lastblockrow = buffer[block_row-1];
 	FMEMZERO((void FAR *) thisblockrow,
-		 (size_t) (blocks_across * SIZEOF(JBLOCK)));
+		 (size_t) blocks_across * SIZEOF(JBLOCK));
 	for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
 	  lastDC = lastblockrow[h_samp_factor-1][0];
 	  for (bi = 0; bi < h_samp_factor; bi++) {
@@ -320,7 +318,7 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 	  thisblockrow += h_samp_factor; /* advance to next MCU in row */
 	  lastblockrow += h_samp_factor;
 	}
-      }
+      } while (++block_row < compptr->v_samp_factor);
     }
   }
   /* NB: compress_output will increment iMCU_row_num if successful.
@@ -347,8 +345,9 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 {
   my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
   JDIMENSION MCU_col_num;	/* index of current MCU within row */
-  int blkn, ci, xindex, yindex, yoffset;
+  int ci, xindex, yindex, yoffset;
   JDIMENSION start_col;
+  JBLOCKARRAY blkp;
   JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN];
   JBLOCKROW buffer_ptr;
   jpeg_component_info *compptr;
@@ -368,30 +367,31 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
   /* Loop to process one whole iMCU row */
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
-    for (MCU_col_num = coef->mcu_ctr; MCU_col_num < cinfo->MCUs_per_row;
+    for (MCU_col_num = coef->MCU_ctr; MCU_col_num < cinfo->MCUs_per_row;
 	 MCU_col_num++) {
       /* Construct list of pointers to DCT blocks belonging to this MCU */
-      blkn = 0;			/* index of current DCT block within MCU */
+      blkp = coef->MCU_buffer;	/* pointer to current DCT block within MCU */
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
 	compptr = cinfo->cur_comp_info[ci];
 	start_col = MCU_col_num * compptr->MCU_width;
 	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-	  buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
-	  for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
-	    coef->MCU_buffer[blkn++] = buffer_ptr++;
-	  }
+	  buffer_ptr = buffer[ci][yoffset + yindex] + start_col;
+	  xindex = compptr->MCU_width;
+	  do {
+	    *blkp++ = buffer_ptr++;
+	  } while (--xindex);
 	}
       }
       /* Try to write the MCU. */
       if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
 	/* Suspension forced; update state counters and exit */
 	coef->MCU_vert_offset = yoffset;
-	coef->mcu_ctr = MCU_col_num;
+	coef->MCU_ctr = MCU_col_num;
 	return FALSE;
       }
     }
     /* Completed an MCU row, but perhaps not an iMCU row */
-    coef->mcu_ctr = 0;
+    coef->MCU_ctr = 0;
   }
   /* Completed the iMCU row, advance counters for next one */
   coef->iMCU_row_num++;
@@ -411,13 +411,6 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
 {
   my_coef_ptr coef;
 
-  coef = (my_coef_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_coef_controller));
-  cinfo->coef = (struct jpeg_c_coef_controller *) coef;
-  coef->pub.start_pass = start_pass_coef;
-
-  /* Create the coefficient buffer. */
   if (need_full_buffer) {
 #ifdef FULL_COEF_BUFFER_SUPPORTED
     /* Allocate a full-image virtual array for each component, */
@@ -425,6 +418,9 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
     int ci;
     jpeg_component_info *compptr;
 
+    coef = (my_coef_ptr) (*cinfo->mem->alloc_small)
+      ((j_common_ptr) cinfo, JPOOL_IMAGE,
+       SIZEOF(my_coef_controller) - SIZEOF(coef->blk_buffer));
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
 	 ci++, compptr++) {
       coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
@@ -440,15 +436,21 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
 #endif
   } else {
     /* We only need a single-MCU buffer. */
-    JBLOCKROW buffer;
-    int i;
-
-    buffer = (JBLOCKROW)
-      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
-    for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
-      coef->MCU_buffer[i] = buffer + i;
-    }
+    JBLOCKARRAY blkp;
+    JBLOCKROW buffer_ptr;
+    int bi;
+
+    coef = (my_coef_ptr) (*cinfo->mem->alloc_small)
+      ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(my_coef_controller));
+    blkp = coef->MCU_buffer;
+    buffer_ptr = coef->blk_buffer;
+    bi = C_MAX_BLOCKS_IN_MCU;
+    do {
+      *blkp++ = buffer_ptr++;
+    } while (--bi);
     coef->whole_image[0] = NULL; /* flag for no virtual arrays */
   }
+
+  coef->pub.start_pass = start_pass_coef;
+  cinfo->coef = &coef->pub;
 }
diff --git a/3rdparty/libjpeg/jccolor.c b/3rdparty/libjpeg/jccolor.c
index db2ca429e8f0..c028dd9db304 100644
--- a/3rdparty/libjpeg/jccolor.c
+++ b/3rdparty/libjpeg/jccolor.c
@@ -2,7 +2,7 @@
  * jccolor.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * Modified 2011-2019 by Guido Vollbeding.
+ * Modified 2011-2023 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -40,10 +40,10 @@ typedef my_color_converter * my_cconvert_ptr;
  * Note that the derived conversion coefficients given in some of these
  * documents are imprecise.  The general conversion equations are
  *	Y  = Kr * R + (1 - Kr - Kb) * G + Kb * B
- *	Cb = 0.5 * (B - Y) / (1 - Kb)
- *	Cr = 0.5 * (R - Y) / (1 - Kr)
+ *	Cb = (B - Y) / (1 - Kb) / K
+ *	Cr = (R - Y) / (1 - Kr) / K
  * With Kr = 0.299 and Kb = 0.114 (derived according to SMPTE RP 177-1993
- * from the 1953 FCC NTSC primaries and CIE Illuminant C),
+ * from the 1953 FCC NTSC primaries and CIE Illuminant C), K = 2 for sYCC,
  * the conversion equations to be implemented are therefore
  *	Y  =  0.299 * R + 0.587 * G + 0.114 * B
  *	Cb = -0.168735892 * R - 0.331264108 * G + 0.5 * B + CENTERJSAMPLE
@@ -62,8 +62,8 @@ typedef my_color_converter * my_cconvert_ptr;
  * by precalculating the constants times R,G,B for all possible values.
  * For 8-bit JSAMPLEs this is very reasonable (only 256 entries per table);
  * for 9-bit to 12-bit samples it is still acceptable.  It's not very
- * reasonable for 16-bit samples, but if you want lossless storage you
- * shouldn't be changing colorspace anyway.
+ * reasonable for 16-bit samples, but if you want lossless storage
+ * you shouldn't be changing colorspace anyway.
  * The CENTERJSAMPLE offsets and the rounding fudge-factor of 0.5 are included
  * in the tables to save adding them separately in the inner loop.
  */
@@ -110,16 +110,16 @@ rgb_ycc_start (j_compress_ptr cinfo)
   for (i = 0; i <= MAXJSAMPLE; i++) {
     rgb_ycc_tab[i+R_Y_OFF] = FIX(0.299) * i;
     rgb_ycc_tab[i+G_Y_OFF] = FIX(0.587) * i;
-    rgb_ycc_tab[i+B_Y_OFF] = FIX(0.114) * i   + ONE_HALF;
+    rgb_ycc_tab[i+B_Y_OFF] = FIX(0.114) * i + ONE_HALF;
     rgb_ycc_tab[i+R_CB_OFF] = (- FIX(0.168735892)) * i;
     rgb_ycc_tab[i+G_CB_OFF] = (- FIX(0.331264108)) * i;
     /* We use a rounding fudge-factor of 0.5-epsilon for Cb and Cr.
      * This ensures that the maximum output will round to MAXJSAMPLE
      * not MAXJSAMPLE+1, and thus that we don't have to range-limit.
      */
-    rgb_ycc_tab[i+B_CB_OFF] = FIX(0.5) * i    + CBCR_OFFSET + ONE_HALF-1;
+    rgb_ycc_tab[i+B_CB_OFF] = (i << (SCALEBITS-1)) + CBCR_OFFSET + ONE_HALF-1;
 /*  B=>Cb and R=>Cr tables are the same
-    rgb_ycc_tab[i+R_CR_OFF] = FIX(0.5) * i    + CBCR_OFFSET + ONE_HALF-1;
+    rgb_ycc_tab[i+R_CR_OFF] = (i << (SCALEBITS-1)) + CBCR_OFFSET + ONE_HALF-1;
 */
     rgb_ycc_tab[i+G_CR_OFF] = (- FIX(0.418687589)) * i;
     rgb_ycc_tab[i+B_CR_OFF] = (- FIX(0.081312411)) * i;
@@ -190,8 +190,8 @@ rgb_ycc_convert (j_compress_ptr cinfo,
 
 /*
  * Convert some rows of samples to the JPEG colorspace.
- * This version handles RGB->grayscale conversion, which is the same
- * as the RGB->Y portion of RGB->YCbCr.
+ * This version handles RGB->grayscale conversion,
+ * which is the same as the RGB->Y portion of RGB->YCbCr.
  * We assume rgb_ycc_start has been called (we only use the Y tables).
  */
 
@@ -201,7 +201,7 @@ rgb_gray_convert (j_compress_ptr cinfo,
 		  JDIMENSION output_row, int num_rows)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  register int r, g, b;
+  register INT32 y;
   register INT32 * ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
   register JSAMPROW outptr;
@@ -212,14 +212,11 @@ rgb_gray_convert (j_compress_ptr cinfo,
     inptr = *input_buf++;
     outptr = output_buf[0][output_row++];
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
+      y  = ctab[R_Y_OFF + GETJSAMPLE(inptr[RGB_RED])];
+      y += ctab[G_Y_OFF + GETJSAMPLE(inptr[RGB_GREEN])];
+      y += ctab[B_Y_OFF + GETJSAMPLE(inptr[RGB_BLUE])];
       inptr += RGB_PIXELSIZE;
-      /* Y */
-      outptr[col] = (JSAMPLE)
-		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-		 >> SCALEBITS);
+      outptr[col] = (JSAMPLE) (y >> SCALEBITS);
     }
   }
 }
diff --git a/3rdparty/libjpeg/jcdctmgr.c b/3rdparty/libjpeg/jcdctmgr.c
index fafab91c69bd..a48ccd81474b 100644
--- a/3rdparty/libjpeg/jcdctmgr.c
+++ b/3rdparty/libjpeg/jcdctmgr.c
@@ -2,7 +2,7 @@
  * jcdctmgr.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * Modified 2003-2013 by Guido Vollbeding.
+ * Modified 2003-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -66,15 +66,14 @@ typedef union {
  * Perform forward DCT on one or more blocks of a component.
  *
  * The input samples are taken from the sample_data[] array starting at
- * position start_row/start_col, and moving to the right for any additional
- * blocks. The quantized coefficients are returned in coef_blocks[].
+ * position start_col, and moving to the right for any additional blocks.
+ * The quantized coefficients are returned in coef_blocks[].
  */
 
 METHODDEF(void)
 forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
 	     JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-	     JDIMENSION start_row, JDIMENSION start_col,
-	     JDIMENSION num_blocks)
+	     JDIMENSION start_col, JDIMENSION num_blocks)
 /* This version is used for integer DCT implementations. */
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
@@ -84,8 +83,6 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
   DCTELEM workspace[DCTSIZE2];	/* work area for FDCT subroutine */
   JDIMENSION bi;
 
-  sample_data += start_row;	/* fold in the vertical offset once */
-
   for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) {
     /* Perform the DCT */
     (*do_dct) (workspace, sample_data, start_col);
@@ -136,8 +133,7 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
 METHODDEF(void)
 forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
 		   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-		   JDIMENSION start_row, JDIMENSION start_col,
-		   JDIMENSION num_blocks)
+		   JDIMENSION start_col, JDIMENSION num_blocks)
 /* This version is used for floating-point DCT implementations. */
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
@@ -147,8 +143,6 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
   FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
   JDIMENSION bi;
 
-  sample_data += start_row;	/* fold in the vertical offset once */
-
   for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) {
     /* Perform the DCT */
     (*do_dct) (workspace, sample_data, start_col);
@@ -347,13 +341,11 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 #endif
       default:
 	ERREXIT(cinfo, JERR_NOT_COMPILED);
-	break;
       }
       break;
     default:
       ERREXIT2(cinfo, JERR_BAD_DCTSIZE,
 	       compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size);
-      break;
     }
     qtblno = compptr->quant_tbl_no;
     /* Make sure specified quantization table is present */
@@ -444,7 +436,6 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 #endif
     default:
       ERREXIT(cinfo, JERR_NOT_COMPILED);
-      break;
     }
   }
 }
@@ -461,17 +452,15 @@ jinit_forward_dct (j_compress_ptr cinfo)
   int ci;
   jpeg_component_info *compptr;
 
-  fdct = (my_fdct_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_fdct_controller));
+  fdct = (my_fdct_ptr) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(my_fdct_controller));
   cinfo->fdct = &fdct->pub;
   fdct->pub.start_pass = start_pass_fdctmgr;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Allocate a divisor table for each component */
-    compptr->dct_table =
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(divisor_table));
+    compptr->dct_table = (*cinfo->mem->alloc_small)
+      ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(divisor_table));
   }
 }
diff --git a/3rdparty/libjpeg/jchuff.c b/3rdparty/libjpeg/jchuff.c
index 02fc275b7abc..1f527b2182d2 100644
--- a/3rdparty/libjpeg/jchuff.c
+++ b/3rdparty/libjpeg/jchuff.c
@@ -2,7 +2,7 @@
  * jchuff.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * Modified 2006-2019 by Guido Vollbeding.
+ * Modified 2006-2023 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -26,17 +26,11 @@
 
 
 /* The legal range of a DCT coefficient is
- *  -1024 .. +1023  for 8-bit data;
- * -16384 .. +16383 for 12-bit data.
- * Hence the magnitude should always fit in 10 or 14 bits respectively.
+ *  -1024 .. +1023  for 8-bit sample data precision;
+ * -16384 .. +16383 for 12-bit sample data precision.
+ * Hence the magnitude should always fit in sample data precision + 2 bits.
  */
 
-#if BITS_IN_JSAMPLE == 8
-#define MAX_COEF_BITS 10
-#else
-#define MAX_COEF_BITS 14
-#endif
-
 /* Derived data constructed for each Huffman table */
 
 typedef struct {
@@ -542,11 +536,12 @@ emit_restart_e (huff_entropy_ptr entropy, int restart_num)
  */
 
 METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   register int temp, temp2;
   register int nbits;
+  int max_coef_bits;
   int blkn, ci, tbl;
   ISHIFT_TEMPS
 
@@ -558,6 +553,9 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     if (entropy->restarts_to_go == 0)
       emit_restart_e(entropy, entropy->next_restart_num);
 
+  /* Since we're encoding a difference, the range limit is twice as much. */
+  max_coef_bits = cinfo->data_precision + 3;
+
   /* Encode the MCU data blocks */
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     ci = cinfo->MCU_membership[blkn];
@@ -569,12 +567,17 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     temp = IRIGHT_SHIFT((int) (MCU_data[blkn][0][0]), cinfo->Al);
 
     /* DC differences are figured on the point-transformed values. */
-    temp2 = temp - entropy->saved.last_dc_val[ci];
+    if ((temp2 = temp - entropy->saved.last_dc_val[ci]) == 0) {
+      /* Count/emit the Huffman-coded symbol for the number of bits */
+      emit_dc_symbol(entropy, tbl, 0);
+
+      continue;
+    }
+
     entropy->saved.last_dc_val[ci] = temp;
 
     /* Encode the DC coefficient difference per section G.1.2.1 */
-    temp = temp2;
-    if (temp < 0) {
+    if ((temp = temp2) < 0) {
       temp = -temp;		/* temp is abs value of input */
       /* For a negative input, want temp2 = bitwise complement of abs(input) */
       /* This code assumes we are on a two's complement machine */
@@ -583,14 +586,10 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 
     /* Find the number of bits needed for the magnitude of the coefficient */
     nbits = 0;
-    while (temp) {
-      nbits++;
-      temp >>= 1;
-    }
-    /* Check for out-of-range coefficient values.
-     * Since we're encoding a difference, the range limit is twice as much.
-     */
-    if (nbits > MAX_COEF_BITS+1)
+    do nbits++;			/* there must be at least one 1 bit */
+    while ((temp >>= 1));
+    /* Check for out-of-range coefficient values */
+    if (nbits > max_coef_bits)
       ERREXIT(cinfo, JERR_BAD_DCT_COEF);
 
     /* Count/emit the Huffman-coded symbol for the number of bits */
@@ -598,8 +597,7 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 
     /* Emit that number of bits of the value, if positive, */
     /* or the complement of its magnitude, if negative. */
-    if (nbits)			/* emit_bits rejects calls with size 0 */
-      emit_bits_e(entropy, (unsigned int) temp2, nbits);
+    emit_bits_e(entropy, (unsigned int) temp2, nbits);
   }
 
   cinfo->dest->next_output_byte = entropy->next_output_byte;
@@ -625,7 +623,7 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   const int * natural_order;
@@ -633,7 +631,7 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   register int temp, temp2;
   register int nbits;
   register int r, k;
-  int Se, Al;
+  int Se, Al, max_coef_bits;
 
   entropy->next_output_byte = cinfo->dest->next_output_byte;
   entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -646,6 +644,7 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   Se = cinfo->Se;
   Al = cinfo->Al;
   natural_order = cinfo->natural_order;
+  max_coef_bits = cinfo->data_precision + 2;
 
   /* Encode the MCU data block */
   block = MCU_data[0];
@@ -666,18 +665,23 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
      */
     if (temp < 0) {
       temp = -temp;		/* temp is abs value of input */
-      temp >>= Al;		/* apply the point transform */
+      /* Apply the point transform, and watch out for case */
+      /* that nonzero coef is zero after point transform. */
+      if ((temp >>= Al) == 0) {
+	r++;
+	continue;
+      }
       /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
       temp2 = ~temp;
     } else {
-      temp >>= Al;		/* apply the point transform */
+      /* Apply the point transform, and watch out for case */
+      /* that nonzero coef is zero after point transform. */
+      if ((temp >>= Al) == 0) {
+	r++;
+	continue;
+      }
       temp2 = temp;
     }
-    /* Watch out for case that nonzero coef is zero after point transform */
-    if (temp == 0) {
-      r++;
-      continue;
-    }
 
     /* Emit any pending EOBRUN */
     if (entropy->EOBRUN > 0)
@@ -689,11 +693,11 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     }
 
     /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 1;			/* there must be at least one 1 bit */
-    while ((temp >>= 1))
-      nbits++;
+    nbits = 0;
+    do nbits++;			/* there must be at least one 1 bit */
+    while ((temp >>= 1));
     /* Check for out-of-range coefficient values */
-    if (nbits > MAX_COEF_BITS)
+    if (nbits > max_coef_bits)
       ERREXIT(cinfo, JERR_BAD_DCT_COEF);
 
     /* Count/emit Huffman symbol for run length / number of bits */
@@ -736,7 +740,7 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   int Al, blkn;
@@ -779,7 +783,7 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   const int * natural_order;
@@ -916,83 +920,89 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
   register int nbits;
   register int r, k;
   int Se = state->cinfo->lim_Se;
+  int max_coef_bits = state->cinfo->data_precision + 3;
   const int * natural_order = state->cinfo->natural_order;
 
   /* Encode the DC coefficient difference per section F.1.2.1 */
 
-  temp = temp2 = block[0] - last_dc_val;
-
-  if (temp < 0) {
-    temp = -temp;		/* temp is abs value of input */
-    /* For a negative input, want temp2 = bitwise complement of abs(input) */
-    /* This code assumes we are on a two's complement machine */
-    temp2--;
-  }
+  if ((temp = block[0] - last_dc_val) == 0) {
+    /* Emit the Huffman-coded symbol for the number of bits */
+    if (! emit_bits_s(state, dctbl->ehufco[0], dctbl->ehufsi[0]))
+      return FALSE;
+  } else {
+    if ((temp2 = temp) < 0) {
+      temp = -temp;		/* temp is abs value of input */
+      /* For a negative input, want temp2 = bitwise complement of abs(input) */
+      /* This code assumes we are on a two's complement machine */
+      temp2--;
+    }
 
-  /* Find the number of bits needed for the magnitude of the coefficient */
-  nbits = 0;
-  while (temp) {
-    nbits++;
-    temp >>= 1;
-  }
-  /* Check for out-of-range coefficient values.
-   * Since we're encoding a difference, the range limit is twice as much.
-   */
-  if (nbits > MAX_COEF_BITS+1)
-    ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
+    /* Find the number of bits needed for the magnitude of the coefficient */
+    nbits = 0;
+    do nbits++;			/* there must be at least one 1 bit */
+    while ((temp >>= 1));
+    /* Check for out-of-range coefficient values.
+     * Since we're encoding a difference, the range limit is twice as much.
+     */
+    if (nbits > max_coef_bits)
+      ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
 
-  /* Emit the Huffman-coded symbol for the number of bits */
-  if (! emit_bits_s(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits]))
-    return FALSE;
+    /* Emit the Huffman-coded symbol for the number of bits */
+    if (! emit_bits_s(state, dctbl->ehufco[nbits], dctbl->ehufsi[nbits]))
+      return FALSE;
 
-  /* Emit that number of bits of the value, if positive, */
-  /* or the complement of its magnitude, if negative. */
-  if (nbits)			/* emit_bits rejects calls with size 0 */
+    /* Emit that number of bits of the value, if positive, */
+    /* or the complement of its magnitude, if negative. */
     if (! emit_bits_s(state, (unsigned int) temp2, nbits))
       return FALSE;
+  }
 
   /* Encode the AC coefficients per section F.1.2.2 */
 
   r = 0;			/* r = run length of zeros */
 
   for (k = 1; k <= Se; k++) {
-    if ((temp2 = block[natural_order[k]]) == 0) {
+    if ((temp = block[natural_order[k]]) == 0) {
       r++;
-    } else {
-      /* if run length > 15, must emit special run-length-16 codes (0xF0) */
-      while (r > 15) {
-	if (! emit_bits_s(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0]))
-	  return FALSE;
-	r -= 16;
-      }
-
-      temp = temp2;
-      if (temp < 0) {
-	temp = -temp;		/* temp is abs value of input */
-	/* This code assumes we are on a two's complement machine */
-	temp2--;
-      }
-
-      /* Find the number of bits needed for the magnitude of the coefficient */
-      nbits = 1;		/* there must be at least one 1 bit */
-      while ((temp >>= 1))
-	nbits++;
-      /* Check for out-of-range coefficient values */
-      if (nbits > MAX_COEF_BITS)
-	ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
-
-      /* Emit Huffman symbol for run length / number of bits */
-      temp = (r << 4) + nbits;
-      if (! emit_bits_s(state, actbl->ehufco[temp], actbl->ehufsi[temp]))
-	return FALSE;
+      continue;
+    }
 
-      /* Emit that number of bits of the value, if positive, */
-      /* or the complement of its magnitude, if negative. */
-      if (! emit_bits_s(state, (unsigned int) temp2, nbits))
+    /* if run length > 15, must emit special run-length-16 codes (0xF0) */
+    while (r > 15) {
+      if (! emit_bits_s(state, actbl->ehufco[0xF0], actbl->ehufsi[0xF0]))
 	return FALSE;
+      r -= 16;
+    }
 
-      r = 0;
+    if ((temp2 = temp) < 0) {
+      temp = -temp;		/* temp is abs value of input */
+      /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
+      /* This code assumes we are on a two's complement machine */
+      temp2--;
     }
+
+    /* Find the number of bits needed for the magnitude of the coefficient */
+    nbits = 0;
+    do nbits++;			/* there must be at least one 1 bit */
+    while ((temp >>= 1));
+    /* Check for out-of-range coefficient values.
+     * Use ">=" instead of ">" so can use the
+     * same one larger limit from DC check here.
+     */
+    if (nbits >= max_coef_bits)
+      ERREXIT(state->cinfo, JERR_BAD_DCT_COEF);
+
+    /* Emit Huffman symbol for run length / number of bits */
+    temp = (r << 4) + nbits;
+    if (! emit_bits_s(state, actbl->ehufco[temp], actbl->ehufsi[temp]))
+      return FALSE;
+
+    /* Emit that number of bits of the value, if positive, */
+    /* or the complement of its magnitude, if negative. */
+    if (! emit_bits_s(state, (unsigned int) temp2, nbits))
+      return FALSE;
+
+    r = 0;			/* reset zero run length */
   }
 
   /* If the last coef(s) were zero, emit an end-of-block code */
@@ -1009,7 +1019,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
  */
 
 METHODDEF(boolean)
-encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_huff (j_compress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   working_state state;
@@ -1122,28 +1132,31 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
   register int nbits;
   register int r, k;
   int Se = cinfo->lim_Se;
+  int max_coef_bits = cinfo->data_precision + 3;
   const int * natural_order = cinfo->natural_order;
 
   /* Encode the DC coefficient difference per section F.1.2.1 */
 
-  temp = block[0] - last_dc_val;
-  if (temp < 0)
-    temp = -temp;
+  if ((temp = block[0] - last_dc_val) == 0) {
+    /* Count the Huffman symbol for the number of bits */
+    dc_counts[0]++;
+  } else {
+    if (temp < 0)
+      temp = -temp;		/* temp is abs value of input */
 
-  /* Find the number of bits needed for the magnitude of the coefficient */
-  nbits = 0;
-  while (temp) {
-    nbits++;
-    temp >>= 1;
-  }
-  /* Check for out-of-range coefficient values.
-   * Since we're encoding a difference, the range limit is twice as much.
-   */
-  if (nbits > MAX_COEF_BITS+1)
-    ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+    /* Find the number of bits needed for the magnitude of the coefficient */
+    nbits = 0;
+    do nbits++;			/* there must be at least one 1 bit */
+    while ((temp >>= 1));
+    /* Check for out-of-range coefficient values.
+     * Since we're encoding a difference, the range limit is twice as much.
+     */
+    if (nbits > max_coef_bits)
+      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
 
-  /* Count the Huffman symbol for the number of bits */
-  dc_counts[nbits]++;
+    /* Count the Huffman symbol for the number of bits */
+    dc_counts[nbits]++;
+  }
 
   /* Encode the AC coefficients per section F.1.2.2 */
 
@@ -1152,30 +1165,33 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
   for (k = 1; k <= Se; k++) {
     if ((temp = block[natural_order[k]]) == 0) {
       r++;
-    } else {
-      /* if run length > 15, must emit special run-length-16 codes (0xF0) */
-      while (r > 15) {
-	ac_counts[0xF0]++;
-	r -= 16;
-      }
+      continue;
+    }
+
+    /* if run length > 15, must emit special run-length-16 codes (0xF0) */
+    while (r > 15) {
+      ac_counts[0xF0]++;
+      r -= 16;
+    }
 
-      /* Find the number of bits needed for the magnitude of the coefficient */
-      if (temp < 0)
-	temp = -temp;
+    if (temp < 0)
+      temp = -temp;		/* temp is abs value of input */
 
-      /* Find the number of bits needed for the magnitude of the coefficient */
-      nbits = 1;		/* there must be at least one 1 bit */
-      while ((temp >>= 1))
-	nbits++;
-      /* Check for out-of-range coefficient values */
-      if (nbits > MAX_COEF_BITS)
-	ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+    /* Find the number of bits needed for the magnitude of the coefficient */
+    nbits = 0;
+    do nbits++;			/* there must be at least one 1 bit */
+    while ((temp >>= 1));
+    /* Check for out-of-range coefficient values.
+     * Use ">=" instead of ">" so can use the
+     * same one larger limit from DC check here.
+     */
+    if (nbits >= max_coef_bits)
+      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
 
-      /* Count Huffman symbol for run length / number of bits */
-      ac_counts[(r << 4) + nbits]++;
+    /* Count Huffman symbol for run length / number of bits */
+    ac_counts[(r << 4) + nbits]++;
 
-      r = 0;
-    }
+    r = 0;			/* reset zero run length */
   }
 
   /* If the last coef(s) were zero, emit an end-of-block code */
@@ -1190,7 +1206,7 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
  */
 
 METHODDEF(boolean)
-encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_gather (j_compress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   int blkn, ci;
diff --git a/3rdparty/libjpeg/jcmaster.c b/3rdparty/libjpeg/jcmaster.c
index 89dcf78c2acd..a70af0c02064 100644
--- a/3rdparty/libjpeg/jcmaster.c
+++ b/3rdparty/libjpeg/jcmaster.c
@@ -2,7 +2,7 @@
  * jcmaster.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * Modified 2003-2019 by Guido Vollbeding.
+ * Modified 2003-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -391,16 +391,16 @@ per_scan_setup (j_compress_ptr cinfo)
 {
   int ci, mcublks, tmp;
   jpeg_component_info *compptr;
-  
+
   if (cinfo->comps_in_scan == 1) {
-    
+
     /* Noninterleaved (single-component) scan */
     compptr = cinfo->cur_comp_info[0];
-    
+
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = compptr->width_in_blocks;
     cinfo->MCU_rows_in_scan = compptr->height_in_blocks;
-    
+
     /* For noninterleaved scan, always one block per MCU */
     compptr->MCU_width = 1;
     compptr->MCU_height = 1;
@@ -413,28 +413,26 @@ per_scan_setup (j_compress_ptr cinfo)
     tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
     if (tmp == 0) tmp = compptr->v_samp_factor;
     compptr->last_row_height = tmp;
-    
+
     /* Prepare array describing MCU composition */
     cinfo->blocks_in_MCU = 1;
     cinfo->MCU_membership[0] = 0;
-    
+
   } else {
-    
+
     /* Interleaved (multi-component) scan */
     if (cinfo->comps_in_scan <= 0 || cinfo->comps_in_scan > MAX_COMPS_IN_SCAN)
       ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->comps_in_scan,
 	       MAX_COMPS_IN_SCAN);
-    
+
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
       jdiv_round_up((long) cinfo->jpeg_width,
 		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
-    cinfo->MCU_rows_in_scan = (JDIMENSION)
-      jdiv_round_up((long) cinfo->jpeg_height,
-		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
-    
+    cinfo->MCU_rows_in_scan = cinfo->total_iMCU_rows;
+
     cinfo->blocks_in_MCU = 0;
-    
+
     for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
       compptr = cinfo->cur_comp_info[ci];
       /* Sampling factors give # of blocks of component in each MCU */
@@ -457,7 +455,7 @@ per_scan_setup (j_compress_ptr cinfo)
 	cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci;
       }
     }
-    
+
   }
 
   /* Convert restart specified in rows to actual MCU count. */
diff --git a/3rdparty/libjpeg/jconfig.h b/3rdparty/libjpeg/jconfig.h
index e9d33e587097..8d6c8f5b3ae2 100644
--- a/3rdparty/libjpeg/jconfig.h
+++ b/3rdparty/libjpeg/jconfig.h
@@ -1,6 +1,8 @@
-/* jconfig.vc --- jconfig.h for Microsoft Visual C++ on Windows 9x or NT. */
-/* This file also works for Borland C++ 32-bit (bcc32) on Windows 9x or NT. */
-/* see jconfig.txt for explanations */
+/* jconfig.vc --- jconfig.h for Microsoft Visual C++ on Windows 9x or NT.
+ * This file also works for Borland/Embarcadero C++ for Win32 or Win64
+ * (CLI: bcc32, bcc32c, bcc32x, bcc64; GUI IDE: C++Builder/RAD Studio).
+ * See jconfig.txt for explanations.
+ */
 
 #define HAVE_PROTOTYPES
 #define HAVE_UNSIGNED_CHAR
@@ -28,6 +30,16 @@ typedef unsigned char boolean;
 #endif
 #define HAVE_BOOLEAN		/* prevent jmorecfg.h from redefining it */
 
+/* Define custom RGB color order, prevent jmorecfg.h from redefinition */
+#undef JPEG_HAVE_RGB_CUSTOM
+/* Use Windows custom BGR color order defined in jmorecfg.h */
+#undef JPEG_USE_RGB_CUSTOM
+
+/* Define custom file I/O functions, prevent jinclude.h from redefinition */
+#undef JPEG_HAVE_FILE_IO_CUSTOM
+/* Use Delphi custom file I/O functions defined in jinclude.h */
+#undef JPEG_USE_FILE_IO_CUSTOM
+
 
 #ifdef JPEG_INTERNALS
 
@@ -44,7 +56,7 @@ typedef unsigned char boolean;
 #define TARGA_SUPPORTED		/* Targa image file format */
 
 #define TWO_FILE_COMMANDLINE	/* optional */
-#define USE_SETMODE		/* Microsoft has setmode() */
+#define USE_SETMODE	/* Microsoft/Borland/Embarcadero have setmode() */
 #undef NEED_SIGNAL_CATCHER
 #undef DONT_USE_B_MODE
 #undef PROGRESS_REPORT		/* optional */
diff --git a/3rdparty/libjpeg/jcparam.c b/3rdparty/libjpeg/jcparam.c
index 3b7014ff2c4a..261ae86ca01b 100644
--- a/3rdparty/libjpeg/jcparam.c
+++ b/3rdparty/libjpeg/jcparam.c
@@ -2,7 +2,7 @@
  * jcparam.c
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
- * Modified 2003-2019 by Guido Vollbeding.
+ * Modified 2003-2022 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -62,8 +62,9 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
 
 
 /* These are the sample quantization tables given in JPEG spec section K.1.
- * The spec says that the values given produce "good" quality, and
- * when divided by 2, "very good" quality.
+ * NOTE: chrominance DC value is changed from 17 to 16 for lossless support.
+ * The spec says that the values given produce "good" quality,
+ * and when divided by 2, "very good" quality.
  */
 static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
   16,  11,  10,  16,  24,  40,  51,  61,
@@ -76,7 +77,7 @@ static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = {
   72,  92,  95,  98, 112, 100, 103,  99
 };
 static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
-  17,  18,  24,  47,  99,  99,  99,  99,
+  16,  18,  24,  47,  99,  99,  99,  99,
   18,  21,  26,  66,  99,  99,  99,  99,
   24,  26,  56,  99,  99,  99,  99,  99,
   47,  66,  99,  99,  99,  99,  99,  99,
@@ -379,11 +380,13 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
   case JCS_RGB:
     cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag RGB */
     cinfo->num_components = 3;
-    SET_COMP(0, 0x52 /* 'R' */, 1,1, 0,
+    SET_COMP(0, 0x52 /* 'R' */, 1,1,
+		cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0,
 		cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0,
 		cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0);
     SET_COMP(1, 0x47 /* 'G' */, 1,1, 0, 0,0);
-    SET_COMP(2, 0x42 /* 'B' */, 1,1, 0,
+    SET_COMP(2, 0x42 /* 'B' */, 1,1,
+		cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0,
 		cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0,
 		cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0);
     break;
@@ -417,11 +420,13 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
     cinfo->JFIF_major_version = 2;   /* Set JFIF major version = 2 */
     cinfo->num_components = 3;
     /* Add offset 0x20 to the normal R/G/B component IDs */
-    SET_COMP(0, 0x72 /* 'r' */, 1,1, 0,
+    SET_COMP(0, 0x72 /* 'r' */, 1,1,
+		cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0,
 		cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0,
 		cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0);
     SET_COMP(1, 0x67 /* 'g' */, 1,1, 0, 0,0);
-    SET_COMP(2, 0x62 /* 'b' */, 1,1, 0,
+    SET_COMP(2, 0x62 /* 'b' */, 1,1,
+		cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0,
 		cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0,
 		cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0);
     break;
diff --git a/3rdparty/libjpeg/jcprepct.c b/3rdparty/libjpeg/jcprepct.c
index be44cc4b4511..586964bd446a 100644
--- a/3rdparty/libjpeg/jcprepct.c
+++ b/3rdparty/libjpeg/jcprepct.c
@@ -2,6 +2,7 @@
  * jcprepct.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2003-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -109,7 +110,8 @@ expand_bottom_edge (JSAMPARRAY image_data, JDIMENSION num_cols,
   register int row;
 
   for (row = input_rows; row < output_rows; row++) {
-    jcopy_sample_rows(image_data, input_rows-1, image_data, row,
+    jcopy_sample_rows(image_data + input_rows - 1,
+		      image_data + row,
 		      1, num_cols);
   }
 }
@@ -220,8 +222,8 @@ pre_process_context (j_compress_ptr cinfo,
 	for (ci = 0; ci < cinfo->num_components; ci++) {
 	  int row;
 	  for (row = 1; row <= cinfo->max_v_samp_factor; row++) {
-	    jcopy_sample_rows(prep->color_buf[ci], 0,
-			      prep->color_buf[ci], -row,
+	    jcopy_sample_rows(prep->color_buf[ci],
+			      prep->color_buf[ci] - row,
 			      1, cinfo->image_width);
 	  }
 	}
@@ -277,10 +279,9 @@ create_context_buffer (j_compress_ptr cinfo)
   /* Grab enough space for fake row pointers for all the components;
    * we need five row groups' worth of pointers for each component.
    */
-  fake_buffer = (JSAMPARRAY)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				(cinfo->num_components * 5 * rgroup_height) *
-				SIZEOF(JSAMPROW));
+  fake_buffer = (JSAMPARRAY) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE,
+     (cinfo->num_components * 5 * rgroup_height) * SIZEOF(JSAMPROW));
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -324,10 +325,9 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
   if (need_full_buffer)		/* safety check */
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
 
-  prep = (my_prep_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_prep_controller));
-  cinfo->prep = (struct jpeg_c_prep_controller *) prep;
+  prep = (my_prep_ptr) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(my_prep_controller));
+  cinfo->prep = &prep->pub;
   prep->pub.start_pass = start_pass_prep;
 
   /* Allocate the color conversion buffer.
diff --git a/3rdparty/libjpeg/jcsample.c b/3rdparty/libjpeg/jcsample.c
index 4d36f85f356c..2372c4173fed 100644
--- a/3rdparty/libjpeg/jcsample.c
+++ b/3rdparty/libjpeg/jcsample.c
@@ -2,6 +2,7 @@
  * jcsample.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2003-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -200,7 +201,7 @@ fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 		     JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   /* Copy the data */
-  jcopy_sample_rows(input_data, 0, output_data, 0,
+  jcopy_sample_rows(input_data, output_data,
 		    cinfo->max_v_samp_factor, cinfo->image_width);
   /* Edge-expand */
   expand_right_edge(output_data, cinfo->max_v_samp_factor, cinfo->image_width,
@@ -483,10 +484,9 @@ jinit_downsampler (j_compress_ptr cinfo)
   boolean smoothok = TRUE;
   int h_in_group, v_in_group, h_out_group, v_out_group;
 
-  downsample = (my_downsample_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_downsampler));
-  cinfo->downsample = (struct jpeg_downsampler *) downsample;
+  downsample = (my_downsample_ptr) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(my_downsampler));
+  cinfo->downsample = &downsample->pub;
   downsample->pub.start_pass = start_pass_downsample;
   downsample->pub.downsample = sep_downsample;
   downsample->pub.need_context_rows = FALSE;
diff --git a/3rdparty/libjpeg/jctrans.c b/3rdparty/libjpeg/jctrans.c
index 5780de42e2b3..261dd2996e86 100644
--- a/3rdparty/libjpeg/jctrans.c
+++ b/3rdparty/libjpeg/jctrans.c
@@ -2,7 +2,7 @@
  * jctrans.c
  *
  * Copyright (C) 1995-1998, Thomas G. Lane.
- * Modified 2000-2017 by Guido Vollbeding.
+ * Modified 2000-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -224,7 +224,7 @@ typedef struct {
   struct jpeg_c_coef_controller pub; /* public fields */
 
   JDIMENSION iMCU_row_num;	/* iMCU row # within image */
-  JDIMENSION mcu_ctr;		/* counts MCUs processed in current row */
+  JDIMENSION MCU_ctr;		/* counts MCUs processed in current row */
   int MCU_vert_offset;		/* counts MCU rows within iMCU row */
   int MCU_rows_per_iMCU_row;	/* number of such rows needed */
 
@@ -232,7 +232,7 @@ typedef struct {
   jvirt_barray_ptr * whole_image;
 
   /* Workspace for constructing dummy blocks at right/bottom edges. */
-  JBLOCKROW dummy_buffer[C_MAX_BLOCKS_IN_MCU];
+  JBLOCK dummy_buffer[C_MAX_BLOCKS_IN_MCU];
 } my_coef_controller;
 
 typedef my_coef_controller * my_coef_ptr;
@@ -257,7 +257,7 @@ start_iMCU_row (j_compress_ptr cinfo)
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
   }
 
-  coef->mcu_ctr = 0;
+  coef->MCU_ctr = 0;
   coef->MCU_vert_offset = 0;
 }
 
@@ -315,25 +315,30 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
   /* Loop to process one whole iMCU row */
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
-    for (MCU_col_num = coef->mcu_ctr; MCU_col_num < cinfo->MCUs_per_row;
+    for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
 	 MCU_col_num++) {
       /* Construct list of pointers to DCT blocks belonging to this MCU */
       blkn = 0;			/* index of current DCT block within MCU */
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
 	compptr = cinfo->cur_comp_info[ci];
-	start_col = MCU_col_num * compptr->MCU_width;
 	blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
 						: compptr->last_col_width;
+	start_col = MCU_col_num * compptr->MCU_width;
 	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
 	  if (coef->iMCU_row_num < last_iMCU_row ||
-	      yindex+yoffset < compptr->last_row_height) {
+	      yoffset + yindex < compptr->last_row_height) {
 	    /* Fill in pointers to real blocks in this row */
-	    buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
-	    for (xindex = 0; xindex < blockcnt; xindex++)
+	    buffer_ptr = buffer[ci][yoffset + yindex] + start_col;
+	    xindex = blockcnt;
+	    do {
 	      MCU_buffer[blkn++] = buffer_ptr++;
+	    } while (--xindex);
+	    /* Dummy blocks at right edge */
+	    if ((xindex = compptr->MCU_width - blockcnt) == 0)
+	      continue;
 	  } else {
 	    /* At bottom of image, need a whole row of dummy blocks */
-	    xindex = 0;
+	    xindex = compptr->MCU_width;
 	  }
 	  /* Fill in any dummy blocks needed in this row.
 	   * Dummy blocks are filled in the same way as in jccoefct.c:
@@ -341,23 +346,23 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 	   * block's DC value.  The init routine has already zeroed the
 	   * AC entries, so we need only set the DC entries correctly.
 	   */
-	  for (; xindex < compptr->MCU_width; xindex++) {
-	    MCU_buffer[blkn] = coef->dummy_buffer[blkn];
-	    MCU_buffer[blkn][0][0] = MCU_buffer[blkn-1][0][0];
-	    blkn++;
-	  }
+	  buffer_ptr = coef->dummy_buffer + blkn;
+	  do {
+	    buffer_ptr[0][0] = MCU_buffer[blkn-1][0][0];
+	    MCU_buffer[blkn++] = buffer_ptr++;
+	  } while (--xindex);
 	}
       }
       /* Try to write the MCU. */
       if (! (*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) {
 	/* Suspension forced; update state counters and exit */
 	coef->MCU_vert_offset = yoffset;
-	coef->mcu_ctr = MCU_col_num;
+	coef->MCU_ctr = MCU_col_num;
 	return FALSE;
       }
     }
     /* Completed an MCU row, but perhaps not an iMCU row */
-    coef->mcu_ctr = 0;
+    coef->MCU_ctr = 0;
   }
   /* Completed the iMCU row, advance counters for next one */
   coef->iMCU_row_num++;
@@ -379,12 +384,9 @@ transencode_coef_controller (j_compress_ptr cinfo,
 			     jvirt_barray_ptr * coef_arrays)
 {
   my_coef_ptr coef;
-  JBLOCKROW buffer;
-  int i;
 
-  coef = (my_coef_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_coef_controller));
+  coef = (my_coef_ptr) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(my_coef_controller));
   cinfo->coef = &coef->pub;
   coef->pub.start_pass = start_pass_coef;
   coef->pub.compress_data = compress_output;
@@ -392,12 +394,6 @@ transencode_coef_controller (j_compress_ptr cinfo,
   /* Save pointer to virtual arrays */
   coef->whole_image = coef_arrays;
 
-  /* Allocate and pre-zero space for dummy DCT blocks. */
-  buffer = (JBLOCKROW)
-    (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
-  FMEMZERO((void FAR *) buffer, C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
-  for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
-    coef->dummy_buffer[i] = buffer + i;
-  }
+  /* Pre-zero space for dummy DCT blocks */
+  MEMZERO(coef->dummy_buffer, SIZEOF(coef->dummy_buffer));
 }
diff --git a/3rdparty/libjpeg/jdapimin.c b/3rdparty/libjpeg/jdapimin.c
index a6e0dd9fb823..785e52722666 100644
--- a/3rdparty/libjpeg/jdapimin.c
+++ b/3rdparty/libjpeg/jdapimin.c
@@ -2,7 +2,7 @@
  * jdapimin.c
  *
  * Copyright (C) 1994-1998, Thomas G. Lane.
- * Modified 2009-2013 by Guido Vollbeding.
+ * Modified 2009-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -114,7 +114,7 @@ jpeg_abort_decompress (j_decompress_ptr cinfo)
 LOCAL(void)
 default_decompress_parms (j_decompress_ptr cinfo)
 {
-  int cid0, cid1, cid2;
+  int cid0, cid1, cid2, cid3;
 
   /* Guess the input colorspace, and set output colorspace accordingly. */
   /* Note application may override our guesses. */
@@ -123,13 +123,16 @@ default_decompress_parms (j_decompress_ptr cinfo)
     cinfo->jpeg_color_space = JCS_GRAYSCALE;
     cinfo->out_color_space = JCS_GRAYSCALE;
     break;
-    
+
   case 3:
     cid0 = cinfo->comp_info[0].component_id;
     cid1 = cinfo->comp_info[1].component_id;
     cid2 = cinfo->comp_info[2].component_id;
 
-    /* First try to guess from the component IDs */
+    /* For robust detection of standard colorspaces
+     * regardless of the presence of special markers,
+     * check component IDs from SOF marker first.
+     */
     if      (cid0 == 0x01 && cid1 == 0x02 && cid2 == 0x03)
       cinfo->jpeg_color_space = JCS_YCbCr;
     else if (cid0 == 0x01 && cid1 == 0x22 && cid2 == 0x23)
@@ -151,7 +154,6 @@ default_decompress_parms (j_decompress_ptr cinfo)
       default:
 	WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform);
 	cinfo->jpeg_color_space = JCS_YCbCr;	/* assume it's YCbCr */
-	break;
       }
     } else {
       TRACEMS3(cinfo, 1, JTRC_UNKNOWN_IDS, cid0, cid1, cid2);
@@ -160,9 +162,22 @@ default_decompress_parms (j_decompress_ptr cinfo)
     /* Always guess RGB is proper output colorspace. */
     cinfo->out_color_space = JCS_RGB;
     break;
-    
+
   case 4:
-    if (cinfo->saw_Adobe_marker) {
+    cid0 = cinfo->comp_info[0].component_id;
+    cid1 = cinfo->comp_info[1].component_id;
+    cid2 = cinfo->comp_info[2].component_id;
+    cid3 = cinfo->comp_info[3].component_id;
+
+    /* For robust detection of standard colorspaces
+     * regardless of the presence of special markers,
+     * check component IDs from SOF marker first.
+     */
+    if      (cid0 == 0x01 && cid1 == 0x02 && cid2 == 0x03 && cid3 == 0x04)
+      cinfo->jpeg_color_space = JCS_YCCK;
+    else if (cid0 == 0x43 && cid1 == 0x4D && cid2 == 0x59 && cid3 == 0x4B)
+      cinfo->jpeg_color_space = JCS_CMYK;   /* ASCII 'C', 'M', 'Y', 'K' */
+    else if (cinfo->saw_Adobe_marker) {
       switch (cinfo->Adobe_transform) {
       case 0:
 	cinfo->jpeg_color_space = JCS_CMYK;
@@ -173,19 +188,17 @@ default_decompress_parms (j_decompress_ptr cinfo)
       default:
 	WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform);
 	cinfo->jpeg_color_space = JCS_YCCK;	/* assume it's YCCK */
-	break;
       }
     } else {
-      /* No special markers, assume straight CMYK. */
+      /* Unknown IDs and no special markers, assume straight CMYK. */
       cinfo->jpeg_color_space = JCS_CMYK;
     }
     cinfo->out_color_space = JCS_CMYK;
     break;
-    
+
   default:
     cinfo->jpeg_color_space = JCS_UNKNOWN;
     cinfo->out_color_space = JCS_UNKNOWN;
-    break;
   }
 
   /* Set defaults for other decompression parameters. */
diff --git a/3rdparty/libjpeg/jdarith.c b/3rdparty/libjpeg/jdarith.c
index 9e4dfdf76dfb..2c9abe23f2e3 100644
--- a/3rdparty/libjpeg/jdarith.c
+++ b/3rdparty/libjpeg/jdarith.c
@@ -1,7 +1,7 @@
 /*
  * jdarith.c
  *
- * Developed 1997-2019 by Guido Vollbeding.
+ * Developed 1997-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -239,7 +239,7 @@ process_restart (j_decompress_ptr cinfo)
  */
 
 METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   JBLOCKROW block;
@@ -318,7 +318,7 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   JBLOCKROW block;
@@ -400,7 +400,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   unsigned char *st;
@@ -434,7 +434,7 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   JBLOCKROW block;
@@ -509,7 +509,7 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu (j_decompress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
   jpeg_component_info * compptr;
diff --git a/3rdparty/libjpeg/jdatadst.c b/3rdparty/libjpeg/jdatadst.c
index 75ebd7c22d2f..b3b4798ea4c0 100644
--- a/3rdparty/libjpeg/jdatadst.c
+++ b/3rdparty/libjpeg/jdatadst.c
@@ -2,7 +2,7 @@
  * jdatadst.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * Modified 2009-2019 by Guido Vollbeding.
+ * Modified 2009-2022 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -28,17 +28,17 @@ extern void free JPP((void *ptr));
 
 /* Expanded data destination object for stdio output */
 
+#define OUTPUT_BUF_SIZE  4096	/* choose an efficiently fwrite'able size */
+
 typedef struct {
   struct jpeg_destination_mgr pub; /* public fields */
 
   FILE * outfile;		/* target stream */
-  JOCTET * buffer;		/* start of buffer */
+  JOCTET buffer[OUTPUT_BUF_SIZE]; /* output buffer */
 } my_destination_mgr;
 
 typedef my_destination_mgr * my_dest_ptr;
 
-#define OUTPUT_BUF_SIZE  4096	/* choose an efficiently fwrite'able size */
-
 
 /* Expanded data destination object for memory output */
 
@@ -65,10 +65,6 @@ init_destination (j_compress_ptr cinfo)
 {
   my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
 
-  /* Allocate the output buffer --- it will be released when done with image */
-  dest->buffer = (JOCTET *) (*cinfo->mem->alloc_small)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, OUTPUT_BUF_SIZE * SIZEOF(JOCTET));
-
   dest->pub.next_output_byte = dest->buffer;
   dest->pub.free_in_buffer = OUTPUT_BUF_SIZE;
 }
@@ -187,8 +183,8 @@ term_mem_destination (j_compress_ptr cinfo)
 
 /*
  * Prepare for output to a stdio stream.
- * The caller must have already opened the stream, and is responsible
- * for closing it after finishing compression.
+ * The caller must have already opened the stream,
+ * and is responsible for closing it after finishing compression.
  */
 
 GLOBAL(void)
diff --git a/3rdparty/libjpeg/jdatasrc.c b/3rdparty/libjpeg/jdatasrc.c
index 606ae11b4cc0..fd7a1a594bbb 100644
--- a/3rdparty/libjpeg/jdatasrc.c
+++ b/3rdparty/libjpeg/jdatasrc.c
@@ -2,7 +2,7 @@
  * jdatasrc.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * Modified 2009-2019 by Guido Vollbeding.
+ * Modified 2009-2022 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -23,18 +23,18 @@
 
 /* Expanded data source object for stdio input */
 
+#define INPUT_BUF_SIZE  4096	/* choose an efficiently fread'able size */
+
 typedef struct {
   struct jpeg_source_mgr pub;	/* public fields */
 
   FILE * infile;		/* source stream */
-  JOCTET * buffer;		/* start of buffer */
+  JOCTET buffer[INPUT_BUF_SIZE]; /* input buffer */
   boolean start_of_file;	/* have we gotten any data yet? */
 } my_source_mgr;
 
 typedef my_source_mgr * my_src_ptr;
 
-#define INPUT_BUF_SIZE  4096	/* choose an efficiently fread'able size */
-
 
 /*
  * Initialize source --- called by jpeg_read_header
@@ -204,8 +204,8 @@ term_source (j_decompress_ptr cinfo)
 
 /*
  * Prepare for input from a stdio stream.
- * The caller must have already opened the stream, and is responsible
- * for closing it after finishing decompression.
+ * The caller must have already opened the stream,
+ * and is responsible for closing it after finishing decompression.
  */
 
 GLOBAL(void)
@@ -213,19 +213,16 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile)
 {
   my_src_ptr src;
 
-  /* The source object and input buffer are made permanent so that a series
-   * of JPEG images can be read from the same file by calling jpeg_stdio_src
-   * only before the first one.  (If we discarded the buffer at the end of
-   * one image, we'd likely lose the start of the next one.)
+  /* The source object including the input buffer is made permanent so that
+   * a series of JPEG images can be read from the same file by calling
+   * jpeg_stdio_src only before the first one.  (If we discarded the buffer
+   * at the end of one image, we'd likely lose the start of the next one.)
    * This makes it unsafe to use this manager and a different source
    * manager serially with the same JPEG object.  Caveat programmer.
    */
   if (cinfo->src == NULL) {	/* first time for this JPEG object? */
     cinfo->src = (struct jpeg_source_mgr *) (*cinfo->mem->alloc_small)
       ((j_common_ptr) cinfo, JPOOL_PERMANENT, SIZEOF(my_source_mgr));
-    src = (my_src_ptr) cinfo->src;
-    src->buffer = (JOCTET *) (*cinfo->mem->alloc_small)
-      ((j_common_ptr) cinfo, JPOOL_PERMANENT, INPUT_BUF_SIZE * SIZEOF(JOCTET));
   }
 
   src = (my_src_ptr) cinfo->src;
diff --git a/3rdparty/libjpeg/jdcoefct.c b/3rdparty/libjpeg/jdcoefct.c
index ed02fc378f52..79ba42014086 100644
--- a/3rdparty/libjpeg/jdcoefct.c
+++ b/3rdparty/libjpeg/jdcoefct.c
@@ -2,7 +2,7 @@
  * jdcoefct.c
  *
  * Copyright (C) 1994-1997, Thomas G. Lane.
- * Modified 2002-2011 by Guido Vollbeding.
+ * Modified 2002-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -19,11 +19,13 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 
+
 /* Block smoothing is only applicable for progressive JPEG, so: */
 #ifndef D_PROGRESSIVE_SUPPORTED
 #undef BLOCK_SMOOTHING_SUPPORTED
 #endif
 
+
 /* Private buffer controller object */
 
 typedef struct {
@@ -38,11 +40,8 @@ typedef struct {
   /* The output side's location is represented by cinfo->output_iMCU_row. */
 
   /* In single-pass modes, it's sufficient to buffer just one MCU.
-   * We allocate a workspace of D_MAX_BLOCKS_IN_MCU coefficient blocks,
+   * We append a workspace of D_MAX_BLOCKS_IN_MCU coefficient blocks,
    * and let the entropy decoder write into that workspace each time.
-   * (On 80x86, the workspace is FAR even though it's not really very big;
-   * this is to keep the module interfaces unchanged when a large coefficient
-   * buffer is necessary.)
    * In multi-pass modes, this array points to the current MCU's blocks
    * within the virtual arrays; it is used only by the input side.
    */
@@ -58,10 +57,14 @@ typedef struct {
   int * coef_bits_latch;
 #define SAVED_COEFS  6		/* we save coef_bits[0..5] */
 #endif
+
+  /* Workspace for single-pass modes (omitted otherwise). */
+  JBLOCK blk_buffer[D_MAX_BLOCKS_IN_MCU];
 } my_coef_controller;
 
 typedef my_coef_controller * my_coef_ptr;
 
+
 /* Forward declarations */
 METHODDEF(int) decompress_onepass
 	JPP((j_decompress_ptr cinfo, JSAMPIMAGE output_buf));
@@ -151,7 +154,8 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
   JDIMENSION MCU_col_num;	/* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
-  int blkn, ci, xindex, yindex, yoffset, useful_width;
+  int ci, xindex, yindex, yoffset, useful_width;
+  JBLOCKROW blkp;
   JSAMPARRAY output_ptr;
   JDIMENSION start_col, output_col;
   jpeg_component_info *compptr;
@@ -162,10 +166,10 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
        yoffset++) {
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
 	 MCU_col_num++) {
+      blkp = coef->blk_buffer;	/* pointer to current DCT block within MCU */
       /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
       if (cinfo->lim_Se)	/* can bypass in DC only case */
-	FMEMZERO((void FAR *) coef->MCU_buffer[0],
-		 (size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK)));
+	MEMZERO(blkp, cinfo->blocks_in_MCU * SIZEOF(JBLOCK));
       if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
 	/* Suspension forced; update state counters and exit */
 	coef->MCU_vert_offset = yoffset;
@@ -173,37 +177,34 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 	return JPEG_SUSPENDED;
       }
       /* Determine where data should go in output_buf and do the IDCT thing.
-       * We skip dummy blocks at the right and bottom edges (but blkn gets
-       * incremented past them!).  Note the inner loop relies on having
-       * allocated the MCU_buffer[] blocks sequentially.
+       * We skip dummy blocks at the right and bottom edges (but blkp gets
+       * incremented past them!).
        */
-      blkn = 0;			/* index of current DCT block within MCU */
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
 	compptr = cinfo->cur_comp_info[ci];
 	/* Don't bother to IDCT an uninteresting component. */
 	if (! compptr->component_needed) {
-	  blkn += compptr->MCU_blocks;
+	  blkp += compptr->MCU_blocks;
 	  continue;
 	}
 	inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
-	useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-						    : compptr->last_col_width;
 	output_ptr = output_buf[compptr->component_index] +
 	  yoffset * compptr->DCT_v_scaled_size;
+	useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
+						    : compptr->last_col_width;
 	start_col = MCU_col_num * compptr->MCU_sample_width;
 	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
 	  if (cinfo->input_iMCU_row < last_iMCU_row ||
-	      yoffset+yindex < compptr->last_row_height) {
+	      yoffset + yindex < compptr->last_row_height) {
 	    output_col = start_col;
 	    for (xindex = 0; xindex < useful_width; xindex++) {
-	      (*inverse_DCT) (cinfo, compptr,
-			      (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
+	      (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) (blkp + xindex),
 			      output_ptr, output_col);
 	      output_col += compptr->DCT_h_scaled_size;
 	    }
+	    output_ptr += compptr->DCT_v_scaled_size;
 	  }
-	  blkn += compptr->MCU_width;
-	  output_ptr += compptr->DCT_v_scaled_size;
+	  blkp += compptr->MCU_width;
 	}
       }
     }
@@ -212,7 +213,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
   }
   /* Completed the iMCU row, advance counters for next one */
   cinfo->output_iMCU_row++;
-  if (++(cinfo->input_iMCU_row) < cinfo->total_iMCU_rows) {
+  if (++(cinfo->input_iMCU_row) <= last_iMCU_row) {
     start_iMCU_row(cinfo);
     return JPEG_ROW_COMPLETED;
   }
@@ -247,8 +248,9 @@ consume_data (j_decompress_ptr cinfo)
 {
   my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
   JDIMENSION MCU_col_num;	/* index of current MCU within row */
-  int blkn, ci, xindex, yindex, yoffset;
+  int ci, xindex, yindex, yoffset;
   JDIMENSION start_col;
+  JBLOCKARRAY blkp;
   JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN];
   JBLOCKROW buffer_ptr;
   jpeg_component_info *compptr;
@@ -272,15 +274,16 @@ consume_data (j_decompress_ptr cinfo)
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num < cinfo->MCUs_per_row;
 	 MCU_col_num++) {
       /* Construct list of pointers to DCT blocks belonging to this MCU */
-      blkn = 0;			/* index of current DCT block within MCU */
+      blkp = coef->MCU_buffer;	/* pointer to current DCT block within MCU */
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
 	compptr = cinfo->cur_comp_info[ci];
 	start_col = MCU_col_num * compptr->MCU_width;
 	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-	  buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
-	  for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
-	    coef->MCU_buffer[blkn++] = buffer_ptr++;
-	  }
+	  buffer_ptr = buffer[ci][yoffset + yindex] + start_col;
+	  xindex = compptr->MCU_width;
+	  do {
+	    *blkp++ = buffer_ptr++;
+	  } while (--xindex);
 	}
       }
       /* Try to fetch the MCU. */
@@ -370,7 +373,7 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     }
   }
 
-  if (++(cinfo->output_iMCU_row) < cinfo->total_iMCU_rows)
+  if (++(cinfo->output_iMCU_row) <= last_iMCU_row)
     return JPEG_ROW_COMPLETED;
   return JPEG_SCAN_COMPLETED;
 }
@@ -419,10 +422,9 @@ smoothing_ok (j_decompress_ptr cinfo)
 
   /* Allocate latch area if not already done */
   if (coef->coef_bits_latch == NULL)
-    coef->coef_bits_latch = (int *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  cinfo->num_components *
-				  (SAVED_COEFS * SIZEOF(int)));
+    coef->coef_bits_latch = (int *) (*cinfo->mem->alloc_small)
+      ((j_common_ptr) cinfo, JPOOL_IMAGE,
+       cinfo->num_components * (SAVED_COEFS * SIZEOF(int)));
   coef_bits_latch = coef->coef_bits_latch;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
@@ -662,7 +664,7 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     }
   }
 
-  if (++(cinfo->output_iMCU_row) < cinfo->total_iMCU_rows)
+  if (++(cinfo->output_iMCU_row) <= last_iMCU_row)
     return JPEG_ROW_COMPLETED;
   return JPEG_SCAN_COMPLETED;
 }
@@ -679,17 +681,6 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
 {
   my_coef_ptr coef;
 
-  coef = (my_coef_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_coef_controller));
-  cinfo->coef = (struct jpeg_d_coef_controller *) coef;
-  coef->pub.start_input_pass = start_input_pass;
-  coef->pub.start_output_pass = start_output_pass;
-#ifdef BLOCK_SMOOTHING_SUPPORTED
-  coef->coef_bits_latch = NULL;
-#endif
-
-  /* Create the coefficient buffer. */
   if (need_full_buffer) {
 #ifdef D_MULTISCAN_FILES_SUPPORTED
     /* Allocate a full-image virtual array for each component, */
@@ -698,6 +689,9 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
     int ci, access_rows;
     jpeg_component_info *compptr;
 
+    coef = (my_coef_ptr) (*cinfo->mem->alloc_small)
+      ((j_common_ptr) cinfo, JPOOL_IMAGE,
+       SIZEOF(my_coef_controller) - SIZEOF(coef->blk_buffer));
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
 	 ci++, compptr++) {
       access_rows = compptr->v_samp_factor;
@@ -722,20 +716,29 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
 #endif
   } else {
     /* We only need a single-MCU buffer. */
-    JBLOCKROW buffer;
-    int i;
-
-    buffer = (JBLOCKROW)
-      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
-    for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
-      coef->MCU_buffer[i] = buffer + i;
-    }
+    JBLOCKARRAY blkp;
+    JBLOCKROW buffer_ptr;
+    int bi;
+
+    coef = (my_coef_ptr) (*cinfo->mem->alloc_small)
+      ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(my_coef_controller));
+    buffer_ptr = coef->blk_buffer;
     if (cinfo->lim_Se == 0)	/* DC only case: want to bypass later */
-      FMEMZERO((void FAR *) buffer,
-	       (size_t) (D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK)));
+      MEMZERO(buffer_ptr, SIZEOF(coef->blk_buffer));
+    blkp = coef->MCU_buffer;
+    bi = D_MAX_BLOCKS_IN_MCU;
+    do {
+      *blkp++ = buffer_ptr++;
+    } while (--bi);
     coef->pub.consume_data = dummy_consume_data;
     coef->pub.decompress_data = decompress_onepass;
     coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */
   }
+
+  coef->pub.start_input_pass = start_input_pass;
+  coef->pub.start_output_pass = start_output_pass;
+#ifdef BLOCK_SMOOTHING_SUPPORTED
+  coef->coef_bits_latch = NULL;
+#endif
+  cinfo->coef = &coef->pub;
 }
diff --git a/3rdparty/libjpeg/jdcolor.c b/3rdparty/libjpeg/jdcolor.c
index 3746c2e973fb..6b40fb53404d 100644
--- a/3rdparty/libjpeg/jdcolor.c
+++ b/3rdparty/libjpeg/jdcolor.c
@@ -2,7 +2,7 @@
  * jdcolor.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * Modified 2011-2019 by Guido Vollbeding.
+ * Modified 2011-2023 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -32,7 +32,9 @@ typedef struct {
   INT32 * Cb_g_tab;		/* => table for Cb to G conversion */
 
   /* Private state for RGB->Y conversion */
-  INT32 * rgb_y_tab;		/* => table for RGB to Y conversion */
+  INT32 * R_y_tab;		/* => table for R to Y conversion */
+  INT32 * G_y_tab;		/* => table for G to Y conversion */
+  INT32 * B_y_tab;		/* => table for B to Y conversion */
 } my_color_deconverter;
 
 typedef my_color_deconverter * my_cconvert_ptr;
@@ -87,29 +89,17 @@ typedef my_color_deconverter * my_cconvert_ptr;
  * by precalculating the constants times Cb and Cr for all possible values.
  * For 8-bit JSAMPLEs this is very reasonable (only 256 entries per table);
  * for 9-bit to 12-bit samples it is still acceptable.  It's not very
- * reasonable for 16-bit samples, but if you want lossless storage you
- * shouldn't be changing colorspace anyway.
- * The Cr=>R and Cb=>B values can be rounded to integers in advance; the
- * values for the G calculation are left scaled up, since we must add them
- * together before rounding.
+ * reasonable for 16-bit samples, but if you want lossless storage
+ * you shouldn't be changing colorspace anyway.
+ * The Cr=>R and Cb=>B values can be rounded to integers in advance;
+ * the values for the G calculation are left scaled up,
+ * since we must add them together before rounding.
  */
 
 #define SCALEBITS	16	/* speediest right-shift on some machines */
 #define ONE_HALF	((INT32) 1 << (SCALEBITS-1))
 #define FIX(x)		((INT32) ((x) * (1L<<SCALEBITS) + 0.5))
 
-/* We allocate one big table for RGB->Y conversion and divide it up into
- * three parts, instead of doing three alloc_small requests.  This lets us
- * use a single table base address, which can be held in a register in the
- * inner loops on many machines (more than can hold all three addresses,
- * anyway).
- */
-
-#define R_Y_OFF		0			/* offset to R => Y section */
-#define G_Y_OFF		(1*(MAXJSAMPLE+1))	/* offset to G => Y section */
-#define B_Y_OFF		(2*(MAXJSAMPLE+1))	/* etc. */
-#define TABLE_SIZE	(3*(MAXJSAMPLE+1))
-
 
 /*
  * Initialize tables for YCbCr->RGB and BG_YCC->RGB colorspace conversion.
@@ -249,17 +239,19 @@ LOCAL(void)
 build_rgb_y_table (j_decompress_ptr cinfo)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  INT32 * rgb_y_tab;
   INT32 i;
 
-  /* Allocate and fill in the conversion tables. */
-  cconvert->rgb_y_tab = rgb_y_tab = (INT32 *) (*cinfo->mem->alloc_small)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, TABLE_SIZE * SIZEOF(INT32));
+  cconvert->R_y_tab = (INT32 *) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE+1) * SIZEOF(INT32));
+  cconvert->G_y_tab = (INT32 *) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE+1) * SIZEOF(INT32));
+  cconvert->B_y_tab = (INT32 *) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE+1) * SIZEOF(INT32));
 
   for (i = 0; i <= MAXJSAMPLE; i++) {
-    rgb_y_tab[i+R_Y_OFF] = FIX(0.299) * i;
-    rgb_y_tab[i+G_Y_OFF] = FIX(0.587) * i;
-    rgb_y_tab[i+B_Y_OFF] = FIX(0.114) * i + ONE_HALF;
+    cconvert->R_y_tab[i] = FIX(0.299) * i;
+    cconvert->G_y_tab[i] = FIX(0.587) * i;
+    cconvert->B_y_tab[i] = FIX(0.114) * i + ONE_HALF;
   }
 }
 
@@ -274,8 +266,10 @@ rgb_gray_convert (j_decompress_ptr cinfo,
 		  JSAMPARRAY output_buf, int num_rows)
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  register int r, g, b;
-  register INT32 * ctab = cconvert->rgb_y_tab;
+  register INT32 y;
+  register INT32 * Rytab = cconvert->R_y_tab;
+  register INT32 * Gytab = cconvert->G_y_tab;
+  register INT32 * Bytab = cconvert->B_y_tab;
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
@@ -288,13 +282,10 @@ rgb_gray_convert (j_decompress_ptr cinfo,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr0[col]);
-      g = GETJSAMPLE(inptr1[col]);
-      b = GETJSAMPLE(inptr2[col]);
-      /* Y */
-      outptr[col] = (JSAMPLE)
-		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-		 >> SCALEBITS);
+      y  = Rytab[GETJSAMPLE(inptr0[col])];
+      y += Gytab[GETJSAMPLE(inptr1[col])];
+      y += Bytab[GETJSAMPLE(inptr2[col])];
+      outptr[col] = (JSAMPLE) (y >> SCALEBITS);
     }
   }
 }
@@ -354,7 +345,10 @@ rgb1_gray_convert (j_decompress_ptr cinfo,
 {
   my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
   register int r, g, b;
-  register INT32 * ctab = cconvert->rgb_y_tab;
+  register INT32 y;
+  register INT32 * Rytab = cconvert->R_y_tab;
+  register INT32 * Gytab = cconvert->G_y_tab;
+  register INT32 * Bytab = cconvert->B_y_tab;
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
@@ -373,12 +367,10 @@ rgb1_gray_convert (j_decompress_ptr cinfo,
       /* Assume that MAXJSAMPLE+1 is a power of 2, so that the MOD
        * (modulo) operator is equivalent to the bitmask operator AND.
        */
-      r = (r + g - CENTERJSAMPLE) & MAXJSAMPLE;
-      b = (b + g - CENTERJSAMPLE) & MAXJSAMPLE;
-      /* Y */
-      outptr[col] = (JSAMPLE)
-		((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-		 >> SCALEBITS);
+      y  = Rytab[(r + g - CENTERJSAMPLE) & MAXJSAMPLE];
+      y += Gytab[g];
+      y += Bytab[(b + g - CENTERJSAMPLE) & MAXJSAMPLE];
+      outptr[col] = (JSAMPLE) (y >> SCALEBITS);
     }
   }
 }
@@ -420,7 +412,7 @@ rgb_convert (j_decompress_ptr cinfo,
 /*
  * Color conversion for no colorspace change: just copy the data,
  * converting from separate-planes to interleaved representation.
- * We assume out_color_components == num_components.
+ * Note: Omit uninteresting components in output buffer.
  */
 
 METHODDEF(void)
@@ -431,22 +423,27 @@ null_convert (j_decompress_ptr cinfo,
   register JSAMPROW outptr;
   register JSAMPROW inptr;
   register JDIMENSION count;
-  register int num_comps = cinfo->num_components;
+  register int out_comps = cinfo->out_color_components;
   JDIMENSION num_cols = cinfo->output_width;
+  JSAMPROW startptr;
   int ci;
+  jpeg_component_info *compptr;
 
   while (--num_rows >= 0) {
     /* It seems fastest to make a separate pass for each component. */
-    for (ci = 0; ci < num_comps; ci++) {
+    startptr = *output_buf++;
+    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+	 ci++, compptr++) {
+      if (! compptr->component_needed)
+	continue;		/* skip uninteresting component */
       inptr = input_buf[ci][input_row];
-      outptr = output_buf[0] + ci;
+      outptr = startptr++;
       for (count = num_cols; count > 0; count--) {
 	*outptr = *inptr++;	/* don't need GETJSAMPLE() here */
-	outptr += num_comps;
+	outptr += out_comps;
       }
     }
     input_row++;
-    output_buf++;
   }
 }
 
@@ -462,7 +459,7 @@ grayscale_convert (j_decompress_ptr cinfo,
 		   JSAMPIMAGE input_buf, JDIMENSION input_row,
 		   JSAMPARRAY output_buf, int num_rows)
 {
-  jcopy_sample_rows(input_buf[0], (int) input_row, output_buf, 0,
+  jcopy_sample_rows(input_buf[0] + input_row, output_buf,
 		    num_rows, cinfo->output_width);
 }
 
@@ -549,6 +546,46 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
 }
 
 
+/*
+ * Convert CMYK to YK part of YCCK for colorless output.
+ * We assume build_rgb_y_table has been called.
+ */
+
+METHODDEF(void)
+cmyk_yk_convert (j_decompress_ptr cinfo,
+		 JSAMPIMAGE input_buf, JDIMENSION input_row,
+		 JSAMPARRAY output_buf, int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  register INT32 y;
+  register INT32 * Rytab = cconvert->R_y_tab;
+  register INT32 * Gytab = cconvert->G_y_tab;
+  register INT32 * Bytab = cconvert->B_y_tab;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2, inptr3;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    inptr3 = input_buf[3][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    for (col = 0; col < num_cols; col++) {
+      y  = Rytab[MAXJSAMPLE - GETJSAMPLE(inptr0[col])];
+      y += Gytab[MAXJSAMPLE - GETJSAMPLE(inptr1[col])];
+      y += Bytab[MAXJSAMPLE - GETJSAMPLE(inptr2[col])];
+      outptr[0] = (JSAMPLE) (y >> SCALEBITS);
+      /* K passes through unchanged */
+      outptr[1] = inptr3[col];	/* don't need GETJSAMPLE here */
+      outptr += 2;
+    }
+  }
+}
+
+
 /*
  * Empty method for start_pass.
  */
@@ -568,7 +605,7 @@ GLOBAL(void)
 jinit_color_deconverter (j_decompress_ptr cinfo)
 {
   my_cconvert_ptr cconvert;
-  int ci;
+  int ci, i;
 
   cconvert = (my_cconvert_ptr) (*cinfo->mem->alloc_small)
     ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(my_color_deconverter));
@@ -608,7 +645,7 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
     ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
 
   /* Set out_color_components and conversion method based on requested space.
-   * Also clear the component_needed flags for any unused components,
+   * Also adjust the component_needed flags for any unused components,
    * so that earlier pipeline stages can avoid useless computation.
    */
 
@@ -674,9 +711,9 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
     break;
 
   case JCS_BG_RGB:
-    cinfo->out_color_components = RGB_PIXELSIZE;
     if (cinfo->jpeg_color_space != JCS_BG_RGB)
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+    cinfo->out_color_components = RGB_PIXELSIZE;
     switch (cinfo->color_transform) {
     case JCT_NONE:
       cconvert->pub.color_convert = rgb_convert;
@@ -690,25 +727,38 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
     break;
 
   case JCS_CMYK:
+    if (cinfo->jpeg_color_space != JCS_YCCK)
+      goto def_label;
     cinfo->out_color_components = 4;
-    switch (cinfo->jpeg_color_space) {
-    case JCS_YCCK:
-      cconvert->pub.color_convert = ycck_cmyk_convert;
-      build_ycc_rgb_table(cinfo);
-      break;
-    case JCS_CMYK:
-      cconvert->pub.color_convert = null_convert;
-      break;
-    default:
-      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
-    }
+    cconvert->pub.color_convert = ycck_cmyk_convert;
+    build_ycc_rgb_table(cinfo);
+    break;
+
+  case JCS_YCCK:
+    if (cinfo->jpeg_color_space != JCS_CMYK ||
+	/* Support only YK part of YCCK for colorless output */
+	! cinfo->comp_info[0].component_needed ||
+	  cinfo->comp_info[1].component_needed ||
+	  cinfo->comp_info[2].component_needed ||
+	! cinfo->comp_info[3].component_needed)
+      goto def_label;
+    cinfo->out_color_components = 2;
+    /* Need all components on input side */
+    cinfo->comp_info[1].component_needed = TRUE;
+    cinfo->comp_info[2].component_needed = TRUE;
+    cconvert->pub.color_convert = cmyk_yk_convert;
+    build_rgb_y_table(cinfo);
     break;
 
-  default:		/* permit null conversion to same output space */
+  default: def_label:	/* permit null conversion to same output space */
     if (cinfo->out_color_space != cinfo->jpeg_color_space)
       /* unsupported non-null conversion */
       ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
-    cinfo->out_color_components = cinfo->num_components;
+    i = 0;
+    for (ci = 0; ci < cinfo->num_components; ci++)
+      if (cinfo->comp_info[ci].component_needed)
+	i++;		/* count output color components */
+    cinfo->out_color_components = i;
     cconvert->pub.color_convert = null_convert;
   }
 
diff --git a/3rdparty/libjpeg/jdct.h b/3rdparty/libjpeg/jdct.h
index c8ec6cd90e94..0f251590c494 100644
--- a/3rdparty/libjpeg/jdct.h
+++ b/3rdparty/libjpeg/jdct.h
@@ -2,7 +2,7 @@
  * jdct.h
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * Modified 2002-2019 by Guido Vollbeding.
+ * Modified 2002-2023 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -158,7 +158,7 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
 #define jpeg_idct_6x12		jRD6x12
 #define jpeg_idct_5x10		jRD5x10
 #define jpeg_idct_4x8		jRD4x8
-#define jpeg_idct_3x6		jRD3x8
+#define jpeg_idct_3x6		jRD3x6
 #define jpeg_idct_2x4		jRD2x4
 #define jpeg_idct_1x2		jRD1x2
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
diff --git a/3rdparty/libjpeg/jdhuff.c b/3rdparty/libjpeg/jdhuff.c
index aea06f60397d..f175f0c3239c 100644
--- a/3rdparty/libjpeg/jdhuff.c
+++ b/3rdparty/libjpeg/jdhuff.c
@@ -2,7 +2,7 @@
  * jdhuff.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * Modified 2006-2019 by Guido Vollbeding.
+ * Modified 2006-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -704,7 +704,7 @@ process_restart (j_decompress_ptr cinfo)
  */
 
 METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   int Al = cinfo->Al;
@@ -776,7 +776,7 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   register int s, k, r;
@@ -864,7 +864,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   JCOEF p1;
@@ -913,7 +913,7 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   register int s, k, r;
@@ -1072,7 +1072,7 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_sub (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_sub (j_decompress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   const int * natural_order;
@@ -1201,7 +1201,7 @@ decode_mcu_sub (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu (j_decompress_ptr cinfo, JBLOCKARRAY MCU_data)
 {
   huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
   int blkn;
diff --git a/3rdparty/libjpeg/jdinput.c b/3rdparty/libjpeg/jdinput.c
index 0199553e896a..29fbef90bfcb 100644
--- a/3rdparty/libjpeg/jdinput.c
+++ b/3rdparty/libjpeg/jdinput.c
@@ -2,7 +2,7 @@
  * jdinput.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * Modified 2002-2013 by Guido Vollbeding.
+ * Modified 2002-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -330,7 +330,6 @@ initial_setup (j_decompress_ptr cinfo)
     default:
       ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
 	       cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
-      break;
     }
 
   /* We initialize DCT_scaled_size and min_DCT_scaled_size to block_size.
@@ -391,16 +390,16 @@ per_scan_setup (j_decompress_ptr cinfo)
 {
   int ci, mcublks, tmp;
   jpeg_component_info *compptr;
-  
+
   if (cinfo->comps_in_scan == 1) {
-    
+
     /* Noninterleaved (single-component) scan */
     compptr = cinfo->cur_comp_info[0];
-    
+
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = compptr->width_in_blocks;
     cinfo->MCU_rows_in_scan = compptr->height_in_blocks;
-    
+
     /* For noninterleaved scan, always one block per MCU */
     compptr->MCU_width = 1;
     compptr->MCU_height = 1;
@@ -413,28 +412,26 @@ per_scan_setup (j_decompress_ptr cinfo)
     tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
     if (tmp == 0) tmp = compptr->v_samp_factor;
     compptr->last_row_height = tmp;
-    
+
     /* Prepare array describing MCU composition */
     cinfo->blocks_in_MCU = 1;
     cinfo->MCU_membership[0] = 0;
-    
+
   } else {
-    
+
     /* Interleaved (multi-component) scan */
     if (cinfo->comps_in_scan <= 0 || cinfo->comps_in_scan > MAX_COMPS_IN_SCAN)
       ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->comps_in_scan,
 	       MAX_COMPS_IN_SCAN);
-    
+
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
       jdiv_round_up((long) cinfo->image_width,
 		    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
-    cinfo->MCU_rows_in_scan = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height,
-		    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
-    
+    cinfo->MCU_rows_in_scan = cinfo->total_iMCU_rows;
+
     cinfo->blocks_in_MCU = 0;
-    
+
     for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
       compptr = cinfo->cur_comp_info[ci];
       /* Sampling factors give # of blocks of component in each MCU */
@@ -457,7 +454,7 @@ per_scan_setup (j_decompress_ptr cinfo)
 	cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci;
       }
     }
-    
+
   }
 }
 
@@ -501,9 +498,8 @@ latch_quant_tables (j_decompress_ptr cinfo)
 	cinfo->quant_tbl_ptrs[qtblno] == NULL)
       ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
     /* OK, save away the quantization table */
-    qtbl = (JQUANT_TBL *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  SIZEOF(JQUANT_TBL));
+    qtbl = (JQUANT_TBL *) (*cinfo->mem->alloc_small)
+      ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(JQUANT_TBL));
     MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], SIZEOF(JQUANT_TBL));
     compptr->quant_table = qtbl;
   }
@@ -644,9 +640,8 @@ jinit_input_controller (j_decompress_ptr cinfo)
   my_inputctl_ptr inputctl;
 
   /* Create subobject in permanent pool */
-  inputctl = (my_inputctl_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-				SIZEOF(my_input_controller));
+  inputctl = (my_inputctl_ptr) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_PERMANENT, SIZEOF(my_input_controller));
   cinfo->inputctl = &inputctl->pub;
   /* Initialize method pointers */
   inputctl->pub.consume_input = consume_markers;
diff --git a/3rdparty/libjpeg/jdmainct.c b/3rdparty/libjpeg/jdmainct.c
index 4d738fbaed6f..1cd66d853bfd 100644
--- a/3rdparty/libjpeg/jdmainct.c
+++ b/3rdparty/libjpeg/jdmainct.c
@@ -2,7 +2,7 @@
  * jdmainct.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * Modified 2002-2016 by Guido Vollbeding.
+ * Modified 2002-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -170,21 +170,22 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
   /* Get top-level space for component array pointers.
    * We alloc both arrays with one call to save a few cycles.
    */
-  mainp->xbuffer[0] = (JSAMPIMAGE)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				cinfo->num_components * 2 * SIZEOF(JSAMPARRAY));
+  mainp->xbuffer[0] = (JSAMPIMAGE) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE,
+     cinfo->num_components * 2 * SIZEOF(JSAMPARRAY));
   mainp->xbuffer[1] = mainp->xbuffer[0] + cinfo->num_components;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
+    if (! compptr->component_needed)
+      continue;			/* skip uninteresting component */
     rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
       cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
     /* Get space for pointer lists --- M+4 row groups in each list.
      * We alloc both pointer lists with one call to save a few cycles.
      */
-    xbuf = (JSAMPARRAY)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				  2 * (rgroup * (M + 4)) * SIZEOF(JSAMPROW));
+    xbuf = (JSAMPARRAY) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo,
+      JPOOL_IMAGE, 2 * (rgroup * (M + 4)) * SIZEOF(JSAMPROW));
     xbuf += rgroup;		/* want one row group at negative offsets */
     mainp->xbuffer[0][ci] = xbuf;
     xbuf += rgroup * (M + 4);
@@ -210,6 +211,8 @@ make_funny_pointers (j_decompress_ptr cinfo)
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
+    if (! compptr->component_needed)
+      continue;			/* skip uninteresting component */
     rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
       cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
     xbuf0 = mainp->xbuffer[0][ci];
@@ -250,6 +253,8 @@ set_wraparound_pointers (j_decompress_ptr cinfo)
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
+    if (! compptr->component_needed)
+      continue;			/* skip uninteresting component */
     rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
       cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
     xbuf0 = mainp->xbuffer[0][ci];
@@ -278,6 +283,8 @@ set_bottom_pointers (j_decompress_ptr cinfo)
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
+    if (! compptr->component_needed)
+      continue;			/* skip uninteresting component */
     /* Count sample rows in one iMCU row and in one row group */
     iMCUheight = compptr->v_samp_factor * compptr->DCT_v_scaled_size;
     rgroup = iMCUheight / cinfo->min_DCT_v_scaled_size;
@@ -333,7 +340,6 @@ start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
 #endif
   default:
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    break;
   }
 }
 
@@ -344,9 +350,8 @@ start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
  */
 
 METHODDEF(void)
-process_data_simple_main (j_decompress_ptr cinfo,
-			  JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-			  JDIMENSION out_rows_avail)
+process_data_simple_main (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+			  JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
   my_main_ptr mainp = (my_main_ptr) cinfo->main;
 
@@ -375,9 +380,8 @@ process_data_simple_main (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-process_data_context_main (j_decompress_ptr cinfo,
-			   JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-			   JDIMENSION out_rows_avail)
+process_data_context_main (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+			   JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
   my_main_ptr mainp = (my_main_ptr) cinfo->main;
 
@@ -449,13 +453,12 @@ process_data_context_main (j_decompress_ptr cinfo,
 #ifdef QUANT_2PASS_SUPPORTED
 
 METHODDEF(void)
-process_data_crank_post (j_decompress_ptr cinfo,
-			 JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-			 JDIMENSION out_rows_avail)
+process_data_crank_post (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+			 JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
   (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE) NULL,
-				     (JDIMENSION *) NULL, (JDIMENSION) 0,
-				     output_buf, out_row_ctr, out_rows_avail);
+			(JDIMENSION *) NULL, (JDIMENSION) 0,
+			output_buf, out_row_ctr, out_rows_avail);
 }
 
 #endif /* QUANT_2PASS_SUPPORTED */
@@ -472,9 +475,8 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
   int ci, rgroup, ngroups;
   jpeg_component_info *compptr;
 
-  mainp = (my_main_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_main_controller));
+  mainp = (my_main_ptr) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(my_main_controller));
   cinfo->main = &mainp->pub;
   mainp->pub.start_pass = start_pass_main;
 
@@ -497,6 +499,8 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
+    if (! compptr->component_needed)
+      continue;			/* skip uninteresting component */
     rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
       cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
     mainp->buffer[ci] = (*cinfo->mem->alloc_sarray)
diff --git a/3rdparty/libjpeg/jdmarker.c b/3rdparty/libjpeg/jdmarker.c
index c10fde60534f..c5dfa2dec019 100644
--- a/3rdparty/libjpeg/jdmarker.c
+++ b/3rdparty/libjpeg/jdmarker.c
@@ -1039,7 +1039,6 @@ next_marker (j_decompress_ptr cinfo)
   }
 
   if (cinfo->marker->discarded_bytes != 0) {
-    WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
     cinfo->marker->discarded_bytes = 0;
   }
 
diff --git a/3rdparty/libjpeg/jdmaster.c b/3rdparty/libjpeg/jdmaster.c
index c309f7629faa..3070b7bb41f5 100644
--- a/3rdparty/libjpeg/jdmaster.c
+++ b/3rdparty/libjpeg/jdmaster.c
@@ -2,7 +2,7 @@
  * jdmaster.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * Modified 2002-2019 by Guido Vollbeding.
+ * Modified 2002-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -103,10 +103,8 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
  * This function is used for full decompression.
  */
 {
-#ifdef IDCT_SCALING_SUPPORTED
-  int ci, ssize;
+  int ci, i;
   jpeg_component_info *compptr;
-#endif
 
   /* Prevent application from calling me at wrong times */
   if (cinfo->global_state != DSTATE_READY)
@@ -124,7 +122,7 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
    */
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    ssize = 1;
+    int ssize = 1;
     if (! cinfo->raw_data_out)
       while (cinfo->min_DCT_h_scaled_size * ssize <=
 	     (cinfo->do_fancy_upsampling ? DCTSIZE : DCTSIZE / 2) &&
@@ -166,27 +164,22 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
 #endif /* IDCT_SCALING_SUPPORTED */
 
   /* Report number of components in selected colorspace. */
-  /* Probably this should be in the color conversion module... */
+  /* This should correspond to the actual code in the color conversion module. */
   switch (cinfo->out_color_space) {
   case JCS_GRAYSCALE:
     cinfo->out_color_components = 1;
     break;
   case JCS_RGB:
   case JCS_BG_RGB:
-#if RGB_PIXELSIZE != 3
     cinfo->out_color_components = RGB_PIXELSIZE;
     break;
-#endif /* else share code with YCbCr */
-  case JCS_YCbCr:
-  case JCS_BG_YCC:
-    cinfo->out_color_components = 3;
-    break;
-  case JCS_CMYK:
-  case JCS_YCCK:
-    cinfo->out_color_components = 4;
-    break;
-  default:			/* else must be same colorspace as in file */
-    cinfo->out_color_components = cinfo->num_components;
+  default:	/* YCCK <=> CMYK conversion or same colorspace as in file */
+    i = 0;
+    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+	 ci++, compptr++)
+      if (compptr->component_needed)
+	i++;	/* count output color components */
+    cinfo->out_color_components = i;
   }
   cinfo->output_components = (cinfo->quantize_colors ? 1 :
 			      cinfo->out_color_components);
diff --git a/3rdparty/libjpeg/jdmerge.c b/3rdparty/libjpeg/jdmerge.c
index 8b5c899cce8e..0d16821bedf5 100644
--- a/3rdparty/libjpeg/jdmerge.c
+++ b/3rdparty/libjpeg/jdmerge.c
@@ -2,7 +2,7 @@
  * jdmerge.c
  *
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * Modified 2013-2019 by Guido Vollbeding.
+ * Modified 2013-2022 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -20,17 +20,17 @@
  *	B = Y + K4 * Cb
  * only the Y term varies among the group of pixels corresponding to a pair
  * of chroma samples, so the rest of the terms can be calculated just once.
- * At typical sampling ratios, this eliminates half or three-quarters of the
- * multiplications needed for color conversion.
+ * At typical sampling ratios, this eliminates half or three-quarters
+ * of the multiplications needed for color conversion.
  *
  * This file currently provides implementations for the following cases:
  *	YCC => RGB color conversion only (YCbCr or BG_YCC).
  *	Sampling ratios of 2h1v or 2h2v.
  *	No scaling needed at upsample time.
  *	Corner-aligned (non-CCIR601) sampling alignment.
- * Other special cases could be added, but in most applications these are
- * the only common cases.  (For uncommon cases we fall back on the more
- * general code in jdsample.c and jdcolor.c.)
+ * Other special cases could be added, but in most applications these
+ * are the only common cases.  (For uncommon cases we fall back on
+ * the more general code in jdsample.c and jdcolor.c.)
  */
 
 #define JPEG_INTERNALS
@@ -190,7 +190,7 @@ merged_2v_upsample (j_decompress_ptr cinfo,
 
   if (upsample->spare_full) {
     /* If we have a spare row saved from a previous cycle, just return it. */
-    jcopy_sample_rows(& upsample->spare_row, 0, output_buf + *out_row_ctr, 0,
+    jcopy_sample_rows(& upsample->spare_row, output_buf + *out_row_ctr,
 		      1, upsample->out_row_width);
     num_rows = 1;
     upsample->spare_full = FALSE;
@@ -286,9 +286,9 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
     /* Do the chroma part of the calculation */
     cb = GETJSAMPLE(*inptr1++);
     cr = GETJSAMPLE(*inptr2++);
-    cred   = Crrtab[cr];
     cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue  = Cbbtab[cb];
+    cred   = Crrtab[cr];
     /* Fetch 2 Y values and emit 2 pixels */
     y  = GETJSAMPLE(*inptr0++);
     outptr[RGB_RED]   = range_limit[y + cred];
@@ -303,15 +303,14 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
+    y  = GETJSAMPLE(*inptr0);
     cb = GETJSAMPLE(*inptr1);
     cr = GETJSAMPLE(*inptr2);
-    cred   = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
-    cblue  = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
-    outptr[RGB_RED]   = range_limit[y + cred];
-    outptr[RGB_GREEN] = range_limit[y + cgreen];
-    outptr[RGB_BLUE]  = range_limit[y + cblue];
+    outptr[RGB_RED]   = range_limit[y + Crrtab[cr]];
+    outptr[RGB_GREEN] = range_limit[y +
+			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+						 SCALEBITS))];
+    outptr[RGB_BLUE]  = range_limit[y + Cbbtab[cb]];
   }
 }
 
@@ -350,9 +349,9 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
     /* Do the chroma part of the calculation */
     cb = GETJSAMPLE(*inptr1++);
     cr = GETJSAMPLE(*inptr2++);
-    cred   = Crrtab[cr];
     cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue  = Cbbtab[cb];
+    cred   = Crrtab[cr];
     /* Fetch 4 Y values and emit 4 pixels */
     y  = GETJSAMPLE(*inptr00++);
     outptr0[RGB_RED]   = range_limit[y + cred];
@@ -379,9 +378,9 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
   if (cinfo->output_width & 1) {
     cb = GETJSAMPLE(*inptr1);
     cr = GETJSAMPLE(*inptr2);
-    cred   = Crrtab[cr];
     cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue  = Cbbtab[cb];
+    cred   = Crrtab[cr];
     y  = GETJSAMPLE(*inptr00);
     outptr0[RGB_RED]   = range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
diff --git a/3rdparty/libjpeg/jdsample.c b/3rdparty/libjpeg/jdsample.c
index fd9907e20cd1..15afeafe3df8 100644
--- a/3rdparty/libjpeg/jdsample.c
+++ b/3rdparty/libjpeg/jdsample.c
@@ -2,7 +2,7 @@
  * jdsample.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * Modified 2002-2015 by Guido Vollbeding.
+ * Modified 2002-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -27,7 +27,7 @@
 /* Pointer to routine to upsample a single component */
 typedef JMETHOD(void, upsample1_ptr,
 		(j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+		 JSAMPARRAY input_data, JSAMPIMAGE output_data_ptr));
 
 /* Private subobject */
 
@@ -102,6 +102,9 @@ sep_upsample (j_decompress_ptr cinfo,
   if (upsample->next_row_out >= cinfo->max_v_samp_factor) {
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
 	 ci++, compptr++) {
+      /* Don't bother to upsample an uninteresting component. */
+      if (! compptr->component_needed)
+	continue;
       /* Invoke per-component upsample method.  Notice we pass a POINTER
        * to color_buf[ci], so that fullsize_upsample can change it.
        */
@@ -156,25 +159,12 @@ sep_upsample (j_decompress_ptr cinfo,
 
 METHODDEF(void)
 fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-		   JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+		   JSAMPARRAY input_data, JSAMPIMAGE output_data_ptr)
 {
   *output_data_ptr = input_data;
 }
 
 
-/*
- * This is a no-op version used for "uninteresting" components.
- * These components will not be referenced by color conversion.
- */
-
-METHODDEF(void)
-noop_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
-{
-  *output_data_ptr = NULL;	/* safety check */
-}
-
-
 /*
  * This version handles any integral sampling ratios.
  * This is not used for typical JPEG files, so it need not be fast.
@@ -188,25 +178,25 @@ noop_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 
 METHODDEF(void)
 int_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	      JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+	      JSAMPARRAY input_data, JSAMPIMAGE output_data_ptr)
 {
   my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPARRAY output_data, output_end;
   register JSAMPROW inptr, outptr;
   register JSAMPLE invalue;
   register int h;
   JSAMPROW outend;
   int h_expand, v_expand;
-  int inrow, outrow;
 
   h_expand = upsample->h_expand[compptr->component_index];
   v_expand = upsample->v_expand[compptr->component_index];
 
-  inrow = outrow = 0;
-  while (outrow < cinfo->max_v_samp_factor) {
+  output_data = *output_data_ptr;
+  output_end = output_data + cinfo->max_v_samp_factor;
+  for (; output_data < output_end; output_data += v_expand) {
     /* Generate one output row with proper horizontal expansion */
-    inptr = input_data[inrow];
-    outptr = output_data[outrow];
+    inptr = *input_data++;
+    outptr = *output_data;
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
       invalue = *inptr++;	/* don't need GETJSAMPLE() here */
@@ -216,11 +206,9 @@ int_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
     }
     /* Generate any additional output rows by duplicating the first one */
     if (v_expand > 1) {
-      jcopy_sample_rows(output_data, outrow, output_data, outrow+1,
-			v_expand-1, cinfo->output_width);
+      jcopy_sample_rows(output_data, output_data + 1,
+			v_expand - 1, cinfo->output_width);
     }
-    inrow++;
-    outrow += v_expand;
   }
 }
 
@@ -232,7 +220,7 @@ int_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 
 METHODDEF(void)
 h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+	       JSAMPARRAY input_data, JSAMPIMAGE output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -260,28 +248,26 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 
 METHODDEF(void)
 h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-	       JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+	       JSAMPARRAY input_data, JSAMPIMAGE output_data_ptr)
 {
-  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPARRAY output_data, output_end;
   register JSAMPROW inptr, outptr;
   register JSAMPLE invalue;
   JSAMPROW outend;
-  int inrow, outrow;
 
-  inrow = outrow = 0;
-  while (outrow < cinfo->max_v_samp_factor) {
-    inptr = input_data[inrow];
-    outptr = output_data[outrow];
+  output_data = *output_data_ptr;
+  output_end = output_data + cinfo->max_v_samp_factor;
+  for (; output_data < output_end; output_data += 2) {
+    inptr = *input_data++;
+    outptr = *output_data;
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
       invalue = *inptr++;	/* don't need GETJSAMPLE() here */
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
-    jcopy_sample_rows(output_data, outrow, output_data, outrow+1,
+    jcopy_sample_rows(output_data, output_data + 1,
 		      1, cinfo->output_width);
-    inrow++;
-    outrow += 2;
   }
 }
 
@@ -298,9 +284,8 @@ jinit_upsampler (j_decompress_ptr cinfo)
   jpeg_component_info * compptr;
   int h_in_group, v_in_group, h_out_group, v_out_group;
 
-  upsample = (my_upsample_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_upsampler));
+  upsample = (my_upsample_ptr) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(my_upsampler));
   cinfo->upsample = &upsample->pub;
   upsample->pub.start_pass = start_pass_upsample;
   upsample->pub.upsample = sep_upsample;
@@ -314,6 +299,9 @@ jinit_upsampler (j_decompress_ptr cinfo)
    */
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
+    /* Don't bother to upsample an uninteresting component. */
+    if (! compptr->component_needed)
+      continue;
     /* Compute size of an "input group" after IDCT scaling.  This many samples
      * are to be converted to max_h_samp_factor * max_v_samp_factor pixels.
      */
@@ -324,11 +312,6 @@ jinit_upsampler (j_decompress_ptr cinfo)
     h_out_group = cinfo->max_h_samp_factor;
     v_out_group = cinfo->max_v_samp_factor;
     upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
-    if (! compptr->component_needed) {
-      /* Don't bother to upsample an uninteresting component. */
-      upsample->methods[ci] = noop_upsample;
-      continue;		/* don't need to allocate buffer */
-    }
     if (h_in_group == h_out_group && v_in_group == v_out_group) {
       /* Fullsize components can be processed without any work. */
       upsample->methods[ci] = fullsize_upsample;
diff --git a/3rdparty/libjpeg/jinclude.h b/3rdparty/libjpeg/jinclude.h
index 20ed4ef11f8e..12ea8cd2fdf8 100644
--- a/3rdparty/libjpeg/jinclude.h
+++ b/3rdparty/libjpeg/jinclude.h
@@ -2,7 +2,7 @@
  * jinclude.h
  *
  * Copyright (C) 1991-1994, Thomas G. Lane.
- * Modified 2017 by Guido Vollbeding.
+ * Modified 2017-2022 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -11,8 +11,8 @@
  * care of by the standard jconfig symbols, but on really weird systems
  * you may have to edit this file.)
  *
- * NOTE: this file is NOT intended to be included by applications using the
- * JPEG library.  Most applications need only include jpeglib.h.
+ * NOTE: this file is NOT intended to be included by applications using
+ * the JPEG library.  Most applications need only include jpeglib.h.
  */
 
 
@@ -87,11 +87,71 @@
  *
  * Furthermore, macros are provided for fflush() and ferror() in order
  * to facilitate adaption by applications using an own FILE class.
+ *
+ * You can define your own custom file I/O functions in jconfig.h and
+ * #define JPEG_HAVE_FILE_IO_CUSTOM there to prevent redefinition here.
+ *
+ * You can #define JPEG_USE_FILE_IO_CUSTOM in jconfig.h to use custom file
+ * I/O functions implemented in Delphi VCL (Visual Component Library)
+ * in Vcl.Imaging.jpeg.pas for the TJPEGImage component utilizing
+ * the Delphi RTL (Run-Time Library) TMemoryStream component:
+ *
+ *   procedure jpeg_stdio_src(var cinfo: jpeg_decompress_struct;
+ *     input_file: TStream); external;
+ *
+ *   procedure jpeg_stdio_dest(var cinfo: jpeg_compress_struct;
+ *     output_file: TStream); external;
+ *
+ *   function jfread(var buf; recsize, reccount: Integer; S: TStream): Integer;
+ *   begin
+ *     Result := S.Read(buf, recsize * reccount);
+ *   end;
+ *
+ *   function jfwrite(const buf; recsize, reccount: Integer; S: TStream): Integer;
+ *   begin
+ *     Result := S.Write(buf, recsize * reccount);
+ *   end;
+ *
+ *   function jfflush(S: TStream): Integer;
+ *   begin
+ *     Result := 0;
+ *   end;
+ *
+ *   function jferror(S: TStream): Integer;
+ *   begin
+ *     Result := 0;
+ *   end;
+ *
+ * TMemoryStream of Delphi RTL has the distinctive feature to provide dynamic
+ * memory buffer management with a file/stream-based interface, particularly for
+ * the write (output) operation, which is easier to apply compared with direct
+ * implementations as given in jdatadst.c for memory destination.  Those direct
+ * implementations of dynamic memory write tend to be more difficult to use,
+ * so providing an option like TMemoryStream may be a useful alternative.
+ *
+ * The CFile/CMemFile classes of the Microsoft Foundation Class (MFC) Library
+ * may be used in a similar fashion.
  */
 
+#ifndef JPEG_HAVE_FILE_IO_CUSTOM
+#ifdef JPEG_USE_FILE_IO_CUSTOM
+extern size_t jfread(void * __ptr, size_t __size, size_t __n, FILE * __stream);
+extern size_t jfwrite(const void * __ptr, size_t __size, size_t __n, FILE * __stream);
+extern int    jfflush(FILE * __stream);
+extern int    jferror(FILE * __fp);
+
+#define JFREAD(file,buf,sizeofbuf)  \
+  ((size_t) jfread((void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
+#define JFWRITE(file,buf,sizeofbuf)  \
+  ((size_t) jfwrite((const void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
+#define JFFLUSH(file)	jfflush(file)
+#define JFERROR(file)	jferror(file)
+#else
 #define JFREAD(file,buf,sizeofbuf)  \
   ((size_t) fread((void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
 #define JFWRITE(file,buf,sizeofbuf)  \
   ((size_t) fwrite((const void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
 #define JFFLUSH(file)	fflush(file)
 #define JFERROR(file)	ferror(file)
+#endif
+#endif
diff --git a/3rdparty/libjpeg/jmorecfg.h b/3rdparty/libjpeg/jmorecfg.h
index 679d68bdc549..4638d6af2d21 100644
--- a/3rdparty/libjpeg/jmorecfg.h
+++ b/3rdparty/libjpeg/jmorecfg.h
@@ -2,7 +2,7 @@
  * jmorecfg.h
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * Modified 1997-2013 by Guido Vollbeding.
+ * Modified 1997-2022 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -351,8 +351,8 @@ typedef enum { FALSE = 0, TRUE = 1 } boolean;
 
 #define C_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
 #define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
-#define C_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
-#define DCT_SCALING_SUPPORTED	    /* Input rescaling via DCT? (Requires DCT_ISLOW)*/
+#define C_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN) */
+#define DCT_SCALING_SUPPORTED	/* Input rescaling via DCT? (Requires DCT_ISLOW) */
 #define ENTROPY_OPT_SUPPORTED	    /* Optimization of entropy coding parms? */
 /* Note: if you selected more than 8-bit data precision, it is dangerous to
  * turn off ENTROPY_OPT_SUPPORTED.  The standard Huffman tables are only
@@ -369,8 +369,8 @@ typedef enum { FALSE = 0, TRUE = 1 } boolean;
 
 #define D_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
 #define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
-#define D_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
-#define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? (Requires DCT_ISLOW)*/
+#define D_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN) */
+#define IDCT_SCALING_SUPPORTED	/* Output rescaling via IDCT? (Requires DCT_ISLOW) */
 #define SAVE_MARKERS_SUPPORTED	    /* jpeg_save_markers() needed? */
 #define BLOCK_SMOOTHING_SUPPORTED   /* Block smoothing? (Progressive only) */
 #undef  UPSAMPLE_SCALING_SUPPORTED  /* Output rescaling at upsample stage? */
@@ -384,20 +384,31 @@ typedef enum { FALSE = 0, TRUE = 1 } boolean;
 /*
  * Ordering of RGB data in scanlines passed to or from the application.
  * If your application wants to deal with data in the order B,G,R, just
- * change these macros.  You can also deal with formats such as R,G,B,X
- * (one extra byte per pixel) by changing RGB_PIXELSIZE.  Note that changing
- * the offsets will also change the order in which colormap data is organized.
+ * #define JPEG_USE_RGB_CUSTOM in jconfig.h, or define your own custom
+ * order in jconfig.h and #define JPEG_HAVE_RGB_CUSTOM.
+ * You can also deal with formats such as R,G,B,X (one extra byte per pixel)
+ * by changing RGB_PIXELSIZE.
+ * Note that changing the offsets will also change
+ * the order in which colormap data is organized.
  * RESTRICTIONS:
  * 1. The sample applications cjpeg,djpeg do NOT support modified RGB formats.
  * 2. The color quantizer modules will not behave desirably if RGB_PIXELSIZE
- *    is not 3 (they don't understand about dummy color components!).  So you
- *    can't use color quantization if you change that value.
+ *    is not 3 (they don't understand about dummy color components!).
+ *    So you can't use color quantization if you change that value.
  */
 
+#ifndef JPEG_HAVE_RGB_CUSTOM
+#ifdef JPEG_USE_RGB_CUSTOM
+#define RGB_RED		2	/* Offset of Red in an RGB scanline element */
+#define RGB_GREEN	1	/* Offset of Green */
+#define RGB_BLUE	0	/* Offset of Blue */
+#else
 #define RGB_RED		0	/* Offset of Red in an RGB scanline element */
 #define RGB_GREEN	1	/* Offset of Green */
 #define RGB_BLUE	2	/* Offset of Blue */
+#endif
 #define RGB_PIXELSIZE	3	/* JSAMPLEs per RGB scanline element */
+#endif
 
 
 /* Definitions for speed-related optimizations. */
diff --git a/3rdparty/libjpeg/jpegint.h b/3rdparty/libjpeg/jpegint.h
index 52c708d4e39a..3528bff5b7b1 100644
--- a/3rdparty/libjpeg/jpegint.h
+++ b/3rdparty/libjpeg/jpegint.h
@@ -2,7 +2,7 @@
  * jpegint.h
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * Modified 1997-2019 by Guido Vollbeding.
+ * Modified 1997-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -103,8 +103,7 @@ struct jpeg_downsampler {
 typedef JMETHOD(void, forward_DCT_ptr,
 		(j_compress_ptr cinfo, jpeg_component_info * compptr,
 		 JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-		 JDIMENSION start_row, JDIMENSION start_col,
-		 JDIMENSION num_blocks));
+		 JDIMENSION start_col, JDIMENSION num_blocks));
 
 struct jpeg_forward_dct {
   JMETHOD(void, start_pass, (j_compress_ptr cinfo));
@@ -115,7 +114,7 @@ struct jpeg_forward_dct {
 /* Entropy encoding */
 struct jpeg_entropy_encoder {
   JMETHOD(void, start_pass, (j_compress_ptr cinfo, boolean gather_statistics));
-  JMETHOD(boolean, encode_mcu, (j_compress_ptr cinfo, JBLOCKROW *MCU_data));
+  JMETHOD(boolean, encode_mcu, (j_compress_ptr cinfo, JBLOCKARRAY MCU_data));
   JMETHOD(void, finish_pass, (j_compress_ptr cinfo));
 };
 
@@ -211,7 +210,7 @@ struct jpeg_marker_reader {
 /* Entropy decoding */
 struct jpeg_entropy_decoder {
   JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
-  JMETHOD(boolean, decode_mcu, (j_decompress_ptr cinfo, JBLOCKROW *MCU_data));
+  JMETHOD(boolean, decode_mcu, (j_decompress_ptr cinfo, JBLOCKARRAY MCU_data));
   JMETHOD(void, finish_pass, (j_decompress_ptr cinfo));
 };
 
@@ -416,8 +415,8 @@ EXTERN(void) jinit_memory_mgr JPP((j_common_ptr cinfo));
 /* Utility routines in jutils.c */
 EXTERN(long) jdiv_round_up JPP((long a, long b));
 EXTERN(long) jround_up JPP((long a, long b));
-EXTERN(void) jcopy_sample_rows JPP((JSAMPARRAY input_array, int source_row,
-				    JSAMPARRAY output_array, int dest_row,
+EXTERN(void) jcopy_sample_rows JPP((JSAMPARRAY input_array,
+				    JSAMPARRAY output_array,
 				    int num_rows, JDIMENSION num_cols));
 EXTERN(void) jcopy_block_row JPP((JBLOCKROW input_row, JBLOCKROW output_row,
 				  JDIMENSION num_blocks));
diff --git a/3rdparty/libjpeg/jpeglib.h b/3rdparty/libjpeg/jpeglib.h
index 591a2cb60549..e7e15ab2cd48 100644
--- a/3rdparty/libjpeg/jpeglib.h
+++ b/3rdparty/libjpeg/jpeglib.h
@@ -2,7 +2,7 @@
  * jpeglib.h
  *
  * Copyright (C) 1991-1998, Thomas G. Lane.
- * Modified 2002-2019 by Guido Vollbeding.
+ * Modified 2002-2022 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -39,7 +39,7 @@ extern "C" {
 
 #define JPEG_LIB_VERSION        90	/* Compatibility version 9.0 */
 #define JPEG_LIB_VERSION_MAJOR  9
-#define JPEG_LIB_VERSION_MINOR  4
+#define JPEG_LIB_VERSION_MINOR  6
 
 
 /* Various constants determining the sizes of things.
diff --git a/3rdparty/libjpeg/jquant1.c b/3rdparty/libjpeg/jquant1.c
index 9d11f70669b2..60b1843e73ea 100644
--- a/3rdparty/libjpeg/jquant1.c
+++ b/3rdparty/libjpeg/jquant1.c
@@ -2,7 +2,7 @@
  * jquant1.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * Modified 2011 by Guido Vollbeding.
+ * Modified 2011-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -293,8 +293,7 @@ create_colormap (j_decompress_ptr cinfo)
   /* The colors are ordered in the map in standard row-major order, */
   /* i.e. rightmost (highest-indexed) color changes most rapidly. */
 
-  colormap = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE,
+  colormap = (*cinfo->mem->alloc_sarray) ((j_common_ptr) cinfo, JPOOL_IMAGE,
      (JDIMENSION) total_colors, (JDIMENSION) cinfo->out_color_components);
 
   /* blksize is number of adjacent repeated entries for a component */
@@ -400,9 +399,8 @@ make_odither_array (j_decompress_ptr cinfo, int ncolors)
   int j,k;
   INT32 num,den;
 
-  odither = (ODITHER_MATRIX_PTR)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(ODITHER_MATRIX));
+  odither = (ODITHER_MATRIX_PTR) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(ODITHER_MATRIX));
   /* The inter-value distance for this color is MAXJSAMPLE/(ncolors-1).
    * Hence the dither value for the matrix cell with fill order f
    * (f=0..N-1) should be (N-1-2*f)/(2*N) * MAXJSAMPLE/(ncolors-1).
@@ -531,8 +529,7 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
 
   for (row = 0; row < num_rows; row++) {
     /* Initialize output values to 0 so can process components separately */
-    FMEMZERO((void FAR *) output_buf[row],
-	     (size_t) (width * SIZEOF(JSAMPLE)));
+    FMEMZERO((void FAR *) output_buf[row], (size_t) width * SIZEOF(JSAMPLE));
     row_index = cquantize->row_index;
     for (ci = 0; ci < nc; ci++) {
       input_ptr = input_buf[row] + ci;
@@ -636,8 +633,7 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
 
   for (row = 0; row < num_rows; row++) {
     /* Initialize output values to 0 so can process components separately */
-    FMEMZERO((void FAR *) output_buf[row],
-	     (size_t) (width * SIZEOF(JSAMPLE)));
+    FMEMZERO((void FAR *) output_buf[row], (size_t) width * SIZEOF(JSAMPLE));
     for (ci = 0; ci < nc; ci++) {
       input_ptr = input_buf[row] + ci;
       output_ptr = output_buf[row];
@@ -726,10 +722,10 @@ alloc_fs_workspace (j_decompress_ptr cinfo)
   size_t arraysize;
   int i;
 
-  arraysize = (size_t) ((cinfo->output_width + 2) * SIZEOF(FSERROR));
+  arraysize = ((size_t) cinfo->output_width + (size_t) 2) * SIZEOF(FSERROR);
   for (i = 0; i < cinfo->out_color_components; i++) {
-    cquantize->fserrors[i] = (FSERRPTR)
-      (*cinfo->mem->alloc_large)((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
+    cquantize->fserrors[i] = (FSERRPTR) (*cinfo->mem->alloc_large)
+      ((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
   }
 }
 
@@ -780,13 +776,12 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
     if (cquantize->fserrors[0] == NULL)
       alloc_fs_workspace(cinfo);
     /* Initialize the propagated errors to zero. */
-    arraysize = (size_t) ((cinfo->output_width + 2) * SIZEOF(FSERROR));
+    arraysize = ((size_t) cinfo->output_width + (size_t) 2) * SIZEOF(FSERROR);
     for (i = 0; i < cinfo->out_color_components; i++)
       FMEMZERO((void FAR *) cquantize->fserrors[i], arraysize);
     break;
   default:
     ERREXIT(cinfo, JERR_NOT_COMPILED);
-    break;
   }
 }
 
@@ -823,10 +818,9 @@ jinit_1pass_quantizer (j_decompress_ptr cinfo)
 {
   my_cquantize_ptr cquantize;
 
-  cquantize = (my_cquantize_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_cquantizer));
-  cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize;
+  cquantize = (my_cquantize_ptr) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(my_cquantizer));
+  cinfo->cquantize = &cquantize->pub;
   cquantize->pub.start_pass = start_pass_1_quant;
   cquantize->pub.finish_pass = finish_pass_1_quant;
   cquantize->pub.new_color_map = new_color_map_1_quant;
diff --git a/3rdparty/libjpeg/jquant2.c b/3rdparty/libjpeg/jquant2.c
index 38fc2af7a55b..662b9bcef3c4 100644
--- a/3rdparty/libjpeg/jquant2.c
+++ b/3rdparty/libjpeg/jquant2.c
@@ -2,7 +2,7 @@
  * jquant2.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * Modified 2011 by Guido Vollbeding.
+ * Modified 2011-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -1197,8 +1197,8 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
       ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
 
     if (cinfo->dither_mode == JDITHER_FS) {
-      size_t arraysize = (size_t) ((cinfo->output_width + 2) *
-				   (3 * SIZEOF(FSERROR)));
+      size_t arraysize = ((size_t) cinfo->output_width + (size_t) 2)
+	* (3 * SIZEOF(FSERROR));
       /* Allocate Floyd-Steinberg workspace if we didn't already. */
       if (cquantize->fserrors == NULL)
 	cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
@@ -1247,10 +1247,9 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo)
   my_cquantize_ptr cquantize;
   int i;
 
-  cquantize = (my_cquantize_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-				SIZEOF(my_cquantizer));
-  cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize;
+  cquantize = (my_cquantize_ptr) (*cinfo->mem->alloc_small)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE, SIZEOF(my_cquantizer));
+  cinfo->cquantize = &cquantize->pub;
   cquantize->pub.start_pass = start_pass_2_quant;
   cquantize->pub.new_color_map = new_color_map_2_quant;
   cquantize->fserrors = NULL;	/* flag optional arrays not allocated */
@@ -1284,7 +1283,8 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo)
     if (desired > MAXNUMCOLORS)
       ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
     cquantize->sv_colormap = (*cinfo->mem->alloc_sarray)
-      ((j_common_ptr) cinfo,JPOOL_IMAGE, (JDIMENSION) desired, (JDIMENSION) 3);
+      ((j_common_ptr) cinfo, JPOOL_IMAGE,
+       (JDIMENSION) desired, (JDIMENSION) 3);
     cquantize->desired = desired;
   } else
     cquantize->sv_colormap = NULL;
@@ -1302,7 +1302,7 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo)
   if (cinfo->dither_mode == JDITHER_FS) {
     cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
       ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (size_t) ((cinfo->output_width + 2) * (3 * SIZEOF(FSERROR))));
+       ((size_t) cinfo->output_width + (size_t) 2) * (3 * SIZEOF(FSERROR)));
     /* Might as well create the error-limiting table too. */
     init_error_limit(cinfo);
   }
diff --git a/3rdparty/libjpeg/jutils.c b/3rdparty/libjpeg/jutils.c
index 1e2dfb017b0e..31e16dfb5391 100644
--- a/3rdparty/libjpeg/jutils.c
+++ b/3rdparty/libjpeg/jutils.c
@@ -2,7 +2,7 @@
  * jutils.c
  *
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * Modified 2009-2019 by Guido Vollbeding.
+ * Modified 2009-2020 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -52,67 +52,67 @@ const int jpeg_zigzag_order[DCTSIZE2] = {
  */
 
 const int jpeg_natural_order[DCTSIZE2+16] = {
-  0,  1,  8, 16,  9,  2,  3, 10,
- 17, 24, 32, 25, 18, 11,  4,  5,
- 12, 19, 26, 33, 40, 48, 41, 34,
- 27, 20, 13,  6,  7, 14, 21, 28,
- 35, 42, 49, 56, 57, 50, 43, 36,
- 29, 22, 15, 23, 30, 37, 44, 51,
- 58, 59, 52, 45, 38, 31, 39, 46,
- 53, 60, 61, 54, 47, 55, 62, 63,
- 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
- 63, 63, 63, 63, 63, 63, 63, 63
+   0,  1,  8, 16,  9,  2,  3, 10,
+  17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34,
+  27, 20, 13,  6,  7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36,
+  29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46,
+  53, 60, 61, 54, 47, 55, 62, 63,
+  63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+  63, 63, 63, 63, 63, 63, 63, 63
 };
 
 const int jpeg_natural_order7[7*7+16] = {
-  0,  1,  8, 16,  9,  2,  3, 10,
- 17, 24, 32, 25, 18, 11,  4,  5,
- 12, 19, 26, 33, 40, 48, 41, 34,
- 27, 20, 13,  6, 14, 21, 28, 35,
- 42, 49, 50, 43, 36, 29, 22, 30,
- 37, 44, 51, 52, 45, 38, 46, 53,
- 54,
- 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
- 63, 63, 63, 63, 63, 63, 63, 63
+   0,  1,  8, 16,  9,  2,  3, 10,
+  17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34,
+  27, 20, 13,  6, 14, 21, 28, 35,
+  42, 49, 50, 43, 36, 29, 22, 30,
+  37, 44, 51, 52, 45, 38, 46, 53,
+  54,
+  63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+  63, 63, 63, 63, 63, 63, 63, 63
 };
 
 const int jpeg_natural_order6[6*6+16] = {
-  0,  1,  8, 16,  9,  2,  3, 10,
- 17, 24, 32, 25, 18, 11,  4,  5,
- 12, 19, 26, 33, 40, 41, 34, 27,
- 20, 13, 21, 28, 35, 42, 43, 36,
- 29, 37, 44, 45,
- 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
- 63, 63, 63, 63, 63, 63, 63, 63
+   0,  1,  8, 16,  9,  2,  3, 10,
+  17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 41, 34, 27,
+  20, 13, 21, 28, 35, 42, 43, 36,
+  29, 37, 44, 45,
+  63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+  63, 63, 63, 63, 63, 63, 63, 63
 };
 
 const int jpeg_natural_order5[5*5+16] = {
-  0,  1,  8, 16,  9,  2,  3, 10,
- 17, 24, 32, 25, 18, 11,  4, 12,
- 19, 26, 33, 34, 27, 20, 28, 35,
- 36,
- 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
- 63, 63, 63, 63, 63, 63, 63, 63
+   0,  1,  8, 16,  9,  2,  3, 10,
+  17, 24, 32, 25, 18, 11,  4, 12,
+  19, 26, 33, 34, 27, 20, 28, 35,
+  36,
+  63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+  63, 63, 63, 63, 63, 63, 63, 63
 };
 
 const int jpeg_natural_order4[4*4+16] = {
-  0,  1,  8, 16,  9,  2,  3, 10,
- 17, 24, 25, 18, 11, 19, 26, 27,
- 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
- 63, 63, 63, 63, 63, 63, 63, 63
+   0,  1,  8, 16,  9,  2,  3, 10,
+  17, 24, 25, 18, 11, 19, 26, 27,
+  63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+  63, 63, 63, 63, 63, 63, 63, 63
 };
 
 const int jpeg_natural_order3[3*3+16] = {
-  0,  1,  8, 16,  9,  2, 10, 17,
- 18,
- 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
- 63, 63, 63, 63, 63, 63, 63, 63
+   0,  1,  8, 16,  9,  2, 10, 17,
+  18,
+  63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+  63, 63, 63, 63, 63, 63, 63, 63
 };
 
 const int jpeg_natural_order2[2*2+16] = {
-  0,  1,  8,  9,
- 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
- 63, 63, 63, 63, 63, 63, 63, 63
+   0,  1,  8,  9,
+  63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+  63, 63, 63, 63, 63, 63, 63, 63
 };
 
 
@@ -174,12 +174,12 @@ jzero_far (void FAR * target, size_t bytestozero)
 
 
 GLOBAL(void)
-jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
-		   JSAMPARRAY output_array, int dest_row,
+jcopy_sample_rows (JSAMPARRAY input_array,
+		   JSAMPARRAY output_array,
 		   int num_rows, JDIMENSION num_cols)
 /* Copy some rows of samples from one place to another.
- * num_rows rows are copied from input_array[source_row++]
- * to output_array[dest_row++]; these areas may overlap for duplication.
+ * num_rows rows are copied from *input_array++ to *output_array++;
+ * these areas may overlap for duplication.
  * The source and destination arrays must be at least as wide as num_cols.
  */
 {
@@ -191,9 +191,6 @@ jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
 #endif
   register int row;
 
-  input_array += source_row;
-  output_array += dest_row;
-
   for (row = num_rows; row > 0; row--) {
     inptr = *input_array++;
     outptr = *output_array++;
diff --git a/3rdparty/libjpeg/jversion.h b/3rdparty/libjpeg/jversion.h
index c9befacde213..df53ef5e55bd 100644
--- a/3rdparty/libjpeg/jversion.h
+++ b/3rdparty/libjpeg/jversion.h
@@ -1,7 +1,7 @@
 /*
  * jversion.h
  *
- * Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1991-2024, Thomas G. Lane, Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
@@ -9,6 +9,6 @@
  */
 
 
-#define JVERSION	"9d  12-Jan-2020"
+#define JVERSION	"9f  14-Jan-2024"
 
-#define JCOPYRIGHT	"Copyright (C) 2020, Thomas G. Lane, Guido Vollbeding"
+#define JCOPYRIGHT	"Copyright (C) 2024, Thomas G. Lane, Guido Vollbeding"
diff --git a/3rdparty/libpng/CHANGES b/3rdparty/libpng/CHANGES
index f0b0a9342c3d..441b57ecf1ab 100644
--- a/3rdparty/libpng/CHANGES
+++ b/3rdparty/libpng/CHANGES
@@ -204,7 +204,7 @@ Version 0.97 [January, 1998]
   Added simple sRGB support (Glenn R-P)
   Easier conditional compiling, e.g.,
     define PNG_READ/WRITE_NOT_FULLY_SUPPORTED;
-    all configurable options can be selected from command-line instead
+    all configurable options can be selected from command line instead
     of having to edit pngconf.h (Glenn R-P)
   Fixed memory leak in pngwrite.c (free info_ptr->text) (Glenn R-P)
   Added more conditions for png_do_background, to avoid changing
@@ -942,7 +942,7 @@ Version 1.0.8 [July 24, 2000]
 Version 1.0.9beta1 [November 10, 2000]
   Fixed typo in scripts/makefile.hpux
   Updated makevms.com in scripts and contrib/* and contrib/* (Martin Zinser)
-  Fixed seqence-point bug in contrib/pngminus/png2pnm (Martin Zinser)
+  Fixed sequence-point bug in contrib/pngminus/png2pnm (Martin Zinser)
   Changed "cdrom.com" in documentation to "libpng.org"
   Revised pnggccrd.c to get it all working, and updated makefile.gcmmx (Greg).
   Changed type of "params" from voidp to png_voidp in png_read|write_png().
@@ -2295,7 +2295,7 @@ Version 1.4.0beta58 [May 14, 2009]
   Clarified usage of sig_bit versus sig_bit_p in example.c (Vincent Torri)
 
 Version 1.4.0beta59 [May 15, 2009]
-  Reformated sources in libpng style (3-space intentation, comment format)
+  Reformatted sources in libpng style (3-space indentation, comment format)
   Fixed typo in libpng docs (PNG_FILTER_AVE should be PNG_FILTER_AVG)
   Added sections about the git repository and our coding style to the
     documentation
@@ -2661,7 +2661,7 @@ Version 1.4.1beta06 [January 28, 2010]
 
 Version 1.4.1beta07 [February 6, 2010]
   Folded some long lines in the source files.
-  Added defineable PNG_USER_CHUNK_CACHE_MAX, PNG_USER_CHUNK_MALLOC_MAX,
+  Added definable PNG_USER_CHUNK_CACHE_MAX, PNG_USER_CHUNK_MALLOC_MAX,
     and a PNG_USER_LIMITS_SUPPORTED flag.
   Eliminated use of png_ptr->irowbytes and reused the slot in png_ptr as
     png_ptr->png_user_chunk_malloc_max.
@@ -3886,7 +3886,7 @@ Version 1.6.0beta06 [January 24, 2012]
 Version 1.6.0beta07 [January 28, 2012]
   Eliminated Intel icc/icl compiler warnings. The Intel (GCC derived)
     compiler issues slightly different warnings from those issued by the
-    current vesions of GCC. This eliminates those warnings by
+    current versions of GCC. This eliminates those warnings by
     adding/removing casts and small code rewrites.
   Updated configure.ac from autoupdate: added --enable-werror option.
     Also some layout regularization and removal of introduced tab characters
@@ -3919,7 +3919,7 @@ Version 1.6.0beta08 [February 1, 2012]
     version checking to configure.ac
   Improved pngstest speed by not doing redundant tests and add const to
     the background parameter of png_image_finish_read. The --background
-    option is now done automagically only when required, so that commandline
+    option is now done automagically only when required, so that command-line
     option no longer exists.
   Cleaned up pngpriv.h to consistently declare all functions and data.
     Also eliminated PNG_CONST_DATA, which is apparently not needed but we
@@ -4052,7 +4052,7 @@ Version 1.6.0beta16 [March 6, 2012]
     (in fact this is harmless, but the PNG data produced may be sub-optimal).
 
 Version 1.6.0beta17 [March 10, 2012]
-  Fixed PNG_LIBPNG_BUILD_BASE_TYPE definition. 
+  Fixed PNG_LIBPNG_BUILD_BASE_TYPE definition.
   Reject all iCCP chunks after the first, even if the first one is invalid.
   Deflate/inflate was reworked to move common zlib calls into single
     functions [rw]util.c.  A new shared keyword check routine was also added
@@ -4962,7 +4962,7 @@ Version 1.6.13beta01 [July 4, 2014]
   Changed "if defined(__ARM_NEON__)" to
     "if (defined(__ARM_NEON__) || defined(__ARM_NEON))" (James Wu).
   Fixed clang no-warning builds: png_digit was defined but never used.
-    
+
 Version 1.6.13beta02 [July 21, 2014]
   Fixed an incorrect separator ("/" should be "\") in scripts/makefile.vcwin32
     (bug report from Wolfgang S. Kechel).  Bug was introduced in libpng-1.6.11.
@@ -5453,7 +5453,7 @@ Version 1.6.21beta01 [December 11, 2015]
 Version 1.6.21beta02 [December 14, 2015]
   Moved png_check_keyword() from pngwutil.c to pngset.c
   Removed LE/BE dependencies in pngvalid, to 'fix' the current problem
-    in the BigEndian tests by not testing it, making the BE code the same 
+    in the BigEndian tests by not testing it, making the BE code the same
     as the LE version.
   Fixes to pngvalid for various reduced build configurations (eliminate unused
     statics) and a fix for the case in rgb_to_gray when the digitize option
@@ -5517,7 +5517,7 @@ Version 1.6.22beta03 [March 9, 2016]
   Added a common-law trademark notice and export control information
     to the LICENSE file, png.h, and the man page.
   Restored "& 0xff" in png_save_uint_16() and png_save_uint_32() that
-    were accidentally removed from libpng-1.6.17. 
+    were accidentally removed from libpng-1.6.17.
   Changed PNG_INFO_cHNK and PNG_FREE_cHNK from 0xnnnn to 0xnnnnU in png.h
     (Robert C. Seacord).
   Removed dubious "#if INT_MAX" test from png.h that was added to
@@ -5927,7 +5927,7 @@ Version 1.6.32beta03 [August 2, 2017]
     (Bug report from the OSS-fuzz project).
 
 Version 1.6.32beta04 [August 2, 2017]
-  Replaced local eXIf_buf with info_ptr-eXIf_buf in png_handle_eXIf().
+  Replaced local eXIf_buf with info_ptr->eXIf_buf in png_handle_eXIf().
   Update libpng.3 and libpng-manual.txt about eXIf functions.
 
 Version 1.6.32beta05 [August 2, 2017]
@@ -5950,7 +5950,7 @@ Version 1.6.32beta09 [August 3, 2017]
   Require cmake-2.8.8 in CMakeLists.txt. Revised symlink creation,
     no longer using deprecated cmake LOCATION feature (Clifford Yapp).
   Fixed five-byte error in the calculation of IDAT maximum possible size.
-  
+
 Version 1.6.32beta10 [August 5, 2017]
   Moved chunk-length check into a png_check_chunk_length() private
     function (Suggested by Max Stepin).
@@ -6103,6 +6103,99 @@ Version 1.6.37 [April 14, 2019]
   Added makefiles for AddressSanitizer-enabled builds.
   Cleaned up various makefiles.
 
+Version 1.6.38 [September 14, 2022]
+  Added configurations and scripts for continuous integration.
+  Fixed various errors in the handling of tRNS, hIST and eXIf.
+  Implemented many stability improvements across all platforms.
+  Updated the internal documentation.
+
+Version 1.6.39 [November 20, 2022]
+  Changed the error handler of oversized chunks (i.e. larger than
+    PNG_USER_CHUNK_MALLOC_MAX) from png_chunk_error to png_benign_error.
+  Fixed a buffer overflow error in contrib/tools/pngfix.
+  Fixed a memory leak (CVE-2019-6129) in contrib/tools/pngcp.
+  Disabled the ARM Neon optimizations by default in the CMake file,
+    following the default behavior of the configure script.
+  Allowed configure.ac to work with the trunk version of autoconf.
+  Removed the support for "install" targets from the legacy makefiles;
+    removed the obsolete makefile.cegcc.
+  Cleaned up the code and updated the internal documentation.
+
+Version 1.6.40 [June 21, 2023]
+  Fixed the eXIf chunk multiplicity checks.
+  Fixed a memory leak in pCAL processing.
+  Corrected the validity report about tRNS inside png_get_valid().
+  Fixed various build issues on *BSD, Mac and Windows.
+  Updated the configurations and the scripts for continuous integration.
+  Cleaned up the code, the build scripts, and the documentation.
+
+Version 1.6.41 [January 24, 2024]
+  Added SIMD-optimized code for the LoongArch LSX hardware.
+    (Contributed by GuXiWei, JinBo and ZhangLixia)
+  Fixed the run-time discovery of MIPS MSA hardware.
+    (Contributed by Sui Jingfeng)
+  Fixed an off-by-one error in the function png_do_check_palette_indexes(),
+    which failed to recognize errors that might have existed in the first
+    column of a broken palette-encoded image. This was a benign regression
+    accidentally introduced in libpng-1.6.33. No pixel was harmed.
+    (Contributed by Adam Richter; reviewed by John Bowler)
+  Fixed, improved and modernized the contrib/pngminus programs, i.e.,
+    png2pnm.c and pnm2png.c
+  Removed old and peculiar portability hacks that were meant to silence
+    warnings issued by gcc version 7.1 alone.
+    (Contributed by John Bowler)
+  Fixed and modernized the CMake file, and raised the minimum required
+    CMake version from 3.1 to 3.6.
+    (Contributed by Clinton Ingram, Timothy Lyanguzov, Tyler Kropp, et al.)
+  Allowed the configure script to disable the building of auxiliary tools
+    and tests, thus catching up with the CMake file.
+    (Contributed by Carlo Bramini)
+  Fixed a build issue on Mac.
+    (Contributed by Zixu Wang)
+  Moved the Autoconf macro files to scripts/autoconf.
+  Moved the CMake files (except for the main CMakeLists.txt) to
+    scripts/cmake and moved the list of their contributing authors to
+    scripts/cmake/AUTHORS.md
+  Updated the CI configurations and scripts.
+  Relicensed the CI scripts to the MIT License.
+  Improved the test coverage.
+    (Contributed by John Bowler)
+
+Version 1.6.42 [January 29, 2024]
+  Fixed the implementation of the macro function png_check_sig().
+    This was an API regression, introduced in libpng-1.6.41.
+    (Reported by Matthieu Darbois)
+  Fixed and updated the libpng manual.
+
+Version 1.6.43 [February 23, 2024]
+  Fixed the row width check in png_check_IHDR().
+    This corrected a bug that was specific to the 16-bit platforms,
+    and removed a spurious compiler warning from the 64-bit builds.
+    (Reported by Jacek Caban; fixed by John Bowler)
+  Added eXIf chunk support to the push-mode reader in pngpread.c.
+    (Contributed by Chris Blume)
+  Added contrib/pngexif for the benefit of the users who would like
+    to inspect the content of eXIf chunks.
+  Added contrib/conftest/basic.dfa, a basic build-time configuration.
+    (Contributed by John Bowler)
+  Fixed a preprocessor condition in pngread.c that broke build-time
+    configurations like contrib/conftest/pngcp.dfa.
+    (Contributed by John Bowler)
+  Added CMake build support for LoongArch LSX.
+    (Contributed by GuXiWei)
+  Fixed a CMake build error that occurred under a peculiar state of the
+    dependency tree. This was a regression introduced in libpng-1.6.41.
+    (Contributed by Dan Rosser)
+  Marked the installed libpng headers as system headers in CMake.
+    (Contributed by Benjamin Buch)
+  Updated the build support for RISCOS.
+    (Contributed by Cameron Cawley)
+  Updated the makefiles to allow cross-platform builds to initialize
+    conventional make variables like AR and ARFLAGS.
+  Added various improvements to the CI scripts in areas like version
+    consistency verification and text linting.
+  Added version consistency verification to pngtest.c also.
+
 Send comments/corrections/commendations to png-mng-implement at lists.sf.net.
 Subscription is required; visit
 https://lists.sourceforge.net/lists/listinfo/png-mng-implement
diff --git a/3rdparty/libpng/CMakeLists.txt b/3rdparty/libpng/CMakeLists.txt
index 973f39cafafa..d05031b9f5fd 100644
--- a/3rdparty/libpng/CMakeLists.txt
+++ b/3rdparty/libpng/CMakeLists.txt
@@ -3,23 +3,17 @@
 #
 # ----------------------------------------------------------------------------
 
-if(ENABLE_NEON)
-  project(${PNG_LIBRARY} C ASM)
-else()
-  project(${PNG_LIBRARY} C)
-endif()
+project(${PNG_LIBRARY} C)
 
-if(NOT WIN32)
-  find_library(M_LIBRARY
-    NAMES m
-    PATHS /usr/lib /usr/local/lib
-  )
-  if(NOT M_LIBRARY)
-    message(STATUS "math lib 'libm' not found; floating point support disabled")
+if(UNIX AND NOT APPLE AND NOT BEOS AND NOT HAIKU AND NOT EMSCRIPTEN)
+  find_library(M_LIBRARY m)
+  if(M_LIBRARY)
+    set(M_LIBRARY m)
+  else()
+    set(M_LIBRARY "")
   endif()
 else()
-  # not needed on windows
-  set(M_LIBRARY "")
+  # libm is not available or not needed.
 endif()
 
 ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}" ${ZLIB_INCLUDE_DIRS})
@@ -27,45 +21,192 @@ ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}" ${ZLIB_INCLUDE_DIRS})
 file(GLOB lib_srcs *.c)
 file(GLOB lib_hdrs *.h)
 
+# CMake currently sets CMAKE_SYSTEM_PROCESSOR to one of x86_64 or arm64 on macOS,
+# based upon the OS architecture, not the target architecture. As such, we need
+# to check CMAKE_OSX_ARCHITECTURES to identify which hardware-specific flags to
+# enable. Note that this will fail if you attempt to build a universal binary in
+# a single CMake invocation.
+if(APPLE AND CMAKE_OSX_ARCHITECTURES)
+  set(TARGET_ARCH ${CMAKE_OSX_ARCHITECTURES})
+else()
+  set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
+endif()
+
+OCV_OPTION(PNG_HARDWARE_OPTIMIZATIONS "Enable Hardware Optimizations, if available for this platform" (NOT CV_DISABLE_OPTIMIZATION))
+
+if(PNG_HARDWARE_OPTIMIZATIONS)
 
-if(ARM OR AARCH64)
-  if(ENABLE_NEON)
-    if(NOT AARCH64)
+# Set definitions and sources for ARM.
+if(TARGET_ARCH MATCHES "^(ARM|arm|aarch)")
+  if(TARGET_ARCH MATCHES "^(ARM64|arm64|aarch64)")
+    set(PNG_ARM_NEON_POSSIBLE_VALUES on off)
+    set(PNG_ARM_NEON "on"
+        CACHE STRING "Enable ARM NEON optimizations: on|off; on is default")
+  else()
+    set(PNG_ARM_NEON_POSSIBLE_VALUES check on off)
+    set(PNG_ARM_NEON "off"
+        CACHE STRING "Enable ARM NEON optimizations: check|on|off; off is default")
+  endif()
+  set_property(CACHE PNG_ARM_NEON
+               PROPERTY STRINGS ${PNG_ARM_NEON_POSSIBLE_VALUES})
+  list(FIND PNG_ARM_NEON_POSSIBLE_VALUES ${PNG_ARM_NEON} index)
+  if(index EQUAL -1)
+    message(FATAL_ERROR "PNG_ARM_NEON must be one of [${PNG_ARM_NEON_POSSIBLE_VALUES}]")
+  elseif(NOT PNG_ARM_NEON STREQUAL "off")
+    list(APPEND lib_srcs arm/arm_init.c arm/filter_neon_intrinsics.c arm/palette_neon_intrinsics.c)
+    if(NOT MSVC)
+      enable_language(ASM)
       list(APPEND lib_srcs arm/filter_neon.S)
     endif()
-    list(APPEND lib_srcs arm/arm_init.c arm/filter_neon_intrinsics.c arm/palette_neon_intrinsics.c)
-    add_definitions(-DPNG_ARM_NEON_OPT=2)
+    if(PNG_ARM_NEON STREQUAL "on")
+      add_definitions(-DPNG_ARM_NEON_OPT=2)
+    elseif(PNG_ARM_NEON STREQUAL "check")
+      add_definitions(-DPNG_ARM_NEON_CHECK_SUPPORTED)
+    endif()
   else()
     add_definitions(-DPNG_ARM_NEON_OPT=0) # NEON assembler is not supported
   endif()
 endif()
 
-if(";${CPU_BASELINE_FINAL};" MATCHES "SSE2"
-    AND (NOT MSVC OR (MSVC_VERSION GREATER 1799))) # MSVS2013+ (issue #7232)
-  list(APPEND lib_srcs intel/intel_init.c intel/filter_sse2_intrinsics.c)
-  add_definitions(-DPNG_INTEL_SSE)
+# Set definitions and sources for PowerPC.
+if(TARGET_ARCH MATCHES "^(powerpc|ppc64)")
+  set(PNG_POWERPC_VSX_POSSIBLE_VALUES on off)
+  set(PNG_POWERPC_VSX "on"
+      CACHE STRING "Enable POWERPC VSX optimizations: on|off; on is default")
+  set_property(CACHE PNG_POWERPC_VSX
+               PROPERTY STRINGS ${PNG_POWERPC_VSX_POSSIBLE_VALUES})
+  list(FIND PNG_POWERPC_VSX_POSSIBLE_VALUES ${PNG_POWERPC_VSX} index)
+  if(index EQUAL -1)
+    message(FATAL_ERROR "PNG_POWERPC_VSX must be one of [${PNG_POWERPC_VSX_POSSIBLE_VALUES}]")
+  elseif(NOT PNG_POWERPC_VSX STREQUAL "off")
+    list(APPEND lib_srcs powerpc/powerpc_init.c powerpc/filter_vsx_intrinsics.c)
+    if(PNG_POWERPC_VSX STREQUAL "on")
+      add_definitions(-DPNG_POWERPC_VSX_OPT=2)
+    endif()
+  else()
+    add_definitions(-DPNG_POWERPC_VSX_OPT=0)
+  endif()
+endif()
+
+# Set definitions and sources for Intel.
+if(TARGET_ARCH MATCHES "^(i[3-6]86|x86|AMD64)")
+  set(PNG_INTEL_SSE_POSSIBLE_VALUES on off)
+  set(PNG_INTEL_SSE "on"
+      CACHE STRING "Enable INTEL_SSE optimizations: on|off; on is default")
+  set_property(CACHE PNG_INTEL_SSE
+               PROPERTY STRINGS ${PNG_INTEL_SSE_POSSIBLE_VALUES})
+  list(FIND PNG_INTEL_SSE_POSSIBLE_VALUES ${PNG_INTEL_SSE} index)
+  if(index EQUAL -1)
+    message(FATAL_ERROR "PNG_INTEL_SSE must be one of [${PNG_INTEL_SSE_POSSIBLE_VALUES}]")
+  elseif(NOT PNG_INTEL_SSE STREQUAL "off")
+    list(APPEND lib_srcs intel/intel_init.c intel/filter_sse2_intrinsics.c)
+    if(PNG_INTEL_SSE STREQUAL "on")
+      add_definitions(-DPNG_INTEL_SSE_OPT=1)
+    endif()
+  else()
+    add_definitions(-DPNG_INTEL_SSE_OPT=0)
+  endif()
 endif()
 
-# set definitions and sources for MIPS
-if(";${CPU_BASELINE_FINAL};" MATCHES "MSA")
+# Set definitions and sources for MIPS.
+if(TARGET_ARCH MATCHES "^(mipsel|mips64el)")
+  set(PNG_MIPS_MSA_POSSIBLE_VALUES on off)
+  set(PNG_MIPS_MSA "on"
+      CACHE STRING "Enable MIPS_MSA optimizations: on|off; on is default")
+  set_property(CACHE PNG_MIPS_MSA
+               PROPERTY STRINGS ${PNG_MIPS_MSA_POSSIBLE_VALUES})
+  list(FIND PNG_MIPS_MSA_POSSIBLE_VALUES ${PNG_MIPS_MSA} index_msa)
+  if(index_msa EQUAL -1)
+    message(FATAL_ERROR "PNG_MIPS_MSA must be one of [${PNG_MIPS_MSA_POSSIBLE_VALUES}]")
+  endif()
+
+  set(PNG_MIPS_MMI_POSSIBLE_VALUES on off)
+  set(PNG_MIPS_MMI "on"
+      CACHE STRING "Enable MIPS_MMI optimizations: on|off; on is default")
+  set_property(CACHE PNG_MIPS_MMI
+               PROPERTY STRINGS ${PNG_MIPS_MMI_POSSIBLE_VALUES})
+  list(FIND PNG_MIPS_MMI_POSSIBLE_VALUES ${PNG_MIPS_MMI} index_mmi)
+  if(index_mmi EQUAL -1)
+    message(FATAL_ERROR "PNG_MIPS_MMI must be one of [${PNG_MIPS_MMI_POSSIBLE_VALUES}]")
+  endif()
+
+  if(PNG_MIPS_MSA STREQUAL "on" AND PNG_MIPS_MMI STREQUAL "on")
+    list(APPEND lib_srcs mips/mips_init.c mips/filter_msa_intrinsics.c mips/filter_mmi_inline_assembly.c)
+    add_definitions(-DPNG_MIPS_MSA_OPT=2)
+    add_definitions(-DPNG_MIPS_MMI_OPT=1)
+  elseif(PNG_MIPS_MSA STREQUAL "on")
     list(APPEND lib_srcs mips/mips_init.c mips/filter_msa_intrinsics.c)
     add_definitions(-DPNG_MIPS_MSA_OPT=2)
-    ocv_warnings_disable(CMAKE_C_FLAGS -Wshadow)
-else()
+    add_definitions(-DPNG_MIPS_MMI_OPT=0)
+  elseif(PNG_MIPS_MMI STREQUAL "on")
+    list(APPEND lib_srcs mips/mips_init.c mips/filter_mmi_inline_assembly.c)
     add_definitions(-DPNG_MIPS_MSA_OPT=0)
+    add_definitions(-DPNG_MIPS_MMI_OPT=1)
+    else()
+    add_definitions(-DPNG_MIPS_MSA_OPT=0)
+    add_definitions(-DPNG_MIPS_MMI_OPT=0)
+    endif()
 endif()
 
-if(PPC64LE OR PPC64)
-  # VSX3 features are backwards compatible
-  if(";${CPU_BASELINE_FINAL};" MATCHES "VSX.*"
-      AND NOT PPC64)
-    list(APPEND lib_srcs powerpc/powerpc_init.c powerpc/filter_vsx_intrinsics.c)
-    add_definitions(-DPNG_POWERPC_VSX_OPT=2)
+# Set definitions and sources for LoongArch.
+if(TARGET_ARCH MATCHES "^(loongarch)")
+  include(CheckCCompilerFlag)
+  set(PNG_LOONGARCH_LSX_POSSIBLE_VALUES on off)
+  set(PNG_LOONGARCH_LSX "on"
+      CACHE STRING "Enable LOONGARCH_LSX optimizations: on|off; on is default")
+  set_property(CACHE PNG_LOONGARCH_LSX
+               PROPERTY STRINGS ${PNG_LOONGARCH_LSX_POSSIBLE_VALUES})
+  list(FIND PNG_LOONGARCH_LSX_POSSIBLE_VALUES ${PNG_LOONGARCH_LSX} index)
+  if(index EQUAL -1)
+    message(FATAL_ERROR "PNG_LOONGARCH_LSX must be one of [${PNG_LOONGARCH_LSX_POSSIBLE_VALUES}]")
+  elseif(NOT PNG_LOONGARCH_LSX STREQUAL "off")
+    CHECK_C_COMPILER_FLAG("-mlsx" COMPILER_SUPPORTS_LSX)
+    if(COMPILER_SUPPORTS_LSX)
+      set(libpng_loongarch_sources
+          loongarch/loongarch_lsx_init.c
+          loongarch/filter_lsx_intrinsics.c)
+      set_source_files_properties(${libpng_loongarch_sources}
+                                  PROPERTIES
+                                  COMPILE_FLAGS "-mlsx")
+    list(APPEND lib_srcs ${libpng_loongarch_sources})
+      add_definitions(-DPNG_LOONGARCH_LSX_OPT=1)
+    else()
+      message(FATAL_ERROR "Compiler does not support -mlsx option")
+    endif()
   else()
-    add_definitions(-DPNG_POWERPC_VSX_OPT=0)
+    add_definitions(-DPNG_LOONGARCH_LSX_OPT=0)
   endif()
 endif()
 
+else(PNG_HARDWARE_OPTIMIZATIONS)
+
+# Set definitions and sources for ARM.
+if(TARGET_ARCH MATCHES "^(ARM|arm|aarch)")
+  add_definitions(-DPNG_ARM_NEON_OPT=0)
+endif()
+
+# Set definitions and sources for PowerPC.
+if(TARGET_ARCH MATCHES "^(powerpc|ppc64)")
+  add_definitions(-DPNG_POWERPC_VSX_OPT=0)
+endif()
+
+# Set definitions and sources for Intel.
+if(TARGET_ARCH MATCHES "^(i[3-6]86|x86|AMD64)")
+  add_definitions(-DPNG_INTEL_SSE_OPT=0)
+endif()
+
+# Set definitions and sources for MIPS.
+if(TARGET_ARCH MATCHES "^(mipsel|mips64el)")
+  add_definitions(-DPNG_MIPS_MSA_OPT=0)
+endif()
+
+# Set definitions and sources for LoongArch.
+if(TARGET_ARCH MATCHES "^(loongarch)")
+  add_definitions(-DPNG_LOONGARCH_LSX_OPT=0)
+endif()
+
+endif(PNG_HARDWARE_OPTIMIZATIONS)
+
 # ----------------------------------------------------------------------------------
 #         Define the library target:
 # ----------------------------------------------------------------------------------
diff --git a/3rdparty/libpng/LICENSE b/3rdparty/libpng/LICENSE
index e0c5b531cf54..25f298f0fcfd 100644
--- a/3rdparty/libpng/LICENSE
+++ b/3rdparty/libpng/LICENSE
@@ -4,8 +4,8 @@ COPYRIGHT NOTICE, DISCLAIMER, and LICENSE
 PNG Reference Library License version 2
 ---------------------------------------
 
- * Copyright (c) 1995-2019 The PNG Reference Library Authors.
- * Copyright (c) 2018-2019 Cosmin Truta.
+ * Copyright (c) 1995-2024 The PNG Reference Library Authors.
+ * Copyright (c) 2018-2024 Cosmin Truta.
  * Copyright (c) 2000-2002, 2004, 2006-2018 Glenn Randers-Pehrson.
  * Copyright (c) 1996-1997 Andreas Dilger.
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
diff --git a/3rdparty/libpng/README b/3rdparty/libpng/README
index cfc1f0e3dc97..a6ca3ae9f940 100644
--- a/3rdparty/libpng/README
+++ b/3rdparty/libpng/README
@@ -1,178 +1,179 @@
-README for libpng version 1.6.37 - April 14, 2019
-=================================================
-
-See the note about version numbers near the top of png.h.
-See INSTALL for instructions on how to install libpng.
-
-Libpng comes in several distribution formats.  Get libpng-*.tar.gz or
-libpng-*.tar.xz or if you want UNIX-style line endings in the text
-files, or lpng*.7z or lpng*.zip if you want DOS-style line endings.
-
-Version 0.89 was the first official release of libpng.  Don't let the
-fact that it's the first release fool you.  The libpng library has been
-in extensive use and testing since mid-1995.  By late 1997 it had
-finally gotten to the stage where there hadn't been significant
-changes to the API in some time, and people have a bad feeling about
-libraries with versions < 1.0.  Version 1.0.0 was released in
-March 1998.
-
-****
-Note that some of the changes to the png_info structure render this
-version of the library binary incompatible with libpng-0.89 or
-earlier versions if you are using a shared library.  The type of the
-"filler" parameter for png_set_filler() has changed from png_byte to
-png_uint_32, which will affect shared-library applications that use
-this function.
+README for libpng version 1.6.43
+================================
 
-To avoid problems with changes to the internals of the png info_struct,
-new APIs have been made available in 0.95 to avoid direct application
-access to info_ptr.  These functions are the png_set_<chunk> and
-png_get_<chunk> functions.  These functions should be used when
-accessing/storing the info_struct data, rather than manipulating it
-directly, to avoid such problems in the future.
+See the note about version numbers near the top of `png.h`.
+See `INSTALL` for instructions on how to install libpng.
 
-It is important to note that the APIs did not make current programs
-that access the info struct directly incompatible with the new
-library, through libpng-1.2.x.  In libpng-1.4.x, which was meant to
-be a transitional release, members of the png_struct and the
-info_struct can still be accessed, but the compiler will issue a
-warning about deprecated usage.  Since libpng-1.5.0, direct access
-to these structs is not allowed, and the definitions of the structs
-reside in private pngstruct.h and pnginfo.h header files that are not
-accessible to applications.  It is strongly suggested that new
-programs use the new APIs (as shown in example.c and pngtest.c), and
-older programs be converted to the new format, to facilitate upgrades
-in the future.
-****
-
-Additions since 0.90 include the ability to compile libpng as a
-Windows DLL, and new APIs for accessing data in the info struct.
-Experimental functions include the ability to set weighting and cost
-factors for row filter selection, direct reads of integers from buffers
-on big-endian processors that support misaligned data access, faster
-methods of doing alpha composition, and more accurate 16->8 bit color
-conversion.
+Libpng comes in several distribution formats.  Get `libpng-*.tar.gz`
+or `libpng-*.tar.xz` if you want UNIX-style line endings in the text
+files, or `lpng*.7z` or `lpng*.zip` if you want DOS-style line endings.
 
-The additions since 0.89 include the ability to read from a PNG stream
-which has had some (or all) of the signature bytes read by the calling
-application.  This also allows the reading of embedded PNG streams that
-do not have the PNG file signature.  As well, it is now possible to set
-the library action on the detection of chunk CRC errors.  It is possible
-to set different actions based on whether the CRC error occurred in a
-critical or an ancillary chunk.
+For a detailed description on using libpng, read `libpng-manual.txt`.
+For examples of libpng in a program, see `example.c` and `pngtest.c`.
+For usage information and restrictions (what little they are) on libpng,
+see `png.h`.  For a description on using zlib (the compression library
+used by libpng) and zlib's restrictions, see `zlib.h`.
 
-For a detailed description on using libpng, read libpng-manual.txt.
-For examples of libpng in a program, see example.c and pngtest.c.  For
-usage information and restrictions (what little they are) on libpng,
-see png.h.  For a description on using zlib (the compression library
-used by libpng) and zlib's restrictions, see zlib.h
-
-I have included a general makefile, as well as several machine and
-compiler specific ones, but you may have to modify one for your own
-needs.
-
-You should use zlib 1.0.4 or later to run this, but it MAY work with
+You should use zlib 1.0.4 or later to run this, but it _may_ work with
 versions as old as zlib 0.95.  Even so, there are bugs in older zlib
 versions which can cause the output of invalid compression streams for
 some images.
 
 You should also note that zlib is a compression library that is useful
 for more things than just PNG files.  You can use zlib as a drop-in
-replacement for fread() and fwrite(), if you are so inclined.
+replacement for `fread()` and `fwrite()`, if you are so inclined.
 
 zlib should be available at the same place that libpng is, or at
-https://zlib.net.
+https://zlib.net .
 
 You may also want a copy of the PNG specification.  It is available
 as an RFC, a W3C Recommendation, and an ISO/IEC Standard.  You can find
 these at http://www.libpng.org/pub/png/pngdocs.html .
 
-This code is currently being archived at libpng.sourceforge.io in the
-[DOWNLOAD] area, and at http://libpng.download/src .
+This code is currently being archived at https://libpng.sourceforge.io
+in the download area, and at http://libpng.download/src .
 
 This release, based in a large way on Glenn's, Guy's and Andreas'
 earlier work, was created and will be supported by myself and the PNG
 development group.
 
-Send comments/corrections/commendations to png-mng-implement at
-lists.sourceforge.net (subscription required; visit
+Send comments, corrections and commendations to `png-mng-implement`
+at `lists.sourceforge.net`.  (Subscription is required; visit
 https://lists.sourceforge.net/lists/listinfo/png-mng-implement
-to subscribe).
-
-Send general questions about the PNG specification to png-mng-misc
-at lists.sourceforge.net (subscription required; visit
-https://lists.sourceforge.net/lists/listinfo/png-mng-misc to
-subscribe).
-
-Files in this distribution:
-
-      ANNOUNCE      =>  Announcement of this version, with recent changes
-      AUTHORS       =>  List of contributing authors
-      CHANGES       =>  Description of changes between libpng versions
-      KNOWNBUG      =>  List of known bugs and deficiencies
-      LICENSE       =>  License to use and redistribute libpng
-      README        =>  This file
-      TODO          =>  Things not implemented in the current library
-      TRADEMARK     =>  Trademark information
-      example.c     =>  Example code for using libpng functions
-      libpng.3      =>  manual page for libpng (includes libpng-manual.txt)
-      libpng-manual.txt  =>  Description of libpng and its functions
-      libpngpf.3    =>  manual page for libpng's private functions
-      png.5         =>  manual page for the PNG format
-      png.c         =>  Basic interface functions common to library
-      png.h         =>  Library function and interface declarations (public)
-      pngpriv.h     =>  Library function and interface declarations (private)
-      pngconf.h     =>  System specific library configuration (public)
-      pngstruct.h   =>  png_struct declaration (private)
-      pnginfo.h     =>  png_info struct declaration (private)
-      pngdebug.h    =>  debugging macros (private)
-      pngerror.c    =>  Error/warning message I/O functions
-      pngget.c      =>  Functions for retrieving info from struct
-      pngmem.c      =>  Memory handling functions
-      pngbar.png    =>  PNG logo, 88x31
-      pngnow.png    =>  PNG logo, 98x31
-      pngpread.c    =>  Progressive reading functions
-      pngread.c     =>  Read data/helper high-level functions
-      pngrio.c      =>  Lowest-level data read I/O functions
-      pngrtran.c    =>  Read data transformation functions
-      pngrutil.c    =>  Read data utility functions
-      pngset.c      =>  Functions for storing data into the info_struct
-      pngtest.c     =>  Library test program
-      pngtest.png   =>  Library test sample image
-      pngtrans.c    =>  Common data transformation functions
-      pngwio.c      =>  Lowest-level write I/O functions
-      pngwrite.c    =>  High-level write functions
-      pngwtran.c    =>  Write data transformations
-      pngwutil.c    =>  Write utility functions
-      arm           =>  Contains optimized code for the ARM platform
-      powerpc       =>  Contains optimized code for the PowerPC platform
-      contrib       =>  Contributions
-       arm-neon         =>  Optimized code for ARM-NEON platform
-       powerpc-vsx      =>  Optimized code for POWERPC-VSX platform
-       examples         =>  Example programs
-       gregbook         =>  source code for PNG reading and writing, from
-                            Greg Roelofs' "PNG: The Definitive Guide",
-                            O'Reilly, 1999
-       libtests         =>  Test programs
-       mips-msa         =>  Optimized code for MIPS-MSA platform
-       pngminim         =>  Minimal decoder, encoder, and progressive decoder
-                            programs demonstrating use of pngusr.dfa
-       pngminus         =>  Simple pnm2png and png2pnm programs
-       pngsuite         =>  Test images
-       testpngs
-       tools            =>  Various tools
-       visupng          =>  Contains a MSVC workspace for VisualPng
-      intel             =>  Optimized code for INTEL-SSE2 platform
-      mips              =>  Optimized code for MIPS platform
-      projects      =>  Contains project files and workspaces for
-                        building a DLL
-       owatcom          =>  Contains a WATCOM project for building libpng
-       visualc71        =>  Contains a Microsoft Visual C++ (MSVC)
-                            workspace for building libpng and zlib
-       vstudio          =>  Contains a Microsoft Visual C++ (MSVC)
-                            workspace for building libpng and zlib
-      scripts       =>  Directory containing scripts for building libpng:
-                            (see scripts/README.txt for the list of scripts)
+to subscribe.)
+
+Send general questions about the PNG specification to `png-mng-misc`
+at `lists.sourceforge.net`.  (Subscription is required; visit
+https://lists.sourceforge.net/lists/listinfo/png-mng-misc
+to subscribe.)
+
+Historical notes
+----------------
+
+The libpng library has been in extensive use and testing since mid-1995.
+Version 0.89, published a year later, was the first official release.
+By late 1997, it had finally gotten to the stage where there hadn't
+been significant changes to the API in some time, and people have a bad
+feeling about libraries with versions below 1.0.  Version 1.0.0 was
+released in March 1998.
+
+Note that some of the changes to the `png_info` structure render this
+version of the library binary incompatible with libpng-0.89 or
+earlier versions if you are using a shared library.  The type of the
+`filler` parameter for `png_set_filler()` has changed from `png_byte`
+to `png_uint_32`, which will affect shared-library applications that
+use this function.
+
+To avoid problems with changes to the internals of the `info_struct`,
+new APIs have been made available in 0.95 to avoid direct application
+access to `info_ptr`.  These functions are the `png_set_<chunk>` and
+`png_get_<chunk>` functions.  These functions should be used when
+accessing/storing the `info_struct` data, rather than manipulating it
+directly, to avoid such problems in the future.
+
+It is important to note that the APIs did not make current programs
+that access the info struct directly incompatible with the new
+library, through libpng-1.2.x.  In libpng-1.4.x, which was meant to
+be a transitional release, members of the `png_struct` and the
+`info_struct` can still be accessed, but the compiler will issue a
+warning about deprecated usage.  Since libpng-1.5.0, direct access
+to these structs is not allowed, and the definitions of the structs
+reside in private `pngstruct.h` and `pnginfo.h` header files that are
+not accessible to applications.  It is strongly suggested that new
+programs use the new APIs (as shown in `example.c` and `pngtest.c`),
+and older programs be converted to the new format, to facilitate
+upgrades in the future.
+
+The additions since 0.89 include the ability to read from a PNG stream
+which has had some (or all) of the signature bytes read by the calling
+application.  This also allows the reading of embedded PNG streams that
+do not have the PNG file signature.  As well, it is now possible to set
+the library action on the detection of chunk CRC errors.  It is possible
+to set different actions based on whether the CRC error occurred in a
+critical or an ancillary chunk.
+
+The additions since 0.90 include the ability to compile libpng as a
+Windows DLL, and new APIs for accessing data in the `info_struct`.
+Experimental functions included the ability to set weighting and cost
+factors for row filter selection, direct reads of integers from buffers
+on big-endian processors that support misaligned data access, faster
+methods of doing alpha composition, and more accurate 16-to-8 bit color
+conversion.  Some of these experimental functions, such as the weighted
+filter heuristics, have since been removed.
+
+Files included in this distribution
+-----------------------------------
+
+    ANNOUNCE      =>  Announcement of this version, with recent changes
+    AUTHORS       =>  List of contributing authors
+    CHANGES       =>  Description of changes between libpng versions
+    INSTALL       =>  Instructions to install libpng
+    LICENSE       =>  License to use and redistribute libpng
+    README        =>  This file
+    TODO          =>  Things not implemented in the current library
+    TRADEMARK     =>  Trademark information
+    example.c     =>  Example code for using libpng functions
+    libpng.3      =>  Manual page for libpng (includes libpng-manual.txt)
+    libpng-manual.txt  =>  Description of libpng and its functions
+    libpngpf.3    =>  Manual page for libpng's private functions (deprecated)
+    png.5         =>  Manual page for the PNG format
+    png.c         =>  Basic interface functions common to library
+    png.h         =>  Library function and interface declarations (public)
+    pngpriv.h     =>  Library function and interface declarations (private)
+    pngconf.h     =>  System specific library configuration (public)
+    pngstruct.h   =>  png_struct declaration (private)
+    pnginfo.h     =>  png_info struct declaration (private)
+    pngdebug.h    =>  debugging macros (private)
+    pngerror.c    =>  Error/warning message I/O functions
+    pngget.c      =>  Functions for retrieving info from struct
+    pngmem.c      =>  Memory handling functions
+    pngbar.png    =>  PNG logo, 88x31
+    pngnow.png    =>  PNG logo, 98x31
+    pngpread.c    =>  Progressive reading functions
+    pngread.c     =>  Read data/helper high-level functions
+    pngrio.c      =>  Lowest-level data read I/O functions
+    pngrtran.c    =>  Read data transformation functions
+    pngrutil.c    =>  Read data utility functions
+    pngset.c      =>  Functions for storing data into the info_struct
+    pngtest.c     =>  Library test program
+    pngtest.png   =>  Library test sample image
+    pngtrans.c    =>  Common data transformation functions
+    pngwio.c      =>  Lowest-level write I/O functions
+    pngwrite.c    =>  High-level write functions
+    pngwtran.c    =>  Write data transformations
+    pngwutil.c    =>  Write utility functions
+    arm/          =>  Optimized code for ARM Neon
+    intel/        =>  Optimized code for INTEL SSE2
+    loongarch/    =>  Optimized code for LoongArch LSX
+    mips/         =>  Optimized code for MIPS MSA and MIPS MMI
+    powerpc/      =>  Optimized code for PowerPC VSX
+    ci/           =>  Scripts for continuous integration
+    contrib/      =>  External contributions
+        arm-neon/     =>  Optimized code for the ARM-NEON platform
+        mips-msa/     =>  Optimized code for the MIPS-MSA platform
+        powerpc-vsx/  =>  Optimized code for the POWERPC-VSX platform
+        examples/     =>  Examples of libpng usage
+        gregbook/     =>  Source code for PNG reading and writing, from
+                          "PNG: The Definitive Guide" by Greg Roelofs,
+                          O'Reilly, 1999
+        libtests/     =>  Test programs
+        oss-fuzz/     =>  Files used by the OSS-Fuzz project for fuzz-testing
+                          libpng
+        pngexif/      =>  Program to inspect the EXIF information in PNG files
+        pngminim/     =>  Minimal decoder, encoder, and progressive decoder
+                          programs demonstrating the use of pngusr.dfa
+        pngminus/     =>  Simple pnm2png and png2pnm programs
+        pngsuite/     =>  Test images
+        testpngs/     =>  Test images
+        tools/        =>  Various tools
+        visupng/      =>  VisualPng, a Windows viewer for PNG images
+    projects/     =>  Project files and workspaces for various IDEs
+        owatcom/      =>  OpenWatcom project
+        visualc71/    =>  Microsoft Visual C++ 7.1 workspace
+        vstudio/      =>  Microsoft Visual Studio workspace
+    scripts/      =>  Scripts and makefiles for building libpng
+                      (see scripts/README.txt for the complete list)
+    tests/        =>  Test scripts
 
 Good luck, and happy coding!
 
diff --git a/3rdparty/libpng/arm/arm_init.c b/3rdparty/libpng/arm/arm_init.c
index a34ecdbef735..84d05556f816 100644
--- a/3rdparty/libpng/arm/arm_init.c
+++ b/3rdparty/libpng/arm/arm_init.c
@@ -1,7 +1,7 @@
 
 /* arm_init.c - NEON optimised filter functions
  *
- * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2018-2022 Cosmin Truta
  * Copyright (c) 2014,2016 Glenn Randers-Pehrson
  * Written by Mans Rullgard, 2011.
  *
@@ -10,9 +10,7 @@
  * and license in png.h
  */
 
-/* Below, after checking __linux__, various non-C90 POSIX 1003.1 functions are
- * called.
- */
+/* This module requires POSIX 1003.1 functions. */
 #define _POSIX_SOURCE 1
 
 #include "../pngpriv.h"
@@ -33,21 +31,26 @@
  * has partial support is contrib/arm-neon/linux.c - a generic Linux
  * implementation which reads /proc/cpufino.
  */
+#include <signal.h> /* for sig_atomic_t */
+
 #ifndef PNG_ARM_NEON_FILE
-#  ifdef __linux__
-#     define PNG_ARM_NEON_FILE "contrib/arm-neon/linux.c"
+#  if defined(__aarch64__) || defined(_M_ARM64)
+     /* ARM Neon is expected to be unconditionally available on ARM64. */
+#    error "PNG_ARM_NEON_CHECK_SUPPORTED must not be defined on ARM64"
+#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+     /* ARM Neon is expected to be available on the target CPU architecture. */
+#    error "PNG_ARM_NEON_CHECK_SUPPORTED must not be defined on this CPU arch"
+#  elif defined(__linux__)
+#    define PNG_ARM_NEON_FILE "contrib/arm-neon/linux.c"
+#  else
+#    error "No support for run-time ARM Neon checking; use compile-time options"
 #  endif
 #endif
 
-#ifdef PNG_ARM_NEON_FILE
-
-#include <signal.h> /* for sig_atomic_t */
 static int png_have_neon(png_structp png_ptr);
-#include PNG_ARM_NEON_FILE
-
-#else  /* PNG_ARM_NEON_FILE */
-#  error "PNG_ARM_NEON_FILE undefined: no support for run-time ARM NEON checks"
-#endif /* PNG_ARM_NEON_FILE */
+#ifdef PNG_ARM_NEON_FILE
+#  include PNG_ARM_NEON_FILE
+#endif
 #endif /* PNG_ARM_NEON_CHECK_SUPPORTED */
 
 #ifndef PNG_ALIGNED_MEMORY_SUPPORTED
diff --git a/3rdparty/libpng/arm/filter_neon_intrinsics.c b/3rdparty/libpng/arm/filter_neon_intrinsics.c
index 553c0be21c10..4466d48b20a5 100644
--- a/3rdparty/libpng/arm/filter_neon_intrinsics.c
+++ b/3rdparty/libpng/arm/filter_neon_intrinsics.c
@@ -18,7 +18,7 @@
 /* This code requires -mfpu=neon on the command line: */
 #if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
 
-#if defined(_MSC_VER) && defined(_M_ARM64)
+#if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64)
 #  include <arm64_neon.h>
 #else
 #  include <arm_neon.h>
diff --git a/3rdparty/libpng/arm/palette_neon_intrinsics.c b/3rdparty/libpng/arm/palette_neon_intrinsics.c
index b4d1fd2abfa2..92c7d6f9f6f9 100644
--- a/3rdparty/libpng/arm/palette_neon_intrinsics.c
+++ b/3rdparty/libpng/arm/palette_neon_intrinsics.c
@@ -14,7 +14,7 @@
 
 #if PNG_ARM_NEON_IMPLEMENTATION == 1
 
-#if defined(_MSC_VER) && defined(_M_ARM64)
+#if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64)
 #  include <arm64_neon.h>
 #else
 #  include <arm_neon.h>
@@ -30,8 +30,6 @@ png_riffle_palette_neon(png_structrp png_ptr)
    int num_trans = png_ptr->num_trans;
    int i;
 
-   png_debug(1, "in png_riffle_palette_neon");
-
    /* Initially black, opaque. */
    uint8x16x4_t w = {{
       vdupq_n_u8(0x00),
@@ -40,6 +38,8 @@ png_riffle_palette_neon(png_structrp png_ptr)
       vdupq_n_u8(0xff),
    }};
 
+   png_debug(1, "in png_riffle_palette_neon");
+
    /* First, riffle the RGB colours into an RGBA8 palette.
     * The alpha component is set to opaque for now.
     */
@@ -65,11 +65,12 @@ png_do_expand_palette_rgba8_neon(png_structrp png_ptr, png_row_infop row_info,
    png_uint_32 row_width = row_info->width;
    const png_uint_32 *riffled_palette =
       (const png_uint_32 *)png_ptr->riffled_palette;
-   const png_int_32 pixels_per_chunk = 4;
-   int i;
+   const png_uint_32 pixels_per_chunk = 4;
+   png_uint_32 i;
 
    png_debug(1, "in png_do_expand_palette_rgba8_neon");
 
+   PNG_UNUSED(row)
    if (row_width < pixels_per_chunk)
       return 0;
 
@@ -109,10 +110,11 @@ png_do_expand_palette_rgb8_neon(png_structrp png_ptr, png_row_infop row_info,
    png_uint_32 row_width = row_info->width;
    png_const_bytep palette = (png_const_bytep)png_ptr->palette;
    const png_uint_32 pixels_per_chunk = 8;
-   int i;
+   png_uint_32 i;
 
    png_debug(1, "in png_do_expand_palette_rgb8_neon");
 
+   PNG_UNUSED(row)
    if (row_width <= pixels_per_chunk)
       return 0;
 
diff --git a/3rdparty/libpng/intel/filter_sse2_intrinsics.c b/3rdparty/libpng/intel/filter_sse2_intrinsics.c
index f52aaa800a43..d3c0fe9e2d68 100644
--- a/3rdparty/libpng/intel/filter_sse2_intrinsics.c
+++ b/3rdparty/libpng/intel/filter_sse2_intrinsics.c
@@ -259,7 +259,7 @@ void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
       a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
 
       /* (p-a) == (a+b-c - a) == (b-c) */
-   
+
       pa = _mm_sub_epi16(b,c);
 
       /* (p-b) == (a+b-c - b) == (a-c) */
diff --git a/3rdparty/libpng/loongarch/filter_lsx_intrinsics.c b/3rdparty/libpng/loongarch/filter_lsx_intrinsics.c
new file mode 100644
index 000000000000..af6cc763a078
--- /dev/null
+++ b/3rdparty/libpng/loongarch/filter_lsx_intrinsics.c
@@ -0,0 +1,412 @@
+/* filter_lsx_intrinsics.c - LSX optimized filter functions
+ *
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * All rights reserved.
+ * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2016 Glenn Randers-Pehrson
+ * Contributed by Jin Bo (jinbo@loongson.cn)
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ */
+
+#include "../pngpriv.h"
+
+#ifdef PNG_READ_SUPPORTED
+
+#if PNG_LOONGARCH_LSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
+
+#include <lsxintrin.h>
+
+#define LSX_LD(psrc) __lsx_vld((psrc), 0)
+
+#define LSX_LD_2(psrc, stride, out0, out1) \
+{                                          \
+   out0 = LSX_LD(psrc);                    \
+   out1 = LSX_LD(psrc + stride);           \
+}
+
+#define LSX_LD_4(psrc, stride, out0, out1, out2, out3) \
+{                                                      \
+   LSX_LD_2(psrc, stride, out0, out1);                 \
+   LSX_LD_2(psrc + stride * 2, stride, out2, out3);    \
+}
+
+#define LSX_ST(in, pdst) __lsx_vst(in, (pdst), 0)
+
+#define LSX_ST_2(in0, in1, pdst, stride) \
+{                                        \
+   LSX_ST(in0, pdst);                    \
+   LSX_ST(in1, pdst + stride);           \
+}
+
+#define LSX_ST_4(in0, in1, in2, in3, pdst, stride) \
+{                                                  \
+   LSX_ST_2(in0, in1, pdst, stride);               \
+   LSX_ST_2(in2, in3, pdst + stride * 2, stride);  \
+}
+
+#define LSX_ADD_B(in0, in1, out0) \
+{                                 \
+   out0 = __lsx_vadd_b(in0, in1); \
+}
+
+#define LSX_ADD_B_2(in0, in1, in2, in3, out0, out1) \
+{                                                   \
+   LSX_ADD_B(in0, in1, out0);                       \
+   LSX_ADD_B(in2, in3, out1);                       \
+}
+
+#define LSX_ADD_B_4(in0, in1, in2, in3, in4, in5,     \
+                    in6, in7, out0, out1, out2, out3) \
+{                                                     \
+   LSX_ADD_B_2(in0, in1, in2, in3, out0, out1);       \
+   LSX_ADD_B_2(in4, in5, in6, in7, out2, out3);       \
+}
+
+#define LSX_ABS_B_3(in0, in1, in2, out0, out1, out2) \
+{                                                    \
+   out0 = __lsx_vadda_h(in0, zero);                  \
+   out1 = __lsx_vadda_h(in1, zero);                  \
+   out2 = __lsx_vadda_h(in2, zero);                  \
+}
+
+#define LSX_ILVL_B(in_h, in_l, out0)  \
+{                                     \
+   out0 = __lsx_vilvl_b(in_h, in_l);  \
+}
+
+#define LSX_ILVL_B_2(in0_h, in0_l, in1_h, in1_l, out0, out1) \
+{                                                            \
+   LSX_ILVL_B(in0_h, in0_l, out0);                           \
+   LSX_ILVL_B(in1_h, in1_l, out1);                           \
+}
+
+#define LSX_HSUB_HU_BU_2(in0, in1, out0, out1) \
+{                                              \
+   out0 = __lsx_vhsubw_hu_bu(in0, in0);        \
+   out1 = __lsx_vhsubw_hu_bu(in1, in1);        \
+}
+
+#define LSX_CMP_PICK_SMALLER(in0, in1, in2, in3, in4, in5, out0) \
+{                                                                \
+   __m128i _cmph, _cmpb, _in0, _in3;                             \
+   _cmph = __lsx_vslt_h(in1, in0);                               \
+   _cmpb = __lsx_vpickev_b(_cmph, _cmph);                        \
+   _in0  = __lsx_vmin_bu(in0,in1);                               \
+   _in3  = __lsx_vbitsel_v(in3, in4, _cmpb);                     \
+   _cmph = __lsx_vslt_h(in2, _in0);                              \
+   _cmpb = __lsx_vpickev_b(_cmph, _cmph);                        \
+   _in3  = __lsx_vbitsel_v(_in3, in5, _cmpb);                    \
+   out0  = __lsx_vadd_b(out0, _in3);                             \
+}
+
+void png_read_filter_row_up_lsx(png_row_infop row_info, png_bytep row,
+                                png_const_bytep prev_row)
+{
+   size_t n = row_info->rowbytes;
+   png_bytep rp = row;
+   png_const_bytep pp = prev_row;
+   __m128i vec_0, vec_1, vec_2, vec_3;
+   __m128i vec_4, vec_5, vec_6, vec_7;
+
+   while (n >= 64)
+   {
+      LSX_LD_4(rp, 16, vec_0, vec_1, vec_2, vec_3);
+      LSX_LD_4(pp, 16, vec_4, vec_5, vec_6, vec_7);
+      pp += 64;
+      LSX_ADD_B_4(vec_0 ,vec_4, vec_1, vec_5, vec_2, vec_6,
+                  vec_3, vec_7, vec_0, vec_1, vec_2, vec_3);
+      LSX_ST_4(vec_0, vec_1, vec_2, vec_3, rp, 16);
+      rp += 64;
+      n -= 64;
+   }
+   if (n & 63)
+   {
+      if (n >= 32)
+      {
+         LSX_LD_2(rp, 16, vec_0, vec_1);
+         LSX_LD_2(pp, 16, vec_2, vec_3);
+         pp += 32;
+         LSX_ADD_B_2(vec_0, vec_2, vec_1, vec_3, vec_0, vec_1);
+         LSX_ST_2(vec_0, vec_1, rp, 16);
+         rp += 32;
+         n -= 32;
+      }
+      if (n & 31)
+      {
+         if (n >= 16)
+         {
+            vec_0 = LSX_LD(rp);
+            vec_1 = LSX_LD(pp);
+            pp += 16;
+            LSX_ADD_B(vec_0, vec_1, vec_0);
+            LSX_ST(vec_0, rp);
+            rp += 16;
+            n -= 16;
+         }
+         if (n >= 8)
+         {
+            vec_0 = __lsx_vldrepl_d(rp, 0);
+            vec_1 = __lsx_vldrepl_d(pp, 0);
+            vec_0 = __lsx_vadd_b(vec_0, vec_1);
+            __lsx_vstelm_d(vec_0, rp, 0, 0);
+            rp += 8;
+            pp += 8;
+            n -= 8;
+         }
+         while (n--)
+         {
+            *rp = *rp + *pp++;
+            rp++;
+         }
+      }
+   }
+}
+
+void png_read_filter_row_sub3_lsx(png_row_infop row_info, png_bytep row,
+                                  png_const_bytep prev_row)
+{
+   size_t n = row_info->rowbytes;
+   png_uint_32 tmp;
+   png_bytep nxt = row;
+   __m128i vec_0, vec_1;
+
+   PNG_UNUSED(prev_row);
+
+   vec_0 = __lsx_vldrepl_w(nxt, 0);
+   nxt += 3;
+   n -= 3;
+
+   while (n >= 3)
+   {
+      vec_1 = __lsx_vldrepl_w(nxt, 0);
+      vec_1 = __lsx_vadd_b(vec_1, vec_0);
+      __lsx_vstelm_h(vec_1, nxt, 0, 0);
+      vec_0 = vec_1;
+      nxt += 2;
+      __lsx_vstelm_b(vec_1, nxt, 0, 2);
+      nxt += 1;
+      n -= 3;
+   }
+
+   row = nxt - 3;
+   while (n--)
+   {
+      *nxt = *nxt + *row++;
+      nxt++;
+   }
+}
+
+void png_read_filter_row_sub4_lsx(png_row_infop row_info, png_bytep row,
+                                  png_const_bytep prev_row)
+{
+   size_t n = row_info->rowbytes;
+   __m128i vec_0, vec_1;
+
+   PNG_UNUSED(prev_row);
+
+   vec_0 = __lsx_vldrepl_w(row, 0);
+   row += 4;
+   n -= 4;
+
+   while (n >= 4)
+   {
+      vec_1 = __lsx_vldrepl_w(row, 0);
+      vec_1 = __lsx_vadd_b(vec_1, vec_0);
+      __lsx_vstelm_w(vec_1, row, 0, 0);
+      vec_0 = vec_1;
+      row += 4;
+      n -= 4;
+   }
+}
+
+void png_read_filter_row_avg3_lsx(png_row_infop row_info, png_bytep row,
+                                  png_const_bytep prev_row)
+{
+   size_t n = row_info->rowbytes;
+   png_bytep nxt = row;
+   png_const_bytep prev_nxt = prev_row;
+   __m128i vec_0, vec_1, vec_2;
+
+   vec_0 = __lsx_vldrepl_w(nxt, 0);
+   vec_1 = __lsx_vldrepl_w(prev_nxt, 0);
+   prev_nxt += 3;
+   vec_1 = __lsx_vsrli_b(vec_1, 1);
+   vec_1 = __lsx_vadd_b(vec_1, vec_0);
+   __lsx_vstelm_h(vec_1, nxt, 0, 0);
+   nxt += 2;
+   __lsx_vstelm_b(vec_1, nxt, 0, 2);
+   nxt += 1;
+   n -= 3;
+
+   while (n >= 3)
+   {
+      vec_2 = vec_1;
+      vec_0 = __lsx_vldrepl_w(nxt, 0);
+      vec_1 = __lsx_vldrepl_w(prev_nxt, 0);
+      prev_nxt += 3;
+
+      vec_1 = __lsx_vavg_bu(vec_1, vec_2);
+      vec_1 = __lsx_vadd_b(vec_1, vec_0);
+
+      __lsx_vstelm_h(vec_1, nxt, 0, 0);
+      nxt += 2;
+      __lsx_vstelm_b(vec_1, nxt, 0, 2);
+      nxt += 1;
+      n -= 3;
+   }
+
+   row = nxt - 3;
+   while (n--)
+   {
+      vec_2 = __lsx_vldrepl_b(row, 0);
+      row++;
+      vec_0 = __lsx_vldrepl_b(nxt, 0);
+      vec_1 = __lsx_vldrepl_b(prev_nxt, 0);
+      prev_nxt++;
+
+      vec_1 = __lsx_vavg_bu(vec_1, vec_2);
+      vec_1 = __lsx_vadd_b(vec_1, vec_0);
+
+      __lsx_vstelm_b(vec_1, nxt, 0, 0);
+      nxt++;
+   }
+}
+
+void png_read_filter_row_avg4_lsx(png_row_infop row_info, png_bytep row,
+                                  png_const_bytep prev_row)
+{
+   size_t n = row_info->rowbytes;
+   __m128i vec_0, vec_1, vec_2;
+
+   vec_0 = __lsx_vldrepl_w(row, 0);
+   vec_1 = __lsx_vldrepl_w(prev_row, 0);
+   prev_row += 4;
+   vec_1 = __lsx_vsrli_b(vec_1, 1);
+   vec_1 = __lsx_vadd_b(vec_1, vec_0);
+   __lsx_vstelm_w(vec_1, row, 0, 0);
+   row += 4;
+   n -= 4;
+
+   while (n >= 4)
+   {
+      vec_2 = vec_1;
+      vec_0 = __lsx_vldrepl_w(row, 0);
+      vec_1 = __lsx_vldrepl_w(prev_row, 0);
+      prev_row += 4;
+
+      vec_1 = __lsx_vavg_bu(vec_1, vec_2);
+      vec_1 = __lsx_vadd_b(vec_1, vec_0);
+
+      __lsx_vstelm_w(vec_1, row, 0, 0);
+      row += 4;
+      n -= 4;
+   }
+}
+
+void png_read_filter_row_paeth3_lsx(png_row_infop row_info,
+                                    png_bytep row,
+                                    png_const_bytep prev_row)
+{
+   size_t n = row_info->rowbytes;
+   png_bytep nxt = row;
+   png_const_bytep prev_nxt = prev_row;
+   __m128i vec_a, vec_b, vec_c, vec_d;
+   __m128i vec_pa, vec_pb, vec_pc;
+   __m128i zero = {0};
+
+   vec_a = __lsx_vldrepl_w(nxt, 0);
+   vec_b = __lsx_vldrepl_w(prev_nxt, 0);
+   prev_nxt += 3;
+   vec_d = __lsx_vadd_b(vec_a, vec_b);
+   __lsx_vstelm_h(vec_d, nxt, 0, 0);
+   nxt += 2;
+   __lsx_vstelm_b(vec_d, nxt, 0, 2);
+   nxt += 1;
+   n -= 3;
+
+   while (n >= 3)
+   {
+      vec_a = vec_d;
+      vec_c = vec_b;
+      vec_b = __lsx_vldrepl_w(prev_nxt, 0);
+      prev_nxt += 3;
+      vec_d = __lsx_vldrepl_w(nxt, 0);
+
+      LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
+      LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
+      vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
+      LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
+      LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);
+
+      __lsx_vstelm_h(vec_d, nxt, 0, 0);
+      nxt += 2;
+      __lsx_vstelm_b(vec_d, nxt, 0, 2);
+      nxt += 1;
+      n -= 3;
+   }
+
+   prev_row = prev_nxt - 3;
+   row = nxt - 3;
+   while (n--)
+   {
+      vec_a = __lsx_vldrepl_b(row, 0);
+      row++;
+      vec_b = __lsx_vldrepl_b(prev_nxt, 0);
+      prev_nxt++;
+      vec_c = __lsx_vldrepl_b(prev_row, 0);
+      prev_row++;
+      vec_d = __lsx_vldrepl_b(nxt, 0);
+
+      LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
+      LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
+      vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
+      LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
+      LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);
+
+      __lsx_vstelm_b(vec_d, nxt, 0, 0);
+      nxt++;
+   }
+}
+
+void png_read_filter_row_paeth4_lsx(png_row_infop row_info,
+                                    png_bytep row,
+                                    png_const_bytep prev_row)
+{
+   size_t n = row_info->rowbytes;
+   __m128i vec_a, vec_b, vec_c, vec_d;
+   __m128i vec_pa, vec_pb, vec_pc;
+   __m128i zero = {0};
+
+   vec_a = __lsx_vldrepl_w(row, 0);
+   vec_b = __lsx_vldrepl_w(prev_row, 0);
+   prev_row += 4;
+   vec_d = __lsx_vadd_b(vec_a, vec_b);
+   __lsx_vstelm_w(vec_d, row, 0, 0);
+   row += 4;
+   n -= 4;
+
+   while (n >= 4)
+   {
+      vec_a = vec_d;
+      vec_c = vec_b;
+      vec_b = __lsx_vldrepl_w(prev_row, 0);
+      prev_row += 4;
+      vec_d = __lsx_vldrepl_w(row, 0);
+
+      LSX_ILVL_B_2(vec_b, vec_c, vec_a, vec_c, vec_pa, vec_pb);
+      LSX_HSUB_HU_BU_2(vec_pa, vec_pb, vec_pa, vec_pb);
+      vec_pc = __lsx_vadd_h(vec_pa, vec_pb);
+      LSX_ABS_B_3(vec_pa, vec_pb, vec_pc, vec_pa, vec_pb, vec_pc);
+      LSX_CMP_PICK_SMALLER(vec_pa, vec_pb, vec_pc, vec_a, vec_b, vec_c, vec_d);
+
+      __lsx_vstelm_w(vec_d, row, 0, 0);
+      row += 4;
+      n -= 4;
+   }
+}
+
+#endif /* PNG_LOONGARCH_LSX_IMPLEMENTATION == 1 (intrinsics) */
+#endif /* PNG_READ_SUPPORTED */
diff --git a/3rdparty/libpng/loongarch/loongarch_lsx_init.c b/3rdparty/libpng/loongarch/loongarch_lsx_init.c
new file mode 100644
index 000000000000..2c80fe81b687
--- /dev/null
+++ b/3rdparty/libpng/loongarch/loongarch_lsx_init.c
@@ -0,0 +1,65 @@
+/* loongarch_lsx_init.c - LSX optimized filter functions
+ *
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * All rights reserved.
+ * Contributed by Jin Bo <jinbo@loongson.cn>
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ */
+
+#include "../pngpriv.h"
+
+#ifdef PNG_READ_SUPPORTED
+#if PNG_LOONGARCH_LSX_IMPLEMENTATION == 1
+
+#include <sys/auxv.h>
+
+#define LA_HWCAP_LSX    (1<<4)
+static int png_has_lsx(void)
+{
+    int flags = 0;
+    int flag  = (int)getauxval(AT_HWCAP);
+
+    if (flag & LA_HWCAP_LSX)
+        return 1;
+
+    return 0;
+}
+
+void
+png_init_filter_functions_lsx(png_structp pp, unsigned int bpp)
+{
+   /* IMPORTANT: any new external functions used here must be declared using
+    * PNG_INTERNAL_FUNCTION in ../pngpriv.h.  This is required so that the
+    * 'prefix' option to configure works:
+    *
+    *    ./configure --with-libpng-prefix=foobar_
+    *
+    * Verify you have got this right by running the above command, doing a build
+    * and examining pngprefix.h; it must contain a #define for every external
+    * function you add.  (Notice that this happens automatically for the
+    * initialization function.)
+    */
+
+   if (png_has_lsx())
+   {
+      pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_lsx;
+      if (bpp == 3)
+      {
+         pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_lsx;
+         pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_lsx;
+         pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_lsx;
+      }
+      else if (bpp == 4)
+      {
+         pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_lsx;
+         pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_lsx;
+         pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_lsx;
+      }
+   }
+}
+
+#endif /* PNG_LOONGARCH_LSX_IMPLEMENTATION == 1 */
+#endif /* PNG_READ_SUPPORTED */
diff --git a/3rdparty/libpng/mips/filter_mmi_inline_assembly.c b/3rdparty/libpng/mips/filter_mmi_inline_assembly.c
new file mode 100644
index 000000000000..b330a4653810
--- /dev/null
+++ b/3rdparty/libpng/mips/filter_mmi_inline_assembly.c
@@ -0,0 +1,525 @@
+/* filter_mmi_intrinsics.c - MMI optimized filter functions
+ *
+ * Copyright (c) 2024 Cosmin Truta
+ * Written by zhanglixia and guxiwei, 2023
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ */
+
+#include "../pngpriv.h"
+
+#ifdef PNG_READ_SUPPORTED
+
+#if PNG_MIPS_MMI_IMPLEMENTATION == 2 /* Inline Assembly */
+
+/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
+ * They're positioned like this:
+ *    prev:  c b
+ *    row:   a d
+ * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
+ * whichever of a, b, or c is closest to p=a+b-c.
+ */
+
+void png_read_filter_row_up_mmi(png_row_infop row_info, png_bytep row,
+                                png_const_bytep prev_row)
+{
+   int istop = row_info->rowbytes;
+   double rp,pp;
+   __asm__ volatile (
+       "1:                                          \n\t"
+       "ldc1   %[rp],       0x00(%[row])            \n\t"
+       "ldc1   %[pp],       0x00(%[prev_row])       \n\t"
+       "paddb  %[rp],       %[rp],            %[pp] \n\t"
+       "sdc1   %[rp],       0x00(%[row])            \n\t"
+
+       "daddiu %[row],      %[row],           0x08  \n\t"
+       "daddiu %[prev_row], %[prev_row],      0x08  \n\t"
+       "daddiu %[istop],    %[istop],        -0x08  \n\t"
+       "bgtz   %[istop],    1b                      \n\t"
+       : [rp]"=&f"(rp), [pp]"=&f"(pp)
+       : [row]"r"(row), [prev_row]"r"(prev_row),
+         [istop]"r"(istop)
+       : "memory"
+   );
+}
+
+void png_read_filter_row_sub3_mmi(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev)
+{
+   int istop = row_info->rowbytes;
+   double rp, pp, dest;
+   double eight, sixteen, twenty_four, forty_eight;
+   double tmp0;
+   double ftmp[2];
+
+   __asm__ volatile (
+        "li         %[tmp0],    0x08                          \n\t"
+        "dmtc1      %[tmp0],    %[eight]                      \n\t"
+        "li         %[tmp0],    0x10                          \n\t"
+        "dmtc1      %[tmp0],    %[sixteen]                    \n\t"
+        "li         %[tmp0],    0x18                          \n\t"
+        "dmtc1      %[tmp0],    %[twenty_four]                \n\t"
+        "li         %[tmp0],    0x30                          \n\t"
+        "dmtc1      %[tmp0],    %[forty_eight]                \n\t"
+        "xor        %[dest],    %[dest],       %[dest]        \n\t"
+
+        "1:                                                   \n\t"
+        "gsldrc1    %[rp],      0x00(%[row])                  \n\t"
+        "gsldlc1    %[rp],      0x07(%[row])                  \n\t"
+        "gsldrc1    %[pp],      0x08(%[row])                  \n\t"
+        "gsldlc1    %[pp],      0x0f(%[row])                  \n\t"
+
+        "paddb      %[ftmp0],   %[dest],      %[rp]           \n\t"
+        "swc1       %[ftmp0],   0x00(%[row])                  \n\t"
+
+        "dsrl       %[ftmp1],   %[rp],        %[twenty_four]  \n\t"
+        "paddb      %[dest],    %[ftmp1],     %[ftmp0]        \n\t"
+        "gsswrc1    %[dest],    0x03(%[row])                  \n\t"
+        "gsswlc1    %[dest],    0x06(%[row])                  \n\t"
+
+        "dsrl       %[ftmp0],   %[rp],        %[forty_eight]  \n\t"
+        "dsll       %[ftmp1],   %[pp],        %[sixteen]      \n\t"
+        "or         %[ftmp0],   %[ftmp0],     %[ftmp1]        \n\t"
+        "paddb      %[dest],    %[dest],      %[ftmp0]        \n\t"
+        "gsswrc1    %[dest],    0x06(%[row])                  \n\t"
+        "gsswlc1    %[dest],    0x09(%[row])                  \n\t"
+
+        "dsrl       %[ftmp0],   %[pp],        %[eight]        \n\t"
+        "paddb      %[dest],    %[dest],      %[ftmp0]        \n\t"
+        "gsswrc1    %[dest],    0x09(%[row])                  \n\t"
+        "daddiu     %[row],     %[row],       0x0c            \n\t"
+        "daddiu     %[istop],   %[istop],    -0x0c            \n\t"
+        "bgtz       %[istop],   1b                            \n\t"
+        : [rp]"=&f"(rp), [pp]"=&f"(pp), [dest]"=&f"(dest),
+          [tmp0]"=&r"(tmp0), [ftmp0]"=&f"(ftmp[0]),
+          [ftmp1]"=&f"(ftmp[1]), [eight]"=&f"(eight),
+          [sixteen]"=&f"(sixteen), [twenty_four]"=&f"(twenty_four),
+          [forty_eight]"=&f"(forty_eight)
+        : [row]"r"(row), [istop]"r"(istop)
+        : "memory"
+   );
+
+   PNG_UNUSED(prev)
+}
+
+void png_read_filter_row_sub4_mmi(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev)
+{
+   /* The Sub filter predicts each pixel as the previous pixel, a.
+    * There is no pixel to the left of the first pixel.  It's encoded directly.
+    * That works with our main loop if we just say that left pixel was zero.
+    */
+   int istop = row_info->rowbytes;
+   double rp,pp;
+
+   __asm__ volatile (
+        "1:                                          \n\t"
+        "lwc1   %[pp],       0x00(%[row])            \n\t"
+        "lwc1   %[rp],       0x04(%[row])            \n\t"
+        "paddb  %[rp],       %[rp],       %[pp]      \n\t"
+        "swc1   %[rp],       0x04(%[row])            \n\t"
+
+        "daddiu %[row],      %[row],      0x04       \n\t"
+        "daddiu %[istop],    %[istop],   -0x04       \n\t"
+        "bgtz   %[istop],    1b                      \n\t"
+        : [rp]"=&f"(rp), [pp]"=&f"(pp)
+        : [row]"r"(row), [istop]"r"(istop)
+        : "memory"
+   );
+
+   PNG_UNUSED(prev)
+}
+
+void png_read_filter_row_avg3_mmi(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev)
+{
+   int istop = row_info->rowbytes;
+   double rp, pp, rp1, pp1;
+   double tmp0;
+   double ftmp[3];
+   double one, dest;
+   double eight, sixteen, twenty_four, forty_eight;
+
+   __asm__ volatile (
+        "li         %[tmp0],    0x08                          \n\t"
+        "dmtc1      %[tmp0],    %[eight]                      \n\t"
+        "li         %[tmp0],    0x10                          \n\t"
+        "dmtc1      %[tmp0],    %[sixteen]                    \n\t"
+        "li         %[tmp0],    0x18                          \n\t"
+        "dmtc1      %[tmp0],    %[twenty_four]                \n\t"
+        "li         %[tmp0],    0x30                          \n\t"
+        "dmtc1      %[tmp0],    %[forty_eight]                \n\t"
+        "xor        %[dest],    %[dest],       %[dest]        \n\t"
+
+        "li         %[tmp0],   0x01                           \n\t"
+        "ins        %[tmp0],   %[tmp0],        8,   8         \n\t"
+        "dmtc1      %[tmp0],   %[one]                         \n\t"
+        "pshufh     %[one],    %[one],         %[dest]        \n\t"
+
+        "1:                                                   \n\t"
+        "gsldrc1    %[rp],      0x00(%[row])                  \n\t"
+        "gsldlc1    %[rp],      0x07(%[row])                  \n\t"
+        "gsldrc1    %[pp],      0x00(%[prev])                 \n\t"
+        "gsldlc1    %[pp],      0x07(%[prev])                 \n\t"
+        "gsldrc1    %[rp1],     0x08(%[row])                  \n\t"
+        "gsldlc1    %[rp1],     0x0f(%[row])                  \n\t"
+        "gsldrc1    %[pp1],     0x08(%[prev])                 \n\t"
+        "gsldlc1    %[pp1],     0x0f(%[prev])                 \n\t"
+
+        "xor        %[ftmp0],   %[pp],         %[dest]        \n\t"
+        "pavgb      %[ftmp1],   %[pp],         %[dest]        \n\t"
+        "and        %[ftmp0],   %[ftmp0],      %[one]         \n\t"
+        "psubb      %[ftmp1],   %[ftmp1],      %[ftmp0]       \n\t"
+        "paddb      %[dest],    %[rp],         %[ftmp1]       \n\t"
+        "swc1       %[dest],    0x00(%[row])                  \n\t"
+
+        "dsrl       %[ftmp0],   %[rp],         %[twenty_four] \n\t"
+        "dsrl       %[ftmp1],   %[pp],         %[twenty_four] \n\t"
+
+        "xor        %[ftmp2],   %[ftmp1],      %[dest]        \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],      %[dest]        \n\t"
+        "and        %[ftmp2],   %[ftmp2],      %[one]         \n\t"
+        "psubb      %[ftmp1],   %[ftmp1],      %[ftmp2]       \n\t"
+        "paddb      %[dest],    %[ftmp0],      %[ftmp1]       \n\t"
+        "gsswrc1    %[dest],    0x03(%[row])                  \n\t"
+        "gsswlc1    %[dest],    0x06(%[row])                  \n\t"
+
+        "dsrl       %[ftmp0],   %[rp],         %[forty_eight] \n\t"
+        "dsll       %[ftmp1],   %[rp1],        %[sixteen]     \n\t"
+        "or         %[ftmp0],   %[ftmp0],      %[ftmp1]       \n\t"
+        "dsrl       %[ftmp2],   %[pp],         %[forty_eight] \n\t"
+        "dsll       %[ftmp1],   %[pp1],        %[sixteen]     \n\t"
+        "or         %[ftmp1],   %[ftmp2],      %[ftmp1]       \n\t"
+
+        "xor        %[ftmp2],   %[ftmp1],      %[dest]        \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],      %[dest]        \n\t"
+        "and        %[ftmp2],   %[ftmp2],      %[one]         \n\t"
+        "psubb      %[ftmp1],   %[ftmp1],      %[ftmp2]       \n\t"
+        "paddb      %[dest],    %[ftmp0],      %[ftmp1]       \n\t"
+        "gsswrc1    %[dest],    0x06(%[row])                  \n\t"
+        "gsswlc1    %[dest],    0x09(%[row])                  \n\t"
+
+        "dsrl       %[ftmp0],   %[rp1],        %[eight]       \n\t"
+        "dsrl       %[ftmp1],   %[pp1],        %[eight]       \n\t"
+
+        "xor        %[ftmp2],   %[ftmp1],      %[dest]        \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],      %[dest]        \n\t"
+        "and        %[ftmp2],   %[ftmp2],      %[one]         \n\t"
+        "psubb      %[ftmp1],   %[ftmp1],      %[ftmp2]       \n\t"
+        "paddb      %[dest],    %[ftmp0],      %[ftmp1]       \n\t"
+        "gsswrc1    %[dest],    0x09(%[row])                  \n\t"
+        "daddiu     %[row],     %[row],        0x0c           \n\t"
+        "daddiu     %[prev],    %[prev],       0x0c           \n\t"
+        "daddiu     %[istop],   %[istop],     -0x0c           \n\t"
+        "bgtz       %[istop],   1b                            \n\t"
+        : [rp]"=&f"(rp), [pp]"=&f"(pp), [rp1]"=&f"(rp1),
+          [pp1]"=&f"(pp1), [tmp0]"=&r"(tmp0), [ftmp0]"=&f"(ftmp[0]),
+          [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), [one]"=&f"(one),
+          [dest]"=&f"(dest), [eight]"=&f"(eight), [sixteen]"=&f"(sixteen),
+          [twenty_four]"=&f"(twenty_four), [forty_eight]"=&f"(forty_eight)
+        : [row]"r"(row), [prev]"r"(prev), [istop]"r"(istop)
+        : "memory"
+   );
+}
+
+void png_read_filter_row_avg4_mmi(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev)
+{
+   int istop = row_info->rowbytes;
+   double rp,pp;
+   double dest;
+   double ftmp[2];
+   double tmp;
+
+   __asm__ volatile (
+        "xor        %[dest],   %[dest],       %[dest]  \n\t"
+        "li         %[tmp],    0x01                    \n\t"
+        "ins        %[tmp],    %[tmp],        8,  8    \n\t"
+        "dmtc1      %[tmp],    %[ftmp1]                \n\t"
+        "pshufh     %[ftmp1],  %[ftmp1],      %[dest]  \n\t"
+
+        "1:                                            \n\t"
+        "lwc1       %[rp],     0x00(%[row])            \n\t"
+        "lwc1       %[pp],     0x00(%[prev])           \n\t"
+        "xor        %[ftmp0],  %[pp],         %[dest]  \n\t"
+        "pavgb      %[pp],     %[pp],         %[dest]  \n\t"
+        "and        %[ftmp0],  %[ftmp0],      %[ftmp1] \n\t"
+        "psubb      %[pp],     %[pp],         %[ftmp0] \n\t"
+        "paddb      %[dest],   %[rp],         %[pp]    \n\t"
+        "swc1       %[dest],   0x00(%[row])            \n\t"
+        "daddiu     %[row],    %[row],        0x04     \n\t"
+        "daddiu     %[prev],   %[prev],       0x04     \n\t"
+        "daddiu     %[istop],  %[istop],     -0x04     \n\t"
+        "bgtz       %[istop],  1b                      \n\t"
+        : [rp]"=&f"(rp), [pp]"=&f"(pp), [ftmp0]"=&f"(ftmp[0]),
+          [ftmp1]"=&f"(ftmp[1]), [dest]"=&f"(dest), [tmp]"=&r"(tmp)
+        : [row]"r"(row), [prev]"r"(prev), [istop]"r"(istop)
+        : "memory"
+   );
+}
+
+void png_read_filter_row_paeth3_mmi(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev)
+{
+   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
+    * and two pixels from the previous row, b and c:
+    *   prev: c b
+    *   row:  a d
+    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
+    * p=a+b-c.
+    *
+    * The first pixel has no left context, and so uses an Up filter, p = b.
+    * This works naturally with our main loop's p = a+b-c if we force a and c
+    * to zero.
+    * Here we zero b and d, which become c and a respectively at the start of
+    * the loop.
+    */
+   int istop = row_info->rowbytes;
+   double rp, pp, rp1, pp1, zero;
+   double a, b, c, d, pa, pb, pc;
+   double tmp0;
+   double ftmp[3];
+   double eight, sixteen, twenty_four, forty_eight;
+
+   __asm__ volatile (
+        "xor        %[a],      %[a],           %[a]           \n\t"
+        "xor        %[c],      %[c],           %[c]           \n\t"
+        "xor        %[zero],   %[zero],        %[zero]        \n\t"
+        "li         %[tmp0],    0x08                          \n\t"
+        "dmtc1      %[tmp0],    %[eight]                      \n\t"
+        "li         %[tmp0],    0x10                          \n\t"
+        "dmtc1      %[tmp0],    %[sixteen]                    \n\t"
+        "li         %[tmp0],    0x18                          \n\t"
+        "dmtc1      %[tmp0],    %[twenty_four]                \n\t"
+        "li         %[tmp0],    0x30                          \n\t"
+        "dmtc1      %[tmp0],    %[forty_eight]                \n\t"
+
+        "1:                                                   \n\t"
+        "gsldrc1    %[rp],      0x00(%[row])                  \n\t"
+        "gsldlc1    %[rp],      0x07(%[row])                  \n\t"
+        "gsldrc1    %[pp],      0x00(%[prev])                 \n\t"
+        "gsldlc1    %[pp],      0x07(%[prev])                 \n\t"
+        "gsldrc1    %[rp1],     0x08(%[row])                  \n\t"
+        "gsldlc1    %[rp1],     0x0f(%[row])                  \n\t"
+        "gsldrc1    %[pp1],     0x08(%[prev])                 \n\t"
+        "gsldlc1    %[pp1],     0x0f(%[prev])                 \n\t"
+
+        "punpcklbh  %[b],      %[pp],          %[zero]        \n\t"
+        "punpcklbh  %[d],      %[rp],          %[zero]        \n\t"
+        "packushb   %[ftmp0],  %[c],           %[c]           \n\t"
+        "packushb   %[ftmp1],  %[a],           %[a]           \n\t"
+        "pasubub    %[pa],     %[pp],          %[ftmp0]       \n\t"
+        "pasubub    %[pb],     %[ftmp1],       %[ftmp0]       \n\t"
+        "psubh      %[ftmp0],  %[b],           %[c]           \n\t"
+        "psubh      %[ftmp1],  %[a],           %[c]           \n\t"
+        "paddh      %[pc],     %[ftmp0],       %[ftmp1]       \n\t"
+        "pcmpgth    %[ftmp0],  %[zero],        %[pc]          \n\t"
+        "xor        %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "psubh      %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "punpcklbh  %[pa],     %[pa],          %[zero]        \n\t"
+        "punpcklbh  %[pb],     %[pb],          %[zero]        \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pb]          \n\t"
+        "and        %[ftmp1],  %[b],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "pminsh     %[pa],     %[pa],          %[pb]          \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pc]          \n\t"
+        "and        %[ftmp1],  %[c],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "paddb      %[a],      %[a],           %[d]           \n\t"
+        "packushb   %[d],      %[a],           %[a]           \n\t"
+        "punpcklbh  %[c],      %[pp],          %[zero]        \n\t"
+        "swc1       %[d],      0x00(%[row])                   \n\t"
+
+        "dsrl       %[ftmp0],  %[rp],          %[twenty_four] \n\t"
+        "dsrl       %[ftmp2],  %[pp],          %[twenty_four] \n\t"
+
+        "punpcklbh  %[b],      %[ftmp2],       %[zero]        \n\t"
+        "punpcklbh  %[d],      %[ftmp0],       %[zero]        \n\t"
+        "packushb   %[ftmp0],  %[c],           %[c]           \n\t"
+        "packushb   %[ftmp1],  %[a],           %[a]           \n\t"
+        "pasubub    %[pa],     %[ftmp2],       %[ftmp0]       \n\t"
+        "pasubub    %[pb],     %[ftmp1],       %[ftmp0]       \n\t"
+        "psubh      %[ftmp0],  %[b],           %[c]           \n\t"
+        "psubh      %[ftmp1],  %[a],           %[c]           \n\t"
+        "paddh      %[pc],     %[ftmp0],       %[ftmp1]       \n\t"
+        "pcmpgth    %[ftmp0],  %[zero],        %[pc]          \n\t"
+        "xor        %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "psubh      %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "punpcklbh  %[pa],     %[pa],          %[zero]        \n\t"
+        "punpcklbh  %[pb],     %[pb],          %[zero]        \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pb]          \n\t"
+        "and        %[ftmp1],  %[b],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "pminsh     %[pa],     %[pa],          %[pb]          \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pc]          \n\t"
+        "and        %[ftmp1],  %[c],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "paddb      %[a],      %[a],           %[d]           \n\t"
+        "packushb   %[d],      %[a],           %[a]           \n\t"
+        "punpcklbh  %[c],      %[ftmp2],       %[zero]        \n\t"
+        "gsswrc1    %[d],      0x03(%[row])                   \n\t"
+        "gsswlc1    %[d],      0x06(%[row])                   \n\t"
+
+        "dsrl       %[ftmp0],  %[rp],          %[forty_eight] \n\t"
+        "dsll       %[ftmp1],  %[rp1],         %[sixteen]     \n\t"
+        "or         %[ftmp0],  %[ftmp0],       %[ftmp1]       \n\t"
+        "dsrl       %[ftmp2],  %[pp],          %[forty_eight] \n\t"
+        "dsll       %[ftmp1],  %[pp1],         %[sixteen]     \n\t"
+        "or         %[ftmp2],  %[ftmp2],       %[ftmp1]       \n\t"
+
+        "punpcklbh  %[b],      %[ftmp2],       %[zero]        \n\t"
+        "punpcklbh  %[d],      %[ftmp0],       %[zero]        \n\t"
+        "packushb   %[ftmp0],  %[c],           %[c]           \n\t"
+        "packushb   %[ftmp1],  %[a],           %[a]           \n\t"
+        "pasubub    %[pa],     %[ftmp2],       %[ftmp0]       \n\t"
+        "pasubub    %[pb],     %[ftmp1],       %[ftmp0]       \n\t"
+        "psubh      %[ftmp0],  %[b],           %[c]           \n\t"
+        "psubh      %[ftmp1],  %[a],           %[c]           \n\t"
+        "paddh      %[pc],     %[ftmp0],       %[ftmp1]       \n\t"
+        "pcmpgth    %[ftmp0],  %[zero],        %[pc]          \n\t"
+        "xor        %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "psubh      %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "punpcklbh  %[pa],     %[pa],          %[zero]        \n\t"
+        "punpcklbh  %[pb],     %[pb],          %[zero]        \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pb]          \n\t"
+        "and        %[ftmp1],  %[b],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "pminsh     %[pa],     %[pa],          %[pb]          \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pc]          \n\t"
+        "and        %[ftmp1],  %[c],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "paddb      %[a],      %[a],           %[d]           \n\t"
+        "packushb   %[d],      %[a],           %[a]           \n\t"
+        "punpcklbh  %[c],      %[ftmp2],       %[zero]        \n\t"
+        "gsswrc1    %[d],      0x06(%[row])                   \n\t"
+        "gsswlc1    %[d],      0x09(%[row])                   \n\t"
+
+        "dsrl       %[ftmp0],   %[rp1],        %[eight]       \n\t"
+        "dsrl       %[ftmp2],   %[pp1],        %[eight]       \n\t"
+
+        "punpcklbh  %[b],      %[ftmp2],       %[zero]        \n\t"
+        "punpcklbh  %[d],      %[ftmp0],       %[zero]        \n\t"
+        "packushb   %[ftmp0],  %[c],           %[c]           \n\t"
+        "packushb   %[ftmp1],  %[a],           %[a]           \n\t"
+        "pasubub    %[pa],     %[ftmp2],       %[ftmp0]       \n\t"
+        "pasubub    %[pb],     %[ftmp1],       %[ftmp0]       \n\t"
+        "psubh      %[ftmp0],  %[b],           %[c]           \n\t"
+        "psubh      %[ftmp1],  %[a],           %[c]           \n\t"
+        "paddh      %[pc],     %[ftmp0],       %[ftmp1]       \n\t"
+        "pcmpgth    %[ftmp0],  %[zero],        %[pc]          \n\t"
+        "xor        %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "psubh      %[pc],     %[pc],          %[ftmp0]       \n\t"
+        "punpcklbh  %[pa],     %[pa],          %[zero]        \n\t"
+        "punpcklbh  %[pb],     %[pb],          %[zero]        \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pb]          \n\t"
+        "and        %[ftmp1],  %[b],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "pminsh     %[pa],     %[pa],          %[pb]          \n\t"
+        "pcmpgth    %[ftmp0],  %[pa],          %[pc]          \n\t"
+        "and        %[ftmp1],  %[c],           %[ftmp0]       \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]           \n\t"
+        "or         %[a],      %[a],           %[ftmp1]       \n\t"
+        "paddb      %[a],      %[a],           %[d]           \n\t"
+        "packushb   %[d],      %[a],           %[a]           \n\t"
+        "punpcklbh  %[c],      %[ftmp2],       %[zero]        \n\t"
+        "gsswrc1    %[d],      0x09(%[row])                   \n\t"
+
+        "daddiu     %[row],    %[row],         0x0c           \n\t"
+        "daddiu     %[prev],   %[prev],        0x0c           \n\t"
+        "daddiu     %[istop],  %[istop],      -0x0c           \n\t"
+        "bgtz       %[istop],  1b                             \n\t"
+        : [rp]"=&f"(rp), [pp]"=&f"(pp), [rp1]"=&f"(rp1), [pp1]"=&f"(pp1),
+          [zero]"=&f"(zero), [a]"=&f"(a),[b]"=&f"(b), [c]"=&f"(c),
+          [d]"=&f"(d), [pa]"=&f"(pa), [pb]"=&f"(pb), [pc]"=&f"(pc),
+          [tmp0]"=&r"(tmp0), [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]), [eight]"=&f"(eight), [sixteen]"=&f"(sixteen),
+          [twenty_four]"=&f"(twenty_four), [forty_eight]"=&f"(forty_eight)
+        : [row]"r"(row), [prev]"r"(prev), [istop]"r"(istop)
+        : "memory"
+   );
+}
+
+void png_read_filter_row_paeth4_mmi(png_row_infop row_info, png_bytep row,
+   png_const_bytep prev)
+{
+   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
+    * and two pixels from the previous row, b and c:
+    *   prev: c b
+    *   row:  a d
+    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
+    * p=a+b-c.
+    *
+    * The first pixel has no left context, and so uses an Up filter, p = b.
+    * This works naturally with our main loop's p = a+b-c if we force a and c
+    * to zero.
+    * Here we zero b and d, which become c and a respectively at the start of
+    * the loop.
+    */
+   int istop = row_info->rowbytes;
+   double rp, pp, zero;
+   double a, b, c, d, pa, pb, pc;
+   double ftmp[2];
+
+   __asm__ volatile (
+        "xor        %[a],      %[a],           %[a]     \n\t"
+        "xor        %[c],      %[c],           %[c]     \n\t"
+        "xor        %[zero],   %[zero],        %[zero]  \n\t"
+
+        "1:                                             \n\t"
+        "lwc1       %[rp],     0x00(%[row])             \n\t"
+        "lwc1       %[pp],     0x00(%[prev])            \n\t"
+        "punpcklbh  %[b],      %[pp],          %[zero]  \n\t"
+        "punpcklbh  %[d],      %[rp],          %[zero]  \n\t"
+
+        "packushb   %[ftmp0],  %[c],           %[c]     \n\t"
+        "packushb   %[ftmp1],  %[a],           %[a]     \n\t"
+        "pasubub    %[pa],     %[pp],          %[ftmp0] \n\t"
+        "pasubub    %[pb],     %[ftmp1],       %[ftmp0] \n\t"
+        "psubh      %[ftmp0],  %[b],           %[c]     \n\t"
+        "psubh      %[ftmp1],  %[a],           %[c]     \n\t"
+        "paddh      %[pc],     %[ftmp0],       %[ftmp1] \n\t"
+        "pcmpgth    %[ftmp0],  %[zero],        %[pc]    \n\t"
+        "xor        %[pc],     %[pc],          %[ftmp0] \n\t"
+        "psubh      %[pc],     %[pc],          %[ftmp0] \n\t"
+
+        "punpcklbh  %[pa],     %[pa],           %[zero] \n\t"
+        "punpcklbh  %[pb],     %[pb],           %[zero] \n\t"
+
+        "pcmpgth    %[ftmp0],  %[pa],          %[pb]    \n\t"
+        "and        %[ftmp1],  %[b],           %[ftmp0] \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]     \n\t"
+        "or         %[a],      %[a],           %[ftmp1] \n\t"
+        "pminsh     %[pa],     %[pa],          %[pb]    \n\t"
+
+        "pcmpgth    %[ftmp0],  %[pa],          %[pc]    \n\t"
+        "and        %[ftmp1],  %[c],           %[ftmp0] \n\t"
+        "pandn      %[a],      %[ftmp0],       %[a]     \n\t"
+        "or         %[a],      %[a],           %[ftmp1] \n\t"
+        "paddb      %[a],      %[a],           %[d]     \n\t"
+        "packushb   %[d],      %[a],           %[a]     \n\t"
+        "swc1       %[d],      0x00(%[row])             \n\t"
+        "punpcklbh  %[c],      %[pp],          %[zero]  \n\t"
+        "daddiu     %[row],    %[row],         0x04     \n\t"
+        "daddiu     %[prev],   %[prev],        0x04     \n\t"
+        "daddiu     %[istop],  %[istop],      -0x04     \n\t"
+        "bgtz       %[istop],  1b                       \n\t"
+        : [rp]"=&f"(rp), [pp]"=&f"(pp), [zero]"=&f"(zero),
+          [a]"=&f"(a), [b]"=&f"(b), [c]"=&f"(c), [d]"=&f"(d),
+          [pa]"=&f"(pa), [pb]"=&f"(pb), [pc]"=&f"(pc),
+          [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1])
+        : [row]"r"(row), [prev]"r"(prev), [istop]"r"(istop)
+        : "memory"
+   );
+}
+
+#endif /* PNG_MIPS_MMI_IMPLEMENTATION > 0 */
+#endif /* READ */
diff --git a/3rdparty/libpng/mips/filter_msa_intrinsics.c b/3rdparty/libpng/mips/filter_msa_intrinsics.c
index a579179421cc..1b734f4d9a7e 100644
--- a/3rdparty/libpng/mips/filter_msa_intrinsics.c
+++ b/3rdparty/libpng/mips/filter_msa_intrinsics.c
@@ -1,9 +1,9 @@
 
 /* filter_msa_intrinsics.c - MSA optimised filter functions
  *
- * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 2016 Glenn Randers-Pehrson
- * Written by Mandar Sahastrabuddhe, August 2016.
+ * Written by Mandar Sahastrabuddhe, August 2016
  *
  * This code is released under the libpng license.
  * For conditions of distribution and use, see the disclaimer
@@ -11,7 +11,6 @@
  */
 
 #include <stdio.h>
-#include <stdint.h>
 #include "../pngpriv.h"
 
 #ifdef PNG_READ_SUPPORTED
@@ -20,6 +19,7 @@
 #if PNG_MIPS_MSA_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
 
 #include <msa.h>
+#include <stdint.h>
 
 /* libpng row pointers are not necessarily aligned to any particular boundary,
  * however this code will only work with appropriate alignment. mips/mips_init.c
@@ -379,8 +379,8 @@ void png_read_filter_row_up_msa(png_row_infop row_info, png_bytep row,
       LD_UB4(pp, 16, src4, src5, src6, src7);
       pp += 64;
 
-	  ADD4(src0, src4, src1, src5, src2, src6, src3, src7,
-	       src0, src1, src2, src3);
+      ADD4(src0, src4, src1, src5, src2, src6, src3, src7,
+           src0, src1, src2, src3);
 
       ST_UB4(src0, src1, src2, src3, rp, 16);
       rp += 64;
@@ -400,7 +400,7 @@ void png_read_filter_row_up_msa(png_row_infop row_info, png_bytep row,
             LD_UB4(pp, 16, src4, src5, src6, src7);
 
             ADD4(src0, src4, src1, src5, src2, src6, src3, src7,
-	             src0, src1, src2, src3);
+                 src0, src1, src2, src3);
 
             ST_UB4(src0, src1, src2, src3, rp, 16);
             rp += 64;
@@ -425,7 +425,7 @@ void png_read_filter_row_up_msa(png_row_infop row_info, png_bytep row,
             LD_UB2(rp, 16, src0, src1);
             LD_UB2(pp, 16, src4, src5);
 
-			ADD2(src0, src4, src1, src5, src0, src1);
+            ADD2(src0, src4, src1, src5, src0, src1);
 
             ST_UB2(src0, src1, rp, 16);
             rp += 32;
diff --git a/3rdparty/libpng/mips/mips_init.c b/3rdparty/libpng/mips/mips_init.c
index 6a061cccfa77..5c6fa1dbf117 100644
--- a/3rdparty/libpng/mips/mips_init.c
+++ b/3rdparty/libpng/mips/mips_init.c
@@ -1,9 +1,10 @@
 
 /* mips_init.c - MSA optimised filter functions
  *
- * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 2016 Glenn Randers-Pehrson
- * Written by Mandar Sahastrabuddhe, 2016.
+ * Written by Mandar Sahastrabuddhe, 2016
+ * Updated by guxiwei, 2023
  *
  * This code is released under the libpng license.
  * For conditions of distribution and use, see the disclaimer
@@ -20,8 +21,9 @@
 
 #ifdef PNG_READ_SUPPORTED
 
-#if PNG_MIPS_MSA_OPT > 0
-#ifdef PNG_MIPS_MSA_CHECK_SUPPORTED /* Do run-time checks */
+#if PNG_MIPS_MSA_IMPLEMENTATION == 1 || PNG_MIPS_MMI_IMPLEMENTATION > 0
+
+#ifdef PNG_MIPS_MSA_CHECK_SUPPORTED /* Do MIPS MSA run-time checks */
 /* WARNING: it is strongly recommended that you do not build libpng with
  * run-time checks for CPU features if at all possible.  In the case of the MIPS
  * MSA instructions there is no processor-specific way of detecting the
@@ -51,13 +53,83 @@ static int png_have_msa(png_structp png_ptr);
 #endif /* PNG_MIPS_MSA_FILE */
 #endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */
 
+#ifdef PNG_MIPS_MMI_CHECK_SUPPORTED /* Do MIPS MMI run-times checks */
+#ifndef PNG_MIPS_MMI_FILE
+#  ifdef __linux__
+#     define PNG_MIPS_MMI_FILE "contrib/mips-mmi/linux.c"
+#  endif
+#endif
+
+#ifdef PNG_MIPS_MMI_FILE
+
+#include <signal.h> /* for sig_atomic_t */
+static int png_have_mmi();
+#include PNG_MIPS_MMI_FILE
+
+#else  /* PNG_MIPS_MMI_FILE */
+#  error "PNG_MIPS_MMI_FILE undefined: no support for run-time MIPS MMI checks"
+#endif /* PNG_MIPS_MMI_FILE */
+#endif /* PNG_MIPS_MMI_CHECK_SUPPORTED*/
+
 #ifndef PNG_ALIGNED_MEMORY_SUPPORTED
 #  error "ALIGNED_MEMORY is required; set: -DPNG_ALIGNED_MEMORY_SUPPORTED"
 #endif
 
+/* MIPS supports two optimizations: MMI and MSA. The appropriate
+ * optimization is chosen at runtime
+ */
 void
-png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
+png_init_filter_functions_mips(png_structp pp, unsigned int bpp)
 {
+#if PNG_MIPS_MMI_IMPLEMENTATION  > 0
+#ifdef PNG_MIPS_MMI_API_SUPPORTED
+   switch ((pp->options >> PNG_MIPS_MMI) & 3)
+   {
+      case PNG_OPTION_UNSET:
+#endif /* PNG_MIPS_MMI_API_SUPPORTED */
+#ifdef PNG_MIPS_MMI_CHECK_SUPPORTED
+         {
+            static volatile sig_atomic_t no_mmi = -1; /* not checked */
+
+            if (no_mmi < 0)
+               no_mmi = !png_have_mmi();
+
+            if (no_mmi)
+              goto MIPS_MSA_INIT;
+         }
+#ifdef PNG_MIPS_MMI_API_SUPPORTED
+         break;
+#endif
+#endif /* PNG_MIPS_MMI_CHECK_SUPPORTED */
+
+#ifdef PNG_MIPS_MMI_API_SUPPORTED
+      default: /* OFF or INVALID */
+         goto MIPS_MSA_INIT;
+
+      case PNG_OPTION_ON:
+         /* Option turned on */
+         break;
+   }
+#endif
+   pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_mmi;
+   if (bpp == 3)
+   {
+      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_mmi;
+      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_mmi;
+      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
+         png_read_filter_row_paeth3_mmi;
+   }
+   else if (bpp == 4)
+   {
+      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_mmi;
+      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_mmi;
+      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
+          png_read_filter_row_paeth4_mmi;
+   }
+#endif /* PNG_MIPS_MMI_IMPLEMENTATION > 0 */
+
+MIPS_MSA_INIT:
+#if PNG_MIPS_MSA_IMPLEMENTATION == 1
    /* The switch statement is compiled in for MIPS_MSA_API, the call to
     * png_have_msa is compiled in for MIPS_MSA_CHECK. If both are defined
     * the check is only performed if the API has not set the MSA option on
@@ -73,6 +145,7 @@ png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
           * this case will fall through to the 'default' below, which just
           * returns.
           */
+#endif /* PNG_MIPS_MSA_API_SUPPORTED */
 #ifdef PNG_MIPS_MSA_CHECK_SUPPORTED
          {
             static volatile sig_atomic_t no_msa = -1; /* not checked */
@@ -83,9 +156,12 @@ png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
             if (no_msa)
                return;
          }
-#endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */
+#ifdef PNG_MIPS_MSA_API_SUPPORTED
          break;
+#endif
+#endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */
 
+#ifdef PNG_MIPS_MSA_API_SUPPORTED
       default: /* OFF or INVALID */
          return;
 
@@ -93,6 +169,8 @@ png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
          /* Option turned on */
          break;
    }
+#endif
+
    /* IMPORTANT: any new external functions used here must be declared using
     * PNG_INTERNAL_FUNCTION in ../pngpriv.h.  This is required so that the
     * 'prefix' option to configure works:
@@ -112,16 +190,15 @@ png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_msa;
       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_msa;
    }
+
    else if (bpp == 4)
    {
       pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_msa;
       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_msa;
       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_msa;
    }
-#else
-   (void)pp;
-   (void)bpp;
-#endif /* PNG_MIPS_MSA_API_SUPPORTED */
+#endif /* PNG_MIPS_MSA_IMPLEMENTATION == 1 */
+   return;
 }
-#endif /* PNG_MIPS_MSA_OPT > 0 */
+#endif /* PNG_MIPS_MSA_IMPLEMENTATION == 1 || PNG_MIPS_MMI_IMPLEMENTATION > 0 */
 #endif /* READ */
diff --git a/3rdparty/libpng/patches/20190528-fix-leak-png_handle_exif.diff b/3rdparty/libpng/patches/20190528-fix-leak-png_handle_exif.diff
deleted file mode 100644
index f2dbc4dd5e8f..000000000000
--- a/3rdparty/libpng/patches/20190528-fix-leak-png_handle_exif.diff
+++ /dev/null
@@ -1,17 +0,0 @@
-diff --git a/3rdparty/libpng/pngrutil.c b/3rdparty/libpng/pngrutil.c
-index d5fa08c397..4db3de990b 100644
---- a/3rdparty/libpng/pngrutil.c
-+++ b/3rdparty/libpng/pngrutil.c
-@@ -2087,10 +2087,8 @@ png_handle_eXIf(png_structrp png_ptr, png_inforp info_ptr, png_uint_32 length)
-       }
-    }
- 
--   if (png_crc_finish(png_ptr, 0) != 0)
--      return;
--
--   png_set_eXIf_1(png_ptr, info_ptr, length, info_ptr->eXIf_buf);
-+   if (png_crc_finish(png_ptr, 0) == 0)
-+      png_set_eXIf_1(png_ptr, info_ptr, length, info_ptr->eXIf_buf);
- 
-    png_free(png_ptr, info_ptr->eXIf_buf);
-    info_ptr->eXIf_buf = NULL;
diff --git a/3rdparty/libpng/patches/20190910-msa-patch.diff b/3rdparty/libpng/patches/20190910-msa-patch.diff
deleted file mode 100644
index 42f49f991adb..000000000000
--- a/3rdparty/libpng/patches/20190910-msa-patch.diff
+++ /dev/null
@@ -1,53 +0,0 @@
-diff --git a/3rdparty/libpng/mips/mips_init.c b/3rdparty/libpng/mips/mips_init.c
-index 8dd283deef..6a061cccfa 100644
---- a/3rdparty/libpng/mips/mips_init.c
-+++ b/3rdparty/libpng/mips/mips_init.c
-@@ -73,7 +73,6 @@ png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
-           * this case will fall through to the 'default' below, which just
-           * returns.
-           */
--#endif /* PNG_MIPS_MSA_API_SUPPORTED */
- #ifdef PNG_MIPS_MSA_CHECK_SUPPORTED
-          {
-             static volatile sig_atomic_t no_msa = -1; /* not checked */
-@@ -84,12 +83,9 @@ png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
-             if (no_msa)
-                return;
-          }
--#ifdef PNG_MIPS_MSA_API_SUPPORTED
--         break;
--#endif
- #endif /* PNG_MIPS_MSA_CHECK_SUPPORTED */
-+         break;
- 
--#ifdef PNG_MIPS_MSA_API_SUPPORTED
-       default: /* OFF or INVALID */
-          return;
- 
-@@ -97,8 +93,6 @@ png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
-          /* Option turned on */
-          break;
-    }
--#endif
--
-    /* IMPORTANT: any new external functions used here must be declared using
-     * PNG_INTERNAL_FUNCTION in ../pngpriv.h.  This is required so that the
-     * 'prefix' option to configure works:
-@@ -118,13 +112,16 @@ png_init_filter_functions_msa(png_structp pp, unsigned int bpp)
-       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_msa;
-       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_msa;
-    }
--
-    else if (bpp == 4)
-    {
-       pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_msa;
-       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_msa;
-       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_msa;
-    }
-+#else
-+   (void)pp;
-+   (void)bpp;
-+#endif /* PNG_MIPS_MSA_API_SUPPORTED */
- }
- #endif /* PNG_MIPS_MSA_OPT > 0 */
- #endif /* READ */
diff --git a/3rdparty/libpng/png.c b/3rdparty/libpng/png.c
index 757c755f97c9..9ed315700924 100644
--- a/3rdparty/libpng/png.c
+++ b/3rdparty/libpng/png.c
@@ -1,7 +1,7 @@
 
 /* png.c - location for general purpose libpng functions
  *
- * Copyright (c) 2018-2019 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -14,27 +14,7 @@
 #include "pngpriv.h"
 
 /* Generate a compiler error if there is an old png.h in the search path. */
-typedef png_libpng_version_1_6_37 Your_png_h_is_not_version_1_6_37;
-
-#ifdef __GNUC__
-/* The version tests may need to be added to, but the problem warning has
- * consistently been fixed in GCC versions which obtain wide-spread release.
- * The problem is that many versions of GCC rearrange comparison expressions in
- * the optimizer in such a way that the results of the comparison will change
- * if signed integer overflow occurs.  Such comparisons are not permitted in
- * ANSI C90, however GCC isn't clever enough to work out that that do not occur
- * below in png_ascii_from_fp and png_muldiv, so it produces a warning with
- * -Wextra.  Unfortunately this is highly dependent on the optimizer and the
- * machine architecture so the warning comes and goes unpredictably and is
- * impossible to "fix", even were that a good idea.
- */
-#if __GNUC__ == 7 && __GNUC_MINOR__ == 1
-#define GCC_STRICT_OVERFLOW 1
-#endif /* GNU 7.1.x */
-#endif /* GNU */
-#ifndef GCC_STRICT_OVERFLOW
-#define GCC_STRICT_OVERFLOW 0
-#endif
+typedef png_libpng_version_1_6_43 Your_png_h_is_not_version_1_6_43;
 
 /* Tells libpng that we have already handled the first "num_bytes" bytes
  * of the PNG file signature.  If the PNG data is embedded into another
@@ -73,21 +53,21 @@ png_set_sig_bytes(png_structrp png_ptr, int num_bytes)
 int PNGAPI
 png_sig_cmp(png_const_bytep sig, size_t start, size_t num_to_check)
 {
-   png_byte png_signature[8] = {137, 80, 78, 71, 13, 10, 26, 10};
+   static const png_byte png_signature[8] = {137, 80, 78, 71, 13, 10, 26, 10};
 
    if (num_to_check > 8)
       num_to_check = 8;
 
    else if (num_to_check < 1)
-      return (-1);
+      return -1;
 
    if (start > 7)
-      return (-1);
+      return -1;
 
    if (start + num_to_check > 8)
       num_to_check = 8 - start;
 
-   return ((int)(memcmp(&sig[start], &png_signature[start], num_to_check)));
+   return memcmp(&sig[start], &png_signature[start], num_to_check);
 }
 
 #endif /* READ */
@@ -447,7 +427,6 @@ png_info_init_3,(png_infopp ptr_ptr, size_t png_info_struct_size),
    memset(info_ptr, 0, (sizeof *info_ptr));
 }
 
-/* The following API is not called internally */
 void PNGAPI
 png_data_freer(png_const_structrp png_ptr, png_inforp info_ptr,
     int freer, png_uint_32 mask)
@@ -686,9 +665,9 @@ png_voidp PNGAPI
 png_get_io_ptr(png_const_structrp png_ptr)
 {
    if (png_ptr == NULL)
-      return (NULL);
+      return NULL;
 
-   return (png_ptr->io_ptr);
+   return png_ptr->io_ptr;
 }
 
 #if defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED)
@@ -720,7 +699,7 @@ png_init_io(png_structrp png_ptr, png_FILE_p fp)
  *
  * Where UNSIGNED_MAX is the appropriate maximum unsigned value, so when the
  * negative integral value is added the result will be an unsigned value
- * correspnding to the 2's complement representation.
+ * corresponding to the 2's complement representation.
  */
 void PNGAPI
 png_save_int_32(png_bytep buf, png_int_32 i)
@@ -752,7 +731,7 @@ png_convert_to_rfc1123_buffer(char out[29], png_const_timep ptime)
 
    {
       size_t pos = 0;
-      char number_buf[5]; /* enough for a four-digit year */
+      char number_buf[5] = {0, 0, 0, 0, 0}; /* enough for a four-digit year */
 
 #     define APPEND_STRING(string) pos = png_safecat(out, 29, pos, (string))
 #     define APPEND_NUMBER(format, value)\
@@ -815,8 +794,8 @@ png_get_copyright(png_const_structrp png_ptr)
    return PNG_STRING_COPYRIGHT
 #else
    return PNG_STRING_NEWLINE \
-      "libpng version 1.6.37" PNG_STRING_NEWLINE \
-      "Copyright (c) 2018-2019 Cosmin Truta" PNG_STRING_NEWLINE \
+      "libpng version 1.6.43" PNG_STRING_NEWLINE \
+      "Copyright (c) 2018-2024 Cosmin Truta" PNG_STRING_NEWLINE \
       "Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson" \
       PNG_STRING_NEWLINE \
       "Copyright (c) 1996-1997 Andreas Dilger" PNG_STRING_NEWLINE \
@@ -977,7 +956,7 @@ png_reset_zstream(png_structrp png_ptr)
       return Z_STREAM_ERROR;
 
    /* WARNING: this resets the window bits to the maximum! */
-   return (inflateReset(&png_ptr->zstream));
+   return inflateReset(&png_ptr->zstream);
 }
 #endif /* READ */
 
@@ -986,7 +965,7 @@ png_uint_32 PNGAPI
 png_access_version_number(void)
 {
    /* Version of *.c files used when building libpng */
-   return((png_uint_32)PNG_LIBPNG_VER);
+   return (png_uint_32)PNG_LIBPNG_VER;
 }
 
 #if defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED)
@@ -1842,14 +1821,14 @@ png_icc_profile_error(png_const_structrp png_ptr, png_colorspacerp colorspace,
    }
 #  ifdef PNG_WARNINGS_SUPPORTED
    else
-      {
-         char number[PNG_NUMBER_BUFFER_SIZE]; /* +24 = 114*/
+   {
+      char number[PNG_NUMBER_BUFFER_SIZE]; /* +24 = 114 */
 
-         pos = png_safecat(message, (sizeof message), pos,
-             png_format_number(number, number+(sizeof number),
-             PNG_NUMBER_FORMAT_x, value));
-         pos = png_safecat(message, (sizeof message), pos, "h: "); /*+2 = 116*/
-      }
+      pos = png_safecat(message, (sizeof message), pos,
+          png_format_number(number, number+(sizeof number),
+          PNG_NUMBER_FORMAT_x, value));
+      pos = png_safecat(message, (sizeof message), pos, "h: "); /* +2 = 116 */
+   }
 #  endif
    /* The 'reason' is an arbitrary message, allow +79 maximum 195 */
    pos = png_safecat(message, (sizeof message), pos, reason);
@@ -2532,17 +2511,6 @@ png_colorspace_set_rgb_coefficients(png_structrp png_ptr)
 
 #endif /* COLORSPACE */
 
-#ifdef __GNUC__
-/* This exists solely to work round a warning from GNU C. */
-static int /* PRIVATE */
-png_gt(size_t a, size_t b)
-{
-   return a > b;
-}
-#else
-#   define png_gt(a,b) ((a) > (b))
-#endif
-
 void /* PRIVATE */
 png_check_IHDR(png_const_structrp png_ptr,
     png_uint_32 width, png_uint_32 height, int bit_depth,
@@ -2564,8 +2532,16 @@ png_check_IHDR(png_const_structrp png_ptr,
       error = 1;
    }
 
-   if (png_gt(((width + 7) & (~7U)),
-       ((PNG_SIZE_MAX
+   /* The bit mask on the first line below must be at least as big as a
+    * png_uint_32.  "~7U" is not adequate on 16-bit systems because it will
+    * be an unsigned 16-bit value.  Casting to (png_alloc_size_t) makes the
+    * type of the result at least as bit (in bits) as the RHS of the > operator
+    * which also avoids a common warning on 64-bit systems that the comparison
+    * of (png_uint_32) against the constant value on the RHS will always be
+    * false.
+    */
+   if (((width + 7) & ~(png_alloc_size_t)7) >
+       (((PNG_SIZE_MAX
            - 48        /* big_row_buf hack */
            - 1)        /* filter byte */
            / 8)        /* 8-byte RGBA pixels */
@@ -2710,7 +2686,7 @@ png_check_IHDR(png_const_structrp png_ptr,
 
 int /* PRIVATE */
 png_check_fp_number(png_const_charp string, size_t size, int *statep,
-    png_size_tp whereami)
+    size_t *whereami)
 {
    int state = *statep;
    size_t i = *whereami;
@@ -2891,14 +2867,6 @@ png_pow10(int power)
 /* Function to format a floating point value in ASCII with a given
  * precision.
  */
-#if GCC_STRICT_OVERFLOW
-#pragma GCC diagnostic push
-/* The problem arises below with exp_b10, which can never overflow because it
- * comes, originally, from frexp and is therefore limited to a range which is
- * typically +/-710 (log2(DBL_MAX)/log2(DBL_MIN)).
- */
-#pragma GCC diagnostic warning "-Wstrict-overflow=2"
-#endif /* GCC_STRICT_OVERFLOW */
 void /* PRIVATE */
 png_ascii_from_fp(png_const_structrp png_ptr, png_charp ascii, size_t size,
     double fp, unsigned int precision)
@@ -3220,10 +3188,6 @@ png_ascii_from_fp(png_const_structrp png_ptr, png_charp ascii, size_t size,
    /* Here on buffer too small. */
    png_error(png_ptr, "ASCII conversion buffer too small");
 }
-#if GCC_STRICT_OVERFLOW
-#pragma GCC diagnostic pop
-#endif /* GCC_STRICT_OVERFLOW */
-
 #  endif /* FLOATING_POINT */
 
 #  ifdef PNG_FIXED_POINT_SUPPORTED
@@ -3251,7 +3215,7 @@ png_ascii_from_fixed(png_const_structrp png_ptr, png_charp ascii,
       if (num <= 0x80000000) /* else overflowed */
       {
          unsigned int ndigits = 0, first = 16 /* flag value */;
-         char digits[10];
+         char digits[10] = {0};
 
          while (num)
          {
@@ -3336,15 +3300,6 @@ png_fixed(png_const_structrp png_ptr, double fp, png_const_charp text)
  * the nearest .00001).  Overflow and divide by zero are signalled in
  * the result, a boolean - true on success, false on overflow.
  */
-#if GCC_STRICT_OVERFLOW /* from above */
-/* It is not obvious which comparison below gets optimized in such a way that
- * signed overflow would change the result; looking through the code does not
- * reveal any tests which have the form GCC complains about, so presumably the
- * optimizer is moving an add or subtract into the 'if' somewhere.
- */
-#pragma GCC diagnostic push
-#pragma GCC diagnostic warning "-Wstrict-overflow=2"
-#endif /* GCC_STRICT_OVERFLOW */
 int
 png_muldiv(png_fixed_point_p res, png_fixed_point a, png_int_32 times,
     png_int_32 divisor)
@@ -3459,9 +3414,6 @@ png_muldiv(png_fixed_point_p res, png_fixed_point a, png_int_32 times,
 
    return 0;
 }
-#if GCC_STRICT_OVERFLOW
-#pragma GCC diagnostic pop
-#endif /* GCC_STRICT_OVERFLOW */
 #endif /* READ_GAMMA || INCH_CONVERSIONS */
 
 #if defined(PNG_READ_GAMMA_SUPPORTED) || defined(PNG_INCH_CONVERSIONS_SUPPORTED)
diff --git a/3rdparty/libpng/png.h b/3rdparty/libpng/png.h
index 139eb0dc0f36..83d390312606 100644
--- a/3rdparty/libpng/png.h
+++ b/3rdparty/libpng/png.h
@@ -1,9 +1,9 @@
 
 /* png.h - header file for PNG reference library
  *
- * libpng version 1.6.37 - April 14, 2019
+ * libpng version 1.6.43
  *
- * Copyright (c) 2018-2019 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -15,7 +15,7 @@
  *   libpng versions 0.89, June 1996, through 0.96, May 1997: Andreas Dilger
  *   libpng versions 0.97, January 1998, through 1.6.35, July 2018:
  *     Glenn Randers-Pehrson
- *   libpng versions 1.6.36, December 2018, through 1.6.37, April 2019:
+ *   libpng versions 1.6.36, December 2018, through 1.6.43, February 2024:
  *     Cosmin Truta
  *   See also "Contributing Authors", below.
  */
@@ -27,8 +27,8 @@
  * PNG Reference Library License version 2
  * ---------------------------------------
  *
- *  * Copyright (c) 1995-2019 The PNG Reference Library Authors.
- *  * Copyright (c) 2018-2019 Cosmin Truta.
+ *  * Copyright (c) 1995-2024 The PNG Reference Library Authors.
+ *  * Copyright (c) 2018-2024 Cosmin Truta.
  *  * Copyright (c) 2000-2002, 2004, 2006-2018 Glenn Randers-Pehrson.
  *  * Copyright (c) 1996-1997 Andreas Dilger.
  *  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -239,7 +239,7 @@
  *    ...
  *    1.5.30                  15    10530  15.so.15.30[.0]
  *    ...
- *    1.6.37                  16    10637  16.so.16.37[.0]
+ *    1.6.43                  16    10643  16.so.16.43[.0]
  *
  *    Henceforth the source version will match the shared-library major and
  *    minor numbers; the shared-library major version number will be used for
@@ -255,9 +255,6 @@
  *    to the info_ptr or png_ptr members through png.h, and the compiled
  *    application is loaded with a different version of the library.
  *
- *    DLLNUM will change each time there are forward or backward changes
- *    in binary compatibility (e.g., when a new feature is added).
- *
  * See libpng.txt or libpng.3 for more information.  The PNG specification
  * is available as a W3C Recommendation and as an ISO/IEC Standard; see
  * <https://www.w3.org/TR/2003/REC-PNG-20031110/>
@@ -278,19 +275,21 @@
  */
 
 /* Version information for png.h - this should match the version in png.c */
-#define PNG_LIBPNG_VER_STRING "1.6.37"
-#define PNG_HEADER_VERSION_STRING " libpng version 1.6.37 - April 14, 2019\n"
+#define PNG_LIBPNG_VER_STRING "1.6.43"
+#define PNG_HEADER_VERSION_STRING " libpng version " PNG_LIBPNG_VER_STRING "\n"
 
-#define PNG_LIBPNG_VER_SONUM   16
-#define PNG_LIBPNG_VER_DLLNUM  16
+/* The versions of shared library builds should stay in sync, going forward */
+#define PNG_LIBPNG_VER_SHAREDLIB 16
+#define PNG_LIBPNG_VER_SONUM     PNG_LIBPNG_VER_SHAREDLIB /* [Deprecated] */
+#define PNG_LIBPNG_VER_DLLNUM    PNG_LIBPNG_VER_SHAREDLIB /* [Deprecated] */
 
 /* These should match the first 3 components of PNG_LIBPNG_VER_STRING: */
 #define PNG_LIBPNG_VER_MAJOR   1
 #define PNG_LIBPNG_VER_MINOR   6
-#define PNG_LIBPNG_VER_RELEASE 37
+#define PNG_LIBPNG_VER_RELEASE 43
 
 /* This should be zero for a public release, or non-zero for a
- * development version.  [Deprecated]
+ * development version.
  */
 #define PNG_LIBPNG_VER_BUILD  0
 
@@ -318,7 +317,7 @@
  * From version 1.0.1 it is:
  * XXYYZZ, where XX=major, YY=minor, ZZ=release
  */
-#define PNG_LIBPNG_VER 10637 /* 1.6.37 */
+#define PNG_LIBPNG_VER 10643 /* 1.6.43 */
 
 /* Library configuration: these options cannot be changed after
  * the library has been built.
@@ -428,7 +427,7 @@ extern "C" {
 /* This triggers a compiler error in png.c, if png.c and png.h
  * do not agree upon the version number.
  */
-typedef char* png_libpng_version_1_6_37;
+typedef char* png_libpng_version_1_6_43;
 
 /* Basic control structions.  Read libpng-manual.txt or libpng.3 for more info.
  *
@@ -849,7 +848,7 @@ PNG_FUNCTION(void, (PNGCAPI *png_longjmp_ptr), PNGARG((jmp_buf, int)), typedef);
 #define PNG_TRANSFORM_GRAY_TO_RGB   0x2000      /* read only */
 /* Added to libpng-1.5.4 */
 #define PNG_TRANSFORM_EXPAND_16     0x4000      /* read only */
-#if INT_MAX >= 0x8000 /* else this might break */
+#if ~0U > 0xffffU /* or else this might break on a 16-bit machine */
 #define PNG_TRANSFORM_SCALE_16      0x8000      /* read only */
 #endif
 
@@ -908,15 +907,15 @@ PNG_EXPORT(2, void, png_set_sig_bytes, (png_structrp png_ptr, int num_bytes));
 /* Check sig[start] through sig[start + num_to_check - 1] to see if it's a
  * PNG file.  Returns zero if the supplied bytes match the 8-byte PNG
  * signature, and non-zero otherwise.  Having num_to_check == 0 or
- * start > 7 will always fail (ie return non-zero).
+ * start > 7 will always fail (i.e. return non-zero).
  */
 PNG_EXPORT(3, int, png_sig_cmp, (png_const_bytep sig, size_t start,
     size_t num_to_check));
 
 /* Simple signature checking function.  This is the same as calling
- * png_check_sig(sig, n) := !png_sig_cmp(sig, 0, n).
+ * png_check_sig(sig, n) := (png_sig_cmp(sig, 0, n) == 0).
  */
-#define png_check_sig(sig, n) !png_sig_cmp((sig), 0, (n))
+#define png_check_sig(sig, n) (png_sig_cmp((sig), 0, (n)) == 0) /* DEPRECATED */
 
 /* Allocate and initialize png_ptr struct for reading, and any other memory. */
 PNG_EXPORTA(4, png_structp, png_create_read_struct,
@@ -1446,7 +1445,7 @@ PNG_EXPORT(66, void, png_set_crc_action, (png_structrp png_ptr, int crit_action,
  * mainly useful for testing, as the defaults should work with most users.
  * Those users who are tight on memory or want faster performance at the
  * expense of compression can modify them.  See the compression library
- * header file (zlib.h) for an explination of the compression functions.
+ * header file (zlib.h) for an explanation of the compression functions.
  */
 
 /* Set the filtering method(s) used by libpng.  Currently, the only valid
@@ -1501,7 +1500,7 @@ PNG_FIXED_EXPORT(209, void, png_set_filter_heuristics_fixed,
  * 0 - 9, corresponding directly to the zlib compression levels 0 - 9
  * (0 - no compression, 9 - "maximal" compression).  Note that tests have
  * shown that zlib compression levels 3-6 usually perform as well as level 9
- * for PNG images, and do considerably fewer caclulations.  In the future,
+ * for PNG images, and do considerably fewer calculations.  In the future,
  * these values may not correspond directly to the zlib compression levels.
  */
 #ifdef PNG_WRITE_CUSTOMIZE_COMPRESSION_SUPPORTED
@@ -1730,12 +1729,9 @@ PNG_EXPORT(97, void, png_free, (png_const_structrp png_ptr, png_voidp ptr));
 PNG_EXPORT(98, void, png_free_data, (png_const_structrp png_ptr,
     png_inforp info_ptr, png_uint_32 free_me, int num));
 
-/* Reassign responsibility for freeing existing data, whether allocated
+/* Reassign the responsibility for freeing existing data, whether allocated
  * by libpng or by the application; this works on the png_info structure passed
- * in, it does not change the state for other png_info structures.
- *
- * It is unlikely that this function works correctly as of 1.6.0 and using it
- * may result either in memory leaks or double free of allocated data.
+ * in, without changing the state for other png_info structures.
  */
 PNG_EXPORT(99, void, png_data_freer, (png_const_structrp png_ptr,
     png_inforp info_ptr, int freer, png_uint_32 mask));
@@ -3207,11 +3203,18 @@ PNG_EXPORT(245, int, png_image_write_to_memory, (png_imagep image, void *memory,
 #ifdef PNG_MIPS_MSA_API_SUPPORTED
 #  define PNG_MIPS_MSA   6 /* HARDWARE: MIPS Msa SIMD instructions supported */
 #endif
-#define PNG_IGNORE_ADLER32 8
+#ifdef PNG_DISABLE_ADLER32_CHECK_SUPPORTED
+#  define PNG_IGNORE_ADLER32 8 /* SOFTWARE: disable Adler32 check on IDAT */
+#endif
 #ifdef PNG_POWERPC_VSX_API_SUPPORTED
-#  define PNG_POWERPC_VSX   10 /* HARDWARE: PowerPC VSX SIMD instructions supported */
+#  define PNG_POWERPC_VSX   10 /* HARDWARE: PowerPC VSX SIMD instructions
+                                * supported */
 #endif
-#define PNG_OPTION_NEXT  12 /* Next option - numbers must be even */
+#ifdef PNG_MIPS_MMI_API_SUPPORTED
+#  define PNG_MIPS_MMI   12 /* HARDWARE: MIPS MMI SIMD instructions supported */
+#endif
+
+#define PNG_OPTION_NEXT  14 /* Next option - numbers must be even */
 
 /* Return values: NOTE: there are four values and 'off' is *not* zero */
 #define PNG_OPTION_UNSET   0 /* Unset - defaults to off */
diff --git a/3rdparty/libpng/pngconf.h b/3rdparty/libpng/pngconf.h
index 927a769dbee8..000d7b1a8a6e 100644
--- a/3rdparty/libpng/pngconf.h
+++ b/3rdparty/libpng/pngconf.h
@@ -1,9 +1,9 @@
 
 /* pngconf.h - machine-configurable file for libpng
  *
- * libpng version 1.6.37
+ * libpng version 1.6.43
  *
- * Copyright (c) 2018-2019 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2016,2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -180,8 +180,8 @@
  * compiler-specific macros to the values required to change the calling
  * conventions of the various functions.
  */
-#if defined(_Windows) || defined(_WINDOWS) || defined(WIN32) ||\
-    defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__)
+#if defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || \
+    defined(__CYGWIN__)
   /* Windows system (DOS doesn't support DLLs).  Includes builds under Cygwin or
    * MinGW on any architecture currently supported by Windows.  Also includes
    * Watcom builds but these need special treatment because they are not
diff --git a/3rdparty/libpng/pngerror.c b/3rdparty/libpng/pngerror.c
index ec3a709b9d2c..29ebda794377 100644
--- a/3rdparty/libpng/pngerror.c
+++ b/3rdparty/libpng/pngerror.c
@@ -1,7 +1,7 @@
 
 /* pngerror.c - stub functions for i/o and memory allocation
  *
- * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2017 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -255,7 +255,7 @@ void
 png_warning_parameter_unsigned(png_warning_parameters p, int number, int format,
     png_alloc_size_t value)
 {
-   char buffer[PNG_NUMBER_BUFFER_SIZE];
+   char buffer[PNG_NUMBER_BUFFER_SIZE] = {0};
    png_warning_parameter(p, number, PNG_FORMAT_NUMBER(buffer, format, value));
 }
 
@@ -265,7 +265,7 @@ png_warning_parameter_signed(png_warning_parameters p, int number, int format,
 {
    png_alloc_size_t u;
    png_charp str;
-   char buffer[PNG_NUMBER_BUFFER_SIZE];
+   char buffer[PNG_NUMBER_BUFFER_SIZE] = {0};
 
    /* Avoid overflow by doing the negate in a png_alloc_size_t: */
    u = (png_alloc_size_t)value;
@@ -858,7 +858,7 @@ png_get_error_ptr(png_const_structrp png_ptr)
    if (png_ptr == NULL)
       return NULL;
 
-   return ((png_voidp)png_ptr->error_ptr);
+   return (png_voidp)png_ptr->error_ptr;
 }
 
 
@@ -933,31 +933,25 @@ png_safe_warning(png_structp png_nonconst_ptr, png_const_charp warning_message)
 #endif
 
 int /* PRIVATE */
-png_safe_execute(png_imagep image_in, int (*function)(png_voidp), png_voidp arg)
+png_safe_execute(png_imagep image, int (*function)(png_voidp), png_voidp arg)
 {
-   volatile png_imagep image = image_in;
-   volatile int result;
-   volatile png_voidp saved_error_buf;
+   png_voidp saved_error_buf = image->opaque->error_buf;
    jmp_buf safe_jmpbuf;
+   int result;
 
-   /* Safely execute function(arg) with png_error returning to this function. */
-   saved_error_buf = image->opaque->error_buf;
-   result = setjmp(safe_jmpbuf) == 0;
-
-   if (result != 0)
+   /* Safely execute function(arg), with png_error returning back here. */
+   if (setjmp(safe_jmpbuf) == 0)
    {
-
       image->opaque->error_buf = safe_jmpbuf;
       result = function(arg);
+      image->opaque->error_buf = saved_error_buf;
+      return result;
    }
 
+   /* On png_error, return via longjmp, pop the jmpbuf, and free the image. */
    image->opaque->error_buf = saved_error_buf;
-
-   /* And do the cleanup prior to any failure return. */
-   if (result == 0)
-      png_image_free(image);
-
-   return result;
+   png_image_free(image);
+   return 0;
 }
 #endif /* SIMPLIFIED READ || SIMPLIFIED_WRITE */
 #endif /* READ || WRITE */
diff --git a/3rdparty/libpng/pngget.c b/3rdparty/libpng/pngget.c
index 5abf1efd9f73..1084b268ff90 100644
--- a/3rdparty/libpng/pngget.c
+++ b/3rdparty/libpng/pngget.c
@@ -1,7 +1,7 @@
 
 /* pngget.c - retrieval of values from info struct
  *
- * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -21,18 +21,29 @@ png_get_valid(png_const_structrp png_ptr, png_const_inforp info_ptr,
     png_uint_32 flag)
 {
    if (png_ptr != NULL && info_ptr != NULL)
-      return(info_ptr->valid & flag);
+   {
+#ifdef PNG_READ_tRNS_SUPPORTED
+      /* png_handle_PLTE() may have canceled a valid tRNS chunk but left the
+       * 'valid' flag for the detection of duplicate chunks. Do not report a
+       * valid tRNS chunk in this case.
+       */
+      if (flag == PNG_INFO_tRNS && png_ptr->num_trans == 0)
+         return 0;
+#endif
 
-   return(0);
+      return info_ptr->valid & flag;
+   }
+
+   return 0;
 }
 
 size_t PNGAPI
 png_get_rowbytes(png_const_structrp png_ptr, png_const_inforp info_ptr)
 {
    if (png_ptr != NULL && info_ptr != NULL)
-      return(info_ptr->rowbytes);
+      return info_ptr->rowbytes;
 
-   return(0);
+   return 0;
 }
 
 #ifdef PNG_INFO_IMAGE_SUPPORTED
@@ -40,9 +51,9 @@ png_bytepp PNGAPI
 png_get_rows(png_const_structrp png_ptr, png_const_inforp info_ptr)
 {
    if (png_ptr != NULL && info_ptr != NULL)
-      return(info_ptr->row_pointers);
+      return info_ptr->row_pointers;
 
-   return(0);
+   return 0;
 }
 #endif
 
@@ -54,7 +65,7 @@ png_get_image_width(png_const_structrp png_ptr, png_const_inforp info_ptr)
    if (png_ptr != NULL && info_ptr != NULL)
       return info_ptr->width;
 
-   return (0);
+   return 0;
 }
 
 png_uint_32 PNGAPI
@@ -63,7 +74,7 @@ png_get_image_height(png_const_structrp png_ptr, png_const_inforp info_ptr)
    if (png_ptr != NULL && info_ptr != NULL)
       return info_ptr->height;
 
-   return (0);
+   return 0;
 }
 
 png_byte PNGAPI
@@ -72,7 +83,7 @@ png_get_bit_depth(png_const_structrp png_ptr, png_const_inforp info_ptr)
    if (png_ptr != NULL && info_ptr != NULL)
       return info_ptr->bit_depth;
 
-   return (0);
+   return 0;
 }
 
 png_byte PNGAPI
@@ -81,7 +92,7 @@ png_get_color_type(png_const_structrp png_ptr, png_const_inforp info_ptr)
    if (png_ptr != NULL && info_ptr != NULL)
       return info_ptr->color_type;
 
-   return (0);
+   return 0;
 }
 
 png_byte PNGAPI
@@ -90,7 +101,7 @@ png_get_filter_type(png_const_structrp png_ptr, png_const_inforp info_ptr)
    if (png_ptr != NULL && info_ptr != NULL)
       return info_ptr->filter_type;
 
-   return (0);
+   return 0;
 }
 
 png_byte PNGAPI
@@ -99,7 +110,7 @@ png_get_interlace_type(png_const_structrp png_ptr, png_const_inforp info_ptr)
    if (png_ptr != NULL && info_ptr != NULL)
       return info_ptr->interlace_type;
 
-   return (0);
+   return 0;
 }
 
 png_byte PNGAPI
@@ -108,7 +119,7 @@ png_get_compression_type(png_const_structrp png_ptr, png_const_inforp info_ptr)
    if (png_ptr != NULL && info_ptr != NULL)
       return info_ptr->compression_type;
 
-   return (0);
+   return 0;
 }
 
 png_uint_32 PNGAPI
@@ -116,21 +127,20 @@ png_get_x_pixels_per_meter(png_const_structrp png_ptr, png_const_inforp
    info_ptr)
 {
 #ifdef PNG_pHYs_SUPPORTED
+   png_debug(1, "in png_get_x_pixels_per_meter");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_pHYs) != 0)
-      {
-         png_debug1(1, "in %s retrieval function",
-             "png_get_x_pixels_per_meter");
-
-         if (info_ptr->phys_unit_type == PNG_RESOLUTION_METER)
-            return (info_ptr->x_pixels_per_unit);
-      }
+   {
+      if (info_ptr->phys_unit_type == PNG_RESOLUTION_METER)
+         return info_ptr->x_pixels_per_unit;
+   }
 #else
    PNG_UNUSED(png_ptr)
    PNG_UNUSED(info_ptr)
 #endif
 
-   return (0);
+   return 0;
 }
 
 png_uint_32 PNGAPI
@@ -138,42 +148,41 @@ png_get_y_pixels_per_meter(png_const_structrp png_ptr, png_const_inforp
     info_ptr)
 {
 #ifdef PNG_pHYs_SUPPORTED
+   png_debug(1, "in png_get_y_pixels_per_meter");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_pHYs) != 0)
    {
-      png_debug1(1, "in %s retrieval function",
-          "png_get_y_pixels_per_meter");
-
       if (info_ptr->phys_unit_type == PNG_RESOLUTION_METER)
-         return (info_ptr->y_pixels_per_unit);
+         return info_ptr->y_pixels_per_unit;
    }
 #else
    PNG_UNUSED(png_ptr)
    PNG_UNUSED(info_ptr)
 #endif
 
-   return (0);
+   return 0;
 }
 
 png_uint_32 PNGAPI
 png_get_pixels_per_meter(png_const_structrp png_ptr, png_const_inforp info_ptr)
 {
 #ifdef PNG_pHYs_SUPPORTED
+   png_debug(1, "in png_get_pixels_per_meter");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_pHYs) != 0)
    {
-      png_debug1(1, "in %s retrieval function", "png_get_pixels_per_meter");
-
       if (info_ptr->phys_unit_type == PNG_RESOLUTION_METER &&
           info_ptr->x_pixels_per_unit == info_ptr->y_pixels_per_unit)
-         return (info_ptr->x_pixels_per_unit);
+         return info_ptr->x_pixels_per_unit;
    }
 #else
    PNG_UNUSED(png_ptr)
    PNG_UNUSED(info_ptr)
 #endif
 
-   return (0);
+   return 0;
 }
 
 #ifdef PNG_FLOATING_POINT_SUPPORTED
@@ -182,21 +191,21 @@ png_get_pixel_aspect_ratio(png_const_structrp png_ptr, png_const_inforp
    info_ptr)
 {
 #ifdef PNG_READ_pHYs_SUPPORTED
+   png_debug(1, "in png_get_pixel_aspect_ratio");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_pHYs) != 0)
    {
-      png_debug1(1, "in %s retrieval function", "png_get_aspect_ratio");
-
       if (info_ptr->x_pixels_per_unit != 0)
-         return ((float)((float)info_ptr->y_pixels_per_unit
-             /(float)info_ptr->x_pixels_per_unit));
+         return (float)info_ptr->y_pixels_per_unit
+              / (float)info_ptr->x_pixels_per_unit;
    }
 #else
    PNG_UNUSED(png_ptr)
    PNG_UNUSED(info_ptr)
 #endif
 
-   return ((float)0.0);
+   return (float)0.0;
 }
 #endif
 
@@ -206,6 +215,8 @@ png_get_pixel_aspect_ratio_fixed(png_const_structrp png_ptr,
     png_const_inforp info_ptr)
 {
 #ifdef PNG_READ_pHYs_SUPPORTED
+   png_debug(1, "in png_get_pixel_aspect_ratio_fixed");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_pHYs) != 0 &&
        info_ptr->x_pixels_per_unit > 0 && info_ptr->y_pixels_per_unit > 0 &&
@@ -214,8 +225,6 @@ png_get_pixel_aspect_ratio_fixed(png_const_structrp png_ptr,
    {
       png_fixed_point res;
 
-      png_debug1(1, "in %s retrieval function", "png_get_aspect_ratio_fixed");
-
       /* The following casts work because a PNG 4 byte integer only has a valid
        * range of 0..2^31-1; otherwise the cast might overflow.
        */
@@ -236,80 +245,80 @@ png_int_32 PNGAPI
 png_get_x_offset_microns(png_const_structrp png_ptr, png_const_inforp info_ptr)
 {
 #ifdef PNG_oFFs_SUPPORTED
+   png_debug(1, "in png_get_x_offset_microns");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_oFFs) != 0)
    {
-      png_debug1(1, "in %s retrieval function", "png_get_x_offset_microns");
-
       if (info_ptr->offset_unit_type == PNG_OFFSET_MICROMETER)
-         return (info_ptr->x_offset);
+         return info_ptr->x_offset;
    }
 #else
    PNG_UNUSED(png_ptr)
    PNG_UNUSED(info_ptr)
 #endif
 
-   return (0);
+   return 0;
 }
 
 png_int_32 PNGAPI
 png_get_y_offset_microns(png_const_structrp png_ptr, png_const_inforp info_ptr)
 {
 #ifdef PNG_oFFs_SUPPORTED
+   png_debug(1, "in png_get_y_offset_microns");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_oFFs) != 0)
    {
-      png_debug1(1, "in %s retrieval function", "png_get_y_offset_microns");
-
       if (info_ptr->offset_unit_type == PNG_OFFSET_MICROMETER)
-         return (info_ptr->y_offset);
+         return info_ptr->y_offset;
    }
 #else
    PNG_UNUSED(png_ptr)
    PNG_UNUSED(info_ptr)
 #endif
 
-   return (0);
+   return 0;
 }
 
 png_int_32 PNGAPI
 png_get_x_offset_pixels(png_const_structrp png_ptr, png_const_inforp info_ptr)
 {
 #ifdef PNG_oFFs_SUPPORTED
+   png_debug(1, "in png_get_x_offset_pixels");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_oFFs) != 0)
    {
-      png_debug1(1, "in %s retrieval function", "png_get_x_offset_pixels");
-
       if (info_ptr->offset_unit_type == PNG_OFFSET_PIXEL)
-         return (info_ptr->x_offset);
+         return info_ptr->x_offset;
    }
 #else
    PNG_UNUSED(png_ptr)
    PNG_UNUSED(info_ptr)
 #endif
 
-   return (0);
+   return 0;
 }
 
 png_int_32 PNGAPI
 png_get_y_offset_pixels(png_const_structrp png_ptr, png_const_inforp info_ptr)
 {
 #ifdef PNG_oFFs_SUPPORTED
+   png_debug(1, "in png_get_y_offset_pixels");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_oFFs) != 0)
    {
-      png_debug1(1, "in %s retrieval function", "png_get_y_offset_pixels");
-
       if (info_ptr->offset_unit_type == PNG_OFFSET_PIXEL)
-         return (info_ptr->y_offset);
+         return info_ptr->y_offset;
    }
 #else
    PNG_UNUSED(png_ptr)
    PNG_UNUSED(info_ptr)
 #endif
 
-   return (0);
+   return 0;
 }
 
 #ifdef PNG_INCH_CONVERSIONS_SUPPORTED
@@ -423,11 +432,11 @@ png_get_pHYs_dpi(png_const_structrp png_ptr, png_const_inforp info_ptr,
 {
    png_uint_32 retval = 0;
 
+   png_debug1(1, "in %s retrieval function", "pHYs");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_pHYs) != 0)
    {
-      png_debug1(1, "in %s retrieval function", "pHYs");
-
       if (res_x != NULL)
       {
          *res_x = info_ptr->x_pixels_per_unit;
@@ -453,7 +462,7 @@ png_get_pHYs_dpi(png_const_structrp png_ptr, png_const_inforp info_ptr,
       }
    }
 
-   return (retval);
+   return retval;
 }
 #endif /* pHYs */
 #endif /* INCH_CONVERSIONS */
@@ -467,9 +476,9 @@ png_byte PNGAPI
 png_get_channels(png_const_structrp png_ptr, png_const_inforp info_ptr)
 {
    if (png_ptr != NULL && info_ptr != NULL)
-      return(info_ptr->channels);
+      return info_ptr->channels;
 
-   return (0);
+   return 0;
 }
 
 #ifdef PNG_READ_SUPPORTED
@@ -477,9 +486,9 @@ png_const_bytep PNGAPI
 png_get_signature(png_const_structrp png_ptr, png_const_inforp info_ptr)
 {
    if (png_ptr != NULL && info_ptr != NULL)
-      return(info_ptr->signature);
+      return info_ptr->signature;
 
-   return (NULL);
+   return NULL;
 }
 #endif
 
@@ -488,17 +497,17 @@ png_uint_32 PNGAPI
 png_get_bKGD(png_const_structrp png_ptr, png_inforp info_ptr,
     png_color_16p *background)
 {
+   png_debug1(1, "in %s retrieval function", "bKGD");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_bKGD) != 0 &&
        background != NULL)
    {
-      png_debug1(1, "in %s retrieval function", "bKGD");
-
       *background = &(info_ptr->background);
-      return (PNG_INFO_bKGD);
+      return PNG_INFO_bKGD;
    }
 
-   return (0);
+   return 0;
 }
 #endif
 
@@ -513,6 +522,8 @@ png_get_cHRM(png_const_structrp png_ptr, png_const_inforp info_ptr,
     double *white_x, double *white_y, double *red_x, double *red_y,
     double *green_x, double *green_y, double *blue_x, double *blue_y)
 {
+   png_debug1(1, "in %s retrieval function", "cHRM");
+
    /* Quiet API change: this code used to only return the end points if a cHRM
     * chunk was present, but the end points can also come from iCCP or sRGB
     * chunks, so in 1.6.0 the png_get_ APIs return the end points regardless and
@@ -522,8 +533,6 @@ png_get_cHRM(png_const_structrp png_ptr, png_const_inforp info_ptr,
    if (png_ptr != NULL && info_ptr != NULL &&
       (info_ptr->colorspace.flags & PNG_COLORSPACE_HAVE_ENDPOINTS) != 0)
    {
-      png_debug1(1, "in %s retrieval function", "cHRM");
-
       if (white_x != NULL)
          *white_x = png_float(png_ptr,
              info_ptr->colorspace.end_points_xy.whitex, "cHRM white X");
@@ -548,10 +557,10 @@ png_get_cHRM(png_const_structrp png_ptr, png_const_inforp info_ptr,
       if (blue_y != NULL)
          *blue_y = png_float(png_ptr, info_ptr->colorspace.end_points_xy.bluey,
              "cHRM blue Y");
-      return (PNG_INFO_cHRM);
+      return PNG_INFO_cHRM;
    }
 
-   return (0);
+   return 0;
 }
 
 png_uint_32 PNGAPI
@@ -560,11 +569,11 @@ png_get_cHRM_XYZ(png_const_structrp png_ptr, png_const_inforp info_ptr,
     double *green_Y, double *green_Z, double *blue_X, double *blue_Y,
     double *blue_Z)
 {
+   png_debug1(1, "in %s retrieval function", "cHRM_XYZ(float)");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->colorspace.flags & PNG_COLORSPACE_HAVE_ENDPOINTS) != 0)
    {
-      png_debug1(1, "in %s retrieval function", "cHRM_XYZ(float)");
-
       if (red_X != NULL)
          *red_X = png_float(png_ptr, info_ptr->colorspace.end_points_XYZ.red_X,
              "cHRM red X");
@@ -592,10 +601,10 @@ png_get_cHRM_XYZ(png_const_structrp png_ptr, png_const_inforp info_ptr,
       if (blue_Z != NULL)
          *blue_Z = png_float(png_ptr,
              info_ptr->colorspace.end_points_XYZ.blue_Z, "cHRM blue Z");
-      return (PNG_INFO_cHRM);
+      return PNG_INFO_cHRM;
    }
 
-   return (0);
+   return 0;
 }
 #  endif
 
@@ -608,11 +617,11 @@ png_get_cHRM_XYZ_fixed(png_const_structrp png_ptr, png_const_inforp info_ptr,
     png_fixed_point *int_blue_X, png_fixed_point *int_blue_Y,
     png_fixed_point *int_blue_Z)
 {
+   png_debug1(1, "in %s retrieval function", "cHRM_XYZ");
+
    if (png_ptr != NULL && info_ptr != NULL &&
       (info_ptr->colorspace.flags & PNG_COLORSPACE_HAVE_ENDPOINTS) != 0)
    {
-      png_debug1(1, "in %s retrieval function", "cHRM_XYZ");
-
       if (int_red_X != NULL)
          *int_red_X = info_ptr->colorspace.end_points_XYZ.red_X;
       if (int_red_Y != NULL)
@@ -631,10 +640,10 @@ png_get_cHRM_XYZ_fixed(png_const_structrp png_ptr, png_const_inforp info_ptr,
          *int_blue_Y = info_ptr->colorspace.end_points_XYZ.blue_Y;
       if (int_blue_Z != NULL)
          *int_blue_Z = info_ptr->colorspace.end_points_XYZ.blue_Z;
-      return (PNG_INFO_cHRM);
+      return PNG_INFO_cHRM;
    }
 
-   return (0);
+   return 0;
 }
 
 png_uint_32 PNGAPI
@@ -664,10 +673,10 @@ png_get_cHRM_fixed(png_const_structrp png_ptr, png_const_inforp info_ptr,
          *blue_x = info_ptr->colorspace.end_points_xy.bluex;
       if (blue_y != NULL)
          *blue_y = info_ptr->colorspace.end_points_xy.bluey;
-      return (PNG_INFO_cHRM);
+      return PNG_INFO_cHRM;
    }
 
-   return (0);
+   return 0;
 }
 #  endif
 #endif
@@ -685,10 +694,10 @@ png_get_gAMA_fixed(png_const_structrp png_ptr, png_const_inforp info_ptr,
        file_gamma != NULL)
    {
       *file_gamma = info_ptr->colorspace.gamma;
-      return (PNG_INFO_gAMA);
+      return PNG_INFO_gAMA;
    }
 
-   return (0);
+   return 0;
 }
 #  endif
 
@@ -705,10 +714,10 @@ png_get_gAMA(png_const_structrp png_ptr, png_const_inforp info_ptr,
    {
       *file_gamma = png_float(png_ptr, info_ptr->colorspace.gamma,
           "png_get_gAMA");
-      return (PNG_INFO_gAMA);
+      return PNG_INFO_gAMA;
    }
 
-   return (0);
+   return 0;
 }
 #  endif
 #endif
@@ -724,10 +733,10 @@ png_get_sRGB(png_const_structrp png_ptr, png_const_inforp info_ptr,
       (info_ptr->valid & PNG_INFO_sRGB) != 0 && file_srgb_intent != NULL)
    {
       *file_srgb_intent = info_ptr->colorspace.rendering_intent;
-      return (PNG_INFO_sRGB);
+      return PNG_INFO_sRGB;
    }
 
-   return (0);
+   return 0;
 }
 #endif
 
@@ -751,10 +760,10 @@ png_get_iCCP(png_const_structrp png_ptr, png_inforp info_ptr,
        */
       if (compression_type != NULL)
          *compression_type = PNG_COMPRESSION_TYPE_BASE;
-      return (PNG_INFO_iCCP);
+      return PNG_INFO_iCCP;
    }
 
-   return (0);
+   return 0;
 
 }
 #endif
@@ -764,13 +773,15 @@ int PNGAPI
 png_get_sPLT(png_const_structrp png_ptr, png_inforp info_ptr,
     png_sPLT_tpp spalettes)
 {
+   png_debug1(1, "in %s retrieval function", "sPLT");
+
    if (png_ptr != NULL && info_ptr != NULL && spalettes != NULL)
    {
       *spalettes = info_ptr->splt_palettes;
       return info_ptr->splt_palettes_num;
    }
 
-   return (0);
+   return 0;
 }
 #endif
 
@@ -796,10 +807,10 @@ png_get_eXIf_1(png_const_structrp png_ptr, png_const_inforp info_ptr,
    {
       *num_exif = info_ptr->num_exif;
       *exif = info_ptr->exif;
-      return (PNG_INFO_eXIf);
+      return PNG_INFO_eXIf;
    }
 
-   return (0);
+   return 0;
 }
 #endif
 
@@ -814,10 +825,10 @@ png_get_hIST(png_const_structrp png_ptr, png_inforp info_ptr,
        (info_ptr->valid & PNG_INFO_hIST) != 0 && hist != NULL)
    {
       *hist = info_ptr->hist;
-      return (PNG_INFO_hIST);
+      return PNG_INFO_hIST;
    }
 
-   return (0);
+   return 0;
 }
 #endif
 
@@ -830,7 +841,7 @@ png_get_IHDR(png_const_structrp png_ptr, png_const_inforp info_ptr,
    png_debug1(1, "in %s retrieval function", "IHDR");
 
    if (png_ptr == NULL || info_ptr == NULL)
-      return (0);
+      return 0;
 
    if (width != NULL)
        *width = info_ptr->width;
@@ -862,7 +873,7 @@ png_get_IHDR(png_const_structrp png_ptr, png_const_inforp info_ptr,
        info_ptr->bit_depth, info_ptr->color_type, info_ptr->interlace_type,
        info_ptr->compression_type, info_ptr->filter_type);
 
-   return (1);
+   return 1;
 }
 
 #ifdef PNG_oFFs_SUPPORTED
@@ -879,10 +890,10 @@ png_get_oFFs(png_const_structrp png_ptr, png_const_inforp info_ptr,
       *offset_x = info_ptr->x_offset;
       *offset_y = info_ptr->y_offset;
       *unit_type = (int)info_ptr->offset_unit_type;
-      return (PNG_INFO_oFFs);
+      return PNG_INFO_oFFs;
    }
 
-   return (0);
+   return 0;
 }
 #endif
 
@@ -906,10 +917,10 @@ png_get_pCAL(png_const_structrp png_ptr, png_inforp info_ptr,
       *nparams = (int)info_ptr->pcal_nparams;
       *units = info_ptr->pcal_units;
       *params = info_ptr->pcal_params;
-      return (PNG_INFO_pCAL);
+      return PNG_INFO_pCAL;
    }
 
-   return (0);
+   return 0;
 }
 #endif
 
@@ -921,6 +932,8 @@ png_uint_32 PNGAPI
 png_get_sCAL_fixed(png_const_structrp png_ptr, png_const_inforp info_ptr,
     int *unit, png_fixed_point *width, png_fixed_point *height)
 {
+   png_debug1(1, "in %s retrieval function", "sCAL");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_sCAL) != 0)
    {
@@ -932,10 +945,10 @@ png_get_sCAL_fixed(png_const_structrp png_ptr, png_const_inforp info_ptr,
       *width = png_fixed(png_ptr, atof(info_ptr->scal_s_width), "sCAL width");
       *height = png_fixed(png_ptr, atof(info_ptr->scal_s_height),
           "sCAL height");
-      return (PNG_INFO_sCAL);
+      return PNG_INFO_sCAL;
    }
 
-   return(0);
+   return 0;
 }
 #    endif /* FLOATING_ARITHMETIC */
 #  endif /* FIXED_POINT */
@@ -944,32 +957,36 @@ png_uint_32 PNGAPI
 png_get_sCAL(png_const_structrp png_ptr, png_const_inforp info_ptr,
     int *unit, double *width, double *height)
 {
+   png_debug1(1, "in %s retrieval function", "sCAL(float)");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_sCAL) != 0)
    {
       *unit = info_ptr->scal_unit;
       *width = atof(info_ptr->scal_s_width);
       *height = atof(info_ptr->scal_s_height);
-      return (PNG_INFO_sCAL);
+      return PNG_INFO_sCAL;
    }
 
-   return(0);
+   return 0;
 }
 #  endif /* FLOATING POINT */
 png_uint_32 PNGAPI
 png_get_sCAL_s(png_const_structrp png_ptr, png_const_inforp info_ptr,
     int *unit, png_charpp width, png_charpp height)
 {
+   png_debug1(1, "in %s retrieval function", "sCAL(str)");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_sCAL) != 0)
    {
       *unit = info_ptr->scal_unit;
       *width = info_ptr->scal_s_width;
       *height = info_ptr->scal_s_height;
-      return (PNG_INFO_sCAL);
+      return PNG_INFO_sCAL;
    }
 
-   return(0);
+   return 0;
 }
 #endif /* sCAL */
 
@@ -1004,7 +1021,7 @@ png_get_pHYs(png_const_structrp png_ptr, png_const_inforp info_ptr,
       }
    }
 
-   return (retval);
+   return retval;
 }
 #endif /* pHYs */
 
@@ -1020,10 +1037,10 @@ png_get_PLTE(png_const_structrp png_ptr, png_inforp info_ptr,
       *palette = info_ptr->palette;
       *num_palette = info_ptr->num_palette;
       png_debug1(3, "num_palette = %d", *num_palette);
-      return (PNG_INFO_PLTE);
+      return PNG_INFO_PLTE;
    }
 
-   return (0);
+   return 0;
 }
 
 #ifdef PNG_sBIT_SUPPORTED
@@ -1037,10 +1054,10 @@ png_get_sBIT(png_const_structrp png_ptr, png_inforp info_ptr,
        (info_ptr->valid & PNG_INFO_sBIT) != 0 && sig_bit != NULL)
    {
       *sig_bit = &(info_ptr->sig_bit);
-      return (PNG_INFO_sBIT);
+      return PNG_INFO_sBIT;
    }
 
-   return (0);
+   return 0;
 }
 #endif
 
@@ -1051,7 +1068,7 @@ png_get_text(png_const_structrp png_ptr, png_inforp info_ptr,
 {
    if (png_ptr != NULL && info_ptr != NULL && info_ptr->num_text > 0)
    {
-      png_debug1(1, "in 0x%lx retrieval function",
+      png_debug1(1, "in text retrieval function, chunk typeid = 0x%lx",
          (unsigned long)png_ptr->chunk_name);
 
       if (text_ptr != NULL)
@@ -1066,7 +1083,7 @@ png_get_text(png_const_structrp png_ptr, png_inforp info_ptr,
    if (num_text != NULL)
       *num_text = 0;
 
-   return(0);
+   return 0;
 }
 #endif
 
@@ -1081,10 +1098,10 @@ png_get_tIME(png_const_structrp png_ptr, png_inforp info_ptr,
        (info_ptr->valid & PNG_INFO_tIME) != 0 && mod_time != NULL)
    {
       *mod_time = &(info_ptr->mod_time);
-      return (PNG_INFO_tIME);
+      return PNG_INFO_tIME;
    }
 
-   return (0);
+   return 0;
 }
 #endif
 
@@ -1094,11 +1111,12 @@ png_get_tRNS(png_const_structrp png_ptr, png_inforp info_ptr,
     png_bytep *trans_alpha, int *num_trans, png_color_16p *trans_color)
 {
    png_uint_32 retval = 0;
+
+   png_debug1(1, "in %s retrieval function", "tRNS");
+
    if (png_ptr != NULL && info_ptr != NULL &&
        (info_ptr->valid & PNG_INFO_tRNS) != 0)
    {
-      png_debug1(1, "in %s retrieval function", "tRNS");
-
       if (info_ptr->color_type == PNG_COLOR_TYPE_PALETTE)
       {
          if (trans_alpha != NULL)
@@ -1130,7 +1148,7 @@ png_get_tRNS(png_const_structrp png_ptr, png_inforp info_ptr,
       }
    }
 
-   return (retval);
+   return retval;
 }
 #endif
 
@@ -1145,13 +1163,13 @@ png_get_unknown_chunks(png_const_structrp png_ptr, png_inforp info_ptr,
       return info_ptr->unknown_chunks_num;
    }
 
-   return (0);
+   return 0;
 }
 #endif
 
 #ifdef PNG_READ_RGB_TO_GRAY_SUPPORTED
 png_byte PNGAPI
-png_get_rgb_to_gray_status (png_const_structrp png_ptr)
+png_get_rgb_to_gray_status(png_const_structrp png_ptr)
 {
    return (png_byte)(png_ptr ? png_ptr->rgb_to_gray_status : 0);
 }
@@ -1192,27 +1210,27 @@ png_get_compression_buffer_size(png_const_structrp png_ptr)
 /* These functions were added to libpng 1.2.6 and were enabled
  * by default in libpng-1.4.0 */
 png_uint_32 PNGAPI
-png_get_user_width_max (png_const_structrp png_ptr)
+png_get_user_width_max(png_const_structrp png_ptr)
 {
    return (png_ptr ? png_ptr->user_width_max : 0);
 }
 
 png_uint_32 PNGAPI
-png_get_user_height_max (png_const_structrp png_ptr)
+png_get_user_height_max(png_const_structrp png_ptr)
 {
    return (png_ptr ? png_ptr->user_height_max : 0);
 }
 
 /* This function was added to libpng 1.4.0 */
 png_uint_32 PNGAPI
-png_get_chunk_cache_max (png_const_structrp png_ptr)
+png_get_chunk_cache_max(png_const_structrp png_ptr)
 {
    return (png_ptr ? png_ptr->user_chunk_cache_max : 0);
 }
 
 /* This function was added to libpng 1.4.1 */
 png_alloc_size_t PNGAPI
-png_get_chunk_malloc_max (png_const_structrp png_ptr)
+png_get_chunk_malloc_max(png_const_structrp png_ptr)
 {
    return (png_ptr ? png_ptr->user_chunk_malloc_max : 0);
 }
@@ -1221,13 +1239,13 @@ png_get_chunk_malloc_max (png_const_structrp png_ptr)
 /* These functions were added to libpng 1.4.0 */
 #ifdef PNG_IO_STATE_SUPPORTED
 png_uint_32 PNGAPI
-png_get_io_state (png_const_structrp png_ptr)
+png_get_io_state(png_const_structrp png_ptr)
 {
    return png_ptr->io_state;
 }
 
 png_uint_32 PNGAPI
-png_get_io_chunk_type (png_const_structrp png_ptr)
+png_get_io_chunk_type(png_const_structrp png_ptr)
 {
    return png_ptr->chunk_name;
 }
@@ -1241,7 +1259,7 @@ png_get_palette_max(png_const_structp png_ptr, png_const_infop info_ptr)
    if (png_ptr != NULL && info_ptr != NULL)
       return png_ptr->num_palette_max;
 
-   return (-1);
+   return -1;
 }
 #  endif
 #endif
diff --git a/3rdparty/libpng/pngpread.c b/3rdparty/libpng/pngpread.c
index e283627b77cd..ffab19c08c06 100644
--- a/3rdparty/libpng/pngpread.c
+++ b/3rdparty/libpng/pngpread.c
@@ -1,7 +1,7 @@
 
 /* pngpread.c - read a png file in push mode
  *
- * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -145,10 +145,10 @@ png_push_read_sig(png_structrp png_ptr, png_inforp info_ptr)
        num_to_check);
    png_ptr->sig_bytes = (png_byte)(png_ptr->sig_bytes + num_to_check);
 
-   if (png_sig_cmp(info_ptr->signature, num_checked, num_to_check))
+   if (png_sig_cmp(info_ptr->signature, num_checked, num_to_check) != 0)
    {
       if (num_checked < 4 &&
-          png_sig_cmp(info_ptr->signature, num_checked, num_to_check - 4))
+          png_sig_cmp(info_ptr->signature, num_checked, num_to_check - 4) != 0)
          png_error(png_ptr, "Not a PNG file");
 
       else
@@ -294,6 +294,14 @@ png_push_read_chunk(png_structrp png_ptr, png_inforp info_ptr)
       png_handle_cHRM(png_ptr, info_ptr, png_ptr->push_length);
    }
 
+#endif
+#ifdef PNG_READ_eXIf_SUPPORTED
+   else if (png_ptr->chunk_name == png_eXIf)
+   {
+      PNG_PUSH_SAVE_BUFFER_IF_FULL
+      png_handle_eXIf(png_ptr, info_ptr, png_ptr->push_length);
+   }
+
 #endif
 #ifdef PNG_READ_sRGB_SUPPORTED
    else if (chunk_name == png_sRGB)
@@ -1089,7 +1097,7 @@ png_voidp PNGAPI
 png_get_progressive_ptr(png_const_structrp png_ptr)
 {
    if (png_ptr == NULL)
-      return (NULL);
+      return NULL;
 
    return png_ptr->io_ptr;
 }
diff --git a/3rdparty/libpng/pngpriv.h b/3rdparty/libpng/pngpriv.h
index 583c26f9bdec..9bfdb7134218 100644
--- a/3rdparty/libpng/pngpriv.h
+++ b/3rdparty/libpng/pngpriv.h
@@ -1,7 +1,7 @@
 
 /* pngpriv.h - private declarations for use inside libpng
  *
- * Copyright (c) 2018-2019 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -36,7 +36,7 @@
  * still required (as of 2011-05-02.)
  */
 #ifndef _POSIX_SOURCE
-# define _POSIX_SOURCE 1 /* Just the POSIX 1003.1 and C89 APIs */
+#  define _POSIX_SOURCE 1 /* Just the POSIX 1003.1 and C89 APIs */
 #endif
 
 #ifndef PNG_VERSION_INFO_ONLY
@@ -174,7 +174,7 @@
 #     else /* !defined __ARM_NEON__ */
          /* The 'intrinsics' code simply won't compile without this -mfpu=neon:
           */
-#        if !defined(__aarch64__)
+#        if !defined(__aarch64__) && !defined(_M_ARM64)
             /* The assembler code currently does not work on ARM64 */
 #          define PNG_ARM_NEON_IMPLEMENTATION 2
 #        endif /* __aarch64__ */
@@ -185,16 +185,32 @@
       /* Use the intrinsics code by default. */
 #     define PNG_ARM_NEON_IMPLEMENTATION 1
 #  endif
+#else /* PNG_ARM_NEON_OPT == 0 */
+#     define PNG_ARM_NEON_IMPLEMENTATION 0
 #endif /* PNG_ARM_NEON_OPT > 0 */
 
 #ifndef PNG_MIPS_MSA_OPT
-#  if defined(__mips_msa) && (__mips_isa_rev >= 5) && defined(PNG_ALIGNED_MEMORY_SUPPORTED)
+#  if defined(__mips_msa) && (__mips_isa_rev >= 5) && \
+   defined(PNG_ALIGNED_MEMORY_SUPPORTED)
 #     define PNG_MIPS_MSA_OPT 2
 #  else
 #     define PNG_MIPS_MSA_OPT 0
 #  endif
 #endif
 
+#ifndef PNG_MIPS_MMI_OPT
+#  ifdef PNG_MIPS_MMI
+#    if defined(__mips_loongson_mmi) && (_MIPS_SIM == _ABI64) && \
+     defined(PNG_ALIGNED_MEMORY_SUPPORTED)
+#       define PNG_MIPS_MMI_OPT 1
+#    else
+#       define PNG_MIPS_MMI_OPT 0
+#    endif
+#  else
+#    define PNG_MIPS_MMI_OPT 0
+#  endif
+#endif
+
 #ifndef PNG_POWERPC_VSX_OPT
 #  if defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__)
 #     define PNG_POWERPC_VSX_OPT 2
@@ -203,13 +219,21 @@
 #  endif
 #endif
 
+#ifndef PNG_LOONGARCH_LSX_OPT
+#  if defined(__loongarch_sx)
+#     define PNG_LOONGARCH_LSX_OPT 1
+#  else
+#     define PNG_LOONGARCH_LSX_OPT 0
+#  endif
+#endif
+
 #ifndef PNG_INTEL_SSE_OPT
 #   ifdef PNG_INTEL_SSE
       /* Only check for SSE if the build configuration has been modified to
        * enable SSE optimizations.  This means that these optimizations will
        * be off by default.  See contrib/intel for more details.
        */
-#     if defined(__SSE4_1__) || defined(__AVX__) || defined(__SSSE3__) || \
+#      if defined(__SSE4_1__) || defined(__AVX__) || defined(__SSSE3__) || \
        defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || \
        (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
 #         define PNG_INTEL_SSE_OPT 1
@@ -246,7 +270,6 @@
 #endif
 
 #if PNG_MIPS_MSA_OPT > 0
-#  define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_msa
 #  ifndef PNG_MIPS_MSA_IMPLEMENTATION
 #     if defined(__mips_msa)
 #        if defined(__clang__)
@@ -262,14 +285,41 @@
 
 #  ifndef PNG_MIPS_MSA_IMPLEMENTATION
 #     define PNG_MIPS_MSA_IMPLEMENTATION 1
+#     define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_mips
 #  endif
+#else
+#  define PNG_MIPS_MSA_IMPLEMENTATION 0
 #endif /* PNG_MIPS_MSA_OPT > 0 */
 
+#if PNG_MIPS_MMI_OPT > 0
+#  ifndef PNG_MIPS_MMI_IMPLEMENTATION
+#     if defined(__mips_loongson_mmi) && (_MIPS_SIM == _ABI64)
+#        define PNG_MIPS_MMI_IMPLEMENTATION 2
+#     else /* !defined __mips_loongson_mmi  || _MIPS_SIM != _ABI64 */
+#        define PNG_MIPS_MMI_IMPLEMENTATION 0
+#     endif /* __mips_loongson_mmi  && _MIPS_SIM == _ABI64 */
+#  endif /* !PNG_MIPS_MMI_IMPLEMENTATION */
+
+#   if PNG_MIPS_MMI_IMPLEMENTATION > 0
+#      define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_mips
+#   endif
+#else
+#   define PNG_MIPS_MMI_IMPLEMENTATION 0
+#endif /* PNG_MIPS_MMI_OPT > 0 */
+
 #if PNG_POWERPC_VSX_OPT > 0
 #  define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_vsx
 #  define PNG_POWERPC_VSX_IMPLEMENTATION 1
+#else
+#  define PNG_POWERPC_VSX_IMPLEMENTATION 0
 #endif
 
+#if PNG_LOONGARCH_LSX_OPT > 0
+#   define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_lsx
+#   define PNG_LOONGARCH_LSX_IMPLEMENTATION 1
+#else
+#   define PNG_LOONGARCH_LSX_IMPLEMENTATION 0
+#endif
 
 /* Is this a build of a DLL where compilation of the object modules requires
  * different preprocessor settings to those required for a simple library?  If
@@ -492,16 +542,7 @@
    static_cast<type>(static_cast<const void*>(value))
 #else
 #  define png_voidcast(type, value) (value)
-#  ifdef _WIN64
-#     ifdef __GNUC__
-         typedef unsigned long long png_ptruint;
-#     else
-         typedef unsigned __int64 png_ptruint;
-#     endif
-#  else
-      typedef unsigned long png_ptruint;
-#  endif
-#  define png_constcast(type, value) ((type)(png_ptruint)(const void*)(value))
+#  define png_constcast(type, value) ((type)(void*)(const void*)(value))
 #  define png_aligncast(type, value) ((void*)(value))
 #  define png_aligncastconst(type, value) ((const void*)(value))
 #endif /* __cplusplus */
@@ -517,18 +558,8 @@
     */
 #  include <float.h>
 
-#  if (defined(__MWERKS__) && defined(macintosh)) || defined(applec) || \
-    defined(THINK_C) || defined(__SC__) || defined(TARGET_OS_MAC)
-   /* We need to check that <math.h> hasn't already been included earlier
-    * as it seems it doesn't agree with <fp.h>, yet we should really use
-    * <fp.h> if possible.
-    */
-#    if !defined(__MATH_H__) && !defined(__MATH_H) && !defined(__cmath__)
-#      include <fp.h>
-#    endif
-#  else
-#    include <math.h>
-#  endif
+#  include <math.h>
+
 #  if defined(_AMIGA) && defined(__SASC) && defined(_M68881)
    /* Amiga SAS/C: We must include builtin FPU functions when compiling using
     * MATH=68881
@@ -543,9 +574,8 @@
 #  include <alloc.h>
 #endif
 
-#if defined(WIN32) || defined(_Windows) || defined(_WINDOWS) || \
-    defined(_WIN32) || defined(__WIN32__)
-#  include <windows.h>  /* defines _WINDOWS_ macro */
+#if defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
+#  include <windows.h>
 #endif
 #endif /* PNG_VERSION_INFO_ONLY */
 
@@ -554,24 +584,20 @@
  * functions that are passed far data must be model-independent.
  */
 
-/* Memory model/platform independent fns */
+/* Platform-independent functions */
 #ifndef PNG_ABORT
-#  ifdef _WINDOWS_
-#    define PNG_ABORT() ExitProcess(0)
-#  else
-#    define PNG_ABORT() abort()
-#  endif
+#  define PNG_ABORT() abort()
 #endif
 
 /* These macros may need to be architecture dependent. */
-#define PNG_ALIGN_NONE   0 /* do not use data alignment */
-#define PNG_ALIGN_ALWAYS 1 /* assume unaligned accesses are OK */
+#define PNG_ALIGN_NONE      0 /* do not use data alignment */
+#define PNG_ALIGN_ALWAYS    1 /* assume unaligned accesses are OK */
 #ifdef offsetof
-#  define PNG_ALIGN_OFFSET 2 /* use offsetof to determine alignment */
+#  define PNG_ALIGN_OFFSET  2 /* use offsetof to determine alignment */
 #else
 #  define PNG_ALIGN_OFFSET -1 /* prevent the use of this */
 #endif
-#define PNG_ALIGN_SIZE   3 /* use sizeof to determine alignment */
+#define PNG_ALIGN_SIZE      3 /* use sizeof to determine alignment */
 
 #ifndef PNG_ALIGN_TYPE
    /* Default to using aligned access optimizations and requiring alignment to a
@@ -585,26 +611,25 @@
    /* This is used because in some compiler implementations non-aligned
     * structure members are supported, so the offsetof approach below fails.
     * Set PNG_ALIGN_SIZE=0 for compiler combinations where unaligned access
-    * is good for performance.  Do not do this unless you have tested the result
-    * and understand it.
+    * is good for performance.  Do not do this unless you have tested the
+    * result and understand it.
     */
-#  define png_alignof(type) (sizeof (type))
+#  define png_alignof(type) (sizeof(type))
 #else
 #  if PNG_ALIGN_TYPE == PNG_ALIGN_OFFSET
-#     define png_alignof(type) offsetof(struct{char c; type t;}, t)
+#    define png_alignof(type) offsetof(struct{char c; type t;}, t)
 #  else
-#     if PNG_ALIGN_TYPE == PNG_ALIGN_ALWAYS
-#        define png_alignof(type) (1)
-#     endif
-      /* Else leave png_alignof undefined to prevent use thereof */
+#    if PNG_ALIGN_TYPE == PNG_ALIGN_ALWAYS
+#      define png_alignof(type) 1
+#    endif
+     /* Else leave png_alignof undefined to prevent use thereof */
 #  endif
 #endif
 
-/* This implicitly assumes alignment is always to a power of 2. */
+/* This implicitly assumes alignment is always a multiple of 2. */
 #ifdef png_alignof
-#  define png_isaligned(ptr, type)\
-   (((type)((const char*)ptr-(const char*)0) & \
-   (type)(png_alignof(type)-1)) == 0)
+#  define png_isaligned(ptr, type) \
+   (((type)(size_t)((const void*)(ptr)) & (type)(png_alignof(type)-1)) == 0)
 #else
 #  define png_isaligned(ptr, type) 0
 #endif
@@ -635,7 +660,7 @@
 #define PNG_BACKGROUND_IS_GRAY     0x800U
 #define PNG_HAVE_PNG_SIGNATURE    0x1000U
 #define PNG_HAVE_CHUNK_AFTER_IDAT 0x2000U /* Have another chunk after IDAT */
-                   /*             0x4000U (unused) */
+#define PNG_WROTE_eXIf            0x4000U
 #define PNG_IS_READ_STRUCT        0x8000U /* Else is a write struct */
 
 /* Flags for the transformations the PNG library does on the image data */
@@ -1315,7 +1340,7 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_neon,(png_row_infop
     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
 #endif
 
-#if PNG_MIPS_MSA_OPT > 0
+#if PNG_MIPS_MSA_IMPLEMENTATION == 1
 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_up_msa,(png_row_infop row_info,
     png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_msa,(png_row_infop
@@ -1332,6 +1357,23 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_msa,(png_row_infop
     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
 #endif
 
+#if PNG_MIPS_MMI_IMPLEMENTATION > 0
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_up_mmi,(png_row_infop row_info,
+    png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_mmi,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub4_mmi,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_mmi,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_mmi,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_mmi,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_mmi,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+#endif
+
 #if PNG_POWERPC_VSX_OPT > 0
 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_up_vsx,(png_row_infop row_info,
     png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
@@ -1364,6 +1406,23 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_sse2,(png_row_infop
     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
 #endif
 
+#if PNG_LOONGARCH_LSX_IMPLEMENTATION == 1
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_up_lsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_lsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub4_lsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_lsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_lsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_lsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_lsx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+#endif
+
 /* Choose the best filter to use and filter the row data */
 PNG_INTERNAL_FUNCTION(void,png_write_find_filter,(png_structrp png_ptr,
     png_row_infop row_info),PNG_EMPTY);
@@ -1919,7 +1978,7 @@ PNG_INTERNAL_FUNCTION(void,png_ascii_from_fixed,(png_const_structrp png_ptr,
  */
 #define PNG_FP_INVALID  512  /* Available for callers as a distinct value */
 
-/* Result codes for the parser (boolean - true meants ok, false means
+/* Result codes for the parser (boolean - true means ok, false means
  * not ok yet.)
  */
 #define PNG_FP_MAYBE      0  /* The number may be valid in the future */
@@ -1955,7 +2014,7 @@ PNG_INTERNAL_FUNCTION(void,png_ascii_from_fixed,(png_const_structrp png_ptr,
  * the problem character.)  This has not been tested within libpng.
  */
 PNG_INTERNAL_FUNCTION(int,png_check_fp_number,(png_const_charp string,
-   size_t size, int *statep, png_size_tp whereami),PNG_EMPTY);
+   size_t size, int *statep, size_t *whereami),PNG_EMPTY);
 
 /* This is the same but it checks a complete string and returns true
  * only if it just contains a floating point number.  As of 1.5.4 this
@@ -2103,17 +2162,27 @@ PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_neon,
    (png_structp png_ptr, unsigned int bpp), PNG_EMPTY);
 #endif
 
-#if PNG_MIPS_MSA_OPT > 0
-PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_msa,
+#if PNG_MIPS_MSA_IMPLEMENTATION == 1
+PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_mips,
    (png_structp png_ptr, unsigned int bpp), PNG_EMPTY);
 #endif
 
+#  if PNG_MIPS_MMI_IMPLEMENTATION > 0
+PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_mips,
+   (png_structp png_ptr, unsigned int bpp), PNG_EMPTY);
+#  endif
+
 #  if PNG_INTEL_SSE_IMPLEMENTATION > 0
 PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_sse2,
    (png_structp png_ptr, unsigned int bpp), PNG_EMPTY);
 #  endif
 #endif
 
+#if PNG_LOONGARCH_LSX_OPT > 0
+PNG_INTERNAL_FUNCTION(void, png_init_filter_functions_lsx,
+    (png_structp png_ptr, unsigned int bpp), PNG_EMPTY);
+#endif
+
 PNG_INTERNAL_FUNCTION(png_uint_32, png_check_keyword, (png_structrp png_ptr,
    png_const_charp key, png_bytep new_key), PNG_EMPTY);
 
diff --git a/3rdparty/libpng/pngread.c b/3rdparty/libpng/pngread.c
index 8fa7d9f1628f..07a39df6e2e3 100644
--- a/3rdparty/libpng/pngread.c
+++ b/3rdparty/libpng/pngread.c
@@ -1,7 +1,7 @@
 
 /* pngread.c - read a PNG file
  *
- * Copyright (c) 2018-2019 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -568,7 +568,11 @@ png_read_row(png_structrp png_ptr, png_bytep row, png_bytep dsp_row)
 #endif
 
 #ifdef PNG_READ_TRANSFORMS_SUPPORTED
-   if (png_ptr->transformations)
+   if (png_ptr->transformations
+#     ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
+         || png_ptr->num_palette_max >= 0
+#     endif
+      )
       png_do_read_transformations(png_ptr, &row_info);
 #endif
 
@@ -785,7 +789,7 @@ png_read_end(png_structrp png_ptr, png_inforp info_ptr)
 #ifdef PNG_READ_CHECK_FOR_INVALID_INDEX_SUPPORTED
    /* Report invalid palette index; added at libng-1.5.10 */
    if (png_ptr->color_type == PNG_COLOR_TYPE_PALETTE &&
-       png_ptr->num_palette_max > png_ptr->num_palette)
+       png_ptr->num_palette_max >= png_ptr->num_palette)
       png_benign_error(png_ptr, "Read palette index exceeding num_palette");
 #endif
 
@@ -1049,6 +1053,8 @@ void PNGAPI
 png_read_png(png_structrp png_ptr, png_inforp info_ptr,
     int transforms, voidp params)
 {
+   png_debug(1, "in png_read_png");
+
    if (png_ptr == NULL || info_ptr == NULL)
       return;
 
@@ -3452,7 +3458,6 @@ png_image_read_background(png_voidp argument)
 
             for (pass = 0; pass < passes; ++pass)
             {
-               png_bytep row = png_voidcast(png_bytep, display->first_row);
                unsigned int     startx, stepx, stepy;
                png_uint_32      y;
 
@@ -3557,8 +3562,6 @@ png_image_read_background(png_voidp argument)
 
                         inrow += 2; /* gray and alpha channel */
                      }
-
-                     row += display->row_bytes;
                   }
                }
             }
@@ -3765,13 +3768,13 @@ png_image_read_direct(png_voidp argument)
          mode = PNG_ALPHA_PNG;
          output_gamma = PNG_DEFAULT_sRGB;
       }
-      
+
       if ((change & PNG_FORMAT_FLAG_ASSOCIATED_ALPHA) != 0)
       {
          mode = PNG_ALPHA_OPTIMIZED;
          change &= ~PNG_FORMAT_FLAG_ASSOCIATED_ALPHA;
       }
-      
+
       /* If 'do_local_background' is set check for the presence of gamma
        * correction; this is part of the work-round for the libpng bug
        * described above.
diff --git a/3rdparty/libpng/pngrtran.c b/3rdparty/libpng/pngrtran.c
index 9a8fad9f4aa7..1526123e025c 100644
--- a/3rdparty/libpng/pngrtran.c
+++ b/3rdparty/libpng/pngrtran.c
@@ -1,7 +1,7 @@
 
 /* pngrtran.c - transforms the data in a row for PNG readers
  *
- * Copyright (c) 2018-2019 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -21,7 +21,7 @@
 #ifdef PNG_ARM_NEON_IMPLEMENTATION
 #  if PNG_ARM_NEON_IMPLEMENTATION == 1
 #    define PNG_ARM_NEON_INTRINSICS_AVAILABLE
-#    if defined(_MSC_VER) && defined(_M_ARM64)
+#    if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64)
 #      include <arm64_neon.h>
 #    else
 #      include <arm_neon.h>
@@ -290,21 +290,20 @@ png_set_alpha_mode_fixed(png_structrp png_ptr, int mode,
    int compose = 0;
    png_fixed_point file_gamma;
 
-   png_debug(1, "in png_set_alpha_mode");
+   png_debug(1, "in png_set_alpha_mode_fixed");
 
    if (png_rtran_ok(png_ptr, 0) == 0)
       return;
 
    output_gamma = translate_gamma_flags(png_ptr, output_gamma, 1/*screen*/);
 
-   /* Validate the value to ensure it is in a reasonable range. The value
+   /* Validate the value to ensure it is in a reasonable range.  The value
     * is expected to be 1 or greater, but this range test allows for some
-    * viewing correction values.  The intent is to weed out users of this API
-    * who use the inverse of the gamma value accidentally!  Since some of these
-    * values are reasonable this may have to be changed:
+    * viewing correction values.  The intent is to weed out the API users
+    * who might use the inverse of the gamma value accidentally!
     *
-    * 1.6.x: changed from 0.07..3 to 0.01..100 (to accommodate the optimal 16-bit
-    * gamma of 36, and its reciprocal.)
+    * In libpng 1.6.0, we changed from 0.07..3 to 0.01..100, to accommodate
+    * the optimal 16-bit gamma of 36 and its reciprocal.
     */
    if (output_gamma < 1000 || output_gamma > 10000000)
       png_error(png_ptr, "output gamma out of expected range");
@@ -441,7 +440,7 @@ png_set_quantize(png_structrp png_ptr, png_colorp palette,
       int i;
 
       png_ptr->quantize_index = (png_bytep)png_malloc(png_ptr,
-          (png_alloc_size_t)((png_uint_32)num_palette * (sizeof (png_byte))));
+          (png_alloc_size_t)num_palette);
       for (i = 0; i < num_palette; i++)
          png_ptr->quantize_index[i] = (png_byte)i;
    }
@@ -458,7 +457,7 @@ png_set_quantize(png_structrp png_ptr, png_colorp palette,
 
          /* Initialize an array to sort colors */
          png_ptr->quantize_sort = (png_bytep)png_malloc(png_ptr,
-             (png_alloc_size_t)((png_uint_32)num_palette * (sizeof (png_byte))));
+             (png_alloc_size_t)num_palette);
 
          /* Initialize the quantize_sort array */
          for (i = 0; i < num_palette; i++)
@@ -592,11 +591,9 @@ png_set_quantize(png_structrp png_ptr, png_colorp palette,
 
          /* Initialize palette index arrays */
          png_ptr->index_to_palette = (png_bytep)png_malloc(png_ptr,
-             (png_alloc_size_t)((png_uint_32)num_palette *
-             (sizeof (png_byte))));
+             (png_alloc_size_t)num_palette);
          png_ptr->palette_to_index = (png_bytep)png_malloc(png_ptr,
-             (png_alloc_size_t)((png_uint_32)num_palette *
-             (sizeof (png_byte))));
+             (png_alloc_size_t)num_palette);
 
          /* Initialize the sort array */
          for (i = 0; i < num_palette; i++)
@@ -761,12 +758,11 @@ png_set_quantize(png_structrp png_ptr, png_colorp palette,
       size_t num_entries = ((size_t)1 << total_bits);
 
       png_ptr->palette_lookup = (png_bytep)png_calloc(png_ptr,
-          (png_alloc_size_t)(num_entries * (sizeof (png_byte))));
+          (png_alloc_size_t)(num_entries));
 
-      distance = (png_bytep)png_malloc(png_ptr, (png_alloc_size_t)(num_entries *
-          (sizeof (png_byte))));
+      distance = (png_bytep)png_malloc(png_ptr, (png_alloc_size_t)num_entries);
 
-      memset(distance, 0xff, num_entries * (sizeof (png_byte)));
+      memset(distance, 0xff, num_entries);
 
       for (i = 0; i < num_palette; i++)
       {
@@ -970,7 +966,7 @@ void PNGFAPI
 png_set_rgb_to_gray_fixed(png_structrp png_ptr, int error_action,
     png_fixed_point red, png_fixed_point green)
 {
-   png_debug(1, "in png_set_rgb_to_gray");
+   png_debug(1, "in png_set_rgb_to_gray_fixed");
 
    /* Need the IHDR here because of the check on color_type below. */
    /* TODO: fix this */
diff --git a/3rdparty/libpng/pngrutil.c b/3rdparty/libpng/pngrutil.c
index 4db3de990bdc..d31dc21dae89 100644
--- a/3rdparty/libpng/pngrutil.c
+++ b/3rdparty/libpng/pngrutil.c
@@ -1,7 +1,7 @@
 
 /* pngrutil.c - utilities to read a PNG file
  *
- * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -26,7 +26,7 @@ png_get_uint_31(png_const_structrp png_ptr, png_const_bytep buf)
    if (uval > PNG_UINT_31_MAX)
       png_error(png_ptr, "PNG unsigned integer out of range");
 
-   return (uval);
+   return uval;
 }
 
 #if defined(PNG_READ_gAMA_SUPPORTED) || defined(PNG_READ_cHRM_SUPPORTED)
@@ -140,7 +140,7 @@ png_read_sig(png_structrp png_ptr, png_inforp info_ptr)
    if (png_sig_cmp(info_ptr->signature, num_checked, num_to_check) != 0)
    {
       if (num_checked < 4 &&
-          png_sig_cmp(info_ptr->signature, num_checked, num_to_check - 4))
+          png_sig_cmp(info_ptr->signature, num_checked, num_to_check - 4) != 0)
          png_error(png_ptr, "Not a PNG file");
       else
          png_error(png_ptr, "PNG file corrupted by ASCII conversion");
@@ -171,7 +171,7 @@ png_read_chunk_header(png_structrp png_ptr)
    /* Put the chunk name into png_ptr->chunk_name. */
    png_ptr->chunk_name = PNG_CHUNK_FROM_STRING(buf+4);
 
-   png_debug2(0, "Reading %lx chunk, length = %lu",
+   png_debug2(0, "Reading chunk typeid = 0x%lx, length = %lu",
        (unsigned long)png_ptr->chunk_name, (unsigned long)length);
 
    /* Reset the crc and run it over the chunk name. */
@@ -238,10 +238,10 @@ png_crc_finish(png_structrp png_ptr, png_uint_32 skip)
       else
          png_chunk_error(png_ptr, "CRC error");
 
-      return (1);
+      return 1;
    }
 
-   return (0);
+   return 0;
 }
 
 /* Compare the CRC stored in the PNG file with that calculated by libpng from
@@ -277,11 +277,11 @@ png_crc_error(png_structrp png_ptr)
    if (need_crc != 0)
    {
       crc = png_get_uint_32(crc_bytes);
-      return ((int)(crc != png_ptr->crc));
+      return crc != png_ptr->crc;
    }
 
    else
-      return (0);
+      return 0;
 }
 
 #if defined(PNG_READ_iCCP_SUPPORTED) || defined(PNG_READ_iTXt_SUPPORTED) ||\
@@ -301,7 +301,6 @@ png_read_buffer(png_structrp png_ptr, png_alloc_size_t new_size, int warn)
 
    if (buffer != NULL && new_size > png_ptr->read_buffer_size)
    {
-      png_ptr->read_buffer = NULL;
       png_ptr->read_buffer = NULL;
       png_ptr->read_buffer_size = 0;
       png_free(png_ptr, buffer);
@@ -422,8 +421,7 @@ png_inflate_claim(png_structrp png_ptr, png_uint_32 owner)
             png_ptr->flags |= PNG_FLAG_ZSTREAM_INITIALIZED;
       }
 
-#if ZLIB_VERNUM >= 0x1290 && \
-   defined(PNG_SET_OPTION_SUPPORTED) && defined(PNG_IGNORE_ADLER32)
+#ifdef PNG_DISABLE_ADLER32_CHECK_SUPPORTED
       if (((png_ptr->options >> PNG_IGNORE_ADLER32) & 3) == PNG_OPTION_ON)
          /* Turn off validation of the ADLER32 checksum in IDAT chunks */
          ret = inflateValidate(&png_ptr->zstream, 0);
@@ -2076,14 +2074,17 @@ png_handle_eXIf(png_structrp png_ptr, png_inforp info_ptr, png_uint_32 length)
       png_byte buf[1];
       png_crc_read(png_ptr, buf, 1);
       info_ptr->eXIf_buf[i] = buf[0];
-      if (i == 1 && buf[0] != 'M' && buf[0] != 'I'
-                 && info_ptr->eXIf_buf[0] != buf[0])
+      if (i == 1)
       {
-         png_crc_finish(png_ptr, length);
-         png_chunk_benign_error(png_ptr, "incorrect byte-order specifier");
-         png_free(png_ptr, info_ptr->eXIf_buf);
-         info_ptr->eXIf_buf = NULL;
-         return;
+         if ((buf[0] != 'M' && buf[0] != 'I') ||
+             (info_ptr->eXIf_buf[0] != buf[0]))
+         {
+            png_crc_finish(png_ptr, length - 2);
+            png_chunk_benign_error(png_ptr, "incorrect byte-order specifier");
+            png_free(png_ptr, info_ptr->eXIf_buf);
+            info_ptr->eXIf_buf = NULL;
+            return;
+         }
       }
    }
 
@@ -2124,8 +2125,9 @@ png_handle_hIST(png_structrp png_ptr, png_inforp info_ptr, png_uint_32 length)
 
    num = length / 2 ;
 
-   if (num != (unsigned int) png_ptr->num_palette ||
-       num > (unsigned int) PNG_MAX_PALETTE_LENGTH)
+   if (length != num * 2 ||
+       num != (unsigned int)png_ptr->num_palette ||
+       num > (unsigned int)PNG_MAX_PALETTE_LENGTH)
    {
       png_crc_finish(png_ptr, length);
       png_chunk_benign_error(png_ptr, "invalid");
@@ -3183,7 +3185,7 @@ png_check_chunk_length(png_const_structrp png_ptr, png_uint_32 length)
    {
       png_debug2(0," length = %lu, limit = %lu",
          (unsigned long)length,(unsigned long)limit);
-      png_chunk_error(png_ptr, "chunk data is too large");
+      png_benign_error(png_ptr, "chunk data is too large");
    }
 }
 
@@ -4619,14 +4621,13 @@ defined(PNG_USER_TRANSFORM_PTR_SUPPORTED)
        */
       {
          png_bytep temp = png_ptr->big_row_buf + 32;
-         int extra = (int)((temp - (png_bytep)0) & 0x0f);
+         size_t extra = (size_t)temp & 0x0f;
          png_ptr->row_buf = temp - extra - 1/*filter byte*/;
 
          temp = png_ptr->big_prev_row + 32;
-         extra = (int)((temp - (png_bytep)0) & 0x0f);
+         extra = (size_t)temp & 0x0f;
          png_ptr->prev_row = temp - extra - 1/*filter byte*/;
       }
-
 #else
       /* Use 31 bytes of padding before and 17 bytes after row_buf. */
       png_ptr->row_buf = png_ptr->big_row_buf + 31;
diff --git a/3rdparty/libpng/pngset.c b/3rdparty/libpng/pngset.c
index ec75dbe36903..eb1c8c7a35af 100644
--- a/3rdparty/libpng/pngset.c
+++ b/3rdparty/libpng/pngset.c
@@ -1,7 +1,7 @@
 
 /* pngset.c - storage of image information into info struct
  *
- * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -137,46 +137,40 @@ png_set_cHRM_XYZ(png_const_structrp png_ptr, png_inforp info_ptr, double red_X,
 #ifdef PNG_eXIf_SUPPORTED
 void PNGAPI
 png_set_eXIf(png_const_structrp png_ptr, png_inforp info_ptr,
-    png_bytep eXIf_buf)
+    png_bytep exif)
 {
   png_warning(png_ptr, "png_set_eXIf does not work; use png_set_eXIf_1");
   PNG_UNUSED(info_ptr)
-  PNG_UNUSED(eXIf_buf)
+  PNG_UNUSED(exif)
 }
 
 void PNGAPI
 png_set_eXIf_1(png_const_structrp png_ptr, png_inforp info_ptr,
-    png_uint_32 num_exif, png_bytep eXIf_buf)
+    png_uint_32 num_exif, png_bytep exif)
 {
-   int i;
+   png_bytep new_exif;
 
    png_debug1(1, "in %s storage function", "eXIf");
 
-   if (png_ptr == NULL || info_ptr == NULL)
+   if (png_ptr == NULL || info_ptr == NULL ||
+       (png_ptr->mode & PNG_WROTE_eXIf) != 0)
       return;
 
-   if (info_ptr->exif)
-   {
-      png_free(png_ptr, info_ptr->exif);
-      info_ptr->exif = NULL;
-   }
-
-   info_ptr->num_exif = num_exif;
-
-   info_ptr->exif = png_voidcast(png_bytep, png_malloc_warn(png_ptr,
-       info_ptr->num_exif));
+   new_exif = png_voidcast(png_bytep, png_malloc_warn(png_ptr, num_exif));
 
-   if (info_ptr->exif == NULL)
+   if (new_exif == NULL)
    {
       png_warning(png_ptr, "Insufficient memory for eXIf chunk data");
       return;
    }
 
-   info_ptr->free_me |= PNG_FREE_EXIF;
+   memcpy(new_exif, exif, (size_t)num_exif);
 
-   for (i = 0; i < (int) info_ptr->num_exif; i++)
-      info_ptr->exif[i] = eXIf_buf[i];
+   png_free_data(png_ptr, info_ptr, PNG_FREE_EXIF, 0);
 
+   info_ptr->num_exif = num_exif;
+   info_ptr->exif = new_exif;
+   info_ptr->free_me |= PNG_FREE_EXIF;
    info_ptr->valid |= PNG_INFO_eXIf;
 }
 #endif /* eXIf */
@@ -237,15 +231,13 @@ png_set_hIST(png_const_structrp png_ptr, png_inforp info_ptr,
    if (info_ptr->hist == NULL)
    {
       png_warning(png_ptr, "Insufficient memory for hIST chunk data");
-
       return;
    }
 
-   info_ptr->free_me |= PNG_FREE_HIST;
-
    for (i = 0; i < info_ptr->num_palette; i++)
       info_ptr->hist[i] = hist[i];
 
+   info_ptr->free_me |= PNG_FREE_HIST;
    info_ptr->valid |= PNG_INFO_hIST;
 }
 #endif
@@ -367,6 +359,8 @@ png_set_pCAL(png_const_structrp png_ptr, png_inforp info_ptr,
 
    memcpy(info_ptr->pcal_purpose, purpose, length);
 
+   info_ptr->free_me |= PNG_FREE_PCAL;
+
    png_debug(3, "storing X0, X1, type, and nparams in info");
    info_ptr->pcal_X0 = X0;
    info_ptr->pcal_X1 = X1;
@@ -383,7 +377,6 @@ png_set_pCAL(png_const_structrp png_ptr, png_inforp info_ptr,
    if (info_ptr->pcal_units == NULL)
    {
       png_warning(png_ptr, "Insufficient memory for pCAL units");
-
       return;
    }
 
@@ -395,7 +388,6 @@ png_set_pCAL(png_const_structrp png_ptr, png_inforp info_ptr,
    if (info_ptr->pcal_params == NULL)
    {
       png_warning(png_ptr, "Insufficient memory for pCAL params");
-
       return;
    }
 
@@ -413,7 +405,6 @@ png_set_pCAL(png_const_structrp png_ptr, png_inforp info_ptr,
       if (info_ptr->pcal_params[i] == NULL)
       {
          png_warning(png_ptr, "Insufficient memory for pCAL parameter");
-
          return;
       }
 
@@ -421,7 +412,6 @@ png_set_pCAL(png_const_structrp png_ptr, png_inforp info_ptr,
    }
 
    info_ptr->valid |= PNG_INFO_pCAL;
-   info_ptr->free_me |= PNG_FREE_PCAL;
 }
 #endif
 
@@ -478,18 +468,17 @@ png_set_sCAL_s(png_const_structrp png_ptr, png_inforp info_ptr,
 
    if (info_ptr->scal_s_height == NULL)
    {
-      png_free (png_ptr, info_ptr->scal_s_width);
+      png_free(png_ptr, info_ptr->scal_s_width);
       info_ptr->scal_s_width = NULL;
 
       png_warning(png_ptr, "Memory allocation failed while processing sCAL");
-
       return;
    }
 
    memcpy(info_ptr->scal_s_height, sheight, lengthh);
 
-   info_ptr->valid |= PNG_INFO_sCAL;
    info_ptr->free_me |= PNG_FREE_SCAL;
+   info_ptr->valid |= PNG_INFO_sCAL;
 }
 
 #  ifdef PNG_FLOATING_POINT_SUPPORTED
@@ -625,11 +614,10 @@ png_set_PLTE(png_structrp png_ptr, png_inforp info_ptr,
    if (num_palette > 0)
       memcpy(png_ptr->palette, palette, (unsigned int)num_palette *
           (sizeof (png_color)));
+
    info_ptr->palette = png_ptr->palette;
    info_ptr->num_palette = png_ptr->num_palette = (png_uint_16)num_palette;
-
    info_ptr->free_me |= PNG_FREE_PLTE;
-
    info_ptr->valid |= PNG_INFO_PLTE;
 }
 
@@ -775,11 +763,11 @@ png_set_text_2(png_const_structrp png_ptr, png_inforp info_ptr,
 {
    int i;
 
-   png_debug1(1, "in %lx storage function", png_ptr == NULL ? 0xabadca11U :
-      (unsigned long)png_ptr->chunk_name);
+   png_debug1(1, "in text storage function, chunk typeid = 0x%lx",
+      png_ptr == NULL ? 0xabadca11UL : (unsigned long)png_ptr->chunk_name);
 
    if (png_ptr == NULL || info_ptr == NULL || num_text <= 0 || text_ptr == NULL)
-      return(0);
+      return 0;
 
    /* Make sure we have enough space in the "text" array in info_struct
     * to hold all of the incoming text_ptr objects.  This compare can't overflow
@@ -959,7 +947,7 @@ png_set_text_2(png_const_structrp png_ptr, png_inforp info_ptr,
       png_debug1(3, "transferred text chunk %d", info_ptr->num_text);
    }
 
-   return(0);
+   return 0;
 }
 #endif
 
@@ -1019,6 +1007,9 @@ png_set_tRNS(png_structrp png_ptr, png_inforp info_ptr,
           info_ptr->trans_alpha = png_voidcast(png_bytep,
               png_malloc(png_ptr, PNG_MAX_PALETTE_LENGTH));
           memcpy(info_ptr->trans_alpha, trans_alpha, (size_t)num_trans);
+
+          info_ptr->free_me |= PNG_FREE_TRNS;
+          info_ptr->valid |= PNG_INFO_tRNS;
        }
        png_ptr->trans_alpha = info_ptr->trans_alpha;
    }
@@ -1051,8 +1042,8 @@ png_set_tRNS(png_structrp png_ptr, png_inforp info_ptr,
 
    if (num_trans != 0)
    {
-      info_ptr->valid |= PNG_INFO_tRNS;
       info_ptr->free_me |= PNG_FREE_TRNS;
+      info_ptr->valid |= PNG_INFO_tRNS;
    }
 }
 #endif
@@ -1072,6 +1063,8 @@ png_set_sPLT(png_const_structrp png_ptr,
 {
    png_sPLT_tp np;
 
+   png_debug1(1, "in %s storage function", "sPLT");
+
    if (png_ptr == NULL || info_ptr == NULL || nentries <= 0 || entries == NULL)
       return;
 
@@ -1086,11 +1079,11 @@ png_set_sPLT(png_const_structrp png_ptr,
    {
       /* Out of memory or too many chunks */
       png_chunk_report(png_ptr, "too many sPLT chunks", PNG_CHUNK_WRITE_ERROR);
-
       return;
    }
 
    png_free(png_ptr, info_ptr->splt_palettes);
+
    info_ptr->splt_palettes = np;
    info_ptr->free_me |= PNG_FREE_SPLT;
 
@@ -1244,11 +1237,11 @@ png_set_unknown_chunks(png_const_structrp png_ptr,
    {
       png_chunk_report(png_ptr, "too many unknown chunks",
           PNG_CHUNK_WRITE_ERROR);
-
       return;
    }
 
    png_free(png_ptr, info_ptr->unknown_chunks);
+
    info_ptr->unknown_chunks = np; /* safe because it is initialized */
    info_ptr->free_me |= PNG_FREE_UNKN;
 
@@ -1326,7 +1319,7 @@ png_set_unknown_chunk_location(png_const_structrp png_ptr, png_inforp info_ptr,
 
 #ifdef PNG_MNG_FEATURES_SUPPORTED
 png_uint_32 PNGAPI
-png_permit_mng_features (png_structrp png_ptr, png_uint_32 mng_features)
+png_permit_mng_features(png_structrp png_ptr, png_uint_32 mng_features)
 {
    png_debug(1, "in png_permit_mng_features");
 
@@ -1546,7 +1539,7 @@ void PNGAPI
 png_set_rows(png_const_structrp png_ptr, png_inforp info_ptr,
     png_bytepp row_pointers)
 {
-   png_debug1(1, "in %s storage function", "rows");
+   png_debug(1, "in png_set_rows");
 
    if (png_ptr == NULL || info_ptr == NULL)
       return;
@@ -1565,6 +1558,8 @@ png_set_rows(png_const_structrp png_ptr, png_inforp info_ptr,
 void PNGAPI
 png_set_compression_buffer_size(png_structrp png_ptr, size_t size)
 {
+   png_debug(1, "in png_set_compression_buffer_size");
+
    if (png_ptr == NULL)
       return;
 
@@ -1633,9 +1628,11 @@ png_set_invalid(png_const_structrp png_ptr, png_inforp info_ptr, int mask)
 #ifdef PNG_SET_USER_LIMITS_SUPPORTED
 /* This function was added to libpng 1.2.6 */
 void PNGAPI
-png_set_user_limits (png_structrp png_ptr, png_uint_32 user_width_max,
+png_set_user_limits(png_structrp png_ptr, png_uint_32 user_width_max,
     png_uint_32 user_height_max)
 {
+   png_debug(1, "in png_set_user_limits");
+
    /* Images with dimensions larger than these limits will be
     * rejected by png_set_IHDR().  To accept any PNG datastream
     * regardless of dimensions, set both limits to 0x7fffffff.
@@ -1649,17 +1646,21 @@ png_set_user_limits (png_structrp png_ptr, png_uint_32 user_width_max,
 
 /* This function was added to libpng 1.4.0 */
 void PNGAPI
-png_set_chunk_cache_max (png_structrp png_ptr, png_uint_32 user_chunk_cache_max)
+png_set_chunk_cache_max(png_structrp png_ptr, png_uint_32 user_chunk_cache_max)
 {
+   png_debug(1, "in png_set_chunk_cache_max");
+
    if (png_ptr != NULL)
       png_ptr->user_chunk_cache_max = user_chunk_cache_max;
 }
 
 /* This function was added to libpng 1.4.1 */
 void PNGAPI
-png_set_chunk_malloc_max (png_structrp png_ptr,
+png_set_chunk_malloc_max(png_structrp png_ptr,
     png_alloc_size_t user_chunk_malloc_max)
 {
+   png_debug(1, "in png_set_chunk_malloc_max");
+
    if (png_ptr != NULL)
       png_ptr->user_chunk_malloc_max = user_chunk_malloc_max;
 }
diff --git a/3rdparty/libpng/pngstruct.h b/3rdparty/libpng/pngstruct.h
index 8bdc7ce46dbb..e591d94d5870 100644
--- a/3rdparty/libpng/pngstruct.h
+++ b/3rdparty/libpng/pngstruct.h
@@ -1,7 +1,7 @@
 
 /* pngstruct.h - header file for PNG reference library
  *
- * Copyright (c) 2018-2019 Cosmin Truta
+ * Copyright (c) 2018-2022 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -334,18 +334,8 @@ struct png_struct_def
    size_t current_buffer_size;       /* amount of data now in current_buffer */
    int process_mode;                 /* what push library is currently doing */
    int cur_palette;                  /* current push library palette index */
-
 #endif /* PROGRESSIVE_READ */
 
-#if defined(__TURBOC__) && !defined(_Windows) && !defined(__FLAT__)
-/* For the Borland special 64K segment handler */
-   png_bytepp offset_table_ptr;
-   png_bytep offset_table;
-   png_uint_16 offset_table_number;
-   png_uint_16 offset_table_count;
-   png_uint_16 offset_table_count_free;
-#endif
-
 #ifdef PNG_READ_QUANTIZE_SUPPORTED
    png_bytep palette_lookup; /* lookup table for quantizing */
    png_bytep quantize_index; /* index translation for palette files */
diff --git a/3rdparty/libpng/pngtrans.c b/3rdparty/libpng/pngtrans.c
index 1100f46ebec2..62cb21edf1f5 100644
--- a/3rdparty/libpng/pngtrans.c
+++ b/3rdparty/libpng/pngtrans.c
@@ -1,7 +1,7 @@
 
 /* pngtrans.c - transforms the data in a row (used by both readers and writers)
  *
- * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -103,10 +103,10 @@ png_set_interlace_handling(png_structrp png_ptr)
    if (png_ptr != 0 && png_ptr->interlaced != 0)
    {
       png_ptr->transformations |= PNG_INTERLACE;
-      return (7);
+      return 7;
    }
 
-   return (1);
+   return 1;
 }
 #endif
 
@@ -498,6 +498,8 @@ png_do_strip_channel(png_row_infop row_info, png_bytep row, int at_start)
    png_bytep dp = row; /* destination pointer */
    png_bytep ep = row + row_info->rowbytes; /* One beyond end of row */
 
+   png_debug(1, "in png_do_strip_channel");
+
    /* At the start sp will point to the first byte to copy and dp to where
     * it is copied to.  ep always points just beyond the end of the row, so
     * the loop simply copies (channels-1) channels until sp reaches ep.
@@ -698,6 +700,8 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
 void /* PRIVATE */
 png_do_check_palette_indexes(png_structrp png_ptr, png_row_infop row_info)
 {
+   png_debug(1, "in png_do_check_palette_indexes");
+
    if (png_ptr->num_palette < (1 << row_info->bit_depth) &&
       png_ptr->num_palette > 0) /* num_palette can be 0 in MNG files */
    {
@@ -708,7 +712,7 @@ png_do_check_palette_indexes(png_structrp png_ptr, png_row_infop row_info)
        * forms produced on either GCC or MSVC.
        */
       int padding = PNG_PADBITS(row_info->pixel_depth, row_info->width);
-      png_bytep rp = png_ptr->row_buf + row_info->rowbytes - 1;
+      png_bytep rp = png_ptr->row_buf + row_info->rowbytes;
 
       switch (row_info->bit_depth)
       {
@@ -833,7 +837,7 @@ png_voidp PNGAPI
 png_get_user_transform_ptr(png_const_structrp png_ptr)
 {
    if (png_ptr == NULL)
-      return (NULL);
+      return NULL;
 
    return png_ptr->user_transform_ptr;
 }
diff --git a/3rdparty/libpng/pngwrite.c b/3rdparty/libpng/pngwrite.c
index 59377a4ddea2..77e412f43d58 100644
--- a/3rdparty/libpng/pngwrite.c
+++ b/3rdparty/libpng/pngwrite.c
@@ -1,7 +1,7 @@
 
 /* pngwrite.c - general routines to write a PNG file
  *
- * Copyright (c) 2018-2019 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -75,10 +75,10 @@ write_unknown_chunks(png_structrp png_ptr, png_const_inforp info_ptr,
  * library.  If you have a new chunk to add, make a function to write it,
  * and put it in the correct location here.  If you want the chunk written
  * after the image data, put it in png_write_end().  I strongly encourage
- * you to supply a PNG_INFO_ flag, and check info_ptr->valid before writing
- * the chunk, as that will keep the code from breaking if you want to just
- * write a plain PNG file.  If you have long comments, I suggest writing
- * them in png_write_end(), and compressing them.
+ * you to supply a PNG_INFO_<chunk> flag, and check info_ptr->valid before
+ * writing the chunk, as that will keep the code from breaking if you want
+ * to just write a plain PNG file.  If you have long comments, I suggest
+ * writing them in png_write_end(), and compressing them.
  */
 void PNGAPI
 png_write_info_before_PLTE(png_structrp png_ptr, png_const_inforp info_ptr)
@@ -239,7 +239,10 @@ png_write_info(png_structrp png_ptr, png_const_inforp info_ptr)
 
 #ifdef PNG_WRITE_eXIf_SUPPORTED
    if ((info_ptr->valid & PNG_INFO_eXIf) != 0)
+   {
       png_write_eXIf(png_ptr, info_ptr->exif, info_ptr->num_exif);
+      png_ptr->mode |= PNG_WROTE_eXIf;
+   }
 #endif
 
 #ifdef PNG_WRITE_hIST_SUPPORTED
@@ -366,7 +369,8 @@ png_write_end(png_structrp png_ptr, png_inforp info_ptr)
       png_error(png_ptr, "No IDATs written into file");
 
 #ifdef PNG_WRITE_CHECK_FOR_INVALID_INDEX_SUPPORTED
-   if (png_ptr->num_palette_max > png_ptr->num_palette)
+   if (png_ptr->color_type == PNG_COLOR_TYPE_PALETTE &&
+       png_ptr->num_palette_max >= png_ptr->num_palette)
       png_benign_error(png_ptr, "Wrote palette index exceeding num_palette");
 #endif
 
@@ -439,8 +443,9 @@ png_write_end(png_structrp png_ptr, png_inforp info_ptr)
 #endif
 
 #ifdef PNG_WRITE_eXIf_SUPPORTED
-   if ((info_ptr->valid & PNG_INFO_eXIf) != 0)
-      png_write_eXIf(png_ptr, info_ptr->exif, info_ptr->num_exif);
+      if ((info_ptr->valid & PNG_INFO_eXIf) != 0 &&
+          (png_ptr->mode & PNG_WROTE_eXIf) == 0)
+         png_write_eXIf(png_ptr, info_ptr->exif, info_ptr->num_exif);
 #endif
 
 #ifdef PNG_WRITE_UNKNOWN_CHUNKS_SUPPORTED
@@ -489,6 +494,16 @@ png_convert_from_time_t(png_timep ptime, time_t ttime)
    png_debug(1, "in png_convert_from_time_t");
 
    tbuf = gmtime(&ttime);
+   if (tbuf == NULL)
+   {
+      /* TODO: add a safe function which takes a png_ptr argument and raises
+       * a png_error if the ttime argument is invalid and the call to gmtime
+       * fails as a consequence.
+       */
+      memset(ptime, 0, sizeof(*ptime));
+      return;
+   }
+
    png_convert_from_struct_tm(ptime, tbuf);
 }
 #endif
@@ -700,12 +715,12 @@ png_write_row(png_structrp png_ptr, png_const_bytep row)
    /* 1.5.6: moved from png_struct to be a local structure: */
    png_row_info row_info;
 
-   if (png_ptr == NULL)
-      return;
-
    png_debug2(1, "in png_write_row (row %u, pass %d)",
        png_ptr->row_number, png_ptr->pass);
 
+   if (png_ptr == NULL)
+      return;
+
    /* Initialize transformations and other stuff if first time */
    if (png_ptr->row_number == 0 && png_ptr->pass == 0)
    {
@@ -1196,6 +1211,8 @@ png_set_compression_strategy(png_structrp png_ptr, int strategy)
 void PNGAPI
 png_set_compression_window_bits(png_structrp png_ptr, int window_bits)
 {
+   png_debug(1, "in png_set_compression_window_bits");
+
    if (png_ptr == NULL)
       return;
 
@@ -1279,6 +1296,8 @@ png_set_text_compression_strategy(png_structrp png_ptr, int strategy)
 void PNGAPI
 png_set_text_compression_window_bits(png_structrp png_ptr, int window_bits)
 {
+   png_debug(1, "in png_set_text_compression_window_bits");
+
    if (png_ptr == NULL)
       return;
 
@@ -1316,6 +1335,8 @@ png_set_text_compression_method(png_structrp png_ptr, int method)
 void PNGAPI
 png_set_write_status_fn(png_structrp png_ptr, png_write_status_ptr write_row_fn)
 {
+   png_debug(1, "in png_set_write_status_fn");
+
    if (png_ptr == NULL)
       return;
 
@@ -1343,6 +1364,8 @@ void PNGAPI
 png_write_png(png_structrp png_ptr, png_inforp info_ptr,
     int transforms, voidp params)
 {
+   png_debug(1, "in png_write_png");
+
    if (png_ptr == NULL || info_ptr == NULL)
       return;
 
diff --git a/3rdparty/libpng/pngwutil.c b/3rdparty/libpng/pngwutil.c
index 16345e4c0baa..14cc4ce367fc 100644
--- a/3rdparty/libpng/pngwutil.c
+++ b/3rdparty/libpng/pngwutil.c
@@ -1,7 +1,7 @@
 
 /* pngwutil.c - utilities to write a PNG file
  *
- * Copyright (c) 2018 Cosmin Truta
+ * Copyright (c) 2018-2024 Cosmin Truta
  * Copyright (c) 1998-2002,2004,2006-2018 Glenn Randers-Pehrson
  * Copyright (c) 1996-1997 Andreas Dilger
  * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc.
@@ -1747,7 +1747,7 @@ png_write_pCAL(png_structrp png_ptr, png_charp purpose, png_int_32 X0,
 {
    png_uint_32 purpose_len;
    size_t units_len, total_len;
-   png_size_tp params_len;
+   size_t *params_len;
    png_byte buf[10];
    png_byte new_purpose[80];
    int i;
@@ -1769,7 +1769,7 @@ png_write_pCAL(png_structrp png_ptr, png_charp purpose, png_int_32 X0,
    png_debug1(3, "pCAL units length = %d", (int)units_len);
    total_len = purpose_len + units_len + 10;
 
-   params_len = (png_size_tp)png_malloc(png_ptr,
+   params_len = (size_t *)png_malloc(png_ptr,
        (png_alloc_size_t)((png_alloc_size_t)nparams * (sizeof (size_t))));
 
    /* Find the length of each parameter, making sure we don't count the
@@ -2311,7 +2311,7 @@ png_setup_sub_row(png_structrp png_ptr, png_uint_32 bpp,
         break;
    }
 
-   return (sum);
+   return sum;
 }
 
 static void /* PRIVATE */
@@ -2361,7 +2361,7 @@ png_setup_up_row(png_structrp png_ptr, size_t row_bytes, size_t lmins)
         break;
    }
 
-   return (sum);
+   return sum;
 }
 static void /* PRIVATE */
 png_setup_up_row_only(png_structrp png_ptr, size_t row_bytes)
@@ -2417,7 +2417,7 @@ png_setup_avg_row(png_structrp png_ptr, png_uint_32 bpp,
         break;
    }
 
-   return (sum);
+   return sum;
 }
 static void /* PRIVATE */
 png_setup_avg_row_only(png_structrp png_ptr, png_uint_32 bpp,
@@ -2500,7 +2500,7 @@ png_setup_paeth_row(png_structrp png_ptr, png_uint_32 bpp,
         break;
    }
 
-   return (sum);
+   return sum;
 }
 static void /* PRIVATE */
 png_setup_paeth_row_only(png_structrp png_ptr, png_uint_32 bpp,
diff --git a/3rdparty/libspng/CMakeLists.txt b/3rdparty/libspng/CMakeLists.txt
index afd6d5fe4069..ab0a41a5c619 100644
--- a/3rdparty/libspng/CMakeLists.txt
+++ b/3rdparty/libspng/CMakeLists.txt
@@ -23,7 +23,6 @@ if(MSVC)
 endif(MSVC)
 
 add_library(${SPNG_LIBRARY} STATIC ${OPENCV_3RDPARTY_EXCLUDE_FROM_ALL} ${spng_headers} ${spng_sources})
-ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-variable)
 target_link_libraries(${SPNG_LIBRARY} ${ZLIB_LIBRARIES})
 
 set_target_properties(${SPNG_LIBRARY}
diff --git a/3rdparty/libspng/LICENSE b/3rdparty/libspng/LICENSE
index f96574b80d2a..a29d5a2fa8d4 100644
--- a/3rdparty/libspng/LICENSE
+++ b/3rdparty/libspng/LICENSE
@@ -1,6 +1,6 @@
 BSD 2-Clause License
 
-Copyright (c) 2018-2022, Randy <randy408@protonmail.com>
+Copyright (c) 2018-2023, Randy <randy408@protonmail.com>
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/3rdparty/libspng/spng.c b/3rdparty/libspng/spng.c
index 6ed60f2d6ce6..b22b7110a17b 100644
--- a/3rdparty/libspng/spng.c
+++ b/3rdparty/libspng/spng.c
@@ -2691,6 +2691,7 @@ static int read_non_idat_chunks(spng_ctx *ctx)
             if(!memcmp(chunk.type, type_exif, 4))
             {
                 if(ctx->file.exif) return SPNG_EDUP_EXIF;
+                if(!chunk.length) return SPNG_EEXIF;
 
                 ctx->file.exif = 1;
 
@@ -4999,11 +5000,11 @@ void spng_ctx_free(spng_ctx *ctx)
     spng__free(ctx, ctx->prev_scanline_buf);
     spng__free(ctx, ctx->filtered_scanline_buf);
 
-    spng_free_fn *free_func = ctx->alloc.free_fn;
+    spng_free_fn *free_fn = ctx->alloc.free_fn;
 
     memset(ctx, 0, sizeof(spng_ctx));
 
-    free_func(ctx);
+    free_fn(ctx);
 }
 
 static int buffer_read_fn(spng_ctx *ctx, void *user, void *data, size_t n)
@@ -5743,7 +5744,8 @@ int spng_set_iccp(spng_ctx *ctx, struct spng_iccp *iccp)
     SPNG_SET_CHUNK_BOILERPLATE(iccp);
 
     if(check_png_keyword(iccp->profile_name)) return SPNG_EICCP_NAME;
-    if(!iccp->profile_len || iccp->profile_len > UINT_MAX) return 1;
+    if(!iccp->profile_len) return SPNG_ECHUNK_SIZE;
+    if(iccp->profile_len > spng_u32max) return SPNG_ECHUNK_STDLEN;
 
     if(ctx->iccp.profile && !ctx->user.iccp) spng__free(ctx, ctx->iccp.profile);
 
diff --git a/3rdparty/libspng/spng.h b/3rdparty/libspng/spng.h
index 5937d6c15de3..8f946337bfa2 100644
--- a/3rdparty/libspng/spng.h
+++ b/3rdparty/libspng/spng.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: (BSD-2-Clause AND libpng-2.0) */
+/* SPDX-License-Identifier: BSD-2-Clause */
 #ifndef SPNG_H
 #define SPNG_H
 
@@ -28,7 +28,7 @@ extern "C" {
 
 #define SPNG_VERSION_MAJOR 0
 #define SPNG_VERSION_MINOR 7
-#define SPNG_VERSION_PATCH 3
+#define SPNG_VERSION_PATCH 4
 
 enum spng_errno
 {
diff --git a/3rdparty/libtengine/tengine.cmake b/3rdparty/libtengine/tengine.cmake
deleted file mode 100644
index ee8f0cb86f80..000000000000
--- a/3rdparty/libtengine/tengine.cmake
+++ /dev/null
@@ -1,80 +0,0 @@
-# COPYRIGHT
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# License); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# Copyright (c) 2020, OPEN AI LAB
-# Author: qtang@openailab.com or https://github.com/BUG1989
-#         qli@openailab.com
-#         sqfu@openailab.com
-
-SET(TENGINE_COMMIT_VERSION "e89cf8870de2ff0a80cfe626c0b52b2a16fb302e")
-SET(OCV_TENGINE_DIR "${OpenCV_BINARY_DIR}/3rdparty/libtengine")
-SET(OCV_TENGINE_SOURCE_PATH "${OCV_TENGINE_DIR}/Tengine-${TENGINE_COMMIT_VERSION}")
-
-IF(EXISTS "${OCV_TENGINE_SOURCE_PATH}")
-	MESSAGE(STATUS "Tengine is exist already at: ${OCV_TENGINE_SOURCE_PATH}")
-
-	SET(Tengine_FOUND ON)
-	SET(BUILD_TENGINE ON)
-ELSE()
-	SET(OCV_TENGINE_FILENAME "${TENGINE_COMMIT_VERSION}.zip")#name
-	SET(OCV_TENGINE_URL "https://github.com/OAID/Tengine/archive/") #url
-	SET(tengine_md5sum 23f61ebb1dd419f1207d8876496289c5) #md5sum
-
-	ocv_download(FILENAME ${OCV_TENGINE_FILENAME}
-						HASH ${tengine_md5sum}
-						URL
-						"${OPENCV_TENGINE_URL}"
-						"$ENV{OPENCV_TENGINE_URL}"
-						"${OCV_TENGINE_URL}"
-						DESTINATION_DIR "${OCV_TENGINE_DIR}"
-						ID TENGINE
-						STATUS res
-						UNPACK RELATIVE_URL)
-
-	if (NOT res)
-		MESSAGE(STATUS "TENGINE DOWNLOAD FAILED. Turning Tengine_FOUND off.")
-		SET(Tengine_FOUND OFF)
-	else ()
-		MESSAGE(STATUS "TENGINE DOWNLOAD success . ")
-
-		SET(Tengine_FOUND ON)
-		SET(BUILD_TENGINE ON)
-	endif()
-ENDIF()
-
-if(BUILD_TENGINE)
-	SET(HAVE_TENGINE 1)
-
-	if(NOT ANDROID)
-		# linux system
-		if(CMAKE_SYSTEM_PROCESSOR STREQUAL arm)
-			   SET(TENGINE_TOOLCHAIN_FLAG "-march=armv7-a")
-		elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) ## AARCH64
-			   SET(TENGINE_TOOLCHAIN_FLAG "-march=armv8-a")
-		endif()
-	endif()
-
-	SET(BUILT_IN_OPENCV ON) ## set for tengine compile discern .
-	SET(Tengine_INCLUDE_DIR  "${OCV_TENGINE_SOURCE_PATH}/include" CACHE INTERNAL "")
-	if(EXISTS "${OCV_TENGINE_SOURCE_PATH}/CMakeLists.txt")
-		add_subdirectory("${OCV_TENGINE_SOURCE_PATH}" "${OCV_TENGINE_DIR}/build")
-	else()
-		message(WARNING "TENGINE: Missing 'CMakeLists.txt' in source code package: ${OCV_TENGINE_SOURCE_PATH}")
-	endif()
-	SET(Tengine_LIB "tengine" CACHE INTERNAL "")
-endif()
diff --git a/3rdparty/libtiff/CMakeLists.txt b/3rdparty/libtiff/CMakeLists.txt
index 826c5e2316c1..9173e207f787 100644
--- a/3rdparty/libtiff/CMakeLists.txt
+++ b/3rdparty/libtiff/CMakeLists.txt
@@ -124,17 +124,17 @@ elseif(SIZEOF_UNSIGNED_LONG_LONG EQUAL 8)
 endif()
 
 if(SIZEOF_UNSIGNED_INT EQUAL SIZEOF_SIZE_T)
-  set(TIFF_SIZE_T "unsigned int")
+  set(TIFF_SIZE_T "uint32_t")
   set(TIFF_SIZE_FORMAT "%u")
-  set(TIFF_SSIZE_T "signed int")
+  set(TIFF_SSIZE_T "int32_t")
   set(TIFF_SSIZE_FORMAT "%d")
 elseif(SIZEOF_UNSIGNED_LONG EQUAL SIZEOF_SIZE_T)
-  set(TIFF_SIZE_T "unsigned long")
+  set(TIFF_SIZE_T "uint64_t")
   set(TIFF_SIZE_FORMAT "%lu")
-  set(TIFF_SSIZE_T "signed long")
+  set(TIFF_SSIZE_T "int64_t")
   set(TIFF_SSIZE_FORMAT "%ld")
 elseif(SIZEOF_UNSIGNED_LONG_LONG EQUAL SIZEOF_SIZE_T)
-  set(TIFF_SIZE_T "unsigned long")
+  set(TIFF_SIZE_T "uint64_t")
   if(MINGW)
     set(TIFF_SIZE_FORMAT "%I64u")
     set(TIFF_SSIZE_FORMAT "%I64d")
@@ -198,20 +198,6 @@ check_function_exists(strtol     HAVE_STRTOUL)
 check_function_exists(strtoull   HAVE_STRTOULL)
 check_function_exists(lfind      HAVE_LFIND)
 
-# May be inlined, so check it compiles:
-check_c_source_compiles("
-#include <stdio.h>
-int main(void) {
-  char buf[10];
-  snprintf(buf, 10, \"Test %d\", 1);
-  return 0;
-}"
-  HAVE_SNPRINTF)
-
-if(NOT HAVE_SNPRINTF)
-  add_definitions(-DNEED_LIBPORT)
-endif()
-
 # CPU bit order
 set(fillorder FILLORDER_MSB2LSB)
 if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "i.*86.*" OR
@@ -371,7 +357,13 @@ if(LIBLZMA_LIBRARIES)
   list(APPEND TIFF_LIBRARY_DEPS ${LIBLZMA_LIBRARIES})
 endif()
 
+set(LIBTIFF_MAJOR_VERSION "4")
+set(LIBTIFF_MINOR_VERSION "6")
+set(LIBTIFF_MICRO_VERSION "0")
+set(LIBTIFF_VERSION "${LIBTIFF_MAJOR_VERSION}.${LIBTIFF_MINOR_VERSION}.${LIBTIFF_MICRO_VERSION}")
+file(READ "RELEASE-DATE" LIBTIFF_RELEASE_DATE content)
 
+set(TIFF_MAX_DIR_COUNT "1048576")
 
 configure_file("${CMAKE_CURRENT_SOURCE_DIR}/tif_config.h.cmake.in"
                "${CMAKE_CURRENT_BINARY_DIR}/tif_config.h"
@@ -379,6 +371,9 @@ configure_file("${CMAKE_CURRENT_SOURCE_DIR}/tif_config.h.cmake.in"
 configure_file("${CMAKE_CURRENT_SOURCE_DIR}/tiffconf.h.cmake.in"
                "${CMAKE_CURRENT_BINARY_DIR}/tiffconf.h"
                @ONLY)
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/tiffvers.h.cmake.in"
+               "${CMAKE_CURRENT_BINARY_DIR}/tiffvers.h"
+               @ONLY)
 
 ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}" ${ZLIB_INCLUDE_DIRS})
 
@@ -399,6 +394,7 @@ set(lib_srcs
     tif_fax3sm.c
     tif_flush.c
     tif_getimage.c
+    tif_hash_set.c
     tif_jbig.c
     tif_jpeg_12.c
     tif_jpeg.c
@@ -427,21 +423,18 @@ set(lib_srcs
     t4.h
     tif_dir.h
     tif_fax3.h
+    tif_hash_set.h
+    tif_predict.h
     tiff.h
     tiffio.h
     tiffiop.h
-    tiffvers.h
-    tif_predict.h
+    "${CMAKE_CURRENT_BINARY_DIR}/tiffvers.h"
     uvcode.h
     tiffio.hxx
     "${CMAKE_CURRENT_BINARY_DIR}/tif_config.h"
     "${CMAKE_CURRENT_BINARY_DIR}/tiffconf.h"
     )
 
-if(WIN32 AND NOT HAVE_SNPRINTF)
-  list(APPEND lib_srcs snprintf.c libport.h)
-endif()
-
 if(WIN32 AND NOT WINRT)
   list(APPEND lib_srcs tif_win32.c)
 else()
diff --git a/3rdparty/libtiff/ChangeLog b/3rdparty/libtiff/ChangeLog
index 452dcb3a1863..87b5f126bb44 100644
--- a/3rdparty/libtiff/ChangeLog
+++ b/3rdparty/libtiff/ChangeLog
@@ -1,3 +1,4732 @@
+2023-09-05  Even Rouault  <even.rouault@spatialys.com>
+
+	libtiff v4.6.0 released
+
+2023-09-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_600' into 'master'
+	CMake: fix build with -Dstrip-chopping=off (fixes #600)
+
+	See merge request libtiff/libtiff!527
+
+2023-09-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'georgthegreat-master-patch-87447' into 'master'
+	Fix using __attribute__ libtiff with clang-for-windows
+
+	See merge request libtiff/libtiff!525
+
+2023-09-05  Yuriy Chernyshov  <georgthegreat@gmail.com>
+
+	Fix using __attribute__ libtiff with clang-for-windows.
+
+2023-09-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'manpage_TIFFField_docu_update' into 'master'
+	manpages: TiffField functions documentation updated with return behaviour for...
+
+	See merge request libtiff/libtiff!526
+
+2023-09-05  Even Rouault  <even.rouault@spatialys.com>
+
+	CMake: fix build with -Dstrip-chopping=off (fixes #600)
+
+2023-09-03  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'consistently_update_TIFF-version_from_configure-ac' into 'master'
+	Update CMake and autoconf scripts to consistently update LibTIFF version...
+
+	See merge request libtiff/libtiff!456
+
+2023-09-03  Su Laus  <sulau@freenet.de>
+
+	Update CMake and autoconf scripts to consistently update LibTIFF version defines and references in various files when version definition in configure.ac has been changed.
+	- Move in tiffvers.h from .\libtiff source directory to .\libtiff  build directory.
+	- Remove unused version information from tif_config.h
+	- With every CMake build the version defines (e.g. 4.5.1) within tiffvers.h are consistently updated from configure.ac. The version release-date is taken from file RELEASE-DATE.
+	- The files VERSION and RELEASE-DATE are only updated with a special CMake target build: cmake --build . --target tiff_release.
+
+	- For autotools, version information is updated from configure.ac with ./autogen.sh. LIBTIFF_RELEASE_DATE is taken form file RELEASE-DATE.
+	- ./configure generates tiffvers.h with the cached version information and LIBTIFF_RELEASE_DATE.
+	- "make release" updates tiffvers.h and VERSION file with cached version info and RELEASE-DATE file and tiffves.h with the current date.
+
+2023-08-28  Su_Laus  <sulau@freenet.de>
+
+	manpages: TiffField functions documentation updated with return behaviour for not defined tags and determination of write-/read-count size.
+
+2023-08-22  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'change_long_to_int32_t_in_two_test_apps' into 'master'
+	Change "long" to "int32_t" in two test apps, because can be either int32_t or...
+
+	See merge request libtiff/libtiff!524
+
+2023-08-21  Su_Laus  <sulau@freenet.de>
+
+	Change "long" to "int32_t" in two test apps, because can be either int32_t or int64_t, depending on compiler and system.
+
+2023-08-16  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'CI_CMake_static_build' into 'master'
+	Add static build for CI/CD to run testcases which need private interface functions.
+
+	See merge request libtiff/libtiff!521
+
+2023-08-16  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_#597_tiffset_different_signedness' into 'master'
+	tiffset fix #597: warning: comparison of integer expressions of different signedness.
+
+	Closes #597
+
+	See merge request libtiff/libtiff!523
+
+2023-08-16  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcp_remove_i_option' into 'master'
+	tiffcp: remove -i option (ignore errors)
+
+	See merge request libtiff/libtiff!522
+
+2023-08-16  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'move_tools_to_unsupported_a_archive' into 'master'
+	Move most TIFF tools to archive and keep some as unsupported (see #580).
+
+	See merge request libtiff/libtiff!520
+
+2023-08-16  Su Laus  <sulau@freenet.de>
+
+	Move most TIFF tools to archive and keep some as unsupported (see #580).
+
+2023-08-12  Su_Laus  <sulau@freenet.de>
+
+	Add static build for CI/CD to run testcases which need private interface functions.
+
+	tiffset fix #597: warning: comparison of integer expressions of different signedness.
+
+	Remove -i option (ignore errors) from tiffcp, because almost all fuzzer issues were consequential errors from ignored errors because of the "-i" option.
+
+2023-08-11  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_585_test_write_read_tags_autoconf' into 'master'
+	Add missing test_write_read_tags.c and test_transferfunction_write_read.c in...
+
+	Closes #585
+
+	See merge request libtiff/libtiff!519
+
+2023-07-24  Even Rouault  <even.rouault@spatialys.com>
+
+	Fix copy paste error.
+
+2023-07-23  Su_Laus  <sulau@freenet.de>
+
+	Add missing test_write_read_tags.c and test_transferfunction_write_read.c in tarball (fixes #585) and correct „long“ issue.
+	Don't use "long" because can be int32_t or int64_t, depending on compiler and system.
+
+2023-07-20  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'clang-format-tools' into 'master'
+	Automatically format with clang-format
+
+	See merge request libtiff/libtiff!518
+
+2023-07-20  Timothy Lyanguzov  <timothy.lyanguzov@sap.com>
+
+	Automatically format with clang-format.
+
+2023-07-20  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_589' into 'master'
+	TiffConfig.cmake.in: set TIFF_INCLUDE_DIR, TIFF_INCLUDE_DIRS and...
+
+	Closes #589
+
+	See merge request libtiff/libtiff!514
+
+2023-07-20  Even Rouault  <even.rouault@spatialys.com>
+
+	TiffConfig.cmake.in: set TIFF_INCLUDE_DIR, TIFF_INCLUDE_DIRS and...
+
+2023-07-19  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'master-patch-6fc6' into 'master'
+	raw2tiff: fix integer overflow and bypass of the check (fixes #592)
+
+	See merge request libtiff/libtiff!516
+
+2023-07-19  Arie Haenel  <arie.haenel@jct.ac.il>
+
+	raw2tiff: fix integer overflow and bypass of the check (fixes #592)
+
+2023-07-19  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'master-patch-05a4' into 'master'
+	tiffcp: fix memory corruption (overflow) on hostile images (fixes #591)
+
+	See merge request libtiff/libtiff!515
+
+2023-07-19  Arie Haenel  <arie.haenel@jct.ac.il>
+
+	tiffcp: fix memory corruption (overflow) on hostile images (fixes #591)
+
+2023-07-17  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix-numtrunc' into 'master'
+	fix numtrunc at tiff_dirread.c
+
+	See merge request libtiff/libtiff!512
+
+2023-07-17  headshog  <craaaaaachind@gmail.com>
+
+	TIFFReadDirectoryCheckOrder: avoid integer overflow.
+	When it occurs, it should be harmless in practice though
+
+2023-07-17  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'webp_lossless_exact' into 'master'
+	WebP codec: turn exact mode when creating lossless files to avoid altering...
+
+	See merge request libtiff/libtiff!511
+
+2023-07-11  Even Rouault  <even.rouault@spatialys.com>
+
+	WebP codec: turn exact mode when creating lossless files to avoid altering R,G,B values in areas where alpha=0
+	Fixes https://github.com/OSGeo/gdal/issues/8038
+
+2023-07-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'webp_reg_fix' into 'master'
+	WebP decoder: fix error when reading a 3-band blob in a RGBA image
+
+	See merge request libtiff/libtiff!510
+
+2023-07-05  Even Rouault  <even.rouault@spatialys.com>
+
+	WebP decoder: fix error when reading a 3-band blob in a RGBA image.
+	Fixes regression of 350ff161c8a61b6483a1e4689e09cd47dd0dd5f9 (master only)
+
+2023-06-26  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'appveyor_fix' into 'master'
+	.appveyor.yml: workaround build error
+
+	See merge request libtiff/libtiff!509
+
+2023-06-26  Even Rouault  <even.rouault@spatialys.com>
+
+	.appveyor.yml: workaround build error.
+
+2023-06-26  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tif_webp_warning_fixes' into 'master'
+	tif_webp.c: fix signed vs unsigned comparison warnings (fix previous commit)
+
+	See merge request libtiff/libtiff!508
+
+2023-06-26  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_webp.c: fix signed vs unsigned comparison warnings (fix previous commit)
+
+2023-06-26  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_TransferFunction_writing' into 'master'
+	Fix TransferFunction writing of only two transfer functions.
+
+	See merge request libtiff/libtiff!502
+
+2023-06-26  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_581_582' into 'master'
+	WebP decoder: validate WebP blob width, height, band count against TIFF parameters
+
+	Closes #582 et #581
+
+	See merge request libtiff/libtiff!507
+
+2023-06-19  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'warning_cmake_config_file' into 'master'
+	v4.5.1 release note: add warning about CMake config file being preview
+
+	See merge request libtiff/libtiff!506
+
+2023-06-17  Even Rouault  <even.rouault@spatialys.com>
+
+	WebP decoder: validate WebP blob width, height, band count against TIFF parameters
+	to avoid use of uninitialized variable, or decoding corrupted content
+	without explicit error
+
+	Fixes #581, fixes #582
+
+2023-06-15  Even Rouault  <even.rouault@spatialys.com>
+
+	v4.5.1 release note: add warning about CMake config file being preview.
+
+2023-06-14  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'HOWTO-RELEASE-gitlab-release' into 'master'
+	HOWTO-RELEASE: mention creating a gitlab release
+
+	See merge request libtiff/libtiff!505
+
+2023-06-14  Even Rouault  <even.rouault@spatialys.com>
+
+	HOWTO-RELEASE: mention creating a gitlab release.
+
+2023-06-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'TIFFOpenWExt_O_RDWR' into 'master'
+	TIFFOpenWExt(): mode r+ in the Windows implementation adjusted to that of Linux
+
+	See merge request libtiff/libtiff!504
+
+2023-06-10  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFOpenWExt(): mode r+ in the Windows implementation adjusted to that of Linux
+
+2023-06-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_ossfuzz_59751' into 'master'
+	TIFFReadDirectory(): fix crash when reading tag TIFFTAG_EP_BATTERYLEVEL
+
+	See merge request libtiff/libtiff!503
+
+2023-06-10  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFReadDirectory(): fix crash when reading tag TIFFTAG_EP_BATTERYLEVEL.
+	Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=59751
+
+	In 738e0409 (refs #575), we disabled DNG / EP tags, but there was a
+	special proessing for TIFFTAG_EP_BATTERYLEVEL that must be disabled
+	since the tag is no longer defined.
+
+2023-06-09  Su_Laus  <sulau@freenet.de>
+
+	Fix TransferFunction writing of only two transfer functions. The TIFFWriteDirectoryTagTransferfunction() function writes in some cases only two transfer functions, although only exactly one or exactly three transfer functions are allowed. This then leads to an error when reading. --> TIFFReadDirectory: Warning, Incorrect count for "TransferFunction"; tag ignored.
+	This MR corrects the behaviour of TIFFWriteDirectoryTagTransferfunction() accordingly. Furthermore, a possible buffer overflow is avoided.
+
+2023-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_win_build' into 'master'
+	Fix Windows build
+
+	Closes #578
+
+	See merge request libtiff/libtiff!501
+
+2023-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	cmake/PkgConfig.cmake: avoid CMake error when prefix or suffix is empty.
+
+	Add tif_win32_versioninfo.rc and tif_tools_versioninfo.rc to EXTRA_DIST.
+
+2023-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_coverity_build' into 'master'
+	build/gitlab-ci: fix coverity_build()
+
+	See merge request libtiff/libtiff!499
+
+2023-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	build/gitlab-ci: fix coverity_build()
+
+2023-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'v4.5.1rc1_preparation' into 'master'
+	Prepare release 4.5.1
+
+	See merge request libtiff/libtiff!498
+
+2023-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Prepare for v4.5.1 release.
+
+	Merge remote-tracking branch 'sulaus/Rel_4.5.1_preparation'
+
+2023-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'disable_dng_tags' into 'master'
+	tif_dirinfo.c: disable DNG 1.2->1.6 tags
+
+	Closes #575
+
+	See merge request libtiff/libtiff!497
+
+2023-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_577' into 'master'
+	CMake related fixes
+
+	Closes #577
+
+	See merge request libtiff/libtiff!496
+
+2023-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	libtiff v4.5.1 released
+
+2023-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge remote-tracking branch 'sulaus/Rel_4.5.1_preparation'
+
+2023-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'disable_dng_tags' into 'master'
+	tif_dirinfo.c: disable DNG 1.2->1.6 tags
+
+	Closes #575
+
+	See merge request libtiff/libtiff!497
+
+2023-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_577' into 'master'
+	CMake related fixes
+
+	Closes #577
+
+	See merge request libtiff/libtiff!496
+
+2023-06-07  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_dirinfo.c: disable DNG 1.2->1.6 tags.
+	They were added per b90b20d36d7833f54a1f3014c324f6c21b988006 but it has
+	been found in https://gitlab.com/libtiff/libtiff/-/issues/575 that it
+	could cause compatibity issues with libtiff users, so this addition
+	should be defered for a feature release (likely 4.6.0) and not a patch one.
+
+	Fixes #575
+
+2023-06-06  Timothy Lyanguzov  <theta682@gmail.com>
+
+	Apply 1 suggestion(s) to 1 file(s)
+
+2023-06-05  Even Rouault  <even.rouault@spatialys.com>
+
+	CI: add testing of find_package(Tiff CONFIG)
+
+	CMake: export TiffConfig.cmake and TiffConfigVersion.cmake files.
+
+	libtiff/CMakeLists.txt: fix export of INTERFACE_INCLUDE_DIRECTORIES.
+
+	libtiff/CMakeLists.txt: correctly define TIFF::tiff alias (fixes #577)
+
+2023-06-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'TIFFField_downgrade_errors_to_warnings' into 'master'
+	TIFFFieldWithName() and TIFFFieldWithTag() downgrade errors to warnings.
+
+	See merge request libtiff/libtiff!495
+
+2023-05-28  Su_Laus  <sulau@freenet.de>
+
+	TIFFFieldWithName() and TIFFFieldWithTag() downgrade errors to warnings. see https://gitlab.com/libtiff/libtiff/-/issues/575#note_1407633888
+
+2023-05-25  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'bebuch-master-patch-58347' into 'master'
+	check if upstream lzma (xz) config was used and bind to it if so
+
+	See merge request libtiff/libtiff!494
+
+2023-05-25  Benjamin Buch  <benni.buch@gmail.com>
+
+	check if upstream lzma (xz) config was used and bind to it if so.
+
+2023-05-25  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'upstream-zstd-config' into 'master'
+	check if upstream zstd config was used and bind to it if so
+
+	See merge request libtiff/libtiff!493
+
+2023-05-25  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'cmake_FindDeflate_bugs_see_526' into 'master'
+	CMake: FindDeflate several errors fixed (see #526)
+
+	See merge request libtiff/libtiff!491
+
+2023-05-24  Su_Laus  <sulau@freenet.de>
+
+	CMake: FindDeflate several errors (see #526)
+	There are CMake issues if the library is not included in the environment path and only set with CMake -D option.
+	- For FindDeflate.cmake, FindJBIG.cmake, FindLERC.cmake, FindWebP.cmake, FindZSTD.cmake:
+	  Set IMPORTED_LOCATION (without debug or release) if neither <library>_LIBRARY_RELEASE nor <library>_LIBRARY_DEBUG were set.
+	- FindDeflate.cmake: Correct code to retrieve library version information from libdeflate.h
+	- FindLERC.cmake version string return added.
+
+2023-05-24  Benjamin Buch  <benni.buch@gmail.com>
+
+	prefer shared over static.
+
+	check if upstream zstd config was used and bind to it if so.
+
+2023-05-24  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'master' into 'master'
+	make WebP component name compatible with upstream ConfigWebP.cmake
+
+	See merge request libtiff/libtiff!492
+
+2023-05-24  Benjamin Buch  <benni.buch@gmail.com>
+
+	CMake: make WebP component name compatible with upstream ConfigWebP.cmake
+
+2023-05-18  Su_Laus  <sulau@freenet.de>
+
+	Prepare release 4.5.1 - Update till 18.05.23 after fix_559_DNG_1.6_passcount_error
+
+	Prepare release 4.5.1.
+
+2023-05-18  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_559_DNG_1.6_passcount_error' into 'master'
+	Fix #559 DNG 1.6 passcount assertion
+
+	Closes #574 et #559
+
+	See merge request libtiff/libtiff!489
+
+2023-05-18  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_557_TagExtender_for_CustomDirectories_not_possible' into 'master'
+	manpage: TIFFSetTagExtender() cannot add tags to custom directories. Closes #557.
+
+	Closes #557
+
+	See merge request libtiff/libtiff!490
+
+2023-05-18  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_559_DNG_1.6_passcount_error' into 'master'
+	Fix #559 DNG 1.6 passcount assertion
+
+	Closes #574 et #559
+
+	See merge request libtiff/libtiff!489
+
+2023-05-18  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_557_TagExtender_for_CustomDirectories_not_possible' into 'master'
+	manpage: TIFFSetTagExtender() cannot add tags to custom directories. Closes #557.
+
+	Closes #557
+
+	See merge request libtiff/libtiff!490
+
+2023-05-17  Su_Laus  <sulau@freenet.de>
+
+	Documentation update: TIFFSetTagExtender() cannot add tags to custom directories.
+	Closes #557.
+
+2023-05-16  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_558' into 'master'
+	Hardcode HOST_FILLORDER to FILLORDER_LSB2MSB, and make 'H' flag of TIFFOpen()...
+
+	See merge request libtiff/libtiff!488
+
+2023-05-16  Su_Laus  <sulau@freenet.de>
+
+	Fix #559 DNG 1.6 passcount assertion.
+	Amend DNG tags definition introduced with MR 482:
+	- DNG 1.6 tags specified as UTF-8 strings are defined as variable TIFF_BYTE with  passcount=TRUE.
+	- For all tags with TIFF_SETGET_C32_UINT8 the readcount and writecount were corrected to -3 (TIFF_VARIABLE2).
+
+	Testprogram to write and read all tags defined within LibTIFF is introduced.
+	It also checks for valid passcount flag setting for the defined tags but some special tags are excluded from that check.
+
+	Closes #559.
+
+2023-05-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Hardcode HOST_FILLORDER to FILLORDER_LSB2MSB, and make 'H' flag of TIFFOpen() to warn and an alias of FILLORDER_MSB2LSB
+
+	tif_lerc.c: use WORDS_BIGENDIAN instead of HOST_BIGENDIAN.
+
+2023-05-07  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_484_TIFFDirectory_32_64_bit' into 'master'
+	Fix 484 TIFFDirectory td_fieldsset uses unsigned long which can be 32 or 64 bits.
+
+	Closes #484
+
+	See merge request libtiff/libtiff!471
+
+2023-05-07  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'DNG_1.6_EP_tags' into 'master'
+	Add DNG tags up to version 1.6.0.0 and some TIFF/EP tags and update documentation
+
+	See merge request libtiff/libtiff!482
+
+2023-05-07  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'issue547' into 'master'
+	do not install libtiff-4.pc when tiff-install is reset
+
+	Closes #547
+
+	See merge request libtiff/libtiff!481
+
+2023-05-06  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_548' into 'master'
+	LZWDecode(): avoid crash when trying to read again from a strip whith a...
+
+	Closes #548
+
+	See merge request libtiff/libtiff!484
+
+2023-05-06  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_fix_553_multi-image-errors' into 'master'
+	tiffcrop: fix 553 by considering error return of writeSelections()
+
+	Closes #553
+
+	See merge request libtiff/libtiff!485
+
+2023-05-06  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tif_ojpeg_fix-554_FPE' into 'master'
+	tif_ojpeg.c fix 554 by checking for division by zero
+
+	Closes #554
+
+	See merge request libtiff/libtiff!486
+
+2023-05-06  Su Laus  <sulau@freenet.de>
+
+	tif_ojpeg.c fix 554 by checking for division by zero.
+
+2023-05-05  Su_Laus  <sulau@freenet.de>
+
+	Consider error return of writeSelections(). Fixes #553.
+
+2023-04-29  Even Rouault  <even.rouault@spatialys.com>
+
+	LZWDecode(): avoid crash when trying to read again from a strip whith a missing end-of-information marker (fixes #548)
+
+2023-04-25  Su_Laus  <sulau@freenet.de>
+
+	Add DNG tags up to version 1.6.0.0 and some TIFF/EP tags and update documentation
+	Amend MR !337 'Add support for DNG tags up to version 1.6.0.0 and some TIFF/EP tags' from Sami Liedes:
+	- Set most tags to OkToChange=1.
+	- Define BATTERYLEVEL tag as ASCII and convert values of rational variant to ASCII.
+	- TIFF documentation updated for tags recognized by LibTiff (DNG 1.6 and others).
+	- TIFF/EP tags added, which are equivalent to EXIF tags. This addresses part of #418 as well.
+	- Definition of tags reformatted (clang-format off) for better readability of tag comments in tiff.h and tif_dirinfo.c
+
+2023-04-23  Roman  <kosobrodov@fastmail.fm>
+
+	do not install libtiff-4.pc when tiff-install is reset.
+
+2023-04-21  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'mymaster1' into 'master'
+	fix runtime error: applying zero offset to null pointer
+
+	See merge request libtiff/libtiff!479
+
+2023-04-21  xiaoxiaoafeifei  <lliangliang2007@163.com>
+
+	countInkNamesString(): fix `UndefinedBehaviorSanitizer`: applying zero offset to null pointer
+
+2023-03-26  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tif_ovrcache_TIFFSetSubDirectory' into 'master'
+	tif_ovrcache.c: check TIFFSetSubDirectory() return value (CID 1524573)
+
+	See merge request libtiff/libtiff!478
+
+2023-03-26  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_ovrcache.c: check TIFFSetSubDirectory() return value (CID 1524573)
+
+2023-03-26  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'even_faster_setdirectory_with_IFDlist' into 'master'
+	Even faster TIFFSetDirectory() using IFD list.
+
+	See merge request libtiff/libtiff!477
+
+2023-03-26  Su Laus  <sulau@freenet.de>
+
+	Even faster TIFFSetDirectory() using IFD list.
+
+2023-03-12  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'faster-setdirectory_newMR' into 'master'
+	Optimize relative seeking with TIFFSetDirectory
+
+	See merge request libtiff/libtiff!474
+
+2023-03-12  Su Laus  <sulau@freenet.de>
+
+	Optimize relative seeking with TIFFSetDirectory.
+
+2023-03-11  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'master' into 'master'
+	Fix memory leak in tiffcrop.c
+
+	See merge request libtiff/libtiff!475
+
+2023-03-08  zhailiangliang  <zhailiangliang@loongson.cn>
+
+	Fix memory leak in tiffcrop.c.
+
+2023-02-23  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'skip-thumbnail-test' into 'master'
+	test: avoid running tool tests if not built
+
+	Closes #421
+
+	See merge request libtiff/libtiff!334
+
+2023-02-22  Sam James  <sam@gentoo.org>
+
+	test (cmake): skip script tests if tools aren't built.
+	In Gentoo, we avoid building the tools for multilib (32-bit, x86) builds on
+	amd64/x86_64 because we only need the library to keep binary applications working.
+
+	This causes a test failure in e.g. tiffcp-thumbnail.sh as the 'thumbnail'
+	binary isn't built. Skip it if unavailable.
+
+	Fixes: https://gitlab.com/libtiff/libtiff/-/issues/421
+
+2023-02-22  Sam James  <sam@gentoo.org>
+
+	test (autotools): skip script tests if tools aren't built.
+	In Gentoo, we avoid building the tools for multilib (32-bit, x86) builds on
+	amd64/x86_64 because we only need the library to keep binary applications working.
+
+	This causes a test failure in e.g. tiffcp-thumbnail.sh as the 'thumbnail'
+	binary isn't built. Skip it if unavailable.
+
+	Fixes: https://gitlab.com/libtiff/libtiff/-/issues/421
+
+2023-02-16  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_Unlink_first_directory_0' into 'master'
+	Fix TIFFUnlinkDirectory(0) case and unlink of first directory.
+
+	See merge request libtiff/libtiff!460
+
+2023-02-16  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tif_luv_check_NaN_fix_#530' into 'master'
+	tif_luv: Check and correct for NaN data in uv_encode().
+
+	Closes #530
+
+	See merge request libtiff/libtiff!473
+
+2023-02-16  Su_Laus  <sulau@freenet.de>
+
+	tif_luv: Check and correct for NaN data in uv_encode().
+	Closes #530
+
+	See merge request !473
+
+2023-02-14  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_dont_reuse_input_buffer_fix_527' into 'master'
+	tiffcrop: Do not reuse input buffer for subsequent images. Fix issue 527
+
+	Closes #527
+
+	See merge request libtiff/libtiff!472
+
+2023-02-14  Su_Laus  <sulau@freenet.de>
+
+	tiffcrop: Do not reuse input buffer for subsequent images. Fix issue 527
+	Reuse of read_buff within loadImage() from previous image is quite unsafe, because other functions (like rotateImage() etc.) reallocate that buffer with different size without updating the local prev_readsize value.
+
+	Closes #527
+
+2023-02-08  Su_Laus  <sulau@freenet.de>
+
+	Fix 484 TIFFDirectory td_fieldsset uses unsigned long which can be 32 or 64 bits.
+	Closes #484
+
+2023-02-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'test_ifd_loop_detection_fix_CoverityScan_ln_55' into 'master'
+	test_ifd_loop_detection: fix Coverity Scan issue CID 1520750: Null pointer...
+
+	See merge request libtiff/libtiff!470
+
+2023-02-08  Su_Laus  <sulau@freenet.de>
+
+	test_ifd_loop_detection: fix Coverity Scan issue CID 1520750: Null pointer dereferences (NULL_RETURNS) line 55.
+
+2023-02-06  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_fix_CoverityScan_line_9676' into 'master'
+	Fix Coverity Scan issue CID 1520761: Integer handling issues...
+
+	See merge request libtiff/libtiff!469
+
+2023-02-06  Su_Laus  <sulau@freenet.de>
+
+	Fix Coverity Scan issue CID 1520761: Integer handling issues (OVERFLOW_BEFORE_WIDEN) tiffcrop.c: 9676 in rotateImage()
+
+2023-02-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_R270_fix#492' into 'master'
+	tiffcrop: Amend rotateImage() not to toggle the input (main) image width and...
+
+	Closes #519, #518, #499, #495, #494, #493 et #492
+
+	See merge request libtiff/libtiff!465
+
+2023-02-05  Su_Laus  <sulau@freenet.de>
+
+	tiffcrop: Amend rotateImage() not to toggle the input (main) image width and length parameters when only cropped image sections are rotated. Remove buffptr from region structure because never used.
+	Closes #492 #493 #494 #495 #499 #518 #519
+
+2023-02-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_correctly_update_buffersize_after_rotate_fix#520' into 'master'
+	tiffcrop correctly update buffersize after rotateImage() fix#520
+
+	Closes #520
+
+	See merge request libtiff/libtiff!467
+
+2023-02-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_composite_image_assumption_test_fix#496' into 'master'
+	tiffcrop: added check for assumption on composite images (fixes #496)
+
+	Closes #501, #500, #498, #497 et #496
+
+	See merge request libtiff/libtiff!466
+
+2023-02-05  Su Laus  <sulau@freenet.de>
+
+	tiffcrop: added check for assumption on composite images (fixes #496)
+	tiffcrop: For composite images with more than one region, the combined_length or combined_width always needs to be equal, respectively. Otherwise, even the first section/region copy action might cause buffer overrun. This is now checked before the first copy action.
+
+	Closes #496, #497, #498, #500, #501.
+
+2023-02-04  Su_Laus  <sulau@freenet.de>
+
+	tiffcrop correctly update buffersize after rotateImage() fix#520  -- enlarge buffsize and check integer overflow within rotateImage().
+
+2023-02-04  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'test_subidf_loop' into 'master'
+	test_ifd_loop_detection: Added test to check loops in SubIFDs that are chained.
+
+	See merge request libtiff/libtiff!464
+
+2023-02-04  Su Laus  <sulau@freenet.de>
+
+	test_ifd_loop_detection: Added test to check loops in SubIFDs that are chained.
+
+2023-02-04  Su_Laus  <sulau@freenet.de>
+
+	Fix TIFFUnlinkDirectory(0) case and unlink of first directory.
+	If directory number 0 is unlinked, then the base offset variables within LibTiff are not updated. As a result, a subsequent TIFFSetDirectory() first goes to the unlinked former directory  number 0.
+
+	In addition, the error case for dirn=0 is handled.
+
+	This MR fixes that by updating the base offset variables  tif->tif_header.classic.tiff_diroff and tif->tif_header.big.tiff_diroff.
+
+2023-02-03  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'TiffClose_NULL_ptr_dereferencing_fix_515' into 'master'
+	TIFFClose() avoid NULL pointer dereferencing. fix#515
+
+	Closes #515
+
+	See merge request libtiff/libtiff!468
+
+2023-02-03  Su_Laus  <sulau@freenet.de>
+
+	TIFFClose() avoid NULL pointer dereferencing. fix#515.
+	Closes #515
+
+	tiffcrop correctly update buffersize after rotateImage() fix#520 rotateImage() set up a new buffer and calculates its size individually. Therefore, seg_buffs[] size needs to be updated accordingly. Before this fix, the seg_buffs buffer size was calculated with a different formula than within rotateImage().
+	Closes #520.
+
+2023-01-25  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'add_windows_DLL_versioninfo' into 'master'
+	Add versioninfo resource files for DLL and tools compiled with Windows MSVC and MINGW.
+
+	See merge request libtiff/libtiff!455
+
+2023-01-25  Su Laus  <sulau@freenet.de>
+
+	Add versioninfo resource files for DLL and tools compiled with Windows MSVC and MINGW.
+
+2023-01-22  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tif_hash_set_order_include' into 'master'
+	tif_hash_set.c: include tif_hash_set.h after tif_config.h to let a chance for...
+
+	See merge request libtiff/libtiff!462
+
+2023-01-22  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_hash_set.c: include tif_hash_set.h after tif_config.h to let a chance for GDAL symbol renaming trick
+
+2023-01-22  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_513' into 'master'
+	Fax3: fix failure to decode some fax3 images (fixes #513)
+
+	Closes #513
+
+	See merge request libtiff/libtiff!461
+
+2023-01-21  Even Rouault  <even.rouault@spatialys.com>
+
+	Add test for Fax3 decoding issues (refs #513)
+
+2023-01-21  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_fix_#488' into 'master'
+	tiffcrop: Correct simple copy paste error. Fix #488.
+
+	Closes #488
+
+	See merge request libtiff/libtiff!459
+
+2023-01-21  Su Laus  <sulau@freenet.de>
+
+	tiffcrop: Correct simple copy paste error. Fix #488.
+
+2023-01-21  Even Rouault  <even.rouault@spatialys.com>
+
+	Fax3: fix failure to decode some fax3 images (fixes #513)
+	Patch by @jsummers26
+
+2023-01-12  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffmedian_fix_#477' into 'master'
+	tiffmedian: avoid zero num_colors, fixes #477
+
+	Closes #477
+
+	See merge request libtiff/libtiff!458
+
+2023-01-12  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fax2ps_fixes_#475' into 'master'
+	fax2ps: fixes #475 buffer overflow in qsort function pcompar.
+
+	Closes #475
+
+	See merge request libtiff/libtiff!457
+
+2023-01-12  Su_Laus  <sulau@freenet.de>
+
+	tiffmedian: avoid zero num_colors, fixes #477.
+
+	fax2ps: fixes #475 buffer overflow in qsort function pcompar.
+
+2023-01-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_241_tiffset_file_size_limit' into 'master'
+	tiffset: get filesize to allocate only the required memory. Fixes issue #241
+
+	Closes #241
+
+	See merge request libtiff/libtiff!451
+
+2023-01-09  Su Laus  <sulau@freenet.de>
+
+	tiffset: get filesize to allocate only the required memory. Fixes issue #241
+
+2023-01-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch '_TIFFCleanupIFDOffsetAndNumberMaps' into 'master'
+	Add _TIFFCleanupIFDOffsetAndNumberMaps() and call it from TIFFUnlinkDirectory()
+
+	See merge request libtiff/libtiff!454
+
+2023-01-06  Even Rouault  <even.rouault@spatialys.com>
+
+	Remove use of tif_dirnumber.
+
+	TIFFSetSubDirectory(): call _TIFFCleanupIFDOffsetAndNumberMaps()
+
+	struct tiff: remove unused tif_dirlistoff.
+
+	TIFFUnlinkDirectory(): reset tif_dirnumber.
+
+	Add _TIFFCleanupIFDOffsetAndNumberMaps() and call it from TIFFUnlinkDirectory()
+
+2022-12-29  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'cmake_in_files_formatting_sensitive' into 'master'
+	Disable clang-formatting for tif_config.h.cmake.in and tiffconf.h.cmake.in...
+
+	See merge request libtiff/libtiff!452
+
+2022-12-28  Su_Laus  <sulau@freenet.de>
+
+	Disable clang-formatting for tif_config.h.cmake.in and tiffconf.h.cmake.in because sensitive for CMake scripts. - explanation added
+
+2022-12-26  Su_Laus  <sulau@freenet.de>
+
+	Disable clang-formatting for tif_config.h.cmake.in and tiffconf.h.cmake.in because sensitive for CMake scripts.
+
+2022-12-19  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'manpage_multi-page-TIFF' into 'master'
+	manpage: Add multi page TIFF and SubIFDs description and read / write example.
+
+	See merge request libtiff/libtiff!450
+
+2022-12-19  Su Laus  <sulau@freenet.de>
+
+	manpage: Add multi page TIFF and SubIFDs description and read / write example.
+
+2022-12-18  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'TIFFOpen_r+_windows_behaviour' into 'master'
+	Behavior of TIFFOpen() mode "r+" in the Windows implementation adjusted to that of Linux.
+
+	See merge request libtiff/libtiff!449
+
+2022-12-16  Su_Laus  <sulau@freenet.de>
+
+	Behavior of TIFFOpen() mode "r+" in the Windows implementation adjusted to that of Linux.
+
+2022-12-15  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'ossfuzz_54343' into 'master'
+	TIFFSetDirectory: avoid harmless unsigned-integer-overflow
+
+	See merge request libtiff/libtiff!447
+
+2022-12-15  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFSetDirectory: avoid harmless unsigned-integer-overflow.
+	Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=54343
+
+2022-12-14  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'ossfuzz_54311' into 'master'
+	TIFFWriteDirectorySec(): avoid harmless unsigned-integer-overflow
+
+	See merge request libtiff/libtiff!446
+
+2022-12-14  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFWriteDirectorySec(): avoid harmless unsigned-integer-overflow.
+	Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=54311
+
+2022-12-14  Even Rouault  <even.rouault@spatialys.com>
+
+	libtiff v4.5.0rc2 preparation
+
+2022-12-14  Su_Laus  <sulau@freenet.de>
+
+	tiffinfo: update curdir from uint16_t to tdir_t for more than 64k IFD handling.
+
+2022-12-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_502' into 'master'
+	Make TIFFSetDirectory(tiff, 65534) work again (fixes #502)
+
+	Closes #502
+
+	See merge request libtiff/libtiff!436
+
+2022-12-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Add tests for IFD loop detection.
+
+	Fix IFD loop detection.
+
+	Use UINT_MAX.
+
+	Make TIFF_MAX_DIR_COUNT a autoconf/CMake setting.
+
+2022-12-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'build-shared-by-default' into 'master'
+	Restore shared libraries by default
+
+	See merge request libtiff/libtiff!437
+
+2022-12-13  shaun walbridge  <shaun.walbridge@gmail.com>
+
+	CMake: restore shared libraries by default for top-level build.
+
+2022-12-12  Even Rouault  <even.rouault@spatialys.com>
+
+	Add a TIFF_MAX_DIR_COUNT public #define.
+
+	TIFFCurrentDirectory(), TIFFNumberOfDirectories(), TIFFSetDirectory(), TIFFUnlinkDirectory(): use tdir_t that is now a uint32_t, and raise limit of IFDs to 1048576
+
+	IFD loop checking: use hashmap to avoid quadratic performance.
+
+	Add a hashset/hashmap implementation (ported from GDAL's CPLHashSet)
+
+	Make TIFFSetDirectory(tiff, 65534) work again (fixes #502)
+
+2022-12-11  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'do_not_format_tiffvers_h' into 'master'
+	Revert formatting of tiffvers.h and add TIFFLIB_MAJOR_VERSION, TIFFLIB_MINOR_VERSION, TIFFLIB_MICRO_VERSION defines
+
+	See merge request libtiff/libtiff!434
+
+2022-12-11  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_exclude_some_comment_from_clang-format' into 'master'
+	tiffcrop: Exclude some comments from clang-format
+
+	See merge request libtiff/libtiff!435
+
+2022-12-11  Su Laus  <sulau@freenet.de>
+
+	tiffcrop: Exclude some comments from clang-format.
+
+2022-12-11  Even Rouault  <even.rouault@spatialys.com>
+
+	tiffvers.h.in: add clang-format off/on.
+
+2022-12-10  Even Rouault  <even.rouault@spatialys.com>
+
+	tiffvers.h: add TIFFLIB_MAJOR_VERSION, TIFFLIB_MINOR_VERSION, TIFFLIB_MICRO_VERSION defines
+	Also add a TIFFLIB_AT_LEAST() macro
+
+	tiffvers.h: revert formatting.
+
+	Exclude reformatting of tiffvers.h which breaks version detection for FindTIFF.cmake
+
+2022-12-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'release_4_5_0' into 'master'
+	Prepare v4.5.0 release
+
+	See merge request libtiff/libtiff!433
+
+2022-12-09  Even Rouault  <even.rouault@spatialys.com>
+
+	libtiff v4.5.0rc1 preparation
+
+2022-12-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'reformat' into 'master'
+	Whole code-base reformatting
+
+	See merge request libtiff/libtiff!431
+
+2022-12-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Add .git-blame-ignore-revs.
+
+	tiffcrop: remove version_id and rev_date.
+
+2022-12-08  pre-commit run by Even Rouault  <even.rouault-bot@spatialys.com>
+
+	Reformatting in all other directories using 'pre-commit run'
+
+	Reformatting in test/ using 'pre-commit run'
+
+	Reformatting in tools/ using 'pre-commit run'
+
+	Reformatting in libtiff/ using 'pre-commit run'
+
+2022-12-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Add .clang-format, .pre-commit-config.yaml and CONTRIBUTING.md.
+
+	Remove vim/emacs formatting footers.
+
+2022-11-29  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_489' into 'master'
+	TIFFWriteRawStrip(): restore capabilities to append data in the current strip (fixes #489)
+
+	Closes #489
+
+	See merge request libtiff/libtiff!430
+
+2022-11-29  Even Rouault  <even.rouault@spatialys.com>
+
+	Add test case for scenario of issue #489.
+
+	TIFFWriteRawStrip(): restore capabilities to append data in the current strip (fixes #489)
+	This fixes a regression of libtiff 4.4.0
+
+2022-11-29  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'manpage_re-entrant_error_handler' into 'master'
+	manpage update for re-entrant error handler TIFFErrorExtR(), TIFFOpenExt() and...
+
+	See merge request libtiff/libtiff!427
+
+2022-11-29  Su Laus  <sulau@freenet.de>
+
+	manpage update for re-entrant error handler TIFFErrorExtR(), TIFFOpenExt() and...
+
+2022-11-27  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_fix_#169' into 'master'
+	tiffcrop: Add check if (bps != 1) in writeSingleSection() for...
+
+	Closes #169
+
+	See merge request libtiff/libtiff!429
+
+2022-11-27  Su Laus  <sulau@freenet.de>
+
+	tiffcrop: Add check if (bps != 1) in writeSingleSection() for...
+
+2022-11-26  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'TIFFErrorExtR_fix_missing_calls' into 'master'
+	TIFFErrorExt() was not replaced with TIFFErrorExtR() everywhere in libtiff....
+
+	See merge request libtiff/libtiff!428
+
+2022-11-26  Su Laus  <sulau@freenet.de>
+
+	TIFFErrorExt() was not replaced with TIFFErrorExtR() everywhere in libtiff....
+
+2022-11-25  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tif_jpeg_build_fix' into 'master'
+	tif_jpeg.c: fix compilation with MSVC (fixes commit 0fd1a81d3547acb8f5be50bbbc3e44bde01c014b)
+
+	See merge request libtiff/libtiff!426
+
+2022-11-25  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_jpeg.c: fix compilation with MSVC (fixes commit 0fd1a81d3547acb8f5be50bbbc3e44bde01c014b)
+
+2022-11-25  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_0fd1a81d3547acb8f5be50bbbc3e44bde01c014b' into 'master'
+	JPEGEncode(): fix wrong pointer data type with libjpeg-turbo 2.2dev in 12-bit mode
+
+	See merge request libtiff/libtiff!425
+
+2022-11-25  Even Rouault  <even.rouault@spatialys.com>
+
+	JPEGEncode(): fix wrong pointer data type with libjpeg-turbo 2.2dev in 12-bit mode
+	(fixes commit 0fd1a81d3547acb8f5be50bbbc3e44bde01c014b)
+
+2022-11-25  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'libjpegturbo_dual' into 'master'
+	Add support for libjpeg-turbo 2.2-dev 8/12 bit dual mode
+
+	See merge request libtiff/libtiff!422
+
+2022-11-25  Even Rouault  <even.rouault@spatialys.com>
+
+	Add support for libjpeg-turbo 2.2-dev 8/12 bit dual mode.
+
+2022-11-23  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'windows-fix' into 'master'
+	libtiff: Fix TIFFOpen* for the Windows platform in tif_unix.c
+
+	See merge request libtiff/libtiff!424
+
+2022-11-23  Francois Bleibel  <fbleibel@gmail.com>
+
+	libtiff: Fix TIFFOpen* for the Windows platform in tif_unix.c.
+	I'm not sure where this change was made, but it must have been in a recent update. TIFFOpenWEx is now TIFFOpenWExt, and _TIFFgetMode takes additional arguments.
+
+	Verified: Tested libtiff on a local Windows build.
+
+2022-11-23  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_signed_vs_unsigned' into 'master'
+	tiffcrop.c: fix warning about signed vs unsigned comparison
+
+	See merge request libtiff/libtiff!423
+
+2022-11-23  Even Rouault  <even.rouault@spatialys.com>
+
+	tiffcrop.c: fix warning about signed vs unsigned comparison.
+
+2022-11-23  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'TIFFClientOpenExt_warning_fix' into 'master'
+	TIFFClientOpenExt(): fix warning on 32-bit platforms (master only)
+
+	See merge request libtiff/libtiff!421
+
+2022-11-23  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFClientOpenExt(): fix warning on 32-bit platforms (master only)
+
+2022-11-23  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcp_TIFFOpenOptionsFree_memleak_fix' into 'master'
+	tiffcp: fix leak of TIFFOpenOptionsAlloc() introduced in latest commit (master only)
+
+	See merge request libtiff/libtiff!420
+
+2022-11-23  Even Rouault  <even.rouault@spatialys.com>
+
+	tiffcp: fix leak of TIFFOpenOptionsAlloc() introduced in latest commit (master only)
+	Fixes Coverity CID 1517032
+
+2022-11-23  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'TIFFOpenOptionsSetMaxSingleMemAlloc' into 'master'
+	Add TIFFOpenOptionsSetMaxSingleMemAlloc() to define a limit in bytes for a single memory allocation done by libtiff
+
+	See merge request libtiff/libtiff!419
+
+2022-11-23  Even Rouault  <even.rouault@spatialys.com>
+
+	Emit explicit error message when tif_max_single_mem_alloc is exceeded.
+
+	test_open_options: test TIFFOpenOptionsSetMaxSingleMemAlloc()
+
+	Rename test_error_handlers to test_open_options.
+
+	tiffinfo, tiffcp, tiffcrop, tiffsplit, tiff2rgba, tiff2ps: use TIFFOpenOptionsSetMaxSingleMemAlloc()
+
+	Convert uses of _TIFFmalloc/realloc/calloc/free to the Ext functions.
+
+2022-11-22  Even Rouault  <even.rouault@spatialys.com>
+
+	Add TIFFOpenOptionsSetMaxSingleMemAlloc()
+	to define a limit in bytes for a single memory allocation done by libtiff.
+
+	Also add internal functions used in replacement of the non Ext ones:
+	void* _TIFFmallocExt(TIFF* tif, tmsize_t s);
+	void* _TIFFcallocExt(TIFF* tif, tmsize_t nmemb, tmsize_t siz);
+	void* _TIFFreallocExt(TIFF* tif, void* p, tmsize_t s);
+	void _TIFFfreeExt(TIFF* tif, void* p);
+
+2022-11-22  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'TIFFOpenEx' into 'master'
+	Add TIFFOpenExt(), TIFFOpenWExt() and TIFFFdOpenExt() with re-entrant error handlers
+
+	See merge request libtiff/libtiff!413
+
+2022-11-21  Even Rouault  <even.rouault@spatialys.com>
+
+	Remove TIFFSetErrorHandlerExtR() and TIFFSetWarningHandlerExtR() that were temporarily added in master
+
+	Add a _TIFFErrorEarly() function to be able to use the re-entrant error handler, even before TIFF* is valid
+
+	Rework TIFFOpenExt() and similar to use an opaque TIFFOpenOptions* opts argument, with alloc, free and setters
+
+	Document TIFFOpenExt, TIFFOpenWExt, TIFFFdOpenExt, TIFFClientOpenExt, TIFFSetErrorHandlerExtR, TIFFSetWarningHandlerExtR
+
+2022-11-21  Even Rouault  <even.rouault@spatialys.com>
+
+	Add TIFFOpenExt(), TIFFOpenWExt() and TIFFFdOpenExt() with re-entrant error handlers
+	Rename TIFFClientOpenEx() to TIFFClientOpenExt()
+
+	Rework signature of the re-entrant error handlers and of
+	TIFFSetWarningHandlerExt() and TIFFSetErrorHandlerExt()
+
+	Use structures that can be extended as extra argument.
+
+	Leverages and ammends https://gitlab.com/libtiff/libtiff/-/merge_requests/409
+
+2022-11-21  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'manpage_fix485_file-descriptor_clientdata' into 'master'
+	manpage: Correct description of file handle/descriptors tif_fd and tif_clientdata. Closes #485.
+
+	Closes #485
+
+	See merge request libtiff/libtiff!418
+
+2022-11-21  Su Laus  <sulau@freenet.de>
+
+	manpage: Correct description of file handle/descriptors tif_fd and tif_clientdata. Closes #485.
+
+2022-11-20  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'manpage_fix440_fix28_TIFFOpen_SubIFD_update' into 'master'
+	manpage: fix28, fix440, update TIFFOpen and SubIFD
+
+	Closes #440 et #28
+
+	See merge request libtiff/libtiff!417
+
+2022-11-20  Su Laus  <sulau@freenet.de>
+
+	manpage: fix28, fix440, update TIFFOpen and SubIFD.
+
+2022-11-17  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'cmake_tiff_install_warning' into 'master'
+	CMakeLists.txt: fix warning with -Wdev
+
+	See merge request libtiff/libtiff!416
+
+2022-11-13  Even Rouault  <even.rouault@spatialys.com>
+
+	CMakeLists.txt: fix warning with -Wdev.
+	```
+	CMake Warning (dev) at CMakeLists.txt:62 (option):
+	  Policy CMP0077 is not set: option() honors normal variables.  Run "cmake
+	  --help-policy CMP0077" for policy details.  Use the cmake_policy command to
+	  set the policy and suppress this warning.
+
+	  For compatibility with older versions of CMake, option is clearing the
+	  normal variable 'tiff-install'.
+	This warning is for project developers.  Use -Wno-dev to suppress it.
+	```
+
+2022-11-12  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_479' into 'master'
+	_TIFFReadEncodedTileAndAllocBuffer(): avoid excessive memory allocation on...
+
+	See merge request libtiff/libtiff!412
+
+2022-11-12  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'typo_fix' into 'master'
+	tif_dirread.c: fix typo in comment
+
+	See merge request libtiff/libtiff!414
+
+2022-11-12  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_dirread.c: fix typo in comment.
+
+2022-11-11  Even Rouault  <even.rouault@spatialys.com>
+
+	_TIFFReadEncodedTileAndAllocBuffer(): avoid excessive memory allocation on broken files (fixes #479)
+
+2022-11-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'bugfix/tiff2pdf-stdout' into 'master'
+	tiff2pdf Don't try to seek into stdout.
+
+	See merge request libtiff/libtiff!367
+
+2022-11-10  Claus-Justus Heine  <himself@claus-justus-heine.de>
+
+	tiff2pdf: Don't try to seek into stdout.
+	Fixes #441
+
+2022-11-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_coverity_1516759' into 'master'
+	TIFFErrorExtR(): fix Dereference after null check (CID 1516759)
+
+	See merge request libtiff/libtiff!411
+
+2022-11-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_ossfuzz_53137' into 'master'
+	TIFFReadRGBATileExt(): fix (unsigned) integer overflow on strips/tiles > 2 GB
+
+	See merge request libtiff/libtiff!410
+
+2022-11-08  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFErrorExtR(): fix Dereference after null check (CID 1516759)
+
+2022-11-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'no_sprintf' into 'master'
+	Replace sprintf calls with snprintf
+
+	See merge request libtiff/libtiff!408
+
+2022-11-08  Mark Mentovai  <mark@chromium.org>
+
+	Replace sprintf calls with snprintf.
+	This makes it possible to build libtiff without warnings using the macOS
+	13 SDK. Calls to sprintf are replaced with snprintf, passing appropriate
+	buffer sizes.
+
+	It doesn’t appear that any of the changed uses of sprintf were actually
+	unsafe, so no behavior change is expected aside from SDK compatibility.
+
+	The macOS 13 SDK deprecates sprintf as it’s difficult to use safely. The
+	deprecation warning message is visible when building C++, but it is not
+	normally visible when building plain C code due to a quirk in how
+	sprintf is declared in the SDK. However, the deprecation message is
+	visible when building plain C under Address Sanitizer
+	(-fsanitize=address). This discrepancy was discovered at
+	https://crbug.com/1381706 and reported to Apple with a copy at
+	https://openradar.appspot.com/FB11761475.
+
+	The macOS 13 SDK is packaged in Xcode 14.1, released on 2022-11-01. This
+	also affects the iOS 16 SDK and other 2022-era Apple OS SDKs packaged in
+	Xcode 14.0, released on 2022-09-12.
+
+	libtiff is visible to the Chromium build via PDFium, and this change is
+	needed to allow Chromium to move forward to the macOS 13 SDK.
+
+	This change is limited to the libtiff directory. Other uses of sprintf
+	were found in contrib, test, and tools.
+
+2022-11-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'reentrant' into 'master'
+	Add reentrant error functions
+
+	See merge request libtiff/libtiff!409
+
+2022-11-08  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFReadRGBATileExt(): fix (unsigned) integer overflow on strips/tiles > 2 GB
+	Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=53137
+
+2022-11-08  Laramie Leavitt  <lar@google.com>
+
+	Add reentrant error functions.
+	Prior to this change, libtiff relied on global error handlers,
+	which is problematic when libtiff used by multiple independent
+	libraries from within the same process, as they may unwittingly
+	clobber the error handling, introduce race conditions when setting
+	handlers, or otherwise have unintended side effects.
+
+	This change adds error handlers to the TIFF struct, which are
+	used preferentially when available. The error handlers are invoked
+	when the re-entrant error functions are called:
+
+	void TIFFErrorExtR(TIFF*, const char* module, const char* fmt, ...)
+	void TIFFWarningExtR(TIFF*, const char* module, const char* fmt, ...)
+
+	The handlers have a similar signature to the existing extended
+	handlers, additionally returning an int:
+
+	int TIFFErrorHandlerExtR(thandle_t, const char*, const char*, va_list)
+
+	 thandle_t is the userdata passed to TIFFOpen
+	 When the handler returns 1, the global handlers are not called.
+
+	Custom error/warning handlers may be installed on a per-file
+	basis by calling the Set functions:
+
+	  TIFF* tif = TIFFOpen(...);
+	  TIFFSetErrorHandlerExtR(tif, MyErrorHandler);
+	  TIFFSetWarningHandlerExtR(tif, MyWarningHandler);
+
+	Additionally, the callsites to TIFFErrorExt and TIFFWarningExt
+	have been updated to call the reentrant versions.
+
+2022-11-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_fix_CoverityScan_tmsize_issue' into 'master'
+	tiffcrop: should fix some Coverity Scan issues OVERFLOW_BEFORE_WIDEN
+
+	See merge request libtiff/libtiff!403
+
+2022-11-08  Su Laus  <sulau@freenet.de>
+
+	tiffcrop: should fix some Coverity Scan issues OVERFLOW_BEFORE_WIDEN.
+
+2022-11-02  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'typo_fix' into 'master'
+	tif_dirread.c: fix typo in comment
+
+	See merge request libtiff/libtiff!407
+
+2022-11-02  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_dirread.c: fix typo in comment.
+
+2022-10-23  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_formatting_fix' into 'master'
+	tiffcrop: add casts in TIFFError() to fix compiler warnings
+
+	See merge request libtiff/libtiff!406
+
+2022-10-23  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_482' into 'master'
+	CMake: correctly set default value of 'lzma' option when liblzma is detected (fixes #482)
+
+	Closes #482
+
+	See merge request libtiff/libtiff!404
+
+2022-10-23  Even Rouault  <even.rouault@spatialys.com>
+
+	tiffcrop: add casts in TIFFError() to fix compiler warnings.
+
+	CMake: correctly set default value of 'lzma' option when liblzma is detected (fixes #482)
+
+2022-10-20  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_480' into 'master'
+	Fix incorrect printf() formatters introduced in recent commits (fixes #480)
+
+	Closes #480
+
+	See merge request libtiff/libtiff!401
+
+2022-10-19  Even Rouault  <even.rouault@spatialys.com>
+
+	Fix incorrect printf() formatters introduced in recent commits (fixes #480)
+
+2022-10-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'CLIPPATH_tags_corrected' into 'master'
+	CLIPPATH tags defined twice but differently and also wrongly (#439) - corrected
+
+	Closes #439
+
+	See merge request libtiff/libtiff!366
+
+2022-10-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'FIELD_IGNORE_warning-messages' into 'master'
+	Warning messages for FIELD_IGNORE tags for writing and for TIFF_SETGET_UNDEFINED for reading added. (#438)
+
+	Closes #438
+
+	See merge request libtiff/libtiff!365
+
+2022-10-13  Su Laus  <sulau@freenet.de>
+
+	Warning messages for FIELD_IGNORE tags for writing and for TIFF_SETGET_UNDEFINED for reading added. (#438)
+
+2022-10-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tag-def_COMPRESSION_corrected' into 'master'
+	tif_dirinfo.c TIFFTAG_COMPRESSION and _BITSPERSAMPLE definition corrected
+
+	See merge request libtiff/libtiff!364
+
+2022-10-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_getopt_included_twice' into 'master'
+	Fix including module getopt.c twice with CMake and HAVE_GETOPT=false
+
+	See merge request libtiff/libtiff!381
+
+2022-10-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_CoverityScan_fix_PRINTF_ARGS' into 'master'
+	tiffcrop: fix Coverity Scan issues about PRINTF_ARGS.
+
+	See merge request libtiff/libtiff!400
+
+2022-10-13  Su_Laus  <sulau@freenet.de>
+
+	tiffcrop fix Coverity Scan issues about PRINTF_ARGS.
+
+2022-10-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_fix_#450_too-many-mode-options' into 'master'
+	tiffcrop: fix #450 too many 'mode' options on command line.
+
+	Closes #470 et #450
+
+	See merge request libtiff/libtiff!384
+
+2022-10-13  Su Laus  <sulau@freenet.de>
+
+	tiffcrop: fix #450 too many 'mode' options on command line.
+
+2022-10-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_fix_#435' into 'master'
+	tiffcrop subroutines require a larger buffer (fixes #271, #381, #386, #388, #389, #435)
+
+	Closes #465, #464, #435, #389, #388, #386, #381 et #271
+
+	See merge request libtiff/libtiff!382
+
+2022-10-13  Su Laus  <sulau@freenet.de>
+
+	tiffcrop subroutines require a larger buffer (fixes #271, #381, #386, #388, #389, #435)
+
+2022-10-12  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'InkNames_NumberOfInks_handling_revised' into 'master'
+	Revised handling of TIFFTAG_INKNAMES and related TIFFTAG_NUMBEROFINKS value (fixes #149, #150, #152, #168, #250, #269, #398 and #456)
+
+	Closes #474, #463, #387, #456, #398, #269, #250, #168, #152, #150 et #149
+
+	See merge request libtiff/libtiff!385
+
+2022-10-12  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_fix_#411_#413' into 'master'
+	tiffcrop: disable incompatibility of -Z, -X, -Y, -z options with any PAGE_MODE_x option (fixes #411, #413 and #426)
+
+	Closes #426, #413 et #411
+
+	See merge request libtiff/libtiff!383
+
+2022-10-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'TIFFAdvanceDirectory_mapped_uio' into 'master'
+	TIFFAdvanceDirectory(): fix unsigned-integer-overflow in mapped case
+
+	See merge request libtiff/libtiff!398
+
+2022-10-10  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFAdvanceDirectory(): fix unsigned-integer-overflow in mapped case.
+	Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=52309
+
+2022-10-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffinfo_parse_SubIFDs' into 'master'
+	tiffinfo: Updated to parse through SubIFDs and show their tags.
+
+	See merge request libtiff/libtiff!396
+
+2022-10-08  Su Laus  <sulau@freenet.de>
+
+	tiffinfo: Updated to parse through SubIFDs and show their tags.
+
+2022-10-07  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'master' into 'master'
+	Moved linking of CMath::CMath into CMath_LIBRARY check
+
+	See merge request libtiff/libtiff!397
+
+2022-10-07  Frei Herr  <herr.frei@googlemail.com>
+
+	Moved linking of CMath::CMath into CMath_LIBRARY check.
+
+2022-10-06  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'rational_precision2double_coverity-fix' into 'master'
+	rational_precision2double.c: Fix issue from Coverity Scan.
+
+	See merge request libtiff/libtiff!395
+
+2022-10-06  Su_Laus  <sulau@freenet.de>
+
+	rational_precision2double.c: Fix issue from Coverity Scan.
+
+	Fix including module getopt.c twice with CMake and HAVE_GETOPT=false.
+	The "make-files" for the tools- and test- programmes include the module getopt.c once directly as additional source and then again by including port.lib.
+	This can be avoided by including getopt.c as source in port.lib within port\CMakeLists.txt not with PUBLIC but with PRIVATE.
+
+2022-10-06  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix-455_Improved-IFD-loop-handling' into 'master'
+	Improved IFD-Loop Handling (fixes #455)
+
+	Closes #455
+
+	See merge request libtiff/libtiff!386
+
+2022-10-06  Su Laus  <sulau@freenet.de>
+
+	Improved IFD-Loop Handling (fixes #455)
+	IFD infinite looping was not fixed by MR 20 (see #455).
+	An improved IFD loop handling is proposed.
+
+	Basic approach:
+
+	- The order in the entire chain must be checked, and not only whether an offset has already been read once.
+	- To do this, pairs of directory number and offset are stored and checked.
+	- The offset of a directory number can change.
+	- TIFFAdvanceDirectory() must also perform an IFD loop check.
+	- TIFFCheckDirOffset() is replaced by _TIFFCheckDirNumberAndOffset().
+
+	Rules for the check:
+
+	- If an offset is already in the list, it must have the same IFD number. Otherwise it is an IDF loop.
+	- If the offset is not in the list and the IFD number is greater than there are list entries, a new list entry is added.
+	- Otherwise, the offset of the IFD number is updated.
+
+	Reference is also made to old bugzilla bug 2772 and MR 20, which did not solve the general issue.
+	This MR closes #455
+
+2022-10-05  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'fix-cmake-subproject' into 'master'
+	Fix CMake build to be compatible with FetchContent
+
+	See merge request libtiff/libtiff!394
+
+2022-10-04  Timothy Lyanguzov  <theta682@gmail.com>
+
+	Apply 9 suggestion(s) to 3 file(s)
+
+2022-10-04  Jeremy Maitin-Shepard  <jbms@google.com>
+
+	Fix CMake build to be compatible with FetchContent.
+	Recent versions of CMake have improved support for including
+	dependencies, using the FetchContent module, which allows a dependency
+	to be imported as a subproject and then later found automatically by
+	calls to `find_package`.
+
+	This change makes libtiff's CMake better behaved when used as a
+	sub-project:
+
+	- CMake has a single global namespace for all target names in all
+	  sub-projects.  This commit renames the following CMake targets:
+
+	  - port -> tiff_port
+	  - mkg3states -> tiff_mkg3states
+	  - faxtable -> tiff_faxtable
+	  - release -> tiff_release
+
+	- When building TIFF as a sub-project, it is not normally useful to
+	  create install rules for its targets.  This commit adds a
+	  `tiff-install` option that controls whether the install rules are
+	  added and defaults to OFF when libtiff is included as a sub-project.
+
+	- Previously, libtiff set `BUILD_SHARED_LIBS` to ON by default.  With
+	  this commit, that default is only set if libtiff is the top-level
+	  project.
+
+	- When using `find_package(TIFF)`, the targets `TIFF::TIFF` and
+	  `TIFF::CXX` are defined.  This commit makes libtiff itself define
+	  those targets as aliases, to allow other cmake projects to use
+	  either `find_package` or `FetchContent` interchangeably.
+
+	- Adds ZSTD_HAVE_DECOMPRESS_STREAM variable which may be set to bypass
+	  `check_symbol_exists` call.  Fixes
+	  https://gitlab.com/libtiff/libtiff/-/issues/472.
+
+2022-09-27  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'getimage_overflow' into 'master'
+	Update getimage to support reading large raster images
+
+	See merge request libtiff/libtiff!389
+
+2022-09-26  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'MinGW-warnings_ipctutil' into 'master'
+	Fix #458: MinGW Windows 64: warning because 'long' is a 32 bits type in...
+
+	Closes #458
+
+	See merge request libtiff/libtiff!391
+
+2022-09-26  Su Laus  <sulau@freenet.de>
+
+	Fix #458: MinGW Windows 64: warning because 'long' is a 32 bits type in...
+
+2022-09-16  Eric Siegel  <siegel.eric@gmail.com>
+
+	Update getimage to support large raster images.
+
+2022-09-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'typo_fix' into 'master'
+	tif_lzw.c: fix typo in code comment
+
+	See merge request libtiff/libtiff!387
+
+2022-09-08  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_lzw.c: fix typo in code comment.
+
+2022-08-30  Su_Laus  <sulau@freenet.de>
+
+	Revised handling of TIFFTAG_INKNAMES and related TIFFTAG_NUMBEROFINKS value
+	In order to solve the buffer overflow issues related to TIFFTAG_INKNAMES and related TIFFTAG_NUMBEROFINKS value, a revised handling of those tags within LibTiff is proposed:
+
+	Behaviour for writing:
+	    `NumberOfInks`  MUST fit to the number of inks in the `InkNames` string.
+	    `NumberOfInks` is automatically set when `InkNames` is set.
+	    If `NumberOfInks` is different to the number of inks within `InkNames` string, that will be corrected and a warning is issued.
+	    If `NumberOfInks` is not equal to samplesperpixel only a warning will be issued.
+
+	Behaviour for reading:
+	    When reading `InkNames` from a TIFF file, the `NumberOfInks` will be set automatically to the number of inks in `InkNames` string.
+	    If `NumberOfInks` is different to the number of inks within `InkNames` string, that will be corrected and a warning is issued.
+	    If  `NumberOfInks` is not equal to samplesperpixel only a warning will be issued.
+
+	This allows the safe use of the NumberOfInks value to read out the InkNames without buffer overflow
+
+	This MR will close the following issues:  #149, #150, #152, #168 (to be checked), #250, #269, #398 and #456.
+
+	It also fixes the old bug at http://bugzilla.maptools.org/show_bug.cgi?id=2599, for which the limitation of `NumberOfInks = SPP` was introduced, which is in my opinion not necessary and does not solve the general issue.
+
+2022-08-25  Su_Laus  <sulau@freenet.de>
+
+	tiffcrop: disable incompatibility of -Z, -X, -Y, -z options with any PAGE_MODE_x option (fixes #411 and #413)
+	tiffcrop does not support –Z, -z, -X and –Y options together with any other PAGE_MODE_x options like  -H, -V, -P, -J, -K or –S.
+
+	Code analysis:
+
+	With the options –Z, -z, the crop.selections are set to a value > 0. Within main(), this triggers the call of processCropSelections(), which copies the sections from the read_buff into seg_buffs[].
+	In the following code in main(), the only supported step, where that seg_buffs are further handled are within an if-clause with  if (page.mode == PAGE_MODE_NONE) .
+
+	Execution of the else-clause often leads to buffer-overflows.
+
+	Therefore, the above option combination is not supported and will be disabled to prevent those buffer-overflows.
+
+	The MR solves issues #411 and #413.
+
+2022-08-21  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_S-option_mutually_exclusive' into 'master'
+	tiffcrop:  -S option mutually exclusive (fixes #349, #414, #422, #423, #424)
+
+	Closes #424, #423, #422, #414 et #349
+
+	See merge request libtiff/libtiff!378
+
+2022-08-20  Su_Laus  <sulau@freenet.de>
+
+	tiffcrop -S option: Make decision simpler.
+
+2022-08-20  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'remove_death_commented_code' into 'master'
+	Remove dead code from tif_dirread.c, tif_dirwrite.c and tif_getimage.c
+
+	See merge request libtiff/libtiff!380
+
+2022-08-20  Su Laus  <sulau@freenet.de>
+
+	Remove dead code from tif_dirread.c, tif_dirwrite.c and tif_getimage.c.
+
+2022-08-16  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'coverity_fixes' into 'master'
+	Silence Coverity Scan false positive warnings about out-of-bounds access
+
+	See merge request libtiff/libtiff!379
+
+2022-08-16  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_zip.c: silence Coverity Scan false positive warnings about out-of-bounds access (CID 1491190, 1491197, 1491201)
+
+	tif_dirread.c: silence Coverity Scan false positive warnings about out-of-bounds access (CID 1491182, 1491186)
+
+2022-08-16  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'default_tag_values_extended' into 'master'
+	Presetting of default tag values extended (e.g. PlanarConfig). (fixes #449)
+
+	Closes #449
+
+	See merge request libtiff/libtiff!377
+
+2022-08-16  Su Laus  <sulau@freenet.de>
+
+	Presetting of default tag values extended (e.g. PlanarConfig). (fixes #449)
+
+2022-08-15  Su_Laus  <sulau@freenet.de>
+
+	According to Richard Nolde https://gitlab.com/libtiff/libtiff/-/issues/401#note_877637400 the tiffcrop option „-S“ is also mutually exclusive to the other crop options (-X|-Y), -Z and -z.
+	This is now checked and ends tiffcrop if those arguments are not mutually exclusive.
+
+	This MR will fix the following tiffcrop issues: #349, #414, #422, #423, #424
+
+2022-08-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'warning_fix' into 'master'
+	Fix warning about shadowing
+
+	See merge request libtiff/libtiff!376
+
+2022-08-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Fix warning about shadowing.
+
+2022-08-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_225' into 'master'
+	Deal with RichTIFFIPTC tag written with LONG type (fixes #225)
+
+	Closes #225
+
+	See merge request libtiff/libtiff!374
+
+2022-08-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Writing_IFD8_to_ClassicTIFF_bugfix' into 'master'
+	Correcting defects reported by Coverity Scan for MR !369
+
+	See merge request libtiff/libtiff!375
+
+2022-08-09  Su Laus  <sulau@freenet.de>
+
+	Correcting defects reported by Coverity Scan for MR !369.
+
+2022-08-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_442_Writing_IFD8_to_ClassicTIFF' into 'master'
+	TIFFSetValue(): Writing IFD8 & LONG8 tags to ClassicTIFF corrected (fixes #442)
+
+	Closes #442
+
+	See merge request libtiff/libtiff!369
+
+2022-08-09  Su Laus  <sulau@freenet.de>
+
+	TIFFSetValue(): Writing IFD8 & LONG8 tags to ClassicTIFF corrected (fixes #442)
+
+2022-08-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Deal with RichTIFFIPTC tag written with LONG type (fixes #225)
+
+2022-08-07  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'manpage-functions-added' into 'master'
+	doc: Missing public functions added to TIFF documentation in Sphinx
+
+	See merge request libtiff/libtiff!372
+
+2022-08-07  Su Laus  <sulau@freenet.de>
+
+	doc: Missing public functions added to TIFF documentation in Sphinx.
+
+2022-07-29  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tifjpeg_version_check' into 'master'
+	tif_jpeg.c: allow to pass -DEXPECTED_JPEG_LIB_VERSION=number to do optional...
+
+	See merge request libtiff/libtiff!373
+
+2022-07-29  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_jpeg.c: allow to pass -DEXPECTED_JPEG_LIB_VERSION=number to do optional compile-time version check
+
+2022-07-21  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'TIFFReadFromUserBuffer_fix' into 'master'
+	TIFFReadFromUserBuffer(): fix clearing of TIFF_CODERSETUP flag that could...
+
+	See merge request libtiff/libtiff!371
+
+2022-07-21  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFReadFromUserBuffer(): fix clearing of TIFF_CODERSETUP flag that could cause issues with reading JPEG compressed files
+
+2022-07-21  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'vs2022-fixes' into 'master'
+	cmake: Correct duplicate definition of _CRT_SECURE_NO_WARNINGS
+
+	Closes #443
+
+	See merge request libtiff/libtiff!370
+
+2022-07-13  Roger Leigh  <rleigh@codelibre.net>
+
+	cmake: Correct duplicate definition of _CRT_SECURE_NO_WARNINGS.
+
+2022-07-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'vs2022-fixes' into 'master'
+	cmake: Fixes for Visual Studio 2022
+
+	See merge request libtiff/libtiff!368
+
+2022-07-13  Roger Leigh  <rleigh@codelibre.net>
+
+	cmake: Fixes for Visual Studio 2022.
+
+2022-07-03  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'elf-symbol-export' into 'master'
+	Explicit export of versioned ELF symbols
+
+	Closes #437
+
+	See merge request libtiff/libtiff!361
+
+2022-07-03  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_433' into 'master'
+	_TIFFCheckFieldIsValidForCodec(): return FALSE when passed a codec-specific...
+
+	Closes #433
+
+	See merge request libtiff/libtiff!363
+
+2022-07-01  Su_Laus  <sulau@freenet.de>
+
+	CLIPPATH tags defined twice but differently and also wrongly.
+	In tif_dirinfo.c the tags for clippath are wrongly defined and the tag TIFFTAG_XCLIPPATHUNITS is even different twice. Therefore, those tags cannot be written / read correctly and may even lead to buffer overflow.
+	E.g.: In the case of TIFFSetField(YCLIPPATHUNITS), a 1 byte storage space is allocated because of TIFF_SETGET_UNDEFINED, in which an  int32_t value should be stored because of TIFF_SLONG type definition. Then, an int32_t value is read from that 1 byte storage location.
+
+	The current definition is:
+
+	{ TIFFTAG_CLIPPATH, -1, -3, TIFF_BYTE, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "ClipPath", NULL },
+	{ TIFFTAG_XCLIPPATHUNITS, 1, 1, TIFF_SLONG, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "XClipPathUnits", NULL },
+	{ TIFFTAG_XCLIPPATHUNITS, 1, 1, TIFF_SBYTE, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "XClipPathUnits", NULL },
+	{ TIFFTAG_YCLIPPATHUNITS, 1, 1, TIFF_SLONG, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "YClipPathUnits", NULL },
+
+	Whereas the correct definition according to TIFF Specification Supplement 1 (https://www.awaresystems.be/imaging/tiff/specification/TIFFPM6.pdf) should be:
+
+	{ TIFFTAG_CLIPPATH, -3, -3, TIFF_BYTE, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "ClipPath", NULL },
+	{ TIFFTAG_XCLIPPATHUNITS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "XClipPathUnits", NULL },
+	{ TIFFTAG_YCLIPPATHUNITS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "YClipPathUnits", NULL },
+
+	Also the set_get_field of the following tag should be corrected from
+
+	{ TIFFTAG_INTEROPERABILITYIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "InteroperabilityIFDOffset", NULL },
+	to
+	{ TIFFTAG_INTEROPERABILITYIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "InteroperabilityIFDOffset", NULL },
+
+	However, if those tags schould not be handled by LibTiff because they are deamed as abandoned or unwanted tags, those tags need to be defined with FIELD_IGNORE instead of FIELD_CUSTOM and keeping set_field_type  = TIFF_SETGET_UNDEFINED
+
+2022-07-01  Su_Laus  <sulau@freenet.de>
+
+	In tif_dirinfo.c the definition for TIFFTAG_COMPRESSION has different settings of  field_readcount=TIFF_VARIABLE (-1) and field_writecount=1. The tag is defined with Count=1, thus field_readcount is wrong and should also be 1. Although TIFFTAG_BITSPERSAMPLE is defined with Count:N=SamplesPerPixel, only ONE uint16_t value is passed with TIFFSetField() and TIFFGetField(). However, an array with N=SamplesPerPixel equal values is written into the TIFF file. Shouldn't field_readcount = field_writecount = 1 then?  The behaviour of TiffLib does not change, because the handling is coded directly.
+
+2022-06-27  Even Rouault  <even.rouault@spatialys.com>
+
+	_TIFFCheckFieldIsValidForCodec(): return FALSE when passed a codec-specific tag and the codec is not configured (fixes #433)
+	This avoids crashes when querying such tags
+
+2022-06-27  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch '16bit_cielab' into 'master'
+	add basic 16bit-cielab support
+
+	See merge request libtiff/libtiff!336
+
+2022-06-27  Caolán McNamara  <caolan@skynet.ie>
+
+	Add basic 16bit-cielab support.
+	just a copy of putcontig8bitCIELab that reads 16bit vals but divide l by
+	257, a and b by 256 before passing to TIFFCIELabToXYZ
+
+	motivation: https://bugs.documentfoundation.org/show_bug.cgi?id=131199
+	the "clavijo16bitlab.tiff" example where tiffinfo says:
+	```
+	  Image Width: 2601 Image Length: 3503
+	  Resolution: 96, 96 pixels/inch
+	  Bits/Sample: 16
+	  Compression Scheme: AdobeDeflate
+	  Photometric Interpretation: CIE L*a*b*
+	  Orientation: row 0 top, col 0 lhs
+	  Samples/Pixel: 3
+	  Rows/Strip: 1
+	  Planar Configuration: single image plane
+	  DateTime: 2020:03:07 10:20:42
+	```
+
+2022-06-24  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'manpage-fixes' into 'master'
+	Sphinx documentation fixes
+
+	See merge request libtiff/libtiff!362
+
+2022-06-24  Roger Leigh  <rleigh@codelibre.net>
+
+	doc: Correct types and cross-references.
+
+	doc: Correct manual page path.
+
+	build: Make rational2double static only for automake.
+	This copies the same logic as used by CMake.
+
+	build: Update autoconf version to 4.5.0 and soname to 6.0.0.
+
+	build: Update autoconf ld-version-script default.
+
+	libtiff: Correct version script for changes since v4.4.0.
+
+	libtiff: Update version script documentation.
+
+	libtiff: Add symbol versioning of all exported symbols.
+
+2022-06-24  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'webp_mem_improvements' into 'master'
+	WEBP codec: avoid temporary buffer and memcpy() on whole tile/strip decoding
+
+	See merge request libtiff/libtiff!360
+
+2022-06-24  Roger Leigh  <rleigh@codelibre.net>
+
+	build: Enable symbol versioning by default.
+
+2022-06-24  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'sphinx-manpages' into 'master'
+	doc: Add Sphinx conversion of all manpages
+
+	Closes #361
+
+	See merge request libtiff/libtiff!356
+
+2022-06-24  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'remove-wince' into 'master'
+	Remove obsolete WinCE source file
+
+	See merge request libtiff/libtiff!357
+
+2022-06-23  Even Rouault  <even.rouault@spatialys.com>
+
+	WEBP codec: avoid temporary buffer and memcpy() on whole tile/strip decoding
+
+2022-06-22  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'horAcc8_fix' into 'master'
+	tif_predict.c: make horAcc8() work with icc (ICC) 2021.6.0 20220226 -O2
+
+	See merge request libtiff/libtiff!359
+
+2022-06-22  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_predict.c: make horAcc8() work with icc (ICC) 2021.6.0 20220226 -O2.
+	For a reason I don't understand, recent ICC generates wrong code in -O2
+	mode for the stride = 3 and 4 cases. The modified code is more
+	straightfoward, so go for it.
+
+2022-06-19  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'ci-restore-old' into 'master'
+	ci: Restore testing with Ubuntu 20.04
+
+	See merge request libtiff/libtiff!358
+
+2022-06-19  Roger Leigh  <rleigh@codelibre.net>
+
+	ci: Restore testing with Ubuntu 20.04.
+
+	Remove obsolete WinCE source file.
+
+	doc: Add missing punctuation.
+
+	doc: Remove semicolon from c:function definition.
+
+	doc: Remove remaining HTML entities.
+
+	doc: Improve the build page.
+
+2022-06-18  Roger Leigh  <rleigh@codelibre.net>
+
+	doc: Add Sphinx conversion of all manpages.
+
+2022-06-18  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'dist-html' into 'master'
+	build: Distribute and install HTML documentation
+
+	See merge request libtiff/libtiff!352
+
+2022-06-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'pkgconfig' into 'master'
+	Adding Requires.private generation
+
+	See merge request libtiff/libtiff!355
+
+2022-06-13  Yishen Miao  <mys721tx@gmail.com>
+
+	Adding Requires.private generation.
+	Adds Requires.private generation so that pkg-config can correctly find
+	the dependencies of libtiff.
+
+2022-06-11  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'ci-dist' into 'master'
+	ci: Archive distribution tar and zip files
+
+	See merge request libtiff/libtiff!354
+
+2022-06-11  Roger Leigh  <rleigh@codelibre.net>
+
+	ci: Archive distribution tar and zip files.
+
+2022-06-11  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'export_TIFFClampDoubleToUInt32' into 'master'
+	libtiff.def: export _TIFFClampDoubleToUInt32
+
+	See merge request libtiff/libtiff!353
+
+2022-06-11  Even Rouault  <even.rouault@spatialys.com>
+
+	libtiff.def: export _TIFFClampDoubleToUInt32.
+
+2022-06-11  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'issue-415+427+428' into 'master'
+	fix the FPE in tiffcrop (#415, #427, and #428)
+
+	Closes #428, #427 et #415
+
+	See merge request libtiff/libtiff!346
+
+2022-06-11  4ugustus  <wangdw.augustus@qq.com>
+
+	fix the FPE in tiffcrop (#415, #427, and #428)
+
+2022-06-11  Roger Leigh  <rleigh@codelibre.net>
+
+	build: Distribute and install HTML documentation.
+
+2022-06-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tif_jpeg_warning_fix' into 'master'
+	tif_jpeg.c: fix error message
+
+	See merge request libtiff/libtiff!351
+
+2022-06-10  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_jpeg.c: fix error message.
+
+2022-06-10  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'android_libm' into 'master'
+	Fix dependency on libm on Android
+
+	See merge request libtiff/libtiff!350
+
+2022-06-09  Matthias Kuhn  <matthias@opengis.ch>
+
+	Always link to libm if available.
+
+2022-06-05  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'vasyl5-master-patch-97651' into 'master'
+	libtoolize: command not found on macOS.
+
+	See merge request libtiff/libtiff!289
+
+2022-06-05  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'rst-docs' into 'master'
+	Convert HTML documentation to Sphinx RST
+
+	See merge request libtiff/libtiff!349
+
+2022-06-05  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge remote-tracking branch 'origin/master' into rst-docs.
+
+2022-06-05  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'cmake-xc-faxtable' into 'master'
+	cmake: Do not build faxtable target when cross-compiling
+
+	See merge request libtiff/libtiff!342
+
+2022-06-05  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'cmake-disable-options' into 'master'
+	Add options for disabling tools, tests, contrib and docs
+
+	See merge request libtiff/libtiff!343
+
+2022-06-05  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'cmake-msvc-options' into 'master'
+	cmake: Add MSVC options when building all libraries and executables
+
+	See merge request libtiff/libtiff!344
+
+2022-06-05  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'cmake-win32-libtiffxx-static' into 'master'
+	cmake: libtiffxx is static on win32
+
+	See merge request libtiff/libtiff!338
+
+2022-06-05  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'licence-file' into 'master'
+	Rename COPYRIGHT to LICENSE.md
+
+	See merge request libtiff/libtiff!345
+
+2022-06-05  Roger Leigh  <rleigh@codelibre.net>
+
+	Rename COPYRIGHT to LICENSE.md.
+
+	doc: Fix make distcheck.
+
+	doc: Update automake configuration.
+
+	doc: Do not pass srcdir and builddir to sphinx-build.
+
+	doc: Additional top-level tidying.
+
+	doc: Tidy top-level index.
+
+	doc: Move bugs to project.
+
+	doc: Move misc to project.
+
+	doc: Move all BigTIFF documentation into specification directory.
+	* Remove the BigTIFF proposal since this has long been completed
+	* Update the BigTIFF PR to note completion of the work and replace
+	  present with past tense.
+
+	doc: Split release history.
+	This permits the newer releases to be included in the top-level
+	toctree without polluting it with dozens of old releases.
+
+	doc: Correct accents.
+
+	doc: BigTIFF design markup improvements.
+
+	Add doc/_static.
+
+	doc: Move TIFF specification and design notes into subdirectory.
+
+	doc: Move releases into subdirectory.
+
+	doc: Mark up TIFF tech note 2.
+
+	doc: Use sphinxdox theme.
+	The sphix_rtd_theme formats complex tables badly.
+
+	doc: Clean up HTML tags.
+
+2022-06-04  Roger Leigh  <rleigh@codelibre.net>
+
+	ci: Install Sphinx manual for use by GitLab pages.
+
+	doc: Use sphinx_rtd_theme.
+
+	Convert HTML documentation to Sphinx RST.
+	* Add CMake build logic
+	* Add Autotools build logic
+	* Move from html/ to doc/
+	* Manual pages are still generated HTML for the time being
+
+	git: Ignore common IDE build files.
+
+2022-06-04  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'ci-ubuntu-22.04' into 'master'
+	ci: Update to use Ubuntu 22.04 CI images
+
+	Closes #429
+
+	See merge request libtiff/libtiff!348
+
+2022-06-04  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'opengl-option' into 'master'
+	cmake: Add tiff-opengl option
+
+	See merge request libtiff/libtiff!340
+
+2022-06-04  Roger Leigh  <rleigh@codelibre.net>
+
+	tiffdump: Avoid overflow warning when reading.
+
+	ci: Update to use Ubuntu 22.04 CI images.
+
+2022-06-04  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'master' into 'master'
+	Include stdlib.h in tif_lzw.c.
+
+	See merge request libtiff/libtiff!347
+
+2022-06-04  Brian Ledger  <brian.peter.ledger@gmail.com>
+
+	Merge branch 'master' of https://gitlab.com/libtiff/libtiff.
+
+2022-06-04  Brian Ledger  <brian.peter.ledger@gmail.com>
+
+	Include stdlib.h in tif_lzw.c.
+	In `tif_lzw.c`, a call is made to `_byteswap_uint64`. This is declared in `stdlib.h`. `stdlib.h` is not included in `tib_lzw.c`, so a name error may occur.
+
+	This change adds `#include stdlib.h` to `tif_lzw.c`, to prevent a name error from occuring when `stdlib.h` is not included.
+
+2022-05-29  Roger Leigh  <rleigh@codelibre.net>
+
+	Add options for disabling tools, tests, contrib and docs.
+
+	cmake: Add MSVC options when building all libraries and executables.
+
+	cmake: Do not build faxtable target when cross-compiling.
+
+	cmake: Use add_compile_definitions and add_compile_options.
+	It seems that some CMake versions can't export targets using PRIVATE
+	linking, even though the private target is never used.
+
+	Merge remote-tracking branch 'origin/master' into cmake-msvc-options.
+
+	Merge remote-tracking branch 'origin/master' into cmake-win32-libtiffxx-static
+
+2022-05-29  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'ci-x64' into 'master'
+	ci: Remove arm64 temporarily
+
+	See merge request libtiff/libtiff!341
+
+2022-05-29  Roger Leigh  <rleigh@codelibre.net>
+
+	ci: Remove arm64 temporarily.
+
+	autoconf: Add --disable-opengl option.
+
+	cmake: Add tiff-opengl option.
+
+	cmake: Add MSVC options when building all libraries and executables.
+
+	cmake: libtiffxx is static on win32.
+
+2022-05-22  Even Rouault  <even.rouault@spatialys.com>
+
+	html/Makefile.am: add v4.4.0.html to docfiles.
+
+2022-05-20  Even Rouault  <even.rouault@spatialys.com>
+
+	Update HOWTO-RELEASE with .tar.xz.
+
+	Prepare for release 4.4.0.
+
+2022-05-16  Even Rouault  <even.rouault@spatialys.com>
+
+	libtiff v4.4.0 released
+
+2022-05-16  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'pkgconf_abs_path' into 'master'
+	Handle absolute paths in pkg-config file
+
+	See merge request libtiff/libtiff!333
+
+2022-05-16  Miloš Komarčević  <miloskomarcevic@aim.com>
+
+	Handle absolute paths in pkg-config file.
+
+2022-05-15  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix-tests-with-ro-source-dir' into 'master'
+	cmake: allow running the tests with a read-only source directory
+
+	See merge request libtiff/libtiff!332
+
+2022-05-15  Alex Richardson  <alexrichardson@google.com>
+
+	cmake: allow running the tests with a read-only source directory.
+	Prior to this commit CTest would invoke all simple_tests tests with the
+	current working directory set to the source directory. However, some of
+	the tests (e.g. rewrite) will output files to the current working
+	directory and will therefore fail when run with a read-only source
+	directory. This can happen e.g. when testing a cross-compiled version of
+	libtiff where the sources are mounted read-only in the virtual machine.
+
+	Simply changing the working directory to CMAKE_CURRENT_BINARY_DIR allows
+	all but raw_decode to pass. The raw_decode test looks for files in the
+	source directory, and uses the `srcdir` environment variable to find, so
+	we also have to add a set_tests_properties() call to specify that env var.
+
+2022-05-14  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcrop_pipeline_error' into 'master'
+	tiffcrop: Fixes complain of pipeline "cmake-ninja-arm64"  about abs() on...
+
+	See merge request libtiff/libtiff!331
+
+2022-05-14  Su Laus  <sulau@freenet.de>
+
+	tiffcrop: Fixes complain of pipeline "cmake-ninja-arm64"  about abs() on...
+
+2022-05-14  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'TIFFField_SetGetSize_CountSize' into 'master'
+	Public functions TIFFFieldSetGetSize() and TIFFieldSetGetCountSize() added.
+
+	See merge request libtiff/libtiff!284
+
+2022-05-14  Su Laus  <sulau@freenet.de>
+
+	Public functions TIFFFieldSetGetSize() and TIFFieldSetGetCountSize() added.
+
+2022-05-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'jondo-master-patch-87274' into 'master'
+	Replace add_compile_definitions for CMake versions before 3.12 (#238)
+
+	See merge request libtiff/libtiff!330
+
+2022-05-13  Robert Pollak  <robert.pollak@posteo.net>
+
+	Replace add_compile_definitions for CMake versions before 3.12 (#238)
+
+2022-05-13  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'master' into 'master'
+	Remove incorrect assert.
+
+	See merge request libtiff/libtiff!329
+
+2022-05-13  Ben Laurie  <benl@google.com>
+
+	Remove incorrect assert.
+
+2022-05-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Fix_Issue#330' into 'master'
+	tiffcrop: Fix issue #330 and some more from 320 to 349
+
+	Closes #330
+
+	See merge request libtiff/libtiff!298
+
+2022-05-10  Su Laus  <sulau@freenet.de>
+
+	tiffcrop: Fix issue #330 and some more from 320 to 349.
+
+2022-05-10  Even Rouault  <even.rouault@spatialys.com>
+
+	test_signed_tags.c: fix CID 1504376.
+
+2022-05-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_#29_tiffcp_orientationTag' into 'master'
+	tiffcp: Fix incomprehensible setting of orientation tag (fixes #29)
+
+	Closes #29
+
+	See merge request libtiff/libtiff!327
+
+2022-05-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'palette-8bit' into 'master'
+	tiff2pdf: handle 8-bit palette colormap
+
+	See merge request libtiff/libtiff!328
+
+2022-05-09  Jay Berkenbilt  <ejb@ql.org>
+
+	tiff2pdf: handle 8-bit palette colormap.
+	If all the colors in a palette are in the range [0, 255], treat the
+	palette as an 8-bit colormap. This workaround already exists elsewhere
+	in the software including in tiff2ps.
+
+2022-05-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_#40_ReadSignedTags' into 'master'
+	Reading of signed tags added (fixes #40)
+
+	Closes #40
+
+	See merge request libtiff/libtiff!326
+
+2022-05-08  Su Laus  <sulau@freenet.de>
+
+	Reading of signed tags added (fixes #40)
+
+2022-05-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Fix typos in comments.
+
+2022-05-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_400' into 'master'
+	tiffcp: avoid buffer overflow in "mode" string (fixes #400)
+
+	Closes #400
+
+	See merge request libtiff/libtiff!323
+
+2022-05-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'CheckForBigTiff' into 'master'
+	TIFFIsBigTiff()  function added.
+
+	See merge request libtiff/libtiff!325
+
+2022-05-08  Su Laus  <sulau@freenet.de>
+
+	TIFFIsBigTiff()  function added.
+
+2022-05-01  Su_Laus  <sulau@freenet.de>
+
+	tiffcp: Fix incomprehensible setting of orientation tag (fixes #29)
+
+2022-04-23  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_#8_FreeAnonTag' into 'master'
+	extra flag for anonymous (unknown) tags (fixes #8)
+
+	Closes #400 et #8
+
+	See merge request libtiff/libtiff!324
+
+2022-04-22  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_lzw.c: fix potential out-of-bounds error when trying to read in the same tile/strip after an error has occured (fixes #410)
+
+2022-04-06  Su_Laus  <sulau@freenet.de>
+
+	extra flag for anonymous (unknown) tags (fixes #8)
+
+2022-04-02  Su_Laus  <sulau@freenet.de>
+
+	tiffcp: avoid buffer overflow in "mode" string (fixes #400)
+
+2022-03-21  Even Rouault  <even.rouault@spatialys.com>
+
+	avoid hang in TIFFRewriteDirectory() if a classic file > 4 GB is attempted to be created
+	Fixes https://github.com/OSGeo/gdal/issues/5479
+
+2022-03-19  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Correct_tag_auto-registration_description' into 'master'
+	Correct reading description for anonymous tag auto-registration in addingtags.html (closes 353)
+
+	Closes #353
+
+	See merge request libtiff/libtiff!320
+
+2022-03-19  Su Laus  <sulau@freenet.de>
+
+	Correct reading description for anonymous tag auto-registration in addingtags.html (closes 353)
+
+2022-03-18  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_lzw.c: avoid harmless unsigned-integer-overflow (https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=45741)
+
+2022-03-17  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_396' into 'master'
+	tiffcp: do not try to fetch compressor-specific tags when not appropriate (fixes #396)
+
+	Closes #396
+
+	See merge request libtiff/libtiff!316
+
+2022-03-17  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Fix_cmake_warnings' into 'master'
+	Fix some CMake warnings
+
+	See merge request libtiff/libtiff!319
+
+2022-03-17  Su Laus  <sulau@freenet.de>
+
+	Fix some CMake warnings.
+
+2022-03-17  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'lzw_decode_improvements' into 'master'
+	LZWDecode(): major speed improvements
+
+	See merge request libtiff/libtiff!318
+
+2022-03-16  Even Rouault  <even.rouault@spatialys.com>
+
+	LZWDecode(): major speed improvements.
+	This mostly comes from dealing specifically with codes that expand to
+	2, 3 and 4 bytes or more to avoid branches, and dealing with longer
+	repeated sequences (e.g. lots of bytes to 0).
+
+	With the following bench.c, execution time is 32% faster on a 8000x8000
+	4 bands uint16 predictor=2 image that has a 1.6x compression ratio. with
+	gcc 9.4.0, on x86_64
+
+	bench.c:
+	```
+	 #include "tiffio.h"
+	 #include <stdlib.h>
+	 #include <stdint.h>
+
+	int main(int argc, char* argv[])
+	{
+	    if( argc != 2 )
+	    {
+	        fprintf(stderr, "Usage: ./bench my.tif\n");
+	        exit(1);
+	    }
+	    TIFF* tif = TIFFOpen(argv[1], "r");
+	    if( tif == NULL )
+	    {
+	        fprintf(stderr, "Cannot open %s\n", argv[1]);
+	        exit(1);
+	    }
+	    if( !TIFFIsTiled(tif) )
+	    {
+	        fprintf(stderr, "Only tiled image supported\n");
+	        exit(1);
+	    }
+	    int tilesize = (int)TIFFTileSize(tif);
+	    char* c = malloc(tilesize);
+	    if( c == NULL )
+	    {
+	        fprintf(stderr, "Out of memory\n");
+	        exit(1);
+	    }
+	    const uint32_t numtiles = TIFFNumberOfTiles(tif);
+	    //int numloops = 4 * (int)(1e9 / ((double)tilesize * numtiles));
+	    //printf("Number of loops: %d\n", numloops);
+	    int numloops = 1;
+	    for(int i =0; i< numloops; i++)
+	    {
+	        for(uint32_t tileindex = 0; tileindex < numtiles; tileindex++ )
+	        {
+	            TIFFReadEncodedTile(tif, tileindex, c, tilesize);
+	        }
+	    }
+	    free(c);
+	    TIFFClose(tif);
+	    return 0;
+	}
+	```
+
+2022-03-16  Even Rouault  <even.rouault@spatialys.com>
+
+	LZWDecode(): modest speed improvement: fetch input data by chunks of the largest natural integer of the architecture
+
+2022-03-14  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'kmilos-master-patch-45885' into 'master'
+	Correct fix for the pkgconf file relative paths
+
+	See merge request libtiff/libtiff!317
+
+2022-03-10  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_lzw.c: make LZW_CHECKEOS non-optional.
+
+	tiffsplit.c: fix compiler warning on 32-bit.
+
+2022-03-10  Miloš Komarčević  <miloskomarcevic@aim.com>
+
+	Correct fix for the pkgconf file relative paths.
+
+2022-03-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'issue-278' into 'master'
+	fix heap buffer overflow in tiffcp (#278)
+
+	Closes #278
+
+	See merge request libtiff/libtiff!311
+
+2022-03-10  4ugustus  <wangdw.augustus@qq.com>
+
+	fix heap buffer overflow in tiffcp (#278)
+
+2022-03-09  Even Rouault  <even.rouault@spatialys.com>
+
+	tiffcp: do not try to fetch compressor-specific tags when not appropriate (fixes #396)
+
+2022-03-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'i_am_a_unsympathetic_person' into 'master'
+	index.html: make it clear that I'm a unsympathetic person
+
+	See merge request libtiff/libtiff!315
+
+2022-03-09  Even Rouault  <even.rouault@spatialys.com>
+
+	index.html: make it clear that I'm a unsympathetic person.
+
+2022-03-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Fix_Issue#395' into 'master'
+	tiffcrop: fix issue #395: generation of strange section images.
+
+	Closes #395
+
+	See merge request libtiff/libtiff!312
+
+2022-03-08  Su Laus  <sulau@freenet.de>
+
+	tiffcrop: fix issue #395: generation of strange section images.
+
+2022-03-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Fix_Issue#380' into 'master'
+	tiffcrop: fix issue #380 and #382 heap buffer overflow in extractImageSection
+
+	Closes #382 et #380
+
+	See merge request libtiff/libtiff!307
+
+2022-03-08  Su Laus  <sulau@freenet.de>
+
+	tiffcrop: fix issue #380 and #382 heap buffer overflow in extractImageSection
+
+2022-03-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'issue-392' into 'master'
+	add checks for return value of limitMalloc (#392)
+
+	Closes #392
+
+	See merge request libtiff/libtiff!314
+
+2022-03-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'issue-393' into 'master'
+	fix the FPE in tiffcrop (#393)
+
+	Closes #393
+
+	See merge request libtiff/libtiff!310
+
+2022-03-08  4ugustus  <wangdw.augustus@qq.com>
+
+	fix the FPE in tiffcrop (#393)
+
+2022-03-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'kmilos-master-patch-56785' into 'master'
+	Fix pkgconf file relative paths
+
+	Closes #394
+
+	See merge request libtiff/libtiff!309
+
+2022-03-07  Augustus  <wangdw.augustus@qq.com>
+
+	add checks for return value of limitMalloc (#392)
+
+2022-03-02  Miloš Komarčević  <miloskomarcevic@aim.com>
+
+	Fix pkgconf file relative paths.
+
+2022-02-25  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_385' into 'master'
+	tif_jbig.c: fix crash when reading a file with multiple IFD in memory-mapped...
+
+	Closes #385
+
+	See merge request libtiff/libtiff!306
+
+2022-02-24  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_jbig.c: fix crash when reading a file with multiple IFD in memory-mapped mode and when bit reversal is needed (fixes #385)
+
+2022-02-24  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'string_size_limit' into 'master'
+	_TIFFVSetField(): when passing a string without explicit length, check that...
+
+	See merge request libtiff/libtiff!304
+
+2022-02-24  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'TIFFClientOpen_cleanup' into 'master'
+	TIFFClientOpen(): remove useless initializations of tif_rawcc and tif_flags...
+
+	See merge request libtiff/libtiff!303
+
+2022-02-20  Even Rouault  <even.rouault@spatialys.com>
+
+	Remove extra word in comment.
+
+	TIFFPrintDirectory(): avoid potential multi-threading issue when reading the DotRange tag
+	The severity of the issue would be low (mix of values displayed) and the
+	time window where that would occur would be short.
+
+	Constify signature of _TIFFsetXXXXArray() functions, and remove unused _TIFFsetString()
+
+	_TIFFVSetField(): when passing a string without explicit length, check that the length doesn't except the 1 << 31 maximum bytes we support
+
+2022-02-19  Even Rouault  <even.rouault@spatialys.com>
+
+	tiffsplit.c: fix use after free introduced in master per commit 8ed97f401552a2b4300d3c489b03dcada86a21fd (related to #290)
+
+2022-02-19  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Fix_Issue#284' into 'master'
+	tiff2ps: In limitMalloc() check for negative size (fixes #284)
+
+	Closes #284
+
+	See merge request libtiff/libtiff!300
+
+2022-02-19  Su Laus  <sulau@freenet.de>
+
+	tiff2ps: In limitMalloc() check for negative size (fixes #284)
+
+2022-02-19  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_288' into 'master'
+	tiffinfo: limit more memory allocations using -M switch (fixes #288)
+
+	Closes #288
+
+	See merge request libtiff/libtiff!299
+
+2022-02-19  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Fix_Issue#290' into 'master'
+	tiffsplit: limitMalloc() and getopt() introduced and more error messages. (fixes #290)
+
+	Closes #290
+
+	See merge request libtiff/libtiff!301
+
+2022-02-19  Su Laus  <sulau@freenet.de>
+
+	tiffsplit: limitMalloc() and getopt() introduced and more error messages. (fixes #290)
+
+2022-02-19  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Fix_Issue#273_#275' into 'master'
+	tiffcrop: buffsize check  formula in loadImage() amended  (fixes #273,#275)
+
+	Closes #275 et #273
+
+	See merge request libtiff/libtiff!302
+
+2022-02-19  Su Laus  <sulau@freenet.de>
+
+	tiffcrop: buffsize check  formula in loadImage() amended  (fixes #273,#275)
+
+2022-02-19  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFClientOpen(): remove useless initializations of tif_rawcc and tif_flags after TIFFReadDirectory()
+	Those initializations date back to the initial commit of libtiff, but I
+	strongly suspect there are no longer needed those days.
+	Setting tif_rawcc to (tmsize_t)-1 is weird. AFAICS, nowhere else in the library
+	-1 is used as a special markeri for that field. Immediately after TIFFReadDirectory()
+	returns it is set to 0, and this is the value used in tif_read.c/tif_write.c to
+	reset it.
+	And setting the TIFF_BUFFERSETUP bit of tif_flags is even more
+	suspicious as the only place where it is set otherwise is in
+	TIFFWriteBufferSetup(). I suspect this bogus setting of the flag was the
+	reason for commit dbf2339a1 where BUFFERCHECK() in addition to checking
+	the bit also checked the tif_rawdata against nullptr.
+
+	If setting those 2 fields was needed, it would mean that TIFFClientOpen() with the
+	'h' hint to disable automatic TIFFReadDirectory() would be broken,
+	because someone issuing a manual TIFFReadDirectory() couldn't set them,
+	as being private members.
+
+	The libtiff test suite is happy with that change, and the GDAL one too.
+
+2022-02-19  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFFetchNormalTag(): speed optimization when reading a (very large) nul-terminated ASCII tag
+
+	TIFFWriteDirectoryTagData(): turn assertion on data length into a runtime check
+	For example, the assertion could actually be triggered when writing an
+	ASCII tag with more than 1 << 31 bytes.
+
+2022-02-17  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFFetchNormalTag(): avoid calling memcpy() with a null source pointer and size of zero (fixes #383)
+
+2022-02-15  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'tl/fix-cpack' into 'master'
+	Fix packaging with CPack
+
+	See merge request libtiff/libtiff!292
+
+2022-02-11  Even Rouault  <even.rouault@spatialys.com>
+
+	tiffinfo: limit more memory allocations using -M switch (fixes #288)
+
+	tif_dirwrite.c: take into account COMPRESSION_JXL.
+
+2022-02-11  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'predictor_2_64bit' into 'master'
+	Predictor 2 (horizontal differenciation): support 64-bit
+
+	See merge request libtiff/libtiff!296
+
+2022-02-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Fix_Issue#365' into 'master'
+	tiff2pdf: Fixes issues #365, #258 and #257 related to initializing 't2p->pdf_compressionquality'.
+
+	Closes #257, #258 et #365
+
+	See merge request libtiff/libtiff!297
+
+2022-02-10  Su Laus  <sulau@freenet.de>
+
+	tiff2pdf: Fixes issues #365, #258 and #257 related to initializing 't2p->pdf_compressionquality'.
+
+2022-02-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Predictor 2 (horizontal differenciation): support 64-bit.
+	There's no reason not to support 64-bit. The TIFF 6 specification
+	doesn't say anything about that (and even mention 4-bit, which we don't
+	support)
+
+2022-02-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Fix_Issue#352' into 'master'
+	tiffcrop.c: Fix issue #352 heap-buffer-overflow by correcting uint32_t underflow.
+
+	Closes #352
+
+	See merge request libtiff/libtiff!294
+
+2022-02-09  Su Laus  <sulau@freenet.de>
+
+	tiffcrop.c: Fix issue #352 heap-buffer-overflow by correcting uint32_t underflow.
+
+2022-02-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'custom_dir_EXIF_Coverity_fixes' into 'master'
+	Fix Coverity Scan report issues for custom_dir_EXIF_231.c and test_directory.c
+
+	See merge request libtiff/libtiff!295
+
+2022-02-08  Su Laus  <sulau@freenet.de>
+
+	Fix Coverity Scan report issues for custom_dir_EXIF_231.c and test_directory.c
+
+2022-02-06  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'cmake-test' into 'master'
+	Correct CMake testing
+
+	Closes #317
+
+	See merge request libtiff/libtiff!291
+
+2022-02-06  Even Rouault  <even.rouault@spatialys.com>
+
+	LogLuvEncode32(): avoid undefined behaviour of left shift on a signed integer
+
+	TIFFFetchStripThing(): avoid calling memcpy() with a null source pointer and size of zero (fixes #362)
+
+2022-02-05  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFReadDirectory(): avoid calling memcpy() with a null source pointer and size of zero (fixes #362)
+
+2022-01-29  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Jamaika1-master-patch-68264' into 'master'
+	Added stdlib.h
+
+	See merge request libtiff/libtiff!293
+
+2022-01-29  Jamaika  <lukaszcz18@wp.pl>
+
+	tif_win32.c: include stdlib.h.
+
+2022-01-28  Timothy Lyanguzov  <timothy.lyanguzov@sap.com>
+
+	Fix packaging with CPack.
+	Replace all CMAKE_INSTALL_FULL_<DIR> with CMAKE_INSTALL_<DIR> to allow CPack setting CMAKE_INSTALL_PREFIX
+
+2022-01-25  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'master' into 'master'
+	Fix the global-buffer-overflow in tiffset
+
+	See merge request libtiff/libtiff!287
+
+2022-01-25  4ugustus  <wangdw.augustus@qq.com>
+
+	tiffset: fix global-buffer-overflow for ASCII tags where count is required (fixes #355)
+
+2022-01-23  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'autogen' into 'master'
+	Fix autogen.sh permissions issues during mv
+
+	See merge request libtiff/libtiff!290
+
+2022-01-23  Roger Leigh  <rleigh@codelibre.net>
+
+	Correct CMake testing.
+	* Use functions rather than macros to avoid problems with variables in
+	  conditions (since macro arguments are not variables)
+	* Conditionally add to file lists and test program lists based upon the
+	  configuration options (e.g. JPEG and old-JPEG availability)
+	* Sync tests, files and option usage with current automake usage
+
+2022-01-19  Will Cohen  <willcohen@users.noreply.github.com>
+
+	autogen.sh: mv -f for config.sub and config.guess.
+
+2022-01-12  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFYCbCrToRGBInit(): avoid Integer-overflow in gdal_TIFFYCbCrToRGBInit. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=43559
+
+2022-01-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_TIFFFillStrip_wrong_check' into 'master'
+	Fix sanity check in TIFFFillStrip()/TIFFFillStrile()
+
+	See merge request libtiff/libtiff!288
+
+2022-01-10  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFFillStrip()/TIFFFillStrile(): remove useless test.
+
+	Fix sanity check in TIFFFillStrip()/TIFFFillStrile()
+	A sanity check comparing the compressed vs uncompressed file that was
+	originally written 'correctly' but relied on undefined behaviour was
+	changed in 1b5e3b6a23827c33acf19ad50ce5ce78f12b3773 in an incorrect way.
+	Fix that. Credits to @burn for spotting this in
+	https://gitlab.com/libtiff/libtiff/-/issues/343#note_806089714
+
+2021-12-29  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'Fix_FieldName_NULL' into 'master'
+	Fix Issue #354 Segmentation Fault due to field_name=NULL
+
+	See merge request libtiff/libtiff!285
+
+2021-12-29  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'mingw-static' into 'master'
+	build: Fix static library imports in mingw
+
+	See merge request libtiff/libtiff!286
+
+2021-12-29  Biswapriyo Nath  <nathbappai@gmail.com>
+
+	build: Fix static library imports in mingw.
+	This defines LERC_STATIC while creating libtiff static library
+	in Win32 platform in presence of lerc library. Otherwise, the
+	static library import lerc APIs with dllimport attribute and
+	thus linked with shared lerc library.
+
+2021-12-28  Su_Laus  <sulau@freenet.de>
+
+	Fix Issue #354 Segmentation Fault due to field_name=NULL.
+
+2021-12-17  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_342' into 'master'
+	TIFFGetField(TIFFTAG_STRIPBYTECOUNTS/TIFFTAG_STRIPOFFSETS): return error if...
+
+	Closes #342
+
+	See merge request libtiff/libtiff!283
+
+2021-12-16  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFGetField(TIFFTAG_STRIPBYTECOUNTS/TIFFTAG_STRIPOFFSETS): return error if returned pointer is NULL (fixes #342)
+
+	tiff2pdf: validate TIFFGetField(input, TIFFTAG_STRIPBYTECOUNTS, &sbc) return (fixes #342)
+
+2021-12-16  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'master' into 'master'
+	fix raw2tiff floating point exception(fixes #338)
+
+	Closes #338
+
+	See merge request libtiff/libtiff!282
+
+2021-12-16  t.feng  <t.feng94@foxmail.com>
+
+	raw2tiff: check that band number if not zero to avoid floating point exception(fixes #338)
+
+2021-12-14  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_337' into 'master'
+	OJPEG: avoid assertion when using TIFFReadScanline() (fixes #337)
+
+	Closes #337
+
+	See merge request libtiff/libtiff!280
+
+2021-12-13  Even Rouault  <even.rouault@spatialys.com>
+
+	OJPEG: avoid assertion when using TIFFReadScanline() (fixes #337)
+	Note: my analyis of the issue would be that the use of the scanline API
+	is currently propably broken with OJPEG.
+
+2021-12-10  Even Rouault  <even.rouault@spatialys.com>
+
+	JPEG 12bit: make it easier for GDAL's RENAME_INTERNAL_LIBTIFF_SYMBOLS mode
+
+2021-12-09  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_lzw.c: other warning fixes.
+
+	tif_lzw.c: fix warnings of previous commit.
+
+2021-12-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'lzw_2gb_windows' into 'master'
+	LZW codec: fix support for strips/tiles > 2 GB on Windows
+
+	See merge request libtiff/libtiff!279
+
+2021-12-08  Even Rouault  <even.rouault@spatialys.com>
+
+	LZW codec: fix support for strips/tiles > 2 GB on Windows.
+
+2021-12-07  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_287' into 'master'
+	tiffinfo: add a -M switch to define the maximum heap allocation, and default...
+
+	Closes #287
+
+	See merge request libtiff/libtiff!278
+
+2021-12-06  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_319' into 'master'
+	TIFFReadDirectory: fix OJPEG hack (fixes #319)
+
+	Closes #319
+
+	See merge request libtiff/libtiff!277
+
+2021-12-06  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_309' into 'master'
+	TIFFAppendToStrip(): fix rewrite-in-place logic (fixes #309)
+
+	Closes #309
+
+	See merge request libtiff/libtiff!276
+
+2021-12-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'b1' into 'master'
+	Fix resource leak on error path
+
+	See merge request libtiff/libtiff!263
+
+2021-12-05  bonniegong  <yuanjungong96@gmail.com>
+
+	rast2tiff: Fix resource leak on error path.
+
+2021-12-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffsplit-leak' into 'master'
+	tiffsplit.c: Fix memleak before exit
+
+	See merge request libtiff/libtiff!270
+
+2021-12-05  Even Rouault  <even.rouault@spatialys.com>
+
+	tiffinfo: add a -M switch to define the maximum heap allocation, and default it to 256 MiB (fixes #287)
+
+	tiffinfo: fix read of invalid pointer in TIFFReadRawDataTiled() (fixes #295)
+
+2021-12-05  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFReadDirectory: fix OJPEG hack (fixes #319)
+	to avoid having the size of the strip arrays inconsistent with the
+	number of strips returned by TIFFNumberOfStrips(), which may cause
+	out-ouf-bounds array read afterwards.
+
+	One of the OJPEG hack that alters SamplesPerPixel may influence the
+	number of strips. Hence compute tif_dir.td_nstrips only afterwards.
+
+2021-12-04  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFAppendToStrip(): fix rewrite-in-place logic (fixes #309)
+	Properly reset tif_curoff when writing strips/tiles
+
+2021-12-03  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFReInitJPEG_12(): avoid warning about unused variable in -DNDEBUG.
+
+2021-12-01  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_316' into 'master'
+	TIFFReadCustomDirectory(): avoid crash when reading SubjectDistance tag on a non EXIF directory
+
+	Closes #316
+
+	See merge request libtiff/libtiff!273
+
+2021-12-01  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'VisualStudio_warnings_suppress' into 'master'
+	Suppress unnecessary warnings in Visual Studio in AppVeyor test.
+
+	See merge request libtiff/libtiff!234
+
+2021-11-30  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFReadCustomDirectory(): avoid crash when reading SubjectDistance tag on a non EXIF directory
+	Fixes #316
+
+	The Valgrind trace was
+	```
+	TIFFReadCustomDirectory: Warning, Unknown field with tag 37382 (0x9206) encountered.
+	==3277355== Invalid read of size 1
+	==3277355==    at 0x4842B60: memmove (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
+	==3277355==    by 0x48BB799: _TIFFmemcpy (tif_unix.c:346)
+	==3277355==    by 0x485B3CB: _TIFFVSetField (tif_dir.c:647)
+	==3277355==    by 0x485C125: TIFFVSetField (tif_dir.c:890)
+	==3277355==    by 0x485BEDC: TIFFSetField (tif_dir.c:834)
+	==3277355==    by 0x486DA9A: TIFFFetchSubjectDistance (tif_dirread.c:5826)
+	==3277355==    by 0x4869E35: TIFFReadCustomDirectory (tif_dirread.c:4530)
+	==3277355==    by 0x4869F0A: TIFFReadGPSDirectory (tif_dirread.c:4564)
+	==3277355==    by 0x10AA7A: main (tiffinfo.c:171)
+	==3277355==  Address 0x3fc856aaaaaaaaab is not stack'd, malloc'd or (recently) free'd
+	==3277355==
+	```
+
+2021-11-29  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'add-null-check' into 'master'
+	Added missing null check.
+
+	See merge request libtiff/libtiff!274
+
+2021-11-28  Dirk Lemstra  <dirk@lemstra.org>
+
+	Added missing null check.
+
+2021-11-26  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_print.c: remove duplicated if() in previous commit.
+
+2021-11-26  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'GPS_Print_BugFix' into 'master'
+	Fix Segmentation fault printing GPS directory if Altitude tag is present (tif_print.c/tiffinfo.c)
+
+	See merge request libtiff/libtiff!272
+
+2021-11-26  Su Laus  <sulau@freenet.de>
+
+	Fix Segmentation fault printing GPS directory if Altitude tag is present (tif_print.c/tiffinfo.c)
+
+2021-11-01  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'cmake_tiffconf' into 'master'
+	Fix STRIPCHOP_DEFAULT value in CMake builds
+
+	See merge request libtiff/libtiff!271
+
+2021-11-01  Even Rouault  <even.rouault@spatialys.com>
+
+	Fix STRIPCHOP_DEFAULT value in CMake builds.
+	CMake builds erroneously used value 1 instead of TIFF_STRIPCHOP, which
+	resulted in strip chopping not being enabled by default.
+
+2021-10-26  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_jpeg.c: typo fix.
+
+2021-10-24  Han Han  <hanhanzhiyeqianke@gmail.com>
+
+	tiffsplit.c: Fix memleak before exit.
+	Details of the memleak:
+	$ valgrind --leak-check=full tiffsplit id:001763,sync:fuzzer07,src:001641,+cov
+
+	==2090657==
+	==2090657== HEAP SUMMARY:
+	==2090657==     in use at exit: 13,517 bytes in 17 blocks
+	==2090657==   total heap usage: 41 allocs, 24 frees, 29,351 bytes allocated
+	==2090657==
+	==2090657== 2,473 (1,249 direct, 1,224 indirect) bytes in 1 blocks are definitely lost in loss record 10 of 13
+	==2090657==    at 0x484086F: malloc (vg_replace_malloc.c:381)
+	==2090657==    by 0x48BF35C: TIFFClientOpen (tif_open.c:118)
+	==2090657==    by 0x48CF058: TIFFFdOpen (tif_unix.c:209)
+	==2090657==    by 0x48CF0C4: TIFFOpen (tif_unix.c:248)
+	==2090657==    by 0x10954C: main (tiffsplit.c:91)
+	==2090657==
+	==2090657== 11,044 (1,300 direct, 9,744 indirect) bytes in 1 blocks are definitely lost in loss record 13 of 13
+	==2090657==    at 0x484086F: malloc (vg_replace_malloc.c:381)
+	==2090657==    by 0x48BF35C: TIFFClientOpen (tif_open.c:118)
+	==2090657==    by 0x48CF058: TIFFFdOpen (tif_unix.c:209)
+	==2090657==    by 0x48CF0C4: TIFFOpen (tif_unix.c:248)
+	==2090657==    by 0x1093D9: main (tiffsplit.c:75)
+	==2090657==
+	==2090657== LEAK SUMMARY:
+	==2090657==    definitely lost: 2,549 bytes in 2 blocks
+	==2090657==    indirectly lost: 10,968 bytes in 15 blocks
+	==2090657==      possibly lost: 0 bytes in 0 blocks
+	==2090657==    still reachable: 0 bytes in 0 blocks
+	==2090657==         suppressed: 0 bytes in 0 blocks
+	==2090657==
+	==2090657== For lists of detected and suppressed errors, rerun with: -s
+	==2090657== ERROR SUMMARY: 2 errors from 2 contexts (suppressed: 0 from 0)
+
+2021-10-20  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_webp.c: add explicit cast to please MSVC verbose warnings.
+
+	tif_webp.c: white space fixing.
+
+2021-10-04  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'work/amyspark/psd-blobs' into 'master'
+	Enable writing Photoshop blobs
+
+	See merge request libtiff/libtiff!269
+
+2021-10-04  L. E. Segovia  <amy@amyspark.me>
+
+	Enable writing Photoshop blobs.
+
+2021-09-29  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'remove_packbits_hack' into 'master'
+	PackBitsDecode: remove hack for when char is unsigned.
+
+	See merge request libtiff/libtiff!267
+
+2021-09-28  Even Rouault  <even.rouault@spatialys.com>
+
+	PackBitsDecode: remove hack for when char is unsigned.
+	The function has a hack for platforms where char is unsigned. This is
+	better replaced by making bp a int8_t* pointer, which is guaranteed to
+	be signed.
+
+2021-09-27  Even Rouault  <even.rouault@spatialys.com>
+
+	tiffcrop.c: remove useless 'set but not read' variables.
+
+2021-09-23  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_gdal_4538' into 'master'
+	TIFFAppendToStrip(): fix rewrite-in-place logic
+
+	See merge request libtiff/libtiff!266
+
+2021-09-23  Even Rouault  <even.rouault@spatialys.com>
+
+	TIFFAppendToStrip(): fix rewrite-in-place logic.
+	reproducable in particular with packbits compression.
+
+	Fixes https://github.com/OSGeo/gdal/issues/4538
+
+2021-09-17  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_lzw.c: silence compiler warning about set but not used variable with recent clang
+
+2021-09-07  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_cygwin' into 'master'
+	Fix build warnings on cygwin about 'argument 1 of type 'float[3]' with...
+
+	See merge request libtiff/libtiff!265
+
+2021-09-06  Even Rouault  <even.rouault@spatialys.com>
+
+	test/rational_precision2double.c: add missing curly braces to fix -Werror=misleading-indentation
+
+2021-09-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Fix build warnings on cygwin about 'argument 1 of type 'float[3]' with mismatched bound [-Werror=array-parameter=]'
+
+2021-09-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'rewrite-fix' into 'master'
+	Fix TIFFRewriteDirectory discarding directories after the rewritten one
+
+	See merge request libtiff/libtiff!264
+
+2021-09-05  Facundo Tuesca  <facu@tuesca.com>
+
+	tif_dirwrite.c: Fix TIFFRewriteDirectory discarding directories.
+	This fixes a bug caused by the `tif_lastdiroff` optimization when
+	rewriting directories.
+
+	Rewriting the Nth directory temporarily zeroes the pointer to it
+	(located in the N-1th directory) and relies on `TIFFLinkDirectory`
+	traversing the whole directory list to find the zeroed pointer and
+	linking the rewritten directory to it. Since `TIFFLinkDirectory` skips
+	the traversal when `tif_lastdiroff` is set, this change unsets it
+	to force the full traversal when rewriting a directory.
+
+	A test to catch this particular issue is also added.
+
+2021-09-01  Even Rouault  <even.rouault@spatialys.com>
+
+	test_directory.c: fix compiler warnings.
+
+2021-09-01  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'multipage-optimization' into 'master'
+	Keep track of last directory to improve performance for large multi-page files
+
+	See merge request libtiff/libtiff!262
+
+2021-08-28  Facundo Tuesca  <facu@tuesca.com>
+
+	Add field to keep track of last written directory.
+	This adds a new `tif_lastdiroff` field to the TIFF data structure
+	and uses it to store the offset of the last written directory.
+
+	Appending a new directory required traversing the whole file
+	to find the last directory. By keeping track of its offset in this
+	new field, the search is no longer necessary.
+
+	Since this offset is only stored in-memory, the first directory
+	append after opening a file will have to transverse the whole
+	directory list. Subsequent calls will have access to the last
+	offset, avoiding the transversal.
+
+2021-08-13  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_jpeg.c: fix memory leak on error code path for JPEG 12 bit (CID 1086702)
+
+2021-07-29  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'jpeg12' into 'master'
+	Enable JPEG 12bit support with a libjpeg that has a different ABI than the one for 8bit support
+
+	See merge request libtiff/libtiff!261
+
+2021-07-28  Even Rouault  <even.rouault@spatialys.com>
+
+	Reformat tif_jpeg.c and tif_jpeg_12.c with clang-format-10.
+
+2021-07-27  Even Rouault  <even.rouault@spatialys.com>
+
+	Enable JPEG 12bit support with a libjpeg that has a different ABI than the one for 8bit support
+	See https://github.com/OSGeo/gdal/pull/4139 for more details
+
+	Note: this hasn't been tested for standalone libtiff builds.
+
+2021-07-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'wip/export-targets' into 'master'
+	Export tiff targets
+
+	See merge request libtiff/libtiff!258
+
+2021-07-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'pkgconfig' into 'master'
+	Add version and requirements to pc file
+
+	See merge request libtiff/libtiff!256
+
+2021-07-09  Kai Pastor  <8989969-dg0yt@users.noreply.gitlab.com>
+
+	Fix version in libtiff-4.pc.in, and CMake build: Add requirements to pc file
+
+2021-07-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'cmake' into 'master'
+	Fix build issues with CMake 3.10
+
+	See merge request libtiff/libtiff!260
+
+2021-07-04  Kai Pastor  <dg0yt@darc.de>
+
+	Fix reconfiguration with cmake.
+
+	Fix build with CMake 3.10.
+
+2021-06-28  Milian Wolff  <milian.wolff@kdab.com>
+
+	Export tiff targets.
+	Fixes build when including libtiff as a cmake subproject into
+	another project and then installing a target from there which
+	depends on tiff. For example we could end up with:
+
+	```
+	CMake Error in 3rdParty/diplib/CMakeLists.txt:
+	  export called with target "DIP" which requires target "tiff" that is not in
+	  any export set.
+	```
+
+2021-06-21  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'libjpeg9d_support_simplification' into 'master'
+	tif_jpeg.c: simplify libjpeg 9d support (refs #266)
+
+	See merge request libtiff/libtiff!257
+
+2021-06-20  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_jpeg.c: simplify libjpeg 9d support (refs #266)
+	Credits to Guido Vollbeding for the suggestion
+
+2021-06-15  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'jpeg_in_tiff_jpeg_9d' into 'master'
+	tif_jpeg.c: workaround bug of libjpeg 9d that defers Huffman table creation
+
+	Closes #266
+
+	See merge request libtiff/libtiff!255
+
+2021-06-15  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'jpeg_disable_progressive_with_mozjpeg' into 'master'
+	tif_jpeg.c: do not emit progressive scans with mozjpeg and force optimize_coding
+
+	See merge request libtiff/libtiff!254
+
+2021-06-12  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_jpeg.c: with mozjpeg, disable emission of Huffman tables in JpegTables tag, and use optimize_coding
+
+2021-06-10  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_jpeg.c: workaround bug of libjpeg 9d that defers Huffman table creation
+	Fixes #266
+
+	libjpeg-9d no longer creates default Huffman tables in
+	jpeg_set_defaults(), which make their emission in the JpegTables tag no
+	longer possible. Workaround that by borrowing code from libjpeg to
+	manually create them when they are not initialized.
+
+2021-06-10  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_jpeg.c: do not emit progressive scans with mozjpeg.
+	Relates to #266
+
+	- On writing, explicitly disable progressive scans, which is normally
+	  not enabled, except with mozjpeg.
+	- On reading, emit a warning when encountering progressive scans.
+
+2021-06-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix-263' into 'master'
+	Fix memory leak in tiff2pdf
+
+	See merge request libtiff/libtiff!249
+
+2021-06-09  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'diizzyy-master-patch-20521' into 'master'
+	html: Add missing pages when using CMake
+
+	See merge request libtiff/libtiff!242
+
+2021-06-09  Daniel E  <daniel.engberg.lists@pyret.net>
+
+	html: Add missing pages when using CMake.
+
+2021-06-07  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'ci-reenable-cygwin' into 'master'
+	ci: Re-enable cygwin builds
+
+	See merge request libtiff/libtiff!252
+
+2021-06-06  Roger Leigh  <rleigh@codelibre.net>
+
+	ci: Re-enable cygwin builds.
+
+2021-06-06  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'ci-arm64' into 'master'
+	ci: Add arm64 build
+
+	See merge request libtiff/libtiff!251
+
+2021-06-06  Roger Leigh  <rleigh@codelibre.net>
+
+	ci: Add arm64 build.
+
+2021-06-05  Even Rouault  <even.rouault@spatialys.com>
+
+	_TIFFRewriteField(): fix when writing a IFD with a single tile that is a sparse one, on big endian hosts
+
+2021-06-02  Timothy Lyanguzov  <timothy.lyanguzov@sap.com>
+
+	Fix memory leak in tiff2pdf.
+
+2021-06-01  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'lzw_cleanup' into 'master'
+	tif_lzw.c: cleanup, no functional change
+
+	See merge request libtiff/libtiff!248
+
+2021-05-31  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_lzw.c: cleanup, no functional change.
+
+2021-05-22  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'appveyor_disable_cygwin' into 'master'
+	.appveyor.yml: disable cygwin configs for now as they are broken
+
+	See merge request libtiff/libtiff!247
+
+2021-05-22  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'zstd_reuse_objects' into 'master'
+	ZSTD codec: reuse compressor/decompressor objects
+
+	See merge request libtiff/libtiff!246
+
+2021-05-22  Even Rouault  <even.rouault@spatialys.com>
+
+	.appveyor.yml: disable cygwin configs for now as they are broken.
+
+	ZSTD codec: reuse compressor/decompressor objects.
+	No need to recreate them each time in the PreEncode/Decode functions.
+	They can be reused if already existing.
+
+2021-05-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'adobedeflate-fix' into 'master'
+	Fix all remaining uses of legacy Deflate compression id and warn on use
+
+	See merge request libtiff/libtiff!245
+
+2021-05-08  David Ryskalczyk  <david.rysk@gmail.com>
+
+	Fix all remaining uses of legacy Deflate compression id and warn on use.
+
+2021-05-06  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'improve_tiffinfo_tiffdump_for_gdal_tags' into 'master'
+	tiffinfo/tiffdump: improve output for GDAL tags
+
+	See merge request libtiff/libtiff!244
+
+2021-05-03  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'issue-252' into 'master'
+	Prevent adding root directory to include list
+
+	Closes #252 et #218
+
+	See merge request libtiff/libtiff!243
+
+2021-05-03  Even Rouault  <even.rouault@spatialys.com>
+
+	tiffinfo/tiffdump: improve output for GDAL tags.
+
+2021-04-29  Timothy Lyanguzov  <timothy.lyanguzov@sap.com>
+
+	Prevent adding root directory to include list.
+	there is a file VERSION in the root directory which clashes with C++20 standard header <version>
+	"config.h" file is created in "config" subdirectory to prevent adding "-I.." to generated Makefile
+
+	closes #218, #252
+
+2021-04-23  Laszlo Boszormenyi (GCS)  <gcs@debian.org>
+
+	fix TIFFReadRawStrip man and HTML page typo.
+	From https://github.com/conda-forge/libtiff-feedstock/blob/master/recipe/patches/fix_TIFFReadRawStrip_man_page_typo.patch
+
+2021-04-20  Even Rouault  <even.rouault@spatialys.com>
+
+	HOWTO-RELEASE: update.
+
+2021-04-18  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'lerc_zstd_deflate' into 'master'
+	Make LERC_SUPPORT conditional on ZLIB_SUPPORT. Make display of lerc options in tiffcp depend on actual zstd support.
+
+	See merge request libtiff/libtiff!239
+
+2021-04-18  Miguel Medalha  <medalist@sapo.pt>
+
+	Make LERC_SUPPORT conditional on ZLIB_SUPPORT. Make display of lerc options in tiffcp depend on actual zstd support.
+
+2021-04-17  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'vtorri_xz' into 'master'
+	automatic creation of xz archive when running make distcheck
+
+	See merge request libtiff/libtiff!238
+
+2021-04-16  Even Rouault  <even.rouault@spatialys.com>
+
+	iptcutil.c: fix bug in EOF comparison, spotted on NetBSD 9 earmv7hf-el.
+
+2021-04-16  Even Rouault  <even.rouault@spatialys.com>
+
+	libtiff v4.3.0 released
+
+2021-04-15  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'libjpeg12_cmake' into 'master'
+	libtiff/tif_config.h.cmake.in: surround LIBJPEG_12_PATH by double quotes
+
+	See merge request libtiff/libtiff!237
+
+2021-04-15  Even Rouault  <even.rouault@spatialys.com>
+
+	libtiff/tif_config.h.cmake.in: surround LIBJPEG_12_PATH by double quotes
+
+2021-04-14  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'lerc_api_change' into 'master'
+	tif_lerc.c: cope with API breakage in liblerc master
+
+	See merge request libtiff/libtiff!236
+
+2021-04-14  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_lerc.c: cope with API breakage in liblerc master.
+
+2021-04-08  Even Rouault  <even.rouault@spatialys.com>
+
+	libtiff: remove remaining #ifdef WIN32 and use PRI formatting.
+
+2021-03-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'kmilos-master-patch-73187' into 'master'
+	tiffcp: Remove LZW help text, preset not supported
+
+	See merge request libtiff/libtiff!229
+
+2021-03-10  Miloš Komarčević  <miloskomarcevic@aim.com>
+
+	tiffcp: Remove LZW help text, preset not supported.
+
+2021-03-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'feature/lerc' into 'master'
+	Add LERC Compression Plugin (closes: #213)
+
+	Closes #213
+
+	See merge request libtiff/libtiff!228
+
+2021-03-10  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'cmake-fixes' into 'master'
+	CMake fixes
+
+	Closes #246 et #245
+
+	See merge request libtiff/libtiff!232
+
+2021-03-09  Roger Leigh  <rleigh@codelibre.net>
+
+	cmake: Correct FindCMath.
+
+	cmake: Correct ZSTD_USABLE typo.
+
+2021-03-07  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'cmake-find-lib-prefixes' into 'master'
+	cmake: Correct find lib prefixes for Deflate and JBIG
+
+	See merge request libtiff/libtiff!231
+
+2021-03-07  Roger Leigh  <rleigh@codelibre.net>
+
+	cmake: FindJBIG uses lib prefix on Windows.
+
+	cmake: FindDeflate uses lib prefix on Windows.
+
+2021-03-07  Even Rouault  <even.rouault@spatialys.com>
+
+	TWebPDecode(): avoid potential overflow on multiplication (CID 1472928)
+
+	TIFFReadDirEntryArrayWithLimit(): avoid false positive from Coverity Scan regarding out-of-bounds access (CID 1472927)
+
+	tif_dirwrite.c: avoid setting a variable that is not read afterwards.
+
+2021-03-07  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'coverity-fixes' into 'master'
+	Coverity fixes (high impact)
+
+	See merge request libtiff/libtiff!227
+
+2021-03-07  Roger Leigh  <rleigh@codelibre.net>
+
+	Fix high-impact Coverity issues (resource leaks).
+	The issues are in the tests and tiffcrop, not the core library. Real issues, but not high risk.
+
+	Use to test if Coverity integration is performing properly on merge.
+
+2021-03-07  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'fix_tif_fax3_encoder_regression' into 'master'
+	tif_fax3.c: fix master regression in encoder
+
+	See merge request libtiff/libtiff!230
+
+2021-03-07  Even Rouault  <even.rouault@spatialys.com>
+
+	tif_fax3.c: fix master regression in encoder.
+	Fix issue introduced in 39a74eede0455ec8ee334dcddf71f5354d508d8b
+
+	Spotted by gdal's tiff_write_76 test
+
+2021-03-07  Antonio Valentino  <antonio.valentino@tiscali.it>
+
+	Add LERC support in CMake.
+
+2021-03-07  Antonio Valentino  <Antonio.Valentino@esa.int>
+
+	Add LERC support in configure.ac and Makefile.am.
+
+2021-03-07  Antonio Valentino  <antonio.valentino@tiscali.it>
+
+	Add LERC support to tiffcp.
+
+2021-03-07  Antonio Valentino  <Antonio.Valentino@esa.int>
+
+	Add LERC plugin.
+	The lerc plugin code has been copyed form GDAL.
+
+2021-03-06  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'display_tool_purpose' into 'master'
+	TIFF tools: insert a line of text summarizing each tool's purpose
+
+	See merge request libtiff/libtiff!214
+
+2021-03-06  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'cmake-update' into 'master'
+	CMake updates
+
+	See merge request libtiff/libtiff!224
+
+2021-03-06  Even Rouault  <even.rouault@spatialys.com>
+
+	tiff.h: typo fix in comment.
+
+2021-02-15  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'remove-travis' into 'master'
+	ci: Remove unused Travis-CI support
+
+	See merge request libtiff/libtiff!226
+
+2021-02-14  Roger Leigh  <rleigh@codelibre.net>
+
+	ci: Remove unused Travis-CI support.
+
+2021-02-14  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'ci-coverity' into 'master'
+	Enable Coverity static analysis with CI pipeline
+
+	See merge request libtiff/libtiff!225
+
+2021-02-14  Roger Leigh  <rleigh@codelibre.net>
+
+	ci: Add Coverity static analysis job.
+
+	ci: Use custom libtiff CI image.
+
+2021-02-13  Roger Leigh  <rleigh@codelibre.net>
+
+	cmake: Add release target.
+
+	cmake: Remove empty contrib files.
+
+	cmake: Tidy toplevel.
+
+	cmake: Move pkg-config support to PkgConfig.cmake.
+
+	cmake: Move library feature options to CXXLibraryFeatures.cmake.
+
+	cmake: Move C++ support to CXXLibrary.cmake.
+
+	cmake: Add FindCMath to handle libm linking portably.
+
+	cmake: Tidy unused includes.
+
+	cmake: Rename release date to build date.
+
+	cmake: Compute timestamp portably.
+
+	cmake: Remove remaining uses of report_values()
+
+	cmake: Move JPEG12 checks to JPEGCodec.cmake.
+
+	cmake: Move OpenGL checks to OpenGLChecks.cmake.
+
+	cmake: Move OpenGL checks to OpenGLChecks.cmake.
+
+	cmake: Move WebP codec support to WebPCodec.cmake.
+
+	cmake: Use imported targets for WebP.
+
+	cmake: Add FindWebP.
+
+	cmake: Move ZSTD codec support to ZSTDCodec.cmake.
+
+	cmake: Use imported targets for ZSTD.
+
+	cmake: Add FindZSTD.
+
+	cmake: Move LZMA codec support to LZMACodec.cmake.
+
+	cmake: Use imported targets for LibLZMA.
+
+	cmake: Move JBIG codec support to JBIGCodec.cmake.
+
+	cmake: Use imported targets for JBIG.
+
+	cmake: Add FindJBIG.
+
+	cmake: Move PixarLog codec support to PixarLogCodec.cmake.
+
+	cmake: Report system name in configuration report.
+
+	cmake: Move JPEG codec support to JPEGCodec.cmake.
+
+	cmake: Use imported targets for JPEG.
+
+	cmake: Move Deflate codec support to DeflateCodec.cmake.
+
+	cmake: Use imported targets for ZLIB and Deflate.
+
+	cmake: Add FindDeflate.
+
+	cmake: Move symbol checks to SymbolChecks.cmake.
+
+	cmake: Move include checks to IncludeChecks.cmake.
+
+	cmake: Move all autotools logic to separate files.
+
+	cmake: Move internal codec options to InternalCodecs.cmake.
+
+	cmake: Move LFS check to LargeFileSupport.cmake.
+
+	cmake: Move Win32 IO feature to WindowsIOFeature.cmake.
+
+	cmake: Move processor capability checks to ProcessorChecks.cmake.
+
+	cmake: Move type size checks to TypeSizeChecks.cmake.
+
+	cmake: Move linker checks to LinkerChecks.cmake.
+
+	cmake: Move warning options to CompilerChecks.
+
+	cmake: Move version introspection to AutotoolsVersion.cmake.
+
+	cmake: Move compiler checks to CompilerChecks.cmake.
+
+	cmake: Split into helper scripts.
+
+2021-02-08  Roger Leigh  <rleigh@codelibre.net>
+
+	cmake: Use target_sources.
+
+	libport: Adjust header and library to only define and link if required.
+	* Make libport an OBJECT library when in use, otherwise a dummy
+	  INTERFACE library
+	* libport.h will work if getopt is present or not present.  If
+	  present, will fall back to <unistd.h>, else will define
+	  symbols
+	* Add generated libport_config.h to define HAVE_GETOPT and HAVE_UNISTD_H
+	* dummy.c no longer needed with CMake
+	* libtiff/libtiffxx no longer link with libport
+
+	cmake: Remove unnecessary extra_dist usage.
+	Only makes sense in the context of Automake.  Was carried over
+	for reference while porting, but is not needed.
+
+	cmake: Update minimum version and policy version to 3.9.
+
+2021-02-08  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'remove-nmake' into 'master'
+	Remove NMake build support
+
+	See merge request libtiff/libtiff!223
+
+2021-02-08  Roger Leigh  <rleigh@codelibre.net>
+
+	Remove NMake build support.
+	The functionality provided by the NMake build is now completely
+	superseded by the CMake build.
+
+2021-02-08  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'warning-fixes' into 'master'
+	Warning fixes
+
+	See merge request libtiff/libtiff!222
+
+2021-02-07  Miguel Medalha  <medalist@sapo.pt>
+
+	Update tiffsplit.c.
+
+	Reinsert summary line lost after conflicting merge.
+
+	Merge branch 'master' into 'display_tool_purpose'
+	# Conflicts:
+	#   tools/tiffsplit.c
+
+2021-02-06  Roger Leigh  <rleigh@codelibre.net>
+
+	ci: Enable fatal warnings with -Werror for AppVeyor/GCC.
+
+	ci: Enable fatal warnings with -Werror for GitLab CI.
+
+	tif_zstd.c: Remove unused variable warning.
+
+	custom_dir_EXIF_231: Remove case statement fallthrough.
+
+	custom_dir_EXIF_231: Correct use of strncpy.
+
+	Correct include order.
+
+	Eliminate implict fallthrough usage.
+	Use simple loops in place of manual loop unrolling.  Rely on
+	the compiler optimiser to unroll loops when appropriate.
+
+	Suppress potential unused variable warning.
+
+	Suppress warnings or avoid case statement fallthrough.
+
+2021-02-05  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'c99-ssize_t-fixes' into 'master'
+	C99 ssize_t fixes
+
+	Closes #239
+
+	See merge request libtiff/libtiff!219
+
+2021-02-05  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffsplit-too-many-args' into 'master'
+	tiffsplit.c: exit with EXIT_FAILURE if there are extra args on the command line
+
+	See merge request libtiff/libtiff!209
+
+2021-02-05  Roger Leigh  <rleigh@codelibre.net>
+
+	Add additional TIFF_SSIZE_FORMAT uses.
+
+2021-02-04  Roger Leigh  <rleigh@codelibre.net>
+
+	NMake fixes for size type formatting.
+
+	Add TIFF_SIZE_FORMAT for portable use of %z.
+	MinGW64 does support %z, but it issues a compiler warning.
+
+	Align Autoconf tif_config.h and CMake tif_config.cmake.in.
+
+	Use TIFF_SSIZE_FORMAT for formatting tmsize_t.
+
+2021-02-04  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'remove-lcc' into 'master'
+	Remove Makefile.lcc
+
+	See merge request libtiff/libtiff!221
+
+2021-02-04  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'remove-scons' into 'master'
+	Remove SCons build
+
+	See merge request libtiff/libtiff!220
+
+2021-02-03  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'c99-snprintf' into 'master'
+	Use C99 snprintf
+
+	See merge request libtiff/libtiff!217
+
+2021-02-03  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'tiff2ps-const' into 'master'
+	tiff2ps.c: string literals must be const char *, not char *
+
+	See merge request libtiff/libtiff!202
+
+2021-02-03  Roger Leigh  <rleigh@codelibre.net>
+
+	Remove SCons build.
+	Unmaintained for 16 years.
+
+2021-02-03  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'codec_summary' into 'master'
+	Modify 'CMakeLists.txt' to produce a meaningful summary of external codecs
+
+	See merge request libtiff/libtiff!192
+
+2021-02-03  Roger Leigh  <rleigh@codelibre.net>
+
+	Remove Makefile.lcc.
+	Unmaintained for 22 years.
+
+2021-02-02  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'ci-32-bit' into 'master'
+	ci: Build 32- and 64-bit MSVC variants
+
+	See merge request libtiff/libtiff!218
+
+2021-02-01  Roger Leigh  <rleigh@codelibre.net>
+
+	ci: Build 32- and 64-bit MSVC variants.
+
+	Use C99 snprintf.
+
+2021-02-01  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'c99-strtol' into 'master'
+	Use C99 strtol, strtoul, strtoll and strtoull
+
+	See merge request libtiff/libtiff!216
+
+2021-01-31  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'c99-inline' into 'master'
+	Use C99 inline
+
+	See merge request libtiff/libtiff!215
+
+2021-01-31  Roger Leigh  <rleigh@codelibre.net>
+
+	Use C99 strtol, strtoul, strtoll and strtoull.
+
+	tif_fax3: Use C99 inline.
+
+	Remove inline feature tests and defines.
+	Available as a standard feature with C99.
+
+2021-01-30  Miguel Medalha  <medalist@sapo.pt>
+
+	Update raw2tiff.c (remove duplicate description of tool)
+
+2021-01-30  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'c99-format-strings' into 'master'
+	C99 format strings
+
+	See merge request libtiff/libtiff!211
+
+2021-01-29  Miguel Medalha  <medalist@sapo.pt>
+
+	Update tiffdither.c (2 tabs caused slight misalignment of lines in usage info output)
+
+2021-01-28  Miguel Medalha  <medalist@sapo.pt>
+
+	Update ppm2tiff.c (slight misalignment of lines in usage info output)
+
+	Update tiffset.c (small misalignment of lines in usage info output)
+
+2021-01-28  Medalha  <medalist@sapo.pt>
+
+	Display tool purpose.
+
+2021-01-28  Thomas Bernard  <miniupnp@free.fr>
+
+	tiff2ps.c: string literals must be const char *, not char *
+
+2021-01-28  Roger Leigh  <rleigh@codelibre.net>
+
+	libtiff: Use PRI format flags and remove unnecessary casts.
+
+2021-01-27  Roger Leigh  <rleigh@codelibre.net>
+
+	ascii_tag: Use PRI format flags and remove unnecessary casts.
+
+	check_tag: Use PRI format flags and remove unnecessary casts.
+
+	custom_dir_EXIF_231: Use PRI format flags and remove unnecessary casts.
+
+	short_tag: Use PRI format flags and remove unnecessary casts.
+
+	strip_rw: Use PRI format flags and remove unnecessary casts.
+
+	fax2tiff: Use PRI format flags and remove unnecessary casts.
+
+	ppm2tiff: Correct format strings.
+
+	raw2tiff: Use PRI format flags and remove unnecessary casts.
+
+	rgb2ycbcr: Use PRI format flags and remove unnecessary casts.
+
+	tiff2pdf: Use PRI format flags and remove unnecessary casts.
+
+	tiff2ps: Use PRI format flags and remove unnecessary casts.
+
+	tiff2rgba: Use PRI format flags and remove unnecessary casts.
+
+	tiffcmp: Use PRI format flags and remove unnecessary casts.
+
+	tiffcp: Use PRI format flags and remove unnecessary casts.
+
+	tiffcrop: Use PRI format flags and remove unnecessary casts.
+
+	tiffinfo: Use PRI format flags and remove unnecessary casts.
+
+	tiffdump: Use PRI format flags and remove unnecessary casts.
+
+2021-01-27  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'c99-require-stdtypes' into 'master'
+	Use standard C99 integer types
+
+	See merge request libtiff/libtiff!205
+
+2021-01-26  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'reserve_COMPRESSION_JXL' into 'master'
+	tiff.h: reserve COMPRESSION_JXL=50002 for JPEGXL
+
+	See merge request libtiff/libtiff!210
+
+2021-01-22  Even Rouault  <even.rouault@spatialys.com>
+
+	tiff.h: reserve COMPRESSION_JXL=50002 for JPEGXL.
+
+2021-01-22  Kurt Schwehr  <schwehr@google.com>
+
+	tiffsplit.c: exit with EXIT_FAILURE if there are extra args on the command line
+	e.g. tiffsplit in.tif a_prefix_ junk
+
+2021-01-22  Roger Leigh  <rleigh@codelibre.net>
+
+	Add and enable TIFF_DISABLE_DEPRECATED for internal use.
+
+	Add typedef deprecations for GCC/Clang and MSVC.
+
+	Use standard C99 integer types.
+
+2021-01-20  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'ubuntu-zstd-wepb' into 'master'
+	gitlab-ci : use libzstd-dev and libwebp-dev ubuntu packages
+
+	See merge request libtiff/libtiff!208
+
+2021-01-20  Thomas Bernard  <miniupnp@free.fr>
+
+	gitlab-ci : use libzstd-dev and libwebp-dev ubuntu packages.
+	should replace !206
+
+2021-01-20  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'issue-232' into 'master'
+	tiff2ps: exit the loop in case of error
+
+	Closes #232
+
+	See merge request libtiff/libtiff!201
+
+2021-01-20  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffsplit-exit-status' into 'master'
+	tiffsplit: Exit with EXIT_FAILURE if unable to open the input file.
+
+	See merge request libtiff/libtiff!207
+
+2021-01-20  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'config-cleanup' into 'master'
+	Remove HAVE_INTTYPES_H, HAVE_LFIND & lfind, HAVE_SEARCH_H & include <search.h>
+
+	See merge request libtiff/libtiff!203
+
+2021-01-19  Kurt Schwehr  <schwehr@google.com>
+
+	tiffsplit: Exit with EXIT_FAILURE if unable to open the input file.
+
+2021-01-18  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcmp' into 'master'
+	tiffcmp: fix comparaison with pixels that are fractional number of bytes
+
+	Closes #53
+
+	See merge request libtiff/libtiff!141
+
+2021-01-15  Kurt Schwehr  <schwehr@google.com>
+
+	CMakeLists.txt: Remove search for lfind.
+
+	Remove HAVE_INTTYPES_H, HAVE_LFIND & lfind, HAVE_SEARCH_H & include <search.h>
+	- HAVE_INTTYPES_H is replaced with TIFF_HAVE_INTTYPES_H
+	- tif_dirinfo.c has a static td_lfind
+
+2021-01-15  Thomas Bernard  <miniupnp@free.fr>
+
+	tiffcmp: fix comparaison with pixels that are fractional number of bytes
+	For exemple : 4bits per sample + 3 samples per pixel => 1.5 bytes per pixel
+
+	tiff2ps: exit the loop in case of error.
+	fixes #232
+
+2021-01-15  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'rm-strcasecmp' into 'master'
+	Remove port/strcasecmp.c as strcasecmp is not currently used in libtiff.
+
+	Closes #235
+
+	See merge request libtiff/libtiff!199
+
+2021-01-15  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'std-int-types' into 'master'
+	Use standard C99 integer types
+
+	See merge request libtiff/libtiff!185
+
+2021-01-13  Kurt Schwehr  <schwehr@google.com>
+
+	Remove port/strcasecmp.c as strcasecmp is not currently used in libtiff.
+	Fixes #235
+
+2021-01-10  Miguel Medalha  <medalist@sapo.pt>
+
+	Update CMakeLists.txt. Delete unnecessary line from libdeflate codec support section
+
+2021-01-10  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'include_libport' into 'master'
+	tiff tools and libtiff/mkg3states: include 'libport.h', remove local definition of 'getopt()'
+
+	See merge request libtiff/libtiff!198
+
+2021-01-09  Miguel Medalha  <medalist@sapo.pt>
+
+	Update CMakeLists.txt. Cleanup of indentation space. Removal of leading '/' from webp include dir.
+
+2021-01-09  miguel  <medalist@sapo.pt>
+
+	cmake: Use target_include_directories correctly.
+
+2021-01-09  Roger Leigh  <rleigh@codelibre.net>
+
+	cmake: Use target_include_directories correctly.
+
+	cmake: Drop unnecessary TOOL_INCLUDES.
+
+2021-01-07  Roger Leigh  <rleigh@codelibre.net>
+
+	cmake: Use target_include_directories correctly.
+
+2021-01-07  miguel  <medalist@sapo.pt>
+
+	tiff tools and libtiff/mkg3states: include 'libport.h', remove local definition of 'getopt()'
+
+2021-01-07  Miguel Medalha  <medalist@sapo.pt>
+
+	Update CMakeLists.txt.
+
+2021-01-07  miguel  <medalist@sapo.pt>
+
+	tiff tools: include 'libport.h', remove local definition of 'getopt()'
+
+2021-01-06  Roger Leigh  <rleigh@codelibre.net>
+
+	Remove conditional use of <string.h>
+
+	cmake: Drop dlfcn.h check.
+
+	cmake: Remove duplicate line.
+
+	Use stdint.h types when available.
+
+2021-01-05  Olivier Paquet  <olivier.paquet@gmail.com>
+
+	Merge branch 'iptcutil' into 'master'
+	contrib/iptcutil.c: set '#ifdef _WIN32' (was '#ifdef WIN32', which failed at build time)
+
+	See merge request libtiff/libtiff!197
+
+2021-01-05  miguel  <medalist@sapo.pt>
+
+	tiff tools: include 'libport.h', remove local definition of 'getopt()'
+
+	contrib/iptcutil.c - set '#ifdef _WIN32' (was '#ifdef WIN32', which failed at build time)
+
+2021-01-04  Even Rouault  <even.rouault@spatialys.com>
+
+	tools/CMakeLists.txt: add comment about rgb2ycbcr and thumbnail not to be installed
+
+2021-01-04  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'revert-5331ed49' into 'master'
+	Revert "Merge branch 'install_targets' into 'master'"
+
+	See merge request libtiff/libtiff!196
+
+2021-01-04  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'drop-wince' into 'master'
+	Remove non-functional WinCE support
+
+	See merge request libtiff/libtiff!188
+
+2021-01-04  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'drop-vms' into 'master'
+	Remove non-functional VMS support
+
+	See merge request libtiff/libtiff!187
+
+2021-01-03  Even Rouault  <even.rouault@spatialys.com>
+
+	Revert "Merge branch 'install_targets' into 'master'"
+	This reverts merge request !193
+
+2021-01-03  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'tiffcp_b_parameter' into 'master'
+	tiffcp: Remove unnecessary reference to compression from usage info for -b parameter
+
+	See merge request libtiff/libtiff!189
+
+2021-01-03  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'cmake-faxtable' into 'master'
+	cmake: Add faxtable target
+
+	See merge request libtiff/libtiff!186
+
+2021-01-03  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'install_targets' into 'master'
+	Update 'CMakeLists.txt' from 'tools'.
+
+	See merge request libtiff/libtiff!193
+
+2021-01-03  Miguel Medalha  <medalist@sapo.pt>
+
+	Update 'CMakeLists.txt' from 'tools'.
+
+	Update CMakeLists.txt.
+
+2021-01-03  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'usage_info' into 'master'
+	thumbnail: Rename constant variable 'stuff' to 'usage_info'
+
+	See merge request libtiff/libtiff!190
+
+2021-01-03  Miguel Medalha  <medalist@sapo.pt>
+
+	Modified 'CMakeLists.txt' to produce a meaningful summary of external codecs support.
+
+	Removed unnecessary reference to compression from usage info for -b parameter
+
+	Constant variable 'stuff' renamed to 'usage_info' for consistency with the other tools
+
+2021-01-02  Roger Leigh  <rleigh@codelibre.net>
+
+	Remove non-functional VMS support.
+
+	Remove non-functional WinCE support.
+
+2021-01-02  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'codespell' into 'master'
+	Fix spelling mistakes.
+
+	See merge request libtiff/libtiff!183
+
+2021-01-02  Kurt Schwehr  <schwehr@google.com>
+
+	ChangeLog: Remove extraneous character from prior commit - 00fe7828.
+
+2021-01-02  Roger Leigh  <rleigh@codelibre.net>
+
+	Merge branch 'codespell-custom_dir_EXIF_231' into 'master'
+	custom_dir_EXIF_231.c: dos2unix and codespell
+
+	See merge request libtiff/libtiff!184
+
+2021-01-01  Roger Leigh  <rleigh@codelibre.net>
+
+	mkg3states: Sync generator with current generated file content.
+
+	cmake: Add faxtable and mkg3states targets.
+
+2020-12-31  Kurt Schwehr  <schwehr@google.com>
+
+	custom_dir_EXIF_231.c: dos2unix and codespell.
+	additonal, Varable, greather, alwasy
+
+2020-12-31  Kurt Schwehr  <schwehr@google.com>
+
+	Fix spelling mistakes.
+	Found with:
+
+	codespell --version
+	1.17.1
+
+2020-12-29  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'remove-some-vms-ifdef' into 'master'
+	Remove "ifdef VMS" that are no longer necessary.
+
+	See merge request libtiff/libtiff!181
+
+2020-12-29  Kurt Schwehr  <schwehr@google.com>
+
+	Remove "ifdef VMS" that are no longer necessary.
+	Both sides of the if are now the same.
+
+2020-12-29  Bob Friesenhahn  <bfriesen@simple.dallas.tx.us>
+
+	_TIFFBuiltinCODECS should be const.
+
+2020-12-28  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'issue-202' into 'master'
+	tiff2pdf.c: check that tiff_datasize fits in a signed tsize_t
+
+	Closes #202
+
+	See merge request libtiff/libtiff!166
+
+2020-12-28  Even Rouault  <even.rouault@spatialys.com>
+
+	Merge branch 'w_report-when-libdeflate-is-found' into 'master'
+	CMakeLists.txt: Report when libdeflate is found
+
+	See merge request libtiff/libtiff!175
+
+2020-12-28  Bob Friesenhahn  <bfriesen@simple.dallas.tx.us>
+
+	Declare gpsFields as static const.
+
+2020-12-28  Bob Friesenhahn  <bfriesen@simple.dallas.tx.us>
+
+	Merge branch 'tools-reduce-initialized-data' into 'master'
+	Reduce initialized data by making more data const and simplifying usage() implementation.
+
+	See merge request libtiff/libtiff!180
+
+2020-12-28  Bob Friesenhahn  <bfriesen@simple.dallas.tx.us>
+
+	Reduce initialized data by making more data const and simplifying usage() implementation.
+
+2020-12-27  Bob Friesenhahn  <bfriesen@simple.dallas.tx.us>
+
+	Merge branch 'master' into 'master'
+	tiff tools: display of compression methods is now conditional instead of hard-coded
+
+	See merge request libtiff/libtiff!176
+
+2020-12-27  Bob Friesenhahn  <bfriesen@simple.dallas.tx.us>
+
+	Merge branch 'build' into 'master'
+	Fix wrong URL for fetching config.guess and config.sub
+
+	See merge request libtiff/libtiff!177
+
+2020-12-26  Chocobo1  <Chocobo1@users.noreply.github.com>
+
+	Fix wrong URL for fetching config.guess and config.sub.
+
+2020-12-25  miguel  <medalist@sapo.pt>
+
+	tiff tools: made display of compression methods and their parameters conditional on their actual availability
+
+2020-12-20  Bob Friesenhahn  <bfriesen@simple.dallas.tx.us>
+
+	autogen.sh now updates config.guess and config.sub from master gnulib version.
+
+2020-12-19  Bob Friesenhahn  <bfriesen@simple.dallas.tx.us>
+
+	Add a https://libtiff.gitlab.io/libtiff/ link.
+
+	Remove stray character in URL area.
+
+	Changes for 4.2.0 release.
+
+	Changes for 4.2.0 release.
+
 2020-12-19  Bob Friesenhahn  <bfriesen@simple.dallas.tx.us>
 
 	* libtiff 4.2.0 released.
@@ -469,7 +5198,7 @@
 	see #17
 
 	tiffmedian: shopw usage on stdout when -h is used.
-	aslo use EXIT_SUCCESS/EXIT_FAILURE
+	also use EXIT_SUCCESS/EXIT_FAILURE
 	see #17
 
 	tiffinfo: print usage on stdout when -h is used.
@@ -674,7 +5403,7 @@
 2020-03-27  Thomas Bernard  <miniupnp@free.fr>
 
 	tiff2pdf: fix "raw" copy of Deflate streams.
-	The Predictor parametter was not copied from the source tiff to the PDF.
+	The Predictor parameter was not copied from the source tiff to the PDF.
 	fixes #48 / http://bugzilla.maptools.org/show_bug.cgi?id=2442
 
 2020-03-26  Thomas Bernard  <miniupnp@free.fr>
@@ -982,7 +5711,7 @@
 	- EXIF_GPS_upgrade rebased onto c8c5309b765ef4ff097d2aaffbdb8f403db8967d (Merge branch 'Rational2DoublePrecision_correction' into 'master')
 	and adapted:
 	- tif_dirinfo.c:         All rational tags set to TIFF_SETGET_FLOAT but only the GPSTAG_ tags set to TIFF_SETGET_DOUBLE.
-	- custom_dir_EXIF_231.c: Editorials amended and gcc warnigs fixed.
+	- custom_dir_EXIF_231.c: Editorials amended and gcc warnings fixed.
 	- CMakeLists.txt: add_test(NAME "custom_dir_EXIF_231"  COMMAND "custom_dir_EXIF_231")  added.
 
 2020-03-07  Even Rouault  <even.rouault@spatialys.com>
@@ -1006,7 +5735,7 @@
 	fix #55
 	http://bugzilla.maptools.org/show_bug.cgi?id=2505
 
-	Patch originally submited by Ludolf Holzheid <ludolf.holzheid@gmx.de>
+	Patch originally submitted by Ludolf Holzheid <ludolf.holzheid@gmx.de>
 
 2020-03-06  Even Rouault  <even.rouault@spatialys.com>
 
@@ -1129,7 +5858,7 @@
 
 2020-02-29  Su_Laus  <sulau@freenet.de>
 
-	tif_dirwrite.c: bugfix DoubleToSrational(), which returns plain signed interger values always as unsigned rationals. Add a test into rational_precision2double.c for "-1.0" and some editorials in tif_dirwrite.c. (code is related to 6df997c786928757caea0dd68d26ea5f098f49df changes).
+	tif_dirwrite.c: bugfix DoubleToSrational(), which returns plain signed integer values always as unsigned rationals. Add a test into rational_precision2double.c for "-1.0" and some editorials in tif_dirwrite.c. (code is related to 6df997c786928757caea0dd68d26ea5f098f49df changes).
 
 2020-02-29  Even Rouault  <even.rouault@spatialys.com>
 
@@ -1174,7 +5903,7 @@
 
 	Rational with Double Precision Upgrade.
 	Unfortunately, custom rational tags (TIFF_RATIONAL with field_bit=FIELD_CUSTOM) are defined as TIFF_SETGET_DOUBLE
-	but for the reading interface and LibTiff internally they are stored ALLWAYS as floating point SINGLE precision.
+	but for the reading interface and LibTiff internally they are stored ALWAYS as floating point SINGLE precision.
 	Double precision custom rational tags are not supported by LibTiff.
 
 	For the GPS tags in WGS84 a higher accuracy / precision is needed.
@@ -1269,7 +5998,7 @@
 	raw2tiff: avoid divide by 0.
 	fixes #151 / http://bugzilla.maptools.org/show_bug.cgi?id=2839
 
-	first memcmp() lines before computing corellation
+	first memcmp() lines before computing correlation
 	and always avoid divide by 0 anyway
 
 2020-02-09  Even Rouault  <even.rouault@spatialys.com>
@@ -1294,7 +6023,7 @@
 	tiffcrop.c:4027:20: runtime error: left shift of 190 by 24 places cannot be represented in type 'int'
 
 	C treats (byte << 24) as an int expression.
-	casting explicitely to unsigned type uint32 avoids the problem.
+	casting explicitly to unsigned type uint32 avoids the problem.
 
 	the same issue has been fixed elsewhere with a24213691616e7cd35aa3e2805493de80c7e4fcf
 
@@ -1523,7 +6252,7 @@
 
 2019-08-25  Even Rouault  <even.rouault@spatialys.com>
 
-	JPEG: avoid use of unintialized memory on corrupted files.
+	JPEG: avoid use of uninitialized memory on corrupted files.
 	Follow-up of cf3ce6fab894414a336546f62adc57f02590a22c
 	Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16602
 	Credit to OSS Fuzz
@@ -1587,7 +6316,7 @@
 	signed), which was especially easily triggered on 32-bit builds (with recent
 	enough compilers that assume that signed multiplication cannot overflow, since
 	this is undefined behaviour by the C standard). The original issue which lead to
-	this fix was trigged from tif_fax3.c
+	this fix was triggered from tif_fax3.c
 
 	There were also unsafe (implementation defied), and broken in practice on 64bit
 	builds, ways of checking that a uint64 fits of a (signed) tmsize_t by doing
@@ -1660,7 +6389,7 @@
 		- Discussion in https://gitlab.com/libtiff/libtiff/merge_requests/39
 		- http://bugzilla.maptools.org/show_bug.cgi?id=2540
 
-	Comments and indention adapted.
+	Comments and indentation adapted.
 
 	Preparation to rebase onto master
 
@@ -1680,7 +6409,7 @@
 	[00:02:58] -- CMAKE_HOST_SYSTEM_PROCESSOR set to AMD64
 	[00:02:58] -- HOST_FILLORDER set to FILLORDER_MSB2LSB
 
-	Ther reason is that we match the "amd64.*" lowercase string whereas
+	The reason is that we match the "amd64.*" lowercase string whereas
 	CMAKE_HOST_SYSTEM_PROCESSOR is set to AMD64 uppercase.
 
 2019-07-09  Even Rouault  <even.rouault@spatialys.com>
@@ -1690,13 +6419,13 @@
 2019-07-09  Even Rouault  <even.rouault@spatialys.com>
 
 	Merge branch 'fix_chromium_925269' into 'master'
-	OJPEG: avoid use of unintialized memory on corrupted files
+	OJPEG: avoid use of uninitialized memory on corrupted files
 
 	See merge request libtiff/libtiff!86
 
 2019-07-05  Even Rouault  <even.rouault@spatialys.com>
 
-	OJPEG: avoid use of unintialized memory on corrupted files.
+	OJPEG: avoid use of uninitialized memory on corrupted files.
 	Fixes https://bugs.chromium.org/p/chromium/issues/detail?id=925269
 	Patch from Lei Zhang with little adaptations.
 
@@ -1849,12 +6578,12 @@
 	arrays are only loaded when first accessed. This can speed-up the opening
 	of files stored on the network when just metadata retrieval is needed.
 	This mode has been used for years by the GDAL library when compiled with
-	its embeded libtiff copy.
+	its embedded libtiff copy.
 
 	To avoid potential out-of-tree code (typically codecs) that would use
 	the td_stripbytecount and td_stripoffset array inconditionnaly assuming they
 	have been loaded, those have been suffixed with _p (for protected). The
-	use of the new functions mentionned below is then recommended.
+	use of the new functions mentioned below is then recommended.
 
 	Another addition of this commit is the capability of loading only the
 	values of the offset/bytecount of the strile of interest instead of the
@@ -1870,7 +6599,7 @@
 	if a strile is present or not without decompressing the data, or updating
 	an existing sparse file.
 	They will also be used to enable a future enhancement where client code can entirely
-	skip bytecount loading in some situtations
+	skip bytecount loading in some situations
 
 	A new test/defer_strile_loading.c test has been added to test the above
 	capabilities.
@@ -2141,8 +6870,8 @@
 	Also the values were not properly calculated. It should be
 	255-x, 15-x, 3-x for bps 8, 4, 2.
 
-	But anyway it is easyer to invert all bits as 255-x = ~x, etc.
-	(substracting from a binary number composed of all 1 is like inverting
+	But anyway it is easier to invert all bits as 255-x = ~x, etc.
+	(subtracting from a binary number composed of all 1 is like inverting
 	the bits)
 
 2019-02-11  Thomas Bernard  <miniupnp@free.fr>
@@ -2670,7 +7399,7 @@
 
 	Merge branch 'zstd'
 
-	Add warning about COMPRESSION_ZSTD not being officialy registered.
+	Add warning about COMPRESSION_ZSTD not being officially registered.
 
 2018-02-14  Even Rouault  <even.rouault@mines-paris.org>
 
@@ -2900,7 +7629,7 @@
 	result, we end up writing past the end of the buffer.
 
 	There are also some related issues that this also fixes. For example,
-	TIFFGetField can return uninitalized pointer values, and the logic to
+	TIFFGetField can return uninitialized pointer values, and the logic to
 	detect a N=3 vs N=1 transfer function seemed rather strange.
 
 	It is also strange that we declare the transfer functions to be of type
@@ -3315,7 +8044,7 @@
 	scans and not interleavedin a single one, needs to allocate memory (or
 	backing store) for the whole strip/tile.
 	See http://www.libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf
-	This limitation may be overriden by setting the 
+	This limitation may be overridden by setting the
 	LIBTIFF_ALLOW_LARGE_LIBJPEG_MEM_ALLOC environment variable, or recompiling
 	libtiff with a custom value of TIFF_LIBJPEG_LARGEST_MEM_ALLOC macro.
 
@@ -3598,7 +8327,7 @@
 
 	* libtiff/tif_jpeg.c: only run JPEGFixupTagsSubsampling() if the
 	YCbCrSubsampling tag is not explicitly present. This helps a bit to reduce
-	the I/O amount when te tag is present (especially on cloud hosted files).
+	the I/O amount when the tag is present (especially on cloud hosted files).
 
 2017-01-14 Even Rouault <even.rouault at spatialys.com>
 
@@ -3839,7 +8568,7 @@
 2016-12-03 Even Rouault <even.rouault at spatialys.com>
 
 	* libtiff/tif_dirread.c: modify ChopUpSingleUncompressedStrip() to
-	instanciate compute ntrips as TIFFhowmany_32(td->td_imagelength, rowsperstrip),
+	instantiate compute nstrips as TIFFhowmany_32(td->td_imagelength, rowsperstrip),
 	instead of a logic based on the total size of data. Which is faulty is
 	the total size of data is not sufficient to fill the whole image, and thus
 	results in reading outside of the StripByCounts/StripOffsets arrays when
@@ -3863,7 +8592,7 @@
 
 2016-12-02 Even Rouault <even.rouault at spatialys.com>
 
-	* tools/tiffcp.c: avoid uint32 underflow in cpDecodedStrips that 
+	* tools/tiffcp.c: avoid uint32 underflow in cpDecodedStrips that
 	can cause various issues, such as buffer overflows in the library.
 	Reported by Agostino Sarubbo.
 	Fixes http://bugzilla.maptools.org/show_bug.cgi?id=2598
@@ -4305,7 +9034,7 @@
 	* libtiff/tif_write.c: TIFFWriteEncodedStrip() and TIFFWriteEncodedTile()
 	should return -1 in case of failure of tif_encodestrip() as documented
 	* libtiff/tif_dumpmode.c: DumpModeEncode() should return 0 in case of
-	failure so that the above mentionned functions detect the error.
+	failure so that the above mentioned functions detect the error.
 
 2015-12-06  Even Rouault <even.rouault at spatialys.com>
 
@@ -4328,7 +9057,7 @@
 2015-11-22  Even Rouault <even.rouault at spatialys.com>
 
 	* libtiff/*.c: fix typos in comments (patch by Kurt Schwehr)
- 
+
 2015-11-22  Even Rouault <even.rouault at spatialys.com>
 
 	* libtiff/*.c: fix MSVC warnings related to cast shortening and
@@ -4896,7 +9625,7 @@
 2014-12-27  Even Rouault  <even.rouault@spatialys.com>
 
 	* libtiff/tif_dir.c: in TIFFDefaultDirectory(), reset any already existing
-	extented tags installed by user code through the extender mechaninm before
+	extended tags installed by user code through the extender mechanism before
 	calling the extender callback (GDAL #5054)
 
 2014-12-26  Bob Friesenhahn  <bfriesen@simple.dallas.tx.us>
@@ -4999,14 +9728,14 @@
 	cpStripToTile() (called from writeBufferToContigTiles).
 	Note that the resulting TIFF file would be scrambled even
 	if tiffcp wouldn't crash, since the output file would contain
-	RGB data intepreted as subsampled YCbCr values.
+	RGB data interpreted as subsampled YCbCr values.
 	This patch fixes the problem by forcing RGB space on the output
 	TIF if the input is JPEG-encoded and output is *not* JPEG-encoded.
 	Author: Tomasz Buchert <tomasz.buchert@inria.fr>
 
 2014-12-21  Even Rouault  <even.rouault@spatialys.com>
 
-	Fix various crasher bugs on fuzzed images.
+	Fix various crash bugs on fuzzed images.
 	* libtiff/tif_dir.c: TIFFSetField(): refuse to set negative values for
 	TIFFTAG_XRESOLUTION and TIFFTAG_YRESOLUTION that cause asserts when writing
 	the directory
@@ -5343,7 +10072,7 @@
 
 	*  libtiff 4.0.2 released.
 
-	* tools/tif2pdf.c, tools/tifdump.c: avoid unitialized variable
+	* tools/tif2pdf.c, tools/tifdump.c: avoid uninitialized variable
 	warnings with clang.
 
 2012-06-15  Tom Lane  <tgl@sss.pgh.pa.us>
@@ -6990,7 +11719,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 	Added support for a TIFF_PTRDIFF_T type to use when doing pointer arithmetic.
 	Added support for a TIFF_SSIZE_T in order to return memory sizes but still
 	allow returning -1 for errors.
-	* libtiff/tiffconf.vc.h: Add porting type defintions for WIN32.
+	* libtiff/tiffconf.vc.h: Add porting type definitions for WIN32.
 
 2007-06-25  Bob Friesenhahn  <bfriesen@simple.dallas.tx.us>
 
@@ -7125,7 +11854,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 	* libtiff/tif_config.wince.h: Added configuration header for WinCE.
 	* libtiff/tiffconf.wince.h: Ported old configuration header for WinCE.
 	* libtiff/tif_wince.c: Added WinCE-specific implementation of some
-	functons from tif_win32.c.
+	functions from tif_win32.c.
 	* libtiff/tif_win32.c: Disabled some functions already reimplemented in tif_wince.c.
 	* libtiff/tiffiop.h, port/lfind.c: Added conditional include of some
 	standard header files for Windows CE build.
@@ -7369,7 +12098,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 
 	* libtiff/tif_jpeg.c: strip size related bugfix in encode raw
 
-	* libtiff/tif_strip.c: temporarilly added two new versions of
+	* libtiff/tif_strip.c: temporarily added two new versions of
 	TIFFScanlineSize
 	  - TIFFNewScanlineSize: proposed new version, after all related
 	    issues and side-effects are sorted out
@@ -7429,7 +12158,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 
 2006-03-16  Andrey Kiselev  <dron@ak4719.spb.edu>
 
-	* libtiff/tiffiop.h: Added decalration for
+	* libtiff/tiffiop.h: Added declaration for
 	_TIFFSetDefaultCompressionState().
 
 	* libtiff/{tif_jpeg.c, tif_fax3.c, tif_zip.c, tif_pixarlog.c,
@@ -7759,7 +12488,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 
 	http://bugzilla.remotesensing.org/show_bug.cgi?id=1003
 
-	* libtiff/tif_dirinfo.c: Correctly yse bsearch() and lfind()
+	* libtiff/tif_dirinfo.c: Correctly use bsearch() and lfind()
 	functions as per bug
 
 	http://bugzilla.remotesensing.org/show_bug.cgi?id=1008
@@ -7804,7 +12533,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 
 	http://bugzilla.remotesensing.org/show_bug.cgi?id=977
 
-	* tools/tiffsplit.c: Copy fax related fields over splitted parts
+	* tools/tiffsplit.c: Copy fax related fields over split parts
 	as per bug
 
 	http://bugzilla.remotesensing.org/show_bug.cgi?id=983
@@ -7986,12 +12715,12 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 2005-06-07  Andrey Kiselev  <dron@ak4719.spb.edu>
 
 	* contrib/addtiffo/tif_ovrcache.c: Properly extract tile/strip size;
-	use pixel sized shift in contigous case.
+	use pixel sized shift in contiguous case.
 
 2005-06-06  Andrey Kiselev  <dron@ak4719.spb.edu>
 
 	* contrib/addtiffo/{tif_overview.c, tif_ovrcache.c, tif_ovrcache.h}:
-	Make overviews working for contiguos images.
+	Make overviews working for contiguous images.
 
 2005-06-03  Andrey Kiselev  <dron@ak4719.spb.edu>
 
@@ -8421,7 +13150,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 
 	http://bugzilla.remotesensing.org/show_bug.cgi?id=697
 
-	* libtiff/tif_config.in.vc: Removed unneded definitions for
+	* libtiff/tif_config.in.vc: Removed unneeded definitions for
 	read/open/close/lseek functions to fix the
 
 	http://bugzilla.remotesensing.org/show_bug.cgi?id=680
@@ -9280,7 +14009,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 	* man/tiff2pdf.1: Few improvements in page layout.
 
 	* Makefile.in, /man/Makefile.in, /html/man/tiff2pdf.1.html:
-	 Added support fpr tiff2pdf manual page.
+	 Added support for tiff2pdf manual page.
 
 2003-11-26 Ross Finlayson  <libtiff@apexinternetsoftware.com>
 
@@ -9289,7 +14018,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 2003-11-26  Andrey Kiselev  <dron@ak4719.spb.edu>
 
 	* Makefile.in, /tools/{Makefile.in, makefile.vc}:
-	 Added support fpr tiff2pdf utility.
+	 Added support for tiff2pdf utility.
 
 2003-11-25  Ross Finlayson  <libtiff@apexinternetsoftware.com>
 
@@ -9332,7 +14061,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 2003-11-17  Andrey Kiselev  <dron@ak4719.spb.edu>
 
 	* contrib/pds/{tif_pdsdirread.c, tif_pdsdirwrite.c}: Use
-	TIFFDataWidth() function insted of tiffDataWidth array.
+	TIFFDataWidth() function instead of tiffDataWidth array.
 
 2003-11-16  Andrey Kiselev  <dron@ak4719.spb.edu>
 
@@ -10119,7 +14848,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 	TIFFDataType sizes instead of working with tiffDataWidth array
 	directly. Should prevent out-of-borders bugs in case of unknown or
 	broken data types.  EstimateStripByteCounts routine modified, so it
-	won't work when tags with uknown sizes founded.
+	won't work when tags with unknown sizes founded.
 	Closes http://bugzilla.remotesensing.org/show_bug.cgi?id=109
 
 2002-03-13  Andrey Kiselev  <dron@ak4719.spb.edu>
@@ -10321,7 +15050,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 
 	* libtiff/tif_getimage.c: relax handling of contig case where
 	there are extra samples that are supposed to be ignored.  This
-	should now work for 8bit greyscale or palletted images.
+	should now work for 8bit greyscale or paletted images.
 
 	http://bugzilla.remotesensing.org/show_bug.cgi?id=75
 
@@ -10551,7 +15280,7 @@ btiff/tif_win32.c: Replace custom Win32 memory api with generic
 2001-02-16  Frank Warmerdam  <warmerdam@pobox.com>
 
 	* libtiff/libtiff.def: Brent Roman submitted new version adding
-	serveral missing entry points.
+	several missing entry points.
 
 	* libtiff/tif_dirinfo.c: don't declare tiffFieldInfo static on VMS.
 	Some sort of weird VMS thing.
@@ -10961,7 +15690,7 @@ Tue Nov 30 14:41:43 1999  Frank Warmerdam  <warmerda@gdal.velocet.ca>    *** 3.5
 
 Tue Nov 30 14:15:32 EST 1999   Mike Welles <mike@onshore.com>
 
-        * Added zip creation to relase makefile target
+        * Added zip creation to release makefile target
 
 	* Added html for TIFFWriteTile.3t man page.
 
diff --git a/3rdparty/libtiff/README.md b/3rdparty/libtiff/README.md
new file mode 100644
index 000000000000..0d83ba24d9db
--- /dev/null
+++ b/3rdparty/libtiff/README.md
@@ -0,0 +1,69 @@
+TIFF Software Distribution
+--------------------------
+This file is just a placeholder; the entire documentation is now located
+as reStructuredText in the doc directory. To view the documentation
+as HTML, visit https://libtiff.gitlab.io/libtiff/ or
+http://www.simplesystems.org/libtiff/ or within the release package
+in the doc/html-prebuilt directory. The manual pages are
+located at doc/man-prebuilt.
+
+The release package can be downloaded at
+
+http://download.osgeo.org/libtiff/
+
+If you can't hack either of these options then basically what you
+want to do is:
+
+    % ./configure
+    % make
+    % su
+    # make install
+
+More information, email contacts, and mailing list information can be 
+found online at http://www.simplesystems.org/libtiff/
+
+Source code repository
+----------------------
+
+[GitLab](https://gitlab.com/libtiff/libtiff)
+
+Bug database
+------------
+
+[GitLab issues](https://gitlab.com/libtiff/libtiff/issues)
+
+Previously, the project used
+[Bugzilla](http://bugzilla.maptools.org/buglist.cgi?product=libtiff). This
+is no longer in use, and all remaining issues have been migrated to GitLab.
+
+Use and Copyright
+-----------------
+Silicon Graphics has seen fit to allow us to give this work away.  It
+is free.  There is no support or guarantee of any sort as to its
+operations, correctness, or whatever.  If you do anything useful with
+all or parts of it you need to honor the copyright notices.   I would
+also be interested in knowing about it and, hopefully, be acknowledged.
+
+The legal way of saying that is:
+
+Copyright (c) 1988-1997 Sam Leffler
+Copyright (c) 1991-1997 Silicon Graphics, Inc.
+
+Permission to use, copy, modify, distribute, and sell this software and 
+its documentation for any purpose is hereby granted without fee, provided
+that (i) the above copyright notices and this permission notice appear in
+all copies of the software and related documentation, and (ii) the names of
+Sam Leffler and Silicon Graphics may not be used in any advertising or
+publicity relating to the software without the specific, prior written
+permission of Sam Leffler and Silicon Graphics.
+
+THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
+EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
+WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
+
+IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
+ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
+OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
+LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+OF THIS SOFTWARE.
diff --git a/3rdparty/libtiff/RELEASE-DATE b/3rdparty/libtiff/RELEASE-DATE
new file mode 100644
index 000000000000..68d943433ec3
--- /dev/null
+++ b/3rdparty/libtiff/RELEASE-DATE
@@ -0,0 +1 @@
+20230908
diff --git a/3rdparty/libtiff/libport.h b/3rdparty/libtiff/libport.h
deleted file mode 100644
index 9f2dace1440a..000000000000
--- a/3rdparty/libtiff/libport.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2009 Frank Warmerdam
- *
- * Permission to use, copy, modify, distribute, and sell this software and 
- * its documentation for any purpose is hereby granted without fee, provided
- * that (i) the above copyright notices and this permission notice appear in
- * all copies of the software and related documentation, and (ii) the names of
- * Sam Leffler and Silicon Graphics may not be used in any advertising or
- * publicity relating to the software without the specific, prior written
- * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
- * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
- * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
- * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
- * OF THIS SOFTWARE.
- */
-
-#ifndef _LIBPORT_
-#define	_LIBPORT_
-
-#if defined(HAVE_CONFIG_H)
-#  include <tif_config.h>
-#endif
-
-int getopt(int argc, char * const argv[], const char *optstring);
-extern   char *optarg;
-extern   int opterr;
-extern   int optind;
-extern   int optopt;
-
-int strcasecmp(const char *s1, const char *s2);
-
-#ifndef HAVE_GETOPT
-#  define HAVE_GETOPT 1
-#endif
-
-#if !defined(HAVE_STRTOL)
-long strtol(const char *nptr, char **endptr, int base);
-#endif
-#if !defined(HAVE_STRTOLL)
-long long strtoll(const char *nptr, char **endptr, int base);
-#endif
-#if !defined(HAVE_STRTOUL)
-unsigned long strtoul(const char *nptr, char **endptr, int base);
-#endif
-#if !defined(HAVE_STRTOULL)
-unsigned long long strtoull(const char *nptr, char **endptr, int base);
-#endif
-
-#if 0
-void *
-lfind(const void *key, const void *base, size_t *nmemb, size_t size,
-      int(*compar)(const void *, const void *));
-#endif
-
-#if !defined(HAVE_SNPRINTF)
-#undef vsnprintf
-#define vsnprintf _TIFF_vsnprintf_f
-
-#undef snprintf
-#define snprintf _TIFF_snprintf_f
-int snprintf(char* str, size_t size, const char* format, ...);
-#endif
-
-#endif /* ndef _LIBPORT_ */
diff --git a/3rdparty/libtiff/snprintf.c b/3rdparty/libtiff/snprintf.c
deleted file mode 100644
index 3542ab759ba2..000000000000
--- a/3rdparty/libtiff/snprintf.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Workaround for lack of snprintf(3) in Visual Studio.  See
- * http://stackoverflow.com/questions/2915672/snprintf-and-visual-studio-2010/8712996#8712996
- * It's a trivial wrapper around the builtin _vsnprintf_s and
- * _vscprintf functions.
- */
-
-#ifdef _MSC_VER
-
-#include <stdio.h>
-#include <stdarg.h>
-#include "libport.h"
-
-int _TIFF_vsnprintf_f(char* str, size_t size, const char* format, va_list ap)
-{
-  int count = -1;
-
-  if (size != 0)
-#if _MSC_VER <=	1310
-    count = _vsnprintf(str, size, format, ap);
-#else
-    count = _vsnprintf_s(str, size, _TRUNCATE, format, ap);
-#endif
-  if (count == -1)
-    count = _vscprintf(format, ap);
-
-  return count;
-}
-
-int _TIFF_snprintf_f(char* str, size_t size, const char* format, ...)
-{
-  int count;
-  va_list ap;
-
-  va_start(ap, format);
-  count = vsnprintf(str, size, format, ap);
-  va_end(ap);
-
-  return count;
-}
-
-#endif // _MSC_VER
diff --git a/3rdparty/libtiff/t4.h b/3rdparty/libtiff/t4.h
index fb0951a16f74..f933d4a336c9 100644
--- a/3rdparty/libtiff/t4.h
+++ b/3rdparty/libtiff/t4.h
@@ -23,26 +23,27 @@
  */
 
 #ifndef _T4_
-#define	_T4_
+#define _T4_
 /*
  * CCITT T.4 1D Huffman runlength codes and
  * related definitions.  Given the small sizes
  * of these tables it does not seem
  * worthwhile to make code & length 8 bits.
  */
-typedef struct tableentry {
-    unsigned short length;  /* bit length of g3 code */
-    unsigned short code;    /* g3 code */
-    short runlen;           /* run length in bits */
+typedef struct tableentry
+{
+    unsigned short length; /* bit length of g3 code */
+    unsigned short code;   /* g3 code */
+    short runlen;          /* run length in bits */
 } tableentry;
 
-#define EOL	0x001	/* EOL code value - 0000 0000 0000 1 */
+#define EOL 0x001 /* EOL code value - 0000 0000 0000 1 */
 
 /* status values returned instead of a run length */
-#define G3CODE_EOL	-1	/* NB: ACT_EOL - ACT_WRUNT */
-#define G3CODE_INVALID	-2	/* NB: ACT_INVALID - ACT_WRUNT */
-#define G3CODE_EOF	-3	/* end of input data */
-#define G3CODE_INCOMP	-4	/* incomplete run code */
+#define G3CODE_EOL -1     /* NB: ACT_EOL - ACT_WRUNT */
+#define G3CODE_INVALID -2 /* NB: ACT_INVALID - ACT_WRUNT */
+#define G3CODE_EOF -3     /* end of input data */
+#define G3CODE_INCOMP -4  /* incomplete run code */
 
 /*
  * Note that these tables are ordered such that the
@@ -54,237 +55,230 @@ typedef struct tableentry {
  */
 #ifdef G3CODES
 const tableentry TIFFFaxWhiteCodes[] = {
-    { 8, 0x35, 0 },	/* 0011 0101 */
-    { 6, 0x7, 1 },	/* 0001 11 */
-    { 4, 0x7, 2 },	/* 0111 */
-    { 4, 0x8, 3 },	/* 1000 */
-    { 4, 0xB, 4 },	/* 1011 */
-    { 4, 0xC, 5 },	/* 1100 */
-    { 4, 0xE, 6 },	/* 1110 */
-    { 4, 0xF, 7 },	/* 1111 */
-    { 5, 0x13, 8 },	/* 1001 1 */
-    { 5, 0x14, 9 },	/* 1010 0 */
-    { 5, 0x7, 10 },	/* 0011 1 */
-    { 5, 0x8, 11 },	/* 0100 0 */
-    { 6, 0x8, 12 },	/* 0010 00 */
-    { 6, 0x3, 13 },	/* 0000 11 */
-    { 6, 0x34, 14 },	/* 1101 00 */
-    { 6, 0x35, 15 },	/* 1101 01 */
-    { 6, 0x2A, 16 },	/* 1010 10 */
-    { 6, 0x2B, 17 },	/* 1010 11 */
-    { 7, 0x27, 18 },	/* 0100 111 */
-    { 7, 0xC, 19 },	/* 0001 100 */
-    { 7, 0x8, 20 },	/* 0001 000 */
-    { 7, 0x17, 21 },	/* 0010 111 */
-    { 7, 0x3, 22 },	/* 0000 011 */
-    { 7, 0x4, 23 },	/* 0000 100 */
-    { 7, 0x28, 24 },	/* 0101 000 */
-    { 7, 0x2B, 25 },	/* 0101 011 */
-    { 7, 0x13, 26 },	/* 0010 011 */
-    { 7, 0x24, 27 },	/* 0100 100 */
-    { 7, 0x18, 28 },	/* 0011 000 */
-    { 8, 0x2, 29 },	/* 0000 0010 */
-    { 8, 0x3, 30 },	/* 0000 0011 */
-    { 8, 0x1A, 31 },	/* 0001 1010 */
-    { 8, 0x1B, 32 },	/* 0001 1011 */
-    { 8, 0x12, 33 },	/* 0001 0010 */
-    { 8, 0x13, 34 },	/* 0001 0011 */
-    { 8, 0x14, 35 },	/* 0001 0100 */
-    { 8, 0x15, 36 },	/* 0001 0101 */
-    { 8, 0x16, 37 },	/* 0001 0110 */
-    { 8, 0x17, 38 },	/* 0001 0111 */
-    { 8, 0x28, 39 },	/* 0010 1000 */
-    { 8, 0x29, 40 },	/* 0010 1001 */
-    { 8, 0x2A, 41 },	/* 0010 1010 */
-    { 8, 0x2B, 42 },	/* 0010 1011 */
-    { 8, 0x2C, 43 },	/* 0010 1100 */
-    { 8, 0x2D, 44 },	/* 0010 1101 */
-    { 8, 0x4, 45 },	/* 0000 0100 */
-    { 8, 0x5, 46 },	/* 0000 0101 */
-    { 8, 0xA, 47 },	/* 0000 1010 */
-    { 8, 0xB, 48 },	/* 0000 1011 */
-    { 8, 0x52, 49 },	/* 0101 0010 */
-    { 8, 0x53, 50 },	/* 0101 0011 */
-    { 8, 0x54, 51 },	/* 0101 0100 */
-    { 8, 0x55, 52 },	/* 0101 0101 */
-    { 8, 0x24, 53 },	/* 0010 0100 */
-    { 8, 0x25, 54 },	/* 0010 0101 */
-    { 8, 0x58, 55 },	/* 0101 1000 */
-    { 8, 0x59, 56 },	/* 0101 1001 */
-    { 8, 0x5A, 57 },	/* 0101 1010 */
-    { 8, 0x5B, 58 },	/* 0101 1011 */
-    { 8, 0x4A, 59 },	/* 0100 1010 */
-    { 8, 0x4B, 60 },	/* 0100 1011 */
-    { 8, 0x32, 61 },	/* 0011 0010 */
-    { 8, 0x33, 62 },	/* 0011 0011 */
-    { 8, 0x34, 63 },	/* 0011 0100 */
-    { 5, 0x1B, 64 },	/* 1101 1 */
-    { 5, 0x12, 128 },	/* 1001 0 */
-    { 6, 0x17, 192 },	/* 0101 11 */
-    { 7, 0x37, 256 },	/* 0110 111 */
-    { 8, 0x36, 320 },	/* 0011 0110 */
-    { 8, 0x37, 384 },	/* 0011 0111 */
-    { 8, 0x64, 448 },	/* 0110 0100 */
-    { 8, 0x65, 512 },	/* 0110 0101 */
-    { 8, 0x68, 576 },	/* 0110 1000 */
-    { 8, 0x67, 640 },	/* 0110 0111 */
-    { 9, 0xCC, 704 },	/* 0110 0110 0 */
-    { 9, 0xCD, 768 },	/* 0110 0110 1 */
-    { 9, 0xD2, 832 },	/* 0110 1001 0 */
-    { 9, 0xD3, 896 },	/* 0110 1001 1 */
-    { 9, 0xD4, 960 },	/* 0110 1010 0 */
-    { 9, 0xD5, 1024 },	/* 0110 1010 1 */
-    { 9, 0xD6, 1088 },	/* 0110 1011 0 */
-    { 9, 0xD7, 1152 },	/* 0110 1011 1 */
-    { 9, 0xD8, 1216 },	/* 0110 1100 0 */
-    { 9, 0xD9, 1280 },	/* 0110 1100 1 */
-    { 9, 0xDA, 1344 },	/* 0110 1101 0 */
-    { 9, 0xDB, 1408 },	/* 0110 1101 1 */
-    { 9, 0x98, 1472 },	/* 0100 1100 0 */
-    { 9, 0x99, 1536 },	/* 0100 1100 1 */
-    { 9, 0x9A, 1600 },	/* 0100 1101 0 */
-    { 6, 0x18, 1664 },	/* 0110 00 */
-    { 9, 0x9B, 1728 },	/* 0100 1101 1 */
-    { 11, 0x8, 1792 },	/* 0000 0001 000 */
-    { 11, 0xC, 1856 },	/* 0000 0001 100 */
-    { 11, 0xD, 1920 },	/* 0000 0001 101 */
-    { 12, 0x12, 1984 },	/* 0000 0001 0010 */
-    { 12, 0x13, 2048 },	/* 0000 0001 0011 */
-    { 12, 0x14, 2112 },	/* 0000 0001 0100 */
-    { 12, 0x15, 2176 },	/* 0000 0001 0101 */
-    { 12, 0x16, 2240 },	/* 0000 0001 0110 */
-    { 12, 0x17, 2304 },	/* 0000 0001 0111 */
-    { 12, 0x1C, 2368 },	/* 0000 0001 1100 */
-    { 12, 0x1D, 2432 },	/* 0000 0001 1101 */
-    { 12, 0x1E, 2496 },	/* 0000 0001 1110 */
-    { 12, 0x1F, 2560 },	/* 0000 0001 1111 */
-    { 12, 0x1, G3CODE_EOL },	/* 0000 0000 0001 */
-    { 9, 0x1, G3CODE_INVALID },	/* 0000 0000 1 */
-    { 10, 0x1, G3CODE_INVALID },	/* 0000 0000 01 */
-    { 11, 0x1, G3CODE_INVALID },	/* 0000 0000 001 */
-    { 12, 0x0, G3CODE_INVALID },	/* 0000 0000 0000 */
+    {8, 0x35, 0},              /* 0011 0101 */
+    {6, 0x7, 1},               /* 0001 11 */
+    {4, 0x7, 2},               /* 0111 */
+    {4, 0x8, 3},               /* 1000 */
+    {4, 0xB, 4},               /* 1011 */
+    {4, 0xC, 5},               /* 1100 */
+    {4, 0xE, 6},               /* 1110 */
+    {4, 0xF, 7},               /* 1111 */
+    {5, 0x13, 8},              /* 1001 1 */
+    {5, 0x14, 9},              /* 1010 0 */
+    {5, 0x7, 10},              /* 0011 1 */
+    {5, 0x8, 11},              /* 0100 0 */
+    {6, 0x8, 12},              /* 0010 00 */
+    {6, 0x3, 13},              /* 0000 11 */
+    {6, 0x34, 14},             /* 1101 00 */
+    {6, 0x35, 15},             /* 1101 01 */
+    {6, 0x2A, 16},             /* 1010 10 */
+    {6, 0x2B, 17},             /* 1010 11 */
+    {7, 0x27, 18},             /* 0100 111 */
+    {7, 0xC, 19},              /* 0001 100 */
+    {7, 0x8, 20},              /* 0001 000 */
+    {7, 0x17, 21},             /* 0010 111 */
+    {7, 0x3, 22},              /* 0000 011 */
+    {7, 0x4, 23},              /* 0000 100 */
+    {7, 0x28, 24},             /* 0101 000 */
+    {7, 0x2B, 25},             /* 0101 011 */
+    {7, 0x13, 26},             /* 0010 011 */
+    {7, 0x24, 27},             /* 0100 100 */
+    {7, 0x18, 28},             /* 0011 000 */
+    {8, 0x2, 29},              /* 0000 0010 */
+    {8, 0x3, 30},              /* 0000 0011 */
+    {8, 0x1A, 31},             /* 0001 1010 */
+    {8, 0x1B, 32},             /* 0001 1011 */
+    {8, 0x12, 33},             /* 0001 0010 */
+    {8, 0x13, 34},             /* 0001 0011 */
+    {8, 0x14, 35},             /* 0001 0100 */
+    {8, 0x15, 36},             /* 0001 0101 */
+    {8, 0x16, 37},             /* 0001 0110 */
+    {8, 0x17, 38},             /* 0001 0111 */
+    {8, 0x28, 39},             /* 0010 1000 */
+    {8, 0x29, 40},             /* 0010 1001 */
+    {8, 0x2A, 41},             /* 0010 1010 */
+    {8, 0x2B, 42},             /* 0010 1011 */
+    {8, 0x2C, 43},             /* 0010 1100 */
+    {8, 0x2D, 44},             /* 0010 1101 */
+    {8, 0x4, 45},              /* 0000 0100 */
+    {8, 0x5, 46},              /* 0000 0101 */
+    {8, 0xA, 47},              /* 0000 1010 */
+    {8, 0xB, 48},              /* 0000 1011 */
+    {8, 0x52, 49},             /* 0101 0010 */
+    {8, 0x53, 50},             /* 0101 0011 */
+    {8, 0x54, 51},             /* 0101 0100 */
+    {8, 0x55, 52},             /* 0101 0101 */
+    {8, 0x24, 53},             /* 0010 0100 */
+    {8, 0x25, 54},             /* 0010 0101 */
+    {8, 0x58, 55},             /* 0101 1000 */
+    {8, 0x59, 56},             /* 0101 1001 */
+    {8, 0x5A, 57},             /* 0101 1010 */
+    {8, 0x5B, 58},             /* 0101 1011 */
+    {8, 0x4A, 59},             /* 0100 1010 */
+    {8, 0x4B, 60},             /* 0100 1011 */
+    {8, 0x32, 61},             /* 0011 0010 */
+    {8, 0x33, 62},             /* 0011 0011 */
+    {8, 0x34, 63},             /* 0011 0100 */
+    {5, 0x1B, 64},             /* 1101 1 */
+    {5, 0x12, 128},            /* 1001 0 */
+    {6, 0x17, 192},            /* 0101 11 */
+    {7, 0x37, 256},            /* 0110 111 */
+    {8, 0x36, 320},            /* 0011 0110 */
+    {8, 0x37, 384},            /* 0011 0111 */
+    {8, 0x64, 448},            /* 0110 0100 */
+    {8, 0x65, 512},            /* 0110 0101 */
+    {8, 0x68, 576},            /* 0110 1000 */
+    {8, 0x67, 640},            /* 0110 0111 */
+    {9, 0xCC, 704},            /* 0110 0110 0 */
+    {9, 0xCD, 768},            /* 0110 0110 1 */
+    {9, 0xD2, 832},            /* 0110 1001 0 */
+    {9, 0xD3, 896},            /* 0110 1001 1 */
+    {9, 0xD4, 960},            /* 0110 1010 0 */
+    {9, 0xD5, 1024},           /* 0110 1010 1 */
+    {9, 0xD6, 1088},           /* 0110 1011 0 */
+    {9, 0xD7, 1152},           /* 0110 1011 1 */
+    {9, 0xD8, 1216},           /* 0110 1100 0 */
+    {9, 0xD9, 1280},           /* 0110 1100 1 */
+    {9, 0xDA, 1344},           /* 0110 1101 0 */
+    {9, 0xDB, 1408},           /* 0110 1101 1 */
+    {9, 0x98, 1472},           /* 0100 1100 0 */
+    {9, 0x99, 1536},           /* 0100 1100 1 */
+    {9, 0x9A, 1600},           /* 0100 1101 0 */
+    {6, 0x18, 1664},           /* 0110 00 */
+    {9, 0x9B, 1728},           /* 0100 1101 1 */
+    {11, 0x8, 1792},           /* 0000 0001 000 */
+    {11, 0xC, 1856},           /* 0000 0001 100 */
+    {11, 0xD, 1920},           /* 0000 0001 101 */
+    {12, 0x12, 1984},          /* 0000 0001 0010 */
+    {12, 0x13, 2048},          /* 0000 0001 0011 */
+    {12, 0x14, 2112},          /* 0000 0001 0100 */
+    {12, 0x15, 2176},          /* 0000 0001 0101 */
+    {12, 0x16, 2240},          /* 0000 0001 0110 */
+    {12, 0x17, 2304},          /* 0000 0001 0111 */
+    {12, 0x1C, 2368},          /* 0000 0001 1100 */
+    {12, 0x1D, 2432},          /* 0000 0001 1101 */
+    {12, 0x1E, 2496},          /* 0000 0001 1110 */
+    {12, 0x1F, 2560},          /* 0000 0001 1111 */
+    {12, 0x1, G3CODE_EOL},     /* 0000 0000 0001 */
+    {9, 0x1, G3CODE_INVALID},  /* 0000 0000 1 */
+    {10, 0x1, G3CODE_INVALID}, /* 0000 0000 01 */
+    {11, 0x1, G3CODE_INVALID}, /* 0000 0000 001 */
+    {12, 0x0, G3CODE_INVALID}, /* 0000 0000 0000 */
 };
 
 const tableentry TIFFFaxBlackCodes[] = {
-    { 10, 0x37, 0 },	/* 0000 1101 11 */
-    { 3, 0x2, 1 },	/* 010 */
-    { 2, 0x3, 2 },	/* 11 */
-    { 2, 0x2, 3 },	/* 10 */
-    { 3, 0x3, 4 },	/* 011 */
-    { 4, 0x3, 5 },	/* 0011 */
-    { 4, 0x2, 6 },	/* 0010 */
-    { 5, 0x3, 7 },	/* 0001 1 */
-    { 6, 0x5, 8 },	/* 0001 01 */
-    { 6, 0x4, 9 },	/* 0001 00 */
-    { 7, 0x4, 10 },	/* 0000 100 */
-    { 7, 0x5, 11 },	/* 0000 101 */
-    { 7, 0x7, 12 },	/* 0000 111 */
-    { 8, 0x4, 13 },	/* 0000 0100 */
-    { 8, 0x7, 14 },	/* 0000 0111 */
-    { 9, 0x18, 15 },	/* 0000 1100 0 */
-    { 10, 0x17, 16 },	/* 0000 0101 11 */
-    { 10, 0x18, 17 },	/* 0000 0110 00 */
-    { 10, 0x8, 18 },	/* 0000 0010 00 */
-    { 11, 0x67, 19 },	/* 0000 1100 111 */
-    { 11, 0x68, 20 },	/* 0000 1101 000 */
-    { 11, 0x6C, 21 },	/* 0000 1101 100 */
-    { 11, 0x37, 22 },	/* 0000 0110 111 */
-    { 11, 0x28, 23 },	/* 0000 0101 000 */
-    { 11, 0x17, 24 },	/* 0000 0010 111 */
-    { 11, 0x18, 25 },	/* 0000 0011 000 */
-    { 12, 0xCA, 26 },	/* 0000 1100 1010 */
-    { 12, 0xCB, 27 },	/* 0000 1100 1011 */
-    { 12, 0xCC, 28 },	/* 0000 1100 1100 */
-    { 12, 0xCD, 29 },	/* 0000 1100 1101 */
-    { 12, 0x68, 30 },	/* 0000 0110 1000 */
-    { 12, 0x69, 31 },	/* 0000 0110 1001 */
-    { 12, 0x6A, 32 },	/* 0000 0110 1010 */
-    { 12, 0x6B, 33 },	/* 0000 0110 1011 */
-    { 12, 0xD2, 34 },	/* 0000 1101 0010 */
-    { 12, 0xD3, 35 },	/* 0000 1101 0011 */
-    { 12, 0xD4, 36 },	/* 0000 1101 0100 */
-    { 12, 0xD5, 37 },	/* 0000 1101 0101 */
-    { 12, 0xD6, 38 },	/* 0000 1101 0110 */
-    { 12, 0xD7, 39 },	/* 0000 1101 0111 */
-    { 12, 0x6C, 40 },	/* 0000 0110 1100 */
-    { 12, 0x6D, 41 },	/* 0000 0110 1101 */
-    { 12, 0xDA, 42 },	/* 0000 1101 1010 */
-    { 12, 0xDB, 43 },	/* 0000 1101 1011 */
-    { 12, 0x54, 44 },	/* 0000 0101 0100 */
-    { 12, 0x55, 45 },	/* 0000 0101 0101 */
-    { 12, 0x56, 46 },	/* 0000 0101 0110 */
-    { 12, 0x57, 47 },	/* 0000 0101 0111 */
-    { 12, 0x64, 48 },	/* 0000 0110 0100 */
-    { 12, 0x65, 49 },	/* 0000 0110 0101 */
-    { 12, 0x52, 50 },	/* 0000 0101 0010 */
-    { 12, 0x53, 51 },	/* 0000 0101 0011 */
-    { 12, 0x24, 52 },	/* 0000 0010 0100 */
-    { 12, 0x37, 53 },	/* 0000 0011 0111 */
-    { 12, 0x38, 54 },	/* 0000 0011 1000 */
-    { 12, 0x27, 55 },	/* 0000 0010 0111 */
-    { 12, 0x28, 56 },	/* 0000 0010 1000 */
-    { 12, 0x58, 57 },	/* 0000 0101 1000 */
-    { 12, 0x59, 58 },	/* 0000 0101 1001 */
-    { 12, 0x2B, 59 },	/* 0000 0010 1011 */
-    { 12, 0x2C, 60 },	/* 0000 0010 1100 */
-    { 12, 0x5A, 61 },	/* 0000 0101 1010 */
-    { 12, 0x66, 62 },	/* 0000 0110 0110 */
-    { 12, 0x67, 63 },	/* 0000 0110 0111 */
-    { 10, 0xF, 64 },	/* 0000 0011 11 */
-    { 12, 0xC8, 128 },	/* 0000 1100 1000 */
-    { 12, 0xC9, 192 },	/* 0000 1100 1001 */
-    { 12, 0x5B, 256 },	/* 0000 0101 1011 */
-    { 12, 0x33, 320 },	/* 0000 0011 0011 */
-    { 12, 0x34, 384 },	/* 0000 0011 0100 */
-    { 12, 0x35, 448 },	/* 0000 0011 0101 */
-    { 13, 0x6C, 512 },	/* 0000 0011 0110 0 */
-    { 13, 0x6D, 576 },	/* 0000 0011 0110 1 */
-    { 13, 0x4A, 640 },	/* 0000 0010 0101 0 */
-    { 13, 0x4B, 704 },	/* 0000 0010 0101 1 */
-    { 13, 0x4C, 768 },	/* 0000 0010 0110 0 */
-    { 13, 0x4D, 832 },	/* 0000 0010 0110 1 */
-    { 13, 0x72, 896 },	/* 0000 0011 1001 0 */
-    { 13, 0x73, 960 },	/* 0000 0011 1001 1 */
-    { 13, 0x74, 1024 },	/* 0000 0011 1010 0 */
-    { 13, 0x75, 1088 },	/* 0000 0011 1010 1 */
-    { 13, 0x76, 1152 },	/* 0000 0011 1011 0 */
-    { 13, 0x77, 1216 },	/* 0000 0011 1011 1 */
-    { 13, 0x52, 1280 },	/* 0000 0010 1001 0 */
-    { 13, 0x53, 1344 },	/* 0000 0010 1001 1 */
-    { 13, 0x54, 1408 },	/* 0000 0010 1010 0 */
-    { 13, 0x55, 1472 },	/* 0000 0010 1010 1 */
-    { 13, 0x5A, 1536 },	/* 0000 0010 1101 0 */
-    { 13, 0x5B, 1600 },	/* 0000 0010 1101 1 */
-    { 13, 0x64, 1664 },	/* 0000 0011 0010 0 */
-    { 13, 0x65, 1728 },	/* 0000 0011 0010 1 */
-    { 11, 0x8, 1792 },	/* 0000 0001 000 */
-    { 11, 0xC, 1856 },	/* 0000 0001 100 */
-    { 11, 0xD, 1920 },	/* 0000 0001 101 */
-    { 12, 0x12, 1984 },	/* 0000 0001 0010 */
-    { 12, 0x13, 2048 },	/* 0000 0001 0011 */
-    { 12, 0x14, 2112 },	/* 0000 0001 0100 */
-    { 12, 0x15, 2176 },	/* 0000 0001 0101 */
-    { 12, 0x16, 2240 },	/* 0000 0001 0110 */
-    { 12, 0x17, 2304 },	/* 0000 0001 0111 */
-    { 12, 0x1C, 2368 },	/* 0000 0001 1100 */
-    { 12, 0x1D, 2432 },	/* 0000 0001 1101 */
-    { 12, 0x1E, 2496 },	/* 0000 0001 1110 */
-    { 12, 0x1F, 2560 },	/* 0000 0001 1111 */
-    { 12, 0x1, G3CODE_EOL },	/* 0000 0000 0001 */
-    { 9, 0x1, G3CODE_INVALID },	/* 0000 0000 1 */
-    { 10, 0x1, G3CODE_INVALID },	/* 0000 0000 01 */
-    { 11, 0x1, G3CODE_INVALID },	/* 0000 0000 001 */
-    { 12, 0x0, G3CODE_INVALID },	/* 0000 0000 0000 */
+    {10, 0x37, 0},             /* 0000 1101 11 */
+    {3, 0x2, 1},               /* 010 */
+    {2, 0x3, 2},               /* 11 */
+    {2, 0x2, 3},               /* 10 */
+    {3, 0x3, 4},               /* 011 */
+    {4, 0x3, 5},               /* 0011 */
+    {4, 0x2, 6},               /* 0010 */
+    {5, 0x3, 7},               /* 0001 1 */
+    {6, 0x5, 8},               /* 0001 01 */
+    {6, 0x4, 9},               /* 0001 00 */
+    {7, 0x4, 10},              /* 0000 100 */
+    {7, 0x5, 11},              /* 0000 101 */
+    {7, 0x7, 12},              /* 0000 111 */
+    {8, 0x4, 13},              /* 0000 0100 */
+    {8, 0x7, 14},              /* 0000 0111 */
+    {9, 0x18, 15},             /* 0000 1100 0 */
+    {10, 0x17, 16},            /* 0000 0101 11 */
+    {10, 0x18, 17},            /* 0000 0110 00 */
+    {10, 0x8, 18},             /* 0000 0010 00 */
+    {11, 0x67, 19},            /* 0000 1100 111 */
+    {11, 0x68, 20},            /* 0000 1101 000 */
+    {11, 0x6C, 21},            /* 0000 1101 100 */
+    {11, 0x37, 22},            /* 0000 0110 111 */
+    {11, 0x28, 23},            /* 0000 0101 000 */
+    {11, 0x17, 24},            /* 0000 0010 111 */
+    {11, 0x18, 25},            /* 0000 0011 000 */
+    {12, 0xCA, 26},            /* 0000 1100 1010 */
+    {12, 0xCB, 27},            /* 0000 1100 1011 */
+    {12, 0xCC, 28},            /* 0000 1100 1100 */
+    {12, 0xCD, 29},            /* 0000 1100 1101 */
+    {12, 0x68, 30},            /* 0000 0110 1000 */
+    {12, 0x69, 31},            /* 0000 0110 1001 */
+    {12, 0x6A, 32},            /* 0000 0110 1010 */
+    {12, 0x6B, 33},            /* 0000 0110 1011 */
+    {12, 0xD2, 34},            /* 0000 1101 0010 */
+    {12, 0xD3, 35},            /* 0000 1101 0011 */
+    {12, 0xD4, 36},            /* 0000 1101 0100 */
+    {12, 0xD5, 37},            /* 0000 1101 0101 */
+    {12, 0xD6, 38},            /* 0000 1101 0110 */
+    {12, 0xD7, 39},            /* 0000 1101 0111 */
+    {12, 0x6C, 40},            /* 0000 0110 1100 */
+    {12, 0x6D, 41},            /* 0000 0110 1101 */
+    {12, 0xDA, 42},            /* 0000 1101 1010 */
+    {12, 0xDB, 43},            /* 0000 1101 1011 */
+    {12, 0x54, 44},            /* 0000 0101 0100 */
+    {12, 0x55, 45},            /* 0000 0101 0101 */
+    {12, 0x56, 46},            /* 0000 0101 0110 */
+    {12, 0x57, 47},            /* 0000 0101 0111 */
+    {12, 0x64, 48},            /* 0000 0110 0100 */
+    {12, 0x65, 49},            /* 0000 0110 0101 */
+    {12, 0x52, 50},            /* 0000 0101 0010 */
+    {12, 0x53, 51},            /* 0000 0101 0011 */
+    {12, 0x24, 52},            /* 0000 0010 0100 */
+    {12, 0x37, 53},            /* 0000 0011 0111 */
+    {12, 0x38, 54},            /* 0000 0011 1000 */
+    {12, 0x27, 55},            /* 0000 0010 0111 */
+    {12, 0x28, 56},            /* 0000 0010 1000 */
+    {12, 0x58, 57},            /* 0000 0101 1000 */
+    {12, 0x59, 58},            /* 0000 0101 1001 */
+    {12, 0x2B, 59},            /* 0000 0010 1011 */
+    {12, 0x2C, 60},            /* 0000 0010 1100 */
+    {12, 0x5A, 61},            /* 0000 0101 1010 */
+    {12, 0x66, 62},            /* 0000 0110 0110 */
+    {12, 0x67, 63},            /* 0000 0110 0111 */
+    {10, 0xF, 64},             /* 0000 0011 11 */
+    {12, 0xC8, 128},           /* 0000 1100 1000 */
+    {12, 0xC9, 192},           /* 0000 1100 1001 */
+    {12, 0x5B, 256},           /* 0000 0101 1011 */
+    {12, 0x33, 320},           /* 0000 0011 0011 */
+    {12, 0x34, 384},           /* 0000 0011 0100 */
+    {12, 0x35, 448},           /* 0000 0011 0101 */
+    {13, 0x6C, 512},           /* 0000 0011 0110 0 */
+    {13, 0x6D, 576},           /* 0000 0011 0110 1 */
+    {13, 0x4A, 640},           /* 0000 0010 0101 0 */
+    {13, 0x4B, 704},           /* 0000 0010 0101 1 */
+    {13, 0x4C, 768},           /* 0000 0010 0110 0 */
+    {13, 0x4D, 832},           /* 0000 0010 0110 1 */
+    {13, 0x72, 896},           /* 0000 0011 1001 0 */
+    {13, 0x73, 960},           /* 0000 0011 1001 1 */
+    {13, 0x74, 1024},          /* 0000 0011 1010 0 */
+    {13, 0x75, 1088},          /* 0000 0011 1010 1 */
+    {13, 0x76, 1152},          /* 0000 0011 1011 0 */
+    {13, 0x77, 1216},          /* 0000 0011 1011 1 */
+    {13, 0x52, 1280},          /* 0000 0010 1001 0 */
+    {13, 0x53, 1344},          /* 0000 0010 1001 1 */
+    {13, 0x54, 1408},          /* 0000 0010 1010 0 */
+    {13, 0x55, 1472},          /* 0000 0010 1010 1 */
+    {13, 0x5A, 1536},          /* 0000 0010 1101 0 */
+    {13, 0x5B, 1600},          /* 0000 0010 1101 1 */
+    {13, 0x64, 1664},          /* 0000 0011 0010 0 */
+    {13, 0x65, 1728},          /* 0000 0011 0010 1 */
+    {11, 0x8, 1792},           /* 0000 0001 000 */
+    {11, 0xC, 1856},           /* 0000 0001 100 */
+    {11, 0xD, 1920},           /* 0000 0001 101 */
+    {12, 0x12, 1984},          /* 0000 0001 0010 */
+    {12, 0x13, 2048},          /* 0000 0001 0011 */
+    {12, 0x14, 2112},          /* 0000 0001 0100 */
+    {12, 0x15, 2176},          /* 0000 0001 0101 */
+    {12, 0x16, 2240},          /* 0000 0001 0110 */
+    {12, 0x17, 2304},          /* 0000 0001 0111 */
+    {12, 0x1C, 2368},          /* 0000 0001 1100 */
+    {12, 0x1D, 2432},          /* 0000 0001 1101 */
+    {12, 0x1E, 2496},          /* 0000 0001 1110 */
+    {12, 0x1F, 2560},          /* 0000 0001 1111 */
+    {12, 0x1, G3CODE_EOL},     /* 0000 0000 0001 */
+    {9, 0x1, G3CODE_INVALID},  /* 0000 0000 1 */
+    {10, 0x1, G3CODE_INVALID}, /* 0000 0000 01 */
+    {11, 0x1, G3CODE_INVALID}, /* 0000 0000 001 */
+    {12, 0x0, G3CODE_INVALID}, /* 0000 0000 0000 */
 };
 #else
 extern const tableentry TIFFFaxWhiteCodes[];
 extern const tableentry TIFFFaxBlackCodes[];
 #endif
 #endif /* _T4_ */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_aux.c b/3rdparty/libtiff/tif_aux.c
index c9f190545ec4..49855bb0b203 100644
--- a/3rdparty/libtiff/tif_aux.c
+++ b/3rdparty/libtiff/tif_aux.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1991-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -27,173 +27,180 @@
  *
  * Auxiliary Support Routines.
  */
-#include "tiffiop.h"
 #include "tif_predict.h"
-#include <math.h>
+#include "tiffiop.h"
 #include <float.h>
+#include <math.h>
 
-uint32
-_TIFFMultiply32(TIFF* tif, uint32 first, uint32 second, const char* where)
+uint32_t _TIFFMultiply32(TIFF *tif, uint32_t first, uint32_t second,
+                         const char *where)
 {
-	if (second && first > TIFF_UINT32_MAX / second) {
-		TIFFErrorExt(tif->tif_clientdata, where, "Integer overflow in %s", where);
-		return 0;
-	}
+    if (second && first > UINT32_MAX / second)
+    {
+        TIFFErrorExtR(tif, where, "Integer overflow in %s", where);
+        return 0;
+    }
 
-	return first * second;
+    return first * second;
 }
 
-uint64
-_TIFFMultiply64(TIFF* tif, uint64 first, uint64 second, const char* where)
+uint64_t _TIFFMultiply64(TIFF *tif, uint64_t first, uint64_t second,
+                         const char *where)
 {
-	if (second && first > TIFF_UINT64_MAX / second) {
-		TIFFErrorExt(tif->tif_clientdata, where, "Integer overflow in %s", where);
-		return 0;
-	}
+    if (second && first > UINT64_MAX / second)
+    {
+        TIFFErrorExtR(tif, where, "Integer overflow in %s", where);
+        return 0;
+    }
 
-	return first * second;
+    return first * second;
 }
 
-tmsize_t
-_TIFFMultiplySSize(TIFF* tif, tmsize_t first, tmsize_t second, const char* where)
+tmsize_t _TIFFMultiplySSize(TIFF *tif, tmsize_t first, tmsize_t second,
+                            const char *where)
 {
-    if( first <= 0 || second <= 0 )
+    if (first <= 0 || second <= 0)
     {
-        if( tif != NULL && where != NULL )
+        if (tif != NULL && where != NULL)
         {
-            TIFFErrorExt(tif->tif_clientdata, where,
-                        "Invalid argument to _TIFFMultiplySSize() in %s", where);
+            TIFFErrorExtR(tif, where,
+                          "Invalid argument to _TIFFMultiplySSize() in %s",
+                          where);
         }
         return 0;
     }
 
-    if( first > TIFF_TMSIZE_T_MAX / second )
+    if (first > TIFF_TMSIZE_T_MAX / second)
     {
-        if( tif != NULL && where != NULL )
+        if (tif != NULL && where != NULL)
         {
-            TIFFErrorExt(tif->tif_clientdata, where,
-                        "Integer overflow in %s", where);
+            TIFFErrorExtR(tif, where, "Integer overflow in %s", where);
         }
         return 0;
     }
     return first * second;
 }
 
-tmsize_t _TIFFCastUInt64ToSSize(TIFF* tif, uint64 val, const char* module)
+tmsize_t _TIFFCastUInt64ToSSize(TIFF *tif, uint64_t val, const char *module)
 {
-    if( val > (uint64)TIFF_TMSIZE_T_MAX )
+    if (val > (uint64_t)TIFF_TMSIZE_T_MAX)
     {
-        if( tif != NULL && module != NULL )
+        if (tif != NULL && module != NULL)
         {
-            TIFFErrorExt(tif->tif_clientdata,module,"Integer overflow");
+            TIFFErrorExtR(tif, module, "Integer overflow");
         }
         return 0;
     }
     return (tmsize_t)val;
 }
 
-void*
-_TIFFCheckRealloc(TIFF* tif, void* buffer,
-		  tmsize_t nmemb, tmsize_t elem_size, const char* what)
+void *_TIFFCheckRealloc(TIFF *tif, void *buffer, tmsize_t nmemb,
+                        tmsize_t elem_size, const char *what)
 {
-	void* cp = NULL;
-        tmsize_t count = _TIFFMultiplySSize(tif, nmemb, elem_size, NULL);
-	/*
-	 * Check for integer overflow.
-	 */
-	if (count != 0)
-	{
-		cp = _TIFFrealloc(buffer, count);
-	}
-
-	if (cp == NULL) {
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-			     "Failed to allocate memory for %s "
-			     "(%ld elements of %ld bytes each)",
-			     what,(long) nmemb, (long) elem_size);
-	}
-
-	return cp;
+    void *cp = NULL;
+    tmsize_t count = _TIFFMultiplySSize(tif, nmemb, elem_size, NULL);
+    /*
+     * Check for integer overflow.
+     */
+    if (count != 0)
+    {
+        cp = _TIFFreallocExt(tif, buffer, count);
+    }
+
+    if (cp == NULL)
+    {
+        TIFFErrorExtR(tif, tif->tif_name,
+                      "Failed to allocate memory for %s "
+                      "(%" TIFF_SSIZE_FORMAT " elements of %" TIFF_SSIZE_FORMAT
+                      " bytes each)",
+                      what, nmemb, elem_size);
+    }
+
+    return cp;
 }
 
-void*
-_TIFFCheckMalloc(TIFF* tif, tmsize_t nmemb, tmsize_t elem_size, const char* what)
+void *_TIFFCheckMalloc(TIFF *tif, tmsize_t nmemb, tmsize_t elem_size,
+                       const char *what)
 {
-	return _TIFFCheckRealloc(tif, NULL, nmemb, elem_size, what);  
+    return _TIFFCheckRealloc(tif, NULL, nmemb, elem_size, what);
 }
 
-static int
-TIFFDefaultTransferFunction(TIFFDirectory* td)
+static int TIFFDefaultTransferFunction(TIFF *tif, TIFFDirectory *td)
 {
-	uint16 **tf = td->td_transferfunction;
-	tmsize_t i, n, nbytes;
-
-	tf[0] = tf[1] = tf[2] = 0;
-	if (td->td_bitspersample >= sizeof(tmsize_t) * 8 - 2)
-		return 0;
-
-	n = ((tmsize_t)1)<<td->td_bitspersample;
-	nbytes = n * sizeof (uint16);
-        tf[0] = (uint16 *)_TIFFmalloc(nbytes);
-	if (tf[0] == NULL)
-		return 0;
-	tf[0][0] = 0;
-	for (i = 1; i < n; i++) {
-		double t = (double)i/((double) n-1.);
-		tf[0][i] = (uint16)floor(65535.*pow(t, 2.2) + .5);
-	}
-
-	if (td->td_samplesperpixel - td->td_extrasamples > 1) {
-                tf[1] = (uint16 *)_TIFFmalloc(nbytes);
-		if(tf[1] == NULL)
-			goto bad;
-		_TIFFmemcpy(tf[1], tf[0], nbytes);
-                tf[2] = (uint16 *)_TIFFmalloc(nbytes);
-		if (tf[2] == NULL)
-			goto bad;
-		_TIFFmemcpy(tf[2], tf[0], nbytes);
-	}
-	return 1;
+    uint16_t **tf = td->td_transferfunction;
+    tmsize_t i, n, nbytes;
+
+    tf[0] = tf[1] = tf[2] = 0;
+    if (td->td_bitspersample >= sizeof(tmsize_t) * 8 - 2)
+        return 0;
+
+    n = ((tmsize_t)1) << td->td_bitspersample;
+    nbytes = n * sizeof(uint16_t);
+    tf[0] = (uint16_t *)_TIFFmallocExt(tif, nbytes);
+    if (tf[0] == NULL)
+        return 0;
+    tf[0][0] = 0;
+    for (i = 1; i < n; i++)
+    {
+        double t = (double)i / ((double)n - 1.);
+        tf[0][i] = (uint16_t)floor(65535. * pow(t, 2.2) + .5);
+    }
+
+    if (td->td_samplesperpixel - td->td_extrasamples > 1)
+    {
+        tf[1] = (uint16_t *)_TIFFmallocExt(tif, nbytes);
+        if (tf[1] == NULL)
+            goto bad;
+        _TIFFmemcpy(tf[1], tf[0], nbytes);
+        tf[2] = (uint16_t *)_TIFFmallocExt(tif, nbytes);
+        if (tf[2] == NULL)
+            goto bad;
+        _TIFFmemcpy(tf[2], tf[0], nbytes);
+    }
+    return 1;
 
 bad:
-	if (tf[0])
-		_TIFFfree(tf[0]);
-	if (tf[1])
-		_TIFFfree(tf[1]);
-	if (tf[2])
-		_TIFFfree(tf[2]);
-	tf[0] = tf[1] = tf[2] = 0;
-	return 0;
+    if (tf[0])
+        _TIFFfreeExt(tif, tf[0]);
+    if (tf[1])
+        _TIFFfreeExt(tif, tf[1]);
+    if (tf[2])
+        _TIFFfreeExt(tif, tf[2]);
+    tf[0] = tf[1] = tf[2] = 0;
+    return 0;
 }
 
-static int
-TIFFDefaultRefBlackWhite(TIFFDirectory* td)
+static int TIFFDefaultRefBlackWhite(TIFF *tif, TIFFDirectory *td)
 {
-	int i;
-
-        td->td_refblackwhite = (float *)_TIFFmalloc(6*sizeof (float));
-	if (td->td_refblackwhite == NULL)
-		return 0;
-        if (td->td_photometric == PHOTOMETRIC_YCBCR) {
-		/*
-		 * YCbCr (Class Y) images must have the ReferenceBlackWhite
-		 * tag set. Fix the broken images, which lacks that tag.
-		 */
-		td->td_refblackwhite[0] = 0.0F;
-		td->td_refblackwhite[1] = td->td_refblackwhite[3] =
-			td->td_refblackwhite[5] = 255.0F;
-		td->td_refblackwhite[2] = td->td_refblackwhite[4] = 128.0F;
-	} else {
-		/*
-		 * Assume RGB (Class R)
-		 */
-		for (i = 0; i < 3; i++) {
-		    td->td_refblackwhite[2*i+0] = 0;
-		    td->td_refblackwhite[2*i+1] =
-			    (float)((1L<<td->td_bitspersample)-1L);
-		}
-	}
-	return 1;
+    int i;
+
+    td->td_refblackwhite = (float *)_TIFFmallocExt(tif, 6 * sizeof(float));
+    if (td->td_refblackwhite == NULL)
+        return 0;
+    if (td->td_photometric == PHOTOMETRIC_YCBCR)
+    {
+        /*
+         * YCbCr (Class Y) images must have the ReferenceBlackWhite
+         * tag set. Fix the broken images, which lacks that tag.
+         */
+        td->td_refblackwhite[0] = 0.0F;
+        td->td_refblackwhite[1] = td->td_refblackwhite[3] =
+            td->td_refblackwhite[5] = 255.0F;
+        td->td_refblackwhite[2] = td->td_refblackwhite[4] = 128.0F;
+    }
+    else
+    {
+        /*
+         * Assume RGB (Class R)
+         */
+        for (i = 0; i < 3; i++)
+        {
+            td->td_refblackwhite[2 * i + 0] = 0;
+            td->td_refblackwhite[2 * i + 1] =
+                (float)((1L << td->td_bitspersample) - 1L);
+        }
+    }
+    return 1;
 }
 
 /*
@@ -204,216 +211,248 @@ TIFFDefaultRefBlackWhite(TIFFDirectory* td)
  *	explicit values so that defaults exist only one
  *	place in the library -- in TIFFDefaultDirectory.
  */
-int
-TIFFVGetFieldDefaulted(TIFF* tif, uint32 tag, va_list ap)
+int TIFFVGetFieldDefaulted(TIFF *tif, uint32_t tag, va_list ap)
 {
-	TIFFDirectory *td = &tif->tif_dir;
-
-	if (TIFFVGetField(tif, tag, ap))
-		return (1);
-	switch (tag) {
-	case TIFFTAG_SUBFILETYPE:
-		*va_arg(ap, uint32 *) = td->td_subfiletype;
-		return (1);
-	case TIFFTAG_BITSPERSAMPLE:
-		*va_arg(ap, uint16 *) = td->td_bitspersample;
-		return (1);
-	case TIFFTAG_THRESHHOLDING:
-		*va_arg(ap, uint16 *) = td->td_threshholding;
-		return (1);
-	case TIFFTAG_FILLORDER:
-		*va_arg(ap, uint16 *) = td->td_fillorder;
-		return (1);
-	case TIFFTAG_ORIENTATION:
-		*va_arg(ap, uint16 *) = td->td_orientation;
-		return (1);
-	case TIFFTAG_SAMPLESPERPIXEL:
-		*va_arg(ap, uint16 *) = td->td_samplesperpixel;
-		return (1);
-	case TIFFTAG_ROWSPERSTRIP:
-		*va_arg(ap, uint32 *) = td->td_rowsperstrip;
-		return (1);
-	case TIFFTAG_MINSAMPLEVALUE:
-		*va_arg(ap, uint16 *) = td->td_minsamplevalue;
-		return (1);
-	case TIFFTAG_MAXSAMPLEVALUE:
-		*va_arg(ap, uint16 *) = td->td_maxsamplevalue;
-		return (1);
-	case TIFFTAG_PLANARCONFIG:
-		*va_arg(ap, uint16 *) = td->td_planarconfig;
-		return (1);
-	case TIFFTAG_RESOLUTIONUNIT:
-		*va_arg(ap, uint16 *) = td->td_resolutionunit;
-		return (1);
-	case TIFFTAG_PREDICTOR:
+    TIFFDirectory *td = &tif->tif_dir;
+
+    if (TIFFVGetField(tif, tag, ap))
+        return (1);
+    switch (tag)
     {
-        TIFFPredictorState* sp = (TIFFPredictorState*) tif->tif_data;
-        if( sp == NULL )
+        case TIFFTAG_SUBFILETYPE:
+            *va_arg(ap, uint32_t *) = td->td_subfiletype;
+            return (1);
+        case TIFFTAG_BITSPERSAMPLE:
+            *va_arg(ap, uint16_t *) = td->td_bitspersample;
+            return (1);
+        case TIFFTAG_THRESHHOLDING:
+            *va_arg(ap, uint16_t *) = td->td_threshholding;
+            return (1);
+        case TIFFTAG_FILLORDER:
+            *va_arg(ap, uint16_t *) = td->td_fillorder;
+            return (1);
+        case TIFFTAG_ORIENTATION:
+            *va_arg(ap, uint16_t *) = td->td_orientation;
+            return (1);
+        case TIFFTAG_SAMPLESPERPIXEL:
+            *va_arg(ap, uint16_t *) = td->td_samplesperpixel;
+            return (1);
+        case TIFFTAG_ROWSPERSTRIP:
+            *va_arg(ap, uint32_t *) = td->td_rowsperstrip;
+            return (1);
+        case TIFFTAG_MINSAMPLEVALUE:
+            *va_arg(ap, uint16_t *) = td->td_minsamplevalue;
+            return (1);
+        case TIFFTAG_MAXSAMPLEVALUE:
+        {
+            uint16_t maxsamplevalue;
+            /* td_bitspersample=1 is always set in TIFFDefaultDirectory().
+             * Therefore, td_maxsamplevalue has to be re-calculated in
+             * TIFFGetFieldDefaulted(). */
+            if (td->td_bitspersample > 0)
+            {
+                /* This shift operation into a uint16_t limits the value to
+                 * 65535 even if td_bitspersamle is > 16 */
+                if (td->td_bitspersample <= 16)
+                {
+                    maxsamplevalue = (1 << td->td_bitspersample) -
+                                     1; /* 2**(BitsPerSample) - 1 */
+                }
+                else
+                {
+                    maxsamplevalue = 65535;
+                }
+            }
+            else
+            {
+                maxsamplevalue = 0;
+            }
+            *va_arg(ap, uint16_t *) = maxsamplevalue;
+            return (1);
+        }
+        case TIFFTAG_PLANARCONFIG:
+            *va_arg(ap, uint16_t *) = td->td_planarconfig;
+            return (1);
+        case TIFFTAG_RESOLUTIONUNIT:
+            *va_arg(ap, uint16_t *) = td->td_resolutionunit;
+            return (1);
+        case TIFFTAG_PREDICTOR:
         {
-            TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-                         "Cannot get \"Predictor\" tag as plugin is not configured");
-            *va_arg(ap, uint16*) = 0;
-            return 0;
+            TIFFPredictorState *sp = (TIFFPredictorState *)tif->tif_data;
+            if (sp == NULL)
+            {
+                TIFFErrorExtR(
+                    tif, tif->tif_name,
+                    "Cannot get \"Predictor\" tag as plugin is not configured");
+                *va_arg(ap, uint16_t *) = 0;
+                return 0;
+            }
+            *va_arg(ap, uint16_t *) = (uint16_t)sp->predictor;
+            return 1;
         }
-        *va_arg(ap, uint16*) = (uint16) sp->predictor;
-        return 1;
+        case TIFFTAG_DOTRANGE:
+            *va_arg(ap, uint16_t *) = 0;
+            *va_arg(ap, uint16_t *) = (1 << td->td_bitspersample) - 1;
+            return (1);
+        case TIFFTAG_INKSET:
+            *va_arg(ap, uint16_t *) = INKSET_CMYK;
+            return 1;
+        case TIFFTAG_NUMBEROFINKS:
+            *va_arg(ap, uint16_t *) = 4;
+            return (1);
+        case TIFFTAG_EXTRASAMPLES:
+            *va_arg(ap, uint16_t *) = td->td_extrasamples;
+            *va_arg(ap, const uint16_t **) = td->td_sampleinfo;
+            return (1);
+        case TIFFTAG_MATTEING:
+            *va_arg(ap, uint16_t *) =
+                (td->td_extrasamples == 1 &&
+                 td->td_sampleinfo[0] == EXTRASAMPLE_ASSOCALPHA);
+            return (1);
+        case TIFFTAG_TILEDEPTH:
+            *va_arg(ap, uint32_t *) = td->td_tiledepth;
+            return (1);
+        case TIFFTAG_DATATYPE:
+            *va_arg(ap, uint16_t *) = td->td_sampleformat - 1;
+            return (1);
+        case TIFFTAG_SAMPLEFORMAT:
+            *va_arg(ap, uint16_t *) = td->td_sampleformat;
+            return (1);
+        case TIFFTAG_IMAGEDEPTH:
+            *va_arg(ap, uint32_t *) = td->td_imagedepth;
+            return (1);
+        case TIFFTAG_YCBCRCOEFFICIENTS:
+        {
+            /* defaults are from CCIR Recommendation 601-1 */
+            static const float ycbcrcoeffs[] = {0.299f, 0.587f, 0.114f};
+            *va_arg(ap, const float **) = ycbcrcoeffs;
+            return 1;
+        }
+        case TIFFTAG_YCBCRSUBSAMPLING:
+            *va_arg(ap, uint16_t *) = td->td_ycbcrsubsampling[0];
+            *va_arg(ap, uint16_t *) = td->td_ycbcrsubsampling[1];
+            return (1);
+        case TIFFTAG_YCBCRPOSITIONING:
+            *va_arg(ap, uint16_t *) = td->td_ycbcrpositioning;
+            return (1);
+        case TIFFTAG_WHITEPOINT:
+        {
+            /* TIFF 6.0 specification tells that it is no default
+               value for the WhitePoint, but AdobePhotoshop TIFF
+               Technical Note tells that it should be CIE D50. */
+            static const float whitepoint[] = {
+                D50_X0 / (D50_X0 + D50_Y0 + D50_Z0),
+                D50_Y0 / (D50_X0 + D50_Y0 + D50_Z0)};
+            *va_arg(ap, const float **) = whitepoint;
+            return 1;
+        }
+        case TIFFTAG_TRANSFERFUNCTION:
+            if (!td->td_transferfunction[0] &&
+                !TIFFDefaultTransferFunction(tif, td))
+            {
+                TIFFErrorExtR(tif, tif->tif_name,
+                              "No space for \"TransferFunction\" tag");
+                return (0);
+            }
+            *va_arg(ap, const uint16_t **) = td->td_transferfunction[0];
+            if (td->td_samplesperpixel - td->td_extrasamples > 1)
+            {
+                *va_arg(ap, const uint16_t **) = td->td_transferfunction[1];
+                *va_arg(ap, const uint16_t **) = td->td_transferfunction[2];
+            }
+            return (1);
+        case TIFFTAG_REFERENCEBLACKWHITE:
+            if (!td->td_refblackwhite && !TIFFDefaultRefBlackWhite(tif, td))
+                return (0);
+            *va_arg(ap, const float **) = td->td_refblackwhite;
+            return (1);
     }
-	case TIFFTAG_DOTRANGE:
-		*va_arg(ap, uint16 *) = 0;
-		*va_arg(ap, uint16 *) = (1<<td->td_bitspersample)-1;
-		return (1);
-	case TIFFTAG_INKSET:
-		*va_arg(ap, uint16 *) = INKSET_CMYK;
-		return 1;
-	case TIFFTAG_NUMBEROFINKS:
-		*va_arg(ap, uint16 *) = 4;
-		return (1);
-	case TIFFTAG_EXTRASAMPLES:
-		*va_arg(ap, uint16 *) = td->td_extrasamples;
-		*va_arg(ap, const uint16 **) = td->td_sampleinfo;
-		return (1);
-	case TIFFTAG_MATTEING:
-		*va_arg(ap, uint16 *) =
-		    (td->td_extrasamples == 1 &&
-		     td->td_sampleinfo[0] == EXTRASAMPLE_ASSOCALPHA);
-		return (1);
-	case TIFFTAG_TILEDEPTH:
-		*va_arg(ap, uint32 *) = td->td_tiledepth;
-		return (1);
-	case TIFFTAG_DATATYPE:
-		*va_arg(ap, uint16 *) = td->td_sampleformat-1;
-		return (1);
-	case TIFFTAG_SAMPLEFORMAT:
-		*va_arg(ap, uint16 *) = td->td_sampleformat;
-                return(1);
-	case TIFFTAG_IMAGEDEPTH:
-		*va_arg(ap, uint32 *) = td->td_imagedepth;
-		return (1);
-	case TIFFTAG_YCBCRCOEFFICIENTS:
-		{
-			/* defaults are from CCIR Recommendation 601-1 */
-			static const float ycbcrcoeffs[] = { 0.299f, 0.587f, 0.114f };
-			*va_arg(ap, const float **) = ycbcrcoeffs;
-			return 1;
-		}
-	case TIFFTAG_YCBCRSUBSAMPLING:
-		*va_arg(ap, uint16 *) = td->td_ycbcrsubsampling[0];
-		*va_arg(ap, uint16 *) = td->td_ycbcrsubsampling[1];
-		return (1);
-	case TIFFTAG_YCBCRPOSITIONING:
-		*va_arg(ap, uint16 *) = td->td_ycbcrpositioning;
-		return (1);
-	case TIFFTAG_WHITEPOINT:
-		{
-			/* TIFF 6.0 specification tells that it is no default
-			   value for the WhitePoint, but AdobePhotoshop TIFF
-			   Technical Note tells that it should be CIE D50. */
-			static const float whitepoint[] = {
-						D50_X0 / (D50_X0 + D50_Y0 + D50_Z0),
-						D50_Y0 / (D50_X0 + D50_Y0 + D50_Z0)
-			};
-			*va_arg(ap, const float **) = whitepoint;
-			return 1;
-		}
-	case TIFFTAG_TRANSFERFUNCTION:
-		if (!td->td_transferfunction[0] &&
-		    !TIFFDefaultTransferFunction(td)) {
-			TIFFErrorExt(tif->tif_clientdata, tif->tif_name, "No space for \"TransferFunction\" tag");
-			return (0);
-		}
-		*va_arg(ap, const uint16 **) = td->td_transferfunction[0];
-		if (td->td_samplesperpixel - td->td_extrasamples > 1) {
-			*va_arg(ap, const uint16 **) = td->td_transferfunction[1];
-			*va_arg(ap, const uint16 **) = td->td_transferfunction[2];
-		}
-		return (1);
-	case TIFFTAG_REFERENCEBLACKWHITE:
-		if (!td->td_refblackwhite && !TIFFDefaultRefBlackWhite(td))
-			return (0);
-		*va_arg(ap, const float **) = td->td_refblackwhite;
-		return (1);
-	}
-	return 0;
+    return 0;
 }
 
 /*
  * Like TIFFGetField, but return any default
  * value if the tag is not present in the directory.
  */
-int
-TIFFGetFieldDefaulted(TIFF* tif, uint32 tag, ...)
+int TIFFGetFieldDefaulted(TIFF *tif, uint32_t tag, ...)
 {
-	int ok;
-	va_list ap;
+    int ok;
+    va_list ap;
 
-	va_start(ap, tag);
-	ok =  TIFFVGetFieldDefaulted(tif, tag, ap);
-	va_end(ap);
-	return (ok);
+    va_start(ap, tag);
+    ok = TIFFVGetFieldDefaulted(tif, tag, ap);
+    va_end(ap);
+    return (ok);
 }
 
-struct _Int64Parts {
-	int32 low, high;
+struct _Int64Parts
+{
+    int32_t low, high;
 };
 
-typedef union {
-	struct _Int64Parts part;
-	int64 value;
+typedef union
+{
+    struct _Int64Parts part;
+    int64_t value;
 } _Int64;
 
-float
-_TIFFUInt64ToFloat(uint64 ui64)
+float _TIFFUInt64ToFloat(uint64_t ui64)
 {
-	_Int64 i;
-
-	i.value = ui64;
-	if (i.part.high >= 0) {
-		return (float)i.value;
-	} else {
-		long double df;
-		df = (long double)i.value;
-		df += 18446744073709551616.0; /* adding 2**64 */
-		return (float)df;
-	}
+    _Int64 i;
+
+    i.value = ui64;
+    if (i.part.high >= 0)
+    {
+        return (float)i.value;
+    }
+    else
+    {
+        long double df;
+        df = (long double)i.value;
+        df += 18446744073709551616.0; /* adding 2**64 */
+        return (float)df;
+    }
 }
 
-double
-_TIFFUInt64ToDouble(uint64 ui64)
+double _TIFFUInt64ToDouble(uint64_t ui64)
 {
-	_Int64 i;
-
-	i.value = ui64;
-	if (i.part.high >= 0) {
-		return (double)i.value;
-	} else {
-		long double df;
-		df = (long double)i.value;
-		df += 18446744073709551616.0; /* adding 2**64 */
-		return (double)df;
-	}
+    _Int64 i;
+
+    i.value = ui64;
+    if (i.part.high >= 0)
+    {
+        return (double)i.value;
+    }
+    else
+    {
+        long double df;
+        df = (long double)i.value;
+        df += 18446744073709551616.0; /* adding 2**64 */
+        return (double)df;
+    }
 }
 
-float _TIFFClampDoubleToFloat( double val )
+float _TIFFClampDoubleToFloat(double val)
 {
-    if( val > FLT_MAX )
+    if (val > FLT_MAX)
         return FLT_MAX;
-    if( val < -FLT_MAX )
+    if (val < -FLT_MAX)
         return -FLT_MAX;
     return (float)val;
 }
 
-int _TIFFSeekOK(TIFF* tif, toff_t off)
+uint32_t _TIFFClampDoubleToUInt32(double val)
+{
+    if (val < 0)
+        return 0;
+    if (val > 0xFFFFFFFFU || val != val)
+        return 0xFFFFFFFFU;
+    return (uint32_t)val;
+}
+
+int _TIFFSeekOK(TIFF *tif, toff_t off)
 {
     /* Huge offsets, especially -1 / UINT64_MAX, can cause issues */
     /* See http://bugzilla.maptools.org/show_bug.cgi?id=2726 */
-    return off <= (~(uint64)0)/2 && TIFFSeekFile(tif,off,SEEK_SET)==off;
+    return off <= (~(uint64_t)0) / 2 && TIFFSeekFile(tif, off, SEEK_SET) == off;
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_close.c b/3rdparty/libtiff/tif_close.c
index e4228df9c9be..907d7f139b7a 100644
--- a/3rdparty/libtiff/tif_close.c
+++ b/3rdparty/libtiff/tif_close.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -37,70 +37,98 @@
  * completely freed, so you should save opened file handle and pointer
  * to the close procedure in external variables before calling
  * _TIFFCleanup(), if you will need these ones to close the file.
- * 
+ *
  * @param tif A TIFF pointer.
  */
 
-void
-TIFFCleanup(TIFF* tif)
+void TIFFCleanup(TIFF *tif)
 {
-	/*
-         * Flush buffered data and directory (if dirty).
-         */
-	if (tif->tif_mode != O_RDONLY)
-		TIFFFlush(tif);
-	(*tif->tif_cleanup)(tif);
-	TIFFFreeDirectory(tif);
-
-	if (tif->tif_dirlist)
-		_TIFFfree(tif->tif_dirlist);
-
-	/*
-         * Clean up client info links.
-         */
-	while( tif->tif_clientinfo )
-	{
-		TIFFClientInfoLink *psLink = tif->tif_clientinfo;
-
-		tif->tif_clientinfo = psLink->next;
-		_TIFFfree( psLink->name );
-		_TIFFfree( psLink );
-	}
-
-	if (tif->tif_rawdata && (tif->tif_flags&TIFF_MYBUFFER))
-		_TIFFfree(tif->tif_rawdata);
-	if (isMapped(tif))
-		TIFFUnmapFileContents(tif, tif->tif_base, (toff_t)tif->tif_size);
-
-	/*
-         * Clean up custom fields.
-         */
-	if (tif->tif_fields && tif->tif_nfields > 0) {
-		uint32 i;
-
-		for (i = 0; i < tif->tif_nfields; i++) {
-			TIFFField *fld = tif->tif_fields[i];
-			if (fld->field_bit == FIELD_CUSTOM &&
-			    strncmp("Tag ", fld->field_name, 4) == 0) {
-				_TIFFfree(fld->field_name);
-				_TIFFfree(fld);
-			}
-		}
-
-		_TIFFfree(tif->tif_fields);
-	}
-
-        if (tif->tif_nfieldscompat > 0) {
-                uint32 i;
-
-                for (i = 0; i < tif->tif_nfieldscompat; i++) {
-                        if (tif->tif_fieldscompat[i].allocated_size)
-                                _TIFFfree(tif->tif_fieldscompat[i].fields);
+    /*
+     * Flush buffered data and directory (if dirty).
+     */
+    if (tif->tif_mode != O_RDONLY)
+        TIFFFlush(tif);
+    (*tif->tif_cleanup)(tif);
+    TIFFFreeDirectory(tif);
+
+    _TIFFCleanupIFDOffsetAndNumberMaps(tif);
+
+    /*
+     * Clean up client info links.
+     */
+    while (tif->tif_clientinfo)
+    {
+        TIFFClientInfoLink *psLink = tif->tif_clientinfo;
+
+        tif->tif_clientinfo = psLink->next;
+        _TIFFfreeExt(tif, psLink->name);
+        _TIFFfreeExt(tif, psLink);
+    }
+
+    if (tif->tif_rawdata && (tif->tif_flags & TIFF_MYBUFFER))
+        _TIFFfreeExt(tif, tif->tif_rawdata);
+    if (isMapped(tif))
+        TIFFUnmapFileContents(tif, tif->tif_base, (toff_t)tif->tif_size);
+
+    /*
+     * Clean up custom fields.
+     */
+    if (tif->tif_fields && tif->tif_nfields > 0)
+    {
+        uint32_t i;
+
+        for (i = 0; i < tif->tif_nfields; i++)
+        {
+            TIFFField *fld = tif->tif_fields[i];
+            if (fld->field_name != NULL)
+            {
+                if (fld->field_bit == FIELD_CUSTOM &&
+                    /* caution: tif_fields[i] must not be the beginning of a
+                     * fields-array. Otherwise the following tags are also freed
+                     * with the first free().
+                     */
+                    TIFFFieldIsAnonymous(fld))
+                {
+                    _TIFFfreeExt(tif, fld->field_name);
+                    _TIFFfreeExt(tif, fld);
                 }
-                _TIFFfree(tif->tif_fieldscompat);
+            }
+        }
+
+        _TIFFfreeExt(tif, tif->tif_fields);
+    }
+
+    if (tif->tif_nfieldscompat > 0)
+    {
+        uint32_t i;
+
+        for (i = 0; i < tif->tif_nfieldscompat; i++)
+        {
+            if (tif->tif_fieldscompat[i].allocated_size)
+                _TIFFfreeExt(tif, tif->tif_fieldscompat[i].fields);
         }
+        _TIFFfreeExt(tif, tif->tif_fieldscompat);
+    }
 
-	_TIFFfree(tif);
+    _TIFFfreeExt(NULL, tif);
+}
+
+/************************************************************************/
+/*                    _TIFFCleanupIFDOffsetAndNumberMaps()              */
+/************************************************************************/
+
+void _TIFFCleanupIFDOffsetAndNumberMaps(TIFF *tif)
+{
+    if (tif->tif_map_dir_offset_to_number)
+    {
+        TIFFHashSetDestroy(tif->tif_map_dir_offset_to_number);
+        tif->tif_map_dir_offset_to_number = NULL;
+    }
+    if (tif->tif_map_dir_number_to_offset)
+    {
+        TIFFHashSetDestroy(tif->tif_map_dir_number_to_offset);
+        tif->tif_map_dir_number_to_offset = NULL;
+    }
 }
 
 /************************************************************************/
@@ -113,26 +141,18 @@ TIFFCleanup(TIFF* tif)
  * TIFFClose closes a file that was previously opened with TIFFOpen().
  * Any buffered data are flushed to the file, including the contents of
  * the current directory (if modified); and all resources are reclaimed.
- * 
+ *
  * @param tif A TIFF pointer.
  */
 
-void
-TIFFClose(TIFF* tif)
+void TIFFClose(TIFF *tif)
 {
-	TIFFCloseProc closeproc = tif->tif_closeproc;
-	thandle_t fd = tif->tif_clientdata;
-
-	TIFFCleanup(tif);
-	(void) (*closeproc)(fd);
+    if (tif != NULL)
+    {
+        TIFFCloseProc closeproc = tif->tif_closeproc;
+        thandle_t fd = tif->tif_clientdata;
+
+        TIFFCleanup(tif);
+        (void)(*closeproc)(fd);
+    }
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_codec.c b/3rdparty/libtiff/tif_codec.c
index b6c04f01d7f9..d499b63a584f 100644
--- a/3rdparty/libtiff/tif_codec.c
+++ b/3rdparty/libtiff/tif_codec.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -29,7 +29,7 @@
  */
 #include "tiffiop.h"
 
-static int NotConfigured(TIFF*, int);
+static int NotConfigured(TIFF *, int);
 
 #ifndef LZW_SUPPORT
 #define TIFFInitLZW NotConfigured
@@ -67,6 +67,9 @@ static int NotConfigured(TIFF*, int);
 #ifndef LOGLUV_SUPPORT
 #define TIFFInitSGILog NotConfigured
 #endif
+#ifndef LERC_SUPPORT
+#define TIFFInitLERC NotConfigured
+#endif
 #ifndef LZMA_SUPPORT
 #define TIFFInitLZMA NotConfigured
 #endif
@@ -80,58 +83,53 @@ static int NotConfigured(TIFF*, int);
 /*
  * Compression schemes statically built into the library.
  */
-#ifdef VMS
 const TIFFCodec _TIFFBuiltinCODECS[] = {
-#else
-TIFFCodec _TIFFBuiltinCODECS[] = {
-#endif
-    { "None",		COMPRESSION_NONE,	TIFFInitDumpMode },
-    { "LZW",		COMPRESSION_LZW,	TIFFInitLZW },
-    { "PackBits",	COMPRESSION_PACKBITS,	TIFFInitPackBits },
-    { "ThunderScan",	COMPRESSION_THUNDERSCAN,TIFFInitThunderScan },
-    { "NeXT",		COMPRESSION_NEXT,	TIFFInitNeXT },
-    { "JPEG",		COMPRESSION_JPEG,	TIFFInitJPEG },
-    { "Old-style JPEG",	COMPRESSION_OJPEG,	TIFFInitOJPEG },
-    { "CCITT RLE",	COMPRESSION_CCITTRLE,	TIFFInitCCITTRLE },
-    { "CCITT RLE/W",	COMPRESSION_CCITTRLEW,	TIFFInitCCITTRLEW },
-    { "CCITT Group 3",	COMPRESSION_CCITTFAX3,	TIFFInitCCITTFax3 },
-    { "CCITT Group 4",	COMPRESSION_CCITTFAX4,	TIFFInitCCITTFax4 },
-    { "ISO JBIG",	COMPRESSION_JBIG,	TIFFInitJBIG },
-    { "Deflate",	COMPRESSION_DEFLATE,	TIFFInitZIP },
-    { "AdobeDeflate",   COMPRESSION_ADOBE_DEFLATE , TIFFInitZIP }, 
-    { "PixarLog",	COMPRESSION_PIXARLOG,	TIFFInitPixarLog },
-    { "SGILog",		COMPRESSION_SGILOG,	TIFFInitSGILog },
-    { "SGILog24",	COMPRESSION_SGILOG24,	TIFFInitSGILog },
-    { "LZMA",		COMPRESSION_LZMA,	TIFFInitLZMA },
-    { "ZSTD",		COMPRESSION_ZSTD,	TIFFInitZSTD },
-    { "WEBP",		COMPRESSION_WEBP,	TIFFInitWebP },
-    { NULL,             0,                      NULL }
-};
+    {"None", COMPRESSION_NONE, TIFFInitDumpMode},
+    {"LZW", COMPRESSION_LZW, TIFFInitLZW},
+    {"PackBits", COMPRESSION_PACKBITS, TIFFInitPackBits},
+    {"ThunderScan", COMPRESSION_THUNDERSCAN, TIFFInitThunderScan},
+    {"NeXT", COMPRESSION_NEXT, TIFFInitNeXT},
+    {"JPEG", COMPRESSION_JPEG, TIFFInitJPEG},
+    {"Old-style JPEG", COMPRESSION_OJPEG, TIFFInitOJPEG},
+    {"CCITT RLE", COMPRESSION_CCITTRLE, TIFFInitCCITTRLE},
+    {"CCITT RLE/W", COMPRESSION_CCITTRLEW, TIFFInitCCITTRLEW},
+    {"CCITT Group 3", COMPRESSION_CCITTFAX3, TIFFInitCCITTFax3},
+    {"CCITT Group 4", COMPRESSION_CCITTFAX4, TIFFInitCCITTFax4},
+    {"ISO JBIG", COMPRESSION_JBIG, TIFFInitJBIG},
+    {"Deflate", COMPRESSION_DEFLATE, TIFFInitZIP},
+    {"AdobeDeflate", COMPRESSION_ADOBE_DEFLATE, TIFFInitZIP},
+    {"PixarLog", COMPRESSION_PIXARLOG, TIFFInitPixarLog},
+    {"SGILog", COMPRESSION_SGILOG, TIFFInitSGILog},
+    {"SGILog24", COMPRESSION_SGILOG24, TIFFInitSGILog},
+    {"LZMA", COMPRESSION_LZMA, TIFFInitLZMA},
+    {"ZSTD", COMPRESSION_ZSTD, TIFFInitZSTD},
+    {"WEBP", COMPRESSION_WEBP, TIFFInitWebP},
+    {"LERC", COMPRESSION_LERC, TIFFInitLERC},
+    {NULL, 0, NULL}};
 
-static int
-_notConfigured(TIFF* tif)
+static int _notConfigured(TIFF *tif)
 {
-	const TIFFCodec* c = TIFFFindCODEC(tif->tif_dir.td_compression);
-        char compression_code[20];
-        
-        sprintf(compression_code, "%d",tif->tif_dir.td_compression );
-	TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-                     "%s compression support is not configured", 
-                     c ? c->name : compression_code );
-	return (0);
+    const TIFFCodec *c = TIFFFindCODEC(tif->tif_dir.td_compression);
+    char compression_code[20];
+
+    snprintf(compression_code, sizeof(compression_code), "%" PRIu16,
+             tif->tif_dir.td_compression);
+    TIFFErrorExtR(tif, tif->tif_name,
+                  "%s compression support is not configured",
+                  c ? c->name : compression_code);
+    return (0);
 }
 
-static int
-NotConfigured(TIFF* tif, int scheme)
+static int NotConfigured(TIFF *tif, int scheme)
 {
-	(void) scheme;
+    (void)scheme;
 
-	tif->tif_fixuptags = _notConfigured;
-	tif->tif_decodestatus = FALSE;
-	tif->tif_setupdecode = _notConfigured;
-	tif->tif_encodestatus = FALSE;
-	tif->tif_setupencode = _notConfigured;
-	return (1);
+    tif->tif_fixuptags = _notConfigured;
+    tif->tif_decodestatus = FALSE;
+    tif->tif_setupdecode = _notConfigured;
+    tif->tif_encodestatus = FALSE;
+    tif->tif_setupencode = _notConfigured;
+    return (1);
 }
 
 /************************************************************************/
@@ -145,27 +143,21 @@ NotConfigured(TIFF* tif, int scheme)
  * 0 will be returned.
  */
 
-int
-TIFFIsCODECConfigured(uint16 scheme)
+int TIFFIsCODECConfigured(uint16_t scheme)
 {
-	const TIFFCodec* codec = TIFFFindCODEC(scheme);
+    const TIFFCodec *codec = TIFFFindCODEC(scheme);
 
-	if(codec == NULL) {
-		return 0;
-	}
-	if(codec->init == NULL) {
-		return 0;
-	}
-	if(codec->init != NotConfigured){
-		return 1;
-	}
-	return 0;
+    if (codec == NULL)
+    {
+        return 0;
+    }
+    if (codec->init == NULL)
+    {
+        return 0;
+    }
+    if (codec->init != NotConfigured)
+    {
+        return 1;
+    }
+    return 0;
 }
-
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_color.c b/3rdparty/libtiff/tif_color.c
index 8fae40ea4be6..2d7dcac6fe68 100644
--- a/3rdparty/libtiff/tif_color.c
+++ b/3rdparty/libtiff/tif_color.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -40,173 +40,191 @@
 /*
  * Convert color value from the CIE L*a*b* 1976 space to CIE XYZ.
  */
-void
-TIFFCIELabToXYZ(TIFFCIELabToRGB *cielab, uint32 l, int32 a, int32 b,
-		float *X, float *Y, float *Z)
+void TIFFCIELabToXYZ(TIFFCIELabToRGB *cielab, uint32_t l, int32_t a, int32_t b,
+                     float *X, float *Y, float *Z)
 {
-	float L = (float)l * 100.0F / 255.0F;
-	float cby, tmp;
-
-	if( L < 8.856F ) {
-		*Y = (L * cielab->Y0) / 903.292F;
-		cby = 7.787F * (*Y / cielab->Y0) + 16.0F / 116.0F;
-	} else {
-		cby = (L + 16.0F) / 116.0F;
-		*Y = cielab->Y0 * cby * cby * cby;
-	}
-
-	tmp = (float)a / 500.0F + cby;
-	if( tmp < 0.2069F )
-		*X = cielab->X0 * (tmp - 0.13793F) / 7.787F;
-	else    
-		*X = cielab->X0 * tmp * tmp * tmp;
-
-	tmp = cby - (float)b / 200.0F;
-	if( tmp < 0.2069F )
-		*Z = cielab->Z0 * (tmp - 0.13793F) / 7.787F;
-	else    
-		*Z = cielab->Z0 * tmp * tmp * tmp;
+    TIFFCIELab16ToXYZ(cielab, l * 257, a * 256, b * 256, X, Y, Z);
 }
 
-#define RINT(R) ((uint32)((R)>0?((R)+0.5):((R)-0.5)))
+/*
+ * For CIELab encoded in 16 bits, L is an unsigned integer range [0,65535].
+ * The a* and b* components are signed integers range [-32768,32767]. The 16
+ * bit chrominance values are encoded as 256 times the 1976 CIE a* and b*
+ * values
+ */
+void TIFFCIELab16ToXYZ(TIFFCIELabToRGB *cielab, uint32_t l, int32_t a,
+                       int32_t b, float *X, float *Y, float *Z)
+{
+    float L = (float)l * 100.0F / 65535.0F;
+    float cby, tmp;
+
+    if (L < 8.856F)
+    {
+        *Y = (L * cielab->Y0) / 903.292F;
+        cby = 7.787F * (*Y / cielab->Y0) + 16.0F / 116.0F;
+    }
+    else
+    {
+        cby = (L + 16.0F) / 116.0F;
+        *Y = cielab->Y0 * cby * cby * cby;
+    }
+
+    tmp = (float)a / 256.0F / 500.0F + cby;
+    if (tmp < 0.2069F)
+        *X = cielab->X0 * (tmp - 0.13793F) / 7.787F;
+    else
+        *X = cielab->X0 * tmp * tmp * tmp;
+
+    tmp = cby - (float)b / 256.0F / 200.0F;
+    if (tmp < 0.2069F)
+        *Z = cielab->Z0 * (tmp - 0.13793F) / 7.787F;
+    else
+        *Z = cielab->Z0 * tmp * tmp * tmp;
+}
+
+#define RINT(R) ((uint32_t)((R) > 0 ? ((R) + 0.5) : ((R)-0.5)))
 /*
  * Convert color value from the XYZ space to RGB.
  */
-void
-TIFFXYZToRGB(TIFFCIELabToRGB *cielab, float X, float Y, float Z,
-	     uint32 *r, uint32 *g, uint32 *b)
+void TIFFXYZToRGB(TIFFCIELabToRGB *cielab, float X, float Y, float Z,
+                  uint32_t *r, uint32_t *g, uint32_t *b)
 {
-	int i;
-	float Yr, Yg, Yb;
-	float *matrix = &cielab->display.d_mat[0][0];
-
-	/* Multiply through the matrix to get luminosity values. */
-	Yr =  matrix[0] * X + matrix[1] * Y + matrix[2] * Z;
-	Yg =  matrix[3] * X + matrix[4] * Y + matrix[5] * Z;
-	Yb =  matrix[6] * X + matrix[7] * Y + matrix[8] * Z;
-
-	/* Clip input */
-	Yr = TIFFmax(Yr, cielab->display.d_Y0R);
-	Yg = TIFFmax(Yg, cielab->display.d_Y0G);
-	Yb = TIFFmax(Yb, cielab->display.d_Y0B);
-
-	/* Avoid overflow in case of wrong input values */
-	Yr = TIFFmin(Yr, cielab->display.d_YCR);
-	Yg = TIFFmin(Yg, cielab->display.d_YCG);
-	Yb = TIFFmin(Yb, cielab->display.d_YCB);
-
-	/* Turn luminosity to colour value. */
-	i = (int)((Yr - cielab->display.d_Y0R) / cielab->rstep);
-	i = TIFFmin(cielab->range, i);
-	*r = RINT(cielab->Yr2r[i]);
-
-	i = (int)((Yg - cielab->display.d_Y0G) / cielab->gstep);
-	i = TIFFmin(cielab->range, i);
-	*g = RINT(cielab->Yg2g[i]);
-
-	i = (int)((Yb - cielab->display.d_Y0B) / cielab->bstep);
-	i = TIFFmin(cielab->range, i);
-	*b = RINT(cielab->Yb2b[i]);
-
-	/* Clip output. */
-	*r = TIFFmin(*r, cielab->display.d_Vrwr);
-	*g = TIFFmin(*g, cielab->display.d_Vrwg);
-	*b = TIFFmin(*b, cielab->display.d_Vrwb);
+    int i;
+    float Yr, Yg, Yb;
+    float *matrix = &cielab->display.d_mat[0][0];
+
+    /* Multiply through the matrix to get luminosity values. */
+    Yr = matrix[0] * X + matrix[1] * Y + matrix[2] * Z;
+    Yg = matrix[3] * X + matrix[4] * Y + matrix[5] * Z;
+    Yb = matrix[6] * X + matrix[7] * Y + matrix[8] * Z;
+
+    /* Clip input */
+    Yr = TIFFmax(Yr, cielab->display.d_Y0R);
+    Yg = TIFFmax(Yg, cielab->display.d_Y0G);
+    Yb = TIFFmax(Yb, cielab->display.d_Y0B);
+
+    /* Avoid overflow in case of wrong input values */
+    Yr = TIFFmin(Yr, cielab->display.d_YCR);
+    Yg = TIFFmin(Yg, cielab->display.d_YCG);
+    Yb = TIFFmin(Yb, cielab->display.d_YCB);
+
+    /* Turn luminosity to colour value. */
+    i = (int)((Yr - cielab->display.d_Y0R) / cielab->rstep);
+    i = TIFFmin(cielab->range, i);
+    *r = RINT(cielab->Yr2r[i]);
+
+    i = (int)((Yg - cielab->display.d_Y0G) / cielab->gstep);
+    i = TIFFmin(cielab->range, i);
+    *g = RINT(cielab->Yg2g[i]);
+
+    i = (int)((Yb - cielab->display.d_Y0B) / cielab->bstep);
+    i = TIFFmin(cielab->range, i);
+    *b = RINT(cielab->Yb2b[i]);
+
+    /* Clip output. */
+    *r = TIFFmin(*r, cielab->display.d_Vrwr);
+    *g = TIFFmin(*g, cielab->display.d_Vrwg);
+    *b = TIFFmin(*b, cielab->display.d_Vrwb);
 }
 #undef RINT
 
-/* 
+/*
  * Allocate conversion state structures and make look_up tables for
  * the Yr,Yb,Yg <=> r,g,b conversions.
  */
-int
-TIFFCIELabToRGBInit(TIFFCIELabToRGB* cielab,
-		    const TIFFDisplay *display, float *refWhite)
+int TIFFCIELabToRGBInit(TIFFCIELabToRGB *cielab, const TIFFDisplay *display,
+                        float *refWhite)
 {
-	int i;
-	double dfGamma;
-
-	cielab->range = CIELABTORGB_TABLE_RANGE;
-
-	_TIFFmemcpy(&cielab->display, display, sizeof(TIFFDisplay));
-
-	/* Red */
-	dfGamma = 1.0 / cielab->display.d_gammaR ;
-	cielab->rstep =
-		(cielab->display.d_YCR - cielab->display.d_Y0R)	/ cielab->range;
-	for(i = 0; i <= cielab->range; i++) {
-		cielab->Yr2r[i] = cielab->display.d_Vrwr
-		    * ((float)pow((double)i / cielab->range, dfGamma));
-	}
-
-	/* Green */
-	dfGamma = 1.0 / cielab->display.d_gammaG ;
-	cielab->gstep =
-	    (cielab->display.d_YCR - cielab->display.d_Y0R) / cielab->range;
-	for(i = 0; i <= cielab->range; i++) {
-		cielab->Yg2g[i] = cielab->display.d_Vrwg
-		    * ((float)pow((double)i / cielab->range, dfGamma));
-	}
-
-	/* Blue */
-	dfGamma = 1.0 / cielab->display.d_gammaB ;
-	cielab->bstep =
-	    (cielab->display.d_YCR - cielab->display.d_Y0R) / cielab->range;
-	for(i = 0; i <= cielab->range; i++) {
-		cielab->Yb2b[i] = cielab->display.d_Vrwb
-		    * ((float)pow((double)i / cielab->range, dfGamma));
-	}
-
-	/* Init reference white point */
-	cielab->X0 = refWhite[0];
-	cielab->Y0 = refWhite[1];
-	cielab->Z0 = refWhite[2];
-
-	return 0;
+    int i;
+    double dfGamma;
+
+    cielab->range = CIELABTORGB_TABLE_RANGE;
+
+    _TIFFmemcpy(&cielab->display, display, sizeof(TIFFDisplay));
+
+    /* Red */
+    dfGamma = 1.0 / cielab->display.d_gammaR;
+    cielab->rstep =
+        (cielab->display.d_YCR - cielab->display.d_Y0R) / cielab->range;
+    for (i = 0; i <= cielab->range; i++)
+    {
+        cielab->Yr2r[i] = cielab->display.d_Vrwr *
+                          ((float)pow((double)i / cielab->range, dfGamma));
+    }
+
+    /* Green */
+    dfGamma = 1.0 / cielab->display.d_gammaG;
+    cielab->gstep =
+        (cielab->display.d_YCR - cielab->display.d_Y0R) / cielab->range;
+    for (i = 0; i <= cielab->range; i++)
+    {
+        cielab->Yg2g[i] = cielab->display.d_Vrwg *
+                          ((float)pow((double)i / cielab->range, dfGamma));
+    }
+
+    /* Blue */
+    dfGamma = 1.0 / cielab->display.d_gammaB;
+    cielab->bstep =
+        (cielab->display.d_YCR - cielab->display.d_Y0R) / cielab->range;
+    for (i = 0; i <= cielab->range; i++)
+    {
+        cielab->Yb2b[i] = cielab->display.d_Vrwb *
+                          ((float)pow((double)i / cielab->range, dfGamma));
+    }
+
+    /* Init reference white point */
+    cielab->X0 = refWhite[0];
+    cielab->Y0 = refWhite[1];
+    cielab->Z0 = refWhite[2];
+
+    return 0;
 }
 
-/* 
+/*
  * Convert color value from the YCbCr space to RGB.
  * The colorspace conversion algorithm comes from the IJG v5a code;
  * see below for more information on how it works.
  */
-#define	SHIFT			16
-#define	FIX(x)			((int32)((x) * (1L<<SHIFT) + 0.5))
-#define	ONE_HALF		((int32)(1<<(SHIFT-1)))
-#define	Code2V(c, RB, RW, CR)	((((c)-(int32)(RB))*(float)(CR))/(float)(((RW)-(RB)!=0) ? ((RW)-(RB)) : 1))
-#define	CLAMP(f,min,max)	((f)<(min)?(min):(f)>(max)?(max):(f))
-#define HICLAMP(f,max)		((f)>(max)?(max):(f))
-
-void
-TIFFYCbCrtoRGB(TIFFYCbCrToRGB *ycbcr, uint32 Y, int32 Cb, int32 Cr,
-	       uint32 *r, uint32 *g, uint32 *b)
+#define SHIFT 16
+#define FIX(x) ((int32_t)((x) * (1L << SHIFT) + 0.5))
+#define ONE_HALF ((int32_t)(1 << (SHIFT - 1)))
+#define Code2V(c, RB, RW, CR)                                                  \
+    ((((c) - (int32_t)(RB)) * (float)(CR)) /                                   \
+     (float)(((RW) - (RB) != 0) ? ((RW) - (RB)) : 1))
+/* !((f)>=(min)) written that way to deal with NaN */
+#define CLAMP(f, min, max)                                                     \
+    ((!((f) >= (min))) ? (min) : (f) > (max) ? (max) : (f))
+#define HICLAMP(f, max) ((f) > (max) ? (max) : (f))
+
+void TIFFYCbCrtoRGB(TIFFYCbCrToRGB *ycbcr, uint32_t Y, int32_t Cb, int32_t Cr,
+                    uint32_t *r, uint32_t *g, uint32_t *b)
 {
-	int32 i;
-
-	/* XXX: Only 8-bit YCbCr input supported for now */
-	Y = HICLAMP(Y, 255);
-	Cb = CLAMP(Cb, 0, 255);
-	Cr = CLAMP(Cr, 0, 255);
-
-	i = ycbcr->Y_tab[Y] + ycbcr->Cr_r_tab[Cr];
-	*r = CLAMP(i, 0, 255);
-	i = ycbcr->Y_tab[Y]
-	    + (int)((ycbcr->Cb_g_tab[Cb] + ycbcr->Cr_g_tab[Cr]) >> SHIFT);
-	*g = CLAMP(i, 0, 255);
-	i = ycbcr->Y_tab[Y] + ycbcr->Cb_b_tab[Cb];
-	*b = CLAMP(i, 0, 255);
+    int32_t i;
+
+    /* XXX: Only 8-bit YCbCr input supported for now */
+    Y = HICLAMP(Y, 255);
+    Cb = CLAMP(Cb, 0, 255);
+    Cr = CLAMP(Cr, 0, 255);
+
+    i = ycbcr->Y_tab[Y] + ycbcr->Cr_r_tab[Cr];
+    *r = CLAMP(i, 0, 255);
+    i = ycbcr->Y_tab[Y] +
+        (int)((ycbcr->Cb_g_tab[Cb] + ycbcr->Cr_g_tab[Cr]) >> SHIFT);
+    *g = CLAMP(i, 0, 255);
+    i = ycbcr->Y_tab[Y] + ycbcr->Cb_b_tab[Cb];
+    *b = CLAMP(i, 0, 255);
 }
 
 /* Clamp function for sanitization purposes. Normally clamping should not */
 /* occur for well behaved chroma and refBlackWhite coefficients */
 static float CLAMPw(float v, float vmin, float vmax)
 {
-    if( v < vmin )
+    if (v < vmin)
     {
         /* printf("%f clamped to %f\n", v, vmin); */
         return vmin;
     }
-    if( v > vmax )
+    if (v > vmax)
     {
         /* printf("%f clamped to %f\n", v, vmax); */
         return vmax;
@@ -230,78 +248,75 @@ static float CLAMPw(float v, float vmin, float vmax)
  * pre-calculating possible values indexed by Cb and Cr (this code
  * assumes conversion is being done for 8-bit samples).
  */
-int
-TIFFYCbCrToRGBInit(TIFFYCbCrToRGB* ycbcr, float *luma, float *refBlackWhite)
+int TIFFYCbCrToRGBInit(TIFFYCbCrToRGB *ycbcr, float *luma, float *refBlackWhite)
 {
-    TIFFRGBValue* clamptab;
+    TIFFRGBValue *clamptab;
     int i;
-    
-#define LumaRed	    luma[0]
-#define LumaGreen   luma[1]
-#define LumaBlue    luma[2]
-
-    clamptab = (TIFFRGBValue*)(
-	(uint8*) ycbcr+TIFFroundup_32(sizeof (TIFFYCbCrToRGB), sizeof (long)));  
-    _TIFFmemset(clamptab, 0, 256);		/* v < 0 => 0 */
+
+#define LumaRed luma[0]
+#define LumaGreen luma[1]
+#define LumaBlue luma[2]
+
+    clamptab =
+        (TIFFRGBValue *)((uint8_t *)ycbcr +
+                         TIFFroundup_32(sizeof(TIFFYCbCrToRGB), sizeof(long)));
+    _TIFFmemset(clamptab, 0, 256); /* v < 0 => 0 */
     ycbcr->clamptab = (clamptab += 256);
     for (i = 0; i < 256; i++)
-	clamptab[i] = (TIFFRGBValue) i;
-    _TIFFmemset(clamptab+256, 255, 2*256);	/* v > 255 => 255 */
-    ycbcr->Cr_r_tab = (int*) (clamptab + 3*256);
+        clamptab[i] = (TIFFRGBValue)i;
+    _TIFFmemset(clamptab + 256, 255, 2 * 256); /* v > 255 => 255 */
+    ycbcr->Cr_r_tab = (int *)(clamptab + 3 * 256);
     ycbcr->Cb_b_tab = ycbcr->Cr_r_tab + 256;
-    ycbcr->Cr_g_tab = (int32*) (ycbcr->Cb_b_tab + 256);
+    ycbcr->Cr_g_tab = (int32_t *)(ycbcr->Cb_b_tab + 256);
     ycbcr->Cb_g_tab = ycbcr->Cr_g_tab + 256;
     ycbcr->Y_tab = ycbcr->Cb_g_tab + 256;
 
-    { float f1 = 2-2*LumaRed;		int32 D1 = FIX(CLAMP(f1,0.0F,2.0F));
-      float f2 = LumaRed*f1/LumaGreen;	int32 D2 = -FIX(CLAMP(f2,0.0F,2.0F));
-      float f3 = 2-2*LumaBlue;		int32 D3 = FIX(CLAMP(f3,0.0F,2.0F));
-      float f4 = LumaBlue*f3/LumaGreen;	int32 D4 = -FIX(CLAMP(f4,0.0F,2.0F));
-      int x;
+    {
+        float f1 = 2 - 2 * LumaRed;
+        int32_t D1 = FIX(CLAMP(f1, 0.0F, 2.0F));
+        float f2 = LumaRed * f1 / LumaGreen;
+        int32_t D2 = -FIX(CLAMP(f2, 0.0F, 2.0F));
+        float f3 = 2 - 2 * LumaBlue;
+        int32_t D3 = FIX(CLAMP(f3, 0.0F, 2.0F));
+        float f4 = LumaBlue * f3 / LumaGreen;
+        int32_t D4 = -FIX(CLAMP(f4, 0.0F, 2.0F));
+        int x;
 
 #undef LumaBlue
 #undef LumaGreen
 #undef LumaRed
-      
-      /*
-       * i is the actual input pixel value in the range 0..255
-       * Cb and Cr values are in the range -128..127 (actually
-       * they are in a range defined by the ReferenceBlackWhite
-       * tag) so there is some range shifting to do here when
-       * constructing tables indexed by the raw pixel data.
-       */
-      for (i = 0, x = -128; i < 256; i++, x++) {
-	    int32 Cr = (int32)CLAMPw(Code2V(x, refBlackWhite[4] - 128.0F,
-			    refBlackWhite[5] - 128.0F, 127),
-                            -128.0F * 32, 128.0F * 32);
-	    int32 Cb = (int32)CLAMPw(Code2V(x, refBlackWhite[2] - 128.0F,
-			    refBlackWhite[3] - 128.0F, 127),
-                            -128.0F * 32, 128.0F * 32);
-
-	    ycbcr->Cr_r_tab[i] = (int32)((D1*Cr + ONE_HALF)>>SHIFT);
-	    ycbcr->Cb_b_tab[i] = (int32)((D3*Cb + ONE_HALF)>>SHIFT);
-	    ycbcr->Cr_g_tab[i] = D2*Cr;
-	    ycbcr->Cb_g_tab[i] = D4*Cb + ONE_HALF;
-	    ycbcr->Y_tab[i] =
-		    (int32)CLAMPw(Code2V(x + 128, refBlackWhite[0], refBlackWhite[1], 255),
-                                  -128.0F * 32, 128.0F * 32);
-      }
+
+        /*
+         * i is the actual input pixel value in the range 0..255
+         * Cb and Cr values are in the range -128..127 (actually
+         * they are in a range defined by the ReferenceBlackWhite
+         * tag) so there is some range shifting to do here when
+         * constructing tables indexed by the raw pixel data.
+         */
+        for (i = 0, x = -128; i < 256; i++, x++)
+        {
+            int32_t Cr = (int32_t)CLAMPw(Code2V(x, refBlackWhite[4] - 128.0F,
+                                                refBlackWhite[5] - 128.0F, 127),
+                                         -128.0F * 32, 128.0F * 32);
+            int32_t Cb = (int32_t)CLAMPw(Code2V(x, refBlackWhite[2] - 128.0F,
+                                                refBlackWhite[3] - 128.0F, 127),
+                                         -128.0F * 32, 128.0F * 32);
+
+            ycbcr->Cr_r_tab[i] = (int32_t)((D1 * Cr + ONE_HALF) >> SHIFT);
+            ycbcr->Cb_b_tab[i] = (int32_t)((D3 * Cb + ONE_HALF) >> SHIFT);
+            ycbcr->Cr_g_tab[i] = D2 * Cr;
+            ycbcr->Cb_g_tab[i] = D4 * Cb + ONE_HALF;
+            ycbcr->Y_tab[i] = (int32_t)CLAMPw(
+                Code2V(x + 128, refBlackWhite[0], refBlackWhite[1], 255),
+                -128.0F * 32, 128.0F * 32);
+        }
     }
 
     return 0;
 }
-#undef	HICLAMP
-#undef	CLAMP
-#undef	Code2V
-#undef	SHIFT
-#undef	ONE_HALF
-#undef	FIX
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
+#undef HICLAMP
+#undef CLAMP
+#undef Code2V
+#undef SHIFT
+#undef ONE_HALF
+#undef FIX
diff --git a/3rdparty/libtiff/tif_compress.c b/3rdparty/libtiff/tif_compress.c
index 915478f500c8..c6e17d3e1142 100644
--- a/3rdparty/libtiff/tif_compress.c
+++ b/3rdparty/libtiff/tif_compress.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -29,145 +29,152 @@
  */
 #include "tiffiop.h"
 
-static int
-TIFFNoEncode(TIFF* tif, const char* method)
+static int TIFFNoEncode(TIFF *tif, const char *method)
 {
-	const TIFFCodec* c = TIFFFindCODEC(tif->tif_dir.td_compression);
-
-	if (c) {
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-			     "%s %s encoding is not implemented",
-			     c->name, method);
-	} else {
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-			"Compression scheme %u %s encoding is not implemented",
-			     tif->tif_dir.td_compression, method);
-	}
-	return (-1);
+    const TIFFCodec *c = TIFFFindCODEC(tif->tif_dir.td_compression);
+
+    if (c)
+    {
+        TIFFErrorExtR(tif, tif->tif_name, "%s %s encoding is not implemented",
+                      c->name, method);
+    }
+    else
+    {
+        TIFFErrorExtR(tif, tif->tif_name,
+                      "Compression scheme %" PRIu16
+                      " %s encoding is not implemented",
+                      tif->tif_dir.td_compression, method);
+    }
+    return (-1);
 }
 
-int
-_TIFFNoRowEncode(TIFF* tif, uint8* pp, tmsize_t cc, uint16 s)
+int _TIFFNoRowEncode(TIFF *tif, uint8_t *pp, tmsize_t cc, uint16_t s)
 {
-	(void) pp; (void) cc; (void) s;
-	return (TIFFNoEncode(tif, "scanline"));
+    (void)pp;
+    (void)cc;
+    (void)s;
+    return (TIFFNoEncode(tif, "scanline"));
 }
 
-int
-_TIFFNoStripEncode(TIFF* tif, uint8* pp, tmsize_t cc, uint16 s)
+int _TIFFNoStripEncode(TIFF *tif, uint8_t *pp, tmsize_t cc, uint16_t s)
 {
-	(void) pp; (void) cc; (void) s;
-	return (TIFFNoEncode(tif, "strip"));
+    (void)pp;
+    (void)cc;
+    (void)s;
+    return (TIFFNoEncode(tif, "strip"));
 }
 
-int
-_TIFFNoTileEncode(TIFF* tif, uint8* pp, tmsize_t cc, uint16 s)
+int _TIFFNoTileEncode(TIFF *tif, uint8_t *pp, tmsize_t cc, uint16_t s)
 {
-	(void) pp; (void) cc; (void) s;
-	return (TIFFNoEncode(tif, "tile"));
+    (void)pp;
+    (void)cc;
+    (void)s;
+    return (TIFFNoEncode(tif, "tile"));
 }
 
-static int
-TIFFNoDecode(TIFF* tif, const char* method)
+static int TIFFNoDecode(TIFF *tif, const char *method)
 {
-	const TIFFCodec* c = TIFFFindCODEC(tif->tif_dir.td_compression);
-
-	if (c)
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-			     "%s %s decoding is not implemented",
-			     c->name, method);
-	else
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-			     "Compression scheme %u %s decoding is not implemented",
-			     tif->tif_dir.td_compression, method);
-	return (0);
+    const TIFFCodec *c = TIFFFindCODEC(tif->tif_dir.td_compression);
+
+    if (c)
+        TIFFErrorExtR(tif, tif->tif_name, "%s %s decoding is not implemented",
+                      c->name, method);
+    else
+        TIFFErrorExtR(tif, tif->tif_name,
+                      "Compression scheme %" PRIu16
+                      " %s decoding is not implemented",
+                      tif->tif_dir.td_compression, method);
+    return (0);
 }
 
-static int
-_TIFFNoFixupTags(TIFF* tif)
+static int _TIFFNoFixupTags(TIFF *tif)
 {
-	(void) tif;
-	return (1);
+    (void)tif;
+    return (1);
 }
 
-int
-_TIFFNoRowDecode(TIFF* tif, uint8* pp, tmsize_t cc, uint16 s)
+int _TIFFNoRowDecode(TIFF *tif, uint8_t *pp, tmsize_t cc, uint16_t s)
 {
-	(void) pp; (void) cc; (void) s;
-	return (TIFFNoDecode(tif, "scanline"));
+    (void)pp;
+    (void)cc;
+    (void)s;
+    return (TIFFNoDecode(tif, "scanline"));
 }
 
-int
-_TIFFNoStripDecode(TIFF* tif, uint8* pp, tmsize_t cc, uint16 s)
+int _TIFFNoStripDecode(TIFF *tif, uint8_t *pp, tmsize_t cc, uint16_t s)
 {
-	(void) pp; (void) cc; (void) s;
-	return (TIFFNoDecode(tif, "strip"));
+    (void)pp;
+    (void)cc;
+    (void)s;
+    return (TIFFNoDecode(tif, "strip"));
 }
 
-int
-_TIFFNoTileDecode(TIFF* tif, uint8* pp, tmsize_t cc, uint16 s)
+int _TIFFNoTileDecode(TIFF *tif, uint8_t *pp, tmsize_t cc, uint16_t s)
 {
-	(void) pp; (void) cc; (void) s;
-	return (TIFFNoDecode(tif, "tile"));
+    (void)pp;
+    (void)cc;
+    (void)s;
+    return (TIFFNoDecode(tif, "tile"));
 }
 
-int
-_TIFFNoSeek(TIFF* tif, uint32 off)
+int _TIFFNoSeek(TIFF *tif, uint32_t off)
 {
-	(void) off;
-	TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-		     "Compression algorithm does not support random access");
-	return (0);
+    (void)off;
+    TIFFErrorExtR(tif, tif->tif_name,
+                  "Compression algorithm does not support random access");
+    return (0);
 }
 
-int
-_TIFFNoPreCode(TIFF* tif, uint16 s)
+int _TIFFNoPreCode(TIFF *tif, uint16_t s)
 {
-	(void) tif; (void) s;
-	return (1);
+    (void)tif;
+    (void)s;
+    return (1);
 }
 
-static int _TIFFtrue(TIFF* tif) { (void) tif; return (1); }
-static void _TIFFvoid(TIFF* tif) { (void) tif; }
-
-void
-_TIFFSetDefaultCompressionState(TIFF* tif)
+static int _TIFFtrue(TIFF *tif)
 {
-	tif->tif_fixuptags = _TIFFNoFixupTags; 
-	tif->tif_decodestatus = TRUE;
-	tif->tif_setupdecode = _TIFFtrue;
-	tif->tif_predecode = _TIFFNoPreCode;
-	tif->tif_decoderow = _TIFFNoRowDecode;  
-	tif->tif_decodestrip = _TIFFNoStripDecode;
-	tif->tif_decodetile = _TIFFNoTileDecode;  
-	tif->tif_encodestatus = TRUE;
-	tif->tif_setupencode = _TIFFtrue;
-	tif->tif_preencode = _TIFFNoPreCode;
-	tif->tif_postencode = _TIFFtrue;
-	tif->tif_encoderow = _TIFFNoRowEncode;
-	tif->tif_encodestrip = _TIFFNoStripEncode;  
-	tif->tif_encodetile = _TIFFNoTileEncode;  
-	tif->tif_close = _TIFFvoid;
-	tif->tif_seek = _TIFFNoSeek;
-	tif->tif_cleanup = _TIFFvoid;
-	tif->tif_defstripsize = _TIFFDefaultStripSize;
-	tif->tif_deftilesize = _TIFFDefaultTileSize;
-	tif->tif_flags &= ~(TIFF_NOBITREV|TIFF_NOREADRAW);
+    (void)tif;
+    return (1);
 }
+static void _TIFFvoid(TIFF *tif) { (void)tif; }
 
-int
-TIFFSetCompressionScheme(TIFF* tif, int scheme)
+void _TIFFSetDefaultCompressionState(TIFF *tif)
 {
-	const TIFFCodec *c = TIFFFindCODEC((uint16) scheme);
+    tif->tif_fixuptags = _TIFFNoFixupTags;
+    tif->tif_decodestatus = TRUE;
+    tif->tif_setupdecode = _TIFFtrue;
+    tif->tif_predecode = _TIFFNoPreCode;
+    tif->tif_decoderow = _TIFFNoRowDecode;
+    tif->tif_decodestrip = _TIFFNoStripDecode;
+    tif->tif_decodetile = _TIFFNoTileDecode;
+    tif->tif_encodestatus = TRUE;
+    tif->tif_setupencode = _TIFFtrue;
+    tif->tif_preencode = _TIFFNoPreCode;
+    tif->tif_postencode = _TIFFtrue;
+    tif->tif_encoderow = _TIFFNoRowEncode;
+    tif->tif_encodestrip = _TIFFNoStripEncode;
+    tif->tif_encodetile = _TIFFNoTileEncode;
+    tif->tif_close = _TIFFvoid;
+    tif->tif_seek = _TIFFNoSeek;
+    tif->tif_cleanup = _TIFFvoid;
+    tif->tif_defstripsize = _TIFFDefaultStripSize;
+    tif->tif_deftilesize = _TIFFDefaultTileSize;
+    tif->tif_flags &= ~(TIFF_NOBITREV | TIFF_NOREADRAW);
+}
 
-	_TIFFSetDefaultCompressionState(tif);
-	/*
-	 * Don't treat an unknown compression scheme as an error.
-	 * This permits applications to open files with data that
-	 * the library does not have builtin support for, but which
-	 * may still be meaningful.
-	 */
-	return (c ? (*c->init)(tif, scheme) : 1);
+int TIFFSetCompressionScheme(TIFF *tif, int scheme)
+{
+    const TIFFCodec *c = TIFFFindCODEC((uint16_t)scheme);
+
+    _TIFFSetDefaultCompressionState(tif);
+    /*
+     * Don't treat an unknown compression scheme as an error.
+     * This permits applications to open files with data that
+     * the library does not have builtin support for, but which
+     * may still be meaningful.
+     */
+    return (c ? (*c->init)(tif, scheme) : 1);
 }
 
 /*
@@ -175,64 +182,68 @@ TIFFSetCompressionScheme(TIFF* tif, int scheme)
  * schemes can also override the builtin versions provided
  * by this library.
  */
-typedef struct _codec {
-	struct _codec* next;
-	TIFFCodec* info;
+typedef struct _codec
+{
+    struct _codec *next;
+    TIFFCodec *info;
 } codec_t;
-static codec_t* registeredCODECS = NULL;
+static codec_t *registeredCODECS = NULL;
 
-const TIFFCodec*
-TIFFFindCODEC(uint16 scheme)
+const TIFFCodec *TIFFFindCODEC(uint16_t scheme)
 {
-	const TIFFCodec* c;
-	codec_t* cd;
-
-	for (cd = registeredCODECS; cd; cd = cd->next)
-		if (cd->info->scheme == scheme)
-			return ((const TIFFCodec*) cd->info);
-	for (c = _TIFFBuiltinCODECS; c->name; c++)
-		if (c->scheme == scheme)
-			return (c);
-	return ((const TIFFCodec*) 0);
+    const TIFFCodec *c;
+    codec_t *cd;
+
+    for (cd = registeredCODECS; cd; cd = cd->next)
+        if (cd->info->scheme == scheme)
+            return ((const TIFFCodec *)cd->info);
+    for (c = _TIFFBuiltinCODECS; c->name; c++)
+        if (c->scheme == scheme)
+            return (c);
+    return ((const TIFFCodec *)0);
 }
 
-TIFFCodec*
-TIFFRegisterCODEC(uint16 scheme, const char* name, TIFFInitMethod init)
+TIFFCodec *TIFFRegisterCODEC(uint16_t scheme, const char *name,
+                             TIFFInitMethod init)
 {
-	codec_t* cd = (codec_t*)
-	    _TIFFmalloc((tmsize_t)(sizeof (codec_t) + sizeof (TIFFCodec) + strlen(name)+1));
-
-	if (cd != NULL) {
-		cd->info = (TIFFCodec*) ((uint8*) cd + sizeof (codec_t));
-		cd->info->name = (char*)
-		    ((uint8*) cd->info + sizeof (TIFFCodec));
-		strcpy(cd->info->name, name);
-		cd->info->scheme = scheme;
-		cd->info->init = init;
-		cd->next = registeredCODECS;
-		registeredCODECS = cd;
-	} else {
-		TIFFErrorExt(0, "TIFFRegisterCODEC",
-		    "No space to register compression scheme %s", name);
-		return NULL;
-	}
-	return (cd->info);
+    codec_t *cd = (codec_t *)_TIFFmallocExt(
+        NULL,
+        (tmsize_t)(sizeof(codec_t) + sizeof(TIFFCodec) + strlen(name) + 1));
+
+    if (cd != NULL)
+    {
+        cd->info = (TIFFCodec *)((uint8_t *)cd + sizeof(codec_t));
+        cd->info->name = (char *)((uint8_t *)cd->info + sizeof(TIFFCodec));
+        strcpy(cd->info->name, name);
+        cd->info->scheme = scheme;
+        cd->info->init = init;
+        cd->next = registeredCODECS;
+        registeredCODECS = cd;
+    }
+    else
+    {
+        TIFFErrorExt(0, "TIFFRegisterCODEC",
+                     "No space to register compression scheme %s", name);
+        return NULL;
+    }
+    return (cd->info);
 }
 
-void
-TIFFUnRegisterCODEC(TIFFCodec* c)
+void TIFFUnRegisterCODEC(TIFFCodec *c)
 {
-	codec_t* cd;
-	codec_t** pcd;
-
-	for (pcd = &registeredCODECS; (cd = *pcd) != NULL; pcd = &cd->next)
-		if (cd->info == c) {
-			*pcd = cd->next;
-			_TIFFfree(cd);
-			return;
-		}
-	TIFFErrorExt(0, "TIFFUnRegisterCODEC",
-	    "Cannot remove compression scheme %s; not registered", c->name);
+    codec_t *cd;
+    codec_t **pcd;
+
+    for (pcd = &registeredCODECS; (cd = *pcd) != NULL; pcd = &cd->next)
+        if (cd->info == c)
+        {
+            *pcd = cd->next;
+            _TIFFfreeExt(NULL, cd);
+            return;
+        }
+    TIFFErrorExt(0, "TIFFUnRegisterCODEC",
+                 "Cannot remove compression scheme %s; not registered",
+                 c->name);
 }
 
 /************************************************************************/
@@ -242,61 +253,58 @@ TIFFUnRegisterCODEC(TIFFCodec* c)
 /**
  * Get list of configured codecs, both built-in and registered by user.
  * Caller is responsible to free this structure.
- * 
+ *
  * @return returns array of TIFFCodec records (the last record should be NULL)
  * or NULL if function failed.
  */
 
-TIFFCodec*
-TIFFGetConfiguredCODECs()
+TIFFCodec *TIFFGetConfiguredCODECs()
 {
-	int i = 1;
-	codec_t *cd;
-	const TIFFCodec* c;
-	TIFFCodec* codecs = NULL;
-	TIFFCodec* new_codecs;
-
-	for (cd = registeredCODECS; cd; cd = cd->next) {
-		new_codecs = (TIFFCodec *)
-			_TIFFrealloc(codecs, i * sizeof(TIFFCodec));
-		if (!new_codecs) {
-			_TIFFfree (codecs);
-			return NULL;
-		}
-		codecs = new_codecs;
-		_TIFFmemcpy(codecs + i - 1, cd->info, sizeof(TIFFCodec));
-		i++;
-	}
-	for (c = _TIFFBuiltinCODECS; c->name; c++) {
-		if (TIFFIsCODECConfigured(c->scheme)) {
-			new_codecs = (TIFFCodec *)
-				_TIFFrealloc(codecs, i * sizeof(TIFFCodec));
-			if (!new_codecs) {
-				_TIFFfree (codecs);
-				return NULL;
-			}
-			codecs = new_codecs;
-			_TIFFmemcpy(codecs + i - 1, (const void*)c, sizeof(TIFFCodec));
-			i++;
-		}
-	}
-
-	new_codecs = (TIFFCodec *) _TIFFrealloc(codecs, i * sizeof(TIFFCodec));
-	if (!new_codecs) {
-		_TIFFfree (codecs);
-		return NULL;
-	}
-	codecs = new_codecs;
-	_TIFFmemset(codecs + i - 1, 0, sizeof(TIFFCodec));
-
-	return codecs;
+    int i = 1;
+    codec_t *cd;
+    const TIFFCodec *c;
+    TIFFCodec *codecs = NULL;
+    TIFFCodec *new_codecs;
+
+    for (cd = registeredCODECS; cd; cd = cd->next)
+    {
+        new_codecs =
+            (TIFFCodec *)_TIFFreallocExt(NULL, codecs, i * sizeof(TIFFCodec));
+        if (!new_codecs)
+        {
+            _TIFFfreeExt(NULL, codecs);
+            return NULL;
+        }
+        codecs = new_codecs;
+        _TIFFmemcpy(codecs + i - 1, cd->info, sizeof(TIFFCodec));
+        i++;
+    }
+    for (c = _TIFFBuiltinCODECS; c->name; c++)
+    {
+        if (TIFFIsCODECConfigured(c->scheme))
+        {
+            new_codecs = (TIFFCodec *)_TIFFreallocExt(NULL, codecs,
+                                                      i * sizeof(TIFFCodec));
+            if (!new_codecs)
+            {
+                _TIFFfreeExt(NULL, codecs);
+                return NULL;
+            }
+            codecs = new_codecs;
+            _TIFFmemcpy(codecs + i - 1, (const void *)c, sizeof(TIFFCodec));
+            i++;
+        }
+    }
+
+    new_codecs =
+        (TIFFCodec *)_TIFFreallocExt(NULL, codecs, i * sizeof(TIFFCodec));
+    if (!new_codecs)
+    {
+        _TIFFfreeExt(NULL, codecs);
+        return NULL;
+    }
+    codecs = new_codecs;
+    _TIFFmemset(codecs + i - 1, 0, sizeof(TIFFCodec));
+
+    return codecs;
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_config.h.cmake.in b/3rdparty/libtiff/tif_config.h.cmake.in
index 241446033817..62a4c7305a8e 100644
--- a/3rdparty/libtiff/tif_config.h.cmake.in
+++ b/3rdparty/libtiff/tif_config.h.cmake.in
@@ -1,6 +1,14 @@
+/* clang-format off */
+/* clang-format disabled because CMake scripts are very sensitive to the
+ * formatting of this file. configure_file variables of type "@VAR@" are
+ * modified by clang-format and won't be substituted.
+ */
+
 /* libtiff/tif_config.h.cmake.in.  Not generated, but originated from autoheader.  */
 /* This file must be kept up-to-date with needed substitutions from libtiff/tif_config.h.in. */
 
+#include "tiffconf.h"
+
 /* Support CCITT Group 3 & 4 algorithms */
 #cmakedefine CCITT_SUPPORT 1
 
@@ -20,84 +28,33 @@
 /* Define to 1 if you have the <assert.h> header file. */
 #cmakedefine HAVE_ASSERT_H 1
 
-/* Define to 1 if you have the <dlfcn.h> header file. */
-#cmakedefine HAVE_DLFCN_H 1
+/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't. */
+#cmakedefine HAVE_DECL_OPTARG 1
 
 /* Define to 1 if you have the <fcntl.h> header file. */
 #cmakedefine HAVE_FCNTL_H 1
 
+/* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
+#cmakedefine HAVE_FSEEKO 1
+
 /* Define to 1 if you have the `getopt' function. */
 #cmakedefine HAVE_GETOPT 1
 
-/* Define to 1 if you have the <GLUT/glut.h> header file. */
-#cmakedefine HAVE_GLUT_GLUT_H 1
-
-/* Define to 1 if you have the <GL/glut.h> header file. */
-#cmakedefine HAVE_GL_GLUT_H 1
-
-/* Define to 1 if you have the <GL/glu.h> header file. */
-#cmakedefine HAVE_GL_GLU_H 1
-
-/* Define to 1 if you have the <GL/gl.h> header file. */
-#cmakedefine HAVE_GL_GL_H 1
-
-/* Define to 1 if you have the <inttypes.h> header file. */
-#cmakedefine HAVE_INTTYPES_H 1
-
 /* Define to 1 if you have the <io.h> header file. */
 #cmakedefine HAVE_IO_H 1
 
 /* Define to 1 if you have the `jbg_newlen' function. */
 #cmakedefine HAVE_JBG_NEWLEN 1
 
-/* Define to 1 if you have the `lfind' function. */
-#cmakedefine HAVE_LFIND 1
-
 /* Define to 1 if you have the `mmap' function. */
 #cmakedefine HAVE_MMAP 1
 
-/* Define to 1 if you have the <OpenGL/glu.h> header file. */
-#cmakedefine HAVE_OPENGL_GLU_H 1
-
-/* Define to 1 if you have the <OpenGL/gl.h> header file. */
-#cmakedefine HAVE_OPENGL_GL_H 1
-
-/* Define to 1 if you have the <search.h> header file. */
-#cmakedefine HAVE_SEARCH_H 1
-
 /* Define to 1 if you have the `setmode' function. */
 #cmakedefine HAVE_SETMODE 1
 
-/* Define to 1 if you have the `snprintf' function. */
-#cmakedefine HAVE_SNPRINTF 1
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#cmakedefine HAVE_STDINT_H 1
-
-/* Define to 1 if you have the `strcasecmp' function. */
-#cmakedefine HAVE_STRCASECMP 1
-
 /* Define to 1 if you have the <strings.h> header file. */
 #cmakedefine HAVE_STRINGS_H 1
 
-/* Define to 1 if you have the <string.h> header file. */
-#cmakedefine HAVE_STRING_H 1
-
-/* Define to 1 if you have the `strtol' function. */
-#cmakedefine HAVE_STRTOL 1
-
-/* Define to 1 if you have the `strtoll' function. */
-#cmakedefine HAVE_STRTOLL 1
-
-/* Define to 1 if you have the `strtoul' function. */
-#cmakedefine HAVE_STRTOUL 1
-
-/* Define to 1 if you have the `strtoull' function. */
-#cmakedefine HAVE_STRTOULL 1
-
-/* Define to 1 if you have the <sys/time.h> header file. */
-#cmakedefine HAVE_SYS_TIME_H 1
-
 /* Define to 1 if you have the <sys/types.h> header file. */
 #cmakedefine HAVE_SYS_TYPES_H 1
 
@@ -105,20 +62,17 @@
 #cmakedefine HAVE_UNISTD_H 1
 
 /* 8/12 bit libjpeg dual mode enabled */
-#cmakedefine JPEG_DUAL_MODE_8_12 1
+#cmakedefine JPEG_DUAL_MODE_8_12 1 1
+
+/* Support LERC compression */
+#cmakedefine LERC_SUPPORT 1
 
 /* 12bit libjpeg primary include file with path */
-#define LIBJPEG_12_PATH @LIBJPEG_12_PATH@
+#define LIBJPEG_12_PATH "@LIBJPEG_12_PATH@"
 
 /* Support LZMA2 compression */
 #cmakedefine LZMA_SUPPORT 1
 
-/* Support ZSTD compression */
-#cmakedefine ZSTD_SUPPORT 1
-
-/* Support WEBP compression */
-#cmakedefine WEBP_SUPPORT 1
-
 /* Name of package */
 #define PACKAGE "@PACKAGE_NAME@"
 
@@ -128,80 +82,30 @@
 /* Define to the full name of this package. */
 #define PACKAGE_NAME "@PACKAGE_NAME@"
 
-/* Define to the full name and version of this package. */
-#define PACKAGE_STRING "@PACKAGE_STRING@"
-
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "@PACKAGE_TARNAME@"
 
 /* Define to the home page for this package. */
 #define PACKAGE_URL "@PACKAGE_URL@"
 
-/* Define to the version of this package. */
-#define PACKAGE_VERSION "@PACKAGE_VERSION@"
-
-/* The size of `signed int', as computed by sizeof. */
-#define SIZEOF_SIGNED_INT @SIZEOF_SIGNED_INT@
-
-/* The size of `signed long', as computed by sizeof. */
-#define SIZEOF_SIGNED_LONG @SIZEOF_SIGNED_LONG@
-
-/* The size of `signed long long', as computed by sizeof. */
-#define SIZEOF_SIGNED_LONG_LONG @SIZEOF_SIGNED_LONG_LONG@
-
-/* The size of `unsigned char *', as computed by sizeof. */
-#define SIZEOF_UNSIGNED_CHAR_P @SIZEOF_UNSIGNED_CHAR_P@
-
-/* The size of `unsigned int', as computed by sizeof. */
-#define SIZEOF_UNSIGNED_INT @SIZEOF_UNSIGNED_INT@
-
-/* The size of `unsigned long', as computed by sizeof. */
-#define SIZEOF_UNSIGNED_LONG @SIZEOF_UNSIGNED_LONG@
-
-/* The size of `unsigned long long', as computed by sizeof. */
-#define SIZEOF_UNSIGNED_LONG_LONG @SIZEOF_UNSIGNED_LONG_LONG@
-
-/* The size of `unsigned short', as computed by sizeof. */
-#define SIZEOF_UNSIGNED_SHORT @SIZEOF_UNSIGNED_SHORT@
+/* Size of size_t */
+#define SIZEOF_SIZE_T @SIZEOF_SIZE_T@
 
 /* Default size of the strip in bytes (when strip chopping enabled) */
-#define STRIP_SIZE_DEFAULT @STRIP_SIZE_DEFAULT@
-
-/* Signed 32-bit type formatter */
-#define TIFF_INT32_FORMAT "@TIFF_INT32_FORMAT@"
-
-/* Signed 64-bit type formatter */
-#define TIFF_INT64_FORMAT "@TIFF_INT64_FORMAT@"
-
-/* Pointer difference type formatter */
-#define TIFF_PTRDIFF_FORMAT "@TIFF_PTRDIFF_FORMAT@"
-
-/* Unsigned size type formatter */
-#define TIFF_SIZE_FORMAT "@TIFF_SIZE_FORMAT@"
-
-/* Signed size type formatter */
-#define TIFF_SSIZE_FORMAT "@TIFF_SSIZE_FORMAT@"
+#cmakedefine STRIP_SIZE_DEFAULT @STRIP_SIZE_DEFAULT@
 
-/* Unsigned 32-bit type formatter */
-#define TIFF_UINT32_FORMAT "@TIFF_UINT32_FORMAT@"
-
-/* Unsigned 64-bit type formatter */
-#define TIFF_UINT64_FORMAT "@TIFF_UINT64_FORMAT@"
-
-/* Unsigned 8-bit type */
-#define TIFF_UINT8_T @TIFF_UINT8_T@
-
-/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
-#undef TIME_WITH_SYS_TIME
-
-/* Define to 1 if your <sys/time.h> declares `struct tm'. */
-#cmakedefine TM_IN_SYS_TIME 1
+/** Maximum number of TIFF IFDs that libtiff can iterate through in a file. */
+#define TIFF_MAX_DIR_COUNT @TIFF_MAX_DIR_COUNT@
 
 /* define to use win32 IO system */
 #cmakedefine USE_WIN32_FILEIO 1
 
-/* Version number of package */
-#define VERSION "@PACKAGE_VERSION@"
+/* Support WEBP compression */
+#cmakedefine WEBP_SUPPORT 1
+
+/* Support ZSTD compression */
+#cmakedefine ZSTD_SUPPORT 1
+
 
 /* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
    significant byte first (like Motorola and SPARC, unlike Intel). */
@@ -215,17 +119,21 @@
 # endif
 #endif
 
-/* Number of bits in a file offset, on hosts where this is settable. */
-#define _FILE_OFFSET_BITS @FILE_OFFSET_BITS@
-
-/* Define to `__inline__' or `__inline' if that's what the C compiler
-   calls it, or to nothing if 'inline' is not supported under any name.  */
-#ifndef __cplusplus
-#define inline @INLINE_KEYWORD@
+#if !defined(__MINGW32__)
+#  define TIFF_SIZE_FORMAT "zu"
+#endif
+#if SIZEOF_SIZE_T == 8
+#  define TIFF_SSIZE_FORMAT PRId64
+#  if defined(__MINGW32__)
+#    define TIFF_SIZE_FORMAT PRIu64
+#  endif
+#elif SIZEOF_SIZE_T == 4
+#  define TIFF_SSIZE_FORMAT PRId32
+#  if defined(__MINGW32__)
+#    define TIFF_SIZE_FORMAT PRIu32
+#  endif
+#else
+#  error "Unsupported size_t size; please submit a bug report"
 #endif
 
-/* Define to `long int' if <sys/types.h> does not define. */
-#undef off_t
-
-/* Define to `unsigned int' if <sys/types.h> does not define. */
-#undef size_t
+/* clang-format on */
diff --git a/3rdparty/libtiff/tif_dir.c b/3rdparty/libtiff/tif_dir.c
index 347b7115cb7e..85006218379d 100644
--- a/3rdparty/libtiff/tif_dir.c
+++ b/3rdparty/libtiff/tif_dir.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -29,759 +29,1077 @@
  * (and also some miscellaneous stuff)
  */
 #include "tiffiop.h"
-#include <float.h>	/*--: for Rational2Double */
+#include <float.h> /*--: for Rational2Double */
+#include <limits.h>
 
 /*
  * These are used in the backwards compatibility code...
  */
-#define DATATYPE_VOID		0       /* !untyped data */
-#define DATATYPE_INT		1       /* !signed integer data */
-#define DATATYPE_UINT		2       /* !unsigned integer data */
-#define DATATYPE_IEEEFP		3       /* !IEEE floating point data */
+#define DATATYPE_VOID 0   /* !untyped data */
+#define DATATYPE_INT 1    /* !signed integer data */
+#define DATATYPE_UINT 2   /* !unsigned integer data */
+#define DATATYPE_IEEEFP 3 /* !IEEE floating point data */
+
+static void setByteArray(TIFF *tif, void **vpp, const void *vp, size_t nmemb,
+                         size_t elem_size)
+{
+    if (*vpp)
+    {
+        _TIFFfreeExt(tif, *vpp);
+        *vpp = 0;
+    }
+    if (vp)
+    {
+        tmsize_t bytes = _TIFFMultiplySSize(NULL, nmemb, elem_size, NULL);
+        if (bytes)
+            *vpp = (void *)_TIFFmallocExt(tif, bytes);
+        if (*vpp)
+            _TIFFmemcpy(*vpp, vp, bytes);
+    }
+}
+void _TIFFsetByteArray(void **vpp, const void *vp, uint32_t n)
+{
+    setByteArray(NULL, vpp, vp, n, 1);
+}
+void _TIFFsetByteArrayExt(TIFF *tif, void **vpp, const void *vp, uint32_t n)
+{
+    setByteArray(tif, vpp, vp, n, 1);
+}
+
+static void _TIFFsetNString(TIFF *tif, char **cpp, const char *cp, uint32_t n)
+{
+    setByteArray(tif, (void **)cpp, cp, n, 1);
+}
 
-static void
-setByteArray(void** vpp, void* vp, size_t nmemb, size_t elem_size)
+void _TIFFsetShortArray(uint16_t **wpp, const uint16_t *wp, uint32_t n)
 {
-	if (*vpp) {
-		_TIFFfree(*vpp);
-		*vpp = 0;
-	}
-	if (vp) {
-		tmsize_t bytes = _TIFFMultiplySSize(NULL, nmemb, elem_size, NULL);
-		if (bytes)
-			*vpp = (void*) _TIFFmalloc(bytes);
-		if (*vpp)
-			_TIFFmemcpy(*vpp, vp, bytes);
-	}
+    setByteArray(NULL, (void **)wpp, wp, n, sizeof(uint16_t));
 }
-void _TIFFsetByteArray(void** vpp, void* vp, uint32 n)
-    { setByteArray(vpp, vp, n, 1); }
-void _TIFFsetString(char** cpp, char* cp)
-    { setByteArray((void**) cpp, (void*) cp, strlen(cp)+1, 1); }
-static void _TIFFsetNString(char** cpp, char* cp, uint32 n)
-    { setByteArray((void**) cpp, (void*) cp, n, 1); }
-void _TIFFsetShortArray(uint16** wpp, uint16* wp, uint32 n)
-    { setByteArray((void**) wpp, (void*) wp, n, sizeof (uint16)); }
-void _TIFFsetLongArray(uint32** lpp, uint32* lp, uint32 n)
-    { setByteArray((void**) lpp, (void*) lp, n, sizeof (uint32)); }
-static void _TIFFsetLong8Array(uint64** lpp, uint64* lp, uint32 n)
-    { setByteArray((void**) lpp, (void*) lp, n, sizeof (uint64)); }
-void _TIFFsetFloatArray(float** fpp, float* fp, uint32 n)
-    { setByteArray((void**) fpp, (void*) fp, n, sizeof (float)); }
-void _TIFFsetDoubleArray(double** dpp, double* dp, uint32 n)
-    { setByteArray((void**) dpp, (void*) dp, n, sizeof (double)); }
-
-static void
-setDoubleArrayOneValue(double** vpp, double value, size_t nmemb)
+void _TIFFsetShortArrayExt(TIFF *tif, uint16_t **wpp, const uint16_t *wp,
+                           uint32_t n)
 {
-	if (*vpp)
-		_TIFFfree(*vpp);
-	*vpp = _TIFFmalloc(nmemb*sizeof(double));
-	if (*vpp)
-	{
-		while (nmemb--)
-			((double*)*vpp)[nmemb] = value;
-	}
+    setByteArray(tif, (void **)wpp, wp, n, sizeof(uint16_t));
+}
+
+void _TIFFsetLongArray(uint32_t **lpp, const uint32_t *lp, uint32_t n)
+{
+    setByteArray(NULL, (void **)lpp, lp, n, sizeof(uint32_t));
+}
+void _TIFFsetLongArrayExt(TIFF *tif, uint32_t **lpp, const uint32_t *lp,
+                          uint32_t n)
+{
+    setByteArray(tif, (void **)lpp, lp, n, sizeof(uint32_t));
+}
+
+static void _TIFFsetLong8Array(TIFF *tif, uint64_t **lpp, const uint64_t *lp,
+                               uint32_t n)
+{
+    setByteArray(tif, (void **)lpp, lp, n, sizeof(uint64_t));
+}
+
+void _TIFFsetFloatArray(float **fpp, const float *fp, uint32_t n)
+{
+    setByteArray(NULL, (void **)fpp, fp, n, sizeof(float));
+}
+void _TIFFsetFloatArrayExt(TIFF *tif, float **fpp, const float *fp, uint32_t n)
+{
+    setByteArray(tif, (void **)fpp, fp, n, sizeof(float));
+}
+
+void _TIFFsetDoubleArray(double **dpp, const double *dp, uint32_t n)
+{
+    setByteArray(NULL, (void **)dpp, dp, n, sizeof(double));
+}
+void _TIFFsetDoubleArrayExt(TIFF *tif, double **dpp, const double *dp,
+                            uint32_t n)
+{
+    setByteArray(tif, (void **)dpp, dp, n, sizeof(double));
+}
+
+static void setDoubleArrayOneValue(TIFF *tif, double **vpp, double value,
+                                   size_t nmemb)
+{
+    if (*vpp)
+        _TIFFfreeExt(tif, *vpp);
+    *vpp = _TIFFmallocExt(tif, nmemb * sizeof(double));
+    if (*vpp)
+    {
+        while (nmemb--)
+            ((double *)*vpp)[nmemb] = value;
+    }
 }
 
 /*
  * Install extra samples information.
  */
-static int
-setExtraSamples(TIFF* tif, va_list ap, uint32* v)
+static int setExtraSamples(TIFF *tif, va_list ap, uint32_t *v)
 {
 /* XXX: Unassociated alpha data == 999 is a known Corel Draw bug, see below */
-#define EXTRASAMPLE_COREL_UNASSALPHA 999 
-
-	uint16* va;
-	uint32 i;
-        TIFFDirectory* td = &tif->tif_dir;
-        static const char module[] = "setExtraSamples";
-
-	*v = (uint16) va_arg(ap, uint16_vap);
-	if ((uint16) *v > td->td_samplesperpixel)
-		return 0;
-	va = va_arg(ap, uint16*);
-	if (*v > 0 && va == NULL)		/* typically missing param */
-		return 0;
-	for (i = 0; i < *v; i++) {
-		if (va[i] > EXTRASAMPLE_UNASSALPHA) {
-			/*
-			 * XXX: Corel Draw is known to produce incorrect
-			 * ExtraSamples tags which must be patched here if we
-			 * want to be able to open some of the damaged TIFF
-			 * files: 
-			 */
-			if (va[i] == EXTRASAMPLE_COREL_UNASSALPHA)
-				va[i] = EXTRASAMPLE_UNASSALPHA;
-			else
-				return 0;
-		}
-	}
-
-        if ( td->td_transferfunction[0] != NULL && (td->td_samplesperpixel - *v > 1) &&
-                !(td->td_samplesperpixel - td->td_extrasamples > 1))
+#define EXTRASAMPLE_COREL_UNASSALPHA 999
+
+    uint16_t *va;
+    uint32_t i;
+    TIFFDirectory *td = &tif->tif_dir;
+    static const char module[] = "setExtraSamples";
+
+    *v = (uint16_t)va_arg(ap, uint16_vap);
+    if ((uint16_t)*v > td->td_samplesperpixel)
+        return 0;
+    va = va_arg(ap, uint16_t *);
+    if (*v > 0 && va == NULL) /* typically missing param */
+        return 0;
+    for (i = 0; i < *v; i++)
+    {
+        if (va[i] > EXTRASAMPLE_UNASSALPHA)
         {
-                TIFFWarningExt(tif->tif_clientdata,module,
-                    "ExtraSamples tag value is changing, "
-                    "but TransferFunction was read with a different value. Canceling it");
-                TIFFClrFieldBit(tif,FIELD_TRANSFERFUNCTION);
-                _TIFFfree(td->td_transferfunction[0]);
-                td->td_transferfunction[0] = NULL;
+            /*
+             * XXX: Corel Draw is known to produce incorrect
+             * ExtraSamples tags which must be patched here if we
+             * want to be able to open some of the damaged TIFF
+             * files:
+             */
+            if (va[i] == EXTRASAMPLE_COREL_UNASSALPHA)
+                va[i] = EXTRASAMPLE_UNASSALPHA;
+            else
+                return 0;
         }
+    }
+
+    if (td->td_transferfunction[0] != NULL &&
+        (td->td_samplesperpixel - *v > 1) &&
+        !(td->td_samplesperpixel - td->td_extrasamples > 1))
+    {
+        TIFFWarningExtR(tif, module,
+                        "ExtraSamples tag value is changing, "
+                        "but TransferFunction was read with a different value. "
+                        "Canceling it");
+        TIFFClrFieldBit(tif, FIELD_TRANSFERFUNCTION);
+        _TIFFfreeExt(tif, td->td_transferfunction[0]);
+        td->td_transferfunction[0] = NULL;
+    }
 
-	td->td_extrasamples = (uint16) *v;
-	_TIFFsetShortArray(&td->td_sampleinfo, va, td->td_extrasamples);
-	return 1;
+    td->td_extrasamples = (uint16_t)*v;
+    _TIFFsetShortArrayExt(tif, &td->td_sampleinfo, va, td->td_extrasamples);
+    return 1;
 
 #undef EXTRASAMPLE_COREL_UNASSALPHA
 }
 
 /*
- * Confirm we have "samplesperpixel" ink names separated by \0.  Returns 
+ * Count ink names separated by \0.  Returns
  * zero if the ink names are not as expected.
  */
-static uint32
-checkInkNamesString(TIFF* tif, uint32 slen, const char* s)
+static uint16_t countInkNamesString(TIFF *tif, uint32_t slen, const char *s)
 {
-	TIFFDirectory* td = &tif->tif_dir;
-	uint16 i = td->td_samplesperpixel;
-
-	if (slen > 0) {
-		const char* ep = s+slen;
-		const char* cp = s;
-		for (; i > 0; i--) {
-			for (; cp < ep && *cp != '\0'; cp++) {}
-			if (cp >= ep)
-				goto bad;
-			cp++;				/* skip \0 */
-		}
-		return ((uint32)(cp-s));
-	}
+    uint16_t i = 0;
+
+    if (slen > 0)
+    {
+        const char *ep = s + slen;
+        const char *cp = s;
+        do
+        {
+            for (; cp < ep && *cp != '\0'; cp++)
+            {
+            }
+            if (cp >= ep)
+                goto bad;
+            cp++; /* skip \0 */
+            i++;
+        } while (cp < ep);
+        return (i);
+    }
 bad:
-	TIFFErrorExt(tif->tif_clientdata, "TIFFSetField",
-	    "%s: Invalid InkNames value; expecting %d names, found %d",
-	    tif->tif_name,
-	    td->td_samplesperpixel,
-	    td->td_samplesperpixel-i);
-	return (0);
+    TIFFErrorExtR(tif, "TIFFSetField",
+                  "%s: Invalid InkNames value; no NUL at given buffer end "
+                  "location %" PRIu32 ", after %" PRIu16 " ink",
+                  tif->tif_name, slen, i);
+    return (0);
 }
 
-static int
-_TIFFVSetField(TIFF* tif, uint32 tag, va_list ap)
+static int _TIFFVSetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	static const char module[] = "_TIFFVSetField";
+    static const char module[] = "_TIFFVSetField";
 
-	TIFFDirectory* td = &tif->tif_dir;
-	int status = 1;
-	uint32 v32, i, v;
+    TIFFDirectory *td = &tif->tif_dir;
+    int status = 1;
+    uint32_t v32, v;
     double dblval;
-	char* s;
-	const TIFFField *fip = TIFFFindField(tif, tag, TIFF_ANY);
-	uint32 standard_tag = tag;
-	if( fip == NULL ) /* cannot happen since OkToChangeTag() already checks it */
-	    return 0;
-	/*
-	 * We want to force the custom code to be used for custom
-	 * fields even if the tag happens to match a well known 
-	 * one - important for reinterpreted handling of standard
-	 * tag values in custom directories (i.e. EXIF) 
-	 */
-	if (fip->field_bit == FIELD_CUSTOM) {
-		standard_tag = 0;
-	}
-
-	switch (standard_tag) {
-	case TIFFTAG_SUBFILETYPE:
-		td->td_subfiletype = (uint32) va_arg(ap, uint32);
-		break;
-	case TIFFTAG_IMAGEWIDTH:
-		td->td_imagewidth = (uint32) va_arg(ap, uint32);
-		break;
-	case TIFFTAG_IMAGELENGTH:
-		td->td_imagelength = (uint32) va_arg(ap, uint32);
-		break;
-	case TIFFTAG_BITSPERSAMPLE:
-		td->td_bitspersample = (uint16) va_arg(ap, uint16_vap);
-		/*
-		 * If the data require post-decoding processing to byte-swap
-		 * samples, set it up here.  Note that since tags are required
-		 * to be ordered, compression code can override this behavior
-		 * in the setup method if it wants to roll the post decoding
-		 * work in with its normal work.
-		 */
-		if (tif->tif_flags & TIFF_SWAB) {
-			if (td->td_bitspersample == 8)
-				tif->tif_postdecode = _TIFFNoPostDecode;
-			else if (td->td_bitspersample == 16)
-				tif->tif_postdecode = _TIFFSwab16BitData;
-			else if (td->td_bitspersample == 24)
-				tif->tif_postdecode = _TIFFSwab24BitData;
-			else if (td->td_bitspersample == 32)
-				tif->tif_postdecode = _TIFFSwab32BitData;
-			else if (td->td_bitspersample == 64)
-				tif->tif_postdecode = _TIFFSwab64BitData;
-			else if (td->td_bitspersample == 128) /* two 64's */
-				tif->tif_postdecode = _TIFFSwab64BitData;
-		}
-		break;
-	case TIFFTAG_COMPRESSION:
-		v = (uint16) va_arg(ap, uint16_vap);
-		/*
-		 * If we're changing the compression scheme, the notify the
-		 * previous module so that it can cleanup any state it's
-		 * setup.
-		 */
-		if (TIFFFieldSet(tif, FIELD_COMPRESSION)) {
-			if ((uint32)td->td_compression == v)
-				break;
-			(*tif->tif_cleanup)(tif);
-			tif->tif_flags &= ~TIFF_CODERSETUP;
-		}
-		/*
-		 * Setup new compression routine state.
-		 */
-		if( (status = TIFFSetCompressionScheme(tif, v)) != 0 )
-		    td->td_compression = (uint16) v;
-		else
-		    status = 0;
-		break;
-	case TIFFTAG_PHOTOMETRIC:
-		td->td_photometric = (uint16) va_arg(ap, uint16_vap);
-		break;
-	case TIFFTAG_THRESHHOLDING:
-		td->td_threshholding = (uint16) va_arg(ap, uint16_vap);
-		break;
-	case TIFFTAG_FILLORDER:
-		v = (uint16) va_arg(ap, uint16_vap);
-		if (v != FILLORDER_LSB2MSB && v != FILLORDER_MSB2LSB)
-			goto badvalue;
-		td->td_fillorder = (uint16) v;
-		break;
-	case TIFFTAG_ORIENTATION:
-		v = (uint16) va_arg(ap, uint16_vap);
-		if (v < ORIENTATION_TOPLEFT || ORIENTATION_LEFTBOT < v)
-			goto badvalue;
-		else
-			td->td_orientation = (uint16) v;
-		break;
-	case TIFFTAG_SAMPLESPERPIXEL:
-		v = (uint16) va_arg(ap, uint16_vap);
-		if (v == 0)
-			goto badvalue;
-        if( v != td->td_samplesperpixel )
-        {
-            /* See http://bugzilla.maptools.org/show_bug.cgi?id=2500 */
-            if( td->td_sminsamplevalue != NULL )
+    char *s;
+    const TIFFField *fip = TIFFFindField(tif, tag, TIFF_ANY);
+    uint32_t standard_tag = tag;
+    if (fip == NULL) /* cannot happen since OkToChangeTag() already checks it */
+        return 0;
+    /*
+     * We want to force the custom code to be used for custom
+     * fields even if the tag happens to match a well known
+     * one - important for reinterpreted handling of standard
+     * tag values in custom directories (i.e. EXIF)
+     */
+    if (fip->field_bit == FIELD_CUSTOM)
+    {
+        standard_tag = 0;
+    }
+
+    switch (standard_tag)
+    {
+        case TIFFTAG_SUBFILETYPE:
+            td->td_subfiletype = (uint32_t)va_arg(ap, uint32_t);
+            break;
+        case TIFFTAG_IMAGEWIDTH:
+            td->td_imagewidth = (uint32_t)va_arg(ap, uint32_t);
+            break;
+        case TIFFTAG_IMAGELENGTH:
+            td->td_imagelength = (uint32_t)va_arg(ap, uint32_t);
+            break;
+        case TIFFTAG_BITSPERSAMPLE:
+            td->td_bitspersample = (uint16_t)va_arg(ap, uint16_vap);
+            /*
+             * If the data require post-decoding processing to byte-swap
+             * samples, set it up here.  Note that since tags are required
+             * to be ordered, compression code can override this behavior
+             * in the setup method if it wants to roll the post decoding
+             * work in with its normal work.
+             */
+            if (tif->tif_flags & TIFF_SWAB)
             {
-                TIFFWarningExt(tif->tif_clientdata,module,
-                    "SamplesPerPixel tag value is changing, "
-                    "but SMinSampleValue tag was read with a different value. Canceling it");
-                TIFFClrFieldBit(tif,FIELD_SMINSAMPLEVALUE);
-                _TIFFfree(td->td_sminsamplevalue);
-                td->td_sminsamplevalue = NULL;
+                if (td->td_bitspersample == 8)
+                    tif->tif_postdecode = _TIFFNoPostDecode;
+                else if (td->td_bitspersample == 16)
+                    tif->tif_postdecode = _TIFFSwab16BitData;
+                else if (td->td_bitspersample == 24)
+                    tif->tif_postdecode = _TIFFSwab24BitData;
+                else if (td->td_bitspersample == 32)
+                    tif->tif_postdecode = _TIFFSwab32BitData;
+                else if (td->td_bitspersample == 64)
+                    tif->tif_postdecode = _TIFFSwab64BitData;
+                else if (td->td_bitspersample == 128) /* two 64's */
+                    tif->tif_postdecode = _TIFFSwab64BitData;
             }
-            if( td->td_smaxsamplevalue != NULL )
+            break;
+        case TIFFTAG_COMPRESSION:
+            v = (uint16_t)va_arg(ap, uint16_vap);
+            /*
+             * If we're changing the compression scheme, notify the
+             * previous module so that it can cleanup any state it's
+             * setup.
+             */
+            if (TIFFFieldSet(tif, FIELD_COMPRESSION))
             {
-                TIFFWarningExt(tif->tif_clientdata,module,
-                    "SamplesPerPixel tag value is changing, "
-                    "but SMaxSampleValue tag was read with a different value. Canceling it");
-                TIFFClrFieldBit(tif,FIELD_SMAXSAMPLEVALUE);
-                _TIFFfree(td->td_smaxsamplevalue);
-                td->td_smaxsamplevalue = NULL;
+                if ((uint32_t)td->td_compression == v)
+                    break;
+                (*tif->tif_cleanup)(tif);
+                tif->tif_flags &= ~TIFF_CODERSETUP;
             }
-            /* Test if 3 transfer functions instead of just one are now needed
-               See http://bugzilla.maptools.org/show_bug.cgi?id=2820 */
-            if( td->td_transferfunction[0] != NULL && (v - td->td_extrasamples > 1) &&
-                !(td->td_samplesperpixel - td->td_extrasamples > 1))
+            /*
+             * Setup new compression routine state.
+             */
+            if ((status = TIFFSetCompressionScheme(tif, v)) != 0)
+                td->td_compression = (uint16_t)v;
+            else
+                status = 0;
+            break;
+        case TIFFTAG_PHOTOMETRIC:
+            td->td_photometric = (uint16_t)va_arg(ap, uint16_vap);
+            break;
+        case TIFFTAG_THRESHHOLDING:
+            td->td_threshholding = (uint16_t)va_arg(ap, uint16_vap);
+            break;
+        case TIFFTAG_FILLORDER:
+            v = (uint16_t)va_arg(ap, uint16_vap);
+            if (v != FILLORDER_LSB2MSB && v != FILLORDER_MSB2LSB)
+                goto badvalue;
+            td->td_fillorder = (uint16_t)v;
+            break;
+        case TIFFTAG_ORIENTATION:
+            v = (uint16_t)va_arg(ap, uint16_vap);
+            if (v < ORIENTATION_TOPLEFT || ORIENTATION_LEFTBOT < v)
+                goto badvalue;
+            else
+                td->td_orientation = (uint16_t)v;
+            break;
+        case TIFFTAG_SAMPLESPERPIXEL:
+            v = (uint16_t)va_arg(ap, uint16_vap);
+            if (v == 0)
+                goto badvalue;
+            if (v != td->td_samplesperpixel)
             {
-                    TIFFWarningExt(tif->tif_clientdata,module,
-                        "SamplesPerPixel tag value is changing, "
-                        "but TransferFunction was read with a different value. Canceling it");
-                    TIFFClrFieldBit(tif,FIELD_TRANSFERFUNCTION);
-                    _TIFFfree(td->td_transferfunction[0]);
+                /* See http://bugzilla.maptools.org/show_bug.cgi?id=2500 */
+                if (td->td_sminsamplevalue != NULL)
+                {
+                    TIFFWarningExtR(tif, module,
+                                    "SamplesPerPixel tag value is changing, "
+                                    "but SMinSampleValue tag was read with a "
+                                    "different value. Canceling it");
+                    TIFFClrFieldBit(tif, FIELD_SMINSAMPLEVALUE);
+                    _TIFFfreeExt(tif, td->td_sminsamplevalue);
+                    td->td_sminsamplevalue = NULL;
+                }
+                if (td->td_smaxsamplevalue != NULL)
+                {
+                    TIFFWarningExtR(tif, module,
+                                    "SamplesPerPixel tag value is changing, "
+                                    "but SMaxSampleValue tag was read with a "
+                                    "different value. Canceling it");
+                    TIFFClrFieldBit(tif, FIELD_SMAXSAMPLEVALUE);
+                    _TIFFfreeExt(tif, td->td_smaxsamplevalue);
+                    td->td_smaxsamplevalue = NULL;
+                }
+                /* Test if 3 transfer functions instead of just one are now
+                   needed See http://bugzilla.maptools.org/show_bug.cgi?id=2820
+                 */
+                if (td->td_transferfunction[0] != NULL &&
+                    (v - td->td_extrasamples > 1) &&
+                    !(td->td_samplesperpixel - td->td_extrasamples > 1))
+                {
+                    TIFFWarningExtR(tif, module,
+                                    "SamplesPerPixel tag value is changing, "
+                                    "but TransferFunction was read with a "
+                                    "different value. Canceling it");
+                    TIFFClrFieldBit(tif, FIELD_TRANSFERFUNCTION);
+                    _TIFFfreeExt(tif, td->td_transferfunction[0]);
                     td->td_transferfunction[0] = NULL;
+                }
+            }
+            td->td_samplesperpixel = (uint16_t)v;
+            break;
+        case TIFFTAG_ROWSPERSTRIP:
+            v32 = (uint32_t)va_arg(ap, uint32_t);
+            if (v32 == 0)
+                goto badvalue32;
+            td->td_rowsperstrip = v32;
+            if (!TIFFFieldSet(tif, FIELD_TILEDIMENSIONS))
+            {
+                td->td_tilelength = v32;
+                td->td_tilewidth = td->td_imagewidth;
+            }
+            break;
+        case TIFFTAG_MINSAMPLEVALUE:
+            td->td_minsamplevalue = (uint16_t)va_arg(ap, uint16_vap);
+            break;
+        case TIFFTAG_MAXSAMPLEVALUE:
+            td->td_maxsamplevalue = (uint16_t)va_arg(ap, uint16_vap);
+            break;
+        case TIFFTAG_SMINSAMPLEVALUE:
+            if (tif->tif_flags & TIFF_PERSAMPLE)
+                _TIFFsetDoubleArrayExt(tif, &td->td_sminsamplevalue,
+                                       va_arg(ap, double *),
+                                       td->td_samplesperpixel);
+            else
+                setDoubleArrayOneValue(tif, &td->td_sminsamplevalue,
+                                       va_arg(ap, double),
+                                       td->td_samplesperpixel);
+            break;
+        case TIFFTAG_SMAXSAMPLEVALUE:
+            if (tif->tif_flags & TIFF_PERSAMPLE)
+                _TIFFsetDoubleArrayExt(tif, &td->td_smaxsamplevalue,
+                                       va_arg(ap, double *),
+                                       td->td_samplesperpixel);
+            else
+                setDoubleArrayOneValue(tif, &td->td_smaxsamplevalue,
+                                       va_arg(ap, double),
+                                       td->td_samplesperpixel);
+            break;
+        case TIFFTAG_XRESOLUTION:
+            dblval = va_arg(ap, double);
+            if (dblval != dblval || dblval < 0)
+                goto badvaluedouble;
+            td->td_xresolution = _TIFFClampDoubleToFloat(dblval);
+            break;
+        case TIFFTAG_YRESOLUTION:
+            dblval = va_arg(ap, double);
+            if (dblval != dblval || dblval < 0)
+                goto badvaluedouble;
+            td->td_yresolution = _TIFFClampDoubleToFloat(dblval);
+            break;
+        case TIFFTAG_PLANARCONFIG:
+            v = (uint16_t)va_arg(ap, uint16_vap);
+            if (v != PLANARCONFIG_CONTIG && v != PLANARCONFIG_SEPARATE)
+                goto badvalue;
+            td->td_planarconfig = (uint16_t)v;
+            break;
+        case TIFFTAG_XPOSITION:
+            td->td_xposition = _TIFFClampDoubleToFloat(va_arg(ap, double));
+            break;
+        case TIFFTAG_YPOSITION:
+            td->td_yposition = _TIFFClampDoubleToFloat(va_arg(ap, double));
+            break;
+        case TIFFTAG_RESOLUTIONUNIT:
+            v = (uint16_t)va_arg(ap, uint16_vap);
+            if (v < RESUNIT_NONE || RESUNIT_CENTIMETER < v)
+                goto badvalue;
+            td->td_resolutionunit = (uint16_t)v;
+            break;
+        case TIFFTAG_PAGENUMBER:
+            td->td_pagenumber[0] = (uint16_t)va_arg(ap, uint16_vap);
+            td->td_pagenumber[1] = (uint16_t)va_arg(ap, uint16_vap);
+            break;
+        case TIFFTAG_HALFTONEHINTS:
+            td->td_halftonehints[0] = (uint16_t)va_arg(ap, uint16_vap);
+            td->td_halftonehints[1] = (uint16_t)va_arg(ap, uint16_vap);
+            break;
+        case TIFFTAG_COLORMAP:
+            v32 = (uint32_t)(1L << td->td_bitspersample);
+            _TIFFsetShortArrayExt(tif, &td->td_colormap[0],
+                                  va_arg(ap, uint16_t *), v32);
+            _TIFFsetShortArrayExt(tif, &td->td_colormap[1],
+                                  va_arg(ap, uint16_t *), v32);
+            _TIFFsetShortArrayExt(tif, &td->td_colormap[2],
+                                  va_arg(ap, uint16_t *), v32);
+            break;
+        case TIFFTAG_EXTRASAMPLES:
+            if (!setExtraSamples(tif, ap, &v))
+                goto badvalue;
+            break;
+        case TIFFTAG_MATTEING:
+            td->td_extrasamples = (((uint16_t)va_arg(ap, uint16_vap)) != 0);
+            if (td->td_extrasamples)
+            {
+                uint16_t sv = EXTRASAMPLE_ASSOCALPHA;
+                _TIFFsetShortArrayExt(tif, &td->td_sampleinfo, &sv, 1);
+            }
+            break;
+        case TIFFTAG_TILEWIDTH:
+            v32 = (uint32_t)va_arg(ap, uint32_t);
+            if (v32 % 16)
+            {
+                if (tif->tif_mode != O_RDONLY)
+                    goto badvalue32;
+                TIFFWarningExtR(
+                    tif, tif->tif_name,
+                    "Nonstandard tile width %" PRIu32 ", convert file", v32);
+            }
+            td->td_tilewidth = v32;
+            tif->tif_flags |= TIFF_ISTILED;
+            break;
+        case TIFFTAG_TILELENGTH:
+            v32 = (uint32_t)va_arg(ap, uint32_t);
+            if (v32 % 16)
+            {
+                if (tif->tif_mode != O_RDONLY)
+                    goto badvalue32;
+                TIFFWarningExtR(
+                    tif, tif->tif_name,
+                    "Nonstandard tile length %" PRIu32 ", convert file", v32);
+            }
+            td->td_tilelength = v32;
+            tif->tif_flags |= TIFF_ISTILED;
+            break;
+        case TIFFTAG_TILEDEPTH:
+            v32 = (uint32_t)va_arg(ap, uint32_t);
+            if (v32 == 0)
+                goto badvalue32;
+            td->td_tiledepth = v32;
+            break;
+        case TIFFTAG_DATATYPE:
+            v = (uint16_t)va_arg(ap, uint16_vap);
+            switch (v)
+            {
+                case DATATYPE_VOID:
+                    v = SAMPLEFORMAT_VOID;
+                    break;
+                case DATATYPE_INT:
+                    v = SAMPLEFORMAT_INT;
+                    break;
+                case DATATYPE_UINT:
+                    v = SAMPLEFORMAT_UINT;
+                    break;
+                case DATATYPE_IEEEFP:
+                    v = SAMPLEFORMAT_IEEEFP;
+                    break;
+                default:
+                    goto badvalue;
+            }
+            td->td_sampleformat = (uint16_t)v;
+            break;
+        case TIFFTAG_SAMPLEFORMAT:
+            v = (uint16_t)va_arg(ap, uint16_vap);
+            if (v < SAMPLEFORMAT_UINT || SAMPLEFORMAT_COMPLEXIEEEFP < v)
+                goto badvalue;
+            td->td_sampleformat = (uint16_t)v;
+
+            /*  Try to fix up the SWAB function for complex data. */
+            if (td->td_sampleformat == SAMPLEFORMAT_COMPLEXINT &&
+                td->td_bitspersample == 32 &&
+                tif->tif_postdecode == _TIFFSwab32BitData)
+                tif->tif_postdecode = _TIFFSwab16BitData;
+            else if ((td->td_sampleformat == SAMPLEFORMAT_COMPLEXINT ||
+                      td->td_sampleformat == SAMPLEFORMAT_COMPLEXIEEEFP) &&
+                     td->td_bitspersample == 64 &&
+                     tif->tif_postdecode == _TIFFSwab64BitData)
+                tif->tif_postdecode = _TIFFSwab32BitData;
+            break;
+        case TIFFTAG_IMAGEDEPTH:
+            td->td_imagedepth = (uint32_t)va_arg(ap, uint32_t);
+            break;
+        case TIFFTAG_SUBIFD:
+            if ((tif->tif_flags & TIFF_INSUBIFD) == 0)
+            {
+                td->td_nsubifd = (uint16_t)va_arg(ap, uint16_vap);
+                _TIFFsetLong8Array(tif, &td->td_subifd,
+                                   (uint64_t *)va_arg(ap, uint64_t *),
+                                   (uint32_t)td->td_nsubifd);
+            }
+            else
+            {
+                TIFFErrorExtR(tif, module, "%s: Sorry, cannot nest SubIFDs",
+                              tif->tif_name);
+                status = 0;
+            }
+            break;
+        case TIFFTAG_YCBCRPOSITIONING:
+            td->td_ycbcrpositioning = (uint16_t)va_arg(ap, uint16_vap);
+            break;
+        case TIFFTAG_YCBCRSUBSAMPLING:
+            td->td_ycbcrsubsampling[0] = (uint16_t)va_arg(ap, uint16_vap);
+            td->td_ycbcrsubsampling[1] = (uint16_t)va_arg(ap, uint16_vap);
+            break;
+        case TIFFTAG_TRANSFERFUNCTION:
+        {
+            uint32_t i;
+            v = (td->td_samplesperpixel - td->td_extrasamples) > 1 ? 3 : 1;
+            for (i = 0; i < v; i++)
+                _TIFFsetShortArrayExt(tif, &td->td_transferfunction[i],
+                                      va_arg(ap, uint16_t *),
+                                      1U << td->td_bitspersample);
+            break;
+        }
+        case TIFFTAG_REFERENCEBLACKWHITE:
+            /* XXX should check for null range */
+            _TIFFsetFloatArrayExt(tif, &td->td_refblackwhite,
+                                  va_arg(ap, float *), 6);
+            break;
+        case TIFFTAG_INKNAMES:
+        {
+            v = (uint16_t)va_arg(ap, uint16_vap);
+            s = va_arg(ap, char *);
+            uint16_t ninksinstring;
+            ninksinstring = countInkNamesString(tif, v, s);
+            status = ninksinstring > 0;
+            if (ninksinstring > 0)
+            {
+                _TIFFsetNString(tif, &td->td_inknames, s, v);
+                td->td_inknameslen = v;
+                /* Set NumberOfInks to the value ninksinstring */
+                if (TIFFFieldSet(tif, FIELD_NUMBEROFINKS))
+                {
+                    if (td->td_numberofinks != ninksinstring)
+                    {
+                        TIFFErrorExtR(
+                            tif, module,
+                            "Warning %s; Tag %s:\n  Value %" PRIu16
+                            " of NumberOfInks is different from the number of "
+                            "inks %" PRIu16
+                            ".\n  -> NumberOfInks value adapted to %" PRIu16 "",
+                            tif->tif_name, fip->field_name, td->td_numberofinks,
+                            ninksinstring, ninksinstring);
+                        td->td_numberofinks = ninksinstring;
+                    }
+                }
+                else
+                {
+                    td->td_numberofinks = ninksinstring;
+                    TIFFSetFieldBit(tif, FIELD_NUMBEROFINKS);
+                }
+                if (TIFFFieldSet(tif, FIELD_SAMPLESPERPIXEL))
+                {
+                    if (td->td_numberofinks != td->td_samplesperpixel)
+                    {
+                        TIFFErrorExtR(tif, module,
+                                      "Warning %s; Tag %s:\n  Value %" PRIu16
+                                      " of NumberOfInks is different from the "
+                                      "SamplesPerPixel value %" PRIu16 "",
+                                      tif->tif_name, fip->field_name,
+                                      td->td_numberofinks,
+                                      td->td_samplesperpixel);
+                    }
+                }
+            }
+        }
+        break;
+        case TIFFTAG_NUMBEROFINKS:
+            v = (uint16_t)va_arg(ap, uint16_vap);
+            /* If InkNames already set also NumberOfInks is set accordingly and
+             * should be equal */
+            if (TIFFFieldSet(tif, FIELD_INKNAMES))
+            {
+                if (v != td->td_numberofinks)
+                {
+                    TIFFErrorExtR(
+                        tif, module,
+                        "Error %s; Tag %s:\n  It is not possible to set the "
+                        "value %" PRIu32
+                        " for NumberOfInks\n  which is different from the "
+                        "number of inks in the InkNames tag (%" PRIu16 ")",
+                        tif->tif_name, fip->field_name, v, td->td_numberofinks);
+                    /* Do not set / overwrite number of inks already set by
+                     * InkNames case accordingly. */
+                    status = 0;
+                }
+            }
+            else
+            {
+                td->td_numberofinks = (uint16_t)v;
+                if (TIFFFieldSet(tif, FIELD_SAMPLESPERPIXEL))
+                {
+                    if (td->td_numberofinks != td->td_samplesperpixel)
+                    {
+                        TIFFErrorExtR(tif, module,
+                                      "Warning %s; Tag %s:\n  Value %" PRIu32
+                                      " of NumberOfInks is different from the "
+                                      "SamplesPerPixel value %" PRIu16 "",
+                                      tif->tif_name, fip->field_name, v,
+                                      td->td_samplesperpixel);
+                    }
+                }
+            }
+            break;
+        case TIFFTAG_PERSAMPLE:
+            v = (uint16_t)va_arg(ap, uint16_vap);
+            if (v == PERSAMPLE_MULTI)
+                tif->tif_flags |= TIFF_PERSAMPLE;
+            else
+                tif->tif_flags &= ~TIFF_PERSAMPLE;
+            break;
+        default:
+        {
+            TIFFTagValue *tv;
+            int tv_size, iCustom;
+
+            /*
+             * This can happen if multiple images are open with different
+             * codecs which have private tags.  The global tag information
+             * table may then have tags that are valid for one file but not
+             * the other. If the client tries to set a tag that is not valid
+             * for the image's codec then we'll arrive here.  This
+             * happens, for example, when tiffcp is used to convert between
+             * compression schemes and codec-specific tags are blindly copied.
+             *
+             * This also happens when a FIELD_IGNORE tag is written.
+             */
+            if (fip->field_bit == FIELD_IGNORE)
+            {
+                TIFFErrorExtR(
+                    tif, module,
+                    "%s: Ignored %stag \"%s\" (not supported by libtiff)",
+                    tif->tif_name, isPseudoTag(tag) ? "pseudo-" : "",
+                    fip->field_name);
+                status = 0;
+                break;
+            }
+            if (fip->field_bit != FIELD_CUSTOM)
+            {
+                TIFFErrorExtR(
+                    tif, module,
+                    "%s: Invalid %stag \"%s\" (not supported by codec)",
+                    tif->tif_name, isPseudoTag(tag) ? "pseudo-" : "",
+                    fip->field_name);
+                status = 0;
+                break;
+            }
+
+            /*
+             * Find the existing entry for this custom value.
+             */
+            tv = NULL;
+            for (iCustom = 0; iCustom < td->td_customValueCount; iCustom++)
+            {
+                if (td->td_customValues[iCustom].info->field_tag == tag)
+                {
+                    tv = td->td_customValues + iCustom;
+                    if (tv->value != NULL)
+                    {
+                        _TIFFfreeExt(tif, tv->value);
+                        tv->value = NULL;
+                    }
+                    break;
+                }
+            }
+
+            /*
+             * Grow the custom list if the entry was not found.
+             */
+            if (tv == NULL)
+            {
+                TIFFTagValue *new_customValues;
+
+                td->td_customValueCount++;
+                new_customValues = (TIFFTagValue *)_TIFFreallocExt(
+                    tif, td->td_customValues,
+                    sizeof(TIFFTagValue) * td->td_customValueCount);
+                if (!new_customValues)
+                {
+                    TIFFErrorExtR(tif, module,
+                                  "%s: Failed to allocate space for list of "
+                                  "custom values",
+                                  tif->tif_name);
+                    status = 0;
+                    goto end;
+                }
+
+                td->td_customValues = new_customValues;
+
+                tv = td->td_customValues + (td->td_customValueCount - 1);
+                tv->info = fip;
+                tv->value = NULL;
+                tv->count = 0;
+            }
+
+            /*
+             * Set custom value ... save a copy of the custom tag value.
+             */
+            /*--: Rational2Double: For Rationals evaluate "set_field_type" to
+             * determine internal storage size. */
+            tv_size = TIFFFieldSetGetSize(fip);
+            if (tv_size == 0)
+            {
+                status = 0;
+                TIFFErrorExtR(tif, module, "%s: Bad field type %d for \"%s\"",
+                              tif->tif_name, fip->field_type, fip->field_name);
+                goto end;
+            }
+
+            if (fip->field_type == TIFF_ASCII)
+            {
+                uint32_t ma;
+                const char *mb;
+                if (fip->field_passcount)
+                {
+                    assert(fip->field_writecount == TIFF_VARIABLE2);
+                    ma = (uint32_t)va_arg(ap, uint32_t);
+                    mb = (const char *)va_arg(ap, const char *);
+                }
+                else
+                {
+                    mb = (const char *)va_arg(ap, const char *);
+                    size_t len = strlen(mb) + 1;
+                    if (len >= 0x80000000U)
+                    {
+                        status = 0;
+                        TIFFErrorExtR(tif, module,
+                                      "%s: Too long string value for \"%s\". "
+                                      "Maximum supported is 2147483647 bytes",
+                                      tif->tif_name, fip->field_name);
+                        goto end;
+                    }
+                    ma = (uint32_t)len;
+                }
+                tv->count = ma;
+                setByteArray(tif, &tv->value, mb, ma, 1);
+            }
+            else
+            {
+                if (fip->field_passcount)
+                {
+                    if (fip->field_writecount == TIFF_VARIABLE2)
+                        tv->count = (uint32_t)va_arg(ap, uint32_t);
+                    else
+                        tv->count = (int)va_arg(ap, int);
+                }
+                else if (fip->field_writecount == TIFF_VARIABLE ||
+                         fip->field_writecount == TIFF_VARIABLE2)
+                    tv->count = 1;
+                else if (fip->field_writecount == TIFF_SPP)
+                    tv->count = td->td_samplesperpixel;
+                else
+                    tv->count = fip->field_writecount;
+
+                if (tv->count == 0)
+                {
+                    status = 0;
+                    TIFFErrorExtR(tif, module,
+                                  "%s: Null count for \"%s\" (type "
+                                  "%d, writecount %d, passcount %d)",
+                                  tif->tif_name, fip->field_name,
+                                  fip->field_type, fip->field_writecount,
+                                  fip->field_passcount);
+                    goto end;
+                }
+
+                tv->value = _TIFFCheckMalloc(tif, tv->count, tv_size,
+                                             "custom tag binary object");
+                if (!tv->value)
+                {
+                    status = 0;
+                    goto end;
+                }
+
+                if (fip->field_tag == TIFFTAG_DOTRANGE &&
+                    strcmp(fip->field_name, "DotRange") == 0)
+                {
+                    /* TODO: This is an evil exception and should not have been
+                       handled this way ... likely best if we move it into
+                       the directory structure with an explicit field in
+                       libtiff 4.1 and assign it a FIELD_ value */
+                    uint16_t v2[2];
+                    v2[0] = (uint16_t)va_arg(ap, int);
+                    v2[1] = (uint16_t)va_arg(ap, int);
+                    _TIFFmemcpy(tv->value, &v2, 4);
+                }
+
+                else if (fip->field_passcount ||
+                         fip->field_writecount == TIFF_VARIABLE ||
+                         fip->field_writecount == TIFF_VARIABLE2 ||
+                         fip->field_writecount == TIFF_SPP || tv->count > 1)
+                {
+                    /*--: Rational2Double: For Rationals tv_size is set above to
+                     * 4 or 8 according to fip->set_field_type! */
+                    _TIFFmemcpy(tv->value, va_arg(ap, void *),
+                                tv->count * tv_size);
+                    /* Test here for too big values for LONG8, SLONG8 in
+                     * ClassicTIFF and delete custom field from custom list */
+                    if (!(tif->tif_flags & TIFF_BIGTIFF))
+                    {
+                        if (tv->info->field_type == TIFF_LONG8)
+                        {
+                            uint64_t *pui64 = (uint64_t *)tv->value;
+                            for (int i = 0; i < tv->count; i++)
+                            {
+                                if (pui64[i] > 0xffffffffu)
+                                {
+                                    TIFFErrorExtR(
+                                        tif, module,
+                                        "%s: Bad LONG8 value %" PRIu64
+                                        " at %d. array position for \"%s\" tag "
+                                        "%d in ClassicTIFF. Tag won't be "
+                                        "written to file",
+                                        tif->tif_name, pui64[i], i,
+                                        fip->field_name, tag);
+                                    goto badvalueifd8long8;
+                                }
+                            }
+                        }
+                        else if (tv->info->field_type == TIFF_SLONG8)
+                        {
+                            int64_t *pi64 = (int64_t *)tv->value;
+                            for (int i = 0; i < tv->count; i++)
+                            {
+                                if (pi64[i] > 2147483647 ||
+                                    pi64[i] < (-2147483647 - 1))
+                                {
+                                    TIFFErrorExtR(
+                                        tif, module,
+                                        "%s: Bad SLONG8 value %" PRIi64
+                                        " at %d. array position for \"%s\" tag "
+                                        "%d in ClassicTIFF. Tag won't be "
+                                        "written to file",
+                                        tif->tif_name, pi64[i], i,
+                                        fip->field_name, tag);
+                                    goto badvalueifd8long8;
+                                }
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    char *val = (char *)tv->value;
+                    assert(tv->count == 1);
+
+                    switch (fip->field_type)
+                    {
+                        case TIFF_BYTE:
+                        case TIFF_UNDEFINED:
+                        {
+                            uint8_t v2 = (uint8_t)va_arg(ap, int);
+                            _TIFFmemcpy(val, &v2, tv_size);
+                        }
+                        break;
+                        case TIFF_SBYTE:
+                        {
+                            int8_t v2 = (int8_t)va_arg(ap, int);
+                            _TIFFmemcpy(val, &v2, tv_size);
+                        }
+                        break;
+                        case TIFF_SHORT:
+                        {
+                            uint16_t v2 = (uint16_t)va_arg(ap, int);
+                            _TIFFmemcpy(val, &v2, tv_size);
+                        }
+                        break;
+                        case TIFF_SSHORT:
+                        {
+                            int16_t v2 = (int16_t)va_arg(ap, int);
+                            _TIFFmemcpy(val, &v2, tv_size);
+                        }
+                        break;
+                        case TIFF_LONG:
+                        case TIFF_IFD:
+                        {
+                            uint32_t v2 = va_arg(ap, uint32_t);
+                            _TIFFmemcpy(val, &v2, tv_size);
+                        }
+                        break;
+                        case TIFF_SLONG:
+                        {
+                            int32_t v2 = va_arg(ap, int32_t);
+                            _TIFFmemcpy(val, &v2, tv_size);
+                        }
+                        break;
+                        case TIFF_LONG8:
+                        case TIFF_IFD8:
+                        {
+                            uint64_t v2 = va_arg(ap, uint64_t);
+                            _TIFFmemcpy(val, &v2, tv_size);
+                            /* Test here for too big values for ClassicTIFF and
+                             * delete custom field from custom list */
+                            if (!(tif->tif_flags & TIFF_BIGTIFF) &&
+                                (v2 > 0xffffffffu))
+                            {
+                                TIFFErrorExtR(
+                                    tif, module,
+                                    "%s: Bad LONG8 or IFD8 value %" PRIu64
+                                    " for \"%s\" tag %d in ClassicTIFF. Tag "
+                                    "won't be written to file",
+                                    tif->tif_name, v2, fip->field_name, tag);
+                                goto badvalueifd8long8;
+                            }
+                        }
+                        break;
+                        case TIFF_SLONG8:
+                        {
+                            int64_t v2 = va_arg(ap, int64_t);
+                            _TIFFmemcpy(val, &v2, tv_size);
+                            /* Test here for too big values for ClassicTIFF and
+                             * delete custom field from custom list */
+                            if (!(tif->tif_flags & TIFF_BIGTIFF) &&
+                                ((v2 > 2147483647) || (v2 < (-2147483647 - 1))))
+                            {
+                                TIFFErrorExtR(
+                                    tif, module,
+                                    "%s: Bad SLONG8 value %" PRIi64
+                                    " for \"%s\" tag %d in ClassicTIFF. Tag "
+                                    "won't be written to file",
+                                    tif->tif_name, v2, fip->field_name, tag);
+                                goto badvalueifd8long8;
+                            }
+                        }
+                        break;
+                        case TIFF_RATIONAL:
+                        case TIFF_SRATIONAL:
+                            /*-- Rational2Double: For Rationals tv_size is set
+                             * above to 4 or 8 according to fip->set_field_type!
+                             */
+                            {
+                                if (tv_size == 8)
+                                {
+                                    double v2 = va_arg(ap, double);
+                                    _TIFFmemcpy(val, &v2, tv_size);
+                                }
+                                else
+                                {
+                                    /*-- default should be tv_size == 4 */
+                                    float v3 = (float)va_arg(ap, double);
+                                    _TIFFmemcpy(val, &v3, tv_size);
+                                    /*-- ToDo: After Testing, this should be
+                                     * removed and tv_size==4 should be set as
+                                     * default. */
+                                    if (tv_size != 4)
+                                    {
+                                        TIFFErrorExtR(
+                                            tif, module,
+                                            "Rational2Double: .set_field_type "
+                                            "in not 4 but %d",
+                                            tv_size);
+                                    }
+                                }
+                            }
+                            break;
+                        case TIFF_FLOAT:
+                        {
+                            float v2 =
+                                _TIFFClampDoubleToFloat(va_arg(ap, double));
+                            _TIFFmemcpy(val, &v2, tv_size);
+                        }
+                        break;
+                        case TIFF_DOUBLE:
+                        {
+                            double v2 = va_arg(ap, double);
+                            _TIFFmemcpy(val, &v2, tv_size);
+                        }
+                        break;
+                        default:
+                            _TIFFmemset(val, 0, tv_size);
+                            status = 0;
+                            break;
+                    }
+                }
             }
         }
-		td->td_samplesperpixel = (uint16) v;
-		break;
-	case TIFFTAG_ROWSPERSTRIP:
-		v32 = (uint32) va_arg(ap, uint32);
-		if (v32 == 0)
-			goto badvalue32;
-		td->td_rowsperstrip = v32;
-		if (!TIFFFieldSet(tif, FIELD_TILEDIMENSIONS)) {
-			td->td_tilelength = v32;
-			td->td_tilewidth = td->td_imagewidth;
-		}
-		break;
-	case TIFFTAG_MINSAMPLEVALUE:
-		td->td_minsamplevalue = (uint16) va_arg(ap, uint16_vap);
-		break;
-	case TIFFTAG_MAXSAMPLEVALUE:
-		td->td_maxsamplevalue = (uint16) va_arg(ap, uint16_vap);
-		break;
-	case TIFFTAG_SMINSAMPLEVALUE:
-		if (tif->tif_flags & TIFF_PERSAMPLE)
-			_TIFFsetDoubleArray(&td->td_sminsamplevalue, va_arg(ap, double*), td->td_samplesperpixel);
-		else
-			setDoubleArrayOneValue(&td->td_sminsamplevalue, va_arg(ap, double), td->td_samplesperpixel);
-		break;
-	case TIFFTAG_SMAXSAMPLEVALUE:
-		if (tif->tif_flags & TIFF_PERSAMPLE)
-			_TIFFsetDoubleArray(&td->td_smaxsamplevalue, va_arg(ap, double*), td->td_samplesperpixel);
-		else
-			setDoubleArrayOneValue(&td->td_smaxsamplevalue, va_arg(ap, double), td->td_samplesperpixel);
-		break;
-	case TIFFTAG_XRESOLUTION:
-        dblval = va_arg(ap, double);
-        if( dblval < 0 )
-            goto badvaluedouble;
-		td->td_xresolution = _TIFFClampDoubleToFloat( dblval );
-		break;
-	case TIFFTAG_YRESOLUTION:
-        dblval = va_arg(ap, double);
-        if( dblval < 0 )
-            goto badvaluedouble;
-		td->td_yresolution = _TIFFClampDoubleToFloat( dblval );
-		break;
-	case TIFFTAG_PLANARCONFIG:
-		v = (uint16) va_arg(ap, uint16_vap);
-		if (v != PLANARCONFIG_CONTIG && v != PLANARCONFIG_SEPARATE)
-			goto badvalue;
-		td->td_planarconfig = (uint16) v;
-		break;
-	case TIFFTAG_XPOSITION:
-		td->td_xposition = _TIFFClampDoubleToFloat( va_arg(ap, double) );
-		break;
-	case TIFFTAG_YPOSITION:
-		td->td_yposition = _TIFFClampDoubleToFloat( va_arg(ap, double) );
-		break;
-	case TIFFTAG_RESOLUTIONUNIT:
-		v = (uint16) va_arg(ap, uint16_vap);
-		if (v < RESUNIT_NONE || RESUNIT_CENTIMETER < v)
-			goto badvalue;
-		td->td_resolutionunit = (uint16) v;
-		break;
-	case TIFFTAG_PAGENUMBER:
-		td->td_pagenumber[0] = (uint16) va_arg(ap, uint16_vap);
-		td->td_pagenumber[1] = (uint16) va_arg(ap, uint16_vap);
-		break;
-	case TIFFTAG_HALFTONEHINTS:
-		td->td_halftonehints[0] = (uint16) va_arg(ap, uint16_vap);
-		td->td_halftonehints[1] = (uint16) va_arg(ap, uint16_vap);
-		break;
-	case TIFFTAG_COLORMAP:
-		v32 = (uint32)(1L<<td->td_bitspersample);
-		_TIFFsetShortArray(&td->td_colormap[0], va_arg(ap, uint16*), v32);
-		_TIFFsetShortArray(&td->td_colormap[1], va_arg(ap, uint16*), v32);
-		_TIFFsetShortArray(&td->td_colormap[2], va_arg(ap, uint16*), v32);
-		break;
-	case TIFFTAG_EXTRASAMPLES:
-		if (!setExtraSamples(tif, ap, &v))
-			goto badvalue;
-		break;
-	case TIFFTAG_MATTEING:
-		td->td_extrasamples =  (((uint16) va_arg(ap, uint16_vap)) != 0);
-		if (td->td_extrasamples) {
-			uint16 sv = EXTRASAMPLE_ASSOCALPHA;
-			_TIFFsetShortArray(&td->td_sampleinfo, &sv, 1);
-		}
-		break;
-	case TIFFTAG_TILEWIDTH:
-		v32 = (uint32) va_arg(ap, uint32);
-		if (v32 % 16) {
-			if (tif->tif_mode != O_RDONLY)
-				goto badvalue32;
-			TIFFWarningExt(tif->tif_clientdata, tif->tif_name,
-				"Nonstandard tile width %u, convert file", v32);
-		}
-		td->td_tilewidth = v32;
-		tif->tif_flags |= TIFF_ISTILED;
-		break;
-	case TIFFTAG_TILELENGTH:
-		v32 = (uint32) va_arg(ap, uint32);
-		if (v32 % 16) {
-			if (tif->tif_mode != O_RDONLY)
-				goto badvalue32;
-			TIFFWarningExt(tif->tif_clientdata, tif->tif_name,
-			    "Nonstandard tile length %u, convert file", v32);
-		}
-		td->td_tilelength = v32;
-		tif->tif_flags |= TIFF_ISTILED;
-		break;
-	case TIFFTAG_TILEDEPTH:
-		v32 = (uint32) va_arg(ap, uint32);
-		if (v32 == 0)
-			goto badvalue32;
-		td->td_tiledepth = v32;
-		break;
-	case TIFFTAG_DATATYPE:
-		v = (uint16) va_arg(ap, uint16_vap);
-		switch (v) {
-		case DATATYPE_VOID:	v = SAMPLEFORMAT_VOID;	break;
-		case DATATYPE_INT:	v = SAMPLEFORMAT_INT;	break;
-		case DATATYPE_UINT:	v = SAMPLEFORMAT_UINT;	break;
-		case DATATYPE_IEEEFP:	v = SAMPLEFORMAT_IEEEFP;break;
-		default:		goto badvalue;
-		}
-		td->td_sampleformat = (uint16) v;
-		break;
-	case TIFFTAG_SAMPLEFORMAT:
-		v = (uint16) va_arg(ap, uint16_vap);
-		if (v < SAMPLEFORMAT_UINT || SAMPLEFORMAT_COMPLEXIEEEFP < v)
-			goto badvalue;
-		td->td_sampleformat = (uint16) v;
-
-		/*  Try to fix up the SWAB function for complex data. */
-		if( td->td_sampleformat == SAMPLEFORMAT_COMPLEXINT
-		    && td->td_bitspersample == 32
-		    && tif->tif_postdecode == _TIFFSwab32BitData )
-		    tif->tif_postdecode = _TIFFSwab16BitData;
-		else if( (td->td_sampleformat == SAMPLEFORMAT_COMPLEXINT
-			  || td->td_sampleformat == SAMPLEFORMAT_COMPLEXIEEEFP)
-			 && td->td_bitspersample == 64
-			 && tif->tif_postdecode == _TIFFSwab64BitData )
-		    tif->tif_postdecode = _TIFFSwab32BitData;
-		break;
-	case TIFFTAG_IMAGEDEPTH:
-		td->td_imagedepth = (uint32) va_arg(ap, uint32);
-		break;
-	case TIFFTAG_SUBIFD:
-		if ((tif->tif_flags & TIFF_INSUBIFD) == 0) {
-			td->td_nsubifd = (uint16) va_arg(ap, uint16_vap);
-			_TIFFsetLong8Array(&td->td_subifd, (uint64*) va_arg(ap, uint64*),
-			    (uint32) td->td_nsubifd);
-		} else {
-			TIFFErrorExt(tif->tif_clientdata, module,
-				     "%s: Sorry, cannot nest SubIFDs",
-				     tif->tif_name);
-			status = 0;
-		}
-		break;
-	case TIFFTAG_YCBCRPOSITIONING:
-		td->td_ycbcrpositioning = (uint16) va_arg(ap, uint16_vap);
-		break;
-	case TIFFTAG_YCBCRSUBSAMPLING:
-		td->td_ycbcrsubsampling[0] = (uint16) va_arg(ap, uint16_vap);
-		td->td_ycbcrsubsampling[1] = (uint16) va_arg(ap, uint16_vap);
-		break;
-	case TIFFTAG_TRANSFERFUNCTION:
-		v = (td->td_samplesperpixel - td->td_extrasamples) > 1 ? 3 : 1;
-		for (i = 0; i < v; i++)
-			_TIFFsetShortArray(&td->td_transferfunction[i],
-			    va_arg(ap, uint16*), 1U<<td->td_bitspersample);
-		break;
-	case TIFFTAG_REFERENCEBLACKWHITE:
-		/* XXX should check for null range */
-		_TIFFsetFloatArray(&td->td_refblackwhite, va_arg(ap, float*), 6);
-		break;
-	case TIFFTAG_INKNAMES:
-		v = (uint16) va_arg(ap, uint16_vap);
-		s = va_arg(ap, char*);
-		v = checkInkNamesString(tif, v, s);
-		status = v > 0;
-		if( v > 0 ) {
-			_TIFFsetNString(&td->td_inknames, s, v);
-			td->td_inknameslen = v;
-		}
-		break;
-	case TIFFTAG_PERSAMPLE:
-		v = (uint16) va_arg(ap, uint16_vap);
-		if( v == PERSAMPLE_MULTI )
-			tif->tif_flags |= TIFF_PERSAMPLE;
-		else
-			tif->tif_flags &= ~TIFF_PERSAMPLE;
-		break;
-	default: {
-		TIFFTagValue *tv;
-		int tv_size, iCustom;
-
-		/*
-		 * This can happen if multiple images are open with different
-		 * codecs which have private tags.  The global tag information
-		 * table may then have tags that are valid for one file but not
-		 * the other. If the client tries to set a tag that is not valid
-		 * for the image's codec then we'll arrive here.  This
-		 * happens, for example, when tiffcp is used to convert between
-		 * compression schemes and codec-specific tags are blindly copied.
-		 */
-		if(fip->field_bit != FIELD_CUSTOM) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "%s: Invalid %stag \"%s\" (not supported by codec)",
-			    tif->tif_name, isPseudoTag(tag) ? "pseudo-" : "",
-			    fip->field_name);
-			status = 0;
-			break;
-		}
-
-		/*
-		 * Find the existing entry for this custom value.
-		 */
-		tv = NULL;
-		for (iCustom = 0; iCustom < td->td_customValueCount; iCustom++) {
-			if (td->td_customValues[iCustom].info->field_tag == tag) {
-				tv = td->td_customValues + iCustom;
-				if (tv->value != NULL) {
-					_TIFFfree(tv->value);
-					tv->value = NULL;
-				}
-				break;
-			}
-		}
-
-		/*
-		 * Grow the custom list if the entry was not found.
-		 */
-		if(tv == NULL) {
-			TIFFTagValue *new_customValues;
-
-			td->td_customValueCount++;
-			new_customValues = (TIFFTagValue *)
-			    _TIFFrealloc(td->td_customValues,
-			    sizeof(TIFFTagValue) * td->td_customValueCount);
-			if (!new_customValues) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-				    "%s: Failed to allocate space for list of custom values",
-				    tif->tif_name);
-				status = 0;
-				goto end;
-			}
-
-			td->td_customValues = new_customValues;
-
-			tv = td->td_customValues + (td->td_customValueCount - 1);
-			tv->info = fip;
-			tv->value = NULL;
-			tv->count = 0;
-		}
-
-		/*
-		 * Set custom value ... save a copy of the custom tag value.
-		 */
-		tv_size = _TIFFDataSize(fip->field_type);
-		/*--: Rational2Double: For Rationals evaluate "set_field_type" to determine internal storage size. */
-		if (fip->field_type == TIFF_RATIONAL || fip->field_type == TIFF_SRATIONAL) {
-			tv_size = _TIFFSetGetFieldSize(fip->set_field_type);
-		}
-		if (tv_size == 0) {
-			status = 0;
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "%s: Bad field type %d for \"%s\"",
-			    tif->tif_name, fip->field_type,
-			    fip->field_name);
-			goto end;
-		}
-
-		if (fip->field_type == TIFF_ASCII)
-		{
-			uint32 ma;
-			char* mb;
-			if (fip->field_passcount)
-			{
-				assert(fip->field_writecount==TIFF_VARIABLE2);
-				ma=(uint32)va_arg(ap,uint32);
-				mb=(char*)va_arg(ap,char*);
-			}
-			else
-			{
-				mb=(char*)va_arg(ap,char*);
-				ma=(uint32)(strlen(mb)+1);
-			}
-			tv->count=ma;
-			setByteArray(&tv->value,mb,ma,1);
-		}
-		else
-		{
-			if (fip->field_passcount) {
-				if (fip->field_writecount == TIFF_VARIABLE2)
-					tv->count = (uint32) va_arg(ap, uint32);
-				else
-					tv->count = (int) va_arg(ap, int);
-			} else if (fip->field_writecount == TIFF_VARIABLE
-			   || fip->field_writecount == TIFF_VARIABLE2)
-				tv->count = 1;
-			else if (fip->field_writecount == TIFF_SPP)
-				tv->count = td->td_samplesperpixel;
-			else
-				tv->count = fip->field_writecount;
-
-			if (tv->count == 0) {
-				status = 0;
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "%s: Null count for \"%s\" (type "
-					     "%d, writecount %d, passcount %d)",
-					     tif->tif_name,
-					     fip->field_name,
-					     fip->field_type,
-					     fip->field_writecount,
-					     fip->field_passcount);
-				goto end;
-			}
-
-			tv->value = _TIFFCheckMalloc(tif, tv->count, tv_size,
-			    "custom tag binary object");
-			if (!tv->value) {
-				status = 0;
-				goto end;
-			}
-
-			if (fip->field_tag == TIFFTAG_DOTRANGE 
-			    && strcmp(fip->field_name,"DotRange") == 0) {
-				/* TODO: This is an evil exception and should not have been
-				   handled this way ... likely best if we move it into
-				   the directory structure with an explicit field in 
-				   libtiff 4.1 and assign it a FIELD_ value */
-				uint16 v2[2];
-				v2[0] = (uint16)va_arg(ap, int);
-				v2[1] = (uint16)va_arg(ap, int);
-				_TIFFmemcpy(tv->value, &v2, 4);
-			}
-
-			else if (fip->field_passcount
-				  || fip->field_writecount == TIFF_VARIABLE
-				  || fip->field_writecount == TIFF_VARIABLE2
-				  || fip->field_writecount == TIFF_SPP
-				  || tv->count > 1) {
-			  /*--: Rational2Double: For Rationals tv_size is set above to 4 or 8 according to fip->set_field_type! */
-				_TIFFmemcpy(tv->value, va_arg(ap, void *),
-				    tv->count * tv_size);
-			} else {
-				char *val = (char *)tv->value;
-				assert( tv->count == 1 );
-
-				switch (fip->field_type) {
-				case TIFF_BYTE:
-				case TIFF_UNDEFINED:
-					{
-						uint8 v2 = (uint8)va_arg(ap, int);
-						_TIFFmemcpy(val, &v2, tv_size);
-					}
-					break;
-				case TIFF_SBYTE:
-					{
-						int8 v2 = (int8)va_arg(ap, int);
-						_TIFFmemcpy(val, &v2, tv_size);
-					}
-					break;
-				case TIFF_SHORT:
-					{
-						uint16 v2 = (uint16)va_arg(ap, int);
-						_TIFFmemcpy(val, &v2, tv_size);
-					}
-					break;
-				case TIFF_SSHORT:
-					{
-						int16 v2 = (int16)va_arg(ap, int);
-						_TIFFmemcpy(val, &v2, tv_size);
-					}
-					break;
-				case TIFF_LONG:
-				case TIFF_IFD:
-					{
-						uint32 v2 = va_arg(ap, uint32);
-						_TIFFmemcpy(val, &v2, tv_size);
-					}
-					break;
-				case TIFF_SLONG:
-					{
-						int32 v2 = va_arg(ap, int32);
-						_TIFFmemcpy(val, &v2, tv_size);
-					}
-					break;
-				case TIFF_LONG8:
-				case TIFF_IFD8:
-					{
-						uint64 v2 = va_arg(ap, uint64);
-						_TIFFmemcpy(val, &v2, tv_size);
-					}
-					break;
-				case TIFF_SLONG8:
-					{
-						int64 v2 = va_arg(ap, int64);
-						_TIFFmemcpy(val, &v2, tv_size);
-					}
-					break;
-				case TIFF_RATIONAL:
-				case TIFF_SRATIONAL:
-					/*-- Rational2Double: For Rationals tv_size is set above to 4 or 8 according to fip->set_field_type! */
-					{
-						if (tv_size == 8) {
-							double v2 = va_arg(ap, double);
-							_TIFFmemcpy(val, &v2, tv_size);
-						} else {
-							/*-- default should be tv_size == 4 */
-							float v3 = (float)va_arg(ap, double);
-							_TIFFmemcpy(val, &v3, tv_size);
-							/*-- ToDo: After Testing, this should be removed and tv_size==4 should be set as default. */
-							if (tv_size != 4) {
-								TIFFErrorExt(0,"TIFFLib: _TIFFVSetField()", "Rational2Double: .set_field_type in not 4 but %d", tv_size); 
-							}
-						}
-					}
-					break;
-				case TIFF_FLOAT:
-					{
-						float v2 = _TIFFClampDoubleToFloat(va_arg(ap, double));
-						_TIFFmemcpy(val, &v2, tv_size);
-					}
-					break;
-				case TIFF_DOUBLE:
-					{
-						double v2 = va_arg(ap, double);
-						_TIFFmemcpy(val, &v2, tv_size);
-					}
-					break;
-				default:
-					_TIFFmemset(val, 0, tv_size);
-					status = 0;
-					break;
-				}
-			}
-		}
-	}
-	}
-	if (status) {
-		const TIFFField* fip2=TIFFFieldWithTag(tif,tag);
-		if (fip2)                
-			TIFFSetFieldBit(tif, fip2->field_bit);
-		tif->tif_flags |= TIFF_DIRTYDIRECT;
-	}
+    }
+    if (status)
+    {
+        const TIFFField *fip2 = TIFFFieldWithTag(tif, tag);
+        if (fip2)
+            TIFFSetFieldBit(tif, fip2->field_bit);
+        tif->tif_flags |= TIFF_DIRTYDIRECT;
+    }
 
 end:
-	va_end(ap);
-	return (status);
+    va_end(ap);
+    return (status);
 badvalue:
+{
+    const TIFFField *fip2 = TIFFFieldWithTag(tif, tag);
+    TIFFErrorExtR(tif, module, "%s: Bad value %" PRIu32 " for \"%s\" tag",
+                  tif->tif_name, v, fip2 ? fip2->field_name : "Unknown");
+    va_end(ap);
+}
+    return (0);
+badvalue32:
+{
+    const TIFFField *fip2 = TIFFFieldWithTag(tif, tag);
+    TIFFErrorExtR(tif, module, "%s: Bad value %" PRIu32 " for \"%s\" tag",
+                  tif->tif_name, v32, fip2 ? fip2->field_name : "Unknown");
+    va_end(ap);
+}
+    return (0);
+badvaluedouble:
+{
+    const TIFFField *fip2 = TIFFFieldWithTag(tif, tag);
+    TIFFErrorExtR(tif, module, "%s: Bad value %f for \"%s\" tag", tif->tif_name,
+                  dblval, fip2 ? fip2->field_name : "Unknown");
+    va_end(ap);
+}
+    return (0);
+badvalueifd8long8:
+{
+    /* Error message issued already above. */
+    TIFFTagValue *tv2 = NULL;
+    int iCustom2, iC2;
+    /* Find the existing entry for this custom value. */
+    for (iCustom2 = 0; iCustom2 < td->td_customValueCount; iCustom2++)
+    {
+        if (td->td_customValues[iCustom2].info->field_tag == tag)
         {
-		const TIFFField* fip2=TIFFFieldWithTag(tif,tag);
-		TIFFErrorExt(tif->tif_clientdata, module,
-		     "%s: Bad value %u for \"%s\" tag",
-		     tif->tif_name, v,
-		     fip2 ? fip2->field_name : "Unknown");
-		va_end(ap);
+            tv2 = td->td_customValues + (iCustom2);
+            break;
         }
-	return (0);
-badvalue32:
+    }
+    if (tv2 != NULL)
+    {
+        /* Remove custom field from custom list */
+        if (tv2->value != NULL)
         {
-		const TIFFField* fip2=TIFFFieldWithTag(tif,tag);
-		TIFFErrorExt(tif->tif_clientdata, module,
-		     "%s: Bad value %u for \"%s\" tag",
-		     tif->tif_name, v32,
-		     fip2 ? fip2->field_name : "Unknown");
-		va_end(ap);
+            _TIFFfreeExt(tif, tv2->value);
+            tv2->value = NULL;
         }
-	return (0);
-badvaluedouble:
+        /* Shorten list and close gap in customValues list.
+         * Re-allocation of td_customValues not necessary here. */
+        td->td_customValueCount--;
+        for (iC2 = iCustom2; iC2 < td->td_customValueCount; iC2++)
         {
-        const TIFFField* fip2=TIFFFieldWithTag(tif,tag);
-        TIFFErrorExt(tif->tif_clientdata, module,
-             "%s: Bad value %f for \"%s\" tag",
-             tif->tif_name, dblval,
-             fip2 ? fip2->field_name : "Unknown");
-        va_end(ap);
+            td->td_customValues[iC2] = td->td_customValues[iC2 + 1];
         }
-    return (0);
+    }
+    else
+    {
+        assert(0);
+    }
+    va_end(ap);
 }
+    return (0);
+} /*-- _TIFFVSetField() --*/
 
 /*
  * Return 1/0 according to whether or not
@@ -792,29 +1110,30 @@ _TIFFVSetField(TIFF* tif, uint32 tag, va_list ap)
  * has commenced, unless its value has no effect
  * on the format of the data that is written.
  */
-static int
-OkToChangeTag(TIFF* tif, uint32 tag)
+static int OkToChangeTag(TIFF *tif, uint32_t tag)
 {
-	const TIFFField* fip = TIFFFindField(tif, tag, TIFF_ANY);
-	if (!fip) {			/* unknown tag */
-		TIFFErrorExt(tif->tif_clientdata, "TIFFSetField", "%s: Unknown %stag %u",
-		    tif->tif_name, isPseudoTag(tag) ? "pseudo-" : "", tag);
-		return (0);
-	}
-	if (tag != TIFFTAG_IMAGELENGTH && (tif->tif_flags & TIFF_BEENWRITING) &&
-	    !fip->field_oktochange) {
-		/*
-		 * Consult info table to see if tag can be changed
-		 * after we've started writing.  We only allow changes
-		 * to those tags that don't/shouldn't affect the
-		 * compression and/or format of the data.
-		 */
-		TIFFErrorExt(tif->tif_clientdata, "TIFFSetField",
-		    "%s: Cannot modify tag \"%s\" while writing",
-		    tif->tif_name, fip->field_name);
-		return (0);
-	}
-	return (1);
+    const TIFFField *fip = TIFFFindField(tif, tag, TIFF_ANY);
+    if (!fip)
+    { /* unknown tag */
+        TIFFErrorExtR(tif, "TIFFSetField", "%s: Unknown %stag %" PRIu32,
+                      tif->tif_name, isPseudoTag(tag) ? "pseudo-" : "", tag);
+        return (0);
+    }
+    if (tag != TIFFTAG_IMAGELENGTH && (tif->tif_flags & TIFF_BEENWRITING) &&
+        !fip->field_oktochange)
+    {
+        /*
+         * Consult info table to see if tag can be changed
+         * after we've started writing.  We only allow changes
+         * to those tags that don't/shouldn't affect the
+         * compression and/or format of the data.
+         */
+        TIFFErrorExtR(tif, "TIFFSetField",
+                      "%s: Cannot modify tag \"%s\" while writing",
+                      tif->tif_name, fip->field_name);
+        return (0);
+    }
+    return (1);
 }
 
 /*
@@ -824,54 +1143,54 @@ OkToChangeTag(TIFF* tif, uint32 tag)
  * when/if the directory structure is
  * updated.
  */
-int
-TIFFSetField(TIFF* tif, uint32 tag, ...)
+int TIFFSetField(TIFF *tif, uint32_t tag, ...)
 {
-	va_list ap;
-	int status;
+    va_list ap;
+    int status;
 
-	va_start(ap, tag);
-	status = TIFFVSetField(tif, tag, ap);
-	va_end(ap);
-	return (status);
+    va_start(ap, tag);
+    status = TIFFVSetField(tif, tag, ap);
+    va_end(ap);
+    return (status);
 }
 
 /*
  * Clear the contents of the field in the internal structure.
  */
-int
-TIFFUnsetField(TIFF* tif, uint32 tag)
+int TIFFUnsetField(TIFF *tif, uint32_t tag)
 {
-    const TIFFField *fip =  TIFFFieldWithTag(tif, tag);
-    TIFFDirectory* td = &tif->tif_dir;
+    const TIFFField *fip = TIFFFieldWithTag(tif, tag);
+    TIFFDirectory *td = &tif->tif_dir;
 
-    if( !fip )
+    if (!fip)
         return 0;
 
-    if( fip->field_bit != FIELD_CUSTOM )
+    if (fip->field_bit != FIELD_CUSTOM)
         TIFFClrFieldBit(tif, fip->field_bit);
     else
     {
         TIFFTagValue *tv = NULL;
         int i;
 
-        for (i = 0; i < td->td_customValueCount; i++) {
-                
+        for (i = 0; i < td->td_customValueCount; i++)
+        {
+
             tv = td->td_customValues + i;
-            if( tv->info->field_tag == tag )
+            if (tv->info->field_tag == tag)
                 break;
         }
 
-        if( i < td->td_customValueCount )
+        if (i < td->td_customValueCount)
         {
-            _TIFFfree(tv->value);
-            for( ; i < td->td_customValueCount-1; i++) {
-                td->td_customValues[i] = td->td_customValues[i+1];
+            _TIFFfreeExt(tif, tv->value);
+            for (; i < td->td_customValueCount - 1; i++)
+            {
+                td->td_customValues[i] = td->td_customValues[i + 1];
             }
             td->td_customValueCount--;
         }
     }
-        
+
     tif->tif_flags |= TIFF_DIRTYDIRECT;
 
     return (1);
@@ -883,399 +1202,392 @@ TIFFUnsetField(TIFF* tif, uint32 tag)
  * for building higher-level interfaces on
  * top of the library.
  */
-int
-TIFFVSetField(TIFF* tif, uint32 tag, va_list ap)
+int TIFFVSetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	return OkToChangeTag(tif, tag) ?
-	    (*tif->tif_tagmethods.vsetfield)(tif, tag, ap) : 0;
+    return OkToChangeTag(tif, tag)
+               ? (*tif->tif_tagmethods.vsetfield)(tif, tag, ap)
+               : 0;
 }
 
-static int
-_TIFFVGetField(TIFF* tif, uint32 tag, va_list ap)
+static int _TIFFVGetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	TIFFDirectory* td = &tif->tif_dir;
-	int ret_val = 1;
-	uint32 standard_tag = tag;
-	const TIFFField* fip = TIFFFindField(tif, tag, TIFF_ANY);
-	if( fip == NULL ) /* cannot happen since TIFFGetField() already checks it */
-	    return 0;
-
-	/*
-	 * We want to force the custom code to be used for custom
-	 * fields even if the tag happens to match a well known 
-	 * one - important for reinterpreted handling of standard
-	 * tag values in custom directories (i.e. EXIF) 
-	 */
-	if (fip->field_bit == FIELD_CUSTOM) {
-		standard_tag = 0;
-	}
-	
-        if( standard_tag == TIFFTAG_NUMBEROFINKS )
+    TIFFDirectory *td = &tif->tif_dir;
+    int ret_val = 1;
+    uint32_t standard_tag = tag;
+    const TIFFField *fip = TIFFFindField(tif, tag, TIFF_ANY);
+    if (fip == NULL) /* cannot happen since TIFFGetField() already checks it */
+        return 0;
+
+    /*
+     * We want to force the custom code to be used for custom
+     * fields even if the tag happens to match a well known
+     * one - important for reinterpreted handling of standard
+     * tag values in custom directories (i.e. EXIF)
+     */
+    if (fip->field_bit == FIELD_CUSTOM)
+    {
+        standard_tag = 0;
+    }
+
+    switch (standard_tag)
+    {
+        case TIFFTAG_SUBFILETYPE:
+            *va_arg(ap, uint32_t *) = td->td_subfiletype;
+            break;
+        case TIFFTAG_IMAGEWIDTH:
+            *va_arg(ap, uint32_t *) = td->td_imagewidth;
+            break;
+        case TIFFTAG_IMAGELENGTH:
+            *va_arg(ap, uint32_t *) = td->td_imagelength;
+            break;
+        case TIFFTAG_BITSPERSAMPLE:
+            *va_arg(ap, uint16_t *) = td->td_bitspersample;
+            break;
+        case TIFFTAG_COMPRESSION:
+            *va_arg(ap, uint16_t *) = td->td_compression;
+            break;
+        case TIFFTAG_PHOTOMETRIC:
+            *va_arg(ap, uint16_t *) = td->td_photometric;
+            break;
+        case TIFFTAG_THRESHHOLDING:
+            *va_arg(ap, uint16_t *) = td->td_threshholding;
+            break;
+        case TIFFTAG_FILLORDER:
+            *va_arg(ap, uint16_t *) = td->td_fillorder;
+            break;
+        case TIFFTAG_ORIENTATION:
+            *va_arg(ap, uint16_t *) = td->td_orientation;
+            break;
+        case TIFFTAG_SAMPLESPERPIXEL:
+            *va_arg(ap, uint16_t *) = td->td_samplesperpixel;
+            break;
+        case TIFFTAG_ROWSPERSTRIP:
+            *va_arg(ap, uint32_t *) = td->td_rowsperstrip;
+            break;
+        case TIFFTAG_MINSAMPLEVALUE:
+            *va_arg(ap, uint16_t *) = td->td_minsamplevalue;
+            break;
+        case TIFFTAG_MAXSAMPLEVALUE:
+            *va_arg(ap, uint16_t *) = td->td_maxsamplevalue;
+            break;
+        case TIFFTAG_SMINSAMPLEVALUE:
+            if (tif->tif_flags & TIFF_PERSAMPLE)
+                *va_arg(ap, double **) = td->td_sminsamplevalue;
+            else
+            {
+                /* libtiff historically treats this as a single value. */
+                uint16_t i;
+                double v = td->td_sminsamplevalue[0];
+                for (i = 1; i < td->td_samplesperpixel; ++i)
+                    if (td->td_sminsamplevalue[i] < v)
+                        v = td->td_sminsamplevalue[i];
+                *va_arg(ap, double *) = v;
+            }
+            break;
+        case TIFFTAG_SMAXSAMPLEVALUE:
+            if (tif->tif_flags & TIFF_PERSAMPLE)
+                *va_arg(ap, double **) = td->td_smaxsamplevalue;
+            else
+            {
+                /* libtiff historically treats this as a single value. */
+                uint16_t i;
+                double v = td->td_smaxsamplevalue[0];
+                for (i = 1; i < td->td_samplesperpixel; ++i)
+                    if (td->td_smaxsamplevalue[i] > v)
+                        v = td->td_smaxsamplevalue[i];
+                *va_arg(ap, double *) = v;
+            }
+            break;
+        case TIFFTAG_XRESOLUTION:
+            *va_arg(ap, float *) = td->td_xresolution;
+            break;
+        case TIFFTAG_YRESOLUTION:
+            *va_arg(ap, float *) = td->td_yresolution;
+            break;
+        case TIFFTAG_PLANARCONFIG:
+            *va_arg(ap, uint16_t *) = td->td_planarconfig;
+            break;
+        case TIFFTAG_XPOSITION:
+            *va_arg(ap, float *) = td->td_xposition;
+            break;
+        case TIFFTAG_YPOSITION:
+            *va_arg(ap, float *) = td->td_yposition;
+            break;
+        case TIFFTAG_RESOLUTIONUNIT:
+            *va_arg(ap, uint16_t *) = td->td_resolutionunit;
+            break;
+        case TIFFTAG_PAGENUMBER:
+            *va_arg(ap, uint16_t *) = td->td_pagenumber[0];
+            *va_arg(ap, uint16_t *) = td->td_pagenumber[1];
+            break;
+        case TIFFTAG_HALFTONEHINTS:
+            *va_arg(ap, uint16_t *) = td->td_halftonehints[0];
+            *va_arg(ap, uint16_t *) = td->td_halftonehints[1];
+            break;
+        case TIFFTAG_COLORMAP:
+            *va_arg(ap, const uint16_t **) = td->td_colormap[0];
+            *va_arg(ap, const uint16_t **) = td->td_colormap[1];
+            *va_arg(ap, const uint16_t **) = td->td_colormap[2];
+            break;
+        case TIFFTAG_STRIPOFFSETS:
+        case TIFFTAG_TILEOFFSETS:
+            _TIFFFillStriles(tif);
+            *va_arg(ap, const uint64_t **) = td->td_stripoffset_p;
+            if (td->td_stripoffset_p == NULL)
+                ret_val = 0;
+            break;
+        case TIFFTAG_STRIPBYTECOUNTS:
+        case TIFFTAG_TILEBYTECOUNTS:
+            _TIFFFillStriles(tif);
+            *va_arg(ap, const uint64_t **) = td->td_stripbytecount_p;
+            if (td->td_stripbytecount_p == NULL)
+                ret_val = 0;
+            break;
+        case TIFFTAG_MATTEING:
+            *va_arg(ap, uint16_t *) =
+                (td->td_extrasamples == 1 &&
+                 td->td_sampleinfo[0] == EXTRASAMPLE_ASSOCALPHA);
+            break;
+        case TIFFTAG_EXTRASAMPLES:
+            *va_arg(ap, uint16_t *) = td->td_extrasamples;
+            *va_arg(ap, const uint16_t **) = td->td_sampleinfo;
+            break;
+        case TIFFTAG_TILEWIDTH:
+            *va_arg(ap, uint32_t *) = td->td_tilewidth;
+            break;
+        case TIFFTAG_TILELENGTH:
+            *va_arg(ap, uint32_t *) = td->td_tilelength;
+            break;
+        case TIFFTAG_TILEDEPTH:
+            *va_arg(ap, uint32_t *) = td->td_tiledepth;
+            break;
+        case TIFFTAG_DATATYPE:
+            switch (td->td_sampleformat)
+            {
+                case SAMPLEFORMAT_UINT:
+                    *va_arg(ap, uint16_t *) = DATATYPE_UINT;
+                    break;
+                case SAMPLEFORMAT_INT:
+                    *va_arg(ap, uint16_t *) = DATATYPE_INT;
+                    break;
+                case SAMPLEFORMAT_IEEEFP:
+                    *va_arg(ap, uint16_t *) = DATATYPE_IEEEFP;
+                    break;
+                case SAMPLEFORMAT_VOID:
+                    *va_arg(ap, uint16_t *) = DATATYPE_VOID;
+                    break;
+            }
+            break;
+        case TIFFTAG_SAMPLEFORMAT:
+            *va_arg(ap, uint16_t *) = td->td_sampleformat;
+            break;
+        case TIFFTAG_IMAGEDEPTH:
+            *va_arg(ap, uint32_t *) = td->td_imagedepth;
+            break;
+        case TIFFTAG_SUBIFD:
+            *va_arg(ap, uint16_t *) = td->td_nsubifd;
+            *va_arg(ap, const uint64_t **) = td->td_subifd;
+            break;
+        case TIFFTAG_YCBCRPOSITIONING:
+            *va_arg(ap, uint16_t *) = td->td_ycbcrpositioning;
+            break;
+        case TIFFTAG_YCBCRSUBSAMPLING:
+            *va_arg(ap, uint16_t *) = td->td_ycbcrsubsampling[0];
+            *va_arg(ap, uint16_t *) = td->td_ycbcrsubsampling[1];
+            break;
+        case TIFFTAG_TRANSFERFUNCTION:
+            *va_arg(ap, const uint16_t **) = td->td_transferfunction[0];
+            if (td->td_samplesperpixel - td->td_extrasamples > 1)
+            {
+                *va_arg(ap, const uint16_t **) = td->td_transferfunction[1];
+                *va_arg(ap, const uint16_t **) = td->td_transferfunction[2];
+            }
+            else
+            {
+                *va_arg(ap, const uint16_t **) = NULL;
+                *va_arg(ap, const uint16_t **) = NULL;
+            }
+            break;
+        case TIFFTAG_REFERENCEBLACKWHITE:
+            *va_arg(ap, const float **) = td->td_refblackwhite;
+            break;
+        case TIFFTAG_INKNAMES:
+            *va_arg(ap, const char **) = td->td_inknames;
+            break;
+        case TIFFTAG_NUMBEROFINKS:
+            *va_arg(ap, uint16_t *) = td->td_numberofinks;
+            break;
+        default:
         {
             int i;
-            for (i = 0; i < td->td_customValueCount; i++) {
-                uint16 val;
+
+            /*
+             * This can happen if multiple images are open
+             * with different codecs which have private
+             * tags.  The global tag information table may
+             * then have tags that are valid for one file
+             * but not the other. If the client tries to
+             * get a tag that is not valid for the image's
+             * codec then we'll arrive here.
+             */
+            if (fip->field_bit != FIELD_CUSTOM)
+            {
+                TIFFErrorExtR(tif, "_TIFFVGetField",
+                              "%s: Invalid %stag \"%s\" "
+                              "(not supported by codec)",
+                              tif->tif_name, isPseudoTag(tag) ? "pseudo-" : "",
+                              fip->field_name);
+                ret_val = 0;
+                break;
+            }
+
+            /*
+             * Do we have a custom value?
+             */
+            ret_val = 0;
+            for (i = 0; i < td->td_customValueCount; i++)
+            {
                 TIFFTagValue *tv = td->td_customValues + i;
-                if (tv->info->field_tag != standard_tag)
+
+                if (tv->info->field_tag != tag)
                     continue;
-                if( tv->value == NULL )
-                    return 0;
-                val = *(uint16 *)tv->value;
-                /* Truncate to SamplesPerPixel, since the */
-                /* setting code for INKNAMES assume that there are SamplesPerPixel */
-                /* inknames. */
-                /* Fixes http://bugzilla.maptools.org/show_bug.cgi?id=2599 */
-                if( val > td->td_samplesperpixel )
+
+                if (fip->field_passcount)
+                {
+                    if (fip->field_readcount == TIFF_VARIABLE2)
+                        *va_arg(ap, uint32_t *) = (uint32_t)tv->count;
+                    else /* Assume TIFF_VARIABLE */
+                        *va_arg(ap, uint16_t *) = (uint16_t)tv->count;
+                    *va_arg(ap, const void **) = tv->value;
+                    ret_val = 1;
+                }
+                else if (fip->field_tag == TIFFTAG_DOTRANGE &&
+                         strcmp(fip->field_name, "DotRange") == 0)
+                {
+                    /* TODO: This is an evil exception and should not have been
+                       handled this way ... likely best if we move it into
+                       the directory structure with an explicit field in
+                       libtiff 4.1 and assign it a FIELD_ value */
+                    *va_arg(ap, uint16_t *) = ((uint16_t *)tv->value)[0];
+                    *va_arg(ap, uint16_t *) = ((uint16_t *)tv->value)[1];
+                    ret_val = 1;
+                }
+                else
                 {
-                    TIFFWarningExt(tif->tif_clientdata,"_TIFFVGetField",
-                                   "Truncating NumberOfInks from %u to %u",
-                                   val, td->td_samplesperpixel);
-                    val = td->td_samplesperpixel;
+                    if (fip->field_type == TIFF_ASCII ||
+                        fip->field_readcount == TIFF_VARIABLE ||
+                        fip->field_readcount == TIFF_VARIABLE2 ||
+                        fip->field_readcount == TIFF_SPP || tv->count > 1)
+                    {
+                        *va_arg(ap, void **) = tv->value;
+                        ret_val = 1;
+                    }
+                    else
+                    {
+                        char *val = (char *)tv->value;
+                        assert(tv->count == 1);
+                        switch (fip->field_type)
+                        {
+                            case TIFF_BYTE:
+                            case TIFF_UNDEFINED:
+                                *va_arg(ap, uint8_t *) = *(uint8_t *)val;
+                                ret_val = 1;
+                                break;
+                            case TIFF_SBYTE:
+                                *va_arg(ap, int8_t *) = *(int8_t *)val;
+                                ret_val = 1;
+                                break;
+                            case TIFF_SHORT:
+                                *va_arg(ap, uint16_t *) = *(uint16_t *)val;
+                                ret_val = 1;
+                                break;
+                            case TIFF_SSHORT:
+                                *va_arg(ap, int16_t *) = *(int16_t *)val;
+                                ret_val = 1;
+                                break;
+                            case TIFF_LONG:
+                            case TIFF_IFD:
+                                *va_arg(ap, uint32_t *) = *(uint32_t *)val;
+                                ret_val = 1;
+                                break;
+                            case TIFF_SLONG:
+                                *va_arg(ap, int32_t *) = *(int32_t *)val;
+                                ret_val = 1;
+                                break;
+                            case TIFF_LONG8:
+                            case TIFF_IFD8:
+                                *va_arg(ap, uint64_t *) = *(uint64_t *)val;
+                                ret_val = 1;
+                                break;
+                            case TIFF_SLONG8:
+                                *va_arg(ap, int64_t *) = *(int64_t *)val;
+                                ret_val = 1;
+                                break;
+                            case TIFF_RATIONAL:
+                            case TIFF_SRATIONAL:
+                            {
+                                /*-- Rational2Double: For Rationals evaluate
+                                 * "set_field_type" to determine internal
+                                 * storage size and return value size. */
+                                int tv_size = TIFFFieldSetGetSize(fip);
+                                if (tv_size == 8)
+                                {
+                                    *va_arg(ap, double *) = *(double *)val;
+                                    ret_val = 1;
+                                }
+                                else
+                                {
+                                    /*-- default should be tv_size == 4  */
+                                    *va_arg(ap, float *) = *(float *)val;
+                                    ret_val = 1;
+                                    /*-- ToDo: After Testing, this should be
+                                     * removed and tv_size==4 should be set as
+                                     * default. */
+                                    if (tv_size != 4)
+                                    {
+                                        TIFFErrorExtR(
+                                            tif, "_TIFFVGetField",
+                                            "Rational2Double: .set_field_type "
+                                            "in not 4 but %d",
+                                            tv_size);
+                                    }
+                                }
+                            }
+                            break;
+                            case TIFF_FLOAT:
+                                *va_arg(ap, float *) = *(float *)val;
+                                ret_val = 1;
+                                break;
+                            case TIFF_DOUBLE:
+                                *va_arg(ap, double *) = *(double *)val;
+                                ret_val = 1;
+                                break;
+                            default:
+                                ret_val = 0;
+                                break;
+                        }
+                    }
                 }
-                *va_arg(ap, uint16*) = val;
-                return 1;
+                break;
             }
-            return 0;
         }
-
-	switch (standard_tag) {
-		case TIFFTAG_SUBFILETYPE:
-			*va_arg(ap, uint32*) = td->td_subfiletype;
-			break;
-		case TIFFTAG_IMAGEWIDTH:
-			*va_arg(ap, uint32*) = td->td_imagewidth;
-			break;
-		case TIFFTAG_IMAGELENGTH:
-			*va_arg(ap, uint32*) = td->td_imagelength;
-			break;
-		case TIFFTAG_BITSPERSAMPLE:
-			*va_arg(ap, uint16*) = td->td_bitspersample;
-			break;
-		case TIFFTAG_COMPRESSION:
-			*va_arg(ap, uint16*) = td->td_compression;
-			break;
-		case TIFFTAG_PHOTOMETRIC:
-			*va_arg(ap, uint16*) = td->td_photometric;
-			break;
-		case TIFFTAG_THRESHHOLDING:
-			*va_arg(ap, uint16*) = td->td_threshholding;
-			break;
-		case TIFFTAG_FILLORDER:
-			*va_arg(ap, uint16*) = td->td_fillorder;
-			break;
-		case TIFFTAG_ORIENTATION:
-			*va_arg(ap, uint16*) = td->td_orientation;
-			break;
-		case TIFFTAG_SAMPLESPERPIXEL:
-			*va_arg(ap, uint16*) = td->td_samplesperpixel;
-			break;
-		case TIFFTAG_ROWSPERSTRIP:
-			*va_arg(ap, uint32*) = td->td_rowsperstrip;
-			break;
-		case TIFFTAG_MINSAMPLEVALUE:
-			*va_arg(ap, uint16*) = td->td_minsamplevalue;
-			break;
-		case TIFFTAG_MAXSAMPLEVALUE:
-			*va_arg(ap, uint16*) = td->td_maxsamplevalue;
-			break;
-		case TIFFTAG_SMINSAMPLEVALUE:
-			if (tif->tif_flags & TIFF_PERSAMPLE)
-				*va_arg(ap, double**) = td->td_sminsamplevalue;
-			else
-			{
-				/* libtiff historically treats this as a single value. */
-				uint16 i;
-				double v = td->td_sminsamplevalue[0];
-				for (i=1; i < td->td_samplesperpixel; ++i)
-					if( td->td_sminsamplevalue[i] < v )
-						v = td->td_sminsamplevalue[i];
-				*va_arg(ap, double*) = v;
-			}
-			break;
-		case TIFFTAG_SMAXSAMPLEVALUE:
-			if (tif->tif_flags & TIFF_PERSAMPLE)
-				*va_arg(ap, double**) = td->td_smaxsamplevalue;
-			else
-			{
-				/* libtiff historically treats this as a single value. */
-				uint16 i;
-				double v = td->td_smaxsamplevalue[0];
-				for (i=1; i < td->td_samplesperpixel; ++i)
-					if( td->td_smaxsamplevalue[i] > v )
-						v = td->td_smaxsamplevalue[i];
-				*va_arg(ap, double*) = v;
-			}
-			break;
-		case TIFFTAG_XRESOLUTION:
-			*va_arg(ap, float*) = td->td_xresolution;
-			break;
-		case TIFFTAG_YRESOLUTION:
-			*va_arg(ap, float*) = td->td_yresolution;
-			break;
-		case TIFFTAG_PLANARCONFIG:
-			*va_arg(ap, uint16*) = td->td_planarconfig;
-			break;
-		case TIFFTAG_XPOSITION:
-			*va_arg(ap, float*) = td->td_xposition;
-			break;
-		case TIFFTAG_YPOSITION:
-			*va_arg(ap, float*) = td->td_yposition;
-			break;
-		case TIFFTAG_RESOLUTIONUNIT:
-			*va_arg(ap, uint16*) = td->td_resolutionunit;
-			break;
-		case TIFFTAG_PAGENUMBER:
-			*va_arg(ap, uint16*) = td->td_pagenumber[0];
-			*va_arg(ap, uint16*) = td->td_pagenumber[1];
-			break;
-		case TIFFTAG_HALFTONEHINTS:
-			*va_arg(ap, uint16*) = td->td_halftonehints[0];
-			*va_arg(ap, uint16*) = td->td_halftonehints[1];
-			break;
-		case TIFFTAG_COLORMAP:
-			*va_arg(ap, const uint16**) = td->td_colormap[0];
-			*va_arg(ap, const uint16**) = td->td_colormap[1];
-			*va_arg(ap, const uint16**) = td->td_colormap[2];
-			break;
-		case TIFFTAG_STRIPOFFSETS:
-		case TIFFTAG_TILEOFFSETS:
-			_TIFFFillStriles( tif );
-			*va_arg(ap, const uint64**) = td->td_stripoffset_p;
-			break;
-		case TIFFTAG_STRIPBYTECOUNTS:
-		case TIFFTAG_TILEBYTECOUNTS:
-			_TIFFFillStriles( tif );
-			*va_arg(ap, const uint64**) = td->td_stripbytecount_p;
-			break;
-		case TIFFTAG_MATTEING:
-			*va_arg(ap, uint16*) =
-			    (td->td_extrasamples == 1 &&
-			    td->td_sampleinfo[0] == EXTRASAMPLE_ASSOCALPHA);
-			break;
-		case TIFFTAG_EXTRASAMPLES:
-			*va_arg(ap, uint16*) = td->td_extrasamples;
-			*va_arg(ap, const uint16**) = td->td_sampleinfo;
-			break;
-		case TIFFTAG_TILEWIDTH:
-			*va_arg(ap, uint32*) = td->td_tilewidth;
-			break;
-		case TIFFTAG_TILELENGTH:
-			*va_arg(ap, uint32*) = td->td_tilelength;
-			break;
-		case TIFFTAG_TILEDEPTH:
-			*va_arg(ap, uint32*) = td->td_tiledepth;
-			break;
-		case TIFFTAG_DATATYPE:
-			switch (td->td_sampleformat) {
-				case SAMPLEFORMAT_UINT:
-					*va_arg(ap, uint16*) = DATATYPE_UINT;
-					break;
-				case SAMPLEFORMAT_INT:
-					*va_arg(ap, uint16*) = DATATYPE_INT;
-					break;
-				case SAMPLEFORMAT_IEEEFP:
-					*va_arg(ap, uint16*) = DATATYPE_IEEEFP;
-					break;
-				case SAMPLEFORMAT_VOID:
-					*va_arg(ap, uint16*) = DATATYPE_VOID;
-					break;
-			}
-			break;
-		case TIFFTAG_SAMPLEFORMAT:
-			*va_arg(ap, uint16*) = td->td_sampleformat;
-			break;
-		case TIFFTAG_IMAGEDEPTH:
-			*va_arg(ap, uint32*) = td->td_imagedepth;
-			break;
-		case TIFFTAG_SUBIFD:
-			*va_arg(ap, uint16*) = td->td_nsubifd;
-			*va_arg(ap, const uint64**) = td->td_subifd;
-			break;
-		case TIFFTAG_YCBCRPOSITIONING:
-			*va_arg(ap, uint16*) = td->td_ycbcrpositioning;
-			break;
-		case TIFFTAG_YCBCRSUBSAMPLING:
-			*va_arg(ap, uint16*) = td->td_ycbcrsubsampling[0];
-			*va_arg(ap, uint16*) = td->td_ycbcrsubsampling[1];
-			break;
-		case TIFFTAG_TRANSFERFUNCTION:
-			*va_arg(ap, const uint16**) = td->td_transferfunction[0];
-			if (td->td_samplesperpixel - td->td_extrasamples > 1) {
-				*va_arg(ap, const uint16**) = td->td_transferfunction[1];
-				*va_arg(ap, const uint16**) = td->td_transferfunction[2];
-			} else {
-				*va_arg(ap, const uint16**) = NULL;
-				*va_arg(ap, const uint16**) = NULL;
-			}
-			break;
-		case TIFFTAG_REFERENCEBLACKWHITE:
-			*va_arg(ap, const float**) = td->td_refblackwhite;
-			break;
-		case TIFFTAG_INKNAMES:
-			*va_arg(ap, const char**) = td->td_inknames;
-			break;
-		default:
-			{
-				int i;
-
-				/*
-				 * This can happen if multiple images are open
-				 * with different codecs which have private
-				 * tags.  The global tag information table may
-				 * then have tags that are valid for one file
-				 * but not the other. If the client tries to
-				 * get a tag that is not valid for the image's
-				 * codec then we'll arrive here.
-				 */
-				if( fip->field_bit != FIELD_CUSTOM )
-				{
-					TIFFErrorExt(tif->tif_clientdata, "_TIFFVGetField",
-					    "%s: Invalid %stag \"%s\" "
-					    "(not supported by codec)",
-					    tif->tif_name,
-					    isPseudoTag(tag) ? "pseudo-" : "",
-					    fip->field_name);
-					ret_val = 0;
-					break;
-				}
-
-				/*
-				 * Do we have a custom value?
-				 */
-				ret_val = 0;
-				for (i = 0; i < td->td_customValueCount; i++) {
-					TIFFTagValue *tv = td->td_customValues + i;
-
-					if (tv->info->field_tag != tag)
-						continue;
-
-					if (fip->field_passcount) {
-						if (fip->field_readcount == TIFF_VARIABLE2)
-							*va_arg(ap, uint32*) = (uint32)tv->count;
-						else  /* Assume TIFF_VARIABLE */
-							*va_arg(ap, uint16*) = (uint16)tv->count;
-						*va_arg(ap, const void **) = tv->value;
-						ret_val = 1;
-					} else if (fip->field_tag == TIFFTAG_DOTRANGE
-						   && strcmp(fip->field_name,"DotRange") == 0) {
-						/* TODO: This is an evil exception and should not have been
-						   handled this way ... likely best if we move it into
-						   the directory structure with an explicit field in 
-						   libtiff 4.1 and assign it a FIELD_ value */
-						*va_arg(ap, uint16*) = ((uint16 *)tv->value)[0];
-						*va_arg(ap, uint16*) = ((uint16 *)tv->value)[1];
-						ret_val = 1;
-					} else {
-						if (fip->field_type == TIFF_ASCII
-						    || fip->field_readcount == TIFF_VARIABLE
-						    || fip->field_readcount == TIFF_VARIABLE2
-						    || fip->field_readcount == TIFF_SPP
-						    || tv->count > 1) {
-							*va_arg(ap, void **) = tv->value;
-							ret_val = 1;
-						} else {
-							char *val = (char *)tv->value;
-							assert( tv->count == 1 );
-							switch (fip->field_type) {
-							case TIFF_BYTE:
-							case TIFF_UNDEFINED:
-								*va_arg(ap, uint8*) =
-									*(uint8 *)val;
-								ret_val = 1;
-								break;
-							case TIFF_SBYTE:
-								*va_arg(ap, int8*) =
-									*(int8 *)val;
-								ret_val = 1;
-								break;
-							case TIFF_SHORT:
-								*va_arg(ap, uint16*) =
-									*(uint16 *)val;
-								ret_val = 1;
-								break;
-							case TIFF_SSHORT:
-								*va_arg(ap, int16*) =
-									*(int16 *)val;
-								ret_val = 1;
-								break;
-							case TIFF_LONG:
-							case TIFF_IFD:
-								*va_arg(ap, uint32*) =
-									*(uint32 *)val;
-								ret_val = 1;
-								break;
-							case TIFF_SLONG:
-								*va_arg(ap, int32*) =
-									*(int32 *)val;
-								ret_val = 1;
-								break;
-							case TIFF_LONG8:
-							case TIFF_IFD8:
-								*va_arg(ap, uint64*) =
-									*(uint64 *)val;
-								ret_val = 1;
-								break;
-							case TIFF_SLONG8:
-								*va_arg(ap, int64*) =
-									*(int64 *)val;
-								ret_val = 1;
-								break;
-							case TIFF_RATIONAL:
-							case TIFF_SRATIONAL:
-								{
-									/*-- Rational2Double: For Rationals evaluate "set_field_type" to determine internal storage size and return value size. */
-									int tv_size = _TIFFSetGetFieldSize(fip->set_field_type);
-									if (tv_size == 8) {
-										*va_arg(ap, double*) = *(double *)val;
-										ret_val = 1;
-									} else {
-										/*-- default should be tv_size == 4  */
-										*va_arg(ap, float*) = *(float *)val;
-										ret_val = 1;
-										/*-- ToDo: After Testing, this should be removed and tv_size==4 should be set as default. */
-										if (tv_size != 4) {
-											TIFFErrorExt(0,"TIFFLib: _TIFFVGetField()", "Rational2Double: .set_field_type in not 4 but %d", tv_size); 
-										}
-									}
-								}
-								break;
-							case TIFF_FLOAT:
-								*va_arg(ap, float*) =
-									*(float *)val;
-								ret_val = 1;
-								break;
-							case TIFF_DOUBLE:
-								*va_arg(ap, double*) =
-									*(double *)val;
-								ret_val = 1;
-								break;
-							default:
-								ret_val = 0;
-								break;
-							}
-						}
-					}
-					break;
-				}
-			}
-	}
-	return(ret_val);
+    }
+    return (ret_val);
 }
 
 /*
  * Return the value of a field in the
  * internal directory structure.
  */
-int
-TIFFGetField(TIFF* tif, uint32 tag, ...)
+int TIFFGetField(TIFF *tif, uint32_t tag, ...)
 {
-	int status;
-	va_list ap;
+    int status;
+    va_list ap;
 
-	va_start(ap, tag);
-	status = TIFFVGetField(tif, tag, ap);
-	va_end(ap);
-	return (status);
+    va_start(ap, tag);
+    status = TIFFVGetField(tif, tag, ap);
+    va_end(ap);
+    return (status);
 }
 
 /*
@@ -1284,74 +1596,75 @@ TIFFGetField(TIFF* tif, uint32 tag, ...)
  * for building higher-level interfaces on
  * top of the library.
  */
-int
-TIFFVGetField(TIFF* tif, uint32 tag, va_list ap)
+int TIFFVGetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	const TIFFField* fip = TIFFFindField(tif, tag, TIFF_ANY);
-	return (fip && (isPseudoTag(tag) || TIFFFieldSet(tif, fip->field_bit)) ?
-	    (*tif->tif_tagmethods.vgetfield)(tif, tag, ap) : 0);
+    const TIFFField *fip = TIFFFindField(tif, tag, TIFF_ANY);
+    return (fip && (isPseudoTag(tag) || TIFFFieldSet(tif, fip->field_bit))
+                ? (*tif->tif_tagmethods.vgetfield)(tif, tag, ap)
+                : 0);
 }
 
-#define	CleanupField(member) {		\
-    if (td->member) {			\
-	_TIFFfree(td->member);		\
-	td->member = 0;			\
-    }					\
-}
+#define CleanupField(member)                                                   \
+    {                                                                          \
+        if (td->member)                                                        \
+        {                                                                      \
+            _TIFFfreeExt(tif, td->member);                                     \
+            td->member = 0;                                                    \
+        }                                                                      \
+    }
 
 /*
  * Release storage associated with a directory.
  */
-void
-TIFFFreeDirectory(TIFF* tif)
+void TIFFFreeDirectory(TIFF *tif)
 {
-	TIFFDirectory *td = &tif->tif_dir;
-	int            i;
-
-	_TIFFmemset(td->td_fieldsset, 0, FIELD_SETLONGS);
-	CleanupField(td_sminsamplevalue);
-	CleanupField(td_smaxsamplevalue);
-	CleanupField(td_colormap[0]);
-	CleanupField(td_colormap[1]);
-	CleanupField(td_colormap[2]);
-	CleanupField(td_sampleinfo);
-	CleanupField(td_subifd);
-	CleanupField(td_inknames);
-	CleanupField(td_refblackwhite);
-	CleanupField(td_transferfunction[0]);
-	CleanupField(td_transferfunction[1]);
-	CleanupField(td_transferfunction[2]);
-	CleanupField(td_stripoffset_p);
-	CleanupField(td_stripbytecount_p);
-        td->td_stripoffsetbyteallocsize = 0;
-	TIFFClrFieldBit(tif, FIELD_YCBCRSUBSAMPLING);
-	TIFFClrFieldBit(tif, FIELD_YCBCRPOSITIONING);
-
-	/* Cleanup custom tag values */
-	for( i = 0; i < td->td_customValueCount; i++ ) {
-		if (td->td_customValues[i].value)
-			_TIFFfree(td->td_customValues[i].value);
-	}
-
-	td->td_customValueCount = 0;
-	CleanupField(td_customValues);
-
-        _TIFFmemset( &(td->td_stripoffset_entry), 0, sizeof(TIFFDirEntry));
-        _TIFFmemset( &(td->td_stripbytecount_entry), 0, sizeof(TIFFDirEntry));
+    TIFFDirectory *td = &tif->tif_dir;
+    int i;
+
+    _TIFFmemset(td->td_fieldsset, 0, sizeof(td->td_fieldsset));
+    CleanupField(td_sminsamplevalue);
+    CleanupField(td_smaxsamplevalue);
+    CleanupField(td_colormap[0]);
+    CleanupField(td_colormap[1]);
+    CleanupField(td_colormap[2]);
+    CleanupField(td_sampleinfo);
+    CleanupField(td_subifd);
+    CleanupField(td_inknames);
+    CleanupField(td_refblackwhite);
+    CleanupField(td_transferfunction[0]);
+    CleanupField(td_transferfunction[1]);
+    CleanupField(td_transferfunction[2]);
+    CleanupField(td_stripoffset_p);
+    CleanupField(td_stripbytecount_p);
+    td->td_stripoffsetbyteallocsize = 0;
+    TIFFClrFieldBit(tif, FIELD_YCBCRSUBSAMPLING);
+    TIFFClrFieldBit(tif, FIELD_YCBCRPOSITIONING);
+
+    /* Cleanup custom tag values */
+    for (i = 0; i < td->td_customValueCount; i++)
+    {
+        if (td->td_customValues[i].value)
+            _TIFFfreeExt(tif, td->td_customValues[i].value);
+    }
+
+    td->td_customValueCount = 0;
+    CleanupField(td_customValues);
+
+    _TIFFmemset(&(td->td_stripoffset_entry), 0, sizeof(TIFFDirEntry));
+    _TIFFmemset(&(td->td_stripbytecount_entry), 0, sizeof(TIFFDirEntry));
 }
 #undef CleanupField
 
 /*
  * Client Tag extension support (from Niles Ritter).
  */
-static TIFFExtendProc _TIFFextender = (TIFFExtendProc) NULL;
+static TIFFExtendProc _TIFFextender = (TIFFExtendProc)NULL;
 
-TIFFExtendProc
-TIFFSetTagExtender(TIFFExtendProc extender)
+TIFFExtendProc TIFFSetTagExtender(TIFFExtendProc extender)
 {
-	TIFFExtendProc prev = _TIFFextender;
-	_TIFFextender = extender;
-	return (prev);
+    TIFFExtendProc prev = _TIFFextender;
+    _TIFFextender = extender;
+    return (prev);
 }
 
 /*
@@ -1361,333 +1674,433 @@ TIFFSetTagExtender(TIFFExtendProc extender)
  * The newly created directory will not exist on the file till
  * TIFFWriteDirectory(), TIFFFlush() or TIFFClose() is called.
  */
-int
-TIFFCreateDirectory(TIFF* tif)
+int TIFFCreateDirectory(TIFF *tif)
 {
-	TIFFDefaultDirectory(tif);
-	tif->tif_diroff = 0;
-	tif->tif_nextdiroff = 0;
-	tif->tif_curoff = 0;
-	tif->tif_row = (uint32) -1;
-	tif->tif_curstrip = (uint32) -1;
-
-	return 0;
+    TIFFDefaultDirectory(tif);
+    tif->tif_diroff = 0;
+    tif->tif_nextdiroff = 0;
+    tif->tif_curoff = 0;
+    tif->tif_row = (uint32_t)-1;
+    tif->tif_curstrip = (uint32_t)-1;
+
+    return 0;
 }
 
-int
-TIFFCreateCustomDirectory(TIFF* tif, const TIFFFieldArray* infoarray)
+int TIFFCreateCustomDirectory(TIFF *tif, const TIFFFieldArray *infoarray)
 {
-	TIFFDefaultDirectory(tif);
-
-	/*
-	 * Reset the field definitions to match the application provided list. 
-	 * Hopefully TIFFDefaultDirectory() won't have done anything irreversable
-	 * based on it's assumption this is an image directory.
-	 */
-	_TIFFSetupFields(tif, infoarray);
-
-	tif->tif_diroff = 0;
-	tif->tif_nextdiroff = 0;
-	tif->tif_curoff = 0;
-	tif->tif_row = (uint32) -1;
-	tif->tif_curstrip = (uint32) -1;
-
-	return 0;
+    TIFFDefaultDirectory(tif);
+
+    /*
+     * Reset the field definitions to match the application provided list.
+     * Hopefully TIFFDefaultDirectory() won't have done anything irreversible
+     * based on it's assumption this is an image directory.
+     */
+    _TIFFSetupFields(tif, infoarray);
+
+    tif->tif_diroff = 0;
+    tif->tif_nextdiroff = 0;
+    tif->tif_curoff = 0;
+    tif->tif_row = (uint32_t)-1;
+    tif->tif_curstrip = (uint32_t)-1;
+    /* invalidate directory index */
+    tif->tif_curdir = TIFF_NON_EXISTENT_DIR_NUMBER;
+    /* invalidate IFD loop lists */
+    _TIFFCleanupIFDOffsetAndNumberMaps(tif);
+    /* To be able to return from SubIFD or custom-IFD to main-IFD */
+    tif->tif_setdirectory_force_absolute = TRUE;
+
+    return 0;
 }
 
-int
-TIFFCreateEXIFDirectory(TIFF* tif)
+int TIFFCreateEXIFDirectory(TIFF *tif)
 {
-	const TIFFFieldArray* exifFieldArray;
-	exifFieldArray = _TIFFGetExifFields();
-	return TIFFCreateCustomDirectory(tif, exifFieldArray);
+    const TIFFFieldArray *exifFieldArray;
+    exifFieldArray = _TIFFGetExifFields();
+    return TIFFCreateCustomDirectory(tif, exifFieldArray);
 }
 
 /*
- * Creates the EXIF GPS custom directory 
+ * Creates the EXIF GPS custom directory
  */
-int
-TIFFCreateGPSDirectory(TIFF* tif)
+int TIFFCreateGPSDirectory(TIFF *tif)
 {
-	const TIFFFieldArray* gpsFieldArray;
-	gpsFieldArray = _TIFFGetGpsFields();
-	return TIFFCreateCustomDirectory(tif, gpsFieldArray);
+    const TIFFFieldArray *gpsFieldArray;
+    gpsFieldArray = _TIFFGetGpsFields();
+    return TIFFCreateCustomDirectory(tif, gpsFieldArray);
 }
 
 /*
  * Setup a default directory structure.
  */
-int
-TIFFDefaultDirectory(TIFF* tif)
+int TIFFDefaultDirectory(TIFF *tif)
 {
-	register TIFFDirectory* td = &tif->tif_dir;
-	const TIFFFieldArray* tiffFieldArray;
-
-	tiffFieldArray = _TIFFGetFields();
-	_TIFFSetupFields(tif, tiffFieldArray);   
-
-	_TIFFmemset(td, 0, sizeof (*td));
-	td->td_fillorder = FILLORDER_MSB2LSB;
-	td->td_bitspersample = 1;
-	td->td_threshholding = THRESHHOLD_BILEVEL;
-	td->td_orientation = ORIENTATION_TOPLEFT;
-	td->td_samplesperpixel = 1;
-	td->td_rowsperstrip = (uint32) -1;
-	td->td_tilewidth = 0;
-	td->td_tilelength = 0;
-	td->td_tiledepth = 1;
+    register TIFFDirectory *td = &tif->tif_dir;
+    const TIFFFieldArray *tiffFieldArray;
+
+    tiffFieldArray = _TIFFGetFields();
+    _TIFFSetupFields(tif, tiffFieldArray);
+
+    _TIFFmemset(td, 0, sizeof(*td));
+    td->td_fillorder = FILLORDER_MSB2LSB;
+    td->td_bitspersample = 1;
+    td->td_threshholding = THRESHHOLD_BILEVEL;
+    td->td_orientation = ORIENTATION_TOPLEFT;
+    td->td_samplesperpixel = 1;
+    td->td_rowsperstrip = (uint32_t)-1;
+    td->td_tilewidth = 0;
+    td->td_tilelength = 0;
+    td->td_tiledepth = 1;
 #ifdef STRIPBYTECOUNTSORTED_UNUSED
-	td->td_stripbytecountsorted = 1; /* Our own arrays always sorted. */  
+    td->td_stripbytecountsorted = 1; /* Our own arrays always sorted. */
 #endif
-	td->td_resolutionunit = RESUNIT_INCH;
-	td->td_sampleformat = SAMPLEFORMAT_UINT;
-	td->td_imagedepth = 1;
-	td->td_ycbcrsubsampling[0] = 2;
-	td->td_ycbcrsubsampling[1] = 2;
-	td->td_ycbcrpositioning = YCBCRPOSITION_CENTERED;
-	tif->tif_postdecode = _TIFFNoPostDecode;  
-	tif->tif_foundfield = NULL;
-	tif->tif_tagmethods.vsetfield = _TIFFVSetField;  
-	tif->tif_tagmethods.vgetfield = _TIFFVGetField;
-	tif->tif_tagmethods.printdir = NULL;
-	/*
-	 *  Give client code a chance to install their own
-	 *  tag extensions & methods, prior to compression overloads,
-	 *  but do some prior cleanup first. (http://trac.osgeo.org/gdal/ticket/5054)
-	 */
-	if (tif->tif_nfieldscompat > 0) {
-		uint32 i;
-
-		for (i = 0; i < tif->tif_nfieldscompat; i++) {
-				if (tif->tif_fieldscompat[i].allocated_size)
-						_TIFFfree(tif->tif_fieldscompat[i].fields);
-		}
-		_TIFFfree(tif->tif_fieldscompat);
-		tif->tif_nfieldscompat = 0;
-		tif->tif_fieldscompat = NULL;
-	}
-	if (_TIFFextender)
-		(*_TIFFextender)(tif);
-	(void) TIFFSetField(tif, TIFFTAG_COMPRESSION, COMPRESSION_NONE);
-	/*
-	 * NB: The directory is marked dirty as a result of setting
-	 * up the default compression scheme.  However, this really
-	 * isn't correct -- we want TIFF_DIRTYDIRECT to be set only
-	 * if the user does something.  We could just do the setup
-	 * by hand, but it seems better to use the normal mechanism
-	 * (i.e. TIFFSetField).
-	 */
-	tif->tif_flags &= ~TIFF_DIRTYDIRECT;
-
-	/*
-	 * As per http://bugzilla.remotesensing.org/show_bug.cgi?id=19
-	 * we clear the ISTILED flag when setting up a new directory.
-	 * Should we also be clearing stuff like INSUBIFD?
-	 */
-	tif->tif_flags &= ~TIFF_ISTILED;
-
-	return (1);
+    td->td_resolutionunit = RESUNIT_INCH;
+    td->td_sampleformat = SAMPLEFORMAT_UINT;
+    td->td_imagedepth = 1;
+    td->td_ycbcrsubsampling[0] = 2;
+    td->td_ycbcrsubsampling[1] = 2;
+    td->td_ycbcrpositioning = YCBCRPOSITION_CENTERED;
+    tif->tif_postdecode = _TIFFNoPostDecode;
+    tif->tif_foundfield = NULL;
+    tif->tif_tagmethods.vsetfield = _TIFFVSetField;
+    tif->tif_tagmethods.vgetfield = _TIFFVGetField;
+    tif->tif_tagmethods.printdir = NULL;
+    /* additional default values */
+    td->td_planarconfig = PLANARCONFIG_CONTIG;
+    td->td_compression = COMPRESSION_NONE;
+    td->td_subfiletype = 0;
+    td->td_minsamplevalue = 0;
+    /* td_bitspersample=1 is always set in TIFFDefaultDirectory().
+     * Therefore, td_maxsamplevalue has to be re-calculated in
+     * TIFFGetFieldDefaulted(). */
+    td->td_maxsamplevalue = 1; /* Default for td_bitspersample=1 */
+    td->td_extrasamples = 0;
+    td->td_sampleinfo = NULL;
+
+    /*
+     *  Give client code a chance to install their own
+     *  tag extensions & methods, prior to compression overloads,
+     *  but do some prior cleanup first.
+     * (http://trac.osgeo.org/gdal/ticket/5054)
+     */
+    if (tif->tif_nfieldscompat > 0)
+    {
+        uint32_t i;
+
+        for (i = 0; i < tif->tif_nfieldscompat; i++)
+        {
+            if (tif->tif_fieldscompat[i].allocated_size)
+                _TIFFfreeExt(tif, tif->tif_fieldscompat[i].fields);
+        }
+        _TIFFfreeExt(tif, tif->tif_fieldscompat);
+        tif->tif_nfieldscompat = 0;
+        tif->tif_fieldscompat = NULL;
+    }
+    if (_TIFFextender)
+        (*_TIFFextender)(tif);
+    (void)TIFFSetField(tif, TIFFTAG_COMPRESSION, COMPRESSION_NONE);
+    /*
+     * NB: The directory is marked dirty as a result of setting
+     * up the default compression scheme.  However, this really
+     * isn't correct -- we want TIFF_DIRTYDIRECT to be set only
+     * if the user does something.  We could just do the setup
+     * by hand, but it seems better to use the normal mechanism
+     * (i.e. TIFFSetField).
+     */
+    tif->tif_flags &= ~TIFF_DIRTYDIRECT;
+
+    /*
+     * As per http://bugzilla.remotesensing.org/show_bug.cgi?id=19
+     * we clear the ISTILED flag when setting up a new directory.
+     * Should we also be clearing stuff like INSUBIFD?
+     */
+    tif->tif_flags &= ~TIFF_ISTILED;
+
+    return (1);
 }
 
-static int
-TIFFAdvanceDirectory(TIFF* tif, uint64* nextdir, uint64* off)
+static int TIFFAdvanceDirectory(TIFF *tif, uint64_t *nextdiroff, uint64_t *off,
+                                tdir_t *nextdirnum)
 {
-	static const char module[] = "TIFFAdvanceDirectory";
-	if (isMapped(tif))
-	{
-		uint64 poff=*nextdir;
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-		{
-			tmsize_t poffa,poffb,poffc,poffd;
-			uint16 dircount;
-			uint32 nextdir32;
-			poffa=(tmsize_t)poff;
-			poffb=poffa+sizeof(uint16);
-			if (((uint64)poffa!=poff)||(poffb<poffa)||(poffb<(tmsize_t)sizeof(uint16))||(poffb>tif->tif_size))
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Error fetching directory count");
-                                  *nextdir=0;
-				return(0);
-			}
-			_TIFFmemcpy(&dircount,tif->tif_base+poffa,sizeof(uint16));
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabShort(&dircount);
-			poffc=poffb+dircount*12;
-			poffd=poffc+sizeof(uint32);
-			if ((poffc<poffb)||(poffc<dircount*12)||(poffd<poffc)||(poffd<(tmsize_t)sizeof(uint32))||(poffd>tif->tif_size))
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Error fetching directory link");
-				return(0);
-			}
-			if (off!=NULL)
-				*off=(uint64)poffc;
-			_TIFFmemcpy(&nextdir32,tif->tif_base+poffc,sizeof(uint32));
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabLong(&nextdir32);
-			*nextdir=nextdir32;
-		}
-		else
-		{
-			tmsize_t poffa,poffb,poffc,poffd;
-			uint64 dircount64;
-			uint16 dircount16;
-			poffa=(tmsize_t)poff;
-			poffb=poffa+sizeof(uint64);
-			if (((uint64)poffa!=poff)||(poffb<poffa)||(poffb<(tmsize_t)sizeof(uint64))||(poffb>tif->tif_size))
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Error fetching directory count");
-				return(0);
-			}
-			_TIFFmemcpy(&dircount64,tif->tif_base+poffa,sizeof(uint64));
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabLong8(&dircount64);
-			if (dircount64>0xFFFF)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Sanity check on directory count failed");
-				return(0);
-			}
-			dircount16=(uint16)dircount64;
-			poffc=poffb+dircount16*20;
-			poffd=poffc+sizeof(uint64);
-			if ((poffc<poffb)||(poffc<dircount16*20)||(poffd<poffc)||(poffd<(tmsize_t)sizeof(uint64))||(poffd>tif->tif_size))
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Error fetching directory link");
-				return(0);
-			}
-			if (off!=NULL)
-				*off=(uint64)poffc;
-			_TIFFmemcpy(nextdir,tif->tif_base+poffc,sizeof(uint64));
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabLong8(nextdir);
-		}
-		return(1);
-	}
-	else
-	{
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-		{
-			uint16 dircount;
-			uint32 nextdir32;
-			if (!SeekOK(tif, *nextdir) ||
-			    !ReadOK(tif, &dircount, sizeof (uint16))) {
-				TIFFErrorExt(tif->tif_clientdata, module, "%s: Error fetching directory count",
-				    tif->tif_name);
-				return (0);
-			}
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabShort(&dircount);
-			if (off != NULL)
-				*off = TIFFSeekFile(tif,
-				    dircount*12, SEEK_CUR);
-			else
-				(void) TIFFSeekFile(tif,
-				    dircount*12, SEEK_CUR);
-			if (!ReadOK(tif, &nextdir32, sizeof (uint32))) {
-				TIFFErrorExt(tif->tif_clientdata, module, "%s: Error fetching directory link",
-				    tif->tif_name);
-				return (0);
-			}
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabLong(&nextdir32);
-			*nextdir=nextdir32;
-		}
-		else
-		{
-			uint64 dircount64;
-			uint16 dircount16;
-			if (!SeekOK(tif, *nextdir) ||
-			    !ReadOK(tif, &dircount64, sizeof (uint64))) {
-				TIFFErrorExt(tif->tif_clientdata, module, "%s: Error fetching directory count",
-				    tif->tif_name);
-				return (0);
-			}
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabLong8(&dircount64);
-			if (dircount64>0xFFFF)
-			{
-				TIFFErrorExt(tif->tif_clientdata, module, "Error fetching directory count");
-				return(0);
-			}
-			dircount16 = (uint16)dircount64;
-			if (off != NULL)
-				*off = TIFFSeekFile(tif,
-				    dircount16*20, SEEK_CUR);
-			else
-				(void) TIFFSeekFile(tif,
-				    dircount16*20, SEEK_CUR);
-			if (!ReadOK(tif, nextdir, sizeof (uint64))) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-                                             "%s: Error fetching directory link",
-				    tif->tif_name);
-				return (0);
-			}
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabLong8(nextdir);
-		}
-		return (1);
-	}
+    static const char module[] = "TIFFAdvanceDirectory";
+
+    /* Add this directory to the directory list, if not already in. */
+    if (!_TIFFCheckDirNumberAndOffset(tif, *nextdirnum, *nextdiroff))
+    {
+        TIFFErrorExtR(tif, module,
+                      "Starting directory %u at offset 0x%" PRIx64 " (%" PRIu64
+                      ") might cause an IFD loop",
+                      *nextdirnum, *nextdiroff, *nextdiroff);
+        *nextdiroff = 0;
+        *nextdirnum = 0;
+        return (0);
+    }
+
+    if (isMapped(tif))
+    {
+        uint64_t poff = *nextdiroff;
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+        {
+            tmsize_t poffa, poffb, poffc, poffd;
+            uint16_t dircount;
+            uint32_t nextdir32;
+            poffa = (tmsize_t)poff;
+            poffb = poffa + sizeof(uint16_t);
+            if (((uint64_t)poffa != poff) || (poffb < poffa) ||
+                (poffb < (tmsize_t)sizeof(uint16_t)) || (poffb > tif->tif_size))
+            {
+                TIFFErrorExtR(tif, module, "Error fetching directory count");
+                *nextdiroff = 0;
+                return (0);
+            }
+            _TIFFmemcpy(&dircount, tif->tif_base + poffa, sizeof(uint16_t));
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabShort(&dircount);
+            poffc = poffb + dircount * 12;
+            poffd = poffc + sizeof(uint32_t);
+            if ((poffc < poffb) || (poffc < dircount * 12) || (poffd < poffc) ||
+                (poffd < (tmsize_t)sizeof(uint32_t)) || (poffd > tif->tif_size))
+            {
+                TIFFErrorExtR(tif, module, "Error fetching directory link");
+                return (0);
+            }
+            if (off != NULL)
+                *off = (uint64_t)poffc;
+            _TIFFmemcpy(&nextdir32, tif->tif_base + poffc, sizeof(uint32_t));
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong(&nextdir32);
+            *nextdiroff = nextdir32;
+        }
+        else
+        {
+            tmsize_t poffa, poffb, poffc, poffd;
+            uint64_t dircount64;
+            uint16_t dircount16;
+            if (poff > (uint64_t)TIFF_TMSIZE_T_MAX - sizeof(uint64_t))
+            {
+                TIFFErrorExtR(tif, module, "Error fetching directory count");
+                return (0);
+            }
+            poffa = (tmsize_t)poff;
+            poffb = poffa + sizeof(uint64_t);
+            if (poffb > tif->tif_size)
+            {
+                TIFFErrorExtR(tif, module, "Error fetching directory count");
+                return (0);
+            }
+            _TIFFmemcpy(&dircount64, tif->tif_base + poffa, sizeof(uint64_t));
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8(&dircount64);
+            if (dircount64 > 0xFFFF)
+            {
+                TIFFErrorExtR(tif, module,
+                              "Sanity check on directory count failed");
+                return (0);
+            }
+            dircount16 = (uint16_t)dircount64;
+            if (poffb > TIFF_TMSIZE_T_MAX - (tmsize_t)(dircount16 * 20) -
+                            (tmsize_t)sizeof(uint64_t))
+            {
+                TIFFErrorExtR(tif, module, "Error fetching directory link");
+                return (0);
+            }
+            poffc = poffb + dircount16 * 20;
+            poffd = poffc + sizeof(uint64_t);
+            if (poffd > tif->tif_size)
+            {
+                TIFFErrorExtR(tif, module, "Error fetching directory link");
+                return (0);
+            }
+            if (off != NULL)
+                *off = (uint64_t)poffc;
+            _TIFFmemcpy(nextdiroff, tif->tif_base + poffc, sizeof(uint64_t));
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8(nextdiroff);
+        }
+    }
+    else
+    {
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+        {
+            uint16_t dircount;
+            uint32_t nextdir32;
+            if (!SeekOK(tif, *nextdiroff) ||
+                !ReadOK(tif, &dircount, sizeof(uint16_t)))
+            {
+                TIFFErrorExtR(tif, module, "%s: Error fetching directory count",
+                              tif->tif_name);
+                return (0);
+            }
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabShort(&dircount);
+            if (off != NULL)
+                *off = TIFFSeekFile(tif, dircount * 12, SEEK_CUR);
+            else
+                (void)TIFFSeekFile(tif, dircount * 12, SEEK_CUR);
+            if (!ReadOK(tif, &nextdir32, sizeof(uint32_t)))
+            {
+                TIFFErrorExtR(tif, module, "%s: Error fetching directory link",
+                              tif->tif_name);
+                return (0);
+            }
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong(&nextdir32);
+            *nextdiroff = nextdir32;
+        }
+        else
+        {
+            uint64_t dircount64;
+            uint16_t dircount16;
+            if (!SeekOK(tif, *nextdiroff) ||
+                !ReadOK(tif, &dircount64, sizeof(uint64_t)))
+            {
+                TIFFErrorExtR(tif, module, "%s: Error fetching directory count",
+                              tif->tif_name);
+                return (0);
+            }
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8(&dircount64);
+            if (dircount64 > 0xFFFF)
+            {
+                TIFFErrorExtR(tif, module, "Error fetching directory count");
+                return (0);
+            }
+            dircount16 = (uint16_t)dircount64;
+            if (off != NULL)
+                *off = TIFFSeekFile(tif, dircount16 * 20, SEEK_CUR);
+            else
+                (void)TIFFSeekFile(tif, dircount16 * 20, SEEK_CUR);
+            if (!ReadOK(tif, nextdiroff, sizeof(uint64_t)))
+            {
+                TIFFErrorExtR(tif, module, "%s: Error fetching directory link",
+                              tif->tif_name);
+                return (0);
+            }
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8(nextdiroff);
+        }
+    }
+    if (*nextdiroff != 0)
+    {
+        (*nextdirnum)++;
+        /* Check next directory for IFD looping and if so, set it as last
+         * directory. */
+        if (!_TIFFCheckDirNumberAndOffset(tif, *nextdirnum, *nextdiroff))
+        {
+            TIFFWarningExtR(
+                tif, module,
+                "the next directory %u at offset 0x%" PRIx64 " (%" PRIu64
+                ") might be an IFD loop. Treating directory %d as "
+                "last directory",
+                *nextdirnum, *nextdiroff, *nextdiroff, (int)(*nextdirnum) - 1);
+            *nextdiroff = 0;
+            (*nextdirnum)--;
+        }
+    }
+    return (1);
 }
 
 /*
  * Count the number of directories in a file.
  */
-uint16
-TIFFNumberOfDirectories(TIFF* tif)
+tdir_t TIFFNumberOfDirectories(TIFF *tif)
 {
-	static const char module[] = "TIFFNumberOfDirectories";
-	uint64 nextdir;
-	uint16 n;
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-		nextdir = tif->tif_header.classic.tiff_diroff;
-	else
-		nextdir = tif->tif_header.big.tiff_diroff;
-	n = 0;
-	while (nextdir != 0 && TIFFAdvanceDirectory(tif, &nextdir, NULL))
-        {
-                if (n != 65535) {
-                        ++n;
-                }
-		else
-                {
-                        TIFFErrorExt(tif->tif_clientdata, module,
-                                     "Directory count exceeded 65535 limit,"
-                                     " giving up on counting.");
-                        return (65535);
-                }
-        }
-	return (n);
+    uint64_t nextdiroff;
+    tdir_t nextdirnum;
+    tdir_t n;
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+        nextdiroff = tif->tif_header.classic.tiff_diroff;
+    else
+        nextdiroff = tif->tif_header.big.tiff_diroff;
+    nextdirnum = 0;
+    n = 0;
+    while (nextdiroff != 0 &&
+           TIFFAdvanceDirectory(tif, &nextdiroff, NULL, &nextdirnum))
+    {
+        ++n;
+    }
+    return (n);
 }
 
 /*
  * Set the n-th directory as the current directory.
  * NB: Directories are numbered starting at 0.
  */
-int
-TIFFSetDirectory(TIFF* tif, uint16 dirn)
+int TIFFSetDirectory(TIFF *tif, tdir_t dirn)
 {
-	uint64 nextdir;
-	uint16 n;
-
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-		nextdir = tif->tif_header.classic.tiff_diroff;
-	else
-		nextdir = tif->tif_header.big.tiff_diroff;
-	for (n = dirn; n > 0 && nextdir != 0; n--)
-		if (!TIFFAdvanceDirectory(tif, &nextdir, NULL))
-			return (0);
-	tif->tif_nextdiroff = nextdir;
-	/*
-	 * Set curdir to the actual directory index.  The
-	 * -1 is because TIFFReadDirectory will increment
-	 * tif_curdir after successfully reading the directory.
-	 */
-	tif->tif_curdir = (dirn - n) - 1;
-	/*
-	 * Reset tif_dirnumber counter and start new list of seen directories.
-	 * We need this to prevent IFD loops.
-	 */
-	tif->tif_dirnumber = 0;
-	return (TIFFReadDirectory(tif));
+    uint64_t nextdiroff;
+    tdir_t nextdirnum = 0;
+    tdir_t n;
+
+    if (tif->tif_setdirectory_force_absolute)
+    {
+        /* tif_setdirectory_force_absolute=1 will force parsing the main IFD
+         * chain from the beginning, thus IFD directory list needs to be cleared
+         * from possible SubIFD offsets.
+         */
+        _TIFFCleanupIFDOffsetAndNumberMaps(tif); /* invalidate IFD loop lists */
+    }
+
+    /* Even faster path, if offset is available within IFD loop hash list. */
+    if (!tif->tif_setdirectory_force_absolute &&
+        _TIFFGetOffsetFromDirNumber(tif, dirn, &nextdiroff))
+    {
+        /* Set parameters for following TIFFReadDirectory() below. */
+        tif->tif_nextdiroff = nextdiroff;
+        tif->tif_curdir = dirn;
+        /* Reset to relative stepping */
+        tif->tif_setdirectory_force_absolute = FALSE;
+    }
+    else
+    {
+
+        /* Fast path when we just advance relative to the current directory:
+         * start at the current dir offset and continue to seek from there.
+         * Check special cases when relative is not allowed:
+         * - jump back from SubIFD or custom directory
+         * - right after TIFFWriteDirectory() jump back to that directory
+         *   using TIFFSetDirectory() */
+        const int relative = (dirn >= tif->tif_curdir) &&
+                             (tif->tif_diroff != 0) &&
+                             !tif->tif_setdirectory_force_absolute;
+
+        if (relative)
+        {
+            nextdiroff = tif->tif_diroff;
+            dirn -= tif->tif_curdir;
+            nextdirnum = tif->tif_curdir;
+        }
+        else if (!(tif->tif_flags & TIFF_BIGTIFF))
+            nextdiroff = tif->tif_header.classic.tiff_diroff;
+        else
+            nextdiroff = tif->tif_header.big.tiff_diroff;
+
+        /* Reset to relative stepping */
+        tif->tif_setdirectory_force_absolute = FALSE;
+
+        for (n = dirn; n > 0 && nextdiroff != 0; n--)
+            if (!TIFFAdvanceDirectory(tif, &nextdiroff, NULL, &nextdirnum))
+                return (0);
+        /* If the n-th directory could not be reached (does not exist),
+         * return here without touching anything further. */
+        if (nextdiroff == 0 || n > 0)
+            return (0);
+
+        tif->tif_nextdiroff = nextdiroff;
+
+        /* Set curdir to the actual directory index. */
+        if (relative)
+            tif->tif_curdir += dirn - n;
+        else
+            tif->tif_curdir = dirn - n;
+    }
+
+    /* The -1 decrement is because TIFFReadDirectory will increment
+     * tif_curdir after successfully reading the directory. */
+    if (tif->tif_curdir == 0)
+        tif->tif_curdir = TIFF_NON_EXISTENT_DIR_NUMBER;
+    else
+        tif->tif_curdir--;
+    return (TIFFReadDirectory(tif));
 }
 
 /*
@@ -1696,140 +2109,200 @@ TIFFSetDirectory(TIFF* tif, uint16 dirn)
  * is used mainly to access directories linked with
  * the SubIFD tag (e.g. thumbnail images).
  */
-int
-TIFFSetSubDirectory(TIFF* tif, uint64 diroff)
+int TIFFSetSubDirectory(TIFF *tif, uint64_t diroff)
 {
-	tif->tif_nextdiroff = diroff;
-	/*
-	 * Reset tif_dirnumber counter and start new list of seen directories.
-	 * We need this to prevent IFD loops.
-	 */
-	tif->tif_dirnumber = 0;
-	return (TIFFReadDirectory(tif));
+    /* Match nextdiroff and curdir for consistent IFD-loop checking.
+     * Only with TIFFSetSubDirectory() the IFD list can be corrupted with
+     * invalid offsets within the main IFD tree. In the case of several subIFDs
+     * of a main image, there are two possibilities that are not even mutually
+     * exclusive. a.) The subIFD tag contains an array with all offsets of the
+     * subIFDs. b.) The SubIFDs are concatenated with their NextIFD parameters.
+     * (refer to
+     * https://www.awaresystems.be/imaging/tiff/specification/TIFFPM6.pdf.)
+     */
+    int retval;
+    uint32_t curdir = 0;
+    int8_t probablySubIFD = 0;
+    if (diroff == 0)
+    {
+        /* Special case to invalidate the tif_lastdiroff member. */
+        tif->tif_curdir = TIFF_NON_EXISTENT_DIR_NUMBER;
+    }
+    else
+    {
+        if (!_TIFFGetDirNumberFromOffset(tif, diroff, &curdir))
+        {
+            /* Non-existing offsets might point to a SubIFD or invalid IFD.*/
+            probablySubIFD = 1;
+        }
+        /* -1 because TIFFReadDirectory() will increment tif_curdir. */
+        tif->tif_curdir =
+            curdir == 0 ? TIFF_NON_EXISTENT_DIR_NUMBER : curdir - 1;
+    }
+
+    tif->tif_nextdiroff = diroff;
+    retval = TIFFReadDirectory(tif);
+    /* If failed, curdir was not incremented in TIFFReadDirectory(), so set it
+     * back, but leave it for diroff==0. */
+    if (!retval && diroff != 0)
+    {
+        if (tif->tif_curdir == TIFF_NON_EXISTENT_DIR_NUMBER)
+            tif->tif_curdir = 0;
+        else
+            tif->tif_curdir++;
+    }
+    if (retval && probablySubIFD)
+    {
+        /* Reset IFD list to start new one for SubIFD chain and also start
+         * SubIFD chain with tif_curdir=0. */
+        _TIFFCleanupIFDOffsetAndNumberMaps(tif); /* invalidate IFD loop lists */
+        tif->tif_curdir = 0; /* first directory of new chain */
+        /* add this offset to new IFD list */
+        _TIFFCheckDirNumberAndOffset(tif, tif->tif_curdir, diroff);
+        /* To be able to return from SubIFD or custom-IFD to main-IFD */
+        tif->tif_setdirectory_force_absolute = TRUE;
+    }
+    return (retval);
 }
 
 /*
  * Return file offset of the current directory.
  */
-uint64
-TIFFCurrentDirOffset(TIFF* tif)
-{
-	return (tif->tif_diroff);
-}
+uint64_t TIFFCurrentDirOffset(TIFF *tif) { return (tif->tif_diroff); }
 
 /*
  * Return an indication of whether or not we are
  * at the last directory in the file.
  */
-int
-TIFFLastDirectory(TIFF* tif)
-{
-	return (tif->tif_nextdiroff == 0);
-}
+int TIFFLastDirectory(TIFF *tif) { return (tif->tif_nextdiroff == 0); }
 
 /*
  * Unlink the specified directory from the directory chain.
+ * Note: First directory starts with number dirn=1.
+ * This is different to TIFFSetDirectory() where the first directory starts with
+ * zero.
  */
-int
-TIFFUnlinkDirectory(TIFF* tif, uint16 dirn)
+int TIFFUnlinkDirectory(TIFF *tif, tdir_t dirn)
 {
-	static const char module[] = "TIFFUnlinkDirectory";
-	uint64 nextdir;
-	uint64 off;
-	uint16 n;
-
-	if (tif->tif_mode == O_RDONLY) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-                             "Can not unlink directory in read-only file");
-		return (0);
-	}
-	/*
-	 * Go to the directory before the one we want
-	 * to unlink and nab the offset of the link
-	 * field we'll need to patch.
-	 */
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-	{
-		nextdir = tif->tif_header.classic.tiff_diroff;
-		off = 4;
-	}
-	else
-	{
-		nextdir = tif->tif_header.big.tiff_diroff;
-		off = 8;
-	}
-	for (n = dirn-1; n > 0; n--) {
-		if (nextdir == 0) {
-			TIFFErrorExt(tif->tif_clientdata, module, "Directory %d does not exist", dirn);
-			return (0);
-		}
-		if (!TIFFAdvanceDirectory(tif, &nextdir, &off))
-			return (0);
-	}
-	/*
-	 * Advance to the directory to be unlinked and fetch
-	 * the offset of the directory that follows.
-	 */
-	if (!TIFFAdvanceDirectory(tif, &nextdir, NULL))
-		return (0);
-	/*
-	 * Go back and patch the link field of the preceding
-	 * directory to point to the offset of the directory
-	 * that follows.
-	 */
-	(void) TIFFSeekFile(tif, off, SEEK_SET);
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-	{
-		uint32 nextdir32;
-		nextdir32=(uint32)nextdir;
-		assert((uint64)nextdir32==nextdir);
-		if (tif->tif_flags & TIFF_SWAB)
-			TIFFSwabLong(&nextdir32);
-		if (!WriteOK(tif, &nextdir32, sizeof (uint32))) {
-			TIFFErrorExt(tif->tif_clientdata, module, "Error writing directory link");
-			return (0);
-		}
-	}
-	else
-	{
-		if (tif->tif_flags & TIFF_SWAB)
-			TIFFSwabLong8(&nextdir);
-		if (!WriteOK(tif, &nextdir, sizeof (uint64))) {
-			TIFFErrorExt(tif->tif_clientdata, module, "Error writing directory link");
-			return (0);
-		}
-	}
-	/*
-	 * Leave directory state setup safely.  We don't have
-	 * facilities for doing inserting and removing directories,
-	 * so it's safest to just invalidate everything.  This
-	 * means that the caller can only append to the directory
-	 * chain.
-	 */
-	(*tif->tif_cleanup)(tif);
-	if ((tif->tif_flags & TIFF_MYBUFFER) && tif->tif_rawdata) {
-		_TIFFfree(tif->tif_rawdata);
-		tif->tif_rawdata = NULL;
-		tif->tif_rawcc = 0;
-                tif->tif_rawdataoff = 0;
-                tif->tif_rawdataloaded = 0;
-	}
-	tif->tif_flags &= ~(TIFF_BEENWRITING|TIFF_BUFFERSETUP|TIFF_POSTENCODE|TIFF_BUF4WRITE);
-	TIFFFreeDirectory(tif);
-	TIFFDefaultDirectory(tif);
-	tif->tif_diroff = 0;			/* force link on next write */
-	tif->tif_nextdiroff = 0;		/* next write must be at end */
-	tif->tif_curoff = 0;
-	tif->tif_row = (uint32) -1;
-	tif->tif_curstrip = (uint32) -1;
-	return (1);
-}
+    static const char module[] = "TIFFUnlinkDirectory";
+    uint64_t nextdir;
+    tdir_t nextdirnum;
+    uint64_t off;
+    tdir_t n;
 
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
+    if (tif->tif_mode == O_RDONLY)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Can not unlink directory in read-only file");
+        return (0);
+    }
+    if (dirn == 0)
+    {
+        TIFFErrorExtR(tif, module,
+                      "For TIFFUnlinkDirectory() first directory starts with "
+                      "number 1 and not 0");
+        return (0);
+    }
+    /*
+     * Go to the directory before the one we want
+     * to unlink and nab the offset of the link
+     * field we'll need to patch.
+     */
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        nextdir = tif->tif_header.classic.tiff_diroff;
+        off = 4;
+    }
+    else
+    {
+        nextdir = tif->tif_header.big.tiff_diroff;
+        off = 8;
+    }
+    nextdirnum = 0; /* First directory is dirn=0 */
+
+    for (n = dirn - 1; n > 0; n--)
+    {
+        if (nextdir == 0)
+        {
+            TIFFErrorExtR(tif, module, "Directory %u does not exist", dirn);
+            return (0);
+        }
+        if (!TIFFAdvanceDirectory(tif, &nextdir, &off, &nextdirnum))
+            return (0);
+    }
+    /*
+     * Advance to the directory to be unlinked and fetch
+     * the offset of the directory that follows.
+     */
+    if (!TIFFAdvanceDirectory(tif, &nextdir, NULL, &nextdirnum))
+        return (0);
+    /*
+     * Go back and patch the link field of the preceding
+     * directory to point to the offset of the directory
+     * that follows.
+     */
+    (void)TIFFSeekFile(tif, off, SEEK_SET);
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        uint32_t nextdir32;
+        nextdir32 = (uint32_t)nextdir;
+        assert((uint64_t)nextdir32 == nextdir);
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong(&nextdir32);
+        if (!WriteOK(tif, &nextdir32, sizeof(uint32_t)))
+        {
+            TIFFErrorExtR(tif, module, "Error writing directory link");
+            return (0);
+        }
+    }
+    else
+    {
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong8(&nextdir);
+        if (!WriteOK(tif, &nextdir, sizeof(uint64_t)))
+        {
+            TIFFErrorExtR(tif, module, "Error writing directory link");
+            return (0);
+        }
+    }
+
+    /* For dirn=1 (first directory) also update the libtiff internal
+     * base offset variables. */
+    if (dirn == 1)
+    {
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+            tif->tif_header.classic.tiff_diroff = (uint32_t)nextdir;
+        else
+            tif->tif_header.big.tiff_diroff = nextdir;
+    }
+
+    /*
+     * Leave directory state setup safely.  We don't have
+     * facilities for doing inserting and removing directories,
+     * so it's safest to just invalidate everything.  This
+     * means that the caller can only append to the directory
+     * chain.
+     */
+    (*tif->tif_cleanup)(tif);
+    if ((tif->tif_flags & TIFF_MYBUFFER) && tif->tif_rawdata)
+    {
+        _TIFFfreeExt(tif, tif->tif_rawdata);
+        tif->tif_rawdata = NULL;
+        tif->tif_rawcc = 0;
+        tif->tif_rawdataoff = 0;
+        tif->tif_rawdataloaded = 0;
+    }
+    tif->tif_flags &= ~(TIFF_BEENWRITING | TIFF_BUFFERSETUP | TIFF_POSTENCODE |
+                        TIFF_BUF4WRITE);
+    TIFFFreeDirectory(tif);
+    TIFFDefaultDirectory(tif);
+    tif->tif_diroff = 0;     /* force link on next write */
+    tif->tif_nextdiroff = 0; /* next write must be at end */
+    tif->tif_lastdiroff = 0; /* will be updated on next link */
+    tif->tif_curoff = 0;
+    tif->tif_row = (uint32_t)-1;
+    tif->tif_curstrip = (uint32_t)-1;
+    tif->tif_curdir = TIFF_NON_EXISTENT_DIR_NUMBER;
+    _TIFFCleanupIFDOffsetAndNumberMaps(tif); /* invalidate IFD loop lists */
+    return (1);
+}
diff --git a/3rdparty/libtiff/tif_dir.h b/3rdparty/libtiff/tif_dir.h
index f608dd713b3f..9eaf22f8e62d 100644
--- a/3rdparty/libtiff/tif_dir.h
+++ b/3rdparty/libtiff/tif_dir.h
@@ -2,28 +2,28 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
 #ifndef _TIFFDIR_
-#define	_TIFFDIR_
+#define _TIFFDIR_
 
 #include "tiff.h"
 #include "tiffio.h"
@@ -32,10 +32,11 @@
  * ``Library-private'' Directory-related Definitions.
  */
 
-typedef struct {
-	const TIFFField *info;
-	int             count;
-	void           *value;
+typedef struct
+{
+    const TIFFField *info;
+    int count;
+    void *value;
 } TIFFTagValue;
 
 /*
@@ -49,79 +50,91 @@ typedef struct {
  * BigTIFF, then it is placed in the offset field to save space. If so,
  * it is left-justified in the offset field.
  */
-typedef struct {
-	uint16 tdir_tag;        /* see below */
-	uint16 tdir_type;       /* data type; see below */
-	uint64 tdir_count;      /* number of items; length in spec */
-	union {
-		uint16 toff_short;
-		uint32 toff_long;
-		uint64 toff_long8;
-	} tdir_offset;		/* either offset or the data itself if fits */
-	uint8  tdir_ignore;	/* flag status to ignore tag when parsing tags in tif_dirread.c */
+typedef struct
+{
+    uint16_t tdir_tag;   /* see below */
+    uint16_t tdir_type;  /* data type; see below */
+    uint64_t tdir_count; /* number of items; length in spec */
+    union
+    {
+        uint16_t toff_short;
+        uint32_t toff_long;
+        uint64_t toff_long8;
+    } tdir_offset;       /* either offset or the data itself if fits */
+    uint8_t tdir_ignore; /* flag status to ignore tag when parsing tags in
+                            tif_dirread.c */
 } TIFFDirEntry;
 
 /*
  * Internal format of a TIFF directory entry.
  */
-typedef struct {
-#define FIELD_SETLONGS 4
-	/* bit vector of fields that are set */
-	unsigned long td_fieldsset[FIELD_SETLONGS];
+typedef struct
+{
+#define FIELDSET_ITEMS 4
+    /* bit vector of fields that are set */
+    uint32_t td_fieldsset[FIELDSET_ITEMS];
 
-	uint32  td_imagewidth, td_imagelength, td_imagedepth;
-	uint32  td_tilewidth, td_tilelength, td_tiledepth;
-	uint32  td_subfiletype;
-	uint16  td_bitspersample;
-	uint16  td_sampleformat;
-	uint16  td_compression;
-	uint16  td_photometric;
-	uint16  td_threshholding;
-	uint16  td_fillorder;
-	uint16  td_orientation;
-	uint16  td_samplesperpixel;
-	uint32  td_rowsperstrip;
-	uint16  td_minsamplevalue, td_maxsamplevalue;
-	double* td_sminsamplevalue;
-	double* td_smaxsamplevalue;
-	float   td_xresolution, td_yresolution;
-	uint16  td_resolutionunit;
-	uint16  td_planarconfig;
-	float   td_xposition, td_yposition;
-	uint16  td_pagenumber[2];
-	uint16* td_colormap[3];
-	uint16  td_halftonehints[2];
-	uint16  td_extrasamples;
-	uint16* td_sampleinfo;
-	/* even though the name is misleading, td_stripsperimage is the number
-	 * of striles (=strips or tiles) per plane, and td_nstrips the total
-	 * number of striles */
-	uint32  td_stripsperimage;  
-	uint32  td_nstrips;              /* size of offset & bytecount arrays */
-	uint64* td_stripoffset_p;        /* should be accessed with TIFFGetStrileOffset */
-	uint64* td_stripbytecount_p;     /* should be accessed with TIFFGetStrileByteCount */
-        uint32  td_stripoffsetbyteallocsize; /* number of elements currently allocated for td_stripoffset/td_stripbytecount. Only used if TIFF_LAZYSTRILELOAD is set */
+    uint32_t td_imagewidth, td_imagelength, td_imagedepth;
+    uint32_t td_tilewidth, td_tilelength, td_tiledepth;
+    uint32_t td_subfiletype;
+    uint16_t td_bitspersample;
+    uint16_t td_sampleformat;
+    uint16_t td_compression;
+    uint16_t td_photometric;
+    uint16_t td_threshholding;
+    uint16_t td_fillorder;
+    uint16_t td_orientation;
+    uint16_t td_samplesperpixel;
+    uint32_t td_rowsperstrip;
+    uint16_t td_minsamplevalue, td_maxsamplevalue;
+    double *td_sminsamplevalue;
+    double *td_smaxsamplevalue;
+    float td_xresolution, td_yresolution;
+    uint16_t td_resolutionunit;
+    uint16_t td_planarconfig;
+    float td_xposition, td_yposition;
+    uint16_t td_pagenumber[2];
+    uint16_t *td_colormap[3];
+    uint16_t td_halftonehints[2];
+    uint16_t td_extrasamples;
+    uint16_t *td_sampleinfo;
+    /* even though the name is misleading, td_stripsperimage is the number
+     * of striles (=strips or tiles) per plane, and td_nstrips the total
+     * number of striles */
+    uint32_t td_stripsperimage;
+    uint32_t td_nstrips; /* size of offset & bytecount arrays */
+    uint64_t
+        *td_stripoffset_p; /* should be accessed with TIFFGetStrileOffset */
+    uint64_t *td_stripbytecount_p; /* should be accessed with
+                                      TIFFGetStrileByteCount */
+    uint32_t
+        td_stripoffsetbyteallocsize; /* number of elements currently allocated
+                                        for td_stripoffset/td_stripbytecount.
+                                        Only used if TIFF_LAZYSTRILELOAD is set
+                                      */
 #ifdef STRIPBYTECOUNTSORTED_UNUSED
-	int     td_stripbytecountsorted; /* is the bytecount array sorted ascending? */
+    int td_stripbytecountsorted; /* is the bytecount array sorted ascending? */
 #endif
-        TIFFDirEntry td_stripoffset_entry;    /* for deferred loading */
-        TIFFDirEntry td_stripbytecount_entry; /* for deferred loading */
-	uint16  td_nsubifd;
-	uint64* td_subifd;
-	/* YCbCr parameters */
-	uint16  td_ycbcrsubsampling[2];
-	uint16  td_ycbcrpositioning;
-	/* Colorimetry parameters */
-	uint16* td_transferfunction[3];
-	float*	td_refblackwhite;
-	/* CMYK parameters */
-	int     td_inknameslen;
-	char*   td_inknames;
+    TIFFDirEntry td_stripoffset_entry;    /* for deferred loading */
+    TIFFDirEntry td_stripbytecount_entry; /* for deferred loading */
+    uint16_t td_nsubifd;
+    uint64_t *td_subifd;
+    /* YCbCr parameters */
+    uint16_t td_ycbcrsubsampling[2];
+    uint16_t td_ycbcrpositioning;
+    /* Colorimetry parameters */
+    uint16_t *td_transferfunction[3];
+    float *td_refblackwhite;
+    /* CMYK parameters */
+    int td_inknameslen;
+    char *td_inknames;
+    uint16_t td_numberofinks; /* number of inks in InkNames string */
 
-	int     td_customValueCount;
-        TIFFTagValue *td_customValues;
+    int td_customValueCount;
+    TIFFTagValue *td_customValues;
 
-        unsigned char td_deferstrilearraywriting; /* see TIFFDeferStrileArrayWriting() */
+    unsigned char
+        td_deferstrilearraywriting; /* see TIFFDeferStrileArrayWriting() */
 } TIFFDirectory;
 
 /*
@@ -135,49 +148,49 @@ typedef struct {
  * Note that a bit *is* allocated for ignored tags; this is understood by the
  * directory reading logic which uses this fact to avoid special-case handling
  */
-#define FIELD_IGNORE                   0
+#define FIELD_IGNORE 0
 
 /* multi-item fields */
-#define FIELD_IMAGEDIMENSIONS          1
-#define FIELD_TILEDIMENSIONS           2
-#define FIELD_RESOLUTION               3
-#define FIELD_POSITION                 4
+#define FIELD_IMAGEDIMENSIONS 1
+#define FIELD_TILEDIMENSIONS 2
+#define FIELD_RESOLUTION 3
+#define FIELD_POSITION 4
 
 /* single-item fields */
-#define FIELD_SUBFILETYPE              5
-#define FIELD_BITSPERSAMPLE            6
-#define FIELD_COMPRESSION              7
-#define FIELD_PHOTOMETRIC              8
-#define FIELD_THRESHHOLDING            9
-#define FIELD_FILLORDER                10
-#define FIELD_ORIENTATION              15
-#define FIELD_SAMPLESPERPIXEL          16
-#define FIELD_ROWSPERSTRIP             17
-#define FIELD_MINSAMPLEVALUE           18
-#define FIELD_MAXSAMPLEVALUE           19
-#define FIELD_PLANARCONFIG             20
-#define FIELD_RESOLUTIONUNIT           22
-#define FIELD_PAGENUMBER               23
-#define FIELD_STRIPBYTECOUNTS          24
-#define FIELD_STRIPOFFSETS             25
-#define FIELD_COLORMAP                 26
-#define FIELD_EXTRASAMPLES             31
-#define FIELD_SAMPLEFORMAT             32
-#define FIELD_SMINSAMPLEVALUE          33
-#define FIELD_SMAXSAMPLEVALUE          34
-#define FIELD_IMAGEDEPTH               35
-#define FIELD_TILEDEPTH                36
-#define FIELD_HALFTONEHINTS            37
-#define FIELD_YCBCRSUBSAMPLING         39
-#define FIELD_YCBCRPOSITIONING         40
-#define	FIELD_REFBLACKWHITE            41
-#define FIELD_TRANSFERFUNCTION         44
-#define FIELD_INKNAMES                 46
-#define FIELD_SUBIFD                   49
+#define FIELD_SUBFILETYPE 5
+#define FIELD_BITSPERSAMPLE 6
+#define FIELD_COMPRESSION 7
+#define FIELD_PHOTOMETRIC 8
+#define FIELD_THRESHHOLDING 9
+#define FIELD_FILLORDER 10
+#define FIELD_ORIENTATION 15
+#define FIELD_SAMPLESPERPIXEL 16
+#define FIELD_ROWSPERSTRIP 17
+#define FIELD_MINSAMPLEVALUE 18
+#define FIELD_MAXSAMPLEVALUE 19
+#define FIELD_PLANARCONFIG 20
+#define FIELD_RESOLUTIONUNIT 22
+#define FIELD_PAGENUMBER 23
+#define FIELD_STRIPBYTECOUNTS 24
+#define FIELD_STRIPOFFSETS 25
+#define FIELD_COLORMAP 26
+#define FIELD_EXTRASAMPLES 31
+#define FIELD_SAMPLEFORMAT 32
+#define FIELD_SMINSAMPLEVALUE 33
+#define FIELD_SMAXSAMPLEVALUE 34
+#define FIELD_IMAGEDEPTH 35
+#define FIELD_TILEDEPTH 36
+#define FIELD_HALFTONEHINTS 37
+#define FIELD_YCBCRSUBSAMPLING 39
+#define FIELD_YCBCRPOSITIONING 40
+#define FIELD_REFBLACKWHITE 41
+#define FIELD_TRANSFERFUNCTION 44
+#define FIELD_INKNAMES 46
+#define FIELD_SUBIFD 49
+#define FIELD_NUMBEROFINKS 50
 /*      FIELD_CUSTOM (see tiffio.h)    65 */
 /* end of support for well-known tags; codec-private tags follow */
-#define FIELD_CODEC                    66  /* base of codec-private tags */
-
+#define FIELD_CODEC 66 /* base of codec-private tags */
 
 /*
  * Pseudo-tags don't normally need field bits since they are not written to an
@@ -187,131 +200,141 @@ typedef struct {
  * or ``unset'' then it can do using internal state flags without polluting
  * the field bit space defined for real tags.
  */
-#define FIELD_PSEUDO			0
+#define FIELD_PSEUDO 0
 
-#define FIELD_LAST			(32*FIELD_SETLONGS-1)
+#define FIELD_LAST (32 * FIELDSET_ITEMS - 1)
 
-#define BITn(n)				(((unsigned long)1L)<<((n)&0x1f))
-#define BITFIELDn(tif, n)		((tif)->tif_dir.td_fieldsset[(n)/32])
-#define TIFFFieldSet(tif, field)	(BITFIELDn(tif, field) & BITn(field))
-#define TIFFSetFieldBit(tif, field)	(BITFIELDn(tif, field) |= BITn(field))
-#define TIFFClrFieldBit(tif, field)	(BITFIELDn(tif, field) &= ~BITn(field))
+#define BITn(n) (((uint32_t)1L) << ((n)&0x1f))
+#define BITFIELDn(tif, n) ((tif)->tif_dir.td_fieldsset[(n) / 32])
+#define TIFFFieldSet(tif, field) (BITFIELDn(tif, field) & BITn(field))
+#define TIFFSetFieldBit(tif, field) (BITFIELDn(tif, field) |= BITn(field))
+#define TIFFClrFieldBit(tif, field) (BITFIELDn(tif, field) &= ~BITn(field))
 
-#define FieldSet(fields, f)		(fields[(f)/32] & BITn(f))
-#define ResetFieldBit(fields, f)	(fields[(f)/32] &= ~BITn(f))
+#define FieldSet(fields, f) (fields[(f) / 32] & BITn(f))
+#define ResetFieldBit(fields, f) (fields[(f) / 32] &= ~BITn(f))
 
-typedef enum {
-	TIFF_SETGET_UNDEFINED = 0,
-	TIFF_SETGET_ASCII = 1,
-	TIFF_SETGET_UINT8 = 2,
-	TIFF_SETGET_SINT8 = 3,
-	TIFF_SETGET_UINT16 = 4,
-	TIFF_SETGET_SINT16 = 5,
-	TIFF_SETGET_UINT32 = 6,
-	TIFF_SETGET_SINT32 = 7,
-	TIFF_SETGET_UINT64 = 8,
-	TIFF_SETGET_SINT64 = 9,
-	TIFF_SETGET_FLOAT = 10,
-	TIFF_SETGET_DOUBLE = 11,
-	TIFF_SETGET_IFD8 = 12,
-	TIFF_SETGET_INT = 13,
-	TIFF_SETGET_UINT16_PAIR = 14,
-	TIFF_SETGET_C0_ASCII = 15,
-	TIFF_SETGET_C0_UINT8 = 16,
-	TIFF_SETGET_C0_SINT8 = 17,
-	TIFF_SETGET_C0_UINT16 = 18,
-	TIFF_SETGET_C0_SINT16 = 19,
-	TIFF_SETGET_C0_UINT32 = 20,
-	TIFF_SETGET_C0_SINT32 = 21,
-	TIFF_SETGET_C0_UINT64 = 22,
-	TIFF_SETGET_C0_SINT64 = 23,
-	TIFF_SETGET_C0_FLOAT = 24,
-	TIFF_SETGET_C0_DOUBLE = 25,
-	TIFF_SETGET_C0_IFD8 = 26,
-	TIFF_SETGET_C16_ASCII = 27,
-	TIFF_SETGET_C16_UINT8 = 28,
-	TIFF_SETGET_C16_SINT8 = 29,
-	TIFF_SETGET_C16_UINT16 = 30,
-	TIFF_SETGET_C16_SINT16 = 31,
-	TIFF_SETGET_C16_UINT32 = 32,
-	TIFF_SETGET_C16_SINT32 = 33,
-	TIFF_SETGET_C16_UINT64 = 34,
-	TIFF_SETGET_C16_SINT64 = 35,
-	TIFF_SETGET_C16_FLOAT = 36,
-	TIFF_SETGET_C16_DOUBLE = 37,
-	TIFF_SETGET_C16_IFD8 = 38,
-	TIFF_SETGET_C32_ASCII = 39,
-	TIFF_SETGET_C32_UINT8 = 40,
-	TIFF_SETGET_C32_SINT8 = 41,
-	TIFF_SETGET_C32_UINT16 = 42,
-	TIFF_SETGET_C32_SINT16 = 43,
-	TIFF_SETGET_C32_UINT32 = 44,
-	TIFF_SETGET_C32_SINT32 = 45,
-	TIFF_SETGET_C32_UINT64 = 46,
-	TIFF_SETGET_C32_SINT64 = 47,
-	TIFF_SETGET_C32_FLOAT = 48,
-	TIFF_SETGET_C32_DOUBLE = 49,
-	TIFF_SETGET_C32_IFD8 = 50,
-	TIFF_SETGET_OTHER = 51
+typedef enum
+{
+    TIFF_SETGET_UNDEFINED = 0,
+    TIFF_SETGET_ASCII = 1,
+    TIFF_SETGET_UINT8 = 2,
+    TIFF_SETGET_SINT8 = 3,
+    TIFF_SETGET_UINT16 = 4,
+    TIFF_SETGET_SINT16 = 5,
+    TIFF_SETGET_UINT32 = 6,
+    TIFF_SETGET_SINT32 = 7,
+    TIFF_SETGET_UINT64 = 8,
+    TIFF_SETGET_SINT64 = 9,
+    TIFF_SETGET_FLOAT = 10,
+    TIFF_SETGET_DOUBLE = 11,
+    TIFF_SETGET_IFD8 = 12,
+    TIFF_SETGET_INT = 13,
+    TIFF_SETGET_UINT16_PAIR = 14,
+    TIFF_SETGET_C0_ASCII = 15,
+    TIFF_SETGET_C0_UINT8 = 16,
+    TIFF_SETGET_C0_SINT8 = 17,
+    TIFF_SETGET_C0_UINT16 = 18,
+    TIFF_SETGET_C0_SINT16 = 19,
+    TIFF_SETGET_C0_UINT32 = 20,
+    TIFF_SETGET_C0_SINT32 = 21,
+    TIFF_SETGET_C0_UINT64 = 22,
+    TIFF_SETGET_C0_SINT64 = 23,
+    TIFF_SETGET_C0_FLOAT = 24,
+    TIFF_SETGET_C0_DOUBLE = 25,
+    TIFF_SETGET_C0_IFD8 = 26,
+    TIFF_SETGET_C16_ASCII = 27,
+    TIFF_SETGET_C16_UINT8 = 28,
+    TIFF_SETGET_C16_SINT8 = 29,
+    TIFF_SETGET_C16_UINT16 = 30,
+    TIFF_SETGET_C16_SINT16 = 31,
+    TIFF_SETGET_C16_UINT32 = 32,
+    TIFF_SETGET_C16_SINT32 = 33,
+    TIFF_SETGET_C16_UINT64 = 34,
+    TIFF_SETGET_C16_SINT64 = 35,
+    TIFF_SETGET_C16_FLOAT = 36,
+    TIFF_SETGET_C16_DOUBLE = 37,
+    TIFF_SETGET_C16_IFD8 = 38,
+    TIFF_SETGET_C32_ASCII = 39,
+    TIFF_SETGET_C32_UINT8 = 40,
+    TIFF_SETGET_C32_SINT8 = 41,
+    TIFF_SETGET_C32_UINT16 = 42,
+    TIFF_SETGET_C32_SINT16 = 43,
+    TIFF_SETGET_C32_UINT32 = 44,
+    TIFF_SETGET_C32_SINT32 = 45,
+    TIFF_SETGET_C32_UINT64 = 46,
+    TIFF_SETGET_C32_SINT64 = 47,
+    TIFF_SETGET_C32_FLOAT = 48,
+    TIFF_SETGET_C32_DOUBLE = 49,
+    TIFF_SETGET_C32_IFD8 = 50,
+    TIFF_SETGET_OTHER = 51
 } TIFFSetGetFieldType;
 
 #if defined(__cplusplus)
-extern "C" {
+extern "C"
+{
 #endif
 
-extern const TIFFFieldArray* _TIFFGetFields(void);
-extern const TIFFFieldArray* _TIFFGetExifFields(void);
-extern const TIFFFieldArray* _TIFFGetGpsFields(void);
-extern void _TIFFSetupFields(TIFF* tif, const TIFFFieldArray* infoarray);
-extern void _TIFFPrintFieldInfo(TIFF*, FILE*);
+    extern const TIFFFieldArray *_TIFFGetFields(void);
+    extern const TIFFFieldArray *_TIFFGetExifFields(void);
+    extern const TIFFFieldArray *_TIFFGetGpsFields(void);
+    extern void _TIFFSetupFields(TIFF *tif, const TIFFFieldArray *infoarray);
+    extern void _TIFFPrintFieldInfo(TIFF *, FILE *);
 
-extern int _TIFFFillStriles(TIFF*);        
+    extern int _TIFFFillStriles(TIFF *);
 
-typedef enum {
-	tfiatImage,
-	tfiatExif,
-	tfiatGps,		/* EXIF-GPS fields array type */
-	tfiatOther
-} TIFFFieldArrayType;
+    typedef enum
+    {
+        tfiatImage,
+        tfiatExif,
+        tfiatGps, /* EXIF-GPS fields array type */
+        tfiatOther
+    } TIFFFieldArrayType;
 
-struct _TIFFFieldArray {
-	TIFFFieldArrayType type;    /* array type, will be used to determine if IFD is image and such */
-	uint32 allocated_size;      /* 0 if array is constant, other if modified by future definition extension support */
-	uint32 count;               /* number of elements in fields array */
-	TIFFField* fields;          /* actual field info */
-};
+    struct _TIFFFieldArray
+    {
+        TIFFFieldArrayType type; /* array type, will be used to determine if IFD
+                                    is image and such */
+        uint32_t allocated_size; /* 0 if array is constant, other if modified by
+                                    future definition extension support */
+        uint32_t count;          /* number of elements in fields array */
+        TIFFField *fields;       /* actual field info */
+    };
 
-struct _TIFFField {
-	uint32 field_tag;                       /* field's tag */
-	short field_readcount;                  /* read count/TIFF_VARIABLE/TIFF_SPP */
-	short field_writecount;                 /* write count/TIFF_VARIABLE */
-	TIFFDataType field_type;                /* type of associated data */
-	uint32 reserved;                        /* reserved for future extension */
-	TIFFSetGetFieldType set_field_type;     /* type to be passed to TIFFSetField */
-	TIFFSetGetFieldType get_field_type;     /* type to be passed to TIFFGetField */
-	unsigned short field_bit;               /* bit in fieldsset bit vector */
-	unsigned char field_oktochange;         /* if true, can change while writing */
-	unsigned char field_passcount;          /* if true, pass dir count on set */
-	char* field_name;                       /* ASCII name */
-	TIFFFieldArray* field_subfields;        /* if field points to child ifds, child ifd field definition array */
-};
+    struct _TIFFField
+    {
+        uint32_t field_tag;      /* field's tag */
+        short field_readcount;   /* read count/TIFF_VARIABLE/TIFF_SPP */
+        short field_writecount;  /* write count/TIFF_VARIABLE */
+        TIFFDataType field_type; /* type of associated data */
+        uint32_t
+            field_anonymous; /* if true, this is a unknown / anonymous tag */
+        TIFFSetGetFieldType
+            set_field_type; /* type to be passed to TIFFSetField */
+        TIFFSetGetFieldType
+            get_field_type;              /* type to be passed to TIFFGetField */
+        unsigned short field_bit;        /* bit in fieldsset bit vector */
+        unsigned char field_oktochange;  /* if true, can change while writing */
+        unsigned char field_passcount;   /* if true, pass dir count on set */
+        char *field_name;                /* ASCII name */
+        TIFFFieldArray *field_subfields; /* if field points to child ifds, child
+                                            ifd field definition array */
+    };
 
-extern int _TIFFMergeFields(TIFF*, const TIFFField[], uint32);
-extern const TIFFField* _TIFFFindOrRegisterField(TIFF *, uint32, TIFFDataType);
-extern  TIFFField* _TIFFCreateAnonField(TIFF *, uint32, TIFFDataType);
-extern int _TIFFCheckFieldIsValidForCodec(TIFF *tif, ttag_t tag);
+    extern int _TIFFMergeFields(TIFF *, const TIFFField[], uint32_t);
+    extern const TIFFField *_TIFFFindOrRegisterField(TIFF *, uint32_t,
+                                                     TIFFDataType);
+    extern TIFFField *_TIFFCreateAnonField(TIFF *, uint32_t, TIFFDataType);
+    extern int _TIFFCheckFieldIsValidForCodec(TIFF *tif, ttag_t tag);
+    extern int _TIFFCheckDirNumberAndOffset(TIFF *tif, tdir_t dirn,
+                                            uint64_t diroff);
+    extern int _TIFFGetDirNumberFromOffset(TIFF *tif, uint64_t diroff,
+                                           tdir_t *dirn);
+    extern int _TIFFGetOffsetFromDirNumber(TIFF *tif, tdir_t dirn,
+                                           uint64_t *diroff);
+    extern int _TIFFRemoveEntryFromDirectoryListByOffset(TIFF *tif,
+                                                         uint64_t diroff);
 
 #if defined(__cplusplus)
 }
 #endif
 #endif /* _TIFFDIR_ */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_dirinfo.c b/3rdparty/libtiff/tif_dirinfo.c
index 7217042c25ce..0e705e81e3d0 100644
--- a/3rdparty/libtiff/tif_dirinfo.c
+++ b/3rdparty/libtiff/tif_dirinfo.c
@@ -42,860 +42,954 @@
 
 /* const object should be initialized */
 #ifdef _MSC_VER
-#pragma warning( push )
-#pragma warning( disable : 4132 )
+#pragma warning(push)
+#pragma warning(disable : 4132)
 #endif
 static const TIFFFieldArray tiffFieldArray;
 static const TIFFFieldArray exifFieldArray;
 static const TIFFFieldArray gpsFieldArray;
 #ifdef _MSC_VER
-#pragma warning( pop )
+#pragma warning(pop)
 #endif
 /*--: Rational2Double: --
- * The Rational2Double upgraded libtiff functionality allows the definition and achievement of true double-precision accuracy
- * for TIFF tags of RATIONAL type and field_bit=FIELD_CUSTOM using the set_field_type = TIFF_SETGET_DOUBLE.
+ * The Rational2Double upgraded libtiff functionality allows the definition and
+ * achievement of true double-precision accuracy for TIFF tags of RATIONAL type
+ * and field_bit=FIELD_CUSTOM using the set_field_type = TIFF_SETGET_DOUBLE.
  * Unfortunately, that changes the old implemented interface for TIFFGetField().
- * In order to keep the old TIFFGetField() interface behavior those tags have to be redefined with set_field_type = TIFF_SETGET_FLOAT!
+ * In order to keep the old TIFFGetField() interface behavior those tags have to
+ * be redefined with set_field_type = TIFF_SETGET_FLOAT!
  *
  *  Rational custom arrays are already defined as _Cxx_FLOAT, thus can stay.
  *
  */
 
-static const TIFFField
-tiffFields[] = {
-	{ TIFFTAG_SUBFILETYPE, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_SUBFILETYPE, 1, 0, "SubfileType", NULL },
-	{ TIFFTAG_OSUBFILETYPE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_SUBFILETYPE, 1, 0, "OldSubfileType", NULL },
-	{ TIFFTAG_IMAGEWIDTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_IMAGEDIMENSIONS, 0, 0, "ImageWidth", NULL },
-	{ TIFFTAG_IMAGELENGTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_IMAGEDIMENSIONS, 1, 0, "ImageLength", NULL },
-	{ TIFFTAG_BITSPERSAMPLE, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_BITSPERSAMPLE, 0, 0, "BitsPerSample", NULL },
-	{ TIFFTAG_COMPRESSION, -1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_COMPRESSION, 0, 0, "Compression", NULL },
-	{ TIFFTAG_PHOTOMETRIC, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_PHOTOMETRIC, 0, 0, "PhotometricInterpretation", NULL },
-	{ TIFFTAG_THRESHHOLDING, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_THRESHHOLDING, 1, 0, "Threshholding", NULL },
-	{ TIFFTAG_CELLWIDTH, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 1, 0, "CellWidth", NULL },
-	{ TIFFTAG_CELLLENGTH, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 1, 0, "CellLength", NULL },
-	{ TIFFTAG_FILLORDER, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_FILLORDER, 0, 0, "FillOrder", NULL },
-	{ TIFFTAG_DOCUMENTNAME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DocumentName", NULL },
-	{ TIFFTAG_IMAGEDESCRIPTION, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImageDescription", NULL },
-	{ TIFFTAG_MAKE, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Make", NULL },
-	{ TIFFTAG_MODEL, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Model", NULL },
-	{ TIFFTAG_STRIPOFFSETS, -1, -1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_STRIPOFFSETS, 0, 0, "StripOffsets", NULL },
-	{ TIFFTAG_ORIENTATION, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_ORIENTATION, 0, 0, "Orientation", NULL },
-	{ TIFFTAG_SAMPLESPERPIXEL, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_SAMPLESPERPIXEL, 0, 0, "SamplesPerPixel", NULL },
-	{ TIFFTAG_ROWSPERSTRIP, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_ROWSPERSTRIP, 0, 0, "RowsPerStrip", NULL },
-	{ TIFFTAG_STRIPBYTECOUNTS, -1, -1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_STRIPBYTECOUNTS, 0, 0, "StripByteCounts", NULL },
-	{ TIFFTAG_MINSAMPLEVALUE, -2, -1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_MINSAMPLEVALUE, 1, 0, "MinSampleValue", NULL },
-	{ TIFFTAG_MAXSAMPLEVALUE, -2, -1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_MAXSAMPLEVALUE, 1, 0, "MaxSampleValue", NULL },
-	{ TIFFTAG_XRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_RESOLUTION, 1, 0, "XResolution", NULL },
-	{ TIFFTAG_YRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_RESOLUTION, 1, 0, "YResolution", NULL },
-	{ TIFFTAG_PLANARCONFIG, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_PLANARCONFIG, 0, 0, "PlanarConfiguration", NULL },
-	{ TIFFTAG_PAGENAME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PageName", NULL },
-	{ TIFFTAG_XPOSITION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_POSITION, 1, 0, "XPosition", NULL },
-	{ TIFFTAG_YPOSITION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_POSITION, 1, 0, "YPosition", NULL },
-	{ TIFFTAG_FREEOFFSETS, -1, -1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 0, 0, "FreeOffsets", NULL },
-	{ TIFFTAG_FREEBYTECOUNTS, -1, -1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 0, 0, "FreeByteCounts", NULL },
-	{ TIFFTAG_GRAYRESPONSEUNIT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 1, 0, "GrayResponseUnit", NULL },
-	{ TIFFTAG_GRAYRESPONSECURVE, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 1, 0, "GrayResponseCurve", NULL },
-	{ TIFFTAG_RESOLUTIONUNIT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_RESOLUTIONUNIT, 1, 0, "ResolutionUnit", NULL },
-	{ TIFFTAG_PAGENUMBER, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_UINT16_PAIR, TIFF_SETGET_UNDEFINED, FIELD_PAGENUMBER, 1, 0, "PageNumber", NULL },
-	{ TIFFTAG_COLORRESPONSEUNIT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 1, 0, "ColorResponseUnit", NULL },
-	{ TIFFTAG_TRANSFERFUNCTION, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_OTHER, TIFF_SETGET_UNDEFINED, FIELD_TRANSFERFUNCTION, 1, 0, "TransferFunction", NULL },
-	{ TIFFTAG_SOFTWARE, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Software", NULL },
-	{ TIFFTAG_DATETIME, 20, 20, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DateTime", NULL },
-	{ TIFFTAG_ARTIST, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Artist", NULL },
-	{ TIFFTAG_HOSTCOMPUTER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "HostComputer", NULL },
-	{ TIFFTAG_WHITEPOINT, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "WhitePoint", NULL },
-	{ TIFFTAG_PRIMARYCHROMATICITIES, 6, 6, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PrimaryChromaticities", NULL },
-	{ TIFFTAG_COLORMAP, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_OTHER, TIFF_SETGET_UNDEFINED, FIELD_COLORMAP, 1, 0, "ColorMap", NULL },
-	{ TIFFTAG_HALFTONEHINTS, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_UINT16_PAIR, TIFF_SETGET_UNDEFINED, FIELD_HALFTONEHINTS, 1, 0, "HalftoneHints", NULL },
-	{ TIFFTAG_TILEWIDTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_TILEDIMENSIONS, 0, 0, "TileWidth", NULL },
-	{ TIFFTAG_TILELENGTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_TILEDIMENSIONS, 0, 0, "TileLength", NULL },
-	{ TIFFTAG_TILEOFFSETS, -1, 1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_STRIPOFFSETS, 0, 0, "TileOffsets", NULL },
-	{ TIFFTAG_TILEBYTECOUNTS, -1, 1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_STRIPBYTECOUNTS, 0, 0, "TileByteCounts", NULL },
-	{ TIFFTAG_SUBIFD, -1, -1, TIFF_IFD8, 0, TIFF_SETGET_C16_IFD8, TIFF_SETGET_UNDEFINED, FIELD_SUBIFD, 1, 1, "SubIFD", (TIFFFieldArray*) &tiffFieldArray },
-	{ TIFFTAG_INKSET, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "InkSet", NULL },
-	{ TIFFTAG_INKNAMES, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_C16_ASCII, TIFF_SETGET_UNDEFINED, FIELD_INKNAMES, 1, 1, "InkNames", NULL },
-	{ TIFFTAG_NUMBEROFINKS, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "NumberOfInks", NULL },
-	{ TIFFTAG_DOTRANGE, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_UINT16_PAIR, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "DotRange", NULL },
-	{ TIFFTAG_TARGETPRINTER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "TargetPrinter", NULL },
-	{ TIFFTAG_EXTRASAMPLES, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_EXTRASAMPLES, 0, 1, "ExtraSamples", NULL },
-	{ TIFFTAG_SAMPLEFORMAT, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_SAMPLEFORMAT, 0, 0, "SampleFormat", NULL },
-	{ TIFFTAG_SMINSAMPLEVALUE, -2, -1, TIFF_ANY, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_SMINSAMPLEVALUE, 1, 0, "SMinSampleValue", NULL },
-	{ TIFFTAG_SMAXSAMPLEVALUE, -2, -1, TIFF_ANY, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_SMAXSAMPLEVALUE, 1, 0, "SMaxSampleValue", NULL },
-	{ TIFFTAG_CLIPPATH, -1, -3, TIFF_BYTE, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "ClipPath", NULL },
-	{ TIFFTAG_XCLIPPATHUNITS, 1, 1, TIFF_SLONG, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "XClipPathUnits", NULL },
-	{ TIFFTAG_XCLIPPATHUNITS, 1, 1, TIFF_SBYTE, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "XClipPathUnits", NULL },
-	{ TIFFTAG_YCLIPPATHUNITS, 1, 1, TIFF_SLONG, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "YClipPathUnits", NULL },
-	{ TIFFTAG_YCBCRCOEFFICIENTS, 3, 3, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "YCbCrCoefficients", NULL },
-	{ TIFFTAG_YCBCRSUBSAMPLING, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_UINT16_PAIR, TIFF_SETGET_UNDEFINED, FIELD_YCBCRSUBSAMPLING, 0, 0, "YCbCrSubsampling", NULL },
-	{ TIFFTAG_YCBCRPOSITIONING, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_YCBCRPOSITIONING, 0, 0, "YCbCrPositioning", NULL },
-	{ TIFFTAG_REFERENCEBLACKWHITE, 6, 6, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_REFBLACKWHITE, 1, 0, "ReferenceBlackWhite", NULL },
-	{ TIFFTAG_XMLPACKET, -3, -3, TIFF_BYTE, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "XMLPacket", NULL },
-	/* begin SGI tags */
-	{ TIFFTAG_MATTEING, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_EXTRASAMPLES, 0, 0, "Matteing", NULL },
-	{ TIFFTAG_DATATYPE, -2, -1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_SAMPLEFORMAT, 0, 0, "DataType", NULL },
-	{ TIFFTAG_IMAGEDEPTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_IMAGEDEPTH, 0, 0, "ImageDepth", NULL },
-	{ TIFFTAG_TILEDEPTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_TILEDEPTH, 0, 0, "TileDepth", NULL },
-	/* end SGI tags */
-	/* begin Pixar tags */
-	{ TIFFTAG_PIXAR_IMAGEFULLWIDTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImageFullWidth", NULL },
-	{ TIFFTAG_PIXAR_IMAGEFULLLENGTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImageFullLength", NULL },
-	{ TIFFTAG_PIXAR_TEXTUREFORMAT, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "TextureFormat", NULL },
-	{ TIFFTAG_PIXAR_WRAPMODES, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "TextureWrapModes", NULL },
-	{ TIFFTAG_PIXAR_FOVCOT, 1, 1, TIFF_FLOAT, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FieldOfViewCotangent", NULL },
-	{ TIFFTAG_PIXAR_MATRIX_WORLDTOSCREEN, 16, 16, TIFF_FLOAT, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MatrixWorldToScreen", NULL },
-	{ TIFFTAG_PIXAR_MATRIX_WORLDTOCAMERA, 16, 16, TIFF_FLOAT, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MatrixWorldToCamera", NULL },
-	{ TIFFTAG_CFAREPEATPATTERNDIM, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_C0_UINT16, TIFF_SETGET_UNDEFINED,	FIELD_CUSTOM, 0,	0,	"CFARepeatPatternDim", NULL },
-	{ TIFFTAG_CFAPATTERN, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "CFAPattern" , NULL},
-	{ TIFFTAG_COPYRIGHT, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Copyright", NULL },
-	/* end Pixar tags */
-	{ TIFFTAG_RICHTIFFIPTC, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "RichTIFFIPTC", NULL },
-	{ TIFFTAG_PHOTOSHOP, -3, -3, TIFF_BYTE, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "Photoshop", NULL },
-	/*--: EXIFIFD and GPSIFD specified as TIFF_LONG by Aware-Systems and not TIFF_IFD8 as in original LibTiff.
-	 *    However, for IFD-like tags, libtiff uses the data type TIFF_IFD8 in tiffFields[]-tag definition combined with
-	 *    a special handling procedure in order to write either a 32-bit value and the TIFF_IFD type-id into ClassicTIFF files 
-	 *    or a 64-bit value and the TIFF_IFD8 type-id into BigTIFF files. */
-	{ TIFFTAG_EXIFIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EXIFIFDOffset", (TIFFFieldArray*) &exifFieldArray },
-	{ TIFFTAG_ICCPROFILE, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "ICC Profile", NULL },
-	{ TIFFTAG_GPSIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "GPSIFDOffset", (TIFFFieldArray*) &gpsFieldArray },
-	{ TIFFTAG_FAXRECVPARAMS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UINT32, FIELD_CUSTOM, TRUE, FALSE, "FaxRecvParams", NULL },
-	{ TIFFTAG_FAXSUBADDRESS, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_ASCII, FIELD_CUSTOM, TRUE, FALSE, "FaxSubAddress", NULL },
-	{ TIFFTAG_FAXRECVTIME, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UINT32, FIELD_CUSTOM, TRUE, FALSE, "FaxRecvTime", NULL },
-	{ TIFFTAG_FAXDCS, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_ASCII, FIELD_CUSTOM, TRUE, FALSE, "FaxDcs", NULL },
-	{ TIFFTAG_STONITS, 1, 1, TIFF_DOUBLE, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "StoNits", NULL },
-	{ TIFFTAG_INTEROPERABILITYIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "InteroperabilityIFDOffset", NULL },
-	/* begin DNG tags */
-	{ TIFFTAG_DNGVERSION, 4, 4, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "DNGVersion", NULL },
-	{ TIFFTAG_DNGBACKWARDVERSION, 4, 4, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "DNGBackwardVersion", NULL },
-	{ TIFFTAG_UNIQUECAMERAMODEL, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "UniqueCameraModel", NULL },
-	{ TIFFTAG_LOCALIZEDCAMERAMODEL, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "LocalizedCameraModel", NULL },
-	{ TIFFTAG_CFAPLANECOLOR, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "CFAPlaneColor", NULL },
-	{ TIFFTAG_CFALAYOUT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "CFALayout", NULL },
-	{ TIFFTAG_LINEARIZATIONTABLE, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "LinearizationTable", NULL },
-	{ TIFFTAG_BLACKLEVELREPEATDIM, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_C0_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BlackLevelRepeatDim", NULL },
-	{ TIFFTAG_BLACKLEVEL, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "BlackLevel", NULL },
-	{ TIFFTAG_BLACKLEVELDELTAH, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "BlackLevelDeltaH", NULL },
-	{ TIFFTAG_BLACKLEVELDELTAV, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "BlackLevelDeltaV", NULL },
-	{ TIFFTAG_WHITELEVEL, -1, -1, TIFF_LONG, 0, TIFF_SETGET_C16_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "WhiteLevel", NULL },
-	{ TIFFTAG_DEFAULTSCALE, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "DefaultScale", NULL },
-	{ TIFFTAG_BESTQUALITYSCALE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BestQualityScale", NULL },
-	{ TIFFTAG_DEFAULTCROPORIGIN, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "DefaultCropOrigin", NULL },
-	{ TIFFTAG_DEFAULTCROPSIZE, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "DefaultCropSize", NULL },
-	{ TIFFTAG_COLORMATRIX1, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "ColorMatrix1", NULL },
-	{ TIFFTAG_COLORMATRIX2, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "ColorMatrix2", NULL },
-	{ TIFFTAG_CAMERACALIBRATION1, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "CameraCalibration1", NULL },
-	{ TIFFTAG_CAMERACALIBRATION2, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "CameraCalibration2", NULL },
-	{ TIFFTAG_REDUCTIONMATRIX1, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "ReductionMatrix1", NULL },
-	{ TIFFTAG_REDUCTIONMATRIX2, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "ReductionMatrix2", NULL },
-	{ TIFFTAG_ANALOGBALANCE, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "AnalogBalance", NULL },
-	{ TIFFTAG_ASSHOTNEUTRAL, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "AsShotNeutral", NULL },
-	{ TIFFTAG_ASSHOTWHITEXY, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "AsShotWhiteXY", NULL },
-	{ TIFFTAG_BASELINEEXPOSURE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BaselineExposure", NULL },
-	{ TIFFTAG_BASELINENOISE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BaselineNoise", NULL },
-	{ TIFFTAG_BASELINESHARPNESS, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BaselineSharpness", NULL },
-	{ TIFFTAG_BAYERGREENSPLIT, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "BayerGreenSplit", NULL },
-	{ TIFFTAG_LINEARRESPONSELIMIT, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "LinearResponseLimit", NULL },
-	{ TIFFTAG_CAMERASERIALNUMBER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CameraSerialNumber", NULL },
-	{ TIFFTAG_LENSINFO, 4, 4, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "LensInfo", NULL },
-	{ TIFFTAG_CHROMABLURRADIUS, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "ChromaBlurRadius", NULL },
-	{ TIFFTAG_ANTIALIASSTRENGTH, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "AntiAliasStrength", NULL },
-	{ TIFFTAG_SHADOWSCALE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "ShadowScale", NULL },
-	{ TIFFTAG_DNGPRIVATEDATA, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "DNGPrivateData", NULL },
-	{ TIFFTAG_MAKERNOTESAFETY, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "MakerNoteSafety", NULL },
-	{ TIFFTAG_CALIBRATIONILLUMINANT1, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "CalibrationIlluminant1", NULL },
-	{ TIFFTAG_CALIBRATIONILLUMINANT2, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "CalibrationIlluminant2", NULL },
-	{ TIFFTAG_RAWDATAUNIQUEID, 16, 16, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "RawDataUniqueID", NULL },
-	{ TIFFTAG_ORIGINALRAWFILENAME, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "OriginalRawFileName", NULL },
-	{ TIFFTAG_ORIGINALRAWFILEDATA, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "OriginalRawFileData", NULL },
-	{ TIFFTAG_ACTIVEAREA, 4, 4, TIFF_LONG, 0, TIFF_SETGET_C0_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "ActiveArea", NULL },
-	{ TIFFTAG_MASKEDAREAS, -1, -1, TIFF_LONG, 0, TIFF_SETGET_C16_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "MaskedAreas", NULL },
-	{ TIFFTAG_ASSHOTICCPROFILE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "AsShotICCProfile", NULL },
-	{ TIFFTAG_ASSHOTPREPROFILEMATRIX, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "AsShotPreProfileMatrix", NULL },
-	{ TIFFTAG_CURRENTICCPROFILE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "CurrentICCProfile", NULL },
-	{ TIFFTAG_CURRENTPREPROFILEMATRIX, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "CurrentPreProfileMatrix", NULL },
-	{ TIFFTAG_PERSAMPLE, 0, 0, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "PerSample", NULL},
-	/* end DNG tags */
-	/* begin TIFF/FX tags */
-        { TIFFTAG_INDEXED, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "Indexed", NULL },
-        { TIFFTAG_GLOBALPARAMETERSIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "GlobalParametersIFD", NULL },
-        { TIFFTAG_PROFILETYPE, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "ProfileType", NULL },
-        { TIFFTAG_FAXPROFILE, 1, 1, TIFF_BYTE, 0, TIFF_SETGET_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "FaxProfile", NULL },
-        { TIFFTAG_CODINGMETHODS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "CodingMethods", NULL },
-        { TIFFTAG_VERSIONYEAR, 4, 4, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "VersionYear", NULL },
-        { TIFFTAG_MODENUMBER, 1, 1, TIFF_BYTE, 0, TIFF_SETGET_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "ModeNumber", NULL },
-        { TIFFTAG_DECODE, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "Decode", NULL },
-        { TIFFTAG_IMAGEBASECOLOR, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "ImageBaseColor", NULL },
-        { TIFFTAG_T82OPTIONS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "T82Options", NULL },
-        { TIFFTAG_STRIPROWCOUNTS, -1, -1, TIFF_LONG, 0, TIFF_SETGET_C16_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "StripRowCounts", NULL },
-        { TIFFTAG_IMAGELAYER, 2, 2, TIFF_LONG, 0, TIFF_SETGET_C0_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "ImageLayer", NULL },
-	/* end TIFF/FX tags */
-	/* begin pseudo tags */
+/* clang-format off */ /* for better readability of tag comments */
+static const TIFFField tiffFields[] = {
+    {TIFFTAG_SUBFILETYPE, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_SUBFILETYPE, 1, 0, "SubfileType", NULL},
+    {TIFFTAG_OSUBFILETYPE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 1, 0, "OldSubfileType", NULL},
+    {TIFFTAG_IMAGEWIDTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_IMAGEDIMENSIONS, 0, 0, "ImageWidth", NULL},
+    {TIFFTAG_IMAGELENGTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_IMAGEDIMENSIONS, 1, 0, "ImageLength", NULL},
+    {TIFFTAG_BITSPERSAMPLE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_BITSPERSAMPLE, 0, 0, "BitsPerSample", NULL},
+    {TIFFTAG_COMPRESSION, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_COMPRESSION, 0, 0, "Compression", NULL},
+    {TIFFTAG_PHOTOMETRIC, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_PHOTOMETRIC, 0, 0, "PhotometricInterpretation", NULL},
+    {TIFFTAG_THRESHHOLDING, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_THRESHHOLDING, 1, 0, "Threshholding", NULL},
+    {TIFFTAG_CELLWIDTH, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CellWidth", NULL},
+    {TIFFTAG_CELLLENGTH, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CellLength", NULL},
+    {TIFFTAG_FILLORDER, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_FILLORDER, 0, 0, "FillOrder", NULL},
+    {TIFFTAG_DOCUMENTNAME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DocumentName", NULL},
+    {TIFFTAG_IMAGEDESCRIPTION, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImageDescription", NULL},
+    {TIFFTAG_MAKE, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Make", NULL},
+    {TIFFTAG_MODEL, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Model", NULL},
+    {TIFFTAG_STRIPOFFSETS, -1, -1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_STRIPOFFSETS, 0, 0, "StripOffsets", NULL},
+    {TIFFTAG_ORIENTATION, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_ORIENTATION, 0, 0, "Orientation", NULL},
+    {TIFFTAG_SAMPLESPERPIXEL, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_SAMPLESPERPIXEL, 0, 0, "SamplesPerPixel", NULL},
+    {TIFFTAG_ROWSPERSTRIP, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_ROWSPERSTRIP, 0, 0, "RowsPerStrip", NULL},
+    {TIFFTAG_STRIPBYTECOUNTS, -1, -1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_STRIPBYTECOUNTS, 0, 0, "StripByteCounts", NULL},
+    {TIFFTAG_MINSAMPLEVALUE, -2, -1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_MINSAMPLEVALUE, 1, 0, "MinSampleValue", NULL},
+    {TIFFTAG_MAXSAMPLEVALUE, -2, -1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_MAXSAMPLEVALUE, 1, 0, "MaxSampleValue", NULL},
+    {TIFFTAG_XRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_RESOLUTION, 1, 0, "XResolution", NULL},
+    {TIFFTAG_YRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_RESOLUTION, 1, 0, "YResolution", NULL},
+    {TIFFTAG_PLANARCONFIG, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_PLANARCONFIG, 0, 0, "PlanarConfiguration", NULL},
+    {TIFFTAG_PAGENAME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PageName", NULL},
+    {TIFFTAG_XPOSITION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_POSITION, 1, 0, "XPosition", NULL},
+    {TIFFTAG_YPOSITION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_POSITION, 1, 0, "YPosition", NULL},
+    {TIFFTAG_FREEOFFSETS, -1, -1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 0, 0, "FreeOffsets", NULL},
+    {TIFFTAG_FREEBYTECOUNTS, -1, -1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 0, 0, "FreeByteCounts", NULL},
+    {TIFFTAG_GRAYRESPONSEUNIT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 1, 0, "GrayResponseUnit", NULL},
+    {TIFFTAG_GRAYRESPONSECURVE, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 1, 0, "GrayResponseCurve", NULL},
+    {TIFFTAG_RESOLUTIONUNIT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_RESOLUTIONUNIT, 1, 0, "ResolutionUnit", NULL},
+    {TIFFTAG_PAGENUMBER, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_UINT16_PAIR, TIFF_SETGET_UNDEFINED, FIELD_PAGENUMBER, 1, 0, "PageNumber", NULL},
+    {TIFFTAG_COLORRESPONSEUNIT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_IGNORE, 1, 0, "ColorResponseUnit", NULL},
+    {TIFFTAG_TRANSFERFUNCTION, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_OTHER, TIFF_SETGET_UNDEFINED, FIELD_TRANSFERFUNCTION, 1, 0, "TransferFunction", NULL},
+    {TIFFTAG_SOFTWARE, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Software", NULL},
+    {TIFFTAG_DATETIME, 20, 20, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DateTime", NULL},
+    {TIFFTAG_ARTIST, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Artist", NULL},
+    {TIFFTAG_HOSTCOMPUTER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "HostComputer", NULL},
+    {TIFFTAG_WHITEPOINT, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "WhitePoint", NULL},
+    {TIFFTAG_PRIMARYCHROMATICITIES, 6, 6, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PrimaryChromaticities", NULL},
+    {TIFFTAG_COLORMAP, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_OTHER, TIFF_SETGET_UNDEFINED, FIELD_COLORMAP, 1, 0, "ColorMap", NULL},
+    {TIFFTAG_HALFTONEHINTS, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_UINT16_PAIR, TIFF_SETGET_UNDEFINED, FIELD_HALFTONEHINTS, 1, 0, "HalftoneHints", NULL},
+    {TIFFTAG_TILEWIDTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_TILEDIMENSIONS, 0, 0, "TileWidth", NULL},
+    {TIFFTAG_TILELENGTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_TILEDIMENSIONS, 0, 0, "TileLength", NULL},
+    {TIFFTAG_TILEOFFSETS, -1, 1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_STRIPOFFSETS, 0, 0, "TileOffsets", NULL},
+    {TIFFTAG_TILEBYTECOUNTS, -1, 1, TIFF_LONG8, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_STRIPBYTECOUNTS, 0, 0, "TileByteCounts", NULL},
+    {TIFFTAG_SUBIFD, -1, -1, TIFF_IFD8, 0, TIFF_SETGET_C16_IFD8, TIFF_SETGET_UNDEFINED, FIELD_SUBIFD, 1, 1, "SubIFD", (TIFFFieldArray *)&tiffFieldArray},
+    {TIFFTAG_INKSET, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "InkSet", NULL},
+    {TIFFTAG_INKNAMES, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_C16_ASCII, TIFF_SETGET_UNDEFINED, FIELD_INKNAMES, 1, 1, "InkNames", NULL},
+    {TIFFTAG_NUMBEROFINKS, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_NUMBEROFINKS, 1, 0, "NumberOfInks", NULL},
+    {TIFFTAG_DOTRANGE, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_UINT16_PAIR, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "DotRange", NULL},
+    {TIFFTAG_TARGETPRINTER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "TargetPrinter", NULL},
+    {TIFFTAG_EXTRASAMPLES, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_EXTRASAMPLES, 0, 1, "ExtraSamples", NULL},
+    {TIFFTAG_SAMPLEFORMAT, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_SAMPLEFORMAT, 0, 0, "SampleFormat", NULL},
+    {TIFFTAG_SMINSAMPLEVALUE, -2, -1, TIFF_ANY, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_SMINSAMPLEVALUE, 1, 0, "SMinSampleValue", NULL},
+    {TIFFTAG_SMAXSAMPLEVALUE, -2, -1, TIFF_ANY, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_SMAXSAMPLEVALUE, 1, 0, "SMaxSampleValue", NULL},
+    {TIFFTAG_CLIPPATH, -3, -3, TIFF_BYTE, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 1, "ClipPath", NULL},
+    {TIFFTAG_XCLIPPATHUNITS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "XClipPathUnits", NULL},
+    {TIFFTAG_YCLIPPATHUNITS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "YClipPathUnits", NULL},
+    {TIFFTAG_YCBCRCOEFFICIENTS, 3, 3, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "YCbCrCoefficients", NULL},
+    {TIFFTAG_YCBCRSUBSAMPLING, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_UINT16_PAIR, TIFF_SETGET_UNDEFINED, FIELD_YCBCRSUBSAMPLING, 0, 0, "YCbCrSubsampling", NULL},
+    {TIFFTAG_YCBCRPOSITIONING, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_YCBCRPOSITIONING, 0, 0, "YCbCrPositioning", NULL},
+    {TIFFTAG_REFERENCEBLACKWHITE, 6, 6, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_REFBLACKWHITE, 1, 0, "ReferenceBlackWhite", NULL},
+    {TIFFTAG_XMLPACKET, -3, -3, TIFF_BYTE, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "XMLPacket", NULL},
+    /* begin SGI tags */
+    {TIFFTAG_MATTEING, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_EXTRASAMPLES, 0, 0, "Matteing", NULL},
+    {TIFFTAG_DATATYPE, -2, -1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_SAMPLEFORMAT, 0, 0, "DataType", NULL},
+    {TIFFTAG_IMAGEDEPTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_IMAGEDEPTH, 0, 0, "ImageDepth", NULL},
+    {TIFFTAG_TILEDEPTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_TILEDEPTH, 0, 0, "TileDepth", NULL},
+    /* end SGI tags */
+    /* begin Pixar tags */
+    {TIFFTAG_PIXAR_IMAGEFULLWIDTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImageFullWidth", NULL},
+    {TIFFTAG_PIXAR_IMAGEFULLLENGTH, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImageFullLength", NULL},
+    {TIFFTAG_PIXAR_TEXTUREFORMAT, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "TextureFormat", NULL},
+    {TIFFTAG_PIXAR_WRAPMODES, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "TextureWrapModes", NULL},
+    {TIFFTAG_PIXAR_FOVCOT, 1, 1, TIFF_FLOAT, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FieldOfViewCotangent", NULL},
+    {TIFFTAG_PIXAR_MATRIX_WORLDTOSCREEN, 16, 16, TIFF_FLOAT, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MatrixWorldToScreen", NULL},
+    {TIFFTAG_PIXAR_MATRIX_WORLDTOCAMERA, 16, 16, TIFF_FLOAT, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MatrixWorldToCamera", NULL},
+    {TIFFTAG_COPYRIGHT, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Copyright", NULL},
+    /* end Pixar tags */
+    {TIFFTAG_RICHTIFFIPTC, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "RichTIFFIPTC", NULL},
+    {TIFFTAG_PHOTOSHOP, -3, -3, TIFF_BYTE, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "Photoshop", NULL},
+    /*--: EXIFIFD and GPSIFD specified as TIFF_LONG by Aware-Systems and not TIFF_IFD8 as in original LibTiff. However, for IFD-like tags,
+     * libtiff uses the data type TIFF_IFD8 in tiffFields[]-tag definition combined with a special handling procedure in order to write either
+     * a 32-bit value and the TIFF_IFD type-id into ClassicTIFF files or a 64-bit value and the TIFF_IFD8 type-id into BigTIFF files. */
+    {TIFFTAG_EXIFIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EXIFIFDOffset", (TIFFFieldArray *)&exifFieldArray},
+    {TIFFTAG_ICCPROFILE, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ICC Profile", NULL},
+    {TIFFTAG_GPSIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "GPSIFDOffset", (TIFFFieldArray *)&gpsFieldArray},
+    {TIFFTAG_FAXRECVPARAMS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UINT32, FIELD_CUSTOM, TRUE, FALSE, "FaxRecvParams", NULL},
+    {TIFFTAG_FAXSUBADDRESS, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_ASCII, FIELD_CUSTOM, TRUE, FALSE, "FaxSubAddress", NULL},
+    {TIFFTAG_FAXRECVTIME, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UINT32, FIELD_CUSTOM, TRUE, FALSE, "FaxRecvTime", NULL},
+    {TIFFTAG_FAXDCS, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_ASCII, FIELD_CUSTOM, TRUE, FALSE, "FaxDcs", NULL},
+    {TIFFTAG_STONITS, 1, 1, TIFF_DOUBLE, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "StoNits", NULL},
+    {TIFFTAG_IMAGESOURCEDATA, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "Adobe Photoshop Document Data Block", NULL},
+    {TIFFTAG_INTEROPERABILITYIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 0, 0, "InteroperabilityIFDOffset", NULL},
+    /* begin DNG tags */
+    {TIFFTAG_DNGVERSION, 4, 4, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DNGVersion", NULL},
+    {TIFFTAG_DNGBACKWARDVERSION, 4, 4, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DNGBackwardVersion", NULL},
+    {TIFFTAG_UNIQUECAMERAMODEL, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "UniqueCameraModel", NULL},
+    {TIFFTAG_LOCALIZEDCAMERAMODEL, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "LocalizedCameraModel", NULL},
+    {TIFFTAG_CFAPLANECOLOR, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "CFAPlaneColor", NULL},
+    {TIFFTAG_CFALAYOUT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CFALayout", NULL},
+    {TIFFTAG_LINEARIZATIONTABLE, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "LinearizationTable", NULL},
+    {TIFFTAG_BLACKLEVELREPEATDIM, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_C0_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BlackLevelRepeatDim", NULL},
+    {TIFFTAG_BLACKLEVEL, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "BlackLevel", NULL},
+    {TIFFTAG_BLACKLEVELDELTAH, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "BlackLevelDeltaH", NULL},
+    {TIFFTAG_BLACKLEVELDELTAV, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "BlackLevelDeltaV", NULL},
+    {TIFFTAG_WHITELEVEL, -1, -1, TIFF_LONG, 0, TIFF_SETGET_C16_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "WhiteLevel", NULL},
+    {TIFFTAG_DEFAULTSCALE, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DefaultScale", NULL},
+    {TIFFTAG_BESTQUALITYSCALE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BestQualityScale", NULL},
+    {TIFFTAG_DEFAULTCROPORIGIN, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DefaultCropOrigin", NULL},
+    {TIFFTAG_DEFAULTCROPSIZE, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DefaultCropSize", NULL},
+    {TIFFTAG_COLORMATRIX1, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ColorMatrix1", NULL},
+    {TIFFTAG_COLORMATRIX2, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ColorMatrix2", NULL},
+    {TIFFTAG_CAMERACALIBRATION1, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "CameraCalibration1", NULL},
+    {TIFFTAG_CAMERACALIBRATION2, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "CameraCalibration2", NULL},
+    {TIFFTAG_REDUCTIONMATRIX1, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ReductionMatrix1", NULL},
+    {TIFFTAG_REDUCTIONMATRIX2, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ReductionMatrix2", NULL},
+    {TIFFTAG_ANALOGBALANCE, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "AnalogBalance", NULL},
+    {TIFFTAG_ASSHOTNEUTRAL, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "AsShotNeutral", NULL},
+    {TIFFTAG_ASSHOTWHITEXY, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "AsShotWhiteXY", NULL},
+    {TIFFTAG_BASELINEEXPOSURE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BaselineExposure", NULL},
+    {TIFFTAG_BASELINENOISE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BaselineNoise", NULL},
+    {TIFFTAG_BASELINESHARPNESS, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BaselineSharpness", NULL},
+    {TIFFTAG_BAYERGREENSPLIT, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BayerGreenSplit", NULL},
+    {TIFFTAG_LINEARRESPONSELIMIT, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LinearResponseLimit", NULL},
+    {TIFFTAG_CAMERASERIALNUMBER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CameraSerialNumber", NULL},
+    {TIFFTAG_LENSINFO, 4, 4, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensInfo", NULL},
+    {TIFFTAG_CHROMABLURRADIUS, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ChromaBlurRadius", NULL},
+    {TIFFTAG_ANTIALIASSTRENGTH, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "AntiAliasStrength", NULL},
+    {TIFFTAG_SHADOWSCALE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ShadowScale", NULL},
+    {TIFFTAG_DNGPRIVATEDATA, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "DNGPrivateData", NULL},
+    {TIFFTAG_MAKERNOTESAFETY, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MakerNoteSafety", NULL},
+    {TIFFTAG_CALIBRATIONILLUMINANT1, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CalibrationIlluminant1", NULL},
+    {TIFFTAG_CALIBRATIONILLUMINANT2, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CalibrationIlluminant2", NULL},
+    {TIFFTAG_RAWDATAUNIQUEID, 16, 16, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "RawDataUniqueID", NULL},
+    {TIFFTAG_ORIGINALRAWFILENAME, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "OriginalRawFileName", NULL},
+    {TIFFTAG_ORIGINALRAWFILEDATA, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "OriginalRawFileData", NULL},
+    {TIFFTAG_ACTIVEAREA, 4, 4, TIFF_LONG, 0, TIFF_SETGET_C0_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ActiveArea", NULL},
+    {TIFFTAG_MASKEDAREAS, -1, -1, TIFF_LONG, 0, TIFF_SETGET_C16_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "MaskedAreas", NULL},
+    {TIFFTAG_ASSHOTICCPROFILE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "AsShotICCProfile", NULL},
+    {TIFFTAG_ASSHOTPREPROFILEMATRIX, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "AsShotPreProfileMatrix", NULL},
+    {TIFFTAG_CURRENTICCPROFILE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "CurrentICCProfile", NULL},
+    {TIFFTAG_CURRENTPREPROFILEMATRIX, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "CurrentPreProfileMatrix", NULL},
+    {TIFFTAG_PERSAMPLE, 0, 0, TIFF_SHORT, 0, TIFF_SETGET_UNDEFINED, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "PerSample", NULL},
+#if 0
+    /* TODO: revert above #if 0 for TIFF 4.6.0 */
+
+    /* begin DNG 1.2.0.0 tags */
+    {TIFFTAG_COLORIMETRICREFERENCE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ColorimetricReference", NULL},
+    {TIFFTAG_CAMERACALIBRATIONSIGNATURE, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "CameraCalibrationSignature", NULL},
+    {TIFFTAG_PROFILECALIBRATIONSIGNATURE, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ProfileCalibrationSignature", NULL},
+    {TIFFTAG_EXTRACAMERAPROFILES, -1, -1, TIFF_IFD8, 0, TIFF_SETGET_C16_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ExtraCameraProfiles", NULL},
+    {TIFFTAG_ASSHOTPROFILENAME, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "AsShotProfileName", NULL},
+    {TIFFTAG_NOISEREDUCTIONAPPLIED, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "NoiseReductionApplied", NULL},
+    {TIFFTAG_PROFILENAME, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ProfileName", NULL},
+    {TIFFTAG_PROFILEHUESATMAPDIMS, 3, 3, TIFF_LONG, 0, TIFF_SETGET_C0_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ProfileHueSatMapDims", NULL},
+    {TIFFTAG_PROFILEHUESATMAPDATA1, -1, -1, TIFF_FLOAT, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ProfileHueSatMapData1", NULL},
+    {TIFFTAG_PROFILEHUESATMAPDATA2, -1, -1, TIFF_FLOAT, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ProfileHueSatMapData2", NULL},
+    {TIFFTAG_PROFILETONECURVE, -1, -1, TIFF_FLOAT, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ProfileToneCurve", NULL},
+    {TIFFTAG_PROFILEEMBEDPOLICY, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ProfileEmbedPolicy", NULL},
+    {TIFFTAG_PROFILECOPYRIGHT, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ProfileCopyright", NULL},
+    {TIFFTAG_FORWARDMATRIX1, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ForwardMatrix1", NULL},
+    {TIFFTAG_FORWARDMATRIX2, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ForwardMatrix2", NULL},
+    {TIFFTAG_PREVIEWAPPLICATIONNAME, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "PreviewApplicationName", NULL},
+    {TIFFTAG_PREVIEWAPPLICATIONVERSION, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "PreviewApplicationVersion", NULL},
+    {TIFFTAG_PREVIEWSETTINGSNAME, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "PreviewSettingsName", NULL},
+    {TIFFTAG_PREVIEWSETTINGSDIGEST, 16, 16, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PreviewSettingsDigest", NULL},
+    {TIFFTAG_PREVIEWCOLORSPACE, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PreviewColorSpace", NULL},
+    {TIFFTAG_PREVIEWDATETIME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PreviewDateTime", NULL},
+    {TIFFTAG_RAWIMAGEDIGEST, 16, 16, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "RawImageDigest", NULL},
+    {TIFFTAG_ORIGINALRAWFILEDIGEST, 16, 16, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OriginalRawFileDigest", NULL},
+    {TIFFTAG_SUBTILEBLOCKSIZE, 2, 2, TIFF_LONG, 0, TIFF_SETGET_C0_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubTileBlockSize", NULL},
+    {TIFFTAG_ROWINTERLEAVEFACTOR, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "RowInterleaveFactor", NULL},
+    {TIFFTAG_PROFILELOOKTABLEDIMS, 3, 3, TIFF_LONG, 0, TIFF_SETGET_C0_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ProfileLookTableDims", NULL},
+    {TIFFTAG_PROFILELOOKTABLEDATA, -1, -1, TIFF_FLOAT, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ProfileLookTableData", NULL},
+    /* begin DNG 1.3.0.0 tags */
+    {TIFFTAG_OPCODELIST1, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "OpcodeList1", NULL},
+    {TIFFTAG_OPCODELIST2, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "OpcodeList2", NULL},
+    {TIFFTAG_OPCODELIST3, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "OpcodeList3", NULL},
+    {TIFFTAG_NOISEPROFILE, -1, -1, TIFF_DOUBLE, 0, TIFF_SETGET_C16_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "NoiseProfile", NULL},
+    /* begin DNG 1.4.0.0 tags */
+    {TIFFTAG_DEFAULTUSERCROP, 4, 4, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DefaultUserCrop", NULL},
+    {TIFFTAG_DEFAULTBLACKRENDER, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DefaultBlackRender", NULL},
+    {TIFFTAG_BASELINEEXPOSUREOFFSET, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BaselineExposureOffset", NULL},
+    {TIFFTAG_PROFILELOOKTABLEENCODING, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ProfileLookTableEncoding", NULL},
+    {TIFFTAG_PROFILEHUESATMAPENCODING, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ProfileHueSatMapEncoding", NULL},
+    {TIFFTAG_ORIGINALDEFAULTFINALSIZE, 2, 2, TIFF_LONG, 0, TIFF_SETGET_C0_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OriginalDefaultFinalSize", NULL},
+    {TIFFTAG_ORIGINALBESTQUALITYFINALSIZE, 2, 2, TIFF_LONG, 0, TIFF_SETGET_C0_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OriginalBestQualityFinalSize", NULL},
+    {TIFFTAG_ORIGINALDEFAULTCROPSIZE, 2, 2, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OriginalDefaultCropSize", NULL}, /* could also be rational */
+    {TIFFTAG_NEWRAWIMAGEDIGEST, 16, 16, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "NewRawImageDigest", NULL},
+    {TIFFTAG_RAWTOPREVIEWGAIN, 1, 1, TIFF_DOUBLE, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "RawToPreviewGain", NULL},
+    /* begin DNG 1.5.0.0 tags */
+    {TIFFTAG_DEPTHFORMAT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DepthFormat", NULL},
+    {TIFFTAG_DEPTHNEAR, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DepthNear", NULL},
+    {TIFFTAG_DEPTHFAR, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DepthFar", NULL},
+    {TIFFTAG_DEPTHUNITS, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DepthUnits", NULL},
+    {TIFFTAG_DEPTHMEASURETYPE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DepthMeasureType", NULL},
+    {TIFFTAG_ENHANCEPARAMS, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EnhanceParams", NULL},
+    /* begin DNG 1.6.0.0 tags */
+    {TIFFTAG_PROFILEGAINTABLEMAP, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ProfileGainTableMap", NULL},
+    {TIFFTAG_SEMANTICNAME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SemanticName", NULL},
+    {TIFFTAG_SEMANTICINSTANCEID, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SemanticInstanceID", NULL},
+    {TIFFTAG_MASKSUBAREA, 4, 4, TIFF_LONG, 0, TIFF_SETGET_C0_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MaskSubArea", NULL},
+    {TIFFTAG_RGBTABLES, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "RGBTables", NULL},
+    {TIFFTAG_CALIBRATIONILLUMINANT3, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CalibrationIlluminant3", NULL},
+    {TIFFTAG_COLORMATRIX3, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ColorMatrix3", NULL},
+    {TIFFTAG_CAMERACALIBRATION3, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "CameraCalibration3", NULL},
+    {TIFFTAG_REDUCTIONMATRIX3, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ReductionMatrix3", NULL},
+    {TIFFTAG_PROFILEHUESATMAPDATA3, -1, -1, TIFF_FLOAT, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ProfileHueSatMapData3", NULL},
+    {TIFFTAG_FORWARDMATRIX3, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ForwardMatrix3", NULL},
+    {TIFFTAG_ILLUMINANTDATA1, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "IlluminantData1", NULL},
+    {TIFFTAG_ILLUMINANTDATA2, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "IlluminantData2", NULL},
+    {TIFFTAG_ILLUMINANTDATA3, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "IlluminantData3", NULL},
+    /* end DNG tags */
+    /* begin TIFF/EP tags */
+    {TIFFTAG_EP_CFAREPEATPATTERNDIM, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_C0_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP CFARepeatPatternDim", NULL},
+    {TIFFTAG_EP_CFAPATTERN, -1, -1, TIFF_BYTE, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP CFAPattern", NULL},
+    /* TIFFTAG_EP_BATTERYLEVEL can be RATIONAL or ASCII.
+     * LibTiff defines it as ASCII and converts RATIONAL to an ASCII string. */
+    {TIFFTAG_EP_BATTERYLEVEL, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP BatteryLevel", NULL},
+    {TIFFTAG_EP_INTERLACE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP Interlace", NULL},
+    /* TIFFTAG_EP_IPTC_NAA and TIFFTAG_RICHTIFFIPTC share the same tag number (33723)
+     *   LibTIFF type is UNDEFINED or BYTE, but often times incorrectly specified as LONG, because TIFF/EP (ISO/DIS 12234-2) specifies type LONG or ASCII. */
+    {TIFFTAG_EP_TIMEZONEOFFSET, -1, -1, TIFF_SSHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP TimeZoneOffset", NULL},
+    {TIFFTAG_EP_SELFTIMERMODE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP SelfTimerMode", NULL},
+    {TIFFTAG_EP_FLASHENERGY, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP FlashEnergy", NULL},
+    {TIFFTAG_EP_SPATIALFREQUENCYRESPONSE, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP SpatialFrequencyResponse", NULL},
+    {TIFFTAG_EP_NOISE, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP Noise", NULL},
+    {TIFFTAG_EP_FOCALPLANEXRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP FocalPlaneXResolution", NULL},
+    {TIFFTAG_EP_FOCALPLANEYRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP FocalPlaneYResolution", NULL},
+    {TIFFTAG_EP_FOCALPLANERESOLUTIONUNIT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP FocalPlaneResolutionUnit", NULL},
+    {TIFFTAG_EP_IMAGENUMBER, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP ImageNumber", NULL}, /* or SHORT */
+    {TIFFTAG_EP_SECURITYCLASSIFICATION, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP SecurityClassification", NULL},
+    {TIFFTAG_EP_IMAGEHISTORY, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP ImageHistory", NULL},
+    {TIFFTAG_EP_EXPOSUREINDEX, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP ExposureIndex", NULL},
+    {TIFFTAG_EP_STANDARDID, 4, 4, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP StandardId", NULL},
+    {TIFFTAG_EP_SENSINGMETHOD, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP SensingMethod", NULL},
+    /* TIFF/EP tags equivalent to EXIF tags, sometimes defined differently. */
+    {TIFFTAG_EP_EXPOSURETIME, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP ExposureTime", NULL}, /*N=1 or 2 */
+    {TIFFTAG_EP_FNUMBER, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP FNumber", NULL},
+    {TIFFTAG_EP_EXPOSUREPROGRAM, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP ExposureProgram", NULL},
+    {TIFFTAG_EP_SPECTRALSENSITIVITY, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP SpectralSensitivity", NULL},
+    {TIFFTAG_EP_ISOSPEEDRATINGS, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP ISOSpeedRatings", NULL},
+    {TIFFTAG_EP_OECF, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP OptoelectricConversionFactor", NULL},
+    {TIFFTAG_EP_DATETIMEORIGINAL, 20, 20, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP DateTimeOriginal", NULL},
+    {TIFFTAG_EP_COMPRESSEDBITSPERPIXEL, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP CompressedBitsPerPixel", NULL},
+    {TIFFTAG_EP_SHUTTERSPEEDVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP ShutterSpeedValue", NULL},
+    {TIFFTAG_EP_APERTUREVALUE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP ApertureValue", NULL},
+    {TIFFTAG_EP_BRIGHTNESSVALUE, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP BrightnessValue", NULL},
+    {TIFFTAG_EP_EXPOSUREBIASVALUE, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP ExposureBiasValue", NULL}, /*N=1 or 2 */
+    {TIFFTAG_EP_MAXAPERTUREVALUE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP MaxApertureValue", NULL},
+    {TIFFTAG_EP_SUBJECTDISTANCE, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP SubjectDistance", NULL},
+    {TIFFTAG_EP_METERINGMODE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP MeteringMode", NULL},
+    {TIFFTAG_EP_LIGHTSOURCE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP LightSource", NULL},
+    {TIFFTAG_EP_FLASH, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "EP Flash", NULL},
+    {TIFFTAG_EP_FOCALLENGTH, -1, -1, TIFF_RATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP FocalLength", NULL},
+    {TIFFTAG_EP_SUBJECTLOCATION, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "EP SubjectLocation", NULL},
+    /* end TIFF/EP tags */
+#endif
+    /* begin TIFF/FX tags */
+    {TIFFTAG_INDEXED, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Indexed", NULL},
+    {TIFFTAG_GLOBALPARAMETERSIFD, 1, 1, TIFF_IFD8, 0, TIFF_SETGET_IFD8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "GlobalParametersIFD", NULL},
+    {TIFFTAG_PROFILETYPE, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ProfileType", NULL},
+    {TIFFTAG_FAXPROFILE, 1, 1, TIFF_BYTE, 0, TIFF_SETGET_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FaxProfile", NULL},
+    {TIFFTAG_CODINGMETHODS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CodingMethods", NULL},
+    {TIFFTAG_VERSIONYEAR, 4, 4, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "VersionYear", NULL},
+    {TIFFTAG_MODENUMBER, 1, 1, TIFF_BYTE, 0, TIFF_SETGET_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ModeNumber", NULL},
+    {TIFFTAG_DECODE, -1, -1, TIFF_SRATIONAL, 0, TIFF_SETGET_C16_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "Decode", NULL},
+    {TIFFTAG_IMAGEBASECOLOR, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ImageBaseColor", NULL},
+    {TIFFTAG_T82OPTIONS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "T82Options", NULL},
+    {TIFFTAG_STRIPROWCOUNTS, -1, -1, TIFF_LONG, 0, TIFF_SETGET_C16_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "StripRowCounts", NULL},
+    {TIFFTAG_IMAGELAYER, 2, 2, TIFF_LONG, 0, TIFF_SETGET_C0_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImageLayer", NULL},
+    /* end TIFF/FX tags */
+    /* begin pseudo tags */
 };
 
 /*
  * EXIF tags  (Version 2.31, July 2016 plus version 2.32 May 2019)
  */
-static const TIFFField
-exifFields[] = {
-	{ EXIFTAG_EXPOSURETIME, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureTime", NULL },
-	{ EXIFTAG_FNUMBER, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FNumber", NULL },
-	{ EXIFTAG_EXPOSUREPROGRAM, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureProgram", NULL },
-	{ EXIFTAG_SPECTRALSENSITIVITY, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SpectralSensitivity", NULL },
-	{ EXIFTAG_ISOSPEEDRATINGS, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ISOSpeedRatings", NULL },
-	{ EXIFTAG_OECF, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "OptoelectricConversionFactor", NULL },
-	{ EXIFTAG_SENSITIVITYTYPE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SensitivityType", NULL },
-	{ EXIFTAG_STANDARDOUTPUTSENSITIVITY, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "StandardOutputSensitivity", NULL },
-	{ EXIFTAG_RECOMMENDEDEXPOSUREINDEX, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "RecommendedExposureIndex", NULL },
-	{ EXIFTAG_ISOSPEED, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ISOSpeed", NULL },
-	{ EXIFTAG_ISOSPEEDLATITUDEYYY, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ISOSpeedLatitudeyyy", NULL },
-	{ EXIFTAG_ISOSPEEDLATITUDEZZZ, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ISOSpeedLatitudezzz", NULL },
-	{ EXIFTAG_EXIFVERSION, 4, 4, TIFF_UNDEFINED, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExifVersion", NULL },
-	{ EXIFTAG_DATETIMEORIGINAL, 20, 20, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DateTimeOriginal", NULL },
-	{ EXIFTAG_DATETIMEDIGITIZED, 20, 20, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DateTimeDigitized", NULL },
-	{ EXIFTAG_OFFSETTIME, 7, 7, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OffsetTime", NULL },
-	{ EXIFTAG_OFFSETTIMEORIGINAL, 7, 7, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OffsetTimeOriginal", NULL },
-	{ EXIFTAG_OFFSETTIMEDIGITIZED, 7, 7, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OffsetTimeDigitized", NULL },
-	{ EXIFTAG_COMPONENTSCONFIGURATION, 4, 4, TIFF_UNDEFINED, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ComponentsConfiguration", NULL },
-	{ EXIFTAG_COMPRESSEDBITSPERPIXEL, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CompressedBitsPerPixel", NULL },
-	{ EXIFTAG_SHUTTERSPEEDVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ShutterSpeedValue", NULL },
-	{ EXIFTAG_APERTUREVALUE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ApertureValue", NULL },
-	{ EXIFTAG_BRIGHTNESSVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BrightnessValue", NULL },
-	{ EXIFTAG_EXPOSUREBIASVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureBiasValue", NULL },
-	{ EXIFTAG_MAXAPERTUREVALUE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MaxApertureValue", NULL },
-	/*--: EXIFTAG_SUBJECTDISTANCE: LibTiff returns value of "-1" if numerator equals 4294967295 (0xFFFFFFFF) to indicate infinite distance!
-	 * However, there are two other EXIF tags where numerator indicates a special value and six other cases where the denominator indicates special values,
-	 * which are not treated within LibTiff!! */
-	{ EXIFTAG_SUBJECTDISTANCE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubjectDistance", NULL },
-	{ EXIFTAG_METERINGMODE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MeteringMode", NULL },
-	{ EXIFTAG_LIGHTSOURCE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LightSource", NULL },
-	{ EXIFTAG_FLASH, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Flash", NULL },
-	{ EXIFTAG_FOCALLENGTH, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalLength", NULL },
-	{ EXIFTAG_SUBJECTAREA, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "SubjectArea", NULL },
-	{ EXIFTAG_MAKERNOTE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "MakerNote", NULL },
-	{ EXIFTAG_USERCOMMENT, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "UserComment", NULL },
-	{ EXIFTAG_SUBSECTIME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubSecTime", NULL },
-	{ EXIFTAG_SUBSECTIMEORIGINAL, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubSecTimeOriginal", NULL },
-	{ EXIFTAG_SUBSECTIMEDIGITIZED, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubSecTimeDigitized", NULL },
-	{ EXIFTAG_TEMPERATURE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Temperature", NULL },
-	{ EXIFTAG_HUMIDITY, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Humidity", NULL },
-	{ EXIFTAG_PRESSURE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Pressure", NULL },
-	{ EXIFTAG_WATERDEPTH, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "WaterDepth", NULL },
-	{ EXIFTAG_ACCELERATION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Acceleration", NULL },
-	{ EXIFTAG_CAMERAELEVATIONANGLE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CameraElevationAngle", NULL },
-	{ EXIFTAG_FLASHPIXVERSION, 4, 4, TIFF_UNDEFINED, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FlashpixVersion", NULL },
-	{ EXIFTAG_COLORSPACE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ColorSpace", NULL },
-	{ EXIFTAG_PIXELXDIMENSION, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PixelXDimension", NULL },
-	{ EXIFTAG_PIXELYDIMENSION, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PixelYDimension", NULL },
-	{ EXIFTAG_RELATEDSOUNDFILE, 13, 13, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "RelatedSoundFile", NULL },
-	{ EXIFTAG_FLASHENERGY, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FlashEnergy", NULL },
-	{ EXIFTAG_SPATIALFREQUENCYRESPONSE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "SpatialFrequencyResponse", NULL },
-	{ EXIFTAG_FOCALPLANEXRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalPlaneXResolution", NULL },
-	{ EXIFTAG_FOCALPLANEYRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalPlaneYResolution", NULL },
-	{ EXIFTAG_FOCALPLANERESOLUTIONUNIT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalPlaneResolutionUnit", NULL },
-	{ EXIFTAG_SUBJECTLOCATION, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_C0_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubjectLocation", NULL },
-	{ EXIFTAG_EXPOSUREINDEX, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureIndex", NULL },
-	{ EXIFTAG_SENSINGMETHOD, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SensingMethod", NULL },
-	{ EXIFTAG_FILESOURCE, 1, 1, TIFF_UNDEFINED, 0, TIFF_SETGET_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FileSource", NULL },
-	{ EXIFTAG_SCENETYPE, 1, 1, TIFF_UNDEFINED, 0, TIFF_SETGET_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SceneType", NULL },
-	{ EXIFTAG_CFAPATTERN, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "CFAPattern", NULL },
-	{ EXIFTAG_CUSTOMRENDERED, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CustomRendered", NULL },
-	{ EXIFTAG_EXPOSUREMODE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureMode", NULL },
-	{ EXIFTAG_WHITEBALANCE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "WhiteBalance", NULL },
-	{ EXIFTAG_DIGITALZOOMRATIO, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DigitalZoomRatio", NULL },
-	{ EXIFTAG_FOCALLENGTHIN35MMFILM, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalLengthIn35mmFilm", NULL },
-	{ EXIFTAG_SCENECAPTURETYPE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SceneCaptureType", NULL },
-	{ EXIFTAG_GAINCONTROL, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "GainControl", NULL },
-	{ EXIFTAG_CONTRAST, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Contrast", NULL },
-	{ EXIFTAG_SATURATION, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Saturation", NULL },
-	{ EXIFTAG_SHARPNESS, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Sharpness", NULL },
-	{ EXIFTAG_DEVICESETTINGDESCRIPTION, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "DeviceSettingDescription", NULL },
-	{ EXIFTAG_SUBJECTDISTANCERANGE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubjectDistanceRange", NULL },
-	{ EXIFTAG_IMAGEUNIQUEID, 33, 33, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImageUniqueID", NULL },
-	{ EXIFTAG_CAMERAOWNERNAME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CameraOwnerName", NULL },
-	{ EXIFTAG_BODYSERIALNUMBER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BodySerialNumber", NULL },
-	{ EXIFTAG_LENSSPECIFICATION, 4, 4, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensSpecification", NULL },
-	{ EXIFTAG_LENSMAKE, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensMake", NULL },
-	{ EXIFTAG_LENSMODEL, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensModel", NULL },
-	{ EXIFTAG_LENSSERIALNUMBER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensSerialNumber", NULL },
-	{ EXIFTAG_GAMMA, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Gamma", NULL },
-	{ EXIFTAG_COMPOSITEIMAGE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CompositeImage", NULL },
-	{ EXIFTAG_SOURCEIMAGENUMBEROFCOMPOSITEIMAGE, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_C0_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SourceImageNumberOfCompositeImage", NULL },
-	{ EXIFTAG_SOURCEEXPOSURETIMESOFCOMPOSITEIMAGE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "SourceExposureTimesOfCompositeImage", NULL }
-};
+static const TIFFField exifFields[] = {
+    {EXIFTAG_EXPOSURETIME, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureTime", NULL},
+    {EXIFTAG_FNUMBER, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FNumber", NULL},
+    {EXIFTAG_EXPOSUREPROGRAM, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureProgram", NULL},
+    {EXIFTAG_SPECTRALSENSITIVITY, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SpectralSensitivity", NULL},
+    /* After EXIF 2.2.1 ISOSpeedRatings is named PhotographicSensitivity. In addition, while "Count=Any", only 1 count should be used. */
+    {EXIFTAG_ISOSPEEDRATINGS, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ISOSpeedRatings", NULL},
+    {EXIFTAG_OECF, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "OptoelectricConversionFactor", NULL},
+    {EXIFTAG_SENSITIVITYTYPE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SensitivityType", NULL},
+    {EXIFTAG_STANDARDOUTPUTSENSITIVITY, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "StandardOutputSensitivity", NULL},
+    {EXIFTAG_RECOMMENDEDEXPOSUREINDEX, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "RecommendedExposureIndex", NULL},
+    {EXIFTAG_ISOSPEED, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ISOSpeed", NULL},
+    {EXIFTAG_ISOSPEEDLATITUDEYYY, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ISOSpeedLatitudeyyy", NULL},
+    {EXIFTAG_ISOSPEEDLATITUDEZZZ, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ISOSpeedLatitudezzz", NULL},
+    {EXIFTAG_EXIFVERSION, 4, 4, TIFF_UNDEFINED, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExifVersion", NULL},
+    {EXIFTAG_DATETIMEORIGINAL, 20, 20, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DateTimeOriginal", NULL},
+    {EXIFTAG_DATETIMEDIGITIZED, 20, 20, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DateTimeDigitized", NULL},
+    {EXIFTAG_OFFSETTIME, 7, 7, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OffsetTime", NULL},
+    {EXIFTAG_OFFSETTIMEORIGINAL, 7, 7, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OffsetTimeOriginal", NULL},
+    {EXIFTAG_OFFSETTIMEDIGITIZED, 7, 7, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "OffsetTimeDigitized", NULL},
+    {EXIFTAG_COMPONENTSCONFIGURATION, 4, 4, TIFF_UNDEFINED, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ComponentsConfiguration", NULL},
+    {EXIFTAG_COMPRESSEDBITSPERPIXEL, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CompressedBitsPerPixel", NULL},
+    {EXIFTAG_SHUTTERSPEEDVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ShutterSpeedValue", NULL},
+    {EXIFTAG_APERTUREVALUE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ApertureValue", NULL},
+    {EXIFTAG_BRIGHTNESSVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BrightnessValue", NULL},
+    {EXIFTAG_EXPOSUREBIASVALUE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureBiasValue", NULL},
+    {EXIFTAG_MAXAPERTUREVALUE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MaxApertureValue", NULL},
+    /*--: EXIFTAG_SUBJECTDISTANCE: LibTiff returns value of "-1" if numerator equals 4294967295 (0xFFFFFFFF) to indicate infinite distance! 
+     *    However, there are two other EXIF tags where numerator indicates a special value and six other cases where the denominator indicates special values,
+     *    which are not treated within LibTiff!! */
+    {EXIFTAG_SUBJECTDISTANCE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubjectDistance", NULL},
+    {EXIFTAG_METERINGMODE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MeteringMode", NULL},
+    {EXIFTAG_LIGHTSOURCE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LightSource", NULL},
+    {EXIFTAG_FLASH, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Flash", NULL},
+    {EXIFTAG_FOCALLENGTH, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalLength", NULL},
+    {EXIFTAG_SUBJECTAREA, -1, -1, TIFF_SHORT, 0, TIFF_SETGET_C16_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "SubjectArea", NULL},
+    {EXIFTAG_MAKERNOTE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "MakerNote", NULL},
+    {EXIFTAG_USERCOMMENT, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "UserComment", NULL},
+    {EXIFTAG_SUBSECTIME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubSecTime", NULL},
+    {EXIFTAG_SUBSECTIMEORIGINAL, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubSecTimeOriginal", NULL},
+    {EXIFTAG_SUBSECTIMEDIGITIZED, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubSecTimeDigitized", NULL},
+    {EXIFTAG_TEMPERATURE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Temperature", NULL},
+    {EXIFTAG_HUMIDITY, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Humidity", NULL},
+    {EXIFTAG_PRESSURE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Pressure", NULL},
+    {EXIFTAG_WATERDEPTH, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "WaterDepth", NULL},
+    {EXIFTAG_ACCELERATION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Acceleration", NULL},
+    {EXIFTAG_CAMERAELEVATIONANGLE, 1, 1, TIFF_SRATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CameraElevationAngle", NULL},
+    {EXIFTAG_FLASHPIXVERSION, 4, 4, TIFF_UNDEFINED, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FlashpixVersion", NULL},
+    {EXIFTAG_COLORSPACE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ColorSpace", NULL},
+    {EXIFTAG_PIXELXDIMENSION, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PixelXDimension", NULL},
+    {EXIFTAG_PIXELYDIMENSION, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "PixelYDimension", NULL},
+    {EXIFTAG_RELATEDSOUNDFILE, 13, 13, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "RelatedSoundFile", NULL},
+    {EXIFTAG_FLASHENERGY, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FlashEnergy", NULL},
+    {EXIFTAG_SPATIALFREQUENCYRESPONSE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "SpatialFrequencyResponse", NULL},
+    {EXIFTAG_FOCALPLANEXRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalPlaneXResolution", NULL},
+    {EXIFTAG_FOCALPLANEYRESOLUTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalPlaneYResolution", NULL},
+    {EXIFTAG_FOCALPLANERESOLUTIONUNIT, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalPlaneResolutionUnit", NULL},
+    {EXIFTAG_SUBJECTLOCATION, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_C0_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubjectLocation", NULL},
+    {EXIFTAG_EXPOSUREINDEX, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureIndex", NULL},
+    {EXIFTAG_SENSINGMETHOD, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SensingMethod", NULL},
+    {EXIFTAG_FILESOURCE, 1, 1, TIFF_UNDEFINED, 0, TIFF_SETGET_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FileSource", NULL},
+    {EXIFTAG_SCENETYPE, 1, 1, TIFF_UNDEFINED, 0, TIFF_SETGET_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SceneType", NULL},
+    {EXIFTAG_CFAPATTERN, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "CFAPattern", NULL},
+    {EXIFTAG_CUSTOMRENDERED, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CustomRendered", NULL},
+    {EXIFTAG_EXPOSUREMODE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ExposureMode", NULL},
+    {EXIFTAG_WHITEBALANCE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "WhiteBalance", NULL},
+    {EXIFTAG_DIGITALZOOMRATIO, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DigitalZoomRatio", NULL},
+    {EXIFTAG_FOCALLENGTHIN35MMFILM, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "FocalLengthIn35mmFilm", NULL},
+    {EXIFTAG_SCENECAPTURETYPE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SceneCaptureType", NULL},
+    {EXIFTAG_GAINCONTROL, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "GainControl", NULL},
+    {EXIFTAG_CONTRAST, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Contrast", NULL},
+    {EXIFTAG_SATURATION, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Saturation", NULL},
+    {EXIFTAG_SHARPNESS, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Sharpness", NULL},
+    {EXIFTAG_DEVICESETTINGDESCRIPTION, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "DeviceSettingDescription", NULL},
+    {EXIFTAG_SUBJECTDISTANCERANGE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SubjectDistanceRange", NULL},
+    {EXIFTAG_IMAGEUNIQUEID, 33, 33, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImageUniqueID", NULL},
+    {EXIFTAG_CAMERAOWNERNAME, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CameraOwnerName", NULL},
+    {EXIFTAG_BODYSERIALNUMBER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "BodySerialNumber", NULL},
+    {EXIFTAG_LENSSPECIFICATION, 4, 4, TIFF_RATIONAL, 0, TIFF_SETGET_C0_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensSpecification", NULL},
+    {EXIFTAG_LENSMAKE, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensMake", NULL},
+    {EXIFTAG_LENSMODEL, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensModel", NULL},
+    {EXIFTAG_LENSSERIALNUMBER, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LensSerialNumber", NULL},
+    {EXIFTAG_GAMMA, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_FLOAT, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Gamma", NULL},
+    {EXIFTAG_COMPOSITEIMAGE, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "CompositeImage", NULL},
+    {EXIFTAG_SOURCEIMAGENUMBEROFCOMPOSITEIMAGE, 2, 2, TIFF_SHORT, 0, TIFF_SETGET_C0_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SourceImageNumberOfCompositeImage", NULL},
+    {EXIFTAG_SOURCEEXPOSURETIMESOFCOMPOSITEIMAGE, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1,
+     "SourceExposureTimesOfCompositeImage", NULL}};
 /*
- * EXIF-GPS tags  (Version 2.31, July 2016; nothing changed for version 2.32 May 2019)
+ * EXIF-GPS tags  (Version 2.31, July 2016; nothing changed for version 2.32 May
+ * 2019)
  */
 
-static TIFFField
-gpsFields[] = {
-	/*  For the GPS tag definitions in gpsFields[] the standard definition for Rationals is TIFF_SETGET_DOUBLE and TIFF_SETGET_C0_FLOAT.
-	 *-- ATTENTION: After the upgrade with Rational2Double, the GPSTAG values can now be written and also read in double precision!
-	 *              In order to achieve double precision for GPS tags:
-	 *              Standard definitions for GPSTAG is kept to TIFF_SETGET_DOUBLE 
-	 *              and TIFF_SETGET_C0_FLOAT is changed to TIFF_SETGET_C0_DOUBLE.
-	 */
-	{		GPSTAG_VERSIONID	, 4, 4, 	TIFF_BYTE	, 0, 	TIFF_SETGET_C0_UINT8	, TIFF_SETGET_UINT8	, FIELD_CUSTOM	, 1, 0, 	"VersionID", NULL },
-	{		GPSTAG_LATITUDEREF	, 2, 2,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"LatitudeRef", NULL },
-	{		GPSTAG_LATITUDE	, 3, 3,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_C0_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"Latitude", NULL },
-	{		GPSTAG_LONGITUDEREF	, 2, 2,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"LongitudeRef", NULL },
-	{		GPSTAG_LONGITUDE	, 3, 3,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_C0_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"Longitude", NULL },
-	{		GPSTAG_ALTITUDEREF	, 1, 1,	TIFF_BYTE	, 0, 	TIFF_SETGET_UINT8	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"AltitudeRef", NULL },
-	{		GPSTAG_ALTITUDE	, 1, 1,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"Altitude", NULL },
-	{		GPSTAG_TIMESTAMP	, 3, 3,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_C0_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"TimeStamp", NULL },
-	{		GPSTAG_SATELLITES	, -1, -1,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"Satellites", NULL },
-	{		GPSTAG_STATUS	, 2, 2,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"Status", NULL },
-	{		GPSTAG_MEASUREMODE	, 2, 2,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"MeasureMode", NULL },
-	{		GPSTAG_DOP	, 1, 1,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"DOP", NULL },
-	{		GPSTAG_SPEEDREF	, 2, 2,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"SpeedRef", NULL },
-	{		GPSTAG_SPEED	, 1, 1,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"Speed", NULL },
-	{		GPSTAG_TRACKREF	, 2, 2,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"TrackRef", NULL },
-	{		GPSTAG_TRACK	, 1, 1,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"Track", NULL },
-	{		GPSTAG_IMGDIRECTIONREF	, 2, 2,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"ImgDirectionRef", NULL },
-	{		GPSTAG_IMGDIRECTION	, 1, 1,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"ImgDirection", NULL },
-	{		GPSTAG_MAPDATUM	, -1, -1,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"MapDatum", NULL },
-	{		GPSTAG_DESTLATITUDEREF	, 2, 2,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"DestLatitudeRef", NULL },
-	{		GPSTAG_DESTLATITUDE	, 3, 3,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_C0_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"DestLatitude", NULL },
-	{		GPSTAG_DESTLONGITUDEREF	, 2, 2,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"DestLongitudeRef", NULL },
-	{		GPSTAG_DESTLONGITUDE	, 3, 3,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_C0_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"DestLongitude", NULL },
-	{		GPSTAG_DESTBEARINGREF	, 2, 2,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"DestBearingRef", NULL },
-	{		GPSTAG_DESTBEARING	, 1, 1,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"DestBearing", NULL },
-	{		GPSTAG_DESTDISTANCEREF	, 2, 2,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"DestDistanceRef", NULL },
-	{		GPSTAG_DESTDISTANCE	, 1, 1,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"DestDistance", NULL },
-	{		GPSTAG_PROCESSINGMETHOD	, -1, -1,	TIFF_UNDEFINED	, 0, 	TIFF_SETGET_C16_UINT8	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 1, 	"ProcessingMethod", NULL },
-	{		GPSTAG_AREAINFORMATION	, -1, -1,	TIFF_UNDEFINED	, 0, 	TIFF_SETGET_C16_UINT8	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 1, 	"AreaInformation", NULL },
-	{		GPSTAG_DATESTAMP	, 11, 11,	TIFF_ASCII	, 0, 	TIFF_SETGET_ASCII	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"DateStamp", NULL },
-	{		GPSTAG_DIFFERENTIAL	, 1, 1,	TIFF_SHORT	, 0, 	TIFF_SETGET_UINT16	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"Differential", NULL },
-	{		GPSTAG_GPSHPOSITIONINGERROR	, 1, 1,	TIFF_RATIONAL	, 0, 	TIFF_SETGET_DOUBLE	, TIFF_SETGET_UNDEFINED	, FIELD_CUSTOM	, 1, 0, 	"HorizontalPositioningError", NULL }
-};
-
-static const TIFFFieldArray
-tiffFieldArray = { tfiatImage, 0, TIFFArrayCount(tiffFields), (TIFFField*) tiffFields };
-static const TIFFFieldArray
-exifFieldArray = { tfiatExif, 0, TIFFArrayCount(exifFields), (TIFFField*) exifFields };
-static const TIFFFieldArray
-gpsFieldArray = { tfiatGps, 0, TIFFArrayCount(gpsFields), (TIFFField*) gpsFields };
+static const TIFFField gpsFields[] = {
+    /*  For the GPS tag definitions in gpsFields[] the standard definition for Rationals is TIFF_SETGET_DOUBLE and TIFF_SETGET_C0_FLOAT.
+     *-- ATTENTION: After the upgrade with Rational2Double, the GPSTAG values can now be written and also read in double precision!
+     *              In order to achieve double precision for GPS tags: Standard definitions for GPSTAG is kept to TIFF_SETGET_DOUBLE
+     *              and TIFF_SETGET_C0_FLOAT is changed to TIFF_SETGET_C0_DOUBLE.
+     */
+    {GPSTAG_VERSIONID, 4, 4, TIFF_BYTE, 0, TIFF_SETGET_C0_UINT8, TIFF_SETGET_UINT8, FIELD_CUSTOM, 1, 0, "VersionID", NULL},
+    {GPSTAG_LATITUDEREF, 2, 2, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LatitudeRef", NULL},
+    {GPSTAG_LATITUDE, 3, 3, TIFF_RATIONAL, 0, TIFF_SETGET_C0_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Latitude", NULL},
+    {GPSTAG_LONGITUDEREF, 2, 2, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "LongitudeRef", NULL},
+    {GPSTAG_LONGITUDE, 3, 3, TIFF_RATIONAL, 0, TIFF_SETGET_C0_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Longitude", NULL},
+    {GPSTAG_ALTITUDEREF, 1, 1, TIFF_BYTE, 0, TIFF_SETGET_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "AltitudeRef", NULL},
+    {GPSTAG_ALTITUDE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Altitude", NULL},
+    {GPSTAG_TIMESTAMP, 3, 3, TIFF_RATIONAL, 0, TIFF_SETGET_C0_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "TimeStamp", NULL},
+    {GPSTAG_SATELLITES, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Satellites", NULL},
+    {GPSTAG_STATUS, 2, 2, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Status", NULL},
+    {GPSTAG_MEASUREMODE, 2, 2, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MeasureMode", NULL},
+    {GPSTAG_DOP, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DOP", NULL},
+    {GPSTAG_SPEEDREF, 2, 2, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "SpeedRef", NULL},
+    {GPSTAG_SPEED, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Speed", NULL},
+    {GPSTAG_TRACKREF, 2, 2, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "TrackRef", NULL},
+    {GPSTAG_TRACK, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Track", NULL},
+    {GPSTAG_IMGDIRECTIONREF, 2, 2, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImgDirectionRef", NULL},
+    {GPSTAG_IMGDIRECTION, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "ImgDirection", NULL},
+    {GPSTAG_MAPDATUM, -1, -1, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "MapDatum", NULL},
+    {GPSTAG_DESTLATITUDEREF, 2, 2, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DestLatitudeRef", NULL},
+    {GPSTAG_DESTLATITUDE, 3, 3, TIFF_RATIONAL, 0, TIFF_SETGET_C0_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DestLatitude", NULL},
+    {GPSTAG_DESTLONGITUDEREF, 2, 2, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DestLongitudeRef", NULL},
+    {GPSTAG_DESTLONGITUDE, 3, 3, TIFF_RATIONAL, 0, TIFF_SETGET_C0_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DestLongitude", NULL},
+    {GPSTAG_DESTBEARINGREF, 2, 2, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DestBearingRef", NULL},
+    {GPSTAG_DESTBEARING, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DestBearing", NULL},
+    {GPSTAG_DESTDISTANCEREF, 2, 2, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DestDistanceRef", NULL},
+    {GPSTAG_DESTDISTANCE, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DestDistance", NULL},
+    {GPSTAG_PROCESSINGMETHOD, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "ProcessingMethod", NULL},
+    {GPSTAG_AREAINFORMATION, -1, -1, TIFF_UNDEFINED, 0, TIFF_SETGET_C16_UINT8, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 1, "AreaInformation", NULL},
+    {GPSTAG_DATESTAMP, 11, 11, TIFF_ASCII, 0, TIFF_SETGET_ASCII, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "DateStamp", NULL},
+    {GPSTAG_DIFFERENTIAL, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "Differential", NULL},
+    {GPSTAG_GPSHPOSITIONINGERROR, 1, 1, TIFF_RATIONAL, 0, TIFF_SETGET_DOUBLE, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, 1, 0, "HorizontalPositioningError", NULL}};
+/* clang-format on */ /* was off for better readability of tag comments */
+
+static const TIFFFieldArray tiffFieldArray = {
+    tfiatImage, 0, TIFFArrayCount(tiffFields), (TIFFField *)tiffFields};
+static const TIFFFieldArray exifFieldArray = {
+    tfiatExif, 0, TIFFArrayCount(exifFields), (TIFFField *)exifFields};
+static const TIFFFieldArray gpsFieldArray = {
+    tfiatGps, 0, TIFFArrayCount(gpsFields), (TIFFField *)gpsFields};
 
 /*
  *  We have our own local lfind() equivalent to avoid subtle differences
- *  in types passed to lfind() on different systems. 
+ *  in types passed to lfind() on different systems.
  */
 
-static void *
-td_lfind(const void *key, const void *base, size_t *nmemb, size_t size,
-         int(*compar)(const void *, const void *))
+static void *td_lfind(const void *key, const void *base, size_t *nmemb,
+                      size_t size, int (*compar)(const void *, const void *))
 {
     char *element, *end;
 
     end = (char *)base + *nmemb * size;
     for (element = (char *)base; element < end; element += size)
-        if (!compar(key, element))		/* key found */
+        if (!compar(key, element)) /* key found */
             return element;
 
     return NULL;
 }
 
-const TIFFFieldArray*
-_TIFFGetFields(void)
-{
-	return(&tiffFieldArray);
-}
+const TIFFFieldArray *_TIFFGetFields(void) { return (&tiffFieldArray); }
 
-const TIFFFieldArray*
-_TIFFGetExifFields(void)
-{
-	return(&exifFieldArray);
-}
+const TIFFFieldArray *_TIFFGetExifFields(void) { return (&exifFieldArray); }
 
-const TIFFFieldArray*
-_TIFFGetGpsFields(void)
-{
-	return(&gpsFieldArray);
-}
+const TIFFFieldArray *_TIFFGetGpsFields(void) { return (&gpsFieldArray); }
 
-void
-_TIFFSetupFields(TIFF* tif, const TIFFFieldArray* fieldarray)
+void _TIFFSetupFields(TIFF *tif, const TIFFFieldArray *fieldarray)
 {
-	if (tif->tif_fields && tif->tif_nfields > 0) {
-		uint32 i;
-
-		for (i = 0; i < tif->tif_nfields; i++) {
-			TIFFField *fld = tif->tif_fields[i];
-			if (fld->field_bit == FIELD_CUSTOM &&
-				strncmp("Tag ", fld->field_name, 4) == 0) {
-					_TIFFfree(fld->field_name);
-					_TIFFfree(fld);
-				}
-		}
-
-		_TIFFfree(tif->tif_fields);
-		tif->tif_fields = NULL;
-		tif->tif_nfields = 0;
-	}
-	if (!_TIFFMergeFields(tif, fieldarray->fields, fieldarray->count)) {
-		TIFFErrorExt(tif->tif_clientdata, "_TIFFSetupFields",
-			     "Setting up field info failed");
-	}
+    if (tif->tif_fields && tif->tif_nfields > 0)
+    {
+        uint32_t i;
+
+        for (i = 0; i < tif->tif_nfields; i++)
+        {
+            TIFFField *fld = tif->tif_fields[i];
+            if (fld->field_name != NULL)
+            {
+                if (fld->field_bit == FIELD_CUSTOM && TIFFFieldIsAnonymous(fld))
+                {
+                    _TIFFfreeExt(tif, fld->field_name);
+                    /* caution: tif_fields[i] must not be the beginning of a
+                     * fields-array. Otherwise the following tags are also freed
+                     * with the first free().
+                     */
+                    _TIFFfreeExt(tif, fld);
+                }
+            }
+        }
+
+        _TIFFfreeExt(tif, tif->tif_fields);
+        tif->tif_fields = NULL;
+        tif->tif_nfields = 0;
+    }
+    if (!_TIFFMergeFields(tif, fieldarray->fields, fieldarray->count))
+    {
+        TIFFErrorExtR(tif, "_TIFFSetupFields", "Setting up field info failed");
+    }
 }
 
-static int
-tagCompare(const void* a, const void* b)
+static int tagCompare(const void *a, const void *b)
 {
-	const TIFFField* ta = *(const TIFFField**) a;
-	const TIFFField* tb = *(const TIFFField**) b;
-	/* NB: be careful of return values for 16-bit platforms */
-	if (ta->field_tag != tb->field_tag)
-		return (int)ta->field_tag - (int)tb->field_tag;
-	else
-		return (ta->field_type == TIFF_ANY) ?
-			0 : ((int)tb->field_type - (int)ta->field_type);
+    const TIFFField *ta = *(const TIFFField **)a;
+    const TIFFField *tb = *(const TIFFField **)b;
+    /* NB: be careful of return values for 16-bit platforms */
+    if (ta->field_tag != tb->field_tag)
+        return (int)ta->field_tag - (int)tb->field_tag;
+    else
+        return (ta->field_type == TIFF_ANY)
+                   ? 0
+                   : ((int)tb->field_type - (int)ta->field_type);
 }
 
-static int
-tagNameCompare(const void* a, const void* b)
+static int tagNameCompare(const void *a, const void *b)
 {
-	const TIFFField* ta = *(const TIFFField**) a;
-	const TIFFField* tb = *(const TIFFField**) b;
-	int ret = strcmp(ta->field_name, tb->field_name);
-
-	if (ret)
-		return ret;
-	else
-		return (ta->field_type == TIFF_ANY) ?
-			0 : ((int)tb->field_type - (int)ta->field_type);
+    const TIFFField *ta = *(const TIFFField **)a;
+    const TIFFField *tb = *(const TIFFField **)b;
+    int ret = strcmp(ta->field_name, tb->field_name);
+
+    if (ret)
+        return ret;
+    else
+        return (ta->field_type == TIFF_ANY)
+                   ? 0
+                   : ((int)tb->field_type - (int)ta->field_type);
 }
 
-int
-_TIFFMergeFields(TIFF* tif, const TIFFField info[], uint32 n)
+int _TIFFMergeFields(TIFF *tif, const TIFFField info[], uint32_t n)
 {
-	static const char module[] = "_TIFFMergeFields";
-	static const char reason[] = "for fields array";
-	/* TIFFField** tp; */
-	uint32 i;
-
-        tif->tif_foundfield = NULL;
-
-	if (tif->tif_fields && tif->tif_nfields > 0) {
-		tif->tif_fields = (TIFFField**)
-			_TIFFCheckRealloc(tif, tif->tif_fields,
-					  (tif->tif_nfields + n),
-					  sizeof(TIFFField *), reason);
-	} else {
-		tif->tif_fields = (TIFFField **)
-			_TIFFCheckMalloc(tif, n, sizeof(TIFFField *),
-					 reason);
-	}
-	if (!tif->tif_fields) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "Failed to allocate fields array");
-		return 0;
-	}
-
-	/* tp = tif->tif_fields + tif->tif_nfields; */
-	for (i = 0; i < n; i++) {
-		const TIFFField *fip =
-			TIFFFindField(tif, info[i].field_tag, TIFF_ANY);
-
-                /* only add definitions that aren't already present */
-		if (!fip) {
-                        tif->tif_fields[tif->tif_nfields] = (TIFFField *) (info+i);
-                        tif->tif_nfields++;
-                }
-	}
-
-        /* Sort the field info by tag number */
-	qsort(tif->tif_fields, tif->tif_nfields,
-	      sizeof(TIFFField *), tagCompare);
-
-	return n;
+    static const char module[] = "_TIFFMergeFields";
+    static const char reason[] = "for fields array";
+    /* TIFFField** tp; */
+    uint32_t i;
+
+    tif->tif_foundfield = NULL;
+
+    if (tif->tif_fields && tif->tif_nfields > 0)
+    {
+        tif->tif_fields = (TIFFField **)_TIFFCheckRealloc(
+            tif, tif->tif_fields, (tif->tif_nfields + n), sizeof(TIFFField *),
+            reason);
+    }
+    else
+    {
+        tif->tif_fields =
+            (TIFFField **)_TIFFCheckMalloc(tif, n, sizeof(TIFFField *), reason);
+    }
+    if (!tif->tif_fields)
+    {
+        TIFFErrorExtR(tif, module, "Failed to allocate fields array");
+        return 0;
+    }
+
+    /* tp = tif->tif_fields + tif->tif_nfields; */
+    for (i = 0; i < n; i++)
+    {
+        const TIFFField *fip = TIFFFindField(tif, info[i].field_tag, TIFF_ANY);
+
+        /* only add definitions that aren't already present */
+        if (!fip)
+        {
+            tif->tif_fields[tif->tif_nfields] = (TIFFField *)(info + i);
+            tif->tif_nfields++;
+        }
+    }
+
+    /* Sort the field info by tag number */
+    qsort(tif->tif_fields, tif->tif_nfields, sizeof(TIFFField *), tagCompare);
+
+    return n;
 }
 
-void
-_TIFFPrintFieldInfo(TIFF* tif, FILE* fd)
+void _TIFFPrintFieldInfo(TIFF *tif, FILE *fd)
 {
-	uint32 i;
-
-	fprintf(fd, "%s: \n", tif->tif_name);
-	for (i = 0; i < tif->tif_nfields; i++) {
-		const TIFFField* fip = tif->tif_fields[i];
-		fprintf(fd, "field[%2d] %5lu, %2d, %2d, %d, %2d, %5s, %5s, %s\n"
-			, (int)i
-			, (unsigned long) fip->field_tag
-			, fip->field_readcount, fip->field_writecount
-			, fip->field_type
-			, fip->field_bit
-			, fip->field_oktochange ? "TRUE" : "FALSE"
-			, fip->field_passcount ? "TRUE" : "FALSE"
-			, fip->field_name
-		);
-	}
+    uint32_t i;
+
+    fprintf(fd, "%s: \n", tif->tif_name);
+    for (i = 0; i < tif->tif_nfields; i++)
+    {
+        const TIFFField *fip = tif->tif_fields[i];
+        fprintf(fd, "field[%2d] %5lu, %2d, %2d, %d, %2d, %5s, %5s, %s\n",
+                (int)i, (unsigned long)fip->field_tag, fip->field_readcount,
+                fip->field_writecount, fip->field_type, fip->field_bit,
+                fip->field_oktochange ? "TRUE" : "FALSE",
+                fip->field_passcount ? "TRUE" : "FALSE", fip->field_name);
+    }
 }
 
 /*
- * Return size of TIFFDataType in bytes
+ * Return size of TIFFDataType within TIFF-file in bytes
  */
-int
-TIFFDataWidth(TIFFDataType type)
+int TIFFDataWidth(TIFFDataType type)
 {
-	switch(type)
-	{
-		case 0:  /* nothing */
-		case TIFF_BYTE:
-		case TIFF_ASCII:
-		case TIFF_SBYTE:
-		case TIFF_UNDEFINED:
-			return 1;
-		case TIFF_SHORT:
-		case TIFF_SSHORT:
-			return 2;
-		case TIFF_LONG:
-		case TIFF_SLONG:
-		case TIFF_FLOAT:
-		case TIFF_IFD:
-			return 4;
-		case TIFF_RATIONAL:
-		case TIFF_SRATIONAL:
-		case TIFF_DOUBLE:
-		case TIFF_LONG8:
-		case TIFF_SLONG8:
-		case TIFF_IFD8:
-			return 8;
-		default:
-			return 0; /* will return 0 for unknown types */
-	}
+    switch (type)
+    {
+        case 0: /* nothing */
+        case TIFF_BYTE:
+        case TIFF_ASCII:
+        case TIFF_SBYTE:
+        case TIFF_UNDEFINED:
+            return 1;
+        case TIFF_SHORT:
+        case TIFF_SSHORT:
+            return 2;
+        case TIFF_LONG:
+        case TIFF_SLONG:
+        case TIFF_FLOAT:
+        case TIFF_IFD:
+            return 4;
+        case TIFF_RATIONAL:
+        case TIFF_SRATIONAL:
+        case TIFF_DOUBLE:
+        case TIFF_LONG8:
+        case TIFF_SLONG8:
+        case TIFF_IFD8:
+            return 8;
+        default:
+            return 0; /* will return 0 for unknown types */
+    }
 }
 
 /*
- * Return size of TIFFDataType in bytes.
- *
- * XXX: We need a separate function to determine the space needed
- * to store the value. For TIFF_RATIONAL values TIFFDataWidth() returns 8,
- * but we use 4-byte float to represent rationals.
+ * Return internal storage size of TIFFSetGetFieldType in bytes.
+ * TIFFSetField() and TIFFGetField() have to provide the parameter accordingly.
+ * Replaces internal functions _TIFFDataSize() and _TIFFSetGetFieldSize()
+ * with now extern available function TIFFFieldSetGetSize().
  */
-int
-_TIFFDataSize(TIFFDataType type)
+int TIFFFieldSetGetSize(const TIFFField *fip)
 {
-	switch (type)
-	{
-		case TIFF_BYTE:
-		case TIFF_SBYTE:
-		case TIFF_ASCII:
-		case TIFF_UNDEFINED:
-		    return 1;
-		case TIFF_SHORT:
-		case TIFF_SSHORT:
-		    return 2;
-		case TIFF_LONG:
-		case TIFF_SLONG:
-		case TIFF_FLOAT:
-		case TIFF_IFD:
-		case TIFF_RATIONAL:
-		case TIFF_SRATIONAL:
-		    return 4;
-		case TIFF_DOUBLE:
-		case TIFF_LONG8:
-		case TIFF_SLONG8:
-		case TIFF_IFD8:
-		    return 8;
-		default:
-		    return 0;
-	}
-}
+    /*
+     * TIFFSetField() and TIFFGetField() must provide the parameter accordingly
+     * to the definition of "set_field_type" of the tag definition in
+     * dir_info.c. This function returns the data size for that purpose.
+     *
+     * Furthermore, this data size is also used for the internal storage,
+     * even for TIFF_RATIONAL values for FIELD_CUSTOM, which are stored
+     * internally as 4-byte float, but some of them should be stored internally
+     * as 8-byte double, depending on the "set_field_type" _FLOAT_ or _DOUBLE_.
+     */
+    if (fip == NULL)
+        return 0;
+
+    switch (fip->set_field_type)
+    {
+        case TIFF_SETGET_UNDEFINED:
+        case TIFF_SETGET_ASCII:
+        case TIFF_SETGET_C0_ASCII:
+        case TIFF_SETGET_C16_ASCII:
+        case TIFF_SETGET_C32_ASCII:
+        case TIFF_SETGET_OTHER:
+            return 1;
+        case TIFF_SETGET_UINT8:
+        case TIFF_SETGET_SINT8:
+        case TIFF_SETGET_C0_UINT8:
+        case TIFF_SETGET_C0_SINT8:
+        case TIFF_SETGET_C16_UINT8:
+        case TIFF_SETGET_C16_SINT8:
+        case TIFF_SETGET_C32_UINT8:
+        case TIFF_SETGET_C32_SINT8:
+            return 1;
+        case TIFF_SETGET_UINT16:
+        case TIFF_SETGET_SINT16:
+        case TIFF_SETGET_C0_UINT16:
+        case TIFF_SETGET_C0_SINT16:
+        case TIFF_SETGET_C16_UINT16:
+        case TIFF_SETGET_C16_SINT16:
+        case TIFF_SETGET_C32_UINT16:
+        case TIFF_SETGET_C32_SINT16:
+            return 2;
+        case TIFF_SETGET_INT:
+        case TIFF_SETGET_UINT32:
+        case TIFF_SETGET_SINT32:
+        case TIFF_SETGET_FLOAT:
+        case TIFF_SETGET_UINT16_PAIR:
+        case TIFF_SETGET_C0_UINT32:
+        case TIFF_SETGET_C0_SINT32:
+        case TIFF_SETGET_C0_FLOAT:
+        case TIFF_SETGET_C16_UINT32:
+        case TIFF_SETGET_C16_SINT32:
+        case TIFF_SETGET_C16_FLOAT:
+        case TIFF_SETGET_C32_UINT32:
+        case TIFF_SETGET_C32_SINT32:
+        case TIFF_SETGET_C32_FLOAT:
+            return 4;
+        case TIFF_SETGET_UINT64:
+        case TIFF_SETGET_SINT64:
+        case TIFF_SETGET_DOUBLE:
+        case TIFF_SETGET_IFD8:
+        case TIFF_SETGET_C0_UINT64:
+        case TIFF_SETGET_C0_SINT64:
+        case TIFF_SETGET_C0_DOUBLE:
+        case TIFF_SETGET_C0_IFD8:
+        case TIFF_SETGET_C16_UINT64:
+        case TIFF_SETGET_C16_SINT64:
+        case TIFF_SETGET_C16_DOUBLE:
+        case TIFF_SETGET_C16_IFD8:
+        case TIFF_SETGET_C32_UINT64:
+        case TIFF_SETGET_C32_SINT64:
+        case TIFF_SETGET_C32_DOUBLE:
+        case TIFF_SETGET_C32_IFD8:
+            return 8;
+        default:
+            return 0;
+    }
+} /*-- TIFFFieldSetGetSize() --- */
 
 /*
- * Rational2Double: 
- * Return size of TIFFSetGetFieldType in bytes.
- *
- * XXX: TIFF_RATIONAL values for FIELD_CUSTOM are stored internally as 4-byte float.
- * However, some of them should be stored internally as 8-byte double. 
- * This is now managed by the SetGetField of the tag-definition!
+ * Return size of count parameter of TIFFSetField() and TIFFGetField()
+ * and also if it is required:  0=none, 2=uint16_t, 4=uint32_t
  */
-int
-_TIFFSetGetFieldSize(TIFFSetGetFieldType setgettype)
+int TIFFFieldSetGetCountSize(const TIFFField *fip)
 {
-	switch (setgettype)
-	{
-		case TIFF_SETGET_UNDEFINED:
-		case TIFF_SETGET_ASCII:
-		case TIFF_SETGET_C0_ASCII:
-		case TIFF_SETGET_C16_ASCII:
-		case TIFF_SETGET_C32_ASCII:
-		case TIFF_SETGET_OTHER:
-		    return 0;
-		case TIFF_SETGET_UINT8:
-		case TIFF_SETGET_SINT8:
-		case TIFF_SETGET_C0_UINT8:
-		case TIFF_SETGET_C0_SINT8:
-		case TIFF_SETGET_C16_UINT8:
-		case TIFF_SETGET_C16_SINT8:
-		case TIFF_SETGET_C32_UINT8:
-		case TIFF_SETGET_C32_SINT8:
-		    return 1;
-		case TIFF_SETGET_UINT16:
-		case TIFF_SETGET_SINT16:
-		case TIFF_SETGET_C0_UINT16:
-		case TIFF_SETGET_C0_SINT16:
-		case TIFF_SETGET_C16_UINT16:
-		case TIFF_SETGET_C16_SINT16:
-		case TIFF_SETGET_C32_UINT16:
-		case TIFF_SETGET_C32_SINT16:
-		    return 2;
-		case TIFF_SETGET_INT:
-		case TIFF_SETGET_UINT32:
-		case TIFF_SETGET_SINT32:
-		case TIFF_SETGET_FLOAT:
-		case TIFF_SETGET_UINT16_PAIR:
-		case TIFF_SETGET_C0_UINT32:
-		case TIFF_SETGET_C0_SINT32:
-		case TIFF_SETGET_C0_FLOAT:
-		case TIFF_SETGET_C16_UINT32:
-		case TIFF_SETGET_C16_SINT32:
-		case TIFF_SETGET_C16_FLOAT:
-		case TIFF_SETGET_C32_UINT32:
-		case TIFF_SETGET_C32_SINT32:
-		case TIFF_SETGET_C32_FLOAT:
-		    return 4;
-		case TIFF_SETGET_UINT64:
-		case TIFF_SETGET_SINT64:
-		case TIFF_SETGET_DOUBLE:
-		case TIFF_SETGET_IFD8:
-		case TIFF_SETGET_C0_UINT64:
-		case TIFF_SETGET_C0_SINT64:
-		case TIFF_SETGET_C0_DOUBLE:
-		case TIFF_SETGET_C0_IFD8:
-		case TIFF_SETGET_C16_UINT64:
-		case TIFF_SETGET_C16_SINT64:
-		case TIFF_SETGET_C16_DOUBLE:
-		case TIFF_SETGET_C16_IFD8:
-		case TIFF_SETGET_C32_UINT64:
-		case TIFF_SETGET_C32_SINT64:
-		case TIFF_SETGET_C32_DOUBLE:
-		case TIFF_SETGET_C32_IFD8:
-		    return 8;
-		default:
-		    return 0;
-	}
-} /*-- _TIFFSetGetFieldSize --- */
-
-
-const TIFFField*
-TIFFFindField(TIFF* tif, uint32 tag, TIFFDataType dt)
+    if (fip == NULL)
+        return 0;
+
+    switch (fip->set_field_type)
+    {
+        case TIFF_SETGET_C16_ASCII:
+        case TIFF_SETGET_C16_UINT8:
+        case TIFF_SETGET_C16_SINT8:
+        case TIFF_SETGET_C16_UINT16:
+        case TIFF_SETGET_C16_SINT16:
+        case TIFF_SETGET_C16_UINT32:
+        case TIFF_SETGET_C16_SINT32:
+        case TIFF_SETGET_C16_FLOAT:
+        case TIFF_SETGET_C16_UINT64:
+        case TIFF_SETGET_C16_SINT64:
+        case TIFF_SETGET_C16_DOUBLE:
+        case TIFF_SETGET_C16_IFD8:
+            return 2;
+        case TIFF_SETGET_C32_ASCII:
+        case TIFF_SETGET_C32_UINT8:
+        case TIFF_SETGET_C32_SINT8:
+        case TIFF_SETGET_C32_UINT16:
+        case TIFF_SETGET_C32_SINT16:
+        case TIFF_SETGET_C32_UINT32:
+        case TIFF_SETGET_C32_SINT32:
+        case TIFF_SETGET_C32_FLOAT:
+        case TIFF_SETGET_C32_UINT64:
+        case TIFF_SETGET_C32_SINT64:
+        case TIFF_SETGET_C32_DOUBLE:
+        case TIFF_SETGET_C32_IFD8:
+            return 4;
+        default:
+            return 0;
+    }
+} /*-- TIFFFieldSetGetCountSize() --- */
+
+const TIFFField *TIFFFindField(TIFF *tif, uint32_t tag, TIFFDataType dt)
 {
-	TIFFField key = {0, 0, 0, TIFF_NOTYPE, 0, 0, 0, 0, 0, 0, NULL, NULL};
-	TIFFField* pkey = &key;
-	const TIFFField **ret;
-	if (tif->tif_foundfield && tif->tif_foundfield->field_tag == tag &&
-	    (dt == TIFF_ANY || dt == tif->tif_foundfield->field_type))
-		return tif->tif_foundfield;
-
-	/* If we are invoked with no field information, then just return. */
-	if (!tif->tif_fields)
-		return NULL;
-
-	/* NB: use sorted search (e.g. binary search) */
-
-	key.field_tag = tag;
-	key.field_type = dt;
-
-	ret = (const TIFFField **) bsearch(&pkey, tif->tif_fields,
-					   tif->tif_nfields,
-					   sizeof(TIFFField *), tagCompare);
-	return tif->tif_foundfield = (ret ? *ret : NULL);
+    TIFFField key = {0, 0, 0, TIFF_NOTYPE, 0, 0, 0, 0, 0, 0, NULL, NULL};
+    TIFFField *pkey = &key;
+    const TIFFField **ret;
+    if (tif->tif_foundfield && tif->tif_foundfield->field_tag == tag &&
+        (dt == TIFF_ANY || dt == tif->tif_foundfield->field_type))
+        return tif->tif_foundfield;
+
+    /* If we are invoked with no field information, then just return. */
+    if (!tif->tif_fields)
+        return NULL;
+
+    /* NB: use sorted search (e.g. binary search) */
+
+    key.field_tag = tag;
+    key.field_type = dt;
+
+    ret = (const TIFFField **)bsearch(&pkey, tif->tif_fields, tif->tif_nfields,
+                                      sizeof(TIFFField *), tagCompare);
+    return tif->tif_foundfield = (ret ? *ret : NULL);
 }
 
-static const TIFFField*
-_TIFFFindFieldByName(TIFF* tif, const char *field_name, TIFFDataType dt)
+static const TIFFField *_TIFFFindFieldByName(TIFF *tif, const char *field_name,
+                                             TIFFDataType dt)
 {
-	TIFFField key = {0, 0, 0, TIFF_NOTYPE, 0, 0, 0, 0, 0, 0, NULL, NULL};
-	TIFFField* pkey = &key;
-	const TIFFField **ret;
-	if (tif->tif_foundfield
-	    && streq(tif->tif_foundfield->field_name, field_name)
-	    && (dt == TIFF_ANY || dt == tif->tif_foundfield->field_type))
-		return (tif->tif_foundfield);
+    TIFFField key = {0, 0, 0, TIFF_NOTYPE, 0, 0, 0, 0, 0, 0, NULL, NULL};
+    TIFFField *pkey = &key;
+    const TIFFField **ret;
+    if (tif->tif_foundfield &&
+        streq(tif->tif_foundfield->field_name, field_name) &&
+        (dt == TIFF_ANY || dt == tif->tif_foundfield->field_type))
+        return (tif->tif_foundfield);
 
-	/* If we are invoked with no field information, then just return. */
-	if (!tif->tif_fields)
-		return NULL;
+    /* If we are invoked with no field information, then just return. */
+    if (!tif->tif_fields)
+        return NULL;
 
-	/* NB: use linear search since list is sorted by key#, not name */
+    /* NB: use linear search since list is sorted by key#, not name */
 
-	key.field_name = (char *)field_name;
-	key.field_type = dt;
+    key.field_name = (char *)field_name;
+    key.field_type = dt;
 
-	ret = (const TIFFField **) 
-            td_lfind(&pkey, tif->tif_fields, &tif->tif_nfields,
-                     sizeof(TIFFField *), tagNameCompare);
+    ret =
+        (const TIFFField **)td_lfind(&pkey, tif->tif_fields, &tif->tif_nfields,
+                                     sizeof(TIFFField *), tagNameCompare);
 
-	return tif->tif_foundfield = (ret ? *ret : NULL);
+    return tif->tif_foundfield = (ret ? *ret : NULL);
 }
 
-const TIFFField*
-TIFFFieldWithTag(TIFF* tif, uint32 tag)
+const TIFFField *TIFFFieldWithTag(TIFF *tif, uint32_t tag)
 {
-	const TIFFField* fip = TIFFFindField(tif, tag, TIFF_ANY);
-	if (!fip) {
-		TIFFErrorExt(tif->tif_clientdata, "TIFFFieldWithTag",
-			     "Internal error, unknown tag 0x%x",
-			     (unsigned int) tag);
-	}
-	return (fip);
+    const TIFFField *fip = TIFFFindField(tif, tag, TIFF_ANY);
+    if (!fip)
+    {
+        TIFFWarningExtR(tif, "TIFFFieldWithTag", "Warning, unknown tag 0x%x",
+                        (unsigned int)tag);
+    }
+    return (fip);
 }
 
-const TIFFField*
-TIFFFieldWithName(TIFF* tif, const char *field_name)
+const TIFFField *TIFFFieldWithName(TIFF *tif, const char *field_name)
 {
-	const TIFFField* fip =
-		_TIFFFindFieldByName(tif, field_name, TIFF_ANY);
-	if (!fip) {
-		TIFFErrorExt(tif->tif_clientdata, "TIFFFieldWithName",
-			     "Internal error, unknown tag %s", field_name);
-	}
-	return (fip);
+    const TIFFField *fip = _TIFFFindFieldByName(tif, field_name, TIFF_ANY);
+    if (!fip)
+    {
+        TIFFWarningExtR(tif, "TIFFFieldWithName", "Warning, unknown tag %s",
+                        field_name);
+    }
+    return (fip);
 }
 
-uint32
-TIFFFieldTag(const TIFFField* fip)
-{
-	return fip->field_tag;
-}
+uint32_t TIFFFieldTag(const TIFFField *fip) { return fip->field_tag; }
 
-const char *
-TIFFFieldName(const TIFFField* fip)
-{
-	return fip->field_name;
-}
+const char *TIFFFieldName(const TIFFField *fip) { return fip->field_name; }
 
-TIFFDataType
-TIFFFieldDataType(const TIFFField* fip)
-{
-	return fip->field_type;
-}
+TIFFDataType TIFFFieldDataType(const TIFFField *fip) { return fip->field_type; }
 
-int
-TIFFFieldPassCount(const TIFFField* fip)
-{
-	return fip->field_passcount;
-}
+int TIFFFieldPassCount(const TIFFField *fip) { return fip->field_passcount; }
 
-int
-TIFFFieldReadCount(const TIFFField* fip)
-{
-	return fip->field_readcount;
-}
+int TIFFFieldReadCount(const TIFFField *fip) { return fip->field_readcount; }
 
-int
-TIFFFieldWriteCount(const TIFFField* fip)
-{
-	return fip->field_writecount;
-}
+int TIFFFieldWriteCount(const TIFFField *fip) { return fip->field_writecount; }
+
+int TIFFFieldIsAnonymous(const TIFFField *fip) { return fip->field_anonymous; }
 
-const TIFFField*
-_TIFFFindOrRegisterField(TIFF *tif, uint32 tag, TIFFDataType dt)
+const TIFFField *_TIFFFindOrRegisterField(TIFF *tif, uint32_t tag,
+                                          TIFFDataType dt)
 
 {
-	const TIFFField *fld;
+    const TIFFField *fld;
 
-	fld = TIFFFindField(tif, tag, dt);
-	if (fld == NULL) {
-		fld = _TIFFCreateAnonField(tif, tag, dt);
-		if (!_TIFFMergeFields(tif, fld, 1))
-			return NULL;
-	}
+    fld = TIFFFindField(tif, tag, dt);
+    if (fld == NULL)
+    {
+        fld = _TIFFCreateAnonField(tif, tag, dt);
+        if (!_TIFFMergeFields(tif, fld, 1))
+            return NULL;
+    }
 
-	return fld;
+    return fld;
 }
 
-TIFFField*
-_TIFFCreateAnonField(TIFF *tif, uint32 tag, TIFFDataType field_type)
+TIFFField *_TIFFCreateAnonField(TIFF *tif, uint32_t tag,
+                                TIFFDataType field_type)
 {
-	TIFFField *fld;
-	(void) tif;
-
-	fld = (TIFFField *) _TIFFmalloc(sizeof (TIFFField));
-	if (fld == NULL)
-	    return NULL;
-	_TIFFmemset(fld, 0, sizeof(TIFFField));
-
-	fld->field_tag = tag;
-	fld->field_readcount = TIFF_VARIABLE2;
-	fld->field_writecount = TIFF_VARIABLE2;
-	fld->field_type = field_type;
-	fld->reserved = 0;
-	switch (field_type)
-	{
-		case TIFF_BYTE:
-		case TIFF_UNDEFINED:
-			fld->set_field_type = TIFF_SETGET_C32_UINT8;
-			fld->get_field_type = TIFF_SETGET_C32_UINT8;
-			break;
-		case TIFF_ASCII:
-			fld->set_field_type = TIFF_SETGET_C32_ASCII;
-			fld->get_field_type = TIFF_SETGET_C32_ASCII;
-			break;
-		case TIFF_SHORT:
-			fld->set_field_type = TIFF_SETGET_C32_UINT16;
-			fld->get_field_type = TIFF_SETGET_C32_UINT16;
-			break;
-		case TIFF_LONG:
-			fld->set_field_type = TIFF_SETGET_C32_UINT32;
-			fld->get_field_type = TIFF_SETGET_C32_UINT32;
-			break;
-		case TIFF_RATIONAL:
-		case TIFF_SRATIONAL:
-		case TIFF_FLOAT:
-			fld->set_field_type = TIFF_SETGET_C32_FLOAT;
-			fld->get_field_type = TIFF_SETGET_C32_FLOAT;
-			break;
-		case TIFF_SBYTE:
-			fld->set_field_type = TIFF_SETGET_C32_SINT8;
-			fld->get_field_type = TIFF_SETGET_C32_SINT8;
-			break;
-		case TIFF_SSHORT:
-			fld->set_field_type = TIFF_SETGET_C32_SINT16;
-			fld->get_field_type = TIFF_SETGET_C32_SINT16;
-			break;
-		case TIFF_SLONG:
-			fld->set_field_type = TIFF_SETGET_C32_SINT32;
-			fld->get_field_type = TIFF_SETGET_C32_SINT32;
-			break;
-		case TIFF_DOUBLE:
-			fld->set_field_type = TIFF_SETGET_C32_DOUBLE;
-			fld->get_field_type = TIFF_SETGET_C32_DOUBLE;
-			break;
-		case TIFF_IFD:
-		case TIFF_IFD8:
-			fld->set_field_type = TIFF_SETGET_C32_IFD8;
-			fld->get_field_type = TIFF_SETGET_C32_IFD8;
-			break;
-		case TIFF_LONG8:
-			fld->set_field_type = TIFF_SETGET_C32_UINT64;
-			fld->get_field_type = TIFF_SETGET_C32_UINT64;
-			break;
-		case TIFF_SLONG8:
-			fld->set_field_type = TIFF_SETGET_C32_SINT64;
-			fld->get_field_type = TIFF_SETGET_C32_SINT64;
-			break;
-		default:
-			fld->set_field_type = TIFF_SETGET_UNDEFINED;
-			fld->get_field_type = TIFF_SETGET_UNDEFINED;
-			break;
-	}
-	fld->field_bit = FIELD_CUSTOM;
-	fld->field_oktochange = TRUE;
-	fld->field_passcount = TRUE;
-	fld->field_name = (char *) _TIFFmalloc(32);
-	if (fld->field_name == NULL) {
-	    _TIFFfree(fld);
-	    return NULL;
-	}
-	fld->field_subfields = NULL;
-
-	/* 
-	 * note that this name is a special sign to TIFFClose() and
-	 * _TIFFSetupFields() to free the field
-	 */
-	(void) snprintf(fld->field_name, 32, "Tag %d", (int) tag);
-
-	return fld;    
+    TIFFField *fld;
+    (void)tif;
+
+    fld = (TIFFField *)_TIFFmallocExt(tif, sizeof(TIFFField));
+    if (fld == NULL)
+        return NULL;
+    _TIFFmemset(fld, 0, sizeof(TIFFField));
+
+    fld->field_tag = tag;
+    fld->field_readcount = TIFF_VARIABLE2;
+    fld->field_writecount = TIFF_VARIABLE2;
+    fld->field_type = field_type;
+    fld->field_anonymous =
+        1; /* indicate that this is an anonymous / unknown tag */
+    switch (field_type)
+    {
+        case TIFF_BYTE:
+        case TIFF_UNDEFINED:
+            fld->set_field_type = TIFF_SETGET_C32_UINT8;
+            fld->get_field_type = TIFF_SETGET_C32_UINT8;
+            break;
+        case TIFF_ASCII:
+            fld->set_field_type = TIFF_SETGET_C32_ASCII;
+            fld->get_field_type = TIFF_SETGET_C32_ASCII;
+            break;
+        case TIFF_SHORT:
+            fld->set_field_type = TIFF_SETGET_C32_UINT16;
+            fld->get_field_type = TIFF_SETGET_C32_UINT16;
+            break;
+        case TIFF_LONG:
+            fld->set_field_type = TIFF_SETGET_C32_UINT32;
+            fld->get_field_type = TIFF_SETGET_C32_UINT32;
+            break;
+        case TIFF_RATIONAL:
+        case TIFF_SRATIONAL:
+        case TIFF_FLOAT:
+            fld->set_field_type = TIFF_SETGET_C32_FLOAT;
+            fld->get_field_type = TIFF_SETGET_C32_FLOAT;
+            break;
+        case TIFF_SBYTE:
+            fld->set_field_type = TIFF_SETGET_C32_SINT8;
+            fld->get_field_type = TIFF_SETGET_C32_SINT8;
+            break;
+        case TIFF_SSHORT:
+            fld->set_field_type = TIFF_SETGET_C32_SINT16;
+            fld->get_field_type = TIFF_SETGET_C32_SINT16;
+            break;
+        case TIFF_SLONG:
+            fld->set_field_type = TIFF_SETGET_C32_SINT32;
+            fld->get_field_type = TIFF_SETGET_C32_SINT32;
+            break;
+        case TIFF_DOUBLE:
+            fld->set_field_type = TIFF_SETGET_C32_DOUBLE;
+            fld->get_field_type = TIFF_SETGET_C32_DOUBLE;
+            break;
+        case TIFF_IFD:
+        case TIFF_IFD8:
+            fld->set_field_type = TIFF_SETGET_C32_IFD8;
+            fld->get_field_type = TIFF_SETGET_C32_IFD8;
+            break;
+        case TIFF_LONG8:
+            fld->set_field_type = TIFF_SETGET_C32_UINT64;
+            fld->get_field_type = TIFF_SETGET_C32_UINT64;
+            break;
+        case TIFF_SLONG8:
+            fld->set_field_type = TIFF_SETGET_C32_SINT64;
+            fld->get_field_type = TIFF_SETGET_C32_SINT64;
+            break;
+        default:
+            fld->set_field_type = TIFF_SETGET_UNDEFINED;
+            fld->get_field_type = TIFF_SETGET_UNDEFINED;
+            break;
+    }
+    fld->field_bit = FIELD_CUSTOM;
+    fld->field_oktochange = TRUE;
+    fld->field_passcount = TRUE;
+    fld->field_name = (char *)_TIFFmallocExt(tif, 32);
+    if (fld->field_name == NULL)
+    {
+        _TIFFfreeExt(tif, fld);
+        return NULL;
+    }
+    fld->field_subfields = NULL;
+
+    /*
+     * note that this name is a special sign to TIFFClose() and
+     * _TIFFSetupFields() to free the field
+     * Update:
+     *   This special sign is replaced by fld->field_anonymous  flag.
+     */
+    (void)snprintf(fld->field_name, 32, "Tag %d", (int)tag);
+
+    return fld;
 }
 
 /****************************************************************************
@@ -905,347 +999,353 @@ _TIFFCreateAnonField(TIFF *tif, uint32 tag, TIFFDataType field_type)
  * libtiff versions.
  ****************************************************************************/
 
-static TIFFSetGetFieldType
-_TIFFSetGetType(TIFFDataType type, short count, unsigned char passcount)
+static TIFFSetGetFieldType _TIFFSetGetType(TIFFDataType type, short count,
+                                           unsigned char passcount)
 {
-	if (type == TIFF_ASCII && count == TIFF_VARIABLE && passcount == 0)
-		return TIFF_SETGET_ASCII;
-
-	else if (count == 1 && passcount == 0) {
-		switch (type)
-		{
-			case TIFF_BYTE:
-			case TIFF_UNDEFINED:
-				return TIFF_SETGET_UINT8;
-			case TIFF_ASCII:
-				return TIFF_SETGET_ASCII;
-			case TIFF_SHORT:
-				return TIFF_SETGET_UINT16;
-			case TIFF_LONG:
-				return TIFF_SETGET_UINT32;
-			case TIFF_RATIONAL:
-			case TIFF_SRATIONAL:
-			case TIFF_FLOAT:
-				return TIFF_SETGET_FLOAT;
-			case TIFF_SBYTE:
-				return TIFF_SETGET_SINT8;
-			case TIFF_SSHORT:
-				return TIFF_SETGET_SINT16;
-			case TIFF_SLONG:
-				return TIFF_SETGET_SINT32;
-			case TIFF_DOUBLE:
-				return TIFF_SETGET_DOUBLE;
-			case TIFF_IFD:
-			case TIFF_IFD8:
-				return TIFF_SETGET_IFD8;
-			case TIFF_LONG8:
-				return TIFF_SETGET_UINT64;
-			case TIFF_SLONG8:
-				return TIFF_SETGET_SINT64;
-			default:
-				return TIFF_SETGET_UNDEFINED;
-		}
-	}
-
-	else if (count >= 1 && passcount == 0) {
-		switch (type)
-		{
-			case TIFF_BYTE:
-			case TIFF_UNDEFINED:
-				return TIFF_SETGET_C0_UINT8;
-			case TIFF_ASCII:
-				return TIFF_SETGET_C0_ASCII;
-			case TIFF_SHORT:
-				return TIFF_SETGET_C0_UINT16;
-			case TIFF_LONG:
-				return TIFF_SETGET_C0_UINT32;
-			case TIFF_RATIONAL:
-			case TIFF_SRATIONAL:
-			case TIFF_FLOAT:
-				return TIFF_SETGET_C0_FLOAT;
-			case TIFF_SBYTE:
-				return TIFF_SETGET_C0_SINT8;
-			case TIFF_SSHORT:
-				return TIFF_SETGET_C0_SINT16;
-			case TIFF_SLONG:
-				return TIFF_SETGET_C0_SINT32;
-			case TIFF_DOUBLE:
-				return TIFF_SETGET_C0_DOUBLE;
-			case TIFF_IFD:
-			case TIFF_IFD8:
-				return TIFF_SETGET_C0_IFD8;
-			case TIFF_LONG8:
-				return TIFF_SETGET_C0_UINT64;
-			case TIFF_SLONG8:
-				return TIFF_SETGET_C0_SINT64;
-			default:
-				return TIFF_SETGET_UNDEFINED;
-		}
-	}
-
-	else if (count == TIFF_VARIABLE && passcount == 1) {
-		switch (type)
-		{
-			case TIFF_BYTE:
-			case TIFF_UNDEFINED:
-				return TIFF_SETGET_C16_UINT8;
-			case TIFF_ASCII:
-				return TIFF_SETGET_C16_ASCII;
-			case TIFF_SHORT:
-				return TIFF_SETGET_C16_UINT16;
-			case TIFF_LONG:
-				return TIFF_SETGET_C16_UINT32;
-			case TIFF_RATIONAL:
-			case TIFF_SRATIONAL:
-			case TIFF_FLOAT:
-				return TIFF_SETGET_C16_FLOAT;
-			case TIFF_SBYTE:
-				return TIFF_SETGET_C16_SINT8;
-			case TIFF_SSHORT:
-				return TIFF_SETGET_C16_SINT16;
-			case TIFF_SLONG:
-				return TIFF_SETGET_C16_SINT32;
-			case TIFF_DOUBLE:
-				return TIFF_SETGET_C16_DOUBLE;
-			case TIFF_IFD:
-			case TIFF_IFD8:
-				return TIFF_SETGET_C16_IFD8;
-			case TIFF_LONG8:
-				return TIFF_SETGET_C16_UINT64;
-			case TIFF_SLONG8:
-				return TIFF_SETGET_C16_SINT64;
-			default:
-				return TIFF_SETGET_UNDEFINED;
-		}
-	}
-
-	else if (count == TIFF_VARIABLE2 && passcount == 1) {
-		switch (type)
-		{
-			case TIFF_BYTE:
-			case TIFF_UNDEFINED:
-				return TIFF_SETGET_C32_UINT8;
-			case TIFF_ASCII:
-				return TIFF_SETGET_C32_ASCII;
-			case TIFF_SHORT:
-				return TIFF_SETGET_C32_UINT16;
-			case TIFF_LONG:
-				return TIFF_SETGET_C32_UINT32;
-			case TIFF_RATIONAL:
-			case TIFF_SRATIONAL:
-			case TIFF_FLOAT:
-				return TIFF_SETGET_C32_FLOAT;
-			case TIFF_SBYTE:
-				return TIFF_SETGET_C32_SINT8;
-			case TIFF_SSHORT:
-				return TIFF_SETGET_C32_SINT16;
-			case TIFF_SLONG:
-				return TIFF_SETGET_C32_SINT32;
-			case TIFF_DOUBLE:
-				return TIFF_SETGET_C32_DOUBLE;
-			case TIFF_IFD:
-			case TIFF_IFD8:
-				return TIFF_SETGET_C32_IFD8;
-			case TIFF_LONG8:
-				return TIFF_SETGET_C32_UINT64;
-			case TIFF_SLONG8:
-				return TIFF_SETGET_C32_SINT64;
-			default:
-				return TIFF_SETGET_UNDEFINED;
-		}
-	}
-
-	return TIFF_SETGET_UNDEFINED;
+    if (type == TIFF_ASCII && count == TIFF_VARIABLE && passcount == 0)
+        return TIFF_SETGET_ASCII;
+
+    else if (count == 1 && passcount == 0)
+    {
+        switch (type)
+        {
+            case TIFF_BYTE:
+            case TIFF_UNDEFINED:
+                return TIFF_SETGET_UINT8;
+            case TIFF_ASCII:
+                return TIFF_SETGET_ASCII;
+            case TIFF_SHORT:
+                return TIFF_SETGET_UINT16;
+            case TIFF_LONG:
+                return TIFF_SETGET_UINT32;
+            case TIFF_RATIONAL:
+            case TIFF_SRATIONAL:
+            case TIFF_FLOAT:
+                return TIFF_SETGET_FLOAT;
+            case TIFF_SBYTE:
+                return TIFF_SETGET_SINT8;
+            case TIFF_SSHORT:
+                return TIFF_SETGET_SINT16;
+            case TIFF_SLONG:
+                return TIFF_SETGET_SINT32;
+            case TIFF_DOUBLE:
+                return TIFF_SETGET_DOUBLE;
+            case TIFF_IFD:
+            case TIFF_IFD8:
+                return TIFF_SETGET_IFD8;
+            case TIFF_LONG8:
+                return TIFF_SETGET_UINT64;
+            case TIFF_SLONG8:
+                return TIFF_SETGET_SINT64;
+            default:
+                return TIFF_SETGET_UNDEFINED;
+        }
+    }
+
+    else if (count >= 1 && passcount == 0)
+    {
+        switch (type)
+        {
+            case TIFF_BYTE:
+            case TIFF_UNDEFINED:
+                return TIFF_SETGET_C0_UINT8;
+            case TIFF_ASCII:
+                return TIFF_SETGET_C0_ASCII;
+            case TIFF_SHORT:
+                return TIFF_SETGET_C0_UINT16;
+            case TIFF_LONG:
+                return TIFF_SETGET_C0_UINT32;
+            case TIFF_RATIONAL:
+            case TIFF_SRATIONAL:
+            case TIFF_FLOAT:
+                return TIFF_SETGET_C0_FLOAT;
+            case TIFF_SBYTE:
+                return TIFF_SETGET_C0_SINT8;
+            case TIFF_SSHORT:
+                return TIFF_SETGET_C0_SINT16;
+            case TIFF_SLONG:
+                return TIFF_SETGET_C0_SINT32;
+            case TIFF_DOUBLE:
+                return TIFF_SETGET_C0_DOUBLE;
+            case TIFF_IFD:
+            case TIFF_IFD8:
+                return TIFF_SETGET_C0_IFD8;
+            case TIFF_LONG8:
+                return TIFF_SETGET_C0_UINT64;
+            case TIFF_SLONG8:
+                return TIFF_SETGET_C0_SINT64;
+            default:
+                return TIFF_SETGET_UNDEFINED;
+        }
+    }
+
+    else if (count == TIFF_VARIABLE && passcount == 1)
+    {
+        switch (type)
+        {
+            case TIFF_BYTE:
+            case TIFF_UNDEFINED:
+                return TIFF_SETGET_C16_UINT8;
+            case TIFF_ASCII:
+                return TIFF_SETGET_C16_ASCII;
+            case TIFF_SHORT:
+                return TIFF_SETGET_C16_UINT16;
+            case TIFF_LONG:
+                return TIFF_SETGET_C16_UINT32;
+            case TIFF_RATIONAL:
+            case TIFF_SRATIONAL:
+            case TIFF_FLOAT:
+                return TIFF_SETGET_C16_FLOAT;
+            case TIFF_SBYTE:
+                return TIFF_SETGET_C16_SINT8;
+            case TIFF_SSHORT:
+                return TIFF_SETGET_C16_SINT16;
+            case TIFF_SLONG:
+                return TIFF_SETGET_C16_SINT32;
+            case TIFF_DOUBLE:
+                return TIFF_SETGET_C16_DOUBLE;
+            case TIFF_IFD:
+            case TIFF_IFD8:
+                return TIFF_SETGET_C16_IFD8;
+            case TIFF_LONG8:
+                return TIFF_SETGET_C16_UINT64;
+            case TIFF_SLONG8:
+                return TIFF_SETGET_C16_SINT64;
+            default:
+                return TIFF_SETGET_UNDEFINED;
+        }
+    }
+
+    else if (count == TIFF_VARIABLE2 && passcount == 1)
+    {
+        switch (type)
+        {
+            case TIFF_BYTE:
+            case TIFF_UNDEFINED:
+                return TIFF_SETGET_C32_UINT8;
+            case TIFF_ASCII:
+                return TIFF_SETGET_C32_ASCII;
+            case TIFF_SHORT:
+                return TIFF_SETGET_C32_UINT16;
+            case TIFF_LONG:
+                return TIFF_SETGET_C32_UINT32;
+            case TIFF_RATIONAL:
+            case TIFF_SRATIONAL:
+            case TIFF_FLOAT:
+                return TIFF_SETGET_C32_FLOAT;
+            case TIFF_SBYTE:
+                return TIFF_SETGET_C32_SINT8;
+            case TIFF_SSHORT:
+                return TIFF_SETGET_C32_SINT16;
+            case TIFF_SLONG:
+                return TIFF_SETGET_C32_SINT32;
+            case TIFF_DOUBLE:
+                return TIFF_SETGET_C32_DOUBLE;
+            case TIFF_IFD:
+            case TIFF_IFD8:
+                return TIFF_SETGET_C32_IFD8;
+            case TIFF_LONG8:
+                return TIFF_SETGET_C32_UINT64;
+            case TIFF_SLONG8:
+                return TIFF_SETGET_C32_SINT64;
+            default:
+                return TIFF_SETGET_UNDEFINED;
+        }
+    }
+
+    return TIFF_SETGET_UNDEFINED;
 }
 
-int
-TIFFMergeFieldInfo(TIFF* tif, const TIFFFieldInfo info[], uint32 n)
+int TIFFMergeFieldInfo(TIFF *tif, const TIFFFieldInfo info[], uint32_t n)
 {
-	static const char module[] = "TIFFMergeFieldInfo";
-	static const char reason[] = "for fields array";
-	TIFFField *tp;
-	size_t nfields;
-	uint32 i;
-
-	if (tif->tif_nfieldscompat > 0) {
-		tif->tif_fieldscompat = (TIFFFieldArray *)
-			_TIFFCheckRealloc(tif, tif->tif_fieldscompat,
-					  tif->tif_nfieldscompat + 1,
-					  sizeof(TIFFFieldArray), reason);
-	} else {
-		tif->tif_fieldscompat = (TIFFFieldArray *)
-			_TIFFCheckMalloc(tif, 1, sizeof(TIFFFieldArray),
-					 reason);
-	}
-	if (!tif->tif_fieldscompat) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "Failed to allocate fields array");
-		return -1;
-	}
-	nfields = tif->tif_nfieldscompat++;
-
-	tif->tif_fieldscompat[nfields].type = tfiatOther;
-	tif->tif_fieldscompat[nfields].allocated_size = n;
-	tif->tif_fieldscompat[nfields].count = n;
-	tif->tif_fieldscompat[nfields].fields =
-		(TIFFField *)_TIFFCheckMalloc(tif, n, sizeof(TIFFField),
-					      reason);
-	if (!tif->tif_fieldscompat[nfields].fields) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "Failed to allocate fields array");
-		return -1;
-	}
-
-	tp = tif->tif_fieldscompat[nfields].fields;
-	for (i = 0; i < n; i++) {
-		tp->field_tag = info[i].field_tag;
-		tp->field_readcount = info[i].field_readcount;
-		tp->field_writecount = info[i].field_writecount;
-		tp->field_type = info[i].field_type;
-		tp->reserved = 0;
-		tp->set_field_type =
-		     _TIFFSetGetType(info[i].field_type,
-				info[i].field_readcount,
-				info[i].field_passcount);
-		tp->get_field_type =
-		     _TIFFSetGetType(info[i].field_type,
-				info[i].field_readcount,
-				info[i].field_passcount);
-		tp->field_bit = info[i].field_bit;
-		tp->field_oktochange = info[i].field_oktochange;
-		tp->field_passcount = info[i].field_passcount;
-		tp->field_name = info[i].field_name;
-		tp->field_subfields = NULL;
-		tp++;
-	}
-
-	if (!_TIFFMergeFields(tif, tif->tif_fieldscompat[nfields].fields, n)) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "Setting up field info failed");
-		return -1;
-	}
-
-	return 0;
+    static const char module[] = "TIFFMergeFieldInfo";
+    static const char reason[] = "for fields array";
+    TIFFField *tp;
+    size_t nfields;
+    uint32_t i;
+
+    if (tif->tif_nfieldscompat > 0)
+    {
+        tif->tif_fieldscompat = (TIFFFieldArray *)_TIFFCheckRealloc(
+            tif, tif->tif_fieldscompat, tif->tif_nfieldscompat + 1,
+            sizeof(TIFFFieldArray), reason);
+    }
+    else
+    {
+        tif->tif_fieldscompat = (TIFFFieldArray *)_TIFFCheckMalloc(
+            tif, 1, sizeof(TIFFFieldArray), reason);
+    }
+    if (!tif->tif_fieldscompat)
+    {
+        TIFFErrorExtR(tif, module, "Failed to allocate fields array");
+        return -1;
+    }
+    nfields = tif->tif_nfieldscompat++;
+
+    tif->tif_fieldscompat[nfields].type = tfiatOther;
+    tif->tif_fieldscompat[nfields].allocated_size = n;
+    tif->tif_fieldscompat[nfields].count = n;
+    tif->tif_fieldscompat[nfields].fields =
+        (TIFFField *)_TIFFCheckMalloc(tif, n, sizeof(TIFFField), reason);
+    if (!tif->tif_fieldscompat[nfields].fields)
+    {
+        TIFFErrorExtR(tif, module, "Failed to allocate fields array");
+        return -1;
+    }
+
+    tp = tif->tif_fieldscompat[nfields].fields;
+    for (i = 0; i < n; i++)
+    {
+        tp->field_tag = info[i].field_tag;
+        tp->field_readcount = info[i].field_readcount;
+        tp->field_writecount = info[i].field_writecount;
+        tp->field_type = info[i].field_type;
+        tp->field_anonymous = 0;
+        tp->set_field_type =
+            _TIFFSetGetType(info[i].field_type, info[i].field_readcount,
+                            info[i].field_passcount);
+        tp->get_field_type =
+            _TIFFSetGetType(info[i].field_type, info[i].field_readcount,
+                            info[i].field_passcount);
+        tp->field_bit = info[i].field_bit;
+        tp->field_oktochange = info[i].field_oktochange;
+        tp->field_passcount = info[i].field_passcount;
+        if (info[i].field_name == NULL)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Field_name of %d.th allocation tag %d is NULL", i,
+                          info[i].field_tag);
+            return -1;
+        }
+        tp->field_name = info[i].field_name;
+        tp->field_subfields = NULL;
+        tp++;
+    }
+
+    if (!_TIFFMergeFields(tif, tif->tif_fieldscompat[nfields].fields, n))
+    {
+        TIFFErrorExtR(tif, module, "Setting up field info failed");
+        return -1;
+    }
+
+    return 0;
 }
 
-int
-_TIFFCheckFieldIsValidForCodec(TIFF *tif, ttag_t tag)
+int _TIFFCheckFieldIsValidForCodec(TIFF *tif, ttag_t tag)
 {
-	/* Filter out non-codec specific tags */
-	switch (tag) {
-	    /* Shared tags */
-	    case TIFFTAG_PREDICTOR:
-	    /* JPEG tags */
-	    case TIFFTAG_JPEGTABLES:
-	    /* OJPEG tags */
-	    case TIFFTAG_JPEGIFOFFSET:
-	    case TIFFTAG_JPEGIFBYTECOUNT:
-	    case TIFFTAG_JPEGQTABLES:
-	    case TIFFTAG_JPEGDCTABLES:
-	    case TIFFTAG_JPEGACTABLES:
-	    case TIFFTAG_JPEGPROC:
-	    case TIFFTAG_JPEGRESTARTINTERVAL:
-	    /* CCITT* */
-	    case TIFFTAG_BADFAXLINES:
-	    case TIFFTAG_CLEANFAXDATA:
-	    case TIFFTAG_CONSECUTIVEBADFAXLINES:
-	    case TIFFTAG_GROUP3OPTIONS:
-	    case TIFFTAG_GROUP4OPTIONS:
-	    /* LERC */
-	    case TIFFTAG_LERC_PARAMETERS:
-		break;
-	    default:
-		return 1;
-	}
-	/* Check if codec specific tags are allowed for the current
-	 * compression scheme (codec) */
-	switch (tif->tif_dir.td_compression) {
-	    case COMPRESSION_LZW:
-		if (tag == TIFFTAG_PREDICTOR)
-		    return 1;
-		break;
-	    case COMPRESSION_PACKBITS:
-		/* No codec-specific tags */
-		break;
-	    case COMPRESSION_THUNDERSCAN:
-		/* No codec-specific tags */
-		break;
-	    case COMPRESSION_NEXT:
-		/* No codec-specific tags */
-		break;
-	    case COMPRESSION_JPEG:
-		if (tag == TIFFTAG_JPEGTABLES)
-		    return 1;
-		break;
-	    case COMPRESSION_OJPEG:
-		switch (tag) {
-		    case TIFFTAG_JPEGIFOFFSET:
-		    case TIFFTAG_JPEGIFBYTECOUNT:
-		    case TIFFTAG_JPEGQTABLES:
-		    case TIFFTAG_JPEGDCTABLES:
-		    case TIFFTAG_JPEGACTABLES:
-		    case TIFFTAG_JPEGPROC:
-		    case TIFFTAG_JPEGRESTARTINTERVAL:
-			return 1;
-		}
-		break;
-	    case COMPRESSION_CCITTRLE:
-	    case COMPRESSION_CCITTRLEW:
-	    case COMPRESSION_CCITTFAX3:
-	    case COMPRESSION_CCITTFAX4:
-		switch (tag) {
-		    case TIFFTAG_BADFAXLINES:
-		    case TIFFTAG_CLEANFAXDATA:
-		    case TIFFTAG_CONSECUTIVEBADFAXLINES:
-			return 1;
-		    case TIFFTAG_GROUP3OPTIONS:
-			if (tif->tif_dir.td_compression == COMPRESSION_CCITTFAX3)
-			    return 1;
-			break;
-		    case TIFFTAG_GROUP4OPTIONS:
-			if (tif->tif_dir.td_compression == COMPRESSION_CCITTFAX4)
-			    return 1;
-			break;
-		}
-		break;
-	    case COMPRESSION_JBIG:
-		/* No codec-specific tags */
-		break;
-	    case COMPRESSION_DEFLATE:
-	    case COMPRESSION_ADOBE_DEFLATE:
-		if (tag == TIFFTAG_PREDICTOR)
-		    return 1;
-		break;
-	   case COMPRESSION_PIXARLOG:
-		if (tag == TIFFTAG_PREDICTOR)
-		    return 1;
-		break;
-	    case COMPRESSION_SGILOG:
-	    case COMPRESSION_SGILOG24:
-		/* No codec-specific tags */
-		break;
-	    case COMPRESSION_LZMA:
-		if (tag == TIFFTAG_PREDICTOR)
-		    return 1;
-		break;
-	    case COMPRESSION_ZSTD:
-		if (tag == TIFFTAG_PREDICTOR)
-		    return 1;
-		break;
-	    case COMPRESSION_LERC:
-		if (tag == TIFFTAG_LERC_PARAMETERS)
-		    return 1;
-		break;
-	}
-	return 0;
+    /* Filter out non-codec specific tags */
+    switch (tag)
+    {
+        /* Shared tags */
+        case TIFFTAG_PREDICTOR:
+        /* JPEG tags */
+        case TIFFTAG_JPEGTABLES:
+        /* OJPEG tags */
+        case TIFFTAG_JPEGIFOFFSET:
+        case TIFFTAG_JPEGIFBYTECOUNT:
+        case TIFFTAG_JPEGQTABLES:
+        case TIFFTAG_JPEGDCTABLES:
+        case TIFFTAG_JPEGACTABLES:
+        case TIFFTAG_JPEGPROC:
+        case TIFFTAG_JPEGRESTARTINTERVAL:
+        /* CCITT* */
+        case TIFFTAG_BADFAXLINES:
+        case TIFFTAG_CLEANFAXDATA:
+        case TIFFTAG_CONSECUTIVEBADFAXLINES:
+        case TIFFTAG_GROUP3OPTIONS:
+        case TIFFTAG_GROUP4OPTIONS:
+        /* LERC */
+        case TIFFTAG_LERC_PARAMETERS:
+            break;
+        default:
+            return 1;
+    }
+    if (!TIFFIsCODECConfigured(tif->tif_dir.td_compression))
+    {
+        return 0;
+    }
+    /* Check if codec specific tags are allowed for the current
+     * compression scheme (codec) */
+    switch (tif->tif_dir.td_compression)
+    {
+        case COMPRESSION_LZW:
+            if (tag == TIFFTAG_PREDICTOR)
+                return 1;
+            break;
+        case COMPRESSION_PACKBITS:
+            /* No codec-specific tags */
+            break;
+        case COMPRESSION_THUNDERSCAN:
+            /* No codec-specific tags */
+            break;
+        case COMPRESSION_NEXT:
+            /* No codec-specific tags */
+            break;
+        case COMPRESSION_JPEG:
+            if (tag == TIFFTAG_JPEGTABLES)
+                return 1;
+            break;
+        case COMPRESSION_OJPEG:
+            switch (tag)
+            {
+                case TIFFTAG_JPEGIFOFFSET:
+                case TIFFTAG_JPEGIFBYTECOUNT:
+                case TIFFTAG_JPEGQTABLES:
+                case TIFFTAG_JPEGDCTABLES:
+                case TIFFTAG_JPEGACTABLES:
+                case TIFFTAG_JPEGPROC:
+                case TIFFTAG_JPEGRESTARTINTERVAL:
+                    return 1;
+            }
+            break;
+        case COMPRESSION_CCITTRLE:
+        case COMPRESSION_CCITTRLEW:
+        case COMPRESSION_CCITTFAX3:
+        case COMPRESSION_CCITTFAX4:
+            switch (tag)
+            {
+                case TIFFTAG_BADFAXLINES:
+                case TIFFTAG_CLEANFAXDATA:
+                case TIFFTAG_CONSECUTIVEBADFAXLINES:
+                    return 1;
+                case TIFFTAG_GROUP3OPTIONS:
+                    if (tif->tif_dir.td_compression == COMPRESSION_CCITTFAX3)
+                        return 1;
+                    break;
+                case TIFFTAG_GROUP4OPTIONS:
+                    if (tif->tif_dir.td_compression == COMPRESSION_CCITTFAX4)
+                        return 1;
+                    break;
+            }
+            break;
+        case COMPRESSION_JBIG:
+            /* No codec-specific tags */
+            break;
+        case COMPRESSION_DEFLATE:
+        case COMPRESSION_ADOBE_DEFLATE:
+            if (tag == TIFFTAG_PREDICTOR)
+                return 1;
+            break;
+        case COMPRESSION_PIXARLOG:
+            if (tag == TIFFTAG_PREDICTOR)
+                return 1;
+            break;
+        case COMPRESSION_SGILOG:
+        case COMPRESSION_SGILOG24:
+            /* No codec-specific tags */
+            break;
+        case COMPRESSION_LZMA:
+            if (tag == TIFFTAG_PREDICTOR)
+                return 1;
+            break;
+        case COMPRESSION_ZSTD:
+            if (tag == TIFFTAG_PREDICTOR)
+                return 1;
+            break;
+        case COMPRESSION_LERC:
+            if (tag == TIFFTAG_LERC_PARAMETERS)
+                return 1;
+            break;
+    }
+    return 0;
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_dirread.c b/3rdparty/libtiff/tif_dirread.c
index ba127ca9174b..2c49dc6aa026 100644
--- a/3rdparty/libtiff/tif_dirread.c
+++ b/3rdparty/libtiff/tif_dirread.c
@@ -34,5839 +34,7250 @@
  *   TIFFReadDirectory, so as to eliminate current possibly repetitive lookup.
  */
 
+#include "tiffconf.h"
 #include "tiffiop.h"
 #include <float.h>
+#include <limits.h>
 #include <stdlib.h>
+#include <string.h>
 
-#define FAILED_FII    ((uint32) -1)
-
-/*
- * Largest 64-bit signed integer value.
- */
-#define TIFF_INT64_MAX ((int64)(TIFF_UINT64_MAX >> 1))
+#define FAILED_FII ((uint32_t)-1)
 
 #ifdef HAVE_IEEEFP
-# define TIFFCvtIEEEFloatToNative(tif, n, fp)
-# define TIFFCvtIEEEDoubleToNative(tif, n, dp)
+#define TIFFCvtIEEEFloatToNative(tif, n, fp)
+#define TIFFCvtIEEEDoubleToNative(tif, n, dp)
 #else
-extern void TIFFCvtIEEEFloatToNative(TIFF*, uint32, float*);
-extern void TIFFCvtIEEEDoubleToNative(TIFF*, uint32, double*);
+extern void TIFFCvtIEEEFloatToNative(TIFF *, uint32_t, float *);
+extern void TIFFCvtIEEEDoubleToNative(TIFF *, uint32_t, double *);
 #endif
 
-enum TIFFReadDirEntryErr {
-	TIFFReadDirEntryErrOk = 0,
-	TIFFReadDirEntryErrCount = 1,
-	TIFFReadDirEntryErrType = 2,
-	TIFFReadDirEntryErrIo = 3,
-	TIFFReadDirEntryErrRange = 4,
-	TIFFReadDirEntryErrPsdif = 5,
-	TIFFReadDirEntryErrSizesan = 6,
-	TIFFReadDirEntryErrAlloc = 7,
+enum TIFFReadDirEntryErr
+{
+    TIFFReadDirEntryErrOk = 0,
+    TIFFReadDirEntryErrCount = 1,
+    TIFFReadDirEntryErrType = 2,
+    TIFFReadDirEntryErrIo = 3,
+    TIFFReadDirEntryErrRange = 4,
+    TIFFReadDirEntryErrPsdif = 5,
+    TIFFReadDirEntryErrSizesan = 6,
+    TIFFReadDirEntryErrAlloc = 7,
 };
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryByte(TIFF* tif, TIFFDirEntry* direntry, uint8* value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryShort(TIFF* tif, TIFFDirEntry* direntry, uint16* value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryLong(TIFF* tif, TIFFDirEntry* direntry, uint32* value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryLong8(TIFF* tif, TIFFDirEntry* direntry, uint64* value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryFloat(TIFF* tif, TIFFDirEntry* direntry, float* value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryDouble(TIFF* tif, TIFFDirEntry* direntry, double* value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryIfd8(TIFF* tif, TIFFDirEntry* direntry, uint64* value);
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryArray(TIFF* tif, TIFFDirEntry* direntry, uint32* count, uint32 desttypesize, void** value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryByteArray(TIFF* tif, TIFFDirEntry* direntry, uint8** value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntrySbyteArray(TIFF* tif, TIFFDirEntry* direntry, int8** value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryShortArray(TIFF* tif, TIFFDirEntry* direntry, uint16** value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntrySshortArray(TIFF* tif, TIFFDirEntry* direntry, int16** value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryLongArray(TIFF* tif, TIFFDirEntry* direntry, uint32** value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntrySlongArray(TIFF* tif, TIFFDirEntry* direntry, int32** value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryLong8Array(TIFF* tif, TIFFDirEntry* direntry, uint64** value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntrySlong8Array(TIFF* tif, TIFFDirEntry* direntry, int64** value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryFloatArray(TIFF* tif, TIFFDirEntry* direntry, float** value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryDoubleArray(TIFF* tif, TIFFDirEntry* direntry, double** value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryIfd8Array(TIFF* tif, TIFFDirEntry* direntry, uint64** value);
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryPersampleShort(TIFF* tif, TIFFDirEntry* direntry, uint16* value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryByte(TIFF *tif, TIFFDirEntry *direntry, uint8_t *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySbyte(TIFF *tif, TIFFDirEntry *direntry, int8_t *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryShort(TIFF *tif, TIFFDirEntry *direntry, uint16_t *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySshort(TIFF *tif, TIFFDirEntry *direntry, int16_t *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryLong(TIFF *tif, TIFFDirEntry *direntry, uint32_t *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySlong(TIFF *tif, TIFFDirEntry *direntry, int32_t *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryLong8(TIFF *tif, TIFFDirEntry *direntry, uint64_t *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySlong8(TIFF *tif, TIFFDirEntry *direntry, int64_t *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryFloat(TIFF *tif, TIFFDirEntry *direntry, float *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryDouble(TIFF *tif, TIFFDirEntry *direntry, double *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryIfd8(TIFF *tif, TIFFDirEntry *direntry, uint64_t *value);
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryArray(TIFF *tif, TIFFDirEntry *direntry, uint32_t *count,
+                      uint32_t desttypesize, void **value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryByteArray(TIFF *tif, TIFFDirEntry *direntry, uint8_t **value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySbyteArray(TIFF *tif, TIFFDirEntry *direntry, int8_t **value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryShortArray(TIFF *tif, TIFFDirEntry *direntry, uint16_t **value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySshortArray(TIFF *tif, TIFFDirEntry *direntry, int16_t **value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryLongArray(TIFF *tif, TIFFDirEntry *direntry, uint32_t **value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySlongArray(TIFF *tif, TIFFDirEntry *direntry, int32_t **value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryLong8Array(TIFF *tif, TIFFDirEntry *direntry, uint64_t **value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySlong8Array(TIFF *tif, TIFFDirEntry *direntry, int64_t **value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryFloatArray(TIFF *tif, TIFFDirEntry *direntry, float **value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryDoubleArray(TIFF *tif, TIFFDirEntry *direntry, double **value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryIfd8Array(TIFF *tif, TIFFDirEntry *direntry, uint64_t **value);
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryPersampleShort(TIFF *tif, TIFFDirEntry *direntry,
+                               uint16_t *value);
+
+static void TIFFReadDirEntryCheckedByte(TIFF *tif, TIFFDirEntry *direntry,
+                                        uint8_t *value);
+static void TIFFReadDirEntryCheckedSbyte(TIFF *tif, TIFFDirEntry *direntry,
+                                         int8_t *value);
+static void TIFFReadDirEntryCheckedShort(TIFF *tif, TIFFDirEntry *direntry,
+                                         uint16_t *value);
+static void TIFFReadDirEntryCheckedSshort(TIFF *tif, TIFFDirEntry *direntry,
+                                          int16_t *value);
+static void TIFFReadDirEntryCheckedLong(TIFF *tif, TIFFDirEntry *direntry,
+                                        uint32_t *value);
+static void TIFFReadDirEntryCheckedSlong(TIFF *tif, TIFFDirEntry *direntry,
+                                         int32_t *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckedLong8(TIFF *tif, TIFFDirEntry *direntry,
+                             uint64_t *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckedSlong8(TIFF *tif, TIFFDirEntry *direntry,
+                              int64_t *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckedRational(TIFF *tif, TIFFDirEntry *direntry,
+                                double *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckedSrational(TIFF *tif, TIFFDirEntry *direntry,
+                                 double *value);
+static void TIFFReadDirEntryCheckedFloat(TIFF *tif, TIFFDirEntry *direntry,
+                                         float *value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckedDouble(TIFF *tif, TIFFDirEntry *direntry, double *value);
 #if 0
-static enum TIFFReadDirEntryErr TIFFReadDirEntryPersampleDouble(TIFF* tif, TIFFDirEntry* direntry, double* value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckedRationalDirect(TIFF *tif, TIFFDirEntry *direntry,
+                                      TIFFRational_t *value);
 #endif
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteSbyte(int8_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteShort(uint16_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteSshort(int16_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteLong(uint32_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteSlong(int32_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteLong8(uint64_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteSlong8(int64_t value);
 
-static void TIFFReadDirEntryCheckedByte(TIFF* tif, TIFFDirEntry* direntry, uint8* value);
-static void TIFFReadDirEntryCheckedSbyte(TIFF* tif, TIFFDirEntry* direntry, int8* value);
-static void TIFFReadDirEntryCheckedShort(TIFF* tif, TIFFDirEntry* direntry, uint16* value);
-static void TIFFReadDirEntryCheckedSshort(TIFF* tif, TIFFDirEntry* direntry, int16* value);
-static void TIFFReadDirEntryCheckedLong(TIFF* tif, TIFFDirEntry* direntry, uint32* value);
-static void TIFFReadDirEntryCheckedSlong(TIFF* tif, TIFFDirEntry* direntry, int32* value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckedLong8(TIFF* tif, TIFFDirEntry* direntry, uint64* value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckedSlong8(TIFF* tif, TIFFDirEntry* direntry, int64* value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckedRational(TIFF* tif, TIFFDirEntry* direntry, double* value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckedSrational(TIFF* tif, TIFFDirEntry* direntry, double* value);
-static void TIFFReadDirEntryCheckedFloat(TIFF* tif, TIFFDirEntry* direntry, float* value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckedDouble(TIFF* tif, TIFFDirEntry* direntry, double* value);
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteSbyte(int8 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteShort(uint16 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteSshort(int16 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteLong(uint32 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteSlong(int32 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteLong8(uint64 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteSlong8(int64 value);
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteByte(uint8 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteShort(uint16 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteSshort(int16 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteLong(uint32 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteSlong(int32 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteLong8(uint64 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteSlong8(int64 value);
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeShortSbyte(int8 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeShortSshort(int16 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeShortLong(uint32 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeShortSlong(int32 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeShortLong8(uint64 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeShortSlong8(int64 value);
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSshortShort(uint16 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSshortLong(uint32 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSshortSlong(int32 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSshortLong8(uint64 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSshortSlong8(int64 value);
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLongSbyte(int8 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLongSshort(int16 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLongSlong(int32 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLongLong8(uint64 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLongSlong8(int64 value);
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSlongLong(uint32 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSlongLong8(uint64 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSlongSlong8(int64 value);
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLong8Sbyte(int8 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLong8Sshort(int16 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLong8Slong(int32 value);
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLong8Slong8(int64 value);
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSlong8Long8(uint64 value);
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryData(TIFF* tif, uint64 offset, tmsize_t size, void* dest);
-static void TIFFReadDirEntryOutputErr(TIFF* tif, enum TIFFReadDirEntryErr err, const char* module, const char* tagname, int recover);
-
-static void TIFFReadDirectoryCheckOrder(TIFF* tif, TIFFDirEntry* dir, uint16 dircount);
-static TIFFDirEntry* TIFFReadDirectoryFindEntry(TIFF* tif, TIFFDirEntry* dir, uint16 dircount, uint16 tagid);
-static void TIFFReadDirectoryFindFieldInfo(TIFF* tif, uint16 tagid, uint32* fii);
-
-static int EstimateStripByteCounts(TIFF* tif, TIFFDirEntry* dir, uint16 dircount);
-static void MissingRequired(TIFF*, const char*);
-static int TIFFCheckDirOffset(TIFF* tif, uint64 diroff);
-static int CheckDirCount(TIFF*, TIFFDirEntry*, uint32);
-static uint16 TIFFFetchDirectory(TIFF* tif, uint64 diroff, TIFFDirEntry** pdir, uint64* nextdiroff);
-static int TIFFFetchNormalTag(TIFF*, TIFFDirEntry*, int recover);
-static int TIFFFetchStripThing(TIFF* tif, TIFFDirEntry* dir, uint32 nstrips, uint64** lpp);
-static int TIFFFetchSubjectDistance(TIFF*, TIFFDirEntry*);
-static void ChopUpSingleUncompressedStrip(TIFF*);
-static void TryChopUpUncompressedBigTiff(TIFF*);
-static uint64 TIFFReadUInt64(const uint8 *value);
-static int _TIFFGetMaxColorChannels(uint16 photometric);
-
-static int _TIFFFillStrilesInternal( TIFF *tif, int loadStripByteCount );
-
-typedef union _UInt64Aligned_t
-{
-        double d;
-	uint64 l;
-	uint32 i[2];
-	uint16 s[4];
-	uint8  c[8];
-} UInt64Aligned_t;
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteByte(uint8_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteShort(uint16_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteSshort(int16_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteLong(uint32_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteSlong(int32_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteLong8(uint64_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteSlong8(int64_t value);
 
-/*
-  Unaligned safe copy of a uint64 value from an octet array.
-*/
-static uint64 TIFFReadUInt64(const uint8 *value)
-{
-	UInt64Aligned_t result;
-
-	result.c[0]=value[0];
-	result.c[1]=value[1];
-	result.c[2]=value[2];
-	result.c[3]=value[3];
-	result.c[4]=value[4];
-	result.c[5]=value[5];
-	result.c[6]=value[6];
-	result.c[7]=value[7];
-
-	return result.l;
-}
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeShortSbyte(int8_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeShortSshort(int16_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeShortLong(uint32_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeShortSlong(int32_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeShortLong8(uint64_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeShortSlong8(int64_t value);
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryByte(TIFF* tif, TIFFDirEntry* direntry, uint8* value)
-{
-	enum TIFFReadDirEntryErr err;
-	if (direntry->tdir_count!=1)
-		return(TIFFReadDirEntryErrCount);
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-		case TIFF_UNDEFINED:	/* Support to read TIFF_UNDEFINED with field_readcount==1 */
-			TIFFReadDirEntryCheckedByte(tif,direntry,value);
-			return(TIFFReadDirEntryErrOk);
-		case TIFF_SBYTE:
-			{
-				int8 m;
-				TIFFReadDirEntryCheckedSbyte(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeByteSbyte(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint8)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SHORT:
-			{
-				uint16 m;
-				TIFFReadDirEntryCheckedShort(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeByteShort(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint8)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SSHORT:
-			{
-				int16 m;
-				TIFFReadDirEntryCheckedSshort(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeByteSshort(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint8)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG:
-			{
-				uint32 m;
-				TIFFReadDirEntryCheckedLong(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeByteLong(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint8)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SLONG:
-			{
-				int32 m;
-				TIFFReadDirEntryCheckedSlong(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeByteSlong(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint8)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG8:
-			{
-				uint64 m;
-				err=TIFFReadDirEntryCheckedLong8(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				err=TIFFReadDirEntryCheckRangeByteLong8(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint8)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SLONG8:
-			{
-				int64 m;
-				err=TIFFReadDirEntryCheckedSlong8(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				err=TIFFReadDirEntryCheckRangeByteSlong8(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint8)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-}
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSshortShort(uint16_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSshortLong(uint32_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSshortSlong(int32_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSshortLong8(uint64_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSshortSlong8(int64_t value);
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryShort(TIFF* tif, TIFFDirEntry* direntry, uint16* value)
-{
-	enum TIFFReadDirEntryErr err;
-	if (direntry->tdir_count!=1)
-		return(TIFFReadDirEntryErrCount);
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8 m;
-				TIFFReadDirEntryCheckedByte(tif,direntry,&m);
-				*value=(uint16)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SBYTE:
-			{
-				int8 m;
-				TIFFReadDirEntryCheckedSbyte(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeShortSbyte(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint16)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SHORT:
-			TIFFReadDirEntryCheckedShort(tif,direntry,value);
-			return(TIFFReadDirEntryErrOk);
-		case TIFF_SSHORT:
-			{
-				int16 m;
-				TIFFReadDirEntryCheckedSshort(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeShortSshort(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint16)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG:
-			{
-				uint32 m;
-				TIFFReadDirEntryCheckedLong(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeShortLong(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint16)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SLONG:
-			{
-				int32 m;
-				TIFFReadDirEntryCheckedSlong(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeShortSlong(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint16)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG8:
-			{
-				uint64 m;
-				err=TIFFReadDirEntryCheckedLong8(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				err=TIFFReadDirEntryCheckRangeShortLong8(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint16)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SLONG8:
-			{
-				int64 m;
-				err=TIFFReadDirEntryCheckedSlong8(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				err=TIFFReadDirEntryCheckRangeShortSlong8(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint16)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-}
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLongSbyte(int8_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLongSshort(int16_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLongSlong(int32_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLongLong8(uint64_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLongSlong8(int64_t value);
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryLong(TIFF* tif, TIFFDirEntry* direntry, uint32* value)
-{
-	enum TIFFReadDirEntryErr err;
-	if (direntry->tdir_count!=1)
-		return(TIFFReadDirEntryErrCount);
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8 m;
-				TIFFReadDirEntryCheckedByte(tif,direntry,&m);
-				*value=(uint32)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SBYTE:
-			{
-				int8 m;
-				TIFFReadDirEntryCheckedSbyte(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeLongSbyte(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint32)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SHORT:
-			{
-				uint16 m;
-				TIFFReadDirEntryCheckedShort(tif,direntry,&m);
-				*value=(uint32)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SSHORT:
-			{
-				int16 m;
-				TIFFReadDirEntryCheckedSshort(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeLongSshort(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint32)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG:
-			TIFFReadDirEntryCheckedLong(tif,direntry,value);
-			return(TIFFReadDirEntryErrOk);
-		case TIFF_SLONG:
-			{
-				int32 m;
-				TIFFReadDirEntryCheckedSlong(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeLongSlong(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint32)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG8:
-			{
-				uint64 m;
-				err=TIFFReadDirEntryCheckedLong8(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				err=TIFFReadDirEntryCheckRangeLongLong8(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint32)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SLONG8:
-			{
-				int64 m;
-				err=TIFFReadDirEntryCheckedSlong8(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				err=TIFFReadDirEntryCheckRangeLongSlong8(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint32)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-}
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSlongLong(uint32_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSlongLong8(uint64_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSlongSlong8(int64_t value);
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryLong8(TIFF* tif, TIFFDirEntry* direntry, uint64* value)
-{
-	enum TIFFReadDirEntryErr err;
-	if (direntry->tdir_count!=1)
-		return(TIFFReadDirEntryErrCount);
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8 m;
-				TIFFReadDirEntryCheckedByte(tif,direntry,&m);
-				*value=(uint64)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SBYTE:
-			{
-				int8 m;
-				TIFFReadDirEntryCheckedSbyte(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeLong8Sbyte(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint64)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SHORT:
-			{
-				uint16 m;
-				TIFFReadDirEntryCheckedShort(tif,direntry,&m);
-				*value=(uint64)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SSHORT:
-			{
-				int16 m;
-				TIFFReadDirEntryCheckedSshort(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeLong8Sshort(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint64)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG:
-			{
-				uint32 m;
-				TIFFReadDirEntryCheckedLong(tif,direntry,&m);
-				*value=(uint64)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SLONG:
-			{
-				int32 m;
-				TIFFReadDirEntryCheckedSlong(tif,direntry,&m);
-				err=TIFFReadDirEntryCheckRangeLong8Slong(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint64)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG8:
-			err=TIFFReadDirEntryCheckedLong8(tif,direntry,value);
-			return(err);
-		case TIFF_SLONG8:
-			{
-				int64 m;
-				err=TIFFReadDirEntryCheckedSlong8(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				err=TIFFReadDirEntryCheckRangeLong8Slong8(m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(uint64)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-}
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLong8Sbyte(int8_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLong8Sshort(int16_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLong8Slong(int32_t value);
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLong8Slong8(int64_t value);
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryFloat(TIFF* tif, TIFFDirEntry* direntry, float* value)
-{
-	enum TIFFReadDirEntryErr err;
-	if (direntry->tdir_count!=1)
-		return(TIFFReadDirEntryErrCount);
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8 m;
-				TIFFReadDirEntryCheckedByte(tif,direntry,&m);
-				*value=(float)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SBYTE:
-			{
-				int8 m;
-				TIFFReadDirEntryCheckedSbyte(tif,direntry,&m);
-				*value=(float)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SHORT:
-			{
-				uint16 m;
-				TIFFReadDirEntryCheckedShort(tif,direntry,&m);
-				*value=(float)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SSHORT:
-			{
-				int16 m;
-				TIFFReadDirEntryCheckedSshort(tif,direntry,&m);
-				*value=(float)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG:
-			{
-				uint32 m;
-				TIFFReadDirEntryCheckedLong(tif,direntry,&m);
-				*value=(float)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SLONG:
-			{
-				int32 m;
-				TIFFReadDirEntryCheckedSlong(tif,direntry,&m);
-				*value=(float)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG8:
-			{
-				uint64 m;
-				err=TIFFReadDirEntryCheckedLong8(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-#if defined(__WIN32__) && (_MSC_VER < 1500)
-				/*
-				 * XXX: MSVC 6.0 does not support conversion
-				 * of 64-bit integers into floating point
-				 * values.
-				 */
-				*value = _TIFFUInt64ToFloat(m);
-#else
-				*value=(float)m;
-#endif
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SLONG8:
-			{
-				int64 m;
-				err=TIFFReadDirEntryCheckedSlong8(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(float)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_RATIONAL:
-			{
-				double m;
-				err=TIFFReadDirEntryCheckedRational(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(float)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SRATIONAL:
-			{
-				double m;
-				err=TIFFReadDirEntryCheckedSrational(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(float)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_FLOAT:
-			TIFFReadDirEntryCheckedFloat(tif,direntry,value);
-			return(TIFFReadDirEntryErrOk);
-		case TIFF_DOUBLE:
-			{
-				double m;
-				err=TIFFReadDirEntryCheckedDouble(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				if ((m > FLT_MAX) || (m < -FLT_MAX))
-					return(TIFFReadDirEntryErrRange);
-				*value=(float)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-}
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSlong8Long8(uint64_t value);
+
+static enum TIFFReadDirEntryErr TIFFReadDirEntryData(TIFF *tif, uint64_t offset,
+                                                     tmsize_t size, void *dest);
+static void TIFFReadDirEntryOutputErr(TIFF *tif, enum TIFFReadDirEntryErr err,
+                                      const char *module, const char *tagname,
+                                      int recover);
+
+static void TIFFReadDirectoryCheckOrder(TIFF *tif, TIFFDirEntry *dir,
+                                        uint16_t dircount);
+static TIFFDirEntry *TIFFReadDirectoryFindEntry(TIFF *tif, TIFFDirEntry *dir,
+                                                uint16_t dircount,
+                                                uint16_t tagid);
+static void TIFFReadDirectoryFindFieldInfo(TIFF *tif, uint16_t tagid,
+                                           uint32_t *fii);
+
+static int EstimateStripByteCounts(TIFF *tif, TIFFDirEntry *dir,
+                                   uint16_t dircount);
+static void MissingRequired(TIFF *, const char *);
+static int CheckDirCount(TIFF *, TIFFDirEntry *, uint32_t);
+static uint16_t TIFFFetchDirectory(TIFF *tif, uint64_t diroff,
+                                   TIFFDirEntry **pdir, uint64_t *nextdiroff);
+static int TIFFFetchNormalTag(TIFF *, TIFFDirEntry *, int recover);
+static int TIFFFetchStripThing(TIFF *tif, TIFFDirEntry *dir, uint32_t nstrips,
+                               uint64_t **lpp);
+static int TIFFFetchSubjectDistance(TIFF *, TIFFDirEntry *);
+static void ChopUpSingleUncompressedStrip(TIFF *);
+static void TryChopUpUncompressedBigTiff(TIFF *);
+static uint64_t TIFFReadUInt64(const uint8_t *value);
+static int _TIFFGetMaxColorChannels(uint16_t photometric);
+
+static int _TIFFFillStrilesInternal(TIFF *tif, int loadStripByteCount);
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryDouble(TIFF* tif, TIFFDirEntry* direntry, double* value)
+typedef union _UInt64Aligned_t
 {
-	enum TIFFReadDirEntryErr err;
-	if (direntry->tdir_count!=1)
-		return(TIFFReadDirEntryErrCount);
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8 m;
-				TIFFReadDirEntryCheckedByte(tif,direntry,&m);
-				*value=(double)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SBYTE:
-			{
-				int8 m;
-				TIFFReadDirEntryCheckedSbyte(tif,direntry,&m);
-				*value=(double)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SHORT:
-			{
-				uint16 m;
-				TIFFReadDirEntryCheckedShort(tif,direntry,&m);
-				*value=(double)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SSHORT:
-			{
-				int16 m;
-				TIFFReadDirEntryCheckedSshort(tif,direntry,&m);
-				*value=(double)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG:
-			{
-				uint32 m;
-				TIFFReadDirEntryCheckedLong(tif,direntry,&m);
-				*value=(double)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SLONG:
-			{
-				int32 m;
-				TIFFReadDirEntryCheckedSlong(tif,direntry,&m);
-				*value=(double)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG8:
-			{
-				uint64 m;
-				err=TIFFReadDirEntryCheckedLong8(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-#if defined(__WIN32__) && (_MSC_VER < 1500)
-				/*
-				 * XXX: MSVC 6.0 does not support conversion
-				 * of 64-bit integers into floating point
-				 * values.
-				 */
-				*value = _TIFFUInt64ToDouble(m);
-#else
-				*value = (double)m;
-#endif
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SLONG8:
-			{
-				int64 m;
-				err=TIFFReadDirEntryCheckedSlong8(tif,direntry,&m);
-				if (err!=TIFFReadDirEntryErrOk)
-					return(err);
-				*value=(double)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_RATIONAL:
-			err=TIFFReadDirEntryCheckedRational(tif,direntry,value);
-			return(err);
-		case TIFF_SRATIONAL:
-			err=TIFFReadDirEntryCheckedSrational(tif,direntry,value);
-			return(err);
-		case TIFF_FLOAT:
-			{
-				float m;
-				TIFFReadDirEntryCheckedFloat(tif,direntry,&m);
-				*value=(double)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_DOUBLE:
-			err=TIFFReadDirEntryCheckedDouble(tif,direntry,value);
-			return(err);
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-}
+    double d;
+    uint64_t l;
+    uint32_t i[2];
+    uint16_t s[4];
+    uint8_t c[8];
+} UInt64Aligned_t;
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryIfd8(TIFF* tif, TIFFDirEntry* direntry, uint64* value)
+/*
+  Unaligned safe copy of a uint64_t value from an octet array.
+*/
+static uint64_t TIFFReadUInt64(const uint8_t *value)
 {
-	enum TIFFReadDirEntryErr err;
-	if (direntry->tdir_count!=1)
-		return(TIFFReadDirEntryErrCount);
-	switch (direntry->tdir_type)
-	{
-		case TIFF_LONG:
-		case TIFF_IFD:
-			{
-				uint32 m;
-				TIFFReadDirEntryCheckedLong(tif,direntry,&m);
-				*value=(uint64)m;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_LONG8:
-		case TIFF_IFD8:
-			err=TIFFReadDirEntryCheckedLong8(tif,direntry,value);
-			return(err);
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
+    UInt64Aligned_t result;
+
+    result.c[0] = value[0];
+    result.c[1] = value[1];
+    result.c[2] = value[2];
+    result.c[3] = value[3];
+    result.c[4] = value[4];
+    result.c[5] = value[5];
+    result.c[6] = value[6];
+    result.c[7] = value[7];
+
+    return result.l;
 }
 
-
-#define INITIAL_THRESHOLD (1024 * 1024)
-#define THRESHOLD_MULTIPLIER 10
-#define MAX_THRESHOLD (THRESHOLD_MULTIPLIER * THRESHOLD_MULTIPLIER * THRESHOLD_MULTIPLIER * INITIAL_THRESHOLD)
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryDataAndRealloc(
-                    TIFF* tif, uint64 offset, tmsize_t size, void** pdest)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryByte(TIFF *tif, TIFFDirEntry *direntry, uint8_t *value)
 {
-#if SIZEOF_SIZE_T == 8
-        tmsize_t threshold = INITIAL_THRESHOLD;
-#endif
-        tmsize_t already_read = 0;
-
-        assert( !isMapped(tif) );
-
-        if (!SeekOK(tif,offset))
-                return(TIFFReadDirEntryErrIo);
-
-        /* On 64 bit processes, read first a maximum of 1 MB, then 10 MB, etc */
-        /* so as to avoid allocating too much memory in case the file is too */
-        /* short. We could ask for the file size, but this might be */
-        /* expensive with some I/O layers (think of reading a gzipped file) */
-        /* Restrict to 64 bit processes, so as to avoid reallocs() */
-        /* on 32 bit processes where virtual memory is scarce.  */
-        while( already_read < size )
+    enum TIFFReadDirEntryErr err;
+    if (direntry->tdir_count != 1)
+        return (TIFFReadDirEntryErrCount);
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        case TIFF_UNDEFINED: /* Support to read TIFF_UNDEFINED with
+                                field_readcount==1 */
+            TIFFReadDirEntryCheckedByte(tif, direntry, value);
+            return (TIFFReadDirEntryErrOk);
+        case TIFF_SBYTE:
         {
-            void* new_dest;
-            tmsize_t bytes_read;
-            tmsize_t to_read = size - already_read;
-#if SIZEOF_SIZE_T == 8
-            if( to_read >= threshold && threshold < MAX_THRESHOLD )
-            {
-                to_read = threshold;
-                threshold *= THRESHOLD_MULTIPLIER;
-            }
-#endif
-
-            new_dest = (uint8*) _TIFFrealloc(
-                            *pdest, already_read + to_read);
-            if( new_dest == NULL )
-            {
-                TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-                            "Failed to allocate memory for %s "
-                            "(%ld elements of %ld bytes each)",
-                            "TIFFReadDirEntryArray",
-                             (long) 1, (long) (already_read + to_read));
-                return TIFFReadDirEntryErrAlloc;
-            }
-            *pdest = new_dest;
-
-            bytes_read = TIFFReadFile(tif,
-                (char*)*pdest + already_read, to_read);
-            already_read += bytes_read;
-            if (bytes_read != to_read) {
-                return TIFFReadDirEntryErrIo;
-            }
+            int8_t m;
+            TIFFReadDirEntryCheckedSbyte(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeByteSbyte(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint8_t)m;
+            return (TIFFReadDirEntryErrOk);
         }
-        return TIFFReadDirEntryErrOk;
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryArrayWithLimit(
-    TIFF* tif, TIFFDirEntry* direntry, uint32* count, uint32 desttypesize,
-    void** value, uint64 maxcount)
-{
-	int typesize;
-	uint32 datasize;
-	void* data;
-        uint64 target_count64;
-        int original_datasize_clamped;
-	typesize=TIFFDataWidth(direntry->tdir_type);
-
-        target_count64 = (direntry->tdir_count > maxcount) ?
-                maxcount : direntry->tdir_count;
-
-	if ((target_count64==0)||(typesize==0))
-	{
-		*value=0;
-		return(TIFFReadDirEntryErrOk);
-	}
-        (void) desttypesize;
-
-        /* We just want to know if the original tag size is more than 4 bytes
-         * (classic TIFF) or 8 bytes (BigTIFF)
-         */
-        original_datasize_clamped =
-            ((direntry->tdir_count > 10) ? 10 : (int)direntry->tdir_count) * typesize;
-
-        /* 
-         * As a sanity check, make sure we have no more than a 2GB tag array 
-         * in either the current data type or the dest data type.  This also
-         * avoids problems with overflow of tmsize_t on 32bit systems.
-         */
-	if ((uint64)(2147483647/typesize)<target_count64)
-		return(TIFFReadDirEntryErrSizesan);
-	if ((uint64)(2147483647/desttypesize)<target_count64)
-		return(TIFFReadDirEntryErrSizesan);
-
-	*count=(uint32)target_count64;
-	datasize=(*count)*typesize;
-	assert((tmsize_t)datasize>0);
-
-	if( isMapped(tif) && datasize > (uint64)tif->tif_size )
-		return TIFFReadDirEntryErrIo;
-
-	if( !isMapped(tif) &&
-		(((tif->tif_flags&TIFF_BIGTIFF) && datasize > 8) ||
-		(!(tif->tif_flags&TIFF_BIGTIFF) && datasize > 4)) )
-	{
-		data = NULL;
-	}
-	else
-	{
-		data=_TIFFCheckMalloc(tif, *count, typesize, "ReadDirEntryArray");
-		if (data==0)
-			return(TIFFReadDirEntryErrAlloc);
-	}
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-	{
-		if (original_datasize_clamped<=4)
-			_TIFFmemcpy(data,&direntry->tdir_offset,datasize);
-		else
-		{
-			enum TIFFReadDirEntryErr err;
-			uint32 offset = direntry->tdir_offset.toff_long;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabLong(&offset);
-			if( isMapped(tif) )
-				err=TIFFReadDirEntryData(tif,(uint64)offset,(tmsize_t)datasize,data);
-			else
-				err=TIFFReadDirEntryDataAndRealloc(tif,(uint64)offset,(tmsize_t)datasize,&data);
-			if (err!=TIFFReadDirEntryErrOk)
-			{
-				_TIFFfree(data);
-				return(err);
-			}
-		}
-	}
-	else
-	{
-		if (original_datasize_clamped<=8)
-			_TIFFmemcpy(data,&direntry->tdir_offset,datasize);
-		else
-		{
-			enum TIFFReadDirEntryErr err;
-			uint64 offset = direntry->tdir_offset.toff_long8;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabLong8(&offset);
-			if( isMapped(tif) )
-				err=TIFFReadDirEntryData(tif,(uint64)offset,(tmsize_t)datasize,data);
-			else
-				err=TIFFReadDirEntryDataAndRealloc(tif,(uint64)offset,(tmsize_t)datasize,&data);
-			if (err!=TIFFReadDirEntryErrOk)
-			{
-				_TIFFfree(data);
-				return(err);
-			}
-		}
-	}
-	*value=data;
-	return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryArray(TIFF* tif, TIFFDirEntry* direntry, uint32* count, uint32 desttypesize, void** value)
-{
-    return TIFFReadDirEntryArrayWithLimit(tif, direntry, count,
-                                          desttypesize, value, ~((uint64)0));
+        case TIFF_SHORT:
+        {
+            uint16_t m;
+            TIFFReadDirEntryCheckedShort(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeByteShort(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SSHORT:
+        {
+            int16_t m;
+            TIFFReadDirEntryCheckedSshort(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeByteSshort(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG:
+        {
+            uint32_t m;
+            TIFFReadDirEntryCheckedLong(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeByteLong(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG:
+        {
+            int32_t m;
+            TIFFReadDirEntryCheckedSlong(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeByteSlong(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG8:
+        {
+            uint64_t m;
+            err = TIFFReadDirEntryCheckedLong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeByteLong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG8:
+        {
+            int64_t m;
+            err = TIFFReadDirEntryCheckedSlong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeByteSlong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
 }
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryByteArray(TIFF* tif, TIFFDirEntry* direntry, uint8** value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySbyte(TIFF *tif, TIFFDirEntry *direntry, int8_t *value)
 {
-	enum TIFFReadDirEntryErr err;
-	uint32 count;
-	void* origdata;
-	uint8* data;
-	switch (direntry->tdir_type)
-	{
-		case TIFF_ASCII:
-		case TIFF_UNDEFINED:
-		case TIFF_BYTE:
-		case TIFF_SBYTE:
-		case TIFF_SHORT:
-		case TIFF_SSHORT:
-		case TIFF_LONG:
-		case TIFF_SLONG:
-		case TIFF_LONG8:
-		case TIFF_SLONG8:
-			break;
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-	err=TIFFReadDirEntryArray(tif,direntry,&count,1,&origdata);
-	if ((err!=TIFFReadDirEntryErrOk)||(origdata==0))
-	{
-		*value=0;
-		return(err);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_ASCII:
-		case TIFF_UNDEFINED:
-		case TIFF_BYTE:
-			*value=(uint8*)origdata;
-			return(TIFFReadDirEntryErrOk);
-		case TIFF_SBYTE:
-			{
-				int8* m;
-				uint32 n;
-				m=(int8*)origdata;
-				for (n=0; n<count; n++)
-				{
-					err=TIFFReadDirEntryCheckRangeByteSbyte(*m);
-					if (err!=TIFFReadDirEntryErrOk)
-					{
-						_TIFFfree(origdata);
-						return(err);
-					}
-					m++;
-				}
-				*value=(uint8*)origdata;
-				return(TIFFReadDirEntryErrOk);
-			}
-	}
-	data=(uint8*)_TIFFmalloc(count);
-	if (data==0)
-	{
-		_TIFFfree(origdata);
-		return(TIFFReadDirEntryErrAlloc);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_SHORT:
-			{
-				uint16* ma;
-				uint8* mb;
-				uint32 n;
-				ma=(uint16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort(ma);
-					err=TIFFReadDirEntryCheckRangeByteShort(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint8)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SSHORT:
-			{
-				int16* ma;
-				uint8* mb;
-				uint32 n;
-				ma=(int16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort((uint16*)ma);
-					err=TIFFReadDirEntryCheckRangeByteSshort(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint8)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG:
-			{
-				uint32* ma;
-				uint8* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					err=TIFFReadDirEntryCheckRangeByteLong(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint8)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG:
-			{
-				int32* ma;
-				uint8* mb;
-				uint32 n;
-				ma=(int32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong((uint32*)ma);
-					err=TIFFReadDirEntryCheckRangeByteSlong(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint8)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG8:
-			{
-				uint64* ma;
-				uint8* mb;
-				uint32 n;
-				ma=(uint64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8(ma);
-					err=TIFFReadDirEntryCheckRangeByteLong8(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint8)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG8:
-			{
-				int64* ma;
-				uint8* mb;
-				uint32 n;
-				ma=(int64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8((uint64*)ma);
-					err=TIFFReadDirEntryCheckRangeByteSlong8(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint8)(*ma++);
-				}
-			}
-			break;
-	}
-	_TIFFfree(origdata);
-	if (err!=TIFFReadDirEntryErrOk)
-	{
-		_TIFFfree(data);
-		return(err);
-	}
-	*value=data;
-	return(TIFFReadDirEntryErrOk);
-}
+    enum TIFFReadDirEntryErr err;
+    if (direntry->tdir_count != 1)
+        return (TIFFReadDirEntryErrCount);
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        case TIFF_UNDEFINED: /* Support to read TIFF_UNDEFINED with
+                                field_readcount==1 */
+        {
+            uint8_t m;
+            TIFFReadDirEntryCheckedByte(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeSbyteByte(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SBYTE:
+        {
+            TIFFReadDirEntryCheckedSbyte(tif, direntry, value);
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SHORT:
+        {
+            uint16_t m;
+            TIFFReadDirEntryCheckedShort(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeSbyteShort(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SSHORT:
+        {
+            int16_t m;
+            TIFFReadDirEntryCheckedSshort(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeSbyteSshort(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG:
+        {
+            uint32_t m;
+            TIFFReadDirEntryCheckedLong(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeSbyteLong(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG:
+        {
+            int32_t m;
+            TIFFReadDirEntryCheckedSlong(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeSbyteSlong(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG8:
+        {
+            uint64_t m;
+            err = TIFFReadDirEntryCheckedLong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeSbyteLong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG8:
+        {
+            int64_t m;
+            err = TIFFReadDirEntryCheckedSlong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeSbyteSlong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int8_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+} /*-- TIFFReadDirEntrySbyte() --*/
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntrySbyteArray(TIFF* tif, TIFFDirEntry* direntry, int8** value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryShort(TIFF *tif, TIFFDirEntry *direntry, uint16_t *value)
 {
-	enum TIFFReadDirEntryErr err;
-	uint32 count;
-	void* origdata;
-	int8* data;
-	switch (direntry->tdir_type)
-	{
-		case TIFF_UNDEFINED:
-		case TIFF_BYTE:
-		case TIFF_SBYTE:
-		case TIFF_SHORT:
-		case TIFF_SSHORT:
-		case TIFF_LONG:
-		case TIFF_SLONG:
-		case TIFF_LONG8:
-		case TIFF_SLONG8:
-			break;
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-	err=TIFFReadDirEntryArray(tif,direntry,&count,1,&origdata);
-	if ((err!=TIFFReadDirEntryErrOk)||(origdata==0))
-	{
-		*value=0;
-		return(err);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_UNDEFINED:
-		case TIFF_BYTE:
-			{
-				uint8* m;
-				uint32 n;
-				m=(uint8*)origdata;
-				for (n=0; n<count; n++)
-				{
-					err=TIFFReadDirEntryCheckRangeSbyteByte(*m);
-					if (err!=TIFFReadDirEntryErrOk)
-					{
-						_TIFFfree(origdata);
-						return(err);
-					}
-					m++;
-				}
-				*value=(int8*)origdata;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SBYTE:
-			*value=(int8*)origdata;
-			return(TIFFReadDirEntryErrOk);
-	}
-	data=(int8*)_TIFFmalloc(count);
-	if (data==0)
-	{
-		_TIFFfree(origdata);
-		return(TIFFReadDirEntryErrAlloc);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_SHORT:
-			{
-				uint16* ma;
-				int8* mb;
-				uint32 n;
-				ma=(uint16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort(ma);
-					err=TIFFReadDirEntryCheckRangeSbyteShort(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(int8)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SSHORT:
-			{
-				int16* ma;
-				int8* mb;
-				uint32 n;
-				ma=(int16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort((uint16*)ma);
-					err=TIFFReadDirEntryCheckRangeSbyteSshort(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(int8)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG:
-			{
-				uint32* ma;
-				int8* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					err=TIFFReadDirEntryCheckRangeSbyteLong(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(int8)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG:
-			{
-				int32* ma;
-				int8* mb;
-				uint32 n;
-				ma=(int32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong((uint32*)ma);
-					err=TIFFReadDirEntryCheckRangeSbyteSlong(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(int8)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG8:
-			{
-				uint64* ma;
-				int8* mb;
-				uint32 n;
-				ma=(uint64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8(ma);
-					err=TIFFReadDirEntryCheckRangeSbyteLong8(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(int8)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG8:
-			{
-				int64* ma;
-				int8* mb;
-				uint32 n;
-				ma=(int64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8((uint64*)ma);
-					err=TIFFReadDirEntryCheckRangeSbyteSlong8(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(int8)(*ma++);
-				}
-			}
-			break;
-	}
-	_TIFFfree(origdata);
-	if (err!=TIFFReadDirEntryErrOk)
-	{
-		_TIFFfree(data);
-		return(err);
-	}
-	*value=data;
-	return(TIFFReadDirEntryErrOk);
-}
+    enum TIFFReadDirEntryErr err;
+    if (direntry->tdir_count != 1)
+        return (TIFFReadDirEntryErrCount);
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t m;
+            TIFFReadDirEntryCheckedByte(tif, direntry, &m);
+            *value = (uint16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SBYTE:
+        {
+            int8_t m;
+            TIFFReadDirEntryCheckedSbyte(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeShortSbyte(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SHORT:
+            TIFFReadDirEntryCheckedShort(tif, direntry, value);
+            return (TIFFReadDirEntryErrOk);
+        case TIFF_SSHORT:
+        {
+            int16_t m;
+            TIFFReadDirEntryCheckedSshort(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeShortSshort(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG:
+        {
+            uint32_t m;
+            TIFFReadDirEntryCheckedLong(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeShortLong(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG:
+        {
+            int32_t m;
+            TIFFReadDirEntryCheckedSlong(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeShortSlong(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG8:
+        {
+            uint64_t m;
+            err = TIFFReadDirEntryCheckedLong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeShortLong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG8:
+        {
+            int64_t m;
+            err = TIFFReadDirEntryCheckedSlong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeShortSlong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+} /*-- TIFFReadDirEntryShort() --*/
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryShortArray(TIFF* tif, TIFFDirEntry* direntry, uint16** value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySshort(TIFF *tif, TIFFDirEntry *direntry, int16_t *value)
 {
-	enum TIFFReadDirEntryErr err;
-	uint32 count;
-	void* origdata;
-	uint16* data;
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-		case TIFF_SBYTE:
-		case TIFF_SHORT:
-		case TIFF_SSHORT:
-		case TIFF_LONG:
-		case TIFF_SLONG:
-		case TIFF_LONG8:
-		case TIFF_SLONG8:
-			break;
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-	err=TIFFReadDirEntryArray(tif,direntry,&count,2,&origdata);
-	if ((err!=TIFFReadDirEntryErrOk)||(origdata==0))
-	{
-		*value=0;
-		return(err);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_SHORT:
-			*value=(uint16*)origdata;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabArrayOfShort(*value,count);  
-			return(TIFFReadDirEntryErrOk);
-		case TIFF_SSHORT:
-			{
-				int16* m;
-				uint32 n;
-				m=(int16*)origdata;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort((uint16*)m);
-					err=TIFFReadDirEntryCheckRangeShortSshort(*m);
-					if (err!=TIFFReadDirEntryErrOk)
-					{
-						_TIFFfree(origdata);
-						return(err);
-					}
-					m++;
-				}
-				*value=(uint16*)origdata;
-				return(TIFFReadDirEntryErrOk);
-			}
-	}
-	data=(uint16*)_TIFFmalloc(count*2);
-	if (data==0)
-	{
-		_TIFFfree(origdata);
-		return(TIFFReadDirEntryErrAlloc);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8* ma;
-				uint16* mb;
-				uint32 n;
-				ma=(uint8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(uint16)(*ma++);
-			}
-			break;
-		case TIFF_SBYTE:
-			{
-				int8* ma;
-				uint16* mb;
-				uint32 n;
-				ma=(int8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					err=TIFFReadDirEntryCheckRangeShortSbyte(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint16)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG:
-			{
-				uint32* ma;
-				uint16* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					err=TIFFReadDirEntryCheckRangeShortLong(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint16)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG:
-			{
-				int32* ma;
-				uint16* mb;
-				uint32 n;
-				ma=(int32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong((uint32*)ma);
-					err=TIFFReadDirEntryCheckRangeShortSlong(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint16)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG8:
-			{
-				uint64* ma;
-				uint16* mb;
-				uint32 n;
-				ma=(uint64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8(ma);
-					err=TIFFReadDirEntryCheckRangeShortLong8(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint16)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG8:
-			{
-				int64* ma;
-				uint16* mb;
-				uint32 n;
-				ma=(int64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8((uint64*)ma);
-					err=TIFFReadDirEntryCheckRangeShortSlong8(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint16)(*ma++);
-				}
-			}
-			break;
-	}
-	_TIFFfree(origdata);
-	if (err!=TIFFReadDirEntryErrOk)
-	{
-		_TIFFfree(data);
-		return(err);
-	}
-	*value=data;
-	return(TIFFReadDirEntryErrOk);
-}
+    enum TIFFReadDirEntryErr err;
+    if (direntry->tdir_count != 1)
+        return (TIFFReadDirEntryErrCount);
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t m;
+            TIFFReadDirEntryCheckedByte(tif, direntry, &m);
+            *value = (int16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SBYTE:
+        {
+            int8_t m;
+            TIFFReadDirEntryCheckedSbyte(tif, direntry, &m);
+            *value = (int16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SHORT:
+        {
+            uint16_t m;
+            TIFFReadDirEntryCheckedShort(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeSshortShort(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SSHORT:
+            TIFFReadDirEntryCheckedSshort(tif, direntry, value);
+            return (TIFFReadDirEntryErrOk);
+        case TIFF_LONG:
+        {
+            uint32_t m;
+            TIFFReadDirEntryCheckedLong(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeSshortLong(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG:
+        {
+            int32_t m;
+            TIFFReadDirEntryCheckedSlong(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeSshortSlong(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG8:
+        {
+            uint64_t m;
+            err = TIFFReadDirEntryCheckedLong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeSshortLong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG8:
+        {
+            int64_t m;
+            err = TIFFReadDirEntryCheckedSlong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeSshortSlong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int16_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+} /*-- TIFFReadDirEntrySshort() --*/
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntrySshortArray(TIFF* tif, TIFFDirEntry* direntry, int16** value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryLong(TIFF *tif, TIFFDirEntry *direntry, uint32_t *value)
 {
-	enum TIFFReadDirEntryErr err;
-	uint32 count;
-	void* origdata;
-	int16* data;
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-		case TIFF_SBYTE:
-		case TIFF_SHORT:
-		case TIFF_SSHORT:
-		case TIFF_LONG:
-		case TIFF_SLONG:
-		case TIFF_LONG8:
-		case TIFF_SLONG8:
-			break;
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-	err=TIFFReadDirEntryArray(tif,direntry,&count,2,&origdata);
-	if ((err!=TIFFReadDirEntryErrOk)||(origdata==0))
-	{
-		*value=0;
-		return(err);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_SHORT:
-			{
-				uint16* m;
-				uint32 n;
-				m=(uint16*)origdata;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort(m);
-					err=TIFFReadDirEntryCheckRangeSshortShort(*m);
-					if (err!=TIFFReadDirEntryErrOk)
-					{
-						_TIFFfree(origdata);
-						return(err);
-					}
-					m++;
-				}
-				*value=(int16*)origdata;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SSHORT:
-			*value=(int16*)origdata;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabArrayOfShort((uint16*)(*value),count);
-			return(TIFFReadDirEntryErrOk);
-	}
-	data=(int16*)_TIFFmalloc(count*2);
-	if (data==0)
-	{
-		_TIFFfree(origdata);
-		return(TIFFReadDirEntryErrAlloc);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8* ma;
-				int16* mb;
-				uint32 n;
-				ma=(uint8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(int16)(*ma++);
-			}
-			break;
-		case TIFF_SBYTE:
-			{
-				int8* ma;
-				int16* mb;
-				uint32 n;
-				ma=(int8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(int16)(*ma++);
-			}
-			break;
-		case TIFF_LONG:
-			{
-				uint32* ma;
-				int16* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					err=TIFFReadDirEntryCheckRangeSshortLong(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(int16)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG:
-			{
-				int32* ma;
-				int16* mb;
-				uint32 n;
-				ma=(int32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong((uint32*)ma);
-					err=TIFFReadDirEntryCheckRangeSshortSlong(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(int16)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG8:
-			{
-				uint64* ma;
-				int16* mb;
-				uint32 n;
-				ma=(uint64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8(ma);
-					err=TIFFReadDirEntryCheckRangeSshortLong8(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(int16)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG8:
-			{
-				int64* ma;
-				int16* mb;
-				uint32 n;
-				ma=(int64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8((uint64*)ma);
-					err=TIFFReadDirEntryCheckRangeSshortSlong8(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(int16)(*ma++);
-				}
-			}
-			break;
-	}
-	_TIFFfree(origdata);
-	if (err!=TIFFReadDirEntryErrOk)
-	{
-		_TIFFfree(data);
-		return(err);
-	}
-	*value=data;
-	return(TIFFReadDirEntryErrOk);
-}
+    enum TIFFReadDirEntryErr err;
+    if (direntry->tdir_count != 1)
+        return (TIFFReadDirEntryErrCount);
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t m;
+            TIFFReadDirEntryCheckedByte(tif, direntry, &m);
+            *value = (uint32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SBYTE:
+        {
+            int8_t m;
+            TIFFReadDirEntryCheckedSbyte(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeLongSbyte(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SHORT:
+        {
+            uint16_t m;
+            TIFFReadDirEntryCheckedShort(tif, direntry, &m);
+            *value = (uint32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SSHORT:
+        {
+            int16_t m;
+            TIFFReadDirEntryCheckedSshort(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeLongSshort(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG:
+            TIFFReadDirEntryCheckedLong(tif, direntry, value);
+            return (TIFFReadDirEntryErrOk);
+        case TIFF_SLONG:
+        {
+            int32_t m;
+            TIFFReadDirEntryCheckedSlong(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeLongSlong(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG8:
+        {
+            uint64_t m;
+            err = TIFFReadDirEntryCheckedLong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeLongLong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG8:
+        {
+            int64_t m;
+            err = TIFFReadDirEntryCheckedSlong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeLongSlong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+} /*-- TIFFReadDirEntryLong() --*/
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryLongArray(TIFF* tif, TIFFDirEntry* direntry, uint32** value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySlong(TIFF *tif, TIFFDirEntry *direntry, int32_t *value)
 {
-	enum TIFFReadDirEntryErr err;
-	uint32 count;
-	void* origdata;
-	uint32* data;
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-		case TIFF_SBYTE:
-		case TIFF_SHORT:
-		case TIFF_SSHORT:
-		case TIFF_LONG:
-		case TIFF_SLONG:
-		case TIFF_LONG8:
-		case TIFF_SLONG8:
-			break;
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-	err=TIFFReadDirEntryArray(tif,direntry,&count,4,&origdata);
-	if ((err!=TIFFReadDirEntryErrOk)||(origdata==0))
-	{
-		*value=0;
-		return(err);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_LONG:
-			*value=(uint32*)origdata;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabArrayOfLong(*value,count);
-			return(TIFFReadDirEntryErrOk);
-		case TIFF_SLONG:
-			{
-				int32* m;
-				uint32 n;
-				m=(int32*)origdata;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong((uint32*)m);
-					err=TIFFReadDirEntryCheckRangeLongSlong(*m);
-					if (err!=TIFFReadDirEntryErrOk)
-					{
-						_TIFFfree(origdata);
-						return(err);
-					}
-					m++;
-				}
-				*value=(uint32*)origdata;
-				return(TIFFReadDirEntryErrOk);
-			}
-	}
-	data=(uint32*)_TIFFmalloc(count*4);
-	if (data==0)
-	{
-		_TIFFfree(origdata);
-		return(TIFFReadDirEntryErrAlloc);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8* ma;
-				uint32* mb;
-				uint32 n;
-				ma=(uint8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(uint32)(*ma++);
-			}
-			break;
-		case TIFF_SBYTE:
-			{
-				int8* ma;
-				uint32* mb;
-				uint32 n;
-				ma=(int8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					err=TIFFReadDirEntryCheckRangeLongSbyte(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint32)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SHORT:
-			{
-				uint16* ma;
-				uint32* mb;
-				uint32 n;
-				ma=(uint16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort(ma);
-					*mb++=(uint32)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SSHORT:
-			{
-				int16* ma;
-				uint32* mb;
-				uint32 n;
-				ma=(int16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort((uint16*)ma);
-					err=TIFFReadDirEntryCheckRangeLongSshort(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint32)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG8:
-			{
-				uint64* ma;
-				uint32* mb;
-				uint32 n;
-				ma=(uint64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8(ma);
-					err=TIFFReadDirEntryCheckRangeLongLong8(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint32)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG8:
-			{
-				int64* ma;
-				uint32* mb;
-				uint32 n;
-				ma=(int64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8((uint64*)ma);
-					err=TIFFReadDirEntryCheckRangeLongSlong8(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint32)(*ma++);
-				}
-			}
-			break;
-	}
-	_TIFFfree(origdata);
-	if (err!=TIFFReadDirEntryErrOk)
-	{
-		_TIFFfree(data);
-		return(err);
-	}
-	*value=data;
-	return(TIFFReadDirEntryErrOk);
-}
+    enum TIFFReadDirEntryErr err;
+    if (direntry->tdir_count != 1)
+        return (TIFFReadDirEntryErrCount);
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t m;
+            TIFFReadDirEntryCheckedByte(tif, direntry, &m);
+            *value = (int32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SBYTE:
+        {
+            int8_t m;
+            TIFFReadDirEntryCheckedSbyte(tif, direntry, &m);
+            *value = (int32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SHORT:
+        {
+            uint16_t m;
+            TIFFReadDirEntryCheckedShort(tif, direntry, &m);
+            *value = (int32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SSHORT:
+        {
+            int16_t m;
+            TIFFReadDirEntryCheckedSshort(tif, direntry, &m);
+            *value = (int32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG:
+        {
+            uint32_t m;
+            TIFFReadDirEntryCheckedLong(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeSlongLong(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG:
+            TIFFReadDirEntryCheckedSlong(tif, direntry, value);
+            return (TIFFReadDirEntryErrOk);
+        case TIFF_LONG8:
+        {
+            uint64_t m;
+            err = TIFFReadDirEntryCheckedLong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeSlongLong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG8:
+        {
+            int64_t m;
+            err = TIFFReadDirEntryCheckedSlong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeSlongSlong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int32_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+} /*-- TIFFReadDirEntrySlong() --*/
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntrySlongArray(TIFF* tif, TIFFDirEntry* direntry, int32** value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryLong8(TIFF *tif, TIFFDirEntry *direntry, uint64_t *value)
 {
-	enum TIFFReadDirEntryErr err;
-	uint32 count;
-	void* origdata;
-	int32* data;
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-		case TIFF_SBYTE:
-		case TIFF_SHORT:
-		case TIFF_SSHORT:
-		case TIFF_LONG:
-		case TIFF_SLONG:
-		case TIFF_LONG8:
-		case TIFF_SLONG8:
-			break;
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-	err=TIFFReadDirEntryArray(tif,direntry,&count,4,&origdata);
-	if ((err!=TIFFReadDirEntryErrOk)||(origdata==0))
-	{
-		*value=0;
-		return(err);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_LONG:
-			{
-				uint32* m;
-				uint32 n;
-				m=(uint32*)origdata;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong((uint32*)m);
-					err=TIFFReadDirEntryCheckRangeSlongLong(*m);
-					if (err!=TIFFReadDirEntryErrOk)
-					{
-						_TIFFfree(origdata);
-						return(err);
-					}
-					m++;
-				}
-				*value=(int32*)origdata;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SLONG:
-			*value=(int32*)origdata;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabArrayOfLong((uint32*)(*value),count);
-			return(TIFFReadDirEntryErrOk);
-	}
-	data=(int32*)_TIFFmalloc(count*4);
-	if (data==0)
-	{
-		_TIFFfree(origdata);
-		return(TIFFReadDirEntryErrAlloc);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8* ma;
-				int32* mb;
-				uint32 n;
-				ma=(uint8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(int32)(*ma++);
-			}
-			break;
-		case TIFF_SBYTE:
-			{
-				int8* ma;
-				int32* mb;
-				uint32 n;
-				ma=(int8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(int32)(*ma++);
-			}
-			break;
-		case TIFF_SHORT:
-			{
-				uint16* ma;
-				int32* mb;
-				uint32 n;
-				ma=(uint16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort(ma);
-					*mb++=(int32)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SSHORT:
-			{
-				int16* ma;
-				int32* mb;
-				uint32 n;
-				ma=(int16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort((uint16*)ma);
-					*mb++=(int32)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG8:
-			{
-				uint64* ma;
-				int32* mb;
-				uint32 n;
-				ma=(uint64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8(ma);
-					err=TIFFReadDirEntryCheckRangeSlongLong8(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(int32)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG8:
-			{
-				int64* ma;
-				int32* mb;
-				uint32 n;
-				ma=(int64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8((uint64*)ma);
-					err=TIFFReadDirEntryCheckRangeSlongSlong8(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(int32)(*ma++);
-				}
-			}
-			break;
-	}
-	_TIFFfree(origdata);
-	if (err!=TIFFReadDirEntryErrOk)
-	{
-		_TIFFfree(data);
-		return(err);
-	}
-	*value=data;
-	return(TIFFReadDirEntryErrOk);
-}
+    enum TIFFReadDirEntryErr err;
+    if (direntry->tdir_count != 1)
+        return (TIFFReadDirEntryErrCount);
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t m;
+            TIFFReadDirEntryCheckedByte(tif, direntry, &m);
+            *value = (uint64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SBYTE:
+        {
+            int8_t m;
+            TIFFReadDirEntryCheckedSbyte(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeLong8Sbyte(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SHORT:
+        {
+            uint16_t m;
+            TIFFReadDirEntryCheckedShort(tif, direntry, &m);
+            *value = (uint64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SSHORT:
+        {
+            int16_t m;
+            TIFFReadDirEntryCheckedSshort(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeLong8Sshort(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG:
+        {
+            uint32_t m;
+            TIFFReadDirEntryCheckedLong(tif, direntry, &m);
+            *value = (uint64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG:
+        {
+            int32_t m;
+            TIFFReadDirEntryCheckedSlong(tif, direntry, &m);
+            err = TIFFReadDirEntryCheckRangeLong8Slong(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG8:
+            err = TIFFReadDirEntryCheckedLong8(tif, direntry, value);
+            return (err);
+        case TIFF_SLONG8:
+        {
+            int64_t m;
+            err = TIFFReadDirEntryCheckedSlong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeLong8Slong8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (uint64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+} /*-- TIFFReadDirEntryLong8() --*/
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryLong8ArrayWithLimit(
-        TIFF* tif, TIFFDirEntry* direntry, uint64** value, uint64 maxcount)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySlong8(TIFF *tif, TIFFDirEntry *direntry, int64_t *value)
 {
-	enum TIFFReadDirEntryErr err;
-	uint32 count;
-	void* origdata;
-	uint64* data;
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-		case TIFF_SBYTE:
-		case TIFF_SHORT:
-		case TIFF_SSHORT:
-		case TIFF_LONG:
-		case TIFF_SLONG:
-		case TIFF_LONG8:
-		case TIFF_SLONG8:
-			break;
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-	err=TIFFReadDirEntryArrayWithLimit(tif,direntry,&count,8,&origdata,maxcount);
-	if ((err!=TIFFReadDirEntryErrOk)||(origdata==0))
-	{
-		*value=0;
-		return(err);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_LONG8:
-			*value=(uint64*)origdata;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabArrayOfLong8(*value,count);
-			return(TIFFReadDirEntryErrOk);
-		case TIFF_SLONG8:
-			{
-				int64* m;
-				uint32 n;
-				m=(int64*)origdata;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8((uint64*)m);
-					err=TIFFReadDirEntryCheckRangeLong8Slong8(*m);
-					if (err!=TIFFReadDirEntryErrOk)
-					{
-						_TIFFfree(origdata);
-						return(err);
-					}
-					m++;
-				}
-				*value=(uint64*)origdata;
-				return(TIFFReadDirEntryErrOk);
-			}
-	}
-	data=(uint64*)_TIFFmalloc(count*8);
-	if (data==0)
-	{
-		_TIFFfree(origdata);
-		return(TIFFReadDirEntryErrAlloc);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8* ma;
-				uint64* mb;
-				uint32 n;
-				ma=(uint8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(uint64)(*ma++);
-			}
-			break;
-		case TIFF_SBYTE:
-			{
-				int8* ma;
-				uint64* mb;
-				uint32 n;
-				ma=(int8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					err=TIFFReadDirEntryCheckRangeLong8Sbyte(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint64)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SHORT:
-			{
-				uint16* ma;
-				uint64* mb;
-				uint32 n;
-				ma=(uint16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort(ma);
-					*mb++=(uint64)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SSHORT:
-			{
-				int16* ma;
-				uint64* mb;
-				uint32 n;
-				ma=(int16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort((uint16*)ma);
-					err=TIFFReadDirEntryCheckRangeLong8Sshort(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint64)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG:
-			{
-				uint32* ma;
-				uint64* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					*mb++=(uint64)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG:
-			{
-				int32* ma;
-				uint64* mb;
-				uint32 n;
-				ma=(int32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong((uint32*)ma);
-					err=TIFFReadDirEntryCheckRangeLong8Slong(*ma);
-					if (err!=TIFFReadDirEntryErrOk)
-						break;
-					*mb++=(uint64)(*ma++);
-				}
-			}
-			break;
-	}
-	_TIFFfree(origdata);
-	if (err!=TIFFReadDirEntryErrOk)
-	{
-		_TIFFfree(data);
-		return(err);
-	}
-	*value=data;
-	return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryLong8Array(TIFF* tif, TIFFDirEntry* direntry, uint64** value)
-{
-    return TIFFReadDirEntryLong8ArrayWithLimit(tif, direntry, value, ~((uint64)0));
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntrySlong8Array(TIFF* tif, TIFFDirEntry* direntry, int64** value)
-{
-	enum TIFFReadDirEntryErr err;
-	uint32 count;
-	void* origdata;
-	int64* data;
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-		case TIFF_SBYTE:
-		case TIFF_SHORT:
-		case TIFF_SSHORT:
-		case TIFF_LONG:
-		case TIFF_SLONG:
-		case TIFF_LONG8:
-		case TIFF_SLONG8:
-			break;
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-	err=TIFFReadDirEntryArray(tif,direntry,&count,8,&origdata);
-	if ((err!=TIFFReadDirEntryErrOk)||(origdata==0))
-	{
-		*value=0;
-		return(err);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_LONG8:
-			{
-				uint64* m;
-				uint32 n;
-				m=(uint64*)origdata;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8(m);
-					err=TIFFReadDirEntryCheckRangeSlong8Long8(*m);
-					if (err!=TIFFReadDirEntryErrOk)
-					{
-						_TIFFfree(origdata);
-						return(err);
-					}
-					m++;
-				}
-				*value=(int64*)origdata;
-				return(TIFFReadDirEntryErrOk);
-			}
-		case TIFF_SLONG8:
-			*value=(int64*)origdata;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabArrayOfLong8((uint64*)(*value),count);
-			return(TIFFReadDirEntryErrOk);
-	}
-	data=(int64*)_TIFFmalloc(count*8);
-	if (data==0)
-	{
-		_TIFFfree(origdata);
-		return(TIFFReadDirEntryErrAlloc);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8* ma;
-				int64* mb;
-				uint32 n;
-				ma=(uint8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(int64)(*ma++);
-			}
-			break;
-		case TIFF_SBYTE:
-			{
-				int8* ma;
-				int64* mb;
-				uint32 n;
-				ma=(int8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(int64)(*ma++);
-			}
-			break;
-		case TIFF_SHORT:
-			{
-				uint16* ma;
-				int64* mb;
-				uint32 n;
-				ma=(uint16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort(ma);
-					*mb++=(int64)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SSHORT:
-			{
-				int16* ma;
-				int64* mb;
-				uint32 n;
-				ma=(int16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort((uint16*)ma);
-					*mb++=(int64)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG:
-			{
-				uint32* ma;
-				int64* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					*mb++=(int64)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG:
-			{
-				int32* ma;
-				int64* mb;
-				uint32 n;
-				ma=(int32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong((uint32*)ma);
-					*mb++=(int64)(*ma++);
-				}
-			}
-			break;
-	}
-	_TIFFfree(origdata);
-	*value=data;
-	return(TIFFReadDirEntryErrOk);
-}
+    enum TIFFReadDirEntryErr err;
+    if (direntry->tdir_count != 1)
+        return (TIFFReadDirEntryErrCount);
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t m;
+            TIFFReadDirEntryCheckedByte(tif, direntry, &m);
+            *value = (int64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SBYTE:
+        {
+            int8_t m;
+            TIFFReadDirEntryCheckedSbyte(tif, direntry, &m);
+            *value = (int64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SHORT:
+        {
+            uint16_t m;
+            TIFFReadDirEntryCheckedShort(tif, direntry, &m);
+            *value = (int64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SSHORT:
+        {
+            int16_t m;
+            TIFFReadDirEntryCheckedSshort(tif, direntry, &m);
+            *value = (int64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG:
+        {
+            uint32_t m;
+            TIFFReadDirEntryCheckedLong(tif, direntry, &m);
+            *value = (int64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG:
+        {
+            int32_t m;
+            TIFFReadDirEntryCheckedSlong(tif, direntry, &m);
+            *value = (int64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG8:
+        {
+            uint64_t m;
+            err = TIFFReadDirEntryCheckedLong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            err = TIFFReadDirEntryCheckRangeSlong8Long8(m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (int64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG8:
+            err = TIFFReadDirEntryCheckedSlong8(tif, direntry, value);
+            return (err);
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+} /*-- TIFFReadDirEntrySlong8() --*/
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryFloatArray(TIFF* tif, TIFFDirEntry* direntry, float** value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryFloat(TIFF *tif, TIFFDirEntry *direntry, float *value)
 {
-	enum TIFFReadDirEntryErr err;
-	uint32 count;
-	void* origdata;
-	float* data;
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-		case TIFF_SBYTE:
-		case TIFF_SHORT:
-		case TIFF_SSHORT:
-		case TIFF_LONG:
-		case TIFF_SLONG:
-		case TIFF_LONG8:
-		case TIFF_SLONG8:
-		case TIFF_RATIONAL:
-		case TIFF_SRATIONAL:
-		case TIFF_FLOAT:
-		case TIFF_DOUBLE:
-			break;
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-	err=TIFFReadDirEntryArray(tif,direntry,&count,4,&origdata);
-	if ((err!=TIFFReadDirEntryErrOk)||(origdata==0))
-	{
-		*value=0;
-		return(err);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_FLOAT:
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabArrayOfLong((uint32*)origdata,count);  
-			TIFFCvtIEEEDoubleToNative(tif,count,(float*)origdata);
-			*value=(float*)origdata;
-			return(TIFFReadDirEntryErrOk);
-	}
-	data=(float*)_TIFFmalloc(count*sizeof(float));
-	if (data==0)
-	{
-		_TIFFfree(origdata);
-		return(TIFFReadDirEntryErrAlloc);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8* ma;
-				float* mb;
-				uint32 n;
-				ma=(uint8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(float)(*ma++);
-			}
-			break;
-		case TIFF_SBYTE:
-			{
-				int8* ma;
-				float* mb;
-				uint32 n;
-				ma=(int8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(float)(*ma++);
-			}
-			break;
-		case TIFF_SHORT:
-			{
-				uint16* ma;
-				float* mb;
-				uint32 n;
-				ma=(uint16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort(ma);
-					*mb++=(float)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SSHORT:
-			{
-				int16* ma;
-				float* mb;
-				uint32 n;
-				ma=(int16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort((uint16*)ma);
-					*mb++=(float)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG:
-			{
-				uint32* ma;
-				float* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					*mb++=(float)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG:
-			{
-				int32* ma;
-				float* mb;
-				uint32 n;
-				ma=(int32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong((uint32*)ma);
-					*mb++=(float)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG8:
-			{
-				uint64* ma;
-				float* mb;
-				uint32 n;
-				ma=(uint64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8(ma);
+    enum TIFFReadDirEntryErr err;
+    if (direntry->tdir_count != 1)
+        return (TIFFReadDirEntryErrCount);
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t m;
+            TIFFReadDirEntryCheckedByte(tif, direntry, &m);
+            *value = (float)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SBYTE:
+        {
+            int8_t m;
+            TIFFReadDirEntryCheckedSbyte(tif, direntry, &m);
+            *value = (float)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SHORT:
+        {
+            uint16_t m;
+            TIFFReadDirEntryCheckedShort(tif, direntry, &m);
+            *value = (float)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SSHORT:
+        {
+            int16_t m;
+            TIFFReadDirEntryCheckedSshort(tif, direntry, &m);
+            *value = (float)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG:
+        {
+            uint32_t m;
+            TIFFReadDirEntryCheckedLong(tif, direntry, &m);
+            *value = (float)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG:
+        {
+            int32_t m;
+            TIFFReadDirEntryCheckedSlong(tif, direntry, &m);
+            *value = (float)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG8:
+        {
+            uint64_t m;
+            err = TIFFReadDirEntryCheckedLong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
 #if defined(__WIN32__) && (_MSC_VER < 1500)
-					/*
-					 * XXX: MSVC 6.0 does not support
-					 * conversion of 64-bit integers into
-					 * floating point values.
-					 */
-					*mb++ = _TIFFUInt64ToFloat(*ma++);
+            /*
+             * XXX: MSVC 6.0 does not support conversion
+             * of 64-bit integers into floating point
+             * values.
+             */
+            *value = _TIFFUInt64ToFloat(m);
 #else
-					*mb++ = (float)(*ma++);
+            *value = (float)m;
 #endif
-				}
-			}
-			break;
-		case TIFF_SLONG8:
-			{
-				int64* ma;
-				float* mb;
-				uint32 n;
-				ma=(int64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8((uint64*)ma);
-					*mb++=(float)(*ma++);
-				}
-			}
-			break;
-		case TIFF_RATIONAL:
-			{
-				uint32* ma;
-				uint32 maa;
-				uint32 mab;
-				float* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					maa=*ma++;
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					mab=*ma++;
-					if (mab==0)
-						*mb++=0.0;
-					else
-						*mb++=(float)maa/(float)mab;
-				}
-			}
-			break;
-		case TIFF_SRATIONAL:
-			{
-				uint32* ma;
-				int32 maa;
-				uint32 mab;
-				float* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					maa=*(int32*)ma;
-					ma++;
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					mab=*ma++;
-					if (mab==0)
-						*mb++=0.0;
-					else
-						*mb++=(float)maa/(float)mab;
-				}
-			}
-			break;
-		case TIFF_DOUBLE:
-			{
-				double* ma;
-				float* mb;
-				uint32 n;
-				if (tif->tif_flags&TIFF_SWAB)
-					TIFFSwabArrayOfLong8((uint64*)origdata,count);
-				TIFFCvtIEEEDoubleToNative(tif,count,(double*)origdata);
-				ma=(double*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-                                {
-                                    double val = *ma++;
-                                    if( val > FLT_MAX )
-                                        val = FLT_MAX;
-                                    else if( val < -FLT_MAX )
-                                        val = -FLT_MAX;
-                                    *mb++=(float)val;
-                                }
-			}
-			break;
-	}
-	_TIFFfree(origdata);
-	*value=data;
-	return(TIFFReadDirEntryErrOk);
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG8:
+        {
+            int64_t m;
+            err = TIFFReadDirEntryCheckedSlong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (float)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_RATIONAL:
+        {
+            double m;
+            err = TIFFReadDirEntryCheckedRational(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (float)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SRATIONAL:
+        {
+            double m;
+            err = TIFFReadDirEntryCheckedSrational(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (float)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_FLOAT:
+            TIFFReadDirEntryCheckedFloat(tif, direntry, value);
+            return (TIFFReadDirEntryErrOk);
+        case TIFF_DOUBLE:
+        {
+            double m;
+            err = TIFFReadDirEntryCheckedDouble(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            if ((m > FLT_MAX) || (m < -FLT_MAX))
+                return (TIFFReadDirEntryErrRange);
+            *value = (float)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
 }
 
 static enum TIFFReadDirEntryErr
-TIFFReadDirEntryDoubleArray(TIFF* tif, TIFFDirEntry* direntry, double** value)
+TIFFReadDirEntryDouble(TIFF *tif, TIFFDirEntry *direntry, double *value)
 {
-	enum TIFFReadDirEntryErr err;
-	uint32 count;
-	void* origdata;
-	double* data;
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-		case TIFF_SBYTE:
-		case TIFF_SHORT:
-		case TIFF_SSHORT:
-		case TIFF_LONG:
-		case TIFF_SLONG:
-		case TIFF_LONG8:
-		case TIFF_SLONG8:
-		case TIFF_RATIONAL:
-		case TIFF_SRATIONAL:
-		case TIFF_FLOAT:
-		case TIFF_DOUBLE:
-			break;
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-	err=TIFFReadDirEntryArray(tif,direntry,&count,8,&origdata);
-	if ((err!=TIFFReadDirEntryErrOk)||(origdata==0))
-	{
-		*value=0;
-		return(err);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_DOUBLE:
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabArrayOfLong8((uint64*)origdata,count);
-			TIFFCvtIEEEDoubleToNative(tif,count,(double*)origdata);
-			*value=(double*)origdata;
-			return(TIFFReadDirEntryErrOk);
-	}
-	data=(double*)_TIFFmalloc(count*sizeof(double));
-	if (data==0)
-	{
-		_TIFFfree(origdata);
-		return(TIFFReadDirEntryErrAlloc);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_BYTE:
-			{
-				uint8* ma;
-				double* mb;
-				uint32 n;
-				ma=(uint8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(double)(*ma++);
-			}
-			break;
-		case TIFF_SBYTE:
-			{
-				int8* ma;
-				double* mb;
-				uint32 n;
-				ma=(int8*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(double)(*ma++);
-			}
-			break;
-		case TIFF_SHORT:
-			{
-				uint16* ma;
-				double* mb;
-				uint32 n;
-				ma=(uint16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort(ma);
-					*mb++=(double)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SSHORT:
-			{
-				int16* ma;
-				double* mb;
-				uint32 n;
-				ma=(int16*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabShort((uint16*)ma);
-					*mb++=(double)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG:
-			{
-				uint32* ma;
-				double* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					*mb++=(double)(*ma++);
-				}
-			}
-			break;
-		case TIFF_SLONG:
-			{
-				int32* ma;
-				double* mb;
-				uint32 n;
-				ma=(int32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong((uint32*)ma);
-					*mb++=(double)(*ma++);
-				}
-			}
-			break;
-		case TIFF_LONG8:
-			{
-				uint64* ma;
-				double* mb;
-				uint32 n;
-				ma=(uint64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8(ma);
+    enum TIFFReadDirEntryErr err;
+    if (direntry->tdir_count != 1)
+        return (TIFFReadDirEntryErrCount);
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t m;
+            TIFFReadDirEntryCheckedByte(tif, direntry, &m);
+            *value = (double)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SBYTE:
+        {
+            int8_t m;
+            TIFFReadDirEntryCheckedSbyte(tif, direntry, &m);
+            *value = (double)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SHORT:
+        {
+            uint16_t m;
+            TIFFReadDirEntryCheckedShort(tif, direntry, &m);
+            *value = (double)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SSHORT:
+        {
+            int16_t m;
+            TIFFReadDirEntryCheckedSshort(tif, direntry, &m);
+            *value = (double)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG:
+        {
+            uint32_t m;
+            TIFFReadDirEntryCheckedLong(tif, direntry, &m);
+            *value = (double)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG:
+        {
+            int32_t m;
+            TIFFReadDirEntryCheckedSlong(tif, direntry, &m);
+            *value = (double)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG8:
+        {
+            uint64_t m;
+            err = TIFFReadDirEntryCheckedLong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
 #if defined(__WIN32__) && (_MSC_VER < 1500)
-					/*
-					 * XXX: MSVC 6.0 does not support
-					 * conversion of 64-bit integers into
-					 * floating point values.
-					 */
-					*mb++ = _TIFFUInt64ToDouble(*ma++);
+            /*
+             * XXX: MSVC 6.0 does not support conversion
+             * of 64-bit integers into floating point
+             * values.
+             */
+            *value = _TIFFUInt64ToDouble(m);
 #else
-					*mb++ = (double)(*ma++);
+            *value = (double)m;
 #endif
-				}
-			}
-			break;
-		case TIFF_SLONG8:
-			{
-				int64* ma;
-				double* mb;
-				uint32 n;
-				ma=(int64*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong8((uint64*)ma);
-					*mb++=(double)(*ma++);
-				}
-			}
-			break;
-		case TIFF_RATIONAL:
-			{
-				uint32* ma;
-				uint32 maa;
-				uint32 mab;
-				double* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					maa=*ma++;
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					mab=*ma++;
-					if (mab==0)
-						*mb++=0.0;
-					else
-						*mb++=(double)maa/(double)mab;
-				}
-			}
-			break;
-		case TIFF_SRATIONAL:
-			{
-				uint32* ma;
-				int32 maa;
-				uint32 mab;
-				double* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					maa=*(int32*)ma;
-					ma++;
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					mab=*ma++;
-					if (mab==0)
-						*mb++=0.0;
-					else
-						*mb++=(double)maa/(double)mab;
-				}
-			}
-			break;
-		case TIFF_FLOAT:
-			{
-				float* ma;
-				double* mb;
-				uint32 n;
-				if (tif->tif_flags&TIFF_SWAB)
-					TIFFSwabArrayOfLong((uint32*)origdata,count);  
-				TIFFCvtIEEEFloatToNative(tif,count,(float*)origdata);
-				ma=(float*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-					*mb++=(double)(*ma++);
-			}
-			break;
-	}
-	_TIFFfree(origdata);
-	*value=data;
-	return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryIfd8Array(TIFF* tif, TIFFDirEntry* direntry, uint64** value)
-{
-	enum TIFFReadDirEntryErr err;
-	uint32 count;
-	void* origdata;
-	uint64* data;
-	switch (direntry->tdir_type)
-	{
-		case TIFF_LONG:
-		case TIFF_LONG8:
-		case TIFF_IFD:
-		case TIFF_IFD8:
-			break;
-		default:
-			return(TIFFReadDirEntryErrType);
-	}
-	err=TIFFReadDirEntryArray(tif,direntry,&count,8,&origdata);
-	if ((err!=TIFFReadDirEntryErrOk)||(origdata==0))
-	{
-		*value=0;
-		return(err);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_LONG8:
-		case TIFF_IFD8:
-			*value=(uint64*)origdata;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabArrayOfLong8(*value,count);
-			return(TIFFReadDirEntryErrOk);
-	}
-	data=(uint64*)_TIFFmalloc(count*8);
-	if (data==0)
-	{
-		_TIFFfree(origdata);
-		return(TIFFReadDirEntryErrAlloc);
-	}
-	switch (direntry->tdir_type)
-	{
-		case TIFF_LONG:
-		case TIFF_IFD:
-			{
-				uint32* ma;
-				uint64* mb;
-				uint32 n;
-				ma=(uint32*)origdata;
-				mb=data;
-				for (n=0; n<count; n++)
-				{
-					if (tif->tif_flags&TIFF_SWAB)
-						TIFFSwabLong(ma);
-					*mb++=(uint64)(*ma++);
-				}
-			}
-			break;
-	}
-	_TIFFfree(origdata);
-	*value=data;
-	return(TIFFReadDirEntryErrOk);
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG8:
+        {
+            int64_t m;
+            err = TIFFReadDirEntryCheckedSlong8(tif, direntry, &m);
+            if (err != TIFFReadDirEntryErrOk)
+                return (err);
+            *value = (double)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_RATIONAL:
+            err = TIFFReadDirEntryCheckedRational(tif, direntry, value);
+            return (err);
+        case TIFF_SRATIONAL:
+            err = TIFFReadDirEntryCheckedSrational(tif, direntry, value);
+            return (err);
+        case TIFF_FLOAT:
+        {
+            float m;
+            TIFFReadDirEntryCheckedFloat(tif, direntry, &m);
+            *value = (double)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_DOUBLE:
+            err = TIFFReadDirEntryCheckedDouble(tif, direntry, value);
+            return (err);
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
 }
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryPersampleShort(TIFF* tif, TIFFDirEntry* direntry, uint16* value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryIfd8(TIFF *tif, TIFFDirEntry *direntry, uint64_t *value)
 {
-	enum TIFFReadDirEntryErr err;
-	uint16* m;
-	uint16* na;
-	uint16 nb;
-	if (direntry->tdir_count<(uint64)tif->tif_dir.td_samplesperpixel)
-		return(TIFFReadDirEntryErrCount);
-	err=TIFFReadDirEntryShortArray(tif,direntry,&m);
-	if (err!=TIFFReadDirEntryErrOk || m == NULL)
-		return(err);
-	na=m;
-	nb=tif->tif_dir.td_samplesperpixel;
-	*value=*na++;
-	nb--;
-	while (nb>0)
-	{
-		if (*na++!=*value)
-		{
-			err=TIFFReadDirEntryErrPsdif;
-			break;
-		}
-		nb--;
-	}
-	_TIFFfree(m);
-	return(err);
+    enum TIFFReadDirEntryErr err;
+    if (direntry->tdir_count != 1)
+        return (TIFFReadDirEntryErrCount);
+    switch (direntry->tdir_type)
+    {
+        case TIFF_LONG:
+        case TIFF_IFD:
+        {
+            uint32_t m;
+            TIFFReadDirEntryCheckedLong(tif, direntry, &m);
+            *value = (uint64_t)m;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_LONG8:
+        case TIFF_IFD8:
+            err = TIFFReadDirEntryCheckedLong8(tif, direntry, value);
+            return (err);
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
 }
 
-#if 0
-static enum TIFFReadDirEntryErr TIFFReadDirEntryPersampleDouble(TIFF* tif, TIFFDirEntry* direntry, double* value)
+#define INITIAL_THRESHOLD (1024 * 1024)
+#define THRESHOLD_MULTIPLIER 10
+#define MAX_THRESHOLD                                                          \
+    (THRESHOLD_MULTIPLIER * THRESHOLD_MULTIPLIER * THRESHOLD_MULTIPLIER *      \
+     INITIAL_THRESHOLD)
+
+static enum TIFFReadDirEntryErr TIFFReadDirEntryDataAndRealloc(TIFF *tif,
+                                                               uint64_t offset,
+                                                               tmsize_t size,
+                                                               void **pdest)
 {
-	enum TIFFReadDirEntryErr err;
-	double* m;
-	double* na;
-	uint16 nb;
-	if (direntry->tdir_count<(uint64)tif->tif_dir.td_samplesperpixel)
-		return(TIFFReadDirEntryErrCount);
-	err=TIFFReadDirEntryDoubleArray(tif,direntry,&m);
-	if (err!=TIFFReadDirEntryErrOk)
-		return(err);
-	na=m;
-	nb=tif->tif_dir.td_samplesperpixel;
-	*value=*na++;
-	nb--;
-	while (nb>0)
-	{
-		if (*na++!=*value)
-		{
-			err=TIFFReadDirEntryErrPsdif;
-			break;
-		}
-		nb--;
-	}
-	_TIFFfree(m);
-	return(err);
-}
+#if SIZEOF_SIZE_T == 8
+    tmsize_t threshold = INITIAL_THRESHOLD;
 #endif
+    tmsize_t already_read = 0;
 
-static void TIFFReadDirEntryCheckedByte(TIFF* tif, TIFFDirEntry* direntry, uint8* value)
-{
-	(void) tif;
-	*value=*(uint8*)(&direntry->tdir_offset);
-}
-
-static void TIFFReadDirEntryCheckedSbyte(TIFF* tif, TIFFDirEntry* direntry, int8* value)
-{
-	(void) tif;
-	*value=*(int8*)(&direntry->tdir_offset);
-}
-
-static void TIFFReadDirEntryCheckedShort(TIFF* tif, TIFFDirEntry* direntry, uint16* value)
-{
-	*value = direntry->tdir_offset.toff_short;
-	/* *value=*(uint16*)(&direntry->tdir_offset); */
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabShort(value);
-}
-
-static void TIFFReadDirEntryCheckedSshort(TIFF* tif, TIFFDirEntry* direntry, int16* value)
-{
-	*value=*(int16*)(&direntry->tdir_offset);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabShort((uint16*)value);
-}
-
-static void TIFFReadDirEntryCheckedLong(TIFF* tif, TIFFDirEntry* direntry, uint32* value)
-{
-	*value=*(uint32*)(&direntry->tdir_offset);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabLong(value);
-}
-
-static void TIFFReadDirEntryCheckedSlong(TIFF* tif, TIFFDirEntry* direntry, int32* value)
-{
-	*value=*(int32*)(&direntry->tdir_offset);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabLong((uint32*)value);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckedLong8(TIFF* tif, TIFFDirEntry* direntry, uint64* value)
-{
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-	{
-		enum TIFFReadDirEntryErr err;
-		uint32 offset = direntry->tdir_offset.toff_long;
-		if (tif->tif_flags&TIFF_SWAB)
-			TIFFSwabLong(&offset);
-		err=TIFFReadDirEntryData(tif,offset,8,value);
-		if (err!=TIFFReadDirEntryErrOk)
-			return(err);
-	}
-	else
-		*value = direntry->tdir_offset.toff_long8;
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabLong8(value);
-	return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckedSlong8(TIFF* tif, TIFFDirEntry* direntry, int64* value)
-{
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-	{
-		enum TIFFReadDirEntryErr err;
-		uint32 offset = direntry->tdir_offset.toff_long;
-		if (tif->tif_flags&TIFF_SWAB)
-			TIFFSwabLong(&offset);
-		err=TIFFReadDirEntryData(tif,offset,8,value);
-		if (err!=TIFFReadDirEntryErrOk)
-			return(err);
-	}
-	else
-		*value=*(int64*)(&direntry->tdir_offset);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabLong8((uint64*)value);
-	return(TIFFReadDirEntryErrOk);
-}
+    assert(!isMapped(tif));
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckedRational(TIFF* tif, TIFFDirEntry* direntry, double* value)
-{
-	UInt64Aligned_t m;
-
-	assert(sizeof(double)==8);
-	assert(sizeof(uint64)==8);
-	assert(sizeof(uint32)==4);
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-	{
-		enum TIFFReadDirEntryErr err;
-		uint32 offset = direntry->tdir_offset.toff_long;
-		if (tif->tif_flags&TIFF_SWAB)
-			TIFFSwabLong(&offset);
-		err=TIFFReadDirEntryData(tif,offset,8,m.i);
-		if (err!=TIFFReadDirEntryErrOk)
-			return(err);
-	}
-	else
-		m.l = direntry->tdir_offset.toff_long8;
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfLong(m.i,2);
-        /* Not completely sure what we should do when m.i[1]==0, but some */
-        /* sanitizers do not like division by 0.0: */
-        /* http://bugzilla.maptools.org/show_bug.cgi?id=2644 */
-	if (m.i[0]==0 || m.i[1]==0)
-		*value=0.0;
-	else
-		*value=(double)m.i[0]/(double)m.i[1];
-	return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckedSrational(TIFF* tif, TIFFDirEntry* direntry, double* value)
-{
-	UInt64Aligned_t m;
-	assert(sizeof(double)==8);
-	assert(sizeof(uint64)==8);
-	assert(sizeof(int32)==4);
-	assert(sizeof(uint32)==4);
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-	{
-		enum TIFFReadDirEntryErr err;
-		uint32 offset = direntry->tdir_offset.toff_long;
-		if (tif->tif_flags&TIFF_SWAB)
-			TIFFSwabLong(&offset);
-		err=TIFFReadDirEntryData(tif,offset,8,m.i);
-		if (err!=TIFFReadDirEntryErrOk)
-			return(err);
-	}
-	else
-		m.l=direntry->tdir_offset.toff_long8;
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfLong(m.i,2);
-        /* Not completely sure what we should do when m.i[1]==0, but some */
-        /* sanitizers do not like division by 0.0: */
-        /* http://bugzilla.maptools.org/show_bug.cgi?id=2644 */
-	if ((int32)m.i[0]==0 || m.i[1]==0)
-		*value=0.0;
-	else
-		*value=(double)((int32)m.i[0])/(double)m.i[1];
-	return(TIFFReadDirEntryErrOk);
-}
+    if (!SeekOK(tif, offset))
+        return (TIFFReadDirEntryErrIo);
 
-static void TIFFReadDirEntryCheckedFloat(TIFF* tif, TIFFDirEntry* direntry, float* value)
-{
-         union
-	 {
-	   float  f;
-	   uint32 i;
-	 } float_union;
-	assert(sizeof(float)==4);
-	assert(sizeof(uint32)==4);
-	assert(sizeof(float_union)==4);
-	float_union.i=*(uint32*)(&direntry->tdir_offset);
-	*value=float_union.f;
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabLong((uint32*)value);
-}
+    /* On 64 bit processes, read first a maximum of 1 MB, then 10 MB, etc */
+    /* so as to avoid allocating too much memory in case the file is too */
+    /* short. We could ask for the file size, but this might be */
+    /* expensive with some I/O layers (think of reading a gzipped file) */
+    /* Restrict to 64 bit processes, so as to avoid reallocs() */
+    /* on 32 bit processes where virtual memory is scarce.  */
+    while (already_read < size)
+    {
+        void *new_dest;
+        tmsize_t bytes_read;
+        tmsize_t to_read = size - already_read;
+#if SIZEOF_SIZE_T == 8
+        if (to_read >= threshold && threshold < MAX_THRESHOLD)
+        {
+            to_read = threshold;
+            threshold *= THRESHOLD_MULTIPLIER;
+        }
+#endif
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckedDouble(TIFF* tif, TIFFDirEntry* direntry, double* value)
-{
-	assert(sizeof(double)==8);
-	assert(sizeof(uint64)==8);
-	assert(sizeof(UInt64Aligned_t)==8);
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-	{
-		enum TIFFReadDirEntryErr err;
-		uint32 offset = direntry->tdir_offset.toff_long;
-		if (tif->tif_flags&TIFF_SWAB)
-			TIFFSwabLong(&offset);
-		err=TIFFReadDirEntryData(tif,offset,8,value);
-		if (err!=TIFFReadDirEntryErrOk)
-			return(err);
-	}
-	else
-	{
-	       UInt64Aligned_t uint64_union;
-	       uint64_union.l=direntry->tdir_offset.toff_long8;
-	       *value=uint64_union.d;
-	}
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabLong8((uint64*)value);
-	return(TIFFReadDirEntryErrOk);
-}
+        new_dest =
+            (uint8_t *)_TIFFreallocExt(tif, *pdest, already_read + to_read);
+        if (new_dest == NULL)
+        {
+            TIFFErrorExtR(tif, tif->tif_name,
+                          "Failed to allocate memory for %s "
+                          "(%" TIFF_SSIZE_FORMAT
+                          " elements of %" TIFF_SSIZE_FORMAT " bytes each)",
+                          "TIFFReadDirEntryArray", (tmsize_t)1,
+                          already_read + to_read);
+            return TIFFReadDirEntryErrAlloc;
+        }
+        *pdest = new_dest;
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteSbyte(int8 value)
-{
-	if (value<0)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
+        bytes_read = TIFFReadFile(tif, (char *)*pdest + already_read, to_read);
+        already_read += bytes_read;
+        if (bytes_read != to_read)
+        {
+            return TIFFReadDirEntryErrIo;
+        }
+    }
+    return TIFFReadDirEntryErrOk;
 }
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteShort(uint16 value)
-{
-	if (value>0xFF)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
+/* Caution: if raising that value, make sure int32 / uint32 overflows can't
+ * occur elsewhere */
+#define MAX_SIZE_TAG_DATA 2147483647U
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteSshort(int16 value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryArrayWithLimit(TIFF *tif, TIFFDirEntry *direntry,
+                               uint32_t *count, uint32_t desttypesize,
+                               void **value, uint64_t maxcount)
 {
-	if ((value<0)||(value>0xFF))
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
+    int typesize;
+    uint32_t datasize;
+    void *data;
+    uint64_t target_count64;
+    int original_datasize_clamped;
+    typesize = TIFFDataWidth(direntry->tdir_type);
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteLong(uint32 value)
-{
-	if (value>0xFF)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
+    target_count64 =
+        (direntry->tdir_count > maxcount) ? maxcount : direntry->tdir_count;
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteSlong(int32 value)
-{
-	if ((value<0)||(value>0xFF))
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
+    if ((target_count64 == 0) || (typesize == 0))
+    {
+        *value = 0;
+        return (TIFFReadDirEntryErrOk);
+    }
+    (void)desttypesize;
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteLong8(uint64 value)
-{
-	if (value>0xFF)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
+    /* We just want to know if the original tag size is more than 4 bytes
+     * (classic TIFF) or 8 bytes (BigTIFF)
+     */
+    original_datasize_clamped =
+        ((direntry->tdir_count > 10) ? 10 : (int)direntry->tdir_count) *
+        typesize;
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeByteSlong8(int64 value)
-{
-	if ((value<0)||(value>0xFF))
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
+    /*
+     * As a sanity check, make sure we have no more than a 2GB tag array
+     * in either the current data type or the dest data type.  This also
+     * avoids problems with overflow of tmsize_t on 32bit systems.
+     */
+    if ((uint64_t)(MAX_SIZE_TAG_DATA / typesize) < target_count64)
+        return (TIFFReadDirEntryErrSizesan);
+    if ((uint64_t)(MAX_SIZE_TAG_DATA / desttypesize) < target_count64)
+        return (TIFFReadDirEntryErrSizesan);
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteByte(uint8 value)
-{
-	if (value>0x7F)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
+    *count = (uint32_t)target_count64;
+    datasize = (*count) * typesize;
+    assert((tmsize_t)datasize > 0);
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteShort(uint16 value)
-{
-	if (value>0x7F)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
+    if (isMapped(tif) && datasize > (uint64_t)tif->tif_size)
+        return TIFFReadDirEntryErrIo;
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteSshort(int16 value)
-{
-	if ((value<-0x80)||(value>0x7F))
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
+    if (!isMapped(tif) && (((tif->tif_flags & TIFF_BIGTIFF) && datasize > 8) ||
+                           (!(tif->tif_flags & TIFF_BIGTIFF) && datasize > 4)))
+    {
+        data = NULL;
+    }
+    else
+    {
+        data = _TIFFCheckMalloc(tif, *count, typesize, "ReadDirEntryArray");
+        if (data == 0)
+            return (TIFFReadDirEntryErrAlloc);
+    }
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        /* Only the condition on original_datasize_clamped. The second
+         * one is implied, but Coverity Scan cannot see it. */
+        if (original_datasize_clamped <= 4 && datasize <= 4)
+            _TIFFmemcpy(data, &direntry->tdir_offset, datasize);
+        else
+        {
+            enum TIFFReadDirEntryErr err;
+            uint32_t offset = direntry->tdir_offset.toff_long;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong(&offset);
+            if (isMapped(tif))
+                err = TIFFReadDirEntryData(tif, (uint64_t)offset,
+                                           (tmsize_t)datasize, data);
+            else
+                err = TIFFReadDirEntryDataAndRealloc(tif, (uint64_t)offset,
+                                                     (tmsize_t)datasize, &data);
+            if (err != TIFFReadDirEntryErrOk)
+            {
+                _TIFFfreeExt(tif, data);
+                return (err);
+            }
+        }
+    }
+    else
+    {
+        /* See above comment for the Classic TIFF case */
+        if (original_datasize_clamped <= 8 && datasize <= 8)
+            _TIFFmemcpy(data, &direntry->tdir_offset, datasize);
+        else
+        {
+            enum TIFFReadDirEntryErr err;
+            uint64_t offset = direntry->tdir_offset.toff_long8;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8(&offset);
+            if (isMapped(tif))
+                err = TIFFReadDirEntryData(tif, (uint64_t)offset,
+                                           (tmsize_t)datasize, data);
+            else
+                err = TIFFReadDirEntryDataAndRealloc(tif, (uint64_t)offset,
+                                                     (tmsize_t)datasize, &data);
+            if (err != TIFFReadDirEntryErrOk)
+            {
+                _TIFFfreeExt(tif, data);
+                return (err);
+            }
+        }
+    }
+    *value = data;
+    return (TIFFReadDirEntryErrOk);
 }
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteLong(uint32 value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryArray(TIFF *tif, TIFFDirEntry *direntry, uint32_t *count,
+                      uint32_t desttypesize, void **value)
 {
-	if (value>0x7F)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
+    return TIFFReadDirEntryArrayWithLimit(tif, direntry, count, desttypesize,
+                                          value, ~((uint64_t)0));
 }
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteSlong(int32 value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryByteArray(TIFF *tif, TIFFDirEntry *direntry, uint8_t **value)
 {
-	if ((value<-0x80)||(value>0x7F))
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
+    enum TIFFReadDirEntryErr err;
+    uint32_t count;
+    void *origdata;
+    uint8_t *data;
+    switch (direntry->tdir_type)
+    {
+        case TIFF_ASCII:
+        case TIFF_UNDEFINED:
+        case TIFF_BYTE:
+        case TIFF_SBYTE:
+        case TIFF_SHORT:
+        case TIFF_SSHORT:
+        case TIFF_LONG:
+        case TIFF_SLONG:
+        case TIFF_LONG8:
+        case TIFF_SLONG8:
+            break;
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+    err = TIFFReadDirEntryArray(tif, direntry, &count, 1, &origdata);
+    if ((err != TIFFReadDirEntryErrOk) || (origdata == 0))
+    {
+        *value = 0;
+        return (err);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_ASCII:
+        case TIFF_UNDEFINED:
+        case TIFF_BYTE:
+            *value = (uint8_t *)origdata;
+            return (TIFFReadDirEntryErrOk);
+        case TIFF_SBYTE:
+        {
+            int8_t *m;
+            uint32_t n;
+            m = (int8_t *)origdata;
+            for (n = 0; n < count; n++)
+            {
+                err = TIFFReadDirEntryCheckRangeByteSbyte(*m);
+                if (err != TIFFReadDirEntryErrOk)
+                {
+                    _TIFFfreeExt(tif, origdata);
+                    return (err);
+                }
+                m++;
+            }
+            *value = (uint8_t *)origdata;
+            return (TIFFReadDirEntryErrOk);
+        }
+    }
+    data = (uint8_t *)_TIFFmallocExt(tif, count);
+    if (data == 0)
+    {
+        _TIFFfreeExt(tif, origdata);
+        return (TIFFReadDirEntryErrAlloc);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_SHORT:
+        {
+            uint16_t *ma;
+            uint8_t *mb;
+            uint32_t n;
+            ma = (uint16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort(ma);
+                err = TIFFReadDirEntryCheckRangeByteShort(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint8_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SSHORT:
+        {
+            int16_t *ma;
+            uint8_t *mb;
+            uint32_t n;
+            ma = (int16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort((uint16_t *)ma);
+                err = TIFFReadDirEntryCheckRangeByteSshort(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint8_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG:
+        {
+            uint32_t *ma;
+            uint8_t *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                err = TIFFReadDirEntryCheckRangeByteLong(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint8_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG:
+        {
+            int32_t *ma;
+            uint8_t *mb;
+            uint32_t n;
+            ma = (int32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong((uint32_t *)ma);
+                err = TIFFReadDirEntryCheckRangeByteSlong(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint8_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG8:
+        {
+            uint64_t *ma;
+            uint8_t *mb;
+            uint32_t n;
+            ma = (uint64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(ma);
+                err = TIFFReadDirEntryCheckRangeByteLong8(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint8_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG8:
+        {
+            int64_t *ma;
+            uint8_t *mb;
+            uint32_t n;
+            ma = (int64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8((uint64_t *)ma);
+                err = TIFFReadDirEntryCheckRangeByteSlong8(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint8_t)(*ma++);
+            }
+        }
+        break;
+    }
+    _TIFFfreeExt(tif, origdata);
+    if (err != TIFFReadDirEntryErrOk)
+    {
+        _TIFFfreeExt(tif, data);
+        return (err);
+    }
+    *value = data;
+    return (TIFFReadDirEntryErrOk);
 }
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteLong8(uint64 value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntrySbyteArray(TIFF *tif, TIFFDirEntry *direntry, int8_t **value)
 {
-	if (value>0x7F)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
+    enum TIFFReadDirEntryErr err;
+    uint32_t count;
+    void *origdata;
+    int8_t *data;
+    switch (direntry->tdir_type)
+    {
+        case TIFF_UNDEFINED:
+        case TIFF_BYTE:
+        case TIFF_SBYTE:
+        case TIFF_SHORT:
+        case TIFF_SSHORT:
+        case TIFF_LONG:
+        case TIFF_SLONG:
+        case TIFF_LONG8:
+        case TIFF_SLONG8:
+            break;
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+    err = TIFFReadDirEntryArray(tif, direntry, &count, 1, &origdata);
+    if ((err != TIFFReadDirEntryErrOk) || (origdata == 0))
+    {
+        *value = 0;
+        return (err);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_UNDEFINED:
+        case TIFF_BYTE:
+        {
+            uint8_t *m;
+            uint32_t n;
+            m = (uint8_t *)origdata;
+            for (n = 0; n < count; n++)
+            {
+                err = TIFFReadDirEntryCheckRangeSbyteByte(*m);
+                if (err != TIFFReadDirEntryErrOk)
+                {
+                    _TIFFfreeExt(tif, origdata);
+                    return (err);
+                }
+                m++;
+            }
+            *value = (int8_t *)origdata;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SBYTE:
+            *value = (int8_t *)origdata;
+            return (TIFFReadDirEntryErrOk);
+    }
+    data = (int8_t *)_TIFFmallocExt(tif, count);
+    if (data == 0)
+    {
+        _TIFFfreeExt(tif, origdata);
+        return (TIFFReadDirEntryErrAlloc);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_SHORT:
+        {
+            uint16_t *ma;
+            int8_t *mb;
+            uint32_t n;
+            ma = (uint16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort(ma);
+                err = TIFFReadDirEntryCheckRangeSbyteShort(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (int8_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SSHORT:
+        {
+            int16_t *ma;
+            int8_t *mb;
+            uint32_t n;
+            ma = (int16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort((uint16_t *)ma);
+                err = TIFFReadDirEntryCheckRangeSbyteSshort(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (int8_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG:
+        {
+            uint32_t *ma;
+            int8_t *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                err = TIFFReadDirEntryCheckRangeSbyteLong(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (int8_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG:
+        {
+            int32_t *ma;
+            int8_t *mb;
+            uint32_t n;
+            ma = (int32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong((uint32_t *)ma);
+                err = TIFFReadDirEntryCheckRangeSbyteSlong(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (int8_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG8:
+        {
+            uint64_t *ma;
+            int8_t *mb;
+            uint32_t n;
+            ma = (uint64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(ma);
+                err = TIFFReadDirEntryCheckRangeSbyteLong8(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (int8_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG8:
+        {
+            int64_t *ma;
+            int8_t *mb;
+            uint32_t n;
+            ma = (int64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8((uint64_t *)ma);
+                err = TIFFReadDirEntryCheckRangeSbyteSlong8(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (int8_t)(*ma++);
+            }
+        }
+        break;
+    }
+    _TIFFfreeExt(tif, origdata);
+    if (err != TIFFReadDirEntryErrOk)
+    {
+        _TIFFfreeExt(tif, data);
+        return (err);
+    }
+    *value = data;
+    return (TIFFReadDirEntryErrOk);
 }
 
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSbyteSlong8(int64 value)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryShortArray(TIFF *tif, TIFFDirEntry *direntry, uint16_t **value)
 {
-	if ((value<-0x80)||(value>0x7F))
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeShortSbyte(int8 value)
-{
-	if (value<0)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeShortSshort(int16 value)
-{
-	if (value<0)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeShortLong(uint32 value)
-{
-	if (value>0xFFFF)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeShortSlong(int32 value)
-{
-	if ((value<0)||(value>0xFFFF))
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeShortLong8(uint64 value)
-{
-	if (value>0xFFFF)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeShortSlong8(int64 value)
-{
-	if ((value<0)||(value>0xFFFF))
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSshortShort(uint16 value)
-{
-	if (value>0x7FFF)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSshortLong(uint32 value)
-{
-	if (value>0x7FFF)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSshortSlong(int32 value)
-{
-	if ((value<-0x8000)||(value>0x7FFF))
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSshortLong8(uint64 value)
-{
-	if (value>0x7FFF)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeSshortSlong8(int64 value)
-{
-	if ((value<-0x8000)||(value>0x7FFF))
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLongSbyte(int8 value)
-{
-	if (value<0)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLongSshort(int16 value)
-{
-	if (value<0)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr TIFFReadDirEntryCheckRangeLongSlong(int32 value)
-{
-	if (value<0)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr
-TIFFReadDirEntryCheckRangeLongLong8(uint64 value)
-{
-	if (value > TIFF_UINT32_MAX)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr
-TIFFReadDirEntryCheckRangeLongSlong8(int64 value)
-{
-	if ((value < 0) || (value > (int64) TIFF_UINT32_MAX))
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr
-TIFFReadDirEntryCheckRangeSlongLong(uint32 value)
-{
-	if (value > 0x7FFFFFFFUL)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-/* Check that the 8-byte unsigned value can fit in a 4-byte unsigned range */
-static enum TIFFReadDirEntryErr
-TIFFReadDirEntryCheckRangeSlongLong8(uint64 value)
-{
-	if (value > 0x7FFFFFFF)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
+    enum TIFFReadDirEntryErr err;
+    uint32_t count;
+    void *origdata;
+    uint16_t *data;
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        case TIFF_SBYTE:
+        case TIFF_SHORT:
+        case TIFF_SSHORT:
+        case TIFF_LONG:
+        case TIFF_SLONG:
+        case TIFF_LONG8:
+        case TIFF_SLONG8:
+            break;
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+    err = TIFFReadDirEntryArray(tif, direntry, &count, 2, &origdata);
+    if ((err != TIFFReadDirEntryErrOk) || (origdata == 0))
+    {
+        *value = 0;
+        return (err);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_SHORT:
+            *value = (uint16_t *)origdata;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabArrayOfShort(*value, count);
+            return (TIFFReadDirEntryErrOk);
+        case TIFF_SSHORT:
+        {
+            int16_t *m;
+            uint32_t n;
+            m = (int16_t *)origdata;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort((uint16_t *)m);
+                err = TIFFReadDirEntryCheckRangeShortSshort(*m);
+                if (err != TIFFReadDirEntryErrOk)
+                {
+                    _TIFFfreeExt(tif, origdata);
+                    return (err);
+                }
+                m++;
+            }
+            *value = (uint16_t *)origdata;
+            return (TIFFReadDirEntryErrOk);
+        }
+    }
+    data = (uint16_t *)_TIFFmallocExt(tif, count * 2);
+    if (data == 0)
+    {
+        _TIFFfreeExt(tif, origdata);
+        return (TIFFReadDirEntryErrAlloc);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t *ma;
+            uint16_t *mb;
+            uint32_t n;
+            ma = (uint8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (uint16_t)(*ma++);
+        }
+        break;
+        case TIFF_SBYTE:
+        {
+            int8_t *ma;
+            uint16_t *mb;
+            uint32_t n;
+            ma = (int8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                err = TIFFReadDirEntryCheckRangeShortSbyte(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint16_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG:
+        {
+            uint32_t *ma;
+            uint16_t *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                err = TIFFReadDirEntryCheckRangeShortLong(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint16_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG:
+        {
+            int32_t *ma;
+            uint16_t *mb;
+            uint32_t n;
+            ma = (int32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong((uint32_t *)ma);
+                err = TIFFReadDirEntryCheckRangeShortSlong(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint16_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG8:
+        {
+            uint64_t *ma;
+            uint16_t *mb;
+            uint32_t n;
+            ma = (uint64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(ma);
+                err = TIFFReadDirEntryCheckRangeShortLong8(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint16_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG8:
+        {
+            int64_t *ma;
+            uint16_t *mb;
+            uint32_t n;
+            ma = (int64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8((uint64_t *)ma);
+                err = TIFFReadDirEntryCheckRangeShortSlong8(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint16_t)(*ma++);
+            }
+        }
+        break;
+    }
+    _TIFFfreeExt(tif, origdata);
+    if (err != TIFFReadDirEntryErrOk)
+    {
+        _TIFFfreeExt(tif, data);
+        return (err);
+    }
+    *value = data;
+    return (TIFFReadDirEntryErrOk);
 }
 
-/* Check that the 8-byte signed value can fit in a 4-byte signed range */
 static enum TIFFReadDirEntryErr
-TIFFReadDirEntryCheckRangeSlongSlong8(int64 value)
+TIFFReadDirEntrySshortArray(TIFF *tif, TIFFDirEntry *direntry, int16_t **value)
 {
-        if ((value < 0-((int64) 0x7FFFFFFF+1)) || (value > 0x7FFFFFFF))
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
+    enum TIFFReadDirEntryErr err;
+    uint32_t count;
+    void *origdata;
+    int16_t *data;
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        case TIFF_SBYTE:
+        case TIFF_SHORT:
+        case TIFF_SSHORT:
+        case TIFF_LONG:
+        case TIFF_SLONG:
+        case TIFF_LONG8:
+        case TIFF_SLONG8:
+            break;
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+    err = TIFFReadDirEntryArray(tif, direntry, &count, 2, &origdata);
+    if ((err != TIFFReadDirEntryErrOk) || (origdata == 0))
+    {
+        *value = 0;
+        return (err);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_SHORT:
+        {
+            uint16_t *m;
+            uint32_t n;
+            m = (uint16_t *)origdata;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort(m);
+                err = TIFFReadDirEntryCheckRangeSshortShort(*m);
+                if (err != TIFFReadDirEntryErrOk)
+                {
+                    _TIFFfreeExt(tif, origdata);
+                    return (err);
+                }
+                m++;
+            }
+            *value = (int16_t *)origdata;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SSHORT:
+            *value = (int16_t *)origdata;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabArrayOfShort((uint16_t *)(*value), count);
+            return (TIFFReadDirEntryErrOk);
+    }
+    data = (int16_t *)_TIFFmallocExt(tif, count * 2);
+    if (data == 0)
+    {
+        _TIFFfreeExt(tif, origdata);
+        return (TIFFReadDirEntryErrAlloc);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t *ma;
+            int16_t *mb;
+            uint32_t n;
+            ma = (uint8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (int16_t)(*ma++);
+        }
+        break;
+        case TIFF_SBYTE:
+        {
+            int8_t *ma;
+            int16_t *mb;
+            uint32_t n;
+            ma = (int8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (int16_t)(*ma++);
+        }
+        break;
+        case TIFF_LONG:
+        {
+            uint32_t *ma;
+            int16_t *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                err = TIFFReadDirEntryCheckRangeSshortLong(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (int16_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG:
+        {
+            int32_t *ma;
+            int16_t *mb;
+            uint32_t n;
+            ma = (int32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong((uint32_t *)ma);
+                err = TIFFReadDirEntryCheckRangeSshortSlong(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (int16_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG8:
+        {
+            uint64_t *ma;
+            int16_t *mb;
+            uint32_t n;
+            ma = (uint64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(ma);
+                err = TIFFReadDirEntryCheckRangeSshortLong8(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (int16_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG8:
+        {
+            int64_t *ma;
+            int16_t *mb;
+            uint32_t n;
+            ma = (int64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8((uint64_t *)ma);
+                err = TIFFReadDirEntryCheckRangeSshortSlong8(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (int16_t)(*ma++);
+            }
+        }
+        break;
+    }
+    _TIFFfreeExt(tif, origdata);
+    if (err != TIFFReadDirEntryErrOk)
+    {
+        _TIFFfreeExt(tif, data);
+        return (err);
+    }
+    *value = data;
+    return (TIFFReadDirEntryErrOk);
 }
 
 static enum TIFFReadDirEntryErr
-TIFFReadDirEntryCheckRangeLong8Sbyte(int8 value)
+TIFFReadDirEntryLongArray(TIFF *tif, TIFFDirEntry *direntry, uint32_t **value)
 {
-	if (value < 0)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
+    enum TIFFReadDirEntryErr err;
+    uint32_t count;
+    void *origdata;
+    uint32_t *data;
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        case TIFF_SBYTE:
+        case TIFF_SHORT:
+        case TIFF_SSHORT:
+        case TIFF_LONG:
+        case TIFF_SLONG:
+        case TIFF_LONG8:
+        case TIFF_SLONG8:
+            break;
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+    err = TIFFReadDirEntryArray(tif, direntry, &count, 4, &origdata);
+    if ((err != TIFFReadDirEntryErrOk) || (origdata == 0))
+    {
+        *value = 0;
+        return (err);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_LONG:
+            *value = (uint32_t *)origdata;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabArrayOfLong(*value, count);
+            return (TIFFReadDirEntryErrOk);
+        case TIFF_SLONG:
+        {
+            int32_t *m;
+            uint32_t n;
+            m = (int32_t *)origdata;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong((uint32_t *)m);
+                err = TIFFReadDirEntryCheckRangeLongSlong(*m);
+                if (err != TIFFReadDirEntryErrOk)
+                {
+                    _TIFFfreeExt(tif, origdata);
+                    return (err);
+                }
+                m++;
+            }
+            *value = (uint32_t *)origdata;
+            return (TIFFReadDirEntryErrOk);
+        }
+    }
+    data = (uint32_t *)_TIFFmallocExt(tif, count * 4);
+    if (data == 0)
+    {
+        _TIFFfreeExt(tif, origdata);
+        return (TIFFReadDirEntryErrAlloc);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t *ma;
+            uint32_t *mb;
+            uint32_t n;
+            ma = (uint8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (uint32_t)(*ma++);
+        }
+        break;
+        case TIFF_SBYTE:
+        {
+            int8_t *ma;
+            uint32_t *mb;
+            uint32_t n;
+            ma = (int8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                err = TIFFReadDirEntryCheckRangeLongSbyte(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint32_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SHORT:
+        {
+            uint16_t *ma;
+            uint32_t *mb;
+            uint32_t n;
+            ma = (uint16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort(ma);
+                *mb++ = (uint32_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SSHORT:
+        {
+            int16_t *ma;
+            uint32_t *mb;
+            uint32_t n;
+            ma = (int16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort((uint16_t *)ma);
+                err = TIFFReadDirEntryCheckRangeLongSshort(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint32_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG8:
+        {
+            uint64_t *ma;
+            uint32_t *mb;
+            uint32_t n;
+            ma = (uint64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(ma);
+                err = TIFFReadDirEntryCheckRangeLongLong8(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint32_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG8:
+        {
+            int64_t *ma;
+            uint32_t *mb;
+            uint32_t n;
+            ma = (int64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8((uint64_t *)ma);
+                err = TIFFReadDirEntryCheckRangeLongSlong8(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint32_t)(*ma++);
+            }
+        }
+        break;
+    }
+    _TIFFfreeExt(tif, origdata);
+    if (err != TIFFReadDirEntryErrOk)
+    {
+        _TIFFfreeExt(tif, data);
+        return (err);
+    }
+    *value = data;
+    return (TIFFReadDirEntryErrOk);
 }
 
 static enum TIFFReadDirEntryErr
-TIFFReadDirEntryCheckRangeLong8Sshort(int16 value)
+TIFFReadDirEntrySlongArray(TIFF *tif, TIFFDirEntry *direntry, int32_t **value)
 {
-	if (value < 0)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
+    enum TIFFReadDirEntryErr err;
+    uint32_t count;
+    void *origdata;
+    int32_t *data;
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        case TIFF_SBYTE:
+        case TIFF_SHORT:
+        case TIFF_SSHORT:
+        case TIFF_LONG:
+        case TIFF_SLONG:
+        case TIFF_LONG8:
+        case TIFF_SLONG8:
+            break;
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+    err = TIFFReadDirEntryArray(tif, direntry, &count, 4, &origdata);
+    if ((err != TIFFReadDirEntryErrOk) || (origdata == 0))
+    {
+        *value = 0;
+        return (err);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_LONG:
+        {
+            uint32_t *m;
+            uint32_t n;
+            m = (uint32_t *)origdata;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong((uint32_t *)m);
+                err = TIFFReadDirEntryCheckRangeSlongLong(*m);
+                if (err != TIFFReadDirEntryErrOk)
+                {
+                    _TIFFfreeExt(tif, origdata);
+                    return (err);
+                }
+                m++;
+            }
+            *value = (int32_t *)origdata;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG:
+            *value = (int32_t *)origdata;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabArrayOfLong((uint32_t *)(*value), count);
+            return (TIFFReadDirEntryErrOk);
+    }
+    data = (int32_t *)_TIFFmallocExt(tif, count * 4);
+    if (data == 0)
+    {
+        _TIFFfreeExt(tif, origdata);
+        return (TIFFReadDirEntryErrAlloc);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t *ma;
+            int32_t *mb;
+            uint32_t n;
+            ma = (uint8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (int32_t)(*ma++);
+        }
+        break;
+        case TIFF_SBYTE:
+        {
+            int8_t *ma;
+            int32_t *mb;
+            uint32_t n;
+            ma = (int8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (int32_t)(*ma++);
+        }
+        break;
+        case TIFF_SHORT:
+        {
+            uint16_t *ma;
+            int32_t *mb;
+            uint32_t n;
+            ma = (uint16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort(ma);
+                *mb++ = (int32_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SSHORT:
+        {
+            int16_t *ma;
+            int32_t *mb;
+            uint32_t n;
+            ma = (int16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort((uint16_t *)ma);
+                *mb++ = (int32_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG8:
+        {
+            uint64_t *ma;
+            int32_t *mb;
+            uint32_t n;
+            ma = (uint64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(ma);
+                err = TIFFReadDirEntryCheckRangeSlongLong8(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (int32_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG8:
+        {
+            int64_t *ma;
+            int32_t *mb;
+            uint32_t n;
+            ma = (int64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8((uint64_t *)ma);
+                err = TIFFReadDirEntryCheckRangeSlongSlong8(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (int32_t)(*ma++);
+            }
+        }
+        break;
+    }
+    _TIFFfreeExt(tif, origdata);
+    if (err != TIFFReadDirEntryErrOk)
+    {
+        _TIFFfreeExt(tif, data);
+        return (err);
+    }
+    *value = data;
+    return (TIFFReadDirEntryErrOk);
 }
 
 static enum TIFFReadDirEntryErr
-TIFFReadDirEntryCheckRangeLong8Slong(int32 value)
+TIFFReadDirEntryLong8ArrayWithLimit(TIFF *tif, TIFFDirEntry *direntry,
+                                    uint64_t **value, uint64_t maxcount)
 {
-	if (value < 0)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
+    enum TIFFReadDirEntryErr err;
+    uint32_t count;
+    void *origdata;
+    uint64_t *data;
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        case TIFF_SBYTE:
+        case TIFF_SHORT:
+        case TIFF_SSHORT:
+        case TIFF_LONG:
+        case TIFF_SLONG:
+        case TIFF_LONG8:
+        case TIFF_SLONG8:
+            break;
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+    err = TIFFReadDirEntryArrayWithLimit(tif, direntry, &count, 8, &origdata,
+                                         maxcount);
+    if ((err != TIFFReadDirEntryErrOk) || (origdata == 0))
+    {
+        *value = 0;
+        return (err);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_LONG8:
+            *value = (uint64_t *)origdata;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabArrayOfLong8(*value, count);
+            return (TIFFReadDirEntryErrOk);
+        case TIFF_SLONG8:
+        {
+            int64_t *m;
+            uint32_t n;
+            m = (int64_t *)origdata;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8((uint64_t *)m);
+                err = TIFFReadDirEntryCheckRangeLong8Slong8(*m);
+                if (err != TIFFReadDirEntryErrOk)
+                {
+                    _TIFFfreeExt(tif, origdata);
+                    return (err);
+                }
+                m++;
+            }
+            *value = (uint64_t *)origdata;
+            return (TIFFReadDirEntryErrOk);
+        }
+    }
+    data = (uint64_t *)_TIFFmallocExt(tif, count * 8);
+    if (data == 0)
+    {
+        _TIFFfreeExt(tif, origdata);
+        return (TIFFReadDirEntryErrAlloc);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t *ma;
+            uint64_t *mb;
+            uint32_t n;
+            ma = (uint8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (uint64_t)(*ma++);
+        }
+        break;
+        case TIFF_SBYTE:
+        {
+            int8_t *ma;
+            uint64_t *mb;
+            uint32_t n;
+            ma = (int8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                err = TIFFReadDirEntryCheckRangeLong8Sbyte(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint64_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SHORT:
+        {
+            uint16_t *ma;
+            uint64_t *mb;
+            uint32_t n;
+            ma = (uint16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort(ma);
+                *mb++ = (uint64_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SSHORT:
+        {
+            int16_t *ma;
+            uint64_t *mb;
+            uint32_t n;
+            ma = (int16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort((uint16_t *)ma);
+                err = TIFFReadDirEntryCheckRangeLong8Sshort(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint64_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG:
+        {
+            uint32_t *ma;
+            uint64_t *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                *mb++ = (uint64_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG:
+        {
+            int32_t *ma;
+            uint64_t *mb;
+            uint32_t n;
+            ma = (int32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong((uint32_t *)ma);
+                err = TIFFReadDirEntryCheckRangeLong8Slong(*ma);
+                if (err != TIFFReadDirEntryErrOk)
+                    break;
+                *mb++ = (uint64_t)(*ma++);
+            }
+        }
+        break;
+    }
+    _TIFFfreeExt(tif, origdata);
+    if (err != TIFFReadDirEntryErrOk)
+    {
+        _TIFFfreeExt(tif, data);
+        return (err);
+    }
+    *value = data;
+    return (TIFFReadDirEntryErrOk);
 }
 
 static enum TIFFReadDirEntryErr
-TIFFReadDirEntryCheckRangeLong8Slong8(int64 value)
+TIFFReadDirEntryLong8Array(TIFF *tif, TIFFDirEntry *direntry, uint64_t **value)
 {
-	if (value < 0)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
+    return TIFFReadDirEntryLong8ArrayWithLimit(tif, direntry, value,
+                                               ~((uint64_t)0));
 }
 
 static enum TIFFReadDirEntryErr
-TIFFReadDirEntryCheckRangeSlong8Long8(uint64 value)
+TIFFReadDirEntrySlong8Array(TIFF *tif, TIFFDirEntry *direntry, int64_t **value)
 {
-	if (value > TIFF_INT64_MAX)
-		return(TIFFReadDirEntryErrRange);
-	else
-		return(TIFFReadDirEntryErrOk);
-}
-
-static enum TIFFReadDirEntryErr
-TIFFReadDirEntryData(TIFF* tif, uint64 offset, tmsize_t size, void* dest)
-{
-	assert(size>0);
-	if (!isMapped(tif)) {
-		if (!SeekOK(tif,offset))
-			return(TIFFReadDirEntryErrIo);
-		if (!ReadOK(tif,dest,size))
-			return(TIFFReadDirEntryErrIo);
-	} else {
-		size_t ma,mb;
-		ma=(size_t)offset;
-                if( (uint64)ma!=offset ||
-                    ma > (~(size_t)0) - (size_t)size )
+    enum TIFFReadDirEntryErr err;
+    uint32_t count;
+    void *origdata;
+    int64_t *data;
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        case TIFF_SBYTE:
+        case TIFF_SHORT:
+        case TIFF_SSHORT:
+        case TIFF_LONG:
+        case TIFF_SLONG:
+        case TIFF_LONG8:
+        case TIFF_SLONG8:
+            break;
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+    err = TIFFReadDirEntryArray(tif, direntry, &count, 8, &origdata);
+    if ((err != TIFFReadDirEntryErrOk) || (origdata == 0))
+    {
+        *value = 0;
+        return (err);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_LONG8:
+        {
+            uint64_t *m;
+            uint32_t n;
+            m = (uint64_t *)origdata;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(m);
+                err = TIFFReadDirEntryCheckRangeSlong8Long8(*m);
+                if (err != TIFFReadDirEntryErrOk)
                 {
-                    return TIFFReadDirEntryErrIo;
+                    _TIFFfreeExt(tif, origdata);
+                    return (err);
                 }
-		mb=ma+size;
-		if (mb > (uint64)tif->tif_size)
-			return(TIFFReadDirEntryErrIo);
-		_TIFFmemcpy(dest,tif->tif_base+ma,size);
-	}
-	return(TIFFReadDirEntryErrOk);
+                m++;
+            }
+            *value = (int64_t *)origdata;
+            return (TIFFReadDirEntryErrOk);
+        }
+        case TIFF_SLONG8:
+            *value = (int64_t *)origdata;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabArrayOfLong8((uint64_t *)(*value), count);
+            return (TIFFReadDirEntryErrOk);
+    }
+    data = (int64_t *)_TIFFmallocExt(tif, count * 8);
+    if (data == 0)
+    {
+        _TIFFfreeExt(tif, origdata);
+        return (TIFFReadDirEntryErrAlloc);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t *ma;
+            int64_t *mb;
+            uint32_t n;
+            ma = (uint8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (int64_t)(*ma++);
+        }
+        break;
+        case TIFF_SBYTE:
+        {
+            int8_t *ma;
+            int64_t *mb;
+            uint32_t n;
+            ma = (int8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (int64_t)(*ma++);
+        }
+        break;
+        case TIFF_SHORT:
+        {
+            uint16_t *ma;
+            int64_t *mb;
+            uint32_t n;
+            ma = (uint16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort(ma);
+                *mb++ = (int64_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SSHORT:
+        {
+            int16_t *ma;
+            int64_t *mb;
+            uint32_t n;
+            ma = (int16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort((uint16_t *)ma);
+                *mb++ = (int64_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG:
+        {
+            uint32_t *ma;
+            int64_t *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                *mb++ = (int64_t)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG:
+        {
+            int32_t *ma;
+            int64_t *mb;
+            uint32_t n;
+            ma = (int32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong((uint32_t *)ma);
+                *mb++ = (int64_t)(*ma++);
+            }
+        }
+        break;
+    }
+    _TIFFfreeExt(tif, origdata);
+    *value = data;
+    return (TIFFReadDirEntryErrOk);
 }
 
-static void TIFFReadDirEntryOutputErr(TIFF* tif, enum TIFFReadDirEntryErr err, const char* module, const char* tagname, int recover)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryFloatArray(TIFF *tif, TIFFDirEntry *direntry, float **value)
 {
-	if (!recover) {
-		switch (err) {
-			case TIFFReadDirEntryErrCount:
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "Incorrect count for \"%s\"",
-					     tagname);
-				break;
-			case TIFFReadDirEntryErrType:
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "Incompatible type for \"%s\"",
-					     tagname);
-				break;
-			case TIFFReadDirEntryErrIo:
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "IO error during reading of \"%s\"",
-					     tagname);
-				break;
-			case TIFFReadDirEntryErrRange:
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "Incorrect value for \"%s\"",
-					     tagname);
-				break;
-			case TIFFReadDirEntryErrPsdif:
-				TIFFErrorExt(tif->tif_clientdata, module,
-			"Cannot handle different values per sample for \"%s\"",
-					     tagname);
-				break;
-			case TIFFReadDirEntryErrSizesan:
-				TIFFErrorExt(tif->tif_clientdata, module,
-				"Sanity check on size of \"%s\" value failed",
-					     tagname);
-				break;
-			case TIFFReadDirEntryErrAlloc:
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "Out of memory reading of \"%s\"",
-					     tagname);
-				break;
-			default:
-				assert(0);   /* we should never get here */
-				break;
-		}
-	} else {
-		switch (err) {
-			case TIFFReadDirEntryErrCount:
-				TIFFWarningExt(tif->tif_clientdata, module,
-				"Incorrect count for \"%s\"; tag ignored",
-					     tagname);
-				break;
-			case TIFFReadDirEntryErrType:
-				TIFFWarningExt(tif->tif_clientdata, module,
-				"Incompatible type for \"%s\"; tag ignored",
-					       tagname);
-				break;
-			case TIFFReadDirEntryErrIo:
-				TIFFWarningExt(tif->tif_clientdata, module,
-			"IO error during reading of \"%s\"; tag ignored",
-					       tagname);
-				break;
-			case TIFFReadDirEntryErrRange:
-				TIFFWarningExt(tif->tif_clientdata, module,
-				"Incorrect value for \"%s\"; tag ignored",
-					       tagname);
-				break;
-			case TIFFReadDirEntryErrPsdif:
-				TIFFWarningExt(tif->tif_clientdata, module,
-	"Cannot handle different values per sample for \"%s\"; tag ignored",
-					       tagname);
-				break;
-			case TIFFReadDirEntryErrSizesan:
-				TIFFWarningExt(tif->tif_clientdata, module,
-		"Sanity check on size of \"%s\" value failed; tag ignored",
-					       tagname);
-				break;
-			case TIFFReadDirEntryErrAlloc:
-				TIFFWarningExt(tif->tif_clientdata, module,
-				"Out of memory reading of \"%s\"; tag ignored",
-					       tagname);
-				break;
-			default:
-				assert(0);   /* we should never get here */
-				break;
-		}
-	}
+    enum TIFFReadDirEntryErr err;
+    uint32_t count;
+    void *origdata;
+    float *data;
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        case TIFF_SBYTE:
+        case TIFF_SHORT:
+        case TIFF_SSHORT:
+        case TIFF_LONG:
+        case TIFF_SLONG:
+        case TIFF_LONG8:
+        case TIFF_SLONG8:
+        case TIFF_RATIONAL:
+        case TIFF_SRATIONAL:
+        case TIFF_FLOAT:
+        case TIFF_DOUBLE:
+            break;
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+    err = TIFFReadDirEntryArray(tif, direntry, &count, 4, &origdata);
+    if ((err != TIFFReadDirEntryErrOk) || (origdata == 0))
+    {
+        *value = 0;
+        return (err);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_FLOAT:
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabArrayOfLong((uint32_t *)origdata, count);
+            TIFFCvtIEEEDoubleToNative(tif, count, (float *)origdata);
+            *value = (float *)origdata;
+            return (TIFFReadDirEntryErrOk);
+    }
+    data = (float *)_TIFFmallocExt(tif, count * sizeof(float));
+    if (data == 0)
+    {
+        _TIFFfreeExt(tif, origdata);
+        return (TIFFReadDirEntryErrAlloc);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t *ma;
+            float *mb;
+            uint32_t n;
+            ma = (uint8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (float)(*ma++);
+        }
+        break;
+        case TIFF_SBYTE:
+        {
+            int8_t *ma;
+            float *mb;
+            uint32_t n;
+            ma = (int8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (float)(*ma++);
+        }
+        break;
+        case TIFF_SHORT:
+        {
+            uint16_t *ma;
+            float *mb;
+            uint32_t n;
+            ma = (uint16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort(ma);
+                *mb++ = (float)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SSHORT:
+        {
+            int16_t *ma;
+            float *mb;
+            uint32_t n;
+            ma = (int16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort((uint16_t *)ma);
+                *mb++ = (float)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG:
+        {
+            uint32_t *ma;
+            float *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                *mb++ = (float)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG:
+        {
+            int32_t *ma;
+            float *mb;
+            uint32_t n;
+            ma = (int32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong((uint32_t *)ma);
+                *mb++ = (float)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG8:
+        {
+            uint64_t *ma;
+            float *mb;
+            uint32_t n;
+            ma = (uint64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(ma);
+#if defined(__WIN32__) && (_MSC_VER < 1500)
+                /*
+                 * XXX: MSVC 6.0 does not support
+                 * conversion of 64-bit integers into
+                 * floating point values.
+                 */
+                *mb++ = _TIFFUInt64ToFloat(*ma++);
+#else
+                *mb++ = (float)(*ma++);
+#endif
+            }
+        }
+        break;
+        case TIFF_SLONG8:
+        {
+            int64_t *ma;
+            float *mb;
+            uint32_t n;
+            ma = (int64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8((uint64_t *)ma);
+                *mb++ = (float)(*ma++);
+            }
+        }
+        break;
+        case TIFF_RATIONAL:
+        {
+            uint32_t *ma;
+            uint32_t maa;
+            uint32_t mab;
+            float *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                maa = *ma++;
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                mab = *ma++;
+                if (mab == 0)
+                    *mb++ = 0.0;
+                else
+                    *mb++ = (float)maa / (float)mab;
+            }
+        }
+        break;
+        case TIFF_SRATIONAL:
+        {
+            uint32_t *ma;
+            int32_t maa;
+            uint32_t mab;
+            float *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                maa = *(int32_t *)ma;
+                ma++;
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                mab = *ma++;
+                if (mab == 0)
+                    *mb++ = 0.0;
+                else
+                    *mb++ = (float)maa / (float)mab;
+            }
+        }
+        break;
+        case TIFF_DOUBLE:
+        {
+            double *ma;
+            float *mb;
+            uint32_t n;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabArrayOfLong8((uint64_t *)origdata, count);
+            TIFFCvtIEEEDoubleToNative(tif, count, (double *)origdata);
+            ma = (double *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                double val = *ma++;
+                if (val > FLT_MAX)
+                    val = FLT_MAX;
+                else if (val < -FLT_MAX)
+                    val = -FLT_MAX;
+                *mb++ = (float)val;
+            }
+        }
+        break;
+    }
+    _TIFFfreeExt(tif, origdata);
+    *value = data;
+    return (TIFFReadDirEntryErrOk);
 }
 
-/*
- * Return the maximum number of color channels specified for a given photometric
- * type. 0 is returned if photometric type isn't supported or no default value
- * is defined by the specification.
- */
-static int _TIFFGetMaxColorChannels( uint16 photometric )
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryDoubleArray(TIFF *tif, TIFFDirEntry *direntry, double **value)
 {
-    switch (photometric) {
-	case PHOTOMETRIC_PALETTE:
-	case PHOTOMETRIC_MINISWHITE:
-	case PHOTOMETRIC_MINISBLACK:
-            return 1;
-	case PHOTOMETRIC_YCBCR:
-	case PHOTOMETRIC_RGB:
-	case PHOTOMETRIC_CIELAB:
-	case PHOTOMETRIC_LOGLUV:
-	case PHOTOMETRIC_ITULAB:
-	case PHOTOMETRIC_ICCLAB:
-            return 3;
-	case PHOTOMETRIC_SEPARATED:
-	case PHOTOMETRIC_MASK:
-            return 4;
-	case PHOTOMETRIC_LOGL:
-	case PHOTOMETRIC_CFA:
-	default:
-            return 0;
+    enum TIFFReadDirEntryErr err;
+    uint32_t count;
+    void *origdata;
+    double *data;
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        case TIFF_SBYTE:
+        case TIFF_SHORT:
+        case TIFF_SSHORT:
+        case TIFF_LONG:
+        case TIFF_SLONG:
+        case TIFF_LONG8:
+        case TIFF_SLONG8:
+        case TIFF_RATIONAL:
+        case TIFF_SRATIONAL:
+        case TIFF_FLOAT:
+        case TIFF_DOUBLE:
+            break;
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+    err = TIFFReadDirEntryArray(tif, direntry, &count, 8, &origdata);
+    if ((err != TIFFReadDirEntryErrOk) || (origdata == 0))
+    {
+        *value = 0;
+        return (err);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_DOUBLE:
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabArrayOfLong8((uint64_t *)origdata, count);
+            TIFFCvtIEEEDoubleToNative(tif, count, (double *)origdata);
+            *value = (double *)origdata;
+            return (TIFFReadDirEntryErrOk);
+    }
+    data = (double *)_TIFFmallocExt(tif, count * sizeof(double));
+    if (data == 0)
+    {
+        _TIFFfreeExt(tif, origdata);
+        return (TIFFReadDirEntryErrAlloc);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_BYTE:
+        {
+            uint8_t *ma;
+            double *mb;
+            uint32_t n;
+            ma = (uint8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (double)(*ma++);
+        }
+        break;
+        case TIFF_SBYTE:
+        {
+            int8_t *ma;
+            double *mb;
+            uint32_t n;
+            ma = (int8_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (double)(*ma++);
+        }
+        break;
+        case TIFF_SHORT:
+        {
+            uint16_t *ma;
+            double *mb;
+            uint32_t n;
+            ma = (uint16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort(ma);
+                *mb++ = (double)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SSHORT:
+        {
+            int16_t *ma;
+            double *mb;
+            uint32_t n;
+            ma = (int16_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort((uint16_t *)ma);
+                *mb++ = (double)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG:
+        {
+            uint32_t *ma;
+            double *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                *mb++ = (double)(*ma++);
+            }
+        }
+        break;
+        case TIFF_SLONG:
+        {
+            int32_t *ma;
+            double *mb;
+            uint32_t n;
+            ma = (int32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong((uint32_t *)ma);
+                *mb++ = (double)(*ma++);
+            }
+        }
+        break;
+        case TIFF_LONG8:
+        {
+            uint64_t *ma;
+            double *mb;
+            uint32_t n;
+            ma = (uint64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(ma);
+#if defined(__WIN32__) && (_MSC_VER < 1500)
+                /*
+                 * XXX: MSVC 6.0 does not support
+                 * conversion of 64-bit integers into
+                 * floating point values.
+                 */
+                *mb++ = _TIFFUInt64ToDouble(*ma++);
+#else
+                *mb++ = (double)(*ma++);
+#endif
+            }
+        }
+        break;
+        case TIFF_SLONG8:
+        {
+            int64_t *ma;
+            double *mb;
+            uint32_t n;
+            ma = (int64_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8((uint64_t *)ma);
+                *mb++ = (double)(*ma++);
+            }
+        }
+        break;
+        case TIFF_RATIONAL:
+        {
+            uint32_t *ma;
+            uint32_t maa;
+            uint32_t mab;
+            double *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                maa = *ma++;
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                mab = *ma++;
+                if (mab == 0)
+                    *mb++ = 0.0;
+                else
+                    *mb++ = (double)maa / (double)mab;
+            }
+        }
+        break;
+        case TIFF_SRATIONAL:
+        {
+            uint32_t *ma;
+            int32_t maa;
+            uint32_t mab;
+            double *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                maa = *(int32_t *)ma;
+                ma++;
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                mab = *ma++;
+                if (mab == 0)
+                    *mb++ = 0.0;
+                else
+                    *mb++ = (double)maa / (double)mab;
+            }
+        }
+        break;
+        case TIFF_FLOAT:
+        {
+            float *ma;
+            double *mb;
+            uint32_t n;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabArrayOfLong((uint32_t *)origdata, count);
+            TIFFCvtIEEEFloatToNative(tif, count, (float *)origdata);
+            ma = (float *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+                *mb++ = (double)(*ma++);
+        }
+        break;
     }
+    _TIFFfreeExt(tif, origdata);
+    *value = data;
+    return (TIFFReadDirEntryErrOk);
 }
 
-static int ByteCountLooksBad(TIFF* tif)
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryIfd8Array(TIFF *tif, TIFFDirEntry *direntry, uint64_t **value)
 {
-    /*
-        * Assume we have wrong StripByteCount value (in case
-        * of single strip) in following cases:
-        *   - it is equal to zero along with StripOffset;
-        *   - it is larger than file itself (in case of uncompressed
-        *     image);
-        *   - it is smaller than the size of the bytes per row
-        *     multiplied on the number of rows.  The last case should
-        *     not be checked in the case of writing new image,
-        *     because we may do not know the exact strip size
-        *     until the whole image will be written and directory
-        *     dumped out.
-        */
-    uint64 bytecount = TIFFGetStrileByteCount(tif, 0);
-    uint64 offset = TIFFGetStrileOffset(tif, 0);
-    uint64 filesize;
-
-    if( offset == 0 )
-        return 0;
-    if (bytecount == 0)
-        return 1;
-    if ( tif->tif_dir.td_compression != COMPRESSION_NONE )
-        return 0;
-    filesize = TIFFGetFileSize(tif);
-    if( offset <= filesize && bytecount > filesize - offset )
-        return 1;
-    if( tif->tif_mode == O_RDONLY )
+    enum TIFFReadDirEntryErr err;
+    uint32_t count;
+    void *origdata;
+    uint64_t *data;
+    switch (direntry->tdir_type)
+    {
+        case TIFF_LONG:
+        case TIFF_LONG8:
+        case TIFF_IFD:
+        case TIFF_IFD8:
+            break;
+        default:
+            return (TIFFReadDirEntryErrType);
+    }
+    err = TIFFReadDirEntryArray(tif, direntry, &count, 8, &origdata);
+    if ((err != TIFFReadDirEntryErrOk) || (origdata == 0))
+    {
+        *value = 0;
+        return (err);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_LONG8:
+        case TIFF_IFD8:
+            *value = (uint64_t *)origdata;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabArrayOfLong8(*value, count);
+            return (TIFFReadDirEntryErrOk);
+    }
+    data = (uint64_t *)_TIFFmallocExt(tif, count * 8);
+    if (data == 0)
     {
-        uint64 scanlinesize = TIFFScanlineSize64(tif);
-        if( tif->tif_dir.td_imagelength > 0 &&
-            scanlinesize > TIFF_UINT64_MAX / tif->tif_dir.td_imagelength )
+        _TIFFfreeExt(tif, origdata);
+        return (TIFFReadDirEntryErrAlloc);
+    }
+    switch (direntry->tdir_type)
+    {
+        case TIFF_LONG:
+        case TIFF_IFD:
         {
-            return 1;
+            uint32_t *ma;
+            uint64_t *mb;
+            uint32_t n;
+            ma = (uint32_t *)origdata;
+            mb = data;
+            for (n = 0; n < count; n++)
+            {
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(ma);
+                *mb++ = (uint64_t)(*ma++);
+            }
+        }
+        break;
+    }
+    _TIFFfreeExt(tif, origdata);
+    *value = data;
+    return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryPersampleShort(TIFF *tif, TIFFDirEntry *direntry,
+                               uint16_t *value)
+{
+    enum TIFFReadDirEntryErr err;
+    uint16_t *m;
+    uint16_t *na;
+    uint16_t nb;
+    if (direntry->tdir_count < (uint64_t)tif->tif_dir.td_samplesperpixel)
+        return (TIFFReadDirEntryErrCount);
+    err = TIFFReadDirEntryShortArray(tif, direntry, &m);
+    if (err != TIFFReadDirEntryErrOk || m == NULL)
+        return (err);
+    na = m;
+    nb = tif->tif_dir.td_samplesperpixel;
+    *value = *na++;
+    nb--;
+    while (nb > 0)
+    {
+        if (*na++ != *value)
+        {
+            err = TIFFReadDirEntryErrPsdif;
+            break;
+        }
+        nb--;
+    }
+    _TIFFfreeExt(tif, m);
+    return (err);
+}
+
+static void TIFFReadDirEntryCheckedByte(TIFF *tif, TIFFDirEntry *direntry,
+                                        uint8_t *value)
+{
+    (void)tif;
+    *value = *(uint8_t *)(&direntry->tdir_offset);
+}
+
+static void TIFFReadDirEntryCheckedSbyte(TIFF *tif, TIFFDirEntry *direntry,
+                                         int8_t *value)
+{
+    (void)tif;
+    *value = *(int8_t *)(&direntry->tdir_offset);
+}
+
+static void TIFFReadDirEntryCheckedShort(TIFF *tif, TIFFDirEntry *direntry,
+                                         uint16_t *value)
+{
+    *value = direntry->tdir_offset.toff_short;
+    /* *value=*(uint16_t*)(&direntry->tdir_offset); */
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabShort(value);
+}
+
+static void TIFFReadDirEntryCheckedSshort(TIFF *tif, TIFFDirEntry *direntry,
+                                          int16_t *value)
+{
+    *value = *(int16_t *)(&direntry->tdir_offset);
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabShort((uint16_t *)value);
+}
+
+static void TIFFReadDirEntryCheckedLong(TIFF *tif, TIFFDirEntry *direntry,
+                                        uint32_t *value)
+{
+    *value = *(uint32_t *)(&direntry->tdir_offset);
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabLong(value);
+}
+
+static void TIFFReadDirEntryCheckedSlong(TIFF *tif, TIFFDirEntry *direntry,
+                                         int32_t *value)
+{
+    *value = *(int32_t *)(&direntry->tdir_offset);
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabLong((uint32_t *)value);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckedLong8(TIFF *tif, TIFFDirEntry *direntry, uint64_t *value)
+{
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        enum TIFFReadDirEntryErr err;
+        uint32_t offset = direntry->tdir_offset.toff_long;
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong(&offset);
+        err = TIFFReadDirEntryData(tif, offset, 8, value);
+        if (err != TIFFReadDirEntryErrOk)
+            return (err);
+    }
+    else
+        *value = direntry->tdir_offset.toff_long8;
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabLong8(value);
+    return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckedSlong8(TIFF *tif, TIFFDirEntry *direntry, int64_t *value)
+{
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        enum TIFFReadDirEntryErr err;
+        uint32_t offset = direntry->tdir_offset.toff_long;
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong(&offset);
+        err = TIFFReadDirEntryData(tif, offset, 8, value);
+        if (err != TIFFReadDirEntryErrOk)
+            return (err);
+    }
+    else
+        *value = *(int64_t *)(&direntry->tdir_offset);
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabLong8((uint64_t *)value);
+    return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckedRational(TIFF *tif, TIFFDirEntry *direntry,
+                                double *value)
+{
+    UInt64Aligned_t m;
+
+    assert(sizeof(double) == 8);
+    assert(sizeof(uint64_t) == 8);
+    assert(sizeof(uint32_t) == 4);
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        enum TIFFReadDirEntryErr err;
+        uint32_t offset = direntry->tdir_offset.toff_long;
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong(&offset);
+        err = TIFFReadDirEntryData(tif, offset, 8, m.i);
+        if (err != TIFFReadDirEntryErrOk)
+            return (err);
+    }
+    else
+        m.l = direntry->tdir_offset.toff_long8;
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong(m.i, 2);
+    /* Not completely sure what we should do when m.i[1]==0, but some */
+    /* sanitizers do not like division by 0.0: */
+    /* http://bugzilla.maptools.org/show_bug.cgi?id=2644 */
+    if (m.i[0] == 0 || m.i[1] == 0)
+        *value = 0.0;
+    else
+        *value = (double)m.i[0] / (double)m.i[1];
+    return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckedSrational(TIFF *tif, TIFFDirEntry *direntry,
+                                 double *value)
+{
+    UInt64Aligned_t m;
+    assert(sizeof(double) == 8);
+    assert(sizeof(uint64_t) == 8);
+    assert(sizeof(int32_t) == 4);
+    assert(sizeof(uint32_t) == 4);
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        enum TIFFReadDirEntryErr err;
+        uint32_t offset = direntry->tdir_offset.toff_long;
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong(&offset);
+        err = TIFFReadDirEntryData(tif, offset, 8, m.i);
+        if (err != TIFFReadDirEntryErrOk)
+            return (err);
+    }
+    else
+        m.l = direntry->tdir_offset.toff_long8;
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong(m.i, 2);
+    /* Not completely sure what we should do when m.i[1]==0, but some */
+    /* sanitizers do not like division by 0.0: */
+    /* http://bugzilla.maptools.org/show_bug.cgi?id=2644 */
+    if ((int32_t)m.i[0] == 0 || m.i[1] == 0)
+        *value = 0.0;
+    else
+        *value = (double)((int32_t)m.i[0]) / (double)m.i[1];
+    return (TIFFReadDirEntryErrOk);
+}
+
+#if 0
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckedRationalDirect(TIFF *tif, TIFFDirEntry *direntry,
+                                      TIFFRational_t *value)
+{ /*--: SetGetRATIONAL_directly:_CustomTag: Read rational (and signed rationals)
+     directly --*/
+    UInt64Aligned_t m;
+
+    assert(sizeof(double) == 8);
+    assert(sizeof(uint64_t) == 8);
+    assert(sizeof(uint32_t) == 4);
+
+    if (direntry->tdir_count != 1)
+        return (TIFFReadDirEntryErrCount);
+
+    if (direntry->tdir_type != TIFF_RATIONAL &&
+        direntry->tdir_type != TIFF_SRATIONAL)
+        return (TIFFReadDirEntryErrType);
+
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        enum TIFFReadDirEntryErr err;
+        uint32_t offset = direntry->tdir_offset.toff_long;
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong(&offset);
+        err = TIFFReadDirEntryData(tif, offset, 8, m.i);
+        if (err != TIFFReadDirEntryErrOk)
+            return (err);
+    }
+    else
+    {
+        m.l = direntry->tdir_offset.toff_long8;
+    }
+
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong(m.i, 2);
+
+    value->uNum = m.i[0];
+    value->uDenom = m.i[1];
+    return (TIFFReadDirEntryErrOk);
+} /*-- TIFFReadDirEntryCheckedRationalDirect() --*/
+#endif
+
+static void TIFFReadDirEntryCheckedFloat(TIFF *tif, TIFFDirEntry *direntry,
+                                         float *value)
+{
+    union
+    {
+        float f;
+        uint32_t i;
+    } float_union;
+    assert(sizeof(float) == 4);
+    assert(sizeof(uint32_t) == 4);
+    assert(sizeof(float_union) == 4);
+    float_union.i = *(uint32_t *)(&direntry->tdir_offset);
+    *value = float_union.f;
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabLong((uint32_t *)value);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckedDouble(TIFF *tif, TIFFDirEntry *direntry, double *value)
+{
+    assert(sizeof(double) == 8);
+    assert(sizeof(uint64_t) == 8);
+    assert(sizeof(UInt64Aligned_t) == 8);
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        enum TIFFReadDirEntryErr err;
+        uint32_t offset = direntry->tdir_offset.toff_long;
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong(&offset);
+        err = TIFFReadDirEntryData(tif, offset, 8, value);
+        if (err != TIFFReadDirEntryErrOk)
+            return (err);
+    }
+    else
+    {
+        UInt64Aligned_t uint64_union;
+        uint64_union.l = direntry->tdir_offset.toff_long8;
+        *value = uint64_union.d;
+    }
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabLong8((uint64_t *)value);
+    return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteSbyte(int8_t value)
+{
+    if (value < 0)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteShort(uint16_t value)
+{
+    if (value > 0xFF)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteSshort(int16_t value)
+{
+    if ((value < 0) || (value > 0xFF))
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteLong(uint32_t value)
+{
+    if (value > 0xFF)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteSlong(int32_t value)
+{
+    if ((value < 0) || (value > 0xFF))
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteLong8(uint64_t value)
+{
+    if (value > 0xFF)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeByteSlong8(int64_t value)
+{
+    if ((value < 0) || (value > 0xFF))
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteByte(uint8_t value)
+{
+    if (value > 0x7F)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteShort(uint16_t value)
+{
+    if (value > 0x7F)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteSshort(int16_t value)
+{
+    if ((value < -0x80) || (value > 0x7F))
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteLong(uint32_t value)
+{
+    if (value > 0x7F)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteSlong(int32_t value)
+{
+    if ((value < -0x80) || (value > 0x7F))
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteLong8(uint64_t value)
+{
+    if (value > 0x7F)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSbyteSlong8(int64_t value)
+{
+    if ((value < -0x80) || (value > 0x7F))
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeShortSbyte(int8_t value)
+{
+    if (value < 0)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeShortSshort(int16_t value)
+{
+    if (value < 0)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeShortLong(uint32_t value)
+{
+    if (value > 0xFFFF)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeShortSlong(int32_t value)
+{
+    if ((value < 0) || (value > 0xFFFF))
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeShortLong8(uint64_t value)
+{
+    if (value > 0xFFFF)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeShortSlong8(int64_t value)
+{
+    if ((value < 0) || (value > 0xFFFF))
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSshortShort(uint16_t value)
+{
+    if (value > 0x7FFF)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSshortLong(uint32_t value)
+{
+    if (value > 0x7FFF)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSshortSlong(int32_t value)
+{
+    if ((value < -0x8000) || (value > 0x7FFF))
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSshortLong8(uint64_t value)
+{
+    if (value > 0x7FFF)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSshortSlong8(int64_t value)
+{
+    if ((value < -0x8000) || (value > 0x7FFF))
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLongSbyte(int8_t value)
+{
+    if (value < 0)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLongSshort(int16_t value)
+{
+    if (value < 0)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLongSlong(int32_t value)
+{
+    if (value < 0)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLongLong8(uint64_t value)
+{
+    if (value > UINT32_MAX)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLongSlong8(int64_t value)
+{
+    if ((value < 0) || (value > (int64_t)UINT32_MAX))
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSlongLong(uint32_t value)
+{
+    if (value > 0x7FFFFFFFUL)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+/* Check that the 8-byte unsigned value can fit in a 4-byte unsigned range */
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSlongLong8(uint64_t value)
+{
+    if (value > 0x7FFFFFFF)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+/* Check that the 8-byte signed value can fit in a 4-byte signed range */
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSlongSlong8(int64_t value)
+{
+    if ((value < 0 - ((int64_t)0x7FFFFFFF + 1)) || (value > 0x7FFFFFFF))
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLong8Sbyte(int8_t value)
+{
+    if (value < 0)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLong8Sshort(int16_t value)
+{
+    if (value < 0)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLong8Slong(int32_t value)
+{
+    if (value < 0)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeLong8Slong8(int64_t value)
+{
+    if (value < 0)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr
+TIFFReadDirEntryCheckRangeSlong8Long8(uint64_t value)
+{
+    if (value > INT64_MAX)
+        return (TIFFReadDirEntryErrRange);
+    else
+        return (TIFFReadDirEntryErrOk);
+}
+
+static enum TIFFReadDirEntryErr TIFFReadDirEntryData(TIFF *tif, uint64_t offset,
+                                                     tmsize_t size, void *dest)
+{
+    assert(size > 0);
+    if (!isMapped(tif))
+    {
+        if (!SeekOK(tif, offset))
+            return (TIFFReadDirEntryErrIo);
+        if (!ReadOK(tif, dest, size))
+            return (TIFFReadDirEntryErrIo);
+    }
+    else
+    {
+        size_t ma, mb;
+        ma = (size_t)offset;
+        if ((uint64_t)ma != offset || ma > (~(size_t)0) - (size_t)size)
+        {
+            return TIFFReadDirEntryErrIo;
+        }
+        mb = ma + size;
+        if (mb > (uint64_t)tif->tif_size)
+            return (TIFFReadDirEntryErrIo);
+        _TIFFmemcpy(dest, tif->tif_base + ma, size);
+    }
+    return (TIFFReadDirEntryErrOk);
+}
+
+static void TIFFReadDirEntryOutputErr(TIFF *tif, enum TIFFReadDirEntryErr err,
+                                      const char *module, const char *tagname,
+                                      int recover)
+{
+    if (!recover)
+    {
+        switch (err)
+        {
+            case TIFFReadDirEntryErrCount:
+                TIFFErrorExtR(tif, module, "Incorrect count for \"%s\"",
+                              tagname);
+                break;
+            case TIFFReadDirEntryErrType:
+                TIFFErrorExtR(tif, module, "Incompatible type for \"%s\"",
+                              tagname);
+                break;
+            case TIFFReadDirEntryErrIo:
+                TIFFErrorExtR(tif, module, "IO error during reading of \"%s\"",
+                              tagname);
+                break;
+            case TIFFReadDirEntryErrRange:
+                TIFFErrorExtR(tif, module, "Incorrect value for \"%s\"",
+                              tagname);
+                break;
+            case TIFFReadDirEntryErrPsdif:
+                TIFFErrorExtR(
+                    tif, module,
+                    "Cannot handle different values per sample for \"%s\"",
+                    tagname);
+                break;
+            case TIFFReadDirEntryErrSizesan:
+                TIFFErrorExtR(tif, module,
+                              "Sanity check on size of \"%s\" value failed",
+                              tagname);
+                break;
+            case TIFFReadDirEntryErrAlloc:
+                TIFFErrorExtR(tif, module, "Out of memory reading of \"%s\"",
+                              tagname);
+                break;
+            default:
+                assert(0); /* we should never get here */
+                break;
+        }
+    }
+    else
+    {
+        switch (err)
+        {
+            case TIFFReadDirEntryErrCount:
+                TIFFWarningExtR(tif, module,
+                                "Incorrect count for \"%s\"; tag ignored",
+                                tagname);
+                break;
+            case TIFFReadDirEntryErrType:
+                TIFFWarningExtR(tif, module,
+                                "Incompatible type for \"%s\"; tag ignored",
+                                tagname);
+                break;
+            case TIFFReadDirEntryErrIo:
+                TIFFWarningExtR(
+                    tif, module,
+                    "IO error during reading of \"%s\"; tag ignored", tagname);
+                break;
+            case TIFFReadDirEntryErrRange:
+                TIFFWarningExtR(tif, module,
+                                "Incorrect value for \"%s\"; tag ignored",
+                                tagname);
+                break;
+            case TIFFReadDirEntryErrPsdif:
+                TIFFWarningExtR(tif, module,
+                                "Cannot handle different values per sample for "
+                                "\"%s\"; tag ignored",
+                                tagname);
+                break;
+            case TIFFReadDirEntryErrSizesan:
+                TIFFWarningExtR(
+                    tif, module,
+                    "Sanity check on size of \"%s\" value failed; tag ignored",
+                    tagname);
+                break;
+            case TIFFReadDirEntryErrAlloc:
+                TIFFWarningExtR(tif, module,
+                                "Out of memory reading of \"%s\"; tag ignored",
+                                tagname);
+                break;
+            default:
+                assert(0); /* we should never get here */
+                break;
+        }
+    }
+}
+
+/*
+ * Return the maximum number of color channels specified for a given photometric
+ * type. 0 is returned if photometric type isn't supported or no default value
+ * is defined by the specification.
+ */
+static int _TIFFGetMaxColorChannels(uint16_t photometric)
+{
+    switch (photometric)
+    {
+        case PHOTOMETRIC_PALETTE:
+        case PHOTOMETRIC_MINISWHITE:
+        case PHOTOMETRIC_MINISBLACK:
+            return 1;
+        case PHOTOMETRIC_YCBCR:
+        case PHOTOMETRIC_RGB:
+        case PHOTOMETRIC_CIELAB:
+        case PHOTOMETRIC_LOGLUV:
+        case PHOTOMETRIC_ITULAB:
+        case PHOTOMETRIC_ICCLAB:
+            return 3;
+        case PHOTOMETRIC_SEPARATED:
+        case PHOTOMETRIC_MASK:
+            return 4;
+        case PHOTOMETRIC_LOGL:
+        case PHOTOMETRIC_CFA:
+        default:
+            return 0;
+    }
+}
+
+static int ByteCountLooksBad(TIFF *tif)
+{
+    /*
+     * Assume we have wrong StripByteCount value (in case
+     * of single strip) in following cases:
+     *   - it is equal to zero along with StripOffset;
+     *   - it is larger than file itself (in case of uncompressed
+     *     image);
+     *   - it is smaller than the size of the bytes per row
+     *     multiplied on the number of rows.  The last case should
+     *     not be checked in the case of writing new image,
+     *     because we may do not know the exact strip size
+     *     until the whole image will be written and directory
+     *     dumped out.
+     */
+    uint64_t bytecount = TIFFGetStrileByteCount(tif, 0);
+    uint64_t offset = TIFFGetStrileOffset(tif, 0);
+    uint64_t filesize;
+
+    if (offset == 0)
+        return 0;
+    if (bytecount == 0)
+        return 1;
+    if (tif->tif_dir.td_compression != COMPRESSION_NONE)
+        return 0;
+    filesize = TIFFGetFileSize(tif);
+    if (offset <= filesize && bytecount > filesize - offset)
+        return 1;
+    if (tif->tif_mode == O_RDONLY)
+    {
+        uint64_t scanlinesize = TIFFScanlineSize64(tif);
+        if (tif->tif_dir.td_imagelength > 0 &&
+            scanlinesize > UINT64_MAX / tif->tif_dir.td_imagelength)
+        {
+            return 1;
+        }
+        if (bytecount < scanlinesize * tif->tif_dir.td_imagelength)
+            return 1;
+    }
+    return 0;
+}
+
+/*
+ * Read the next TIFF directory from a file and convert it to the internal
+ * format. We read directories sequentially.
+ */
+int TIFFReadDirectory(TIFF *tif)
+{
+    static const char module[] = "TIFFReadDirectory";
+    TIFFDirEntry *dir;
+    uint16_t dircount;
+    TIFFDirEntry *dp;
+    uint16_t di;
+    const TIFFField *fip;
+    uint32_t fii = FAILED_FII;
+    toff_t nextdiroff;
+    int bitspersample_read = FALSE;
+    int color_channels;
+
+    if (tif->tif_nextdiroff == 0)
+    {
+        /* In this special case, tif_diroff needs also to be set to 0.
+         * This is behind the last IFD, thus no checking or reading necessary.
+         */
+        tif->tif_diroff = tif->tif_nextdiroff;
+        return 0;
+    }
+
+    nextdiroff = tif->tif_nextdiroff;
+    /* tif_curdir++ and tif_nextdiroff should only be updated after SUCCESSFUL
+     * reading of the directory. Otherwise, invalid IFD offsets could corrupt
+     * the IFD list. */
+    if (!_TIFFCheckDirNumberAndOffset(tif,
+                                      tif->tif_curdir ==
+                                              TIFF_NON_EXISTENT_DIR_NUMBER
+                                          ? 0
+                                          : tif->tif_curdir + 1,
+                                      nextdiroff))
+    {
+        return 0; /* bad offset (IFD looping or more than TIFF_MAX_DIR_COUNT
+                     IFDs) */
+    }
+    dircount = TIFFFetchDirectory(tif, nextdiroff, &dir, &tif->tif_nextdiroff);
+    if (!dircount)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Failed to read directory at offset %" PRIu64,
+                      nextdiroff);
+        return 0;
+    }
+    /* Set global values after a valid directory has been fetched.
+     * tif_diroff is already set to nextdiroff in TIFFFetchDirectory() in the
+     * beginning. */
+    if (tif->tif_curdir == TIFF_NON_EXISTENT_DIR_NUMBER)
+        tif->tif_curdir = 0;
+    else
+        tif->tif_curdir++;
+    (*tif->tif_cleanup)(tif); /* cleanup any previous compression state */
+
+    TIFFReadDirectoryCheckOrder(tif, dir, dircount);
+
+    /*
+     * Mark duplicates of any tag to be ignored (bugzilla 1994)
+     * to avoid certain pathological problems.
+     */
+    {
+        TIFFDirEntry *ma;
+        uint16_t mb;
+        for (ma = dir, mb = 0; mb < dircount; ma++, mb++)
+        {
+            TIFFDirEntry *na;
+            uint16_t nb;
+            for (na = ma + 1, nb = mb + 1; nb < dircount; na++, nb++)
+            {
+                if (ma->tdir_tag == na->tdir_tag)
+                {
+                    na->tdir_ignore = TRUE;
+                }
+            }
+        }
+    }
+
+    tif->tif_flags &= ~TIFF_BEENWRITING; /* reset before new dir */
+    tif->tif_flags &= ~TIFF_BUF4WRITE;   /* reset before new dir */
+    tif->tif_flags &= ~TIFF_CHOPPEDUPARRAYS;
+
+    /* free any old stuff and reinit */
+    TIFFFreeDirectory(tif);
+    TIFFDefaultDirectory(tif);
+    /*
+     * Electronic Arts writes gray-scale TIFF files
+     * without a PlanarConfiguration directory entry.
+     * Thus we setup a default value here, even though
+     * the TIFF spec says there is no default value.
+     * After PlanarConfiguration is preset in TIFFDefaultDirectory()
+     * the following setting is not needed, but does not harm either.
+     */
+    TIFFSetField(tif, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG);
+    /*
+     * Setup default value and then make a pass over
+     * the fields to check type and tag information,
+     * and to extract info required to size data
+     * structures.  A second pass is made afterwards
+     * to read in everything not taken in the first pass.
+     * But we must process the Compression tag first
+     * in order to merge in codec-private tag definitions (otherwise
+     * we may get complaints about unknown tags).  However, the
+     * Compression tag may be dependent on the SamplesPerPixel
+     * tag value because older TIFF specs permitted Compression
+     * to be written as a SamplesPerPixel-count tag entry.
+     * Thus if we don't first figure out the correct SamplesPerPixel
+     * tag value then we may end up ignoring the Compression tag
+     * value because it has an incorrect count value (if the
+     * true value of SamplesPerPixel is not 1).
+     */
+    dp =
+        TIFFReadDirectoryFindEntry(tif, dir, dircount, TIFFTAG_SAMPLESPERPIXEL);
+    if (dp)
+    {
+        if (!TIFFFetchNormalTag(tif, dp, 0))
+            goto bad;
+        dp->tdir_ignore = TRUE;
+    }
+    dp = TIFFReadDirectoryFindEntry(tif, dir, dircount, TIFFTAG_COMPRESSION);
+    if (dp)
+    {
+        /*
+         * The 5.0 spec says the Compression tag has one value, while
+         * earlier specs say it has one value per sample.  Because of
+         * this, we accept the tag if one value is supplied with either
+         * count.
+         */
+        uint16_t value;
+        enum TIFFReadDirEntryErr err;
+        err = TIFFReadDirEntryShort(tif, dp, &value);
+        if (err == TIFFReadDirEntryErrCount)
+            err = TIFFReadDirEntryPersampleShort(tif, dp, &value);
+        if (err != TIFFReadDirEntryErrOk)
+        {
+            TIFFReadDirEntryOutputErr(tif, err, module, "Compression", 0);
+            goto bad;
+        }
+        if (!TIFFSetField(tif, TIFFTAG_COMPRESSION, value))
+            goto bad;
+        dp->tdir_ignore = TRUE;
+    }
+    else
+    {
+        if (!TIFFSetField(tif, TIFFTAG_COMPRESSION, COMPRESSION_NONE))
+            goto bad;
+    }
+    /*
+     * First real pass over the directory.
+     */
+    for (di = 0, dp = dir; di < dircount; di++, dp++)
+    {
+        if (!dp->tdir_ignore)
+        {
+            TIFFReadDirectoryFindFieldInfo(tif, dp->tdir_tag, &fii);
+            if (fii == FAILED_FII)
+            {
+                TIFFWarningExtR(tif, module,
+                                "Unknown field with tag %" PRIu16 " (0x%" PRIx16
+                                ") encountered",
+                                dp->tdir_tag, dp->tdir_tag);
+                /* the following knowingly leaks the
+                   anonymous field structure */
+                if (!_TIFFMergeFields(
+                        tif,
+                        _TIFFCreateAnonField(tif, dp->tdir_tag,
+                                             (TIFFDataType)dp->tdir_type),
+                        1))
+                {
+                    TIFFWarningExtR(
+                        tif, module,
+                        "Registering anonymous field with tag %" PRIu16
+                        " (0x%" PRIx16 ") failed",
+                        dp->tdir_tag, dp->tdir_tag);
+                    dp->tdir_ignore = TRUE;
+                }
+                else
+                {
+                    TIFFReadDirectoryFindFieldInfo(tif, dp->tdir_tag, &fii);
+                    assert(fii != FAILED_FII);
+                }
+            }
+        }
+        if (!dp->tdir_ignore)
+        {
+            fip = tif->tif_fields[fii];
+            if (fip->field_bit == FIELD_IGNORE)
+                dp->tdir_ignore = TRUE;
+            else
+            {
+                switch (dp->tdir_tag)
+                {
+                    case TIFFTAG_STRIPOFFSETS:
+                    case TIFFTAG_STRIPBYTECOUNTS:
+                    case TIFFTAG_TILEOFFSETS:
+                    case TIFFTAG_TILEBYTECOUNTS:
+                        TIFFSetFieldBit(tif, fip->field_bit);
+                        break;
+                    case TIFFTAG_IMAGEWIDTH:
+                    case TIFFTAG_IMAGELENGTH:
+                    case TIFFTAG_IMAGEDEPTH:
+                    case TIFFTAG_TILELENGTH:
+                    case TIFFTAG_TILEWIDTH:
+                    case TIFFTAG_TILEDEPTH:
+                    case TIFFTAG_PLANARCONFIG:
+                    case TIFFTAG_ROWSPERSTRIP:
+                    case TIFFTAG_EXTRASAMPLES:
+                        if (!TIFFFetchNormalTag(tif, dp, 0))
+                            goto bad;
+                        dp->tdir_ignore = TRUE;
+                        break;
+                    default:
+                        if (!_TIFFCheckFieldIsValidForCodec(tif, dp->tdir_tag))
+                            dp->tdir_ignore = TRUE;
+                        break;
+                }
+            }
+        }
+    }
+    /*
+     * XXX: OJPEG hack.
+     * If a) compression is OJPEG, b) planarconfig tag says it's separate,
+     * c) strip offsets/bytecounts tag are both present and
+     * d) both contain exactly one value, then we consistently find
+     * that the buggy implementation of the buggy compression scheme
+     * matches contig planarconfig best. So we 'fix-up' the tag here
+     */
+    if ((tif->tif_dir.td_compression == COMPRESSION_OJPEG) &&
+        (tif->tif_dir.td_planarconfig == PLANARCONFIG_SEPARATE))
+    {
+        if (!_TIFFFillStriles(tif))
+            goto bad;
+        dp = TIFFReadDirectoryFindEntry(tif, dir, dircount,
+                                        TIFFTAG_STRIPOFFSETS);
+        if ((dp != 0) && (dp->tdir_count == 1))
+        {
+            dp = TIFFReadDirectoryFindEntry(tif, dir, dircount,
+                                            TIFFTAG_STRIPBYTECOUNTS);
+            if ((dp != 0) && (dp->tdir_count == 1))
+            {
+                tif->tif_dir.td_planarconfig = PLANARCONFIG_CONTIG;
+                TIFFWarningExtR(tif, module,
+                                "Planarconfig tag value assumed incorrect, "
+                                "assuming data is contig instead of chunky");
+            }
+        }
+    }
+    /*
+     * Allocate directory structure and setup defaults.
+     */
+    if (!TIFFFieldSet(tif, FIELD_IMAGEDIMENSIONS))
+    {
+        MissingRequired(tif, "ImageLength");
+        goto bad;
+    }
+
+    /*
+     * Second pass: extract other information.
+     */
+    for (di = 0, dp = dir; di < dircount; di++, dp++)
+    {
+        if (!dp->tdir_ignore)
+        {
+            switch (dp->tdir_tag)
+            {
+                case TIFFTAG_MINSAMPLEVALUE:
+                case TIFFTAG_MAXSAMPLEVALUE:
+                case TIFFTAG_BITSPERSAMPLE:
+                case TIFFTAG_DATATYPE:
+                case TIFFTAG_SAMPLEFORMAT:
+                    /*
+                     * The MinSampleValue, MaxSampleValue, BitsPerSample
+                     * DataType and SampleFormat tags are supposed to be
+                     * written as one value/sample, but some vendors
+                     * incorrectly write one value only -- so we accept
+                     * that as well (yuck). Other vendors write correct
+                     * value for NumberOfSamples, but incorrect one for
+                     * BitsPerSample and friends, and we will read this
+                     * too.
+                     */
+                    {
+                        uint16_t value;
+                        enum TIFFReadDirEntryErr err;
+                        err = TIFFReadDirEntryShort(tif, dp, &value);
+                        if (err == TIFFReadDirEntryErrCount)
+                            err =
+                                TIFFReadDirEntryPersampleShort(tif, dp, &value);
+                        if (err != TIFFReadDirEntryErrOk)
+                        {
+                            fip = TIFFFieldWithTag(tif, dp->tdir_tag);
+                            TIFFReadDirEntryOutputErr(
+                                tif, err, module,
+                                fip ? fip->field_name : "unknown tagname", 0);
+                            goto bad;
+                        }
+                        if (!TIFFSetField(tif, dp->tdir_tag, value))
+                            goto bad;
+                        if (dp->tdir_tag == TIFFTAG_BITSPERSAMPLE)
+                            bitspersample_read = TRUE;
+                    }
+                    break;
+                case TIFFTAG_SMINSAMPLEVALUE:
+                case TIFFTAG_SMAXSAMPLEVALUE:
+                {
+
+                    double *data = NULL;
+                    enum TIFFReadDirEntryErr err;
+                    uint32_t saved_flags;
+                    int m;
+                    if (dp->tdir_count !=
+                        (uint64_t)tif->tif_dir.td_samplesperpixel)
+                        err = TIFFReadDirEntryErrCount;
+                    else
+                        err = TIFFReadDirEntryDoubleArray(tif, dp, &data);
+                    if (err != TIFFReadDirEntryErrOk)
+                    {
+                        fip = TIFFFieldWithTag(tif, dp->tdir_tag);
+                        TIFFReadDirEntryOutputErr(
+                            tif, err, module,
+                            fip ? fip->field_name : "unknown tagname", 0);
+                        goto bad;
+                    }
+                    saved_flags = tif->tif_flags;
+                    tif->tif_flags |= TIFF_PERSAMPLE;
+                    m = TIFFSetField(tif, dp->tdir_tag, data);
+                    tif->tif_flags = saved_flags;
+                    _TIFFfreeExt(tif, data);
+                    if (!m)
+                        goto bad;
+                }
+                break;
+                case TIFFTAG_STRIPOFFSETS:
+                case TIFFTAG_TILEOFFSETS:
+                    switch (dp->tdir_type)
+                    {
+                        case TIFF_SHORT:
+                        case TIFF_LONG:
+                        case TIFF_LONG8:
+                            break;
+                        default:
+                            /* Warn except if directory typically created with
+                             * TIFFDeferStrileArrayWriting() */
+                            if (!(tif->tif_mode == O_RDWR &&
+                                  dp->tdir_count == 0 && dp->tdir_type == 0 &&
+                                  dp->tdir_offset.toff_long8 == 0))
+                            {
+                                fip = TIFFFieldWithTag(tif, dp->tdir_tag);
+                                TIFFWarningExtR(
+                                    tif, module, "Invalid data type for tag %s",
+                                    fip ? fip->field_name : "unknown tagname");
+                            }
+                            break;
+                    }
+                    _TIFFmemcpy(&(tif->tif_dir.td_stripoffset_entry), dp,
+                                sizeof(TIFFDirEntry));
+                    break;
+                case TIFFTAG_STRIPBYTECOUNTS:
+                case TIFFTAG_TILEBYTECOUNTS:
+                    switch (dp->tdir_type)
+                    {
+                        case TIFF_SHORT:
+                        case TIFF_LONG:
+                        case TIFF_LONG8:
+                            break;
+                        default:
+                            /* Warn except if directory typically created with
+                             * TIFFDeferStrileArrayWriting() */
+                            if (!(tif->tif_mode == O_RDWR &&
+                                  dp->tdir_count == 0 && dp->tdir_type == 0 &&
+                                  dp->tdir_offset.toff_long8 == 0))
+                            {
+                                fip = TIFFFieldWithTag(tif, dp->tdir_tag);
+                                TIFFWarningExtR(
+                                    tif, module, "Invalid data type for tag %s",
+                                    fip ? fip->field_name : "unknown tagname");
+                            }
+                            break;
+                    }
+                    _TIFFmemcpy(&(tif->tif_dir.td_stripbytecount_entry), dp,
+                                sizeof(TIFFDirEntry));
+                    break;
+                case TIFFTAG_COLORMAP:
+                case TIFFTAG_TRANSFERFUNCTION:
+                {
+                    enum TIFFReadDirEntryErr err;
+                    uint32_t countpersample;
+                    uint32_t countrequired;
+                    uint32_t incrementpersample;
+                    uint16_t *value = NULL;
+                    /* It would be dangerous to instantiate those tag values */
+                    /* since if td_bitspersample has not yet been read (due to
+                     */
+                    /* unordered tags), it could be read afterwards with a */
+                    /* values greater than the default one (1), which may cause
+                     */
+                    /* crashes in user code */
+                    if (!bitspersample_read)
+                    {
+                        fip = TIFFFieldWithTag(tif, dp->tdir_tag);
+                        TIFFWarningExtR(
+                            tif, module,
+                            "Ignoring %s since BitsPerSample tag not found",
+                            fip ? fip->field_name : "unknown tagname");
+                        continue;
+                    }
+                    /* ColorMap or TransferFunction for high bit */
+                    /* depths do not make much sense and could be */
+                    /* used as a denial of service vector */
+                    if (tif->tif_dir.td_bitspersample > 24)
+                    {
+                        fip = TIFFFieldWithTag(tif, dp->tdir_tag);
+                        TIFFWarningExtR(
+                            tif, module,
+                            "Ignoring %s because BitsPerSample=%" PRIu16 ">24",
+                            fip ? fip->field_name : "unknown tagname",
+                            tif->tif_dir.td_bitspersample);
+                        continue;
+                    }
+                    countpersample = (1U << tif->tif_dir.td_bitspersample);
+                    if ((dp->tdir_tag == TIFFTAG_TRANSFERFUNCTION) &&
+                        (dp->tdir_count == (uint64_t)countpersample))
+                    {
+                        countrequired = countpersample;
+                        incrementpersample = 0;
+                    }
+                    else
+                    {
+                        countrequired = 3 * countpersample;
+                        incrementpersample = countpersample;
+                    }
+                    if (dp->tdir_count != (uint64_t)countrequired)
+                        err = TIFFReadDirEntryErrCount;
+                    else
+                        err = TIFFReadDirEntryShortArray(tif, dp, &value);
+                    if (err != TIFFReadDirEntryErrOk)
+                    {
+                        fip = TIFFFieldWithTag(tif, dp->tdir_tag);
+                        TIFFReadDirEntryOutputErr(
+                            tif, err, module,
+                            fip ? fip->field_name : "unknown tagname", 1);
+                    }
+                    else
+                    {
+                        TIFFSetField(tif, dp->tdir_tag, value,
+                                     value + incrementpersample,
+                                     value + 2 * incrementpersample);
+                        _TIFFfreeExt(tif, value);
+                    }
+                }
+                break;
+                    /* BEGIN REV 4.0 COMPATIBILITY */
+                case TIFFTAG_OSUBFILETYPE:
+                {
+                    uint16_t valueo;
+                    uint32_t value;
+                    if (TIFFReadDirEntryShort(tif, dp, &valueo) ==
+                        TIFFReadDirEntryErrOk)
+                    {
+                        switch (valueo)
+                        {
+                            case OFILETYPE_REDUCEDIMAGE:
+                                value = FILETYPE_REDUCEDIMAGE;
+                                break;
+                            case OFILETYPE_PAGE:
+                                value = FILETYPE_PAGE;
+                                break;
+                            default:
+                                value = 0;
+                                break;
+                        }
+                        if (value != 0)
+                            TIFFSetField(tif, TIFFTAG_SUBFILETYPE, value);
+                    }
+                }
+                break;
+                /* END REV 4.0 COMPATIBILITY */
+#if 0
+                case TIFFTAG_EP_BATTERYLEVEL:
+                    /* TIFFTAG_EP_BATTERYLEVEL can be RATIONAL or ASCII.
+                     * LibTiff defines it as ASCII and converts RATIONAL to an
+                     * ASCII string. */
+                    switch (dp->tdir_type)
+                    {
+                        case TIFF_RATIONAL:
+                        {
+                            /* Read rational and convert to ASCII*/
+                            enum TIFFReadDirEntryErr err;
+                            TIFFRational_t rValue;
+                            err = TIFFReadDirEntryCheckedRationalDirect(
+                                tif, dp, &rValue);
+                            if (err != TIFFReadDirEntryErrOk)
+                            {
+                                fip = TIFFFieldWithTag(tif, dp->tdir_tag);
+                                TIFFReadDirEntryOutputErr(
+                                    tif, err, module,
+                                    fip ? fip->field_name : "unknown tagname",
+                                    1);
+                            }
+                            else
+                            {
+                                char szAux[32];
+                                snprintf(szAux, sizeof(szAux) - 1, "%d/%d",
+                                         rValue.uNum, rValue.uDenom);
+                                TIFFSetField(tif, dp->tdir_tag, szAux);
+                            }
+                        }
+                        break;
+                        case TIFF_ASCII:
+                            (void)TIFFFetchNormalTag(tif, dp, TRUE);
+                            break;
+                        default:
+                            fip = TIFFFieldWithTag(tif, dp->tdir_tag);
+                            TIFFWarningExtR(tif, module,
+                                            "Invalid data type for tag %s. "
+                                            "ASCII or RATIONAL expected",
+                                            fip ? fip->field_name
+                                                : "unknown tagname");
+                            break;
+                    }
+                    break;
+#endif
+                default:
+                    (void)TIFFFetchNormalTag(tif, dp, TRUE);
+                    break;
+            }
+        } /* -- if (!dp->tdir_ignore) */
+    }     /* -- for-loop -- */
+
+    /*
+     * OJPEG hack:
+     * - If a) compression is OJPEG, and b) photometric tag is missing,
+     * then we consistently find that photometric should be YCbCr
+     * - If a) compression is OJPEG, and b) photometric tag says it's RGB,
+     * then we consistently find that the buggy implementation of the
+     * buggy compression scheme matches photometric YCbCr instead.
+     * - If a) compression is OJPEG, and b) bitspersample tag is missing,
+     * then we consistently find bitspersample should be 8.
+     * - If a) compression is OJPEG, b) samplesperpixel tag is missing,
+     * and c) photometric is RGB or YCbCr, then we consistently find
+     * samplesperpixel should be 3
+     * - If a) compression is OJPEG, b) samplesperpixel tag is missing,
+     * and c) photometric is MINISWHITE or MINISBLACK, then we consistently
+     * find samplesperpixel should be 3
+     */
+    if (tif->tif_dir.td_compression == COMPRESSION_OJPEG)
+    {
+        if (!TIFFFieldSet(tif, FIELD_PHOTOMETRIC))
+        {
+            TIFFWarningExtR(
+                tif, module,
+                "Photometric tag is missing, assuming data is YCbCr");
+            if (!TIFFSetField(tif, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_YCBCR))
+                goto bad;
+        }
+        else if (tif->tif_dir.td_photometric == PHOTOMETRIC_RGB)
+        {
+            tif->tif_dir.td_photometric = PHOTOMETRIC_YCBCR;
+            TIFFWarningExtR(tif, module,
+                            "Photometric tag value assumed incorrect, "
+                            "assuming data is YCbCr instead of RGB");
+        }
+        if (!TIFFFieldSet(tif, FIELD_BITSPERSAMPLE))
+        {
+            TIFFWarningExtR(
+                tif, module,
+                "BitsPerSample tag is missing, assuming 8 bits per sample");
+            if (!TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, 8))
+                goto bad;
+        }
+        if (!TIFFFieldSet(tif, FIELD_SAMPLESPERPIXEL))
+        {
+            if (tif->tif_dir.td_photometric == PHOTOMETRIC_RGB)
+            {
+                TIFFWarningExtR(tif, module,
+                                "SamplesPerPixel tag is missing, "
+                                "assuming correct SamplesPerPixel value is 3");
+                if (!TIFFSetField(tif, TIFFTAG_SAMPLESPERPIXEL, 3))
+                    goto bad;
+            }
+            if (tif->tif_dir.td_photometric == PHOTOMETRIC_YCBCR)
+            {
+                TIFFWarningExtR(tif, module,
+                                "SamplesPerPixel tag is missing, "
+                                "applying correct SamplesPerPixel value of 3");
+                if (!TIFFSetField(tif, TIFFTAG_SAMPLESPERPIXEL, 3))
+                    goto bad;
+            }
+            else if ((tif->tif_dir.td_photometric == PHOTOMETRIC_MINISWHITE) ||
+                     (tif->tif_dir.td_photometric == PHOTOMETRIC_MINISBLACK))
+            {
+                /*
+                 * SamplesPerPixel tag is missing, but is not required
+                 * by spec.  Assume correct SamplesPerPixel value of 1.
+                 */
+                if (!TIFFSetField(tif, TIFFTAG_SAMPLESPERPIXEL, 1))
+                    goto bad;
+            }
+        }
+    }
+
+    /*
+     * Setup appropriate structures (by strip or by tile)
+     * We do that only after the above OJPEG hack which alters SamplesPerPixel
+     * and thus influences the number of strips in the separate planarconfig.
+     */
+    if (!TIFFFieldSet(tif, FIELD_TILEDIMENSIONS))
+    {
+        tif->tif_dir.td_nstrips = TIFFNumberOfStrips(tif);
+        tif->tif_dir.td_tilewidth = tif->tif_dir.td_imagewidth;
+        tif->tif_dir.td_tilelength = tif->tif_dir.td_rowsperstrip;
+        tif->tif_dir.td_tiledepth = tif->tif_dir.td_imagedepth;
+        tif->tif_flags &= ~TIFF_ISTILED;
+    }
+    else
+    {
+        tif->tif_dir.td_nstrips = TIFFNumberOfTiles(tif);
+        tif->tif_flags |= TIFF_ISTILED;
+    }
+    if (!tif->tif_dir.td_nstrips)
+    {
+        TIFFErrorExtR(tif, module, "Cannot handle zero number of %s",
+                      isTiled(tif) ? "tiles" : "strips");
+        goto bad;
+    }
+    tif->tif_dir.td_stripsperimage = tif->tif_dir.td_nstrips;
+    if (tif->tif_dir.td_planarconfig == PLANARCONFIG_SEPARATE)
+        tif->tif_dir.td_stripsperimage /= tif->tif_dir.td_samplesperpixel;
+    if (!TIFFFieldSet(tif, FIELD_STRIPOFFSETS))
+    {
+#ifdef OJPEG_SUPPORT
+        if ((tif->tif_dir.td_compression == COMPRESSION_OJPEG) &&
+            (isTiled(tif) == 0) && (tif->tif_dir.td_nstrips == 1))
+        {
+            /*
+             * XXX: OJPEG hack.
+             * If a) compression is OJPEG, b) it's not a tiled TIFF,
+             * and c) the number of strips is 1,
+             * then we tolerate the absence of stripoffsets tag,
+             * because, presumably, all required data is in the
+             * JpegInterchangeFormat stream.
+             */
+            TIFFSetFieldBit(tif, FIELD_STRIPOFFSETS);
+        }
+        else
+#endif
+        {
+            MissingRequired(tif, isTiled(tif) ? "TileOffsets" : "StripOffsets");
+            goto bad;
+        }
+    }
+
+    if (tif->tif_mode == O_RDWR &&
+        tif->tif_dir.td_stripoffset_entry.tdir_tag != 0 &&
+        tif->tif_dir.td_stripoffset_entry.tdir_count == 0 &&
+        tif->tif_dir.td_stripoffset_entry.tdir_type == 0 &&
+        tif->tif_dir.td_stripoffset_entry.tdir_offset.toff_long8 == 0 &&
+        tif->tif_dir.td_stripbytecount_entry.tdir_tag != 0 &&
+        tif->tif_dir.td_stripbytecount_entry.tdir_count == 0 &&
+        tif->tif_dir.td_stripbytecount_entry.tdir_type == 0 &&
+        tif->tif_dir.td_stripbytecount_entry.tdir_offset.toff_long8 == 0)
+    {
+        /* Directory typically created with TIFFDeferStrileArrayWriting() */
+        TIFFSetupStrips(tif);
+    }
+    else if (!(tif->tif_flags & TIFF_DEFERSTRILELOAD))
+    {
+        if (tif->tif_dir.td_stripoffset_entry.tdir_tag != 0)
+        {
+            if (!TIFFFetchStripThing(tif, &(tif->tif_dir.td_stripoffset_entry),
+                                     tif->tif_dir.td_nstrips,
+                                     &tif->tif_dir.td_stripoffset_p))
+            {
+                goto bad;
+            }
+        }
+        if (tif->tif_dir.td_stripbytecount_entry.tdir_tag != 0)
+        {
+            if (!TIFFFetchStripThing(
+                    tif, &(tif->tif_dir.td_stripbytecount_entry),
+                    tif->tif_dir.td_nstrips, &tif->tif_dir.td_stripbytecount_p))
+            {
+                goto bad;
+            }
+        }
+    }
+
+    /*
+     * Make sure all non-color channels are extrasamples.
+     * If it's not the case, define them as such.
+     */
+    color_channels = _TIFFGetMaxColorChannels(tif->tif_dir.td_photometric);
+    if (color_channels &&
+        tif->tif_dir.td_samplesperpixel - tif->tif_dir.td_extrasamples >
+            color_channels)
+    {
+        uint16_t old_extrasamples;
+        uint16_t *new_sampleinfo;
+
+        TIFFWarningExtR(
+            tif, module,
+            "Sum of Photometric type-related "
+            "color channels and ExtraSamples doesn't match SamplesPerPixel. "
+            "Defining non-color channels as ExtraSamples.");
+
+        old_extrasamples = tif->tif_dir.td_extrasamples;
+        tif->tif_dir.td_extrasamples =
+            (uint16_t)(tif->tif_dir.td_samplesperpixel - color_channels);
+
+        // sampleinfo should contain information relative to these new extra
+        // samples
+        new_sampleinfo = (uint16_t *)_TIFFcallocExt(
+            tif, tif->tif_dir.td_extrasamples, sizeof(uint16_t));
+        if (!new_sampleinfo)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Failed to allocate memory for "
+                          "temporary new sampleinfo array "
+                          "(%" PRIu16 " 16 bit elements)",
+                          tif->tif_dir.td_extrasamples);
+            goto bad;
+        }
+
+        if (old_extrasamples > 0)
+            memcpy(new_sampleinfo, tif->tif_dir.td_sampleinfo,
+                   old_extrasamples * sizeof(uint16_t));
+        _TIFFsetShortArrayExt(tif, &tif->tif_dir.td_sampleinfo, new_sampleinfo,
+                              tif->tif_dir.td_extrasamples);
+        _TIFFfreeExt(tif, new_sampleinfo);
+    }
+
+    /*
+     * Verify Palette image has a Colormap.
+     */
+    if (tif->tif_dir.td_photometric == PHOTOMETRIC_PALETTE &&
+        !TIFFFieldSet(tif, FIELD_COLORMAP))
+    {
+        if (tif->tif_dir.td_bitspersample >= 8 &&
+            tif->tif_dir.td_samplesperpixel == 3)
+            tif->tif_dir.td_photometric = PHOTOMETRIC_RGB;
+        else if (tif->tif_dir.td_bitspersample >= 8)
+            tif->tif_dir.td_photometric = PHOTOMETRIC_MINISBLACK;
+        else
+        {
+            MissingRequired(tif, "Colormap");
+            goto bad;
+        }
+    }
+    /*
+     * OJPEG hack:
+     * We do no further messing with strip/tile offsets/bytecounts in OJPEG
+     * TIFFs
+     */
+    if (tif->tif_dir.td_compression != COMPRESSION_OJPEG)
+    {
+        /*
+         * Attempt to deal with a missing StripByteCounts tag.
+         */
+        if (!TIFFFieldSet(tif, FIELD_STRIPBYTECOUNTS))
+        {
+            /*
+             * Some manufacturers violate the spec by not giving
+             * the size of the strips.  In this case, assume there
+             * is one uncompressed strip of data.
+             */
+            if ((tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG &&
+                 tif->tif_dir.td_nstrips > 1) ||
+                (tif->tif_dir.td_planarconfig == PLANARCONFIG_SEPARATE &&
+                 tif->tif_dir.td_nstrips !=
+                     (uint32_t)tif->tif_dir.td_samplesperpixel))
+            {
+                MissingRequired(tif, "StripByteCounts");
+                goto bad;
+            }
+            TIFFWarningExtR(
+                tif, module,
+                "TIFF directory is missing required "
+                "\"StripByteCounts\" field, calculating from imagelength");
+            if (EstimateStripByteCounts(tif, dir, dircount) < 0)
+                goto bad;
+        }
+        else if (tif->tif_dir.td_nstrips == 1 &&
+                 !(tif->tif_flags & TIFF_ISTILED) && ByteCountLooksBad(tif))
+        {
+            /*
+             * XXX: Plexus (and others) sometimes give a value of
+             * zero for a tag when they don't know what the
+             * correct value is!  Try and handle the simple case
+             * of estimating the size of a one strip image.
+             */
+            TIFFWarningExtR(tif, module,
+                            "Bogus \"StripByteCounts\" field, ignoring and "
+                            "calculating from imagelength");
+            if (EstimateStripByteCounts(tif, dir, dircount) < 0)
+                goto bad;
+        }
+        else if (!(tif->tif_flags & TIFF_DEFERSTRILELOAD) &&
+                 tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG &&
+                 tif->tif_dir.td_nstrips > 2 &&
+                 tif->tif_dir.td_compression == COMPRESSION_NONE &&
+                 TIFFGetStrileByteCount(tif, 0) !=
+                     TIFFGetStrileByteCount(tif, 1) &&
+                 TIFFGetStrileByteCount(tif, 0) != 0 &&
+                 TIFFGetStrileByteCount(tif, 1) != 0)
+        {
+            /*
+             * XXX: Some vendors fill StripByteCount array with
+             * absolutely wrong values (it can be equal to
+             * StripOffset array, for example). Catch this case
+             * here.
+             *
+             * We avoid this check if deferring strile loading
+             * as it would always force us to load the strip/tile
+             * information.
+             */
+            TIFFWarningExtR(tif, module,
+                            "Wrong \"StripByteCounts\" field, ignoring and "
+                            "calculating from imagelength");
+            if (EstimateStripByteCounts(tif, dir, dircount) < 0)
+                goto bad;
+        }
+    }
+    if (dir)
+    {
+        _TIFFfreeExt(tif, dir);
+        dir = NULL;
+    }
+    if (!TIFFFieldSet(tif, FIELD_MAXSAMPLEVALUE))
+    {
+        if (tif->tif_dir.td_bitspersample >= 16)
+            tif->tif_dir.td_maxsamplevalue = 0xFFFF;
+        else
+            tif->tif_dir.td_maxsamplevalue =
+                (uint16_t)((1L << tif->tif_dir.td_bitspersample) - 1);
+    }
+
+#ifdef STRIPBYTECOUNTSORTED_UNUSED
+    /*
+     * XXX: We can optimize checking for the strip bounds using the sorted
+     * bytecounts array. See also comments for TIFFAppendToStrip()
+     * function in tif_write.c.
+     */
+    if (!(tif->tif_flags & TIFF_DEFERSTRILELOAD) && tif->tif_dir.td_nstrips > 1)
+    {
+        uint32_t strip;
+
+        tif->tif_dir.td_stripbytecountsorted = 1;
+        for (strip = 1; strip < tif->tif_dir.td_nstrips; strip++)
+        {
+            if (TIFFGetStrileOffset(tif, strip - 1) >
+                TIFFGetStrileOffset(tif, strip))
+            {
+                tif->tif_dir.td_stripbytecountsorted = 0;
+                break;
+            }
+        }
+    }
+#endif
+
+    /*
+     * An opportunity for compression mode dependent tag fixup
+     */
+    (*tif->tif_fixuptags)(tif);
+
+    /*
+     * Some manufacturers make life difficult by writing
+     * large amounts of uncompressed data as a single strip.
+     * This is contrary to the recommendations of the spec.
+     * The following makes an attempt at breaking such images
+     * into strips closer to the recommended 8k bytes.  A
+     * side effect, however, is that the RowsPerStrip tag
+     * value may be changed.
+     */
+    if ((tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG) &&
+        (tif->tif_dir.td_nstrips == 1) &&
+        (tif->tif_dir.td_compression == COMPRESSION_NONE) &&
+        ((tif->tif_flags & (TIFF_STRIPCHOP | TIFF_ISTILED)) == TIFF_STRIPCHOP))
+    {
+        ChopUpSingleUncompressedStrip(tif);
+    }
+
+    /* There are also uncompressed striped files with strips larger than */
+    /* 2 GB, which make them unfriendly with a lot of code. If possible, */
+    /* try to expose smaller "virtual" strips. */
+    if (tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG &&
+        tif->tif_dir.td_compression == COMPRESSION_NONE &&
+        (tif->tif_flags & (TIFF_STRIPCHOP | TIFF_ISTILED)) == TIFF_STRIPCHOP &&
+        TIFFStripSize64(tif) > 0x7FFFFFFFUL)
+    {
+        TryChopUpUncompressedBigTiff(tif);
+    }
+
+    /*
+     * Clear the dirty directory flag.
+     */
+    tif->tif_flags &= ~TIFF_DIRTYDIRECT;
+    tif->tif_flags &= ~TIFF_DIRTYSTRIP;
+
+    /*
+     * Reinitialize i/o since we are starting on a new directory.
+     */
+    tif->tif_row = (uint32_t)-1;
+    tif->tif_curstrip = (uint32_t)-1;
+    tif->tif_col = (uint32_t)-1;
+    tif->tif_curtile = (uint32_t)-1;
+    tif->tif_tilesize = (tmsize_t)-1;
+
+    tif->tif_scanlinesize = TIFFScanlineSize(tif);
+    if (!tif->tif_scanlinesize)
+    {
+        TIFFErrorExtR(tif, module, "Cannot handle zero scanline size");
+        return (0);
+    }
+
+    if (isTiled(tif))
+    {
+        tif->tif_tilesize = TIFFTileSize(tif);
+        if (!tif->tif_tilesize)
+        {
+            TIFFErrorExtR(tif, module, "Cannot handle zero tile size");
+            return (0);
+        }
+    }
+    else
+    {
+        if (!TIFFStripSize(tif))
+        {
+            TIFFErrorExtR(tif, module, "Cannot handle zero strip size");
+            return (0);
+        }
+    }
+    return (1);
+bad:
+    if (dir)
+        _TIFFfreeExt(tif, dir);
+    return (0);
+}
+
+static void TIFFReadDirectoryCheckOrder(TIFF *tif, TIFFDirEntry *dir,
+                                        uint16_t dircount)
+{
+    static const char module[] = "TIFFReadDirectoryCheckOrder";
+    uint32_t m;
+    uint16_t n;
+    TIFFDirEntry *o;
+    m = 0;
+    for (n = 0, o = dir; n < dircount; n++, o++)
+    {
+        if (o->tdir_tag < m)
+        {
+            TIFFWarningExtR(tif, module,
+                            "Invalid TIFF directory; tags are not sorted in "
+                            "ascending order");
+            break;
+        }
+        m = o->tdir_tag + 1;
+    }
+}
+
+static TIFFDirEntry *TIFFReadDirectoryFindEntry(TIFF *tif, TIFFDirEntry *dir,
+                                                uint16_t dircount,
+                                                uint16_t tagid)
+{
+    TIFFDirEntry *m;
+    uint16_t n;
+    (void)tif;
+    for (m = dir, n = 0; n < dircount; m++, n++)
+    {
+        if (m->tdir_tag == tagid)
+            return (m);
+    }
+    return (0);
+}
+
+static void TIFFReadDirectoryFindFieldInfo(TIFF *tif, uint16_t tagid,
+                                           uint32_t *fii)
+{
+    int32_t ma, mb, mc;
+    ma = -1;
+    mc = (int32_t)tif->tif_nfields;
+    while (1)
+    {
+        if (ma + 1 == mc)
+        {
+            *fii = FAILED_FII;
+            return;
+        }
+        mb = (ma + mc) / 2;
+        if (tif->tif_fields[mb]->field_tag == (uint32_t)tagid)
+            break;
+        if (tif->tif_fields[mb]->field_tag < (uint32_t)tagid)
+            ma = mb;
+        else
+            mc = mb;
+    }
+    while (1)
+    {
+        if (mb == 0)
+            break;
+        if (tif->tif_fields[mb - 1]->field_tag != (uint32_t)tagid)
+            break;
+        mb--;
+    }
+    *fii = mb;
+}
+
+/*
+ * Read custom directory from the arbitrary offset.
+ * The code is very similar to TIFFReadDirectory().
+ */
+int TIFFReadCustomDirectory(TIFF *tif, toff_t diroff,
+                            const TIFFFieldArray *infoarray)
+{
+    static const char module[] = "TIFFReadCustomDirectory";
+    TIFFDirEntry *dir;
+    uint16_t dircount;
+    TIFFDirEntry *dp;
+    uint16_t di;
+    const TIFFField *fip;
+    uint32_t fii;
+    (*tif->tif_cleanup)(tif); /* cleanup any previous compression state */
+    _TIFFSetupFields(tif, infoarray);
+    dircount = TIFFFetchDirectory(tif, diroff, &dir, NULL);
+    if (!dircount)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Failed to read custom directory at offset %" PRIu64,
+                      diroff);
+        return 0;
+    }
+    TIFFFreeDirectory(tif);
+    _TIFFmemset(&tif->tif_dir, 0, sizeof(TIFFDirectory));
+    TIFFReadDirectoryCheckOrder(tif, dir, dircount);
+    for (di = 0, dp = dir; di < dircount; di++, dp++)
+    {
+        TIFFReadDirectoryFindFieldInfo(tif, dp->tdir_tag, &fii);
+        if (fii == FAILED_FII)
+        {
+            TIFFWarningExtR(tif, module,
+                            "Unknown field with tag %" PRIu16 " (0x%" PRIx16
+                            ") encountered",
+                            dp->tdir_tag, dp->tdir_tag);
+            if (!_TIFFMergeFields(
+                    tif,
+                    _TIFFCreateAnonField(tif, dp->tdir_tag,
+                                         (TIFFDataType)dp->tdir_type),
+                    1))
+            {
+                TIFFWarningExtR(tif, module,
+                                "Registering anonymous field with tag %" PRIu16
+                                " (0x%" PRIx16 ") failed",
+                                dp->tdir_tag, dp->tdir_tag);
+                dp->tdir_ignore = TRUE;
+            }
+            else
+            {
+                TIFFReadDirectoryFindFieldInfo(tif, dp->tdir_tag, &fii);
+                assert(fii != FAILED_FII);
+            }
+        }
+        if (!dp->tdir_ignore)
+        {
+            fip = tif->tif_fields[fii];
+            if (fip->field_bit == FIELD_IGNORE)
+                dp->tdir_ignore = TRUE;
+            else
+            {
+                /* check data type */
+                while ((fip->field_type != TIFF_ANY) &&
+                       (fip->field_type != dp->tdir_type))
+                {
+                    fii++;
+                    if ((fii == tif->tif_nfields) ||
+                        (tif->tif_fields[fii]->field_tag !=
+                         (uint32_t)dp->tdir_tag))
+                    {
+                        fii = 0xFFFF;
+                        break;
+                    }
+                    fip = tif->tif_fields[fii];
+                }
+                if (fii == 0xFFFF)
+                {
+                    TIFFWarningExtR(tif, module,
+                                    "Wrong data type %" PRIu16
+                                    " for \"%s\"; tag ignored",
+                                    dp->tdir_type, fip->field_name);
+                    dp->tdir_ignore = TRUE;
+                }
+                else
+                {
+                    /* check count if known in advance */
+                    if ((fip->field_readcount != TIFF_VARIABLE) &&
+                        (fip->field_readcount != TIFF_VARIABLE2))
+                    {
+                        uint32_t expected;
+                        if (fip->field_readcount == TIFF_SPP)
+                            expected =
+                                (uint32_t)tif->tif_dir.td_samplesperpixel;
+                        else
+                            expected = (uint32_t)fip->field_readcount;
+                        if (!CheckDirCount(tif, dp, expected))
+                            dp->tdir_ignore = TRUE;
+                    }
+                }
+            }
+            if (!dp->tdir_ignore)
+            {
+                switch (dp->tdir_tag)
+                {
+                    case EXIFTAG_SUBJECTDISTANCE:
+                        if (!TIFFFieldIsAnonymous(fip))
+                        {
+                            /* should only be called on a Exif directory */
+                            /* when exifFields[] is active */
+                            (void)TIFFFetchSubjectDistance(tif, dp);
+                        }
+                        else
+                        {
+                            (void)TIFFFetchNormalTag(tif, dp, TRUE);
+                        }
+                        break;
+                    default:
+                        (void)TIFFFetchNormalTag(tif, dp, TRUE);
+                        break;
+                }
+            } /*-- if (!dp->tdir_ignore) */
+        }
+    }
+    /* To be able to return from SubIFD or custom-IFD to main-IFD */
+    tif->tif_setdirectory_force_absolute = TRUE;
+    if (dir)
+        _TIFFfreeExt(tif, dir);
+    return 1;
+}
+
+/*
+ * EXIF is important special case of custom IFD, so we have a special
+ * function to read it.
+ */
+int TIFFReadEXIFDirectory(TIFF *tif, toff_t diroff)
+{
+    const TIFFFieldArray *exifFieldArray;
+    exifFieldArray = _TIFFGetExifFields();
+    return TIFFReadCustomDirectory(tif, diroff, exifFieldArray);
+}
+
+/*
+ *--: EXIF-GPS custom directory reading as another special case of custom IFD.
+ */
+int TIFFReadGPSDirectory(TIFF *tif, toff_t diroff)
+{
+    const TIFFFieldArray *gpsFieldArray;
+    gpsFieldArray = _TIFFGetGpsFields();
+    return TIFFReadCustomDirectory(tif, diroff, gpsFieldArray);
+}
+
+static int EstimateStripByteCounts(TIFF *tif, TIFFDirEntry *dir,
+                                   uint16_t dircount)
+{
+    static const char module[] = "EstimateStripByteCounts";
+
+    TIFFDirEntry *dp;
+    TIFFDirectory *td = &tif->tif_dir;
+    uint32_t strip;
+
+    /* Do not try to load stripbytecount as we will compute it */
+    if (!_TIFFFillStrilesInternal(tif, 0))
+        return -1;
+
+    if (td->td_stripbytecount_p)
+        _TIFFfreeExt(tif, td->td_stripbytecount_p);
+    td->td_stripbytecount_p = (uint64_t *)_TIFFCheckMalloc(
+        tif, td->td_nstrips, sizeof(uint64_t), "for \"StripByteCounts\" array");
+    if (td->td_stripbytecount_p == NULL)
+        return -1;
+
+    if (td->td_compression != COMPRESSION_NONE)
+    {
+        uint64_t space;
+        uint64_t filesize;
+        uint16_t n;
+        filesize = TIFFGetFileSize(tif);
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+            space = sizeof(TIFFHeaderClassic) + 2 + dircount * 12 + 4;
+        else
+            space = sizeof(TIFFHeaderBig) + 8 + dircount * 20 + 8;
+        /* calculate amount of space used by indirect values */
+        for (dp = dir, n = dircount; n > 0; n--, dp++)
+        {
+            uint32_t typewidth;
+            uint64_t datasize;
+            typewidth = TIFFDataWidth((TIFFDataType)dp->tdir_type);
+            if (typewidth == 0)
+            {
+                TIFFErrorExtR(
+                    tif, module,
+                    "Cannot determine size of unknown tag type %" PRIu16,
+                    dp->tdir_type);
+                return -1;
+            }
+            if (dp->tdir_count > UINT64_MAX / typewidth)
+                return -1;
+            datasize = (uint64_t)typewidth * dp->tdir_count;
+            if (!(tif->tif_flags & TIFF_BIGTIFF))
+            {
+                if (datasize <= 4)
+                    datasize = 0;
+            }
+            else
+            {
+                if (datasize <= 8)
+                    datasize = 0;
+            }
+            if (space > UINT64_MAX - datasize)
+                return -1;
+            space += datasize;
+        }
+        if (filesize < space)
+            /* we should perhaps return in error ? */
+            space = filesize;
+        else
+            space = filesize - space;
+        if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
+            space /= td->td_samplesperpixel;
+        for (strip = 0; strip < td->td_nstrips; strip++)
+            td->td_stripbytecount_p[strip] = space;
+        /*
+         * This gross hack handles the case were the offset to
+         * the last strip is past the place where we think the strip
+         * should begin.  Since a strip of data must be contiguous,
+         * it's safe to assume that we've overestimated the amount
+         * of data in the strip and trim this number back accordingly.
+         */
+        strip--;
+        if (td->td_stripoffset_p[strip] >
+            UINT64_MAX - td->td_stripbytecount_p[strip])
+            return -1;
+        if (td->td_stripoffset_p[strip] + td->td_stripbytecount_p[strip] >
+            filesize)
+        {
+            if (td->td_stripoffset_p[strip] >= filesize)
+            {
+                /* Not sure what we should in that case... */
+                td->td_stripbytecount_p[strip] = 0;
+            }
+            else
+            {
+                td->td_stripbytecount_p[strip] =
+                    filesize - td->td_stripoffset_p[strip];
+            }
+        }
+    }
+    else if (isTiled(tif))
+    {
+        uint64_t bytespertile = TIFFTileSize64(tif);
+
+        for (strip = 0; strip < td->td_nstrips; strip++)
+            td->td_stripbytecount_p[strip] = bytespertile;
+    }
+    else
+    {
+        uint64_t rowbytes = TIFFScanlineSize64(tif);
+        uint32_t rowsperstrip = td->td_imagelength / td->td_stripsperimage;
+        for (strip = 0; strip < td->td_nstrips; strip++)
+        {
+            if (rowbytes > 0 && rowsperstrip > UINT64_MAX / rowbytes)
+                return -1;
+            td->td_stripbytecount_p[strip] = rowbytes * rowsperstrip;
+        }
+    }
+    TIFFSetFieldBit(tif, FIELD_STRIPBYTECOUNTS);
+    if (!TIFFFieldSet(tif, FIELD_ROWSPERSTRIP))
+        td->td_rowsperstrip = td->td_imagelength;
+    return 1;
+}
+
+static void MissingRequired(TIFF *tif, const char *tagname)
+{
+    static const char module[] = "MissingRequired";
+
+    TIFFErrorExtR(tif, module,
+                  "TIFF directory is missing required \"%s\" field", tagname);
+}
+
+static unsigned long hashFuncOffsetToNumber(const void *elt)
+{
+    const TIFFOffsetAndDirNumber *offsetAndDirNumber =
+        (const TIFFOffsetAndDirNumber *)elt;
+    const uint32_t hash = (uint32_t)(offsetAndDirNumber->offset >> 32) ^
+                          ((uint32_t)offsetAndDirNumber->offset & 0xFFFFFFFFU);
+    return hash;
+}
+
+static bool equalFuncOffsetToNumber(const void *elt1, const void *elt2)
+{
+    const TIFFOffsetAndDirNumber *offsetAndDirNumber1 =
+        (const TIFFOffsetAndDirNumber *)elt1;
+    const TIFFOffsetAndDirNumber *offsetAndDirNumber2 =
+        (const TIFFOffsetAndDirNumber *)elt2;
+    return offsetAndDirNumber1->offset == offsetAndDirNumber2->offset;
+}
+
+static unsigned long hashFuncNumberToOffset(const void *elt)
+{
+    const TIFFOffsetAndDirNumber *offsetAndDirNumber =
+        (const TIFFOffsetAndDirNumber *)elt;
+    return offsetAndDirNumber->dirNumber;
+}
+
+static bool equalFuncNumberToOffset(const void *elt1, const void *elt2)
+{
+    const TIFFOffsetAndDirNumber *offsetAndDirNumber1 =
+        (const TIFFOffsetAndDirNumber *)elt1;
+    const TIFFOffsetAndDirNumber *offsetAndDirNumber2 =
+        (const TIFFOffsetAndDirNumber *)elt2;
+    return offsetAndDirNumber1->dirNumber == offsetAndDirNumber2->dirNumber;
+}
+
+/*
+ * Check the directory number and offset against the list of already seen
+ * directory numbers and offsets. This is a trick to prevent IFD looping.
+ * The one can create TIFF file with looped directory pointers. We will
+ * maintain a list of already seen directories and check every IFD offset
+ * and its IFD number against that list. However, the offset of an IFD number
+ * can change - e.g. when writing updates to file.
+ * Returns 1 if all is ok; 0 if last directory or IFD loop is encountered,
+ * or an error has occurred.
+ */
+int _TIFFCheckDirNumberAndOffset(TIFF *tif, tdir_t dirn, uint64_t diroff)
+{
+    if (diroff == 0) /* no more directories */
+        return 0;
+
+    if (tif->tif_map_dir_offset_to_number == NULL)
+    {
+        tif->tif_map_dir_offset_to_number = TIFFHashSetNew(
+            hashFuncOffsetToNumber, equalFuncOffsetToNumber, free);
+        if (tif->tif_map_dir_offset_to_number == NULL)
+        {
+            TIFFErrorExtR(tif, "_TIFFCheckDirNumberAndOffset",
+                          "Not enough memory");
+            return 1;
+        }
+    }
+
+    if (tif->tif_map_dir_number_to_offset == NULL)
+    {
+        /* No free callback for this map, as it shares the same items as
+         * tif->tif_map_dir_offset_to_number. */
+        tif->tif_map_dir_number_to_offset = TIFFHashSetNew(
+            hashFuncNumberToOffset, equalFuncNumberToOffset, NULL);
+        if (tif->tif_map_dir_number_to_offset == NULL)
+        {
+            TIFFErrorExtR(tif, "_TIFFCheckDirNumberAndOffset",
+                          "Not enough memory");
+            return 1;
+        }
+    }
+
+    /* Check if offset is already in the list:
+     * - yes: check, if offset is at the same IFD number - if not, it is an IFD
+     * loop
+     * -  no: add to list or update offset at that IFD number
+     */
+    TIFFOffsetAndDirNumber entry;
+    entry.offset = diroff;
+    entry.dirNumber = dirn;
+
+    TIFFOffsetAndDirNumber *foundEntry =
+        (TIFFOffsetAndDirNumber *)TIFFHashSetLookup(
+            tif->tif_map_dir_offset_to_number, &entry);
+    if (foundEntry)
+    {
+        if (foundEntry->dirNumber == dirn)
+        {
+            return 1;
+        }
+        else
+        {
+            TIFFWarningExtR(tif, "_TIFFCheckDirNumberAndOffset",
+                            "TIFF directory %d has IFD looping to directory %u "
+                            "at offset 0x%" PRIx64 " (%" PRIu64 ")",
+                            (int)dirn - 1, foundEntry->dirNumber, diroff,
+                            diroff);
+            return 0;
+        }
+    }
+
+    /* Check if offset of an IFD has been changed and update offset of that IFD
+     * number. */
+    foundEntry = (TIFFOffsetAndDirNumber *)TIFFHashSetLookup(
+        tif->tif_map_dir_number_to_offset, &entry);
+    if (foundEntry)
+    {
+        if (foundEntry->offset != diroff)
+        {
+            TIFFOffsetAndDirNumber entryOld;
+            entryOld.offset = foundEntry->offset;
+            entryOld.dirNumber = dirn;
+            /* We must remove first from tif_map_dir_number_to_offset as the */
+            /* entry is owned (and thus freed) by */
+            /* tif_map_dir_offset_to_number */
+            TIFFOffsetAndDirNumber *foundEntryOld =
+                (TIFFOffsetAndDirNumber *)TIFFHashSetLookup(
+                    tif->tif_map_dir_number_to_offset, &entryOld);
+            if (foundEntryOld)
+            {
+                TIFFHashSetRemove(tif->tif_map_dir_number_to_offset,
+                                  foundEntryOld);
+            }
+            foundEntryOld = (TIFFOffsetAndDirNumber *)TIFFHashSetLookup(
+                tif->tif_map_dir_offset_to_number, &entryOld);
+            if (foundEntryOld)
+            {
+                TIFFHashSetRemove(tif->tif_map_dir_offset_to_number,
+                                  foundEntryOld);
+            }
+
+            TIFFOffsetAndDirNumber *entryPtr = (TIFFOffsetAndDirNumber *)malloc(
+                sizeof(TIFFOffsetAndDirNumber));
+            if (entryPtr == NULL)
+            {
+                return 0;
+            }
+
+            /* Add IFD offset and dirn to IFD directory list */
+            *entryPtr = entry;
+
+            if (!TIFFHashSetInsert(tif->tif_map_dir_offset_to_number, entryPtr))
+            {
+                TIFFErrorExtR(
+                    tif, "_TIFFCheckDirNumberAndOffset",
+                    "Insertion in tif_map_dir_offset_to_number failed");
+                return 0;
+            }
+            if (!TIFFHashSetInsert(tif->tif_map_dir_number_to_offset, entryPtr))
+            {
+                TIFFErrorExtR(
+                    tif, "_TIFFCheckDirNumberAndOffset",
+                    "Insertion in tif_map_dir_number_to_offset failed");
+                return 0;
+            }
+        }
+        return 1;
+    }
+
+    /* Arbitrary (hopefully big enough) limit */
+    if (TIFFHashSetSize(tif->tif_map_dir_offset_to_number) >=
+        TIFF_MAX_DIR_COUNT)
+    {
+        TIFFErrorExtR(tif, "_TIFFCheckDirNumberAndOffset",
+                      "Cannot handle more than %u TIFF directories",
+                      TIFF_MAX_DIR_COUNT);
+        return 0;
+    }
+
+    TIFFOffsetAndDirNumber *entryPtr =
+        (TIFFOffsetAndDirNumber *)malloc(sizeof(TIFFOffsetAndDirNumber));
+    if (entryPtr == NULL)
+    {
+        TIFFErrorExtR(tif, "_TIFFCheckDirNumberAndOffset",
+                      "malloc(sizeof(TIFFOffsetAndDirNumber)) failed");
+        return 0;
+    }
+
+    /* Add IFD offset and dirn to IFD directory list */
+    *entryPtr = entry;
+
+    if (!TIFFHashSetInsert(tif->tif_map_dir_offset_to_number, entryPtr))
+    {
+        TIFFErrorExtR(tif, "_TIFFCheckDirNumberAndOffset",
+                      "Insertion in tif_map_dir_offset_to_number failed");
+        return 0;
+    }
+    if (!TIFFHashSetInsert(tif->tif_map_dir_number_to_offset, entryPtr))
+    {
+        TIFFErrorExtR(tif, "_TIFFCheckDirNumberAndOffset",
+                      "Insertion in tif_map_dir_number_to_offset failed");
+        return 0;
+    }
+
+    return 1;
+} /* --- _TIFFCheckDirNumberAndOffset() ---*/
+
+/*
+ * Retrieve the matching IFD directory number of a given IFD offset
+ * from the list of directories already seen.
+ * Returns 1 if the offset was in the list and the directory number
+ * can be returned.
+ * Otherwise returns 0 or if an error occurred.
+ */
+int _TIFFGetDirNumberFromOffset(TIFF *tif, uint64_t diroff, tdir_t *dirn)
+{
+    if (diroff == 0) /* no more directories */
+        return 0;
+
+    /* Check if offset is already in the list and return matching directory
+     * number. Otherwise update IFD list using TIFFNumberOfDirectories() and
+     * search again in IFD list.
+     */
+    if (tif->tif_map_dir_offset_to_number == NULL)
+        return 0;
+    TIFFOffsetAndDirNumber entry;
+    entry.offset = diroff;
+    entry.dirNumber = 0; /* not used */
+
+    TIFFOffsetAndDirNumber *foundEntry =
+        (TIFFOffsetAndDirNumber *)TIFFHashSetLookup(
+            tif->tif_map_dir_offset_to_number, &entry);
+    if (foundEntry)
+    {
+        *dirn = foundEntry->dirNumber;
+        return 1;
+    }
+
+    /* This updates the directory list for all main-IFDs in the file. */
+    TIFFNumberOfDirectories(tif);
+
+    foundEntry = (TIFFOffsetAndDirNumber *)TIFFHashSetLookup(
+        tif->tif_map_dir_offset_to_number, &entry);
+    if (foundEntry)
+    {
+        *dirn = foundEntry->dirNumber;
+        return 1;
+    }
+
+    return 0;
+} /*--- _TIFFGetDirNumberFromOffset() ---*/
+
+/*
+ * Retrieve the matching IFD directory offset of a given IFD number
+ * from the list of directories already seen.
+ * Returns 1 if the offset was in the list of already seen IFDs and the
+ * directory offset can be returned. The directory list is not updated.
+ * Otherwise returns 0 or if an error occurred.
+ */
+int _TIFFGetOffsetFromDirNumber(TIFF *tif, tdir_t dirn, uint64_t *diroff)
+{
+
+    if (tif->tif_map_dir_number_to_offset == NULL)
+        return 0;
+    TIFFOffsetAndDirNumber entry;
+    entry.offset = 0; /* not used */
+    entry.dirNumber = dirn;
+
+    TIFFOffsetAndDirNumber *foundEntry =
+        (TIFFOffsetAndDirNumber *)TIFFHashSetLookup(
+            tif->tif_map_dir_number_to_offset, &entry);
+    if (foundEntry)
+    {
+        *diroff = foundEntry->offset;
+        return 1;
+    }
+
+    return 0;
+} /*--- _TIFFGetOffsetFromDirNumber() ---*/
+
+/*
+ * Remove an entry from the directory list of already seen directories
+ * by directory offset.
+ * If an entry is to be removed from the list, it is also okay if the entry
+ * is not in the list or the list does not exist.
+ */
+int _TIFFRemoveEntryFromDirectoryListByOffset(TIFF *tif, uint64_t diroff)
+{
+    if (tif->tif_map_dir_offset_to_number == NULL)
+        return 1;
+
+    TIFFOffsetAndDirNumber entryOld;
+    entryOld.offset = diroff;
+    entryOld.dirNumber = 0;
+    /* We must remove first from tif_map_dir_number_to_offset as the
+     * entry is owned (and thus freed) by tif_map_dir_offset_to_number.
+     * However, we need firstly to find the directory number from offset. */
+
+    TIFFOffsetAndDirNumber *foundEntryOldOff =
+        (TIFFOffsetAndDirNumber *)TIFFHashSetLookup(
+            tif->tif_map_dir_offset_to_number, &entryOld);
+    if (foundEntryOldOff)
+    {
+        entryOld.dirNumber = foundEntryOldOff->dirNumber;
+        if (tif->tif_map_dir_number_to_offset != NULL)
+        {
+            TIFFOffsetAndDirNumber *foundEntryOldDir =
+                (TIFFOffsetAndDirNumber *)TIFFHashSetLookup(
+                    tif->tif_map_dir_number_to_offset, &entryOld);
+            if (foundEntryOldDir)
+            {
+                TIFFHashSetRemove(tif->tif_map_dir_number_to_offset,
+                                  foundEntryOldDir);
+                TIFFHashSetRemove(tif->tif_map_dir_offset_to_number,
+                                  foundEntryOldOff);
+                return 1;
+            }
+        }
+        else
+        {
+            TIFFErrorExtR(tif, "_TIFFRemoveEntryFromDirectoryListByOffset",
+                          "Unexpectedly tif_map_dir_number_to_offset is "
+                          "missing but tif_map_dir_offset_to_number exists.");
+            return 0;
+        }
+    }
+    return 1;
+} /*--- _TIFFRemoveEntryFromDirectoryListByOffset() ---*/
+
+/*
+ * Check the count field of a directory entry against a known value.  The
+ * caller is expected to skip/ignore the tag if there is a mismatch.
+ */
+static int CheckDirCount(TIFF *tif, TIFFDirEntry *dir, uint32_t count)
+{
+    if ((uint64_t)count > dir->tdir_count)
+    {
+        const TIFFField *fip = TIFFFieldWithTag(tif, dir->tdir_tag);
+        TIFFWarningExtR(tif, tif->tif_name,
+                        "incorrect count for field \"%s\" (%" PRIu64
+                        ", expecting %" PRIu32 "); tag ignored",
+                        fip ? fip->field_name : "unknown tagname",
+                        dir->tdir_count, count);
+        return (0);
+    }
+    else if ((uint64_t)count < dir->tdir_count)
+    {
+        const TIFFField *fip = TIFFFieldWithTag(tif, dir->tdir_tag);
+        TIFFWarningExtR(tif, tif->tif_name,
+                        "incorrect count for field \"%s\" (%" PRIu64
+                        ", expecting %" PRIu32 "); tag trimmed",
+                        fip ? fip->field_name : "unknown tagname",
+                        dir->tdir_count, count);
+        dir->tdir_count = count;
+        return (1);
+    }
+    return (1);
+}
+
+/*
+ * Read IFD structure from the specified offset. If the pointer to
+ * nextdiroff variable has been specified, read it too. Function returns a
+ * number of fields in the directory or 0 if failed.
+ */
+static uint16_t TIFFFetchDirectory(TIFF *tif, uint64_t diroff,
+                                   TIFFDirEntry **pdir, uint64_t *nextdiroff)
+{
+    static const char module[] = "TIFFFetchDirectory";
+
+    void *origdir;
+    uint16_t dircount16;
+    uint32_t dirsize;
+    TIFFDirEntry *dir;
+    uint8_t *ma;
+    TIFFDirEntry *mb;
+    uint16_t n;
+
+    assert(pdir);
+
+    tif->tif_diroff = diroff;
+    if (nextdiroff)
+        *nextdiroff = 0;
+    if (!isMapped(tif))
+    {
+        if (!SeekOK(tif, tif->tif_diroff))
+        {
+            TIFFErrorExtR(tif, module,
+                          "%s: Seek error accessing TIFF directory",
+                          tif->tif_name);
+            return 0;
+        }
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+        {
+            if (!ReadOK(tif, &dircount16, sizeof(uint16_t)))
+            {
+                TIFFErrorExtR(tif, module,
+                              "%s: Can not read TIFF directory count",
+                              tif->tif_name);
+                return 0;
+            }
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabShort(&dircount16);
+            if (dircount16 > 4096)
+            {
+                TIFFErrorExtR(tif, module,
+                              "Sanity check on directory count failed, this is "
+                              "probably not a valid IFD offset");
+                return 0;
+            }
+            dirsize = 12;
+        }
+        else
+        {
+            uint64_t dircount64;
+            if (!ReadOK(tif, &dircount64, sizeof(uint64_t)))
+            {
+                TIFFErrorExtR(tif, module,
+                              "%s: Can not read TIFF directory count",
+                              tif->tif_name);
+                return 0;
+            }
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8(&dircount64);
+            if (dircount64 > 4096)
+            {
+                TIFFErrorExtR(tif, module,
+                              "Sanity check on directory count failed, this is "
+                              "probably not a valid IFD offset");
+                return 0;
+            }
+            dircount16 = (uint16_t)dircount64;
+            dirsize = 20;
+        }
+        origdir = _TIFFCheckMalloc(tif, dircount16, dirsize,
+                                   "to read TIFF directory");
+        if (origdir == NULL)
+            return 0;
+        if (!ReadOK(tif, origdir, (tmsize_t)(dircount16 * dirsize)))
+        {
+            TIFFErrorExtR(tif, module, "%.100s: Can not read TIFF directory",
+                          tif->tif_name);
+            _TIFFfreeExt(tif, origdir);
+            return 0;
+        }
+        /*
+         * Read offset to next directory for sequential scans if
+         * needed.
+         */
+        if (nextdiroff)
+        {
+            if (!(tif->tif_flags & TIFF_BIGTIFF))
+            {
+                uint32_t nextdiroff32;
+                if (!ReadOK(tif, &nextdiroff32, sizeof(uint32_t)))
+                    nextdiroff32 = 0;
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(&nextdiroff32);
+                *nextdiroff = nextdiroff32;
+            }
+            else
+            {
+                if (!ReadOK(tif, nextdiroff, sizeof(uint64_t)))
+                    *nextdiroff = 0;
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(nextdiroff);
+            }
+        }
+    }
+    else
+    {
+        tmsize_t m;
+        tmsize_t off;
+        if (tif->tif_diroff > (uint64_t)INT64_MAX)
+        {
+            TIFFErrorExtR(tif, module, "Can not read TIFF directory count");
+            return (0);
+        }
+        off = (tmsize_t)tif->tif_diroff;
+
+        /*
+         * Check for integer overflow when validating the dir_off,
+         * otherwise a very high offset may cause an OOB read and
+         * crash the client. Make two comparisons instead of
+         *
+         *  off + sizeof(uint16_t) > tif->tif_size
+         *
+         * to avoid overflow.
+         */
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+        {
+            m = off + sizeof(uint16_t);
+            if ((m < off) || (m < (tmsize_t)sizeof(uint16_t)) ||
+                (m > tif->tif_size))
+            {
+                TIFFErrorExtR(tif, module, "Can not read TIFF directory count");
+                return 0;
+            }
+            else
+            {
+                _TIFFmemcpy(&dircount16, tif->tif_base + off, sizeof(uint16_t));
+            }
+            off += sizeof(uint16_t);
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabShort(&dircount16);
+            if (dircount16 > 4096)
+            {
+                TIFFErrorExtR(tif, module,
+                              "Sanity check on directory count failed, this is "
+                              "probably not a valid IFD offset");
+                return 0;
+            }
+            dirsize = 12;
+        }
+        else
+        {
+            uint64_t dircount64;
+            m = off + sizeof(uint64_t);
+            if ((m < off) || (m < (tmsize_t)sizeof(uint64_t)) ||
+                (m > tif->tif_size))
+            {
+                TIFFErrorExtR(tif, module, "Can not read TIFF directory count");
+                return 0;
+            }
+            else
+            {
+                _TIFFmemcpy(&dircount64, tif->tif_base + off, sizeof(uint64_t));
+            }
+            off += sizeof(uint64_t);
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8(&dircount64);
+            if (dircount64 > 4096)
+            {
+                TIFFErrorExtR(tif, module,
+                              "Sanity check on directory count failed, this is "
+                              "probably not a valid IFD offset");
+                return 0;
+            }
+            dircount16 = (uint16_t)dircount64;
+            dirsize = 20;
+        }
+        if (dircount16 == 0)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Sanity check on directory count failed, zero tag "
+                          "directories not supported");
+            return 0;
+        }
+        origdir = _TIFFCheckMalloc(tif, dircount16, dirsize,
+                                   "to read TIFF directory");
+        if (origdir == NULL)
+            return 0;
+        m = off + dircount16 * dirsize;
+        if ((m < off) || (m < (tmsize_t)(dircount16 * dirsize)) ||
+            (m > tif->tif_size))
+        {
+            TIFFErrorExtR(tif, module, "Can not read TIFF directory");
+            _TIFFfreeExt(tif, origdir);
+            return 0;
+        }
+        else
+        {
+            _TIFFmemcpy(origdir, tif->tif_base + off, dircount16 * dirsize);
+        }
+        if (nextdiroff)
+        {
+            off += dircount16 * dirsize;
+            if (!(tif->tif_flags & TIFF_BIGTIFF))
+            {
+                uint32_t nextdiroff32;
+                m = off + sizeof(uint32_t);
+                if ((m < off) || (m < (tmsize_t)sizeof(uint32_t)) ||
+                    (m > tif->tif_size))
+                    nextdiroff32 = 0;
+                else
+                    _TIFFmemcpy(&nextdiroff32, tif->tif_base + off,
+                                sizeof(uint32_t));
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(&nextdiroff32);
+                *nextdiroff = nextdiroff32;
+            }
+            else
+            {
+                m = off + sizeof(uint64_t);
+                if ((m < off) || (m < (tmsize_t)sizeof(uint64_t)) ||
+                    (m > tif->tif_size))
+                    *nextdiroff = 0;
+                else
+                    _TIFFmemcpy(nextdiroff, tif->tif_base + off,
+                                sizeof(uint64_t));
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(nextdiroff);
+            }
+        }
+    }
+    dir = (TIFFDirEntry *)_TIFFCheckMalloc(
+        tif, dircount16, sizeof(TIFFDirEntry), "to read TIFF directory");
+    if (dir == 0)
+    {
+        _TIFFfreeExt(tif, origdir);
+        return 0;
+    }
+    ma = (uint8_t *)origdir;
+    mb = dir;
+    for (n = 0; n < dircount16; n++)
+    {
+        mb->tdir_ignore = FALSE;
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabShort((uint16_t *)ma);
+        mb->tdir_tag = *(uint16_t *)ma;
+        ma += sizeof(uint16_t);
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabShort((uint16_t *)ma);
+        mb->tdir_type = *(uint16_t *)ma;
+        ma += sizeof(uint16_t);
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+        {
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong((uint32_t *)ma);
+            mb->tdir_count = (uint64_t)(*(uint32_t *)ma);
+            ma += sizeof(uint32_t);
+            mb->tdir_offset.toff_long8 = 0;
+            *(uint32_t *)(&mb->tdir_offset) = *(uint32_t *)ma;
+            ma += sizeof(uint32_t);
+        }
+        else
+        {
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8((uint64_t *)ma);
+            mb->tdir_count = TIFFReadUInt64(ma);
+            ma += sizeof(uint64_t);
+            mb->tdir_offset.toff_long8 = TIFFReadUInt64(ma);
+            ma += sizeof(uint64_t);
+        }
+        mb++;
+    }
+    _TIFFfreeExt(tif, origdir);
+    *pdir = dir;
+    return dircount16;
+}
+
+/*
+ * Fetch a tag that is not handled by special case code.
+ */
+static int TIFFFetchNormalTag(TIFF *tif, TIFFDirEntry *dp, int recover)
+{
+    static const char module[] = "TIFFFetchNormalTag";
+    enum TIFFReadDirEntryErr err;
+    uint32_t fii;
+    const TIFFField *fip = NULL;
+    TIFFReadDirectoryFindFieldInfo(tif, dp->tdir_tag, &fii);
+    if (fii == FAILED_FII)
+    {
+        TIFFErrorExtR(tif, "TIFFFetchNormalTag",
+                      "No definition found for tag %" PRIu16, dp->tdir_tag);
+        return 0;
+    }
+    fip = tif->tif_fields[fii];
+    assert(fip != NULL); /* should not happen */
+    assert(fip->set_field_type !=
+           TIFF_SETGET_OTHER); /* if so, we shouldn't arrive here but deal with
+                                  this in specialized code */
+    assert(fip->set_field_type !=
+           TIFF_SETGET_INT); /* if so, we shouldn't arrive here as this is only
+                                the case for pseudo-tags */
+    err = TIFFReadDirEntryErrOk;
+    switch (fip->set_field_type)
+    {
+        case TIFF_SETGET_UNDEFINED:
+            TIFFErrorExtR(
+                tif, "TIFFFetchNormalTag",
+                "Defined set_field_type of custom tag %u (%s) is "
+                "TIFF_SETGET_UNDEFINED and thus tag is not read from file",
+                fip->field_tag, fip->field_name);
+            break;
+        case TIFF_SETGET_ASCII:
+        {
+            uint8_t *data;
+            assert(fip->field_passcount == 0);
+            err = TIFFReadDirEntryByteArray(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                size_t mb = 0;
+                int n;
+                if (data != NULL)
+                {
+                    if (dp->tdir_count > 0 && data[dp->tdir_count - 1] == 0)
+                    {
+                        /* optimization: if data is known to be 0 terminated, we
+                         * can use strlen() */
+                        mb = strlen((const char *)data);
+                    }
+                    else
+                    {
+                        /* general case. equivalent to non-portable */
+                        /* mb = strnlen((const char*)data,
+                         * (uint32_t)dp->tdir_count); */
+                        uint8_t *ma = data;
+                        while (mb < (uint32_t)dp->tdir_count)
+                        {
+                            if (*ma == 0)
+                                break;
+                            ma++;
+                            mb++;
+                        }
+                    }
+                }
+                if (mb + 1 < (uint32_t)dp->tdir_count)
+                    TIFFWarningExtR(
+                        tif, module,
+                        "ASCII value for tag \"%s\" contains null byte in "
+                        "value; value incorrectly truncated during reading due "
+                        "to implementation limitations",
+                        fip->field_name);
+                else if (mb + 1 > (uint32_t)dp->tdir_count)
+                {
+                    uint8_t *o;
+                    TIFFWarningExtR(
+                        tif, module,
+                        "ASCII value for tag \"%s\" does not end in null byte",
+                        fip->field_name);
+                    /* TIFFReadDirEntryArrayWithLimit() ensures this can't be
+                     * larger than MAX_SIZE_TAG_DATA */
+                    assert((uint32_t)dp->tdir_count + 1 == dp->tdir_count + 1);
+                    o = _TIFFmallocExt(tif, (uint32_t)dp->tdir_count + 1);
+                    if (o == NULL)
+                    {
+                        if (data != NULL)
+                            _TIFFfreeExt(tif, data);
+                        return (0);
+                    }
+                    if (dp->tdir_count > 0)
+                    {
+                        _TIFFmemcpy(o, data, (uint32_t)dp->tdir_count);
+                    }
+                    o[(uint32_t)dp->tdir_count] = 0;
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    data = o;
+                }
+                n = TIFFSetField(tif, dp->tdir_tag, data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!n)
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_UINT8:
+        {
+            uint8_t data = 0;
+            assert(fip->field_readcount == 1);
+            assert(fip->field_passcount == 0);
+            err = TIFFReadDirEntryByte(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                if (!TIFFSetField(tif, dp->tdir_tag, data))
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_SINT8:
+        {
+            int8_t data = 0;
+            assert(fip->field_readcount == 1);
+            assert(fip->field_passcount == 0);
+            err = TIFFReadDirEntrySbyte(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                if (!TIFFSetField(tif, dp->tdir_tag, data))
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_UINT16:
+        {
+            uint16_t data;
+            assert(fip->field_readcount == 1);
+            assert(fip->field_passcount == 0);
+            err = TIFFReadDirEntryShort(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                if (!TIFFSetField(tif, dp->tdir_tag, data))
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_SINT16:
+        {
+            int16_t data;
+            assert(fip->field_readcount == 1);
+            assert(fip->field_passcount == 0);
+            err = TIFFReadDirEntrySshort(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                if (!TIFFSetField(tif, dp->tdir_tag, data))
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_UINT32:
+        {
+            uint32_t data;
+            assert(fip->field_readcount == 1);
+            assert(fip->field_passcount == 0);
+            err = TIFFReadDirEntryLong(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                if (!TIFFSetField(tif, dp->tdir_tag, data))
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_SINT32:
+        {
+            int32_t data;
+            assert(fip->field_readcount == 1);
+            assert(fip->field_passcount == 0);
+            err = TIFFReadDirEntrySlong(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                if (!TIFFSetField(tif, dp->tdir_tag, data))
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_UINT64:
+        {
+            uint64_t data;
+            assert(fip->field_readcount == 1);
+            assert(fip->field_passcount == 0);
+            err = TIFFReadDirEntryLong8(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                if (!TIFFSetField(tif, dp->tdir_tag, data))
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_SINT64:
+        {
+            int64_t data;
+            assert(fip->field_readcount == 1);
+            assert(fip->field_passcount == 0);
+            err = TIFFReadDirEntrySlong8(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                if (!TIFFSetField(tif, dp->tdir_tag, data))
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_FLOAT:
+        {
+            float data;
+            assert(fip->field_readcount == 1);
+            assert(fip->field_passcount == 0);
+            err = TIFFReadDirEntryFloat(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                if (!TIFFSetField(tif, dp->tdir_tag, data))
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_DOUBLE:
+        {
+            double data;
+            assert(fip->field_readcount == 1);
+            assert(fip->field_passcount == 0);
+            err = TIFFReadDirEntryDouble(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                if (!TIFFSetField(tif, dp->tdir_tag, data))
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_IFD8:
+        {
+            uint64_t data;
+            assert(fip->field_readcount == 1);
+            assert(fip->field_passcount == 0);
+            err = TIFFReadDirEntryIfd8(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                if (!TIFFSetField(tif, dp->tdir_tag, data))
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_UINT16_PAIR:
+        {
+            uint16_t *data;
+            assert(fip->field_readcount == 2);
+            assert(fip->field_passcount == 0);
+            if (dp->tdir_count != 2)
+            {
+                TIFFWarningExtR(tif, module,
+                                "incorrect count for field \"%s\", expected 2, "
+                                "got %" PRIu64,
+                                fip->field_name, dp->tdir_count);
+                return (0);
+            }
+            err = TIFFReadDirEntryShortArray(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                assert(data); /* avoid CLang static Analyzer false positive */
+                m = TIFFSetField(tif, dp->tdir_tag, data[0], data[1]);
+                _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_C0_UINT8:
+        {
+            uint8_t *data;
+            assert(fip->field_readcount >= 1);
+            assert(fip->field_passcount == 0);
+            if (dp->tdir_count != (uint64_t)fip->field_readcount)
+            {
+                TIFFWarningExtR(tif, module,
+                                "incorrect count for field \"%s\", expected "
+                                "%d, got %" PRIu64,
+                                fip->field_name, (int)fip->field_readcount,
+                                dp->tdir_count);
+                return (0);
+            }
+            else
+            {
+                err = TIFFReadDirEntryByteArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag, data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C0_SINT8:
+        {
+            int8_t *data;
+            assert(fip->field_readcount >= 1);
+            assert(fip->field_passcount == 0);
+            if (dp->tdir_count != (uint64_t)fip->field_readcount)
+            {
+                TIFFWarningExtR(tif, module,
+                                "incorrect count for field \"%s\", expected "
+                                "%d, got %" PRIu64,
+                                fip->field_name, (int)fip->field_readcount,
+                                dp->tdir_count);
+                return (0);
+            }
+            else
+            {
+                err = TIFFReadDirEntrySbyteArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag, data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C0_UINT16:
+        {
+            uint16_t *data;
+            assert(fip->field_readcount >= 1);
+            assert(fip->field_passcount == 0);
+            if (dp->tdir_count != (uint64_t)fip->field_readcount)
+            {
+                TIFFWarningExtR(tif, module,
+                                "incorrect count for field \"%s\", expected "
+                                "%d, got %" PRIu64,
+                                fip->field_name, (int)fip->field_readcount,
+                                dp->tdir_count);
+                return (0);
+            }
+            else
+            {
+                err = TIFFReadDirEntryShortArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag, data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C0_SINT16:
+        {
+            int16_t *data;
+            assert(fip->field_readcount >= 1);
+            assert(fip->field_passcount == 0);
+            if (dp->tdir_count != (uint64_t)fip->field_readcount)
+            {
+                TIFFWarningExtR(tif, module,
+                                "incorrect count for field \"%s\", expected "
+                                "%d, got %" PRIu64,
+                                fip->field_name, (int)fip->field_readcount,
+                                dp->tdir_count);
+                return (0);
+            }
+            else
+            {
+                err = TIFFReadDirEntrySshortArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag, data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C0_UINT32:
+        {
+            uint32_t *data;
+            assert(fip->field_readcount >= 1);
+            assert(fip->field_passcount == 0);
+            if (dp->tdir_count != (uint64_t)fip->field_readcount)
+            {
+                TIFFWarningExtR(tif, module,
+                                "incorrect count for field \"%s\", expected "
+                                "%d, got %" PRIu64,
+                                fip->field_name, (int)fip->field_readcount,
+                                dp->tdir_count);
+                return (0);
+            }
+            else
+            {
+                err = TIFFReadDirEntryLongArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag, data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C0_SINT32:
+        {
+            int32_t *data;
+            assert(fip->field_readcount >= 1);
+            assert(fip->field_passcount == 0);
+            if (dp->tdir_count != (uint64_t)fip->field_readcount)
+            {
+                TIFFWarningExtR(tif, module,
+                                "incorrect count for field \"%s\", expected "
+                                "%d, got %" PRIu64,
+                                fip->field_name, (int)fip->field_readcount,
+                                dp->tdir_count);
+                return (0);
+            }
+            else
+            {
+                err = TIFFReadDirEntrySlongArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag, data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C0_UINT64:
+        {
+            uint64_t *data;
+            assert(fip->field_readcount >= 1);
+            assert(fip->field_passcount == 0);
+            if (dp->tdir_count != (uint64_t)fip->field_readcount)
+            {
+                TIFFWarningExtR(tif, module,
+                                "incorrect count for field \"%s\", expected "
+                                "%d, got %" PRIu64,
+                                fip->field_name, (int)fip->field_readcount,
+                                dp->tdir_count);
+                return (0);
+            }
+            else
+            {
+                err = TIFFReadDirEntryLong8Array(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag, data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C0_SINT64:
+        {
+            int64_t *data;
+            assert(fip->field_readcount >= 1);
+            assert(fip->field_passcount == 0);
+            if (dp->tdir_count != (uint64_t)fip->field_readcount)
+            {
+                TIFFWarningExtR(tif, module,
+                                "incorrect count for field \"%s\", expected "
+                                "%d, got %" PRIu64,
+                                fip->field_name, (int)fip->field_readcount,
+                                dp->tdir_count);
+                return (0);
+            }
+            else
+            {
+                err = TIFFReadDirEntrySlong8Array(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag, data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C0_FLOAT:
+        {
+            float *data;
+            assert(fip->field_readcount >= 1);
+            assert(fip->field_passcount == 0);
+            if (dp->tdir_count != (uint64_t)fip->field_readcount)
+            {
+                TIFFWarningExtR(tif, module,
+                                "incorrect count for field \"%s\", expected "
+                                "%d, got %" PRIu64,
+                                fip->field_name, (int)fip->field_readcount,
+                                dp->tdir_count);
+                return (0);
+            }
+            else
+            {
+                err = TIFFReadDirEntryFloatArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag, data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        /*--: Rational2Double: Extend for Double Arrays and Rational-Arrays read
+         * into Double-Arrays. */
+        case TIFF_SETGET_C0_DOUBLE:
+        {
+            double *data;
+            assert(fip->field_readcount >= 1);
+            assert(fip->field_passcount == 0);
+            if (dp->tdir_count != (uint64_t)fip->field_readcount)
+            {
+                TIFFWarningExtR(tif, module,
+                                "incorrect count for field \"%s\", expected "
+                                "%d, got %" PRIu64,
+                                fip->field_name, (int)fip->field_readcount,
+                                dp->tdir_count);
+                return (0);
+            }
+            else
+            {
+                err = TIFFReadDirEntryDoubleArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag, data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C16_ASCII:
+        {
+            uint8_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE);
+            assert(fip->field_passcount == 1);
+            if (dp->tdir_count > 0xFFFF)
+                err = TIFFReadDirEntryErrCount;
+            else
+            {
+                err = TIFFReadDirEntryByteArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    if (data != 0 && dp->tdir_count > 0 &&
+                        data[dp->tdir_count - 1] != '\0')
+                    {
+                        TIFFWarningExtR(
+                            tif, module,
+                            "ASCII value for tag \"%s\" does not end in null "
+                            "byte. Forcing it to be null",
+                            fip->field_name);
+                        data[dp->tdir_count - 1] = '\0';
+                    }
+                    m = TIFFSetField(tif, dp->tdir_tag,
+                                     (uint16_t)(dp->tdir_count), data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C16_UINT8:
+        {
+            uint8_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE);
+            assert(fip->field_passcount == 1);
+            if (dp->tdir_count > 0xFFFF)
+                err = TIFFReadDirEntryErrCount;
+            else
+            {
+                err = TIFFReadDirEntryByteArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag,
+                                     (uint16_t)(dp->tdir_count), data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C16_SINT8:
+        {
+            int8_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE);
+            assert(fip->field_passcount == 1);
+            if (dp->tdir_count > 0xFFFF)
+                err = TIFFReadDirEntryErrCount;
+            else
+            {
+                err = TIFFReadDirEntrySbyteArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag,
+                                     (uint16_t)(dp->tdir_count), data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C16_UINT16:
+        {
+            uint16_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE);
+            assert(fip->field_passcount == 1);
+            if (dp->tdir_count > 0xFFFF)
+                err = TIFFReadDirEntryErrCount;
+            else
+            {
+                err = TIFFReadDirEntryShortArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag,
+                                     (uint16_t)(dp->tdir_count), data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C16_SINT16:
+        {
+            int16_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE);
+            assert(fip->field_passcount == 1);
+            if (dp->tdir_count > 0xFFFF)
+                err = TIFFReadDirEntryErrCount;
+            else
+            {
+                err = TIFFReadDirEntrySshortArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag,
+                                     (uint16_t)(dp->tdir_count), data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C16_UINT32:
+        {
+            uint32_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE);
+            assert(fip->field_passcount == 1);
+            if (dp->tdir_count > 0xFFFF)
+                err = TIFFReadDirEntryErrCount;
+            else
+            {
+                err = TIFFReadDirEntryLongArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag,
+                                     (uint16_t)(dp->tdir_count), data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C16_SINT32:
+        {
+            int32_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE);
+            assert(fip->field_passcount == 1);
+            if (dp->tdir_count > 0xFFFF)
+                err = TIFFReadDirEntryErrCount;
+            else
+            {
+                err = TIFFReadDirEntrySlongArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag,
+                                     (uint16_t)(dp->tdir_count), data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
+        }
+        break;
+        case TIFF_SETGET_C16_UINT64:
+        {
+            uint64_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE);
+            assert(fip->field_passcount == 1);
+            if (dp->tdir_count > 0xFFFF)
+                err = TIFFReadDirEntryErrCount;
+            else
+            {
+                err = TIFFReadDirEntryLong8Array(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag,
+                                     (uint16_t)(dp->tdir_count), data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
         }
-        if( bytecount < scanlinesize * tif->tif_dir.td_imagelength)
-            return 1;
-    }
-    return 0;
-}
-
-
-/*
- * Read the next TIFF directory from a file and convert it to the internal
- * format. We read directories sequentially.
- */
-int
-TIFFReadDirectory(TIFF* tif)
-{
-	static const char module[] = "TIFFReadDirectory";
-	TIFFDirEntry* dir;
-	uint16 dircount;
-	TIFFDirEntry* dp;
-	uint16 di;
-	const TIFFField* fip;
-	uint32 fii=FAILED_FII;
-        toff_t nextdiroff;
-    int bitspersample_read = FALSE;
-        int color_channels;
-
-	tif->tif_diroff=tif->tif_nextdiroff;
-	if (!TIFFCheckDirOffset(tif,tif->tif_nextdiroff))
-		return 0;           /* last offset or bad offset (IFD looping) */
-	(*tif->tif_cleanup)(tif);   /* cleanup any previous compression state */
-	tif->tif_curdir++;
-        nextdiroff = tif->tif_nextdiroff;
-	dircount=TIFFFetchDirectory(tif,nextdiroff,&dir,&tif->tif_nextdiroff);
-	if (!dircount)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,
-		    "Failed to read directory at offset " TIFF_UINT64_FORMAT,nextdiroff);
-		return 0;
-	}
-	TIFFReadDirectoryCheckOrder(tif,dir,dircount);
-
-        /*
-         * Mark duplicates of any tag to be ignored (bugzilla 1994)
-         * to avoid certain pathological problems.
-         */
-	{
-		TIFFDirEntry* ma;
-		uint16 mb;
-		for (ma=dir, mb=0; mb<dircount; ma++, mb++)
-		{
-			TIFFDirEntry* na;
-			uint16 nb;
-			for (na=ma+1, nb=mb+1; nb<dircount; na++, nb++)
-			{
-				if (ma->tdir_tag == na->tdir_tag) {
-					na->tdir_ignore = TRUE;
-				}
-			}
-		}
-	}
-        
-	tif->tif_flags &= ~TIFF_BEENWRITING;    /* reset before new dir */
-	tif->tif_flags &= ~TIFF_BUF4WRITE;      /* reset before new dir */
-	tif->tif_flags &= ~TIFF_CHOPPEDUPARRAYS;
-
-	/* free any old stuff and reinit */
-	TIFFFreeDirectory(tif);
-	TIFFDefaultDirectory(tif);
-	/*
-	 * Electronic Arts writes gray-scale TIFF files
-	 * without a PlanarConfiguration directory entry.
-	 * Thus we setup a default value here, even though
-	 * the TIFF spec says there is no default value.
-	 */
-	TIFFSetField(tif,TIFFTAG_PLANARCONFIG,PLANARCONFIG_CONTIG);
-	/*
-	 * Setup default value and then make a pass over
-	 * the fields to check type and tag information,
-	 * and to extract info required to size data
-	 * structures.  A second pass is made afterwards
-	 * to read in everything not taken in the first pass.
-	 * But we must process the Compression tag first
-	 * in order to merge in codec-private tag definitions (otherwise
-	 * we may get complaints about unknown tags).  However, the
-	 * Compression tag may be dependent on the SamplesPerPixel
-	 * tag value because older TIFF specs permitted Compression
-	 * to be written as a SamplesPerPixel-count tag entry.
-	 * Thus if we don't first figure out the correct SamplesPerPixel
-	 * tag value then we may end up ignoring the Compression tag
-	 * value because it has an incorrect count value (if the
-	 * true value of SamplesPerPixel is not 1).
-	 */
-	dp=TIFFReadDirectoryFindEntry(tif,dir,dircount,TIFFTAG_SAMPLESPERPIXEL);
-	if (dp)
-	{
-		if (!TIFFFetchNormalTag(tif,dp,0))
-			goto bad;
-		dp->tdir_ignore = TRUE;
-	}
-	dp=TIFFReadDirectoryFindEntry(tif,dir,dircount,TIFFTAG_COMPRESSION);
-	if (dp)
-	{
-		/*
-		 * The 5.0 spec says the Compression tag has one value, while
-		 * earlier specs say it has one value per sample.  Because of
-		 * this, we accept the tag if one value is supplied with either
-		 * count.
-		 */
-		uint16 value;
-		enum TIFFReadDirEntryErr err;
-		err=TIFFReadDirEntryShort(tif,dp,&value);
-		if (err==TIFFReadDirEntryErrCount)
-			err=TIFFReadDirEntryPersampleShort(tif,dp,&value);
-		if (err!=TIFFReadDirEntryErrOk)
-		{
-			TIFFReadDirEntryOutputErr(tif,err,module,"Compression",0);
-			goto bad;
-		}
-		if (!TIFFSetField(tif,TIFFTAG_COMPRESSION,value))
-			goto bad;
-		dp->tdir_ignore = TRUE;
-	}
-	else
-	{
-		if (!TIFFSetField(tif,TIFFTAG_COMPRESSION,COMPRESSION_NONE))
-			goto bad;
-	}
-	/*
-	 * First real pass over the directory.
-	 */
-	for (di=0, dp=dir; di<dircount; di++, dp++)
-	{
-		if (!dp->tdir_ignore)
-		{
-			TIFFReadDirectoryFindFieldInfo(tif,dp->tdir_tag,&fii);
-			if (fii == FAILED_FII)
-			{
-				TIFFWarningExt(tif->tif_clientdata, module,
-				    "Unknown field with tag %d (0x%x) encountered",
-				    dp->tdir_tag,dp->tdir_tag);
-				/* the following knowingly leaks the 
-				   anonymous field structure */
-				if (!_TIFFMergeFields(tif,
-					_TIFFCreateAnonField(tif,
-						dp->tdir_tag,
-						(TIFFDataType) dp->tdir_type),
-					1)) {
-					TIFFWarningExt(tif->tif_clientdata,
-					    module,
-					    "Registering anonymous field with tag %d (0x%x) failed",
-					    dp->tdir_tag,
-					    dp->tdir_tag);
-					dp->tdir_ignore = TRUE;
-				} else {
-					TIFFReadDirectoryFindFieldInfo(tif,dp->tdir_tag,&fii);
-					assert(fii != FAILED_FII);
-				}
-			}
-		}
-		if (!dp->tdir_ignore)
-		{
-			fip=tif->tif_fields[fii];
-			if (fip->field_bit==FIELD_IGNORE)
-				dp->tdir_ignore = TRUE;
-			else
-			{
-				switch (dp->tdir_tag)
-				{
-					case TIFFTAG_STRIPOFFSETS:
-					case TIFFTAG_STRIPBYTECOUNTS:
-					case TIFFTAG_TILEOFFSETS:
-					case TIFFTAG_TILEBYTECOUNTS:
-						TIFFSetFieldBit(tif,fip->field_bit);
-						break;
-					case TIFFTAG_IMAGEWIDTH:
-					case TIFFTAG_IMAGELENGTH:
-					case TIFFTAG_IMAGEDEPTH:
-					case TIFFTAG_TILELENGTH:
-					case TIFFTAG_TILEWIDTH:
-					case TIFFTAG_TILEDEPTH:
-					case TIFFTAG_PLANARCONFIG:
-					case TIFFTAG_ROWSPERSTRIP:
-					case TIFFTAG_EXTRASAMPLES:
-						if (!TIFFFetchNormalTag(tif,dp,0))
-							goto bad;
-						dp->tdir_ignore = TRUE;
-						break;
-					default:
-						if( !_TIFFCheckFieldIsValidForCodec(tif, dp->tdir_tag) )
-							dp->tdir_ignore = TRUE;
-						break;
-				}
-			}
-		}
-	}
-	/*
-	 * XXX: OJPEG hack.
-	 * If a) compression is OJPEG, b) planarconfig tag says it's separate,
-	 * c) strip offsets/bytecounts tag are both present and
-	 * d) both contain exactly one value, then we consistently find
-	 * that the buggy implementation of the buggy compression scheme
-	 * matches contig planarconfig best. So we 'fix-up' the tag here
-	 */
-	if ((tif->tif_dir.td_compression==COMPRESSION_OJPEG)&&
-	    (tif->tif_dir.td_planarconfig==PLANARCONFIG_SEPARATE))
-	{
-		if (!_TIFFFillStriles(tif))
-		    goto bad;
-		dp=TIFFReadDirectoryFindEntry(tif,dir,dircount,TIFFTAG_STRIPOFFSETS);
-		if ((dp!=0)&&(dp->tdir_count==1))
-		{
-			dp=TIFFReadDirectoryFindEntry(tif,dir,dircount,
-			    TIFFTAG_STRIPBYTECOUNTS);
-			if ((dp!=0)&&(dp->tdir_count==1))
-			{
-				tif->tif_dir.td_planarconfig=PLANARCONFIG_CONTIG;
-				TIFFWarningExt(tif->tif_clientdata,module,
-				    "Planarconfig tag value assumed incorrect, "
-				    "assuming data is contig instead of chunky");
-			}
-		}
-	}
-	/*
-	 * Allocate directory structure and setup defaults.
-	 */
-	if (!TIFFFieldSet(tif,FIELD_IMAGEDIMENSIONS))
-	{
-		MissingRequired(tif,"ImageLength");
-		goto bad;
-	}
-	/*
-	 * Setup appropriate structures (by strip or by tile)
-	 */
-	if (!TIFFFieldSet(tif, FIELD_TILEDIMENSIONS)) {
-		tif->tif_dir.td_nstrips = TIFFNumberOfStrips(tif);  
-		tif->tif_dir.td_tilewidth = tif->tif_dir.td_imagewidth;
-		tif->tif_dir.td_tilelength = tif->tif_dir.td_rowsperstrip;
-		tif->tif_dir.td_tiledepth = tif->tif_dir.td_imagedepth;
-		tif->tif_flags &= ~TIFF_ISTILED;
-	} else {
-		tif->tif_dir.td_nstrips = TIFFNumberOfTiles(tif);
-		tif->tif_flags |= TIFF_ISTILED;
-	}
-	if (!tif->tif_dir.td_nstrips) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Cannot handle zero number of %s",
-		    isTiled(tif) ? "tiles" : "strips");
-		goto bad;
-	}
-	tif->tif_dir.td_stripsperimage = tif->tif_dir.td_nstrips;
-	if (tif->tif_dir.td_planarconfig == PLANARCONFIG_SEPARATE)
-		tif->tif_dir.td_stripsperimage /= tif->tif_dir.td_samplesperpixel;
-	if (!TIFFFieldSet(tif, FIELD_STRIPOFFSETS)) {
-#ifdef OJPEG_SUPPORT
-		if ((tif->tif_dir.td_compression==COMPRESSION_OJPEG) &&
-		    (isTiled(tif)==0) &&
-		    (tif->tif_dir.td_nstrips==1)) {
-			/*
-			 * XXX: OJPEG hack.
-			 * If a) compression is OJPEG, b) it's not a tiled TIFF,
-			 * and c) the number of strips is 1,
-			 * then we tolerate the absence of stripoffsets tag,
-			 * because, presumably, all required data is in the
-			 * JpegInterchangeFormat stream.
-			 */
-			TIFFSetFieldBit(tif, FIELD_STRIPOFFSETS);
-		} else
-#endif
+        break;
+        case TIFF_SETGET_C16_SINT64:
         {
-			MissingRequired(tif,
-				isTiled(tif) ? "TileOffsets" : "StripOffsets");
-			goto bad;
-		}
-	}
-	/*
-	 * Second pass: extract other information.
-	 */
-	for (di=0, dp=dir; di<dircount; di++, dp++)
-	{
-		if (!dp->tdir_ignore) {
-			switch (dp->tdir_tag) 
-			{
-				case TIFFTAG_MINSAMPLEVALUE:
-				case TIFFTAG_MAXSAMPLEVALUE:
-				case TIFFTAG_BITSPERSAMPLE:
-				case TIFFTAG_DATATYPE:
-				case TIFFTAG_SAMPLEFORMAT:
-					/*
-					 * The MinSampleValue, MaxSampleValue, BitsPerSample
-					 * DataType and SampleFormat tags are supposed to be
-					 * written as one value/sample, but some vendors
-					 * incorrectly write one value only -- so we accept
-					 * that as well (yuck). Other vendors write correct
-					 * value for NumberOfSamples, but incorrect one for
-					 * BitsPerSample and friends, and we will read this
-					 * too.
-					 */
-					{
-						uint16 value;
-						enum TIFFReadDirEntryErr err;
-						err=TIFFReadDirEntryShort(tif,dp,&value);
-						if (err==TIFFReadDirEntryErrCount)
-							err=TIFFReadDirEntryPersampleShort(tif,dp,&value);
-						if (err!=TIFFReadDirEntryErrOk)
-						{
-							fip = TIFFFieldWithTag(tif,dp->tdir_tag);
-							TIFFReadDirEntryOutputErr(tif,err,module,fip ? fip->field_name : "unknown tagname",0);
-							goto bad;
-						}
-						if (!TIFFSetField(tif,dp->tdir_tag,value))
-							goto bad;
-						if( dp->tdir_tag == TIFFTAG_BITSPERSAMPLE )
-						    bitspersample_read = TRUE;
-					}
-					break;
-				case TIFFTAG_SMINSAMPLEVALUE:
-				case TIFFTAG_SMAXSAMPLEVALUE:
-					{
-
-						double *data = NULL;
-						enum TIFFReadDirEntryErr err;
-						uint32 saved_flags;
-						int m;
-						if (dp->tdir_count != (uint64)tif->tif_dir.td_samplesperpixel)
-							err = TIFFReadDirEntryErrCount;
-						else
-							err = TIFFReadDirEntryDoubleArray(tif, dp, &data);
-						if (err!=TIFFReadDirEntryErrOk)
-						{
-							fip = TIFFFieldWithTag(tif,dp->tdir_tag);
-							TIFFReadDirEntryOutputErr(tif,err,module,fip ? fip->field_name : "unknown tagname",0);
-							goto bad;
-						}
-						saved_flags = tif->tif_flags;
-						tif->tif_flags |= TIFF_PERSAMPLE;
-						m = TIFFSetField(tif,dp->tdir_tag,data);
-						tif->tif_flags = saved_flags;
-						_TIFFfree(data);
-						if (!m)
-							goto bad;
-					}
-					break;
-				case TIFFTAG_STRIPOFFSETS:
-				case TIFFTAG_TILEOFFSETS:
-					switch( dp->tdir_type )
-					{
-					    case TIFF_SHORT:
-					    case TIFF_LONG:
-					    case TIFF_LONG8:
-					        break;
-					    default:
-                                                /* Warn except if directory typically created with TIFFDeferStrileArrayWriting() */
-                                                if( !(tif->tif_mode == O_RDWR &&
-                                                      dp->tdir_count == 0 &&
-                                                      dp->tdir_type == 0 &&
-                                                      dp->tdir_offset.toff_long8 == 0) )
-                                                {
-                                                    fip = TIFFFieldWithTag(tif,dp->tdir_tag);
-                                                    TIFFWarningExt(tif->tif_clientdata,module,
-                                                                   "Invalid data type for tag %s",
-                                                                   fip ? fip->field_name : "unknown tagname");
-                                                }
-                                                break;
-                                        }
-					_TIFFmemcpy( &(tif->tif_dir.td_stripoffset_entry),
-					   dp, sizeof(TIFFDirEntry) );
-					break;
-				case TIFFTAG_STRIPBYTECOUNTS:
-				case TIFFTAG_TILEBYTECOUNTS:
-					switch( dp->tdir_type )
-					{
-					    case TIFF_SHORT:
-					    case TIFF_LONG:
-					    case TIFF_LONG8:
-					        break;
-					    default:
-						/* Warn except if directory typically created with TIFFDeferStrileArrayWriting() */
-                                                if( !(tif->tif_mode == O_RDWR &&
-                                                      dp->tdir_count == 0 &&
-                                                      dp->tdir_type == 0 &&
-                                                      dp->tdir_offset.toff_long8 == 0) )
-                                                {
-                                                    fip = TIFFFieldWithTag(tif,dp->tdir_tag);
-                                                    TIFFWarningExt(tif->tif_clientdata,module,
-                                                                   "Invalid data type for tag %s",
-                                                                   fip ? fip->field_name : "unknown tagname");
-                                                }
-                                                break;
-                                        }
-					_TIFFmemcpy( &(tif->tif_dir.td_stripbytecount_entry),
-					   dp, sizeof(TIFFDirEntry) );
-					break;
-				case TIFFTAG_COLORMAP:
-				case TIFFTAG_TRANSFERFUNCTION:
-					{
-						enum TIFFReadDirEntryErr err;
-						uint32 countpersample;
-						uint32 countrequired;
-						uint32 incrementpersample;
-						uint16* value=NULL;
-						/* It would be dangerous to instantiate those tag values */
-						/* since if td_bitspersample has not yet been read (due to */
-						/* unordered tags), it could be read afterwards with a */
-						/* values greater than the default one (1), which may cause */
-						/* crashes in user code */
-						if( !bitspersample_read )
-						{
-							fip = TIFFFieldWithTag(tif,dp->tdir_tag);
-							TIFFWarningExt(tif->tif_clientdata,module,
-								"Ignoring %s since BitsPerSample tag not found",
-								fip ? fip->field_name : "unknown tagname");
-							continue;
-						}
-						/* ColorMap or TransferFunction for high bit */
-						/* depths do not make much sense and could be */
-						/* used as a denial of service vector */
-						if (tif->tif_dir.td_bitspersample > 24)
-						{
-							fip = TIFFFieldWithTag(tif,dp->tdir_tag);
-							TIFFWarningExt(tif->tif_clientdata,module,
-								"Ignoring %s because BitsPerSample=%d>24",
-								fip ? fip->field_name : "unknown tagname",
-								tif->tif_dir.td_bitspersample);
-							continue;
-						}
-						countpersample=(1U<<tif->tif_dir.td_bitspersample);
-						if ((dp->tdir_tag==TIFFTAG_TRANSFERFUNCTION)&&(dp->tdir_count==(uint64)countpersample))
-						{
-							countrequired=countpersample;
-							incrementpersample=0;
-						}
-						else
-						{
-							countrequired=3*countpersample;
-							incrementpersample=countpersample;
-						}
-						if (dp->tdir_count!=(uint64)countrequired)
-							err=TIFFReadDirEntryErrCount;
-						else
-							err=TIFFReadDirEntryShortArray(tif,dp,&value);
-						if (err!=TIFFReadDirEntryErrOk)
-						{
-							fip = TIFFFieldWithTag(tif,dp->tdir_tag);
-							TIFFReadDirEntryOutputErr(tif,err,module,fip ? fip->field_name : "unknown tagname",1);
-						}
-						else
-						{
-							TIFFSetField(tif,dp->tdir_tag,value,value+incrementpersample,value+2*incrementpersample);
-							_TIFFfree(value);
-						}
-					}
-					break;
-/* BEGIN REV 4.0 COMPATIBILITY */
-				case TIFFTAG_OSUBFILETYPE:
-					{
-						uint16 valueo;
-						uint32 value;
-						if (TIFFReadDirEntryShort(tif,dp,&valueo)==TIFFReadDirEntryErrOk)
-						{
-							switch (valueo)
-							{
-								case OFILETYPE_REDUCEDIMAGE: value=FILETYPE_REDUCEDIMAGE; break;
-								case OFILETYPE_PAGE: value=FILETYPE_PAGE; break;
-								default: value=0; break;
-							}
-							if (value!=0)
-								TIFFSetField(tif,TIFFTAG_SUBFILETYPE,value);
-						}
-					}
-					break;
-/* END REV 4.0 COMPATIBILITY */
-				default:
-					(void) TIFFFetchNormalTag(tif, dp, TRUE);
-					break;
-				}
-			} /* -- if (!dp->tdir_ignore) */
-		} /* -- for-loop -- */
-
-        if( tif->tif_mode == O_RDWR &&
-            tif->tif_dir.td_stripoffset_entry.tdir_tag != 0 &&
-            tif->tif_dir.td_stripoffset_entry.tdir_count == 0 &&
-            tif->tif_dir.td_stripoffset_entry.tdir_type == 0 &&
-            tif->tif_dir.td_stripoffset_entry.tdir_offset.toff_long8 == 0 &&
-            tif->tif_dir.td_stripbytecount_entry.tdir_tag != 0 &&
-            tif->tif_dir.td_stripbytecount_entry.tdir_count == 0 &&
-            tif->tif_dir.td_stripbytecount_entry.tdir_type == 0 &&
-            tif->tif_dir.td_stripbytecount_entry.tdir_offset.toff_long8 == 0 )
-        {
-            /* Directory typically created with TIFFDeferStrileArrayWriting() */
-            TIFFSetupStrips(tif);
-        }
-        else if( !(tif->tif_flags&TIFF_DEFERSTRILELOAD) )
-        {
-            if( tif->tif_dir.td_stripoffset_entry.tdir_tag != 0 )
-            {
-                if (!TIFFFetchStripThing(tif,&(tif->tif_dir.td_stripoffset_entry),
-                                         tif->tif_dir.td_nstrips,
-                                         &tif->tif_dir.td_stripoffset_p))
+            int64_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE);
+            assert(fip->field_passcount == 1);
+            if (dp->tdir_count > 0xFFFF)
+                err = TIFFReadDirEntryErrCount;
+            else
+            {
+                err = TIFFReadDirEntrySlong8Array(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
                 {
-                    goto bad;
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag,
+                                     (uint16_t)(dp->tdir_count), data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
                 }
             }
-            if( tif->tif_dir.td_stripbytecount_entry.tdir_tag != 0 )
+        }
+        break;
+        case TIFF_SETGET_C16_FLOAT:
+        {
+            float *data;
+            assert(fip->field_readcount == TIFF_VARIABLE);
+            assert(fip->field_passcount == 1);
+            if (dp->tdir_count > 0xFFFF)
+                err = TIFFReadDirEntryErrCount;
+            else
             {
-                if (!TIFFFetchStripThing(tif,&(tif->tif_dir.td_stripbytecount_entry),
-                                         tif->tif_dir.td_nstrips,
-                                         &tif->tif_dir.td_stripbytecount_p))
+                err = TIFFReadDirEntryFloatArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
                 {
-                    goto bad;
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag,
+                                     (uint16_t)(dp->tdir_count), data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
                 }
             }
         }
-
-	/*
-	 * OJPEG hack:
-	 * - If a) compression is OJPEG, and b) photometric tag is missing,
-	 * then we consistently find that photometric should be YCbCr
-	 * - If a) compression is OJPEG, and b) photometric tag says it's RGB,
-	 * then we consistently find that the buggy implementation of the
-	 * buggy compression scheme matches photometric YCbCr instead.
-	 * - If a) compression is OJPEG, and b) bitspersample tag is missing,
-	 * then we consistently find bitspersample should be 8.
-	 * - If a) compression is OJPEG, b) samplesperpixel tag is missing,
-	 * and c) photometric is RGB or YCbCr, then we consistently find
-	 * samplesperpixel should be 3
-	 * - If a) compression is OJPEG, b) samplesperpixel tag is missing,
-	 * and c) photometric is MINISWHITE or MINISBLACK, then we consistently
-	 * find samplesperpixel should be 3
-	 */
-	if (tif->tif_dir.td_compression==COMPRESSION_OJPEG)
-	{
-		if (!TIFFFieldSet(tif,FIELD_PHOTOMETRIC))
-		{
-			TIFFWarningExt(tif->tif_clientdata, module,
-			    "Photometric tag is missing, assuming data is YCbCr");
-			if (!TIFFSetField(tif,TIFFTAG_PHOTOMETRIC,PHOTOMETRIC_YCBCR))
-				goto bad;
-		}
-		else if (tif->tif_dir.td_photometric==PHOTOMETRIC_RGB)
-		{
-			tif->tif_dir.td_photometric=PHOTOMETRIC_YCBCR;
-			TIFFWarningExt(tif->tif_clientdata, module,
-			    "Photometric tag value assumed incorrect, "
-			    "assuming data is YCbCr instead of RGB");
-		}
-		if (!TIFFFieldSet(tif,FIELD_BITSPERSAMPLE))
-		{
-			TIFFWarningExt(tif->tif_clientdata,module,
-			    "BitsPerSample tag is missing, assuming 8 bits per sample");
-			if (!TIFFSetField(tif,TIFFTAG_BITSPERSAMPLE,8))
-				goto bad;
-		}
-		if (!TIFFFieldSet(tif,FIELD_SAMPLESPERPIXEL))
-		{
-			if (tif->tif_dir.td_photometric==PHOTOMETRIC_RGB)
-			{
-				TIFFWarningExt(tif->tif_clientdata,module,
-				    "SamplesPerPixel tag is missing, "
-				    "assuming correct SamplesPerPixel value is 3");
-				if (!TIFFSetField(tif,TIFFTAG_SAMPLESPERPIXEL,3))
-					goto bad;
-			}
-			if (tif->tif_dir.td_photometric==PHOTOMETRIC_YCBCR)
-			{
-				TIFFWarningExt(tif->tif_clientdata,module,
-				    "SamplesPerPixel tag is missing, "
-				    "applying correct SamplesPerPixel value of 3");
-				if (!TIFFSetField(tif,TIFFTAG_SAMPLESPERPIXEL,3))
-					goto bad;
-			}
-			else if ((tif->tif_dir.td_photometric==PHOTOMETRIC_MINISWHITE)
-				 || (tif->tif_dir.td_photometric==PHOTOMETRIC_MINISBLACK))
-			{
-				/*
-				 * SamplesPerPixel tag is missing, but is not required
-				 * by spec.  Assume correct SamplesPerPixel value of 1.
-				 */
-				if (!TIFFSetField(tif,TIFFTAG_SAMPLESPERPIXEL,1))
-					goto bad;
-			}
-		}
-	}
-
-	/*
-	 * Make sure all non-color channels are extrasamples.
-	 * If it's not the case, define them as such.
-	 */
-        color_channels = _TIFFGetMaxColorChannels(tif->tif_dir.td_photometric);
-        if (color_channels && tif->tif_dir.td_samplesperpixel - tif->tif_dir.td_extrasamples > color_channels) {
-                uint16 old_extrasamples;
-                uint16 *new_sampleinfo;
-
-                TIFFWarningExt(tif->tif_clientdata,module, "Sum of Photometric type-related "
-                    "color channels and ExtraSamples doesn't match SamplesPerPixel. "
-                    "Defining non-color channels as ExtraSamples.");
-
-                old_extrasamples = tif->tif_dir.td_extrasamples;
-                tif->tif_dir.td_extrasamples = (uint16) (tif->tif_dir.td_samplesperpixel - color_channels);
-
-                // sampleinfo should contain information relative to these new extra samples
-                new_sampleinfo = (uint16*) _TIFFcalloc(tif->tif_dir.td_extrasamples, sizeof(uint16));
-                if (!new_sampleinfo) {
-                    TIFFErrorExt(tif->tif_clientdata, module, "Failed to allocate memory for "
-                                "temporary new sampleinfo array (%d 16 bit elements)",
-                                tif->tif_dir.td_extrasamples);
-                    goto bad;
+        break;
+        case TIFF_SETGET_C16_DOUBLE:
+        {
+            double *data;
+            assert(fip->field_readcount == TIFF_VARIABLE);
+            assert(fip->field_passcount == 1);
+            if (dp->tdir_count > 0xFFFF)
+                err = TIFFReadDirEntryErrCount;
+            else
+            {
+                err = TIFFReadDirEntryDoubleArray(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag,
+                                     (uint16_t)(dp->tdir_count), data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
                 }
-
-                memcpy(new_sampleinfo, tif->tif_dir.td_sampleinfo, old_extrasamples * sizeof(uint16));
-                _TIFFsetShortArray(&tif->tif_dir.td_sampleinfo, new_sampleinfo, tif->tif_dir.td_extrasamples);
-                _TIFFfree(new_sampleinfo);
-        }
-
-	/*
-	 * Verify Palette image has a Colormap.
-	 */
-	if (tif->tif_dir.td_photometric == PHOTOMETRIC_PALETTE &&
-	    !TIFFFieldSet(tif, FIELD_COLORMAP)) {
-		if ( tif->tif_dir.td_bitspersample>=8 && tif->tif_dir.td_samplesperpixel==3)
-			tif->tif_dir.td_photometric = PHOTOMETRIC_RGB;
-		else if (tif->tif_dir.td_bitspersample>=8)
-			tif->tif_dir.td_photometric = PHOTOMETRIC_MINISBLACK;
-		else {
-			MissingRequired(tif, "Colormap");
-			goto bad;
-		}
-	}
-	/*
-	 * OJPEG hack:
-	 * We do no further messing with strip/tile offsets/bytecounts in OJPEG
-	 * TIFFs
-	 */
-	if (tif->tif_dir.td_compression!=COMPRESSION_OJPEG)
-	{
-		/*
-		 * Attempt to deal with a missing StripByteCounts tag.
-		 */
-		if (!TIFFFieldSet(tif, FIELD_STRIPBYTECOUNTS)) {
-			/*
-			 * Some manufacturers violate the spec by not giving
-			 * the size of the strips.  In this case, assume there
-			 * is one uncompressed strip of data.
-			 */
-			if ((tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG &&
-			    tif->tif_dir.td_nstrips > 1) ||
-			    (tif->tif_dir.td_planarconfig == PLANARCONFIG_SEPARATE &&
-			     tif->tif_dir.td_nstrips != (uint32)tif->tif_dir.td_samplesperpixel)) {
-			    MissingRequired(tif, "StripByteCounts");
-			    goto bad;
-			}
-			TIFFWarningExt(tif->tif_clientdata, module,
-				"TIFF directory is missing required "
-				"\"StripByteCounts\" field, calculating from imagelength");
-			if (EstimateStripByteCounts(tif, dir, dircount) < 0)
-			    goto bad;
-
-		} else if (tif->tif_dir.td_nstrips == 1
-                           && !(tif->tif_flags&TIFF_ISTILED)
-			   && ByteCountLooksBad(tif)) {
-			/*
-			 * XXX: Plexus (and others) sometimes give a value of
-			 * zero for a tag when they don't know what the
-			 * correct value is!  Try and handle the simple case
-			 * of estimating the size of a one strip image.
-			 */
-			TIFFWarningExt(tif->tif_clientdata, module,
-			    "Bogus \"StripByteCounts\" field, ignoring and calculating from imagelength");
-			if(EstimateStripByteCounts(tif, dir, dircount) < 0)
-			    goto bad;
-
-		} else if (!(tif->tif_flags&TIFF_DEFERSTRILELOAD)
-			   && tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG
-			   && tif->tif_dir.td_nstrips > 2
-			   && tif->tif_dir.td_compression == COMPRESSION_NONE
-			   && TIFFGetStrileByteCount(tif, 0) != TIFFGetStrileByteCount(tif, 1)
-			   && TIFFGetStrileByteCount(tif, 0) != 0
-			   && TIFFGetStrileByteCount(tif, 1) != 0 ) {
-			/*
-			 * XXX: Some vendors fill StripByteCount array with
-			 * absolutely wrong values (it can be equal to
-			 * StripOffset array, for example). Catch this case
-			 * here.
-                         *
-                         * We avoid this check if deferring strile loading
-                         * as it would always force us to load the strip/tile
-                         * information.
-			 */
-			TIFFWarningExt(tif->tif_clientdata, module,
-			    "Wrong \"StripByteCounts\" field, ignoring and calculating from imagelength");
-			if (EstimateStripByteCounts(tif, dir, dircount) < 0)
-			    goto bad;
-		}
-	}
-	if (dir)
-	{
-		_TIFFfree(dir);
-		dir=NULL;
-	}
-	if (!TIFFFieldSet(tif, FIELD_MAXSAMPLEVALUE))
-	{
-		if (tif->tif_dir.td_bitspersample>=16)
-			tif->tif_dir.td_maxsamplevalue=0xFFFF;
-		else
-			tif->tif_dir.td_maxsamplevalue = (uint16)((1L<<tif->tif_dir.td_bitspersample)-1);
-	}
-
-#ifdef STRIPBYTECOUNTSORTED_UNUSED
-	/*
-	 * XXX: We can optimize checking for the strip bounds using the sorted
-	 * bytecounts array. See also comments for TIFFAppendToStrip()
-	 * function in tif_write.c.
-	 */
-	if (!(tif->tif_flags&TIFF_DEFERSTRILELOAD) && tif->tif_dir.td_nstrips > 1) {
-		uint32 strip;
-
-		tif->tif_dir.td_stripbytecountsorted = 1;
-		for (strip = 1; strip < tif->tif_dir.td_nstrips; strip++) {
-			if (TIFFGetStrileOffset(tif, strip - 1) >
-			    TIFFGetStrileOffset(tif, strip)) {
-				tif->tif_dir.td_stripbytecountsorted = 0;
-				break;
-			}
-		}
-	}
-#endif
-
-	/*
-	 * An opportunity for compression mode dependent tag fixup
-	 */
-	(*tif->tif_fixuptags)(tif);
-
-	/*
-	 * Some manufacturers make life difficult by writing
-	 * large amounts of uncompressed data as a single strip.
-	 * This is contrary to the recommendations of the spec.
-	 * The following makes an attempt at breaking such images
-	 * into strips closer to the recommended 8k bytes.  A
-	 * side effect, however, is that the RowsPerStrip tag
-	 * value may be changed.
-	 */
-	if ((tif->tif_dir.td_planarconfig==PLANARCONFIG_CONTIG)&&
-	    (tif->tif_dir.td_nstrips==1)&&
-	    (tif->tif_dir.td_compression==COMPRESSION_NONE)&&  
-	    ((tif->tif_flags&(TIFF_STRIPCHOP|TIFF_ISTILED))==TIFF_STRIPCHOP))
+            }
+        }
+        break;
+        case TIFF_SETGET_C16_IFD8:
         {
-            ChopUpSingleUncompressedStrip(tif);
+            uint64_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE);
+            assert(fip->field_passcount == 1);
+            if (dp->tdir_count > 0xFFFF)
+                err = TIFFReadDirEntryErrCount;
+            else
+            {
+                err = TIFFReadDirEntryIfd8Array(tif, dp, &data);
+                if (err == TIFFReadDirEntryErrOk)
+                {
+                    int m;
+                    m = TIFFSetField(tif, dp->tdir_tag,
+                                     (uint16_t)(dp->tdir_count), data);
+                    if (data != 0)
+                        _TIFFfreeExt(tif, data);
+                    if (!m)
+                        return (0);
+                }
+            }
         }
-
-        /* There are also uncompressed striped files with strips larger than */
-        /* 2 GB, which make them unfriendly with a lot of code. If possible, */
-        /* try to expose smaller "virtual" strips. */
-        if( tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG &&
-            tif->tif_dir.td_compression == COMPRESSION_NONE &&
-            (tif->tif_flags&(TIFF_STRIPCHOP|TIFF_ISTILED)) == TIFF_STRIPCHOP &&
-            TIFFStripSize64(tif) > 0x7FFFFFFFUL )
+        break;
+        case TIFF_SETGET_C32_ASCII:
         {
-            TryChopUpUncompressedBigTiff(tif);
+            uint8_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE2);
+            assert(fip->field_passcount == 1);
+            err = TIFFReadDirEntryByteArray(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                if (data != 0 && dp->tdir_count > 0 &&
+                    data[dp->tdir_count - 1] != '\0')
+                {
+                    TIFFWarningExtR(tif, module,
+                                    "ASCII value for tag \"%s\" does not end "
+                                    "in null byte. Forcing it to be null",
+                                    fip->field_name);
+                    data[dp->tdir_count - 1] = '\0';
+                }
+                m = TIFFSetField(tif, dp->tdir_tag, (uint32_t)(dp->tdir_count),
+                                 data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
         }
-
-        /*
-         * Clear the dirty directory flag. 
-         */
-	tif->tif_flags &= ~TIFF_DIRTYDIRECT;
-	tif->tif_flags &= ~TIFF_DIRTYSTRIP;
-
-	/*
-	 * Reinitialize i/o since we are starting on a new directory.
-	 */
-	tif->tif_row = (uint32) -1;
-	tif->tif_curstrip = (uint32) -1;
-	tif->tif_col = (uint32) -1;
-	tif->tif_curtile = (uint32) -1;
-	tif->tif_tilesize = (tmsize_t) -1;
-
-	tif->tif_scanlinesize = TIFFScanlineSize(tif);
-	if (!tif->tif_scanlinesize) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Cannot handle zero scanline size");
-		return (0);
-	}
-
-	if (isTiled(tif)) {
-		tif->tif_tilesize = TIFFTileSize(tif);
-		if (!tif->tif_tilesize) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			     "Cannot handle zero tile size");
-			return (0);
-		}
-	} else {
-		if (!TIFFStripSize(tif)) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Cannot handle zero strip size");
-			return (0);
-		}
-	}
-	return (1);
-bad:
-	if (dir)
-		_TIFFfree(dir);
-	return (0);
-}
-
-static void
-TIFFReadDirectoryCheckOrder(TIFF* tif, TIFFDirEntry* dir, uint16 dircount)
-{
-	static const char module[] = "TIFFReadDirectoryCheckOrder";
-	uint16 m;
-	uint16 n;
-	TIFFDirEntry* o;
-	m=0;
-	for (n=0, o=dir; n<dircount; n++, o++)
-	{
-		if (o->tdir_tag<m)
-		{
-			TIFFWarningExt(tif->tif_clientdata,module,
-			    "Invalid TIFF directory; tags are not sorted in ascending order");
-			break;
-		}
-		m=o->tdir_tag+1;
-	}
-}
-
-static TIFFDirEntry*
-TIFFReadDirectoryFindEntry(TIFF* tif, TIFFDirEntry* dir, uint16 dircount, uint16 tagid)
-{
-	TIFFDirEntry* m;
-	uint16 n;
-	(void) tif;
-	for (m=dir, n=0; n<dircount; m++, n++)
-	{
-		if (m->tdir_tag==tagid)
-			return(m);
-	}
-	return(0);
-}
-
-static void
-TIFFReadDirectoryFindFieldInfo(TIFF* tif, uint16 tagid, uint32* fii)
-{
-	int32 ma,mb,mc;
-	ma=-1;
-	mc=(int32)tif->tif_nfields;
-	while (1)
-	{
-		if (ma+1==mc)
-		{
-			*fii = FAILED_FII;
-			return;
-		}
-		mb=(ma+mc)/2;
-		if (tif->tif_fields[mb]->field_tag==(uint32)tagid)
-			break;
-		if (tif->tif_fields[mb]->field_tag<(uint32)tagid)
-			ma=mb;
-		else
-			mc=mb;
-	}
-	while (1)
-	{
-		if (mb==0)
-			break;
-		if (tif->tif_fields[mb-1]->field_tag!=(uint32)tagid)
-			break;
-		mb--;
-	}
-	*fii=mb;
-}
-
-/*
- * Read custom directory from the arbitrary offset.
- * The code is very similar to TIFFReadDirectory().
- */
-int
-TIFFReadCustomDirectory(TIFF* tif, toff_t diroff,
-			const TIFFFieldArray* infoarray)
-{
-	static const char module[] = "TIFFReadCustomDirectory";
-	TIFFDirEntry* dir;
-	uint16 dircount;
-	TIFFDirEntry* dp;
-	uint16 di;
-	const TIFFField* fip;
-	uint32 fii;
-        (*tif->tif_cleanup)(tif);   /* cleanup any previous compression state */
-	_TIFFSetupFields(tif, infoarray);
-	dircount=TIFFFetchDirectory(tif,diroff,&dir,NULL);
-	if (!dircount)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,
-		    "Failed to read custom directory at offset " TIFF_UINT64_FORMAT,diroff);
-		return 0;
-	}
-	TIFFFreeDirectory(tif);
-	_TIFFmemset(&tif->tif_dir, 0, sizeof(TIFFDirectory));
-	TIFFReadDirectoryCheckOrder(tif,dir,dircount);
-	for (di=0, dp=dir; di<dircount; di++, dp++)
-	{
-		TIFFReadDirectoryFindFieldInfo(tif,dp->tdir_tag,&fii);
-		if (fii == FAILED_FII)
-		{
-			TIFFWarningExt(tif->tif_clientdata, module,
-			    "Unknown field with tag %d (0x%x) encountered",
-			    dp->tdir_tag, dp->tdir_tag);
-			if (!_TIFFMergeFields(tif, _TIFFCreateAnonField(tif,
-						dp->tdir_tag,
-						(TIFFDataType) dp->tdir_type),
-					     1)) {
-				TIFFWarningExt(tif->tif_clientdata, module,
-				    "Registering anonymous field with tag %d (0x%x) failed",
-				    dp->tdir_tag, dp->tdir_tag);
-				dp->tdir_ignore = TRUE;
-			} else {
-				TIFFReadDirectoryFindFieldInfo(tif,dp->tdir_tag,&fii);
-				assert( fii != FAILED_FII );
-			}
-		}
-		if (!dp->tdir_ignore)
-		{
-			fip=tif->tif_fields[fii];
-			if (fip->field_bit==FIELD_IGNORE)
-				dp->tdir_ignore = TRUE;
-			else
-			{
-				/* check data type */
-				while ((fip->field_type!=TIFF_ANY)&&(fip->field_type!=dp->tdir_type))
-				{
-					fii++;
-					if ((fii==tif->tif_nfields)||
-					    (tif->tif_fields[fii]->field_tag!=(uint32)dp->tdir_tag))
-					{
-						fii=0xFFFF;
-						break;
-					}
-					fip=tif->tif_fields[fii];
-				}
-				if (fii==0xFFFF)
-				{
-					TIFFWarningExt(tif->tif_clientdata, module,
-					    "Wrong data type %d for \"%s\"; tag ignored",
-					    dp->tdir_type,fip->field_name);
-					dp->tdir_ignore = TRUE;
-				}
-				else
-				{
-					/* check count if known in advance */
-					if ((fip->field_readcount!=TIFF_VARIABLE)&&
-					    (fip->field_readcount!=TIFF_VARIABLE2))
-					{
-						uint32 expected;
-						if (fip->field_readcount==TIFF_SPP)
-							expected=(uint32)tif->tif_dir.td_samplesperpixel;
-						else
-							expected=(uint32)fip->field_readcount;
-						if (!CheckDirCount(tif,dp,expected))
-							dp->tdir_ignore = TRUE;
-					}
-				}
-			}
-			if (!dp->tdir_ignore) {
-				switch (dp->tdir_tag) 
-				{
-					case EXIFTAG_SUBJECTDISTANCE:
-						(void)TIFFFetchSubjectDistance(tif, dp);
-						break;
-					default:
-						(void)TIFFFetchNormalTag(tif, dp, TRUE);
-						break;
-				}
-			} /*-- if (!dp->tdir_ignore) */
-		}
-	}
-	if (dir)
-		_TIFFfree(dir);
-	return 1;
-}
-
-/*
- * EXIF is important special case of custom IFD, so we have a special
- * function to read it.
- */
-int
-TIFFReadEXIFDirectory(TIFF* tif, toff_t diroff)
-{
-	const TIFFFieldArray* exifFieldArray;
-	exifFieldArray = _TIFFGetExifFields();
-	return TIFFReadCustomDirectory(tif, diroff, exifFieldArray);  
-}
-
-/*
- *--: EXIF-GPS custom directory reading as another special case of custom IFD.
- */
-int
-TIFFReadGPSDirectory(TIFF* tif, toff_t diroff)
-{
-	const TIFFFieldArray* gpsFieldArray;
-	gpsFieldArray = _TIFFGetGpsFields();
-	return TIFFReadCustomDirectory(tif, diroff, gpsFieldArray);  
-}
-
-static int
-EstimateStripByteCounts(TIFF* tif, TIFFDirEntry* dir, uint16 dircount)
-{
-	static const char module[] = "EstimateStripByteCounts";
-
-	TIFFDirEntry *dp;
-	TIFFDirectory *td = &tif->tif_dir;
-	uint32 strip;
-
-    /* Do not try to load stripbytecount as we will compute it */
-        if( !_TIFFFillStrilesInternal( tif, 0 ) )
-            return -1;
-
-	if (td->td_stripbytecount_p)
-		_TIFFfree(td->td_stripbytecount_p);
-	td->td_stripbytecount_p = (uint64*)
-	    _TIFFCheckMalloc(tif, td->td_nstrips, sizeof (uint64),
-		"for \"StripByteCounts\" array");
-        if( td->td_stripbytecount_p == NULL )
-            return -1;
-
-	if (td->td_compression != COMPRESSION_NONE) {
-		uint64 space;
-		uint64 filesize;
-		uint16 n;
-		filesize = TIFFGetFileSize(tif);
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-			space=sizeof(TIFFHeaderClassic)+2+dircount*12+4;
-		else
-			space=sizeof(TIFFHeaderBig)+8+dircount*20+8;
-		/* calculate amount of space used by indirect values */
-		for (dp = dir, n = dircount; n > 0; n--, dp++)
-		{
-			uint32 typewidth;
-			uint64 datasize;
-			typewidth = TIFFDataWidth((TIFFDataType) dp->tdir_type);
-			if (typewidth == 0) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-				    "Cannot determine size of unknown tag type %d",
-				    dp->tdir_type);
-				return -1;
-			}
-			if( dp->tdir_count > TIFF_UINT64_MAX / typewidth )
-                            return -1;
-			datasize=(uint64)typewidth*dp->tdir_count;
-			if (!(tif->tif_flags&TIFF_BIGTIFF))
-			{
-				if (datasize<=4)
-					datasize=0;
-			}
-			else
-			{
-				if (datasize<=8)
-					datasize=0;
-			}
-			if( space > TIFF_UINT64_MAX - datasize )
-                            return -1;
-			space+=datasize;
-		}
-		if( filesize < space )
-                    /* we should perhaps return in error ? */
-                    space = filesize;
-                else
-                    space = filesize - space;
-		if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
-			space /= td->td_samplesperpixel;
-		for (strip = 0; strip < td->td_nstrips; strip++)
-			td->td_stripbytecount_p[strip] = space;
-		/*
-		 * This gross hack handles the case were the offset to
-		 * the last strip is past the place where we think the strip
-		 * should begin.  Since a strip of data must be contiguous,
-		 * it's safe to assume that we've overestimated the amount
-		 * of data in the strip and trim this number back accordingly.
-		 */
-		strip--;
-                if (td->td_stripoffset_p[strip] > TIFF_UINT64_MAX - td->td_stripbytecount_p[strip])
-                    return -1;
-		if (td->td_stripoffset_p[strip]+td->td_stripbytecount_p[strip] > filesize) {
-                    if( td->td_stripoffset_p[strip] >= filesize ) {
-                        /* Not sure what we should in that case... */
-                        td->td_stripbytecount_p[strip] = 0;
-                    } else {
-                        td->td_stripbytecount_p[strip] = filesize - td->td_stripoffset_p[strip];
-                    }
+        break;
+        case TIFF_SETGET_C32_UINT8:
+        {
+            uint8_t *data;
+            uint32_t count = 0;
+            assert(fip->field_readcount == TIFF_VARIABLE2);
+            assert(fip->field_passcount == 1);
+            if (fip->field_tag == TIFFTAG_RICHTIFFIPTC &&
+                dp->tdir_type == TIFF_LONG)
+            {
+                /* Adobe's software (wrongly) writes RichTIFFIPTC tag with
+                 * data type LONG instead of UNDEFINED. Work around this
+                 * frequently found issue */
+                void *origdata;
+                err = TIFFReadDirEntryArray(tif, dp, &count, 4, &origdata);
+                if ((err != TIFFReadDirEntryErrOk) || (origdata == 0))
+                {
+                    data = NULL;
                 }
-	} else if (isTiled(tif)) {
-		uint64 bytespertile = TIFFTileSize64(tif);
-
-		for (strip = 0; strip < td->td_nstrips; strip++)
-		    td->td_stripbytecount_p[strip] = bytespertile;
-	} else {
-		uint64 rowbytes = TIFFScanlineSize64(tif);
-		uint32 rowsperstrip = td->td_imagelength/td->td_stripsperimage;
-		for (strip = 0; strip < td->td_nstrips; strip++)
+                else
                 {
-                    if( rowbytes > 0 && rowsperstrip > TIFF_UINT64_MAX / rowbytes )
-                        return -1;
-                    td->td_stripbytecount_p[strip] = rowbytes * rowsperstrip;
+                    if (tif->tif_flags & TIFF_SWAB)
+                        TIFFSwabArrayOfLong((uint32_t *)origdata, count);
+                    data = (uint8_t *)origdata;
+                    count = (uint32_t)(count * 4);
                 }
-	}
-	TIFFSetFieldBit(tif, FIELD_STRIPBYTECOUNTS);
-	if (!TIFFFieldSet(tif, FIELD_ROWSPERSTRIP))
-		td->td_rowsperstrip = td->td_imagelength;
-	return 1;
-}
-
-static void
-MissingRequired(TIFF* tif, const char* tagname)
-{
-	static const char module[] = "MissingRequired";
-
-	TIFFErrorExt(tif->tif_clientdata, module,
-	    "TIFF directory is missing required \"%s\" field",
-	    tagname);
-}
-
-/*
- * Check the directory offset against the list of already seen directory
- * offsets. This is a trick to prevent IFD looping. The one can create TIFF
- * file with looped directory pointers. We will maintain a list of already
- * seen directories and check every IFD offset against that list.
- */
-static int
-TIFFCheckDirOffset(TIFF* tif, uint64 diroff)
-{
-	uint16 n;
-
-	if (diroff == 0)			/* no more directories */
-		return 0;
-	if (tif->tif_dirnumber == 65535) {
-	    TIFFErrorExt(tif->tif_clientdata, "TIFFCheckDirOffset",
-			 "Cannot handle more than 65535 TIFF directories");
-	    return 0;
-	}
-
-	for (n = 0; n < tif->tif_dirnumber && tif->tif_dirlist; n++) {
-		if (tif->tif_dirlist[n] == diroff)
-			return 0;
-	}
-
-	tif->tif_dirnumber++;
-
-	if (tif->tif_dirlist == NULL || tif->tif_dirnumber > tif->tif_dirlistsize) {
-		uint64* new_dirlist;
-
-		/*
-		 * XXX: Reduce memory allocation granularity of the dirlist
-		 * array.
-		 */
-		new_dirlist = (uint64*)_TIFFCheckRealloc(tif, tif->tif_dirlist,
-		    tif->tif_dirnumber, 2 * sizeof(uint64), "for IFD list");
-		if (!new_dirlist)
-			return 0;
-		if( tif->tif_dirnumber >= 32768 )
-		    tif->tif_dirlistsize = 65535;
-		else
-		    tif->tif_dirlistsize = 2 * tif->tif_dirnumber;
-		tif->tif_dirlist = new_dirlist;
-	}
-
-	tif->tif_dirlist[tif->tif_dirnumber - 1] = diroff;
-
-	return 1;
-}
-
-/*
- * Check the count field of a directory entry against a known value.  The
- * caller is expected to skip/ignore the tag if there is a mismatch.
- */
-static int
-CheckDirCount(TIFF* tif, TIFFDirEntry* dir, uint32 count)
-{
-	if ((uint64)count > dir->tdir_count) {
-		const TIFFField* fip = TIFFFieldWithTag(tif, dir->tdir_tag);
-		TIFFWarningExt(tif->tif_clientdata, tif->tif_name,
-	"incorrect count for field \"%s\" (" TIFF_UINT64_FORMAT ", expecting %u); tag ignored",
-		    fip ? fip->field_name : "unknown tagname",
-		    dir->tdir_count, count);
-		return (0);
-	} else if ((uint64)count < dir->tdir_count) {
-		const TIFFField* fip = TIFFFieldWithTag(tif, dir->tdir_tag);
-		TIFFWarningExt(tif->tif_clientdata, tif->tif_name,
-	"incorrect count for field \"%s\" (" TIFF_UINT64_FORMAT ", expecting %u); tag trimmed",
-		    fip ? fip->field_name : "unknown tagname",
-		    dir->tdir_count, count);
-		dir->tdir_count = count;
-		return (1);
-	}
-	return (1);
-}
-
-/*
- * Read IFD structure from the specified offset. If the pointer to
- * nextdiroff variable has been specified, read it too. Function returns a
- * number of fields in the directory or 0 if failed.
- */
-static uint16
-TIFFFetchDirectory(TIFF* tif, uint64 diroff, TIFFDirEntry** pdir,
-                   uint64 *nextdiroff)
-{
-	static const char module[] = "TIFFFetchDirectory";
-
-	void* origdir;
-	uint16 dircount16;
-	uint32 dirsize;
-	TIFFDirEntry* dir;
-	uint8* ma;
-	TIFFDirEntry* mb;
-	uint16 n;
-
-	assert(pdir);
-
-	tif->tif_diroff = diroff;
-	if (nextdiroff)
-		*nextdiroff = 0;
-	if (!isMapped(tif)) {
-		if (!SeekOK(tif, tif->tif_diroff)) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-				"%s: Seek error accessing TIFF directory",
-				tif->tif_name);
-			return 0;
-		}
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-		{
-			if (!ReadOK(tif, &dircount16, sizeof (uint16))) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-				    "%s: Can not read TIFF directory count",
-				    tif->tif_name);
-				return 0;
-			}
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabShort(&dircount16);
-			if (dircount16>4096)
-			{
-				TIFFErrorExt(tif->tif_clientdata, module,
-				    "Sanity check on directory count failed, this is probably not a valid IFD offset");
-				return 0;
-			}
-			dirsize = 12;
-		} else {
-			uint64 dircount64;
-			if (!ReadOK(tif, &dircount64, sizeof (uint64))) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-					"%s: Can not read TIFF directory count",
-					tif->tif_name);
-				return 0;
-			}
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabLong8(&dircount64);
-			if (dircount64>4096)
-			{
-				TIFFErrorExt(tif->tif_clientdata, module,
-				    "Sanity check on directory count failed, this is probably not a valid IFD offset");
-				return 0;
-			}
-			dircount16 = (uint16)dircount64;
-			dirsize = 20;
-		}
-		origdir = _TIFFCheckMalloc(tif, dircount16,
-		    dirsize, "to read TIFF directory");
-		if (origdir == NULL)
-			return 0;
-		if (!ReadOK(tif, origdir, (tmsize_t)(dircount16*dirsize))) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-				"%.100s: Can not read TIFF directory",
-				tif->tif_name);
-			_TIFFfree(origdir);
-			return 0;
-		}
-		/*
-		 * Read offset to next directory for sequential scans if
-		 * needed.
-		 */
-		if (nextdiroff)
-		{
-			if (!(tif->tif_flags&TIFF_BIGTIFF))
-			{
-				uint32 nextdiroff32;
-				if (!ReadOK(tif, &nextdiroff32, sizeof(uint32)))
-					nextdiroff32 = 0;
-				if (tif->tif_flags&TIFF_SWAB)
-					TIFFSwabLong(&nextdiroff32);
-				*nextdiroff=nextdiroff32;
-			} else {
-				if (!ReadOK(tif, nextdiroff, sizeof(uint64)))
-					*nextdiroff = 0;
-				if (tif->tif_flags&TIFF_SWAB)
-					TIFFSwabLong8(nextdiroff);
-			}
-		}
-	} else {
-		tmsize_t m;
-		tmsize_t off;
-		if (tif->tif_diroff > (uint64)TIFF_INT64_MAX)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"Can not read TIFF directory count");
-			return(0);
-		}
-		off = (tmsize_t) tif->tif_diroff;
-
-		/*
-		 * Check for integer overflow when validating the dir_off,
-		 * otherwise a very high offset may cause an OOB read and
-		 * crash the client. Make two comparisons instead of
-		 *
-		 *  off + sizeof(uint16) > tif->tif_size
-		 *
-		 * to avoid overflow.
-		 */
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-		{
-			m=off+sizeof(uint16);
-			if ((m<off)||(m<(tmsize_t)sizeof(uint16))||(m>tif->tif_size)) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-					"Can not read TIFF directory count");
-				return 0;
-			} else {
-				_TIFFmemcpy(&dircount16, tif->tif_base + off,
-					    sizeof(uint16));
-			}
-			off += sizeof (uint16);
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabShort(&dircount16);
-			if (dircount16>4096)
-			{
-				TIFFErrorExt(tif->tif_clientdata, module,
-				    "Sanity check on directory count failed, this is probably not a valid IFD offset");
-				return 0;
-			}
-			dirsize = 12;
-		}
-		else
-		{
-			uint64 dircount64;
-			m=off+sizeof(uint64);
-			if ((m<off)||(m<(tmsize_t)sizeof(uint64))||(m>tif->tif_size)) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-					"Can not read TIFF directory count");
-				return 0;
-			} else {
-				_TIFFmemcpy(&dircount64, tif->tif_base + off,
-					    sizeof(uint64));
-			}
-			off += sizeof (uint64);
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabLong8(&dircount64);
-			if (dircount64>4096)
-			{
-				TIFFErrorExt(tif->tif_clientdata, module,
-				    "Sanity check on directory count failed, this is probably not a valid IFD offset");
-				return 0;
-			}
-			dircount16 = (uint16)dircount64;
-			dirsize = 20;
-		}
-		if (dircount16 == 0 )
-		{
-			TIFFErrorExt(tif->tif_clientdata, module,
-			             "Sanity check on directory count failed, zero tag directories not supported");
-			return 0;
-		}
-		origdir = _TIFFCheckMalloc(tif, dircount16,
-						dirsize,
-						"to read TIFF directory");
-		if (origdir == NULL)
-			return 0;
-		m=off+dircount16*dirsize;
-		if ((m<off)||(m<(tmsize_t)(dircount16*dirsize))||(m>tif->tif_size)) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-				     "Can not read TIFF directory");
-			_TIFFfree(origdir);
-			return 0;
-		} else {
-			_TIFFmemcpy(origdir, tif->tif_base + off,
-				    dircount16 * dirsize);
-		}
-		if (nextdiroff) {
-			off += dircount16 * dirsize;
-			if (!(tif->tif_flags&TIFF_BIGTIFF))
-			{
-				uint32 nextdiroff32;
-				m=off+sizeof(uint32);
-				if ((m<off)||(m<(tmsize_t)sizeof(uint32))||(m>tif->tif_size))
-					nextdiroff32 = 0;
-				else
-					_TIFFmemcpy(&nextdiroff32, tif->tif_base + off,
-						    sizeof (uint32));
-				if (tif->tif_flags&TIFF_SWAB)
-					TIFFSwabLong(&nextdiroff32);
-				*nextdiroff = nextdiroff32;
-			}
-			else
-			{
-				m=off+sizeof(uint64);
-				if ((m<off)||(m<(tmsize_t)sizeof(uint64))||(m>tif->tif_size))
-					*nextdiroff = 0;
-				else
-					_TIFFmemcpy(nextdiroff, tif->tif_base + off,
-						    sizeof (uint64));
-				if (tif->tif_flags&TIFF_SWAB)
-					TIFFSwabLong8(nextdiroff);
-			}
-		}
-	}
-	dir = (TIFFDirEntry*)_TIFFCheckMalloc(tif, dircount16,
-						sizeof(TIFFDirEntry),
-						"to read TIFF directory");
-	if (dir==0)
-	{
-		_TIFFfree(origdir);
-		return 0;
-	}
-	ma=(uint8*)origdir;
-	mb=dir;
-	for (n=0; n<dircount16; n++)
-	{
-		mb->tdir_ignore = FALSE;
-		if (tif->tif_flags&TIFF_SWAB)
-			TIFFSwabShort((uint16*)ma);
-		mb->tdir_tag=*(uint16*)ma;
-		ma+=sizeof(uint16);
-		if (tif->tif_flags&TIFF_SWAB)
-			TIFFSwabShort((uint16*)ma);
-		mb->tdir_type=*(uint16*)ma;
-		ma+=sizeof(uint16);
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-		{
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabLong((uint32*)ma);
-			mb->tdir_count=(uint64)(*(uint32*)ma);
-			ma+=sizeof(uint32);
-			mb->tdir_offset.toff_long8=0;
-			*(uint32*)(&mb->tdir_offset)=*(uint32*)ma;
-			ma+=sizeof(uint32);
-		}
-		else
-		{
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabLong8((uint64*)ma);
-                        mb->tdir_count=TIFFReadUInt64(ma);
-			ma+=sizeof(uint64);
-			mb->tdir_offset.toff_long8=TIFFReadUInt64(ma);
-			ma+=sizeof(uint64);
-		}
-		mb++;
-	}
-	_TIFFfree(origdir);
-	*pdir = dir;
-	return dircount16;
-}
-
-/*
- * Fetch a tag that is not handled by special case code.
- */
-static int
-TIFFFetchNormalTag(TIFF* tif, TIFFDirEntry* dp, int recover)
-{
-	static const char module[] = "TIFFFetchNormalTag";
-	enum TIFFReadDirEntryErr err;
-	uint32 fii;
-	const TIFFField* fip = NULL;
-	TIFFReadDirectoryFindFieldInfo(tif,dp->tdir_tag,&fii);
-        if( fii == FAILED_FII )
-        {
-            TIFFErrorExt(tif->tif_clientdata, "TIFFFetchNormalTag",
-                         "No definition found for tag %d",
-                         dp->tdir_tag);
-            return 0;
+            }
+            else
+            {
+                err = TIFFReadDirEntryByteArray(tif, dp, &data);
+                count = (uint32_t)(dp->tdir_count);
+            }
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                m = TIFFSetField(tif, dp->tdir_tag, count, data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_C32_SINT8:
+        {
+            int8_t *data = NULL;
+            assert(fip->field_readcount == TIFF_VARIABLE2);
+            assert(fip->field_passcount == 1);
+            err = TIFFReadDirEntrySbyteArray(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                m = TIFFSetField(tif, dp->tdir_tag, (uint32_t)(dp->tdir_count),
+                                 data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_C32_UINT16:
+        {
+            uint16_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE2);
+            assert(fip->field_passcount == 1);
+            err = TIFFReadDirEntryShortArray(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                m = TIFFSetField(tif, dp->tdir_tag, (uint32_t)(dp->tdir_count),
+                                 data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_C32_SINT16:
+        {
+            int16_t *data = NULL;
+            assert(fip->field_readcount == TIFF_VARIABLE2);
+            assert(fip->field_passcount == 1);
+            err = TIFFReadDirEntrySshortArray(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                m = TIFFSetField(tif, dp->tdir_tag, (uint32_t)(dp->tdir_count),
+                                 data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_C32_UINT32:
+        {
+            uint32_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE2);
+            assert(fip->field_passcount == 1);
+            err = TIFFReadDirEntryLongArray(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                m = TIFFSetField(tif, dp->tdir_tag, (uint32_t)(dp->tdir_count),
+                                 data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_C32_SINT32:
+        {
+            int32_t *data = NULL;
+            assert(fip->field_readcount == TIFF_VARIABLE2);
+            assert(fip->field_passcount == 1);
+            err = TIFFReadDirEntrySlongArray(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                m = TIFFSetField(tif, dp->tdir_tag, (uint32_t)(dp->tdir_count),
+                                 data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_C32_UINT64:
+        {
+            uint64_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE2);
+            assert(fip->field_passcount == 1);
+            err = TIFFReadDirEntryLong8Array(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                m = TIFFSetField(tif, dp->tdir_tag, (uint32_t)(dp->tdir_count),
+                                 data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_C32_SINT64:
+        {
+            int64_t *data = NULL;
+            assert(fip->field_readcount == TIFF_VARIABLE2);
+            assert(fip->field_passcount == 1);
+            err = TIFFReadDirEntrySlong8Array(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                m = TIFFSetField(tif, dp->tdir_tag, (uint32_t)(dp->tdir_count),
+                                 data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_C32_FLOAT:
+        {
+            float *data;
+            assert(fip->field_readcount == TIFF_VARIABLE2);
+            assert(fip->field_passcount == 1);
+            err = TIFFReadDirEntryFloatArray(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                m = TIFFSetField(tif, dp->tdir_tag, (uint32_t)(dp->tdir_count),
+                                 data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_C32_DOUBLE:
+        {
+            double *data;
+            assert(fip->field_readcount == TIFF_VARIABLE2);
+            assert(fip->field_passcount == 1);
+            err = TIFFReadDirEntryDoubleArray(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                m = TIFFSetField(tif, dp->tdir_tag, (uint32_t)(dp->tdir_count),
+                                 data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
+        }
+        break;
+        case TIFF_SETGET_C32_IFD8:
+        {
+            uint64_t *data;
+            assert(fip->field_readcount == TIFF_VARIABLE2);
+            assert(fip->field_passcount == 1);
+            err = TIFFReadDirEntryIfd8Array(tif, dp, &data);
+            if (err == TIFFReadDirEntryErrOk)
+            {
+                int m;
+                m = TIFFSetField(tif, dp->tdir_tag, (uint32_t)(dp->tdir_count),
+                                 data);
+                if (data != 0)
+                    _TIFFfreeExt(tif, data);
+                if (!m)
+                    return (0);
+            }
         }
-	fip=tif->tif_fields[fii];
-	assert(fip != NULL); /* should not happen */
-	assert(fip->set_field_type!=TIFF_SETGET_OTHER);  /* if so, we shouldn't arrive here but deal with this in specialized code */
-	assert(fip->set_field_type!=TIFF_SETGET_INT);    /* if so, we shouldn't arrive here as this is only the case for pseudo-tags */
-	err=TIFFReadDirEntryErrOk;
-	switch (fip->set_field_type)
-	{
-		case TIFF_SETGET_UNDEFINED:
-			break;
-		case TIFF_SETGET_ASCII:
-			{
-				uint8* data;
-				assert(fip->field_passcount==0);
-				err=TIFFReadDirEntryByteArray(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					uint32 mb = 0;
-					int n;
-					if (data != NULL)
-					{
-					    uint8* ma = data;
-					    while (mb<(uint32)dp->tdir_count)
-					    {
-					            if (*ma==0)
-					                    break;
-					            ma++;
-					            mb++;
-					    }
-					}
-					if (mb+1<(uint32)dp->tdir_count)
-						TIFFWarningExt(tif->tif_clientdata,module,"ASCII value for tag \"%s\" contains null byte in value; value incorrectly truncated during reading due to implementation limitations",fip->field_name);
-					else if (mb+1>(uint32)dp->tdir_count)
-					{
-						uint8* o;
-						TIFFWarningExt(tif->tif_clientdata,module,"ASCII value for tag \"%s\" does not end in null byte",fip->field_name);
-						if ((uint32)dp->tdir_count+1!=dp->tdir_count+1)
-							o=NULL;
-						else
-							o=_TIFFmalloc((uint32)dp->tdir_count+1);
-						if (o==NULL)
-						{
-							if (data!=NULL)
-								_TIFFfree(data);
-							return(0);
-						}
-						_TIFFmemcpy(o,data,(uint32)dp->tdir_count);
-						o[(uint32)dp->tdir_count]=0;
-						if (data!=0)
-							_TIFFfree(data);
-						data=o;
-					}
-					n=TIFFSetField(tif,dp->tdir_tag,data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!n)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_UINT8:
-			{
-				uint8 data=0;
-				assert(fip->field_readcount==1);
-				assert(fip->field_passcount==0);
-				err=TIFFReadDirEntryByte(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					if (!TIFFSetField(tif,dp->tdir_tag,data))
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_UINT16:
-			{
-				uint16 data;
-				assert(fip->field_readcount==1);
-				assert(fip->field_passcount==0);
-				err=TIFFReadDirEntryShort(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					if (!TIFFSetField(tif,dp->tdir_tag,data))
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_UINT32:
-			{
-				uint32 data;
-				assert(fip->field_readcount==1);
-				assert(fip->field_passcount==0);
-				err=TIFFReadDirEntryLong(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					if (!TIFFSetField(tif,dp->tdir_tag,data))
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_UINT64:
-			{
-				uint64 data;
-				assert(fip->field_readcount==1);
-				assert(fip->field_passcount==0);
-				err=TIFFReadDirEntryLong8(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					if (!TIFFSetField(tif,dp->tdir_tag,data))
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_FLOAT:
-			{
-				float data;
-				assert(fip->field_readcount==1);
-				assert(fip->field_passcount==0);
-				err=TIFFReadDirEntryFloat(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					if (!TIFFSetField(tif,dp->tdir_tag,data))
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_DOUBLE:
-			{
-				double data;
-				assert(fip->field_readcount==1);
-				assert(fip->field_passcount==0);
-				err=TIFFReadDirEntryDouble(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					if (!TIFFSetField(tif,dp->tdir_tag,data))
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_IFD8:
-			{
-				uint64 data;
-				assert(fip->field_readcount==1);
-				assert(fip->field_passcount==0);
-				err=TIFFReadDirEntryIfd8(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					if (!TIFFSetField(tif,dp->tdir_tag,data))
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_UINT16_PAIR:
-			{
-				uint16* data;
-				assert(fip->field_readcount==2);
-				assert(fip->field_passcount==0);
-				if (dp->tdir_count!=2) {
-					TIFFWarningExt(tif->tif_clientdata,module,
-						       "incorrect count for field \"%s\", expected 2, got %d",
-						       fip->field_name,(int)dp->tdir_count);
-					return(0);
-				}
-				err=TIFFReadDirEntryShortArray(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-                                        assert(data); /* avoid CLang static Analyzer false positive */
-					m=TIFFSetField(tif,dp->tdir_tag,data[0],data[1]);
-					_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_C0_UINT8:
-			{
-				uint8* data;
-				assert(fip->field_readcount>=1);
-				assert(fip->field_passcount==0);
-				if (dp->tdir_count!=(uint64)fip->field_readcount) {
-					TIFFWarningExt(tif->tif_clientdata,module,
-						       "incorrect count for field \"%s\", expected %d, got %d",
-						       fip->field_name,(int) fip->field_readcount, (int)dp->tdir_count);
-					return 0;
-				}
-				else
-				{
-					err=TIFFReadDirEntryByteArray(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						m=TIFFSetField(tif,dp->tdir_tag,data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		case TIFF_SETGET_C0_UINT16:
-			{
-				uint16* data;
-				assert(fip->field_readcount>=1);
-				assert(fip->field_passcount==0);
-				if (dp->tdir_count!=(uint64)fip->field_readcount)
-                                    /* corrupt file */;
-				else
-				{
-					err=TIFFReadDirEntryShortArray(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						m=TIFFSetField(tif,dp->tdir_tag,data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		case TIFF_SETGET_C0_UINT32:
-			{
-				uint32* data;
-				assert(fip->field_readcount>=1);
-				assert(fip->field_passcount==0);
-				if (dp->tdir_count!=(uint64)fip->field_readcount)
-                                    /* corrupt file */;
-				else
-				{
-					err=TIFFReadDirEntryLongArray(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						m=TIFFSetField(tif,dp->tdir_tag,data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		case TIFF_SETGET_C0_FLOAT:
-			{
-				float* data;
-				assert(fip->field_readcount>=1);
-				assert(fip->field_passcount==0);
-				if (dp->tdir_count!=(uint64)fip->field_readcount)
-					/* corrupt file */;
-				else
-				{
-					err=TIFFReadDirEntryFloatArray(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						m=TIFFSetField(tif,dp->tdir_tag,data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		/*--: Rational2Double: Extend for Double Arrays and Rational-Arrays read into Double-Arrays. */
-		case TIFF_SETGET_C0_DOUBLE:
-			{
-				double* data;
-				assert(fip->field_readcount>=1);
-				assert(fip->field_passcount==0);
-				if (dp->tdir_count!=(uint64)fip->field_readcount)
-					/* corrupt file */;
-				else
-				{
-					err=TIFFReadDirEntryDoubleArray(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						m=TIFFSetField(tif,dp->tdir_tag,data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		case TIFF_SETGET_C16_ASCII:
-			{
-				uint8* data;
-				assert(fip->field_readcount==TIFF_VARIABLE);
-				assert(fip->field_passcount==1);
-				if (dp->tdir_count>0xFFFF)
-					err=TIFFReadDirEntryErrCount;
-				else
-				{
-					err=TIFFReadDirEntryByteArray(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						if( data != 0 && dp->tdir_count > 0 && data[dp->tdir_count-1] != '\0' )
-						{
-						    TIFFWarningExt(tif->tif_clientdata,module,"ASCII value for tag \"%s\" does not end in null byte. Forcing it to be null",fip->field_name);
-						    data[dp->tdir_count-1] = '\0';
-						}
-						m=TIFFSetField(tif,dp->tdir_tag,(uint16)(dp->tdir_count),data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		case TIFF_SETGET_C16_UINT8:
-			{
-				uint8* data;
-				assert(fip->field_readcount==TIFF_VARIABLE);
-				assert(fip->field_passcount==1);
-				if (dp->tdir_count>0xFFFF)
-					err=TIFFReadDirEntryErrCount;
-				else
-				{
-					err=TIFFReadDirEntryByteArray(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						m=TIFFSetField(tif,dp->tdir_tag,(uint16)(dp->tdir_count),data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		case TIFF_SETGET_C16_UINT16:
-			{
-				uint16* data;
-				assert(fip->field_readcount==TIFF_VARIABLE);
-				assert(fip->field_passcount==1);
-				if (dp->tdir_count>0xFFFF)
-					err=TIFFReadDirEntryErrCount;
-				else
-				{
-					err=TIFFReadDirEntryShortArray(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						m=TIFFSetField(tif,dp->tdir_tag,(uint16)(dp->tdir_count),data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		case TIFF_SETGET_C16_UINT32:
-			{
-				uint32* data;
-				assert(fip->field_readcount==TIFF_VARIABLE);
-				assert(fip->field_passcount==1);
-				if (dp->tdir_count>0xFFFF)
-					err=TIFFReadDirEntryErrCount;
-				else
-				{
-					err=TIFFReadDirEntryLongArray(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						m=TIFFSetField(tif,dp->tdir_tag,(uint16)(dp->tdir_count),data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		case TIFF_SETGET_C16_UINT64:
-			{
-				uint64* data;
-				assert(fip->field_readcount==TIFF_VARIABLE);
-				assert(fip->field_passcount==1);
-				if (dp->tdir_count>0xFFFF)
-					err=TIFFReadDirEntryErrCount;
-				else
-				{
-					err=TIFFReadDirEntryLong8Array(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						m=TIFFSetField(tif,dp->tdir_tag,(uint16)(dp->tdir_count),data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		case TIFF_SETGET_C16_FLOAT:
-			{
-				float* data;
-				assert(fip->field_readcount==TIFF_VARIABLE);
-				assert(fip->field_passcount==1);
-				if (dp->tdir_count>0xFFFF)
-					err=TIFFReadDirEntryErrCount;
-				else
-				{
-					err=TIFFReadDirEntryFloatArray(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						m=TIFFSetField(tif,dp->tdir_tag,(uint16)(dp->tdir_count),data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		case TIFF_SETGET_C16_DOUBLE:
-			{
-				double* data;
-				assert(fip->field_readcount==TIFF_VARIABLE);
-				assert(fip->field_passcount==1);
-				if (dp->tdir_count>0xFFFF)
-					err=TIFFReadDirEntryErrCount;
-				else
-				{
-					err=TIFFReadDirEntryDoubleArray(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						m=TIFFSetField(tif,dp->tdir_tag,(uint16)(dp->tdir_count),data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		case TIFF_SETGET_C16_IFD8:
-			{
-				uint64* data;
-				assert(fip->field_readcount==TIFF_VARIABLE);
-				assert(fip->field_passcount==1);
-				if (dp->tdir_count>0xFFFF)
-					err=TIFFReadDirEntryErrCount;
-				else
-				{
-					err=TIFFReadDirEntryIfd8Array(tif,dp,&data);
-					if (err==TIFFReadDirEntryErrOk)
-					{
-						int m;
-						m=TIFFSetField(tif,dp->tdir_tag,(uint16)(dp->tdir_count),data);
-						if (data!=0)
-							_TIFFfree(data);
-						if (!m)
-							return(0);
-					}
-				}
-			}
-			break;
-		case TIFF_SETGET_C32_ASCII:
-			{
-				uint8* data;
-				assert(fip->field_readcount==TIFF_VARIABLE2);
-				assert(fip->field_passcount==1);
-				err=TIFFReadDirEntryByteArray(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-					if( data != 0 && dp->tdir_count > 0 && data[dp->tdir_count-1] != '\0' )
-					{
-					    TIFFWarningExt(tif->tif_clientdata,module,"ASCII value for tag \"%s\" does not end in null byte. Forcing it to be null",fip->field_name);
-                                            data[dp->tdir_count-1] = '\0';
-					}
-					m=TIFFSetField(tif,dp->tdir_tag,(uint32)(dp->tdir_count),data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_C32_UINT8:
-			{
-				uint8* data;
-				assert(fip->field_readcount==TIFF_VARIABLE2);
-				assert(fip->field_passcount==1);
-				err=TIFFReadDirEntryByteArray(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-					m=TIFFSetField(tif,dp->tdir_tag,(uint32)(dp->tdir_count),data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_C32_SINT8:
-			{
-				int8* data = NULL;
-				assert(fip->field_readcount==TIFF_VARIABLE2);
-				assert(fip->field_passcount==1);
-				err=TIFFReadDirEntrySbyteArray(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-					m=TIFFSetField(tif,dp->tdir_tag,(uint32)(dp->tdir_count),data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_C32_UINT16:
-			{
-				uint16* data;
-				assert(fip->field_readcount==TIFF_VARIABLE2);
-				assert(fip->field_passcount==1);
-				err=TIFFReadDirEntryShortArray(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-					m=TIFFSetField(tif,dp->tdir_tag,(uint32)(dp->tdir_count),data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_C32_SINT16:
-			{
-				int16* data = NULL;
-				assert(fip->field_readcount==TIFF_VARIABLE2);
-				assert(fip->field_passcount==1);
-				err=TIFFReadDirEntrySshortArray(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-					m=TIFFSetField(tif,dp->tdir_tag,(uint32)(dp->tdir_count),data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_C32_UINT32:
-			{
-				uint32* data;
-				assert(fip->field_readcount==TIFF_VARIABLE2);
-				assert(fip->field_passcount==1);
-				err=TIFFReadDirEntryLongArray(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-					m=TIFFSetField(tif,dp->tdir_tag,(uint32)(dp->tdir_count),data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_C32_SINT32:
-			{
-				int32* data = NULL;
-				assert(fip->field_readcount==TIFF_VARIABLE2);
-				assert(fip->field_passcount==1);
-				err=TIFFReadDirEntrySlongArray(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-					m=TIFFSetField(tif,dp->tdir_tag,(uint32)(dp->tdir_count),data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_C32_UINT64:
-			{
-				uint64* data;
-				assert(fip->field_readcount==TIFF_VARIABLE2);
-				assert(fip->field_passcount==1);
-				err=TIFFReadDirEntryLong8Array(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-					m=TIFFSetField(tif,dp->tdir_tag,(uint32)(dp->tdir_count),data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_C32_SINT64:
-			{
-				int64* data = NULL;
-				assert(fip->field_readcount==TIFF_VARIABLE2);
-				assert(fip->field_passcount==1);
-				err=TIFFReadDirEntrySlong8Array(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-					m=TIFFSetField(tif,dp->tdir_tag,(uint32)(dp->tdir_count),data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_C32_FLOAT:
-			{
-				float* data;
-				assert(fip->field_readcount==TIFF_VARIABLE2);
-				assert(fip->field_passcount==1);
-				err=TIFFReadDirEntryFloatArray(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-					m=TIFFSetField(tif,dp->tdir_tag,(uint32)(dp->tdir_count),data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_C32_DOUBLE:
-			{
-				double* data;
-				assert(fip->field_readcount==TIFF_VARIABLE2);
-				assert(fip->field_passcount==1);
-				err=TIFFReadDirEntryDoubleArray(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-					m=TIFFSetField(tif,dp->tdir_tag,(uint32)(dp->tdir_count),data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		case TIFF_SETGET_C32_IFD8:
-			{
-				uint64* data;
-				assert(fip->field_readcount==TIFF_VARIABLE2);
-				assert(fip->field_passcount==1);
-				err=TIFFReadDirEntryIfd8Array(tif,dp,&data);
-				if (err==TIFFReadDirEntryErrOk)
-				{
-					int m;
-					m=TIFFSetField(tif,dp->tdir_tag,(uint32)(dp->tdir_count),data);
-					if (data!=0)
-						_TIFFfree(data);
-					if (!m)
-						return(0);
-				}
-			}
-			break;
-		default:
-			assert(0);    /* we should never get here */
-			break;
-	}
-	if (err!=TIFFReadDirEntryErrOk)
-	{
-		TIFFReadDirEntryOutputErr(tif,err,module,fip->field_name,recover);
-		return(0);
-	}
-	return(1);
+        break;
+        default:
+            assert(0); /* we should never get here */
+            break;
+    }
+    if (err != TIFFReadDirEntryErrOk)
+    {
+        TIFFReadDirEntryOutputErr(tif, err, module, fip->field_name, recover);
+        return (0);
+    }
+    return (1);
 }
 
 /*
  * Fetch a set of offsets or lengths.
  * While this routine says "strips", in fact it's also used for tiles.
  */
-static int
-TIFFFetchStripThing(TIFF* tif, TIFFDirEntry* dir, uint32 nstrips, uint64** lpp)
+static int TIFFFetchStripThing(TIFF *tif, TIFFDirEntry *dir, uint32_t nstrips,
+                               uint64_t **lpp)
 {
-	static const char module[] = "TIFFFetchStripThing";
-	enum TIFFReadDirEntryErr err;
-	uint64* data;
-	err=TIFFReadDirEntryLong8ArrayWithLimit(tif,dir,&data,nstrips);
-	if (err!=TIFFReadDirEntryErrOk)
-	{
-		const TIFFField* fip = TIFFFieldWithTag(tif,dir->tdir_tag); 
-		TIFFReadDirEntryOutputErr(tif,err,module,fip ? fip->field_name : "unknown tagname",0);
-		return(0);
-	}
-	if (dir->tdir_count<(uint64)nstrips)
-	{
-		uint64* resizeddata;
-		const TIFFField* fip = TIFFFieldWithTag(tif,dir->tdir_tag);
-		const char* pszMax = getenv("LIBTIFF_STRILE_ARRAY_MAX_RESIZE_COUNT");
-		uint32 max_nstrips = 1000000;
-		if( pszMax )
-			max_nstrips = (uint32) atoi(pszMax);
-		TIFFReadDirEntryOutputErr(tif,TIFFReadDirEntryErrCount,
-		            module,
-		            fip ? fip->field_name : "unknown tagname",
-		            ( nstrips <= max_nstrips ) );
-
-		if( nstrips > max_nstrips )
-		{
-			_TIFFfree(data);
-			return(0);
-		}
-
-		resizeddata=(uint64*)_TIFFCheckMalloc(tif,nstrips,sizeof(uint64),"for strip array");
-		if (resizeddata==0) {
-			_TIFFfree(data);
-			return(0);
-		}
-                _TIFFmemcpy(resizeddata,data,(uint32)dir->tdir_count*sizeof(uint64));
-                _TIFFmemset(resizeddata+(uint32)dir->tdir_count,0,(nstrips-(uint32)dir->tdir_count)*sizeof(uint64));
-		_TIFFfree(data);
-		data=resizeddata;
-	}
-	*lpp=data;
-	return(1);
+    static const char module[] = "TIFFFetchStripThing";
+    enum TIFFReadDirEntryErr err;
+    uint64_t *data;
+    err = TIFFReadDirEntryLong8ArrayWithLimit(tif, dir, &data, nstrips);
+    if (err != TIFFReadDirEntryErrOk)
+    {
+        const TIFFField *fip = TIFFFieldWithTag(tif, dir->tdir_tag);
+        TIFFReadDirEntryOutputErr(tif, err, module,
+                                  fip ? fip->field_name : "unknown tagname", 0);
+        return (0);
+    }
+    if (dir->tdir_count < (uint64_t)nstrips)
+    {
+        uint64_t *resizeddata;
+        const TIFFField *fip = TIFFFieldWithTag(tif, dir->tdir_tag);
+        const char *pszMax = getenv("LIBTIFF_STRILE_ARRAY_MAX_RESIZE_COUNT");
+        uint32_t max_nstrips = 1000000;
+        if (pszMax)
+            max_nstrips = (uint32_t)atoi(pszMax);
+        TIFFReadDirEntryOutputErr(tif, TIFFReadDirEntryErrCount, module,
+                                  fip ? fip->field_name : "unknown tagname",
+                                  (nstrips <= max_nstrips));
+
+        if (nstrips > max_nstrips)
+        {
+            _TIFFfreeExt(tif, data);
+            return (0);
+        }
+
+        resizeddata = (uint64_t *)_TIFFCheckMalloc(
+            tif, nstrips, sizeof(uint64_t), "for strip array");
+        if (resizeddata == 0)
+        {
+            _TIFFfreeExt(tif, data);
+            return (0);
+        }
+        if (dir->tdir_count)
+            _TIFFmemcpy(resizeddata, data,
+                        (uint32_t)dir->tdir_count * sizeof(uint64_t));
+        _TIFFmemset(resizeddata + (uint32_t)dir->tdir_count, 0,
+                    (nstrips - (uint32_t)dir->tdir_count) * sizeof(uint64_t));
+        _TIFFfreeExt(tif, data);
+        data = resizeddata;
+    }
+    *lpp = data;
+    return (1);
 }
 
 /*
  * Fetch and set the SubjectDistance EXIF tag.
  */
-static int
-TIFFFetchSubjectDistance(TIFF* tif, TIFFDirEntry* dir)
+static int TIFFFetchSubjectDistance(TIFF *tif, TIFFDirEntry *dir)
 {
-	static const char module[] = "TIFFFetchSubjectDistance";
-	enum TIFFReadDirEntryErr err;
-	UInt64Aligned_t m;
-    m.l=0;
-	assert(sizeof(double)==8);
-	assert(sizeof(uint64)==8);
-	assert(sizeof(uint32)==4);
-	if (dir->tdir_count!=1)
-		err=TIFFReadDirEntryErrCount;
-	else if (dir->tdir_type!=TIFF_RATIONAL)
-		err=TIFFReadDirEntryErrType;
-	else
-	{
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-		{
-			uint32 offset;
-			offset=*(uint32*)(&dir->tdir_offset);
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabLong(&offset);
-			err=TIFFReadDirEntryData(tif,offset,8,m.i);
-		}
-		else
-		{
-			m.l=dir->tdir_offset.toff_long8;
-			err=TIFFReadDirEntryErrOk;
-		}
-	}
-	if (err==TIFFReadDirEntryErrOk)
-	{
-		double n;
-		if (tif->tif_flags&TIFF_SWAB)
-			TIFFSwabArrayOfLong(m.i,2);
-		if (m.i[0]==0)
-			n=0.0;
-		else if (m.i[0]==0xFFFFFFFF || m.i[1]==0)
-			/*
-			 * XXX: Numerator 0xFFFFFFFF means that we have infinite
-			 * distance. Indicate that with a negative floating point
-			 * SubjectDistance value.
-			 */
-			n=-1.0;
-		else
-			n=(double)m.i[0]/(double)m.i[1];
-		return(TIFFSetField(tif,dir->tdir_tag,n));
-	}
-	else
-	{
-		TIFFReadDirEntryOutputErr(tif,err,module,"SubjectDistance",TRUE);
-		return(0);
-	}
+    static const char module[] = "TIFFFetchSubjectDistance";
+    enum TIFFReadDirEntryErr err;
+    UInt64Aligned_t m;
+    m.l = 0;
+    assert(sizeof(double) == 8);
+    assert(sizeof(uint64_t) == 8);
+    assert(sizeof(uint32_t) == 4);
+    if (dir->tdir_count != 1)
+        err = TIFFReadDirEntryErrCount;
+    else if (dir->tdir_type != TIFF_RATIONAL)
+        err = TIFFReadDirEntryErrType;
+    else
+    {
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+        {
+            uint32_t offset;
+            offset = *(uint32_t *)(&dir->tdir_offset);
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong(&offset);
+            err = TIFFReadDirEntryData(tif, offset, 8, m.i);
+        }
+        else
+        {
+            m.l = dir->tdir_offset.toff_long8;
+            err = TIFFReadDirEntryErrOk;
+        }
+    }
+    if (err == TIFFReadDirEntryErrOk)
+    {
+        double n;
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabArrayOfLong(m.i, 2);
+        if (m.i[0] == 0)
+            n = 0.0;
+        else if (m.i[0] == 0xFFFFFFFF || m.i[1] == 0)
+            /*
+             * XXX: Numerator 0xFFFFFFFF means that we have infinite
+             * distance. Indicate that with a negative floating point
+             * SubjectDistance value.
+             */
+            n = -1.0;
+        else
+            n = (double)m.i[0] / (double)m.i[1];
+        return (TIFFSetField(tif, dir->tdir_tag, n));
+    }
+    else
+    {
+        TIFFReadDirEntryOutputErr(tif, err, module, "SubjectDistance", TRUE);
+        return (0);
+    }
 }
 
-static void allocChoppedUpStripArrays(TIFF* tif, uint32 nstrips,
-                                      uint64 stripbytes, uint32 rowsperstrip)
+static void allocChoppedUpStripArrays(TIFF *tif, uint32_t nstrips,
+                                      uint64_t stripbytes,
+                                      uint32_t rowsperstrip)
 {
     TIFFDirectory *td = &tif->tif_dir;
-    uint64 bytecount;
-    uint64 offset;
-    uint64 last_offset;
-    uint64 last_bytecount;
-    uint32 i;
-    uint64 *newcounts;
-    uint64 *newoffsets;
+    uint64_t bytecount;
+    uint64_t offset;
+    uint64_t last_offset;
+    uint64_t last_bytecount;
+    uint32_t i;
+    uint64_t *newcounts;
+    uint64_t *newoffsets;
 
     offset = TIFFGetStrileOffset(tif, 0);
-    last_offset = TIFFGetStrileOffset(tif, td->td_nstrips-1);
-    last_bytecount = TIFFGetStrileByteCount(tif, td->td_nstrips-1);
-    if( last_offset > TIFF_UINT64_MAX - last_bytecount ||
-        last_offset + last_bytecount < offset )
+    last_offset = TIFFGetStrileOffset(tif, td->td_nstrips - 1);
+    last_bytecount = TIFFGetStrileByteCount(tif, td->td_nstrips - 1);
+    if (last_offset > UINT64_MAX - last_bytecount ||
+        last_offset + last_bytecount < offset)
     {
         return;
     }
     bytecount = last_offset + last_bytecount - offset;
 
-    newcounts = (uint64*) _TIFFCheckMalloc(tif, nstrips, sizeof (uint64),
-                            "for chopped \"StripByteCounts\" array");
-    newoffsets = (uint64*) _TIFFCheckMalloc(tif, nstrips, sizeof (uint64),
-                            "for chopped \"StripOffsets\" array");
-    if (newcounts == NULL || newoffsets == NULL) {
+    newcounts =
+        (uint64_t *)_TIFFCheckMalloc(tif, nstrips, sizeof(uint64_t),
+                                     "for chopped \"StripByteCounts\" array");
+    newoffsets = (uint64_t *)_TIFFCheckMalloc(
+        tif, nstrips, sizeof(uint64_t), "for chopped \"StripOffsets\" array");
+    if (newcounts == NULL || newoffsets == NULL)
+    {
         /*
-        * Unable to allocate new strip information, give up and use
-        * the original one strip information.
-        */
+         * Unable to allocate new strip information, give up and use
+         * the original one strip information.
+         */
         if (newcounts != NULL)
-            _TIFFfree(newcounts);
+            _TIFFfreeExt(tif, newcounts);
         if (newoffsets != NULL)
-            _TIFFfree(newoffsets);
+            _TIFFfreeExt(tif, newoffsets);
         return;
     }
 
@@ -5890,8 +7301,8 @@ static void allocChoppedUpStripArrays(TIFF* tif, uint32 nstrips,
     td->td_stripsperimage = td->td_nstrips = nstrips;
     TIFFSetField(tif, TIFFTAG_ROWSPERSTRIP, rowsperstrip);
 
-    _TIFFfree(td->td_stripbytecount_p);
-    _TIFFfree(td->td_stripoffset_p);
+    _TIFFfreeExt(tif, td->td_stripbytecount_p);
+    _TIFFfreeExt(tif, td->td_stripoffset_p);
     td->td_stripbytecount_p = newcounts;
     td->td_stripoffset_p = newoffsets;
 #ifdef STRIPBYTECOUNTSORTED_UNUSED
@@ -5900,77 +7311,75 @@ static void allocChoppedUpStripArrays(TIFF* tif, uint32 nstrips,
     tif->tif_flags |= TIFF_CHOPPEDUPARRAYS;
 }
 
-
 /*
  * Replace a single strip (tile) of uncompressed data by multiple strips
  * (tiles), each approximately STRIP_SIZE_DEFAULT bytes. This is useful for
  * dealing with large images or for dealing with machines with a limited
  * amount memory.
  */
-static void
-ChopUpSingleUncompressedStrip(TIFF* tif)
+static void ChopUpSingleUncompressedStrip(TIFF *tif)
 {
-	register TIFFDirectory *td = &tif->tif_dir;
-	uint64 bytecount;
-	uint64 offset;
-	uint32 rowblock;
-	uint64 rowblockbytes;
-	uint64 stripbytes;
-	uint32 nstrips;
-	uint32 rowsperstrip;
-
-	bytecount = TIFFGetStrileByteCount(tif, 0);
-        /* On a newly created file, just re-opened to be filled, we */
-        /* don't want strip chop to trigger as it is going to cause issues */
-        /* later ( StripOffsets and StripByteCounts improperly filled) . */
-        if( bytecount == 0 && tif->tif_mode != O_RDONLY )
-            return;
-	offset = TIFFGetStrileByteCount(tif, 0);
-	assert(td->td_planarconfig == PLANARCONFIG_CONTIG);
-	if ((td->td_photometric == PHOTOMETRIC_YCBCR)&&
-	    (!isUpSampled(tif)))
-		rowblock = td->td_ycbcrsubsampling[1];
-	else
-		rowblock = 1;
-	rowblockbytes = TIFFVTileSize64(tif, rowblock);
-	/*
-	 * Make the rows hold at least one scanline, but fill specified amount
-	 * of data if possible.
-	 */
-	if (rowblockbytes > STRIP_SIZE_DEFAULT) {
-		stripbytes = rowblockbytes;
-		rowsperstrip = rowblock;
-	} else if (rowblockbytes > 0 ) {
-		uint32 rowblocksperstrip;
-		rowblocksperstrip = (uint32) (STRIP_SIZE_DEFAULT / rowblockbytes);
-		rowsperstrip = rowblocksperstrip * rowblock;
-		stripbytes = rowblocksperstrip * rowblockbytes;
-	}
-	else
-	    return;
-
-	/*
-	 * never increase the number of rows per strip
-	 */
-	if (rowsperstrip >= td->td_rowsperstrip)
-		return;
-        nstrips = TIFFhowmany_32(td->td_imagelength, rowsperstrip);
-        if( nstrips == 0 )
-            return;
+    register TIFFDirectory *td = &tif->tif_dir;
+    uint64_t bytecount;
+    uint64_t offset;
+    uint32_t rowblock;
+    uint64_t rowblockbytes;
+    uint64_t stripbytes;
+    uint32_t nstrips;
+    uint32_t rowsperstrip;
+
+    bytecount = TIFFGetStrileByteCount(tif, 0);
+    /* On a newly created file, just re-opened to be filled, we */
+    /* don't want strip chop to trigger as it is going to cause issues */
+    /* later ( StripOffsets and StripByteCounts improperly filled) . */
+    if (bytecount == 0 && tif->tif_mode != O_RDONLY)
+        return;
+    offset = TIFFGetStrileByteCount(tif, 0);
+    assert(td->td_planarconfig == PLANARCONFIG_CONTIG);
+    if ((td->td_photometric == PHOTOMETRIC_YCBCR) && (!isUpSampled(tif)))
+        rowblock = td->td_ycbcrsubsampling[1];
+    else
+        rowblock = 1;
+    rowblockbytes = TIFFVTileSize64(tif, rowblock);
+    /*
+     * Make the rows hold at least one scanline, but fill specified amount
+     * of data if possible.
+     */
+    if (rowblockbytes > STRIP_SIZE_DEFAULT)
+    {
+        stripbytes = rowblockbytes;
+        rowsperstrip = rowblock;
+    }
+    else if (rowblockbytes > 0)
+    {
+        uint32_t rowblocksperstrip;
+        rowblocksperstrip = (uint32_t)(STRIP_SIZE_DEFAULT / rowblockbytes);
+        rowsperstrip = rowblocksperstrip * rowblock;
+        stripbytes = rowblocksperstrip * rowblockbytes;
+    }
+    else
+        return;
 
-        /* If we are going to allocate a lot of memory, make sure that the */
-        /* file is as big as needed */
-        if( tif->tif_mode == O_RDONLY &&
-            nstrips > 1000000 &&
-            (offset >= TIFFGetFileSize(tif) ||
-             stripbytes > (TIFFGetFileSize(tif) - offset) / (nstrips - 1)) )
-        {
-            return;
-        }
+    /*
+     * never increase the number of rows per strip
+     */
+    if (rowsperstrip >= td->td_rowsperstrip)
+        return;
+    nstrips = TIFFhowmany_32(td->td_imagelength, rowsperstrip);
+    if (nstrips == 0)
+        return;
 
-        allocChoppedUpStripArrays(tif, nstrips, stripbytes, rowsperstrip);
-}
+    /* If we are going to allocate a lot of memory, make sure that the */
+    /* file is as big as needed */
+    if (tif->tif_mode == O_RDONLY && nstrips > 1000000 &&
+        (offset >= TIFFGetFileSize(tif) ||
+         stripbytes > (TIFFGetFileSize(tif) - offset) / (nstrips - 1)))
+    {
+        return;
+    }
 
+    allocChoppedUpStripArrays(tif, nstrips, stripbytes, rowsperstrip);
+}
 
 /*
  * Replace a file with contiguous strips > 2 GB of uncompressed data by
@@ -5978,62 +7387,64 @@ ChopUpSingleUncompressedStrip(TIFF* tif)
  * dealing with large images or for dealing with machines with a limited
  * amount memory.
  */
-static void TryChopUpUncompressedBigTiff( TIFF* tif )
+static void TryChopUpUncompressedBigTiff(TIFF *tif)
 {
     TIFFDirectory *td = &tif->tif_dir;
-    uint32 rowblock;
-    uint64 rowblockbytes;
-    uint32 i;
-    uint64 stripsize;
-    uint32 rowblocksperstrip;
-    uint32 rowsperstrip;
-    uint64 stripbytes;
-    uint32 nstrips;
+    uint32_t rowblock;
+    uint64_t rowblockbytes;
+    uint32_t i;
+    uint64_t stripsize;
+    uint32_t rowblocksperstrip;
+    uint32_t rowsperstrip;
+    uint64_t stripbytes;
+    uint32_t nstrips;
 
     stripsize = TIFFStripSize64(tif);
 
-    assert( tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG );
-    assert( tif->tif_dir.td_compression == COMPRESSION_NONE );
-    assert( (tif->tif_flags&(TIFF_STRIPCHOP|TIFF_ISTILED)) == TIFF_STRIPCHOP );
-    assert( stripsize > 0x7FFFFFFFUL );
+    assert(tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG);
+    assert(tif->tif_dir.td_compression == COMPRESSION_NONE);
+    assert((tif->tif_flags & (TIFF_STRIPCHOP | TIFF_ISTILED)) ==
+           TIFF_STRIPCHOP);
+    assert(stripsize > 0x7FFFFFFFUL);
 
     /* On a newly created file, just re-opened to be filled, we */
     /* don't want strip chop to trigger as it is going to cause issues */
     /* later ( StripOffsets and StripByteCounts improperly filled) . */
-    if( TIFFGetStrileByteCount(tif, 0) == 0 && tif->tif_mode != O_RDONLY )
+    if (TIFFGetStrileByteCount(tif, 0) == 0 && tif->tif_mode != O_RDONLY)
         return;
 
-    if ((td->td_photometric == PHOTOMETRIC_YCBCR)&&
-        (!isUpSampled(tif)))
+    if ((td->td_photometric == PHOTOMETRIC_YCBCR) && (!isUpSampled(tif)))
         rowblock = td->td_ycbcrsubsampling[1];
     else
         rowblock = 1;
     rowblockbytes = TIFFVStripSize64(tif, rowblock);
-    if( rowblockbytes == 0 || rowblockbytes > 0x7FFFFFFFUL )
+    if (rowblockbytes == 0 || rowblockbytes > 0x7FFFFFFFUL)
     {
         /* In case of file with gigantic width */
         return;
     }
 
     /* Check that the strips are contiguous and of the expected size */
-    for( i = 0; i < td->td_nstrips; i++ )
+    for (i = 0; i < td->td_nstrips; i++)
     {
-        if( i == td->td_nstrips - 1 )
+        if (i == td->td_nstrips - 1)
         {
-            if( TIFFGetStrileByteCount(tif, i) < TIFFVStripSize64(
-                    tif, td->td_imagelength - i * td->td_rowsperstrip ) )
+            if (TIFFGetStrileByteCount(tif, i) <
+                TIFFVStripSize64(tif,
+                                 td->td_imagelength - i * td->td_rowsperstrip))
             {
                 return;
             }
         }
         else
         {
-            if( TIFFGetStrileByteCount(tif, i) != stripsize )
+            if (TIFFGetStrileByteCount(tif, i) != stripsize)
             {
                 return;
             }
-            if( i > 0 && TIFFGetStrileOffset(tif, i) !=
-                    TIFFGetStrileOffset(tif, i-1) + TIFFGetStrileByteCount(tif, i-1) )
+            if (i > 0 && TIFFGetStrileOffset(tif, i) !=
+                             TIFFGetStrileOffset(tif, i - 1) +
+                                 TIFFGetStrileByteCount(tif, i - 1))
             {
                 return;
             }
@@ -6041,27 +7452,26 @@ static void TryChopUpUncompressedBigTiff( TIFF* tif )
     }
 
     /* Aim for 512 MB strips (that will still be manageable by 32 bit builds */
-    rowblocksperstrip = (uint32) (512 * 1024 * 1024 / rowblockbytes);
-    if( rowblocksperstrip == 0 )
+    rowblocksperstrip = (uint32_t)(512 * 1024 * 1024 / rowblockbytes);
+    if (rowblocksperstrip == 0)
         rowblocksperstrip = 1;
     rowsperstrip = rowblocksperstrip * rowblock;
     stripbytes = rowblocksperstrip * rowblockbytes;
-    assert( stripbytes <= 0x7FFFFFFFUL );
+    assert(stripbytes <= 0x7FFFFFFFUL);
 
     nstrips = TIFFhowmany_32(td->td_imagelength, rowsperstrip);
-    if( nstrips == 0 )
+    if (nstrips == 0)
         return;
 
     /* If we are going to allocate a lot of memory, make sure that the */
     /* file is as big as needed */
-    if( tif->tif_mode == O_RDONLY &&
-        nstrips > 1000000 )
+    if (tif->tif_mode == O_RDONLY && nstrips > 1000000)
     {
-        uint64 last_offset = TIFFGetStrileOffset(tif, td->td_nstrips-1);
-        uint64 filesize = TIFFGetFileSize(tif);
-        uint64 last_bytecount = TIFFGetStrileByteCount(tif, td->td_nstrips-1);
-        if( last_offset > filesize ||
-            last_bytecount > filesize - last_offset )
+        uint64_t last_offset = TIFFGetStrileOffset(tif, td->td_nstrips - 1);
+        uint64_t filesize = TIFFGetFileSize(tif);
+        uint64_t last_bytecount =
+            TIFFGetStrileByteCount(tif, td->td_nstrips - 1);
+        if (last_offset > filesize || last_bytecount > filesize - last_offset)
         {
             return;
         }
@@ -6070,9 +7480,8 @@ static void TryChopUpUncompressedBigTiff( TIFF* tif )
     allocChoppedUpStripArrays(tif, nstrips, stripbytes, rowsperstrip);
 }
 
-
 TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
-static uint64 _TIFFUnsanitizedAddUInt64AndInt(uint64 a, int b)
+static uint64_t _TIFFUnsanitizedAddUInt64AndInt(uint64_t a, int b)
 {
     return a + b;
 }
@@ -6081,9 +7490,8 @@ static uint64 _TIFFUnsanitizedAddUInt64AndInt(uint64 a, int b)
  * strip/tile of number strile. Also fetch the neighbouring values using a
  * 4096 byte page size.
  */
-static
-int _TIFFPartialReadStripArray( TIFF* tif, TIFFDirEntry* dirent,
-                                int strile, uint64* panVals )
+static int _TIFFPartialReadStripArray(TIFF *tif, TIFFDirEntry *dirent,
+                                      int strile, uint64_t *panVals)
 {
     static const char module[] = "_TIFFPartialReadStripArray";
 #define IO_CACHE_PAGE_SIZE 4096
@@ -6091,89 +7499,88 @@ int _TIFFPartialReadStripArray( TIFF* tif, TIFFDirEntry* dirent,
     size_t sizeofval;
     const int bSwab = (tif->tif_flags & TIFF_SWAB) != 0;
     int sizeofvalint;
-    uint64 nBaseOffset;
-    uint64 nOffset;
-    uint64 nOffsetStartPage;
-    uint64 nOffsetEndPage;
+    uint64_t nBaseOffset;
+    uint64_t nOffset;
+    uint64_t nOffsetStartPage;
+    uint64_t nOffsetEndPage;
     tmsize_t nToRead;
     tmsize_t nRead;
-    uint64 nLastStripOffset;
+    uint64_t nLastStripOffset;
     int iStartBefore;
     int i;
-    const uint32 arraySize = tif->tif_dir.td_stripoffsetbyteallocsize;
+    const uint32_t arraySize = tif->tif_dir.td_stripoffsetbyteallocsize;
     unsigned char buffer[2 * IO_CACHE_PAGE_SIZE];
 
-    assert( dirent->tdir_count > 4 );
+    assert(dirent->tdir_count > 4);
 
-    if( dirent->tdir_type == TIFF_SHORT )
+    if (dirent->tdir_type == TIFF_SHORT)
     {
-        sizeofval = sizeof(uint16);
+        sizeofval = sizeof(uint16_t);
     }
-    else if( dirent->tdir_type == TIFF_LONG )
+    else if (dirent->tdir_type == TIFF_LONG)
     {
-        sizeofval = sizeof(uint32);
+        sizeofval = sizeof(uint32_t);
     }
-    else if( dirent->tdir_type == TIFF_LONG8 )
+    else if (dirent->tdir_type == TIFF_LONG8)
     {
-        sizeofval = sizeof(uint64);
+        sizeofval = sizeof(uint64_t);
     }
-    else if( dirent->tdir_type == TIFF_SLONG8 )
+    else if (dirent->tdir_type == TIFF_SLONG8)
     {
         /* Non conformant but used by some images as in */
         /* https://github.com/OSGeo/gdal/issues/2165 */
-        sizeofval = sizeof(int64);
+        sizeofval = sizeof(int64_t);
     }
     else
     {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                 "Invalid type for [Strip|Tile][Offset/ByteCount] tag");
+        TIFFErrorExtR(tif, module,
+                      "Invalid type for [Strip|Tile][Offset/ByteCount] tag");
         panVals[strile] = 0;
         return 0;
     }
     sizeofvalint = (int)(sizeofval);
 
-    if( tif->tif_flags&TIFF_BIGTIFF )
+    if (tif->tif_flags & TIFF_BIGTIFF)
     {
-        uint64 offset = dirent->tdir_offset.toff_long8;
-        if( bSwab )
+        uint64_t offset = dirent->tdir_offset.toff_long8;
+        if (bSwab)
             TIFFSwabLong8(&offset);
         nBaseOffset = offset;
     }
     else
     {
-        uint32 offset = dirent->tdir_offset.toff_long;
-        if( bSwab )
+        uint32_t offset = dirent->tdir_offset.toff_long;
+        if (bSwab)
             TIFFSwabLong(&offset);
         nBaseOffset = offset;
     }
     /* To avoid later unsigned integer overflows */
-    if( nBaseOffset > (uint64)TIFF_INT64_MAX )
+    if (nBaseOffset > (uint64_t)INT64_MAX)
     {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                 "Cannot read offset/size for strile %d", strile);
+        TIFFErrorExtR(tif, module, "Cannot read offset/size for strile %d",
+                      strile);
         panVals[strile] = 0;
         return 0;
     }
     nOffset = nBaseOffset + sizeofval * strile;
-    nOffsetStartPage =
-        (nOffset / IO_CACHE_PAGE_SIZE) * IO_CACHE_PAGE_SIZE;
+    nOffsetStartPage = (nOffset / IO_CACHE_PAGE_SIZE) * IO_CACHE_PAGE_SIZE;
     nOffsetEndPage = nOffsetStartPage + IO_CACHE_PAGE_SIZE;
 
-    if( nOffset + sizeofval > nOffsetEndPage )
+    if (nOffset + sizeofval > nOffsetEndPage)
         nOffsetEndPage += IO_CACHE_PAGE_SIZE;
 #undef IO_CACHE_PAGE_SIZE
 
     nLastStripOffset = nBaseOffset + arraySize * sizeofval;
-    if( nLastStripOffset < nOffsetEndPage )
+    if (nLastStripOffset < nOffsetEndPage)
         nOffsetEndPage = nLastStripOffset;
-    if( nOffsetStartPage >= nOffsetEndPage )
+    if (nOffsetStartPage >= nOffsetEndPage)
     {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                 "Cannot read offset/size for strile %d", strile);
+        TIFFErrorExtR(tif, module, "Cannot read offset/size for strile %d",
+                      strile);
         panVals[strile] = 0;
         return 0;
     }
-    if (!SeekOK(tif,nOffsetStartPage))
+    if (!SeekOK(tif, nOffsetStartPage))
     {
         panVals[strile] = 0;
         return 0;
@@ -6181,159 +7588,160 @@ int _TIFFPartialReadStripArray( TIFF* tif, TIFFDirEntry* dirent,
 
     nToRead = (tmsize_t)(nOffsetEndPage - nOffsetStartPage);
     nRead = TIFFReadFile(tif, buffer, nToRead);
-    if( nRead < nToRead )
+    if (nRead < nToRead)
     {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                 "Cannot read offset/size for strile around ~%d", strile);
+        TIFFErrorExtR(tif, module,
+                      "Cannot read offset/size for strile around ~%d", strile);
         return 0;
     }
     iStartBefore = -(int)((nOffset - nOffsetStartPage) / sizeofval);
-    if( strile + iStartBefore < 0 )
+    if (strile + iStartBefore < 0)
         iStartBefore = -strile;
-    for( i = iStartBefore;
-         (uint32)(strile + i) < arraySize &&
-         _TIFFUnsanitizedAddUInt64AndInt(nOffset, (i + 1) * sizeofvalint) <= nOffsetEndPage;
-         ++i )
+    for (i = iStartBefore;
+         (uint32_t)(strile + i) < arraySize &&
+         _TIFFUnsanitizedAddUInt64AndInt(nOffset, (i + 1) * sizeofvalint) <=
+             nOffsetEndPage;
+         ++i)
     {
-        if( dirent->tdir_type == TIFF_SHORT )
+        if (dirent->tdir_type == TIFF_SHORT)
         {
-            uint16 val;
+            uint16_t val;
             memcpy(&val,
                    buffer + (nOffset - nOffsetStartPage) + i * sizeofvalint,
                    sizeof(val));
-            if( bSwab )
+            if (bSwab)
                 TIFFSwabShort(&val);
             panVals[strile + i] = val;
         }
-        else if( dirent->tdir_type == TIFF_LONG )
+        else if (dirent->tdir_type == TIFF_LONG)
         {
-            uint32 val;
+            uint32_t val;
             memcpy(&val,
                    buffer + (nOffset - nOffsetStartPage) + i * sizeofvalint,
                    sizeof(val));
-            if( bSwab )
+            if (bSwab)
                 TIFFSwabLong(&val);
             panVals[strile + i] = val;
         }
-        else if( dirent->tdir_type == TIFF_LONG8 )
+        else if (dirent->tdir_type == TIFF_LONG8)
         {
-            uint64 val;
+            uint64_t val;
             memcpy(&val,
                    buffer + (nOffset - nOffsetStartPage) + i * sizeofvalint,
                    sizeof(val));
-            if( bSwab )
+            if (bSwab)
                 TIFFSwabLong8(&val);
             panVals[strile + i] = val;
         }
         else /* if( dirent->tdir_type == TIFF_SLONG8 ) */
         {
             /* Non conformant data type */
-            int64 val;
+            int64_t val;
             memcpy(&val,
                    buffer + (nOffset - nOffsetStartPage) + i * sizeofvalint,
                    sizeof(val));
-            if( bSwab )
-                TIFFSwabLong8((uint64*) &val);
-            panVals[strile + i] = (uint64) val;
+            if (bSwab)
+                TIFFSwabLong8((uint64_t *)&val);
+            panVals[strile + i] = (uint64_t)val;
         }
     }
     return 1;
 }
 
-static int _TIFFFetchStrileValue(TIFF* tif,
-                                 uint32 strile,
-                                 TIFFDirEntry* dirent,
-                                 uint64** parray)
+static int _TIFFFetchStrileValue(TIFF *tif, uint32_t strile,
+                                 TIFFDirEntry *dirent, uint64_t **parray)
 {
     static const char module[] = "_TIFFFetchStrileValue";
     TIFFDirectory *td = &tif->tif_dir;
-    if( strile >= dirent->tdir_count )
+    if (strile >= dirent->tdir_count)
     {
         return 0;
     }
-    if( strile >= td->td_stripoffsetbyteallocsize )
+    if (strile >= td->td_stripoffsetbyteallocsize)
     {
-        uint32 nStripArrayAllocBefore = td->td_stripoffsetbyteallocsize;
-        uint32 nStripArrayAllocNew;
-        uint64 nArraySize64;
+        uint32_t nStripArrayAllocBefore = td->td_stripoffsetbyteallocsize;
+        uint32_t nStripArrayAllocNew;
+        uint64_t nArraySize64;
         size_t nArraySize;
-        uint64* offsetArray;
-        uint64* bytecountArray;
+        uint64_t *offsetArray;
+        uint64_t *bytecountArray;
 
-        if( strile > 1000000 )
+        if (strile > 1000000)
         {
-            uint64 filesize = TIFFGetFileSize(tif);
+            uint64_t filesize = TIFFGetFileSize(tif);
             /* Avoid excessive memory allocation attempt */
             /* For such a big blockid we need at least a TIFF_LONG per strile */
             /* for the offset array. */
-            if( strile > filesize / sizeof(uint32) )
+            if (strile > filesize / sizeof(uint32_t))
             {
-                TIFFErrorExt(tif->tif_clientdata, module, "File too short");
+                TIFFErrorExtR(tif, module, "File too short");
                 return 0;
             }
         }
 
-        if( td->td_stripoffsetbyteallocsize == 0 &&
-            td->td_nstrips < 1024 * 1024 )
+        if (td->td_stripoffsetbyteallocsize == 0 &&
+            td->td_nstrips < 1024 * 1024)
         {
             nStripArrayAllocNew = td->td_nstrips;
         }
         else
         {
-#define TIFF_MAX(a,b) (((a)>(b)) ? (a) : (b))
-#define TIFF_MIN(a,b) (((a)<(b)) ? (a) : (b))
-            nStripArrayAllocNew = TIFF_MAX(strile + 1, 1024U * 512U );
-            if( nStripArrayAllocNew < 0xFFFFFFFFU / 2  )
+#define TIFF_MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define TIFF_MIN(a, b) (((a) < (b)) ? (a) : (b))
+            nStripArrayAllocNew = TIFF_MAX(strile + 1, 1024U * 512U);
+            if (nStripArrayAllocNew < 0xFFFFFFFFU / 2)
                 nStripArrayAllocNew *= 2;
             nStripArrayAllocNew = TIFF_MIN(nStripArrayAllocNew, td->td_nstrips);
         }
-        assert( strile < nStripArrayAllocNew );
-        nArraySize64 = (uint64)sizeof(uint64) * nStripArrayAllocNew;
+        assert(strile < nStripArrayAllocNew);
+        nArraySize64 = (uint64_t)sizeof(uint64_t) * nStripArrayAllocNew;
         nArraySize = (size_t)(nArraySize64);
 #if SIZEOF_SIZE_T == 4
-        if( nArraySize != nArraySize64 )
+        if (nArraySize != nArraySize64)
         {
-            TIFFErrorExt(tif->tif_clientdata, module,
-                        "Cannot allocate strip offset and bytecount arrays");
+            TIFFErrorExtR(tif, module,
+                          "Cannot allocate strip offset and bytecount arrays");
             return 0;
         }
 #endif
-        offsetArray = (uint64*)(
-            _TIFFrealloc( td->td_stripoffset_p, nArraySize ) );
-        bytecountArray = (uint64*)(
-            _TIFFrealloc( td->td_stripbytecount_p, nArraySize ) );
-        if( offsetArray )
+        offsetArray = (uint64_t *)(_TIFFreallocExt(tif, td->td_stripoffset_p,
+                                                   nArraySize));
+        bytecountArray = (uint64_t *)(_TIFFreallocExt(
+            tif, td->td_stripbytecount_p, nArraySize));
+        if (offsetArray)
             td->td_stripoffset_p = offsetArray;
-        if( bytecountArray )
+        if (bytecountArray)
             td->td_stripbytecount_p = bytecountArray;
-        if( offsetArray && bytecountArray )
+        if (offsetArray && bytecountArray)
         {
             td->td_stripoffsetbyteallocsize = nStripArrayAllocNew;
             /* Initialize new entries to ~0 / -1 */
-            memset(td->td_stripoffset_p + nStripArrayAllocBefore,
-                0xFF,
-                (td->td_stripoffsetbyteallocsize - nStripArrayAllocBefore) * sizeof(uint64) );
-            memset(td->td_stripbytecount_p + nStripArrayAllocBefore,
-                0xFF,
-                (td->td_stripoffsetbyteallocsize - nStripArrayAllocBefore) * sizeof(uint64) );
+            /* coverity[overrun-buffer-arg] */
+            memset(td->td_stripoffset_p + nStripArrayAllocBefore, 0xFF,
+                   (td->td_stripoffsetbyteallocsize - nStripArrayAllocBefore) *
+                       sizeof(uint64_t));
+            /* coverity[overrun-buffer-arg] */
+            memset(td->td_stripbytecount_p + nStripArrayAllocBefore, 0xFF,
+                   (td->td_stripoffsetbyteallocsize - nStripArrayAllocBefore) *
+                       sizeof(uint64_t));
         }
         else
         {
-            TIFFErrorExt(tif->tif_clientdata, module,
-                        "Cannot allocate strip offset and bytecount arrays");
-            _TIFFfree(td->td_stripoffset_p);
+            TIFFErrorExtR(tif, module,
+                          "Cannot allocate strip offset and bytecount arrays");
+            _TIFFfreeExt(tif, td->td_stripoffset_p);
             td->td_stripoffset_p = NULL;
-            _TIFFfree(td->td_stripbytecount_p);
+            _TIFFfreeExt(tif, td->td_stripbytecount_p);
             td->td_stripbytecount_p = NULL;
             td->td_stripoffsetbyteallocsize = 0;
         }
     }
-    if( *parray == NULL || strile >= td->td_stripoffsetbyteallocsize )
+    if (*parray == NULL || strile >= td->td_stripoffsetbyteallocsize)
         return 0;
 
-    if( ~((*parray)[strile]) == 0 )
+    if (~((*parray)[strile]) == 0)
     {
-        if( !_TIFFPartialReadStripArray( tif, dirent, strile, *parray ) )
+        if (!_TIFFPartialReadStripArray(tif, dirent, strile, *parray))
         {
             (*parray)[strile] = 0;
             return 0;
@@ -6343,24 +7751,25 @@ static int _TIFFFetchStrileValue(TIFF* tif,
     return 1;
 }
 
-static uint64 _TIFFGetStrileOffsetOrByteCountValue(TIFF *tif, uint32 strile,
-                                                   TIFFDirEntry* dirent,
-                                                   uint64** parray,
-                                                   int *pbErr)
+static uint64_t _TIFFGetStrileOffsetOrByteCountValue(TIFF *tif, uint32_t strile,
+                                                     TIFFDirEntry *dirent,
+                                                     uint64_t **parray,
+                                                     int *pbErr)
 {
     TIFFDirectory *td = &tif->tif_dir;
-    if( pbErr )
+    if (pbErr)
         *pbErr = 0;
-    if( (tif->tif_flags&TIFF_DEFERSTRILELOAD) && !(tif->tif_flags&TIFF_CHOPPEDUPARRAYS) )
+    if ((tif->tif_flags & TIFF_DEFERSTRILELOAD) &&
+        !(tif->tif_flags & TIFF_CHOPPEDUPARRAYS))
     {
-        if( !(tif->tif_flags&TIFF_LAZYSTRILELOAD) ||
+        if (!(tif->tif_flags & TIFF_LAZYSTRILELOAD) ||
             /* If the values may fit in the toff_long/toff_long8 member */
             /* then use _TIFFFillStriles to simplify _TIFFFetchStrileValue */
-            dirent->tdir_count <= 4 )
+            dirent->tdir_count <= 4)
         {
-            if( !_TIFFFillStriles(tif) )
+            if (!_TIFFFillStriles(tif))
             {
-                if( pbErr )
+                if (pbErr)
                     *pbErr = 1;
                 /* Do not return, as we want this function to always */
                 /* return the same value if called several times with */
@@ -6369,73 +7778,74 @@ static uint64 _TIFFGetStrileOffsetOrByteCountValue(TIFF *tif, uint32 strile,
         }
         else
         {
-             if( !_TIFFFetchStrileValue(tif, strile, dirent, parray) )
-             {
-                if( pbErr )
+            if (!_TIFFFetchStrileValue(tif, strile, dirent, parray))
+            {
+                if (pbErr)
                     *pbErr = 1;
-                 return 0;
-             }
+                return 0;
+            }
         }
     }
-    if( *parray == NULL || strile >= td->td_nstrips )
+    if (*parray == NULL || strile >= td->td_nstrips)
     {
-        if( pbErr )
+        if (pbErr)
             *pbErr = 1;
         return 0;
     }
     return (*parray)[strile];
 }
 
-/* Return the value of the TileOffsets/StripOffsets array for the specified tile/strile */
-uint64 TIFFGetStrileOffset(TIFF *tif, uint32 strile)
+/* Return the value of the TileOffsets/StripOffsets array for the specified
+ * tile/strile */
+uint64_t TIFFGetStrileOffset(TIFF *tif, uint32_t strile)
 {
     return TIFFGetStrileOffsetWithErr(tif, strile, NULL);
 }
 
-/* Return the value of the TileOffsets/StripOffsets array for the specified tile/strile */
-uint64 TIFFGetStrileOffsetWithErr(TIFF *tif, uint32 strile, int *pbErr)
+/* Return the value of the TileOffsets/StripOffsets array for the specified
+ * tile/strile */
+uint64_t TIFFGetStrileOffsetWithErr(TIFF *tif, uint32_t strile, int *pbErr)
 {
     TIFFDirectory *td = &tif->tif_dir;
     return _TIFFGetStrileOffsetOrByteCountValue(tif, strile,
-                               &(td->td_stripoffset_entry),
-                               &(td->td_stripoffset_p), pbErr);
+                                                &(td->td_stripoffset_entry),
+                                                &(td->td_stripoffset_p), pbErr);
 }
 
-/* Return the value of the TileByteCounts/StripByteCounts array for the specified tile/strile */
-uint64 TIFFGetStrileByteCount(TIFF *tif, uint32 strile)
+/* Return the value of the TileByteCounts/StripByteCounts array for the
+ * specified tile/strile */
+uint64_t TIFFGetStrileByteCount(TIFF *tif, uint32_t strile)
 {
     return TIFFGetStrileByteCountWithErr(tif, strile, NULL);
 }
 
-/* Return the value of the TileByteCounts/StripByteCounts array for the specified tile/strile */
-uint64 TIFFGetStrileByteCountWithErr(TIFF *tif, uint32 strile, int *pbErr)
+/* Return the value of the TileByteCounts/StripByteCounts array for the
+ * specified tile/strile */
+uint64_t TIFFGetStrileByteCountWithErr(TIFF *tif, uint32_t strile, int *pbErr)
 {
     TIFFDirectory *td = &tif->tif_dir;
-    return _TIFFGetStrileOffsetOrByteCountValue(tif, strile,
-                               &(td->td_stripbytecount_entry),
-                               &(td->td_stripbytecount_p), pbErr);
+    return _TIFFGetStrileOffsetOrByteCountValue(
+        tif, strile, &(td->td_stripbytecount_entry), &(td->td_stripbytecount_p),
+        pbErr);
 }
 
+int _TIFFFillStriles(TIFF *tif) { return _TIFFFillStrilesInternal(tif, 1); }
 
-int _TIFFFillStriles( TIFF *tif )
-{
-    return _TIFFFillStrilesInternal( tif, 1 );
-}
-
-static int _TIFFFillStrilesInternal( TIFF *tif, int loadStripByteCount )
+static int _TIFFFillStrilesInternal(TIFF *tif, int loadStripByteCount)
 {
     register TIFFDirectory *td = &tif->tif_dir;
     int return_value = 1;
 
     /* Do not do anything if TIFF_DEFERSTRILELOAD is not set */
-    if( !(tif->tif_flags&TIFF_DEFERSTRILELOAD) || (tif->tif_flags&TIFF_CHOPPEDUPARRAYS) != 0 )
+    if (!(tif->tif_flags & TIFF_DEFERSTRILELOAD) ||
+        (tif->tif_flags & TIFF_CHOPPEDUPARRAYS) != 0)
         return 1;
 
-    if( tif->tif_flags&TIFF_LAZYSTRILELOAD )
+    if (tif->tif_flags & TIFF_LAZYSTRILELOAD)
     {
         /* In case of lazy loading, reload completely the arrays */
-        _TIFFfree(td->td_stripoffset_p);
-        _TIFFfree(td->td_stripbytecount_p);
+        _TIFFfreeExt(tif, td->td_stripoffset_p);
+        _TIFFfreeExt(tif, td->td_stripbytecount_p);
         td->td_stripoffset_p = NULL;
         td->td_stripbytecount_p = NULL;
         td->td_stripoffsetbyteallocsize = 0;
@@ -6443,53 +7853,46 @@ static int _TIFFFillStrilesInternal( TIFF *tif, int loadStripByteCount )
     }
 
     /* If stripoffset array is already loaded, exit with success */
-    if( td->td_stripoffset_p != NULL )
-            return 1;
+    if (td->td_stripoffset_p != NULL)
+        return 1;
 
     /* If tdir_count was canceled, then we already got there, but in error */
-    if( td->td_stripoffset_entry.tdir_count == 0 )
-            return 0;
+    if (td->td_stripoffset_entry.tdir_count == 0)
+        return 0;
 
-    if (!TIFFFetchStripThing(tif,&(td->td_stripoffset_entry),
-                                td->td_nstrips,&td->td_stripoffset_p))
+    if (!TIFFFetchStripThing(tif, &(td->td_stripoffset_entry), td->td_nstrips,
+                             &td->td_stripoffset_p))
     {
-            return_value = 0;
+        return_value = 0;
     }
 
     if (loadStripByteCount &&
-        !TIFFFetchStripThing(tif,&(td->td_stripbytecount_entry),
-                                td->td_nstrips,&td->td_stripbytecount_p))
+        !TIFFFetchStripThing(tif, &(td->td_stripbytecount_entry),
+                             td->td_nstrips, &td->td_stripbytecount_p))
     {
-            return_value = 0;
+        return_value = 0;
     }
 
-    _TIFFmemset( &(td->td_stripoffset_entry), 0, sizeof(TIFFDirEntry));
-    _TIFFmemset( &(td->td_stripbytecount_entry), 0, sizeof(TIFFDirEntry));
+    _TIFFmemset(&(td->td_stripoffset_entry), 0, sizeof(TIFFDirEntry));
+    _TIFFmemset(&(td->td_stripbytecount_entry), 0, sizeof(TIFFDirEntry));
 
 #ifdef STRIPBYTECOUNTSORTED_UNUSED
-    if (tif->tif_dir.td_nstrips > 1 && return_value == 1 ) {
-            uint32 strip;
-
-            tif->tif_dir.td_stripbytecountsorted = 1;
-            for (strip = 1; strip < tif->tif_dir.td_nstrips; strip++) {
-                    if (tif->tif_dir.td_stripoffset_p[strip - 1] >
-                        tif->tif_dir.td_stripoffset_p[strip]) {
-                            tif->tif_dir.td_stripbytecountsorted = 0;
-                            break;
-                    }
+    if (tif->tif_dir.td_nstrips > 1 && return_value == 1)
+    {
+        uint32_t strip;
+
+        tif->tif_dir.td_stripbytecountsorted = 1;
+        for (strip = 1; strip < tif->tif_dir.td_nstrips; strip++)
+        {
+            if (tif->tif_dir.td_stripoffset_p[strip - 1] >
+                tif->tif_dir.td_stripoffset_p[strip])
+            {
+                tif->tif_dir.td_stripbytecountsorted = 0;
+                break;
             }
+        }
     }
 #endif
 
     return return_value;
 }
-
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_dirwrite.c b/3rdparty/libtiff/tif_dirwrite.c
index f481250e3b29..d8844bbd8a36 100644
--- a/3rdparty/libtiff/tif_dirwrite.c
+++ b/3rdparty/libtiff/tif_dirwrite.c
@@ -28,161 +28,203 @@
  * Directory Write Support Routines.
  */
 #include "tiffiop.h"
-#include <float.h>		/*--: for Rational2Double */
-#include <math.h>		/*--: for Rational2Double */
+#include <float.h> /*--: for Rational2Double */
+#include <math.h>  /*--: for Rational2Double */
 
 #ifdef HAVE_IEEEFP
 #define TIFFCvtNativeToIEEEFloat(tif, n, fp)
 #define TIFFCvtNativeToIEEEDouble(tif, n, dp)
 #else
-extern void TIFFCvtNativeToIEEEFloat(TIFF* tif, uint32 n, float* fp);
-extern void TIFFCvtNativeToIEEEDouble(TIFF* tif, uint32 n, double* dp);
+extern void TIFFCvtNativeToIEEEFloat(TIFF *tif, uint32_t n, float *fp);
+extern void TIFFCvtNativeToIEEEDouble(TIFF *tif, uint32_t n, double *dp);
 #endif
 
-static int TIFFWriteDirectorySec(TIFF* tif, int isimage, int imagedone, uint64* pdiroff);
-
-static int TIFFWriteDirectoryTagSampleformatArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value);
-#if 0
-static int TIFFWriteDirectoryTagSampleformatPerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value);
-#endif
-
-static int TIFFWriteDirectoryTagAscii(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, char* value);
-static int TIFFWriteDirectoryTagUndefinedArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint8* value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagByte(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint8 value);
-#endif
-static int TIFFWriteDirectoryTagByteArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint8* value);
-#if 0
-static int TIFFWriteDirectoryTagBytePerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint8 value);
-#endif
-#ifdef notdef
-static int TIFFWriteDirectoryTagSbyte(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int8 value);
-#endif
-static int TIFFWriteDirectoryTagSbyteArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int8* value);
-#if 0
-static int TIFFWriteDirectoryTagSbytePerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int8 value);
-#endif
-static int TIFFWriteDirectoryTagShort(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint16 value);
-static int TIFFWriteDirectoryTagShortArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint16* value);
-static int TIFFWriteDirectoryTagShortPerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint16 value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagSshort(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int16 value);
-#endif
-static int TIFFWriteDirectoryTagSshortArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int16* value);
-#if 0
-static int TIFFWriteDirectoryTagSshortPerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int16 value);
-#endif
-static int TIFFWriteDirectoryTagLong(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 value);
-static int TIFFWriteDirectoryTagLongArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint32* value);
-#if 0
-static int TIFFWriteDirectoryTagLongPerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 value);
-#endif
-#ifdef notdef
-static int TIFFWriteDirectoryTagSlong(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int32 value);
-#endif
-static int TIFFWriteDirectoryTagSlongArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int32* value);
-#if 0
-static int TIFFWriteDirectoryTagSlongPerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int32 value);
-#endif
-#ifdef notdef
-static int TIFFWriteDirectoryTagLong8(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint64 value);
-#endif
-static int TIFFWriteDirectoryTagLong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagSlong8(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int64 value);
-#endif
-static int TIFFWriteDirectoryTagSlong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int64* value);
-static int TIFFWriteDirectoryTagRational(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value);
-static int TIFFWriteDirectoryTagRationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value);
-static int TIFFWriteDirectoryTagSrationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagFloat(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, float value);
-#endif
-static int TIFFWriteDirectoryTagFloatArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value);
-#if 0
-static int TIFFWriteDirectoryTagFloatPerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, float value);
-#endif
-#ifdef notdef
-static int TIFFWriteDirectoryTagDouble(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value);
-#endif
-static int TIFFWriteDirectoryTagDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value);
-#if 0
-static int TIFFWriteDirectoryTagDoublePerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value);
-#endif
-static int TIFFWriteDirectoryTagIfdArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint32* value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagIfd8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value);
-#endif
-static int TIFFWriteDirectoryTagShortLong(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 value);
-static int TIFFWriteDirectoryTagLongLong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value);
-static int TIFFWriteDirectoryTagIfdIfd8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagShortLongLong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value);
-#endif
-static int TIFFWriteDirectoryTagColormap(TIFF* tif, uint32* ndir, TIFFDirEntry* dir);
-static int TIFFWriteDirectoryTagTransferfunction(TIFF* tif, uint32* ndir, TIFFDirEntry* dir);
-static int TIFFWriteDirectoryTagSubifd(TIFF* tif, uint32* ndir, TIFFDirEntry* dir);
-
-static int TIFFWriteDirectoryTagCheckedAscii(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, char* value);
-static int TIFFWriteDirectoryTagCheckedUndefinedArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint8* value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagCheckedByte(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint8 value);
-#endif
-static int TIFFWriteDirectoryTagCheckedByteArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint8* value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagCheckedSbyte(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int8 value);
-#endif
-static int TIFFWriteDirectoryTagCheckedSbyteArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int8* value);
-static int TIFFWriteDirectoryTagCheckedShort(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint16 value);
-static int TIFFWriteDirectoryTagCheckedShortArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint16* value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagCheckedSshort(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int16 value);
-#endif
-static int TIFFWriteDirectoryTagCheckedSshortArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int16* value);
-static int TIFFWriteDirectoryTagCheckedLong(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 value);
-static int TIFFWriteDirectoryTagCheckedLongArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint32* value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagCheckedSlong(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int32 value);
-#endif
-static int TIFFWriteDirectoryTagCheckedSlongArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int32* value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagCheckedLong8(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint64 value);
-#endif
-static int TIFFWriteDirectoryTagCheckedLong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagCheckedSlong8(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int64 value);
-#endif
-static int TIFFWriteDirectoryTagCheckedSlong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int64* value);
-static int TIFFWriteDirectoryTagCheckedRational(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value);
-static int TIFFWriteDirectoryTagCheckedRationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value);
-static int TIFFWriteDirectoryTagCheckedSrationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value);
-
-/*--: Rational2Double: New functions to support true double-precision for custom rational tag types. */
-static int TIFFWriteDirectoryTagRationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value);
-static int TIFFWriteDirectoryTagSrationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value);
-static int TIFFWriteDirectoryTagCheckedRationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value);
-static int TIFFWriteDirectoryTagCheckedSrationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value);
-static void DoubleToRational(double value, uint32 *num, uint32 *denom);
-static void DoubleToSrational(double value, int32 *num, int32 *denom);
-#if 0
-static void DoubleToRational_direct(double value, unsigned long *num, unsigned long *denom);
-static void DoubleToSrational_direct(double value, long *num, long *denom);
-#endif
-
-#ifdef notdef
-static int TIFFWriteDirectoryTagCheckedFloat(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, float value);
-#endif
-static int TIFFWriteDirectoryTagCheckedFloatArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value);
-#ifdef notdef
-static int TIFFWriteDirectoryTagCheckedDouble(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value);
-#endif
-static int TIFFWriteDirectoryTagCheckedDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value);
-static int TIFFWriteDirectoryTagCheckedIfdArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint32* value);
-static int TIFFWriteDirectoryTagCheckedIfd8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value);
-
-static int TIFFWriteDirectoryTagData(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint16 datatype, uint32 count, uint32 datalength, void* data);
-
-static int TIFFLinkDirectory(TIFF*);
+static int TIFFWriteDirectorySec(TIFF *tif, int isimage, int imagedone,
+                                 uint64_t *pdiroff);
+
+static int TIFFWriteDirectoryTagSampleformatArray(TIFF *tif, uint32_t *ndir,
+                                                  TIFFDirEntry *dir,
+                                                  uint16_t tag, uint32_t count,
+                                                  double *value);
+
+static int TIFFWriteDirectoryTagAscii(TIFF *tif, uint32_t *ndir,
+                                      TIFFDirEntry *dir, uint16_t tag,
+                                      uint32_t count, char *value);
+static int TIFFWriteDirectoryTagUndefinedArray(TIFF *tif, uint32_t *ndir,
+                                               TIFFDirEntry *dir, uint16_t tag,
+                                               uint32_t count, uint8_t *value);
+static int TIFFWriteDirectoryTagByteArray(TIFF *tif, uint32_t *ndir,
+                                          TIFFDirEntry *dir, uint16_t tag,
+                                          uint32_t count, uint8_t *value);
+static int TIFFWriteDirectoryTagSbyteArray(TIFF *tif, uint32_t *ndir,
+                                           TIFFDirEntry *dir, uint16_t tag,
+                                           uint32_t count, int8_t *value);
+static int TIFFWriteDirectoryTagShort(TIFF *tif, uint32_t *ndir,
+                                      TIFFDirEntry *dir, uint16_t tag,
+                                      uint16_t value);
+static int TIFFWriteDirectoryTagShortArray(TIFF *tif, uint32_t *ndir,
+                                           TIFFDirEntry *dir, uint16_t tag,
+                                           uint32_t count, uint16_t *value);
+static int TIFFWriteDirectoryTagShortPerSample(TIFF *tif, uint32_t *ndir,
+                                               TIFFDirEntry *dir, uint16_t tag,
+                                               uint16_t value);
+static int TIFFWriteDirectoryTagSshortArray(TIFF *tif, uint32_t *ndir,
+                                            TIFFDirEntry *dir, uint16_t tag,
+                                            uint32_t count, int16_t *value);
+static int TIFFWriteDirectoryTagLong(TIFF *tif, uint32_t *ndir,
+                                     TIFFDirEntry *dir, uint16_t tag,
+                                     uint32_t value);
+static int TIFFWriteDirectoryTagLongArray(TIFF *tif, uint32_t *ndir,
+                                          TIFFDirEntry *dir, uint16_t tag,
+                                          uint32_t count, uint32_t *value);
+static int TIFFWriteDirectoryTagSlongArray(TIFF *tif, uint32_t *ndir,
+                                           TIFFDirEntry *dir, uint16_t tag,
+                                           uint32_t count, int32_t *value);
+static int TIFFWriteDirectoryTagLong8Array(TIFF *tif, uint32_t *ndir,
+                                           TIFFDirEntry *dir, uint16_t tag,
+                                           uint32_t count, uint64_t *value);
+static int TIFFWriteDirectoryTagSlong8Array(TIFF *tif, uint32_t *ndir,
+                                            TIFFDirEntry *dir, uint16_t tag,
+                                            uint32_t count, int64_t *value);
+static int TIFFWriteDirectoryTagRational(TIFF *tif, uint32_t *ndir,
+                                         TIFFDirEntry *dir, uint16_t tag,
+                                         double value);
+static int TIFFWriteDirectoryTagRationalArray(TIFF *tif, uint32_t *ndir,
+                                              TIFFDirEntry *dir, uint16_t tag,
+                                              uint32_t count, float *value);
+static int TIFFWriteDirectoryTagSrationalArray(TIFF *tif, uint32_t *ndir,
+                                               TIFFDirEntry *dir, uint16_t tag,
+                                               uint32_t count, float *value);
+static int TIFFWriteDirectoryTagFloatArray(TIFF *tif, uint32_t *ndir,
+                                           TIFFDirEntry *dir, uint16_t tag,
+                                           uint32_t count, float *value);
+static int TIFFWriteDirectoryTagDoubleArray(TIFF *tif, uint32_t *ndir,
+                                            TIFFDirEntry *dir, uint16_t tag,
+                                            uint32_t count, double *value);
+static int TIFFWriteDirectoryTagIfdArray(TIFF *tif, uint32_t *ndir,
+                                         TIFFDirEntry *dir, uint16_t tag,
+                                         uint32_t count, uint32_t *value);
+static int TIFFWriteDirectoryTagShortLong(TIFF *tif, uint32_t *ndir,
+                                          TIFFDirEntry *dir, uint16_t tag,
+                                          uint32_t value);
+static int TIFFWriteDirectoryTagLongLong8Array(TIFF *tif, uint32_t *ndir,
+                                               TIFFDirEntry *dir, uint16_t tag,
+                                               uint32_t count, uint64_t *value);
+static int TIFFWriteDirectoryTagIfdIfd8Array(TIFF *tif, uint32_t *ndir,
+                                             TIFFDirEntry *dir, uint16_t tag,
+                                             uint32_t count, uint64_t *value);
+static int TIFFWriteDirectoryTagColormap(TIFF *tif, uint32_t *ndir,
+                                         TIFFDirEntry *dir);
+static int TIFFWriteDirectoryTagTransferfunction(TIFF *tif, uint32_t *ndir,
+                                                 TIFFDirEntry *dir);
+static int TIFFWriteDirectoryTagSubifd(TIFF *tif, uint32_t *ndir,
+                                       TIFFDirEntry *dir);
+
+static int TIFFWriteDirectoryTagCheckedAscii(TIFF *tif, uint32_t *ndir,
+                                             TIFFDirEntry *dir, uint16_t tag,
+                                             uint32_t count, char *value);
+static int TIFFWriteDirectoryTagCheckedUndefinedArray(TIFF *tif, uint32_t *ndir,
+                                                      TIFFDirEntry *dir,
+                                                      uint16_t tag,
+                                                      uint32_t count,
+                                                      uint8_t *value);
+static int TIFFWriteDirectoryTagCheckedByteArray(TIFF *tif, uint32_t *ndir,
+                                                 TIFFDirEntry *dir,
+                                                 uint16_t tag, uint32_t count,
+                                                 uint8_t *value);
+static int TIFFWriteDirectoryTagCheckedSbyteArray(TIFF *tif, uint32_t *ndir,
+                                                  TIFFDirEntry *dir,
+                                                  uint16_t tag, uint32_t count,
+                                                  int8_t *value);
+static int TIFFWriteDirectoryTagCheckedShort(TIFF *tif, uint32_t *ndir,
+                                             TIFFDirEntry *dir, uint16_t tag,
+                                             uint16_t value);
+static int TIFFWriteDirectoryTagCheckedShortArray(TIFF *tif, uint32_t *ndir,
+                                                  TIFFDirEntry *dir,
+                                                  uint16_t tag, uint32_t count,
+                                                  uint16_t *value);
+static int TIFFWriteDirectoryTagCheckedSshortArray(TIFF *tif, uint32_t *ndir,
+                                                   TIFFDirEntry *dir,
+                                                   uint16_t tag, uint32_t count,
+                                                   int16_t *value);
+static int TIFFWriteDirectoryTagCheckedLong(TIFF *tif, uint32_t *ndir,
+                                            TIFFDirEntry *dir, uint16_t tag,
+                                            uint32_t value);
+static int TIFFWriteDirectoryTagCheckedLongArray(TIFF *tif, uint32_t *ndir,
+                                                 TIFFDirEntry *dir,
+                                                 uint16_t tag, uint32_t count,
+                                                 uint32_t *value);
+static int TIFFWriteDirectoryTagCheckedSlongArray(TIFF *tif, uint32_t *ndir,
+                                                  TIFFDirEntry *dir,
+                                                  uint16_t tag, uint32_t count,
+                                                  int32_t *value);
+static int TIFFWriteDirectoryTagCheckedLong8Array(TIFF *tif, uint32_t *ndir,
+                                                  TIFFDirEntry *dir,
+                                                  uint16_t tag, uint32_t count,
+                                                  uint64_t *value);
+static int TIFFWriteDirectoryTagCheckedSlong8Array(TIFF *tif, uint32_t *ndir,
+                                                   TIFFDirEntry *dir,
+                                                   uint16_t tag, uint32_t count,
+                                                   int64_t *value);
+static int TIFFWriteDirectoryTagCheckedRational(TIFF *tif, uint32_t *ndir,
+                                                TIFFDirEntry *dir, uint16_t tag,
+                                                double value);
+static int TIFFWriteDirectoryTagCheckedRationalArray(TIFF *tif, uint32_t *ndir,
+                                                     TIFFDirEntry *dir,
+                                                     uint16_t tag,
+                                                     uint32_t count,
+                                                     float *value);
+static int TIFFWriteDirectoryTagCheckedSrationalArray(TIFF *tif, uint32_t *ndir,
+                                                      TIFFDirEntry *dir,
+                                                      uint16_t tag,
+                                                      uint32_t count,
+                                                      float *value);
+
+/*--: Rational2Double: New functions to support true double-precision for custom
+ * rational tag types. */
+static int TIFFWriteDirectoryTagRationalDoubleArray(TIFF *tif, uint32_t *ndir,
+                                                    TIFFDirEntry *dir,
+                                                    uint16_t tag,
+                                                    uint32_t count,
+                                                    double *value);
+static int TIFFWriteDirectoryTagSrationalDoubleArray(TIFF *tif, uint32_t *ndir,
+                                                     TIFFDirEntry *dir,
+                                                     uint16_t tag,
+                                                     uint32_t count,
+                                                     double *value);
+static int
+TIFFWriteDirectoryTagCheckedRationalDoubleArray(TIFF *tif, uint32_t *ndir,
+                                                TIFFDirEntry *dir, uint16_t tag,
+                                                uint32_t count, double *value);
+static int TIFFWriteDirectoryTagCheckedSrationalDoubleArray(
+    TIFF *tif, uint32_t *ndir, TIFFDirEntry *dir, uint16_t tag, uint32_t count,
+    double *value);
+static void DoubleToRational(double value, uint32_t *num, uint32_t *denom);
+static void DoubleToSrational(double value, int32_t *num, int32_t *denom);
+
+static int TIFFWriteDirectoryTagCheckedFloatArray(TIFF *tif, uint32_t *ndir,
+                                                  TIFFDirEntry *dir,
+                                                  uint16_t tag, uint32_t count,
+                                                  float *value);
+static int TIFFWriteDirectoryTagCheckedDoubleArray(TIFF *tif, uint32_t *ndir,
+                                                   TIFFDirEntry *dir,
+                                                   uint16_t tag, uint32_t count,
+                                                   double *value);
+static int TIFFWriteDirectoryTagCheckedIfdArray(TIFF *tif, uint32_t *ndir,
+                                                TIFFDirEntry *dir, uint16_t tag,
+                                                uint32_t count,
+                                                uint32_t *value);
+static int TIFFWriteDirectoryTagCheckedIfd8Array(TIFF *tif, uint32_t *ndir,
+                                                 TIFFDirEntry *dir,
+                                                 uint16_t tag, uint32_t count,
+                                                 uint64_t *value);
+
+static int TIFFWriteDirectoryTagData(TIFF *tif, uint32_t *ndir,
+                                     TIFFDirEntry *dir, uint16_t tag,
+                                     uint16_t datatype, uint32_t count,
+                                     uint32_t datalength, void *data);
+
+static int TIFFLinkDirectory(TIFF *);
 
 /*
  * Write the contents of the current directory
@@ -190,10 +232,9 @@ static int TIFFLinkDirectory(TIFF*);
  * handle overwriting a directory with auxiliary
  * storage that's been changed.
  */
-int
-TIFFWriteDirectory(TIFF* tif)
+int TIFFWriteDirectory(TIFF *tif)
 {
-	return TIFFWriteDirectorySec(tif,TRUE,TRUE,NULL);
+    return TIFFWriteDirectorySec(tif, TRUE, TRUE, NULL);
 }
 
 /*
@@ -221,19 +262,17 @@ TIFFWriteDirectory(TIFF* tif)
  *
  * Returns 1 in case of success, 0 otherwise.
  */
-int TIFFDeferStrileArrayWriting(TIFF* tif)
+int TIFFDeferStrileArrayWriting(TIFF *tif)
 {
     static const char module[] = "TIFFDeferStrileArrayWriting";
     if (tif->tif_mode == O_RDONLY)
     {
-        TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-                     "File opened in read-only mode");
+        TIFFErrorExtR(tif, tif->tif_name, "File opened in read-only mode");
         return 0;
     }
-    if( tif->tif_diroff != 0 )
+    if (tif->tif_diroff != 0)
     {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                     "Directory has already been written");
+        TIFFErrorExtR(tif, module, "Directory has already been written");
         return 0;
     }
 
@@ -247,359 +286,430 @@ int TIFFDeferStrileArrayWriting(TIFF* tif)
  * written again.  This will make a partially written TIFF file
  * readable before it is successfully completed/closed.
  */
-int
-TIFFCheckpointDirectory(TIFF* tif)
+int TIFFCheckpointDirectory(TIFF *tif)
 {
-	int rc;
-	/* Setup the strips arrays, if they haven't already been. */
-	if (tif->tif_dir.td_stripoffset_p == NULL)
-	    (void) TIFFSetupStrips(tif);
-	rc = TIFFWriteDirectorySec(tif,TRUE,FALSE,NULL);
-	(void) TIFFSetWriteOffset(tif, TIFFSeekFile(tif, 0, SEEK_END));
-	return rc;
+    int rc;
+    /* Setup the strips arrays, if they haven't already been. */
+    if (tif->tif_dir.td_stripoffset_p == NULL)
+        (void)TIFFSetupStrips(tif);
+    rc = TIFFWriteDirectorySec(tif, TRUE, FALSE, NULL);
+    (void)TIFFSetWriteOffset(tif, TIFFSeekFile(tif, 0, SEEK_END));
+    return rc;
 }
 
-int
-TIFFWriteCustomDirectory(TIFF* tif, uint64* pdiroff)
+int TIFFWriteCustomDirectory(TIFF *tif, uint64_t *pdiroff)
 {
-	return TIFFWriteDirectorySec(tif,FALSE,FALSE,pdiroff);
+    return TIFFWriteDirectorySec(tif, FALSE, FALSE, pdiroff);
 }
 
 /*
  * Similar to TIFFWriteDirectory(), but if the directory has already
  * been written once, it is relocated to the end of the file, in case it
  * has changed in size.  Note that this will result in the loss of the
- * previously used directory space. 
- */ 
-int
-TIFFRewriteDirectory( TIFF *tif )
-{
-	static const char module[] = "TIFFRewriteDirectory";
-
-	/* We don't need to do anything special if it hasn't been written. */
-	if( tif->tif_diroff == 0 )
-		return TIFFWriteDirectory( tif );
-
-	/*
-	 * Find and zero the pointer to this directory, so that TIFFLinkDirectory
-	 * will cause it to be added after this directories current pre-link.
-	 */
-
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-	{
-		if (tif->tif_header.classic.tiff_diroff == tif->tif_diroff)
-		{
-			tif->tif_header.classic.tiff_diroff = 0;
-			tif->tif_diroff = 0;
-
-			TIFFSeekFile(tif,4,SEEK_SET);
-			if (!WriteOK(tif, &(tif->tif_header.classic.tiff_diroff),4))
-			{
-				TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-				    "Error updating TIFF header");
-				return (0);
-			}
-		}
-		else
-		{
-			uint32 nextdir;
-			nextdir = tif->tif_header.classic.tiff_diroff;
-			while(1) {
-				uint16 dircount;
-				uint32 nextnextdir;
-
-				if (!SeekOK(tif, nextdir) ||
-				    !ReadOK(tif, &dircount, 2)) {
-					TIFFErrorExt(tif->tif_clientdata, module,
-					     "Error fetching directory count");
-					return (0);
-				}
-				if (tif->tif_flags & TIFF_SWAB)
-					TIFFSwabShort(&dircount);
-				(void) TIFFSeekFile(tif,
-				    nextdir+2+dircount*12, SEEK_SET);
-				if (!ReadOK(tif, &nextnextdir, 4)) {
-					TIFFErrorExt(tif->tif_clientdata, module,
-					     "Error fetching directory link");
-					return (0);
-				}
-				if (tif->tif_flags & TIFF_SWAB)
-					TIFFSwabLong(&nextnextdir);
-				if (nextnextdir==tif->tif_diroff)
-				{
-					uint32 m;
-					m=0;
-					(void) TIFFSeekFile(tif,
-					    nextdir+2+dircount*12, SEEK_SET);
-					if (!WriteOK(tif, &m, 4)) {
-						TIFFErrorExt(tif->tif_clientdata, module,
-						     "Error writing directory link");
-						return (0);
-					}
-					tif->tif_diroff=0;
-					break;
-				}
-				nextdir=nextnextdir;
-			}
-		}
-	}
-	else
-	{
-		if (tif->tif_header.big.tiff_diroff == tif->tif_diroff)
-		{
-			tif->tif_header.big.tiff_diroff = 0;
-			tif->tif_diroff = 0;
-
-			TIFFSeekFile(tif,8,SEEK_SET);
-			if (!WriteOK(tif, &(tif->tif_header.big.tiff_diroff),8))
-			{
-				TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-				    "Error updating TIFF header");
-				return (0);
-			}
-		}
-		else
-		{
-			uint64 nextdir;
-			nextdir = tif->tif_header.big.tiff_diroff;
-			while(1) {
-				uint64 dircount64;
-				uint16 dircount;
-				uint64 nextnextdir;
-
-				if (!SeekOK(tif, nextdir) ||
-				    !ReadOK(tif, &dircount64, 8)) {
-					TIFFErrorExt(tif->tif_clientdata, module,
-					     "Error fetching directory count");
-					return (0);
-				}
-				if (tif->tif_flags & TIFF_SWAB)
-					TIFFSwabLong8(&dircount64);
-				if (dircount64>0xFFFF)
-				{
-					TIFFErrorExt(tif->tif_clientdata, module,
-					     "Sanity check on tag count failed, likely corrupt TIFF");
-					return (0);
-				}
-				dircount=(uint16)dircount64;
-				(void) TIFFSeekFile(tif,
-				    nextdir+8+dircount*20, SEEK_SET);
-				if (!ReadOK(tif, &nextnextdir, 8)) {
-					TIFFErrorExt(tif->tif_clientdata, module,
-					     "Error fetching directory link");
-					return (0);
-				}
-				if (tif->tif_flags & TIFF_SWAB)
-					TIFFSwabLong8(&nextnextdir);
-				if (nextnextdir==tif->tif_diroff)
-				{
-					uint64 m;
-					m=0;
-					(void) TIFFSeekFile(tif,
-					    nextdir+8+dircount*20, SEEK_SET);
-					if (!WriteOK(tif, &m, 8)) {
-						TIFFErrorExt(tif->tif_clientdata, module,
-						     "Error writing directory link");
-						return (0);
-					}
-					tif->tif_diroff=0;
-					break;
-				}
-				nextdir=nextnextdir;
-			}
-		}
-	}
-
-	/*
-	 * Now use TIFFWriteDirectory() normally.
-	 */
-
-	return TIFFWriteDirectory( tif );
-}
+ * previously used directory space.
+ */
+int TIFFRewriteDirectory(TIFF *tif)
+{
+    static const char module[] = "TIFFRewriteDirectory";
 
-static int
-TIFFWriteDirectorySec(TIFF* tif, int isimage, int imagedone, uint64* pdiroff)
-{
-	static const char module[] = "TIFFWriteDirectorySec";
-	uint32 ndir;
-	TIFFDirEntry* dir;
-	uint32 dirsize;
-	void* dirmem;
-	uint32 m;
-	if (tif->tif_mode == O_RDONLY)
-		return (1);
-
-        _TIFFFillStriles( tif );
-        
-	/*
-	 * Clear write state so that subsequent images with
-	 * different characteristics get the right buffers
-	 * setup for them.
-	 */
-	if (imagedone)
-	{
-		if (tif->tif_flags & TIFF_POSTENCODE)
-		{
-			tif->tif_flags &= ~TIFF_POSTENCODE;
-			if (!(*tif->tif_postencode)(tif))
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,
-				    "Error post-encoding before directory write");
-				return (0);
-			}
-		}
-		(*tif->tif_close)(tif);       /* shutdown encoder */
-		/*
-		 * Flush any data that might have been written
-		 * by the compression close+cleanup routines.  But
-                 * be careful not to write stuff if we didn't add data
-                 * in the previous steps as the "rawcc" data may well be
-                 * a previously read tile/strip in mixed read/write mode.
-		 */
-		if (tif->tif_rawcc > 0 
-		    && (tif->tif_flags & TIFF_BEENWRITING) != 0 )
-		{
-		    if( !TIFFFlushData1(tif) )
+    /* We don't need to do anything special if it hasn't been written. */
+    if (tif->tif_diroff == 0)
+        return TIFFWriteDirectory(tif);
+
+    /*
+     * Find and zero the pointer to this directory, so that TIFFLinkDirectory
+     * will cause it to be added after this directories current pre-link.
+     */
+    uint64_t torewritediroff = tif->tif_diroff;
+
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        if (tif->tif_header.classic.tiff_diroff == tif->tif_diroff)
+        {
+            tif->tif_header.classic.tiff_diroff = 0;
+            tif->tif_diroff = 0;
+
+            TIFFSeekFile(tif, 4, SEEK_SET);
+            if (!WriteOK(tif, &(tif->tif_header.classic.tiff_diroff), 4))
+            {
+                TIFFErrorExtR(tif, tif->tif_name, "Error updating TIFF header");
+                return (0);
+            }
+        }
+        else if (tif->tif_diroff > 0xFFFFFFFFU)
+        {
+            TIFFErrorExtR(tif, module,
+                          "tif->tif_diroff exceeds 32 bit range allowed for "
+                          "Classic TIFF");
+            return (0);
+        }
+        else
+        {
+            uint32_t nextdir;
+            nextdir = tif->tif_header.classic.tiff_diroff;
+            while (1)
+            {
+                uint16_t dircount;
+                uint32_t nextnextdir;
+
+                if (!SeekOK(tif, nextdir) || !ReadOK(tif, &dircount, 2))
+                {
+                    TIFFErrorExtR(tif, module,
+                                  "Error fetching directory count");
+                    return (0);
+                }
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabShort(&dircount);
+                (void)TIFFSeekFile(tif, nextdir + 2 + dircount * 12, SEEK_SET);
+                if (!ReadOK(tif, &nextnextdir, 4))
+                {
+                    TIFFErrorExtR(tif, module, "Error fetching directory link");
+                    return (0);
+                }
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong(&nextnextdir);
+                if (nextnextdir == tif->tif_diroff)
+                {
+                    uint32_t m;
+                    m = 0;
+                    (void)TIFFSeekFile(tif, nextdir + 2 + dircount * 12,
+                                       SEEK_SET);
+                    if (!WriteOK(tif, &m, 4))
+                    {
+                        TIFFErrorExtR(tif, module,
+                                      "Error writing directory link");
+                        return (0);
+                    }
+                    tif->tif_diroff = 0;
+                    /* Force a full-traversal to reach the zeroed pointer */
+                    tif->tif_lastdiroff = 0;
+                    break;
+                }
+                nextdir = nextnextdir;
+            }
+        }
+        /* Remove skipped offset from IFD loop directory list. */
+        _TIFFRemoveEntryFromDirectoryListByOffset(tif, torewritediroff);
+    }
+    else
+    {
+        if (tif->tif_header.big.tiff_diroff == tif->tif_diroff)
+        {
+            tif->tif_header.big.tiff_diroff = 0;
+            tif->tif_diroff = 0;
+
+            TIFFSeekFile(tif, 8, SEEK_SET);
+            if (!WriteOK(tif, &(tif->tif_header.big.tiff_diroff), 8))
+            {
+                TIFFErrorExtR(tif, tif->tif_name, "Error updating TIFF header");
+                return (0);
+            }
+        }
+        else
+        {
+            uint64_t nextdir;
+            nextdir = tif->tif_header.big.tiff_diroff;
+            while (1)
+            {
+                uint64_t dircount64;
+                uint16_t dircount;
+                uint64_t nextnextdir;
+
+                if (!SeekOK(tif, nextdir) || !ReadOK(tif, &dircount64, 8))
+                {
+                    TIFFErrorExtR(tif, module,
+                                  "Error fetching directory count");
+                    return (0);
+                }
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(&dircount64);
+                if (dircount64 > 0xFFFF)
+                {
+                    TIFFErrorExtR(tif, module,
+                                  "Sanity check on tag count failed, likely "
+                                  "corrupt TIFF");
+                    return (0);
+                }
+                dircount = (uint16_t)dircount64;
+                (void)TIFFSeekFile(tif, nextdir + 8 + dircount * 20, SEEK_SET);
+                if (!ReadOK(tif, &nextnextdir, 8))
+                {
+                    TIFFErrorExtR(tif, module, "Error fetching directory link");
+                    return (0);
+                }
+                if (tif->tif_flags & TIFF_SWAB)
+                    TIFFSwabLong8(&nextnextdir);
+                if (nextnextdir == tif->tif_diroff)
+                {
+                    uint64_t m;
+                    m = 0;
+                    (void)TIFFSeekFile(tif, nextdir + 8 + dircount * 20,
+                                       SEEK_SET);
+                    if (!WriteOK(tif, &m, 8))
                     {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Error flushing data before directory write");
-			return (0);
+                        TIFFErrorExtR(tif, module,
+                                      "Error writing directory link");
+                        return (0);
                     }
-		}
-		if ((tif->tif_flags & TIFF_MYBUFFER) && tif->tif_rawdata)
-		{
-			_TIFFfree(tif->tif_rawdata);
-			tif->tif_rawdata = NULL;
-			tif->tif_rawcc = 0;
-			tif->tif_rawdatasize = 0;
-                        tif->tif_rawdataoff = 0;
-                        tif->tif_rawdataloaded = 0;
-		}
-		tif->tif_flags &= ~(TIFF_BEENWRITING|TIFF_BUFFERSETUP);
-	}
-	dir=NULL;
-	dirmem=NULL;
-	dirsize=0;
-	while (1)
-	{
-		ndir=0;
-		if (isimage)
-		{
-			if (TIFFFieldSet(tif,FIELD_IMAGEDIMENSIONS))
-			{
-				if (!TIFFWriteDirectoryTagShortLong(tif,&ndir,dir,TIFFTAG_IMAGEWIDTH,tif->tif_dir.td_imagewidth))
-					goto bad;
-				if (!TIFFWriteDirectoryTagShortLong(tif,&ndir,dir,TIFFTAG_IMAGELENGTH,tif->tif_dir.td_imagelength))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_TILEDIMENSIONS))
-			{
-				if (!TIFFWriteDirectoryTagShortLong(tif,&ndir,dir,TIFFTAG_TILEWIDTH,tif->tif_dir.td_tilewidth))
-					goto bad;
-				if (!TIFFWriteDirectoryTagShortLong(tif,&ndir,dir,TIFFTAG_TILELENGTH,tif->tif_dir.td_tilelength))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_RESOLUTION))
-			{
-				if (!TIFFWriteDirectoryTagRational(tif,&ndir,dir,TIFFTAG_XRESOLUTION,tif->tif_dir.td_xresolution))
-					goto bad;
-				if (!TIFFWriteDirectoryTagRational(tif,&ndir,dir,TIFFTAG_YRESOLUTION,tif->tif_dir.td_yresolution))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_POSITION))
-			{
-				if (!TIFFWriteDirectoryTagRational(tif,&ndir,dir,TIFFTAG_XPOSITION,tif->tif_dir.td_xposition))
-					goto bad;
-				if (!TIFFWriteDirectoryTagRational(tif,&ndir,dir,TIFFTAG_YPOSITION,tif->tif_dir.td_yposition))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_SUBFILETYPE))
-			{
-				if (!TIFFWriteDirectoryTagLong(tif,&ndir,dir,TIFFTAG_SUBFILETYPE,tif->tif_dir.td_subfiletype))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_BITSPERSAMPLE))
-			{
-				if (!TIFFWriteDirectoryTagShortPerSample(tif,&ndir,dir,TIFFTAG_BITSPERSAMPLE,tif->tif_dir.td_bitspersample))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_COMPRESSION))
-			{
-				if (!TIFFWriteDirectoryTagShort(tif,&ndir,dir,TIFFTAG_COMPRESSION,tif->tif_dir.td_compression))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_PHOTOMETRIC))
-			{
-				if (!TIFFWriteDirectoryTagShort(tif,&ndir,dir,TIFFTAG_PHOTOMETRIC,tif->tif_dir.td_photometric))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_THRESHHOLDING))
-			{
-				if (!TIFFWriteDirectoryTagShort(tif,&ndir,dir,TIFFTAG_THRESHHOLDING,tif->tif_dir.td_threshholding))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_FILLORDER))
-			{
-				if (!TIFFWriteDirectoryTagShort(tif,&ndir,dir,TIFFTAG_FILLORDER,tif->tif_dir.td_fillorder))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_ORIENTATION))
-			{
-				if (!TIFFWriteDirectoryTagShort(tif,&ndir,dir,TIFFTAG_ORIENTATION,tif->tif_dir.td_orientation))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_SAMPLESPERPIXEL))
-			{
-				if (!TIFFWriteDirectoryTagShort(tif,&ndir,dir,TIFFTAG_SAMPLESPERPIXEL,tif->tif_dir.td_samplesperpixel))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_ROWSPERSTRIP))
-			{
-				if (!TIFFWriteDirectoryTagShortLong(tif,&ndir,dir,TIFFTAG_ROWSPERSTRIP,tif->tif_dir.td_rowsperstrip))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_MINSAMPLEVALUE))
-			{
-				if (!TIFFWriteDirectoryTagShortPerSample(tif,&ndir,dir,TIFFTAG_MINSAMPLEVALUE,tif->tif_dir.td_minsamplevalue))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_MAXSAMPLEVALUE))
-			{
-				if (!TIFFWriteDirectoryTagShortPerSample(tif,&ndir,dir,TIFFTAG_MAXSAMPLEVALUE,tif->tif_dir.td_maxsamplevalue))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_PLANARCONFIG))
-			{
-				if (!TIFFWriteDirectoryTagShort(tif,&ndir,dir,TIFFTAG_PLANARCONFIG,tif->tif_dir.td_planarconfig))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_RESOLUTIONUNIT))
-			{
-				if (!TIFFWriteDirectoryTagShort(tif,&ndir,dir,TIFFTAG_RESOLUTIONUNIT,tif->tif_dir.td_resolutionunit))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_PAGENUMBER))
-			{
-				if (!TIFFWriteDirectoryTagShortArray(tif,&ndir,dir,TIFFTAG_PAGENUMBER,2,&tif->tif_dir.td_pagenumber[0]))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_STRIPBYTECOUNTS))
-			{
-				if (!isTiled(tif))
-				{
-					if (!TIFFWriteDirectoryTagLongLong8Array(tif,&ndir,dir,TIFFTAG_STRIPBYTECOUNTS,tif->tif_dir.td_nstrips,tif->tif_dir.td_stripbytecount_p))
-						goto bad;
-				}
-				else
-				{
-					if (!TIFFWriteDirectoryTagLongLong8Array(tif,&ndir,dir,TIFFTAG_TILEBYTECOUNTS,tif->tif_dir.td_nstrips,tif->tif_dir.td_stripbytecount_p))
-						goto bad;
-				}
-			}
-			if (TIFFFieldSet(tif,FIELD_STRIPOFFSETS))
-			{
-				if (!isTiled(tif))
-				{
+                    tif->tif_diroff = 0;
+                    /* Force a full-traversal to reach the zeroed pointer */
+                    tif->tif_lastdiroff = 0;
+                    break;
+                }
+                nextdir = nextnextdir;
+            }
+        }
+        /* Remove skipped offset from IFD loop directory list. */
+        _TIFFRemoveEntryFromDirectoryListByOffset(tif, torewritediroff);
+    }
+
+    /*
+     * Now use TIFFWriteDirectory() normally.
+     */
+
+    return TIFFWriteDirectory(tif);
+}
+
+static int TIFFWriteDirectorySec(TIFF *tif, int isimage, int imagedone,
+                                 uint64_t *pdiroff)
+{
+    static const char module[] = "TIFFWriteDirectorySec";
+    uint32_t ndir;
+    TIFFDirEntry *dir;
+    uint32_t dirsize;
+    void *dirmem;
+    uint32_t m;
+    if (tif->tif_mode == O_RDONLY)
+        return (1);
+
+    _TIFFFillStriles(tif);
+
+    /*
+     * Clear write state so that subsequent images with
+     * different characteristics get the right buffers
+     * setup for them.
+     */
+    if (imagedone)
+    {
+        if (tif->tif_flags & TIFF_POSTENCODE)
+        {
+            tif->tif_flags &= ~TIFF_POSTENCODE;
+            if (!(*tif->tif_postencode)(tif))
+            {
+                TIFFErrorExtR(tif, module,
+                              "Error post-encoding before directory write");
+                return (0);
+            }
+        }
+        (*tif->tif_close)(tif); /* shutdown encoder */
+        /*
+         * Flush any data that might have been written
+         * by the compression close+cleanup routines.  But
+         * be careful not to write stuff if we didn't add data
+         * in the previous steps as the "rawcc" data may well be
+         * a previously read tile/strip in mixed read/write mode.
+         */
+        if (tif->tif_rawcc > 0 && (tif->tif_flags & TIFF_BEENWRITING) != 0)
+        {
+            if (!TIFFFlushData1(tif))
+            {
+                TIFFErrorExtR(tif, module,
+                              "Error flushing data before directory write");
+                return (0);
+            }
+        }
+        if ((tif->tif_flags & TIFF_MYBUFFER) && tif->tif_rawdata)
+        {
+            _TIFFfreeExt(tif, tif->tif_rawdata);
+            tif->tif_rawdata = NULL;
+            tif->tif_rawcc = 0;
+            tif->tif_rawdatasize = 0;
+            tif->tif_rawdataoff = 0;
+            tif->tif_rawdataloaded = 0;
+        }
+        tif->tif_flags &= ~(TIFF_BEENWRITING | TIFF_BUFFERSETUP);
+    }
+
+    if (TIFFFieldSet(tif, FIELD_COMPRESSION) &&
+        (tif->tif_dir.td_compression == COMPRESSION_DEFLATE))
+    {
+        TIFFWarningExtR(tif, module,
+                        "Creating TIFF with legacy Deflate codec identifier, "
+                        "COMPRESSION_ADOBE_DEFLATE is more widely supported");
+    }
+    dir = NULL;
+    dirmem = NULL;
+    dirsize = 0;
+    while (1)
+    {
+        ndir = 0;
+        if (isimage)
+        {
+            if (TIFFFieldSet(tif, FIELD_IMAGEDIMENSIONS))
+            {
+                if (!TIFFWriteDirectoryTagShortLong(tif, &ndir, dir,
+                                                    TIFFTAG_IMAGEWIDTH,
+                                                    tif->tif_dir.td_imagewidth))
+                    goto bad;
+                if (!TIFFWriteDirectoryTagShortLong(
+                        tif, &ndir, dir, TIFFTAG_IMAGELENGTH,
+                        tif->tif_dir.td_imagelength))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_TILEDIMENSIONS))
+            {
+                if (!TIFFWriteDirectoryTagShortLong(tif, &ndir, dir,
+                                                    TIFFTAG_TILEWIDTH,
+                                                    tif->tif_dir.td_tilewidth))
+                    goto bad;
+                if (!TIFFWriteDirectoryTagShortLong(tif, &ndir, dir,
+                                                    TIFFTAG_TILELENGTH,
+                                                    tif->tif_dir.td_tilelength))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_RESOLUTION))
+            {
+                if (!TIFFWriteDirectoryTagRational(tif, &ndir, dir,
+                                                   TIFFTAG_XRESOLUTION,
+                                                   tif->tif_dir.td_xresolution))
+                    goto bad;
+                if (!TIFFWriteDirectoryTagRational(tif, &ndir, dir,
+                                                   TIFFTAG_YRESOLUTION,
+                                                   tif->tif_dir.td_yresolution))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_POSITION))
+            {
+                if (!TIFFWriteDirectoryTagRational(tif, &ndir, dir,
+                                                   TIFFTAG_XPOSITION,
+                                                   tif->tif_dir.td_xposition))
+                    goto bad;
+                if (!TIFFWriteDirectoryTagRational(tif, &ndir, dir,
+                                                   TIFFTAG_YPOSITION,
+                                                   tif->tif_dir.td_yposition))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_SUBFILETYPE))
+            {
+                if (!TIFFWriteDirectoryTagLong(tif, &ndir, dir,
+                                               TIFFTAG_SUBFILETYPE,
+                                               tif->tif_dir.td_subfiletype))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_BITSPERSAMPLE))
+            {
+                if (!TIFFWriteDirectoryTagShortPerSample(
+                        tif, &ndir, dir, TIFFTAG_BITSPERSAMPLE,
+                        tif->tif_dir.td_bitspersample))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_COMPRESSION))
+            {
+                if (!TIFFWriteDirectoryTagShort(tif, &ndir, dir,
+                                                TIFFTAG_COMPRESSION,
+                                                tif->tif_dir.td_compression))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_PHOTOMETRIC))
+            {
+                if (!TIFFWriteDirectoryTagShort(tif, &ndir, dir,
+                                                TIFFTAG_PHOTOMETRIC,
+                                                tif->tif_dir.td_photometric))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_THRESHHOLDING))
+            {
+                if (!TIFFWriteDirectoryTagShort(tif, &ndir, dir,
+                                                TIFFTAG_THRESHHOLDING,
+                                                tif->tif_dir.td_threshholding))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_FILLORDER))
+            {
+                if (!TIFFWriteDirectoryTagShort(tif, &ndir, dir,
+                                                TIFFTAG_FILLORDER,
+                                                tif->tif_dir.td_fillorder))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_ORIENTATION))
+            {
+                if (!TIFFWriteDirectoryTagShort(tif, &ndir, dir,
+                                                TIFFTAG_ORIENTATION,
+                                                tif->tif_dir.td_orientation))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_SAMPLESPERPIXEL))
+            {
+                if (!TIFFWriteDirectoryTagShort(
+                        tif, &ndir, dir, TIFFTAG_SAMPLESPERPIXEL,
+                        tif->tif_dir.td_samplesperpixel))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_ROWSPERSTRIP))
+            {
+                if (!TIFFWriteDirectoryTagShortLong(
+                        tif, &ndir, dir, TIFFTAG_ROWSPERSTRIP,
+                        tif->tif_dir.td_rowsperstrip))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_MINSAMPLEVALUE))
+            {
+                if (!TIFFWriteDirectoryTagShortPerSample(
+                        tif, &ndir, dir, TIFFTAG_MINSAMPLEVALUE,
+                        tif->tif_dir.td_minsamplevalue))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_MAXSAMPLEVALUE))
+            {
+                if (!TIFFWriteDirectoryTagShortPerSample(
+                        tif, &ndir, dir, TIFFTAG_MAXSAMPLEVALUE,
+                        tif->tif_dir.td_maxsamplevalue))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_PLANARCONFIG))
+            {
+                if (!TIFFWriteDirectoryTagShort(tif, &ndir, dir,
+                                                TIFFTAG_PLANARCONFIG,
+                                                tif->tif_dir.td_planarconfig))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_RESOLUTIONUNIT))
+            {
+                if (!TIFFWriteDirectoryTagShort(tif, &ndir, dir,
+                                                TIFFTAG_RESOLUTIONUNIT,
+                                                tif->tif_dir.td_resolutionunit))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_PAGENUMBER))
+            {
+                if (!TIFFWriteDirectoryTagShortArray(
+                        tif, &ndir, dir, TIFFTAG_PAGENUMBER, 2,
+                        &tif->tif_dir.td_pagenumber[0]))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_STRIPBYTECOUNTS))
+            {
+                if (!isTiled(tif))
+                {
+                    if (!TIFFWriteDirectoryTagLongLong8Array(
+                            tif, &ndir, dir, TIFFTAG_STRIPBYTECOUNTS,
+                            tif->tif_dir.td_nstrips,
+                            tif->tif_dir.td_stripbytecount_p))
+                        goto bad;
+                }
+                else
+                {
+                    if (!TIFFWriteDirectoryTagLongLong8Array(
+                            tif, &ndir, dir, TIFFTAG_TILEBYTECOUNTS,
+                            tif->tif_dir.td_nstrips,
+                            tif->tif_dir.td_stripbytecount_p))
+                        goto bad;
+                }
+            }
+            if (TIFFFieldSet(tif, FIELD_STRIPOFFSETS))
+            {
+                if (!isTiled(tif))
+                {
                     /* td_stripoffset_p might be NULL in an odd OJPEG case. See
                      *  tif_dirread.c around line 3634.
                      * XXX: OJPEG hack.
@@ -610,1174 +720,1108 @@ TIFFWriteDirectorySec(TIFF* tif, int isimage, int imagedone, uint64* pdiroff)
                      * JpegInterchangeFormat stream.
                      * We can get here when using tiffset on such a file.
                      * See http://bugzilla.maptools.org/show_bug.cgi?id=2500
-                    */
+                     */
                     if (tif->tif_dir.td_stripoffset_p != NULL &&
-                        !TIFFWriteDirectoryTagLongLong8Array(tif,&ndir,dir,TIFFTAG_STRIPOFFSETS,tif->tif_dir.td_nstrips,tif->tif_dir.td_stripoffset_p))
+                        !TIFFWriteDirectoryTagLongLong8Array(
+                            tif, &ndir, dir, TIFFTAG_STRIPOFFSETS,
+                            tif->tif_dir.td_nstrips,
+                            tif->tif_dir.td_stripoffset_p))
+                        goto bad;
+                }
+                else
+                {
+                    if (!TIFFWriteDirectoryTagLongLong8Array(
+                            tif, &ndir, dir, TIFFTAG_TILEOFFSETS,
+                            tif->tif_dir.td_nstrips,
+                            tif->tif_dir.td_stripoffset_p))
+                        goto bad;
+                }
+            }
+            if (TIFFFieldSet(tif, FIELD_COLORMAP))
+            {
+                if (!TIFFWriteDirectoryTagColormap(tif, &ndir, dir))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_EXTRASAMPLES))
+            {
+                if (tif->tif_dir.td_extrasamples)
+                {
+                    uint16_t na;
+                    uint16_t *nb;
+                    TIFFGetFieldDefaulted(tif, TIFFTAG_EXTRASAMPLES, &na, &nb);
+                    if (!TIFFWriteDirectoryTagShortArray(
+                            tif, &ndir, dir, TIFFTAG_EXTRASAMPLES, na, nb))
                         goto bad;
-				}
-				else
-				{
-					if (!TIFFWriteDirectoryTagLongLong8Array(tif,&ndir,dir,TIFFTAG_TILEOFFSETS,tif->tif_dir.td_nstrips,tif->tif_dir.td_stripoffset_p))
-						goto bad;
-				}
-			}
-			if (TIFFFieldSet(tif,FIELD_COLORMAP))
-			{
-				if (!TIFFWriteDirectoryTagColormap(tif,&ndir,dir))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_EXTRASAMPLES))
-			{
-				if (tif->tif_dir.td_extrasamples)
-				{
-					uint16 na;
-					uint16* nb;
-					TIFFGetFieldDefaulted(tif,TIFFTAG_EXTRASAMPLES,&na,&nb);
-					if (!TIFFWriteDirectoryTagShortArray(tif,&ndir,dir,TIFFTAG_EXTRASAMPLES,na,nb))
-						goto bad;
-				}
-			}
-			if (TIFFFieldSet(tif,FIELD_SAMPLEFORMAT))
-			{
-				if (!TIFFWriteDirectoryTagShortPerSample(tif,&ndir,dir,TIFFTAG_SAMPLEFORMAT,tif->tif_dir.td_sampleformat))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_SMINSAMPLEVALUE))
-			{
-				if (!TIFFWriteDirectoryTagSampleformatArray(tif,&ndir,dir,TIFFTAG_SMINSAMPLEVALUE,tif->tif_dir.td_samplesperpixel,tif->tif_dir.td_sminsamplevalue))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_SMAXSAMPLEVALUE))
-			{
-				if (!TIFFWriteDirectoryTagSampleformatArray(tif,&ndir,dir,TIFFTAG_SMAXSAMPLEVALUE,tif->tif_dir.td_samplesperpixel,tif->tif_dir.td_smaxsamplevalue))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_IMAGEDEPTH))
-			{
-				if (!TIFFWriteDirectoryTagLong(tif,&ndir,dir,TIFFTAG_IMAGEDEPTH,tif->tif_dir.td_imagedepth))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_TILEDEPTH))
-			{
-				if (!TIFFWriteDirectoryTagLong(tif,&ndir,dir,TIFFTAG_TILEDEPTH,tif->tif_dir.td_tiledepth))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_HALFTONEHINTS))
-			{
-				if (!TIFFWriteDirectoryTagShortArray(tif,&ndir,dir,TIFFTAG_HALFTONEHINTS,2,&tif->tif_dir.td_halftonehints[0]))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_YCBCRSUBSAMPLING))
-			{
-				if (!TIFFWriteDirectoryTagShortArray(tif,&ndir,dir,TIFFTAG_YCBCRSUBSAMPLING,2,&tif->tif_dir.td_ycbcrsubsampling[0]))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_YCBCRPOSITIONING))
-			{
-				if (!TIFFWriteDirectoryTagShort(tif,&ndir,dir,TIFFTAG_YCBCRPOSITIONING,tif->tif_dir.td_ycbcrpositioning))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_REFBLACKWHITE))
-			{
-				if (!TIFFWriteDirectoryTagRationalArray(tif,&ndir,dir,TIFFTAG_REFERENCEBLACKWHITE,6,tif->tif_dir.td_refblackwhite))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_TRANSFERFUNCTION))
-			{
-				if (!TIFFWriteDirectoryTagTransferfunction(tif,&ndir,dir))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_INKNAMES))
-			{
-				if (!TIFFWriteDirectoryTagAscii(tif,&ndir,dir,TIFFTAG_INKNAMES,tif->tif_dir.td_inknameslen,tif->tif_dir.td_inknames))
-					goto bad;
-			}
-			if (TIFFFieldSet(tif,FIELD_SUBIFD))
-			{
-				if (!TIFFWriteDirectoryTagSubifd(tif,&ndir,dir))
-					goto bad;
-			}
-			{
-				uint32 n;
-				for (n=0; n<tif->tif_nfields; n++) {
-					const TIFFField* o;
-					o = tif->tif_fields[n];
-					if ((o->field_bit>=FIELD_CODEC)&&(TIFFFieldSet(tif,o->field_bit)))
-					{
-						switch (o->get_field_type)
-						{
-							case TIFF_SETGET_ASCII:
-								{
-									uint32 pa;
-									char* pb;
-									assert(o->field_type==TIFF_ASCII);
-									assert(o->field_readcount==TIFF_VARIABLE);
-									assert(o->field_passcount==0);
-									TIFFGetField(tif,o->field_tag,&pb);
-									pa=(uint32)(strlen(pb));
-									if (!TIFFWriteDirectoryTagAscii(tif,&ndir,dir,(uint16)o->field_tag,pa,pb))
-										goto bad;
-								}
-								break;
-							case TIFF_SETGET_UINT16:
-								{
-									uint16 p;
-									assert(o->field_type==TIFF_SHORT);
-									assert(o->field_readcount==1);
-									assert(o->field_passcount==0);
-									TIFFGetField(tif,o->field_tag,&p);
-									if (!TIFFWriteDirectoryTagShort(tif,&ndir,dir,(uint16)o->field_tag,p))
-										goto bad;
-								}
-								break;
-							case TIFF_SETGET_UINT32:
-								{
-									uint32 p;
-									assert(o->field_type==TIFF_LONG);
-									assert(o->field_readcount==1);
-									assert(o->field_passcount==0);
-									TIFFGetField(tif,o->field_tag,&p);
-									if (!TIFFWriteDirectoryTagLong(tif,&ndir,dir,(uint16)o->field_tag,p))
-										goto bad;
-								}
-								break;
-							case TIFF_SETGET_C32_UINT8:
-								{
-									uint32 pa;
-									void* pb;
-									assert(o->field_type==TIFF_UNDEFINED);
-									assert(o->field_readcount==TIFF_VARIABLE2);
-									assert(o->field_passcount==1);
-									TIFFGetField(tif,o->field_tag,&pa,&pb);
-									if (!TIFFWriteDirectoryTagUndefinedArray(tif,&ndir,dir,(uint16)o->field_tag,pa,pb))
-										goto bad;
-								}
-								break;
-							default:
-								TIFFErrorExt(tif->tif_clientdata,module,
-								            "Cannot write tag %d (%s)",
-								            TIFFFieldTag(o),
-                                                                            o->field_name ? o->field_name : "unknown");
-								goto bad;
-						}
-					}
-				}
-			}
-		}
-		for (m=0; m<(uint32)(tif->tif_dir.td_customValueCount); m++)
-		{
-                        uint16 tag = (uint16)tif->tif_dir.td_customValues[m].info->field_tag;
-                        uint32 count = tif->tif_dir.td_customValues[m].count;
-			switch (tif->tif_dir.td_customValues[m].info->field_type)
-			{
-				case TIFF_ASCII:
-					if (!TIFFWriteDirectoryTagAscii(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_UNDEFINED:
-					if (!TIFFWriteDirectoryTagUndefinedArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_BYTE:
-					if (!TIFFWriteDirectoryTagByteArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_SBYTE:
-					if (!TIFFWriteDirectoryTagSbyteArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_SHORT:
-					if (!TIFFWriteDirectoryTagShortArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_SSHORT:
-					if (!TIFFWriteDirectoryTagSshortArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_LONG:
-					if (!TIFFWriteDirectoryTagLongArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_SLONG:
-					if (!TIFFWriteDirectoryTagSlongArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_LONG8:
-					if (!TIFFWriteDirectoryTagLong8Array(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_SLONG8:
-					if (!TIFFWriteDirectoryTagSlong8Array(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_RATIONAL:
-					{
-						/*-- Rational2Double: For Rationals evaluate "set_field_type" to determine internal storage size. */
-						int tv_size;
-						tv_size = _TIFFSetGetFieldSize(tif->tif_dir.td_customValues[m].info->set_field_type);
-						if (tv_size == 8) {
-							if (!TIFFWriteDirectoryTagRationalDoubleArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-								goto bad;
-						} else {
-							/*-- default should be tv_size == 4 */
-							if (!TIFFWriteDirectoryTagRationalArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-								goto bad;
-							/*-- ToDo: After Testing, this should be removed and tv_size==4 should be set as default. */
-							if (tv_size != 4) {
-								TIFFErrorExt(0,"TIFFLib: _TIFFWriteDirectorySec()", "Rational2Double: .set_field_type in not 4 but %d", tv_size); 
-							}
-						}
-					}
-					break;
-				case TIFF_SRATIONAL:
-					{
-						/*-- Rational2Double: For Rationals evaluate "set_field_type" to determine internal storage size. */
-						int tv_size;
-						tv_size = _TIFFSetGetFieldSize(tif->tif_dir.td_customValues[m].info->set_field_type);
-						if (tv_size == 8) {
-							if (!TIFFWriteDirectoryTagSrationalDoubleArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-								goto bad;
-						} else {
-							/*-- default should be tv_size == 4 */
-							if (!TIFFWriteDirectoryTagSrationalArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-								goto bad;
-							/*-- ToDo: After Testing, this should be removed and tv_size==4 should be set as default. */
-							if (tv_size != 4) {
-								TIFFErrorExt(0,"TIFFLib: _TIFFWriteDirectorySec()", "Rational2Double: .set_field_type in not 4 but %d", tv_size); 
-							}
-						}
-					}
-					break;
-				case TIFF_FLOAT:
-					if (!TIFFWriteDirectoryTagFloatArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_DOUBLE:
-					if (!TIFFWriteDirectoryTagDoubleArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_IFD:
-					if (!TIFFWriteDirectoryTagIfdArray(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				case TIFF_IFD8:
-					if (!TIFFWriteDirectoryTagIfdIfd8Array(tif,&ndir,dir,tag,count,tif->tif_dir.td_customValues[m].value))
-						goto bad;
-					break;
-				default:
-					assert(0);   /* we should never get here */
-					break;
-			}
-		}
-		if (dir!=NULL)
-			break;
-		dir=_TIFFmalloc(ndir*sizeof(TIFFDirEntry));
-		if (dir==NULL)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-			goto bad;
-		}
-		if (isimage)
-		{
-			if ((tif->tif_diroff==0)&&(!TIFFLinkDirectory(tif)))
-				goto bad;
-		}
-		else
-			tif->tif_diroff=(TIFFSeekFile(tif,0,SEEK_END)+1)&(~((toff_t)1));
-		if (pdiroff!=NULL)
-			*pdiroff=tif->tif_diroff;
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-			dirsize=2+ndir*12+4;
-		else
-			dirsize=8+ndir*20+8;
-		tif->tif_dataoff=tif->tif_diroff+dirsize;
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-			tif->tif_dataoff=(uint32)tif->tif_dataoff;
-		if ((tif->tif_dataoff<tif->tif_diroff)||(tif->tif_dataoff<(uint64)dirsize))
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"Maximum TIFF file size exceeded");
-			goto bad;
-		}
-		if (tif->tif_dataoff&1)
-			tif->tif_dataoff++;
-		if (isimage)
-			tif->tif_curdir++;
-	}
-	if (isimage)
-	{
-		if (TIFFFieldSet(tif,FIELD_SUBIFD)&&(tif->tif_subifdoff==0))
-		{
-			uint32 na;
-			TIFFDirEntry* nb;
-			for (na=0, nb=dir; ; na++, nb++)
-			{
-				if( na == ndir )
-                                {
-                                    TIFFErrorExt(tif->tif_clientdata,module,
-                                                 "Cannot find SubIFD tag");
+                }
+            }
+            if (TIFFFieldSet(tif, FIELD_SAMPLEFORMAT))
+            {
+                if (!TIFFWriteDirectoryTagShortPerSample(
+                        tif, &ndir, dir, TIFFTAG_SAMPLEFORMAT,
+                        tif->tif_dir.td_sampleformat))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_SMINSAMPLEVALUE))
+            {
+                if (!TIFFWriteDirectoryTagSampleformatArray(
+                        tif, &ndir, dir, TIFFTAG_SMINSAMPLEVALUE,
+                        tif->tif_dir.td_samplesperpixel,
+                        tif->tif_dir.td_sminsamplevalue))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_SMAXSAMPLEVALUE))
+            {
+                if (!TIFFWriteDirectoryTagSampleformatArray(
+                        tif, &ndir, dir, TIFFTAG_SMAXSAMPLEVALUE,
+                        tif->tif_dir.td_samplesperpixel,
+                        tif->tif_dir.td_smaxsamplevalue))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_IMAGEDEPTH))
+            {
+                if (!TIFFWriteDirectoryTagLong(tif, &ndir, dir,
+                                               TIFFTAG_IMAGEDEPTH,
+                                               tif->tif_dir.td_imagedepth))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_TILEDEPTH))
+            {
+                if (!TIFFWriteDirectoryTagLong(tif, &ndir, dir,
+                                               TIFFTAG_TILEDEPTH,
+                                               tif->tif_dir.td_tiledepth))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_HALFTONEHINTS))
+            {
+                if (!TIFFWriteDirectoryTagShortArray(
+                        tif, &ndir, dir, TIFFTAG_HALFTONEHINTS, 2,
+                        &tif->tif_dir.td_halftonehints[0]))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_YCBCRSUBSAMPLING))
+            {
+                if (!TIFFWriteDirectoryTagShortArray(
+                        tif, &ndir, dir, TIFFTAG_YCBCRSUBSAMPLING, 2,
+                        &tif->tif_dir.td_ycbcrsubsampling[0]))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_YCBCRPOSITIONING))
+            {
+                if (!TIFFWriteDirectoryTagShort(
+                        tif, &ndir, dir, TIFFTAG_YCBCRPOSITIONING,
+                        tif->tif_dir.td_ycbcrpositioning))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_REFBLACKWHITE))
+            {
+                if (!TIFFWriteDirectoryTagRationalArray(
+                        tif, &ndir, dir, TIFFTAG_REFERENCEBLACKWHITE, 6,
+                        tif->tif_dir.td_refblackwhite))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_TRANSFERFUNCTION))
+            {
+                if (!TIFFWriteDirectoryTagTransferfunction(tif, &ndir, dir))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_INKNAMES))
+            {
+                if (!TIFFWriteDirectoryTagAscii(
+                        tif, &ndir, dir, TIFFTAG_INKNAMES,
+                        tif->tif_dir.td_inknameslen, tif->tif_dir.td_inknames))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_NUMBEROFINKS))
+            {
+                if (!TIFFWriteDirectoryTagShort(tif, &ndir, dir,
+                                                TIFFTAG_NUMBEROFINKS,
+                                                tif->tif_dir.td_numberofinks))
+                    goto bad;
+            }
+            if (TIFFFieldSet(tif, FIELD_SUBIFD))
+            {
+                if (!TIFFWriteDirectoryTagSubifd(tif, &ndir, dir))
+                    goto bad;
+            }
+            {
+                uint32_t n;
+                for (n = 0; n < tif->tif_nfields; n++)
+                {
+                    const TIFFField *o;
+                    o = tif->tif_fields[n];
+                    if ((o->field_bit >= FIELD_CODEC) &&
+                        (TIFFFieldSet(tif, o->field_bit)))
+                    {
+                        switch (o->get_field_type)
+                        {
+                            case TIFF_SETGET_ASCII:
+                            {
+                                uint32_t pa;
+                                char *pb;
+                                assert(o->field_type == TIFF_ASCII);
+                                assert(o->field_readcount == TIFF_VARIABLE);
+                                assert(o->field_passcount == 0);
+                                TIFFGetField(tif, o->field_tag, &pb);
+                                pa = (uint32_t)(strlen(pb));
+                                if (!TIFFWriteDirectoryTagAscii(
+                                        tif, &ndir, dir, (uint16_t)o->field_tag,
+                                        pa, pb))
                                     goto bad;
-                                }
-				if (nb->tdir_tag==TIFFTAG_SUBIFD)
-					break;
-			}
-			if (!(tif->tif_flags&TIFF_BIGTIFF))
-				tif->tif_subifdoff=tif->tif_diroff+2+na*12+8;
-			else
-				tif->tif_subifdoff=tif->tif_diroff+8+na*20+12;
-		}
-	}
-	dirmem=_TIFFmalloc(dirsize);
-	if (dirmem==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		goto bad;
-	}
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-	{
-		uint8* n;
-		uint32 nTmp;
-		TIFFDirEntry* o;
-		n=dirmem;
-		*(uint16*)n=(uint16)ndir;
-		if (tif->tif_flags&TIFF_SWAB)
-			TIFFSwabShort((uint16*)n);
-		n+=2;
-		o=dir;
-		for (m=0; m<ndir; m++)
-		{
-			*(uint16*)n=o->tdir_tag;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabShort((uint16*)n);
-			n+=2;
-			*(uint16*)n=o->tdir_type;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabShort((uint16*)n);
-			n+=2;
-			nTmp = (uint32)o->tdir_count;
-			_TIFFmemcpy(n,&nTmp,4);
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabLong((uint32*)n);
-			n+=4;
-			/* This is correct. The data has been */
-			/* swabbed previously in TIFFWriteDirectoryTagData */
-			_TIFFmemcpy(n,&o->tdir_offset,4);
-			n+=4;
-			o++;
-		}
-		nTmp = (uint32)tif->tif_nextdiroff;
-		if (tif->tif_flags&TIFF_SWAB)
-			TIFFSwabLong(&nTmp);
-		_TIFFmemcpy(n,&nTmp,4);
-	}
-	else
-	{
-		uint8* n;
-		TIFFDirEntry* o;
-		n=dirmem;
-		*(uint64*)n=ndir;
-		if (tif->tif_flags&TIFF_SWAB)
-			TIFFSwabLong8((uint64*)n);
-		n+=8;
-		o=dir;
-		for (m=0; m<ndir; m++)
-		{
-			*(uint16*)n=o->tdir_tag;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabShort((uint16*)n);
-			n+=2;
-			*(uint16*)n=o->tdir_type;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabShort((uint16*)n);
-			n+=2;
-			_TIFFmemcpy(n,&o->tdir_count,8);
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabLong8((uint64*)n);
-			n+=8;
-			_TIFFmemcpy(n,&o->tdir_offset,8);
-			n+=8;
-			o++;
-		}
-		_TIFFmemcpy(n,&tif->tif_nextdiroff,8);
-		if (tif->tif_flags&TIFF_SWAB)
-			TIFFSwabLong8((uint64*)n);
-	}
-	_TIFFfree(dir);
-	dir=NULL;
-	if (!SeekOK(tif,tif->tif_diroff))
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"IO error writing directory");
-		goto bad;
-	}
-	if (!WriteOK(tif,dirmem,(tmsize_t)dirsize))
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"IO error writing directory");
-		goto bad;
-	}
-	_TIFFfree(dirmem);
-	if (imagedone)
-	{
-		TIFFFreeDirectory(tif);
-		tif->tif_flags &= ~TIFF_DIRTYDIRECT;
-		tif->tif_flags &= ~TIFF_DIRTYSTRIP;
-		(*tif->tif_cleanup)(tif);
-		/*
-		* Reset directory-related state for subsequent
-		* directories.
-		*/
-		TIFFCreateDirectory(tif);
-	}
-	return(1);
+                            }
+                            break;
+                            case TIFF_SETGET_UINT16:
+                            {
+                                uint16_t p;
+                                assert(o->field_type == TIFF_SHORT);
+                                assert(o->field_readcount == 1);
+                                assert(o->field_passcount == 0);
+                                TIFFGetField(tif, o->field_tag, &p);
+                                if (!TIFFWriteDirectoryTagShort(
+                                        tif, &ndir, dir, (uint16_t)o->field_tag,
+                                        p))
+                                    goto bad;
+                            }
+                            break;
+                            case TIFF_SETGET_UINT32:
+                            {
+                                uint32_t p;
+                                assert(o->field_type == TIFF_LONG);
+                                assert(o->field_readcount == 1);
+                                assert(o->field_passcount == 0);
+                                TIFFGetField(tif, o->field_tag, &p);
+                                if (!TIFFWriteDirectoryTagLong(
+                                        tif, &ndir, dir, (uint16_t)o->field_tag,
+                                        p))
+                                    goto bad;
+                            }
+                            break;
+                            case TIFF_SETGET_C32_UINT8:
+                            {
+                                uint32_t pa;
+                                void *pb;
+                                assert(o->field_type == TIFF_UNDEFINED);
+                                assert(o->field_readcount == TIFF_VARIABLE2);
+                                assert(o->field_passcount == 1);
+                                TIFFGetField(tif, o->field_tag, &pa, &pb);
+                                if (!TIFFWriteDirectoryTagUndefinedArray(
+                                        tif, &ndir, dir, (uint16_t)o->field_tag,
+                                        pa, pb))
+                                    goto bad;
+                            }
+                            break;
+                            default:
+                                TIFFErrorExtR(
+                                    tif, module,
+                                    "Cannot write tag %" PRIu32 " (%s)",
+                                    TIFFFieldTag(o),
+                                    o->field_name ? o->field_name : "unknown");
+                                goto bad;
+                        }
+                    }
+                }
+            }
+        }
+        for (m = 0; m < (uint32_t)(tif->tif_dir.td_customValueCount); m++)
+        {
+            uint16_t tag =
+                (uint16_t)tif->tif_dir.td_customValues[m].info->field_tag;
+            uint32_t count = tif->tif_dir.td_customValues[m].count;
+            switch (tif->tif_dir.td_customValues[m].info->field_type)
+            {
+                case TIFF_ASCII:
+                    if (!TIFFWriteDirectoryTagAscii(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_UNDEFINED:
+                    if (!TIFFWriteDirectoryTagUndefinedArray(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_BYTE:
+                    if (!TIFFWriteDirectoryTagByteArray(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_SBYTE:
+                    if (!TIFFWriteDirectoryTagSbyteArray(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_SHORT:
+                    if (!TIFFWriteDirectoryTagShortArray(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_SSHORT:
+                    if (!TIFFWriteDirectoryTagSshortArray(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_LONG:
+                    if (!TIFFWriteDirectoryTagLongArray(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_SLONG:
+                    if (!TIFFWriteDirectoryTagSlongArray(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_LONG8:
+                    if (!TIFFWriteDirectoryTagLong8Array(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_SLONG8:
+                    if (!TIFFWriteDirectoryTagSlong8Array(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_RATIONAL:
+                {
+                    /*-- Rational2Double: For Rationals evaluate
+                     * "set_field_type" to determine internal storage size. */
+                    int tv_size;
+                    tv_size = TIFFFieldSetGetSize(
+                        tif->tif_dir.td_customValues[m].info);
+                    if (tv_size == 8)
+                    {
+                        if (!TIFFWriteDirectoryTagRationalDoubleArray(
+                                tif, &ndir, dir, tag, count,
+                                tif->tif_dir.td_customValues[m].value))
+                            goto bad;
+                    }
+                    else
+                    {
+                        /*-- default should be tv_size == 4 */
+                        if (!TIFFWriteDirectoryTagRationalArray(
+                                tif, &ndir, dir, tag, count,
+                                tif->tif_dir.td_customValues[m].value))
+                            goto bad;
+                        /*-- ToDo: After Testing, this should be removed and
+                         * tv_size==4 should be set as default. */
+                        if (tv_size != 4)
+                        {
+                            TIFFErrorExtR(tif,
+                                          "TIFFLib: _TIFFWriteDirectorySec()",
+                                          "Rational2Double: .set_field_type is "
+                                          "not 4 but %d",
+                                          tv_size);
+                        }
+                    }
+                }
+                break;
+                case TIFF_SRATIONAL:
+                {
+                    /*-- Rational2Double: For Rationals evaluate
+                     * "set_field_type" to determine internal storage size. */
+                    int tv_size;
+                    tv_size = TIFFFieldSetGetSize(
+                        tif->tif_dir.td_customValues[m].info);
+                    if (tv_size == 8)
+                    {
+                        if (!TIFFWriteDirectoryTagSrationalDoubleArray(
+                                tif, &ndir, dir, tag, count,
+                                tif->tif_dir.td_customValues[m].value))
+                            goto bad;
+                    }
+                    else
+                    {
+                        /*-- default should be tv_size == 4 */
+                        if (!TIFFWriteDirectoryTagSrationalArray(
+                                tif, &ndir, dir, tag, count,
+                                tif->tif_dir.td_customValues[m].value))
+                            goto bad;
+                        /*-- ToDo: After Testing, this should be removed and
+                         * tv_size==4 should be set as default. */
+                        if (tv_size != 4)
+                        {
+                            TIFFErrorExtR(tif,
+                                          "TIFFLib: _TIFFWriteDirectorySec()",
+                                          "Rational2Double: .set_field_type is "
+                                          "not 4 but %d",
+                                          tv_size);
+                        }
+                    }
+                }
+                break;
+                case TIFF_FLOAT:
+                    if (!TIFFWriteDirectoryTagFloatArray(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_DOUBLE:
+                    if (!TIFFWriteDirectoryTagDoubleArray(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_IFD:
+                    if (!TIFFWriteDirectoryTagIfdArray(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                case TIFF_IFD8:
+                    if (!TIFFWriteDirectoryTagIfdIfd8Array(
+                            tif, &ndir, dir, tag, count,
+                            tif->tif_dir.td_customValues[m].value))
+                        goto bad;
+                    break;
+                default:
+                    assert(0); /* we should never get here */
+                    break;
+            }
+        }
+        if (dir != NULL)
+            break;
+        dir = _TIFFmallocExt(tif, ndir * sizeof(TIFFDirEntry));
+        if (dir == NULL)
+        {
+            TIFFErrorExtR(tif, module, "Out of memory");
+            goto bad;
+        }
+        if (isimage)
+        {
+            if ((tif->tif_diroff == 0) && (!TIFFLinkDirectory(tif)))
+                goto bad;
+        }
+        else
+            tif->tif_diroff =
+                (TIFFSeekFile(tif, 0, SEEK_END) + 1) & (~((toff_t)1));
+        if (pdiroff != NULL)
+            *pdiroff = tif->tif_diroff;
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+            dirsize = 2 + ndir * 12 + 4;
+        else
+            dirsize = 8 + ndir * 20 + 8;
+        tif->tif_dataoff = tif->tif_diroff + dirsize;
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+            tif->tif_dataoff = (uint32_t)tif->tif_dataoff;
+        if ((tif->tif_dataoff < tif->tif_diroff) ||
+            (tif->tif_dataoff < (uint64_t)dirsize))
+        {
+            TIFFErrorExtR(tif, module, "Maximum TIFF file size exceeded");
+            goto bad;
+        }
+        if (tif->tif_dataoff & 1)
+            tif->tif_dataoff++;
+        if (isimage)
+        {
+            if (tif->tif_curdir == TIFF_NON_EXISTENT_DIR_NUMBER)
+                tif->tif_curdir = 0;
+            else
+                tif->tif_curdir++;
+        }
+    }
+    if (isimage)
+    {
+        if (TIFFFieldSet(tif, FIELD_SUBIFD) && (tif->tif_subifdoff == 0))
+        {
+            uint32_t na;
+            TIFFDirEntry *nb;
+            for (na = 0, nb = dir;; na++, nb++)
+            {
+                if (na == ndir)
+                {
+                    TIFFErrorExtR(tif, module, "Cannot find SubIFD tag");
+                    goto bad;
+                }
+                if (nb->tdir_tag == TIFFTAG_SUBIFD)
+                    break;
+            }
+            if (!(tif->tif_flags & TIFF_BIGTIFF))
+                tif->tif_subifdoff = tif->tif_diroff + 2 + na * 12 + 8;
+            else
+                tif->tif_subifdoff = tif->tif_diroff + 8 + na * 20 + 12;
+        }
+    }
+    dirmem = _TIFFmallocExt(tif, dirsize);
+    if (dirmem == NULL)
+    {
+        TIFFErrorExtR(tif, module, "Out of memory");
+        goto bad;
+    }
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        uint8_t *n;
+        uint32_t nTmp;
+        TIFFDirEntry *o;
+        n = dirmem;
+        *(uint16_t *)n = (uint16_t)ndir;
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabShort((uint16_t *)n);
+        n += 2;
+        o = dir;
+        for (m = 0; m < ndir; m++)
+        {
+            *(uint16_t *)n = o->tdir_tag;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabShort((uint16_t *)n);
+            n += 2;
+            *(uint16_t *)n = o->tdir_type;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabShort((uint16_t *)n);
+            n += 2;
+            nTmp = (uint32_t)o->tdir_count;
+            _TIFFmemcpy(n, &nTmp, 4);
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong((uint32_t *)n);
+            n += 4;
+            /* This is correct. The data has been */
+            /* swabbed previously in TIFFWriteDirectoryTagData */
+            _TIFFmemcpy(n, &o->tdir_offset, 4);
+            n += 4;
+            o++;
+        }
+        nTmp = (uint32_t)tif->tif_nextdiroff;
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong(&nTmp);
+        _TIFFmemcpy(n, &nTmp, 4);
+    }
+    else
+    {
+        uint8_t *n;
+        TIFFDirEntry *o;
+        n = dirmem;
+        *(uint64_t *)n = ndir;
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong8((uint64_t *)n);
+        n += 8;
+        o = dir;
+        for (m = 0; m < ndir; m++)
+        {
+            *(uint16_t *)n = o->tdir_tag;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabShort((uint16_t *)n);
+            n += 2;
+            *(uint16_t *)n = o->tdir_type;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabShort((uint16_t *)n);
+            n += 2;
+            _TIFFmemcpy(n, &o->tdir_count, 8);
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8((uint64_t *)n);
+            n += 8;
+            _TIFFmemcpy(n, &o->tdir_offset, 8);
+            n += 8;
+            o++;
+        }
+        _TIFFmemcpy(n, &tif->tif_nextdiroff, 8);
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong8((uint64_t *)n);
+    }
+    _TIFFfreeExt(tif, dir);
+    dir = NULL;
+    if (!SeekOK(tif, tif->tif_diroff))
+    {
+        TIFFErrorExtR(tif, module, "IO error writing directory");
+        goto bad;
+    }
+    if (!WriteOK(tif, dirmem, (tmsize_t)dirsize))
+    {
+        TIFFErrorExtR(tif, module, "IO error writing directory");
+        goto bad;
+    }
+    _TIFFfreeExt(tif, dirmem);
+    if (imagedone)
+    {
+        TIFFFreeDirectory(tif);
+        tif->tif_flags &= ~TIFF_DIRTYDIRECT;
+        tif->tif_flags &= ~TIFF_DIRTYSTRIP;
+        (*tif->tif_cleanup)(tif);
+        /*
+         * Reset directory-related state for subsequent
+         * directories.
+         */
+        TIFFCreateDirectory(tif);
+    }
+    return (1);
 bad:
-	if (dir!=NULL)
-		_TIFFfree(dir);
-	if (dirmem!=NULL)
-		_TIFFfree(dirmem);
-	return(0);
+    if (dir != NULL)
+        _TIFFfreeExt(tif, dir);
+    if (dirmem != NULL)
+        _TIFFfreeExt(tif, dirmem);
+    return (0);
 }
 
-static int8 TIFFClampDoubleToInt8( double val )
+static int8_t TIFFClampDoubleToInt8(double val)
 {
-    if( val > 127 )
+    if (val > 127)
         return 127;
-    if( val < -128 || val != val )
+    if (val < -128 || val != val)
         return -128;
-    return (int8)val;
+    return (int8_t)val;
 }
 
-static int16 TIFFClampDoubleToInt16( double val )
+static int16_t TIFFClampDoubleToInt16(double val)
 {
-    if( val > 32767 )
+    if (val > 32767)
         return 32767;
-    if( val < -32768 || val != val )
+    if (val < -32768 || val != val)
         return -32768;
-    return (int16)val;
+    return (int16_t)val;
 }
 
-static int32 TIFFClampDoubleToInt32( double val )
+static int32_t TIFFClampDoubleToInt32(double val)
 {
-    if( val > 0x7FFFFFFF )
+    if (val > 0x7FFFFFFF)
         return 0x7FFFFFFF;
-    if( val < -0x7FFFFFFF-1 || val != val )
-        return -0x7FFFFFFF-1;
-    return (int32)val;
+    if (val < -0x7FFFFFFF - 1 || val != val)
+        return -0x7FFFFFFF - 1;
+    return (int32_t)val;
 }
 
-static uint8 TIFFClampDoubleToUInt8( double val )
+static uint8_t TIFFClampDoubleToUInt8(double val)
 {
-    if( val < 0 )
+    if (val < 0)
         return 0;
-    if( val > 255 || val != val )
+    if (val > 255 || val != val)
         return 255;
-    return (uint8)val;
+    return (uint8_t)val;
 }
 
-static uint16 TIFFClampDoubleToUInt16( double val )
+static uint16_t TIFFClampDoubleToUInt16(double val)
 {
-    if( val < 0 )
+    if (val < 0)
         return 0;
-    if( val > 65535 || val != val )
+    if (val > 65535 || val != val)
         return 65535;
-    return (uint16)val;
+    return (uint16_t)val;
 }
 
-static uint32 TIFFClampDoubleToUInt32( double val )
+static uint32_t TIFFClampDoubleToUInt32(double val)
 {
-    if( val < 0 )
+    if (val < 0)
         return 0;
-    if( val > 0xFFFFFFFFU || val != val )
+    if (val > 0xFFFFFFFFU || val != val)
         return 0xFFFFFFFFU;
-    return (uint32)val;
-}
-
-static int
-TIFFWriteDirectoryTagSampleformatArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagSampleformatArray";
-	void* conv;
-	uint32 i;
-	int ok;
-	conv = _TIFFmalloc(count*sizeof(double));
-	if (conv == NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Out of memory");
-		return (0);
-	}
-
-	switch (tif->tif_dir.td_sampleformat)
-	{
-		case SAMPLEFORMAT_IEEEFP:
-			if (tif->tif_dir.td_bitspersample<=32)
-			{
-				for (i = 0; i < count; ++i)
-					((float*)conv)[i] = _TIFFClampDoubleToFloat(value[i]);
-				ok = TIFFWriteDirectoryTagFloatArray(tif,ndir,dir,tag,count,(float*)conv);
-			}
-			else
-			{
-				ok = TIFFWriteDirectoryTagDoubleArray(tif,ndir,dir,tag,count,value);
-			}
-			break;
-		case SAMPLEFORMAT_INT:
-			if (tif->tif_dir.td_bitspersample<=8)
-			{
-				for (i = 0; i < count; ++i)
-					((int8*)conv)[i] = TIFFClampDoubleToInt8(value[i]);
-				ok = TIFFWriteDirectoryTagSbyteArray(tif,ndir,dir,tag,count,(int8*)conv);
-			}
-			else if (tif->tif_dir.td_bitspersample<=16)
-			{
-				for (i = 0; i < count; ++i)
-					((int16*)conv)[i] = TIFFClampDoubleToInt16(value[i]);
-				ok = TIFFWriteDirectoryTagSshortArray(tif,ndir,dir,tag,count,(int16*)conv);
-			}
-			else
-			{
-				for (i = 0; i < count; ++i)
-					((int32*)conv)[i] = TIFFClampDoubleToInt32(value[i]);
-				ok = TIFFWriteDirectoryTagSlongArray(tif,ndir,dir,tag,count,(int32*)conv);
-			}
-			break;
-		case SAMPLEFORMAT_UINT:
-			if (tif->tif_dir.td_bitspersample<=8)
-			{
-				for (i = 0; i < count; ++i)
-					((uint8*)conv)[i] = TIFFClampDoubleToUInt8(value[i]);
-				ok = TIFFWriteDirectoryTagByteArray(tif,ndir,dir,tag,count,(uint8*)conv);
-			}
-			else if (tif->tif_dir.td_bitspersample<=16)
-			{
-				for (i = 0; i < count; ++i)
-					((uint16*)conv)[i] = TIFFClampDoubleToUInt16(value[i]);
-				ok = TIFFWriteDirectoryTagShortArray(tif,ndir,dir,tag,count,(uint16*)conv);
-			}
-			else
-			{
-				for (i = 0; i < count; ++i)
-					((uint32*)conv)[i] = TIFFClampDoubleToUInt32(value[i]);
-				ok = TIFFWriteDirectoryTagLongArray(tif,ndir,dir,tag,count,(uint32*)conv);
-			}
-			break;
-		default:
-			ok = 0;
-	}
-
-	_TIFFfree(conv);
-	return (ok);
-}
-
-#if 0
-static int
-TIFFWriteDirectoryTagSampleformatPerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value)
-{
-	switch (tif->tif_dir.td_sampleformat)
-	{
-		case SAMPLEFORMAT_IEEEFP:
-			if (tif->tif_dir.td_bitspersample<=32)
-				return(TIFFWriteDirectoryTagFloatPerSample(tif,ndir,dir,tag,(float)value));
-			else
-				return(TIFFWriteDirectoryTagDoublePerSample(tif,ndir,dir,tag,value));
-		case SAMPLEFORMAT_INT:
-			if (tif->tif_dir.td_bitspersample<=8)
-				return(TIFFWriteDirectoryTagSbytePerSample(tif,ndir,dir,tag,(int8)value));
-			else if (tif->tif_dir.td_bitspersample<=16)
-				return(TIFFWriteDirectoryTagSshortPerSample(tif,ndir,dir,tag,(int16)value));
-			else
-				return(TIFFWriteDirectoryTagSlongPerSample(tif,ndir,dir,tag,(int32)value));
-		case SAMPLEFORMAT_UINT:
-			if (tif->tif_dir.td_bitspersample<=8)
-				return(TIFFWriteDirectoryTagBytePerSample(tif,ndir,dir,tag,(uint8)value));
-			else if (tif->tif_dir.td_bitspersample<=16)
-				return(TIFFWriteDirectoryTagShortPerSample(tif,ndir,dir,tag,(uint16)value));
-			else
-				return(TIFFWriteDirectoryTagLongPerSample(tif,ndir,dir,tag,(uint32)value));
-		default:
-			return(1);
-	}
-}
-#endif
-
-static int
-TIFFWriteDirectoryTagAscii(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, char* value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedAscii(tif,ndir,dir,tag,count,value));
-}
-
-static int
-TIFFWriteDirectoryTagUndefinedArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint8* value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedUndefinedArray(tif,ndir,dir,tag,count,value));
-}
-
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagByte(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint8 value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedByte(tif,ndir,dir,tag,value));
-}
-#endif
-
-static int
-TIFFWriteDirectoryTagByteArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint8* value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedByteArray(tif,ndir,dir,tag,count,value));
-}
-
-#if 0
-static int
-TIFFWriteDirectoryTagBytePerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint8 value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagBytePerSample";
-	uint8* m;
-	uint8* na;
-	uint16 nb;
-	int o;
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	m=_TIFFmalloc(tif->tif_dir.td_samplesperpixel*sizeof(uint8));
-	if (m==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	for (na=m, nb=0; nb<tif->tif_dir.td_samplesperpixel; na++, nb++)
-		*na=value;
-	o=TIFFWriteDirectoryTagCheckedByteArray(tif,ndir,dir,tag,tif->tif_dir.td_samplesperpixel,m);
-	_TIFFfree(m);
-	return(o);
-}
-#endif
-
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagSbyte(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int8 value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedSbyte(tif,ndir,dir,tag,value));
-}
-#endif
-
-static int
-TIFFWriteDirectoryTagSbyteArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int8* value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedSbyteArray(tif,ndir,dir,tag,count,value));
+    return (uint32_t)val;
 }
 
-#if 0
-static int
-TIFFWriteDirectoryTagSbytePerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int8 value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagSbytePerSample";
-	int8* m;
-	int8* na;
-	uint16 nb;
-	int o;
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	m=_TIFFmalloc(tif->tif_dir.td_samplesperpixel*sizeof(int8));
-	if (m==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	for (na=m, nb=0; nb<tif->tif_dir.td_samplesperpixel; na++, nb++)
-		*na=value;
-	o=TIFFWriteDirectoryTagCheckedSbyteArray(tif,ndir,dir,tag,tif->tif_dir.td_samplesperpixel,m);
-	_TIFFfree(m);
-	return(o);
-}
-#endif
-
-static int
-TIFFWriteDirectoryTagShort(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint16 value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedShort(tif,ndir,dir,tag,value));
-}
-
-static int
-TIFFWriteDirectoryTagShortArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint16* value)
+static int TIFFWriteDirectoryTagSampleformatArray(TIFF *tif, uint32_t *ndir,
+                                                  TIFFDirEntry *dir,
+                                                  uint16_t tag, uint32_t count,
+                                                  double *value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedShortArray(tif,ndir,dir,tag,count,value));
-}
-
-static int
-TIFFWriteDirectoryTagShortPerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint16 value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagShortPerSample";
-	uint16* m;
-	uint16* na;
-	uint16 nb;
-	int o;
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	m=_TIFFmalloc(tif->tif_dir.td_samplesperpixel*sizeof(uint16));
-	if (m==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	for (na=m, nb=0; nb<tif->tif_dir.td_samplesperpixel; na++, nb++)
-		*na=value;
-	o=TIFFWriteDirectoryTagCheckedShortArray(tif,ndir,dir,tag,tif->tif_dir.td_samplesperpixel,m);
-	_TIFFfree(m);
-	return(o);
-}
-
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagSshort(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int16 value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedSshort(tif,ndir,dir,tag,value));
-}
-#endif
+    static const char module[] = "TIFFWriteDirectoryTagSampleformatArray";
+    void *conv;
+    uint32_t i;
+    int ok;
+    conv = _TIFFmallocExt(tif, count * sizeof(double));
+    if (conv == NULL)
+    {
+        TIFFErrorExtR(tif, module, "Out of memory");
+        return (0);
+    }
 
-static int
-TIFFWriteDirectoryTagSshortArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int16* value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedSshortArray(tif,ndir,dir,tag,count,value));
-}
+    switch (tif->tif_dir.td_sampleformat)
+    {
+        case SAMPLEFORMAT_IEEEFP:
+            if (tif->tif_dir.td_bitspersample <= 32)
+            {
+                for (i = 0; i < count; ++i)
+                    ((float *)conv)[i] = _TIFFClampDoubleToFloat(value[i]);
+                ok = TIFFWriteDirectoryTagFloatArray(tif, ndir, dir, tag, count,
+                                                     (float *)conv);
+            }
+            else
+            {
+                ok = TIFFWriteDirectoryTagDoubleArray(tif, ndir, dir, tag,
+                                                      count, value);
+            }
+            break;
+        case SAMPLEFORMAT_INT:
+            if (tif->tif_dir.td_bitspersample <= 8)
+            {
+                for (i = 0; i < count; ++i)
+                    ((int8_t *)conv)[i] = TIFFClampDoubleToInt8(value[i]);
+                ok = TIFFWriteDirectoryTagSbyteArray(tif, ndir, dir, tag, count,
+                                                     (int8_t *)conv);
+            }
+            else if (tif->tif_dir.td_bitspersample <= 16)
+            {
+                for (i = 0; i < count; ++i)
+                    ((int16_t *)conv)[i] = TIFFClampDoubleToInt16(value[i]);
+                ok = TIFFWriteDirectoryTagSshortArray(tif, ndir, dir, tag,
+                                                      count, (int16_t *)conv);
+            }
+            else
+            {
+                for (i = 0; i < count; ++i)
+                    ((int32_t *)conv)[i] = TIFFClampDoubleToInt32(value[i]);
+                ok = TIFFWriteDirectoryTagSlongArray(tif, ndir, dir, tag, count,
+                                                     (int32_t *)conv);
+            }
+            break;
+        case SAMPLEFORMAT_UINT:
+            if (tif->tif_dir.td_bitspersample <= 8)
+            {
+                for (i = 0; i < count; ++i)
+                    ((uint8_t *)conv)[i] = TIFFClampDoubleToUInt8(value[i]);
+                ok = TIFFWriteDirectoryTagByteArray(tif, ndir, dir, tag, count,
+                                                    (uint8_t *)conv);
+            }
+            else if (tif->tif_dir.td_bitspersample <= 16)
+            {
+                for (i = 0; i < count; ++i)
+                    ((uint16_t *)conv)[i] = TIFFClampDoubleToUInt16(value[i]);
+                ok = TIFFWriteDirectoryTagShortArray(tif, ndir, dir, tag, count,
+                                                     (uint16_t *)conv);
+            }
+            else
+            {
+                for (i = 0; i < count; ++i)
+                    ((uint32_t *)conv)[i] = TIFFClampDoubleToUInt32(value[i]);
+                ok = TIFFWriteDirectoryTagLongArray(tif, ndir, dir, tag, count,
+                                                    (uint32_t *)conv);
+            }
+            break;
+        default:
+            ok = 0;
+    }
 
-#if 0
-static int
-TIFFWriteDirectoryTagSshortPerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int16 value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagSshortPerSample";
-	int16* m;
-	int16* na;
-	uint16 nb;
-	int o;
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	m=_TIFFmalloc(tif->tif_dir.td_samplesperpixel*sizeof(int16));
-	if (m==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	for (na=m, nb=0; nb<tif->tif_dir.td_samplesperpixel; na++, nb++)
-		*na=value;
-	o=TIFFWriteDirectoryTagCheckedSshortArray(tif,ndir,dir,tag,tif->tif_dir.td_samplesperpixel,m);
-	_TIFFfree(m);
-	return(o);
+    _TIFFfreeExt(tif, conv);
+    return (ok);
 }
-#endif
 
-static int
-TIFFWriteDirectoryTagLong(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 value)
+static int TIFFWriteDirectoryTagAscii(TIFF *tif, uint32_t *ndir,
+                                      TIFFDirEntry *dir, uint16_t tag,
+                                      uint32_t count, char *value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedLong(tif,ndir,dir,tag,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (
+        TIFFWriteDirectoryTagCheckedAscii(tif, ndir, dir, tag, count, value));
 }
 
-static int
-TIFFWriteDirectoryTagLongArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint32* value)
+static int TIFFWriteDirectoryTagUndefinedArray(TIFF *tif, uint32_t *ndir,
+                                               TIFFDirEntry *dir, uint16_t tag,
+                                               uint32_t count, uint8_t *value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedLongArray(tif,ndir,dir,tag,count,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedUndefinedArray(tif, ndir, dir, tag,
+                                                       count, value));
 }
 
-#if 0
-static int
-TIFFWriteDirectoryTagLongPerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagLongPerSample";
-	uint32* m;
-	uint32* na;
-	uint16 nb;
-	int o;
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	m=_TIFFmalloc(tif->tif_dir.td_samplesperpixel*sizeof(uint32));
-	if (m==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	for (na=m, nb=0; nb<tif->tif_dir.td_samplesperpixel; na++, nb++)
-		*na=value;
-	o=TIFFWriteDirectoryTagCheckedLongArray(tif,ndir,dir,tag,tif->tif_dir.td_samplesperpixel,m);
-	_TIFFfree(m);
-	return(o);
+static int TIFFWriteDirectoryTagByteArray(TIFF *tif, uint32_t *ndir,
+                                          TIFFDirEntry *dir, uint16_t tag,
+                                          uint32_t count, uint8_t *value)
+{
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedByteArray(tif, ndir, dir, tag, count,
+                                                  value));
 }
-#endif
 
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagSlong(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int32 value)
+static int TIFFWriteDirectoryTagSbyteArray(TIFF *tif, uint32_t *ndir,
+                                           TIFFDirEntry *dir, uint16_t tag,
+                                           uint32_t count, int8_t *value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedSlong(tif,ndir,dir,tag,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedSbyteArray(tif, ndir, dir, tag, count,
+                                                   value));
 }
-#endif
 
-static int
-TIFFWriteDirectoryTagSlongArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int32* value)
+static int TIFFWriteDirectoryTagShort(TIFF *tif, uint32_t *ndir,
+                                      TIFFDirEntry *dir, uint16_t tag,
+                                      uint16_t value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedSlongArray(tif,ndir,dir,tag,count,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedShort(tif, ndir, dir, tag, value));
 }
 
-#if 0
-static int
-TIFFWriteDirectoryTagSlongPerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int32 value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagSlongPerSample";
-	int32* m;
-	int32* na;
-	uint16 nb;
-	int o;
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	m=_TIFFmalloc(tif->tif_dir.td_samplesperpixel*sizeof(int32));
-	if (m==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	for (na=m, nb=0; nb<tif->tif_dir.td_samplesperpixel; na++, nb++)
-		*na=value;
-	o=TIFFWriteDirectoryTagCheckedSlongArray(tif,ndir,dir,tag,tif->tif_dir.td_samplesperpixel,m);
-	_TIFFfree(m);
-	return(o);
+static int TIFFWriteDirectoryTagShortArray(TIFF *tif, uint32_t *ndir,
+                                           TIFFDirEntry *dir, uint16_t tag,
+                                           uint32_t count, uint16_t *value)
+{
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedShortArray(tif, ndir, dir, tag, count,
+                                                   value));
 }
-#endif
 
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagLong8(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint64 value)
+static int TIFFWriteDirectoryTagShortPerSample(TIFF *tif, uint32_t *ndir,
+                                               TIFFDirEntry *dir, uint16_t tag,
+                                               uint16_t value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedLong8(tif,ndir,dir,tag,value));
+    static const char module[] = "TIFFWriteDirectoryTagShortPerSample";
+    uint16_t *m;
+    uint16_t *na;
+    uint16_t nb;
+    int o;
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    m = _TIFFmallocExt(tif, tif->tif_dir.td_samplesperpixel * sizeof(uint16_t));
+    if (m == NULL)
+    {
+        TIFFErrorExtR(tif, module, "Out of memory");
+        return (0);
+    }
+    for (na = m, nb = 0; nb < tif->tif_dir.td_samplesperpixel; na++, nb++)
+        *na = value;
+    o = TIFFWriteDirectoryTagCheckedShortArray(
+        tif, ndir, dir, tag, tif->tif_dir.td_samplesperpixel, m);
+    _TIFFfreeExt(tif, m);
+    return (o);
 }
-#endif
 
-static int
-TIFFWriteDirectoryTagLong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value)
+static int TIFFWriteDirectoryTagSshortArray(TIFF *tif, uint32_t *ndir,
+                                            TIFFDirEntry *dir, uint16_t tag,
+                                            uint32_t count, int16_t *value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedLong8Array(tif,ndir,dir,tag,count,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedSshortArray(tif, ndir, dir, tag, count,
+                                                    value));
 }
 
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagSlong8(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int64 value)
+static int TIFFWriteDirectoryTagLong(TIFF *tif, uint32_t *ndir,
+                                     TIFFDirEntry *dir, uint16_t tag,
+                                     uint32_t value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedSlong8(tif,ndir,dir,tag,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedLong(tif, ndir, dir, tag, value));
 }
-#endif
 
-static int
-TIFFWriteDirectoryTagSlong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int64* value)
+static int TIFFWriteDirectoryTagLongArray(TIFF *tif, uint32_t *ndir,
+                                          TIFFDirEntry *dir, uint16_t tag,
+                                          uint32_t count, uint32_t *value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedSlong8Array(tif,ndir,dir,tag,count,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedLongArray(tif, ndir, dir, tag, count,
+                                                  value));
 }
 
-static int
-TIFFWriteDirectoryTagRational(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value)
+static int TIFFWriteDirectoryTagSlongArray(TIFF *tif, uint32_t *ndir,
+                                           TIFFDirEntry *dir, uint16_t tag,
+                                           uint32_t count, int32_t *value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedRational(tif,ndir,dir,tag,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedSlongArray(tif, ndir, dir, tag, count,
+                                                   value));
 }
 
-static int
-TIFFWriteDirectoryTagRationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedRationalArray(tif,ndir,dir,tag,count,value));
+/************************************************************************/
+/*                 TIFFWriteDirectoryTagLong8Array()                    */
+/*                                                                      */
+/*      Write either Long8 or Long array depending on file type.        */
+/************************************************************************/
+static int TIFFWriteDirectoryTagLong8Array(TIFF *tif, uint32_t *ndir,
+                                           TIFFDirEntry *dir, uint16_t tag,
+                                           uint32_t count, uint64_t *value)
+{
+    static const char module[] = "TIFFWriteDirectoryTagLong8Array";
+    uint64_t *ma;
+    uint32_t mb;
+    uint32_t *p;
+    uint32_t *q;
+    int o;
+
+    /* is this just a counting pass? */
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+
+    /* We always write Long8 for BigTIFF, no checking needed. */
+    if (tif->tif_flags & TIFF_BIGTIFF)
+        return (TIFFWriteDirectoryTagCheckedLong8Array(tif, ndir, dir, tag,
+                                                       count, value));
+
+    /*
+    ** For classic tiff we want to verify everything is in range for long
+    ** and convert to long format.
+    */
+    p = _TIFFmallocExt(tif, count * sizeof(uint32_t));
+    if (p == NULL)
+    {
+        TIFFErrorExtR(tif, module, "Out of memory");
+        return (0);
+    }
+
+    for (q = p, ma = value, mb = 0; mb < count; ma++, mb++, q++)
+    {
+        if (*ma > 0xFFFFFFFF)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Attempt to write unsigned long value %" PRIu64
+                          " larger than 0xFFFFFFFF for tag %d in Classic TIFF "
+                          "file. TIFF file writing aborted",
+                          *ma, tag);
+            _TIFFfreeExt(tif, p);
+            return (0);
+        }
+        *q = (uint32_t)(*ma);
+    }
+
+    o = TIFFWriteDirectoryTagCheckedLongArray(tif, ndir, dir, tag, count, p);
+    _TIFFfreeExt(tif, p);
+
+    return (o);
 }
 
-static int
-TIFFWriteDirectoryTagSrationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedSrationalArray(tif,ndir,dir,tag,count,value));
+/************************************************************************/
+/*                 TIFFWriteDirectoryTagSlong8Array()                   */
+/*                                                                      */
+/*      Write either SLong8 or SLong array depending on file type.      */
+/************************************************************************/
+static int TIFFWriteDirectoryTagSlong8Array(TIFF *tif, uint32_t *ndir,
+                                            TIFFDirEntry *dir, uint16_t tag,
+                                            uint32_t count, int64_t *value)
+{
+    static const char module[] = "TIFFWriteDirectoryTagSlong8Array";
+    int64_t *ma;
+    uint32_t mb;
+    int32_t *p;
+    int32_t *q;
+    int o;
+
+    /* is this just a counting pass? */
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    /* We always write SLong8 for BigTIFF, no checking needed. */
+    if (tif->tif_flags & TIFF_BIGTIFF)
+        return (TIFFWriteDirectoryTagCheckedSlong8Array(tif, ndir, dir, tag,
+                                                        count, value));
+
+    /*
+    ** For classic tiff we want to verify everything is in range for signed-long
+    ** and convert to signed-long format.
+    */
+    p = _TIFFmallocExt(tif, count * sizeof(uint32_t));
+    if (p == NULL)
+    {
+        TIFFErrorExtR(tif, module, "Out of memory");
+        return (0);
+    }
+
+    for (q = p, ma = value, mb = 0; mb < count; ma++, mb++, q++)
+    {
+        if (*ma > (2147483647))
+        {
+            TIFFErrorExtR(tif, module,
+                          "Attempt to write signed long value %" PRIi64
+                          " larger than 0x7FFFFFFF (2147483647) for tag %d in "
+                          "Classic TIFF file. TIFF writing to file aborted",
+                          *ma, tag);
+            _TIFFfreeExt(tif, p);
+            return (0);
+        }
+        else if (*ma < (-2147483647 - 1))
+        {
+            TIFFErrorExtR(tif, module,
+                          "Attempt to write signed long value %" PRIi64
+                          " smaller than 0x80000000 (-2147483648) for tag %d "
+                          "in Classic TIFF file. TIFF writing to file aborted",
+                          *ma, tag);
+            _TIFFfreeExt(tif, p);
+            return (0);
+        }
+        *q = (int32_t)(*ma);
+    }
+
+    o = TIFFWriteDirectoryTagCheckedSlongArray(tif, ndir, dir, tag, count, p);
+    _TIFFfreeExt(tif, p);
+
+    return (o);
 }
 
-/*-- Rational2Double: additional write functions */
-static int
-TIFFWriteDirectoryTagRationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value)
+static int TIFFWriteDirectoryTagRational(TIFF *tif, uint32_t *ndir,
+                                         TIFFDirEntry *dir, uint16_t tag,
+                                         double value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedRationalDoubleArray(tif,ndir,dir,tag,count,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedRational(tif, ndir, dir, tag, value));
 }
 
-static int
-TIFFWriteDirectoryTagSrationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value)
+static int TIFFWriteDirectoryTagRationalArray(TIFF *tif, uint32_t *ndir,
+                                              TIFFDirEntry *dir, uint16_t tag,
+                                              uint32_t count, float *value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedSrationalDoubleArray(tif,ndir,dir,tag,count,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedRationalArray(tif, ndir, dir, tag,
+                                                      count, value));
 }
 
-#ifdef notdef
-static int TIFFWriteDirectoryTagFloat(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, float value)
+static int TIFFWriteDirectoryTagSrationalArray(TIFF *tif, uint32_t *ndir,
+                                               TIFFDirEntry *dir, uint16_t tag,
+                                               uint32_t count, float *value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedFloat(tif,ndir,dir,tag,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedSrationalArray(tif, ndir, dir, tag,
+                                                       count, value));
 }
-#endif
 
-static int TIFFWriteDirectoryTagFloatArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedFloatArray(tif,ndir,dir,tag,count,value));
-}
-
-#if 0
-static int TIFFWriteDirectoryTagFloatPerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, float value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagFloatPerSample";
-	float* m;
-	float* na;
-	uint16 nb;
-	int o;
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	m=_TIFFmalloc(tif->tif_dir.td_samplesperpixel*sizeof(float));
-	if (m==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	for (na=m, nb=0; nb<tif->tif_dir.td_samplesperpixel; na++, nb++)
-		*na=value;
-	o=TIFFWriteDirectoryTagCheckedFloatArray(tif,ndir,dir,tag,tif->tif_dir.td_samplesperpixel,m);
-	_TIFFfree(m);
-	return(o);
+/*-- Rational2Double: additional write functions */
+static int TIFFWriteDirectoryTagRationalDoubleArray(TIFF *tif, uint32_t *ndir,
+                                                    TIFFDirEntry *dir,
+                                                    uint16_t tag,
+                                                    uint32_t count,
+                                                    double *value)
+{
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedRationalDoubleArray(tif, ndir, dir, tag,
+                                                            count, value));
 }
-#endif
 
-#ifdef notdef
-static int TIFFWriteDirectoryTagDouble(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value)
+static int TIFFWriteDirectoryTagSrationalDoubleArray(TIFF *tif, uint32_t *ndir,
+                                                     TIFFDirEntry *dir,
+                                                     uint16_t tag,
+                                                     uint32_t count,
+                                                     double *value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedDouble(tif,ndir,dir,tag,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedSrationalDoubleArray(
+        tif, ndir, dir, tag, count, value));
 }
-#endif
 
-static int TIFFWriteDirectoryTagDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value)
-{
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedDoubleArray(tif,ndir,dir,tag,count,value));
-}
-
-#if 0
-static int TIFFWriteDirectoryTagDoublePerSample(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagDoublePerSample";
-	double* m;
-	double* na;
-	uint16 nb;
-	int o;
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	m=_TIFFmalloc(tif->tif_dir.td_samplesperpixel*sizeof(double));
-	if (m==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	for (na=m, nb=0; nb<tif->tif_dir.td_samplesperpixel; na++, nb++)
-		*na=value;
-	o=TIFFWriteDirectoryTagCheckedDoubleArray(tif,ndir,dir,tag,tif->tif_dir.td_samplesperpixel,m);
-	_TIFFfree(m);
-	return(o);
+static int TIFFWriteDirectoryTagFloatArray(TIFF *tif, uint32_t *ndir,
+                                           TIFFDirEntry *dir, uint16_t tag,
+                                           uint32_t count, float *value)
+{
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedFloatArray(tif, ndir, dir, tag, count,
+                                                   value));
 }
-#endif
 
-static int
-TIFFWriteDirectoryTagIfdArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint32* value)
+static int TIFFWriteDirectoryTagDoubleArray(TIFF *tif, uint32_t *ndir,
+                                            TIFFDirEntry *dir, uint16_t tag,
+                                            uint32_t count, double *value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedIfdArray(tif,ndir,dir,tag,count,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedDoubleArray(tif, ndir, dir, tag, count,
+                                                    value));
 }
 
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagIfd8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value)
+static int TIFFWriteDirectoryTagIfdArray(TIFF *tif, uint32_t *ndir,
+                                         TIFFDirEntry *dir, uint16_t tag,
+                                         uint32_t count, uint32_t *value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	return(TIFFWriteDirectoryTagCheckedIfd8Array(tif,ndir,dir,tag,count,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    return (TIFFWriteDirectoryTagCheckedIfdArray(tif, ndir, dir, tag, count,
+                                                 value));
 }
-#endif
 
-static int
-TIFFWriteDirectoryTagShortLong(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 value)
+static int TIFFWriteDirectoryTagShortLong(TIFF *tif, uint32_t *ndir,
+                                          TIFFDirEntry *dir, uint16_t tag,
+                                          uint32_t value)
 {
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	if (value<=0xFFFF)
-		return(TIFFWriteDirectoryTagCheckedShort(tif,ndir,dir,tag,(uint16)value));
-	else
-		return(TIFFWriteDirectoryTagCheckedLong(tif,ndir,dir,tag,value));
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    if (value <= 0xFFFF)
+        return (TIFFWriteDirectoryTagCheckedShort(tif, ndir, dir, tag,
+                                                  (uint16_t)value));
+    else
+        return (TIFFWriteDirectoryTagCheckedLong(tif, ndir, dir, tag, value));
 }
 
-static int _WriteAsType(TIFF* tif, uint64 strile_size, uint64 uncompressed_threshold)
+static int _WriteAsType(TIFF *tif, uint64_t strile_size,
+                        uint64_t uncompressed_threshold)
 {
-    const uint16 compression = tif->tif_dir.td_compression;
-    if ( compression == COMPRESSION_NONE )
+    const uint16_t compression = tif->tif_dir.td_compression;
+    if (compression == COMPRESSION_NONE)
     {
         return strile_size > uncompressed_threshold;
     }
-    else if ( compression == COMPRESSION_JPEG ||
-              compression == COMPRESSION_LZW ||
-              compression == COMPRESSION_ADOBE_DEFLATE ||
-              compression == COMPRESSION_LZMA ||
-              compression == COMPRESSION_LERC ||
-              compression == COMPRESSION_ZSTD ||
-              compression == COMPRESSION_WEBP )
+    else if (compression == COMPRESSION_JPEG ||
+             compression == COMPRESSION_LZW ||
+             compression == COMPRESSION_ADOBE_DEFLATE ||
+             compression == COMPRESSION_DEFLATE ||
+             compression == COMPRESSION_LZMA ||
+             compression == COMPRESSION_LERC ||
+             compression == COMPRESSION_ZSTD ||
+             compression == COMPRESSION_WEBP || compression == COMPRESSION_JXL)
     {
         /* For a few select compression types, we assume that in the worst */
         /* case the compressed size will be 10 times the uncompressed size */
@@ -1787,12 +1831,12 @@ static int _WriteAsType(TIFF* tif, uint64 strile_size, uint64 uncompressed_thres
     return 1;
 }
 
-static int WriteAsLong8(TIFF* tif, uint64 strile_size)
+static int WriteAsLong8(TIFF *tif, uint64_t strile_size)
 {
     return _WriteAsType(tif, strile_size, 0xFFFFFFFFU);
 }
 
-static int WriteAsLong4(TIFF* tif, uint64 strile_size)
+static int WriteAsLong4(TIFF *tif, uint64_t strile_size)
 {
     return _WriteAsType(tif, strile_size, 0xFFFFU);
 }
@@ -1804,121 +1848,128 @@ static int WriteAsLong4(TIFF* tif, uint64 strile_size)
 /*      on strile size and Classic/BigTIFF mode.                        */
 /************************************************************************/
 
-static int
-TIFFWriteDirectoryTagLongLong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value)
+static int TIFFWriteDirectoryTagLongLong8Array(TIFF *tif, uint32_t *ndir,
+                                               TIFFDirEntry *dir, uint16_t tag,
+                                               uint32_t count, uint64_t *value)
 {
     static const char module[] = "TIFFWriteDirectoryTagLongLong8Array";
     int o;
     int write_aslong4;
 
     /* is this just a counting pass? */
-    if (dir==NULL)
+    if (dir == NULL)
     {
         (*ndir)++;
-        return(1);
+        return (1);
     }
 
-    if( tif->tif_dir.td_deferstrilearraywriting )
+    if (tif->tif_dir.td_deferstrilearraywriting)
     {
-        return TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_NOTYPE, 0, 0, NULL);
+        return TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_NOTYPE, 0, 0,
+                                         NULL);
     }
 
-    if( tif->tif_flags&TIFF_BIGTIFF )
+    if (tif->tif_flags & TIFF_BIGTIFF)
     {
         int write_aslong8 = 1;
         /* In the case of ByteCounts array, we may be able to write them on */
         /* LONG if the strip/tilesize is not too big. */
-        /* Also do that for count > 1 in the case someone would want to create */
+        /* Also do that for count > 1 in the case someone would want to create
+         */
         /* a single-strip file with a growing height, in which case using */
         /* LONG8 will be safer. */
-        if( count > 1 && tag == TIFFTAG_STRIPBYTECOUNTS )
+        if (count > 1 && tag == TIFFTAG_STRIPBYTECOUNTS)
         {
             write_aslong8 = WriteAsLong8(tif, TIFFStripSize64(tif));
         }
-        else if( count > 1 && tag == TIFFTAG_TILEBYTECOUNTS )
+        else if (count > 1 && tag == TIFFTAG_TILEBYTECOUNTS)
         {
             write_aslong8 = WriteAsLong8(tif, TIFFTileSize64(tif));
         }
-        if( write_aslong8 )
+        if (write_aslong8)
         {
-            return TIFFWriteDirectoryTagCheckedLong8Array(tif,ndir,dir,
-                                                        tag,count,value);
+            return TIFFWriteDirectoryTagCheckedLong8Array(tif, ndir, dir, tag,
+                                                          count, value);
         }
     }
 
     write_aslong4 = 1;
-    if( count > 1 && tag == TIFFTAG_STRIPBYTECOUNTS )
+    if (count > 1 && tag == TIFFTAG_STRIPBYTECOUNTS)
     {
         write_aslong4 = WriteAsLong4(tif, TIFFStripSize64(tif));
     }
-    else if( count > 1 && tag == TIFFTAG_TILEBYTECOUNTS )
+    else if (count > 1 && tag == TIFFTAG_TILEBYTECOUNTS)
     {
         write_aslong4 = WriteAsLong4(tif, TIFFTileSize64(tif));
     }
-    if( write_aslong4 )
+    if (write_aslong4)
     {
         /*
         ** For classic tiff we want to verify everything is in range for LONG
         ** and convert to long format.
         */
 
-        uint32* p = _TIFFmalloc(count*sizeof(uint32));
-        uint32* q;
-        uint64* ma;
-        uint32 mb;
+        uint32_t *p = _TIFFmallocExt(tif, count * sizeof(uint32_t));
+        uint32_t *q;
+        uint64_t *ma;
+        uint32_t mb;
 
-        if (p==NULL)
+        if (p == NULL)
         {
-            TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-            return(0);
+            TIFFErrorExtR(tif, module, "Out of memory");
+            return (0);
         }
 
-        for (q=p, ma=value, mb=0; mb<count; ma++, mb++, q++)
+        for (q = p, ma = value, mb = 0; mb < count; ma++, mb++, q++)
         {
-            if (*ma>0xFFFFFFFF)
+            if (*ma > 0xFFFFFFFF)
             {
-                TIFFErrorExt(tif->tif_clientdata,module,
-                            "Attempt to write value larger than 0xFFFFFFFF in LONG array.");
-                _TIFFfree(p);
-                return(0);
+                TIFFErrorExtR(tif, module,
+                              "Attempt to write value larger than 0xFFFFFFFF "
+                              "in LONG array.");
+                _TIFFfreeExt(tif, p);
+                return (0);
             }
-            *q= (uint32)(*ma);
+            *q = (uint32_t)(*ma);
         }
 
-        o=TIFFWriteDirectoryTagCheckedLongArray(tif,ndir,dir,tag,count,p);
-        _TIFFfree(p);
+        o = TIFFWriteDirectoryTagCheckedLongArray(tif, ndir, dir, tag, count,
+                                                  p);
+        _TIFFfreeExt(tif, p);
     }
     else
     {
-        uint16* p = _TIFFmalloc(count*sizeof(uint16));
-        uint16* q;
-        uint64* ma;
-        uint32 mb;
+        uint16_t *p = _TIFFmallocExt(tif, count * sizeof(uint16_t));
+        uint16_t *q;
+        uint64_t *ma;
+        uint32_t mb;
 
-        if (p==NULL)
+        if (p == NULL)
         {
-            TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-            return(0);
+            TIFFErrorExtR(tif, module, "Out of memory");
+            return (0);
         }
 
-        for (q=p, ma=value, mb=0; mb<count; ma++, mb++, q++)
+        for (q = p, ma = value, mb = 0; mb < count; ma++, mb++, q++)
         {
-            if (*ma>0xFFFF)
+            if (*ma > 0xFFFF)
             {
                 /* Should not happen normally given the check we did before */
-                TIFFErrorExt(tif->tif_clientdata,module,
-                            "Attempt to write value larger than 0xFFFF in SHORT array.");
-                _TIFFfree(p);
-                return(0);
+                TIFFErrorExtR(tif, module,
+                              "Attempt to write value larger than 0xFFFF in "
+                              "SHORT array.");
+                _TIFFfreeExt(tif, p);
+                return (0);
             }
-            *q= (uint16)(*ma);
+            *q = (uint16_t)(*ma);
         }
 
-        o=TIFFWriteDirectoryTagCheckedShortArray(tif,ndir,dir,tag,count,p);
-        _TIFFfree(p);
+        o = TIFFWriteDirectoryTagCheckedShortArray(tif, ndir, dir, tag, count,
+                                                   p);
+        _TIFFfreeExt(tif, p);
     }
 
-    return(o);
+    return (o);
 }
 
 /************************************************************************/
@@ -1927,781 +1978,555 @@ TIFFWriteDirectoryTagLongLong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir,
 /*      Write either IFD8 or IFD array depending on file type.          */
 /************************************************************************/
 
-static int
-TIFFWriteDirectoryTagIfdIfd8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value)
+static int TIFFWriteDirectoryTagIfdIfd8Array(TIFF *tif, uint32_t *ndir,
+                                             TIFFDirEntry *dir, uint16_t tag,
+                                             uint32_t count, uint64_t *value)
 {
     static const char module[] = "TIFFWriteDirectoryTagIfdIfd8Array";
-    uint64* ma;
-    uint32 mb;
-    uint32* p;
-    uint32* q;
+    uint64_t *ma;
+    uint32_t mb;
+    uint32_t *p;
+    uint32_t *q;
     int o;
 
     /* is this just a counting pass? */
-    if (dir==NULL)
+    if (dir == NULL)
     {
         (*ndir)++;
-        return(1);
+        return (1);
     }
 
     /* We always write IFD8 for BigTIFF, no checking needed. */
-    if( tif->tif_flags&TIFF_BIGTIFF )
-        return TIFFWriteDirectoryTagCheckedIfd8Array(tif,ndir,dir,
-                                                     tag,count,value);
+    if (tif->tif_flags & TIFF_BIGTIFF)
+        return TIFFWriteDirectoryTagCheckedIfd8Array(tif, ndir, dir, tag, count,
+                                                     value);
 
     /*
     ** For classic tiff we want to verify everything is in range for IFD
     ** and convert to long format.
     */
 
-    p = _TIFFmalloc(count*sizeof(uint32));
-    if (p==NULL)
+    p = _TIFFmallocExt(tif, count * sizeof(uint32_t));
+    if (p == NULL)
     {
-        TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-        return(0);
+        TIFFErrorExtR(tif, module, "Out of memory");
+        return (0);
     }
 
-    for (q=p, ma=value, mb=0; mb<count; ma++, mb++, q++)
+    for (q = p, ma = value, mb = 0; mb < count; ma++, mb++, q++)
     {
-        if (*ma>0xFFFFFFFF)
+        if (*ma > 0xFFFFFFFF)
         {
-            TIFFErrorExt(tif->tif_clientdata,module,
-                         "Attempt to write value larger than 0xFFFFFFFF in Classic TIFF file.");
-            _TIFFfree(p);
-            return(0);
+            TIFFErrorExtR(tif, module,
+                          "Attempt to write value larger than 0xFFFFFFFF in "
+                          "Classic TIFF file.");
+            _TIFFfreeExt(tif, p);
+            return (0);
         }
-        *q= (uint32)(*ma);
+        *q = (uint32_t)(*ma);
     }
 
-    o=TIFFWriteDirectoryTagCheckedIfdArray(tif,ndir,dir,tag,count,p);
-    _TIFFfree(p);
-
-    return(o);
-}
-
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagShortLongLong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagShortLongLong8Array";
-	uint64* ma;
-	uint32 mb;
-	uint8 n;
-	int o;
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	n=0;
-	for (ma=value, mb=0; mb<count; ma++, mb++)
-	{
-		if ((n==0)&&(*ma>0xFFFF))
-			n=1;
-		if ((n==1)&&(*ma>0xFFFFFFFF))
-		{
-			n=2;
-			break;
-		}
-	}
-	if (n==0)
-	{
-		uint16* p;
-		uint16* q;
-		p=_TIFFmalloc(count*sizeof(uint16));
-		if (p==NULL)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-			return(0);
-		}
-		for (ma=value, mb=0, q=p; mb<count; ma++, mb++, q++)
-			*q=(uint16)(*ma);
-		o=TIFFWriteDirectoryTagCheckedShortArray(tif,ndir,dir,tag,count,p);
-		_TIFFfree(p);
-	}
-	else if (n==1)
-	{
-		uint32* p;
-		uint32* q;
-		p=_TIFFmalloc(count*sizeof(uint32));
-		if (p==NULL)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-			return(0);
-		}
-		for (ma=value, mb=0, q=p; mb<count; ma++, mb++, q++)
-			*q=(uint32)(*ma);
-		o=TIFFWriteDirectoryTagCheckedLongArray(tif,ndir,dir,tag,count,p);
-		_TIFFfree(p);
-	}
-	else
-	{
-		assert(n==2);
-		o=TIFFWriteDirectoryTagCheckedLong8Array(tif,ndir,dir,tag,count,value);
-	}
-	return(o);
-}
-#endif
-static int
-TIFFWriteDirectoryTagColormap(TIFF* tif, uint32* ndir, TIFFDirEntry* dir)
-{
-	static const char module[] = "TIFFWriteDirectoryTagColormap";
-	uint32 m;
-	uint16* n;
-	int o;
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	m=(1<<tif->tif_dir.td_bitspersample);
-	n=_TIFFmalloc(3*m*sizeof(uint16));
-	if (n==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	_TIFFmemcpy(&n[0],tif->tif_dir.td_colormap[0],m*sizeof(uint16));
-	_TIFFmemcpy(&n[m],tif->tif_dir.td_colormap[1],m*sizeof(uint16));
-	_TIFFmemcpy(&n[2*m],tif->tif_dir.td_colormap[2],m*sizeof(uint16));
-	o=TIFFWriteDirectoryTagCheckedShortArray(tif,ndir,dir,TIFFTAG_COLORMAP,3*m,n);
-	_TIFFfree(n);
-	return(o);
-}
-
-static int
-TIFFWriteDirectoryTagTransferfunction(TIFF* tif, uint32* ndir, TIFFDirEntry* dir)
-{
-	static const char module[] = "TIFFWriteDirectoryTagTransferfunction";
-	uint32 m;
-	uint16 n;
-	uint16* o;
-	int p;
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	m=(1<<tif->tif_dir.td_bitspersample);
-	n=tif->tif_dir.td_samplesperpixel-tif->tif_dir.td_extrasamples;
-	/*
-	 * Check if the table can be written as a single column,
-	 * or if it must be written as 3 columns.  Note that we
-	 * write a 3-column tag if there are 2 samples/pixel and
-	 * a single column of data won't suffice--hmm.
-	 */
-	if (n>3)
-		n=3;
-	if (n==3)
-	{
-		if (tif->tif_dir.td_transferfunction[2] == NULL ||
-		    !_TIFFmemcmp(tif->tif_dir.td_transferfunction[0],tif->tif_dir.td_transferfunction[2],m*sizeof(uint16)))
-			n=2;
-	}
-	if (n==2)
-	{
-		if (tif->tif_dir.td_transferfunction[1] == NULL ||
-		    !_TIFFmemcmp(tif->tif_dir.td_transferfunction[0],tif->tif_dir.td_transferfunction[1],m*sizeof(uint16)))
-			n=1;
-	}
-	if (n==0)
-		n=1;
-	o=_TIFFmalloc(n*m*sizeof(uint16));
-	if (o==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	_TIFFmemcpy(&o[0],tif->tif_dir.td_transferfunction[0],m*sizeof(uint16));
-	if (n>1)
-		_TIFFmemcpy(&o[m],tif->tif_dir.td_transferfunction[1],m*sizeof(uint16));
-	if (n>2)
-		_TIFFmemcpy(&o[2*m],tif->tif_dir.td_transferfunction[2],m*sizeof(uint16));
-	p=TIFFWriteDirectoryTagCheckedShortArray(tif,ndir,dir,TIFFTAG_TRANSFERFUNCTION,n*m,o);
-	_TIFFfree(o);
-	return(p);
-}
-
-static int
-TIFFWriteDirectoryTagSubifd(TIFF* tif, uint32* ndir, TIFFDirEntry* dir)
-{
-	static const char module[] = "TIFFWriteDirectoryTagSubifd";
-	uint64 m;
-	int n;
-	if (tif->tif_dir.td_nsubifd==0)
-		return(1);
-	if (dir==NULL)
-	{
-		(*ndir)++;
-		return(1);
-	}
-	m=tif->tif_dataoff;
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-	{
-		uint32* o;
-		uint64* pa;
-		uint32* pb;
-		uint16 p;
-		o=_TIFFmalloc(tif->tif_dir.td_nsubifd*sizeof(uint32));
-		if (o==NULL)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-			return(0);
-		}
-		pa=tif->tif_dir.td_subifd;
-		pb=o;
-		for (p=0; p < tif->tif_dir.td_nsubifd; p++)
-		{
-                        assert(pa != 0);
-
-                        /* Could happen if an classicTIFF has a SubIFD of type LONG8 (which is illegal) */
-                        if( *pa > 0xFFFFFFFFUL)
-                        {
-                            TIFFErrorExt(tif->tif_clientdata,module,"Illegal value for SubIFD tag");
-                            _TIFFfree(o);
-                            return(0);
-                        }
-			*pb++=(uint32)(*pa++);
-		}
-		n=TIFFWriteDirectoryTagCheckedIfdArray(tif,ndir,dir,TIFFTAG_SUBIFD,tif->tif_dir.td_nsubifd,o);
-		_TIFFfree(o);
-	}
-	else
-		n=TIFFWriteDirectoryTagCheckedIfd8Array(tif,ndir,dir,TIFFTAG_SUBIFD,tif->tif_dir.td_nsubifd,tif->tif_dir.td_subifd);
-	if (!n)
-		return(0);
-	/*
-	 * Total hack: if this directory includes a SubIFD
-	 * tag then force the next <n> directories to be
-	 * written as ``sub directories'' of this one.  This
-	 * is used to write things like thumbnails and
-	 * image masks that one wants to keep out of the
-	 * normal directory linkage access mechanism.
-	 */
-	tif->tif_flags|=TIFF_INSUBIFD;
-	tif->tif_nsubifd=tif->tif_dir.td_nsubifd;
-	if (tif->tif_dir.td_nsubifd==1)
-		tif->tif_subifdoff=0;
-	else
-		tif->tif_subifdoff=m;
-	return(1);
-}
-
-static int
-TIFFWriteDirectoryTagCheckedAscii(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, char* value)
-{
-	assert(sizeof(char)==1);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_ASCII,count,count,value));
-}
+    o = TIFFWriteDirectoryTagCheckedIfdArray(tif, ndir, dir, tag, count, p);
+    _TIFFfreeExt(tif, p);
 
-static int
-TIFFWriteDirectoryTagCheckedUndefinedArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint8* value)
-{
-	assert(sizeof(uint8)==1);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_UNDEFINED,count,count,value));
+    return (o);
 }
 
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagCheckedByte(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint8 value)
+static int TIFFWriteDirectoryTagColormap(TIFF *tif, uint32_t *ndir,
+                                         TIFFDirEntry *dir)
 {
-	assert(sizeof(uint8)==1);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_BYTE,1,1,&value));
-}
-#endif
+    static const char module[] = "TIFFWriteDirectoryTagColormap";
+    uint32_t m;
+    uint16_t *n;
+    int o;
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    m = (1 << tif->tif_dir.td_bitspersample);
+    n = _TIFFmallocExt(tif, 3 * m * sizeof(uint16_t));
+    if (n == NULL)
+    {
+        TIFFErrorExtR(tif, module, "Out of memory");
+        return (0);
+    }
+    _TIFFmemcpy(&n[0], tif->tif_dir.td_colormap[0], m * sizeof(uint16_t));
+    _TIFFmemcpy(&n[m], tif->tif_dir.td_colormap[1], m * sizeof(uint16_t));
+    _TIFFmemcpy(&n[2 * m], tif->tif_dir.td_colormap[2], m * sizeof(uint16_t));
+    o = TIFFWriteDirectoryTagCheckedShortArray(tif, ndir, dir, TIFFTAG_COLORMAP,
+                                               3 * m, n);
+    _TIFFfreeExt(tif, n);
+    return (o);
+}
+
+static int TIFFWriteDirectoryTagTransferfunction(TIFF *tif, uint32_t *ndir,
+                                                 TIFFDirEntry *dir)
+{
+    static const char module[] = "TIFFWriteDirectoryTagTransferfunction";
+    uint32_t m;
+    uint16_t n;
+    uint16_t *o;
+    int p;
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    /* TIFFTAG_TRANSFERFUNCTION expects (1 or 3) pointer to arrays with
+     *  (1 << BitsPerSample) * uint16_t values.
+     */
+    m = (1 << tif->tif_dir.td_bitspersample);
+    /* clang-format off */
+    n = (tif->tif_dir.td_samplesperpixel - tif->tif_dir.td_extrasamples) > 1 ? 3 : 1;
+    /* clang-format on */
+
+    /* Check for proper number of transferfunctions */
+    for (int i = 0; i < n; i++)
+    {
+        if (tif->tif_dir.td_transferfunction[i] == NULL)
+        {
+            TIFFWarningExtR(
+                tif, module,
+                "Too few TransferFunctions provided. Tag not written to file");
+            return (1); /* Not an error; only tag is not written. */
+        }
+    }
+    /*
+     * Check if the table can be written as a single column,
+     * or if it must be written as 3 columns.  Note that we
+     * write a 3-column tag if there are 2 samples/pixel and
+     * a single column of data won't suffice--hmm.
+     */
+    if (n == 3)
+    {
+        if (!_TIFFmemcmp(tif->tif_dir.td_transferfunction[0],
+                         tif->tif_dir.td_transferfunction[2],
+                         m * sizeof(uint16_t)) &&
+            !_TIFFmemcmp(tif->tif_dir.td_transferfunction[0],
+                         tif->tif_dir.td_transferfunction[1],
+                         m * sizeof(uint16_t)))
+            n = 1;
+    }
+    o = _TIFFmallocExt(tif, n * m * sizeof(uint16_t));
+    if (o == NULL)
+    {
+        TIFFErrorExtR(tif, module, "Out of memory");
+        return (0);
+    }
+    _TIFFmemcpy(&o[0], tif->tif_dir.td_transferfunction[0],
+                m * sizeof(uint16_t));
+    if (n > 1)
+        _TIFFmemcpy(&o[m], tif->tif_dir.td_transferfunction[1],
+                    m * sizeof(uint16_t));
+    if (n > 2)
+        _TIFFmemcpy(&o[2 * m], tif->tif_dir.td_transferfunction[2],
+                    m * sizeof(uint16_t));
+    p = TIFFWriteDirectoryTagCheckedShortArray(
+        tif, ndir, dir, TIFFTAG_TRANSFERFUNCTION, n * m, o);
+    _TIFFfreeExt(tif, o);
+    return (p);
+}
+
+static int TIFFWriteDirectoryTagSubifd(TIFF *tif, uint32_t *ndir,
+                                       TIFFDirEntry *dir)
+{
+    static const char module[] = "TIFFWriteDirectoryTagSubifd";
+    uint64_t m;
+    int n;
+    if (tif->tif_dir.td_nsubifd == 0)
+        return (1);
+    if (dir == NULL)
+    {
+        (*ndir)++;
+        return (1);
+    }
+    m = tif->tif_dataoff;
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        uint32_t *o;
+        uint64_t *pa;
+        uint32_t *pb;
+        uint16_t p;
+        o = _TIFFmallocExt(tif, tif->tif_dir.td_nsubifd * sizeof(uint32_t));
+        if (o == NULL)
+        {
+            TIFFErrorExtR(tif, module, "Out of memory");
+            return (0);
+        }
+        pa = tif->tif_dir.td_subifd;
+        pb = o;
+        for (p = 0; p < tif->tif_dir.td_nsubifd; p++)
+        {
+            assert(pa != 0);
 
-static int
-TIFFWriteDirectoryTagCheckedByteArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint8* value)
-{
-	assert(sizeof(uint8)==1);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_BYTE,count,count,value));
+            /* Could happen if an classicTIFF has a SubIFD of type LONG8 (which
+             * is illegal) */
+            if (*pa > 0xFFFFFFFFUL)
+            {
+                TIFFErrorExtR(tif, module, "Illegal value for SubIFD tag");
+                _TIFFfreeExt(tif, o);
+                return (0);
+            }
+            *pb++ = (uint32_t)(*pa++);
+        }
+        n = TIFFWriteDirectoryTagCheckedIfdArray(tif, ndir, dir, TIFFTAG_SUBIFD,
+                                                 tif->tif_dir.td_nsubifd, o);
+        _TIFFfreeExt(tif, o);
+    }
+    else
+        n = TIFFWriteDirectoryTagCheckedIfd8Array(
+            tif, ndir, dir, TIFFTAG_SUBIFD, tif->tif_dir.td_nsubifd,
+            tif->tif_dir.td_subifd);
+    if (!n)
+        return (0);
+    /*
+     * Total hack: if this directory includes a SubIFD
+     * tag then force the next <n> directories to be
+     * written as ``sub directories'' of this one.  This
+     * is used to write things like thumbnails and
+     * image masks that one wants to keep out of the
+     * normal directory linkage access mechanism.
+     */
+    tif->tif_flags |= TIFF_INSUBIFD;
+    tif->tif_nsubifd = tif->tif_dir.td_nsubifd;
+    if (tif->tif_dir.td_nsubifd == 1)
+        tif->tif_subifdoff = 0;
+    else
+        tif->tif_subifdoff = m;
+    return (1);
 }
 
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagCheckedSbyte(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int8 value)
+static int TIFFWriteDirectoryTagCheckedAscii(TIFF *tif, uint32_t *ndir,
+                                             TIFFDirEntry *dir, uint16_t tag,
+                                             uint32_t count, char *value)
 {
-	assert(sizeof(int8)==1);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SBYTE,1,1,&value));
+    assert(sizeof(char) == 1);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_ASCII, count,
+                                      count, value));
 }
-#endif
 
-static int
-TIFFWriteDirectoryTagCheckedSbyteArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int8* value)
+static int TIFFWriteDirectoryTagCheckedUndefinedArray(TIFF *tif, uint32_t *ndir,
+                                                      TIFFDirEntry *dir,
+                                                      uint16_t tag,
+                                                      uint32_t count,
+                                                      uint8_t *value)
 {
-	assert(sizeof(int8)==1);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SBYTE,count,count,value));
+    assert(sizeof(uint8_t) == 1);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_UNDEFINED,
+                                      count, count, value));
 }
 
-static int
-TIFFWriteDirectoryTagCheckedShort(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint16 value)
+static int TIFFWriteDirectoryTagCheckedByteArray(TIFF *tif, uint32_t *ndir,
+                                                 TIFFDirEntry *dir,
+                                                 uint16_t tag, uint32_t count,
+                                                 uint8_t *value)
 {
-	uint16 m;
-	assert(sizeof(uint16)==2);
-	m=value;
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabShort(&m);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SHORT,1,2,&m));
+    assert(sizeof(uint8_t) == 1);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_BYTE, count,
+                                      count, value));
 }
 
-static int
-TIFFWriteDirectoryTagCheckedShortArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint16* value)
+static int TIFFWriteDirectoryTagCheckedSbyteArray(TIFF *tif, uint32_t *ndir,
+                                                  TIFFDirEntry *dir,
+                                                  uint16_t tag, uint32_t count,
+                                                  int8_t *value)
 {
-	assert(count<0x80000000);
-	assert(sizeof(uint16)==2);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfShort(value,count);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SHORT,count,count*2,value));
+    assert(sizeof(int8_t) == 1);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_SBYTE, count,
+                                      count, value));
 }
 
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagCheckedSshort(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int16 value)
+static int TIFFWriteDirectoryTagCheckedShort(TIFF *tif, uint32_t *ndir,
+                                             TIFFDirEntry *dir, uint16_t tag,
+                                             uint16_t value)
 {
-	int16 m;
-	assert(sizeof(int16)==2);
-	m=value;
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabShort((uint16*)(&m));
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SSHORT,1,2,&m));
+    uint16_t m;
+    assert(sizeof(uint16_t) == 2);
+    m = value;
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabShort(&m);
+    return (
+        TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_SHORT, 1, 2, &m));
 }
-#endif
 
-static int
-TIFFWriteDirectoryTagCheckedSshortArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int16* value)
+static int TIFFWriteDirectoryTagCheckedShortArray(TIFF *tif, uint32_t *ndir,
+                                                  TIFFDirEntry *dir,
+                                                  uint16_t tag, uint32_t count,
+                                                  uint16_t *value)
 {
-	assert(count<0x80000000);
-	assert(sizeof(int16)==2);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfShort((uint16*)value,count);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SSHORT,count,count*2,value));
+    assert(count < 0x80000000);
+    assert(sizeof(uint16_t) == 2);
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfShort(value, count);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_SHORT, count,
+                                      count * 2, value));
 }
 
-static int
-TIFFWriteDirectoryTagCheckedLong(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 value)
+static int TIFFWriteDirectoryTagCheckedSshortArray(TIFF *tif, uint32_t *ndir,
+                                                   TIFFDirEntry *dir,
+                                                   uint16_t tag, uint32_t count,
+                                                   int16_t *value)
 {
-	uint32 m;
-	assert(sizeof(uint32)==4);
-	m=value;
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabLong(&m);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_LONG,1,4,&m));
+    assert(count < 0x80000000);
+    assert(sizeof(int16_t) == 2);
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfShort((uint16_t *)value, count);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_SSHORT, count,
+                                      count * 2, value));
 }
 
-static int
-TIFFWriteDirectoryTagCheckedLongArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint32* value)
+static int TIFFWriteDirectoryTagCheckedLong(TIFF *tif, uint32_t *ndir,
+                                            TIFFDirEntry *dir, uint16_t tag,
+                                            uint32_t value)
 {
-	assert(count<0x40000000);
-	assert(sizeof(uint32)==4);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfLong(value,count);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_LONG,count,count*4,value));
+    uint32_t m;
+    assert(sizeof(uint32_t) == 4);
+    m = value;
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabLong(&m);
+    return (
+        TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_LONG, 1, 4, &m));
 }
 
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagCheckedSlong(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int32 value)
+static int TIFFWriteDirectoryTagCheckedLongArray(TIFF *tif, uint32_t *ndir,
+                                                 TIFFDirEntry *dir,
+                                                 uint16_t tag, uint32_t count,
+                                                 uint32_t *value)
 {
-	int32 m;
-	assert(sizeof(int32)==4);
-	m=value;
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabLong((uint32*)(&m));
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SLONG,1,4,&m));
+    assert(count < 0x40000000);
+    assert(sizeof(uint32_t) == 4);
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong(value, count);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_LONG, count,
+                                      count * 4, value));
 }
-#endif
 
-static int
-TIFFWriteDirectoryTagCheckedSlongArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int32* value)
-{
-	assert(count<0x40000000);
-	assert(sizeof(int32)==4);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfLong((uint32*)value,count);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SLONG,count,count*4,value));
+static int TIFFWriteDirectoryTagCheckedSlongArray(TIFF *tif, uint32_t *ndir,
+                                                  TIFFDirEntry *dir,
+                                                  uint16_t tag, uint32_t count,
+                                                  int32_t *value)
+{
+    assert(count < 0x40000000);
+    assert(sizeof(int32_t) == 4);
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong((uint32_t *)value, count);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_SLONG, count,
+                                      count * 4, value));
 }
 
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagCheckedLong8(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint64 value)
-{
-	uint64 m;
-	assert(sizeof(uint64)==8);
-	if( !(tif->tif_flags&TIFF_BIGTIFF) ) {
-		TIFFErrorExt(tif->tif_clientdata,"TIFFWriteDirectoryTagCheckedLong8","LONG8 not allowed for ClassicTIFF");
-		return(0);
-	}
-	m=value;
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabLong8(&m);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_LONG8,1,8,&m));
+static int TIFFWriteDirectoryTagCheckedLong8Array(TIFF *tif, uint32_t *ndir,
+                                                  TIFFDirEntry *dir,
+                                                  uint16_t tag, uint32_t count,
+                                                  uint64_t *value)
+{
+    assert(count < 0x20000000);
+    assert(sizeof(uint64_t) == 8);
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        TIFFErrorExtR(tif, "TIFFWriteDirectoryTagCheckedLong8Array",
+                      "LONG8 not allowed for ClassicTIFF");
+        return (0);
+    }
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong8(value, count);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_LONG8, count,
+                                      count * 8, value));
 }
-#endif
 
-static int
-TIFFWriteDirectoryTagCheckedLong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value)
+static int TIFFWriteDirectoryTagCheckedSlong8Array(TIFF *tif, uint32_t *ndir,
+                                                   TIFFDirEntry *dir,
+                                                   uint16_t tag, uint32_t count,
+                                                   int64_t *value)
 {
-	assert(count<0x20000000);
-	assert(sizeof(uint64)==8);
-	if( !(tif->tif_flags&TIFF_BIGTIFF) ) {
-		TIFFErrorExt(tif->tif_clientdata,"TIFFWriteDirectoryTagCheckedLong8Array","LONG8 not allowed for ClassicTIFF");
-		return(0);
-	}
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfLong8(value,count);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_LONG8,count,count*8,value));
-}
-
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagCheckedSlong8(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, int64 value)
-{
-	int64 m;
-	assert(sizeof(int64)==8);
-	if( !(tif->tif_flags&TIFF_BIGTIFF) ) {
-		TIFFErrorExt(tif->tif_clientdata,"TIFFWriteDirectoryTagCheckedSlong8","SLONG8 not allowed for ClassicTIFF");
-		return(0);
-	}
-	m=value;
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabLong8((uint64*)(&m));
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SLONG8,1,8,&m));
+    assert(count < 0x20000000);
+    assert(sizeof(int64_t) == 8);
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        TIFFErrorExtR(tif, "TIFFWriteDirectoryTagCheckedSlong8Array",
+                      "SLONG8 not allowed for ClassicTIFF");
+        return (0);
+    }
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong8((uint64_t *)value, count);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_SLONG8, count,
+                                      count * 8, value));
 }
-#endif
 
-static int
-TIFFWriteDirectoryTagCheckedSlong8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, int64* value)
+static int TIFFWriteDirectoryTagCheckedRational(TIFF *tif, uint32_t *ndir,
+                                                TIFFDirEntry *dir, uint16_t tag,
+                                                double value)
 {
-	assert(count<0x20000000);
-	assert(sizeof(int64)==8);
-	if( !(tif->tif_flags&TIFF_BIGTIFF) ) {
-		TIFFErrorExt(tif->tif_clientdata,"TIFFWriteDirectoryTagCheckedSlong8Array","SLONG8 not allowed for ClassicTIFF");
-		return(0);
-	}
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfLong8((uint64*)value,count);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SLONG8,count,count*8,value));
-}
-
-static int
-TIFFWriteDirectoryTagCheckedRational(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagCheckedRational";
-	uint32 m[2];
-	assert(sizeof(uint32)==4);
-	if (value < 0) 
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Negative value is illegal");
-		return 0;
-	} 
-	else if (value != value) 
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Not-a-number value is illegal");
-		return 0;
-	}
-#ifdef not_def
-	else if (value==0.0)
-	{
-		m[0]=0;
-		m[1]=1;
-	}
-	else if (value <= 0xFFFFFFFFU && value==(double)(uint32)value)
-	{
-		m[0]=(uint32)value;
-		m[1]=1;
-	}
-	else if (value<1.0)
-	{
-		m[0]=(uint32)(value*0xFFFFFFFF);
-		m[1]=0xFFFFFFFF;
-	}
-	else
-	{
-		m[0]=0xFFFFFFFF;
-		m[1]=(uint32)(0xFFFFFFFF/value);
-	}
-#else
-	/*--Rational2Double: New function also used for non-custom rational tags. 
-	 *  However, could be omitted here, because TIFFWriteDirectoryTagCheckedRational() is not used by code for custom tags,
-	 *  only by code for named-tiff-tags like FIELD_RESOLUTION and FIELD_POSITION */
-	else {
-	DoubleToRational(value, &m[0], &m[1]);
-	}
-#endif
-
-	if (tif->tif_flags&TIFF_SWAB)
-	{
-		TIFFSwabLong(&m[0]);
-		TIFFSwabLong(&m[1]);
-	}
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_RATIONAL,1,8,&m[0]));
-}
+    static const char module[] = "TIFFWriteDirectoryTagCheckedRational";
+    uint32_t m[2];
+    assert(sizeof(uint32_t) == 4);
+    if (value < 0)
+    {
+        TIFFErrorExtR(tif, module, "Negative value is illegal");
+        return 0;
+    }
+    else if (value != value)
+    {
+        TIFFErrorExtR(tif, module, "Not-a-number value is illegal");
+        return 0;
+    }
+    /*--Rational2Double: New function also used for non-custom rational tags.
+     *  However, could be omitted here, because
+     * TIFFWriteDirectoryTagCheckedRational() is not used by code for custom
+     * tags, only by code for named-tiff-tags like FIELD_RESOLUTION and
+     * FIELD_POSITION */
+    else
+    {
+        DoubleToRational(value, &m[0], &m[1]);
+    }
 
-static int
-TIFFWriteDirectoryTagCheckedRationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagCheckedRationalArray";
-	uint32* m;
-	float* na;
-	uint32* nb;
-	uint32 nc;
-	int o;
-	assert(sizeof(uint32)==4);
-	m=_TIFFmalloc(count*2*sizeof(uint32));
-	if (m==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	for (na=value, nb=m, nc=0; nc<count; na++, nb+=2, nc++)
-	{
-#ifdef not_def
-		if (*na<=0.0 || *na != *na)
-		{
-			nb[0]=0;
-			nb[1]=1;
-		}
-		else if (*na >= 0 && *na <= (float)0xFFFFFFFFU &&
-                         *na==(float)(uint32)(*na))
-		{
-			nb[0]=(uint32)(*na);
-			nb[1]=1;
-		}
-		else if (*na<1.0)
-		{
-			nb[0]=(uint32)((double)(*na)*0xFFFFFFFF);
-			nb[1]=0xFFFFFFFF;
-		}
-		else
-		{
-			nb[0]=0xFFFFFFFF;
-			nb[1]=(uint32)((double)0xFFFFFFFF/(*na));
-		}
-#else
-		/*-- Rational2Double: Also for float precision accuracy is sometimes enhanced --*/
-		DoubleToRational(*na, &nb[0], &nb[1]);
-#endif
-	}
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfLong(m,count*2);
-	o=TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_RATIONAL,count,count*8,&m[0]);
-	_TIFFfree(m);
-	return(o);
+    if (tif->tif_flags & TIFF_SWAB)
+    {
+        TIFFSwabLong(&m[0]);
+        TIFFSwabLong(&m[1]);
+    }
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_RATIONAL, 1, 8,
+                                      &m[0]));
 }
 
-static int
-TIFFWriteDirectoryTagCheckedSrationalArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagCheckedSrationalArray";
-	int32* m;
-	float* na;
-	int32* nb;
-	uint32 nc;
-	int o;
-	assert(sizeof(int32)==4);
-	m=_TIFFmalloc(count*2*sizeof(int32));
-	if (m==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	for (na=value, nb=m, nc=0; nc<count; na++, nb+=2, nc++)
-	{
-#ifdef not_def
-		if (*na<0.0)
-		{
-			if (*na==(int32)(*na))
-			{
-				nb[0]=(int32)(*na);
-				nb[1]=1;
-			}
-			else if (*na>-1.0)
-			{
-				nb[0]=-(int32)((double)(-*na)*0x7FFFFFFF);
-				nb[1]=0x7FFFFFFF;
-			}
-			else
-			{
-				nb[0]=-0x7FFFFFFF;
-				nb[1]=(int32)((double)0x7FFFFFFF/(-*na));
-			}
-		}
-		else
-		{
-			if (*na==(int32)(*na))
-			{
-				nb[0]=(int32)(*na);
-				nb[1]=1;
-			}
-			else if (*na<1.0)
-			{
-				nb[0]=(int32)((double)(*na)*0x7FFFFFFF);
-				nb[1]=0x7FFFFFFF;
-			}
-			else
-			{
-				nb[0]=0x7FFFFFFF;
-				nb[1]=(int32)((double)0x7FFFFFFF/(*na));
-			}
-		}
-#else
-		/*-- Rational2Double: Also for float precision accuracy is sometimes enhanced --*/
-		DoubleToSrational(*na, &nb[0], &nb[1]);
-#endif
-	}
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfLong((uint32*)m,count*2);
-	o=TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SRATIONAL,count,count*8,&m[0]);
-	_TIFFfree(m);
-	return(o);
+static int TIFFWriteDirectoryTagCheckedRationalArray(TIFF *tif, uint32_t *ndir,
+                                                     TIFFDirEntry *dir,
+                                                     uint16_t tag,
+                                                     uint32_t count,
+                                                     float *value)
+{
+    static const char module[] = "TIFFWriteDirectoryTagCheckedRationalArray";
+    uint32_t *m;
+    float *na;
+    uint32_t *nb;
+    uint32_t nc;
+    int o;
+    assert(sizeof(uint32_t) == 4);
+    m = _TIFFmallocExt(tif, count * 2 * sizeof(uint32_t));
+    if (m == NULL)
+    {
+        TIFFErrorExtR(tif, module, "Out of memory");
+        return (0);
+    }
+    for (na = value, nb = m, nc = 0; nc < count; na++, nb += 2, nc++)
+    {
+        DoubleToRational(*na, &nb[0], &nb[1]);
+    }
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong(m, count * 2);
+    o = TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_RATIONAL, count,
+                                  count * 8, &m[0]);
+    _TIFFfreeExt(tif, m);
+    return (o);
+}
+
+static int TIFFWriteDirectoryTagCheckedSrationalArray(TIFF *tif, uint32_t *ndir,
+                                                      TIFFDirEntry *dir,
+                                                      uint16_t tag,
+                                                      uint32_t count,
+                                                      float *value)
+{
+    static const char module[] = "TIFFWriteDirectoryTagCheckedSrationalArray";
+    int32_t *m;
+    float *na;
+    int32_t *nb;
+    uint32_t nc;
+    int o;
+    assert(sizeof(int32_t) == 4);
+    m = _TIFFmallocExt(tif, count * 2 * sizeof(int32_t));
+    if (m == NULL)
+    {
+        TIFFErrorExtR(tif, module, "Out of memory");
+        return (0);
+    }
+    for (na = value, nb = m, nc = 0; nc < count; na++, nb += 2, nc++)
+    {
+        DoubleToSrational(*na, &nb[0], &nb[1]);
+    }
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong((uint32_t *)m, count * 2);
+    o = TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_SRATIONAL, count,
+                                  count * 8, &m[0]);
+    _TIFFfreeExt(tif, m);
+    return (o);
 }
 
 /*-- Rational2Double: additional write functions for double arrays */
 static int
-TIFFWriteDirectoryTagCheckedRationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagCheckedRationalDoubleArray";
-	uint32* m;
-	double* na;
-	uint32* nb;
-	uint32 nc;
-	int o;
-	assert(sizeof(uint32)==4);
-	m=_TIFFmalloc(count*2*sizeof(uint32));
-	if (m==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	for (na=value, nb=m, nc=0; nc<count; na++, nb+=2, nc++)
-	{
-		DoubleToRational(*na, &nb[0], &nb[1]);
-	}
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfLong(m,count*2);
-	o=TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_RATIONAL,count,count*8,&m[0]);
-	_TIFFfree(m);
-	return(o);
+TIFFWriteDirectoryTagCheckedRationalDoubleArray(TIFF *tif, uint32_t *ndir,
+                                                TIFFDirEntry *dir, uint16_t tag,
+                                                uint32_t count, double *value)
+{
+    static const char module[] =
+        "TIFFWriteDirectoryTagCheckedRationalDoubleArray";
+    uint32_t *m;
+    double *na;
+    uint32_t *nb;
+    uint32_t nc;
+    int o;
+    assert(sizeof(uint32_t) == 4);
+    m = _TIFFmallocExt(tif, count * 2 * sizeof(uint32_t));
+    if (m == NULL)
+    {
+        TIFFErrorExtR(tif, module, "Out of memory");
+        return (0);
+    }
+    for (na = value, nb = m, nc = 0; nc < count; na++, nb += 2, nc++)
+    {
+        DoubleToRational(*na, &nb[0], &nb[1]);
+    }
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong(m, count * 2);
+    o = TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_RATIONAL, count,
+                                  count * 8, &m[0]);
+    _TIFFfreeExt(tif, m);
+    return (o);
 } /*-- TIFFWriteDirectoryTagCheckedRationalDoubleArray() ------- */
 
-static int
-TIFFWriteDirectoryTagCheckedSrationalDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value)
-{
-	static const char module[] = "TIFFWriteDirectoryTagCheckedSrationalDoubleArray";
-	int32* m;
-	double* na;
-	int32* nb;
-	uint32 nc;
-	int o;
-	assert(sizeof(int32)==4);
-	m=_TIFFmalloc(count*2*sizeof(int32));
-	if (m==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	for (na=value, nb=m, nc=0; nc<count; na++, nb+=2, nc++)
-	{
-		DoubleToSrational(*na, &nb[0], &nb[1]);
-	}
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfLong((uint32*)m,count*2);
-	o=TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_SRATIONAL,count,count*8,&m[0]);
-	_TIFFfree(m);
-	return(o);
+static int TIFFWriteDirectoryTagCheckedSrationalDoubleArray(
+    TIFF *tif, uint32_t *ndir, TIFFDirEntry *dir, uint16_t tag, uint32_t count,
+    double *value)
+{
+    static const char module[] =
+        "TIFFWriteDirectoryTagCheckedSrationalDoubleArray";
+    int32_t *m;
+    double *na;
+    int32_t *nb;
+    uint32_t nc;
+    int o;
+    assert(sizeof(int32_t) == 4);
+    m = _TIFFmallocExt(tif, count * 2 * sizeof(int32_t));
+    if (m == NULL)
+    {
+        TIFFErrorExtR(tif, module, "Out of memory");
+        return (0);
+    }
+    for (na = value, nb = m, nc = 0; nc < count; na++, nb += 2, nc++)
+    {
+        DoubleToSrational(*na, &nb[0], &nb[1]);
+    }
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong((uint32_t *)m, count * 2);
+    o = TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_SRATIONAL, count,
+                                  count * 8, &m[0]);
+    _TIFFfreeExt(tif, m);
+    return (o);
 } /*--- TIFFWriteDirectoryTagCheckedSrationalDoubleArray() -------- */
 
-#if 0
-static
-void DoubleToRational_direct(double value, unsigned long *num, unsigned long *denom)
-{
-	/*--- OLD Code for debugging and comparison  ---- */
-	/* code merged from TIFFWriteDirectoryTagCheckedRationalArray() and TIFFWriteDirectoryTagCheckedRational() */
-
-	/* First check for zero and also check for negative numbers (which are illegal for RATIONAL) 
-	 * and also check for "not-a-number". In each case just set this to zero to support also rational-arrays.
-	  */
-	if (value<=0.0 || value != value)
-	{
-		*num=0;
-		*denom=1;
-	}
-	else if (value <= 0xFFFFFFFFU &&  (value==(double)(uint32)(value)))	/* check for integer values */
-	{
-		*num=(uint32)(value);
-		*denom=1;
-	}
-	else if (value<1.0)
-	{
-		*num = (uint32)((value) * (double)0xFFFFFFFFU);
-		*denom=0xFFFFFFFFU;
-	}
-	else
-	{
-		*num=0xFFFFFFFFU;
-		*denom=(uint32)((double)0xFFFFFFFFU/(value));
-	}
-}  /*-- DoubleToRational_direct() -------------- */
-#endif
-
-#if 0
-static
-void DoubleToSrational_direct(double value,  long *num,  long *denom)
-{
-	/*--- OLD Code for debugging and comparison -- SIGNED-version ----*/
-	/*  code was amended from original TIFFWriteDirectoryTagCheckedSrationalArray() */
-
-	/* First check for zero and also check for negative numbers (which are illegal for RATIONAL)
-	 * and also check for "not-a-number". In each case just set this to zero to support also rational-arrays.
-	  */
-	if (value<0.0)
-		{
-			if (value==(int32)(value))
-			{
-				*num=(int32)(value);
-				*denom=1;
-			}
-			else if (value>-1.0)
-			{
-				*num=-(int32)((-value) * (double)0x7FFFFFFF);
-				*denom=0x7FFFFFFF;
-			}
-			else
-			{
-				*num=-0x7FFFFFFF;
-				*denom=(int32)((double)0x7FFFFFFF / (-value));
-			}
-		}
-		else
-		{
-			if (value==(int32)(value))
-			{
-				*num=(int32)(value);
-				*denom=1;
-			}
-			else if (value<1.0)
-			{
-				*num=(int32)((value)  *(double)0x7FFFFFFF);
-				*denom=0x7FFFFFFF;
-			}
-			else
-			{
-				*num=0x7FFFFFFF;
-				*denom=(int32)((double)0x7FFFFFFF / (value));
-			}
-		}
-}  /*-- DoubleToSrational_direct() --------------*/
-#endif
-
-//#define DOUBLE2RAT_DEBUGOUTPUT
-/** -----  Rational2Double: Double To Rational Conversion ----------------------------------------------------------
-* There is a mathematical theorem to convert real numbers into a rational (integer fraction) number.
-* This is called "continuous fraction" which uses the Euclidean algorithm to find the greatest common divisor (GCD).
-*  (ref. e.g. https://de.wikipedia.org/wiki/Kettenbruch or https://en.wikipedia.org/wiki/Continued_fraction
+/** -----  Rational2Double: Double To Rational Conversion
+----------------------------------------------------------
+* There is a mathematical theorem to convert real numbers into a rational
+(integer fraction) number.
+* This is called "continuous fraction" which uses the Euclidean algorithm to
+find the greatest common divisor (GCD).
+*  (ref. e.g. https://de.wikipedia.org/wiki/Kettenbruch or
+https://en.wikipedia.org/wiki/Continued_fraction
 *             https://en.wikipedia.org/wiki/Euclidean_algorithm)
 * The following functions implement the
-* - ToRationalEuclideanGCD()		auxiliary function which mainly implements euclidean GCD
-* - DoubleToRational()			conversion function for un-signed rationals
+* - ToRationalEuclideanGCD()		auxiliary function which mainly
+implements euclidean GCD
+* - DoubleToRational()			conversion function for un-signed
+rationals
 * - DoubleToSrational()			conversion function for signed rationals
 ------------------------------------------------------------------------------------------------------------------*/
 
@@ -2709,583 +2534,627 @@ void DoubleToSrational_direct(double value,  long *num,  long *denom)
 * Calculates the rational fractional of a double input value
 * using the Euclidean algorithm to find the greatest common divisor (GCD)
 ------------------------------------------------------------------------*/
-static
-void ToRationalEuclideanGCD(double value, int blnUseSignedRange, int blnUseSmallRange, unsigned long long *ullNum, unsigned long long *ullDenom)
-{
-	/* Internally, the integer variables can be bigger than the external ones,
-	* as long as the result will fit into the external variable size.
-	*/
-	unsigned long long val, numSum[3] = { 0, 1, 0 }, denomSum[3] = { 1, 0, 0 };
-	unsigned long long aux, bigNum, bigDenom;
-	unsigned long long returnLimit;
-	int i;
-	unsigned long long nMax;
-	double fMax;
-	unsigned long maxDenom;
-	/*-- nMax and fMax defines the initial accuracy of the starting fractional,
-	*   or better, the highest used integer numbers used within the starting fractional (bigNum/bigDenom).
-	*   There are two approaches, which can accidentally lead to different accuracies just depending on the value.
-	*   Therefore, blnUseSmallRange steers this behavior.
-	*   For long long nMax = ((9223372036854775807-1)/2); for long nMax = ((2147483647-1)/2);
-	*/
-	if (blnUseSmallRange) {
-		nMax = (unsigned long long)((2147483647 - 1) / 2); /* for ULONG range */
-	}
-	else {
-		nMax = ((9223372036854775807 - 1) / 2);				/* for ULLONG range */
-	}
-	fMax = (double)nMax;
-
-	/*-- For the Euclidean GCD define the denominator range, so that it stays within size of unsigned long variables.
-	*   maxDenom should be LONG_MAX for negative values and ULONG_MAX for positive ones.
-	*   Also the final returned value of ullNum and ullDenom is limited according to signed- or unsigned-range.
-	*/
-	if (blnUseSignedRange) {
-		maxDenom = 2147483647UL;  /*LONG_MAX = 0x7FFFFFFFUL*/
-		returnLimit = maxDenom;
-	}
-	else {
-		maxDenom = 0xFFFFFFFFUL;  /*ULONG_MAX = 0xFFFFFFFFUL*/
-		returnLimit = maxDenom;
-	}
-
-	/*-- First generate a rational fraction (bigNum/bigDenom) which represents the value
-	*   as a rational number with the highest accuracy. Therefore, unsigned long long (uint64) is needed.
-	*   This rational fraction is then reduced using the Euclidean algorithm to find the greatest common divisor (GCD).
-	*   bigNum   = big numinator of value without fraction (or cut residual fraction)
-	*   bigDenom = big denominator of value
-	*-- Break-criteria so that uint64 cast to "bigNum" introduces no error and bigDenom has no overflow,
-	*   and stop with enlargement of fraction when the double-value of it reaches an integer number without fractional part.
-	*/
-	bigDenom = 1;
-	while ((value != floor(value)) && (value < fMax) && (bigDenom < nMax)) {
-		bigDenom <<= 1;
-		value *= 2;
-	}
-	bigNum = (unsigned long long)value;
-
-	/*-- Start Euclidean algorithm to find the greatest common divisor (GCD) -- */
+static void ToRationalEuclideanGCD(double value, int blnUseSignedRange,
+                                   int blnUseSmallRange, uint64_t *ullNum,
+                                   uint64_t *ullDenom)
+{
+    /* Internally, the integer variables can be bigger than the external ones,
+     * as long as the result will fit into the external variable size.
+     */
+    uint64_t numSum[3] = {0, 1, 0}, denomSum[3] = {1, 0, 0};
+    uint64_t aux, bigNum, bigDenom;
+    uint64_t returnLimit;
+    int i;
+    uint64_t nMax;
+    double fMax;
+    unsigned long maxDenom;
+    /*-- nMax and fMax defines the initial accuracy of the starting fractional,
+     *   or better, the highest used integer numbers used within the starting
+     * fractional (bigNum/bigDenom). There are two approaches, which can
+     * accidentally lead to different accuracies just depending on the value.
+     *   Therefore, blnUseSmallRange steers this behavior.
+     *   For long long nMax = ((9223372036854775807-1)/2); for long nMax =
+     * ((2147483647-1)/2);
+     */
+    if (blnUseSmallRange)
+    {
+        nMax = (uint64_t)((2147483647 - 1) / 2); /* for ULONG range */
+    }
+    else
+    {
+        nMax = ((9223372036854775807 - 1) / 2); /* for ULLONG range */
+    }
+    fMax = (double)nMax;
+
+    /*-- For the Euclidean GCD define the denominator range, so that it stays
+     * within size of unsigned long variables. maxDenom should be LONG_MAX for
+     * negative values and ULONG_MAX for positive ones. Also the final returned
+     * value of ullNum and ullDenom is limited according to signed- or
+     * unsigned-range.
+     */
+    if (blnUseSignedRange)
+    {
+        maxDenom = 2147483647UL; /*LONG_MAX = 0x7FFFFFFFUL*/
+        returnLimit = maxDenom;
+    }
+    else
+    {
+        maxDenom = 0xFFFFFFFFUL; /*ULONG_MAX = 0xFFFFFFFFUL*/
+        returnLimit = maxDenom;
+    }
+
+    /*-- First generate a rational fraction (bigNum/bigDenom) which represents
+     *the value as a rational number with the highest accuracy. Therefore,
+     *uint64_t (uint64_t) is needed. This rational fraction is then reduced
+     *using the Euclidean algorithm to find the greatest common divisor (GCD).
+     *   bigNum   = big numinator of value without fraction (or cut residual
+     *fraction) bigDenom = big denominator of value
+     *-- Break-criteria so that uint64_t cast to "bigNum" introduces no error
+     *and bigDenom has no overflow, and stop with enlargement of fraction when
+     *the double-value of it reaches an integer number without fractional part.
+     */
+    bigDenom = 1;
+    while ((value != floor(value)) && (value < fMax) && (bigDenom < nMax))
+    {
+        bigDenom <<= 1;
+        value *= 2;
+    }
+    bigNum = (uint64_t)value;
+
+    /*-- Start Euclidean algorithm to find the greatest common divisor (GCD) --
+     */
 #define MAX_ITERATIONS 64
-	for (i = 0; i < MAX_ITERATIONS; i++) {
-		/* if bigDenom is not zero, calculate integer part of fraction. */
-		if (bigDenom == 0) {
-			val = 0;
-			break;
-		}
-		else {
-			val = bigNum / bigDenom;
-		}
-
-		/* Set bigDenom to reminder of bigNum/bigDenom and bigNum to previous denominator bigDenom. */
-		aux = bigNum;
-		bigNum = bigDenom;
-		bigDenom = aux % bigDenom;
-
-		/* calculate next denominator and check for its given maximum */
-		aux = val;
-		if (denomSum[1] * val + denomSum[0] >= maxDenom) {
-			aux = (maxDenom - denomSum[0]) / denomSum[1];
-			if (aux * 2 >= val || denomSum[1] >= maxDenom)
-				i = (MAX_ITERATIONS + 1);			/* exit but execute rest of for-loop */
-			else
-				break;
-		}
-		/* calculate next numerator to numSum2 and save previous one to numSum0; numSum1 just copy of numSum2. */
-		numSum[2] = aux * numSum[1] + numSum[0];
-		numSum[0] = numSum[1];
-		numSum[1] = numSum[2];
-		/* calculate next denominator to denomSum2 and save previous one to denomSum0; denomSum1 just copy of denomSum2. */
-		denomSum[2] = aux * denomSum[1] + denomSum[0];
-		denomSum[0] = denomSum[1];
-		denomSum[1] = denomSum[2];
-	}
-
-	/*-- Check and adapt for final variable size and return values; reduces internal accuracy; denominator is kept in ULONG-range with maxDenom -- */
-	while (numSum[1] > returnLimit || denomSum[1] > returnLimit) {
-		numSum[1] = numSum[1] / 2;
-		denomSum[1] = denomSum[1] / 2;
-	}
-
-	/* return values */
-	*ullNum = numSum[1];
-	*ullDenom = denomSum[1];
-
-}  /*-- ToRationalEuclideanGCD() -------------- */
+    for (i = 0; i < MAX_ITERATIONS; i++)
+    {
+        uint64_t val;
+        /* if bigDenom is not zero, calculate integer part of fraction. */
+        if (bigDenom == 0)
+        {
+            break;
+        }
+        val = bigNum / bigDenom;
+
+        /* Set bigDenom to reminder of bigNum/bigDenom and bigNum to previous
+         * denominator bigDenom. */
+        aux = bigNum;
+        bigNum = bigDenom;
+        bigDenom = aux % bigDenom;
+
+        /* calculate next denominator and check for its given maximum */
+        aux = val;
+        if (denomSum[1] * val + denomSum[0] >= maxDenom)
+        {
+            aux = (maxDenom - denomSum[0]) / denomSum[1];
+            if (aux * 2 >= val || denomSum[1] >= maxDenom)
+                i = (MAX_ITERATIONS +
+                     1); /* exit but execute rest of for-loop */
+            else
+                break;
+        }
+        /* calculate next numerator to numSum2 and save previous one to numSum0;
+         * numSum1 just copy of numSum2. */
+        numSum[2] = aux * numSum[1] + numSum[0];
+        numSum[0] = numSum[1];
+        numSum[1] = numSum[2];
+        /* calculate next denominator to denomSum2 and save previous one to
+         * denomSum0; denomSum1 just copy of denomSum2. */
+        denomSum[2] = aux * denomSum[1] + denomSum[0];
+        denomSum[0] = denomSum[1];
+        denomSum[1] = denomSum[2];
+    }
+
+    /*-- Check and adapt for final variable size and return values; reduces
+     * internal accuracy; denominator is kept in ULONG-range with maxDenom -- */
+    while (numSum[1] > returnLimit || denomSum[1] > returnLimit)
+    {
+        numSum[1] = numSum[1] / 2;
+        denomSum[1] = denomSum[1] / 2;
+    }
 
+    /* return values */
+    *ullNum = numSum[1];
+    *ullDenom = denomSum[1];
+
+} /*-- ToRationalEuclideanGCD() -------------- */
 
 /**---- DoubleToRational() -----------------------------------------------
 * Calculates the rational fractional of a double input value
 * for UN-SIGNED rationals,
 * using the Euclidean algorithm to find the greatest common divisor (GCD)
 ------------------------------------------------------------------------*/
-static
-void DoubleToRational(double value, uint32 *num, uint32 *denom)
-{
-	/*---- UN-SIGNED RATIONAL ---- */
-	double dblDiff, dblDiff2;
-	unsigned long long ullNum, ullDenom, ullNum2, ullDenom2;
-
-	/*-- Check for negative values. If so it is an error. */
-        /* Test written that way to catch NaN */
-	if (!(value >= 0)) {
-		*num = *denom = 0;
-		TIFFErrorExt(0, "TIFFLib: DoubleToRational()", " Negative Value for Unsigned Rational given.");
-		return;
-	}
-
-	/*-- Check for too big numbers (> ULONG_MAX) -- */
-	if (value > 0xFFFFFFFFUL) {
-		*num = 0xFFFFFFFFU;
-		*denom = 0;
-		return;
-	}
-	/*-- Check for easy integer numbers -- */
-	if (value == (uint32)(value)) {
-		*num = (uint32)value;
-		*denom = 1;
-		return;
-	}
-	/*-- Check for too small numbers for "unsigned long" type rationals -- */
-	if (value < 1.0 / (double)0xFFFFFFFFUL) {
-		*num = 0;
-		*denom = 0xFFFFFFFFU;
-		return;
-	}
-
-	/*-- There are two approaches using the Euclidean algorithm,
-	*   which can accidentally lead to different accuracies just depending on the value.
-	*   Try both and define which one was better.
-	*/
-	ToRationalEuclideanGCD(value, FALSE, FALSE, &ullNum, &ullDenom);
-	ToRationalEuclideanGCD(value, FALSE, TRUE, &ullNum2, &ullDenom2);
-	/*-- Double-Check, that returned values fit into ULONG :*/
-	if (ullNum > 0xFFFFFFFFUL || ullDenom > 0xFFFFFFFFUL || ullNum2 > 0xFFFFFFFFUL || ullDenom2 > 0xFFFFFFFFUL) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-		TIFFErrorExt(0, "TIFFLib: DoubleToRational()", " Num or Denom exceeds ULONG: val=%14.6f, num=%I64u, denom=%I64u | num2=%I64u, denom2=%I64u", value, ullNum, ullDenom, ullNum2, ullDenom2);
-#else
-		TIFFErrorExt(0, "TIFFLib: DoubleToRational()", " Num or Denom exceeds ULONG: val=%14.6f, num=%12llu, denom=%12llu | num2=%12llu, denom2=%12llu", value, ullNum, ullDenom, ullNum2, ullDenom2);
-#endif
-		assert(0);
-	}
-
-	/* Check, which one has higher accuracy and take that. */
-	dblDiff = fabs(value - ((double)ullNum / (double)ullDenom));
-	dblDiff2 = fabs(value - ((double)ullNum2 / (double)ullDenom2));
-	if (dblDiff < dblDiff2) {
-		*num = (uint32)ullNum;
-		*denom = (uint32)ullDenom;
-	}
-	else {
-		*num = (uint32)ullNum2;
-		*denom = (uint32)ullDenom2;
-	}
-}  /*-- DoubleToRational() -------------- */
+static void DoubleToRational(double value, uint32_t *num, uint32_t *denom)
+{
+    /*---- UN-SIGNED RATIONAL ---- */
+    double dblDiff, dblDiff2;
+    uint64_t ullNum, ullDenom, ullNum2, ullDenom2;
+
+    /*-- Check for negative values. If so it is an error. */
+    /* Test written that way to catch NaN */
+    if (!(value >= 0))
+    {
+        *num = *denom = 0;
+        TIFFErrorExt(0, "TIFFLib: DoubleToRational()",
+                     " Negative Value for Unsigned Rational given.");
+        return;
+    }
+
+    /*-- Check for too big numbers (> ULONG_MAX) -- */
+    if (value > 0xFFFFFFFFUL)
+    {
+        *num = 0xFFFFFFFFU;
+        *denom = 0;
+        return;
+    }
+    /*-- Check for easy integer numbers -- */
+    if (value == (uint32_t)(value))
+    {
+        *num = (uint32_t)value;
+        *denom = 1;
+        return;
+    }
+    /*-- Check for too small numbers for "unsigned long" type rationals -- */
+    if (value < 1.0 / (double)0xFFFFFFFFUL)
+    {
+        *num = 0;
+        *denom = 0xFFFFFFFFU;
+        return;
+    }
+
+    /*-- There are two approaches using the Euclidean algorithm,
+     *   which can accidentally lead to different accuracies just depending on
+     * the value. Try both and define which one was better.
+     */
+    ToRationalEuclideanGCD(value, FALSE, FALSE, &ullNum, &ullDenom);
+    ToRationalEuclideanGCD(value, FALSE, TRUE, &ullNum2, &ullDenom2);
+    /*-- Double-Check, that returned values fit into ULONG :*/
+    if (ullNum > 0xFFFFFFFFUL || ullDenom > 0xFFFFFFFFUL ||
+        ullNum2 > 0xFFFFFFFFUL || ullDenom2 > 0xFFFFFFFFUL)
+    {
+        TIFFErrorExt(0, "TIFFLib: DoubleToRational()",
+                     " Num or Denom exceeds ULONG: val=%14.6f, num=%12" PRIu64
+                     ", denom=%12" PRIu64 " | num2=%12" PRIu64
+                     ", denom2=%12" PRIu64 "",
+                     value, ullNum, ullDenom, ullNum2, ullDenom2);
+        assert(0);
+    }
+
+    /* Check, which one has higher accuracy and take that. */
+    dblDiff = fabs(value - ((double)ullNum / (double)ullDenom));
+    dblDiff2 = fabs(value - ((double)ullNum2 / (double)ullDenom2));
+    if (dblDiff < dblDiff2)
+    {
+        *num = (uint32_t)ullNum;
+        *denom = (uint32_t)ullDenom;
+    }
+    else
+    {
+        *num = (uint32_t)ullNum2;
+        *denom = (uint32_t)ullDenom2;
+    }
+} /*-- DoubleToRational() -------------- */
 
 /**---- DoubleToSrational() -----------------------------------------------
 * Calculates the rational fractional of a double input value
 * for SIGNED rationals,
 * using the Euclidean algorithm to find the greatest common divisor (GCD)
 ------------------------------------------------------------------------*/
-static
-void DoubleToSrational(double value, int32 *num, int32 *denom)
-{
-	/*---- SIGNED RATIONAL ----*/
-	int neg = 1;
-	double dblDiff, dblDiff2;
-	unsigned long long ullNum, ullDenom, ullNum2, ullDenom2;
-
-	/*-- Check for negative values and use then the positive one for internal calculations, but take the sign into account before returning. */
-	if (value < 0) { neg = -1; value = -value; }
-
-	/*-- Check for too big numbers (> LONG_MAX) -- */
-	if (value > 0x7FFFFFFFL) {
-		*num = 0x7FFFFFFFL;
-		*denom = 0;
-		return;
-	}
-	/*-- Check for easy numbers -- */
-	if (value == (int32)(value)) {
-		*num = (int32)(neg * value);
-		*denom = 1;
-		return;
-	}
-	/*-- Check for too small numbers for "long" type rationals -- */
-	if (value < 1.0 / (double)0x7FFFFFFFL) {
-		*num = 0;
-		*denom = 0x7FFFFFFFL;
-		return;
-	}
-
-	/*-- There are two approaches using the Euclidean algorithm,
-	*   which can accidentally lead to different accuracies just depending on the value.
-	*   Try both and define which one was better.
-	*   Furthermore, set behavior of ToRationalEuclideanGCD() to the range of signed-long.
-	*/
-	ToRationalEuclideanGCD(value, TRUE, FALSE, &ullNum, &ullDenom);
-	ToRationalEuclideanGCD(value, TRUE, TRUE, &ullNum2, &ullDenom2);
-	/*-- Double-Check, that returned values fit into LONG :*/
-	if (ullNum > 0x7FFFFFFFL || ullDenom > 0x7FFFFFFFL || ullNum2 > 0x7FFFFFFFL || ullDenom2 > 0x7FFFFFFFL) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-		TIFFErrorExt(0, "TIFFLib: DoubleToSrational()", " Num or Denom exceeds LONG: val=%14.6f, num=%I64u, denom=%I64u | num2=%I64u, denom2=%I64u", neg*value, ullNum, ullDenom, ullNum2, ullDenom2);
-#else
-		TIFFErrorExt(0, "TIFFLib: DoubleToSrational()", " Num or Denom exceeds LONG: val=%14.6f, num=%12llu, denom=%12llu | num2=%12llu, denom2=%12llu", neg*value, ullNum, ullDenom, ullNum2, ullDenom2);
-#endif
-		assert(0);
-	}
-
-	/* Check, which one has higher accuracy and take that. */
-	dblDiff = fabs(value - ((double)ullNum / (double)ullDenom));
-	dblDiff2 = fabs(value - ((double)ullNum2 / (double)ullDenom2));
-	if (dblDiff < dblDiff2) {
-		*num = (int32)(neg * (long)ullNum);
-		*denom = (int32)ullDenom;
-	}
-	else {
-		*num = (int32)(neg * (long)ullNum2);
-		*denom = (int32)ullDenom2;
-	}
-}  /*-- DoubleToSrational() --------------*/
-
+static void DoubleToSrational(double value, int32_t *num, int32_t *denom)
+{
+    /*---- SIGNED RATIONAL ----*/
+    int neg = 1;
+    double dblDiff, dblDiff2;
+    uint64_t ullNum, ullDenom, ullNum2, ullDenom2;
 
+    /*-- Check for negative values and use then the positive one for internal
+     * calculations, but take the sign into account before returning. */
+    if (value < 0)
+    {
+        neg = -1;
+        value = -value;
+    }
 
+    /*-- Check for too big numbers (> LONG_MAX) -- */
+    if (value > 0x7FFFFFFFL)
+    {
+        *num = 0x7FFFFFFFL;
+        *denom = 0;
+        return;
+    }
+    /*-- Check for easy numbers -- */
+    if (value == (int32_t)(value))
+    {
+        *num = (int32_t)(neg * value);
+        *denom = 1;
+        return;
+    }
+    /*-- Check for too small numbers for "long" type rationals -- */
+    if (value < 1.0 / (double)0x7FFFFFFFL)
+    {
+        *num = 0;
+        *denom = 0x7FFFFFFFL;
+        return;
+    }
 
+    /*-- There are two approaches using the Euclidean algorithm,
+     *   which can accidentally lead to different accuracies just depending on
+     * the value. Try both and define which one was better. Furthermore, set
+     * behavior of ToRationalEuclideanGCD() to the range of signed-long.
+     */
+    ToRationalEuclideanGCD(value, TRUE, FALSE, &ullNum, &ullDenom);
+    ToRationalEuclideanGCD(value, TRUE, TRUE, &ullNum2, &ullDenom2);
+    /*-- Double-Check, that returned values fit into LONG :*/
+    if (ullNum > 0x7FFFFFFFL || ullDenom > 0x7FFFFFFFL ||
+        ullNum2 > 0x7FFFFFFFL || ullDenom2 > 0x7FFFFFFFL)
+    {
+        TIFFErrorExt(0, "TIFFLib: DoubleToSrational()",
+                     " Num or Denom exceeds LONG: val=%14.6f, num=%12" PRIu64
+                     ", denom=%12" PRIu64 " | num2=%12" PRIu64
+                     ", denom2=%12" PRIu64 "",
+                     neg * value, ullNum, ullDenom, ullNum2, ullDenom2);
+        assert(0);
+    }
 
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagCheckedFloat(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, float value)
-{
-	float m;
-	assert(sizeof(float)==4);
-	m=value;
-	TIFFCvtNativeToIEEEFloat(tif,1,&m);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabFloat(&m);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_FLOAT,1,4,&m));
+    /* Check, which one has higher accuracy and take that. */
+    dblDiff = fabs(value - ((double)ullNum / (double)ullDenom));
+    dblDiff2 = fabs(value - ((double)ullNum2 / (double)ullDenom2));
+    if (dblDiff < dblDiff2)
+    {
+        *num = (int32_t)(neg * (long)ullNum);
+        *denom = (int32_t)ullDenom;
+    }
+    else
+    {
+        *num = (int32_t)(neg * (long)ullNum2);
+        *denom = (int32_t)ullDenom2;
+    }
+} /*-- DoubleToSrational() --------------*/
+
+static int TIFFWriteDirectoryTagCheckedFloatArray(TIFF *tif, uint32_t *ndir,
+                                                  TIFFDirEntry *dir,
+                                                  uint16_t tag, uint32_t count,
+                                                  float *value)
+{
+    assert(count < 0x40000000);
+    assert(sizeof(float) == 4);
+    TIFFCvtNativeToIEEEFloat(tif, count, &value);
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfFloat(value, count);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_FLOAT, count,
+                                      count * 4, value));
+}
+
+static int TIFFWriteDirectoryTagCheckedDoubleArray(TIFF *tif, uint32_t *ndir,
+                                                   TIFFDirEntry *dir,
+                                                   uint16_t tag, uint32_t count,
+                                                   double *value)
+{
+    assert(count < 0x20000000);
+    assert(sizeof(double) == 8);
+    TIFFCvtNativeToIEEEDouble(tif, count, &value);
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfDouble(value, count);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_DOUBLE, count,
+                                      count * 8, value));
+}
+
+static int TIFFWriteDirectoryTagCheckedIfdArray(TIFF *tif, uint32_t *ndir,
+                                                TIFFDirEntry *dir, uint16_t tag,
+                                                uint32_t count, uint32_t *value)
+{
+    assert(count < 0x40000000);
+    assert(sizeof(uint32_t) == 4);
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong(value, count);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_IFD, count,
+                                      count * 4, value));
+}
+
+static int TIFFWriteDirectoryTagCheckedIfd8Array(TIFF *tif, uint32_t *ndir,
+                                                 TIFFDirEntry *dir,
+                                                 uint16_t tag, uint32_t count,
+                                                 uint64_t *value)
+{
+    assert(count < 0x20000000);
+    assert(sizeof(uint64_t) == 8);
+    assert(tif->tif_flags & TIFF_BIGTIFF);
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfLong8(value, count);
+    return (TIFFWriteDirectoryTagData(tif, ndir, dir, tag, TIFF_IFD8, count,
+                                      count * 8, value));
+}
+
+static int TIFFWriteDirectoryTagData(TIFF *tif, uint32_t *ndir,
+                                     TIFFDirEntry *dir, uint16_t tag,
+                                     uint16_t datatype, uint32_t count,
+                                     uint32_t datalength, void *data)
+{
+    static const char module[] = "TIFFWriteDirectoryTagData";
+    uint32_t m;
+    m = 0;
+    while (m < (*ndir))
+    {
+        assert(dir[m].tdir_tag != tag);
+        if (dir[m].tdir_tag > tag)
+            break;
+        m++;
+    }
+    if (m < (*ndir))
+    {
+        uint32_t n;
+        for (n = *ndir; n > m; n--)
+            dir[n] = dir[n - 1];
+    }
+    dir[m].tdir_tag = tag;
+    dir[m].tdir_type = datatype;
+    dir[m].tdir_count = count;
+    dir[m].tdir_offset.toff_long8 = 0;
+    if (datalength <= ((tif->tif_flags & TIFF_BIGTIFF) ? 0x8U : 0x4U))
+    {
+        if (data && datalength)
+        {
+            _TIFFmemcpy(&dir[m].tdir_offset, data, datalength);
+        }
+    }
+    else
+    {
+        uint64_t na, nb;
+        na = tif->tif_dataoff;
+        nb = na + datalength;
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+            nb = (uint32_t)nb;
+        if ((nb < na) || (nb < datalength))
+        {
+            TIFFErrorExtR(tif, module, "Maximum TIFF file size exceeded");
+            return (0);
+        }
+        if (!SeekOK(tif, na))
+        {
+            TIFFErrorExtR(tif, module, "IO error writing tag data");
+            return (0);
+        }
+        if (datalength >= 0x80000000UL)
+        {
+            TIFFErrorExtR(tif, module,
+                          "libtiff does not allow writing more than 2147483647 "
+                          "bytes in a tag");
+            return (0);
+        }
+        if (!WriteOK(tif, data, (tmsize_t)datalength))
+        {
+            TIFFErrorExtR(tif, module, "IO error writing tag data");
+            return (0);
+        }
+        tif->tif_dataoff = nb;
+        if (tif->tif_dataoff & 1)
+            tif->tif_dataoff++;
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+        {
+            uint32_t o;
+            o = (uint32_t)na;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong(&o);
+            _TIFFmemcpy(&dir[m].tdir_offset, &o, 4);
+        }
+        else
+        {
+            dir[m].tdir_offset.toff_long8 = na;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8(&dir[m].tdir_offset.toff_long8);
+        }
+    }
+    (*ndir)++;
+    return (1);
 }
-#endif
 
-static int
-TIFFWriteDirectoryTagCheckedFloatArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, float* value)
+/*
+ * Link the current directory into the directory chain for the file.
+ */
+static int TIFFLinkDirectory(TIFF *tif)
 {
-	assert(count<0x40000000);
-	assert(sizeof(float)==4);
-	TIFFCvtNativeToIEEEFloat(tif,count,&value);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfFloat(value,count);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_FLOAT,count,count*4,value));
-}
+    static const char module[] = "TIFFLinkDirectory";
 
-#ifdef notdef
-static int
-TIFFWriteDirectoryTagCheckedDouble(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, double value)
-{
-	double m;
-	assert(sizeof(double)==8);
-	m=value;
-	TIFFCvtNativeToIEEEDouble(tif,1,&m);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabDouble(&m);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_DOUBLE,1,8,&m));
-}
-#endif
+    tif->tif_diroff = (TIFFSeekFile(tif, 0, SEEK_END) + 1) & (~((toff_t)1));
 
-static int
-TIFFWriteDirectoryTagCheckedDoubleArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, double* value)
-{
-	assert(count<0x20000000);
-	assert(sizeof(double)==8);
-	TIFFCvtNativeToIEEEDouble(tif,count,&value);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfDouble(value,count);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_DOUBLE,count,count*8,value));
-}
+    /*
+     * Handle SubIFDs
+     */
+    if (tif->tif_flags & TIFF_INSUBIFD)
+    {
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+        {
+            uint32_t m;
+            m = (uint32_t)tif->tif_diroff;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong(&m);
+            (void)TIFFSeekFile(tif, tif->tif_subifdoff, SEEK_SET);
+            if (!WriteOK(tif, &m, 4))
+            {
+                TIFFErrorExtR(tif, module,
+                              "Error writing SubIFD directory link");
+                return (0);
+            }
+            /*
+             * Advance to the next SubIFD or, if this is
+             * the last one configured, revert back to the
+             * normal directory linkage.
+             */
+            if (--tif->tif_nsubifd)
+                tif->tif_subifdoff += 4;
+            else
+                tif->tif_flags &= ~TIFF_INSUBIFD;
+            return (1);
+        }
+        else
+        {
+            uint64_t m;
+            m = tif->tif_diroff;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8(&m);
+            (void)TIFFSeekFile(tif, tif->tif_subifdoff, SEEK_SET);
+            if (!WriteOK(tif, &m, 8))
+            {
+                TIFFErrorExtR(tif, module,
+                              "Error writing SubIFD directory link");
+                return (0);
+            }
+            /*
+             * Advance to the next SubIFD or, if this is
+             * the last one configured, revert back to the
+             * normal directory linkage.
+             */
+            if (--tif->tif_nsubifd)
+                tif->tif_subifdoff += 8;
+            else
+                tif->tif_flags &= ~TIFF_INSUBIFD;
+            return (1);
+        }
+    }
 
-static int
-TIFFWriteDirectoryTagCheckedIfdArray(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint32* value)
-{
-	assert(count<0x40000000);
-	assert(sizeof(uint32)==4);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfLong(value,count);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_IFD,count,count*4,value));
-}
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+    {
+        uint32_t m;
+        uint32_t nextdir;
+        m = (uint32_t)(tif->tif_diroff);
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong(&m);
+        if (tif->tif_header.classic.tiff_diroff == 0)
+        {
+            /*
+             * First directory, overwrite offset in header.
+             */
+            tif->tif_header.classic.tiff_diroff = (uint32_t)tif->tif_diroff;
+            tif->tif_lastdiroff = tif->tif_diroff;
+            (void)TIFFSeekFile(tif, 4, SEEK_SET);
+            if (!WriteOK(tif, &m, 4))
+            {
+                TIFFErrorExtR(tif, tif->tif_name, "Error writing TIFF header");
+                return (0);
+            }
+            return (1);
+        }
+        /*
+         * Not the first directory, search to the last and append.
+         */
+        if (tif->tif_lastdiroff != 0)
+        {
+            nextdir = (uint32_t)tif->tif_lastdiroff;
+        }
+        else
+        {
+            nextdir = tif->tif_header.classic.tiff_diroff;
+        }
 
-static int
-TIFFWriteDirectoryTagCheckedIfd8Array(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint32 count, uint64* value)
-{
-	assert(count<0x20000000);
-	assert(sizeof(uint64)==8);
-	assert(tif->tif_flags&TIFF_BIGTIFF);
-	if (tif->tif_flags&TIFF_SWAB)
-		TIFFSwabArrayOfLong8(value,count);
-	return(TIFFWriteDirectoryTagData(tif,ndir,dir,tag,TIFF_IFD8,count,count*8,value));
-}
+        while (1)
+        {
+            uint16_t dircount;
+            uint32_t nextnextdir;
 
-static int
-TIFFWriteDirectoryTagData(TIFF* tif, uint32* ndir, TIFFDirEntry* dir, uint16 tag, uint16 datatype, uint32 count, uint32 datalength, void* data)
-{
-	static const char module[] = "TIFFWriteDirectoryTagData";
-	uint32 m;
-	m=0;
-	while (m<(*ndir))
-	{
-		assert(dir[m].tdir_tag!=tag);
-		if (dir[m].tdir_tag>tag)
-			break;
-		m++;
-	}
-	if (m<(*ndir))
-	{
-		uint32 n;
-		for (n=*ndir; n>m; n--)
-			dir[n]=dir[n-1];
-	}
-	dir[m].tdir_tag=tag;
-	dir[m].tdir_type=datatype;
-	dir[m].tdir_count=count;
-	dir[m].tdir_offset.toff_long8 = 0;
-	if (datalength<=((tif->tif_flags&TIFF_BIGTIFF)?0x8U:0x4U))
-        {
-            if( data && datalength )
-            {
-                _TIFFmemcpy(&dir[m].tdir_offset,data,datalength);
-            }
-        }
-	else
-	{
-		uint64 na,nb;
-		na=tif->tif_dataoff;
-		nb=na+datalength;
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-			nb=(uint32)nb;
-		if ((nb<na)||(nb<datalength))
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"Maximum TIFF file size exceeded");
-			return(0);
-		}
-		if (!SeekOK(tif,na))
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"IO error writing tag data");
-			return(0);
-		}
-		assert(datalength<0x80000000UL);
-		if (!WriteOK(tif,data,(tmsize_t)datalength))
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"IO error writing tag data");
-			return(0);
-		}
-		tif->tif_dataoff=nb;
-		if (tif->tif_dataoff&1)
-			tif->tif_dataoff++;
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-		{
-			uint32 o;
-			o=(uint32)na;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabLong(&o);
-			_TIFFmemcpy(&dir[m].tdir_offset,&o,4);
-		}
-		else
-		{
-			dir[m].tdir_offset.toff_long8 = na;
-			if (tif->tif_flags&TIFF_SWAB)
-				TIFFSwabLong8(&dir[m].tdir_offset.toff_long8);
-		}
-	}
-	(*ndir)++;
-	return(1);
-}
+            if (!SeekOK(tif, nextdir) || !ReadOK(tif, &dircount, 2))
+            {
+                TIFFErrorExtR(tif, module, "Error fetching directory count");
+                return (0);
+            }
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabShort(&dircount);
+            (void)TIFFSeekFile(tif, nextdir + 2 + dircount * 12, SEEK_SET);
+            if (!ReadOK(tif, &nextnextdir, 4))
+            {
+                TIFFErrorExtR(tif, module, "Error fetching directory link");
+                return (0);
+            }
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong(&nextnextdir);
+            if (nextnextdir == 0)
+            {
+                (void)TIFFSeekFile(tif, nextdir + 2 + dircount * 12, SEEK_SET);
+                if (!WriteOK(tif, &m, 4))
+                {
+                    TIFFErrorExtR(tif, module, "Error writing directory link");
+                    return (0);
+                }
+                tif->tif_lastdiroff = tif->tif_diroff;
+                break;
+            }
+            nextdir = nextnextdir;
+        }
+    }
+    else
+    {
+        uint64_t m;
+        uint64_t nextdir;
+        m = tif->tif_diroff;
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong8(&m);
+        if (tif->tif_header.big.tiff_diroff == 0)
+        {
+            /*
+             * First directory, overwrite offset in header.
+             */
+            tif->tif_header.big.tiff_diroff = tif->tif_diroff;
+            tif->tif_lastdiroff = tif->tif_diroff;
+            (void)TIFFSeekFile(tif, 8, SEEK_SET);
+            if (!WriteOK(tif, &m, 8))
+            {
+                TIFFErrorExtR(tif, tif->tif_name, "Error writing TIFF header");
+                return (0);
+            }
+            return (1);
+        }
+        /*
+         * Not the first directory, search to the last and append.
+         */
+        if (tif->tif_lastdiroff != 0)
+        {
+            nextdir = tif->tif_lastdiroff;
+        }
+        else
+        {
+            nextdir = tif->tif_header.big.tiff_diroff;
+        }
+        while (1)
+        {
+            uint64_t dircount64;
+            uint16_t dircount;
+            uint64_t nextnextdir;
 
-/*
- * Link the current directory into the directory chain for the file.
- */
-static int
-TIFFLinkDirectory(TIFF* tif)
-{
-	static const char module[] = "TIFFLinkDirectory";
-
-	tif->tif_diroff = (TIFFSeekFile(tif,0,SEEK_END)+1) & (~((toff_t)1));
-
-	/*
-	 * Handle SubIFDs
-	 */
-	if (tif->tif_flags & TIFF_INSUBIFD)
-	{
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-		{
-			uint32 m;
-			m = (uint32)tif->tif_diroff;
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabLong(&m);
-			(void) TIFFSeekFile(tif, tif->tif_subifdoff, SEEK_SET);
-			if (!WriteOK(tif, &m, 4)) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-				     "Error writing SubIFD directory link");
-				return (0);
-			}
-			/*
-			 * Advance to the next SubIFD or, if this is
-			 * the last one configured, revert back to the
-			 * normal directory linkage.
-			 */
-			if (--tif->tif_nsubifd)
-				tif->tif_subifdoff += 4;
-			else
-				tif->tif_flags &= ~TIFF_INSUBIFD;
-			return (1);
-		}
-		else
-		{
-			uint64 m;
-			m = tif->tif_diroff;
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabLong8(&m);
-			(void) TIFFSeekFile(tif, tif->tif_subifdoff, SEEK_SET);
-			if (!WriteOK(tif, &m, 8)) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-				     "Error writing SubIFD directory link");
-				return (0);
-			}
-			/*
-			 * Advance to the next SubIFD or, if this is
-			 * the last one configured, revert back to the
-			 * normal directory linkage.
-			 */
-			if (--tif->tif_nsubifd)
-				tif->tif_subifdoff += 8;
-			else
-				tif->tif_flags &= ~TIFF_INSUBIFD;
-			return (1);
-		}
-	}
-
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-	{
-		uint32 m;
-		uint32 nextdir;
-		m = (uint32)(tif->tif_diroff);
-		if (tif->tif_flags & TIFF_SWAB)
-			TIFFSwabLong(&m);
-		if (tif->tif_header.classic.tiff_diroff == 0) {
-			/*
-			 * First directory, overwrite offset in header.
-			 */
-			tif->tif_header.classic.tiff_diroff = (uint32) tif->tif_diroff;
-			(void) TIFFSeekFile(tif,4, SEEK_SET);
-			if (!WriteOK(tif, &m, 4)) {
-				TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-					     "Error writing TIFF header");
-				return (0);
-			}
-			return (1);
-		}
-		/*
-		 * Not the first directory, search to the last and append.
-		 */
-		nextdir = tif->tif_header.classic.tiff_diroff;
-		while(1) {
-			uint16 dircount;
-			uint32 nextnextdir;
-
-			if (!SeekOK(tif, nextdir) ||
-			    !ReadOK(tif, &dircount, 2)) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "Error fetching directory count");
-				return (0);
-			}
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabShort(&dircount);
-			(void) TIFFSeekFile(tif,
-			    nextdir+2+dircount*12, SEEK_SET);
-			if (!ReadOK(tif, &nextnextdir, 4)) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "Error fetching directory link");
-				return (0);
-			}
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabLong(&nextnextdir);
-			if (nextnextdir==0)
-			{
-				(void) TIFFSeekFile(tif,
-				    nextdir+2+dircount*12, SEEK_SET);
-				if (!WriteOK(tif, &m, 4)) {
-					TIFFErrorExt(tif->tif_clientdata, module,
-					     "Error writing directory link");
-					return (0);
-				}
-				break;
-			}
-			nextdir=nextnextdir;
-		}
-	}
-	else
-	{
-		uint64 m;
-		uint64 nextdir;
-		m = tif->tif_diroff;
-		if (tif->tif_flags & TIFF_SWAB)
-			TIFFSwabLong8(&m);
-		if (tif->tif_header.big.tiff_diroff == 0) {
-			/*
-			 * First directory, overwrite offset in header.
-			 */
-			tif->tif_header.big.tiff_diroff = tif->tif_diroff;
-			(void) TIFFSeekFile(tif,8, SEEK_SET);
-			if (!WriteOK(tif, &m, 8)) {
-				TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-					     "Error writing TIFF header");
-				return (0);
-			}
-			return (1);
-		}
-		/*
-		 * Not the first directory, search to the last and append.
-		 */
-		nextdir = tif->tif_header.big.tiff_diroff;
-		while(1) {
-			uint64 dircount64;
-			uint16 dircount;
-			uint64 nextnextdir;
-
-			if (!SeekOK(tif, nextdir) ||
-			    !ReadOK(tif, &dircount64, 8)) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "Error fetching directory count");
-				return (0);
-			}
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabLong8(&dircount64);
-			if (dircount64>0xFFFF)
-			{
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "Sanity check on tag count failed, likely corrupt TIFF");
-				return (0);
-			}
-			dircount=(uint16)dircount64;
-			(void) TIFFSeekFile(tif,
-			    nextdir+8+dircount*20, SEEK_SET);
-			if (!ReadOK(tif, &nextnextdir, 8)) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "Error fetching directory link");
-				return (0);
-			}
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabLong8(&nextnextdir);
-			if (nextnextdir==0)
-			{
-				(void) TIFFSeekFile(tif,
-				    nextdir+8+dircount*20, SEEK_SET);
-				if (!WriteOK(tif, &m, 8)) {
-					TIFFErrorExt(tif->tif_clientdata, module,
-					     "Error writing directory link");
-					return (0);
-				}
-				break;
-			}
-			nextdir=nextnextdir;
-		}
-	}
-	return (1);
+            if (!SeekOK(tif, nextdir) || !ReadOK(tif, &dircount64, 8))
+            {
+                TIFFErrorExtR(tif, module, "Error fetching directory count");
+                return (0);
+            }
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8(&dircount64);
+            if (dircount64 > 0xFFFF)
+            {
+                TIFFErrorExtR(
+                    tif, module,
+                    "Sanity check on tag count failed, likely corrupt TIFF");
+                return (0);
+            }
+            dircount = (uint16_t)dircount64;
+            (void)TIFFSeekFile(tif, nextdir + 8 + dircount * 20, SEEK_SET);
+            if (!ReadOK(tif, &nextnextdir, 8))
+            {
+                TIFFErrorExtR(tif, module, "Error fetching directory link");
+                return (0);
+            }
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabLong8(&nextnextdir);
+            if (nextnextdir == 0)
+            {
+                (void)TIFFSeekFile(tif, nextdir + 8 + dircount * 20, SEEK_SET);
+                if (!WriteOK(tif, &m, 8))
+                {
+                    TIFFErrorExtR(tif, module, "Error writing directory link");
+                    return (0);
+                }
+                tif->tif_lastdiroff = tif->tif_diroff;
+                break;
+            }
+            nextdir = nextnextdir;
+        }
+    }
+    return (1);
 }
 
 /************************************************************************/
@@ -3301,183 +3170,186 @@ TIFFLinkDirectory(TIFF* tif)
 /*      Returns zero on failure, and one on success.                    */
 /************************************************************************/
 
-int
-_TIFFRewriteField(TIFF* tif, uint16 tag, TIFFDataType in_datatype, 
-                  tmsize_t count, void* data)
+int _TIFFRewriteField(TIFF *tif, uint16_t tag, TIFFDataType in_datatype,
+                      tmsize_t count, void *data)
 {
     static const char module[] = "TIFFResetField";
     /* const TIFFField* fip = NULL; */
-    uint16 dircount;
+    uint16_t dircount;
     tmsize_t dirsize;
-    uint8 direntry_raw[20];
-    uint16 entry_tag = 0;
-    uint16 entry_type = 0;
-    uint64 entry_count = 0;
-    uint64 entry_offset = 0;
-    int    value_in_entry = 0;
-    uint64 read_offset;
-    uint8 *buf_to_write = NULL;
+    uint8_t direntry_raw[20];
+    uint16_t entry_tag = 0;
+    uint16_t entry_type = 0;
+    uint64_t entry_count = 0;
+    uint64_t entry_offset = 0;
+    int value_in_entry = 0;
+    uint64_t read_offset;
+    uint8_t *buf_to_write = NULL;
     TIFFDataType datatype;
 
-/* -------------------------------------------------------------------- */
-/*      Find field definition.                                          */
-/* -------------------------------------------------------------------- */
-    /*fip =*/ TIFFFindField(tif, tag, TIFF_ANY);
+    /* -------------------------------------------------------------------- */
+    /*      Find field definition.                                          */
+    /* -------------------------------------------------------------------- */
+    /*fip =*/TIFFFindField(tif, tag, TIFF_ANY);
 
-/* -------------------------------------------------------------------- */
-/*      Do some checking this is a straight forward case.               */
-/* -------------------------------------------------------------------- */
-    if( isMapped(tif) )
+    /* -------------------------------------------------------------------- */
+    /*      Do some checking this is a straight forward case.               */
+    /* -------------------------------------------------------------------- */
+    if (isMapped(tif))
     {
-        TIFFErrorExt( tif->tif_clientdata, module, 
-                      "Memory mapped files not currently supported for this operation." );
+        TIFFErrorExtR(
+            tif, module,
+            "Memory mapped files not currently supported for this operation.");
         return 0;
     }
 
-    if( tif->tif_diroff == 0 )
+    if (tif->tif_diroff == 0)
     {
-        TIFFErrorExt( tif->tif_clientdata, module, 
-                      "Attempt to reset field on directory not already on disk." );
+        TIFFErrorExtR(
+            tif, module,
+            "Attempt to reset field on directory not already on disk.");
         return 0;
     }
 
-/* -------------------------------------------------------------------- */
-/*      Read the directory entry count.                                 */
-/* -------------------------------------------------------------------- */
-    if (!SeekOK(tif, tif->tif_diroff)) {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                     "%s: Seek error accessing TIFF directory",
-                     tif->tif_name);
+    /* -------------------------------------------------------------------- */
+    /*      Read the directory entry count.                                 */
+    /* -------------------------------------------------------------------- */
+    if (!SeekOK(tif, tif->tif_diroff))
+    {
+        TIFFErrorExtR(tif, module, "%s: Seek error accessing TIFF directory",
+                      tif->tif_name);
         return 0;
     }
 
     read_offset = tif->tif_diroff;
 
-    if (!(tif->tif_flags&TIFF_BIGTIFF))
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
     {
-        if (!ReadOK(tif, &dircount, sizeof (uint16))) {
-            TIFFErrorExt(tif->tif_clientdata, module,
-                         "%s: Can not read TIFF directory count",
-                         tif->tif_name);
+        if (!ReadOK(tif, &dircount, sizeof(uint16_t)))
+        {
+            TIFFErrorExtR(tif, module, "%s: Can not read TIFF directory count",
+                          tif->tif_name);
             return 0;
         }
         if (tif->tif_flags & TIFF_SWAB)
             TIFFSwabShort(&dircount);
         dirsize = 12;
         read_offset += 2;
-    } else {
-        uint64 dircount64;
-        if (!ReadOK(tif, &dircount64, sizeof (uint64))) {
-            TIFFErrorExt(tif->tif_clientdata, module,
-                         "%s: Can not read TIFF directory count",
-                         tif->tif_name);
+    }
+    else
+    {
+        uint64_t dircount64;
+        if (!ReadOK(tif, &dircount64, sizeof(uint64_t)))
+        {
+            TIFFErrorExtR(tif, module, "%s: Can not read TIFF directory count",
+                          tif->tif_name);
             return 0;
         }
         if (tif->tif_flags & TIFF_SWAB)
             TIFFSwabLong8(&dircount64);
-        dircount = (uint16)dircount64;
+        dircount = (uint16_t)dircount64;
         dirsize = 20;
         read_offset += 8;
     }
 
-/* -------------------------------------------------------------------- */
-/*      Read through directory to find target tag.                      */
-/* -------------------------------------------------------------------- */
-    while( dircount > 0 )
+    /* -------------------------------------------------------------------- */
+    /*      Read through directory to find target tag.                      */
+    /* -------------------------------------------------------------------- */
+    while (dircount > 0)
     {
-        if (!ReadOK(tif, direntry_raw, dirsize)) {
-            TIFFErrorExt(tif->tif_clientdata, module,
-                         "%s: Can not read TIFF directory entry.",
-                         tif->tif_name);
+        if (!ReadOK(tif, direntry_raw, dirsize))
+        {
+            TIFFErrorExtR(tif, module, "%s: Can not read TIFF directory entry.",
+                          tif->tif_name);
             return 0;
         }
 
-        memcpy( &entry_tag, direntry_raw + 0, sizeof(uint16) );
-        if (tif->tif_flags&TIFF_SWAB)
-            TIFFSwabShort( &entry_tag );
+        memcpy(&entry_tag, direntry_raw + 0, sizeof(uint16_t));
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabShort(&entry_tag);
 
-        if( entry_tag == tag )
+        if (entry_tag == tag)
             break;
 
         read_offset += dirsize;
     }
 
-    if( entry_tag != tag )
+    if (entry_tag != tag)
     {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                     "%s: Could not find tag %d.",
-                     tif->tif_name, tag );
+        TIFFErrorExtR(tif, module, "%s: Could not find tag %" PRIu16 ".",
+                      tif->tif_name, tag);
         return 0;
     }
 
-/* -------------------------------------------------------------------- */
-/*      Extract the type, count and offset for this entry.              */
-/* -------------------------------------------------------------------- */
-    memcpy( &entry_type, direntry_raw + 2, sizeof(uint16) );
-    if (tif->tif_flags&TIFF_SWAB)
-        TIFFSwabShort( &entry_type );
+    /* -------------------------------------------------------------------- */
+    /*      Extract the type, count and offset for this entry.              */
+    /* -------------------------------------------------------------------- */
+    memcpy(&entry_type, direntry_raw + 2, sizeof(uint16_t));
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabShort(&entry_type);
 
-    if (!(tif->tif_flags&TIFF_BIGTIFF))
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
     {
-        uint32 value;
-        
-        memcpy( &value, direntry_raw + 4, sizeof(uint32) );
-        if (tif->tif_flags&TIFF_SWAB)
-            TIFFSwabLong( &value );
+        uint32_t value;
+
+        memcpy(&value, direntry_raw + 4, sizeof(uint32_t));
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong(&value);
         entry_count = value;
 
-        memcpy( &value, direntry_raw + 8, sizeof(uint32) );
-        if (tif->tif_flags&TIFF_SWAB)
-            TIFFSwabLong( &value );
+        memcpy(&value, direntry_raw + 8, sizeof(uint32_t));
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong(&value);
         entry_offset = value;
     }
     else
     {
-        memcpy( &entry_count, direntry_raw + 4, sizeof(uint64) );
-        if (tif->tif_flags&TIFF_SWAB)
-            TIFFSwabLong8( &entry_count );
+        memcpy(&entry_count, direntry_raw + 4, sizeof(uint64_t));
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong8(&entry_count);
 
-        memcpy( &entry_offset, direntry_raw + 12, sizeof(uint64) );
-        if (tif->tif_flags&TIFF_SWAB)
-            TIFFSwabLong8( &entry_offset );
+        memcpy(&entry_offset, direntry_raw + 12, sizeof(uint64_t));
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong8(&entry_offset);
     }
 
-/* -------------------------------------------------------------------- */
-/*      When a dummy tag was written due to TIFFDeferStrileArrayWriting() */
-/* -------------------------------------------------------------------- */
-    if( entry_offset == 0 && entry_count == 0 && entry_type == 0 )
+    /* -------------------------------------------------------------------- */
+    /*      When a dummy tag was written due to TIFFDeferStrileArrayWriting() */
+    /* -------------------------------------------------------------------- */
+    if (entry_offset == 0 && entry_count == 0 && entry_type == 0)
     {
-        if( tag == TIFFTAG_TILEOFFSETS || tag == TIFFTAG_STRIPOFFSETS )
+        if (tag == TIFFTAG_TILEOFFSETS || tag == TIFFTAG_STRIPOFFSETS)
         {
-            entry_type = (tif->tif_flags&TIFF_BIGTIFF) ? TIFF_LONG8 : TIFF_LONG; 
+            entry_type =
+                (tif->tif_flags & TIFF_BIGTIFF) ? TIFF_LONG8 : TIFF_LONG;
         }
         else
         {
             int write_aslong8 = 1;
-            if( count > 1 && tag == TIFFTAG_STRIPBYTECOUNTS )
+            if (count > 1 && tag == TIFFTAG_STRIPBYTECOUNTS)
             {
                 write_aslong8 = WriteAsLong8(tif, TIFFStripSize64(tif));
             }
-            else if( count > 1 && tag == TIFFTAG_TILEBYTECOUNTS )
+            else if (count > 1 && tag == TIFFTAG_TILEBYTECOUNTS)
             {
                 write_aslong8 = WriteAsLong8(tif, TIFFTileSize64(tif));
             }
-            if( write_aslong8 )
+            if (write_aslong8)
             {
                 entry_type = TIFF_LONG8;
             }
             else
             {
                 int write_aslong4 = 1;
-                if( count > 1 && tag == TIFFTAG_STRIPBYTECOUNTS )
+                if (count > 1 && tag == TIFFTAG_STRIPBYTECOUNTS)
                 {
                     write_aslong4 = WriteAsLong4(tif, TIFFStripSize64(tif));
                 }
-                else if( count > 1 && tag == TIFFTAG_TILEBYTECOUNTS )
+                else if (count > 1 && tag == TIFFTAG_TILEBYTECOUNTS)
                 {
                     write_aslong4 = WriteAsLong4(tif, TIFFTileSize64(tif));
                 }
-                if( write_aslong4 )
+                if (write_aslong4)
                 {
                     entry_type = TIFF_LONG;
                 }
@@ -3489,123 +3361,120 @@ _TIFFRewriteField(TIFF* tif, uint16 tag, TIFFDataType in_datatype,
         }
     }
 
-/* -------------------------------------------------------------------- */
-/*      What data type do we want to write this as?                     */
-/* -------------------------------------------------------------------- */
-    if( TIFFDataWidth(in_datatype) == 8 && !(tif->tif_flags&TIFF_BIGTIFF) )
+    /* -------------------------------------------------------------------- */
+    /*      What data type do we want to write this as?                     */
+    /* -------------------------------------------------------------------- */
+    if (TIFFDataWidth(in_datatype) == 8 && !(tif->tif_flags & TIFF_BIGTIFF))
     {
-        if( in_datatype == TIFF_LONG8 )
+        if (in_datatype == TIFF_LONG8)
             datatype = entry_type == TIFF_SHORT ? TIFF_SHORT : TIFF_LONG;
-        else if( in_datatype == TIFF_SLONG8 )
+        else if (in_datatype == TIFF_SLONG8)
             datatype = TIFF_SLONG;
-        else if( in_datatype == TIFF_IFD8 )
+        else if (in_datatype == TIFF_IFD8)
             datatype = TIFF_IFD;
         else
             datatype = in_datatype;
     }
     else
     {
-        if( in_datatype == TIFF_LONG8 &&
+        if (in_datatype == TIFF_LONG8 &&
             (entry_type == TIFF_SHORT || entry_type == TIFF_LONG ||
-             entry_type == TIFF_LONG8 ) )
+             entry_type == TIFF_LONG8))
             datatype = entry_type;
-        else if( in_datatype == TIFF_SLONG8 &&
-            (entry_type == TIFF_SLONG || entry_type == TIFF_SLONG8 ) )
+        else if (in_datatype == TIFF_SLONG8 &&
+                 (entry_type == TIFF_SLONG || entry_type == TIFF_SLONG8))
             datatype = entry_type;
-        else if( in_datatype == TIFF_IFD8 &&
-            (entry_type == TIFF_IFD || entry_type == TIFF_IFD8 ) )
+        else if (in_datatype == TIFF_IFD8 &&
+                 (entry_type == TIFF_IFD || entry_type == TIFF_IFD8))
             datatype = entry_type;
         else
             datatype = in_datatype;
     }
 
-/* -------------------------------------------------------------------- */
-/*      Prepare buffer of actual data to write.  This includes          */
-/*      swabbing as needed.                                             */
-/* -------------------------------------------------------------------- */
-    buf_to_write =
-	    (uint8 *)_TIFFCheckMalloc(tif, count, TIFFDataWidth(datatype),
-				      "for field buffer.");
+    /* -------------------------------------------------------------------- */
+    /*      Prepare buffer of actual data to write.  This includes          */
+    /*      swabbing as needed.                                             */
+    /* -------------------------------------------------------------------- */
+    buf_to_write = (uint8_t *)_TIFFCheckMalloc(
+        tif, count, TIFFDataWidth(datatype), "for field buffer.");
     if (!buf_to_write)
         return 0;
 
-    if( datatype == in_datatype )
-        memcpy( buf_to_write, data, count * TIFFDataWidth(datatype) );
-    else if( datatype == TIFF_SLONG && in_datatype == TIFF_SLONG8 )
+    if (datatype == in_datatype)
+        memcpy(buf_to_write, data, count * TIFFDataWidth(datatype));
+    else if (datatype == TIFF_SLONG && in_datatype == TIFF_SLONG8)
     {
-	tmsize_t i;
+        tmsize_t i;
 
-        for( i = 0; i < count; i++ )
+        for (i = 0; i < count; i++)
         {
-            ((int32 *) buf_to_write)[i] = 
-                (int32) ((int64 *) data)[i];
-            if( (int64) ((int32 *) buf_to_write)[i] != ((int64 *) data)[i] )
+            ((int32_t *)buf_to_write)[i] = (int32_t)((int64_t *)data)[i];
+            if ((int64_t)((int32_t *)buf_to_write)[i] != ((int64_t *)data)[i])
             {
-                _TIFFfree( buf_to_write );
-                TIFFErrorExt( tif->tif_clientdata, module, 
-                              "Value exceeds 32bit range of output type." );
+                _TIFFfreeExt(tif, buf_to_write);
+                TIFFErrorExtR(tif, module,
+                              "Value exceeds 32bit range of output type.");
                 return 0;
             }
         }
     }
-    else if( (datatype == TIFF_LONG && in_datatype == TIFF_LONG8)
-             || (datatype == TIFF_IFD && in_datatype == TIFF_IFD8) )
+    else if ((datatype == TIFF_LONG && in_datatype == TIFF_LONG8) ||
+             (datatype == TIFF_IFD && in_datatype == TIFF_IFD8))
     {
-	tmsize_t i;
+        tmsize_t i;
 
-        for( i = 0; i < count; i++ )
+        for (i = 0; i < count; i++)
         {
-            ((uint32 *) buf_to_write)[i] = 
-                (uint32) ((uint64 *) data)[i];
-            if( (uint64) ((uint32 *) buf_to_write)[i] != ((uint64 *) data)[i] )
+            ((uint32_t *)buf_to_write)[i] = (uint32_t)((uint64_t *)data)[i];
+            if ((uint64_t)((uint32_t *)buf_to_write)[i] !=
+                ((uint64_t *)data)[i])
             {
-                _TIFFfree( buf_to_write );
-                TIFFErrorExt( tif->tif_clientdata, module, 
-                              "Value exceeds 32bit range of output type." );
+                _TIFFfreeExt(tif, buf_to_write);
+                TIFFErrorExtR(tif, module,
+                              "Value exceeds 32bit range of output type.");
                 return 0;
             }
         }
     }
-    else if( datatype == TIFF_SHORT && in_datatype == TIFF_LONG8 )
+    else if (datatype == TIFF_SHORT && in_datatype == TIFF_LONG8)
     {
-	tmsize_t i;
+        tmsize_t i;
 
-        for( i = 0; i < count; i++ )
+        for (i = 0; i < count; i++)
         {
-            ((uint16 *) buf_to_write)[i] =
-                (uint16) ((uint64 *) data)[i];
-            if( (uint64) ((uint16 *) buf_to_write)[i] != ((uint64 *) data)[i] )
+            ((uint16_t *)buf_to_write)[i] = (uint16_t)((uint64_t *)data)[i];
+            if ((uint64_t)((uint16_t *)buf_to_write)[i] !=
+                ((uint64_t *)data)[i])
             {
-                _TIFFfree( buf_to_write );
-                TIFFErrorExt( tif->tif_clientdata, module,
-                              "Value exceeds 16bit range of output type." );
+                _TIFFfreeExt(tif, buf_to_write);
+                TIFFErrorExtR(tif, module,
+                              "Value exceeds 16bit range of output type.");
                 return 0;
             }
         }
     }
     else
     {
-        TIFFErrorExt( tif->tif_clientdata, module,
-                      "Unhandled type conversion." );
+        TIFFErrorExtR(tif, module, "Unhandled type conversion.");
         return 0;
     }
 
-    if( TIFFDataWidth(datatype) > 1 && (tif->tif_flags&TIFF_SWAB) )
+    if (TIFFDataWidth(datatype) > 1 && (tif->tif_flags & TIFF_SWAB))
     {
-        if( TIFFDataWidth(datatype) == 2 )
-            TIFFSwabArrayOfShort( (uint16 *) buf_to_write, count );
-        else if( TIFFDataWidth(datatype) == 4 )
-            TIFFSwabArrayOfLong( (uint32 *) buf_to_write, count );
-        else if( TIFFDataWidth(datatype) == 8 )
-            TIFFSwabArrayOfLong8( (uint64 *) buf_to_write, count );
+        if (TIFFDataWidth(datatype) == 2)
+            TIFFSwabArrayOfShort((uint16_t *)buf_to_write, count);
+        else if (TIFFDataWidth(datatype) == 4)
+            TIFFSwabArrayOfLong((uint32_t *)buf_to_write, count);
+        else if (TIFFDataWidth(datatype) == 8)
+            TIFFSwabArrayOfLong8((uint64_t *)buf_to_write, count);
     }
 
-/* -------------------------------------------------------------------- */
-/*      Is this a value that fits into the directory entry?             */
-/* -------------------------------------------------------------------- */
-    if (!(tif->tif_flags&TIFF_BIGTIFF))
+    /* -------------------------------------------------------------------- */
+    /*      Is this a value that fits into the directory entry?             */
+    /* -------------------------------------------------------------------- */
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
     {
-        if( TIFFDataWidth(datatype) * count <= 4 )
+        if (TIFFDataWidth(datatype) * count <= 4)
         {
             entry_offset = read_offset + 8;
             value_in_entry = 1;
@@ -3613,136 +3482,139 @@ _TIFFRewriteField(TIFF* tif, uint16 tag, TIFFDataType in_datatype,
     }
     else
     {
-        if( TIFFDataWidth(datatype) * count <= 8 )
+        if (TIFFDataWidth(datatype) * count <= 8)
         {
             entry_offset = read_offset + 12;
             value_in_entry = 1;
         }
     }
 
-    if( (tag == TIFFTAG_TILEOFFSETS || tag == TIFFTAG_STRIPOFFSETS) &&
+    if ((tag == TIFFTAG_TILEOFFSETS || tag == TIFFTAG_STRIPOFFSETS) &&
         tif->tif_dir.td_stripoffset_entry.tdir_count == 0 &&
         tif->tif_dir.td_stripoffset_entry.tdir_type == 0 &&
-        tif->tif_dir.td_stripoffset_entry.tdir_offset.toff_long8 == 0 )
+        tif->tif_dir.td_stripoffset_entry.tdir_offset.toff_long8 == 0)
     {
         tif->tif_dir.td_stripoffset_entry.tdir_type = datatype;
         tif->tif_dir.td_stripoffset_entry.tdir_count = count;
     }
-    else if( (tag == TIFFTAG_TILEBYTECOUNTS || tag == TIFFTAG_STRIPBYTECOUNTS) &&
-        tif->tif_dir.td_stripbytecount_entry.tdir_count == 0 &&
-        tif->tif_dir.td_stripbytecount_entry.tdir_type == 0 &&
-        tif->tif_dir.td_stripbytecount_entry.tdir_offset.toff_long8 == 0 )
+    else if ((tag == TIFFTAG_TILEBYTECOUNTS ||
+              tag == TIFFTAG_STRIPBYTECOUNTS) &&
+             tif->tif_dir.td_stripbytecount_entry.tdir_count == 0 &&
+             tif->tif_dir.td_stripbytecount_entry.tdir_type == 0 &&
+             tif->tif_dir.td_stripbytecount_entry.tdir_offset.toff_long8 == 0)
     {
         tif->tif_dir.td_stripbytecount_entry.tdir_type = datatype;
         tif->tif_dir.td_stripbytecount_entry.tdir_count = count;
     }
 
-/* -------------------------------------------------------------------- */
-/*      If the tag type, and count match, then we just write it out     */
-/*      over the old values without altering the directory entry at     */
-/*      all.                                                            */
-/* -------------------------------------------------------------------- */
-    if( entry_count == (uint64)count && entry_type == (uint16) datatype )
+    /* -------------------------------------------------------------------- */
+    /*      If the tag type, and count match, then we just write it out     */
+    /*      over the old values without altering the directory entry at     */
+    /*      all.                                                            */
+    /* -------------------------------------------------------------------- */
+    if (entry_count == (uint64_t)count && entry_type == (uint16_t)datatype)
     {
-        if (!SeekOK(tif, entry_offset)) {
-            _TIFFfree( buf_to_write );
-            TIFFErrorExt(tif->tif_clientdata, module,
-                         "%s: Seek error accessing TIFF directory",
-                         tif->tif_name);
+        if (!SeekOK(tif, entry_offset))
+        {
+            _TIFFfreeExt(tif, buf_to_write);
+            TIFFErrorExtR(tif, module,
+                          "%s: Seek error accessing TIFF directory",
+                          tif->tif_name);
             return 0;
         }
-        if (!WriteOK(tif, buf_to_write, count*TIFFDataWidth(datatype))) {
-            _TIFFfree( buf_to_write );
-            TIFFErrorExt(tif->tif_clientdata, module,
-                         "Error writing directory link");
+        if (!WriteOK(tif, buf_to_write, count * TIFFDataWidth(datatype)))
+        {
+            _TIFFfreeExt(tif, buf_to_write);
+            TIFFErrorExtR(tif, module, "Error writing directory link");
             return (0);
         }
 
-        _TIFFfree( buf_to_write );
+        _TIFFfreeExt(tif, buf_to_write);
         return 1;
     }
 
-/* -------------------------------------------------------------------- */
-/*      Otherwise, we write the new tag data at the end of the file.    */
-/* -------------------------------------------------------------------- */
-    if( !value_in_entry )
+    /* -------------------------------------------------------------------- */
+    /*      Otherwise, we write the new tag data at the end of the file.    */
+    /* -------------------------------------------------------------------- */
+    if (!value_in_entry)
     {
-        entry_offset = TIFFSeekFile(tif,0,SEEK_END);
-        
-        if (!WriteOK(tif, buf_to_write, count*TIFFDataWidth(datatype))) {
-            _TIFFfree( buf_to_write );
-            TIFFErrorExt(tif->tif_clientdata, module,
-                         "Error writing directory link");
+        entry_offset = TIFFSeekFile(tif, 0, SEEK_END);
+
+        if (!WriteOK(tif, buf_to_write, count * TIFFDataWidth(datatype)))
+        {
+            _TIFFfreeExt(tif, buf_to_write);
+            TIFFErrorExtR(tif, module, "Error writing directory link");
             return (0);
         }
     }
     else
     {
-        memcpy( &entry_offset, buf_to_write, count*TIFFDataWidth(datatype));
+        if (count * TIFFDataWidth(datatype) == 4)
+        {
+            uint32_t value;
+            memcpy(&value, buf_to_write, count * TIFFDataWidth(datatype));
+            entry_offset = value;
+        }
+        else
+        {
+            memcpy(&entry_offset, buf_to_write,
+                   count * TIFFDataWidth(datatype));
+        }
     }
 
-    _TIFFfree( buf_to_write );
+    _TIFFfreeExt(tif, buf_to_write);
     buf_to_write = 0;
 
-/* -------------------------------------------------------------------- */
-/*      Adjust the directory entry.                                     */
-/* -------------------------------------------------------------------- */
+    /* -------------------------------------------------------------------- */
+    /*      Adjust the directory entry.                                     */
+    /* -------------------------------------------------------------------- */
     entry_type = datatype;
-    entry_count = (uint64)count;
-    memcpy( direntry_raw + 2, &entry_type, sizeof(uint16) );
-    if (tif->tif_flags&TIFF_SWAB)
-        TIFFSwabShort( (uint16 *) (direntry_raw + 2) );
+    entry_count = (uint64_t)count;
+    memcpy(direntry_raw + 2, &entry_type, sizeof(uint16_t));
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabShort((uint16_t *)(direntry_raw + 2));
 
-    if (!(tif->tif_flags&TIFF_BIGTIFF))
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
     {
-        uint32 value;
+        uint32_t value;
 
-        value = (uint32) entry_count;
-        memcpy( direntry_raw + 4, &value, sizeof(uint32) );
-        if (tif->tif_flags&TIFF_SWAB)
-            TIFFSwabLong( (uint32 *) (direntry_raw + 4) );
+        value = (uint32_t)entry_count;
+        memcpy(direntry_raw + 4, &value, sizeof(uint32_t));
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong((uint32_t *)(direntry_raw + 4));
 
-        value = (uint32) entry_offset;
-        memcpy( direntry_raw + 8, &value, sizeof(uint32) );
-        if (tif->tif_flags&TIFF_SWAB)
-            TIFFSwabLong( (uint32 *) (direntry_raw + 8) );
+        value = (uint32_t)entry_offset;
+        memcpy(direntry_raw + 8, &value, sizeof(uint32_t));
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong((uint32_t *)(direntry_raw + 8));
     }
     else
     {
-        memcpy( direntry_raw + 4, &entry_count, sizeof(uint64) );
-        if (tif->tif_flags&TIFF_SWAB)
-            TIFFSwabLong8( (uint64 *) (direntry_raw + 4) );
+        memcpy(direntry_raw + 4, &entry_count, sizeof(uint64_t));
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong8((uint64_t *)(direntry_raw + 4));
 
-        memcpy( direntry_raw + 12, &entry_offset, sizeof(uint64) );
-        if (tif->tif_flags&TIFF_SWAB)
-            TIFFSwabLong8( (uint64 *) (direntry_raw + 12) );
+        memcpy(direntry_raw + 12, &entry_offset, sizeof(uint64_t));
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong8((uint64_t *)(direntry_raw + 12));
     }
 
-/* -------------------------------------------------------------------- */
-/*      Write the directory entry out to disk.                          */
-/* -------------------------------------------------------------------- */
-    if (!SeekOK(tif, read_offset )) {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                     "%s: Seek error accessing TIFF directory",
-                     tif->tif_name);
+    /* -------------------------------------------------------------------- */
+    /*      Write the directory entry out to disk.                          */
+    /* -------------------------------------------------------------------- */
+    if (!SeekOK(tif, read_offset))
+    {
+        TIFFErrorExtR(tif, module, "%s: Seek error accessing TIFF directory",
+                      tif->tif_name);
         return 0;
     }
 
-    if (!WriteOK(tif, direntry_raw,dirsize))
+    if (!WriteOK(tif, direntry_raw, dirsize))
     {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                     "%s: Can not write TIFF directory entry.",
-                     tif->tif_name);
+        TIFFErrorExtR(tif, module, "%s: Can not write TIFF directory entry.",
+                      tif->tif_name);
         return 0;
     }
-    
+
     return 1;
 }
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_dumpmode.c b/3rdparty/libtiff/tif_dumpmode.c
index 4a0b07f5044e..267d5d2d7a53 100644
--- a/3rdparty/libtiff/tif_dumpmode.c
+++ b/3rdparty/libtiff/tif_dumpmode.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -29,113 +29,94 @@
  */
 #include "tiffiop.h"
 
-static int
-DumpFixupTags(TIFF* tif)
+static int DumpFixupTags(TIFF *tif)
 {
-	(void) tif;
-	return (1);
+    (void)tif;
+    return (1);
 }
 
 /*
  * Encode a hunk of pixels.
  */
-static int
-DumpModeEncode(TIFF* tif, uint8* pp, tmsize_t cc, uint16 s)
+static int DumpModeEncode(TIFF *tif, uint8_t *pp, tmsize_t cc, uint16_t s)
 {
-	(void) s;
-	while (cc > 0) {
-		tmsize_t n;
+    (void)s;
+    while (cc > 0)
+    {
+        tmsize_t n;
 
-		n = cc;
-		if (tif->tif_rawcc + n > tif->tif_rawdatasize)
-			n = tif->tif_rawdatasize - tif->tif_rawcc;
+        n = cc;
+        if (tif->tif_rawcc + n > tif->tif_rawdatasize)
+            n = tif->tif_rawdatasize - tif->tif_rawcc;
 
-		assert( n > 0 );
+        assert(n > 0);
 
-		/*
-		 * Avoid copy if client has setup raw
-		 * data buffer to avoid extra copy.
-		 */
-		if (tif->tif_rawcp != pp)
-			_TIFFmemcpy(tif->tif_rawcp, pp, n);
-		tif->tif_rawcp += n;
-		tif->tif_rawcc += n;
-		pp += n;
-		cc -= n;
-		if (tif->tif_rawcc >= tif->tif_rawdatasize &&
-		    !TIFFFlushData1(tif))
-			return (0);
-	}
-	return (1);
+        /*
+         * Avoid copy if client has setup raw
+         * data buffer to avoid extra copy.
+         */
+        if (tif->tif_rawcp != pp)
+            _TIFFmemcpy(tif->tif_rawcp, pp, n);
+        tif->tif_rawcp += n;
+        tif->tif_rawcc += n;
+        pp += n;
+        cc -= n;
+        if (tif->tif_rawcc >= tif->tif_rawdatasize && !TIFFFlushData1(tif))
+            return (0);
+    }
+    return (1);
 }
 
 /*
  * Decode a hunk of pixels.
  */
-static int
-DumpModeDecode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s)
+static int DumpModeDecode(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s)
 {
-	static const char module[] = "DumpModeDecode";
-	(void) s;
-	if (tif->tif_rawcc < cc) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-		TIFFErrorExt(tif->tif_clientdata, module,
-"Not enough data for scanline %lu, expected a request for at most %I64d bytes, got a request for %I64d bytes",
-		             (unsigned long) tif->tif_row,
-		             (signed __int64) tif->tif_rawcc,
-		             (signed __int64) cc);
-#else
-		TIFFErrorExt(tif->tif_clientdata, module,
-"Not enough data for scanline %lu, expected a request for at most %lld bytes, got a request for %lld bytes",
-		             (unsigned long) tif->tif_row,
-		             (signed long long) tif->tif_rawcc,
-		             (signed long long) cc);
-#endif
-		return (0);
-	}
-	/*
-	 * Avoid copy if client has setup raw
-	 * data buffer to avoid extra copy.
-	 */
-	if (tif->tif_rawcp != buf)
-		_TIFFmemcpy(buf, tif->tif_rawcp, cc);
-	tif->tif_rawcp += cc;
-	tif->tif_rawcc -= cc;  
-	return (1);
+    static const char module[] = "DumpModeDecode";
+    (void)s;
+    if (tif->tif_rawcc < cc)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Not enough data for scanline %" PRIu32
+                      ", expected a request for at most %" TIFF_SSIZE_FORMAT
+                      " bytes, got a request for %" TIFF_SSIZE_FORMAT " bytes",
+                      tif->tif_row, tif->tif_rawcc, cc);
+        return (0);
+    }
+    /*
+     * Avoid copy if client has setup raw
+     * data buffer to avoid extra copy.
+     */
+    if (tif->tif_rawcp != buf)
+        _TIFFmemcpy(buf, tif->tif_rawcp, cc);
+    tif->tif_rawcp += cc;
+    tif->tif_rawcc -= cc;
+    return (1);
 }
 
 /*
  * Seek forwards nrows in the current strip.
  */
-static int
-DumpModeSeek(TIFF* tif, uint32 nrows)
+static int DumpModeSeek(TIFF *tif, uint32_t nrows)
 {
-	tif->tif_rawcp += nrows * tif->tif_scanlinesize;
-	tif->tif_rawcc -= nrows * tif->tif_scanlinesize;
-	return (1);
+    tif->tif_rawcp += nrows * tif->tif_scanlinesize;
+    tif->tif_rawcc -= nrows * tif->tif_scanlinesize;
+    return (1);
 }
 
 /*
  * Initialize dump mode.
  */
-int
-TIFFInitDumpMode(TIFF* tif, int scheme)
+int TIFFInitDumpMode(TIFF *tif, int scheme)
 {
-	(void) scheme;
-	tif->tif_fixuptags = DumpFixupTags;  
-	tif->tif_decoderow = DumpModeDecode;
-	tif->tif_decodestrip = DumpModeDecode;
-	tif->tif_decodetile = DumpModeDecode;
-	tif->tif_encoderow = DumpModeEncode;
-	tif->tif_encodestrip = DumpModeEncode;
-	tif->tif_encodetile = DumpModeEncode; 
-	tif->tif_seek = DumpModeSeek;
-	return (1);
+    (void)scheme;
+    tif->tif_fixuptags = DumpFixupTags;
+    tif->tif_decoderow = DumpModeDecode;
+    tif->tif_decodestrip = DumpModeDecode;
+    tif->tif_decodetile = DumpModeDecode;
+    tif->tif_encoderow = DumpModeEncode;
+    tif->tif_encodestrip = DumpModeEncode;
+    tif->tif_encodetile = DumpModeEncode;
+    tif->tif_seek = DumpModeSeek;
+    return (1);
 }
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_error.c b/3rdparty/libtiff/tif_error.c
index 651168f7dc78..ac0b9c373a36 100644
--- a/3rdparty/libtiff/tif_error.c
+++ b/3rdparty/libtiff/tif_error.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -29,58 +29,104 @@
 
 TIFFErrorHandlerExt _TIFFerrorHandlerExt = NULL;
 
-TIFFErrorHandler
-TIFFSetErrorHandler(TIFFErrorHandler handler)
+TIFFErrorHandler TIFFSetErrorHandler(TIFFErrorHandler handler)
 {
-	TIFFErrorHandler prev = _TIFFerrorHandler;
-	_TIFFerrorHandler = handler;
-	return (prev);
+    TIFFErrorHandler prev = _TIFFerrorHandler;
+    _TIFFerrorHandler = handler;
+    return (prev);
 }
 
-TIFFErrorHandlerExt
-TIFFSetErrorHandlerExt(TIFFErrorHandlerExt handler)
+TIFFErrorHandlerExt TIFFSetErrorHandlerExt(TIFFErrorHandlerExt handler)
 {
-	TIFFErrorHandlerExt prev = _TIFFerrorHandlerExt;
-	_TIFFerrorHandlerExt = handler;
-	return (prev);
+    TIFFErrorHandlerExt prev = _TIFFerrorHandlerExt;
+    _TIFFerrorHandlerExt = handler;
+    return (prev);
 }
 
-void
-TIFFError(const char* module, const char* fmt, ...)
+void TIFFError(const char *module, const char *fmt, ...)
 {
-	va_list ap;
-	if (_TIFFerrorHandler) {
-		va_start(ap, fmt);	
-		(*_TIFFerrorHandler)(module, fmt, ap);
-		va_end(ap);
-	}
-	if (_TIFFerrorHandlerExt) {
-		va_start(ap, fmt);
-		(*_TIFFerrorHandlerExt)(0, module, fmt, ap);
-		va_end(ap);
-	}
+    va_list ap;
+    if (_TIFFerrorHandler)
+    {
+        va_start(ap, fmt);
+        (*_TIFFerrorHandler)(module, fmt, ap);
+        va_end(ap);
+    }
+    if (_TIFFerrorHandlerExt)
+    {
+        va_start(ap, fmt);
+        (*_TIFFerrorHandlerExt)(0, module, fmt, ap);
+        va_end(ap);
+    }
 }
 
-void
-TIFFErrorExt(thandle_t fd, const char* module, const char* fmt, ...)
+void TIFFErrorExt(thandle_t fd, const char *module, const char *fmt, ...)
 {
-	va_list ap;
-	if (_TIFFerrorHandler) {
-		va_start(ap, fmt);
-		(*_TIFFerrorHandler)(module, fmt, ap);
-		va_end(ap);
-	}
-	if (_TIFFerrorHandlerExt) {
-		va_start(ap, fmt);
-		(*_TIFFerrorHandlerExt)(fd, module, fmt, ap);
-		va_end(ap);
-	}
+    va_list ap;
+    if (_TIFFerrorHandler)
+    {
+        va_start(ap, fmt);
+        (*_TIFFerrorHandler)(module, fmt, ap);
+        va_end(ap);
+    }
+    if (_TIFFerrorHandlerExt)
+    {
+        va_start(ap, fmt);
+        (*_TIFFerrorHandlerExt)(fd, module, fmt, ap);
+        va_end(ap);
+    }
 }
 
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
+void _TIFFErrorEarly(TIFFOpenOptions *opts, thandle_t clientdata,
+                     const char *module, const char *fmt, ...)
+{
+    va_list ap;
+    if (opts && opts->errorhandler)
+    {
+        va_start(ap, fmt);
+        int stop = opts->errorhandler(NULL, opts->errorhandler_user_data,
+                                      module, fmt, ap);
+        va_end(ap);
+        if (stop)
+            return;
+    }
+    if (_TIFFerrorHandler)
+    {
+        va_start(ap, fmt);
+        (*_TIFFerrorHandler)(module, fmt, ap);
+        va_end(ap);
+    }
+    if (_TIFFerrorHandlerExt)
+    {
+        va_start(ap, fmt);
+        (*_TIFFerrorHandlerExt)(clientdata, module, fmt, ap);
+        va_end(ap);
+    }
+}
+
+void TIFFErrorExtR(TIFF *tif, const char *module, const char *fmt, ...)
+{
+    va_list ap;
+    if (tif && tif->tif_errorhandler)
+    {
+        va_start(ap, fmt);
+        int stop = (*tif->tif_errorhandler)(
+            tif, tif->tif_errorhandler_user_data, module, fmt, ap);
+        va_end(ap);
+        if (stop)
+            return;
+    }
+    if (_TIFFerrorHandler)
+    {
+        va_start(ap, fmt);
+        (*_TIFFerrorHandler)(module, fmt, ap);
+        va_end(ap);
+    }
+    if (_TIFFerrorHandlerExt)
+    {
+        va_start(ap, fmt);
+        (*_TIFFerrorHandlerExt)(tif ? tif->tif_clientdata : NULL, module, fmt,
+                                ap);
+        va_end(ap);
+    }
+}
diff --git a/3rdparty/libtiff/tif_extension.c b/3rdparty/libtiff/tif_extension.c
index 87d3cfcbc7f8..1a09e987a5ba 100644
--- a/3rdparty/libtiff/tif_extension.c
+++ b/3rdparty/libtiff/tif_extension.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -26,26 +26,26 @@
  * TIFF Library.
  *
  * Various routines support external extension of the tag set, and other
- * application extension capabilities. 
+ * application extension capabilities.
  */
 
 #include "tiffiop.h"
 
-int TIFFGetTagListCount( TIFF *tif )
+int TIFFGetTagListCount(TIFF *tif)
 
 {
-    TIFFDirectory* td = &tif->tif_dir;
-    
+    TIFFDirectory *td = &tif->tif_dir;
+
     return td->td_customValueCount;
 }
 
-uint32 TIFFGetTagListEntry( TIFF *tif, int tag_index )
+uint32_t TIFFGetTagListEntry(TIFF *tif, int tag_index)
 
 {
-    TIFFDirectory* td = &tif->tif_dir;
+    TIFFDirectory *td = &tif->tif_dir;
 
-    if( tag_index < 0 || tag_index >= td->td_customValueCount )
-        return (uint32)(-1);
+    if (tag_index < 0 || tag_index >= td->td_customValueCount)
+        return (uint32_t)(-1);
     else
         return td->td_customValues[tag_index].info->field_tag;
 }
@@ -55,27 +55,27 @@ uint32 TIFFGetTagListEntry( TIFF *tif, int tag_index )
 ** structure to application code without giving access to the private
 ** TIFF structure.
 */
-TIFFTagMethods *TIFFAccessTagMethods( TIFF *tif )
+TIFFTagMethods *TIFFAccessTagMethods(TIFF *tif)
 
 {
     return &(tif->tif_tagmethods);
 }
 
-void *TIFFGetClientInfo( TIFF *tif, const char *name )
+void *TIFFGetClientInfo(TIFF *tif, const char *name)
 
 {
     TIFFClientInfoLink *psLink = tif->tif_clientinfo;
 
-    while( psLink != NULL && strcmp(psLink->name,name) != 0 )
+    while (psLink != NULL && strcmp(psLink->name, name) != 0)
         psLink = psLink->next;
 
-    if( psLink != NULL )
+    if (psLink != NULL)
         return psLink->data;
     else
         return NULL;
 }
 
-void TIFFSetClientInfo( TIFF *tif, void *data, const char *name )
+void TIFFSetClientInfo(TIFF *tif, void *data, const char *name)
 
 {
     TIFFClientInfoLink *psLink = tif->tif_clientinfo;
@@ -84,10 +84,10 @@ void TIFFSetClientInfo( TIFF *tif, void *data, const char *name )
     ** Do we have an existing link with this name?  If so, just
     ** set it.
     */
-    while( psLink != NULL && strcmp(psLink->name,name) != 0 )
+    while (psLink != NULL && strcmp(psLink->name, name) != 0)
         psLink = psLink->next;
 
-    if( psLink != NULL )
+    if (psLink != NULL)
     {
         psLink->data = data;
         return;
@@ -97,20 +97,14 @@ void TIFFSetClientInfo( TIFF *tif, void *data, const char *name )
     ** Create a new link.
     */
 
-    psLink = (TIFFClientInfoLink *) _TIFFmalloc(sizeof(TIFFClientInfoLink));
-    assert (psLink != NULL);
+    psLink =
+        (TIFFClientInfoLink *)_TIFFmallocExt(tif, sizeof(TIFFClientInfoLink));
+    assert(psLink != NULL);
     psLink->next = tif->tif_clientinfo;
-    psLink->name = (char *) _TIFFmalloc((tmsize_t)(strlen(name)+1));
-    assert (psLink->name != NULL);
+    psLink->name = (char *)_TIFFmallocExt(tif, (tmsize_t)(strlen(name) + 1));
+    assert(psLink->name != NULL);
     strcpy(psLink->name, name);
     psLink->data = data;
 
     tif->tif_clientinfo = psLink;
 }
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_fax3.c b/3rdparty/libtiff/tif_fax3.c
index 9ab5b26ad37a..a3c645cb68f4 100644
--- a/3rdparty/libtiff/tif_fax3.c
+++ b/3rdparty/libtiff/tif_fax3.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1990-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -37,7 +37,7 @@
  *      Copyright (C) 1990, 1995  Frank D. Cringle.
  */
 #include "tif_fax3.h"
-#define	G3CODES
+#define G3CODES
 #include "t4.h"
 #include <stdio.h>
 
@@ -45,51 +45,57 @@
  * Compression+decompression state blocks are
  * derived from this ``base state'' block.
  */
-typedef struct {
-	int      rw_mode;                /* O_RDONLY for decode, else encode */
-	int      mode;                   /* operating mode */
-	tmsize_t rowbytes;               /* bytes in a decoded scanline */
-	uint32   rowpixels;              /* pixels in a scanline */
-
-	uint16   cleanfaxdata;           /* CleanFaxData tag */
-	uint32   badfaxrun;              /* BadFaxRun tag */
-	uint32   badfaxlines;            /* BadFaxLines tag */
-	uint32   groupoptions;           /* Group 3/4 options tag */
-
-	TIFFVGetMethod  vgetparent;      /* super-class method */
-	TIFFVSetMethod  vsetparent;      /* super-class method */
-	TIFFPrintMethod printdir;        /* super-class method */
+typedef struct
+{
+    int rw_mode;        /* O_RDONLY for decode, else encode */
+    int mode;           /* operating mode */
+    tmsize_t rowbytes;  /* bytes in a decoded scanline */
+    uint32_t rowpixels; /* pixels in a scanline */
+
+    uint16_t cleanfaxdata; /* CleanFaxData tag */
+    uint32_t badfaxrun;    /* BadFaxRun tag */
+    uint32_t badfaxlines;  /* BadFaxLines tag */
+    uint32_t groupoptions; /* Group 3/4 options tag */
+
+    TIFFVGetMethod vgetparent; /* super-class method */
+    TIFFVSetMethod vsetparent; /* super-class method */
+    TIFFPrintMethod printdir;  /* super-class method */
 } Fax3BaseState;
-#define	Fax3State(tif)		((Fax3BaseState*) (tif)->tif_data)
-
-typedef enum { G3_1D, G3_2D } Ttag;
-typedef struct {
-	Fax3BaseState b;
-
-	/* Decoder state info */
-	const unsigned char* bitmap;	/* bit reversal table */
-	uint32	data;			/* current i/o byte/word */
-	int	bit;			/* current i/o bit in byte */
-	int	EOLcnt;			/* count of EOL codes recognized */
-	TIFFFaxFillFunc fill;		/* fill routine */
-	uint32*	runs;			/* b&w runs for current/previous row */
-	uint32	nruns;			/* size of the refruns / curruns arrays */
-	uint32*	refruns;		/* runs for reference line */
-	uint32*	curruns;		/* runs for current line */
-
-	/* Encoder state info */
-	Ttag    tag;			/* encoding state */
-	unsigned char*	refline;	/* reference line for 2d decoding */
-	int	k;			/* #rows left that can be 2d encoded */
-	int	maxk;			/* max #rows that can be 2d encoded */
-
-	int line;
+#define Fax3State(tif) ((Fax3BaseState *)(tif)->tif_data)
+
+typedef enum
+{
+    G3_1D,
+    G3_2D
+} Ttag;
+typedef struct
+{
+    Fax3BaseState b;
+
+    /* Decoder state info */
+    const unsigned char *bitmap; /* bit reversal table */
+    uint32_t data;               /* current i/o byte/word */
+    int bit;                     /* current i/o bit in byte */
+    int EOLcnt;                  /* count of EOL codes recognized */
+    TIFFFaxFillFunc fill;        /* fill routine */
+    uint32_t *runs;              /* b&w runs for current/previous row */
+    uint32_t nruns;              /* size of the refruns / curruns arrays */
+    uint32_t *refruns;           /* runs for reference line */
+    uint32_t *curruns;           /* runs for current line */
+
+    /* Encoder state info */
+    Ttag tag;               /* encoding state */
+    unsigned char *refline; /* reference line for 2d decoding */
+    int k;                  /* #rows left that can be 2d encoded */
+    int maxk;               /* max #rows that can be 2d encoded */
+
+    int line;
 } Fax3CodecState;
-#define DecoderState(tif) ((Fax3CodecState*) Fax3State(tif))
-#define EncoderState(tif) ((Fax3CodecState*) Fax3State(tif))
+#define DecoderState(tif) ((Fax3CodecState *)Fax3State(tif))
+#define EncoderState(tif) ((Fax3CodecState *)Fax3State(tif))
 
 #define is2DEncoding(sp) (sp->b.groupoptions & GROUP3OPT_2DENCODING)
-#define isAligned(p,t) ((((size_t)(p)) & (sizeof (t)-1)) == 0)
+#define isAligned(p, t) ((((size_t)(p)) & (sizeof(t) - 1)) == 0)
 
 /*
  * Group 3 and Group 4 Decoding.
@@ -99,76 +105,81 @@ typedef struct {
  * These macros glue the TIFF library state to
  * the state expected by Frank's decoder.
  */
-#define	DECLARE_STATE(tif, sp, mod)					\
-    static const char module[] = mod;					\
-    Fax3CodecState* sp = DecoderState(tif);				\
-    int a0;				/* reference element */		\
-    int lastx = sp->b.rowpixels;	/* last element in row */	\
-    uint32 BitAcc;			/* bit accumulator */		\
-    int BitsAvail;			/* # valid bits in BitAcc */	\
-    int RunLength;			/* length of current run */	\
-    unsigned char* cp;			/* next byte of input data */	\
-    unsigned char* ep;			/* end of input data */		\
-    uint32* pa;				/* place to stuff next run */	\
-    uint32* thisrun;			/* current row's run array */	\
-    int EOLcnt;				/* # EOL codes recognized */	\
-    const unsigned char* bitmap = sp->bitmap;	/* input data bit reverser */	\
-    const TIFFFaxTabEnt* TabEnt
-#define	DECLARE_STATE_2D(tif, sp, mod)					\
-    DECLARE_STATE(tif, sp, mod);					\
-    int b1;				/* next change on prev line */	\
-    uint32* pb				/* next run in reference line */\
-/*
- * Load any state that may be changed during decoding.
- */
-#define	CACHE_STATE(tif, sp) do {					\
-    BitAcc = sp->data;							\
-    BitsAvail = sp->bit;						\
-    EOLcnt = sp->EOLcnt;						\
-    cp = (unsigned char*) tif->tif_rawcp;				\
-    ep = cp + tif->tif_rawcc;						\
-} while (0)
+#define DECLARE_STATE(tif, sp, mod)                                            \
+    static const char module[] = mod;                                          \
+    Fax3CodecState *sp = DecoderState(tif);                                    \
+    int a0;                                   /* reference element */          \
+    int lastx = sp->b.rowpixels;              /* last element in row */        \
+    uint32_t BitAcc;                          /* bit accumulator */            \
+    int BitsAvail;                            /* # valid bits in BitAcc */     \
+    int RunLength;                            /* length of current run */      \
+    unsigned char *cp;                        /* next byte of input data */    \
+    unsigned char *ep;                        /* end of input data */          \
+    uint32_t *pa;                             /* place to stuff next run */    \
+    uint32_t *thisrun;                        /* current row's run array */    \
+    int EOLcnt;                               /* # EOL codes recognized */     \
+    const unsigned char *bitmap = sp->bitmap; /* input data bit reverser */    \
+    const TIFFFaxTabEnt *TabEnt
+#define DECLARE_STATE_2D(tif, sp, mod)                                         \
+    DECLARE_STATE(tif, sp, mod);                                               \
+    int b1; /* next change on prev line */                                     \
+    uint32_t                                                                   \
+        *pb /* next run in reference line */ /*                                \
+                                              * Load any state that may be     \
+                                              * changed during decoding.       \
+                                              */
+#define CACHE_STATE(tif, sp)                                                   \
+    do                                                                         \
+    {                                                                          \
+        BitAcc = sp->data;                                                     \
+        BitsAvail = sp->bit;                                                   \
+        EOLcnt = sp->EOLcnt;                                                   \
+        cp = (unsigned char *)tif->tif_rawcp;                                  \
+        ep = cp + tif->tif_rawcc;                                              \
+    } while (0)
 /*
  * Save state possibly changed during decoding.
  */
-#define	UNCACHE_STATE(tif, sp) do {					\
-    sp->bit = BitsAvail;						\
-    sp->data = BitAcc;							\
-    sp->EOLcnt = EOLcnt;						\
-    tif->tif_rawcc -= (tmsize_t)((uint8*) cp - tif->tif_rawcp);		\
-    tif->tif_rawcp = (uint8*) cp;					\
-} while (0)
+#define UNCACHE_STATE(tif, sp)                                                 \
+    do                                                                         \
+    {                                                                          \
+        sp->bit = BitsAvail;                                                   \
+        sp->data = BitAcc;                                                     \
+        sp->EOLcnt = EOLcnt;                                                   \
+        tif->tif_rawcc -= (tmsize_t)((uint8_t *)cp - tif->tif_rawcp);          \
+        tif->tif_rawcp = (uint8_t *)cp;                                        \
+    } while (0)
 
 /*
  * Setup state for decoding a strip.
  */
-static int
-Fax3PreDecode(TIFF* tif, uint16 s)
+static int Fax3PreDecode(TIFF *tif, uint16_t s)
 {
-	Fax3CodecState* sp = DecoderState(tif);
-
-	(void) s;
-	assert(sp != NULL);
-	sp->bit = 0;			/* force initial read */
-	sp->data = 0;
-	sp->EOLcnt = 0;			/* force initial scan for EOL */
-	/*
-	 * Decoder assumes lsb-to-msb bit order.  Note that we select
-	 * this here rather than in Fax3SetupState so that viewers can
-	 * hold the image open, fiddle with the FillOrder tag value,
-	 * and then re-decode the image.  Otherwise they'd need to close
-	 * and open the image to get the state reset.
-	 */
-	sp->bitmap =
-	    TIFFGetBitRevTable(tif->tif_dir.td_fillorder != FILLORDER_LSB2MSB);
-	sp->curruns = sp->runs;
-	if (sp->refruns) {		/* init reference line to white */
-		sp->refruns = sp->runs + sp->nruns;
-		sp->refruns[0] = (uint32) sp->b.rowpixels;
-		sp->refruns[1] = 0;
-	}
-	sp->line = 0;
-	return (1);
+    Fax3CodecState *sp = DecoderState(tif);
+
+    (void)s;
+    assert(sp != NULL);
+    sp->bit = 0; /* force initial read */
+    sp->data = 0;
+    sp->EOLcnt = 0; /* force initial scan for EOL */
+    /*
+     * Decoder assumes lsb-to-msb bit order.  Note that we select
+     * this here rather than in Fax3SetupState so that viewers can
+     * hold the image open, fiddle with the FillOrder tag value,
+     * and then re-decode the image.  Otherwise they'd need to close
+     * and open the image to get the state reset.
+     */
+    sp->bitmap =
+        TIFFGetBitRevTable(tif->tif_dir.td_fillorder != FILLORDER_LSB2MSB);
+    sp->curruns = sp->runs;
+    if (sp->refruns)
+    { /* init reference line to white */
+        sp->refruns = sp->runs + sp->nruns;
+        sp->refruns[0] = (uint32_t)sp->b.rowpixels;
+        sp->refruns[1] = 0;
+    }
+    sp->line = 0;
+    return (1);
 }
 
 /*
@@ -177,49 +188,53 @@ Fax3PreDecode(TIFF* tif, uint16 s)
  * overriding the definitions used by the decoder.
  */
 
-static void
-Fax3Unexpected(const char* module, TIFF* tif, uint32 line, uint32 a0)
+static void Fax3Unexpected(const char *module, TIFF *tif, uint32_t line,
+                           uint32_t a0)
 {
-	TIFFErrorExt(tif->tif_clientdata, module, "Bad code word at line %u of %s %u (x %u)",
-	    line, isTiled(tif) ? "tile" : "strip",
-	    (isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip),
-	    a0);
+    TIFFErrorExtR(tif, module,
+                  "Bad code word at line %" PRIu32 " of %s %" PRIu32
+                  " (x %" PRIu32 ")",
+                  line, isTiled(tif) ? "tile" : "strip",
+                  (isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip), a0);
 }
-#define	unexpected(table, a0)	Fax3Unexpected(module, tif, sp->line, a0)
+#define unexpected(table, a0) Fax3Unexpected(module, tif, sp->line, a0)
 
-static void
-Fax3Extension(const char* module, TIFF* tif, uint32 line, uint32 a0)
+static void Fax3Extension(const char *module, TIFF *tif, uint32_t line,
+                          uint32_t a0)
 {
-	TIFFErrorExt(tif->tif_clientdata, module,
-	    "Uncompressed data (not supported) at line %u of %s %u (x %u)",
-	    line, isTiled(tif) ? "tile" : "strip",
-	    (isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip),
-	    a0);
+    TIFFErrorExtR(tif, module,
+                  "Uncompressed data (not supported) at line %" PRIu32
+                  " of %s %" PRIu32 " (x %" PRIu32 ")",
+                  line, isTiled(tif) ? "tile" : "strip",
+                  (isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip), a0);
 }
-#define	extension(a0)	Fax3Extension(module, tif, sp->line, a0)
+#define extension(a0) Fax3Extension(module, tif, sp->line, a0)
 
-static void
-Fax3BadLength(const char* module, TIFF* tif, uint32 line, uint32 a0, uint32 lastx)
+static void Fax3BadLength(const char *module, TIFF *tif, uint32_t line,
+                          uint32_t a0, uint32_t lastx)
 {
-	TIFFWarningExt(tif->tif_clientdata, module, "%s at line %u of %s %u (got %u, expected %u)",
-	    a0 < lastx ? "Premature EOL" : "Line length mismatch",
-	    line, isTiled(tif) ? "tile" : "strip",
-	    (isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip),
-	    a0, lastx);
+    TIFFWarningExtR(tif, module,
+                    "%s at line %" PRIu32 " of %s %" PRIu32 " (got %" PRIu32
+                    ", expected %" PRIu32 ")",
+                    a0 < lastx ? "Premature EOL" : "Line length mismatch", line,
+                    isTiled(tif) ? "tile" : "strip",
+                    (isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip), a0,
+                    lastx);
 }
-#define	badlength(a0,lastx)	Fax3BadLength(module, tif, sp->line, a0, lastx)
+#define badlength(a0, lastx) Fax3BadLength(module, tif, sp->line, a0, lastx)
 
-static void
-Fax3PrematureEOF(const char* module, TIFF* tif, uint32 line, uint32 a0)
+static void Fax3PrematureEOF(const char *module, TIFF *tif, uint32_t line,
+                             uint32_t a0)
 {
-	TIFFWarningExt(tif->tif_clientdata, module, "Premature EOF at line %u of %s %u (x %u)",
-	    line, isTiled(tif) ? "tile" : "strip",
-	    (isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip),
-	    a0);
+    TIFFWarningExtR(tif, module,
+                    "Premature EOF at line %" PRIu32 " of %s %" PRIu32
+                    " (x %" PRIu32 ")",
+                    line, isTiled(tif) ? "tile" : "strip",
+                    (isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip), a0);
 }
-#define	prematureEOF(a0)	Fax3PrematureEOF(module, tif, sp->line, a0)
+#define prematureEOF(a0) Fax3PrematureEOF(module, tif, sp->line, a0)
 
-#define	Nop
+#define Nop
 
 /**
  * Decode the requested amount of G3 1D-encoded data.
@@ -228,275 +243,240 @@ Fax3PrematureEOF(const char* module, TIFF* tif, uint32 line, uint32 a0)
  * @param s number of planes (ignored)
  * @returns 1 for success, -1 in case of error
  */
-static int
-Fax3Decode1D(TIFF* tif, uint8* buf, tmsize_t occ, uint16 s)
+static int Fax3Decode1D(TIFF *tif, uint8_t *buf, tmsize_t occ, uint16_t s)
 {
-	DECLARE_STATE(tif, sp, "Fax3Decode1D");
-	(void) s;
-	if (occ % sp->b.rowbytes)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Fractional scanlines cannot be read");
-		return (-1);
-	}
-	CACHE_STATE(tif, sp);
-	thisrun = sp->curruns;
-	while (occ > 0) {
-		a0 = 0;
-		RunLength = 0;
-		pa = thisrun;
+    DECLARE_STATE(tif, sp, "Fax3Decode1D");
+    (void)s;
+    if (occ % sp->b.rowbytes)
+    {
+        TIFFErrorExtR(tif, module, "Fractional scanlines cannot be read");
+        return (-1);
+    }
+    CACHE_STATE(tif, sp);
+    thisrun = sp->curruns;
+    while (occ > 0)
+    {
+        a0 = 0;
+        RunLength = 0;
+        pa = thisrun;
 #ifdef FAX3_DEBUG
-		printf("\nBitAcc=%08X, BitsAvail = %d\n", BitAcc, BitsAvail);
-		printf("-------------------- %d\n", tif->tif_row);
-		fflush(stdout);
+        printf("\nBitAcc=%08" PRIX32 ", BitsAvail = %d\n", BitAcc, BitsAvail);
+        printf("-------------------- %" PRIu32 "\n", tif->tif_row);
+        fflush(stdout);
 #endif
-		SYNC_EOL(EOF1D);
-		EXPAND1D(EOF1Da);
-		(*sp->fill)(buf, thisrun, pa, lastx);
-		buf += sp->b.rowbytes;
-		occ -= sp->b.rowbytes;
-		sp->line++;
-		continue;
-	EOF1D:				/* premature EOF */
-		CLEANUP_RUNS();
-	EOF1Da:				/* premature EOF */
-		(*sp->fill)(buf, thisrun, pa, lastx);
-		UNCACHE_STATE(tif, sp);
-		return (-1);
-	}
-	UNCACHE_STATE(tif, sp);
-	return (1);
+        SYNC_EOL(EOF1D);
+        EXPAND1D(EOF1Da);
+        (*sp->fill)(buf, thisrun, pa, lastx);
+        buf += sp->b.rowbytes;
+        occ -= sp->b.rowbytes;
+        sp->line++;
+        continue;
+    EOF1D: /* premature EOF */
+        CLEANUP_RUNS();
+    EOF1Da: /* premature EOF */
+        (*sp->fill)(buf, thisrun, pa, lastx);
+        UNCACHE_STATE(tif, sp);
+        return (-1);
+    }
+    UNCACHE_STATE(tif, sp);
+    return (1);
 }
 
-#define	SWAP(t,a,b)	{ t x; x = (a); (a) = (b); (b) = x; }
+#define SWAP(t, a, b)                                                          \
+    {                                                                          \
+        t x;                                                                   \
+        x = (a);                                                               \
+        (a) = (b);                                                             \
+        (b) = x;                                                               \
+    }
 /*
  * Decode the requested amount of G3 2D-encoded data.
  */
-static int
-Fax3Decode2D(TIFF* tif, uint8* buf, tmsize_t occ, uint16 s)
+static int Fax3Decode2D(TIFF *tif, uint8_t *buf, tmsize_t occ, uint16_t s)
 {
-	DECLARE_STATE_2D(tif, sp, "Fax3Decode2D");
-	int is1D;			/* current line is 1d/2d-encoded */
-	(void) s;
-	if (occ % sp->b.rowbytes)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Fractional scanlines cannot be read");
-		return (-1);
-	}
-	CACHE_STATE(tif, sp);
-	while (occ > 0) {
-		a0 = 0;
-		RunLength = 0;
-		pa = thisrun = sp->curruns;
+    DECLARE_STATE_2D(tif, sp, "Fax3Decode2D");
+    int is1D; /* current line is 1d/2d-encoded */
+    (void)s;
+    if (occ % sp->b.rowbytes)
+    {
+        TIFFErrorExtR(tif, module, "Fractional scanlines cannot be read");
+        return (-1);
+    }
+    CACHE_STATE(tif, sp);
+    while (occ > 0)
+    {
+        a0 = 0;
+        RunLength = 0;
+        pa = thisrun = sp->curruns;
 #ifdef FAX3_DEBUG
-		printf("\nBitAcc=%08X, BitsAvail = %d EOLcnt = %d",
-		    BitAcc, BitsAvail, EOLcnt);
+        printf("\nBitAcc=%08" PRIX32 ", BitsAvail = %d EOLcnt = %d", BitAcc,
+               BitsAvail, EOLcnt);
 #endif
-		SYNC_EOL(EOF2D);
-		NeedBits8(1, EOF2D);
-		is1D = GetBits(1);	/* 1D/2D-encoding tag bit */
-		ClrBits(1);
+        SYNC_EOL(EOF2D);
+        NeedBits8(1, EOF2D);
+        is1D = GetBits(1); /* 1D/2D-encoding tag bit */
+        ClrBits(1);
 #ifdef FAX3_DEBUG
-		printf(" %s\n-------------------- %d\n",
-		    is1D ? "1D" : "2D", tif->tif_row);
-		fflush(stdout);
+        printf(" %s\n-------------------- %" PRIu32 "\n", is1D ? "1D" : "2D",
+               tif->tif_row);
+        fflush(stdout);
 #endif
-		pb = sp->refruns;
-		b1 = *pb++;
-		if (is1D)
-			EXPAND1D(EOF2Da);
-		else
-			EXPAND2D(EOF2Da);
-		(*sp->fill)(buf, thisrun, pa, lastx);
-		if (pa < thisrun + sp->nruns) {
-			SETVALUE(0);	/* imaginary change for reference */
-		}
-		SWAP(uint32*, sp->curruns, sp->refruns);
-		buf += sp->b.rowbytes;
-		occ -= sp->b.rowbytes;
-		sp->line++;
-		continue;
-	EOF2D:				/* premature EOF */
-		CLEANUP_RUNS();
-	EOF2Da:				/* premature EOF */
-		(*sp->fill)(buf, thisrun, pa, lastx);
-		UNCACHE_STATE(tif, sp);
-		return (-1);
-	}
-	UNCACHE_STATE(tif, sp);
-	return (1);
+        pb = sp->refruns;
+        b1 = *pb++;
+        if (is1D)
+            EXPAND1D(EOF2Da);
+        else
+            EXPAND2D(EOF2Da);
+        (*sp->fill)(buf, thisrun, pa, lastx);
+        if (pa < thisrun + sp->nruns)
+        {
+            SETVALUE(0); /* imaginary change for reference */
+        }
+        SWAP(uint32_t *, sp->curruns, sp->refruns);
+        buf += sp->b.rowbytes;
+        occ -= sp->b.rowbytes;
+        sp->line++;
+        continue;
+    EOF2D: /* premature EOF */
+        CLEANUP_RUNS();
+    EOF2Da: /* premature EOF */
+        (*sp->fill)(buf, thisrun, pa, lastx);
+        UNCACHE_STATE(tif, sp);
+        return (-1);
+    }
+    UNCACHE_STATE(tif, sp);
+    return (1);
 }
 #undef SWAP
 
-/*
- * The ZERO & FILL macros must handle spans < 2*sizeof(long) bytes.
- * For machines with 64-bit longs this is <16 bytes; otherwise
- * this is <8 bytes.  We optimize the code here to reflect the
- * machine characteristics.
- */
-#if SIZEOF_UNSIGNED_LONG == 8
-# define FILL(n, cp)							    \
-    switch (n) {							    \
-    case 15:(cp)[14] = 0xff; /*-fallthrough*/ \
-    case 14:(cp)[13] = 0xff; /*-fallthrough*/ \
-    case 13:(cp)[12] = 0xff; /*-fallthrough*/ \
-    case 12:(cp)[11] = 0xff; /*-fallthrough*/ \
-    case 11:(cp)[10] = 0xff; /*-fallthrough*/ \
-    case 10: (cp)[9] = 0xff; /*-fallthrough*/ \
-    case  9: (cp)[8] = 0xff; /*-fallthrough*/ \
-    case  8: (cp)[7] = 0xff; /*-fallthrough*/ \
-    case  7: (cp)[6] = 0xff; /*-fallthrough*/ \
-    case  6: (cp)[5] = 0xff; /*-fallthrough*/ \
-    case  5: (cp)[4] = 0xff; /*-fallthrough*/ \
-    case  4: (cp)[3] = 0xff; /*-fallthrough*/ \
-    case  3: (cp)[2] = 0xff; /*-fallthrough*/ \
-    case  2: (cp)[1] = 0xff; /*-fallthrough*/ \
-    case  1: (cp)[0] = 0xff; (cp) += (n); /*-fallthrough*/ \
-    case 0:  ;			      \
-    }
-# define ZERO(n, cp)							\
-    switch (n) {							\
-    case 15:(cp)[14] = 0; /*-fallthrough*/ \
-    case 14:(cp)[13] = 0; /*-fallthrough*/ \
-    case 13:(cp)[12] = 0; /*-fallthrough*/ \
-    case 12:(cp)[11] = 0; /*-fallthrough*/ \
-    case 11:(cp)[10] = 0; /*-fallthrough*/ \
-    case 10: (cp)[9] = 0; /*-fallthrough*/ \
-    case  9: (cp)[8] = 0; /*-fallthrough*/ \
-    case  8: (cp)[7] = 0; /*-fallthrough*/ \
-    case  7: (cp)[6] = 0; /*-fallthrough*/ \
-    case  6: (cp)[5] = 0; /*-fallthrough*/ \
-    case  5: (cp)[4] = 0; /*-fallthrough*/ \
-    case  4: (cp)[3] = 0; /*-fallthrough*/ \
-    case  3: (cp)[2] = 0; /*-fallthrough*/ \
-    case  2: (cp)[1] = 0; /*-fallthrough*/ \
-    case  1: (cp)[0] = 0; (cp) += (n); /*-fallthrough*/ \
-    case 0:  ;			\
-    }
-#else
-# define FILL(n, cp)							    \
-    switch (n) {							    \
-    case 7: (cp)[6] = 0xff; /*-fallthrough*/ \
-    case 6: (cp)[5] = 0xff; /*-fallthrough*/ \
-    case 5: (cp)[4] = 0xff; /*-fallthrough*/ \
-    case 4: (cp)[3] = 0xff; /*-fallthrough*/ \
-    case 3: (cp)[2] = 0xff; /*-fallthrough*/ \
-    case 2: (cp)[1] = 0xff; /*-fallthrough*/ \
-    case 1: (cp)[0] = 0xff; (cp) += (n);  /*-fallthrough*/ \
-    case 0:  ;			    \
-    }
-# define ZERO(n, cp)							\
-    switch (n) {							\
-    case 7: (cp)[6] = 0; /*-fallthrough*/ \
-    case 6: (cp)[5] = 0; /*-fallthrough*/ \
-    case 5: (cp)[4] = 0; /*-fallthrough*/ \
-    case 4: (cp)[3] = 0; /*-fallthrough*/ \
-    case 3: (cp)[2] = 0; /*-fallthrough*/ \
-    case 2: (cp)[1] = 0; /*-fallthrough*/ \
-    case 1: (cp)[0] = 0; (cp) += (n); /*-fallthrough*/ \
-    case 0:  ;			\
-    }
-#endif
+#define FILL(n, cp)                                                            \
+    for (int32_t ifill = 0; ifill < (n); ++ifill)                              \
+    {                                                                          \
+        (cp)[ifill] = 0xff;                                                    \
+    }                                                                          \
+    (cp) += (n);
+
+#define ZERO(n, cp)                                                            \
+    for (int32_t izero = 0; izero < (n); ++izero)                              \
+    {                                                                          \
+        (cp)[izero] = 0;                                                       \
+    }                                                                          \
+    (cp) += (n);
 
 /*
  * Bit-fill a row according to the white/black
  * runs generated during G3/G4 decoding.
  */
-void
-_TIFFFax3fillruns(unsigned char* buf, uint32* runs, uint32* erun, uint32 lastx)
+void _TIFFFax3fillruns(unsigned char *buf, uint32_t *runs, uint32_t *erun,
+                       uint32_t lastx)
 {
-	static const unsigned char _fillmasks[] =
-	    { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
-	unsigned char* cp;
-	uint32 x, bx, run;
-	int32 n, nw;
-	long* lp;
-
-	if ((erun-runs)&1)
-	    *erun++ = 0;
-	x = 0;
-	for (; runs < erun; runs += 2) {
-	    run = runs[0];
-	    if (x+run > lastx || run > lastx )
-		run = runs[0] = (uint32) (lastx - x);
-	    if (run) {
-		cp = buf + (x>>3);
-		bx = x&7;
-		if (run > 8-bx) {
-		    if (bx) {			/* align to byte boundary */
-			*cp++ &= 0xff << (8-bx);
-			run -= 8-bx;
-		    }
-		    if( (n = run >> 3) != 0 ) {	/* multiple bytes to fill */
-			if ((n/sizeof (long)) > 1) {
-			    /*
-			     * Align to longword boundary and fill.
-			     */
-			    for (; n && !isAligned(cp, long); n--)
-				    *cp++ = 0x00;
-			    lp = (long*) cp;
-			    nw = (int32)(n / sizeof (long));
-			    n -= nw * sizeof (long);
-			    do {
-				    *lp++ = 0L;
-			    } while (--nw);
-			    cp = (unsigned char*) lp;
-			}
-			ZERO(n, cp);
-			run &= 7;
-		    }
-		    if (run)
-			cp[0] &= 0xff >> run;
-		} else
-		    cp[0] &= ~(_fillmasks[run]>>bx);
-		x += runs[0];
-	    }
-	    run = runs[1];
-	    if (x+run > lastx || run > lastx )
-		run = runs[1] = lastx - x;
-	    if (run) {
-		cp = buf + (x>>3);
-		bx = x&7;
-		if (run > 8-bx) {
-		    if (bx) {			/* align to byte boundary */
-			*cp++ |= 0xff >> bx;
-			run -= 8-bx;
-		    }
-		    if( (n = run>>3) != 0 ) {	/* multiple bytes to fill */
-			if ((n/sizeof (long)) > 1) {
-			    /*
-			     * Align to longword boundary and fill.
-			     */
-			    for (; n && !isAligned(cp, long); n--)
-				*cp++ = 0xff;
-			    lp = (long*) cp;
-			    nw = (int32)(n / sizeof (long));
-			    n -= nw * sizeof (long);
-			    do {
-				*lp++ = -1L;
-			    } while (--nw);
-			    cp = (unsigned char*) lp;
-			}
-			FILL(n, cp);
-			run &= 7;
-		    }
-                    /* Explicit 0xff masking to make icc -check=conversions happy */
-		    if (run)
-			cp[0] = (unsigned char)((cp[0] | (0xff00 >> run))&0xff);
-		} else
-		    cp[0] |= _fillmasks[run]>>bx;
-		x += runs[1];
-	    }
-	}
-	assert(x == lastx);
+    static const unsigned char _fillmasks[] = {0x00, 0x80, 0xc0, 0xe0, 0xf0,
+                                               0xf8, 0xfc, 0xfe, 0xff};
+    unsigned char *cp;
+    uint32_t x, bx, run;
+    int32_t n, nw;
+    int64_t *lp;
+
+    if ((erun - runs) & 1)
+        *erun++ = 0;
+    x = 0;
+    for (; runs < erun; runs += 2)
+    {
+        run = runs[0];
+        if (x + run > lastx || run > lastx)
+            run = runs[0] = (uint32_t)(lastx - x);
+        if (run)
+        {
+            cp = buf + (x >> 3);
+            bx = x & 7;
+            if (run > 8 - bx)
+            {
+                if (bx)
+                { /* align to byte boundary */
+                    *cp++ &= 0xff << (8 - bx);
+                    run -= 8 - bx;
+                }
+                if ((n = run >> 3) != 0)
+                { /* multiple bytes to fill */
+                    if ((n / sizeof(int64_t)) > 1)
+                    {
+                        /*
+                         * Align to int64_tword boundary and fill.
+                         */
+                        for (; n && !isAligned(cp, int64_t); n--)
+                            *cp++ = 0x00;
+                        lp = (int64_t *)cp;
+                        nw = (int32_t)(n / sizeof(int64_t));
+                        n -= nw * sizeof(int64_t);
+                        do
+                        {
+                            *lp++ = 0L;
+                        } while (--nw);
+                        cp = (unsigned char *)lp;
+                    }
+                    ZERO(n, cp);
+                    run &= 7;
+                }
+                if (run)
+                    cp[0] &= 0xff >> run;
+            }
+            else
+                cp[0] &= ~(_fillmasks[run] >> bx);
+            x += runs[0];
+        }
+        run = runs[1];
+        if (x + run > lastx || run > lastx)
+            run = runs[1] = lastx - x;
+        if (run)
+        {
+            cp = buf + (x >> 3);
+            bx = x & 7;
+            if (run > 8 - bx)
+            {
+                if (bx)
+                { /* align to byte boundary */
+                    *cp++ |= 0xff >> bx;
+                    run -= 8 - bx;
+                }
+                if ((n = run >> 3) != 0)
+                { /* multiple bytes to fill */
+                    if ((n / sizeof(int64_t)) > 1)
+                    {
+                        /*
+                         * Align to int64_t boundary and fill.
+                         */
+                        for (; n && !isAligned(cp, int64_t); n--)
+                            *cp++ = 0xff;
+                        lp = (int64_t *)cp;
+                        nw = (int32_t)(n / sizeof(int64_t));
+                        n -= nw * sizeof(int64_t);
+                        do
+                        {
+                            *lp++ = -1L;
+                        } while (--nw);
+                        cp = (unsigned char *)lp;
+                    }
+                    FILL(n, cp);
+                    run &= 7;
+                }
+                /* Explicit 0xff masking to make icc -check=conversions happy */
+                if (run)
+                    cp[0] = (unsigned char)((cp[0] | (0xff00 >> run)) & 0xff);
+            }
+            else
+                cp[0] |= _fillmasks[run] >> bx;
+            x += runs[1];
+        }
+    }
+    assert(x == lastx);
 }
-#undef	ZERO
-#undef	FILL
+#undef ZERO
+#undef FILL
 
-static int
-Fax3FixupTags(TIFF* tif)
+static int Fax3FixupTags(TIFF *tif)
 {
-	(void) tif;
-	return (1);
+    (void)tif;
+    return (1);
 }
 
 /*
@@ -506,175 +486,188 @@ Fax3FixupTags(TIFF* tif)
  * or not decoding or encoding is being done and whether
  * 1D- or 2D-encoded data is involved.
  */
-static int
-Fax3SetupState(TIFF* tif)
+static int Fax3SetupState(TIFF *tif)
 {
-	static const char module[] = "Fax3SetupState";
-	TIFFDirectory* td = &tif->tif_dir;
-	Fax3BaseState* sp = Fax3State(tif);
-	int needsRefLine;
-	Fax3CodecState* dsp = (Fax3CodecState*) Fax3State(tif);
-	tmsize_t rowbytes;
-	uint32 rowpixels;
-
-	if (td->td_bitspersample != 1) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Bits/sample must be 1 for Group 3/4 encoding/decoding");
-		return (0);
-	}
-	/*
-	 * Calculate the scanline/tile widths.
-	 */
-	if (isTiled(tif)) {
-		rowbytes = TIFFTileRowSize(tif);
-		rowpixels = td->td_tilewidth;
-	} else {
-		rowbytes = TIFFScanlineSize(tif);
-		rowpixels = td->td_imagewidth;
-	}
-	if ((uint64)rowbytes < ((uint64)rowpixels + 7) / 8)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module,
-			"Inconsistent number of bytes per row : rowbytes=%lu rowpixels=%lu",
-			(unsigned long)(rowbytes), (unsigned long)(rowpixels));
-		return (0);
-	}
-	sp->rowbytes = rowbytes;
-	sp->rowpixels = rowpixels;
-	/*
-	 * Allocate any additional space required for decoding/encoding.
-	 */
-	needsRefLine = (
-	    (sp->groupoptions & GROUP3OPT_2DENCODING) ||
-	    td->td_compression == COMPRESSION_CCITTFAX4
-	);
-
-	/*
-	  Assure that allocation computations do not overflow.
-	  
-	  TIFFroundup and TIFFSafeMultiply return zero on integer overflow
-	*/
-	dsp->runs=(uint32*) NULL;
-	dsp->nruns = TIFFroundup_32(rowpixels,32);
-	if (needsRefLine) {
-		dsp->nruns = TIFFSafeMultiply(uint32,dsp->nruns,2);
-	}
-	if ((dsp->nruns == 0) || (TIFFSafeMultiply(uint32,dsp->nruns,2) == 0)) {
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-			     "Row pixels integer overflow (rowpixels %u)",
-			     rowpixels);
-		return (0);
-	}
-	dsp->runs = (uint32*) _TIFFCheckMalloc(tif,
-					       TIFFSafeMultiply(uint32,dsp->nruns,2),
-					       sizeof (uint32),
-					       "for Group 3/4 run arrays");
-	if (dsp->runs == NULL)
-		return (0);
-	memset( dsp->runs, 0, TIFFSafeMultiply(uint32,dsp->nruns,2)*sizeof(uint32));
-	dsp->curruns = dsp->runs;
-	if (needsRefLine)
-		dsp->refruns = dsp->runs + dsp->nruns;
-	else
-		dsp->refruns = NULL;
-	if (td->td_compression == COMPRESSION_CCITTFAX3
-	    && is2DEncoding(dsp)) {	/* NB: default is 1D routine */
-		tif->tif_decoderow = Fax3Decode2D;
-		tif->tif_decodestrip = Fax3Decode2D;
-		tif->tif_decodetile = Fax3Decode2D;
-	}
-
-	if (needsRefLine) {		/* 2d encoding */
-		Fax3CodecState* esp = EncoderState(tif);
-		/*
-		 * 2d encoding requires a scanline
-		 * buffer for the ``reference line''; the
-		 * scanline against which delta encoding
-		 * is referenced.  The reference line must
-		 * be initialized to be ``white'' (done elsewhere).
-		 */
-		esp->refline = (unsigned char*) _TIFFmalloc(rowbytes);
-		if (esp->refline == NULL) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "No space for Group 3/4 reference line");
-			return (0);
-		}
-	} else					/* 1d encoding */
-		EncoderState(tif)->refline = NULL;
-
-	return (1);
+    static const char module[] = "Fax3SetupState";
+    TIFFDirectory *td = &tif->tif_dir;
+    Fax3BaseState *sp = Fax3State(tif);
+    int needsRefLine;
+    Fax3CodecState *dsp = (Fax3CodecState *)Fax3State(tif);
+    tmsize_t rowbytes;
+    uint32_t rowpixels;
+
+    if (td->td_bitspersample != 1)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Bits/sample must be 1 for Group 3/4 encoding/decoding");
+        return (0);
+    }
+    /*
+     * Calculate the scanline/tile widths.
+     */
+    if (isTiled(tif))
+    {
+        rowbytes = TIFFTileRowSize(tif);
+        rowpixels = td->td_tilewidth;
+    }
+    else
+    {
+        rowbytes = TIFFScanlineSize(tif);
+        rowpixels = td->td_imagewidth;
+    }
+    if ((int64_t)rowbytes < ((int64_t)rowpixels + 7) / 8)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Inconsistent number of bytes per row : rowbytes=%" PRId64
+                      " rowpixels=%" PRIu32,
+                      (int64_t)rowbytes, rowpixels);
+        return (0);
+    }
+    sp->rowbytes = rowbytes;
+    sp->rowpixels = rowpixels;
+    /*
+     * Allocate any additional space required for decoding/encoding.
+     */
+    needsRefLine = ((sp->groupoptions & GROUP3OPT_2DENCODING) ||
+                    td->td_compression == COMPRESSION_CCITTFAX4);
+
+    /*
+      Assure that allocation computations do not overflow.
+
+      TIFFroundup and TIFFSafeMultiply return zero on integer overflow
+    */
+    dsp->runs = (uint32_t *)NULL;
+    dsp->nruns = TIFFroundup_32(rowpixels + 1, 32);
+    if (needsRefLine)
+    {
+        dsp->nruns = TIFFSafeMultiply(uint32_t, dsp->nruns, 2);
+    }
+    if ((dsp->nruns == 0) || (TIFFSafeMultiply(uint32_t, dsp->nruns, 2) == 0))
+    {
+        TIFFErrorExtR(tif, tif->tif_name,
+                      "Row pixels integer overflow (rowpixels %" PRIu32 ")",
+                      rowpixels);
+        return (0);
+    }
+    dsp->runs = (uint32_t *)_TIFFCheckMalloc(
+        tif, TIFFSafeMultiply(uint32_t, dsp->nruns, 2), sizeof(uint32_t),
+        "for Group 3/4 run arrays");
+    if (dsp->runs == NULL)
+        return (0);
+    memset(dsp->runs, 0,
+           TIFFSafeMultiply(uint32_t, dsp->nruns, 2) * sizeof(uint32_t));
+    dsp->curruns = dsp->runs;
+    if (needsRefLine)
+        dsp->refruns = dsp->runs + dsp->nruns;
+    else
+        dsp->refruns = NULL;
+    if (td->td_compression == COMPRESSION_CCITTFAX3 && is2DEncoding(dsp))
+    { /* NB: default is 1D routine */
+        tif->tif_decoderow = Fax3Decode2D;
+        tif->tif_decodestrip = Fax3Decode2D;
+        tif->tif_decodetile = Fax3Decode2D;
+    }
+
+    if (needsRefLine)
+    { /* 2d encoding */
+        Fax3CodecState *esp = EncoderState(tif);
+        /*
+         * 2d encoding requires a scanline
+         * buffer for the ``reference line''; the
+         * scanline against which delta encoding
+         * is referenced.  The reference line must
+         * be initialized to be ``white'' (done elsewhere).
+         */
+        esp->refline = (unsigned char *)_TIFFmallocExt(tif, rowbytes);
+        if (esp->refline == NULL)
+        {
+            TIFFErrorExtR(tif, module, "No space for Group 3/4 reference line");
+            return (0);
+        }
+    }
+    else /* 1d encoding */
+        EncoderState(tif)->refline = NULL;
+
+    return (1);
 }
 
 /*
  * CCITT Group 3 FAX Encoding.
  */
 
-#define	Fax3FlushBits(tif, sp) {				\
-	if ((tif)->tif_rawcc >= (tif)->tif_rawdatasize) {	\
-		if( !TIFFFlushData1(tif) )			\
-			return 0;				\
-        }							\
-	*(tif)->tif_rawcp++ = (uint8) (sp)->data;		\
-	(tif)->tif_rawcc++;					\
-	(sp)->data = 0, (sp)->bit = 8;				\
-}
-#define	_FlushBits(tif) {					\
-	if ((tif)->tif_rawcc >= (tif)->tif_rawdatasize) {	\
-		if( !TIFFFlushData1(tif) )			\
-			return 0;				\
-        }							\
-	*(tif)->tif_rawcp++ = (uint8) data;		\
-	(tif)->tif_rawcc++;					\
-	data = 0, bit = 8;					\
-}
-static const int _msbmask[9] =
-    { 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
-#define	_PutBits(tif, bits, length) {				\
-	while (length > bit) {					\
-		data |= bits >> (length - bit);			\
-		length -= bit;					\
-		_FlushBits(tif);				\
-	}							\
-        assert( length < 9 );                                   \
-	data |= (bits & _msbmask[length]) << (bit - length);	\
-	bit -= length;						\
-	if (bit == 0)						\
-		_FlushBits(tif);				\
-}
-	
+#define Fax3FlushBits(tif, sp)                                                 \
+    {                                                                          \
+        if ((tif)->tif_rawcc >= (tif)->tif_rawdatasize)                        \
+        {                                                                      \
+            if (!TIFFFlushData1(tif))                                          \
+                return 0;                                                      \
+        }                                                                      \
+        *(tif)->tif_rawcp++ = (uint8_t)(sp)->data;                             \
+        (tif)->tif_rawcc++;                                                    \
+        (sp)->data = 0, (sp)->bit = 8;                                         \
+    }
+#define _FlushBits(tif)                                                        \
+    {                                                                          \
+        if ((tif)->tif_rawcc >= (tif)->tif_rawdatasize)                        \
+        {                                                                      \
+            if (!TIFFFlushData1(tif))                                          \
+                return 0;                                                      \
+        }                                                                      \
+        *(tif)->tif_rawcp++ = (uint8_t)data;                                   \
+        (tif)->tif_rawcc++;                                                    \
+        data = 0, bit = 8;                                                     \
+    }
+static const int _msbmask[9] = {0x00, 0x01, 0x03, 0x07, 0x0f,
+                                0x1f, 0x3f, 0x7f, 0xff};
+#define _PutBits(tif, bits, length)                                            \
+    {                                                                          \
+        while (length > bit)                                                   \
+        {                                                                      \
+            data |= bits >> (length - bit);                                    \
+            length -= bit;                                                     \
+            _FlushBits(tif);                                                   \
+        }                                                                      \
+        assert(length < 9);                                                    \
+        data |= (bits & _msbmask[length]) << (bit - length);                   \
+        bit -= length;                                                         \
+        if (bit == 0)                                                          \
+            _FlushBits(tif);                                                   \
+    }
+
 /*
  * Write a variable-length bit-value to
  * the output stream.  Values are
  * assumed to be at most 16 bits.
  */
-static int
-Fax3PutBits(TIFF* tif, unsigned int bits, unsigned int length)
+static int Fax3PutBits(TIFF *tif, unsigned int bits, unsigned int length)
 {
-	Fax3CodecState* sp = EncoderState(tif);
-	unsigned int bit = sp->bit;
-	int data = sp->data;
+    Fax3CodecState *sp = EncoderState(tif);
+    unsigned int bit = sp->bit;
+    int data = sp->data;
 
-	_PutBits(tif, bits, length);
+    _PutBits(tif, bits, length);
 
-	sp->data = data;
-	sp->bit = bit;
-        return 1;
+    sp->data = data;
+    sp->bit = bit;
+    return 1;
 }
 
 /*
  * Write a code to the output stream.
  */
-#define putcode(tif, te)	Fax3PutBits(tif, (te)->code, (te)->length)
+#define putcode(tif, te) Fax3PutBits(tif, (te)->code, (te)->length)
 
 #ifdef FAX3_DEBUG
-#define	DEBUG_COLOR(w) (tab == TIFFFaxWhiteCodes ? w "W" : w "B")
-#define	DEBUG_PRINT(what,len) {						\
-    int t;								\
-    printf("%08X/%-2d: %s%5d\t", data, bit, DEBUG_COLOR(what), len);	\
-    for (t = length-1; t >= 0; t--)					\
-	putchar(code & (1<<t) ? '1' : '0');				\
-    putchar('\n');							\
-}
+#define DEBUG_COLOR(w) (tab == TIFFFaxWhiteCodes ? w "W" : w "B")
+#define DEBUG_PRINT(what, len)                                                 \
+    {                                                                          \
+        int t;                                                                 \
+        printf("%08" PRIX32 "/%-2d: %s%5d\t", data, bit, DEBUG_COLOR(what),    \
+               len);                                                           \
+        for (t = length - 1; t >= 0; t--)                                      \
+            putchar(code & (1 << t) ? '1' : '0');                              \
+        putchar('\n');                                                         \
+    }
 #endif
 
 /*
@@ -683,46 +676,47 @@ Fax3PutBits(TIFF* tif, unsigned int bits, unsigned int length)
  * appropriate table that holds the make-up and
  * terminating codes is supplied.
  */
-static int
-putspan(TIFF* tif, int32 span, const tableentry* tab)
+static int putspan(TIFF *tif, int32_t span, const tableentry *tab)
 {
-	Fax3CodecState* sp = EncoderState(tif);
-	unsigned int bit = sp->bit;
-	int data = sp->data;
-	unsigned int code, length;
-
-	while (span >= 2624) {
-		const tableentry* te = &tab[63 + (2560>>6)];
-		code = te->code;
-		length = te->length;
+    Fax3CodecState *sp = EncoderState(tif);
+    unsigned int bit = sp->bit;
+    int data = sp->data;
+    unsigned int code, length;
+
+    while (span >= 2624)
+    {
+        const tableentry *te = &tab[63 + (2560 >> 6)];
+        code = te->code;
+        length = te->length;
 #ifdef FAX3_DEBUG
-		DEBUG_PRINT("MakeUp", te->runlen);
+        DEBUG_PRINT("MakeUp", te->runlen);
 #endif
-		_PutBits(tif, code, length);
-		span -= te->runlen;
-	}
-	if (span >= 64) {
-		const tableentry* te = &tab[63 + (span>>6)];
-		assert(te->runlen == 64*(span>>6));
-		code = te->code;
-		length = te->length;
+        _PutBits(tif, code, length);
+        span -= te->runlen;
+    }
+    if (span >= 64)
+    {
+        const tableentry *te = &tab[63 + (span >> 6)];
+        assert(te->runlen == 64 * (span >> 6));
+        code = te->code;
+        length = te->length;
 #ifdef FAX3_DEBUG
-		DEBUG_PRINT("MakeUp", te->runlen);
+        DEBUG_PRINT("MakeUp", te->runlen);
 #endif
-		_PutBits(tif, code, length);
-		span -= te->runlen;
-	}
-	code = tab[span].code;
-	length = tab[span].length;
+        _PutBits(tif, code, length);
+        span -= te->runlen;
+    }
+    code = tab[span].code;
+    length = tab[span].length;
 #ifdef FAX3_DEBUG
-	DEBUG_PRINT("  Term", tab[span].runlen);
+    DEBUG_PRINT("  Term", tab[span].runlen);
 #endif
-	_PutBits(tif, code, length);
+    _PutBits(tif, code, length);
 
-	sp->data = data;
-	sp->bit = bit;
+    sp->data = data;
+    sp->bit = bit;
 
-        return 1;
+    return 1;
 }
 
 /*
@@ -731,260 +725,266 @@ putspan(TIFF* tif, int32 span, const tableentry* tab)
  * here.  We also handle writing the tag bit for the next
  * scanline when doing 2d encoding.
  */
-static int
-Fax3PutEOL(TIFF* tif)
+static int Fax3PutEOL(TIFF *tif)
 {
-	Fax3CodecState* sp = EncoderState(tif);
-	unsigned int bit = sp->bit;
-	int data = sp->data;
-	unsigned int code, length, tparm;
-
-	if (sp->b.groupoptions & GROUP3OPT_FILLBITS) {
-		/*
-		 * Force bit alignment so EOL will terminate on
-		 * a byte boundary.  That is, force the bit alignment
-		 * to 16-12 = 4 before putting out the EOL code.
-		 */
-		int align = 8 - 4;
-		if (align != sp->bit) {
-			if (align > sp->bit)
-				align = sp->bit + (8 - align);
-			else
-				align = sp->bit - align;
-			tparm=align; 
-			_PutBits(tif, 0, tparm);
-		}
-	}
-	code = EOL;
-	length = 12;
-	if (is2DEncoding(sp)) {
-		code = (code<<1) | (sp->tag == G3_1D);
-		length++;
-	}
-	_PutBits(tif, code, length);
-
-	sp->data = data;
-	sp->bit = bit;
-
-        return 1;
+    Fax3CodecState *sp = EncoderState(tif);
+    unsigned int bit = sp->bit;
+    int data = sp->data;
+    unsigned int code, length, tparm;
+
+    if (sp->b.groupoptions & GROUP3OPT_FILLBITS)
+    {
+        /*
+         * Force bit alignment so EOL will terminate on
+         * a byte boundary.  That is, force the bit alignment
+         * to 16-12 = 4 before putting out the EOL code.
+         */
+        int align = 8 - 4;
+        if (align != sp->bit)
+        {
+            if (align > sp->bit)
+                align = sp->bit + (8 - align);
+            else
+                align = sp->bit - align;
+            tparm = align;
+            _PutBits(tif, 0, tparm);
+        }
+    }
+    code = EOL;
+    length = 12;
+    if (is2DEncoding(sp))
+    {
+        code = (code << 1) | (sp->tag == G3_1D);
+        length++;
+    }
+    _PutBits(tif, code, length);
+
+    sp->data = data;
+    sp->bit = bit;
+
+    return 1;
 }
 
 /*
  * Reset encoding state at the start of a strip.
  */
-static int
-Fax3PreEncode(TIFF* tif, uint16 s)
+static int Fax3PreEncode(TIFF *tif, uint16_t s)
 {
-	Fax3CodecState* sp = EncoderState(tif);
-
-	(void) s;
-	assert(sp != NULL);
-	sp->bit = 8;
-	sp->data = 0;
-	sp->tag = G3_1D;
-	/*
-	 * This is necessary for Group 4; otherwise it isn't
-	 * needed because the first scanline of each strip ends
-	 * up being copied into the refline.
-	 */
-	if (sp->refline)
-		_TIFFmemset(sp->refline, 0x00, sp->b.rowbytes);
-	if (is2DEncoding(sp)) {
-		float res = tif->tif_dir.td_yresolution;
-		/*
-		 * The CCITT spec says that when doing 2d encoding, you
-		 * should only do it on K consecutive scanlines, where K
-		 * depends on the resolution of the image being encoded
-		 * (2 for <= 200 lpi, 4 for > 200 lpi).  Since the directory
-		 * code initializes td_yresolution to 0, this code will
-		 * select a K of 2 unless the YResolution tag is set
-		 * appropriately.  (Note also that we fudge a little here
-		 * and use 150 lpi to avoid problems with units conversion.)
-		 */
-		if (tif->tif_dir.td_resolutionunit == RESUNIT_CENTIMETER)
-			res *= 2.54f;		/* convert to inches */
-		sp->maxk = (res > 150 ? 4 : 2);
-		sp->k = sp->maxk-1;
-	} else
-		sp->k = sp->maxk = 0;
-	sp->line = 0;
-	return (1);
+    Fax3CodecState *sp = EncoderState(tif);
+
+    (void)s;
+    assert(sp != NULL);
+    sp->bit = 8;
+    sp->data = 0;
+    sp->tag = G3_1D;
+    /*
+     * This is necessary for Group 4; otherwise it isn't
+     * needed because the first scanline of each strip ends
+     * up being copied into the refline.
+     */
+    if (sp->refline)
+        _TIFFmemset(sp->refline, 0x00, sp->b.rowbytes);
+    if (is2DEncoding(sp))
+    {
+        float res = tif->tif_dir.td_yresolution;
+        /*
+         * The CCITT spec says that when doing 2d encoding, you
+         * should only do it on K consecutive scanlines, where K
+         * depends on the resolution of the image being encoded
+         * (2 for <= 200 lpi, 4 for > 200 lpi).  Since the directory
+         * code initializes td_yresolution to 0, this code will
+         * select a K of 2 unless the YResolution tag is set
+         * appropriately.  (Note also that we fudge a little here
+         * and use 150 lpi to avoid problems with units conversion.)
+         */
+        if (tif->tif_dir.td_resolutionunit == RESUNIT_CENTIMETER)
+            res *= 2.54f; /* convert to inches */
+        sp->maxk = (res > 150 ? 4 : 2);
+        sp->k = sp->maxk - 1;
+    }
+    else
+        sp->k = sp->maxk = 0;
+    sp->line = 0;
+    return (1);
 }
 
 static const unsigned char zeroruns[256] = {
-    8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,	/* 0x00 - 0x0f */
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* 0x10 - 0x1f */
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0x20 - 0x2f */
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0x30 - 0x3f */
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x40 - 0x4f */
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x50 - 0x5f */
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x60 - 0x6f */
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x70 - 0x7f */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x80 - 0x8f */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x90 - 0x9f */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xa0 - 0xaf */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xb0 - 0xbf */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xc0 - 0xcf */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xd0 - 0xdf */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xe0 - 0xef */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xf0 - 0xff */
+    8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, /* 0x00 - 0x0f */
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0x10 - 0x1f */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x20 - 0x2f */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x30 - 0x3f */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 - 0x4f */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50 - 0x5f */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60 - 0x6f */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70 - 0x7f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 - 0x9f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xa0 - 0xaf */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0 - 0xbf */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xc0 - 0xcf */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xd0 - 0xdf */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xe0 - 0xef */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xf0 - 0xff */
 };
 static const unsigned char oneruns[256] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00 - 0x0f */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10 - 0x1f */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x20 - 0x2f */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x30 - 0x3f */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x40 - 0x4f */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x50 - 0x5f */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x60 - 0x6f */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x70 - 0x7f */
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x80 - 0x8f */
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0x90 - 0x9f */
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0xa0 - 0xaf */
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0xb0 - 0xbf */
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0xc0 - 0xcf */
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0xd0 - 0xdf */
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* 0xe0 - 0xef */
-    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 8,	/* 0xf0 - 0xff */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 - 0x0f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 - 0x1f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x4f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 - 0x5f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 - 0x6f */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 - 0x7f */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80 - 0x8f */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90 - 0x9f */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xa0 - 0xaf */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xb0 - 0xbf */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xc0 - 0xcf */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xd0 - 0xdf */
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xe0 - 0xef */
+    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 8, /* 0xf0 - 0xff */
 };
 
-/*
- * On certain systems it pays to inline
- * the routines that find pixel spans.
- */
-#ifdef VAXC
-static	int32 find0span(unsigned char*, int32, int32);
-static	int32 find1span(unsigned char*, int32, int32);
-#pragma inline(find0span,find1span)
-#endif
-
 /*
  * Find a span of ones or zeros using the supplied
  * table.  The ``base'' of the bit string is supplied
  * along with the start+end bit indices.
  */
-inline static int32
-find0span(unsigned char* bp, int32 bs, int32 be)
+static inline int32_t find0span(unsigned char *bp, int32_t bs, int32_t be)
 {
-	int32 bits = be - bs;
-	int32 n, span;
-
-	bp += bs>>3;
-	/*
-	 * Check partial byte on lhs.
-	 */
-	if (bits > 0 && (n = (bs & 7)) != 0) {
-		span = zeroruns[(*bp << n) & 0xff];
-		if (span > 8-n)		/* table value too generous */
-			span = 8-n;
-		if (span > bits)	/* constrain span to bit range */
-			span = bits;
-		if (n+span < 8)		/* doesn't extend to edge of byte */
-			return (span);
-		bits -= span;
-		bp++;
-	} else
-		span = 0;
-	if (bits >= (int32)(2 * 8 * sizeof(long))) {
-		long* lp;
-		/*
-		 * Align to longword boundary and check longwords.
-		 */
-		while (!isAligned(bp, long)) {
-			if (*bp != 0x00)
-				return (span + zeroruns[*bp]);
-			span += 8;
-			bits -= 8;
-			bp++;
-		}
-		lp = (long*) bp;
-		while ((bits >= (int32)(8 * sizeof(long))) && (0 == *lp)) {
-			span += 8*sizeof (long);
-			bits -= 8*sizeof (long);
-			lp++;
-		}
-		bp = (unsigned char*) lp;
-	}
-	/*
-	 * Scan full bytes for all 0's.
-	 */
-	while (bits >= 8) {
-		if (*bp != 0x00)	/* end of run */
-			return (span + zeroruns[*bp]);
-		span += 8;
-		bits -= 8;
-		bp++;
-	}
-	/*
-	 * Check partial byte on rhs.
-	 */
-	if (bits > 0) {
-		n = zeroruns[*bp];
-		span += (n > bits ? bits : n);
-	}
-	return (span);
+    int32_t bits = be - bs;
+    int32_t n, span;
+
+    bp += bs >> 3;
+    /*
+     * Check partial byte on lhs.
+     */
+    if (bits > 0 && (n = (bs & 7)) != 0)
+    {
+        span = zeroruns[(*bp << n) & 0xff];
+        if (span > 8 - n) /* table value too generous */
+            span = 8 - n;
+        if (span > bits) /* constrain span to bit range */
+            span = bits;
+        if (n + span < 8) /* doesn't extend to edge of byte */
+            return (span);
+        bits -= span;
+        bp++;
+    }
+    else
+        span = 0;
+    if (bits >= (int32_t)(2 * 8 * sizeof(int64_t)))
+    {
+        int64_t *lp;
+        /*
+         * Align to int64_t boundary and check int64_t words.
+         */
+        while (!isAligned(bp, int64_t))
+        {
+            if (*bp != 0x00)
+                return (span + zeroruns[*bp]);
+            span += 8;
+            bits -= 8;
+            bp++;
+        }
+        lp = (int64_t *)bp;
+        while ((bits >= (int32_t)(8 * sizeof(int64_t))) && (0 == *lp))
+        {
+            span += 8 * sizeof(int64_t);
+            bits -= 8 * sizeof(int64_t);
+            lp++;
+        }
+        bp = (unsigned char *)lp;
+    }
+    /*
+     * Scan full bytes for all 0's.
+     */
+    while (bits >= 8)
+    {
+        if (*bp != 0x00) /* end of run */
+            return (span + zeroruns[*bp]);
+        span += 8;
+        bits -= 8;
+        bp++;
+    }
+    /*
+     * Check partial byte on rhs.
+     */
+    if (bits > 0)
+    {
+        n = zeroruns[*bp];
+        span += (n > bits ? bits : n);
+    }
+    return (span);
 }
 
-inline static int32
-find1span(unsigned char* bp, int32 bs, int32 be)
+static inline int32_t find1span(unsigned char *bp, int32_t bs, int32_t be)
 {
-	int32 bits = be - bs;
-	int32 n, span;
-
-	bp += bs>>3;
-	/*
-	 * Check partial byte on lhs.
-	 */
-	if (bits > 0 && (n = (bs & 7)) != 0) {
-		span = oneruns[(*bp << n) & 0xff];
-		if (span > 8-n)		/* table value too generous */
-			span = 8-n;
-		if (span > bits)	/* constrain span to bit range */
-			span = bits;
-		if (n+span < 8)		/* doesn't extend to edge of byte */
-			return (span);
-		bits -= span;
-		bp++;
-	} else
-		span = 0;
-	if (bits >= (int32)(2 * 8 * sizeof(long))) {
-		long* lp;
-		/*
-		 * Align to longword boundary and check longwords.
-		 */
-		while (!isAligned(bp, long)) {
-			if (*bp != 0xff)
-				return (span + oneruns[*bp]);
-			span += 8;
-			bits -= 8;
-			bp++;
-		}
-		lp = (long*) bp;
-		while ((bits >= (int32)(8 * sizeof(long))) && (~0 == *lp)) {
-			span += 8*sizeof (long);
-			bits -= 8*sizeof (long);
-			lp++;
-		}
-		bp = (unsigned char*) lp;
-	}
-	/*
-	 * Scan full bytes for all 1's.
-	 */
-	while (bits >= 8) {
-		if (*bp != 0xff)	/* end of run */
-			return (span + oneruns[*bp]);
-		span += 8;
-		bits -= 8;
-		bp++;
-	}
-	/*
-	 * Check partial byte on rhs.
-	 */
-	if (bits > 0) {
-		n = oneruns[*bp];
-		span += (n > bits ? bits : n);
-	}
-	return (span);
+    int32_t bits = be - bs;
+    int32_t n, span;
+
+    bp += bs >> 3;
+    /*
+     * Check partial byte on lhs.
+     */
+    if (bits > 0 && (n = (bs & 7)) != 0)
+    {
+        span = oneruns[(*bp << n) & 0xff];
+        if (span > 8 - n) /* table value too generous */
+            span = 8 - n;
+        if (span > bits) /* constrain span to bit range */
+            span = bits;
+        if (n + span < 8) /* doesn't extend to edge of byte */
+            return (span);
+        bits -= span;
+        bp++;
+    }
+    else
+        span = 0;
+    if (bits >= (int32_t)(2 * 8 * sizeof(int64_t)))
+    {
+        int64_t *lp;
+        /*
+         * Align to int64_t boundary and check int64_t words.
+         */
+        while (!isAligned(bp, int64_t))
+        {
+            if (*bp != 0xff)
+                return (span + oneruns[*bp]);
+            span += 8;
+            bits -= 8;
+            bp++;
+        }
+        lp = (int64_t *)bp;
+        while ((bits >= (int32_t)(8 * sizeof(int64_t))) &&
+               (~((uint64_t)0) == (uint64_t)*lp))
+        {
+            span += 8 * sizeof(int64_t);
+            bits -= 8 * sizeof(int64_t);
+            lp++;
+        }
+        bp = (unsigned char *)lp;
+    }
+    /*
+     * Scan full bytes for all 1's.
+     */
+    while (bits >= 8)
+    {
+        if (*bp != 0xff) /* end of run */
+            return (span + oneruns[*bp]);
+        span += 8;
+        bits -= 8;
+        bp++;
+    }
+    /*
+     * Check partial byte on rhs.
+     */
+    if (bits > 0)
+    {
+        n = oneruns[*bp];
+        span += (n > bits ? bits : n);
+    }
+    return (span);
 }
 
 /*
@@ -993,474 +993,501 @@ find1span(unsigned char* bp, int32 bs, int32 be)
  * color.  The end, be, is returned if no such bit
  * exists.
  */
-#define	finddiff(_cp, _bs, _be, _color)	\
-	(_bs + (_color ? find1span(_cp,_bs,_be) : find0span(_cp,_bs,_be)))
+#define finddiff(_cp, _bs, _be, _color)                                        \
+    (_bs + (_color ? find1span(_cp, _bs, _be) : find0span(_cp, _bs, _be)))
 /*
  * Like finddiff, but also check the starting bit
  * against the end in case start > end.
  */
-#define	finddiff2(_cp, _bs, _be, _color) \
-	(_bs < _be ? finddiff(_cp,_bs,_be,_color) : _be)
+#define finddiff2(_cp, _bs, _be, _color)                                       \
+    (_bs < _be ? finddiff(_cp, _bs, _be, _color) : _be)
 
 /*
  * 1d-encode a row of pixels.  The encoding is
  * a sequence of all-white or all-black spans
  * of pixels encoded with Huffman codes.
  */
-static int
-Fax3Encode1DRow(TIFF* tif, unsigned char* bp, uint32 bits)
+static int Fax3Encode1DRow(TIFF *tif, unsigned char *bp, uint32_t bits)
 {
-	Fax3CodecState* sp = EncoderState(tif);
-	int32 span;
-        uint32 bs = 0;
-
-	for (;;) {
-		span = find0span(bp, bs, bits);		/* white span */
-		if( !putspan(tif, span, TIFFFaxWhiteCodes) )
-                    return 0;
-		bs += span;
-		if (bs >= bits)
-			break;
-		span = find1span(bp, bs, bits);		/* black span */
-		if( !putspan(tif, span, TIFFFaxBlackCodes) )
-                    return 0;
-		bs += span;
-		if (bs >= bits)
-			break;
-	}
-	if (sp->b.mode & (FAXMODE_BYTEALIGN|FAXMODE_WORDALIGN)) {
-		if (sp->bit != 8)			/* byte-align */
-			Fax3FlushBits(tif, sp);
-		if ((sp->b.mode&FAXMODE_WORDALIGN) &&
-		    !isAligned(tif->tif_rawcp, uint16))
-			Fax3FlushBits(tif, sp);
-	}
-	return (1);
+    Fax3CodecState *sp = EncoderState(tif);
+    int32_t span;
+    uint32_t bs = 0;
+
+    for (;;)
+    {
+        span = find0span(bp, bs, bits); /* white span */
+        if (!putspan(tif, span, TIFFFaxWhiteCodes))
+            return 0;
+        bs += span;
+        if (bs >= bits)
+            break;
+        span = find1span(bp, bs, bits); /* black span */
+        if (!putspan(tif, span, TIFFFaxBlackCodes))
+            return 0;
+        bs += span;
+        if (bs >= bits)
+            break;
+    }
+    if (sp->b.mode & (FAXMODE_BYTEALIGN | FAXMODE_WORDALIGN))
+    {
+        if (sp->bit != 8) /* byte-align */
+            Fax3FlushBits(tif, sp);
+        if ((sp->b.mode & FAXMODE_WORDALIGN) &&
+            !isAligned(tif->tif_rawcp, uint16_t))
+            Fax3FlushBits(tif, sp);
+    }
+    return (1);
 }
 
-static const tableentry horizcode =
-    { 3, 0x1, 0 };	/* 001 */
-static const tableentry passcode =
-    { 4, 0x1, 0 };	/* 0001 */
+static const tableentry horizcode = {3, 0x1, 0}; /* 001 */
+static const tableentry passcode = {4, 0x1, 0};  /* 0001 */
 static const tableentry vcodes[7] = {
-    { 7, 0x03, 0 },	/* 0000 011 */
-    { 6, 0x03, 0 },	/* 0000 11 */
-    { 3, 0x03, 0 },	/* 011 */
-    { 1, 0x1, 0 },	/* 1 */
-    { 3, 0x2, 0 },	/* 010 */
-    { 6, 0x02, 0 },	/* 0000 10 */
-    { 7, 0x02, 0 }	/* 0000 010 */
+    {7, 0x03, 0}, /* 0000 011 */
+    {6, 0x03, 0}, /* 0000 11 */
+    {3, 0x03, 0}, /* 011 */
+    {1, 0x1, 0},  /* 1 */
+    {3, 0x2, 0},  /* 010 */
+    {6, 0x02, 0}, /* 0000 10 */
+    {7, 0x02, 0}  /* 0000 010 */
 };
 
 /*
  * 2d-encode a row of pixels.  Consult the CCITT
  * documentation for the algorithm.
  */
-static int
-Fax3Encode2DRow(TIFF* tif, unsigned char* bp, unsigned char* rp, uint32 bits)
+static int Fax3Encode2DRow(TIFF *tif, unsigned char *bp, unsigned char *rp,
+                           uint32_t bits)
 {
-#define	PIXEL(buf,ix)	((((buf)[(ix)>>3]) >> (7-((ix)&7))) & 1)
-        uint32 a0 = 0;
-	uint32 a1 = (PIXEL(bp, 0) != 0 ? 0 : finddiff(bp, 0, bits, 0));
-	uint32 b1 = (PIXEL(rp, 0) != 0 ? 0 : finddiff(rp, 0, bits, 0));
-	uint32 a2, b2;
-
-	for (;;) {
-		b2 = finddiff2(rp, b1, bits, PIXEL(rp,b1));
-		if (b2 >= a1) {
-			/* Naive computation triggers -fsanitize=undefined,unsigned-integer-overflow */
-			/* although it is correct unless the difference between both is < 31 bit */
-			/* int32 d = b1 - a1; */
-			int32 d = (b1 >= a1 && b1 - a1 <= 3U) ? (int32)(b1 - a1):
-			          (b1 < a1 && a1 - b1 <= 3U) ? -(int32)(a1 - b1) : 0x7FFFFFFF;
-			if (!(-3 <= d && d <= 3)) {	/* horizontal mode */
-				a2 = finddiff2(bp, a1, bits, PIXEL(bp,a1));
-				if( !putcode(tif, &horizcode) )
-                                    return 0;
-				if (a0+a1 == 0 || PIXEL(bp, a0) == 0) {
-					if( !putspan(tif, a1-a0, TIFFFaxWhiteCodes) )
-                                            return 0;
-					if( !putspan(tif, a2-a1, TIFFFaxBlackCodes) )
-                                            return 0;
-				} else {
-					if( !putspan(tif, a1-a0, TIFFFaxBlackCodes) )
-                                            return 0;
-					if( !putspan(tif, a2-a1, TIFFFaxWhiteCodes) )
-                                            return 0;
-				}
-				a0 = a2;
-			} else {			/* vertical mode */
-				if( !putcode(tif, &vcodes[d+3]) )
-                                    return 0;
-				a0 = a1;
-			}
-		} else {				/* pass mode */
-			if( !putcode(tif, &passcode) )
-                            return 0;
-			a0 = b2;
-		}
-		if (a0 >= bits)
-			break;
-		a1 = finddiff(bp, a0, bits, PIXEL(bp,a0));
-		b1 = finddiff(rp, a0, bits, !PIXEL(bp,a0));
-		b1 = finddiff(rp, b1, bits, PIXEL(bp,a0));
-	}
-	return (1);
+#define PIXEL(buf, ix) ((((buf)[(ix) >> 3]) >> (7 - ((ix)&7))) & 1)
+    uint32_t a0 = 0;
+    uint32_t a1 = (PIXEL(bp, 0) != 0 ? 0 : finddiff(bp, 0, bits, 0));
+    uint32_t b1 = (PIXEL(rp, 0) != 0 ? 0 : finddiff(rp, 0, bits, 0));
+    uint32_t a2, b2;
+
+    for (;;)
+    {
+        b2 = finddiff2(rp, b1, bits, PIXEL(rp, b1));
+        if (b2 >= a1)
+        {
+            /* Naive computation triggers
+             * -fsanitize=undefined,unsigned-integer-overflow */
+            /* although it is correct unless the difference between both is < 31
+             * bit */
+            /* int32_t d = b1 - a1; */
+            int32_t d = (b1 >= a1 && b1 - a1 <= 3U)  ? (int32_t)(b1 - a1)
+                        : (b1 < a1 && a1 - b1 <= 3U) ? -(int32_t)(a1 - b1)
+                                                     : 0x7FFFFFFF;
+            if (!(-3 <= d && d <= 3))
+            { /* horizontal mode */
+                a2 = finddiff2(bp, a1, bits, PIXEL(bp, a1));
+                if (!putcode(tif, &horizcode))
+                    return 0;
+                if (a0 + a1 == 0 || PIXEL(bp, a0) == 0)
+                {
+                    if (!putspan(tif, a1 - a0, TIFFFaxWhiteCodes))
+                        return 0;
+                    if (!putspan(tif, a2 - a1, TIFFFaxBlackCodes))
+                        return 0;
+                }
+                else
+                {
+                    if (!putspan(tif, a1 - a0, TIFFFaxBlackCodes))
+                        return 0;
+                    if (!putspan(tif, a2 - a1, TIFFFaxWhiteCodes))
+                        return 0;
+                }
+                a0 = a2;
+            }
+            else
+            { /* vertical mode */
+                if (!putcode(tif, &vcodes[d + 3]))
+                    return 0;
+                a0 = a1;
+            }
+        }
+        else
+        { /* pass mode */
+            if (!putcode(tif, &passcode))
+                return 0;
+            a0 = b2;
+        }
+        if (a0 >= bits)
+            break;
+        a1 = finddiff(bp, a0, bits, PIXEL(bp, a0));
+        b1 = finddiff(rp, a0, bits, !PIXEL(bp, a0));
+        b1 = finddiff(rp, b1, bits, PIXEL(bp, a0));
+    }
+    return (1);
 #undef PIXEL
 }
 
 /*
  * Encode a buffer of pixels.
  */
-static int
-Fax3Encode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int Fax3Encode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	static const char module[] = "Fax3Encode";
-	Fax3CodecState* sp = EncoderState(tif);
-	(void) s;
-	if (cc % sp->b.rowbytes)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Fractional scanlines cannot be written");
-		return (0);
-	}
-	while (cc > 0) {
-		if ((sp->b.mode & FAXMODE_NOEOL) == 0)
-                {
-			if( !Fax3PutEOL(tif) )
-                            return 0;
-                }
-		if (is2DEncoding(sp)) {
-			if (sp->tag == G3_1D) {
-				if (!Fax3Encode1DRow(tif, bp, sp->b.rowpixels))
-					return (0);
-				sp->tag = G3_2D;
-			} else {
-				if (!Fax3Encode2DRow(tif, bp, sp->refline,
-				    sp->b.rowpixels))
-					return (0);
-				sp->k--;
-			}
-			if (sp->k == 0) {
-				sp->tag = G3_1D;
-				sp->k = sp->maxk-1;
-			} else
-				_TIFFmemcpy(sp->refline, bp, sp->b.rowbytes);
-		} else {
-			if (!Fax3Encode1DRow(tif, bp, sp->b.rowpixels))
-				return (0);
-		}
-		bp += sp->b.rowbytes;
-		cc -= sp->b.rowbytes;
-	}
-	return (1);
+    static const char module[] = "Fax3Encode";
+    Fax3CodecState *sp = EncoderState(tif);
+    (void)s;
+    if (cc % sp->b.rowbytes)
+    {
+        TIFFErrorExtR(tif, module, "Fractional scanlines cannot be written");
+        return (0);
+    }
+    while (cc > 0)
+    {
+        if ((sp->b.mode & FAXMODE_NOEOL) == 0)
+        {
+            if (!Fax3PutEOL(tif))
+                return 0;
+        }
+        if (is2DEncoding(sp))
+        {
+            if (sp->tag == G3_1D)
+            {
+                if (!Fax3Encode1DRow(tif, bp, sp->b.rowpixels))
+                    return (0);
+                sp->tag = G3_2D;
+            }
+            else
+            {
+                if (!Fax3Encode2DRow(tif, bp, sp->refline, sp->b.rowpixels))
+                    return (0);
+                sp->k--;
+            }
+            if (sp->k == 0)
+            {
+                sp->tag = G3_1D;
+                sp->k = sp->maxk - 1;
+            }
+            else
+                _TIFFmemcpy(sp->refline, bp, sp->b.rowbytes);
+        }
+        else
+        {
+            if (!Fax3Encode1DRow(tif, bp, sp->b.rowpixels))
+                return (0);
+        }
+        bp += sp->b.rowbytes;
+        cc -= sp->b.rowbytes;
+    }
+    return (1);
 }
 
-static int
-Fax3PostEncode(TIFF* tif)
+static int Fax3PostEncode(TIFF *tif)
 {
-	Fax3CodecState* sp = EncoderState(tif);
+    Fax3CodecState *sp = EncoderState(tif);
 
-	if (sp->bit != 8)
-		Fax3FlushBits(tif, sp);
-	return (1);
+    if (sp->bit != 8)
+        Fax3FlushBits(tif, sp);
+    return (1);
 }
 
-static int
-_Fax3Close(TIFF* tif)
+static int _Fax3Close(TIFF *tif)
 {
-	if ((Fax3State(tif)->mode & FAXMODE_NORTC) == 0 && tif->tif_rawcp) {
-		Fax3CodecState* sp = EncoderState(tif);
-		unsigned int code = EOL;
-		unsigned int length = 12;
-		int i;
-
-		if (is2DEncoding(sp)) {
-			code = (code<<1) | (sp->tag == G3_1D);
-			length++;
-		}
-		for (i = 0; i < 6; i++)
-			Fax3PutBits(tif, code, length);
-		Fax3FlushBits(tif, sp);
-	}
-	return 1;
+    if ((Fax3State(tif)->mode & FAXMODE_NORTC) == 0 && tif->tif_rawcp)
+    {
+        Fax3CodecState *sp = EncoderState(tif);
+        unsigned int code = EOL;
+        unsigned int length = 12;
+        int i;
+
+        if (is2DEncoding(sp))
+        {
+            code = (code << 1) | (sp->tag == G3_1D);
+            length++;
+        }
+        for (i = 0; i < 6; i++)
+            Fax3PutBits(tif, code, length);
+        Fax3FlushBits(tif, sp);
+    }
+    return 1;
 }
 
-static void
-Fax3Close(TIFF* tif)
-{
-    _Fax3Close(tif);
-}
+static void Fax3Close(TIFF *tif) { _Fax3Close(tif); }
 
-static void
-Fax3Cleanup(TIFF* tif)
+static void Fax3Cleanup(TIFF *tif)
 {
-	Fax3CodecState* sp = DecoderState(tif);
-	
-	assert(sp != 0);
+    Fax3CodecState *sp = DecoderState(tif);
 
-	tif->tif_tagmethods.vgetfield = sp->b.vgetparent;
-	tif->tif_tagmethods.vsetfield = sp->b.vsetparent;
-	tif->tif_tagmethods.printdir = sp->b.printdir;
+    assert(sp != 0);
 
-	if (sp->runs)
-		_TIFFfree(sp->runs);
-	if (sp->refline)
-		_TIFFfree(sp->refline);
+    tif->tif_tagmethods.vgetfield = sp->b.vgetparent;
+    tif->tif_tagmethods.vsetfield = sp->b.vsetparent;
+    tif->tif_tagmethods.printdir = sp->b.printdir;
 
-	_TIFFfree(tif->tif_data);
-	tif->tif_data = NULL;
+    if (sp->runs)
+        _TIFFfreeExt(tif, sp->runs);
+    if (sp->refline)
+        _TIFFfreeExt(tif, sp->refline);
 
-	_TIFFSetDefaultCompressionState(tif);
+    _TIFFfreeExt(tif, tif->tif_data);
+    tif->tif_data = NULL;
+
+    _TIFFSetDefaultCompressionState(tif);
 }
 
-#define	FIELD_BADFAXLINES	(FIELD_CODEC+0)
-#define	FIELD_CLEANFAXDATA	(FIELD_CODEC+1)
-#define	FIELD_BADFAXRUN		(FIELD_CODEC+2)
+#define FIELD_BADFAXLINES (FIELD_CODEC + 0)
+#define FIELD_CLEANFAXDATA (FIELD_CODEC + 1)
+#define FIELD_BADFAXRUN (FIELD_CODEC + 2)
 
-#define	FIELD_OPTIONS		(FIELD_CODEC+7)
+#define FIELD_OPTIONS (FIELD_CODEC + 7)
 
 static const TIFFField faxFields[] = {
-    { TIFFTAG_FAXMODE, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE, "FaxMode", NULL },
-    { TIFFTAG_FAXFILLFUNC, 0, 0, TIFF_ANY, 0, TIFF_SETGET_OTHER, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE, "FaxFillFunc", NULL },
-    { TIFFTAG_BADFAXLINES, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UINT32, FIELD_BADFAXLINES, TRUE, FALSE, "BadFaxLines", NULL },
-    { TIFFTAG_CLEANFAXDATA, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UINT16, FIELD_CLEANFAXDATA, TRUE, FALSE, "CleanFaxData", NULL },
-    { TIFFTAG_CONSECUTIVEBADFAXLINES, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UINT32, FIELD_BADFAXRUN, TRUE, FALSE, "ConsecutiveBadFaxLines", NULL }};
+    {TIFFTAG_FAXMODE, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED,
+     FIELD_PSEUDO, FALSE, FALSE, "FaxMode", NULL},
+    {TIFFTAG_FAXFILLFUNC, 0, 0, TIFF_ANY, 0, TIFF_SETGET_OTHER,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE, "FaxFillFunc", NULL},
+    {TIFFTAG_BADFAXLINES, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32,
+     TIFF_SETGET_UINT32, FIELD_BADFAXLINES, TRUE, FALSE, "BadFaxLines", NULL},
+    {TIFFTAG_CLEANFAXDATA, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16,
+     TIFF_SETGET_UINT16, FIELD_CLEANFAXDATA, TRUE, FALSE, "CleanFaxData", NULL},
+    {TIFFTAG_CONSECUTIVEBADFAXLINES, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32,
+     TIFF_SETGET_UINT32, FIELD_BADFAXRUN, TRUE, FALSE, "ConsecutiveBadFaxLines",
+     NULL}};
 static const TIFFField fax3Fields[] = {
-    { TIFFTAG_GROUP3OPTIONS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UINT32, FIELD_OPTIONS, FALSE, FALSE, "Group3Options", NULL },
+    {TIFFTAG_GROUP3OPTIONS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32,
+     TIFF_SETGET_UINT32, FIELD_OPTIONS, FALSE, FALSE, "Group3Options", NULL},
 };
 static const TIFFField fax4Fields[] = {
-    { TIFFTAG_GROUP4OPTIONS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32, TIFF_SETGET_UINT32, FIELD_OPTIONS, FALSE, FALSE, "Group4Options", NULL },
+    {TIFFTAG_GROUP4OPTIONS, 1, 1, TIFF_LONG, 0, TIFF_SETGET_UINT32,
+     TIFF_SETGET_UINT32, FIELD_OPTIONS, FALSE, FALSE, "Group4Options", NULL},
 };
 
-static int
-Fax3VSetField(TIFF* tif, uint32 tag, va_list ap)
+static int Fax3VSetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	Fax3BaseState* sp = Fax3State(tif);
-	const TIFFField* fip;
-
-	assert(sp != 0);
-	assert(sp->vsetparent != 0);
-
-	switch (tag) {
-	case TIFFTAG_FAXMODE:
-		sp->mode = (int) va_arg(ap, int);
-		return 1;			/* NB: pseudo tag */
-	case TIFFTAG_FAXFILLFUNC:
-		DecoderState(tif)->fill = va_arg(ap, TIFFFaxFillFunc);
-		return 1;			/* NB: pseudo tag */
-	case TIFFTAG_GROUP3OPTIONS:
-		/* XXX: avoid reading options if compression mismatches. */
-		if (tif->tif_dir.td_compression == COMPRESSION_CCITTFAX3)
-			sp->groupoptions = (uint32) va_arg(ap, uint32);
-		break;
-	case TIFFTAG_GROUP4OPTIONS:
-		/* XXX: avoid reading options if compression mismatches. */
-		if (tif->tif_dir.td_compression == COMPRESSION_CCITTFAX4)
-			sp->groupoptions = (uint32) va_arg(ap, uint32);
-		break;
-	case TIFFTAG_BADFAXLINES:
-		sp->badfaxlines = (uint32) va_arg(ap, uint32);
-		break;
-	case TIFFTAG_CLEANFAXDATA:
-		sp->cleanfaxdata = (uint16) va_arg(ap, uint16_vap);
-		break;
-	case TIFFTAG_CONSECUTIVEBADFAXLINES:
-		sp->badfaxrun = (uint32) va_arg(ap, uint32);
-		break;
-	default:
-		return (*sp->vsetparent)(tif, tag, ap);
-	}
-	
-	if ((fip = TIFFFieldWithTag(tif, tag)) != NULL)
-		TIFFSetFieldBit(tif, fip->field_bit);
-	else
-		return 0;
-
-	tif->tif_flags |= TIFF_DIRTYDIRECT;
-	return 1;
+    Fax3BaseState *sp = Fax3State(tif);
+    const TIFFField *fip;
+
+    assert(sp != 0);
+    assert(sp->vsetparent != 0);
+
+    switch (tag)
+    {
+        case TIFFTAG_FAXMODE:
+            sp->mode = (int)va_arg(ap, int);
+            return 1; /* NB: pseudo tag */
+        case TIFFTAG_FAXFILLFUNC:
+            DecoderState(tif)->fill = va_arg(ap, TIFFFaxFillFunc);
+            return 1; /* NB: pseudo tag */
+        case TIFFTAG_GROUP3OPTIONS:
+            /* XXX: avoid reading options if compression mismatches. */
+            if (tif->tif_dir.td_compression == COMPRESSION_CCITTFAX3)
+                sp->groupoptions = (uint32_t)va_arg(ap, uint32_t);
+            break;
+        case TIFFTAG_GROUP4OPTIONS:
+            /* XXX: avoid reading options if compression mismatches. */
+            if (tif->tif_dir.td_compression == COMPRESSION_CCITTFAX4)
+                sp->groupoptions = (uint32_t)va_arg(ap, uint32_t);
+            break;
+        case TIFFTAG_BADFAXLINES:
+            sp->badfaxlines = (uint32_t)va_arg(ap, uint32_t);
+            break;
+        case TIFFTAG_CLEANFAXDATA:
+            sp->cleanfaxdata = (uint16_t)va_arg(ap, uint16_vap);
+            break;
+        case TIFFTAG_CONSECUTIVEBADFAXLINES:
+            sp->badfaxrun = (uint32_t)va_arg(ap, uint32_t);
+            break;
+        default:
+            return (*sp->vsetparent)(tif, tag, ap);
+    }
+
+    if ((fip = TIFFFieldWithTag(tif, tag)) != NULL)
+        TIFFSetFieldBit(tif, fip->field_bit);
+    else
+        return 0;
+
+    tif->tif_flags |= TIFF_DIRTYDIRECT;
+    return 1;
 }
 
-static int
-Fax3VGetField(TIFF* tif, uint32 tag, va_list ap)
+static int Fax3VGetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	Fax3BaseState* sp = Fax3State(tif);
-
-	assert(sp != 0);
-
-	switch (tag) {
-	case TIFFTAG_FAXMODE:
-		*va_arg(ap, int*) = sp->mode;
-		break;
-	case TIFFTAG_FAXFILLFUNC:
-		*va_arg(ap, TIFFFaxFillFunc*) = DecoderState(tif)->fill;
-		break;
-	case TIFFTAG_GROUP3OPTIONS:
-	case TIFFTAG_GROUP4OPTIONS:
-		*va_arg(ap, uint32*) = sp->groupoptions;
-		break;
-	case TIFFTAG_BADFAXLINES:
-		*va_arg(ap, uint32*) = sp->badfaxlines;
-		break;
-	case TIFFTAG_CLEANFAXDATA:
-		*va_arg(ap, uint16*) = sp->cleanfaxdata;
-		break;
-	case TIFFTAG_CONSECUTIVEBADFAXLINES:
-		*va_arg(ap, uint32*) = sp->badfaxrun;
-		break;
-	default:
-		return (*sp->vgetparent)(tif, tag, ap);
-	}
-	return (1);
+    Fax3BaseState *sp = Fax3State(tif);
+
+    assert(sp != 0);
+
+    switch (tag)
+    {
+        case TIFFTAG_FAXMODE:
+            *va_arg(ap, int *) = sp->mode;
+            break;
+        case TIFFTAG_FAXFILLFUNC:
+            *va_arg(ap, TIFFFaxFillFunc *) = DecoderState(tif)->fill;
+            break;
+        case TIFFTAG_GROUP3OPTIONS:
+        case TIFFTAG_GROUP4OPTIONS:
+            *va_arg(ap, uint32_t *) = sp->groupoptions;
+            break;
+        case TIFFTAG_BADFAXLINES:
+            *va_arg(ap, uint32_t *) = sp->badfaxlines;
+            break;
+        case TIFFTAG_CLEANFAXDATA:
+            *va_arg(ap, uint16_t *) = sp->cleanfaxdata;
+            break;
+        case TIFFTAG_CONSECUTIVEBADFAXLINES:
+            *va_arg(ap, uint32_t *) = sp->badfaxrun;
+            break;
+        default:
+            return (*sp->vgetparent)(tif, tag, ap);
+    }
+    return (1);
 }
 
-static void
-Fax3PrintDir(TIFF* tif, FILE* fd, long flags)
+static void Fax3PrintDir(TIFF *tif, FILE *fd, long flags)
 {
-	Fax3BaseState* sp = Fax3State(tif);
-
-	assert(sp != 0);
-
-	(void) flags;
-	if (TIFFFieldSet(tif,FIELD_OPTIONS)) {
-		const char* sep = " ";
-		if (tif->tif_dir.td_compression == COMPRESSION_CCITTFAX4) {
-			fprintf(fd, "  Group 4 Options:");
-			if (sp->groupoptions & GROUP4OPT_UNCOMPRESSED)
-				fprintf(fd, "%suncompressed data", sep);
-		} else {
-
-			fprintf(fd, "  Group 3 Options:");
-			if (sp->groupoptions & GROUP3OPT_2DENCODING) {
-				fprintf(fd, "%s2-d encoding", sep);
-				sep = "+";
-			}
-			if (sp->groupoptions & GROUP3OPT_FILLBITS) {
-				fprintf(fd, "%sEOL padding", sep);
-				sep = "+";
-			}
-			if (sp->groupoptions & GROUP3OPT_UNCOMPRESSED)
-				fprintf(fd, "%suncompressed data", sep);
-		}
-		fprintf(fd, " (%lu = 0x%lx)\n",
-                        (unsigned long) sp->groupoptions,
-                        (unsigned long) sp->groupoptions);
-	}
-	if (TIFFFieldSet(tif,FIELD_CLEANFAXDATA)) {
-		fprintf(fd, "  Fax Data:");
-		switch (sp->cleanfaxdata) {
-		case CLEANFAXDATA_CLEAN:
-			fprintf(fd, " clean");
-			break;
-		case CLEANFAXDATA_REGENERATED:
-			fprintf(fd, " receiver regenerated");
-			break;
-		case CLEANFAXDATA_UNCLEAN:
-			fprintf(fd, " uncorrected errors");
-			break;
-		}
-		fprintf(fd, " (%u = 0x%x)\n",
-		    sp->cleanfaxdata, sp->cleanfaxdata);
-	}
-	if (TIFFFieldSet(tif,FIELD_BADFAXLINES))
-		fprintf(fd, "  Bad Fax Lines: %lu\n",
-                        (unsigned long) sp->badfaxlines);
-	if (TIFFFieldSet(tif,FIELD_BADFAXRUN))
-		fprintf(fd, "  Consecutive Bad Fax Lines: %lu\n",
-		    (unsigned long) sp->badfaxrun);
-	if (sp->printdir)
-		(*sp->printdir)(tif, fd, flags);
+    Fax3BaseState *sp = Fax3State(tif);
+
+    assert(sp != 0);
+
+    (void)flags;
+    if (TIFFFieldSet(tif, FIELD_OPTIONS))
+    {
+        const char *sep = " ";
+        if (tif->tif_dir.td_compression == COMPRESSION_CCITTFAX4)
+        {
+            fprintf(fd, "  Group 4 Options:");
+            if (sp->groupoptions & GROUP4OPT_UNCOMPRESSED)
+                fprintf(fd, "%suncompressed data", sep);
+        }
+        else
+        {
+
+            fprintf(fd, "  Group 3 Options:");
+            if (sp->groupoptions & GROUP3OPT_2DENCODING)
+            {
+                fprintf(fd, "%s2-d encoding", sep);
+                sep = "+";
+            }
+            if (sp->groupoptions & GROUP3OPT_FILLBITS)
+            {
+                fprintf(fd, "%sEOL padding", sep);
+                sep = "+";
+            }
+            if (sp->groupoptions & GROUP3OPT_UNCOMPRESSED)
+                fprintf(fd, "%suncompressed data", sep);
+        }
+        fprintf(fd, " (%" PRIu32 " = 0x%" PRIx32 ")\n", sp->groupoptions,
+                sp->groupoptions);
+    }
+    if (TIFFFieldSet(tif, FIELD_CLEANFAXDATA))
+    {
+        fprintf(fd, "  Fax Data:");
+        switch (sp->cleanfaxdata)
+        {
+            case CLEANFAXDATA_CLEAN:
+                fprintf(fd, " clean");
+                break;
+            case CLEANFAXDATA_REGENERATED:
+                fprintf(fd, " receiver regenerated");
+                break;
+            case CLEANFAXDATA_UNCLEAN:
+                fprintf(fd, " uncorrected errors");
+                break;
+        }
+        fprintf(fd, " (%" PRIu16 " = 0x%" PRIx16 ")\n", sp->cleanfaxdata,
+                sp->cleanfaxdata);
+    }
+    if (TIFFFieldSet(tif, FIELD_BADFAXLINES))
+        fprintf(fd, "  Bad Fax Lines: %" PRIu32 "\n", sp->badfaxlines);
+    if (TIFFFieldSet(tif, FIELD_BADFAXRUN))
+        fprintf(fd, "  Consecutive Bad Fax Lines: %" PRIu32 "\n",
+                sp->badfaxrun);
+    if (sp->printdir)
+        (*sp->printdir)(tif, fd, flags);
 }
 
-static int
-InitCCITTFax3(TIFF* tif)
+static int InitCCITTFax3(TIFF *tif)
 {
-	static const char module[] = "InitCCITTFax3";
-	Fax3BaseState* sp;
-
-	/*
-	 * Merge codec-specific tag information.
-	 */
-	if (!_TIFFMergeFields(tif, faxFields, TIFFArrayCount(faxFields))) {
-		TIFFErrorExt(tif->tif_clientdata, "InitCCITTFax3",
-			"Merging common CCITT Fax codec-specific tags failed");
-		return 0;
-	}
-
-	/*
-	 * Allocate state block so tag methods have storage to record values.
-	 */
-	tif->tif_data = (uint8*)
-		_TIFFmalloc(sizeof (Fax3CodecState));
-
-	if (tif->tif_data == NULL) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "No space for state block");
-		return (0);
-	}
-	_TIFFmemset(tif->tif_data, 0, sizeof (Fax3CodecState));
-
-	sp = Fax3State(tif);
-        sp->rw_mode = tif->tif_mode;
-
-	/*
-	 * Override parent get/set field methods.
-	 */
-	sp->vgetparent = tif->tif_tagmethods.vgetfield;
-	tif->tif_tagmethods.vgetfield = Fax3VGetField; /* hook for codec tags */
-	sp->vsetparent = tif->tif_tagmethods.vsetfield;
-	tif->tif_tagmethods.vsetfield = Fax3VSetField; /* hook for codec tags */
-	sp->printdir = tif->tif_tagmethods.printdir;
-	tif->tif_tagmethods.printdir = Fax3PrintDir;   /* hook for codec tags */
-	sp->groupoptions = 0;	
-
-	if (sp->rw_mode == O_RDONLY) /* FIXME: improve for in place update */
-		tif->tif_flags |= TIFF_NOBITREV; /* decoder does bit reversal */
-	DecoderState(tif)->runs = NULL;
-	TIFFSetField(tif, TIFFTAG_FAXFILLFUNC, _TIFFFax3fillruns);
-	EncoderState(tif)->refline = NULL;
-
-	/*
-	 * Install codec methods.
-	 */
-	tif->tif_fixuptags = Fax3FixupTags;
-	tif->tif_setupdecode = Fax3SetupState;
-	tif->tif_predecode = Fax3PreDecode;
-	tif->tif_decoderow = Fax3Decode1D;
-	tif->tif_decodestrip = Fax3Decode1D;
-	tif->tif_decodetile = Fax3Decode1D;
-	tif->tif_setupencode = Fax3SetupState;
-	tif->tif_preencode = Fax3PreEncode;
-	tif->tif_postencode = Fax3PostEncode;
-	tif->tif_encoderow = Fax3Encode;
-	tif->tif_encodestrip = Fax3Encode;
-	tif->tif_encodetile = Fax3Encode;
-	tif->tif_close = Fax3Close;
-	tif->tif_cleanup = Fax3Cleanup;
-
-	return (1);
+    static const char module[] = "InitCCITTFax3";
+    Fax3BaseState *sp;
+
+    /*
+     * Merge codec-specific tag information.
+     */
+    if (!_TIFFMergeFields(tif, faxFields, TIFFArrayCount(faxFields)))
+    {
+        TIFFErrorExtR(tif, "InitCCITTFax3",
+                      "Merging common CCITT Fax codec-specific tags failed");
+        return 0;
+    }
+
+    /*
+     * Allocate state block so tag methods have storage to record values.
+     */
+    tif->tif_data = (uint8_t *)_TIFFmallocExt(tif, sizeof(Fax3CodecState));
+
+    if (tif->tif_data == NULL)
+    {
+        TIFFErrorExtR(tif, module, "No space for state block");
+        return (0);
+    }
+    _TIFFmemset(tif->tif_data, 0, sizeof(Fax3CodecState));
+
+    sp = Fax3State(tif);
+    sp->rw_mode = tif->tif_mode;
+
+    /*
+     * Override parent get/set field methods.
+     */
+    sp->vgetparent = tif->tif_tagmethods.vgetfield;
+    tif->tif_tagmethods.vgetfield = Fax3VGetField; /* hook for codec tags */
+    sp->vsetparent = tif->tif_tagmethods.vsetfield;
+    tif->tif_tagmethods.vsetfield = Fax3VSetField; /* hook for codec tags */
+    sp->printdir = tif->tif_tagmethods.printdir;
+    tif->tif_tagmethods.printdir = Fax3PrintDir; /* hook for codec tags */
+    sp->groupoptions = 0;
+
+    if (sp->rw_mode == O_RDONLY) /* FIXME: improve for in place update */
+        tif->tif_flags |= TIFF_NOBITREV; /* decoder does bit reversal */
+    DecoderState(tif)->runs = NULL;
+    TIFFSetField(tif, TIFFTAG_FAXFILLFUNC, _TIFFFax3fillruns);
+    EncoderState(tif)->refline = NULL;
+
+    /*
+     * Install codec methods.
+     */
+    tif->tif_fixuptags = Fax3FixupTags;
+    tif->tif_setupdecode = Fax3SetupState;
+    tif->tif_predecode = Fax3PreDecode;
+    tif->tif_decoderow = Fax3Decode1D;
+    tif->tif_decodestrip = Fax3Decode1D;
+    tif->tif_decodetile = Fax3Decode1D;
+    tif->tif_setupencode = Fax3SetupState;
+    tif->tif_preencode = Fax3PreEncode;
+    tif->tif_postencode = Fax3PostEncode;
+    tif->tif_encoderow = Fax3Encode;
+    tif->tif_encodestrip = Fax3Encode;
+    tif->tif_encodetile = Fax3Encode;
+    tif->tif_close = Fax3Close;
+    tif->tif_cleanup = Fax3Cleanup;
+
+    return (1);
 }
 
-int
-TIFFInitCCITTFax3(TIFF* tif, int scheme)
+int TIFFInitCCITTFax3(TIFF *tif, int scheme)
 {
-	(void) scheme;
-	if (InitCCITTFax3(tif)) {
-		/*
-		 * Merge codec-specific tag information.
-		 */
-		if (!_TIFFMergeFields(tif, fax3Fields,
-				      TIFFArrayCount(fax3Fields))) {
-			TIFFErrorExt(tif->tif_clientdata, "TIFFInitCCITTFax3",
-			"Merging CCITT Fax 3 codec-specific tags failed");
-			return 0;
-		}
-
-		/*
-		 * The default format is Class/F-style w/o RTC.
-		 */
-		return TIFFSetField(tif, TIFFTAG_FAXMODE, FAXMODE_CLASSF);
-	} else
-		return 01;
+    (void)scheme;
+    if (InitCCITTFax3(tif))
+    {
+        /*
+         * Merge codec-specific tag information.
+         */
+        if (!_TIFFMergeFields(tif, fax3Fields, TIFFArrayCount(fax3Fields)))
+        {
+            TIFFErrorExtR(tif, "TIFFInitCCITTFax3",
+                          "Merging CCITT Fax 3 codec-specific tags failed");
+            return 0;
+        }
+
+        /*
+         * The default format is Class/F-style w/o RTC.
+         */
+        return TIFFSetField(tif, TIFFTAG_FAXMODE, FAXMODE_CLASSF);
+    }
+    else
+        return 01;
 }
 
 /*
@@ -1468,138 +1495,146 @@ TIFFInitCCITTFax3(TIFF* tif, int scheme)
  * Compression Scheme Support.
  */
 
-#define SWAP(t,a,b) { t x; x = (a); (a) = (b); (b) = x; }
+#define SWAP(t, a, b)                                                          \
+    {                                                                          \
+        t x;                                                                   \
+        x = (a);                                                               \
+        (a) = (b);                                                             \
+        (b) = x;                                                               \
+    }
 /*
  * Decode the requested amount of G4-encoded data.
  */
-static int
-Fax4Decode(TIFF* tif, uint8* buf, tmsize_t occ, uint16 s)
+static int Fax4Decode(TIFF *tif, uint8_t *buf, tmsize_t occ, uint16_t s)
 {
-	DECLARE_STATE_2D(tif, sp, "Fax4Decode");
-	(void) s;
-	if (occ % sp->b.rowbytes)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Fractional scanlines cannot be read");
-		return (-1);
-	}
-	CACHE_STATE(tif, sp);
-	while (occ > 0) {
-		a0 = 0;
-		RunLength = 0;
-		pa = thisrun = sp->curruns;
-		pb = sp->refruns;
-		b1 = *pb++;
+    DECLARE_STATE_2D(tif, sp, "Fax4Decode");
+    (void)s;
+    if (occ % sp->b.rowbytes)
+    {
+        TIFFErrorExtR(tif, module, "Fractional scanlines cannot be read");
+        return (-1);
+    }
+    CACHE_STATE(tif, sp);
+    while (occ > 0)
+    {
+        a0 = 0;
+        RunLength = 0;
+        pa = thisrun = sp->curruns;
+        pb = sp->refruns;
+        b1 = *pb++;
 #ifdef FAX3_DEBUG
-		printf("\nBitAcc=%08X, BitsAvail = %d\n", BitAcc, BitsAvail);
-		printf("-------------------- %d\n", tif->tif_row);
-		fflush(stdout);
+        printf("\nBitAcc=%08" PRIX32 ", BitsAvail = %d\n", BitAcc, BitsAvail);
+        printf("-------------------- %d\n", tif->tif_row);
+        fflush(stdout);
 #endif
-		EXPAND2D(EOFG4);
-                if (EOLcnt)
-                    goto EOFG4;
-		if (((lastx + 7) >> 3) > (int)occ)	/* check for buffer overrun */
-		{
-			TIFFErrorExt(tif->tif_clientdata, module,
-			             "Buffer overrun detected : %d bytes available, %d bits needed",
-			             (int)occ, lastx);
-			return -1;
-		}
-		(*sp->fill)(buf, thisrun, pa, lastx);
-		SETVALUE(0);		/* imaginary change for reference */
-		SWAP(uint32*, sp->curruns, sp->refruns);
-		buf += sp->b.rowbytes;
-		occ -= sp->b.rowbytes;
-		sp->line++;
-		continue;
-	EOFG4:
-                NeedBits16( 13, BADG4 );
-        BADG4:
+        EXPAND2D(EOFG4);
+        if (EOLcnt)
+            goto EOFG4;
+        if (((lastx + 7) >> 3) > (int)occ) /* check for buffer overrun */
+        {
+            TIFFErrorExtR(tif, module,
+                          "Buffer overrun detected : %" TIFF_SSIZE_FORMAT
+                          " bytes available, %d bits needed",
+                          occ, lastx);
+            return -1;
+        }
+        (*sp->fill)(buf, thisrun, pa, lastx);
+        SETVALUE(0); /* imaginary change for reference */
+        SWAP(uint32_t *, sp->curruns, sp->refruns);
+        buf += sp->b.rowbytes;
+        occ -= sp->b.rowbytes;
+        sp->line++;
+        continue;
+    EOFG4:
+        NeedBits16(13, BADG4);
+    BADG4:
 #ifdef FAX3_DEBUG
-                if( GetBits(13) != 0x1001 )
-                    fputs( "Bad EOFB\n", stderr );
-#endif                
-                ClrBits( 13 );
-		if (((lastx + 7) >> 3) > (int)occ)	/* check for buffer overrun */
-		{
-			TIFFErrorExt(tif->tif_clientdata, module,
-			             "Buffer overrun detected : %d bytes available, %d bits needed",
-			             (int)occ, lastx);
-			return -1;
-		}
-		(*sp->fill)(buf, thisrun, pa, lastx);
-		UNCACHE_STATE(tif, sp);
-		return ( sp->line ? 1 : -1);	/* don't error on badly-terminated strips */
-	}
-	UNCACHE_STATE(tif, sp);
-	return (1);
+        if (GetBits(13) != 0x1001)
+            fputs("Bad EOFB\n", stderr);
+#endif
+        ClrBits(13);
+        if (((lastx + 7) >> 3) > (int)occ) /* check for buffer overrun */
+        {
+            TIFFErrorExtR(tif, module,
+                          "Buffer overrun detected : %" TIFF_SSIZE_FORMAT
+                          " bytes available, %d bits needed",
+                          occ, lastx);
+            return -1;
+        }
+        (*sp->fill)(buf, thisrun, pa, lastx);
+        UNCACHE_STATE(tif, sp);
+        return (sp->line ? 1 : -1); /* don't error on badly-terminated strips */
+    }
+    UNCACHE_STATE(tif, sp);
+    return (1);
 }
-#undef	SWAP
+#undef SWAP
 
 /*
  * Encode the requested amount of data.
  */
-static int
-Fax4Encode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int Fax4Encode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	static const char module[] = "Fax4Encode";
-	Fax3CodecState *sp = EncoderState(tif);
-	(void) s;
-	if (cc % sp->b.rowbytes)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Fractional scanlines cannot be written");
-		return (0);
-	}
-	while (cc > 0) {
-		if (!Fax3Encode2DRow(tif, bp, sp->refline, sp->b.rowpixels))
-			return (0);
-		_TIFFmemcpy(sp->refline, bp, sp->b.rowbytes);
-		bp += sp->b.rowbytes;
-		cc -= sp->b.rowbytes;
-	}
-	return (1);
+    static const char module[] = "Fax4Encode";
+    Fax3CodecState *sp = EncoderState(tif);
+    (void)s;
+    if (cc % sp->b.rowbytes)
+    {
+        TIFFErrorExtR(tif, module, "Fractional scanlines cannot be written");
+        return (0);
+    }
+    while (cc > 0)
+    {
+        if (!Fax3Encode2DRow(tif, bp, sp->refline, sp->b.rowpixels))
+            return (0);
+        _TIFFmemcpy(sp->refline, bp, sp->b.rowbytes);
+        bp += sp->b.rowbytes;
+        cc -= sp->b.rowbytes;
+    }
+    return (1);
 }
 
-static int
-Fax4PostEncode(TIFF* tif)
+static int Fax4PostEncode(TIFF *tif)
 {
-	Fax3CodecState *sp = EncoderState(tif);
-
-	/* terminate strip w/ EOFB */
-	Fax3PutBits(tif, EOL, 12);
-	Fax3PutBits(tif, EOL, 12);
-	if (sp->bit != 8)
-		Fax3FlushBits(tif, sp);
-	return (1);
+    Fax3CodecState *sp = EncoderState(tif);
+
+    /* terminate strip w/ EOFB */
+    Fax3PutBits(tif, EOL, 12);
+    Fax3PutBits(tif, EOL, 12);
+    if (sp->bit != 8)
+        Fax3FlushBits(tif, sp);
+    return (1);
 }
 
-int
-TIFFInitCCITTFax4(TIFF* tif, int scheme)
+int TIFFInitCCITTFax4(TIFF *tif, int scheme)
 {
-	(void) scheme;
-	if (InitCCITTFax3(tif)) {		/* reuse G3 support */
-		/*
-		 * Merge codec-specific tag information.
-		 */
-		if (!_TIFFMergeFields(tif, fax4Fields,
-				      TIFFArrayCount(fax4Fields))) {
-			TIFFErrorExt(tif->tif_clientdata, "TIFFInitCCITTFax4",
-			"Merging CCITT Fax 4 codec-specific tags failed");
-			return 0;
-		}
-
-		tif->tif_decoderow = Fax4Decode;
-		tif->tif_decodestrip = Fax4Decode;
-		tif->tif_decodetile = Fax4Decode;
-		tif->tif_encoderow = Fax4Encode;
-		tif->tif_encodestrip = Fax4Encode;
-		tif->tif_encodetile = Fax4Encode;
-		tif->tif_postencode = Fax4PostEncode;
-		/*
-		 * Suppress RTC at the end of each strip.
-		 */
-		return TIFFSetField(tif, TIFFTAG_FAXMODE, FAXMODE_NORTC);
-	} else
-		return (0);
+    (void)scheme;
+    if (InitCCITTFax3(tif))
+    { /* reuse G3 support */
+        /*
+         * Merge codec-specific tag information.
+         */
+        if (!_TIFFMergeFields(tif, fax4Fields, TIFFArrayCount(fax4Fields)))
+        {
+            TIFFErrorExtR(tif, "TIFFInitCCITTFax4",
+                          "Merging CCITT Fax 4 codec-specific tags failed");
+            return 0;
+        }
+
+        tif->tif_decoderow = Fax4Decode;
+        tif->tif_decodestrip = Fax4Decode;
+        tif->tif_decodetile = Fax4Decode;
+        tif->tif_encoderow = Fax4Encode;
+        tif->tif_encodestrip = Fax4Encode;
+        tif->tif_encodetile = Fax4Encode;
+        tif->tif_postencode = Fax4PostEncode;
+        /*
+         * Suppress RTC at the end of each strip.
+         */
+        return TIFFSetField(tif, TIFFTAG_FAXMODE, FAXMODE_NORTC);
+    }
+    else
+        return (0);
 }
 
 /*
@@ -1610,95 +1645,91 @@ TIFFInitCCITTFax4(TIFF* tif, int scheme)
 /*
  * Decode the requested amount of RLE-encoded data.
  */
-static int
-Fax3DecodeRLE(TIFF* tif, uint8* buf, tmsize_t occ, uint16 s)
+static int Fax3DecodeRLE(TIFF *tif, uint8_t *buf, tmsize_t occ, uint16_t s)
 {
-	DECLARE_STATE(tif, sp, "Fax3DecodeRLE");
-	int mode = sp->b.mode;
-	(void) s;
-	if (occ % sp->b.rowbytes)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Fractional scanlines cannot be read");
-		return (-1);
-	}
-	CACHE_STATE(tif, sp);
-	thisrun = sp->curruns;
-	while (occ > 0) {
-		a0 = 0;
-		RunLength = 0;
-		pa = thisrun;
+    DECLARE_STATE(tif, sp, "Fax3DecodeRLE");
+    int mode = sp->b.mode;
+    (void)s;
+    if (occ % sp->b.rowbytes)
+    {
+        TIFFErrorExtR(tif, module, "Fractional scanlines cannot be read");
+        return (-1);
+    }
+    CACHE_STATE(tif, sp);
+    thisrun = sp->curruns;
+    while (occ > 0)
+    {
+        a0 = 0;
+        RunLength = 0;
+        pa = thisrun;
 #ifdef FAX3_DEBUG
-		printf("\nBitAcc=%08X, BitsAvail = %d\n", BitAcc, BitsAvail);
-		printf("-------------------- %d\n", tif->tif_row);
-		fflush(stdout);
+        printf("\nBitAcc=%08" PRIX32 ", BitsAvail = %d\n", BitAcc, BitsAvail);
+        printf("-------------------- %" PRIu32 "\n", tif->tif_row);
+        fflush(stdout);
 #endif
-		EXPAND1D(EOFRLE);
-		(*sp->fill)(buf, thisrun, pa, lastx);
-		/*
-		 * Cleanup at the end of the row.
-		 */
-		if (mode & FAXMODE_BYTEALIGN) {
-			int n = BitsAvail - (BitsAvail &~ 7);
-			ClrBits(n);
-		} else if (mode & FAXMODE_WORDALIGN) {
-			int n = BitsAvail - (BitsAvail &~ 15);
-			ClrBits(n);
-			if (BitsAvail == 0 && !isAligned(cp, uint16))
-			    cp++;
-		}
-		buf += sp->b.rowbytes;
-		occ -= sp->b.rowbytes;
-		sp->line++;
-		continue;
-	EOFRLE:				/* premature EOF */
-		(*sp->fill)(buf, thisrun, pa, lastx);
-		UNCACHE_STATE(tif, sp);
-		return (-1);
-	}
-	UNCACHE_STATE(tif, sp);
-	return (1);
+        EXPAND1D(EOFRLE);
+        (*sp->fill)(buf, thisrun, pa, lastx);
+        /*
+         * Cleanup at the end of the row.
+         */
+        if (mode & FAXMODE_BYTEALIGN)
+        {
+            int n = BitsAvail - (BitsAvail & ~7);
+            ClrBits(n);
+        }
+        else if (mode & FAXMODE_WORDALIGN)
+        {
+            int n = BitsAvail - (BitsAvail & ~15);
+            ClrBits(n);
+            if (BitsAvail == 0 && !isAligned(cp, uint16_t))
+                cp++;
+        }
+        buf += sp->b.rowbytes;
+        occ -= sp->b.rowbytes;
+        sp->line++;
+        continue;
+    EOFRLE: /* premature EOF */
+        (*sp->fill)(buf, thisrun, pa, lastx);
+        UNCACHE_STATE(tif, sp);
+        return (-1);
+    }
+    UNCACHE_STATE(tif, sp);
+    return (1);
 }
 
-int
-TIFFInitCCITTRLE(TIFF* tif, int scheme)
+int TIFFInitCCITTRLE(TIFF *tif, int scheme)
 {
-	(void) scheme;
-	if (InitCCITTFax3(tif)) {		/* reuse G3 support */
-		tif->tif_decoderow = Fax3DecodeRLE;
-		tif->tif_decodestrip = Fax3DecodeRLE;
-		tif->tif_decodetile = Fax3DecodeRLE;
-		/*
-		 * Suppress RTC+EOLs when encoding and byte-align data.
-		 */
-		return TIFFSetField(tif, TIFFTAG_FAXMODE,
-		    FAXMODE_NORTC|FAXMODE_NOEOL|FAXMODE_BYTEALIGN);
-	} else
-		return (0);
+    (void)scheme;
+    if (InitCCITTFax3(tif))
+    { /* reuse G3 support */
+        tif->tif_decoderow = Fax3DecodeRLE;
+        tif->tif_decodestrip = Fax3DecodeRLE;
+        tif->tif_decodetile = Fax3DecodeRLE;
+        /*
+         * Suppress RTC+EOLs when encoding and byte-align data.
+         */
+        return TIFFSetField(tif, TIFFTAG_FAXMODE,
+                            FAXMODE_NORTC | FAXMODE_NOEOL | FAXMODE_BYTEALIGN);
+    }
+    else
+        return (0);
 }
 
-int
-TIFFInitCCITTRLEW(TIFF* tif, int scheme)
+int TIFFInitCCITTRLEW(TIFF *tif, int scheme)
 {
-	(void) scheme;
-	if (InitCCITTFax3(tif)) {		/* reuse G3 support */
-		tif->tif_decoderow = Fax3DecodeRLE;
-		tif->tif_decodestrip = Fax3DecodeRLE;
-		tif->tif_decodetile = Fax3DecodeRLE;  
-		/*
-		 * Suppress RTC+EOLs when encoding and word-align data.
-		 */
-		return TIFFSetField(tif, TIFFTAG_FAXMODE,
-		    FAXMODE_NORTC|FAXMODE_NOEOL|FAXMODE_WORDALIGN);
-	} else
-		return (0);
+    (void)scheme;
+    if (InitCCITTFax3(tif))
+    { /* reuse G3 support */
+        tif->tif_decoderow = Fax3DecodeRLE;
+        tif->tif_decodestrip = Fax3DecodeRLE;
+        tif->tif_decodetile = Fax3DecodeRLE;
+        /*
+         * Suppress RTC+EOLs when encoding and word-align data.
+         */
+        return TIFFSetField(tif, TIFFTAG_FAXMODE,
+                            FAXMODE_NORTC | FAXMODE_NOEOL | FAXMODE_WORDALIGN);
+    }
+    else
+        return (0);
 }
 #endif /* CCITT_SUPPORT */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_fax3.h b/3rdparty/libtiff/tif_fax3.h
index 701716cc181d..e095009bba4a 100644
--- a/3rdparty/libtiff/tif_fax3.h
+++ b/3rdparty/libtiff/tif_fax3.h
@@ -2,28 +2,28 @@
  * Copyright (c) 1990-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
 #ifndef _FAX3_
-#define	_FAX3_
+#define _FAX3_
 /*
  * TIFF Library.
  *
@@ -41,7 +41,7 @@
  * The routine must have the type signature given below;
  * for example:
  *
- * fillruns(unsigned char* buf, uint32* runs, uint32* erun, uint32 lastx)
+ * fillruns(unsigned char* buf, uint32_t* runs, uint32_t* erun, uint32_t lastx)
  *
  * where buf is place to set the bits, runs is the array of b&w run
  * lengths (white then black), erun is the last run in the array, and
@@ -50,41 +50,47 @@
  * data in the run array as needed (e.g. to append zero runs to bring
  * the count up to a nice multiple).
  */
-typedef void (*TIFFFaxFillFunc)(unsigned char*, uint32*, uint32*, uint32);
+typedef void (*TIFFFaxFillFunc)(unsigned char *, uint32_t *, uint32_t *,
+                                uint32_t);
 
 /*
  * The default run filler; made external for other decoders.
  */
 #if defined(__cplusplus)
-extern "C" {
+extern "C"
+{
 #endif
-extern void _TIFFFax3fillruns(unsigned char*, uint32*, uint32*, uint32);
+    extern void _TIFFFax3fillruns(unsigned char *, uint32_t *, uint32_t *,
+                                  uint32_t);
 #if defined(__cplusplus)
 }
 #endif
 
-
 /* finite state machine codes */
-#define S_Null     0
-#define S_Pass     1
-#define S_Horiz    2
-#define S_V0       3
-#define S_VR       4
-#define S_VL       5
-#define S_Ext      6
-#define S_TermW    7
-#define S_TermB    8
-#define S_MakeUpW  9
-#define S_MakeUpB  10
-#define S_MakeUp   11
-#define S_EOL      12
+#define S_Null 0
+#define S_Pass 1
+#define S_Horiz 2
+#define S_V0 3
+#define S_VR 4
+#define S_VL 5
+#define S_Ext 6
+#define S_TermW 7
+#define S_TermB 8
+#define S_MakeUpW 9
+#define S_MakeUpB 10
+#define S_MakeUp 11
+#define S_EOL 12
 
-/* WARNING: do not change the layout of this structure as the HylaFAX software */
-/* really depends on it. See http://bugzilla.maptools.org/show_bug.cgi?id=2636 */
-typedef struct {                /* state table entry */
-	unsigned char State;    /* see above */
-	unsigned char Width;    /* width of code in bits */
-	uint32 Param;           /* unsigned 32-bit run length in bits (holds on 16 bit actually, but cannot be changed. See above warning) */
+/* WARNING: do not change the layout of this structure as the HylaFAX software
+ */
+/* really depends on it. See http://bugzilla.maptools.org/show_bug.cgi?id=2636
+ */
+typedef struct
+{                        /* state table entry */
+    unsigned char State; /* see above */
+    unsigned char Width; /* width of code in bits */
+    uint32_t Param;      /* unsigned 32-bit run length in bits (holds on 16 bit
+                            actually, but cannot be changed. See above warning) */
 } TIFFFaxTabEnt;
 
 extern const TIFFFaxTabEnt TIFFFaxMainTable[];
@@ -108,7 +114,7 @@ extern const TIFFFaxTabEnt TIFFFaxBlackTable[];
  */
 
 #ifndef EndOfData
-#define EndOfData()	(cp >= ep)
+#define EndOfData() (cp >= ep)
 #endif
 /*
  * Need <=8 or <=16 bits of input data.  Unlike viewfax we
@@ -134,121 +140,143 @@ extern const TIFFFaxTabEnt TIFFFaxBlackTable[];
  * otherwise we should get the right answer.
  */
 #ifndef NeedBits8
-#define NeedBits8(n,eoflab) do {					\
-    if (BitsAvail < (n)) {						\
-	if (EndOfData()) {						\
-	    if (BitsAvail == 0)			/* no valid bits */	\
-		goto eoflab;						\
-	    BitsAvail = (n);			/* pad with zeros */	\
-	} else {							\
-	    BitAcc |= ((uint32) bitmap[*cp++])<<BitsAvail;		\
-	    BitsAvail += 8;						\
-	}								\
-    }									\
-} while (0)
+#define NeedBits8(n, eoflab)                                                   \
+    do                                                                         \
+    {                                                                          \
+        if (BitsAvail < (n))                                                   \
+        {                                                                      \
+            if (EndOfData())                                                   \
+            {                                                                  \
+                if (BitsAvail == 0) /* no valid bits */                        \
+                    goto eoflab;                                               \
+                BitsAvail = (n); /* pad with zeros */                          \
+            }                                                                  \
+            else                                                               \
+            {                                                                  \
+                BitAcc |= ((uint32_t)bitmap[*cp++]) << BitsAvail;              \
+                BitsAvail += 8;                                                \
+            }                                                                  \
+        }                                                                      \
+    } while (0)
 #endif
 #ifndef NeedBits16
-#define NeedBits16(n,eoflab) do {					\
-    if (BitsAvail < (n)) {						\
-	if (EndOfData()) {						\
-	    if (BitsAvail == 0)			/* no valid bits */	\
-		goto eoflab;						\
-	    BitsAvail = (n);			/* pad with zeros */	\
-	} else {							\
-	    BitAcc |= ((uint32) bitmap[*cp++])<<BitsAvail;		\
-	    if ((BitsAvail += 8) < (n)) {				\
-		if (EndOfData()) {					\
-		    /* NB: we know BitsAvail is non-zero here */	\
-		    BitsAvail = (n);		/* pad with zeros */	\
-		} else {						\
-		    BitAcc |= ((uint32) bitmap[*cp++])<<BitsAvail;	\
-		    BitsAvail += 8;					\
-		}							\
-	    }								\
-	}								\
-    }									\
-} while (0)
+#define NeedBits16(n, eoflab)                                                  \
+    do                                                                         \
+    {                                                                          \
+        if (BitsAvail < (n))                                                   \
+        {                                                                      \
+            if (EndOfData())                                                   \
+            {                                                                  \
+                if (BitsAvail == 0) /* no valid bits */                        \
+                    goto eoflab;                                               \
+                BitsAvail = (n); /* pad with zeros */                          \
+            }                                                                  \
+            else                                                               \
+            {                                                                  \
+                BitAcc |= ((uint32_t)bitmap[*cp++]) << BitsAvail;              \
+                if ((BitsAvail += 8) < (n))                                    \
+                {                                                              \
+                    if (EndOfData())                                           \
+                    {                                                          \
+                        /* NB: we know BitsAvail is non-zero here */           \
+                        BitsAvail = (n); /* pad with zeros */                  \
+                    }                                                          \
+                    else                                                       \
+                    {                                                          \
+                        BitAcc |= ((uint32_t)bitmap[*cp++]) << BitsAvail;      \
+                        BitsAvail += 8;                                        \
+                    }                                                          \
+                }                                                              \
+            }                                                                  \
+        }                                                                      \
+    } while (0)
 #endif
-#define GetBits(n)	(BitAcc & ((1<<(n))-1))
-#define ClrBits(n) do {							\
-    BitsAvail -= (n);							\
-    BitAcc >>= (n);							\
-} while (0)
+#define GetBits(n) (BitAcc & ((1 << (n)) - 1))
+#define ClrBits(n)                                                             \
+    do                                                                         \
+    {                                                                          \
+        BitsAvail -= (n);                                                      \
+        BitAcc >>= (n);                                                        \
+    } while (0)
 
 #ifdef FAX3_DEBUG
-static const char* StateNames[] = {
-    "Null   ",
-    "Pass   ",
-    "Horiz  ",
-    "V0     ",
-    "VR     ",
-    "VL     ",
-    "Ext    ",
-    "TermW  ",
-    "TermB  ",
-    "MakeUpW",
-    "MakeUpB",
-    "MakeUp ",
-    "EOL    ",
+static const char *StateNames[] = {
+    "Null   ", "Pass   ", "Horiz  ", "V0     ", "VR     ", "VL     ", "Ext    ",
+    "TermW  ", "TermB  ", "MakeUpW", "MakeUpB", "MakeUp ", "EOL    ",
 };
 #define DEBUG_SHOW putchar(BitAcc & (1 << t) ? '1' : '0')
-#define LOOKUP8(wid,tab,eoflab) do {					\
-    int t;								\
-    NeedBits8(wid,eoflab);						\
-    TabEnt = tab + GetBits(wid);					\
-    printf("%08lX/%d: %s%5d\t", (long) BitAcc, BitsAvail,		\
-	   StateNames[TabEnt->State], TabEnt->Param);			\
-    for (t = 0; t < TabEnt->Width; t++)					\
-	DEBUG_SHOW;							\
-    putchar('\n');							\
-    fflush(stdout);							\
-    ClrBits(TabEnt->Width);						\
-} while (0)
-#define LOOKUP16(wid,tab,eoflab) do {					\
-    int t;								\
-    NeedBits16(wid,eoflab);						\
-    TabEnt = tab + GetBits(wid);					\
-    printf("%08lX/%d: %s%5d\t", (long) BitAcc, BitsAvail,		\
-	   StateNames[TabEnt->State], TabEnt->Param);			\
-    for (t = 0; t < TabEnt->Width; t++)					\
-	DEBUG_SHOW;							\
-    putchar('\n');							\
-    fflush(stdout);							\
-    ClrBits(TabEnt->Width);						\
-} while (0)
+#define LOOKUP8(wid, tab, eoflab)                                              \
+    do                                                                         \
+    {                                                                          \
+        int t;                                                                 \
+        NeedBits8(wid, eoflab);                                                \
+        TabEnt = tab + GetBits(wid);                                           \
+        printf("%08lX/%d: %s%5d\t", (long)BitAcc, BitsAvail,                   \
+               StateNames[TabEnt->State], TabEnt->Param);                      \
+        for (t = 0; t < TabEnt->Width; t++)                                    \
+            DEBUG_SHOW;                                                        \
+        putchar('\n');                                                         \
+        fflush(stdout);                                                        \
+        ClrBits(TabEnt->Width);                                                \
+    } while (0)
+#define LOOKUP16(wid, tab, eoflab)                                             \
+    do                                                                         \
+    {                                                                          \
+        int t;                                                                 \
+        NeedBits16(wid, eoflab);                                               \
+        TabEnt = tab + GetBits(wid);                                           \
+        printf("%08lX/%d: %s%5d\t", (long)BitAcc, BitsAvail,                   \
+               StateNames[TabEnt->State], TabEnt->Param);                      \
+        for (t = 0; t < TabEnt->Width; t++)                                    \
+            DEBUG_SHOW;                                                        \
+        putchar('\n');                                                         \
+        fflush(stdout);                                                        \
+        ClrBits(TabEnt->Width);                                                \
+    } while (0)
 
-#define SETVALUE(x) do {							\
-    *pa++ = RunLength + (x);						\
-    printf("SETVALUE: %d\t%d\n", RunLength + (x), a0);			\
-    a0 += x;								\
-    RunLength = 0;							\
-} while (0)
+#define SETVALUE(x)                                                            \
+    do                                                                         \
+    {                                                                          \
+        *pa++ = RunLength + (x);                                               \
+        printf("SETVALUE: %d\t%d\n", RunLength + (x), a0);                     \
+        a0 += x;                                                               \
+        RunLength = 0;                                                         \
+    } while (0)
 #else
-#define LOOKUP8(wid,tab,eoflab) do {					\
-    NeedBits8(wid,eoflab);						\
-    TabEnt = tab + GetBits(wid);					\
-    ClrBits(TabEnt->Width);						\
-} while (0)
-#define LOOKUP16(wid,tab,eoflab) do {					\
-    NeedBits16(wid,eoflab);						\
-    TabEnt = tab + GetBits(wid);					\
-    ClrBits(TabEnt->Width);						\
-} while (0)
+#define LOOKUP8(wid, tab, eoflab)                                              \
+    do                                                                         \
+    {                                                                          \
+        NeedBits8(wid, eoflab);                                                \
+        TabEnt = tab + GetBits(wid);                                           \
+        ClrBits(TabEnt->Width);                                                \
+    } while (0)
+#define LOOKUP16(wid, tab, eoflab)                                             \
+    do                                                                         \
+    {                                                                          \
+        NeedBits16(wid, eoflab);                                               \
+        TabEnt = tab + GetBits(wid);                                           \
+        ClrBits(TabEnt->Width);                                                \
+    } while (0)
 
 /*
  * Append a run to the run length array for the
  * current row and reset decoding state.
  */
-#define SETVALUE(x) do {							\
-    if (pa >= thisrun + sp->nruns) {					\
-        TIFFErrorExt(tif->tif_clientdata, module, "Buffer overflow at line %u of %s %u",	\
-                    sp->line, isTiled(tif) ? "tile" : "strip", isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip);	\
-        return (-1);							\
-    }									\
-    *pa++ = RunLength + (x);						\
-    a0 += (x);								\
-    RunLength = 0;							\
-} while (0)
+#define SETVALUE(x)                                                            \
+    do                                                                         \
+    {                                                                          \
+        if (pa >= thisrun + sp->nruns)                                         \
+        {                                                                      \
+            TIFFErrorExtR(tif, module, "Buffer overflow at line %u of %s %u",  \
+                          sp->line, isTiled(tif) ? "tile" : "strip",           \
+                          isTiled(tif) ? tif->tif_curtile                      \
+                                       : tif->tif_curstrip);                   \
+            return (-1);                                                       \
+        }                                                                      \
+        *pa++ = RunLength + (x);                                               \
+        a0 += (x);                                                             \
+        RunLength = 0;                                                         \
+    } while (0)
 #endif
 
 /*
@@ -261,51 +289,62 @@ static const char* StateNames[] = {
  * is non-zero then we still need to scan for the final flag
  * bit that is part of the EOL code.
  */
-#define	SYNC_EOL(eoflab) do {						\
-    if (EOLcnt == 0) {							\
-	for (;;) {							\
-	    NeedBits16(11,eoflab);					\
-	    if (GetBits(11) == 0)					\
-		break;							\
-	    ClrBits(1);							\
-	}								\
-    }									\
-    for (;;) {								\
-	NeedBits8(8,eoflab);						\
-	if (GetBits(8))							\
-	    break;							\
-	ClrBits(8);							\
-    }									\
-    while (GetBits(1) == 0)						\
-	ClrBits(1);							\
-    ClrBits(1);				/* EOL bit */			\
-    EOLcnt = 0;				/* reset EOL counter/flag */	\
-} while (0)
+#define SYNC_EOL(eoflab)                                                       \
+    do                                                                         \
+    {                                                                          \
+        if (EOLcnt == 0)                                                       \
+        {                                                                      \
+            for (;;)                                                           \
+            {                                                                  \
+                NeedBits16(11, eoflab);                                        \
+                if (GetBits(11) == 0)                                          \
+                    break;                                                     \
+                ClrBits(1);                                                    \
+            }                                                                  \
+        }                                                                      \
+        for (;;)                                                               \
+        {                                                                      \
+            NeedBits8(8, eoflab);                                              \
+            if (GetBits(8))                                                    \
+                break;                                                         \
+            ClrBits(8);                                                        \
+        }                                                                      \
+        while (GetBits(1) == 0)                                                \
+            ClrBits(1);                                                        \
+        ClrBits(1); /* EOL bit */                                              \
+        EOLcnt = 0; /* reset EOL counter/flag */                               \
+    } while (0)
 
 /*
  * Cleanup the array of runs after decoding a row.
  * We adjust final runs to insure the user buffer is not
  * overwritten and/or undecoded area is white filled.
  */
-#define	CLEANUP_RUNS() do {						\
-    if (RunLength)							\
-	SETVALUE(0);							\
-    if (a0 != lastx) {							\
-	badlength(a0, lastx);						\
-	while (a0 > lastx && pa > thisrun)				\
-	    a0 -= *--pa;						\
-	if (a0 < lastx) {						\
-	    if (a0 < 0)							\
-		a0 = 0;							\
-	    if ((pa-thisrun)&1)						\
-		SETVALUE(0);						\
-	    SETVALUE(lastx - a0);						\
-	} else if (a0 > lastx) {					\
-	    SETVALUE(lastx);						\
-	    SETVALUE(0);							\
-	}								\
-    }									\
-} while (0)
+#define CLEANUP_RUNS()                                                         \
+    do                                                                         \
+    {                                                                          \
+        if (RunLength)                                                         \
+            SETVALUE(0);                                                       \
+        if (a0 != lastx)                                                       \
+        {                                                                      \
+            badlength(a0, lastx);                                              \
+            while (a0 > lastx && pa > thisrun)                                 \
+                a0 -= *--pa;                                                   \
+            if (a0 < lastx)                                                    \
+            {                                                                  \
+                if (a0 < 0)                                                    \
+                    a0 = 0;                                                    \
+                if ((pa - thisrun) & 1)                                        \
+                    SETVALUE(0);                                               \
+                SETVALUE(lastx - a0);                                          \
+            }                                                                  \
+            else if (a0 > lastx)                                               \
+            {                                                                  \
+                SETVALUE(lastx);                                               \
+                SETVALUE(0);                                                   \
+            }                                                                  \
+        }                                                                      \
+    } while (0)
 
 /*
  * Decode a line of 1D-encoded data.
@@ -319,249 +358,291 @@ static const char* StateNames[] = {
  * the original code depended on the input data being zero-padded to
  * insure the decoder recognized an EOL before running out of data.
  */
-#define EXPAND1D(eoflab) do {						\
-    for (;;) {								\
-	for (;;) {							\
-	    LOOKUP16(12, TIFFFaxWhiteTable, eof1d);			\
-	    switch (TabEnt->State) {					\
-	    case S_EOL:							\
-		EOLcnt = 1;						\
-		goto done1d;						\
-	    case S_TermW:						\
-		SETVALUE(TabEnt->Param);					\
-		goto doneWhite1d;					\
-	    case S_MakeUpW:						\
-	    case S_MakeUp:						\
-		a0 += TabEnt->Param;					\
-		RunLength += TabEnt->Param;				\
-		break;							\
-	    default:							\
-		unexpected("WhiteTable", a0);				\
-		goto done1d;						\
-	    }								\
-	}								\
-    doneWhite1d:							\
-	if (a0 >= lastx)						\
-	    goto done1d;						\
-	for (;;) {							\
-	    LOOKUP16(13, TIFFFaxBlackTable, eof1d);			\
-	    switch (TabEnt->State) {					\
-	    case S_EOL:							\
-		EOLcnt = 1;						\
-		goto done1d;						\
-	    case S_TermB:						\
-		SETVALUE(TabEnt->Param);					\
-		goto doneBlack1d;					\
-	    case S_MakeUpB:						\
-	    case S_MakeUp:						\
-		a0 += TabEnt->Param;					\
-		RunLength += TabEnt->Param;				\
-		break;							\
-	    default:							\
-		unexpected("BlackTable", a0);				\
-		goto done1d;						\
-	    }								\
-	}								\
-    doneBlack1d:							\
-	if (a0 >= lastx)						\
-	    goto done1d;						\
-        if( *(pa-1) == 0 && *(pa-2) == 0 )				\
-            pa -= 2;                                                    \
-    }									\
-eof1d:									\
-    prematureEOF(a0);							\
-    CLEANUP_RUNS();							\
-    goto eoflab;							\
-done1d:									\
-    CLEANUP_RUNS();							\
-} while (0)
+#define EXPAND1D(eoflab)                                                       \
+    do                                                                         \
+    {                                                                          \
+        for (;;)                                                               \
+        {                                                                      \
+            for (;;)                                                           \
+            {                                                                  \
+                LOOKUP16(12, TIFFFaxWhiteTable, eof1d);                        \
+                switch (TabEnt->State)                                         \
+                {                                                              \
+                    case S_EOL:                                                \
+                        EOLcnt = 1;                                            \
+                        goto done1d;                                           \
+                    case S_TermW:                                              \
+                        SETVALUE(TabEnt->Param);                               \
+                        goto doneWhite1d;                                      \
+                    case S_MakeUpW:                                            \
+                    case S_MakeUp:                                             \
+                        a0 += TabEnt->Param;                                   \
+                        RunLength += TabEnt->Param;                            \
+                        break;                                                 \
+                    default:                                                   \
+                        unexpected("WhiteTable", a0);                          \
+                        goto done1d;                                           \
+                }                                                              \
+            }                                                                  \
+        doneWhite1d:                                                           \
+            if (a0 >= lastx)                                                   \
+                goto done1d;                                                   \
+            for (;;)                                                           \
+            {                                                                  \
+                LOOKUP16(13, TIFFFaxBlackTable, eof1d);                        \
+                switch (TabEnt->State)                                         \
+                {                                                              \
+                    case S_EOL:                                                \
+                        EOLcnt = 1;                                            \
+                        goto done1d;                                           \
+                    case S_TermB:                                              \
+                        SETVALUE(TabEnt->Param);                               \
+                        goto doneBlack1d;                                      \
+                    case S_MakeUpB:                                            \
+                    case S_MakeUp:                                             \
+                        a0 += TabEnt->Param;                                   \
+                        RunLength += TabEnt->Param;                            \
+                        break;                                                 \
+                    default:                                                   \
+                        unexpected("BlackTable", a0);                          \
+                        goto done1d;                                           \
+                }                                                              \
+            }                                                                  \
+        doneBlack1d:                                                           \
+            if (a0 >= lastx)                                                   \
+                goto done1d;                                                   \
+            if (*(pa - 1) == 0 && *(pa - 2) == 0)                              \
+                pa -= 2;                                                       \
+        }                                                                      \
+    eof1d:                                                                     \
+        prematureEOF(a0);                                                      \
+        CLEANUP_RUNS();                                                        \
+        goto eoflab;                                                           \
+    done1d:                                                                    \
+        CLEANUP_RUNS();                                                        \
+    } while (0)
 
 /*
  * Update the value of b1 using the array
  * of runs for the reference line.
  */
-#define CHECK_b1 do {							\
-    if (pa != thisrun) while (b1 <= a0 && b1 < lastx) {			\
-	if( pb + 1 >= sp->refruns + sp->nruns) { 			\
-	    TIFFErrorExt(tif->tif_clientdata, module, "Buffer overflow at line %u of %s %u",	\
-	                sp->line, isTiled(tif) ? "tile" : "strip", isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip);	\
-	    return (-1);						\
-	}								\
-	b1 += pb[0] + pb[1];						\
-	pb += 2;							\
-    }									\
-} while (0)
+#define CHECK_b1                                                               \
+    do                                                                         \
+    {                                                                          \
+        if (pa != thisrun)                                                     \
+            while (b1 <= a0 && b1 < lastx)                                     \
+            {                                                                  \
+                if (pb + 1 >= sp->refruns + sp->nruns)                         \
+                {                                                              \
+                    TIFFErrorExtR(                                             \
+                        tif, module, "Buffer overflow at line %u of %s %u",    \
+                        sp->line, isTiled(tif) ? "tile" : "strip",             \
+                        isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip);  \
+                    return (-1);                                               \
+                }                                                              \
+                b1 += pb[0] + pb[1];                                           \
+                pb += 2;                                                       \
+            }                                                                  \
+    } while (0)
 
 /*
  * Expand a row of 2D-encoded data.
  */
-#define EXPAND2D(eoflab) do {						\
-    while (a0 < lastx) {						\
-	if (pa >= thisrun + sp->nruns) {				\
-		TIFFErrorExt(tif->tif_clientdata, module, "Buffer overflow at line %u of %s %u",	\
-		             sp->line, isTiled(tif) ? "tile" : "strip", isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip);	\
-		return (-1);						\
-	}								\
-	LOOKUP8(7, TIFFFaxMainTable, eof2d);				\
-	switch (TabEnt->State) {					\
-	case S_Pass:							\
-	    CHECK_b1;							\
-	    if( pb + 1 >= sp->refruns + sp->nruns) { 			\
-	        TIFFErrorExt(tif->tif_clientdata, module, "Buffer overflow at line %u of %s %u",	\
-	                sp->line, isTiled(tif) ? "tile" : "strip", isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip);	\
-	        return (-1);						\
-	    }								\
-	    b1 += *pb++;						\
-	    RunLength += b1 - a0;					\
-	    a0 = b1;							\
-	    b1 += *pb++;						\
-	    break;							\
-	case S_Horiz:							\
-	    if ((pa-thisrun)&1) {					\
-		for (;;) {	/* black first */			\
-		    LOOKUP16(13, TIFFFaxBlackTable, eof2d);		\
-		    switch (TabEnt->State) {				\
-		    case S_TermB:					\
-			SETVALUE(TabEnt->Param);				\
-			goto doneWhite2da;				\
-		    case S_MakeUpB:					\
-		    case S_MakeUp:					\
-			a0 += TabEnt->Param;				\
-			RunLength += TabEnt->Param;			\
-			break;						\
-		    default:						\
-			goto badBlack2d;				\
-		    }							\
-		}							\
-	    doneWhite2da:;						\
-		for (;;) {	/* then white */			\
-		    LOOKUP16(12, TIFFFaxWhiteTable, eof2d);		\
-		    switch (TabEnt->State) {				\
-		    case S_TermW:					\
-			SETVALUE(TabEnt->Param);				\
-			goto doneBlack2da;				\
-		    case S_MakeUpW:					\
-		    case S_MakeUp:					\
-			a0 += TabEnt->Param;				\
-			RunLength += TabEnt->Param;			\
-			break;						\
-		    default:						\
-			goto badWhite2d;				\
-		    }							\
-		}							\
-	    doneBlack2da:;						\
-	    } else {							\
-		for (;;) {	/* white first */			\
-		    LOOKUP16(12, TIFFFaxWhiteTable, eof2d);		\
-		    switch (TabEnt->State) {				\
-		    case S_TermW:					\
-			SETVALUE(TabEnt->Param);				\
-			goto doneWhite2db;				\
-		    case S_MakeUpW:					\
-		    case S_MakeUp:					\
-			a0 += TabEnt->Param;				\
-			RunLength += TabEnt->Param;			\
-			break;						\
-		    default:						\
-			goto badWhite2d;				\
-		    }							\
-		}							\
-	    doneWhite2db:;						\
-		for (;;) {	/* then black */			\
-		    LOOKUP16(13, TIFFFaxBlackTable, eof2d);		\
-		    switch (TabEnt->State) {				\
-		    case S_TermB:					\
-			SETVALUE(TabEnt->Param);				\
-			goto doneBlack2db;				\
-		    case S_MakeUpB:					\
-		    case S_MakeUp:					\
-			a0 += TabEnt->Param;				\
-			RunLength += TabEnt->Param;			\
-			break;						\
-		    default:						\
-			goto badBlack2d;				\
-		    }							\
-		}							\
-	    doneBlack2db:;						\
-	    }								\
-	    CHECK_b1;							\
-	    break;							\
-	case S_V0:							\
-	    CHECK_b1;							\
-	    SETVALUE(b1 - a0);						\
-	    if( pb >= sp->refruns + sp->nruns) { 			\
-	        TIFFErrorExt(tif->tif_clientdata, module, "Buffer overflow at line %u of %s %u",	\
-	                sp->line, isTiled(tif) ? "tile" : "strip", isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip);	\
-	        return (-1);						\
-	    }								\
-	    b1 += *pb++;						\
-	    break;							\
-	case S_VR:							\
-	    CHECK_b1;							\
-	    SETVALUE(b1 - a0 + TabEnt->Param);				\
-	    if( pb >= sp->refruns + sp->nruns) { 			\
-	        TIFFErrorExt(tif->tif_clientdata, module, "Buffer overflow at line %u of %s %u",	\
-	                sp->line, isTiled(tif) ? "tile" : "strip", isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip);	\
-	        return (-1);						\
-	    }								\
-	    b1 += *pb++;						\
-	    break;							\
-	case S_VL:							\
-	    CHECK_b1;							\
-	    if (b1 < (int) (a0 + TabEnt->Param)) {			\
-		unexpected("VL", a0);				\
-		goto eol2d;						\
-	    }								\
-	    SETVALUE(b1 - a0 - TabEnt->Param);				\
-	    b1 -= *--pb;						\
-	    break;							\
-	case S_Ext:							\
-	    *pa++ = lastx - a0;						\
-	    extension(a0);						\
-	    goto eol2d;							\
-	case S_EOL:							\
-	    *pa++ = lastx - a0;						\
-	    NeedBits8(4,eof2d);						\
-	    if (GetBits(4))						\
-		unexpected("EOL", a0);					\
-            ClrBits(4);                                                 \
-	    EOLcnt = 1;							\
-	    goto eol2d;							\
-	default:							\
-	badMain2d:							\
-	    unexpected("MainTable", a0);				\
-	    goto eol2d;							\
-	badBlack2d:							\
-	    unexpected("BlackTable", a0);				\
-	    goto eol2d;							\
-	badWhite2d:							\
-	    unexpected("WhiteTable", a0);				\
-	    goto eol2d;							\
-	eof2d:								\
-	    prematureEOF(a0);						\
-	    CLEANUP_RUNS();						\
-	    goto eoflab;						\
-	}								\
-    }									\
-    if (RunLength) {							\
-	if (RunLength + a0 < lastx) {					\
-	    /* expect a final V0 */					\
-	    NeedBits8(1,eof2d);						\
-	    if (!GetBits(1))						\
-		goto badMain2d;						\
-	    ClrBits(1);							\
-	}								\
-	SETVALUE(0);							\
-    }									\
-eol2d:									\
-    CLEANUP_RUNS();							\
-} while (0)
+#define EXPAND2D(eoflab)                                                       \
+    do                                                                         \
+    {                                                                          \
+        while (a0 < lastx)                                                     \
+        {                                                                      \
+            if (pa >= thisrun + sp->nruns)                                     \
+            {                                                                  \
+                TIFFErrorExtR(                                                 \
+                    tif, module, "Buffer overflow at line %u of %s %u",        \
+                    sp->line, isTiled(tif) ? "tile" : "strip",                 \
+                    isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip);      \
+                return (-1);                                                   \
+            }                                                                  \
+            LOOKUP8(7, TIFFFaxMainTable, eof2d);                               \
+            switch (TabEnt->State)                                             \
+            {                                                                  \
+                case S_Pass:                                                   \
+                    CHECK_b1;                                                  \
+                    if (pb + 1 >= sp->refruns + sp->nruns)                     \
+                    {                                                          \
+                        TIFFErrorExtR(tif, module,                             \
+                                      "Buffer overflow at line %u of %s %u",   \
+                                      sp->line,                                \
+                                      isTiled(tif) ? "tile" : "strip",         \
+                                      isTiled(tif) ? tif->tif_curtile          \
+                                                   : tif->tif_curstrip);       \
+                        return (-1);                                           \
+                    }                                                          \
+                    b1 += *pb++;                                               \
+                    RunLength += b1 - a0;                                      \
+                    a0 = b1;                                                   \
+                    b1 += *pb++;                                               \
+                    break;                                                     \
+                case S_Horiz:                                                  \
+                    if ((pa - thisrun) & 1)                                    \
+                    {                                                          \
+                        for (;;)                                               \
+                        { /* black first */                                    \
+                            LOOKUP16(13, TIFFFaxBlackTable, eof2d);            \
+                            switch (TabEnt->State)                             \
+                            {                                                  \
+                                case S_TermB:                                  \
+                                    SETVALUE(TabEnt->Param);                   \
+                                    goto doneWhite2da;                         \
+                                case S_MakeUpB:                                \
+                                case S_MakeUp:                                 \
+                                    a0 += TabEnt->Param;                       \
+                                    RunLength += TabEnt->Param;                \
+                                    break;                                     \
+                                default:                                       \
+                                    goto badBlack2d;                           \
+                            }                                                  \
+                        }                                                      \
+                    doneWhite2da:;                                             \
+                        for (;;)                                               \
+                        { /* then white */                                     \
+                            LOOKUP16(12, TIFFFaxWhiteTable, eof2d);            \
+                            switch (TabEnt->State)                             \
+                            {                                                  \
+                                case S_TermW:                                  \
+                                    SETVALUE(TabEnt->Param);                   \
+                                    goto doneBlack2da;                         \
+                                case S_MakeUpW:                                \
+                                case S_MakeUp:                                 \
+                                    a0 += TabEnt->Param;                       \
+                                    RunLength += TabEnt->Param;                \
+                                    break;                                     \
+                                default:                                       \
+                                    goto badWhite2d;                           \
+                            }                                                  \
+                        }                                                      \
+                    doneBlack2da:;                                             \
+                    }                                                          \
+                    else                                                       \
+                    {                                                          \
+                        for (;;)                                               \
+                        { /* white first */                                    \
+                            LOOKUP16(12, TIFFFaxWhiteTable, eof2d);            \
+                            switch (TabEnt->State)                             \
+                            {                                                  \
+                                case S_TermW:                                  \
+                                    SETVALUE(TabEnt->Param);                   \
+                                    goto doneWhite2db;                         \
+                                case S_MakeUpW:                                \
+                                case S_MakeUp:                                 \
+                                    a0 += TabEnt->Param;                       \
+                                    RunLength += TabEnt->Param;                \
+                                    break;                                     \
+                                default:                                       \
+                                    goto badWhite2d;                           \
+                            }                                                  \
+                        }                                                      \
+                    doneWhite2db:;                                             \
+                        for (;;)                                               \
+                        { /* then black */                                     \
+                            LOOKUP16(13, TIFFFaxBlackTable, eof2d);            \
+                            switch (TabEnt->State)                             \
+                            {                                                  \
+                                case S_TermB:                                  \
+                                    SETVALUE(TabEnt->Param);                   \
+                                    goto doneBlack2db;                         \
+                                case S_MakeUpB:                                \
+                                case S_MakeUp:                                 \
+                                    a0 += TabEnt->Param;                       \
+                                    RunLength += TabEnt->Param;                \
+                                    break;                                     \
+                                default:                                       \
+                                    goto badBlack2d;                           \
+                            }                                                  \
+                        }                                                      \
+                    doneBlack2db:;                                             \
+                    }                                                          \
+                    CHECK_b1;                                                  \
+                    break;                                                     \
+                case S_V0:                                                     \
+                    CHECK_b1;                                                  \
+                    SETVALUE(b1 - a0);                                         \
+                    if (pb >= sp->refruns + sp->nruns)                         \
+                    {                                                          \
+                        TIFFErrorExtR(tif, module,                             \
+                                      "Buffer overflow at line %u of %s %u",   \
+                                      sp->line,                                \
+                                      isTiled(tif) ? "tile" : "strip",         \
+                                      isTiled(tif) ? tif->tif_curtile          \
+                                                   : tif->tif_curstrip);       \
+                        return (-1);                                           \
+                    }                                                          \
+                    b1 += *pb++;                                               \
+                    break;                                                     \
+                case S_VR:                                                     \
+                    CHECK_b1;                                                  \
+                    SETVALUE(b1 - a0 + TabEnt->Param);                         \
+                    if (pb >= sp->refruns + sp->nruns)                         \
+                    {                                                          \
+                        TIFFErrorExtR(tif, module,                             \
+                                      "Buffer overflow at line %u of %s %u",   \
+                                      sp->line,                                \
+                                      isTiled(tif) ? "tile" : "strip",         \
+                                      isTiled(tif) ? tif->tif_curtile          \
+                                                   : tif->tif_curstrip);       \
+                        return (-1);                                           \
+                    }                                                          \
+                    b1 += *pb++;                                               \
+                    break;                                                     \
+                case S_VL:                                                     \
+                    CHECK_b1;                                                  \
+                    if (b1 < (int)(a0 + TabEnt->Param))                        \
+                    {                                                          \
+                        unexpected("VL", a0);                                  \
+                        goto eol2d;                                            \
+                    }                                                          \
+                    SETVALUE(b1 - a0 - TabEnt->Param);                         \
+                    b1 -= *--pb;                                               \
+                    break;                                                     \
+                case S_Ext:                                                    \
+                    *pa++ = lastx - a0;                                        \
+                    extension(a0);                                             \
+                    goto eol2d;                                                \
+                case S_EOL:                                                    \
+                    *pa++ = lastx - a0;                                        \
+                    NeedBits8(4, eof2d);                                       \
+                    if (GetBits(4))                                            \
+                        unexpected("EOL", a0);                                 \
+                    ClrBits(4);                                                \
+                    EOLcnt = 1;                                                \
+                    goto eol2d;                                                \
+                default:                                                       \
+                badMain2d:                                                     \
+                    unexpected("MainTable", a0);                               \
+                    goto eol2d;                                                \
+                badBlack2d:                                                    \
+                    unexpected("BlackTable", a0);                              \
+                    goto eol2d;                                                \
+                badWhite2d:                                                    \
+                    unexpected("WhiteTable", a0);                              \
+                    goto eol2d;                                                \
+                eof2d:                                                         \
+                    prematureEOF(a0);                                          \
+                    CLEANUP_RUNS();                                            \
+                    goto eoflab;                                               \
+            }                                                                  \
+        }                                                                      \
+        if (RunLength)                                                         \
+        {                                                                      \
+            if (RunLength + a0 < lastx)                                        \
+            {                                                                  \
+                /* expect a final V0 */                                        \
+                NeedBits8(1, eof2d);                                           \
+                if (!GetBits(1))                                               \
+                    goto badMain2d;                                            \
+                ClrBits(1);                                                    \
+            }                                                                  \
+            SETVALUE(0);                                                       \
+        }                                                                      \
+    eol2d:                                                                     \
+        CLEANUP_RUNS();                                                        \
+    } while (0)
 #endif /* _FAX3_ */
-/* vim: set ts=8 sts=4 sw=4 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_flush.c b/3rdparty/libtiff/tif_flush.c
index f7fa2072ab79..ff9c1e247a33 100644
--- a/3rdparty/libtiff/tif_flush.c
+++ b/3rdparty/libtiff/tif_flush.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -27,30 +27,28 @@
  */
 #include "tiffiop.h"
 
-int
-TIFFFlush(TIFF* tif)
+int TIFFFlush(TIFF *tif)
 {
-    if( tif->tif_mode == O_RDONLY )
+    if (tif->tif_mode == O_RDONLY)
         return 1;
 
     if (!TIFFFlushData(tif))
         return (0);
-                
-    /* In update (r+) mode we try to detect the case where 
-       only the strip/tile map has been altered, and we try to 
-       rewrite only that portion of the directory without 
+
+    /* In update (r+) mode we try to detect the case where
+       only the strip/tile map has been altered, and we try to
+       rewrite only that portion of the directory without
        making any other changes */
-                
-    if( (tif->tif_flags & TIFF_DIRTYSTRIP)
-        && !(tif->tif_flags & TIFF_DIRTYDIRECT) 
-        && tif->tif_mode == O_RDWR )
+
+    if ((tif->tif_flags & TIFF_DIRTYSTRIP) &&
+        !(tif->tif_flags & TIFF_DIRTYDIRECT) && tif->tif_mode == O_RDWR)
     {
-        if( TIFFForceStrileArrayWriting(tif) )
+        if (TIFFForceStrileArrayWriting(tif))
             return 1;
     }
 
-    if ((tif->tif_flags & (TIFF_DIRTYDIRECT|TIFF_DIRTYSTRIP)) 
-        && !TIFFRewriteDirectory(tif))
+    if ((tif->tif_flags & (TIFF_DIRTYDIRECT | TIFF_DIRTYSTRIP)) &&
+        !TIFFRewriteDirectory(tif))
         return (0);
 
     return (1);
@@ -75,45 +73,43 @@ TIFFFlush(TIFF* tif)
  *
  * Returns 1 in case of success, 0 otherwise.
  */
-int TIFFForceStrileArrayWriting(TIFF* tif)
+int TIFFForceStrileArrayWriting(TIFF *tif)
 {
     static const char module[] = "TIFFForceStrileArrayWriting";
     const int isTiled = TIFFIsTiled(tif);
 
     if (tif->tif_mode == O_RDONLY)
     {
-        TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-                     "File opened in read-only mode");
+        TIFFErrorExtR(tif, tif->tif_name, "File opened in read-only mode");
         return 0;
     }
-    if( tif->tif_diroff == 0 )
+    if (tif->tif_diroff == 0)
     {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                     "Directory has not yet been written");
+        TIFFErrorExtR(tif, module, "Directory has not yet been written");
         return 0;
     }
-    if( (tif->tif_flags & TIFF_DIRTYDIRECT) != 0 )
+    if ((tif->tif_flags & TIFF_DIRTYDIRECT) != 0)
     {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                     "Directory has changes other than the strile arrays. "
-                     "TIFFRewriteDirectory() should be called instead");
+        TIFFErrorExtR(tif, module,
+                      "Directory has changes other than the strile arrays. "
+                      "TIFFRewriteDirectory() should be called instead");
         return 0;
     }
 
-    if( !(tif->tif_flags & TIFF_DIRTYSTRIP) )
+    if (!(tif->tif_flags & TIFF_DIRTYSTRIP))
     {
-        if( !(tif->tif_dir.td_stripoffset_entry.tdir_tag != 0 &&
-             tif->tif_dir.td_stripoffset_entry.tdir_count == 0 &&
-             tif->tif_dir.td_stripoffset_entry.tdir_type == 0 &&
-             tif->tif_dir.td_stripoffset_entry.tdir_offset.toff_long8 == 0 &&
-             tif->tif_dir.td_stripbytecount_entry.tdir_tag != 0 &&
-             tif->tif_dir.td_stripbytecount_entry.tdir_count == 0 &&
-             tif->tif_dir.td_stripbytecount_entry.tdir_type == 0 &&
-             tif->tif_dir.td_stripbytecount_entry.tdir_offset.toff_long8 == 0) )
+        if (!(tif->tif_dir.td_stripoffset_entry.tdir_tag != 0 &&
+              tif->tif_dir.td_stripoffset_entry.tdir_count == 0 &&
+              tif->tif_dir.td_stripoffset_entry.tdir_type == 0 &&
+              tif->tif_dir.td_stripoffset_entry.tdir_offset.toff_long8 == 0 &&
+              tif->tif_dir.td_stripbytecount_entry.tdir_tag != 0 &&
+              tif->tif_dir.td_stripbytecount_entry.tdir_count == 0 &&
+              tif->tif_dir.td_stripbytecount_entry.tdir_type == 0 &&
+              tif->tif_dir.td_stripbytecount_entry.tdir_offset.toff_long8 == 0))
         {
-            TIFFErrorExt(tif->tif_clientdata, module,
-                        "Function not called together with "
-                        "TIFFDeferStrileArrayWriting()");
+            TIFFErrorExtR(tif, module,
+                          "Function not called together with "
+                          "TIFFDeferStrileArrayWriting()");
             return 0;
         }
 
@@ -121,18 +117,14 @@ int TIFFForceStrileArrayWriting(TIFF* tif)
             return 0;
     }
 
-    if( _TIFFRewriteField( tif,
-                           isTiled ? TIFFTAG_TILEOFFSETS :
-                                     TIFFTAG_STRIPOFFSETS,
-                           TIFF_LONG8,
-                           tif->tif_dir.td_nstrips,
-                           tif->tif_dir.td_stripoffset_p )
-        && _TIFFRewriteField( tif,
-                              isTiled ? TIFFTAG_TILEBYTECOUNTS :
-                                        TIFFTAG_STRIPBYTECOUNTS,
-                              TIFF_LONG8,
-                              tif->tif_dir.td_nstrips,
-                              tif->tif_dir.td_stripbytecount_p ) )
+    if (_TIFFRewriteField(tif,
+                          isTiled ? TIFFTAG_TILEOFFSETS : TIFFTAG_STRIPOFFSETS,
+                          TIFF_LONG8, tif->tif_dir.td_nstrips,
+                          tif->tif_dir.td_stripoffset_p) &&
+        _TIFFRewriteField(
+            tif, isTiled ? TIFFTAG_TILEBYTECOUNTS : TIFFTAG_STRIPBYTECOUNTS,
+            TIFF_LONG8, tif->tif_dir.td_nstrips,
+            tif->tif_dir.td_stripbytecount_p))
     {
         tif->tif_flags &= ~TIFF_DIRTYSTRIP;
         tif->tif_flags &= ~TIFF_BEENWRITING;
@@ -149,26 +141,17 @@ int TIFFForceStrileArrayWriting(TIFF* tif)
  * is not set, so that TIFFFlush() will proceed to write out the directory.
  * The documentation says returning 1 is an error indicator, but not having
  * been writing isn't exactly a an error.  Hopefully this doesn't cause
- * problems for other people. 
+ * problems for other people.
  */
-int
-TIFFFlushData(TIFF* tif)
+int TIFFFlushData(TIFF *tif)
 {
-	if ((tif->tif_flags & TIFF_BEENWRITING) == 0)
-		return (1);
-	if (tif->tif_flags & TIFF_POSTENCODE) {
-		tif->tif_flags &= ~TIFF_POSTENCODE;
-		if (!(*tif->tif_postencode)(tif))
-			return (0);
-	}
-	return (TIFFFlushData1(tif));
+    if ((tif->tif_flags & TIFF_BEENWRITING) == 0)
+        return (1);
+    if (tif->tif_flags & TIFF_POSTENCODE)
+    {
+        tif->tif_flags &= ~TIFF_POSTENCODE;
+        if (!(*tif->tif_postencode)(tif))
+            return (0);
+    }
+    return (TIFFFlushData1(tif));
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_getimage.c b/3rdparty/libtiff/tif_getimage.c
index 3460af744e46..41f7dfd77e00 100644
--- a/3rdparty/libtiff/tif_getimage.c
+++ b/3rdparty/libtiff/tif_getimage.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1991-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -28,41 +28,50 @@
  * Read and return a packed RGBA image.
  */
 #include "tiffiop.h"
-#include <stdio.h>
 #include <limits.h>
+#include <stdio.h>
 
-static int gtTileContig(TIFFRGBAImage*, uint32*, uint32, uint32);
-static int gtTileSeparate(TIFFRGBAImage*, uint32*, uint32, uint32);
-static int gtStripContig(TIFFRGBAImage*, uint32*, uint32, uint32);
-static int gtStripSeparate(TIFFRGBAImage*, uint32*, uint32, uint32);
-static int PickContigCase(TIFFRGBAImage*);
-static int PickSeparateCase(TIFFRGBAImage*);
+static int gtTileContig(TIFFRGBAImage *, uint32_t *, uint32_t, uint32_t);
+static int gtTileSeparate(TIFFRGBAImage *, uint32_t *, uint32_t, uint32_t);
+static int gtStripContig(TIFFRGBAImage *, uint32_t *, uint32_t, uint32_t);
+static int gtStripSeparate(TIFFRGBAImage *, uint32_t *, uint32_t, uint32_t);
+static int PickContigCase(TIFFRGBAImage *);
+static int PickSeparateCase(TIFFRGBAImage *);
 
-static int BuildMapUaToAa(TIFFRGBAImage* img);
-static int BuildMapBitdepth16To8(TIFFRGBAImage* img);
+static int BuildMapUaToAa(TIFFRGBAImage *img);
+static int BuildMapBitdepth16To8(TIFFRGBAImage *img);
 
 static const char photoTag[] = "PhotometricInterpretation";
 
-/* 
+/*
  * Helper constants used in Orientation tag handling
  */
 #define FLIP_VERTICALLY 0x01
 #define FLIP_HORIZONTALLY 0x02
 
+#define EMSG_BUF_SIZE 1024
+
 /*
  * Color conversion constants. We will define display types here.
  */
 
 static const TIFFDisplay display_sRGB = {
-	{			/* XYZ -> luminance matrix */
-		{  3.2410F, -1.5374F, -0.4986F },
-		{  -0.9692F, 1.8760F, 0.0416F },
-		{  0.0556F, -0.2040F, 1.0570F }
-	},	
-	100.0F, 100.0F, 100.0F,	/* Light o/p for reference white */
-	255, 255, 255,		/* Pixel values for ref. white */
-	1.0F, 1.0F, 1.0F,	/* Residual light o/p for black pixel */
-	2.4F, 2.4F, 2.4F,	/* Gamma values for the three guns */
+    {/* XYZ -> luminance matrix */
+     {3.2410F, -1.5374F, -0.4986F},
+     {-0.9692F, 1.8760F, 0.0416F},
+     {0.0556F, -0.2040F, 1.0570F}},
+    100.0F,
+    100.0F,
+    100.0F, /* Light o/p for reference white */
+    255,
+    255,
+    255, /* Pixel values for ref. white */
+    1.0F,
+    1.0F,
+    1.0F, /* Residual light o/p for black pixel */
+    2.4F,
+    2.4F,
+    2.4F, /* Gamma values for the three guns */
 };
 
 /*
@@ -71,443 +80,525 @@ static const TIFFDisplay display_sRGB = {
  * be handled.  If 0 is returned, emsg contains the reason
  * why it is being rejected.
  */
-int
-TIFFRGBAImageOK(TIFF* tif, char emsg[1024])
-{
-	TIFFDirectory* td = &tif->tif_dir;
-	uint16 photometric;
-	int colorchannels;
-
-	if (!tif->tif_decodestatus) {
-		sprintf(emsg, "Sorry, requested compression method is not configured");
-		return (0);
-	}
-	switch (td->td_bitspersample) {
-		case 1:
-		case 2:
-		case 4:
-		case 8:
-		case 16:
-			break;
-		default:
-			sprintf(emsg, "Sorry, can not handle images with %d-bit samples",
-			    td->td_bitspersample);
-			return (0);
-	}
-        if (td->td_sampleformat == SAMPLEFORMAT_IEEEFP) {
-                sprintf(emsg, "Sorry, can not handle images with IEEE floating-point samples");
+int TIFFRGBAImageOK(TIFF *tif, char emsg[EMSG_BUF_SIZE])
+{
+    TIFFDirectory *td = &tif->tif_dir;
+    uint16_t photometric;
+    int colorchannels;
+
+    if (!tif->tif_decodestatus)
+    {
+        snprintf(emsg, EMSG_BUF_SIZE,
+                 "Sorry, requested compression method is not configured");
+        return (0);
+    }
+    switch (td->td_bitspersample)
+    {
+        case 1:
+        case 2:
+        case 4:
+        case 8:
+        case 16:
+            break;
+        default:
+            snprintf(emsg, EMSG_BUF_SIZE,
+                     "Sorry, can not handle images with %" PRIu16
+                     "-bit samples",
+                     td->td_bitspersample);
+            return (0);
+    }
+    if (td->td_sampleformat == SAMPLEFORMAT_IEEEFP)
+    {
+        snprintf(
+            emsg, EMSG_BUF_SIZE,
+            "Sorry, can not handle images with IEEE floating-point samples");
+        return (0);
+    }
+    colorchannels = td->td_samplesperpixel - td->td_extrasamples;
+    if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometric))
+    {
+        switch (colorchannels)
+        {
+            case 1:
+                photometric = PHOTOMETRIC_MINISBLACK;
+                break;
+            case 3:
+                photometric = PHOTOMETRIC_RGB;
+                break;
+            default:
+                snprintf(emsg, EMSG_BUF_SIZE, "Missing needed %s tag",
+                         photoTag);
                 return (0);
         }
-	colorchannels = td->td_samplesperpixel - td->td_extrasamples;
-	if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometric)) {
-		switch (colorchannels) {
-			case 1:
-				photometric = PHOTOMETRIC_MINISBLACK;
-				break;
-			case 3:
-				photometric = PHOTOMETRIC_RGB;
-				break;
-			default:
-				sprintf(emsg, "Missing needed %s tag", photoTag);
-				return (0);
-		}
-	}
-	switch (photometric) {
-		case PHOTOMETRIC_MINISWHITE:
-		case PHOTOMETRIC_MINISBLACK:
-		case PHOTOMETRIC_PALETTE:
-			if (td->td_planarconfig == PLANARCONFIG_CONTIG
-			    && td->td_samplesperpixel != 1
-			    && td->td_bitspersample < 8 ) {
-				sprintf(emsg,
-				    "Sorry, can not handle contiguous data with %s=%d, "
-				    "and %s=%d and Bits/Sample=%d",
-				    photoTag, photometric,
-				    "Samples/pixel", td->td_samplesperpixel,
-				    td->td_bitspersample);
-				return (0);
-			}
-			/*
-			 * We should likely validate that any extra samples are either
-			 * to be ignored, or are alpha, and if alpha we should try to use
-			 * them.  But for now we won't bother with this.
-			*/
-			break;
-		case PHOTOMETRIC_YCBCR:
-			/*
-			 * TODO: if at all meaningful and useful, make more complete
-			 * support check here, or better still, refactor to let supporting
-			 * code decide whether there is support and what meaningful
-			 * error to return
-			 */
-			break;
-		case PHOTOMETRIC_RGB:
-			if (colorchannels < 3) {
-				sprintf(emsg, "Sorry, can not handle RGB image with %s=%d",
-				    "Color channels", colorchannels);
-				return (0);
-			}
-			break;
-		case PHOTOMETRIC_SEPARATED:
-			{
-				uint16 inkset;
-				TIFFGetFieldDefaulted(tif, TIFFTAG_INKSET, &inkset);
-				if (inkset != INKSET_CMYK) {
-					sprintf(emsg,
-					    "Sorry, can not handle separated image with %s=%d",
-					    "InkSet", inkset);
-					return 0;
-				}
-				if (td->td_samplesperpixel < 4) {
-					sprintf(emsg,
-					    "Sorry, can not handle separated image with %s=%d",
-					    "Samples/pixel", td->td_samplesperpixel);
-					return 0;
-				}
-				break;
-			}
-		case PHOTOMETRIC_LOGL:
-			if (td->td_compression != COMPRESSION_SGILOG) {
-				sprintf(emsg, "Sorry, LogL data must have %s=%d",
-				    "Compression", COMPRESSION_SGILOG);
-				return (0);
-			}
-			break;
-		case PHOTOMETRIC_LOGLUV:
-			if (td->td_compression != COMPRESSION_SGILOG &&
-			    td->td_compression != COMPRESSION_SGILOG24) {
-				sprintf(emsg, "Sorry, LogLuv data must have %s=%d or %d",
-				    "Compression", COMPRESSION_SGILOG, COMPRESSION_SGILOG24);
-				return (0);
-			}
-			if (td->td_planarconfig != PLANARCONFIG_CONTIG) {
-				sprintf(emsg, "Sorry, can not handle LogLuv images with %s=%d",
-				    "Planarconfiguration", td->td_planarconfig);
-				return (0);
-			}
-			if ( td->td_samplesperpixel != 3 || colorchannels != 3 ) {
-                                sprintf(emsg,
-                                        "Sorry, can not handle image with %s=%d, %s=%d",
-                                        "Samples/pixel", td->td_samplesperpixel,
-                                        "colorchannels", colorchannels);
-                                return 0;
-                        }
-			break;
-		case PHOTOMETRIC_CIELAB:
-                        if ( td->td_samplesperpixel != 3 || colorchannels != 3 || td->td_bitspersample != 8 ) {
-                                sprintf(emsg,
-                                        "Sorry, can not handle image with %s=%d, %s=%d and %s=%d",
-                                        "Samples/pixel", td->td_samplesperpixel,
-                                        "colorchannels", colorchannels,
-                                        "Bits/sample", td->td_bitspersample);
-                                return 0;
-                        }
-			break;
-                default:
-			sprintf(emsg, "Sorry, can not handle image with %s=%d",
-			    photoTag, photometric);
-			return (0);
-	}
-	return (1);
-}
-
-void
-TIFFRGBAImageEnd(TIFFRGBAImage* img)
-{
-	if (img->Map) {
-		_TIFFfree(img->Map);
-		img->Map = NULL;
-	}
-	if (img->BWmap) {
-		_TIFFfree(img->BWmap);
-		img->BWmap = NULL;
-	}
-	if (img->PALmap) {
-		_TIFFfree(img->PALmap);
-		img->PALmap = NULL;
-	}
-	if (img->ycbcr) {
-		_TIFFfree(img->ycbcr);
-		img->ycbcr = NULL;
-	}
-	if (img->cielab) {
-		_TIFFfree(img->cielab);
-		img->cielab = NULL;
-	}
-	if (img->UaToAa) {
-		_TIFFfree(img->UaToAa);
-		img->UaToAa = NULL;
-	}
-	if (img->Bitdepth16To8) {
-		_TIFFfree(img->Bitdepth16To8);
-		img->Bitdepth16To8 = NULL;
-	}
-
-	if( img->redcmap ) {
-		_TIFFfree( img->redcmap );
-		_TIFFfree( img->greencmap );
-		_TIFFfree( img->bluecmap );
-                img->redcmap = img->greencmap = img->bluecmap = NULL;
-	}
-}
-
-static int
-isCCITTCompression(TIFF* tif)
-{
-    uint16 compress;
+    }
+    switch (photometric)
+    {
+        case PHOTOMETRIC_MINISWHITE:
+        case PHOTOMETRIC_MINISBLACK:
+        case PHOTOMETRIC_PALETTE:
+            if (td->td_planarconfig == PLANARCONFIG_CONTIG &&
+                td->td_samplesperpixel != 1 && td->td_bitspersample < 8)
+            {
+                snprintf(
+                    emsg, EMSG_BUF_SIZE,
+                    "Sorry, can not handle contiguous data with %s=%" PRIu16
+                    ", "
+                    "and %s=%" PRIu16 " and Bits/Sample=%" PRIu16 "",
+                    photoTag, photometric, "Samples/pixel",
+                    td->td_samplesperpixel, td->td_bitspersample);
+                return (0);
+            }
+            /*
+             * We should likely validate that any extra samples are either
+             * to be ignored, or are alpha, and if alpha we should try to use
+             * them.  But for now we won't bother with this.
+             */
+            break;
+        case PHOTOMETRIC_YCBCR:
+            /*
+             * TODO: if at all meaningful and useful, make more complete
+             * support check here, or better still, refactor to let supporting
+             * code decide whether there is support and what meaningful
+             * error to return
+             */
+            break;
+        case PHOTOMETRIC_RGB:
+            if (colorchannels < 3)
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Sorry, can not handle RGB image with %s=%d",
+                         "Color channels", colorchannels);
+                return (0);
+            }
+            break;
+        case PHOTOMETRIC_SEPARATED:
+        {
+            uint16_t inkset;
+            TIFFGetFieldDefaulted(tif, TIFFTAG_INKSET, &inkset);
+            if (inkset != INKSET_CMYK)
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Sorry, can not handle separated image with %s=%d",
+                         "InkSet", inkset);
+                return 0;
+            }
+            if (td->td_samplesperpixel < 4)
+            {
+                snprintf(
+                    emsg, EMSG_BUF_SIZE,
+                    "Sorry, can not handle separated image with %s=%" PRIu16,
+                    "Samples/pixel", td->td_samplesperpixel);
+                return 0;
+            }
+            break;
+        }
+        case PHOTOMETRIC_LOGL:
+            if (td->td_compression != COMPRESSION_SGILOG)
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Sorry, LogL data must have %s=%d", "Compression",
+                         COMPRESSION_SGILOG);
+                return (0);
+            }
+            break;
+        case PHOTOMETRIC_LOGLUV:
+            if (td->td_compression != COMPRESSION_SGILOG &&
+                td->td_compression != COMPRESSION_SGILOG24)
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Sorry, LogLuv data must have %s=%d or %d",
+                         "Compression", COMPRESSION_SGILOG,
+                         COMPRESSION_SGILOG24);
+                return (0);
+            }
+            if (td->td_planarconfig != PLANARCONFIG_CONTIG)
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Sorry, can not handle LogLuv images with %s=%" PRIu16,
+                         "Planarconfiguration", td->td_planarconfig);
+                return (0);
+            }
+            if (td->td_samplesperpixel != 3 || colorchannels != 3)
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Sorry, can not handle image with %s=%" PRIu16
+                         ", %s=%d",
+                         "Samples/pixel", td->td_samplesperpixel,
+                         "colorchannels", colorchannels);
+                return 0;
+            }
+            break;
+        case PHOTOMETRIC_CIELAB:
+            if (td->td_samplesperpixel != 3 || colorchannels != 3 ||
+                (td->td_bitspersample != 8 && td->td_bitspersample != 16))
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Sorry, can not handle image with %s=%" PRIu16
+                         ", %s=%d and %s=%" PRIu16,
+                         "Samples/pixel", td->td_samplesperpixel,
+                         "colorchannels", colorchannels, "Bits/sample",
+                         td->td_bitspersample);
+                return 0;
+            }
+            break;
+        default:
+            snprintf(emsg, EMSG_BUF_SIZE,
+                     "Sorry, can not handle image with %s=%" PRIu16, photoTag,
+                     photometric);
+            return (0);
+    }
+    return (1);
+}
+
+void TIFFRGBAImageEnd(TIFFRGBAImage *img)
+{
+    if (img->Map)
+    {
+        _TIFFfreeExt(img->tif, img->Map);
+        img->Map = NULL;
+    }
+    if (img->BWmap)
+    {
+        _TIFFfreeExt(img->tif, img->BWmap);
+        img->BWmap = NULL;
+    }
+    if (img->PALmap)
+    {
+        _TIFFfreeExt(img->tif, img->PALmap);
+        img->PALmap = NULL;
+    }
+    if (img->ycbcr)
+    {
+        _TIFFfreeExt(img->tif, img->ycbcr);
+        img->ycbcr = NULL;
+    }
+    if (img->cielab)
+    {
+        _TIFFfreeExt(img->tif, img->cielab);
+        img->cielab = NULL;
+    }
+    if (img->UaToAa)
+    {
+        _TIFFfreeExt(img->tif, img->UaToAa);
+        img->UaToAa = NULL;
+    }
+    if (img->Bitdepth16To8)
+    {
+        _TIFFfreeExt(img->tif, img->Bitdepth16To8);
+        img->Bitdepth16To8 = NULL;
+    }
+
+    if (img->redcmap)
+    {
+        _TIFFfreeExt(img->tif, img->redcmap);
+        _TIFFfreeExt(img->tif, img->greencmap);
+        _TIFFfreeExt(img->tif, img->bluecmap);
+        img->redcmap = img->greencmap = img->bluecmap = NULL;
+    }
+}
+
+static int isCCITTCompression(TIFF *tif)
+{
+    uint16_t compress;
     TIFFGetField(tif, TIFFTAG_COMPRESSION, &compress);
     return (compress == COMPRESSION_CCITTFAX3 ||
-	    compress == COMPRESSION_CCITTFAX4 ||
-	    compress == COMPRESSION_CCITTRLE ||
-	    compress == COMPRESSION_CCITTRLEW);
-}
-
-int
-TIFFRGBAImageBegin(TIFFRGBAImage* img, TIFF* tif, int stop, char emsg[1024])
-{
-	uint16* sampleinfo;
-	uint16 extrasamples;
-	uint16 planarconfig;
-	uint16 compress;
-	int colorchannels;
-	uint16 *red_orig, *green_orig, *blue_orig;
-	int n_color;
-	
-	if( !TIFFRGBAImageOK(tif, emsg) )
-		return 0;
-
-	/* Initialize to normal values */
-	img->row_offset = 0;
-	img->col_offset = 0;
-	img->redcmap = NULL;
-	img->greencmap = NULL;
-	img->bluecmap = NULL;
-	img->Map = NULL;
-	img->BWmap = NULL;
-	img->PALmap = NULL;
-	img->ycbcr = NULL;
-	img->cielab = NULL;
-	img->UaToAa = NULL;
-	img->Bitdepth16To8 = NULL;
-	img->req_orientation = ORIENTATION_BOTLEFT;     /* It is the default */
-
-	img->tif = tif;
-	img->stoponerr = stop;
-	TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &img->bitspersample);
-	switch (img->bitspersample) {
-		case 1:
-		case 2:
-		case 4:
-		case 8:
-		case 16:
-			break;
-		default:
-			sprintf(emsg, "Sorry, can not handle images with %d-bit samples",
-			    img->bitspersample);
-			goto fail_return;
-	}
-	img->alpha = 0;
-	TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &img->samplesperpixel);
-	TIFFGetFieldDefaulted(tif, TIFFTAG_EXTRASAMPLES,
-	    &extrasamples, &sampleinfo);
-	if (extrasamples >= 1)
-	{
-		switch (sampleinfo[0]) {
-			case EXTRASAMPLE_UNSPECIFIED:          /* Workaround for some images without */
-				if (img->samplesperpixel > 3)  /* correct info about alpha channel */
-					img->alpha = EXTRASAMPLE_ASSOCALPHA;
-				break;
-			case EXTRASAMPLE_ASSOCALPHA:           /* data is pre-multiplied */
-			case EXTRASAMPLE_UNASSALPHA:           /* data is not pre-multiplied */
-				img->alpha = sampleinfo[0];
-				break;
-		}
-	}
+            compress == COMPRESSION_CCITTFAX4 ||
+            compress == COMPRESSION_CCITTRLE ||
+            compress == COMPRESSION_CCITTRLEW);
+}
+
+int TIFFRGBAImageBegin(TIFFRGBAImage *img, TIFF *tif, int stop,
+                       char emsg[EMSG_BUF_SIZE])
+{
+    uint16_t *sampleinfo;
+    uint16_t extrasamples;
+    uint16_t planarconfig;
+    uint16_t compress;
+    int colorchannels;
+    uint16_t *red_orig, *green_orig, *blue_orig;
+    int n_color;
+
+    if (!TIFFRGBAImageOK(tif, emsg))
+        return 0;
+
+    /* Initialize to normal values */
+    img->row_offset = 0;
+    img->col_offset = 0;
+    img->redcmap = NULL;
+    img->greencmap = NULL;
+    img->bluecmap = NULL;
+    img->Map = NULL;
+    img->BWmap = NULL;
+    img->PALmap = NULL;
+    img->ycbcr = NULL;
+    img->cielab = NULL;
+    img->UaToAa = NULL;
+    img->Bitdepth16To8 = NULL;
+    img->req_orientation = ORIENTATION_BOTLEFT; /* It is the default */
+
+    img->tif = tif;
+    img->stoponerr = stop;
+    TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &img->bitspersample);
+    switch (img->bitspersample)
+    {
+        case 1:
+        case 2:
+        case 4:
+        case 8:
+        case 16:
+            break;
+        default:
+            snprintf(emsg, EMSG_BUF_SIZE,
+                     "Sorry, can not handle images with %" PRIu16
+                     "-bit samples",
+                     img->bitspersample);
+            goto fail_return;
+    }
+    img->alpha = 0;
+    TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &img->samplesperpixel);
+    TIFFGetFieldDefaulted(tif, TIFFTAG_EXTRASAMPLES, &extrasamples,
+                          &sampleinfo);
+    if (extrasamples >= 1)
+    {
+        switch (sampleinfo[0])
+        {
+            case EXTRASAMPLE_UNSPECIFIED: /* Workaround for some images without
+                                           */
+                if (img->samplesperpixel >
+                    3) /* correct info about alpha channel */
+                    img->alpha = EXTRASAMPLE_ASSOCALPHA;
+                break;
+            case EXTRASAMPLE_ASSOCALPHA: /* data is pre-multiplied */
+            case EXTRASAMPLE_UNASSALPHA: /* data is not pre-multiplied */
+                img->alpha = sampleinfo[0];
+                break;
+        }
+    }
 
 #ifdef DEFAULT_EXTRASAMPLE_AS_ALPHA
-	if( !TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &img->photometric))
-		img->photometric = PHOTOMETRIC_MINISWHITE;
-
-	if( extrasamples == 0
-	    && img->samplesperpixel == 4
-	    && img->photometric == PHOTOMETRIC_RGB )
-	{
-		img->alpha = EXTRASAMPLE_ASSOCALPHA;
-		extrasamples = 1;
-	}
+    if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &img->photometric))
+        img->photometric = PHOTOMETRIC_MINISWHITE;
+
+    if (extrasamples == 0 && img->samplesperpixel == 4 &&
+        img->photometric == PHOTOMETRIC_RGB)
+    {
+        img->alpha = EXTRASAMPLE_ASSOCALPHA;
+        extrasamples = 1;
+    }
 #endif
 
-	colorchannels = img->samplesperpixel - extrasamples;
-	TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &compress);
-	TIFFGetFieldDefaulted(tif, TIFFTAG_PLANARCONFIG, &planarconfig);
-	if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &img->photometric)) {
-		switch (colorchannels) {
-			case 1:
-				if (isCCITTCompression(tif))
-					img->photometric = PHOTOMETRIC_MINISWHITE;
-				else
-					img->photometric = PHOTOMETRIC_MINISBLACK;
-				break;
-			case 3:
-				img->photometric = PHOTOMETRIC_RGB;
-				break;
-			default:
-				sprintf(emsg, "Missing needed %s tag", photoTag);
-                                goto fail_return;
-		}
-	}
-	switch (img->photometric) {
-		case PHOTOMETRIC_PALETTE:
-			if (!TIFFGetField(tif, TIFFTAG_COLORMAP,
-			    &red_orig, &green_orig, &blue_orig)) {
-				sprintf(emsg, "Missing required \"Colormap\" tag");
-                                goto fail_return;
-			}
-
-			/* copy the colormaps so we can modify them */
-			n_color = (1U << img->bitspersample);
-			img->redcmap = (uint16 *) _TIFFmalloc(sizeof(uint16)*n_color);
-			img->greencmap = (uint16 *) _TIFFmalloc(sizeof(uint16)*n_color);
-			img->bluecmap = (uint16 *) _TIFFmalloc(sizeof(uint16)*n_color);
-			if( !img->redcmap || !img->greencmap || !img->bluecmap ) {
-				sprintf(emsg, "Out of memory for colormap copy");
-                                goto fail_return;
-			}
-
-			_TIFFmemcpy( img->redcmap, red_orig, n_color * 2 );
-			_TIFFmemcpy( img->greencmap, green_orig, n_color * 2 );
-			_TIFFmemcpy( img->bluecmap, blue_orig, n_color * 2 );
-
-			/* fall through... */
-		case PHOTOMETRIC_MINISWHITE:
-		case PHOTOMETRIC_MINISBLACK:
-			if (planarconfig == PLANARCONFIG_CONTIG
-			    && img->samplesperpixel != 1
-			    && img->bitspersample < 8 ) {
-				sprintf(emsg,
-				    "Sorry, can not handle contiguous data with %s=%d, "
-				    "and %s=%d and Bits/Sample=%d",
-				    photoTag, img->photometric,
-				    "Samples/pixel", img->samplesperpixel,
-				    img->bitspersample);
-                                goto fail_return;
-			}
-			break;
-		case PHOTOMETRIC_YCBCR:
-			/* It would probably be nice to have a reality check here. */
-			if (planarconfig == PLANARCONFIG_CONTIG)
-				/* can rely on libjpeg to convert to RGB */
-				/* XXX should restore current state on exit */
-				switch (compress) {
-					case COMPRESSION_JPEG:
-						/*
-						 * TODO: when complete tests verify complete desubsampling
-						 * and YCbCr handling, remove use of TIFFTAG_JPEGCOLORMODE in
-						 * favor of tif_getimage.c native handling
-						 */
-						TIFFSetField(tif, TIFFTAG_JPEGCOLORMODE, JPEGCOLORMODE_RGB);
-						img->photometric = PHOTOMETRIC_RGB;
-						break;
-					default:
-						/* do nothing */;
-						break;
-				}
-			/*
-			 * TODO: if at all meaningful and useful, make more complete
-			 * support check here, or better still, refactor to let supporting
-			 * code decide whether there is support and what meaningful
-			 * error to return
-			 */
-			break;
-		case PHOTOMETRIC_RGB:
-			if (colorchannels < 3) {
-				sprintf(emsg, "Sorry, can not handle RGB image with %s=%d",
-				    "Color channels", colorchannels);
-                                goto fail_return;
-			}
-			break;
-		case PHOTOMETRIC_SEPARATED:
-			{
-				uint16 inkset;
-				TIFFGetFieldDefaulted(tif, TIFFTAG_INKSET, &inkset);
-				if (inkset != INKSET_CMYK) {
-					sprintf(emsg, "Sorry, can not handle separated image with %s=%d",
-					    "InkSet", inkset);
-                                        goto fail_return;
-				}
-				if (img->samplesperpixel < 4) {
-					sprintf(emsg, "Sorry, can not handle separated image with %s=%d",
-					    "Samples/pixel", img->samplesperpixel);
-                                        goto fail_return;
-				}
-			}
-			break;
-		case PHOTOMETRIC_LOGL:
-			if (compress != COMPRESSION_SGILOG) {
-				sprintf(emsg, "Sorry, LogL data must have %s=%d",
-				    "Compression", COMPRESSION_SGILOG);
-                                goto fail_return;
-			}
-			TIFFSetField(tif, TIFFTAG_SGILOGDATAFMT, SGILOGDATAFMT_8BIT);
-			img->photometric = PHOTOMETRIC_MINISBLACK;	/* little white lie */
-			img->bitspersample = 8;
-			break;
-		case PHOTOMETRIC_LOGLUV:
-			if (compress != COMPRESSION_SGILOG && compress != COMPRESSION_SGILOG24) {
-				sprintf(emsg, "Sorry, LogLuv data must have %s=%d or %d",
-				    "Compression", COMPRESSION_SGILOG, COMPRESSION_SGILOG24);
-                                goto fail_return;
-			}
-			if (planarconfig != PLANARCONFIG_CONTIG) {
-				sprintf(emsg, "Sorry, can not handle LogLuv images with %s=%d",
-				    "Planarconfiguration", planarconfig);
-				return (0);
-			}
-			TIFFSetField(tif, TIFFTAG_SGILOGDATAFMT, SGILOGDATAFMT_8BIT);
-			img->photometric = PHOTOMETRIC_RGB;		/* little white lie */
-			img->bitspersample = 8;
-			break;
-		case PHOTOMETRIC_CIELAB:
-			break;
-		default:
-			sprintf(emsg, "Sorry, can not handle image with %s=%d",
-			    photoTag, img->photometric);
-                        goto fail_return;
-	}
-	TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &img->width);
-	TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &img->height);
-	TIFFGetFieldDefaulted(tif, TIFFTAG_ORIENTATION, &img->orientation);
-	img->isContig =
-	    !(planarconfig == PLANARCONFIG_SEPARATE && img->samplesperpixel > 1);
-	if (img->isContig) {
-		if (!PickContigCase(img)) {
-			sprintf(emsg, "Sorry, can not handle image");
-			goto fail_return;
-		}
-	} else {
-		if (!PickSeparateCase(img)) {
-			sprintf(emsg, "Sorry, can not handle image");
-			goto fail_return;
-		}
-	}
-	return 1;
-
-  fail_return:
-        TIFFRGBAImageEnd( img );
-        return 0;
+    colorchannels = img->samplesperpixel - extrasamples;
+    TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &compress);
+    TIFFGetFieldDefaulted(tif, TIFFTAG_PLANARCONFIG, &planarconfig);
+    if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &img->photometric))
+    {
+        switch (colorchannels)
+        {
+            case 1:
+                if (isCCITTCompression(tif))
+                    img->photometric = PHOTOMETRIC_MINISWHITE;
+                else
+                    img->photometric = PHOTOMETRIC_MINISBLACK;
+                break;
+            case 3:
+                img->photometric = PHOTOMETRIC_RGB;
+                break;
+            default:
+                snprintf(emsg, EMSG_BUF_SIZE, "Missing needed %s tag",
+                         photoTag);
+                goto fail_return;
+        }
+    }
+    switch (img->photometric)
+    {
+        case PHOTOMETRIC_PALETTE:
+            if (!TIFFGetField(tif, TIFFTAG_COLORMAP, &red_orig, &green_orig,
+                              &blue_orig))
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Missing required \"Colormap\" tag");
+                goto fail_return;
+            }
+
+            /* copy the colormaps so we can modify them */
+            n_color = (1U << img->bitspersample);
+            img->redcmap =
+                (uint16_t *)_TIFFmallocExt(tif, sizeof(uint16_t) * n_color);
+            img->greencmap =
+                (uint16_t *)_TIFFmallocExt(tif, sizeof(uint16_t) * n_color);
+            img->bluecmap =
+                (uint16_t *)_TIFFmallocExt(tif, sizeof(uint16_t) * n_color);
+            if (!img->redcmap || !img->greencmap || !img->bluecmap)
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Out of memory for colormap copy");
+                goto fail_return;
+            }
+
+            _TIFFmemcpy(img->redcmap, red_orig, n_color * 2);
+            _TIFFmemcpy(img->greencmap, green_orig, n_color * 2);
+            _TIFFmemcpy(img->bluecmap, blue_orig, n_color * 2);
+
+            /* fall through... */
+        case PHOTOMETRIC_MINISWHITE:
+        case PHOTOMETRIC_MINISBLACK:
+            if (planarconfig == PLANARCONFIG_CONTIG &&
+                img->samplesperpixel != 1 && img->bitspersample < 8)
+            {
+                snprintf(
+                    emsg, EMSG_BUF_SIZE,
+                    "Sorry, can not handle contiguous data with %s=%" PRIu16
+                    ", "
+                    "and %s=%" PRIu16 " and Bits/Sample=%" PRIu16,
+                    photoTag, img->photometric, "Samples/pixel",
+                    img->samplesperpixel, img->bitspersample);
+                goto fail_return;
+            }
+            break;
+        case PHOTOMETRIC_YCBCR:
+            /* It would probably be nice to have a reality check here. */
+            if (planarconfig == PLANARCONFIG_CONTIG)
+                /* can rely on libjpeg to convert to RGB */
+                /* XXX should restore current state on exit */
+                switch (compress)
+                {
+                    case COMPRESSION_JPEG:
+                        /*
+                         * TODO: when complete tests verify complete
+                         * desubsampling and YCbCr handling, remove use of
+                         * TIFFTAG_JPEGCOLORMODE in favor of tif_getimage.c
+                         * native handling
+                         */
+                        TIFFSetField(tif, TIFFTAG_JPEGCOLORMODE,
+                                     JPEGCOLORMODE_RGB);
+                        img->photometric = PHOTOMETRIC_RGB;
+                        break;
+                    default:
+                        /* do nothing */;
+                        break;
+                }
+            /*
+             * TODO: if at all meaningful and useful, make more complete
+             * support check here, or better still, refactor to let supporting
+             * code decide whether there is support and what meaningful
+             * error to return
+             */
+            break;
+        case PHOTOMETRIC_RGB:
+            if (colorchannels < 3)
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Sorry, can not handle RGB image with %s=%d",
+                         "Color channels", colorchannels);
+                goto fail_return;
+            }
+            break;
+        case PHOTOMETRIC_SEPARATED:
+        {
+            uint16_t inkset;
+            TIFFGetFieldDefaulted(tif, TIFFTAG_INKSET, &inkset);
+            if (inkset != INKSET_CMYK)
+            {
+                snprintf(
+                    emsg, EMSG_BUF_SIZE,
+                    "Sorry, can not handle separated image with %s=%" PRIu16,
+                    "InkSet", inkset);
+                goto fail_return;
+            }
+            if (img->samplesperpixel < 4)
+            {
+                snprintf(
+                    emsg, EMSG_BUF_SIZE,
+                    "Sorry, can not handle separated image with %s=%" PRIu16,
+                    "Samples/pixel", img->samplesperpixel);
+                goto fail_return;
+            }
+        }
+        break;
+        case PHOTOMETRIC_LOGL:
+            if (compress != COMPRESSION_SGILOG)
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Sorry, LogL data must have %s=%d", "Compression",
+                         COMPRESSION_SGILOG);
+                goto fail_return;
+            }
+            TIFFSetField(tif, TIFFTAG_SGILOGDATAFMT, SGILOGDATAFMT_8BIT);
+            img->photometric = PHOTOMETRIC_MINISBLACK; /* little white lie */
+            img->bitspersample = 8;
+            break;
+        case PHOTOMETRIC_LOGLUV:
+            if (compress != COMPRESSION_SGILOG &&
+                compress != COMPRESSION_SGILOG24)
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Sorry, LogLuv data must have %s=%d or %d",
+                         "Compression", COMPRESSION_SGILOG,
+                         COMPRESSION_SGILOG24);
+                goto fail_return;
+            }
+            if (planarconfig != PLANARCONFIG_CONTIG)
+            {
+                snprintf(emsg, EMSG_BUF_SIZE,
+                         "Sorry, can not handle LogLuv images with %s=%" PRIu16,
+                         "Planarconfiguration", planarconfig);
+                return (0);
+            }
+            TIFFSetField(tif, TIFFTAG_SGILOGDATAFMT, SGILOGDATAFMT_8BIT);
+            img->photometric = PHOTOMETRIC_RGB; /* little white lie */
+            img->bitspersample = 8;
+            break;
+        case PHOTOMETRIC_CIELAB:
+            break;
+        default:
+            snprintf(emsg, EMSG_BUF_SIZE,
+                     "Sorry, can not handle image with %s=%" PRIu16, photoTag,
+                     img->photometric);
+            goto fail_return;
+    }
+    TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &img->width);
+    TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &img->height);
+    TIFFGetFieldDefaulted(tif, TIFFTAG_ORIENTATION, &img->orientation);
+    img->isContig =
+        !(planarconfig == PLANARCONFIG_SEPARATE && img->samplesperpixel > 1);
+    if (img->isContig)
+    {
+        if (!PickContigCase(img))
+        {
+            snprintf(emsg, EMSG_BUF_SIZE, "Sorry, can not handle image");
+            goto fail_return;
+        }
+    }
+    else
+    {
+        if (!PickSeparateCase(img))
+        {
+            snprintf(emsg, EMSG_BUF_SIZE, "Sorry, can not handle image");
+            goto fail_return;
+        }
+    }
+    return 1;
+
+fail_return:
+    TIFFRGBAImageEnd(img);
+    return 0;
 }
 
-int
-TIFFRGBAImageGet(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h)
+int TIFFRGBAImageGet(TIFFRGBAImage *img, uint32_t *raster, uint32_t w,
+                     uint32_t h)
 {
-    if (img->get == NULL) {
-		TIFFErrorExt(img->tif->tif_clientdata, TIFFFileName(img->tif), "No \"get\" routine setup");
-		return (0);
-	}
-	if (img->put.any == NULL) {
-		TIFFErrorExt(img->tif->tif_clientdata, TIFFFileName(img->tif),
-		"No \"put\" routine setupl; probably can not handle image format");
-		return (0);
+    if (img->get == NULL)
+    {
+        TIFFErrorExtR(img->tif, TIFFFileName(img->tif),
+                      "No \"get\" routine setup");
+        return (0);
+    }
+    if (img->put.any == NULL)
+    {
+        TIFFErrorExtR(
+            img->tif, TIFFFileName(img->tif),
+            "No \"put\" routine setupl; probably can not handle image format");
+        return (0);
     }
     return (*img->get)(img, raster, w, h);
 }
@@ -516,24 +607,25 @@ TIFFRGBAImageGet(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h)
  * Read the specified image into an ABGR-format rastertaking in account
  * specified orientation.
  */
-int
-TIFFReadRGBAImageOriented(TIFF* tif,
-			  uint32 rwidth, uint32 rheight, uint32* raster,
-			  int orientation, int stop)
+int TIFFReadRGBAImageOriented(TIFF *tif, uint32_t rwidth, uint32_t rheight,
+                              uint32_t *raster, int orientation, int stop)
 {
-    char emsg[1024] = "";
+    char emsg[EMSG_BUF_SIZE] = "";
     TIFFRGBAImage img;
     int ok;
 
-	if (TIFFRGBAImageOK(tif, emsg) && TIFFRGBAImageBegin(&img, tif, stop, emsg)) {
-		img.req_orientation = (uint16)orientation;
-		/* XXX verify rwidth and rheight against width and height */
-		ok = TIFFRGBAImageGet(&img, raster+(rheight-img.height)*rwidth,
-			rwidth, img.height);
-		TIFFRGBAImageEnd(&img);
-	} else {
-		TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "%s", emsg);
-		ok = 0;
+    if (TIFFRGBAImageOK(tif, emsg) && TIFFRGBAImageBegin(&img, tif, stop, emsg))
+    {
+        img.req_orientation = (uint16_t)orientation;
+        /* XXX verify rwidth and rheight against width and height */
+        ok = TIFFRGBAImageGet(&img, raster + (rheight - img.height) * rwidth,
+                              rwidth, img.height);
+        TIFFRGBAImageEnd(&img);
+    }
+    else
+    {
+        TIFFErrorExtR(tif, TIFFFileName(tif), "%s", emsg);
+        ok = 0;
     }
     return (ok);
 }
@@ -542,73 +634,72 @@ TIFFReadRGBAImageOriented(TIFF* tif,
  * Read the specified image into an ABGR-format raster. Use bottom left
  * origin for raster by default.
  */
-int
-TIFFReadRGBAImage(TIFF* tif,
-		  uint32 rwidth, uint32 rheight, uint32* raster, int stop)
-{
-	return TIFFReadRGBAImageOriented(tif, rwidth, rheight, raster,
-					 ORIENTATION_BOTLEFT, stop);
-}
-
-static int 
-setorientation(TIFFRGBAImage* img)
-{
-	switch (img->orientation) {
-		case ORIENTATION_TOPLEFT:
-		case ORIENTATION_LEFTTOP:
-			if (img->req_orientation == ORIENTATION_TOPRIGHT ||
-			    img->req_orientation == ORIENTATION_RIGHTTOP)
-				return FLIP_HORIZONTALLY;
-			else if (img->req_orientation == ORIENTATION_BOTRIGHT ||
-			    img->req_orientation == ORIENTATION_RIGHTBOT)
-				return FLIP_HORIZONTALLY | FLIP_VERTICALLY;
-			else if (img->req_orientation == ORIENTATION_BOTLEFT ||
-			    img->req_orientation == ORIENTATION_LEFTBOT)
-				return FLIP_VERTICALLY;
-			else
-				return 0;
-		case ORIENTATION_TOPRIGHT:
-		case ORIENTATION_RIGHTTOP:
-			if (img->req_orientation == ORIENTATION_TOPLEFT ||
-			    img->req_orientation == ORIENTATION_LEFTTOP)
-				return FLIP_HORIZONTALLY;
-			else if (img->req_orientation == ORIENTATION_BOTRIGHT ||
-			    img->req_orientation == ORIENTATION_RIGHTBOT)
-				return FLIP_VERTICALLY;
-			else if (img->req_orientation == ORIENTATION_BOTLEFT ||
-			    img->req_orientation == ORIENTATION_LEFTBOT)
-				return FLIP_HORIZONTALLY | FLIP_VERTICALLY;
-			else
-				return 0;
-		case ORIENTATION_BOTRIGHT:
-		case ORIENTATION_RIGHTBOT:
-			if (img->req_orientation == ORIENTATION_TOPLEFT ||
-			    img->req_orientation == ORIENTATION_LEFTTOP)
-				return FLIP_HORIZONTALLY | FLIP_VERTICALLY;
-			else if (img->req_orientation == ORIENTATION_TOPRIGHT ||
-			    img->req_orientation == ORIENTATION_RIGHTTOP)
-				return FLIP_VERTICALLY;
-			else if (img->req_orientation == ORIENTATION_BOTLEFT ||
-			    img->req_orientation == ORIENTATION_LEFTBOT)
-				return FLIP_HORIZONTALLY;
-			else
-				return 0;
-		case ORIENTATION_BOTLEFT:
-		case ORIENTATION_LEFTBOT:
-			if (img->req_orientation == ORIENTATION_TOPLEFT ||
-			    img->req_orientation == ORIENTATION_LEFTTOP)
-				return FLIP_VERTICALLY;
-			else if (img->req_orientation == ORIENTATION_TOPRIGHT ||
-			    img->req_orientation == ORIENTATION_RIGHTTOP)
-				return FLIP_HORIZONTALLY | FLIP_VERTICALLY;
-			else if (img->req_orientation == ORIENTATION_BOTRIGHT ||
-			    img->req_orientation == ORIENTATION_RIGHTBOT)
-				return FLIP_HORIZONTALLY;
-			else
-				return 0;
-		default:	/* NOTREACHED */
-			return 0;
-	}
+int TIFFReadRGBAImage(TIFF *tif, uint32_t rwidth, uint32_t rheight,
+                      uint32_t *raster, int stop)
+{
+    return TIFFReadRGBAImageOriented(tif, rwidth, rheight, raster,
+                                     ORIENTATION_BOTLEFT, stop);
+}
+
+static int setorientation(TIFFRGBAImage *img)
+{
+    switch (img->orientation)
+    {
+        case ORIENTATION_TOPLEFT:
+        case ORIENTATION_LEFTTOP:
+            if (img->req_orientation == ORIENTATION_TOPRIGHT ||
+                img->req_orientation == ORIENTATION_RIGHTTOP)
+                return FLIP_HORIZONTALLY;
+            else if (img->req_orientation == ORIENTATION_BOTRIGHT ||
+                     img->req_orientation == ORIENTATION_RIGHTBOT)
+                return FLIP_HORIZONTALLY | FLIP_VERTICALLY;
+            else if (img->req_orientation == ORIENTATION_BOTLEFT ||
+                     img->req_orientation == ORIENTATION_LEFTBOT)
+                return FLIP_VERTICALLY;
+            else
+                return 0;
+        case ORIENTATION_TOPRIGHT:
+        case ORIENTATION_RIGHTTOP:
+            if (img->req_orientation == ORIENTATION_TOPLEFT ||
+                img->req_orientation == ORIENTATION_LEFTTOP)
+                return FLIP_HORIZONTALLY;
+            else if (img->req_orientation == ORIENTATION_BOTRIGHT ||
+                     img->req_orientation == ORIENTATION_RIGHTBOT)
+                return FLIP_VERTICALLY;
+            else if (img->req_orientation == ORIENTATION_BOTLEFT ||
+                     img->req_orientation == ORIENTATION_LEFTBOT)
+                return FLIP_HORIZONTALLY | FLIP_VERTICALLY;
+            else
+                return 0;
+        case ORIENTATION_BOTRIGHT:
+        case ORIENTATION_RIGHTBOT:
+            if (img->req_orientation == ORIENTATION_TOPLEFT ||
+                img->req_orientation == ORIENTATION_LEFTTOP)
+                return FLIP_HORIZONTALLY | FLIP_VERTICALLY;
+            else if (img->req_orientation == ORIENTATION_TOPRIGHT ||
+                     img->req_orientation == ORIENTATION_RIGHTTOP)
+                return FLIP_VERTICALLY;
+            else if (img->req_orientation == ORIENTATION_BOTLEFT ||
+                     img->req_orientation == ORIENTATION_LEFTBOT)
+                return FLIP_HORIZONTALLY;
+            else
+                return 0;
+        case ORIENTATION_BOTLEFT:
+        case ORIENTATION_LEFTBOT:
+            if (img->req_orientation == ORIENTATION_TOPLEFT ||
+                img->req_orientation == ORIENTATION_LEFTTOP)
+                return FLIP_VERTICALLY;
+            else if (img->req_orientation == ORIENTATION_TOPRIGHT ||
+                     img->req_orientation == ORIENTATION_RIGHTTOP)
+                return FLIP_HORIZONTALLY | FLIP_VERTICALLY;
+            else if (img->req_orientation == ORIENTATION_BOTRIGHT ||
+                     img->req_orientation == ORIENTATION_RIGHTBOT)
+                return FLIP_HORIZONTALLY;
+            else
+                return 0;
+        default: /* NOTREACHED */
+            return 0;
+    }
 }
 
 /*
@@ -616,28 +707,29 @@ setorientation(TIFFRGBAImage* img)
  *	PlanarConfiguration contiguous if SamplesPerPixel > 1
  * or
  *	SamplesPerPixel == 1
- */	
-static int
-gtTileContig(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h)
+ */
+static int gtTileContig(TIFFRGBAImage *img, uint32_t *raster, uint32_t w,
+                        uint32_t h)
 {
-    TIFF* tif = img->tif;
+    TIFF *tif = img->tif;
     tileContigRoutine put = img->put.contig;
-    uint32 col, row, y, rowstoread;
+    uint32_t col, row, y, rowstoread;
     tmsize_t pos;
-    uint32 tw, th;
-    unsigned char* buf = NULL;
-    int32 fromskew, toskew;
-    uint32 nrow;
+    uint32_t tw, th;
+    unsigned char *buf = NULL;
+    int32_t fromskew, toskew;
+    uint32_t nrow;
     int ret = 1, flip;
-    uint32 this_tw, tocol;
-    int32 this_toskew, leftmost_toskew;
-    int32 leftmost_fromskew;
-    uint32 leftmost_tw;
+    uint32_t this_tw, tocol;
+    int32_t this_toskew, leftmost_toskew;
+    int32_t leftmost_fromskew;
+    uint32_t leftmost_tw;
     tmsize_t bufsize;
 
     bufsize = TIFFTileSize(tif);
-    if (bufsize == 0) {
-        TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "%s", "No space for tile buffer");
+    if (bufsize == 0)
+    {
+        TIFFErrorExtR(tif, TIFFFileName(tif), "%s", "No space for tile buffer");
         return (0);
     }
 
@@ -645,23 +737,29 @@ gtTileContig(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h)
     TIFFGetField(tif, TIFFTAG_TILELENGTH, &th);
 
     flip = setorientation(img);
-    if (flip & FLIP_VERTICALLY) {
-        if ((tw + w) > INT_MAX) {
-            TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "%s", "unsupported tile size (too wide)");
+    if (flip & FLIP_VERTICALLY)
+    {
+        if ((tw + w) > INT_MAX)
+        {
+            TIFFErrorExtR(tif, TIFFFileName(tif), "%s",
+                          "unsupported tile size (too wide)");
             return (0);
         }
         y = h - 1;
-        toskew = -(int32)(tw + w);
+        toskew = -(int32_t)(tw + w);
     }
-    else {
-        if (tw > (INT_MAX + w)) {
-            TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "%s", "unsupported tile size (too wide)");
+    else
+    {
+        if (tw > (INT_MAX + w))
+        {
+            TIFFErrorExtR(tif, TIFFFileName(tif), "%s",
+                          "unsupported tile size (too wide)");
             return (0);
         }
         y = 0;
-        toskew = -(int32)(tw - w);
+        toskew = -(int32_t)(tw - w);
     }
-     
+
     /*
      *	Leftmost tile is clipped on left side if col_offset > 0.
      */
@@ -671,62 +769,69 @@ gtTileContig(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h)
     for (row = 0; ret != 0 && row < h; row += nrow)
     {
         rowstoread = th - (row + img->row_offset) % th;
-    	nrow = (row + rowstoread > h ? h - row : rowstoread);
-	fromskew = leftmost_fromskew;
-	this_tw = leftmost_tw;
-	this_toskew = leftmost_toskew;
-	tocol = 0;
-	col = img->col_offset;
-	while (tocol < w)
+        nrow = (row + rowstoread > h ? h - row : rowstoread);
+        fromskew = leftmost_fromskew;
+        this_tw = leftmost_tw;
+        this_toskew = leftmost_toskew;
+        tocol = 0;
+        col = img->col_offset;
+        while (tocol < w)
         {
-	    if (_TIFFReadTileAndAllocBuffer(tif, (void**) &buf, bufsize, col,
-			     row+img->row_offset, 0, 0)==(tmsize_t)(-1) &&
+            if (_TIFFReadTileAndAllocBuffer(tif, (void **)&buf, bufsize, col,
+                                            row + img->row_offset, 0,
+                                            0) == (tmsize_t)(-1) &&
                 (buf == NULL || img->stoponerr))
             {
                 ret = 0;
                 break;
             }
-            pos = ((row+img->row_offset) % th) * TIFFTileRowSize(tif) + \
-		   ((tmsize_t) fromskew * img->samplesperpixel);
-	    if (tocol + this_tw > w) 
-	    {
-		/*
-		 * Rightmost tile is clipped on right side.
-		 */
-		fromskew = tw - (w - tocol);
-		this_tw = tw - fromskew;
-		this_toskew = toskew + fromskew;
-	    }
-	    (*put)(img, raster+y*w+tocol, tocol, y, this_tw, nrow, fromskew, this_toskew, buf + pos);
-	    tocol += this_tw;
-	    col += this_tw;
-	    /*
-	     * After the leftmost tile, tiles are no longer clipped on left side.
-	     */
-	    fromskew = 0;
-	    this_tw = tw;
-	    this_toskew = toskew;
-	}
-
-        y += ((flip & FLIP_VERTICALLY) ? -(int32) nrow : (int32) nrow);
-    }
-    _TIFFfree(buf);
-
-    if (flip & FLIP_HORIZONTALLY) {
-	    uint32 line;
-
-	    for (line = 0; line < h; line++) {
-		    uint32 *left = raster + (line * w);
-		    uint32 *right = left + w - 1;
-		    
-		    while ( left < right ) {
-			    uint32 temp = *left;
-			    *left = *right;
-			    *right = temp;
-			    left++;
-				right--;
-		    }
-	    }
+            pos = ((row + img->row_offset) % th) * TIFFTileRowSize(tif) +
+                  ((tmsize_t)fromskew * img->samplesperpixel);
+            if (tocol + this_tw > w)
+            {
+                /*
+                 * Rightmost tile is clipped on right side.
+                 */
+                fromskew = tw - (w - tocol);
+                this_tw = tw - fromskew;
+                this_toskew = toskew + fromskew;
+            }
+            tmsize_t roffset = (tmsize_t)y * w + tocol;
+            (*put)(img, raster + roffset, tocol, y, this_tw, nrow, fromskew,
+                   this_toskew, buf + pos);
+            tocol += this_tw;
+            col += this_tw;
+            /*
+             * After the leftmost tile, tiles are no longer clipped on left
+             * side.
+             */
+            fromskew = 0;
+            this_tw = tw;
+            this_toskew = toskew;
+        }
+
+        y += ((flip & FLIP_VERTICALLY) ? -(int32_t)nrow : (int32_t)nrow);
+    }
+    _TIFFfreeExt(img->tif, buf);
+
+    if (flip & FLIP_HORIZONTALLY)
+    {
+        uint32_t line;
+
+        for (line = 0; line < h; line++)
+        {
+            uint32_t *left = raster + (line * w);
+            uint32_t *right = left + w - 1;
+
+            while (left < right)
+            {
+                uint32_t temp = *left;
+                *left = *right;
+                *right = temp;
+                left++;
+                right--;
+            }
+        }
     }
 
     return (ret);
@@ -737,188 +842,203 @@ gtTileContig(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h)
  *	 SamplesPerPixel > 1
  *	 PlanarConfiguration separated
  * We assume that all such images are RGB.
- */	
-static int
-gtTileSeparate(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h)
-{
-	TIFF* tif = img->tif;
-	tileSeparateRoutine put = img->put.separate;
-	uint32 col, row, y, rowstoread;
-	tmsize_t pos;
-	uint32 tw, th;
-	unsigned char* buf = NULL;
-	unsigned char* p0 = NULL;
-	unsigned char* p1 = NULL;
-	unsigned char* p2 = NULL;
-	unsigned char* pa = NULL;
-	tmsize_t tilesize;
-	tmsize_t bufsize;
-	int32 fromskew, toskew;
-	int alpha = img->alpha;
-	uint32 nrow;
-	int ret = 1, flip;
-        uint16 colorchannels;
-	uint32 this_tw, tocol;
-	int32 this_toskew, leftmost_toskew;
-	int32 leftmost_fromskew;
-	uint32 leftmost_tw;
-
-	tilesize = TIFFTileSize(tif);  
-	bufsize = _TIFFMultiplySSize(tif, alpha?4:3,tilesize, "gtTileSeparate");
-	if (bufsize == 0) {
-		return (0);
-	}
-
-	TIFFGetField(tif, TIFFTAG_TILEWIDTH, &tw);
-	TIFFGetField(tif, TIFFTAG_TILELENGTH, &th);
-
-	flip = setorientation(img);
-	if (flip & FLIP_VERTICALLY) {
-		if ((tw + w) > INT_MAX) {
-            TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "%s", "unsupported tile size (too wide)");
+ */
+static int gtTileSeparate(TIFFRGBAImage *img, uint32_t *raster, uint32_t w,
+                          uint32_t h)
+{
+    TIFF *tif = img->tif;
+    tileSeparateRoutine put = img->put.separate;
+    uint32_t col, row, y, rowstoread;
+    tmsize_t pos;
+    uint32_t tw, th;
+    unsigned char *buf = NULL;
+    unsigned char *p0 = NULL;
+    unsigned char *p1 = NULL;
+    unsigned char *p2 = NULL;
+    unsigned char *pa = NULL;
+    tmsize_t tilesize;
+    tmsize_t bufsize;
+    int32_t fromskew, toskew;
+    int alpha = img->alpha;
+    uint32_t nrow;
+    int ret = 1, flip;
+    uint16_t colorchannels;
+    uint32_t this_tw, tocol;
+    int32_t this_toskew, leftmost_toskew;
+    int32_t leftmost_fromskew;
+    uint32_t leftmost_tw;
+
+    tilesize = TIFFTileSize(tif);
+    bufsize =
+        _TIFFMultiplySSize(tif, alpha ? 4 : 3, tilesize, "gtTileSeparate");
+    if (bufsize == 0)
+    {
+        return (0);
+    }
+
+    TIFFGetField(tif, TIFFTAG_TILEWIDTH, &tw);
+    TIFFGetField(tif, TIFFTAG_TILELENGTH, &th);
+
+    flip = setorientation(img);
+    if (flip & FLIP_VERTICALLY)
+    {
+        if ((tw + w) > INT_MAX)
+        {
+            TIFFErrorExtR(tif, TIFFFileName(tif), "%s",
+                          "unsupported tile size (too wide)");
             return (0);
         }
-		y = h - 1;
-		toskew = -(int32)(tw + w);
-	}
-	else {
-		if (tw > (INT_MAX + w)) {
-            TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "%s", "unsupported tile size (too wide)");
+        y = h - 1;
+        toskew = -(int32_t)(tw + w);
+    }
+    else
+    {
+        if (tw > (INT_MAX + w))
+        {
+            TIFFErrorExtR(tif, TIFFFileName(tif), "%s",
+                          "unsupported tile size (too wide)");
             return (0);
         }
-		y = 0;
-		toskew = -(int32)(tw - w);
-	}
+        y = 0;
+        toskew = -(int32_t)(tw - w);
+    }
 
-        switch( img->photometric )
-        {
-          case PHOTOMETRIC_MINISWHITE:
-          case PHOTOMETRIC_MINISBLACK:
-          case PHOTOMETRIC_PALETTE:
+    switch (img->photometric)
+    {
+        case PHOTOMETRIC_MINISWHITE:
+        case PHOTOMETRIC_MINISBLACK:
+        case PHOTOMETRIC_PALETTE:
             colorchannels = 1;
             break;
 
-          default:
+        default:
             colorchannels = 3;
             break;
+    }
+
+    /*
+     *	Leftmost tile is clipped on left side if col_offset > 0.
+     */
+    leftmost_fromskew = img->col_offset % tw;
+    leftmost_tw = tw - leftmost_fromskew;
+    leftmost_toskew = toskew + leftmost_fromskew;
+    for (row = 0; ret != 0 && row < h; row += nrow)
+    {
+        rowstoread = th - (row + img->row_offset) % th;
+        nrow = (row + rowstoread > h ? h - row : rowstoread);
+        fromskew = leftmost_fromskew;
+        this_tw = leftmost_tw;
+        this_toskew = leftmost_toskew;
+        tocol = 0;
+        col = img->col_offset;
+        while (tocol < w)
+        {
+            if (buf == NULL)
+            {
+                if (_TIFFReadTileAndAllocBuffer(tif, (void **)&buf, bufsize,
+                                                col, row + img->row_offset, 0,
+                                                0) == (tmsize_t)(-1) &&
+                    (buf == NULL || img->stoponerr))
+                {
+                    ret = 0;
+                    break;
+                }
+                p0 = buf;
+                if (colorchannels == 1)
+                {
+                    p2 = p1 = p0;
+                    pa = (alpha ? (p0 + 3 * tilesize) : NULL);
+                }
+                else
+                {
+                    p1 = p0 + tilesize;
+                    p2 = p1 + tilesize;
+                    pa = (alpha ? (p2 + tilesize) : NULL);
+                }
+            }
+            else if (TIFFReadTile(tif, p0, col, row + img->row_offset, 0, 0) ==
+                         (tmsize_t)(-1) &&
+                     img->stoponerr)
+            {
+                ret = 0;
+                break;
+            }
+            if (colorchannels > 1 &&
+                TIFFReadTile(tif, p1, col, row + img->row_offset, 0, 1) ==
+                    (tmsize_t)(-1) &&
+                img->stoponerr)
+            {
+                ret = 0;
+                break;
+            }
+            if (colorchannels > 1 &&
+                TIFFReadTile(tif, p2, col, row + img->row_offset, 0, 2) ==
+                    (tmsize_t)(-1) &&
+                img->stoponerr)
+            {
+                ret = 0;
+                break;
+            }
+            if (alpha &&
+                TIFFReadTile(tif, pa, col, row + img->row_offset, 0,
+                             colorchannels) == (tmsize_t)(-1) &&
+                img->stoponerr)
+            {
+                ret = 0;
+                break;
+            }
+
+            pos = ((row + img->row_offset) % th) * TIFFTileRowSize(tif) +
+                  ((tmsize_t)fromskew * img->samplesperpixel);
+            if (tocol + this_tw > w)
+            {
+                /*
+                 * Rightmost tile is clipped on right side.
+                 */
+                fromskew = tw - (w - tocol);
+                this_tw = tw - fromskew;
+                this_toskew = toskew + fromskew;
+            }
+            tmsize_t roffset = (tmsize_t)y * w + tocol;
+            (*put)(img, raster + roffset, tocol, y, this_tw, nrow, fromskew,
+                   this_toskew, p0 + pos, p1 + pos, p2 + pos,
+                   (alpha ? (pa + pos) : NULL));
+            tocol += this_tw;
+            col += this_tw;
+            /*
+             * After the leftmost tile, tiles are no longer clipped on left
+             * side.
+             */
+            fromskew = 0;
+            this_tw = tw;
+            this_toskew = toskew;
         }
 
-	/*
-	 *	Leftmost tile is clipped on left side if col_offset > 0.
-	 */
-	leftmost_fromskew = img->col_offset % tw;
-	leftmost_tw = tw - leftmost_fromskew;
-	leftmost_toskew = toskew + leftmost_fromskew;
-	for (row = 0; ret != 0 && row < h; row += nrow)
-	{
-		rowstoread = th - (row + img->row_offset) % th;
-		nrow = (row + rowstoread > h ? h - row : rowstoread);
-		fromskew = leftmost_fromskew;
-		this_tw = leftmost_tw;
-		this_toskew = leftmost_toskew;
-		tocol = 0;
-		col = img->col_offset;
-		while (tocol < w)
-		{
-                        if( buf == NULL )
-                        {
-                            if (_TIFFReadTileAndAllocBuffer(
-                                    tif, (void**) &buf, bufsize, col,
-                                    row+img->row_offset,0,0)==(tmsize_t)(-1)
-                                && (buf == NULL || img->stoponerr))
-                            {
-                                    ret = 0;
-                                    break;
-                            }
-                            p0 = buf;
-                            if( colorchannels == 1 )
-                            {
-                                p2 = p1 = p0;
-                                pa = (alpha?(p0+3*tilesize):NULL);
-                            }
-                            else
-                            {
-                                p1 = p0 + tilesize;
-                                p2 = p1 + tilesize;
-                                pa = (alpha?(p2+tilesize):NULL);
-                            }
-                        }
-			else if (TIFFReadTile(tif, p0, col,  
-			    row+img->row_offset,0,0)==(tmsize_t)(-1) && img->stoponerr)
-			{
-				ret = 0;
-				break;
-			}
-			if (colorchannels > 1 
-                            && TIFFReadTile(tif, p1, col,  
-                                            row+img->row_offset,0,1) == (tmsize_t)(-1) 
-                            && img->stoponerr)
-			{
-				ret = 0;
-				break;
-			}
-			if (colorchannels > 1 
-                            && TIFFReadTile(tif, p2, col,  
-                                            row+img->row_offset,0,2) == (tmsize_t)(-1) 
-                            && img->stoponerr)
-			{
-				ret = 0;
-				break;
-			}
-			if (alpha
-                            && TIFFReadTile(tif,pa,col,  
-                                            row+img->row_offset,0,colorchannels) == (tmsize_t)(-1) 
-                            && img->stoponerr)
-                        {
-                            ret = 0;
-                            break;
-			}
-
-			pos = ((row+img->row_offset) % th) * TIFFTileRowSize(tif) + \
-			   ((tmsize_t) fromskew * img->samplesperpixel);
-			if (tocol + this_tw > w) 
-			{
-				/*
-				 * Rightmost tile is clipped on right side.
-				 */
-				fromskew = tw - (w - tocol);
-				this_tw = tw - fromskew;
-				this_toskew = toskew + fromskew;
-			}
-			(*put)(img, raster+y*w+tocol, tocol, y, this_tw, nrow, fromskew, this_toskew, \
-				p0 + pos, p1 + pos, p2 + pos, (alpha?(pa+pos):NULL));
-			tocol += this_tw;
-			col += this_tw;
-			/*
-			* After the leftmost tile, tiles are no longer clipped on left side.
-			*/
-			fromskew = 0;
-			this_tw = tw;
-			this_toskew = toskew;
-		}
-
-		y += ((flip & FLIP_VERTICALLY) ?-(int32) nrow : (int32) nrow);
-	}
-
-	if (flip & FLIP_HORIZONTALLY) {
-		uint32 line;
-
-		for (line = 0; line < h; line++) {
-			uint32 *left = raster + (line * w);
-			uint32 *right = left + w - 1;
-
-			while ( left < right ) {
-				uint32 temp = *left;
-				*left = *right;
-				*right = temp;
-				left++;
-				right--;
-			}
-		}
-	}
-
-	_TIFFfree(buf);
-	return (ret);
+        y += ((flip & FLIP_VERTICALLY) ? -(int32_t)nrow : (int32_t)nrow);
+    }
+
+    if (flip & FLIP_HORIZONTALLY)
+    {
+        uint32_t line;
+
+        for (line = 0; line < h; line++)
+        {
+            uint32_t *left = raster + (line * w);
+            uint32_t *right = left + w - 1;
+
+            while (left < right)
+            {
+                uint32_t temp = *left;
+                *left = *right;
+                *right = temp;
+                left++;
+                right--;
+            }
+        }
+    }
+
+    _TIFFfreeExt(img->tif, buf);
+    return (ret);
 }
 
 /*
@@ -926,98 +1046,110 @@ gtTileSeparate(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h)
  *	PlanarConfiguration contiguous if SamplesPerPixel > 1
  * or
  *	SamplesPerPixel == 1
- */	
-static int
-gtStripContig(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h)
-{
-	TIFF* tif = img->tif;
-	tileContigRoutine put = img->put.contig;
-	uint32 row, y, nrow, nrowsub, rowstoread;
-	tmsize_t pos;
-	unsigned char* buf = NULL;
-	uint32 rowsperstrip;
-	uint16 subsamplinghor,subsamplingver;
-	uint32 imagewidth = img->width;
-	tmsize_t scanline;
-	int32 fromskew, toskew;
-	int ret = 1, flip;
-        tmsize_t maxstripsize;
-
-	TIFFGetFieldDefaulted(tif, TIFFTAG_YCBCRSUBSAMPLING, &subsamplinghor, &subsamplingver);
-	if( subsamplingver == 0 ) {
-		TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "Invalid vertical YCbCr subsampling");
-		return (0);
-	}
-	
-	maxstripsize = TIFFStripSize(tif);
-
-	flip = setorientation(img);
-	if (flip & FLIP_VERTICALLY) {
-		if ( w > INT_MAX ) {
-        	TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "Width overflow");
-			return (0);
-		}
-		y = h - 1;
-		toskew = -(int32)(w + w);
-	} else {
-		y = 0;
-		toskew = -(int32)(w - w);
-	}
-
-	TIFFGetFieldDefaulted(tif, TIFFTAG_ROWSPERSTRIP, &rowsperstrip);
-
-	scanline = TIFFScanlineSize(tif);
-	fromskew = (w < imagewidth ? imagewidth - w : 0);
-	for (row = 0; row < h; row += nrow)
-	{
-		uint32 temp;
-		rowstoread = rowsperstrip - (row + img->row_offset) % rowsperstrip;
-		nrow = (row + rowstoread > h ? h - row : rowstoread);
-		nrowsub = nrow;
-		if ((nrowsub%subsamplingver)!=0)
-			nrowsub+=subsamplingver-nrowsub%subsamplingver;
-		temp = (row + img->row_offset)%rowsperstrip + nrowsub;
-		if( scanline > 0 && temp > (size_t)(TIFF_TMSIZE_T_MAX / scanline) )
-		{
-			TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "Integer overflow in gtStripContig");
-			return 0;
-		}
-		if (_TIFFReadEncodedStripAndAllocBuffer(tif,
-		    TIFFComputeStrip(tif,row+img->row_offset, 0),
-		    (void**)(&buf),
-                    maxstripsize,
-		    temp * scanline)==(tmsize_t)(-1)
-		    && (buf == NULL || img->stoponerr))
-		{
-			ret = 0;
-			break;
-		}
-
-		pos = ((row + img->row_offset) % rowsperstrip) * scanline + \
-			((tmsize_t) img->col_offset * img->samplesperpixel);
-		(*put)(img, raster+y*w, 0, y, w, nrow, fromskew, toskew, buf + pos);
-		y += ((flip & FLIP_VERTICALLY) ? -(int32) nrow : (int32) nrow);
-	}
-
-	if (flip & FLIP_HORIZONTALLY) {
-		uint32 line;
-
-		for (line = 0; line < h; line++) {
-			uint32 *left = raster + (line * w);
-			uint32 *right = left + w - 1;
-
-			while ( left < right ) {
-				uint32 temp = *left;
-				*left = *right;
-				*right = temp;
-				left++;
-				right--;
-			}
-		}
-	}
-
-	_TIFFfree(buf);
-	return (ret);
+ */
+static int gtStripContig(TIFFRGBAImage *img, uint32_t *raster, uint32_t w,
+                         uint32_t h)
+{
+    TIFF *tif = img->tif;
+    tileContigRoutine put = img->put.contig;
+    uint32_t row, y, nrow, nrowsub, rowstoread;
+    tmsize_t pos;
+    unsigned char *buf = NULL;
+    uint32_t rowsperstrip;
+    uint16_t subsamplinghor, subsamplingver;
+    uint32_t imagewidth = img->width;
+    tmsize_t scanline;
+    int32_t fromskew, toskew;
+    int ret = 1, flip;
+    tmsize_t maxstripsize;
+
+    TIFFGetFieldDefaulted(tif, TIFFTAG_YCBCRSUBSAMPLING, &subsamplinghor,
+                          &subsamplingver);
+    if (subsamplingver == 0)
+    {
+        TIFFErrorExtR(tif, TIFFFileName(tif),
+                      "Invalid vertical YCbCr subsampling");
+        return (0);
+    }
+
+    maxstripsize = TIFFStripSize(tif);
+
+    flip = setorientation(img);
+    if (flip & FLIP_VERTICALLY)
+    {
+        if (w > INT_MAX)
+        {
+            TIFFErrorExtR(tif, TIFFFileName(tif), "Width overflow");
+            return (0);
+        }
+        y = h - 1;
+        toskew = -(int32_t)(w + w);
+    }
+    else
+    {
+        y = 0;
+        toskew = -(int32_t)(w - w);
+    }
+
+    TIFFGetFieldDefaulted(tif, TIFFTAG_ROWSPERSTRIP, &rowsperstrip);
+
+    scanline = TIFFScanlineSize(tif);
+    fromskew = (w < imagewidth ? imagewidth - w : 0);
+    for (row = 0; row < h; row += nrow)
+    {
+        uint32_t temp;
+        rowstoread = rowsperstrip - (row + img->row_offset) % rowsperstrip;
+        nrow = (row + rowstoread > h ? h - row : rowstoread);
+        nrowsub = nrow;
+        if ((nrowsub % subsamplingver) != 0)
+            nrowsub += subsamplingver - nrowsub % subsamplingver;
+        temp = (row + img->row_offset) % rowsperstrip + nrowsub;
+        if (scanline > 0 && temp > (size_t)(TIFF_TMSIZE_T_MAX / scanline))
+        {
+            TIFFErrorExtR(tif, TIFFFileName(tif),
+                          "Integer overflow in gtStripContig");
+            return 0;
+        }
+        if (_TIFFReadEncodedStripAndAllocBuffer(
+                tif, TIFFComputeStrip(tif, row + img->row_offset, 0),
+                (void **)(&buf), maxstripsize,
+                temp * scanline) == (tmsize_t)(-1) &&
+            (buf == NULL || img->stoponerr))
+        {
+            ret = 0;
+            break;
+        }
+
+        pos = ((row + img->row_offset) % rowsperstrip) * scanline +
+              ((tmsize_t)img->col_offset * img->samplesperpixel);
+        tmsize_t roffset = (tmsize_t)y * w;
+        (*put)(img, raster + roffset, 0, y, w, nrow, fromskew, toskew,
+               buf + pos);
+        y += ((flip & FLIP_VERTICALLY) ? -(int32_t)nrow : (int32_t)nrow);
+    }
+
+    if (flip & FLIP_HORIZONTALLY)
+    {
+        uint32_t line;
+
+        for (line = 0; line < h; line++)
+        {
+            uint32_t *left = raster + (line * w);
+            uint32_t *right = left + w - 1;
+
+            while (left < right)
+            {
+                uint32_t temp = *left;
+                *left = *right;
+                *right = temp;
+                left++;
+                right--;
+            }
+        }
+    }
+
+    _TIFFfreeExt(img->tif, buf);
+    return (ret);
 }
 
 /*
@@ -1026,157 +1158,167 @@ gtStripContig(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h)
  *	 PlanarConfiguration separated
  * We assume that all such images are RGB.
  */
-static int
-gtStripSeparate(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h)
-{
-	TIFF* tif = img->tif;
-	tileSeparateRoutine put = img->put.separate;
-	unsigned char *buf = NULL;
-	unsigned char *p0 = NULL, *p1 = NULL, *p2 = NULL, *pa = NULL;
-	uint32 row, y, nrow, rowstoread;
-	tmsize_t pos;
-	tmsize_t scanline;
-	uint32 rowsperstrip, offset_row;
-	uint32 imagewidth = img->width;
-	tmsize_t stripsize;
-	tmsize_t bufsize;
-	int32 fromskew, toskew;
-	int alpha = img->alpha;
-	int ret = 1, flip;
-        uint16 colorchannels;
-
-	stripsize = TIFFStripSize(tif);  
-	bufsize = _TIFFMultiplySSize(tif,alpha?4:3,stripsize, "gtStripSeparate");
-	if (bufsize == 0) {
-		return (0);
-	}
-
-	flip = setorientation(img);
-	if (flip & FLIP_VERTICALLY) {
-		if ( w > INT_MAX ) {
-        	TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "Width overflow");
-			return (0);
-		}
-		y = h - 1;
-		toskew = -(int32)(w + w);
-	}
-	else {
-		y = 0;
-		toskew = -(int32)(w - w);
-	}
-
-        switch( img->photometric )
+static int gtStripSeparate(TIFFRGBAImage *img, uint32_t *raster, uint32_t w,
+                           uint32_t h)
+{
+    TIFF *tif = img->tif;
+    tileSeparateRoutine put = img->put.separate;
+    unsigned char *buf = NULL;
+    unsigned char *p0 = NULL, *p1 = NULL, *p2 = NULL, *pa = NULL;
+    uint32_t row, y, nrow, rowstoread;
+    tmsize_t pos;
+    tmsize_t scanline;
+    uint32_t rowsperstrip, offset_row;
+    uint32_t imagewidth = img->width;
+    tmsize_t stripsize;
+    tmsize_t bufsize;
+    int32_t fromskew, toskew;
+    int alpha = img->alpha;
+    int ret = 1, flip;
+    uint16_t colorchannels;
+
+    stripsize = TIFFStripSize(tif);
+    bufsize =
+        _TIFFMultiplySSize(tif, alpha ? 4 : 3, stripsize, "gtStripSeparate");
+    if (bufsize == 0)
+    {
+        return (0);
+    }
+
+    flip = setorientation(img);
+    if (flip & FLIP_VERTICALLY)
+    {
+        if (w > INT_MAX)
         {
-          case PHOTOMETRIC_MINISWHITE:
-          case PHOTOMETRIC_MINISBLACK:
-          case PHOTOMETRIC_PALETTE:
+            TIFFErrorExtR(tif, TIFFFileName(tif), "Width overflow");
+            return (0);
+        }
+        y = h - 1;
+        toskew = -(int32_t)(w + w);
+    }
+    else
+    {
+        y = 0;
+        toskew = -(int32_t)(w - w);
+    }
+
+    switch (img->photometric)
+    {
+        case PHOTOMETRIC_MINISWHITE:
+        case PHOTOMETRIC_MINISBLACK:
+        case PHOTOMETRIC_PALETTE:
             colorchannels = 1;
             break;
 
-          default:
+        default:
             colorchannels = 3;
             break;
+    }
+
+    TIFFGetFieldDefaulted(tif, TIFFTAG_ROWSPERSTRIP, &rowsperstrip);
+    scanline = TIFFScanlineSize(tif);
+    fromskew = (w < imagewidth ? imagewidth - w : 0);
+    for (row = 0; row < h; row += nrow)
+    {
+        uint32_t temp;
+        rowstoread = rowsperstrip - (row + img->row_offset) % rowsperstrip;
+        nrow = (row + rowstoread > h ? h - row : rowstoread);
+        offset_row = row + img->row_offset;
+        temp = (row + img->row_offset) % rowsperstrip + nrow;
+        if (scanline > 0 && temp > (size_t)(TIFF_TMSIZE_T_MAX / scanline))
+        {
+            TIFFErrorExtR(tif, TIFFFileName(tif),
+                          "Integer overflow in gtStripSeparate");
+            return 0;
+        }
+        if (buf == NULL)
+        {
+            if (_TIFFReadEncodedStripAndAllocBuffer(
+                    tif, TIFFComputeStrip(tif, offset_row, 0), (void **)&buf,
+                    bufsize, temp * scanline) == (tmsize_t)(-1) &&
+                (buf == NULL || img->stoponerr))
+            {
+                ret = 0;
+                break;
+            }
+            p0 = buf;
+            if (colorchannels == 1)
+            {
+                p2 = p1 = p0;
+                pa = (alpha ? (p0 + 3 * stripsize) : NULL);
+            }
+            else
+            {
+                p1 = p0 + stripsize;
+                p2 = p1 + stripsize;
+                pa = (alpha ? (p2 + stripsize) : NULL);
+            }
+        }
+        else if (TIFFReadEncodedStrip(tif, TIFFComputeStrip(tif, offset_row, 0),
+                                      p0, temp * scanline) == (tmsize_t)(-1) &&
+                 img->stoponerr)
+        {
+            ret = 0;
+            break;
+        }
+        if (colorchannels > 1 &&
+            TIFFReadEncodedStrip(tif, TIFFComputeStrip(tif, offset_row, 1), p1,
+                                 temp * scanline) == (tmsize_t)(-1) &&
+            img->stoponerr)
+        {
+            ret = 0;
+            break;
+        }
+        if (colorchannels > 1 &&
+            TIFFReadEncodedStrip(tif, TIFFComputeStrip(tif, offset_row, 2), p2,
+                                 temp * scanline) == (tmsize_t)(-1) &&
+            img->stoponerr)
+        {
+            ret = 0;
+            break;
+        }
+        if (alpha)
+        {
+            if (TIFFReadEncodedStrip(
+                    tif, TIFFComputeStrip(tif, offset_row, colorchannels), pa,
+                    temp * scanline) == (tmsize_t)(-1) &&
+                img->stoponerr)
+            {
+                ret = 0;
+                break;
+            }
         }
 
-	TIFFGetFieldDefaulted(tif, TIFFTAG_ROWSPERSTRIP, &rowsperstrip);
-	scanline = TIFFScanlineSize(tif);  
-	fromskew = (w < imagewidth ? imagewidth - w : 0);
-	for (row = 0; row < h; row += nrow)
-	{
-                uint32 temp;
-		rowstoread = rowsperstrip - (row + img->row_offset) % rowsperstrip;
-		nrow = (row + rowstoread > h ? h - row : rowstoread);
-		offset_row = row + img->row_offset;
-                temp = (row + img->row_offset)%rowsperstrip + nrow;
-                if( scanline > 0 && temp > (size_t)(TIFF_TMSIZE_T_MAX / scanline) )
-                {
-                        TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "Integer overflow in gtStripSeparate");
-                        return 0;
-                }
-                if( buf == NULL )
-                {
-                    if (_TIFFReadEncodedStripAndAllocBuffer(
-                            tif, TIFFComputeStrip(tif, offset_row, 0),
-                            (void**) &buf, bufsize,
-                            temp * scanline)==(tmsize_t)(-1)
-                        && (buf == NULL || img->stoponerr))
-                    {
-                            ret = 0;
-                            break;
-                    }
-                    p0 = buf;
-                    if( colorchannels == 1 )
-                    {
-                        p2 = p1 = p0;
-                        pa = (alpha?(p0+3*stripsize):NULL);
-                    }
-                    else
-                    {
-                        p1 = p0 + stripsize;
-                        p2 = p1 + stripsize;
-                        pa = (alpha?(p2+stripsize):NULL);
-                    }
-                }
-		else if (TIFFReadEncodedStrip(tif, TIFFComputeStrip(tif, offset_row, 0),
-		    p0, temp * scanline)==(tmsize_t)(-1)
-		    && img->stoponerr)
-		{
-			ret = 0;
-			break;
-		}
-		if (colorchannels > 1 
-                    && TIFFReadEncodedStrip(tif, TIFFComputeStrip(tif, offset_row, 1),
-                                            p1, temp * scanline) == (tmsize_t)(-1)
-		    && img->stoponerr)
-		{
-			ret = 0;
-			break;
-		}
-		if (colorchannels > 1 
-                    && TIFFReadEncodedStrip(tif, TIFFComputeStrip(tif, offset_row, 2),
-                                            p2, temp * scanline) == (tmsize_t)(-1)
-		    && img->stoponerr)
-		{
-			ret = 0;
-			break;
-		}
-		if (alpha)
-		{
-			if (TIFFReadEncodedStrip(tif, TIFFComputeStrip(tif, offset_row, colorchannels),
-			    pa, temp * scanline)==(tmsize_t)(-1)
-			    && img->stoponerr)
-			{
-				ret = 0;
-				break;
-			}
-		}
-
-		pos = ((row + img->row_offset) % rowsperstrip) * scanline + \
-			((tmsize_t) img->col_offset * img->samplesperpixel);
-		(*put)(img, raster+y*w, 0, y, w, nrow, fromskew, toskew, p0 + pos, p1 + pos,
-		    p2 + pos, (alpha?(pa+pos):NULL));
-		y += ((flip & FLIP_VERTICALLY) ? -(int32) nrow : (int32) nrow);
-	}
-
-	if (flip & FLIP_HORIZONTALLY) {
-		uint32 line;
-
-		for (line = 0; line < h; line++) {
-			uint32 *left = raster + (line * w);
-			uint32 *right = left + w - 1;
-
-			while ( left < right ) {
-				uint32 temp = *left;
-				*left = *right;
-				*right = temp;
-				left++;
-				right--;
-			}
-		}
-	}
-
-	_TIFFfree(buf);
-	return (ret);
+        pos = ((row + img->row_offset) % rowsperstrip) * scanline +
+              ((tmsize_t)img->col_offset * img->samplesperpixel);
+        tmsize_t roffset = (tmsize_t)y * w;
+        (*put)(img, raster + roffset, 0, y, w, nrow, fromskew, toskew, p0 + pos,
+               p1 + pos, p2 + pos, (alpha ? (pa + pos) : NULL));
+        y += ((flip & FLIP_VERTICALLY) ? -(int32_t)nrow : (int32_t)nrow);
+    }
+
+    if (flip & FLIP_HORIZONTALLY)
+    {
+        uint32_t line;
+
+        for (line = 0; line < h; line++)
+        {
+            uint32_t *left = raster + (line * w);
+            uint32_t *right = left + w - 1;
+
+            while (left < right)
+            {
+                uint32_t temp = *left;
+                *left = *right;
+                *right = temp;
+                left++;
+                right--;
+            }
+        }
+    }
+
+    _TIFFfreeExt(img->tif, buf);
+    return (ret);
 }
 
 /*
@@ -1189,98 +1331,139 @@ gtStripSeparate(TIFFRGBAImage* img, uint32* raster, uint32 w, uint32 h)
  * PickSeparateCase analyze the parameters and select
  * the appropriate "get" and "put" routine to use.
  */
-#define	REPEAT8(op)	REPEAT4(op); REPEAT4(op)
-#define	REPEAT4(op)	REPEAT2(op); REPEAT2(op)
-#define	REPEAT2(op)	op; op
-#define	CASE8(x,op)			\
-    switch (x) {			\
-    case 7: op; /*-fallthrough*/ \
-    case 6: op; /*-fallthrough*/ \
-    case 5: op; /*-fallthrough*/ \
-    case 4: op; /*-fallthrough*/ \
-    case 3: op; /*-fallthrough*/ \
-    case 2: op; /*-fallthrough*/ \
-    case 1: op;				\
-    }
-#define	CASE4(x,op)	switch (x) { case 3: op; /*-fallthrough*/ case 2: op; /*-fallthrough*/ case 1: op; }
-#define	NOP
-
-#define	UNROLL8(w, op1, op2) {		\
-    uint32 _x;				\
-    for (_x = w; _x >= 8; _x -= 8) {	\
-	op1;				\
-	REPEAT8(op2);			\
-    }					\
-    if (_x > 0) {			\
-	op1;				\
-	CASE8(_x,op2);			\
-    }					\
-}
-#define	UNROLL4(w, op1, op2) {		\
-    uint32 _x;				\
-    for (_x = w; _x >= 4; _x -= 4) {	\
-	op1;				\
-	REPEAT4(op2);			\
-    }					\
-    if (_x > 0) {			\
-	op1;				\
-	CASE4(_x,op2);			\
-    }					\
-}
-#define	UNROLL2(w, op1, op2) {		\
-    uint32 _x;				\
-    for (_x = w; _x >= 2; _x -= 2) {	\
-	op1;				\
-	REPEAT2(op2);			\
-    }					\
-    if (_x) {				\
-	op1;				\
-	op2;				\
-    }					\
-}
-    
-#define	SKEW(r,g,b,skew)	{ r += skew; g += skew; b += skew; }
-#define	SKEW4(r,g,b,a,skew)	{ r += skew; g += skew; b += skew; a+= skew; }
-
-#define A1 (((uint32)0xffL)<<24)
-#define	PACK(r,g,b)	\
-	((uint32)(r)|((uint32)(g)<<8)|((uint32)(b)<<16)|A1)
-#define	PACK4(r,g,b,a)	\
-	((uint32)(r)|((uint32)(g)<<8)|((uint32)(b)<<16)|((uint32)(a)<<24))
-#define W2B(v) (((v)>>8)&0xff)
+#define REPEAT8(op)                                                            \
+    REPEAT4(op);                                                               \
+    REPEAT4(op)
+#define REPEAT4(op)                                                            \
+    REPEAT2(op);                                                               \
+    REPEAT2(op)
+#define REPEAT2(op)                                                            \
+    op;                                                                        \
+    op
+#define CASE8(x, op)                                                           \
+    switch (x)                                                                 \
+    {                                                                          \
+        case 7:                                                                \
+            op; /*-fallthrough*/                                               \
+        case 6:                                                                \
+            op; /*-fallthrough*/                                               \
+        case 5:                                                                \
+            op; /*-fallthrough*/                                               \
+        case 4:                                                                \
+            op; /*-fallthrough*/                                               \
+        case 3:                                                                \
+            op; /*-fallthrough*/                                               \
+        case 2:                                                                \
+            op; /*-fallthrough*/                                               \
+        case 1:                                                                \
+            op;                                                                \
+    }
+#define CASE4(x, op)                                                           \
+    switch (x)                                                                 \
+    {                                                                          \
+        case 3:                                                                \
+            op; /*-fallthrough*/                                               \
+        case 2:                                                                \
+            op; /*-fallthrough*/                                               \
+        case 1:                                                                \
+            op;                                                                \
+    }
+#define NOP
+
+#define UNROLL8(w, op1, op2)                                                   \
+    {                                                                          \
+        uint32_t _x;                                                           \
+        for (_x = w; _x >= 8; _x -= 8)                                         \
+        {                                                                      \
+            op1;                                                               \
+            REPEAT8(op2);                                                      \
+        }                                                                      \
+        if (_x > 0)                                                            \
+        {                                                                      \
+            op1;                                                               \
+            CASE8(_x, op2);                                                    \
+        }                                                                      \
+    }
+#define UNROLL4(w, op1, op2)                                                   \
+    {                                                                          \
+        uint32_t _x;                                                           \
+        for (_x = w; _x >= 4; _x -= 4)                                         \
+        {                                                                      \
+            op1;                                                               \
+            REPEAT4(op2);                                                      \
+        }                                                                      \
+        if (_x > 0)                                                            \
+        {                                                                      \
+            op1;                                                               \
+            CASE4(_x, op2);                                                    \
+        }                                                                      \
+    }
+#define UNROLL2(w, op1, op2)                                                   \
+    {                                                                          \
+        uint32_t _x;                                                           \
+        for (_x = w; _x >= 2; _x -= 2)                                         \
+        {                                                                      \
+            op1;                                                               \
+            REPEAT2(op2);                                                      \
+        }                                                                      \
+        if (_x)                                                                \
+        {                                                                      \
+            op1;                                                               \
+            op2;                                                               \
+        }                                                                      \
+    }
+
+#define SKEW(r, g, b, skew)                                                    \
+    {                                                                          \
+        r += skew;                                                             \
+        g += skew;                                                             \
+        b += skew;                                                             \
+    }
+#define SKEW4(r, g, b, a, skew)                                                \
+    {                                                                          \
+        r += skew;                                                             \
+        g += skew;                                                             \
+        b += skew;                                                             \
+        a += skew;                                                             \
+    }
+
+#define A1 (((uint32_t)0xffL) << 24)
+#define PACK(r, g, b)                                                          \
+    ((uint32_t)(r) | ((uint32_t)(g) << 8) | ((uint32_t)(b) << 16) | A1)
+#define PACK4(r, g, b, a)                                                      \
+    ((uint32_t)(r) | ((uint32_t)(g) << 8) | ((uint32_t)(b) << 16) |            \
+     ((uint32_t)(a) << 24))
+#define W2B(v) (((v) >> 8) & 0xff)
 /* TODO: PACKW should have be made redundant in favor of Bitdepth16To8 LUT */
-#define	PACKW(r,g,b)	\
-	((uint32)W2B(r)|((uint32)W2B(g)<<8)|((uint32)W2B(b)<<16)|A1)
-#define	PACKW4(r,g,b,a)	\
-	((uint32)W2B(r)|((uint32)W2B(g)<<8)|((uint32)W2B(b)<<16)|((uint32)W2B(a)<<24))
-
-#define	DECLAREContigPutFunc(name) \
-static void name(\
-    TIFFRGBAImage* img, \
-    uint32* cp, \
-    uint32 x, uint32 y, \
-    uint32 w, uint32 h, \
-    int32 fromskew, int32 toskew, \
-    unsigned char* pp \
-)
+#define PACKW(r, g, b)                                                         \
+    ((uint32_t)W2B(r) | ((uint32_t)W2B(g) << 8) | ((uint32_t)W2B(b) << 16) | A1)
+#define PACKW4(r, g, b, a)                                                     \
+    ((uint32_t)W2B(r) | ((uint32_t)W2B(g) << 8) | ((uint32_t)W2B(b) << 16) |   \
+     ((uint32_t)W2B(a) << 24))
+
+#define DECLAREContigPutFunc(name)                                             \
+    static void name(TIFFRGBAImage *img, uint32_t *cp, uint32_t x, uint32_t y, \
+                     uint32_t w, uint32_t h, int32_t fromskew, int32_t toskew, \
+                     unsigned char *pp)
 
 /*
  * 8-bit palette => colormap/RGB
  */
 DECLAREContigPutFunc(put8bitcmaptile)
 {
-    uint32** PALmap = img->PALmap;
+    uint32_t **PALmap = img->PALmap;
     int samplesperpixel = img->samplesperpixel;
 
-    (void) y;
-    for( ; h > 0; --h) {
-	for (x = w; x > 0; --x)
+    (void)y;
+    for (; h > 0; --h)
+    {
+        for (x = w; x > 0; --x)
         {
-	    *cp++ = PALmap[*pp][0];
+            *cp++ = PALmap[*pp][0];
             pp += samplesperpixel;
         }
-	cp += toskew;
-	pp += fromskew;
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1289,15 +1472,17 @@ DECLAREContigPutFunc(put8bitcmaptile)
  */
 DECLAREContigPutFunc(put4bitcmaptile)
 {
-    uint32** PALmap = img->PALmap;
+    uint32_t **PALmap = img->PALmap;
 
-    (void) x; (void) y;
+    (void)x;
+    (void)y;
     fromskew /= 2;
-    for( ; h > 0; --h) {
-	uint32* bw;
-	UNROLL2(w, bw = PALmap[*pp++], *cp++ = *bw++);
-	cp += toskew;
-	pp += fromskew;
+    for (; h > 0; --h)
+    {
+        uint32_t *bw;
+        UNROLL2(w, bw = PALmap[*pp++], *cp++ = *bw++);
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1306,15 +1491,17 @@ DECLAREContigPutFunc(put4bitcmaptile)
  */
 DECLAREContigPutFunc(put2bitcmaptile)
 {
-    uint32** PALmap = img->PALmap;
+    uint32_t **PALmap = img->PALmap;
 
-    (void) x; (void) y;
+    (void)x;
+    (void)y;
     fromskew /= 4;
-    for( ; h > 0; --h) {
-	uint32* bw;
-	UNROLL4(w, bw = PALmap[*pp++], *cp++ = *bw++);
-	cp += toskew;
-	pp += fromskew;
+    for (; h > 0; --h)
+    {
+        uint32_t *bw;
+        UNROLL4(w, bw = PALmap[*pp++], *cp++ = *bw++);
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1323,15 +1510,17 @@ DECLAREContigPutFunc(put2bitcmaptile)
  */
 DECLAREContigPutFunc(put1bitcmaptile)
 {
-    uint32** PALmap = img->PALmap;
+    uint32_t **PALmap = img->PALmap;
 
-    (void) x; (void) y;
+    (void)x;
+    (void)y;
     fromskew /= 8;
-    for( ; h > 0; --h) {
-	uint32* bw;
-	UNROLL8(w, bw = PALmap[*pp++], *cp++ = *bw++);
-	cp += toskew;
-	pp += fromskew;
+    for (; h > 0; --h)
+    {
+        uint32_t *bw;
+        UNROLL8(w, bw = PALmap[*pp++], *cp++ = *bw++);
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1341,17 +1530,18 @@ DECLAREContigPutFunc(put1bitcmaptile)
 DECLAREContigPutFunc(putgreytile)
 {
     int samplesperpixel = img->samplesperpixel;
-    uint32** BWmap = img->BWmap;
+    uint32_t **BWmap = img->BWmap;
 
-    (void) y;
-    for( ; h > 0; --h) {
-	for (x = w; x > 0; --x)
+    (void)y;
+    for (; h > 0; --h)
+    {
+        for (x = w; x > 0; --x)
         {
-	    *cp++ = BWmap[*pp][0];
+            *cp++ = BWmap[*pp][0];
             pp += samplesperpixel;
         }
-	cp += toskew;
-	pp += fromskew;
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1361,17 +1551,18 @@ DECLAREContigPutFunc(putgreytile)
 DECLAREContigPutFunc(putagreytile)
 {
     int samplesperpixel = img->samplesperpixel;
-    uint32** BWmap = img->BWmap;
+    uint32_t **BWmap = img->BWmap;
 
-    (void) y;
-    for( ; h > 0; --h) {
-	for (x = w; x > 0; --x)
+    (void)y;
+    for (; h > 0; --h)
+    {
+        for (x = w; x > 0; --x)
         {
-            *cp++ = BWmap[*pp][0] & ((uint32)*(pp+1) << 24 | ~A1);
+            *cp++ = BWmap[*pp][0] & ((uint32_t) * (pp + 1) << 24 | ~A1);
             pp += samplesperpixel;
         }
-	cp += toskew;
-	pp += fromskew;
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1381,22 +1572,23 @@ DECLAREContigPutFunc(putagreytile)
 DECLAREContigPutFunc(put16bitbwtile)
 {
     int samplesperpixel = img->samplesperpixel;
-    uint32** BWmap = img->BWmap;
+    uint32_t **BWmap = img->BWmap;
 
-    (void) y;
-    for( ; h > 0; --h) {
-        uint16 *wp = (uint16 *) pp;
+    (void)y;
+    for (; h > 0; --h)
+    {
+        uint16_t *wp = (uint16_t *)pp;
 
-	for (x = w; x > 0; --x)
+        for (x = w; x > 0; --x)
         {
             /* use high order byte of 16bit value */
 
-	    *cp++ = BWmap[*wp >> 8][0];
+            *cp++ = BWmap[*wp >> 8][0];
             pp += 2 * samplesperpixel;
             wp += samplesperpixel;
         }
-	cp += toskew;
-	pp += fromskew;
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1405,15 +1597,17 @@ DECLAREContigPutFunc(put16bitbwtile)
  */
 DECLAREContigPutFunc(put1bitbwtile)
 {
-    uint32** BWmap = img->BWmap;
+    uint32_t **BWmap = img->BWmap;
 
-    (void) x; (void) y;
+    (void)x;
+    (void)y;
     fromskew /= 8;
-    for( ; h > 0; --h) {
-	uint32* bw;
-	UNROLL8(w, bw = BWmap[*pp++], *cp++ = *bw++);
-	cp += toskew;
-	pp += fromskew;
+    for (; h > 0; --h)
+    {
+        uint32_t *bw;
+        UNROLL8(w, bw = BWmap[*pp++], *cp++ = *bw++);
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1422,15 +1616,17 @@ DECLAREContigPutFunc(put1bitbwtile)
  */
 DECLAREContigPutFunc(put2bitbwtile)
 {
-    uint32** BWmap = img->BWmap;
+    uint32_t **BWmap = img->BWmap;
 
-    (void) x; (void) y;
+    (void)x;
+    (void)y;
     fromskew /= 4;
-    for( ; h > 0; --h) {
-	uint32* bw;
-	UNROLL4(w, bw = BWmap[*pp++], *cp++ = *bw++);
-	cp += toskew;
-	pp += fromskew;
+    for (; h > 0; --h)
+    {
+        uint32_t *bw;
+        UNROLL4(w, bw = BWmap[*pp++], *cp++ = *bw++);
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1439,15 +1635,17 @@ DECLAREContigPutFunc(put2bitbwtile)
  */
 DECLAREContigPutFunc(put4bitbwtile)
 {
-    uint32** BWmap = img->BWmap;
+    uint32_t **BWmap = img->BWmap;
 
-    (void) x; (void) y;
+    (void)x;
+    (void)y;
     fromskew /= 2;
-    for( ; h > 0; --h) {
-	uint32* bw;
-	UNROLL2(w, bw = BWmap[*pp++], *cp++ = *bw++);
-	cp += toskew;
-	pp += fromskew;
+    for (; h > 0; --h)
+    {
+        uint32_t *bw;
+        UNROLL2(w, bw = BWmap[*pp++], *cp++ = *bw++);
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1458,14 +1656,15 @@ DECLAREContigPutFunc(putRGBcontig8bittile)
 {
     int samplesperpixel = img->samplesperpixel;
 
-    (void) x; (void) y;
+    (void)x;
+    (void)y;
     fromskew *= samplesperpixel;
-    for( ; h > 0; --h) {
-	UNROLL8(w, NOP,
-	    *cp++ = PACK(pp[0], pp[1], pp[2]);
-	    pp += samplesperpixel);
-	cp += toskew;
-	pp += fromskew;
+    for (; h > 0; --h)
+    {
+        UNROLL8(w, NOP, *cp++ = PACK(pp[0], pp[1], pp[2]);
+                pp += samplesperpixel);
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1477,14 +1676,15 @@ DECLAREContigPutFunc(putRGBAAcontig8bittile)
 {
     int samplesperpixel = img->samplesperpixel;
 
-    (void) x; (void) y;
+    (void)x;
+    (void)y;
     fromskew *= samplesperpixel;
-    for( ; h > 0; --h) {
-	UNROLL8(w, NOP,
-	    *cp++ = PACK4(pp[0], pp[1], pp[2], pp[3]);
-	    pp += samplesperpixel);
-	cp += toskew;
-	pp += fromskew;
+    for (; h > 0; --h)
+    {
+        UNROLL8(w, NOP, *cp++ = PACK4(pp[0], pp[1], pp[2], pp[3]);
+                pp += samplesperpixel);
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1494,24 +1694,26 @@ DECLAREContigPutFunc(putRGBAAcontig8bittile)
  */
 DECLAREContigPutFunc(putRGBUAcontig8bittile)
 {
-	int samplesperpixel = img->samplesperpixel;
-	(void) y;
-	fromskew *= samplesperpixel;
-	for( ; h > 0; --h) {
-		uint32 r, g, b, a;
-		uint8* m;
-		for (x = w; x > 0; --x) {
-			a = pp[3];
-			m = img->UaToAa+((size_t) a<<8);
-			r = m[pp[0]];
-			g = m[pp[1]];
-			b = m[pp[2]];
-			*cp++ = PACK4(r,g,b,a);
-			pp += samplesperpixel;
-		}
-		cp += toskew;
-		pp += fromskew;
-	}
+    int samplesperpixel = img->samplesperpixel;
+    (void)y;
+    fromskew *= samplesperpixel;
+    for (; h > 0; --h)
+    {
+        uint32_t r, g, b, a;
+        uint8_t *m;
+        for (x = w; x > 0; --x)
+        {
+            a = pp[3];
+            m = img->UaToAa + ((size_t)a << 8);
+            r = m[pp[0]];
+            g = m[pp[1]];
+            b = m[pp[2]];
+            *cp++ = PACK4(r, g, b, a);
+            pp += samplesperpixel;
+        }
+        cp += toskew;
+        pp += fromskew;
+    }
 }
 
 /*
@@ -1519,20 +1721,21 @@ DECLAREContigPutFunc(putRGBUAcontig8bittile)
  */
 DECLAREContigPutFunc(putRGBcontig16bittile)
 {
-	int samplesperpixel = img->samplesperpixel;
-	uint16 *wp = (uint16 *)pp;
-	(void) y;
-	fromskew *= samplesperpixel;
-	for( ; h > 0; --h) {
-		for (x = w; x > 0; --x) {
-			*cp++ = PACK(img->Bitdepth16To8[wp[0]],
-			    img->Bitdepth16To8[wp[1]],
-			    img->Bitdepth16To8[wp[2]]);
-			wp += samplesperpixel;
-		}
-		cp += toskew;
-		wp += fromskew;
-	}
+    int samplesperpixel = img->samplesperpixel;
+    uint16_t *wp = (uint16_t *)pp;
+    (void)y;
+    fromskew *= samplesperpixel;
+    for (; h > 0; --h)
+    {
+        for (x = w; x > 0; --x)
+        {
+            *cp++ = PACK(img->Bitdepth16To8[wp[0]], img->Bitdepth16To8[wp[1]],
+                         img->Bitdepth16To8[wp[2]]);
+            wp += samplesperpixel;
+        }
+        cp += toskew;
+        wp += fromskew;
+    }
 }
 
 /*
@@ -1541,21 +1744,21 @@ DECLAREContigPutFunc(putRGBcontig16bittile)
  */
 DECLAREContigPutFunc(putRGBAAcontig16bittile)
 {
-	int samplesperpixel = img->samplesperpixel;
-	uint16 *wp = (uint16 *)pp;
-	(void) y;
-	fromskew *= samplesperpixel;
-	for( ; h > 0; --h) {
-		for (x = w; x > 0; --x) {
-			*cp++ = PACK4(img->Bitdepth16To8[wp[0]],
-			    img->Bitdepth16To8[wp[1]],
-			    img->Bitdepth16To8[wp[2]],
-			    img->Bitdepth16To8[wp[3]]);
-			wp += samplesperpixel;
-		}
-		cp += toskew;
-		wp += fromskew;
-	}
+    int samplesperpixel = img->samplesperpixel;
+    uint16_t *wp = (uint16_t *)pp;
+    (void)y;
+    fromskew *= samplesperpixel;
+    for (; h > 0; --h)
+    {
+        for (x = w; x > 0; --x)
+        {
+            *cp++ = PACK4(img->Bitdepth16To8[wp[0]], img->Bitdepth16To8[wp[1]],
+                          img->Bitdepth16To8[wp[2]], img->Bitdepth16To8[wp[3]]);
+            wp += samplesperpixel;
+        }
+        cp += toskew;
+        wp += fromskew;
+    }
 }
 
 /*
@@ -1564,25 +1767,27 @@ DECLAREContigPutFunc(putRGBAAcontig16bittile)
  */
 DECLAREContigPutFunc(putRGBUAcontig16bittile)
 {
-	int samplesperpixel = img->samplesperpixel;
-	uint16 *wp = (uint16 *)pp;
-	(void) y;
-	fromskew *= samplesperpixel;
-	for( ; h > 0; --h) {
-		uint32 r,g,b,a;
-		uint8* m;
-		for (x = w; x > 0; --x) {
-			a = img->Bitdepth16To8[wp[3]];
-			m = img->UaToAa+((size_t) a<<8);
-			r = m[img->Bitdepth16To8[wp[0]]];
-			g = m[img->Bitdepth16To8[wp[1]]];
-			b = m[img->Bitdepth16To8[wp[2]]];
-			*cp++ = PACK4(r,g,b,a);
-			wp += samplesperpixel;
-		}
-		cp += toskew;
-		wp += fromskew;
-	}
+    int samplesperpixel = img->samplesperpixel;
+    uint16_t *wp = (uint16_t *)pp;
+    (void)y;
+    fromskew *= samplesperpixel;
+    for (; h > 0; --h)
+    {
+        uint32_t r, g, b, a;
+        uint8_t *m;
+        for (x = w; x > 0; --x)
+        {
+            a = img->Bitdepth16To8[wp[3]];
+            m = img->UaToAa + ((size_t)a << 8);
+            r = m[img->Bitdepth16To8[wp[0]]];
+            g = m[img->Bitdepth16To8[wp[1]]];
+            b = m[img->Bitdepth16To8[wp[2]]];
+            *cp++ = PACK4(r, g, b, a);
+            wp += samplesperpixel;
+        }
+        cp += toskew;
+        wp += fromskew;
+    }
 }
 
 /*
@@ -1593,20 +1798,18 @@ DECLAREContigPutFunc(putRGBUAcontig16bittile)
 DECLAREContigPutFunc(putRGBcontig8bitCMYKtile)
 {
     int samplesperpixel = img->samplesperpixel;
-    uint16 r, g, b, k;
+    uint16_t r, g, b, k;
 
-    (void) x; (void) y;
+    (void)x;
+    (void)y;
     fromskew *= samplesperpixel;
-    for( ; h > 0; --h) {
-	UNROLL8(w, NOP,
-	    k = 255 - pp[3];
-	    r = (k*(255-pp[0]))/255;
-	    g = (k*(255-pp[1]))/255;
-	    b = (k*(255-pp[2]))/255;
-	    *cp++ = PACK(r, g, b);
-	    pp += samplesperpixel);
-	cp += toskew;
-	pp += fromskew;
+    for (; h > 0; --h)
+    {
+        UNROLL8(w, NOP, k = 255 - pp[3]; r = (k * (255 - pp[0])) / 255;
+                g = (k * (255 - pp[1])) / 255; b = (k * (255 - pp[2])) / 255;
+                *cp++ = PACK(r, g, b); pp += samplesperpixel);
+        cp += toskew;
+        pp += fromskew;
     }
 }
 
@@ -1618,45 +1821,47 @@ DECLAREContigPutFunc(putRGBcontig8bitCMYKtile)
 DECLAREContigPutFunc(putRGBcontig8bitCMYKMaptile)
 {
     int samplesperpixel = img->samplesperpixel;
-    TIFFRGBValue* Map = img->Map;
-    uint16 r, g, b, k;
+    TIFFRGBValue *Map = img->Map;
+    uint16_t r, g, b, k;
 
-    (void) y;
+    (void)y;
     fromskew *= samplesperpixel;
-    for( ; h > 0; --h) {
-	for (x = w; x > 0; --x) {
-	    k = 255 - pp[3];
-	    r = (k*(255-pp[0]))/255;
-	    g = (k*(255-pp[1]))/255;
-	    b = (k*(255-pp[2]))/255;
-	    *cp++ = PACK(Map[r], Map[g], Map[b]);
-	    pp += samplesperpixel;
-	}
-	pp += fromskew;
-	cp += toskew;
-    }
-}
-
-#define	DECLARESepPutFunc(name) \
-static void name(\
-    TIFFRGBAImage* img,\
-    uint32* cp,\
-    uint32 x, uint32 y, \
-    uint32 w, uint32 h,\
-    int32 fromskew, int32 toskew,\
-    unsigned char* r, unsigned char* g, unsigned char* b, unsigned char* a\
-)
+    for (; h > 0; --h)
+    {
+        for (x = w; x > 0; --x)
+        {
+            k = 255 - pp[3];
+            r = (k * (255 - pp[0])) / 255;
+            g = (k * (255 - pp[1])) / 255;
+            b = (k * (255 - pp[2])) / 255;
+            *cp++ = PACK(Map[r], Map[g], Map[b]);
+            pp += samplesperpixel;
+        }
+        pp += fromskew;
+        cp += toskew;
+    }
+}
+
+#define DECLARESepPutFunc(name)                                                \
+    static void name(TIFFRGBAImage *img, uint32_t *cp, uint32_t x, uint32_t y, \
+                     uint32_t w, uint32_t h, int32_t fromskew, int32_t toskew, \
+                     unsigned char *r, unsigned char *g, unsigned char *b,     \
+                     unsigned char *a)
 
 /*
  * 8-bit unpacked samples => RGB
  */
 DECLARESepPutFunc(putRGBseparate8bittile)
 {
-    (void) img; (void) x; (void) y; (void) a;
-    for( ; h > 0; --h) {
-	UNROLL8(w, NOP, *cp++ = PACK(*r++, *g++, *b++));
-	SKEW(r, g, b, fromskew);
-	cp += toskew;
+    (void)img;
+    (void)x;
+    (void)y;
+    (void)a;
+    for (; h > 0; --h)
+    {
+        UNROLL8(w, NOP, *cp++ = PACK(*r++, *g++, *b++));
+        SKEW(r, g, b, fromskew);
+        cp += toskew;
     }
 }
 
@@ -1665,12 +1870,15 @@ DECLARESepPutFunc(putRGBseparate8bittile)
  */
 DECLARESepPutFunc(putRGBAAseparate8bittile)
 {
-	(void) img; (void) x; (void) y; 
-	for( ; h > 0; --h) {
-		UNROLL8(w, NOP, *cp++ = PACK4(*r++, *g++, *b++, *a++));
-		SKEW4(r, g, b, a, fromskew);
-		cp += toskew;
-	}
+    (void)img;
+    (void)x;
+    (void)y;
+    for (; h > 0; --h)
+    {
+        UNROLL8(w, NOP, *cp++ = PACK4(*r++, *g++, *b++, *a++));
+        SKEW4(r, g, b, a, fromskew);
+        cp += toskew;
+    }
 }
 
 /*
@@ -1678,19 +1886,22 @@ DECLARESepPutFunc(putRGBAAseparate8bittile)
  */
 DECLARESepPutFunc(putCMYKseparate8bittile)
 {
-	(void) img; (void) y;
-	for( ; h > 0; --h) {
-		uint32 rv, gv, bv, kv;
-		for (x = w; x > 0; --x) {
-			kv = 255 - *a++;
-			rv = (kv*(255-*r++))/255;
-			gv = (kv*(255-*g++))/255;
-			bv = (kv*(255-*b++))/255;
-			*cp++ = PACK4(rv,gv,bv,255);
-		}
-		SKEW4(r, g, b, a, fromskew);
-		cp += toskew;
-	}
+    (void)img;
+    (void)y;
+    for (; h > 0; --h)
+    {
+        uint32_t rv, gv, bv, kv;
+        for (x = w; x > 0; --x)
+        {
+            kv = 255 - *a++;
+            rv = (kv * (255 - *r++)) / 255;
+            gv = (kv * (255 - *g++)) / 255;
+            bv = (kv * (255 - *b++)) / 255;
+            *cp++ = PACK4(rv, gv, bv, 255);
+        }
+        SKEW4(r, g, b, a, fromskew);
+        cp += toskew;
+    }
 }
 
 /*
@@ -1698,21 +1909,24 @@ DECLARESepPutFunc(putCMYKseparate8bittile)
  */
 DECLARESepPutFunc(putRGBUAseparate8bittile)
 {
-	(void) img; (void) y;
-	for( ; h > 0; --h) {
-		uint32 rv, gv, bv, av;
-		uint8* m;
-		for (x = w; x > 0; --x) {
-			av = *a++;
-			m = img->UaToAa+((size_t) av<<8);
-			rv = m[*r++];
-			gv = m[*g++];
-			bv = m[*b++];
-			*cp++ = PACK4(rv,gv,bv,av);
-		}
-		SKEW4(r, g, b, a, fromskew);
-		cp += toskew;
-	}
+    (void)img;
+    (void)y;
+    for (; h > 0; --h)
+    {
+        uint32_t rv, gv, bv, av;
+        uint8_t *m;
+        for (x = w; x > 0; --x)
+        {
+            av = *a++;
+            m = img->UaToAa + ((size_t)av << 8);
+            rv = m[*r++];
+            gv = m[*g++];
+            bv = m[*b++];
+            *cp++ = PACK4(rv, gv, bv, av);
+        }
+        SKEW4(r, g, b, a, fromskew);
+        cp += toskew;
+    }
 }
 
 /*
@@ -1720,18 +1934,20 @@ DECLARESepPutFunc(putRGBUAseparate8bittile)
  */
 DECLARESepPutFunc(putRGBseparate16bittile)
 {
-	uint16 *wr = (uint16*) r;
-	uint16 *wg = (uint16*) g;
-	uint16 *wb = (uint16*) b;
-	(void) img; (void) y; (void) a;
-	for( ; h > 0; --h) {
-		for (x = 0; x < w; x++)
-			*cp++ = PACK(img->Bitdepth16To8[*wr++],
-			    img->Bitdepth16To8[*wg++],
-			    img->Bitdepth16To8[*wb++]);
-		SKEW(wr, wg, wb, fromskew);
-		cp += toskew;
-	}
+    uint16_t *wr = (uint16_t *)r;
+    uint16_t *wg = (uint16_t *)g;
+    uint16_t *wb = (uint16_t *)b;
+    (void)img;
+    (void)y;
+    (void)a;
+    for (; h > 0; --h)
+    {
+        for (x = 0; x < w; x++)
+            *cp++ = PACK(img->Bitdepth16To8[*wr++], img->Bitdepth16To8[*wg++],
+                         img->Bitdepth16To8[*wb++]);
+        SKEW(wr, wg, wb, fromskew);
+        cp += toskew;
+    }
 }
 
 /*
@@ -1739,20 +1955,20 @@ DECLARESepPutFunc(putRGBseparate16bittile)
  */
 DECLARESepPutFunc(putRGBAAseparate16bittile)
 {
-	uint16 *wr = (uint16*) r;
-	uint16 *wg = (uint16*) g;
-	uint16 *wb = (uint16*) b;
-	uint16 *wa = (uint16*) a;
-	(void) img; (void) y;
-	for( ; h > 0; --h) {
-		for (x = 0; x < w; x++)
-			*cp++ = PACK4(img->Bitdepth16To8[*wr++],
-			    img->Bitdepth16To8[*wg++],
-			    img->Bitdepth16To8[*wb++],
-			    img->Bitdepth16To8[*wa++]);
-		SKEW4(wr, wg, wb, wa, fromskew);
-		cp += toskew;
-	}
+    uint16_t *wr = (uint16_t *)r;
+    uint16_t *wg = (uint16_t *)g;
+    uint16_t *wb = (uint16_t *)b;
+    uint16_t *wa = (uint16_t *)a;
+    (void)img;
+    (void)y;
+    for (; h > 0; --h)
+    {
+        for (x = 0; x < w; x++)
+            *cp++ = PACK4(img->Bitdepth16To8[*wr++], img->Bitdepth16To8[*wg++],
+                          img->Bitdepth16To8[*wb++], img->Bitdepth16To8[*wa++]);
+        SKEW4(wr, wg, wb, wa, fromskew);
+        cp += toskew;
+    }
 }
 
 /*
@@ -1760,168 +1976,123 @@ DECLARESepPutFunc(putRGBAAseparate16bittile)
  */
 DECLARESepPutFunc(putRGBUAseparate16bittile)
 {
-	uint16 *wr = (uint16*) r;
-	uint16 *wg = (uint16*) g;
-	uint16 *wb = (uint16*) b;
-	uint16 *wa = (uint16*) a;
-	(void) img; (void) y;
-	for( ; h > 0; --h) {
-		uint32 r2,g2,b2,a2;
-		uint8* m;
-		for (x = w; x > 0; --x) {
-			a2 = img->Bitdepth16To8[*wa++];
-			m = img->UaToAa+((size_t) a2<<8);
-			r2 = m[img->Bitdepth16To8[*wr++]];
-			g2 = m[img->Bitdepth16To8[*wg++]];
-			b2 = m[img->Bitdepth16To8[*wb++]];
-			*cp++ = PACK4(r2,g2,b2,a2);
-		}
-		SKEW4(wr, wg, wb, wa, fromskew);
-		cp += toskew;
-	}
+    uint16_t *wr = (uint16_t *)r;
+    uint16_t *wg = (uint16_t *)g;
+    uint16_t *wb = (uint16_t *)b;
+    uint16_t *wa = (uint16_t *)a;
+    (void)img;
+    (void)y;
+    for (; h > 0; --h)
+    {
+        uint32_t r2, g2, b2, a2;
+        uint8_t *m;
+        for (x = w; x > 0; --x)
+        {
+            a2 = img->Bitdepth16To8[*wa++];
+            m = img->UaToAa + ((size_t)a2 << 8);
+            r2 = m[img->Bitdepth16To8[*wr++]];
+            g2 = m[img->Bitdepth16To8[*wg++]];
+            b2 = m[img->Bitdepth16To8[*wb++]];
+            *cp++ = PACK4(r2, g2, b2, a2);
+        }
+        SKEW4(wr, wg, wb, wa, fromskew);
+        cp += toskew;
+    }
 }
 
 /*
  * 8-bit packed CIE L*a*b 1976 samples => RGB
  */
-DECLAREContigPutFunc(putcontig8bitCIELab)
-{
-	float X, Y, Z;
-	uint32 r, g, b;
-	(void) y;
-	fromskew *= 3;
-	for( ; h > 0; --h) {
-		for (x = w; x > 0; --x) {
-			TIFFCIELabToXYZ(img->cielab,
-					(unsigned char)pp[0],
-					(signed char)pp[1],
-					(signed char)pp[2],
-					&X, &Y, &Z);
-			TIFFXYZToRGB(img->cielab, X, Y, Z, &r, &g, &b);
-			*cp++ = PACK(r, g, b);
-			pp += 3;
-		}
-		cp += toskew;
-		pp += fromskew;
-	}
+DECLAREContigPutFunc(putcontig8bitCIELab8)
+{
+    float X, Y, Z;
+    uint32_t r, g, b;
+    (void)y;
+    fromskew *= 3;
+    for (; h > 0; --h)
+    {
+        for (x = w; x > 0; --x)
+        {
+            TIFFCIELabToXYZ(img->cielab, (unsigned char)pp[0],
+                            (signed char)pp[1], (signed char)pp[2], &X, &Y, &Z);
+            TIFFXYZToRGB(img->cielab, X, Y, Z, &r, &g, &b);
+            *cp++ = PACK(r, g, b);
+            pp += 3;
+        }
+        cp += toskew;
+        pp += fromskew;
+    }
 }
 
 /*
- * YCbCr -> RGB conversion and packing routines.
+ * 16-bit packed CIE L*a*b 1976 samples => RGB
  */
-
-#define	YCbCrtoRGB(dst, Y) {						\
-	uint32 r, g, b;							\
-	TIFFYCbCrtoRGB(img->ycbcr, (Y), Cb, Cr, &r, &g, &b);		\
-	dst = PACK(r, g, b);						\
+DECLAREContigPutFunc(putcontig8bitCIELab16)
+{
+    float X, Y, Z;
+    uint32_t r, g, b;
+    uint16_t *wp = (uint16_t *)pp;
+    (void)y;
+    fromskew *= 3;
+    for (; h > 0; --h)
+    {
+        for (x = w; x > 0; --x)
+        {
+            TIFFCIELab16ToXYZ(img->cielab, (uint16_t)wp[0], (int16_t)wp[1],
+                              (int16_t)wp[2], &X, &Y, &Z);
+            TIFFXYZToRGB(img->cielab, X, Y, Z, &r, &g, &b);
+            *cp++ = PACK(r, g, b);
+            wp += 3;
+        }
+        cp += toskew;
+        wp += fromskew;
+    }
 }
 
 /*
- * 8-bit packed YCbCr samples => RGB 
- * This function is generic for different sampling sizes, 
- * and can handle blocks sizes that aren't multiples of the
- * sampling size.  However, it is substantially less optimized
- * than the specific sampling cases.  It is used as a fallback
- * for difficult blocks.
+ * YCbCr -> RGB conversion and packing routines.
  */
-#ifdef notdef
-static void putcontig8bitYCbCrGenericTile( 
-    TIFFRGBAImage* img, 
-    uint32* cp, 
-    uint32 x, uint32 y, 
-    uint32 w, uint32 h, 
-    int32 fromskew, int32 toskew, 
-    unsigned char* pp,
-    int h_group, 
-    int v_group )
-
-{
-    uint32* cp1 = cp+w+toskew;
-    uint32* cp2 = cp1+w+toskew;
-    uint32* cp3 = cp2+w+toskew;
-    int32 incr = 3*w+4*toskew;
-    int32   Cb, Cr;
-    int     group_size = v_group * h_group + 2;
-
-    (void) y;
-    fromskew = (fromskew * group_size) / h_group;
-
-    for( yy = 0; yy < h; yy++ )
-    {
-        unsigned char *pp_line;
-        int     y_line_group = yy / v_group;
-        int     y_remainder = yy - y_line_group * v_group;
-
-        pp_line = pp + v_line_group * 
-
-        
-        for( xx = 0; xx < w; xx++ )
-        {
-            Cb = pp
-        }
-    }
-    for (; h >= 4; h -= 4) {
-	x = w>>2;
-	do {
-	    Cb = pp[16];
-	    Cr = pp[17];
-
-	    YCbCrtoRGB(cp [0], pp[ 0]);
-	    YCbCrtoRGB(cp [1], pp[ 1]);
-	    YCbCrtoRGB(cp [2], pp[ 2]);
-	    YCbCrtoRGB(cp [3], pp[ 3]);
-	    YCbCrtoRGB(cp1[0], pp[ 4]);
-	    YCbCrtoRGB(cp1[1], pp[ 5]);
-	    YCbCrtoRGB(cp1[2], pp[ 6]);
-	    YCbCrtoRGB(cp1[3], pp[ 7]);
-	    YCbCrtoRGB(cp2[0], pp[ 8]);
-	    YCbCrtoRGB(cp2[1], pp[ 9]);
-	    YCbCrtoRGB(cp2[2], pp[10]);
-	    YCbCrtoRGB(cp2[3], pp[11]);
-	    YCbCrtoRGB(cp3[0], pp[12]);
-	    YCbCrtoRGB(cp3[1], pp[13]);
-	    YCbCrtoRGB(cp3[2], pp[14]);
-	    YCbCrtoRGB(cp3[3], pp[15]);
-
-	    cp += 4, cp1 += 4, cp2 += 4, cp3 += 4;
-	    pp += 18;
-	} while (--x);
-	cp += incr, cp1 += incr, cp2 += incr, cp3 += incr;
-	pp += fromskew;
+
+#define YCbCrtoRGB(dst, Y)                                                     \
+    {                                                                          \
+        uint32_t r, g, b;                                                      \
+        TIFFYCbCrtoRGB(img->ycbcr, (Y), Cb, Cr, &r, &g, &b);                   \
+        dst = PACK(r, g, b);                                                   \
     }
-}
-#endif
 
 /*
  * 8-bit packed YCbCr samples w/ 4,4 subsampling => RGB
  */
 DECLAREContigPutFunc(putcontig8bitYCbCr44tile)
 {
-    uint32* cp1 = cp+w+toskew;
-    uint32* cp2 = cp1+w+toskew;
-    uint32* cp3 = cp2+w+toskew;
-    int32 incr = 3*w+4*toskew;
+    uint32_t *cp1 = cp + w + toskew;
+    uint32_t *cp2 = cp1 + w + toskew;
+    uint32_t *cp3 = cp2 + w + toskew;
+    int32_t incr = 3 * w + 4 * toskew;
 
-    (void) y;
+    (void)y;
     /* adjust fromskew */
-    fromskew = (fromskew / 4) * (4*2+2);
-    if ((h & 3) == 0 && (w & 3) == 0) {				        
-        for (; h >= 4; h -= 4) {
-            x = w>>2;
-            do {
-                int32 Cb = pp[16];
-                int32 Cr = pp[17];
-
-                YCbCrtoRGB(cp [0], pp[ 0]);
-                YCbCrtoRGB(cp [1], pp[ 1]);
-                YCbCrtoRGB(cp [2], pp[ 2]);
-                YCbCrtoRGB(cp [3], pp[ 3]);
-                YCbCrtoRGB(cp1[0], pp[ 4]);
-                YCbCrtoRGB(cp1[1], pp[ 5]);
-                YCbCrtoRGB(cp1[2], pp[ 6]);
-                YCbCrtoRGB(cp1[3], pp[ 7]);
-                YCbCrtoRGB(cp2[0], pp[ 8]);
-                YCbCrtoRGB(cp2[1], pp[ 9]);
+    fromskew = (fromskew / 4) * (4 * 2 + 2);
+    if ((h & 3) == 0 && (w & 3) == 0)
+    {
+        for (; h >= 4; h -= 4)
+        {
+            x = w >> 2;
+            do
+            {
+                int32_t Cb = pp[16];
+                int32_t Cr = pp[17];
+
+                YCbCrtoRGB(cp[0], pp[0]);
+                YCbCrtoRGB(cp[1], pp[1]);
+                YCbCrtoRGB(cp[2], pp[2]);
+                YCbCrtoRGB(cp[3], pp[3]);
+                YCbCrtoRGB(cp1[0], pp[4]);
+                YCbCrtoRGB(cp1[1], pp[5]);
+                YCbCrtoRGB(cp1[2], pp[6]);
+                YCbCrtoRGB(cp1[3], pp[7]);
+                YCbCrtoRGB(cp2[0], pp[8]);
+                YCbCrtoRGB(cp2[1], pp[9]);
                 YCbCrtoRGB(cp2[2], pp[10]);
                 YCbCrtoRGB(cp2[3], pp[11]);
                 YCbCrtoRGB(cp3[0], pp[12]);
@@ -1941,47 +2112,80 @@ DECLAREContigPutFunc(putcontig8bitYCbCr44tile)
             cp3 += incr;
             pp += fromskew;
         }
-    } else {
-        while (h > 0) {
-            for (x = w; x > 0;) {
-                int32 Cb = pp[16];
-                int32 Cr = pp[17];
-                switch (x) {
-                default:
-                    switch (h) {
-                    default: YCbCrtoRGB(cp3[3], pp[15]); /* FALLTHROUGH */
-                    case 3:  YCbCrtoRGB(cp2[3], pp[11]); /* FALLTHROUGH */
-                    case 2:  YCbCrtoRGB(cp1[3], pp[ 7]); /* FALLTHROUGH */
-                    case 1:  YCbCrtoRGB(cp [3], pp[ 3]); /* FALLTHROUGH */
-                    }                                    /* FALLTHROUGH */
-                case 3:
-                    switch (h) {
-                    default: YCbCrtoRGB(cp3[2], pp[14]); /* FALLTHROUGH */
-                    case 3:  YCbCrtoRGB(cp2[2], pp[10]); /* FALLTHROUGH */
-                    case 2:  YCbCrtoRGB(cp1[2], pp[ 6]); /* FALLTHROUGH */
-                    case 1:  YCbCrtoRGB(cp [2], pp[ 2]); /* FALLTHROUGH */
-                    }                                    /* FALLTHROUGH */
-                case 2:
-                    switch (h) {
-                    default: YCbCrtoRGB(cp3[1], pp[13]); /* FALLTHROUGH */
-                    case 3:  YCbCrtoRGB(cp2[1], pp[ 9]); /* FALLTHROUGH */
-                    case 2:  YCbCrtoRGB(cp1[1], pp[ 5]); /* FALLTHROUGH */
-                    case 1:  YCbCrtoRGB(cp [1], pp[ 1]); /* FALLTHROUGH */
-                    }                                    /* FALLTHROUGH */
-                case 1:
-                    switch (h) {
-                    default: YCbCrtoRGB(cp3[0], pp[12]); /* FALLTHROUGH */
-                    case 3:  YCbCrtoRGB(cp2[0], pp[ 8]); /* FALLTHROUGH */
-                    case 2:  YCbCrtoRGB(cp1[0], pp[ 4]); /* FALLTHROUGH */
-                    case 1:  YCbCrtoRGB(cp [0], pp[ 0]); /* FALLTHROUGH */
-                    }                                    /* FALLTHROUGH */
+    }
+    else
+    {
+        while (h > 0)
+        {
+            for (x = w; x > 0;)
+            {
+                int32_t Cb = pp[16];
+                int32_t Cr = pp[17];
+                switch (x)
+                {
+                    default:
+                        switch (h)
+                        {
+                            default:
+                                YCbCrtoRGB(cp3[3], pp[15]); /* FALLTHROUGH */
+                            case 3:
+                                YCbCrtoRGB(cp2[3], pp[11]); /* FALLTHROUGH */
+                            case 2:
+                                YCbCrtoRGB(cp1[3], pp[7]); /* FALLTHROUGH */
+                            case 1:
+                                YCbCrtoRGB(cp[3], pp[3]); /* FALLTHROUGH */
+                        }                                 /* FALLTHROUGH */
+                    case 3:
+                        switch (h)
+                        {
+                            default:
+                                YCbCrtoRGB(cp3[2], pp[14]); /* FALLTHROUGH */
+                            case 3:
+                                YCbCrtoRGB(cp2[2], pp[10]); /* FALLTHROUGH */
+                            case 2:
+                                YCbCrtoRGB(cp1[2], pp[6]); /* FALLTHROUGH */
+                            case 1:
+                                YCbCrtoRGB(cp[2], pp[2]); /* FALLTHROUGH */
+                        }                                 /* FALLTHROUGH */
+                    case 2:
+                        switch (h)
+                        {
+                            default:
+                                YCbCrtoRGB(cp3[1], pp[13]); /* FALLTHROUGH */
+                            case 3:
+                                YCbCrtoRGB(cp2[1], pp[9]); /* FALLTHROUGH */
+                            case 2:
+                                YCbCrtoRGB(cp1[1], pp[5]); /* FALLTHROUGH */
+                            case 1:
+                                YCbCrtoRGB(cp[1], pp[1]); /* FALLTHROUGH */
+                        }                                 /* FALLTHROUGH */
+                    case 1:
+                        switch (h)
+                        {
+                            default:
+                                YCbCrtoRGB(cp3[0], pp[12]); /* FALLTHROUGH */
+                            case 3:
+                                YCbCrtoRGB(cp2[0], pp[8]); /* FALLTHROUGH */
+                            case 2:
+                                YCbCrtoRGB(cp1[0], pp[4]); /* FALLTHROUGH */
+                            case 1:
+                                YCbCrtoRGB(cp[0], pp[0]); /* FALLTHROUGH */
+                        }                                 /* FALLTHROUGH */
                 }
-                if (x < 4) {
-                    cp += x; cp1 += x; cp2 += x; cp3 += x;
+                if (x < 4)
+                {
+                    cp += x;
+                    cp1 += x;
+                    cp2 += x;
+                    cp3 += x;
                     x = 0;
                 }
-                else {
-                    cp += 4; cp1 += 4; cp2 += 4; cp3 += 4;
+                else
+                {
+                    cp += 4;
+                    cp1 += 4;
+                    cp2 += 4;
+                    cp3 += 4;
                     x -= 4;
                 }
                 pp += 18;
@@ -2003,27 +2207,30 @@ DECLAREContigPutFunc(putcontig8bitYCbCr44tile)
  */
 DECLAREContigPutFunc(putcontig8bitYCbCr42tile)
 {
-    uint32* cp1 = cp+w+toskew;
-    int32 incr = 2*toskew+w;
-
-    (void) y;
-    fromskew = (fromskew / 4) * (4*2+2);
-    if ((w & 3) == 0 && (h & 1) == 0) {
-        for (; h >= 2; h -= 2) {
-            x = w>>2;
-            do {
-                int32 Cb = pp[8];
-                int32 Cr = pp[9];
-                
-                YCbCrtoRGB(cp [0], pp[0]);
-                YCbCrtoRGB(cp [1], pp[1]);
-                YCbCrtoRGB(cp [2], pp[2]);
-                YCbCrtoRGB(cp [3], pp[3]);
+    uint32_t *cp1 = cp + w + toskew;
+    int32_t incr = 2 * toskew + w;
+
+    (void)y;
+    fromskew = (fromskew / 4) * (4 * 2 + 2);
+    if ((w & 3) == 0 && (h & 1) == 0)
+    {
+        for (; h >= 2; h -= 2)
+        {
+            x = w >> 2;
+            do
+            {
+                int32_t Cb = pp[8];
+                int32_t Cr = pp[9];
+
+                YCbCrtoRGB(cp[0], pp[0]);
+                YCbCrtoRGB(cp[1], pp[1]);
+                YCbCrtoRGB(cp[2], pp[2]);
+                YCbCrtoRGB(cp[3], pp[3]);
                 YCbCrtoRGB(cp1[0], pp[4]);
                 YCbCrtoRGB(cp1[1], pp[5]);
                 YCbCrtoRGB(cp1[2], pp[6]);
                 YCbCrtoRGB(cp1[3], pp[7]);
-                
+
                 cp += 4;
                 cp1 += 4;
                 pp += 10;
@@ -2032,39 +2239,60 @@ DECLAREContigPutFunc(putcontig8bitYCbCr42tile)
             cp1 += incr;
             pp += fromskew;
         }
-    } else {
-        while (h > 0) {
-            for (x = w; x > 0;) {
-                int32 Cb = pp[8];
-                int32 Cr = pp[9];
-                switch (x) {
-                default:
-                    switch (h) {
-                    default: YCbCrtoRGB(cp1[3], pp[ 7]); /* FALLTHROUGH */
-                    case 1:  YCbCrtoRGB(cp [3], pp[ 3]); /* FALLTHROUGH */
-                    }                                    /* FALLTHROUGH */
-                case 3:
-                    switch (h) {
-                    default: YCbCrtoRGB(cp1[2], pp[ 6]); /* FALLTHROUGH */
-                    case 1:  YCbCrtoRGB(cp [2], pp[ 2]); /* FALLTHROUGH */
-                    }                                    /* FALLTHROUGH */
-                case 2:
-                    switch (h) {
-                    default: YCbCrtoRGB(cp1[1], pp[ 5]); /* FALLTHROUGH */
-                    case 1:  YCbCrtoRGB(cp [1], pp[ 1]); /* FALLTHROUGH */
-                    }                                    /* FALLTHROUGH */
-                case 1:
-                    switch (h) {
-                    default: YCbCrtoRGB(cp1[0], pp[ 4]); /* FALLTHROUGH */
-                    case 1:  YCbCrtoRGB(cp [0], pp[ 0]); /* FALLTHROUGH */
-                    }                                    /* FALLTHROUGH */
+    }
+    else
+    {
+        while (h > 0)
+        {
+            for (x = w; x > 0;)
+            {
+                int32_t Cb = pp[8];
+                int32_t Cr = pp[9];
+                switch (x)
+                {
+                    default:
+                        switch (h)
+                        {
+                            default:
+                                YCbCrtoRGB(cp1[3], pp[7]); /* FALLTHROUGH */
+                            case 1:
+                                YCbCrtoRGB(cp[3], pp[3]); /* FALLTHROUGH */
+                        }                                 /* FALLTHROUGH */
+                    case 3:
+                        switch (h)
+                        {
+                            default:
+                                YCbCrtoRGB(cp1[2], pp[6]); /* FALLTHROUGH */
+                            case 1:
+                                YCbCrtoRGB(cp[2], pp[2]); /* FALLTHROUGH */
+                        }                                 /* FALLTHROUGH */
+                    case 2:
+                        switch (h)
+                        {
+                            default:
+                                YCbCrtoRGB(cp1[1], pp[5]); /* FALLTHROUGH */
+                            case 1:
+                                YCbCrtoRGB(cp[1], pp[1]); /* FALLTHROUGH */
+                        }                                 /* FALLTHROUGH */
+                    case 1:
+                        switch (h)
+                        {
+                            default:
+                                YCbCrtoRGB(cp1[0], pp[4]); /* FALLTHROUGH */
+                            case 1:
+                                YCbCrtoRGB(cp[0], pp[0]); /* FALLTHROUGH */
+                        }                                 /* FALLTHROUGH */
                 }
-                if (x < 4) {
-                    cp += x; cp1 += x;
+                if (x < 4)
+                {
+                    cp += x;
+                    cp1 += x;
                     x = 0;
                 }
-                else {
-                    cp += 4; cp1 += 4;
+                else
+                {
+                    cp += 4;
+                    cp1 += 4;
                     x -= 4;
                 }
                 pp += 10;
@@ -2084,44 +2312,50 @@ DECLAREContigPutFunc(putcontig8bitYCbCr42tile)
  */
 DECLAREContigPutFunc(putcontig8bitYCbCr41tile)
 {
-    (void) y;
-    fromskew = (fromskew / 4) * (4*1+2);
-    do {
-	x = w>>2;
-	while(x>0) {
-	    int32 Cb = pp[4];
-	    int32 Cr = pp[5];
-
-	    YCbCrtoRGB(cp [0], pp[0]);
-	    YCbCrtoRGB(cp [1], pp[1]);
-	    YCbCrtoRGB(cp [2], pp[2]);
-	    YCbCrtoRGB(cp [3], pp[3]);
-
-	    cp += 4;
-	    pp += 6;
-		x--;
-	}
-
-        if( (w&3) != 0 )
+    (void)y;
+    fromskew = (fromskew / 4) * (4 * 1 + 2);
+    do
+    {
+        x = w >> 2;
+        while (x > 0)
+        {
+            int32_t Cb = pp[4];
+            int32_t Cr = pp[5];
+
+            YCbCrtoRGB(cp[0], pp[0]);
+            YCbCrtoRGB(cp[1], pp[1]);
+            YCbCrtoRGB(cp[2], pp[2]);
+            YCbCrtoRGB(cp[3], pp[3]);
+
+            cp += 4;
+            pp += 6;
+            x--;
+        }
+
+        if ((w & 3) != 0)
         {
-	    int32 Cb = pp[4];
-	    int32 Cr = pp[5];
-
-            switch( (w&3) ) {
-              case 3: YCbCrtoRGB(cp [2], pp[2]); /*-fallthrough*/
-              case 2: YCbCrtoRGB(cp [1], pp[1]); /*-fallthrough*/
-              case 1: YCbCrtoRGB(cp [0], pp[0]); /*-fallthrough*/
-              case 0: break;
+            int32_t Cb = pp[4];
+            int32_t Cr = pp[5];
+
+            switch ((w & 3))
+            {
+                case 3:
+                    YCbCrtoRGB(cp[2], pp[2]); /*-fallthrough*/
+                case 2:
+                    YCbCrtoRGB(cp[1], pp[1]); /*-fallthrough*/
+                case 1:
+                    YCbCrtoRGB(cp[0], pp[0]); /*-fallthrough*/
+                case 0:
+                    break;
             }
 
-            cp += (w&3);
+            cp += (w & 3);
             pp += 6;
         }
 
-	cp += toskew;
-	pp += fromskew;
+        cp += toskew;
+        pp += fromskew;
     } while (--h);
-
 }
 
 /*
@@ -2129,57 +2363,63 @@ DECLAREContigPutFunc(putcontig8bitYCbCr41tile)
  */
 DECLAREContigPutFunc(putcontig8bitYCbCr22tile)
 {
-	uint32* cp2;
-	int32 incr = 2*toskew+w;
-	(void) y;
-	fromskew = (fromskew / 2) * (2*2+2);
-	cp2 = cp+w+toskew;
-	while (h>=2) {
-		x = w;
-		while (x>=2) {
-			uint32 Cb = pp[4];
-			uint32 Cr = pp[5];
-			YCbCrtoRGB(cp[0], pp[0]);
-			YCbCrtoRGB(cp[1], pp[1]);
-			YCbCrtoRGB(cp2[0], pp[2]);
-			YCbCrtoRGB(cp2[1], pp[3]);
-			cp += 2;
-			cp2 += 2;
-			pp += 6;
-			x -= 2;
-		}
-		if (x==1) {
-			uint32 Cb = pp[4];
-			uint32 Cr = pp[5];
-			YCbCrtoRGB(cp[0], pp[0]);
-			YCbCrtoRGB(cp2[0], pp[2]);
-			cp ++ ;
-			cp2 ++ ;
-			pp += 6;
-		}
-		cp += incr;
-		cp2 += incr;
-		pp += fromskew;
-		h-=2;
-	}
-	if (h==1) {
-		x = w;
-		while (x>=2) {
-			uint32 Cb = pp[4];
-			uint32 Cr = pp[5];
-			YCbCrtoRGB(cp[0], pp[0]);
-			YCbCrtoRGB(cp[1], pp[1]);
-			cp += 2;
-			cp2 += 2;
-			pp += 6;
-			x -= 2;
-		}
-		if (x==1) {
-			uint32 Cb = pp[4];
-			uint32 Cr = pp[5];
-			YCbCrtoRGB(cp[0], pp[0]);
-		}
-	}
+    uint32_t *cp2;
+    int32_t incr = 2 * toskew + w;
+    (void)y;
+    fromskew = (fromskew / 2) * (2 * 2 + 2);
+    cp2 = cp + w + toskew;
+    while (h >= 2)
+    {
+        x = w;
+        while (x >= 2)
+        {
+            uint32_t Cb = pp[4];
+            uint32_t Cr = pp[5];
+            YCbCrtoRGB(cp[0], pp[0]);
+            YCbCrtoRGB(cp[1], pp[1]);
+            YCbCrtoRGB(cp2[0], pp[2]);
+            YCbCrtoRGB(cp2[1], pp[3]);
+            cp += 2;
+            cp2 += 2;
+            pp += 6;
+            x -= 2;
+        }
+        if (x == 1)
+        {
+            uint32_t Cb = pp[4];
+            uint32_t Cr = pp[5];
+            YCbCrtoRGB(cp[0], pp[0]);
+            YCbCrtoRGB(cp2[0], pp[2]);
+            cp++;
+            cp2++;
+            pp += 6;
+        }
+        cp += incr;
+        cp2 += incr;
+        pp += fromskew;
+        h -= 2;
+    }
+    if (h == 1)
+    {
+        x = w;
+        while (x >= 2)
+        {
+            uint32_t Cb = pp[4];
+            uint32_t Cr = pp[5];
+            YCbCrtoRGB(cp[0], pp[0]);
+            YCbCrtoRGB(cp[1], pp[1]);
+            cp += 2;
+            cp2 += 2;
+            pp += 6;
+            x -= 2;
+        }
+        if (x == 1)
+        {
+            uint32_t Cb = pp[4];
+            uint32_t Cr = pp[5];
+            YCbCrtoRGB(cp[0], pp[0]);
+        }
+    }
 }
 
 /*
@@ -2187,36 +2427,38 @@ DECLAREContigPutFunc(putcontig8bitYCbCr22tile)
  */
 DECLAREContigPutFunc(putcontig8bitYCbCr21tile)
 {
-	(void) y;
-	fromskew = (fromskew / 2) * (2*1+2);
-	do {
-		x = w>>1;
-		while(x>0) {
-			int32 Cb = pp[2];
-			int32 Cr = pp[3];
+    (void)y;
+    fromskew = (fromskew / 2) * (2 * 1 + 2);
+    do
+    {
+        x = w >> 1;
+        while (x > 0)
+        {
+            int32_t Cb = pp[2];
+            int32_t Cr = pp[3];
 
-			YCbCrtoRGB(cp[0], pp[0]);
-			YCbCrtoRGB(cp[1], pp[1]);
+            YCbCrtoRGB(cp[0], pp[0]);
+            YCbCrtoRGB(cp[1], pp[1]);
 
-			cp += 2;
-			pp += 4;
-			x --;
-		}
+            cp += 2;
+            pp += 4;
+            x--;
+        }
 
-		if( (w&1) != 0 )
-		{
-			int32 Cb = pp[2];
-			int32 Cr = pp[3];
+        if ((w & 1) != 0)
+        {
+            int32_t Cb = pp[2];
+            int32_t Cr = pp[3];
 
-			YCbCrtoRGB(cp[0], pp[0]);
+            YCbCrtoRGB(cp[0], pp[0]);
 
-			cp += 1;
-			pp += 4;
-		}
+            cp += 1;
+            pp += 4;
+        }
 
-		cp += toskew;
-		pp += fromskew;
-	} while (--h);
+        cp += toskew;
+        pp += fromskew;
+    } while (--h);
 }
 
 /*
@@ -2224,37 +2466,41 @@ DECLAREContigPutFunc(putcontig8bitYCbCr21tile)
  */
 DECLAREContigPutFunc(putcontig8bitYCbCr12tile)
 {
-	uint32* cp2;
-	int32 incr = 2*toskew+w;
-	(void) y;
-	fromskew = (fromskew / 1) * (1 * 2 + 2);
-	cp2 = cp+w+toskew;
-	while (h>=2) {
-		x = w;
-		do {
-			uint32 Cb = pp[2];
-			uint32 Cr = pp[3];
-			YCbCrtoRGB(cp[0], pp[0]);
-			YCbCrtoRGB(cp2[0], pp[1]);
-			cp ++;
-			cp2 ++;
-			pp += 4;
-		} while (--x);
-		cp += incr;
-		cp2 += incr;
-		pp += fromskew;
-		h-=2;
-	}
-	if (h==1) {
-		x = w;
-		do {
-			uint32 Cb = pp[2];
-			uint32 Cr = pp[3];
-			YCbCrtoRGB(cp[0], pp[0]);
-			cp ++;
-			pp += 4;
-		} while (--x);
-	}
+    uint32_t *cp2;
+    int32_t incr = 2 * toskew + w;
+    (void)y;
+    fromskew = (fromskew / 1) * (1 * 2 + 2);
+    cp2 = cp + w + toskew;
+    while (h >= 2)
+    {
+        x = w;
+        do
+        {
+            uint32_t Cb = pp[2];
+            uint32_t Cr = pp[3];
+            YCbCrtoRGB(cp[0], pp[0]);
+            YCbCrtoRGB(cp2[0], pp[1]);
+            cp++;
+            cp2++;
+            pp += 4;
+        } while (--x);
+        cp += incr;
+        cp2 += incr;
+        pp += fromskew;
+        h -= 2;
+    }
+    if (h == 1)
+    {
+        x = w;
+        do
+        {
+            uint32_t Cb = pp[2];
+            uint32_t Cr = pp[3];
+            YCbCrtoRGB(cp[0], pp[0]);
+            cp++;
+            pp += 4;
+        } while (--x);
+    }
 }
 
 /*
@@ -2262,21 +2508,23 @@ DECLAREContigPutFunc(putcontig8bitYCbCr12tile)
  */
 DECLAREContigPutFunc(putcontig8bitYCbCr11tile)
 {
-	(void) y;
-	fromskew = (fromskew / 1) * (1 * 1 + 2);
-	do {
-		x = w; /* was x = w>>1; patched 2000/09/25 warmerda@home.com */
-		do {
-			int32 Cb = pp[1];
-			int32 Cr = pp[2];
+    (void)y;
+    fromskew = (fromskew / 1) * (1 * 1 + 2);
+    do
+    {
+        x = w; /* was x = w>>1; patched 2000/09/25 warmerda@home.com */
+        do
+        {
+            int32_t Cb = pp[1];
+            int32_t Cr = pp[2];
 
-			YCbCrtoRGB(*cp++, pp[0]);
+            YCbCrtoRGB(*cp++, pp[0]);
 
-			pp += 3;
-		} while (--x);
-		cp += toskew;
-		pp += fromskew;
-	} while (--h);
+            pp += 3;
+        } while (--x);
+        cp += toskew;
+        pp += fromskew;
+    } while (--h);
 }
 
 /*
@@ -2284,19 +2532,22 @@ DECLAREContigPutFunc(putcontig8bitYCbCr11tile)
  */
 DECLARESepPutFunc(putseparate8bitYCbCr11tile)
 {
-	(void) y;
-	(void) a;
-	/* TODO: naming of input vars is still off, change obfuscating declaration inside define, or resolve obfuscation */
-	for( ; h > 0; --h) {
-		x = w;
-		do {
-			uint32 dr, dg, db;
-			TIFFYCbCrtoRGB(img->ycbcr,*r++,*g++,*b++,&dr,&dg,&db);
-			*cp++ = PACK(dr,dg,db);
-		} while (--x);
-		SKEW(r, g, b, fromskew);
-		cp += toskew;
-	}
+    (void)y;
+    (void)a;
+    /* TODO: naming of input vars is still off, change obfuscating declaration
+     * inside define, or resolve obfuscation */
+    for (; h > 0; --h)
+    {
+        x = w;
+        do
+        {
+            uint32_t dr, dg, db;
+            TIFFYCbCrtoRGB(img->ycbcr, *r++, *g++, *b++, &dr, &dg, &db);
+            *cp++ = PACK(dr, dg, db);
+        } while (--x);
+        SKEW(r, g, b, fromskew);
+        cp += toskew;
+    }
 }
 #undef YCbCrtoRGB
 
@@ -2305,97 +2556,100 @@ static int isInRefBlackWhiteRange(float f)
     return f > (float)(-0x7FFFFFFF + 128) && f < (float)0x7FFFFFFF;
 }
 
-static int
-initYCbCrConversion(TIFFRGBAImage* img)
-{
-	static const char module[] = "initYCbCrConversion";
-
-	float *luma, *refBlackWhite;
-
-	if (img->ycbcr == NULL) {
-		img->ycbcr = (TIFFYCbCrToRGB*) _TIFFmalloc(
-		    TIFFroundup_32(sizeof (TIFFYCbCrToRGB), sizeof (long))  
-		    + 4*256*sizeof (TIFFRGBValue)
-		    + 2*256*sizeof (int)
-		    + 3*256*sizeof (int32)
-		    );
-		if (img->ycbcr == NULL) {
-			TIFFErrorExt(img->tif->tif_clientdata, module,
-			    "No space for YCbCr->RGB conversion state");
-			return (0);
-		}
-	}
-
-	TIFFGetFieldDefaulted(img->tif, TIFFTAG_YCBCRCOEFFICIENTS, &luma);
-	TIFFGetFieldDefaulted(img->tif, TIFFTAG_REFERENCEBLACKWHITE,
-	    &refBlackWhite);
-
-        /* Do some validation to avoid later issues. Detect NaN for now */
-        /* and also if lumaGreen is zero since we divide by it later */
-        if( luma[0] != luma[0] ||
-            luma[1] != luma[1] ||
-            luma[1] == 0.0 ||
-            luma[2] != luma[2] )
-        {
-            TIFFErrorExt(img->tif->tif_clientdata, module,
-                "Invalid values for YCbCrCoefficients tag");
-            return (0);
-        }
+static int initYCbCrConversion(TIFFRGBAImage *img)
+{
+    static const char module[] = "initYCbCrConversion";
 
-        if( !isInRefBlackWhiteRange(refBlackWhite[0]) ||
-            !isInRefBlackWhiteRange(refBlackWhite[1]) ||
-            !isInRefBlackWhiteRange(refBlackWhite[2]) ||
-            !isInRefBlackWhiteRange(refBlackWhite[3]) ||
-            !isInRefBlackWhiteRange(refBlackWhite[4]) ||
-            !isInRefBlackWhiteRange(refBlackWhite[5]) )
+    float *luma, *refBlackWhite;
+
+    if (img->ycbcr == NULL)
+    {
+        img->ycbcr = (TIFFYCbCrToRGB *)_TIFFmallocExt(
+            img->tif, TIFFroundup_32(sizeof(TIFFYCbCrToRGB), sizeof(long)) +
+                          4 * 256 * sizeof(TIFFRGBValue) +
+                          2 * 256 * sizeof(int) + 3 * 256 * sizeof(int32_t));
+        if (img->ycbcr == NULL)
         {
-            TIFFErrorExt(img->tif->tif_clientdata, module,
-                "Invalid values for ReferenceBlackWhite tag");
+            TIFFErrorExtR(img->tif, module,
+                          "No space for YCbCr->RGB conversion state");
             return (0);
         }
+    }
+
+    TIFFGetFieldDefaulted(img->tif, TIFFTAG_YCBCRCOEFFICIENTS, &luma);
+    TIFFGetFieldDefaulted(img->tif, TIFFTAG_REFERENCEBLACKWHITE,
+                          &refBlackWhite);
+
+    /* Do some validation to avoid later issues. Detect NaN for now */
+    /* and also if lumaGreen is zero since we divide by it later */
+    if (luma[0] != luma[0] || luma[1] != luma[1] || luma[1] == 0.0 ||
+        luma[2] != luma[2])
+    {
+        TIFFErrorExtR(img->tif, module,
+                      "Invalid values for YCbCrCoefficients tag");
+        return (0);
+    }
 
-	if (TIFFYCbCrToRGBInit(img->ycbcr, luma, refBlackWhite) < 0)
-		return(0);
-	return (1);
+    if (!isInRefBlackWhiteRange(refBlackWhite[0]) ||
+        !isInRefBlackWhiteRange(refBlackWhite[1]) ||
+        !isInRefBlackWhiteRange(refBlackWhite[2]) ||
+        !isInRefBlackWhiteRange(refBlackWhite[3]) ||
+        !isInRefBlackWhiteRange(refBlackWhite[4]) ||
+        !isInRefBlackWhiteRange(refBlackWhite[5]))
+    {
+        TIFFErrorExtR(img->tif, module,
+                      "Invalid values for ReferenceBlackWhite tag");
+        return (0);
+    }
+
+    if (TIFFYCbCrToRGBInit(img->ycbcr, luma, refBlackWhite) < 0)
+        return (0);
+    return (1);
 }
 
-static tileContigRoutine
-initCIELabConversion(TIFFRGBAImage* img)
+static tileContigRoutine initCIELabConversion(TIFFRGBAImage *img)
 {
-	static const char module[] = "initCIELabConversion";
+    static const char module[] = "initCIELabConversion";
 
-	float   *whitePoint;
-	float   refWhite[3];
+    float *whitePoint;
+    float refWhite[3];
+
+    TIFFGetFieldDefaulted(img->tif, TIFFTAG_WHITEPOINT, &whitePoint);
+    if (whitePoint[1] == 0.0f)
+    {
+        TIFFErrorExtR(img->tif, module, "Invalid value for WhitePoint tag.");
+        return NULL;
+    }
 
-	TIFFGetFieldDefaulted(img->tif, TIFFTAG_WHITEPOINT, &whitePoint);
-	if (whitePoint[1] == 0.0f ) {
-		TIFFErrorExt(img->tif->tif_clientdata, module,
-		    "Invalid value for WhitePoint tag.");
-		return NULL;
+    if (!img->cielab)
+    {
+        img->cielab = (TIFFCIELabToRGB *)_TIFFmallocExt(
+            img->tif, sizeof(TIFFCIELabToRGB));
+        if (!img->cielab)
+        {
+            TIFFErrorExtR(img->tif, module,
+                          "No space for CIE L*a*b*->RGB conversion state.");
+            return NULL;
         }
+    }
 
-	if (!img->cielab) {
-		img->cielab = (TIFFCIELabToRGB *)
-			_TIFFmalloc(sizeof(TIFFCIELabToRGB));
-		if (!img->cielab) {
-			TIFFErrorExt(img->tif->tif_clientdata, module,
-			    "No space for CIE L*a*b*->RGB conversion state.");
-			return NULL;
-		}
-	}
-
-	refWhite[1] = 100.0F;
-	refWhite[0] = whitePoint[0] / whitePoint[1] * refWhite[1];
-	refWhite[2] = (1.0F - whitePoint[0] - whitePoint[1])
-		      / whitePoint[1] * refWhite[1];
-	if (TIFFCIELabToRGBInit(img->cielab, &display_sRGB, refWhite) < 0) {
-		TIFFErrorExt(img->tif->tif_clientdata, module,
-		    "Failed to initialize CIE L*a*b*->RGB conversion state.");
-		_TIFFfree(img->cielab);
-		return NULL;
-	}
-
-	return putcontig8bitCIELab;
+    refWhite[1] = 100.0F;
+    refWhite[0] = whitePoint[0] / whitePoint[1] * refWhite[1];
+    refWhite[2] =
+        (1.0F - whitePoint[0] - whitePoint[1]) / whitePoint[1] * refWhite[1];
+    if (TIFFCIELabToRGBInit(img->cielab, &display_sRGB, refWhite) < 0)
+    {
+        TIFFErrorExtR(img->tif, module,
+                      "Failed to initialize CIE L*a*b*->RGB conversion state.");
+        _TIFFfreeExt(img->tif, img->cielab);
+        return NULL;
+    }
+
+    if (img->bitspersample == 8)
+        return putcontig8bitCIELab8;
+    else if (img->bitspersample == 16)
+        return putcontig8bitCIELab16;
+    return NULL;
 }
 
 /*
@@ -2405,56 +2659,62 @@ initCIELabConversion(TIFFRGBAImage* img)
  * pixel values simply by indexing into the table with one
  * number.
  */
-static int
-makebwmap(TIFFRGBAImage* img)
+static int makebwmap(TIFFRGBAImage *img)
 {
-    TIFFRGBValue* Map = img->Map;
+    TIFFRGBValue *Map = img->Map;
     int bitspersample = img->bitspersample;
     int nsamples = 8 / bitspersample;
     int i;
-    uint32* p;
+    uint32_t *p;
 
-    if( nsamples == 0 )
+    if (nsamples == 0)
         nsamples = 1;
 
-    img->BWmap = (uint32**) _TIFFmalloc(
-	256*sizeof (uint32 *)+(256*nsamples*sizeof(uint32)));
-    if (img->BWmap == NULL) {
-		TIFFErrorExt(img->tif->tif_clientdata, TIFFFileName(img->tif), "No space for B&W mapping table");
-		return (0);
-    }
-    p = (uint32*)(img->BWmap + 256);
-    for (i = 0; i < 256; i++) {
-	TIFFRGBValue c;
-	img->BWmap[i] = p;
-	switch (bitspersample) {
-#define	GREY(x)	c = Map[x]; *p++ = PACK(c,c,c);
-	case 1:
-	    GREY(i>>7);
-	    GREY((i>>6)&1);
-	    GREY((i>>5)&1);
-	    GREY((i>>4)&1);
-	    GREY((i>>3)&1);
-	    GREY((i>>2)&1);
-	    GREY((i>>1)&1);
-	    GREY(i&1);
-	    break;
-	case 2:
-	    GREY(i>>6);
-	    GREY((i>>4)&3);
-	    GREY((i>>2)&3);
-	    GREY(i&3);
-	    break;
-	case 4:
-	    GREY(i>>4);
-	    GREY(i&0xf);
-	    break;
-	case 8:
-        case 16:
-	    GREY(i);
-	    break;
-	}
-#undef	GREY
+    img->BWmap = (uint32_t **)_TIFFmallocExt(
+        img->tif,
+        256 * sizeof(uint32_t *) + (256 * nsamples * sizeof(uint32_t)));
+    if (img->BWmap == NULL)
+    {
+        TIFFErrorExtR(img->tif, TIFFFileName(img->tif),
+                      "No space for B&W mapping table");
+        return (0);
+    }
+    p = (uint32_t *)(img->BWmap + 256);
+    for (i = 0; i < 256; i++)
+    {
+        TIFFRGBValue c;
+        img->BWmap[i] = p;
+        switch (bitspersample)
+        {
+#define GREY(x)                                                                \
+    c = Map[x];                                                                \
+    *p++ = PACK(c, c, c);
+            case 1:
+                GREY(i >> 7);
+                GREY((i >> 6) & 1);
+                GREY((i >> 5) & 1);
+                GREY((i >> 4) & 1);
+                GREY((i >> 3) & 1);
+                GREY((i >> 2) & 1);
+                GREY((i >> 1) & 1);
+                GREY(i & 1);
+                break;
+            case 2:
+                GREY(i >> 6);
+                GREY((i >> 4) & 3);
+                GREY((i >> 2) & 3);
+                GREY(i & 3);
+                break;
+            case 4:
+                GREY(i >> 4);
+                GREY(i & 0xf);
+                break;
+            case 8:
+            case 16:
+                GREY(i);
+                break;
+        }
+#undef GREY
     }
     return (1);
 }
@@ -2463,75 +2723,79 @@ makebwmap(TIFFRGBAImage* img)
  * Construct a mapping table to convert from the range
  * of the data samples to [0,255] --for display.  This
  * process also handles inverting B&W images when needed.
- */ 
-static int
-setupMap(TIFFRGBAImage* img)
+ */
+static int setupMap(TIFFRGBAImage *img)
 {
-    int32 x, range;
+    int32_t x, range;
+
+    range = (int32_t)((1L << img->bitspersample) - 1);
 
-    range = (int32)((1L<<img->bitspersample)-1);
-    
     /* treat 16 bit the same as eight bit */
-    if( img->bitspersample == 16 )
-        range = (int32) 255;
-
-    img->Map = (TIFFRGBValue*) _TIFFmalloc((range+1) * sizeof (TIFFRGBValue));
-    if (img->Map == NULL) {
-		TIFFErrorExt(img->tif->tif_clientdata, TIFFFileName(img->tif),
-			"No space for photometric conversion table");
-		return (0);
-    }
-    if (img->photometric == PHOTOMETRIC_MINISWHITE) {
-	for (x = 0; x <= range; x++)
-	    img->Map[x] = (TIFFRGBValue) (((range - x) * 255) / range);
-    } else {
-	for (x = 0; x <= range; x++)
-	    img->Map[x] = (TIFFRGBValue) ((x * 255) / range);
+    if (img->bitspersample == 16)
+        range = (int32_t)255;
+
+    img->Map = (TIFFRGBValue *)_TIFFmallocExt(
+        img->tif, (range + 1) * sizeof(TIFFRGBValue));
+    if (img->Map == NULL)
+    {
+        TIFFErrorExtR(img->tif, TIFFFileName(img->tif),
+                      "No space for photometric conversion table");
+        return (0);
+    }
+    if (img->photometric == PHOTOMETRIC_MINISWHITE)
+    {
+        for (x = 0; x <= range; x++)
+            img->Map[x] = (TIFFRGBValue)(((range - x) * 255) / range);
+    }
+    else
+    {
+        for (x = 0; x <= range; x++)
+            img->Map[x] = (TIFFRGBValue)((x * 255) / range);
     }
     if (img->bitspersample <= 16 &&
-	(img->photometric == PHOTOMETRIC_MINISBLACK ||
-	 img->photometric == PHOTOMETRIC_MINISWHITE)) {
-	/*
-	 * Use photometric mapping table to construct
-	 * unpacking tables for samples <= 8 bits.
-	 */
-	if (!makebwmap(img))
-	    return (0);
-	/* no longer need Map, free it */
-	_TIFFfree(img->Map);
-	img->Map = NULL;
+        (img->photometric == PHOTOMETRIC_MINISBLACK ||
+         img->photometric == PHOTOMETRIC_MINISWHITE))
+    {
+        /*
+         * Use photometric mapping table to construct
+         * unpacking tables for samples <= 8 bits.
+         */
+        if (!makebwmap(img))
+            return (0);
+        /* no longer need Map, free it */
+        _TIFFfreeExt(img->tif, img->Map);
+        img->Map = NULL;
     }
     return (1);
 }
 
-static int
-checkcmap(TIFFRGBAImage* img)
+static int checkcmap(TIFFRGBAImage *img)
 {
-    uint16* r = img->redcmap;
-    uint16* g = img->greencmap;
-    uint16* b = img->bluecmap;
-    long n = 1L<<img->bitspersample;
+    uint16_t *r = img->redcmap;
+    uint16_t *g = img->greencmap;
+    uint16_t *b = img->bluecmap;
+    long n = 1L << img->bitspersample;
 
     while (n-- > 0)
-	if (*r++ >= 256 || *g++ >= 256 || *b++ >= 256)
-	    return (16);
+        if (*r++ >= 256 || *g++ >= 256 || *b++ >= 256)
+            return (16);
     return (8);
 }
 
-static void
-cvtcmap(TIFFRGBAImage* img)
+static void cvtcmap(TIFFRGBAImage *img)
 {
-    uint16* r = img->redcmap;
-    uint16* g = img->greencmap;
-    uint16* b = img->bluecmap;
+    uint16_t *r = img->redcmap;
+    uint16_t *g = img->greencmap;
+    uint16_t *b = img->bluecmap;
     long i;
 
-    for (i = (1L<<img->bitspersample)-1; i >= 0; i--) {
-#define	CVT(x)		((uint16)((x)>>8))
-	r[i] = CVT(r[i]);
-	g[i] = CVT(g[i]);
-	b[i] = CVT(b[i]);
-#undef	CVT
+    for (i = (1L << img->bitspersample) - 1; i >= 0; i--)
+    {
+#define CVT(x) ((uint16_t)((x) >> 8))
+        r[i] = CVT(r[i]);
+        g[i] = CVT(g[i]);
+        b[i] = CVT(b[i]);
+#undef CVT
     }
 }
 
@@ -2542,93 +2806,100 @@ cvtcmap(TIFFRGBAImage* img)
  * pixel values simply by indexing into the table with one
  * number.
  */
-static int
-makecmap(TIFFRGBAImage* img)
+static int makecmap(TIFFRGBAImage *img)
 {
     int bitspersample = img->bitspersample;
     int nsamples = 8 / bitspersample;
-    uint16* r = img->redcmap;
-    uint16* g = img->greencmap;
-    uint16* b = img->bluecmap;
-    uint32 *p;
+    uint16_t *r = img->redcmap;
+    uint16_t *g = img->greencmap;
+    uint16_t *b = img->bluecmap;
+    uint32_t *p;
     int i;
 
-    img->PALmap = (uint32**) _TIFFmalloc(
-	256*sizeof (uint32 *)+(256*nsamples*sizeof(uint32)));
-    if (img->PALmap == NULL) {
-		TIFFErrorExt(img->tif->tif_clientdata, TIFFFileName(img->tif), "No space for Palette mapping table");
-		return (0);
-	}
-    p = (uint32*)(img->PALmap + 256);
-    for (i = 0; i < 256; i++) {
-	TIFFRGBValue c;
-	img->PALmap[i] = p;
-#define	CMAP(x)	c = (TIFFRGBValue) x; *p++ = PACK(r[c]&0xff, g[c]&0xff, b[c]&0xff);
-	switch (bitspersample) {
-	case 1:
-	    CMAP(i>>7);
-	    CMAP((i>>6)&1);
-	    CMAP((i>>5)&1);
-	    CMAP((i>>4)&1);
-	    CMAP((i>>3)&1);
-	    CMAP((i>>2)&1);
-	    CMAP((i>>1)&1);
-	    CMAP(i&1);
-	    break;
-	case 2:
-	    CMAP(i>>6);
-	    CMAP((i>>4)&3);
-	    CMAP((i>>2)&3);
-	    CMAP(i&3);
-	    break;
-	case 4:
-	    CMAP(i>>4);
-	    CMAP(i&0xf);
-	    break;
-	case 8:
-	    CMAP(i);
-	    break;
-	}
+    img->PALmap = (uint32_t **)_TIFFmallocExt(
+        img->tif,
+        256 * sizeof(uint32_t *) + (256 * nsamples * sizeof(uint32_t)));
+    if (img->PALmap == NULL)
+    {
+        TIFFErrorExtR(img->tif, TIFFFileName(img->tif),
+                      "No space for Palette mapping table");
+        return (0);
+    }
+    p = (uint32_t *)(img->PALmap + 256);
+    for (i = 0; i < 256; i++)
+    {
+        TIFFRGBValue c;
+        img->PALmap[i] = p;
+#define CMAP(x)                                                                \
+    c = (TIFFRGBValue)x;                                                       \
+    *p++ = PACK(r[c] & 0xff, g[c] & 0xff, b[c] & 0xff);
+        switch (bitspersample)
+        {
+            case 1:
+                CMAP(i >> 7);
+                CMAP((i >> 6) & 1);
+                CMAP((i >> 5) & 1);
+                CMAP((i >> 4) & 1);
+                CMAP((i >> 3) & 1);
+                CMAP((i >> 2) & 1);
+                CMAP((i >> 1) & 1);
+                CMAP(i & 1);
+                break;
+            case 2:
+                CMAP(i >> 6);
+                CMAP((i >> 4) & 3);
+                CMAP((i >> 2) & 3);
+                CMAP(i & 3);
+                break;
+            case 4:
+                CMAP(i >> 4);
+                CMAP(i & 0xf);
+                break;
+            case 8:
+                CMAP(i);
+                break;
+        }
 #undef CMAP
     }
     return (1);
 }
 
-/* 
+/*
  * Construct any mapping table used
  * by the associated put routine.
  */
-static int
-buildMap(TIFFRGBAImage* img)
-{
-    switch (img->photometric) {
-    case PHOTOMETRIC_RGB:
-    case PHOTOMETRIC_YCBCR:
-    case PHOTOMETRIC_SEPARATED:
-	if (img->bitspersample == 8)
-	    break;
-	/* fall through... */
-    case PHOTOMETRIC_MINISBLACK:
-    case PHOTOMETRIC_MINISWHITE:
-	if (!setupMap(img))
-	    return (0);
-	break;
-    case PHOTOMETRIC_PALETTE:
-	/*
-	 * Convert 16-bit colormap to 8-bit (unless it looks
-	 * like an old-style 8-bit colormap).
-	 */
-	if (checkcmap(img) == 16)
-	    cvtcmap(img);
-	else
-	    TIFFWarningExt(img->tif->tif_clientdata, TIFFFileName(img->tif), "Assuming 8-bit colormap");
-	/*
-	 * Use mapping table and colormap to construct
-	 * unpacking tables for samples < 8 bits.
-	 */
-	if (img->bitspersample <= 8 && !makecmap(img))
-	    return (0);
-	break;
+static int buildMap(TIFFRGBAImage *img)
+{
+    switch (img->photometric)
+    {
+        case PHOTOMETRIC_RGB:
+        case PHOTOMETRIC_YCBCR:
+        case PHOTOMETRIC_SEPARATED:
+            if (img->bitspersample == 8)
+                break;
+            /* fall through... */
+        case PHOTOMETRIC_MINISBLACK:
+        case PHOTOMETRIC_MINISWHITE:
+            if (!setupMap(img))
+                return (0);
+            break;
+        case PHOTOMETRIC_PALETTE:
+            /*
+             * Convert 16-bit colormap to 8-bit (unless it looks
+             * like an old-style 8-bit colormap).
+             */
+            if (checkcmap(img) == 16)
+                cvtcmap(img);
+            else
+                TIFFWarningExtR(img->tif, TIFFFileName(img->tif),
+                                "Assuming 8-bit colormap");
+            /*
+             * Use mapping table and colormap to construct
+             * unpacking tables for samples < 8 bits.
+             */
+            if (img->bitspersample <= 8 && !makecmap(img))
+                return (0);
+            break;
     }
     return (1);
 }
@@ -2636,153 +2907,162 @@ buildMap(TIFFRGBAImage* img)
 /*
  * Select the appropriate conversion routine for packed data.
  */
-static int
-PickContigCase(TIFFRGBAImage* img)
-{
-	img->get = TIFFIsTiled(img->tif) ? gtTileContig : gtStripContig;
-	img->put.contig = NULL;
-	switch (img->photometric) {
-		case PHOTOMETRIC_RGB:
-			switch (img->bitspersample) {
-				case 8:
-					if (img->alpha == EXTRASAMPLE_ASSOCALPHA &&
-						img->samplesperpixel >= 4)
-						img->put.contig = putRGBAAcontig8bittile;
-					else if (img->alpha == EXTRASAMPLE_UNASSALPHA &&
-							 img->samplesperpixel >= 4)
-					{
-						if (BuildMapUaToAa(img))
-							img->put.contig = putRGBUAcontig8bittile;
-					}
-					else if( img->samplesperpixel >= 3 )
-						img->put.contig = putRGBcontig8bittile;
-					break;
-				case 16:
-					if (img->alpha == EXTRASAMPLE_ASSOCALPHA &&
-						img->samplesperpixel >=4 )
-					{
-						if (BuildMapBitdepth16To8(img))
-							img->put.contig = putRGBAAcontig16bittile;
-					}
-					else if (img->alpha == EXTRASAMPLE_UNASSALPHA &&
-							 img->samplesperpixel >=4 )
-					{
-						if (BuildMapBitdepth16To8(img) &&
-						    BuildMapUaToAa(img))
-							img->put.contig = putRGBUAcontig16bittile;
-					}
-					else if( img->samplesperpixel >=3 )
-					{
-						if (BuildMapBitdepth16To8(img))
-							img->put.contig = putRGBcontig16bittile;
-					}
-					break;
-			}
-			break;
-		case PHOTOMETRIC_SEPARATED:
-			if (img->samplesperpixel >=4 && buildMap(img)) {
-				if (img->bitspersample == 8) {
-					if (!img->Map)
-						img->put.contig = putRGBcontig8bitCMYKtile;
-					else
-						img->put.contig = putRGBcontig8bitCMYKMaptile;
-				}
-			}
-			break;
-		case PHOTOMETRIC_PALETTE:
-			if (buildMap(img)) {
-				switch (img->bitspersample) {
-					case 8:
-						img->put.contig = put8bitcmaptile;
-						break;
-					case 4:
-						img->put.contig = put4bitcmaptile;
-						break;
-					case 2:
-						img->put.contig = put2bitcmaptile;
-						break;
-					case 1:
-						img->put.contig = put1bitcmaptile;
-						break;
-				}
-			}
-			break;
-		case PHOTOMETRIC_MINISWHITE:
-		case PHOTOMETRIC_MINISBLACK:
-			if (buildMap(img)) {
-				switch (img->bitspersample) {
-					case 16:
-						img->put.contig = put16bitbwtile;
-						break;
-					case 8:
-						if (img->alpha && img->samplesperpixel == 2)
-							img->put.contig = putagreytile;
-						else
-							img->put.contig = putgreytile;
-						break;
-					case 4:
-						img->put.contig = put4bitbwtile;
-						break;
-					case 2:
-						img->put.contig = put2bitbwtile;
-						break;
-					case 1:
-						img->put.contig = put1bitbwtile;
-						break;
-				}
-			}
-			break;
-		case PHOTOMETRIC_YCBCR:
-			if ((img->bitspersample==8) && (img->samplesperpixel==3))
-			{
-				if (initYCbCrConversion(img)!=0)
-				{
-					/*
-					 * The 6.0 spec says that subsampling must be
-					 * one of 1, 2, or 4, and that vertical subsampling
-					 * must always be <= horizontal subsampling; so
-					 * there are only a few possibilities and we just
-					 * enumerate the cases.
-					 * Joris: added support for the [1,2] case, nonetheless, to accommodate
-					 * some OJPEG files
-					 */
-					uint16 SubsamplingHor;
-					uint16 SubsamplingVer;
-					TIFFGetFieldDefaulted(img->tif, TIFFTAG_YCBCRSUBSAMPLING, &SubsamplingHor, &SubsamplingVer);
-					switch ((SubsamplingHor<<4)|SubsamplingVer) {
-						case 0x44:
-							img->put.contig = putcontig8bitYCbCr44tile;
-							break;
-						case 0x42:
-							img->put.contig = putcontig8bitYCbCr42tile;
-							break;
-						case 0x41:
-							img->put.contig = putcontig8bitYCbCr41tile;
-							break;
-						case 0x22:
-							img->put.contig = putcontig8bitYCbCr22tile;
-							break;
-						case 0x21:
-							img->put.contig = putcontig8bitYCbCr21tile;
-							break;
-						case 0x12:
-							img->put.contig = putcontig8bitYCbCr12tile;
-							break;
-						case 0x11:
-							img->put.contig = putcontig8bitYCbCr11tile;
-							break;
-					}
-				}
-			}
-			break;
-		case PHOTOMETRIC_CIELAB:
-			if (img->samplesperpixel == 3 && buildMap(img)) {
-				if (img->bitspersample == 8)
-					img->put.contig = initCIELabConversion(img);
-				break;
-			}
-	}
-	return ((img->get!=NULL) && (img->put.contig!=NULL));
+static int PickContigCase(TIFFRGBAImage *img)
+{
+    img->get = TIFFIsTiled(img->tif) ? gtTileContig : gtStripContig;
+    img->put.contig = NULL;
+    switch (img->photometric)
+    {
+        case PHOTOMETRIC_RGB:
+            switch (img->bitspersample)
+            {
+                case 8:
+                    if (img->alpha == EXTRASAMPLE_ASSOCALPHA &&
+                        img->samplesperpixel >= 4)
+                        img->put.contig = putRGBAAcontig8bittile;
+                    else if (img->alpha == EXTRASAMPLE_UNASSALPHA &&
+                             img->samplesperpixel >= 4)
+                    {
+                        if (BuildMapUaToAa(img))
+                            img->put.contig = putRGBUAcontig8bittile;
+                    }
+                    else if (img->samplesperpixel >= 3)
+                        img->put.contig = putRGBcontig8bittile;
+                    break;
+                case 16:
+                    if (img->alpha == EXTRASAMPLE_ASSOCALPHA &&
+                        img->samplesperpixel >= 4)
+                    {
+                        if (BuildMapBitdepth16To8(img))
+                            img->put.contig = putRGBAAcontig16bittile;
+                    }
+                    else if (img->alpha == EXTRASAMPLE_UNASSALPHA &&
+                             img->samplesperpixel >= 4)
+                    {
+                        if (BuildMapBitdepth16To8(img) && BuildMapUaToAa(img))
+                            img->put.contig = putRGBUAcontig16bittile;
+                    }
+                    else if (img->samplesperpixel >= 3)
+                    {
+                        if (BuildMapBitdepth16To8(img))
+                            img->put.contig = putRGBcontig16bittile;
+                    }
+                    break;
+            }
+            break;
+        case PHOTOMETRIC_SEPARATED:
+            if (img->samplesperpixel >= 4 && buildMap(img))
+            {
+                if (img->bitspersample == 8)
+                {
+                    if (!img->Map)
+                        img->put.contig = putRGBcontig8bitCMYKtile;
+                    else
+                        img->put.contig = putRGBcontig8bitCMYKMaptile;
+                }
+            }
+            break;
+        case PHOTOMETRIC_PALETTE:
+            if (buildMap(img))
+            {
+                switch (img->bitspersample)
+                {
+                    case 8:
+                        img->put.contig = put8bitcmaptile;
+                        break;
+                    case 4:
+                        img->put.contig = put4bitcmaptile;
+                        break;
+                    case 2:
+                        img->put.contig = put2bitcmaptile;
+                        break;
+                    case 1:
+                        img->put.contig = put1bitcmaptile;
+                        break;
+                }
+            }
+            break;
+        case PHOTOMETRIC_MINISWHITE:
+        case PHOTOMETRIC_MINISBLACK:
+            if (buildMap(img))
+            {
+                switch (img->bitspersample)
+                {
+                    case 16:
+                        img->put.contig = put16bitbwtile;
+                        break;
+                    case 8:
+                        if (img->alpha && img->samplesperpixel == 2)
+                            img->put.contig = putagreytile;
+                        else
+                            img->put.contig = putgreytile;
+                        break;
+                    case 4:
+                        img->put.contig = put4bitbwtile;
+                        break;
+                    case 2:
+                        img->put.contig = put2bitbwtile;
+                        break;
+                    case 1:
+                        img->put.contig = put1bitbwtile;
+                        break;
+                }
+            }
+            break;
+        case PHOTOMETRIC_YCBCR:
+            if ((img->bitspersample == 8) && (img->samplesperpixel == 3))
+            {
+                if (initYCbCrConversion(img) != 0)
+                {
+                    /*
+                     * The 6.0 spec says that subsampling must be
+                     * one of 1, 2, or 4, and that vertical subsampling
+                     * must always be <= horizontal subsampling; so
+                     * there are only a few possibilities and we just
+                     * enumerate the cases.
+                     * Joris: added support for the [1,2] case, nonetheless, to
+                     * accommodate some OJPEG files
+                     */
+                    uint16_t SubsamplingHor;
+                    uint16_t SubsamplingVer;
+                    TIFFGetFieldDefaulted(img->tif, TIFFTAG_YCBCRSUBSAMPLING,
+                                          &SubsamplingHor, &SubsamplingVer);
+                    switch ((SubsamplingHor << 4) | SubsamplingVer)
+                    {
+                        case 0x44:
+                            img->put.contig = putcontig8bitYCbCr44tile;
+                            break;
+                        case 0x42:
+                            img->put.contig = putcontig8bitYCbCr42tile;
+                            break;
+                        case 0x41:
+                            img->put.contig = putcontig8bitYCbCr41tile;
+                            break;
+                        case 0x22:
+                            img->put.contig = putcontig8bitYCbCr22tile;
+                            break;
+                        case 0x21:
+                            img->put.contig = putcontig8bitYCbCr21tile;
+                            break;
+                        case 0x12:
+                            img->put.contig = putcontig8bitYCbCr12tile;
+                            break;
+                        case 0x11:
+                            img->put.contig = putcontig8bitYCbCr11tile;
+                            break;
+                    }
+                }
+            }
+            break;
+        case PHOTOMETRIC_CIELAB:
+            if (img->samplesperpixel == 3 && buildMap(img))
+            {
+                if (img->bitspersample == 8 || img->bitspersample == 16)
+                    img->put.contig = initCIELabConversion(img);
+                break;
+            }
+    }
+    return ((img->get != NULL) && (img->put.contig != NULL));
 }
 
 /*
@@ -2791,116 +3071,117 @@ PickContigCase(TIFFRGBAImage* img)
  * NB: we assume that unpacked single channel data is directed
  *	 to the "packed routines.
  */
-static int
-PickSeparateCase(TIFFRGBAImage* img)
-{
-	img->get = TIFFIsTiled(img->tif) ? gtTileSeparate : gtStripSeparate;
-	img->put.separate = NULL;
-	switch (img->photometric) {
-	case PHOTOMETRIC_MINISWHITE:
-	case PHOTOMETRIC_MINISBLACK:
-		/* greyscale images processed pretty much as RGB by gtTileSeparate */
-	case PHOTOMETRIC_RGB:
-		switch (img->bitspersample) {
-		case 8:
-			if (img->alpha == EXTRASAMPLE_ASSOCALPHA)
-				img->put.separate = putRGBAAseparate8bittile;
-			else if (img->alpha == EXTRASAMPLE_UNASSALPHA)
-			{
-				if (BuildMapUaToAa(img))
-					img->put.separate = putRGBUAseparate8bittile;
-			}
-			else
-				img->put.separate = putRGBseparate8bittile;
-			break;
-		case 16:
-			if (img->alpha == EXTRASAMPLE_ASSOCALPHA)
-			{
-				if (BuildMapBitdepth16To8(img))
-					img->put.separate = putRGBAAseparate16bittile;
-			}
-			else if (img->alpha == EXTRASAMPLE_UNASSALPHA)
-			{
-				if (BuildMapBitdepth16To8(img) &&
-				    BuildMapUaToAa(img))
-					img->put.separate = putRGBUAseparate16bittile;
-			}
-			else
-			{
-				if (BuildMapBitdepth16To8(img))
-					img->put.separate = putRGBseparate16bittile;
-			}
-			break;
-		}
-		break;
-	case PHOTOMETRIC_SEPARATED:
-		if (img->bitspersample == 8 && img->samplesperpixel == 4)
-		{
-			img->alpha = 1; // Not alpha, but seems like the only way to get 4th band
-			img->put.separate = putCMYKseparate8bittile;
-		}
-		break;
-	case PHOTOMETRIC_YCBCR:
-		if ((img->bitspersample==8) && (img->samplesperpixel==3))
-		{
-			if (initYCbCrConversion(img)!=0)
-			{
-				uint16 hs, vs;
-				TIFFGetFieldDefaulted(img->tif, TIFFTAG_YCBCRSUBSAMPLING, &hs, &vs);
-				switch ((hs<<4)|vs) {
-				case 0x11:
-					img->put.separate = putseparate8bitYCbCr11tile;
-					break;
-					/* TODO: add other cases here */
-				}
-			}
-		}
-		break;
-	}
-	return ((img->get!=NULL) && (img->put.separate!=NULL));
-}
-
-static int
-BuildMapUaToAa(TIFFRGBAImage* img)
-{
-	static const char module[]="BuildMapUaToAa";
-	uint8* m;
-	uint16 na,nv;
-	assert(img->UaToAa==NULL);
-	img->UaToAa=_TIFFmalloc(65536);
-	if (img->UaToAa==NULL)
-	{
-		TIFFErrorExt(img->tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	m=img->UaToAa;
-	for (na=0; na<256; na++)
-	{
-		for (nv=0; nv<256; nv++)
-			*m++=(uint8)((nv*na+127)/255);
-	}
-	return(1);
-}
-
-static int
-BuildMapBitdepth16To8(TIFFRGBAImage* img)
-{
-	static const char module[]="BuildMapBitdepth16To8";
-	uint8* m;
-	uint32 n;
-	assert(img->Bitdepth16To8==NULL);
-	img->Bitdepth16To8=_TIFFmalloc(65536);
-	if (img->Bitdepth16To8==NULL)
-	{
-		TIFFErrorExt(img->tif->tif_clientdata,module,"Out of memory");
-		return(0);
-	}
-	m=img->Bitdepth16To8;
-	for (n=0; n<65536; n++)
-		*m++=(uint8)((n+128)/257);
-	return(1);
+static int PickSeparateCase(TIFFRGBAImage *img)
+{
+    img->get = TIFFIsTiled(img->tif) ? gtTileSeparate : gtStripSeparate;
+    img->put.separate = NULL;
+    switch (img->photometric)
+    {
+        case PHOTOMETRIC_MINISWHITE:
+        case PHOTOMETRIC_MINISBLACK:
+            /* greyscale images processed pretty much as RGB by gtTileSeparate
+             */
+        case PHOTOMETRIC_RGB:
+            switch (img->bitspersample)
+            {
+                case 8:
+                    if (img->alpha == EXTRASAMPLE_ASSOCALPHA)
+                        img->put.separate = putRGBAAseparate8bittile;
+                    else if (img->alpha == EXTRASAMPLE_UNASSALPHA)
+                    {
+                        if (BuildMapUaToAa(img))
+                            img->put.separate = putRGBUAseparate8bittile;
+                    }
+                    else
+                        img->put.separate = putRGBseparate8bittile;
+                    break;
+                case 16:
+                    if (img->alpha == EXTRASAMPLE_ASSOCALPHA)
+                    {
+                        if (BuildMapBitdepth16To8(img))
+                            img->put.separate = putRGBAAseparate16bittile;
+                    }
+                    else if (img->alpha == EXTRASAMPLE_UNASSALPHA)
+                    {
+                        if (BuildMapBitdepth16To8(img) && BuildMapUaToAa(img))
+                            img->put.separate = putRGBUAseparate16bittile;
+                    }
+                    else
+                    {
+                        if (BuildMapBitdepth16To8(img))
+                            img->put.separate = putRGBseparate16bittile;
+                    }
+                    break;
+            }
+            break;
+        case PHOTOMETRIC_SEPARATED:
+            if (img->bitspersample == 8 && img->samplesperpixel == 4)
+            {
+                img->alpha =
+                    1; // Not alpha, but seems like the only way to get 4th band
+                img->put.separate = putCMYKseparate8bittile;
+            }
+            break;
+        case PHOTOMETRIC_YCBCR:
+            if ((img->bitspersample == 8) && (img->samplesperpixel == 3))
+            {
+                if (initYCbCrConversion(img) != 0)
+                {
+                    uint16_t hs, vs;
+                    TIFFGetFieldDefaulted(img->tif, TIFFTAG_YCBCRSUBSAMPLING,
+                                          &hs, &vs);
+                    switch ((hs << 4) | vs)
+                    {
+                        case 0x11:
+                            img->put.separate = putseparate8bitYCbCr11tile;
+                            break;
+                            /* TODO: add other cases here */
+                    }
+                }
+            }
+            break;
+    }
+    return ((img->get != NULL) && (img->put.separate != NULL));
+}
+
+static int BuildMapUaToAa(TIFFRGBAImage *img)
+{
+    static const char module[] = "BuildMapUaToAa";
+    uint8_t *m;
+    uint16_t na, nv;
+    assert(img->UaToAa == NULL);
+    img->UaToAa = _TIFFmallocExt(img->tif, 65536);
+    if (img->UaToAa == NULL)
+    {
+        TIFFErrorExtR(img->tif, module, "Out of memory");
+        return (0);
+    }
+    m = img->UaToAa;
+    for (na = 0; na < 256; na++)
+    {
+        for (nv = 0; nv < 256; nv++)
+            *m++ = (uint8_t)((nv * na + 127) / 255);
+    }
+    return (1);
 }
 
+static int BuildMapBitdepth16To8(TIFFRGBAImage *img)
+{
+    static const char module[] = "BuildMapBitdepth16To8";
+    uint8_t *m;
+    uint32_t n;
+    assert(img->Bitdepth16To8 == NULL);
+    img->Bitdepth16To8 = _TIFFmallocExt(img->tif, 65536);
+    if (img->Bitdepth16To8 == NULL)
+    {
+        TIFFErrorExtR(img->tif, module, "Out of memory");
+        return (0);
+    }
+    m = img->Bitdepth16To8;
+    for (n = 0; n < 65536; n++)
+        *m++ = (uint8_t)((n + 128) / 257);
+    return (1);
+}
 
 /*
  * Read a whole strip off data from the file, and convert to RGBA form.
@@ -2909,56 +3190,59 @@ BuildMapBitdepth16To8(TIFFRGBAImage* img)
  * organized in bottom to top form.
  */
 
-
-int
-TIFFReadRGBAStrip(TIFF* tif, uint32 row, uint32 * raster )
+int TIFFReadRGBAStrip(TIFF *tif, uint32_t row, uint32_t *raster)
 
 {
-    return TIFFReadRGBAStripExt(tif, row, raster, 0 );
+    return TIFFReadRGBAStripExt(tif, row, raster, 0);
 }
 
-int
-TIFFReadRGBAStripExt(TIFF* tif, uint32 row, uint32 * raster, int stop_on_error)
+int TIFFReadRGBAStripExt(TIFF *tif, uint32_t row, uint32_t *raster,
+                         int stop_on_error)
 
 {
-    char 	emsg[1024] = "";
+    char emsg[EMSG_BUF_SIZE] = "";
     TIFFRGBAImage img;
-    int 	ok;
-    uint32	rowsperstrip, rows_to_read;
+    int ok;
+    uint32_t rowsperstrip, rows_to_read;
 
-    if( TIFFIsTiled( tif ) )
+    if (TIFFIsTiled(tif))
     {
-		TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif),
-                  "Can't use TIFFReadRGBAStrip() with tiled file.");
-	return (0);
+        TIFFErrorExtR(tif, TIFFFileName(tif),
+                      "Can't use TIFFReadRGBAStrip() with tiled file.");
+        return (0);
     }
-    
+
     TIFFGetFieldDefaulted(tif, TIFFTAG_ROWSPERSTRIP, &rowsperstrip);
-    if( (row % rowsperstrip) != 0 )
+    if ((row % rowsperstrip) != 0)
     {
-		TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif),
-				"Row passed to TIFFReadRGBAStrip() must be first in a strip.");
-		return (0);
+        TIFFErrorExtR(
+            tif, TIFFFileName(tif),
+            "Row passed to TIFFReadRGBAStrip() must be first in a strip.");
+        return (0);
     }
 
-    if (TIFFRGBAImageOK(tif, emsg) && TIFFRGBAImageBegin(&img, tif, stop_on_error, emsg)) {
+    if (TIFFRGBAImageOK(tif, emsg) &&
+        TIFFRGBAImageBegin(&img, tif, stop_on_error, emsg))
+    {
 
         img.row_offset = row;
         img.col_offset = 0;
 
-        if( row + rowsperstrip > img.height )
+        if (row + rowsperstrip > img.height)
             rows_to_read = img.height - row;
         else
             rows_to_read = rowsperstrip;
-        
-	ok = TIFFRGBAImageGet(&img, raster, img.width, rows_to_read );
-        
-	TIFFRGBAImageEnd(&img);
-    } else {
-		TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "%s", emsg);
-		ok = 0;
-    }
-    
+
+        ok = TIFFRGBAImageGet(&img, raster, img.width, rows_to_read);
+
+        TIFFRGBAImageEnd(&img);
+    }
+    else
+    {
+        TIFFErrorExtR(tif, TIFFFileName(tif), "%s", emsg);
+        ok = 0;
+    }
+
     return (ok);
 }
 
@@ -2968,54 +3252,53 @@ TIFFReadRGBAStripExt(TIFF* tif, uint32 row, uint32 * raster, int stop_on_error)
  * and may include zeroed areas if the tile extends off the image.
  */
 
-int
-TIFFReadRGBATile(TIFF* tif, uint32 col, uint32 row, uint32 * raster)
+int TIFFReadRGBATile(TIFF *tif, uint32_t col, uint32_t row, uint32_t *raster)
 
 {
-    return TIFFReadRGBATileExt(tif, col, row, raster, 0 );
+    return TIFFReadRGBATileExt(tif, col, row, raster, 0);
 }
 
-
-int
-TIFFReadRGBATileExt(TIFF* tif, uint32 col, uint32 row, uint32 * raster, int stop_on_error )
+int TIFFReadRGBATileExt(TIFF *tif, uint32_t col, uint32_t row, uint32_t *raster,
+                        int stop_on_error)
 {
-    char 	emsg[1024] = "";
+    char emsg[EMSG_BUF_SIZE] = "";
     TIFFRGBAImage img;
-    int 	ok;
-    uint32	tile_xsize, tile_ysize;
-    uint32	read_xsize, read_ysize;
-    uint32	i_row;
+    int ok;
+    uint32_t tile_xsize, tile_ysize;
+    uint32_t read_xsize, read_ysize;
+    uint32_t i_row;
 
     /*
      * Verify that our request is legal - on a tile file, and on a
      * tile boundary.
      */
-    
-    if( !TIFFIsTiled( tif ) )
+
+    if (!TIFFIsTiled(tif))
     {
-		TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif),
-				  "Can't use TIFFReadRGBATile() with striped file.");
-		return (0);
+        TIFFErrorExtR(tif, TIFFFileName(tif),
+                      "Can't use TIFFReadRGBATile() with striped file.");
+        return (0);
     }
-    
+
     TIFFGetFieldDefaulted(tif, TIFFTAG_TILEWIDTH, &tile_xsize);
     TIFFGetFieldDefaulted(tif, TIFFTAG_TILELENGTH, &tile_ysize);
-    if( (col % tile_xsize) != 0 || (row % tile_ysize) != 0 )
+    if ((col % tile_xsize) != 0 || (row % tile_ysize) != 0)
     {
-		TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif),
-                  "Row/col passed to TIFFReadRGBATile() must be top"
-                  "left corner of a tile.");
-	return (0);
+        TIFFErrorExtR(tif, TIFFFileName(tif),
+                      "Row/col passed to TIFFReadRGBATile() must be top"
+                      "left corner of a tile.");
+        return (0);
     }
 
     /*
      * Setup the RGBA reader.
      */
-    
-    if (!TIFFRGBAImageOK(tif, emsg) 
-	|| !TIFFRGBAImageBegin(&img, tif, stop_on_error, emsg)) {
-	    TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "%s", emsg);
-	    return( 0 );
+
+    if (!TIFFRGBAImageOK(tif, emsg) ||
+        !TIFFRGBAImageBegin(&img, tif, stop_on_error, emsg))
+    {
+        TIFFErrorExtR(tif, TIFFFileName(tif), "%s", emsg);
+        return (0);
     }
 
     /*
@@ -3025,12 +3308,12 @@ TIFFReadRGBATileExt(TIFF* tif, uint32 col, uint32 row, uint32 * raster, int stop
      * a full tile configuration afterwards.
      */
 
-    if( row + tile_ysize > img.height )
+    if (row + tile_ysize > img.height)
         read_ysize = img.height - row;
     else
         read_ysize = tile_ysize;
-    
-    if( col + tile_xsize > img.width )
+
+    if (col + tile_xsize > img.width)
         read_xsize = img.width - col;
     else
         read_xsize = tile_xsize;
@@ -3038,12 +3321,12 @@ TIFFReadRGBATileExt(TIFF* tif, uint32 col, uint32 row, uint32 * raster, int stop
     /*
      * Read the chunk of imagery.
      */
-    
+
     img.row_offset = row;
     img.col_offset = col;
 
-    ok = TIFFRGBAImageGet(&img, raster, read_xsize, read_ysize );
-        
+    ok = TIFFRGBAImageGet(&img, raster, read_xsize, read_ysize);
+
     TIFFRGBAImageEnd(&img);
 
     /*
@@ -3051,33 +3334,27 @@ TIFFReadRGBATileExt(TIFF* tif, uint32 col, uint32 row, uint32 * raster, int stop
      * shifting the data around as if a full tile of data is being returned.
      *
      * This is all the more complicated because the image is organized in
-     * bottom to top format. 
+     * bottom to top format.
      */
 
-    if( read_xsize == tile_xsize && read_ysize == tile_ysize )
-        return( ok );
+    if (read_xsize == tile_xsize && read_ysize == tile_ysize)
+        return (ok);
 
-    for( i_row = 0; i_row < read_ysize; i_row++ ) {
-        memmove( raster + (tile_ysize - i_row - 1) * tile_xsize,
-                 raster + (read_ysize - i_row - 1) * read_xsize,
-                 read_xsize * sizeof(uint32) );
-        _TIFFmemset( raster + (tile_ysize - i_row - 1) * tile_xsize+read_xsize,
-                     0, sizeof(uint32) * (tile_xsize - read_xsize) );
+    for (i_row = 0; i_row < read_ysize; i_row++)
+    {
+        memmove(raster + (size_t)(tile_ysize - i_row - 1) * tile_xsize,
+                raster + (size_t)(read_ysize - i_row - 1) * read_xsize,
+                read_xsize * sizeof(uint32_t));
+        _TIFFmemset(raster + (size_t)(tile_ysize - i_row - 1) * tile_xsize +
+                        read_xsize,
+                    0, sizeof(uint32_t) * (tile_xsize - read_xsize));
     }
 
-    for( i_row = read_ysize; i_row < tile_ysize; i_row++ ) {
-        _TIFFmemset( raster + (tile_ysize - i_row - 1) * tile_xsize,
-                     0, sizeof(uint32) * tile_xsize );
+    for (i_row = read_ysize; i_row < tile_ysize; i_row++)
+    {
+        _TIFFmemset(raster + (size_t)(tile_ysize - i_row - 1) * tile_xsize, 0,
+                    sizeof(uint32_t) * tile_xsize);
     }
 
     return (ok);
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_hash_set.c b/3rdparty/libtiff/tif_hash_set.c
new file mode 100644
index 000000000000..9792c63f47d0
--- /dev/null
+++ b/3rdparty/libtiff/tif_hash_set.c
@@ -0,0 +1,603 @@
+/**********************************************************************
+ *
+ * Name:     tif_hash_set.c
+ * Purpose:  Hash set functions.
+ * Author:   Even Rouault, <even dot rouault at spatialys.com>
+ *
+ **********************************************************************
+ * Copyright (c) 2008-2009, Even Rouault <even dot rouault at spatialys.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ ****************************************************************************/
+
+#include "tif_config.h"
+
+#include "tif_hash_set.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/** List element structure. */
+typedef struct _TIFFList TIFFList;
+
+/** List element structure. */
+struct _TIFFList
+{
+    /*! Pointer to the data object. Should be allocated and freed by the
+     * caller.
+     * */
+    void *pData;
+    /*! Pointer to the next element in list. NULL, if current element is the
+     * last one.
+     */
+    struct _TIFFList *psNext;
+};
+
+struct _TIFFHashSet
+{
+    TIFFHashSetHashFunc fnHashFunc;
+    TIFFHashSetEqualFunc fnEqualFunc;
+    TIFFHashSetFreeEltFunc fnFreeEltFunc;
+    TIFFList **tabList;
+    int nSize;
+    int nIndiceAllocatedSize;
+    int nAllocatedSize;
+    TIFFList *psRecyclingList;
+    int nRecyclingListSize;
+    bool bRehash;
+#ifdef HASH_DEBUG
+    int nCollisions;
+#endif
+};
+
+static const int anPrimes[] = {
+    53,        97,        193,       389,       769,       1543,     3079,
+    6151,      12289,     24593,     49157,     98317,     196613,   393241,
+    786433,    1572869,   3145739,   6291469,   12582917,  25165843, 50331653,
+    100663319, 201326611, 402653189, 805306457, 1610612741};
+
+/************************************************************************/
+/*                    TIFFHashSetHashPointer()                          */
+/************************************************************************/
+
+/**
+ * Hash function for an arbitrary pointer
+ *
+ * @param elt the arbitrary pointer to hash
+ *
+ * @return the hash value of the pointer
+ */
+
+static unsigned long TIFFHashSetHashPointer(const void *elt)
+{
+    return (unsigned long)(uintptr_t)((void *)(elt));
+}
+
+/************************************************************************/
+/*                   TIFFHashSetEqualPointer()                          */
+/************************************************************************/
+
+/**
+ * Equality function for arbitrary pointers
+ *
+ * @param elt1 the first arbitrary pointer to compare
+ * @param elt2 the second arbitrary pointer to compare
+ *
+ * @return true if the pointers are equal
+ */
+
+static bool TIFFHashSetEqualPointer(const void *elt1, const void *elt2)
+{
+    return elt1 == elt2;
+}
+
+/************************************************************************/
+/*                          TIFFHashSetNew()                             */
+/************************************************************************/
+
+/**
+ * Creates a new hash set
+ *
+ * The hash function must return a hash value for the elements to insert.
+ * If fnHashFunc is NULL, TIFFHashSetHashPointer will be used.
+ *
+ * The equal function must return if two elements are equal.
+ * If fnEqualFunc is NULL, TIFFHashSetEqualPointer will be used.
+ *
+ * The free function is used to free elements inserted in the hash set,
+ * when the hash set is destroyed, when elements are removed or replaced.
+ * If fnFreeEltFunc is NULL, elements inserted into the hash set will not be
+ * freed.
+ *
+ * @param fnHashFunc hash function. May be NULL.
+ * @param fnEqualFunc equal function. May be NULL.
+ * @param fnFreeEltFunc element free function. May be NULL.
+ *
+ * @return a new hash set
+ */
+
+TIFFHashSet *TIFFHashSetNew(TIFFHashSetHashFunc fnHashFunc,
+                            TIFFHashSetEqualFunc fnEqualFunc,
+                            TIFFHashSetFreeEltFunc fnFreeEltFunc)
+{
+    TIFFHashSet *set = (TIFFHashSet *)malloc(sizeof(TIFFHashSet));
+    if (set == NULL)
+        return NULL;
+    set->fnHashFunc = fnHashFunc ? fnHashFunc : TIFFHashSetHashPointer;
+    set->fnEqualFunc = fnEqualFunc ? fnEqualFunc : TIFFHashSetEqualPointer;
+    set->fnFreeEltFunc = fnFreeEltFunc;
+    set->nSize = 0;
+    set->tabList = (TIFFList **)(calloc(sizeof(TIFFList *), 53));
+    if (set->tabList == NULL)
+    {
+        free(set);
+        return NULL;
+    }
+    set->nIndiceAllocatedSize = 0;
+    set->nAllocatedSize = 53;
+    set->psRecyclingList = NULL;
+    set->nRecyclingListSize = 0;
+    set->bRehash = false;
+#ifdef HASH_DEBUG
+    set->nCollisions = 0;
+#endif
+    return set;
+}
+
+/************************************************************************/
+/*                          TIFFHashSetSize()                            */
+/************************************************************************/
+
+/**
+ * Returns the number of elements inserted in the hash set
+ *
+ * Note: this is not the internal size of the hash set
+ *
+ * @param set the hash set
+ *
+ * @return the number of elements in the hash set
+ */
+
+int TIFFHashSetSize(const TIFFHashSet *set)
+{
+    assert(set != NULL);
+    return set->nSize;
+}
+
+/************************************************************************/
+/*                       TIFFHashSetGetNewListElt()                      */
+/************************************************************************/
+
+static TIFFList *TIFFHashSetGetNewListElt(TIFFHashSet *set)
+{
+    if (set->psRecyclingList)
+    {
+        TIFFList *psRet = set->psRecyclingList;
+        psRet->pData = NULL;
+        set->nRecyclingListSize--;
+        set->psRecyclingList = psRet->psNext;
+        return psRet;
+    }
+
+    return (TIFFList *)malloc(sizeof(TIFFList));
+}
+
+/************************************************************************/
+/*                       TIFFHashSetReturnListElt()                      */
+/************************************************************************/
+
+static void TIFFHashSetReturnListElt(TIFFHashSet *set, TIFFList *psList)
+{
+    if (set->nRecyclingListSize < 128)
+    {
+        psList->psNext = set->psRecyclingList;
+        set->psRecyclingList = psList;
+        set->nRecyclingListSize++;
+    }
+    else
+    {
+        free(psList);
+    }
+}
+
+/************************************************************************/
+/*                   TIFFHashSetClearInternal()                          */
+/************************************************************************/
+
+static void TIFFHashSetClearInternal(TIFFHashSet *set, bool bFinalize)
+{
+    assert(set != NULL);
+    for (int i = 0; i < set->nAllocatedSize; i++)
+    {
+        TIFFList *cur = set->tabList[i];
+        while (cur)
+        {
+            if (set->fnFreeEltFunc)
+                set->fnFreeEltFunc(cur->pData);
+            TIFFList *psNext = cur->psNext;
+            if (bFinalize)
+                free(cur);
+            else
+                TIFFHashSetReturnListElt(set, cur);
+            cur = psNext;
+        }
+        set->tabList[i] = NULL;
+    }
+    set->bRehash = false;
+}
+
+/************************************************************************/
+/*                         TIFFListDestroy()                            */
+/************************************************************************/
+
+/**
+ * Destroy a list. Caller responsible for freeing data objects contained in
+ * list elements.
+ *
+ * @param psList pointer to list head.
+ *
+ */
+
+static void TIFFListDestroy(TIFFList *psList)
+{
+    TIFFList *psCurrent = psList;
+
+    while (psCurrent)
+    {
+        TIFFList *const psNext = psCurrent->psNext;
+        free(psCurrent);
+        psCurrent = psNext;
+    }
+}
+
+/************************************************************************/
+/*                        TIFFHashSetDestroy()                          */
+/************************************************************************/
+
+/**
+ * Destroys an allocated hash set.
+ *
+ * This function also frees the elements if a free function was
+ * provided at the creation of the hash set.
+ *
+ * @param set the hash set
+ */
+
+void TIFFHashSetDestroy(TIFFHashSet *set)
+{
+    if (set)
+    {
+        TIFFHashSetClearInternal(set, true);
+        free(set->tabList);
+        TIFFListDestroy(set->psRecyclingList);
+        free(set);
+    }
+}
+
+#ifdef notused
+/************************************************************************/
+/*                        TIFFHashSetClear()                             */
+/************************************************************************/
+
+/**
+ * Clear all elements from a hash set.
+ *
+ * This function also frees the elements if a free function was
+ * provided at the creation of the hash set.
+ *
+ * @param set the hash set
+ */
+
+void TIFFHashSetClear(TIFFHashSet *set)
+{
+    TIFFHashSetClearInternal(set, false);
+    set->nIndiceAllocatedSize = 0;
+    set->nAllocatedSize = 53;
+#ifdef HASH_DEBUG
+    set->nCollisions = 0;
+#endif
+    set->nSize = 0;
+}
+
+/************************************************************************/
+/*                       TIFFHashSetForeach()                           */
+/************************************************************************/
+
+/**
+ * Walk through the hash set and runs the provided function on all the
+ * elements
+ *
+ * This function is provided the user_data argument of TIFFHashSetForeach.
+ * It must return true to go on the walk through the hash set, or FALSE to
+ * make it stop.
+ *
+ * Note : the structure of the hash set must *NOT* be modified during the
+ * walk.
+ *
+ * @param set the hash set.
+ * @param fnIterFunc the function called on each element.
+ * @param user_data the user data provided to the function.
+ */
+
+void TIFFHashSetForeach(TIFFHashSet *set, TIFFHashSetIterEltFunc fnIterFunc,
+                        void *user_data)
+{
+    assert(set != NULL);
+    if (!fnIterFunc)
+        return;
+
+    for (int i = 0; i < set->nAllocatedSize; i++)
+    {
+        TIFFList *cur = set->tabList[i];
+        while (cur)
+        {
+            if (!fnIterFunc(cur->pData, user_data))
+                return;
+
+            cur = cur->psNext;
+        }
+    }
+}
+#endif
+
+/************************************************************************/
+/*                        TIFFHashSetRehash()                           */
+/************************************************************************/
+
+static bool TIFFHashSetRehash(TIFFHashSet *set)
+{
+    int nNewAllocatedSize = anPrimes[set->nIndiceAllocatedSize];
+    TIFFList **newTabList =
+        (TIFFList **)(calloc(sizeof(TIFFList *), nNewAllocatedSize));
+    if (newTabList == NULL)
+        return false;
+#ifdef HASH_DEBUG
+    TIFFDebug("TIFFHASH",
+              "hashSet=%p, nSize=%d, nCollisions=%d, "
+              "fCollisionRate=%.02f",
+              set, set->nSize, set->nCollisions,
+              set->nCollisions * 100.0 / set->nSize);
+    set->nCollisions = 0;
+#endif
+    for (int i = 0; i < set->nAllocatedSize; i++)
+    {
+        TIFFList *cur = set->tabList[i];
+        while (cur)
+        {
+            const unsigned long nNewHashVal =
+                set->fnHashFunc(cur->pData) % nNewAllocatedSize;
+#ifdef HASH_DEBUG
+            if (newTabList[nNewHashVal])
+                set->nCollisions++;
+#endif
+            TIFFList *psNext = cur->psNext;
+            cur->psNext = newTabList[nNewHashVal];
+            newTabList[nNewHashVal] = cur;
+            cur = psNext;
+        }
+    }
+    free(set->tabList);
+    set->tabList = newTabList;
+    set->nAllocatedSize = nNewAllocatedSize;
+    set->bRehash = false;
+    return true;
+}
+
+/************************************************************************/
+/*                        TIFFHashSetFindPtr()                          */
+/************************************************************************/
+
+static void **TIFFHashSetFindPtr(TIFFHashSet *set, const void *elt)
+{
+    const unsigned long nHashVal = set->fnHashFunc(elt) % set->nAllocatedSize;
+    TIFFList *cur = set->tabList[nHashVal];
+    while (cur)
+    {
+        if (set->fnEqualFunc(cur->pData, elt))
+            return &cur->pData;
+        cur = cur->psNext;
+    }
+    return NULL;
+}
+
+/************************************************************************/
+/*                         TIFFHashSetInsert()                          */
+/************************************************************************/
+
+/**
+ * Inserts an element into a hash set.
+ *
+ * If the element was already inserted in the hash set, the previous
+ * element is replaced by the new element. If a free function was provided,
+ * it is used to free the previously inserted element
+ *
+ * @param set the hash set
+ * @param elt the new element to insert in the hash set
+ *
+ * @return true if success. If false is returned, elt has not been inserted,
+ * but TIFFHashSetInsert() will have run the free function if provided.
+ */
+
+bool TIFFHashSetInsert(TIFFHashSet *set, void *elt)
+{
+    assert(set != NULL);
+    void **pElt = TIFFHashSetFindPtr(set, elt);
+    if (pElt)
+    {
+        if (set->fnFreeEltFunc)
+            set->fnFreeEltFunc(*pElt);
+
+        *pElt = elt;
+        return true;
+    }
+
+    if (set->nSize >= 2 * set->nAllocatedSize / 3 ||
+        (set->bRehash && set->nIndiceAllocatedSize > 0 &&
+         set->nSize <= set->nAllocatedSize / 2))
+    {
+        set->nIndiceAllocatedSize++;
+        if (!TIFFHashSetRehash(set))
+        {
+            set->nIndiceAllocatedSize--;
+            if (set->fnFreeEltFunc)
+                set->fnFreeEltFunc(elt);
+            return false;
+        }
+    }
+
+    const unsigned long nHashVal = set->fnHashFunc(elt) % set->nAllocatedSize;
+#ifdef HASH_DEBUG
+    if (set->tabList[nHashVal])
+        set->nCollisions++;
+#endif
+
+    TIFFList *new_elt = TIFFHashSetGetNewListElt(set);
+    if (new_elt == NULL)
+    {
+        if (set->fnFreeEltFunc)
+            set->fnFreeEltFunc(elt);
+        return false;
+    }
+    new_elt->pData = elt;
+    new_elt->psNext = set->tabList[nHashVal];
+    set->tabList[nHashVal] = new_elt;
+    set->nSize++;
+
+    return true;
+}
+
+/************************************************************************/
+/*                        TIFFHashSetLookup()                           */
+/************************************************************************/
+
+/**
+ * Returns the element found in the hash set corresponding to the element to
+ * look up The element must not be modified.
+ *
+ * @param set the hash set
+ * @param elt the element to look up in the hash set
+ *
+ * @return the element found in the hash set or NULL
+ */
+
+void *TIFFHashSetLookup(TIFFHashSet *set, const void *elt)
+{
+    assert(set != NULL);
+    void **pElt = TIFFHashSetFindPtr(set, elt);
+    if (pElt)
+        return *pElt;
+
+    return NULL;
+}
+
+/************************************************************************/
+/*                     TIFFHashSetRemoveInternal()                      */
+/************************************************************************/
+
+static bool TIFFHashSetRemoveInternal(TIFFHashSet *set, const void *elt,
+                                      bool bDeferRehash)
+{
+    assert(set != NULL);
+    if (set->nIndiceAllocatedSize > 0 && set->nSize <= set->nAllocatedSize / 2)
+    {
+        set->nIndiceAllocatedSize--;
+        if (bDeferRehash)
+            set->bRehash = true;
+        else
+        {
+            if (!TIFFHashSetRehash(set))
+            {
+                set->nIndiceAllocatedSize++;
+                return false;
+            }
+        }
+    }
+
+    int nHashVal = (int)(set->fnHashFunc(elt) % set->nAllocatedSize);
+    TIFFList *cur = set->tabList[nHashVal];
+    TIFFList *prev = NULL;
+    while (cur)
+    {
+        if (set->fnEqualFunc(cur->pData, elt))
+        {
+            if (prev)
+                prev->psNext = cur->psNext;
+            else
+                set->tabList[nHashVal] = cur->psNext;
+
+            if (set->fnFreeEltFunc)
+                set->fnFreeEltFunc(cur->pData);
+
+            TIFFHashSetReturnListElt(set, cur);
+#ifdef HASH_DEBUG
+            if (set->tabList[nHashVal])
+                set->nCollisions--;
+#endif
+            set->nSize--;
+            return true;
+        }
+        prev = cur;
+        cur = cur->psNext;
+    }
+    return false;
+}
+
+/************************************************************************/
+/*                         TIFFHashSetRemove()                          */
+/************************************************************************/
+
+/**
+ * Removes an element from a hash set
+ *
+ * @param set the hash set
+ * @param elt the new element to remove from the hash set
+ *
+ * @return true if the element was in the hash set
+ */
+
+bool TIFFHashSetRemove(TIFFHashSet *set, const void *elt)
+{
+    return TIFFHashSetRemoveInternal(set, elt, false);
+}
+
+#ifdef notused
+/************************************************************************/
+/*                     TIFFHashSetRemoveDeferRehash()                   */
+/************************************************************************/
+
+/**
+ * Removes an element from a hash set.
+ *
+ * This will defer potential rehashing of the set to later calls to
+ * TIFFHashSetInsert() or TIFFHashSetRemove().
+ *
+ * @param set the hash set
+ * @param elt the new element to remove from the hash set
+ *
+ * @return true if the element was in the hash set
+ */
+
+bool TIFFHashSetRemoveDeferRehash(TIFFHashSet *set, const void *elt)
+{
+    return TIFFHashSetRemoveInternal(set, elt, true);
+}
+#endif
diff --git a/3rdparty/libtiff/tif_hash_set.h b/3rdparty/libtiff/tif_hash_set.h
new file mode 100644
index 000000000000..f60e2c675e53
--- /dev/null
+++ b/3rdparty/libtiff/tif_hash_set.h
@@ -0,0 +1,100 @@
+/**********************************************************************
+ * $Id$
+ *
+ * Name:     tif_hash_set.h
+ * Project:  TIFF - Common Portability Library
+ * Purpose:  Hash set functions.
+ * Author:   Even Rouault, <even dot rouault at spatialys.com>
+ *
+ **********************************************************************
+ * Copyright (c) 2008-2009, Even Rouault <even dot rouault at spatialys.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ ****************************************************************************/
+
+#ifndef TIFF_HASH_SET_H_INCLUDED
+#define TIFF_HASH_SET_H_INCLUDED
+
+#include <stdbool.h>
+
+/**
+ * \file tif_hash_set.h
+ *
+ * Hash set implementation.
+ *
+ * An hash set is a data structure that holds elements that are unique
+ * according to a comparison function. Operations on the hash set, such as
+ * insertion, removal or lookup, are supposed to be fast if an efficient
+ * "hash" function is provided.
+ */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+    /* Types */
+
+    /** Opaque type for a hash set */
+    typedef struct _TIFFHashSet TIFFHashSet;
+
+    /** TIFFHashSetHashFunc */
+    typedef unsigned long (*TIFFHashSetHashFunc)(const void *elt);
+
+    /** TIFFHashSetEqualFunc */
+    typedef bool (*TIFFHashSetEqualFunc)(const void *elt1, const void *elt2);
+
+    /** TIFFHashSetFreeEltFunc */
+    typedef void (*TIFFHashSetFreeEltFunc)(void *elt);
+
+    /* Functions */
+
+    TIFFHashSet *TIFFHashSetNew(TIFFHashSetHashFunc fnHashFunc,
+                                TIFFHashSetEqualFunc fnEqualFunc,
+                                TIFFHashSetFreeEltFunc fnFreeEltFunc);
+
+    void TIFFHashSetDestroy(TIFFHashSet *set);
+
+    int TIFFHashSetSize(const TIFFHashSet *set);
+
+#ifdef notused
+    void TIFFHashSetClear(TIFFHashSet *set);
+
+    /** TIFFHashSetIterEltFunc */
+    typedef int (*TIFFHashSetIterEltFunc)(void *elt, void *user_data);
+
+    void TIFFHashSetForeach(TIFFHashSet *set, TIFFHashSetIterEltFunc fnIterFunc,
+                            void *user_data);
+#endif
+
+    bool TIFFHashSetInsert(TIFFHashSet *set, void *elt);
+
+    void *TIFFHashSetLookup(TIFFHashSet *set, const void *elt);
+
+    bool TIFFHashSetRemove(TIFFHashSet *set, const void *elt);
+
+#ifdef notused
+    bool TIFFHashSetRemoveDeferRehash(TIFFHashSet *set, const void *elt);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* TIFF_HASH_SET_H_INCLUDED */
diff --git a/3rdparty/libtiff/tif_jbig.c b/3rdparty/libtiff/tif_jbig.c
index a3500e0b6f3e..7e455ad1ce14 100644
--- a/3rdparty/libtiff/tif_jbig.c
+++ b/3rdparty/libtiff/tif_jbig.c
@@ -35,199 +35,197 @@
 #ifdef JBIG_SUPPORT
 #include "jbig.h"
 
-static int JBIGSetupDecode(TIFF* tif)
+static int JBIGSetupDecode(TIFF *tif)
 {
-	if (TIFFNumberOfStrips(tif) != 1)
-	{
-		TIFFErrorExt(tif->tif_clientdata, "JBIG", "Multistrip images not supported in decoder");
-		return 0;
-	}
-
-	return 1;
+    if (TIFFNumberOfStrips(tif) != 1)
+    {
+        TIFFErrorExtR(tif, "JBIG",
+                      "Multistrip images not supported in decoder");
+        return 0;
+    }
+
+    return 1;
 }
 
-static int JBIGDecode(TIFF* tif, uint8* buffer, tmsize_t size, uint16 s)
+static int JBIGDecode(TIFF *tif, uint8_t *buffer, tmsize_t size, uint16_t s)
 {
-	struct jbg_dec_state decoder;
-	int decodeStatus = 0;
-	unsigned char* pImage = NULL;
-	unsigned long decodedSize;
-	(void) s;
+    struct jbg_dec_state decoder;
+    int decodeStatus = 0;
+    unsigned char *pImage = NULL;
+    unsigned long decodedSize;
+    (void)s;
 
-	if (isFillOrder(tif, tif->tif_dir.td_fillorder))
-	{
-		TIFFReverseBits(tif->tif_rawcp, tif->tif_rawcc);
-	}
+    if (isFillOrder(tif, tif->tif_dir.td_fillorder))
+    {
+        TIFFReverseBits(tif->tif_rawcp, tif->tif_rawcc);
+    }
 
-	jbg_dec_init(&decoder);
+    jbg_dec_init(&decoder);
 
 #if defined(HAVE_JBG_NEWLEN)
-	jbg_newlen(tif->tif_rawcp, (size_t)tif->tif_rawcc);
-	/*
-	 * I do not check the return status of jbg_newlen because even if this
-	 * function fails it does not necessarily mean that decoding the image
-	 * will fail.  It is generally only needed for received fax images
-	 * that do not contain the actual length of the image in the BIE
-	 * header.  I do not log when an error occurs because that will cause
-	 * problems when converting JBIG encoded TIFF's to
-	 * PostScript.  As long as the actual image length is contained in the
-	 * BIE header jbg_dec_in should succeed.
-	 */
+    jbg_newlen(tif->tif_rawcp, (size_t)tif->tif_rawcc);
+    /*
+     * I do not check the return status of jbg_newlen because even if this
+     * function fails it does not necessarily mean that decoding the image
+     * will fail.  It is generally only needed for received fax images
+     * that do not contain the actual length of the image in the BIE
+     * header.  I do not log when an error occurs because that will cause
+     * problems when converting JBIG encoded TIFF's to
+     * PostScript.  As long as the actual image length is contained in the
+     * BIE header jbg_dec_in should succeed.
+     */
 #endif /* HAVE_JBG_NEWLEN */
 
-	decodeStatus = jbg_dec_in(&decoder, (unsigned char*)tif->tif_rawcp,
-				  (size_t)tif->tif_rawcc, NULL);
-	if (JBG_EOK != decodeStatus)
-	{
-		/*
-		 * XXX: JBG_EN constant was defined in pre-2.0 releases of the
-		 * JBIG-KIT. Since the 2.0 the error reporting functions were
-		 * changed. We will handle both cases here.
-		 */
-		TIFFErrorExt(tif->tif_clientdata,
-			     "JBIG", "Error (%d) decoding: %s",
-			     decodeStatus,
+    decodeStatus = jbg_dec_in(&decoder, (unsigned char *)tif->tif_rawcp,
+                              (size_t)tif->tif_rawcc, NULL);
+    if (JBG_EOK != decodeStatus)
+    {
+        /*
+         * XXX: JBG_EN constant was defined in pre-2.0 releases of the
+         * JBIG-KIT. Since the 2.0 the error reporting functions were
+         * changed. We will handle both cases here.
+         */
+        TIFFErrorExtR(tif, "JBIG", "Error (%d) decoding: %s", decodeStatus,
 #if defined(JBG_EN)
-			     jbg_strerror(decodeStatus, JBG_EN)
+                      jbg_strerror(decodeStatus, JBG_EN)
 #else
-			     jbg_strerror(decodeStatus)
+                      jbg_strerror(decodeStatus)
 #endif
-			     );
-		jbg_dec_free(&decoder);
-		return 0;
-	}
-
-	decodedSize = jbg_dec_getsize(&decoder);
-	if( (tmsize_t)decodedSize < size )
-	{
-	    TIFFWarningExt(tif->tif_clientdata, "JBIG",
-	                   "Only decoded %lu bytes, whereas %lu requested",
-	                   decodedSize, (unsigned long)size);
-	}
-	else if( (tmsize_t)decodedSize > size )
-	{
-	    TIFFErrorExt(tif->tif_clientdata, "JBIG",
-	                 "Decoded %lu bytes, whereas %lu were requested",
-	                 decodedSize, (unsigned long)size);
-	    jbg_dec_free(&decoder);
-	    return 0;
-	}
-	pImage = jbg_dec_getimage(&decoder, 0);
-	_TIFFmemcpy(buffer, pImage, decodedSize);
-	jbg_dec_free(&decoder);
-
-        tif->tif_rawcp += tif->tif_rawcc;
-        tif->tif_rawcc = 0;
-
-	return 1;
+        );
+        jbg_dec_free(&decoder);
+        return 0;
+    }
+
+    decodedSize = jbg_dec_getsize(&decoder);
+    if ((tmsize_t)decodedSize < size)
+    {
+        TIFFWarningExtR(tif, "JBIG",
+                        "Only decoded %lu bytes, whereas %" TIFF_SSIZE_FORMAT
+                        " requested",
+                        decodedSize, size);
+    }
+    else if ((tmsize_t)decodedSize > size)
+    {
+        TIFFErrorExtR(tif, "JBIG",
+                      "Decoded %lu bytes, whereas %" TIFF_SSIZE_FORMAT
+                      " were requested",
+                      decodedSize, size);
+        jbg_dec_free(&decoder);
+        return 0;
+    }
+    pImage = jbg_dec_getimage(&decoder, 0);
+    _TIFFmemcpy(buffer, pImage, decodedSize);
+    jbg_dec_free(&decoder);
+
+    tif->tif_rawcp += tif->tif_rawcc;
+    tif->tif_rawcc = 0;
+
+    return 1;
 }
 
-static int JBIGSetupEncode(TIFF* tif)
+static int JBIGSetupEncode(TIFF *tif)
 {
-	if (TIFFNumberOfStrips(tif) != 1)
-	{
-		TIFFErrorExt(tif->tif_clientdata, "JBIG", "Multistrip images not supported in encoder");
-		return 0;
-	}
-
-	return 1;
+    if (TIFFNumberOfStrips(tif) != 1)
+    {
+        TIFFErrorExtR(tif, "JBIG",
+                      "Multistrip images not supported in encoder");
+        return 0;
+    }
+
+    return 1;
 }
 
-static int JBIGCopyEncodedData(TIFF* tif, unsigned char* pp, size_t cc, uint16 s)
+static int JBIGCopyEncodedData(TIFF *tif, unsigned char *pp, size_t cc,
+                               uint16_t s)
 {
-	(void) s;
-	while (cc > 0)
-	{
-		tmsize_t n = (tmsize_t)cc;
-
-		if (tif->tif_rawcc + n > tif->tif_rawdatasize)
-		{
-			n = tif->tif_rawdatasize - tif->tif_rawcc;
-		}
-
-		assert(n > 0);
-		_TIFFmemcpy(tif->tif_rawcp, pp, n);
-		tif->tif_rawcp += n;
-		tif->tif_rawcc += n;
-		pp += n;
-		cc -= (size_t)n;
-		if (tif->tif_rawcc >= tif->tif_rawdatasize &&
-		    !TIFFFlushData1(tif))
-		{
-			return (-1);
-		}
-	}
-
-	return (1);
+    (void)s;
+    while (cc > 0)
+    {
+        tmsize_t n = (tmsize_t)cc;
+
+        if (tif->tif_rawcc + n > tif->tif_rawdatasize)
+        {
+            n = tif->tif_rawdatasize - tif->tif_rawcc;
+        }
+
+        assert(n > 0);
+        _TIFFmemcpy(tif->tif_rawcp, pp, n);
+        tif->tif_rawcp += n;
+        tif->tif_rawcc += n;
+        pp += n;
+        cc -= (size_t)n;
+        if (tif->tif_rawcc >= tif->tif_rawdatasize && !TIFFFlushData1(tif))
+        {
+            return (-1);
+        }
+    }
+
+    return (1);
 }
 
-static void JBIGOutputBie(unsigned char* buffer, size_t len, void* userData)
+static void JBIGOutputBie(unsigned char *buffer, size_t len, void *userData)
 {
-	TIFF* tif = (TIFF*)userData;
-
-	if (isFillOrder(tif, tif->tif_dir.td_fillorder))
-	{
-		TIFFReverseBits(buffer, (tmsize_t)len);
-	}
+    TIFF *tif = (TIFF *)userData;
 
-	JBIGCopyEncodedData(tif, buffer, len, 0);
-}
+    if (isFillOrder(tif, tif->tif_dir.td_fillorder))
+    {
+        TIFFReverseBits(buffer, (tmsize_t)len);
+    }
 
-static int JBIGEncode(TIFF* tif, uint8* buffer, tmsize_t size, uint16 s)
-{
-	TIFFDirectory* dir = &tif->tif_dir;
-	struct jbg_enc_state encoder;
-
-	(void) size, (void) s;
-
-	jbg_enc_init(&encoder,
-		     dir->td_imagewidth,
-		     dir->td_imagelength,
-		     1,
-		     &buffer,
-		     JBIGOutputBie,
-		     tif);
-	/*
-	 * jbg_enc_out does the "real" encoding.  As data is encoded,
-	 * JBIGOutputBie is called, which writes the data to the directory.
-	 */
-	jbg_enc_out(&encoder);
-	jbg_enc_free(&encoder);
-
-	return 1;
+    JBIGCopyEncodedData(tif, buffer, len, 0);
 }
 
-int TIFFInitJBIG(TIFF* tif, int scheme)
+static int JBIGEncode(TIFF *tif, uint8_t *buffer, tmsize_t size, uint16_t s)
 {
-        (void)scheme;
-	assert(scheme == COMPRESSION_JBIG);
+    TIFFDirectory *dir = &tif->tif_dir;
+    struct jbg_enc_state encoder;
 
-	/*
-	 * These flags are set so the JBIG Codec can control when to reverse
-	 * bits and when not to and to allow the jbig decoder and bit reverser
-	 * to write to memory when necessary.
-	 */
-	tif->tif_flags |= TIFF_NOBITREV;
-	tif->tif_flags &= ~TIFF_MAPPED;
+    (void)size, (void)s;
 
-	/* Setup the function pointers for encode, decode, and cleanup. */
-	tif->tif_setupdecode = JBIGSetupDecode;
-	tif->tif_decodestrip = JBIGDecode;
+    jbg_enc_init(&encoder, dir->td_imagewidth, dir->td_imagelength, 1, &buffer,
+                 JBIGOutputBie, tif);
+    /*
+     * jbg_enc_out does the "real" encoding.  As data is encoded,
+     * JBIGOutputBie is called, which writes the data to the directory.
+     */
+    jbg_enc_out(&encoder);
+    jbg_enc_free(&encoder);
 
-	tif->tif_setupencode = JBIGSetupEncode;
-	tif->tif_encodestrip = JBIGEncode;
+    return 1;
+}
 
-	return 1;
+int TIFFInitJBIG(TIFF *tif, int scheme)
+{
+    (void)scheme;
+    assert(scheme == COMPRESSION_JBIG);
+
+    /*
+     * These flags are set so the JBIG Codec can control when to reverse
+     * bits and when not to and to allow the jbig decoder and bit reverser
+     * to write to memory when necessary.
+     */
+    tif->tif_flags |= TIFF_NOBITREV;
+    tif->tif_flags &= ~TIFF_MAPPED;
+    /* We may have read from a previous IFD and thus set TIFF_BUFFERMMAP and
+     * cleared TIFF_MYBUFFER. It is necessary to restore them to their initial
+     * value to be consistent with the state of a non-memory mapped file.
+     */
+    if (tif->tif_flags & TIFF_BUFFERMMAP)
+    {
+        tif->tif_rawdata = NULL;
+        tif->tif_rawdatasize = 0;
+        tif->tif_flags &= ~TIFF_BUFFERMMAP;
+        tif->tif_flags |= TIFF_MYBUFFER;
+    }
+
+    /* Setup the function pointers for encode, decode, and cleanup. */
+    tif->tif_setupdecode = JBIGSetupDecode;
+    tif->tif_decodestrip = JBIGDecode;
+
+    tif->tif_setupencode = JBIGSetupEncode;
+    tif->tif_encodestrip = JBIGEncode;
+
+    return 1;
 }
 
 #endif /* JBIG_SUPPORT */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_jpeg.c b/3rdparty/libtiff/tif_jpeg.c
index 6711137a92b7..250144f21120 100644
--- a/3rdparty/libtiff/tif_jpeg.c
+++ b/3rdparty/libtiff/tif_jpeg.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1994-1997 Sam Leffler
  * Copyright (c) 1994-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -44,10 +44,34 @@
  */
 #include <setjmp.h>
 
-int TIFFFillStrip(TIFF* tif, uint32 strip);
-int TIFFFillTile(TIFF* tif, uint32 tile);
-int TIFFReInitJPEG_12( TIFF *tif, int scheme, int is_encode );
-int TIFFJPEGIsFullStripRequired_12(TIFF* tif);
+/* Settings that are independent of libjpeg ABI. Used when reinitializing the */
+/* JPEGState from libjpegs 8 bit to libjpeg 12 bits, which have potentially */
+/* different ABI */
+typedef struct
+{
+    TIFFVGetMethod vgetparent;  /* super-class method */
+    TIFFVSetMethod vsetparent;  /* super-class method */
+    TIFFPrintMethod printdir;   /* super-class method */
+    TIFFStripMethod defsparent; /* super-class method */
+    TIFFTileMethod deftparent;  /* super-class method */
+
+    /* pseudo-tag fields */
+    void *jpegtables;           /* JPEGTables tag value, or NULL */
+    uint32_t jpegtables_length; /* number of bytes in same */
+    int jpegquality;            /* Compression quality level */
+    int jpegcolormode;          /* Auto RGB<=>YCbCr convert? */
+    int jpegtablesmode;         /* What to put in JPEGTables */
+
+    int ycbcrsampling_fetched;
+    int max_allowed_scan_number;
+    int has_warned_about_progressive_mode;
+} JPEGOtherSettings;
+
+int TIFFFillStrip(TIFF *tif, uint32_t strip);
+int TIFFFillTile(TIFF *tif, uint32_t tile);
+int TIFFReInitJPEG_12(TIFF *tif, const JPEGOtherSettings *otherSettings,
+                      int scheme, int is_encode);
+int TIFFJPEGIsFullStripRequired_12(TIFF *tif);
 
 /* We undefine FAR to avoid conflict with JPEG definition */
 
@@ -62,7 +86,7 @@ int TIFFJPEGIsFullStripRequired_12(TIFF* tif);
   a conflicting typedef given the headers which are included.
 */
 #if defined(__BORLANDC__) || defined(__MINGW32__)
-# define XMD_H 1
+#define XMD_H 1
 #endif
 
 /*
@@ -80,24 +104,63 @@ int TIFFJPEGIsFullStripRequired_12(TIFF* tif);
 
 /* Define "boolean" as unsigned char, not int, per Windows custom. */
 #if defined(__WIN32__) && !defined(__MINGW32__)
-# ifndef __RPCNDR_H__            /* don't conflict if rpcndr.h already read */
-   typedef unsigned char boolean;
-# endif
-# define HAVE_BOOLEAN            /* prevent jmorecfg.h from redefining it */
+#ifndef __RPCNDR_H__ /* don't conflict if rpcndr.h already read */
+typedef unsigned char boolean;
+#endif
+#define HAVE_BOOLEAN /* prevent jmorecfg.h from redefining it */
 #endif
 
-#include "jpeglib.h"
 #include "jerror.h"
+#include "jpeglib.h"
+
+/* Do optional compile-time version check */
+#if defined(EXPECTED_JPEG_LIB_VERSION) && !defined(LIBJPEG_12_PATH)
+#if EXPECTED_JPEG_LIB_VERSION != JPEG_LIB_VERSION
+#error EXPECTED_JPEG_LIB_VERSION != JPEG_LIB_VERSION
+#endif
+#endif
 
-/* 
+/*
  * Do we want to do special processing suitable for when JSAMPLE is a
- * 16bit value?  
+ * 16bit value?
+ */
+
+/* HAVE_JPEGTURBO_DUAL_MODE_8_12 is defined for libjpeg-turbo >= 2.2 which
+ * adds a dual-mode 8/12 bit API in the same library.
+ */
+
+#if defined(HAVE_JPEGTURBO_DUAL_MODE_8_12)
+#define JPEG_DUAL_MODE_8_12
+/* Start by undefining BITS_IN_JSAMPLE which is always set to 8 in libjpeg-turbo
+ * >= 2.2 Cf
+ * https://github.com/libjpeg-turbo/libjpeg-turbo/commit/8b9bc4b9635a2a047fb23ebe70c9acd728d3f99b
  */
+#undef BITS_IN_JSAMPLE
+/* libjpeg-turbo >= 2.2 adds J12xxxx datatypes for the 12-bit mode. */
+#if defined(FROM_TIF_JPEG_12)
+#define BITS_IN_JSAMPLE 12
+#define TIFF_JSAMPLE J12SAMPLE
+#define TIFF_JSAMPARRAY J12SAMPARRAY
+#define TIFF_JSAMPIMAGE J12SAMPIMAGE
+#define TIFF_JSAMPROW J12SAMPROW
+#else
+#define BITS_IN_JSAMPLE 8
+#define TIFF_JSAMPLE JSAMPLE
+#define TIFF_JSAMPARRAY JSAMPARRAY
+#define TIFF_JSAMPIMAGE JSAMPIMAGE
+#define TIFF_JSAMPROW JSAMPROW
+#endif
+#else
+#define TIFF_JSAMPLE JSAMPLE
+#define TIFF_JSAMPARRAY JSAMPARRAY
+#define TIFF_JSAMPIMAGE JSAMPIMAGE
+#define TIFF_JSAMPROW JSAMPROW
+#endif
 
 #if defined(JPEG_LIB_MK1)
-#  define JPEG_LIB_MK1_OR_12BIT 1
+#define JPEG_LIB_MK1_OR_12BIT 1
 #elif BITS_IN_JSAMPLE == 12
-#  define JPEG_LIB_MK1_OR_12BIT 1
+#define JPEG_LIB_MK1_OR_12BIT 1
 #endif
 
 /*
@@ -115,9 +178,9 @@ int TIFFJPEGIsFullStripRequired_12(TIFF* tif);
  * On some machines it may be worthwhile to use _setjmp or sigsetjmp
  * in place of plain setjmp.  These macros will make it easier.
  */
-#define SETJMP(jbuf)		setjmp(jbuf)
-#define LONGJMP(jbuf,code)	longjmp(jbuf,code)
-#define JMP_BUF			jmp_buf
+#define SETJMP(jbuf) setjmp(jbuf)
+#define LONGJMP(jbuf, code) longjmp(jbuf, code)
+#define JMP_BUF jmp_buf
 
 typedef struct jpeg_destination_mgr jpeg_destination_mgr;
 typedef struct jpeg_source_mgr jpeg_source_mgr;
@@ -136,68 +199,60 @@ typedef struct jpeg_error_mgr jpeg_error_mgr;
  *     so we can safely cast JPEGState* -> jpeg_xxx_struct*
  *     and vice versa!
  */
-typedef struct {
-	union {
-		struct jpeg_compress_struct c;
-		struct jpeg_decompress_struct d;
-		struct jpeg_common_struct comm;
-	} cinfo;			/* NB: must be first */
-	int             cinfo_initialized;
-
-	jpeg_error_mgr	err;		/* libjpeg error manager */
-	JMP_BUF		exit_jmpbuf;	/* for catching libjpeg failures */
-	
-	struct jpeg_progress_mgr progress;
-	/*
-	 * The following two members could be a union, but
-	 * they're small enough that it's not worth the effort.
-	 */
-	jpeg_destination_mgr dest;	/* data dest for compression */
-	jpeg_source_mgr	src;		/* data source for decompression */
-					/* private state */
-	TIFF*		tif;		/* back link needed by some code */
-	uint16		photometric;	/* copy of PhotometricInterpretation */
-	uint16		h_sampling;	/* luminance sampling factors */
-	uint16		v_sampling;
-	tmsize_t   	bytesperline;	/* decompressed bytes per scanline */
-	/* pointers to intermediate buffers when processing downsampled data */
-	JSAMPARRAY	ds_buffer[MAX_COMPONENTS];
-	int		scancount;	/* number of "scanlines" accumulated */
-	int		samplesperclump;
-
-	TIFFVGetMethod	vgetparent;	/* super-class method */
-	TIFFVSetMethod	vsetparent;	/* super-class method */
-	TIFFPrintMethod printdir;	/* super-class method */
-	TIFFStripMethod	defsparent;	/* super-class method */
-	TIFFTileMethod	deftparent;	/* super-class method */
-					/* pseudo-tag fields */
-	void*		jpegtables;	/* JPEGTables tag value, or NULL */
-	uint32		jpegtables_length; /* number of bytes in same */
-	int		jpegquality;	/* Compression quality level */
-	int		jpegcolormode;	/* Auto RGB<=>YCbCr convert? */
-	int		jpegtablesmode;	/* What to put in JPEGTables */
-
-        int             ycbcrsampling_fetched;
-        int             max_allowed_scan_number;
+typedef struct
+{
+    union
+    {
+        struct jpeg_compress_struct c;
+        struct jpeg_decompress_struct d;
+        struct jpeg_common_struct comm;
+    } cinfo; /* NB: must be first */
+    int cinfo_initialized;
+
+    jpeg_error_mgr err;  /* libjpeg error manager */
+    JMP_BUF exit_jmpbuf; /* for catching libjpeg failures */
+
+    struct jpeg_progress_mgr progress;
+    /*
+     * The following two members could be a union, but
+     * they're small enough that it's not worth the effort.
+     */
+    jpeg_destination_mgr dest; /* data dest for compression */
+    jpeg_source_mgr src;       /* data source for decompression */
+                               /* private state */
+    TIFF *tif;                 /* back link needed by some code */
+    uint16_t photometric;      /* copy of PhotometricInterpretation */
+    uint16_t h_sampling;       /* luminance sampling factors */
+    uint16_t v_sampling;
+    tmsize_t bytesperline; /* decompressed bytes per scanline */
+    /* pointers to intermediate buffers when processing downsampled data */
+    TIFF_JSAMPARRAY ds_buffer[MAX_COMPONENTS];
+    int scancount; /* number of "scanlines" accumulated */
+    int samplesperclump;
+
+    JPEGOtherSettings otherSettings;
 } JPEGState;
 
-#define	JState(tif)	((JPEGState*)(tif)->tif_data)
+#define JState(tif) ((JPEGState *)(tif)->tif_data)
 
-static int JPEGDecode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s);
-static int JPEGDecodeRaw(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s);
-static int JPEGEncode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s);
-static int JPEGEncodeRaw(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s);
-static int JPEGInitializeLibJPEG(TIFF * tif, int decode );
-static int DecodeRowError(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s);
+static int JPEGDecode(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s);
+static int JPEGDecodeRaw(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s);
+static int JPEGEncode(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s);
+static int JPEGEncodeRaw(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s);
+static int JPEGInitializeLibJPEG(TIFF *tif, int decode);
+static int DecodeRowError(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s);
 
-#define	FIELD_JPEGTABLES	(FIELD_CODEC+0)
+#define FIELD_JPEGTABLES (FIELD_CODEC + 0)
 
 static const TIFFField jpegFields[] = {
-    { TIFFTAG_JPEGTABLES, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8, TIFF_SETGET_C32_UINT8, FIELD_JPEGTABLES, FALSE, TRUE, "JPEGTables", NULL },
-    { TIFFTAG_JPEGQUALITY, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "", NULL },
-    { TIFFTAG_JPEGCOLORMODE, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE, "", NULL },
-    { TIFFTAG_JPEGTABLESMODE, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE, "", NULL }
-};
+    {TIFFTAG_JPEGTABLES, -3, -3, TIFF_UNDEFINED, 0, TIFF_SETGET_C32_UINT8,
+     TIFF_SETGET_C32_UINT8, FIELD_JPEGTABLES, FALSE, TRUE, "JPEGTables", NULL},
+    {TIFFTAG_JPEGQUALITY, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "", NULL},
+    {TIFFTAG_JPEGCOLORMODE, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE, "", NULL},
+    {TIFFTAG_JPEGTABLESMODE, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE, "", NULL}};
 
 /*
  * libjpeg interface layer.
@@ -213,16 +268,16 @@ static const TIFFField jpegFields[] = {
  * IJG routines from jerror.c).  These are used for both
  * compression and decompression.
  */
-static void
-TIFFjpeg_error_exit(j_common_ptr cinfo)
+static void TIFFjpeg_error_exit(j_common_ptr cinfo)
 {
-	JPEGState *sp = (JPEGState *) cinfo;	/* NB: cinfo assumed first */
-	char buffer[JMSG_LENGTH_MAX];
+    JPEGState *sp = (JPEGState *)cinfo; /* NB: cinfo assumed first */
+    char buffer[JMSG_LENGTH_MAX];
 
-	(*cinfo->err->format_message) (cinfo, buffer);
-	TIFFErrorExt(sp->tif->tif_clientdata, "JPEGLib", "%s", buffer);		/* display the error message */
-	jpeg_abort(cinfo);			/* clean up libjpeg state */
-	LONGJMP(sp->exit_jmpbuf, 1);		/* return to libtiff caller */
+    (*cinfo->err->format_message)(cinfo, buffer);
+    TIFFErrorExtR(sp->tif, "JPEGLib", "%s",
+                  buffer);       /* display the error message */
+    jpeg_abort(cinfo);           /* clean up libjpeg state */
+    LONGJMP(sp->exit_jmpbuf, 1); /* return to libtiff caller */
 }
 
 /*
@@ -230,203 +285,216 @@ TIFFjpeg_error_exit(j_common_ptr cinfo)
  * since error_exit does its own thing and trace_level
  * is never set > 0.
  */
-static void
-TIFFjpeg_output_message(j_common_ptr cinfo)
+static void TIFFjpeg_output_message(j_common_ptr cinfo)
 {
-	char buffer[JMSG_LENGTH_MAX];
+    char buffer[JMSG_LENGTH_MAX];
 
-	(*cinfo->err->format_message) (cinfo, buffer);
-	TIFFWarningExt(((JPEGState *) cinfo)->tif->tif_clientdata, "JPEGLib", "%s", buffer);
+    (*cinfo->err->format_message)(cinfo, buffer);
+    TIFFWarningExtR(((JPEGState *)cinfo)->tif, "JPEGLib", "%s", buffer);
 }
 
 /* Avoid the risk of denial-of-service on crafted JPEGs with an insane */
 /* number of scans. */
-/* See http://www.libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf */
-static void
-TIFFjpeg_progress_monitor(j_common_ptr cinfo)
+/* See
+ * http://www.libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf
+ */
+static void TIFFjpeg_progress_monitor(j_common_ptr cinfo)
 {
-    JPEGState *sp = (JPEGState *) cinfo;	/* NB: cinfo assumed first */
+    JPEGState *sp = (JPEGState *)cinfo; /* NB: cinfo assumed first */
     if (cinfo->is_decompressor)
     {
-        const int scan_no =
-            ((j_decompress_ptr)cinfo)->input_scan_number;
-        if (scan_no >= sp->max_allowed_scan_number)
+        const int scan_no = ((j_decompress_ptr)cinfo)->input_scan_number;
+        if (scan_no >= sp->otherSettings.max_allowed_scan_number)
         {
-            TIFFErrorExt(((JPEGState *) cinfo)->tif->tif_clientdata, 
-                     "TIFFjpeg_progress_monitor",
-                     "Scan number %d exceeds maximum scans (%d). This limit "
-                     "can be raised through the LIBTIFF_JPEG_MAX_ALLOWED_SCAN_NUMBER "
-                     "environment variable.",
-                     scan_no, sp->max_allowed_scan_number);
-
-            jpeg_abort(cinfo);			/* clean up libjpeg state */
-            LONGJMP(sp->exit_jmpbuf, 1);		/* return to libtiff caller */
+            TIFFErrorExtR(
+                ((JPEGState *)cinfo)->tif, "TIFFjpeg_progress_monitor",
+                "Scan number %d exceeds maximum scans (%d). This limit "
+                "can be raised through the "
+                "LIBTIFF_JPEG_MAX_ALLOWED_SCAN_NUMBER "
+                "environment variable.",
+                scan_no, sp->otherSettings.max_allowed_scan_number);
+
+            jpeg_abort(cinfo);           /* clean up libjpeg state */
+            LONGJMP(sp->exit_jmpbuf, 1); /* return to libtiff caller */
         }
     }
 }
 
-
 /*
  * Interface routines.  This layer of routines exists
  * primarily to limit side-effects from using setjmp.
  * Also, normal/error returns are converted into return
  * values per libtiff practice.
  */
-#define	CALLJPEG(sp, fail, op)	(SETJMP((sp)->exit_jmpbuf) ? (fail) : (op))
-#define	CALLVJPEG(sp, op)	CALLJPEG(sp, 0, ((op),1))
+#define CALLJPEG(sp, fail, op) (SETJMP((sp)->exit_jmpbuf) ? (fail) : (op))
+#define CALLVJPEG(sp, op) CALLJPEG(sp, 0, ((op), 1))
 
-static int
-TIFFjpeg_create_compress(JPEGState* sp)
+static int TIFFjpeg_create_compress(JPEGState *sp)
 {
-	/* initialize JPEG error handling */
-	sp->cinfo.c.err = jpeg_std_error(&sp->err);
-	sp->err.error_exit = TIFFjpeg_error_exit;
-	sp->err.output_message = TIFFjpeg_output_message;
+    /* initialize JPEG error handling */
+    sp->cinfo.c.err = jpeg_std_error(&sp->err);
+    sp->err.error_exit = TIFFjpeg_error_exit;
+    sp->err.output_message = TIFFjpeg_output_message;
 
-	/* set client_data to avoid UMR warning from tools like Purify */
-	sp->cinfo.c.client_data = NULL;
+    /* set client_data to avoid UMR warning from tools like Purify */
+    sp->cinfo.c.client_data = NULL;
 
-	return CALLVJPEG(sp, jpeg_create_compress(&sp->cinfo.c));
+    return CALLVJPEG(sp, jpeg_create_compress(&sp->cinfo.c));
 }
 
-static int
-TIFFjpeg_create_decompress(JPEGState* sp)
+static int TIFFjpeg_create_decompress(JPEGState *sp)
 {
-	/* initialize JPEG error handling */
-	sp->cinfo.d.err = jpeg_std_error(&sp->err);
-	sp->err.error_exit = TIFFjpeg_error_exit;
-	sp->err.output_message = TIFFjpeg_output_message;
+    /* initialize JPEG error handling */
+    sp->cinfo.d.err = jpeg_std_error(&sp->err);
+    sp->err.error_exit = TIFFjpeg_error_exit;
+    sp->err.output_message = TIFFjpeg_output_message;
 
-	/* set client_data to avoid UMR warning from tools like Purify */
-	sp->cinfo.d.client_data = NULL;
+    /* set client_data to avoid UMR warning from tools like Purify */
+    sp->cinfo.d.client_data = NULL;
 
-	return CALLVJPEG(sp, jpeg_create_decompress(&sp->cinfo.d));
+    return CALLVJPEG(sp, jpeg_create_decompress(&sp->cinfo.d));
 }
 
-static int
-TIFFjpeg_set_defaults(JPEGState* sp)
+static int TIFFjpeg_set_defaults(JPEGState *sp)
 {
-	return CALLVJPEG(sp, jpeg_set_defaults(&sp->cinfo.c));
+    return CALLVJPEG(sp, jpeg_set_defaults(&sp->cinfo.c));
 }
 
-static int
-TIFFjpeg_set_colorspace(JPEGState* sp, J_COLOR_SPACE colorspace)
+static int TIFFjpeg_set_colorspace(JPEGState *sp, J_COLOR_SPACE colorspace)
 {
-	return CALLVJPEG(sp, jpeg_set_colorspace(&sp->cinfo.c, colorspace));
+    return CALLVJPEG(sp, jpeg_set_colorspace(&sp->cinfo.c, colorspace));
 }
 
-static int
-TIFFjpeg_set_quality(JPEGState* sp, int quality, boolean force_baseline)
+static int TIFFjpeg_set_quality(JPEGState *sp, int quality,
+                                boolean force_baseline)
 {
-	return CALLVJPEG(sp,
-	    jpeg_set_quality(&sp->cinfo.c, quality, force_baseline));
+    return CALLVJPEG(sp,
+                     jpeg_set_quality(&sp->cinfo.c, quality, force_baseline));
 }
 
-static int
-TIFFjpeg_suppress_tables(JPEGState* sp, boolean suppress)
+static int TIFFjpeg_suppress_tables(JPEGState *sp, boolean suppress)
 {
-	return CALLVJPEG(sp, jpeg_suppress_tables(&sp->cinfo.c, suppress));
+    return CALLVJPEG(sp, jpeg_suppress_tables(&sp->cinfo.c, suppress));
 }
 
-static int
-TIFFjpeg_start_compress(JPEGState* sp, boolean write_all_tables)
+static int TIFFjpeg_start_compress(JPEGState *sp, boolean write_all_tables)
 {
-	return CALLVJPEG(sp,
-	    jpeg_start_compress(&sp->cinfo.c, write_all_tables));
+    return CALLVJPEG(sp, jpeg_start_compress(&sp->cinfo.c, write_all_tables));
 }
 
-static int
-TIFFjpeg_write_scanlines(JPEGState* sp, JSAMPARRAY scanlines, int num_lines)
+static int TIFFjpeg_write_scanlines(JPEGState *sp, TIFF_JSAMPARRAY scanlines,
+                                    int num_lines)
 {
-	return CALLJPEG(sp, -1, (int) jpeg_write_scanlines(&sp->cinfo.c,
-	    scanlines, (JDIMENSION) num_lines));
+#if defined(HAVE_JPEGTURBO_DUAL_MODE_8_12) && BITS_IN_JSAMPLE == 12
+    return CALLJPEG(sp, -1,
+                    (int)jpeg12_write_scanlines(&sp->cinfo.c, scanlines,
+                                                (JDIMENSION)num_lines));
+#else
+    return CALLJPEG(sp, -1,
+                    (int)jpeg_write_scanlines(&sp->cinfo.c, scanlines,
+                                              (JDIMENSION)num_lines));
+#endif
 }
 
-static int
-TIFFjpeg_write_raw_data(JPEGState* sp, JSAMPIMAGE data, int num_lines)
+static int TIFFjpeg_write_raw_data(JPEGState *sp, TIFF_JSAMPIMAGE data,
+                                   int num_lines)
 {
-	return CALLJPEG(sp, -1, (int) jpeg_write_raw_data(&sp->cinfo.c,
-	    data, (JDIMENSION) num_lines));
+#if defined(HAVE_JPEGTURBO_DUAL_MODE_8_12) && BITS_IN_JSAMPLE == 12
+    return CALLJPEG(
+        sp, -1,
+        (int)jpeg12_write_raw_data(&sp->cinfo.c, data, (JDIMENSION)num_lines));
+#else
+    return CALLJPEG(
+        sp, -1,
+        (int)jpeg_write_raw_data(&sp->cinfo.c, data, (JDIMENSION)num_lines));
+#endif
 }
 
-static int
-TIFFjpeg_finish_compress(JPEGState* sp)
+static int TIFFjpeg_finish_compress(JPEGState *sp)
 {
-	return CALLVJPEG(sp, jpeg_finish_compress(&sp->cinfo.c));
+    return CALLVJPEG(sp, jpeg_finish_compress(&sp->cinfo.c));
 }
 
-static int
-TIFFjpeg_write_tables(JPEGState* sp)
+static int TIFFjpeg_write_tables(JPEGState *sp)
 {
-	return CALLVJPEG(sp, jpeg_write_tables(&sp->cinfo.c));
+    return CALLVJPEG(sp, jpeg_write_tables(&sp->cinfo.c));
 }
 
-static int
-TIFFjpeg_read_header(JPEGState* sp, boolean require_image)
+static int TIFFjpeg_read_header(JPEGState *sp, boolean require_image)
 {
-	return CALLJPEG(sp, -1, jpeg_read_header(&sp->cinfo.d, require_image));
+    return CALLJPEG(sp, -1, jpeg_read_header(&sp->cinfo.d, require_image));
 }
 
-static int
-TIFFjpeg_has_multiple_scans(JPEGState* sp)
+static int TIFFjpeg_has_multiple_scans(JPEGState *sp)
 {
-	return CALLJPEG(sp, 0, jpeg_has_multiple_scans(&sp->cinfo.d));
+    return CALLJPEG(sp, 0, jpeg_has_multiple_scans(&sp->cinfo.d));
 }
 
-static int
-TIFFjpeg_start_decompress(JPEGState* sp)
+static int TIFFjpeg_start_decompress(JPEGState *sp)
 {
-        const char* sz_max_allowed_scan_number;
-        /* progress monitor */
-        sp->cinfo.d.progress = &sp->progress;
-        sp->progress.progress_monitor = TIFFjpeg_progress_monitor;
-        sp->max_allowed_scan_number = 100;
-        sz_max_allowed_scan_number = getenv("LIBTIFF_JPEG_MAX_ALLOWED_SCAN_NUMBER");
-        if( sz_max_allowed_scan_number )
-            sp->max_allowed_scan_number = atoi(sz_max_allowed_scan_number);
+    const char *sz_max_allowed_scan_number;
+    /* progress monitor */
+    sp->cinfo.d.progress = &sp->progress;
+    sp->progress.progress_monitor = TIFFjpeg_progress_monitor;
+    sp->otherSettings.max_allowed_scan_number = 100;
+    sz_max_allowed_scan_number = getenv("LIBTIFF_JPEG_MAX_ALLOWED_SCAN_NUMBER");
+    if (sz_max_allowed_scan_number)
+        sp->otherSettings.max_allowed_scan_number =
+            atoi(sz_max_allowed_scan_number);
 
-	return CALLVJPEG(sp, jpeg_start_decompress(&sp->cinfo.d));
+    return CALLVJPEG(sp, jpeg_start_decompress(&sp->cinfo.d));
 }
 
-static int
-TIFFjpeg_read_scanlines(JPEGState* sp, JSAMPARRAY scanlines, int max_lines)
+static int TIFFjpeg_read_scanlines(JPEGState *sp, TIFF_JSAMPARRAY scanlines,
+                                   int max_lines)
 {
-	return CALLJPEG(sp, -1, (int) jpeg_read_scanlines(&sp->cinfo.d,
-	    scanlines, (JDIMENSION) max_lines));
+#if defined(HAVE_JPEGTURBO_DUAL_MODE_8_12) && BITS_IN_JSAMPLE == 12
+    return CALLJPEG(sp, -1,
+                    (int)jpeg12_read_scanlines(&sp->cinfo.d, scanlines,
+                                               (JDIMENSION)max_lines));
+#else
+    return CALLJPEG(sp, -1,
+                    (int)jpeg_read_scanlines(&sp->cinfo.d, scanlines,
+                                             (JDIMENSION)max_lines));
+#endif
 }
 
-static int
-TIFFjpeg_read_raw_data(JPEGState* sp, JSAMPIMAGE data, int max_lines)
+static int TIFFjpeg_read_raw_data(JPEGState *sp, TIFF_JSAMPIMAGE data,
+                                  int max_lines)
 {
-	return CALLJPEG(sp, -1, (int) jpeg_read_raw_data(&sp->cinfo.d,
-	    data, (JDIMENSION) max_lines));
+#if defined(HAVE_JPEGTURBO_DUAL_MODE_8_12) && BITS_IN_JSAMPLE == 12
+    return CALLJPEG(
+        sp, -1,
+        (int)jpeg12_read_raw_data(&sp->cinfo.d, data, (JDIMENSION)max_lines));
+#else
+    return CALLJPEG(
+        sp, -1,
+        (int)jpeg_read_raw_data(&sp->cinfo.d, data, (JDIMENSION)max_lines));
+#endif
 }
 
-static int
-TIFFjpeg_finish_decompress(JPEGState* sp)
+static int TIFFjpeg_finish_decompress(JPEGState *sp)
 {
-	return CALLJPEG(sp, -1, (int) jpeg_finish_decompress(&sp->cinfo.d));
+    return CALLJPEG(sp, -1, (int)jpeg_finish_decompress(&sp->cinfo.d));
 }
 
-static int
-TIFFjpeg_abort(JPEGState* sp)
+static int TIFFjpeg_abort(JPEGState *sp)
 {
-	return CALLVJPEG(sp, jpeg_abort(&sp->cinfo.comm));
+    return CALLVJPEG(sp, jpeg_abort(&sp->cinfo.comm));
 }
 
-static int
-TIFFjpeg_destroy(JPEGState* sp)
+static int TIFFjpeg_destroy(JPEGState *sp)
 {
-	return CALLVJPEG(sp, jpeg_destroy(&sp->cinfo.comm));
+    return CALLVJPEG(sp, jpeg_destroy(&sp->cinfo.comm));
 }
 
-static JSAMPARRAY
-TIFFjpeg_alloc_sarray(JPEGState* sp, int pool_id,
-		      JDIMENSION samplesperrow, JDIMENSION numrows)
+static JSAMPARRAY TIFFjpeg_alloc_sarray(JPEGState *sp, int pool_id,
+                                        JDIMENSION samplesperrow,
+                                        JDIMENSION numrows)
 {
-	return CALLJPEG(sp, (JSAMPARRAY) NULL,
-	    (*sp->cinfo.comm.mem->alloc_sarray)
-		(&sp->cinfo.comm, pool_id, samplesperrow, numrows));
+    return CALLJPEG(sp, (JSAMPARRAY)NULL,
+                    (*sp->cinfo.comm.mem->alloc_sarray)(
+                        &sp->cinfo.comm, pool_id, samplesperrow, numrows));
 }
 
 /*
@@ -435,130 +503,128 @@ TIFFjpeg_alloc_sarray(JPEGState* sp, int pool_id,
  * libtiff output buffer.
  */
 
-static void
-std_init_destination(j_compress_ptr cinfo)
+static void std_init_destination(j_compress_ptr cinfo)
 {
-	JPEGState* sp = (JPEGState*) cinfo;
-	TIFF* tif = sp->tif;
+    JPEGState *sp = (JPEGState *)cinfo;
+    TIFF *tif = sp->tif;
 
-	sp->dest.next_output_byte = (JOCTET*) tif->tif_rawdata;
-	sp->dest.free_in_buffer = (size_t) tif->tif_rawdatasize;
+    sp->dest.next_output_byte = (JOCTET *)tif->tif_rawdata;
+    sp->dest.free_in_buffer = (size_t)tif->tif_rawdatasize;
 }
 
-static boolean
-std_empty_output_buffer(j_compress_ptr cinfo)
+static boolean std_empty_output_buffer(j_compress_ptr cinfo)
 {
-	JPEGState* sp = (JPEGState*) cinfo;
-	TIFF* tif = sp->tif;
+    JPEGState *sp = (JPEGState *)cinfo;
+    TIFF *tif = sp->tif;
 
-	/* the entire buffer has been filled */
-	tif->tif_rawcc = tif->tif_rawdatasize;
+    /* the entire buffer has been filled */
+    tif->tif_rawcc = tif->tif_rawdatasize;
 
 #ifdef IPPJ_HUFF
-       /*
-        * The Intel IPP performance library does not necessarily fill up
-        * the whole output buffer on each pass, so only dump out the parts
-        * that have been filled.
-        *   http://trac.osgeo.org/gdal/wiki/JpegIPP
-        */
-       if ( sp->dest.free_in_buffer >= 0 ) {
-               tif->tif_rawcc = tif->tif_rawdatasize - sp->dest.free_in_buffer;
-       }
+    /*
+     * The Intel IPP performance library does not necessarily fill up
+     * the whole output buffer on each pass, so only dump out the parts
+     * that have been filled.
+     *   http://trac.osgeo.org/gdal/wiki/JpegIPP
+     */
+    if (sp->dest.free_in_buffer >= 0)
+    {
+        tif->tif_rawcc = tif->tif_rawdatasize - sp->dest.free_in_buffer;
+    }
 #endif
 
-	if( !TIFFFlushData1(tif) )
-            return FALSE;
-	sp->dest.next_output_byte = (JOCTET*) tif->tif_rawdata;
-	sp->dest.free_in_buffer = (size_t) tif->tif_rawdatasize;
+    if (!TIFFFlushData1(tif))
+        return FALSE;
+    sp->dest.next_output_byte = (JOCTET *)tif->tif_rawdata;
+    sp->dest.free_in_buffer = (size_t)tif->tif_rawdatasize;
 
-	return (TRUE);
+    return (TRUE);
 }
 
-static void
-std_term_destination(j_compress_ptr cinfo)
+static void std_term_destination(j_compress_ptr cinfo)
 {
-	JPEGState* sp = (JPEGState*) cinfo;
-	TIFF* tif = sp->tif;
+    JPEGState *sp = (JPEGState *)cinfo;
+    TIFF *tif = sp->tif;
 
-	tif->tif_rawcp = (uint8*) sp->dest.next_output_byte;
-	tif->tif_rawcc =
-	    tif->tif_rawdatasize - (tmsize_t) sp->dest.free_in_buffer;
-	/* NB: libtiff does the final buffer flush */
+    tif->tif_rawcp = (uint8_t *)sp->dest.next_output_byte;
+    tif->tif_rawcc = tif->tif_rawdatasize - (tmsize_t)sp->dest.free_in_buffer;
+    /* NB: libtiff does the final buffer flush */
 }
 
-static void
-TIFFjpeg_data_dest(JPEGState* sp, TIFF* tif)
+static void TIFFjpeg_data_dest(JPEGState *sp, TIFF *tif)
 {
-	(void) tif;
-	sp->cinfo.c.dest = &sp->dest;
-	sp->dest.init_destination = std_init_destination;
-	sp->dest.empty_output_buffer = std_empty_output_buffer;
-	sp->dest.term_destination = std_term_destination;
+    (void)tif;
+    sp->cinfo.c.dest = &sp->dest;
+    sp->dest.init_destination = std_init_destination;
+    sp->dest.empty_output_buffer = std_empty_output_buffer;
+    sp->dest.term_destination = std_term_destination;
 }
 
 /*
  * Alternate destination manager for outputting to JPEGTables field.
  */
 
-static void
-tables_init_destination(j_compress_ptr cinfo)
+static void tables_init_destination(j_compress_ptr cinfo)
 {
-	JPEGState* sp = (JPEGState*) cinfo;
+    JPEGState *sp = (JPEGState *)cinfo;
 
-	/* while building, jpegtables_length is allocated buffer size */
-	sp->dest.next_output_byte = (JOCTET*) sp->jpegtables;
-	sp->dest.free_in_buffer = (size_t) sp->jpegtables_length;
+    /* while building, otherSettings.jpegtables_length is allocated buffer size
+     */
+    sp->dest.next_output_byte = (JOCTET *)sp->otherSettings.jpegtables;
+    sp->dest.free_in_buffer = (size_t)sp->otherSettings.jpegtables_length;
 }
 
-static boolean
-tables_empty_output_buffer(j_compress_ptr cinfo)
+static boolean tables_empty_output_buffer(j_compress_ptr cinfo)
 {
-	JPEGState* sp = (JPEGState*) cinfo;
-	void* newbuf;
+    JPEGState *sp = (JPEGState *)cinfo;
+    void *newbuf;
 
-	/* the entire buffer has been filled; enlarge it by 1000 bytes */
-	newbuf = _TIFFrealloc((void*) sp->jpegtables,
-			      (tmsize_t) (sp->jpegtables_length + 1000));
-	if (newbuf == NULL)
-		ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 100);
-	sp->dest.next_output_byte = (JOCTET*) newbuf + sp->jpegtables_length;
-	sp->dest.free_in_buffer = (size_t) 1000;
-	sp->jpegtables = newbuf;
-	sp->jpegtables_length += 1000;
-	return (TRUE);
+    /* the entire buffer has been filled; enlarge it by 1000 bytes */
+    newbuf =
+        _TIFFreallocExt(sp->tif, (void *)sp->otherSettings.jpegtables,
+                        (tmsize_t)(sp->otherSettings.jpegtables_length + 1000));
+    if (newbuf == NULL)
+        ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 100);
+    sp->dest.next_output_byte =
+        (JOCTET *)newbuf + sp->otherSettings.jpegtables_length;
+    sp->dest.free_in_buffer = (size_t)1000;
+    sp->otherSettings.jpegtables = newbuf;
+    sp->otherSettings.jpegtables_length += 1000;
+    return (TRUE);
 }
 
-static void
-tables_term_destination(j_compress_ptr cinfo)
+static void tables_term_destination(j_compress_ptr cinfo)
 {
-	JPEGState* sp = (JPEGState*) cinfo;
+    JPEGState *sp = (JPEGState *)cinfo;
 
-	/* set tables length to number of bytes actually emitted */
-	sp->jpegtables_length -= (uint32) sp->dest.free_in_buffer;
+    /* set tables length to number of bytes actually emitted */
+    sp->otherSettings.jpegtables_length -= (uint32_t)sp->dest.free_in_buffer;
 }
 
-static int
-TIFFjpeg_tables_dest(JPEGState* sp, TIFF* tif)
-{
-	(void) tif;
-	/*
-	 * Allocate a working buffer for building tables.
-	 * Initial size is 1000 bytes, which is usually adequate.
-	 */
-	if (sp->jpegtables)
-		_TIFFfree(sp->jpegtables);
-	sp->jpegtables_length = 1000;
-	sp->jpegtables = (void*) _TIFFmalloc((tmsize_t) sp->jpegtables_length);
-	if (sp->jpegtables == NULL) {
-		sp->jpegtables_length = 0;
-		TIFFErrorExt(sp->tif->tif_clientdata, "TIFFjpeg_tables_dest", "No space for JPEGTables");
-		return (0);
-	}
-	sp->cinfo.c.dest = &sp->dest;
-	sp->dest.init_destination = tables_init_destination;
-	sp->dest.empty_output_buffer = tables_empty_output_buffer;
-	sp->dest.term_destination = tables_term_destination;
-	return (1);
+static int TIFFjpeg_tables_dest(JPEGState *sp, TIFF *tif)
+{
+    (void)tif;
+    /*
+     * Allocate a working buffer for building tables.
+     * Initial size is 1000 bytes, which is usually adequate.
+     */
+    if (sp->otherSettings.jpegtables)
+        _TIFFfreeExt(tif, sp->otherSettings.jpegtables);
+    sp->otherSettings.jpegtables_length = 1000;
+    sp->otherSettings.jpegtables = (void *)_TIFFmallocExt(
+        tif, (tmsize_t)sp->otherSettings.jpegtables_length);
+    if (sp->otherSettings.jpegtables == NULL)
+    {
+        sp->otherSettings.jpegtables_length = 0;
+        TIFFErrorExtR(sp->tif, "TIFFjpeg_tables_dest",
+                      "No space for JPEGTables");
+        return (0);
+    }
+    sp->cinfo.c.dest = &sp->dest;
+    sp->dest.init_destination = tables_init_destination;
+    sp->dest.empty_output_buffer = tables_empty_output_buffer;
+    sp->dest.term_destination = tables_term_destination;
+    return (1);
 }
 
 /*
@@ -566,86 +632,86 @@ TIFFjpeg_tables_dest(JPEGState* sp, TIFF* tif)
  * These routines supply compressed data to libjpeg.
  */
 
-static void
-std_init_source(j_decompress_ptr cinfo)
+static void std_init_source(j_decompress_ptr cinfo)
 {
-	JPEGState* sp = (JPEGState*) cinfo;
-	TIFF* tif = sp->tif;
+    JPEGState *sp = (JPEGState *)cinfo;
+    TIFF *tif = sp->tif;
 
-	sp->src.next_input_byte = (const JOCTET*) tif->tif_rawdata;
-	sp->src.bytes_in_buffer = (size_t) tif->tif_rawcc;
+    sp->src.next_input_byte = (const JOCTET *)tif->tif_rawdata;
+    sp->src.bytes_in_buffer = (size_t)tif->tif_rawcc;
 }
 
-static boolean
-std_fill_input_buffer(j_decompress_ptr cinfo)
+static boolean std_fill_input_buffer(j_decompress_ptr cinfo)
 {
-	JPEGState* sp = (JPEGState* ) cinfo;
-	static const JOCTET dummy_EOI[2] = { 0xFF, JPEG_EOI };
+    JPEGState *sp = (JPEGState *)cinfo;
+    static const JOCTET dummy_EOI[2] = {0xFF, JPEG_EOI};
 
 #ifdef IPPJ_HUFF
-        /*
-         * The Intel IPP performance library does not necessarily read the whole
-         * input buffer in one pass, so it is possible to get here with data
-         * yet to read. 
-         * 
-         * We just return without doing anything, until the entire buffer has
-         * been read.  
-         * http://trac.osgeo.org/gdal/wiki/JpegIPP
-         */
-        if( sp->src.bytes_in_buffer > 0 ) {
-            return (TRUE);
-        }
+    /*
+     * The Intel IPP performance library does not necessarily read the whole
+     * input buffer in one pass, so it is possible to get here with data
+     * yet to read.
+     *
+     * We just return without doing anything, until the entire buffer has
+     * been read.
+     * http://trac.osgeo.org/gdal/wiki/JpegIPP
+     */
+    if (sp->src.bytes_in_buffer > 0)
+    {
+        return (TRUE);
+    }
 #endif
 
-	/*
-         * Normally the whole strip/tile is read and so we don't need to do
-         * a fill.  In the case of CHUNKY_STRIP_READ_SUPPORT we might not have
-         * all the data, but the rawdata is refreshed between scanlines and
-         * we push this into the io machinery in JPEGDecode(). 	 
-         * http://trac.osgeo.org/gdal/ticket/3894
-	 */
-        
-	WARNMS(cinfo, JWRN_JPEG_EOF);
-	/* insert a fake EOI marker */
-	sp->src.next_input_byte = dummy_EOI;
-	sp->src.bytes_in_buffer = 2;
-	return (TRUE);
+    /*
+     * Normally the whole strip/tile is read and so we don't need to do
+     * a fill.  In the case of CHUNKY_STRIP_READ_SUPPORT we might not have
+     * all the data, but the rawdata is refreshed between scanlines and
+     * we push this into the io machinery in JPEGDecode().
+     * http://trac.osgeo.org/gdal/ticket/3894
+     */
+
+    WARNMS(cinfo, JWRN_JPEG_EOF);
+    /* insert a fake EOI marker */
+    sp->src.next_input_byte = dummy_EOI;
+    sp->src.bytes_in_buffer = 2;
+    return (TRUE);
 }
 
-static void
-std_skip_input_data(j_decompress_ptr cinfo, long num_bytes)
+static void std_skip_input_data(j_decompress_ptr cinfo, long num_bytes)
 {
-	JPEGState* sp = (JPEGState*) cinfo;
+    JPEGState *sp = (JPEGState *)cinfo;
 
-	if (num_bytes > 0) {
-		if ((size_t)num_bytes > sp->src.bytes_in_buffer) {
-			/* oops, buffer overrun */
-			(void) std_fill_input_buffer(cinfo);
-		} else {
-			sp->src.next_input_byte += (size_t) num_bytes;
-			sp->src.bytes_in_buffer -= (size_t) num_bytes;
-		}
-	}
+    if (num_bytes > 0)
+    {
+        if ((size_t)num_bytes > sp->src.bytes_in_buffer)
+        {
+            /* oops, buffer overrun */
+            (void)std_fill_input_buffer(cinfo);
+        }
+        else
+        {
+            sp->src.next_input_byte += (size_t)num_bytes;
+            sp->src.bytes_in_buffer -= (size_t)num_bytes;
+        }
+    }
 }
 
-static void
-std_term_source(j_decompress_ptr cinfo)
+static void std_term_source(j_decompress_ptr cinfo)
 {
-	/* No work necessary here */
-	(void) cinfo;
+    /* No work necessary here */
+    (void)cinfo;
 }
 
-static void
-TIFFjpeg_data_src(JPEGState* sp)
+static void TIFFjpeg_data_src(JPEGState *sp)
 {
-	sp->cinfo.d.src = &sp->src;
-	sp->src.init_source = std_init_source;
-	sp->src.fill_input_buffer = std_fill_input_buffer;
-	sp->src.skip_input_data = std_skip_input_data;
-	sp->src.resync_to_restart = jpeg_resync_to_restart;
-	sp->src.term_source = std_term_source;
-	sp->src.bytes_in_buffer = 0;		/* for safety */
-	sp->src.next_input_byte = NULL;
+    sp->cinfo.d.src = &sp->src;
+    sp->src.init_source = std_init_source;
+    sp->src.fill_input_buffer = std_fill_input_buffer;
+    sp->src.skip_input_data = std_skip_input_data;
+    sp->src.resync_to_restart = jpeg_resync_to_restart;
+    sp->src.term_source = std_term_source;
+    sp->src.bytes_in_buffer = 0; /* for safety */
+    sp->src.next_input_byte = NULL;
 }
 
 /*
@@ -653,20 +719,18 @@ TIFFjpeg_data_src(JPEGState* sp)
  * We can share all the code except for the init routine.
  */
 
-static void
-tables_init_source(j_decompress_ptr cinfo)
+static void tables_init_source(j_decompress_ptr cinfo)
 {
-	JPEGState* sp = (JPEGState*) cinfo;
+    JPEGState *sp = (JPEGState *)cinfo;
 
-	sp->src.next_input_byte = (const JOCTET*) sp->jpegtables;
-	sp->src.bytes_in_buffer = (size_t) sp->jpegtables_length;
+    sp->src.next_input_byte = (const JOCTET *)sp->otherSettings.jpegtables;
+    sp->src.bytes_in_buffer = (size_t)sp->otherSettings.jpegtables_length;
 }
 
-static void
-TIFFjpeg_tables_src(JPEGState* sp)
+static void TIFFjpeg_tables_src(JPEGState *sp)
 {
-	TIFFjpeg_data_src(sp);
-	sp->src.init_source = tables_init_source;
+    TIFFjpeg_data_src(sp);
+    sp->src.init_source = tables_init_source;
 }
 
 /*
@@ -676,31 +740,28 @@ TIFFjpeg_tables_src(JPEGState* sp)
  * when done with strip/tile.
  * This is also a handy place to compute samplesperclump, bytesperline.
  */
-static int
-alloc_downsampled_buffers(TIFF* tif, jpeg_component_info* comp_info,
-			  int num_components)
-{
-	JPEGState* sp = JState(tif);
-	int ci;
-	jpeg_component_info* compptr;
-	JSAMPARRAY buf;
-	int samples_per_clump = 0;
-
-	for (ci = 0, compptr = comp_info; ci < num_components;
-	     ci++, compptr++) {
-		samples_per_clump += compptr->h_samp_factor *
-			compptr->v_samp_factor;
-		buf = TIFFjpeg_alloc_sarray(sp, JPOOL_IMAGE,
-				compptr->width_in_blocks * DCTSIZE,
-				(JDIMENSION) (compptr->v_samp_factor*DCTSIZE));
-		if (buf == NULL)
-			return (0);
-		sp->ds_buffer[ci] = buf;
-	}
-	sp->samplesperclump = samples_per_clump;
-	return (1);
-}
+static int alloc_downsampled_buffers(TIFF *tif, jpeg_component_info *comp_info,
+                                     int num_components)
+{
+    JPEGState *sp = JState(tif);
+    int ci;
+    jpeg_component_info *compptr;
+    TIFF_JSAMPARRAY buf;
+    int samples_per_clump = 0;
 
+    for (ci = 0, compptr = comp_info; ci < num_components; ci++, compptr++)
+    {
+        samples_per_clump += compptr->h_samp_factor * compptr->v_samp_factor;
+        buf = (TIFF_JSAMPARRAY)TIFFjpeg_alloc_sarray(
+            sp, JPOOL_IMAGE, compptr->width_in_blocks * DCTSIZE,
+            (JDIMENSION)(compptr->v_samp_factor * DCTSIZE));
+        if (buf == NULL)
+            return (0);
+        sp->ds_buffer[ci] = buf;
+    }
+    sp->samplesperclump = samples_per_clump;
+    return (1);
+}
 
 /*
  * JPEG Decoding.
@@ -722,331 +783,367 @@ alloc_downsampled_buffers(TIFF* tif, jpeg_component_info* comp_info,
 #define JPEG_MARKER_COM 0xFE
 struct JPEGFixupTagsSubsamplingData
 {
-	TIFF* tif;
-	void* buffer;
-	uint32 buffersize;
-	uint8* buffercurrentbyte;
-	uint32 bufferbytesleft;
-	uint64 fileoffset;
-	uint64 filebytesleft;
-	uint8 filepositioned;
+    TIFF *tif;
+    void *buffer;
+    uint32_t buffersize;
+    uint8_t *buffercurrentbyte;
+    uint32_t bufferbytesleft;
+    uint64_t fileoffset;
+    uint64_t filebytesleft;
+    uint8_t filepositioned;
 };
-static void JPEGFixupTagsSubsampling(TIFF* tif);
-static int JPEGFixupTagsSubsamplingSec(struct JPEGFixupTagsSubsamplingData* data);
-static int JPEGFixupTagsSubsamplingReadByte(struct JPEGFixupTagsSubsamplingData* data, uint8* result);
-static int JPEGFixupTagsSubsamplingReadWord(struct JPEGFixupTagsSubsamplingData* data, uint16* result);
-static void JPEGFixupTagsSubsamplingSkip(struct JPEGFixupTagsSubsamplingData* data, uint16 skiplength);
+static void JPEGFixupTagsSubsampling(TIFF *tif);
+static int
+JPEGFixupTagsSubsamplingSec(struct JPEGFixupTagsSubsamplingData *data);
+static int
+JPEGFixupTagsSubsamplingReadByte(struct JPEGFixupTagsSubsamplingData *data,
+                                 uint8_t *result);
+static int
+JPEGFixupTagsSubsamplingReadWord(struct JPEGFixupTagsSubsamplingData *data,
+                                 uint16_t *result);
+static void
+JPEGFixupTagsSubsamplingSkip(struct JPEGFixupTagsSubsamplingData *data,
+                             uint16_t skiplength);
 
 #endif
 
-static int
-JPEGFixupTags(TIFF* tif)
+static int JPEGFixupTags(TIFF *tif)
 {
 #ifdef CHECK_JPEG_YCBCR_SUBSAMPLING
-        JPEGState* sp = JState(tif);
-	if ((tif->tif_dir.td_photometric==PHOTOMETRIC_YCBCR)&&
-	    (tif->tif_dir.td_planarconfig==PLANARCONFIG_CONTIG)&&
-	    (tif->tif_dir.td_samplesperpixel==3) &&
-            !sp->ycbcrsampling_fetched)
-		JPEGFixupTagsSubsampling(tif);
+    JPEGState *sp = JState(tif);
+    if ((tif->tif_dir.td_photometric == PHOTOMETRIC_YCBCR) &&
+        (tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG) &&
+        (tif->tif_dir.td_samplesperpixel == 3) &&
+        !sp->otherSettings.ycbcrsampling_fetched)
+        JPEGFixupTagsSubsampling(tif);
 #endif
-        
-	return(1);
+
+    return (1);
 }
 
 #ifdef CHECK_JPEG_YCBCR_SUBSAMPLING
 
-static void
-JPEGFixupTagsSubsampling(TIFF* tif)
-{
-	/*
-	 * Some JPEG-in-TIFF produces do not emit the YCBCRSUBSAMPLING values in
-	 * the TIFF tags, but still use non-default (2,2) values within the jpeg
-	 * data stream itself.  In order for TIFF applications to work properly
-	 * - for instance to get the strip buffer size right - it is imperative
-	 * that the subsampling be available before we start reading the image
-	 * data normally.  This function will attempt to analyze the first strip in
-	 * order to get the sampling values from the jpeg data stream.
-	 *
-	 * Note that JPEGPreDeocode() will produce a fairly loud warning when the
-	 * discovered sampling does not match the default sampling (2,2) or whatever
-	 * was actually in the tiff tags.
-	 *
-	 * See the bug in bugzilla for details:
-	 *
-	 * http://bugzilla.remotesensing.org/show_bug.cgi?id=168
-	 *
-	 * Frank Warmerdam, July 2002
-	 * Joris Van Damme, May 2007
-	 */
-	static const char module[] = "JPEGFixupTagsSubsampling";
-	struct JPEGFixupTagsSubsamplingData m;
-        uint64 fileoffset = TIFFGetStrileOffset(tif, 0);
-
-        if( fileoffset == 0 )
-        {
-            /* Do not even try to check if the first strip/tile does not
-               yet exist, as occurs when GDAL has created a new NULL file
-               for instance. */
-            return;
-        }
+static void JPEGFixupTagsSubsampling(TIFF *tif)
+{
+    /*
+     * Some JPEG-in-TIFF produces do not emit the YCBCRSUBSAMPLING values in
+     * the TIFF tags, but still use non-default (2,2) values within the jpeg
+     * data stream itself.  In order for TIFF applications to work properly
+     * - for instance to get the strip buffer size right - it is imperative
+     * that the subsampling be available before we start reading the image
+     * data normally.  This function will attempt to analyze the first strip in
+     * order to get the sampling values from the jpeg data stream.
+     *
+     * Note that JPEGPreDeocode() will produce a fairly loud warning when the
+     * discovered sampling does not match the default sampling (2,2) or whatever
+     * was actually in the tiff tags.
+     *
+     * See the bug in bugzilla for details:
+     *
+     * http://bugzilla.remotesensing.org/show_bug.cgi?id=168
+     *
+     * Frank Warmerdam, July 2002
+     * Joris Van Damme, May 2007
+     */
+    static const char module[] = "JPEGFixupTagsSubsampling";
+    struct JPEGFixupTagsSubsamplingData m;
+    uint64_t fileoffset = TIFFGetStrileOffset(tif, 0);
 
-	m.tif=tif;
-	m.buffersize=2048;
-	m.buffer=_TIFFmalloc(m.buffersize);
-	if (m.buffer==NULL)
-	{
-		TIFFWarningExt(tif->tif_clientdata,module,
-		    "Unable to allocate memory for auto-correcting of subsampling values; auto-correcting skipped");
-		return;
-	}
-	m.buffercurrentbyte=NULL;
-	m.bufferbytesleft=0;
-	m.fileoffset=fileoffset;
-	m.filepositioned=0;
-	m.filebytesleft=TIFFGetStrileByteCount(tif, 0);
-	if (!JPEGFixupTagsSubsamplingSec(&m))
-		TIFFWarningExt(tif->tif_clientdata,module,
-		    "Unable to auto-correct subsampling values, likely corrupt JPEG compressed data in first strip/tile; auto-correcting skipped");
-	_TIFFfree(m.buffer);
+    if (fileoffset == 0)
+    {
+        /* Do not even try to check if the first strip/tile does not
+           yet exist, as occurs when GDAL has created a new NULL file
+           for instance. */
+        return;
+    }
+
+    m.tif = tif;
+    m.buffersize = 2048;
+    m.buffer = _TIFFmallocExt(tif, m.buffersize);
+    if (m.buffer == NULL)
+    {
+        TIFFWarningExtR(tif, module,
+                        "Unable to allocate memory for auto-correcting of "
+                        "subsampling values; auto-correcting skipped");
+        return;
+    }
+    m.buffercurrentbyte = NULL;
+    m.bufferbytesleft = 0;
+    m.fileoffset = fileoffset;
+    m.filepositioned = 0;
+    m.filebytesleft = TIFFGetStrileByteCount(tif, 0);
+    if (!JPEGFixupTagsSubsamplingSec(&m))
+        TIFFWarningExtR(
+            tif, module,
+            "Unable to auto-correct subsampling values, likely corrupt JPEG "
+            "compressed data in first strip/tile; auto-correcting skipped");
+    _TIFFfreeExt(tif, m.buffer);
 }
 
 static int
-JPEGFixupTagsSubsamplingSec(struct JPEGFixupTagsSubsamplingData* data)
-{
-	static const char module[] = "JPEGFixupTagsSubsamplingSec";
-	uint8 m;
-	while (1)
-	{
-		while (1)
-		{
-			if (!JPEGFixupTagsSubsamplingReadByte(data,&m))
-				return(0);
-			if (m==255)
-				break;
-		}
-		while (1)
-		{
-			if (!JPEGFixupTagsSubsamplingReadByte(data,&m))
-				return(0);
-			if (m!=255)
-				break;
-		}
-		switch (m)
-		{
-			case JPEG_MARKER_SOI:
-				/* this type of marker has no data and should be skipped */
-				break;
-			case JPEG_MARKER_COM:
-			case JPEG_MARKER_APP0:
-			case JPEG_MARKER_APP0+1:
-			case JPEG_MARKER_APP0+2:
-			case JPEG_MARKER_APP0+3:
-			case JPEG_MARKER_APP0+4:
-			case JPEG_MARKER_APP0+5:
-			case JPEG_MARKER_APP0+6:
-			case JPEG_MARKER_APP0+7:
-			case JPEG_MARKER_APP0+8:
-			case JPEG_MARKER_APP0+9:
-			case JPEG_MARKER_APP0+10:
-			case JPEG_MARKER_APP0+11:
-			case JPEG_MARKER_APP0+12:
-			case JPEG_MARKER_APP0+13:
-			case JPEG_MARKER_APP0+14:
-			case JPEG_MARKER_APP0+15:
-			case JPEG_MARKER_DQT:
-			case JPEG_MARKER_SOS:
-			case JPEG_MARKER_DHT:
-			case JPEG_MARKER_DRI:
-				/* this type of marker has data, but it has no use to us and should be skipped */
-				{
-					uint16 n;
-					if (!JPEGFixupTagsSubsamplingReadWord(data,&n))
-						return(0);
-					if (n<2)
-						return(0);
-					n-=2;
-					if (n>0)
-						JPEGFixupTagsSubsamplingSkip(data,n);
-				}
-				break;
-			case JPEG_MARKER_SOF0: /* Baseline sequential Huffman */
-			case JPEG_MARKER_SOF1: /* Extended sequential Huffman */
-			case JPEG_MARKER_SOF2: /* Progressive Huffman: normally not allowed by TechNote, but that doesn't hurt supporting it */
-			case JPEG_MARKER_SOF9: /* Extended sequential arithmetic */
-			case JPEG_MARKER_SOF10: /* Progressive arithmetic: normally not allowed by TechNote, but that doesn't hurt supporting it */
-				/* this marker contains the subsampling factors we're scanning for */
-				{
-					uint16 n;
-					uint16 o;
-					uint8 p;
-					uint8 ph,pv;
-					if (!JPEGFixupTagsSubsamplingReadWord(data,&n))
-						return(0);
-					if (n!=8+data->tif->tif_dir.td_samplesperpixel*3)
-						return(0);
-					JPEGFixupTagsSubsamplingSkip(data,7);
-					if (!JPEGFixupTagsSubsamplingReadByte(data,&p))
-						return(0);
-					ph=(p>>4);
-					pv=(p&15);
-					JPEGFixupTagsSubsamplingSkip(data,1);
-					for (o=1; o<data->tif->tif_dir.td_samplesperpixel; o++)
-					{
-						JPEGFixupTagsSubsamplingSkip(data,1);
-						if (!JPEGFixupTagsSubsamplingReadByte(data,&p))
-							return(0);
-						if (p!=0x11)
-						{
-							TIFFWarningExt(data->tif->tif_clientdata,module,
-							    "Subsampling values inside JPEG compressed data have no TIFF equivalent, auto-correction of TIFF subsampling values failed");
-							return(1);
-						}
-						JPEGFixupTagsSubsamplingSkip(data,1);
-					}
-					if (((ph!=1)&&(ph!=2)&&(ph!=4))||((pv!=1)&&(pv!=2)&&(pv!=4)))
-					{
-						TIFFWarningExt(data->tif->tif_clientdata,module,
-						    "Subsampling values inside JPEG compressed data have no TIFF equivalent, auto-correction of TIFF subsampling values failed");
-						return(1);
-					}
-					if ((ph!=data->tif->tif_dir.td_ycbcrsubsampling[0])||(pv!=data->tif->tif_dir.td_ycbcrsubsampling[1]))
-					{
-						TIFFWarningExt(data->tif->tif_clientdata,module,
-						    "Auto-corrected former TIFF subsampling values [%d,%d] to match subsampling values inside JPEG compressed data [%d,%d]",
-						    (int)data->tif->tif_dir.td_ycbcrsubsampling[0],
-						    (int)data->tif->tif_dir.td_ycbcrsubsampling[1],
-						    (int)ph,(int)pv);
-						data->tif->tif_dir.td_ycbcrsubsampling[0]=ph;
-						data->tif->tif_dir.td_ycbcrsubsampling[1]=pv;
-					}
-				}
-				return(1);
-			default:
-				return(0);
-		}
-	}
+JPEGFixupTagsSubsamplingSec(struct JPEGFixupTagsSubsamplingData *data)
+{
+    static const char module[] = "JPEGFixupTagsSubsamplingSec";
+    uint8_t m;
+    while (1)
+    {
+        while (1)
+        {
+            if (!JPEGFixupTagsSubsamplingReadByte(data, &m))
+                return (0);
+            if (m == 255)
+                break;
+        }
+        while (1)
+        {
+            if (!JPEGFixupTagsSubsamplingReadByte(data, &m))
+                return (0);
+            if (m != 255)
+                break;
+        }
+        switch (m)
+        {
+            case JPEG_MARKER_SOI:
+                /* this type of marker has no data and should be skipped */
+                break;
+            case JPEG_MARKER_COM:
+            case JPEG_MARKER_APP0:
+            case JPEG_MARKER_APP0 + 1:
+            case JPEG_MARKER_APP0 + 2:
+            case JPEG_MARKER_APP0 + 3:
+            case JPEG_MARKER_APP0 + 4:
+            case JPEG_MARKER_APP0 + 5:
+            case JPEG_MARKER_APP0 + 6:
+            case JPEG_MARKER_APP0 + 7:
+            case JPEG_MARKER_APP0 + 8:
+            case JPEG_MARKER_APP0 + 9:
+            case JPEG_MARKER_APP0 + 10:
+            case JPEG_MARKER_APP0 + 11:
+            case JPEG_MARKER_APP0 + 12:
+            case JPEG_MARKER_APP0 + 13:
+            case JPEG_MARKER_APP0 + 14:
+            case JPEG_MARKER_APP0 + 15:
+            case JPEG_MARKER_DQT:
+            case JPEG_MARKER_SOS:
+            case JPEG_MARKER_DHT:
+            case JPEG_MARKER_DRI:
+                /* this type of marker has data, but it has no use to us and
+                 * should be skipped */
+                {
+                    uint16_t n;
+                    if (!JPEGFixupTagsSubsamplingReadWord(data, &n))
+                        return (0);
+                    if (n < 2)
+                        return (0);
+                    n -= 2;
+                    if (n > 0)
+                        JPEGFixupTagsSubsamplingSkip(data, n);
+                }
+                break;
+            case JPEG_MARKER_SOF0:  /* Baseline sequential Huffman */
+            case JPEG_MARKER_SOF1:  /* Extended sequential Huffman */
+            case JPEG_MARKER_SOF2:  /* Progressive Huffman: normally not allowed
+                                       by  TechNote, but that doesn't hurt
+                                       supporting it */
+            case JPEG_MARKER_SOF9:  /* Extended sequential arithmetic */
+            case JPEG_MARKER_SOF10: /* Progressive arithmetic: normally not
+                                       allowed by TechNote, but that doesn't
+                                       hurt supporting it */
+                /* this marker contains the subsampling factors we're scanning
+                 * for */
+                {
+                    uint16_t n;
+                    uint16_t o;
+                    uint8_t p;
+                    uint8_t ph, pv;
+                    if (!JPEGFixupTagsSubsamplingReadWord(data, &n))
+                        return (0);
+                    if (n != 8 + data->tif->tif_dir.td_samplesperpixel * 3)
+                        return (0);
+                    JPEGFixupTagsSubsamplingSkip(data, 7);
+                    if (!JPEGFixupTagsSubsamplingReadByte(data, &p))
+                        return (0);
+                    ph = (p >> 4);
+                    pv = (p & 15);
+                    JPEGFixupTagsSubsamplingSkip(data, 1);
+                    for (o = 1; o < data->tif->tif_dir.td_samplesperpixel; o++)
+                    {
+                        JPEGFixupTagsSubsamplingSkip(data, 1);
+                        if (!JPEGFixupTagsSubsamplingReadByte(data, &p))
+                            return (0);
+                        if (p != 0x11)
+                        {
+                            TIFFWarningExtR(data->tif, module,
+                                            "Subsampling values inside JPEG "
+                                            "compressed data "
+                                            "have no TIFF equivalent, "
+                                            "auto-correction of TIFF "
+                                            "subsampling values failed");
+                            return (1);
+                        }
+                        JPEGFixupTagsSubsamplingSkip(data, 1);
+                    }
+                    if (((ph != 1) && (ph != 2) && (ph != 4)) ||
+                        ((pv != 1) && (pv != 2) && (pv != 4)))
+                    {
+                        TIFFWarningExtR(data->tif, module,
+                                        "Subsampling values inside JPEG "
+                                        "compressed data have no TIFF "
+                                        "equivalent, auto-correction of TIFF "
+                                        "subsampling values failed");
+                        return (1);
+                    }
+                    if ((ph != data->tif->tif_dir.td_ycbcrsubsampling[0]) ||
+                        (pv != data->tif->tif_dir.td_ycbcrsubsampling[1]))
+                    {
+                        TIFFWarningExtR(
+                            data->tif, module,
+                            "Auto-corrected former TIFF subsampling values "
+                            "[%" PRIu16 ",%" PRIu16
+                            "] to match subsampling values inside JPEG "
+                            "compressed data [%" PRIu8 ",%" PRIu8 "]",
+                            data->tif->tif_dir.td_ycbcrsubsampling[0],
+                            data->tif->tif_dir.td_ycbcrsubsampling[1], ph, pv);
+                        data->tif->tif_dir.td_ycbcrsubsampling[0] = ph;
+                        data->tif->tif_dir.td_ycbcrsubsampling[1] = pv;
+                    }
+                }
+                return (1);
+            default:
+                return (0);
+        }
+    }
 }
 
 static int
-JPEGFixupTagsSubsamplingReadByte(struct JPEGFixupTagsSubsamplingData* data, uint8* result)
-{
-	if (data->bufferbytesleft==0)
-	{
-		uint32 m;
-		if (data->filebytesleft==0)
-			return(0);
-		if (!data->filepositioned)
-		{
-			if (TIFFSeekFile(data->tif,data->fileoffset,SEEK_SET) == (toff_t)-1)
-			{
-			    return 0;
-			}
-			data->filepositioned=1;
-		}
-		m=data->buffersize;
-		if ((uint64)m>data->filebytesleft)
-			m=(uint32)data->filebytesleft;
-		assert(m<0x80000000UL);
-		if (TIFFReadFile(data->tif,data->buffer,(tmsize_t)m)!=(tmsize_t)m)
-			return(0);
-		data->buffercurrentbyte=data->buffer;
-		data->bufferbytesleft=m;
-		data->fileoffset+=m;
-		data->filebytesleft-=m;
-	}
-	*result=*data->buffercurrentbyte;
-	data->buffercurrentbyte++;
-	data->bufferbytesleft--;
-	return(1);
+JPEGFixupTagsSubsamplingReadByte(struct JPEGFixupTagsSubsamplingData *data,
+                                 uint8_t *result)
+{
+    if (data->bufferbytesleft == 0)
+    {
+        uint32_t m;
+        if (data->filebytesleft == 0)
+            return (0);
+        if (!data->filepositioned)
+        {
+            if (TIFFSeekFile(data->tif, data->fileoffset, SEEK_SET) ==
+                (toff_t)-1)
+            {
+                return 0;
+            }
+            data->filepositioned = 1;
+        }
+        m = data->buffersize;
+        if ((uint64_t)m > data->filebytesleft)
+            m = (uint32_t)data->filebytesleft;
+        assert(m < 0x80000000UL);
+        if (TIFFReadFile(data->tif, data->buffer, (tmsize_t)m) != (tmsize_t)m)
+            return (0);
+        data->buffercurrentbyte = data->buffer;
+        data->bufferbytesleft = m;
+        data->fileoffset += m;
+        data->filebytesleft -= m;
+    }
+    *result = *data->buffercurrentbyte;
+    data->buffercurrentbyte++;
+    data->bufferbytesleft--;
+    return (1);
 }
 
 static int
-JPEGFixupTagsSubsamplingReadWord(struct JPEGFixupTagsSubsamplingData* data, uint16* result)
+JPEGFixupTagsSubsamplingReadWord(struct JPEGFixupTagsSubsamplingData *data,
+                                 uint16_t *result)
 {
-	uint8 ma;
-	uint8 mb;
-	if (!JPEGFixupTagsSubsamplingReadByte(data,&ma))
-		return(0);
-	if (!JPEGFixupTagsSubsamplingReadByte(data,&mb))
-		return(0);
-	*result=(ma<<8)|mb;
-	return(1);
+    uint8_t ma;
+    uint8_t mb;
+    if (!JPEGFixupTagsSubsamplingReadByte(data, &ma))
+        return (0);
+    if (!JPEGFixupTagsSubsamplingReadByte(data, &mb))
+        return (0);
+    *result = (ma << 8) | mb;
+    return (1);
 }
 
 static void
-JPEGFixupTagsSubsamplingSkip(struct JPEGFixupTagsSubsamplingData* data, uint16 skiplength)
-{
-	if ((uint32)skiplength<=data->bufferbytesleft)
-	{
-		data->buffercurrentbyte+=skiplength;
-		data->bufferbytesleft-=skiplength;
-	}
-	else
-	{
-		uint16 m;
-		m=(uint16)(skiplength-data->bufferbytesleft);
-		if (m<=data->filebytesleft)
-		{
-			data->bufferbytesleft=0;
-			data->fileoffset+=m;
-			data->filebytesleft-=m;
-			data->filepositioned=0;
-		}
-		else
-		{
-			data->bufferbytesleft=0;
-			data->filebytesleft=0;
-		}
-	}
+JPEGFixupTagsSubsamplingSkip(struct JPEGFixupTagsSubsamplingData *data,
+                             uint16_t skiplength)
+{
+    if ((uint32_t)skiplength <= data->bufferbytesleft)
+    {
+        data->buffercurrentbyte += skiplength;
+        data->bufferbytesleft -= skiplength;
+    }
+    else
+    {
+        uint16_t m;
+        m = (uint16_t)(skiplength - data->bufferbytesleft);
+        if (m <= data->filebytesleft)
+        {
+            data->bufferbytesleft = 0;
+            data->fileoffset += m;
+            data->filebytesleft -= m;
+            data->filepositioned = 0;
+        }
+        else
+        {
+            data->bufferbytesleft = 0;
+            data->filebytesleft = 0;
+        }
+    }
 }
 
 #endif
 
-
-static int
-JPEGSetupDecode(TIFF* tif)
+static int JPEGSetupDecode(TIFF *tif)
 {
-	JPEGState* sp = JState(tif);
-	TIFFDirectory *td = &tif->tif_dir;
+    JPEGState *sp = JState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
 
-#if defined(JPEG_DUAL_MODE_8_12) && !defined(TIFFInitJPEG)
-        if( tif->tif_dir.td_bitspersample == 12 )
-            return TIFFReInitJPEG_12( tif, COMPRESSION_JPEG, 0 );
+#if defined(JPEG_DUAL_MODE_8_12) && !defined(FROM_TIF_JPEG_12)
+    if (tif->tif_dir.td_bitspersample == 12)
+    {
+        /* We pass a pointer to a copy of otherSettings, since */
+        /* TIFFReInitJPEG_12() will clear sp */
+        JPEGOtherSettings savedOtherSettings = sp->otherSettings;
+        return TIFFReInitJPEG_12(tif, &savedOtherSettings, COMPRESSION_JPEG, 0);
+    }
 #endif
 
-	JPEGInitializeLibJPEG( tif, TRUE );
-
-	assert(sp != NULL);
-	assert(sp->cinfo.comm.is_decompressor);
-
-	/* Read JPEGTables if it is present */
-	if (TIFFFieldSet(tif,FIELD_JPEGTABLES)) {
-		TIFFjpeg_tables_src(sp);
-		if(TIFFjpeg_read_header(sp,FALSE) != JPEG_HEADER_TABLES_ONLY) {
-			TIFFErrorExt(tif->tif_clientdata, "JPEGSetupDecode", "Bogus JPEGTables field");
-			return (0);
-		}
-	}
-
-	/* Grab parameters that are same for all strips/tiles */
-	sp->photometric = td->td_photometric;
-	switch (sp->photometric) {
-	case PHOTOMETRIC_YCBCR:
-		sp->h_sampling = td->td_ycbcrsubsampling[0];
-		sp->v_sampling = td->td_ycbcrsubsampling[1];
-		break;
-	default:
-		/* TIFF 6.0 forbids subsampling of all other color spaces */
-		sp->h_sampling = 1;
-		sp->v_sampling = 1;
-		break;
-	}
-
-	/* Set up for reading normal data */
-	TIFFjpeg_data_src(sp);
-	tif->tif_postdecode = _TIFFNoPostDecode; /* override byte swapping */
-	return (1);
+    JPEGInitializeLibJPEG(tif, TRUE);
+
+    assert(sp != NULL);
+    assert(sp->cinfo.comm.is_decompressor);
+
+    /* Read JPEGTables if it is present */
+    if (TIFFFieldSet(tif, FIELD_JPEGTABLES))
+    {
+        TIFFjpeg_tables_src(sp);
+        if (TIFFjpeg_read_header(sp, FALSE) != JPEG_HEADER_TABLES_ONLY)
+        {
+            TIFFErrorExtR(tif, "JPEGSetupDecode", "Bogus JPEGTables field");
+            return (0);
+        }
+    }
+
+    /* Grab parameters that are same for all strips/tiles */
+    sp->photometric = td->td_photometric;
+    switch (sp->photometric)
+    {
+        case PHOTOMETRIC_YCBCR:
+            sp->h_sampling = td->td_ycbcrsubsampling[0];
+            sp->v_sampling = td->td_ycbcrsubsampling[1];
+            break;
+        default:
+            /* TIFF 6.0 forbids subsampling of all other color spaces */
+            sp->h_sampling = 1;
+            sp->v_sampling = 1;
+            break;
+    }
+
+    /* Set up for reading normal data */
+    TIFFjpeg_data_src(sp);
+    tif->tif_postdecode = _TIFFNoPostDecode; /* override byte swapping */
+    return (1);
 }
 
 /* Returns 1 if the full strip should be read, even when doing scanline per */
@@ -1056,14 +1153,14 @@ JPEGSetupDecode(TIFF* tif)
 /* Only reads tif->tif_dir.td_bitspersample, tif->tif_rawdata and */
 /* tif->tif_rawcc members. */
 /* Can be called independently of the usual setup/predecode/decode states */
-int TIFFJPEGIsFullStripRequired(TIFF* tif)
+int TIFFJPEGIsFullStripRequired(TIFF *tif)
 {
     int ret;
     JPEGState state;
 
-#if defined(JPEG_DUAL_MODE_8_12) && !defined(TIFFJPEGIsFullStripRequired)
-    if( tif->tif_dir.td_bitspersample == 12 )
-        return TIFFJPEGIsFullStripRequired_12( tif );
+#if defined(JPEG_DUAL_MODE_8_12) && !defined(FROM_TIF_JPEG_12)
+    if (tif->tif_dir.td_bitspersample == 12)
+        return TIFFJPEGIsFullStripRequired_12(tif);
 #endif
 
     memset(&state, 0, sizeof(JPEGState));
@@ -1088,411 +1185,451 @@ int TIFFJPEGIsFullStripRequired(TIFF* tif)
 /*
  * Set up for decoding a strip or tile.
  */
-/*ARGSUSED*/ static int
-JPEGPreDecode(TIFF* tif, uint16 s)
-{
-	JPEGState *sp = JState(tif);
-	TIFFDirectory *td = &tif->tif_dir;
-	static const char module[] = "JPEGPreDecode";
-	uint32 segment_width, segment_height;
-	int downsampled_output;
-	int ci;
-
-	assert(sp != NULL);
-  
-	if (sp->cinfo.comm.is_decompressor == 0)
-	{
-		tif->tif_setupdecode( tif );
-	}
-  
-	assert(sp->cinfo.comm.is_decompressor);
-	/*
-	 * Reset decoder state from any previous strip/tile,
-	 * in case application didn't read the whole strip.
-	 */
-	if (!TIFFjpeg_abort(sp))
-		return (0);
-	/*
-	 * Read the header for this strip/tile.
-	 */
-        
-	if (TIFFjpeg_read_header(sp, TRUE) != JPEG_HEADER_OK)
-		return (0);
-
-        tif->tif_rawcp = (uint8*) sp->src.next_input_byte;
-        tif->tif_rawcc = sp->src.bytes_in_buffer;
-
-	/*
-	 * Check image parameters and set decompression parameters.
-	 */
-	if (isTiled(tif)) {
-                segment_width = td->td_tilewidth;
-                segment_height = td->td_tilelength;
-		sp->bytesperline = TIFFTileRowSize(tif);
-	} else {
-		segment_width = td->td_imagewidth;
-		segment_height = td->td_imagelength - tif->tif_row;
-		if (segment_height > td->td_rowsperstrip)
-			segment_height = td->td_rowsperstrip;
-		sp->bytesperline = TIFFScanlineSize(tif);
-	}
-	if (td->td_planarconfig == PLANARCONFIG_SEPARATE && s > 0) {
-		/*
-		 * For PC 2, scale down the expected strip/tile size
-		 * to match a downsampled component
-		 */
-		segment_width = TIFFhowmany_32(segment_width, sp->h_sampling);
-		segment_height = TIFFhowmany_32(segment_height, sp->v_sampling);
-	}
-	if (sp->cinfo.d.image_width < segment_width ||
-	    sp->cinfo.d.image_height < segment_height) {
-		TIFFWarningExt(tif->tif_clientdata, module,
-			       "Improper JPEG strip/tile size, "
-			       "expected %dx%d, got %dx%d",
-			       segment_width, segment_height,
-			       sp->cinfo.d.image_width,
-			       sp->cinfo.d.image_height);
-	}
-	if( sp->cinfo.d.image_width == segment_width &&
-	    sp->cinfo.d.image_height > segment_height &&
-	    tif->tif_row + segment_height == td->td_imagelength &&
-	    !isTiled(tif) ) {
-		/* Some files have a last strip, that should be truncated, */
-		/* but their JPEG codestream has still the maximum strip */
-		/* height. Warn about this as this is non compliant, but */
-		/* we can safely recover from that. */
-		TIFFWarningExt(tif->tif_clientdata, module,
-			     "JPEG strip size exceeds expected dimensions,"
-			     " expected %dx%d, got %dx%d",
-			     segment_width, segment_height,
-			     sp->cinfo.d.image_width, sp->cinfo.d.image_height);
-	}
-	else if (sp->cinfo.d.image_width > segment_width ||
-		 sp->cinfo.d.image_height > segment_height) {
-		/*
-		 * This case could be dangerous, if the strip or tile size has
-		 * been reported as less than the amount of data jpeg will
-		 * return, some potential security issues arise. Catch this
-		 * case and error out.
-		 */
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "JPEG strip/tile size exceeds expected dimensions,"
-			     " expected %dx%d, got %dx%d",
-			     segment_width, segment_height,
-			     sp->cinfo.d.image_width, sp->cinfo.d.image_height);
-		return (0);
-	}
-	if (sp->cinfo.d.num_components !=
-	    (td->td_planarconfig == PLANARCONFIG_CONTIG ?
-	     td->td_samplesperpixel : 1)) {
-		TIFFErrorExt(tif->tif_clientdata, module, "Improper JPEG component count");
-		return (0);
-	}
-#ifdef JPEG_LIB_MK1
-	if (12 != td->td_bitspersample && 8 != td->td_bitspersample) {
-		TIFFErrorExt(tif->tif_clientdata, module, "Improper JPEG data precision");
-		return (0);
-	}
-	sp->cinfo.d.data_precision = td->td_bitspersample;
-	sp->cinfo.d.bits_in_jsample = td->td_bitspersample;
-#else
-	if (sp->cinfo.d.data_precision != td->td_bitspersample) {
-		TIFFErrorExt(tif->tif_clientdata, module, "Improper JPEG data precision");
-		return (0);
-	}
-#endif
-
-        /* In some cases, libjpeg needs to allocate a lot of memory */
-        /* http://www.libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf */
-        if( TIFFjpeg_has_multiple_scans(sp) )
-        {
-            /* In this case libjpeg will need to allocate memory or backing */
-            /* store for all coefficients */
-            /* See call to jinit_d_coef_controller() from master_selection() */
-            /* in libjpeg */
+/*ARGSUSED*/ static int JPEGPreDecode(TIFF *tif, uint16_t s)
+{
+    JPEGState *sp = JState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
+    static const char module[] = "JPEGPreDecode";
+    uint32_t segment_width, segment_height;
+    int downsampled_output;
+    int ci;
 
-            /* 1 MB for regular libjpeg usage */
-            toff_t nRequiredMemory = 1024 * 1024;
+    assert(sp != NULL);
 
-            for (ci = 0; ci < sp->cinfo.d.num_components; ci++) {
-                const jpeg_component_info *compptr = &(sp->cinfo.d.comp_info[ci]);
-                if( compptr->h_samp_factor > 0 && compptr->v_samp_factor > 0 )
-                {
-                    nRequiredMemory += (toff_t)(
-                        ((compptr->width_in_blocks + compptr->h_samp_factor - 1) / compptr->h_samp_factor)) *
-                        ((compptr->height_in_blocks + compptr->v_samp_factor - 1) / compptr->v_samp_factor) *
-                        sizeof(JBLOCK);
-                }
-            }
+    if (sp->cinfo.comm.is_decompressor == 0)
+    {
+        tif->tif_setupdecode(tif);
+    }
 
-            if( sp->cinfo.d.mem->max_memory_to_use > 0 &&
-                nRequiredMemory > (toff_t)(sp->cinfo.d.mem->max_memory_to_use) &&
-                getenv("LIBTIFF_ALLOW_LARGE_LIBJPEG_MEM_ALLOC") == NULL )
-            {
-                TIFFErrorExt(tif->tif_clientdata, module,
-                    "Reading this image would require libjpeg to allocate "
-                    "at least %u bytes. "
-                    "This is disabled since above the %u threshold. "
-                    "You may override this restriction by defining the "
-                    "LIBTIFF_ALLOW_LARGE_LIBJPEG_MEM_ALLOC environment variable, "
-                    "or setting the JPEGMEM environment variable to a value greater "
-                    "or equal to '%uM'",
-                    (unsigned)(nRequiredMemory),
-                    (unsigned)(sp->cinfo.d.mem->max_memory_to_use),
-                    (unsigned)((nRequiredMemory + 1000000 - 1) / 1000000));
-                return 0;
-            }
-        }
+    assert(sp->cinfo.comm.is_decompressor);
+    /*
+     * Reset decoder state from any previous strip/tile,
+     * in case application didn't read the whole strip.
+     */
+    if (!TIFFjpeg_abort(sp))
+        return (0);
+    /*
+     * Read the header for this strip/tile.
+     */
 
-	if (td->td_planarconfig == PLANARCONFIG_CONTIG) {
-		/* Component 0 should have expected sampling factors */
-		if (sp->cinfo.d.comp_info[0].h_samp_factor != sp->h_sampling ||
-		    sp->cinfo.d.comp_info[0].v_samp_factor != sp->v_sampling) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-				       "Improper JPEG sampling factors %d,%d\n"
-				       "Apparently should be %d,%d.",
-				       sp->cinfo.d.comp_info[0].h_samp_factor,
-				       sp->cinfo.d.comp_info[0].v_samp_factor,
-				       sp->h_sampling, sp->v_sampling);
-			return (0);
-		}
-		/* Rest should have sampling factors 1,1 */
-		for (ci = 1; ci < sp->cinfo.d.num_components; ci++) {
-			if (sp->cinfo.d.comp_info[ci].h_samp_factor != 1 ||
-			    sp->cinfo.d.comp_info[ci].v_samp_factor != 1) {
-				TIFFErrorExt(tif->tif_clientdata, module, "Improper JPEG sampling factors");
-				return (0);
-			}
-		}
-	} else {
-		/* PC 2's single component should have sampling factors 1,1 */
-		if (sp->cinfo.d.comp_info[0].h_samp_factor != 1 ||
-		    sp->cinfo.d.comp_info[0].v_samp_factor != 1) {
-			TIFFErrorExt(tif->tif_clientdata, module, "Improper JPEG sampling factors");
-			return (0);
-		}
-	}
-	downsampled_output = FALSE;
-	if (td->td_planarconfig == PLANARCONFIG_CONTIG &&
-	    sp->photometric == PHOTOMETRIC_YCBCR &&
-	    sp->jpegcolormode == JPEGCOLORMODE_RGB) {
-		/* Convert YCbCr to RGB */
-		sp->cinfo.d.jpeg_color_space = JCS_YCbCr;
-		sp->cinfo.d.out_color_space = JCS_RGB;
-	} else {
-		/* Suppress colorspace handling */
-		sp->cinfo.d.jpeg_color_space = JCS_UNKNOWN;
-		sp->cinfo.d.out_color_space = JCS_UNKNOWN;
-		if (td->td_planarconfig == PLANARCONFIG_CONTIG &&
-		    (sp->h_sampling != 1 || sp->v_sampling != 1))
-			downsampled_output = TRUE;
-		/* XXX what about up-sampling? */
-	}
-	if (downsampled_output) {
-		/* Need to use raw-data interface to libjpeg */
-		sp->cinfo.d.raw_data_out = TRUE;
-#if JPEG_LIB_VERSION >= 70
-		sp->cinfo.d.do_fancy_upsampling = FALSE;
-#endif /* JPEG_LIB_VERSION >= 70 */
-		tif->tif_decoderow = DecodeRowError;
-		tif->tif_decodestrip = JPEGDecodeRaw;
-		tif->tif_decodetile = JPEGDecodeRaw;
-	} else {
-		/* Use normal interface to libjpeg */
-		sp->cinfo.d.raw_data_out = FALSE;
-		tif->tif_decoderow = JPEGDecode;
-		tif->tif_decodestrip = JPEGDecode;
-		tif->tif_decodetile = JPEGDecode;  
-	}
-	/* Start JPEG decompressor */
-	if (!TIFFjpeg_start_decompress(sp))
-		return (0);
-	/* Allocate downsampled-data buffers if needed */
-	if (downsampled_output) {
-		if (!alloc_downsampled_buffers(tif, sp->cinfo.d.comp_info,
-					       sp->cinfo.d.num_components))
-			return (0);
-		sp->scancount = DCTSIZE;	/* mark buffer empty */
-	}
-	return (1);
-}
+    if (TIFFjpeg_read_header(sp, TRUE) != JPEG_HEADER_OK)
+        return (0);
 
-/*
- * Decode a chunk of pixels.
- * "Standard" case: returned data is not downsampled.
- */
-#if !JPEG_LIB_MK1_OR_12BIT
-static int
-JPEGDecode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s)
-{
-	JPEGState *sp = JState(tif);
-	tmsize_t nrows;
-	(void) s;
+    tif->tif_rawcp = (uint8_t *)sp->src.next_input_byte;
+    tif->tif_rawcc = sp->src.bytes_in_buffer;
 
+    /*
+     * Check image parameters and set decompression parameters.
+     */
+    if (isTiled(tif))
+    {
+        segment_width = td->td_tilewidth;
+        segment_height = td->td_tilelength;
+        sp->bytesperline = TIFFTileRowSize(tif);
+    }
+    else
+    {
+        segment_width = td->td_imagewidth;
+        segment_height = td->td_imagelength - tif->tif_row;
+        if (segment_height > td->td_rowsperstrip)
+            segment_height = td->td_rowsperstrip;
+        sp->bytesperline = TIFFScanlineSize(tif);
+    }
+    if (td->td_planarconfig == PLANARCONFIG_SEPARATE && s > 0)
+    {
         /*
-        ** Update available information, buffer may have been refilled
-        ** between decode requests
-        */
-	sp->src.next_input_byte = (const JOCTET*) tif->tif_rawcp;
-	sp->src.bytes_in_buffer = (size_t) tif->tif_rawcc;
+         * For PC 2, scale down the expected strip/tile size
+         * to match a downsampled component
+         */
+        segment_width = TIFFhowmany_32(segment_width, sp->h_sampling);
+        segment_height = TIFFhowmany_32(segment_height, sp->v_sampling);
+    }
+    if (sp->cinfo.d.image_width < segment_width ||
+        sp->cinfo.d.image_height < segment_height)
+    {
+        TIFFWarningExtR(tif, module,
+                        "Improper JPEG strip/tile size, "
+                        "expected %" PRIu32 "x%" PRIu32 ", got %ux%u",
+                        segment_width, segment_height, sp->cinfo.d.image_width,
+                        sp->cinfo.d.image_height);
+    }
+    if (sp->cinfo.d.image_width == segment_width &&
+        sp->cinfo.d.image_height > segment_height &&
+        tif->tif_row + segment_height == td->td_imagelength && !isTiled(tif))
+    {
+        /* Some files have a last strip, that should be truncated, */
+        /* but their JPEG codestream has still the maximum strip */
+        /* height. Warn about this as this is non compliant, but */
+        /* we can safely recover from that. */
+        TIFFWarningExtR(tif, module,
+                        "JPEG strip size exceeds expected dimensions,"
+                        " expected %" PRIu32 "x%" PRIu32 ", got %ux%u",
+                        segment_width, segment_height, sp->cinfo.d.image_width,
+                        sp->cinfo.d.image_height);
+    }
+    else if (sp->cinfo.d.image_width > segment_width ||
+             sp->cinfo.d.image_height > segment_height)
+    {
+        /*
+         * This case could be dangerous, if the strip or tile size has
+         * been reported as less than the amount of data jpeg will
+         * return, some potential security issues arise. Catch this
+         * case and error out.
+         */
+        TIFFErrorExtR(tif, module,
+                      "JPEG strip/tile size exceeds expected dimensions,"
+                      " expected %" PRIu32 "x%" PRIu32 ", got %ux%u",
+                      segment_width, segment_height, sp->cinfo.d.image_width,
+                      sp->cinfo.d.image_height);
+        return (0);
+    }
+    if (sp->cinfo.d.num_components !=
+        (td->td_planarconfig == PLANARCONFIG_CONTIG ? td->td_samplesperpixel
+                                                    : 1))
+    {
+        TIFFErrorExtR(tif, module, "Improper JPEG component count");
+        return (0);
+    }
+#ifdef JPEG_LIB_MK1
+    if (12 != td->td_bitspersample && 8 != td->td_bitspersample)
+    {
+        TIFFErrorExtR(tif, module, "Improper JPEG data precision");
+        return (0);
+    }
+    sp->cinfo.d.data_precision = td->td_bitspersample;
+    sp->cinfo.d.bits_in_jsample = td->td_bitspersample;
+#else
+    if (sp->cinfo.d.data_precision != td->td_bitspersample)
+    {
+        TIFFErrorExtR(tif, module, "Improper JPEG data precision");
+        return (0);
+    }
+#endif
 
-        if( sp->bytesperline == 0 )
-                return 0;
-        
-	nrows = cc / sp->bytesperline;
-	if (cc % sp->bytesperline)
-		TIFFWarningExt(tif->tif_clientdata, tif->tif_name,
-                               "fractional scanline not read");
+    if (sp->cinfo.d.progressive_mode &&
+        !sp->otherSettings.has_warned_about_progressive_mode)
+    {
+        TIFFWarningExtR(tif, module,
+                        "The JPEG strip/tile is encoded with progressive mode, "
+                        "which is normally not legal for JPEG-in-TIFF.\n"
+                        "libtiff should be able to decode it, but it might "
+                        "cause compatibility issues with other readers");
+        sp->otherSettings.has_warned_about_progressive_mode = TRUE;
+    }
 
-	if( nrows > (tmsize_t) sp->cinfo.d.image_height )
-		nrows = sp->cinfo.d.image_height;
+    /* In some cases, libjpeg needs to allocate a lot of memory */
+    /* http://www.libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf
+     */
+    if (TIFFjpeg_has_multiple_scans(sp))
+    {
+        /* In this case libjpeg will need to allocate memory or backing */
+        /* store for all coefficients */
+        /* See call to jinit_d_coef_controller() from master_selection() */
+        /* in libjpeg */
 
-	/* data is expected to be read in multiples of a scanline */
-	if (nrows)
+        /* 1 MB for regular libjpeg usage */
+        toff_t nRequiredMemory = 1024 * 1024;
+
+        for (ci = 0; ci < sp->cinfo.d.num_components; ci++)
         {
-                do
-                {
-                        /*
-                         * In the libjpeg6b-9a 8bit case.  We read directly into
-                         * the TIFF buffer.
-                         */
-                        JSAMPROW bufptr = (JSAMPROW)buf;
-
-                        if (TIFFjpeg_read_scanlines(sp, &bufptr, 1) != 1)
-                                return (0);
-
-                        ++tif->tif_row;
-                        buf += sp->bytesperline;
-                        cc -= sp->bytesperline;
-                } while (--nrows > 0);
+            const jpeg_component_info *compptr = &(sp->cinfo.d.comp_info[ci]);
+            if (compptr->h_samp_factor > 0 && compptr->v_samp_factor > 0)
+            {
+                nRequiredMemory +=
+                    (toff_t)(((compptr->width_in_blocks +
+                               compptr->h_samp_factor - 1) /
+                              compptr->h_samp_factor)) *
+                    ((compptr->height_in_blocks + compptr->v_samp_factor - 1) /
+                     compptr->v_samp_factor) *
+                    sizeof(JBLOCK);
+            }
         }
 
-        /* Update information on consumed data */
-        tif->tif_rawcp = (uint8*) sp->src.next_input_byte;
-        tif->tif_rawcc = sp->src.bytes_in_buffer;
-                
-	/* Close down the decompressor if we've finished the strip or tile. */
-	return sp->cinfo.d.output_scanline < sp->cinfo.d.output_height
-                || TIFFjpeg_finish_decompress(sp);
-}
-#endif /* !JPEG_LIB_MK1_OR_12BIT */
-
-#if JPEG_LIB_MK1_OR_12BIT
-/*ARGSUSED*/ static int
-JPEGDecode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s)
-{
-	JPEGState *sp = JState(tif);
-	tmsize_t nrows;
-	(void) s;
+        if (sp->cinfo.d.mem->max_memory_to_use > 0 &&
+            nRequiredMemory > (toff_t)(sp->cinfo.d.mem->max_memory_to_use) &&
+            getenv("LIBTIFF_ALLOW_LARGE_LIBJPEG_MEM_ALLOC") == NULL)
+        {
+            TIFFErrorExtR(
+                tif, module,
+                "Reading this image would require libjpeg to allocate "
+                "at least %" PRIu64 " bytes. "
+                "This is disabled since above the %ld threshold. "
+                "You may override this restriction by defining the "
+                "LIBTIFF_ALLOW_LARGE_LIBJPEG_MEM_ALLOC environment variable, "
+                "or setting the JPEGMEM environment variable to a value "
+                "greater "
+                "or equal to '%" PRIu64 "M'",
+                nRequiredMemory, sp->cinfo.d.mem->max_memory_to_use,
+                (nRequiredMemory + 1000000u - 1u) / 1000000u);
+            return 0;
+        }
+    }
 
-        /*
-        ** Update available information, buffer may have been refilled
-        ** between decode requests
-        */
-	sp->src.next_input_byte = (const JOCTET*) tif->tif_rawcp;
-	sp->src.bytes_in_buffer = (size_t) tif->tif_rawcc;
+    if (td->td_planarconfig == PLANARCONFIG_CONTIG)
+    {
+        /* Component 0 should have expected sampling factors */
+        if (sp->cinfo.d.comp_info[0].h_samp_factor != sp->h_sampling ||
+            sp->cinfo.d.comp_info[0].v_samp_factor != sp->v_sampling)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Improper JPEG sampling factors %d,%d\n"
+                          "Apparently should be %" PRIu16 ",%" PRIu16 ".",
+                          sp->cinfo.d.comp_info[0].h_samp_factor,
+                          sp->cinfo.d.comp_info[0].v_samp_factor,
+                          sp->h_sampling, sp->v_sampling);
+            return (0);
+        }
+        /* Rest should have sampling factors 1,1 */
+        for (ci = 1; ci < sp->cinfo.d.num_components; ci++)
+        {
+            if (sp->cinfo.d.comp_info[ci].h_samp_factor != 1 ||
+                sp->cinfo.d.comp_info[ci].v_samp_factor != 1)
+            {
+                TIFFErrorExtR(tif, module, "Improper JPEG sampling factors");
+                return (0);
+            }
+        }
+    }
+    else
+    {
+        /* PC 2's single component should have sampling factors 1,1 */
+        if (sp->cinfo.d.comp_info[0].h_samp_factor != 1 ||
+            sp->cinfo.d.comp_info[0].v_samp_factor != 1)
+        {
+            TIFFErrorExtR(tif, module, "Improper JPEG sampling factors");
+            return (0);
+        }
+    }
+    downsampled_output = FALSE;
+    if (td->td_planarconfig == PLANARCONFIG_CONTIG &&
+        sp->photometric == PHOTOMETRIC_YCBCR &&
+        sp->otherSettings.jpegcolormode == JPEGCOLORMODE_RGB)
+    {
+        /* Convert YCbCr to RGB */
+        sp->cinfo.d.jpeg_color_space = JCS_YCbCr;
+        sp->cinfo.d.out_color_space = JCS_RGB;
+    }
+    else
+    {
+        /* Suppress colorspace handling */
+        sp->cinfo.d.jpeg_color_space = JCS_UNKNOWN;
+        sp->cinfo.d.out_color_space = JCS_UNKNOWN;
+        if (td->td_planarconfig == PLANARCONFIG_CONTIG &&
+            (sp->h_sampling != 1 || sp->v_sampling != 1))
+            downsampled_output = TRUE;
+        /* XXX what about up-sampling? */
+    }
+    if (downsampled_output)
+    {
+        /* Need to use raw-data interface to libjpeg */
+        sp->cinfo.d.raw_data_out = TRUE;
+#if JPEG_LIB_VERSION >= 70
+        sp->cinfo.d.do_fancy_upsampling = FALSE;
+#endif /* JPEG_LIB_VERSION >= 70 */
+        tif->tif_decoderow = DecodeRowError;
+        tif->tif_decodestrip = JPEGDecodeRaw;
+        tif->tif_decodetile = JPEGDecodeRaw;
+    }
+    else
+    {
+        /* Use normal interface to libjpeg */
+        sp->cinfo.d.raw_data_out = FALSE;
+        tif->tif_decoderow = JPEGDecode;
+        tif->tif_decodestrip = JPEGDecode;
+        tif->tif_decodetile = JPEGDecode;
+    }
+    /* Start JPEG decompressor */
+    if (!TIFFjpeg_start_decompress(sp))
+        return (0);
+    /* Allocate downsampled-data buffers if needed */
+    if (downsampled_output)
+    {
+        if (!alloc_downsampled_buffers(tif, sp->cinfo.d.comp_info,
+                                       sp->cinfo.d.num_components))
+            return (0);
+        sp->scancount = DCTSIZE; /* mark buffer empty */
+    }
+    return (1);
+}
 
-        if( sp->bytesperline == 0 )
-                return 0;
-        
-	nrows = cc / sp->bytesperline;
-	if (cc % sp->bytesperline)
-		TIFFWarningExt(tif->tif_clientdata, tif->tif_name,
-                               "fractional scanline not read");
+/*
+ * Decode a chunk of pixels.
+ * "Standard" case: returned data is not downsampled.
+ */
+#if !JPEG_LIB_MK1_OR_12BIT
+static int JPEGDecode(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s)
+{
+    JPEGState *sp = JState(tif);
+    tmsize_t nrows;
+    (void)s;
+
+    /*
+    ** Update available information, buffer may have been refilled
+    ** between decode requests
+    */
+    sp->src.next_input_byte = (const JOCTET *)tif->tif_rawcp;
+    sp->src.bytes_in_buffer = (size_t)tif->tif_rawcc;
+
+    if (sp->bytesperline == 0)
+        return 0;
+
+    nrows = cc / sp->bytesperline;
+    if (cc % sp->bytesperline)
+        TIFFWarningExtR(tif, tif->tif_name, "fractional scanline not read");
+
+    if (nrows > (tmsize_t)sp->cinfo.d.image_height)
+        nrows = sp->cinfo.d.image_height;
+
+    /* data is expected to be read in multiples of a scanline */
+    if (nrows)
+    {
+        do
+        {
+            /*
+             * In the libjpeg6b-9a 8bit case.  We read directly into
+             * the TIFF buffer.
+             */
+            JSAMPROW bufptr = (JSAMPROW)buf;
+
+            if (TIFFjpeg_read_scanlines(sp, &bufptr, 1) != 1)
+                return (0);
+
+            ++tif->tif_row;
+            buf += sp->bytesperline;
+            cc -= sp->bytesperline;
+        } while (--nrows > 0);
+    }
+
+    /* Update information on consumed data */
+    tif->tif_rawcp = (uint8_t *)sp->src.next_input_byte;
+    tif->tif_rawcc = sp->src.bytes_in_buffer;
+
+    /* Close down the decompressor if we've finished the strip or tile. */
+    return sp->cinfo.d.output_scanline < sp->cinfo.d.output_height ||
+           TIFFjpeg_finish_decompress(sp);
+}
+#endif /* !JPEG_LIB_MK1_OR_12BIT */
 
-	if( nrows > (tmsize_t) sp->cinfo.d.image_height )
-		nrows = sp->cinfo.d.image_height;
+#if JPEG_LIB_MK1_OR_12BIT
+/*ARGSUSED*/ static int JPEGDecode(TIFF *tif, uint8_t *buf, tmsize_t cc,
+                                   uint16_t s)
+{
+    JPEGState *sp = JState(tif);
+    tmsize_t nrows;
+    (void)s;
+
+    /*
+    ** Update available information, buffer may have been refilled
+    ** between decode requests
+    */
+    sp->src.next_input_byte = (const JOCTET *)tif->tif_rawcp;
+    sp->src.bytes_in_buffer = (size_t)tif->tif_rawcc;
+
+    if (sp->bytesperline == 0)
+        return 0;
+
+    nrows = cc / sp->bytesperline;
+    if (cc % sp->bytesperline)
+        TIFFWarningExtR(tif, tif->tif_name, "fractional scanline not read");
+
+    if (nrows > (tmsize_t)sp->cinfo.d.image_height)
+        nrows = sp->cinfo.d.image_height;
+
+    /* data is expected to be read in multiples of a scanline */
+    if (nrows)
+    {
+        TIFF_JSAMPROW line_work_buf = NULL;
 
-	/* data is expected to be read in multiples of a scanline */
-	if (nrows)
+        /*
+         * For 6B, only use temporary buffer for 12 bit imagery.
+         * For Mk1 always use it.
+         */
+        if (sp->cinfo.d.data_precision == 12)
         {
-                JSAMPROW line_work_buf = NULL;
+            line_work_buf = (TIFF_JSAMPROW)_TIFFmallocExt(
+                tif, sizeof(short) * sp->cinfo.d.output_width *
+                         sp->cinfo.d.num_components);
+        }
 
+        do
+        {
+            if (line_work_buf != NULL)
+            {
                 /*
-                 * For 6B, only use temporary buffer for 12 bit imagery.
-                 * For Mk1 always use it.
+                 * In the MK1 case, we always read into a 16bit
+                 * buffer, and then pack down to 12bit or 8bit.
+                 * In 6B case we only read into 16 bit buffer
+                 * for 12bit data, which we need to repack.
                  */
-                if( sp->cinfo.d.data_precision == 12 )
+                if (TIFFjpeg_read_scanlines(sp, &line_work_buf, 1) != 1)
+                    return (0);
+
+                if (sp->cinfo.d.data_precision == 12)
                 {
-                        line_work_buf = (JSAMPROW)
-                                _TIFFmalloc(sizeof(short) * sp->cinfo.d.output_width
-                                            * sp->cinfo.d.num_components );
+                    int value_pairs = (sp->cinfo.d.output_width *
+                                       sp->cinfo.d.num_components) /
+                                      2;
+                    int iPair;
+
+                    for (iPair = 0; iPair < value_pairs; iPair++)
+                    {
+                        unsigned char *out_ptr =
+                            ((unsigned char *)buf) + iPair * 3;
+                        TIFF_JSAMPLE *in_ptr = line_work_buf + iPair * 2;
+
+                        out_ptr[0] = (unsigned char)((in_ptr[0] & 0xff0) >> 4);
+                        out_ptr[1] =
+                            (unsigned char)(((in_ptr[0] & 0xf) << 4) |
+                                            ((in_ptr[1] & 0xf00) >> 8));
+                        out_ptr[2] = (unsigned char)(((in_ptr[1] & 0xff) >> 0));
+                    }
                 }
+                else if (sp->cinfo.d.data_precision == 8)
+                {
+                    int value_count =
+                        (sp->cinfo.d.output_width * sp->cinfo.d.num_components);
+                    int iValue;
+
+                    for (iValue = 0; iValue < value_count; iValue++)
+                    {
+                        ((unsigned char *)buf)[iValue] =
+                            line_work_buf[iValue] & 0xff;
+                    }
+                }
+            }
 
-               do
-               {
-                       if( line_work_buf != NULL )
-                       {
-                               /*
-                                * In the MK1 case, we always read into a 16bit
-                                * buffer, and then pack down to 12bit or 8bit.
-                                * In 6B case we only read into 16 bit buffer
-                                * for 12bit data, which we need to repack.
-                                */
-                               if (TIFFjpeg_read_scanlines(sp, &line_work_buf, 1) != 1)
-                                       return (0);
-
-                               if( sp->cinfo.d.data_precision == 12 )
-                               {
-                                       int value_pairs = (sp->cinfo.d.output_width
-                                                          * sp->cinfo.d.num_components) / 2;
-                                       int iPair;
-
-                                       for( iPair = 0; iPair < value_pairs; iPair++ )
-                                       {
-                                               unsigned char *out_ptr =
-                                                       ((unsigned char *) buf) + iPair * 3;
-                                               JSAMPLE *in_ptr = line_work_buf + iPair * 2;
-
-                                               out_ptr[0] = (unsigned char)((in_ptr[0] & 0xff0) >> 4);
-                                               out_ptr[1] = (unsigned char)(((in_ptr[0] & 0xf) << 4)
-                                                       | ((in_ptr[1] & 0xf00) >> 8));
-                                               out_ptr[2] = (unsigned char)(((in_ptr[1] & 0xff) >> 0));
-                                       }
-                               }
-                               else if( sp->cinfo.d.data_precision == 8 )
-                               {
-                                       int value_count = (sp->cinfo.d.output_width
-                                                          * sp->cinfo.d.num_components);
-                                       int iValue;
-
-                                       for( iValue = 0; iValue < value_count; iValue++ )
-                                       {
-                                               ((unsigned char *) buf)[iValue] =
-                                                       line_work_buf[iValue] & 0xff;
-                                       }
-                               }
-                       }
-
-                       ++tif->tif_row;
-                       buf += sp->bytesperline;
-                       cc -= sp->bytesperline;
-               } while (--nrows > 0);
-
-               if( line_work_buf != NULL )
-                       _TIFFfree( line_work_buf );
-        }
+            ++tif->tif_row;
+            buf += sp->bytesperline;
+            cc -= sp->bytesperline;
+        } while (--nrows > 0);
+
+        if (line_work_buf != NULL)
+            _TIFFfreeExt(tif, line_work_buf);
+    }
+
+    /* Update information on consumed data */
+    tif->tif_rawcp = (uint8_t *)sp->src.next_input_byte;
+    tif->tif_rawcc = sp->src.bytes_in_buffer;
 
-        /* Update information on consumed data */
-        tif->tif_rawcp = (uint8*) sp->src.next_input_byte;
-        tif->tif_rawcc = sp->src.bytes_in_buffer;
-                
-	/* Close down the decompressor if we've finished the strip or tile. */
-	return sp->cinfo.d.output_scanline < sp->cinfo.d.output_height
-                || TIFFjpeg_finish_decompress(sp);
+    /* Close down the decompressor if we've finished the strip or tile. */
+    return sp->cinfo.d.output_scanline < sp->cinfo.d.output_height ||
+           TIFFjpeg_finish_decompress(sp);
 }
 #endif /* JPEG_LIB_MK1_OR_12BIT */
 
-/*ARGSUSED*/ static int
-DecodeRowError(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s)
+/*ARGSUSED*/ static int DecodeRowError(TIFF *tif, uint8_t *buf, tmsize_t cc,
+                                       uint16_t s)
 
 {
-    (void) buf;
-    (void) cc;
-    (void) s;
+    (void)buf;
+    (void)cc;
+    (void)s;
 
-    TIFFErrorExt(tif->tif_clientdata, "TIFFReadScanline",
-                 "scanline oriented access is not supported for downsampled JPEG compressed images, consider enabling TIFF_JPEGCOLORMODE as JPEGCOLORMODE_RGB." );
+    TIFFErrorExtR(
+        tif, "TIFFReadScanline",
+        "scanline oriented access is not supported for downsampled JPEG "
+        "compressed images, consider enabling TIFFTAG_JPEGCOLORMODE as "
+        "JPEGCOLORMODE_RGB.");
     return 0;
 }
 
@@ -1500,926 +1637,1079 @@ DecodeRowError(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s)
  * Decode a chunk of pixels.
  * Returned data is downsampled per sampling factors.
  */
-/*ARGSUSED*/ static int
-JPEGDecodeRaw(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s)
+/*ARGSUSED*/ static int JPEGDecodeRaw(TIFF *tif, uint8_t *buf, tmsize_t cc,
+                                      uint16_t s)
 {
-	JPEGState *sp = JState(tif);
-	tmsize_t nrows;
-        TIFFDirectory *td = &tif->tif_dir;
-	(void) s;
+    JPEGState *sp = JState(tif);
+    tmsize_t nrows;
+    TIFFDirectory *td = &tif->tif_dir;
+    (void)s;
 
-        nrows = sp->cinfo.d.image_height;
-        /* For last strip, limit number of rows to its truncated height */
-        /* even if the codestream height is larger (which is not compliant, */
-        /* but that we tolerate) */
-        if( (uint32)nrows > td->td_imagelength - tif->tif_row && !isTiled(tif) )
-            nrows = td->td_imagelength - tif->tif_row;
+    nrows = sp->cinfo.d.image_height;
+    /* For last strip, limit number of rows to its truncated height */
+    /* even if the codestream height is larger (which is not compliant, */
+    /* but that we tolerate) */
+    if ((uint32_t)nrows > td->td_imagelength - tif->tif_row && !isTiled(tif))
+        nrows = td->td_imagelength - tif->tif_row;
 
-	/* data is expected to be read in multiples of a scanline */
-	if ( nrows != 0 ) {
+#if defined(JPEG_LIB_MK1_OR_12BIT)
+    unsigned short *tmpbuf = NULL;
+#endif
 
-		/* Cb,Cr both have sampling factors 1, so this is correct */
-		JDIMENSION clumps_per_line = sp->cinfo.d.comp_info[1].downsampled_width;            
-		int samples_per_clump = sp->samplesperclump;
+    /* data is expected to be read in multiples of a scanline */
+    if (nrows != 0)
+    {
+
+        /* Cb,Cr both have sampling factors 1, so this is correct */
+        JDIMENSION clumps_per_line = sp->cinfo.d.comp_info[1].downsampled_width;
+        int samples_per_clump = sp->samplesperclump;
 
 #if defined(JPEG_LIB_MK1_OR_12BIT)
-		unsigned short* tmpbuf = _TIFFmalloc(sizeof(unsigned short) *
-						     sp->cinfo.d.output_width *
-						     sp->cinfo.d.num_components);
-		if(tmpbuf==NULL) {
-                        TIFFErrorExt(tif->tif_clientdata, "JPEGDecodeRaw",
-				     "Out of memory");
-			return 0;
-                }
+        tmpbuf = _TIFFmallocExt(tif, sizeof(unsigned short) *
+                                         sp->cinfo.d.output_width *
+                                         sp->cinfo.d.num_components);
+        if (tmpbuf == NULL)
+        {
+            TIFFErrorExtR(tif, "JPEGDecodeRaw", "Out of memory");
+            return 0;
+        }
 #endif
 
-		do {
-			jpeg_component_info *compptr;
-			int ci, clumpoffset;
+        do
+        {
+            jpeg_component_info *compptr;
+            int ci, clumpoffset;
 
-                        if( cc < sp->bytesperline ) {
-				TIFFErrorExt(tif->tif_clientdata, "JPEGDecodeRaw",
-					     "application buffer not large enough for all data.");
-				return 0;
-                        }
+            if (cc < sp->bytesperline)
+            {
+                TIFFErrorExtR(
+                    tif, "JPEGDecodeRaw",
+                    "application buffer not large enough for all data.");
+                goto error;
+            }
 
-			/* Reload downsampled-data buffer if needed */
-			if (sp->scancount >= DCTSIZE) {
-				int n = sp->cinfo.d.max_v_samp_factor * DCTSIZE;
-				if (TIFFjpeg_read_raw_data(sp, sp->ds_buffer, n) != n)
-					return (0);
-				sp->scancount = 0;
-			}
-			/*
-			 * Fastest way to unseparate data is to make one pass
-			 * over the scanline for each row of each component.
-			 */
-			clumpoffset = 0;    /* first sample in clump */
-			for (ci = 0, compptr = sp->cinfo.d.comp_info;
-			     ci < sp->cinfo.d.num_components;
-			     ci++, compptr++) {
-				int hsamp = compptr->h_samp_factor;
-				int vsamp = compptr->v_samp_factor;
-				int ypos;
-
-				for (ypos = 0; ypos < vsamp; ypos++) {
-					JSAMPLE *inptr = sp->ds_buffer[ci][sp->scancount*vsamp + ypos];
-					JDIMENSION nclump;
+            /* Reload downsampled-data buffer if needed */
+            if (sp->scancount >= DCTSIZE)
+            {
+                int n = sp->cinfo.d.max_v_samp_factor * DCTSIZE;
+                if (TIFFjpeg_read_raw_data(sp, sp->ds_buffer, n) != n)
+                    goto error;
+                sp->scancount = 0;
+            }
+            /*
+             * Fastest way to unseparate data is to make one pass
+             * over the scanline for each row of each component.
+             */
+            clumpoffset = 0; /* first sample in clump */
+            for (ci = 0, compptr = sp->cinfo.d.comp_info;
+                 ci < sp->cinfo.d.num_components; ci++, compptr++)
+            {
+                int hsamp = compptr->h_samp_factor;
+                int vsamp = compptr->v_samp_factor;
+                int ypos;
+
+                for (ypos = 0; ypos < vsamp; ypos++)
+                {
+                    TIFF_JSAMPLE *inptr =
+                        sp->ds_buffer[ci][sp->scancount * vsamp + ypos];
+                    JDIMENSION nclump;
 #if defined(JPEG_LIB_MK1_OR_12BIT)
-					JSAMPLE *outptr = (JSAMPLE*)tmpbuf + clumpoffset;
+                    TIFF_JSAMPLE *outptr = (TIFF_JSAMPLE *)tmpbuf + clumpoffset;
 #else
-					JSAMPLE *outptr = (JSAMPLE*)buf + clumpoffset;
-					if (cc < (tmsize_t)(clumpoffset + (tmsize_t)samples_per_clump*(clumps_per_line-1) + hsamp)) {
-						TIFFErrorExt(tif->tif_clientdata, "JPEGDecodeRaw",
-							     "application buffer not large enough for all data, possible subsampling issue");
-						return 0;
-					}
+                    TIFF_JSAMPLE *outptr = (TIFF_JSAMPLE *)buf + clumpoffset;
+                    if (cc < (tmsize_t)(clumpoffset +
+                                        (tmsize_t)samples_per_clump *
+                                            (clumps_per_line - 1) +
+                                        hsamp))
+                    {
+                        TIFFErrorExtR(
+                            tif, "JPEGDecodeRaw",
+                            "application buffer not large enough for all data, "
+                            "possible subsampling issue");
+                        goto error;
+                    }
 #endif
 
-					if (hsamp == 1) {
-						/* fast path for at least Cb and Cr */
-						for (nclump = clumps_per_line; nclump-- > 0; ) {
-							outptr[0] = *inptr++;
-							outptr += samples_per_clump;
-						}
-					} else {
-						int xpos;
-
-						/* general case */
-						for (nclump = clumps_per_line; nclump-- > 0; ) {
-							for (xpos = 0; xpos < hsamp; xpos++)
-								outptr[xpos] = *inptr++;
-							outptr += samples_per_clump;
-						}
-					}
-					clumpoffset += hsamp;
-				}
-			}
+                    if (hsamp == 1)
+                    {
+                        /* fast path for at least Cb and Cr */
+                        for (nclump = clumps_per_line; nclump-- > 0;)
+                        {
+                            outptr[0] = *inptr++;
+                            outptr += samples_per_clump;
+                        }
+                    }
+                    else
+                    {
+                        int xpos;
+
+                        /* general case */
+                        for (nclump = clumps_per_line; nclump-- > 0;)
+                        {
+                            for (xpos = 0; xpos < hsamp; xpos++)
+                                outptr[xpos] = *inptr++;
+                            outptr += samples_per_clump;
+                        }
+                    }
+                    clumpoffset += hsamp;
+                }
+            }
 
 #if defined(JPEG_LIB_MK1_OR_12BIT)
-			{
-				if (sp->cinfo.d.data_precision == 8)
-				{
-					int i=0;
-					int len = sp->cinfo.d.output_width * sp->cinfo.d.num_components;
-					for (i=0; i<len; i++)
-					{
-						((unsigned char*)buf)[i] = tmpbuf[i] & 0xff;
-					}
-				}
-				else
-				{         /* 12-bit */
-					int value_pairs = (sp->cinfo.d.output_width
-							   * sp->cinfo.d.num_components) / 2;
-					int iPair;
-					for( iPair = 0; iPair < value_pairs; iPair++ )
-					{
-						unsigned char *out_ptr = ((unsigned char *) buf) + iPair * 3;
-						JSAMPLE *in_ptr = (JSAMPLE *) (tmpbuf + iPair * 2);
-						out_ptr[0] = (unsigned char)((in_ptr[0] & 0xff0) >> 4);
-						out_ptr[1] = (unsigned char)(((in_ptr[0] & 0xf) << 4)
-							| ((in_ptr[1] & 0xf00) >> 8));
-						out_ptr[2] = (unsigned char)(((in_ptr[1] & 0xff) >> 0));
-					}
-				}
-			}
+            {
+                if (sp->cinfo.d.data_precision == 8)
+                {
+                    int i = 0;
+                    int len =
+                        sp->cinfo.d.output_width * sp->cinfo.d.num_components;
+                    for (i = 0; i < len; i++)
+                    {
+                        ((unsigned char *)buf)[i] = tmpbuf[i] & 0xff;
+                    }
+                }
+                else
+                { /* 12-bit */
+                    int value_pairs = (sp->cinfo.d.output_width *
+                                       sp->cinfo.d.num_components) /
+                                      2;
+                    int iPair;
+                    for (iPair = 0; iPair < value_pairs; iPair++)
+                    {
+                        unsigned char *out_ptr =
+                            ((unsigned char *)buf) + iPair * 3;
+                        JSAMPLE *in_ptr = (JSAMPLE *)(tmpbuf + iPair * 2);
+                        out_ptr[0] = (unsigned char)((in_ptr[0] & 0xff0) >> 4);
+                        out_ptr[1] =
+                            (unsigned char)(((in_ptr[0] & 0xf) << 4) |
+                                            ((in_ptr[1] & 0xf00) >> 8));
+                        out_ptr[2] = (unsigned char)(((in_ptr[1] & 0xff) >> 0));
+                    }
+                }
+            }
 #endif
 
-			sp->scancount ++;
-			tif->tif_row += sp->v_sampling;
+            sp->scancount++;
+            tif->tif_row += sp->v_sampling;
 
-			buf += sp->bytesperline;
-			cc -= sp->bytesperline;
+            buf += sp->bytesperline;
+            cc -= sp->bytesperline;
 
-			nrows -= sp->v_sampling;
-		} while (nrows > 0);
+            nrows -= sp->v_sampling;
+        } while (nrows > 0);
 
 #if defined(JPEG_LIB_MK1_OR_12BIT)
-		_TIFFfree(tmpbuf);
+        _TIFFfreeExt(tif, tmpbuf);
 #endif
+    }
 
-	}
+    /* Close down the decompressor if done. */
+    return sp->cinfo.d.output_scanline < sp->cinfo.d.output_height ||
+           TIFFjpeg_finish_decompress(sp);
 
-	/* Close down the decompressor if done. */
-	return sp->cinfo.d.output_scanline < sp->cinfo.d.output_height
-		|| TIFFjpeg_finish_decompress(sp);
+error:
+#if defined(JPEG_LIB_MK1_OR_12BIT)
+    _TIFFfreeExt(tif, tmpbuf);
+#endif
+    return 0;
 }
 
-
 /*
  * JPEG Encoding.
  */
 
-static void
-unsuppress_quant_table (JPEGState* sp, int tblno)
+static void unsuppress_quant_table(JPEGState *sp, int tblno)
 {
-	JQUANT_TBL* qtbl;
+    JQUANT_TBL *qtbl;
 
-	if ((qtbl = sp->cinfo.c.quant_tbl_ptrs[tblno]) != NULL)
-		qtbl->sent_table = FALSE;
+    if ((qtbl = sp->cinfo.c.quant_tbl_ptrs[tblno]) != NULL)
+        qtbl->sent_table = FALSE;
 }
 
-static void
-suppress_quant_table (JPEGState* sp, int tblno)
+static void suppress_quant_table(JPEGState *sp, int tblno)
 {
-	JQUANT_TBL* qtbl;
+    JQUANT_TBL *qtbl;
 
-	if ((qtbl = sp->cinfo.c.quant_tbl_ptrs[tblno]) != NULL)
-		qtbl->sent_table = TRUE;
+    if ((qtbl = sp->cinfo.c.quant_tbl_ptrs[tblno]) != NULL)
+        qtbl->sent_table = TRUE;
 }
 
-static void
-unsuppress_huff_table (JPEGState* sp, int tblno)
+static void unsuppress_huff_table(JPEGState *sp, int tblno)
 {
-	JHUFF_TBL* htbl;
+    JHUFF_TBL *htbl;
 
-	if ((htbl = sp->cinfo.c.dc_huff_tbl_ptrs[tblno]) != NULL)
-		htbl->sent_table = FALSE;
-	if ((htbl = sp->cinfo.c.ac_huff_tbl_ptrs[tblno]) != NULL)
-		htbl->sent_table = FALSE;
+    if ((htbl = sp->cinfo.c.dc_huff_tbl_ptrs[tblno]) != NULL)
+        htbl->sent_table = FALSE;
+    if ((htbl = sp->cinfo.c.ac_huff_tbl_ptrs[tblno]) != NULL)
+        htbl->sent_table = FALSE;
 }
 
-static void
-suppress_huff_table (JPEGState* sp, int tblno)
+static void suppress_huff_table(JPEGState *sp, int tblno)
 {
-	JHUFF_TBL* htbl;
+    JHUFF_TBL *htbl;
 
-	if ((htbl = sp->cinfo.c.dc_huff_tbl_ptrs[tblno]) != NULL)
-		htbl->sent_table = TRUE;
-	if ((htbl = sp->cinfo.c.ac_huff_tbl_ptrs[tblno]) != NULL)
-		htbl->sent_table = TRUE;
+    if ((htbl = sp->cinfo.c.dc_huff_tbl_ptrs[tblno]) != NULL)
+        htbl->sent_table = TRUE;
+    if ((htbl = sp->cinfo.c.ac_huff_tbl_ptrs[tblno]) != NULL)
+        htbl->sent_table = TRUE;
 }
 
-static int
-prepare_JPEGTables(TIFF* tif)
-{
-	JPEGState* sp = JState(tif);
-
-	/* Initialize quant tables for current quality setting */
-	if (!TIFFjpeg_set_quality(sp, sp->jpegquality, FALSE))
-		return (0);
-	/* Mark only the tables we want for output */
-	/* NB: chrominance tables are currently used only with YCbCr */
-	if (!TIFFjpeg_suppress_tables(sp, TRUE))
-		return (0);
-	if (sp->jpegtablesmode & JPEGTABLESMODE_QUANT) {
-		unsuppress_quant_table(sp, 0);
-		if (sp->photometric == PHOTOMETRIC_YCBCR)
-			unsuppress_quant_table(sp, 1);
-	}
-	if (sp->jpegtablesmode & JPEGTABLESMODE_HUFF) {
-		unsuppress_huff_table(sp, 0);
-		if (sp->photometric == PHOTOMETRIC_YCBCR)
-			unsuppress_huff_table(sp, 1);
-	}
-	/* Direct libjpeg output into jpegtables */
-	if (!TIFFjpeg_tables_dest(sp, tif))
-		return (0);
-	/* Emit tables-only datastream */
-	if (!TIFFjpeg_write_tables(sp))
-		return (0);
-
-	return (1);
+static int prepare_JPEGTables(TIFF *tif)
+{
+    JPEGState *sp = JState(tif);
+
+    /* Initialize quant tables for current quality setting */
+    if (!TIFFjpeg_set_quality(sp, sp->otherSettings.jpegquality, FALSE))
+        return (0);
+    /* Mark only the tables we want for output */
+    /* NB: chrominance tables are currently used only with YCbCr */
+    if (!TIFFjpeg_suppress_tables(sp, TRUE))
+        return (0);
+    if (sp->otherSettings.jpegtablesmode & JPEGTABLESMODE_QUANT)
+    {
+        unsuppress_quant_table(sp, 0);
+        if (sp->photometric == PHOTOMETRIC_YCBCR)
+            unsuppress_quant_table(sp, 1);
+    }
+    if (sp->otherSettings.jpegtablesmode & JPEGTABLESMODE_HUFF)
+    {
+        unsuppress_huff_table(sp, 0);
+        if (sp->photometric == PHOTOMETRIC_YCBCR)
+            unsuppress_huff_table(sp, 1);
+    }
+    /* Direct libjpeg output into otherSettings.jpegtables */
+    if (!TIFFjpeg_tables_dest(sp, tif))
+        return (0);
+    /* Emit tables-only datastream */
+    if (!TIFFjpeg_write_tables(sp))
+        return (0);
+
+    return (1);
 }
 
-static int
-JPEGSetupEncode(TIFF* tif)
+#if defined(JPEG_LIB_VERSION_MAJOR) &&                                         \
+    (JPEG_LIB_VERSION_MAJOR > 9 ||                                             \
+     (JPEG_LIB_VERSION_MAJOR == 9 && JPEG_LIB_VERSION_MINOR >= 4))
+/* This is a modified version of std_huff_tables() from jcparam.c
+ * in libjpeg-9d because it no longer initializes default Huffman
+ * tables in jpeg_set_defaults(). */
+static void TIFF_std_huff_tables(j_compress_ptr cinfo)
 {
-	JPEGState* sp = JState(tif);
-	TIFFDirectory *td = &tif->tif_dir;
-	static const char module[] = "JPEGSetupEncode";
 
-#if defined(JPEG_DUAL_MODE_8_12) && !defined(TIFFInitJPEG)
-        if( tif->tif_dir.td_bitspersample == 12 )
-            return TIFFReInitJPEG_12( tif, COMPRESSION_JPEG, 1 );
+    if (cinfo->dc_huff_tbl_ptrs[0] == NULL)
+    {
+        (void)jpeg_std_huff_table((j_common_ptr)cinfo, TRUE, 0);
+    }
+    if (cinfo->ac_huff_tbl_ptrs[0] == NULL)
+    {
+        (void)jpeg_std_huff_table((j_common_ptr)cinfo, FALSE, 0);
+    }
+    if (cinfo->dc_huff_tbl_ptrs[1] == NULL)
+    {
+        (void)jpeg_std_huff_table((j_common_ptr)cinfo, TRUE, 1);
+    }
+    if (cinfo->ac_huff_tbl_ptrs[1] == NULL)
+    {
+        (void)jpeg_std_huff_table((j_common_ptr)cinfo, FALSE, 1);
+    }
+}
 #endif
 
-        JPEGInitializeLibJPEG( tif, FALSE );
-
-	assert(sp != NULL);
-	assert(!sp->cinfo.comm.is_decompressor);
-
-	sp->photometric = td->td_photometric;
-
-	/*
-	 * Initialize all JPEG parameters to default values.
-	 * Note that jpeg_set_defaults needs legal values for
-	 * in_color_space and input_components.
-	 */
-	if (td->td_planarconfig == PLANARCONFIG_CONTIG) {
-		sp->cinfo.c.input_components = td->td_samplesperpixel;
-		if (sp->photometric == PHOTOMETRIC_YCBCR) {
-			if (sp->jpegcolormode == JPEGCOLORMODE_RGB) {
-				sp->cinfo.c.in_color_space = JCS_RGB;
-			} else {
-				sp->cinfo.c.in_color_space = JCS_YCbCr;
-			}
-		} else {
-			if ((td->td_photometric == PHOTOMETRIC_MINISWHITE || td->td_photometric == PHOTOMETRIC_MINISBLACK) && td->td_samplesperpixel == 1)
-				sp->cinfo.c.in_color_space = JCS_GRAYSCALE;
-			else if (td->td_photometric == PHOTOMETRIC_RGB && td->td_samplesperpixel == 3)
-				sp->cinfo.c.in_color_space = JCS_RGB;
-			else if (td->td_photometric == PHOTOMETRIC_SEPARATED && td->td_samplesperpixel == 4)
-				sp->cinfo.c.in_color_space = JCS_CMYK;
-			else
-				sp->cinfo.c.in_color_space = JCS_UNKNOWN;
-		}
-	} else {
-		sp->cinfo.c.input_components = 1;
-		sp->cinfo.c.in_color_space = JCS_UNKNOWN;
-	}
-	if (!TIFFjpeg_set_defaults(sp))
-		return (0);
-	/* Set per-file parameters */
-	switch (sp->photometric) {
-	case PHOTOMETRIC_YCBCR:
-		sp->h_sampling = td->td_ycbcrsubsampling[0];
-		sp->v_sampling = td->td_ycbcrsubsampling[1];
-                if( sp->h_sampling == 0 || sp->v_sampling == 0 )
-                {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                            "Invalig horizontal/vertical sampling value");
-                    return (0);
-                }
-                if( td->td_bitspersample > 16 )
+static int JPEGSetupEncode(TIFF *tif)
+{
+    JPEGState *sp = JState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
+    static const char module[] = "JPEGSetupEncode";
+
+#if defined(JPEG_DUAL_MODE_8_12) && !defined(FROM_TIF_JPEG_12)
+    if (tif->tif_dir.td_bitspersample == 12)
+    {
+        /* We pass a pointer to a copy of otherSettings, since */
+        /* TIFFReInitJPEG_12() will clear sp */
+        JPEGOtherSettings savedOtherSettings = sp->otherSettings;
+        return TIFFReInitJPEG_12(tif, &savedOtherSettings, COMPRESSION_JPEG, 1);
+    }
+#endif
+
+    JPEGInitializeLibJPEG(tif, FALSE);
+
+    assert(sp != NULL);
+    assert(!sp->cinfo.comm.is_decompressor);
+
+    sp->photometric = td->td_photometric;
+
+    /*
+     * Initialize all JPEG parameters to default values.
+     * Note that jpeg_set_defaults needs legal values for
+     * in_color_space and input_components.
+     */
+    if (td->td_planarconfig == PLANARCONFIG_CONTIG)
+    {
+        sp->cinfo.c.input_components = td->td_samplesperpixel;
+        if (sp->photometric == PHOTOMETRIC_YCBCR)
+        {
+            if (sp->otherSettings.jpegcolormode == JPEGCOLORMODE_RGB)
+            {
+                sp->cinfo.c.in_color_space = JCS_RGB;
+            }
+            else
+            {
+                sp->cinfo.c.in_color_space = JCS_YCbCr;
+            }
+        }
+        else
+        {
+            if ((td->td_photometric == PHOTOMETRIC_MINISWHITE ||
+                 td->td_photometric == PHOTOMETRIC_MINISBLACK) &&
+                td->td_samplesperpixel == 1)
+                sp->cinfo.c.in_color_space = JCS_GRAYSCALE;
+            else if (td->td_photometric == PHOTOMETRIC_RGB &&
+                     td->td_samplesperpixel == 3)
+                sp->cinfo.c.in_color_space = JCS_RGB;
+            else if (td->td_photometric == PHOTOMETRIC_SEPARATED &&
+                     td->td_samplesperpixel == 4)
+                sp->cinfo.c.in_color_space = JCS_CMYK;
+            else
+                sp->cinfo.c.in_color_space = JCS_UNKNOWN;
+        }
+    }
+    else
+    {
+        sp->cinfo.c.input_components = 1;
+        sp->cinfo.c.in_color_space = JCS_UNKNOWN;
+    }
+    if (!TIFFjpeg_set_defaults(sp))
+        return (0);
+
+    /* mozjpeg by default enables progressive JPEG, which is illegal in
+     * JPEG-in-TIFF */
+    /* So explicitly disable it. */
+    if (sp->cinfo.c.num_scans != 0 &&
+        (sp->otherSettings.jpegtablesmode & JPEGTABLESMODE_HUFF) != 0)
+    {
+        /* it has been found that mozjpeg could create corrupt strips/tiles */
+        /* in non optimize_coding mode. */
+        TIFFWarningExtR(
+            tif, module,
+            "mozjpeg library likely detected. Disable emission of "
+            "Huffman tables in JpegTables tag, and use optimize_coding "
+            "to avoid potential issues");
+        sp->otherSettings.jpegtablesmode &= ~JPEGTABLESMODE_HUFF;
+    }
+    sp->cinfo.c.num_scans = 0;
+    sp->cinfo.c.scan_info = NULL;
+
+    /* Set per-file parameters */
+    switch (sp->photometric)
+    {
+        case PHOTOMETRIC_YCBCR:
+            sp->h_sampling = td->td_ycbcrsubsampling[0];
+            sp->v_sampling = td->td_ycbcrsubsampling[1];
+            if (sp->h_sampling == 0 || sp->v_sampling == 0)
+            {
+                TIFFErrorExtR(tif, module,
+                              "Invalig horizontal/vertical sampling value");
+                return (0);
+            }
+            if (td->td_bitspersample > 16)
+            {
+                TIFFErrorExtR(tif, module,
+                              "BitsPerSample %" PRIu16 " not allowed for JPEG",
+                              td->td_bitspersample);
+                return (0);
+            }
+
+            /*
+             * A ReferenceBlackWhite field *must* be present since the
+             * default value is inappropriate for YCbCr.  Fill in the
+             * proper value if application didn't set it.
+             */
+            {
+                float *ref;
+                if (!TIFFGetField(tif, TIFFTAG_REFERENCEBLACKWHITE, &ref))
                 {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                                 "BitsPerSample %d not allowed for JPEG",
-                                 td->td_bitspersample);
-                    return (0);
+                    float refbw[6];
+                    long top = 1L << td->td_bitspersample;
+                    refbw[0] = 0;
+                    refbw[1] = (float)(top - 1L);
+                    refbw[2] = (float)(top >> 1);
+                    refbw[3] = refbw[1];
+                    refbw[4] = refbw[2];
+                    refbw[5] = refbw[1];
+                    TIFFSetField(tif, TIFFTAG_REFERENCEBLACKWHITE, refbw);
                 }
+            }
+            break;
+        case PHOTOMETRIC_PALETTE: /* disallowed by Tech Note */
+        case PHOTOMETRIC_MASK:
+            TIFFErrorExtR(tif, module,
+                          "PhotometricInterpretation %" PRIu16
+                          " not allowed for JPEG",
+                          sp->photometric);
+            return (0);
+        default:
+            /* TIFF 6.0 forbids subsampling of all other color spaces */
+            sp->h_sampling = 1;
+            sp->v_sampling = 1;
+            break;
+    }
+
+        /* Verify miscellaneous parameters */
 
-		/*
-		 * A ReferenceBlackWhite field *must* be present since the
-		 * default value is inappropriate for YCbCr.  Fill in the
-		 * proper value if application didn't set it.
-		 */
-		{
-			float *ref;
-			if (!TIFFGetField(tif, TIFFTAG_REFERENCEBLACKWHITE,
-					  &ref)) {
-				float refbw[6];
-				long top = 1L << td->td_bitspersample;
-				refbw[0] = 0;
-				refbw[1] = (float)(top-1L);
-				refbw[2] = (float)(top>>1);
-				refbw[3] = refbw[1];
-				refbw[4] = refbw[2];
-				refbw[5] = refbw[1];
-				TIFFSetField(tif, TIFFTAG_REFERENCEBLACKWHITE,
-					     refbw);
-			}
-		}
-		break;
-	case PHOTOMETRIC_PALETTE:		/* disallowed by Tech Note */
-	case PHOTOMETRIC_MASK:
-		TIFFErrorExt(tif->tif_clientdata, module,
-			  "PhotometricInterpretation %d not allowed for JPEG",
-			  (int) sp->photometric);
-		return (0);
-	default:
-		/* TIFF 6.0 forbids subsampling of all other color spaces */
-		sp->h_sampling = 1;
-		sp->v_sampling = 1;
-		break;
-	}
-
-	/* Verify miscellaneous parameters */
-
-	/*
-	 * This would need work if libtiff ever supports different
-	 * depths for different components, or if libjpeg ever supports
-	 * run-time selection of depth.  Neither is imminent.
-	 */
+        /*
+         * This would need work if libtiff ever supports different
+         * depths for different components, or if libjpeg ever supports
+         * run-time selection of depth.  Neither is imminent.
+         */
 #ifdef JPEG_LIB_MK1
-        /* BITS_IN_JSAMPLE now permits 8 and 12 --- dgilbert */
-	if (td->td_bitspersample != 8 && td->td_bitspersample != 12) 
+    /* BITS_IN_JSAMPLE now permits 8 and 12 --- dgilbert */
+    if (td->td_bitspersample != 8 && td->td_bitspersample != 12)
 #else
-	if (td->td_bitspersample != BITS_IN_JSAMPLE )
+    if (td->td_bitspersample != BITS_IN_JSAMPLE)
 #endif
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "BitsPerSample %d not allowed for JPEG",
-			  (int) td->td_bitspersample);
-		return (0);
-	}
-	sp->cinfo.c.data_precision = td->td_bitspersample;
+    {
+        TIFFErrorExtR(tif, module,
+                      "BitsPerSample %" PRIu16 " not allowed for JPEG",
+                      td->td_bitspersample);
+        return (0);
+    }
+    sp->cinfo.c.data_precision = td->td_bitspersample;
 #ifdef JPEG_LIB_MK1
-        sp->cinfo.c.bits_in_jsample = td->td_bitspersample;
+    sp->cinfo.c.bits_in_jsample = td->td_bitspersample;
 #endif
-	if (isTiled(tif)) {
-		if ((td->td_tilelength % (sp->v_sampling * DCTSIZE)) != 0) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-				  "JPEG tile height must be multiple of %d",
-				  sp->v_sampling * DCTSIZE);
-			return (0);
-		}
-		if ((td->td_tilewidth % (sp->h_sampling * DCTSIZE)) != 0) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-				  "JPEG tile width must be multiple of %d",
-				  sp->h_sampling * DCTSIZE);
-			return (0);
-		}
-	} else {
-		if (td->td_rowsperstrip < td->td_imagelength &&
-		    (td->td_rowsperstrip % (sp->v_sampling * DCTSIZE)) != 0) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-				  "RowsPerStrip must be multiple of %d for JPEG",
-				  sp->v_sampling * DCTSIZE);
-			return (0);
-		}
-	}
-
-	/* Create a JPEGTables field if appropriate */
-	if (sp->jpegtablesmode & (JPEGTABLESMODE_QUANT|JPEGTABLESMODE_HUFF)) {
-                if( sp->jpegtables == NULL
-                    || memcmp(sp->jpegtables,"\0\0\0\0\0\0\0\0\0",8) == 0 )
-                {
-                        if (!prepare_JPEGTables(tif))
-                                return (0);
-                        /* Mark the field present */
-                        /* Can't use TIFFSetField since BEENWRITING is already set! */
-                        tif->tif_flags |= TIFF_DIRTYDIRECT;
-                        TIFFSetFieldBit(tif, FIELD_JPEGTABLES);
-                }
-	} else {
-		/* We do not support application-supplied JPEGTables, */
-		/* so mark the field not present */
-		TIFFClrFieldBit(tif, FIELD_JPEGTABLES);
-	}
+    if (isTiled(tif))
+    {
+        if ((td->td_tilelength % (sp->v_sampling * DCTSIZE)) != 0)
+        {
+            TIFFErrorExtR(tif, module,
+                          "JPEG tile height must be multiple of %" PRIu32,
+                          (uint32_t)(sp->v_sampling * DCTSIZE));
+            return (0);
+        }
+        if ((td->td_tilewidth % (sp->h_sampling * DCTSIZE)) != 0)
+        {
+            TIFFErrorExtR(tif, module,
+                          "JPEG tile width must be multiple of %" PRIu32,
+                          (uint32_t)(sp->h_sampling * DCTSIZE));
+            return (0);
+        }
+    }
+    else
+    {
+        if (td->td_rowsperstrip < td->td_imagelength &&
+            (td->td_rowsperstrip % (sp->v_sampling * DCTSIZE)) != 0)
+        {
+            TIFFErrorExtR(tif, module,
+                          "RowsPerStrip must be multiple of %" PRIu32
+                          " for JPEG",
+                          (uint32_t)(sp->v_sampling * DCTSIZE));
+            return (0);
+        }
+    }
 
-	/* Direct libjpeg output to libtiff's output buffer */
-	TIFFjpeg_data_dest(sp, tif);
+    /* Create a JPEGTables field if appropriate */
+    if (sp->otherSettings.jpegtablesmode &
+        (JPEGTABLESMODE_QUANT | JPEGTABLESMODE_HUFF))
+    {
+        if (sp->otherSettings.jpegtables == NULL ||
+            memcmp(sp->otherSettings.jpegtables, "\0\0\0\0\0\0\0\0\0", 8) == 0)
+        {
+#if defined(JPEG_LIB_VERSION_MAJOR) &&                                         \
+    (JPEG_LIB_VERSION_MAJOR > 9 ||                                             \
+     (JPEG_LIB_VERSION_MAJOR == 9 && JPEG_LIB_VERSION_MINOR >= 4))
+            if ((sp->otherSettings.jpegtablesmode & JPEGTABLESMODE_HUFF) != 0 &&
+                (sp->cinfo.c.dc_huff_tbl_ptrs[0] == NULL ||
+                 sp->cinfo.c.dc_huff_tbl_ptrs[1] == NULL ||
+                 sp->cinfo.c.ac_huff_tbl_ptrs[0] == NULL ||
+                 sp->cinfo.c.ac_huff_tbl_ptrs[1] == NULL))
+            {
+                /* libjpeg-9d no longer initializes default Huffman tables in */
+                /* jpeg_set_defaults() */
+                TIFF_std_huff_tables(&sp->cinfo.c);
+            }
+#endif
 
-	return (1);
+            if (!prepare_JPEGTables(tif))
+                return (0);
+            /* Mark the field present */
+            /* Can't use TIFFSetField since BEENWRITING is already set! */
+            tif->tif_flags |= TIFF_DIRTYDIRECT;
+            TIFFSetFieldBit(tif, FIELD_JPEGTABLES);
+        }
+    }
+    else
+    {
+        /* We do not support application-supplied JPEGTables, */
+        /* so mark the field not present */
+        TIFFClrFieldBit(tif, FIELD_JPEGTABLES);
+    }
+
+    /* Direct libjpeg output to libtiff's output buffer */
+    TIFFjpeg_data_dest(sp, tif);
+
+    return (1);
 }
 
 /*
  * Set encoding state at the start of a strip or tile.
  */
-static int
-JPEGPreEncode(TIFF* tif, uint16 s)
-{
-	JPEGState *sp = JState(tif);
-	TIFFDirectory *td = &tif->tif_dir;
-	static const char module[] = "JPEGPreEncode";
-	uint32 segment_width, segment_height;
-	int downsampled_input;
-
-	assert(sp != NULL);
-  
-	if (sp->cinfo.comm.is_decompressor == 1)
-	{
-		tif->tif_setupencode( tif );
-	}
-  
-	assert(!sp->cinfo.comm.is_decompressor);
-	/*
-	 * Set encoding parameters for this strip/tile.
-	 */
-	if (isTiled(tif)) {
-		segment_width = td->td_tilewidth;
-		segment_height = td->td_tilelength;
-		sp->bytesperline = TIFFTileRowSize(tif);
-	} else {
-		segment_width = td->td_imagewidth;
-		segment_height = td->td_imagelength - tif->tif_row;
-		if (segment_height > td->td_rowsperstrip)
-			segment_height = td->td_rowsperstrip;
-		sp->bytesperline = TIFFScanlineSize(tif);
-	}
-	if (td->td_planarconfig == PLANARCONFIG_SEPARATE && s > 0) {
-		/* for PC 2, scale down the strip/tile size
-		 * to match a downsampled component
-		 */
-		segment_width = TIFFhowmany_32(segment_width, sp->h_sampling); 
-		segment_height = TIFFhowmany_32(segment_height, sp->v_sampling);
-	}
-	if (segment_width > 65535 || segment_height > 65535) {
-		TIFFErrorExt(tif->tif_clientdata, module, "Strip/tile too large for JPEG");
-		return (0);
-	}
-	sp->cinfo.c.image_width = segment_width;
-	sp->cinfo.c.image_height = segment_height;
-	downsampled_input = FALSE;
-	if (td->td_planarconfig == PLANARCONFIG_CONTIG) {
-		sp->cinfo.c.input_components = td->td_samplesperpixel;
-		if (sp->photometric == PHOTOMETRIC_YCBCR) {
-			if (sp->jpegcolormode != JPEGCOLORMODE_RGB) {
-				if (sp->h_sampling != 1 || sp->v_sampling != 1)
-					downsampled_input = TRUE;
-			}
-			if (!TIFFjpeg_set_colorspace(sp, JCS_YCbCr))
-				return (0);
-			/*
-			 * Set Y sampling factors;
-			 * we assume jpeg_set_colorspace() set the rest to 1
-			 */
-			sp->cinfo.c.comp_info[0].h_samp_factor = sp->h_sampling;
-			sp->cinfo.c.comp_info[0].v_samp_factor = sp->v_sampling;
-		} else {
-			if (!TIFFjpeg_set_colorspace(sp, sp->cinfo.c.in_color_space))
-				return (0);
-			/* jpeg_set_colorspace set all sampling factors to 1 */
-		}
-	} else {
-		if (!TIFFjpeg_set_colorspace(sp, JCS_UNKNOWN))
-			return (0);
-		sp->cinfo.c.comp_info[0].component_id = s;
-		/* jpeg_set_colorspace() set sampling factors to 1 */
-		if (sp->photometric == PHOTOMETRIC_YCBCR && s > 0) {
-			sp->cinfo.c.comp_info[0].quant_tbl_no = 1;
-			sp->cinfo.c.comp_info[0].dc_tbl_no = 1;
-			sp->cinfo.c.comp_info[0].ac_tbl_no = 1;
-		}
-	}
-	/* ensure libjpeg won't write any extraneous markers */
-	sp->cinfo.c.write_JFIF_header = FALSE;
-	sp->cinfo.c.write_Adobe_marker = FALSE;
-	/* set up table handling correctly */
-	/* calling TIFFjpeg_set_quality() causes quantization tables to be flagged */
-	/* as being to be emitted, which we don't want in the JPEGTABLESMODE_QUANT */
-	/* mode, so we must manually suppress them. However TIFFjpeg_set_quality() */
-	/* should really be called when dealing with files with directories with */
-	/* mixed qualities. see http://trac.osgeo.org/gdal/ticket/3539 */
-	if (!TIFFjpeg_set_quality(sp, sp->jpegquality, FALSE))
-		return (0);
-	if (sp->jpegtablesmode & JPEGTABLESMODE_QUANT) {
-		suppress_quant_table(sp, 0);
-		suppress_quant_table(sp, 1);
-	}
-	else {
-		unsuppress_quant_table(sp, 0);
-		unsuppress_quant_table(sp, 1);
-	}
-	if (sp->jpegtablesmode & JPEGTABLESMODE_HUFF)
-	{
-		/* Explicit suppression is only needed if we did not go through the */
-		/* prepare_JPEGTables() code path, which may be the case if updating */
-		/* an existing file */
-		suppress_huff_table(sp, 0);
-		suppress_huff_table(sp, 1);
-		sp->cinfo.c.optimize_coding = FALSE;
-	}
-	else
-		sp->cinfo.c.optimize_coding = TRUE;
-	if (downsampled_input) {
-		/* Need to use raw-data interface to libjpeg */
-		sp->cinfo.c.raw_data_in = TRUE;
-		tif->tif_encoderow = JPEGEncodeRaw;
-		tif->tif_encodestrip = JPEGEncodeRaw;
-		tif->tif_encodetile = JPEGEncodeRaw;
-	} else {
-		/* Use normal interface to libjpeg */
-		sp->cinfo.c.raw_data_in = FALSE;
-		tif->tif_encoderow = JPEGEncode;
-		tif->tif_encodestrip = JPEGEncode;
-		tif->tif_encodetile = JPEGEncode;
-	}
-	/* Start JPEG compressor */
-	if (!TIFFjpeg_start_compress(sp, FALSE))
-		return (0);
-	/* Allocate downsampled-data buffers if needed */
-	if (downsampled_input) {
-		if (!alloc_downsampled_buffers(tif, sp->cinfo.c.comp_info,
-					       sp->cinfo.c.num_components))
-			return (0);
-	}
-	sp->scancount = 0;
-
-	return (1);
+static int JPEGPreEncode(TIFF *tif, uint16_t s)
+{
+    JPEGState *sp = JState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
+    static const char module[] = "JPEGPreEncode";
+    uint32_t segment_width, segment_height;
+    int downsampled_input;
+
+    assert(sp != NULL);
+
+    if (sp->cinfo.comm.is_decompressor == 1)
+    {
+        tif->tif_setupencode(tif);
+    }
+
+    assert(!sp->cinfo.comm.is_decompressor);
+    /*
+     * Set encoding parameters for this strip/tile.
+     */
+    if (isTiled(tif))
+    {
+        segment_width = td->td_tilewidth;
+        segment_height = td->td_tilelength;
+        sp->bytesperline = TIFFTileRowSize(tif);
+    }
+    else
+    {
+        segment_width = td->td_imagewidth;
+        segment_height = td->td_imagelength - tif->tif_row;
+        if (segment_height > td->td_rowsperstrip)
+            segment_height = td->td_rowsperstrip;
+        sp->bytesperline = TIFFScanlineSize(tif);
+    }
+    if (td->td_planarconfig == PLANARCONFIG_SEPARATE && s > 0)
+    {
+        /* for PC 2, scale down the strip/tile size
+         * to match a downsampled component
+         */
+        segment_width = TIFFhowmany_32(segment_width, sp->h_sampling);
+        segment_height = TIFFhowmany_32(segment_height, sp->v_sampling);
+    }
+    if (segment_width > 65535 || segment_height > 65535)
+    {
+        TIFFErrorExtR(tif, module, "Strip/tile too large for JPEG");
+        return (0);
+    }
+    sp->cinfo.c.image_width = segment_width;
+    sp->cinfo.c.image_height = segment_height;
+    downsampled_input = FALSE;
+    if (td->td_planarconfig == PLANARCONFIG_CONTIG)
+    {
+        sp->cinfo.c.input_components = td->td_samplesperpixel;
+        if (sp->photometric == PHOTOMETRIC_YCBCR)
+        {
+            if (sp->otherSettings.jpegcolormode != JPEGCOLORMODE_RGB)
+            {
+                if (sp->h_sampling != 1 || sp->v_sampling != 1)
+                    downsampled_input = TRUE;
+            }
+            if (!TIFFjpeg_set_colorspace(sp, JCS_YCbCr))
+                return (0);
+            /*
+             * Set Y sampling factors;
+             * we assume jpeg_set_colorspace() set the rest to 1
+             */
+            sp->cinfo.c.comp_info[0].h_samp_factor = sp->h_sampling;
+            sp->cinfo.c.comp_info[0].v_samp_factor = sp->v_sampling;
+        }
+        else
+        {
+            if (!TIFFjpeg_set_colorspace(sp, sp->cinfo.c.in_color_space))
+                return (0);
+            /* jpeg_set_colorspace set all sampling factors to 1 */
+        }
+    }
+    else
+    {
+        if (!TIFFjpeg_set_colorspace(sp, JCS_UNKNOWN))
+            return (0);
+        sp->cinfo.c.comp_info[0].component_id = s;
+        /* jpeg_set_colorspace() set sampling factors to 1 */
+        if (sp->photometric == PHOTOMETRIC_YCBCR && s > 0)
+        {
+            sp->cinfo.c.comp_info[0].quant_tbl_no = 1;
+            sp->cinfo.c.comp_info[0].dc_tbl_no = 1;
+            sp->cinfo.c.comp_info[0].ac_tbl_no = 1;
+        }
+    }
+    /* ensure libjpeg won't write any extraneous markers */
+    sp->cinfo.c.write_JFIF_header = FALSE;
+    sp->cinfo.c.write_Adobe_marker = FALSE;
+    /* set up table handling correctly */
+    /* calling TIFFjpeg_set_quality() causes quantization tables to be flagged
+     */
+    /* as being to be emitted, which we don't want in the JPEGTABLESMODE_QUANT
+     */
+    /* mode, so we must manually suppress them. However TIFFjpeg_set_quality()
+     */
+    /* should really be called when dealing with files with directories with */
+    /* mixed qualities. see http://trac.osgeo.org/gdal/ticket/3539 */
+    if (!TIFFjpeg_set_quality(sp, sp->otherSettings.jpegquality, FALSE))
+        return (0);
+    if (sp->otherSettings.jpegtablesmode & JPEGTABLESMODE_QUANT)
+    {
+        suppress_quant_table(sp, 0);
+        suppress_quant_table(sp, 1);
+    }
+    else
+    {
+        unsuppress_quant_table(sp, 0);
+        unsuppress_quant_table(sp, 1);
+    }
+    if (sp->otherSettings.jpegtablesmode & JPEGTABLESMODE_HUFF)
+    {
+        /* Explicit suppression is only needed if we did not go through the */
+        /* prepare_JPEGTables() code path, which may be the case if updating */
+        /* an existing file */
+        suppress_huff_table(sp, 0);
+        suppress_huff_table(sp, 1);
+        sp->cinfo.c.optimize_coding = FALSE;
+    }
+    else
+        sp->cinfo.c.optimize_coding = TRUE;
+    if (downsampled_input)
+    {
+        /* Need to use raw-data interface to libjpeg */
+        sp->cinfo.c.raw_data_in = TRUE;
+        tif->tif_encoderow = JPEGEncodeRaw;
+        tif->tif_encodestrip = JPEGEncodeRaw;
+        tif->tif_encodetile = JPEGEncodeRaw;
+    }
+    else
+    {
+        /* Use normal interface to libjpeg */
+        sp->cinfo.c.raw_data_in = FALSE;
+        tif->tif_encoderow = JPEGEncode;
+        tif->tif_encodestrip = JPEGEncode;
+        tif->tif_encodetile = JPEGEncode;
+    }
+    /* Start JPEG compressor */
+    if (!TIFFjpeg_start_compress(sp, FALSE))
+        return (0);
+    /* Allocate downsampled-data buffers if needed */
+    if (downsampled_input)
+    {
+        if (!alloc_downsampled_buffers(tif, sp->cinfo.c.comp_info,
+                                       sp->cinfo.c.num_components))
+            return (0);
+    }
+    sp->scancount = 0;
+
+    return (1);
 }
 
 /*
  * Encode a chunk of pixels.
  * "Standard" case: incoming data is not downsampled.
  */
-static int
-JPEGEncode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s)
-{
-	JPEGState *sp = JState(tif);
-	tmsize_t nrows;
-	JSAMPROW bufptr[1];
-        short *line16 = NULL;
-        int    line16_count = 0;
-
-	(void) s;
-	assert(sp != NULL);
-	/* data is expected to be supplied in multiples of a scanline */
-	nrows = cc / sp->bytesperline;
-	if (cc % sp->bytesperline)
-            TIFFWarningExt(tif->tif_clientdata, tif->tif_name, 
-                           "fractional scanline discarded");
-
-        /* The last strip will be limited to image size */
-        if( !isTiled(tif) && tif->tif_row+nrows > tif->tif_dir.td_imagelength )
-            nrows = tif->tif_dir.td_imagelength - tif->tif_row;
-
-        if( sp->cinfo.c.data_precision == 12 )
+static int JPEGEncode(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s)
+{
+    JPEGState *sp = JState(tif);
+    tmsize_t nrows;
+    TIFF_JSAMPROW bufptr[1];
+    short *line16 = NULL;
+    int line16_count = 0;
+
+    (void)s;
+    assert(sp != NULL);
+    /* data is expected to be supplied in multiples of a scanline */
+    nrows = cc / sp->bytesperline;
+    if (cc % sp->bytesperline)
+        TIFFWarningExtR(tif, tif->tif_name, "fractional scanline discarded");
+
+    /* The last strip will be limited to image size */
+    if (!isTiled(tif) && tif->tif_row + nrows > tif->tif_dir.td_imagelength)
+        nrows = tif->tif_dir.td_imagelength - tif->tif_row;
+
+    if (sp->cinfo.c.data_precision == 12)
+    {
+        line16_count = (int)((sp->bytesperline * 2) / 3);
+        line16 = (short *)_TIFFmallocExt(tif, sizeof(short) * line16_count);
+        if (!line16)
         {
-            line16_count = (int)((sp->bytesperline * 2) / 3);
-            line16 = (short *) _TIFFmalloc(sizeof(short) * line16_count);
-            if (!line16)
-            {
-                TIFFErrorExt(tif->tif_clientdata,
-			     "JPEGEncode",
-                             "Failed to allocate memory");
+            TIFFErrorExtR(tif, "JPEGEncode", "Failed to allocate memory");
 
-                return 0;
-            }
+            return 0;
         }
-            
-	while (nrows-- > 0) {
+    }
 
-            if( sp->cinfo.c.data_precision == 12 )
-            {
+    while (nrows-- > 0)
+    {
 
-                int value_pairs = line16_count / 2;
-                int iPair;
+        if (sp->cinfo.c.data_precision == 12)
+        {
 
-		bufptr[0] = (JSAMPROW) line16;
+            int value_pairs = line16_count / 2;
+            int iPair;
 
-                for( iPair = 0; iPair < value_pairs; iPair++ )
-                {
-                    unsigned char *in_ptr =
-                        ((unsigned char *) buf) + iPair * 3;
-                    JSAMPLE *out_ptr = (JSAMPLE *) (line16 + iPair * 2);
+            bufptr[0] = (TIFF_JSAMPROW)line16;
 
-                    out_ptr[0] = (in_ptr[0] << 4) | ((in_ptr[1] & 0xf0) >> 4);
-                    out_ptr[1] = ((in_ptr[1] & 0x0f) << 8) | in_ptr[2];
-                }
-            }
-            else
+            for (iPair = 0; iPair < value_pairs; iPair++)
             {
-		bufptr[0] = (JSAMPROW) buf;
-            }
-            if (TIFFjpeg_write_scanlines(sp, bufptr, 1) != 1)
-                return (0);
-            if (nrows > 0)
-                tif->tif_row++;
-            buf += sp->bytesperline;
-	}
+                unsigned char *in_ptr = ((unsigned char *)buf) + iPair * 3;
+                TIFF_JSAMPLE *out_ptr = (TIFF_JSAMPLE *)(line16 + iPair * 2);
 
-        if( sp->cinfo.c.data_precision == 12 )
+                out_ptr[0] = (in_ptr[0] << 4) | ((in_ptr[1] & 0xf0) >> 4);
+                out_ptr[1] = ((in_ptr[1] & 0x0f) << 8) | in_ptr[2];
+            }
+        }
+        else
         {
-            _TIFFfree( line16 );
+            bufptr[0] = (TIFF_JSAMPROW)buf;
         }
-            
-	return (1);
+        if (TIFFjpeg_write_scanlines(sp, bufptr, 1) != 1)
+            return (0);
+        if (nrows > 0)
+            tif->tif_row++;
+        buf += sp->bytesperline;
+    }
+
+    if (sp->cinfo.c.data_precision == 12)
+    {
+        _TIFFfreeExt(tif, line16);
+    }
+
+    return (1);
 }
 
 /*
  * Encode a chunk of pixels.
  * Incoming data is expected to be downsampled per sampling factors.
  */
-static int
-JPEGEncodeRaw(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s)
-{
-	JPEGState *sp = JState(tif);
-	JSAMPLE* inptr;
-	JSAMPLE* outptr;
-	tmsize_t nrows;
-	JDIMENSION clumps_per_line, nclump;
-	int clumpoffset, ci, xpos, ypos;
-	jpeg_component_info* compptr;
-	int samples_per_clump = sp->samplesperclump;
-	tmsize_t bytesperclumpline;
-
-	(void) s;
-	assert(sp != NULL);
-	/* data is expected to be supplied in multiples of a clumpline */
-	/* a clumpline is equivalent to v_sampling desubsampled scanlines */
-	/* TODO: the following calculation of bytesperclumpline, should substitute calculation of sp->bytesperline, except that it is per v_sampling lines */
-	bytesperclumpline = ((((tmsize_t)sp->cinfo.c.image_width+sp->h_sampling-1)/sp->h_sampling)
-			     *((tmsize_t)sp->h_sampling*sp->v_sampling+2)*sp->cinfo.c.data_precision+7)
-			    /8;
-
-	nrows = ( cc / bytesperclumpline ) * sp->v_sampling;
-	if (cc % bytesperclumpline)
-		TIFFWarningExt(tif->tif_clientdata, tif->tif_name, "fractional scanline discarded");
-
-	/* Cb,Cr both have sampling factors 1, so this is correct */
-	clumps_per_line = sp->cinfo.c.comp_info[1].downsampled_width;
-
-	while (nrows > 0) {
-		/*
-		 * Fastest way to separate the data is to make one pass
-		 * over the scanline for each row of each component.
-		 */
-		clumpoffset = 0;		/* first sample in clump */
-		for (ci = 0, compptr = sp->cinfo.c.comp_info;
-		     ci < sp->cinfo.c.num_components;
-		     ci++, compptr++) {
-		    int hsamp = compptr->h_samp_factor;
-		    int vsamp = compptr->v_samp_factor;
-		    int padding = (int) (compptr->width_in_blocks * DCTSIZE -
-					 clumps_per_line * hsamp);
-		    for (ypos = 0; ypos < vsamp; ypos++) {
-			inptr = ((JSAMPLE*) buf) + clumpoffset;
-			outptr = sp->ds_buffer[ci][sp->scancount*vsamp + ypos];
-			if (hsamp == 1) {
-			    /* fast path for at least Cb and Cr */
-			    for (nclump = clumps_per_line; nclump-- > 0; ) {
-				*outptr++ = inptr[0];
-				inptr += samples_per_clump;
-			    }
-			} else {
-			    /* general case */
-			    for (nclump = clumps_per_line; nclump-- > 0; ) {
-				for (xpos = 0; xpos < hsamp; xpos++)
-				    *outptr++ = inptr[xpos];
-				inptr += samples_per_clump;
-			    }
-			}
-			/* pad each scanline as needed */
-			for (xpos = 0; xpos < padding; xpos++) {
-			    *outptr = outptr[-1];
-			    outptr++;
-			}
-			clumpoffset += hsamp;
-		    }
-		}
-		sp->scancount++;
-		if (sp->scancount >= DCTSIZE) {
-			int n = sp->cinfo.c.max_v_samp_factor * DCTSIZE;
-			if (TIFFjpeg_write_raw_data(sp, sp->ds_buffer, n) != n)
-				return (0);
-			sp->scancount = 0;
-		}
-		tif->tif_row += sp->v_sampling;
-		buf += bytesperclumpline;
-		nrows -= sp->v_sampling;
-	}
-	return (1);
+static int JPEGEncodeRaw(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s)
+{
+    JPEGState *sp = JState(tif);
+    TIFF_JSAMPLE *inptr;
+    TIFF_JSAMPLE *outptr;
+    tmsize_t nrows;
+    JDIMENSION clumps_per_line, nclump;
+    int clumpoffset, ci, xpos, ypos;
+    jpeg_component_info *compptr;
+    int samples_per_clump = sp->samplesperclump;
+    tmsize_t bytesperclumpline;
+
+    (void)s;
+    assert(sp != NULL);
+    /* data is expected to be supplied in multiples of a clumpline */
+    /* a clumpline is equivalent to v_sampling desubsampled scanlines */
+    /* TODO: the following calculation of bytesperclumpline, should substitute
+     * calculation of sp->bytesperline, except that it is per v_sampling lines
+     */
+    bytesperclumpline =
+        ((((tmsize_t)sp->cinfo.c.image_width + sp->h_sampling - 1) /
+          sp->h_sampling) *
+             ((tmsize_t)sp->h_sampling * sp->v_sampling + 2) *
+             sp->cinfo.c.data_precision +
+         7) /
+        8;
+
+    nrows = (cc / bytesperclumpline) * sp->v_sampling;
+    if (cc % bytesperclumpline)
+        TIFFWarningExtR(tif, tif->tif_name, "fractional scanline discarded");
+
+    /* Cb,Cr both have sampling factors 1, so this is correct */
+    clumps_per_line = sp->cinfo.c.comp_info[1].downsampled_width;
+
+    while (nrows > 0)
+    {
+        /*
+         * Fastest way to separate the data is to make one pass
+         * over the scanline for each row of each component.
+         */
+        clumpoffset = 0; /* first sample in clump */
+        for (ci = 0, compptr = sp->cinfo.c.comp_info;
+             ci < sp->cinfo.c.num_components; ci++, compptr++)
+        {
+            int hsamp = compptr->h_samp_factor;
+            int vsamp = compptr->v_samp_factor;
+            int padding = (int)(compptr->width_in_blocks * DCTSIZE -
+                                clumps_per_line * hsamp);
+            for (ypos = 0; ypos < vsamp; ypos++)
+            {
+                inptr = ((TIFF_JSAMPLE *)buf) + clumpoffset;
+                outptr = sp->ds_buffer[ci][sp->scancount * vsamp + ypos];
+                if (hsamp == 1)
+                {
+                    /* fast path for at least Cb and Cr */
+                    for (nclump = clumps_per_line; nclump-- > 0;)
+                    {
+                        *outptr++ = inptr[0];
+                        inptr += samples_per_clump;
+                    }
+                }
+                else
+                {
+                    /* general case */
+                    for (nclump = clumps_per_line; nclump-- > 0;)
+                    {
+                        for (xpos = 0; xpos < hsamp; xpos++)
+                            *outptr++ = inptr[xpos];
+                        inptr += samples_per_clump;
+                    }
+                }
+                /* pad each scanline as needed */
+                for (xpos = 0; xpos < padding; xpos++)
+                {
+                    *outptr = outptr[-1];
+                    outptr++;
+                }
+                clumpoffset += hsamp;
+            }
+        }
+        sp->scancount++;
+        if (sp->scancount >= DCTSIZE)
+        {
+            int n = sp->cinfo.c.max_v_samp_factor * DCTSIZE;
+            if (TIFFjpeg_write_raw_data(sp, sp->ds_buffer, n) != n)
+                return (0);
+            sp->scancount = 0;
+        }
+        tif->tif_row += sp->v_sampling;
+        buf += bytesperclumpline;
+        nrows -= sp->v_sampling;
+    }
+    return (1);
 }
 
 /*
  * Finish up at the end of a strip or tile.
  */
-static int
-JPEGPostEncode(TIFF* tif)
-{
-	JPEGState *sp = JState(tif);
-
-	if (sp->scancount > 0) {
-		/*
-		 * Need to emit a partial bufferload of downsampled data.
-		 * Pad the data vertically.
-		 */
-		int ci, ypos, n;
-		jpeg_component_info* compptr;
-
-		for (ci = 0, compptr = sp->cinfo.c.comp_info;
-		     ci < sp->cinfo.c.num_components;
-		     ci++, compptr++) {
-			int vsamp = compptr->v_samp_factor;
-			tmsize_t row_width = compptr->width_in_blocks * DCTSIZE
-				* sizeof(JSAMPLE);
-			for (ypos = sp->scancount * vsamp;
-			     ypos < DCTSIZE * vsamp; ypos++) {
-				_TIFFmemcpy((void*)sp->ds_buffer[ci][ypos],
-					    (void*)sp->ds_buffer[ci][ypos-1],
-					    row_width);
-
-			}
-		}
-		n = sp->cinfo.c.max_v_samp_factor * DCTSIZE;
-		if (TIFFjpeg_write_raw_data(sp, sp->ds_buffer, n) != n)
-			return (0);
-	}
-
-	return (TIFFjpeg_finish_compress(JState(tif)));
+static int JPEGPostEncode(TIFF *tif)
+{
+    JPEGState *sp = JState(tif);
+
+    if (sp->scancount > 0)
+    {
+        /*
+         * Need to emit a partial bufferload of downsampled data.
+         * Pad the data vertically.
+         */
+        int ci, ypos, n;
+        jpeg_component_info *compptr;
+
+        for (ci = 0, compptr = sp->cinfo.c.comp_info;
+             ci < sp->cinfo.c.num_components; ci++, compptr++)
+        {
+            int vsamp = compptr->v_samp_factor;
+            tmsize_t row_width =
+                compptr->width_in_blocks * DCTSIZE * sizeof(JSAMPLE);
+            for (ypos = sp->scancount * vsamp; ypos < DCTSIZE * vsamp; ypos++)
+            {
+                _TIFFmemcpy((void *)sp->ds_buffer[ci][ypos],
+                            (void *)sp->ds_buffer[ci][ypos - 1], row_width);
+            }
+        }
+        n = sp->cinfo.c.max_v_samp_factor * DCTSIZE;
+        if (TIFFjpeg_write_raw_data(sp, sp->ds_buffer, n) != n)
+            return (0);
+    }
+
+    return (TIFFjpeg_finish_compress(JState(tif)));
 }
 
-static void
-JPEGCleanup(TIFF* tif)
-{
-	JPEGState *sp = JState(tif);
-	
-	assert(sp != 0);
-
-	tif->tif_tagmethods.vgetfield = sp->vgetparent;
-	tif->tif_tagmethods.vsetfield = sp->vsetparent;
-	tif->tif_tagmethods.printdir = sp->printdir;
-        if( sp->cinfo_initialized )
-                TIFFjpeg_destroy(sp);	/* release libjpeg resources */
-        if (sp->jpegtables)		/* tag value */
-                _TIFFfree(sp->jpegtables);
-	_TIFFfree(tif->tif_data);	/* release local state */
-	tif->tif_data = NULL;
-
-	_TIFFSetDefaultCompressionState(tif);
-}
-
-static void 
-JPEGResetUpsampled( TIFF* tif )
-{
-	JPEGState* sp = JState(tif);
-	TIFFDirectory* td = &tif->tif_dir;
-
-	/*
-	 * Mark whether returned data is up-sampled or not so TIFFStripSize
-	 * and TIFFTileSize return values that reflect the true amount of
-	 * data.
-	 */
-	tif->tif_flags &= ~TIFF_UPSAMPLED;
-	if (td->td_planarconfig == PLANARCONFIG_CONTIG) {
-		if (td->td_photometric == PHOTOMETRIC_YCBCR &&
-		    sp->jpegcolormode == JPEGCOLORMODE_RGB) {
-			tif->tif_flags |= TIFF_UPSAMPLED;
-		} else {
+static void JPEGCleanup(TIFF *tif)
+{
+    JPEGState *sp = JState(tif);
+
+    assert(sp != 0);
+
+    tif->tif_tagmethods.vgetfield = sp->otherSettings.vgetparent;
+    tif->tif_tagmethods.vsetfield = sp->otherSettings.vsetparent;
+    tif->tif_tagmethods.printdir = sp->otherSettings.printdir;
+    if (sp->cinfo_initialized)
+        TIFFjpeg_destroy(sp);         /* release libjpeg resources */
+    if (sp->otherSettings.jpegtables) /* tag value */
+        _TIFFfreeExt(tif, sp->otherSettings.jpegtables);
+    _TIFFfreeExt(tif, tif->tif_data); /* release local state */
+    tif->tif_data = NULL;
+
+    _TIFFSetDefaultCompressionState(tif);
+}
+
+static void JPEGResetUpsampled(TIFF *tif)
+{
+    JPEGState *sp = JState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
+
+    /*
+     * Mark whether returned data is up-sampled or not so TIFFStripSize
+     * and TIFFTileSize return values that reflect the true amount of
+     * data.
+     */
+    tif->tif_flags &= ~TIFF_UPSAMPLED;
+    if (td->td_planarconfig == PLANARCONFIG_CONTIG)
+    {
+        if (td->td_photometric == PHOTOMETRIC_YCBCR &&
+            sp->otherSettings.jpegcolormode == JPEGCOLORMODE_RGB)
+        {
+            tif->tif_flags |= TIFF_UPSAMPLED;
+        }
+        else
+        {
 #ifdef notdef
-			if (td->td_ycbcrsubsampling[0] != 1 ||
-			    td->td_ycbcrsubsampling[1] != 1)
-				; /* XXX what about up-sampling? */
+            if (td->td_ycbcrsubsampling[0] != 1 ||
+                td->td_ycbcrsubsampling[1] != 1)
+                ; /* XXX what about up-sampling? */
 #endif
-		}
-	}
+        }
+    }
 
-	/*
-	 * Must recalculate cached tile size in case sampling state changed.
-	 * Should we really be doing this now if image size isn't set? 
-	 */
-        if( tif->tif_tilesize > 0 )
-            tif->tif_tilesize = isTiled(tif) ? TIFFTileSize(tif) : (tmsize_t)(-1);   
-        if( tif->tif_scanlinesize > 0 )
-            tif->tif_scanlinesize = TIFFScanlineSize(tif); 
+    /*
+     * Must recalculate cached tile size in case sampling state changed.
+     * Should we really be doing this now if image size isn't set?
+     */
+    if (tif->tif_tilesize > 0)
+        tif->tif_tilesize = isTiled(tif) ? TIFFTileSize(tif) : (tmsize_t)(-1);
+    if (tif->tif_scanlinesize > 0)
+        tif->tif_scanlinesize = TIFFScanlineSize(tif);
 }
 
-static int
-JPEGVSetField(TIFF* tif, uint32 tag, va_list ap)
-{
-	JPEGState* sp = JState(tif);
-	const TIFFField* fip;
-	uint32 v32;
-
-	assert(sp != NULL);
-
-	switch (tag) {
-	case TIFFTAG_JPEGTABLES:
-		v32 = (uint32) va_arg(ap, uint32);
-		if (v32 == 0) {
-			/* XXX */
-			return (0);
-		}
-		_TIFFsetByteArray(&sp->jpegtables, va_arg(ap, void*), v32);
-		sp->jpegtables_length = v32;
-		TIFFSetFieldBit(tif, FIELD_JPEGTABLES);
-		break;
-	case TIFFTAG_JPEGQUALITY:
-		sp->jpegquality = (int) va_arg(ap, int);
-		return (1);			/* pseudo tag */
-	case TIFFTAG_JPEGCOLORMODE:
-		sp->jpegcolormode = (int) va_arg(ap, int);
-		JPEGResetUpsampled( tif );
-		return (1);			/* pseudo tag */
-	case TIFFTAG_PHOTOMETRIC:
-	{
-		int ret_value = (*sp->vsetparent)(tif, tag, ap);
-		JPEGResetUpsampled( tif );
-		return ret_value;
-	}
-	case TIFFTAG_JPEGTABLESMODE:
-		sp->jpegtablesmode = (int) va_arg(ap, int);
-		return (1);			/* pseudo tag */
-	case TIFFTAG_YCBCRSUBSAMPLING:
-		/* mark the fact that we have a real ycbcrsubsampling! */
-		sp->ycbcrsampling_fetched = 1;
-		/* should we be recomputing upsampling info here? */
-		return (*sp->vsetparent)(tif, tag, ap);
-	default:
-		return (*sp->vsetparent)(tif, tag, ap);
-	}
-
-	if ((fip = TIFFFieldWithTag(tif, tag)) != NULL) {
-		TIFFSetFieldBit(tif, fip->field_bit);
-	} else {
-		return (0);
-	}
-
-	tif->tif_flags |= TIFF_DIRTYDIRECT;
-	return (1);
+static int JPEGVSetField(TIFF *tif, uint32_t tag, va_list ap)
+{
+    JPEGState *sp = JState(tif);
+    const TIFFField *fip;
+    uint32_t v32;
+
+    assert(sp != NULL);
+
+    switch (tag)
+    {
+        case TIFFTAG_JPEGTABLES:
+            v32 = (uint32_t)va_arg(ap, uint32_t);
+            if (v32 == 0)
+            {
+                /* XXX */
+                return (0);
+            }
+            _TIFFsetByteArrayExt(tif, &sp->otherSettings.jpegtables,
+                                 va_arg(ap, void *), v32);
+            sp->otherSettings.jpegtables_length = v32;
+            TIFFSetFieldBit(tif, FIELD_JPEGTABLES);
+            break;
+        case TIFFTAG_JPEGQUALITY:
+            sp->otherSettings.jpegquality = (int)va_arg(ap, int);
+            return (1); /* pseudo tag */
+        case TIFFTAG_JPEGCOLORMODE:
+            sp->otherSettings.jpegcolormode = (int)va_arg(ap, int);
+            JPEGResetUpsampled(tif);
+            return (1); /* pseudo tag */
+        case TIFFTAG_PHOTOMETRIC:
+        {
+            int ret_value = (*sp->otherSettings.vsetparent)(tif, tag, ap);
+            JPEGResetUpsampled(tif);
+            return ret_value;
+        }
+        case TIFFTAG_JPEGTABLESMODE:
+            sp->otherSettings.jpegtablesmode = (int)va_arg(ap, int);
+            return (1); /* pseudo tag */
+        case TIFFTAG_YCBCRSUBSAMPLING:
+            /* mark the fact that we have a real ycbcrsubsampling! */
+            sp->otherSettings.ycbcrsampling_fetched = 1;
+            /* should we be recomputing upsampling info here? */
+            return (*sp->otherSettings.vsetparent)(tif, tag, ap);
+        default:
+            return (*sp->otherSettings.vsetparent)(tif, tag, ap);
+    }
+
+    if ((fip = TIFFFieldWithTag(tif, tag)) != NULL)
+    {
+        TIFFSetFieldBit(tif, fip->field_bit);
+    }
+    else
+    {
+        return (0);
+    }
+
+    tif->tif_flags |= TIFF_DIRTYDIRECT;
+    return (1);
 }
 
-static int
-JPEGVGetField(TIFF* tif, uint32 tag, va_list ap)
-{
-	JPEGState* sp = JState(tif);
-
-	assert(sp != NULL);
-
-	switch (tag) {
-		case TIFFTAG_JPEGTABLES:
-			*va_arg(ap, uint32*) = sp->jpegtables_length;
-			*va_arg(ap, const void**) = sp->jpegtables;
-			break;
-		case TIFFTAG_JPEGQUALITY:
-			*va_arg(ap, int*) = sp->jpegquality;
-			break;
-		case TIFFTAG_JPEGCOLORMODE:
-			*va_arg(ap, int*) = sp->jpegcolormode;
-			break;
-		case TIFFTAG_JPEGTABLESMODE:
-			*va_arg(ap, int*) = sp->jpegtablesmode;
-			break;
-		default:
-			return (*sp->vgetparent)(tif, tag, ap);
-	}
-	return (1);
+static int JPEGVGetField(TIFF *tif, uint32_t tag, va_list ap)
+{
+    JPEGState *sp = JState(tif);
+
+    assert(sp != NULL);
+
+    switch (tag)
+    {
+        case TIFFTAG_JPEGTABLES:
+            *va_arg(ap, uint32_t *) = sp->otherSettings.jpegtables_length;
+            *va_arg(ap, const void **) = sp->otherSettings.jpegtables;
+            break;
+        case TIFFTAG_JPEGQUALITY:
+            *va_arg(ap, int *) = sp->otherSettings.jpegquality;
+            break;
+        case TIFFTAG_JPEGCOLORMODE:
+            *va_arg(ap, int *) = sp->otherSettings.jpegcolormode;
+            break;
+        case TIFFTAG_JPEGTABLESMODE:
+            *va_arg(ap, int *) = sp->otherSettings.jpegtablesmode;
+            break;
+        default:
+            return (*sp->otherSettings.vgetparent)(tif, tag, ap);
+    }
+    return (1);
 }
 
-static void
-JPEGPrintDir(TIFF* tif, FILE* fd, long flags)
+static void JPEGPrintDir(TIFF *tif, FILE *fd, long flags)
 {
-	JPEGState* sp = JState(tif);
+    JPEGState *sp = JState(tif);
 
-	assert(sp != NULL);
-	(void) flags;
+    assert(sp != NULL);
+    (void)flags;
 
-        if( sp != NULL ) {
-		if (TIFFFieldSet(tif,FIELD_JPEGTABLES))
-			fprintf(fd, "  JPEG Tables: (%lu bytes)\n",
-				(unsigned long) sp->jpegtables_length);
-		if (sp->printdir)
-			(*sp->printdir)(tif, fd, flags);
-	}
+    if (sp != NULL)
+    {
+        if (TIFFFieldSet(tif, FIELD_JPEGTABLES))
+            fprintf(fd, "  JPEG Tables: (%" PRIu32 " bytes)\n",
+                    sp->otherSettings.jpegtables_length);
+        if (sp->otherSettings.printdir)
+            (*sp->otherSettings.printdir)(tif, fd, flags);
+    }
 }
 
-static uint32
-JPEGDefaultStripSize(TIFF* tif, uint32 s)
+static uint32_t JPEGDefaultStripSize(TIFF *tif, uint32_t s)
 {
-	JPEGState* sp = JState(tif);
-	TIFFDirectory *td = &tif->tif_dir;
+    JPEGState *sp = JState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
 
-	s = (*sp->defsparent)(tif, s);
-	if (s < td->td_imagelength)
-		s = TIFFroundup_32(s, td->td_ycbcrsubsampling[1] * DCTSIZE);
-	return (s);
+    s = (*sp->otherSettings.defsparent)(tif, s);
+    if (s < td->td_imagelength)
+        s = TIFFroundup_32(s, td->td_ycbcrsubsampling[1] * DCTSIZE);
+    return (s);
 }
 
-static void
-JPEGDefaultTileSize(TIFF* tif, uint32* tw, uint32* th)
+static void JPEGDefaultTileSize(TIFF *tif, uint32_t *tw, uint32_t *th)
 {
-	JPEGState* sp = JState(tif);
-	TIFFDirectory *td = &tif->tif_dir;
+    JPEGState *sp = JState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
 
-	(*sp->deftparent)(tif, tw, th);
-	*tw = TIFFroundup_32(*tw, td->td_ycbcrsubsampling[0] * DCTSIZE);
-	*th = TIFFroundup_32(*th, td->td_ycbcrsubsampling[1] * DCTSIZE);
+    (*sp->otherSettings.deftparent)(tif, tw, th);
+    *tw = TIFFroundup_32(*tw, td->td_ycbcrsubsampling[0] * DCTSIZE);
+    *th = TIFFroundup_32(*th, td->td_ycbcrsubsampling[1] * DCTSIZE);
 }
 
 /*
  * The JPEG library initialized used to be done in TIFFInitJPEG(), but
  * now that we allow a TIFF file to be opened in update mode it is necessary
  * to have some way of deciding whether compression or decompression is
- * desired other than looking at tif->tif_mode.  We accomplish this by 
+ * desired other than looking at tif->tif_mode.  We accomplish this by
  * examining {TILE/STRIP}BYTECOUNTS to see if there is a non-zero entry.
- * If so, we assume decompression is desired. 
+ * If so, we assume decompression is desired.
  *
  * This is tricky, because TIFFInitJPEG() is called while the directory is
  * being read, and generally speaking the BYTECOUNTS tag won't have been read
  * at that point.  So we try to defer jpeg library initialization till we
  * do have that tag ... basically any access that might require the compressor
- * or decompressor that occurs after the reading of the directory. 
+ * or decompressor that occurs after the reading of the directory.
  *
  * In an ideal world compressors or decompressors would be setup
  * at the point where a single tile or strip was accessed (for read or write)
@@ -2429,16 +2719,16 @@ JPEGDefaultTileSize(TIFF* tif, uint32* tw, uint32* th)
  * NFW, Feb 3rd, 2003.
  */
 
-static int JPEGInitializeLibJPEG( TIFF * tif, int decompress )
+static int JPEGInitializeLibJPEG(TIFF *tif, int decompress)
 {
-    JPEGState* sp = JState(tif);
+    JPEGState *sp = JState(tif);
 
-    if(sp->cinfo_initialized)
+    if (sp->cinfo_initialized)
     {
-        if( !decompress && sp->cinfo.comm.is_decompressor )
-            TIFFjpeg_destroy( sp );
-        else if( decompress && !sp->cinfo.comm.is_decompressor )
-            TIFFjpeg_destroy( sp );
+        if (!decompress && sp->cinfo.comm.is_decompressor)
+            TIFFjpeg_destroy(sp);
+        else if (decompress && !sp->cinfo.comm.is_decompressor)
+            TIFFjpeg_destroy(sp);
         else
             return 1;
 
@@ -2448,29 +2738,36 @@ static int JPEGInitializeLibJPEG( TIFF * tif, int decompress )
     /*
      * Initialize libjpeg.
      */
-    if ( decompress ) {
+    if (decompress)
+    {
         if (!TIFFjpeg_create_decompress(sp))
             return (0);
-    } else {
+    }
+    else
+    {
         if (!TIFFjpeg_create_compress(sp))
             return (0);
 #ifndef TIFF_JPEG_MAX_MEMORY_TO_USE
 #define TIFF_JPEG_MAX_MEMORY_TO_USE (10 * 1024 * 1024)
 #endif
         /* libjpeg turbo 1.5.2 honours max_memory_to_use, but has no backing */
-        /* store implementation, so better not set max_memory_to_use ourselves. */
+        /* store implementation, so better not set max_memory_to_use ourselves.
+         */
         /* See https://github.com/libjpeg-turbo/libjpeg-turbo/issues/162 */
-        if( sp->cinfo.c.mem->max_memory_to_use > 0 )
+        if (sp->cinfo.c.mem->max_memory_to_use > 0)
         {
             /* This is to address bug related in ticket GDAL #1795. */
             if (getenv("JPEGMEM") == NULL)
             {
-                /* Increase the max memory usable. This helps when creating files */
+                /* Increase the max memory usable. This helps when creating
+                 * files */
                 /* with "big" tile, without using libjpeg temporary files. */
                 /* For example a 512x512 tile with 3 bands */
                 /* requires 1.5 MB which is above libjpeg 1MB default */
-                if( sp->cinfo.c.mem->max_memory_to_use < TIFF_JPEG_MAX_MEMORY_TO_USE )
-                    sp->cinfo.c.mem->max_memory_to_use = TIFF_JPEG_MAX_MEMORY_TO_USE;
+                if (sp->cinfo.c.mem->max_memory_to_use <
+                    TIFF_JPEG_MAX_MEMORY_TO_USE)
+                    sp->cinfo.c.mem->max_memory_to_use =
+                        TIFF_JPEG_MAX_MEMORY_TO_USE;
             }
         }
     }
@@ -2480,124 +2777,124 @@ static int JPEGInitializeLibJPEG( TIFF * tif, int decompress )
     return 1;
 }
 
-int
-TIFFInitJPEG(TIFF* tif, int scheme)
-{
-	JPEGState* sp;
-
-        (void)scheme;
-	assert(scheme == COMPRESSION_JPEG);
-
-	/*
-	 * Merge codec-specific tag information.
-	 */
-	if (!_TIFFMergeFields(tif, jpegFields, TIFFArrayCount(jpegFields))) {
-		TIFFErrorExt(tif->tif_clientdata,
-			     "TIFFInitJPEG",
-			     "Merging JPEG codec-specific tags failed");
-		return 0;
-	}
-
-	/*
-	 * Allocate state block so tag methods have storage to record values.
-	 */
-	tif->tif_data = (uint8*) _TIFFmalloc(sizeof (JPEGState));
-
-	if (tif->tif_data == NULL) {
-		TIFFErrorExt(tif->tif_clientdata,
-			     "TIFFInitJPEG", "No space for JPEG state block");
-		return 0;
-	}
-        _TIFFmemset(tif->tif_data, 0, sizeof(JPEGState));
-
-	sp = JState(tif);
-	sp->tif = tif;				/* back link */
-
-	/*
-	 * Override parent get/set field methods.
-	 */
-	sp->vgetparent = tif->tif_tagmethods.vgetfield;
-	tif->tif_tagmethods.vgetfield = JPEGVGetField; /* hook for codec tags */
-	sp->vsetparent = tif->tif_tagmethods.vsetfield;
-	tif->tif_tagmethods.vsetfield = JPEGVSetField; /* hook for codec tags */
-	sp->printdir = tif->tif_tagmethods.printdir;
-	tif->tif_tagmethods.printdir = JPEGPrintDir;   /* hook for codec tags */
-
-	/* Default values for codec-specific fields */
-	sp->jpegtables = NULL;
-	sp->jpegtables_length = 0;
-	sp->jpegquality = 75;			/* Default IJG quality */
-	sp->jpegcolormode = JPEGCOLORMODE_RAW;
-	sp->jpegtablesmode = JPEGTABLESMODE_QUANT | JPEGTABLESMODE_HUFF;
-        sp->ycbcrsampling_fetched = 0;
-
-	/*
-	 * Install codec methods.
-	 */
-	tif->tif_fixuptags = JPEGFixupTags;
-	tif->tif_setupdecode = JPEGSetupDecode;
-	tif->tif_predecode = JPEGPreDecode;
-	tif->tif_decoderow = JPEGDecode;
-	tif->tif_decodestrip = JPEGDecode;
-	tif->tif_decodetile = JPEGDecode;
-	tif->tif_setupencode = JPEGSetupEncode;
-	tif->tif_preencode = JPEGPreEncode;
-	tif->tif_postencode = JPEGPostEncode;
-	tif->tif_encoderow = JPEGEncode;
-	tif->tif_encodestrip = JPEGEncode;
-	tif->tif_encodetile = JPEGEncode;  
-	tif->tif_cleanup = JPEGCleanup;
-	sp->defsparent = tif->tif_defstripsize;
-	tif->tif_defstripsize = JPEGDefaultStripSize;
-	sp->deftparent = tif->tif_deftilesize;
-	tif->tif_deftilesize = JPEGDefaultTileSize;
-	tif->tif_flags |= TIFF_NOBITREV;	/* no bit reversal, please */
-
-        sp->cinfo_initialized = FALSE;
-
-	/*
-        ** Create a JPEGTables field if no directory has yet been created. 
-        ** We do this just to ensure that sufficient space is reserved for
-        ** the JPEGTables field.  It will be properly created the right
-        ** size later. 
+/* Common to tif_jpeg.c and tif_jpeg_12.c */
+static void TIFFInitJPEGCommon(TIFF *tif)
+{
+    JPEGState *sp;
+
+    sp = JState(tif);
+    sp->tif = tif; /* back link */
+
+    /* Default values for codec-specific fields */
+    sp->otherSettings.jpegtables = NULL;
+    sp->otherSettings.jpegtables_length = 0;
+    sp->otherSettings.jpegquality = 75; /* Default IJG quality */
+    sp->otherSettings.jpegcolormode = JPEGCOLORMODE_RAW;
+    sp->otherSettings.jpegtablesmode =
+        JPEGTABLESMODE_QUANT | JPEGTABLESMODE_HUFF;
+    sp->otherSettings.ycbcrsampling_fetched = 0;
+
+    tif->tif_tagmethods.vgetfield = JPEGVGetField; /* hook for codec tags */
+    tif->tif_tagmethods.vsetfield = JPEGVSetField; /* hook for codec tags */
+    tif->tif_tagmethods.printdir = JPEGPrintDir;   /* hook for codec tags */
+
+    /*
+     * Install codec methods.
+     */
+    tif->tif_fixuptags = JPEGFixupTags;
+    tif->tif_setupdecode = JPEGSetupDecode;
+    tif->tif_predecode = JPEGPreDecode;
+    tif->tif_decoderow = JPEGDecode;
+    tif->tif_decodestrip = JPEGDecode;
+    tif->tif_decodetile = JPEGDecode;
+    tif->tif_setupencode = JPEGSetupEncode;
+    tif->tif_preencode = JPEGPreEncode;
+    tif->tif_postencode = JPEGPostEncode;
+    tif->tif_encoderow = JPEGEncode;
+    tif->tif_encodestrip = JPEGEncode;
+    tif->tif_encodetile = JPEGEncode;
+    tif->tif_cleanup = JPEGCleanup;
+
+    tif->tif_defstripsize = JPEGDefaultStripSize;
+    tif->tif_deftilesize = JPEGDefaultTileSize;
+    tif->tif_flags |= TIFF_NOBITREV; /* no bit reversal, please */
+    sp->cinfo_initialized = FALSE;
+}
+
+int TIFFInitJPEG(TIFF *tif, int scheme)
+{
+    JPEGState *sp;
+
+    (void)scheme;
+    assert(scheme == COMPRESSION_JPEG);
+
+    /*
+     * Merge codec-specific tag information.
+     */
+    if (!_TIFFMergeFields(tif, jpegFields, TIFFArrayCount(jpegFields)))
+    {
+        TIFFErrorExtR(tif, "TIFFInitJPEG",
+                      "Merging JPEG codec-specific tags failed");
+        return 0;
+    }
+
+    /*
+     * Allocate state block so tag methods have storage to record values.
+     */
+    tif->tif_data = (uint8_t *)_TIFFmallocExt(tif, sizeof(JPEGState));
+
+    if (tif->tif_data == NULL)
+    {
+        TIFFErrorExtR(tif, "TIFFInitJPEG", "No space for JPEG state block");
+        return 0;
+    }
+    _TIFFmemset(tif->tif_data, 0, sizeof(JPEGState));
+
+    sp = JState(tif);
+    /*
+     * Override parent get/set field methods.
+     */
+    sp->otherSettings.vgetparent = tif->tif_tagmethods.vgetfield;
+    sp->otherSettings.vsetparent = tif->tif_tagmethods.vsetfield;
+    sp->otherSettings.printdir = tif->tif_tagmethods.printdir;
+
+    sp->otherSettings.defsparent = tif->tif_defstripsize;
+    sp->otherSettings.deftparent = tif->tif_deftilesize;
+
+    TIFFInitJPEGCommon(tif);
+
+    /*
+    ** Create a JPEGTables field if no directory has yet been created.
+    ** We do this just to ensure that sufficient space is reserved for
+    ** the JPEGTables field.  It will be properly created the right
+    ** size later.
+    */
+    if (tif->tif_diroff == 0)
+    {
+#define SIZE_OF_JPEGTABLES 2000
+        /*
+        The following line assumes incorrectly that all JPEG-in-TIFF files will
+        have a JPEGTABLES tag generated and causes null-filled JPEGTABLES tags
+        to be written when the JPEG data is placed with TIFFWriteRawStrip.  The
+        field bit should be set, anyway, later when actual JPEGTABLES header is
+        generated, so removing it here hopefully is harmless.
+        TIFFSetFieldBit(tif, FIELD_JPEGTABLES);
         */
-        if( tif->tif_diroff == 0 )
+        sp->otherSettings.jpegtables_length = SIZE_OF_JPEGTABLES;
+        sp->otherSettings.jpegtables =
+            (void *)_TIFFmallocExt(tif, sp->otherSettings.jpegtables_length);
+        if (sp->otherSettings.jpegtables)
         {
-#define SIZE_OF_JPEGTABLES 2000
-/*
-The following line assumes incorrectly that all JPEG-in-TIFF files will have
-a JPEGTABLES tag generated and causes null-filled JPEGTABLES tags to be written
-when the JPEG data is placed with TIFFWriteRawStrip.  The field bit should be 
-set, anyway, later when actual JPEGTABLES header is generated, so removing it 
-here hopefully is harmless.
-            TIFFSetFieldBit(tif, FIELD_JPEGTABLES);
-*/
-            sp->jpegtables_length = SIZE_OF_JPEGTABLES;
-            sp->jpegtables = (void *) _TIFFmalloc(sp->jpegtables_length);
-            if (sp->jpegtables)
-            {
-                _TIFFmemset(sp->jpegtables, 0, SIZE_OF_JPEGTABLES);
-            }
-            else
-            {
-                TIFFErrorExt(tif->tif_clientdata,
-			     "TIFFInitJPEG",
-                             "Failed to allocate memory for JPEG tables");
-                return 0;
-            }
-#undef SIZE_OF_JPEGTABLES
+            _TIFFmemset(sp->otherSettings.jpegtables, 0, SIZE_OF_JPEGTABLES);
         }
-
-	return 1;
+        else
+        {
+            TIFFErrorExtR(tif, "TIFFInitJPEG",
+                          "Failed to allocate memory for JPEG tables");
+            return 0;
+        }
+#undef SIZE_OF_JPEGTABLES
+    }
+    return 1;
 }
 #endif /* JPEG_SUPPORT */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_jpeg_12.c b/3rdparty/libtiff/tif_jpeg_12.c
index b458c25899e2..406e1cfc3158 100644
--- a/3rdparty/libtiff/tif_jpeg_12.c
+++ b/3rdparty/libtiff/tif_jpeg_12.c
@@ -1,69 +1,63 @@
 
 #include "tiffiop.h"
 
+#if defined(HAVE_JPEGTURBO_DUAL_MODE_8_12)
+#define JPEG_DUAL_MODE_8_12
+#endif
+
 #if defined(JPEG_DUAL_MODE_8_12)
 
-#  define TIFFInitJPEG TIFFInitJPEG_12
-#  define TIFFJPEGIsFullStripRequired TIFFJPEGIsFullStripRequired_12
+#define FROM_TIF_JPEG_12
+
+#ifdef TIFFInitJPEG
+#undef TIFFInitJPEG
+#endif
+#define TIFFInitJPEG TIFFInitJPEG_12
 
-int
-TIFFInitJPEG_12(TIFF* tif, int scheme);
+#ifdef TIFFJPEGIsFullStripRequired
+#undef TIFFJPEGIsFullStripRequired
+#endif
+#define TIFFJPEGIsFullStripRequired TIFFJPEGIsFullStripRequired_12
 
-#  include LIBJPEG_12_PATH
+int TIFFInitJPEG_12(TIFF *tif, int scheme);
 
-#  include "tif_jpeg.c"
+#if !defined(HAVE_JPEGTURBO_DUAL_MODE_8_12)
+#include LIBJPEG_12_PATH
+#endif
 
-int TIFFReInitJPEG_12( TIFF *tif, int scheme, int is_encode )
+#include "tif_jpeg.c"
 
+int TIFFReInitJPEG_12(TIFF *tif, const JPEGOtherSettings *otherSettings,
+                      int scheme, int is_encode)
 {
-    JPEGState* sp;
+    JPEGState *sp;
+    uint8_t *new_tif_data;
 
+    (void)scheme;
     assert(scheme == COMPRESSION_JPEG);
 
+    new_tif_data =
+        (uint8_t *)_TIFFreallocExt(tif, tif->tif_data, sizeof(JPEGState));
+
+    if (new_tif_data == NULL)
+    {
+        TIFFErrorExtR(tif, "TIFFReInitJPEG_12",
+                      "No space for JPEG state block");
+        return 0;
+    }
+
+    tif->tif_data = new_tif_data;
+    _TIFFmemset(tif->tif_data, 0, sizeof(JPEGState));
+
+    TIFFInitJPEGCommon(tif);
+
     sp = JState(tif);
-    sp->tif = tif;				/* back link */
-
-    /*
-     * Override parent get/set field methods.
-     */
-    tif->tif_tagmethods.vgetfield = JPEGVGetField; /* hook for codec tags */
-    tif->tif_tagmethods.vsetfield = JPEGVSetField; /* hook for codec tags */
-    tif->tif_tagmethods.printdir = JPEGPrintDir;   /* hook for codec tags */
-
-    /*
-     * Install codec methods.
-     */
-    tif->tif_fixuptags = JPEGFixupTags;
-    tif->tif_setupdecode = JPEGSetupDecode;
-    tif->tif_predecode = JPEGPreDecode;
-    tif->tif_decoderow = JPEGDecode;
-    tif->tif_decodestrip = JPEGDecode;
-    tif->tif_decodetile = JPEGDecode;
-    tif->tif_setupencode = JPEGSetupEncode;
-    tif->tif_preencode = JPEGPreEncode;
-    tif->tif_postencode = JPEGPostEncode;
-    tif->tif_encoderow = JPEGEncode;
-    tif->tif_encodestrip = JPEGEncode;
-    tif->tif_encodetile = JPEGEncode;  
-    tif->tif_cleanup = JPEGCleanup;
-    tif->tif_defstripsize = JPEGDefaultStripSize;
-    tif->tif_deftilesize = JPEGDefaultTileSize;
-    tif->tif_flags |= TIFF_NOBITREV;	/* no bit reversal, please */
-
-    sp->cinfo_initialized = FALSE;
-
-    if( is_encode )
+    sp->otherSettings = *otherSettings;
+
+    if (is_encode)
         return JPEGSetupEncode(tif);
     else
         return JPEGSetupDecode(tif);
 }
 
 #endif /* defined(JPEG_DUAL_MODE_8_12) */
-
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_lerc.c b/3rdparty/libtiff/tif_lerc.c
new file mode 100644
index 000000000000..4f357a6011ac
--- /dev/null
+++ b/3rdparty/libtiff/tif_lerc.c
@@ -0,0 +1,1206 @@
+/*
+ * Copyright (c) 2018, Even Rouault
+ * Author: <even.rouault at spatialys.com>
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and
+ * its documentation for any purpose is hereby granted without fee, provided
+ * that (i) the above copyright notices and this permission notice appear in
+ * all copies of the software and related documentation, and (ii) the names of
+ * Sam Leffler and Silicon Graphics may not be used in any advertising or
+ * publicity relating to the software without the specific, prior written
+ * permission of Sam Leffler and Silicon Graphics.
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
+ * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
+ * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include "tiffiop.h"
+#ifdef LERC_SUPPORT
+/*
+ * TIFF Library.
+ *
+ * LERC Compression Support
+ *
+ */
+
+#include "Lerc_c_api.h"
+#include "zlib.h"
+#ifdef ZSTD_SUPPORT
+#include "zstd.h"
+#endif
+
+#if LIBDEFLATE_SUPPORT
+#include "libdeflate.h"
+#endif
+#define LIBDEFLATE_MAX_COMPRESSION_LEVEL 12
+
+#include <assert.h>
+
+#define LSTATE_INIT_DECODE 0x01
+#define LSTATE_INIT_ENCODE 0x02
+
+#ifndef LERC_AT_LEAST_VERSION
+#define LERC_AT_LEAST_VERSION(maj, min, patch) 0
+#endif
+
+/*
+ * State block for each open TIFF file using LERC compression/decompression.
+ */
+typedef struct
+{
+    double maxzerror; /* max z error */
+    int lerc_version;
+    int additional_compression;
+    int zstd_compress_level; /* zstd */
+    int zipquality;          /* deflate */
+    int state;               /* state flags */
+
+    uint32_t segment_width;
+    uint32_t segment_height;
+
+    unsigned int uncompressed_size;
+    unsigned int uncompressed_alloc;
+    uint8_t *uncompressed_buffer;
+    unsigned int uncompressed_offset;
+
+    unsigned int mask_size;
+    uint8_t *mask_buffer;
+
+    unsigned int compressed_size;
+    void *compressed_buffer;
+
+#if LIBDEFLATE_SUPPORT
+    struct libdeflate_decompressor *libdeflate_dec;
+    struct libdeflate_compressor *libdeflate_enc;
+#endif
+
+    TIFFVGetMethod vgetparent; /* super-class method */
+    TIFFVSetMethod vsetparent; /* super-class method */
+} LERCState;
+
+#define LState(tif) ((LERCState *)(tif)->tif_data)
+#define DecoderState(tif) LState(tif)
+#define EncoderState(tif) LState(tif)
+
+static int LERCEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s);
+static int LERCDecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s);
+
+static int LERCFixupTags(TIFF *tif)
+{
+    (void)tif;
+    return 1;
+}
+
+static int LERCSetupDecode(TIFF *tif)
+{
+    LERCState *sp = DecoderState(tif);
+
+    assert(sp != NULL);
+
+    /* if we were last encoding, terminate this mode */
+    if (sp->state & LSTATE_INIT_ENCODE)
+    {
+        sp->state = 0;
+    }
+
+    sp->state |= LSTATE_INIT_DECODE;
+    return 1;
+}
+
+static int GetLercDataType(TIFF *tif)
+{
+    TIFFDirectory *td = &tif->tif_dir;
+    static const char module[] = "GetLercDataType";
+
+    if (td->td_sampleformat == SAMPLEFORMAT_INT && td->td_bitspersample == 8)
+    {
+        return 0;
+    }
+
+    if (td->td_sampleformat == SAMPLEFORMAT_UINT && td->td_bitspersample == 8)
+    {
+        return 1;
+    }
+
+    if (td->td_sampleformat == SAMPLEFORMAT_INT && td->td_bitspersample == 16)
+    {
+        return 2;
+    }
+
+    if (td->td_sampleformat == SAMPLEFORMAT_UINT && td->td_bitspersample == 16)
+    {
+        return 3;
+    }
+
+    if (td->td_sampleformat == SAMPLEFORMAT_INT && td->td_bitspersample == 32)
+    {
+        return 4;
+    }
+
+    if (td->td_sampleformat == SAMPLEFORMAT_UINT && td->td_bitspersample == 32)
+    {
+        return 5;
+    }
+
+    if (td->td_sampleformat == SAMPLEFORMAT_IEEEFP &&
+        td->td_bitspersample == 32)
+    {
+        return 6;
+    }
+
+    if (td->td_sampleformat == SAMPLEFORMAT_IEEEFP &&
+        td->td_bitspersample == 64)
+    {
+        return 7;
+    }
+
+    TIFFErrorExtR(
+        tif, module,
+        "Unsupported combination of SampleFormat and td_bitspersample");
+    return -1;
+}
+
+static int SetupUncompressedBuffer(TIFF *tif, LERCState *sp, const char *module)
+{
+    TIFFDirectory *td = &tif->tif_dir;
+    uint64_t new_size_64;
+    uint64_t new_alloc_64;
+    unsigned int new_size;
+    unsigned int new_alloc;
+
+    sp->uncompressed_offset = 0;
+
+    if (isTiled(tif))
+    {
+        sp->segment_width = td->td_tilewidth;
+        sp->segment_height = td->td_tilelength;
+    }
+    else
+    {
+        sp->segment_width = td->td_imagewidth;
+        sp->segment_height = td->td_imagelength - tif->tif_row;
+        if (sp->segment_height > td->td_rowsperstrip)
+            sp->segment_height = td->td_rowsperstrip;
+    }
+
+    new_size_64 = (uint64_t)sp->segment_width * sp->segment_height *
+                  (td->td_bitspersample / 8);
+    if (td->td_planarconfig == PLANARCONFIG_CONTIG)
+    {
+        new_size_64 *= td->td_samplesperpixel;
+    }
+
+    new_size = (unsigned int)new_size_64;
+    sp->uncompressed_size = new_size;
+
+    /* add some margin as we are going to use it also to store deflate/zstd
+     * compressed data */
+    new_alloc_64 = 100 + new_size_64 + new_size_64 / 3;
+#ifdef ZSTD_SUPPORT
+    {
+        size_t zstd_max = ZSTD_compressBound((size_t)new_size_64);
+        if (new_alloc_64 < zstd_max)
+        {
+            new_alloc_64 = zstd_max;
+        }
+    }
+#endif
+    new_alloc = (unsigned int)new_alloc_64;
+    if (new_alloc != new_alloc_64)
+    {
+        TIFFErrorExtR(tif, module, "Too large uncompressed strip/tile");
+        _TIFFfreeExt(tif, sp->uncompressed_buffer);
+        sp->uncompressed_buffer = 0;
+        sp->uncompressed_alloc = 0;
+        return 0;
+    }
+
+    if (sp->uncompressed_alloc < new_alloc)
+    {
+        _TIFFfreeExt(tif, sp->uncompressed_buffer);
+        sp->uncompressed_buffer = _TIFFmallocExt(tif, new_alloc);
+        if (!sp->uncompressed_buffer)
+        {
+            TIFFErrorExtR(tif, module, "Cannot allocate buffer");
+            _TIFFfreeExt(tif, sp->uncompressed_buffer);
+            sp->uncompressed_buffer = 0;
+            sp->uncompressed_alloc = 0;
+            return 0;
+        }
+        sp->uncompressed_alloc = new_alloc;
+    }
+
+    if ((td->td_planarconfig == PLANARCONFIG_CONTIG &&
+         td->td_extrasamples > 0 &&
+         td->td_sampleinfo[td->td_extrasamples - 1] == EXTRASAMPLE_UNASSALPHA &&
+         GetLercDataType(tif) == 1) ||
+        (td->td_sampleformat == SAMPLEFORMAT_IEEEFP &&
+         (td->td_planarconfig == PLANARCONFIG_SEPARATE ||
+          td->td_samplesperpixel == 1) &&
+         (td->td_bitspersample == 32 || td->td_bitspersample == 64)))
+    {
+        unsigned int mask_size = sp->segment_width * sp->segment_height;
+        if (sp->mask_size < mask_size)
+        {
+            void *mask_buffer =
+                _TIFFreallocExt(tif, sp->mask_buffer, mask_size);
+            if (mask_buffer == NULL)
+            {
+                TIFFErrorExtR(tif, module, "Cannot allocate buffer");
+                sp->mask_size = 0;
+                _TIFFfreeExt(tif, sp->uncompressed_buffer);
+                sp->uncompressed_buffer = 0;
+                sp->uncompressed_alloc = 0;
+                return 0;
+            }
+            sp->mask_buffer = (uint8_t *)mask_buffer;
+            sp->mask_size = mask_size;
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * Setup state for decoding a strip.
+ */
+static int LERCPreDecode(TIFF *tif, uint16_t s)
+{
+    static const char module[] = "LERCPreDecode";
+    lerc_status lerc_ret;
+    TIFFDirectory *td = &tif->tif_dir;
+    LERCState *sp = DecoderState(tif);
+    int lerc_data_type;
+    unsigned int infoArray[8];
+    unsigned nomask_bands = td->td_samplesperpixel;
+    int ndims;
+    int use_mask = 0;
+    uint8_t *lerc_data = tif->tif_rawcp;
+    unsigned int lerc_data_size = (unsigned int)tif->tif_rawcc;
+
+    (void)s;
+    assert(sp != NULL);
+    if (sp->state != LSTATE_INIT_DECODE)
+        tif->tif_setupdecode(tif);
+
+    lerc_data_type = GetLercDataType(tif);
+    if (lerc_data_type < 0)
+        return 0;
+
+    if (!SetupUncompressedBuffer(tif, sp, module))
+        return 0;
+
+    if (sp->additional_compression != LERC_ADD_COMPRESSION_NONE)
+    {
+        if (sp->compressed_size < sp->uncompressed_alloc)
+        {
+            _TIFFfreeExt(tif, sp->compressed_buffer);
+            sp->compressed_buffer = _TIFFmallocExt(tif, sp->uncompressed_alloc);
+            if (!sp->compressed_buffer)
+            {
+                sp->compressed_size = 0;
+                return 0;
+            }
+            sp->compressed_size = sp->uncompressed_alloc;
+        }
+    }
+
+    if (sp->additional_compression == LERC_ADD_COMPRESSION_DEFLATE)
+    {
+#if LIBDEFLATE_SUPPORT
+        enum libdeflate_result res;
+        size_t lerc_data_sizet = 0;
+        if (sp->libdeflate_dec == NULL)
+        {
+            sp->libdeflate_dec = libdeflate_alloc_decompressor();
+            if (sp->libdeflate_dec == NULL)
+            {
+                TIFFErrorExtR(tif, module, "Cannot allocate decompressor");
+                return 0;
+            }
+        }
+
+        res = libdeflate_zlib_decompress(
+            sp->libdeflate_dec, tif->tif_rawcp, (size_t)tif->tif_rawcc,
+            sp->compressed_buffer, sp->compressed_size, &lerc_data_sizet);
+        if (res != LIBDEFLATE_SUCCESS)
+        {
+            TIFFErrorExtR(tif, module, "Decoding error at scanline %lu",
+                          (unsigned long)tif->tif_row);
+            return 0;
+        }
+        assert(lerc_data_sizet == (unsigned int)lerc_data_sizet);
+        lerc_data = sp->compressed_buffer;
+        lerc_data_size = (unsigned int)lerc_data_sizet;
+#else
+        z_stream strm;
+        int zlib_ret;
+
+        memset(&strm, 0, sizeof(strm));
+        strm.zalloc = NULL;
+        strm.zfree = NULL;
+        strm.opaque = NULL;
+        zlib_ret = inflateInit(&strm);
+        if (zlib_ret != Z_OK)
+        {
+            TIFFErrorExtR(tif, module, "inflateInit() failed");
+            inflateEnd(&strm);
+            return 0;
+        }
+
+        strm.avail_in = (uInt)tif->tif_rawcc;
+        strm.next_in = tif->tif_rawcp;
+        strm.avail_out = sp->compressed_size;
+        strm.next_out = sp->compressed_buffer;
+        zlib_ret = inflate(&strm, Z_FINISH);
+        if (zlib_ret != Z_STREAM_END && zlib_ret != Z_OK)
+        {
+            TIFFErrorExtR(tif, module, "inflate() failed");
+            inflateEnd(&strm);
+            return 0;
+        }
+        lerc_data = sp->compressed_buffer;
+        lerc_data_size = sp->compressed_size - strm.avail_out;
+        inflateEnd(&strm);
+#endif
+    }
+    else if (sp->additional_compression == LERC_ADD_COMPRESSION_ZSTD)
+    {
+#ifdef ZSTD_SUPPORT
+        size_t zstd_ret;
+
+        zstd_ret = ZSTD_decompress(sp->compressed_buffer, sp->compressed_size,
+                                   tif->tif_rawcp, tif->tif_rawcc);
+        if (ZSTD_isError(zstd_ret))
+        {
+            TIFFErrorExtR(tif, module, "Error in ZSTD_decompress(): %s",
+                          ZSTD_getErrorName(zstd_ret));
+            return 0;
+        }
+
+        lerc_data = sp->compressed_buffer;
+        lerc_data_size = (unsigned int)zstd_ret;
+#else
+        TIFFErrorExtR(tif, module, "ZSTD support missing");
+        return 0;
+#endif
+    }
+    else if (sp->additional_compression != LERC_ADD_COMPRESSION_NONE)
+    {
+        TIFFErrorExtR(tif, module, "Unhandled additional compression");
+        return 0;
+    }
+
+    lerc_ret =
+        lerc_getBlobInfo(lerc_data, lerc_data_size, infoArray, NULL, 8, 0);
+    if (lerc_ret != 0)
+    {
+        TIFFErrorExtR(tif, module, "lerc_getBlobInfo() failed");
+        return 0;
+    }
+
+    /* If the configuration is compatible of a LERC mask, and that the */
+    /* LERC info has dim == samplesperpixel - 1, then there is a LERC */
+    /* mask. */
+    if (td->td_planarconfig == PLANARCONFIG_CONTIG && td->td_extrasamples > 0 &&
+        td->td_sampleinfo[td->td_extrasamples - 1] == EXTRASAMPLE_UNASSALPHA &&
+        GetLercDataType(tif) == 1 &&
+        infoArray[2] == td->td_samplesperpixel - 1U)
+    {
+        use_mask = 1;
+        nomask_bands--;
+    }
+    else if (td->td_sampleformat == SAMPLEFORMAT_IEEEFP &&
+             (td->td_planarconfig == PLANARCONFIG_SEPARATE ||
+              td->td_samplesperpixel == 1) &&
+             (td->td_bitspersample == 32 || td->td_bitspersample == 64))
+    {
+        use_mask = 1;
+    }
+
+    ndims = td->td_planarconfig == PLANARCONFIG_CONTIG ? nomask_bands : 1;
+
+    /* Info returned in infoArray is { version, dataType, nDim, nCols,
+        nRows, nBands, nValidPixels, blobSize } */
+    if (infoArray[0] != (unsigned)sp->lerc_version)
+    {
+        TIFFWarningExtR(tif, module,
+                        "Unexpected version number: %d. Expected: %d",
+                        infoArray[0], sp->lerc_version);
+    }
+    if (infoArray[1] != (unsigned)lerc_data_type)
+    {
+        TIFFErrorExtR(tif, module, "Unexpected dataType: %d. Expected: %d",
+                      infoArray[1], lerc_data_type);
+        return 0;
+    }
+    if (infoArray[2] != (unsigned)ndims)
+    {
+        TIFFErrorExtR(tif, module, "Unexpected nDim: %d. Expected: %d",
+                      infoArray[2], ndims);
+        return 0;
+    }
+    if (infoArray[3] != sp->segment_width)
+    {
+        TIFFErrorExtR(tif, module, "Unexpected nCols: %d. Expected: %du",
+                      infoArray[3], sp->segment_width);
+        return 0;
+    }
+    if (infoArray[4] != sp->segment_height)
+    {
+        TIFFErrorExtR(tif, module, "Unexpected nRows: %d. Expected: %u",
+                      infoArray[4], sp->segment_height);
+        return 0;
+    }
+    if (infoArray[5] != 1)
+    {
+        TIFFErrorExtR(tif, module, "Unexpected nBands: %d. Expected: %d",
+                      infoArray[5], 1);
+        return 0;
+    }
+    if (infoArray[7] != lerc_data_size)
+    {
+        TIFFErrorExtR(tif, module, "Unexpected blobSize: %d. Expected: %u",
+                      infoArray[7], lerc_data_size);
+        return 0;
+    }
+
+    lerc_ret = lerc_decode(lerc_data, lerc_data_size,
+#if LERC_AT_LEAST_VERSION(3, 0, 0)
+                           use_mask ? 1 : 0,
+#endif
+                           use_mask ? sp->mask_buffer : NULL, ndims,
+                           sp->segment_width, sp->segment_height, 1,
+                           lerc_data_type, sp->uncompressed_buffer);
+    if (lerc_ret != 0)
+    {
+        TIFFErrorExtR(tif, module, "lerc_decode() failed");
+        return 0;
+    }
+
+    /* Interleave alpha mask with other samples. */
+    if (use_mask && GetLercDataType(tif) == 1)
+    {
+        unsigned src_stride =
+            (td->td_samplesperpixel - 1) * (td->td_bitspersample / 8);
+        unsigned dst_stride =
+            td->td_samplesperpixel * (td->td_bitspersample / 8);
+        unsigned i = sp->segment_width * sp->segment_height;
+        /* Operate from end to begin to be able to move in place */
+        while (i > 0 && i > nomask_bands)
+        {
+            i--;
+            sp->uncompressed_buffer[i * dst_stride + td->td_samplesperpixel -
+                                    1] = 255 * sp->mask_buffer[i];
+            memcpy(sp->uncompressed_buffer + i * dst_stride,
+                   sp->uncompressed_buffer + i * src_stride, src_stride);
+        }
+        /* First pixels must use memmove due to overlapping areas */
+        while (i > 0)
+        {
+            i--;
+            sp->uncompressed_buffer[i * dst_stride + td->td_samplesperpixel -
+                                    1] = 255 * sp->mask_buffer[i];
+            memmove(sp->uncompressed_buffer + i * dst_stride,
+                    sp->uncompressed_buffer + i * src_stride, src_stride);
+        }
+    }
+    else if (use_mask && td->td_sampleformat == SAMPLEFORMAT_IEEEFP)
+    {
+        const unsigned nb_pixels = sp->segment_width * sp->segment_height;
+        unsigned i;
+#if WORDS_BIGENDIAN
+        const unsigned char nan_bytes[] = {0x7f, 0xc0, 0, 0};
+#else
+        const unsigned char nan_bytes[] = {0, 0, 0xc0, 0x7f};
+#endif
+        float nan_float32;
+        memcpy(&nan_float32, nan_bytes, 4);
+
+        if (td->td_bitspersample == 32)
+        {
+            for (i = 0; i < nb_pixels; i++)
+            {
+                if (sp->mask_buffer[i] == 0)
+                    ((float *)sp->uncompressed_buffer)[i] = nan_float32;
+            }
+        }
+        else
+        {
+            const double nan_float64 = nan_float32;
+            for (i = 0; i < nb_pixels; i++)
+            {
+                if (sp->mask_buffer[i] == 0)
+                    ((double *)sp->uncompressed_buffer)[i] = nan_float64;
+            }
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * Decode a strip, tile or scanline.
+ */
+static int LERCDecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s)
+{
+    static const char module[] = "LERCDecode";
+    LERCState *sp = DecoderState(tif);
+
+    (void)s;
+    assert(sp != NULL);
+    assert(sp->state == LSTATE_INIT_DECODE);
+
+    if (sp->uncompressed_buffer == 0)
+    {
+        TIFFErrorExtR(tif, module, "Uncompressed buffer not allocated");
+        return 0;
+    }
+
+    if ((uint64_t)sp->uncompressed_offset + (uint64_t)occ >
+        sp->uncompressed_size)
+    {
+        TIFFErrorExtR(tif, module, "Too many bytes read");
+        return 0;
+    }
+
+    memcpy(op, sp->uncompressed_buffer + sp->uncompressed_offset, occ);
+    sp->uncompressed_offset += (unsigned)occ;
+
+    return 1;
+}
+
+static int LERCSetupEncode(TIFF *tif)
+{
+    LERCState *sp = EncoderState(tif);
+
+    assert(sp != NULL);
+    if (sp->state & LSTATE_INIT_DECODE)
+    {
+        sp->state = 0;
+    }
+
+    sp->state |= LSTATE_INIT_ENCODE;
+
+    return 1;
+}
+
+/*
+ * Reset encoding state at the start of a strip.
+ */
+static int LERCPreEncode(TIFF *tif, uint16_t s)
+{
+    static const char module[] = "LERCPreEncode";
+    LERCState *sp = EncoderState(tif);
+    int lerc_data_type;
+
+    (void)s;
+    assert(sp != NULL);
+    if (sp->state != LSTATE_INIT_ENCODE)
+        tif->tif_setupencode(tif);
+
+    lerc_data_type = GetLercDataType(tif);
+    if (lerc_data_type < 0)
+        return 0;
+
+    if (!SetupUncompressedBuffer(tif, sp, module))
+        return 0;
+
+    return 1;
+}
+
+/*
+ * Encode a chunk of pixels.
+ */
+static int LERCEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
+{
+    static const char module[] = "LERCEncode";
+    LERCState *sp = EncoderState(tif);
+
+    (void)s;
+    assert(sp != NULL);
+    assert(sp->state == LSTATE_INIT_ENCODE);
+
+    if ((uint64_t)sp->uncompressed_offset + (uint64_t)cc >
+        sp->uncompressed_size)
+    {
+        TIFFErrorExtR(tif, module, "Too many bytes written");
+        return 0;
+    }
+
+    memcpy(sp->uncompressed_buffer + sp->uncompressed_offset, bp, cc);
+    sp->uncompressed_offset += (unsigned)cc;
+
+    return 1;
+}
+
+/*
+ * Finish off an encoded strip by flushing it.
+ */
+static int LERCPostEncode(TIFF *tif)
+{
+    lerc_status lerc_ret;
+    static const char module[] = "LERCPostEncode";
+    LERCState *sp = EncoderState(tif);
+    unsigned int numBytes = 0;
+    unsigned int numBytesWritten = 0;
+    TIFFDirectory *td = &tif->tif_dir;
+    int use_mask = 0;
+    unsigned dst_nbands = td->td_samplesperpixel;
+
+    if (sp->uncompressed_offset != sp->uncompressed_size)
+    {
+        TIFFErrorExtR(tif, module, "Unexpected number of bytes in the buffer");
+        return 0;
+    }
+
+    /* Extract alpha mask (if containing only 0 and 255 values, */
+    /* and compact array of regular bands */
+    if (td->td_planarconfig == PLANARCONFIG_CONTIG && td->td_extrasamples > 0 &&
+        td->td_sampleinfo[td->td_extrasamples - 1] == EXTRASAMPLE_UNASSALPHA &&
+        GetLercDataType(tif) == 1)
+    {
+        const unsigned dst_stride =
+            (td->td_samplesperpixel - 1) * (td->td_bitspersample / 8);
+        const unsigned src_stride =
+            td->td_samplesperpixel * (td->td_bitspersample / 8);
+        unsigned i = 0;
+        const unsigned nb_pixels = sp->segment_width * sp->segment_height;
+
+        use_mask = 1;
+        for (i = 0; i < nb_pixels; i++)
+        {
+            int v = sp->uncompressed_buffer[i * src_stride +
+                                            td->td_samplesperpixel - 1];
+            if (v != 0 && v != 255)
+            {
+                use_mask = 0;
+                break;
+            }
+        }
+
+        if (use_mask)
+        {
+            dst_nbands--;
+            /* First pixels must use memmove due to overlapping areas */
+            for (i = 0; i < dst_nbands && i < nb_pixels; i++)
+            {
+                memmove(sp->uncompressed_buffer + i * dst_stride,
+                        sp->uncompressed_buffer + i * src_stride, dst_stride);
+                sp->mask_buffer[i] =
+                    sp->uncompressed_buffer[i * src_stride +
+                                            td->td_samplesperpixel - 1];
+            }
+            for (; i < nb_pixels; i++)
+            {
+                memcpy(sp->uncompressed_buffer + i * dst_stride,
+                       sp->uncompressed_buffer + i * src_stride, dst_stride);
+                sp->mask_buffer[i] =
+                    sp->uncompressed_buffer[i * src_stride +
+                                            td->td_samplesperpixel - 1];
+            }
+        }
+    }
+    else if (td->td_sampleformat == SAMPLEFORMAT_IEEEFP &&
+             (td->td_planarconfig == PLANARCONFIG_SEPARATE ||
+              dst_nbands == 1) &&
+             (td->td_bitspersample == 32 || td->td_bitspersample == 64))
+    {
+        /* Check for NaN values */
+        unsigned i;
+        const unsigned nb_pixels = sp->segment_width * sp->segment_height;
+        if (td->td_bitspersample == 32)
+        {
+            for (i = 0; i < nb_pixels; i++)
+            {
+                const float val = ((float *)sp->uncompressed_buffer)[i];
+                if (val != val)
+                {
+                    use_mask = 1;
+                    break;
+                }
+            }
+        }
+        else
+        {
+            for (i = 0; i < nb_pixels; i++)
+            {
+                const double val = ((double *)sp->uncompressed_buffer)[i];
+                if (val != val)
+                {
+                    use_mask = 1;
+                    break;
+                }
+            }
+        }
+
+        if (use_mask)
+        {
+            if (td->td_bitspersample == 32)
+            {
+                for (i = 0; i < nb_pixels; i++)
+                {
+                    const float val = ((float *)sp->uncompressed_buffer)[i];
+                    sp->mask_buffer[i] = (val == val) ? 255 : 0;
+                }
+            }
+            else
+            {
+                for (i = 0; i < nb_pixels; i++)
+                {
+                    const double val = ((double *)sp->uncompressed_buffer)[i];
+                    sp->mask_buffer[i] = (val == val) ? 255 : 0;
+                }
+            }
+        }
+    }
+
+#if 0
+        lerc_ret = lerc_computeCompressedSize(
+            sp->uncompressed_buffer,
+            sp->lerc_version,
+            GetLercDataType(tif),
+            td->td_planarconfig == PLANARCONFIG_CONTIG ?
+                dst_nbands : 1,
+            sp->segment_width,
+            sp->segment_height,
+            1,
+            use_mask ? sp->mask_buffer : NULL,
+            sp->maxzerror,
+            &numBytes);
+        if( lerc_ret != 0 )
+        {
+            TIFFErrorExtR(tif, module,
+                         "lerc_computeCompressedSize() failed");
+            return 0;
+        }
+#else
+    numBytes = sp->uncompressed_alloc;
+#endif
+
+    if (sp->compressed_size < numBytes)
+    {
+        _TIFFfreeExt(tif, sp->compressed_buffer);
+        sp->compressed_buffer = _TIFFmallocExt(tif, numBytes);
+        if (!sp->compressed_buffer)
+        {
+            sp->compressed_size = 0;
+            return 0;
+        }
+        sp->compressed_size = numBytes;
+    }
+
+    lerc_ret = lerc_encodeForVersion(
+        sp->uncompressed_buffer, sp->lerc_version, GetLercDataType(tif),
+        td->td_planarconfig == PLANARCONFIG_CONTIG ? dst_nbands : 1,
+        sp->segment_width, sp->segment_height, 1,
+#if LERC_AT_LEAST_VERSION(3, 0, 0)
+        use_mask ? 1 : 0,
+#endif
+        use_mask ? sp->mask_buffer : NULL, sp->maxzerror, sp->compressed_buffer,
+        sp->compressed_size, &numBytesWritten);
+    if (lerc_ret != 0)
+    {
+        TIFFErrorExtR(tif, module, "lerc_encode() failed");
+        return 0;
+    }
+    assert(numBytesWritten < numBytes);
+
+    if (sp->additional_compression == LERC_ADD_COMPRESSION_DEFLATE)
+    {
+#if LIBDEFLATE_SUPPORT
+        if (sp->libdeflate_enc == NULL)
+        {
+            /* To get results as good as zlib, we ask for an extra */
+            /* level of compression */
+            sp->libdeflate_enc = libdeflate_alloc_compressor(
+                sp->zipquality == Z_DEFAULT_COMPRESSION ? 7
+                : sp->zipquality >= 6 && sp->zipquality <= 9
+                    ? sp->zipquality + 1
+                    : sp->zipquality);
+            if (sp->libdeflate_enc == NULL)
+            {
+                TIFFErrorExtR(tif, module, "Cannot allocate compressor");
+                return 0;
+            }
+        }
+
+        /* Should not happen normally */
+        if (libdeflate_zlib_compress_bound(
+                sp->libdeflate_enc, numBytesWritten) > sp->uncompressed_alloc)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Output buffer for libdeflate too small");
+            return 0;
+        }
+
+        tif->tif_rawcc = libdeflate_zlib_compress(
+            sp->libdeflate_enc, sp->compressed_buffer, numBytesWritten,
+            sp->uncompressed_buffer, sp->uncompressed_alloc);
+
+        if (tif->tif_rawcc == 0)
+        {
+            TIFFErrorExtR(tif, module, "Encoder error at scanline %lu",
+                          (unsigned long)tif->tif_row);
+            return 0;
+        }
+#else
+        z_stream strm;
+        int zlib_ret;
+        int cappedQuality = sp->zipquality;
+        if (cappedQuality > Z_BEST_COMPRESSION)
+            cappedQuality = Z_BEST_COMPRESSION;
+
+        memset(&strm, 0, sizeof(strm));
+        strm.zalloc = NULL;
+        strm.zfree = NULL;
+        strm.opaque = NULL;
+        zlib_ret = deflateInit(&strm, cappedQuality);
+        if (zlib_ret != Z_OK)
+        {
+            TIFFErrorExtR(tif, module, "deflateInit() failed");
+            return 0;
+        }
+
+        strm.avail_in = numBytesWritten;
+        strm.next_in = sp->compressed_buffer;
+        strm.avail_out = sp->uncompressed_alloc;
+        strm.next_out = sp->uncompressed_buffer;
+        zlib_ret = deflate(&strm, Z_FINISH);
+        if (zlib_ret == Z_STREAM_END)
+        {
+            tif->tif_rawcc = sp->uncompressed_alloc - strm.avail_out;
+        }
+        deflateEnd(&strm);
+        if (zlib_ret != Z_STREAM_END)
+        {
+            TIFFErrorExtR(tif, module, "deflate() failed");
+            return 0;
+        }
+#endif
+        {
+            int ret;
+            uint8_t *tif_rawdata_backup = tif->tif_rawdata;
+            tif->tif_rawdata = sp->uncompressed_buffer;
+            ret = TIFFFlushData1(tif);
+            tif->tif_rawdata = tif_rawdata_backup;
+            if (!ret)
+            {
+                return 0;
+            }
+        }
+    }
+    else if (sp->additional_compression == LERC_ADD_COMPRESSION_ZSTD)
+    {
+#ifdef ZSTD_SUPPORT
+        size_t zstd_ret = ZSTD_compress(
+            sp->uncompressed_buffer, sp->uncompressed_alloc,
+            sp->compressed_buffer, numBytesWritten, sp->zstd_compress_level);
+        if (ZSTD_isError(zstd_ret))
+        {
+            TIFFErrorExtR(tif, module, "Error in ZSTD_compress(): %s",
+                          ZSTD_getErrorName(zstd_ret));
+            return 0;
+        }
+
+        {
+            int ret;
+            uint8_t *tif_rawdata_backup = tif->tif_rawdata;
+            tif->tif_rawdata = sp->uncompressed_buffer;
+            tif->tif_rawcc = zstd_ret;
+            ret = TIFFFlushData1(tif);
+            tif->tif_rawdata = tif_rawdata_backup;
+            if (!ret)
+            {
+                return 0;
+            }
+        }
+#else
+        TIFFErrorExtR(tif, module, "ZSTD support missing");
+        return 0;
+#endif
+    }
+    else if (sp->additional_compression != LERC_ADD_COMPRESSION_NONE)
+    {
+        TIFFErrorExtR(tif, module, "Unhandled additional compression");
+        return 0;
+    }
+    else
+    {
+        int ret;
+        uint8_t *tif_rawdata_backup = tif->tif_rawdata;
+        tif->tif_rawdata = sp->compressed_buffer;
+        tif->tif_rawcc = numBytesWritten;
+        ret = TIFFFlushData1(tif);
+        tif->tif_rawdata = tif_rawdata_backup;
+        if (!ret)
+            return 0;
+    }
+
+    return 1;
+}
+
+static void LERCCleanup(TIFF *tif)
+{
+    LERCState *sp = LState(tif);
+
+    assert(sp != 0);
+
+    tif->tif_tagmethods.vgetfield = sp->vgetparent;
+    tif->tif_tagmethods.vsetfield = sp->vsetparent;
+
+    _TIFFfreeExt(tif, sp->uncompressed_buffer);
+    _TIFFfreeExt(tif, sp->compressed_buffer);
+    _TIFFfreeExt(tif, sp->mask_buffer);
+
+#if LIBDEFLATE_SUPPORT
+    if (sp->libdeflate_dec)
+        libdeflate_free_decompressor(sp->libdeflate_dec);
+    if (sp->libdeflate_enc)
+        libdeflate_free_compressor(sp->libdeflate_enc);
+#endif
+
+    _TIFFfreeExt(tif, sp);
+    tif->tif_data = NULL;
+
+    _TIFFSetDefaultCompressionState(tif);
+}
+
+static const TIFFField LERCFields[] = {
+    {TIFFTAG_LERC_PARAMETERS, TIFF_VARIABLE2, TIFF_VARIABLE2, TIFF_LONG, 0,
+     TIFF_SETGET_C32_UINT32, TIFF_SETGET_UNDEFINED, FIELD_CUSTOM, FALSE, TRUE,
+     "LercParameters", NULL},
+    {TIFFTAG_LERC_MAXZERROR, 0, 0, TIFF_ANY, 0, TIFF_SETGET_DOUBLE,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "LercMaximumError",
+     NULL},
+    {TIFFTAG_LERC_VERSION, 0, 0, TIFF_ANY, 0, TIFF_SETGET_UINT32,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE, "LercVersion", NULL},
+    {TIFFTAG_LERC_ADD_COMPRESSION, 0, 0, TIFF_ANY, 0, TIFF_SETGET_UINT32,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE,
+     "LercAdditionalCompression", NULL},
+    {TIFFTAG_ZSTD_LEVEL, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE,
+     "ZSTD zstd_compress_level", NULL},
+    {TIFFTAG_ZIPQUALITY, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "", NULL},
+};
+
+static int LERCVSetFieldBase(TIFF *tif, uint32_t tag, ...)
+{
+    LERCState *sp = LState(tif);
+    int ret;
+    va_list ap;
+    va_start(ap, tag);
+    ret = (*sp->vsetparent)(tif, tag, ap);
+    va_end(ap);
+    return ret;
+}
+
+static int LERCVSetField(TIFF *tif, uint32_t tag, va_list ap)
+{
+    static const char module[] = "LERCVSetField";
+    LERCState *sp = LState(tif);
+
+    switch (tag)
+    {
+        case TIFFTAG_LERC_PARAMETERS:
+        {
+            uint32_t count = va_arg(ap, int);
+            int *params = va_arg(ap, int *);
+            if (count < 2)
+            {
+                TIFFErrorExtR(tif, module,
+                              "Invalid count for LercParameters: %u", count);
+                return 0;
+            }
+            sp->lerc_version = params[0];
+            sp->additional_compression = params[1];
+            return LERCVSetFieldBase(tif, TIFFTAG_LERC_PARAMETERS, count,
+                                     params);
+        }
+        case TIFFTAG_LERC_MAXZERROR:
+            sp->maxzerror = va_arg(ap, double);
+            return 1;
+        case TIFFTAG_LERC_VERSION:
+        {
+            int params[2] = {0, 0};
+            int version = va_arg(ap, int);
+            if (version != LERC_VERSION_2_4)
+            {
+                TIFFErrorExtR(tif, module, "Invalid value for LercVersion: %d",
+                              version);
+                return 0;
+            }
+            sp->lerc_version = version;
+            params[0] = sp->lerc_version;
+            params[1] = sp->additional_compression;
+            return LERCVSetFieldBase(tif, TIFFTAG_LERC_PARAMETERS, 2, params);
+        }
+        case TIFFTAG_LERC_ADD_COMPRESSION:
+        {
+            int params[2] = {0, 0};
+            int additional_compression = va_arg(ap, int);
+#ifndef ZSTD_SUPPORT
+            if (additional_compression == LERC_ADD_COMPRESSION_ZSTD)
+            {
+                TIFFErrorExtR(tif, module,
+                              "LERC_ZSTD requested, but ZSTD not available");
+                return 0;
+            }
+#endif
+            if (additional_compression != LERC_ADD_COMPRESSION_NONE &&
+                additional_compression != LERC_ADD_COMPRESSION_DEFLATE &&
+                additional_compression != LERC_ADD_COMPRESSION_ZSTD)
+            {
+                TIFFErrorExtR(tif, module,
+                              "Invalid value for LercAdditionalCompression: %d",
+                              additional_compression);
+                return 0;
+            }
+            sp->additional_compression = additional_compression;
+            params[0] = sp->lerc_version;
+            params[1] = sp->additional_compression;
+            return LERCVSetFieldBase(tif, TIFFTAG_LERC_PARAMETERS, 2, params);
+        }
+#ifdef ZSTD_SUPPORT
+        case TIFFTAG_ZSTD_LEVEL:
+        {
+            sp->zstd_compress_level = (int)va_arg(ap, int);
+            if (sp->zstd_compress_level <= 0 ||
+                sp->zstd_compress_level > ZSTD_maxCLevel())
+            {
+                TIFFWarningExtR(tif, module,
+                                "ZSTD_LEVEL should be between 1 and %d",
+                                ZSTD_maxCLevel());
+            }
+            return 1;
+        }
+#endif
+        case TIFFTAG_ZIPQUALITY:
+        {
+            sp->zipquality = (int)va_arg(ap, int);
+            if (sp->zipquality < Z_DEFAULT_COMPRESSION ||
+                sp->zipquality > LIBDEFLATE_MAX_COMPRESSION_LEVEL)
+            {
+                TIFFErrorExtR(
+                    tif, module,
+                    "Invalid ZipQuality value. Should be in [-1,%d] range",
+                    LIBDEFLATE_MAX_COMPRESSION_LEVEL);
+                return 0;
+            }
+
+#if LIBDEFLATE_SUPPORT
+            if (sp->libdeflate_enc)
+            {
+                libdeflate_free_compressor(sp->libdeflate_enc);
+                sp->libdeflate_enc = NULL;
+            }
+#endif
+
+            return (1);
+        }
+        default:
+            return (*sp->vsetparent)(tif, tag, ap);
+    }
+    /*NOTREACHED*/
+}
+
+static int LERCVGetField(TIFF *tif, uint32_t tag, va_list ap)
+{
+    LERCState *sp = LState(tif);
+
+    switch (tag)
+    {
+        case TIFFTAG_LERC_MAXZERROR:
+            *va_arg(ap, double *) = sp->maxzerror;
+            break;
+        case TIFFTAG_LERC_VERSION:
+            *va_arg(ap, int *) = sp->lerc_version;
+            break;
+        case TIFFTAG_LERC_ADD_COMPRESSION:
+            *va_arg(ap, int *) = sp->additional_compression;
+            break;
+        case TIFFTAG_ZSTD_LEVEL:
+            *va_arg(ap, int *) = sp->zstd_compress_level;
+            break;
+        case TIFFTAG_ZIPQUALITY:
+            *va_arg(ap, int *) = sp->zipquality;
+            break;
+        default:
+            return (*sp->vgetparent)(tif, tag, ap);
+    }
+    return 1;
+}
+
+int TIFFInitLERC(TIFF *tif, int scheme)
+{
+    static const char module[] = "TIFFInitLERC";
+    LERCState *sp;
+
+    (void)scheme;
+    assert(scheme == COMPRESSION_LERC);
+
+    /*
+     * Merge codec-specific tag information.
+     */
+    if (!_TIFFMergeFields(tif, LERCFields, TIFFArrayCount(LERCFields)))
+    {
+        TIFFErrorExtR(tif, module, "Merging LERC codec-specific tags failed");
+        return 0;
+    }
+
+    /*
+     * Allocate state block so tag methods have storage to record values.
+     */
+    tif->tif_data = (uint8_t *)_TIFFcallocExt(tif, 1, sizeof(LERCState));
+    if (tif->tif_data == NULL)
+        goto bad;
+    sp = LState(tif);
+
+    /*
+     * Override parent get/set field methods.
+     */
+    sp->vgetparent = tif->tif_tagmethods.vgetfield;
+    tif->tif_tagmethods.vgetfield = LERCVGetField; /* hook for codec tags */
+    sp->vsetparent = tif->tif_tagmethods.vsetfield;
+    tif->tif_tagmethods.vsetfield = LERCVSetField; /* hook for codec tags */
+
+    /*
+     * Install codec methods.
+     */
+    tif->tif_fixuptags = LERCFixupTags;
+    tif->tif_setupdecode = LERCSetupDecode;
+    tif->tif_predecode = LERCPreDecode;
+    tif->tif_decoderow = LERCDecode;
+    tif->tif_decodestrip = LERCDecode;
+    tif->tif_decodetile = LERCDecode;
+    tif->tif_setupencode = LERCSetupEncode;
+    tif->tif_preencode = LERCPreEncode;
+    tif->tif_postencode = LERCPostEncode;
+    tif->tif_encoderow = LERCEncode;
+    tif->tif_encodestrip = LERCEncode;
+    tif->tif_encodetile = LERCEncode;
+    tif->tif_cleanup = LERCCleanup;
+
+    /* Default values for codec-specific fields */
+    TIFFSetField(tif, TIFFTAG_LERC_VERSION, LERC_VERSION_2_4);
+    TIFFSetField(tif, TIFFTAG_LERC_ADD_COMPRESSION, LERC_ADD_COMPRESSION_NONE);
+    sp->maxzerror = 0.0;
+    sp->zstd_compress_level = 9;            /* default comp. level */
+    sp->zipquality = Z_DEFAULT_COMPRESSION; /* default comp. level */
+    sp->state = 0;
+
+    return 1;
+bad:
+    TIFFErrorExtR(tif, module, "No space for LERC state block");
+    return 0;
+}
+#endif /* LERC_SUPPORT */
diff --git a/3rdparty/libtiff/tif_luv.c b/3rdparty/libtiff/tif_luv.c
index 3bd02e88e4ca..021756d5d6dd 100644
--- a/3rdparty/libtiff/tif_luv.c
+++ b/3rdparty/libtiff/tif_luv.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1997 Greg Ward Larson
  * Copyright (c) 1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler, Greg Larson and Silicon Graphics may not be used in any
  * advertising or publicity relating to the software without the specific,
  * prior written permission of Sam Leffler, Greg Larson and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER, GREG LARSON OR SILICON GRAPHICS BE LIABLE
  * FOR ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -34,7 +34,7 @@
  * LogLuv image support uses the TIFF library to store 16 or 10-bit
  * log luminance values with 8 bits each of u and v or a 14-bit index.
  *
- * The codec can take as input and produce as output 32-bit IEEE float values 
+ * The codec can take as input and produce as output 32-bit IEEE float values
  * as well as 16-bit integer values.  A 16-bit luminance is interpreted
  * as a sign bit followed by a 15-bit integer that is converted
  * to and from a linear magnitude using the transformation:
@@ -145,9 +145,9 @@
  * quantization errors into noise.
  */
 
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <math.h>
 
 /*
  * State block for each open TIFF
@@ -155,22 +155,23 @@
  */
 typedef struct logLuvState LogLuvState;
 
-struct logLuvState {
-        int                     encoder_state;  /* 1 if encoder correctly initialized */
-	int                     user_datafmt;   /* user data format */
-	int                     encode_meth;    /* encoding method */
-	int                     pixel_size;     /* bytes per pixel */
+struct logLuvState
+{
+    int encoder_state; /* 1 if encoder correctly initialized */
+    int user_datafmt;  /* user data format */
+    int encode_meth;   /* encoding method */
+    int pixel_size;    /* bytes per pixel */
 
-	uint8*                  tbuf;           /* translation buffer */
-	tmsize_t                tbuflen;        /* buffer length */
-	void (*tfunc)(LogLuvState*, uint8*, tmsize_t);
+    uint8_t *tbuf;    /* translation buffer */
+    tmsize_t tbuflen; /* buffer length */
+    void (*tfunc)(LogLuvState *, uint8_t *, tmsize_t);
 
-	TIFFVSetMethod          vgetparent;     /* super-class method */
-	TIFFVSetMethod          vsetparent;     /* super-class method */
+    TIFFVSetMethod vgetparent; /* super-class method */
+    TIFFVSetMethod vsetparent; /* super-class method */
 };
 
-#define DecoderState(tif)	((LogLuvState*) (tif)->tif_data)
-#define EncoderState(tif)	((LogLuvState*) (tif)->tif_data)
+#define DecoderState(tif) ((LogLuvState *)(tif)->tif_data)
+#define EncoderState(tif) ((LogLuvState *)(tif)->tif_data)
 
 #define SGILOGDATAFMT_UNKNOWN -1
 
@@ -179,214 +180,207 @@ struct logLuvState {
 /*
  * Decode a string of 16-bit gray pixels.
  */
-static int
-LogL16Decode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s)
+static int LogL16Decode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s)
 {
-	static const char module[] = "LogL16Decode";
-	LogLuvState* sp = DecoderState(tif);
-	int shft;
-	tmsize_t i;
-	tmsize_t npixels;
-	unsigned char* bp;
-	int16* tp;
-	int16 b;
-	tmsize_t cc;
-	int rc;
-
-        (void)s;
-	assert(s == 0);
-	assert(sp != NULL);
-
-	npixels = occ / sp->pixel_size;
-
-	if (sp->user_datafmt == SGILOGDATAFMT_16BIT)
-		tp = (int16*) op;
-	else {
-		if(sp->tbuflen < npixels) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-						 "Translation buffer too short");
-			return (0);
-		}
-		tp = (int16*) sp->tbuf;
-	}
-	_TIFFmemset((void*) tp, 0, npixels*sizeof (tp[0]));
-
-	bp = (unsigned char*) tif->tif_rawcp;
-	cc = tif->tif_rawcc;
-	/* get each byte string */
-	for (shft = 8; shft >= 0; shft -=8) {
-		for (i = 0; i < npixels && cc > 0; ) {
-			if (*bp >= 128) {		/* run */
-				if( cc < 2 )
-					break;
-				rc = *bp++ + (2-128);
-				b = (int16)(*bp++ << shft);
-				cc -= 2;
-				while (rc-- && i < npixels)
-					tp[i++] |= b;
-			} else {			/* non-run */
-				rc = *bp++;		/* nul is noop */
-				while (--cc && rc-- && i < npixels)
-					tp[i++] |= (int16)*bp++ << shft;
-			}
-		}
-		if (i != npixels) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Not enough data at row %lu (short %I64d pixels)",
-				     (unsigned long) tif->tif_row,
-				     (unsigned __int64) (npixels - i));
-#else
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Not enough data at row %lu (short %llu pixels)",
-				     (unsigned long) tif->tif_row,
-				     (unsigned long long) (npixels - i));
-#endif
-			tif->tif_rawcp = (uint8*) bp;
-			tif->tif_rawcc = cc;
-			return (0);
-		}
-	}
-	(*sp->tfunc)(sp, op, npixels);
-	tif->tif_rawcp = (uint8*) bp;
-	tif->tif_rawcc = cc;
-	return (1);
+    static const char module[] = "LogL16Decode";
+    LogLuvState *sp = DecoderState(tif);
+    int shft;
+    tmsize_t i;
+    tmsize_t npixels;
+    unsigned char *bp;
+    int16_t *tp;
+    int16_t b;
+    tmsize_t cc;
+    int rc;
+
+    (void)s;
+    assert(s == 0);
+    assert(sp != NULL);
+
+    npixels = occ / sp->pixel_size;
+
+    if (sp->user_datafmt == SGILOGDATAFMT_16BIT)
+        tp = (int16_t *)op;
+    else
+    {
+        if (sp->tbuflen < npixels)
+        {
+            TIFFErrorExtR(tif, module, "Translation buffer too short");
+            return (0);
+        }
+        tp = (int16_t *)sp->tbuf;
+    }
+    _TIFFmemset((void *)tp, 0, npixels * sizeof(tp[0]));
+
+    bp = (unsigned char *)tif->tif_rawcp;
+    cc = tif->tif_rawcc;
+    /* get each byte string */
+    for (shft = 8; shft >= 0; shft -= 8)
+    {
+        for (i = 0; i < npixels && cc > 0;)
+        {
+            if (*bp >= 128)
+            { /* run */
+                if (cc < 2)
+                    break;
+                rc = *bp++ + (2 - 128);
+                b = (int16_t)(*bp++ << shft);
+                cc -= 2;
+                while (rc-- && i < npixels)
+                    tp[i++] |= b;
+            }
+            else
+            {               /* non-run */
+                rc = *bp++; /* nul is noop */
+                while (--cc && rc-- && i < npixels)
+                    tp[i++] |= (int16_t)*bp++ << shft;
+            }
+        }
+        if (i != npixels)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Not enough data at row %" PRIu32
+                          " (short %" TIFF_SSIZE_FORMAT " pixels)",
+                          tif->tif_row, npixels - i);
+            tif->tif_rawcp = (uint8_t *)bp;
+            tif->tif_rawcc = cc;
+            return (0);
+        }
+    }
+    (*sp->tfunc)(sp, op, npixels);
+    tif->tif_rawcp = (uint8_t *)bp;
+    tif->tif_rawcc = cc;
+    return (1);
 }
 
 /*
  * Decode a string of 24-bit pixels.
  */
-static int
-LogLuvDecode24(TIFF* tif, uint8* op, tmsize_t occ, uint16 s)
+static int LogLuvDecode24(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s)
 {
-	static const char module[] = "LogLuvDecode24";
-	LogLuvState* sp = DecoderState(tif);
-	tmsize_t cc;
-	tmsize_t i;
-	tmsize_t npixels;
-	unsigned char* bp;
-	uint32* tp;
-
-        (void)s;
-	assert(s == 0);
-	assert(sp != NULL);
-
-	npixels = occ / sp->pixel_size;
-
-	if (sp->user_datafmt == SGILOGDATAFMT_RAW)
-		tp = (uint32 *)op;
-	else {
-		if(sp->tbuflen < npixels) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-						 "Translation buffer too short");
-			return (0);
-		}
-		tp = (uint32 *) sp->tbuf;
-	}
-	/* copy to array of uint32 */
-	bp = (unsigned char*) tif->tif_rawcp;
-	cc = tif->tif_rawcc;
-	for (i = 0; i < npixels && cc >= 3; i++) {
-		tp[i] = bp[0] << 16 | bp[1] << 8 | bp[2];
-		bp += 3;
-		cc -= 3;
-	}
-	tif->tif_rawcp = (uint8*) bp;
-	tif->tif_rawcc = cc;
-	if (i != npixels) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-		TIFFErrorExt(tif->tif_clientdata, module,
-			"Not enough data at row %lu (short %I64d pixels)",
-			     (unsigned long) tif->tif_row,
-			     (unsigned __int64) (npixels - i));
-#else
-		TIFFErrorExt(tif->tif_clientdata, module,
-			"Not enough data at row %lu (short %llu pixels)",
-			     (unsigned long) tif->tif_row,
-			     (unsigned long long) (npixels - i));
-#endif
-		return (0);
-	}
-	(*sp->tfunc)(sp, op, npixels);
-	return (1);
+    static const char module[] = "LogLuvDecode24";
+    LogLuvState *sp = DecoderState(tif);
+    tmsize_t cc;
+    tmsize_t i;
+    tmsize_t npixels;
+    unsigned char *bp;
+    uint32_t *tp;
+
+    (void)s;
+    assert(s == 0);
+    assert(sp != NULL);
+
+    npixels = occ / sp->pixel_size;
+
+    if (sp->user_datafmt == SGILOGDATAFMT_RAW)
+        tp = (uint32_t *)op;
+    else
+    {
+        if (sp->tbuflen < npixels)
+        {
+            TIFFErrorExtR(tif, module, "Translation buffer too short");
+            return (0);
+        }
+        tp = (uint32_t *)sp->tbuf;
+    }
+    /* copy to array of uint32_t */
+    bp = (unsigned char *)tif->tif_rawcp;
+    cc = tif->tif_rawcc;
+    for (i = 0; i < npixels && cc >= 3; i++)
+    {
+        tp[i] = bp[0] << 16 | bp[1] << 8 | bp[2];
+        bp += 3;
+        cc -= 3;
+    }
+    tif->tif_rawcp = (uint8_t *)bp;
+    tif->tif_rawcc = cc;
+    if (i != npixels)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Not enough data at row %" PRIu32
+                      " (short %" TIFF_SSIZE_FORMAT " pixels)",
+                      tif->tif_row, npixels - i);
+        return (0);
+    }
+    (*sp->tfunc)(sp, op, npixels);
+    return (1);
 }
 
 /*
  * Decode a string of 32-bit pixels.
  */
-static int
-LogLuvDecode32(TIFF* tif, uint8* op, tmsize_t occ, uint16 s)
+static int LogLuvDecode32(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s)
 {
-	static const char module[] = "LogLuvDecode32";
-	LogLuvState* sp;
-	int shft;
-	tmsize_t i;
-	tmsize_t npixels;
-	unsigned char* bp;
-	uint32* tp;
-	uint32 b;
-	tmsize_t cc;
-	int rc;
-
-        (void)s;
-	assert(s == 0);
-	sp = DecoderState(tif);
-	assert(sp != NULL);
-
-	npixels = occ / sp->pixel_size;
-
-	if (sp->user_datafmt == SGILOGDATAFMT_RAW)
-		tp = (uint32*) op;
-	else {
-		if(sp->tbuflen < npixels) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-						 "Translation buffer too short");
-			return (0);
-		}
-		tp = (uint32*) sp->tbuf;
-	}
-	_TIFFmemset((void*) tp, 0, npixels*sizeof (tp[0]));
-
-	bp = (unsigned char*) tif->tif_rawcp;
-	cc = tif->tif_rawcc;
-	/* get each byte string */
-	for (shft = 24; shft >= 0; shft -=8) {
-		for (i = 0; i < npixels && cc > 0; ) {
-			if (*bp >= 128) {		/* run */
-				if( cc < 2 )
-					break;
-				rc = *bp++ + (2-128);
-				b = (uint32)*bp++ << shft;
-				cc -= 2;
-				while (rc-- && i < npixels)
-					tp[i++] |= b;
-			} else {			/* non-run */
-				rc = *bp++;		/* nul is noop */
-				while (--cc && rc-- && i < npixels)
-					tp[i++] |= (uint32)*bp++ << shft;
-			}
-		}
-		if (i != npixels) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			TIFFErrorExt(tif->tif_clientdata, module,
-			"Not enough data at row %lu (short %I64d pixels)",
-				     (unsigned long) tif->tif_row,
-				     (unsigned __int64) (npixels - i));
-#else
-			TIFFErrorExt(tif->tif_clientdata, module,
-			"Not enough data at row %lu (short %llu pixels)",
-				     (unsigned long) tif->tif_row,
-				     (unsigned long long) (npixels - i));
-#endif
-			tif->tif_rawcp = (uint8*) bp;
-			tif->tif_rawcc = cc;
-			return (0);
-		}
-	}
-	(*sp->tfunc)(sp, op, npixels);
-	tif->tif_rawcp = (uint8*) bp;
-	tif->tif_rawcc = cc;
-	return (1);
+    static const char module[] = "LogLuvDecode32";
+    LogLuvState *sp;
+    int shft;
+    tmsize_t i;
+    tmsize_t npixels;
+    unsigned char *bp;
+    uint32_t *tp;
+    uint32_t b;
+    tmsize_t cc;
+    int rc;
+
+    (void)s;
+    assert(s == 0);
+    sp = DecoderState(tif);
+    assert(sp != NULL);
+
+    npixels = occ / sp->pixel_size;
+
+    if (sp->user_datafmt == SGILOGDATAFMT_RAW)
+        tp = (uint32_t *)op;
+    else
+    {
+        if (sp->tbuflen < npixels)
+        {
+            TIFFErrorExtR(tif, module, "Translation buffer too short");
+            return (0);
+        }
+        tp = (uint32_t *)sp->tbuf;
+    }
+    _TIFFmemset((void *)tp, 0, npixels * sizeof(tp[0]));
+
+    bp = (unsigned char *)tif->tif_rawcp;
+    cc = tif->tif_rawcc;
+    /* get each byte string */
+    for (shft = 24; shft >= 0; shft -= 8)
+    {
+        for (i = 0; i < npixels && cc > 0;)
+        {
+            if (*bp >= 128)
+            { /* run */
+                if (cc < 2)
+                    break;
+                rc = *bp++ + (2 - 128);
+                b = (uint32_t)*bp++ << shft;
+                cc -= 2;
+                while (rc-- && i < npixels)
+                    tp[i++] |= b;
+            }
+            else
+            {               /* non-run */
+                rc = *bp++; /* nul is noop */
+                while (--cc && rc-- && i < npixels)
+                    tp[i++] |= (uint32_t)*bp++ << shft;
+            }
+        }
+        if (i != npixels)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Not enough data at row %" PRIu32
+                          " (short %" TIFF_SSIZE_FORMAT " pixels)",
+                          tif->tif_row, npixels - i);
+            tif->tif_rawcp = (uint8_t *)bp;
+            tif->tif_rawcc = cc;
+            return (0);
+        }
+    }
+    (*sp->tfunc)(sp, op, npixels);
+    tif->tif_rawcp = (uint8_t *)bp;
+    tif->tif_rawcc = cc;
+    return (1);
 }
 
 /*
@@ -394,20 +388,20 @@ LogLuvDecode32(TIFF* tif, uint8* op, tmsize_t occ, uint16 s)
  * maintain synchrony with the encode algorithm, which
  * is row by row.
  */
-static int
-LogLuvDecodeStrip(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int LogLuvDecodeStrip(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	tmsize_t rowlen = TIFFScanlineSize(tif);
-
-        if (rowlen == 0)
-                return 0;
-
-	assert(cc%rowlen == 0);
-	while (cc && (*tif->tif_decoderow)(tif, bp, rowlen, s)) {
-		bp += rowlen;
-		cc -= rowlen;
-	}
-	return (cc == 0);
+    tmsize_t rowlen = TIFFScanlineSize(tif);
+
+    if (rowlen == 0)
+        return 0;
+
+    assert(cc % rowlen == 0);
+    while (cc && (*tif->tif_decoderow)(tif, bp, rowlen, s))
+    {
+        bp += rowlen;
+        cc -= rowlen;
+    }
+    return (cc == 0);
 }
 
 /*
@@ -415,314 +409,342 @@ LogLuvDecodeStrip(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
  * maintain synchrony with the encode algorithm, which
  * is row by row.
  */
-static int
-LogLuvDecodeTile(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int LogLuvDecodeTile(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	tmsize_t rowlen = TIFFTileRowSize(tif);
-
-        if (rowlen == 0)
-                return 0;
-
-	assert(cc%rowlen == 0);
-	while (cc && (*tif->tif_decoderow)(tif, bp, rowlen, s)) {
-		bp += rowlen;
-		cc -= rowlen;
-	}
-	return (cc == 0);
+    tmsize_t rowlen = TIFFTileRowSize(tif);
+
+    if (rowlen == 0)
+        return 0;
+
+    assert(cc % rowlen == 0);
+    while (cc && (*tif->tif_decoderow)(tif, bp, rowlen, s))
+    {
+        bp += rowlen;
+        cc -= rowlen;
+    }
+    return (cc == 0);
 }
 
 /*
  * Encode a row of 16-bit pixels.
  */
-static int
-LogL16Encode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int LogL16Encode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	static const char module[] = "LogL16Encode";
-	LogLuvState* sp = EncoderState(tif);
-	int shft;
-	tmsize_t i;
-	tmsize_t j;
-	tmsize_t npixels;
-	uint8* op;
-	int16* tp;
-	int16 b;
-	tmsize_t occ;
-	int rc=0, mask;
-	tmsize_t beg;
-
-        (void)s;
-	assert(s == 0);
-	assert(sp != NULL);
-	npixels = cc / sp->pixel_size;
-
-	if (sp->user_datafmt == SGILOGDATAFMT_16BIT)
-		tp = (int16*) bp;
-	else {
-		tp = (int16*) sp->tbuf;
-		if(sp->tbuflen < npixels) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-						 "Translation buffer too short");
-			return (0);
-		}
-		(*sp->tfunc)(sp, bp, npixels);
-	}
-	/* compress each byte string */
-	op = tif->tif_rawcp;
-	occ = tif->tif_rawdatasize - tif->tif_rawcc;
-	for (shft = 8; shft >= 0; shft -=8) {
-		for (i = 0; i < npixels; i += rc) {
-			if (occ < 4) {
-				tif->tif_rawcp = op;
-				tif->tif_rawcc = tif->tif_rawdatasize - occ;
-				if (!TIFFFlushData1(tif))
-					return (0);
-				op = tif->tif_rawcp;
-				occ = tif->tif_rawdatasize - tif->tif_rawcc;
-			}
-			mask = 0xff << shft;		/* find next run */
-			for (beg = i; beg < npixels; beg += rc) {
-				b = (int16) (tp[beg] & mask);
-				rc = 1;
-				while (rc < 127+2 && beg+rc < npixels &&
-				    (tp[beg+rc] & mask) == b)
-					rc++;
-				if (rc >= MINRUN)
-					break;		/* long enough */
-			}
-			if (beg-i > 1 && beg-i < MINRUN) {
-				b = (int16) (tp[i] & mask);/*check short run */
-				j = i+1;
-				while ((tp[j++] & mask) == b)
-					if (j == beg) {
-						*op++ = (uint8)(128-2+j-i);
-						*op++ = (uint8)(b >> shft);
-						occ -= 2;
-						i = beg;
-						break;
-					}
-			}
-			while (i < beg) {		/* write out non-run */
-				if ((j = beg-i) > 127) j = 127;
-				if (occ < j+3) {
-					tif->tif_rawcp = op;
-					tif->tif_rawcc = tif->tif_rawdatasize - occ;
-					if (!TIFFFlushData1(tif))
-						return (0);
-					op = tif->tif_rawcp;
-					occ = tif->tif_rawdatasize - tif->tif_rawcc;
-				}
-				*op++ = (uint8) j; occ--;
-				while (j--) {
-					*op++ = (uint8) (tp[i++] >> shft & 0xff);
-					occ--;
-				}
-			}
-			if (rc >= MINRUN) {		/* write out run */
-				*op++ = (uint8) (128-2+rc);
-				*op++ = (uint8) (tp[beg] >> shft & 0xff);
-				occ -= 2;
-			} else
-				rc = 0;
-		}
-	}
-	tif->tif_rawcp = op;
-	tif->tif_rawcc = tif->tif_rawdatasize - occ;
-
-	return (1);
+    static const char module[] = "LogL16Encode";
+    LogLuvState *sp = EncoderState(tif);
+    int shft;
+    tmsize_t i;
+    tmsize_t j;
+    tmsize_t npixels;
+    uint8_t *op;
+    int16_t *tp;
+    int16_t b;
+    tmsize_t occ;
+    int rc = 0, mask;
+    tmsize_t beg;
+
+    (void)s;
+    assert(s == 0);
+    assert(sp != NULL);
+    npixels = cc / sp->pixel_size;
+
+    if (sp->user_datafmt == SGILOGDATAFMT_16BIT)
+        tp = (int16_t *)bp;
+    else
+    {
+        tp = (int16_t *)sp->tbuf;
+        if (sp->tbuflen < npixels)
+        {
+            TIFFErrorExtR(tif, module, "Translation buffer too short");
+            return (0);
+        }
+        (*sp->tfunc)(sp, bp, npixels);
+    }
+    /* compress each byte string */
+    op = tif->tif_rawcp;
+    occ = tif->tif_rawdatasize - tif->tif_rawcc;
+    for (shft = 8; shft >= 0; shft -= 8)
+    {
+        for (i = 0; i < npixels; i += rc)
+        {
+            if (occ < 4)
+            {
+                tif->tif_rawcp = op;
+                tif->tif_rawcc = tif->tif_rawdatasize - occ;
+                if (!TIFFFlushData1(tif))
+                    return (0);
+                op = tif->tif_rawcp;
+                occ = tif->tif_rawdatasize - tif->tif_rawcc;
+            }
+            mask = 0xff << shft; /* find next run */
+            for (beg = i; beg < npixels; beg += rc)
+            {
+                b = (int16_t)(tp[beg] & mask);
+                rc = 1;
+                while (rc < 127 + 2 && beg + rc < npixels &&
+                       (tp[beg + rc] & mask) == b)
+                    rc++;
+                if (rc >= MINRUN)
+                    break; /* long enough */
+            }
+            if (beg - i > 1 && beg - i < MINRUN)
+            {
+                b = (int16_t)(tp[i] & mask); /*check short run */
+                j = i + 1;
+                while ((tp[j++] & mask) == b)
+                    if (j == beg)
+                    {
+                        *op++ = (uint8_t)(128 - 2 + j - i);
+                        *op++ = (uint8_t)(b >> shft);
+                        occ -= 2;
+                        i = beg;
+                        break;
+                    }
+            }
+            while (i < beg)
+            { /* write out non-run */
+                if ((j = beg - i) > 127)
+                    j = 127;
+                if (occ < j + 3)
+                {
+                    tif->tif_rawcp = op;
+                    tif->tif_rawcc = tif->tif_rawdatasize - occ;
+                    if (!TIFFFlushData1(tif))
+                        return (0);
+                    op = tif->tif_rawcp;
+                    occ = tif->tif_rawdatasize - tif->tif_rawcc;
+                }
+                *op++ = (uint8_t)j;
+                occ--;
+                while (j--)
+                {
+                    *op++ = (uint8_t)(tp[i++] >> shft & 0xff);
+                    occ--;
+                }
+            }
+            if (rc >= MINRUN)
+            { /* write out run */
+                *op++ = (uint8_t)(128 - 2 + rc);
+                *op++ = (uint8_t)(tp[beg] >> shft & 0xff);
+                occ -= 2;
+            }
+            else
+                rc = 0;
+        }
+    }
+    tif->tif_rawcp = op;
+    tif->tif_rawcc = tif->tif_rawdatasize - occ;
+
+    return (1);
 }
 
 /*
  * Encode a row of 24-bit pixels.
  */
-static int
-LogLuvEncode24(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int LogLuvEncode24(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	static const char module[] = "LogLuvEncode24";
-	LogLuvState* sp = EncoderState(tif);
-	tmsize_t i;
-	tmsize_t npixels;
-	tmsize_t occ;
-	uint8* op;
-	uint32* tp;
-
-        (void)s;
-	assert(s == 0);
-	assert(sp != NULL);
-	npixels = cc / sp->pixel_size;
-
-	if (sp->user_datafmt == SGILOGDATAFMT_RAW)
-		tp = (uint32*) bp;
-	else {
-		tp = (uint32*) sp->tbuf;
-		if(sp->tbuflen < npixels) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-						 "Translation buffer too short");
-			return (0);
-		}
-		(*sp->tfunc)(sp, bp, npixels);
-	}
-	/* write out encoded pixels */
-	op = tif->tif_rawcp;
-	occ = tif->tif_rawdatasize - tif->tif_rawcc;
-	for (i = npixels; i--; ) {
-		if (occ < 3) {
-			tif->tif_rawcp = op;
-			tif->tif_rawcc = tif->tif_rawdatasize - occ;
-			if (!TIFFFlushData1(tif))
-				return (0);
-			op = tif->tif_rawcp;
-			occ = tif->tif_rawdatasize - tif->tif_rawcc;
-		}
-		*op++ = (uint8)(*tp >> 16);
-		*op++ = (uint8)(*tp >> 8 & 0xff);
-		*op++ = (uint8)(*tp++ & 0xff);
-		occ -= 3;
-	}
-	tif->tif_rawcp = op;
-	tif->tif_rawcc = tif->tif_rawdatasize - occ;
-
-	return (1);
+    static const char module[] = "LogLuvEncode24";
+    LogLuvState *sp = EncoderState(tif);
+    tmsize_t i;
+    tmsize_t npixels;
+    tmsize_t occ;
+    uint8_t *op;
+    uint32_t *tp;
+
+    (void)s;
+    assert(s == 0);
+    assert(sp != NULL);
+    npixels = cc / sp->pixel_size;
+
+    if (sp->user_datafmt == SGILOGDATAFMT_RAW)
+        tp = (uint32_t *)bp;
+    else
+    {
+        tp = (uint32_t *)sp->tbuf;
+        if (sp->tbuflen < npixels)
+        {
+            TIFFErrorExtR(tif, module, "Translation buffer too short");
+            return (0);
+        }
+        (*sp->tfunc)(sp, bp, npixels);
+    }
+    /* write out encoded pixels */
+    op = tif->tif_rawcp;
+    occ = tif->tif_rawdatasize - tif->tif_rawcc;
+    for (i = npixels; i--;)
+    {
+        if (occ < 3)
+        {
+            tif->tif_rawcp = op;
+            tif->tif_rawcc = tif->tif_rawdatasize - occ;
+            if (!TIFFFlushData1(tif))
+                return (0);
+            op = tif->tif_rawcp;
+            occ = tif->tif_rawdatasize - tif->tif_rawcc;
+        }
+        *op++ = (uint8_t)(*tp >> 16);
+        *op++ = (uint8_t)(*tp >> 8 & 0xff);
+        *op++ = (uint8_t)(*tp++ & 0xff);
+        occ -= 3;
+    }
+    tif->tif_rawcp = op;
+    tif->tif_rawcc = tif->tif_rawdatasize - occ;
+
+    return (1);
 }
 
 /*
  * Encode a row of 32-bit pixels.
  */
-static int
-LogLuvEncode32(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int LogLuvEncode32(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	static const char module[] = "LogLuvEncode32";
-	LogLuvState* sp = EncoderState(tif);
-	int shft;
-	tmsize_t i;
-	tmsize_t j;
-	tmsize_t npixels;
-	uint8* op;
-	uint32* tp;
-	uint32 b;
-	tmsize_t occ;
-	int rc=0, mask;
-	tmsize_t beg;
-
-        (void)s;
-	assert(s == 0);
-	assert(sp != NULL);
-
-	npixels = cc / sp->pixel_size;
-
-	if (sp->user_datafmt == SGILOGDATAFMT_RAW)
-		tp = (uint32*) bp;
-	else {
-		tp = (uint32*) sp->tbuf;
-		if(sp->tbuflen < npixels) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-						 "Translation buffer too short");
-			return (0);
-		}
-		(*sp->tfunc)(sp, bp, npixels);
-	}
-	/* compress each byte string */
-	op = tif->tif_rawcp;
-	occ = tif->tif_rawdatasize - tif->tif_rawcc;
-	for (shft = 24; shft >= 0; shft -=8) {
-		for (i = 0; i < npixels; i += rc) {
-			if (occ < 4) {
-				tif->tif_rawcp = op;
-				tif->tif_rawcc = tif->tif_rawdatasize - occ;
-				if (!TIFFFlushData1(tif))
-					return (0);
-				op = tif->tif_rawcp;
-				occ = tif->tif_rawdatasize - tif->tif_rawcc;
-			}
-			mask = 0xff << shft;		/* find next run */
-			for (beg = i; beg < npixels; beg += rc) {
-				b = tp[beg] & mask;
-				rc = 1;
-				while (rc < 127+2 && beg+rc < npixels &&
-						(tp[beg+rc] & mask) == b)
-					rc++;
-				if (rc >= MINRUN)
-					break;		/* long enough */
-			}
-			if (beg-i > 1 && beg-i < MINRUN) {
-				b = tp[i] & mask;	/* check short run */
-				j = i+1;
-				while ((tp[j++] & mask) == b)
-					if (j == beg) {
-						*op++ = (uint8)(128-2+j-i);
-						*op++ = (uint8)(b >> shft);
-						occ -= 2;
-						i = beg;
-						break;
-					}
-			}
-			while (i < beg) {		/* write out non-run */
-				if ((j = beg-i) > 127) j = 127;
-				if (occ < j+3) {
-					tif->tif_rawcp = op;
-					tif->tif_rawcc = tif->tif_rawdatasize - occ;
-					if (!TIFFFlushData1(tif))
-						return (0);
-					op = tif->tif_rawcp;
-					occ = tif->tif_rawdatasize - tif->tif_rawcc;
-				}
-				*op++ = (uint8) j; occ--;
-				while (j--) {
-					*op++ = (uint8)(tp[i++] >> shft & 0xff);
-					occ--;
-				}
-			}
-			if (rc >= MINRUN) {		/* write out run */
-				*op++ = (uint8) (128-2+rc);
-				*op++ = (uint8)(tp[beg] >> shft & 0xff);
-				occ -= 2;
-			} else
-				rc = 0;
-		}
-	}
-	tif->tif_rawcp = op;
-	tif->tif_rawcc = tif->tif_rawdatasize - occ;
-
-	return (1);
+    static const char module[] = "LogLuvEncode32";
+    LogLuvState *sp = EncoderState(tif);
+    int shft;
+    tmsize_t i;
+    tmsize_t j;
+    tmsize_t npixels;
+    uint8_t *op;
+    uint32_t *tp;
+    uint32_t b;
+    tmsize_t occ;
+    int rc = 0;
+    tmsize_t beg;
+
+    (void)s;
+    assert(s == 0);
+    assert(sp != NULL);
+
+    npixels = cc / sp->pixel_size;
+
+    if (sp->user_datafmt == SGILOGDATAFMT_RAW)
+        tp = (uint32_t *)bp;
+    else
+    {
+        tp = (uint32_t *)sp->tbuf;
+        if (sp->tbuflen < npixels)
+        {
+            TIFFErrorExtR(tif, module, "Translation buffer too short");
+            return (0);
+        }
+        (*sp->tfunc)(sp, bp, npixels);
+    }
+    /* compress each byte string */
+    op = tif->tif_rawcp;
+    occ = tif->tif_rawdatasize - tif->tif_rawcc;
+    for (shft = 24; shft >= 0; shft -= 8)
+    {
+        const uint32_t mask = 0xffU << shft; /* find next run */
+        for (i = 0; i < npixels; i += rc)
+        {
+            if (occ < 4)
+            {
+                tif->tif_rawcp = op;
+                tif->tif_rawcc = tif->tif_rawdatasize - occ;
+                if (!TIFFFlushData1(tif))
+                    return (0);
+                op = tif->tif_rawcp;
+                occ = tif->tif_rawdatasize - tif->tif_rawcc;
+            }
+            for (beg = i; beg < npixels; beg += rc)
+            {
+                b = tp[beg] & mask;
+                rc = 1;
+                while (rc < 127 + 2 && beg + rc < npixels &&
+                       (tp[beg + rc] & mask) == b)
+                    rc++;
+                if (rc >= MINRUN)
+                    break; /* long enough */
+            }
+            if (beg - i > 1 && beg - i < MINRUN)
+            {
+                b = tp[i] & mask; /* check short run */
+                j = i + 1;
+                while ((tp[j++] & mask) == b)
+                    if (j == beg)
+                    {
+                        *op++ = (uint8_t)(128 - 2 + j - i);
+                        *op++ = (uint8_t)(b >> shft);
+                        occ -= 2;
+                        i = beg;
+                        break;
+                    }
+            }
+            while (i < beg)
+            { /* write out non-run */
+                if ((j = beg - i) > 127)
+                    j = 127;
+                if (occ < j + 3)
+                {
+                    tif->tif_rawcp = op;
+                    tif->tif_rawcc = tif->tif_rawdatasize - occ;
+                    if (!TIFFFlushData1(tif))
+                        return (0);
+                    op = tif->tif_rawcp;
+                    occ = tif->tif_rawdatasize - tif->tif_rawcc;
+                }
+                *op++ = (uint8_t)j;
+                occ--;
+                while (j--)
+                {
+                    *op++ = (uint8_t)(tp[i++] >> shft & 0xff);
+                    occ--;
+                }
+            }
+            if (rc >= MINRUN)
+            { /* write out run */
+                *op++ = (uint8_t)(128 - 2 + rc);
+                *op++ = (uint8_t)(tp[beg] >> shft & 0xff);
+                occ -= 2;
+            }
+            else
+                rc = 0;
+        }
+    }
+    tif->tif_rawcp = op;
+    tif->tif_rawcc = tif->tif_rawdatasize - occ;
+
+    return (1);
 }
 
 /*
  * Encode a strip of pixels.  We break it into rows to
  * avoid encoding runs across row boundaries.
  */
-static int
-LogLuvEncodeStrip(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int LogLuvEncodeStrip(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	tmsize_t rowlen = TIFFScanlineSize(tif);
-
-        if (rowlen == 0)
-                return 0;
-
-	assert(cc%rowlen == 0);
-	while (cc && (*tif->tif_encoderow)(tif, bp, rowlen, s) == 1) {
-		bp += rowlen;
-		cc -= rowlen;
-	}
-	return (cc == 0);
+    tmsize_t rowlen = TIFFScanlineSize(tif);
+
+    if (rowlen == 0)
+        return 0;
+
+    assert(cc % rowlen == 0);
+    while (cc && (*tif->tif_encoderow)(tif, bp, rowlen, s) == 1)
+    {
+        bp += rowlen;
+        cc -= rowlen;
+    }
+    return (cc == 0);
 }
 
 /*
  * Encode a tile of pixels.  We break it into rows to
  * avoid encoding runs across row boundaries.
  */
-static int
-LogLuvEncodeTile(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int LogLuvEncodeTile(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	tmsize_t rowlen = TIFFTileRowSize(tif);
-
-        if (rowlen == 0)
-                return 0;
-
-	assert(cc%rowlen == 0);
-	while (cc && (*tif->tif_encoderow)(tif, bp, rowlen, s) == 1) {
-		bp += rowlen;
-		cc -= rowlen;
-	}
-	return (cc == 0);
+    tmsize_t rowlen = TIFFTileRowSize(tif);
+
+    if (rowlen == 0)
+        return 0;
+
+    assert(cc % rowlen == 0);
+    while (cc && (*tif->tif_encoderow)(tif, bp, rowlen, s) == 1)
+    {
+        bp += rowlen;
+        cc -= rowlen;
+    }
+    return (cc == 0);
 }
 
 /*
@@ -732,190 +754,192 @@ LogLuvEncodeTile(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
 #include "uvcode.h"
 
 #ifndef UVSCALE
-#define U_NEU		0.210526316
-#define V_NEU		0.473684211
-#define UVSCALE		410.
+#define U_NEU 0.210526316
+#define V_NEU 0.473684211
+#define UVSCALE 410.
 #endif
 
-#ifndef	M_LN2
-#define M_LN2		0.69314718055994530942
+#ifndef M_LN2
+#define M_LN2 0.69314718055994530942
 #endif
 #ifndef M_PI
-#define M_PI		3.14159265358979323846
+#define M_PI 3.14159265358979323846
 #endif
 #undef log2 /* Conflict with C'99 function */
-#define log2(x)		((1./M_LN2)*log(x))
-#undef exp2  /* Conflict with C'99 function */
-#define exp2(x)		exp(M_LN2*(x))
+#define log2(x) ((1. / M_LN2) * log(x))
+#undef exp2 /* Conflict with C'99 function */
+#define exp2(x) exp(M_LN2 *(x))
 
 static int tiff_itrunc(double x, int m)
 {
-    if( m == SGILOGENCODE_NODITHER )
+    if (m == SGILOGENCODE_NODITHER)
         return (int)x;
     /* Silence CoverityScan warning about bad crypto function */
     /* coverity[dont_call] */
-    return (int)(x + rand()*(1./RAND_MAX) - .5);
+    return (int)(x + rand() * (1. / RAND_MAX) - .5);
 }
 
 #if !LOGLUV_PUBLIC
 static
 #endif
-double
-LogL16toY(int p16)		/* compute luminance from 16-bit LogL */
+    double
+    LogL16toY(int p16) /* compute luminance from 16-bit LogL */
 {
-	int	Le = p16 & 0x7fff;
-	double	Y;
+    int Le = p16 & 0x7fff;
+    double Y;
 
-	if (!Le)
-		return (0.);
-	Y = exp(M_LN2/256.*(Le+.5) - M_LN2*64.);
-	return (!(p16 & 0x8000) ? Y : -Y);
+    if (!Le)
+        return (0.);
+    Y = exp(M_LN2 / 256. * (Le + .5) - M_LN2 * 64.);
+    return (!(p16 & 0x8000) ? Y : -Y);
 }
 
 #if !LOGLUV_PUBLIC
 static
 #endif
-int
-LogL16fromY(double Y, int em)	/* get 16-bit LogL from Y */
+    int
+    LogL16fromY(double Y, int em) /* get 16-bit LogL from Y */
 {
-	if (Y >= 1.8371976e19)
-		return (0x7fff);
-	if (Y <= -1.8371976e19)
-		return (0xffff);
-	if (Y > 5.4136769e-20)
-		return tiff_itrunc(256.*(log2(Y) + 64.), em);
-	if (Y < -5.4136769e-20)
-		return (~0x7fff | tiff_itrunc(256.*(log2(-Y) + 64.), em));
-	return (0);
+    if (Y >= 1.8371976e19)
+        return (0x7fff);
+    if (Y <= -1.8371976e19)
+        return (0xffff);
+    if (Y > 5.4136769e-20)
+        return tiff_itrunc(256. * (log2(Y) + 64.), em);
+    if (Y < -5.4136769e-20)
+        return (~0x7fff | tiff_itrunc(256. * (log2(-Y) + 64.), em));
+    return (0);
 }
 
-static void
-L16toY(LogLuvState* sp, uint8* op, tmsize_t n)
+static void L16toY(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	int16* l16 = (int16*) sp->tbuf;
-	float* yp = (float*) op;
+    int16_t *l16 = (int16_t *)sp->tbuf;
+    float *yp = (float *)op;
 
-	while (n-- > 0)
-		*yp++ = (float)LogL16toY(*l16++);
+    while (n-- > 0)
+        *yp++ = (float)LogL16toY(*l16++);
 }
 
-static void
-L16toGry(LogLuvState* sp, uint8* op, tmsize_t n)
+static void L16toGry(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	int16* l16 = (int16*) sp->tbuf;
-	uint8* gp = (uint8*) op;
-
-	while (n-- > 0) {
-		double Y = LogL16toY(*l16++);
-		*gp++ = (uint8) ((Y <= 0.) ? 0 : (Y >= 1.) ? 255 : (int)(256.*sqrt(Y)));
-	}
+    int16_t *l16 = (int16_t *)sp->tbuf;
+    uint8_t *gp = (uint8_t *)op;
+
+    while (n-- > 0)
+    {
+        double Y = LogL16toY(*l16++);
+        *gp++ = (uint8_t)((Y <= 0.)   ? 0
+                          : (Y >= 1.) ? 255
+                                      : (int)(256. * sqrt(Y)));
+    }
 }
 
-static void
-L16fromY(LogLuvState* sp, uint8* op, tmsize_t n)
+static void L16fromY(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	int16* l16 = (int16*) sp->tbuf;
-	float* yp = (float*) op;
+    int16_t *l16 = (int16_t *)sp->tbuf;
+    float *yp = (float *)op;
 
-	while (n-- > 0)
-		*l16++ = (int16) (LogL16fromY(*yp++, sp->encode_meth));
+    while (n-- > 0)
+        *l16++ = (int16_t)(LogL16fromY(*yp++, sp->encode_meth));
 }
 
 #if !LOGLUV_PUBLIC
 static
 #endif
-void
-XYZtoRGB24(float xyz[3], uint8 rgb[3])
+    void
+    XYZtoRGB24(float *xyz, uint8_t *rgb)
 {
-	double	r, g, b;
-					/* assume CCIR-709 primaries */
-	r =  2.690*xyz[0] + -1.276*xyz[1] + -0.414*xyz[2];
-	g = -1.022*xyz[0] +  1.978*xyz[1] +  0.044*xyz[2];
-	b =  0.061*xyz[0] + -0.224*xyz[1] +  1.163*xyz[2];
-					/* assume 2.0 gamma for speed */
-	/* could use integer sqrt approx., but this is probably faster */
-	rgb[0] = (uint8)((r<=0.) ? 0 : (r >= 1.) ? 255 : (int)(256.*sqrt(r)));
-	rgb[1] = (uint8)((g<=0.) ? 0 : (g >= 1.) ? 255 : (int)(256.*sqrt(g)));
-	rgb[2] = (uint8)((b<=0.) ? 0 : (b >= 1.) ? 255 : (int)(256.*sqrt(b)));
+    double r, g, b;
+    /* assume CCIR-709 primaries */
+    r = 2.690 * xyz[0] + -1.276 * xyz[1] + -0.414 * xyz[2];
+    g = -1.022 * xyz[0] + 1.978 * xyz[1] + 0.044 * xyz[2];
+    b = 0.061 * xyz[0] + -0.224 * xyz[1] + 1.163 * xyz[2];
+    /* assume 2.0 gamma for speed */
+    /* could use integer sqrt approx., but this is probably faster */
+    rgb[0] = (uint8_t)((r <= 0.) ? 0 : (r >= 1.) ? 255 : (int)(256. * sqrt(r)));
+    rgb[1] = (uint8_t)((g <= 0.) ? 0 : (g >= 1.) ? 255 : (int)(256. * sqrt(g)));
+    rgb[2] = (uint8_t)((b <= 0.) ? 0 : (b >= 1.) ? 255 : (int)(256. * sqrt(b)));
 }
 
 #if !LOGLUV_PUBLIC
 static
 #endif
-double
-LogL10toY(int p10)		/* compute luminance from 10-bit LogL */
+    double
+    LogL10toY(int p10) /* compute luminance from 10-bit LogL */
 {
-	if (p10 == 0)
-		return (0.);
-	return (exp(M_LN2/64.*(p10+.5) - M_LN2*12.));
+    if (p10 == 0)
+        return (0.);
+    return (exp(M_LN2 / 64. * (p10 + .5) - M_LN2 * 12.));
 }
 
 #if !LOGLUV_PUBLIC
 static
 #endif
-int
-LogL10fromY(double Y, int em)	/* get 10-bit LogL from Y */
+    int
+    LogL10fromY(double Y, int em) /* get 10-bit LogL from Y */
 {
-	if (Y >= 15.742)
-		return (0x3ff);
-	else if (Y <= .00024283)
-		return (0);
-	else
-		return tiff_itrunc(64.*(log2(Y) + 12.), em);
+    if (Y >= 15.742)
+        return (0x3ff);
+    else if (Y <= .00024283)
+        return (0);
+    else
+        return tiff_itrunc(64. * (log2(Y) + 12.), em);
 }
 
-#define NANGLES		100
-#define uv2ang(u, v)	( (NANGLES*.499999999/M_PI) \
-				* atan2((v)-V_NEU,(u)-U_NEU) + .5*NANGLES )
+#define NANGLES 100
+#define uv2ang(u, v)                                                           \
+    ((NANGLES * .499999999 / M_PI) * atan2((v)-V_NEU, (u)-U_NEU) + .5 * NANGLES)
 
-static int
-oog_encode(double u, double v)		/* encode out-of-gamut chroma */
+static int oog_encode(double u, double v) /* encode out-of-gamut chroma */
 {
-	static int	oog_table[NANGLES];
-	static int	initialized = 0;
-	register int	i;
-
-	if (!initialized) {		/* set up perimeter table */
-		double	eps[NANGLES], ua, va, ang, epsa;
-		int	ui, vi, ustep;
-		for (i = NANGLES; i--; )
-			eps[i] = 2.;
-		for (vi = UV_NVS; vi--; ) {
-			va = UV_VSTART + (vi+.5)*UV_SQSIZ;
-			ustep = uv_row[vi].nus-1;
-			if (vi == UV_NVS-1 || vi == 0 || ustep <= 0)
-				ustep = 1;
-			for (ui = uv_row[vi].nus-1; ui >= 0; ui -= ustep) {
-				ua = uv_row[vi].ustart + (ui+.5)*UV_SQSIZ;
-				ang = uv2ang(ua, va);
-				i = (int) ang;
-				epsa = fabs(ang - (i+.5));
-				if (epsa < eps[i]) {
-					oog_table[i] = uv_row[vi].ncum + ui;
-					eps[i] = epsa;
-				}
-			}
-		}
-		for (i = NANGLES; i--; )	/* fill any holes */
-			if (eps[i] > 1.5) {
-				int	i1, i2;
-				for (i1 = 1; i1 < NANGLES/2; i1++)
-					if (eps[(i+i1)%NANGLES] < 1.5)
-						break;
-				for (i2 = 1; i2 < NANGLES/2; i2++)
-					if (eps[(i+NANGLES-i2)%NANGLES] < 1.5)
-						break;
-				if (i1 < i2)
-					oog_table[i] =
-						oog_table[(i+i1)%NANGLES];
-				else
-					oog_table[i] =
-						oog_table[(i+NANGLES-i2)%NANGLES];
-			}
-		initialized = 1;
-	}
-	i = (int) uv2ang(u, v);		/* look up hue angle */
-	return (oog_table[i]);
+    static int oog_table[NANGLES];
+    static int initialized = 0;
+    register int i;
+
+    if (!initialized)
+    { /* set up perimeter table */
+        double eps[NANGLES], ua, va, ang, epsa;
+        int ui, vi, ustep;
+        for (i = NANGLES; i--;)
+            eps[i] = 2.;
+        for (vi = UV_NVS; vi--;)
+        {
+            va = UV_VSTART + (vi + .5) * UV_SQSIZ;
+            ustep = uv_row[vi].nus - 1;
+            if (vi == UV_NVS - 1 || vi == 0 || ustep <= 0)
+                ustep = 1;
+            for (ui = uv_row[vi].nus - 1; ui >= 0; ui -= ustep)
+            {
+                ua = uv_row[vi].ustart + (ui + .5) * UV_SQSIZ;
+                ang = uv2ang(ua, va);
+                i = (int)ang;
+                epsa = fabs(ang - (i + .5));
+                if (epsa < eps[i])
+                {
+                    oog_table[i] = uv_row[vi].ncum + ui;
+                    eps[i] = epsa;
+                }
+            }
+        }
+        for (i = NANGLES; i--;) /* fill any holes */
+            if (eps[i] > 1.5)
+            {
+                int i1, i2;
+                for (i1 = 1; i1 < NANGLES / 2; i1++)
+                    if (eps[(i + i1) % NANGLES] < 1.5)
+                        break;
+                for (i2 = 1; i2 < NANGLES / 2; i2++)
+                    if (eps[(i + NANGLES - i2) % NANGLES] < 1.5)
+                        break;
+                if (i1 < i2)
+                    oog_table[i] = oog_table[(i + i1) % NANGLES];
+                else
+                    oog_table[i] = oog_table[(i + NANGLES - i2) % NANGLES];
+            }
+        initialized = 1;
+    }
+    i = (int)uv2ang(u, v); /* look up hue angle */
+    return (oog_table[i]);
 }
 
 #undef uv2ang
@@ -924,847 +948,891 @@ oog_encode(double u, double v)		/* encode out-of-gamut chroma */
 #if !LOGLUV_PUBLIC
 static
 #endif
-int
-uv_encode(double u, double v, int em)	/* encode (u',v') coordinates */
+    int
+    uv_encode(double u, double v, int em) /* encode (u',v') coordinates */
 {
-	register int	vi, ui;
-
-	if (v < UV_VSTART)
-		return oog_encode(u, v);
-	vi = tiff_itrunc((v - UV_VSTART)*(1./UV_SQSIZ), em);
-	if (vi >= UV_NVS)
-		return oog_encode(u, v);
-	if (u < uv_row[vi].ustart)
-		return oog_encode(u, v);
-	ui = tiff_itrunc((u - uv_row[vi].ustart)*(1./UV_SQSIZ), em);
-	if (ui >= uv_row[vi].nus)
-		return oog_encode(u, v);
-
-	return (uv_row[vi].ncum + ui);
+    register int vi, ui;
+
+    /* check for NaN */
+    if (u != u || v != v)
+    {
+        u = U_NEU;
+        v = V_NEU;
+    }
+
+    if (v < UV_VSTART)
+        return oog_encode(u, v);
+    vi = tiff_itrunc((v - UV_VSTART) * (1. / UV_SQSIZ), em);
+    if (vi >= UV_NVS)
+        return oog_encode(u, v);
+    if (u < uv_row[vi].ustart)
+        return oog_encode(u, v);
+    ui = tiff_itrunc((u - uv_row[vi].ustart) * (1. / UV_SQSIZ), em);
+    if (ui >= uv_row[vi].nus)
+        return oog_encode(u, v);
+
+    return (uv_row[vi].ncum + ui);
 }
 
 #if !LOGLUV_PUBLIC
 static
 #endif
-int
-uv_decode(double *up, double *vp, int c)	/* decode (u',v') index */
+    int
+    uv_decode(double *up, double *vp, int c) /* decode (u',v') index */
 {
-	int	upper, lower;
-	register int	ui, vi;
-
-	if (c < 0 || c >= UV_NDIVS)
-		return (-1);
-	lower = 0;				/* binary search */
-	upper = UV_NVS;
-	while (upper - lower > 1) {
-		vi = (lower + upper) >> 1;
-		ui = c - uv_row[vi].ncum;
-		if (ui > 0)
-			lower = vi;
-		else if (ui < 0)
-			upper = vi;
-		else {
-			lower = vi;
-			break;
-		}
-	}
-	vi = lower;
-	ui = c - uv_row[vi].ncum;
-	*up = uv_row[vi].ustart + (ui+.5)*UV_SQSIZ;
-	*vp = UV_VSTART + (vi+.5)*UV_SQSIZ;
-	return (0);
+    int upper, lower;
+    register int ui, vi;
+
+    if (c < 0 || c >= UV_NDIVS)
+        return (-1);
+    lower = 0; /* binary search */
+    upper = UV_NVS;
+    while (upper - lower > 1)
+    {
+        vi = (lower + upper) >> 1;
+        ui = c - uv_row[vi].ncum;
+        if (ui > 0)
+            lower = vi;
+        else if (ui < 0)
+            upper = vi;
+        else
+        {
+            lower = vi;
+            break;
+        }
+    }
+    vi = lower;
+    ui = c - uv_row[vi].ncum;
+    *up = uv_row[vi].ustart + (ui + .5) * UV_SQSIZ;
+    *vp = UV_VSTART + (vi + .5) * UV_SQSIZ;
+    return (0);
 }
 
 #if !LOGLUV_PUBLIC
 static
 #endif
-void
-LogLuv24toXYZ(uint32 p, float XYZ[3])
+    void
+    LogLuv24toXYZ(uint32_t p, float *XYZ)
 {
-	int	Ce;
-	double	L, u, v, s, x, y;
-					/* decode luminance */
-	L = LogL10toY(p>>14 & 0x3ff);
-	if (L <= 0.) {
-		XYZ[0] = XYZ[1] = XYZ[2] = 0.;
-		return;
-	}
-					/* decode color */
-	Ce = p & 0x3fff;
-	if (uv_decode(&u, &v, Ce) < 0) {
-		u = U_NEU; v = V_NEU;
-	}
-	s = 1./(6.*u - 16.*v + 12.);
-	x = 9.*u * s;
-	y = 4.*v * s;
-					/* convert to XYZ */
-	XYZ[0] = (float)(x/y * L);
-	XYZ[1] = (float)L;
-	XYZ[2] = (float)((1.-x-y)/y * L);
+    int Ce;
+    double L, u, v, s, x, y;
+    /* decode luminance */
+    L = LogL10toY(p >> 14 & 0x3ff);
+    if (L <= 0.)
+    {
+        XYZ[0] = XYZ[1] = XYZ[2] = 0.;
+        return;
+    }
+    /* decode color */
+    Ce = p & 0x3fff;
+    if (uv_decode(&u, &v, Ce) < 0)
+    {
+        u = U_NEU;
+        v = V_NEU;
+    }
+    s = 1. / (6. * u - 16. * v + 12.);
+    x = 9. * u * s;
+    y = 4. * v * s;
+    /* convert to XYZ */
+    XYZ[0] = (float)(x / y * L);
+    XYZ[1] = (float)L;
+    XYZ[2] = (float)((1. - x - y) / y * L);
 }
 
 #if !LOGLUV_PUBLIC
 static
 #endif
-uint32
-LogLuv24fromXYZ(float XYZ[3], int em)
+    uint32_t
+    LogLuv24fromXYZ(float *XYZ, int em)
 {
-	int	Le, Ce;
-	double	u, v, s;
-					/* encode luminance */
-	Le = LogL10fromY(XYZ[1], em);
-					/* encode color */
-	s = XYZ[0] + 15.*XYZ[1] + 3.*XYZ[2];
-	if (!Le || s <= 0.) {
-		u = U_NEU;
-		v = V_NEU;
-	} else {
-		u = 4.*XYZ[0] / s;
-		v = 9.*XYZ[1] / s;
-	}
-	Ce = uv_encode(u, v, em);
-	if (Ce < 0)			/* never happens */
-		Ce = uv_encode(U_NEU, V_NEU, SGILOGENCODE_NODITHER);
-					/* combine encodings */
-	return (Le << 14 | Ce);
+    int Le, Ce;
+    double u, v, s;
+    /* encode luminance */
+    Le = LogL10fromY(XYZ[1], em);
+    /* encode color */
+    s = XYZ[0] + 15. * XYZ[1] + 3. * XYZ[2];
+    if (!Le || s <= 0.)
+    {
+        u = U_NEU;
+        v = V_NEU;
+    }
+    else
+    {
+        u = 4. * XYZ[0] / s;
+        v = 9. * XYZ[1] / s;
+    }
+    Ce = uv_encode(u, v, em);
+    if (Ce < 0) /* never happens */
+        Ce = uv_encode(U_NEU, V_NEU, SGILOGENCODE_NODITHER);
+    /* combine encodings */
+    return (Le << 14 | Ce);
 }
 
-static void
-Luv24toXYZ(LogLuvState* sp, uint8* op, tmsize_t n)
+static void Luv24toXYZ(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	uint32* luv = (uint32*) sp->tbuf;  
-	float* xyz = (float*) op;
-
-	while (n-- > 0) {
-		LogLuv24toXYZ(*luv, xyz);
-		xyz += 3;
-		luv++;
-	}
+    uint32_t *luv = (uint32_t *)sp->tbuf;
+    float *xyz = (float *)op;
+
+    while (n-- > 0)
+    {
+        LogLuv24toXYZ(*luv, xyz);
+        xyz += 3;
+        luv++;
+    }
 }
 
-static void
-Luv24toLuv48(LogLuvState* sp, uint8* op, tmsize_t n)
+static void Luv24toLuv48(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	uint32* luv = (uint32*) sp->tbuf;  
-	int16* luv3 = (int16*) op;
-
-	while (n-- > 0) {
-		double u, v;
-
-		*luv3++ = (int16)((*luv >> 12 & 0xffd) + 13314);
-		if (uv_decode(&u, &v, *luv&0x3fff) < 0) {
-			u = U_NEU;
-			v = V_NEU;
-		}
-		*luv3++ = (int16)(u * (1L<<15));
-		*luv3++ = (int16)(v * (1L<<15));
-		luv++;
-	}
+    uint32_t *luv = (uint32_t *)sp->tbuf;
+    int16_t *luv3 = (int16_t *)op;
+
+    while (n-- > 0)
+    {
+        double u, v;
+
+        *luv3++ = (int16_t)((*luv >> 12 & 0xffd) + 13314);
+        if (uv_decode(&u, &v, *luv & 0x3fff) < 0)
+        {
+            u = U_NEU;
+            v = V_NEU;
+        }
+        *luv3++ = (int16_t)(u * (1L << 15));
+        *luv3++ = (int16_t)(v * (1L << 15));
+        luv++;
+    }
 }
 
-static void
-Luv24toRGB(LogLuvState* sp, uint8* op, tmsize_t n)
+static void Luv24toRGB(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	uint32* luv = (uint32*) sp->tbuf;  
-	uint8* rgb = (uint8*) op;
+    uint32_t *luv = (uint32_t *)sp->tbuf;
+    uint8_t *rgb = (uint8_t *)op;
 
-	while (n-- > 0) {
-		float xyz[3];
+    while (n-- > 0)
+    {
+        float xyz[3];
 
-		LogLuv24toXYZ(*luv++, xyz);
-		XYZtoRGB24(xyz, rgb);
-		rgb += 3;
-	}
+        LogLuv24toXYZ(*luv++, xyz);
+        XYZtoRGB24(xyz, rgb);
+        rgb += 3;
+    }
 }
 
-static void
-Luv24fromXYZ(LogLuvState* sp, uint8* op, tmsize_t n)
+static void Luv24fromXYZ(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	uint32* luv = (uint32*) sp->tbuf;  
-	float* xyz = (float*) op;
-
-	while (n-- > 0) {
-		*luv++ = LogLuv24fromXYZ(xyz, sp->encode_meth);
-		xyz += 3;
-	}
+    uint32_t *luv = (uint32_t *)sp->tbuf;
+    float *xyz = (float *)op;
+
+    while (n-- > 0)
+    {
+        *luv++ = LogLuv24fromXYZ(xyz, sp->encode_meth);
+        xyz += 3;
+    }
 }
 
-static void
-Luv24fromLuv48(LogLuvState* sp, uint8* op, tmsize_t n)
+static void Luv24fromLuv48(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	uint32* luv = (uint32*) sp->tbuf;  
-	int16* luv3 = (int16*) op;
-
-	while (n-- > 0) {
-		int Le, Ce;
-
-		if (luv3[0] <= 0)
-			Le = 0;
-		else if (luv3[0] >= (1<<12)+3314)
-			Le = (1<<10) - 1;
-		else if (sp->encode_meth == SGILOGENCODE_NODITHER)
-			Le = (luv3[0]-3314) >> 2;
-		else
-			Le = tiff_itrunc(.25*(luv3[0]-3314.), sp->encode_meth);
-
-		Ce = uv_encode((luv3[1]+.5)/(1<<15), (luv3[2]+.5)/(1<<15),
-					sp->encode_meth);
-		if (Ce < 0)	/* never happens */
-			Ce = uv_encode(U_NEU, V_NEU, SGILOGENCODE_NODITHER);
-		*luv++ = (uint32)Le << 14 | Ce;
-		luv3 += 3;
-	}
+    uint32_t *luv = (uint32_t *)sp->tbuf;
+    int16_t *luv3 = (int16_t *)op;
+
+    while (n-- > 0)
+    {
+        int Le, Ce;
+
+        if (luv3[0] <= 0)
+            Le = 0;
+        else if (luv3[0] >= (1 << 12) + 3314)
+            Le = (1 << 10) - 1;
+        else if (sp->encode_meth == SGILOGENCODE_NODITHER)
+            Le = (luv3[0] - 3314) >> 2;
+        else
+            Le = tiff_itrunc(.25 * (luv3[0] - 3314.), sp->encode_meth);
+
+        Ce = uv_encode((luv3[1] + .5) / (1 << 15), (luv3[2] + .5) / (1 << 15),
+                       sp->encode_meth);
+        if (Ce < 0) /* never happens */
+            Ce = uv_encode(U_NEU, V_NEU, SGILOGENCODE_NODITHER);
+        *luv++ = (uint32_t)Le << 14 | Ce;
+        luv3 += 3;
+    }
 }
 
 #if !LOGLUV_PUBLIC
 static
 #endif
-void
-LogLuv32toXYZ(uint32 p, float XYZ[3])
+    void
+    LogLuv32toXYZ(uint32_t p, float *XYZ)
 {
-	double	L, u, v, s, x, y;
-					/* decode luminance */
-	L = LogL16toY((int)p >> 16);
-	if (L <= 0.) {
-		XYZ[0] = XYZ[1] = XYZ[2] = 0.;
-		return;
-	}
-					/* decode color */
-	u = 1./UVSCALE * ((p>>8 & 0xff) + .5);
-	v = 1./UVSCALE * ((p & 0xff) + .5);
-	s = 1./(6.*u - 16.*v + 12.);
-	x = 9.*u * s;
-	y = 4.*v * s;
-					/* convert to XYZ */
-	XYZ[0] = (float)(x/y * L);
-	XYZ[1] = (float)L;
-	XYZ[2] = (float)((1.-x-y)/y * L);
+    double L, u, v, s, x, y;
+    /* decode luminance */
+    L = LogL16toY((int)p >> 16);
+    if (L <= 0.)
+    {
+        XYZ[0] = XYZ[1] = XYZ[2] = 0.;
+        return;
+    }
+    /* decode color */
+    u = 1. / UVSCALE * ((p >> 8 & 0xff) + .5);
+    v = 1. / UVSCALE * ((p & 0xff) + .5);
+    s = 1. / (6. * u - 16. * v + 12.);
+    x = 9. * u * s;
+    y = 4. * v * s;
+    /* convert to XYZ */
+    XYZ[0] = (float)(x / y * L);
+    XYZ[1] = (float)L;
+    XYZ[2] = (float)((1. - x - y) / y * L);
 }
 
 #if !LOGLUV_PUBLIC
 static
 #endif
-uint32
-LogLuv32fromXYZ(float XYZ[3], int em)
+    uint32_t
+    LogLuv32fromXYZ(float *XYZ, int em)
 {
-	unsigned int	Le, ue, ve;
-	double	u, v, s;
-					/* encode luminance */
-	Le = (unsigned int)LogL16fromY(XYZ[1], em);
-					/* encode color */
-	s = XYZ[0] + 15.*XYZ[1] + 3.*XYZ[2];
-	if (!Le || s <= 0.) {
-		u = U_NEU;
-		v = V_NEU;
-	} else {
-		u = 4.*XYZ[0] / s;
-		v = 9.*XYZ[1] / s;
-	}
-	if (u <= 0.) ue = 0;
-	else ue = tiff_itrunc(UVSCALE*u, em);
-	if (ue > 255) ue = 255;
-	if (v <= 0.) ve = 0;
-	else ve = tiff_itrunc(UVSCALE*v, em);
-	if (ve > 255) ve = 255;
-					/* combine encodings */
-	return (Le << 16 | ue << 8 | ve);
+    unsigned int Le, ue, ve;
+    double u, v, s;
+    /* encode luminance */
+    Le = (unsigned int)LogL16fromY(XYZ[1], em);
+    /* encode color */
+    s = XYZ[0] + 15. * XYZ[1] + 3. * XYZ[2];
+    if (!Le || s <= 0.)
+    {
+        u = U_NEU;
+        v = V_NEU;
+    }
+    else
+    {
+        u = 4. * XYZ[0] / s;
+        v = 9. * XYZ[1] / s;
+    }
+    if (u <= 0.)
+        ue = 0;
+    else
+        ue = tiff_itrunc(UVSCALE * u, em);
+    if (ue > 255)
+        ue = 255;
+    if (v <= 0.)
+        ve = 0;
+    else
+        ve = tiff_itrunc(UVSCALE * v, em);
+    if (ve > 255)
+        ve = 255;
+    /* combine encodings */
+    return (Le << 16 | ue << 8 | ve);
 }
 
-static void
-Luv32toXYZ(LogLuvState* sp, uint8* op, tmsize_t n)
+static void Luv32toXYZ(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	uint32* luv = (uint32*) sp->tbuf;  
-	float* xyz = (float*) op;
-
-	while (n-- > 0) {
-		LogLuv32toXYZ(*luv++, xyz);
-		xyz += 3;
-	}
+    uint32_t *luv = (uint32_t *)sp->tbuf;
+    float *xyz = (float *)op;
+
+    while (n-- > 0)
+    {
+        LogLuv32toXYZ(*luv++, xyz);
+        xyz += 3;
+    }
 }
 
-static void
-Luv32toLuv48(LogLuvState* sp, uint8* op, tmsize_t n)
+static void Luv32toLuv48(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	uint32* luv = (uint32*) sp->tbuf;  
-	int16* luv3 = (int16*) op;
-
-	while (n-- > 0) {
-		double u, v;
-
-		*luv3++ = (int16)(*luv >> 16);
-		u = 1./UVSCALE * ((*luv>>8 & 0xff) + .5);
-		v = 1./UVSCALE * ((*luv & 0xff) + .5);
-		*luv3++ = (int16)(u * (1L<<15));
-		*luv3++ = (int16)(v * (1L<<15));
-		luv++;
-	}
+    uint32_t *luv = (uint32_t *)sp->tbuf;
+    int16_t *luv3 = (int16_t *)op;
+
+    while (n-- > 0)
+    {
+        double u, v;
+
+        *luv3++ = (int16_t)(*luv >> 16);
+        u = 1. / UVSCALE * ((*luv >> 8 & 0xff) + .5);
+        v = 1. / UVSCALE * ((*luv & 0xff) + .5);
+        *luv3++ = (int16_t)(u * (1L << 15));
+        *luv3++ = (int16_t)(v * (1L << 15));
+        luv++;
+    }
 }
 
-static void
-Luv32toRGB(LogLuvState* sp, uint8* op, tmsize_t n)
+static void Luv32toRGB(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	uint32* luv = (uint32*) sp->tbuf;  
-	uint8* rgb = (uint8*) op;
+    uint32_t *luv = (uint32_t *)sp->tbuf;
+    uint8_t *rgb = (uint8_t *)op;
 
-	while (n-- > 0) {
-		float xyz[3];
+    while (n-- > 0)
+    {
+        float xyz[3];
 
-		LogLuv32toXYZ(*luv++, xyz);
-		XYZtoRGB24(xyz, rgb);
-		rgb += 3;
-	}
+        LogLuv32toXYZ(*luv++, xyz);
+        XYZtoRGB24(xyz, rgb);
+        rgb += 3;
+    }
 }
 
-static void
-Luv32fromXYZ(LogLuvState* sp, uint8* op, tmsize_t n)
+static void Luv32fromXYZ(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	uint32* luv = (uint32*) sp->tbuf;  
-	float* xyz = (float*) op;
-
-	while (n-- > 0) {
-		*luv++ = LogLuv32fromXYZ(xyz, sp->encode_meth);
-		xyz += 3;
-	}
+    uint32_t *luv = (uint32_t *)sp->tbuf;
+    float *xyz = (float *)op;
+
+    while (n-- > 0)
+    {
+        *luv++ = LogLuv32fromXYZ(xyz, sp->encode_meth);
+        xyz += 3;
+    }
 }
 
-static void
-Luv32fromLuv48(LogLuvState* sp, uint8* op, tmsize_t n)
+static void Luv32fromLuv48(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	uint32* luv = (uint32*) sp->tbuf;
-	int16* luv3 = (int16*) op;
-
-	if (sp->encode_meth == SGILOGENCODE_NODITHER) {
-		while (n-- > 0) {
-			*luv++ = (uint32)luv3[0] << 16 |
-				(luv3[1]*(uint32)(UVSCALE+.5) >> 7 & 0xff00) |
-				(luv3[2]*(uint32)(UVSCALE+.5) >> 15 & 0xff);
-			luv3 += 3;
-		}
-		return;
-	}
-	while (n-- > 0) {
-		*luv++ = (uint32)luv3[0] << 16 |
-	(tiff_itrunc(luv3[1]*(UVSCALE/(1<<15)), sp->encode_meth) << 8 & 0xff00) |
-		(tiff_itrunc(luv3[2]*(UVSCALE/(1<<15)), sp->encode_meth) & 0xff);
-		luv3 += 3;
-	}
+    uint32_t *luv = (uint32_t *)sp->tbuf;
+    int16_t *luv3 = (int16_t *)op;
+
+    if (sp->encode_meth == SGILOGENCODE_NODITHER)
+    {
+        while (n-- > 0)
+        {
+            *luv++ = (uint32_t)luv3[0] << 16 |
+                     (luv3[1] * (uint32_t)(UVSCALE + .5) >> 7 & 0xff00) |
+                     (luv3[2] * (uint32_t)(UVSCALE + .5) >> 15 & 0xff);
+            luv3 += 3;
+        }
+        return;
+    }
+    while (n-- > 0)
+    {
+        *luv++ =
+            (uint32_t)luv3[0] << 16 |
+            (tiff_itrunc(luv3[1] * (UVSCALE / (1 << 15)), sp->encode_meth)
+                 << 8 &
+             0xff00) |
+            (tiff_itrunc(luv3[2] * (UVSCALE / (1 << 15)), sp->encode_meth) &
+             0xff);
+        luv3 += 3;
+    }
 }
 
-static void
-_logLuvNop(LogLuvState* sp, uint8* op, tmsize_t n)
+static void _logLuvNop(LogLuvState *sp, uint8_t *op, tmsize_t n)
 {
-	(void) sp; (void) op; (void) n;
+    (void)sp;
+    (void)op;
+    (void)n;
 }
 
-static int
-LogL16GuessDataFmt(TIFFDirectory *td)
+static int LogL16GuessDataFmt(TIFFDirectory *td)
 {
-#define	PACK(s,b,f)	(((b)<<6)|((s)<<3)|(f))
-	switch (PACK(td->td_samplesperpixel, td->td_bitspersample, td->td_sampleformat)) {
-	case PACK(1, 32, SAMPLEFORMAT_IEEEFP):
-		return (SGILOGDATAFMT_FLOAT);
-	case PACK(1, 16, SAMPLEFORMAT_VOID):
-	case PACK(1, 16, SAMPLEFORMAT_INT):
-	case PACK(1, 16, SAMPLEFORMAT_UINT):
-		return (SGILOGDATAFMT_16BIT);
-	case PACK(1,  8, SAMPLEFORMAT_VOID):
-	case PACK(1,  8, SAMPLEFORMAT_UINT):
-		return (SGILOGDATAFMT_8BIT);
-	}
+#define PACK(s, b, f) (((b) << 6) | ((s) << 3) | (f))
+    switch (
+        PACK(td->td_samplesperpixel, td->td_bitspersample, td->td_sampleformat))
+    {
+        case PACK(1, 32, SAMPLEFORMAT_IEEEFP):
+            return (SGILOGDATAFMT_FLOAT);
+        case PACK(1, 16, SAMPLEFORMAT_VOID):
+        case PACK(1, 16, SAMPLEFORMAT_INT):
+        case PACK(1, 16, SAMPLEFORMAT_UINT):
+            return (SGILOGDATAFMT_16BIT);
+        case PACK(1, 8, SAMPLEFORMAT_VOID):
+        case PACK(1, 8, SAMPLEFORMAT_UINT):
+            return (SGILOGDATAFMT_8BIT);
+    }
 #undef PACK
-	return (SGILOGDATAFMT_UNKNOWN);
+    return (SGILOGDATAFMT_UNKNOWN);
 }
 
-static tmsize_t
-multiply_ms(tmsize_t m1, tmsize_t m2)
+static tmsize_t multiply_ms(tmsize_t m1, tmsize_t m2)
 {
-        return _TIFFMultiplySSize(NULL, m1, m2, NULL);
+    return _TIFFMultiplySSize(NULL, m1, m2, NULL);
 }
 
-static int
-LogL16InitState(TIFF* tif)
+static int LogL16InitState(TIFF *tif)
 {
-	static const char module[] = "LogL16InitState";
-	TIFFDirectory *td = &tif->tif_dir;
-	LogLuvState* sp = DecoderState(tif);
-
-	assert(sp != NULL);
-	assert(td->td_photometric == PHOTOMETRIC_LOGL);
-
-	if( td->td_samplesperpixel != 1 )
-	{
-		TIFFErrorExt(tif->tif_clientdata, module,
-		             "Sorry, can not handle LogL image with %s=%d",
-			     "Samples/pixel", td->td_samplesperpixel);
-		return 0;
-	}
-
-	/* for some reason, we can't do this in TIFFInitLogL16 */
-	if (sp->user_datafmt == SGILOGDATAFMT_UNKNOWN)
-		sp->user_datafmt = LogL16GuessDataFmt(td);
-	switch (sp->user_datafmt) {
-	case SGILOGDATAFMT_FLOAT:
-		sp->pixel_size = sizeof (float);
-		break;
-	case SGILOGDATAFMT_16BIT:
-		sp->pixel_size = sizeof (int16);
-		break;
-	case SGILOGDATAFMT_8BIT:
-		sp->pixel_size = sizeof (uint8);
-		break;
-	default:
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "No support for converting user data format to LogL");
-		return (0);
-	}
-        if( isTiled(tif) )
-            sp->tbuflen = multiply_ms(td->td_tilewidth, td->td_tilelength);
-        else if( td->td_rowsperstrip < td->td_imagelength )
-            sp->tbuflen = multiply_ms(td->td_imagewidth, td->td_rowsperstrip);
-        else
-            sp->tbuflen = multiply_ms(td->td_imagewidth, td->td_imagelength);
-	if (multiply_ms(sp->tbuflen, sizeof (int16)) == 0 ||
-	    (sp->tbuf = (uint8*) _TIFFmalloc(sp->tbuflen * sizeof (int16))) == NULL) {
-		TIFFErrorExt(tif->tif_clientdata, module, "No space for SGILog translation buffer");
-		return (0);
-	}
-	return (1);
+    static const char module[] = "LogL16InitState";
+    TIFFDirectory *td = &tif->tif_dir;
+    LogLuvState *sp = DecoderState(tif);
+
+    assert(sp != NULL);
+    assert(td->td_photometric == PHOTOMETRIC_LOGL);
+
+    if (td->td_samplesperpixel != 1)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Sorry, can not handle LogL image with %s=%" PRIu16,
+                      "Samples/pixel", td->td_samplesperpixel);
+        return 0;
+    }
+
+    /* for some reason, we can't do this in TIFFInitLogL16 */
+    if (sp->user_datafmt == SGILOGDATAFMT_UNKNOWN)
+        sp->user_datafmt = LogL16GuessDataFmt(td);
+    switch (sp->user_datafmt)
+    {
+        case SGILOGDATAFMT_FLOAT:
+            sp->pixel_size = sizeof(float);
+            break;
+        case SGILOGDATAFMT_16BIT:
+            sp->pixel_size = sizeof(int16_t);
+            break;
+        case SGILOGDATAFMT_8BIT:
+            sp->pixel_size = sizeof(uint8_t);
+            break;
+        default:
+            TIFFErrorExtR(tif, module,
+                          "No support for converting user data format to LogL");
+            return (0);
+    }
+    if (isTiled(tif))
+        sp->tbuflen = multiply_ms(td->td_tilewidth, td->td_tilelength);
+    else if (td->td_rowsperstrip < td->td_imagelength)
+        sp->tbuflen = multiply_ms(td->td_imagewidth, td->td_rowsperstrip);
+    else
+        sp->tbuflen = multiply_ms(td->td_imagewidth, td->td_imagelength);
+    if (multiply_ms(sp->tbuflen, sizeof(int16_t)) == 0 ||
+        (sp->tbuf = (uint8_t *)_TIFFmallocExt(
+             tif, sp->tbuflen * sizeof(int16_t))) == NULL)
+    {
+        TIFFErrorExtR(tif, module, "No space for SGILog translation buffer");
+        return (0);
+    }
+    return (1);
 }
 
-static int
-LogLuvGuessDataFmt(TIFFDirectory *td)
+static int LogLuvGuessDataFmt(TIFFDirectory *td)
 {
-	int guess;
-
-	/*
-	 * If the user didn't tell us their datafmt,
-	 * take our best guess from the bitspersample.
-	 */
-#define	PACK(a,b)	(((a)<<3)|(b))
-	switch (PACK(td->td_bitspersample, td->td_sampleformat)) {
-	case PACK(32, SAMPLEFORMAT_IEEEFP):
-		guess = SGILOGDATAFMT_FLOAT;
-		break;
-	case PACK(32, SAMPLEFORMAT_VOID):
-	case PACK(32, SAMPLEFORMAT_UINT):
-	case PACK(32, SAMPLEFORMAT_INT):
-		guess = SGILOGDATAFMT_RAW;
-		break;
-	case PACK(16, SAMPLEFORMAT_VOID):
-	case PACK(16, SAMPLEFORMAT_INT):
-	case PACK(16, SAMPLEFORMAT_UINT):
-		guess = SGILOGDATAFMT_16BIT;
-		break;
-	case PACK( 8, SAMPLEFORMAT_VOID):
-	case PACK( 8, SAMPLEFORMAT_UINT):
-		guess = SGILOGDATAFMT_8BIT;
-		break;
-	default:
-		guess = SGILOGDATAFMT_UNKNOWN;
-		break;
+    int guess;
+
+    /*
+     * If the user didn't tell us their datafmt,
+     * take our best guess from the bitspersample.
+     */
+#define PACK(a, b) (((a) << 3) | (b))
+    switch (PACK(td->td_bitspersample, td->td_sampleformat))
+    {
+        case PACK(32, SAMPLEFORMAT_IEEEFP):
+            guess = SGILOGDATAFMT_FLOAT;
+            break;
+        case PACK(32, SAMPLEFORMAT_VOID):
+        case PACK(32, SAMPLEFORMAT_UINT):
+        case PACK(32, SAMPLEFORMAT_INT):
+            guess = SGILOGDATAFMT_RAW;
+            break;
+        case PACK(16, SAMPLEFORMAT_VOID):
+        case PACK(16, SAMPLEFORMAT_INT):
+        case PACK(16, SAMPLEFORMAT_UINT):
+            guess = SGILOGDATAFMT_16BIT;
+            break;
+        case PACK(8, SAMPLEFORMAT_VOID):
+        case PACK(8, SAMPLEFORMAT_UINT):
+            guess = SGILOGDATAFMT_8BIT;
+            break;
+        default:
+            guess = SGILOGDATAFMT_UNKNOWN;
+            break;
 #undef PACK
-	}
-	/*
-	 * Double-check samples per pixel.
-	 */
-	switch (td->td_samplesperpixel) {
-	case 1:
-		if (guess != SGILOGDATAFMT_RAW)
-			guess = SGILOGDATAFMT_UNKNOWN;
-		break;
-	case 3:
-		if (guess == SGILOGDATAFMT_RAW)
-			guess = SGILOGDATAFMT_UNKNOWN;
-		break;
-	default:
-		guess = SGILOGDATAFMT_UNKNOWN;
-		break;
-	}
-	return (guess);
+    }
+    /*
+     * Double-check samples per pixel.
+     */
+    switch (td->td_samplesperpixel)
+    {
+        case 1:
+            if (guess != SGILOGDATAFMT_RAW)
+                guess = SGILOGDATAFMT_UNKNOWN;
+            break;
+        case 3:
+            if (guess == SGILOGDATAFMT_RAW)
+                guess = SGILOGDATAFMT_UNKNOWN;
+            break;
+        default:
+            guess = SGILOGDATAFMT_UNKNOWN;
+            break;
+    }
+    return (guess);
 }
 
-static int
-LogLuvInitState(TIFF* tif)
+static int LogLuvInitState(TIFF *tif)
 {
-	static const char module[] = "LogLuvInitState";
-	TIFFDirectory* td = &tif->tif_dir;
-	LogLuvState* sp = DecoderState(tif);
-
-	assert(sp != NULL);
-	assert(td->td_photometric == PHOTOMETRIC_LOGLUV);
-
-	/* for some reason, we can't do this in TIFFInitLogLuv */
-	if (td->td_planarconfig != PLANARCONFIG_CONTIG) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "SGILog compression cannot handle non-contiguous data");
-		return (0);
-	}
-	if (sp->user_datafmt == SGILOGDATAFMT_UNKNOWN)
-		sp->user_datafmt = LogLuvGuessDataFmt(td);
-	switch (sp->user_datafmt) {
-	case SGILOGDATAFMT_FLOAT:
-		sp->pixel_size = 3*sizeof (float);
-		break;
-	case SGILOGDATAFMT_16BIT:
-		sp->pixel_size = 3*sizeof (int16);
-		break;
-	case SGILOGDATAFMT_RAW:
-		sp->pixel_size = sizeof (uint32);
-		break;
-	case SGILOGDATAFMT_8BIT:
-		sp->pixel_size = 3*sizeof (uint8);
-		break;
-	default:
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "No support for converting user data format to LogLuv");
-		return (0);
-	}
-        if( isTiled(tif) )
-            sp->tbuflen = multiply_ms(td->td_tilewidth, td->td_tilelength);
-        else if( td->td_rowsperstrip < td->td_imagelength )
-            sp->tbuflen = multiply_ms(td->td_imagewidth, td->td_rowsperstrip);
-        else
-            sp->tbuflen = multiply_ms(td->td_imagewidth, td->td_imagelength);
-	if (multiply_ms(sp->tbuflen, sizeof (uint32)) == 0 ||
-	    (sp->tbuf = (uint8*) _TIFFmalloc(sp->tbuflen * sizeof (uint32))) == NULL) {
-		TIFFErrorExt(tif->tif_clientdata, module, "No space for SGILog translation buffer");
-		return (0);
-	}
-	return (1);
+    static const char module[] = "LogLuvInitState";
+    TIFFDirectory *td = &tif->tif_dir;
+    LogLuvState *sp = DecoderState(tif);
+
+    assert(sp != NULL);
+    assert(td->td_photometric == PHOTOMETRIC_LOGLUV);
+
+    /* for some reason, we can't do this in TIFFInitLogLuv */
+    if (td->td_planarconfig != PLANARCONFIG_CONTIG)
+    {
+        TIFFErrorExtR(tif, module,
+                      "SGILog compression cannot handle non-contiguous data");
+        return (0);
+    }
+    if (sp->user_datafmt == SGILOGDATAFMT_UNKNOWN)
+        sp->user_datafmt = LogLuvGuessDataFmt(td);
+    switch (sp->user_datafmt)
+    {
+        case SGILOGDATAFMT_FLOAT:
+            sp->pixel_size = 3 * sizeof(float);
+            break;
+        case SGILOGDATAFMT_16BIT:
+            sp->pixel_size = 3 * sizeof(int16_t);
+            break;
+        case SGILOGDATAFMT_RAW:
+            sp->pixel_size = sizeof(uint32_t);
+            break;
+        case SGILOGDATAFMT_8BIT:
+            sp->pixel_size = 3 * sizeof(uint8_t);
+            break;
+        default:
+            TIFFErrorExtR(
+                tif, module,
+                "No support for converting user data format to LogLuv");
+            return (0);
+    }
+    if (isTiled(tif))
+        sp->tbuflen = multiply_ms(td->td_tilewidth, td->td_tilelength);
+    else if (td->td_rowsperstrip < td->td_imagelength)
+        sp->tbuflen = multiply_ms(td->td_imagewidth, td->td_rowsperstrip);
+    else
+        sp->tbuflen = multiply_ms(td->td_imagewidth, td->td_imagelength);
+    if (multiply_ms(sp->tbuflen, sizeof(uint32_t)) == 0 ||
+        (sp->tbuf = (uint8_t *)_TIFFmallocExt(
+             tif, sp->tbuflen * sizeof(uint32_t))) == NULL)
+    {
+        TIFFErrorExtR(tif, module, "No space for SGILog translation buffer");
+        return (0);
+    }
+    return (1);
 }
 
-static int
-LogLuvFixupTags(TIFF* tif)
+static int LogLuvFixupTags(TIFF *tif)
 {
-	(void) tif;
-	return (1);
+    (void)tif;
+    return (1);
 }
 
-static int
-LogLuvSetupDecode(TIFF* tif)
+static int LogLuvSetupDecode(TIFF *tif)
 {
-	static const char module[] = "LogLuvSetupDecode";
-	LogLuvState* sp = DecoderState(tif);
-	TIFFDirectory* td = &tif->tif_dir;
-
-	tif->tif_postdecode = _TIFFNoPostDecode;
-	switch (td->td_photometric) {
-	case PHOTOMETRIC_LOGLUV:
-		if (!LogLuvInitState(tif))
-			break;
-		if (td->td_compression == COMPRESSION_SGILOG24) {
-			tif->tif_decoderow = LogLuvDecode24;
-			switch (sp->user_datafmt) {
-			case SGILOGDATAFMT_FLOAT:
-				sp->tfunc = Luv24toXYZ;  
-				break;
-			case SGILOGDATAFMT_16BIT:
-				sp->tfunc = Luv24toLuv48;  
-				break;
-			case SGILOGDATAFMT_8BIT:
-				sp->tfunc = Luv24toRGB;
-				break;
-			}
-		} else {
-			tif->tif_decoderow = LogLuvDecode32;
-			switch (sp->user_datafmt) {
-			case SGILOGDATAFMT_FLOAT:
-				sp->tfunc = Luv32toXYZ;
-				break;
-			case SGILOGDATAFMT_16BIT:
-				sp->tfunc = Luv32toLuv48;
-				break;
-			case SGILOGDATAFMT_8BIT:
-				sp->tfunc = Luv32toRGB;
-				break;
-			}
-		}
-		return (1);
-	case PHOTOMETRIC_LOGL:
-		if (!LogL16InitState(tif))
-			break;
-		tif->tif_decoderow = LogL16Decode;
-		switch (sp->user_datafmt) {
-		case SGILOGDATAFMT_FLOAT:
-			sp->tfunc = L16toY;
-			break;
-		case SGILOGDATAFMT_8BIT:
-			sp->tfunc = L16toGry;
-			break;
-		}
-		return (1);
-	default:
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Inappropriate photometric interpretation %d for SGILog compression; %s",
-		    td->td_photometric, "must be either LogLUV or LogL");
-		break;
-	}
-	return (0);
+    static const char module[] = "LogLuvSetupDecode";
+    LogLuvState *sp = DecoderState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
+
+    tif->tif_postdecode = _TIFFNoPostDecode;
+    switch (td->td_photometric)
+    {
+        case PHOTOMETRIC_LOGLUV:
+            if (!LogLuvInitState(tif))
+                break;
+            if (td->td_compression == COMPRESSION_SGILOG24)
+            {
+                tif->tif_decoderow = LogLuvDecode24;
+                switch (sp->user_datafmt)
+                {
+                    case SGILOGDATAFMT_FLOAT:
+                        sp->tfunc = Luv24toXYZ;
+                        break;
+                    case SGILOGDATAFMT_16BIT:
+                        sp->tfunc = Luv24toLuv48;
+                        break;
+                    case SGILOGDATAFMT_8BIT:
+                        sp->tfunc = Luv24toRGB;
+                        break;
+                }
+            }
+            else
+            {
+                tif->tif_decoderow = LogLuvDecode32;
+                switch (sp->user_datafmt)
+                {
+                    case SGILOGDATAFMT_FLOAT:
+                        sp->tfunc = Luv32toXYZ;
+                        break;
+                    case SGILOGDATAFMT_16BIT:
+                        sp->tfunc = Luv32toLuv48;
+                        break;
+                    case SGILOGDATAFMT_8BIT:
+                        sp->tfunc = Luv32toRGB;
+                        break;
+                }
+            }
+            return (1);
+        case PHOTOMETRIC_LOGL:
+            if (!LogL16InitState(tif))
+                break;
+            tif->tif_decoderow = LogL16Decode;
+            switch (sp->user_datafmt)
+            {
+                case SGILOGDATAFMT_FLOAT:
+                    sp->tfunc = L16toY;
+                    break;
+                case SGILOGDATAFMT_8BIT:
+                    sp->tfunc = L16toGry;
+                    break;
+            }
+            return (1);
+        default:
+            TIFFErrorExtR(tif, module,
+                          "Inappropriate photometric interpretation %" PRIu16
+                          " for SGILog compression; %s",
+                          td->td_photometric, "must be either LogLUV or LogL");
+            break;
+    }
+    return (0);
 }
 
-static int
-LogLuvSetupEncode(TIFF* tif)
+static int LogLuvSetupEncode(TIFF *tif)
 {
-	static const char module[] = "LogLuvSetupEncode";
-	LogLuvState* sp = EncoderState(tif);
-	TIFFDirectory* td = &tif->tif_dir;
-
-	switch (td->td_photometric) {
-	case PHOTOMETRIC_LOGLUV:
-		if (!LogLuvInitState(tif))
-			return (0);
-		if (td->td_compression == COMPRESSION_SGILOG24) {
-			tif->tif_encoderow = LogLuvEncode24;
-			switch (sp->user_datafmt) {
-			case SGILOGDATAFMT_FLOAT:
-				sp->tfunc = Luv24fromXYZ;
-				break;
-			case SGILOGDATAFMT_16BIT:
-				sp->tfunc = Luv24fromLuv48;  
-				break;
-			case SGILOGDATAFMT_RAW:
-				break;
-			default:
-				goto notsupported;
-			}
-		} else {
-			tif->tif_encoderow = LogLuvEncode32;  
-			switch (sp->user_datafmt) {
-			case SGILOGDATAFMT_FLOAT:
-				sp->tfunc = Luv32fromXYZ;  
-				break;
-			case SGILOGDATAFMT_16BIT:
-				sp->tfunc = Luv32fromLuv48;  
-				break;
-			case SGILOGDATAFMT_RAW:
-				break;
-			default:
-				goto notsupported;
-			}
-		}
-		break;
-	case PHOTOMETRIC_LOGL:
-		if (!LogL16InitState(tif))
-			return (0);
-		tif->tif_encoderow = LogL16Encode;  
-		switch (sp->user_datafmt) {
-		case SGILOGDATAFMT_FLOAT:
-			sp->tfunc = L16fromY;
-			break;
-		case SGILOGDATAFMT_16BIT:
-			break;
-		default:
-			goto notsupported;
-		}
-		break;
-	default:
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Inappropriate photometric interpretation %d for SGILog compression; %s",
-		    td->td_photometric, "must be either LogLUV or LogL");
-		return (0);
-	}
-	sp->encoder_state = 1;
-	return (1);
+    static const char module[] = "LogLuvSetupEncode";
+    LogLuvState *sp = EncoderState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
+
+    switch (td->td_photometric)
+    {
+        case PHOTOMETRIC_LOGLUV:
+            if (!LogLuvInitState(tif))
+                return (0);
+            if (td->td_compression == COMPRESSION_SGILOG24)
+            {
+                tif->tif_encoderow = LogLuvEncode24;
+                switch (sp->user_datafmt)
+                {
+                    case SGILOGDATAFMT_FLOAT:
+                        sp->tfunc = Luv24fromXYZ;
+                        break;
+                    case SGILOGDATAFMT_16BIT:
+                        sp->tfunc = Luv24fromLuv48;
+                        break;
+                    case SGILOGDATAFMT_RAW:
+                        break;
+                    default:
+                        goto notsupported;
+                }
+            }
+            else
+            {
+                tif->tif_encoderow = LogLuvEncode32;
+                switch (sp->user_datafmt)
+                {
+                    case SGILOGDATAFMT_FLOAT:
+                        sp->tfunc = Luv32fromXYZ;
+                        break;
+                    case SGILOGDATAFMT_16BIT:
+                        sp->tfunc = Luv32fromLuv48;
+                        break;
+                    case SGILOGDATAFMT_RAW:
+                        break;
+                    default:
+                        goto notsupported;
+                }
+            }
+            break;
+        case PHOTOMETRIC_LOGL:
+            if (!LogL16InitState(tif))
+                return (0);
+            tif->tif_encoderow = LogL16Encode;
+            switch (sp->user_datafmt)
+            {
+                case SGILOGDATAFMT_FLOAT:
+                    sp->tfunc = L16fromY;
+                    break;
+                case SGILOGDATAFMT_16BIT:
+                    break;
+                default:
+                    goto notsupported;
+            }
+            break;
+        default:
+            TIFFErrorExtR(tif, module,
+                          "Inappropriate photometric interpretation %" PRIu16
+                          " for SGILog compression; %s",
+                          td->td_photometric, "must be either LogLUV or LogL");
+            return (0);
+    }
+    sp->encoder_state = 1;
+    return (1);
 notsupported:
-	TIFFErrorExt(tif->tif_clientdata, module,
-	    "SGILog compression supported only for %s, or raw data",
-	    td->td_photometric == PHOTOMETRIC_LOGL ? "Y, L" : "XYZ, Luv");
-	return (0);
+    TIFFErrorExtR(tif, module,
+                  "SGILog compression supported only for %s, or raw data",
+                  td->td_photometric == PHOTOMETRIC_LOGL ? "Y, L" : "XYZ, Luv");
+    return (0);
 }
 
-static void
-LogLuvClose(TIFF* tif)
+static void LogLuvClose(TIFF *tif)
 {
-        LogLuvState* sp = (LogLuvState*) tif->tif_data;
-	TIFFDirectory *td = &tif->tif_dir;
-
-	assert(sp != 0);
-	/*
-	 * For consistency, we always want to write out the same
-	 * bitspersample and sampleformat for our TIFF file,
-	 * regardless of the data format being used by the application.
-	 * Since this routine is called after tags have been set but
-	 * before they have been recorded in the file, we reset them here.
-         * Note: this is really a nasty approach. See PixarLogClose
-	 */
-        if( sp->encoder_state )
-        {
-            /* See PixarLogClose. Might avoid issues with tags whose size depends
-             * on those below, but not completely sure this is enough. */
-            td->td_samplesperpixel =
-                (td->td_photometric == PHOTOMETRIC_LOGL) ? 1 : 3;
-            td->td_bitspersample = 16;
-            td->td_sampleformat = SAMPLEFORMAT_INT;
-        }
+    LogLuvState *sp = (LogLuvState *)tif->tif_data;
+    TIFFDirectory *td = &tif->tif_dir;
+
+    assert(sp != 0);
+    /*
+     * For consistency, we always want to write out the same
+     * bitspersample and sampleformat for our TIFF file,
+     * regardless of the data format being used by the application.
+     * Since this routine is called after tags have been set but
+     * before they have been recorded in the file, we reset them here.
+     * Note: this is really a nasty approach. See PixarLogClose
+     */
+    if (sp->encoder_state)
+    {
+        /* See PixarLogClose. Might avoid issues with tags whose size depends
+         * on those below, but not completely sure this is enough. */
+        td->td_samplesperpixel =
+            (td->td_photometric == PHOTOMETRIC_LOGL) ? 1 : 3;
+        td->td_bitspersample = 16;
+        td->td_sampleformat = SAMPLEFORMAT_INT;
+    }
 }
 
-static void
-LogLuvCleanup(TIFF* tif)
+static void LogLuvCleanup(TIFF *tif)
 {
-	LogLuvState* sp = (LogLuvState *)tif->tif_data;
+    LogLuvState *sp = (LogLuvState *)tif->tif_data;
 
-	assert(sp != 0);
+    assert(sp != 0);
 
-	tif->tif_tagmethods.vgetfield = sp->vgetparent;
-	tif->tif_tagmethods.vsetfield = sp->vsetparent;
+    tif->tif_tagmethods.vgetfield = sp->vgetparent;
+    tif->tif_tagmethods.vsetfield = sp->vsetparent;
 
-	if (sp->tbuf)
-		_TIFFfree(sp->tbuf);
-	_TIFFfree(sp);
-	tif->tif_data = NULL;
+    if (sp->tbuf)
+        _TIFFfreeExt(tif, sp->tbuf);
+    _TIFFfreeExt(tif, sp);
+    tif->tif_data = NULL;
 
-	_TIFFSetDefaultCompressionState(tif);
+    _TIFFSetDefaultCompressionState(tif);
 }
 
-static int
-LogLuvVSetField(TIFF* tif, uint32 tag, va_list ap)
+static int LogLuvVSetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	static const char module[] = "LogLuvVSetField";
-	LogLuvState* sp = DecoderState(tif);
-	int bps, fmt;
-
-	switch (tag) {
-	case TIFFTAG_SGILOGDATAFMT:
-		sp->user_datafmt = (int) va_arg(ap, int);
-		/*
-		 * Tweak the TIFF header so that the rest of libtiff knows what
-		 * size of data will be passed between app and library, and
-		 * assume that the app knows what it is doing and is not
-		 * confused by these header manipulations...
-		 */
-		switch (sp->user_datafmt) {
-		case SGILOGDATAFMT_FLOAT:
-			bps = 32;
-			fmt = SAMPLEFORMAT_IEEEFP;
-			break;
-		case SGILOGDATAFMT_16BIT:
-			bps = 16;
-			fmt = SAMPLEFORMAT_INT;
-			break;
-		case SGILOGDATAFMT_RAW:
-			bps = 32;
-			fmt = SAMPLEFORMAT_UINT;
-			TIFFSetField(tif, TIFFTAG_SAMPLESPERPIXEL, 1);
-			break;
-		case SGILOGDATAFMT_8BIT:
-			bps = 8;
-			fmt = SAMPLEFORMAT_UINT;
-			break;
-		default:
-			TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-			    "Unknown data format %d for LogLuv compression",
-			    sp->user_datafmt);
-			return (0);
-		}
-		TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, bps);
-		TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, fmt);
-		/*
-		 * Must recalculate sizes should bits/sample change.
-		 */
-		tif->tif_tilesize = isTiled(tif) ? TIFFTileSize(tif) : (tmsize_t) -1;
-		tif->tif_scanlinesize = TIFFScanlineSize(tif);
-		return (1);
-	case TIFFTAG_SGILOGENCODE:
-		sp->encode_meth = (int) va_arg(ap, int);
-		if (sp->encode_meth != SGILOGENCODE_NODITHER &&
-		    sp->encode_meth != SGILOGENCODE_RANDITHER) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Unknown encoding %d for LogLuv compression",
-			    sp->encode_meth);
-			return (0);
-		}
-		return (1);
-	default:
-		return (*sp->vsetparent)(tif, tag, ap);
-	}
+    static const char module[] = "LogLuvVSetField";
+    LogLuvState *sp = DecoderState(tif);
+    int bps, fmt;
+
+    switch (tag)
+    {
+        case TIFFTAG_SGILOGDATAFMT:
+            sp->user_datafmt = (int)va_arg(ap, int);
+            /*
+             * Tweak the TIFF header so that the rest of libtiff knows what
+             * size of data will be passed between app and library, and
+             * assume that the app knows what it is doing and is not
+             * confused by these header manipulations...
+             */
+            switch (sp->user_datafmt)
+            {
+                case SGILOGDATAFMT_FLOAT:
+                    bps = 32;
+                    fmt = SAMPLEFORMAT_IEEEFP;
+                    break;
+                case SGILOGDATAFMT_16BIT:
+                    bps = 16;
+                    fmt = SAMPLEFORMAT_INT;
+                    break;
+                case SGILOGDATAFMT_RAW:
+                    bps = 32;
+                    fmt = SAMPLEFORMAT_UINT;
+                    TIFFSetField(tif, TIFFTAG_SAMPLESPERPIXEL, 1);
+                    break;
+                case SGILOGDATAFMT_8BIT:
+                    bps = 8;
+                    fmt = SAMPLEFORMAT_UINT;
+                    break;
+                default:
+                    TIFFErrorExtR(
+                        tif, tif->tif_name,
+                        "Unknown data format %d for LogLuv compression",
+                        sp->user_datafmt);
+                    return (0);
+            }
+            TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, bps);
+            TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, fmt);
+            /*
+             * Must recalculate sizes should bits/sample change.
+             */
+            tif->tif_tilesize = isTiled(tif) ? TIFFTileSize(tif) : (tmsize_t)-1;
+            tif->tif_scanlinesize = TIFFScanlineSize(tif);
+            return (1);
+        case TIFFTAG_SGILOGENCODE:
+            sp->encode_meth = (int)va_arg(ap, int);
+            if (sp->encode_meth != SGILOGENCODE_NODITHER &&
+                sp->encode_meth != SGILOGENCODE_RANDITHER)
+            {
+                TIFFErrorExtR(tif, module,
+                              "Unknown encoding %d for LogLuv compression",
+                              sp->encode_meth);
+                return (0);
+            }
+            return (1);
+        default:
+            return (*sp->vsetparent)(tif, tag, ap);
+    }
 }
 
-static int
-LogLuvVGetField(TIFF* tif, uint32 tag, va_list ap)
+static int LogLuvVGetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	LogLuvState *sp = (LogLuvState *)tif->tif_data;
-
-	switch (tag) {
-	case TIFFTAG_SGILOGDATAFMT:
-		*va_arg(ap, int*) = sp->user_datafmt;
-		return (1);
-	default:
-		return (*sp->vgetparent)(tif, tag, ap);
-	}
+    LogLuvState *sp = (LogLuvState *)tif->tif_data;
+
+    switch (tag)
+    {
+        case TIFFTAG_SGILOGDATAFMT:
+            *va_arg(ap, int *) = sp->user_datafmt;
+            return (1);
+        default:
+            return (*sp->vgetparent)(tif, tag, ap);
+    }
 }
 
 static const TIFFField LogLuvFields[] = {
-    { TIFFTAG_SGILOGDATAFMT, 0, 0, TIFF_SHORT, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "SGILogDataFmt", NULL},
-    { TIFFTAG_SGILOGENCODE, 0, 0, TIFF_SHORT, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "SGILogEncode", NULL}
-};
+    {TIFFTAG_SGILOGDATAFMT, 0, 0, TIFF_SHORT, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "SGILogDataFmt", NULL},
+    {TIFFTAG_SGILOGENCODE, 0, 0, TIFF_SHORT, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "SGILogEncode", NULL}};
 
-int
-TIFFInitSGILog(TIFF* tif, int scheme)
+int TIFFInitSGILog(TIFF *tif, int scheme)
 {
-	static const char module[] = "TIFFInitSGILog";
-	LogLuvState* sp;
-
-	assert(scheme == COMPRESSION_SGILOG24 || scheme == COMPRESSION_SGILOG);
-
-	/*
-	 * Merge codec-specific tag information.
-	 */
-	if (!_TIFFMergeFields(tif, LogLuvFields,
-			      TIFFArrayCount(LogLuvFields))) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Merging SGILog codec-specific tags failed");
-		return 0;
-	}
-
-	/*
-	 * Allocate state block so tag methods have storage to record values.
-	 */
-	tif->tif_data = (uint8*) _TIFFmalloc(sizeof (LogLuvState));
-	if (tif->tif_data == NULL)
-		goto bad;
-	sp = (LogLuvState*) tif->tif_data;
-	_TIFFmemset((void*)sp, 0, sizeof (*sp));
-	sp->user_datafmt = SGILOGDATAFMT_UNKNOWN;
-	sp->encode_meth = (scheme == COMPRESSION_SGILOG24) ?
-	    SGILOGENCODE_RANDITHER : SGILOGENCODE_NODITHER;
-	sp->tfunc = _logLuvNop;
-
-	/*
-	 * Install codec methods.
-	 * NB: tif_decoderow & tif_encoderow are filled
-	 *     in at setup time.
-	 */
-	tif->tif_fixuptags = LogLuvFixupTags;  
-	tif->tif_setupdecode = LogLuvSetupDecode;
-	tif->tif_decodestrip = LogLuvDecodeStrip;
-	tif->tif_decodetile = LogLuvDecodeTile;
-	tif->tif_setupencode = LogLuvSetupEncode;
-	tif->tif_encodestrip = LogLuvEncodeStrip;  
-	tif->tif_encodetile = LogLuvEncodeTile;
-	tif->tif_close = LogLuvClose;
-	tif->tif_cleanup = LogLuvCleanup;
-
-	/*
-	 * Override parent get/set field methods.
-	 */
-	sp->vgetparent = tif->tif_tagmethods.vgetfield;
-	tif->tif_tagmethods.vgetfield = LogLuvVGetField;   /* hook for codec tags */
-	sp->vsetparent = tif->tif_tagmethods.vsetfield;
-	tif->tif_tagmethods.vsetfield = LogLuvVSetField;   /* hook for codec tags */
-
-	return (1);
+    static const char module[] = "TIFFInitSGILog";
+    LogLuvState *sp;
+
+    assert(scheme == COMPRESSION_SGILOG24 || scheme == COMPRESSION_SGILOG);
+
+    /*
+     * Merge codec-specific tag information.
+     */
+    if (!_TIFFMergeFields(tif, LogLuvFields, TIFFArrayCount(LogLuvFields)))
+    {
+        TIFFErrorExtR(tif, module, "Merging SGILog codec-specific tags failed");
+        return 0;
+    }
+
+    /*
+     * Allocate state block so tag methods have storage to record values.
+     */
+    tif->tif_data = (uint8_t *)_TIFFmallocExt(tif, sizeof(LogLuvState));
+    if (tif->tif_data == NULL)
+        goto bad;
+    sp = (LogLuvState *)tif->tif_data;
+    _TIFFmemset((void *)sp, 0, sizeof(*sp));
+    sp->user_datafmt = SGILOGDATAFMT_UNKNOWN;
+    sp->encode_meth = (scheme == COMPRESSION_SGILOG24) ? SGILOGENCODE_RANDITHER
+                                                       : SGILOGENCODE_NODITHER;
+    sp->tfunc = _logLuvNop;
+
+    /*
+     * Install codec methods.
+     * NB: tif_decoderow & tif_encoderow are filled
+     *     in at setup time.
+     */
+    tif->tif_fixuptags = LogLuvFixupTags;
+    tif->tif_setupdecode = LogLuvSetupDecode;
+    tif->tif_decodestrip = LogLuvDecodeStrip;
+    tif->tif_decodetile = LogLuvDecodeTile;
+    tif->tif_setupencode = LogLuvSetupEncode;
+    tif->tif_encodestrip = LogLuvEncodeStrip;
+    tif->tif_encodetile = LogLuvEncodeTile;
+    tif->tif_close = LogLuvClose;
+    tif->tif_cleanup = LogLuvCleanup;
+
+    /*
+     * Override parent get/set field methods.
+     */
+    sp->vgetparent = tif->tif_tagmethods.vgetfield;
+    tif->tif_tagmethods.vgetfield = LogLuvVGetField; /* hook for codec tags */
+    sp->vsetparent = tif->tif_tagmethods.vsetfield;
+    tif->tif_tagmethods.vsetfield = LogLuvVSetField; /* hook for codec tags */
+
+    return (1);
 bad:
-	TIFFErrorExt(tif->tif_clientdata, module,
-		     "%s: No space for LogLuv state block", tif->tif_name);
-	return (0);
+    TIFFErrorExtR(tif, module, "%s: No space for LogLuv state block",
+                  tif->tif_name);
+    return (0);
 }
 #endif /* LOGLUV_SUPPORT */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_lzma.c b/3rdparty/libtiff/tif_lzma.c
index e150bd635d84..4cfd5e882120 100644
--- a/3rdparty/libtiff/tif_lzma.c
+++ b/3rdparty/libtiff/tif_lzma.c
@@ -33,471 +33,488 @@
  * The codec is derived from ZLIB codec (tif_zip.c).
  */
 
-#include "tif_predict.h"
 #include "lzma.h"
+#include "tif_predict.h"
 
 #include <stdio.h>
 
 /*
  * State block for each open TIFF file using LZMA2 compression/decompression.
  */
-typedef struct {
-	TIFFPredictorState predict;
-        lzma_stream	stream;
-	lzma_filter	filters[LZMA_FILTERS_MAX + 1];
-	lzma_options_delta opt_delta;		/* delta filter options */
-	lzma_options_lzma opt_lzma;		/* LZMA2 filter options */
-	int             preset;			/* compression level */
-	lzma_check	check;			/* type of the integrity check */
-	int             state;			/* state flags */
+typedef struct
+{
+    TIFFPredictorState predict;
+    lzma_stream stream;
+    lzma_filter filters[LZMA_FILTERS_MAX + 1];
+    lzma_options_delta opt_delta; /* delta filter options */
+    lzma_options_lzma opt_lzma;   /* LZMA2 filter options */
+    int preset;                   /* compression level */
+    lzma_check check;             /* type of the integrity check */
+    int state;                    /* state flags */
 #define LSTATE_INIT_DECODE 0x01
 #define LSTATE_INIT_ENCODE 0x02
 
-	TIFFVGetMethod  vgetparent;            /* super-class method */
-	TIFFVSetMethod  vsetparent;            /* super-class method */
+    TIFFVGetMethod vgetparent; /* super-class method */
+    TIFFVSetMethod vsetparent; /* super-class method */
 } LZMAState;
 
-#define LState(tif)             ((LZMAState*) (tif)->tif_data)
-#define DecoderState(tif)       LState(tif)
-#define EncoderState(tif)       LState(tif)
+#define LState(tif) ((LZMAState *)(tif)->tif_data)
+#define DecoderState(tif) LState(tif)
+#define EncoderState(tif) LState(tif)
 
-static int LZMAEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s);
-static int LZMADecode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s);
+static int LZMAEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s);
+static int LZMADecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s);
 
-static const char *
-LZMAStrerror(lzma_ret ret)
+static const char *LZMAStrerror(lzma_ret ret)
 {
-	switch (ret) {
-		case LZMA_OK:
-		    return "operation completed successfully";
-		case LZMA_STREAM_END:
-		    return "end of stream was reached";
-		case LZMA_NO_CHECK:
-		    return "input stream has no integrity check";
-		case LZMA_UNSUPPORTED_CHECK:
-		    return "cannot calculate the integrity check";
-		case LZMA_GET_CHECK:
-		    return "integrity check type is now available";
-		case LZMA_MEM_ERROR:
-		    return "cannot allocate memory";
-		case LZMA_MEMLIMIT_ERROR:
-		    return "memory usage limit was reached";
-		case LZMA_FORMAT_ERROR:
-		    return "file format not recognized";
-		case LZMA_OPTIONS_ERROR:
-		    return "invalid or unsupported options";
-		case LZMA_DATA_ERROR:
-		    return "data is corrupt";
-		case LZMA_BUF_ERROR:
-		    return "no progress is possible (stream is truncated or corrupt)";
-		case LZMA_PROG_ERROR:
-		    return "programming error";
-		default:
-		    return "unidentified liblzma error";
-	}
+    switch (ret)
+    {
+        case LZMA_OK:
+            return "operation completed successfully";
+        case LZMA_STREAM_END:
+            return "end of stream was reached";
+        case LZMA_NO_CHECK:
+            return "input stream has no integrity check";
+        case LZMA_UNSUPPORTED_CHECK:
+            return "cannot calculate the integrity check";
+        case LZMA_GET_CHECK:
+            return "integrity check type is now available";
+        case LZMA_MEM_ERROR:
+            return "cannot allocate memory";
+        case LZMA_MEMLIMIT_ERROR:
+            return "memory usage limit was reached";
+        case LZMA_FORMAT_ERROR:
+            return "file format not recognized";
+        case LZMA_OPTIONS_ERROR:
+            return "invalid or unsupported options";
+        case LZMA_DATA_ERROR:
+            return "data is corrupt";
+        case LZMA_BUF_ERROR:
+            return "no progress is possible (stream is truncated or corrupt)";
+        case LZMA_PROG_ERROR:
+            return "programming error";
+        default:
+            return "unidentified liblzma error";
+    }
 }
 
-static int
-LZMAFixupTags(TIFF* tif)
+static int LZMAFixupTags(TIFF *tif)
 {
-	(void) tif;
-	return 1;
+    (void)tif;
+    return 1;
 }
 
-static int
-LZMASetupDecode(TIFF* tif)
+static int LZMASetupDecode(TIFF *tif)
 {
-	LZMAState* sp = DecoderState(tif);
-
-	assert(sp != NULL);
-        
-        /* if we were last encoding, terminate this mode */
-	if (sp->state & LSTATE_INIT_ENCODE) {
-	    lzma_end(&sp->stream);
-	    sp->state = 0;
-	}
-
-	sp->state |= LSTATE_INIT_DECODE;
-	return 1;
+    LZMAState *sp = DecoderState(tif);
+
+    assert(sp != NULL);
+
+    /* if we were last encoding, terminate this mode */
+    if (sp->state & LSTATE_INIT_ENCODE)
+    {
+        lzma_end(&sp->stream);
+        sp->state = 0;
+    }
+
+    sp->state |= LSTATE_INIT_DECODE;
+    return 1;
 }
 
 /*
  * Setup state for decoding a strip.
  */
-static int
-LZMAPreDecode(TIFF* tif, uint16 s)
+static int LZMAPreDecode(TIFF *tif, uint16_t s)
 {
-	static const char module[] = "LZMAPreDecode";
-	LZMAState* sp = DecoderState(tif);
-	lzma_ret ret;
-
-	(void) s;
-	assert(sp != NULL);
-
-	if( (sp->state & LSTATE_INIT_DECODE) == 0 )
-            tif->tif_setupdecode(tif);
-
-	sp->stream.next_in = tif->tif_rawdata;
-	sp->stream.avail_in = (size_t) tif->tif_rawcc;
-	if ((tmsize_t)sp->stream.avail_in != tif->tif_rawcc) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "Liblzma cannot deal with buffers this size");
-		return 0;
-	}
-
-	/*
-	 * Disable memory limit when decoding. UINT64_MAX is a flag to disable
-	 * the limit, we are passing (uint64_t)-1 which should be the same.
-	 */
-	ret = lzma_stream_decoder(&sp->stream, (uint64_t)-1, 0);
-	if (ret != LZMA_OK) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "Error initializing the stream decoder, %s",
-			     LZMAStrerror(ret));
-		return 0;
-	}
-	return 1;
+    static const char module[] = "LZMAPreDecode";
+    LZMAState *sp = DecoderState(tif);
+    lzma_ret ret;
+
+    (void)s;
+    assert(sp != NULL);
+
+    if ((sp->state & LSTATE_INIT_DECODE) == 0)
+        tif->tif_setupdecode(tif);
+
+    sp->stream.next_in = tif->tif_rawdata;
+    sp->stream.avail_in = (size_t)tif->tif_rawcc;
+    if ((tmsize_t)sp->stream.avail_in != tif->tif_rawcc)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Liblzma cannot deal with buffers this size");
+        return 0;
+    }
+
+    /*
+     * Disable memory limit when decoding. UINT64_MAX is a flag to disable
+     * the limit, we are passing (uint64_t)-1 which should be the same.
+     */
+    ret = lzma_stream_decoder(&sp->stream, (uint64_t)-1, 0);
+    if (ret != LZMA_OK)
+    {
+        TIFFErrorExtR(tif, module, "Error initializing the stream decoder, %s",
+                      LZMAStrerror(ret));
+        return 0;
+    }
+    return 1;
 }
 
-static int
-LZMADecode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s)
+static int LZMADecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s)
 {
-	static const char module[] = "LZMADecode";
-	LZMAState* sp = DecoderState(tif);
-
-	(void) s;
-	assert(sp != NULL);
-	assert(sp->state == LSTATE_INIT_DECODE);
-
-        sp->stream.next_in = tif->tif_rawcp;
-        sp->stream.avail_in = (size_t) tif->tif_rawcc;
-
-	sp->stream.next_out = op;
-	sp->stream.avail_out = (size_t) occ;
-	if ((tmsize_t)sp->stream.avail_out != occ) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "Liblzma cannot deal with buffers this size");
-		return 0;
-	}
-
-	do {
-		/*
-		 * Save the current stream state to properly recover from the
-		 * decoding errors later.
-		 */
-		const uint8_t *next_in = sp->stream.next_in;
-		size_t avail_in = sp->stream.avail_in;
-
-		lzma_ret ret = lzma_code(&sp->stream, LZMA_RUN);
-		if (ret == LZMA_STREAM_END)
-			break;
-		if (ret == LZMA_MEMLIMIT_ERROR) {
-			lzma_ret r = lzma_stream_decoder(&sp->stream,
-							 lzma_memusage(&sp->stream), 0);
-			if (r != LZMA_OK) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "Error initializing the stream decoder, %s",
-					     LZMAStrerror(r));
-				break;
-			}
-			sp->stream.next_in = next_in;
-			sp->stream.avail_in = avail_in;
-			continue;
-		}
-		if (ret != LZMA_OK) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Decoding error at scanline %lu, %s",
-			    (unsigned long) tif->tif_row, LZMAStrerror(ret));
-			break;
-		}
-	} while (sp->stream.avail_out > 0);
-	if (sp->stream.avail_out != 0) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Not enough data at scanline %lu (short %lu bytes)",
-		    (unsigned long) tif->tif_row, (unsigned long) sp->stream.avail_out);
-		return 0;
-	}
-
-        tif->tif_rawcp = (uint8 *)sp->stream.next_in; /* cast away const */
-        tif->tif_rawcc = sp->stream.avail_in;
-        
-	return 1;
+    static const char module[] = "LZMADecode";
+    LZMAState *sp = DecoderState(tif);
+
+    (void)s;
+    assert(sp != NULL);
+    assert(sp->state == LSTATE_INIT_DECODE);
+
+    sp->stream.next_in = tif->tif_rawcp;
+    sp->stream.avail_in = (size_t)tif->tif_rawcc;
+
+    sp->stream.next_out = op;
+    sp->stream.avail_out = (size_t)occ;
+    if ((tmsize_t)sp->stream.avail_out != occ)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Liblzma cannot deal with buffers this size");
+        return 0;
+    }
+
+    do
+    {
+        /*
+         * Save the current stream state to properly recover from the
+         * decoding errors later.
+         */
+        const uint8_t *next_in = sp->stream.next_in;
+        size_t avail_in = sp->stream.avail_in;
+
+        lzma_ret ret = lzma_code(&sp->stream, LZMA_RUN);
+        if (ret == LZMA_STREAM_END)
+            break;
+        if (ret == LZMA_MEMLIMIT_ERROR)
+        {
+            lzma_ret r =
+                lzma_stream_decoder(&sp->stream, lzma_memusage(&sp->stream), 0);
+            if (r != LZMA_OK)
+            {
+                TIFFErrorExtR(tif, module,
+                              "Error initializing the stream decoder, %s",
+                              LZMAStrerror(r));
+                break;
+            }
+            sp->stream.next_in = next_in;
+            sp->stream.avail_in = avail_in;
+            continue;
+        }
+        if (ret != LZMA_OK)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Decoding error at scanline %" PRIu32 ", %s",
+                          tif->tif_row, LZMAStrerror(ret));
+            break;
+        }
+    } while (sp->stream.avail_out > 0);
+    if (sp->stream.avail_out != 0)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Not enough data at scanline %" PRIu32
+                      " (short %" TIFF_SIZE_FORMAT " bytes)",
+                      tif->tif_row, sp->stream.avail_out);
+        return 0;
+    }
+
+    tif->tif_rawcp = (uint8_t *)sp->stream.next_in; /* cast away const */
+    tif->tif_rawcc = sp->stream.avail_in;
+
+    return 1;
 }
 
-static int
-LZMASetupEncode(TIFF* tif)
+static int LZMASetupEncode(TIFF *tif)
 {
-	LZMAState* sp = EncoderState(tif);
+    LZMAState *sp = EncoderState(tif);
 
-	assert(sp != NULL);
-	if (sp->state & LSTATE_INIT_DECODE) {
-		lzma_end(&sp->stream);
-		sp->state = 0;
-	}
+    assert(sp != NULL);
+    if (sp->state & LSTATE_INIT_DECODE)
+    {
+        lzma_end(&sp->stream);
+        sp->state = 0;
+    }
 
-	sp->state |= LSTATE_INIT_ENCODE;
-	return 1;
+    sp->state |= LSTATE_INIT_ENCODE;
+    return 1;
 }
 
 /*
  * Reset encoding state at the start of a strip.
  */
-static int
-LZMAPreEncode(TIFF* tif, uint16 s)
+static int LZMAPreEncode(TIFF *tif, uint16_t s)
 {
-	static const char module[] = "LZMAPreEncode";
-	LZMAState *sp = EncoderState(tif);
-	lzma_ret ret;
-
-	(void) s;
-	assert(sp != NULL);
-	if( sp->state != LSTATE_INIT_ENCODE )
-            tif->tif_setupencode(tif);
-
-	sp->stream.next_out = tif->tif_rawdata;
-	sp->stream.avail_out = (size_t)tif->tif_rawdatasize;
-	if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "Liblzma cannot deal with buffers this size");
-		return 0;
-	}
-	ret = lzma_stream_encoder(&sp->stream, sp->filters, sp->check);
-	if (ret != LZMA_OK) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			"Error in lzma_stream_encoder(): %s", LZMAStrerror(ret));
-		return 0;
-	}
-	return 1;
+    static const char module[] = "LZMAPreEncode";
+    LZMAState *sp = EncoderState(tif);
+    lzma_ret ret;
+
+    (void)s;
+    assert(sp != NULL);
+    if (sp->state != LSTATE_INIT_ENCODE)
+        tif->tif_setupencode(tif);
+
+    sp->stream.next_out = tif->tif_rawdata;
+    sp->stream.avail_out = (size_t)tif->tif_rawdatasize;
+    if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Liblzma cannot deal with buffers this size");
+        return 0;
+    }
+    ret = lzma_stream_encoder(&sp->stream, sp->filters, sp->check);
+    if (ret != LZMA_OK)
+    {
+        TIFFErrorExtR(tif, module, "Error in lzma_stream_encoder(): %s",
+                      LZMAStrerror(ret));
+        return 0;
+    }
+    return 1;
 }
 
 /*
  * Encode a chunk of pixels.
  */
-static int
-LZMAEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int LZMAEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	static const char module[] = "LZMAEncode";
-	LZMAState *sp = EncoderState(tif);
-
-	assert(sp != NULL);
-	assert(sp->state == LSTATE_INIT_ENCODE);
-
-	(void) s;
-	sp->stream.next_in = bp;
-	sp->stream.avail_in = (size_t) cc;
-	if ((tmsize_t)sp->stream.avail_in != cc) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "Liblzma cannot deal with buffers this size");
-		return 0;
-	}
-	do {
-		lzma_ret ret = lzma_code(&sp->stream, LZMA_RUN);
-		if (ret != LZMA_OK) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-				"Encoding error at scanline %lu, %s",
-				(unsigned long) tif->tif_row, LZMAStrerror(ret));
-			return 0;
-		}
-		if (sp->stream.avail_out == 0) {
-			tif->tif_rawcc = tif->tif_rawdatasize;
-			if (!TIFFFlushData1(tif))
-				return 0;
-			sp->stream.next_out = tif->tif_rawdata;
-			sp->stream.avail_out = (size_t)tif->tif_rawdatasize;  /* this is a safe typecast, as check is made already in LZMAPreEncode */
-		}
-	} while (sp->stream.avail_in > 0);
-	return 1;
+    static const char module[] = "LZMAEncode";
+    LZMAState *sp = EncoderState(tif);
+
+    assert(sp != NULL);
+    assert(sp->state == LSTATE_INIT_ENCODE);
+
+    (void)s;
+    sp->stream.next_in = bp;
+    sp->stream.avail_in = (size_t)cc;
+    if ((tmsize_t)sp->stream.avail_in != cc)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Liblzma cannot deal with buffers this size");
+        return 0;
+    }
+    do
+    {
+        lzma_ret ret = lzma_code(&sp->stream, LZMA_RUN);
+        if (ret != LZMA_OK)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Encoding error at scanline %" PRIu32 ", %s",
+                          tif->tif_row, LZMAStrerror(ret));
+            return 0;
+        }
+        if (sp->stream.avail_out == 0)
+        {
+            tif->tif_rawcc = tif->tif_rawdatasize;
+            if (!TIFFFlushData1(tif))
+                return 0;
+            sp->stream.next_out = tif->tif_rawdata;
+            sp->stream.avail_out =
+                (size_t)
+                    tif->tif_rawdatasize; /* this is a safe typecast, as check
+                                             is made already in LZMAPreEncode */
+        }
+    } while (sp->stream.avail_in > 0);
+    return 1;
 }
 
 /*
  * Finish off an encoded strip by flushing the last
  * string and tacking on an End Of Information code.
  */
-static int
-LZMAPostEncode(TIFF* tif)
+static int LZMAPostEncode(TIFF *tif)
 {
-	static const char module[] = "LZMAPostEncode";
-	LZMAState *sp = EncoderState(tif);
-	lzma_ret ret;
-
-	sp->stream.avail_in = 0;
-	do {
-		ret = lzma_code(&sp->stream, LZMA_FINISH);
-		switch (ret) {
-		case LZMA_STREAM_END:
-		case LZMA_OK:
-			if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize) {
-				tif->tif_rawcc =
-					tif->tif_rawdatasize - sp->stream.avail_out;
-				if (!TIFFFlushData1(tif))
-					return 0;
-				sp->stream.next_out = tif->tif_rawdata;
-				sp->stream.avail_out = (size_t)tif->tif_rawdatasize;  /* this is a safe typecast, as check is made already in ZIPPreEncode */
-			}
-			break;
-		default:
-			TIFFErrorExt(tif->tif_clientdata, module, "Liblzma error: %s",
-				     LZMAStrerror(ret));
-			return 0;
-		}
-	} while (ret != LZMA_STREAM_END);
-	return 1;
+    static const char module[] = "LZMAPostEncode";
+    LZMAState *sp = EncoderState(tif);
+    lzma_ret ret;
+
+    sp->stream.avail_in = 0;
+    do
+    {
+        ret = lzma_code(&sp->stream, LZMA_FINISH);
+        switch (ret)
+        {
+            case LZMA_STREAM_END:
+            case LZMA_OK:
+                if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize)
+                {
+                    tif->tif_rawcc =
+                        tif->tif_rawdatasize - sp->stream.avail_out;
+                    if (!TIFFFlushData1(tif))
+                        return 0;
+                    sp->stream.next_out = tif->tif_rawdata;
+                    sp->stream.avail_out =
+                        (size_t)
+                            tif->tif_rawdatasize; /* this is a safe typecast, as
+                                                     check is made already in
+                                                     ZIPPreEncode */
+                }
+                break;
+            default:
+                TIFFErrorExtR(tif, module, "Liblzma error: %s",
+                              LZMAStrerror(ret));
+                return 0;
+        }
+    } while (ret != LZMA_STREAM_END);
+    return 1;
 }
 
-static void
-LZMACleanup(TIFF* tif)
+static void LZMACleanup(TIFF *tif)
 {
-	LZMAState* sp = LState(tif);
+    LZMAState *sp = LState(tif);
 
-	assert(sp != 0);
+    assert(sp != 0);
 
-	(void)TIFFPredictorCleanup(tif);
+    (void)TIFFPredictorCleanup(tif);
 
-	tif->tif_tagmethods.vgetfield = sp->vgetparent;
-	tif->tif_tagmethods.vsetfield = sp->vsetparent;
+    tif->tif_tagmethods.vgetfield = sp->vgetparent;
+    tif->tif_tagmethods.vsetfield = sp->vsetparent;
 
-	if (sp->state) {
-		lzma_end(&sp->stream);
-		sp->state = 0;
-	}
-	_TIFFfree(sp);
-	tif->tif_data = NULL;
+    if (sp->state)
+    {
+        lzma_end(&sp->stream);
+        sp->state = 0;
+    }
+    _TIFFfreeExt(tif, sp);
+    tif->tif_data = NULL;
 
-	_TIFFSetDefaultCompressionState(tif);
+    _TIFFSetDefaultCompressionState(tif);
 }
 
-static int
-LZMAVSetField(TIFF* tif, uint32 tag, va_list ap)
+static int LZMAVSetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	static const char module[] = "LZMAVSetField";
-	LZMAState* sp = LState(tif);
-
-	switch (tag) {
-	case TIFFTAG_LZMAPRESET:
-		sp->preset = (int) va_arg(ap, int);
-		lzma_lzma_preset(&sp->opt_lzma, sp->preset);
-		if (sp->state & LSTATE_INIT_ENCODE) {
-			lzma_ret ret = lzma_stream_encoder(&sp->stream,
-							   sp->filters,
-							   sp->check);
-			if (ret != LZMA_OK) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-					     "Liblzma error: %s",
-					     LZMAStrerror(ret));
-			}
-		}
-		return 1;
-	default:
-		return (*sp->vsetparent)(tif, tag, ap);
-	}
-	/*NOTREACHED*/
+    static const char module[] = "LZMAVSetField";
+    LZMAState *sp = LState(tif);
+
+    switch (tag)
+    {
+        case TIFFTAG_LZMAPRESET:
+            sp->preset = (int)va_arg(ap, int);
+            lzma_lzma_preset(&sp->opt_lzma, sp->preset);
+            if (sp->state & LSTATE_INIT_ENCODE)
+            {
+                lzma_ret ret =
+                    lzma_stream_encoder(&sp->stream, sp->filters, sp->check);
+                if (ret != LZMA_OK)
+                {
+                    TIFFErrorExtR(tif, module, "Liblzma error: %s",
+                                  LZMAStrerror(ret));
+                }
+            }
+            return 1;
+        default:
+            return (*sp->vsetparent)(tif, tag, ap);
+    }
+    /*NOTREACHED*/
 }
 
-static int
-LZMAVGetField(TIFF* tif, uint32 tag, va_list ap)
+static int LZMAVGetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	LZMAState* sp = LState(tif);
-
-	switch (tag) {
-	case TIFFTAG_LZMAPRESET:
-		*va_arg(ap, int*) = sp->preset;
-		break;
-	default:
-		return (*sp->vgetparent)(tif, tag, ap);
-	}
-	return 1;
+    LZMAState *sp = LState(tif);
+
+    switch (tag)
+    {
+        case TIFFTAG_LZMAPRESET:
+            *va_arg(ap, int *) = sp->preset;
+            break;
+        default:
+            return (*sp->vgetparent)(tif, tag, ap);
+    }
+    return 1;
 }
 
 static const TIFFField lzmaFields[] = {
-	{ TIFFTAG_LZMAPRESET, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED,
-		FIELD_PSEUDO, TRUE, FALSE, "LZMA2 Compression Preset", NULL },
+    {TIFFTAG_LZMAPRESET, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE,
+     "LZMA2 Compression Preset", NULL},
 };
 
-int
-TIFFInitLZMA(TIFF* tif, int scheme)
+int TIFFInitLZMA(TIFF *tif, int scheme)
 {
-	static const char module[] = "TIFFInitLZMA";
-	LZMAState* sp;
-	lzma_stream tmp_stream = LZMA_STREAM_INIT;
-
-        (void)scheme;
-	assert( scheme == COMPRESSION_LZMA );
-
-	/*
-	 * Merge codec-specific tag information.
-	 */
-	if (!_TIFFMergeFields(tif, lzmaFields, TIFFArrayCount(lzmaFields))) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "Merging LZMA2 codec-specific tags failed");
-		return 0;
-	}
-
-	/*
-	 * Allocate state block so tag methods have storage to record values.
-	 */
-	tif->tif_data = (uint8*) _TIFFmalloc(sizeof(LZMAState));
-	if (tif->tif_data == NULL)
-		goto bad;
-	sp = LState(tif);
-	memcpy(&sp->stream, &tmp_stream, sizeof(lzma_stream));
-
-	/*
-	 * Override parent get/set field methods.
-	 */
-	sp->vgetparent = tif->tif_tagmethods.vgetfield;
-	tif->tif_tagmethods.vgetfield = LZMAVGetField;	/* hook for codec tags */
-	sp->vsetparent = tif->tif_tagmethods.vsetfield;
-	tif->tif_tagmethods.vsetfield = LZMAVSetField;	/* hook for codec tags */
-
-	/* Default values for codec-specific fields */
-	sp->preset = LZMA_PRESET_DEFAULT;		/* default comp. level */
-	sp->check = LZMA_CHECK_NONE;
-	sp->state = 0;
-
-	/* Data filters. So far we are using delta and LZMA2 filters only. */
-	sp->opt_delta.type = LZMA_DELTA_TYPE_BYTE;
-	/*
-	 * The sample size in bytes seems to be reasonable distance for delta
-	 * filter.
-	 */
-	sp->opt_delta.dist = (tif->tif_dir.td_bitspersample % 8) ?
-		1 : tif->tif_dir.td_bitspersample / 8;
-	sp->filters[0].id = LZMA_FILTER_DELTA;
-	sp->filters[0].options = &sp->opt_delta;
-
-	lzma_lzma_preset(&sp->opt_lzma, sp->preset);
-	sp->filters[1].id = LZMA_FILTER_LZMA2;
-	sp->filters[1].options = &sp->opt_lzma;
-
-	sp->filters[2].id = LZMA_VLI_UNKNOWN;
-	sp->filters[2].options = NULL;
-
-	/*
-	 * Install codec methods.
-	 */
-	tif->tif_fixuptags = LZMAFixupTags;
-	tif->tif_setupdecode = LZMASetupDecode;
-	tif->tif_predecode = LZMAPreDecode;
-	tif->tif_decoderow = LZMADecode;
-	tif->tif_decodestrip = LZMADecode;
-	tif->tif_decodetile = LZMADecode;
-	tif->tif_setupencode = LZMASetupEncode;
-	tif->tif_preencode = LZMAPreEncode;
-	tif->tif_postencode = LZMAPostEncode;
-	tif->tif_encoderow = LZMAEncode;
-	tif->tif_encodestrip = LZMAEncode;
-	tif->tif_encodetile = LZMAEncode;
-	tif->tif_cleanup = LZMACleanup;
-	/*
-	 * Setup predictor setup.
-	 */
-	(void) TIFFPredictorInit(tif);
-	return 1;
+    static const char module[] = "TIFFInitLZMA";
+    LZMAState *sp;
+    lzma_stream tmp_stream = LZMA_STREAM_INIT;
+
+    (void)scheme;
+    assert(scheme == COMPRESSION_LZMA);
+
+    /*
+     * Merge codec-specific tag information.
+     */
+    if (!_TIFFMergeFields(tif, lzmaFields, TIFFArrayCount(lzmaFields)))
+    {
+        TIFFErrorExtR(tif, module, "Merging LZMA2 codec-specific tags failed");
+        return 0;
+    }
+
+    /*
+     * Allocate state block so tag methods have storage to record values.
+     */
+    tif->tif_data = (uint8_t *)_TIFFmallocExt(tif, sizeof(LZMAState));
+    if (tif->tif_data == NULL)
+        goto bad;
+    sp = LState(tif);
+    memcpy(&sp->stream, &tmp_stream, sizeof(lzma_stream));
+
+    /*
+     * Override parent get/set field methods.
+     */
+    sp->vgetparent = tif->tif_tagmethods.vgetfield;
+    tif->tif_tagmethods.vgetfield = LZMAVGetField; /* hook for codec tags */
+    sp->vsetparent = tif->tif_tagmethods.vsetfield;
+    tif->tif_tagmethods.vsetfield = LZMAVSetField; /* hook for codec tags */
+
+    /* Default values for codec-specific fields */
+    sp->preset = LZMA_PRESET_DEFAULT; /* default comp. level */
+    sp->check = LZMA_CHECK_NONE;
+    sp->state = 0;
+
+    /* Data filters. So far we are using delta and LZMA2 filters only. */
+    sp->opt_delta.type = LZMA_DELTA_TYPE_BYTE;
+    /*
+     * The sample size in bytes seems to be reasonable distance for delta
+     * filter.
+     */
+    sp->opt_delta.dist = (tif->tif_dir.td_bitspersample % 8)
+                             ? 1
+                             : tif->tif_dir.td_bitspersample / 8;
+    sp->filters[0].id = LZMA_FILTER_DELTA;
+    sp->filters[0].options = &sp->opt_delta;
+
+    lzma_lzma_preset(&sp->opt_lzma, sp->preset);
+    sp->filters[1].id = LZMA_FILTER_LZMA2;
+    sp->filters[1].options = &sp->opt_lzma;
+
+    sp->filters[2].id = LZMA_VLI_UNKNOWN;
+    sp->filters[2].options = NULL;
+
+    /*
+     * Install codec methods.
+     */
+    tif->tif_fixuptags = LZMAFixupTags;
+    tif->tif_setupdecode = LZMASetupDecode;
+    tif->tif_predecode = LZMAPreDecode;
+    tif->tif_decoderow = LZMADecode;
+    tif->tif_decodestrip = LZMADecode;
+    tif->tif_decodetile = LZMADecode;
+    tif->tif_setupencode = LZMASetupEncode;
+    tif->tif_preencode = LZMAPreEncode;
+    tif->tif_postencode = LZMAPostEncode;
+    tif->tif_encoderow = LZMAEncode;
+    tif->tif_encodestrip = LZMAEncode;
+    tif->tif_encodetile = LZMAEncode;
+    tif->tif_cleanup = LZMACleanup;
+    /*
+     * Setup predictor setup.
+     */
+    (void)TIFFPredictorInit(tif);
+    return 1;
 bad:
-	TIFFErrorExt(tif->tif_clientdata, module,
-		     "No space for LZMA2 state block");
-	return 0;
+    TIFFErrorExtR(tif, module, "No space for LZMA2 state block");
+    return 0;
 }
 #endif /* LZMA_SUPPORT */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
diff --git a/3rdparty/libtiff/tif_lzw.c b/3rdparty/libtiff/tif_lzw.c
index d92d0fd354a9..d631fa104940 100644
--- a/3rdparty/libtiff/tif_lzw.c
+++ b/3rdparty/libtiff/tif_lzw.c
@@ -1,31 +1,32 @@
 /*
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
+ * Copyright (c) 2022 Even Rouault
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
 #include "tiffiop.h"
 #ifdef LZW_SUPPORT
 /*
- * TIFF Library.  
+ * TIFF Library.
  * Rev 5.0 Lempel-Ziv & Welch Compression Support
  *
  * This code is derived from the compress program whose code is
@@ -36,7 +37,13 @@
  */
 #include "tif_predict.h"
 
+#include <stdbool.h>
 #include <stdio.h>
+#include <stdlib.h>
+
+/* Select the plausible largest natural integer type for the architecture */
+#define SIZEOF_WORDTYPE SIZEOF_SIZE_T
+typedef size_t WordType;
 
 /*
  * NB: The 5.0 spec describes a different algorithm than Aldus
@@ -51,34 +58,27 @@
  *
  * Future revisions to the TIFF spec are expected to "clarify this issue".
  */
-#define LZW_COMPAT              /* include backwards compatibility code */
-/*
- * Each strip of data is supposed to be terminated by a CODE_EOI.
- * If the following #define is included, the decoder will also
- * check for end-of-strip w/o seeing this code.  This makes the
- * library more robust, but also slower.
- */
-#define LZW_CHECKEOS            /* include checks for strips w/o EOI code */
+#define LZW_COMPAT /* include backwards compatibility code */
 
-#define MAXCODE(n)	((1L<<(n))-1)
+#define MAXCODE(n) ((1L << (n)) - 1)
 /*
  * The TIFF spec specifies that encoded bit
  * strings range from 9 to 12 bits.
  */
-#define BITS_MIN        9               /* start with 9 bits */
-#define BITS_MAX        12              /* max of 12 bit strings */
+#define BITS_MIN 9  /* start with 9 bits */
+#define BITS_MAX 12 /* max of 12 bit strings */
 /* predefined codes */
-#define CODE_CLEAR      256             /* code to clear string table */
-#define CODE_EOI        257             /* end-of-information code */
-#define CODE_FIRST      258             /* first free code entry */
-#define CODE_MAX        MAXCODE(BITS_MAX)
-#define HSIZE           9001L           /* 91% occupancy */
-#define HSHIFT          (13-8)
+#define CODE_CLEAR 256 /* code to clear string table */
+#define CODE_EOI 257   /* end-of-information code */
+#define CODE_FIRST 258 /* first free code entry */
+#define CODE_MAX MAXCODE(BITS_MAX)
+#define HSIZE 9001L /* 91% occupancy */
+#define HSHIFT (13 - 8)
 #ifdef LZW_COMPAT
 /* NB: +1024 is for compatibility with old files */
-#define CSIZE           (MAXCODE(BITS_MAX)+1024L)
+#define CSIZE (MAXCODE(BITS_MAX) + 1024L)
 #else
-#define CSIZE           (MAXCODE(BITS_MAX)+1L)
+#define CSIZE (MAXCODE(BITS_MAX) + 1L)
 #endif
 
 /*
@@ -86,722 +86,936 @@
  * compression/decompression.  Note that the predictor
  * state block must be first in this data structure.
  */
-typedef struct {
-	TIFFPredictorState predict;     /* predictor super class */
+typedef struct
+{
+    TIFFPredictorState predict; /* predictor super class */
 
-	unsigned short  nbits;          /* # of bits/code */
-	unsigned short  maxcode;        /* maximum code for lzw_nbits */
-	unsigned short  free_ent;       /* next free entry in hash table */
-	unsigned long   nextdata;       /* next bits of i/o */
-	long            nextbits;       /* # of valid bits in lzw_nextdata */
+    unsigned short nbits;    /* # of bits/code */
+    unsigned short maxcode;  /* maximum code for lzw_nbits */
+    unsigned short free_ent; /* next free entry in hash table */
+    WordType nextdata;       /* next bits of i/o */
+    long nextbits;           /* # of valid bits in lzw_nextdata */
 
-	int             rw_mode;        /* preserve rw_mode from init */
+    int rw_mode; /* preserve rw_mode from init */
 } LZWBaseState;
 
-#define lzw_nbits       base.nbits
-#define lzw_maxcode     base.maxcode
-#define lzw_free_ent    base.free_ent
-#define lzw_nextdata    base.nextdata
-#define lzw_nextbits    base.nextbits
+#define lzw_nbits base.nbits
+#define lzw_maxcode base.maxcode
+#define lzw_free_ent base.free_ent
+#define lzw_nextdata base.nextdata
+#define lzw_nextbits base.nextbits
 
 /*
  * Encoding-specific state.
  */
-typedef uint16 hcode_t;			/* codes fit in 16 bits */
-typedef struct {
-	long	hash;
-	hcode_t	code;
+typedef uint16_t hcode_t; /* codes fit in 16 bits */
+typedef struct
+{
+    long hash;
+    hcode_t code;
 } hash_t;
 
 /*
  * Decoding-specific state.
  */
-typedef struct code_ent {
-	struct code_ent *next;
-	unsigned short	length;		/* string len, including this token */
-	unsigned char	value;		/* data value */
-	unsigned char	firstchar;	/* first token of string */
+typedef struct code_ent
+{
+    struct code_ent *next;
+    unsigned short length; /* string len, including this token */
+    /* firstchar should be placed immediately before value in this structure */
+    unsigned char firstchar; /* first token of string */
+    unsigned char value;     /* data value */
+    bool repeated;
 } code_t;
 
-typedef int (*decodeFunc)(TIFF*, uint8*, tmsize_t, uint16);
-
-typedef struct {
-	LZWBaseState base;
+typedef int (*decodeFunc)(TIFF *, uint8_t *, tmsize_t, uint16_t);
 
-	/* Decoding specific data */
-	long    dec_nbitsmask;		/* lzw_nbits 1 bits, right adjusted */
-	long    dec_restart;		/* restart count */
-#ifdef LZW_CHECKEOS
-	uint64  dec_bitsleft;		/* available bits in raw data */
-	tmsize_t old_tif_rawcc;         /* value of tif_rawcc at the end of the previous TIFLZWDecode() call */
-#endif
-	decodeFunc dec_decode;		/* regular or backwards compatible */
-	code_t* dec_codep;		/* current recognized code */
-	code_t* dec_oldcodep;		/* previously recognized code */
-	code_t* dec_free_entp;		/* next free entry */
-	code_t* dec_maxcodep;		/* max available entry */
-	code_t* dec_codetab;		/* kept separate for small machines */
-
-	/* Encoding specific data */
-	int     enc_oldcode;		/* last code encountered */
-	long    enc_checkpoint;		/* point at which to clear table */
-#define CHECK_GAP	10000		/* enc_ratio check interval */
-	long    enc_ratio;		/* current compression ratio */
-	long    enc_incount;		/* (input) data bytes encoded */
-	long    enc_outcount;		/* encoded (output) bytes */
-	uint8*  enc_rawlimit;		/* bound on tif_rawdata buffer */
-	hash_t* enc_hashtab;		/* kept separate for small machines */
+typedef struct
+{
+    LZWBaseState base;
+
+    /* Decoding specific data */
+    long dec_nbitsmask;     /* lzw_nbits 1 bits, right adjusted */
+    tmsize_t dec_restart;   /* restart count */
+    uint64_t dec_bitsleft;  /* available bits in raw data */
+    tmsize_t old_tif_rawcc; /* value of tif_rawcc at the end of the previous
+                               TIFLZWDecode() call */
+    decodeFunc dec_decode;  /* regular or backwards compatible */
+    code_t *dec_codep;      /* current recognized code */
+    code_t *dec_oldcodep;   /* previously recognized code */
+    code_t *dec_free_entp;  /* next free entry */
+    code_t *dec_maxcodep;   /* max available entry */
+    code_t *dec_codetab;    /* kept separate for small machines */
+    int read_error; /* whether a read error has occurred, and which should cause
+                       further reads in the same strip/tile to be aborted */
+
+    /* Encoding specific data */
+    int enc_oldcode;         /* last code encountered */
+    tmsize_t enc_checkpoint; /* point at which to clear table */
+#define CHECK_GAP 10000      /* enc_ratio check interval */
+    tmsize_t enc_ratio;      /* current compression ratio */
+    tmsize_t enc_incount;    /* (input) data bytes encoded */
+    tmsize_t enc_outcount;   /* encoded (output) bytes */
+    uint8_t *enc_rawlimit;   /* bound on tif_rawdata buffer */
+    hash_t *enc_hashtab;     /* kept separate for small machines */
 } LZWCodecState;
 
-#define LZWState(tif)		((LZWBaseState*) (tif)->tif_data)
-#define DecoderState(tif)	((LZWCodecState*) LZWState(tif))
-#define EncoderState(tif)	((LZWCodecState*) LZWState(tif))
+#define LZWState(tif) ((LZWBaseState *)(tif)->tif_data)
+#define DecoderState(tif) ((LZWCodecState *)LZWState(tif))
+#define EncoderState(tif) ((LZWCodecState *)LZWState(tif))
 
-static int LZWDecode(TIFF* tif, uint8* op0, tmsize_t occ0, uint16 s);
+static int LZWDecode(TIFF *tif, uint8_t *op0, tmsize_t occ0, uint16_t s);
 #ifdef LZW_COMPAT
-static int LZWDecodeCompat(TIFF* tif, uint8* op0, tmsize_t occ0, uint16 s);
+static int LZWDecodeCompat(TIFF *tif, uint8_t *op0, tmsize_t occ0, uint16_t s);
 #endif
-static void cl_hash(LZWCodecState*);
+static void cl_hash(LZWCodecState *);
 
 /*
  * LZW Decoder.
  */
 
-#ifdef LZW_CHECKEOS
-/*
- * This check shouldn't be necessary because each
- * strip is suppose to be terminated with CODE_EOI.
- */
-#define	NextCode(_tif, _sp, _bp, _code, _get) {				\
-	if ((_sp)->dec_bitsleft < (uint64)nbits) {			\
-		TIFFWarningExt(_tif->tif_clientdata, module,		\
-		    "LZWDecode: Strip %d not terminated with EOI code", \
-		    _tif->tif_curstrip);				\
-		_code = CODE_EOI;					\
-	} else {							\
-		_get(_sp,_bp,_code);					\
-		(_sp)->dec_bitsleft -= nbits;				\
-	}								\
-}
-#else
-#define	NextCode(tif, sp, bp, code, get) get(sp, bp, code)
-#endif
-
-static int
-LZWFixupTags(TIFF* tif)
+static int LZWFixupTags(TIFF *tif)
 {
-	(void) tif;
-	return (1);
+    (void)tif;
+    return (1);
 }
 
-static int
-LZWSetupDecode(TIFF* tif)
+static int LZWSetupDecode(TIFF *tif)
 {
-	static const char module[] = "LZWSetupDecode";
-	LZWCodecState* sp = DecoderState(tif);
-	int code;
-
-	if( sp == NULL )
-	{
-		/*
-		 * Allocate state block so tag methods have storage to record
-		 * values.
-		*/
-		tif->tif_data = (uint8*) _TIFFmalloc(sizeof(LZWCodecState));
-		if (tif->tif_data == NULL)
-		{
-			TIFFErrorExt(tif->tif_clientdata, module, "No space for LZW state block");
-			return (0);
-		}
-
-		sp = DecoderState(tif);
-		sp->dec_codetab = NULL;
-		sp->dec_decode = NULL;
-
-		/*
-		 * Setup predictor setup.
-		 */
-		(void) TIFFPredictorInit(tif);
-	}
-
-	if (sp->dec_codetab == NULL) {
-		sp->dec_codetab = (code_t*)_TIFFmalloc(CSIZE*sizeof (code_t));
-		if (sp->dec_codetab == NULL) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-				     "No space for LZW code table");
-			return (0);
-		}
-		/*
-		 * Pre-load the table.
-		 */
-		code = 255;
-		do {
-			sp->dec_codetab[code].value = (unsigned char)code;
-			sp->dec_codetab[code].firstchar = (unsigned char)code;
-			sp->dec_codetab[code].length = 1;
-			sp->dec_codetab[code].next = NULL;
-		} while (code--);
-		/*
-		 * Zero-out the unused entries
-                 */
-                /* Silence false positive */
-                /* coverity[overrun-buffer-arg] */
-                 _TIFFmemset(&sp->dec_codetab[CODE_CLEAR], 0,
-			     (CODE_FIRST - CODE_CLEAR) * sizeof (code_t));
-	}
-	return (1);
+    static const char module[] = "LZWSetupDecode";
+    LZWCodecState *sp = DecoderState(tif);
+    int code;
+
+    if (sp == NULL)
+    {
+        /*
+         * Allocate state block so tag methods have storage to record
+         * values.
+         */
+        tif->tif_data = (uint8_t *)_TIFFmallocExt(tif, sizeof(LZWCodecState));
+        if (tif->tif_data == NULL)
+        {
+            TIFFErrorExtR(tif, module, "No space for LZW state block");
+            return (0);
+        }
+
+        sp = DecoderState(tif);
+        sp->dec_codetab = NULL;
+        sp->dec_decode = NULL;
+
+        /*
+         * Setup predictor setup.
+         */
+        (void)TIFFPredictorInit(tif);
+    }
+
+    if (sp->dec_codetab == NULL)
+    {
+        sp->dec_codetab = (code_t *)_TIFFmallocExt(tif, CSIZE * sizeof(code_t));
+        if (sp->dec_codetab == NULL)
+        {
+            TIFFErrorExtR(tif, module, "No space for LZW code table");
+            return (0);
+        }
+        /*
+         * Pre-load the table.
+         */
+        code = 255;
+        do
+        {
+            sp->dec_codetab[code].firstchar = (unsigned char)code;
+            sp->dec_codetab[code].value = (unsigned char)code;
+            sp->dec_codetab[code].repeated = true;
+            sp->dec_codetab[code].length = 1;
+            sp->dec_codetab[code].next = NULL;
+        } while (code--);
+        /*
+         * Zero-out the unused entries  */
+        /* Silence false positive */
+        /* coverity[overrun-buffer-arg] */
+        memset(&sp->dec_codetab[CODE_CLEAR], 0,
+               (CODE_FIRST - CODE_CLEAR) * sizeof(code_t));
+    }
+    return (1);
 }
 
 /*
  * Setup state for decoding a strip.
  */
-static int
-LZWPreDecode(TIFF* tif, uint16 s)
+static int LZWPreDecode(TIFF *tif, uint16_t s)
 {
-	static const char module[] = "LZWPreDecode";
-	LZWCodecState *sp = DecoderState(tif);
-
-	(void) s;
-	assert(sp != NULL);
-	if( sp->dec_codetab == NULL )
+    static const char module[] = "LZWPreDecode";
+    LZWCodecState *sp = DecoderState(tif);
+
+    (void)s;
+    assert(sp != NULL);
+    if (sp->dec_codetab == NULL)
+    {
+        tif->tif_setupdecode(tif);
+        if (sp->dec_codetab == NULL)
+            return (0);
+    }
+
+    /*
+     * Check for old bit-reversed codes.
+     */
+    if (tif->tif_rawcc >= 2 && tif->tif_rawdata[0] == 0 &&
+        (tif->tif_rawdata[1] & 0x1))
+    {
+#ifdef LZW_COMPAT
+        if (!sp->dec_decode)
         {
-            tif->tif_setupdecode( tif );
-	    if( sp->dec_codetab == NULL )
-		return (0);
+            TIFFWarningExtR(tif, module, "Old-style LZW codes, convert file");
+            /*
+             * Override default decoding methods with
+             * ones that deal with the old coding.
+             * Otherwise the predictor versions set
+             * above will call the compatibility routines
+             * through the dec_decode method.
+             */
+            tif->tif_decoderow = LZWDecodeCompat;
+            tif->tif_decodestrip = LZWDecodeCompat;
+            tif->tif_decodetile = LZWDecodeCompat;
+            /*
+             * If doing horizontal differencing, must
+             * re-setup the predictor logic since we
+             * switched the basic decoder methods...
+             */
+            (*tif->tif_setupdecode)(tif);
+            sp->dec_decode = LZWDecodeCompat;
         }
-
-	/*
-	 * Check for old bit-reversed codes.
-	 */
-	if (tif->tif_rawcc >= 2 &&
-	    tif->tif_rawdata[0] == 0 && (tif->tif_rawdata[1] & 0x1)) {
-#ifdef LZW_COMPAT
-		if (!sp->dec_decode) {
-			TIFFWarningExt(tif->tif_clientdata, module,
-			    "Old-style LZW codes, convert file");
-			/*
-			 * Override default decoding methods with
-			 * ones that deal with the old coding.
-			 * Otherwise the predictor versions set
-			 * above will call the compatibility routines
-			 * through the dec_decode method.
-			 */
-			tif->tif_decoderow = LZWDecodeCompat;
-			tif->tif_decodestrip = LZWDecodeCompat;
-			tif->tif_decodetile = LZWDecodeCompat;
-			/*
-			 * If doing horizontal differencing, must
-			 * re-setup the predictor logic since we
-			 * switched the basic decoder methods...
-			 */
-			(*tif->tif_setupdecode)(tif);
-			sp->dec_decode = LZWDecodeCompat;
-		}
-		sp->lzw_maxcode = MAXCODE(BITS_MIN);
-#else /* !LZW_COMPAT */
-		if (!sp->dec_decode) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Old-style LZW codes not supported");
-			sp->dec_decode = LZWDecode;
-		}
-		return (0);
-#endif/* !LZW_COMPAT */
-	} else {
-		sp->lzw_maxcode = MAXCODE(BITS_MIN)-1;
-		sp->dec_decode = LZWDecode;
-	}
-	sp->lzw_nbits = BITS_MIN;
-	sp->lzw_nextbits = 0;
-	sp->lzw_nextdata = 0;
-
-	sp->dec_restart = 0;
-	sp->dec_nbitsmask = MAXCODE(BITS_MIN);
-#ifdef LZW_CHECKEOS
-	sp->dec_bitsleft = 0;
-        sp->old_tif_rawcc = 0;
-#endif
-	sp->dec_free_entp = sp->dec_codetab + CODE_FIRST;
-	/*
-	 * Zero entries that are not yet filled in.  We do
-	 * this to guard against bogus input data that causes
-	 * us to index into undefined entries.  If you can
-	 * come up with a way to safely bounds-check input codes
-	 * while decoding then you can remove this operation.
-	 */
-	_TIFFmemset(sp->dec_free_entp, 0, (CSIZE-CODE_FIRST)*sizeof (code_t));
-	sp->dec_oldcodep = &sp->dec_codetab[-1];
-	sp->dec_maxcodep = &sp->dec_codetab[sp->dec_nbitsmask-1];
-	return (1);
+        sp->lzw_maxcode = MAXCODE(BITS_MIN);
+#else  /* !LZW_COMPAT */
+        if (!sp->dec_decode)
+        {
+            TIFFErrorExtR(tif, module, "Old-style LZW codes not supported");
+            sp->dec_decode = LZWDecode;
+        }
+        return (0);
+#endif /* !LZW_COMPAT */
+    }
+    else
+    {
+        sp->lzw_maxcode = MAXCODE(BITS_MIN) - 1;
+        sp->dec_decode = LZWDecode;
+    }
+    sp->lzw_nbits = BITS_MIN;
+    sp->lzw_nextbits = 0;
+    sp->lzw_nextdata = 0;
+
+    sp->dec_restart = 0;
+    sp->dec_nbitsmask = MAXCODE(BITS_MIN);
+    sp->dec_bitsleft = 0;
+    sp->old_tif_rawcc = 0;
+    sp->dec_free_entp = sp->dec_codetab - 1; // + CODE_FIRST;
+    /*
+     * Zero entries that are not yet filled in.  We do
+     * this to guard against bogus input data that causes
+     * us to index into undefined entries.  If you can
+     * come up with a way to safely bounds-check input codes
+     * while decoding then you can remove this operation.
+     */
+    sp->dec_oldcodep = &sp->dec_codetab[0];
+    sp->dec_maxcodep = &sp->dec_codetab[sp->dec_nbitsmask - 1];
+    sp->read_error = 0;
+    return (1);
 }
 
 /*
  * Decode a "hunk of data".
  */
-#define	GetNextCode(sp, bp, code) {				\
-	nextdata = (nextdata<<8) | *(bp)++;			\
-	nextbits += 8;						\
-	if (nextbits < nbits) {					\
-		nextdata = (nextdata<<8) | *(bp)++;		\
-		nextbits += 8;					\
-	}							\
-	code = (hcode_t)((nextdata >> (nextbits-nbits)) & nbitsmask);	\
-	nextbits -= nbits;					\
+
+/* Get the next 32 or 64-bit from the input data */
+#ifdef WORDS_BIGENDIAN
+#define GetNextData(nextdata, bp) memcpy(&nextdata, bp, sizeof(nextdata))
+#elif SIZEOF_WORDTYPE == 8
+#if defined(__GNUC__) && defined(__x86_64__)
+#define GetNextData(nextdata, bp)                                              \
+    nextdata = __builtin_bswap64(*(uint64_t *)(bp))
+#elif defined(_M_X64)
+#define GetNextData(nextdata, bp) nextdata = _byteswap_uint64(*(uint64_t *)(bp))
+#elif defined(__GNUC__)
+#define GetNextData(nextdata, bp)                                              \
+    memcpy(&nextdata, bp, sizeof(nextdata));                                   \
+    nextdata = __builtin_bswap64(nextdata)
+#else
+#define GetNextData(nextdata, bp)                                              \
+    nextdata = (((uint64_t)bp[0]) << 56) | (((uint64_t)bp[1]) << 48) |         \
+               (((uint64_t)bp[2]) << 40) | (((uint64_t)bp[3]) << 32) |         \
+               (((uint64_t)bp[4]) << 24) | (((uint64_t)bp[5]) << 16) |         \
+               (((uint64_t)bp[6]) << 8) | (((uint64_t)bp[7]))
+#endif
+#elif SIZEOF_WORDTYPE == 4
+#if defined(__GNUC__) && defined(__i386__)
+#define GetNextData(nextdata, bp)                                              \
+    nextdata = __builtin_bswap32(*(uint32_t *)(bp))
+#elif defined(_M_X86)
+#define GetNextData(nextdata, bp)                                              \
+    nextdata = _byteswap_ulong(*(unsigned long *)(bp))
+#elif defined(__GNUC__)
+#define GetNextData(nextdata, bp)                                              \
+    memcpy(&nextdata, bp, sizeof(nextdata));                                   \
+    nextdata = __builtin_bswap32(nextdata)
+#else
+#define GetNextData(nextdata, bp)                                              \
+    nextdata = (((uint32_t)bp[0]) << 24) | (((uint32_t)bp[1]) << 16) |         \
+               (((uint32_t)bp[2]) << 8) | (((uint32_t)bp[3]))
+#endif
+#else
+#error "Unhandled SIZEOF_WORDTYPE"
+#endif
+
+#define GetNextCodeLZW()                                                       \
+    do                                                                         \
+    {                                                                          \
+        nextbits -= nbits;                                                     \
+        if (nextbits < 0)                                                      \
+        {                                                                      \
+            if (dec_bitsleft >= 8 * SIZEOF_WORDTYPE)                           \
+            {                                                                  \
+                unsigned codetmp = (unsigned)(nextdata << (-nextbits));        \
+                GetNextData(nextdata, bp);                                     \
+                bp += SIZEOF_WORDTYPE;                                         \
+                nextbits += 8 * SIZEOF_WORDTYPE;                               \
+                dec_bitsleft -= 8 * SIZEOF_WORDTYPE;                           \
+                code = (WordType)((codetmp | (nextdata >> nextbits)) &         \
+                                  nbitsmask);                                  \
+                break;                                                         \
+            }                                                                  \
+            else                                                               \
+            {                                                                  \
+                if (dec_bitsleft < 8)                                          \
+                {                                                              \
+                    goto no_eoi;                                               \
+                }                                                              \
+                nextdata = (nextdata << 8) | *(bp)++;                          \
+                nextbits += 8;                                                 \
+                dec_bitsleft -= 8;                                             \
+                if (nextbits < 0)                                              \
+                {                                                              \
+                    if (dec_bitsleft < 8)                                      \
+                    {                                                          \
+                        goto no_eoi;                                           \
+                    }                                                          \
+                    nextdata = (nextdata << 8) | *(bp)++;                      \
+                    nextbits += 8;                                             \
+                    dec_bitsleft -= 8;                                         \
+                }                                                              \
+            }                                                                  \
+        }                                                                      \
+        code = (WordType)((nextdata >> nextbits) & nbitsmask);                 \
+    } while (0)
+
+static int LZWDecode(TIFF *tif, uint8_t *op0, tmsize_t occ0, uint16_t s)
+{
+    static const char module[] = "LZWDecode";
+    LZWCodecState *sp = DecoderState(tif);
+    uint8_t *op = (uint8_t *)op0;
+    tmsize_t occ = occ0;
+    uint8_t *bp;
+    long nbits, nextbits, nbitsmask;
+    WordType nextdata;
+    code_t *free_entp, *maxcodep, *oldcodep;
+
+    (void)s;
+    assert(sp != NULL);
+    assert(sp->dec_codetab != NULL);
+
+    if (sp->read_error)
+    {
+        TIFFErrorExtR(tif, module,
+                      "LZWDecode: Scanline %" PRIu32 " cannot be read due to "
+                      "previous error",
+                      tif->tif_row);
+        return 0;
+    }
+
+    /*
+     * Restart interrupted output operation.
+     */
+    if (sp->dec_restart)
+    {
+        tmsize_t residue;
+
+        code_t *codep = sp->dec_codep;
+        residue = codep->length - sp->dec_restart;
+        if (residue > occ)
+        {
+            /*
+             * Residue from previous decode is sufficient
+             * to satisfy decode request.  Skip to the
+             * start of the decoded string, place decoded
+             * values in the output buffer, and return.
+             */
+            sp->dec_restart += occ;
+            do
+            {
+                codep = codep->next;
+            } while (--residue > occ && codep);
+            if (codep)
+            {
+                uint8_t *tp = op + occ;
+                do
+                {
+                    *--tp = codep->value;
+                    codep = codep->next;
+                } while (--occ && codep);
+            }
+            return (1);
+        }
+        /*
+         * Residue satisfies only part of the decode request.
+         */
+        op += residue;
+        occ -= residue;
+        uint8_t *tp = op;
+        do
+        {
+            *--tp = codep->value;
+            codep = codep->next;
+        } while (--residue && codep);
+        sp->dec_restart = 0;
+    }
+
+    bp = (uint8_t *)tif->tif_rawcp;
+    sp->dec_bitsleft += (((uint64_t)tif->tif_rawcc - sp->old_tif_rawcc) << 3);
+    uint64_t dec_bitsleft = sp->dec_bitsleft;
+    nbits = sp->lzw_nbits;
+    nextdata = sp->lzw_nextdata;
+    nextbits = sp->lzw_nextbits;
+    nbitsmask = sp->dec_nbitsmask;
+    oldcodep = sp->dec_oldcodep;
+    free_entp = sp->dec_free_entp;
+    maxcodep = sp->dec_maxcodep;
+    code_t *const dec_codetab = sp->dec_codetab;
+    code_t *codep;
+
+    if (occ == 0)
+    {
+        goto after_loop;
+    }
+
+begin:
+{
+    WordType code;
+    GetNextCodeLZW();
+    codep = dec_codetab + code;
+    if (code >= CODE_FIRST)
+        goto code_above_or_equal_to_258;
+    if (code < 256)
+        goto code_below_256;
+    if (code == CODE_EOI)
+        goto after_loop;
+    goto code_clear;
+
+code_below_256:
+{
+    if (codep > free_entp)
+        goto error_code;
+    free_entp->next = oldcodep;
+    free_entp->firstchar = oldcodep->firstchar;
+    free_entp->length = oldcodep->length + 1;
+    free_entp->value = (uint8_t)code;
+    free_entp->repeated =
+        (bool)(oldcodep->repeated & (oldcodep->value == code));
+    if (++free_entp > maxcodep)
+    {
+        if (++nbits > BITS_MAX) /* should not happen for a conformant encoder */
+            nbits = BITS_MAX;
+        nbitsmask = MAXCODE(nbits);
+        maxcodep = dec_codetab + nbitsmask - 1;
+        if (free_entp >= &dec_codetab[CSIZE])
+        {
+            /* At that point, the next valid states are either EOI or a */
+            /* CODE_CLEAR. If a regular code is read, at the next */
+            /* attempt at registering a new entry, we will error out */
+            /* due to setting free_entp before any valid code */
+            free_entp = dec_codetab - 1;
+        }
+    }
+    oldcodep = codep;
+    *op++ = (uint8_t)code;
+    occ--;
+    if (occ == 0)
+        goto after_loop;
+    goto begin;
 }
 
-static void
-codeLoop(TIFF* tif, const char* module)
+code_above_or_equal_to_258:
 {
-	TIFFErrorExt(tif->tif_clientdata, module,
-	    "Bogus encoding, loop in the code table; scanline %d",
-	    tif->tif_row);
+    /*
+     * Add the new entry to the code table.
+     */
+
+    if (codep >= free_entp)
+    {
+        if (codep != free_entp)
+            goto error_code;
+        free_entp->value = oldcodep->firstchar;
+    }
+    else
+    {
+        free_entp->value = codep->firstchar;
+    }
+    free_entp->repeated =
+        (bool)(oldcodep->repeated & (oldcodep->value == free_entp->value));
+    free_entp->next = oldcodep;
+
+    free_entp->firstchar = oldcodep->firstchar;
+    free_entp->length = oldcodep->length + 1;
+    if (++free_entp > maxcodep)
+    {
+        if (++nbits > BITS_MAX) /* should not happen for a conformant encoder */
+            nbits = BITS_MAX;
+        nbitsmask = MAXCODE(nbits);
+        maxcodep = dec_codetab + nbitsmask - 1;
+        if (free_entp >= &dec_codetab[CSIZE])
+        {
+            /* At that point, the next valid states are either EOI or a */
+            /* CODE_CLEAR. If a regular code is read, at the next */
+            /* attempt at registering a new entry, we will error out */
+            /* due to setting free_entp before any valid code */
+            free_entp = dec_codetab - 1;
+        }
+    }
+    oldcodep = codep;
+
+    /*
+     * Code maps to a string, copy string
+     * value to output (written in reverse).
+     */
+    /* tiny bit faster on x86_64 to store in unsigned short than int */
+    unsigned short len = codep->length;
+
+    if (len < 3) /* equivalent to len == 2 given all other conditions */
+    {
+        if (occ <= 2)
+        {
+            if (occ == 2)
+            {
+                memcpy(op, &(codep->firstchar), 2);
+                op += 2;
+                occ -= 2;
+                goto after_loop;
+            }
+            goto too_short_buffer;
+        }
+
+        memcpy(op, &(codep->firstchar), 2);
+        op += 2;
+        occ -= 2;
+        goto begin; /* we can save the comparison occ > 0 */
+    }
+
+    if (len == 3)
+    {
+        if (occ <= 3)
+        {
+            if (occ == 3)
+            {
+                op[0] = codep->firstchar;
+                op[1] = codep->next->value;
+                op[2] = codep->value;
+                op += 3;
+                occ -= 3;
+                goto after_loop;
+            }
+            goto too_short_buffer;
+        }
+
+        op[0] = codep->firstchar;
+        op[1] = codep->next->value;
+        op[2] = codep->value;
+        op += 3;
+        occ -= 3;
+        goto begin; /* we can save the comparison occ > 0 */
+    }
+
+    if (len > occ)
+    {
+        goto too_short_buffer;
+    }
+
+    if (codep->repeated)
+    {
+        memset(op, codep->value, len);
+        op += len;
+        occ -= len;
+        if (occ == 0)
+            goto after_loop;
+        goto begin;
+    }
+
+    uint8_t *tp = op + len;
+
+    assert(len >= 4);
+
+    *--tp = codep->value;
+    codep = codep->next;
+    *--tp = codep->value;
+    codep = codep->next;
+    *--tp = codep->value;
+    codep = codep->next;
+    *--tp = codep->value;
+    if (tp > op)
+    {
+        do
+        {
+            codep = codep->next;
+            *--tp = codep->value;
+        } while (tp > op);
+    }
+
+    assert(occ >= len);
+    op += len;
+    occ -= len;
+    if (occ == 0)
+        goto after_loop;
+    goto begin;
 }
 
-static int
-LZWDecode(TIFF* tif, uint8* op0, tmsize_t occ0, uint16 s)
+code_clear:
 {
-	static const char module[] = "LZWDecode";
-	LZWCodecState *sp = DecoderState(tif);
-	char *op = (char*) op0;
-	long occ = (long) occ0;
-	char *tp;
-	unsigned char *bp;
-	hcode_t code;
-	int len;
-	long nbits, nextbits, nbitsmask;
-        unsigned long nextdata;
-	code_t *codep, *free_entp, *maxcodep, *oldcodep;
-
-	(void) s;
-	assert(sp != NULL);
-        assert(sp->dec_codetab != NULL);
-
-	/*
-	  Fail if value does not fit in long.
-	*/
-	if ((tmsize_t) occ != occ0)
-	        return (0);
-	/*
-	 * Restart interrupted output operation.
-	 */
-	if (sp->dec_restart) {
-		long residue;
-
-		codep = sp->dec_codep;
-		residue = codep->length - sp->dec_restart;
-		if (residue > occ) {
-			/*
-			 * Residue from previous decode is sufficient
-			 * to satisfy decode request.  Skip to the
-			 * start of the decoded string, place decoded
-			 * values in the output buffer, and return.
-			 */
-			sp->dec_restart += occ;
-			do {
-				codep = codep->next;
-			} while (--residue > occ && codep);
-			if (codep) {
-				tp = op + occ;
-				do {
-					*--tp = codep->value;
-					codep = codep->next;
-				} while (--occ && codep);
-			}
-			return (1);
-		}
-		/*
-		 * Residue satisfies only part of the decode request.
-		 */
-		op += residue;
-		occ -= residue;
-		tp = op;
-		do {
-			int t;
-			--tp;
-			t = codep->value;
-			codep = codep->next;
-			*tp = (char)t;
-		} while (--residue && codep);
-		sp->dec_restart = 0;
-	}
-
-	bp = (unsigned char *)tif->tif_rawcp;
-#ifdef LZW_CHECKEOS
-	sp->dec_bitsleft += (((uint64)tif->tif_rawcc - sp->old_tif_rawcc) << 3);
-#endif
-	nbits = sp->lzw_nbits;
-	nextdata = sp->lzw_nextdata;
-	nextbits = sp->lzw_nextbits;
-	nbitsmask = sp->dec_nbitsmask;
-	oldcodep = sp->dec_oldcodep;
-	free_entp = sp->dec_free_entp;
-	maxcodep = sp->dec_maxcodep;
-
-	while (occ > 0) {
-		NextCode(tif, sp, bp, code, GetNextCode);
-		if (code == CODE_EOI)
-			break;
-		if (code == CODE_CLEAR) {
-			do {
-				free_entp = sp->dec_codetab + CODE_FIRST;
-				_TIFFmemset(free_entp, 0,
-					    (CSIZE - CODE_FIRST) * sizeof (code_t));
-				nbits = BITS_MIN;
-				nbitsmask = MAXCODE(BITS_MIN);
-				maxcodep = sp->dec_codetab + nbitsmask-1;
-				NextCode(tif, sp, bp, code, GetNextCode);
-			} while (code == CODE_CLEAR);	/* consecutive CODE_CLEAR codes */
-			if (code == CODE_EOI)
-				break;
-			if (code > CODE_CLEAR) {
-				TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-				"LZWDecode: Corrupted LZW table at scanline %d",
-					     tif->tif_row);
-				return (0);
-			}
-			*op++ = (char)code;
-			occ--;
-			oldcodep = sp->dec_codetab + code;
-			continue;
-		}
-		codep = sp->dec_codetab + code;
-
-		/*
-		 * Add the new entry to the code table.
-		 */
-		if (free_entp < &sp->dec_codetab[0] ||
-		    free_entp >= &sp->dec_codetab[CSIZE]) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Corrupted LZW table at scanline %d",
-			    tif->tif_row);
-			return (0);
-		}
-
-		free_entp->next = oldcodep;
-		if (free_entp->next < &sp->dec_codetab[0] ||
-		    free_entp->next >= &sp->dec_codetab[CSIZE]) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Corrupted LZW table at scanline %d",
-			    tif->tif_row);
-			return (0);
-		}
-		free_entp->firstchar = free_entp->next->firstchar;
-		free_entp->length = free_entp->next->length+1;
-		free_entp->value = (codep < free_entp) ?
-		    codep->firstchar : free_entp->firstchar;
-		if (++free_entp > maxcodep) {
-			if (++nbits > BITS_MAX)		/* should not happen */
-				nbits = BITS_MAX;
-			nbitsmask = MAXCODE(nbits);
-			maxcodep = sp->dec_codetab + nbitsmask-1;
-		}
-		oldcodep = codep;
-		if (code >= 256) {
-			/*
-			 * Code maps to a string, copy string
-			 * value to output (written in reverse).
-			 */
-			if(codep->length == 0) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-				    "Wrong length of decoded string: "
-				    "data probably corrupted at scanline %d",
-				    tif->tif_row);
-				return (0);
-			}
-			if (codep->length > occ) {
-				/*
-				 * String is too long for decode buffer,
-				 * locate portion that will fit, copy to
-				 * the decode buffer, and setup restart
-				 * logic for the next decoding call.
-				 */
-				sp->dec_codep = codep;
-				do {
-					codep = codep->next;
-				} while (codep && codep->length > occ);
-				if (codep) {
-					sp->dec_restart = (long)occ;
-					tp = op + occ;
-					do  {
-						*--tp = codep->value;
-						codep = codep->next;
-					}  while (--occ && codep);
-					if (codep)
-						codeLoop(tif, module);
-				}
-				break;
-			}
-			len = codep->length;
-			tp = op + len;
-			do {
-				int t;
-				--tp;
-				t = codep->value;
-				codep = codep->next;
-				*tp = (char)t;
-			} while (codep && tp > op);
-			if (codep) {
-			    codeLoop(tif, module);
-			    break;
-			}
-			assert(occ >= len);
-			op += len;
-			occ -= len;
-		} else {
-			*op++ = (char)code;
-			occ--;
-		}
-	}
-
-	tif->tif_rawcc -= (tmsize_t)( (uint8*) bp - tif->tif_rawcp );
-	tif->tif_rawcp = (uint8*) bp;
-#ifdef LZW_CHECKEOS
-	sp->old_tif_rawcc = tif->tif_rawcc;
-#endif
-	sp->lzw_nbits = (unsigned short) nbits;
-	sp->lzw_nextdata = nextdata;
-	sp->lzw_nextbits = nextbits;
-	sp->dec_nbitsmask = nbitsmask;
-	sp->dec_oldcodep = oldcodep;
-	sp->dec_free_entp = free_entp;
-	sp->dec_maxcodep = maxcodep;
-
-	if (occ > 0) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-		TIFFErrorExt(tif->tif_clientdata, module,
-			"Not enough data at scanline %d (short %I64d bytes)",
-			     tif->tif_row, (unsigned __int64) occ);
-#else
-		TIFFErrorExt(tif->tif_clientdata, module,
-			"Not enough data at scanline %d (short %llu bytes)",
-			     tif->tif_row, (unsigned long long) occ);
-#endif
-		return (0);
-	}
-	return (1);
+    free_entp = dec_codetab + CODE_FIRST;
+    nbits = BITS_MIN;
+    nbitsmask = MAXCODE(BITS_MIN);
+    maxcodep = dec_codetab + nbitsmask - 1;
+    do
+    {
+        GetNextCodeLZW();
+    } while (code == CODE_CLEAR); /* consecutive CODE_CLEAR codes */
+    if (code == CODE_EOI)
+        goto after_loop;
+    if (code > CODE_EOI)
+    {
+        goto error_code;
+    }
+    *op++ = (uint8_t)code;
+    occ--;
+    oldcodep = dec_codetab + code;
+    if (occ == 0)
+        goto after_loop;
+    goto begin;
+}
+}
+
+too_short_buffer:
+{
+    /*
+     * String is too long for decode buffer,
+     * locate portion that will fit, copy to
+     * the decode buffer, and setup restart
+     * logic for the next decoding call.
+     */
+    sp->dec_codep = codep;
+    do
+    {
+        codep = codep->next;
+    } while (codep->length > occ);
+
+    sp->dec_restart = occ;
+    uint8_t *tp = op + occ;
+    do
+    {
+        *--tp = codep->value;
+        codep = codep->next;
+    } while (--occ);
+}
+
+after_loop:
+    tif->tif_rawcc -= (tmsize_t)((uint8_t *)bp - tif->tif_rawcp);
+    tif->tif_rawcp = (uint8_t *)bp;
+    sp->old_tif_rawcc = tif->tif_rawcc;
+    sp->dec_bitsleft = dec_bitsleft;
+    sp->lzw_nbits = (unsigned short)nbits;
+    sp->lzw_nextdata = nextdata;
+    sp->lzw_nextbits = nextbits;
+    sp->dec_nbitsmask = nbitsmask;
+    sp->dec_oldcodep = oldcodep;
+    sp->dec_free_entp = free_entp;
+    sp->dec_maxcodep = maxcodep;
+
+    if (occ > 0)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Not enough data at scanline %" PRIu32 " (short %" PRIu64
+                      " bytes)",
+                      tif->tif_row, (uint64_t)occ);
+        return (0);
+    }
+    return (1);
+
+no_eoi:
+    sp->read_error = 1;
+    TIFFErrorExtR(tif, module,
+                  "LZWDecode: Strip %" PRIu32 " not terminated with EOI code",
+                  tif->tif_curstrip);
+    return 0;
+error_code:
+    sp->read_error = 1;
+    TIFFErrorExtR(tif, tif->tif_name, "Using code not yet in table");
+    return 0;
 }
 
 #ifdef LZW_COMPAT
+
 /*
- * Decode a "hunk of data" for old images.
+ * This check shouldn't be necessary because each
+ * strip is suppose to be terminated with CODE_EOI.
  */
-#define	GetNextCodeCompat(sp, bp, code) {			\
-	nextdata |= (unsigned long) *(bp)++ << nextbits;	\
-	nextbits += 8;						\
-	if (nextbits < nbits) {					\
-		nextdata |= (unsigned long) *(bp)++ << nextbits;\
-		nextbits += 8;					\
-	}							\
-	code = (hcode_t)(nextdata & nbitsmask);			\
-	nextdata >>= nbits;					\
-	nextbits -= nbits;					\
-}
+#define NextCode(_tif, _sp, _bp, _code, _get, dec_bitsleft)                    \
+    {                                                                          \
+        if (dec_bitsleft < (uint64_t)nbits)                                    \
+        {                                                                      \
+            TIFFWarningExtR(_tif, module,                                      \
+                            "LZWDecode: Strip %" PRIu32                        \
+                            " not terminated with EOI code",                   \
+                            _tif->tif_curstrip);                               \
+            _code = CODE_EOI;                                                  \
+        }                                                                      \
+        else                                                                   \
+        {                                                                      \
+            _get(_sp, _bp, _code);                                             \
+            dec_bitsleft -= nbits;                                             \
+        }                                                                      \
+    }
 
-static int
-LZWDecodeCompat(TIFF* tif, uint8* op0, tmsize_t occ0, uint16 s)
+/*
+ * Decode a "hunk of data" for old images.
+ */
+#define GetNextCodeCompat(sp, bp, code)                                        \
+    {                                                                          \
+        nextdata |= (unsigned long)*(bp)++ << nextbits;                        \
+        nextbits += 8;                                                         \
+        if (nextbits < nbits)                                                  \
+        {                                                                      \
+            nextdata |= (unsigned long)*(bp)++ << nextbits;                    \
+            nextbits += 8;                                                     \
+        }                                                                      \
+        code = (hcode_t)(nextdata & nbitsmask);                                \
+        nextdata >>= nbits;                                                    \
+        nextbits -= nbits;                                                     \
+    }
+
+static int LZWDecodeCompat(TIFF *tif, uint8_t *op0, tmsize_t occ0, uint16_t s)
 {
-	static const char module[] = "LZWDecodeCompat";
-	LZWCodecState *sp = DecoderState(tif);
-	char *op = (char*) op0;
-	long occ = (long) occ0;
-	char *tp;
-	unsigned char *bp;
-	int code, nbits;
-	int len;
-	long nextbits, nextdata, nbitsmask;
-	code_t *codep, *free_entp, *maxcodep, *oldcodep;
-
-	(void) s;
-	assert(sp != NULL);
-
-	/*
-	  Fail if value does not fit in long.
-	*/
-	if ((tmsize_t) occ != occ0)
-	        return (0);
-
-	/*
-	 * Restart interrupted output operation.
-	 */
-	if (sp->dec_restart) {
-		long residue;
-
-		codep = sp->dec_codep;
-		residue = codep->length - sp->dec_restart;
-		if (residue > occ) {
-			/*
-			 * Residue from previous decode is sufficient
-			 * to satisfy decode request.  Skip to the
-			 * start of the decoded string, place decoded
-			 * values in the output buffer, and return.
-			 */
-			sp->dec_restart += occ;
-			do {
-				codep = codep->next;
-			} while (--residue > occ);
-			tp = op + occ;
-			do {
-				*--tp = codep->value;
-				codep = codep->next;
-			} while (--occ);
-			return (1);
-		}
-		/*
-		 * Residue satisfies only part of the decode request.
-		 */
-		op += residue;
-		occ -= residue;
-		tp = op;
-		do {
-			*--tp = codep->value;
-			codep = codep->next;
-		} while (--residue);
-		sp->dec_restart = 0;
-	}
-
-	bp = (unsigned char *)tif->tif_rawcp;
-#ifdef LZW_CHECKEOS
-	sp->dec_bitsleft += (((uint64)tif->tif_rawcc - sp->old_tif_rawcc) << 3);
-#endif
-	nbits = sp->lzw_nbits;
-	nextdata = sp->lzw_nextdata;
-	nextbits = sp->lzw_nextbits;
-	nbitsmask = sp->dec_nbitsmask;
-	oldcodep = sp->dec_oldcodep;
-	free_entp = sp->dec_free_entp;
-	maxcodep = sp->dec_maxcodep;
-
-	while (occ > 0) {
-		NextCode(tif, sp, bp, code, GetNextCodeCompat);
-		if (code == CODE_EOI)
-			break;
-		if (code == CODE_CLEAR) {
-			do {
-				free_entp = sp->dec_codetab + CODE_FIRST;
-				_TIFFmemset(free_entp, 0,
-					    (CSIZE - CODE_FIRST) * sizeof (code_t));
-				nbits = BITS_MIN;
-				nbitsmask = MAXCODE(BITS_MIN);
-				maxcodep = sp->dec_codetab + nbitsmask;
-				NextCode(tif, sp, bp, code, GetNextCodeCompat);
-			} while (code == CODE_CLEAR);	/* consecutive CODE_CLEAR codes */
-			if (code == CODE_EOI)
-				break;
-			if (code > CODE_CLEAR) {
-				TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-				"LZWDecode: Corrupted LZW table at scanline %d",
-					     tif->tif_row);
-				return (0);
-			}
-			*op++ = (char)code;
-			occ--;
-			oldcodep = sp->dec_codetab + code;
-			continue;
-		}
-		codep = sp->dec_codetab + code;
-
-		/*
-		 * Add the new entry to the code table.
-		 */
-		if (free_entp < &sp->dec_codetab[0] ||
-		    free_entp >= &sp->dec_codetab[CSIZE]) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Corrupted LZW table at scanline %d", tif->tif_row);
-			return (0);
-		}
-
-		free_entp->next = oldcodep;
-		if (free_entp->next < &sp->dec_codetab[0] ||
-		    free_entp->next >= &sp->dec_codetab[CSIZE]) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Corrupted LZW table at scanline %d", tif->tif_row);
-			return (0);
-		}
-		free_entp->firstchar = free_entp->next->firstchar;
-		free_entp->length = free_entp->next->length+1;
-		free_entp->value = (codep < free_entp) ?
-		    codep->firstchar : free_entp->firstchar;
-		if (++free_entp > maxcodep) {
-			if (++nbits > BITS_MAX)		/* should not happen */
-				nbits = BITS_MAX;
-			nbitsmask = MAXCODE(nbits);
-			maxcodep = sp->dec_codetab + nbitsmask;
-		}
-		oldcodep = codep;
-		if (code >= 256) {
-			/*
-			 * Code maps to a string, copy string
-			 * value to output (written in reverse).
-			 */
-			if(codep->length == 0) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-				    "Wrong length of decoded "
-				    "string: data probably corrupted at scanline %d",
-				    tif->tif_row);
-				return (0);
-			}
-			if (codep->length > occ) {
-				/*
-				 * String is too long for decode buffer,
-				 * locate portion that will fit, copy to
-				 * the decode buffer, and setup restart
-				 * logic for the next decoding call.
-				 */
-				sp->dec_codep = codep;
-				do {
-					codep = codep->next;
-				} while (codep->length > occ);
-				sp->dec_restart = occ;
-				tp = op + occ;
-				do  {
-					*--tp = codep->value;
-					codep = codep->next;
-				}  while (--occ);
-				break;
-			}
-			len = codep->length;
-			tp = op + len;
-			do {
-				int t;
-				--tp;
-				t = codep->value;
-				codep = codep->next;
-				*tp = (char)t;
-			} while (codep && tp > op);
-			assert(occ >= len);
-			op += len;
-			occ -= len;
-		} else {
-			*op++ = (char)code;
-			occ--;
-		}
-	}
-
-	tif->tif_rawcc -= (tmsize_t)( (uint8*) bp - tif->tif_rawcp );
-	tif->tif_rawcp = (uint8*) bp;
-#ifdef LZW_CHECKEOS
-	sp->old_tif_rawcc = tif->tif_rawcc;
-#endif
-	sp->lzw_nbits = (unsigned short)nbits;
-	sp->lzw_nextdata = nextdata;
-	sp->lzw_nextbits = nextbits;
-	sp->dec_nbitsmask = nbitsmask;
-	sp->dec_oldcodep = oldcodep;
-	sp->dec_free_entp = free_entp;
-	sp->dec_maxcodep = maxcodep;
-
-	if (occ > 0) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-		TIFFErrorExt(tif->tif_clientdata, module,
-			"Not enough data at scanline %d (short %I64d bytes)",
-			     tif->tif_row, (unsigned __int64) occ);
-#else
-		TIFFErrorExt(tif->tif_clientdata, module,
-			"Not enough data at scanline %d (short %llu bytes)",
-			     tif->tif_row, (unsigned long long) occ);
-#endif
-		return (0);
-	}
-	return (1);
+    static const char module[] = "LZWDecodeCompat";
+    LZWCodecState *sp = DecoderState(tif);
+    uint8_t *op = (uint8_t *)op0;
+    tmsize_t occ = occ0;
+    uint8_t *tp;
+    uint8_t *bp;
+    int code, nbits;
+    int len;
+    long nextbits, nbitsmask;
+    WordType nextdata;
+    code_t *codep, *free_entp, *maxcodep, *oldcodep;
+
+    (void)s;
+    assert(sp != NULL);
+
+    /*
+     * Restart interrupted output operation.
+     */
+    if (sp->dec_restart)
+    {
+        tmsize_t residue;
+
+        codep = sp->dec_codep;
+        residue = codep->length - sp->dec_restart;
+        if (residue > occ)
+        {
+            /*
+             * Residue from previous decode is sufficient
+             * to satisfy decode request.  Skip to the
+             * start of the decoded string, place decoded
+             * values in the output buffer, and return.
+             */
+            sp->dec_restart += occ;
+            do
+            {
+                codep = codep->next;
+            } while (--residue > occ);
+            tp = op + occ;
+            do
+            {
+                *--tp = codep->value;
+                codep = codep->next;
+            } while (--occ);
+            return (1);
+        }
+        /*
+         * Residue satisfies only part of the decode request.
+         */
+        op += residue;
+        occ -= residue;
+        tp = op;
+        do
+        {
+            *--tp = codep->value;
+            codep = codep->next;
+        } while (--residue);
+        sp->dec_restart = 0;
+    }
+
+    bp = (uint8_t *)tif->tif_rawcp;
+
+    sp->dec_bitsleft += (((uint64_t)tif->tif_rawcc - sp->old_tif_rawcc) << 3);
+    uint64_t dec_bitsleft = sp->dec_bitsleft;
+
+    nbits = sp->lzw_nbits;
+    nextdata = sp->lzw_nextdata;
+    nextbits = sp->lzw_nextbits;
+    nbitsmask = sp->dec_nbitsmask;
+    oldcodep = sp->dec_oldcodep;
+    free_entp = sp->dec_free_entp;
+    maxcodep = sp->dec_maxcodep;
+
+    while (occ > 0)
+    {
+        NextCode(tif, sp, bp, code, GetNextCodeCompat, dec_bitsleft);
+        if (code == CODE_EOI)
+            break;
+        if (code == CODE_CLEAR)
+        {
+            do
+            {
+                free_entp = sp->dec_codetab + CODE_FIRST;
+                _TIFFmemset(free_entp, 0,
+                            (CSIZE - CODE_FIRST) * sizeof(code_t));
+                nbits = BITS_MIN;
+                nbitsmask = MAXCODE(BITS_MIN);
+                maxcodep = sp->dec_codetab + nbitsmask;
+                NextCode(tif, sp, bp, code, GetNextCodeCompat, dec_bitsleft);
+            } while (code == CODE_CLEAR); /* consecutive CODE_CLEAR codes */
+            if (code == CODE_EOI)
+                break;
+            if (code > CODE_CLEAR)
+            {
+                TIFFErrorExtR(
+                    tif, tif->tif_name,
+                    "LZWDecode: Corrupted LZW table at scanline %" PRIu32,
+                    tif->tif_row);
+                return (0);
+            }
+            *op++ = (uint8_t)code;
+            occ--;
+            oldcodep = sp->dec_codetab + code;
+            continue;
+        }
+        codep = sp->dec_codetab + code;
+
+        /*
+         * Add the new entry to the code table.
+         */
+        if (free_entp < &sp->dec_codetab[0] ||
+            free_entp >= &sp->dec_codetab[CSIZE])
+        {
+            TIFFErrorExtR(tif, module,
+                          "Corrupted LZW table at scanline %" PRIu32,
+                          tif->tif_row);
+            return (0);
+        }
+
+        free_entp->next = oldcodep;
+        if (free_entp->next < &sp->dec_codetab[0] ||
+            free_entp->next >= &sp->dec_codetab[CSIZE])
+        {
+            TIFFErrorExtR(tif, module,
+                          "Corrupted LZW table at scanline %" PRIu32,
+                          tif->tif_row);
+            return (0);
+        }
+        free_entp->firstchar = free_entp->next->firstchar;
+        free_entp->length = free_entp->next->length + 1;
+        free_entp->value =
+            (codep < free_entp) ? codep->firstchar : free_entp->firstchar;
+        if (++free_entp > maxcodep)
+        {
+            if (++nbits > BITS_MAX) /* should not happen */
+                nbits = BITS_MAX;
+            nbitsmask = MAXCODE(nbits);
+            maxcodep = sp->dec_codetab + nbitsmask;
+        }
+        oldcodep = codep;
+        if (code >= 256)
+        {
+            /*
+             * Code maps to a string, copy string
+             * value to output (written in reverse).
+             */
+            if (codep->length == 0)
+            {
+                TIFFErrorExtR(
+                    tif, module,
+                    "Wrong length of decoded "
+                    "string: data probably corrupted at scanline %" PRIu32,
+                    tif->tif_row);
+                return (0);
+            }
+            if (codep->length > occ)
+            {
+                /*
+                 * String is too long for decode buffer,
+                 * locate portion that will fit, copy to
+                 * the decode buffer, and setup restart
+                 * logic for the next decoding call.
+                 */
+                sp->dec_codep = codep;
+                do
+                {
+                    codep = codep->next;
+                } while (codep->length > occ);
+                sp->dec_restart = occ;
+                tp = op + occ;
+                do
+                {
+                    *--tp = codep->value;
+                    codep = codep->next;
+                } while (--occ);
+                break;
+            }
+            len = codep->length;
+            tp = op + len;
+            do
+            {
+                *--tp = codep->value;
+                codep = codep->next;
+            } while (codep && tp > op);
+            assert(occ >= len);
+            op += len;
+            occ -= len;
+        }
+        else
+        {
+            *op++ = (uint8_t)code;
+            occ--;
+        }
+    }
+
+    tif->tif_rawcc -= (tmsize_t)((uint8_t *)bp - tif->tif_rawcp);
+    tif->tif_rawcp = (uint8_t *)bp;
+
+    sp->old_tif_rawcc = tif->tif_rawcc;
+    sp->dec_bitsleft = dec_bitsleft;
+
+    sp->lzw_nbits = (unsigned short)nbits;
+    sp->lzw_nextdata = nextdata;
+    sp->lzw_nextbits = nextbits;
+    sp->dec_nbitsmask = nbitsmask;
+    sp->dec_oldcodep = oldcodep;
+    sp->dec_free_entp = free_entp;
+    sp->dec_maxcodep = maxcodep;
+
+    if (occ > 0)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Not enough data at scanline %" PRIu32 " (short %" PRIu64
+                      " bytes)",
+                      tif->tif_row, (uint64_t)occ);
+        return (0);
+    }
+    return (1);
 }
 #endif /* LZW_COMPAT */
 
@@ -809,393 +1023,416 @@ LZWDecodeCompat(TIFF* tif, uint8* op0, tmsize_t occ0, uint16 s)
  * LZW Encoding.
  */
 
-static int
-LZWSetupEncode(TIFF* tif)
+static int LZWSetupEncode(TIFF *tif)
 {
-	static const char module[] = "LZWSetupEncode";
-	LZWCodecState* sp = EncoderState(tif);
-
-	assert(sp != NULL);
-	sp->enc_hashtab = (hash_t*) _TIFFmalloc(HSIZE*sizeof (hash_t));
-	if (sp->enc_hashtab == NULL) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "No space for LZW hash table");
-		return (0);
-	}
-	return (1);
+    static const char module[] = "LZWSetupEncode";
+    LZWCodecState *sp = EncoderState(tif);
+
+    assert(sp != NULL);
+    sp->enc_hashtab = (hash_t *)_TIFFmallocExt(tif, HSIZE * sizeof(hash_t));
+    if (sp->enc_hashtab == NULL)
+    {
+        TIFFErrorExtR(tif, module, "No space for LZW hash table");
+        return (0);
+    }
+    return (1);
 }
 
 /*
  * Reset encoding state at the start of a strip.
  */
-static int
-LZWPreEncode(TIFF* tif, uint16 s)
+static int LZWPreEncode(TIFF *tif, uint16_t s)
 {
-	LZWCodecState *sp = EncoderState(tif);
-
-	(void) s;
-	assert(sp != NULL);
-
-	if( sp->enc_hashtab == NULL )
-        {
-            tif->tif_setupencode( tif );
-        }
-
-	sp->lzw_nbits = BITS_MIN;
-	sp->lzw_maxcode = MAXCODE(BITS_MIN);
-	sp->lzw_free_ent = CODE_FIRST;
-	sp->lzw_nextbits = 0;
-	sp->lzw_nextdata = 0;
-	sp->enc_checkpoint = CHECK_GAP;
-	sp->enc_ratio = 0;
-	sp->enc_incount = 0;
-	sp->enc_outcount = 0;
-	/*
-	 * The 4 here insures there is space for 2 max-sized
-	 * codes in LZWEncode and LZWPostDecode.
-	 */
-	sp->enc_rawlimit = tif->tif_rawdata + tif->tif_rawdatasize-1 - 4;
-	cl_hash(sp);		/* clear hash table */
-	sp->enc_oldcode = (hcode_t) -1;	/* generates CODE_CLEAR in LZWEncode */
-	return (1);
+    LZWCodecState *sp = EncoderState(tif);
+
+    (void)s;
+    assert(sp != NULL);
+
+    if (sp->enc_hashtab == NULL)
+    {
+        tif->tif_setupencode(tif);
+    }
+
+    sp->lzw_nbits = BITS_MIN;
+    sp->lzw_maxcode = MAXCODE(BITS_MIN);
+    sp->lzw_free_ent = CODE_FIRST;
+    sp->lzw_nextbits = 0;
+    sp->lzw_nextdata = 0;
+    sp->enc_checkpoint = CHECK_GAP;
+    sp->enc_ratio = 0;
+    sp->enc_incount = 0;
+    sp->enc_outcount = 0;
+    /*
+     * The 4 here insures there is space for 2 max-sized
+     * codes in LZWEncode and LZWPostDecode.
+     */
+    sp->enc_rawlimit = tif->tif_rawdata + tif->tif_rawdatasize - 1 - 4;
+    cl_hash(sp);                   /* clear hash table */
+    sp->enc_oldcode = (hcode_t)-1; /* generates CODE_CLEAR in LZWEncode */
+    return (1);
 }
 
-#define	CALCRATIO(sp, rat) {					\
-	if (incount > 0x007fffff) { /* NB: shift will overflow */\
-		rat = outcount >> 8;				\
-		rat = (rat == 0 ? 0x7fffffff : incount/rat);	\
-	} else							\
-		rat = (incount<<8) / outcount;			\
-}
+#define CALCRATIO(sp, rat)                                                     \
+    {                                                                          \
+        if (incount > 0x007fffff)                                              \
+        { /* NB: shift will overflow */                                        \
+            rat = outcount >> 8;                                               \
+            rat = (rat == 0 ? 0x7fffffff : incount / rat);                     \
+        }                                                                      \
+        else                                                                   \
+            rat = (incount << 8) / outcount;                                   \
+    }
 
 /* Explicit 0xff masking to make icc -check=conversions happy */
-#define	PutNextCode(op, c) {					\
-	nextdata = (nextdata << nbits) | c;			\
-	nextbits += nbits;					\
-	*op++ = (unsigned char)((nextdata >> (nextbits-8))&0xff);		\
-	nextbits -= 8;						\
-	if (nextbits >= 8) {					\
-		*op++ = (unsigned char)((nextdata >> (nextbits-8))&0xff);	\
-		nextbits -= 8;					\
-	}							\
-	outcount += nbits;					\
-}
+#define PutNextCode(op, c)                                                     \
+    {                                                                          \
+        nextdata = (nextdata << nbits) | c;                                    \
+        nextbits += nbits;                                                     \
+        *op++ = (unsigned char)((nextdata >> (nextbits - 8)) & 0xff);          \
+        nextbits -= 8;                                                         \
+        if (nextbits >= 8)                                                     \
+        {                                                                      \
+            *op++ = (unsigned char)((nextdata >> (nextbits - 8)) & 0xff);      \
+            nextbits -= 8;                                                     \
+        }                                                                      \
+        outcount += nbits;                                                     \
+    }
 
 /*
  * Encode a chunk of pixels.
  *
- * Uses an open addressing double hashing (no chaining) on the 
+ * Uses an open addressing double hashing (no chaining) on the
  * prefix code/next character combination.  We do a variant of
  * Knuth's algorithm D (vol. 3, sec. 6.4) along with G. Knott's
  * relatively-prime secondary probe.  Here, the modular division
- * first probe is gives way to a faster exclusive-or manipulation. 
+ * first probe is gives way to a faster exclusive-or manipulation.
  * Also do block compression with an adaptive reset, whereby the
  * code table is cleared when the compression ratio decreases,
  * but after the table fills.  The variable-length output codes
  * are re-sized at this point, and a CODE_CLEAR is generated
- * for the decoder. 
+ * for the decoder.
  */
-static int
-LZWEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int LZWEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	register LZWCodecState *sp = EncoderState(tif);
-	register long fcode;
-	register hash_t *hp;
-	register int h, c;
-	hcode_t ent;
-	long disp;
-	long incount, outcount, checkpoint;
-	unsigned long nextdata;
-        long nextbits;
-	int free_ent, maxcode, nbits;
-	uint8* op;
-	uint8* limit;
-
-	(void) s;
-	if (sp == NULL)
-		return (0);
-
-        assert(sp->enc_hashtab != NULL);
-
-	/*
-	 * Load local state.
-	 */
-	incount = sp->enc_incount;
-	outcount = sp->enc_outcount;
-	checkpoint = sp->enc_checkpoint;
-	nextdata = sp->lzw_nextdata;
-	nextbits = sp->lzw_nextbits;
-	free_ent = sp->lzw_free_ent;
-	maxcode = sp->lzw_maxcode;
-	nbits = sp->lzw_nbits;
-	op = tif->tif_rawcp;
-	limit = sp->enc_rawlimit;
-	ent = (hcode_t)sp->enc_oldcode;
-
-	if (ent == (hcode_t) -1 && cc > 0) {
-		/*
-		 * NB: This is safe because it can only happen
-		 *     at the start of a strip where we know there
-		 *     is space in the data buffer.
-		 */
-		PutNextCode(op, CODE_CLEAR);
-		ent = *bp++; cc--; incount++;
-	}
-	while (cc > 0) {
-		c = *bp++; cc--; incount++;
-		fcode = ((long)c << BITS_MAX) + ent;
-		h = (c << HSHIFT) ^ ent;	/* xor hashing */
+    register LZWCodecState *sp = EncoderState(tif);
+    register long fcode;
+    register hash_t *hp;
+    register int h, c;
+    hcode_t ent;
+    long disp;
+    tmsize_t incount, outcount, checkpoint;
+    WordType nextdata;
+    long nextbits;
+    int free_ent, maxcode, nbits;
+    uint8_t *op;
+    uint8_t *limit;
+
+    (void)s;
+    if (sp == NULL)
+        return (0);
+
+    assert(sp->enc_hashtab != NULL);
+
+    /*
+     * Load local state.
+     */
+    incount = sp->enc_incount;
+    outcount = sp->enc_outcount;
+    checkpoint = sp->enc_checkpoint;
+    nextdata = sp->lzw_nextdata;
+    nextbits = sp->lzw_nextbits;
+    free_ent = sp->lzw_free_ent;
+    maxcode = sp->lzw_maxcode;
+    nbits = sp->lzw_nbits;
+    op = tif->tif_rawcp;
+    limit = sp->enc_rawlimit;
+    ent = (hcode_t)sp->enc_oldcode;
+
+    if (ent == (hcode_t)-1 && cc > 0)
+    {
+        /*
+         * NB: This is safe because it can only happen
+         *     at the start of a strip where we know there
+         *     is space in the data buffer.
+         */
+        PutNextCode(op, CODE_CLEAR);
+        ent = *bp++;
+        cc--;
+        incount++;
+    }
+    while (cc > 0)
+    {
+        c = *bp++;
+        cc--;
+        incount++;
+        fcode = ((long)c << BITS_MAX) + ent;
+        h = (c << HSHIFT) ^ ent; /* xor hashing */
 #ifdef _WINDOWS
-		/*
-		 * Check hash index for an overflow.
-		 */
-		if (h >= HSIZE)
-			h -= HSIZE;
+        /*
+         * Check hash index for an overflow.
+         */
+        if (h >= HSIZE)
+            h -= HSIZE;
 #endif
-		hp = &sp->enc_hashtab[h];
-		if (hp->hash == fcode) {
-			ent = hp->code;
-			continue;
-		}
-		if (hp->hash >= 0) {
-			/*
-			 * Primary hash failed, check secondary hash.
-			 */
-			disp = HSIZE - h;
-			if (h == 0)
-				disp = 1;
-			do {
-				/*
-				 * Avoid pointer arithmetic because of
-				 * wraparound problems with segments.
-				 */
-				if ((h -= disp) < 0)
-					h += HSIZE;
-				hp = &sp->enc_hashtab[h];
-				if (hp->hash == fcode) {
-					ent = hp->code;
-					goto hit;
-				}
-			} while (hp->hash >= 0);
-		}
-		/*
-		 * New entry, emit code and add to table.
-		 */
-		/*
-		 * Verify there is space in the buffer for the code
-		 * and any potential Clear code that might be emitted
-		 * below.  The value of limit is setup so that there
-		 * are at least 4 bytes free--room for 2 codes.
-		 */
-		if (op > limit) {
-			tif->tif_rawcc = (tmsize_t)(op - tif->tif_rawdata);
-			if( !TIFFFlushData1(tif) )
-                            return 0;
-			op = tif->tif_rawdata;
-		}
-		PutNextCode(op, ent);
-		ent = (hcode_t)c;
-		hp->code = (hcode_t)(free_ent++);
-		hp->hash = fcode;
-		if (free_ent == CODE_MAX-1) {
-			/* table is full, emit clear code and reset */
-			cl_hash(sp);
-			sp->enc_ratio = 0;
-			incount = 0;
-			outcount = 0;
-			free_ent = CODE_FIRST;
-			PutNextCode(op, CODE_CLEAR);
-			nbits = BITS_MIN;
-			maxcode = MAXCODE(BITS_MIN);
-		} else {
-			/*
-			 * If the next entry is going to be too big for
-			 * the code size, then increase it, if possible.
-			 */
-			if (free_ent > maxcode) {
-				nbits++;
-				assert(nbits <= BITS_MAX);
-				maxcode = (int) MAXCODE(nbits);
-			} else if (incount >= checkpoint) {
-				long rat;
-				/*
-				 * Check compression ratio and, if things seem
-				 * to be slipping, clear the hash table and
-				 * reset state.  The compression ratio is a
-				 * 24+8-bit fractional number.
-				 */
-				checkpoint = incount+CHECK_GAP;
-				CALCRATIO(sp, rat);
-				if (rat <= sp->enc_ratio) {
-					cl_hash(sp);
-					sp->enc_ratio = 0;
-					incount = 0;
-					outcount = 0;
-					free_ent = CODE_FIRST;
-					PutNextCode(op, CODE_CLEAR);
-					nbits = BITS_MIN;
-					maxcode = MAXCODE(BITS_MIN);
-				} else
-					sp->enc_ratio = rat;
-			}
-		}
-	hit:
-		;
-	}
-
-	/*
-	 * Restore global state.
-	 */
-	sp->enc_incount = incount;
-	sp->enc_outcount = outcount;
-	sp->enc_checkpoint = checkpoint;
-	sp->enc_oldcode = ent;
-	sp->lzw_nextdata = nextdata;
-	sp->lzw_nextbits = nextbits;
-	sp->lzw_free_ent = (unsigned short)free_ent;
-	sp->lzw_maxcode = (unsigned short)maxcode;
-	sp->lzw_nbits = (unsigned short)nbits;
-	tif->tif_rawcp = op;
-	return (1);
+        hp = &sp->enc_hashtab[h];
+        if (hp->hash == fcode)
+        {
+            ent = hp->code;
+            continue;
+        }
+        if (hp->hash >= 0)
+        {
+            /*
+             * Primary hash failed, check secondary hash.
+             */
+            disp = HSIZE - h;
+            if (h == 0)
+                disp = 1;
+            do
+            {
+                /*
+                 * Avoid pointer arithmetic because of
+                 * wraparound problems with segments.
+                 */
+                if ((h -= disp) < 0)
+                    h += HSIZE;
+                hp = &sp->enc_hashtab[h];
+                if (hp->hash == fcode)
+                {
+                    ent = hp->code;
+                    goto hit;
+                }
+            } while (hp->hash >= 0);
+        }
+        /*
+         * New entry, emit code and add to table.
+         */
+        /*
+         * Verify there is space in the buffer for the code
+         * and any potential Clear code that might be emitted
+         * below.  The value of limit is setup so that there
+         * are at least 4 bytes free--room for 2 codes.
+         */
+        if (op > limit)
+        {
+            tif->tif_rawcc = (tmsize_t)(op - tif->tif_rawdata);
+            if (!TIFFFlushData1(tif))
+                return 0;
+            op = tif->tif_rawdata;
+        }
+        PutNextCode(op, ent);
+        ent = (hcode_t)c;
+        hp->code = (hcode_t)(free_ent++);
+        hp->hash = fcode;
+        if (free_ent == CODE_MAX - 1)
+        {
+            /* table is full, emit clear code and reset */
+            cl_hash(sp);
+            sp->enc_ratio = 0;
+            incount = 0;
+            outcount = 0;
+            free_ent = CODE_FIRST;
+            PutNextCode(op, CODE_CLEAR);
+            nbits = BITS_MIN;
+            maxcode = MAXCODE(BITS_MIN);
+        }
+        else
+        {
+            /*
+             * If the next entry is going to be too big for
+             * the code size, then increase it, if possible.
+             */
+            if (free_ent > maxcode)
+            {
+                nbits++;
+                assert(nbits <= BITS_MAX);
+                maxcode = (int)MAXCODE(nbits);
+            }
+            else if (incount >= checkpoint)
+            {
+                tmsize_t rat;
+                /*
+                 * Check compression ratio and, if things seem
+                 * to be slipping, clear the hash table and
+                 * reset state.  The compression ratio is a
+                 * 24+8-bit fractional number.
+                 */
+                checkpoint = incount + CHECK_GAP;
+                CALCRATIO(sp, rat);
+                if (rat <= sp->enc_ratio)
+                {
+                    cl_hash(sp);
+                    sp->enc_ratio = 0;
+                    incount = 0;
+                    outcount = 0;
+                    free_ent = CODE_FIRST;
+                    PutNextCode(op, CODE_CLEAR);
+                    nbits = BITS_MIN;
+                    maxcode = MAXCODE(BITS_MIN);
+                }
+                else
+                    sp->enc_ratio = rat;
+            }
+        }
+    hit:;
+    }
+
+    /*
+     * Restore global state.
+     */
+    sp->enc_incount = incount;
+    sp->enc_outcount = outcount;
+    sp->enc_checkpoint = checkpoint;
+    sp->enc_oldcode = ent;
+    sp->lzw_nextdata = nextdata;
+    sp->lzw_nextbits = nextbits;
+    sp->lzw_free_ent = (unsigned short)free_ent;
+    sp->lzw_maxcode = (unsigned short)maxcode;
+    sp->lzw_nbits = (unsigned short)nbits;
+    tif->tif_rawcp = op;
+    return (1);
 }
 
 /*
  * Finish off an encoded strip by flushing the last
  * string and tacking on an End Of Information code.
  */
-static int
-LZWPostEncode(TIFF* tif)
+static int LZWPostEncode(TIFF *tif)
 {
-	register LZWCodecState *sp = EncoderState(tif);
-	uint8* op = tif->tif_rawcp;
-	long nextbits = sp->lzw_nextbits;
-	unsigned long nextdata = sp->lzw_nextdata;
-	long outcount = sp->enc_outcount;
-	int nbits = sp->lzw_nbits;
-
-	if (op > sp->enc_rawlimit) {
-		tif->tif_rawcc = (tmsize_t)(op - tif->tif_rawdata);
-		if( !TIFFFlushData1(tif) )
-                    return 0;
-		op = tif->tif_rawdata;
-	}
-	if (sp->enc_oldcode != (hcode_t) -1) {
-                int free_ent = sp->lzw_free_ent;
-
-		PutNextCode(op, sp->enc_oldcode);
-		sp->enc_oldcode = (hcode_t) -1;
-                free_ent ++;
-
-                if (free_ent == CODE_MAX-1) {
-                        /* table is full, emit clear code and reset */
-                        outcount = 0;
-                        PutNextCode(op, CODE_CLEAR);
-                        nbits = BITS_MIN;
-                } else {
-                        /*
-                        * If the next entry is going to be too big for
-                        * the code size, then increase it, if possible.
-                        */
-                        if (free_ent > sp->lzw_maxcode) {
-                                nbits++;
-                                assert(nbits <= BITS_MAX);
-                        }
-                }
-	}
-	PutNextCode(op, CODE_EOI);
-        /* Explicit 0xff masking to make icc -check=conversions happy */
-	if (nextbits > 0) 
-		*op++ = (unsigned char)((nextdata << (8-nextbits))&0xff);
-	tif->tif_rawcc = (tmsize_t)(op - tif->tif_rawdata);
-	return (1);
+    register LZWCodecState *sp = EncoderState(tif);
+    uint8_t *op = tif->tif_rawcp;
+    long nextbits = sp->lzw_nextbits;
+    WordType nextdata = sp->lzw_nextdata;
+    tmsize_t outcount = sp->enc_outcount;
+    int nbits = sp->lzw_nbits;
+
+    if (op > sp->enc_rawlimit)
+    {
+        tif->tif_rawcc = (tmsize_t)(op - tif->tif_rawdata);
+        if (!TIFFFlushData1(tif))
+            return 0;
+        op = tif->tif_rawdata;
+    }
+    if (sp->enc_oldcode != (hcode_t)-1)
+    {
+        int free_ent = sp->lzw_free_ent;
+
+        PutNextCode(op, sp->enc_oldcode);
+        sp->enc_oldcode = (hcode_t)-1;
+        free_ent++;
+
+        if (free_ent == CODE_MAX - 1)
+        {
+            /* table is full, emit clear code and reset */
+            outcount = 0;
+            PutNextCode(op, CODE_CLEAR);
+            nbits = BITS_MIN;
+        }
+        else
+        {
+            /*
+             * If the next entry is going to be too big for
+             * the code size, then increase it, if possible.
+             */
+            if (free_ent > sp->lzw_maxcode)
+            {
+                nbits++;
+                assert(nbits <= BITS_MAX);
+            }
+        }
+    }
+    PutNextCode(op, CODE_EOI);
+    /* Explicit 0xff masking to make icc -check=conversions happy */
+    if (nextbits > 0)
+        *op++ = (unsigned char)((nextdata << (8 - nextbits)) & 0xff);
+    tif->tif_rawcc = (tmsize_t)(op - tif->tif_rawdata);
+    (void)outcount;
+    return (1);
 }
 
 /*
  * Reset encoding hash table.
  */
-static void
-cl_hash(LZWCodecState* sp)
+static void cl_hash(LZWCodecState *sp)
 {
-	register hash_t *hp = &sp->enc_hashtab[HSIZE-1];
-	register long i = HSIZE-8;
-
-	do {
-		i -= 8;
-		hp[-7].hash = -1;
-		hp[-6].hash = -1;
-		hp[-5].hash = -1;
-		hp[-4].hash = -1;
-		hp[-3].hash = -1;
-		hp[-2].hash = -1;
-		hp[-1].hash = -1;
-		hp[ 0].hash = -1;
-		hp -= 8;
-	} while (i >= 0);
-	for (i += 8; i > 0; i--, hp--)
-		hp->hash = -1;
+    register hash_t *hp = &sp->enc_hashtab[HSIZE - 1];
+    register long i = HSIZE - 8;
+
+    do
+    {
+        i -= 8;
+        hp[-7].hash = -1;
+        hp[-6].hash = -1;
+        hp[-5].hash = -1;
+        hp[-4].hash = -1;
+        hp[-3].hash = -1;
+        hp[-2].hash = -1;
+        hp[-1].hash = -1;
+        hp[0].hash = -1;
+        hp -= 8;
+    } while (i >= 0);
+    for (i += 8; i > 0; i--, hp--)
+        hp->hash = -1;
 }
 
-static void
-LZWCleanup(TIFF* tif)
+static void LZWCleanup(TIFF *tif)
 {
-	(void)TIFFPredictorCleanup(tif);
+    (void)TIFFPredictorCleanup(tif);
 
-	assert(tif->tif_data != 0);
+    assert(tif->tif_data != 0);
 
-	if (DecoderState(tif)->dec_codetab)
-		_TIFFfree(DecoderState(tif)->dec_codetab);
+    if (DecoderState(tif)->dec_codetab)
+        _TIFFfreeExt(tif, DecoderState(tif)->dec_codetab);
 
-	if (EncoderState(tif)->enc_hashtab)
-		_TIFFfree(EncoderState(tif)->enc_hashtab);
+    if (EncoderState(tif)->enc_hashtab)
+        _TIFFfreeExt(tif, EncoderState(tif)->enc_hashtab);
 
-	_TIFFfree(tif->tif_data);
-	tif->tif_data = NULL;
+    _TIFFfreeExt(tif, tif->tif_data);
+    tif->tif_data = NULL;
 
-	_TIFFSetDefaultCompressionState(tif);
+    _TIFFSetDefaultCompressionState(tif);
 }
 
-int
-TIFFInitLZW(TIFF* tif, int scheme)
+int TIFFInitLZW(TIFF *tif, int scheme)
 {
-	static const char module[] = "TIFFInitLZW";
-        (void)scheme;
-	assert(scheme == COMPRESSION_LZW);
-	/*
-	 * Allocate state block so tag methods have storage to record values.
-	 */
-	tif->tif_data = (uint8*) _TIFFmalloc(sizeof (LZWCodecState));
-	if (tif->tif_data == NULL)
-		goto bad;
-	DecoderState(tif)->dec_codetab = NULL;
-	DecoderState(tif)->dec_decode = NULL;
-	EncoderState(tif)->enc_hashtab = NULL;
-        LZWState(tif)->rw_mode = tif->tif_mode;
-
-	/*
-	 * Install codec methods.
-	 */
-	tif->tif_fixuptags = LZWFixupTags; 
-	tif->tif_setupdecode = LZWSetupDecode;
-	tif->tif_predecode = LZWPreDecode;
-	tif->tif_decoderow = LZWDecode;
-	tif->tif_decodestrip = LZWDecode;
-	tif->tif_decodetile = LZWDecode;
-	tif->tif_setupencode = LZWSetupEncode;
-	tif->tif_preencode = LZWPreEncode;
-	tif->tif_postencode = LZWPostEncode;
-	tif->tif_encoderow = LZWEncode;
-	tif->tif_encodestrip = LZWEncode;
-	tif->tif_encodetile = LZWEncode;
-	tif->tif_cleanup = LZWCleanup;
-	/*
-	 * Setup predictor setup.
-	 */
-	(void) TIFFPredictorInit(tif);
-	return (1);
+    static const char module[] = "TIFFInitLZW";
+    (void)scheme;
+    assert(scheme == COMPRESSION_LZW);
+    /*
+     * Allocate state block so tag methods have storage to record values.
+     */
+    tif->tif_data = (uint8_t *)_TIFFmallocExt(tif, sizeof(LZWCodecState));
+    if (tif->tif_data == NULL)
+        goto bad;
+    DecoderState(tif)->dec_codetab = NULL;
+    DecoderState(tif)->dec_decode = NULL;
+    EncoderState(tif)->enc_hashtab = NULL;
+    LZWState(tif)->rw_mode = tif->tif_mode;
+
+    /*
+     * Install codec methods.
+     */
+    tif->tif_fixuptags = LZWFixupTags;
+    tif->tif_setupdecode = LZWSetupDecode;
+    tif->tif_predecode = LZWPreDecode;
+    tif->tif_decoderow = LZWDecode;
+    tif->tif_decodestrip = LZWDecode;
+    tif->tif_decodetile = LZWDecode;
+    tif->tif_setupencode = LZWSetupEncode;
+    tif->tif_preencode = LZWPreEncode;
+    tif->tif_postencode = LZWPostEncode;
+    tif->tif_encoderow = LZWEncode;
+    tif->tif_encodestrip = LZWEncode;
+    tif->tif_encodetile = LZWEncode;
+    tif->tif_cleanup = LZWCleanup;
+    /*
+     * Setup predictor setup.
+     */
+    (void)TIFFPredictorInit(tif);
+    return (1);
 bad:
-	TIFFErrorExt(tif->tif_clientdata, module, 
-		     "No space for LZW state block");
-	return (0);
+    TIFFErrorExtR(tif, module, "No space for LZW state block");
+    return (0);
 }
 
 /*
@@ -1219,12 +1456,3 @@ TIFFInitLZW(TIFF* tif, int scheme)
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  */
 #endif /* LZW_SUPPORT */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_next.c b/3rdparty/libtiff/tif_next.c
index 0ba61aed3ae2..f000574ee748 100644
--- a/3rdparty/libtiff/tif_next.c
+++ b/3rdparty/libtiff/tif_next.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -30,158 +30,165 @@
  * NeXT 2-bit Grey Scale Compression Algorithm Support
  */
 
-#define SETPIXEL(op, v) {			\
-	switch (npixels++ & 3) {		\
-	case 0:	op[0]  = (unsigned char) ((v) << 6); break;	\
-	case 1:	op[0] |= (v) << 4; break;	\
-	case 2:	op[0] |= (v) << 2; break;	\
-	case 3:	*op++ |= (v);	   op_offset++; break;	\
-	}					\
-}
+#define SETPIXEL(op, v)                                                        \
+    {                                                                          \
+        switch (npixels++ & 3)                                                 \
+        {                                                                      \
+            case 0:                                                            \
+                op[0] = (unsigned char)((v) << 6);                             \
+                break;                                                         \
+            case 1:                                                            \
+                op[0] |= (v) << 4;                                             \
+                break;                                                         \
+            case 2:                                                            \
+                op[0] |= (v) << 2;                                             \
+                break;                                                         \
+            case 3:                                                            \
+                *op++ |= (v);                                                  \
+                op_offset++;                                                   \
+                break;                                                         \
+        }                                                                      \
+    }
 
-#define LITERALROW	0x00
-#define LITERALSPAN	0x40
-#define WHITE   	((1<<2)-1)
+#define LITERALROW 0x00
+#define LITERALSPAN 0x40
+#define WHITE ((1 << 2) - 1)
 
-static int
-NeXTDecode(TIFF* tif, uint8* buf, tmsize_t occ, uint16 s)
+static int NeXTDecode(TIFF *tif, uint8_t *buf, tmsize_t occ, uint16_t s)
 {
-	static const char module[] = "NeXTDecode";
-	unsigned char *bp, *op;
-	tmsize_t cc;
-	uint8* row;
-	tmsize_t scanline, n;
+    static const char module[] = "NeXTDecode";
+    unsigned char *bp, *op;
+    tmsize_t cc;
+    uint8_t *row;
+    tmsize_t scanline, n;
 
-	(void) s;
-	/*
-	 * Each scanline is assumed to start off as all
-	 * white (we assume a PhotometricInterpretation
-	 * of ``min-is-black'').
-	 */
-	for (op = (unsigned char*) buf, cc = occ; cc-- > 0;)
-		*op++ = 0xff;
+    (void)s;
+    /*
+     * Each scanline is assumed to start off as all
+     * white (we assume a PhotometricInterpretation
+     * of ``min-is-black'').
+     */
+    for (op = (unsigned char *)buf, cc = occ; cc-- > 0;)
+        *op++ = 0xff;
 
-	bp = (unsigned char *)tif->tif_rawcp;
-	cc = tif->tif_rawcc;
-	scanline = tif->tif_scanlinesize;
-	if (occ % scanline)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Fractional scanlines cannot be read");
-		return (0);
-	}
-	for (row = buf; cc > 0 && occ > 0; occ -= scanline, row += scanline) {
-		n = *bp++;
-		cc--;
-		switch (n) {
-		case LITERALROW:
-			/*
-			 * The entire scanline is given as literal values.
-			 */
-			if (cc < scanline)
-				goto bad;
-			_TIFFmemcpy(row, bp, scanline);
-			bp += scanline;
-			cc -= scanline;
-			break;
-		case LITERALSPAN: {
-			tmsize_t off;
-			/*
-			 * The scanline has a literal span that begins at some
-			 * offset.
-			 */
-			if( cc < 4 )
-				goto bad;
-			off = (bp[0] * 256) + bp[1];
-			n = (bp[2] * 256) + bp[3];
-			if (cc < 4+n || off+n > scanline)
-				goto bad;
-			_TIFFmemcpy(row+off, bp+4, n);
-			bp += 4+n;
-			cc -= 4+n;
-			break;
-		}
-		default: {
-			uint32 npixels = 0, grey;
-			tmsize_t op_offset = 0;
-			uint32 imagewidth = tif->tif_dir.td_imagewidth;
-            if( isTiled(tif) )
-                imagewidth = tif->tif_dir.td_tilewidth;
+    bp = (unsigned char *)tif->tif_rawcp;
+    cc = tif->tif_rawcc;
+    scanline = tif->tif_scanlinesize;
+    if (occ % scanline)
+    {
+        TIFFErrorExtR(tif, module, "Fractional scanlines cannot be read");
+        return (0);
+    }
+    for (row = buf; cc > 0 && occ > 0; occ -= scanline, row += scanline)
+    {
+        n = *bp++;
+        cc--;
+        switch (n)
+        {
+            case LITERALROW:
+                /*
+                 * The entire scanline is given as literal values.
+                 */
+                if (cc < scanline)
+                    goto bad;
+                _TIFFmemcpy(row, bp, scanline);
+                bp += scanline;
+                cc -= scanline;
+                break;
+            case LITERALSPAN:
+            {
+                tmsize_t off;
+                /*
+                 * The scanline has a literal span that begins at some
+                 * offset.
+                 */
+                if (cc < 4)
+                    goto bad;
+                off = (bp[0] * 256) + bp[1];
+                n = (bp[2] * 256) + bp[3];
+                if (cc < 4 + n || off + n > scanline)
+                    goto bad;
+                _TIFFmemcpy(row + off, bp + 4, n);
+                bp += 4 + n;
+                cc -= 4 + n;
+                break;
+            }
+            default:
+            {
+                uint32_t npixels = 0, grey;
+                tmsize_t op_offset = 0;
+                uint32_t imagewidth = tif->tif_dir.td_imagewidth;
+                if (isTiled(tif))
+                    imagewidth = tif->tif_dir.td_tilewidth;
 
-			/*
-			 * The scanline is composed of a sequence of constant
-			 * color ``runs''.  We shift into ``run mode'' and
-			 * interpret bytes as codes of the form
-			 * <color><npixels> until we've filled the scanline.
-			 */
-			op = row;
-			for (;;) {
-				grey = (uint32)((n>>6) & 0x3);
-				n &= 0x3f;
-				/*
-				 * Ensure the run does not exceed the scanline
-				 * bounds, potentially resulting in a security
-				 * issue.
-				 */
-				while (n-- > 0 && npixels < imagewidth && op_offset < scanline)
-					SETPIXEL(op, grey);
-				if (npixels >= imagewidth)
-					break;
-                if (op_offset >= scanline ) {
-                    TIFFErrorExt(tif->tif_clientdata, module, "Invalid data for scanline %ld",
-                        (long) tif->tif_row);
-                    return (0);
+                /*
+                 * The scanline is composed of a sequence of constant
+                 * color ``runs''.  We shift into ``run mode'' and
+                 * interpret bytes as codes of the form
+                 * <color><npixels> until we've filled the scanline.
+                 */
+                op = row;
+                for (;;)
+                {
+                    grey = (uint32_t)((n >> 6) & 0x3);
+                    n &= 0x3f;
+                    /*
+                     * Ensure the run does not exceed the scanline
+                     * bounds, potentially resulting in a security
+                     * issue.
+                     */
+                    while (n-- > 0 && npixels < imagewidth &&
+                           op_offset < scanline)
+                        SETPIXEL(op, grey);
+                    if (npixels >= imagewidth)
+                        break;
+                    if (op_offset >= scanline)
+                    {
+                        TIFFErrorExtR(tif, module,
+                                      "Invalid data for scanline %" PRIu32,
+                                      tif->tif_row);
+                        return (0);
+                    }
+                    if (cc == 0)
+                        goto bad;
+                    n = *bp++;
+                    cc--;
                 }
-				if (cc == 0)
-					goto bad;
-				n = *bp++;
-				cc--;
-			}
-			break;
-		}
-		}
-	}
-	tif->tif_rawcp = (uint8*) bp;
-	tif->tif_rawcc = cc;
-	return (1);
+                break;
+            }
+        }
+    }
+    tif->tif_rawcp = (uint8_t *)bp;
+    tif->tif_rawcc = cc;
+    return (1);
 bad:
-	TIFFErrorExt(tif->tif_clientdata, module, "Not enough data for scanline %ld",
-	    (long) tif->tif_row);
-	return (0);
+    TIFFErrorExtR(tif, module, "Not enough data for scanline %" PRIu32,
+                  tif->tif_row);
+    return (0);
 }
 
-static int
-NeXTPreDecode(TIFF* tif, uint16 s)
+static int NeXTPreDecode(TIFF *tif, uint16_t s)
 {
-	static const char module[] = "NeXTPreDecode";
-	TIFFDirectory *td = &tif->tif_dir;
-	(void)s;
+    static const char module[] = "NeXTPreDecode";
+    TIFFDirectory *td = &tif->tif_dir;
+    (void)s;
 
-	if( td->td_bitspersample != 2 )
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Unsupported BitsPerSample = %d",
-					 td->td_bitspersample);
-		return (0);
-	}
-	return (1);
+    if (td->td_bitspersample != 2)
+    {
+        TIFFErrorExtR(tif, module, "Unsupported BitsPerSample = %" PRIu16,
+                      td->td_bitspersample);
+        return (0);
+    }
+    return (1);
 }
-	
-int
-TIFFInitNeXT(TIFF* tif, int scheme)
+
+int TIFFInitNeXT(TIFF *tif, int scheme)
 {
-	(void) scheme;
-	tif->tif_predecode = NeXTPreDecode;  
-	tif->tif_decoderow = NeXTDecode;  
-	tif->tif_decodestrip = NeXTDecode;  
-	tif->tif_decodetile = NeXTDecode;
-	return (1);
+    (void)scheme;
+    tif->tif_predecode = NeXTPreDecode;
+    tif->tif_decoderow = NeXTDecode;
+    tif->tif_decodestrip = NeXTDecode;
+    tif->tif_decodetile = NeXTDecode;
+    return (1);
 }
 #endif /* NEXT_SUPPORT */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_ojpeg.c b/3rdparty/libtiff/tif_ojpeg.c
index 133d1f1c49c2..ea572091e507 100644
--- a/3rdparty/libtiff/tif_ojpeg.c
+++ b/3rdparty/libtiff/tif_ojpeg.c
@@ -43,79 +43,83 @@
 
 /* What is what, and what is not?
 
-   This decoder starts with an input stream, that is essentially the JpegInterchangeFormat
-   stream, if any, followed by the strile data, if any. This stream is read in
-   OJPEGReadByte and related functions.
-
-   It analyzes the start of this stream, until it encounters non-marker data, i.e.
-   compressed image data. Some of the header markers it sees have no actual content,
-   like the SOI marker, and APP/COM markers that really shouldn't even be there. Some
-   other markers do have content, and the valuable bits and pieces of information
-   in these markers are saved, checking all to verify that the stream is more or
-   less within expected bounds. This happens inside the OJPEGReadHeaderInfoSecStreamXxx
+   This decoder starts with an input stream, that is essentially the
+   JpegInterchangeFormat stream, if any, followed by the strile data, if any.
+   This stream is read in OJPEGReadByte and related functions.
+
+   It analyzes the start of this stream, until it encounters non-marker data,
+   i.e. compressed image data. Some of the header markers it sees have no actual
+   content, like the SOI marker, and APP/COM markers that really shouldn't even
+   be there. Some other markers do have content, and the valuable bits and
+   pieces of information in these markers are saved, checking all to verify that
+   the stream is more or less within expected bounds. This happens inside the
+   OJPEGReadHeaderInfoSecStreamXxx functions.
+
+   Some OJPEG imagery contains no valid JPEG header markers. This situation is
+   picked up on if we've seen no SOF marker when we're at the start of the
+   compressed image data. In this case, the tables are read from JpegXxxTables
+   tags, and the other bits and pieces of information is initialized to its most
+   basic value. This is implemented in the OJPEGReadHeaderInfoSecTablesXxx
    functions.
 
-   Some OJPEG imagery contains no valid JPEG header markers. This situation is picked
-   up on if we've seen no SOF marker when we're at the start of the compressed image
-   data. In this case, the tables are read from JpegXxxTables tags, and the other
-   bits and pieces of information is initialized to its most basic value. This is
-   implemented in the OJPEGReadHeaderInfoSecTablesXxx functions.
-
-   When this is complete, a good and valid JPEG header can be assembled, and this is
-   passed through to LibJpeg. When that's done, the remainder of the input stream, i.e.
-   the compressed image data, can be passed through unchanged. This is done in
-   OJPEGWriteStream functions.
-
-   LibTiff rightly expects to know the subsampling values before decompression. Just like
-   in new-style JPEG-in-TIFF, though, or even more so, actually, the YCbCrsubsampling
-   tag is notoriously unreliable. To correct these tag values with the ones inside
-   the JPEG stream, the first part of the input stream is pre-scanned in
-   OJPEGSubsamplingCorrect, making no note of any other data, reporting no warnings
-   or errors, up to the point where either these values are read, or it's clear they
-   aren't there. This means that some of the data is read twice, but we feel speed
-   in correcting these values is important enough to warrant this sacrifice. Although
-   there is currently no define or other configuration mechanism to disable this behavior,
-   the actual header scanning is build to robustly respond with error report if it
-   should encounter an uncorrected mismatch of subsampling values. See
+   When this is complete, a good and valid JPEG header can be assembled, and
+   this is passed through to LibJpeg. When that's done, the remainder of the
+   input stream, i.e. the compressed image data, can be passed through
+   unchanged. This is done in OJPEGWriteStream functions.
+
+   LibTiff rightly expects to know the subsampling values before decompression.
+   Just like in new-style JPEG-in-TIFF, though, or even more so, actually, the
+   YCbCrsubsampling tag is notoriously unreliable. To correct these tag values
+   with the ones inside the JPEG stream, the first part of the input stream is
+   pre-scanned in OJPEGSubsamplingCorrect, making no note of any other data,
+   reporting no warnings or errors, up to the point where either these values
+   are read, or it's clear they aren't there. This means that some of the data
+   is read twice, but we feel speed in correcting these values is important
+   enough to warrant this sacrifice. Although there is currently no define or
+   other configuration mechanism to disable this behavior, the actual header
+   scanning is build to robustly respond with error report if it should
+   encounter an uncorrected mismatch of subsampling values. See
    OJPEGReadHeaderInfoSecStreamSof.
 
-   The restart interval and restart markers are the most tricky part... The restart
-   interval can be specified in a tag. It can also be set inside the input JPEG stream.
-   It can be used inside the input JPEG stream. If reading from strile data, we've
-   consistently discovered the need to insert restart markers in between the different
-   striles, as is also probably the most likely interpretation of the original TIFF 6.0
-   specification. With all this setting of interval, and actual use of markers that is not
-   predictable at the time of valid JPEG header assembly, the restart thing may turn
-   out the Achilles heel of this implementation. Fortunately, most OJPEG writer vendors
-   succeed in reading back what they write, which may be the reason why we've been able
-   to discover ways that seem to work.
-
-   Some special provision is made for planarconfig separate OJPEG files. These seem
-   to consistently contain header info, a SOS marker, a plane, SOS marker, plane, SOS,
-   and plane. This may or may not be a valid JPEG configuration, we don't know and don't
-   care. We want LibTiff to be able to access the planes individually, without huge
-   buffering inside LibJpeg, anyway. So we compose headers to feed to LibJpeg, in this
-   case, that allow us to pass a single plane such that LibJpeg sees a valid
-   single-channel JPEG stream. Locating subsequent SOS markers, and thus subsequent
-   planes, is done inside OJPEGReadSecondarySos.
-
-   The benefit of the scheme is... that it works, basically. We know of no other that
-   does. It works without checking software tag, or otherwise going about things in an
-   OJPEG flavor specific manner. Instead, it is a single scheme, that covers the cases
-   with and without JpegInterchangeFormat, with and without striles, with part of
-   the header in JpegInterchangeFormat and remainder in first strile, etc. It is forgiving
-   and robust, may likely work with OJPEG flavors we've not seen yet, and makes most out
-   of the data.
-
-   Another nice side-effect is that a complete JPEG single valid stream is build if
-   planarconfig is not separate (vast majority). We may one day use that to build
-   converters to JPEG, and/or to new-style JPEG compression inside TIFF.
-
-   A disadvantage is the lack of random access to the individual striles. This is the
-   reason for much of the complicated restart-and-position stuff inside OJPEGPreDecode.
-   Applications would do well accessing all striles in order, as this will result in
-   a single sequential scan of the input stream, and no restarting of LibJpeg decoding
-   session.
+   The restart interval and restart markers are the most tricky part... The
+   restart interval can be specified in a tag. It can also be set inside the
+   input JPEG stream. It can be used inside the input JPEG stream. If reading
+   from strile data, we've consistently discovered the need to insert restart
+   markers in between the different striles, as is also probably the most likely
+   interpretation of the original TIFF 6.0 specification. With all this setting
+   of interval, and actual use of markers that is not predictable at the time of
+   valid JPEG header assembly, the restart thing may turn out the Achilles heel
+   of this implementation. Fortunately, most OJPEG writer vendors succeed in
+   reading back what they write, which may be the reason why we've been able to
+   discover ways that seem to work.
+
+   Some special provision is made for planarconfig separate OJPEG files. These
+   seem to consistently contain header info, a SOS marker, a plane, SOS marker,
+   plane, SOS, and plane. This may or may not be a valid JPEG configuration, we
+   don't know and don't care. We want LibTiff to be able to access the planes
+   individually, without huge buffering inside LibJpeg, anyway. So we compose
+   headers to feed to LibJpeg, in this case, that allow us to pass a single
+   plane such that LibJpeg sees a valid single-channel JPEG stream. Locating
+   subsequent SOS markers, and thus subsequent planes, is done inside
+   OJPEGReadSecondarySos.
+
+   The benefit of the scheme is... that it works, basically. We know of no other
+   that does. It works without checking software tag, or otherwise going about
+   things in an OJPEG flavor specific manner. Instead, it is a single scheme,
+   that covers the cases with and without JpegInterchangeFormat, with and
+   without striles, with part of the header in JpegInterchangeFormat and
+   remainder in first strile, etc. It is forgiving and robust, may likely work
+   with OJPEG flavors we've not seen yet, and makes most out of the data.
+
+   Another nice side-effect is that a complete JPEG single valid stream is build
+   if planarconfig is not separate (vast majority). We may one day use that to
+   build converters to JPEG, and/or to new-style JPEG compression inside TIFF.
+
+   A disadvantage is the lack of random access to the individual striles. This
+   is the reason for much of the complicated restart-and-position stuff inside
+   OJPEGPreDecode. Applications would do well accessing all striles in order, as
+   this will result in a single sequential scan of the input stream, and no
+   restarting of LibJpeg decoding session.
 */
 
 #define WIN32_LEAN_AND_MEAN
@@ -125,30 +129,31 @@
 #ifdef OJPEG_SUPPORT
 
 /* Configuration defines here are:
- * JPEG_ENCAP_EXTERNAL: The normal way to call libjpeg, uses longjump. In some environments,
- * 	like eg LibTiffDelphi, this is not possible. For this reason, the actual calls to
- * 	libjpeg, with longjump stuff, are encapsulated in dedicated functions. When
- * 	JPEG_ENCAP_EXTERNAL is defined, these encapsulating functions are declared external
- * 	to this unit, and can be defined elsewhere to use stuff other then longjump.
- * 	The default mode, without JPEG_ENCAP_EXTERNAL, implements the call encapsulators
- * 	here, internally, with normal longjump.
- * SETJMP, LONGJMP, JMP_BUF: On some machines/environments a longjump equivalent is
- * 	conveniently available, but still it may be worthwhile to use _setjmp or sigsetjmp
- * 	in place of plain setjmp. These macros will make it easier. It is useless
- * 	to fiddle with these if you define JPEG_ENCAP_EXTERNAL.
- * OJPEG_BUFFER: Define the size of the desired buffer here. Should be small enough so as to guarantee
- * 	instant processing, optimal streaming and optimal use of processor cache, but also big
- * 	enough so as to not result in significant call overhead. It should be at least a few
- * 	bytes to accommodate some structures (this is verified in asserts), but it would not be
- * 	sensible to make it this small anyway, and it should be at most 64K since it is indexed
- * 	with uint16. We recommend 2K.
- * EGYPTIANWALK: You could also define EGYPTIANWALK here, but it is not used anywhere and has
- * 	absolutely no effect. That is why most people insist the EGYPTIANWALK is a bit silly.
+ * JPEG_ENCAP_EXTERNAL: The normal way to call libjpeg, uses longjump. In some
+ * environments, like eg LibTiffDelphi, this is not possible. For this reason,
+ * the actual calls to libjpeg, with longjump stuff, are encapsulated in
+ * dedicated functions. When JPEG_ENCAP_EXTERNAL is defined, these encapsulating
+ * functions are declared external to this unit, and can be defined elsewhere to
+ * use stuff other then longjump. The default mode, without JPEG_ENCAP_EXTERNAL,
+ * implements the call encapsulators here, internally, with normal longjump.
+ * SETJMP, LONGJMP, JMP_BUF: On some machines/environments a longjump equivalent
+ * is conveniently available, but still it may be worthwhile to use _setjmp or
+ * sigsetjmp in place of plain setjmp. These macros will make it easier. It is
+ * useless to fiddle with these if you define JPEG_ENCAP_EXTERNAL. OJPEG_BUFFER:
+ * Define the size of the desired buffer here. Should be small enough so as to
+ * guarantee instant processing, optimal streaming and optimal use of processor
+ * cache, but also big enough so as to not result in significant call overhead.
+ * It should be at least a few bytes to accommodate some structures (this is
+ * verified in asserts), but it would not be sensible to make it this small
+ * anyway, and it should be at most 64K since it is indexed with uint16_t. We
+ * recommend 2K. EGYPTIANWALK: You could also define EGYPTIANWALK here, but it
+ * is not used anywhere and has absolutely no effect. That is why most people
+ * insist the EGYPTIANWALK is a bit silly.
  */
 
 /* define LIBJPEG_ENCAP_EXTERNAL */
 #define SETJMP(jbuf) setjmp(jbuf)
-#define LONGJMP(jbuf,code) longjmp(jbuf,code)
+#define LONGJMP(jbuf, code) longjmp(jbuf, code)
 #define JMP_BUF jmp_buf
 #define OJPEG_BUFFER 2048
 /* define EGYPTIANWALK */
@@ -166,22 +171,36 @@
 #define JPEG_MARKER_APP0 0xE0
 #define JPEG_MARKER_COM 0xFE
 
-#define FIELD_OJPEG_JPEGINTERCHANGEFORMAT (FIELD_CODEC+0)
-#define FIELD_OJPEG_JPEGINTERCHANGEFORMATLENGTH (FIELD_CODEC+1)
-#define FIELD_OJPEG_JPEGQTABLES (FIELD_CODEC+2)
-#define FIELD_OJPEG_JPEGDCTABLES (FIELD_CODEC+3)
-#define FIELD_OJPEG_JPEGACTABLES (FIELD_CODEC+4)
-#define FIELD_OJPEG_JPEGPROC (FIELD_CODEC+5)
-#define FIELD_OJPEG_JPEGRESTARTINTERVAL (FIELD_CODEC+6)
+#define FIELD_OJPEG_JPEGINTERCHANGEFORMAT (FIELD_CODEC + 0)
+#define FIELD_OJPEG_JPEGINTERCHANGEFORMATLENGTH (FIELD_CODEC + 1)
+#define FIELD_OJPEG_JPEGQTABLES (FIELD_CODEC + 2)
+#define FIELD_OJPEG_JPEGDCTABLES (FIELD_CODEC + 3)
+#define FIELD_OJPEG_JPEGACTABLES (FIELD_CODEC + 4)
+#define FIELD_OJPEG_JPEGPROC (FIELD_CODEC + 5)
+#define FIELD_OJPEG_JPEGRESTARTINTERVAL (FIELD_CODEC + 6)
 
 static const TIFFField ojpegFields[] = {
-	{TIFFTAG_JPEGIFOFFSET,1,1,TIFF_LONG8,0,TIFF_SETGET_UINT64,TIFF_SETGET_UNDEFINED,FIELD_OJPEG_JPEGINTERCHANGEFORMAT,TRUE,FALSE,"JpegInterchangeFormat",NULL},
-	{TIFFTAG_JPEGIFBYTECOUNT,1,1,TIFF_LONG8,0,TIFF_SETGET_UINT64,TIFF_SETGET_UNDEFINED,FIELD_OJPEG_JPEGINTERCHANGEFORMATLENGTH,TRUE,FALSE,"JpegInterchangeFormatLength",NULL},
-	{TIFFTAG_JPEGQTABLES,TIFF_VARIABLE2,TIFF_VARIABLE2,TIFF_LONG8,0,TIFF_SETGET_C32_UINT64,TIFF_SETGET_UNDEFINED,FIELD_OJPEG_JPEGQTABLES,FALSE,TRUE,"JpegQTables",NULL},
-	{TIFFTAG_JPEGDCTABLES,TIFF_VARIABLE2,TIFF_VARIABLE2,TIFF_LONG8,0,TIFF_SETGET_C32_UINT64,TIFF_SETGET_UNDEFINED,FIELD_OJPEG_JPEGDCTABLES,FALSE,TRUE,"JpegDcTables",NULL},
-	{TIFFTAG_JPEGACTABLES,TIFF_VARIABLE2,TIFF_VARIABLE2,TIFF_LONG8,0,TIFF_SETGET_C32_UINT64,TIFF_SETGET_UNDEFINED,FIELD_OJPEG_JPEGACTABLES,FALSE,TRUE,"JpegAcTables",NULL},
-	{TIFFTAG_JPEGPROC,1,1,TIFF_SHORT,0,TIFF_SETGET_UINT16,TIFF_SETGET_UNDEFINED,FIELD_OJPEG_JPEGPROC,FALSE,FALSE,"JpegProc",NULL},
-	{TIFFTAG_JPEGRESTARTINTERVAL,1,1,TIFF_SHORT,0,TIFF_SETGET_UINT16,TIFF_SETGET_UNDEFINED,FIELD_OJPEG_JPEGRESTARTINTERVAL,FALSE,FALSE,"JpegRestartInterval",NULL},
+    {TIFFTAG_JPEGIFOFFSET, 1, 1, TIFF_LONG8, 0, TIFF_SETGET_UINT64,
+     TIFF_SETGET_UNDEFINED, FIELD_OJPEG_JPEGINTERCHANGEFORMAT, TRUE, FALSE,
+     "JpegInterchangeFormat", NULL},
+    {TIFFTAG_JPEGIFBYTECOUNT, 1, 1, TIFF_LONG8, 0, TIFF_SETGET_UINT64,
+     TIFF_SETGET_UNDEFINED, FIELD_OJPEG_JPEGINTERCHANGEFORMATLENGTH, TRUE,
+     FALSE, "JpegInterchangeFormatLength", NULL},
+    {TIFFTAG_JPEGQTABLES, TIFF_VARIABLE2, TIFF_VARIABLE2, TIFF_LONG8, 0,
+     TIFF_SETGET_C32_UINT64, TIFF_SETGET_UNDEFINED, FIELD_OJPEG_JPEGQTABLES,
+     FALSE, TRUE, "JpegQTables", NULL},
+    {TIFFTAG_JPEGDCTABLES, TIFF_VARIABLE2, TIFF_VARIABLE2, TIFF_LONG8, 0,
+     TIFF_SETGET_C32_UINT64, TIFF_SETGET_UNDEFINED, FIELD_OJPEG_JPEGDCTABLES,
+     FALSE, TRUE, "JpegDcTables", NULL},
+    {TIFFTAG_JPEGACTABLES, TIFF_VARIABLE2, TIFF_VARIABLE2, TIFF_LONG8, 0,
+     TIFF_SETGET_C32_UINT64, TIFF_SETGET_UNDEFINED, FIELD_OJPEG_JPEGACTABLES,
+     FALSE, TRUE, "JpegAcTables", NULL},
+    {TIFFTAG_JPEGPROC, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16,
+     TIFF_SETGET_UNDEFINED, FIELD_OJPEG_JPEGPROC, FALSE, FALSE, "JpegProc",
+     NULL},
+    {TIFFTAG_JPEGRESTARTINTERVAL, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16,
+     TIFF_SETGET_UNDEFINED, FIELD_OJPEG_JPEGRESTARTINTERVAL, FALSE, FALSE,
+     "JpegRestartInterval", NULL},
 };
 
 #ifndef LIBJPEG_ENCAP_EXTERNAL
@@ -201,2405 +220,2596 @@ static const TIFFField ojpegFields[] = {
   a conflicting typedef given the headers which are included.
 */
 #if defined(__BORLANDC__) || defined(__MINGW32__)
-# define XMD_H 1
+#define XMD_H 1
 #endif
 
 /* Define "boolean" as unsigned char, not int, per Windows custom. */
 #if defined(__WIN32__) && !defined(__MINGW32__)
-# ifndef __RPCNDR_H__            /* don't conflict if rpcndr.h already read */
-   typedef unsigned char boolean;
-# endif
-# define HAVE_BOOLEAN            /* prevent jmorecfg.h from redefining it */
+#ifndef __RPCNDR_H__ /* don't conflict if rpcndr.h already read */
+typedef unsigned char boolean;
+#endif
+#define HAVE_BOOLEAN /* prevent jmorecfg.h from redefining it */
 #endif
 
-#include "jpeglib.h"
 #include "jerror.h"
+#include "jpeglib.h"
 
 typedef struct jpeg_error_mgr jpeg_error_mgr;
 typedef struct jpeg_common_struct jpeg_common_struct;
 typedef struct jpeg_decompress_struct jpeg_decompress_struct;
 typedef struct jpeg_source_mgr jpeg_source_mgr;
 
-typedef enum {
-	osibsNotSetYet,
-	osibsJpegInterchangeFormat,
-	osibsStrile,
-	osibsEof
+typedef enum
+{
+    osibsNotSetYet,
+    osibsJpegInterchangeFormat,
+    osibsStrile,
+    osibsEof
 } OJPEGStateInBufferSource;
 
-typedef enum {
-	ososSoi,
-	ososQTable0,ososQTable1,ososQTable2,ososQTable3,
-	ososDcTable0,ososDcTable1,ososDcTable2,ososDcTable3,
-	ososAcTable0,ososAcTable1,ososAcTable2,ososAcTable3,
-	ososDri,
-	ososSof,
-	ososSos,
-	ososCompressed,
-	ososRst,
-	ososEoi
+typedef enum
+{
+    ososSoi,
+    ososQTable0,
+    ososQTable1,
+    ososQTable2,
+    ososQTable3,
+    ososDcTable0,
+    ososDcTable1,
+    ososDcTable2,
+    ososDcTable3,
+    ososAcTable0,
+    ososAcTable1,
+    ososAcTable2,
+    ososAcTable3,
+    ososDri,
+    ososSof,
+    ososSos,
+    ososCompressed,
+    ososRst,
+    ososEoi
 } OJPEGStateOutState;
 
-typedef struct {
-	TIFF* tif;
-        int decoder_ok;
-        int error_in_raw_data_decoding;
-	#ifndef LIBJPEG_ENCAP_EXTERNAL
-	JMP_BUF exit_jmpbuf;
-	#endif
-	TIFFVGetMethod vgetparent;
-	TIFFVSetMethod vsetparent;
-	TIFFPrintMethod printdir;
-	uint64 file_size;
-	uint32 image_width;
-	uint32 image_length;
-	uint32 strile_width;
-	uint32 strile_length;
-	uint32 strile_length_total;
-	uint8 samples_per_pixel;
-	uint8 plane_sample_offset;
-	uint8 samples_per_pixel_per_plane;
-	uint64 jpeg_interchange_format;
-	uint64 jpeg_interchange_format_length;
-	uint8 jpeg_proc;
-	uint8 subsamplingcorrect;
-	uint8 subsamplingcorrect_done;
-	uint8 subsampling_tag;
-	uint8 subsampling_hor;
-	uint8 subsampling_ver;
-	uint8 subsampling_force_desubsampling_inside_decompression;
-	uint8 qtable_offset_count;
-	uint8 dctable_offset_count;
-	uint8 actable_offset_count;
-	uint64 qtable_offset[3];
-	uint64 dctable_offset[3];
-	uint64 actable_offset[3];
-	uint8* qtable[4];
-	uint8* dctable[4];
-	uint8* actable[4];
-	uint16 restart_interval;
-	uint8 restart_index;
-	uint8 sof_log;
-	uint8 sof_marker_id;
-	uint32 sof_x;
-	uint32 sof_y;
-	uint8 sof_c[3];
-	uint8 sof_hv[3];
-	uint8 sof_tq[3];
-	uint8 sos_cs[3];
-	uint8 sos_tda[3];
-	struct {
-		uint8 log;
-		OJPEGStateInBufferSource in_buffer_source;
-		uint32 in_buffer_next_strile;
-		uint64 in_buffer_file_pos;
-		uint64 in_buffer_file_togo;
-	} sos_end[3];
-	uint8 readheader_done;
-	uint8 writeheader_done;
-	uint16 write_cursample;
-	uint32 write_curstrile;
-	uint8 libjpeg_session_active;
-	uint8 libjpeg_jpeg_query_style;
-	jpeg_error_mgr libjpeg_jpeg_error_mgr;
-	jpeg_decompress_struct libjpeg_jpeg_decompress_struct;
-	jpeg_source_mgr libjpeg_jpeg_source_mgr;
-	uint8 subsampling_convert_log;
-	uint32 subsampling_convert_ylinelen;
-	uint32 subsampling_convert_ylines;
-	uint32 subsampling_convert_clinelen;
-	uint32 subsampling_convert_clines;
-	uint32 subsampling_convert_ybuflen;
-	uint32 subsampling_convert_cbuflen;
-	uint32 subsampling_convert_ycbcrbuflen;
-	uint8* subsampling_convert_ycbcrbuf;
-	uint8* subsampling_convert_ybuf;
-	uint8* subsampling_convert_cbbuf;
-	uint8* subsampling_convert_crbuf;
-	uint32 subsampling_convert_ycbcrimagelen;
-	uint8** subsampling_convert_ycbcrimage;
-	uint32 subsampling_convert_clinelenout;
-	uint32 subsampling_convert_state;
-	uint32 bytes_per_line;   /* if the codec outputs subsampled data, a 'line' in bytes_per_line */
-	uint32 lines_per_strile; /* and lines_per_strile means subsampling_ver desubsampled rows     */
-	OJPEGStateInBufferSource in_buffer_source;
-	uint32 in_buffer_next_strile;
-	uint32 in_buffer_strile_count;
-	uint64 in_buffer_file_pos;
-	uint8 in_buffer_file_pos_log;
-	uint64 in_buffer_file_togo;
-	uint16 in_buffer_togo;
-	uint8* in_buffer_cur;
-	uint8 in_buffer[OJPEG_BUFFER];
-	OJPEGStateOutState out_state;
-	uint8 out_buffer[OJPEG_BUFFER];
-	uint8* skip_buffer;
+typedef struct
+{
+    TIFF *tif;
+    int decoder_ok;
+    int error_in_raw_data_decoding;
+#ifndef LIBJPEG_ENCAP_EXTERNAL
+    JMP_BUF exit_jmpbuf;
+#endif
+    TIFFVGetMethod vgetparent;
+    TIFFVSetMethod vsetparent;
+    TIFFPrintMethod printdir;
+    uint64_t file_size;
+    uint32_t image_width;
+    uint32_t image_length;
+    uint32_t strile_width;
+    uint32_t strile_length;
+    uint32_t strile_length_total;
+    uint8_t samples_per_pixel;
+    uint8_t plane_sample_offset;
+    uint8_t samples_per_pixel_per_plane;
+    uint64_t jpeg_interchange_format;
+    uint64_t jpeg_interchange_format_length;
+    uint8_t jpeg_proc;
+    uint8_t subsamplingcorrect;
+    uint8_t subsamplingcorrect_done;
+    uint8_t subsampling_tag;
+    uint8_t subsampling_hor;
+    uint8_t subsampling_ver;
+    uint8_t subsampling_force_desubsampling_inside_decompression;
+    uint8_t qtable_offset_count;
+    uint8_t dctable_offset_count;
+    uint8_t actable_offset_count;
+    uint64_t qtable_offset[3];
+    uint64_t dctable_offset[3];
+    uint64_t actable_offset[3];
+    uint8_t *qtable[4];
+    uint8_t *dctable[4];
+    uint8_t *actable[4];
+    uint16_t restart_interval;
+    uint8_t restart_index;
+    uint8_t sof_log;
+    uint8_t sof_marker_id;
+    uint32_t sof_x;
+    uint32_t sof_y;
+    uint8_t sof_c[3];
+    uint8_t sof_hv[3];
+    uint8_t sof_tq[3];
+    uint8_t sos_cs[3];
+    uint8_t sos_tda[3];
+    struct
+    {
+        uint8_t log;
+        OJPEGStateInBufferSource in_buffer_source;
+        uint32_t in_buffer_next_strile;
+        uint64_t in_buffer_file_pos;
+        uint64_t in_buffer_file_togo;
+    } sos_end[3];
+    uint8_t readheader_done;
+    uint8_t writeheader_done;
+    uint16_t write_cursample;
+    uint32_t write_curstrile;
+    uint8_t libjpeg_session_active;
+    uint8_t libjpeg_jpeg_query_style;
+    jpeg_error_mgr libjpeg_jpeg_error_mgr;
+    jpeg_decompress_struct libjpeg_jpeg_decompress_struct;
+    jpeg_source_mgr libjpeg_jpeg_source_mgr;
+    uint8_t subsampling_convert_log;
+    uint32_t subsampling_convert_ylinelen;
+    uint32_t subsampling_convert_ylines;
+    uint32_t subsampling_convert_clinelen;
+    uint32_t subsampling_convert_clines;
+    uint32_t subsampling_convert_ybuflen;
+    uint32_t subsampling_convert_cbuflen;
+    uint32_t subsampling_convert_ycbcrbuflen;
+    uint8_t *subsampling_convert_ycbcrbuf;
+    uint8_t *subsampling_convert_ybuf;
+    uint8_t *subsampling_convert_cbbuf;
+    uint8_t *subsampling_convert_crbuf;
+    uint32_t subsampling_convert_ycbcrimagelen;
+    uint8_t **subsampling_convert_ycbcrimage;
+    uint32_t subsampling_convert_clinelenout;
+    uint32_t subsampling_convert_state;
+    uint32_t bytes_per_line;   /* if the codec outputs subsampled data, a 'line'
+                                  in bytes_per_line */
+    uint32_t lines_per_strile; /* and lines_per_strile means subsampling_ver
+                                  desubsampled rows     */
+    OJPEGStateInBufferSource in_buffer_source;
+    uint32_t in_buffer_next_strile;
+    uint32_t in_buffer_strile_count;
+    uint64_t in_buffer_file_pos;
+    uint8_t in_buffer_file_pos_log;
+    uint64_t in_buffer_file_togo;
+    uint16_t in_buffer_togo;
+    uint8_t *in_buffer_cur;
+    uint8_t in_buffer[OJPEG_BUFFER];
+    OJPEGStateOutState out_state;
+    uint8_t out_buffer[OJPEG_BUFFER];
+    uint8_t *skip_buffer;
 } OJPEGState;
 
-static int OJPEGVGetField(TIFF* tif, uint32 tag, va_list ap);
-static int OJPEGVSetField(TIFF* tif, uint32 tag, va_list ap);
-static void OJPEGPrintDir(TIFF* tif, FILE* fd, long flags);
-
-static int OJPEGFixupTags(TIFF* tif);
-static int OJPEGSetupDecode(TIFF* tif);
-static int OJPEGPreDecode(TIFF* tif, uint16 s);
-static int OJPEGPreDecodeSkipRaw(TIFF* tif);
-static int OJPEGPreDecodeSkipScanlines(TIFF* tif);
-static int OJPEGDecode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s);
-static int OJPEGDecodeRaw(TIFF* tif, uint8* buf, tmsize_t cc);
-static int OJPEGDecodeScanlines(TIFF* tif, uint8* buf, tmsize_t cc);
-static void OJPEGPostDecode(TIFF* tif, uint8* buf, tmsize_t cc);
-static int OJPEGSetupEncode(TIFF* tif);
-static int OJPEGPreEncode(TIFF* tif, uint16 s);
-static int OJPEGEncode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s);
-static int OJPEGPostEncode(TIFF* tif);
-static void OJPEGCleanup(TIFF* tif);
-
-static void OJPEGSubsamplingCorrect(TIFF* tif);
-static int OJPEGReadHeaderInfo(TIFF* tif);
-static int OJPEGReadSecondarySos(TIFF* tif, uint16 s);
-static int OJPEGWriteHeaderInfo(TIFF* tif);
-static void OJPEGLibjpegSessionAbort(TIFF* tif);
-
-static int OJPEGReadHeaderInfoSec(TIFF* tif);
-static int OJPEGReadHeaderInfoSecStreamDri(TIFF* tif);
-static int OJPEGReadHeaderInfoSecStreamDqt(TIFF* tif);
-static int OJPEGReadHeaderInfoSecStreamDht(TIFF* tif);
-static int OJPEGReadHeaderInfoSecStreamSof(TIFF* tif, uint8 marker_id);
-static int OJPEGReadHeaderInfoSecStreamSos(TIFF* tif);
-static int OJPEGReadHeaderInfoSecTablesQTable(TIFF* tif);
-static int OJPEGReadHeaderInfoSecTablesDcTable(TIFF* tif);
-static int OJPEGReadHeaderInfoSecTablesAcTable(TIFF* tif);
-
-static int OJPEGReadBufferFill(OJPEGState* sp);
-static int OJPEGReadByte(OJPEGState* sp, uint8* byte);
-static int OJPEGReadBytePeek(OJPEGState* sp, uint8* byte);
-static void OJPEGReadByteAdvance(OJPEGState* sp);
-static int OJPEGReadWord(OJPEGState* sp, uint16* word);
-static int OJPEGReadBlock(OJPEGState* sp, uint16 len, void* mem);
-static void OJPEGReadSkip(OJPEGState* sp, uint16 len);
-
-static int OJPEGWriteStream(TIFF* tif, void** mem, uint32* len);
-static void OJPEGWriteStreamSoi(TIFF* tif, void** mem, uint32* len);
-static void OJPEGWriteStreamQTable(TIFF* tif, uint8 table_index, void** mem, uint32* len);
-static void OJPEGWriteStreamDcTable(TIFF* tif, uint8 table_index, void** mem, uint32* len);
-static void OJPEGWriteStreamAcTable(TIFF* tif, uint8 table_index, void** mem, uint32* len);
-static void OJPEGWriteStreamDri(TIFF* tif, void** mem, uint32* len);
-static void OJPEGWriteStreamSof(TIFF* tif, void** mem, uint32* len);
-static void OJPEGWriteStreamSos(TIFF* tif, void** mem, uint32* len);
-static int OJPEGWriteStreamCompressed(TIFF* tif, void** mem, uint32* len);
-static void OJPEGWriteStreamRst(TIFF* tif, void** mem, uint32* len);
-static void OJPEGWriteStreamEoi(TIFF* tif, void** mem, uint32* len);
+static int OJPEGVGetField(TIFF *tif, uint32_t tag, va_list ap);
+static int OJPEGVSetField(TIFF *tif, uint32_t tag, va_list ap);
+static void OJPEGPrintDir(TIFF *tif, FILE *fd, long flags);
+
+static int OJPEGFixupTags(TIFF *tif);
+static int OJPEGSetupDecode(TIFF *tif);
+static int OJPEGPreDecode(TIFF *tif, uint16_t s);
+static int OJPEGPreDecodeSkipRaw(TIFF *tif);
+static int OJPEGPreDecodeSkipScanlines(TIFF *tif);
+static int OJPEGDecode(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s);
+static int OJPEGDecodeRaw(TIFF *tif, uint8_t *buf, tmsize_t cc);
+static int OJPEGDecodeScanlines(TIFF *tif, uint8_t *buf, tmsize_t cc);
+static void OJPEGPostDecode(TIFF *tif, uint8_t *buf, tmsize_t cc);
+static int OJPEGSetupEncode(TIFF *tif);
+static int OJPEGPreEncode(TIFF *tif, uint16_t s);
+static int OJPEGEncode(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s);
+static int OJPEGPostEncode(TIFF *tif);
+static void OJPEGCleanup(TIFF *tif);
+
+static void OJPEGSubsamplingCorrect(TIFF *tif);
+static int OJPEGReadHeaderInfo(TIFF *tif);
+static int OJPEGReadSecondarySos(TIFF *tif, uint16_t s);
+static int OJPEGWriteHeaderInfo(TIFF *tif);
+static void OJPEGLibjpegSessionAbort(TIFF *tif);
+
+static int OJPEGReadHeaderInfoSec(TIFF *tif);
+static int OJPEGReadHeaderInfoSecStreamDri(TIFF *tif);
+static int OJPEGReadHeaderInfoSecStreamDqt(TIFF *tif);
+static int OJPEGReadHeaderInfoSecStreamDht(TIFF *tif);
+static int OJPEGReadHeaderInfoSecStreamSof(TIFF *tif, uint8_t marker_id);
+static int OJPEGReadHeaderInfoSecStreamSos(TIFF *tif);
+static int OJPEGReadHeaderInfoSecTablesQTable(TIFF *tif);
+static int OJPEGReadHeaderInfoSecTablesDcTable(TIFF *tif);
+static int OJPEGReadHeaderInfoSecTablesAcTable(TIFF *tif);
+
+static int OJPEGReadBufferFill(OJPEGState *sp);
+static int OJPEGReadByte(OJPEGState *sp, uint8_t *byte);
+static int OJPEGReadBytePeek(OJPEGState *sp, uint8_t *byte);
+static void OJPEGReadByteAdvance(OJPEGState *sp);
+static int OJPEGReadWord(OJPEGState *sp, uint16_t *word);
+static int OJPEGReadBlock(OJPEGState *sp, uint16_t len, void *mem);
+static void OJPEGReadSkip(OJPEGState *sp, uint16_t len);
+
+static int OJPEGWriteStream(TIFF *tif, void **mem, uint32_t *len);
+static void OJPEGWriteStreamSoi(TIFF *tif, void **mem, uint32_t *len);
+static void OJPEGWriteStreamQTable(TIFF *tif, uint8_t table_index, void **mem,
+                                   uint32_t *len);
+static void OJPEGWriteStreamDcTable(TIFF *tif, uint8_t table_index, void **mem,
+                                    uint32_t *len);
+static void OJPEGWriteStreamAcTable(TIFF *tif, uint8_t table_index, void **mem,
+                                    uint32_t *len);
+static void OJPEGWriteStreamDri(TIFF *tif, void **mem, uint32_t *len);
+static void OJPEGWriteStreamSof(TIFF *tif, void **mem, uint32_t *len);
+static void OJPEGWriteStreamSos(TIFF *tif, void **mem, uint32_t *len);
+static int OJPEGWriteStreamCompressed(TIFF *tif, void **mem, uint32_t *len);
+static void OJPEGWriteStreamRst(TIFF *tif, void **mem, uint32_t *len);
+static void OJPEGWriteStreamEoi(TIFF *tif, void **mem, uint32_t *len);
 
 #ifdef LIBJPEG_ENCAP_EXTERNAL
-extern int jpeg_create_decompress_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo);
-extern int jpeg_read_header_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo, uint8 require_image);
-extern int jpeg_start_decompress_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo);
-extern int jpeg_read_scanlines_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo, void* scanlines, uint32 max_lines);
-extern int jpeg_read_raw_data_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo, void* data, uint32 max_lines);
-extern void jpeg_encap_unwind(TIFF* tif);
+extern int jpeg_create_decompress_encap(OJPEGState *sp,
+                                        jpeg_decompress_struct *cinfo);
+extern int jpeg_read_header_encap(OJPEGState *sp, jpeg_decompress_struct *cinfo,
+                                  uint8_t require_image);
+extern int jpeg_start_decompress_encap(OJPEGState *sp,
+                                       jpeg_decompress_struct *cinfo);
+extern int jpeg_read_scanlines_encap(OJPEGState *sp,
+                                     jpeg_decompress_struct *cinfo,
+                                     void *scanlines, uint32_t max_lines);
+extern int jpeg_read_raw_data_encap(OJPEGState *sp,
+                                    jpeg_decompress_struct *cinfo, void *data,
+                                    uint32_t max_lines);
+extern void jpeg_encap_unwind(TIFF *tif);
 #else
-static int jpeg_create_decompress_encap(OJPEGState* sp, jpeg_decompress_struct* j);
-static int jpeg_read_header_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo, uint8 require_image);
-static int jpeg_start_decompress_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo);
-static int jpeg_read_scanlines_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo, void* scanlines, uint32 max_lines);
-static int jpeg_read_raw_data_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo, void* data, uint32 max_lines);
-static void jpeg_encap_unwind(TIFF* tif);
+static int jpeg_create_decompress_encap(OJPEGState *sp,
+                                        jpeg_decompress_struct *j);
+static int jpeg_read_header_encap(OJPEGState *sp, jpeg_decompress_struct *cinfo,
+                                  uint8_t require_image);
+static int jpeg_start_decompress_encap(OJPEGState *sp,
+                                       jpeg_decompress_struct *cinfo);
+static int jpeg_read_scanlines_encap(OJPEGState *sp,
+                                     jpeg_decompress_struct *cinfo,
+                                     void *scanlines, uint32_t max_lines);
+static int jpeg_read_raw_data_encap(OJPEGState *sp,
+                                    jpeg_decompress_struct *cinfo, void *data,
+                                    uint32_t max_lines);
+static void jpeg_encap_unwind(TIFF *tif);
 #endif
 
-static void OJPEGLibjpegJpegErrorMgrOutputMessage(jpeg_common_struct* cinfo);
-static void OJPEGLibjpegJpegErrorMgrErrorExit(jpeg_common_struct* cinfo);
-static void OJPEGLibjpegJpegSourceMgrInitSource(jpeg_decompress_struct* cinfo);
-static boolean OJPEGLibjpegJpegSourceMgrFillInputBuffer(jpeg_decompress_struct* cinfo);
-static void OJPEGLibjpegJpegSourceMgrSkipInputData(jpeg_decompress_struct* cinfo, long num_bytes);
-static boolean OJPEGLibjpegJpegSourceMgrResyncToRestart(jpeg_decompress_struct* cinfo, int desired);
-static void OJPEGLibjpegJpegSourceMgrTermSource(jpeg_decompress_struct* cinfo);
-
-int
-TIFFInitOJPEG(TIFF* tif, int scheme)
-{
-	static const char module[]="TIFFInitOJPEG";
-	OJPEGState* sp;
-
-        (void)scheme;
-	assert(scheme==COMPRESSION_OJPEG);
-
-        /*
-	 * Merge codec-specific tag information.
-	 */
-	if (!_TIFFMergeFields(tif, ojpegFields, TIFFArrayCount(ojpegFields))) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Merging Old JPEG codec-specific tags failed");
-		return 0;
-	}
-
-	/* state block */
-	sp=_TIFFmalloc(sizeof(OJPEGState));
-	if (sp==NULL)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"No space for OJPEG state block");
-		return(0);
-	}
-	_TIFFmemset(sp,0,sizeof(OJPEGState));
-	sp->tif=tif;
-	sp->jpeg_proc=1;
-	sp->subsampling_hor=2;
-	sp->subsampling_ver=2;
-	TIFFSetField(tif,TIFFTAG_YCBCRSUBSAMPLING,2,2);
-	/* tif codec methods */
-	tif->tif_fixuptags=OJPEGFixupTags;  
-	tif->tif_setupdecode=OJPEGSetupDecode;
-	tif->tif_predecode=OJPEGPreDecode;
-	tif->tif_postdecode=OJPEGPostDecode;  
-	tif->tif_decoderow=OJPEGDecode;  
-	tif->tif_decodestrip=OJPEGDecode;  
-	tif->tif_decodetile=OJPEGDecode;  
-	tif->tif_setupencode=OJPEGSetupEncode;
-	tif->tif_preencode=OJPEGPreEncode;
-	tif->tif_postencode=OJPEGPostEncode;
-	tif->tif_encoderow=OJPEGEncode;  
-	tif->tif_encodestrip=OJPEGEncode;  
-	tif->tif_encodetile=OJPEGEncode;  
-	tif->tif_cleanup=OJPEGCleanup;
-	tif->tif_data=(uint8*)sp;
-	/* tif tag methods */
-	sp->vgetparent=tif->tif_tagmethods.vgetfield;
-	tif->tif_tagmethods.vgetfield=OJPEGVGetField;
-	sp->vsetparent=tif->tif_tagmethods.vsetfield;
-	tif->tif_tagmethods.vsetfield=OJPEGVSetField;
-	sp->printdir=tif->tif_tagmethods.printdir;
-	tif->tif_tagmethods.printdir=OJPEGPrintDir;
-	/* Some OJPEG files don't have strip or tile offsets or bytecounts tags.
-	   Some others do, but have totally meaningless or corrupt values
-	   in these tags. In these cases, the JpegInterchangeFormat stream is
-	   reliable. In any case, this decoder reads the compressed data itself,
-	   from the most reliable locations, and we need to notify encapsulating
-	   LibTiff not to read raw strips or tiles for us. */
-	tif->tif_flags|=TIFF_NOREADRAW;
-	return(1);
+static void OJPEGLibjpegJpegErrorMgrOutputMessage(jpeg_common_struct *cinfo);
+static void OJPEGLibjpegJpegErrorMgrErrorExit(jpeg_common_struct *cinfo);
+static void OJPEGLibjpegJpegSourceMgrInitSource(jpeg_decompress_struct *cinfo);
+static boolean
+OJPEGLibjpegJpegSourceMgrFillInputBuffer(jpeg_decompress_struct *cinfo);
+static void
+OJPEGLibjpegJpegSourceMgrSkipInputData(jpeg_decompress_struct *cinfo,
+                                       long num_bytes);
+static boolean
+OJPEGLibjpegJpegSourceMgrResyncToRestart(jpeg_decompress_struct *cinfo,
+                                         int desired);
+static void OJPEGLibjpegJpegSourceMgrTermSource(jpeg_decompress_struct *cinfo);
+
+int TIFFInitOJPEG(TIFF *tif, int scheme)
+{
+    static const char module[] = "TIFFInitOJPEG";
+    OJPEGState *sp;
+
+    (void)scheme;
+    assert(scheme == COMPRESSION_OJPEG);
+
+    /*
+     * Merge codec-specific tag information.
+     */
+    if (!_TIFFMergeFields(tif, ojpegFields, TIFFArrayCount(ojpegFields)))
+    {
+        TIFFErrorExtR(tif, module,
+                      "Merging Old JPEG codec-specific tags failed");
+        return 0;
+    }
+
+    /* state block */
+    sp = _TIFFmallocExt(tif, sizeof(OJPEGState));
+    if (sp == NULL)
+    {
+        TIFFErrorExtR(tif, module, "No space for OJPEG state block");
+        return (0);
+    }
+    _TIFFmemset(sp, 0, sizeof(OJPEGState));
+    sp->tif = tif;
+    sp->jpeg_proc = 1;
+    sp->subsampling_hor = 2;
+    sp->subsampling_ver = 2;
+    TIFFSetField(tif, TIFFTAG_YCBCRSUBSAMPLING, 2, 2);
+    /* tif codec methods */
+    tif->tif_fixuptags = OJPEGFixupTags;
+    tif->tif_setupdecode = OJPEGSetupDecode;
+    tif->tif_predecode = OJPEGPreDecode;
+    tif->tif_postdecode = OJPEGPostDecode;
+    tif->tif_decoderow = OJPEGDecode;
+    tif->tif_decodestrip = OJPEGDecode;
+    tif->tif_decodetile = OJPEGDecode;
+    tif->tif_setupencode = OJPEGSetupEncode;
+    tif->tif_preencode = OJPEGPreEncode;
+    tif->tif_postencode = OJPEGPostEncode;
+    tif->tif_encoderow = OJPEGEncode;
+    tif->tif_encodestrip = OJPEGEncode;
+    tif->tif_encodetile = OJPEGEncode;
+    tif->tif_cleanup = OJPEGCleanup;
+    tif->tif_data = (uint8_t *)sp;
+    /* tif tag methods */
+    sp->vgetparent = tif->tif_tagmethods.vgetfield;
+    tif->tif_tagmethods.vgetfield = OJPEGVGetField;
+    sp->vsetparent = tif->tif_tagmethods.vsetfield;
+    tif->tif_tagmethods.vsetfield = OJPEGVSetField;
+    sp->printdir = tif->tif_tagmethods.printdir;
+    tif->tif_tagmethods.printdir = OJPEGPrintDir;
+    /* Some OJPEG files don't have strip or tile offsets or bytecounts tags.
+       Some others do, but have totally meaningless or corrupt values
+       in these tags. In these cases, the JpegInterchangeFormat stream is
+       reliable. In any case, this decoder reads the compressed data itself,
+       from the most reliable locations, and we need to notify encapsulating
+       LibTiff not to read raw strips or tiles for us. */
+    tif->tif_flags |= TIFF_NOREADRAW;
+    return (1);
 }
 
-static int
-OJPEGVGetField(TIFF* tif, uint32 tag, va_list ap)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	switch(tag)
-	{
-		case TIFFTAG_JPEGIFOFFSET:
-			*va_arg(ap,uint64*)=(uint64)sp->jpeg_interchange_format;
-			break;
-		case TIFFTAG_JPEGIFBYTECOUNT:
-			*va_arg(ap,uint64*)=(uint64)sp->jpeg_interchange_format_length;
-			break;
-		case TIFFTAG_YCBCRSUBSAMPLING:
-			if (sp->subsamplingcorrect_done==0)
-				OJPEGSubsamplingCorrect(tif);
-			*va_arg(ap,uint16*)=(uint16)sp->subsampling_hor;
-			*va_arg(ap,uint16*)=(uint16)sp->subsampling_ver;
-			break;
-		case TIFFTAG_JPEGQTABLES:
-			*va_arg(ap,uint32*)=(uint32)sp->qtable_offset_count;
-			*va_arg(ap,const void**)=(const void*)sp->qtable_offset;
-			break;
-		case TIFFTAG_JPEGDCTABLES:
-			*va_arg(ap,uint32*)=(uint32)sp->dctable_offset_count;
-			*va_arg(ap,const void**)=(const void*)sp->dctable_offset;
-			break;
-		case TIFFTAG_JPEGACTABLES:
-			*va_arg(ap,uint32*)=(uint32)sp->actable_offset_count;
-			*va_arg(ap,const void**)=(const void*)sp->actable_offset;
-			break;
-		case TIFFTAG_JPEGPROC:
-			*va_arg(ap,uint16*)=(uint16)sp->jpeg_proc;
-			break;
-		case TIFFTAG_JPEGRESTARTINTERVAL:
-			*va_arg(ap,uint16*)=sp->restart_interval;
-			break;
-		default:
-			return (*sp->vgetparent)(tif,tag,ap);
-	}
-	return (1);
+static int OJPEGVGetField(TIFF *tif, uint32_t tag, va_list ap)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    switch (tag)
+    {
+        case TIFFTAG_JPEGIFOFFSET:
+            *va_arg(ap, uint64_t *) = (uint64_t)sp->jpeg_interchange_format;
+            break;
+        case TIFFTAG_JPEGIFBYTECOUNT:
+            *va_arg(ap, uint64_t *) =
+                (uint64_t)sp->jpeg_interchange_format_length;
+            break;
+        case TIFFTAG_YCBCRSUBSAMPLING:
+            if (sp->subsamplingcorrect_done == 0)
+                OJPEGSubsamplingCorrect(tif);
+            *va_arg(ap, uint16_t *) = (uint16_t)sp->subsampling_hor;
+            *va_arg(ap, uint16_t *) = (uint16_t)sp->subsampling_ver;
+            break;
+        case TIFFTAG_JPEGQTABLES:
+            *va_arg(ap, uint32_t *) = (uint32_t)sp->qtable_offset_count;
+            *va_arg(ap, const void **) = (const void *)sp->qtable_offset;
+            break;
+        case TIFFTAG_JPEGDCTABLES:
+            *va_arg(ap, uint32_t *) = (uint32_t)sp->dctable_offset_count;
+            *va_arg(ap, const void **) = (const void *)sp->dctable_offset;
+            break;
+        case TIFFTAG_JPEGACTABLES:
+            *va_arg(ap, uint32_t *) = (uint32_t)sp->actable_offset_count;
+            *va_arg(ap, const void **) = (const void *)sp->actable_offset;
+            break;
+        case TIFFTAG_JPEGPROC:
+            *va_arg(ap, uint16_t *) = (uint16_t)sp->jpeg_proc;
+            break;
+        case TIFFTAG_JPEGRESTARTINTERVAL:
+            *va_arg(ap, uint16_t *) = sp->restart_interval;
+            break;
+        default:
+            return (*sp->vgetparent)(tif, tag, ap);
+    }
+    return (1);
 }
 
-static int
-OJPEGVSetField(TIFF* tif, uint32 tag, va_list ap)
-{
-	static const char module[]="OJPEGVSetField";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint32 ma;
-	uint64* mb;
-	uint32 n;
-	const TIFFField* fip;
-
-	switch(tag)
-	{
-		case TIFFTAG_JPEGIFOFFSET:
-			sp->jpeg_interchange_format=(uint64)va_arg(ap,uint64);
-			break;
-		case TIFFTAG_JPEGIFBYTECOUNT:
-			sp->jpeg_interchange_format_length=(uint64)va_arg(ap,uint64);
-			break;
-		case TIFFTAG_YCBCRSUBSAMPLING:
-			sp->subsampling_tag=1;
-			sp->subsampling_hor=(uint8)va_arg(ap,uint16_vap);
-			sp->subsampling_ver=(uint8)va_arg(ap,uint16_vap);
-			tif->tif_dir.td_ycbcrsubsampling[0]=sp->subsampling_hor;
-			tif->tif_dir.td_ycbcrsubsampling[1]=sp->subsampling_ver;
-			break;
-		case TIFFTAG_JPEGQTABLES:
-			ma=(uint32)va_arg(ap,uint32);
-			if (ma!=0)
-			{
-				if (ma>3)
-				{
-					TIFFErrorExt(tif->tif_clientdata,module,"JpegQTables tag has incorrect count");
-					return(0);
-				}
-				sp->qtable_offset_count=(uint8)ma;
-				mb=(uint64*)va_arg(ap,uint64*);
-				for (n=0; n<ma; n++)
-					sp->qtable_offset[n]=mb[n];
-			}
-			break;
-		case TIFFTAG_JPEGDCTABLES:
-			ma=(uint32)va_arg(ap,uint32);
-			if (ma!=0)
-			{
-				if (ma>3)
-				{
-					TIFFErrorExt(tif->tif_clientdata,module,"JpegDcTables tag has incorrect count");
-					return(0);
-				}
-				sp->dctable_offset_count=(uint8)ma;
-				mb=(uint64*)va_arg(ap,uint64*);
-				for (n=0; n<ma; n++)
-					sp->dctable_offset[n]=mb[n];
-			}
-			break;
-		case TIFFTAG_JPEGACTABLES:
-			ma=(uint32)va_arg(ap,uint32);
-			if (ma!=0)
-			{
-				if (ma>3)
-				{
-					TIFFErrorExt(tif->tif_clientdata,module,"JpegAcTables tag has incorrect count");
-					return(0);
-				}
-				sp->actable_offset_count=(uint8)ma;
-				mb=(uint64*)va_arg(ap,uint64*);
-				for (n=0; n<ma; n++)
-					sp->actable_offset[n]=mb[n];
-			}
-			break;
-		case TIFFTAG_JPEGPROC:
-			sp->jpeg_proc=(uint8)va_arg(ap,uint16_vap);
-			break;
-		case TIFFTAG_JPEGRESTARTINTERVAL:
-			sp->restart_interval=(uint16)va_arg(ap,uint16_vap);
-			break;
-		default:
-			return (*sp->vsetparent)(tif,tag,ap);
-	}
-	fip = TIFFFieldWithTag(tif,tag);
-	if( fip == NULL ) /* shouldn't happen */
-	    return(0);
-	TIFFSetFieldBit(tif,fip->field_bit);
-	tif->tif_flags|=TIFF_DIRTYDIRECT;
-	return(1);
+static int OJPEGVSetField(TIFF *tif, uint32_t tag, va_list ap)
+{
+    static const char module[] = "OJPEGVSetField";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint32_t ma;
+    uint64_t *mb;
+    uint32_t n;
+    const TIFFField *fip;
+
+    switch (tag)
+    {
+        case TIFFTAG_JPEGIFOFFSET:
+            sp->jpeg_interchange_format = (uint64_t)va_arg(ap, uint64_t);
+            break;
+        case TIFFTAG_JPEGIFBYTECOUNT:
+            sp->jpeg_interchange_format_length = (uint64_t)va_arg(ap, uint64_t);
+            break;
+        case TIFFTAG_YCBCRSUBSAMPLING:
+            sp->subsampling_tag = 1;
+            sp->subsampling_hor = (uint8_t)va_arg(ap, uint16_vap);
+            sp->subsampling_ver = (uint8_t)va_arg(ap, uint16_vap);
+            tif->tif_dir.td_ycbcrsubsampling[0] = sp->subsampling_hor;
+            tif->tif_dir.td_ycbcrsubsampling[1] = sp->subsampling_ver;
+            break;
+        case TIFFTAG_JPEGQTABLES:
+            ma = (uint32_t)va_arg(ap, uint32_t);
+            if (ma != 0)
+            {
+                if (ma > 3)
+                {
+                    TIFFErrorExtR(tif, module,
+                                  "JpegQTables tag has incorrect count");
+                    return (0);
+                }
+                sp->qtable_offset_count = (uint8_t)ma;
+                mb = (uint64_t *)va_arg(ap, uint64_t *);
+                for (n = 0; n < ma; n++)
+                    sp->qtable_offset[n] = mb[n];
+            }
+            break;
+        case TIFFTAG_JPEGDCTABLES:
+            ma = (uint32_t)va_arg(ap, uint32_t);
+            if (ma != 0)
+            {
+                if (ma > 3)
+                {
+                    TIFFErrorExtR(tif, module,
+                                  "JpegDcTables tag has incorrect count");
+                    return (0);
+                }
+                sp->dctable_offset_count = (uint8_t)ma;
+                mb = (uint64_t *)va_arg(ap, uint64_t *);
+                for (n = 0; n < ma; n++)
+                    sp->dctable_offset[n] = mb[n];
+            }
+            break;
+        case TIFFTAG_JPEGACTABLES:
+            ma = (uint32_t)va_arg(ap, uint32_t);
+            if (ma != 0)
+            {
+                if (ma > 3)
+                {
+                    TIFFErrorExtR(tif, module,
+                                  "JpegAcTables tag has incorrect count");
+                    return (0);
+                }
+                sp->actable_offset_count = (uint8_t)ma;
+                mb = (uint64_t *)va_arg(ap, uint64_t *);
+                for (n = 0; n < ma; n++)
+                    sp->actable_offset[n] = mb[n];
+            }
+            break;
+        case TIFFTAG_JPEGPROC:
+            sp->jpeg_proc = (uint8_t)va_arg(ap, uint16_vap);
+            break;
+        case TIFFTAG_JPEGRESTARTINTERVAL:
+            sp->restart_interval = (uint16_t)va_arg(ap, uint16_vap);
+            break;
+        default:
+            return (*sp->vsetparent)(tif, tag, ap);
+    }
+    fip = TIFFFieldWithTag(tif, tag);
+    if (fip == NULL) /* shouldn't happen */
+        return (0);
+    TIFFSetFieldBit(tif, fip->field_bit);
+    tif->tif_flags |= TIFF_DIRTYDIRECT;
+    return (1);
 }
 
-static void
-OJPEGPrintDir(TIFF* tif, FILE* fd, long flags)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint8 m;
-	(void)flags;
-	assert(sp!=NULL);
-	if (TIFFFieldSet(tif,FIELD_OJPEG_JPEGINTERCHANGEFORMAT))
-		fprintf(fd,"  JpegInterchangeFormat: " TIFF_UINT64_FORMAT "\n",(TIFF_UINT64_T)sp->jpeg_interchange_format);  
-	if (TIFFFieldSet(tif,FIELD_OJPEG_JPEGINTERCHANGEFORMATLENGTH))
-		fprintf(fd,"  JpegInterchangeFormatLength: " TIFF_UINT64_FORMAT "\n",(TIFF_UINT64_T)sp->jpeg_interchange_format_length);  
-	if (TIFFFieldSet(tif,FIELD_OJPEG_JPEGQTABLES))
-	{
-		fprintf(fd,"  JpegQTables:");
-		for (m=0; m<sp->qtable_offset_count; m++)
-			fprintf(fd," " TIFF_UINT64_FORMAT,(TIFF_UINT64_T)sp->qtable_offset[m]);
-		fprintf(fd,"\n");
-	}
-	if (TIFFFieldSet(tif,FIELD_OJPEG_JPEGDCTABLES))
-	{
-		fprintf(fd,"  JpegDcTables:");
-		for (m=0; m<sp->dctable_offset_count; m++)
-			fprintf(fd," " TIFF_UINT64_FORMAT,(TIFF_UINT64_T)sp->dctable_offset[m]);
-		fprintf(fd,"\n");
-	}
-	if (TIFFFieldSet(tif,FIELD_OJPEG_JPEGACTABLES))
-	{
-		fprintf(fd,"  JpegAcTables:");
-		for (m=0; m<sp->actable_offset_count; m++)
-			fprintf(fd," " TIFF_UINT64_FORMAT,(TIFF_UINT64_T)sp->actable_offset[m]);
-		fprintf(fd,"\n");
-	}
-	if (TIFFFieldSet(tif,FIELD_OJPEG_JPEGPROC))
-		fprintf(fd,"  JpegProc: %u\n",(unsigned int)sp->jpeg_proc);
-	if (TIFFFieldSet(tif,FIELD_OJPEG_JPEGRESTARTINTERVAL))
-		fprintf(fd,"  JpegRestartInterval: %u\n",(unsigned int)sp->restart_interval);
-	if (sp->printdir)
-		(*sp->printdir)(tif, fd, flags);
+static void OJPEGPrintDir(TIFF *tif, FILE *fd, long flags)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint8_t m;
+    (void)flags;
+    assert(sp != NULL);
+    if (TIFFFieldSet(tif, FIELD_OJPEG_JPEGINTERCHANGEFORMAT))
+        fprintf(fd, "  JpegInterchangeFormat: %" PRIu64 "\n",
+                (uint64_t)sp->jpeg_interchange_format);
+    if (TIFFFieldSet(tif, FIELD_OJPEG_JPEGINTERCHANGEFORMATLENGTH))
+        fprintf(fd, "  JpegInterchangeFormatLength: %" PRIu64 "\n",
+                (uint64_t)sp->jpeg_interchange_format_length);
+    if (TIFFFieldSet(tif, FIELD_OJPEG_JPEGQTABLES))
+    {
+        fprintf(fd, "  JpegQTables:");
+        for (m = 0; m < sp->qtable_offset_count; m++)
+            fprintf(fd, " %" PRIu64, (uint64_t)sp->qtable_offset[m]);
+        fprintf(fd, "\n");
+    }
+    if (TIFFFieldSet(tif, FIELD_OJPEG_JPEGDCTABLES))
+    {
+        fprintf(fd, "  JpegDcTables:");
+        for (m = 0; m < sp->dctable_offset_count; m++)
+            fprintf(fd, " %" PRIu64, (uint64_t)sp->dctable_offset[m]);
+        fprintf(fd, "\n");
+    }
+    if (TIFFFieldSet(tif, FIELD_OJPEG_JPEGACTABLES))
+    {
+        fprintf(fd, "  JpegAcTables:");
+        for (m = 0; m < sp->actable_offset_count; m++)
+            fprintf(fd, " %" PRIu64, (uint64_t)sp->actable_offset[m]);
+        fprintf(fd, "\n");
+    }
+    if (TIFFFieldSet(tif, FIELD_OJPEG_JPEGPROC))
+        fprintf(fd, "  JpegProc: %" PRIu8 "\n", sp->jpeg_proc);
+    if (TIFFFieldSet(tif, FIELD_OJPEG_JPEGRESTARTINTERVAL))
+        fprintf(fd, "  JpegRestartInterval: %" PRIu16 "\n",
+                sp->restart_interval);
+    if (sp->printdir)
+        (*sp->printdir)(tif, fd, flags);
+}
+
+static int OJPEGFixupTags(TIFF *tif)
+{
+    (void)tif;
+    return (1);
 }
 
-static int
-OJPEGFixupTags(TIFF* tif)
+static int OJPEGSetupDecode(TIFF *tif)
 {
-	(void) tif;
-	return(1);
+    static const char module[] = "OJPEGSetupDecode";
+    TIFFWarningExtR(tif, module,
+                    "Deprecated and troublesome old-style JPEG compression "
+                    "mode, please convert to new-style JPEG compression and "
+                    "notify vendor of writing software");
+    return (1);
 }
 
-static int
-OJPEGSetupDecode(TIFF* tif)
+static int OJPEGPreDecode(TIFF *tif, uint16_t s)
 {
-	static const char module[]="OJPEGSetupDecode";
-	TIFFWarningExt(tif->tif_clientdata,module,"Deprecated and troublesome old-style JPEG compression mode, please convert to new-style JPEG compression and notify vendor of writing software");
-	return(1);
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint32_t m;
+    if (sp->subsamplingcorrect_done == 0)
+        OJPEGSubsamplingCorrect(tif);
+    if (sp->readheader_done == 0)
+    {
+        if (OJPEGReadHeaderInfo(tif) == 0)
+            return (0);
+    }
+    if (sp->sos_end[s].log == 0)
+    {
+        if (OJPEGReadSecondarySos(tif, s) == 0)
+            return (0);
+    }
+    if (isTiled(tif))
+        m = tif->tif_curtile;
+    else
+        m = tif->tif_curstrip;
+    if ((sp->writeheader_done != 0) &&
+        ((sp->write_cursample != s) || (sp->write_curstrile > m)))
+    {
+        if (sp->libjpeg_session_active != 0)
+            OJPEGLibjpegSessionAbort(tif);
+        sp->writeheader_done = 0;
+    }
+    if (sp->writeheader_done == 0)
+    {
+        sp->plane_sample_offset = (uint8_t)s;
+        sp->write_cursample = s;
+        sp->write_curstrile = s * tif->tif_dir.td_stripsperimage;
+        if ((sp->in_buffer_file_pos_log == 0) ||
+            (sp->in_buffer_file_pos - sp->in_buffer_togo !=
+             sp->sos_end[s].in_buffer_file_pos))
+        {
+            sp->in_buffer_source = sp->sos_end[s].in_buffer_source;
+            sp->in_buffer_next_strile = sp->sos_end[s].in_buffer_next_strile;
+            sp->in_buffer_file_pos = sp->sos_end[s].in_buffer_file_pos;
+            sp->in_buffer_file_pos_log = 0;
+            sp->in_buffer_file_togo = sp->sos_end[s].in_buffer_file_togo;
+            sp->in_buffer_togo = 0;
+            sp->in_buffer_cur = 0;
+        }
+        if (OJPEGWriteHeaderInfo(tif) == 0)
+            return (0);
+    }
+    while (sp->write_curstrile < m)
+    {
+        if (sp->libjpeg_jpeg_query_style == 0)
+        {
+            if (OJPEGPreDecodeSkipRaw(tif) == 0)
+                return (0);
+        }
+        else
+        {
+            if (OJPEGPreDecodeSkipScanlines(tif) == 0)
+                return (0);
+        }
+        sp->write_curstrile++;
+    }
+    sp->decoder_ok = 1;
+    return (1);
 }
 
-static int
-OJPEGPreDecode(TIFF* tif, uint16 s)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint32 m;
-	if (sp->subsamplingcorrect_done==0)
-		OJPEGSubsamplingCorrect(tif);
-	if (sp->readheader_done==0)
-	{
-		if (OJPEGReadHeaderInfo(tif)==0)
-			return(0);
-	}
-	if (sp->sos_end[s].log==0)
-	{
-		if (OJPEGReadSecondarySos(tif,s)==0)
-			return(0);
-	}
-	if (isTiled(tif))
-		m=tif->tif_curtile;
-	else
-		m=tif->tif_curstrip;
-	if ((sp->writeheader_done!=0) && ((sp->write_cursample!=s) || (sp->write_curstrile>m)))
-	{
-		if (sp->libjpeg_session_active!=0)
-			OJPEGLibjpegSessionAbort(tif);
-		sp->writeheader_done=0;
-	}
-	if (sp->writeheader_done==0)
-	{
-		sp->plane_sample_offset=(uint8)s;
-		sp->write_cursample=s;
-		sp->write_curstrile=s*tif->tif_dir.td_stripsperimage;
-		if ((sp->in_buffer_file_pos_log==0) ||
-		    (sp->in_buffer_file_pos-sp->in_buffer_togo!=sp->sos_end[s].in_buffer_file_pos))
-		{
-			sp->in_buffer_source=sp->sos_end[s].in_buffer_source;
-			sp->in_buffer_next_strile=sp->sos_end[s].in_buffer_next_strile;
-			sp->in_buffer_file_pos=sp->sos_end[s].in_buffer_file_pos;
-			sp->in_buffer_file_pos_log=0;
-			sp->in_buffer_file_togo=sp->sos_end[s].in_buffer_file_togo;
-			sp->in_buffer_togo=0;
-			sp->in_buffer_cur=0;
-		}
-		if (OJPEGWriteHeaderInfo(tif)==0)
-			return(0);
-	}
-	while (sp->write_curstrile<m)          
-	{
-		if (sp->libjpeg_jpeg_query_style==0)
-		{
-			if (OJPEGPreDecodeSkipRaw(tif)==0)
-				return(0);
-		}
-		else
-		{
-			if (OJPEGPreDecodeSkipScanlines(tif)==0)
-				return(0);
-		}
-		sp->write_curstrile++;
-	}
-	sp->decoder_ok = 1;
-	return(1);
+static int OJPEGPreDecodeSkipRaw(TIFF *tif)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint32_t m;
+    m = sp->lines_per_strile;
+    if (sp->subsampling_convert_state != 0)
+    {
+        if (sp->subsampling_convert_clines - sp->subsampling_convert_state >= m)
+        {
+            sp->subsampling_convert_state += m;
+            if (sp->subsampling_convert_state == sp->subsampling_convert_clines)
+                sp->subsampling_convert_state = 0;
+            return (1);
+        }
+        m -= sp->subsampling_convert_clines - sp->subsampling_convert_state;
+        sp->subsampling_convert_state = 0;
+        sp->error_in_raw_data_decoding = 0;
+    }
+    while (m >= sp->subsampling_convert_clines)
+    {
+        if (jpeg_read_raw_data_encap(sp, &(sp->libjpeg_jpeg_decompress_struct),
+                                     sp->subsampling_convert_ycbcrimage,
+                                     sp->subsampling_ver * 8) == 0)
+            return (0);
+        m -= sp->subsampling_convert_clines;
+    }
+    if (m > 0)
+    {
+        if (jpeg_read_raw_data_encap(sp, &(sp->libjpeg_jpeg_decompress_struct),
+                                     sp->subsampling_convert_ycbcrimage,
+                                     sp->subsampling_ver * 8) == 0)
+            return (0);
+        sp->subsampling_convert_state = m;
+    }
+    return (1);
 }
 
-static int
-OJPEGPreDecodeSkipRaw(TIFF* tif)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint32 m;
-	m=sp->lines_per_strile;
-	if (sp->subsampling_convert_state!=0)
-	{
-		if (sp->subsampling_convert_clines-sp->subsampling_convert_state>=m)
-		{
-			sp->subsampling_convert_state+=m;
-			if (sp->subsampling_convert_state==sp->subsampling_convert_clines)
-				sp->subsampling_convert_state=0;
-			return(1);
-		}
-		m-=sp->subsampling_convert_clines-sp->subsampling_convert_state;
-		sp->subsampling_convert_state=0;
-                sp->error_in_raw_data_decoding=0;
-	}
-	while (m>=sp->subsampling_convert_clines)
-	{
-		if (jpeg_read_raw_data_encap(sp,&(sp->libjpeg_jpeg_decompress_struct),sp->subsampling_convert_ycbcrimage,sp->subsampling_ver*8)==0)
-			return(0);
-		m-=sp->subsampling_convert_clines;
-	}
-	if (m>0)
-	{
-		if (jpeg_read_raw_data_encap(sp,&(sp->libjpeg_jpeg_decompress_struct),sp->subsampling_convert_ycbcrimage,sp->subsampling_ver*8)==0)
-			return(0);
-		sp->subsampling_convert_state=m;
-	}
-	return(1);
+static int OJPEGPreDecodeSkipScanlines(TIFF *tif)
+{
+    static const char module[] = "OJPEGPreDecodeSkipScanlines";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint32_t m;
+    if (sp->skip_buffer == NULL)
+    {
+        sp->skip_buffer = _TIFFmallocExt(tif, sp->bytes_per_line);
+        if (sp->skip_buffer == NULL)
+        {
+            TIFFErrorExtR(tif, module, "Out of memory");
+            return (0);
+        }
+    }
+    for (m = 0; m < sp->lines_per_strile; m++)
+    {
+        if (jpeg_read_scanlines_encap(sp, &(sp->libjpeg_jpeg_decompress_struct),
+                                      &sp->skip_buffer, 1) == 0)
+            return (0);
+    }
+    return (1);
 }
 
-static int
-OJPEGPreDecodeSkipScanlines(TIFF* tif)
-{
-	static const char module[]="OJPEGPreDecodeSkipScanlines";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint32 m;
-	if (sp->skip_buffer==NULL)
-	{
-		sp->skip_buffer=_TIFFmalloc(sp->bytes_per_line);
-		if (sp->skip_buffer==NULL)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-			return(0);
-		}
-	}
-	for (m=0; m<sp->lines_per_strile; m++)
-	{
-		if (jpeg_read_scanlines_encap(sp,&(sp->libjpeg_jpeg_decompress_struct),&sp->skip_buffer,1)==0)
-			return(0);
-	}
-	return(1);
+static int OJPEGDecode(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s)
+{
+    static const char module[] = "OJPEGDecode";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    (void)s;
+    if (!sp->decoder_ok)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Cannot decode: decoder not correctly initialized");
+        return 0;
+    }
+    if (sp->libjpeg_session_active == 0)
+    {
+        /* This should normally not happen, except that it does when */
+        /* using TIFFReadScanline() which calls OJPEGPostDecode() for */
+        /* each scanline, which assumes that a whole strile was read */
+        /* and may thus incorrectly consider it has read the whole image,
+         * causing */
+        /* OJPEGLibjpegSessionAbort() to be called prematurely. */
+        /* Triggered by https://gitlab.com/libtiff/libtiff/-/issues/337 */
+        TIFFErrorExtR(tif, module,
+                      "Cannot decode: libjpeg_session_active == 0");
+        return 0;
+    }
+    if (sp->error_in_raw_data_decoding)
+    {
+        return 0;
+    }
+    if (sp->libjpeg_jpeg_query_style == 0)
+    {
+        if (OJPEGDecodeRaw(tif, buf, cc) == 0)
+            return (0);
+    }
+    else
+    {
+        if (OJPEGDecodeScanlines(tif, buf, cc) == 0)
+            return (0);
+    }
+    return (1);
 }
 
-static int
-OJPEGDecode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s)
+static int OJPEGDecodeRaw(TIFF *tif, uint8_t *buf, tmsize_t cc)
 {
-        static const char module[]="OJPEGDecode";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	(void)s;
-        if( !sp->decoder_ok )
+    static const char module[] = "OJPEGDecodeRaw";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint8_t *m;
+    tmsize_t n;
+    uint8_t *oy;
+    uint8_t *ocb;
+    uint8_t *ocr;
+    uint8_t *p;
+    uint32_t q;
+    uint8_t *r;
+    uint8_t sx, sy;
+    if (cc % sp->bytes_per_line != 0)
+    {
+        TIFFErrorExtR(tif, module, "Fractional scanline not read");
+        return (0);
+    }
+    assert(cc > 0);
+    m = buf;
+    n = cc;
+    do
+    {
+        if (sp->subsampling_convert_state == 0)
         {
-            TIFFErrorExt(tif->tif_clientdata,module,"Cannot decode: decoder not correctly initialized");
-            return 0;
+            if (jpeg_read_raw_data_encap(sp,
+                                         &(sp->libjpeg_jpeg_decompress_struct),
+                                         sp->subsampling_convert_ycbcrimage,
+                                         sp->subsampling_ver * 8) == 0)
+            {
+                sp->error_in_raw_data_decoding = 1;
+                return (0);
+            }
         }
-        if( sp->error_in_raw_data_decoding )
+        oy = sp->subsampling_convert_ybuf +
+             sp->subsampling_convert_state * sp->subsampling_ver *
+                 sp->subsampling_convert_ylinelen;
+        ocb = sp->subsampling_convert_cbbuf +
+              sp->subsampling_convert_state * sp->subsampling_convert_clinelen;
+        ocr = sp->subsampling_convert_crbuf +
+              sp->subsampling_convert_state * sp->subsampling_convert_clinelen;
+        p = m;
+        for (q = 0; q < sp->subsampling_convert_clinelenout; q++)
         {
-            return 0;
+            r = oy;
+            for (sy = 0; sy < sp->subsampling_ver; sy++)
+            {
+                for (sx = 0; sx < sp->subsampling_hor; sx++)
+                    *p++ = *r++;
+                r += sp->subsampling_convert_ylinelen - sp->subsampling_hor;
+            }
+            oy += sp->subsampling_hor;
+            *p++ = *ocb++;
+            *p++ = *ocr++;
         }
-	if (sp->libjpeg_jpeg_query_style==0)
-	{
-		if (OJPEGDecodeRaw(tif,buf,cc)==0)
-			return(0);
-	}
-	else
-	{
-		if (OJPEGDecodeScanlines(tif,buf,cc)==0)
-			return(0);
-	}
-	return(1);
+        sp->subsampling_convert_state++;
+        if (sp->subsampling_convert_state == sp->subsampling_convert_clines)
+            sp->subsampling_convert_state = 0;
+        m += sp->bytes_per_line;
+        n -= sp->bytes_per_line;
+    } while (n > 0);
+    return (1);
 }
 
-static int
-OJPEGDecodeRaw(TIFF* tif, uint8* buf, tmsize_t cc)
-{
-	static const char module[]="OJPEGDecodeRaw";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint8* m;
-	tmsize_t n;
-	uint8* oy;
-	uint8* ocb;
-	uint8* ocr;
-	uint8* p;
-	uint32 q;
-	uint8* r;
-	uint8 sx,sy;
-	if (cc%sp->bytes_per_line!=0)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Fractional scanline not read");
-		return(0);
-	}
-	assert(cc>0);
-	m=buf;
-	n=cc;
-	do
-	{
-		if (sp->subsampling_convert_state==0)
-		{
-			if (jpeg_read_raw_data_encap(sp,&(sp->libjpeg_jpeg_decompress_struct),sp->subsampling_convert_ycbcrimage,sp->subsampling_ver*8)==0)
-			{
-				sp->error_in_raw_data_decoding = 1;
-				return(0);
-			}
-		}
-		oy=sp->subsampling_convert_ybuf+sp->subsampling_convert_state*sp->subsampling_ver*sp->subsampling_convert_ylinelen;
-		ocb=sp->subsampling_convert_cbbuf+sp->subsampling_convert_state*sp->subsampling_convert_clinelen;
-		ocr=sp->subsampling_convert_crbuf+sp->subsampling_convert_state*sp->subsampling_convert_clinelen;
-		p=m;
-		for (q=0; q<sp->subsampling_convert_clinelenout; q++)
-		{
-			r=oy;
-			for (sy=0; sy<sp->subsampling_ver; sy++)
-			{
-				for (sx=0; sx<sp->subsampling_hor; sx++)
-					*p++=*r++;
-				r+=sp->subsampling_convert_ylinelen-sp->subsampling_hor;
-			}
-			oy+=sp->subsampling_hor;
-			*p++=*ocb++;
-			*p++=*ocr++;
-		}
-		sp->subsampling_convert_state++;
-		if (sp->subsampling_convert_state==sp->subsampling_convert_clines)
-			sp->subsampling_convert_state=0;
-		m+=sp->bytes_per_line;
-		n-=sp->bytes_per_line;
-	} while(n>0);
-	return(1);
-}
-
-static int
-OJPEGDecodeScanlines(TIFF* tif, uint8* buf, tmsize_t cc)
-{
-	static const char module[]="OJPEGDecodeScanlines";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint8* m;
-	tmsize_t n;
-	if (cc%sp->bytes_per_line!=0)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Fractional scanline not read");
-		return(0);
-	}
-	assert(cc>0);
-	m=buf;
-	n=cc;
-	do
-	{
-		if (jpeg_read_scanlines_encap(sp,&(sp->libjpeg_jpeg_decompress_struct),&m,1)==0)
-			return(0);
-		m+=sp->bytes_per_line;
-		n-=sp->bytes_per_line;
-	} while(n>0);
-	return(1);
+static int OJPEGDecodeScanlines(TIFF *tif, uint8_t *buf, tmsize_t cc)
+{
+    static const char module[] = "OJPEGDecodeScanlines";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint8_t *m;
+    tmsize_t n;
+    if (cc % sp->bytes_per_line != 0)
+    {
+        TIFFErrorExtR(tif, module, "Fractional scanline not read");
+        return (0);
+    }
+    assert(cc > 0);
+    m = buf;
+    n = cc;
+    do
+    {
+        if (jpeg_read_scanlines_encap(sp, &(sp->libjpeg_jpeg_decompress_struct),
+                                      &m, 1) == 0)
+            return (0);
+        m += sp->bytes_per_line;
+        n -= sp->bytes_per_line;
+    } while (n > 0);
+    return (1);
 }
 
-static void
-OJPEGPostDecode(TIFF* tif, uint8* buf, tmsize_t cc)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	(void)buf;
-	(void)cc;
-	sp->write_curstrile++;
-	if (sp->write_curstrile%tif->tif_dir.td_stripsperimage==0)  
-	{
-		assert(sp->libjpeg_session_active!=0);
-		OJPEGLibjpegSessionAbort(tif);
-		sp->writeheader_done=0;
-	}
+static void OJPEGPostDecode(TIFF *tif, uint8_t *buf, tmsize_t cc)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    (void)buf;
+    (void)cc;
+    /* This function somehow incorrectly assumes that a whole strile was read,
+     */
+    /* which is not true when TIFFReadScanline() is called, */
+    /* and may thus incorrectly consider it has read the whole image, causing */
+    /* OJPEGLibjpegSessionAbort() to be called prematurely. */
+    /* So this logic should be fixed to take into account cc, or disable */
+    /* the scan line reading interface. */
+    /* Triggered by https://gitlab.com/libtiff/libtiff/-/issues/337 */
+    sp->write_curstrile++;
+    if (sp->write_curstrile % tif->tif_dir.td_stripsperimage == 0)
+    {
+        assert(sp->libjpeg_session_active != 0);
+        OJPEGLibjpegSessionAbort(tif);
+        sp->writeheader_done = 0;
+    }
 }
 
-static int
-OJPEGSetupEncode(TIFF* tif)
+static int OJPEGSetupEncode(TIFF *tif)
 {
-	static const char module[]="OJPEGSetupEncode";
-	TIFFErrorExt(tif->tif_clientdata,module,"OJPEG encoding not supported; use new-style JPEG compression instead");
-	return(0);
+    static const char module[] = "OJPEGSetupEncode";
+    TIFFErrorExtR(
+        tif, module,
+        "OJPEG encoding not supported; use new-style JPEG compression instead");
+    return (0);
 }
 
-static int
-OJPEGPreEncode(TIFF* tif, uint16 s)
+static int OJPEGPreEncode(TIFF *tif, uint16_t s)
 {
-	static const char module[]="OJPEGPreEncode";
-	(void)s;
-	TIFFErrorExt(tif->tif_clientdata,module,"OJPEG encoding not supported; use new-style JPEG compression instead");
-	return(0);
+    static const char module[] = "OJPEGPreEncode";
+    (void)s;
+    TIFFErrorExtR(
+        tif, module,
+        "OJPEG encoding not supported; use new-style JPEG compression instead");
+    return (0);
 }
 
-static int
-OJPEGEncode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s)
+static int OJPEGEncode(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s)
 {
-	static const char module[]="OJPEGEncode";
-	(void)buf;
-	(void)cc;
-	(void)s;
-	TIFFErrorExt(tif->tif_clientdata,module,"OJPEG encoding not supported; use new-style JPEG compression instead");
-	return(0);
+    static const char module[] = "OJPEGEncode";
+    (void)buf;
+    (void)cc;
+    (void)s;
+    TIFFErrorExtR(
+        tif, module,
+        "OJPEG encoding not supported; use new-style JPEG compression instead");
+    return (0);
 }
 
-static int
-OJPEGPostEncode(TIFF* tif)
+static int OJPEGPostEncode(TIFF *tif)
 {
-	static const char module[]="OJPEGPostEncode";
-	TIFFErrorExt(tif->tif_clientdata,module,"OJPEG encoding not supported; use new-style JPEG compression instead");
-	return(0);
+    static const char module[] = "OJPEGPostEncode";
+    TIFFErrorExtR(
+        tif, module,
+        "OJPEG encoding not supported; use new-style JPEG compression instead");
+    return (0);
 }
 
-static void
-OJPEGCleanup(TIFF* tif)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	if (sp!=0)
-	{
-		tif->tif_tagmethods.vgetfield=sp->vgetparent;
-		tif->tif_tagmethods.vsetfield=sp->vsetparent;
-		tif->tif_tagmethods.printdir=sp->printdir;
-		if (sp->qtable[0]!=0)
-			_TIFFfree(sp->qtable[0]);
-		if (sp->qtable[1]!=0)
-			_TIFFfree(sp->qtable[1]);
-		if (sp->qtable[2]!=0)
-			_TIFFfree(sp->qtable[2]);
-		if (sp->qtable[3]!=0)
-			_TIFFfree(sp->qtable[3]);
-		if (sp->dctable[0]!=0)
-			_TIFFfree(sp->dctable[0]);
-		if (sp->dctable[1]!=0)
-			_TIFFfree(sp->dctable[1]);
-		if (sp->dctable[2]!=0)
-			_TIFFfree(sp->dctable[2]);
-		if (sp->dctable[3]!=0)
-			_TIFFfree(sp->dctable[3]);
-		if (sp->actable[0]!=0)
-			_TIFFfree(sp->actable[0]);
-		if (sp->actable[1]!=0)
-			_TIFFfree(sp->actable[1]);
-		if (sp->actable[2]!=0)
-			_TIFFfree(sp->actable[2]);
-		if (sp->actable[3]!=0)
-			_TIFFfree(sp->actable[3]);
-		if (sp->libjpeg_session_active!=0)
-			OJPEGLibjpegSessionAbort(tif);
-		if (sp->subsampling_convert_ycbcrbuf!=0)
-			_TIFFfree(sp->subsampling_convert_ycbcrbuf);
-		if (sp->subsampling_convert_ycbcrimage!=0)
-			_TIFFfree(sp->subsampling_convert_ycbcrimage);
-		if (sp->skip_buffer!=0)
-			_TIFFfree(sp->skip_buffer);
-		_TIFFfree(sp);
-		tif->tif_data=NULL;
-		_TIFFSetDefaultCompressionState(tif);
-	}
+static void OJPEGCleanup(TIFF *tif)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    if (sp != 0)
+    {
+        tif->tif_tagmethods.vgetfield = sp->vgetparent;
+        tif->tif_tagmethods.vsetfield = sp->vsetparent;
+        tif->tif_tagmethods.printdir = sp->printdir;
+        if (sp->qtable[0] != 0)
+            _TIFFfreeExt(tif, sp->qtable[0]);
+        if (sp->qtable[1] != 0)
+            _TIFFfreeExt(tif, sp->qtable[1]);
+        if (sp->qtable[2] != 0)
+            _TIFFfreeExt(tif, sp->qtable[2]);
+        if (sp->qtable[3] != 0)
+            _TIFFfreeExt(tif, sp->qtable[3]);
+        if (sp->dctable[0] != 0)
+            _TIFFfreeExt(tif, sp->dctable[0]);
+        if (sp->dctable[1] != 0)
+            _TIFFfreeExt(tif, sp->dctable[1]);
+        if (sp->dctable[2] != 0)
+            _TIFFfreeExt(tif, sp->dctable[2]);
+        if (sp->dctable[3] != 0)
+            _TIFFfreeExt(tif, sp->dctable[3]);
+        if (sp->actable[0] != 0)
+            _TIFFfreeExt(tif, sp->actable[0]);
+        if (sp->actable[1] != 0)
+            _TIFFfreeExt(tif, sp->actable[1]);
+        if (sp->actable[2] != 0)
+            _TIFFfreeExt(tif, sp->actable[2]);
+        if (sp->actable[3] != 0)
+            _TIFFfreeExt(tif, sp->actable[3]);
+        if (sp->libjpeg_session_active != 0)
+            OJPEGLibjpegSessionAbort(tif);
+        if (sp->subsampling_convert_ycbcrbuf != 0)
+            _TIFFfreeExt(tif, sp->subsampling_convert_ycbcrbuf);
+        if (sp->subsampling_convert_ycbcrimage != 0)
+            _TIFFfreeExt(tif, sp->subsampling_convert_ycbcrimage);
+        if (sp->skip_buffer != 0)
+            _TIFFfreeExt(tif, sp->skip_buffer);
+        _TIFFfreeExt(tif, sp);
+        tif->tif_data = NULL;
+        _TIFFSetDefaultCompressionState(tif);
+    }
 }
 
-static void
-OJPEGSubsamplingCorrect(TIFF* tif)
-{
-	static const char module[]="OJPEGSubsamplingCorrect";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint8 mh;
-	uint8 mv;
-        
-	assert(sp->subsamplingcorrect_done==0);
-	if ((tif->tif_dir.td_samplesperpixel!=3) || ((tif->tif_dir.td_photometric!=PHOTOMETRIC_YCBCR) &&
-	    (tif->tif_dir.td_photometric!=PHOTOMETRIC_ITULAB)))
-	{
-		if (sp->subsampling_tag!=0)
-			TIFFWarningExt(tif->tif_clientdata,module,"Subsampling tag not appropriate for this Photometric and/or SamplesPerPixel");
-		sp->subsampling_hor=1;
-		sp->subsampling_ver=1;
-		sp->subsampling_force_desubsampling_inside_decompression=0;
-	}
-	else
-	{
-		sp->subsamplingcorrect_done=1;
-		mh=sp->subsampling_hor;
-		mv=sp->subsampling_ver;
-		sp->subsamplingcorrect=1;
-		OJPEGReadHeaderInfoSec(tif);
-		if (sp->subsampling_force_desubsampling_inside_decompression!=0)
-		{
-			sp->subsampling_hor=1;
-			sp->subsampling_ver=1;
-		}
-		sp->subsamplingcorrect=0;
-		if (((sp->subsampling_hor!=mh) || (sp->subsampling_ver!=mv)) && (sp->subsampling_force_desubsampling_inside_decompression==0))
-		{
-			if (sp->subsampling_tag==0)
-				TIFFWarningExt(tif->tif_clientdata,module,"Subsampling tag is not set, yet subsampling inside JPEG data [%d,%d] does not match default values [2,2]; assuming subsampling inside JPEG data is correct",sp->subsampling_hor,sp->subsampling_ver);
-			else
-				TIFFWarningExt(tif->tif_clientdata,module,"Subsampling inside JPEG data [%d,%d] does not match subsampling tag values [%d,%d]; assuming subsampling inside JPEG data is correct",sp->subsampling_hor,sp->subsampling_ver,mh,mv);
-		}
-		if (sp->subsampling_force_desubsampling_inside_decompression!=0)
-		{
-			if (sp->subsampling_tag==0)
-				TIFFWarningExt(tif->tif_clientdata,module,"Subsampling tag is not set, yet subsampling inside JPEG data does not match default values [2,2] (nor any other values allowed in TIFF); assuming subsampling inside JPEG data is correct and desubsampling inside JPEG decompression");
-			else
-				TIFFWarningExt(tif->tif_clientdata,module,"Subsampling inside JPEG data does not match subsampling tag values [%d,%d] (nor any other values allowed in TIFF); assuming subsampling inside JPEG data is correct and desubsampling inside JPEG decompression",mh,mv);
-		}
-		if (sp->subsampling_force_desubsampling_inside_decompression==0)
-		{
-			if (sp->subsampling_hor<sp->subsampling_ver)
-				TIFFWarningExt(tif->tif_clientdata,module,"Subsampling values [%d,%d] are not allowed in TIFF",sp->subsampling_hor,sp->subsampling_ver);
-		}
-	}
-	sp->subsamplingcorrect_done=1;
+static void OJPEGSubsamplingCorrect(TIFF *tif)
+{
+    static const char module[] = "OJPEGSubsamplingCorrect";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint8_t mh;
+    uint8_t mv;
+
+    assert(sp->subsamplingcorrect_done == 0);
+    if ((tif->tif_dir.td_samplesperpixel != 3) ||
+        ((tif->tif_dir.td_photometric != PHOTOMETRIC_YCBCR) &&
+         (tif->tif_dir.td_photometric != PHOTOMETRIC_ITULAB)))
+    {
+        if (sp->subsampling_tag != 0)
+            TIFFWarningExtR(tif, module,
+                            "Subsampling tag not appropriate for this "
+                            "Photometric and/or SamplesPerPixel");
+        sp->subsampling_hor = 1;
+        sp->subsampling_ver = 1;
+        sp->subsampling_force_desubsampling_inside_decompression = 0;
+    }
+    else
+    {
+        sp->subsamplingcorrect_done = 1;
+        mh = sp->subsampling_hor;
+        mv = sp->subsampling_ver;
+        sp->subsamplingcorrect = 1;
+        OJPEGReadHeaderInfoSec(tif);
+        if (sp->subsampling_force_desubsampling_inside_decompression != 0)
+        {
+            sp->subsampling_hor = 1;
+            sp->subsampling_ver = 1;
+        }
+        sp->subsamplingcorrect = 0;
+        if (((sp->subsampling_hor != mh) || (sp->subsampling_ver != mv)) &&
+            (sp->subsampling_force_desubsampling_inside_decompression == 0))
+        {
+            if (sp->subsampling_tag == 0)
+                TIFFWarningExtR(
+                    tif, module,
+                    "Subsampling tag is not set, yet subsampling inside JPEG "
+                    "data [%" PRIu8 ",%" PRIu8
+                    "] does not match default values [2,2]; assuming "
+                    "subsampling inside JPEG data is correct",
+                    sp->subsampling_hor, sp->subsampling_ver);
+            else
+                TIFFWarningExtR(
+                    tif, module,
+                    "Subsampling inside JPEG data [%" PRIu8 ",%" PRIu8
+                    "] does not match subsampling tag values [%" PRIu8
+                    ",%" PRIu8
+                    "]; assuming subsampling inside JPEG data is correct",
+                    sp->subsampling_hor, sp->subsampling_ver, mh, mv);
+        }
+        if (sp->subsampling_force_desubsampling_inside_decompression != 0)
+        {
+            if (sp->subsampling_tag == 0)
+                TIFFWarningExtR(
+                    tif, module,
+                    "Subsampling tag is not set, yet subsampling inside JPEG "
+                    "data does not match default values [2,2] (nor any other "
+                    "values allowed in TIFF); assuming subsampling inside JPEG "
+                    "data is correct and desubsampling inside JPEG "
+                    "decompression");
+            else
+                TIFFWarningExtR(
+                    tif, module,
+                    "Subsampling inside JPEG data does not match subsampling "
+                    "tag values [%" PRIu8 ",%" PRIu8
+                    "] (nor any other values allowed in TIFF); assuming "
+                    "subsampling inside JPEG data is correct and desubsampling "
+                    "inside JPEG decompression",
+                    mh, mv);
+        }
+        if (sp->subsampling_force_desubsampling_inside_decompression == 0)
+        {
+            if (sp->subsampling_hor < sp->subsampling_ver)
+                TIFFWarningExtR(tif, module,
+                                "Subsampling values [%" PRIu8 ",%" PRIu8
+                                "] are not allowed in TIFF",
+                                sp->subsampling_hor, sp->subsampling_ver);
+        }
+    }
+    sp->subsamplingcorrect_done = 1;
 }
 
-static int
-OJPEGReadHeaderInfo(TIFF* tif)
-{
-	static const char module[]="OJPEGReadHeaderInfo";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	assert(sp->readheader_done==0);
-	sp->image_width=tif->tif_dir.td_imagewidth;
-	sp->image_length=tif->tif_dir.td_imagelength;
-	if (isTiled(tif))
-	{
-		sp->strile_width=tif->tif_dir.td_tilewidth;
-		sp->strile_length=tif->tif_dir.td_tilelength;
-		sp->strile_length_total=((sp->image_length+sp->strile_length-1)/sp->strile_length)*sp->strile_length;
-	}
-	else
-	{
-		sp->strile_width=sp->image_width;
-		sp->strile_length=tif->tif_dir.td_rowsperstrip;
-                if( sp->strile_length == (uint32)-1 )
-                    sp->strile_length = sp->image_length;
-		sp->strile_length_total=sp->image_length;
-	}
-	if (tif->tif_dir.td_samplesperpixel==1)
-	{
-		sp->samples_per_pixel=1;
-		sp->plane_sample_offset=0;
-		sp->samples_per_pixel_per_plane=sp->samples_per_pixel;
-		sp->subsampling_hor=1;
-		sp->subsampling_ver=1;
-	}
-	else
-	{
-		if (tif->tif_dir.td_samplesperpixel!=3)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"SamplesPerPixel %d not supported for this compression scheme",sp->samples_per_pixel);
-			return(0);
-		}
-		sp->samples_per_pixel=3;
-		sp->plane_sample_offset=0;
-		if (tif->tif_dir.td_planarconfig==PLANARCONFIG_CONTIG)
-			sp->samples_per_pixel_per_plane=3;
-		else
-			sp->samples_per_pixel_per_plane=1;
-	}
-	if (sp->strile_length<sp->image_length)
-	{
-		if (((sp->subsampling_hor!=1) && (sp->subsampling_hor!=2) && (sp->subsampling_hor!=4)) ||
-		    ((sp->subsampling_ver!=1) && (sp->subsampling_ver!=2) && (sp->subsampling_ver!=4)))
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"Invalid subsampling values");
-			return(0);
-		}
-		if (sp->strile_length%(sp->subsampling_ver*8)!=0)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"Incompatible vertical subsampling and image strip/tile length");
-			return(0);
-		}
-		sp->restart_interval=(uint16)(((sp->strile_width+sp->subsampling_hor*8-1)/(sp->subsampling_hor*8))*(sp->strile_length/(sp->subsampling_ver*8)));
-	}
-	if (OJPEGReadHeaderInfoSec(tif)==0)
-		return(0);
-	sp->sos_end[0].log=1;
-	sp->sos_end[0].in_buffer_source=sp->in_buffer_source;
-	sp->sos_end[0].in_buffer_next_strile=sp->in_buffer_next_strile;
-	sp->sos_end[0].in_buffer_file_pos=sp->in_buffer_file_pos-sp->in_buffer_togo;
-	sp->sos_end[0].in_buffer_file_togo=sp->in_buffer_file_togo+sp->in_buffer_togo; 
-	sp->readheader_done=1;
-	return(1);
+static int OJPEGReadHeaderInfo(TIFF *tif)
+{
+    static const char module[] = "OJPEGReadHeaderInfo";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    assert(sp->readheader_done == 0);
+    sp->image_width = tif->tif_dir.td_imagewidth;
+    sp->image_length = tif->tif_dir.td_imagelength;
+    if (isTiled(tif))
+    {
+        sp->strile_width = tif->tif_dir.td_tilewidth;
+        sp->strile_length = tif->tif_dir.td_tilelength;
+        sp->strile_length_total =
+            ((sp->image_length + sp->strile_length - 1) / sp->strile_length) *
+            sp->strile_length;
+    }
+    else
+    {
+        sp->strile_width = sp->image_width;
+        sp->strile_length = tif->tif_dir.td_rowsperstrip;
+        if (sp->strile_length == (uint32_t)-1)
+            sp->strile_length = sp->image_length;
+        sp->strile_length_total = sp->image_length;
+    }
+    if (tif->tif_dir.td_samplesperpixel == 1)
+    {
+        sp->samples_per_pixel = 1;
+        sp->plane_sample_offset = 0;
+        sp->samples_per_pixel_per_plane = sp->samples_per_pixel;
+        sp->subsampling_hor = 1;
+        sp->subsampling_ver = 1;
+    }
+    else
+    {
+        if (tif->tif_dir.td_samplesperpixel != 3)
+        {
+            TIFFErrorExtR(tif, module,
+                          "SamplesPerPixel %" PRIu8
+                          " not supported for this compression scheme",
+                          sp->samples_per_pixel);
+            return (0);
+        }
+        sp->samples_per_pixel = 3;
+        sp->plane_sample_offset = 0;
+        if (tif->tif_dir.td_planarconfig == PLANARCONFIG_CONTIG)
+            sp->samples_per_pixel_per_plane = 3;
+        else
+            sp->samples_per_pixel_per_plane = 1;
+    }
+    if (sp->strile_length < sp->image_length)
+    {
+        if (((sp->subsampling_hor != 1) && (sp->subsampling_hor != 2) &&
+             (sp->subsampling_hor != 4)) ||
+            ((sp->subsampling_ver != 1) && (sp->subsampling_ver != 2) &&
+             (sp->subsampling_ver != 4)))
+        {
+            TIFFErrorExtR(tif, module, "Invalid subsampling values");
+            return (0);
+        }
+        if (sp->strile_length % (sp->subsampling_ver * 8) != 0)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Incompatible vertical subsampling and image "
+                          "strip/tile length");
+            return (0);
+        }
+        sp->restart_interval =
+            (uint16_t)(((sp->strile_width + sp->subsampling_hor * 8 - 1) /
+                        (sp->subsampling_hor * 8)) *
+                       (sp->strile_length / (sp->subsampling_ver * 8)));
+    }
+    if (OJPEGReadHeaderInfoSec(tif) == 0)
+        return (0);
+    sp->sos_end[0].log = 1;
+    sp->sos_end[0].in_buffer_source = sp->in_buffer_source;
+    sp->sos_end[0].in_buffer_next_strile = sp->in_buffer_next_strile;
+    sp->sos_end[0].in_buffer_file_pos =
+        sp->in_buffer_file_pos - sp->in_buffer_togo;
+    sp->sos_end[0].in_buffer_file_togo =
+        sp->in_buffer_file_togo + sp->in_buffer_togo;
+    sp->readheader_done = 1;
+    return (1);
 }
 
-static int
-OJPEGReadSecondarySos(TIFF* tif, uint16 s)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint8 m;
-	assert(s>0);
-	assert(s<3);
-	assert(sp->sos_end[0].log!=0);
-	assert(sp->sos_end[s].log==0);
-	sp->plane_sample_offset=(uint8)(s-1);
-	while(sp->sos_end[sp->plane_sample_offset].log==0)
-		sp->plane_sample_offset--;
-	sp->in_buffer_source=sp->sos_end[sp->plane_sample_offset].in_buffer_source;
-	sp->in_buffer_next_strile=sp->sos_end[sp->plane_sample_offset].in_buffer_next_strile;
-	sp->in_buffer_file_pos=sp->sos_end[sp->plane_sample_offset].in_buffer_file_pos;
-	sp->in_buffer_file_pos_log=0;
-	sp->in_buffer_file_togo=sp->sos_end[sp->plane_sample_offset].in_buffer_file_togo;
-	sp->in_buffer_togo=0;
-	sp->in_buffer_cur=0;
-	while(sp->plane_sample_offset<s)
-	{
-		do
-		{
-			if (OJPEGReadByte(sp,&m)==0)
-				return(0);
-			if (m==255)
-			{
-				do
-				{
-					if (OJPEGReadByte(sp,&m)==0)
-						return(0);
-					if (m!=255)
-						break;
-				} while(1);
-				if (m==JPEG_MARKER_SOS)
-					break;
-			}
-		} while(1);
-		sp->plane_sample_offset++;
-		if (OJPEGReadHeaderInfoSecStreamSos(tif)==0)
-			return(0);
-		sp->sos_end[sp->plane_sample_offset].log=1;
-		sp->sos_end[sp->plane_sample_offset].in_buffer_source=sp->in_buffer_source;
-		sp->sos_end[sp->plane_sample_offset].in_buffer_next_strile=sp->in_buffer_next_strile;
-		sp->sos_end[sp->plane_sample_offset].in_buffer_file_pos=sp->in_buffer_file_pos-sp->in_buffer_togo;
-		sp->sos_end[sp->plane_sample_offset].in_buffer_file_togo=sp->in_buffer_file_togo+sp->in_buffer_togo;
-	}
-	return(1);
+static int OJPEGReadSecondarySos(TIFF *tif, uint16_t s)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint8_t m;
+    assert(s > 0);
+    assert(s < 3);
+    assert(sp->sos_end[0].log != 0);
+    assert(sp->sos_end[s].log == 0);
+    sp->plane_sample_offset = (uint8_t)(s - 1);
+    while (sp->sos_end[sp->plane_sample_offset].log == 0)
+        sp->plane_sample_offset--;
+    sp->in_buffer_source =
+        sp->sos_end[sp->plane_sample_offset].in_buffer_source;
+    sp->in_buffer_next_strile =
+        sp->sos_end[sp->plane_sample_offset].in_buffer_next_strile;
+    sp->in_buffer_file_pos =
+        sp->sos_end[sp->plane_sample_offset].in_buffer_file_pos;
+    sp->in_buffer_file_pos_log = 0;
+    sp->in_buffer_file_togo =
+        sp->sos_end[sp->plane_sample_offset].in_buffer_file_togo;
+    sp->in_buffer_togo = 0;
+    sp->in_buffer_cur = 0;
+    while (sp->plane_sample_offset < s)
+    {
+        do
+        {
+            if (OJPEGReadByte(sp, &m) == 0)
+                return (0);
+            if (m == 255)
+            {
+                do
+                {
+                    if (OJPEGReadByte(sp, &m) == 0)
+                        return (0);
+                    if (m != 255)
+                        break;
+                } while (1);
+                if (m == JPEG_MARKER_SOS)
+                    break;
+            }
+        } while (1);
+        sp->plane_sample_offset++;
+        if (OJPEGReadHeaderInfoSecStreamSos(tif) == 0)
+            return (0);
+        sp->sos_end[sp->plane_sample_offset].log = 1;
+        sp->sos_end[sp->plane_sample_offset].in_buffer_source =
+            sp->in_buffer_source;
+        sp->sos_end[sp->plane_sample_offset].in_buffer_next_strile =
+            sp->in_buffer_next_strile;
+        sp->sos_end[sp->plane_sample_offset].in_buffer_file_pos =
+            sp->in_buffer_file_pos - sp->in_buffer_togo;
+        sp->sos_end[sp->plane_sample_offset].in_buffer_file_togo =
+            sp->in_buffer_file_togo + sp->in_buffer_togo;
+    }
+    return (1);
 }
 
-static int
-OJPEGWriteHeaderInfo(TIFF* tif)
-{
-	static const char module[]="OJPEGWriteHeaderInfo";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint8** m;
-	uint32 n;
-	/* if a previous attempt failed, don't try again */
-	if (sp->libjpeg_session_active != 0) 
-		return 0;
-	sp->out_state=ososSoi;
-	sp->restart_index=0;
-	jpeg_std_error(&(sp->libjpeg_jpeg_error_mgr));
-	sp->libjpeg_jpeg_error_mgr.output_message=OJPEGLibjpegJpegErrorMgrOutputMessage;
-	sp->libjpeg_jpeg_error_mgr.error_exit=OJPEGLibjpegJpegErrorMgrErrorExit;
-	sp->libjpeg_jpeg_decompress_struct.err=&(sp->libjpeg_jpeg_error_mgr);
-	sp->libjpeg_jpeg_decompress_struct.client_data=(void*)tif;
-	if (jpeg_create_decompress_encap(sp,&(sp->libjpeg_jpeg_decompress_struct))==0)
-		return(0);
-	sp->libjpeg_session_active=1;
-	sp->libjpeg_jpeg_source_mgr.bytes_in_buffer=0;
-	sp->libjpeg_jpeg_source_mgr.init_source=OJPEGLibjpegJpegSourceMgrInitSource;
-	sp->libjpeg_jpeg_source_mgr.fill_input_buffer=OJPEGLibjpegJpegSourceMgrFillInputBuffer;
-	sp->libjpeg_jpeg_source_mgr.skip_input_data=OJPEGLibjpegJpegSourceMgrSkipInputData;
-	sp->libjpeg_jpeg_source_mgr.resync_to_restart=OJPEGLibjpegJpegSourceMgrResyncToRestart;
-	sp->libjpeg_jpeg_source_mgr.term_source=OJPEGLibjpegJpegSourceMgrTermSource;
-	sp->libjpeg_jpeg_decompress_struct.src=&(sp->libjpeg_jpeg_source_mgr);
-	if (jpeg_read_header_encap(sp,&(sp->libjpeg_jpeg_decompress_struct),1)==0)
-		return(0);
-	if ((sp->subsampling_force_desubsampling_inside_decompression==0) && (sp->samples_per_pixel_per_plane>1))
-	{
-		sp->libjpeg_jpeg_decompress_struct.raw_data_out=1;
+static int OJPEGWriteHeaderInfo(TIFF *tif)
+{
+    static const char module[] = "OJPEGWriteHeaderInfo";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint8_t **m;
+    uint32_t n;
+    /* if a previous attempt failed, don't try again */
+    if (sp->libjpeg_session_active != 0)
+        return 0;
+    sp->out_state = ososSoi;
+    sp->restart_index = 0;
+    jpeg_std_error(&(sp->libjpeg_jpeg_error_mgr));
+    sp->libjpeg_jpeg_error_mgr.output_message =
+        OJPEGLibjpegJpegErrorMgrOutputMessage;
+    sp->libjpeg_jpeg_error_mgr.error_exit = OJPEGLibjpegJpegErrorMgrErrorExit;
+    sp->libjpeg_jpeg_decompress_struct.err = &(sp->libjpeg_jpeg_error_mgr);
+    sp->libjpeg_jpeg_decompress_struct.client_data = (void *)tif;
+    if (jpeg_create_decompress_encap(
+            sp, &(sp->libjpeg_jpeg_decompress_struct)) == 0)
+        return (0);
+    sp->libjpeg_session_active = 1;
+    sp->libjpeg_jpeg_source_mgr.bytes_in_buffer = 0;
+    sp->libjpeg_jpeg_source_mgr.init_source =
+        OJPEGLibjpegJpegSourceMgrInitSource;
+    sp->libjpeg_jpeg_source_mgr.fill_input_buffer =
+        OJPEGLibjpegJpegSourceMgrFillInputBuffer;
+    sp->libjpeg_jpeg_source_mgr.skip_input_data =
+        OJPEGLibjpegJpegSourceMgrSkipInputData;
+    sp->libjpeg_jpeg_source_mgr.resync_to_restart =
+        OJPEGLibjpegJpegSourceMgrResyncToRestart;
+    sp->libjpeg_jpeg_source_mgr.term_source =
+        OJPEGLibjpegJpegSourceMgrTermSource;
+    sp->libjpeg_jpeg_decompress_struct.src = &(sp->libjpeg_jpeg_source_mgr);
+    if (jpeg_read_header_encap(sp, &(sp->libjpeg_jpeg_decompress_struct), 1) ==
+        0)
+        return (0);
+    if ((sp->subsampling_force_desubsampling_inside_decompression == 0) &&
+        (sp->samples_per_pixel_per_plane > 1))
+    {
+        sp->libjpeg_jpeg_decompress_struct.raw_data_out = 1;
 #if JPEG_LIB_VERSION >= 70
-		sp->libjpeg_jpeg_decompress_struct.do_fancy_upsampling=FALSE;
+        sp->libjpeg_jpeg_decompress_struct.do_fancy_upsampling = FALSE;
 #endif
-		sp->libjpeg_jpeg_query_style=0;
-		if (sp->subsampling_convert_log==0)
-		{
-			assert(sp->subsampling_convert_ycbcrbuf==0);
-			assert(sp->subsampling_convert_ycbcrimage==0);
-			sp->subsampling_convert_ylinelen=((sp->strile_width+sp->subsampling_hor*8-1)/(sp->subsampling_hor*8)*sp->subsampling_hor*8);
-			sp->subsampling_convert_ylines=sp->subsampling_ver*8;
-			sp->subsampling_convert_clinelen=sp->subsampling_convert_ylinelen/sp->subsampling_hor;
-			sp->subsampling_convert_clines=8;
-			sp->subsampling_convert_ybuflen=sp->subsampling_convert_ylinelen*sp->subsampling_convert_ylines;
-			sp->subsampling_convert_cbuflen=sp->subsampling_convert_clinelen*sp->subsampling_convert_clines;
-			sp->subsampling_convert_ycbcrbuflen=sp->subsampling_convert_ybuflen+2*sp->subsampling_convert_cbuflen;
-                        /* The calloc is not normally necessary, except in some edge/broken cases */
-                        /* for example for a tiled image of height 1 with a tile height of 1 and subsampling_hor=subsampling_ver=2 */
-                        /* In that case, libjpeg will only fill the 8 first lines of the 16 lines */
-                        /* See https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16844 */
-                        /* Even if this case is allowed (?), its handling is broken because OJPEGPreDecode() should also likely */
-                        /* reset subsampling_convert_state to 0 when changing tile. */
-			sp->subsampling_convert_ycbcrbuf=_TIFFcalloc(1, sp->subsampling_convert_ycbcrbuflen);
-			if (sp->subsampling_convert_ycbcrbuf==0)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-				return(0);
-			}
-			sp->subsampling_convert_ybuf=sp->subsampling_convert_ycbcrbuf;
-			sp->subsampling_convert_cbbuf=sp->subsampling_convert_ybuf+sp->subsampling_convert_ybuflen;
-			sp->subsampling_convert_crbuf=sp->subsampling_convert_cbbuf+sp->subsampling_convert_cbuflen;
-			sp->subsampling_convert_ycbcrimagelen=3+sp->subsampling_convert_ylines+2*sp->subsampling_convert_clines;
-			sp->subsampling_convert_ycbcrimage=_TIFFmalloc(sp->subsampling_convert_ycbcrimagelen*sizeof(uint8*));
-			if (sp->subsampling_convert_ycbcrimage==0)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-				return(0);
-			}
-			m=sp->subsampling_convert_ycbcrimage;
-			*m++=(uint8*)(sp->subsampling_convert_ycbcrimage+3);
-			*m++=(uint8*)(sp->subsampling_convert_ycbcrimage+3+sp->subsampling_convert_ylines);
-			*m++=(uint8*)(sp->subsampling_convert_ycbcrimage+3+sp->subsampling_convert_ylines+sp->subsampling_convert_clines);
-			for (n=0; n<sp->subsampling_convert_ylines; n++)
-				*m++=sp->subsampling_convert_ybuf+n*sp->subsampling_convert_ylinelen;
-			for (n=0; n<sp->subsampling_convert_clines; n++)
-				*m++=sp->subsampling_convert_cbbuf+n*sp->subsampling_convert_clinelen;
-			for (n=0; n<sp->subsampling_convert_clines; n++)
-				*m++=sp->subsampling_convert_crbuf+n*sp->subsampling_convert_clinelen;
-			sp->subsampling_convert_clinelenout=sp->strile_width/sp->subsampling_hor + ((sp->strile_width % sp->subsampling_hor) != 0 ? 1 : 0);
-			sp->subsampling_convert_state=0;
-			sp->error_in_raw_data_decoding=0;
-			sp->bytes_per_line=sp->subsampling_convert_clinelenout*(sp->subsampling_ver*sp->subsampling_hor+2);
-			sp->lines_per_strile=sp->strile_length/sp->subsampling_ver + ((sp->strile_length % sp->subsampling_ver) != 0 ? 1 : 0);
-			sp->subsampling_convert_log=1;
-		}
-	}
-	else
-	{
-		sp->libjpeg_jpeg_decompress_struct.jpeg_color_space=JCS_UNKNOWN;
-		sp->libjpeg_jpeg_decompress_struct.out_color_space=JCS_UNKNOWN;
-		sp->libjpeg_jpeg_query_style=1;
-		sp->bytes_per_line=sp->samples_per_pixel_per_plane*sp->strile_width;
-		sp->lines_per_strile=sp->strile_length;
-	}
-	if (jpeg_start_decompress_encap(sp,&(sp->libjpeg_jpeg_decompress_struct))==0)
-		return(0);
-        if(sp->libjpeg_jpeg_decompress_struct.image_width != sp->strile_width ) {
-            TIFFErrorExt(tif->tif_clientdata,module,
-                         "jpeg_start_decompress() returned image_width = %d, "
-                         "expected %d",
-                         sp->libjpeg_jpeg_decompress_struct.image_width,
-                         sp->strile_width);
-            return 0;
-        }
-        if(sp->libjpeg_jpeg_decompress_struct.max_h_samp_factor != sp->subsampling_hor ||
-           sp->libjpeg_jpeg_decompress_struct.max_v_samp_factor != sp->subsampling_ver) {
-            TIFFErrorExt(tif->tif_clientdata,module,
-                         "jpeg_start_decompress() returned max_h_samp_factor = %d "
-                         "and max_v_samp_factor = %d, expected %d and %d",
-                         sp->libjpeg_jpeg_decompress_struct.max_h_samp_factor,
-                         sp->libjpeg_jpeg_decompress_struct.max_v_samp_factor,
-                         sp->subsampling_hor,
-                         sp->subsampling_ver);
-            return 0;
+        sp->libjpeg_jpeg_query_style = 0;
+        if (sp->subsampling_convert_log == 0)
+        {
+            assert(sp->subsampling_convert_ycbcrbuf == 0);
+            assert(sp->subsampling_convert_ycbcrimage == 0);
+            /* Check for division by zero. */
+            if (sp->subsampling_hor == 0 || sp->subsampling_ver == 0)
+                return (0);
+            sp->subsampling_convert_ylinelen =
+                ((sp->strile_width + sp->subsampling_hor * 8 - 1) /
+                 (sp->subsampling_hor * 8) * sp->subsampling_hor * 8);
+            sp->subsampling_convert_ylines = sp->subsampling_ver * 8;
+            sp->subsampling_convert_clinelen =
+                sp->subsampling_convert_ylinelen / sp->subsampling_hor;
+            sp->subsampling_convert_clines = 8;
+            sp->subsampling_convert_ybuflen = sp->subsampling_convert_ylinelen *
+                                              sp->subsampling_convert_ylines;
+            sp->subsampling_convert_cbuflen = sp->subsampling_convert_clinelen *
+                                              sp->subsampling_convert_clines;
+            sp->subsampling_convert_ycbcrbuflen =
+                sp->subsampling_convert_ybuflen +
+                2 * sp->subsampling_convert_cbuflen;
+            /* The calloc is not normally necessary, except in some edge/broken
+             * cases */
+            /* for example for a tiled image of height 1 with a tile height of 1
+             * and subsampling_hor=subsampling_ver=2 */
+            /* In that case, libjpeg will only fill the 8 first lines of the 16
+             * lines */
+            /* See https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=16844
+             */
+            /* Even if this case is allowed (?), its handling is broken because
+             * OJPEGPreDecode() should also likely */
+            /* reset subsampling_convert_state to 0 when changing tile. */
+            sp->subsampling_convert_ycbcrbuf =
+                _TIFFcallocExt(tif, 1, sp->subsampling_convert_ycbcrbuflen);
+            if (sp->subsampling_convert_ycbcrbuf == 0)
+            {
+                TIFFErrorExtR(tif, module, "Out of memory");
+                return (0);
+            }
+            sp->subsampling_convert_ybuf = sp->subsampling_convert_ycbcrbuf;
+            sp->subsampling_convert_cbbuf =
+                sp->subsampling_convert_ybuf + sp->subsampling_convert_ybuflen;
+            sp->subsampling_convert_crbuf =
+                sp->subsampling_convert_cbbuf + sp->subsampling_convert_cbuflen;
+            sp->subsampling_convert_ycbcrimagelen =
+                3 + sp->subsampling_convert_ylines +
+                2 * sp->subsampling_convert_clines;
+            sp->subsampling_convert_ycbcrimage = _TIFFmallocExt(
+                tif, sp->subsampling_convert_ycbcrimagelen * sizeof(uint8_t *));
+            if (sp->subsampling_convert_ycbcrimage == 0)
+            {
+                TIFFErrorExtR(tif, module, "Out of memory");
+                return (0);
+            }
+            m = sp->subsampling_convert_ycbcrimage;
+            *m++ = (uint8_t *)(sp->subsampling_convert_ycbcrimage + 3);
+            *m++ = (uint8_t *)(sp->subsampling_convert_ycbcrimage + 3 +
+                               sp->subsampling_convert_ylines);
+            *m++ = (uint8_t *)(sp->subsampling_convert_ycbcrimage + 3 +
+                               sp->subsampling_convert_ylines +
+                               sp->subsampling_convert_clines);
+            for (n = 0; n < sp->subsampling_convert_ylines; n++)
+                *m++ = sp->subsampling_convert_ybuf +
+                       n * sp->subsampling_convert_ylinelen;
+            for (n = 0; n < sp->subsampling_convert_clines; n++)
+                *m++ = sp->subsampling_convert_cbbuf +
+                       n * sp->subsampling_convert_clinelen;
+            for (n = 0; n < sp->subsampling_convert_clines; n++)
+                *m++ = sp->subsampling_convert_crbuf +
+                       n * sp->subsampling_convert_clinelen;
+            sp->subsampling_convert_clinelenout =
+                sp->strile_width / sp->subsampling_hor +
+                ((sp->strile_width % sp->subsampling_hor) != 0 ? 1 : 0);
+            sp->subsampling_convert_state = 0;
+            sp->error_in_raw_data_decoding = 0;
+            sp->bytes_per_line =
+                sp->subsampling_convert_clinelenout *
+                (sp->subsampling_ver * sp->subsampling_hor + 2);
+            sp->lines_per_strile =
+                sp->strile_length / sp->subsampling_ver +
+                ((sp->strile_length % sp->subsampling_ver) != 0 ? 1 : 0);
+            sp->subsampling_convert_log = 1;
         }
+    }
+    else
+    {
+        sp->libjpeg_jpeg_decompress_struct.jpeg_color_space = JCS_UNKNOWN;
+        sp->libjpeg_jpeg_decompress_struct.out_color_space = JCS_UNKNOWN;
+        sp->libjpeg_jpeg_query_style = 1;
+        sp->bytes_per_line = sp->samples_per_pixel_per_plane * sp->strile_width;
+        sp->lines_per_strile = sp->strile_length;
+    }
+    if (jpeg_start_decompress_encap(sp,
+                                    &(sp->libjpeg_jpeg_decompress_struct)) == 0)
+        return (0);
+    if (sp->libjpeg_jpeg_decompress_struct.image_width != sp->strile_width)
+    {
+        TIFFErrorExtR(tif, module,
+                      "jpeg_start_decompress() returned image_width = %u, "
+                      "expected %" PRIu32,
+                      sp->libjpeg_jpeg_decompress_struct.image_width,
+                      sp->strile_width);
+        return 0;
+    }
+    if (sp->libjpeg_jpeg_decompress_struct.max_h_samp_factor !=
+            sp->subsampling_hor ||
+        sp->libjpeg_jpeg_decompress_struct.max_v_samp_factor !=
+            sp->subsampling_ver)
+    {
+        TIFFErrorExtR(tif, module,
+                      "jpeg_start_decompress() returned max_h_samp_factor = %d "
+                      "and max_v_samp_factor = %d, expected %" PRIu8
+                      " and %" PRIu8,
+                      sp->libjpeg_jpeg_decompress_struct.max_h_samp_factor,
+                      sp->libjpeg_jpeg_decompress_struct.max_v_samp_factor,
+                      sp->subsampling_hor, sp->subsampling_ver);
+        return 0;
+    }
+
+    sp->writeheader_done = 1;
+    return (1);
+}
 
-	sp->writeheader_done=1;
-	return(1);
+static void OJPEGLibjpegSessionAbort(TIFF *tif)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    assert(sp->libjpeg_session_active != 0);
+    jpeg_destroy((jpeg_common_struct *)(&(sp->libjpeg_jpeg_decompress_struct)));
+    sp->libjpeg_session_active = 0;
 }
 
-static void
-OJPEGLibjpegSessionAbort(TIFF* tif)
+static int OJPEGReadHeaderInfoSec(TIFF *tif)
 {
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	assert(sp->libjpeg_session_active!=0);
-	jpeg_destroy((jpeg_common_struct*)(&(sp->libjpeg_jpeg_decompress_struct)));
-	sp->libjpeg_session_active=0;
+    static const char module[] = "OJPEGReadHeaderInfoSec";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint8_t m;
+    uint16_t n;
+    uint8_t o;
+    if (sp->file_size == 0)
+        sp->file_size = TIFFGetFileSize(tif);
+    if (sp->jpeg_interchange_format != 0)
+    {
+        if (sp->jpeg_interchange_format >= sp->file_size)
+        {
+            sp->jpeg_interchange_format = 0;
+            sp->jpeg_interchange_format_length = 0;
+        }
+        else
+        {
+            if ((sp->jpeg_interchange_format_length == 0) ||
+                (sp->jpeg_interchange_format >
+                 UINT64_MAX - sp->jpeg_interchange_format_length) ||
+                (sp->jpeg_interchange_format +
+                     sp->jpeg_interchange_format_length >
+                 sp->file_size))
+                sp->jpeg_interchange_format_length =
+                    sp->file_size - sp->jpeg_interchange_format;
+        }
+    }
+    sp->in_buffer_source = osibsNotSetYet;
+    sp->in_buffer_next_strile = 0;
+    sp->in_buffer_strile_count = tif->tif_dir.td_nstrips;
+    sp->in_buffer_file_togo = 0;
+    sp->in_buffer_togo = 0;
+    do
+    {
+        if (OJPEGReadBytePeek(sp, &m) == 0)
+            return (0);
+        if (m != 255)
+            break;
+        OJPEGReadByteAdvance(sp);
+        do
+        {
+            if (OJPEGReadByte(sp, &m) == 0)
+                return (0);
+        } while (m == 255);
+        switch (m)
+        {
+            case JPEG_MARKER_SOI:
+                /* this type of marker has no data, and should be skipped */
+                break;
+            case JPEG_MARKER_COM:
+            case JPEG_MARKER_APP0:
+            case JPEG_MARKER_APP0 + 1:
+            case JPEG_MARKER_APP0 + 2:
+            case JPEG_MARKER_APP0 + 3:
+            case JPEG_MARKER_APP0 + 4:
+            case JPEG_MARKER_APP0 + 5:
+            case JPEG_MARKER_APP0 + 6:
+            case JPEG_MARKER_APP0 + 7:
+            case JPEG_MARKER_APP0 + 8:
+            case JPEG_MARKER_APP0 + 9:
+            case JPEG_MARKER_APP0 + 10:
+            case JPEG_MARKER_APP0 + 11:
+            case JPEG_MARKER_APP0 + 12:
+            case JPEG_MARKER_APP0 + 13:
+            case JPEG_MARKER_APP0 + 14:
+            case JPEG_MARKER_APP0 + 15:
+                /* this type of marker has data, but it has no use to us (and no
+                 * place here) and should be skipped */
+                if (OJPEGReadWord(sp, &n) == 0)
+                    return (0);
+                if (n < 2)
+                {
+                    if (sp->subsamplingcorrect == 0)
+                        TIFFErrorExtR(tif, module, "Corrupt JPEG data");
+                    return (0);
+                }
+                if (n > 2)
+                    OJPEGReadSkip(sp, n - 2);
+                break;
+            case JPEG_MARKER_DRI:
+                if (OJPEGReadHeaderInfoSecStreamDri(tif) == 0)
+                    return (0);
+                break;
+            case JPEG_MARKER_DQT:
+                if (OJPEGReadHeaderInfoSecStreamDqt(tif) == 0)
+                    return (0);
+                break;
+            case JPEG_MARKER_DHT:
+                if (OJPEGReadHeaderInfoSecStreamDht(tif) == 0)
+                    return (0);
+                break;
+            case JPEG_MARKER_SOF0:
+            case JPEG_MARKER_SOF1:
+            case JPEG_MARKER_SOF3:
+                if (OJPEGReadHeaderInfoSecStreamSof(tif, m) == 0)
+                    return (0);
+                if (sp->subsamplingcorrect != 0)
+                    return (1);
+                break;
+            case JPEG_MARKER_SOS:
+                if (sp->subsamplingcorrect != 0)
+                    return (1);
+                assert(sp->plane_sample_offset == 0);
+                if (OJPEGReadHeaderInfoSecStreamSos(tif) == 0)
+                    return (0);
+                break;
+            default:
+                TIFFErrorExtR(tif, module,
+                              "Unknown marker type %" PRIu8 " in JPEG data", m);
+                return (0);
+        }
+    } while (m != JPEG_MARKER_SOS);
+    if (sp->subsamplingcorrect)
+        return (1);
+    if (sp->sof_log == 0)
+    {
+        if (OJPEGReadHeaderInfoSecTablesQTable(tif) == 0)
+            return (0);
+        sp->sof_marker_id = JPEG_MARKER_SOF0;
+        for (o = 0; o < sp->samples_per_pixel; o++)
+            sp->sof_c[o] = o;
+        sp->sof_hv[0] = ((sp->subsampling_hor << 4) | sp->subsampling_ver);
+        for (o = 1; o < sp->samples_per_pixel; o++)
+            sp->sof_hv[o] = 17;
+        sp->sof_x = sp->strile_width;
+        sp->sof_y = sp->strile_length_total;
+        sp->sof_log = 1;
+        if (OJPEGReadHeaderInfoSecTablesDcTable(tif) == 0)
+            return (0);
+        if (OJPEGReadHeaderInfoSecTablesAcTable(tif) == 0)
+            return (0);
+        for (o = 1; o < sp->samples_per_pixel; o++)
+            sp->sos_cs[o] = o;
+    }
+    return (1);
 }
 
-static int
-OJPEGReadHeaderInfoSec(TIFF* tif)
-{
-	static const char module[]="OJPEGReadHeaderInfoSec";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint8 m;
-	uint16 n;
-	uint8 o;
-	if (sp->file_size==0)
-		sp->file_size=TIFFGetFileSize(tif);
-	if (sp->jpeg_interchange_format!=0)
-	{
-		if (sp->jpeg_interchange_format>=sp->file_size)
-		{
-			sp->jpeg_interchange_format=0;
-			sp->jpeg_interchange_format_length=0;
-		}
-		else
-		{
-			if ((sp->jpeg_interchange_format_length==0) ||
-                            (sp->jpeg_interchange_format > TIFF_UINT64_MAX - sp->jpeg_interchange_format_length) ||
-                            (sp->jpeg_interchange_format+sp->jpeg_interchange_format_length>sp->file_size))
-				sp->jpeg_interchange_format_length=sp->file_size-sp->jpeg_interchange_format;
-		}
-	}
-	sp->in_buffer_source=osibsNotSetYet;
-	sp->in_buffer_next_strile=0;
-	sp->in_buffer_strile_count=tif->tif_dir.td_nstrips;
-	sp->in_buffer_file_togo=0;
-	sp->in_buffer_togo=0;
-	do
-	{
-		if (OJPEGReadBytePeek(sp,&m)==0)
-			return(0);
-		if (m!=255)
-			break;
-		OJPEGReadByteAdvance(sp);
-		do
-		{
-			if (OJPEGReadByte(sp,&m)==0)
-				return(0);
-		} while(m==255);
-		switch(m)
-		{
-			case JPEG_MARKER_SOI:
-				/* this type of marker has no data, and should be skipped */
-				break;
-			case JPEG_MARKER_COM:
-			case JPEG_MARKER_APP0:
-			case JPEG_MARKER_APP0+1:
-			case JPEG_MARKER_APP0+2:
-			case JPEG_MARKER_APP0+3:
-			case JPEG_MARKER_APP0+4:
-			case JPEG_MARKER_APP0+5:
-			case JPEG_MARKER_APP0+6:
-			case JPEG_MARKER_APP0+7:
-			case JPEG_MARKER_APP0+8:
-			case JPEG_MARKER_APP0+9:
-			case JPEG_MARKER_APP0+10:
-			case JPEG_MARKER_APP0+11:
-			case JPEG_MARKER_APP0+12:
-			case JPEG_MARKER_APP0+13:
-			case JPEG_MARKER_APP0+14:
-			case JPEG_MARKER_APP0+15:
-				/* this type of marker has data, but it has no use to us (and no place here) and should be skipped */
-				if (OJPEGReadWord(sp,&n)==0)
-					return(0);
-				if (n<2)
-				{
-					if (sp->subsamplingcorrect==0)
-						TIFFErrorExt(tif->tif_clientdata,module,"Corrupt JPEG data");
-					return(0);
-				}
-				if (n>2)
-					OJPEGReadSkip(sp,n-2);
-				break;
-			case JPEG_MARKER_DRI:
-				if (OJPEGReadHeaderInfoSecStreamDri(tif)==0)
-					return(0);
-				break;
-			case JPEG_MARKER_DQT:
-				if (OJPEGReadHeaderInfoSecStreamDqt(tif)==0)
-					return(0);
-				break;
-			case JPEG_MARKER_DHT:
-				if (OJPEGReadHeaderInfoSecStreamDht(tif)==0)
-					return(0);
-				break;
-			case JPEG_MARKER_SOF0:
-			case JPEG_MARKER_SOF1:
-			case JPEG_MARKER_SOF3:
-				if (OJPEGReadHeaderInfoSecStreamSof(tif,m)==0)
-					return(0);
-				if (sp->subsamplingcorrect!=0)
-					return(1);
-				break;
-			case JPEG_MARKER_SOS:
-				if (sp->subsamplingcorrect!=0)
-					return(1);
-				assert(sp->plane_sample_offset==0);
-				if (OJPEGReadHeaderInfoSecStreamSos(tif)==0)
-					return(0);
-				break;
-			default:
-				TIFFErrorExt(tif->tif_clientdata,module,"Unknown marker type %d in JPEG data",m);
-				return(0);
-		}
-	} while(m!=JPEG_MARKER_SOS);
-	if (sp->subsamplingcorrect)
-		return(1);
-	if (sp->sof_log==0)
-	{
-		if (OJPEGReadHeaderInfoSecTablesQTable(tif)==0)
-			return(0);
-		sp->sof_marker_id=JPEG_MARKER_SOF0;
-		for (o=0; o<sp->samples_per_pixel; o++)
-			sp->sof_c[o]=o;
-		sp->sof_hv[0]=((sp->subsampling_hor<<4)|sp->subsampling_ver);
-		for (o=1; o<sp->samples_per_pixel; o++)
-			sp->sof_hv[o]=17;
-		sp->sof_x=sp->strile_width;
-		sp->sof_y=sp->strile_length_total;
-		sp->sof_log=1;
-		if (OJPEGReadHeaderInfoSecTablesDcTable(tif)==0)
-			return(0);
-		if (OJPEGReadHeaderInfoSecTablesAcTable(tif)==0)
-			return(0);
-		for (o=1; o<sp->samples_per_pixel; o++)
-			sp->sos_cs[o]=o;
-	}
-	return(1);
+static int OJPEGReadHeaderInfoSecStreamDri(TIFF *tif)
+{
+    /* This could easily cause trouble in some cases... but no such cases have
+       occurred so far */
+    static const char module[] = "OJPEGReadHeaderInfoSecStreamDri";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint16_t m;
+    if (OJPEGReadWord(sp, &m) == 0)
+        return (0);
+    if (m != 4)
+    {
+        TIFFErrorExtR(tif, module, "Corrupt DRI marker in JPEG data");
+        return (0);
+    }
+    if (OJPEGReadWord(sp, &m) == 0)
+        return (0);
+    sp->restart_interval = m;
+    return (1);
 }
 
-static int
-OJPEGReadHeaderInfoSecStreamDri(TIFF* tif)
-{
-	/* This could easily cause trouble in some cases... but no such cases have
-           occurred so far */
-	static const char module[]="OJPEGReadHeaderInfoSecStreamDri";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint16 m;
-	if (OJPEGReadWord(sp,&m)==0)
-		return(0);
-	if (m!=4)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Corrupt DRI marker in JPEG data");
-		return(0);
-	}
-	if (OJPEGReadWord(sp,&m)==0)
-		return(0);
-	sp->restart_interval=m;
-	return(1);
+static int OJPEGReadHeaderInfoSecStreamDqt(TIFF *tif)
+{
+    /* this is a table marker, and it is to be saved as a whole for exact
+     * pushing on the jpeg stream later on */
+    static const char module[] = "OJPEGReadHeaderInfoSecStreamDqt";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint16_t m;
+    uint32_t na;
+    uint8_t *nb;
+    uint8_t o;
+    if (OJPEGReadWord(sp, &m) == 0)
+        return (0);
+    if (m <= 2)
+    {
+        if (sp->subsamplingcorrect == 0)
+            TIFFErrorExtR(tif, module, "Corrupt DQT marker in JPEG data");
+        return (0);
+    }
+    if (sp->subsamplingcorrect != 0)
+        OJPEGReadSkip(sp, m - 2);
+    else
+    {
+        m -= 2;
+        do
+        {
+            if (m < 65)
+            {
+                TIFFErrorExtR(tif, module, "Corrupt DQT marker in JPEG data");
+                return (0);
+            }
+            na = sizeof(uint32_t) + 69;
+            nb = _TIFFmallocExt(tif, na);
+            if (nb == 0)
+            {
+                TIFFErrorExtR(tif, module, "Out of memory");
+                return (0);
+            }
+            *(uint32_t *)nb = na;
+            nb[sizeof(uint32_t)] = 255;
+            nb[sizeof(uint32_t) + 1] = JPEG_MARKER_DQT;
+            nb[sizeof(uint32_t) + 2] = 0;
+            nb[sizeof(uint32_t) + 3] = 67;
+            if (OJPEGReadBlock(sp, 65, &nb[sizeof(uint32_t) + 4]) == 0)
+            {
+                _TIFFfreeExt(tif, nb);
+                return (0);
+            }
+            o = nb[sizeof(uint32_t) + 4] & 15;
+            if (3 < o)
+            {
+                TIFFErrorExtR(tif, module, "Corrupt DQT marker in JPEG data");
+                _TIFFfreeExt(tif, nb);
+                return (0);
+            }
+            if (sp->qtable[o] != 0)
+                _TIFFfreeExt(tif, sp->qtable[o]);
+            sp->qtable[o] = nb;
+            m -= 65;
+        } while (m > 0);
+    }
+    return (1);
 }
 
-static int
-OJPEGReadHeaderInfoSecStreamDqt(TIFF* tif)
-{
-	/* this is a table marker, and it is to be saved as a whole for exact pushing on the jpeg stream later on */
-	static const char module[]="OJPEGReadHeaderInfoSecStreamDqt";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint16 m;
-	uint32 na;
-	uint8* nb;
-	uint8 o;
-	if (OJPEGReadWord(sp,&m)==0)
-		return(0);
-	if (m<=2)
-	{
-		if (sp->subsamplingcorrect==0)
-			TIFFErrorExt(tif->tif_clientdata,module,"Corrupt DQT marker in JPEG data");
-		return(0);
-	}
-	if (sp->subsamplingcorrect!=0)
-		OJPEGReadSkip(sp,m-2);
-	else
-	{
-		m-=2;
-		do
-		{
-			if (m<65)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Corrupt DQT marker in JPEG data");
-				return(0);
-			}
-			na=sizeof(uint32)+69;
-			nb=_TIFFmalloc(na);
-			if (nb==0)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-				return(0);
-			}
-			*(uint32*)nb=na;
-			nb[sizeof(uint32)]=255;
-			nb[sizeof(uint32)+1]=JPEG_MARKER_DQT;
-			nb[sizeof(uint32)+2]=0;
-			nb[sizeof(uint32)+3]=67;
-			if (OJPEGReadBlock(sp,65,&nb[sizeof(uint32)+4])==0) {
-				_TIFFfree(nb);
-				return(0);
-			}
-			o=nb[sizeof(uint32)+4]&15;
-			if (3<o)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Corrupt DQT marker in JPEG data");
-				_TIFFfree(nb);
-				return(0);
-			}
-			if (sp->qtable[o]!=0)
-				_TIFFfree(sp->qtable[o]);
-			sp->qtable[o]=nb;
-			m-=65;
-		} while(m>0);
-	}
-	return(1);
+static int OJPEGReadHeaderInfoSecStreamDht(TIFF *tif)
+{
+    /* this is a table marker, and it is to be saved as a whole for exact
+     * pushing on the jpeg stream later on */
+    /* TODO: the following assumes there is only one table in this marker... but
+     * i'm not quite sure that assumption is guaranteed correct */
+    static const char module[] = "OJPEGReadHeaderInfoSecStreamDht";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint16_t m;
+    uint32_t na;
+    uint8_t *nb;
+    uint8_t o;
+    if (OJPEGReadWord(sp, &m) == 0)
+        return (0);
+    if (m <= 2)
+    {
+        if (sp->subsamplingcorrect == 0)
+            TIFFErrorExtR(tif, module, "Corrupt DHT marker in JPEG data");
+        return (0);
+    }
+    if (sp->subsamplingcorrect != 0)
+    {
+        OJPEGReadSkip(sp, m - 2);
+    }
+    else
+    {
+        na = sizeof(uint32_t) + 2 + m;
+        nb = _TIFFmallocExt(tif, na);
+        if (nb == 0)
+        {
+            TIFFErrorExtR(tif, module, "Out of memory");
+            return (0);
+        }
+        *(uint32_t *)nb = na;
+        nb[sizeof(uint32_t)] = 255;
+        nb[sizeof(uint32_t) + 1] = JPEG_MARKER_DHT;
+        nb[sizeof(uint32_t) + 2] = (m >> 8);
+        nb[sizeof(uint32_t) + 3] = (m & 255);
+        if (OJPEGReadBlock(sp, m - 2, &nb[sizeof(uint32_t) + 4]) == 0)
+        {
+            _TIFFfreeExt(tif, nb);
+            return (0);
+        }
+        o = nb[sizeof(uint32_t) + 4];
+        if ((o & 240) == 0)
+        {
+            if (3 < o)
+            {
+                TIFFErrorExtR(tif, module, "Corrupt DHT marker in JPEG data");
+                _TIFFfreeExt(tif, nb);
+                return (0);
+            }
+            if (sp->dctable[o] != 0)
+                _TIFFfreeExt(tif, sp->dctable[o]);
+            sp->dctable[o] = nb;
+        }
+        else
+        {
+            if ((o & 240) != 16)
+            {
+                TIFFErrorExtR(tif, module, "Corrupt DHT marker in JPEG data");
+                _TIFFfreeExt(tif, nb);
+                return (0);
+            }
+            o &= 15;
+            if (3 < o)
+            {
+                TIFFErrorExtR(tif, module, "Corrupt DHT marker in JPEG data");
+                _TIFFfreeExt(tif, nb);
+                return (0);
+            }
+            if (sp->actable[o] != 0)
+                _TIFFfreeExt(tif, sp->actable[o]);
+            sp->actable[o] = nb;
+        }
+    }
+    return (1);
 }
 
-static int
-OJPEGReadHeaderInfoSecStreamDht(TIFF* tif)
-{
-	/* this is a table marker, and it is to be saved as a whole for exact pushing on the jpeg stream later on */
-	/* TODO: the following assumes there is only one table in this marker... but i'm not quite sure that assumption is guaranteed correct */
-	static const char module[]="OJPEGReadHeaderInfoSecStreamDht";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint16 m;
-	uint32 na;
-	uint8* nb;
-	uint8 o;
-	if (OJPEGReadWord(sp,&m)==0)
-		return(0);
-	if (m<=2)
-	{
-		if (sp->subsamplingcorrect==0)
-			TIFFErrorExt(tif->tif_clientdata,module,"Corrupt DHT marker in JPEG data");
-		return(0);
-	}
-	if (sp->subsamplingcorrect!=0)
-	{
-		OJPEGReadSkip(sp,m-2);
-	}
-	else
-	{
-		na=sizeof(uint32)+2+m;
-		nb=_TIFFmalloc(na);
-		if (nb==0)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-			return(0);
-		}
-		*(uint32*)nb=na;
-		nb[sizeof(uint32)]=255;
-		nb[sizeof(uint32)+1]=JPEG_MARKER_DHT;
-		nb[sizeof(uint32)+2]=(m>>8);
-		nb[sizeof(uint32)+3]=(m&255);
-		if (OJPEGReadBlock(sp,m-2,&nb[sizeof(uint32)+4])==0) {
-                        _TIFFfree(nb);
-			return(0);
+static int OJPEGReadHeaderInfoSecStreamSof(TIFF *tif, uint8_t marker_id)
+{
+    /* this marker needs to be checked, and part of its data needs to be saved
+     * for regeneration later on */
+    static const char module[] = "OJPEGReadHeaderInfoSecStreamSof";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint16_t m;
+    uint16_t n;
+    uint8_t o;
+    uint16_t p;
+    uint16_t q;
+    if (sp->sof_log != 0)
+    {
+        TIFFErrorExtR(tif, module, "Corrupt JPEG data");
+        return (0);
+    }
+    if (sp->subsamplingcorrect == 0)
+        sp->sof_marker_id = marker_id;
+    /* Lf: data length */
+    if (OJPEGReadWord(sp, &m) == 0)
+        return (0);
+    if (m < 11)
+    {
+        if (sp->subsamplingcorrect == 0)
+            TIFFErrorExtR(tif, module, "Corrupt SOF marker in JPEG data");
+        return (0);
+    }
+    m -= 8;
+    if (m % 3 != 0)
+    {
+        if (sp->subsamplingcorrect == 0)
+            TIFFErrorExtR(tif, module, "Corrupt SOF marker in JPEG data");
+        return (0);
+    }
+    n = m / 3;
+    if (sp->subsamplingcorrect == 0)
+    {
+        if (n != sp->samples_per_pixel)
+        {
+            TIFFErrorExtR(
+                tif, module,
+                "JPEG compressed data indicates unexpected number of samples");
+            return (0);
+        }
+    }
+    /* P: Sample precision */
+    if (OJPEGReadByte(sp, &o) == 0)
+        return (0);
+    if (o != 8)
+    {
+        if (sp->subsamplingcorrect == 0)
+            TIFFErrorExtR(tif, module,
+                          "JPEG compressed data indicates unexpected number of "
+                          "bits per sample");
+        return (0);
+    }
+    /* Y: Number of lines, X: Number of samples per line */
+    if (sp->subsamplingcorrect)
+        OJPEGReadSkip(sp, 4);
+    else
+    {
+        /* Y: Number of lines */
+        if (OJPEGReadWord(sp, &p) == 0)
+            return (0);
+        if (((uint32_t)p < sp->image_length) &&
+            ((uint32_t)p < sp->strile_length_total))
+        {
+            TIFFErrorExtR(tif, module,
+                          "JPEG compressed data indicates unexpected height");
+            return (0);
+        }
+        sp->sof_y = p;
+        /* X: Number of samples per line */
+        if (OJPEGReadWord(sp, &p) == 0)
+            return (0);
+        if (((uint32_t)p < sp->image_width) && ((uint32_t)p < sp->strile_width))
+        {
+            TIFFErrorExtR(tif, module,
+                          "JPEG compressed data indicates unexpected width");
+            return (0);
+        }
+        if ((uint32_t)p > sp->strile_width)
+        {
+            TIFFErrorExtR(tif, module,
+                          "JPEG compressed data image width exceeds expected "
+                          "image width");
+            return (0);
+        }
+        sp->sof_x = p;
+    }
+    /* Nf: Number of image components in frame */
+    if (OJPEGReadByte(sp, &o) == 0)
+        return (0);
+    if (o != n)
+    {
+        if (sp->subsamplingcorrect == 0)
+            TIFFErrorExtR(tif, module, "Corrupt SOF marker in JPEG data");
+        return (0);
+    }
+    /* per component stuff */
+    /* TODO: double-check that flow implies that n cannot be as big as to make
+     * us overflow sof_c, sof_hv and sof_tq arrays */
+    for (q = 0; q < n; q++)
+    {
+        /* C: Component identifier */
+        if (OJPEGReadByte(sp, &o) == 0)
+            return (0);
+        if (sp->subsamplingcorrect == 0)
+            sp->sof_c[q] = o;
+        /* H: Horizontal sampling factor, and V: Vertical sampling factor */
+        if (OJPEGReadByte(sp, &o) == 0)
+            return (0);
+        if (sp->subsamplingcorrect != 0)
+        {
+            if (q == 0)
+            {
+                sp->subsampling_hor = (o >> 4);
+                sp->subsampling_ver = (o & 15);
+                if (((sp->subsampling_hor != 1) && (sp->subsampling_hor != 2) &&
+                     (sp->subsampling_hor != 4)) ||
+                    ((sp->subsampling_ver != 1) && (sp->subsampling_ver != 2) &&
+                     (sp->subsampling_ver != 4)))
+                    sp->subsampling_force_desubsampling_inside_decompression =
+                        1;
+            }
+            else
+            {
+                if (o != 17)
+                    sp->subsampling_force_desubsampling_inside_decompression =
+                        1;
+            }
+        }
+        else
+        {
+            sp->sof_hv[q] = o;
+            if (sp->subsampling_force_desubsampling_inside_decompression == 0)
+            {
+                if (q == 0)
+                {
+                    if (o != ((sp->subsampling_hor << 4) | sp->subsampling_ver))
+                    {
+                        TIFFErrorExtR(tif, module,
+                                      "JPEG compressed data indicates "
+                                      "unexpected subsampling values");
+                        return (0);
+                    }
+                }
+                else
+                {
+                    if (o != 17)
+                    {
+                        TIFFErrorExtR(tif, module,
+                                      "JPEG compressed data indicates "
+                                      "unexpected subsampling values");
+                        return (0);
+                    }
                 }
-		o=nb[sizeof(uint32)+4];
-		if ((o&240)==0)
-		{
-			if (3<o)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Corrupt DHT marker in JPEG data");
-                                _TIFFfree(nb);
-				return(0);
-			}
-			if (sp->dctable[o]!=0)
-				_TIFFfree(sp->dctable[o]);
-			sp->dctable[o]=nb;
-		}
-		else
-		{
-			if ((o&240)!=16)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Corrupt DHT marker in JPEG data");
-                                _TIFFfree(nb);
-				return(0);
-			}
-			o&=15;
-			if (3<o)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Corrupt DHT marker in JPEG data");
-                                _TIFFfree(nb);
-				return(0);
-			}
-			if (sp->actable[o]!=0)
-				_TIFFfree(sp->actable[o]);
-			sp->actable[o]=nb;
-		}
-	}
-	return(1);
+            }
+        }
+        /* Tq: Quantization table destination selector */
+        if (OJPEGReadByte(sp, &o) == 0)
+            return (0);
+        if (sp->subsamplingcorrect == 0)
+            sp->sof_tq[q] = o;
+    }
+    if (sp->subsamplingcorrect == 0)
+        sp->sof_log = 1;
+    return (1);
 }
 
-static int
-OJPEGReadHeaderInfoSecStreamSof(TIFF* tif, uint8 marker_id)
-{
-	/* this marker needs to be checked, and part of its data needs to be saved for regeneration later on */
-	static const char module[]="OJPEGReadHeaderInfoSecStreamSof";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint16 m;
-	uint16 n;
-	uint8 o;
-	uint16 p;
-	uint16 q;
-	if (sp->sof_log!=0)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Corrupt JPEG data");
-		return(0);
-	}
-	if (sp->subsamplingcorrect==0)
-		sp->sof_marker_id=marker_id;
-	/* Lf: data length */
-	if (OJPEGReadWord(sp,&m)==0)
-		return(0);
-	if (m<11)
-	{
-		if (sp->subsamplingcorrect==0)
-			TIFFErrorExt(tif->tif_clientdata,module,"Corrupt SOF marker in JPEG data");
-		return(0);
-	}
-	m-=8;
-	if (m%3!=0)
-	{
-		if (sp->subsamplingcorrect==0)
-			TIFFErrorExt(tif->tif_clientdata,module,"Corrupt SOF marker in JPEG data");
-		return(0);
-	}
-	n=m/3;
-	if (sp->subsamplingcorrect==0)
-	{
-		if (n!=sp->samples_per_pixel)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"JPEG compressed data indicates unexpected number of samples");
-			return(0);
-		}
-	}
-	/* P: Sample precision */
-	if (OJPEGReadByte(sp,&o)==0)
-		return(0);
-	if (o!=8)
-	{
-		if (sp->subsamplingcorrect==0)
-			TIFFErrorExt(tif->tif_clientdata,module,"JPEG compressed data indicates unexpected number of bits per sample");
-		return(0);
-	}
-	/* Y: Number of lines, X: Number of samples per line */
-	if (sp->subsamplingcorrect)
-		OJPEGReadSkip(sp,4);
-	else
-	{
-		/* Y: Number of lines */
-		if (OJPEGReadWord(sp,&p)==0)
-			return(0);
-		if (((uint32)p<sp->image_length) && ((uint32)p<sp->strile_length_total))
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"JPEG compressed data indicates unexpected height");
-			return(0);
-		}
-		sp->sof_y=p;
-		/* X: Number of samples per line */
-		if (OJPEGReadWord(sp,&p)==0)
-			return(0);
-		if (((uint32)p<sp->image_width) && ((uint32)p<sp->strile_width))
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"JPEG compressed data indicates unexpected width");
-			return(0);
-		}
-		if ((uint32)p>sp->strile_width)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"JPEG compressed data image width exceeds expected image width");
-			return(0);
-		}
-		sp->sof_x=p;
-	}
-	/* Nf: Number of image components in frame */
-	if (OJPEGReadByte(sp,&o)==0)
-		return(0);
-	if (o!=n)
-	{
-		if (sp->subsamplingcorrect==0)
-			TIFFErrorExt(tif->tif_clientdata,module,"Corrupt SOF marker in JPEG data");
-		return(0);
-	}
-	/* per component stuff */
-	/* TODO: double-check that flow implies that n cannot be as big as to make us overflow sof_c, sof_hv and sof_tq arrays */
-	for (q=0; q<n; q++)
-	{
-		/* C: Component identifier */
-		if (OJPEGReadByte(sp,&o)==0)
-			return(0);
-		if (sp->subsamplingcorrect==0)
-			sp->sof_c[q]=o;
-		/* H: Horizontal sampling factor, and V: Vertical sampling factor */
-		if (OJPEGReadByte(sp,&o)==0)
-			return(0);
-		if (sp->subsamplingcorrect!=0)
-		{
-			if (q==0)
-			{
-				sp->subsampling_hor=(o>>4);
-				sp->subsampling_ver=(o&15);
-				if (((sp->subsampling_hor!=1) && (sp->subsampling_hor!=2) && (sp->subsampling_hor!=4)) ||
-					((sp->subsampling_ver!=1) && (sp->subsampling_ver!=2) && (sp->subsampling_ver!=4)))
-					sp->subsampling_force_desubsampling_inside_decompression=1;
-			}
-			else
-			{
-				if (o!=17)
-					sp->subsampling_force_desubsampling_inside_decompression=1;
-			}
-		}
-		else
-		{
-			sp->sof_hv[q]=o;
-			if (sp->subsampling_force_desubsampling_inside_decompression==0)
-			{
-				if (q==0)
-				{
-					if (o!=((sp->subsampling_hor<<4)|sp->subsampling_ver))
-					{
-						TIFFErrorExt(tif->tif_clientdata,module,"JPEG compressed data indicates unexpected subsampling values");
-						return(0);
-					}
-				}
-				else
-				{
-					if (o!=17)
-					{
-						TIFFErrorExt(tif->tif_clientdata,module,"JPEG compressed data indicates unexpected subsampling values");
-						return(0);
-					}
-				}
-			}
-		}
-		/* Tq: Quantization table destination selector */
-		if (OJPEGReadByte(sp,&o)==0)
-			return(0);
-		if (sp->subsamplingcorrect==0)
-			sp->sof_tq[q]=o;
-	}
-	if (sp->subsamplingcorrect==0)
-		sp->sof_log=1;
-	return(1);
+static int OJPEGReadHeaderInfoSecStreamSos(TIFF *tif)
+{
+    /* this marker needs to be checked, and part of its data needs to be saved
+     * for regeneration later on */
+    static const char module[] = "OJPEGReadHeaderInfoSecStreamSos";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint16_t m;
+    uint8_t n;
+    uint8_t o;
+    assert(sp->subsamplingcorrect == 0);
+    if (sp->sof_log == 0)
+    {
+        TIFFErrorExtR(tif, module, "Corrupt SOS marker in JPEG data");
+        return (0);
+    }
+    /* Ls */
+    if (OJPEGReadWord(sp, &m) == 0)
+        return (0);
+    if (m != 6 + sp->samples_per_pixel_per_plane * 2)
+    {
+        TIFFErrorExtR(tif, module, "Corrupt SOS marker in JPEG data");
+        return (0);
+    }
+    /* Ns */
+    if (OJPEGReadByte(sp, &n) == 0)
+        return (0);
+    if (n != sp->samples_per_pixel_per_plane)
+    {
+        TIFFErrorExtR(tif, module, "Corrupt SOS marker in JPEG data");
+        return (0);
+    }
+    /* Cs, Td, and Ta */
+    for (o = 0; o < sp->samples_per_pixel_per_plane; o++)
+    {
+        /* Cs */
+        if (OJPEGReadByte(sp, &n) == 0)
+            return (0);
+        sp->sos_cs[sp->plane_sample_offset + o] = n;
+        /* Td and Ta */
+        if (OJPEGReadByte(sp, &n) == 0)
+            return (0);
+        sp->sos_tda[sp->plane_sample_offset + o] = n;
+    }
+    /* skip Ss, Se, Ah, en Al -> no check, as per Tom Lane recommendation, as
+     * per LibJpeg source */
+    OJPEGReadSkip(sp, 3);
+    return (1);
 }
 
-static int
-OJPEGReadHeaderInfoSecStreamSos(TIFF* tif)
-{
-	/* this marker needs to be checked, and part of its data needs to be saved for regeneration later on */
-	static const char module[]="OJPEGReadHeaderInfoSecStreamSos";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint16 m;
-	uint8 n;
-	uint8 o;
-	assert(sp->subsamplingcorrect==0);
-	if (sp->sof_log==0)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Corrupt SOS marker in JPEG data");
-		return(0);
-	}
-	/* Ls */
-	if (OJPEGReadWord(sp,&m)==0)
-		return(0);
-	if (m!=6+sp->samples_per_pixel_per_plane*2)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Corrupt SOS marker in JPEG data");
-		return(0);
-	}
-	/* Ns */
-	if (OJPEGReadByte(sp,&n)==0)
-		return(0);
-	if (n!=sp->samples_per_pixel_per_plane)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Corrupt SOS marker in JPEG data");
-		return(0);
-	}
-	/* Cs, Td, and Ta */
-	for (o=0; o<sp->samples_per_pixel_per_plane; o++)
-	{
-		/* Cs */
-		if (OJPEGReadByte(sp,&n)==0)
-			return(0);
-		sp->sos_cs[sp->plane_sample_offset+o]=n;
-		/* Td and Ta */
-		if (OJPEGReadByte(sp,&n)==0)
-			return(0);
-		sp->sos_tda[sp->plane_sample_offset+o]=n;
-	}
-	/* skip Ss, Se, Ah, en Al -> no check, as per Tom Lane recommendation, as per LibJpeg source */
-	OJPEGReadSkip(sp,3);
-	return(1);
+static int OJPEGReadHeaderInfoSecTablesQTable(TIFF *tif)
+{
+    static const char module[] = "OJPEGReadHeaderInfoSecTablesQTable";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint8_t m;
+    uint8_t n;
+    uint32_t oa;
+    uint8_t *ob;
+    uint32_t p;
+    if (sp->qtable_offset[0] == 0)
+    {
+        TIFFErrorExtR(tif, module, "Missing JPEG tables");
+        return (0);
+    }
+    sp->in_buffer_file_pos_log = 0;
+    for (m = 0; m < sp->samples_per_pixel; m++)
+    {
+        if ((sp->qtable_offset[m] != 0) &&
+            ((m == 0) || (sp->qtable_offset[m] != sp->qtable_offset[m - 1])))
+        {
+            for (n = 0; n < m - 1; n++)
+            {
+                if (sp->qtable_offset[m] == sp->qtable_offset[n])
+                {
+                    TIFFErrorExtR(tif, module, "Corrupt JpegQTables tag value");
+                    return (0);
+                }
+            }
+            oa = sizeof(uint32_t) + 69;
+            ob = _TIFFmallocExt(tif, oa);
+            if (ob == 0)
+            {
+                TIFFErrorExtR(tif, module, "Out of memory");
+                return (0);
+            }
+            *(uint32_t *)ob = oa;
+            ob[sizeof(uint32_t)] = 255;
+            ob[sizeof(uint32_t) + 1] = JPEG_MARKER_DQT;
+            ob[sizeof(uint32_t) + 2] = 0;
+            ob[sizeof(uint32_t) + 3] = 67;
+            ob[sizeof(uint32_t) + 4] = m;
+            TIFFSeekFile(tif, sp->qtable_offset[m], SEEK_SET);
+            p = (uint32_t)TIFFReadFile(tif, &ob[sizeof(uint32_t) + 5], 64);
+            if (p != 64)
+            {
+                _TIFFfreeExt(tif, ob);
+                return (0);
+            }
+            if (sp->qtable[m] != 0)
+                _TIFFfreeExt(tif, sp->qtable[m]);
+            sp->qtable[m] = ob;
+            sp->sof_tq[m] = m;
+        }
+        else
+            sp->sof_tq[m] = sp->sof_tq[m - 1];
+    }
+    return (1);
 }
 
-static int
-OJPEGReadHeaderInfoSecTablesQTable(TIFF* tif)
-{
-	static const char module[]="OJPEGReadHeaderInfoSecTablesQTable";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint8 m;
-	uint8 n;
-	uint32 oa;
-	uint8* ob;
-	uint32 p;
-	if (sp->qtable_offset[0]==0)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Missing JPEG tables");
-		return(0);
-	}
-	sp->in_buffer_file_pos_log=0;
-	for (m=0; m<sp->samples_per_pixel; m++)
-	{
-		if ((sp->qtable_offset[m]!=0) && ((m==0) || (sp->qtable_offset[m]!=sp->qtable_offset[m-1])))
-		{
-			for (n=0; n<m-1; n++)
-			{
-				if (sp->qtable_offset[m]==sp->qtable_offset[n])
-				{
-					TIFFErrorExt(tif->tif_clientdata,module,"Corrupt JpegQTables tag value");
-					return(0);
-				}
-			}
-			oa=sizeof(uint32)+69;
-			ob=_TIFFmalloc(oa);
-			if (ob==0)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-				return(0);
-			}
-			*(uint32*)ob=oa;
-			ob[sizeof(uint32)]=255;
-			ob[sizeof(uint32)+1]=JPEG_MARKER_DQT;
-			ob[sizeof(uint32)+2]=0;
-			ob[sizeof(uint32)+3]=67;
-			ob[sizeof(uint32)+4]=m;
-			TIFFSeekFile(tif,sp->qtable_offset[m],SEEK_SET); 
-			p=(uint32)TIFFReadFile(tif,&ob[sizeof(uint32)+5],64);
-			if (p!=64)
-                        {
-                                _TIFFfree(ob);
-				return(0);
-                        }
-			if (sp->qtable[m]!=0)
-				_TIFFfree(sp->qtable[m]);
-			sp->qtable[m]=ob;
-			sp->sof_tq[m]=m;
-		}
-		else
-			sp->sof_tq[m]=sp->sof_tq[m-1];
-	}
-	return(1);
+static int OJPEGReadHeaderInfoSecTablesDcTable(TIFF *tif)
+{
+    static const char module[] = "OJPEGReadHeaderInfoSecTablesDcTable";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint8_t m;
+    uint8_t n;
+    uint8_t o[16];
+    uint32_t p;
+    uint32_t q;
+    uint32_t ra;
+    uint8_t *rb;
+    if (sp->dctable_offset[0] == 0)
+    {
+        TIFFErrorExtR(tif, module, "Missing JPEG tables");
+        return (0);
+    }
+    sp->in_buffer_file_pos_log = 0;
+    for (m = 0; m < sp->samples_per_pixel; m++)
+    {
+        if ((sp->dctable_offset[m] != 0) &&
+            ((m == 0) || (sp->dctable_offset[m] != sp->dctable_offset[m - 1])))
+        {
+            for (n = 0; n < m - 1; n++)
+            {
+                if (sp->dctable_offset[m] == sp->dctable_offset[n])
+                {
+                    TIFFErrorExtR(tif, module,
+                                  "Corrupt JpegDcTables tag value");
+                    return (0);
+                }
+            }
+            TIFFSeekFile(tif, sp->dctable_offset[m], SEEK_SET);
+            p = (uint32_t)TIFFReadFile(tif, o, 16);
+            if (p != 16)
+                return (0);
+            q = 0;
+            for (n = 0; n < 16; n++)
+                q += o[n];
+            ra = sizeof(uint32_t) + 21 + q;
+            rb = _TIFFmallocExt(tif, ra);
+            if (rb == 0)
+            {
+                TIFFErrorExtR(tif, module, "Out of memory");
+                return (0);
+            }
+            *(uint32_t *)rb = ra;
+            rb[sizeof(uint32_t)] = 255;
+            rb[sizeof(uint32_t) + 1] = JPEG_MARKER_DHT;
+            rb[sizeof(uint32_t) + 2] = (uint8_t)((19 + q) >> 8);
+            rb[sizeof(uint32_t) + 3] = ((19 + q) & 255);
+            rb[sizeof(uint32_t) + 4] = m;
+            for (n = 0; n < 16; n++)
+                rb[sizeof(uint32_t) + 5 + n] = o[n];
+            p = (uint32_t)TIFFReadFile(tif, &(rb[sizeof(uint32_t) + 21]), q);
+            if (p != q)
+            {
+                _TIFFfreeExt(tif, rb);
+                return (0);
+            }
+            if (sp->dctable[m] != 0)
+                _TIFFfreeExt(tif, sp->dctable[m]);
+            sp->dctable[m] = rb;
+            sp->sos_tda[m] = (m << 4);
+        }
+        else
+            sp->sos_tda[m] = sp->sos_tda[m - 1];
+    }
+    return (1);
 }
 
-static int
-OJPEGReadHeaderInfoSecTablesDcTable(TIFF* tif)
-{
-	static const char module[]="OJPEGReadHeaderInfoSecTablesDcTable";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint8 m;
-	uint8 n;
-	uint8 o[16];
-	uint32 p;
-	uint32 q;
-	uint32 ra;
-	uint8* rb;
-	if (sp->dctable_offset[0]==0)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Missing JPEG tables");
-		return(0);
-	}
-	sp->in_buffer_file_pos_log=0;
-	for (m=0; m<sp->samples_per_pixel; m++)
-	{
-		if ((sp->dctable_offset[m]!=0) && ((m==0) || (sp->dctable_offset[m]!=sp->dctable_offset[m-1])))
-		{
-			for (n=0; n<m-1; n++)
-			{
-				if (sp->dctable_offset[m]==sp->dctable_offset[n])
-				{
-					TIFFErrorExt(tif->tif_clientdata,module,"Corrupt JpegDcTables tag value");
-					return(0);
-				}
-			}
-			TIFFSeekFile(tif,sp->dctable_offset[m],SEEK_SET);
-			p=(uint32)TIFFReadFile(tif,o,16);
-			if (p!=16)
-				return(0);
-			q=0;
-			for (n=0; n<16; n++)
-				q+=o[n];
-			ra=sizeof(uint32)+21+q;
-			rb=_TIFFmalloc(ra);
-			if (rb==0)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-				return(0);
-			}
-			*(uint32*)rb=ra;
-			rb[sizeof(uint32)]=255;
-			rb[sizeof(uint32)+1]=JPEG_MARKER_DHT;
-			rb[sizeof(uint32)+2]=(uint8)((19+q)>>8);
-			rb[sizeof(uint32)+3]=((19+q)&255);
-			rb[sizeof(uint32)+4]=m;
-			for (n=0; n<16; n++)
-				rb[sizeof(uint32)+5+n]=o[n];
-			p=(uint32)TIFFReadFile(tif,&(rb[sizeof(uint32)+21]),q);
-			if (p!=q)
-                        {
-                                _TIFFfree(rb);
-				return(0);
-                        }
-			if (sp->dctable[m]!=0)
-				_TIFFfree(sp->dctable[m]);
-			sp->dctable[m]=rb;
-			sp->sos_tda[m]=(m<<4);
-		}
-		else
-			sp->sos_tda[m]=sp->sos_tda[m-1];
-	}
-	return(1);
+static int OJPEGReadHeaderInfoSecTablesAcTable(TIFF *tif)
+{
+    static const char module[] = "OJPEGReadHeaderInfoSecTablesAcTable";
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint8_t m;
+    uint8_t n;
+    uint8_t o[16];
+    uint32_t p;
+    uint32_t q;
+    uint32_t ra;
+    uint8_t *rb;
+    if (sp->actable_offset[0] == 0)
+    {
+        TIFFErrorExtR(tif, module, "Missing JPEG tables");
+        return (0);
+    }
+    sp->in_buffer_file_pos_log = 0;
+    for (m = 0; m < sp->samples_per_pixel; m++)
+    {
+        if ((sp->actable_offset[m] != 0) &&
+            ((m == 0) || (sp->actable_offset[m] != sp->actable_offset[m - 1])))
+        {
+            for (n = 0; n < m - 1; n++)
+            {
+                if (sp->actable_offset[m] == sp->actable_offset[n])
+                {
+                    TIFFErrorExtR(tif, module,
+                                  "Corrupt JpegAcTables tag value");
+                    return (0);
+                }
+            }
+            TIFFSeekFile(tif, sp->actable_offset[m], SEEK_SET);
+            p = (uint32_t)TIFFReadFile(tif, o, 16);
+            if (p != 16)
+                return (0);
+            q = 0;
+            for (n = 0; n < 16; n++)
+                q += o[n];
+            ra = sizeof(uint32_t) + 21 + q;
+            rb = _TIFFmallocExt(tif, ra);
+            if (rb == 0)
+            {
+                TIFFErrorExtR(tif, module, "Out of memory");
+                return (0);
+            }
+            *(uint32_t *)rb = ra;
+            rb[sizeof(uint32_t)] = 255;
+            rb[sizeof(uint32_t) + 1] = JPEG_MARKER_DHT;
+            rb[sizeof(uint32_t) + 2] = (uint8_t)((19 + q) >> 8);
+            rb[sizeof(uint32_t) + 3] = ((19 + q) & 255);
+            rb[sizeof(uint32_t) + 4] = (16 | m);
+            for (n = 0; n < 16; n++)
+                rb[sizeof(uint32_t) + 5 + n] = o[n];
+            p = (uint32_t)TIFFReadFile(tif, &(rb[sizeof(uint32_t) + 21]), q);
+            if (p != q)
+            {
+                _TIFFfreeExt(tif, rb);
+                return (0);
+            }
+            if (sp->actable[m] != 0)
+                _TIFFfreeExt(tif, sp->actable[m]);
+            sp->actable[m] = rb;
+            sp->sos_tda[m] = (sp->sos_tda[m] | m);
+        }
+        else
+            sp->sos_tda[m] = (sp->sos_tda[m] | (sp->sos_tda[m - 1] & 15));
+    }
+    return (1);
 }
 
-static int
-OJPEGReadHeaderInfoSecTablesAcTable(TIFF* tif)
-{
-	static const char module[]="OJPEGReadHeaderInfoSecTablesAcTable";
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint8 m;
-	uint8 n;
-	uint8 o[16];
-	uint32 p;
-	uint32 q;
-	uint32 ra;
-	uint8* rb;
-	if (sp->actable_offset[0]==0)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,"Missing JPEG tables");
-		return(0);
-	}
-	sp->in_buffer_file_pos_log=0;
-	for (m=0; m<sp->samples_per_pixel; m++)
-	{
-		if ((sp->actable_offset[m]!=0) && ((m==0) || (sp->actable_offset[m]!=sp->actable_offset[m-1])))
-		{
-			for (n=0; n<m-1; n++)
-			{
-				if (sp->actable_offset[m]==sp->actable_offset[n])
-				{
-					TIFFErrorExt(tif->tif_clientdata,module,"Corrupt JpegAcTables tag value");
-					return(0);
-				}
-			}
-			TIFFSeekFile(tif,sp->actable_offset[m],SEEK_SET);  
-			p=(uint32)TIFFReadFile(tif,o,16);
-			if (p!=16)
-				return(0);
-			q=0;
-			for (n=0; n<16; n++)
-				q+=o[n];
-			ra=sizeof(uint32)+21+q;
-			rb=_TIFFmalloc(ra);
-			if (rb==0)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Out of memory");
-				return(0);
-			}
-			*(uint32*)rb=ra;
-			rb[sizeof(uint32)]=255;
-			rb[sizeof(uint32)+1]=JPEG_MARKER_DHT;
-			rb[sizeof(uint32)+2]=(uint8)((19+q)>>8);
-			rb[sizeof(uint32)+3]=((19+q)&255);
-			rb[sizeof(uint32)+4]=(16|m);
-			for (n=0; n<16; n++)
-				rb[sizeof(uint32)+5+n]=o[n];
-			p=(uint32)TIFFReadFile(tif,&(rb[sizeof(uint32)+21]),q);
-			if (p!=q)
+static int OJPEGReadBufferFill(OJPEGState *sp)
+{
+    uint16_t m;
+    tmsize_t n;
+    /* TODO: double-check: when subsamplingcorrect is set, no call to
+     * TIFFErrorExt or TIFFWarningExt should be made in any other case, seek or
+     * read errors should be passed through */
+    do
+    {
+        if (sp->in_buffer_file_togo != 0)
+        {
+            if (sp->in_buffer_file_pos_log == 0)
+            {
+                TIFFSeekFile(sp->tif, sp->in_buffer_file_pos, SEEK_SET);
+                sp->in_buffer_file_pos_log = 1;
+            }
+            m = OJPEG_BUFFER;
+            if ((uint64_t)m > sp->in_buffer_file_togo)
+                m = (uint16_t)sp->in_buffer_file_togo;
+            n = TIFFReadFile(sp->tif, sp->in_buffer, (tmsize_t)m);
+            if (n == 0)
+                return (0);
+            assert(n > 0);
+            assert(n <= OJPEG_BUFFER);
+            assert(n < 65536);
+            assert((uint64_t)n <= sp->in_buffer_file_togo);
+            m = (uint16_t)n;
+            sp->in_buffer_togo = m;
+            sp->in_buffer_cur = sp->in_buffer;
+            sp->in_buffer_file_togo -= m;
+            sp->in_buffer_file_pos += m;
+            break;
+        }
+        sp->in_buffer_file_pos_log = 0;
+        switch (sp->in_buffer_source)
+        {
+            case osibsNotSetYet:
+                if (sp->jpeg_interchange_format != 0)
+                {
+                    sp->in_buffer_file_pos = sp->jpeg_interchange_format;
+                    sp->in_buffer_file_togo =
+                        sp->jpeg_interchange_format_length;
+                }
+                sp->in_buffer_source = osibsJpegInterchangeFormat;
+                break;
+            case osibsJpegInterchangeFormat:
+                sp->in_buffer_source = osibsStrile;
+                break;
+            case osibsStrile:
+                if (sp->in_buffer_next_strile == sp->in_buffer_strile_count)
+                    sp->in_buffer_source = osibsEof;
+                else
+                {
+                    int err = 0;
+                    sp->in_buffer_file_pos = TIFFGetStrileOffsetWithErr(
+                        sp->tif, sp->in_buffer_next_strile, &err);
+                    if (err)
+                        return 0;
+                    if (sp->in_buffer_file_pos != 0)
+                    {
+                        uint64_t bytecount = TIFFGetStrileByteCountWithErr(
+                            sp->tif, sp->in_buffer_next_strile, &err);
+                        if (err)
+                            return 0;
+                        if (sp->in_buffer_file_pos >= sp->file_size)
+                            sp->in_buffer_file_pos = 0;
+                        else if (bytecount == 0)
+                            sp->in_buffer_file_togo =
+                                sp->file_size - sp->in_buffer_file_pos;
+                        else
                         {
-                                _TIFFfree(rb);
-				return(0);
+                            sp->in_buffer_file_togo = bytecount;
+                            if (sp->in_buffer_file_togo == 0)
+                                sp->in_buffer_file_pos = 0;
+                            else if (sp->in_buffer_file_pos >
+                                         UINT64_MAX - sp->in_buffer_file_togo ||
+                                     sp->in_buffer_file_pos +
+                                             sp->in_buffer_file_togo >
+                                         sp->file_size)
+                                sp->in_buffer_file_togo =
+                                    sp->file_size - sp->in_buffer_file_pos;
                         }
-			if (sp->actable[m]!=0)
-				_TIFFfree(sp->actable[m]);
-			sp->actable[m]=rb;
-			sp->sos_tda[m]=(sp->sos_tda[m]|m);
-		}
-		else
-			sp->sos_tda[m]=(sp->sos_tda[m]|(sp->sos_tda[m-1]&15));
-	}
-	return(1);
-}
-
-static int
-OJPEGReadBufferFill(OJPEGState* sp)
-{
-	uint16 m;
-	tmsize_t n;
-	/* TODO: double-check: when subsamplingcorrect is set, no call to TIFFErrorExt or TIFFWarningExt should be made
-	 * in any other case, seek or read errors should be passed through */
-	do
-	{
-		if (sp->in_buffer_file_togo!=0)
-		{
-			if (sp->in_buffer_file_pos_log==0)
-			{
-				TIFFSeekFile(sp->tif,sp->in_buffer_file_pos,SEEK_SET);
-				sp->in_buffer_file_pos_log=1;
-			}
-			m=OJPEG_BUFFER;
-			if ((uint64)m>sp->in_buffer_file_togo)
-				m=(uint16)sp->in_buffer_file_togo;
-			n=TIFFReadFile(sp->tif,sp->in_buffer,(tmsize_t)m);
-			if (n==0)
-				return(0);
-			assert(n>0);
-			assert(n<=OJPEG_BUFFER);
-			assert(n<65536);
-			assert((uint64)n<=sp->in_buffer_file_togo);
-			m=(uint16)n;
-			sp->in_buffer_togo=m;
-			sp->in_buffer_cur=sp->in_buffer;
-			sp->in_buffer_file_togo-=m;
-			sp->in_buffer_file_pos+=m;
-			break;
-		}
-		sp->in_buffer_file_pos_log=0;
-		switch(sp->in_buffer_source)
-		{
-			case osibsNotSetYet:
-				if (sp->jpeg_interchange_format!=0)
-				{
-					sp->in_buffer_file_pos=sp->jpeg_interchange_format;
-					sp->in_buffer_file_togo=sp->jpeg_interchange_format_length;
-				}
-				sp->in_buffer_source=osibsJpegInterchangeFormat;
-				break;
-			case osibsJpegInterchangeFormat:
-				sp->in_buffer_source=osibsStrile;
-                                break;
-			case osibsStrile:
-				if (sp->in_buffer_next_strile==sp->in_buffer_strile_count)
-					sp->in_buffer_source=osibsEof;
-				else
-				{
-					int err = 0;
-					sp->in_buffer_file_pos=TIFFGetStrileOffsetWithErr(sp->tif, sp->in_buffer_next_strile, &err);
-					if( err )
-						return 0;
-					if (sp->in_buffer_file_pos!=0)
-					{
-						uint64 bytecount = TIFFGetStrileByteCountWithErr(sp->tif, sp->in_buffer_next_strile, &err);
-						if( err )
-							return 0;
-						if (sp->in_buffer_file_pos>=sp->file_size)
-							sp->in_buffer_file_pos=0;
-						else if (bytecount==0)
-							sp->in_buffer_file_togo=sp->file_size-sp->in_buffer_file_pos;
-						else
-						{
-							sp->in_buffer_file_togo=bytecount;
-							if (sp->in_buffer_file_togo==0)
-								sp->in_buffer_file_pos=0;
-							else if (sp->in_buffer_file_pos > TIFF_UINT64_MAX - sp->in_buffer_file_togo || 
-                                                                sp->in_buffer_file_pos+sp->in_buffer_file_togo>sp->file_size)
-								sp->in_buffer_file_togo=sp->file_size-sp->in_buffer_file_pos;
-						}
-					}
-					sp->in_buffer_next_strile++;
-				}
-				break;
-			default:
-				return(0);
-		}
-	} while (1);
-	return(1);
+                    }
+                    sp->in_buffer_next_strile++;
+                }
+                break;
+            default:
+                return (0);
+        }
+    } while (1);
+    return (1);
 }
 
-static int
-OJPEGReadByte(OJPEGState* sp, uint8* byte)
-{
-	if (sp->in_buffer_togo==0)
-	{
-		if (OJPEGReadBufferFill(sp)==0)
-			return(0);
-		assert(sp->in_buffer_togo>0);
-	}
-	*byte=*(sp->in_buffer_cur);
-	sp->in_buffer_cur++;
-	sp->in_buffer_togo--;
-	return(1);
+static int OJPEGReadByte(OJPEGState *sp, uint8_t *byte)
+{
+    if (sp->in_buffer_togo == 0)
+    {
+        if (OJPEGReadBufferFill(sp) == 0)
+            return (0);
+        assert(sp->in_buffer_togo > 0);
+    }
+    *byte = *(sp->in_buffer_cur);
+    sp->in_buffer_cur++;
+    sp->in_buffer_togo--;
+    return (1);
 }
 
-static int
-OJPEGReadBytePeek(OJPEGState* sp, uint8* byte)
-{
-	if (sp->in_buffer_togo==0)
-	{
-		if (OJPEGReadBufferFill(sp)==0)
-			return(0);
-		assert(sp->in_buffer_togo>0);
-	}
-	*byte=*(sp->in_buffer_cur);
-	return(1);
+static int OJPEGReadBytePeek(OJPEGState *sp, uint8_t *byte)
+{
+    if (sp->in_buffer_togo == 0)
+    {
+        if (OJPEGReadBufferFill(sp) == 0)
+            return (0);
+        assert(sp->in_buffer_togo > 0);
+    }
+    *byte = *(sp->in_buffer_cur);
+    return (1);
 }
 
-static void
-OJPEGReadByteAdvance(OJPEGState* sp)
+static void OJPEGReadByteAdvance(OJPEGState *sp)
 {
-	assert(sp->in_buffer_togo>0);
-	sp->in_buffer_cur++;
-	sp->in_buffer_togo--;
+    assert(sp->in_buffer_togo > 0);
+    sp->in_buffer_cur++;
+    sp->in_buffer_togo--;
 }
 
-static int
-OJPEGReadWord(OJPEGState* sp, uint16* word)
-{
-	uint8 m;
-	if (OJPEGReadByte(sp,&m)==0)
-		return(0);
-	*word=(m<<8);
-	if (OJPEGReadByte(sp,&m)==0)
-		return(0);
-	*word|=m;
-	return(1);
+static int OJPEGReadWord(OJPEGState *sp, uint16_t *word)
+{
+    uint8_t m;
+    if (OJPEGReadByte(sp, &m) == 0)
+        return (0);
+    *word = (m << 8);
+    if (OJPEGReadByte(sp, &m) == 0)
+        return (0);
+    *word |= m;
+    return (1);
 }
 
-static int
-OJPEGReadBlock(OJPEGState* sp, uint16 len, void* mem)
-{
-	uint16 mlen;
-	uint8* mmem;
-	uint16 n;
-	assert(len>0);
-	mlen=len;
-	mmem=mem;
-	do
-	{
-		if (sp->in_buffer_togo==0)
-		{
-			if (OJPEGReadBufferFill(sp)==0)
-				return(0);
-			assert(sp->in_buffer_togo>0);
-		}
-		n=mlen;
-		if (n>sp->in_buffer_togo)
-			n=sp->in_buffer_togo;
-		_TIFFmemcpy(mmem,sp->in_buffer_cur,n);
-		sp->in_buffer_cur+=n;
-		sp->in_buffer_togo-=n;
-		mlen-=n;
-		mmem+=n;
-	} while(mlen>0);
-	return(1);
+static int OJPEGReadBlock(OJPEGState *sp, uint16_t len, void *mem)
+{
+    uint16_t mlen;
+    uint8_t *mmem;
+    uint16_t n;
+    assert(len > 0);
+    mlen = len;
+    mmem = mem;
+    do
+    {
+        if (sp->in_buffer_togo == 0)
+        {
+            if (OJPEGReadBufferFill(sp) == 0)
+                return (0);
+            assert(sp->in_buffer_togo > 0);
+        }
+        n = mlen;
+        if (n > sp->in_buffer_togo)
+            n = sp->in_buffer_togo;
+        _TIFFmemcpy(mmem, sp->in_buffer_cur, n);
+        sp->in_buffer_cur += n;
+        sp->in_buffer_togo -= n;
+        mlen -= n;
+        mmem += n;
+    } while (mlen > 0);
+    return (1);
 }
 
-static void
-OJPEGReadSkip(OJPEGState* sp, uint16 len)
-{
-	uint16 m;
-	uint16 n;
-	m=len;
-	n=m;
-	if (n>sp->in_buffer_togo)
-		n=sp->in_buffer_togo;
-	sp->in_buffer_cur+=n;
-	sp->in_buffer_togo-=n;
-	m-=n;
-	if (m>0)
-	{
-		assert(sp->in_buffer_togo==0);
-		n=m;
-		if ((uint64)n>sp->in_buffer_file_togo)
-			n=(uint16)sp->in_buffer_file_togo;
-		sp->in_buffer_file_pos+=n;
-		sp->in_buffer_file_togo-=n;
-		sp->in_buffer_file_pos_log=0;
-		/* we don't skip past jpeginterchangeformat/strile block...
-		 * if that is asked from us, we're dealing with totally bazurk
-		 * data anyway, and we've not seen this happening on any
-		 * testfile, so we might as well likely cause some other
-		 * meaningless error to be passed at some later time
-		 */
-	}
+static void OJPEGReadSkip(OJPEGState *sp, uint16_t len)
+{
+    uint16_t m;
+    uint16_t n;
+    m = len;
+    n = m;
+    if (n > sp->in_buffer_togo)
+        n = sp->in_buffer_togo;
+    sp->in_buffer_cur += n;
+    sp->in_buffer_togo -= n;
+    m -= n;
+    if (m > 0)
+    {
+        assert(sp->in_buffer_togo == 0);
+        n = m;
+        if ((uint64_t)n > sp->in_buffer_file_togo)
+            n = (uint16_t)sp->in_buffer_file_togo;
+        sp->in_buffer_file_pos += n;
+        sp->in_buffer_file_togo -= n;
+        sp->in_buffer_file_pos_log = 0;
+        /* we don't skip past jpeginterchangeformat/strile block...
+         * if that is asked from us, we're dealing with totally bazurk
+         * data anyway, and we've not seen this happening on any
+         * testfile, so we might as well likely cause some other
+         * meaningless error to be passed at some later time
+         */
+    }
 }
 
-static int
-OJPEGWriteStream(TIFF* tif, void** mem, uint32* len)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	*len=0;
-	do
-	{
-		assert(sp->out_state<=ososEoi);
-		switch(sp->out_state)
-		{
-			case ososSoi:
-				OJPEGWriteStreamSoi(tif,mem,len);
-				break;
-			case ososQTable0:
-				OJPEGWriteStreamQTable(tif,0,mem,len);
-				break;
-			case ososQTable1:
-				OJPEGWriteStreamQTable(tif,1,mem,len);
-				break;
-			case ososQTable2:
-				OJPEGWriteStreamQTable(tif,2,mem,len);
-				break;
-			case ososQTable3:
-				OJPEGWriteStreamQTable(tif,3,mem,len);
-				break;
-			case ososDcTable0:
-				OJPEGWriteStreamDcTable(tif,0,mem,len);
-				break;
-			case ososDcTable1:
-				OJPEGWriteStreamDcTable(tif,1,mem,len);
-				break;
-			case ososDcTable2:
-				OJPEGWriteStreamDcTable(tif,2,mem,len);
-				break;
-			case ososDcTable3:
-				OJPEGWriteStreamDcTable(tif,3,mem,len);
-				break;
-			case ososAcTable0:
-				OJPEGWriteStreamAcTable(tif,0,mem,len);
-				break;
-			case ososAcTable1:
-				OJPEGWriteStreamAcTable(tif,1,mem,len);
-				break;
-			case ososAcTable2:
-				OJPEGWriteStreamAcTable(tif,2,mem,len);
-				break;
-			case ososAcTable3:
-				OJPEGWriteStreamAcTable(tif,3,mem,len);
-				break;
-			case ososDri:
-				OJPEGWriteStreamDri(tif,mem,len);
-				break;
-			case ososSof:
-				OJPEGWriteStreamSof(tif,mem,len);
-				break;
-			case ososSos:
-				OJPEGWriteStreamSos(tif,mem,len);
-				break;
-			case ososCompressed:
-				if (OJPEGWriteStreamCompressed(tif,mem,len)==0)
-					return(0);
-				break;
-			case ososRst:
-				OJPEGWriteStreamRst(tif,mem,len);
-				break;
-			case ososEoi:
-				OJPEGWriteStreamEoi(tif,mem,len);
-				break;
-		}
-	} while (*len==0);
-	return(1);
+static int OJPEGWriteStream(TIFF *tif, void **mem, uint32_t *len)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    *len = 0;
+    do
+    {
+        assert(sp->out_state <= ososEoi);
+        switch (sp->out_state)
+        {
+            case ososSoi:
+                OJPEGWriteStreamSoi(tif, mem, len);
+                break;
+            case ososQTable0:
+                OJPEGWriteStreamQTable(tif, 0, mem, len);
+                break;
+            case ososQTable1:
+                OJPEGWriteStreamQTable(tif, 1, mem, len);
+                break;
+            case ososQTable2:
+                OJPEGWriteStreamQTable(tif, 2, mem, len);
+                break;
+            case ososQTable3:
+                OJPEGWriteStreamQTable(tif, 3, mem, len);
+                break;
+            case ososDcTable0:
+                OJPEGWriteStreamDcTable(tif, 0, mem, len);
+                break;
+            case ososDcTable1:
+                OJPEGWriteStreamDcTable(tif, 1, mem, len);
+                break;
+            case ososDcTable2:
+                OJPEGWriteStreamDcTable(tif, 2, mem, len);
+                break;
+            case ososDcTable3:
+                OJPEGWriteStreamDcTable(tif, 3, mem, len);
+                break;
+            case ososAcTable0:
+                OJPEGWriteStreamAcTable(tif, 0, mem, len);
+                break;
+            case ososAcTable1:
+                OJPEGWriteStreamAcTable(tif, 1, mem, len);
+                break;
+            case ososAcTable2:
+                OJPEGWriteStreamAcTable(tif, 2, mem, len);
+                break;
+            case ososAcTable3:
+                OJPEGWriteStreamAcTable(tif, 3, mem, len);
+                break;
+            case ososDri:
+                OJPEGWriteStreamDri(tif, mem, len);
+                break;
+            case ososSof:
+                OJPEGWriteStreamSof(tif, mem, len);
+                break;
+            case ososSos:
+                OJPEGWriteStreamSos(tif, mem, len);
+                break;
+            case ososCompressed:
+                if (OJPEGWriteStreamCompressed(tif, mem, len) == 0)
+                    return (0);
+                break;
+            case ososRst:
+                OJPEGWriteStreamRst(tif, mem, len);
+                break;
+            case ososEoi:
+                OJPEGWriteStreamEoi(tif, mem, len);
+                break;
+        }
+    } while (*len == 0);
+    return (1);
 }
 
-static void
-OJPEGWriteStreamSoi(TIFF* tif, void** mem, uint32* len)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	assert(OJPEG_BUFFER>=2);
-	sp->out_buffer[0]=255;
-	sp->out_buffer[1]=JPEG_MARKER_SOI;
-	*len=2;
-	*mem=(void*)sp->out_buffer;
-	sp->out_state++;
+static void OJPEGWriteStreamSoi(TIFF *tif, void **mem, uint32_t *len)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    assert(OJPEG_BUFFER >= 2);
+    sp->out_buffer[0] = 255;
+    sp->out_buffer[1] = JPEG_MARKER_SOI;
+    *len = 2;
+    *mem = (void *)sp->out_buffer;
+    sp->out_state++;
 }
 
-static void
-OJPEGWriteStreamQTable(TIFF* tif, uint8 table_index, void** mem, uint32* len)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	if (sp->qtable[table_index]!=0)
-	{
-		*mem=(void*)(sp->qtable[table_index]+sizeof(uint32));
-		*len=*((uint32*)sp->qtable[table_index])-sizeof(uint32);
-	}
-	sp->out_state++;
+static void OJPEGWriteStreamQTable(TIFF *tif, uint8_t table_index, void **mem,
+                                   uint32_t *len)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    if (sp->qtable[table_index] != 0)
+    {
+        *mem = (void *)(sp->qtable[table_index] + sizeof(uint32_t));
+        *len = *((uint32_t *)sp->qtable[table_index]) - sizeof(uint32_t);
+    }
+    sp->out_state++;
 }
 
-static void
-OJPEGWriteStreamDcTable(TIFF* tif, uint8 table_index, void** mem, uint32* len)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	if (sp->dctable[table_index]!=0)
-	{
-		*mem=(void*)(sp->dctable[table_index]+sizeof(uint32));
-		*len=*((uint32*)sp->dctable[table_index])-sizeof(uint32);
-	}
-	sp->out_state++;
+static void OJPEGWriteStreamDcTable(TIFF *tif, uint8_t table_index, void **mem,
+                                    uint32_t *len)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    if (sp->dctable[table_index] != 0)
+    {
+        *mem = (void *)(sp->dctable[table_index] + sizeof(uint32_t));
+        *len = *((uint32_t *)sp->dctable[table_index]) - sizeof(uint32_t);
+    }
+    sp->out_state++;
 }
 
-static void
-OJPEGWriteStreamAcTable(TIFF* tif, uint8 table_index, void** mem, uint32* len)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	if (sp->actable[table_index]!=0)
-	{
-		*mem=(void*)(sp->actable[table_index]+sizeof(uint32));
-		*len=*((uint32*)sp->actable[table_index])-sizeof(uint32);
-	}
-	sp->out_state++;
+static void OJPEGWriteStreamAcTable(TIFF *tif, uint8_t table_index, void **mem,
+                                    uint32_t *len)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    if (sp->actable[table_index] != 0)
+    {
+        *mem = (void *)(sp->actable[table_index] + sizeof(uint32_t));
+        *len = *((uint32_t *)sp->actable[table_index]) - sizeof(uint32_t);
+    }
+    sp->out_state++;
 }
 
-static void
-OJPEGWriteStreamDri(TIFF* tif, void** mem, uint32* len)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	assert(OJPEG_BUFFER>=6);
-	if (sp->restart_interval!=0)
-	{
-		sp->out_buffer[0]=255;
-		sp->out_buffer[1]=JPEG_MARKER_DRI;
-		sp->out_buffer[2]=0;
-		sp->out_buffer[3]=4;
-		sp->out_buffer[4]=(sp->restart_interval>>8);
-		sp->out_buffer[5]=(sp->restart_interval&255);
-		*len=6;
-		*mem=(void*)sp->out_buffer;
-	}
-	sp->out_state++;
+static void OJPEGWriteStreamDri(TIFF *tif, void **mem, uint32_t *len)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    assert(OJPEG_BUFFER >= 6);
+    if (sp->restart_interval != 0)
+    {
+        sp->out_buffer[0] = 255;
+        sp->out_buffer[1] = JPEG_MARKER_DRI;
+        sp->out_buffer[2] = 0;
+        sp->out_buffer[3] = 4;
+        sp->out_buffer[4] = (sp->restart_interval >> 8);
+        sp->out_buffer[5] = (sp->restart_interval & 255);
+        *len = 6;
+        *mem = (void *)sp->out_buffer;
+    }
+    sp->out_state++;
 }
 
-static void
-OJPEGWriteStreamSof(TIFF* tif, void** mem, uint32* len)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint8 m;
-	assert(OJPEG_BUFFER>=2+8+sp->samples_per_pixel_per_plane*3);
-	assert(255>=8+sp->samples_per_pixel_per_plane*3);
-	sp->out_buffer[0]=255;
-	sp->out_buffer[1]=sp->sof_marker_id;
-	/* Lf */
-	sp->out_buffer[2]=0;
-	sp->out_buffer[3]=8+sp->samples_per_pixel_per_plane*3;
-	/* P */
-	sp->out_buffer[4]=8;
-	/* Y */
-	sp->out_buffer[5]=(uint8)(sp->sof_y>>8);
-	sp->out_buffer[6]=(sp->sof_y&255);
-	/* X */
-	sp->out_buffer[7]=(uint8)(sp->sof_x>>8);
-	sp->out_buffer[8]=(sp->sof_x&255);
-	/* Nf */
-	sp->out_buffer[9]=sp->samples_per_pixel_per_plane;
-	for (m=0; m<sp->samples_per_pixel_per_plane; m++)
-	{
-		/* C */
-		sp->out_buffer[10+m*3]=sp->sof_c[sp->plane_sample_offset+m];
-		/* H and V */
-		sp->out_buffer[10+m*3+1]=sp->sof_hv[sp->plane_sample_offset+m];
-		/* Tq */
-		sp->out_buffer[10+m*3+2]=sp->sof_tq[sp->plane_sample_offset+m];
-	}
-	*len=10+sp->samples_per_pixel_per_plane*3;
-	*mem=(void*)sp->out_buffer;
-	sp->out_state++;
+static void OJPEGWriteStreamSof(TIFF *tif, void **mem, uint32_t *len)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint8_t m;
+    assert(OJPEG_BUFFER >= 2 + 8 + sp->samples_per_pixel_per_plane * 3);
+    assert(255 >= 8 + sp->samples_per_pixel_per_plane * 3);
+    sp->out_buffer[0] = 255;
+    sp->out_buffer[1] = sp->sof_marker_id;
+    /* Lf */
+    sp->out_buffer[2] = 0;
+    sp->out_buffer[3] = 8 + sp->samples_per_pixel_per_plane * 3;
+    /* P */
+    sp->out_buffer[4] = 8;
+    /* Y */
+    sp->out_buffer[5] = (uint8_t)(sp->sof_y >> 8);
+    sp->out_buffer[6] = (sp->sof_y & 255);
+    /* X */
+    sp->out_buffer[7] = (uint8_t)(sp->sof_x >> 8);
+    sp->out_buffer[8] = (sp->sof_x & 255);
+    /* Nf */
+    sp->out_buffer[9] = sp->samples_per_pixel_per_plane;
+    for (m = 0; m < sp->samples_per_pixel_per_plane; m++)
+    {
+        /* C */
+        sp->out_buffer[10 + m * 3] = sp->sof_c[sp->plane_sample_offset + m];
+        /* H and V */
+        sp->out_buffer[10 + m * 3 + 1] =
+            sp->sof_hv[sp->plane_sample_offset + m];
+        /* Tq */
+        sp->out_buffer[10 + m * 3 + 2] =
+            sp->sof_tq[sp->plane_sample_offset + m];
+    }
+    *len = 10 + sp->samples_per_pixel_per_plane * 3;
+    *mem = (void *)sp->out_buffer;
+    sp->out_state++;
 }
 
-static void
-OJPEGWriteStreamSos(TIFF* tif, void** mem, uint32* len)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	uint8 m;
-	assert(OJPEG_BUFFER>=2+6+sp->samples_per_pixel_per_plane*2);
-	assert(255>=6+sp->samples_per_pixel_per_plane*2);
-	sp->out_buffer[0]=255;
-	sp->out_buffer[1]=JPEG_MARKER_SOS;
-	/* Ls */
-	sp->out_buffer[2]=0;
-	sp->out_buffer[3]=6+sp->samples_per_pixel_per_plane*2;
-	/* Ns */
-	sp->out_buffer[4]=sp->samples_per_pixel_per_plane;
-	for (m=0; m<sp->samples_per_pixel_per_plane; m++)
-	{
-		/* Cs */
-		sp->out_buffer[5+m*2]=sp->sos_cs[sp->plane_sample_offset+m];
-		/* Td and Ta */
-		sp->out_buffer[5+m*2+1]=sp->sos_tda[sp->plane_sample_offset+m];
-	}
-	/* Ss */
-	sp->out_buffer[5+sp->samples_per_pixel_per_plane*2]=0;
-	/* Se */
-	sp->out_buffer[5+sp->samples_per_pixel_per_plane*2+1]=63;
-	/* Ah and Al */
-	sp->out_buffer[5+sp->samples_per_pixel_per_plane*2+2]=0;
-	*len=8+sp->samples_per_pixel_per_plane*2;
-	*mem=(void*)sp->out_buffer;
-	sp->out_state++;
+static void OJPEGWriteStreamSos(TIFF *tif, void **mem, uint32_t *len)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    uint8_t m;
+    assert(OJPEG_BUFFER >= 2 + 6 + sp->samples_per_pixel_per_plane * 2);
+    assert(255 >= 6 + sp->samples_per_pixel_per_plane * 2);
+    sp->out_buffer[0] = 255;
+    sp->out_buffer[1] = JPEG_MARKER_SOS;
+    /* Ls */
+    sp->out_buffer[2] = 0;
+    sp->out_buffer[3] = 6 + sp->samples_per_pixel_per_plane * 2;
+    /* Ns */
+    sp->out_buffer[4] = sp->samples_per_pixel_per_plane;
+    for (m = 0; m < sp->samples_per_pixel_per_plane; m++)
+    {
+        /* Cs */
+        sp->out_buffer[5 + m * 2] = sp->sos_cs[sp->plane_sample_offset + m];
+        /* Td and Ta */
+        sp->out_buffer[5 + m * 2 + 1] =
+            sp->sos_tda[sp->plane_sample_offset + m];
+    }
+    /* Ss */
+    sp->out_buffer[5 + sp->samples_per_pixel_per_plane * 2] = 0;
+    /* Se */
+    sp->out_buffer[5 + sp->samples_per_pixel_per_plane * 2 + 1] = 63;
+    /* Ah and Al */
+    sp->out_buffer[5 + sp->samples_per_pixel_per_plane * 2 + 2] = 0;
+    *len = 8 + sp->samples_per_pixel_per_plane * 2;
+    *mem = (void *)sp->out_buffer;
+    sp->out_state++;
 }
 
-static int
-OJPEGWriteStreamCompressed(TIFF* tif, void** mem, uint32* len)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	if (sp->in_buffer_togo==0)
-	{
-		if (OJPEGReadBufferFill(sp)==0)
-			return(0);
-		assert(sp->in_buffer_togo>0);
-	}
-	*len=sp->in_buffer_togo;
-	*mem=(void*)sp->in_buffer_cur;
-	sp->in_buffer_togo=0;
-	if (sp->in_buffer_file_togo==0)
-	{
-		switch(sp->in_buffer_source)
-		{
-			case osibsStrile:
-				if (sp->in_buffer_next_strile<sp->in_buffer_strile_count)
-					sp->out_state=ososRst;
-				else
-					sp->out_state=ososEoi;
-				break;
-			case osibsEof:
-				sp->out_state=ososEoi;
-				break;
-			default:
-				break;
-		}
-	}
-	return(1);
+static int OJPEGWriteStreamCompressed(TIFF *tif, void **mem, uint32_t *len)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    if (sp->in_buffer_togo == 0)
+    {
+        if (OJPEGReadBufferFill(sp) == 0)
+            return (0);
+        assert(sp->in_buffer_togo > 0);
+    }
+    *len = sp->in_buffer_togo;
+    *mem = (void *)sp->in_buffer_cur;
+    sp->in_buffer_togo = 0;
+    if (sp->in_buffer_file_togo == 0)
+    {
+        switch (sp->in_buffer_source)
+        {
+            case osibsStrile:
+                if (sp->in_buffer_next_strile < sp->in_buffer_strile_count)
+                    sp->out_state = ososRst;
+                else
+                    sp->out_state = ososEoi;
+                break;
+            case osibsEof:
+                sp->out_state = ososEoi;
+                break;
+            default:
+                break;
+        }
+    }
+    return (1);
 }
 
-static void
-OJPEGWriteStreamRst(TIFF* tif, void** mem, uint32* len)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	assert(OJPEG_BUFFER>=2);
-	sp->out_buffer[0]=255;
-	sp->out_buffer[1]=JPEG_MARKER_RST0+sp->restart_index;
-	sp->restart_index++;
-	if (sp->restart_index==8)
-		sp->restart_index=0;
-	*len=2;
-	*mem=(void*)sp->out_buffer;
-	sp->out_state=ososCompressed;
+static void OJPEGWriteStreamRst(TIFF *tif, void **mem, uint32_t *len)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    assert(OJPEG_BUFFER >= 2);
+    sp->out_buffer[0] = 255;
+    sp->out_buffer[1] = JPEG_MARKER_RST0 + sp->restart_index;
+    sp->restart_index++;
+    if (sp->restart_index == 8)
+        sp->restart_index = 0;
+    *len = 2;
+    *mem = (void *)sp->out_buffer;
+    sp->out_state = ososCompressed;
 }
 
-static void
-OJPEGWriteStreamEoi(TIFF* tif, void** mem, uint32* len)
-{
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	assert(OJPEG_BUFFER>=2);
-	sp->out_buffer[0]=255;
-	sp->out_buffer[1]=JPEG_MARKER_EOI;
-	*len=2;
-	*mem=(void*)sp->out_buffer;
+static void OJPEGWriteStreamEoi(TIFF *tif, void **mem, uint32_t *len)
+{
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    assert(OJPEG_BUFFER >= 2);
+    sp->out_buffer[0] = 255;
+    sp->out_buffer[1] = JPEG_MARKER_EOI;
+    *len = 2;
+    *mem = (void *)sp->out_buffer;
 }
 
 #ifndef LIBJPEG_ENCAP_EXTERNAL
-static int
-jpeg_create_decompress_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo)
-{
-	if( SETJMP(sp->exit_jmpbuf) )
-		return 0;
-	else {
-		jpeg_create_decompress(cinfo);
-		return 1;
-	}
+static int jpeg_create_decompress_encap(OJPEGState *sp,
+                                        jpeg_decompress_struct *cinfo)
+{
+    if (SETJMP(sp->exit_jmpbuf))
+        return 0;
+    else
+    {
+        jpeg_create_decompress(cinfo);
+        return 1;
+    }
 }
 #endif
 
 #ifndef LIBJPEG_ENCAP_EXTERNAL
-static int
-jpeg_read_header_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo, uint8 require_image)
-{
-	if( SETJMP(sp->exit_jmpbuf) )
-		return 0;
-	else {
-		jpeg_read_header(cinfo,require_image);
-		return 1;
-	}
+static int jpeg_read_header_encap(OJPEGState *sp, jpeg_decompress_struct *cinfo,
+                                  uint8_t require_image)
+{
+    if (SETJMP(sp->exit_jmpbuf))
+        return 0;
+    else
+    {
+        jpeg_read_header(cinfo, require_image);
+        return 1;
+    }
 }
 #endif
 
 #ifndef LIBJPEG_ENCAP_EXTERNAL
-static int
-jpeg_start_decompress_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo)
-{
-	if( SETJMP(sp->exit_jmpbuf) )
-		return 0;
-	else {
-		jpeg_start_decompress(cinfo);
-		return 1;
-	}
+static int jpeg_start_decompress_encap(OJPEGState *sp,
+                                       jpeg_decompress_struct *cinfo)
+{
+    if (SETJMP(sp->exit_jmpbuf))
+        return 0;
+    else
+    {
+        jpeg_start_decompress(cinfo);
+        return 1;
+    }
 }
 #endif
 
 #ifndef LIBJPEG_ENCAP_EXTERNAL
-static int
-jpeg_read_scanlines_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo, void* scanlines, uint32 max_lines)
-{
-	if( SETJMP(sp->exit_jmpbuf) )
-		return 0;
-	else {
-		jpeg_read_scanlines(cinfo,scanlines,max_lines);
-		return 1;
-	}
+static int jpeg_read_scanlines_encap(OJPEGState *sp,
+                                     jpeg_decompress_struct *cinfo,
+                                     void *scanlines, uint32_t max_lines)
+{
+    if (SETJMP(sp->exit_jmpbuf))
+        return 0;
+    else
+    {
+        jpeg_read_scanlines(cinfo, scanlines, max_lines);
+        return 1;
+    }
 }
 #endif
 
 #ifndef LIBJPEG_ENCAP_EXTERNAL
-static int
-jpeg_read_raw_data_encap(OJPEGState* sp, jpeg_decompress_struct* cinfo, void* data, uint32 max_lines)
-{
-	if( SETJMP(sp->exit_jmpbuf) )
-		return 0;
-	else {
-		jpeg_read_raw_data(cinfo,data,max_lines);
-		return 1;
-	}
+static int jpeg_read_raw_data_encap(OJPEGState *sp,
+                                    jpeg_decompress_struct *cinfo, void *data,
+                                    uint32_t max_lines)
+{
+    if (SETJMP(sp->exit_jmpbuf))
+        return 0;
+    else
+    {
+        jpeg_read_raw_data(cinfo, data, max_lines);
+        return 1;
+    }
 }
 #endif
 
 #ifndef LIBJPEG_ENCAP_EXTERNAL
-static void
-jpeg_encap_unwind(TIFF* tif)
+static void jpeg_encap_unwind(TIFF *tif)
 {
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	LONGJMP(sp->exit_jmpbuf,1);
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    LONGJMP(sp->exit_jmpbuf, 1);
 }
 #endif
 
-static void
-OJPEGLibjpegJpegErrorMgrOutputMessage(jpeg_common_struct* cinfo)
+static void OJPEGLibjpegJpegErrorMgrOutputMessage(jpeg_common_struct *cinfo)
 {
-	char buffer[JMSG_LENGTH_MAX];
-	(*cinfo->err->format_message)(cinfo,buffer);
-	TIFFWarningExt(((TIFF*)(cinfo->client_data))->tif_clientdata,"LibJpeg","%s",buffer);
+    char buffer[JMSG_LENGTH_MAX];
+    (*cinfo->err->format_message)(cinfo, buffer);
+    TIFFWarningExtR(((TIFF *)(cinfo->client_data)), "LibJpeg", "%s", buffer);
 }
 
-static void
-OJPEGLibjpegJpegErrorMgrErrorExit(jpeg_common_struct* cinfo)
+static void OJPEGLibjpegJpegErrorMgrErrorExit(jpeg_common_struct *cinfo)
 {
-	char buffer[JMSG_LENGTH_MAX];
-	(*cinfo->err->format_message)(cinfo,buffer);
-	TIFFErrorExt(((TIFF*)(cinfo->client_data))->tif_clientdata,"LibJpeg","%s",buffer);
-	jpeg_encap_unwind((TIFF*)(cinfo->client_data));
+    char buffer[JMSG_LENGTH_MAX];
+    (*cinfo->err->format_message)(cinfo, buffer);
+    TIFFErrorExtR(((TIFF *)(cinfo->client_data)), "LibJpeg", "%s", buffer);
+    jpeg_encap_unwind((TIFF *)(cinfo->client_data));
 }
 
-static void
-OJPEGLibjpegJpegSourceMgrInitSource(jpeg_decompress_struct* cinfo)
+static void OJPEGLibjpegJpegSourceMgrInitSource(jpeg_decompress_struct *cinfo)
 {
-	(void)cinfo;
+    (void)cinfo;
 }
 
 static boolean
-OJPEGLibjpegJpegSourceMgrFillInputBuffer(jpeg_decompress_struct* cinfo)
-{
-	TIFF* tif=(TIFF*)cinfo->client_data;
-	OJPEGState* sp=(OJPEGState*)tif->tif_data;
-	void* mem=0;
-	uint32 len=0U;
-	if (OJPEGWriteStream(tif,&mem,&len)==0)
-	{
-		TIFFErrorExt(tif->tif_clientdata,"LibJpeg","Premature end of JPEG data");
-		jpeg_encap_unwind(tif);
-	}
-	sp->libjpeg_jpeg_source_mgr.bytes_in_buffer=len;
-	sp->libjpeg_jpeg_source_mgr.next_input_byte=mem;
-	return(1);
+OJPEGLibjpegJpegSourceMgrFillInputBuffer(jpeg_decompress_struct *cinfo)
+{
+    TIFF *tif = (TIFF *)cinfo->client_data;
+    OJPEGState *sp = (OJPEGState *)tif->tif_data;
+    void *mem = 0;
+    uint32_t len = 0U;
+    if (OJPEGWriteStream(tif, &mem, &len) == 0)
+    {
+        TIFFErrorExtR(tif, "LibJpeg", "Premature end of JPEG data");
+        jpeg_encap_unwind(tif);
+    }
+    sp->libjpeg_jpeg_source_mgr.bytes_in_buffer = len;
+    sp->libjpeg_jpeg_source_mgr.next_input_byte = mem;
+    return (1);
 }
 
 static void
-OJPEGLibjpegJpegSourceMgrSkipInputData(jpeg_decompress_struct* cinfo, long num_bytes)
+OJPEGLibjpegJpegSourceMgrSkipInputData(jpeg_decompress_struct *cinfo,
+                                       long num_bytes)
 {
-	TIFF* tif=(TIFF*)cinfo->client_data;
-	(void)num_bytes;
-	TIFFErrorExt(tif->tif_clientdata,"LibJpeg","Unexpected error");
-	jpeg_encap_unwind(tif);
+    TIFF *tif = (TIFF *)cinfo->client_data;
+    (void)num_bytes;
+    TIFFErrorExtR(tif, "LibJpeg", "Unexpected error");
+    jpeg_encap_unwind(tif);
 }
 
 #ifdef _MSC_VER
-#pragma warning( push )
-#pragma warning( disable : 4702 ) /* unreachable code */
+#pragma warning(push)
+#pragma warning(disable : 4702) /* unreachable code */
 #endif
 static boolean
-OJPEGLibjpegJpegSourceMgrResyncToRestart(jpeg_decompress_struct* cinfo, int desired)
+OJPEGLibjpegJpegSourceMgrResyncToRestart(jpeg_decompress_struct *cinfo,
+                                         int desired)
 {
-	TIFF* tif=(TIFF*)cinfo->client_data;
-	(void)desired;
-	TIFFErrorExt(tif->tif_clientdata,"LibJpeg","Unexpected error");
-	jpeg_encap_unwind(tif);
-	return(0);
+    TIFF *tif = (TIFF *)cinfo->client_data;
+    (void)desired;
+    TIFFErrorExtR(tif, "LibJpeg", "Unexpected error");
+    jpeg_encap_unwind(tif);
+    return (0);
 }
 #ifdef _MSC_VER
-#pragma warning( pop ) 
+#pragma warning(pop)
 #endif
 
-static void
-OJPEGLibjpegJpegSourceMgrTermSource(jpeg_decompress_struct* cinfo)
+static void OJPEGLibjpegJpegSourceMgrTermSource(jpeg_decompress_struct *cinfo)
 {
-	(void)cinfo;
+    (void)cinfo;
 }
 
 #endif
-
-
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_open.c b/3rdparty/libtiff/tif_open.c
index a0e31583a61c..23fcf81c43f0 100644
--- a/3rdparty/libtiff/tif_open.c
+++ b/3rdparty/libtiff/tif_open.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -26,481 +26,626 @@
  * TIFF Library.
  */
 #include "tiffiop.h"
+#include <limits.h>
 
 /*
  * Dummy functions to fill the omitted client procedures.
  */
-static int
-_tiffDummyMapProc(thandle_t fd, void** pbase, toff_t* psize)
+static int _tiffDummyMapProc(thandle_t fd, void **pbase, toff_t *psize)
+{
+    (void)fd;
+    (void)pbase;
+    (void)psize;
+    return (0);
+}
+
+static void _tiffDummyUnmapProc(thandle_t fd, void *base, toff_t size)
+{
+    (void)fd;
+    (void)base;
+    (void)size;
+}
+
+int _TIFFgetMode(TIFFOpenOptions *opts, thandle_t clientdata, const char *mode,
+                 const char *module)
 {
-	(void) fd; (void) pbase; (void) psize;
-	return (0);
+    int m = -1;
+
+    switch (mode[0])
+    {
+        case 'r':
+            m = O_RDONLY;
+            if (mode[1] == '+')
+                m = O_RDWR;
+            break;
+        case 'w':
+        case 'a':
+            m = O_RDWR | O_CREAT;
+            if (mode[0] == 'w')
+                m |= O_TRUNC;
+            break;
+        default:
+            _TIFFErrorEarly(opts, clientdata, module, "\"%s\": Bad mode", mode);
+            break;
+    }
+    return (m);
 }
 
-static void
-_tiffDummyUnmapProc(thandle_t fd, void* base, toff_t size)
+TIFFOpenOptions *TIFFOpenOptionsAlloc()
 {
-	(void) fd; (void) base; (void) size;
+    TIFFOpenOptions *opts =
+        (TIFFOpenOptions *)_TIFFcalloc(1, sizeof(TIFFOpenOptions));
+    return opts;
 }
 
-int
-_TIFFgetMode(const char* mode, const char* module)
+void TIFFOpenOptionsFree(TIFFOpenOptions *opts) { _TIFFfree(opts); }
+
+/** Define a limit in bytes for a single memory allocation done by libtiff.
+ *  If max_single_mem_alloc is set to 0, no other limit that the underlying
+ *  _TIFFmalloc() will be applied, which is the default.
+ */
+void TIFFOpenOptionsSetMaxSingleMemAlloc(TIFFOpenOptions *opts,
+                                         tmsize_t max_single_mem_alloc)
 {
-	int m = -1;
-
-	switch (mode[0]) {
-	case 'r':
-		m = O_RDONLY;
-		if (mode[1] == '+')
-			m = O_RDWR;
-		break;
-	case 'w':
-	case 'a':
-		m = O_RDWR|O_CREAT;
-		if (mode[0] == 'w')
-			m |= O_TRUNC;
-		break;
-	default:
-		TIFFErrorExt(0, module, "\"%s\": Bad mode", mode);
-		break;
-	}
-	return (m);
+    opts->max_single_mem_alloc = max_single_mem_alloc;
 }
 
-TIFF*
-TIFFClientOpen(
-	const char* name, const char* mode,
-	thandle_t clientdata,
-	TIFFReadWriteProc readproc,
-	TIFFReadWriteProc writeproc,
-	TIFFSeekProc seekproc,
-	TIFFCloseProc closeproc,
-	TIFFSizeProc sizeproc,
-	TIFFMapFileProc mapproc,
-	TIFFUnmapFileProc unmapproc
-)
+void TIFFOpenOptionsSetErrorHandlerExtR(TIFFOpenOptions *opts,
+                                        TIFFErrorHandlerExtR handler,
+                                        void *errorhandler_user_data)
 {
-	static const char module[] = "TIFFClientOpen";
-	TIFF *tif;
-	int m;
-	const char* cp;
-
-	/* The following are configuration checks. They should be redundant, but should not
-	 * compile to any actual code in an optimised release build anyway. If any of them
-	 * fail, (makefile-based or other) configuration is not correct */
-	assert(sizeof(uint8)==1);
-	assert(sizeof(int8)==1);
-	assert(sizeof(uint16)==2);
-	assert(sizeof(int16)==2);
-	assert(sizeof(uint32)==4);
-	assert(sizeof(int32)==4);
-	assert(sizeof(uint64)==8);
-	assert(sizeof(int64)==8);
-	assert(sizeof(tmsize_t)==sizeof(void*));
-	{
-		union{
-			uint8 a8[2];
-			uint16 a16;
-		} n;
-		n.a8[0]=1;
-		n.a8[1]=0;
-                (void)n;
-		#ifdef WORDS_BIGENDIAN
-		assert(n.a16==256);
-		#else
-		assert(n.a16==1);
-		#endif
-	}
-
-	m = _TIFFgetMode(mode, module);
-	if (m == -1)
-		goto bad2;
-	tif = (TIFF *)_TIFFmalloc((tmsize_t)(sizeof (TIFF) + strlen(name) + 1));
-	if (tif == NULL) {
-		TIFFErrorExt(clientdata, module, "%s: Out of memory (TIFF structure)", name);
-		goto bad2;
-	}
-	_TIFFmemset(tif, 0, sizeof (*tif));
-	tif->tif_name = (char *)tif + sizeof (TIFF);
-	strcpy(tif->tif_name, name);
-	tif->tif_mode = m &~ (O_CREAT|O_TRUNC);
-	tif->tif_curdir = (uint16) -1;		/* non-existent directory */
-	tif->tif_curoff = 0;
-	tif->tif_curstrip = (uint32) -1;	/* invalid strip */
-	tif->tif_row = (uint32) -1;		/* read/write pre-increment */
-	tif->tif_clientdata = clientdata;
-	if (!readproc || !writeproc || !seekproc || !closeproc || !sizeproc) {
-		TIFFErrorExt(clientdata, module,
-		    "One of the client procedures is NULL pointer.");
-		_TIFFfree(tif);
-		goto bad2;
-	}
-	tif->tif_readproc = readproc;
-	tif->tif_writeproc = writeproc;
-	tif->tif_seekproc = seekproc;
-	tif->tif_closeproc = closeproc;
-	tif->tif_sizeproc = sizeproc;
-	if (mapproc)
-		tif->tif_mapproc = mapproc;
-	else
-		tif->tif_mapproc = _tiffDummyMapProc;
-	if (unmapproc)
-		tif->tif_unmapproc = unmapproc;
-	else
-		tif->tif_unmapproc = _tiffDummyUnmapProc;
-	_TIFFSetDefaultCompressionState(tif);    /* setup default state */
-	/*
-	 * Default is to return data MSB2LSB and enable the
-	 * use of memory-mapped files and strip chopping when
-	 * a file is opened read-only.
-	 */
-	tif->tif_flags = FILLORDER_MSB2LSB;
-	if (m == O_RDONLY )
-		tif->tif_flags |= TIFF_MAPPED;
-
-	#ifdef STRIPCHOP_DEFAULT
-	if (m == O_RDONLY || m == O_RDWR)
-		tif->tif_flags |= STRIPCHOP_DEFAULT;
-	#endif
-
-	/*
-	 * Process library-specific flags in the open mode string.
-	 * The following flags may be used to control intrinsic library
-	 * behavior that may or may not be desirable (usually for
-	 * compatibility with some application that claims to support
-	 * TIFF but only supports some brain dead idea of what the
-	 * vendor thinks TIFF is):
-	 *
-	 * 'l' use little-endian byte order for creating a file
-	 * 'b' use big-endian byte order for creating a file
-	 * 'L' read/write information using LSB2MSB bit order
-	 * 'B' read/write information using MSB2LSB bit order
-	 * 'H' read/write information using host bit order
-	 * 'M' enable use of memory-mapped files when supported
-	 * 'm' disable use of memory-mapped files
-	 * 'C' enable strip chopping support when reading
-	 * 'c' disable strip chopping support
-	 * 'h' read TIFF header only, do not load the first IFD
-	 * '4' ClassicTIFF for creating a file (default)
-	 * '8' BigTIFF for creating a file
-         * 'D' enable use of deferred strip/tile offset/bytecount array loading.
-         * 'O' on-demand loading of values instead of whole array loading (implies D)
-	 *
-	 * The use of the 'l' and 'b' flags is strongly discouraged.
-	 * These flags are provided solely because numerous vendors,
-	 * typically on the PC, do not correctly support TIFF; they
-	 * only support the Intel little-endian byte order.  This
-	 * support is not configured by default because it supports
-	 * the violation of the TIFF spec that says that readers *MUST*
-	 * support both byte orders.  It is strongly recommended that
-	 * you not use this feature except to deal with busted apps
-	 * that write invalid TIFF.  And even in those cases you should
-	 * bang on the vendors to fix their software.
-	 *
-	 * The 'L', 'B', and 'H' flags are intended for applications
-	 * that can optimize operations on data by using a particular
-	 * bit order.  By default the library returns data in MSB2LSB
-	 * bit order for compatibility with older versions of this
-	 * library.  Returning data in the bit order of the native CPU
-	 * makes the most sense but also requires applications to check
-	 * the value of the FillOrder tag; something they probably do
-	 * not do right now.
-	 *
-	 * The 'M' and 'm' flags are provided because some virtual memory
-	 * systems exhibit poor behavior when large images are mapped.
-	 * These options permit clients to control the use of memory-mapped
-	 * files on a per-file basis.
-	 *
-	 * The 'C' and 'c' flags are provided because the library support
-	 * for chopping up large strips into multiple smaller strips is not
-	 * application-transparent and as such can cause problems.  The 'c'
-	 * option permits applications that only want to look at the tags,
-	 * for example, to get the unadulterated TIFF tag information.
-	 */
-	for (cp = mode; *cp; cp++)
-		switch (*cp) {
-			case 'b':
-				#ifndef WORDS_BIGENDIAN
-				if (m&O_CREAT)
-					tif->tif_flags |= TIFF_SWAB;
-				#endif
-				break;
-			case 'l':
-				#ifdef WORDS_BIGENDIAN
-				if ((m&O_CREAT))
-					tif->tif_flags |= TIFF_SWAB;
-				#endif
-				break;
-			case 'B':
-				tif->tif_flags = (tif->tif_flags &~ TIFF_FILLORDER) |
-				    FILLORDER_MSB2LSB;
-				break;
-			case 'L':
-				tif->tif_flags = (tif->tif_flags &~ TIFF_FILLORDER) |
-				    FILLORDER_LSB2MSB;
-				break;
-			case 'H':
-				tif->tif_flags = (tif->tif_flags &~ TIFF_FILLORDER) |
-				    HOST_FILLORDER;
-				break;
-			case 'M':
-				if (m == O_RDONLY)
-					tif->tif_flags |= TIFF_MAPPED;
-				break;
-			case 'm':
-				if (m == O_RDONLY)
-					tif->tif_flags &= ~TIFF_MAPPED;
-				break;
-			case 'C':
-				if (m == O_RDONLY)
-					tif->tif_flags |= TIFF_STRIPCHOP;
-				break;
-			case 'c':
-				if (m == O_RDONLY)
-					tif->tif_flags &= ~TIFF_STRIPCHOP;
-				break;
-			case 'h':
-				tif->tif_flags |= TIFF_HEADERONLY;
-				break;
-			case '8':
-				if (m&O_CREAT)
-					tif->tif_flags |= TIFF_BIGTIFF;
-				break;
-			case 'D':
-			        tif->tif_flags |= TIFF_DEFERSTRILELOAD;
-				break;
-			case 'O':
-				if( m == O_RDONLY )
-					tif->tif_flags |= (TIFF_LAZYSTRILELOAD | TIFF_DEFERSTRILELOAD);
-				break;
-		}
+    opts->errorhandler = handler;
+    opts->errorhandler_user_data = errorhandler_user_data;
+}
+
+void TIFFOpenOptionsSetWarningHandlerExtR(TIFFOpenOptions *opts,
+                                          TIFFErrorHandlerExtR handler,
+                                          void *warnhandler_user_data)
+{
+    opts->warnhandler = handler;
+    opts->warnhandler_user_data = warnhandler_user_data;
+}
+
+static void _TIFFEmitErrorAboveMaxSingleMemAlloc(TIFF *tif,
+                                                 const char *pszFunction,
+                                                 tmsize_t s)
+{
+    TIFFErrorExtR(tif, pszFunction,
+                  "Memory allocation of %" PRIu64
+                  " bytes is beyond the %" PRIu64
+                  " byte limit defined in open options",
+                  (uint64_t)s, (uint64_t)tif->tif_max_single_mem_alloc);
+}
+
+/** malloc() version that takes into account memory-specific open options */
+void *_TIFFmallocExt(TIFF *tif, tmsize_t s)
+{
+    if (tif != NULL && tif->tif_max_single_mem_alloc > 0 &&
+        s > tif->tif_max_single_mem_alloc)
+    {
+        _TIFFEmitErrorAboveMaxSingleMemAlloc(tif, "_TIFFmallocExt", s);
+        return NULL;
+    }
+    return _TIFFmalloc(s);
+}
+
+/** calloc() version that takes into account memory-specific open options */
+void *_TIFFcallocExt(TIFF *tif, tmsize_t nmemb, tmsize_t siz)
+{
+    if (tif != NULL && tif->tif_max_single_mem_alloc > 0)
+    {
+        if (nmemb <= 0 || siz <= 0 || nmemb > TIFF_TMSIZE_T_MAX / siz)
+            return NULL;
+        if (nmemb * siz > tif->tif_max_single_mem_alloc)
+        {
+            _TIFFEmitErrorAboveMaxSingleMemAlloc(tif, "_TIFFcallocExt",
+                                                 nmemb * siz);
+            return NULL;
+        }
+    }
+    return _TIFFcalloc(nmemb, siz);
+}
+
+/** realloc() version that takes into account memory-specific open options */
+void *_TIFFreallocExt(TIFF *tif, void *p, tmsize_t s)
+{
+    if (tif != NULL && tif->tif_max_single_mem_alloc > 0 &&
+        s > tif->tif_max_single_mem_alloc)
+    {
+        _TIFFEmitErrorAboveMaxSingleMemAlloc(tif, "_TIFFreallocExt", s);
+        return NULL;
+    }
+    return _TIFFrealloc(p, s);
+}
+
+/** free() version that takes into account memory-specific open options */
+void _TIFFfreeExt(TIFF *tif, void *p)
+{
+    (void)tif;
+    _TIFFfree(p);
+}
+
+TIFF *TIFFClientOpen(const char *name, const char *mode, thandle_t clientdata,
+                     TIFFReadWriteProc readproc, TIFFReadWriteProc writeproc,
+                     TIFFSeekProc seekproc, TIFFCloseProc closeproc,
+                     TIFFSizeProc sizeproc, TIFFMapFileProc mapproc,
+                     TIFFUnmapFileProc unmapproc)
+{
+    return TIFFClientOpenExt(name, mode, clientdata, readproc, writeproc,
+                             seekproc, closeproc, sizeproc, mapproc, unmapproc,
+                             NULL);
+}
+
+TIFF *TIFFClientOpenExt(const char *name, const char *mode,
+                        thandle_t clientdata, TIFFReadWriteProc readproc,
+                        TIFFReadWriteProc writeproc, TIFFSeekProc seekproc,
+                        TIFFCloseProc closeproc, TIFFSizeProc sizeproc,
+                        TIFFMapFileProc mapproc, TIFFUnmapFileProc unmapproc,
+                        TIFFOpenOptions *opts)
+{
+    static const char module[] = "TIFFClientOpenExt";
+    TIFF *tif;
+    int m;
+    const char *cp;
+
+    /* The following are configuration checks. They should be redundant, but
+     * should not compile to any actual code in an optimised release build
+     * anyway. If any of them fail, (makefile-based or other) configuration is
+     * not correct */
+    assert(sizeof(uint8_t) == 1);
+    assert(sizeof(int8_t) == 1);
+    assert(sizeof(uint16_t) == 2);
+    assert(sizeof(int16_t) == 2);
+    assert(sizeof(uint32_t) == 4);
+    assert(sizeof(int32_t) == 4);
+    assert(sizeof(uint64_t) == 8);
+    assert(sizeof(int64_t) == 8);
+    {
+        union
+        {
+            uint8_t a8[2];
+            uint16_t a16;
+        } n;
+        n.a8[0] = 1;
+        n.a8[1] = 0;
+        (void)n;
+#ifdef WORDS_BIGENDIAN
+        assert(n.a16 == 256);
+#else
+        assert(n.a16 == 1);
+#endif
+    }
+
+    m = _TIFFgetMode(opts, clientdata, mode, module);
+    if (m == -1)
+        goto bad2;
+    tmsize_t size_to_alloc = (tmsize_t)(sizeof(TIFF) + strlen(name) + 1);
+    if (opts && opts->max_single_mem_alloc > 0 &&
+        size_to_alloc > opts->max_single_mem_alloc)
+    {
+        _TIFFErrorEarly(opts, clientdata, module,
+                        "%s: Memory allocation of %" PRIu64
+                        " bytes is beyond the %" PRIu64
+                        " byte limit defined in open options",
+                        name, (uint64_t)size_to_alloc,
+                        (uint64_t)opts->max_single_mem_alloc);
+        goto bad2;
+    }
+    tif = (TIFF *)_TIFFmallocExt(NULL, size_to_alloc);
+    if (tif == NULL)
+    {
+        _TIFFErrorEarly(opts, clientdata, module,
+                        "%s: Out of memory (TIFF structure)", name);
+        goto bad2;
+    }
+    _TIFFmemset(tif, 0, sizeof(*tif));
+    tif->tif_name = (char *)tif + sizeof(TIFF);
+    strcpy(tif->tif_name, name);
+    tif->tif_mode = m & ~(O_CREAT | O_TRUNC);
+    tif->tif_curdir = TIFF_NON_EXISTENT_DIR_NUMBER; /* non-existent directory */
+    tif->tif_curoff = 0;
+    tif->tif_curstrip = (uint32_t)-1; /* invalid strip */
+    tif->tif_row = (uint32_t)-1;      /* read/write pre-increment */
+    tif->tif_clientdata = clientdata;
+    tif->tif_readproc = readproc;
+    tif->tif_writeproc = writeproc;
+    tif->tif_seekproc = seekproc;
+    tif->tif_closeproc = closeproc;
+    tif->tif_sizeproc = sizeproc;
+    tif->tif_mapproc = mapproc ? mapproc : _tiffDummyMapProc;
+    tif->tif_unmapproc = unmapproc ? unmapproc : _tiffDummyUnmapProc;
+    if (opts)
+    {
+        tif->tif_errorhandler = opts->errorhandler;
+        tif->tif_errorhandler_user_data = opts->errorhandler_user_data;
+        tif->tif_warnhandler = opts->warnhandler;
+        tif->tif_warnhandler_user_data = opts->warnhandler_user_data;
+        tif->tif_max_single_mem_alloc = opts->max_single_mem_alloc;
+    }
+
+    if (!readproc || !writeproc || !seekproc || !closeproc || !sizeproc)
+    {
+        TIFFErrorExtR(tif, module,
+                      "One of the client procedures is NULL pointer.");
+        _TIFFfreeExt(NULL, tif);
+        goto bad2;
+    }
+
+    _TIFFSetDefaultCompressionState(tif); /* setup default state */
+    /*
+     * Default is to return data MSB2LSB and enable the
+     * use of memory-mapped files and strip chopping when
+     * a file is opened read-only.
+     */
+    tif->tif_flags = FILLORDER_MSB2LSB;
+    if (m == O_RDONLY)
+        tif->tif_flags |= TIFF_MAPPED;
+
+#ifdef STRIPCHOP_DEFAULT
+    if (m == O_RDONLY || m == O_RDWR)
+        tif->tif_flags |= STRIPCHOP_DEFAULT;
+#endif
+
+    /*
+     * Process library-specific flags in the open mode string.
+     * The following flags may be used to control intrinsic library
+     * behavior that may or may not be desirable (usually for
+     * compatibility with some application that claims to support
+     * TIFF but only supports some brain dead idea of what the
+     * vendor thinks TIFF is):
+     *
+     * 'l' use little-endian byte order for creating a file
+     * 'b' use big-endian byte order for creating a file
+     * 'L' read/write information using LSB2MSB bit order
+     * 'B' read/write information using MSB2LSB bit order
+     * 'H' read/write information using host bit order
+     * 'M' enable use of memory-mapped files when supported
+     * 'm' disable use of memory-mapped files
+     * 'C' enable strip chopping support when reading
+     * 'c' disable strip chopping support
+     * 'h' read TIFF header only, do not load the first IFD
+     * '4' ClassicTIFF for creating a file (default)
+     * '8' BigTIFF for creating a file
+     * 'D' enable use of deferred strip/tile offset/bytecount array loading.
+     * 'O' on-demand loading of values instead of whole array loading (implies
+     * D)
+     *
+     * The use of the 'l' and 'b' flags is strongly discouraged.
+     * These flags are provided solely because numerous vendors,
+     * typically on the PC, do not correctly support TIFF; they
+     * only support the Intel little-endian byte order.  This
+     * support is not configured by default because it supports
+     * the violation of the TIFF spec that says that readers *MUST*
+     * support both byte orders.  It is strongly recommended that
+     * you not use this feature except to deal with busted apps
+     * that write invalid TIFF.  And even in those cases you should
+     * bang on the vendors to fix their software.
+     *
+     * The 'L', 'B', and 'H' flags are intended for applications
+     * that can optimize operations on data by using a particular
+     * bit order.  By default the library returns data in MSB2LSB
+     * bit order for compatibility with older versions of this
+     * library.  Returning data in the bit order of the native CPU
+     * makes the most sense but also requires applications to check
+     * the value of the FillOrder tag; something they probably do
+     * not do right now.
+     *
+     * The 'M' and 'm' flags are provided because some virtual memory
+     * systems exhibit poor behavior when large images are mapped.
+     * These options permit clients to control the use of memory-mapped
+     * files on a per-file basis.
+     *
+     * The 'C' and 'c' flags are provided because the library support
+     * for chopping up large strips into multiple smaller strips is not
+     * application-transparent and as such can cause problems.  The 'c'
+     * option permits applications that only want to look at the tags,
+     * for example, to get the unadulterated TIFF tag information.
+     */
+    for (cp = mode; *cp; cp++)
+        switch (*cp)
+        {
+            case 'b':
+#ifndef WORDS_BIGENDIAN
+                if (m & O_CREAT)
+                    tif->tif_flags |= TIFF_SWAB;
+#endif
+                break;
+            case 'l':
+#ifdef WORDS_BIGENDIAN
+                if ((m & O_CREAT))
+                    tif->tif_flags |= TIFF_SWAB;
+#endif
+                break;
+            case 'B':
+                tif->tif_flags =
+                    (tif->tif_flags & ~TIFF_FILLORDER) | FILLORDER_MSB2LSB;
+                break;
+            case 'L':
+                tif->tif_flags =
+                    (tif->tif_flags & ~TIFF_FILLORDER) | FILLORDER_LSB2MSB;
+                break;
+            case 'H':
+                TIFFWarningExtR(tif, name,
+                                "H(ost) mode is deprecated. Since "
+                                "libtiff 4.5.1, it is an alias of 'B' / "
+                                "FILLORDER_MSB2LSB.");
+                tif->tif_flags =
+                    (tif->tif_flags & ~TIFF_FILLORDER) | FILLORDER_MSB2LSB;
+                break;
+            case 'M':
+                if (m == O_RDONLY)
+                    tif->tif_flags |= TIFF_MAPPED;
+                break;
+            case 'm':
+                if (m == O_RDONLY)
+                    tif->tif_flags &= ~TIFF_MAPPED;
+                break;
+            case 'C':
+                if (m == O_RDONLY)
+                    tif->tif_flags |= TIFF_STRIPCHOP;
+                break;
+            case 'c':
+                if (m == O_RDONLY)
+                    tif->tif_flags &= ~TIFF_STRIPCHOP;
+                break;
+            case 'h':
+                tif->tif_flags |= TIFF_HEADERONLY;
+                break;
+            case '8':
+                if (m & O_CREAT)
+                    tif->tif_flags |= TIFF_BIGTIFF;
+                break;
+            case 'D':
+                tif->tif_flags |= TIFF_DEFERSTRILELOAD;
+                break;
+            case 'O':
+                if (m == O_RDONLY)
+                    tif->tif_flags |=
+                        (TIFF_LAZYSTRILELOAD | TIFF_DEFERSTRILELOAD);
+                break;
+        }
 
 #ifdef DEFER_STRILE_LOAD
-        /* Compatibility with old DEFER_STRILE_LOAD compilation flag */
-        /* Probably unneeded, since to the best of my knowledge (E. Rouault) */
-        /* GDAL was the only user of this, and will now use the new 'D' flag */
-        tif->tif_flags |= TIFF_DEFERSTRILELOAD;
+    /* Compatibility with old DEFER_STRILE_LOAD compilation flag */
+    /* Probably unneeded, since to the best of my knowledge (E. Rouault) */
+    /* GDAL was the only user of this, and will now use the new 'D' flag */
+    tif->tif_flags |= TIFF_DEFERSTRILELOAD;
 #endif
 
-	/*
-	 * Read in TIFF header.
-	 */
-	if ((m & O_TRUNC) ||
-	    !ReadOK(tif, &tif->tif_header, sizeof (TIFFHeaderClassic))) {
-		if (tif->tif_mode == O_RDONLY) {
-			TIFFErrorExt(tif->tif_clientdata, name,
-			    "Cannot read TIFF header");
-			goto bad;
-		}
-		/*
-		 * Setup header and write.
-		 */
-		#ifdef WORDS_BIGENDIAN
-		tif->tif_header.common.tiff_magic = (tif->tif_flags & TIFF_SWAB)
-		    ? TIFF_LITTLEENDIAN : TIFF_BIGENDIAN;
-		#else
-		tif->tif_header.common.tiff_magic = (tif->tif_flags & TIFF_SWAB)
-		    ? TIFF_BIGENDIAN : TIFF_LITTLEENDIAN;
-		#endif
-		if (!(tif->tif_flags&TIFF_BIGTIFF))
-		{
-			tif->tif_header.common.tiff_version = TIFF_VERSION_CLASSIC;
-			tif->tif_header.classic.tiff_diroff = 0;
-			if (tif->tif_flags & TIFF_SWAB)
-				TIFFSwabShort(&tif->tif_header.common.tiff_version);
-			tif->tif_header_size = sizeof(TIFFHeaderClassic);
-		}
-		else
-		{
-			tif->tif_header.common.tiff_version = TIFF_VERSION_BIG;
-			tif->tif_header.big.tiff_offsetsize = 8;
-			tif->tif_header.big.tiff_unused = 0;
-			tif->tif_header.big.tiff_diroff = 0;
-			if (tif->tif_flags & TIFF_SWAB)
-			{
-				TIFFSwabShort(&tif->tif_header.common.tiff_version);
-				TIFFSwabShort(&tif->tif_header.big.tiff_offsetsize);
-			}
-			tif->tif_header_size = sizeof (TIFFHeaderBig);
-		}
-		/*
-		 * The doc for "fopen" for some STD_C_LIBs says that if you
-		 * open a file for modify ("+"), then you must fseek (or
-		 * fflush?) between any freads and fwrites.  This is not
-		 * necessary on most systems, but has been shown to be needed
-		 * on Solaris.
-		 */
-		TIFFSeekFile( tif, 0, SEEK_SET );
-		if (!WriteOK(tif, &tif->tif_header, (tmsize_t)(tif->tif_header_size))) {
-			TIFFErrorExt(tif->tif_clientdata, name,
-			    "Error writing TIFF header");
-			goto bad;
-		}
-		/*
-		 * Setup the byte order handling.
-		 */
-		if (tif->tif_header.common.tiff_magic == TIFF_BIGENDIAN) {
-			#ifndef WORDS_BIGENDIAN
-			tif->tif_flags |= TIFF_SWAB;
-			#endif
-		} else {
-			#ifdef WORDS_BIGENDIAN
-			tif->tif_flags |= TIFF_SWAB;
-			#endif
-		}
-		/*
-		 * Setup default directory.
-		 */
-		if (!TIFFDefaultDirectory(tif))
-			goto bad;
-		tif->tif_diroff = 0;
-		tif->tif_dirlist = NULL;
-		tif->tif_dirlistsize = 0;
-		tif->tif_dirnumber = 0;
-		return (tif);
-	}
-	/*
-	 * Setup the byte order handling.
-	 */
-	if (tif->tif_header.common.tiff_magic != TIFF_BIGENDIAN &&
-	    tif->tif_header.common.tiff_magic != TIFF_LITTLEENDIAN
-	    #if MDI_SUPPORT
-	    &&
-	    #if HOST_BIGENDIAN
-	    tif->tif_header.common.tiff_magic != MDI_BIGENDIAN
-	    #else
-	    tif->tif_header.common.tiff_magic != MDI_LITTLEENDIAN
-	    #endif
-	    ) {
-		TIFFErrorExt(tif->tif_clientdata, name,
-		    "Not a TIFF or MDI file, bad magic number %d (0x%x)",
-	    #else
-	    ) {
-		TIFFErrorExt(tif->tif_clientdata, name,
-		    "Not a TIFF file, bad magic number %d (0x%x)",
-	    #endif
-		    tif->tif_header.common.tiff_magic,
-		    tif->tif_header.common.tiff_magic);
-		goto bad;
-	}
-	if (tif->tif_header.common.tiff_magic == TIFF_BIGENDIAN) {
-		#ifndef WORDS_BIGENDIAN
-		tif->tif_flags |= TIFF_SWAB;
-		#endif
-	} else {
-		#ifdef WORDS_BIGENDIAN
-		tif->tif_flags |= TIFF_SWAB;
-		#endif
-	}
-	if (tif->tif_flags & TIFF_SWAB) 
-		TIFFSwabShort(&tif->tif_header.common.tiff_version);
-	if ((tif->tif_header.common.tiff_version != TIFF_VERSION_CLASSIC)&&
-	    (tif->tif_header.common.tiff_version != TIFF_VERSION_BIG)) {
-		TIFFErrorExt(tif->tif_clientdata, name,
-		    "Not a TIFF file, bad version number %d (0x%x)",
-		    tif->tif_header.common.tiff_version,
-		    tif->tif_header.common.tiff_version);
-		goto bad;
-	}
-	if (tif->tif_header.common.tiff_version == TIFF_VERSION_CLASSIC)
-	{
-		if (tif->tif_flags & TIFF_SWAB)
-			TIFFSwabLong(&tif->tif_header.classic.tiff_diroff);
-		tif->tif_header_size = sizeof(TIFFHeaderClassic);
-	}
-	else
-	{
-		if (!ReadOK(tif, ((uint8*)(&tif->tif_header) + sizeof(TIFFHeaderClassic)), (sizeof(TIFFHeaderBig)-sizeof(TIFFHeaderClassic))))
-		{
-			TIFFErrorExt(tif->tif_clientdata, name,
-			    "Cannot read TIFF header");
-			goto bad;
-		}
-		if (tif->tif_flags & TIFF_SWAB)
-		{
-			TIFFSwabShort(&tif->tif_header.big.tiff_offsetsize);
-			TIFFSwabLong8(&tif->tif_header.big.tiff_diroff);
-		}
-		if (tif->tif_header.big.tiff_offsetsize != 8)
-		{
-			TIFFErrorExt(tif->tif_clientdata, name,
-			    "Not a TIFF file, bad BigTIFF offsetsize %d (0x%x)",
-			    tif->tif_header.big.tiff_offsetsize,
-			    tif->tif_header.big.tiff_offsetsize);
-			goto bad;
-		}
-		if (tif->tif_header.big.tiff_unused != 0)
-		{
-			TIFFErrorExt(tif->tif_clientdata, name,
-			    "Not a TIFF file, bad BigTIFF unused %d (0x%x)",
-			    tif->tif_header.big.tiff_unused,
-			    tif->tif_header.big.tiff_unused);
-			goto bad;
-		}
-		tif->tif_header_size = sizeof(TIFFHeaderBig);
-		tif->tif_flags |= TIFF_BIGTIFF;
-	}
-	tif->tif_flags |= TIFF_MYBUFFER;
-	tif->tif_rawcp = tif->tif_rawdata = 0;
-	tif->tif_rawdatasize = 0;
-        tif->tif_rawdataoff = 0;
-        tif->tif_rawdataloaded = 0;
-
-	switch (mode[0]) {
-		case 'r':
-			if (!(tif->tif_flags&TIFF_BIGTIFF))
-				tif->tif_nextdiroff = tif->tif_header.classic.tiff_diroff;
-			else
-				tif->tif_nextdiroff = tif->tif_header.big.tiff_diroff;
-			/*
-			 * Try to use a memory-mapped file if the client
-			 * has not explicitly suppressed usage with the
-			 * 'm' flag in the open mode (see above).
-			 */
-			if (tif->tif_flags & TIFF_MAPPED)
-			{
-				toff_t n;
-				if (TIFFMapFileContents(tif,(void**)(&tif->tif_base),&n))
-				{
-					tif->tif_size=(tmsize_t)n;
-					assert((toff_t)tif->tif_size==n);
-				}
-				else
-					tif->tif_flags &= ~TIFF_MAPPED;
-			}
-			/*
-			 * Sometimes we do not want to read the first directory (for example,
-			 * it may be broken) and want to proceed to other directories. I this
-			 * case we use the TIFF_HEADERONLY flag to open file and return
-			 * immediately after reading TIFF header.
-			 */
-			if (tif->tif_flags & TIFF_HEADERONLY)
-				return (tif);
-
-			/*
-			 * Setup initial directory.
-			 */
-			if (TIFFReadDirectory(tif)) {
-				tif->tif_rawcc = (tmsize_t)-1;
-				tif->tif_flags |= TIFF_BUFFERSETUP;
-				return (tif);
-			}
-			break;
-		case 'a':
-			/*
-			 * New directories are automatically append
-			 * to the end of the directory chain when they
-			 * are written out (see TIFFWriteDirectory).
-			 */
-			if (!TIFFDefaultDirectory(tif))
-				goto bad;
-			return (tif);
-	}
+    /*
+     * Read in TIFF header.
+     */
+    if ((m & O_TRUNC) ||
+        !ReadOK(tif, &tif->tif_header, sizeof(TIFFHeaderClassic)))
+    {
+        if (tif->tif_mode == O_RDONLY)
+        {
+            TIFFErrorExtR(tif, name, "Cannot read TIFF header");
+            goto bad;
+        }
+/*
+ * Setup header and write.
+ */
+#ifdef WORDS_BIGENDIAN
+        tif->tif_header.common.tiff_magic =
+            (tif->tif_flags & TIFF_SWAB) ? TIFF_LITTLEENDIAN : TIFF_BIGENDIAN;
+#else
+        tif->tif_header.common.tiff_magic =
+            (tif->tif_flags & TIFF_SWAB) ? TIFF_BIGENDIAN : TIFF_LITTLEENDIAN;
+#endif
+        if (!(tif->tif_flags & TIFF_BIGTIFF))
+        {
+            tif->tif_header.common.tiff_version = TIFF_VERSION_CLASSIC;
+            tif->tif_header.classic.tiff_diroff = 0;
+            if (tif->tif_flags & TIFF_SWAB)
+                TIFFSwabShort(&tif->tif_header.common.tiff_version);
+            tif->tif_header_size = sizeof(TIFFHeaderClassic);
+        }
+        else
+        {
+            tif->tif_header.common.tiff_version = TIFF_VERSION_BIG;
+            tif->tif_header.big.tiff_offsetsize = 8;
+            tif->tif_header.big.tiff_unused = 0;
+            tif->tif_header.big.tiff_diroff = 0;
+            if (tif->tif_flags & TIFF_SWAB)
+            {
+                TIFFSwabShort(&tif->tif_header.common.tiff_version);
+                TIFFSwabShort(&tif->tif_header.big.tiff_offsetsize);
+            }
+            tif->tif_header_size = sizeof(TIFFHeaderBig);
+        }
+        /*
+         * The doc for "fopen" for some STD_C_LIBs says that if you
+         * open a file for modify ("+"), then you must fseek (or
+         * fflush?) between any freads and fwrites.  This is not
+         * necessary on most systems, but has been shown to be needed
+         * on Solaris.
+         */
+        TIFFSeekFile(tif, 0, SEEK_SET);
+        if (!WriteOK(tif, &tif->tif_header, (tmsize_t)(tif->tif_header_size)))
+        {
+            TIFFErrorExtR(tif, name, "Error writing TIFF header");
+            goto bad;
+        }
+        /*
+         * Setup the byte order handling.
+         */
+        if (tif->tif_header.common.tiff_magic == TIFF_BIGENDIAN)
+        {
+#ifndef WORDS_BIGENDIAN
+            tif->tif_flags |= TIFF_SWAB;
+#endif
+        }
+        else
+        {
+#ifdef WORDS_BIGENDIAN
+            tif->tif_flags |= TIFF_SWAB;
+#endif
+        }
+        /*
+         * Setup default directory.
+         */
+        if (!TIFFDefaultDirectory(tif))
+            goto bad;
+        tif->tif_diroff = 0;
+        tif->tif_lastdiroff = 0;
+        tif->tif_setdirectory_force_absolute = FALSE;
+        return (tif);
+    }
+    /*
+     * Setup the byte order handling.
+     */
+    if (tif->tif_header.common.tiff_magic != TIFF_BIGENDIAN &&
+        tif->tif_header.common.tiff_magic != TIFF_LITTLEENDIAN
+#if MDI_SUPPORT
+        &&
+#if HOST_BIGENDIAN
+        tif->tif_header.common.tiff_magic != MDI_BIGENDIAN
+#else
+        tif->tif_header.common.tiff_magic != MDI_LITTLEENDIAN
+#endif
+    )
+    {
+        TIFFErrorExtR(tif, name,
+                      "Not a TIFF or MDI file, bad magic number %" PRIu16
+                      " (0x%" PRIx16 ")",
+#else
+    )
+    {
+        TIFFErrorExtR(tif, name,
+                      "Not a TIFF file, bad magic number %" PRIu16
+                      " (0x%" PRIx16 ")",
+#endif
+                      tif->tif_header.common.tiff_magic,
+                      tif->tif_header.common.tiff_magic);
+        goto bad;
+    }
+    if (tif->tif_header.common.tiff_magic == TIFF_BIGENDIAN)
+    {
+#ifndef WORDS_BIGENDIAN
+        tif->tif_flags |= TIFF_SWAB;
+#endif
+    }
+    else
+    {
+#ifdef WORDS_BIGENDIAN
+        tif->tif_flags |= TIFF_SWAB;
+#endif
+    }
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabShort(&tif->tif_header.common.tiff_version);
+    if ((tif->tif_header.common.tiff_version != TIFF_VERSION_CLASSIC) &&
+        (tif->tif_header.common.tiff_version != TIFF_VERSION_BIG))
+    {
+        TIFFErrorExtR(tif, name,
+                      "Not a TIFF file, bad version number %" PRIu16
+                      " (0x%" PRIx16 ")",
+                      tif->tif_header.common.tiff_version,
+                      tif->tif_header.common.tiff_version);
+        goto bad;
+    }
+    if (tif->tif_header.common.tiff_version == TIFF_VERSION_CLASSIC)
+    {
+        if (tif->tif_flags & TIFF_SWAB)
+            TIFFSwabLong(&tif->tif_header.classic.tiff_diroff);
+        tif->tif_header_size = sizeof(TIFFHeaderClassic);
+    }
+    else
+    {
+        if (!ReadOK(tif,
+                    ((uint8_t *)(&tif->tif_header) + sizeof(TIFFHeaderClassic)),
+                    (sizeof(TIFFHeaderBig) - sizeof(TIFFHeaderClassic))))
+        {
+            TIFFErrorExtR(tif, name, "Cannot read TIFF header");
+            goto bad;
+        }
+        if (tif->tif_flags & TIFF_SWAB)
+        {
+            TIFFSwabShort(&tif->tif_header.big.tiff_offsetsize);
+            TIFFSwabLong8(&tif->tif_header.big.tiff_diroff);
+        }
+        if (tif->tif_header.big.tiff_offsetsize != 8)
+        {
+            TIFFErrorExtR(tif, name,
+                          "Not a TIFF file, bad BigTIFF offsetsize %" PRIu16
+                          " (0x%" PRIx16 ")",
+                          tif->tif_header.big.tiff_offsetsize,
+                          tif->tif_header.big.tiff_offsetsize);
+            goto bad;
+        }
+        if (tif->tif_header.big.tiff_unused != 0)
+        {
+            TIFFErrorExtR(tif, name,
+                          "Not a TIFF file, bad BigTIFF unused %" PRIu16
+                          " (0x%" PRIx16 ")",
+                          tif->tif_header.big.tiff_unused,
+                          tif->tif_header.big.tiff_unused);
+            goto bad;
+        }
+        tif->tif_header_size = sizeof(TIFFHeaderBig);
+        tif->tif_flags |= TIFF_BIGTIFF;
+    }
+    tif->tif_flags |= TIFF_MYBUFFER;
+    tif->tif_rawcp = tif->tif_rawdata = 0;
+    tif->tif_rawdatasize = 0;
+    tif->tif_rawdataoff = 0;
+    tif->tif_rawdataloaded = 0;
+
+    switch (mode[0])
+    {
+        case 'r':
+            if (!(tif->tif_flags & TIFF_BIGTIFF))
+                tif->tif_nextdiroff = tif->tif_header.classic.tiff_diroff;
+            else
+                tif->tif_nextdiroff = tif->tif_header.big.tiff_diroff;
+            /*
+             * Try to use a memory-mapped file if the client
+             * has not explicitly suppressed usage with the
+             * 'm' flag in the open mode (see above).
+             */
+            if (tif->tif_flags & TIFF_MAPPED)
+            {
+                toff_t n;
+                if (TIFFMapFileContents(tif, (void **)(&tif->tif_base), &n))
+                {
+                    tif->tif_size = (tmsize_t)n;
+                    assert((toff_t)tif->tif_size == n);
+                }
+                else
+                    tif->tif_flags &= ~TIFF_MAPPED;
+            }
+            /*
+             * Sometimes we do not want to read the first directory (for
+             * example, it may be broken) and want to proceed to other
+             * directories. I this case we use the TIFF_HEADERONLY flag to open
+             * file and return immediately after reading TIFF header.
+             */
+            if (tif->tif_flags & TIFF_HEADERONLY)
+                return (tif);
+
+            /*
+             * Setup initial directory.
+             */
+            if (TIFFReadDirectory(tif))
+            {
+                return (tif);
+            }
+            break;
+        case 'a':
+            /*
+             * New directories are automatically append
+             * to the end of the directory chain when they
+             * are written out (see TIFFWriteDirectory).
+             */
+            if (!TIFFDefaultDirectory(tif))
+                goto bad;
+            return (tif);
+    }
 bad:
-	tif->tif_mode = O_RDONLY;	/* XXX avoid flush */
-        TIFFCleanup(tif);
+    tif->tif_mode = O_RDONLY; /* XXX avoid flush */
+    TIFFCleanup(tif);
 bad2:
-	return ((TIFF*)0);
+    return ((TIFF *)0);
 }
 
 /*
@@ -510,233 +655,154 @@ TIFFClientOpen(
 /*
  * Return open file's name.
  */
-const char *
-TIFFFileName(TIFF* tif)
-{
-	return (tif->tif_name);
-}
+const char *TIFFFileName(TIFF *tif) { return (tif->tif_name); }
 
 /*
  * Set the file name.
  */
-const char *
-TIFFSetFileName(TIFF* tif, const char *name)
+const char *TIFFSetFileName(TIFF *tif, const char *name)
 {
-	const char* old_name = tif->tif_name;
-	tif->tif_name = (char *)name;
-	return (old_name);
+    const char *old_name = tif->tif_name;
+    tif->tif_name = (char *)name;
+    return (old_name);
 }
 
 /*
  * Return open file's I/O descriptor.
  */
-int
-TIFFFileno(TIFF* tif)
-{
-	return (tif->tif_fd);
-}
+int TIFFFileno(TIFF *tif) { return (tif->tif_fd); }
 
 /*
  * Set open file's I/O descriptor, and return previous value.
  */
-int
-TIFFSetFileno(TIFF* tif, int fd)
+int TIFFSetFileno(TIFF *tif, int fd)
 {
-        int old_fd = tif->tif_fd;
-	tif->tif_fd = fd;
-	return old_fd;
+    int old_fd = tif->tif_fd;
+    tif->tif_fd = fd;
+    return old_fd;
 }
 
 /*
  * Return open file's clientdata.
  */
-thandle_t
-TIFFClientdata(TIFF* tif)
-{
-	return (tif->tif_clientdata);
-}
+thandle_t TIFFClientdata(TIFF *tif) { return (tif->tif_clientdata); }
 
 /*
  * Set open file's clientdata, and return previous value.
  */
-thandle_t
-TIFFSetClientdata(TIFF* tif, thandle_t newvalue)
+thandle_t TIFFSetClientdata(TIFF *tif, thandle_t newvalue)
 {
-	thandle_t m = tif->tif_clientdata;
-	tif->tif_clientdata = newvalue;
-	return m;
+    thandle_t m = tif->tif_clientdata;
+    tif->tif_clientdata = newvalue;
+    return m;
 }
 
 /*
  * Return read/write mode.
  */
-int
-TIFFGetMode(TIFF* tif)
-{
-	return (tif->tif_mode);
-}
+int TIFFGetMode(TIFF *tif) { return (tif->tif_mode); }
 
 /*
  * Return read/write mode.
  */
-int
-TIFFSetMode(TIFF* tif, int mode)
+int TIFFSetMode(TIFF *tif, int mode)
 {
-	int old_mode = tif->tif_mode;
-	tif->tif_mode = mode;
-	return (old_mode);
+    int old_mode = tif->tif_mode;
+    tif->tif_mode = mode;
+    return (old_mode);
 }
 
 /*
  * Return nonzero if file is organized in
  * tiles; zero if organized as strips.
  */
-int
-TIFFIsTiled(TIFF* tif)
-{
-	return (isTiled(tif));
-}
+int TIFFIsTiled(TIFF *tif) { return (isTiled(tif)); }
 
 /*
  * Return current row being read/written.
  */
-uint32
-TIFFCurrentRow(TIFF* tif)
-{
-	return (tif->tif_row);
-}
+uint32_t TIFFCurrentRow(TIFF *tif) { return (tif->tif_row); }
 
 /*
  * Return index of the current directory.
  */
-uint16
-TIFFCurrentDirectory(TIFF* tif)
-{
-	return (tif->tif_curdir);
-}
+tdir_t TIFFCurrentDirectory(TIFF *tif) { return (tif->tif_curdir); }
 
 /*
  * Return current strip.
  */
-uint32
-TIFFCurrentStrip(TIFF* tif)
-{
-	return (tif->tif_curstrip);
-}
+uint32_t TIFFCurrentStrip(TIFF *tif) { return (tif->tif_curstrip); }
 
 /*
  * Return current tile.
  */
-uint32
-TIFFCurrentTile(TIFF* tif)
-{
-	return (tif->tif_curtile);
-}
+uint32_t TIFFCurrentTile(TIFF *tif) { return (tif->tif_curtile); }
 
 /*
  * Return nonzero if the file has byte-swapped data.
  */
-int
-TIFFIsByteSwapped(TIFF* tif)
-{
-	return ((tif->tif_flags & TIFF_SWAB) != 0);
-}
+int TIFFIsByteSwapped(TIFF *tif) { return ((tif->tif_flags & TIFF_SWAB) != 0); }
 
 /*
  * Return nonzero if the data is returned up-sampled.
  */
-int
-TIFFIsUpSampled(TIFF* tif)
-{
-	return (isUpSampled(tif));
-}
+int TIFFIsUpSampled(TIFF *tif) { return (isUpSampled(tif)); }
 
 /*
  * Return nonzero if the data is returned in MSB-to-LSB bit order.
  */
-int
-TIFFIsMSB2LSB(TIFF* tif)
-{
-	return (isFillOrder(tif, FILLORDER_MSB2LSB));
-}
+int TIFFIsMSB2LSB(TIFF *tif) { return (isFillOrder(tif, FILLORDER_MSB2LSB)); }
 
 /*
  * Return nonzero if given file was written in big-endian order.
  */
-int
-TIFFIsBigEndian(TIFF* tif)
+int TIFFIsBigEndian(TIFF *tif)
 {
-	return (tif->tif_header.common.tiff_magic == TIFF_BIGENDIAN);
+    return (tif->tif_header.common.tiff_magic == TIFF_BIGENDIAN);
 }
 
 /*
- * Return pointer to file read method.
+ * Return nonzero if given file is BigTIFF style.
  */
-TIFFReadWriteProc
-TIFFGetReadProc(TIFF* tif)
+int TIFFIsBigTIFF(TIFF *tif)
 {
-	return (tif->tif_readproc);
+    return (tif->tif_header.common.tiff_version == TIFF_VERSION_BIG);
 }
 
+/*
+ * Return pointer to file read method.
+ */
+TIFFReadWriteProc TIFFGetReadProc(TIFF *tif) { return (tif->tif_readproc); }
+
 /*
  * Return pointer to file write method.
  */
-TIFFReadWriteProc
-TIFFGetWriteProc(TIFF* tif)
-{
-	return (tif->tif_writeproc);
-}
+TIFFReadWriteProc TIFFGetWriteProc(TIFF *tif) { return (tif->tif_writeproc); }
 
 /*
  * Return pointer to file seek method.
  */
-TIFFSeekProc
-TIFFGetSeekProc(TIFF* tif)
-{
-	return (tif->tif_seekproc);
-}
+TIFFSeekProc TIFFGetSeekProc(TIFF *tif) { return (tif->tif_seekproc); }
 
 /*
  * Return pointer to file close method.
  */
-TIFFCloseProc
-TIFFGetCloseProc(TIFF* tif)
-{
-	return (tif->tif_closeproc);
-}
+TIFFCloseProc TIFFGetCloseProc(TIFF *tif) { return (tif->tif_closeproc); }
 
 /*
  * Return pointer to file size requesting method.
  */
-TIFFSizeProc
-TIFFGetSizeProc(TIFF* tif)
-{
-	return (tif->tif_sizeproc);
-}
+TIFFSizeProc TIFFGetSizeProc(TIFF *tif) { return (tif->tif_sizeproc); }
 
 /*
  * Return pointer to memory mapping method.
  */
-TIFFMapFileProc
-TIFFGetMapFileProc(TIFF* tif)
-{
-	return (tif->tif_mapproc);
-}
+TIFFMapFileProc TIFFGetMapFileProc(TIFF *tif) { return (tif->tif_mapproc); }
 
 /*
  * Return pointer to memory unmapping method.
  */
-TIFFUnmapFileProc
-TIFFGetUnmapFileProc(TIFF* tif)
+TIFFUnmapFileProc TIFFGetUnmapFileProc(TIFF *tif)
 {
-	return (tif->tif_unmapproc);
+    return (tif->tif_unmapproc);
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_packbits.c b/3rdparty/libtiff/tif_packbits.c
index a8f29e8757d5..62849f8f3c16 100644
--- a/3rdparty/libtiff/tif_packbits.c
+++ b/3rdparty/libtiff/tif_packbits.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -31,157 +31,178 @@
  */
 #include <stdio.h>
 
-static int
-PackBitsPreEncode(TIFF* tif, uint16 s)
+static int PackBitsPreEncode(TIFF *tif, uint16_t s)
 {
-	(void) s;
+    (void)s;
 
-        tif->tif_data = (uint8*)_TIFFmalloc(sizeof(tmsize_t));
-	if (tif->tif_data == NULL)
-		return (0);
-	/*
-	 * Calculate the scanline/tile-width size in bytes.
-	 */
-	if (isTiled(tif))
-		*(tmsize_t*)tif->tif_data = TIFFTileRowSize(tif);
-	else
-		*(tmsize_t*)tif->tif_data = TIFFScanlineSize(tif);
-	return (1);
+    tif->tif_data = (uint8_t *)_TIFFmallocExt(tif, sizeof(tmsize_t));
+    if (tif->tif_data == NULL)
+        return (0);
+    /*
+     * Calculate the scanline/tile-width size in bytes.
+     */
+    if (isTiled(tif))
+        *(tmsize_t *)tif->tif_data = TIFFTileRowSize(tif);
+    else
+        *(tmsize_t *)tif->tif_data = TIFFScanlineSize(tif);
+    return (1);
 }
 
-static int
-PackBitsPostEncode(TIFF* tif)
+static int PackBitsPostEncode(TIFF *tif)
 {
-        if (tif->tif_data)
-            _TIFFfree(tif->tif_data);
-	return (1);
+    if (tif->tif_data)
+        _TIFFfreeExt(tif, tif->tif_data);
+    return (1);
 }
 
 /*
  * Encode a run of pixels.
  */
-static int
-PackBitsEncode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s)
+static int PackBitsEncode(TIFF *tif, uint8_t *buf, tmsize_t cc, uint16_t s)
 {
-	unsigned char* bp = (unsigned char*) buf;
-	uint8* op;
-	uint8* ep;
-	uint8* lastliteral;
-	long n, slop;
-	int b;
-	enum { BASE, LITERAL, RUN, LITERAL_RUN } state;
+    unsigned char *bp = (unsigned char *)buf;
+    uint8_t *op;
+    uint8_t *ep;
+    uint8_t *lastliteral;
+    long n, slop;
+    int b;
+    enum
+    {
+        BASE,
+        LITERAL,
+        RUN,
+        LITERAL_RUN
+    } state;
 
-	(void) s;
-	op = tif->tif_rawcp;
-	ep = tif->tif_rawdata + tif->tif_rawdatasize;
-	state = BASE;
-	lastliteral = 0;
-	while (cc > 0) {
-		/*
-		 * Find the longest string of identical bytes.
-		 */
-		b = *bp++;
-		cc--;
-		n = 1;
-		for (; cc > 0 && b == *bp; cc--, bp++)
-			n++;
-	again:
-		if (op + 2 >= ep) {		/* insure space for new data */
-			/*
-			 * Be careful about writing the last
-			 * literal.  Must write up to that point
-			 * and then copy the remainder to the
-			 * front of the buffer.
-			 */
-			if (state == LITERAL || state == LITERAL_RUN) {
-				slop = (long)(op - lastliteral);
-				tif->tif_rawcc += (tmsize_t)(lastliteral - tif->tif_rawcp);
-				if (!TIFFFlushData1(tif))
-					return (0);
-				op = tif->tif_rawcp;
-				while (slop-- > 0)
-					*op++ = *lastliteral++;
-				lastliteral = tif->tif_rawcp;
-			} else {
-				tif->tif_rawcc += (tmsize_t)(op - tif->tif_rawcp);
-				if (!TIFFFlushData1(tif))
-					return (0);
-				op = tif->tif_rawcp;
-			}
-		}
-		switch (state) {
-		case BASE:		/* initial state, set run/literal */
-			if (n > 1) {
-				state = RUN;
-				if (n > 128) {
-					*op++ = (uint8) -127;
-					*op++ = (uint8) b;
-					n -= 128;
-					goto again;
-				}
-				*op++ = (uint8)(-(n-1));
-				*op++ = (uint8) b;
-			} else {
-				lastliteral = op;
-				*op++ = 0;
-				*op++ = (uint8) b;
-				state = LITERAL;
-			}
-			break;
-		case LITERAL:		/* last object was literal string */
-			if (n > 1) {
-				state = LITERAL_RUN;
-				if (n > 128) {
-					*op++ = (uint8) -127;
-					*op++ = (uint8) b;
-					n -= 128;
-					goto again;
-				}
-				*op++ = (uint8)(-(n-1));	/* encode run */
-				*op++ = (uint8) b;
-			} else {			/* extend literal */
-				if (++(*lastliteral) == 127)
-					state = BASE;
-				*op++ = (uint8) b;
-			}
-			break;
-		case RUN:		/* last object was run */
-			if (n > 1) {
-				if (n > 128) {
-					*op++ = (uint8) -127;
-					*op++ = (uint8) b;
-					n -= 128;
-					goto again;
-				}
-				*op++ = (uint8)(-(n-1));
-				*op++ = (uint8) b;
-			} else {
-				lastliteral = op;
-				*op++ = 0;
-				*op++ = (uint8) b;
-				state = LITERAL;
-			}
-			break;
-		case LITERAL_RUN:	/* literal followed by a run */
-			/*
-			 * Check to see if previous run should
-			 * be converted to a literal, in which
-			 * case we convert literal-run-literal
-			 * to a single literal.
-			 */
-			if (n == 1 && op[-2] == (uint8) -1 &&
-			    *lastliteral < 126) {
-				state = (((*lastliteral) += 2) == 127 ?
-				    BASE : LITERAL);
-				op[-2] = op[-1];	/* replicate */
-			} else
-				state = RUN;
-			goto again;
-		}
-	}
-	tif->tif_rawcc += (tmsize_t)(op - tif->tif_rawcp);
-	tif->tif_rawcp = op;
-	return (1);
+    (void)s;
+    op = tif->tif_rawcp;
+    ep = tif->tif_rawdata + tif->tif_rawdatasize;
+    state = BASE;
+    lastliteral = 0;
+    while (cc > 0)
+    {
+        /*
+         * Find the longest string of identical bytes.
+         */
+        b = *bp++;
+        cc--;
+        n = 1;
+        for (; cc > 0 && b == *bp; cc--, bp++)
+            n++;
+    again:
+        if (op + 2 >= ep)
+        { /* insure space for new data */
+            /*
+             * Be careful about writing the last
+             * literal.  Must write up to that point
+             * and then copy the remainder to the
+             * front of the buffer.
+             */
+            if (state == LITERAL || state == LITERAL_RUN)
+            {
+                slop = (long)(op - lastliteral);
+                tif->tif_rawcc += (tmsize_t)(lastliteral - tif->tif_rawcp);
+                if (!TIFFFlushData1(tif))
+                    return (0);
+                op = tif->tif_rawcp;
+                while (slop-- > 0)
+                    *op++ = *lastliteral++;
+                lastliteral = tif->tif_rawcp;
+            }
+            else
+            {
+                tif->tif_rawcc += (tmsize_t)(op - tif->tif_rawcp);
+                if (!TIFFFlushData1(tif))
+                    return (0);
+                op = tif->tif_rawcp;
+            }
+        }
+        switch (state)
+        {
+            case BASE: /* initial state, set run/literal */
+                if (n > 1)
+                {
+                    state = RUN;
+                    if (n > 128)
+                    {
+                        *op++ = (uint8_t)-127;
+                        *op++ = (uint8_t)b;
+                        n -= 128;
+                        goto again;
+                    }
+                    *op++ = (uint8_t)(-(n - 1));
+                    *op++ = (uint8_t)b;
+                }
+                else
+                {
+                    lastliteral = op;
+                    *op++ = 0;
+                    *op++ = (uint8_t)b;
+                    state = LITERAL;
+                }
+                break;
+            case LITERAL: /* last object was literal string */
+                if (n > 1)
+                {
+                    state = LITERAL_RUN;
+                    if (n > 128)
+                    {
+                        *op++ = (uint8_t)-127;
+                        *op++ = (uint8_t)b;
+                        n -= 128;
+                        goto again;
+                    }
+                    *op++ = (uint8_t)(-(n - 1)); /* encode run */
+                    *op++ = (uint8_t)b;
+                }
+                else
+                { /* extend literal */
+                    if (++(*lastliteral) == 127)
+                        state = BASE;
+                    *op++ = (uint8_t)b;
+                }
+                break;
+            case RUN: /* last object was run */
+                if (n > 1)
+                {
+                    if (n > 128)
+                    {
+                        *op++ = (uint8_t)-127;
+                        *op++ = (uint8_t)b;
+                        n -= 128;
+                        goto again;
+                    }
+                    *op++ = (uint8_t)(-(n - 1));
+                    *op++ = (uint8_t)b;
+                }
+                else
+                {
+                    lastliteral = op;
+                    *op++ = 0;
+                    *op++ = (uint8_t)b;
+                    state = LITERAL;
+                }
+                break;
+            case LITERAL_RUN: /* literal followed by a run */
+                /*
+                 * Check to see if previous run should
+                 * be converted to a literal, in which
+                 * case we convert literal-run-literal
+                 * to a single literal.
+                 */
+                if (n == 1 && op[-2] == (uint8_t)-1 && *lastliteral < 126)
+                {
+                    state = (((*lastliteral) += 2) == 127 ? BASE : LITERAL);
+                    op[-2] = op[-1]; /* replicate */
+                }
+                else
+                    state = RUN;
+                goto again;
+        }
+    }
+    tif->tif_rawcc += (tmsize_t)(op - tif->tif_rawcp);
+    tif->tif_rawcp = op;
+    return (1);
 }
 
 /*
@@ -191,119 +212,112 @@ PackBitsEncode(TIFF* tif, uint8* buf, tmsize_t cc, uint16 s)
  * the decoder if data is read, for example, by scanlines
  * when it was encoded by strips.
  */
-static int
-PackBitsEncodeChunk(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int PackBitsEncodeChunk(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	tmsize_t rowsize = *(tmsize_t*)tif->tif_data;
+    tmsize_t rowsize = *(tmsize_t *)tif->tif_data;
+
+    while (cc > 0)
+    {
+        tmsize_t chunk = rowsize;
 
-	while (cc > 0) {
-		tmsize_t chunk = rowsize;
-		
-		if( cc < chunk )
-		    chunk = cc;
+        if (cc < chunk)
+            chunk = cc;
 
-		if (PackBitsEncode(tif, bp, chunk, s) < 0)
-		    return (-1);
-		bp += chunk;
-		cc -= chunk;
-	}
-	return (1);
+        if (PackBitsEncode(tif, bp, chunk, s) < 0)
+            return (-1);
+        bp += chunk;
+        cc -= chunk;
+    }
+    return (1);
 }
 
-static int
-PackBitsDecode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s)
+static int PackBitsDecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s)
 {
-	static const char module[] = "PackBitsDecode";
-	char *bp;
-	tmsize_t cc;
-	long n;
-	int b;
+    static const char module[] = "PackBitsDecode";
+    int8_t *bp;
+    tmsize_t cc;
+    long n;
+    int b;
 
-	(void) s;
-	bp = (char*) tif->tif_rawcp;
-	cc = tif->tif_rawcc;
-	while (cc > 0 && occ > 0) {
-		n = (long) *bp++;
-		cc--;
-		/*
-		 * Watch out for compilers that
-		 * don't sign extend chars...
-		 */
-		if (n >= 128)
-			n -= 256;
-		if (n < 0) {		/* replicate next byte -n+1 times */
-			if (n == -128)	/* nop */
-				continue;
-			n = -n + 1;
-			if( occ < (tmsize_t)n )
-			{
-				TIFFWarningExt(tif->tif_clientdata, module,
-				    "Discarding %lu bytes to avoid buffer overrun",
-				    (unsigned long) ((tmsize_t)n - occ));
-				n = (long)occ;
-			}
-			if( cc == 0 )
-			{
-				TIFFWarningExt(tif->tif_clientdata, module,
-					       "Terminating PackBitsDecode due to lack of data.");
-				break;
-			}
-			occ -= n;
-			b = *bp++;
-			cc--;
-			while (n-- > 0)
-				*op++ = (uint8) b;
-		} else {		/* copy next n+1 bytes literally */
-			if (occ < (tmsize_t)(n + 1))
-			{
-				TIFFWarningExt(tif->tif_clientdata, module,
-				    "Discarding %lu bytes to avoid buffer overrun",
-				    (unsigned long) ((tmsize_t)n - occ + 1));
-				n = (long)occ - 1;
-			}
-			if (cc < (tmsize_t) (n+1)) 
-			{
-				TIFFWarningExt(tif->tif_clientdata, module,
-					       "Terminating PackBitsDecode due to lack of data.");
-				break;
-			}
-			_TIFFmemcpy(op, bp, ++n);
-			op += n; occ -= n;
-			bp += n; cc -= n;
-		}
-	}
-	tif->tif_rawcp = (uint8*) bp;
-	tif->tif_rawcc = cc;
-	if (occ > 0) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Not enough data for scanline %lu",
-		    (unsigned long) tif->tif_row);
-		return (0);
-	}
-	return (1);
+    (void)s;
+    bp = (int8_t *)tif->tif_rawcp;
+    cc = tif->tif_rawcc;
+    while (cc > 0 && occ > 0)
+    {
+        n = (long)*bp++;
+        cc--;
+        if (n < 0)
+        {                  /* replicate next byte -n+1 times */
+            if (n == -128) /* nop */
+                continue;
+            n = -n + 1;
+            if (occ < (tmsize_t)n)
+            {
+                TIFFWarningExtR(tif, module,
+                                "Discarding %" TIFF_SSIZE_FORMAT
+                                " bytes to avoid buffer overrun",
+                                (tmsize_t)n - occ);
+                n = (long)occ;
+            }
+            if (cc == 0)
+            {
+                TIFFWarningExtR(
+                    tif, module,
+                    "Terminating PackBitsDecode due to lack of data.");
+                break;
+            }
+            occ -= n;
+            b = *bp++;
+            cc--;
+            while (n-- > 0)
+                *op++ = (uint8_t)b;
+        }
+        else
+        { /* copy next n+1 bytes literally */
+            if (occ < (tmsize_t)(n + 1))
+            {
+                TIFFWarningExtR(tif, module,
+                                "Discarding %" TIFF_SSIZE_FORMAT
+                                " bytes to avoid buffer overrun",
+                                (tmsize_t)n - occ + 1);
+                n = (long)occ - 1;
+            }
+            if (cc < (tmsize_t)(n + 1))
+            {
+                TIFFWarningExtR(
+                    tif, module,
+                    "Terminating PackBitsDecode due to lack of data.");
+                break;
+            }
+            _TIFFmemcpy(op, bp, ++n);
+            op += n;
+            occ -= n;
+            bp += n;
+            cc -= n;
+        }
+    }
+    tif->tif_rawcp = (uint8_t *)bp;
+    tif->tif_rawcc = cc;
+    if (occ > 0)
+    {
+        TIFFErrorExtR(tif, module, "Not enough data for scanline %" PRIu32,
+                      tif->tif_row);
+        return (0);
+    }
+    return (1);
 }
 
-int
-TIFFInitPackBits(TIFF* tif, int scheme)
+int TIFFInitPackBits(TIFF *tif, int scheme)
 {
-	(void) scheme;
-	tif->tif_decoderow = PackBitsDecode;
-	tif->tif_decodestrip = PackBitsDecode;
-	tif->tif_decodetile = PackBitsDecode;
-	tif->tif_preencode = PackBitsPreEncode;
-	tif->tif_postencode = PackBitsPostEncode;
-	tif->tif_encoderow = PackBitsEncode;
-	tif->tif_encodestrip = PackBitsEncodeChunk;
-	tif->tif_encodetile = PackBitsEncodeChunk;
-	return (1);
+    (void)scheme;
+    tif->tif_decoderow = PackBitsDecode;
+    tif->tif_decodestrip = PackBitsDecode;
+    tif->tif_decodetile = PackBitsDecode;
+    tif->tif_preencode = PackBitsPreEncode;
+    tif->tif_postencode = PackBitsPostEncode;
+    tif->tif_encoderow = PackBitsEncode;
+    tif->tif_encodestrip = PackBitsEncodeChunk;
+    tif->tif_encodetile = PackBitsEncodeChunk;
+    return (1);
 }
 #endif /* PACKBITS_SUPPORT */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_pixarlog.c b/3rdparty/libtiff/tif_pixarlog.c
index f291201505d3..5c0346b6eca8 100644
--- a/3rdparty/libtiff/tif_pixarlog.c
+++ b/3rdparty/libtiff/tif_pixarlog.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1996-1997 Sam Leffler
  * Copyright (c) 1996 Pixar
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Pixar, Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Pixar, Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL PIXAR, SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -32,10 +32,10 @@
  * Contributed by Dan McCoy.
  *
  * PixarLog film support uses the TIFF library to store companded
- * 11 bit values into a tiff file, which are compressed using the 
- * zip compressor.  
+ * 11 bit values into a tiff file, which are compressed using the
+ * zip compressor.
  *
- * The codec can take as input and produce as output 32-bit IEEE float values 
+ * The codec can take as input and produce as output 32-bit IEEE float values
  * as well as 16-bit or 8-bit unsigned integer values.
  *
  * On writing any of the above are converted into the internal
@@ -49,7 +49,7 @@
  * than the human eye can perceive with extra room to allow for
  * error introduced by further image computation.  As with any quantized
  * color format, it is possible to perform image calculations which
- * expose the quantization error. This format should certainly be less 
+ * expose the quantization error. This format should certainly be less
  * susceptible to such errors than standard 8-bit encodings, but more
  * susceptible than straight 16-bit or 32-bit encodings.
  *
@@ -90,363 +90,429 @@
 #include "tif_predict.h"
 #include "zlib.h"
 
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <math.h>
 
 /* Tables for converting to/from 11 bit coded values */
 
-#define  TSIZE	 2048		/* decode table size (11-bit tokens) */
-#define  TSIZEP1 2049		/* Plus one for slop */
-#define  ONE	 1250		/* token value of 1.0 exactly */
-#define  RATIO	 1.004		/* nominal ratio for log part */
-
-#define CODE_MASK 0x7ff         /* 11 bits. */
-
-static float  Fltsize;
-static float  LogK1, LogK2;
-
-#define REPEAT(n, op)   { int i; i=n; do { i--; op; } while (i>0); }
+#define TSIZE 2048   /* decode table size (11-bit tokens) */
+#define TSIZEP1 2049 /* Plus one for slop */
+#define ONE 1250     /* token value of 1.0 exactly */
+#define RATIO 1.004  /* nominal ratio for log part */
+
+#define CODE_MASK 0x7ff /* 11 bits. */
+
+static float Fltsize;
+static float LogK1, LogK2;
+
+#define REPEAT(n, op)                                                          \
+    {                                                                          \
+        int i;                                                                 \
+        i = n;                                                                 \
+        do                                                                     \
+        {                                                                      \
+            i--;                                                               \
+            op;                                                                \
+        } while (i > 0);                                                       \
+    }
 
-static void
-horizontalAccumulateF(uint16 *wp, int n, int stride, float *op,
-	float *ToLinearF)
+static void horizontalAccumulateF(uint16_t *wp, int n, int stride, float *op,
+                                  float *ToLinearF)
 {
-    register unsigned int  cr, cg, cb, ca, mask;
-    register float  t0, t1, t2, t3;
-
-    if (n >= stride) {
-	mask = CODE_MASK;
-	if (stride == 3) {
-	    t0 = ToLinearF[cr = (wp[0] & mask)];
-	    t1 = ToLinearF[cg = (wp[1] & mask)];
-	    t2 = ToLinearF[cb = (wp[2] & mask)];
-	    op[0] = t0;
-	    op[1] = t1;
-	    op[2] = t2;
-	    n -= 3;
-	    while (n > 0) {
-		wp += 3;
-		op += 3;
-		n -= 3;
-		t0 = ToLinearF[(cr += wp[0]) & mask];
-		t1 = ToLinearF[(cg += wp[1]) & mask];
-		t2 = ToLinearF[(cb += wp[2]) & mask];
-		op[0] = t0;
-		op[1] = t1;
-		op[2] = t2;
-	    }
-	} else if (stride == 4) {
-	    t0 = ToLinearF[cr = (wp[0] & mask)];
-	    t1 = ToLinearF[cg = (wp[1] & mask)];
-	    t2 = ToLinearF[cb = (wp[2] & mask)];
-	    t3 = ToLinearF[ca = (wp[3] & mask)];
-	    op[0] = t0;
-	    op[1] = t1;
-	    op[2] = t2;
-	    op[3] = t3;
-	    n -= 4;
-	    while (n > 0) {
-		wp += 4;
-		op += 4;
-		n -= 4;
-		t0 = ToLinearF[(cr += wp[0]) & mask];
-		t1 = ToLinearF[(cg += wp[1]) & mask];
-		t2 = ToLinearF[(cb += wp[2]) & mask];
-		t3 = ToLinearF[(ca += wp[3]) & mask];
-		op[0] = t0;
-		op[1] = t1;
-		op[2] = t2;
-		op[3] = t3;
-	    }
-	} else {
-	    REPEAT(stride, *op = ToLinearF[*wp&mask]; wp++; op++)
-	    n -= stride;
-	    while (n > 0) {
-		REPEAT(stride,
-		    wp[stride] += *wp; *op = ToLinearF[*wp&mask]; wp++; op++)
-		n -= stride;
-	    }
-	}
+    register unsigned int cr, cg, cb, ca, mask;
+    register float t0, t1, t2, t3;
+
+    if (n >= stride)
+    {
+        mask = CODE_MASK;
+        if (stride == 3)
+        {
+            t0 = ToLinearF[cr = (wp[0] & mask)];
+            t1 = ToLinearF[cg = (wp[1] & mask)];
+            t2 = ToLinearF[cb = (wp[2] & mask)];
+            op[0] = t0;
+            op[1] = t1;
+            op[2] = t2;
+            n -= 3;
+            while (n > 0)
+            {
+                wp += 3;
+                op += 3;
+                n -= 3;
+                t0 = ToLinearF[(cr += wp[0]) & mask];
+                t1 = ToLinearF[(cg += wp[1]) & mask];
+                t2 = ToLinearF[(cb += wp[2]) & mask];
+                op[0] = t0;
+                op[1] = t1;
+                op[2] = t2;
+            }
+        }
+        else if (stride == 4)
+        {
+            t0 = ToLinearF[cr = (wp[0] & mask)];
+            t1 = ToLinearF[cg = (wp[1] & mask)];
+            t2 = ToLinearF[cb = (wp[2] & mask)];
+            t3 = ToLinearF[ca = (wp[3] & mask)];
+            op[0] = t0;
+            op[1] = t1;
+            op[2] = t2;
+            op[3] = t3;
+            n -= 4;
+            while (n > 0)
+            {
+                wp += 4;
+                op += 4;
+                n -= 4;
+                t0 = ToLinearF[(cr += wp[0]) & mask];
+                t1 = ToLinearF[(cg += wp[1]) & mask];
+                t2 = ToLinearF[(cb += wp[2]) & mask];
+                t3 = ToLinearF[(ca += wp[3]) & mask];
+                op[0] = t0;
+                op[1] = t1;
+                op[2] = t2;
+                op[3] = t3;
+            }
+        }
+        else
+        {
+            REPEAT(stride, *op = ToLinearF[*wp & mask]; wp++; op++)
+            n -= stride;
+            while (n > 0)
+            {
+                REPEAT(stride, wp[stride] += *wp; *op = ToLinearF[*wp & mask];
+                       wp++; op++)
+                n -= stride;
+            }
+        }
     }
 }
 
-static void
-horizontalAccumulate12(uint16 *wp, int n, int stride, int16 *op,
-	float *ToLinearF)
+static void horizontalAccumulate12(uint16_t *wp, int n, int stride, int16_t *op,
+                                   float *ToLinearF)
 {
-    register unsigned int  cr, cg, cb, ca, mask;
-    register float  t0, t1, t2, t3;
+    register unsigned int cr, cg, cb, ca, mask;
+    register float t0, t1, t2, t3;
 
 #define SCALE12 2048.0F
-#define CLAMP12(t) (((t) < 3071) ? (uint16) (t) : 3071)
-
-    if (n >= stride) {
-	mask = CODE_MASK;
-	if (stride == 3) {
-	    t0 = ToLinearF[cr = (wp[0] & mask)] * SCALE12;
-	    t1 = ToLinearF[cg = (wp[1] & mask)] * SCALE12;
-	    t2 = ToLinearF[cb = (wp[2] & mask)] * SCALE12;
-	    op[0] = CLAMP12(t0);
-	    op[1] = CLAMP12(t1);
-	    op[2] = CLAMP12(t2);
-	    n -= 3;
-	    while (n > 0) {
-		wp += 3;
-		op += 3;
-		n -= 3;
-		t0 = ToLinearF[(cr += wp[0]) & mask] * SCALE12;
-		t1 = ToLinearF[(cg += wp[1]) & mask] * SCALE12;
-		t2 = ToLinearF[(cb += wp[2]) & mask] * SCALE12;
-		op[0] = CLAMP12(t0);
-		op[1] = CLAMP12(t1);
-		op[2] = CLAMP12(t2);
-	    }
-	} else if (stride == 4) {
-	    t0 = ToLinearF[cr = (wp[0] & mask)] * SCALE12;
-	    t1 = ToLinearF[cg = (wp[1] & mask)] * SCALE12;
-	    t2 = ToLinearF[cb = (wp[2] & mask)] * SCALE12;
-	    t3 = ToLinearF[ca = (wp[3] & mask)] * SCALE12;
-	    op[0] = CLAMP12(t0);
-	    op[1] = CLAMP12(t1);
-	    op[2] = CLAMP12(t2);
-	    op[3] = CLAMP12(t3);
-	    n -= 4;
-	    while (n > 0) {
-		wp += 4;
-		op += 4;
-		n -= 4;
-		t0 = ToLinearF[(cr += wp[0]) & mask] * SCALE12;
-		t1 = ToLinearF[(cg += wp[1]) & mask] * SCALE12;
-		t2 = ToLinearF[(cb += wp[2]) & mask] * SCALE12;
-		t3 = ToLinearF[(ca += wp[3]) & mask] * SCALE12;
-		op[0] = CLAMP12(t0);
-		op[1] = CLAMP12(t1);
-		op[2] = CLAMP12(t2);
-		op[3] = CLAMP12(t3);
-	    }
-	} else {
-	    REPEAT(stride, t0 = ToLinearF[*wp&mask] * SCALE12;
-                           *op = CLAMP12(t0); wp++; op++)
-	    n -= stride;
-	    while (n > 0) {
-		REPEAT(stride,
-		    wp[stride] += *wp; t0 = ToLinearF[wp[stride]&mask]*SCALE12;
-		    *op = CLAMP12(t0);  wp++; op++)
-		n -= stride;
-	    }
-	}
+#define CLAMP12(t) (((t) < 3071) ? (uint16_t)(t) : 3071)
+
+    if (n >= stride)
+    {
+        mask = CODE_MASK;
+        if (stride == 3)
+        {
+            t0 = ToLinearF[cr = (wp[0] & mask)] * SCALE12;
+            t1 = ToLinearF[cg = (wp[1] & mask)] * SCALE12;
+            t2 = ToLinearF[cb = (wp[2] & mask)] * SCALE12;
+            op[0] = CLAMP12(t0);
+            op[1] = CLAMP12(t1);
+            op[2] = CLAMP12(t2);
+            n -= 3;
+            while (n > 0)
+            {
+                wp += 3;
+                op += 3;
+                n -= 3;
+                t0 = ToLinearF[(cr += wp[0]) & mask] * SCALE12;
+                t1 = ToLinearF[(cg += wp[1]) & mask] * SCALE12;
+                t2 = ToLinearF[(cb += wp[2]) & mask] * SCALE12;
+                op[0] = CLAMP12(t0);
+                op[1] = CLAMP12(t1);
+                op[2] = CLAMP12(t2);
+            }
+        }
+        else if (stride == 4)
+        {
+            t0 = ToLinearF[cr = (wp[0] & mask)] * SCALE12;
+            t1 = ToLinearF[cg = (wp[1] & mask)] * SCALE12;
+            t2 = ToLinearF[cb = (wp[2] & mask)] * SCALE12;
+            t3 = ToLinearF[ca = (wp[3] & mask)] * SCALE12;
+            op[0] = CLAMP12(t0);
+            op[1] = CLAMP12(t1);
+            op[2] = CLAMP12(t2);
+            op[3] = CLAMP12(t3);
+            n -= 4;
+            while (n > 0)
+            {
+                wp += 4;
+                op += 4;
+                n -= 4;
+                t0 = ToLinearF[(cr += wp[0]) & mask] * SCALE12;
+                t1 = ToLinearF[(cg += wp[1]) & mask] * SCALE12;
+                t2 = ToLinearF[(cb += wp[2]) & mask] * SCALE12;
+                t3 = ToLinearF[(ca += wp[3]) & mask] * SCALE12;
+                op[0] = CLAMP12(t0);
+                op[1] = CLAMP12(t1);
+                op[2] = CLAMP12(t2);
+                op[3] = CLAMP12(t3);
+            }
+        }
+        else
+        {
+            REPEAT(stride, t0 = ToLinearF[*wp & mask] * SCALE12;
+                   *op = CLAMP12(t0); wp++; op++)
+            n -= stride;
+            while (n > 0)
+            {
+                REPEAT(stride, wp[stride] += *wp;
+                       t0 = ToLinearF[wp[stride] & mask] * SCALE12;
+                       *op = CLAMP12(t0); wp++; op++)
+                n -= stride;
+            }
+        }
     }
 }
 
-static void
-horizontalAccumulate16(uint16 *wp, int n, int stride, uint16 *op,
-	uint16 *ToLinear16)
+static void horizontalAccumulate16(uint16_t *wp, int n, int stride,
+                                   uint16_t *op, uint16_t *ToLinear16)
 {
-    register unsigned int  cr, cg, cb, ca, mask;
-
-    if (n >= stride) {
-	mask = CODE_MASK;
-	if (stride == 3) {
-	    op[0] = ToLinear16[cr = (wp[0] & mask)];
-	    op[1] = ToLinear16[cg = (wp[1] & mask)];
-	    op[2] = ToLinear16[cb = (wp[2] & mask)];
-	    n -= 3;
-	    while (n > 0) {
-		wp += 3;
-		op += 3;
-		n -= 3;
-		op[0] = ToLinear16[(cr += wp[0]) & mask];
-		op[1] = ToLinear16[(cg += wp[1]) & mask];
-		op[2] = ToLinear16[(cb += wp[2]) & mask];
-	    }
-	} else if (stride == 4) {
-	    op[0] = ToLinear16[cr = (wp[0] & mask)];
-	    op[1] = ToLinear16[cg = (wp[1] & mask)];
-	    op[2] = ToLinear16[cb = (wp[2] & mask)];
-	    op[3] = ToLinear16[ca = (wp[3] & mask)];
-	    n -= 4;
-	    while (n > 0) {
-		wp += 4;
-		op += 4;
-		n -= 4;
-		op[0] = ToLinear16[(cr += wp[0]) & mask];
-		op[1] = ToLinear16[(cg += wp[1]) & mask];
-		op[2] = ToLinear16[(cb += wp[2]) & mask];
-		op[3] = ToLinear16[(ca += wp[3]) & mask];
-	    }
-	} else {
-	    REPEAT(stride, *op = ToLinear16[*wp&mask]; wp++; op++)
-	    n -= stride;
-	    while (n > 0) {
-		REPEAT(stride,
-		    wp[stride] += *wp; *op = ToLinear16[*wp&mask]; wp++; op++)
-		n -= stride;
-	    }
-	}
+    register unsigned int cr, cg, cb, ca, mask;
+
+    if (n >= stride)
+    {
+        mask = CODE_MASK;
+        if (stride == 3)
+        {
+            op[0] = ToLinear16[cr = (wp[0] & mask)];
+            op[1] = ToLinear16[cg = (wp[1] & mask)];
+            op[2] = ToLinear16[cb = (wp[2] & mask)];
+            n -= 3;
+            while (n > 0)
+            {
+                wp += 3;
+                op += 3;
+                n -= 3;
+                op[0] = ToLinear16[(cr += wp[0]) & mask];
+                op[1] = ToLinear16[(cg += wp[1]) & mask];
+                op[2] = ToLinear16[(cb += wp[2]) & mask];
+            }
+        }
+        else if (stride == 4)
+        {
+            op[0] = ToLinear16[cr = (wp[0] & mask)];
+            op[1] = ToLinear16[cg = (wp[1] & mask)];
+            op[2] = ToLinear16[cb = (wp[2] & mask)];
+            op[3] = ToLinear16[ca = (wp[3] & mask)];
+            n -= 4;
+            while (n > 0)
+            {
+                wp += 4;
+                op += 4;
+                n -= 4;
+                op[0] = ToLinear16[(cr += wp[0]) & mask];
+                op[1] = ToLinear16[(cg += wp[1]) & mask];
+                op[2] = ToLinear16[(cb += wp[2]) & mask];
+                op[3] = ToLinear16[(ca += wp[3]) & mask];
+            }
+        }
+        else
+        {
+            REPEAT(stride, *op = ToLinear16[*wp & mask]; wp++; op++)
+            n -= stride;
+            while (n > 0)
+            {
+                REPEAT(stride, wp[stride] += *wp; *op = ToLinear16[*wp & mask];
+                       wp++; op++)
+                n -= stride;
+            }
+        }
     }
 }
 
-/* 
+/*
  * Returns the log encoded 11-bit values with the horizontal
  * differencing undone.
  */
-static void
-horizontalAccumulate11(uint16 *wp, int n, int stride, uint16 *op)
+static void horizontalAccumulate11(uint16_t *wp, int n, int stride,
+                                   uint16_t *op)
 {
     register unsigned int cr, cg, cb, ca, mask;
 
-    if (n >= stride) {
-	mask = CODE_MASK;
-	if (stride == 3) {
-	    op[0] = wp[0];  op[1] = wp[1];  op[2] = wp[2];
-            cr = wp[0];  cg = wp[1];  cb = wp[2];
-	    n -= 3;
-	    while (n > 0) {
-		wp += 3;
-		op += 3;
-		n -= 3;
-		op[0] = (uint16)((cr += wp[0]) & mask);
-		op[1] = (uint16)((cg += wp[1]) & mask);
-		op[2] = (uint16)((cb += wp[2]) & mask);
-	    }
-	} else if (stride == 4) {
-	    op[0] = wp[0];  op[1] = wp[1];
-	    op[2] = wp[2];  op[3] = wp[3];
-            cr = wp[0]; cg = wp[1]; cb = wp[2]; ca = wp[3];
-	    n -= 4;
-	    while (n > 0) {
-		wp += 4;
-		op += 4;
-		n -= 4;
-		op[0] = (uint16)((cr += wp[0]) & mask);
-		op[1] = (uint16)((cg += wp[1]) & mask);
-		op[2] = (uint16)((cb += wp[2]) & mask);
-		op[3] = (uint16)((ca += wp[3]) & mask);
-	    } 
-	} else {
-	    REPEAT(stride, *op = *wp&mask; wp++; op++)
-	    n -= stride;
-	    while (n > 0) {
-		REPEAT(stride,
-		    wp[stride] += *wp; *op = *wp&mask; wp++; op++)
-		n -= stride;
-	    }
-	}
+    if (n >= stride)
+    {
+        mask = CODE_MASK;
+        if (stride == 3)
+        {
+            op[0] = wp[0];
+            op[1] = wp[1];
+            op[2] = wp[2];
+            cr = wp[0];
+            cg = wp[1];
+            cb = wp[2];
+            n -= 3;
+            while (n > 0)
+            {
+                wp += 3;
+                op += 3;
+                n -= 3;
+                op[0] = (uint16_t)((cr += wp[0]) & mask);
+                op[1] = (uint16_t)((cg += wp[1]) & mask);
+                op[2] = (uint16_t)((cb += wp[2]) & mask);
+            }
+        }
+        else if (stride == 4)
+        {
+            op[0] = wp[0];
+            op[1] = wp[1];
+            op[2] = wp[2];
+            op[3] = wp[3];
+            cr = wp[0];
+            cg = wp[1];
+            cb = wp[2];
+            ca = wp[3];
+            n -= 4;
+            while (n > 0)
+            {
+                wp += 4;
+                op += 4;
+                n -= 4;
+                op[0] = (uint16_t)((cr += wp[0]) & mask);
+                op[1] = (uint16_t)((cg += wp[1]) & mask);
+                op[2] = (uint16_t)((cb += wp[2]) & mask);
+                op[3] = (uint16_t)((ca += wp[3]) & mask);
+            }
+        }
+        else
+        {
+            REPEAT(stride, *op = *wp & mask; wp++; op++)
+            n -= stride;
+            while (n > 0)
+            {
+                REPEAT(stride, wp[stride] += *wp; *op = *wp & mask; wp++; op++)
+                n -= stride;
+            }
+        }
     }
 }
 
-static void
-horizontalAccumulate8(uint16 *wp, int n, int stride, unsigned char *op,
-	unsigned char *ToLinear8)
+static void horizontalAccumulate8(uint16_t *wp, int n, int stride,
+                                  unsigned char *op, unsigned char *ToLinear8)
 {
-    register unsigned int  cr, cg, cb, ca, mask;
-
-    if (n >= stride) {
-	mask = CODE_MASK;
-	if (stride == 3) {
-	    op[0] = ToLinear8[cr = (wp[0] & mask)];
-	    op[1] = ToLinear8[cg = (wp[1] & mask)];
-	    op[2] = ToLinear8[cb = (wp[2] & mask)];
-	    n -= 3;
-	    while (n > 0) {
-		n -= 3;
-		wp += 3;
-		op += 3;
-		op[0] = ToLinear8[(cr += wp[0]) & mask];
-		op[1] = ToLinear8[(cg += wp[1]) & mask];
-		op[2] = ToLinear8[(cb += wp[2]) & mask];
-	    }
-	} else if (stride == 4) {
-	    op[0] = ToLinear8[cr = (wp[0] & mask)];
-	    op[1] = ToLinear8[cg = (wp[1] & mask)];
-	    op[2] = ToLinear8[cb = (wp[2] & mask)];
-	    op[3] = ToLinear8[ca = (wp[3] & mask)];
-	    n -= 4;
-	    while (n > 0) {
-		n -= 4;
-		wp += 4;
-		op += 4;
-		op[0] = ToLinear8[(cr += wp[0]) & mask];
-		op[1] = ToLinear8[(cg += wp[1]) & mask];
-		op[2] = ToLinear8[(cb += wp[2]) & mask];
-		op[3] = ToLinear8[(ca += wp[3]) & mask];
-	    }
-	} else {
-	    REPEAT(stride, *op = ToLinear8[*wp&mask]; wp++; op++)
-	    n -= stride;
-	    while (n > 0) {
-		REPEAT(stride,
-		    wp[stride] += *wp; *op = ToLinear8[*wp&mask]; wp++; op++)
-		n -= stride;
-	    }
-	}
+    register unsigned int cr, cg, cb, ca, mask;
+
+    if (n >= stride)
+    {
+        mask = CODE_MASK;
+        if (stride == 3)
+        {
+            op[0] = ToLinear8[cr = (wp[0] & mask)];
+            op[1] = ToLinear8[cg = (wp[1] & mask)];
+            op[2] = ToLinear8[cb = (wp[2] & mask)];
+            n -= 3;
+            while (n > 0)
+            {
+                n -= 3;
+                wp += 3;
+                op += 3;
+                op[0] = ToLinear8[(cr += wp[0]) & mask];
+                op[1] = ToLinear8[(cg += wp[1]) & mask];
+                op[2] = ToLinear8[(cb += wp[2]) & mask];
+            }
+        }
+        else if (stride == 4)
+        {
+            op[0] = ToLinear8[cr = (wp[0] & mask)];
+            op[1] = ToLinear8[cg = (wp[1] & mask)];
+            op[2] = ToLinear8[cb = (wp[2] & mask)];
+            op[3] = ToLinear8[ca = (wp[3] & mask)];
+            n -= 4;
+            while (n > 0)
+            {
+                n -= 4;
+                wp += 4;
+                op += 4;
+                op[0] = ToLinear8[(cr += wp[0]) & mask];
+                op[1] = ToLinear8[(cg += wp[1]) & mask];
+                op[2] = ToLinear8[(cb += wp[2]) & mask];
+                op[3] = ToLinear8[(ca += wp[3]) & mask];
+            }
+        }
+        else
+        {
+            REPEAT(stride, *op = ToLinear8[*wp & mask]; wp++; op++)
+            n -= stride;
+            while (n > 0)
+            {
+                REPEAT(stride, wp[stride] += *wp; *op = ToLinear8[*wp & mask];
+                       wp++; op++)
+                n -= stride;
+            }
+        }
     }
 }
 
-
-static void
-horizontalAccumulate8abgr(uint16 *wp, int n, int stride, unsigned char *op,
-	unsigned char *ToLinear8)
+static void horizontalAccumulate8abgr(uint16_t *wp, int n, int stride,
+                                      unsigned char *op,
+                                      unsigned char *ToLinear8)
 {
-    register unsigned int  cr, cg, cb, ca, mask;
-    register unsigned char  t0, t1, t2, t3;
-
-    if (n >= stride) {
-	mask = CODE_MASK;
-	if (stride == 3) {
-	    op[0] = 0;
-	    t1 = ToLinear8[cb = (wp[2] & mask)];
-	    t2 = ToLinear8[cg = (wp[1] & mask)];
-	    t3 = ToLinear8[cr = (wp[0] & mask)];
-	    op[1] = t1;
-	    op[2] = t2;
-	    op[3] = t3;
-	    n -= 3;
-	    while (n > 0) {
-		n -= 3;
-		wp += 3;
-		op += 4;
-		op[0] = 0;
-		t1 = ToLinear8[(cb += wp[2]) & mask];
-		t2 = ToLinear8[(cg += wp[1]) & mask];
-		t3 = ToLinear8[(cr += wp[0]) & mask];
-		op[1] = t1;
-		op[2] = t2;
-		op[3] = t3;
-	    }
-	} else if (stride == 4) {
-	    t0 = ToLinear8[ca = (wp[3] & mask)];
-	    t1 = ToLinear8[cb = (wp[2] & mask)];
-	    t2 = ToLinear8[cg = (wp[1] & mask)];
-	    t3 = ToLinear8[cr = (wp[0] & mask)];
-	    op[0] = t0;
-	    op[1] = t1;
-	    op[2] = t2;
-	    op[3] = t3;
-	    n -= 4;
-	    while (n > 0) {
-		n -= 4;
-		wp += 4;
-		op += 4;
-		t0 = ToLinear8[(ca += wp[3]) & mask];
-		t1 = ToLinear8[(cb += wp[2]) & mask];
-		t2 = ToLinear8[(cg += wp[1]) & mask];
-		t3 = ToLinear8[(cr += wp[0]) & mask];
-		op[0] = t0;
-		op[1] = t1;
-		op[2] = t2;
-		op[3] = t3;
-	    }
-	} else {
-	    REPEAT(stride, *op = ToLinear8[*wp&mask]; wp++; op++)
-	    n -= stride;
-	    while (n > 0) {
-		REPEAT(stride,
-		    wp[stride] += *wp; *op = ToLinear8[*wp&mask]; wp++; op++)
-		n -= stride;
-	    }
-	}
+    register unsigned int cr, cg, cb, ca, mask;
+    register unsigned char t0, t1, t2, t3;
+
+    if (n >= stride)
+    {
+        mask = CODE_MASK;
+        if (stride == 3)
+        {
+            op[0] = 0;
+            t1 = ToLinear8[cb = (wp[2] & mask)];
+            t2 = ToLinear8[cg = (wp[1] & mask)];
+            t3 = ToLinear8[cr = (wp[0] & mask)];
+            op[1] = t1;
+            op[2] = t2;
+            op[3] = t3;
+            n -= 3;
+            while (n > 0)
+            {
+                n -= 3;
+                wp += 3;
+                op += 4;
+                op[0] = 0;
+                t1 = ToLinear8[(cb += wp[2]) & mask];
+                t2 = ToLinear8[(cg += wp[1]) & mask];
+                t3 = ToLinear8[(cr += wp[0]) & mask];
+                op[1] = t1;
+                op[2] = t2;
+                op[3] = t3;
+            }
+        }
+        else if (stride == 4)
+        {
+            t0 = ToLinear8[ca = (wp[3] & mask)];
+            t1 = ToLinear8[cb = (wp[2] & mask)];
+            t2 = ToLinear8[cg = (wp[1] & mask)];
+            t3 = ToLinear8[cr = (wp[0] & mask)];
+            op[0] = t0;
+            op[1] = t1;
+            op[2] = t2;
+            op[3] = t3;
+            n -= 4;
+            while (n > 0)
+            {
+                n -= 4;
+                wp += 4;
+                op += 4;
+                t0 = ToLinear8[(ca += wp[3]) & mask];
+                t1 = ToLinear8[(cb += wp[2]) & mask];
+                t2 = ToLinear8[(cg += wp[1]) & mask];
+                t3 = ToLinear8[(cr += wp[0]) & mask];
+                op[0] = t0;
+                op[1] = t1;
+                op[2] = t2;
+                op[3] = t3;
+            }
+        }
+        else
+        {
+            REPEAT(stride, *op = ToLinear8[*wp & mask]; wp++; op++)
+            n -= stride;
+            while (n > 0)
+            {
+                REPEAT(stride, wp[stride] += *wp; *op = ToLinear8[*wp & mask];
+                       wp++; op++)
+                n -= stride;
+            }
+        }
     }
 }
 
@@ -454,110 +520,121 @@ horizontalAccumulate8abgr(uint16 *wp, int n, int stride, unsigned char *op,
  * State block for each open TIFF
  * file using PixarLog compression/decompression.
  */
-typedef	struct {
-	TIFFPredictorState	predict;
-	z_stream		stream;
-	tmsize_t		tbuf_size; /* only set/used on reading for now */
-	uint16			*tbuf; 
-	uint16			stride;
-	int			state;
-	int			user_datafmt;
-	int			quality;
+typedef struct
+{
+    TIFFPredictorState predict;
+    z_stream stream;
+    tmsize_t tbuf_size; /* only set/used on reading for now */
+    uint16_t *tbuf;
+    uint16_t stride;
+    int state;
+    int user_datafmt;
+    int quality;
 #define PLSTATE_INIT 1
 
-	TIFFVSetMethod		vgetparent;	/* super-class method */
-	TIFFVSetMethod		vsetparent;	/* super-class method */
+    TIFFVSetMethod vgetparent; /* super-class method */
+    TIFFVSetMethod vsetparent; /* super-class method */
+
+    float *ToLinearF;
+    uint16_t *ToLinear16;
+    unsigned char *ToLinear8;
+    uint16_t *FromLT2;
+    uint16_t *From14; /* Really for 16-bit data, but we shift down 2 */
+    uint16_t *From8;
 
-	float *ToLinearF;
-	uint16 *ToLinear16;
-	unsigned char *ToLinear8;
-	uint16  *FromLT2;
-	uint16  *From14; /* Really for 16-bit data, but we shift down 2 */
-	uint16  *From8;
-	
 } PixarLogState;
 
-static int
-PixarLogMakeTables(PixarLogState *sp)
+static int PixarLogMakeTables(TIFF *tif, PixarLogState *sp)
 {
 
-/*
- *    We make several tables here to convert between various external
- *    representations (float, 16-bit, and 8-bit) and the internal
- *    11-bit companded representation.  The 11-bit representation has two
- *    distinct regions.  A linear bottom end up through .018316 in steps
- *    of about .000073, and a region of constant ratio up to about 25.
- *    These floating point numbers are stored in the main table ToLinearF. 
- *    All other tables are derived from this one.  The tables (and the
- *    ratios) are continuous at the internal seam.
- */
+    /*
+     *    We make several tables here to convert between various external
+     *    representations (float, 16-bit, and 8-bit) and the internal
+     *    11-bit companded representation.  The 11-bit representation has two
+     *    distinct regions.  A linear bottom end up through .018316 in steps
+     *    of about .000073, and a region of constant ratio up to about 25.
+     *    These floating point numbers are stored in the main table ToLinearF.
+     *    All other tables are derived from this one.  The tables (and the
+     *    ratios) are continuous at the internal seam.
+     */
 
-    int  nlin, lt2size;
-    int  i, j;
-    double  b, c, linstep, v;
+    int nlin, lt2size;
+    int i, j;
+    double b, c, linstep, v;
     float *ToLinearF;
-    uint16 *ToLinear16;
+    uint16_t *ToLinear16;
     unsigned char *ToLinear8;
-    uint16  *FromLT2;
-    uint16  *From14; /* Really for 16-bit data, but we shift down 2 */
-    uint16  *From8;
-
-    c = log(RATIO);	
-    nlin = (int)(1./c);	/* nlin must be an integer */
-    c = 1./nlin;
-    b = exp(-c*ONE);	/* multiplicative scale factor [b*exp(c*ONE) = 1] */
-    linstep = b*c*exp(1.);
-
-    LogK1 = (float)(1./c);	/* if (v >= 2)  token = k1*log(v*k2) */
-    LogK2 = (float)(1./b);
-    lt2size = (int)(2./linstep) + 1;
-    FromLT2 = (uint16 *)_TIFFmalloc(lt2size*sizeof(uint16));
-    From14 = (uint16 *)_TIFFmalloc(16384*sizeof(uint16));
-    From8 = (uint16 *)_TIFFmalloc(256*sizeof(uint16));
-    ToLinearF = (float *)_TIFFmalloc(TSIZEP1 * sizeof(float));
-    ToLinear16 = (uint16 *)_TIFFmalloc(TSIZEP1 * sizeof(uint16));
-    ToLinear8 = (unsigned char *)_TIFFmalloc(TSIZEP1 * sizeof(unsigned char));
-    if (FromLT2 == NULL || From14  == NULL || From8   == NULL ||
-	 ToLinearF == NULL || ToLinear16 == NULL || ToLinear8 == NULL) {
-	if (FromLT2) _TIFFfree(FromLT2);
-	if (From14) _TIFFfree(From14);
-	if (From8) _TIFFfree(From8);
-	if (ToLinearF) _TIFFfree(ToLinearF);
-	if (ToLinear16) _TIFFfree(ToLinear16);
-	if (ToLinear8) _TIFFfree(ToLinear8);
-	sp->FromLT2 = NULL;
-	sp->From14 = NULL;
-	sp->From8 = NULL;
-	sp->ToLinearF = NULL;
-	sp->ToLinear16 = NULL;
-	sp->ToLinear8 = NULL;
-	return 0;
+    uint16_t *FromLT2;
+    uint16_t *From14; /* Really for 16-bit data, but we shift down 2 */
+    uint16_t *From8;
+
+    c = log(RATIO);
+    nlin = (int)(1. / c); /* nlin must be an integer */
+    c = 1. / nlin;
+    b = exp(-c * ONE); /* multiplicative scale factor [b*exp(c*ONE) = 1] */
+    linstep = b * c * exp(1.);
+
+    LogK1 = (float)(1. / c); /* if (v >= 2)  token = k1*log(v*k2) */
+    LogK2 = (float)(1. / b);
+    lt2size = (int)(2. / linstep) + 1;
+    FromLT2 = (uint16_t *)_TIFFmallocExt(tif, lt2size * sizeof(uint16_t));
+    From14 = (uint16_t *)_TIFFmallocExt(tif, 16384 * sizeof(uint16_t));
+    From8 = (uint16_t *)_TIFFmallocExt(tif, 256 * sizeof(uint16_t));
+    ToLinearF = (float *)_TIFFmallocExt(tif, TSIZEP1 * sizeof(float));
+    ToLinear16 = (uint16_t *)_TIFFmallocExt(tif, TSIZEP1 * sizeof(uint16_t));
+    ToLinear8 =
+        (unsigned char *)_TIFFmallocExt(tif, TSIZEP1 * sizeof(unsigned char));
+    if (FromLT2 == NULL || From14 == NULL || From8 == NULL ||
+        ToLinearF == NULL || ToLinear16 == NULL || ToLinear8 == NULL)
+    {
+        if (FromLT2)
+            _TIFFfreeExt(tif, FromLT2);
+        if (From14)
+            _TIFFfreeExt(tif, From14);
+        if (From8)
+            _TIFFfreeExt(tif, From8);
+        if (ToLinearF)
+            _TIFFfreeExt(tif, ToLinearF);
+        if (ToLinear16)
+            _TIFFfreeExt(tif, ToLinear16);
+        if (ToLinear8)
+            _TIFFfreeExt(tif, ToLinear8);
+        sp->FromLT2 = NULL;
+        sp->From14 = NULL;
+        sp->From8 = NULL;
+        sp->ToLinearF = NULL;
+        sp->ToLinear16 = NULL;
+        sp->ToLinear8 = NULL;
+        return 0;
     }
 
     j = 0;
 
-    for (i = 0; i < nlin; i++)  {
-	v = i * linstep;
-	ToLinearF[j++] = (float)v;
+    for (i = 0; i < nlin; i++)
+    {
+        v = i * linstep;
+        ToLinearF[j++] = (float)v;
     }
 
     for (i = nlin; i < TSIZE; i++)
-	ToLinearF[j++] = (float)(b*exp(c*i));
+        ToLinearF[j++] = (float)(b * exp(c * i));
 
     ToLinearF[2048] = ToLinearF[2047];
 
-    for (i = 0; i < TSIZEP1; i++)  {
-	v = ToLinearF[i]*65535.0 + 0.5;
-	ToLinear16[i] = (v > 65535.0) ? 65535 : (uint16)v;
-	v = ToLinearF[i]*255.0  + 0.5;
-	ToLinear8[i]  = (v > 255.0) ? 255 : (unsigned char)v;
+    for (i = 0; i < TSIZEP1; i++)
+    {
+        v = ToLinearF[i] * 65535.0 + 0.5;
+        ToLinear16[i] = (v > 65535.0) ? 65535 : (uint16_t)v;
+        v = ToLinearF[i] * 255.0 + 0.5;
+        ToLinear8[i] = (v > 255.0) ? 255 : (unsigned char)v;
     }
 
     j = 0;
-    for (i = 0; i < lt2size; i++)  {
-	if ((i*linstep)*(i*linstep) > ToLinearF[j]*ToLinearF[j+1])
-	    j++;
-	FromLT2[i] = (uint16)j;
+    for (i = 0; i < lt2size; i++)
+    {
+        if ((i * linstep) * (i * linstep) > ToLinearF[j] * ToLinearF[j + 1])
+            j++;
+        FromLT2[i] = (uint16_t)j;
     }
 
     /*
@@ -566,20 +643,22 @@ PixarLogMakeTables(PixarLogState *sp)
      * saves a little table space.
      */
     j = 0;
-    for (i = 0; i < 16384; i++)  {
-	while ((i/16383.)*(i/16383.) > ToLinearF[j]*ToLinearF[j+1])
-	    j++;
-	From14[i] = (uint16)j;
+    for (i = 0; i < 16384; i++)
+    {
+        while ((i / 16383.) * (i / 16383.) > ToLinearF[j] * ToLinearF[j + 1])
+            j++;
+        From14[i] = (uint16_t)j;
     }
 
     j = 0;
-    for (i = 0; i < 256; i++)  {
-	while ((i/255.)*(i/255.) > ToLinearF[j]*ToLinearF[j+1])
-	    j++;
-	From8[i] = (uint16)j;
+    for (i = 0; i < 256; i++)
+    {
+        while ((i / 255.) * (i / 255.) > ToLinearF[j] * ToLinearF[j + 1])
+            j++;
+        From8[i] = (uint16_t)j;
     }
 
-    Fltsize = (float)(lt2size/2);
+    Fltsize = (float)(lt2size / 2);
 
     sp->ToLinearF = ToLinearF;
     sp->ToLinear16 = ToLinear16;
@@ -591,622 +670,727 @@ PixarLogMakeTables(PixarLogState *sp)
     return 1;
 }
 
-#define DecoderState(tif)	((PixarLogState*) (tif)->tif_data)
-#define EncoderState(tif)	((PixarLogState*) (tif)->tif_data)
+#define DecoderState(tif) ((PixarLogState *)(tif)->tif_data)
+#define EncoderState(tif) ((PixarLogState *)(tif)->tif_data)
 
-static int PixarLogEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s);
-static int PixarLogDecode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s);
+static int PixarLogEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s);
+static int PixarLogDecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s);
 
-#define PIXARLOGDATAFMT_UNKNOWN	-1
+#define PIXARLOGDATAFMT_UNKNOWN -1
 
-static int
-PixarLogGuessDataFmt(TIFFDirectory *td)
+static int PixarLogGuessDataFmt(TIFFDirectory *td)
 {
-	int guess = PIXARLOGDATAFMT_UNKNOWN;
-	int format = td->td_sampleformat;
-
-	/* If the user didn't tell us his datafmt,
-	 * take our best guess from the bitspersample.
-	 */
-	switch (td->td_bitspersample) {
-	 case 32:
-		if (format == SAMPLEFORMAT_IEEEFP)
-			guess = PIXARLOGDATAFMT_FLOAT;
-		break;
-	 case 16:
-		if (format == SAMPLEFORMAT_VOID || format == SAMPLEFORMAT_UINT)
-			guess = PIXARLOGDATAFMT_16BIT;
-		break;
-	 case 12:
-		if (format == SAMPLEFORMAT_VOID || format == SAMPLEFORMAT_INT)
-			guess = PIXARLOGDATAFMT_12BITPICIO;
-		break;
-	 case 11:
-		if (format == SAMPLEFORMAT_VOID || format == SAMPLEFORMAT_UINT)
-			guess = PIXARLOGDATAFMT_11BITLOG;
-		break;
-	 case 8:
-		if (format == SAMPLEFORMAT_VOID || format == SAMPLEFORMAT_UINT)
-			guess = PIXARLOGDATAFMT_8BIT;
-		break;
-	}
-
-	return guess;
+    int guess = PIXARLOGDATAFMT_UNKNOWN;
+    int format = td->td_sampleformat;
+
+    /* If the user didn't tell us his datafmt,
+     * take our best guess from the bitspersample.
+     */
+    switch (td->td_bitspersample)
+    {
+        case 32:
+            if (format == SAMPLEFORMAT_IEEEFP)
+                guess = PIXARLOGDATAFMT_FLOAT;
+            break;
+        case 16:
+            if (format == SAMPLEFORMAT_VOID || format == SAMPLEFORMAT_UINT)
+                guess = PIXARLOGDATAFMT_16BIT;
+            break;
+        case 12:
+            if (format == SAMPLEFORMAT_VOID || format == SAMPLEFORMAT_INT)
+                guess = PIXARLOGDATAFMT_12BITPICIO;
+            break;
+        case 11:
+            if (format == SAMPLEFORMAT_VOID || format == SAMPLEFORMAT_UINT)
+                guess = PIXARLOGDATAFMT_11BITLOG;
+            break;
+        case 8:
+            if (format == SAMPLEFORMAT_VOID || format == SAMPLEFORMAT_UINT)
+                guess = PIXARLOGDATAFMT_8BIT;
+            break;
+    }
+
+    return guess;
 }
 
-static tmsize_t
-multiply_ms(tmsize_t m1, tmsize_t m2)
+static tmsize_t multiply_ms(tmsize_t m1, tmsize_t m2)
 {
-        return _TIFFMultiplySSize(NULL, m1, m2, NULL);
+    return _TIFFMultiplySSize(NULL, m1, m2, NULL);
 }
 
-static tmsize_t
-add_ms(tmsize_t m1, tmsize_t m2)
+static tmsize_t add_ms(tmsize_t m1, tmsize_t m2)
 {
-        assert(m1 >= 0 && m2 >= 0);
-	/* if either input is zero, assume overflow already occurred */
-	if (m1 == 0 || m2 == 0)
-		return 0;
-	else if (m1 > TIFF_TMSIZE_T_MAX - m2)
-		return 0;
-
-	return m1 + m2;
+    assert(m1 >= 0 && m2 >= 0);
+    /* if either input is zero, assume overflow already occurred */
+    if (m1 == 0 || m2 == 0)
+        return 0;
+    else if (m1 > TIFF_TMSIZE_T_MAX - m2)
+        return 0;
+
+    return m1 + m2;
 }
 
-static int
-PixarLogFixupTags(TIFF* tif)
+static int PixarLogFixupTags(TIFF *tif)
 {
-	(void) tif;
-	return (1);
+    (void)tif;
+    return (1);
 }
 
-static int
-PixarLogSetupDecode(TIFF* tif)
+static int PixarLogSetupDecode(TIFF *tif)
 {
-	static const char module[] = "PixarLogSetupDecode";
-	TIFFDirectory *td = &tif->tif_dir;
-	PixarLogState* sp = DecoderState(tif);
-	tmsize_t tbuf_size;
-        uint32 strip_height;
-
-	assert(sp != NULL);
-
-	/* This function can possibly be called several times by */
-	/* PredictorSetupDecode() if this function succeeds but */
-	/* PredictorSetup() fails */
-	if( (sp->state & PLSTATE_INIT) != 0 )
-		return 1;
-
-        strip_height = td->td_rowsperstrip;
-        if( strip_height > td->td_imagelength )
-            strip_height = td->td_imagelength;
-
-	/* Make sure no byte swapping happens on the data
-	 * after decompression. */
-	tif->tif_postdecode = _TIFFNoPostDecode;  
-
-	/* for some reason, we can't do this in TIFFInitPixarLog */
-
-	sp->stride = (td->td_planarconfig == PLANARCONFIG_CONTIG ?
-	    td->td_samplesperpixel : 1);
-	tbuf_size = multiply_ms(multiply_ms(multiply_ms(sp->stride, td->td_imagewidth),
-				      strip_height), sizeof(uint16));
-	/* add one more stride in case input ends mid-stride */
-	tbuf_size = add_ms(tbuf_size, sizeof(uint16) * sp->stride);
-	if (tbuf_size == 0)
-		return (0);   /* TODO: this is an error return without error report through TIFFErrorExt */
-	sp->tbuf = (uint16 *) _TIFFmalloc(tbuf_size);
-	if (sp->tbuf == NULL)
-		return (0);
-	sp->tbuf_size = tbuf_size;
-	if (sp->user_datafmt == PIXARLOGDATAFMT_UNKNOWN)
-		sp->user_datafmt = PixarLogGuessDataFmt(td);
-	if (sp->user_datafmt == PIXARLOGDATAFMT_UNKNOWN) {
-                _TIFFfree(sp->tbuf);
-                sp->tbuf = NULL;
-                sp->tbuf_size = 0;
-		TIFFErrorExt(tif->tif_clientdata, module,
-			"PixarLog compression can't handle bits depth/data format combination (depth: %d)", 
-			td->td_bitspersample);
-		return (0);
-	}
-
-	if (inflateInit(&sp->stream) != Z_OK) {
-                _TIFFfree(sp->tbuf);
-                sp->tbuf = NULL;
-                sp->tbuf_size = 0;
-		TIFFErrorExt(tif->tif_clientdata, module, "%s", sp->stream.msg ? sp->stream.msg : "(null)");
-		return (0);
-	} else {
-		sp->state |= PLSTATE_INIT;
-		return (1);
-	}
+    static const char module[] = "PixarLogSetupDecode";
+    TIFFDirectory *td = &tif->tif_dir;
+    PixarLogState *sp = DecoderState(tif);
+    tmsize_t tbuf_size;
+    uint32_t strip_height;
+
+    assert(sp != NULL);
+
+    /* This function can possibly be called several times by */
+    /* PredictorSetupDecode() if this function succeeds but */
+    /* PredictorSetup() fails */
+    if ((sp->state & PLSTATE_INIT) != 0)
+        return 1;
+
+    strip_height = td->td_rowsperstrip;
+    if (strip_height > td->td_imagelength)
+        strip_height = td->td_imagelength;
+
+    /* Make sure no byte swapping happens on the data
+     * after decompression. */
+    tif->tif_postdecode = _TIFFNoPostDecode;
+
+    /* for some reason, we can't do this in TIFFInitPixarLog */
+
+    sp->stride =
+        (td->td_planarconfig == PLANARCONFIG_CONTIG ? td->td_samplesperpixel
+                                                    : 1);
+    tbuf_size = multiply_ms(
+        multiply_ms(multiply_ms(sp->stride, td->td_imagewidth), strip_height),
+        sizeof(uint16_t));
+    /* add one more stride in case input ends mid-stride */
+    tbuf_size = add_ms(tbuf_size, sizeof(uint16_t) * sp->stride);
+    if (tbuf_size == 0)
+        return (0); /* TODO: this is an error return without error report
+                       through TIFFErrorExt */
+    sp->tbuf = (uint16_t *)_TIFFmallocExt(tif, tbuf_size);
+    if (sp->tbuf == NULL)
+        return (0);
+    sp->tbuf_size = tbuf_size;
+    if (sp->user_datafmt == PIXARLOGDATAFMT_UNKNOWN)
+        sp->user_datafmt = PixarLogGuessDataFmt(td);
+    if (sp->user_datafmt == PIXARLOGDATAFMT_UNKNOWN)
+    {
+        _TIFFfreeExt(tif, sp->tbuf);
+        sp->tbuf = NULL;
+        sp->tbuf_size = 0;
+        TIFFErrorExtR(tif, module,
+                      "PixarLog compression can't handle bits depth/data "
+                      "format combination (depth: %" PRIu16 ")",
+                      td->td_bitspersample);
+        return (0);
+    }
+
+    if (inflateInit(&sp->stream) != Z_OK)
+    {
+        _TIFFfreeExt(tif, sp->tbuf);
+        sp->tbuf = NULL;
+        sp->tbuf_size = 0;
+        TIFFErrorExtR(tif, module, "%s",
+                      sp->stream.msg ? sp->stream.msg : "(null)");
+        return (0);
+    }
+    else
+    {
+        sp->state |= PLSTATE_INIT;
+        return (1);
+    }
 }
 
 /*
  * Setup state for decoding a strip.
  */
-static int
-PixarLogPreDecode(TIFF* tif, uint16 s)
+static int PixarLogPreDecode(TIFF *tif, uint16_t s)
 {
-	static const char module[] = "PixarLogPreDecode";
-	PixarLogState* sp = DecoderState(tif);
-
-	(void) s;
-	assert(sp != NULL);
-	sp->stream.next_in = tif->tif_rawdata;
-	assert(sizeof(sp->stream.avail_in)==4);  /* if this assert gets raised,
-	    we need to simplify this code to reflect a ZLib that is likely updated
-	    to deal with 8byte memory sizes, though this code will respond
-	    appropriately even before we simplify it */
-	sp->stream.avail_in = (uInt) tif->tif_rawcc;
-	if ((tmsize_t)sp->stream.avail_in != tif->tif_rawcc)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "ZLib cannot deal with buffers this size");
-		return (0);
-	}
-	return (inflateReset(&sp->stream) == Z_OK);
+    static const char module[] = "PixarLogPreDecode";
+    PixarLogState *sp = DecoderState(tif);
+
+    (void)s;
+    assert(sp != NULL);
+    sp->stream.next_in = tif->tif_rawdata;
+    assert(sizeof(sp->stream.avail_in) == 4); /* if this assert gets raised,
+         we need to simplify this code to reflect a ZLib that is likely updated
+         to deal with 8byte memory sizes, though this code will respond
+         appropriately even before we simplify it */
+    sp->stream.avail_in = (uInt)tif->tif_rawcc;
+    if ((tmsize_t)sp->stream.avail_in != tif->tif_rawcc)
+    {
+        TIFFErrorExtR(tif, module, "ZLib cannot deal with buffers this size");
+        return (0);
+    }
+    return (inflateReset(&sp->stream) == Z_OK);
 }
 
-static int
-PixarLogDecode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s)
+static int PixarLogDecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s)
 {
-	static const char module[] = "PixarLogDecode";
-	TIFFDirectory *td = &tif->tif_dir;
-	PixarLogState* sp = DecoderState(tif);
-	tmsize_t i;
-	tmsize_t nsamples;
-	int llen;
-	uint16 *up;
-
-	switch (sp->user_datafmt) {
-	case PIXARLOGDATAFMT_FLOAT:
-		nsamples = occ / sizeof(float);	/* XXX float == 32 bits */
-		break;
-	case PIXARLOGDATAFMT_16BIT:
-	case PIXARLOGDATAFMT_12BITPICIO:
-	case PIXARLOGDATAFMT_11BITLOG:
-		nsamples = occ / sizeof(uint16); /* XXX uint16 == 16 bits */
-		break;
-	case PIXARLOGDATAFMT_8BIT:
-	case PIXARLOGDATAFMT_8BITABGR:
-		nsamples = occ;
-		break;
-	default:
-		TIFFErrorExt(tif->tif_clientdata, module,
-			"%d bit input not supported in PixarLog",
-			td->td_bitspersample);
-		return 0;
-	}
-
-	llen = sp->stride * td->td_imagewidth;
-
-	(void) s;
-	assert(sp != NULL);
-
-        sp->stream.next_in = tif->tif_rawcp;
-	sp->stream.avail_in = (uInt) tif->tif_rawcc;
-
-	sp->stream.next_out = (unsigned char *) sp->tbuf;
-	assert(sizeof(sp->stream.avail_out)==4);  /* if this assert gets raised,
-	    we need to simplify this code to reflect a ZLib that is likely updated
-	    to deal with 8byte memory sizes, though this code will respond
-	    appropriately even before we simplify it */
-	sp->stream.avail_out = (uInt) (nsamples * sizeof(uint16));
-	if (sp->stream.avail_out != nsamples * sizeof(uint16))
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "ZLib cannot deal with buffers this size");
-		return (0);
-	}
-	/* Check that we will not fill more than what was allocated */
-	if ((tmsize_t)sp->stream.avail_out > sp->tbuf_size)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "sp->stream.avail_out > sp->tbuf_size");
-		return (0);
-	}
-	do {
-		int state = inflate(&sp->stream, Z_PARTIAL_FLUSH);
-		if (state == Z_STREAM_END) {
-			break;			/* XXX */
-		}
-		if (state == Z_DATA_ERROR) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Decoding error at scanline %lu, %s",
-			    (unsigned long) tif->tif_row, sp->stream.msg ? sp->stream.msg : "(null)");
-			return (0);
-		}
-		if (state != Z_OK) {
-			TIFFErrorExt(tif->tif_clientdata, module, "ZLib error: %s",
-			    sp->stream.msg ? sp->stream.msg : "(null)");
-			return (0);
-		}
-	} while (sp->stream.avail_out > 0);
-
-	/* hopefully, we got all the bytes we needed */
-	if (sp->stream.avail_out != 0) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Not enough data at scanline %lu (short " TIFF_UINT64_FORMAT " bytes)",
-		    (unsigned long) tif->tif_row, (TIFF_UINT64_T) sp->stream.avail_out);
-		return (0);
-	}
-
-        tif->tif_rawcp = sp->stream.next_in;
-        tif->tif_rawcc = sp->stream.avail_in;
-
-	up = sp->tbuf;
-	/* Swap bytes in the data if from a different endian machine. */
-	if (tif->tif_flags & TIFF_SWAB)
-		TIFFSwabArrayOfShort(up, nsamples);
-
-	/*
-	 * if llen is not an exact multiple of nsamples, the decode operation
-	 * may overflow the output buffer, so truncate it enough to prevent
-	 * that but still salvage as much data as possible.
-	 */
-	if (nsamples % llen) { 
-		TIFFWarningExt(tif->tif_clientdata, module,
-			"stride %lu is not a multiple of sample count, "
-			"%lu, data truncated.", (unsigned long) llen, (unsigned long) nsamples);
-		nsamples -= nsamples % llen;
-	}
-
-	for (i = 0; i < nsamples; i += llen, up += llen) {
-		switch (sp->user_datafmt)  {
-		case PIXARLOGDATAFMT_FLOAT:
-			horizontalAccumulateF(up, llen, sp->stride,
-					(float *)op, sp->ToLinearF);
-			op += llen * sizeof(float);
-			break;
-		case PIXARLOGDATAFMT_16BIT:
-			horizontalAccumulate16(up, llen, sp->stride,
-					(uint16 *)op, sp->ToLinear16);
-			op += llen * sizeof(uint16);
-			break;
-		case PIXARLOGDATAFMT_12BITPICIO:
-			horizontalAccumulate12(up, llen, sp->stride,
-					(int16 *)op, sp->ToLinearF);
-			op += llen * sizeof(int16);
-			break;
-		case PIXARLOGDATAFMT_11BITLOG:
-			horizontalAccumulate11(up, llen, sp->stride,
-					(uint16 *)op);
-			op += llen * sizeof(uint16);
-			break;
-		case PIXARLOGDATAFMT_8BIT:
-			horizontalAccumulate8(up, llen, sp->stride,
-					(unsigned char *)op, sp->ToLinear8);
-			op += llen * sizeof(unsigned char);
-			break;
-		case PIXARLOGDATAFMT_8BITABGR:
-			horizontalAccumulate8abgr(up, llen, sp->stride,
-					(unsigned char *)op, sp->ToLinear8);
-			op += llen * sizeof(unsigned char);
-			break;
-		default:
-			TIFFErrorExt(tif->tif_clientdata, module,
-				  "Unsupported bits/sample: %d",
-				  td->td_bitspersample);
-			return (0);
-		}
-	}
-
-	return (1);
+    static const char module[] = "PixarLogDecode";
+    TIFFDirectory *td = &tif->tif_dir;
+    PixarLogState *sp = DecoderState(tif);
+    tmsize_t i;
+    tmsize_t nsamples;
+    int llen;
+    uint16_t *up;
+
+    switch (sp->user_datafmt)
+    {
+        case PIXARLOGDATAFMT_FLOAT:
+            nsamples = occ / sizeof(float); /* XXX float == 32 bits */
+            break;
+        case PIXARLOGDATAFMT_16BIT:
+        case PIXARLOGDATAFMT_12BITPICIO:
+        case PIXARLOGDATAFMT_11BITLOG:
+            nsamples = occ / sizeof(uint16_t); /* XXX uint16_t == 16 bits */
+            break;
+        case PIXARLOGDATAFMT_8BIT:
+        case PIXARLOGDATAFMT_8BITABGR:
+            nsamples = occ;
+            break;
+        default:
+            TIFFErrorExtR(tif, module,
+                          "%" PRIu16 " bit input not supported in PixarLog",
+                          td->td_bitspersample);
+            return 0;
+    }
+
+    llen = sp->stride * td->td_imagewidth;
+
+    (void)s;
+    assert(sp != NULL);
+
+    sp->stream.next_in = tif->tif_rawcp;
+    sp->stream.avail_in = (uInt)tif->tif_rawcc;
+
+    sp->stream.next_out = (unsigned char *)sp->tbuf;
+    assert(sizeof(sp->stream.avail_out) == 4); /* if this assert gets raised,
+         we need to simplify this code to reflect a ZLib that is likely updated
+         to deal with 8byte memory sizes, though this code will respond
+         appropriately even before we simplify it */
+    sp->stream.avail_out = (uInt)(nsamples * sizeof(uint16_t));
+    if (sp->stream.avail_out != nsamples * sizeof(uint16_t))
+    {
+        TIFFErrorExtR(tif, module, "ZLib cannot deal with buffers this size");
+        return (0);
+    }
+    /* Check that we will not fill more than what was allocated */
+    if ((tmsize_t)sp->stream.avail_out > sp->tbuf_size)
+    {
+        TIFFErrorExtR(tif, module, "sp->stream.avail_out > sp->tbuf_size");
+        return (0);
+    }
+    do
+    {
+        int state = inflate(&sp->stream, Z_PARTIAL_FLUSH);
+        if (state == Z_STREAM_END)
+        {
+            break; /* XXX */
+        }
+        if (state == Z_DATA_ERROR)
+        {
+            TIFFErrorExtR(
+                tif, module, "Decoding error at scanline %" PRIu32 ", %s",
+                tif->tif_row, sp->stream.msg ? sp->stream.msg : "(null)");
+            return (0);
+        }
+        if (state != Z_OK)
+        {
+            TIFFErrorExtR(tif, module, "ZLib error: %s",
+                          sp->stream.msg ? sp->stream.msg : "(null)");
+            return (0);
+        }
+    } while (sp->stream.avail_out > 0);
+
+    /* hopefully, we got all the bytes we needed */
+    if (sp->stream.avail_out != 0)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Not enough data at scanline %" PRIu32
+                      " (short %u bytes)",
+                      tif->tif_row, sp->stream.avail_out);
+        return (0);
+    }
+
+    tif->tif_rawcp = sp->stream.next_in;
+    tif->tif_rawcc = sp->stream.avail_in;
+
+    up = sp->tbuf;
+    /* Swap bytes in the data if from a different endian machine. */
+    if (tif->tif_flags & TIFF_SWAB)
+        TIFFSwabArrayOfShort(up, nsamples);
+
+    /*
+     * if llen is not an exact multiple of nsamples, the decode operation
+     * may overflow the output buffer, so truncate it enough to prevent
+     * that but still salvage as much data as possible.
+     */
+    if (nsamples % llen)
+    {
+        TIFFWarningExtR(tif, module,
+                        "stride %d is not a multiple of sample count, "
+                        "%" TIFF_SSIZE_FORMAT ", data truncated.",
+                        llen, nsamples);
+        nsamples -= nsamples % llen;
+    }
+
+    for (i = 0; i < nsamples; i += llen, up += llen)
+    {
+        switch (sp->user_datafmt)
+        {
+            case PIXARLOGDATAFMT_FLOAT:
+                horizontalAccumulateF(up, llen, sp->stride, (float *)op,
+                                      sp->ToLinearF);
+                op += llen * sizeof(float);
+                break;
+            case PIXARLOGDATAFMT_16BIT:
+                horizontalAccumulate16(up, llen, sp->stride, (uint16_t *)op,
+                                       sp->ToLinear16);
+                op += llen * sizeof(uint16_t);
+                break;
+            case PIXARLOGDATAFMT_12BITPICIO:
+                horizontalAccumulate12(up, llen, sp->stride, (int16_t *)op,
+                                       sp->ToLinearF);
+                op += llen * sizeof(int16_t);
+                break;
+            case PIXARLOGDATAFMT_11BITLOG:
+                horizontalAccumulate11(up, llen, sp->stride, (uint16_t *)op);
+                op += llen * sizeof(uint16_t);
+                break;
+            case PIXARLOGDATAFMT_8BIT:
+                horizontalAccumulate8(up, llen, sp->stride, (unsigned char *)op,
+                                      sp->ToLinear8);
+                op += llen * sizeof(unsigned char);
+                break;
+            case PIXARLOGDATAFMT_8BITABGR:
+                horizontalAccumulate8abgr(up, llen, sp->stride,
+                                          (unsigned char *)op, sp->ToLinear8);
+                op += llen * sizeof(unsigned char);
+                break;
+            default:
+                TIFFErrorExtR(tif, module, "Unsupported bits/sample: %" PRIu16,
+                              td->td_bitspersample);
+                return (0);
+        }
+    }
+
+    return (1);
 }
 
-static int
-PixarLogSetupEncode(TIFF* tif)
+static int PixarLogSetupEncode(TIFF *tif)
 {
-	static const char module[] = "PixarLogSetupEncode";
-	TIFFDirectory *td = &tif->tif_dir;
-	PixarLogState* sp = EncoderState(tif);
-	tmsize_t tbuf_size;
-
-	assert(sp != NULL);
-
-	/* for some reason, we can't do this in TIFFInitPixarLog */
-
-	sp->stride = (td->td_planarconfig == PLANARCONFIG_CONTIG ?
-	    td->td_samplesperpixel : 1);
-	tbuf_size = multiply_ms(multiply_ms(multiply_ms(sp->stride, td->td_imagewidth),
-				      td->td_rowsperstrip), sizeof(uint16));
-	if (tbuf_size == 0)
-		return (0);  /* TODO: this is an error return without error report through TIFFErrorExt */
-	sp->tbuf = (uint16 *) _TIFFmalloc(tbuf_size);
-	if (sp->tbuf == NULL)
-		return (0);
-	if (sp->user_datafmt == PIXARLOGDATAFMT_UNKNOWN)
-		sp->user_datafmt = PixarLogGuessDataFmt(td);
-	if (sp->user_datafmt == PIXARLOGDATAFMT_UNKNOWN) {
-		TIFFErrorExt(tif->tif_clientdata, module, "PixarLog compression can't handle %d bit linear encodings", td->td_bitspersample);
-		return (0);
-	}
-
-	if (deflateInit(&sp->stream, sp->quality) != Z_OK) {
-		TIFFErrorExt(tif->tif_clientdata, module, "%s", sp->stream.msg ? sp->stream.msg : "(null)");
-		return (0);
-	} else {
-		sp->state |= PLSTATE_INIT;
-		return (1);
-	}
+    static const char module[] = "PixarLogSetupEncode";
+    TIFFDirectory *td = &tif->tif_dir;
+    PixarLogState *sp = EncoderState(tif);
+    tmsize_t tbuf_size;
+
+    assert(sp != NULL);
+
+    /* for some reason, we can't do this in TIFFInitPixarLog */
+
+    sp->stride =
+        (td->td_planarconfig == PLANARCONFIG_CONTIG ? td->td_samplesperpixel
+                                                    : 1);
+    tbuf_size =
+        multiply_ms(multiply_ms(multiply_ms(sp->stride, td->td_imagewidth),
+                                td->td_rowsperstrip),
+                    sizeof(uint16_t));
+    if (tbuf_size == 0)
+        return (0); /* TODO: this is an error return without error report
+                       through TIFFErrorExt */
+    sp->tbuf = (uint16_t *)_TIFFmallocExt(tif, tbuf_size);
+    if (sp->tbuf == NULL)
+        return (0);
+    if (sp->user_datafmt == PIXARLOGDATAFMT_UNKNOWN)
+        sp->user_datafmt = PixarLogGuessDataFmt(td);
+    if (sp->user_datafmt == PIXARLOGDATAFMT_UNKNOWN)
+    {
+        TIFFErrorExtR(tif, module,
+                      "PixarLog compression can't handle %" PRIu16
+                      " bit linear encodings",
+                      td->td_bitspersample);
+        return (0);
+    }
+
+    if (deflateInit(&sp->stream, sp->quality) != Z_OK)
+    {
+        TIFFErrorExtR(tif, module, "%s",
+                      sp->stream.msg ? sp->stream.msg : "(null)");
+        return (0);
+    }
+    else
+    {
+        sp->state |= PLSTATE_INIT;
+        return (1);
+    }
 }
 
 /*
  * Reset encoding state at the start of a strip.
  */
-static int
-PixarLogPreEncode(TIFF* tif, uint16 s)
+static int PixarLogPreEncode(TIFF *tif, uint16_t s)
 {
-	static const char module[] = "PixarLogPreEncode";
-	PixarLogState *sp = EncoderState(tif);
-
-	(void) s;
-	assert(sp != NULL);
-	sp->stream.next_out = tif->tif_rawdata;
-	assert(sizeof(sp->stream.avail_out)==4);  /* if this assert gets raised,
-	    we need to simplify this code to reflect a ZLib that is likely updated
-	    to deal with 8byte memory sizes, though this code will respond
-	    appropriately even before we simplify it */
-	sp->stream.avail_out = (uInt)tif->tif_rawdatasize;
-	if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "ZLib cannot deal with buffers this size");
-		return (0);
-	}
-	return (deflateReset(&sp->stream) == Z_OK);
+    static const char module[] = "PixarLogPreEncode";
+    PixarLogState *sp = EncoderState(tif);
+
+    (void)s;
+    assert(sp != NULL);
+    sp->stream.next_out = tif->tif_rawdata;
+    assert(sizeof(sp->stream.avail_out) == 4); /* if this assert gets raised,
+         we need to simplify this code to reflect a ZLib that is likely updated
+         to deal with 8byte memory sizes, though this code will respond
+         appropriately even before we simplify it */
+    sp->stream.avail_out = (uInt)tif->tif_rawdatasize;
+    if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize)
+    {
+        TIFFErrorExtR(tif, module, "ZLib cannot deal with buffers this size");
+        return (0);
+    }
+    return (deflateReset(&sp->stream) == Z_OK);
 }
 
-static void
-horizontalDifferenceF(float *ip, int n, int stride, uint16 *wp, uint16 *FromLT2)
+static void horizontalDifferenceF(float *ip, int n, int stride, uint16_t *wp,
+                                  uint16_t *FromLT2)
 {
-    int32 r1, g1, b1, a1, r2, g2, b2, a2, mask;
+    int32_t r1, g1, b1, a1, r2, g2, b2, a2, mask;
     float fltsize = Fltsize;
 
-#define  CLAMP(v) ( (v<(float)0.)   ? 0				\
-		  : (v<(float)2.)   ? FromLT2[(int)(v*fltsize)]	\
-		  : (v>(float)24.2) ? 2047			\
-		  : LogK1*log(v*LogK2) + 0.5 )
+#define CLAMP(v)                                                               \
+    ((v < (float)0.)     ? 0                                                   \
+     : (v < (float)2.)   ? FromLT2[(int)(v * fltsize)]                         \
+     : (v > (float)24.2) ? 2047                                                \
+                         : LogK1 * log(v * LogK2) + 0.5)
 
     mask = CODE_MASK;
-    if (n >= stride) {
-	if (stride == 3) {
-	    r2 = wp[0] = (uint16) CLAMP(ip[0]);
-	    g2 = wp[1] = (uint16) CLAMP(ip[1]);
-	    b2 = wp[2] = (uint16) CLAMP(ip[2]);
-	    n -= 3;
-	    while (n > 0) {
-		n -= 3;
-		wp += 3;
-		ip += 3;
-		r1 = (int32) CLAMP(ip[0]); wp[0] = (uint16)((r1-r2) & mask); r2 = r1;
-		g1 = (int32) CLAMP(ip[1]); wp[1] = (uint16)((g1-g2) & mask); g2 = g1;
-		b1 = (int32) CLAMP(ip[2]); wp[2] = (uint16)((b1-b2) & mask); b2 = b1;
-	    }
-	} else if (stride == 4) {
-	    r2 = wp[0] = (uint16) CLAMP(ip[0]);
-	    g2 = wp[1] = (uint16) CLAMP(ip[1]);
-	    b2 = wp[2] = (uint16) CLAMP(ip[2]);
-	    a2 = wp[3] = (uint16) CLAMP(ip[3]);
-	    n -= 4;
-	    while (n > 0) {
-		n -= 4;
-		wp += 4;
-		ip += 4;
-		r1 = (int32) CLAMP(ip[0]); wp[0] = (uint16)((r1-r2) & mask); r2 = r1;
-		g1 = (int32) CLAMP(ip[1]); wp[1] = (uint16)((g1-g2) & mask); g2 = g1;
-		b1 = (int32) CLAMP(ip[2]); wp[2] = (uint16)((b1-b2) & mask); b2 = b1;
-		a1 = (int32) CLAMP(ip[3]); wp[3] = (uint16)((a1-a2) & mask); a2 = a1;
-	    }
-	} else {
-        REPEAT(stride, wp[0] = (uint16) CLAMP(ip[0]); wp++; ip++)
-        n -= stride;
-        while (n > 0) {
-            REPEAT(stride,
-                wp[0] = (uint16)(((int32)CLAMP(ip[0])-(int32)CLAMP(ip[-stride])) & mask);
-                wp++; ip++)
+    if (n >= stride)
+    {
+        if (stride == 3)
+        {
+            r2 = wp[0] = (uint16_t)CLAMP(ip[0]);
+            g2 = wp[1] = (uint16_t)CLAMP(ip[1]);
+            b2 = wp[2] = (uint16_t)CLAMP(ip[2]);
+            n -= 3;
+            while (n > 0)
+            {
+                n -= 3;
+                wp += 3;
+                ip += 3;
+                r1 = (int32_t)CLAMP(ip[0]);
+                wp[0] = (uint16_t)((r1 - r2) & mask);
+                r2 = r1;
+                g1 = (int32_t)CLAMP(ip[1]);
+                wp[1] = (uint16_t)((g1 - g2) & mask);
+                g2 = g1;
+                b1 = (int32_t)CLAMP(ip[2]);
+                wp[2] = (uint16_t)((b1 - b2) & mask);
+                b2 = b1;
+            }
+        }
+        else if (stride == 4)
+        {
+            r2 = wp[0] = (uint16_t)CLAMP(ip[0]);
+            g2 = wp[1] = (uint16_t)CLAMP(ip[1]);
+            b2 = wp[2] = (uint16_t)CLAMP(ip[2]);
+            a2 = wp[3] = (uint16_t)CLAMP(ip[3]);
+            n -= 4;
+            while (n > 0)
+            {
+                n -= 4;
+                wp += 4;
+                ip += 4;
+                r1 = (int32_t)CLAMP(ip[0]);
+                wp[0] = (uint16_t)((r1 - r2) & mask);
+                r2 = r1;
+                g1 = (int32_t)CLAMP(ip[1]);
+                wp[1] = (uint16_t)((g1 - g2) & mask);
+                g2 = g1;
+                b1 = (int32_t)CLAMP(ip[2]);
+                wp[2] = (uint16_t)((b1 - b2) & mask);
+                b2 = b1;
+                a1 = (int32_t)CLAMP(ip[3]);
+                wp[3] = (uint16_t)((a1 - a2) & mask);
+                a2 = a1;
+            }
+        }
+        else
+        {
+            REPEAT(stride, wp[0] = (uint16_t)CLAMP(ip[0]); wp++; ip++)
             n -= stride;
+            while (n > 0)
+            {
+                REPEAT(stride,
+                       wp[0] = (uint16_t)(((int32_t)CLAMP(ip[0]) -
+                                           (int32_t)CLAMP(ip[-stride])) &
+                                          mask);
+                       wp++; ip++)
+                n -= stride;
+            }
         }
-	}
     }
 }
 
-static void
-horizontalDifference16(unsigned short *ip, int n, int stride, 
-	unsigned short *wp, uint16 *From14)
+static void horizontalDifference16(unsigned short *ip, int n, int stride,
+                                   unsigned short *wp, uint16_t *From14)
 {
-    register int  r1, g1, b1, a1, r2, g2, b2, a2, mask;
+    register int r1, g1, b1, a1, r2, g2, b2, a2, mask;
 
 /* assumption is unsigned pixel values */
-#undef   CLAMP
-#define  CLAMP(v) From14[(v) >> 2]
+#undef CLAMP
+#define CLAMP(v) From14[(v) >> 2]
 
     mask = CODE_MASK;
-    if (n >= stride) {
-	if (stride == 3) {
-	    r2 = wp[0] = CLAMP(ip[0]);  g2 = wp[1] = CLAMP(ip[1]);
-	    b2 = wp[2] = CLAMP(ip[2]);
-	    n -= 3;
-	    while (n > 0) {
-		n -= 3;
-		wp += 3;
-		ip += 3;
-		r1 = CLAMP(ip[0]); wp[0] = (uint16)((r1-r2) & mask); r2 = r1;
-		g1 = CLAMP(ip[1]); wp[1] = (uint16)((g1-g2) & mask); g2 = g1;
-		b1 = CLAMP(ip[2]); wp[2] = (uint16)((b1-b2) & mask); b2 = b1;
-	    }
-	} else if (stride == 4) {
-	    r2 = wp[0] = CLAMP(ip[0]);  g2 = wp[1] = CLAMP(ip[1]);
-	    b2 = wp[2] = CLAMP(ip[2]);  a2 = wp[3] = CLAMP(ip[3]);
-	    n -= 4;
-	    while (n > 0) {
-		n -= 4;
-		wp += 4;
-		ip += 4;
-		r1 = CLAMP(ip[0]); wp[0] = (uint16)((r1-r2) & mask); r2 = r1;
-		g1 = CLAMP(ip[1]); wp[1] = (uint16)((g1-g2) & mask); g2 = g1;
-		b1 = CLAMP(ip[2]); wp[2] = (uint16)((b1-b2) & mask); b2 = b1;
-		a1 = CLAMP(ip[3]); wp[3] = (uint16)((a1-a2) & mask); a2 = a1;
-	    }
-	} else {
-        REPEAT(stride, wp[0] = CLAMP(ip[0]); wp++; ip++)
-	    n -= stride;
-	    while (n > 0) {
-            REPEAT(stride,
-                wp[0] = (uint16)((CLAMP(ip[0])-CLAMP(ip[-stride])) & mask);
-                wp++; ip++)
+    if (n >= stride)
+    {
+        if (stride == 3)
+        {
+            r2 = wp[0] = CLAMP(ip[0]);
+            g2 = wp[1] = CLAMP(ip[1]);
+            b2 = wp[2] = CLAMP(ip[2]);
+            n -= 3;
+            while (n > 0)
+            {
+                n -= 3;
+                wp += 3;
+                ip += 3;
+                r1 = CLAMP(ip[0]);
+                wp[0] = (uint16_t)((r1 - r2) & mask);
+                r2 = r1;
+                g1 = CLAMP(ip[1]);
+                wp[1] = (uint16_t)((g1 - g2) & mask);
+                g2 = g1;
+                b1 = CLAMP(ip[2]);
+                wp[2] = (uint16_t)((b1 - b2) & mask);
+                b2 = b1;
+            }
+        }
+        else if (stride == 4)
+        {
+            r2 = wp[0] = CLAMP(ip[0]);
+            g2 = wp[1] = CLAMP(ip[1]);
+            b2 = wp[2] = CLAMP(ip[2]);
+            a2 = wp[3] = CLAMP(ip[3]);
+            n -= 4;
+            while (n > 0)
+            {
+                n -= 4;
+                wp += 4;
+                ip += 4;
+                r1 = CLAMP(ip[0]);
+                wp[0] = (uint16_t)((r1 - r2) & mask);
+                r2 = r1;
+                g1 = CLAMP(ip[1]);
+                wp[1] = (uint16_t)((g1 - g2) & mask);
+                g2 = g1;
+                b1 = CLAMP(ip[2]);
+                wp[2] = (uint16_t)((b1 - b2) & mask);
+                b2 = b1;
+                a1 = CLAMP(ip[3]);
+                wp[3] = (uint16_t)((a1 - a2) & mask);
+                a2 = a1;
+            }
+        }
+        else
+        {
+            REPEAT(stride, wp[0] = CLAMP(ip[0]); wp++; ip++)
             n -= stride;
+            while (n > 0)
+            {
+                REPEAT(stride,
+                       wp[0] = (uint16_t)((CLAMP(ip[0]) - CLAMP(ip[-stride])) &
+                                          mask);
+                       wp++; ip++)
+                n -= stride;
+            }
         }
-	}
     }
 }
 
-
-static void
-horizontalDifference8(unsigned char *ip, int n, int stride, 
-	unsigned short *wp, uint16 *From8)
+static void horizontalDifference8(unsigned char *ip, int n, int stride,
+                                  unsigned short *wp, uint16_t *From8)
 {
-    register int  r1, g1, b1, a1, r2, g2, b2, a2, mask;
+    register int r1, g1, b1, a1, r2, g2, b2, a2, mask;
 
-#undef	 CLAMP
-#define  CLAMP(v) (From8[(v)])
+#undef CLAMP
+#define CLAMP(v) (From8[(v)])
 
     mask = CODE_MASK;
-    if (n >= stride) {
-	if (stride == 3) {
-	    r2 = wp[0] = CLAMP(ip[0]);  g2 = wp[1] = CLAMP(ip[1]);
-	    b2 = wp[2] = CLAMP(ip[2]);
-	    n -= 3;
-	    while (n > 0) {
-		n -= 3;
-		r1 = CLAMP(ip[3]); wp[3] = (uint16)((r1-r2) & mask); r2 = r1;
-		g1 = CLAMP(ip[4]); wp[4] = (uint16)((g1-g2) & mask); g2 = g1;
-		b1 = CLAMP(ip[5]); wp[5] = (uint16)((b1-b2) & mask); b2 = b1;
-		wp += 3;
-		ip += 3;
-	    }
-	} else if (stride == 4) {
-	    r2 = wp[0] = CLAMP(ip[0]);  g2 = wp[1] = CLAMP(ip[1]);
-	    b2 = wp[2] = CLAMP(ip[2]);  a2 = wp[3] = CLAMP(ip[3]);
-	    n -= 4;
-	    while (n > 0) {
-		n -= 4;
-		r1 = CLAMP(ip[4]); wp[4] = (uint16)((r1-r2) & mask); r2 = r1;
-		g1 = CLAMP(ip[5]); wp[5] = (uint16)((g1-g2) & mask); g2 = g1;
-		b1 = CLAMP(ip[6]); wp[6] = (uint16)((b1-b2) & mask); b2 = b1;
-		a1 = CLAMP(ip[7]); wp[7] = (uint16)((a1-a2) & mask); a2 = a1;
-		wp += 4;
-		ip += 4;
-	    }
-	} else {
-        REPEAT(stride, wp[0] = CLAMP(ip[0]); wp++; ip++)
-        n -= stride;
-        while (n > 0) {
-            REPEAT(stride,
-                wp[0] = (uint16)((CLAMP(ip[0])-CLAMP(ip[-stride])) & mask);
-                wp++; ip++)
+    if (n >= stride)
+    {
+        if (stride == 3)
+        {
+            r2 = wp[0] = CLAMP(ip[0]);
+            g2 = wp[1] = CLAMP(ip[1]);
+            b2 = wp[2] = CLAMP(ip[2]);
+            n -= 3;
+            while (n > 0)
+            {
+                n -= 3;
+                r1 = CLAMP(ip[3]);
+                wp[3] = (uint16_t)((r1 - r2) & mask);
+                r2 = r1;
+                g1 = CLAMP(ip[4]);
+                wp[4] = (uint16_t)((g1 - g2) & mask);
+                g2 = g1;
+                b1 = CLAMP(ip[5]);
+                wp[5] = (uint16_t)((b1 - b2) & mask);
+                b2 = b1;
+                wp += 3;
+                ip += 3;
+            }
+        }
+        else if (stride == 4)
+        {
+            r2 = wp[0] = CLAMP(ip[0]);
+            g2 = wp[1] = CLAMP(ip[1]);
+            b2 = wp[2] = CLAMP(ip[2]);
+            a2 = wp[3] = CLAMP(ip[3]);
+            n -= 4;
+            while (n > 0)
+            {
+                n -= 4;
+                r1 = CLAMP(ip[4]);
+                wp[4] = (uint16_t)((r1 - r2) & mask);
+                r2 = r1;
+                g1 = CLAMP(ip[5]);
+                wp[5] = (uint16_t)((g1 - g2) & mask);
+                g2 = g1;
+                b1 = CLAMP(ip[6]);
+                wp[6] = (uint16_t)((b1 - b2) & mask);
+                b2 = b1;
+                a1 = CLAMP(ip[7]);
+                wp[7] = (uint16_t)((a1 - a2) & mask);
+                a2 = a1;
+                wp += 4;
+                ip += 4;
+            }
+        }
+        else
+        {
+            REPEAT(stride, wp[0] = CLAMP(ip[0]); wp++; ip++)
             n -= stride;
+            while (n > 0)
+            {
+                REPEAT(stride,
+                       wp[0] = (uint16_t)((CLAMP(ip[0]) - CLAMP(ip[-stride])) &
+                                          mask);
+                       wp++; ip++)
+                n -= stride;
+            }
         }
     }
-    }
 }
 
 /*
  * Encode a chunk of pixels.
  */
-static int
-PixarLogEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int PixarLogEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	static const char module[] = "PixarLogEncode";
-	TIFFDirectory *td = &tif->tif_dir;
-	PixarLogState *sp = EncoderState(tif);
-	tmsize_t i;
-	tmsize_t n;
-	int llen;
-	unsigned short * up;
-
-	(void) s;
-
-	switch (sp->user_datafmt) {
-	case PIXARLOGDATAFMT_FLOAT:
-		n = cc / sizeof(float);		/* XXX float == 32 bits */
-		break;
-	case PIXARLOGDATAFMT_16BIT:
-	case PIXARLOGDATAFMT_12BITPICIO:
-	case PIXARLOGDATAFMT_11BITLOG:
-		n = cc / sizeof(uint16);	/* XXX uint16 == 16 bits */
-		break;
-	case PIXARLOGDATAFMT_8BIT:
-	case PIXARLOGDATAFMT_8BITABGR:
-		n = cc;
-		break;
-	default:
-		TIFFErrorExt(tif->tif_clientdata, module,
-			"%d bit input not supported in PixarLog",
-			td->td_bitspersample);
-		return 0;
-	}
-
-	llen = sp->stride * td->td_imagewidth;
-    /* Check against the number of elements (of size uint16) of sp->tbuf */
-    if( n > ((tmsize_t)td->td_rowsperstrip * llen) )
+    static const char module[] = "PixarLogEncode";
+    TIFFDirectory *td = &tif->tif_dir;
+    PixarLogState *sp = EncoderState(tif);
+    tmsize_t i;
+    tmsize_t n;
+    int llen;
+    unsigned short *up;
+
+    (void)s;
+
+    switch (sp->user_datafmt)
+    {
+        case PIXARLOGDATAFMT_FLOAT:
+            n = cc / sizeof(float); /* XXX float == 32 bits */
+            break;
+        case PIXARLOGDATAFMT_16BIT:
+        case PIXARLOGDATAFMT_12BITPICIO:
+        case PIXARLOGDATAFMT_11BITLOG:
+            n = cc / sizeof(uint16_t); /* XXX uint16_t == 16 bits */
+            break;
+        case PIXARLOGDATAFMT_8BIT:
+        case PIXARLOGDATAFMT_8BITABGR:
+            n = cc;
+            break;
+        default:
+            TIFFErrorExtR(tif, module,
+                          "%" PRIu16 " bit input not supported in PixarLog",
+                          td->td_bitspersample);
+            return 0;
+    }
+
+    llen = sp->stride * td->td_imagewidth;
+    /* Check against the number of elements (of size uint16_t) of sp->tbuf */
+    if (n > ((tmsize_t)td->td_rowsperstrip * llen))
     {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                     "Too many input bytes provided");
+        TIFFErrorExtR(tif, module, "Too many input bytes provided");
         return 0;
     }
 
-	for (i = 0, up = sp->tbuf; i < n; i += llen, up += llen) {
-		switch (sp->user_datafmt)  {
-		case PIXARLOGDATAFMT_FLOAT:
-			horizontalDifferenceF((float *)bp, llen, 
-				sp->stride, up, sp->FromLT2);
-			bp += llen * sizeof(float);
-			break;
-		case PIXARLOGDATAFMT_16BIT:
-			horizontalDifference16((uint16 *)bp, llen, 
-				sp->stride, up, sp->From14);
-			bp += llen * sizeof(uint16);
-			break;
-		case PIXARLOGDATAFMT_8BIT:
-			horizontalDifference8((unsigned char *)bp, llen, 
-				sp->stride, up, sp->From8);
-			bp += llen * sizeof(unsigned char);
-			break;
-		default:
-			TIFFErrorExt(tif->tif_clientdata, module,
-				"%d bit input not supported in PixarLog",
-				td->td_bitspersample);
-			return 0;
-		}
-	}
- 
-	sp->stream.next_in = (unsigned char *) sp->tbuf;
-	assert(sizeof(sp->stream.avail_in)==4);  /* if this assert gets raised,
-	    we need to simplify this code to reflect a ZLib that is likely updated
-	    to deal with 8byte memory sizes, though this code will respond
-	    appropriately even before we simplify it */
-	sp->stream.avail_in = (uInt) (n * sizeof(uint16));
-	if ((sp->stream.avail_in / sizeof(uint16)) != (uInt) n)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "ZLib cannot deal with buffers this size");
-		return (0);
-	}
-
-	do {
-		if (deflate(&sp->stream, Z_NO_FLUSH) != Z_OK) {
-			TIFFErrorExt(tif->tif_clientdata, module, "Encoder error: %s",
-			    sp->stream.msg ? sp->stream.msg : "(null)");
-			return (0);
-		}
-		if (sp->stream.avail_out == 0) {
-			tif->tif_rawcc = tif->tif_rawdatasize;
-			if (!TIFFFlushData1(tif))
-				return 0;
-			sp->stream.next_out = tif->tif_rawdata;
-			sp->stream.avail_out = (uInt) tif->tif_rawdatasize;  /* this is a safe typecast, as check is made already in PixarLogPreEncode */
-		}
-	} while (sp->stream.avail_in > 0);
-	return (1);
+    for (i = 0, up = sp->tbuf; i < n; i += llen, up += llen)
+    {
+        switch (sp->user_datafmt)
+        {
+            case PIXARLOGDATAFMT_FLOAT:
+                horizontalDifferenceF((float *)bp, llen, sp->stride, up,
+                                      sp->FromLT2);
+                bp += llen * sizeof(float);
+                break;
+            case PIXARLOGDATAFMT_16BIT:
+                horizontalDifference16((uint16_t *)bp, llen, sp->stride, up,
+                                       sp->From14);
+                bp += llen * sizeof(uint16_t);
+                break;
+            case PIXARLOGDATAFMT_8BIT:
+                horizontalDifference8((unsigned char *)bp, llen, sp->stride, up,
+                                      sp->From8);
+                bp += llen * sizeof(unsigned char);
+                break;
+            default:
+                TIFFErrorExtR(tif, module,
+                              "%" PRIu16 " bit input not supported in PixarLog",
+                              td->td_bitspersample);
+                return 0;
+        }
+    }
+
+    sp->stream.next_in = (unsigned char *)sp->tbuf;
+    assert(sizeof(sp->stream.avail_in) == 4); /* if this assert gets raised,
+         we need to simplify this code to reflect a ZLib that is likely updated
+         to deal with 8byte memory sizes, though this code will respond
+         appropriately even before we simplify it */
+    sp->stream.avail_in = (uInt)(n * sizeof(uint16_t));
+    if ((sp->stream.avail_in / sizeof(uint16_t)) != (uInt)n)
+    {
+        TIFFErrorExtR(tif, module, "ZLib cannot deal with buffers this size");
+        return (0);
+    }
+
+    do
+    {
+        if (deflate(&sp->stream, Z_NO_FLUSH) != Z_OK)
+        {
+            TIFFErrorExtR(tif, module, "Encoder error: %s",
+                          sp->stream.msg ? sp->stream.msg : "(null)");
+            return (0);
+        }
+        if (sp->stream.avail_out == 0)
+        {
+            tif->tif_rawcc = tif->tif_rawdatasize;
+            if (!TIFFFlushData1(tif))
+                return 0;
+            sp->stream.next_out = tif->tif_rawdata;
+            sp->stream.avail_out =
+                (uInt)tif
+                    ->tif_rawdatasize; /* this is a safe typecast, as check is
+                                          made already in PixarLogPreEncode */
+        }
+    } while (sp->stream.avail_in > 0);
+    return (1);
 }
 
 /*
@@ -1214,267 +1398,273 @@ PixarLogEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
  * string and tacking on an End Of Information code.
  */
 
-static int
-PixarLogPostEncode(TIFF* tif)
+static int PixarLogPostEncode(TIFF *tif)
 {
-	static const char module[] = "PixarLogPostEncode";
-	PixarLogState *sp = EncoderState(tif);
-	int state;
-
-	sp->stream.avail_in = 0;
-
-	do {
-		state = deflate(&sp->stream, Z_FINISH);
-		switch (state) {
-		case Z_STREAM_END:
-		case Z_OK:
-		    if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize) {
-			    tif->tif_rawcc =
-				tif->tif_rawdatasize - sp->stream.avail_out;
-			    if (!TIFFFlushData1(tif))
-                                return 0;
-			    sp->stream.next_out = tif->tif_rawdata;
-			    sp->stream.avail_out = (uInt) tif->tif_rawdatasize;  /* this is a safe typecast, as check is made already in PixarLogPreEncode */
-		    }
-		    break;
-		default:
-			TIFFErrorExt(tif->tif_clientdata, module, "ZLib error: %s",
-			sp->stream.msg ? sp->stream.msg : "(null)");
-		    return (0);
-		}
-	} while (state != Z_STREAM_END);
-	return (1);
+    static const char module[] = "PixarLogPostEncode";
+    PixarLogState *sp = EncoderState(tif);
+    int state;
+
+    sp->stream.avail_in = 0;
+
+    do
+    {
+        state = deflate(&sp->stream, Z_FINISH);
+        switch (state)
+        {
+            case Z_STREAM_END:
+            case Z_OK:
+                if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize)
+                {
+                    tif->tif_rawcc =
+                        tif->tif_rawdatasize - sp->stream.avail_out;
+                    if (!TIFFFlushData1(tif))
+                        return 0;
+                    sp->stream.next_out = tif->tif_rawdata;
+                    sp->stream.avail_out =
+                        (uInt)tif->tif_rawdatasize; /* this is a safe typecast,
+                                                       as check is made already
+                                                       in PixarLogPreEncode */
+                }
+                break;
+            default:
+                TIFFErrorExtR(tif, module, "ZLib error: %s",
+                              sp->stream.msg ? sp->stream.msg : "(null)");
+                return (0);
+        }
+    } while (state != Z_STREAM_END);
+    return (1);
 }
 
-static void
-PixarLogClose(TIFF* tif)
+static void PixarLogClose(TIFF *tif)
 {
-        PixarLogState* sp = (PixarLogState*) tif->tif_data;
-	TIFFDirectory *td = &tif->tif_dir;
-
-	assert(sp != 0);
-	/* In a really sneaky (and really incorrect, and untruthful, and
-	 * troublesome, and error-prone) maneuver that completely goes against
-	 * the spirit of TIFF, and breaks TIFF, on close, we covertly
-	 * modify both bitspersample and sampleformat in the directory to
-	 * indicate 8-bit linear.  This way, the decode "just works" even for
-	 * readers that don't know about PixarLog, or how to set
-	 * the PIXARLOGDATFMT pseudo-tag.
-	 */
-
-        if (sp->state&PLSTATE_INIT) {
-            /* We test the state to avoid an issue such as in
-             * http://bugzilla.maptools.org/show_bug.cgi?id=2604
-             * What appends in that case is that the bitspersample is 1 and
-             * a TransferFunction is set. The size of the TransferFunction
-             * depends on 1<<bitspersample. So if we increase it, an access
-             * out of the buffer will happen at directory flushing.
-             * Another option would be to clear those targs. 
-             */
-            td->td_bitspersample = 8;
-            td->td_sampleformat = SAMPLEFORMAT_UINT;
-        }
+    PixarLogState *sp = (PixarLogState *)tif->tif_data;
+    TIFFDirectory *td = &tif->tif_dir;
+
+    assert(sp != 0);
+    /* In a really sneaky (and really incorrect, and untruthful, and
+     * troublesome, and error-prone) maneuver that completely goes against
+     * the spirit of TIFF, and breaks TIFF, on close, we covertly
+     * modify both bitspersample and sampleformat in the directory to
+     * indicate 8-bit linear.  This way, the decode "just works" even for
+     * readers that don't know about PixarLog, or how to set
+     * the PIXARLOGDATFMT pseudo-tag.
+     */
+
+    if (sp->state & PLSTATE_INIT)
+    {
+        /* We test the state to avoid an issue such as in
+         * http://bugzilla.maptools.org/show_bug.cgi?id=2604
+         * What appends in that case is that the bitspersample is 1 and
+         * a TransferFunction is set. The size of the TransferFunction
+         * depends on 1<<bitspersample. So if we increase it, an access
+         * out of the buffer will happen at directory flushing.
+         * Another option would be to clear those targs.
+         */
+        td->td_bitspersample = 8;
+        td->td_sampleformat = SAMPLEFORMAT_UINT;
+    }
 }
 
-static void
-PixarLogCleanup(TIFF* tif)
+static void PixarLogCleanup(TIFF *tif)
 {
-	PixarLogState* sp = (PixarLogState*) tif->tif_data;
-
-	assert(sp != 0);
-
-	(void)TIFFPredictorCleanup(tif);
-
-	tif->tif_tagmethods.vgetfield = sp->vgetparent;
-	tif->tif_tagmethods.vsetfield = sp->vsetparent;
-
-	if (sp->FromLT2) _TIFFfree(sp->FromLT2);
-	if (sp->From14) _TIFFfree(sp->From14);
-	if (sp->From8) _TIFFfree(sp->From8);
-	if (sp->ToLinearF) _TIFFfree(sp->ToLinearF);
-	if (sp->ToLinear16) _TIFFfree(sp->ToLinear16);
-	if (sp->ToLinear8) _TIFFfree(sp->ToLinear8);
-	if (sp->state&PLSTATE_INIT) {
-		if (tif->tif_mode == O_RDONLY)
-			inflateEnd(&sp->stream);
-		else
-			deflateEnd(&sp->stream);
-	}
-	if (sp->tbuf)
-		_TIFFfree(sp->tbuf);
-	_TIFFfree(sp);
-	tif->tif_data = NULL;
-
-	_TIFFSetDefaultCompressionState(tif);
+    PixarLogState *sp = (PixarLogState *)tif->tif_data;
+
+    assert(sp != 0);
+
+    (void)TIFFPredictorCleanup(tif);
+
+    tif->tif_tagmethods.vgetfield = sp->vgetparent;
+    tif->tif_tagmethods.vsetfield = sp->vsetparent;
+
+    if (sp->FromLT2)
+        _TIFFfreeExt(tif, sp->FromLT2);
+    if (sp->From14)
+        _TIFFfreeExt(tif, sp->From14);
+    if (sp->From8)
+        _TIFFfreeExt(tif, sp->From8);
+    if (sp->ToLinearF)
+        _TIFFfreeExt(tif, sp->ToLinearF);
+    if (sp->ToLinear16)
+        _TIFFfreeExt(tif, sp->ToLinear16);
+    if (sp->ToLinear8)
+        _TIFFfreeExt(tif, sp->ToLinear8);
+    if (sp->state & PLSTATE_INIT)
+    {
+        if (tif->tif_mode == O_RDONLY)
+            inflateEnd(&sp->stream);
+        else
+            deflateEnd(&sp->stream);
+    }
+    if (sp->tbuf)
+        _TIFFfreeExt(tif, sp->tbuf);
+    _TIFFfreeExt(tif, sp);
+    tif->tif_data = NULL;
+
+    _TIFFSetDefaultCompressionState(tif);
 }
 
-static int
-PixarLogVSetField(TIFF* tif, uint32 tag, va_list ap)
+static int PixarLogVSetField(TIFF *tif, uint32_t tag, va_list ap)
 {
     static const char module[] = "PixarLogVSetField";
     PixarLogState *sp = (PixarLogState *)tif->tif_data;
     int result;
 
-    switch (tag) {
-     case TIFFTAG_PIXARLOGQUALITY:
-		sp->quality = (int) va_arg(ap, int);
-		if (tif->tif_mode != O_RDONLY && (sp->state&PLSTATE_INIT)) {
-			if (deflateParams(&sp->stream,
-			    sp->quality, Z_DEFAULT_STRATEGY) != Z_OK) {
-				TIFFErrorExt(tif->tif_clientdata, module, "ZLib error: %s",
-					sp->stream.msg ? sp->stream.msg : "(null)");
-				return (0);
-			}
-		}
-		return (1);
-     case TIFFTAG_PIXARLOGDATAFMT:
-	sp->user_datafmt = (int) va_arg(ap, int);
-	/* Tweak the TIFF header so that the rest of libtiff knows what
-	 * size of data will be passed between app and library, and
-	 * assume that the app knows what it is doing and is not
-	 * confused by these header manipulations...
-	 */
-	switch (sp->user_datafmt) {
-	 case PIXARLOGDATAFMT_8BIT:
-	 case PIXARLOGDATAFMT_8BITABGR:
-	    TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, 8);
-	    TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_UINT);
-	    break;
-	 case PIXARLOGDATAFMT_11BITLOG:
-	    TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, 16);
-	    TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_UINT);
-	    break;
-	 case PIXARLOGDATAFMT_12BITPICIO:
-	    TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, 16);
-	    TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_INT);
-	    break;
-	 case PIXARLOGDATAFMT_16BIT:
-	    TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, 16);
-	    TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_UINT);
-	    break;
-	 case PIXARLOGDATAFMT_FLOAT:
-	    TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, 32);
-	    TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_IEEEFP);
-	    break;
-	}
-	/*
-	 * Must recalculate sizes should bits/sample change.
-	 */
-	tif->tif_tilesize = isTiled(tif) ? TIFFTileSize(tif) : (tmsize_t)(-1);
-	tif->tif_scanlinesize = TIFFScanlineSize(tif);
-	result = 1;		/* NB: pseudo tag */
-	break;
-     default:
-	result = (*sp->vsetparent)(tif, tag, ap);
+    switch (tag)
+    {
+        case TIFFTAG_PIXARLOGQUALITY:
+            sp->quality = (int)va_arg(ap, int);
+            if (tif->tif_mode != O_RDONLY && (sp->state & PLSTATE_INIT))
+            {
+                if (deflateParams(&sp->stream, sp->quality,
+                                  Z_DEFAULT_STRATEGY) != Z_OK)
+                {
+                    TIFFErrorExtR(tif, module, "ZLib error: %s",
+                                  sp->stream.msg ? sp->stream.msg : "(null)");
+                    return (0);
+                }
+            }
+            return (1);
+        case TIFFTAG_PIXARLOGDATAFMT:
+            sp->user_datafmt = (int)va_arg(ap, int);
+            /* Tweak the TIFF header so that the rest of libtiff knows what
+             * size of data will be passed between app and library, and
+             * assume that the app knows what it is doing and is not
+             * confused by these header manipulations...
+             */
+            switch (sp->user_datafmt)
+            {
+                case PIXARLOGDATAFMT_8BIT:
+                case PIXARLOGDATAFMT_8BITABGR:
+                    TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, 8);
+                    TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_UINT);
+                    break;
+                case PIXARLOGDATAFMT_11BITLOG:
+                    TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, 16);
+                    TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_UINT);
+                    break;
+                case PIXARLOGDATAFMT_12BITPICIO:
+                    TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, 16);
+                    TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_INT);
+                    break;
+                case PIXARLOGDATAFMT_16BIT:
+                    TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, 16);
+                    TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_UINT);
+                    break;
+                case PIXARLOGDATAFMT_FLOAT:
+                    TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, 32);
+                    TIFFSetField(tif, TIFFTAG_SAMPLEFORMAT,
+                                 SAMPLEFORMAT_IEEEFP);
+                    break;
+            }
+            /*
+             * Must recalculate sizes should bits/sample change.
+             */
+            tif->tif_tilesize =
+                isTiled(tif) ? TIFFTileSize(tif) : (tmsize_t)(-1);
+            tif->tif_scanlinesize = TIFFScanlineSize(tif);
+            result = 1; /* NB: pseudo tag */
+            break;
+        default:
+            result = (*sp->vsetparent)(tif, tag, ap);
     }
     return (result);
 }
 
-static int
-PixarLogVGetField(TIFF* tif, uint32 tag, va_list ap)
+static int PixarLogVGetField(TIFF *tif, uint32_t tag, va_list ap)
 {
     PixarLogState *sp = (PixarLogState *)tif->tif_data;
 
-    switch (tag) {
-     case TIFFTAG_PIXARLOGQUALITY:
-	*va_arg(ap, int*) = sp->quality;
-	break;
-     case TIFFTAG_PIXARLOGDATAFMT:
-	*va_arg(ap, int*) = sp->user_datafmt;
-	break;
-     default:
-	return (*sp->vgetparent)(tif, tag, ap);
+    switch (tag)
+    {
+        case TIFFTAG_PIXARLOGQUALITY:
+            *va_arg(ap, int *) = sp->quality;
+            break;
+        case TIFFTAG_PIXARLOGDATAFMT:
+            *va_arg(ap, int *) = sp->user_datafmt;
+            break;
+        default:
+            return (*sp->vgetparent)(tif, tag, ap);
     }
     return (1);
 }
 
 static const TIFFField pixarlogFields[] = {
-    {TIFFTAG_PIXARLOGDATAFMT, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE, "", NULL},
-    {TIFFTAG_PIXARLOGQUALITY, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE, "", NULL}
-};
+    {TIFFTAG_PIXARLOGDATAFMT, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE, "", NULL},
+    {TIFFTAG_PIXARLOGQUALITY, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, FALSE, FALSE, "", NULL}};
 
-int
-TIFFInitPixarLog(TIFF* tif, int scheme)
+int TIFFInitPixarLog(TIFF *tif, int scheme)
 {
-	static const char module[] = "TIFFInitPixarLog";
-
-	PixarLogState* sp;
-
-        (void)scheme;
-	assert(scheme == COMPRESSION_PIXARLOG);
-
-	/*
-	 * Merge codec-specific tag information.
-	 */
-	if (!_TIFFMergeFields(tif, pixarlogFields,
-			      TIFFArrayCount(pixarlogFields))) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "Merging PixarLog codec-specific tags failed");
-		return 0;
-	}
-
-	/*
-	 * Allocate state block so tag methods have storage to record values.
-	 */
-	tif->tif_data = (uint8*) _TIFFmalloc(sizeof (PixarLogState));
-	if (tif->tif_data == NULL)
-		goto bad;
-	sp = (PixarLogState*) tif->tif_data;
-	_TIFFmemset(sp, 0, sizeof (*sp));
-	sp->stream.data_type = Z_BINARY;
-	sp->user_datafmt = PIXARLOGDATAFMT_UNKNOWN;
-
-	/*
-	 * Install codec methods.
-	 */
-	tif->tif_fixuptags = PixarLogFixupTags; 
-	tif->tif_setupdecode = PixarLogSetupDecode;
-	tif->tif_predecode = PixarLogPreDecode;
-	tif->tif_decoderow = PixarLogDecode;
-	tif->tif_decodestrip = PixarLogDecode;  
-	tif->tif_decodetile = PixarLogDecode;
-	tif->tif_setupencode = PixarLogSetupEncode;
-	tif->tif_preencode = PixarLogPreEncode;
-	tif->tif_postencode = PixarLogPostEncode;
-	tif->tif_encoderow = PixarLogEncode;  
-	tif->tif_encodestrip = PixarLogEncode;
-	tif->tif_encodetile = PixarLogEncode;  
-	tif->tif_close = PixarLogClose;
-	tif->tif_cleanup = PixarLogCleanup;
-
-	/* Override SetField so we can handle our private pseudo-tag */
-	sp->vgetparent = tif->tif_tagmethods.vgetfield;
-	tif->tif_tagmethods.vgetfield = PixarLogVGetField;   /* hook for codec tags */
-	sp->vsetparent = tif->tif_tagmethods.vsetfield;
-	tif->tif_tagmethods.vsetfield = PixarLogVSetField;   /* hook for codec tags */
-
-	/* Default values for codec-specific fields */
-	sp->quality = Z_DEFAULT_COMPRESSION; /* default comp. level */
-	sp->state = 0;
-
-	/* we don't wish to use the predictor, 
-	 * the default is none, which predictor value 1
-	 */
-	(void) TIFFPredictorInit(tif);
-
-	/*
-	 * build the companding tables 
-	 */
-	PixarLogMakeTables(sp);
-
-	return (1);
+    static const char module[] = "TIFFInitPixarLog";
+
+    PixarLogState *sp;
+
+    (void)scheme;
+    assert(scheme == COMPRESSION_PIXARLOG);
+
+    /*
+     * Merge codec-specific tag information.
+     */
+    if (!_TIFFMergeFields(tif, pixarlogFields, TIFFArrayCount(pixarlogFields)))
+    {
+        TIFFErrorExtR(tif, module,
+                      "Merging PixarLog codec-specific tags failed");
+        return 0;
+    }
+
+    /*
+     * Allocate state block so tag methods have storage to record values.
+     */
+    tif->tif_data = (uint8_t *)_TIFFmallocExt(tif, sizeof(PixarLogState));
+    if (tif->tif_data == NULL)
+        goto bad;
+    sp = (PixarLogState *)tif->tif_data;
+    _TIFFmemset(sp, 0, sizeof(*sp));
+    sp->stream.data_type = Z_BINARY;
+    sp->user_datafmt = PIXARLOGDATAFMT_UNKNOWN;
+
+    /*
+     * Install codec methods.
+     */
+    tif->tif_fixuptags = PixarLogFixupTags;
+    tif->tif_setupdecode = PixarLogSetupDecode;
+    tif->tif_predecode = PixarLogPreDecode;
+    tif->tif_decoderow = PixarLogDecode;
+    tif->tif_decodestrip = PixarLogDecode;
+    tif->tif_decodetile = PixarLogDecode;
+    tif->tif_setupencode = PixarLogSetupEncode;
+    tif->tif_preencode = PixarLogPreEncode;
+    tif->tif_postencode = PixarLogPostEncode;
+    tif->tif_encoderow = PixarLogEncode;
+    tif->tif_encodestrip = PixarLogEncode;
+    tif->tif_encodetile = PixarLogEncode;
+    tif->tif_close = PixarLogClose;
+    tif->tif_cleanup = PixarLogCleanup;
+
+    /* Override SetField so we can handle our private pseudo-tag */
+    sp->vgetparent = tif->tif_tagmethods.vgetfield;
+    tif->tif_tagmethods.vgetfield = PixarLogVGetField; /* hook for codec tags */
+    sp->vsetparent = tif->tif_tagmethods.vsetfield;
+    tif->tif_tagmethods.vsetfield = PixarLogVSetField; /* hook for codec tags */
+
+    /* Default values for codec-specific fields */
+    sp->quality = Z_DEFAULT_COMPRESSION; /* default comp. level */
+    sp->state = 0;
+
+    /* we don't wish to use the predictor,
+     * the default is none, which predictor value 1
+     */
+    (void)TIFFPredictorInit(tif);
+
+    /*
+     * build the companding tables
+     */
+    PixarLogMakeTables(tif, sp);
+
+    return (1);
 bad:
-	TIFFErrorExt(tif->tif_clientdata, module,
-		     "No space for PixarLog state block");
-	return (0);
+    TIFFErrorExtR(tif, module, "No space for PixarLog state block");
+    return (0);
 }
 #endif /* PIXARLOG_SUPPORT */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_predict.c b/3rdparty/libtiff/tif_predict.c
index c0233974590f..386b5fe82adc 100644
--- a/3rdparty/libtiff/tif_predict.c
+++ b/3rdparty/libtiff/tif_predict.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -27,246 +27,309 @@
  *
  * Predictor Tag Support (used by multiple codecs).
  */
-#include "tiffiop.h"
 #include "tif_predict.h"
+#include "tiffiop.h"
 
-#define	PredictorState(tif)	((TIFFPredictorState*) (tif)->tif_data)
-
-static int horAcc8(TIFF* tif, uint8* cp0, tmsize_t cc);
-static int horAcc16(TIFF* tif, uint8* cp0, tmsize_t cc);
-static int horAcc32(TIFF* tif, uint8* cp0, tmsize_t cc);
-static int swabHorAcc16(TIFF* tif, uint8* cp0, tmsize_t cc);
-static int swabHorAcc32(TIFF* tif, uint8* cp0, tmsize_t cc);
-static int horDiff8(TIFF* tif, uint8* cp0, tmsize_t cc);
-static int horDiff16(TIFF* tif, uint8* cp0, tmsize_t cc);
-static int horDiff32(TIFF* tif, uint8* cp0, tmsize_t cc);
-static int swabHorDiff16(TIFF* tif, uint8* cp0, tmsize_t cc);
-static int swabHorDiff32(TIFF* tif, uint8* cp0, tmsize_t cc);
-static int fpAcc(TIFF* tif, uint8* cp0, tmsize_t cc);
-static int fpDiff(TIFF* tif, uint8* cp0, tmsize_t cc);
-static int PredictorDecodeRow(TIFF* tif, uint8* op0, tmsize_t occ0, uint16 s);
-static int PredictorDecodeTile(TIFF* tif, uint8* op0, tmsize_t occ0, uint16 s);
-static int PredictorEncodeRow(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s);
-static int PredictorEncodeTile(TIFF* tif, uint8* bp0, tmsize_t cc0, uint16 s);
-
-static int
-PredictorSetup(TIFF* tif)
+#define PredictorState(tif) ((TIFFPredictorState *)(tif)->tif_data)
+
+static int horAcc8(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int horAcc16(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int horAcc32(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int horAcc64(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int swabHorAcc16(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int swabHorAcc32(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int swabHorAcc64(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int horDiff8(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int horDiff16(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int horDiff32(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int horDiff64(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int swabHorDiff16(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int swabHorDiff32(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int swabHorDiff64(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int fpAcc(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int fpDiff(TIFF *tif, uint8_t *cp0, tmsize_t cc);
+static int PredictorDecodeRow(TIFF *tif, uint8_t *op0, tmsize_t occ0,
+                              uint16_t s);
+static int PredictorDecodeTile(TIFF *tif, uint8_t *op0, tmsize_t occ0,
+                               uint16_t s);
+static int PredictorEncodeRow(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s);
+static int PredictorEncodeTile(TIFF *tif, uint8_t *bp0, tmsize_t cc0,
+                               uint16_t s);
+
+static int PredictorSetup(TIFF *tif)
 {
-	static const char module[] = "PredictorSetup";
-
-	TIFFPredictorState* sp = PredictorState(tif);
-	TIFFDirectory* td = &tif->tif_dir;
-
-	switch (sp->predictor)		/* no differencing */
-	{
-		case PREDICTOR_NONE:
-			return 1;
-		case PREDICTOR_HORIZONTAL:
-			if (td->td_bitspersample != 8
-			    && td->td_bitspersample != 16
-			    && td->td_bitspersample != 32) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-				    "Horizontal differencing \"Predictor\" not supported with %d-bit samples",
-				    td->td_bitspersample);
-				return 0;
-			}
-			break;
-		case PREDICTOR_FLOATINGPOINT:
-			if (td->td_sampleformat != SAMPLEFORMAT_IEEEFP) {
-				TIFFErrorExt(tif->tif_clientdata, module,
-				    "Floating point \"Predictor\" not supported with %d data format",
-				    td->td_sampleformat);
-				return 0;
-			}
-                        if (td->td_bitspersample != 16
-                            && td->td_bitspersample != 24
-                            && td->td_bitspersample != 32
-                            && td->td_bitspersample != 64) { /* Should 64 be allowed? */
-                                TIFFErrorExt(tif->tif_clientdata, module,
-                                             "Floating point \"Predictor\" not supported with %d-bit samples",
-                                             td->td_bitspersample);
-				return 0;
-                            }
-			break;
-		default:
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "\"Predictor\" value %d not supported",
-			    sp->predictor);
-			return 0;
-	}
-	sp->stride = (td->td_planarconfig == PLANARCONFIG_CONTIG ?
-	    td->td_samplesperpixel : 1);
-	/*
-	 * Calculate the scanline/tile-width size in bytes.
-	 */
-	if (isTiled(tif))
-		sp->rowsize = TIFFTileRowSize(tif);
-	else
-		sp->rowsize = TIFFScanlineSize(tif);
-	if (sp->rowsize == 0)
-		return 0;
-
-	return 1;
+    static const char module[] = "PredictorSetup";
+
+    TIFFPredictorState *sp = PredictorState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
+
+    switch (sp->predictor) /* no differencing */
+    {
+        case PREDICTOR_NONE:
+            return 1;
+        case PREDICTOR_HORIZONTAL:
+            if (td->td_bitspersample != 8 && td->td_bitspersample != 16 &&
+                td->td_bitspersample != 32 && td->td_bitspersample != 64)
+            {
+                TIFFErrorExtR(tif, module,
+                              "Horizontal differencing \"Predictor\" not "
+                              "supported with %" PRIu16 "-bit samples",
+                              td->td_bitspersample);
+                return 0;
+            }
+            break;
+        case PREDICTOR_FLOATINGPOINT:
+            if (td->td_sampleformat != SAMPLEFORMAT_IEEEFP)
+            {
+                TIFFErrorExtR(
+                    tif, module,
+                    "Floating point \"Predictor\" not supported with %" PRIu16
+                    " data format",
+                    td->td_sampleformat);
+                return 0;
+            }
+            if (td->td_bitspersample != 16 && td->td_bitspersample != 24 &&
+                td->td_bitspersample != 32 && td->td_bitspersample != 64)
+            { /* Should 64 be allowed? */
+                TIFFErrorExtR(
+                    tif, module,
+                    "Floating point \"Predictor\" not supported with %" PRIu16
+                    "-bit samples",
+                    td->td_bitspersample);
+                return 0;
+            }
+            break;
+        default:
+            TIFFErrorExtR(tif, module, "\"Predictor\" value %d not supported",
+                          sp->predictor);
+            return 0;
+    }
+    sp->stride =
+        (td->td_planarconfig == PLANARCONFIG_CONTIG ? td->td_samplesperpixel
+                                                    : 1);
+    /*
+     * Calculate the scanline/tile-width size in bytes.
+     */
+    if (isTiled(tif))
+        sp->rowsize = TIFFTileRowSize(tif);
+    else
+        sp->rowsize = TIFFScanlineSize(tif);
+    if (sp->rowsize == 0)
+        return 0;
+
+    return 1;
 }
 
-static int
-PredictorSetupDecode(TIFF* tif)
+static int PredictorSetupDecode(TIFF *tif)
 {
-	TIFFPredictorState* sp = PredictorState(tif);
-	TIFFDirectory* td = &tif->tif_dir;
-
-	/* Note: when PredictorSetup() fails, the effets of setupdecode() */
-	/* will not be "canceled" so setupdecode() might be robust to */
-	/* be called several times. */
-	if (!(*sp->setupdecode)(tif) || !PredictorSetup(tif))
-		return 0;
-
-	if (sp->predictor == 2) {
-		switch (td->td_bitspersample) {
-			case 8:  sp->decodepfunc = horAcc8; break;
-			case 16: sp->decodepfunc = horAcc16; break;
-			case 32: sp->decodepfunc = horAcc32; break;
-		}
-		/*
-		 * Override default decoding method with one that does the
-		 * predictor stuff.
-		 */
-                if( tif->tif_decoderow != PredictorDecodeRow )
-                {
-                    sp->decoderow = tif->tif_decoderow;
-                    tif->tif_decoderow = PredictorDecodeRow;
-                    sp->decodestrip = tif->tif_decodestrip;
-                    tif->tif_decodestrip = PredictorDecodeTile;
-                    sp->decodetile = tif->tif_decodetile;
-                    tif->tif_decodetile = PredictorDecodeTile;
-                }
-
-		/*
-		 * If the data is horizontally differenced 16-bit data that
-		 * requires byte-swapping, then it must be byte swapped before
-		 * the accumulation step.  We do this with a special-purpose
-		 * routine and override the normal post decoding logic that
-		 * the library setup when the directory was read.
-		 */
-		if (tif->tif_flags & TIFF_SWAB) {
-			if (sp->decodepfunc == horAcc16) {
-				sp->decodepfunc = swabHorAcc16;
-				tif->tif_postdecode = _TIFFNoPostDecode;
-            } else if (sp->decodepfunc == horAcc32) {
-				sp->decodepfunc = swabHorAcc32;
-				tif->tif_postdecode = _TIFFNoPostDecode;
+    TIFFPredictorState *sp = PredictorState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
+
+    /* Note: when PredictorSetup() fails, the effets of setupdecode() */
+    /* will not be "canceled" so setupdecode() might be robust to */
+    /* be called several times. */
+    if (!(*sp->setupdecode)(tif) || !PredictorSetup(tif))
+        return 0;
+
+    if (sp->predictor == 2)
+    {
+        switch (td->td_bitspersample)
+        {
+            case 8:
+                sp->decodepfunc = horAcc8;
+                break;
+            case 16:
+                sp->decodepfunc = horAcc16;
+                break;
+            case 32:
+                sp->decodepfunc = horAcc32;
+                break;
+            case 64:
+                sp->decodepfunc = horAcc64;
+                break;
+        }
+        /*
+         * Override default decoding method with one that does the
+         * predictor stuff.
+         */
+        if (tif->tif_decoderow != PredictorDecodeRow)
+        {
+            sp->decoderow = tif->tif_decoderow;
+            tif->tif_decoderow = PredictorDecodeRow;
+            sp->decodestrip = tif->tif_decodestrip;
+            tif->tif_decodestrip = PredictorDecodeTile;
+            sp->decodetile = tif->tif_decodetile;
+            tif->tif_decodetile = PredictorDecodeTile;
+        }
+
+        /*
+         * If the data is horizontally differenced 16-bit data that
+         * requires byte-swapping, then it must be byte swapped before
+         * the accumulation step.  We do this with a special-purpose
+         * routine and override the normal post decoding logic that
+         * the library setup when the directory was read.
+         */
+        if (tif->tif_flags & TIFF_SWAB)
+        {
+            if (sp->decodepfunc == horAcc16)
+            {
+                sp->decodepfunc = swabHorAcc16;
+                tif->tif_postdecode = _TIFFNoPostDecode;
+            }
+            else if (sp->decodepfunc == horAcc32)
+            {
+                sp->decodepfunc = swabHorAcc32;
+                tif->tif_postdecode = _TIFFNoPostDecode;
             }
-		}
-	}
-
-	else if (sp->predictor == 3) {
-		sp->decodepfunc = fpAcc;
-		/*
-		 * Override default decoding method with one that does the
-		 * predictor stuff.
-		 */
-                if( tif->tif_decoderow != PredictorDecodeRow )
-                {
-                    sp->decoderow = tif->tif_decoderow;
-                    tif->tif_decoderow = PredictorDecodeRow;
-                    sp->decodestrip = tif->tif_decodestrip;
-                    tif->tif_decodestrip = PredictorDecodeTile;
-                    sp->decodetile = tif->tif_decodetile;
-                    tif->tif_decodetile = PredictorDecodeTile;
-                }
-		/*
-		 * The data should not be swapped outside of the floating
-		 * point predictor, the accumulation routine should return
-		 * byres in the native order.
-		 */
-		if (tif->tif_flags & TIFF_SWAB) {
-			tif->tif_postdecode = _TIFFNoPostDecode;
-		}
-		/*
-		 * Allocate buffer to keep the decoded bytes before
-		 * rearranging in the right order
-		 */
-	}
-
-	return 1;
+            else if (sp->decodepfunc == horAcc64)
+            {
+                sp->decodepfunc = swabHorAcc64;
+                tif->tif_postdecode = _TIFFNoPostDecode;
+            }
+        }
+    }
+
+    else if (sp->predictor == 3)
+    {
+        sp->decodepfunc = fpAcc;
+        /*
+         * Override default decoding method with one that does the
+         * predictor stuff.
+         */
+        if (tif->tif_decoderow != PredictorDecodeRow)
+        {
+            sp->decoderow = tif->tif_decoderow;
+            tif->tif_decoderow = PredictorDecodeRow;
+            sp->decodestrip = tif->tif_decodestrip;
+            tif->tif_decodestrip = PredictorDecodeTile;
+            sp->decodetile = tif->tif_decodetile;
+            tif->tif_decodetile = PredictorDecodeTile;
+        }
+        /*
+         * The data should not be swapped outside of the floating
+         * point predictor, the accumulation routine should return
+         * byres in the native order.
+         */
+        if (tif->tif_flags & TIFF_SWAB)
+        {
+            tif->tif_postdecode = _TIFFNoPostDecode;
+        }
+        /*
+         * Allocate buffer to keep the decoded bytes before
+         * rearranging in the right order
+         */
+    }
+
+    return 1;
 }
 
-static int
-PredictorSetupEncode(TIFF* tif)
+static int PredictorSetupEncode(TIFF *tif)
 {
-	TIFFPredictorState* sp = PredictorState(tif);
-	TIFFDirectory* td = &tif->tif_dir;
-
-	if (!(*sp->setupencode)(tif) || !PredictorSetup(tif))
-		return 0;
-
-	if (sp->predictor == 2) {
-		switch (td->td_bitspersample) {
-			case 8:  sp->encodepfunc = horDiff8; break;
-			case 16: sp->encodepfunc = horDiff16; break;
-			case 32: sp->encodepfunc = horDiff32; break;
-		}
-		/*
-		 * Override default encoding method with one that does the
-		 * predictor stuff.
-		 */
-                if( tif->tif_encoderow != PredictorEncodeRow )
-                {
-                    sp->encoderow = tif->tif_encoderow;
-                    tif->tif_encoderow = PredictorEncodeRow;
-                    sp->encodestrip = tif->tif_encodestrip;
-                    tif->tif_encodestrip = PredictorEncodeTile;
-                    sp->encodetile = tif->tif_encodetile;
-                    tif->tif_encodetile = PredictorEncodeTile;
-                }
-
-                /*
-                 * If the data is horizontally differenced 16-bit data that
-                 * requires byte-swapping, then it must be byte swapped after
-                 * the differentiation step.  We do this with a special-purpose
-                 * routine and override the normal post decoding logic that
-                 * the library setup when the directory was read.
-                 */
-                if (tif->tif_flags & TIFF_SWAB) {
-                    if (sp->encodepfunc == horDiff16) {
-                            sp->encodepfunc = swabHorDiff16;
-                            tif->tif_postdecode = _TIFFNoPostDecode;
-                    } else if (sp->encodepfunc == horDiff32) {
-                            sp->encodepfunc = swabHorDiff32;
-                            tif->tif_postdecode = _TIFFNoPostDecode;
-                    }
-                }
+    TIFFPredictorState *sp = PredictorState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
+
+    if (!(*sp->setupencode)(tif) || !PredictorSetup(tif))
+        return 0;
+
+    if (sp->predictor == 2)
+    {
+        switch (td->td_bitspersample)
+        {
+            case 8:
+                sp->encodepfunc = horDiff8;
+                break;
+            case 16:
+                sp->encodepfunc = horDiff16;
+                break;
+            case 32:
+                sp->encodepfunc = horDiff32;
+                break;
+            case 64:
+                sp->encodepfunc = horDiff64;
+                break;
+        }
+        /*
+         * Override default encoding method with one that does the
+         * predictor stuff.
+         */
+        if (tif->tif_encoderow != PredictorEncodeRow)
+        {
+            sp->encoderow = tif->tif_encoderow;
+            tif->tif_encoderow = PredictorEncodeRow;
+            sp->encodestrip = tif->tif_encodestrip;
+            tif->tif_encodestrip = PredictorEncodeTile;
+            sp->encodetile = tif->tif_encodetile;
+            tif->tif_encodetile = PredictorEncodeTile;
+        }
+
+        /*
+         * If the data is horizontally differenced 16-bit data that
+         * requires byte-swapping, then it must be byte swapped after
+         * the differentiation step.  We do this with a special-purpose
+         * routine and override the normal post decoding logic that
+         * the library setup when the directory was read.
+         */
+        if (tif->tif_flags & TIFF_SWAB)
+        {
+            if (sp->encodepfunc == horDiff16)
+            {
+                sp->encodepfunc = swabHorDiff16;
+                tif->tif_postdecode = _TIFFNoPostDecode;
+            }
+            else if (sp->encodepfunc == horDiff32)
+            {
+                sp->encodepfunc = swabHorDiff32;
+                tif->tif_postdecode = _TIFFNoPostDecode;
+            }
+            else if (sp->encodepfunc == horDiff64)
+            {
+                sp->encodepfunc = swabHorDiff64;
+                tif->tif_postdecode = _TIFFNoPostDecode;
+            }
         }
+    }
 
-	else if (sp->predictor == 3) {
-		sp->encodepfunc = fpDiff;
-		/*
-		 * Override default encoding method with one that does the
-		 * predictor stuff.
-		 */
-                if( tif->tif_encoderow != PredictorEncodeRow )
-                {
-                    sp->encoderow = tif->tif_encoderow;
-                    tif->tif_encoderow = PredictorEncodeRow;
-                    sp->encodestrip = tif->tif_encodestrip;
-                    tif->tif_encodestrip = PredictorEncodeTile;
-                    sp->encodetile = tif->tif_encodetile;
-                    tif->tif_encodetile = PredictorEncodeTile;
-                }
-	}
-
-	return 1;
+    else if (sp->predictor == 3)
+    {
+        sp->encodepfunc = fpDiff;
+        /*
+         * Override default encoding method with one that does the
+         * predictor stuff.
+         */
+        if (tif->tif_encoderow != PredictorEncodeRow)
+        {
+            sp->encoderow = tif->tif_encoderow;
+            tif->tif_encoderow = PredictorEncodeRow;
+            sp->encodestrip = tif->tif_encodestrip;
+            tif->tif_encodestrip = PredictorEncodeTile;
+            sp->encodetile = tif->tif_encodetile;
+            tif->tif_encodetile = PredictorEncodeTile;
+        }
+    }
+
+    return 1;
 }
 
-#define REPEAT4(n, op)		\
-    switch (n) {		\
-    default: { \
-        tmsize_t i; for (i = n-4; i > 0; i--) { op; } }  /*-fallthrough*/  \
-    case 4:  op; /*-fallthrough*/ \
-    case 3:  op; /*-fallthrough*/ \
-    case 2:  op; /*-fallthrough*/ \
-    case 1:  op; /*-fallthrough*/ \
-    case 0:  ;			\
+#define REPEAT4(n, op)                                                         \
+    switch (n)                                                                 \
+    {                                                                          \
+        default:                                                               \
+        {                                                                      \
+            tmsize_t i;                                                        \
+            for (i = n - 4; i > 0; i--)                                        \
+            {                                                                  \
+                op;                                                            \
+            }                                                                  \
+        } /*-fallthrough*/                                                     \
+        case 4:                                                                \
+            op; /*-fallthrough*/                                               \
+        case 3:                                                                \
+            op; /*-fallthrough*/                                               \
+        case 2:                                                                \
+            op; /*-fallthrough*/                                               \
+        case 1:                                                                \
+            op; /*-fallthrough*/                                               \
+        case 0:;                                                               \
     }
 
 /* Remarks related to C standard compliance in all below functions : */
@@ -276,196 +339,236 @@ PredictorSetupEncode(TIFF* tif)
 /*   as to make icc -check=conversions happy (not necessary by the standard) */
 
 TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
-static int
-horAcc8(TIFF* tif, uint8* cp0, tmsize_t cc)
+static int horAcc8(TIFF *tif, uint8_t *cp0, tmsize_t cc)
 {
-	tmsize_t stride = PredictorState(tif)->stride;
+    tmsize_t stride = PredictorState(tif)->stride;
 
-	unsigned char* cp = (unsigned char*) cp0;
-    if((cc%stride)!=0)
+    unsigned char *cp = (unsigned char *)cp0;
+    if ((cc % stride) != 0)
     {
-        TIFFErrorExt(tif->tif_clientdata, "horAcc8",
-                     "%s", "(cc%stride)!=0");
+        TIFFErrorExtR(tif, "horAcc8", "%s", "(cc%stride)!=0");
         return 0;
     }
 
-	if (cc > stride) {
-		/*
-		 * Pipeline the most common cases.
-		 */
-		if (stride == 3)  {
-			unsigned int cr = cp[0];
-			unsigned int cg = cp[1];
-			unsigned int cb = cp[2];
-			cc -= 3;
-			cp += 3;
-			while (cc>0) {
-				cp[0] = (unsigned char) ((cr += cp[0]) & 0xff);
-				cp[1] = (unsigned char) ((cg += cp[1]) & 0xff);
-				cp[2] = (unsigned char) ((cb += cp[2]) & 0xff);
-				cc -= 3;
-				cp += 3;
-			}
-		} else if (stride == 4)  {
-			unsigned int cr = cp[0];
-			unsigned int cg = cp[1];
-			unsigned int cb = cp[2];
-			unsigned int ca = cp[3];
-			cc -= 4;
-			cp += 4;
-			while (cc>0) {
-				cp[0] = (unsigned char) ((cr += cp[0]) & 0xff);
-				cp[1] = (unsigned char) ((cg += cp[1]) & 0xff);
-				cp[2] = (unsigned char) ((cb += cp[2]) & 0xff);
-				cp[3] = (unsigned char) ((ca += cp[3]) & 0xff);
-				cc -= 4;
-				cp += 4;
-			}
-		} else  {
-			cc -= stride;
-			do {
-				REPEAT4(stride, cp[stride] =
-					(unsigned char) ((cp[stride] + *cp) & 0xff); cp++)
-				cc -= stride;
-			} while (cc>0);
-		}
-	}
-	return 1;
+    if (cc > stride)
+    {
+        /*
+         * Pipeline the most common cases.
+         */
+        if (stride == 3)
+        {
+            unsigned int cr = cp[0];
+            unsigned int cg = cp[1];
+            unsigned int cb = cp[2];
+            tmsize_t i = stride;
+            for (; i < cc; i += stride)
+            {
+                cp[i + 0] = (unsigned char)((cr += cp[i + 0]) & 0xff);
+                cp[i + 1] = (unsigned char)((cg += cp[i + 1]) & 0xff);
+                cp[i + 2] = (unsigned char)((cb += cp[i + 2]) & 0xff);
+            }
+        }
+        else if (stride == 4)
+        {
+            unsigned int cr = cp[0];
+            unsigned int cg = cp[1];
+            unsigned int cb = cp[2];
+            unsigned int ca = cp[3];
+            tmsize_t i = stride;
+            for (; i < cc; i += stride)
+            {
+                cp[i + 0] = (unsigned char)((cr += cp[i + 0]) & 0xff);
+                cp[i + 1] = (unsigned char)((cg += cp[i + 1]) & 0xff);
+                cp[i + 2] = (unsigned char)((cb += cp[i + 2]) & 0xff);
+                cp[i + 3] = (unsigned char)((ca += cp[i + 3]) & 0xff);
+            }
+        }
+        else
+        {
+            cc -= stride;
+            do
+            {
+                REPEAT4(stride,
+                        cp[stride] = (unsigned char)((cp[stride] + *cp) & 0xff);
+                        cp++)
+                cc -= stride;
+            } while (cc > 0);
+        }
+    }
+    return 1;
+}
+
+static int swabHorAcc16(TIFF *tif, uint8_t *cp0, tmsize_t cc)
+{
+    uint16_t *wp = (uint16_t *)cp0;
+    tmsize_t wc = cc / 2;
+
+    TIFFSwabArrayOfShort(wp, wc);
+    return horAcc16(tif, cp0, cc);
+}
+
+TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
+static int horAcc16(TIFF *tif, uint8_t *cp0, tmsize_t cc)
+{
+    tmsize_t stride = PredictorState(tif)->stride;
+    uint16_t *wp = (uint16_t *)cp0;
+    tmsize_t wc = cc / 2;
+
+    if ((cc % (2 * stride)) != 0)
+    {
+        TIFFErrorExtR(tif, "horAcc16", "%s", "cc%(2*stride))!=0");
+        return 0;
+    }
+
+    if (wc > stride)
+    {
+        wc -= stride;
+        do
+        {
+            REPEAT4(stride, wp[stride] = (uint16_t)(((unsigned int)wp[stride] +
+                                                     (unsigned int)wp[0]) &
+                                                    0xffff);
+                    wp++)
+            wc -= stride;
+        } while (wc > 0);
+    }
+    return 1;
 }
 
-static int
-swabHorAcc16(TIFF* tif, uint8* cp0, tmsize_t cc)
+static int swabHorAcc32(TIFF *tif, uint8_t *cp0, tmsize_t cc)
 {
-	uint16* wp = (uint16*) cp0;
-	tmsize_t wc = cc / 2;
+    uint32_t *wp = (uint32_t *)cp0;
+    tmsize_t wc = cc / 4;
 
-        TIFFSwabArrayOfShort(wp, wc);
-        return horAcc16(tif, cp0, cc);
+    TIFFSwabArrayOfLong(wp, wc);
+    return horAcc32(tif, cp0, cc);
 }
 
 TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
-static int
-horAcc16(TIFF* tif, uint8* cp0, tmsize_t cc)
+static int horAcc32(TIFF *tif, uint8_t *cp0, tmsize_t cc)
 {
-	tmsize_t stride = PredictorState(tif)->stride;
-	uint16* wp = (uint16*) cp0;
-	tmsize_t wc = cc / 2;
+    tmsize_t stride = PredictorState(tif)->stride;
+    uint32_t *wp = (uint32_t *)cp0;
+    tmsize_t wc = cc / 4;
 
-    if((cc%(2*stride))!=0)
+    if ((cc % (4 * stride)) != 0)
     {
-        TIFFErrorExt(tif->tif_clientdata, "horAcc16",
-                     "%s", "cc%(2*stride))!=0");
+        TIFFErrorExtR(tif, "horAcc32", "%s", "cc%(4*stride))!=0");
         return 0;
     }
 
-	if (wc > stride) {
-		wc -= stride;
-		do {
-			REPEAT4(stride, wp[stride] = (uint16)(((unsigned int)wp[stride] + (unsigned int)wp[0]) & 0xffff); wp++)
-			wc -= stride;
-		} while (wc > 0);
-	}
-	return 1;
+    if (wc > stride)
+    {
+        wc -= stride;
+        do
+        {
+            REPEAT4(stride, wp[stride] += wp[0]; wp++)
+            wc -= stride;
+        } while (wc > 0);
+    }
+    return 1;
 }
 
-static int
-swabHorAcc32(TIFF* tif, uint8* cp0, tmsize_t cc)
+static int swabHorAcc64(TIFF *tif, uint8_t *cp0, tmsize_t cc)
 {
-	uint32* wp = (uint32*) cp0;
-	tmsize_t wc = cc / 4;
+    uint64_t *wp = (uint64_t *)cp0;
+    tmsize_t wc = cc / 8;
 
-        TIFFSwabArrayOfLong(wp, wc);
-	return horAcc32(tif, cp0, cc);
+    TIFFSwabArrayOfLong8(wp, wc);
+    return horAcc64(tif, cp0, cc);
 }
 
 TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
-static int
-horAcc32(TIFF* tif, uint8* cp0, tmsize_t cc)
+static int horAcc64(TIFF *tif, uint8_t *cp0, tmsize_t cc)
 {
-	tmsize_t stride = PredictorState(tif)->stride;
-	uint32* wp = (uint32*) cp0;
-	tmsize_t wc = cc / 4;
+    tmsize_t stride = PredictorState(tif)->stride;
+    uint64_t *wp = (uint64_t *)cp0;
+    tmsize_t wc = cc / 8;
 
-    if((cc%(4*stride))!=0)
+    if ((cc % (8 * stride)) != 0)
     {
-        TIFFErrorExt(tif->tif_clientdata, "horAcc32",
-                     "%s", "cc%(4*stride))!=0");
+        TIFFErrorExtR(tif, "horAcc64", "%s", "cc%(8*stride))!=0");
         return 0;
     }
 
-	if (wc > stride) {
-		wc -= stride;
-		do {
-			REPEAT4(stride, wp[stride] += wp[0]; wp++)
-			wc -= stride;
-		} while (wc > 0);
-	}
-	return 1;
+    if (wc > stride)
+    {
+        wc -= stride;
+        do
+        {
+            REPEAT4(stride, wp[stride] += wp[0]; wp++)
+            wc -= stride;
+        } while (wc > 0);
+    }
+    return 1;
 }
 
 /*
  * Floating point predictor accumulation routine.
  */
-static int
-fpAcc(TIFF* tif, uint8* cp0, tmsize_t cc)
+static int fpAcc(TIFF *tif, uint8_t *cp0, tmsize_t cc)
 {
-	tmsize_t stride = PredictorState(tif)->stride;
-	uint32 bps = tif->tif_dir.td_bitspersample / 8;
-	tmsize_t wc = cc / bps;
-	tmsize_t count = cc;
-	uint8 *cp = (uint8 *) cp0;
-	uint8 *tmp;
-
-    if(cc%(bps*stride)!=0)
+    tmsize_t stride = PredictorState(tif)->stride;
+    uint32_t bps = tif->tif_dir.td_bitspersample / 8;
+    tmsize_t wc = cc / bps;
+    tmsize_t count = cc;
+    uint8_t *cp = (uint8_t *)cp0;
+    uint8_t *tmp;
+
+    if (cc % (bps * stride) != 0)
     {
-        TIFFErrorExt(tif->tif_clientdata, "fpAcc",
-                     "%s", "cc%(bps*stride))!=0");
+        TIFFErrorExtR(tif, "fpAcc", "%s", "cc%(bps*stride))!=0");
         return 0;
     }
 
-    tmp = (uint8 *)_TIFFmalloc(cc);
-	if (!tmp)
-		return 0;
-
-	while (count > stride) {
-		REPEAT4(stride, cp[stride] =
-                        (unsigned char) ((cp[stride] + cp[0]) & 0xff); cp++)
-		count -= stride;
-	}
-
-	_TIFFmemcpy(tmp, cp0, cc);
-	cp = (uint8 *) cp0;
-	for (count = 0; count < wc; count++) {
-		uint32 byte;
-		for (byte = 0; byte < bps; byte++) {
-			#if WORDS_BIGENDIAN
-			cp[bps * count + byte] = tmp[byte * wc + count];
-			#else
-			cp[bps * count + byte] =
-				tmp[(bps - byte - 1) * wc + count];
-			#endif
-		}
-	}
-	_TIFFfree(tmp);
+    tmp = (uint8_t *)_TIFFmallocExt(tif, cc);
+    if (!tmp)
+        return 0;
+
+    while (count > stride)
+    {
+        REPEAT4(stride,
+                cp[stride] = (unsigned char)((cp[stride] + cp[0]) & 0xff);
+                cp++)
+        count -= stride;
+    }
+
+    _TIFFmemcpy(tmp, cp0, cc);
+    cp = (uint8_t *)cp0;
+    for (count = 0; count < wc; count++)
+    {
+        uint32_t byte;
+        for (byte = 0; byte < bps; byte++)
+        {
+#if WORDS_BIGENDIAN
+            cp[bps * count + byte] = tmp[byte * wc + count];
+#else
+            cp[bps * count + byte] = tmp[(bps - byte - 1) * wc + count];
+#endif
+        }
+    }
+    _TIFFfreeExt(tif, tmp);
     return 1;
 }
 
 /*
  * Decode a scanline and apply the predictor routine.
  */
-static int
-PredictorDecodeRow(TIFF* tif, uint8* op0, tmsize_t occ0, uint16 s)
+static int PredictorDecodeRow(TIFF *tif, uint8_t *op0, tmsize_t occ0,
+                              uint16_t s)
 {
-	TIFFPredictorState *sp = PredictorState(tif);
+    TIFFPredictorState *sp = PredictorState(tif);
 
-	assert(sp != NULL);
-	assert(sp->decoderow != NULL);
-	assert(sp->decodepfunc != NULL);  
+    assert(sp != NULL);
+    assert(sp->decoderow != NULL);
+    assert(sp->decodepfunc != NULL);
 
-	if ((*sp->decoderow)(tif, op0, occ0, s)) {
-		return (*sp->decodepfunc)(tif, op0, occ0);
-	} else
-		return 0;
+    if ((*sp->decoderow)(tif, op0, occ0, s))
+    {
+        return (*sp->decodepfunc)(tif, op0, occ0);
+    }
+    else
+        return 0;
 }
 
 /*
@@ -475,123 +578,152 @@ PredictorDecodeRow(TIFF* tif, uint8* op0, tmsize_t occ0, uint16 s)
  * been calculated at pre-decode time according to the
  * strip/tile dimensions.
  */
-static int
-PredictorDecodeTile(TIFF* tif, uint8* op0, tmsize_t occ0, uint16 s)
+static int PredictorDecodeTile(TIFF *tif, uint8_t *op0, tmsize_t occ0,
+                               uint16_t s)
 {
-	TIFFPredictorState *sp = PredictorState(tif);
+    TIFFPredictorState *sp = PredictorState(tif);
 
-	assert(sp != NULL);
-	assert(sp->decodetile != NULL);
+    assert(sp != NULL);
+    assert(sp->decodetile != NULL);
 
-	if ((*sp->decodetile)(tif, op0, occ0, s)) {
-		tmsize_t rowsize = sp->rowsize;
-		assert(rowsize > 0);
-		if((occ0%rowsize) !=0)
+    if ((*sp->decodetile)(tif, op0, occ0, s))
+    {
+        tmsize_t rowsize = sp->rowsize;
+        assert(rowsize > 0);
+        if ((occ0 % rowsize) != 0)
         {
-            TIFFErrorExt(tif->tif_clientdata, "PredictorDecodeTile",
-                         "%s", "occ0%rowsize != 0");
+            TIFFErrorExtR(tif, "PredictorDecodeTile", "%s",
+                          "occ0%rowsize != 0");
             return 0;
         }
-		assert(sp->decodepfunc != NULL);
-		while (occ0 > 0) {
-			if( !(*sp->decodepfunc)(tif, op0, rowsize) )
+        assert(sp->decodepfunc != NULL);
+        while (occ0 > 0)
+        {
+            if (!(*sp->decodepfunc)(tif, op0, rowsize))
                 return 0;
-			occ0 -= rowsize;
-			op0 += rowsize;
-		}
-		return 1;
-	} else
-		return 0;
+            occ0 -= rowsize;
+            op0 += rowsize;
+        }
+        return 1;
+    }
+    else
+        return 0;
 }
 
 TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
-static int
-horDiff8(TIFF* tif, uint8* cp0, tmsize_t cc)
+static int horDiff8(TIFF *tif, uint8_t *cp0, tmsize_t cc)
 {
-	TIFFPredictorState* sp = PredictorState(tif);
-	tmsize_t stride = sp->stride;
-	unsigned char* cp = (unsigned char*) cp0;
+    TIFFPredictorState *sp = PredictorState(tif);
+    tmsize_t stride = sp->stride;
+    unsigned char *cp = (unsigned char *)cp0;
 
-    if((cc%stride)!=0)
+    if ((cc % stride) != 0)
     {
-        TIFFErrorExt(tif->tif_clientdata, "horDiff8",
-                     "%s", "(cc%stride)!=0");
+        TIFFErrorExtR(tif, "horDiff8", "%s", "(cc%stride)!=0");
         return 0;
     }
 
-	if (cc > stride) {
-		cc -= stride;
-		/*
-		 * Pipeline the most common cases.
-		 */
-		if (stride == 3) {
-			unsigned int r1, g1, b1;
-			unsigned int r2 = cp[0];
-			unsigned int g2 = cp[1];
-			unsigned  int b2 = cp[2];
-			do {
-				r1 = cp[3]; cp[3] = (unsigned char)((r1-r2)&0xff); r2 = r1;
-				g1 = cp[4]; cp[4] = (unsigned char)((g1-g2)&0xff); g2 = g1;
-				b1 = cp[5]; cp[5] = (unsigned char)((b1-b2)&0xff); b2 = b1;
-				cp += 3;
-			} while ((cc -= 3) > 0);
-		} else if (stride == 4) {
-			unsigned int r1, g1, b1, a1;
-			unsigned int r2 = cp[0];
-			unsigned int g2 = cp[1];
-			unsigned int b2 = cp[2];
-			unsigned int a2 = cp[3];
-			do {
-				r1 = cp[4]; cp[4] = (unsigned char)((r1-r2)&0xff); r2 = r1;
-				g1 = cp[5]; cp[5] = (unsigned char)((g1-g2)&0xff); g2 = g1;
-				b1 = cp[6]; cp[6] = (unsigned char)((b1-b2)&0xff); b2 = b1;
-				a1 = cp[7]; cp[7] = (unsigned char)((a1-a2)&0xff); a2 = a1;
-				cp += 4;
-			} while ((cc -= 4) > 0);
-		} else {
-			cp += cc - 1;
-			do {
-				REPEAT4(stride, cp[stride] = (unsigned char)((cp[stride] - cp[0])&0xff); cp--)
-			} while ((cc -= stride) > 0);
-		}
-	}
-	return 1;
+    if (cc > stride)
+    {
+        cc -= stride;
+        /*
+         * Pipeline the most common cases.
+         */
+        if (stride == 3)
+        {
+            unsigned int r1, g1, b1;
+            unsigned int r2 = cp[0];
+            unsigned int g2 = cp[1];
+            unsigned int b2 = cp[2];
+            do
+            {
+                r1 = cp[3];
+                cp[3] = (unsigned char)((r1 - r2) & 0xff);
+                r2 = r1;
+                g1 = cp[4];
+                cp[4] = (unsigned char)((g1 - g2) & 0xff);
+                g2 = g1;
+                b1 = cp[5];
+                cp[5] = (unsigned char)((b1 - b2) & 0xff);
+                b2 = b1;
+                cp += 3;
+            } while ((cc -= 3) > 0);
+        }
+        else if (stride == 4)
+        {
+            unsigned int r1, g1, b1, a1;
+            unsigned int r2 = cp[0];
+            unsigned int g2 = cp[1];
+            unsigned int b2 = cp[2];
+            unsigned int a2 = cp[3];
+            do
+            {
+                r1 = cp[4];
+                cp[4] = (unsigned char)((r1 - r2) & 0xff);
+                r2 = r1;
+                g1 = cp[5];
+                cp[5] = (unsigned char)((g1 - g2) & 0xff);
+                g2 = g1;
+                b1 = cp[6];
+                cp[6] = (unsigned char)((b1 - b2) & 0xff);
+                b2 = b1;
+                a1 = cp[7];
+                cp[7] = (unsigned char)((a1 - a2) & 0xff);
+                a2 = a1;
+                cp += 4;
+            } while ((cc -= 4) > 0);
+        }
+        else
+        {
+            cp += cc - 1;
+            do
+            {
+                REPEAT4(stride,
+                        cp[stride] =
+                            (unsigned char)((cp[stride] - cp[0]) & 0xff);
+                        cp--)
+            } while ((cc -= stride) > 0);
+        }
+    }
+    return 1;
 }
 
 TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
-static int
-horDiff16(TIFF* tif, uint8* cp0, tmsize_t cc)
+static int horDiff16(TIFF *tif, uint8_t *cp0, tmsize_t cc)
 {
-	TIFFPredictorState* sp = PredictorState(tif);
-	tmsize_t stride = sp->stride;
-	uint16 *wp = (uint16*) cp0;
-	tmsize_t wc = cc/2;
+    TIFFPredictorState *sp = PredictorState(tif);
+    tmsize_t stride = sp->stride;
+    uint16_t *wp = (uint16_t *)cp0;
+    tmsize_t wc = cc / 2;
 
-    if((cc%(2*stride))!=0)
+    if ((cc % (2 * stride)) != 0)
     {
-        TIFFErrorExt(tif->tif_clientdata, "horDiff8",
-                     "%s", "(cc%(2*stride))!=0");
+        TIFFErrorExtR(tif, "horDiff8", "%s", "(cc%(2*stride))!=0");
         return 0;
     }
 
-	if (wc > stride) {
-		wc -= stride;
-		wp += wc - 1;
-		do {
-			REPEAT4(stride, wp[stride] = (uint16)(((unsigned int)wp[stride] - (unsigned int)wp[0]) & 0xffff); wp--)
-			wc -= stride;
-		} while (wc > 0);
-	}
-	return 1;
+    if (wc > stride)
+    {
+        wc -= stride;
+        wp += wc - 1;
+        do
+        {
+            REPEAT4(stride, wp[stride] = (uint16_t)(((unsigned int)wp[stride] -
+                                                     (unsigned int)wp[0]) &
+                                                    0xffff);
+                    wp--)
+            wc -= stride;
+        } while (wc > 0);
+    }
+    return 1;
 }
 
-static int
-swabHorDiff16(TIFF* tif, uint8* cp0, tmsize_t cc)
+static int swabHorDiff16(TIFF *tif, uint8_t *cp0, tmsize_t cc)
 {
-    uint16* wp = (uint16*) cp0;
+    uint16_t *wp = (uint16_t *)cp0;
     tmsize_t wc = cc / 2;
 
-    if( !horDiff16(tif, cp0, cc) )
+    if (!horDiff16(tif, cp0, cc))
         return 0;
 
     TIFFSwabArrayOfShort(wp, wc);
@@ -599,281 +731,316 @@ swabHorDiff16(TIFF* tif, uint8* cp0, tmsize_t cc)
 }
 
 TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
-static int
-horDiff32(TIFF* tif, uint8* cp0, tmsize_t cc)
+static int horDiff32(TIFF *tif, uint8_t *cp0, tmsize_t cc)
 {
-	TIFFPredictorState* sp = PredictorState(tif);
-	tmsize_t stride = sp->stride;
-	uint32 *wp = (uint32*) cp0;
-	tmsize_t wc = cc/4;
+    TIFFPredictorState *sp = PredictorState(tif);
+    tmsize_t stride = sp->stride;
+    uint32_t *wp = (uint32_t *)cp0;
+    tmsize_t wc = cc / 4;
 
-    if((cc%(4*stride))!=0)
+    if ((cc % (4 * stride)) != 0)
     {
-        TIFFErrorExt(tif->tif_clientdata, "horDiff32",
-                     "%s", "(cc%(4*stride))!=0");
+        TIFFErrorExtR(tif, "horDiff32", "%s", "(cc%(4*stride))!=0");
         return 0;
     }
 
-	if (wc > stride) {
-		wc -= stride;
-		wp += wc - 1;
-		do {
-			REPEAT4(stride, wp[stride] -= wp[0]; wp--)
-			wc -= stride;
-		} while (wc > 0);
-	}
-	return 1;
+    if (wc > stride)
+    {
+        wc -= stride;
+        wp += wc - 1;
+        do
+        {
+            REPEAT4(stride, wp[stride] -= wp[0]; wp--)
+            wc -= stride;
+        } while (wc > 0);
+    }
+    return 1;
 }
 
-static int
-swabHorDiff32(TIFF* tif, uint8* cp0, tmsize_t cc)
+static int swabHorDiff32(TIFF *tif, uint8_t *cp0, tmsize_t cc)
 {
-    uint32* wp = (uint32*) cp0;
+    uint32_t *wp = (uint32_t *)cp0;
     tmsize_t wc = cc / 4;
 
-    if( !horDiff32(tif, cp0, cc) )
+    if (!horDiff32(tif, cp0, cc))
         return 0;
 
     TIFFSwabArrayOfLong(wp, wc);
     return 1;
 }
 
+TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
+static int horDiff64(TIFF *tif, uint8_t *cp0, tmsize_t cc)
+{
+    TIFFPredictorState *sp = PredictorState(tif);
+    tmsize_t stride = sp->stride;
+    uint64_t *wp = (uint64_t *)cp0;
+    tmsize_t wc = cc / 8;
+
+    if ((cc % (8 * stride)) != 0)
+    {
+        TIFFErrorExtR(tif, "horDiff64", "%s", "(cc%(8*stride))!=0");
+        return 0;
+    }
+
+    if (wc > stride)
+    {
+        wc -= stride;
+        wp += wc - 1;
+        do
+        {
+            REPEAT4(stride, wp[stride] -= wp[0]; wp--)
+            wc -= stride;
+        } while (wc > 0);
+    }
+    return 1;
+}
+
+static int swabHorDiff64(TIFF *tif, uint8_t *cp0, tmsize_t cc)
+{
+    uint64_t *wp = (uint64_t *)cp0;
+    tmsize_t wc = cc / 8;
+
+    if (!horDiff64(tif, cp0, cc))
+        return 0;
+
+    TIFFSwabArrayOfLong8(wp, wc);
+    return 1;
+}
+
 /*
  * Floating point predictor differencing routine.
  */
 TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
-static int
-fpDiff(TIFF* tif, uint8* cp0, tmsize_t cc)
+static int fpDiff(TIFF *tif, uint8_t *cp0, tmsize_t cc)
 {
-	tmsize_t stride = PredictorState(tif)->stride;
-	uint32 bps = tif->tif_dir.td_bitspersample / 8;
-	tmsize_t wc = cc / bps;
-	tmsize_t count;
-	uint8 *cp = (uint8 *) cp0;
-	uint8 *tmp;
-
-    if((cc%(bps*stride))!=0)
+    tmsize_t stride = PredictorState(tif)->stride;
+    uint32_t bps = tif->tif_dir.td_bitspersample / 8;
+    tmsize_t wc = cc / bps;
+    tmsize_t count;
+    uint8_t *cp = (uint8_t *)cp0;
+    uint8_t *tmp;
+
+    if ((cc % (bps * stride)) != 0)
     {
-        TIFFErrorExt(tif->tif_clientdata, "fpDiff",
-                     "%s", "(cc%(bps*stride))!=0");
+        TIFFErrorExtR(tif, "fpDiff", "%s", "(cc%(bps*stride))!=0");
         return 0;
     }
 
-    tmp = (uint8 *)_TIFFmalloc(cc);
-	if (!tmp)
-		return 0;
-
-	_TIFFmemcpy(tmp, cp0, cc);
-	for (count = 0; count < wc; count++) {
-		uint32 byte;
-		for (byte = 0; byte < bps; byte++) {
-			#if WORDS_BIGENDIAN
-			cp[byte * wc + count] = tmp[bps * count + byte];
-			#else
-			cp[(bps - byte - 1) * wc + count] =
-				tmp[bps * count + byte];
-			#endif
-		}
-	}
-	_TIFFfree(tmp);
-
-	cp = (uint8 *) cp0;
-	cp += cc - stride - 1;
-	for (count = cc; count > stride; count -= stride)
-		REPEAT4(stride, cp[stride] = (unsigned char)((cp[stride] - cp[0])&0xff); cp--)
+    tmp = (uint8_t *)_TIFFmallocExt(tif, cc);
+    if (!tmp)
+        return 0;
+
+    _TIFFmemcpy(tmp, cp0, cc);
+    for (count = 0; count < wc; count++)
+    {
+        uint32_t byte;
+        for (byte = 0; byte < bps; byte++)
+        {
+#if WORDS_BIGENDIAN
+            cp[byte * wc + count] = tmp[bps * count + byte];
+#else
+            cp[(bps - byte - 1) * wc + count] = tmp[bps * count + byte];
+#endif
+        }
+    }
+    _TIFFfreeExt(tif, tmp);
+
+    cp = (uint8_t *)cp0;
+    cp += cc - stride - 1;
+    for (count = cc; count > stride; count -= stride)
+        REPEAT4(stride,
+                cp[stride] = (unsigned char)((cp[stride] - cp[0]) & 0xff);
+                cp--)
     return 1;
 }
 
-static int
-PredictorEncodeRow(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int PredictorEncodeRow(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	TIFFPredictorState *sp = PredictorState(tif);
+    TIFFPredictorState *sp = PredictorState(tif);
 
-	assert(sp != NULL);
-	assert(sp->encodepfunc != NULL);
-	assert(sp->encoderow != NULL);
+    assert(sp != NULL);
+    assert(sp->encodepfunc != NULL);
+    assert(sp->encoderow != NULL);
 
-	/* XXX horizontal differencing alters user's data XXX */
-	if( !(*sp->encodepfunc)(tif, bp, cc) )
+    /* XXX horizontal differencing alters user's data XXX */
+    if (!(*sp->encodepfunc)(tif, bp, cc))
         return 0;
-	return (*sp->encoderow)(tif, bp, cc, s);
+    return (*sp->encoderow)(tif, bp, cc, s);
 }
 
-static int
-PredictorEncodeTile(TIFF* tif, uint8* bp0, tmsize_t cc0, uint16 s)
+static int PredictorEncodeTile(TIFF *tif, uint8_t *bp0, tmsize_t cc0,
+                               uint16_t s)
 {
-	static const char module[] = "PredictorEncodeTile";
-	TIFFPredictorState *sp = PredictorState(tif);
-        uint8 *working_copy;
-	tmsize_t cc = cc0, rowsize;
-	unsigned char* bp;
-        int result_code;
-
-	assert(sp != NULL);
-	assert(sp->encodepfunc != NULL);
-	assert(sp->encodetile != NULL);
-
-        /* 
-         * Do predictor manipulation in a working buffer to avoid altering
-         * the callers buffer. http://trac.osgeo.org/gdal/ticket/1965
-         */
-        working_copy = (uint8*) _TIFFmalloc(cc0);
-        if( working_copy == NULL )
-        {
-            TIFFErrorExt(tif->tif_clientdata, module, 
-                         "Out of memory allocating " TIFF_SSIZE_FORMAT " byte temp buffer.",
-                         cc0 );
-            return 0;
-        }
-        memcpy( working_copy, bp0, cc0 );
-        bp = working_copy;
+    static const char module[] = "PredictorEncodeTile";
+    TIFFPredictorState *sp = PredictorState(tif);
+    uint8_t *working_copy;
+    tmsize_t cc = cc0, rowsize;
+    unsigned char *bp;
+    int result_code;
+
+    assert(sp != NULL);
+    assert(sp->encodepfunc != NULL);
+    assert(sp->encodetile != NULL);
+
+    /*
+     * Do predictor manipulation in a working buffer to avoid altering
+     * the callers buffer. http://trac.osgeo.org/gdal/ticket/1965
+     */
+    working_copy = (uint8_t *)_TIFFmallocExt(tif, cc0);
+    if (working_copy == NULL)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Out of memory allocating %" PRId64 " byte temp buffer.",
+                      (int64_t)cc0);
+        return 0;
+    }
+    memcpy(working_copy, bp0, cc0);
+    bp = working_copy;
 
-	rowsize = sp->rowsize;
-	assert(rowsize > 0);
-	if((cc0%rowsize)!=0)
+    rowsize = sp->rowsize;
+    assert(rowsize > 0);
+    if ((cc0 % rowsize) != 0)
     {
-        TIFFErrorExt(tif->tif_clientdata, "PredictorEncodeTile",
-                     "%s", "(cc0%rowsize)!=0");
-        _TIFFfree( working_copy );
+        TIFFErrorExtR(tif, "PredictorEncodeTile", "%s", "(cc0%rowsize)!=0");
+        _TIFFfreeExt(tif, working_copy);
         return 0;
     }
-	while (cc > 0) {
-		(*sp->encodepfunc)(tif, bp, rowsize);
-		cc -= rowsize;
-		bp += rowsize;
-	}
-	result_code = (*sp->encodetile)(tif, working_copy, cc0, s);
+    while (cc > 0)
+    {
+        (*sp->encodepfunc)(tif, bp, rowsize);
+        cc -= rowsize;
+        bp += rowsize;
+    }
+    result_code = (*sp->encodetile)(tif, working_copy, cc0, s);
 
-        _TIFFfree( working_copy );
+    _TIFFfreeExt(tif, working_copy);
 
-        return result_code;
+    return result_code;
 }
 
-#define	FIELD_PREDICTOR	(FIELD_CODEC+0)		/* XXX */
+#define FIELD_PREDICTOR (FIELD_CODEC + 0) /* XXX */
 
 static const TIFFField predictFields[] = {
-    { TIFFTAG_PREDICTOR, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16, TIFF_SETGET_UINT16, FIELD_PREDICTOR, FALSE, FALSE, "Predictor", NULL },
+    {TIFFTAG_PREDICTOR, 1, 1, TIFF_SHORT, 0, TIFF_SETGET_UINT16,
+     TIFF_SETGET_UINT16, FIELD_PREDICTOR, FALSE, FALSE, "Predictor", NULL},
 };
 
-static int
-PredictorVSetField(TIFF* tif, uint32 tag, va_list ap)
+static int PredictorVSetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	TIFFPredictorState *sp = PredictorState(tif);
-
-	assert(sp != NULL);
-	assert(sp->vsetparent != NULL);
-
-	switch (tag) {
-	case TIFFTAG_PREDICTOR:
-		sp->predictor = (uint16) va_arg(ap, uint16_vap);
-		TIFFSetFieldBit(tif, FIELD_PREDICTOR);
-		break;
-	default:
-		return (*sp->vsetparent)(tif, tag, ap);
-	}
-	tif->tif_flags |= TIFF_DIRTYDIRECT;
-	return 1;
+    TIFFPredictorState *sp = PredictorState(tif);
+
+    assert(sp != NULL);
+    assert(sp->vsetparent != NULL);
+
+    switch (tag)
+    {
+        case TIFFTAG_PREDICTOR:
+            sp->predictor = (uint16_t)va_arg(ap, uint16_vap);
+            TIFFSetFieldBit(tif, FIELD_PREDICTOR);
+            break;
+        default:
+            return (*sp->vsetparent)(tif, tag, ap);
+    }
+    tif->tif_flags |= TIFF_DIRTYDIRECT;
+    return 1;
 }
 
-static int
-PredictorVGetField(TIFF* tif, uint32 tag, va_list ap)
+static int PredictorVGetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	TIFFPredictorState *sp = PredictorState(tif);
-
-	assert(sp != NULL);
-	assert(sp->vgetparent != NULL);
-
-	switch (tag) {
-	case TIFFTAG_PREDICTOR:
-		*va_arg(ap, uint16*) = (uint16)sp->predictor;
-		break;
-	default:
-		return (*sp->vgetparent)(tif, tag, ap);
-	}
-	return 1;
+    TIFFPredictorState *sp = PredictorState(tif);
+
+    assert(sp != NULL);
+    assert(sp->vgetparent != NULL);
+
+    switch (tag)
+    {
+        case TIFFTAG_PREDICTOR:
+            *va_arg(ap, uint16_t *) = (uint16_t)sp->predictor;
+            break;
+        default:
+            return (*sp->vgetparent)(tif, tag, ap);
+    }
+    return 1;
 }
 
-static void
-PredictorPrintDir(TIFF* tif, FILE* fd, long flags)
+static void PredictorPrintDir(TIFF *tif, FILE *fd, long flags)
 {
-	TIFFPredictorState* sp = PredictorState(tif);
-
-	(void) flags;
-	if (TIFFFieldSet(tif,FIELD_PREDICTOR)) {
-		fprintf(fd, "  Predictor: ");
-		switch (sp->predictor) {
-			case 1: fprintf(fd, "none "); break;
-			case 2: fprintf(fd, "horizontal differencing "); break;
-			case 3: fprintf(fd, "floating point predictor "); break;
-		}
-		fprintf(fd, "%d (0x%x)\n", sp->predictor, sp->predictor);
-	}
-	if (sp->printdir)
-		(*sp->printdir)(tif, fd, flags);
+    TIFFPredictorState *sp = PredictorState(tif);
+
+    (void)flags;
+    if (TIFFFieldSet(tif, FIELD_PREDICTOR))
+    {
+        fprintf(fd, "  Predictor: ");
+        switch (sp->predictor)
+        {
+            case 1:
+                fprintf(fd, "none ");
+                break;
+            case 2:
+                fprintf(fd, "horizontal differencing ");
+                break;
+            case 3:
+                fprintf(fd, "floating point predictor ");
+                break;
+        }
+        fprintf(fd, "%d (0x%x)\n", sp->predictor, sp->predictor);
+    }
+    if (sp->printdir)
+        (*sp->printdir)(tif, fd, flags);
 }
 
-int
-TIFFPredictorInit(TIFF* tif)
+int TIFFPredictorInit(TIFF *tif)
 {
-	TIFFPredictorState* sp = PredictorState(tif);
-
-	assert(sp != 0);
-
-	/*
-	 * Merge codec-specific tag information.
-	 */
-	if (!_TIFFMergeFields(tif, predictFields,
-			      TIFFArrayCount(predictFields))) {
-		TIFFErrorExt(tif->tif_clientdata, "TIFFPredictorInit",
-		    "Merging Predictor codec-specific tags failed");
-		return 0;
-	}
-
-	/*
-	 * Override parent get/set field methods.
-	 */
-	sp->vgetparent = tif->tif_tagmethods.vgetfield;
-	tif->tif_tagmethods.vgetfield =
-            PredictorVGetField;/* hook for predictor tag */
-	sp->vsetparent = tif->tif_tagmethods.vsetfield;
-	tif->tif_tagmethods.vsetfield =
-	    PredictorVSetField;/* hook for predictor tag */
-	sp->printdir = tif->tif_tagmethods.printdir;
-	tif->tif_tagmethods.printdir =
-            PredictorPrintDir;	/* hook for predictor tag */
-
-	sp->setupdecode = tif->tif_setupdecode;
-	tif->tif_setupdecode = PredictorSetupDecode;
-	sp->setupencode = tif->tif_setupencode;
-	tif->tif_setupencode = PredictorSetupEncode;
-
-	sp->predictor = 1;			/* default value */
-	sp->encodepfunc = NULL;			/* no predictor routine */
-	sp->decodepfunc = NULL;			/* no predictor routine */
-	return 1;
+    TIFFPredictorState *sp = PredictorState(tif);
+
+    assert(sp != 0);
+
+    /*
+     * Merge codec-specific tag information.
+     */
+    if (!_TIFFMergeFields(tif, predictFields, TIFFArrayCount(predictFields)))
+    {
+        TIFFErrorExtR(tif, "TIFFPredictorInit",
+                      "Merging Predictor codec-specific tags failed");
+        return 0;
+    }
+
+    /*
+     * Override parent get/set field methods.
+     */
+    sp->vgetparent = tif->tif_tagmethods.vgetfield;
+    tif->tif_tagmethods.vgetfield =
+        PredictorVGetField; /* hook for predictor tag */
+    sp->vsetparent = tif->tif_tagmethods.vsetfield;
+    tif->tif_tagmethods.vsetfield =
+        PredictorVSetField; /* hook for predictor tag */
+    sp->printdir = tif->tif_tagmethods.printdir;
+    tif->tif_tagmethods.printdir =
+        PredictorPrintDir; /* hook for predictor tag */
+
+    sp->setupdecode = tif->tif_setupdecode;
+    tif->tif_setupdecode = PredictorSetupDecode;
+    sp->setupencode = tif->tif_setupencode;
+    tif->tif_setupencode = PredictorSetupEncode;
+
+    sp->predictor = 1;      /* default value */
+    sp->encodepfunc = NULL; /* no predictor routine */
+    sp->decodepfunc = NULL; /* no predictor routine */
+    return 1;
 }
 
-int
-TIFFPredictorCleanup(TIFF* tif)
+int TIFFPredictorCleanup(TIFF *tif)
 {
-	TIFFPredictorState* sp = PredictorState(tif);
+    TIFFPredictorState *sp = PredictorState(tif);
 
-	assert(sp != 0);
+    assert(sp != 0);
 
-	tif->tif_tagmethods.vgetfield = sp->vgetparent;
-	tif->tif_tagmethods.vsetfield = sp->vsetparent;
-	tif->tif_tagmethods.printdir = sp->printdir;
-	tif->tif_setupdecode = sp->setupdecode;
-	tif->tif_setupencode = sp->setupencode;
+    tif->tif_tagmethods.vgetfield = sp->vgetparent;
+    tif->tif_tagmethods.vsetfield = sp->vsetparent;
+    tif->tif_tagmethods.printdir = sp->printdir;
+    tif->tif_setupdecode = sp->setupdecode;
+    tif->tif_setupencode = sp->setupencode;
 
-	return 1;
+    return 1;
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_predict.h b/3rdparty/libtiff/tif_predict.h
index a326b9b8f86b..de7732835209 100644
--- a/3rdparty/libtiff/tif_predict.h
+++ b/3rdparty/libtiff/tif_predict.h
@@ -23,7 +23,7 @@
  */
 
 #ifndef _TIFFPREDICT_
-#define	_TIFFPREDICT_
+#define _TIFFPREDICT_
 
 #include "tiffio.h"
 #include "tiffiop.h"
@@ -32,50 +32,43 @@
  * ``Library-private'' Support for the Predictor Tag
  */
 
-typedef int (*TIFFEncodeDecodeMethod)(TIFF* tif, uint8* buf, tmsize_t size);
+typedef int (*TIFFEncodeDecodeMethod)(TIFF *tif, uint8_t *buf, tmsize_t size);
 
 /*
  * Codecs that want to support the Predictor tag must place
  * this structure first in their private state block so that
  * the predictor code can cast tif_data to find its state.
  */
-typedef struct {
-	int             predictor;	/* predictor tag value */
-	tmsize_t        stride;		/* sample stride over data */
-	tmsize_t        rowsize;	/* tile/strip row size */
+typedef struct
+{
+    int predictor;    /* predictor tag value */
+    tmsize_t stride;  /* sample stride over data */
+    tmsize_t rowsize; /* tile/strip row size */
 
-	TIFFCodeMethod  encoderow;	/* parent codec encode/decode row */
-	TIFFCodeMethod  encodestrip;	/* parent codec encode/decode strip */
-	TIFFCodeMethod  encodetile;	/* parent codec encode/decode tile */ 
-	TIFFEncodeDecodeMethod  encodepfunc;	/* horizontal differencer */
+    TIFFCodeMethod encoderow;           /* parent codec encode/decode row */
+    TIFFCodeMethod encodestrip;         /* parent codec encode/decode strip */
+    TIFFCodeMethod encodetile;          /* parent codec encode/decode tile */
+    TIFFEncodeDecodeMethod encodepfunc; /* horizontal differencer */
 
-	TIFFCodeMethod  decoderow;	/* parent codec encode/decode row */
-	TIFFCodeMethod  decodestrip;	/* parent codec encode/decode strip */
-	TIFFCodeMethod  decodetile;	/* parent codec encode/decode tile */ 
-	TIFFEncodeDecodeMethod  decodepfunc;	/* horizontal accumulator */
+    TIFFCodeMethod decoderow;           /* parent codec encode/decode row */
+    TIFFCodeMethod decodestrip;         /* parent codec encode/decode strip */
+    TIFFCodeMethod decodetile;          /* parent codec encode/decode tile */
+    TIFFEncodeDecodeMethod decodepfunc; /* horizontal accumulator */
 
-	TIFFVGetMethod  vgetparent;	/* super-class method */
-	TIFFVSetMethod  vsetparent;	/* super-class method */
-	TIFFPrintMethod printdir;	/* super-class method */
-	TIFFBoolMethod  setupdecode;	/* super-class method */
-	TIFFBoolMethod  setupencode;	/* super-class method */
+    TIFFVGetMethod vgetparent;  /* super-class method */
+    TIFFVSetMethod vsetparent;  /* super-class method */
+    TIFFPrintMethod printdir;   /* super-class method */
+    TIFFBoolMethod setupdecode; /* super-class method */
+    TIFFBoolMethod setupencode; /* super-class method */
 } TIFFPredictorState;
 
 #if defined(__cplusplus)
-extern "C" {
+extern "C"
+{
 #endif
-extern int TIFFPredictorInit(TIFF*);
-extern int TIFFPredictorCleanup(TIFF*);
+    extern int TIFFPredictorInit(TIFF *);
+    extern int TIFFPredictorCleanup(TIFF *);
 #if defined(__cplusplus)
 }
 #endif
 #endif /* _TIFFPREDICT_ */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_print.c b/3rdparty/libtiff/tif_print.c
index a0737941f425..2b7fd1765ae9 100644
--- a/3rdparty/libtiff/tif_print.c
+++ b/3rdparty/libtiff/tif_print.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -32,687 +32,724 @@
 
 #include <ctype.h>
 
-static void
-_TIFFprintAsciiBounded(FILE* fd, const char* cp, size_t max_chars);
-
-static const char * const photoNames[] = {
-    "min-is-white",				/* PHOTOMETRIC_MINISWHITE */
-    "min-is-black",				/* PHOTOMETRIC_MINISBLACK */
-    "RGB color",				/* PHOTOMETRIC_RGB */
-    "palette color (RGB from colormap)",	/* PHOTOMETRIC_PALETTE */
-    "transparency mask",			/* PHOTOMETRIC_MASK */
-    "separated",				/* PHOTOMETRIC_SEPARATED */
-    "YCbCr",					/* PHOTOMETRIC_YCBCR */
+static void _TIFFprintAsciiBounded(FILE *fd, const char *cp, size_t max_chars);
+
+static const char *const photoNames[] = {
+    "min-is-white",                      /* PHOTOMETRIC_MINISWHITE */
+    "min-is-black",                      /* PHOTOMETRIC_MINISBLACK */
+    "RGB color",                         /* PHOTOMETRIC_RGB */
+    "palette color (RGB from colormap)", /* PHOTOMETRIC_PALETTE */
+    "transparency mask",                 /* PHOTOMETRIC_MASK */
+    "separated",                         /* PHOTOMETRIC_SEPARATED */
+    "YCbCr",                             /* PHOTOMETRIC_YCBCR */
     "7 (0x7)",
-    "CIE L*a*b*",				/* PHOTOMETRIC_CIELAB */
-    "ICC L*a*b*",				/* PHOTOMETRIC_ICCLAB */
-    "ITU L*a*b*" 				/* PHOTOMETRIC_ITULAB */
+    "CIE L*a*b*", /* PHOTOMETRIC_CIELAB */
+    "ICC L*a*b*", /* PHOTOMETRIC_ICCLAB */
+    "ITU L*a*b*"  /* PHOTOMETRIC_ITULAB */
 };
-#define	NPHOTONAMES	(sizeof (photoNames) / sizeof (photoNames[0]))
+#define NPHOTONAMES (sizeof(photoNames) / sizeof(photoNames[0]))
 
-static const char * const orientNames[] = {
+static const char *const orientNames[] = {
     "0 (0x0)",
-    "row 0 top, col 0 lhs",			/* ORIENTATION_TOPLEFT */
-    "row 0 top, col 0 rhs",			/* ORIENTATION_TOPRIGHT */
-    "row 0 bottom, col 0 rhs",			/* ORIENTATION_BOTRIGHT */
-    "row 0 bottom, col 0 lhs",			/* ORIENTATION_BOTLEFT */
-    "row 0 lhs, col 0 top",			/* ORIENTATION_LEFTTOP */
-    "row 0 rhs, col 0 top",			/* ORIENTATION_RIGHTTOP */
-    "row 0 rhs, col 0 bottom",			/* ORIENTATION_RIGHTBOT */
-    "row 0 lhs, col 0 bottom",			/* ORIENTATION_LEFTBOT */
+    "row 0 top, col 0 lhs",    /* ORIENTATION_TOPLEFT */
+    "row 0 top, col 0 rhs",    /* ORIENTATION_TOPRIGHT */
+    "row 0 bottom, col 0 rhs", /* ORIENTATION_BOTRIGHT */
+    "row 0 bottom, col 0 lhs", /* ORIENTATION_BOTLEFT */
+    "row 0 lhs, col 0 top",    /* ORIENTATION_LEFTTOP */
+    "row 0 rhs, col 0 top",    /* ORIENTATION_RIGHTTOP */
+    "row 0 rhs, col 0 bottom", /* ORIENTATION_RIGHTBOT */
+    "row 0 lhs, col 0 bottom", /* ORIENTATION_LEFTBOT */
+};
+#define NORIENTNAMES (sizeof(orientNames) / sizeof(orientNames[0]))
+
+static const struct tagname
+{
+    uint16_t tag;
+    const char *name;
+} tagnames[] = {
+    {TIFFTAG_GDAL_METADATA, "GDAL Metadata"},
+    {TIFFTAG_GDAL_NODATA, "GDAL NoDataValue"},
 };
-#define	NORIENTNAMES	(sizeof (orientNames) / sizeof (orientNames[0]))
+#define NTAGS (sizeof(tagnames) / sizeof(tagnames[0]))
 
-static void
-_TIFFPrintField(FILE* fd, const TIFFField *fip,
-		uint32 value_count, void *raw_data)
+static void _TIFFPrintField(FILE *fd, const TIFFField *fip,
+                            uint32_t value_count, void *raw_data)
 {
-	uint32 j;
-		
-	fprintf(fd, "  %s: ", fip->field_name);
-
-	for(j = 0; j < value_count; j++) {
-		if(fip->field_type == TIFF_BYTE)
-			fprintf(fd, "%u", ((uint8 *) raw_data)[j]);
-		else if(fip->field_type == TIFF_UNDEFINED)
-			fprintf(fd, "0x%x",
-			    (unsigned int) ((unsigned char *) raw_data)[j]);
-		else if(fip->field_type == TIFF_SBYTE)
-			fprintf(fd, "%d", ((int8 *) raw_data)[j]);
-		else if(fip->field_type == TIFF_SHORT)
-			fprintf(fd, "%u", ((uint16 *) raw_data)[j]);
-		else if(fip->field_type == TIFF_SSHORT)
-			fprintf(fd, "%d", ((int16 *) raw_data)[j]);
-		else if(fip->field_type == TIFF_LONG)
-			fprintf(fd, "%lu",
-			    (unsigned long)((uint32 *) raw_data)[j]);
-		else if(fip->field_type == TIFF_SLONG)
-			fprintf(fd, "%ld", (long)((int32 *) raw_data)[j]);
-		else if(fip->field_type == TIFF_IFD)
-			fprintf(fd, "0x%lx",
-				(unsigned long)((uint32 *) raw_data)[j]);
-		else if(fip->field_type == TIFF_RATIONAL
-			|| fip->field_type == TIFF_SRATIONAL
-			|| fip->field_type == TIFF_FLOAT)
-			fprintf(fd, "%f", ((float *) raw_data)[j]);
-		else if(fip->field_type == TIFF_LONG8)
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			fprintf(fd, "%I64u",
-			    (unsigned __int64)((uint64 *) raw_data)[j]);
-#else
-			fprintf(fd, "%llu",
-			    (unsigned long long)((uint64 *) raw_data)[j]);
-#endif
-		else if(fip->field_type == TIFF_SLONG8)
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			fprintf(fd, "%I64d", (__int64)((int64 *) raw_data)[j]);
-#else
-			fprintf(fd, "%lld", (long long)((int64 *) raw_data)[j]);
-#endif
-		else if(fip->field_type == TIFF_IFD8)
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			fprintf(fd, "0x%I64x",
-				(unsigned __int64)((uint64 *) raw_data)[j]);
-#else
-			fprintf(fd, "0x%llx",
-				(unsigned long long)((uint64 *) raw_data)[j]);
-#endif
-		else if(fip->field_type == TIFF_FLOAT)
-			fprintf(fd, "%f", ((float *)raw_data)[j]);
-		else if(fip->field_type == TIFF_DOUBLE)
-			fprintf(fd, "%f", ((double *) raw_data)[j]);
-		else if(fip->field_type == TIFF_ASCII) {
-			fprintf(fd, "%s", (char *) raw_data);
-			break;
-		}
-		else {
-			fprintf(fd, "<unsupported data type in TIFFPrint>");
-			break;
-		}
-
-		if(j < value_count - 1)
-			fprintf(fd, ",");
-	}
-
-	fprintf(fd, "\n");
+    uint32_t j;
+
+    /* Print a user-friendly name for tags of relatively common use, but */
+    /* which aren't registered by libtiff itself. */
+    const char *field_name = fip->field_name;
+    if (TIFFFieldIsAnonymous(fip))
+    {
+        for (size_t i = 0; i < NTAGS; ++i)
+        {
+            if (fip->field_tag == tagnames[i].tag)
+            {
+                field_name = tagnames[i].name;
+                break;
+            }
+        }
+    }
+    fprintf(fd, "  %s: ", field_name);
+
+    for (j = 0; j < value_count; j++)
+    {
+        if (fip->field_type == TIFF_BYTE)
+            fprintf(fd, "%" PRIu8, ((uint8_t *)raw_data)[j]);
+        else if (fip->field_type == TIFF_UNDEFINED)
+            fprintf(fd, "0x%" PRIx8, ((uint8_t *)raw_data)[j]);
+        else if (fip->field_type == TIFF_SBYTE)
+            fprintf(fd, "%" PRId8, ((int8_t *)raw_data)[j]);
+        else if (fip->field_type == TIFF_SHORT)
+            fprintf(fd, "%" PRIu16, ((uint16_t *)raw_data)[j]);
+        else if (fip->field_type == TIFF_SSHORT)
+            fprintf(fd, "%" PRId16, ((int16_t *)raw_data)[j]);
+        else if (fip->field_type == TIFF_LONG)
+            fprintf(fd, "%" PRIu32, ((uint32_t *)raw_data)[j]);
+        else if (fip->field_type == TIFF_SLONG)
+            fprintf(fd, "%" PRId32, ((int32_t *)raw_data)[j]);
+        else if (fip->field_type == TIFF_IFD)
+            fprintf(fd, "0x%" PRIx32, ((uint32_t *)raw_data)[j]);
+        else if (fip->field_type == TIFF_RATIONAL ||
+                 fip->field_type == TIFF_SRATIONAL)
+        {
+            int tv_size = TIFFFieldSetGetSize(fip);
+            if (tv_size == 8)
+                fprintf(fd, "%lf", ((double *)raw_data)[j]);
+            else
+                fprintf(fd, "%f", ((float *)raw_data)[j]);
+        }
+        else if (fip->field_type == TIFF_FLOAT)
+            fprintf(fd, "%f", ((float *)raw_data)[j]);
+        else if (fip->field_type == TIFF_LONG8)
+            fprintf(fd, "%" PRIu64, ((uint64_t *)raw_data)[j]);
+        else if (fip->field_type == TIFF_SLONG8)
+            fprintf(fd, "%" PRId64, ((int64_t *)raw_data)[j]);
+        else if (fip->field_type == TIFF_IFD8)
+            fprintf(fd, "0x%" PRIx64, ((uint64_t *)raw_data)[j]);
+        else if (fip->field_type == TIFF_DOUBLE)
+            fprintf(fd, "%lf", ((double *)raw_data)[j]);
+        else if (fip->field_type == TIFF_ASCII)
+        {
+            fprintf(fd, "%s", (char *)raw_data);
+            break;
+        }
+        else
+        {
+            fprintf(fd, "<unsupported data type in TIFFPrint>");
+            break;
+        }
+
+        if (j < value_count - 1)
+            fprintf(fd, ",");
+    }
+
+    fprintf(fd, "\n");
 }
 
-static int
-_TIFFPrettyPrintField(TIFF* tif, const TIFFField *fip, FILE* fd, uint32 tag,
-		      uint32 value_count, void *raw_data)
+static int _TIFFPrettyPrintField(TIFF *tif, const TIFFField *fip, FILE *fd,
+                                 uint32_t tag, uint32_t value_count,
+                                 void *raw_data)
 {
-        (void) tif;
-
-	/* do not try to pretty print auto-defined fields */
-	if (strncmp(fip->field_name,"Tag ", 4) == 0) {
-		return 0;
-	}
-        
-	switch (tag)
-	{
-		case TIFFTAG_INKSET:
-			if (value_count == 2 && fip->field_type == TIFF_SHORT) {
-				fprintf(fd, "  Ink Set: ");
-				switch (*((uint16*)raw_data)) {
-				case INKSET_CMYK:
-					fprintf(fd, "CMYK\n");
-					break;
-				default:
-					fprintf(fd, "%u (0x%x)\n",
-						*((uint16*)raw_data),
-						*((uint16*)raw_data));
-					break;
-				}
-				return 1;
-			}
-			return 0;
-
-		case TIFFTAG_DOTRANGE:
-			if (value_count == 2 && fip->field_type == TIFF_SHORT) {
-				fprintf(fd, "  Dot Range: %u-%u\n",
-					((uint16*)raw_data)[0], ((uint16*)raw_data)[1]);
-				return 1;
-			}
-			return 0;
-
-		case TIFFTAG_WHITEPOINT:
-			if (value_count == 2 && fip->field_type == TIFF_RATIONAL) {
-				fprintf(fd, "  White Point: %g-%g\n",
-					((float *)raw_data)[0], ((float *)raw_data)[1]);
-				return 1;
-			} 
-			return 0;
-
-		case TIFFTAG_XMLPACKET:
-		{
-			uint32 i;
-
-			fprintf(fd, "  XMLPacket (XMP Metadata):\n" );
-			for(i = 0; i < value_count; i++)
-				fputc(((char *)raw_data)[i], fd);
-			fprintf( fd, "\n" );
-			return 1;
-		}
-		case TIFFTAG_RICHTIFFIPTC:
-			/*
-			 * XXX: for some weird reason RichTIFFIPTC tag
-			 * defined as array of LONG values.
-			 */
-			fprintf(fd,
-			    "  RichTIFFIPTC Data: <present>, %lu bytes\n",
-			    (unsigned long) value_count * 4);
-			return 1;
-
-		case TIFFTAG_PHOTOSHOP:
-			fprintf(fd, "  Photoshop Data: <present>, %lu bytes\n",
-			    (unsigned long) value_count);
-			return 1;
-
-		case TIFFTAG_ICCPROFILE:
-			fprintf(fd, "  ICC Profile: <present>, %lu bytes\n",
-			    (unsigned long) value_count);
-			return 1;
-
-		case TIFFTAG_STONITS:
-			if (value_count == 1 && fip->field_type == TIFF_DOUBLE) { 
-				fprintf(fd,
-					"  Sample to Nits conversion factor: %.4e\n",
-					*((double*)raw_data));
-				return 1;
-			}
-			return 0;
-	}
-
-	return 0;
+    (void)tif;
+
+    /* do not try to pretty print auto-defined fields */
+    if (TIFFFieldIsAnonymous(fip))
+    {
+        return 0;
+    }
+
+    switch (tag)
+    {
+        case TIFFTAG_INKSET:
+            if (value_count == 2 && fip->field_type == TIFF_SHORT)
+            {
+                fprintf(fd, "  Ink Set: ");
+                switch (*((uint16_t *)raw_data))
+                {
+                    case INKSET_CMYK:
+                        fprintf(fd, "CMYK\n");
+                        break;
+                    default:
+                        fprintf(fd, "%" PRIu16 " (0x%" PRIx16 ")\n",
+                                *((uint16_t *)raw_data),
+                                *((uint16_t *)raw_data));
+                        break;
+                }
+                return 1;
+            }
+            return 0;
+
+        case TIFFTAG_DOTRANGE:
+            if (value_count == 2 && fip->field_type == TIFF_SHORT)
+            {
+                fprintf(fd, "  Dot Range: %" PRIu16 "-%" PRIu16 "\n",
+                        ((uint16_t *)raw_data)[0], ((uint16_t *)raw_data)[1]);
+                return 1;
+            }
+            return 0;
+
+        case TIFFTAG_WHITEPOINT:
+            if (value_count == 2 && fip->field_type == TIFF_RATIONAL)
+            {
+                fprintf(fd, "  White Point: %g-%g\n", ((float *)raw_data)[0],
+                        ((float *)raw_data)[1]);
+                return 1;
+            }
+            return 0;
+
+        case TIFFTAG_XMLPACKET:
+        {
+            uint32_t i;
+
+            fprintf(fd, "  XMLPacket (XMP Metadata):\n");
+            for (i = 0; i < value_count; i++)
+                fputc(((char *)raw_data)[i], fd);
+            fprintf(fd, "\n");
+            return 1;
+        }
+        case TIFFTAG_RICHTIFFIPTC:
+            fprintf(fd, "  RichTIFFIPTC Data: <present>, %" PRIu32 " bytes\n",
+                    value_count);
+            return 1;
+
+        case TIFFTAG_PHOTOSHOP:
+            fprintf(fd, "  Photoshop Data: <present>, %" PRIu32 " bytes\n",
+                    value_count);
+            return 1;
+
+        case TIFFTAG_ICCPROFILE:
+            fprintf(fd, "  ICC Profile: <present>, %" PRIu32 " bytes\n",
+                    value_count);
+            return 1;
+
+        case TIFFTAG_STONITS:
+            if (value_count == 1 && fip->field_type == TIFF_DOUBLE)
+            {
+                fprintf(fd, "  Sample to Nits conversion factor: %.4e\n",
+                        *((double *)raw_data));
+                return 1;
+            }
+            return 0;
+    }
+
+    return 0;
 }
 
 /*
  * Print the contents of the current directory
  * to the specified stdio file stream.
  */
-void
-TIFFPrintDirectory(TIFF* tif, FILE* fd, long flags)
+void TIFFPrintDirectory(TIFF *tif, FILE *fd, long flags)
 {
-	TIFFDirectory *td = &tif->tif_dir;
-	char *sep;
-	long l, n;
-
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-	fprintf(fd, "TIFF Directory at offset 0x%I64x (%I64u)\n",
-		(unsigned __int64) tif->tif_diroff,
-		(unsigned __int64) tif->tif_diroff);
-#else
-	fprintf(fd, "TIFF Directory at offset 0x%llx (%llu)\n",
-		(unsigned long long) tif->tif_diroff,
-		(unsigned long long) tif->tif_diroff);
-#endif
-	if (TIFFFieldSet(tif,FIELD_SUBFILETYPE)) {
-		fprintf(fd, "  Subfile Type:");
-		sep = " ";
-		if (td->td_subfiletype & FILETYPE_REDUCEDIMAGE) {
-			fprintf(fd, "%sreduced-resolution image", sep);
-			sep = "/";
-		}
-		if (td->td_subfiletype & FILETYPE_PAGE) {
-			fprintf(fd, "%smulti-page document", sep);
-			sep = "/";
-		}
-		if (td->td_subfiletype & FILETYPE_MASK)
-			fprintf(fd, "%stransparency mask", sep);
-		fprintf(fd, " (%lu = 0x%lx)\n",
-		    (unsigned long) td->td_subfiletype, (long) td->td_subfiletype);
-	}
-	if (TIFFFieldSet(tif,FIELD_IMAGEDIMENSIONS)) {
-		fprintf(fd, "  Image Width: %lu Image Length: %lu",
-		    (unsigned long) td->td_imagewidth, (unsigned long) td->td_imagelength);
-		if (TIFFFieldSet(tif,FIELD_IMAGEDEPTH))
-			fprintf(fd, " Image Depth: %lu",
-			    (unsigned long) td->td_imagedepth);
-		fprintf(fd, "\n");
-	}
-	if (TIFFFieldSet(tif,FIELD_TILEDIMENSIONS)) {
-		fprintf(fd, "  Tile Width: %lu Tile Length: %lu",
-		    (unsigned long) td->td_tilewidth, (unsigned long) td->td_tilelength);
-		if (TIFFFieldSet(tif,FIELD_TILEDEPTH))
-			fprintf(fd, " Tile Depth: %lu",
-			    (unsigned long) td->td_tiledepth);
-		fprintf(fd, "\n");
-	}
-	if (TIFFFieldSet(tif,FIELD_RESOLUTION)) {
-		fprintf(fd, "  Resolution: %g, %g",
-		    td->td_xresolution, td->td_yresolution);
-		if (TIFFFieldSet(tif,FIELD_RESOLUTIONUNIT)) {
-			switch (td->td_resolutionunit) {
-			case RESUNIT_NONE:
-				fprintf(fd, " (unitless)");
-				break;
-			case RESUNIT_INCH:
-				fprintf(fd, " pixels/inch");
-				break;
-			case RESUNIT_CENTIMETER:
-				fprintf(fd, " pixels/cm");
-				break;
-			default:
-				fprintf(fd, " (unit %u = 0x%x)",
-				    td->td_resolutionunit,
-				    td->td_resolutionunit);
-				break;
-			}
-		}
-		fprintf(fd, "\n");
-	}
-	if (TIFFFieldSet(tif,FIELD_POSITION))
-		fprintf(fd, "  Position: %g, %g\n",
-		    td->td_xposition, td->td_yposition);
-	if (TIFFFieldSet(tif,FIELD_BITSPERSAMPLE))
-		fprintf(fd, "  Bits/Sample: %u\n", td->td_bitspersample);
-	if (TIFFFieldSet(tif,FIELD_SAMPLEFORMAT)) {
-		fprintf(fd, "  Sample Format: ");
-		switch (td->td_sampleformat) {
-		case SAMPLEFORMAT_VOID:
-			fprintf(fd, "void\n");
-			break;
-		case SAMPLEFORMAT_INT:
-			fprintf(fd, "signed integer\n");
-			break;
-		case SAMPLEFORMAT_UINT:
-			fprintf(fd, "unsigned integer\n");
-			break;
-		case SAMPLEFORMAT_IEEEFP:
-			fprintf(fd, "IEEE floating point\n");
-			break;
-		case SAMPLEFORMAT_COMPLEXINT:
-			fprintf(fd, "complex signed integer\n");
-			break;
-		case SAMPLEFORMAT_COMPLEXIEEEFP:
-			fprintf(fd, "complex IEEE floating point\n");
-			break;
-		default:
-			fprintf(fd, "%u (0x%x)\n",
-			    td->td_sampleformat, td->td_sampleformat);
-			break;
-		}
-	}
-	if (TIFFFieldSet(tif,FIELD_COMPRESSION)) {
-		const TIFFCodec* c = TIFFFindCODEC(td->td_compression);
-		fprintf(fd, "  Compression Scheme: ");
-		if (c)
-			fprintf(fd, "%s\n", c->name);
-		else
-			fprintf(fd, "%u (0x%x)\n",
-			    td->td_compression, td->td_compression);
-	}
-	if (TIFFFieldSet(tif,FIELD_PHOTOMETRIC)) {
-		fprintf(fd, "  Photometric Interpretation: ");
-		if (td->td_photometric < NPHOTONAMES)
-			fprintf(fd, "%s\n", photoNames[td->td_photometric]);
-		else {
-			switch (td->td_photometric) {
-			case PHOTOMETRIC_LOGL:
-				fprintf(fd, "CIE Log2(L)\n");
-				break;
-			case PHOTOMETRIC_LOGLUV:
-				fprintf(fd, "CIE Log2(L) (u',v')\n");
-				break;
-			default:
-				fprintf(fd, "%u (0x%x)\n",
-				    td->td_photometric, td->td_photometric);
-				break;
-			}
-		}
-	}
-	if (TIFFFieldSet(tif,FIELD_EXTRASAMPLES) && td->td_extrasamples) {
-		uint16 i;
-		fprintf(fd, "  Extra Samples: %u<", td->td_extrasamples);
-		sep = "";
-		for (i = 0; i < td->td_extrasamples; i++) {
-			switch (td->td_sampleinfo[i]) {
-			case EXTRASAMPLE_UNSPECIFIED:
-				fprintf(fd, "%sunspecified", sep);
-				break;
-			case EXTRASAMPLE_ASSOCALPHA:
-				fprintf(fd, "%sassoc-alpha", sep);
-				break;
-			case EXTRASAMPLE_UNASSALPHA:
-				fprintf(fd, "%sunassoc-alpha", sep);
-				break;
-			default:
-				fprintf(fd, "%s%u (0x%x)", sep,
-				    td->td_sampleinfo[i], td->td_sampleinfo[i]);
-				break;
-			}
-			sep = ", ";
-		}
-		fprintf(fd, ">\n");
-	}
-	if (TIFFFieldSet(tif,FIELD_INKNAMES)) {
-		char* cp;
-		uint16 i;
-		fprintf(fd, "  Ink Names: ");
-		i = td->td_samplesperpixel;
-		sep = "";
-		for (cp = td->td_inknames; 
-		     i > 0 && cp < td->td_inknames + td->td_inknameslen; 
-		     cp = strchr(cp,'\0')+1, i--) {
-			size_t max_chars = 
-				td->td_inknameslen - (cp - td->td_inknames);
-			fputs(sep, fd);
-			_TIFFprintAsciiBounded(fd, cp, max_chars);
-			sep = ", ";
-		}
-                fputs("\n", fd);
-	}
-	if (TIFFFieldSet(tif,FIELD_THRESHHOLDING)) {
-		fprintf(fd, "  Thresholding: ");
-		switch (td->td_threshholding) {
-		case THRESHHOLD_BILEVEL:
-			fprintf(fd, "bilevel art scan\n");
-			break;
-		case THRESHHOLD_HALFTONE:
-			fprintf(fd, "halftone or dithered scan\n");
-			break;
-		case THRESHHOLD_ERRORDIFFUSE:
-			fprintf(fd, "error diffused\n");
-			break;
-		default:
-			fprintf(fd, "%u (0x%x)\n",
-			    td->td_threshholding, td->td_threshholding);
-			break;
-		}
-	}
-	if (TIFFFieldSet(tif,FIELD_FILLORDER)) {
-		fprintf(fd, "  FillOrder: ");
-		switch (td->td_fillorder) {
-		case FILLORDER_MSB2LSB:
-			fprintf(fd, "msb-to-lsb\n");
-			break;
-		case FILLORDER_LSB2MSB:
-			fprintf(fd, "lsb-to-msb\n");
-			break;
-		default:
-			fprintf(fd, "%u (0x%x)\n",
-			    td->td_fillorder, td->td_fillorder);
-			break;
-		}
-	}
-	if (TIFFFieldSet(tif,FIELD_YCBCRSUBSAMPLING))
+    TIFFDirectory *td = &tif->tif_dir;
+    char *sep;
+    long l, n;
+
+    fprintf(fd, "TIFF Directory at offset 0x%" PRIx64 " (%" PRIu64 ")\n",
+            tif->tif_diroff, tif->tif_diroff);
+    if (TIFFFieldSet(tif, FIELD_SUBFILETYPE))
+    {
+        fprintf(fd, "  Subfile Type:");
+        sep = " ";
+        if (td->td_subfiletype & FILETYPE_REDUCEDIMAGE)
+        {
+            fprintf(fd, "%sreduced-resolution image", sep);
+            sep = "/";
+        }
+        if (td->td_subfiletype & FILETYPE_PAGE)
+        {
+            fprintf(fd, "%smulti-page document", sep);
+            sep = "/";
+        }
+        if (td->td_subfiletype & FILETYPE_MASK)
+            fprintf(fd, "%stransparency mask", sep);
+        fprintf(fd, " (%" PRIu32 " = 0x%" PRIx32 ")\n", td->td_subfiletype,
+                td->td_subfiletype);
+    }
+    if (TIFFFieldSet(tif, FIELD_IMAGEDIMENSIONS))
+    {
+        fprintf(fd, "  Image Width: %" PRIu32 " Image Length: %" PRIu32,
+                td->td_imagewidth, td->td_imagelength);
+        if (TIFFFieldSet(tif, FIELD_IMAGEDEPTH))
+            fprintf(fd, " Image Depth: %" PRIu32, td->td_imagedepth);
+        fprintf(fd, "\n");
+    }
+    if (TIFFFieldSet(tif, FIELD_TILEDIMENSIONS))
+    {
+        fprintf(fd, "  Tile Width: %" PRIu32 " Tile Length: %" PRIu32,
+                td->td_tilewidth, td->td_tilelength);
+        if (TIFFFieldSet(tif, FIELD_TILEDEPTH))
+            fprintf(fd, " Tile Depth: %" PRIu32, td->td_tiledepth);
+        fprintf(fd, "\n");
+    }
+    if (TIFFFieldSet(tif, FIELD_RESOLUTION))
+    {
+        fprintf(fd, "  Resolution: %g, %g", td->td_xresolution,
+                td->td_yresolution);
+        if (TIFFFieldSet(tif, FIELD_RESOLUTIONUNIT))
+        {
+            switch (td->td_resolutionunit)
+            {
+                case RESUNIT_NONE:
+                    fprintf(fd, " (unitless)");
+                    break;
+                case RESUNIT_INCH:
+                    fprintf(fd, " pixels/inch");
+                    break;
+                case RESUNIT_CENTIMETER:
+                    fprintf(fd, " pixels/cm");
+                    break;
+                default:
+                    fprintf(fd, " (unit %" PRIu16 " = 0x%" PRIx16 ")",
+                            td->td_resolutionunit, td->td_resolutionunit);
+                    break;
+            }
+        }
+        fprintf(fd, "\n");
+    }
+    if (TIFFFieldSet(tif, FIELD_POSITION))
+        fprintf(fd, "  Position: %g, %g\n", td->td_xposition, td->td_yposition);
+    if (TIFFFieldSet(tif, FIELD_BITSPERSAMPLE))
+        fprintf(fd, "  Bits/Sample: %" PRIu16 "\n", td->td_bitspersample);
+    if (TIFFFieldSet(tif, FIELD_SAMPLEFORMAT))
+    {
+        fprintf(fd, "  Sample Format: ");
+        switch (td->td_sampleformat)
         {
-		fprintf(fd, "  YCbCr Subsampling: %u, %u\n",
-			td->td_ycbcrsubsampling[0], td->td_ycbcrsubsampling[1] );
-	}
-	if (TIFFFieldSet(tif,FIELD_YCBCRPOSITIONING)) {
-		fprintf(fd, "  YCbCr Positioning: ");
-		switch (td->td_ycbcrpositioning) {
-		case YCBCRPOSITION_CENTERED:
-			fprintf(fd, "centered\n");
-			break;
-		case YCBCRPOSITION_COSITED:
-			fprintf(fd, "cosited\n");
-			break;
-		default:
-			fprintf(fd, "%u (0x%x)\n",
-			    td->td_ycbcrpositioning, td->td_ycbcrpositioning);
-			break;
-		}
-	}
-	if (TIFFFieldSet(tif,FIELD_HALFTONEHINTS))
-		fprintf(fd, "  Halftone Hints: light %u dark %u\n",
-		    td->td_halftonehints[0], td->td_halftonehints[1]);
-	if (TIFFFieldSet(tif,FIELD_ORIENTATION)) {
-		fprintf(fd, "  Orientation: ");
-		if (td->td_orientation < NORIENTNAMES)
-			fprintf(fd, "%s\n", orientNames[td->td_orientation]);
-		else
-			fprintf(fd, "%u (0x%x)\n",
-			    td->td_orientation, td->td_orientation);
-	}
-	if (TIFFFieldSet(tif,FIELD_SAMPLESPERPIXEL))
-		fprintf(fd, "  Samples/Pixel: %u\n", td->td_samplesperpixel);
-	if (TIFFFieldSet(tif,FIELD_ROWSPERSTRIP)) {
-		fprintf(fd, "  Rows/Strip: ");
-		if (td->td_rowsperstrip == (uint32) -1)
-			fprintf(fd, "(infinite)\n");
-		else
-			fprintf(fd, "%lu\n", (unsigned long) td->td_rowsperstrip);
-	}
-	if (TIFFFieldSet(tif,FIELD_MINSAMPLEVALUE))
-		fprintf(fd, "  Min Sample Value: %u\n", td->td_minsamplevalue);
-	if (TIFFFieldSet(tif,FIELD_MAXSAMPLEVALUE))
-		fprintf(fd, "  Max Sample Value: %u\n", td->td_maxsamplevalue);
-	if (TIFFFieldSet(tif,FIELD_SMINSAMPLEVALUE)) {
-		int i;
-		int count = (tif->tif_flags & TIFF_PERSAMPLE) ? td->td_samplesperpixel : 1;
-		fprintf(fd, "  SMin Sample Value:");
-		for (i = 0; i < count; ++i)
-			fprintf(fd, " %g", td->td_sminsamplevalue[i]);
-		fprintf(fd, "\n");
-	}
-	if (TIFFFieldSet(tif,FIELD_SMAXSAMPLEVALUE)) {
-		int i;
-		int count = (tif->tif_flags & TIFF_PERSAMPLE) ? td->td_samplesperpixel : 1;
-		fprintf(fd, "  SMax Sample Value:");
-		for (i = 0; i < count; ++i)
-			fprintf(fd, " %g", td->td_smaxsamplevalue[i]);
-		fprintf(fd, "\n");
-	}
-	if (TIFFFieldSet(tif,FIELD_PLANARCONFIG)) {
-		fprintf(fd, "  Planar Configuration: ");
-		switch (td->td_planarconfig) {
-		case PLANARCONFIG_CONTIG:
-			fprintf(fd, "single image plane\n");
-			break;
-		case PLANARCONFIG_SEPARATE:
-			fprintf(fd, "separate image planes\n");
-			break;
-		default:
-			fprintf(fd, "%u (0x%x)\n",
-			    td->td_planarconfig, td->td_planarconfig);
-			break;
-		}
-	}
-	if (TIFFFieldSet(tif,FIELD_PAGENUMBER))
-		fprintf(fd, "  Page Number: %u-%u\n",
-		    td->td_pagenumber[0], td->td_pagenumber[1]);
-	if (TIFFFieldSet(tif,FIELD_COLORMAP)) {
-		fprintf(fd, "  Color Map: ");
-		if (flags & TIFFPRINT_COLORMAP) {
-			fprintf(fd, "\n");
-			n = 1L<<td->td_bitspersample;
-			for (l = 0; l < n; l++)
-				fprintf(fd, "   %5ld: %5u %5u %5u\n",
-				    l,
-				    td->td_colormap[0][l],
-				    td->td_colormap[1][l],
-				    td->td_colormap[2][l]);
-		} else
-			fprintf(fd, "(present)\n");
-	}
-	if (TIFFFieldSet(tif,FIELD_REFBLACKWHITE)) {
-		int i;
-		fprintf(fd, "  Reference Black/White:\n");
-		for (i = 0; i < 3; i++)
-		fprintf(fd, "    %2d: %5g %5g\n", i,
-			td->td_refblackwhite[2*i+0],
-			td->td_refblackwhite[2*i+1]);
-	}
-	if (TIFFFieldSet(tif,FIELD_TRANSFERFUNCTION)) {
-		fprintf(fd, "  Transfer Function: ");
-		if (flags & TIFFPRINT_CURVES) {
-			fprintf(fd, "\n");
-			n = 1L<<td->td_bitspersample;
-			for (l = 0; l < n; l++) {
-				uint16 i;
-				fprintf(fd, "    %2ld: %5u",
-				    l, td->td_transferfunction[0][l]);
-				for (i = 1; i < td->td_samplesperpixel - td->td_extrasamples && i < 3; i++)
-					fprintf(fd, " %5u",
-					    td->td_transferfunction[i][l]);
-				fputc('\n', fd);
-			}
-		} else
-			fprintf(fd, "(present)\n");
-	}
-	if (TIFFFieldSet(tif, FIELD_SUBIFD) && (td->td_subifd)) {
-		uint16 i;
-		fprintf(fd, "  SubIFD Offsets:");
-		for (i = 0; i < td->td_nsubifd; i++)
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			fprintf(fd, " %5I64u",
-				(unsigned __int64) td->td_subifd[i]);
-#else
-			fprintf(fd, " %5llu",
-				(unsigned long long) td->td_subifd[i]);
-#endif
-		fputc('\n', fd);
-	}
-
-	/*
-	** Custom tag support.
-	*/
-	{
-		int  i;
-		short count;
-
-		count = (short) TIFFGetTagListCount(tif);
-		for(i = 0; i < count; i++) {
-			uint32 tag = TIFFGetTagListEntry(tif, i);
-			const TIFFField *fip;
-			uint32 value_count;
-			int mem_alloc = 0;
-			void *raw_data;
-
-			fip = TIFFFieldWithTag(tif, tag);
-			if(fip == NULL)
-				continue;
-
-			if(fip->field_passcount) {
-				if (fip->field_readcount == TIFF_VARIABLE2 ) {
-					if(TIFFGetField(tif, tag, &value_count, &raw_data) != 1)
-						continue;
-				} else if (fip->field_readcount == TIFF_VARIABLE ) {
-					uint16 small_value_count;
-					if(TIFFGetField(tif, tag, &small_value_count, &raw_data) != 1)
-						continue;
-					value_count = small_value_count;
-				} else {
-					assert (fip->field_readcount == TIFF_VARIABLE
-						|| fip->field_readcount == TIFF_VARIABLE2);
-					continue;
-				} 
-			} else {
-				if (fip->field_readcount == TIFF_VARIABLE
-				    || fip->field_readcount == TIFF_VARIABLE2)
-					value_count = 1;
-				else if (fip->field_readcount == TIFF_SPP)
-					value_count = td->td_samplesperpixel;
-				else
-					value_count = fip->field_readcount;
-				if (fip->field_tag == TIFFTAG_DOTRANGE
-				    && strcmp(fip->field_name,"DotRange") == 0) {
-					/* TODO: This is an evil exception and should not have been
-					   handled this way ... likely best if we move it into
-					   the directory structure with an explicit field in 
-					   libtiff 4.1 and assign it a FIELD_ value */
-					static uint16 dotrange[2];
-					raw_data = dotrange;
-					TIFFGetField(tif, tag, dotrange+0, dotrange+1);
-				} else if (fip->field_type == TIFF_ASCII
-					   || fip->field_readcount == TIFF_VARIABLE
-					   || fip->field_readcount == TIFF_VARIABLE2
-					   || fip->field_readcount == TIFF_SPP
-					   || value_count > 1) {
-					if(TIFFGetField(tif, tag, &raw_data) != 1)
-						continue;
-				} else {
-					raw_data = _TIFFmalloc(
-					    _TIFFDataSize(fip->field_type)
-					    * value_count);
-					mem_alloc = 1;
-					if(TIFFGetField(tif, tag, raw_data) != 1) {
-						_TIFFfree(raw_data);
-						continue;
-					}
-				}
-			}
-
-			/*
-			 * Catch the tags which needs to be specially handled
-			 * and pretty print them. If tag not handled in
-			 * _TIFFPrettyPrintField() fall down and print it as
-			 * any other tag.
-			 */
-			if (!_TIFFPrettyPrintField(tif, fip, fd, tag, value_count, raw_data))
-				_TIFFPrintField(fd, fip, value_count, raw_data);
-
-			if(mem_alloc)
-				_TIFFfree(raw_data);
-		}
-	}
-        
-	if (tif->tif_tagmethods.printdir)
-		(*tif->tif_tagmethods.printdir)(tif, fd, flags);
-
-	if ((flags & TIFFPRINT_STRIPS) &&
-	    TIFFFieldSet(tif,FIELD_STRIPOFFSETS)) {
-		uint32 s;
-
-		fprintf(fd, "  %lu %s:\n",
-		    (unsigned long) td->td_nstrips,
-		    isTiled(tif) ? "Tiles" : "Strips");
-		for (s = 0; s < td->td_nstrips; s++)
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			fprintf(fd, "    %3lu: [%8I64u, %8I64u]\n",
-			    (unsigned long) s,
-			    (unsigned __int64) TIFFGetStrileOffset(tif, s),
-			    (unsigned __int64) TIFFGetStrileByteCount(tif, s));
-#else
-			fprintf(fd, "    %3lu: [%8llu, %8llu]\n",
-			    (unsigned long) s,
-			    (unsigned long long) TIFFGetStrileOffset(tif, s),
-			    (unsigned long long) TIFFGetStrileByteCount(tif, s));
-#endif
-	}
+            case SAMPLEFORMAT_VOID:
+                fprintf(fd, "void\n");
+                break;
+            case SAMPLEFORMAT_INT:
+                fprintf(fd, "signed integer\n");
+                break;
+            case SAMPLEFORMAT_UINT:
+                fprintf(fd, "unsigned integer\n");
+                break;
+            case SAMPLEFORMAT_IEEEFP:
+                fprintf(fd, "IEEE floating point\n");
+                break;
+            case SAMPLEFORMAT_COMPLEXINT:
+                fprintf(fd, "complex signed integer\n");
+                break;
+            case SAMPLEFORMAT_COMPLEXIEEEFP:
+                fprintf(fd, "complex IEEE floating point\n");
+                break;
+            default:
+                fprintf(fd, "%" PRIu16 " (0x%" PRIx16 ")\n",
+                        td->td_sampleformat, td->td_sampleformat);
+                break;
+        }
+    }
+    if (TIFFFieldSet(tif, FIELD_COMPRESSION))
+    {
+        const TIFFCodec *c = TIFFFindCODEC(td->td_compression);
+        fprintf(fd, "  Compression Scheme: ");
+        if (c)
+            fprintf(fd, "%s\n", c->name);
+        else
+            fprintf(fd, "%" PRIu16 " (0x%" PRIx16 ")\n", td->td_compression,
+                    td->td_compression);
+    }
+    if (TIFFFieldSet(tif, FIELD_PHOTOMETRIC))
+    {
+        fprintf(fd, "  Photometric Interpretation: ");
+        if (td->td_photometric < NPHOTONAMES)
+            fprintf(fd, "%s\n", photoNames[td->td_photometric]);
+        else
+        {
+            switch (td->td_photometric)
+            {
+                case PHOTOMETRIC_LOGL:
+                    fprintf(fd, "CIE Log2(L)\n");
+                    break;
+                case PHOTOMETRIC_LOGLUV:
+                    fprintf(fd, "CIE Log2(L) (u',v')\n");
+                    break;
+                default:
+                    fprintf(fd, "%" PRIu16 " (0x%" PRIx16 ")\n",
+                            td->td_photometric, td->td_photometric);
+                    break;
+            }
+        }
+    }
+    if (TIFFFieldSet(tif, FIELD_EXTRASAMPLES) && td->td_extrasamples)
+    {
+        uint16_t i;
+        fprintf(fd, "  Extra Samples: %" PRIu16 "<", td->td_extrasamples);
+        sep = "";
+        for (i = 0; i < td->td_extrasamples; i++)
+        {
+            switch (td->td_sampleinfo[i])
+            {
+                case EXTRASAMPLE_UNSPECIFIED:
+                    fprintf(fd, "%sunspecified", sep);
+                    break;
+                case EXTRASAMPLE_ASSOCALPHA:
+                    fprintf(fd, "%sassoc-alpha", sep);
+                    break;
+                case EXTRASAMPLE_UNASSALPHA:
+                    fprintf(fd, "%sunassoc-alpha", sep);
+                    break;
+                default:
+                    fprintf(fd, "%s%" PRIu16 " (0x%" PRIx16 ")", sep,
+                            td->td_sampleinfo[i], td->td_sampleinfo[i]);
+                    break;
+            }
+            sep = ", ";
+        }
+        fprintf(fd, ">\n");
+    }
+    if (TIFFFieldSet(tif, FIELD_INKNAMES))
+    {
+        char *cp;
+        uint16_t i;
+        fprintf(fd, "  Ink Names: ");
+        i = td->td_samplesperpixel;
+        sep = "";
+        for (cp = td->td_inknames;
+             i > 0 && cp < td->td_inknames + td->td_inknameslen;
+             cp = strchr(cp, '\0') + 1, i--)
+        {
+            size_t max_chars = td->td_inknameslen - (cp - td->td_inknames);
+            fputs(sep, fd);
+            _TIFFprintAsciiBounded(fd, cp, max_chars);
+            sep = ", ";
+        }
+        fputs("\n", fd);
+    }
+    if (TIFFFieldSet(tif, FIELD_NUMBEROFINKS))
+    {
+        fprintf(fd, "  NumberOfInks: %d\n", td->td_numberofinks);
+    }
+    if (TIFFFieldSet(tif, FIELD_THRESHHOLDING))
+    {
+        fprintf(fd, "  Thresholding: ");
+        switch (td->td_threshholding)
+        {
+            case THRESHHOLD_BILEVEL:
+                fprintf(fd, "bilevel art scan\n");
+                break;
+            case THRESHHOLD_HALFTONE:
+                fprintf(fd, "halftone or dithered scan\n");
+                break;
+            case THRESHHOLD_ERRORDIFFUSE:
+                fprintf(fd, "error diffused\n");
+                break;
+            default:
+                fprintf(fd, "%" PRIu16 " (0x%" PRIx16 ")\n",
+                        td->td_threshholding, td->td_threshholding);
+                break;
+        }
+    }
+    if (TIFFFieldSet(tif, FIELD_FILLORDER))
+    {
+        fprintf(fd, "  FillOrder: ");
+        switch (td->td_fillorder)
+        {
+            case FILLORDER_MSB2LSB:
+                fprintf(fd, "msb-to-lsb\n");
+                break;
+            case FILLORDER_LSB2MSB:
+                fprintf(fd, "lsb-to-msb\n");
+                break;
+            default:
+                fprintf(fd, "%" PRIu16 " (0x%" PRIx16 ")\n", td->td_fillorder,
+                        td->td_fillorder);
+                break;
+        }
+    }
+    if (TIFFFieldSet(tif, FIELD_YCBCRSUBSAMPLING))
+    {
+        fprintf(fd, "  YCbCr Subsampling: %" PRIu16 ", %" PRIu16 "\n",
+                td->td_ycbcrsubsampling[0], td->td_ycbcrsubsampling[1]);
+    }
+    if (TIFFFieldSet(tif, FIELD_YCBCRPOSITIONING))
+    {
+        fprintf(fd, "  YCbCr Positioning: ");
+        switch (td->td_ycbcrpositioning)
+        {
+            case YCBCRPOSITION_CENTERED:
+                fprintf(fd, "centered\n");
+                break;
+            case YCBCRPOSITION_COSITED:
+                fprintf(fd, "cosited\n");
+                break;
+            default:
+                fprintf(fd, "%" PRIu16 " (0x%" PRIx16 ")\n",
+                        td->td_ycbcrpositioning, td->td_ycbcrpositioning);
+                break;
+        }
+    }
+    if (TIFFFieldSet(tif, FIELD_HALFTONEHINTS))
+        fprintf(fd, "  Halftone Hints: light %" PRIu16 " dark %" PRIu16 "\n",
+                td->td_halftonehints[0], td->td_halftonehints[1]);
+    if (TIFFFieldSet(tif, FIELD_ORIENTATION))
+    {
+        fprintf(fd, "  Orientation: ");
+        if (td->td_orientation < NORIENTNAMES)
+            fprintf(fd, "%s\n", orientNames[td->td_orientation]);
+        else
+            fprintf(fd, "%" PRIu16 " (0x%" PRIx16 ")\n", td->td_orientation,
+                    td->td_orientation);
+    }
+    if (TIFFFieldSet(tif, FIELD_SAMPLESPERPIXEL))
+        fprintf(fd, "  Samples/Pixel: %" PRIx16 "\n", td->td_samplesperpixel);
+    if (TIFFFieldSet(tif, FIELD_ROWSPERSTRIP))
+    {
+        fprintf(fd, "  Rows/Strip: ");
+        if (td->td_rowsperstrip == (uint32_t)-1)
+            fprintf(fd, "(infinite)\n");
+        else
+            fprintf(fd, "%" PRIu32 "\n", td->td_rowsperstrip);
+    }
+    if (TIFFFieldSet(tif, FIELD_MINSAMPLEVALUE))
+        fprintf(fd, "  Min Sample Value: %" PRIu16 "\n", td->td_minsamplevalue);
+    if (TIFFFieldSet(tif, FIELD_MAXSAMPLEVALUE))
+        fprintf(fd, "  Max Sample Value: %" PRIu16 "\n", td->td_maxsamplevalue);
+    if (TIFFFieldSet(tif, FIELD_SMINSAMPLEVALUE))
+    {
+        int i;
+        int count =
+            (tif->tif_flags & TIFF_PERSAMPLE) ? td->td_samplesperpixel : 1;
+        fprintf(fd, "  SMin Sample Value:");
+        for (i = 0; i < count; ++i)
+            fprintf(fd, " %g", td->td_sminsamplevalue[i]);
+        fprintf(fd, "\n");
+    }
+    if (TIFFFieldSet(tif, FIELD_SMAXSAMPLEVALUE))
+    {
+        int i;
+        int count =
+            (tif->tif_flags & TIFF_PERSAMPLE) ? td->td_samplesperpixel : 1;
+        fprintf(fd, "  SMax Sample Value:");
+        for (i = 0; i < count; ++i)
+            fprintf(fd, " %g", td->td_smaxsamplevalue[i]);
+        fprintf(fd, "\n");
+    }
+    if (TIFFFieldSet(tif, FIELD_PLANARCONFIG))
+    {
+        fprintf(fd, "  Planar Configuration: ");
+        switch (td->td_planarconfig)
+        {
+            case PLANARCONFIG_CONTIG:
+                fprintf(fd, "single image plane\n");
+                break;
+            case PLANARCONFIG_SEPARATE:
+                fprintf(fd, "separate image planes\n");
+                break;
+            default:
+                fprintf(fd, "%" PRIu16 " (0x%" PRIx16 ")\n",
+                        td->td_planarconfig, td->td_planarconfig);
+                break;
+        }
+    }
+    if (TIFFFieldSet(tif, FIELD_PAGENUMBER))
+        fprintf(fd, "  Page Number: %" PRIu16 "-%" PRIu16 "\n",
+                td->td_pagenumber[0], td->td_pagenumber[1]);
+    if (TIFFFieldSet(tif, FIELD_COLORMAP))
+    {
+        fprintf(fd, "  Color Map: ");
+        if (flags & TIFFPRINT_COLORMAP)
+        {
+            fprintf(fd, "\n");
+            n = 1L << td->td_bitspersample;
+            for (l = 0; l < n; l++)
+                fprintf(fd, "   %5ld: %5" PRIu16 " %5" PRIu16 " %5" PRIu16 "\n",
+                        l, td->td_colormap[0][l], td->td_colormap[1][l],
+                        td->td_colormap[2][l]);
+        }
+        else
+            fprintf(fd, "(present)\n");
+    }
+    if (TIFFFieldSet(tif, FIELD_REFBLACKWHITE))
+    {
+        int i;
+        fprintf(fd, "  Reference Black/White:\n");
+        for (i = 0; i < 3; i++)
+            fprintf(fd, "    %2d: %5g %5g\n", i,
+                    td->td_refblackwhite[2 * i + 0],
+                    td->td_refblackwhite[2 * i + 1]);
+    }
+    if (TIFFFieldSet(tif, FIELD_TRANSFERFUNCTION))
+    {
+        fprintf(fd, "  Transfer Function: ");
+        if (flags & TIFFPRINT_CURVES)
+        {
+            fprintf(fd, "\n");
+            n = 1L << td->td_bitspersample;
+            for (l = 0; l < n; l++)
+            {
+                uint16_t i;
+                fprintf(fd, "    %2ld: %5" PRIu16, l,
+                        td->td_transferfunction[0][l]);
+                for (i = 1;
+                     i < td->td_samplesperpixel - td->td_extrasamples && i < 3;
+                     i++)
+                    fprintf(fd, " %5" PRIu16, td->td_transferfunction[i][l]);
+                fputc('\n', fd);
+            }
+        }
+        else
+            fprintf(fd, "(present)\n");
+    }
+    if (TIFFFieldSet(tif, FIELD_SUBIFD) && (td->td_subifd))
+    {
+        uint16_t i;
+        fprintf(fd, "  SubIFD Offsets:");
+        for (i = 0; i < td->td_nsubifd; i++)
+            fprintf(fd, " %5" PRIu64, td->td_subifd[i]);
+        fputc('\n', fd);
+    }
+
+    /*
+    ** Custom tag support.
+    */
+    {
+        int i;
+        short count;
+
+        count = (short)TIFFGetTagListCount(tif);
+        for (i = 0; i < count; i++)
+        {
+            uint32_t tag = TIFFGetTagListEntry(tif, i);
+            const TIFFField *fip;
+            uint32_t value_count;
+            int mem_alloc = 0;
+            void *raw_data = NULL;
+            uint16_t dotrange[2]; /* must be kept in that scope and not moved in
+                                     the below TIFFTAG_DOTRANGE specific case */
+
+            fip = TIFFFieldWithTag(tif, tag);
+            if (fip == NULL)
+                continue;
+
+            if (fip->field_passcount)
+            {
+                if (fip->field_readcount == TIFF_VARIABLE2)
+                {
+                    if (TIFFGetField(tif, tag, &value_count, &raw_data) != 1)
+                        continue;
+                }
+                else if (fip->field_readcount == TIFF_VARIABLE)
+                {
+                    uint16_t small_value_count;
+                    if (TIFFGetField(tif, tag, &small_value_count, &raw_data) !=
+                        1)
+                        continue;
+                    value_count = small_value_count;
+                }
+                else
+                {
+                    assert(fip->field_readcount == TIFF_VARIABLE ||
+                           fip->field_readcount == TIFF_VARIABLE2);
+                    continue;
+                }
+            }
+            else
+            {
+                if (fip->field_readcount == TIFF_VARIABLE ||
+                    fip->field_readcount == TIFF_VARIABLE2)
+                    value_count = 1;
+                else if (fip->field_readcount == TIFF_SPP)
+                    value_count = td->td_samplesperpixel;
+                else
+                    value_count = fip->field_readcount;
+                if (fip->field_tag == TIFFTAG_DOTRANGE &&
+                    strcmp(fip->field_name, "DotRange") == 0)
+                {
+                    /* TODO: This is an evil exception and should not have been
+                       handled this way ... likely best if we move it into
+                       the directory structure with an explicit field in
+                       libtiff 4.1 and assign it a FIELD_ value */
+                    raw_data = dotrange;
+                    TIFFGetField(tif, tag, dotrange + 0, dotrange + 1);
+                }
+                else if (fip->field_type == TIFF_ASCII ||
+                         fip->field_readcount == TIFF_VARIABLE ||
+                         fip->field_readcount == TIFF_VARIABLE2 ||
+                         fip->field_readcount == TIFF_SPP || value_count > 1)
+                {
+                    if (TIFFGetField(tif, tag, &raw_data) != 1)
+                        continue;
+                }
+                else
+                {
+                    /*--: Rational2Double: For Rationals evaluate
+                     * "set_field_type" to determine internal storage size. */
+                    int tv_size = TIFFFieldSetGetSize(fip);
+                    raw_data = _TIFFmallocExt(tif, tv_size * value_count);
+                    mem_alloc = 1;
+                    if (TIFFGetField(tif, tag, raw_data) != 1)
+                    {
+                        _TIFFfreeExt(tif, raw_data);
+                        continue;
+                    }
+                }
+            }
+
+            /*
+             * Catch the tags which needs to be specially handled
+             * and pretty print them. If tag not handled in
+             * _TIFFPrettyPrintField() fall down and print it as
+             * any other tag.
+             */
+            if (raw_data != NULL &&
+                !_TIFFPrettyPrintField(tif, fip, fd, tag, value_count,
+                                       raw_data))
+                _TIFFPrintField(fd, fip, value_count, raw_data);
+
+            if (mem_alloc)
+                _TIFFfreeExt(tif, raw_data);
+        }
+    }
+
+    if (tif->tif_tagmethods.printdir)
+        (*tif->tif_tagmethods.printdir)(tif, fd, flags);
+
+    if ((flags & TIFFPRINT_STRIPS) && TIFFFieldSet(tif, FIELD_STRIPOFFSETS))
+    {
+        uint32_t s;
+
+        fprintf(fd, "  %" PRIu32 " %s:\n", td->td_nstrips,
+                isTiled(tif) ? "Tiles" : "Strips");
+        for (s = 0; s < td->td_nstrips; s++)
+            fprintf(fd, "    %3" PRIu32 ": [%8" PRIu64 ", %8" PRIu64 "]\n", s,
+                    TIFFGetStrileOffset(tif, s),
+                    TIFFGetStrileByteCount(tif, s));
+    }
 }
 
-void
-_TIFFprintAscii(FILE* fd, const char* cp)
+void _TIFFprintAscii(FILE *fd, const char *cp)
 {
-	_TIFFprintAsciiBounded( fd, cp, strlen(cp));
+    _TIFFprintAsciiBounded(fd, cp, strlen(cp));
 }
 
-static void
-_TIFFprintAsciiBounded(FILE* fd, const char* cp, size_t max_chars)
+static void _TIFFprintAsciiBounded(FILE *fd, const char *cp, size_t max_chars)
 {
-	for (; max_chars > 0 && *cp != '\0'; cp++, max_chars--) {
-		const char* tp;
-
-		if (isprint((int)*cp)) {
-			fputc(*cp, fd);
-			continue;
-		}
-		for (tp = "\tt\bb\rr\nn\vv"; *tp; tp++)
-			if (*tp++ == *cp)
-				break;
-		if (*tp)
-			fprintf(fd, "\\%c", *tp);
-		else
-			fprintf(fd, "\\%03o", *cp & 0xff);
-	}
+    for (; max_chars > 0 && *cp != '\0'; cp++, max_chars--)
+    {
+        const char *tp;
+
+        if (isprint((int)*cp))
+        {
+            fputc(*cp, fd);
+            continue;
+        }
+        for (tp = "\tt\bb\rr\nn\vv"; *tp; tp++)
+            if (*tp++ == *cp)
+                break;
+        if (*tp)
+            fprintf(fd, "\\%c", *tp);
+        else
+            fprintf(fd, "\\%03o", *cp & 0xff);
+    }
 }
 
-void
-_TIFFprintAsciiTag(FILE* fd, const char* name, const char* value)
+void _TIFFprintAsciiTag(FILE *fd, const char *name, const char *value)
 {
-	fprintf(fd, "  %s: \"", name);
-	_TIFFprintAscii(fd, value);
-	fprintf(fd, "\"\n");
+    fprintf(fd, "  %s: \"", name);
+    _TIFFprintAscii(fd, value);
+    fprintf(fd, "\"\n");
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_read.c b/3rdparty/libtiff/tif_read.c
index c4c868b1c53e..4fec83969ea4 100644
--- a/3rdparty/libtiff/tif_read.c
+++ b/3rdparty/libtiff/tif_read.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -29,292 +29,273 @@
 #include "tiffiop.h"
 #include <stdio.h>
 
-int TIFFFillStrip(TIFF* tif, uint32 strip);
-int TIFFFillTile(TIFF* tif, uint32 tile);
-static int TIFFStartStrip(TIFF* tif, uint32 strip);
-static int TIFFStartTile(TIFF* tif, uint32 tile);
-static int TIFFCheckRead(TIFF*, int);
-static tmsize_t
-TIFFReadRawStrip1(TIFF* tif, uint32 strip, void* buf, tmsize_t size,const char* module);
-static tmsize_t
-TIFFReadRawTile1(TIFF* tif, uint32 tile, void* buf, tmsize_t size, const char* module);
+int TIFFFillStrip(TIFF *tif, uint32_t strip);
+int TIFFFillTile(TIFF *tif, uint32_t tile);
+static int TIFFStartStrip(TIFF *tif, uint32_t strip);
+static int TIFFStartTile(TIFF *tif, uint32_t tile);
+static int TIFFCheckRead(TIFF *, int);
+static tmsize_t TIFFReadRawStrip1(TIFF *tif, uint32_t strip, void *buf,
+                                  tmsize_t size, const char *module);
+static tmsize_t TIFFReadRawTile1(TIFF *tif, uint32_t tile, void *buf,
+                                 tmsize_t size, const char *module);
 
-#define NOSTRIP ((uint32)(-1))       /* undefined state */
-#define NOTILE ((uint32)(-1))         /* undefined state */
+#define NOSTRIP ((uint32_t)(-1)) /* undefined state */
+#define NOTILE ((uint32_t)(-1))  /* undefined state */
 
 #define INITIAL_THRESHOLD (1024 * 1024)
 #define THRESHOLD_MULTIPLIER 10
-#define MAX_THRESHOLD (THRESHOLD_MULTIPLIER * THRESHOLD_MULTIPLIER * THRESHOLD_MULTIPLIER * INITIAL_THRESHOLD)
+#define MAX_THRESHOLD                                                          \
+    (THRESHOLD_MULTIPLIER * THRESHOLD_MULTIPLIER * THRESHOLD_MULTIPLIER *      \
+     INITIAL_THRESHOLD)
 
-#define TIFF_INT64_MAX ((((int64)0x7FFFFFFF) << 32) | 0xFFFFFFFF)
+#define TIFF_INT64_MAX ((((int64_t)0x7FFFFFFF) << 32) | 0xFFFFFFFF)
 
 /* Read 'size' bytes in tif_rawdata buffer starting at offset 'rawdata_offset'
  * Returns 1 in case of success, 0 otherwise. */
-static int TIFFReadAndRealloc( TIFF* tif, tmsize_t size,
-                               tmsize_t rawdata_offset,
-                               int is_strip, uint32 strip_or_tile,
-                               const char* module )
+static int TIFFReadAndRealloc(TIFF *tif, tmsize_t size, tmsize_t rawdata_offset,
+                              int is_strip, uint32_t strip_or_tile,
+                              const char *module)
 {
 #if SIZEOF_SIZE_T == 8
-        tmsize_t threshold = INITIAL_THRESHOLD;
+    tmsize_t threshold = INITIAL_THRESHOLD;
 #endif
-        tmsize_t already_read = 0;
-
+    tmsize_t already_read = 0;
 
 #if SIZEOF_SIZE_T != 8
-        /* On 32 bit processes, if the request is large enough, check against */
-        /* file size */
-        if( size > 1000 * 1000 * 1000 )
+    /* On 32 bit processes, if the request is large enough, check against */
+    /* file size */
+    if (size > 1000 * 1000 * 1000)
+    {
+        uint64_t filesize = TIFFGetFileSize(tif);
+        if ((uint64_t)size >= filesize)
         {
-            uint64 filesize = TIFFGetFileSize(tif);
-            if( (uint64)size >= filesize )
-            {
-                TIFFErrorExt(tif->tif_clientdata, module,
-                             "Chunk size requested is larger than file size.");
-                return 0;
-            }
+            TIFFErrorExtR(tif, module,
+                          "Chunk size requested is larger than file size.");
+            return 0;
         }
+    }
 #endif
 
-        /* On 64 bit processes, read first a maximum of 1 MB, then 10 MB, etc */
-        /* so as to avoid allocating too much memory in case the file is too */
-        /* short. We could ask for the file size, but this might be */
-        /* expensive with some I/O layers (think of reading a gzipped file) */
-        /* Restrict to 64 bit processes, so as to avoid reallocs() */
-        /* on 32 bit processes where virtual memory is scarce.  */
-        while( already_read < size )
-        {
-            tmsize_t bytes_read;
-            tmsize_t to_read = size - already_read;
+    /* On 64 bit processes, read first a maximum of 1 MB, then 10 MB, etc */
+    /* so as to avoid allocating too much memory in case the file is too */
+    /* short. We could ask for the file size, but this might be */
+    /* expensive with some I/O layers (think of reading a gzipped file) */
+    /* Restrict to 64 bit processes, so as to avoid reallocs() */
+    /* on 32 bit processes where virtual memory is scarce.  */
+    while (already_read < size)
+    {
+        tmsize_t bytes_read;
+        tmsize_t to_read = size - already_read;
 #if SIZEOF_SIZE_T == 8
-            if( to_read >= threshold && threshold < MAX_THRESHOLD &&
-                already_read + to_read + rawdata_offset > tif->tif_rawdatasize )
-            {
-                to_read = threshold;
-                threshold *= THRESHOLD_MULTIPLIER;
-            }
+        if (to_read >= threshold && threshold < MAX_THRESHOLD &&
+            already_read + to_read + rawdata_offset > tif->tif_rawdatasize)
+        {
+            to_read = threshold;
+            threshold *= THRESHOLD_MULTIPLIER;
+        }
 #endif
-            if (already_read + to_read + rawdata_offset > tif->tif_rawdatasize) {
-                uint8* new_rawdata;
-                assert((tif->tif_flags & TIFF_MYBUFFER) != 0);
-                tif->tif_rawdatasize = (tmsize_t)TIFFroundup_64(
-                        (uint64)already_read + to_read + rawdata_offset, 1024);
-                if (tif->tif_rawdatasize==0) {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                                "Invalid buffer size");
-                    return 0;
-                }
-                new_rawdata = (uint8*) _TIFFrealloc(
-                                tif->tif_rawdata, tif->tif_rawdatasize);
-                if( new_rawdata == 0 )
-                {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                        "No space for data buffer at scanline %lu",
-                        (unsigned long) tif->tif_row);
-                    _TIFFfree(tif->tif_rawdata);
-                    tif->tif_rawdata = 0;
-                    tif->tif_rawdatasize = 0;
-                    return 0;
-                }
-                tif->tif_rawdata = new_rawdata;
+        if (already_read + to_read + rawdata_offset > tif->tif_rawdatasize)
+        {
+            uint8_t *new_rawdata;
+            assert((tif->tif_flags & TIFF_MYBUFFER) != 0);
+            tif->tif_rawdatasize = (tmsize_t)TIFFroundup_64(
+                (uint64_t)already_read + to_read + rawdata_offset, 1024);
+            if (tif->tif_rawdatasize == 0)
+            {
+                TIFFErrorExtR(tif, module, "Invalid buffer size");
+                return 0;
             }
-            if( tif->tif_rawdata == NULL )
+            new_rawdata =
+                (uint8_t *)_TIFFrealloc(tif->tif_rawdata, tif->tif_rawdatasize);
+            if (new_rawdata == 0)
             {
-                /* should not happen in practice but helps CoverityScan */
+                TIFFErrorExtR(tif, module,
+                              "No space for data buffer at scanline %" PRIu32,
+                              tif->tif_row);
+                _TIFFfreeExt(tif, tif->tif_rawdata);
+                tif->tif_rawdata = 0;
+                tif->tif_rawdatasize = 0;
                 return 0;
             }
+            tif->tif_rawdata = new_rawdata;
+        }
+        if (tif->tif_rawdata == NULL)
+        {
+            /* should not happen in practice but helps CoverityScan */
+            return 0;
+        }
 
-            bytes_read = TIFFReadFile(tif,
-                tif->tif_rawdata + rawdata_offset + already_read, to_read);
-            already_read += bytes_read;
-            if (bytes_read != to_read) {
-                memset( tif->tif_rawdata + rawdata_offset + already_read, 0,
-                        tif->tif_rawdatasize - rawdata_offset - already_read );
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-                if( is_strip )
-                {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                        "Read error at scanline %lu; got %I64u bytes, "
-                        "expected %I64u",
-                                        (unsigned long) tif->tif_row,
-                                        (unsigned __int64) already_read,
-                                        (unsigned __int64) size);
-                }
-                else
-                {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                        "Read error at row %lu, col %lu, tile %lu; "
-                        "got %I64u bytes, expected %I64u",
-                                        (unsigned long) tif->tif_row,
-                                        (unsigned long) tif->tif_col,
-                                        (unsigned long) strip_or_tile,
-                                        (unsigned __int64) already_read,
-                                        (unsigned __int64) size);
-                }
-#else
-                if( is_strip )
-                {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                        "Read error at scanline %lu; got %llu bytes, "
-                        "expected %llu",
-                                        (unsigned long) tif->tif_row,
-                                        (unsigned long long) already_read,
-                                        (unsigned long long) size);
-                }
-                else
-                {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                        "Read error at row %lu, col %lu, tile %lu; "
-                        "got %llu bytes, expected %llu",
-                                        (unsigned long) tif->tif_row,
-                                        (unsigned long) tif->tif_col,
-                                        (unsigned long) strip_or_tile,
-                                        (unsigned long long) already_read,
-                                        (unsigned long long) size);
-                }
-#endif
-                return 0;
+        bytes_read = TIFFReadFile(
+            tif, tif->tif_rawdata + rawdata_offset + already_read, to_read);
+        already_read += bytes_read;
+        if (bytes_read != to_read)
+        {
+            memset(tif->tif_rawdata + rawdata_offset + already_read, 0,
+                   tif->tif_rawdatasize - rawdata_offset - already_read);
+            if (is_strip)
+            {
+                TIFFErrorExtR(tif, module,
+                              "Read error at scanline %" PRIu32
+                              "; got %" TIFF_SSIZE_FORMAT " bytes, "
+                              "expected %" TIFF_SSIZE_FORMAT,
+                              tif->tif_row, already_read, size);
+            }
+            else
+            {
+                TIFFErrorExtR(tif, module,
+                              "Read error at row %" PRIu32 ", col %" PRIu32
+                              ", tile %" PRIu32 "; "
+                              "got %" TIFF_SSIZE_FORMAT
+                              " bytes, expected %" TIFF_SSIZE_FORMAT "",
+                              tif->tif_row, tif->tif_col, strip_or_tile,
+                              already_read, size);
             }
+            return 0;
         }
-        return 1;
+    }
+    return 1;
 }
 
-
-static int
-TIFFFillStripPartial( TIFF *tif, int strip, tmsize_t read_ahead, int restart )
+static int TIFFFillStripPartial(TIFF *tif, int strip, tmsize_t read_ahead,
+                                int restart)
 {
-	static const char module[] = "TIFFFillStripPartial";
-	register TIFFDirectory *td = &tif->tif_dir;
-        tmsize_t unused_data;
-        uint64 read_offset;
-        tmsize_t to_read;
-        tmsize_t read_ahead_mod;
-        /* tmsize_t bytecountm; */
+    static const char module[] = "TIFFFillStripPartial";
+    register TIFFDirectory *td = &tif->tif_dir;
+    tmsize_t unused_data;
+    uint64_t read_offset;
+    tmsize_t to_read;
+    tmsize_t read_ahead_mod;
+    /* tmsize_t bytecountm; */
+
+    /*
+     * Expand raw data buffer, if needed, to hold data
+     * strip coming from file (perhaps should set upper
+     * bound on the size of a buffer we'll use?).
+     */
+
+    /* bytecountm=(tmsize_t) TIFFGetStrileByteCount(tif, strip); */
+
+    /* Not completely sure where the * 2 comes from, but probably for */
+    /* an exponentional growth strategy of tif_rawdatasize */
+    if (read_ahead < TIFF_TMSIZE_T_MAX / 2)
+        read_ahead_mod = read_ahead * 2;
+    else
+        read_ahead_mod = read_ahead;
+    if (read_ahead_mod > tif->tif_rawdatasize)
+    {
+        assert(restart);
 
-        /*
-         * Expand raw data buffer, if needed, to hold data
-         * strip coming from file (perhaps should set upper
-         * bound on the size of a buffer we'll use?).
-         */
+        tif->tif_curstrip = NOSTRIP;
+        if ((tif->tif_flags & TIFF_MYBUFFER) == 0)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Data buffer too small to hold part of strip %d",
+                          strip);
+            return (0);
+        }
+    }
 
-        /* bytecountm=(tmsize_t) TIFFGetStrileByteCount(tif, strip); */
+    if (restart)
+    {
+        tif->tif_rawdataloaded = 0;
+        tif->tif_rawdataoff = 0;
+    }
 
-        /* Not completely sure where the * 2 comes from, but probably for */
-        /* an exponentional growth strategy of tif_rawdatasize */
-        if( read_ahead < TIFF_TMSIZE_T_MAX / 2 )
-                read_ahead_mod = read_ahead * 2;
-        else
-                read_ahead_mod = read_ahead;
-        if (read_ahead_mod > tif->tif_rawdatasize) {
-                assert( restart );
-                
-                tif->tif_curstrip = NOSTRIP;
-                if ((tif->tif_flags & TIFF_MYBUFFER) == 0) {
-                        TIFFErrorExt(tif->tif_clientdata, module,
-                                     "Data buffer too small to hold part of strip %lu",
-                                     (unsigned long) strip);
-                        return (0);
-                }
-        }
+    /*
+    ** If we are reading more data, move any unused data to the
+    ** start of the buffer.
+    */
+    if (tif->tif_rawdataloaded > 0)
+        unused_data =
+            tif->tif_rawdataloaded - (tif->tif_rawcp - tif->tif_rawdata);
+    else
+        unused_data = 0;
 
-        if( restart )
-        {
-                tif->tif_rawdataloaded = 0;
-                tif->tif_rawdataoff = 0;
-        }
+    if (unused_data > 0)
+    {
+        assert((tif->tif_flags & TIFF_BUFFERMMAP) == 0);
+        memmove(tif->tif_rawdata, tif->tif_rawcp, unused_data);
+    }
 
-        /*
-        ** If we are reading more data, move any unused data to the
-        ** start of the buffer.
-        */
-        if( tif->tif_rawdataloaded > 0 )
-                unused_data = tif->tif_rawdataloaded - (tif->tif_rawcp - tif->tif_rawdata);
-        else
-                unused_data = 0;
-        
-        if( unused_data > 0 )
-        {
-		assert((tif->tif_flags&TIFF_BUFFERMMAP)==0);
-                memmove( tif->tif_rawdata, tif->tif_rawcp, unused_data );
-        }
+    /*
+    ** Seek to the point in the file where more data should be read.
+    */
+    read_offset = TIFFGetStrileOffset(tif, strip) + tif->tif_rawdataoff +
+                  tif->tif_rawdataloaded;
 
-        /*
-        ** Seek to the point in the file where more data should be read.
-        */
-        read_offset = TIFFGetStrileOffset(tif, strip)
-                + tif->tif_rawdataoff + tif->tif_rawdataloaded;
-
-        if (!SeekOK(tif, read_offset)) {
-                TIFFErrorExt(tif->tif_clientdata, module,
-                             "Seek error at scanline %lu, strip %lu",
-                             (unsigned long) tif->tif_row, (unsigned long) strip);
-                return 0;
-        }
+    if (!SeekOK(tif, read_offset))
+    {
+        TIFFErrorExtR(tif, module,
+                      "Seek error at scanline %" PRIu32 ", strip %d",
+                      tif->tif_row, strip);
+        return 0;
+    }
 
-        /*
-        ** How much do we want to read?
-        */
-        if( read_ahead_mod > tif->tif_rawdatasize )
-                to_read = read_ahead_mod - unused_data;
-        else
-                to_read = tif->tif_rawdatasize - unused_data;
-        if( (uint64) to_read > TIFFGetStrileByteCount(tif, strip)
-            - tif->tif_rawdataoff - tif->tif_rawdataloaded )
-        {
-                to_read = (tmsize_t) TIFFGetStrileByteCount(tif, strip)
-                        - tif->tif_rawdataoff - tif->tif_rawdataloaded;
-        }
+    /*
+    ** How much do we want to read?
+    */
+    if (read_ahead_mod > tif->tif_rawdatasize)
+        to_read = read_ahead_mod - unused_data;
+    else
+        to_read = tif->tif_rawdatasize - unused_data;
+    if ((uint64_t)to_read > TIFFGetStrileByteCount(tif, strip) -
+                                tif->tif_rawdataoff - tif->tif_rawdataloaded)
+    {
+        to_read = (tmsize_t)TIFFGetStrileByteCount(tif, strip) -
+                  tif->tif_rawdataoff - tif->tif_rawdataloaded;
+    }
 
-	assert((tif->tif_flags&TIFF_BUFFERMMAP)==0);
-        if( !TIFFReadAndRealloc( tif, to_read, unused_data,
-                                 1, /* is_strip */
-                                 0, /* strip_or_tile */
-                                 module) )
-        {
-                return 0;
-        }
+    assert((tif->tif_flags & TIFF_BUFFERMMAP) == 0);
+    if (!TIFFReadAndRealloc(tif, to_read, unused_data, 1, /* is_strip */
+                            0,                            /* strip_or_tile */
+                            module))
+    {
+        return 0;
+    }
 
-        tif->tif_rawdataoff = tif->tif_rawdataoff + tif->tif_rawdataloaded - unused_data ;
-        tif->tif_rawdataloaded = unused_data + to_read;
+    tif->tif_rawdataoff =
+        tif->tif_rawdataoff + tif->tif_rawdataloaded - unused_data;
+    tif->tif_rawdataloaded = unused_data + to_read;
 
-        tif->tif_rawcc = tif->tif_rawdataloaded;
-        tif->tif_rawcp = tif->tif_rawdata;
-                        
-        if (!isFillOrder(tif, td->td_fillorder) &&
-            (tif->tif_flags & TIFF_NOBITREV) == 0) {
-		assert((tif->tif_flags&TIFF_BUFFERMMAP)==0);
-                TIFFReverseBits(tif->tif_rawdata + unused_data, to_read );
-	}
+    tif->tif_rawcc = tif->tif_rawdataloaded;
+    tif->tif_rawcp = tif->tif_rawdata;
 
-        /*
-        ** When starting a strip from the beginning we need to
-        ** restart the decoder.
-        */
-        if( restart )
-        {
+    if (!isFillOrder(tif, td->td_fillorder) &&
+        (tif->tif_flags & TIFF_NOBITREV) == 0)
+    {
+        assert((tif->tif_flags & TIFF_BUFFERMMAP) == 0);
+        TIFFReverseBits(tif->tif_rawdata + unused_data, to_read);
+    }
+
+    /*
+    ** When starting a strip from the beginning we need to
+    ** restart the decoder.
+    */
+    if (restart)
+    {
 
 #ifdef JPEG_SUPPORT
-            /* A bit messy since breaks the codec abstraction. Ultimately */
-            /* there should be a function pointer for that, but it seems */
-            /* only JPEG is affected. */
-            /* For JPEG, if there are multiple scans (can generally be known */
-            /* with the  read_ahead used), we need to read the whole strip */
-            if( tif->tif_dir.td_compression==COMPRESSION_JPEG &&
-                (uint64)tif->tif_rawcc < TIFFGetStrileByteCount(tif, strip) )
+        /* A bit messy since breaks the codec abstraction. Ultimately */
+        /* there should be a function pointer for that, but it seems */
+        /* only JPEG is affected. */
+        /* For JPEG, if there are multiple scans (can generally be known */
+        /* with the  read_ahead used), we need to read the whole strip */
+        if (tif->tif_dir.td_compression == COMPRESSION_JPEG &&
+            (uint64_t)tif->tif_rawcc < TIFFGetStrileByteCount(tif, strip))
+        {
+            if (TIFFJPEGIsFullStripRequired(tif))
             {
-                if( TIFFJPEGIsFullStripRequired(tif) )
-                {
-                    return TIFFFillStrip(tif, strip);
-                }
+                return TIFFFillStrip(tif, strip);
             }
+        }
 #endif
 
-            return TIFFStartStrip(tif, strip);
-        }
-        else
-        {
-                return 1;
-        }
+        return TIFFStartStrip(tif, strip);
+    }
+    else
+    {
+        return 1;
+    }
 }
 
 /*
@@ -325,159 +306,165 @@ TIFFFillStripPartial( TIFF *tif, int strip, tmsize_t read_ahead, int restart )
  * and avoid reading the whole compressed raw data for big
  * strips.
  */
-static int
-TIFFSeek(TIFF* tif, uint32 row, uint16 sample )
+static int TIFFSeek(TIFF *tif, uint32_t row, uint16_t sample)
 {
-	register TIFFDirectory *td = &tif->tif_dir;
-	uint32 strip;
-        int    whole_strip;
-	tmsize_t read_ahead = 0;
-
-        /*
-        ** Establish what strip we are working from.
-        */
-	if (row >= td->td_imagelength) {	/* out of range */
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-		    "%lu: Row out of range, max %lu",
-		    (unsigned long) row,
-		    (unsigned long) td->td_imagelength);
-		return (0);
-	}
-	if (td->td_planarconfig == PLANARCONFIG_SEPARATE) {
-		if (sample >= td->td_samplesperpixel) {
-			TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-			    "%lu: Sample out of range, max %lu",
-			    (unsigned long) sample, (unsigned long) td->td_samplesperpixel);
-			return (0);
-		}
-		strip = (uint32)sample*td->td_stripsperimage + row/td->td_rowsperstrip;
-	} else
-		strip = row / td->td_rowsperstrip;
+    register TIFFDirectory *td = &tif->tif_dir;
+    uint32_t strip;
+    int whole_strip;
+    tmsize_t read_ahead = 0;
+
+    /*
+    ** Establish what strip we are working from.
+    */
+    if (row >= td->td_imagelength)
+    { /* out of range */
+        TIFFErrorExtR(tif, tif->tif_name,
+                      "%" PRIu32 ": Row out of range, max %" PRIu32 "", row,
+                      td->td_imagelength);
+        return (0);
+    }
+    if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
+    {
+        if (sample >= td->td_samplesperpixel)
+        {
+            TIFFErrorExtR(tif, tif->tif_name,
+                          "%" PRIu16 ": Sample out of range, max %" PRIu16 "",
+                          sample, td->td_samplesperpixel);
+            return (0);
+        }
+        strip = (uint32_t)sample * td->td_stripsperimage +
+                row / td->td_rowsperstrip;
+    }
+    else
+        strip = row / td->td_rowsperstrip;
 
         /*
          * Do we want to treat this strip as one whole chunk or
          * read it a few lines at a time?
          */
 #if defined(CHUNKY_STRIP_READ_SUPPORT)
-        whole_strip = TIFFGetStrileByteCount(tif, strip) < 10
-                || isMapped(tif);
-        if( td->td_compression == COMPRESSION_LERC ||
-            td->td_compression == COMPRESSION_JBIG )
-        {
-            /* Ideally plugins should have a way to declare they don't support
-             * chunk strip */
-            whole_strip = 1;
-        }
-#else
+    whole_strip = TIFFGetStrileByteCount(tif, strip) < 10 || isMapped(tif);
+    if (td->td_compression == COMPRESSION_LERC ||
+        td->td_compression == COMPRESSION_JBIG)
+    {
+        /* Ideally plugins should have a way to declare they don't support
+         * chunk strip */
         whole_strip = 1;
+    }
+#else
+    whole_strip = 1;
 #endif
-        
-        if( !whole_strip )
+
+    if (!whole_strip)
+    {
+        /* 16 is for YCbCr mode where we may need to read 16 */
+        /* lines at a time to get a decompressed line, and 5000 */
+        /* is some constant value, for example for JPEG tables */
+        if (tif->tif_scanlinesize < TIFF_TMSIZE_T_MAX / 16 &&
+            tif->tif_scanlinesize * 16 < TIFF_TMSIZE_T_MAX - 5000)
         {
-                /* 16 is for YCbCr mode where we may need to read 16 */
-                /* lines at a time to get a decompressed line, and 5000 */
-                /* is some constant value, for example for JPEG tables */
-                if( tif->tif_scanlinesize < TIFF_TMSIZE_T_MAX / 16 &&
-                    tif->tif_scanlinesize * 16 < TIFF_TMSIZE_T_MAX - 5000 )
-                {
-                        read_ahead = tif->tif_scanlinesize * 16 + 5000;
-                }
-                else
-                {
-                        read_ahead = tif->tif_scanlinesize;
-                }
+            read_ahead = tif->tif_scanlinesize * 16 + 5000;
+        }
+        else
+        {
+            read_ahead = tif->tif_scanlinesize;
         }
+    }
+
+    /*
+     * If we haven't loaded this strip, do so now, possibly
+     * only reading the first part.
+     */
+    if (strip != tif->tif_curstrip)
+    { /* different strip, refill */
 
+        if (whole_strip)
+        {
+            if (!TIFFFillStrip(tif, strip))
+                return (0);
+        }
+        else
+        {
+            if (!TIFFFillStripPartial(tif, strip, read_ahead, 1))
+                return 0;
+        }
+    }
+
+    /*
+    ** If we already have some data loaded, do we need to read some more?
+    */
+    else if (!whole_strip)
+    {
+        if (((tif->tif_rawdata + tif->tif_rawdataloaded) - tif->tif_rawcp) <
+                read_ahead &&
+            (uint64_t)tif->tif_rawdataoff + tif->tif_rawdataloaded <
+                TIFFGetStrileByteCount(tif, strip))
+        {
+            if (!TIFFFillStripPartial(tif, strip, read_ahead, 0))
+                return 0;
+        }
+    }
+
+    if (row < tif->tif_row)
+    {
         /*
-         * If we haven't loaded this strip, do so now, possibly
-         * only reading the first part.
+         * Moving backwards within the same strip: backup
+         * to the start and then decode forward (below).
+         *
+         * NB: If you're planning on lots of random access within a
+         * strip, it's better to just read and decode the entire
+         * strip, and then access the decoded data in a random fashion.
          */
-	if (strip != tif->tif_curstrip) {	/* different strip, refill */
-                
-                if( whole_strip )
-                {
-                        if (!TIFFFillStrip(tif, strip))
-                                return (0);
-                }
-                else
-                {
-                        if( !TIFFFillStripPartial(tif,strip,read_ahead,1) )
-                                return 0;
-                }
-	}
 
-        /*
-        ** If we already have some data loaded, do we need to read some more?
-        */
-        else if( !whole_strip )
+        if (tif->tif_rawdataoff != 0)
         {
-                if( ((tif->tif_rawdata + tif->tif_rawdataloaded) - tif->tif_rawcp) < read_ahead 
-                    && (uint64) tif->tif_rawdataoff+tif->tif_rawdataloaded < TIFFGetStrileByteCount(tif, strip) )
-                {
-                        if( !TIFFFillStripPartial(tif,strip,read_ahead,0) )
-                                return 0;
-                }
+            if (!TIFFFillStripPartial(tif, strip, read_ahead, 1))
+                return 0;
+        }
+        else
+        {
+            if (!TIFFStartStrip(tif, strip))
+                return (0);
         }
+    }
 
-        if (row < tif->tif_row) {
-		/*
-		 * Moving backwards within the same strip: backup
-		 * to the start and then decode forward (below).
-		 *
-		 * NB: If you're planning on lots of random access within a
-		 * strip, it's better to just read and decode the entire
-		 * strip, and then access the decoded data in a random fashion.
-		 */
-
-                if( tif->tif_rawdataoff != 0 )
-                {
-                        if( !TIFFFillStripPartial(tif,strip,read_ahead,1) )
-                                return 0;
-                }
-                else
-                {
-                        if (!TIFFStartStrip(tif, strip))
-                                return (0);
-                }
-	}
-        
-	if (row != tif->tif_row) {
-		/*
-		 * Seek forward to the desired row.
-		 */
-
-                /* TODO: Will this really work with partial buffers? */
-                
-		if (!(*tif->tif_seek)(tif, row - tif->tif_row))
-			return (0);
-		tif->tif_row = row;
-	}
-
-	return (1);
+    if (row != tif->tif_row)
+    {
+        /*
+         * Seek forward to the desired row.
+         */
+
+        /* TODO: Will this really work with partial buffers? */
+
+        if (!(*tif->tif_seek)(tif, row - tif->tif_row))
+            return (0);
+        tif->tif_row = row;
+    }
+
+    return (1);
 }
 
-int
-TIFFReadScanline(TIFF* tif, void* buf, uint32 row, uint16 sample)
+int TIFFReadScanline(TIFF *tif, void *buf, uint32_t row, uint16_t sample)
 {
-	int e;
-
-	if (!TIFFCheckRead(tif, 0))
-		return (-1);
-	if( (e = TIFFSeek(tif, row, sample)) != 0) {
-		/*
-		 * Decompress desired row into user buffer.
-		 */
-		e = (*tif->tif_decoderow)
-		    (tif, (uint8*) buf, tif->tif_scanlinesize, sample);  
-
-		/* we are now poised at the beginning of the next row */
-		tif->tif_row = row + 1;
-
-		if (e)
-			(*tif->tif_postdecode)(tif, (uint8*) buf,
-			    tif->tif_scanlinesize);  
-	}
-	return (e > 0 ? 1 : -1);
+    int e;
+
+    if (!TIFFCheckRead(tif, 0))
+        return (-1);
+    if ((e = TIFFSeek(tif, row, sample)) != 0)
+    {
+        /*
+         * Decompress desired row into user buffer.
+         */
+        e = (*tif->tif_decoderow)(tif, (uint8_t *)buf, tif->tif_scanlinesize,
+                                  sample);
+
+        /* we are now poised at the beginning of the next row */
+        tif->tif_row = row + 1;
+
+        if (e)
+            (*tif->tif_postdecode)(tif, (uint8_t *)buf, tif->tif_scanlinesize);
+    }
+    return (e > 0 ? 1 : -1);
 }
 
 /*
@@ -485,471 +472,436 @@ TIFFReadScanline(TIFF* tif, void* buf, uint32 row, uint16 sample)
  * rows in the strip (check for truncated last strip on any
  * of the separations).
  */
-static tmsize_t TIFFReadEncodedStripGetStripSize(TIFF* tif, uint32 strip, uint16* pplane)
+static tmsize_t TIFFReadEncodedStripGetStripSize(TIFF *tif, uint32_t strip,
+                                                 uint16_t *pplane)
 {
-	static const char module[] = "TIFFReadEncodedStrip";
-	TIFFDirectory *td = &tif->tif_dir;
-	uint32 rowsperstrip;
-	uint32 stripsperplane;
-	uint32 stripinplane;
-	uint32 rows;
-	tmsize_t stripsize;
-	if (!TIFFCheckRead(tif,0))
-		return((tmsize_t)(-1));
-	if (strip>=td->td_nstrips)
-	{
-		TIFFErrorExt(tif->tif_clientdata,module,
-		    "%lu: Strip out of range, max %lu",(unsigned long)strip,
-		    (unsigned long)td->td_nstrips);
-		return((tmsize_t)(-1));
-	}
-
-	rowsperstrip=td->td_rowsperstrip;
-	if (rowsperstrip>td->td_imagelength)
-		rowsperstrip=td->td_imagelength;
-	stripsperplane= TIFFhowmany_32_maxuint_compat(td->td_imagelength, rowsperstrip);
-	stripinplane=(strip%stripsperplane);
-	if( pplane ) *pplane=(uint16)(strip/stripsperplane);
-	rows=td->td_imagelength-stripinplane*rowsperstrip;
-	if (rows>rowsperstrip)
-		rows=rowsperstrip;
-	stripsize=TIFFVStripSize(tif,rows);
-	if (stripsize==0)
-		return((tmsize_t)(-1));
-	return stripsize;
+    static const char module[] = "TIFFReadEncodedStrip";
+    TIFFDirectory *td = &tif->tif_dir;
+    uint32_t rowsperstrip;
+    uint32_t stripsperplane;
+    uint32_t stripinplane;
+    uint32_t rows;
+    tmsize_t stripsize;
+    if (!TIFFCheckRead(tif, 0))
+        return ((tmsize_t)(-1));
+    if (strip >= td->td_nstrips)
+    {
+        TIFFErrorExtR(tif, module,
+                      "%" PRIu32 ": Strip out of range, max %" PRIu32, strip,
+                      td->td_nstrips);
+        return ((tmsize_t)(-1));
+    }
+
+    rowsperstrip = td->td_rowsperstrip;
+    if (rowsperstrip > td->td_imagelength)
+        rowsperstrip = td->td_imagelength;
+    stripsperplane =
+        TIFFhowmany_32_maxuint_compat(td->td_imagelength, rowsperstrip);
+    stripinplane = (strip % stripsperplane);
+    if (pplane)
+        *pplane = (uint16_t)(strip / stripsperplane);
+    rows = td->td_imagelength - stripinplane * rowsperstrip;
+    if (rows > rowsperstrip)
+        rows = rowsperstrip;
+    stripsize = TIFFVStripSize(tif, rows);
+    if (stripsize == 0)
+        return ((tmsize_t)(-1));
+    return stripsize;
 }
 
 /*
  * Read a strip of data and decompress the specified
  * amount into the user-supplied buffer.
  */
-tmsize_t
-TIFFReadEncodedStrip(TIFF* tif, uint32 strip, void* buf, tmsize_t size)
+tmsize_t TIFFReadEncodedStrip(TIFF *tif, uint32_t strip, void *buf,
+                              tmsize_t size)
 {
-	static const char module[] = "TIFFReadEncodedStrip";
-	TIFFDirectory *td = &tif->tif_dir;
-	tmsize_t stripsize;
-	uint16 plane;
+    static const char module[] = "TIFFReadEncodedStrip";
+    TIFFDirectory *td = &tif->tif_dir;
+    tmsize_t stripsize;
+    uint16_t plane;
 
-	stripsize=TIFFReadEncodedStripGetStripSize(tif, strip, &plane);
-	if (stripsize==((tmsize_t)(-1)))
-		return((tmsize_t)(-1));
+    stripsize = TIFFReadEncodedStripGetStripSize(tif, strip, &plane);
+    if (stripsize == ((tmsize_t)(-1)))
+        return ((tmsize_t)(-1));
 
     /* shortcut to avoid an extra memcpy() */
-    if( td->td_compression == COMPRESSION_NONE &&
-        size!=(tmsize_t)(-1) && size >= stripsize &&
-        !isMapped(tif) &&
-        ((tif->tif_flags&TIFF_NOREADRAW)==0) )
+    if (td->td_compression == COMPRESSION_NONE && size != (tmsize_t)(-1) &&
+        size >= stripsize && !isMapped(tif) &&
+        ((tif->tif_flags & TIFF_NOREADRAW) == 0))
     {
         if (TIFFReadRawStrip1(tif, strip, buf, stripsize, module) != stripsize)
             return ((tmsize_t)(-1));
 
         if (!isFillOrder(tif, td->td_fillorder) &&
             (tif->tif_flags & TIFF_NOBITREV) == 0)
-            TIFFReverseBits(buf,stripsize);
+            TIFFReverseBits(buf, stripsize);
 
-        (*tif->tif_postdecode)(tif,buf,stripsize);
+        (*tif->tif_postdecode)(tif, buf, stripsize);
         return (stripsize);
     }
 
-	if ((size!=(tmsize_t)(-1))&&(size<stripsize))
-		stripsize=size;
-	if (!TIFFFillStrip(tif,strip))
-		return((tmsize_t)(-1));
-	if ((*tif->tif_decodestrip)(tif,buf,stripsize,plane)<=0)
-		return((tmsize_t)(-1));
-	(*tif->tif_postdecode)(tif,buf,stripsize);
-	return(stripsize);
+    if ((size != (tmsize_t)(-1)) && (size < stripsize))
+        stripsize = size;
+    if (!TIFFFillStrip(tif, strip))
+        return ((tmsize_t)(-1));
+    if ((*tif->tif_decodestrip)(tif, buf, stripsize, plane) <= 0)
+        return ((tmsize_t)(-1));
+    (*tif->tif_postdecode)(tif, buf, stripsize);
+    return (stripsize);
 }
 
-/* Variant of TIFFReadEncodedStrip() that does 
- * * if *buf == NULL, *buf = _TIFFmalloc(bufsizetoalloc) only after TIFFFillStrip() has
- *   succeeded. This avoid excessive memory allocation in case of truncated
- *   file.
+/* Variant of TIFFReadEncodedStrip() that does
+ * * if *buf == NULL, *buf = _TIFFmallocExt(tif, bufsizetoalloc) only after
+ * TIFFFillStrip() has succeeded. This avoid excessive memory allocation in case
+ * of truncated file.
  * * calls regular TIFFReadEncodedStrip() if *buf != NULL
  */
-tmsize_t
-_TIFFReadEncodedStripAndAllocBuffer(TIFF* tif, uint32 strip,
-                                    void **buf, tmsize_t bufsizetoalloc,
-                                    tmsize_t size_to_read)
+tmsize_t _TIFFReadEncodedStripAndAllocBuffer(TIFF *tif, uint32_t strip,
+                                             void **buf,
+                                             tmsize_t bufsizetoalloc,
+                                             tmsize_t size_to_read)
 {
     tmsize_t this_stripsize;
-    uint16 plane;
+    uint16_t plane;
 
-    if( *buf != NULL )
+    if (*buf != NULL)
     {
         return TIFFReadEncodedStrip(tif, strip, *buf, size_to_read);
     }
 
-    this_stripsize=TIFFReadEncodedStripGetStripSize(tif, strip, &plane);
-    if (this_stripsize==((tmsize_t)(-1)))
-            return((tmsize_t)(-1));
+    this_stripsize = TIFFReadEncodedStripGetStripSize(tif, strip, &plane);
+    if (this_stripsize == ((tmsize_t)(-1)))
+        return ((tmsize_t)(-1));
 
-    if ((size_to_read!=(tmsize_t)(-1))&&(size_to_read<this_stripsize))
-            this_stripsize=size_to_read;
-    if (!TIFFFillStrip(tif,strip))
-            return((tmsize_t)(-1));
+    if ((size_to_read != (tmsize_t)(-1)) && (size_to_read < this_stripsize))
+        this_stripsize = size_to_read;
+    if (!TIFFFillStrip(tif, strip))
+        return ((tmsize_t)(-1));
 
-    *buf = _TIFFmalloc(bufsizetoalloc);
-    if (*buf == NULL) {
-            TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif), "No space for strip buffer");
-            return((tmsize_t)(-1));
+    *buf = _TIFFmallocExt(tif, bufsizetoalloc);
+    if (*buf == NULL)
+    {
+        TIFFErrorExtR(tif, TIFFFileName(tif), "No space for strip buffer");
+        return ((tmsize_t)(-1));
     }
     _TIFFmemset(*buf, 0, bufsizetoalloc);
 
-    if ((*tif->tif_decodestrip)(tif,*buf,this_stripsize,plane)<=0)
-            return((tmsize_t)(-1));
-    (*tif->tif_postdecode)(tif,*buf,this_stripsize);
-    return(this_stripsize);
-
-
-}
-
-static tmsize_t
-TIFFReadRawStrip1(TIFF* tif, uint32 strip, void* buf, tmsize_t size,
-    const char* module)
-{
-	assert((tif->tif_flags&TIFF_NOREADRAW)==0);
-	if (!isMapped(tif)) {
-		tmsize_t cc;
-
-		if (!SeekOK(tif, TIFFGetStrileOffset(tif, strip))) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Seek error at scanline %lu, strip %lu",
-			    (unsigned long) tif->tif_row, (unsigned long) strip);
-			return ((tmsize_t)(-1));
-		}
-		cc = TIFFReadFile(tif, buf, size);
-		if (cc != size) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			TIFFErrorExt(tif->tif_clientdata, module,
-		"Read error at scanline %lu; got %I64u bytes, expected %I64u",
-				     (unsigned long) tif->tif_row,
-				     (unsigned __int64) cc,
-				     (unsigned __int64) size);
-#else
-			TIFFErrorExt(tif->tif_clientdata, module,
-		"Read error at scanline %lu; got %llu bytes, expected %llu",
-				     (unsigned long) tif->tif_row,
-				     (unsigned long long) cc,
-				     (unsigned long long) size);
-#endif
-			return ((tmsize_t)(-1));
-		}
-	} else {
-		tmsize_t ma = 0;
-		tmsize_t n;
-		if ((TIFFGetStrileOffset(tif, strip) > (uint64)TIFF_TMSIZE_T_MAX)||
-                    ((ma=(tmsize_t)TIFFGetStrileOffset(tif, strip))>tif->tif_size))
-                {
-                    n=0;
-                }
-                else if( ma > TIFF_TMSIZE_T_MAX - size )
-                {
-                    n=0;
-                }
-                else
-                {
-                    tmsize_t mb=ma+size;
-                    if (mb>tif->tif_size)
-                            n=tif->tif_size-ma;
-                    else
-                            n=size;
-                }
-		if (n!=size) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			TIFFErrorExt(tif->tif_clientdata, module,
-	"Read error at scanline %lu, strip %lu; got %I64u bytes, expected %I64u",
-				     (unsigned long) tif->tif_row,
-				     (unsigned long) strip,
-				     (unsigned __int64) n,
-				     (unsigned __int64) size);
-#else
-			TIFFErrorExt(tif->tif_clientdata, module,
-	"Read error at scanline %lu, strip %lu; got %llu bytes, expected %llu",
-				     (unsigned long) tif->tif_row,
-				     (unsigned long) strip,
-				     (unsigned long long) n,
-				     (unsigned long long) size);
-#endif
-			return ((tmsize_t)(-1));
-		}
-		_TIFFmemcpy(buf, tif->tif_base + ma,
-			    size);
-	}
-	return (size);
+    if ((*tif->tif_decodestrip)(tif, *buf, this_stripsize, plane) <= 0)
+        return ((tmsize_t)(-1));
+    (*tif->tif_postdecode)(tif, *buf, this_stripsize);
+    return (this_stripsize);
 }
 
-static tmsize_t
-TIFFReadRawStripOrTile2(TIFF* tif, uint32 strip_or_tile, int is_strip,
-                        tmsize_t size, const char* module)
+static tmsize_t TIFFReadRawStrip1(TIFF *tif, uint32_t strip, void *buf,
+                                  tmsize_t size, const char *module)
 {
-        assert( !isMapped(tif) );
-        assert((tif->tif_flags&TIFF_NOREADRAW)==0);
+    assert((tif->tif_flags & TIFF_NOREADRAW) == 0);
+    if (!isMapped(tif))
+    {
+        tmsize_t cc;
 
-        if (!SeekOK(tif, TIFFGetStrileOffset(tif, strip_or_tile))) {
-            if( is_strip )
-            {
-                TIFFErrorExt(tif->tif_clientdata, module,
-                    "Seek error at scanline %lu, strip %lu",
-                    (unsigned long) tif->tif_row,
-                    (unsigned long) strip_or_tile);
-            }
+        if (!SeekOK(tif, TIFFGetStrileOffset(tif, strip)))
+        {
+            TIFFErrorExtR(tif, module,
+                          "Seek error at scanline %" PRIu32 ", strip %" PRIu32,
+                          tif->tif_row, strip);
+            return ((tmsize_t)(-1));
+        }
+        cc = TIFFReadFile(tif, buf, size);
+        if (cc != size)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Read error at scanline %" PRIu32
+                          "; got %" TIFF_SSIZE_FORMAT
+                          " bytes, expected %" TIFF_SSIZE_FORMAT,
+                          tif->tif_row, cc, size);
+            return ((tmsize_t)(-1));
+        }
+    }
+    else
+    {
+        tmsize_t ma = 0;
+        tmsize_t n;
+        if ((TIFFGetStrileOffset(tif, strip) > (uint64_t)TIFF_TMSIZE_T_MAX) ||
+            ((ma = (tmsize_t)TIFFGetStrileOffset(tif, strip)) > tif->tif_size))
+        {
+            n = 0;
+        }
+        else if (ma > TIFF_TMSIZE_T_MAX - size)
+        {
+            n = 0;
+        }
+        else
+        {
+            tmsize_t mb = ma + size;
+            if (mb > tif->tif_size)
+                n = tif->tif_size - ma;
             else
-            {
-                TIFFErrorExt(tif->tif_clientdata, module,
-                    "Seek error at row %lu, col %lu, tile %lu",
-                    (unsigned long) tif->tif_row,
-                    (unsigned long) tif->tif_col,
-                    (unsigned long) strip_or_tile);
-            }
+                n = size;
+        }
+        if (n != size)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Read error at scanline %" PRIu32 ", strip %" PRIu32
+                          "; got %" TIFF_SSIZE_FORMAT
+                          " bytes, expected %" TIFF_SSIZE_FORMAT,
+                          tif->tif_row, strip, n, size);
             return ((tmsize_t)(-1));
         }
+        _TIFFmemcpy(buf, tif->tif_base + ma, size);
+    }
+    return (size);
+}
+
+static tmsize_t TIFFReadRawStripOrTile2(TIFF *tif, uint32_t strip_or_tile,
+                                        int is_strip, tmsize_t size,
+                                        const char *module)
+{
+    assert(!isMapped(tif));
+    assert((tif->tif_flags & TIFF_NOREADRAW) == 0);
 
-        if( !TIFFReadAndRealloc( tif, size, 0, is_strip,
-                                 strip_or_tile, module ) )
+    if (!SeekOK(tif, TIFFGetStrileOffset(tif, strip_or_tile)))
+    {
+        if (is_strip)
         {
-            return ((tmsize_t)(-1));
+            TIFFErrorExtR(tif, module,
+                          "Seek error at scanline %" PRIu32 ", strip %" PRIu32,
+                          tif->tif_row, strip_or_tile);
+        }
+        else
+        {
+            TIFFErrorExtR(tif, module,
+                          "Seek error at row %" PRIu32 ", col %" PRIu32
+                          ", tile %" PRIu32,
+                          tif->tif_row, tif->tif_col, strip_or_tile);
         }
+        return ((tmsize_t)(-1));
+    }
 
-        return (size);
+    if (!TIFFReadAndRealloc(tif, size, 0, is_strip, strip_or_tile, module))
+    {
+        return ((tmsize_t)(-1));
+    }
+
+    return (size);
 }
 
 /*
  * Read a strip of data from the file.
  */
-tmsize_t
-TIFFReadRawStrip(TIFF* tif, uint32 strip, void* buf, tmsize_t size)
+tmsize_t TIFFReadRawStrip(TIFF *tif, uint32_t strip, void *buf, tmsize_t size)
 {
-	static const char module[] = "TIFFReadRawStrip";
-	TIFFDirectory *td = &tif->tif_dir;
-	uint64 bytecount64;
-	tmsize_t bytecountm;
-
-	if (!TIFFCheckRead(tif, 0))
-		return ((tmsize_t)(-1));
-	if (strip >= td->td_nstrips) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		     "%lu: Strip out of range, max %lu",
-		     (unsigned long) strip,
-		     (unsigned long) td->td_nstrips);
-		return ((tmsize_t)(-1));
-	}
-	if (tif->tif_flags&TIFF_NOREADRAW)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Compression scheme does not support access to raw uncompressed data");
-		return ((tmsize_t)(-1));
-	}
-	bytecount64 = TIFFGetStrileByteCount(tif, strip);
-	if (size != (tmsize_t)(-1) && (uint64)size <= bytecount64)
-		bytecountm = size;
-	else
-		bytecountm = _TIFFCastUInt64ToSSize(tif, bytecount64, module);
-	if( bytecountm == 0 ) {
-		return ((tmsize_t)(-1));
-	}
-	return (TIFFReadRawStrip1(tif, strip, buf, bytecountm, module));
+    static const char module[] = "TIFFReadRawStrip";
+    TIFFDirectory *td = &tif->tif_dir;
+    uint64_t bytecount64;
+    tmsize_t bytecountm;
+
+    if (!TIFFCheckRead(tif, 0))
+        return ((tmsize_t)(-1));
+    if (strip >= td->td_nstrips)
+    {
+        TIFFErrorExtR(tif, module,
+                      "%" PRIu32 ": Strip out of range, max %" PRIu32, strip,
+                      td->td_nstrips);
+        return ((tmsize_t)(-1));
+    }
+    if (tif->tif_flags & TIFF_NOREADRAW)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Compression scheme does not support access to raw "
+                      "uncompressed data");
+        return ((tmsize_t)(-1));
+    }
+    bytecount64 = TIFFGetStrileByteCount(tif, strip);
+    if (size != (tmsize_t)(-1) && (uint64_t)size <= bytecount64)
+        bytecountm = size;
+    else
+        bytecountm = _TIFFCastUInt64ToSSize(tif, bytecount64, module);
+    if (bytecountm == 0)
+    {
+        return ((tmsize_t)(-1));
+    }
+    return (TIFFReadRawStrip1(tif, strip, buf, bytecountm, module));
 }
 
 TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
-static uint64 NoSanitizeSubUInt64(uint64 a, uint64 b)
-{
-    return a - b;
-}
+static uint64_t NoSanitizeSubUInt64(uint64_t a, uint64_t b) { return a - b; }
 
 /*
  * Read the specified strip and setup for decoding. The data buffer is
  * expanded, as necessary, to hold the strip's data.
  */
-int
-TIFFFillStrip(TIFF* tif, uint32 strip)
+int TIFFFillStrip(TIFF *tif, uint32_t strip)
 {
-	static const char module[] = "TIFFFillStrip";
-	TIFFDirectory *td = &tif->tif_dir;
-
-	if ((tif->tif_flags&TIFF_NOREADRAW)==0)
-	{
-		uint64 bytecount = TIFFGetStrileByteCount(tif, strip);
-		if( bytecount == 0 || bytecount > (uint64)TIFF_INT64_MAX ) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			TIFFErrorExt(tif->tif_clientdata, module,
-				"Invalid strip byte count %I64u, strip %lu",
-				     (unsigned __int64) bytecount,
-				     (unsigned long) strip);
-#else
-			TIFFErrorExt(tif->tif_clientdata, module,
-				"Invalid strip byte count %llu, strip %lu",
-				     (unsigned long long) bytecount,
-				     (unsigned long) strip);
-#endif
-			return (0);
-		}
+    static const char module[] = "TIFFFillStrip";
+    TIFFDirectory *td = &tif->tif_dir;
+
+    if ((tif->tif_flags & TIFF_NOREADRAW) == 0)
+    {
+        uint64_t bytecount = TIFFGetStrileByteCount(tif, strip);
+        if (bytecount == 0 || bytecount > (uint64_t)TIFF_INT64_MAX)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Invalid strip byte count %" PRIu64
+                          ", strip %" PRIu32,
+                          bytecount, strip);
+            return (0);
+        }
+
+        /* To avoid excessive memory allocations: */
+        /* Byte count should normally not be larger than a number of */
+        /* times the uncompressed size plus some margin */
+        if (bytecount > 1024 * 1024)
+        {
+            /* 10 and 4096 are just values that could be adjusted. */
+            /* Hopefully they are safe enough for all codecs */
+            tmsize_t stripsize = TIFFStripSize(tif);
+            if (stripsize != 0 && (bytecount - 4096) / 10 > (uint64_t)stripsize)
+            {
+                uint64_t newbytecount = (uint64_t)stripsize * 10 + 4096;
+                TIFFErrorExtR(tif, module,
+                              "Too large strip byte count %" PRIu64
+                              ", strip %" PRIu32 ". Limiting to %" PRIu64,
+                              bytecount, strip, newbytecount);
+                bytecount = newbytecount;
+            }
+        }
+
+        if (isMapped(tif))
+        {
+            /*
+             * We must check for overflow, potentially causing
+             * an OOB read. Instead of simple
+             *
+             *  TIFFGetStrileOffset(tif, strip)+bytecount > tif->tif_size
+             *
+             * comparison (which can overflow) we do the following
+             * two comparisons:
+             */
+            if (bytecount > (uint64_t)tif->tif_size ||
+                TIFFGetStrileOffset(tif, strip) >
+                    (uint64_t)tif->tif_size - bytecount)
+            {
+                /*
+                 * This error message might seem strange, but
+                 * it's what would happen if a read were done
+                 * instead.
+                 */
+                TIFFErrorExtR(
+                    tif, module,
+
+                    "Read error on strip %" PRIu32 "; "
+                    "got %" PRIu64 " bytes, expected %" PRIu64,
+                    strip,
+                    NoSanitizeSubUInt64(tif->tif_size,
+                                        TIFFGetStrileOffset(tif, strip)),
+                    bytecount);
+                tif->tif_curstrip = NOSTRIP;
+                return (0);
+            }
+        }
 
-		/* To avoid excessive memory allocations: */
-		/* Byte count should normally not be larger than a number of */
-		/* times the uncompressed size plus some margin */
-                if( bytecount > 1024 * 1024 )
+        if (isMapped(tif) && (isFillOrder(tif, td->td_fillorder) ||
+                              (tif->tif_flags & TIFF_NOBITREV)))
+        {
+            /*
+             * The image is mapped into memory and we either don't
+             * need to flip bits or the compression routine is
+             * going to handle this operation itself.  In this
+             * case, avoid copying the raw data and instead just
+             * reference the data from the memory mapped file
+             * image.  This assumes that the decompression
+             * routines do not modify the contents of the raw data
+             * buffer (if they try to, the application will get a
+             * fault since the file is mapped read-only).
+             */
+            if ((tif->tif_flags & TIFF_MYBUFFER) && tif->tif_rawdata)
+            {
+                _TIFFfreeExt(tif, tif->tif_rawdata);
+                tif->tif_rawdata = NULL;
+                tif->tif_rawdatasize = 0;
+            }
+            tif->tif_flags &= ~TIFF_MYBUFFER;
+            tif->tif_rawdatasize = (tmsize_t)bytecount;
+            tif->tif_rawdata =
+                tif->tif_base + (tmsize_t)TIFFGetStrileOffset(tif, strip);
+            tif->tif_rawdataoff = 0;
+            tif->tif_rawdataloaded = (tmsize_t)bytecount;
+
+            /*
+             * When we have tif_rawdata reference directly into the memory
+             * mapped file we need to be pretty careful about how we use the
+             * rawdata.  It is not a general purpose working buffer as it
+             * normally otherwise is.  So we keep track of this fact to avoid
+             * using it improperly.
+             */
+            tif->tif_flags |= TIFF_BUFFERMMAP;
+        }
+        else
+        {
+            /*
+             * Expand raw data buffer, if needed, to hold data
+             * strip coming from file (perhaps should set upper
+             * bound on the size of a buffer we'll use?).
+             */
+            tmsize_t bytecountm;
+            bytecountm = (tmsize_t)bytecount;
+            if ((uint64_t)bytecountm != bytecount)
+            {
+                TIFFErrorExtR(tif, module, "Integer overflow");
+                return (0);
+            }
+            if (bytecountm > tif->tif_rawdatasize)
+            {
+                tif->tif_curstrip = NOSTRIP;
+                if ((tif->tif_flags & TIFF_MYBUFFER) == 0)
                 {
-			/* 10 and 4096 are just values that could be adjusted. */
-			/* Hopefully they are safe enough for all codecs */
-			tmsize_t stripsize = TIFFStripSize(tif);
-			if( stripsize != 0 &&
-			    (bytecount - 4096) / 10 > (uint64)stripsize  )
-			{
-				uint64 newbytecount = (uint64)stripsize * 10 + 4096;
-				if( newbytecount == 0 || newbytecount > (uint64)TIFF_INT64_MAX )
-				{
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-					TIFFWarningExt(tif->tif_clientdata, module,
-					  "Too large strip byte count %I64u, strip %lu. Limiting to %I64u",
-					     (unsigned __int64) bytecount,
-					     (unsigned long) strip,
-					     (unsigned __int64) newbytecount);
-#else
-					TIFFErrorExt(tif->tif_clientdata, module,
-					  "Too large strip byte count %llu, strip %lu. Limiting to %llu",
-					     (unsigned long long) bytecount,
-					     (unsigned long) strip,
-					     (unsigned long long) newbytecount);
-#endif
-					bytecount = newbytecount;
-				}
-			}
-		}
-
-		if (isMapped(tif)) {
-			/*
-			 * We must check for overflow, potentially causing
-			 * an OOB read. Instead of simple
-			 *
-			 *  TIFFGetStrileOffset(tif, strip)+bytecount > tif->tif_size
-			 *
-			 * comparison (which can overflow) we do the following
-			 * two comparisons:
-			 */
-			if (bytecount > (uint64)tif->tif_size ||
-			    TIFFGetStrileOffset(tif, strip) > (uint64)tif->tif_size - bytecount) {
-				/*
-				 * This error message might seem strange, but
-				 * it's what would happen if a read were done
-				 * instead.
-				 */
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-				TIFFErrorExt(tif->tif_clientdata, module,
-
-					"Read error on strip %lu; "
-					"got %I64u bytes, expected %I64u",
-					(unsigned long) strip,
-					(unsigned __int64) NoSanitizeSubUInt64(tif->tif_size, TIFFGetStrileOffset(tif, strip)),
-					(unsigned __int64) bytecount);
-#else
-				TIFFErrorExt(tif->tif_clientdata, module,
+                    TIFFErrorExtR(
+                        tif, module,
+                        "Data buffer too small to hold strip %" PRIu32, strip);
+                    return (0);
+                }
+            }
+            if (tif->tif_flags & TIFF_BUFFERMMAP)
+            {
+                tif->tif_curstrip = NOSTRIP;
+                tif->tif_rawdata = NULL;
+                tif->tif_rawdatasize = 0;
+                tif->tif_flags &= ~TIFF_BUFFERMMAP;
+            }
 
-					"Read error on strip %lu; "
-					"got %llu bytes, expected %llu",
-					(unsigned long) strip,
-					(unsigned long long) NoSanitizeSubUInt64(tif->tif_size, TIFFGetStrileOffset(tif, strip)),
-					(unsigned long long) bytecount);
-#endif
-				tif->tif_curstrip = NOSTRIP;
-				return (0);
-			}
-		}
-
-		if (isMapped(tif) &&
-		    (isFillOrder(tif, td->td_fillorder)
-		    || (tif->tif_flags & TIFF_NOBITREV))) {
-			/*
-			 * The image is mapped into memory and we either don't
-			 * need to flip bits or the compression routine is
-			 * going to handle this operation itself.  In this
-			 * case, avoid copying the raw data and instead just
-			 * reference the data from the memory mapped file
-			 * image.  This assumes that the decompression
-			 * routines do not modify the contents of the raw data
-			 * buffer (if they try to, the application will get a
-			 * fault since the file is mapped read-only).
-			 */
-			if ((tif->tif_flags & TIFF_MYBUFFER) && tif->tif_rawdata) {
-				_TIFFfree(tif->tif_rawdata);
-				tif->tif_rawdata = NULL;
-				tif->tif_rawdatasize = 0;
-			}
-			tif->tif_flags &= ~TIFF_MYBUFFER;
-			tif->tif_rawdatasize = (tmsize_t)bytecount;
-			tif->tif_rawdata = tif->tif_base + (tmsize_t)TIFFGetStrileOffset(tif, strip);
-                        tif->tif_rawdataoff = 0;
-                        tif->tif_rawdataloaded = (tmsize_t) bytecount;
-
-			/* 
-			 * When we have tif_rawdata reference directly into the memory mapped file
-			 * we need to be pretty careful about how we use the rawdata.  It is not
-			 * a general purpose working buffer as it normally otherwise is.  So we
-			 * keep track of this fact to avoid using it improperly.
-			 */
-			tif->tif_flags |= TIFF_BUFFERMMAP;
-		} else {
-			/*
-			 * Expand raw data buffer, if needed, to hold data
-			 * strip coming from file (perhaps should set upper
-			 * bound on the size of a buffer we'll use?).
-			 */
-			tmsize_t bytecountm;
-			bytecountm=(tmsize_t)bytecount;
-			if ((uint64)bytecountm!=bytecount)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Integer overflow");
-				return(0);
-			}
-			if (bytecountm > tif->tif_rawdatasize) {
-				tif->tif_curstrip = NOSTRIP;
-				if ((tif->tif_flags & TIFF_MYBUFFER) == 0) {
-					TIFFErrorExt(tif->tif_clientdata, module,
-					    "Data buffer too small to hold strip %lu",
-					    (unsigned long) strip);
-					return (0);
-				}
-			}
-			if (tif->tif_flags&TIFF_BUFFERMMAP) {
-				tif->tif_curstrip = NOSTRIP;
-				tif->tif_rawdata = NULL;
-				tif->tif_rawdatasize = 0;
-				tif->tif_flags &= ~TIFF_BUFFERMMAP;
-			}
-
-			if( isMapped(tif) )
-			{
-				if (bytecountm > tif->tif_rawdatasize &&
-				    !TIFFReadBufferSetup(tif, 0, bytecountm))
-				{
-					return (0);
-				}
-				if (TIFFReadRawStrip1(tif, strip, tif->tif_rawdata,
-				    bytecountm, module) != bytecountm)
-				{
-					return (0);
-				}
-			}
-			else
-			{
-				if (TIFFReadRawStripOrTile2(tif, strip, 1,
-				    bytecountm, module) != bytecountm)
-				{
-					return (0);
-				}
-			}
-
-
-                        tif->tif_rawdataoff = 0;
-                        tif->tif_rawdataloaded = bytecountm;
-                        
-			if (!isFillOrder(tif, td->td_fillorder) &&
-			    (tif->tif_flags & TIFF_NOBITREV) == 0)
-				TIFFReverseBits(tif->tif_rawdata, bytecountm);
+            if (isMapped(tif))
+            {
+                if (bytecountm > tif->tif_rawdatasize &&
+                    !TIFFReadBufferSetup(tif, 0, bytecountm))
+                {
+                    return (0);
                 }
-	}
-	return (TIFFStartStrip(tif, strip));
+                if (TIFFReadRawStrip1(tif, strip, tif->tif_rawdata, bytecountm,
+                                      module) != bytecountm)
+                {
+                    return (0);
+                }
+            }
+            else
+            {
+                if (TIFFReadRawStripOrTile2(tif, strip, 1, bytecountm,
+                                            module) != bytecountm)
+                {
+                    return (0);
+                }
+            }
+
+            tif->tif_rawdataoff = 0;
+            tif->tif_rawdataloaded = bytecountm;
+
+            if (!isFillOrder(tif, td->td_fillorder) &&
+                (tif->tif_flags & TIFF_NOBITREV) == 0)
+                TIFFReverseBits(tif->tif_rawdata, bytecountm);
+        }
+    }
+    return (TIFFStartStrip(tif, strip));
 }
 
 /*
@@ -961,120 +913,162 @@ TIFFFillStrip(TIFF* tif, uint32 strip)
  * Read and decompress a tile of data.  The
  * tile is selected by the (x,y,z,s) coordinates.
  */
-tmsize_t
-TIFFReadTile(TIFF* tif, void* buf, uint32 x, uint32 y, uint32 z, uint16 s)
+tmsize_t TIFFReadTile(TIFF *tif, void *buf, uint32_t x, uint32_t y, uint32_t z,
+                      uint16_t s)
 {
-	if (!TIFFCheckRead(tif, 1) || !TIFFCheckTile(tif, x, y, z, s))
-		return ((tmsize_t)(-1));
-	return (TIFFReadEncodedTile(tif,
-	    TIFFComputeTile(tif, x, y, z, s), buf, (tmsize_t)(-1)));
+    if (!TIFFCheckRead(tif, 1) || !TIFFCheckTile(tif, x, y, z, s))
+        return ((tmsize_t)(-1));
+    return (TIFFReadEncodedTile(tif, TIFFComputeTile(tif, x, y, z, s), buf,
+                                (tmsize_t)(-1)));
 }
 
 /*
  * Read a tile of data and decompress the specified
  * amount into the user-supplied buffer.
  */
-tmsize_t
-TIFFReadEncodedTile(TIFF* tif, uint32 tile, void* buf, tmsize_t size)
+tmsize_t TIFFReadEncodedTile(TIFF *tif, uint32_t tile, void *buf, tmsize_t size)
 {
-	static const char module[] = "TIFFReadEncodedTile";
-	TIFFDirectory *td = &tif->tif_dir;
-	tmsize_t tilesize = tif->tif_tilesize;
-
-	if (!TIFFCheckRead(tif, 1))
-		return ((tmsize_t)(-1));
-	if (tile >= td->td_nstrips) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "%lu: Tile out of range, max %lu",
-		    (unsigned long) tile, (unsigned long) td->td_nstrips);
-		return ((tmsize_t)(-1));
-	}
+    static const char module[] = "TIFFReadEncodedTile";
+    TIFFDirectory *td = &tif->tif_dir;
+    tmsize_t tilesize = tif->tif_tilesize;
+
+    if (!TIFFCheckRead(tif, 1))
+        return ((tmsize_t)(-1));
+    if (tile >= td->td_nstrips)
+    {
+        TIFFErrorExtR(tif, module,
+                      "%" PRIu32 ": Tile out of range, max %" PRIu32, tile,
+                      td->td_nstrips);
+        return ((tmsize_t)(-1));
+    }
 
     /* shortcut to avoid an extra memcpy() */
-    if( td->td_compression == COMPRESSION_NONE &&
-        size!=(tmsize_t)(-1) && size >= tilesize &&
-        !isMapped(tif) &&
-        ((tif->tif_flags&TIFF_NOREADRAW)==0) )
+    if (td->td_compression == COMPRESSION_NONE && size != (tmsize_t)(-1) &&
+        size >= tilesize && !isMapped(tif) &&
+        ((tif->tif_flags & TIFF_NOREADRAW) == 0))
     {
         if (TIFFReadRawTile1(tif, tile, buf, tilesize, module) != tilesize)
             return ((tmsize_t)(-1));
 
         if (!isFillOrder(tif, td->td_fillorder) &&
             (tif->tif_flags & TIFF_NOBITREV) == 0)
-            TIFFReverseBits(buf,tilesize);
+            TIFFReverseBits(buf, tilesize);
 
-        (*tif->tif_postdecode)(tif,buf,tilesize);
+        (*tif->tif_postdecode)(tif, buf, tilesize);
         return (tilesize);
     }
 
-	if (size == (tmsize_t)(-1))
-		size = tilesize;
-	else if (size > tilesize)
-		size = tilesize;
-	if (TIFFFillTile(tif, tile) && (*tif->tif_decodetile)(tif,
-	    (uint8*) buf, size, (uint16)(tile/td->td_stripsperimage))) {
-		(*tif->tif_postdecode)(tif, (uint8*) buf, size);
-		return (size);
-	} else
-		return ((tmsize_t)(-1));
+    if (size == (tmsize_t)(-1))
+        size = tilesize;
+    else if (size > tilesize)
+        size = tilesize;
+    if (TIFFFillTile(tif, tile) &&
+        (*tif->tif_decodetile)(tif, (uint8_t *)buf, size,
+                               (uint16_t)(tile / td->td_stripsperimage)))
+    {
+        (*tif->tif_postdecode)(tif, (uint8_t *)buf, size);
+        return (size);
+    }
+    else
+        return ((tmsize_t)(-1));
 }
 
-/* Variant of TIFFReadTile() that does 
- * * if *buf == NULL, *buf = _TIFFmalloc(bufsizetoalloc) only after TIFFFillTile() has
- *   succeeded. This avoid excessive memory allocation in case of truncated
- *   file.
+/* Variant of TIFFReadTile() that does
+ * * if *buf == NULL, *buf = _TIFFmallocExt(tif, bufsizetoalloc) only after
+ * TIFFFillTile() has succeeded. This avoid excessive memory allocation in case
+ * of truncated file.
  * * calls regular TIFFReadEncodedTile() if *buf != NULL
  */
-tmsize_t
-_TIFFReadTileAndAllocBuffer(TIFF* tif,
-                            void **buf, tmsize_t bufsizetoalloc,
-                            uint32 x, uint32 y, uint32 z, uint16 s)
+tmsize_t _TIFFReadTileAndAllocBuffer(TIFF *tif, void **buf,
+                                     tmsize_t bufsizetoalloc, uint32_t x,
+                                     uint32_t y, uint32_t z, uint16_t s)
 {
     if (!TIFFCheckRead(tif, 1) || !TIFFCheckTile(tif, x, y, z, s))
-            return ((tmsize_t)(-1));
-    return (_TIFFReadEncodedTileAndAllocBuffer(tif,
-                                               TIFFComputeTile(tif, x, y, z, s),
-                                               buf, bufsizetoalloc,
-                                               (tmsize_t)(-1)));
+        return ((tmsize_t)(-1));
+    return (_TIFFReadEncodedTileAndAllocBuffer(
+        tif, TIFFComputeTile(tif, x, y, z, s), buf, bufsizetoalloc,
+        (tmsize_t)(-1)));
 }
 
-/* Variant of TIFFReadEncodedTile() that does 
- * * if *buf == NULL, *buf = _TIFFmalloc(bufsizetoalloc) only after TIFFFillTile() has
- *   succeeded. This avoid excessive memory allocation in case of truncated
- *   file.
+/* Variant of TIFFReadEncodedTile() that does
+ * * if *buf == NULL, *buf = _TIFFmallocExt(tif, bufsizetoalloc) only after
+ * TIFFFillTile() has succeeded. This avoid excessive memory allocation in case
+ * of truncated file.
  * * calls regular TIFFReadEncodedTile() if *buf != NULL
  */
-tmsize_t
-_TIFFReadEncodedTileAndAllocBuffer(TIFF* tif, uint32 tile,
-                                    void **buf, tmsize_t bufsizetoalloc,
-                                    tmsize_t size_to_read)
+tmsize_t _TIFFReadEncodedTileAndAllocBuffer(TIFF *tif, uint32_t tile,
+                                            void **buf, tmsize_t bufsizetoalloc,
+                                            tmsize_t size_to_read)
 {
     static const char module[] = "_TIFFReadEncodedTileAndAllocBuffer";
     TIFFDirectory *td = &tif->tif_dir;
     tmsize_t tilesize = tif->tif_tilesize;
 
-    if( *buf != NULL )
+    if (*buf != NULL)
     {
         return TIFFReadEncodedTile(tif, tile, *buf, size_to_read);
     }
 
     if (!TIFFCheckRead(tif, 1))
+        return ((tmsize_t)(-1));
+    if (tile >= td->td_nstrips)
+    {
+        TIFFErrorExtR(tif, module,
+                      "%" PRIu32 ": Tile out of range, max %" PRIu32, tile,
+                      td->td_nstrips);
+        return ((tmsize_t)(-1));
+    }
+
+    if (!TIFFFillTile(tif, tile))
+        return ((tmsize_t)(-1));
+
+    /* Sanity checks to avoid excessive memory allocation */
+    /* Cf https://gitlab.com/libtiff/libtiff/-/issues/479 */
+    if (td->td_compression == COMPRESSION_NONE)
+    {
+        if (tif->tif_rawdatasize != tilesize)
+        {
+            TIFFErrorExtR(tif, TIFFFileName(tif),
+                          "Invalid tile byte count for tile %u. "
+                          "Expected %" PRIu64 ", got %" PRIu64,
+                          tile, (uint64_t)tilesize,
+                          (uint64_t)tif->tif_rawdatasize);
             return ((tmsize_t)(-1));
-    if (tile >= td->td_nstrips) {
-            TIFFErrorExt(tif->tif_clientdata, module,
-                "%lu: Tile out of range, max %lu",
-                (unsigned long) tile, (unsigned long) td->td_nstrips);
+        }
+    }
+    else
+    {
+        /* Max compression ratio experimentally determined. Might be fragile...
+         * Only apply this heuristics to situations where the memory allocation
+         * would be big, to avoid breaking nominal use cases.
+         */
+        const int maxCompressionRatio =
+            td->td_compression == COMPRESSION_ZSTD ? 33000
+            : td->td_compression == COMPRESSION_JXL
+                ?
+                /* Evaluated on a 8000x8000 tile */
+                25000 * (td->td_planarconfig == PLANARCONFIG_CONTIG
+                             ? td->td_samplesperpixel
+                             : 1)
+                : td->td_compression == COMPRESSION_LZMA ? 7000 : 1000;
+        if (bufsizetoalloc > 100 * 1000 * 1000 &&
+            tif->tif_rawdatasize < tilesize / maxCompressionRatio)
+        {
+            TIFFErrorExtR(tif, TIFFFileName(tif),
+                          "Likely invalid tile byte count for tile %u. "
+                          "Uncompressed tile size is %" PRIu64 ", "
+                          "compressed one is %" PRIu64,
+                          tile, (uint64_t)tilesize,
+                          (uint64_t)tif->tif_rawdatasize);
             return ((tmsize_t)(-1));
+        }
     }
 
-    if (!TIFFFillTile(tif,tile))
-            return((tmsize_t)(-1));
-
-    *buf = _TIFFmalloc(bufsizetoalloc);
-    if (*buf == NULL) {
-            TIFFErrorExt(tif->tif_clientdata, TIFFFileName(tif),
-                         "No space for tile buffer");
-            return((tmsize_t)(-1));
+    *buf = _TIFFmallocExt(tif, bufsizetoalloc);
+    if (*buf == NULL)
+    {
+        TIFFErrorExtR(tif, TIFFFileName(tif), "No space for tile buffer");
+        return ((tmsize_t)(-1));
     }
     _TIFFmemset(*buf, 0, bufsizetoalloc);
 
@@ -1082,287 +1076,261 @@ _TIFFReadEncodedTileAndAllocBuffer(TIFF* tif, uint32 tile,
         size_to_read = tilesize;
     else if (size_to_read > tilesize)
         size_to_read = tilesize;
-    if( (*tif->tif_decodetile)(tif,
-        (uint8*) *buf, size_to_read, (uint16)(tile/td->td_stripsperimage))) {
-        (*tif->tif_postdecode)(tif, (uint8*) *buf, size_to_read);
+    if ((*tif->tif_decodetile)(tif, (uint8_t *)*buf, size_to_read,
+                               (uint16_t)(tile / td->td_stripsperimage)))
+    {
+        (*tif->tif_postdecode)(tif, (uint8_t *)*buf, size_to_read);
         return (size_to_read);
-    } else
+    }
+    else
         return ((tmsize_t)(-1));
 }
 
-static tmsize_t
-TIFFReadRawTile1(TIFF* tif, uint32 tile, void* buf, tmsize_t size, const char* module)
+static tmsize_t TIFFReadRawTile1(TIFF *tif, uint32_t tile, void *buf,
+                                 tmsize_t size, const char *module)
 {
-	assert((tif->tif_flags&TIFF_NOREADRAW)==0);
-	if (!isMapped(tif)) {
-		tmsize_t cc;
-
-		if (!SeekOK(tif, TIFFGetStrileOffset(tif, tile))) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Seek error at row %lu, col %lu, tile %lu",
-			    (unsigned long) tif->tif_row,
-			    (unsigned long) tif->tif_col,
-			    (unsigned long) tile);
-			return ((tmsize_t)(-1));
-		}
-		cc = TIFFReadFile(tif, buf, size);
-		if (cc != size) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			TIFFErrorExt(tif->tif_clientdata, module,
-	"Read error at row %lu, col %lu; got %I64u bytes, expected %I64u",
-				     (unsigned long) tif->tif_row,
-				     (unsigned long) tif->tif_col,
-				     (unsigned __int64) cc,
-				     (unsigned __int64) size);
-#else
-			TIFFErrorExt(tif->tif_clientdata, module,
-	"Read error at row %lu, col %lu; got %llu bytes, expected %llu",
-				     (unsigned long) tif->tif_row,
-				     (unsigned long) tif->tif_col,
-				     (unsigned long long) cc,
-				     (unsigned long long) size);
-#endif
-			return ((tmsize_t)(-1));
-		}
-	} else {
-		tmsize_t ma,mb;
-		tmsize_t n;
-		ma=(tmsize_t)TIFFGetStrileOffset(tif, tile);
-		mb=ma+size;
-		if ((TIFFGetStrileOffset(tif, tile) > (uint64)TIFF_TMSIZE_T_MAX)||(ma>tif->tif_size))
-			n=0;
-		else if ((mb<ma)||(mb<size)||(mb>tif->tif_size))
-			n=tif->tif_size-ma;
-		else
-			n=size;
-		if (n!=size) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			TIFFErrorExt(tif->tif_clientdata, module,
-"Read error at row %lu, col %lu, tile %lu; got %I64u bytes, expected %I64u",
-				     (unsigned long) tif->tif_row,
-				     (unsigned long) tif->tif_col,
-				     (unsigned long) tile,
-				     (unsigned __int64) n,
-				     (unsigned __int64) size);
-#else
-			TIFFErrorExt(tif->tif_clientdata, module,
-"Read error at row %lu, col %lu, tile %lu; got %llu bytes, expected %llu",
-				     (unsigned long) tif->tif_row,
-				     (unsigned long) tif->tif_col,
-				     (unsigned long) tile,
-				     (unsigned long long) n,
-				     (unsigned long long) size);
-#endif
-			return ((tmsize_t)(-1));
-		}
-		_TIFFmemcpy(buf, tif->tif_base + ma, size);
-	}
-	return (size);
+    assert((tif->tif_flags & TIFF_NOREADRAW) == 0);
+    if (!isMapped(tif))
+    {
+        tmsize_t cc;
+
+        if (!SeekOK(tif, TIFFGetStrileOffset(tif, tile)))
+        {
+            TIFFErrorExtR(tif, module,
+                          "Seek error at row %" PRIu32 ", col %" PRIu32
+                          ", tile %" PRIu32,
+                          tif->tif_row, tif->tif_col, tile);
+            return ((tmsize_t)(-1));
+        }
+        cc = TIFFReadFile(tif, buf, size);
+        if (cc != size)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Read error at row %" PRIu32 ", col %" PRIu32
+                          "; got %" TIFF_SSIZE_FORMAT
+                          " bytes, expected %" TIFF_SSIZE_FORMAT,
+                          tif->tif_row, tif->tif_col, cc, size);
+            return ((tmsize_t)(-1));
+        }
+    }
+    else
+    {
+        tmsize_t ma, mb;
+        tmsize_t n;
+        ma = (tmsize_t)TIFFGetStrileOffset(tif, tile);
+        mb = ma + size;
+        if ((TIFFGetStrileOffset(tif, tile) > (uint64_t)TIFF_TMSIZE_T_MAX) ||
+            (ma > tif->tif_size))
+            n = 0;
+        else if ((mb < ma) || (mb < size) || (mb > tif->tif_size))
+            n = tif->tif_size - ma;
+        else
+            n = size;
+        if (n != size)
+        {
+            TIFFErrorExtR(tif, module,
+                          "Read error at row %" PRIu32 ", col %" PRIu32
+                          ", tile %" PRIu32 "; got %" TIFF_SSIZE_FORMAT
+                          " bytes, expected %" TIFF_SSIZE_FORMAT,
+                          tif->tif_row, tif->tif_col, tile, n, size);
+            return ((tmsize_t)(-1));
+        }
+        _TIFFmemcpy(buf, tif->tif_base + ma, size);
+    }
+    return (size);
 }
 
 /*
  * Read a tile of data from the file.
  */
-tmsize_t
-TIFFReadRawTile(TIFF* tif, uint32 tile, void* buf, tmsize_t size)
+tmsize_t TIFFReadRawTile(TIFF *tif, uint32_t tile, void *buf, tmsize_t size)
 {
-	static const char module[] = "TIFFReadRawTile";
-	TIFFDirectory *td = &tif->tif_dir;
-	uint64 bytecount64;
-	tmsize_t bytecountm;
-
-	if (!TIFFCheckRead(tif, 1))
-		return ((tmsize_t)(-1));
-	if (tile >= td->td_nstrips) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "%lu: Tile out of range, max %lu",
-		    (unsigned long) tile, (unsigned long) td->td_nstrips);
-		return ((tmsize_t)(-1));
-	}
-	if (tif->tif_flags&TIFF_NOREADRAW)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module,
-		"Compression scheme does not support access to raw uncompressed data");
-		return ((tmsize_t)(-1));
-	}
-	bytecount64 = TIFFGetStrileByteCount(tif, tile);
-	if (size != (tmsize_t)(-1) && (uint64)size <= bytecount64)
-		bytecountm = size;
-	else
-		bytecountm = _TIFFCastUInt64ToSSize(tif, bytecount64, module);
-	if( bytecountm == 0 ) {
-		return ((tmsize_t)(-1));
-	}
-	return (TIFFReadRawTile1(tif, tile, buf, bytecountm, module));
+    static const char module[] = "TIFFReadRawTile";
+    TIFFDirectory *td = &tif->tif_dir;
+    uint64_t bytecount64;
+    tmsize_t bytecountm;
+
+    if (!TIFFCheckRead(tif, 1))
+        return ((tmsize_t)(-1));
+    if (tile >= td->td_nstrips)
+    {
+        TIFFErrorExtR(tif, module,
+                      "%" PRIu32 ": Tile out of range, max %" PRIu32, tile,
+                      td->td_nstrips);
+        return ((tmsize_t)(-1));
+    }
+    if (tif->tif_flags & TIFF_NOREADRAW)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Compression scheme does not support access to raw "
+                      "uncompressed data");
+        return ((tmsize_t)(-1));
+    }
+    bytecount64 = TIFFGetStrileByteCount(tif, tile);
+    if (size != (tmsize_t)(-1) && (uint64_t)size <= bytecount64)
+        bytecountm = size;
+    else
+        bytecountm = _TIFFCastUInt64ToSSize(tif, bytecount64, module);
+    if (bytecountm == 0)
+    {
+        return ((tmsize_t)(-1));
+    }
+    return (TIFFReadRawTile1(tif, tile, buf, bytecountm, module));
 }
 
 /*
  * Read the specified tile and setup for decoding. The data buffer is
  * expanded, as necessary, to hold the tile's data.
  */
-int
-TIFFFillTile(TIFF* tif, uint32 tile)
+int TIFFFillTile(TIFF *tif, uint32_t tile)
 {
-	static const char module[] = "TIFFFillTile";
-	TIFFDirectory *td = &tif->tif_dir;
-
-	if ((tif->tif_flags&TIFF_NOREADRAW)==0)
-	{
-		uint64 bytecount = TIFFGetStrileByteCount(tif, tile);
-		if( bytecount == 0 || bytecount > (uint64)TIFF_INT64_MAX ) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-			TIFFErrorExt(tif->tif_clientdata, module,
-				"%I64u: Invalid tile byte count, tile %lu",
-				     (unsigned __int64) bytecount,
-				     (unsigned long) tile);
-#else
-			TIFFErrorExt(tif->tif_clientdata, module,
-				"%llu: Invalid tile byte count, tile %lu",
-				     (unsigned long long) bytecount,
-				     (unsigned long) tile);
-#endif
-			return (0);
-		}
+    static const char module[] = "TIFFFillTile";
+    TIFFDirectory *td = &tif->tif_dir;
+
+    if ((tif->tif_flags & TIFF_NOREADRAW) == 0)
+    {
+        uint64_t bytecount = TIFFGetStrileByteCount(tif, tile);
+        if (bytecount == 0 || bytecount > (uint64_t)TIFF_INT64_MAX)
+        {
+            TIFFErrorExtR(tif, module,
+                          "%" PRIu64 ": Invalid tile byte count, tile %" PRIu32,
+                          bytecount, tile);
+            return (0);
+        }
+
+        /* To avoid excessive memory allocations: */
+        /* Byte count should normally not be larger than a number of */
+        /* times the uncompressed size plus some margin */
+        if (bytecount > 1024 * 1024)
+        {
+            /* 10 and 4096 are just values that could be adjusted. */
+            /* Hopefully they are safe enough for all codecs */
+            tmsize_t stripsize = TIFFTileSize(tif);
+            if (stripsize != 0 && (bytecount - 4096) / 10 > (uint64_t)stripsize)
+            {
+                uint64_t newbytecount = (uint64_t)stripsize * 10 + 4096;
+                TIFFErrorExtR(tif, module,
+                              "Too large tile byte count %" PRIu64
+                              ", tile %" PRIu32 ". Limiting to %" PRIu64,
+                              bytecount, tile, newbytecount);
+                bytecount = newbytecount;
+            }
+        }
+
+        if (isMapped(tif))
+        {
+            /*
+             * We must check for overflow, potentially causing
+             * an OOB read. Instead of simple
+             *
+             *  TIFFGetStrileOffset(tif, tile)+bytecount > tif->tif_size
+             *
+             * comparison (which can overflow) we do the following
+             * two comparisons:
+             */
+            if (bytecount > (uint64_t)tif->tif_size ||
+                TIFFGetStrileOffset(tif, tile) >
+                    (uint64_t)tif->tif_size - bytecount)
+            {
+                tif->tif_curtile = NOTILE;
+                return (0);
+            }
+        }
 
-		/* To avoid excessive memory allocations: */
-		/* Byte count should normally not be larger than a number of */
-		/* times the uncompressed size plus some margin */
-                if( bytecount > 1024 * 1024 )
+        if (isMapped(tif) && (isFillOrder(tif, td->td_fillorder) ||
+                              (tif->tif_flags & TIFF_NOBITREV)))
+        {
+            /*
+             * The image is mapped into memory and we either don't
+             * need to flip bits or the compression routine is
+             * going to handle this operation itself.  In this
+             * case, avoid copying the raw data and instead just
+             * reference the data from the memory mapped file
+             * image.  This assumes that the decompression
+             * routines do not modify the contents of the raw data
+             * buffer (if they try to, the application will get a
+             * fault since the file is mapped read-only).
+             */
+            if ((tif->tif_flags & TIFF_MYBUFFER) && tif->tif_rawdata)
+            {
+                _TIFFfreeExt(tif, tif->tif_rawdata);
+                tif->tif_rawdata = NULL;
+                tif->tif_rawdatasize = 0;
+            }
+            tif->tif_flags &= ~TIFF_MYBUFFER;
+
+            tif->tif_rawdatasize = (tmsize_t)bytecount;
+            tif->tif_rawdata =
+                tif->tif_base + (tmsize_t)TIFFGetStrileOffset(tif, tile);
+            tif->tif_rawdataoff = 0;
+            tif->tif_rawdataloaded = (tmsize_t)bytecount;
+            tif->tif_flags |= TIFF_BUFFERMMAP;
+        }
+        else
+        {
+            /*
+             * Expand raw data buffer, if needed, to hold data
+             * tile coming from file (perhaps should set upper
+             * bound on the size of a buffer we'll use?).
+             */
+            tmsize_t bytecountm;
+            bytecountm = (tmsize_t)bytecount;
+            if ((uint64_t)bytecountm != bytecount)
+            {
+                TIFFErrorExtR(tif, module, "Integer overflow");
+                return (0);
+            }
+            if (bytecountm > tif->tif_rawdatasize)
+            {
+                tif->tif_curtile = NOTILE;
+                if ((tif->tif_flags & TIFF_MYBUFFER) == 0)
                 {
-			/* 10 and 4096 are just values that could be adjusted. */
-			/* Hopefully they are safe enough for all codecs */
-			tmsize_t stripsize = TIFFTileSize(tif);
-			if( stripsize != 0 &&
-			    (bytecount - 4096) / 10 > (uint64)stripsize  )
-			{
-				uint64 newbytecount = (uint64)stripsize * 10 + 4096;
-				if( newbytecount == 0 || newbytecount > (uint64)TIFF_INT64_MAX )
-				{
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-					TIFFWarningExt(tif->tif_clientdata, module,
-					  "Too large tile byte count %I64u, tile %lu. Limiting to %I64u",
-					     (unsigned __int64) bytecount,
-					     (unsigned long) tile,
-					     (unsigned __int64) newbytecount);
-#else
-					TIFFErrorExt(tif->tif_clientdata, module,
-					  "Too large tile byte count %llu, tile %lu. Limiting to %llu",
-					     (unsigned long long) bytecount,
-					     (unsigned long) tile,
-					     (unsigned long long) newbytecount);
-#endif
-					bytecount = newbytecount;
-				}
-			}
-		}
-
-		if (isMapped(tif)) {
-			/*
-			 * We must check for overflow, potentially causing
-			 * an OOB read. Instead of simple
-			 *
-			 *  TIFFGetStrileOffset(tif, tile)+bytecount > tif->tif_size
-			 *
-			 * comparison (which can overflow) we do the following
-			 * two comparisons:
-			 */
-			if (bytecount > (uint64)tif->tif_size ||
-			    TIFFGetStrileOffset(tif, tile) > (uint64)tif->tif_size - bytecount) {
-				tif->tif_curtile = NOTILE;
-				return (0);
-			}
-		}
-
-		if (isMapped(tif) &&
-		    (isFillOrder(tif, td->td_fillorder)
-		     || (tif->tif_flags & TIFF_NOBITREV))) {
-			/*
-			 * The image is mapped into memory and we either don't
-			 * need to flip bits or the compression routine is
-			 * going to handle this operation itself.  In this
-			 * case, avoid copying the raw data and instead just
-			 * reference the data from the memory mapped file
-			 * image.  This assumes that the decompression
-			 * routines do not modify the contents of the raw data
-			 * buffer (if they try to, the application will get a
-			 * fault since the file is mapped read-only).
-			 */
-			if ((tif->tif_flags & TIFF_MYBUFFER) && tif->tif_rawdata) {
-				_TIFFfree(tif->tif_rawdata);
-				tif->tif_rawdata = NULL;
-				tif->tif_rawdatasize = 0;
-			}
-			tif->tif_flags &= ~TIFF_MYBUFFER;
-
-			tif->tif_rawdatasize = (tmsize_t)bytecount;
-			tif->tif_rawdata =
-				tif->tif_base + (tmsize_t)TIFFGetStrileOffset(tif, tile);
-                        tif->tif_rawdataoff = 0;
-                        tif->tif_rawdataloaded = (tmsize_t) bytecount;
-			tif->tif_flags |= TIFF_BUFFERMMAP;
-		} else {
-			/*
-			 * Expand raw data buffer, if needed, to hold data
-			 * tile coming from file (perhaps should set upper
-			 * bound on the size of a buffer we'll use?).
-			 */
-			tmsize_t bytecountm;
-			bytecountm=(tmsize_t)bytecount;
-			if ((uint64)bytecountm!=bytecount)
-			{
-				TIFFErrorExt(tif->tif_clientdata,module,"Integer overflow");
-				return(0);
-			}
-			if (bytecountm > tif->tif_rawdatasize) {
-				tif->tif_curtile = NOTILE;
-				if ((tif->tif_flags & TIFF_MYBUFFER) == 0) {
-					TIFFErrorExt(tif->tif_clientdata, module,
-					    "Data buffer too small to hold tile %lu",
-					    (unsigned long) tile);
-					return (0);
-				}
-			}
-			if (tif->tif_flags&TIFF_BUFFERMMAP) {
-				tif->tif_curtile = NOTILE;
-				tif->tif_rawdata = NULL;
-				tif->tif_rawdatasize = 0;
-				tif->tif_flags &= ~TIFF_BUFFERMMAP;
-			}
-
-			if( isMapped(tif) )
-			{
-				if (bytecountm > tif->tif_rawdatasize &&
-				    !TIFFReadBufferSetup(tif, 0, bytecountm))
-				{
-					return (0);
-				}
-				if (TIFFReadRawTile1(tif, tile, tif->tif_rawdata,
-				    bytecountm, module) != bytecountm)
-				{
-					return (0);
-				}
-			}
-			else
-			{
-				if (TIFFReadRawStripOrTile2(tif, tile, 0,
-				    bytecountm, module) != bytecountm)
-				{
-					return (0);
-				}
-			}
-
-
-                        tif->tif_rawdataoff = 0;
-                        tif->tif_rawdataloaded = bytecountm;
-                        
-			if (tif->tif_rawdata != NULL &&
-                            !isFillOrder(tif, td->td_fillorder) &&
-			    (tif->tif_flags & TIFF_NOBITREV) == 0)
-				TIFFReverseBits(tif->tif_rawdata,
-                                                tif->tif_rawdataloaded);
-		}
-	}
-	return (TIFFStartTile(tif, tile));
+                    TIFFErrorExtR(tif, module,
+                                  "Data buffer too small to hold tile %" PRIu32,
+                                  tile);
+                    return (0);
+                }
+            }
+            if (tif->tif_flags & TIFF_BUFFERMMAP)
+            {
+                tif->tif_curtile = NOTILE;
+                tif->tif_rawdata = NULL;
+                tif->tif_rawdatasize = 0;
+                tif->tif_flags &= ~TIFF_BUFFERMMAP;
+            }
+
+            if (isMapped(tif))
+            {
+                if (bytecountm > tif->tif_rawdatasize &&
+                    !TIFFReadBufferSetup(tif, 0, bytecountm))
+                {
+                    return (0);
+                }
+                if (TIFFReadRawTile1(tif, tile, tif->tif_rawdata, bytecountm,
+                                     module) != bytecountm)
+                {
+                    return (0);
+                }
+            }
+            else
+            {
+                if (TIFFReadRawStripOrTile2(tif, tile, 0, bytecountm, module) !=
+                    bytecountm)
+                {
+                    return (0);
+                }
+            }
+
+            tif->tif_rawdataoff = 0;
+            tif->tif_rawdataloaded = bytecountm;
+
+            if (tif->tif_rawdata != NULL &&
+                !isFillOrder(tif, td->td_fillorder) &&
+                (tif->tif_flags & TIFF_NOBITREV) == 0)
+                TIFFReverseBits(tif->tif_rawdata, tif->tif_rawdataloaded);
+        }
+    }
+    return (TIFFStartTile(tif, tile));
 }
 
 /*
@@ -1374,180 +1342,191 @@ TIFFFillTile(TIFF* tif, uint32 tile)
  * large enough to hold any individual strip of
  * raw data.
  */
-int
-TIFFReadBufferSetup(TIFF* tif, void* bp, tmsize_t size)
+int TIFFReadBufferSetup(TIFF *tif, void *bp, tmsize_t size)
 {
-	static const char module[] = "TIFFReadBufferSetup";
-
-	assert((tif->tif_flags&TIFF_NOREADRAW)==0);
-	tif->tif_flags &= ~TIFF_BUFFERMMAP;
-
-	if (tif->tif_rawdata) {
-		if (tif->tif_flags & TIFF_MYBUFFER)
-			_TIFFfree(tif->tif_rawdata);
-		tif->tif_rawdata = NULL;
-		tif->tif_rawdatasize = 0;
-	}
-	if (bp) {
-		tif->tif_rawdatasize = size;
-		tif->tif_rawdata = (uint8*) bp;
-		tif->tif_flags &= ~TIFF_MYBUFFER;
-	} else {
-		tif->tif_rawdatasize = (tmsize_t)TIFFroundup_64((uint64)size, 1024);
-		if (tif->tif_rawdatasize==0) {
-		    TIFFErrorExt(tif->tif_clientdata, module,
-				 "Invalid buffer size");
-		    return (0);
-		}
-		/* Initialize to zero to avoid uninitialized buffers in case of */
-                /* short reads (http://bugzilla.maptools.org/show_bug.cgi?id=2651) */
-		tif->tif_rawdata = (uint8*) _TIFFcalloc(1, tif->tif_rawdatasize);
-		tif->tif_flags |= TIFF_MYBUFFER;
-	}
-	if (tif->tif_rawdata == NULL) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "No space for data buffer at scanline %lu",
-		    (unsigned long) tif->tif_row);
-		tif->tif_rawdatasize = 0;
-		return (0);
-	}
-	return (1);
+    static const char module[] = "TIFFReadBufferSetup";
+
+    assert((tif->tif_flags & TIFF_NOREADRAW) == 0);
+    tif->tif_flags &= ~TIFF_BUFFERMMAP;
+
+    if (tif->tif_rawdata)
+    {
+        if (tif->tif_flags & TIFF_MYBUFFER)
+            _TIFFfreeExt(tif, tif->tif_rawdata);
+        tif->tif_rawdata = NULL;
+        tif->tif_rawdatasize = 0;
+    }
+    if (bp)
+    {
+        tif->tif_rawdatasize = size;
+        tif->tif_rawdata = (uint8_t *)bp;
+        tif->tif_flags &= ~TIFF_MYBUFFER;
+    }
+    else
+    {
+        tif->tif_rawdatasize = (tmsize_t)TIFFroundup_64((uint64_t)size, 1024);
+        if (tif->tif_rawdatasize == 0)
+        {
+            TIFFErrorExtR(tif, module, "Invalid buffer size");
+            return (0);
+        }
+        /* Initialize to zero to avoid uninitialized buffers in case of */
+        /* short reads (http://bugzilla.maptools.org/show_bug.cgi?id=2651) */
+        tif->tif_rawdata =
+            (uint8_t *)_TIFFcallocExt(tif, 1, tif->tif_rawdatasize);
+        tif->tif_flags |= TIFF_MYBUFFER;
+    }
+    if (tif->tif_rawdata == NULL)
+    {
+        TIFFErrorExtR(tif, module,
+                      "No space for data buffer at scanline %" PRIu32,
+                      tif->tif_row);
+        tif->tif_rawdatasize = 0;
+        return (0);
+    }
+    return (1);
 }
 
 /*
  * Set state to appear as if a
  * strip has just been read in.
  */
-static int
-TIFFStartStrip(TIFF* tif, uint32 strip)
+static int TIFFStartStrip(TIFF *tif, uint32_t strip)
 {
-	TIFFDirectory *td = &tif->tif_dir;
-
-	if ((tif->tif_flags & TIFF_CODERSETUP) == 0) {
-		if (!(*tif->tif_setupdecode)(tif))
-			return (0);
-		tif->tif_flags |= TIFF_CODERSETUP;
-	}
-	tif->tif_curstrip = strip;
-	tif->tif_row = (strip % td->td_stripsperimage) * td->td_rowsperstrip;
-        tif->tif_flags &= ~TIFF_BUF4WRITE;
-
-	if (tif->tif_flags&TIFF_NOREADRAW)
-	{
-		tif->tif_rawcp = NULL;
-		tif->tif_rawcc = 0;  
-	}
-	else
-	{
-		tif->tif_rawcp = tif->tif_rawdata;
-		if( tif->tif_rawdataloaded > 0 )
-			tif->tif_rawcc = tif->tif_rawdataloaded;
-		else
-			tif->tif_rawcc = (tmsize_t)TIFFGetStrileByteCount(tif, strip);
-	}
-	if ((*tif->tif_predecode)(tif,
-			(uint16)(strip / td->td_stripsperimage)) == 0 ) {
-            /* Needed for example for scanline access, if tif_predecode */
-            /* fails, and we try to read the same strip again. Without invalidating */
-            /* tif_curstrip, we'd call tif_decoderow() on a possibly invalid */
-            /* codec state. */
-            tif->tif_curstrip = NOSTRIP;
-            return 0;
-        }
-        return 1;
+    TIFFDirectory *td = &tif->tif_dir;
+
+    if ((tif->tif_flags & TIFF_CODERSETUP) == 0)
+    {
+        if (!(*tif->tif_setupdecode)(tif))
+            return (0);
+        tif->tif_flags |= TIFF_CODERSETUP;
+    }
+    tif->tif_curstrip = strip;
+    tif->tif_row = (strip % td->td_stripsperimage) * td->td_rowsperstrip;
+    tif->tif_flags &= ~TIFF_BUF4WRITE;
+
+    if (tif->tif_flags & TIFF_NOREADRAW)
+    {
+        tif->tif_rawcp = NULL;
+        tif->tif_rawcc = 0;
+    }
+    else
+    {
+        tif->tif_rawcp = tif->tif_rawdata;
+        if (tif->tif_rawdataloaded > 0)
+            tif->tif_rawcc = tif->tif_rawdataloaded;
+        else
+            tif->tif_rawcc = (tmsize_t)TIFFGetStrileByteCount(tif, strip);
+    }
+    if ((*tif->tif_predecode)(tif, (uint16_t)(strip / td->td_stripsperimage)) ==
+        0)
+    {
+        /* Needed for example for scanline access, if tif_predecode */
+        /* fails, and we try to read the same strip again. Without invalidating
+         */
+        /* tif_curstrip, we'd call tif_decoderow() on a possibly invalid */
+        /* codec state. */
+        tif->tif_curstrip = NOSTRIP;
+        return 0;
+    }
+    return 1;
 }
 
 /*
  * Set state to appear as if a
  * tile has just been read in.
  */
-static int
-TIFFStartTile(TIFF* tif, uint32 tile)
+static int TIFFStartTile(TIFF *tif, uint32_t tile)
 {
-        static const char module[] = "TIFFStartTile";
-	TIFFDirectory *td = &tif->tif_dir;
-        uint32 howmany32;
-
-	if ((tif->tif_flags & TIFF_CODERSETUP) == 0) {
-		if (!(*tif->tif_setupdecode)(tif))
-			return (0);
-		tif->tif_flags |= TIFF_CODERSETUP;
-	}
-	tif->tif_curtile = tile;
-        howmany32=TIFFhowmany_32(td->td_imagewidth, td->td_tilewidth);
-        if (howmany32 == 0) {
-                 TIFFErrorExt(tif->tif_clientdata,module,"Zero tiles");
-                return 0;
-        }
-	tif->tif_row = (tile % howmany32) * td->td_tilelength;
-        howmany32=TIFFhowmany_32(td->td_imagelength, td->td_tilelength);
-        if (howmany32 == 0) {
-                TIFFErrorExt(tif->tif_clientdata,module,"Zero tiles");
-                return 0;
-        }
-	tif->tif_col = (tile % howmany32) * td->td_tilewidth;
-        tif->tif_flags &= ~TIFF_BUF4WRITE;
-	if (tif->tif_flags&TIFF_NOREADRAW)
-	{
-		tif->tif_rawcp = NULL;
-		tif->tif_rawcc = 0;
-	}
-	else
-	{
-		tif->tif_rawcp = tif->tif_rawdata;
-		if( tif->tif_rawdataloaded > 0 )
-			tif->tif_rawcc = tif->tif_rawdataloaded;
-		else
-			tif->tif_rawcc = (tmsize_t)TIFFGetStrileByteCount(tif, tile);
-	}
-	return ((*tif->tif_predecode)(tif,
-			(uint16)(tile/td->td_stripsperimage)));
+    static const char module[] = "TIFFStartTile";
+    TIFFDirectory *td = &tif->tif_dir;
+    uint32_t howmany32;
+
+    if ((tif->tif_flags & TIFF_CODERSETUP) == 0)
+    {
+        if (!(*tif->tif_setupdecode)(tif))
+            return (0);
+        tif->tif_flags |= TIFF_CODERSETUP;
+    }
+    tif->tif_curtile = tile;
+    howmany32 = TIFFhowmany_32(td->td_imagewidth, td->td_tilewidth);
+    if (howmany32 == 0)
+    {
+        TIFFErrorExtR(tif, module, "Zero tiles");
+        return 0;
+    }
+    tif->tif_row = (tile % howmany32) * td->td_tilelength;
+    howmany32 = TIFFhowmany_32(td->td_imagelength, td->td_tilelength);
+    if (howmany32 == 0)
+    {
+        TIFFErrorExtR(tif, module, "Zero tiles");
+        return 0;
+    }
+    tif->tif_col = (tile % howmany32) * td->td_tilewidth;
+    tif->tif_flags &= ~TIFF_BUF4WRITE;
+    if (tif->tif_flags & TIFF_NOREADRAW)
+    {
+        tif->tif_rawcp = NULL;
+        tif->tif_rawcc = 0;
+    }
+    else
+    {
+        tif->tif_rawcp = tif->tif_rawdata;
+        if (tif->tif_rawdataloaded > 0)
+            tif->tif_rawcc = tif->tif_rawdataloaded;
+        else
+            tif->tif_rawcc = (tmsize_t)TIFFGetStrileByteCount(tif, tile);
+    }
+    return (
+        (*tif->tif_predecode)(tif, (uint16_t)(tile / td->td_stripsperimage)));
 }
 
-static int
-TIFFCheckRead(TIFF* tif, int tiles)
+static int TIFFCheckRead(TIFF *tif, int tiles)
 {
-	if (tif->tif_mode == O_WRONLY) {
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name, "File not open for reading");
-		return (0);
-	}
-	if (tiles ^ isTiled(tif)) {
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name, tiles ?
-		    "Can not read tiles from a striped image" :
-		    "Can not read scanlines from a tiled image");
-		return (0);
-	}
-	return (1);
+    if (tif->tif_mode == O_WRONLY)
+    {
+        TIFFErrorExtR(tif, tif->tif_name, "File not open for reading");
+        return (0);
+    }
+    if (tiles ^ isTiled(tif))
+    {
+        TIFFErrorExtR(tif, tif->tif_name,
+                      tiles ? "Can not read tiles from a striped image"
+                            : "Can not read scanlines from a tiled image");
+        return (0);
+    }
+    return (1);
 }
 
 /* Use the provided input buffer (inbuf, insize) and decompress it into
  * (outbuf, outsize).
- * This function replaces the use of TIFFReadEncodedStrip()/TIFFReadEncodedTile()
- * when the user can provide the buffer for the input data, for example when
- * he wants to avoid libtiff to read the strile offset/count values from the
- * [Strip|Tile][Offsets/ByteCounts] array.
- * inbuf content must be writable (if bit reversal is needed)
- * Returns 1 in case of success, 0 otherwise.
+ * This function replaces the use of
+ * TIFFReadEncodedStrip()/TIFFReadEncodedTile() when the user can provide the
+ * buffer for the input data, for example when he wants to avoid libtiff to read
+ * the strile offset/count values from the [Strip|Tile][Offsets/ByteCounts]
+ * array. inbuf content must be writable (if bit reversal is needed) Returns 1
+ * in case of success, 0 otherwise.
  */
-int      TIFFReadFromUserBuffer(TIFF* tif, uint32 strile,
-                                void* inbuf, tmsize_t insize,
-                                void* outbuf, tmsize_t outsize)
+int TIFFReadFromUserBuffer(TIFF *tif, uint32_t strile, void *inbuf,
+                           tmsize_t insize, void *outbuf, tmsize_t outsize)
 {
     static const char module[] = "TIFFReadFromUserBuffer";
     TIFFDirectory *td = &tif->tif_dir;
     int ret = 1;
-    uint32 old_tif_flags = tif->tif_flags;
+    uint32_t old_tif_flags = tif->tif_flags;
     tmsize_t old_rawdatasize = tif->tif_rawdatasize;
-    void* old_rawdata = tif->tif_rawdata;
+    void *old_rawdata = tif->tif_rawdata;
 
-    if (tif->tif_mode == O_WRONLY) {
-        TIFFErrorExt(tif->tif_clientdata, tif->tif_name, "File not open for reading");
+    if (tif->tif_mode == O_WRONLY)
+    {
+        TIFFErrorExtR(tif, tif->tif_name, "File not open for reading");
         return 0;
     }
-    if (tif->tif_flags&TIFF_NOREADRAW)
+    if (tif->tif_flags & TIFF_NOREADRAW)
     {
-        TIFFErrorExt(tif->tif_clientdata, module,
-                "Compression scheme does not support access to raw uncompressed data");
+        TIFFErrorExtR(tif, module,
+                      "Compression scheme does not support access to raw "
+                      "uncompressed data");
         return 0;
     }
 
@@ -1564,32 +1543,33 @@ int      TIFFReadFromUserBuffer(TIFF* tif, uint32 strile,
         TIFFReverseBits(inbuf, insize);
     }
 
-    if( TIFFIsTiled(tif) )
+    if (TIFFIsTiled(tif))
     {
-        if( !TIFFStartTile(tif, strile) ||
-            !(*tif->tif_decodetile)(tif, (uint8*) outbuf, outsize, 
-                                    (uint16)(strile/td->td_stripsperimage)) )
+        if (!TIFFStartTile(tif, strile) ||
+            !(*tif->tif_decodetile)(tif, (uint8_t *)outbuf, outsize,
+                                    (uint16_t)(strile / td->td_stripsperimage)))
         {
             ret = 0;
         }
     }
     else
     {
-        uint32 rowsperstrip=td->td_rowsperstrip;
-        uint32 stripsperplane;
-        if (rowsperstrip>td->td_imagelength)
-            rowsperstrip=td->td_imagelength;
-        stripsperplane= TIFFhowmany_32_maxuint_compat(td->td_imagelength, rowsperstrip);
-        if( !TIFFStartStrip(tif, strile) ||
-            !(*tif->tif_decodestrip)(tif, (uint8*) outbuf, outsize, 
-                                     (uint16)(strile/stripsperplane)) )
+        uint32_t rowsperstrip = td->td_rowsperstrip;
+        uint32_t stripsperplane;
+        if (rowsperstrip > td->td_imagelength)
+            rowsperstrip = td->td_imagelength;
+        stripsperplane =
+            TIFFhowmany_32_maxuint_compat(td->td_imagelength, rowsperstrip);
+        if (!TIFFStartStrip(tif, strile) ||
+            !(*tif->tif_decodestrip)(tif, (uint8_t *)outbuf, outsize,
+                                     (uint16_t)(strile / stripsperplane)))
         {
             ret = 0;
         }
     }
-    if( ret )
+    if (ret)
     {
-        (*tif->tif_postdecode)(tif, (uint8*) outbuf, outsize);
+        (*tif->tif_postdecode)(tif, (uint8_t *)outbuf, outsize);
     }
 
     if (!isFillOrder(tif, td->td_fillorder) &&
@@ -1598,7 +1578,8 @@ int      TIFFReadFromUserBuffer(TIFF* tif, uint32 strile,
         TIFFReverseBits(inbuf, insize);
     }
 
-    tif->tif_flags = old_tif_flags;
+    tif->tif_flags = (old_tif_flags & (TIFF_MYBUFFER | TIFF_BUFFERMMAP)) |
+                     (tif->tif_flags & ~(TIFF_MYBUFFER | TIFF_BUFFERMMAP));
     tif->tif_rawdatasize = old_rawdatasize;
     tif->tif_rawdata = old_rawdata;
     tif->tif_rawdataoff = 0;
@@ -1607,49 +1588,37 @@ int      TIFFReadFromUserBuffer(TIFF* tif, uint32 strile,
     return ret;
 }
 
-void
-_TIFFNoPostDecode(TIFF* tif, uint8* buf, tmsize_t cc)
+void _TIFFNoPostDecode(TIFF *tif, uint8_t *buf, tmsize_t cc)
 {
-    (void) tif; (void) buf; (void) cc;
+    (void)tif;
+    (void)buf;
+    (void)cc;
 }
 
-void
-_TIFFSwab16BitData(TIFF* tif, uint8* buf, tmsize_t cc)
+void _TIFFSwab16BitData(TIFF *tif, uint8_t *buf, tmsize_t cc)
 {
-    (void) tif;
+    (void)tif;
     assert((cc & 1) == 0);
-    TIFFSwabArrayOfShort((uint16*) buf, cc/2);
+    TIFFSwabArrayOfShort((uint16_t *)buf, cc / 2);
 }
 
-void
-_TIFFSwab24BitData(TIFF* tif, uint8* buf, tmsize_t cc)
+void _TIFFSwab24BitData(TIFF *tif, uint8_t *buf, tmsize_t cc)
 {
-    (void) tif;
+    (void)tif;
     assert((cc % 3) == 0);
-    TIFFSwabArrayOfTriples((uint8*) buf, cc/3);
+    TIFFSwabArrayOfTriples((uint8_t *)buf, cc / 3);
 }
 
-void
-_TIFFSwab32BitData(TIFF* tif, uint8* buf, tmsize_t cc)
+void _TIFFSwab32BitData(TIFF *tif, uint8_t *buf, tmsize_t cc)
 {
-    (void) tif;
+    (void)tif;
     assert((cc & 3) == 0);
-    TIFFSwabArrayOfLong((uint32*) buf, cc/4);
+    TIFFSwabArrayOfLong((uint32_t *)buf, cc / 4);
 }
 
-void
-_TIFFSwab64BitData(TIFF* tif, uint8* buf, tmsize_t cc)
+void _TIFFSwab64BitData(TIFF *tif, uint8_t *buf, tmsize_t cc)
 {
-    (void) tif;
+    (void)tif;
     assert((cc & 7) == 0);
-    TIFFSwabArrayOfDouble((double*) buf, cc/8);
+    TIFFSwabArrayOfDouble((double *)buf, cc / 8);
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_stream.cxx b/3rdparty/libtiff/tif_stream.cxx
index 7f640a9c0a4a..92ea273c560a 100644
--- a/3rdparty/libtiff/tif_stream.cxx
+++ b/3rdparty/libtiff/tif_stream.cxx
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1996 Sam Leffler
  * Copyright (c) 1991-1996 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -28,9 +28,7 @@
 #include "tiffiop.h"
 #include <iostream>
 
-#ifndef __VMS
 using namespace std;
-#endif
 
 /*
   ISO C++ uses a 'std::streamsize' type to define counts.  This makes
@@ -76,359 +74,331 @@ using namespace std;
 struct tiffis_data;
 struct tiffos_data;
 
-extern "C" {
-
-	static tmsize_t _tiffosReadProc(thandle_t, void*, tmsize_t);
-	static tmsize_t _tiffisReadProc(thandle_t fd, void* buf, tmsize_t size);
-	static tmsize_t _tiffosWriteProc(thandle_t fd, void* buf, tmsize_t size);
-	static tmsize_t _tiffisWriteProc(thandle_t, void*, tmsize_t);
-	static uint64   _tiffosSeekProc(thandle_t fd, uint64 off, int whence);
-	static uint64   _tiffisSeekProc(thandle_t fd, uint64 off, int whence);
-	static uint64   _tiffosSizeProc(thandle_t fd);
-	static uint64   _tiffisSizeProc(thandle_t fd);
-	static int      _tiffosCloseProc(thandle_t fd);
-	static int      _tiffisCloseProc(thandle_t fd);
-	static int 	_tiffDummyMapProc(thandle_t , void** base, toff_t* size );
-	static void     _tiffDummyUnmapProc(thandle_t , void* base, toff_t size );
-	static TIFF*    _tiffStreamOpen(const char* name, const char* mode, void *fd);
-
-struct tiffis_data
+extern "C"
 {
-	istream	*stream;
+
+    static tmsize_t _tiffosReadProc(thandle_t, void *, tmsize_t);
+    static tmsize_t _tiffisReadProc(thandle_t fd, void *buf, tmsize_t size);
+    static tmsize_t _tiffosWriteProc(thandle_t fd, void *buf, tmsize_t size);
+    static tmsize_t _tiffisWriteProc(thandle_t, void *, tmsize_t);
+    static uint64_t _tiffosSeekProc(thandle_t fd, uint64_t off, int whence);
+    static uint64_t _tiffisSeekProc(thandle_t fd, uint64_t off, int whence);
+    static uint64_t _tiffosSizeProc(thandle_t fd);
+    static uint64_t _tiffisSizeProc(thandle_t fd);
+    static int _tiffosCloseProc(thandle_t fd);
+    static int _tiffisCloseProc(thandle_t fd);
+    static int _tiffDummyMapProc(thandle_t, void **base, toff_t *size);
+    static void _tiffDummyUnmapProc(thandle_t, void *base, toff_t size);
+    static TIFF *_tiffStreamOpen(const char *name, const char *mode, void *fd);
+
+    struct tiffis_data
+    {
+        istream *stream;
         ios::pos_type start_pos;
-};
+    };
 
-struct tiffos_data
-{
-	ostream	*stream;
-	ios::pos_type start_pos;
-};
+    struct tiffos_data
+    {
+        ostream *stream;
+        ios::pos_type start_pos;
+    };
 
-static tmsize_t
-_tiffosReadProc(thandle_t, void*, tmsize_t)
-{
-        return 0;
-}
+    static tmsize_t _tiffosReadProc(thandle_t, void *, tmsize_t) { return 0; }
 
-static tmsize_t
-_tiffisReadProc(thandle_t fd, void* buf, tmsize_t size)
-{
-        tiffis_data	*data = reinterpret_cast<tiffis_data *>(fd);
+    static tmsize_t _tiffisReadProc(thandle_t fd, void *buf, tmsize_t size)
+    {
+        tiffis_data *data = reinterpret_cast<tiffis_data *>(fd);
 
         // Verify that type does not overflow.
         streamsize request_size = size;
         if (static_cast<tmsize_t>(request_size) != size)
-          return static_cast<tmsize_t>(-1);
+            return static_cast<tmsize_t>(-1);
 
-        data->stream->read((char *) buf, request_size);
+        data->stream->read((char *)buf, request_size);
 
         return static_cast<tmsize_t>(data->stream->gcount());
-}
+    }
 
-static tmsize_t
-_tiffosWriteProc(thandle_t fd, void* buf, tmsize_t size)
-{
-	tiffos_data	*data = reinterpret_cast<tiffos_data *>(fd);
-	ostream		*os = data->stream;
-	ios::pos_type	pos = os->tellp();
+    static tmsize_t _tiffosWriteProc(thandle_t fd, void *buf, tmsize_t size)
+    {
+        tiffos_data *data = reinterpret_cast<tiffos_data *>(fd);
+        ostream *os = data->stream;
+        ios::pos_type pos = os->tellp();
 
         // Verify that type does not overflow.
         streamsize request_size = size;
         if (static_cast<tmsize_t>(request_size) != size)
-          return static_cast<tmsize_t>(-1);
-
-	os->write(reinterpret_cast<const char *>(buf), request_size);
-
-	return static_cast<tmsize_t>(os->tellp() - pos);
-}
-
-static tmsize_t
-_tiffisWriteProc(thandle_t, void*, tmsize_t)
-{
-	return 0;
-}
-
-static uint64
-_tiffosSeekProc(thandle_t fd, uint64 off, int whence)
-{
-	tiffos_data	*data = reinterpret_cast<tiffos_data *>(fd);
-	ostream		*os = data->stream;
-
-	// if the stream has already failed, don't do anything
-	if( os->fail() )
-		return static_cast<uint64>(-1);
-
-	switch(whence) {
-	case SEEK_SET:
-		{
-			// Compute 64-bit offset
-			uint64 new_offset = static_cast<uint64>(data->start_pos) + off;
-
-			// Verify that value does not overflow
-			ios::off_type offset = static_cast<ios::off_type>(new_offset);
-			if (static_cast<uint64>(offset) != new_offset)
-				return static_cast<uint64>(-1);
-			
-			os->seekp(offset, ios::beg);
-		break;
-		}
-	case SEEK_CUR:
-		{
-			// Verify that value does not overflow
-			ios::off_type offset = static_cast<ios::off_type>(off);
-			if (static_cast<uint64>(offset) != off)
-				return static_cast<uint64>(-1);
-
-			os->seekp(offset, ios::cur);
-			break;
-		}
-	case SEEK_END:
-		{
-			// Verify that value does not overflow
-			ios::off_type offset = static_cast<ios::off_type>(off);
-			if (static_cast<uint64>(offset) != off)
-				return static_cast<uint64>(-1);
-
-			os->seekp(offset, ios::end);
-			break;
-		}
-	}
-
-	// Attempt to workaround problems with seeking past the end of the
-	// stream.  ofstream doesn't have a problem with this but
-	// ostrstream/ostringstream does. In that situation, add intermediate
-	// '\0' characters.
-	if( os->fail() ) {
-#ifdef __VMS
-		int		old_state;
-#else
-		ios::iostate	old_state;
-#endif
-		ios::pos_type	origin;
-
-		old_state = os->rdstate();
-		// reset the fail bit or else tellp() won't work below
-		os->clear(os->rdstate() & ~ios::failbit);
-		switch( whence ) {
-			case SEEK_SET:
-                        default:
-				origin = data->start_pos;
-				break;
-			case SEEK_CUR:
-				origin = os->tellp();
-				break;
-			case SEEK_END:
-				os->seekp(0, ios::end);
-				origin = os->tellp();
-				break;
-		}
-		// restore original stream state
-		os->clear(old_state);	
-
-		// only do something if desired seek position is valid
-		if( (static_cast<uint64>(origin) + off) > static_cast<uint64>(data->start_pos) ) {
-			uint64	num_fill;
-
-			// clear the fail bit 
-			os->clear(os->rdstate() & ~ios::failbit);
-
-			// extend the stream to the expected size
-			os->seekp(0, ios::end);
-			num_fill = (static_cast<uint64>(origin)) + off - os->tellp();
-			for( uint64 i = 0; i < num_fill; i++ )
-				os->put('\0');
-
-			// retry the seek
-			os->seekp(static_cast<ios::off_type>(static_cast<uint64>(origin) + off), ios::beg);
-		}
-	}
-
-	return static_cast<uint64>(os->tellp());
-}
-
-static uint64
-_tiffisSeekProc(thandle_t fd, uint64 off, int whence)
-{
-	tiffis_data	*data = reinterpret_cast<tiffis_data *>(fd);
-
-	switch(whence) {
-	case SEEK_SET:
-		{
-			// Compute 64-bit offset
-			uint64 new_offset = static_cast<uint64>(data->start_pos) + off;
-			
-			// Verify that value does not overflow
-			ios::off_type offset = static_cast<ios::off_type>(new_offset);
-			if (static_cast<uint64>(offset) != new_offset)
-				return static_cast<uint64>(-1);
-
-			data->stream->seekg(offset, ios::beg);
-			break;
-		}
-	case SEEK_CUR:
-		{
-			// Verify that value does not overflow
-			ios::off_type offset = static_cast<ios::off_type>(off);
-			if (static_cast<uint64>(offset) != off)
-				return static_cast<uint64>(-1);
-
-			data->stream->seekg(offset, ios::cur);
-			break;
-		}
-	case SEEK_END:
-		{
-			// Verify that value does not overflow
-			ios::off_type offset = static_cast<ios::off_type>(off);
-			if (static_cast<uint64>(offset) != off)
-				return static_cast<uint64>(-1);
-
-			data->stream->seekg(offset, ios::end);
-			break;
-		}
-	}
-
-	return (uint64) (data->stream->tellg() - data->start_pos);
-}
-
-static uint64
-_tiffosSizeProc(thandle_t fd)
-{
-	tiffos_data	*data = reinterpret_cast<tiffos_data *>(fd);
-	ostream		*os = data->stream;
-	ios::pos_type	pos = os->tellp();
-	ios::pos_type	len;
-
-	os->seekp(0, ios::end);
-	len = os->tellp();
-	os->seekp(pos);
-
-	return (uint64) len;
-}
-
-static uint64
-_tiffisSizeProc(thandle_t fd)
-{
-	tiffis_data	*data = reinterpret_cast<tiffis_data *>(fd);
-	ios::pos_type	pos = data->stream->tellg();
-	ios::pos_type	len;
-
-	data->stream->seekg(0, ios::end);
-	len = data->stream->tellg();
-	data->stream->seekg(pos);
-
-	return (uint64) len;
-}
-
-static int
-_tiffosCloseProc(thandle_t fd)
-{
-	// Our stream was not allocated by us, so it shouldn't be closed by us.
-	delete reinterpret_cast<tiffos_data *>(fd);
-	return 0;
-}
-
-static int
-_tiffisCloseProc(thandle_t fd)
-{
-	// Our stream was not allocated by us, so it shouldn't be closed by us.
-	delete reinterpret_cast<tiffis_data *>(fd);
-	return 0;
-}
-
-static int
-_tiffDummyMapProc(thandle_t , void** base, toff_t* size )
-{
-	(void) base;
-	(void) size;
-	return (0);
-}
-
-static void
-_tiffDummyUnmapProc(thandle_t , void* base, toff_t size )
-{
-	(void) base;
-	(void) size;
-}
+            return static_cast<tmsize_t>(-1);
+
+        os->write(reinterpret_cast<const char *>(buf), request_size);
+
+        return static_cast<tmsize_t>(os->tellp() - pos);
+    }
+
+    static tmsize_t _tiffisWriteProc(thandle_t, void *, tmsize_t) { return 0; }
+
+    static uint64_t _tiffosSeekProc(thandle_t fd, uint64_t off, int whence)
+    {
+        tiffos_data *data = reinterpret_cast<tiffos_data *>(fd);
+        ostream *os = data->stream;
+
+        // if the stream has already failed, don't do anything
+        if (os->fail())
+            return static_cast<uint64_t>(-1);
+
+        switch (whence)
+        {
+            case SEEK_SET:
+            {
+                // Compute 64-bit offset
+                uint64_t new_offset =
+                    static_cast<uint64_t>(data->start_pos) + off;
+
+                // Verify that value does not overflow
+                ios::off_type offset = static_cast<ios::off_type>(new_offset);
+                if (static_cast<uint64_t>(offset) != new_offset)
+                    return static_cast<uint64_t>(-1);
+
+                os->seekp(offset, ios::beg);
+                break;
+            }
+            case SEEK_CUR:
+            {
+                // Verify that value does not overflow
+                ios::off_type offset = static_cast<ios::off_type>(off);
+                if (static_cast<uint64_t>(offset) != off)
+                    return static_cast<uint64_t>(-1);
+
+                os->seekp(offset, ios::cur);
+                break;
+            }
+            case SEEK_END:
+            {
+                // Verify that value does not overflow
+                ios::off_type offset = static_cast<ios::off_type>(off);
+                if (static_cast<uint64_t>(offset) != off)
+                    return static_cast<uint64_t>(-1);
+
+                os->seekp(offset, ios::end);
+                break;
+            }
+        }
+
+        // Attempt to workaround problems with seeking past the end of the
+        // stream.  ofstream doesn't have a problem with this but
+        // ostrstream/ostringstream does. In that situation, add intermediate
+        // '\0' characters.
+        if (os->fail())
+        {
+            ios::iostate old_state;
+            ios::pos_type origin;
+
+            old_state = os->rdstate();
+            // reset the fail bit or else tellp() won't work below
+            os->clear(os->rdstate() & ~ios::failbit);
+            switch (whence)
+            {
+                case SEEK_SET:
+                default:
+                    origin = data->start_pos;
+                    break;
+                case SEEK_CUR:
+                    origin = os->tellp();
+                    break;
+                case SEEK_END:
+                    os->seekp(0, ios::end);
+                    origin = os->tellp();
+                    break;
+            }
+            // restore original stream state
+            os->clear(old_state);
+
+            // only do something if desired seek position is valid
+            if ((static_cast<uint64_t>(origin) + off) >
+                static_cast<uint64_t>(data->start_pos))
+            {
+                uint64_t num_fill;
+
+                // clear the fail bit
+                os->clear(os->rdstate() & ~ios::failbit);
+
+                // extend the stream to the expected size
+                os->seekp(0, ios::end);
+                num_fill = (static_cast<uint64_t>(origin)) + off - os->tellp();
+                for (uint64_t i = 0; i < num_fill; i++)
+                    os->put('\0');
+
+                // retry the seek
+                os->seekp(static_cast<ios::off_type>(
+                              static_cast<uint64_t>(origin) + off),
+                          ios::beg);
+            }
+        }
+
+        return static_cast<uint64_t>(os->tellp());
+    }
+
+    static uint64_t _tiffisSeekProc(thandle_t fd, uint64_t off, int whence)
+    {
+        tiffis_data *data = reinterpret_cast<tiffis_data *>(fd);
+
+        switch (whence)
+        {
+            case SEEK_SET:
+            {
+                // Compute 64-bit offset
+                uint64_t new_offset =
+                    static_cast<uint64_t>(data->start_pos) + off;
+
+                // Verify that value does not overflow
+                ios::off_type offset = static_cast<ios::off_type>(new_offset);
+                if (static_cast<uint64_t>(offset) != new_offset)
+                    return static_cast<uint64_t>(-1);
+
+                data->stream->seekg(offset, ios::beg);
+                break;
+            }
+            case SEEK_CUR:
+            {
+                // Verify that value does not overflow
+                ios::off_type offset = static_cast<ios::off_type>(off);
+                if (static_cast<uint64_t>(offset) != off)
+                    return static_cast<uint64_t>(-1);
+
+                data->stream->seekg(offset, ios::cur);
+                break;
+            }
+            case SEEK_END:
+            {
+                // Verify that value does not overflow
+                ios::off_type offset = static_cast<ios::off_type>(off);
+                if (static_cast<uint64_t>(offset) != off)
+                    return static_cast<uint64_t>(-1);
+
+                data->stream->seekg(offset, ios::end);
+                break;
+            }
+        }
+
+        return (uint64_t)(data->stream->tellg() - data->start_pos);
+    }
+
+    static uint64_t _tiffosSizeProc(thandle_t fd)
+    {
+        tiffos_data *data = reinterpret_cast<tiffos_data *>(fd);
+        ostream *os = data->stream;
+        ios::pos_type pos = os->tellp();
+        ios::pos_type len;
+
+        os->seekp(0, ios::end);
+        len = os->tellp();
+        os->seekp(pos);
+
+        return (uint64_t)len;
+    }
+
+    static uint64_t _tiffisSizeProc(thandle_t fd)
+    {
+        tiffis_data *data = reinterpret_cast<tiffis_data *>(fd);
+        ios::pos_type pos = data->stream->tellg();
+        ios::pos_type len;
+
+        data->stream->seekg(0, ios::end);
+        len = data->stream->tellg();
+        data->stream->seekg(pos);
+
+        return (uint64_t)len;
+    }
+
+    static int _tiffosCloseProc(thandle_t fd)
+    {
+        // Our stream was not allocated by us, so it shouldn't be closed by us.
+        delete reinterpret_cast<tiffos_data *>(fd);
+        return 0;
+    }
 
-/*
- * Open a TIFF file descriptor for read/writing.
- */
-static TIFF*
-_tiffStreamOpen(const char* name, const char* mode, void *fd)
-{
-	TIFF*	tif;
-
-	if( strchr(mode, 'w') ) {
-		tiffos_data	*data = new tiffos_data;
-		data->stream = reinterpret_cast<ostream *>(fd);
-		data->start_pos = data->stream->tellp();
-
-		// Open for writing.
-		tif = TIFFClientOpen(name, mode,
-				reinterpret_cast<thandle_t>(data),
-				_tiffosReadProc,
-                                _tiffosWriteProc,
-				_tiffosSeekProc,
-                                _tiffosCloseProc,
-				_tiffosSizeProc,
-				_tiffDummyMapProc,
-                                _tiffDummyUnmapProc);
-		if (!tif) {
-			delete data;
-		}
-	} else {
-		tiffis_data	*data = new tiffis_data;
-		data->stream = reinterpret_cast<istream *>(fd);
-		data->start_pos = data->stream->tellg();
-		// Open for reading.
-		tif = TIFFClientOpen(name, mode,
-				reinterpret_cast<thandle_t>(data),
-				_tiffisReadProc,
-                                _tiffisWriteProc,
-				_tiffisSeekProc,
-                                _tiffisCloseProc,
-				_tiffisSizeProc,
-				_tiffDummyMapProc,
-                                _tiffDummyUnmapProc);
-		if (!tif) {
-			delete data;
-		}
-	}
-
-	return (tif);
-}
+    static int _tiffisCloseProc(thandle_t fd)
+    {
+        // Our stream was not allocated by us, so it shouldn't be closed by us.
+        delete reinterpret_cast<tiffis_data *>(fd);
+        return 0;
+    }
+
+    static int _tiffDummyMapProc(thandle_t, void **base, toff_t *size)
+    {
+        (void)base;
+        (void)size;
+        return (0);
+    }
+
+    static void _tiffDummyUnmapProc(thandle_t, void *base, toff_t size)
+    {
+        (void)base;
+        (void)size;
+    }
+
+    /*
+     * Open a TIFF file descriptor for read/writing.
+     */
+    static TIFF *_tiffStreamOpen(const char *name, const char *mode, void *fd)
+    {
+        TIFF *tif;
+
+        if (strchr(mode, 'w'))
+        {
+            tiffos_data *data = new tiffos_data;
+            data->stream = reinterpret_cast<ostream *>(fd);
+            data->start_pos = data->stream->tellp();
+
+            // Open for writing.
+            tif = TIFFClientOpen(
+                name, mode, reinterpret_cast<thandle_t>(data), _tiffosReadProc,
+                _tiffosWriteProc, _tiffosSeekProc, _tiffosCloseProc,
+                _tiffosSizeProc, _tiffDummyMapProc, _tiffDummyUnmapProc);
+            if (!tif)
+            {
+                delete data;
+            }
+        }
+        else
+        {
+            tiffis_data *data = new tiffis_data;
+            data->stream = reinterpret_cast<istream *>(fd);
+            data->start_pos = data->stream->tellg();
+            // Open for reading.
+            tif = TIFFClientOpen(
+                name, mode, reinterpret_cast<thandle_t>(data), _tiffisReadProc,
+                _tiffisWriteProc, _tiffisSeekProc, _tiffisCloseProc,
+                _tiffisSizeProc, _tiffDummyMapProc, _tiffDummyUnmapProc);
+            if (!tif)
+            {
+                delete data;
+            }
+        }
+
+        return (tif);
+    }
 
 } /* extern "C" */
 
-TIFF*
-TIFFStreamOpen(const char* name, ostream *os)
+TIFF *TIFFStreamOpen(const char *name, ostream *os)
 {
-	// If os is either a ostrstream or ostringstream, and has no data
-	// written to it yet, then tellp() will return -1 which will break us.
-	// We workaround this by writing out a dummy character and
-	// then seek back to the beginning.
-	if( !os->fail() && static_cast<int>(os->tellp()) < 0 ) {
-		*os << '\0';
-		os->seekp(0);
-	}
-
-	// NB: We don't support mapped files with streams so add 'm'
-	return _tiffStreamOpen(name, "wm", os);
+    // If os is either a ostrstream or ostringstream, and has no data
+    // written to it yet, then tellp() will return -1 which will break us.
+    // We workaround this by writing out a dummy character and
+    // then seek back to the beginning.
+    if (!os->fail() && static_cast<int>(os->tellp()) < 0)
+    {
+        *os << '\0';
+        os->seekp(0);
+    }
+
+    // NB: We don't support mapped files with streams so add 'm'
+    return _tiffStreamOpen(name, "wm", os);
 }
 
-TIFF*
-TIFFStreamOpen(const char* name, istream *is)
+TIFF *TIFFStreamOpen(const char *name, istream *is)
 {
-	// NB: We don't support mapped files with streams so add 'm'
-	return _tiffStreamOpen(name, "rm", is);
+    // NB: We don't support mapped files with streams so add 'm'
+    return _tiffStreamOpen(name, "rm", is);
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
-
diff --git a/3rdparty/libtiff/tif_strip.c b/3rdparty/libtiff/tif_strip.c
index c08c60a7928d..820a2544c387 100644
--- a/3rdparty/libtiff/tif_strip.c
+++ b/3rdparty/libtiff/tif_strip.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1991-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -32,153 +32,145 @@
 /*
  * Compute which strip a (row,sample) value is in.
  */
-uint32
-TIFFComputeStrip(TIFF* tif, uint32 row, uint16 sample)
+uint32_t TIFFComputeStrip(TIFF *tif, uint32_t row, uint16_t sample)
 {
-	static const char module[] = "TIFFComputeStrip";
-	TIFFDirectory *td = &tif->tif_dir;
-	uint32 strip;
+    static const char module[] = "TIFFComputeStrip";
+    TIFFDirectory *td = &tif->tif_dir;
+    uint32_t strip;
 
-	strip = row / td->td_rowsperstrip;
-	if (td->td_planarconfig == PLANARCONFIG_SEPARATE) {
-		if (sample >= td->td_samplesperpixel) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "%lu: Sample out of range, max %lu",
-			    (unsigned long) sample, (unsigned long) td->td_samplesperpixel);
-			return (0);
-		}
-		strip += (uint32)sample*td->td_stripsperimage;
-	}
-	return (strip);
+    strip = row / td->td_rowsperstrip;
+    if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
+    {
+        if (sample >= td->td_samplesperpixel)
+        {
+            TIFFErrorExtR(tif, module, "%lu: Sample out of range, max %lu",
+                          (unsigned long)sample,
+                          (unsigned long)td->td_samplesperpixel);
+            return (0);
+        }
+        strip += (uint32_t)sample * td->td_stripsperimage;
+    }
+    return (strip);
 }
 
 /*
  * Compute how many strips are in an image.
  */
-uint32
-TIFFNumberOfStrips(TIFF* tif)
+uint32_t TIFFNumberOfStrips(TIFF *tif)
 {
-	TIFFDirectory *td = &tif->tif_dir;
-	uint32 nstrips;
+    TIFFDirectory *td = &tif->tif_dir;
+    uint32_t nstrips;
 
-	nstrips = (td->td_rowsperstrip == (uint32) -1 ? 1 :
-	     TIFFhowmany_32(td->td_imagelength, td->td_rowsperstrip));
-	if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
-		nstrips = _TIFFMultiply32(tif, nstrips, (uint32)td->td_samplesperpixel,
-		    "TIFFNumberOfStrips");
-	return (nstrips);
+    nstrips = (td->td_rowsperstrip == (uint32_t)-1
+                   ? 1
+                   : TIFFhowmany_32(td->td_imagelength, td->td_rowsperstrip));
+    if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
+        nstrips =
+            _TIFFMultiply32(tif, nstrips, (uint32_t)td->td_samplesperpixel,
+                            "TIFFNumberOfStrips");
+    return (nstrips);
 }
 
 /*
  * Compute the # bytes in a variable height, row-aligned strip.
  */
-uint64
-TIFFVStripSize64(TIFF* tif, uint32 nrows)
+uint64_t TIFFVStripSize64(TIFF *tif, uint32_t nrows)
 {
-	static const char module[] = "TIFFVStripSize64";
-	TIFFDirectory *td = &tif->tif_dir;
-	if (nrows==(uint32)(-1))
-		nrows=td->td_imagelength;
-	if ((td->td_planarconfig==PLANARCONFIG_CONTIG)&&
-	    (td->td_photometric == PHOTOMETRIC_YCBCR)&&
-	    (!isUpSampled(tif)))
-	{
-		/*
-		 * Packed YCbCr data contain one Cb+Cr for every
-		 * HorizontalSampling*VerticalSampling Y values.
-		 * Must also roundup width and height when calculating
-		 * since images that are not a multiple of the
-		 * horizontal/vertical subsampling area include
-		 * YCbCr data for the extended image.
-		 */
-		uint16 ycbcrsubsampling[2];
-		uint16 samplingblock_samples;
-		uint32 samplingblocks_hor;
-		uint32 samplingblocks_ver;
-		uint64 samplingrow_samples;
-		uint64 samplingrow_size;
-		if(td->td_samplesperpixel!=3)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,
-			    "Invalid td_samplesperpixel value");
-			return 0;
-		}
-		TIFFGetFieldDefaulted(tif,TIFFTAG_YCBCRSUBSAMPLING,ycbcrsubsampling+0,
-		    ycbcrsubsampling+1);
-		if ((ycbcrsubsampling[0] != 1 && ycbcrsubsampling[0] != 2 && ycbcrsubsampling[0] != 4)
-		    ||(ycbcrsubsampling[1] != 1 && ycbcrsubsampling[1] != 2 && ycbcrsubsampling[1] != 4))
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,
-				     "Invalid YCbCr subsampling (%dx%d)", 
-				     ycbcrsubsampling[0], 
-				     ycbcrsubsampling[1] );
-			return 0;
-		}
-		samplingblock_samples=ycbcrsubsampling[0]*ycbcrsubsampling[1]+2;
-		samplingblocks_hor=TIFFhowmany_32(td->td_imagewidth,ycbcrsubsampling[0]);
-		samplingblocks_ver=TIFFhowmany_32(nrows,ycbcrsubsampling[1]);
-		samplingrow_samples=_TIFFMultiply64(tif,samplingblocks_hor,samplingblock_samples,module);
-		samplingrow_size=TIFFhowmany8_64(_TIFFMultiply64(tif,samplingrow_samples,td->td_bitspersample,module));
-		return(_TIFFMultiply64(tif,samplingrow_size,samplingblocks_ver,module));
-	}
-	else
-		return(_TIFFMultiply64(tif,nrows,TIFFScanlineSize64(tif),module));
+    static const char module[] = "TIFFVStripSize64";
+    TIFFDirectory *td = &tif->tif_dir;
+    if (nrows == (uint32_t)(-1))
+        nrows = td->td_imagelength;
+    if ((td->td_planarconfig == PLANARCONFIG_CONTIG) &&
+        (td->td_photometric == PHOTOMETRIC_YCBCR) && (!isUpSampled(tif)))
+    {
+        /*
+         * Packed YCbCr data contain one Cb+Cr for every
+         * HorizontalSampling*VerticalSampling Y values.
+         * Must also roundup width and height when calculating
+         * since images that are not a multiple of the
+         * horizontal/vertical subsampling area include
+         * YCbCr data for the extended image.
+         */
+        uint16_t ycbcrsubsampling[2];
+        uint16_t samplingblock_samples;
+        uint32_t samplingblocks_hor;
+        uint32_t samplingblocks_ver;
+        uint64_t samplingrow_samples;
+        uint64_t samplingrow_size;
+        if (td->td_samplesperpixel != 3)
+        {
+            TIFFErrorExtR(tif, module, "Invalid td_samplesperpixel value");
+            return 0;
+        }
+        TIFFGetFieldDefaulted(tif, TIFFTAG_YCBCRSUBSAMPLING,
+                              ycbcrsubsampling + 0, ycbcrsubsampling + 1);
+        if ((ycbcrsubsampling[0] != 1 && ycbcrsubsampling[0] != 2 &&
+             ycbcrsubsampling[0] != 4) ||
+            (ycbcrsubsampling[1] != 1 && ycbcrsubsampling[1] != 2 &&
+             ycbcrsubsampling[1] != 4))
+        {
+            TIFFErrorExtR(tif, module, "Invalid YCbCr subsampling (%dx%d)",
+                          ycbcrsubsampling[0], ycbcrsubsampling[1]);
+            return 0;
+        }
+        samplingblock_samples = ycbcrsubsampling[0] * ycbcrsubsampling[1] + 2;
+        samplingblocks_hor =
+            TIFFhowmany_32(td->td_imagewidth, ycbcrsubsampling[0]);
+        samplingblocks_ver = TIFFhowmany_32(nrows, ycbcrsubsampling[1]);
+        samplingrow_samples = _TIFFMultiply64(tif, samplingblocks_hor,
+                                              samplingblock_samples, module);
+        samplingrow_size = TIFFhowmany8_64(_TIFFMultiply64(
+            tif, samplingrow_samples, td->td_bitspersample, module));
+        return (
+            _TIFFMultiply64(tif, samplingrow_size, samplingblocks_ver, module));
+    }
+    else
+        return (_TIFFMultiply64(tif, nrows, TIFFScanlineSize64(tif), module));
 }
-tmsize_t
-TIFFVStripSize(TIFF* tif, uint32 nrows)
+tmsize_t TIFFVStripSize(TIFF *tif, uint32_t nrows)
 {
-	static const char module[] = "TIFFVStripSize";
-	uint64 m;
-	m=TIFFVStripSize64(tif,nrows);
-        return _TIFFCastUInt64ToSSize(tif, m, module);
+    static const char module[] = "TIFFVStripSize";
+    uint64_t m;
+    m = TIFFVStripSize64(tif, nrows);
+    return _TIFFCastUInt64ToSSize(tif, m, module);
 }
 
 /*
  * Compute the # bytes in a raw strip.
  */
-uint64
-TIFFRawStripSize64(TIFF* tif, uint32 strip)
+uint64_t TIFFRawStripSize64(TIFF *tif, uint32_t strip)
 {
-	static const char module[] = "TIFFRawStripSize64";
-	uint64 bytecount = TIFFGetStrileByteCount(tif, strip);
+    static const char module[] = "TIFFRawStripSize64";
+    uint64_t bytecount = TIFFGetStrileByteCount(tif, strip);
 
-	if (bytecount == 0)
-	{
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "%I64u: Invalid strip byte count, strip %lu",
-			     (unsigned __int64) bytecount,
-			     (unsigned long) strip);
-#else
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "%llu: Invalid strip byte count, strip %lu",
-			     (unsigned long long) bytecount,
-			     (unsigned long) strip);
-#endif
-		bytecount = (uint64) -1;
-	}
+    if (bytecount == 0)
+    {
+        TIFFErrorExtR(tif, module,
+                      "%" PRIu64 ": Invalid strip byte count, strip %lu",
+                      (uint64_t)bytecount, (unsigned long)strip);
+        bytecount = (uint64_t)-1;
+    }
 
-	return bytecount;
+    return bytecount;
 }
-tmsize_t
-TIFFRawStripSize(TIFF* tif, uint32 strip)
+tmsize_t TIFFRawStripSize(TIFF *tif, uint32_t strip)
 {
-	static const char module[] = "TIFFRawStripSize";
-	uint64 m;
-	tmsize_t n;
-	m=TIFFRawStripSize64(tif,strip);
-	if (m==(uint64)(-1))
-		n=(tmsize_t)(-1);
-	else
-	{
-		n=(tmsize_t)m;
-		if ((uint64)n!=m)
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,"Integer overflow");
-			n=0;
-		}
-	}
-	return(n);
+    static const char module[] = "TIFFRawStripSize";
+    uint64_t m;
+    tmsize_t n;
+    m = TIFFRawStripSize64(tif, strip);
+    if (m == (uint64_t)(-1))
+        n = (tmsize_t)(-1);
+    else
+    {
+        n = (tmsize_t)m;
+        if ((uint64_t)n != m)
+        {
+            TIFFErrorExtR(tif, module, "Integer overflow");
+            n = 0;
+        }
+    }
+    return (n);
 }
 
 /*
@@ -189,22 +181,20 @@ TIFFRawStripSize(TIFF* tif, uint32 strip)
  * truncated to reflect the actual space required
  * to hold the strip.
  */
-uint64
-TIFFStripSize64(TIFF* tif)
+uint64_t TIFFStripSize64(TIFF *tif)
 {
-	TIFFDirectory* td = &tif->tif_dir;
-	uint32 rps = td->td_rowsperstrip;
-	if (rps > td->td_imagelength)
-		rps = td->td_imagelength;
-	return (TIFFVStripSize64(tif, rps));
+    TIFFDirectory *td = &tif->tif_dir;
+    uint32_t rps = td->td_rowsperstrip;
+    if (rps > td->td_imagelength)
+        rps = td->td_imagelength;
+    return (TIFFVStripSize64(tif, rps));
 }
-tmsize_t
-TIFFStripSize(TIFF* tif)
+tmsize_t TIFFStripSize(TIFF *tif)
 {
-	static const char module[] = "TIFFStripSize";
-	uint64 m;
-	m=TIFFStripSize64(tif);
-	return _TIFFCastUInt64ToSSize(tif, m, module);
+    static const char module[] = "TIFFStripSize";
+    uint64_t m;
+    m = TIFFStripSize64(tif);
+    return _TIFFCastUInt64ToSSize(tif, m, module);
 }
 
 /*
@@ -213,34 +203,33 @@ TIFFStripSize(TIFF* tif)
  * request is <1 then we choose a strip size according
  * to certain heuristics.
  */
-uint32
-TIFFDefaultStripSize(TIFF* tif, uint32 request)
+uint32_t TIFFDefaultStripSize(TIFF *tif, uint32_t request)
 {
-	return (*tif->tif_defstripsize)(tif, request);
+    return (*tif->tif_defstripsize)(tif, request);
 }
 
-uint32
-_TIFFDefaultStripSize(TIFF* tif, uint32 s)
+uint32_t _TIFFDefaultStripSize(TIFF *tif, uint32_t s)
 {
-	if ((int32) s < 1) {
-		/*
-		 * If RowsPerStrip is unspecified, try to break the
-		 * image up into strips that are approximately
-		 * STRIP_SIZE_DEFAULT bytes long.
-		 */
-		uint64 scanlinesize;
-		uint64 rows;
-		scanlinesize=TIFFScanlineSize64(tif);
-		if (scanlinesize==0)
-			scanlinesize=1;
-		rows=(uint64)STRIP_SIZE_DEFAULT/scanlinesize;
-		if (rows==0)
-			rows=1;
-		else if (rows>0xFFFFFFFF)
-			rows=0xFFFFFFFF;
-		s=(uint32)rows;
-	}
-	return (s);
+    if ((int32_t)s < 1)
+    {
+        /*
+         * If RowsPerStrip is unspecified, try to break the
+         * image up into strips that are approximately
+         * STRIP_SIZE_DEFAULT bytes long.
+         */
+        uint64_t scanlinesize;
+        uint64_t rows;
+        scanlinesize = TIFFScanlineSize64(tif);
+        if (scanlinesize == 0)
+            scanlinesize = 1;
+        rows = (uint64_t)STRIP_SIZE_DEFAULT / scanlinesize;
+        if (rows == 0)
+            rows = 1;
+        else if (rows > 0xFFFFFFFF)
+            rows = 0xFFFFFFFF;
+        s = (uint32_t)rows;
+    }
+    return (s);
 }
 
 /*
@@ -253,70 +242,79 @@ _TIFFDefaultStripSize(TIFF* tif, uint32 s)
  * subsampling lines divided by vertical subsampling. It should thus make
  * sense when multiplied by a multiple of vertical subsampling.
  */
-uint64
-TIFFScanlineSize64(TIFF* tif)
+uint64_t TIFFScanlineSize64(TIFF *tif)
 {
-	static const char module[] = "TIFFScanlineSize64";
-	TIFFDirectory *td = &tif->tif_dir;
-	uint64 scanline_size;
-	if (td->td_planarconfig==PLANARCONFIG_CONTIG)
-	{
-		if ((td->td_photometric==PHOTOMETRIC_YCBCR)&&
-		    (td->td_samplesperpixel==3)&&
-		    (!isUpSampled(tif)))
-		{
-			uint16 ycbcrsubsampling[2];
-			uint16 samplingblock_samples;
-			uint32 samplingblocks_hor;
-			uint64 samplingrow_samples;
-			uint64 samplingrow_size;
-			if(td->td_samplesperpixel!=3)
-			{
-                            TIFFErrorExt(tif->tif_clientdata,module,
-                                         "Invalid td_samplesperpixel value");
-                            return 0;
-			}
-			TIFFGetFieldDefaulted(tif,TIFFTAG_YCBCRSUBSAMPLING,
-                                              ycbcrsubsampling+0,
-                                              ycbcrsubsampling+1);
-			if (((ycbcrsubsampling[0]!=1)&&(ycbcrsubsampling[0]!=2)&&(ycbcrsubsampling[0]!=4)) ||
-			    ((ycbcrsubsampling[1]!=1)&&(ycbcrsubsampling[1]!=2)&&(ycbcrsubsampling[1]!=4)))
-			{
-                            TIFFErrorExt(tif->tif_clientdata,module,
-                                         "Invalid YCbCr subsampling");
-                            return 0;
-			}
-			samplingblock_samples = ycbcrsubsampling[0]*ycbcrsubsampling[1]+2;
-			samplingblocks_hor = TIFFhowmany_32(td->td_imagewidth,ycbcrsubsampling[0]);
-			samplingrow_samples = _TIFFMultiply64(tif,samplingblocks_hor,samplingblock_samples,module);
-			samplingrow_size = TIFFhowmany_64(_TIFFMultiply64(tif,samplingrow_samples,td->td_bitspersample,module),8);
-			scanline_size = (samplingrow_size/ycbcrsubsampling[1]);
-		}
-		else
-		{
-			uint64 scanline_samples;
-			scanline_samples=_TIFFMultiply64(tif,td->td_imagewidth,td->td_samplesperpixel,module);
-			scanline_size=TIFFhowmany_64(_TIFFMultiply64(tif,scanline_samples,td->td_bitspersample,module),8);
-		}
-	}
-	else
+    static const char module[] = "TIFFScanlineSize64";
+    TIFFDirectory *td = &tif->tif_dir;
+    uint64_t scanline_size;
+    if (td->td_planarconfig == PLANARCONFIG_CONTIG)
+    {
+        if ((td->td_photometric == PHOTOMETRIC_YCBCR) &&
+            (td->td_samplesperpixel == 3) && (!isUpSampled(tif)))
         {
-		scanline_size=TIFFhowmany_64(_TIFFMultiply64(tif,td->td_imagewidth,td->td_bitspersample,module),8);
+            uint16_t ycbcrsubsampling[2];
+            uint16_t samplingblock_samples;
+            uint32_t samplingblocks_hor;
+            uint64_t samplingrow_samples;
+            uint64_t samplingrow_size;
+            if (td->td_samplesperpixel != 3)
+            {
+                TIFFErrorExtR(tif, module, "Invalid td_samplesperpixel value");
+                return 0;
+            }
+            TIFFGetFieldDefaulted(tif, TIFFTAG_YCBCRSUBSAMPLING,
+                                  ycbcrsubsampling + 0, ycbcrsubsampling + 1);
+            if (((ycbcrsubsampling[0] != 1) && (ycbcrsubsampling[0] != 2) &&
+                 (ycbcrsubsampling[0] != 4)) ||
+                ((ycbcrsubsampling[1] != 1) && (ycbcrsubsampling[1] != 2) &&
+                 (ycbcrsubsampling[1] != 4)))
+            {
+                TIFFErrorExtR(tif, module, "Invalid YCbCr subsampling");
+                return 0;
+            }
+            samplingblock_samples =
+                ycbcrsubsampling[0] * ycbcrsubsampling[1] + 2;
+            samplingblocks_hor =
+                TIFFhowmany_32(td->td_imagewidth, ycbcrsubsampling[0]);
+            samplingrow_samples = _TIFFMultiply64(
+                tif, samplingblocks_hor, samplingblock_samples, module);
+            samplingrow_size =
+                TIFFhowmany_64(_TIFFMultiply64(tif, samplingrow_samples,
+                                               td->td_bitspersample, module),
+                               8);
+            scanline_size = (samplingrow_size / ycbcrsubsampling[1]);
         }
-        if (scanline_size == 0)
+        else
         {
-                TIFFErrorExt(tif->tif_clientdata,module,"Computed scanline size is zero");
-                return 0;
+            uint64_t scanline_samples;
+            scanline_samples = _TIFFMultiply64(tif, td->td_imagewidth,
+                                               td->td_samplesperpixel, module);
+            scanline_size =
+                TIFFhowmany_64(_TIFFMultiply64(tif, scanline_samples,
+                                               td->td_bitspersample, module),
+                               8);
         }
-	return(scanline_size);
+    }
+    else
+    {
+        scanline_size =
+            TIFFhowmany_64(_TIFFMultiply64(tif, td->td_imagewidth,
+                                           td->td_bitspersample, module),
+                           8);
+    }
+    if (scanline_size == 0)
+    {
+        TIFFErrorExtR(tif, module, "Computed scanline size is zero");
+        return 0;
+    }
+    return (scanline_size);
 }
-tmsize_t
-TIFFScanlineSize(TIFF* tif)
+tmsize_t TIFFScanlineSize(TIFF *tif)
 {
-	static const char module[] = "TIFFScanlineSize";
-	uint64 m;
-	m=TIFFScanlineSize64(tif);
-	return _TIFFCastUInt64ToSSize(tif, m, module);
+    static const char module[] = "TIFFScanlineSize";
+    uint64_t m;
+    m = TIFFScanlineSize64(tif);
+    return _TIFFCastUInt64ToSSize(tif, m, module);
 }
 
 /*
@@ -325,35 +323,28 @@ TIFFScanlineSize(TIFF* tif)
  * I/O size returned by TIFFScanlineSize which may be less
  * if data is store as separate planes).
  */
-uint64
-TIFFRasterScanlineSize64(TIFF* tif)
+uint64_t TIFFRasterScanlineSize64(TIFF *tif)
 {
-	static const char module[] = "TIFFRasterScanlineSize64";
-	TIFFDirectory *td = &tif->tif_dir;
-	uint64 scanline;
+    static const char module[] = "TIFFRasterScanlineSize64";
+    TIFFDirectory *td = &tif->tif_dir;
+    uint64_t scanline;
 
-	scanline = _TIFFMultiply64(tif, td->td_bitspersample, td->td_imagewidth, module);
-	if (td->td_planarconfig == PLANARCONFIG_CONTIG) {
-		scanline = _TIFFMultiply64(tif, scanline, td->td_samplesperpixel, module);
-		return (TIFFhowmany8_64(scanline));
-	} else
-		return (_TIFFMultiply64(tif, TIFFhowmany8_64(scanline),
-		    td->td_samplesperpixel, module));
+    scanline =
+        _TIFFMultiply64(tif, td->td_bitspersample, td->td_imagewidth, module);
+    if (td->td_planarconfig == PLANARCONFIG_CONTIG)
+    {
+        scanline =
+            _TIFFMultiply64(tif, scanline, td->td_samplesperpixel, module);
+        return (TIFFhowmany8_64(scanline));
+    }
+    else
+        return (_TIFFMultiply64(tif, TIFFhowmany8_64(scanline),
+                                td->td_samplesperpixel, module));
 }
-tmsize_t
-TIFFRasterScanlineSize(TIFF* tif)
+tmsize_t TIFFRasterScanlineSize(TIFF *tif)
 {
-	static const char module[] = "TIFFRasterScanlineSize";
-	uint64 m;
-	m=TIFFRasterScanlineSize64(tif);
-	return _TIFFCastUInt64ToSSize(tif, m, module);
+    static const char module[] = "TIFFRasterScanlineSize";
+    uint64_t m;
+    m = TIFFRasterScanlineSize64(tif);
+    return _TIFFCastUInt64ToSSize(tif, m, module);
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_swab.c b/3rdparty/libtiff/tif_swab.c
index b174ba69c02a..827b025ce7ae 100644
--- a/3rdparty/libtiff/tif_swab.c
+++ b/3rdparty/libtiff/tif_swab.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -30,169 +30,218 @@
 #include "tiffiop.h"
 
 #if defined(DISABLE_CHECK_TIFFSWABMACROS) || !defined(TIFFSwabShort)
-void
-TIFFSwabShort(uint16* wp)
+void TIFFSwabShort(uint16_t *wp)
 {
-	register unsigned char* cp = (unsigned char*) wp;
-	unsigned char t;
-	assert(sizeof(uint16)==2);
-	t = cp[1]; cp[1] = cp[0]; cp[0] = t;
+    register unsigned char *cp = (unsigned char *)wp;
+    unsigned char t;
+    assert(sizeof(uint16_t) == 2);
+    t = cp[1];
+    cp[1] = cp[0];
+    cp[0] = t;
 }
 #endif
 
 #if defined(DISABLE_CHECK_TIFFSWABMACROS) || !defined(TIFFSwabLong)
-void
-TIFFSwabLong(uint32* lp)
+void TIFFSwabLong(uint32_t *lp)
 {
-	register unsigned char* cp = (unsigned char*) lp;
-	unsigned char t;
-	assert(sizeof(uint32)==4);
-	t = cp[3]; cp[3] = cp[0]; cp[0] = t;
-	t = cp[2]; cp[2] = cp[1]; cp[1] = t;
+    register unsigned char *cp = (unsigned char *)lp;
+    unsigned char t;
+    assert(sizeof(uint32_t) == 4);
+    t = cp[3];
+    cp[3] = cp[0];
+    cp[0] = t;
+    t = cp[2];
+    cp[2] = cp[1];
+    cp[1] = t;
 }
 #endif
 
 #if defined(DISABLE_CHECK_TIFFSWABMACROS) || !defined(TIFFSwabLong8)
-void
-TIFFSwabLong8(uint64* lp)
+void TIFFSwabLong8(uint64_t *lp)
 {
-	register unsigned char* cp = (unsigned char*) lp;
-	unsigned char t;
-	assert(sizeof(uint64)==8);
-	t = cp[7]; cp[7] = cp[0]; cp[0] = t;
-	t = cp[6]; cp[6] = cp[1]; cp[1] = t;
-	t = cp[5]; cp[5] = cp[2]; cp[2] = t;
-	t = cp[4]; cp[4] = cp[3]; cp[3] = t;
+    register unsigned char *cp = (unsigned char *)lp;
+    unsigned char t;
+    assert(sizeof(uint64_t) == 8);
+    t = cp[7];
+    cp[7] = cp[0];
+    cp[0] = t;
+    t = cp[6];
+    cp[6] = cp[1];
+    cp[1] = t;
+    t = cp[5];
+    cp[5] = cp[2];
+    cp[2] = t;
+    t = cp[4];
+    cp[4] = cp[3];
+    cp[3] = t;
 }
 #endif
 
 #if defined(DISABLE_CHECK_TIFFSWABMACROS) || !defined(TIFFSwabArrayOfShort)
-void
-TIFFSwabArrayOfShort(register uint16* wp, tmsize_t n)
+void TIFFSwabArrayOfShort(register uint16_t *wp, tmsize_t n)
 {
-	register unsigned char* cp;
-	register unsigned char t;
-	assert(sizeof(uint16)==2);
-	/* XXX unroll loop some */
-	while (n-- > 0) {
-		cp = (unsigned char*) wp;
-		t = cp[1]; cp[1] = cp[0]; cp[0] = t;
-		wp++;
-	}
+    register unsigned char *cp;
+    register unsigned char t;
+    assert(sizeof(uint16_t) == 2);
+    /* XXX unroll loop some */
+    while (n-- > 0)
+    {
+        cp = (unsigned char *)wp;
+        t = cp[1];
+        cp[1] = cp[0];
+        cp[0] = t;
+        wp++;
+    }
 }
 #endif
 
 #if defined(DISABLE_CHECK_TIFFSWABMACROS) || !defined(TIFFSwabArrayOfTriples)
-void
-TIFFSwabArrayOfTriples(register uint8* tp, tmsize_t n)
+void TIFFSwabArrayOfTriples(register uint8_t *tp, tmsize_t n)
 {
-	unsigned char* cp;
-	unsigned char t;
+    unsigned char *cp;
+    unsigned char t;
 
-	/* XXX unroll loop some */
-	while (n-- > 0) {
-		cp = (unsigned char*) tp;
-		t = cp[2]; cp[2] = cp[0]; cp[0] = t;
-		tp += 3;
-	}
+    /* XXX unroll loop some */
+    while (n-- > 0)
+    {
+        cp = (unsigned char *)tp;
+        t = cp[2];
+        cp[2] = cp[0];
+        cp[0] = t;
+        tp += 3;
+    }
 }
 #endif
 
 #if defined(DISABLE_CHECK_TIFFSWABMACROS) || !defined(TIFFSwabArrayOfLong)
-void
-TIFFSwabArrayOfLong(register uint32* lp, tmsize_t n)
+void TIFFSwabArrayOfLong(register uint32_t *lp, tmsize_t n)
 {
-	register unsigned char *cp;
-	register unsigned char t;
-	assert(sizeof(uint32)==4);
-	/* XXX unroll loop some */
-	while (n-- > 0) {
-		cp = (unsigned char *)lp;
-		t = cp[3]; cp[3] = cp[0]; cp[0] = t;
-		t = cp[2]; cp[2] = cp[1]; cp[1] = t;
-		lp++;
-	}
+    register unsigned char *cp;
+    register unsigned char t;
+    assert(sizeof(uint32_t) == 4);
+    /* XXX unroll loop some */
+    while (n-- > 0)
+    {
+        cp = (unsigned char *)lp;
+        t = cp[3];
+        cp[3] = cp[0];
+        cp[0] = t;
+        t = cp[2];
+        cp[2] = cp[1];
+        cp[1] = t;
+        lp++;
+    }
 }
 #endif
 
 #if defined(DISABLE_CHECK_TIFFSWABMACROS) || !defined(TIFFSwabArrayOfLong8)
-void
-TIFFSwabArrayOfLong8(register uint64* lp, tmsize_t n)
+void TIFFSwabArrayOfLong8(register uint64_t *lp, tmsize_t n)
 {
-	register unsigned char *cp;
-	register unsigned char t;
-	assert(sizeof(uint64)==8);
-	/* XXX unroll loop some */
-	while (n-- > 0) {
-		cp = (unsigned char *)lp;
-		t = cp[7]; cp[7] = cp[0]; cp[0] = t;
-		t = cp[6]; cp[6] = cp[1]; cp[1] = t;
-		t = cp[5]; cp[5] = cp[2]; cp[2] = t;
-		t = cp[4]; cp[4] = cp[3]; cp[3] = t;
-		lp++;
-	}
+    register unsigned char *cp;
+    register unsigned char t;
+    assert(sizeof(uint64_t) == 8);
+    /* XXX unroll loop some */
+    while (n-- > 0)
+    {
+        cp = (unsigned char *)lp;
+        t = cp[7];
+        cp[7] = cp[0];
+        cp[0] = t;
+        t = cp[6];
+        cp[6] = cp[1];
+        cp[1] = t;
+        t = cp[5];
+        cp[5] = cp[2];
+        cp[2] = t;
+        t = cp[4];
+        cp[4] = cp[3];
+        cp[3] = t;
+        lp++;
+    }
 }
 #endif
 
 #if defined(DISABLE_CHECK_TIFFSWABMACROS) || !defined(TIFFSwabFloat)
-void
-TIFFSwabFloat(float* fp)
+void TIFFSwabFloat(float *fp)
 {
-	register unsigned char* cp = (unsigned char*) fp;
-	unsigned char t;
-	assert(sizeof(float)==4);
-	t = cp[3]; cp[3] = cp[0]; cp[0] = t;
-	t = cp[2]; cp[2] = cp[1]; cp[1] = t;
+    register unsigned char *cp = (unsigned char *)fp;
+    unsigned char t;
+    assert(sizeof(float) == 4);
+    t = cp[3];
+    cp[3] = cp[0];
+    cp[0] = t;
+    t = cp[2];
+    cp[2] = cp[1];
+    cp[1] = t;
 }
 #endif
 
 #if defined(DISABLE_CHECK_TIFFSWABMACROS) || !defined(TIFFSwabArrayOfFloat)
-void
-TIFFSwabArrayOfFloat(register float* fp, tmsize_t n)
+void TIFFSwabArrayOfFloat(register float *fp, tmsize_t n)
 {
-	register unsigned char *cp;
-	register unsigned char t;
-	assert(sizeof(float)==4);
-	/* XXX unroll loop some */
-	while (n-- > 0) {
-		cp = (unsigned char *)fp;
-		t = cp[3]; cp[3] = cp[0]; cp[0] = t;
-		t = cp[2]; cp[2] = cp[1]; cp[1] = t;
-		fp++;
-	}
+    register unsigned char *cp;
+    register unsigned char t;
+    assert(sizeof(float) == 4);
+    /* XXX unroll loop some */
+    while (n-- > 0)
+    {
+        cp = (unsigned char *)fp;
+        t = cp[3];
+        cp[3] = cp[0];
+        cp[0] = t;
+        t = cp[2];
+        cp[2] = cp[1];
+        cp[1] = t;
+        fp++;
+    }
 }
 #endif
 
 #if defined(DISABLE_CHECK_TIFFSWABMACROS) || !defined(TIFFSwabDouble)
-void
-TIFFSwabDouble(double *dp)
+void TIFFSwabDouble(double *dp)
 {
-	register unsigned char* cp = (unsigned char*) dp;
-	unsigned char t;
-	assert(sizeof(double)==8);
-	t = cp[7]; cp[7] = cp[0]; cp[0] = t;
-	t = cp[6]; cp[6] = cp[1]; cp[1] = t;
-	t = cp[5]; cp[5] = cp[2]; cp[2] = t;
-	t = cp[4]; cp[4] = cp[3]; cp[3] = t;
+    register unsigned char *cp = (unsigned char *)dp;
+    unsigned char t;
+    assert(sizeof(double) == 8);
+    t = cp[7];
+    cp[7] = cp[0];
+    cp[0] = t;
+    t = cp[6];
+    cp[6] = cp[1];
+    cp[1] = t;
+    t = cp[5];
+    cp[5] = cp[2];
+    cp[2] = t;
+    t = cp[4];
+    cp[4] = cp[3];
+    cp[3] = t;
 }
 #endif
 
 #if defined(DISABLE_CHECK_TIFFSWABMACROS) || !defined(TIFFSwabArrayOfDouble)
-void
-TIFFSwabArrayOfDouble(double* dp, tmsize_t n)
+void TIFFSwabArrayOfDouble(double *dp, tmsize_t n)
 {
-	register unsigned char *cp;
-	register unsigned char t;
-	assert(sizeof(double)==8);
-	/* XXX unroll loop some */
-	while (n-- > 0) {
-		cp = (unsigned char *)dp;
-		t = cp[7]; cp[7] = cp[0]; cp[0] = t;
-		t = cp[6]; cp[6] = cp[1]; cp[1] = t;
-		t = cp[5]; cp[5] = cp[2]; cp[2] = t;
-		t = cp[4]; cp[4] = cp[3]; cp[3] = t;
-		dp++;
-	}
+    register unsigned char *cp;
+    register unsigned char t;
+    assert(sizeof(double) == 8);
+    /* XXX unroll loop some */
+    while (n-- > 0)
+    {
+        cp = (unsigned char *)dp;
+        t = cp[7];
+        cp[7] = cp[0];
+        cp[0] = t;
+        t = cp[6];
+        cp[6] = cp[1];
+        cp[1] = t;
+        t = cp[5];
+        cp[5] = cp[2];
+        cp[2] = t;
+        t = cp[4];
+        cp[4] = cp[3];
+        cp[3] = t;
+        dp++;
+    }
 }
 #endif
 
@@ -206,105 +255,75 @@ TIFFSwabArrayOfDouble(double* dp, tmsize_t n)
  * do not reverse bit values.
  */
 static const unsigned char TIFFBitRevTable[256] = {
-    0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
-    0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
-    0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
-    0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
-    0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
-    0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
-    0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
-    0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
-    0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
-    0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
-    0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
-    0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
-    0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
-    0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
-    0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
-    0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
-    0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
-    0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
-    0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
-    0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
-    0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
-    0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
-    0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
-    0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
-    0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
-    0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
-    0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
-    0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
-    0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
-    0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
-    0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
-    0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
-};
+    0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0,
+    0x30, 0xb0, 0x70, 0xf0, 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
+    0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, 0x04, 0x84, 0x44, 0xc4,
+    0x24, 0xa4, 0x64, 0xe4, 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
+    0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 0x1c, 0x9c, 0x5c, 0xdc,
+    0x3c, 0xbc, 0x7c, 0xfc, 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
+    0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, 0x0a, 0x8a, 0x4a, 0xca,
+    0x2a, 0xaa, 0x6a, 0xea, 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
+    0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 0x16, 0x96, 0x56, 0xd6,
+    0x36, 0xb6, 0x76, 0xf6, 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
+    0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, 0x01, 0x81, 0x41, 0xc1,
+    0x21, 0xa1, 0x61, 0xe1, 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
+    0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 0x19, 0x99, 0x59, 0xd9,
+    0x39, 0xb9, 0x79, 0xf9, 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
+    0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, 0x0d, 0x8d, 0x4d, 0xcd,
+    0x2d, 0xad, 0x6d, 0xed, 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
+    0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 0x13, 0x93, 0x53, 0xd3,
+    0x33, 0xb3, 0x73, 0xf3, 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
+    0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, 0x07, 0x87, 0x47, 0xc7,
+    0x27, 0xa7, 0x67, 0xe7, 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
+    0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 0x1f, 0x9f, 0x5f, 0xdf,
+    0x3f, 0xbf, 0x7f, 0xff};
 static const unsigned char TIFFNoBitRevTable[256] = {
-    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 
-    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 
-    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 
-    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 
-    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 
-    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 
-    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 
-    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 
-    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 
-    0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 
-    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 
-    0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 
-    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 
-    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 
-    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 
-    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 
-    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 
-    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 
-    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 
-    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 
-    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 
-    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 
-    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 
-    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 
-    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 
-    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 
-    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 
-    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 
-    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 
-    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 
-    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 
-    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
+    0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
+    0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
+    0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+    0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53,
+    0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
+    0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
+    0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b,
+    0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3,
+    0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
+    0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3,
+    0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb,
+    0xfc, 0xfd, 0xfe, 0xff,
 };
 
-const unsigned char*
-TIFFGetBitRevTable(int reversed)
+const unsigned char *TIFFGetBitRevTable(int reversed)
 {
-	return (reversed ? TIFFBitRevTable : TIFFNoBitRevTable);
+    return (reversed ? TIFFBitRevTable : TIFFNoBitRevTable);
 }
 
-void
-TIFFReverseBits(uint8* cp, tmsize_t n)  
+void TIFFReverseBits(uint8_t *cp, tmsize_t n)
 {
-	for (; n > 8; n -= 8) {
-		cp[0] = TIFFBitRevTable[cp[0]];
-		cp[1] = TIFFBitRevTable[cp[1]];
-		cp[2] = TIFFBitRevTable[cp[2]];
-		cp[3] = TIFFBitRevTable[cp[3]];
-		cp[4] = TIFFBitRevTable[cp[4]];
-		cp[5] = TIFFBitRevTable[cp[5]];
-		cp[6] = TIFFBitRevTable[cp[6]];
-		cp[7] = TIFFBitRevTable[cp[7]];
-		cp += 8;
-	}
-	while (n-- > 0) {
-		*cp = TIFFBitRevTable[*cp];
-		cp++;
-	}
+    for (; n > 8; n -= 8)
+    {
+        cp[0] = TIFFBitRevTable[cp[0]];
+        cp[1] = TIFFBitRevTable[cp[1]];
+        cp[2] = TIFFBitRevTable[cp[2]];
+        cp[3] = TIFFBitRevTable[cp[3]];
+        cp[4] = TIFFBitRevTable[cp[4]];
+        cp[5] = TIFFBitRevTable[cp[5]];
+        cp[6] = TIFFBitRevTable[cp[6]];
+        cp[7] = TIFFBitRevTable[cp[7]];
+        cp += 8;
+    }
+    while (n-- > 0)
+    {
+        *cp = TIFFBitRevTable[*cp];
+        cp++;
+    }
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_thunder.c b/3rdparty/libtiff/tif_thunder.c
index db6383a81aec..1f97362ca39d 100644
--- a/3rdparty/libtiff/tif_thunder.c
+++ b/3rdparty/libtiff/tif_thunder.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -41,166 +41,158 @@
  * or 3-bit delta values are used, with the deltas packed
  * into a single byte.
  */
-#define	THUNDER_DATA		0x3f	/* mask for 6-bit data */
-#define	THUNDER_CODE		0xc0	/* mask for 2-bit code word */
+#define THUNDER_DATA 0x3f /* mask for 6-bit data */
+#define THUNDER_CODE 0xc0 /* mask for 2-bit code word */
 /* code values */
-#define	THUNDER_RUN		0x00	/* run of pixels w/ encoded count */
-#define	THUNDER_2BITDELTAS	0x40	/* 3 pixels w/ encoded 2-bit deltas */
-#define	    DELTA2_SKIP		2	/* skip code for 2-bit deltas */
-#define	THUNDER_3BITDELTAS	0x80	/* 2 pixels w/ encoded 3-bit deltas */
-#define	    DELTA3_SKIP		4	/* skip code for 3-bit deltas */
-#define	THUNDER_RAW		0xc0	/* raw data encoded */
+#define THUNDER_RUN 0x00        /* run of pixels w/ encoded count */
+#define THUNDER_2BITDELTAS 0x40 /* 3 pixels w/ encoded 2-bit deltas */
+#define DELTA2_SKIP 2           /* skip code for 2-bit deltas */
+#define THUNDER_3BITDELTAS 0x80 /* 2 pixels w/ encoded 3-bit deltas */
+#define DELTA3_SKIP 4           /* skip code for 3-bit deltas */
+#define THUNDER_RAW 0xc0        /* raw data encoded */
 
-static const int twobitdeltas[4] = { 0, 1, 0, -1 };
-static const int threebitdeltas[8] = { 0, 1, 2, 3, 0, -3, -2, -1 };
+static const int twobitdeltas[4] = {0, 1, 0, -1};
+static const int threebitdeltas[8] = {0, 1, 2, 3, 0, -3, -2, -1};
 
-#define	SETPIXEL(op, v) {                     \
-	lastpixel = (v) & 0xf;                \
-        if ( npixels < maxpixels )         \
-        {                                     \
-	  if (npixels++ & 1)                  \
-	    *op++ |= lastpixel;               \
-	  else                                \
-	    op[0] = (uint8) (lastpixel << 4); \
-        }                                     \
-}
+#define SETPIXEL(op, v)                                                        \
+    {                                                                          \
+        lastpixel = (v)&0xf;                                                   \
+        if (npixels < maxpixels)                                               \
+        {                                                                      \
+            if (npixels++ & 1)                                                 \
+                *op++ |= lastpixel;                                            \
+            else                                                               \
+                op[0] = (uint8_t)(lastpixel << 4);                             \
+        }                                                                      \
+    }
 
-static int
-ThunderSetupDecode(TIFF* tif)
+static int ThunderSetupDecode(TIFF *tif)
 {
-	static const char module[] = "ThunderSetupDecode";
+    static const char module[] = "ThunderSetupDecode";
 
-        if( tif->tif_dir.td_bitspersample != 4 )
-        {
-                TIFFErrorExt(tif->tif_clientdata, module,
-                             "Wrong bitspersample value (%d), Thunder decoder only supports 4bits per sample.",
-                             (int) tif->tif_dir.td_bitspersample );
-                return 0;
-        }
-        
+    if (tif->tif_dir.td_bitspersample != 4)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Wrong bitspersample value (%d), Thunder decoder only "
+                      "supports 4bits per sample.",
+                      (int)tif->tif_dir.td_bitspersample);
+        return 0;
+    }
 
-	return (1);
+    return (1);
 }
 
-static int
-ThunderDecode(TIFF* tif, uint8* op, tmsize_t maxpixels)
+static int ThunderDecode(TIFF *tif, uint8_t *op, tmsize_t maxpixels)
 {
-	static const char module[] = "ThunderDecode";
-	register unsigned char *bp;
-	register tmsize_t cc;
-	unsigned int lastpixel;
-	tmsize_t npixels;
+    static const char module[] = "ThunderDecode";
+    register unsigned char *bp;
+    register tmsize_t cc;
+    unsigned int lastpixel;
+    tmsize_t npixels;
 
-	bp = (unsigned char *)tif->tif_rawcp;
-	cc = tif->tif_rawcc;
-	lastpixel = 0;
-	npixels = 0;
-	while (cc > 0 && npixels < maxpixels) {
-		int n, delta;
+    bp = (unsigned char *)tif->tif_rawcp;
+    cc = tif->tif_rawcc;
+    lastpixel = 0;
+    npixels = 0;
+    while (cc > 0 && npixels < maxpixels)
+    {
+        int n, delta;
 
-		n = *bp++;
-		cc--;
-		switch (n & THUNDER_CODE) {
-		case THUNDER_RUN:		/* pixel run */
-			/*
-			 * Replicate the last pixel n times,
-			 * where n is the lower-order 6 bits.
-			 */
-			if (npixels & 1) {
-				op[0] |= lastpixel;
-				lastpixel = *op++; npixels++; n--;
-			} else
-				lastpixel |= lastpixel << 4;
-			npixels += n;
-			if (npixels < maxpixels) {
-				for (; n > 0; n -= 2)
-					*op++ = (uint8) lastpixel;
-			}
-			if (n == -1)
-				*--op &= 0xf0;
-			lastpixel &= 0xf;
-			break;
-		case THUNDER_2BITDELTAS:	/* 2-bit deltas */
-			if ((delta = ((n >> 4) & 3)) != DELTA2_SKIP)
-				SETPIXEL(op, (unsigned)((int)lastpixel + twobitdeltas[delta]));
-			if ((delta = ((n >> 2) & 3)) != DELTA2_SKIP)
-				SETPIXEL(op, (unsigned)((int)lastpixel + twobitdeltas[delta]));
-			if ((delta = (n & 3)) != DELTA2_SKIP)
-				SETPIXEL(op, (unsigned)((int)lastpixel + twobitdeltas[delta]));
-			break;
-		case THUNDER_3BITDELTAS:	/* 3-bit deltas */
-			if ((delta = ((n >> 3) & 7)) != DELTA3_SKIP)
-				SETPIXEL(op, (unsigned)((int)lastpixel + threebitdeltas[delta]));
-			if ((delta = (n & 7)) != DELTA3_SKIP)
-				SETPIXEL(op, (unsigned)((int)lastpixel + threebitdeltas[delta]));
-			break;
-		case THUNDER_RAW:		/* raw data */
-			SETPIXEL(op, n);
-			break;
-		}
-	}
-	tif->tif_rawcp = (uint8*) bp;
-	tif->tif_rawcc = cc;
-	if (npixels != maxpixels) {
-#if defined(__WIN32__) && (defined(_MSC_VER) || defined(__MINGW32__))
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "%s data at scanline %lu (%I64u != %I64u)",
-			     npixels < maxpixels ? "Not enough" : "Too much",
-			     (unsigned long) tif->tif_row,
-			     (unsigned __int64) npixels,
-			     (unsigned __int64) maxpixels);
-#else
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "%s data at scanline %lu (%llu != %llu)",
-			     npixels < maxpixels ? "Not enough" : "Too much",
-			     (unsigned long) tif->tif_row,
-			     (unsigned long long) npixels,
-			     (unsigned long long) maxpixels);
-#endif
-		return (0);
-	}
+        n = *bp++;
+        cc--;
+        switch (n & THUNDER_CODE)
+        {
+            case THUNDER_RUN: /* pixel run */
+                /*
+                 * Replicate the last pixel n times,
+                 * where n is the lower-order 6 bits.
+                 */
+                if (npixels & 1)
+                {
+                    op[0] |= lastpixel;
+                    lastpixel = *op++;
+                    npixels++;
+                    n--;
+                }
+                else
+                    lastpixel |= lastpixel << 4;
+                npixels += n;
+                if (npixels < maxpixels)
+                {
+                    for (; n > 0; n -= 2)
+                        *op++ = (uint8_t)lastpixel;
+                }
+                if (n == -1)
+                    *--op &= 0xf0;
+                lastpixel &= 0xf;
+                break;
+            case THUNDER_2BITDELTAS: /* 2-bit deltas */
+                if ((delta = ((n >> 4) & 3)) != DELTA2_SKIP)
+                    SETPIXEL(op,
+                             (unsigned)((int)lastpixel + twobitdeltas[delta]));
+                if ((delta = ((n >> 2) & 3)) != DELTA2_SKIP)
+                    SETPIXEL(op,
+                             (unsigned)((int)lastpixel + twobitdeltas[delta]));
+                if ((delta = (n & 3)) != DELTA2_SKIP)
+                    SETPIXEL(op,
+                             (unsigned)((int)lastpixel + twobitdeltas[delta]));
+                break;
+            case THUNDER_3BITDELTAS: /* 3-bit deltas */
+                if ((delta = ((n >> 3) & 7)) != DELTA3_SKIP)
+                    SETPIXEL(
+                        op, (unsigned)((int)lastpixel + threebitdeltas[delta]));
+                if ((delta = (n & 7)) != DELTA3_SKIP)
+                    SETPIXEL(
+                        op, (unsigned)((int)lastpixel + threebitdeltas[delta]));
+                break;
+            case THUNDER_RAW: /* raw data */
+                SETPIXEL(op, n);
+                break;
+        }
+    }
+    tif->tif_rawcp = (uint8_t *)bp;
+    tif->tif_rawcc = cc;
+    if (npixels != maxpixels)
+    {
+        TIFFErrorExtR(tif, module,
+                      "%s data at scanline %lu (%" PRIu64 " != %" PRIu64 ")",
+                      npixels < maxpixels ? "Not enough" : "Too much",
+                      (unsigned long)tif->tif_row, (uint64_t)npixels,
+                      (uint64_t)maxpixels);
+        return (0);
+    }
 
-        return (1);
+    return (1);
 }
 
-static int
-ThunderDecodeRow(TIFF* tif, uint8* buf, tmsize_t occ, uint16 s)
+static int ThunderDecodeRow(TIFF *tif, uint8_t *buf, tmsize_t occ, uint16_t s)
 {
-	static const char module[] = "ThunderDecodeRow";
-	uint8* row = buf;
-	
-	(void) s;
-	if (occ % tif->tif_scanlinesize)
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Fractional scanlines cannot be read");
-		return (0);
-	}
-	while (occ > 0) {
-		if (!ThunderDecode(tif, row, tif->tif_dir.td_imagewidth))
-			return (0);
-		occ -= tif->tif_scanlinesize;
-		row += tif->tif_scanlinesize;
-	}
-	return (1);
+    static const char module[] = "ThunderDecodeRow";
+    uint8_t *row = buf;
+
+    (void)s;
+    if (occ % tif->tif_scanlinesize)
+    {
+        TIFFErrorExtR(tif, module, "Fractional scanlines cannot be read");
+        return (0);
+    }
+    while (occ > 0)
+    {
+        if (!ThunderDecode(tif, row, tif->tif_dir.td_imagewidth))
+            return (0);
+        occ -= tif->tif_scanlinesize;
+        row += tif->tif_scanlinesize;
+    }
+    return (1);
 }
 
-int
-TIFFInitThunderScan(TIFF* tif, int scheme)
+int TIFFInitThunderScan(TIFF *tif, int scheme)
 {
-	(void) scheme;
+    (void)scheme;
 
-        tif->tif_setupdecode = ThunderSetupDecode;
-	tif->tif_decoderow = ThunderDecodeRow;
-	tif->tif_decodestrip = ThunderDecodeRow; 
-	return (1);
+    tif->tif_setupdecode = ThunderSetupDecode;
+    tif->tif_decoderow = ThunderDecodeRow;
+    tif->tif_decodestrip = ThunderDecodeRow;
+    return (1);
 }
 #endif /* THUNDER_SUPPORT */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_tile.c b/3rdparty/libtiff/tif_tile.c
index 661cc771548b..f07032f731f9 100644
--- a/3rdparty/libtiff/tif_tile.c
+++ b/3rdparty/libtiff/tif_tile.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1991-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -32,234 +32,230 @@
 /*
  * Compute which tile an (x,y,z,s) value is in.
  */
-uint32
-TIFFComputeTile(TIFF* tif, uint32 x, uint32 y, uint32 z, uint16 s)
+uint32_t TIFFComputeTile(TIFF *tif, uint32_t x, uint32_t y, uint32_t z,
+                         uint16_t s)
 {
-	TIFFDirectory *td = &tif->tif_dir;
-	uint32 dx = td->td_tilewidth;
-	uint32 dy = td->td_tilelength;
-	uint32 dz = td->td_tiledepth;
-	uint32 tile = 1;
+    TIFFDirectory *td = &tif->tif_dir;
+    uint32_t dx = td->td_tilewidth;
+    uint32_t dy = td->td_tilelength;
+    uint32_t dz = td->td_tiledepth;
+    uint32_t tile = 1;
 
-	if (td->td_imagedepth == 1)
-		z = 0;
-	if (dx == (uint32) -1)
-		dx = td->td_imagewidth;
-	if (dy == (uint32) -1)
-		dy = td->td_imagelength;
-	if (dz == (uint32) -1)
-		dz = td->td_imagedepth;
-	if (dx != 0 && dy != 0 && dz != 0) {
-		uint32 xpt = TIFFhowmany_32(td->td_imagewidth, dx);
-		uint32 ypt = TIFFhowmany_32(td->td_imagelength, dy);
-		uint32 zpt = TIFFhowmany_32(td->td_imagedepth, dz);
+    if (td->td_imagedepth == 1)
+        z = 0;
+    if (dx == (uint32_t)-1)
+        dx = td->td_imagewidth;
+    if (dy == (uint32_t)-1)
+        dy = td->td_imagelength;
+    if (dz == (uint32_t)-1)
+        dz = td->td_imagedepth;
+    if (dx != 0 && dy != 0 && dz != 0)
+    {
+        uint32_t xpt = TIFFhowmany_32(td->td_imagewidth, dx);
+        uint32_t ypt = TIFFhowmany_32(td->td_imagelength, dy);
+        uint32_t zpt = TIFFhowmany_32(td->td_imagedepth, dz);
 
-		if (td->td_planarconfig == PLANARCONFIG_SEPARATE) 
-			tile = (xpt*ypt*zpt)*s +
-			     (xpt*ypt)*(z/dz) +
-			     xpt*(y/dy) +
-			     x/dx;
-		else
-			tile = (xpt*ypt)*(z/dz) + xpt*(y/dy) + x/dx;
-	}
-	return (tile);
+        if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
+            tile = (xpt * ypt * zpt) * s + (xpt * ypt) * (z / dz) +
+                   xpt * (y / dy) + x / dx;
+        else
+            tile = (xpt * ypt) * (z / dz) + xpt * (y / dy) + x / dx;
+    }
+    return (tile);
 }
 
 /*
  * Check an (x,y,z,s) coordinate
  * against the image bounds.
  */
-int
-TIFFCheckTile(TIFF* tif, uint32 x, uint32 y, uint32 z, uint16 s)
+int TIFFCheckTile(TIFF *tif, uint32_t x, uint32_t y, uint32_t z, uint16_t s)
 {
-	TIFFDirectory *td = &tif->tif_dir;
+    TIFFDirectory *td = &tif->tif_dir;
 
-	if (x >= td->td_imagewidth) {
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-			     "%lu: Col out of range, max %lu",
-			     (unsigned long) x,
-			     (unsigned long) (td->td_imagewidth - 1));
-		return (0);
-	}
-	if (y >= td->td_imagelength) {
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-			     "%lu: Row out of range, max %lu",
-			     (unsigned long) y,
-			     (unsigned long) (td->td_imagelength - 1));
-		return (0);
-	}
-	if (z >= td->td_imagedepth) {
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-			     "%lu: Depth out of range, max %lu",
-			     (unsigned long) z,
-			     (unsigned long) (td->td_imagedepth - 1));
-		return (0);
-	}
-	if (td->td_planarconfig == PLANARCONFIG_SEPARATE &&
-	    s >= td->td_samplesperpixel) {
-		TIFFErrorExt(tif->tif_clientdata, tif->tif_name,
-			     "%lu: Sample out of range, max %lu",
-			     (unsigned long) s,
-			     (unsigned long) (td->td_samplesperpixel - 1));
-		return (0);
-	}
-	return (1);
+    if (x >= td->td_imagewidth)
+    {
+        TIFFErrorExtR(tif, tif->tif_name, "%lu: Col out of range, max %lu",
+                      (unsigned long)x, (unsigned long)(td->td_imagewidth - 1));
+        return (0);
+    }
+    if (y >= td->td_imagelength)
+    {
+        TIFFErrorExtR(tif, tif->tif_name, "%lu: Row out of range, max %lu",
+                      (unsigned long)y,
+                      (unsigned long)(td->td_imagelength - 1));
+        return (0);
+    }
+    if (z >= td->td_imagedepth)
+    {
+        TIFFErrorExtR(tif, tif->tif_name, "%lu: Depth out of range, max %lu",
+                      (unsigned long)z, (unsigned long)(td->td_imagedepth - 1));
+        return (0);
+    }
+    if (td->td_planarconfig == PLANARCONFIG_SEPARATE &&
+        s >= td->td_samplesperpixel)
+    {
+        TIFFErrorExtR(tif, tif->tif_name, "%lu: Sample out of range, max %lu",
+                      (unsigned long)s,
+                      (unsigned long)(td->td_samplesperpixel - 1));
+        return (0);
+    }
+    return (1);
 }
 
 /*
  * Compute how many tiles are in an image.
  */
-uint32
-TIFFNumberOfTiles(TIFF* tif)
+uint32_t TIFFNumberOfTiles(TIFF *tif)
 {
-	TIFFDirectory *td = &tif->tif_dir;
-	uint32 dx = td->td_tilewidth;
-	uint32 dy = td->td_tilelength;
-	uint32 dz = td->td_tiledepth;
-	uint32 ntiles;
+    TIFFDirectory *td = &tif->tif_dir;
+    uint32_t dx = td->td_tilewidth;
+    uint32_t dy = td->td_tilelength;
+    uint32_t dz = td->td_tiledepth;
+    uint32_t ntiles;
 
-	if (dx == (uint32) -1)
-		dx = td->td_imagewidth;
-	if (dy == (uint32) -1)
-		dy = td->td_imagelength;
-	if (dz == (uint32) -1)
-		dz = td->td_imagedepth;
-	ntiles = (dx == 0 || dy == 0 || dz == 0) ? 0 :
-	    _TIFFMultiply32(tif, _TIFFMultiply32(tif, TIFFhowmany_32(td->td_imagewidth, dx),
-	    TIFFhowmany_32(td->td_imagelength, dy),
-	    "TIFFNumberOfTiles"),
-	    TIFFhowmany_32(td->td_imagedepth, dz), "TIFFNumberOfTiles");
-	if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
-		ntiles = _TIFFMultiply32(tif, ntiles, td->td_samplesperpixel,
-		    "TIFFNumberOfTiles");
-	return (ntiles);
+    if (dx == (uint32_t)-1)
+        dx = td->td_imagewidth;
+    if (dy == (uint32_t)-1)
+        dy = td->td_imagelength;
+    if (dz == (uint32_t)-1)
+        dz = td->td_imagedepth;
+    ntiles =
+        (dx == 0 || dy == 0 || dz == 0)
+            ? 0
+            : _TIFFMultiply32(
+                  tif,
+                  _TIFFMultiply32(tif, TIFFhowmany_32(td->td_imagewidth, dx),
+                                  TIFFhowmany_32(td->td_imagelength, dy),
+                                  "TIFFNumberOfTiles"),
+                  TIFFhowmany_32(td->td_imagedepth, dz), "TIFFNumberOfTiles");
+    if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
+        ntiles = _TIFFMultiply32(tif, ntiles, td->td_samplesperpixel,
+                                 "TIFFNumberOfTiles");
+    return (ntiles);
 }
 
 /*
  * Compute the # bytes in each row of a tile.
  */
-uint64
-TIFFTileRowSize64(TIFF* tif)
+uint64_t TIFFTileRowSize64(TIFF *tif)
 {
-        static const char module[] = "TIFFTileRowSize64";
-	TIFFDirectory *td = &tif->tif_dir;
-	uint64 rowsize;
-	uint64 tilerowsize;
+    static const char module[] = "TIFFTileRowSize64";
+    TIFFDirectory *td = &tif->tif_dir;
+    uint64_t rowsize;
+    uint64_t tilerowsize;
 
-	if (td->td_tilelength == 0)
-        {
-                TIFFErrorExt(tif->tif_clientdata,module,"Tile length is zero");
-                return 0;
-        }
-        if (td->td_tilewidth == 0)
+    if (td->td_tilelength == 0)
+    {
+        TIFFErrorExtR(tif, module, "Tile length is zero");
+        return 0;
+    }
+    if (td->td_tilewidth == 0)
+    {
+        TIFFErrorExtR(tif, module, "Tile width is zero");
+        return (0);
+    }
+    rowsize = _TIFFMultiply64(tif, td->td_bitspersample, td->td_tilewidth,
+                              "TIFFTileRowSize");
+    if (td->td_planarconfig == PLANARCONFIG_CONTIG)
+    {
+        if (td->td_samplesperpixel == 0)
         {
-                TIFFErrorExt(tif->tif_clientdata,module,"Tile width is zero");
-		return (0);
+            TIFFErrorExtR(tif, module, "Samples per pixel is zero");
+            return 0;
         }
-	rowsize = _TIFFMultiply64(tif, td->td_bitspersample, td->td_tilewidth,
-	    "TIFFTileRowSize");
-	if (td->td_planarconfig == PLANARCONFIG_CONTIG)
-        {
-                if (td->td_samplesperpixel == 0)
-                {
-                        TIFFErrorExt(tif->tif_clientdata,module,"Samples per pixel is zero");
-                        return 0;
-                }
-		rowsize = _TIFFMultiply64(tif, rowsize, td->td_samplesperpixel,
-		    "TIFFTileRowSize");
-        }
-        tilerowsize=TIFFhowmany8_64(rowsize);
-        if (tilerowsize == 0)
-        {
-                TIFFErrorExt(tif->tif_clientdata,module,"Computed tile row size is zero");
-                return 0;
-        }
-	return (tilerowsize);
+        rowsize = _TIFFMultiply64(tif, rowsize, td->td_samplesperpixel,
+                                  "TIFFTileRowSize");
+    }
+    tilerowsize = TIFFhowmany8_64(rowsize);
+    if (tilerowsize == 0)
+    {
+        TIFFErrorExtR(tif, module, "Computed tile row size is zero");
+        return 0;
+    }
+    return (tilerowsize);
 }
-tmsize_t
-TIFFTileRowSize(TIFF* tif)
+tmsize_t TIFFTileRowSize(TIFF *tif)
 {
-	static const char module[] = "TIFFTileRowSize";
-	uint64 m;
-	m=TIFFTileRowSize64(tif);
-	return _TIFFCastUInt64ToSSize(tif, m, module);
+    static const char module[] = "TIFFTileRowSize";
+    uint64_t m;
+    m = TIFFTileRowSize64(tif);
+    return _TIFFCastUInt64ToSSize(tif, m, module);
 }
 
 /*
  * Compute the # bytes in a variable length, row-aligned tile.
  */
-uint64
-TIFFVTileSize64(TIFF* tif, uint32 nrows)
+uint64_t TIFFVTileSize64(TIFF *tif, uint32_t nrows)
 {
-	static const char module[] = "TIFFVTileSize64";
-	TIFFDirectory *td = &tif->tif_dir;
-	if (td->td_tilelength == 0 || td->td_tilewidth == 0 ||
-	    td->td_tiledepth == 0)
-		return (0);
-	if ((td->td_planarconfig==PLANARCONFIG_CONTIG)&&
-	    (td->td_photometric==PHOTOMETRIC_YCBCR)&&
-	    (td->td_samplesperpixel==3)&&
-	    (!isUpSampled(tif)))
-	{
-		/*
-		 * Packed YCbCr data contain one Cb+Cr for every
-		 * HorizontalSampling*VerticalSampling Y values.
-		 * Must also roundup width and height when calculating
-		 * since images that are not a multiple of the
-		 * horizontal/vertical subsampling area include
-		 * YCbCr data for the extended image.
-		 */
-		uint16 ycbcrsubsampling[2];
-		uint16 samplingblock_samples;
-		uint32 samplingblocks_hor;
-		uint32 samplingblocks_ver;
-		uint64 samplingrow_samples;
-		uint64 samplingrow_size;
-		TIFFGetFieldDefaulted(tif,TIFFTAG_YCBCRSUBSAMPLING,ycbcrsubsampling+0,
-		    ycbcrsubsampling+1);
-		if ((ycbcrsubsampling[0] != 1 && ycbcrsubsampling[0] != 2 && ycbcrsubsampling[0] != 4)
-		    ||(ycbcrsubsampling[1] != 1 && ycbcrsubsampling[1] != 2 && ycbcrsubsampling[1] != 4))
-		{
-			TIFFErrorExt(tif->tif_clientdata,module,
-				     "Invalid YCbCr subsampling (%dx%d)", 
-				     ycbcrsubsampling[0], 
-				     ycbcrsubsampling[1] );
-			return 0;
-		}
-		samplingblock_samples=ycbcrsubsampling[0]*ycbcrsubsampling[1]+2;
-		samplingblocks_hor=TIFFhowmany_32(td->td_tilewidth,ycbcrsubsampling[0]);
-		samplingblocks_ver=TIFFhowmany_32(nrows,ycbcrsubsampling[1]);
-		samplingrow_samples=_TIFFMultiply64(tif,samplingblocks_hor,samplingblock_samples,module);
-		samplingrow_size=TIFFhowmany8_64(_TIFFMultiply64(tif,samplingrow_samples,td->td_bitspersample,module));
-		return(_TIFFMultiply64(tif,samplingrow_size,samplingblocks_ver,module));
-	}
-	else
-		return(_TIFFMultiply64(tif,nrows,TIFFTileRowSize64(tif),module));
+    static const char module[] = "TIFFVTileSize64";
+    TIFFDirectory *td = &tif->tif_dir;
+    if (td->td_tilelength == 0 || td->td_tilewidth == 0 ||
+        td->td_tiledepth == 0)
+        return (0);
+    if ((td->td_planarconfig == PLANARCONFIG_CONTIG) &&
+        (td->td_photometric == PHOTOMETRIC_YCBCR) &&
+        (td->td_samplesperpixel == 3) && (!isUpSampled(tif)))
+    {
+        /*
+         * Packed YCbCr data contain one Cb+Cr for every
+         * HorizontalSampling*VerticalSampling Y values.
+         * Must also roundup width and height when calculating
+         * since images that are not a multiple of the
+         * horizontal/vertical subsampling area include
+         * YCbCr data for the extended image.
+         */
+        uint16_t ycbcrsubsampling[2];
+        uint16_t samplingblock_samples;
+        uint32_t samplingblocks_hor;
+        uint32_t samplingblocks_ver;
+        uint64_t samplingrow_samples;
+        uint64_t samplingrow_size;
+        TIFFGetFieldDefaulted(tif, TIFFTAG_YCBCRSUBSAMPLING,
+                              ycbcrsubsampling + 0, ycbcrsubsampling + 1);
+        if ((ycbcrsubsampling[0] != 1 && ycbcrsubsampling[0] != 2 &&
+             ycbcrsubsampling[0] != 4) ||
+            (ycbcrsubsampling[1] != 1 && ycbcrsubsampling[1] != 2 &&
+             ycbcrsubsampling[1] != 4))
+        {
+            TIFFErrorExtR(tif, module, "Invalid YCbCr subsampling (%dx%d)",
+                          ycbcrsubsampling[0], ycbcrsubsampling[1]);
+            return 0;
+        }
+        samplingblock_samples = ycbcrsubsampling[0] * ycbcrsubsampling[1] + 2;
+        samplingblocks_hor =
+            TIFFhowmany_32(td->td_tilewidth, ycbcrsubsampling[0]);
+        samplingblocks_ver = TIFFhowmany_32(nrows, ycbcrsubsampling[1]);
+        samplingrow_samples = _TIFFMultiply64(tif, samplingblocks_hor,
+                                              samplingblock_samples, module);
+        samplingrow_size = TIFFhowmany8_64(_TIFFMultiply64(
+            tif, samplingrow_samples, td->td_bitspersample, module));
+        return (
+            _TIFFMultiply64(tif, samplingrow_size, samplingblocks_ver, module));
+    }
+    else
+        return (_TIFFMultiply64(tif, nrows, TIFFTileRowSize64(tif), module));
 }
-tmsize_t
-TIFFVTileSize(TIFF* tif, uint32 nrows)
+tmsize_t TIFFVTileSize(TIFF *tif, uint32_t nrows)
 {
-	static const char module[] = "TIFFVTileSize";
-	uint64 m;
-	m=TIFFVTileSize64(tif,nrows);
-	return _TIFFCastUInt64ToSSize(tif, m, module);
+    static const char module[] = "TIFFVTileSize";
+    uint64_t m;
+    m = TIFFVTileSize64(tif, nrows);
+    return _TIFFCastUInt64ToSSize(tif, m, module);
 }
 
 /*
  * Compute the # bytes in a row-aligned tile.
  */
-uint64
-TIFFTileSize64(TIFF* tif)
+uint64_t TIFFTileSize64(TIFF *tif)
 {
-	return (TIFFVTileSize64(tif, tif->tif_dir.td_tilelength));
+    return (TIFFVTileSize64(tif, tif->tif_dir.td_tilelength));
 }
-tmsize_t
-TIFFTileSize(TIFF* tif)
+tmsize_t TIFFTileSize(TIFF *tif)
 {
-	static const char module[] = "TIFFTileSize";
-	uint64 m;
-	m=TIFFTileSize64(tif);
-	return _TIFFCastUInt64ToSSize(tif, m, module);
+    static const char module[] = "TIFFTileSize";
+    uint64_t m;
+    m = TIFFTileSize64(tif);
+    return _TIFFCastUInt64ToSSize(tif, m, module);
 }
 
 /*
@@ -268,32 +264,21 @@ TIFFTileSize(TIFF* tif)
  * request is <1 then we choose a size according
  * to certain heuristics.
  */
-void
-TIFFDefaultTileSize(TIFF* tif, uint32* tw, uint32* th)
+void TIFFDefaultTileSize(TIFF *tif, uint32_t *tw, uint32_t *th)
 {
-	(*tif->tif_deftilesize)(tif, tw, th);
+    (*tif->tif_deftilesize)(tif, tw, th);
 }
 
-void
-_TIFFDefaultTileSize(TIFF* tif, uint32* tw, uint32* th)
+void _TIFFDefaultTileSize(TIFF *tif, uint32_t *tw, uint32_t *th)
 {
-	(void) tif;
-	if (*(int32*) tw < 1)
-		*tw = 256;
-	if (*(int32*) th < 1)
-		*th = 256;
-	/* roundup to a multiple of 16 per the spec */
-	if (*tw & 0xf)
-		*tw = TIFFroundup_32(*tw, 16);
-	if (*th & 0xf)
-		*th = TIFFroundup_32(*th, 16);
+    (void)tif;
+    if (*(int32_t *)tw < 1)
+        *tw = 256;
+    if (*(int32_t *)th < 1)
+        *th = 256;
+    /* roundup to a multiple of 16 per the spec */
+    if (*tw & 0xf)
+        *tw = TIFFroundup_32(*tw, 16);
+    if (*th & 0xf)
+        *th = TIFFroundup_32(*th, 16);
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_unix.c b/3rdparty/libtiff/tif_unix.c
index bea1ef780210..34dd53b98c9a 100644
--- a/3rdparty/libtiff/tif_unix.c
+++ b/3rdparty/libtiff/tif_unix.c
@@ -30,7 +30,7 @@
 #include "tif_config.h"
 
 #ifdef HAVE_SYS_TYPES_H
-# include <sys/types.h>
+#include <sys/types.h>
 #endif
 
 #include <errno.h>
@@ -40,215 +40,222 @@
 #include <sys/stat.h>
 
 #ifdef HAVE_UNISTD_H
-# include <unistd.h>
+#include <unistd.h>
 #endif
 
 #ifdef HAVE_FCNTL_H
-# include <fcntl.h>
+#include <fcntl.h>
 #endif
 
 #ifdef HAVE_IO_H
-# include <io.h>
+#include <io.h>
 #endif
 
 #include "tiffiop.h"
 
-
 #define TIFF_IO_MAX 2147483647U
 
-
 typedef union fd_as_handle_union
 {
-	int fd;
-	thandle_t h;
+    int fd;
+    thandle_t h;
 } fd_as_handle_union_t;
 
-static tmsize_t
-_tiffReadProc(thandle_t fd, void* buf, tmsize_t size)
+static tmsize_t _tiffReadProc(thandle_t fd, void *buf, tmsize_t size)
 {
-	fd_as_handle_union_t fdh;
-        const size_t bytes_total = (size_t) size;
-        size_t bytes_read;
-        tmsize_t count = -1;
-	if ((tmsize_t) bytes_total != size)
-	{
-		errno=EINVAL;
-		return (tmsize_t) -1;
-	}
-	fdh.h = fd;
-        for (bytes_read=0; bytes_read < bytes_total; bytes_read+=count)
-        {
-                char *buf_offset = (char *) buf+bytes_read;
-                size_t io_size = bytes_total-bytes_read;
-                if (io_size > TIFF_IO_MAX)
-                        io_size = TIFF_IO_MAX;
-                count=read(fdh.fd, buf_offset, (TIFFIOSize_t) io_size);
-                if (count <= 0)
-                        break;
-        }
-        if (count < 0)
-                return (tmsize_t)-1;
-        return (tmsize_t) bytes_read;
+    fd_as_handle_union_t fdh;
+    const size_t bytes_total = (size_t)size;
+    size_t bytes_read;
+    tmsize_t count = -1;
+    if ((tmsize_t)bytes_total != size)
+    {
+        errno = EINVAL;
+        return (tmsize_t)-1;
+    }
+    fdh.h = fd;
+    for (bytes_read = 0; bytes_read < bytes_total; bytes_read += count)
+    {
+        char *buf_offset = (char *)buf + bytes_read;
+        size_t io_size = bytes_total - bytes_read;
+        if (io_size > TIFF_IO_MAX)
+            io_size = TIFF_IO_MAX;
+        count = read(fdh.fd, buf_offset, (TIFFIOSize_t)io_size);
+        if (count <= 0)
+            break;
+    }
+    if (count < 0)
+        return (tmsize_t)-1;
+    return (tmsize_t)bytes_read;
 }
 
-static tmsize_t
-_tiffWriteProc(thandle_t fd, void* buf, tmsize_t size)
+static tmsize_t _tiffWriteProc(thandle_t fd, void *buf, tmsize_t size)
 {
-	fd_as_handle_union_t fdh;
-	const size_t bytes_total = (size_t) size;
-        size_t bytes_written;
-        tmsize_t count = -1;
-	if ((tmsize_t) bytes_total != size)
-	{
-		errno=EINVAL;
-		return (tmsize_t) -1;
-	}
-	fdh.h = fd;
-        for (bytes_written=0; bytes_written < bytes_total; bytes_written+=count)
-        {
-                const char *buf_offset = (char *) buf+bytes_written;
-                size_t io_size = bytes_total-bytes_written;
-                if (io_size > TIFF_IO_MAX)
-                        io_size = TIFF_IO_MAX;
-                count=write(fdh.fd, buf_offset, (TIFFIOSize_t) io_size);
-                if (count <= 0)
-                        break;
-        }
-        if (count < 0)
-                return (tmsize_t)-1;
-        return (tmsize_t) bytes_written;
-	/* return ((tmsize_t) write(fdh.fd, buf, bytes_total)); */
+    fd_as_handle_union_t fdh;
+    const size_t bytes_total = (size_t)size;
+    size_t bytes_written;
+    tmsize_t count = -1;
+    if ((tmsize_t)bytes_total != size)
+    {
+        errno = EINVAL;
+        return (tmsize_t)-1;
+    }
+    fdh.h = fd;
+    for (bytes_written = 0; bytes_written < bytes_total; bytes_written += count)
+    {
+        const char *buf_offset = (char *)buf + bytes_written;
+        size_t io_size = bytes_total - bytes_written;
+        if (io_size > TIFF_IO_MAX)
+            io_size = TIFF_IO_MAX;
+        count = write(fdh.fd, buf_offset, (TIFFIOSize_t)io_size);
+        if (count <= 0)
+            break;
+    }
+    if (count < 0)
+        return (tmsize_t)-1;
+    return (tmsize_t)bytes_written;
+    /* return ((tmsize_t) write(fdh.fd, buf, bytes_total)); */
 }
 
-static uint64
-_tiffSeekProc(thandle_t fd, uint64 off, int whence)
+static uint64_t _tiffSeekProc(thandle_t fd, uint64_t off, int whence)
 {
-	fd_as_handle_union_t fdh;
-	_TIFF_off_t off_io = (_TIFF_off_t) off;
-	if ((uint64) off_io != off)
-	{
-		errno=EINVAL;
-		return (uint64) -1; /* this is really gross */
-	}
-	fdh.h = fd;
-	return((uint64)_TIFF_lseek_f(fdh.fd,off_io,whence));
+    fd_as_handle_union_t fdh;
+    _TIFF_off_t off_io = (_TIFF_off_t)off;
+    if ((uint64_t)off_io != off)
+    {
+        errno = EINVAL;
+        return (uint64_t)-1; /* this is really gross */
+    }
+    fdh.h = fd;
+    return ((uint64_t)_TIFF_lseek_f(fdh.fd, off_io, whence));
 }
 
-static int
-_tiffCloseProc(thandle_t fd)
+static int _tiffCloseProc(thandle_t fd)
 {
-	fd_as_handle_union_t fdh;
-	fdh.h = fd;
-	return(close(fdh.fd));
+    fd_as_handle_union_t fdh;
+    fdh.h = fd;
+    return (close(fdh.fd));
 }
 
-static uint64
-_tiffSizeProc(thandle_t fd)
+static uint64_t _tiffSizeProc(thandle_t fd)
 {
-	_TIFF_stat_s sb;
-	fd_as_handle_union_t fdh;
-	fdh.h = fd;
-	if (_TIFF_fstat_f(fdh.fd,&sb)<0)
-		return(0);
-	else
-		return((uint64)sb.st_size);
+    _TIFF_stat_s sb;
+    fd_as_handle_union_t fdh;
+    fdh.h = fd;
+    if (_TIFF_fstat_f(fdh.fd, &sb) < 0)
+        return (0);
+    else
+        return ((uint64_t)sb.st_size);
 }
 
 #ifdef HAVE_MMAP
 #include <sys/mman.h>
 
-static int
-_tiffMapProc(thandle_t fd, void** pbase, toff_t* psize)
+static int _tiffMapProc(thandle_t fd, void **pbase, toff_t *psize)
 {
-	uint64 size64 = _tiffSizeProc(fd);
-	tmsize_t sizem = (tmsize_t)size64;
-	if (size64 && (uint64)sizem==size64) {
-		fd_as_handle_union_t fdh;
-		fdh.h = fd;
-		*pbase = (void*)
-		    mmap(0, (size_t)sizem, PROT_READ, MAP_SHARED, fdh.fd, 0);
-		if (*pbase != (void*) -1) {
-			*psize = (tmsize_t)sizem;
-			return (1);
-		}
-	}
-	return (0);
+    uint64_t size64 = _tiffSizeProc(fd);
+    tmsize_t sizem = (tmsize_t)size64;
+    if (size64 && (uint64_t)sizem == size64)
+    {
+        fd_as_handle_union_t fdh;
+        fdh.h = fd;
+        *pbase =
+            (void *)mmap(0, (size_t)sizem, PROT_READ, MAP_SHARED, fdh.fd, 0);
+        if (*pbase != (void *)-1)
+        {
+            *psize = (tmsize_t)sizem;
+            return (1);
+        }
+    }
+    return (0);
 }
 
-static void
-_tiffUnmapProc(thandle_t fd, void* base, toff_t size)
+static void _tiffUnmapProc(thandle_t fd, void *base, toff_t size)
 {
-	(void) fd;
-	(void) munmap(base, (off_t) size);
+    (void)fd;
+    (void)munmap(base, (off_t)size);
 }
-#else /* !HAVE_MMAP */
-static int
-_tiffMapProc(thandle_t fd, void** pbase, toff_t* psize)
+#else  /* !HAVE_MMAP */
+static int _tiffMapProc(thandle_t fd, void **pbase, toff_t *psize)
 {
-	(void) fd; (void) pbase; (void) psize;
-	return (0);
+    (void)fd;
+    (void)pbase;
+    (void)psize;
+    return (0);
 }
 
-static void
-_tiffUnmapProc(thandle_t fd, void* base, toff_t size)
+static void _tiffUnmapProc(thandle_t fd, void *base, toff_t size)
 {
-	(void) fd; (void) base; (void) size;
+    (void)fd;
+    (void)base;
+    (void)size;
 }
 #endif /* !HAVE_MMAP */
 
 /*
  * Open a TIFF file descriptor for read/writing.
  */
-TIFF*
-TIFFFdOpen(int fd, const char* name, const char* mode)
+TIFF *TIFFFdOpen(int fd, const char *name, const char *mode)
 {
-	TIFF* tif;
-
-	fd_as_handle_union_t fdh;
-	fdh.fd = fd;
-	tif = TIFFClientOpen(name, mode,
-	    fdh.h,
-	    _tiffReadProc, _tiffWriteProc,
-	    _tiffSeekProc, _tiffCloseProc, _tiffSizeProc,
-	    _tiffMapProc, _tiffUnmapProc);
-	if (tif)
-		tif->tif_fd = fd;
-	return (tif);
+    return TIFFFdOpenExt(fd, name, mode, NULL);
+}
+
+TIFF *TIFFFdOpenExt(int fd, const char *name, const char *mode,
+                    TIFFOpenOptions *opts)
+{
+    TIFF *tif;
+
+    fd_as_handle_union_t fdh;
+    fdh.fd = fd;
+    tif = TIFFClientOpenExt(name, mode, fdh.h, _tiffReadProc, _tiffWriteProc,
+                            _tiffSeekProc, _tiffCloseProc, _tiffSizeProc,
+                            _tiffMapProc, _tiffUnmapProc, opts);
+    if (tif)
+        tif->tif_fd = fd;
+    return (tif);
 }
 
 /*
  * Open a TIFF file for read/writing.
  */
-TIFF*
-TIFFOpen(const char* name, const char* mode)
+TIFF *TIFFOpen(const char *name, const char *mode)
 {
-	static const char module[] = "TIFFOpen";
-	int m, fd;
-	TIFF* tif;
+    return TIFFOpenExt(name, mode, NULL);
+}
 
-	m = _TIFFgetMode(mode, module);
-	if (m == -1)
-		return ((TIFF*)0);
+TIFF *TIFFOpenExt(const char *name, const char *mode, TIFFOpenOptions *opts)
+{
+    static const char module[] = "TIFFOpen";
+    int m, fd;
+    TIFF *tif;
+
+    m = _TIFFgetMode(opts, NULL, mode, module);
+    if (m == -1)
+        return ((TIFF *)0);
 
 /* for cygwin and mingw */
 #ifdef O_BINARY
-	m |= O_BINARY;
+    m |= O_BINARY;
 #endif
 
-	fd = open(name, m, 0666);
-	if (fd < 0) {
-		if (errno > 0 && strerror(errno) != NULL ) {
-			TIFFErrorExt(0, module, "%s: %s", name, strerror(errno) );
-		} else {
-			TIFFErrorExt(0, module, "%s: Cannot open", name);
-		}
-		return ((TIFF *)0);
-	}
-
-	tif = TIFFFdOpen((int)fd, name, mode);
-	if(!tif)
-		close(fd);
-	return tif;
+    fd = open(name, m, 0666);
+    if (fd < 0)
+    {
+        if (errno > 0 && strerror(errno) != NULL)
+        {
+            _TIFFErrorEarly(opts, NULL, module, "%s: %s", name,
+                            strerror(errno));
+        }
+        else
+        {
+            _TIFFErrorEarly(opts, NULL, module, "%s: Cannot open", name);
+        }
+        return ((TIFF *)0);
+    }
+
+    tif = TIFFFdOpenExt((int)fd, name, mode, opts);
+    if (!tif)
+        close(fd);
+    return tif;
 }
 
 #ifdef __WIN32__
@@ -256,129 +263,108 @@ TIFFOpen(const char* name, const char* mode)
 /*
  * Open a TIFF file with a Unicode filename, for read/writing.
  */
-TIFF*
-TIFFOpenW(const wchar_t* name, const char* mode)
+TIFF *TIFFOpenW(const wchar_t *name, const char *mode)
 {
-	static const char module[] = "TIFFOpenW";
-	int m, fd;
-	int mbsize;
-	char *mbname;
-	TIFF* tif;
+    return TIFFOpenWExt(name, mode, NULL);
+}
+TIFF *TIFFOpenWExt(const wchar_t *name, const char *mode, TIFFOpenOptions *opts)
+{
+    static const char module[] = "TIFFOpenW";
+    int m, fd;
+    int mbsize;
+    char *mbname;
+    TIFF *tif;
 
-	m = _TIFFgetMode(mode, module);
-	if (m == -1)
-		return ((TIFF*)0);
+    m = _TIFFgetMode(opts, NULL, mode, module);
+    if (m == -1)
+        return ((TIFF *)0);
 
 /* for cygwin and mingw */
 #ifdef O_BINARY
-	m |= O_BINARY;
+    m |= O_BINARY;
 #endif
 
-	fd = _wopen(name, m, 0666);
-	if (fd < 0) {
-		TIFFErrorExt(0, module, "%ls: Cannot open", name);
-		return ((TIFF *)0);
-	}
-
-	mbname = NULL;
-	mbsize = WideCharToMultiByte(CP_ACP, 0, name, -1, NULL, 0, NULL, NULL);
-	if (mbsize > 0) {
-		mbname = _TIFFmalloc(mbsize);
-		if (!mbname) {
-			TIFFErrorExt(0, module,
-			"Can't allocate space for filename conversion buffer");
-			return ((TIFF*)0);
-		}
-
-		WideCharToMultiByte(CP_ACP, 0, name, -1, mbname, mbsize,
-				    NULL, NULL);
-	}
-
-	tif = TIFFFdOpen((int)fd, (mbname != NULL) ? mbname : "<unknown>",
-			 mode);
-	
-	_TIFFfree(mbname);
-	
-	if(!tif)
-		close(fd);
-	return tif;
+    fd = _wopen(name, m, 0666);
+    if (fd < 0)
+    {
+        _TIFFErrorEarly(opts, NULL, module, "%ls: Cannot open", name);
+        return ((TIFF *)0);
+    }
+
+    mbname = NULL;
+    mbsize = WideCharToMultiByte(CP_ACP, 0, name, -1, NULL, 0, NULL, NULL);
+    if (mbsize > 0)
+    {
+        mbname = _TIFFmalloc(mbsize);
+        if (!mbname)
+        {
+            _TIFFErrorEarly(
+                opts, NULL, module,
+                "Can't allocate space for filename conversion buffer");
+            return ((TIFF *)0);
+        }
+
+        WideCharToMultiByte(CP_ACP, 0, name, -1, mbname, mbsize, NULL, NULL);
+    }
+
+    tif = TIFFFdOpenExt((int)fd, (mbname != NULL) ? mbname : "<unknown>", mode,
+                        opts);
+
+    _TIFFfree(mbname);
+
+    if (!tif)
+        close(fd);
+    return tif;
 }
 #endif
 
-void*
-_TIFFmalloc(tmsize_t s)
+void *_TIFFmalloc(tmsize_t s)
 {
-        if (s == 0)
-                return ((void *) NULL);
+    if (s == 0)
+        return ((void *)NULL);
 
-	return (malloc((size_t) s));
+    return (malloc((size_t)s));
 }
 
-void* _TIFFcalloc(tmsize_t nmemb, tmsize_t siz)
+void *_TIFFcalloc(tmsize_t nmemb, tmsize_t siz)
 {
-    if( nmemb == 0 || siz == 0 )
-        return ((void *) NULL);
+    if (nmemb == 0 || siz == 0)
+        return ((void *)NULL);
 
-    return calloc((size_t) nmemb, (size_t)siz);
+    return calloc((size_t)nmemb, (size_t)siz);
 }
 
-void
-_TIFFfree(void* p)
-{
-	free(p);
-}
+void _TIFFfree(void *p) { free(p); }
 
-void*
-_TIFFrealloc(void* p, tmsize_t s)
-{
-	return (realloc(p, (size_t) s));
-}
+void *_TIFFrealloc(void *p, tmsize_t s) { return (realloc(p, (size_t)s)); }
 
-void
-_TIFFmemset(void* p, int v, tmsize_t c)
-{
-	memset(p, v, (size_t) c);
-}
+void _TIFFmemset(void *p, int v, tmsize_t c) { memset(p, v, (size_t)c); }
 
-void
-_TIFFmemcpy(void* d, const void* s, tmsize_t c)
+void _TIFFmemcpy(void *d, const void *s, tmsize_t c)
 {
-	memcpy(d, s, (size_t) c);
+    memcpy(d, s, (size_t)c);
 }
 
-int
-_TIFFmemcmp(const void* p1, const void* p2, tmsize_t c)
+int _TIFFmemcmp(const void *p1, const void *p2, tmsize_t c)
 {
-	return (memcmp(p1, p2, (size_t) c));
+    return (memcmp(p1, p2, (size_t)c));
 }
 
-static void
-unixWarningHandler(const char* module, const char* fmt, va_list ap)
+static void unixWarningHandler(const char *module, const char *fmt, va_list ap)
 {
-	if (module != NULL)
-		fprintf(stderr, "%s: ", module);
-	fprintf(stderr, "Warning, ");
-	vfprintf(stderr, fmt, ap);
-	fprintf(stderr, ".\n");
+    if (module != NULL)
+        fprintf(stderr, "%s: ", module);
+    fprintf(stderr, "Warning, ");
+    vfprintf(stderr, fmt, ap);
+    fprintf(stderr, ".\n");
 }
 TIFFErrorHandler _TIFFwarningHandler = unixWarningHandler;
 
-static void
-unixErrorHandler(const char* module, const char* fmt, va_list ap)
+static void unixErrorHandler(const char *module, const char *fmt, va_list ap)
 {
-	if (module != NULL)
-		fprintf(stderr, "%s: ", module);
-	vfprintf(stderr, fmt, ap);
-	fprintf(stderr, ".\n");
+    if (module != NULL)
+        fprintf(stderr, "%s: ", module);
+    vfprintf(stderr, fmt, ap);
+    fprintf(stderr, ".\n");
 }
 TIFFErrorHandler _TIFFerrorHandler = unixErrorHandler;
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_version.c b/3rdparty/libtiff/tif_version.c
index 60875bbf097a..0b6c9bc00a81 100644
--- a/3rdparty/libtiff/tif_version.c
+++ b/3rdparty/libtiff/tif_version.c
@@ -2,38 +2,27 @@
  * Copyright (c) 1992-1997 Sam Leffler
  * Copyright (c) 1992-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 #include "tiffiop.h"
 
 static const char TIFFVersion[] = TIFFLIB_VERSION_STR;
 
-const char*
-TIFFGetVersion(void)
-{
-	return (TIFFVersion);
-}
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
+const char *TIFFGetVersion(void) { return (TIFFVersion); }
diff --git a/3rdparty/libtiff/tif_warning.c b/3rdparty/libtiff/tif_warning.c
index c482785c2936..5468de55f21a 100644
--- a/3rdparty/libtiff/tif_warning.c
+++ b/3rdparty/libtiff/tif_warning.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -29,59 +29,77 @@
 
 TIFFErrorHandlerExt _TIFFwarningHandlerExt = NULL;
 
-TIFFErrorHandler
-TIFFSetWarningHandler(TIFFErrorHandler handler)
+TIFFErrorHandler TIFFSetWarningHandler(TIFFErrorHandler handler)
 {
-	TIFFErrorHandler prev = _TIFFwarningHandler;
-	_TIFFwarningHandler = handler;
-	return (prev);
+    TIFFErrorHandler prev = _TIFFwarningHandler;
+    _TIFFwarningHandler = handler;
+    return (prev);
 }
 
-TIFFErrorHandlerExt
-TIFFSetWarningHandlerExt(TIFFErrorHandlerExt handler)
+TIFFErrorHandlerExt TIFFSetWarningHandlerExt(TIFFErrorHandlerExt handler)
 {
-	TIFFErrorHandlerExt prev = _TIFFwarningHandlerExt;
-	_TIFFwarningHandlerExt = handler;
-	return (prev);
+    TIFFErrorHandlerExt prev = _TIFFwarningHandlerExt;
+    _TIFFwarningHandlerExt = handler;
+    return (prev);
 }
 
-void
-TIFFWarning(const char* module, const char* fmt, ...)
+void TIFFWarning(const char *module, const char *fmt, ...)
 {
-	va_list ap;
-	if (_TIFFwarningHandler) {
-		va_start(ap, fmt);
-		(*_TIFFwarningHandler)(module, fmt, ap);
-		va_end(ap);
-	}
-	if (_TIFFwarningHandlerExt) {
-		va_start(ap, fmt);
-		(*_TIFFwarningHandlerExt)(0, module, fmt, ap);
-		va_end(ap);
-	}
+    va_list ap;
+    if (_TIFFwarningHandler)
+    {
+        va_start(ap, fmt);
+        (*_TIFFwarningHandler)(module, fmt, ap);
+        va_end(ap);
+    }
+    if (_TIFFwarningHandlerExt)
+    {
+        va_start(ap, fmt);
+        (*_TIFFwarningHandlerExt)(0, module, fmt, ap);
+        va_end(ap);
+    }
 }
 
-void
-TIFFWarningExt(thandle_t fd, const char* module, const char* fmt, ...)
+void TIFFWarningExt(thandle_t fd, const char *module, const char *fmt, ...)
 {
-	va_list ap;
-	if (_TIFFwarningHandler) {
-		va_start(ap, fmt);	
-		(*_TIFFwarningHandler)(module, fmt, ap);
-		va_end(ap);
-	}
-	if (_TIFFwarningHandlerExt) {
-		va_start(ap, fmt);
-		(*_TIFFwarningHandlerExt)(fd, module, fmt, ap);
-		va_end(ap);
-	}
+    va_list ap;
+    if (_TIFFwarningHandler)
+    {
+        va_start(ap, fmt);
+        (*_TIFFwarningHandler)(module, fmt, ap);
+        va_end(ap);
+    }
+    if (_TIFFwarningHandlerExt)
+    {
+        va_start(ap, fmt);
+        (*_TIFFwarningHandlerExt)(fd, module, fmt, ap);
+        va_end(ap);
+    }
 }
 
-
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
+void TIFFWarningExtR(TIFF *tif, const char *module, const char *fmt, ...)
+{
+    va_list ap;
+    if (tif && tif->tif_warnhandler)
+    {
+        va_start(ap, fmt);
+        int stop = (*tif->tif_warnhandler)(tif, tif->tif_warnhandler_user_data,
+                                           module, fmt, ap);
+        va_end(ap);
+        if (stop)
+            return;
+    }
+    if (_TIFFwarningHandler)
+    {
+        va_start(ap, fmt);
+        (*_TIFFwarningHandler)(module, fmt, ap);
+        va_end(ap);
+    }
+    if (_TIFFwarningHandlerExt)
+    {
+        va_start(ap, fmt);
+        (*_TIFFwarningHandlerExt)(tif ? tif->tif_clientdata : 0, module, fmt,
+                                  ap);
+        va_end(ap);
+    }
+}
diff --git a/3rdparty/libtiff/tif_webp.c b/3rdparty/libtiff/tif_webp.c
index a00478f6b9dc..bf9d77eb9beb 100644
--- a/3rdparty/libtiff/tif_webp.c
+++ b/3rdparty/libtiff/tif_webp.c
@@ -1,26 +1,26 @@
 /*
-* Copyright (c) 2018, Mapbox
-* Author: <norman.barker at mapbox.com>
-*
-* Permission to use, copy, modify, distribute, and sell this software and
-* its documentation for any purpose is hereby granted without fee, provided
-* that (i) the above copyright notices and this permission notice appear in
-* all copies of the software and related documentation, and (ii) the names of
-* Sam Leffler and Silicon Graphics may not be used in any advertising or
-* publicity relating to the software without the specific, prior written
-* permission of Sam Leffler and Silicon Graphics.
-*
-* THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
-* WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
-*
-* IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
-* ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
-* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
-* WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
-* LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
-* OF THIS SOFTWARE.
-*/
+ * Copyright (c) 2018, Mapbox
+ * Author: <norman.barker at mapbox.com>
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and
+ * its documentation for any purpose is hereby granted without fee, provided
+ * that (i) the above copyright notices and this permission notice appear in
+ * all copies of the software and related documentation, and (ii) the names of
+ * Sam Leffler and Silicon Graphics may not be used in any advertising or
+ * publicity relating to the software without the specific, prior written
+ * permission of Sam Leffler and Silicon Graphics.
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
+ * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
+ * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
 
 #include "tiffiop.h"
 #ifdef WEBP_SUPPORT
@@ -34,6 +34,7 @@
 #include "webp/decode.h"
 #include "webp/encode.h"
 
+#include <stdbool.h>
 #include <stdio.h>
 
 #define LSTATE_INIT_DECODE 0x01
@@ -42,661 +43,839 @@
  * State block for each open TIFF
  * file using WEBP compression/decompression.
  */
-typedef struct {
-  uint16           nSamples;               /* number of samples per pixel */
-  
-  int              lossless;               /* lossy/lossless compression */
-  int              quality_level;          /* compression level */
-  WebPPicture      sPicture;               /* WebP Picture */
-  WebPConfig       sEncoderConfig;         /* WebP encoder config */
-  uint8*           pBuffer;                /* buffer to hold raw data on encoding */
-  unsigned int     buffer_offset;          /* current offset into the buffer */
-  unsigned int     buffer_size;
-  
-  WebPIDecoder*    psDecoder;              /* WebPIDecoder */
-  WebPDecBuffer    sDecBuffer;             /* Decoder buffer */
-  int              last_y;                 /* Last row decoded */
-  
-  int              state;                  /* state flags */
-  
-	TIFFVGetMethod   vgetparent;             /* super-class method */
-	TIFFVSetMethod   vsetparent;             /* super-class method */
+typedef struct
+{
+    uint16_t nSamples; /* number of samples per pixel */
+
+    int lossless;         /* lossy/lossless compression */
+    int lossless_exact;   /* lossless exact mode. If TRUE, R,G,B values in areas
+                             with alpha = 0 will be preserved */
+    int quality_level;    /* compression level */
+    WebPPicture sPicture; /* WebP Picture */
+    WebPConfig sEncoderConfig;  /* WebP encoder config */
+    uint8_t *pBuffer;           /* buffer to hold raw data on encoding */
+    unsigned int buffer_offset; /* current offset into the buffer */
+    unsigned int buffer_size;
+
+    WebPIDecoder *psDecoder;  /* WebPIDecoder */
+    WebPDecBuffer sDecBuffer; /* Decoder buffer */
+    int last_y;               /* Last row decoded */
+
+    int state; /* state flags */
+
+    TIFFVGetMethod vgetparent; /* super-class method */
+    TIFFVSetMethod vsetparent; /* super-class method */
 } WebPState;
 
-#define LState(tif)            ((WebPState*) (tif)->tif_data)
-#define DecoderState(tif)       LState(tif)
-#define EncoderState(tif)       LState(tif)
+#define LState(tif) ((WebPState *)(tif)->tif_data)
+#define DecoderState(tif) LState(tif)
+#define EncoderState(tif) LState(tif)
 
-static int TWebPEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s);
-static int TWebPDecode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s);
+static int TWebPEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s);
+static int TWebPDecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s);
 
-static
-int TWebPDatasetWriter(const uint8_t* data, size_t data_size,
-                      const WebPPicture* const picture)
+static int TWebPDatasetWriter(const uint8_t *data, size_t data_size,
+                              const WebPPicture *const picture)
 {
-  static const char module[] = "TWebPDatasetWriter";
-  TIFF* tif = (TIFF*)(picture->custom_ptr);
-  
-  if ( (tif->tif_rawcc + (tmsize_t)data_size) > tif->tif_rawdatasize ) {
-    TIFFErrorExt(tif->tif_clientdata, module,
-                 "Buffer too small by " TIFF_SIZE_FORMAT " bytes.",
-                 (size_t) (tif->tif_rawcc + data_size - tif->tif_rawdatasize));
-    return 0;
-  } else {
-    _TIFFmemcpy(tif->tif_rawcp, data, data_size);
-    tif->tif_rawcc += data_size;
-    tif->tif_rawcp += data_size;
-    return 1;    
-  }
+    static const char module[] = "TWebPDatasetWriter";
+    TIFF *tif = (TIFF *)(picture->custom_ptr);
+
+    if ((tif->tif_rawcc + (tmsize_t)data_size) > tif->tif_rawdatasize)
+    {
+        TIFFErrorExtR(
+            tif, module, "Buffer too small by %" TIFF_SIZE_FORMAT " bytes.",
+            (size_t)(tif->tif_rawcc + data_size - tif->tif_rawdatasize));
+        return 0;
+    }
+    else
+    {
+        _TIFFmemcpy(tif->tif_rawcp, data, data_size);
+        tif->tif_rawcc += data_size;
+        tif->tif_rawcp += data_size;
+        return 1;
+    }
 }
 
 /*
  * Encode a chunk of pixels.
  */
-static int
-TWebPEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int TWebPEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-  static const char module[] = "TWebPEncode";
-  WebPState *sp = EncoderState(tif);
-  (void) s;
-
-  assert(sp != NULL);
-  assert(sp->state == LSTATE_INIT_ENCODE);
-    
-  if( (uint64)sp->buffer_offset +
-                            (uint64)cc > sp->buffer_size )
-  {
-      TIFFErrorExt(tif->tif_clientdata, module,
-                   "Too many bytes to be written");
-      return 0;
-  }
-
-  memcpy(sp->pBuffer + sp->buffer_offset,
-         bp, cc);
-  sp->buffer_offset += (unsigned)cc;
-
-  return 1;
-  
+    static const char module[] = "TWebPEncode";
+    WebPState *sp = EncoderState(tif);
+    (void)s;
+
+    assert(sp != NULL);
+    assert(sp->state == LSTATE_INIT_ENCODE);
+
+    if ((uint64_t)sp->buffer_offset + (uint64_t)cc > sp->buffer_size)
+    {
+        TIFFErrorExtR(tif, module, "Too many bytes to be written");
+        return 0;
+    }
+
+    memcpy(sp->pBuffer + sp->buffer_offset, bp, cc);
+    sp->buffer_offset += (unsigned)cc;
+
+    return 1;
 }
 
-static int
-TWebPDecode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s)
+static int TWebPDecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s)
 {
-  static const char module[] = "WebPDecode";
-  VP8StatusCode status = VP8_STATUS_OK;
-  WebPState *sp = DecoderState(tif);
-  (void) s;  
-
-  assert(sp != NULL);
-  assert(sp->state == LSTATE_INIT_DECODE);
-  
-  if (occ % sp->sDecBuffer.u.RGBA.stride)
-  {
-    TIFFErrorExt(tif->tif_clientdata, module,
-                 "Fractional scanlines cannot be read");
-    return 0;
-  }
+    static const char module[] = "WebPDecode";
+    VP8StatusCode status = VP8_STATUS_OK;
+    WebPState *sp = DecoderState(tif);
+    uint32_t segment_width, segment_height;
+    bool decode_whole_strile = false;
+
+    (void)s;
+
+    assert(sp != NULL);
+    assert(sp->state == LSTATE_INIT_DECODE);
+
+    if (sp->psDecoder == NULL)
+    {
+        TIFFDirectory *td = &tif->tif_dir;
+        uint32_t buffer_size;
+
+        if (isTiled(tif))
+        {
+            segment_width = td->td_tilewidth;
+            segment_height = td->td_tilelength;
+        }
+        else
+        {
+            segment_width = td->td_imagewidth;
+            segment_height = td->td_imagelength - tif->tif_row;
+            if (segment_height > td->td_rowsperstrip)
+                segment_height = td->td_rowsperstrip;
+        }
+
+        int webp_width, webp_height;
+        if (!WebPGetInfo(tif->tif_rawcp,
+                         (uint64_t)tif->tif_rawcc > UINT32_MAX
+                             ? UINT32_MAX
+                             : (uint32_t)tif->tif_rawcc,
+                         &webp_width, &webp_height))
+        {
+            TIFFErrorExtR(tif, module, "WebPGetInfo() failed");
+            return 0;
+        }
+        if ((uint32_t)webp_width != segment_width ||
+            (uint32_t)webp_height != segment_height)
+        {
+            TIFFErrorExtR(
+                tif, module, "WebP blob dimension is %dx%d. Expected %ux%u",
+                webp_width, webp_height, segment_width, segment_height);
+            return 0;
+        }
+
+#if WEBP_DECODER_ABI_VERSION >= 0x0002
+        WebPDecoderConfig config;
+        if (!WebPInitDecoderConfig(&config))
+        {
+            TIFFErrorExtR(tif, module, "WebPInitDecoderConfig() failed");
+            return 0;
+        }
+
+        const bool bWebPGetFeaturesOK =
+            WebPGetFeatures(tif->tif_rawcp,
+                            (uint64_t)tif->tif_rawcc > UINT32_MAX
+                                ? UINT32_MAX
+                                : (uint32_t)tif->tif_rawcc,
+                            &config.input) == VP8_STATUS_OK;
+
+        WebPFreeDecBuffer(&config.output);
+
+        if (!bWebPGetFeaturesOK)
+        {
+            TIFFErrorExtR(tif, module, "WebPInitDecoderConfig() failed");
+            return 0;
+        }
+
+        const int webp_bands = config.input.has_alpha ? 4 : 3;
+        if (webp_bands != sp->nSamples &&
+            /* We accept the situation where the WebP blob has only 3 bands,
+             * whereas the raster is 4 bands. This can happen when the alpha
+             * channel is fully opaque, and WebP decoding works fine in that
+             * situation.
+             */
+            !(webp_bands == 3 && sp->nSamples == 4))
+        {
+            TIFFErrorExtR(tif, module,
+                          "WebP blob band count is %d. Expected %d", webp_bands,
+                          sp->nSamples);
+            return 0;
+        }
+#endif
 
-  status = WebPIAppend(sp->psDecoder, tif->tif_rawcp, tif->tif_rawcc);
+        buffer_size = segment_width * segment_height * sp->nSamples;
+        if (occ == (tmsize_t)buffer_size)
+        {
+            /* If decoding the whole strip/tile, we can directly use the */
+            /* output buffer */
+            decode_whole_strile = true;
+        }
+        else if (sp->pBuffer == NULL || buffer_size > sp->buffer_size)
+        {
+            if (sp->pBuffer != NULL)
+            {
+                _TIFFfreeExt(tif, sp->pBuffer);
+                sp->pBuffer = NULL;
+            }
+
+            sp->pBuffer = _TIFFmallocExt(tif, buffer_size);
+            if (!sp->pBuffer)
+            {
+                TIFFErrorExtR(tif, module, "Cannot allocate buffer");
+                return 0;
+            }
+            sp->buffer_size = buffer_size;
+        }
+
+        sp->last_y = 0;
+
+        WebPInitDecBuffer(&sp->sDecBuffer);
+
+        sp->sDecBuffer.is_external_memory = 1;
+        sp->sDecBuffer.width = segment_width;
+        sp->sDecBuffer.height = segment_height;
+        sp->sDecBuffer.u.RGBA.rgba = decode_whole_strile ? op : sp->pBuffer;
+        sp->sDecBuffer.u.RGBA.stride = segment_width * sp->nSamples;
+        sp->sDecBuffer.u.RGBA.size = buffer_size;
+
+        if (sp->nSamples > 3)
+        {
+            sp->sDecBuffer.colorspace = MODE_RGBA;
+        }
+        else
+        {
+            sp->sDecBuffer.colorspace = MODE_RGB;
+        }
+
+        sp->psDecoder = WebPINewDecoder(&sp->sDecBuffer);
+
+        if (sp->psDecoder == NULL)
+        {
+            TIFFErrorExtR(tif, module, "Unable to allocate WebP decoder.");
+            return 0;
+        }
+    }
 
-  if (status != VP8_STATUS_OK && status != VP8_STATUS_SUSPENDED) {
-    if (status == VP8_STATUS_INVALID_PARAM) {
-       TIFFErrorExt(tif->tif_clientdata, module,
-         "Invalid parameter used.");      
-    } else if (status == VP8_STATUS_OUT_OF_MEMORY) {
-      TIFFErrorExt(tif->tif_clientdata, module,
-        "Out of memory.");         
-    } else {
-      TIFFErrorExt(tif->tif_clientdata, module,
-        "Unrecognized error.");   
+    if (occ % sp->sDecBuffer.u.RGBA.stride)
+    {
+        TIFFErrorExtR(tif, module, "Fractional scanlines cannot be read");
+        return 0;
+    }
+
+    status = WebPIAppend(sp->psDecoder, tif->tif_rawcp, tif->tif_rawcc);
+
+    if (status != VP8_STATUS_OK && status != VP8_STATUS_SUSPENDED)
+    {
+        if (status == VP8_STATUS_INVALID_PARAM)
+        {
+            TIFFErrorExtR(tif, module, "Invalid parameter used.");
+        }
+        else if (status == VP8_STATUS_OUT_OF_MEMORY)
+        {
+            TIFFErrorExtR(tif, module, "Out of memory.");
+        }
+        else
+        {
+            TIFFErrorExtR(tif, module, "Unrecognized error.");
+        }
+        return 0;
+    }
+    else
+    {
+        int current_y, stride;
+        uint8_t *buf;
+
+        /* Returns the RGB/A image decoded so far */
+        buf = WebPIDecGetRGB(sp->psDecoder, &current_y, NULL, NULL, &stride);
+
+        if ((buf != NULL) &&
+            (occ <= (tmsize_t)stride * (current_y - sp->last_y)))
+        {
+            const int numberOfExpectedLines =
+                (int)(occ / sp->sDecBuffer.u.RGBA.stride);
+            if (decode_whole_strile)
+            {
+                if (current_y != numberOfExpectedLines)
+                {
+                    TIFFErrorExtR(tif, module,
+                                  "Unable to decode WebP data: less lines than "
+                                  "expected.");
+                    return 0;
+                }
+            }
+            else
+            {
+                memcpy(op, buf + (sp->last_y * stride), occ);
+            }
+
+            tif->tif_rawcp += tif->tif_rawcc;
+            tif->tif_rawcc = 0;
+            sp->last_y += numberOfExpectedLines;
+
+            if (decode_whole_strile)
+            {
+                /* We can now free the decoder as we're completely done */
+                if (sp->psDecoder != NULL)
+                {
+                    WebPIDelete(sp->psDecoder);
+                    WebPFreeDecBuffer(&sp->sDecBuffer);
+                    sp->psDecoder = NULL;
+                }
+            }
+            return 1;
+        }
+        else
+        {
+            TIFFErrorExtR(tif, module, "Unable to decode WebP data.");
+            return 0;
+        }
     }
-    return 0;
-  } else {
-    int current_y, stride;
-    uint8_t* buf;
-
-    /* Returns the RGB/A image decoded so far */
-    buf = WebPIDecGetRGB(sp->psDecoder, &current_y, NULL, NULL, &stride);
-    
-    if ((buf != NULL) &&
-        (occ <= stride * (current_y - sp->last_y))) {
-      memcpy(op,   
-         buf + (sp->last_y * stride),
-         occ);
-
-      tif->tif_rawcp += tif->tif_rawcc;
-      tif->tif_rawcc = 0;
-      sp->last_y += occ / sp->sDecBuffer.u.RGBA.stride;
-      return 1;
-    } else {
-      TIFFErrorExt(tif->tif_clientdata, module, "Unable to decode WebP data."); 
-      return 0;
-    }
-  }
 }
 
-static int
-TWebPFixupTags(TIFF* tif)
+static int TWebPFixupTags(TIFF *tif)
 {
-  (void) tif;
-  if (tif->tif_dir.td_planarconfig != PLANARCONFIG_CONTIG) {
-    static const char module[] = "TWebPFixupTags";
-    TIFFErrorExt(tif->tif_clientdata, module,
-      "TIFF WEBP requires data to be stored contiguously in RGB e.g. RGBRGBRGB "
+    (void)tif;
+    if (tif->tif_dir.td_planarconfig != PLANARCONFIG_CONTIG)
+    {
+        static const char module[] = "TWebPFixupTags";
+        TIFFErrorExtR(tif, module,
+                      "TIFF WEBP requires data to be stored contiguously in "
+                      "RGB e.g. RGBRGBRGB "
 #if WEBP_ENCODER_ABI_VERSION >= 0x0100
-      "or RGBARGBARGBA"
+                      "or RGBARGBARGBA"
 #endif
-    );
-    return 0;
-  }
-  return 1;
+        );
+        return 0;
+    }
+    return 1;
 }
 
-static int
-TWebPSetupDecode(TIFF* tif)
+static int TWebPSetupDecode(TIFF *tif)
 {
-  static const char module[] = "WebPSetupDecode";
-  uint16 nBitsPerSample = tif->tif_dir.td_bitspersample;
-  uint16 sampleFormat = tif->tif_dir.td_sampleformat;
+    static const char module[] = "WebPSetupDecode";
+    uint16_t nBitsPerSample = tif->tif_dir.td_bitspersample;
+    uint16_t sampleFormat = tif->tif_dir.td_sampleformat;
 
-  WebPState* sp = DecoderState(tif);
-  assert(sp != NULL);
+    WebPState *sp = DecoderState(tif);
+    assert(sp != NULL);
 
-  sp->nSamples = tif->tif_dir.td_samplesperpixel;
+    sp->nSamples = tif->tif_dir.td_samplesperpixel;
 
-  /* check band count */
-  if ( sp->nSamples != 3
+    /* check band count */
+    if (sp->nSamples != 3
 #if WEBP_ENCODER_ABI_VERSION >= 0x0100
-    && sp->nSamples != 4
+        && sp->nSamples != 4
 #endif
-  )
-  {
-    TIFFErrorExt(tif->tif_clientdata, module,
-      "WEBP driver doesn't support %d bands. Must be 3 (RGB) "
-  #if WEBP_ENCODER_ABI_VERSION >= 0x0100
-      "or 4 (RGBA) "
-  #endif
-    "bands.",
-    sp->nSamples );
-    return 0;
-  }
+    )
+    {
+        TIFFErrorExtR(tif, module,
+                      "WEBP driver doesn't support %d bands. Must be 3 (RGB) "
+#if WEBP_ENCODER_ABI_VERSION >= 0x0100
+                      "or 4 (RGBA) "
+#endif
+                      "bands.",
+                      sp->nSamples);
+        return 0;
+    }
 
-  /* check bits per sample and data type */
-  if ((nBitsPerSample != 8) && (sampleFormat != 1)) {
-    TIFFErrorExt(tif->tif_clientdata, module,
-                "WEBP driver requires 8 bit unsigned data");
-    return 0;
-  }
-  
-  /* if we were last encoding, terminate this mode */
-  if (sp->state & LSTATE_INIT_ENCODE) {
-      WebPPictureFree(&sp->sPicture);
-      if (sp->pBuffer != NULL) {
-        _TIFFfree(sp->pBuffer);
-        sp->pBuffer = NULL;
-      }
-      sp->buffer_offset = 0;
-      sp->state = 0;
-  }
+    /* check bits per sample and data type */
+    if ((nBitsPerSample != 8) && (sampleFormat != 1))
+    {
+        TIFFErrorExtR(tif, module, "WEBP driver requires 8 bit unsigned data");
+        return 0;
+    }
+
+    /* if we were last encoding, terminate this mode */
+    if (sp->state & LSTATE_INIT_ENCODE)
+    {
+        WebPPictureFree(&sp->sPicture);
+        if (sp->pBuffer != NULL)
+        {
+            _TIFFfreeExt(tif, sp->pBuffer);
+            sp->pBuffer = NULL;
+        }
+        sp->buffer_offset = 0;
+        sp->state = 0;
+    }
 
-  sp->state |= LSTATE_INIT_DECODE;
+    sp->state |= LSTATE_INIT_DECODE;
 
-  return 1;
+    return 1;
 }
 
 /*
-* Setup state for decoding a strip.
-*/
-static int
-TWebPPreDecode(TIFF* tif, uint16 s)
+ * Setup state for decoding a strip.
+ */
+static int TWebPPreDecode(TIFF *tif, uint16_t s)
 {
-  static const char module[] = "TWebPPreDecode";
-  uint32 segment_width, segment_height;
-  WebPState* sp = DecoderState(tif);
-  TIFFDirectory* td = &tif->tif_dir;
-  (void) s;
-  assert(sp != NULL);
-  
-  if (isTiled(tif)) {
-    segment_width = td->td_tilewidth;
-    segment_height = td->td_tilelength;
-  } else {
-    segment_width = td->td_imagewidth;
-    segment_height = td->td_imagelength - tif->tif_row;
-    if (segment_height > td->td_rowsperstrip)
-      segment_height = td->td_rowsperstrip;
-  }
-
-  if( segment_width > 16383 || segment_height > 16383 ) {
-      TIFFErrorExt(tif->tif_clientdata, module,
-                   "WEBP maximum image dimensions are 16383 x 16383.");
-      return 0;
-  }
-
-  if( (sp->state & LSTATE_INIT_DECODE) == 0 )
-      tif->tif_setupdecode(tif);
-      
-  if (sp->psDecoder != NULL) {
-    WebPIDelete(sp->psDecoder);
-    WebPFreeDecBuffer(&sp->sDecBuffer);
-    sp->psDecoder = NULL;
-  }
-
-  sp->last_y = 0;
-  
-  WebPInitDecBuffer(&sp->sDecBuffer);
-  
-  sp->sDecBuffer.is_external_memory = 0;
-  sp->sDecBuffer.width = segment_width;
-  sp->sDecBuffer.height = segment_height;
-  sp->sDecBuffer.u.RGBA.stride = segment_width * sp->nSamples;
-  sp->sDecBuffer.u.RGBA.size = segment_width * sp->nSamples * segment_height;
-  
-  if (sp->nSamples > 3) {
-    sp->sDecBuffer.colorspace = MODE_RGBA;
-  } else {
-    sp->sDecBuffer.colorspace = MODE_RGB;
-  }
-  
-  sp->psDecoder = WebPINewDecoder(&sp->sDecBuffer);
-  
-  if (sp->psDecoder == NULL) {
-    TIFFErrorExt(tif->tif_clientdata, module,
-                "Unable to allocate WebP decoder.");
-    return 0;
-  }
-  
-  return 1;
+    static const char module[] = "TWebPPreDecode";
+    uint32_t segment_width, segment_height;
+    WebPState *sp = DecoderState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
+    (void)s;
+    assert(sp != NULL);
+
+    if (isTiled(tif))
+    {
+        segment_width = td->td_tilewidth;
+        segment_height = td->td_tilelength;
+    }
+    else
+    {
+        segment_width = td->td_imagewidth;
+        segment_height = td->td_imagelength - tif->tif_row;
+        if (segment_height > td->td_rowsperstrip)
+            segment_height = td->td_rowsperstrip;
+    }
+
+    if (segment_width > 16383 || segment_height > 16383)
+    {
+        TIFFErrorExtR(tif, module,
+                      "WEBP maximum image dimensions are 16383 x 16383.");
+        return 0;
+    }
+
+    if ((sp->state & LSTATE_INIT_DECODE) == 0)
+        tif->tif_setupdecode(tif);
+
+    if (sp->psDecoder != NULL)
+    {
+        WebPIDelete(sp->psDecoder);
+        WebPFreeDecBuffer(&sp->sDecBuffer);
+        sp->psDecoder = NULL;
+    }
+
+    return 1;
 }
 
-static int
-TWebPSetupEncode(TIFF* tif)
+static int TWebPSetupEncode(TIFF *tif)
 {
-  static const char module[] = "WebPSetupEncode";
-  uint16 nBitsPerSample = tif->tif_dir.td_bitspersample;
-  uint16 sampleFormat = tif->tif_dir.td_sampleformat;
-  
-  WebPState* sp = EncoderState(tif);
-  assert(sp != NULL);
+    static const char module[] = "WebPSetupEncode";
+    uint16_t nBitsPerSample = tif->tif_dir.td_bitspersample;
+    uint16_t sampleFormat = tif->tif_dir.td_sampleformat;
+
+    WebPState *sp = EncoderState(tif);
+    assert(sp != NULL);
 
-  sp->nSamples = tif->tif_dir.td_samplesperpixel;
+    sp->nSamples = tif->tif_dir.td_samplesperpixel;
 
-  /* check band count */
-  if ( sp->nSamples != 3
+    /* check band count */
+    if (sp->nSamples != 3
 #if WEBP_ENCODER_ABI_VERSION >= 0x0100
-    && sp->nSamples != 4
+        && sp->nSamples != 4
 #endif
-  )
-  {
-    TIFFErrorExt(tif->tif_clientdata, module,
-      "WEBP driver doesn't support %d bands. Must be 3 (RGB) "
+    )
+    {
+        TIFFErrorExtR(tif, module,
+                      "WEBP driver doesn't support %d bands. Must be 3 (RGB) "
 #if WEBP_ENCODER_ABI_VERSION >= 0x0100
-      "or 4 (RGBA) "
+                      "or 4 (RGBA) "
 #endif
-    "bands.",
-    sp->nSamples );
-    return 0;
-  }
-  
-  /* check bits per sample and data type */
-  if ((nBitsPerSample != 8) || (sampleFormat != SAMPLEFORMAT_UINT)) {
-    TIFFErrorExt(tif->tif_clientdata, module,
-                "WEBP driver requires 8 bit unsigned data");
-    return 0;
-  }
-  
-  if (sp->state & LSTATE_INIT_DECODE) {
-    WebPIDelete(sp->psDecoder);
-    WebPFreeDecBuffer(&sp->sDecBuffer);
-    sp->psDecoder = NULL;
-    sp->last_y = 0;
-    sp->state = 0;
-  }
+                      "bands.",
+                      sp->nSamples);
+        return 0;
+    }
 
-  sp->state |= LSTATE_INIT_ENCODE;
+    /* check bits per sample and data type */
+    if ((nBitsPerSample != 8) || (sampleFormat != SAMPLEFORMAT_UINT))
+    {
+        TIFFErrorExtR(tif, module, "WEBP driver requires 8 bit unsigned data");
+        return 0;
+    }
 
-  if (!WebPPictureInit(&sp->sPicture)) {
-    TIFFErrorExt(tif->tif_clientdata, module,
-        "Error initializing WebP picture.");
-    return 0;
-  }
+    if (sp->state & LSTATE_INIT_DECODE)
+    {
+        WebPIDelete(sp->psDecoder);
+        WebPFreeDecBuffer(&sp->sDecBuffer);
+        sp->psDecoder = NULL;
+        sp->last_y = 0;
+        sp->state = 0;
+    }
 
-  if (!WebPConfigInitInternal(&sp->sEncoderConfig, WEBP_PRESET_DEFAULT,
-                              (float)sp->quality_level,
-                              WEBP_ENCODER_ABI_VERSION)) {
-    TIFFErrorExt(tif->tif_clientdata, module,
-      "Error creating WebP encoder configuration.");
-    return 0;
-  }
+    sp->state |= LSTATE_INIT_ENCODE;
+
+    if (!WebPPictureInit(&sp->sPicture))
+    {
+        TIFFErrorExtR(tif, module, "Error initializing WebP picture.");
+        return 0;
+    }
 
-  // WebPConfigInitInternal above sets lossless to false
-  #if WEBP_ENCODER_ABI_VERSION >= 0x0100
+    if (!WebPConfigInitInternal(&sp->sEncoderConfig, WEBP_PRESET_DEFAULT,
+                                (float)sp->quality_level,
+                                WEBP_ENCODER_ABI_VERSION))
+    {
+        TIFFErrorExtR(tif, module,
+                      "Error creating WebP encoder configuration.");
+        return 0;
+    }
+
+// WebPConfigInitInternal above sets lossless to false
+#if WEBP_ENCODER_ABI_VERSION >= 0x0100
     sp->sEncoderConfig.lossless = sp->lossless;
-    if (sp->lossless) {
-      sp->sPicture.use_argb = 1;
+    if (sp->lossless)
+    {
+        sp->sPicture.use_argb = 1;
+#if WEBP_ENCODER_ABI_VERSION >= 0x0209
+        sp->sEncoderConfig.exact = sp->lossless_exact;
+#endif
     }
-  #endif
+#endif
 
-  if (!WebPValidateConfig(&sp->sEncoderConfig)) {
-    TIFFErrorExt(tif->tif_clientdata, module,
-      "Error with WebP encoder configuration.");
-    return 0;
-  }
+    if (!WebPValidateConfig(&sp->sEncoderConfig))
+    {
+        TIFFErrorExtR(tif, module, "Error with WebP encoder configuration.");
+        return 0;
+    }
 
-  return 1;
+    return 1;
 }
 
 /*
-* Reset encoding state at the start of a strip.
-*/
-static int
-TWebPPreEncode(TIFF* tif, uint16 s)
+ * Reset encoding state at the start of a strip.
+ */
+static int TWebPPreEncode(TIFF *tif, uint16_t s)
 {
-  static const char module[] = "TWebPPreEncode";
-  uint32 segment_width, segment_height;
-  WebPState *sp = EncoderState(tif);
-  TIFFDirectory* td = &tif->tif_dir;
-
-  (void) s;
-
-  assert(sp != NULL);
-  if( sp->state != LSTATE_INIT_ENCODE )
-    tif->tif_setupencode(tif);
-
-  /*
-   * Set encoding parameters for this strip/tile.
-   */
-  if (isTiled(tif)) {
-    segment_width = td->td_tilewidth;
-    segment_height = td->td_tilelength;
-  } else {
-    segment_width = td->td_imagewidth;
-    segment_height = td->td_imagelength - tif->tif_row;
-    if (segment_height > td->td_rowsperstrip)
-      segment_height = td->td_rowsperstrip;
-  }
-
-  if( segment_width > 16383 || segment_height > 16383 ) {
-      TIFFErrorExt(tif->tif_clientdata, module, 
-                   "WEBP maximum image dimensions are 16383 x 16383.");
-      return 0;
-  }
-
-  /* set up buffer for raw data */
-  /* given above check and that nSamples <= 4, buffer_size is <= 1 GB */
-  sp->buffer_size = segment_width * segment_height * sp->nSamples;
-  
-  if (sp->pBuffer != NULL) {
-      _TIFFfree(sp->pBuffer);
-      sp->pBuffer = NULL;    
-  }
-  
-  sp->pBuffer = _TIFFmalloc(sp->buffer_size);
-  if( !sp->pBuffer) {
-      TIFFErrorExt(tif->tif_clientdata, module, "Cannot allocate buffer");
-      return 0;
-  }
-  sp->buffer_offset = 0;
-
-  sp->sPicture.width = segment_width;
-  sp->sPicture.height = segment_height;
-  sp->sPicture.writer = TWebPDatasetWriter;
-  sp->sPicture.custom_ptr = tif;
-
-  return 1;
+    static const char module[] = "TWebPPreEncode";
+    uint32_t segment_width, segment_height;
+    WebPState *sp = EncoderState(tif);
+    TIFFDirectory *td = &tif->tif_dir;
+
+    (void)s;
+
+    assert(sp != NULL);
+    if (sp->state != LSTATE_INIT_ENCODE)
+        tif->tif_setupencode(tif);
+
+    /*
+     * Set encoding parameters for this strip/tile.
+     */
+    if (isTiled(tif))
+    {
+        segment_width = td->td_tilewidth;
+        segment_height = td->td_tilelength;
+    }
+    else
+    {
+        segment_width = td->td_imagewidth;
+        segment_height = td->td_imagelength - tif->tif_row;
+        if (segment_height > td->td_rowsperstrip)
+            segment_height = td->td_rowsperstrip;
+    }
+
+    if (segment_width > 16383 || segment_height > 16383)
+    {
+        TIFFErrorExtR(tif, module,
+                      "WEBP maximum image dimensions are 16383 x 16383.");
+        return 0;
+    }
+
+    /* set up buffer for raw data */
+    /* given above check and that nSamples <= 4, buffer_size is <= 1 GB */
+    sp->buffer_size = segment_width * segment_height * sp->nSamples;
+
+    if (sp->pBuffer != NULL)
+    {
+        _TIFFfreeExt(tif, sp->pBuffer);
+        sp->pBuffer = NULL;
+    }
+
+    sp->pBuffer = _TIFFmallocExt(tif, sp->buffer_size);
+    if (!sp->pBuffer)
+    {
+        TIFFErrorExtR(tif, module, "Cannot allocate buffer");
+        return 0;
+    }
+    sp->buffer_offset = 0;
+
+    sp->sPicture.width = segment_width;
+    sp->sPicture.height = segment_height;
+    sp->sPicture.writer = TWebPDatasetWriter;
+    sp->sPicture.custom_ptr = tif;
+
+    return 1;
 }
 
 /*
-* Finish off an encoded strip by flushing it.
-*/
-static int
-TWebPPostEncode(TIFF* tif)
+ * Finish off an encoded strip by flushing it.
+ */
+static int TWebPPostEncode(TIFF *tif)
 {
-  static const char module[] = "WebPPostEncode";
-  int64_t stride;
-  WebPState *sp = EncoderState(tif);
-  assert(sp != NULL);
+    static const char module[] = "WebPPostEncode";
+    int64_t stride;
+    WebPState *sp = EncoderState(tif);
+    assert(sp != NULL);
 
-  assert(sp->state == LSTATE_INIT_ENCODE);
+    assert(sp->state == LSTATE_INIT_ENCODE);
 
-  stride = (int64_t)sp->sPicture.width * sp->nSamples;
+    stride = (int64_t)sp->sPicture.width * sp->nSamples;
 
 #if WEBP_ENCODER_ABI_VERSION >= 0x0100
-  if (sp->nSamples == 4) {
-      if (!WebPPictureImportRGBA(&sp->sPicture, sp->pBuffer, (int)stride)) {
-          TIFFErrorExt(tif->tif_clientdata, module,
-                    "WebPPictureImportRGBA() failed" );
-          return 0;
-      }
-  }
-  else
+    if (sp->nSamples == 4)
+    {
+        if (!WebPPictureImportRGBA(&sp->sPicture, sp->pBuffer, (int)stride))
+        {
+            TIFFErrorExtR(tif, module, "WebPPictureImportRGBA() failed");
+            return 0;
+        }
+    }
+    else
 #endif
-  if (!WebPPictureImportRGB(&sp->sPicture, sp->pBuffer, (int)stride)) {
-      TIFFErrorExt(tif->tif_clientdata, module,
-                    "WebPPictureImportRGB() failed");
-      return 0;
-  }
-  
-  if (!WebPEncode(&sp->sEncoderConfig, &sp->sPicture)) {
+        if (!WebPPictureImportRGB(&sp->sPicture, sp->pBuffer, (int)stride))
+    {
+        TIFFErrorExtR(tif, module, "WebPPictureImportRGB() failed");
+        return 0;
+    }
+
+    if (!WebPEncode(&sp->sEncoderConfig, &sp->sPicture))
+    {
 
 #if WEBP_ENCODER_ABI_VERSION >= 0x0100
-    const char* pszErrorMsg = NULL;
-    switch(sp->sPicture.error_code) {
-    case VP8_ENC_ERROR_OUT_OF_MEMORY:
-        pszErrorMsg = "Out of memory"; break;
-    case VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY:
-        pszErrorMsg = "Out of memory while flushing bits"; break;
-    case VP8_ENC_ERROR_NULL_PARAMETER:
-        pszErrorMsg = "A pointer parameter is NULL"; break;
-    case VP8_ENC_ERROR_INVALID_CONFIGURATION:
-        pszErrorMsg = "Configuration is invalid"; break;
-    case VP8_ENC_ERROR_BAD_DIMENSION:
-        pszErrorMsg = "Picture has invalid width/height"; break;
-    case VP8_ENC_ERROR_PARTITION0_OVERFLOW:
-        pszErrorMsg = "Partition is bigger than 512k. Try using less "
-            "SEGMENTS, or increase PARTITION_LIMIT value";
-        break;
-    case VP8_ENC_ERROR_PARTITION_OVERFLOW:
-        pszErrorMsg = "Partition is bigger than 16M";
-        break;
-    case VP8_ENC_ERROR_BAD_WRITE:
-        pszErrorMsg = "Error while fludshing bytes"; break;
-    case VP8_ENC_ERROR_FILE_TOO_BIG:
-        pszErrorMsg = "File is bigger than 4G"; break;
-    case VP8_ENC_ERROR_USER_ABORT:
-        pszErrorMsg = "User interrupted";
-        break;
-    default:
-        TIFFErrorExt(tif->tif_clientdata, module,
-                "WebPEncode returned an unknown error code: %d",
-                sp->sPicture.error_code);
-        pszErrorMsg = "Unknown WebP error type.";
-        break;
-    }
-    TIFFErrorExt(tif->tif_clientdata, module,
-             "WebPEncode() failed : %s", pszErrorMsg);
+        const char *pszErrorMsg = NULL;
+        switch (sp->sPicture.error_code)
+        {
+            case VP8_ENC_ERROR_OUT_OF_MEMORY:
+                pszErrorMsg = "Out of memory";
+                break;
+            case VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY:
+                pszErrorMsg = "Out of memory while flushing bits";
+                break;
+            case VP8_ENC_ERROR_NULL_PARAMETER:
+                pszErrorMsg = "A pointer parameter is NULL";
+                break;
+            case VP8_ENC_ERROR_INVALID_CONFIGURATION:
+                pszErrorMsg = "Configuration is invalid";
+                break;
+            case VP8_ENC_ERROR_BAD_DIMENSION:
+                pszErrorMsg = "Picture has invalid width/height";
+                break;
+            case VP8_ENC_ERROR_PARTITION0_OVERFLOW:
+                pszErrorMsg = "Partition is bigger than 512k. Try using less "
+                              "SEGMENTS, or increase PARTITION_LIMIT value";
+                break;
+            case VP8_ENC_ERROR_PARTITION_OVERFLOW:
+                pszErrorMsg = "Partition is bigger than 16M";
+                break;
+            case VP8_ENC_ERROR_BAD_WRITE:
+                pszErrorMsg = "Error while fludshing bytes";
+                break;
+            case VP8_ENC_ERROR_FILE_TOO_BIG:
+                pszErrorMsg = "File is bigger than 4G";
+                break;
+            case VP8_ENC_ERROR_USER_ABORT:
+                pszErrorMsg = "User interrupted";
+                break;
+            default:
+                TIFFErrorExtR(tif, module,
+                              "WebPEncode returned an unknown error code: %d",
+                              sp->sPicture.error_code);
+                pszErrorMsg = "Unknown WebP error type.";
+                break;
+        }
+        TIFFErrorExtR(tif, module, "WebPEncode() failed : %s", pszErrorMsg);
 #else
-    TIFFErrorExt(tif->tif_clientdata, module,
-             "Error in WebPEncode()");
+        TIFFErrorExtR(tif, module, "Error in WebPEncode()");
 #endif
-    return 0;
-  }
+        return 0;
+    }
 
-  sp->sPicture.custom_ptr = NULL;
+    sp->sPicture.custom_ptr = NULL;
 
-  if (!TIFFFlushData1(tif))
-  {
-    TIFFErrorExt(tif->tif_clientdata, module,
-      "Error flushing TIFF WebP encoder.");
-    return 0;
-  }
+    if (!TIFFFlushData1(tif))
+    {
+        TIFFErrorExtR(tif, module, "Error flushing TIFF WebP encoder.");
+        return 0;
+    }
 
-  return 1;
+    return 1;
 }
 
-static void
-TWebPCleanup(TIFF* tif)
+static void TWebPCleanup(TIFF *tif)
 {
-  WebPState* sp = LState(tif);
+    WebPState *sp = LState(tif);
 
-  assert(sp != 0);
+    assert(sp != 0);
 
-  tif->tif_tagmethods.vgetfield = sp->vgetparent;
-  tif->tif_tagmethods.vsetfield = sp->vsetparent;
+    tif->tif_tagmethods.vgetfield = sp->vgetparent;
+    tif->tif_tagmethods.vsetfield = sp->vsetparent;
 
-  if (sp->state & LSTATE_INIT_ENCODE) {
-    WebPPictureFree(&sp->sPicture);
-  }
+    if (sp->state & LSTATE_INIT_ENCODE)
+    {
+        WebPPictureFree(&sp->sPicture);
+    }
 
-  if (sp->psDecoder != NULL) {
-    WebPIDelete(sp->psDecoder);
-    WebPFreeDecBuffer(&sp->sDecBuffer);
-    sp->psDecoder = NULL;
-    sp->last_y = 0;
-  }
-  
-  if (sp->pBuffer != NULL) {
-      _TIFFfree(sp->pBuffer);
-      sp->pBuffer = NULL;    
-  }
+    if (sp->psDecoder != NULL)
+    {
+        WebPIDelete(sp->psDecoder);
+        WebPFreeDecBuffer(&sp->sDecBuffer);
+        sp->psDecoder = NULL;
+        sp->last_y = 0;
+    }
+
+    if (sp->pBuffer != NULL)
+    {
+        _TIFFfreeExt(tif, sp->pBuffer);
+        sp->pBuffer = NULL;
+    }
 
-  _TIFFfree(tif->tif_data);
-  tif->tif_data = NULL;
+    _TIFFfreeExt(tif, tif->tif_data);
+    tif->tif_data = NULL;
 
-  _TIFFSetDefaultCompressionState(tif);
+    _TIFFSetDefaultCompressionState(tif);
 }
 
-static int
-TWebPVSetField(TIFF* tif, uint32 tag, va_list ap)
+static int TWebPVSetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	static const char module[] = "WebPVSetField";
-  WebPState* sp = LState(tif);
-
-  switch (tag) {
-  case TIFFTAG_WEBP_LEVEL:
-    sp->quality_level = (int) va_arg(ap, int);
-    if( sp->quality_level <= 0 ||
-        sp->quality_level > 100.0f ) {
-      TIFFWarningExt(tif->tif_clientdata, module,
-                     "WEBP_LEVEL should be between 1 and 100");
-    }
-    return 1;
-  case TIFFTAG_WEBP_LOSSLESS:
-    #if WEBP_ENCODER_ABI_VERSION >= 0x0100
-    sp->lossless = va_arg(ap, int);
-    if (sp->lossless){
-      sp->quality_level = 100;
+    static const char module[] = "WebPVSetField";
+    WebPState *sp = LState(tif);
+
+    switch (tag)
+    {
+        case TIFFTAG_WEBP_LEVEL:
+            sp->quality_level = (int)va_arg(ap, int);
+            if (sp->quality_level <= 0 || sp->quality_level > 100.0f)
+            {
+                TIFFWarningExtR(tif, module,
+                                "WEBP_LEVEL should be between 1 and 100");
+            }
+            return 1;
+        case TIFFTAG_WEBP_LOSSLESS:
+#if WEBP_ENCODER_ABI_VERSION >= 0x0100
+            sp->lossless = va_arg(ap, int);
+            if (sp->lossless)
+            {
+                sp->quality_level = 100;
+            }
+            return 1;
+#else
+            TIFFErrorExtR(
+                tif, module,
+                "Need to upgrade WEBP driver, this version doesn't support "
+                "lossless compression.");
+            return 0;
+#endif
+        case TIFFTAG_WEBP_LOSSLESS_EXACT:
+#if WEBP_ENCODER_ABI_VERSION >= 0x0209
+            sp->lossless_exact = va_arg(ap, int);
+            return 1;
+#else
+            TIFFErrorExtR(
+                tif, module,
+                "Need to upgrade WEBP driver, this version doesn't support "
+                "lossless compression.");
+            return 0;
+#endif
+        default:
+            return (*sp->vsetparent)(tif, tag, ap);
     }
-    return 1;
-    #else
-      TIFFErrorExt(tif->tif_clientdata, module,
-                  "Need to upgrade WEBP driver, this version doesn't support "
-                  "lossless compression.");
-      return 0;
-    #endif 
-  default:
-    return (*sp->vsetparent)(tif, tag, ap);
-  }
-  /*NOTREACHED*/
+    /*NOTREACHED*/
 }
 
-static int
-TWebPVGetField(TIFF* tif, uint32 tag, va_list ap)
+static int TWebPVGetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-  WebPState* sp = LState(tif);
-
-  switch (tag) {
-  case TIFFTAG_WEBP_LEVEL:
-    *va_arg(ap, int*) = sp->quality_level;
-    break;
-  case TIFFTAG_WEBP_LOSSLESS:
-    *va_arg(ap, int*) = sp->lossless;
-    break;
-  default:
-    return (*sp->vgetparent)(tif, tag, ap);
-  }
-  return 1;
+    WebPState *sp = LState(tif);
+
+    switch (tag)
+    {
+        case TIFFTAG_WEBP_LEVEL:
+            *va_arg(ap, int *) = sp->quality_level;
+            break;
+        case TIFFTAG_WEBP_LOSSLESS:
+            *va_arg(ap, int *) = sp->lossless;
+            break;
+        case TIFFTAG_WEBP_LOSSLESS_EXACT:
+            *va_arg(ap, int *) = sp->lossless_exact;
+            break;
+        default:
+            return (*sp->vgetparent)(tif, tag, ap);
+    }
+    return 1;
 }
 
 static const TIFFField TWebPFields[] = {
-  { TIFFTAG_WEBP_LEVEL, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
-    TIFF_SETGET_UNDEFINED,
-    FIELD_PSEUDO, TRUE, FALSE, "WEBP quality", NULL },
-  { TIFFTAG_WEBP_LOSSLESS, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
-    TIFF_SETGET_UNDEFINED,
-    FIELD_PSEUDO, TRUE, FALSE, "WEBP lossless/lossy", NULL
-  },
+    {TIFFTAG_WEBP_LEVEL, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "WEBP quality", NULL},
+    {TIFFTAG_WEBP_LOSSLESS, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "WEBP lossless/lossy",
+     NULL},
+    {TIFFTAG_WEBP_LOSSLESS_EXACT, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "WEBP exact lossless",
+     NULL},
 };
 
-int
-TIFFInitWebP(TIFF* tif, int scheme)
+int TIFFInitWebP(TIFF *tif, int scheme)
 {
-  static const char module[] = "TIFFInitWebP";
-  WebPState* sp;
-
-  (void)scheme;
-  assert( scheme == COMPRESSION_WEBP );
-
-  /*
-  * Merge codec-specific tag information.
-  */
-  if ( !_TIFFMergeFields(tif, TWebPFields, TIFFArrayCount(TWebPFields)) ) {
-    TIFFErrorExt(tif->tif_clientdata, module,
-                "Merging WebP codec-specific tags failed");
-    return 0;
-  }
-
-  /*
-  * Allocate state block so tag methods have storage to record values.
-  */
-  tif->tif_data = (uint8*) _TIFFmalloc(sizeof(WebPState));
-  if (tif->tif_data == NULL)
-    goto bad;
-  sp = LState(tif);
-
-  /*
-  * Override parent get/set field methods.
-  */
-  sp->vgetparent = tif->tif_tagmethods.vgetfield;
-  tif->tif_tagmethods.vgetfield = TWebPVGetField;	/* hook for codec tags */
-  sp->vsetparent = tif->tif_tagmethods.vsetfield;
-  tif->tif_tagmethods.vsetfield = TWebPVSetField;	/* hook for codec tags */
-
-  /* Default values for codec-specific fields */
-  sp->quality_level = 75;		/* default comp. level */
-  sp->lossless = 0; /* default to false */
-  sp->state = 0;
-  sp->nSamples = 0;
-  sp->psDecoder = NULL;
-  sp->last_y = 0;
-  
-  sp->buffer_offset = 0;
-  sp->pBuffer = NULL;
-
-  /*
-  * Install codec methods.
-  * Notes:
-  * encoderow is not supported
-  */
-  tif->tif_fixuptags = TWebPFixupTags;
-  tif->tif_setupdecode = TWebPSetupDecode;
-  tif->tif_predecode = TWebPPreDecode;
-  tif->tif_decoderow = TWebPDecode;
-  tif->tif_decodestrip = TWebPDecode;
-  tif->tif_decodetile = TWebPDecode;
-  tif->tif_setupencode = TWebPSetupEncode;
-  tif->tif_preencode = TWebPPreEncode;
-  tif->tif_postencode = TWebPPostEncode;
-  tif->tif_encoderow = TWebPEncode;
-  tif->tif_encodestrip = TWebPEncode;
-  tif->tif_encodetile = TWebPEncode;
-  tif->tif_cleanup = TWebPCleanup;
-
-  return 1;
+    static const char module[] = "TIFFInitWebP";
+    WebPState *sp;
+
+    (void)scheme;
+    assert(scheme == COMPRESSION_WEBP);
+
+    /*
+     * Merge codec-specific tag information.
+     */
+    if (!_TIFFMergeFields(tif, TWebPFields, TIFFArrayCount(TWebPFields)))
+    {
+        TIFFErrorExtR(tif, module, "Merging WebP codec-specific tags failed");
+        return 0;
+    }
+
+    /*
+     * Allocate state block so tag methods have storage to record values.
+     */
+    tif->tif_data = (uint8_t *)_TIFFmallocExt(tif, sizeof(WebPState));
+    if (tif->tif_data == NULL)
+        goto bad;
+    sp = LState(tif);
+
+    /*
+     * Override parent get/set field methods.
+     */
+    sp->vgetparent = tif->tif_tagmethods.vgetfield;
+    tif->tif_tagmethods.vgetfield = TWebPVGetField; /* hook for codec tags */
+    sp->vsetparent = tif->tif_tagmethods.vsetfield;
+    tif->tif_tagmethods.vsetfield = TWebPVSetField; /* hook for codec tags */
+
+    /* Default values for codec-specific fields */
+    sp->quality_level = 75; /* default comp. level */
+    sp->lossless = 0;       /* default to false */
+    sp->lossless_exact = 1; /* exact lossless mode (if lossless enabled) */
+    sp->state = 0;
+    sp->nSamples = 0;
+    sp->psDecoder = NULL;
+    sp->last_y = 0;
+
+    sp->buffer_offset = 0;
+    sp->pBuffer = NULL;
+
+    /*
+     * Install codec methods.
+     * Notes:
+     * encoderow is not supported
+     */
+    tif->tif_fixuptags = TWebPFixupTags;
+    tif->tif_setupdecode = TWebPSetupDecode;
+    tif->tif_predecode = TWebPPreDecode;
+    tif->tif_decoderow = TWebPDecode;
+    tif->tif_decodestrip = TWebPDecode;
+    tif->tif_decodetile = TWebPDecode;
+    tif->tif_setupencode = TWebPSetupEncode;
+    tif->tif_preencode = TWebPPreEncode;
+    tif->tif_postencode = TWebPPostEncode;
+    tif->tif_encoderow = TWebPEncode;
+    tif->tif_encodestrip = TWebPEncode;
+    tif->tif_encodetile = TWebPEncode;
+    tif->tif_cleanup = TWebPCleanup;
+
+    return 1;
 bad:
-  TIFFErrorExt(tif->tif_clientdata, module,
-  	     "No space for WebP state block");
-  return 0;
+    TIFFErrorExtR(tif, module, "No space for WebP state block");
+    return 0;
 }
 
 #endif /* WEBP_SUPPORT */
diff --git a/3rdparty/libtiff/tif_win32.c b/3rdparty/libtiff/tif_win32.c
index 89645693940f..1a6b86dffb5a 100644
--- a/3rdparty/libtiff/tif_win32.c
+++ b/3rdparty/libtiff/tif_win32.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -28,6 +28,7 @@
  */
 
 #include "tiffiop.h"
+#include <stdlib.h>
 
 #include <windows.h>
 
@@ -54,121 +55,111 @@ static inline thandle_t thandle_from_int(int ifd)
     return (thandle_t)(intptr_t)ifd;
 }
 
-static inline int thandle_to_int(thandle_t fd)
-{
-    return (int)(intptr_t)fd;
-}
+static inline int thandle_to_int(thandle_t fd) { return (int)(intptr_t)fd; }
 
-static tmsize_t
-_tiffReadProc(thandle_t fd, void* buf, tmsize_t size)
+static tmsize_t _tiffReadProc(thandle_t fd, void *buf, tmsize_t size)
 {
-	/* tmsize_t is 64bit on 64bit systems, but the WinAPI ReadFile takes
-	 * 32bit sizes, so we loop through the data in suitable 32bit sized
-	 * chunks */
-	uint8* ma;
-	uint64 mb;
-	DWORD n;
-	DWORD o;
-	tmsize_t p;
-	ma=(uint8*)buf;
-	mb=size;
-	p=0;
-	while (mb>0)
-	{
-		n=0x80000000UL;
-		if ((uint64)n>mb)
-			n=(DWORD)mb;
-		if (!ReadFile(fd,(LPVOID)ma,n,&o,NULL))
-			return(0);
-		ma+=o;
-		mb-=o;
-		p+=o;
-		if (o!=n)
-			break;
-	}
-	return(p);
+    /* tmsize_t is 64bit on 64bit systems, but the WinAPI ReadFile takes
+     * 32bit sizes, so we loop through the data in suitable 32bit sized
+     * chunks */
+    uint8_t *ma;
+    uint64_t mb;
+    DWORD n;
+    DWORD o;
+    tmsize_t p;
+    ma = (uint8_t *)buf;
+    mb = size;
+    p = 0;
+    while (mb > 0)
+    {
+        n = 0x80000000UL;
+        if ((uint64_t)n > mb)
+            n = (DWORD)mb;
+        if (!ReadFile(fd, (LPVOID)ma, n, &o, NULL))
+            return (0);
+        ma += o;
+        mb -= o;
+        p += o;
+        if (o != n)
+            break;
+    }
+    return (p);
 }
 
-static tmsize_t
-_tiffWriteProc(thandle_t fd, void* buf, tmsize_t size)
+static tmsize_t _tiffWriteProc(thandle_t fd, void *buf, tmsize_t size)
 {
-	/* tmsize_t is 64bit on 64bit systems, but the WinAPI WriteFile takes
-	 * 32bit sizes, so we loop through the data in suitable 32bit sized
-	 * chunks */
-	uint8* ma;
-	uint64 mb;
-	DWORD n;
-	DWORD o;
-	tmsize_t p;
-	ma=(uint8*)buf;
-	mb=size;
-	p=0;
-	while (mb>0)
-	{
-		n=0x80000000UL;
-		if ((uint64)n>mb)
-			n=(DWORD)mb;
-		if (!WriteFile(fd,(LPVOID)ma,n,&o,NULL))
-			return(0);
-		ma+=o;
-		mb-=o;
-		p+=o;
-		if (o!=n)
-			break;
-	}
-	return(p);
+    /* tmsize_t is 64bit on 64bit systems, but the WinAPI WriteFile takes
+     * 32bit sizes, so we loop through the data in suitable 32bit sized
+     * chunks */
+    uint8_t *ma;
+    uint64_t mb;
+    DWORD n;
+    DWORD o;
+    tmsize_t p;
+    ma = (uint8_t *)buf;
+    mb = size;
+    p = 0;
+    while (mb > 0)
+    {
+        n = 0x80000000UL;
+        if ((uint64_t)n > mb)
+            n = (DWORD)mb;
+        if (!WriteFile(fd, (LPVOID)ma, n, &o, NULL))
+            return (0);
+        ma += o;
+        mb -= o;
+        p += o;
+        if (o != n)
+            break;
+    }
+    return (p);
 }
 
-static uint64
-_tiffSeekProc(thandle_t fd, uint64 off, int whence)
+static uint64_t _tiffSeekProc(thandle_t fd, uint64_t off, int whence)
 {
-	LARGE_INTEGER offli;
-	DWORD dwMoveMethod;
-	offli.QuadPart = off;
-	switch(whence)
-	{
-		case SEEK_SET:
-			dwMoveMethod = FILE_BEGIN;
-			break;
-		case SEEK_CUR:
-			dwMoveMethod = FILE_CURRENT;
-			break;
-		case SEEK_END:
-			dwMoveMethod = FILE_END;
-			break;
-		default:
-			dwMoveMethod = FILE_BEGIN;
-			break;
-	}
-	offli.LowPart=SetFilePointer(fd,offli.LowPart,&offli.HighPart,dwMoveMethod);
-	if ((offli.LowPart==INVALID_SET_FILE_POINTER)&&(GetLastError()!=NO_ERROR))
-		offli.QuadPart=0;
-	return(offli.QuadPart);
+    LARGE_INTEGER offli;
+    DWORD dwMoveMethod;
+    offli.QuadPart = off;
+    switch (whence)
+    {
+        case SEEK_SET:
+            dwMoveMethod = FILE_BEGIN;
+            break;
+        case SEEK_CUR:
+            dwMoveMethod = FILE_CURRENT;
+            break;
+        case SEEK_END:
+            dwMoveMethod = FILE_END;
+            break;
+        default:
+            dwMoveMethod = FILE_BEGIN;
+            break;
+    }
+    offli.LowPart =
+        SetFilePointer(fd, offli.LowPart, &offli.HighPart, dwMoveMethod);
+    if ((offli.LowPart == INVALID_SET_FILE_POINTER) &&
+        (GetLastError() != NO_ERROR))
+        offli.QuadPart = 0;
+    return (offli.QuadPart);
 }
 
-static int
-_tiffCloseProc(thandle_t fd)
-{
-	return (CloseHandle(fd) ? 0 : -1);
-}
+static int _tiffCloseProc(thandle_t fd) { return (CloseHandle(fd) ? 0 : -1); }
 
-static uint64
-_tiffSizeProc(thandle_t fd)
+static uint64_t _tiffSizeProc(thandle_t fd)
 {
-	LARGE_INTEGER m;
-	if (GetFileSizeEx(fd,&m))
-		return(m.QuadPart);
-	else
-		return(0);
+    LARGE_INTEGER m;
+    if (GetFileSizeEx(fd, &m))
+        return (m.QuadPart);
+    else
+        return (0);
 }
 
-static int
-_tiffDummyMapProc(thandle_t fd, void** pbase, toff_t* psize)
+static int _tiffDummyMapProc(thandle_t fd, void **pbase, toff_t *psize)
 {
-	(void) fd;
-	(void) pbase;
-	(void) psize;
-	return (0);
+    (void)fd;
+    (void)pbase;
+    (void)psize;
+    return (0);
 }
 
 /*
@@ -182,45 +173,42 @@ _tiffDummyMapProc(thandle_t fd, void** pbase, toff_t* psize)
  * This removes a nasty OS dependency and cures a problem
  * with Visual C++ 5.0
  */
-static int
-_tiffMapProc(thandle_t fd, void** pbase, toff_t* psize)
+static int _tiffMapProc(thandle_t fd, void **pbase, toff_t *psize)
 {
-	uint64 size;
-	tmsize_t sizem;
-	HANDLE hMapFile;
-
-	size = _tiffSizeProc(fd);
-	sizem = (tmsize_t)size;
-	if (!size || (uint64)sizem!=size)
-		return (0);
-
-	/* By passing in 0 for the maximum file size, it specifies that we
-	   create a file mapping object for the full file size. */
-	hMapFile = CreateFileMapping(fd, NULL, PAGE_READONLY, 0, 0, NULL);
-	if (hMapFile == NULL)
-		return (0);
-	*pbase = MapViewOfFile(hMapFile, FILE_MAP_READ, 0, 0, 0);
-	CloseHandle(hMapFile);
-	if (*pbase == NULL)
-		return (0);
-	*psize = size;
-	return(1);
+    uint64_t size;
+    tmsize_t sizem;
+    HANDLE hMapFile;
+
+    size = _tiffSizeProc(fd);
+    sizem = (tmsize_t)size;
+    if (!size || (uint64_t)sizem != size)
+        return (0);
+
+    /* By passing in 0 for the maximum file size, it specifies that we
+       create a file mapping object for the full file size. */
+    hMapFile = CreateFileMapping(fd, NULL, PAGE_READONLY, 0, 0, NULL);
+    if (hMapFile == NULL)
+        return (0);
+    *pbase = MapViewOfFile(hMapFile, FILE_MAP_READ, 0, 0, 0);
+    CloseHandle(hMapFile);
+    if (*pbase == NULL)
+        return (0);
+    *psize = size;
+    return (1);
 }
 
-static void
-_tiffDummyUnmapProc(thandle_t fd, void* base, toff_t size)
+static void _tiffDummyUnmapProc(thandle_t fd, void *base, toff_t size)
 {
-	(void) fd;
-	(void) base;
-	(void) size;
+    (void)fd;
+    (void)base;
+    (void)size;
 }
 
-static void
-_tiffUnmapProc(thandle_t fd, void* base, toff_t size)
+static void _tiffUnmapProc(thandle_t fd, void *base, toff_t size)
 {
-	(void) fd;
-	(void) size;
-	UnmapViewOfFile(base);
+    (void)fd;
+    (void)size;
+    UnmapViewOfFile(base);
 }
 
 /*
@@ -228,29 +216,36 @@ _tiffUnmapProc(thandle_t fd, void* base, toff_t size)
  * Note that TIFFFdOpen and TIFFOpen recognise the character 'u' in the mode
  * string, which forces the file to be opened unmapped.
  */
-TIFF*
-TIFFFdOpen(int ifd, const char* name, const char* mode)
+TIFF *TIFFFdOpen(int ifd, const char *name, const char *mode)
 {
-	TIFF* tif;
-	int fSuppressMap;
-	int m;
-	fSuppressMap=0;
-	for (m=0; mode[m]!=0; m++)
-	{
-		if (mode[m]=='u')
-		{
-			fSuppressMap=1;
-			break;
-		}
-	}
-	tif = TIFFClientOpen(name, mode, thandle_from_int(ifd),
-			_tiffReadProc, _tiffWriteProc,
-			_tiffSeekProc, _tiffCloseProc, _tiffSizeProc,
-			fSuppressMap ? _tiffDummyMapProc : _tiffMapProc,
-			fSuppressMap ? _tiffDummyUnmapProc : _tiffUnmapProc);
-	if (tif)
-		tif->tif_fd = ifd;
-	return (tif);
+    return TIFFFdOpenExt(ifd, name, mode, NULL);
+}
+
+TIFF *TIFFFdOpenExt(int ifd, const char *name, const char *mode,
+                    TIFFOpenOptions *opts)
+{
+    TIFF *tif;
+    int fSuppressMap;
+    int m;
+
+    fSuppressMap = 0;
+    for (m = 0; mode[m] != 0; m++)
+    {
+        if (mode[m] == 'u')
+        {
+            fSuppressMap = 1;
+            break;
+        }
+    }
+
+    tif = TIFFClientOpenExt(
+        name, mode, thandle_from_int(ifd), _tiffReadProc, _tiffWriteProc,
+        _tiffSeekProc, _tiffCloseProc, _tiffSizeProc,
+        fSuppressMap ? _tiffDummyMapProc : _tiffMapProc,
+        fSuppressMap ? _tiffDummyUnmapProc : _tiffUnmapProc, opts);
+    if (tif)
+        tif->tif_fd = ifd;
+    return (tif);
 }
 
 #ifndef _WIN32_WCE
@@ -258,184 +253,190 @@ TIFFFdOpen(int ifd, const char* name, const char* mode)
 /*
  * Open a TIFF file for read/writing.
  */
-TIFF*
-TIFFOpen(const char* name, const char* mode)
+TIFF *TIFFOpen(const char *name, const char *mode)
+{
+    return TIFFOpenExt(name, mode, NULL);
+}
+
+TIFF *TIFFOpenExt(const char *name, const char *mode, TIFFOpenOptions *opts)
 {
-	static const char module[] = "TIFFOpen";
-	thandle_t fd;
-	int m;
-	DWORD dwMode;
-	TIFF* tif;
-
-	m = _TIFFgetMode(mode, module);
-
-	switch(m) {
-		case O_RDONLY:			dwMode = OPEN_EXISTING; break;
-		case O_RDWR:			dwMode = OPEN_ALWAYS;   break;
-		case O_RDWR|O_CREAT:		dwMode = OPEN_ALWAYS;   break;
-		case O_RDWR|O_TRUNC:		dwMode = CREATE_ALWAYS; break;
-		case O_RDWR|O_CREAT|O_TRUNC:	dwMode = CREATE_ALWAYS; break;
-		default:			return ((TIFF*)0);
-	}
-        
-	fd = (thandle_t)CreateFileA(name,
-		(m == O_RDONLY)?GENERIC_READ:(GENERIC_READ | GENERIC_WRITE),
-		FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, dwMode,
-		(m == O_RDONLY)?FILE_ATTRIBUTE_READONLY:FILE_ATTRIBUTE_NORMAL,
-		NULL);
-	if (fd == INVALID_HANDLE_VALUE) {
-		TIFFErrorExt(0, module, "%s: Cannot open", name);
-		return ((TIFF *)0);
-	}
-
-	tif = TIFFFdOpen(thandle_to_int(fd), name, mode);
-	if(!tif)
-		CloseHandle(fd);
-	return tif;
+    static const char module[] = "TIFFOpen";
+    thandle_t fd;
+    int m;
+    DWORD dwMode;
+    TIFF *tif;
+
+    m = _TIFFgetMode(opts, NULL, mode, module);
+
+    switch (m)
+    {
+        case O_RDONLY:
+            dwMode = OPEN_EXISTING;
+            break;
+        case O_RDWR:
+            dwMode = OPEN_EXISTING;
+            break;
+        case O_RDWR | O_CREAT:
+            dwMode = OPEN_ALWAYS;
+            break;
+        case O_RDWR | O_TRUNC:
+            dwMode = CREATE_ALWAYS;
+            break;
+        case O_RDWR | O_CREAT | O_TRUNC:
+            dwMode = CREATE_ALWAYS;
+            break;
+        default:
+            return ((TIFF *)0);
+    }
+
+    fd = (thandle_t)CreateFileA(
+        name, (m == O_RDONLY) ? GENERIC_READ : (GENERIC_READ | GENERIC_WRITE),
+        FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, dwMode,
+        (m == O_RDONLY) ? FILE_ATTRIBUTE_READONLY : FILE_ATTRIBUTE_NORMAL,
+        NULL);
+    if (fd == INVALID_HANDLE_VALUE)
+    {
+        _TIFFErrorEarly(opts, NULL, module, "%s: Cannot open", name);
+        return ((TIFF *)0);
+    }
+
+    tif = TIFFFdOpenExt(thandle_to_int(fd), name, mode, opts);
+    if (!tif)
+        CloseHandle(fd);
+    return tif;
 }
 
 /*
  * Open a TIFF file with a Unicode filename, for read/writing.
  */
-TIFF*
-TIFFOpenW(const wchar_t* name, const char* mode)
+TIFF *TIFFOpenW(const wchar_t *name, const char *mode)
 {
-	static const char module[] = "TIFFOpenW";
-	thandle_t fd;
-	int m;
-	DWORD dwMode;
-	int mbsize;
-	char *mbname;
-	TIFF *tif;
-
-	m = _TIFFgetMode(mode, module);
-
-	switch(m) {
-		case O_RDONLY:			dwMode = OPEN_EXISTING; break;
-		case O_RDWR:			dwMode = OPEN_ALWAYS;   break;
-		case O_RDWR|O_CREAT:		dwMode = OPEN_ALWAYS;   break;
-		case O_RDWR|O_TRUNC:		dwMode = CREATE_ALWAYS; break;
-		case O_RDWR|O_CREAT|O_TRUNC:	dwMode = CREATE_ALWAYS; break;
-		default:			return ((TIFF*)0);
-	}
-
-	fd = (thandle_t)CreateFileW(name,
-		(m == O_RDONLY)?GENERIC_READ:(GENERIC_READ|GENERIC_WRITE),
-		FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, dwMode,
-		(m == O_RDONLY)?FILE_ATTRIBUTE_READONLY:FILE_ATTRIBUTE_NORMAL,
-		NULL);
-	if (fd == INVALID_HANDLE_VALUE) {
-		TIFFErrorExt(0, module, "%S: Cannot open", name);
-		return ((TIFF *)0);
-	}
-
-	mbname = NULL;
-	mbsize = WideCharToMultiByte(CP_ACP, 0, name, -1, NULL, 0, NULL, NULL);
-	if (mbsize > 0) {
-		mbname = (char *)_TIFFmalloc(mbsize);
-		if (!mbname) {
-			TIFFErrorExt(0, module,
-			"Can't allocate space for filename conversion buffer");
-			return ((TIFF*)0);
-		}
-
-		WideCharToMultiByte(CP_ACP, 0, name, -1, mbname, mbsize,
-				    NULL, NULL);
-	}
-
-	tif = TIFFFdOpen(thandle_to_int(fd),
-			 (mbname != NULL) ? mbname : "<unknown>", mode);
-	if(!tif)
-		CloseHandle(fd);
-
-	_TIFFfree(mbname);
-
-	return tif;
+    return TIFFOpenWExt(name, mode, NULL);
+}
+
+TIFF *TIFFOpenWExt(const wchar_t *name, const char *mode, TIFFOpenOptions *opts)
+{
+    static const char module[] = "TIFFOpenW";
+    thandle_t fd;
+    int m;
+    DWORD dwMode;
+    int mbsize;
+    char *mbname;
+    TIFF *tif;
+
+    m = _TIFFgetMode(opts, NULL, mode, module);
+
+    switch (m)
+    {
+        case O_RDONLY:
+            dwMode = OPEN_EXISTING;
+            break;
+        case O_RDWR:
+            dwMode = OPEN_EXISTING;
+            break;
+        case O_RDWR | O_CREAT:
+            dwMode = OPEN_ALWAYS;
+            break;
+        case O_RDWR | O_TRUNC:
+            dwMode = CREATE_ALWAYS;
+            break;
+        case O_RDWR | O_CREAT | O_TRUNC:
+            dwMode = CREATE_ALWAYS;
+            break;
+        default:
+            return ((TIFF *)0);
+    }
+
+    fd = (thandle_t)CreateFileW(
+        name, (m == O_RDONLY) ? GENERIC_READ : (GENERIC_READ | GENERIC_WRITE),
+        FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, dwMode,
+        (m == O_RDONLY) ? FILE_ATTRIBUTE_READONLY : FILE_ATTRIBUTE_NORMAL,
+        NULL);
+    if (fd == INVALID_HANDLE_VALUE)
+    {
+        _TIFFErrorEarly(opts, NULL, module, "%S: Cannot open", name);
+        return ((TIFF *)0);
+    }
+
+    mbname = NULL;
+    mbsize = WideCharToMultiByte(CP_ACP, 0, name, -1, NULL, 0, NULL, NULL);
+    if (mbsize > 0)
+    {
+        mbname = (char *)_TIFFmalloc(mbsize);
+        if (!mbname)
+        {
+            _TIFFErrorEarly(
+                opts, NULL, module,
+                "Can't allocate space for filename conversion buffer");
+            return ((TIFF *)0);
+        }
+
+        WideCharToMultiByte(CP_ACP, 0, name, -1, mbname, mbsize, NULL, NULL);
+    }
+
+    tif = TIFFFdOpenExt(thandle_to_int(fd),
+                        (mbname != NULL) ? mbname : "<unknown>", mode, opts);
+    if (!tif)
+        CloseHandle(fd);
+
+    _TIFFfree(mbname);
+
+    return tif;
 }
 
 #endif /* ndef _WIN32_WCE */
 
-void*
-_TIFFmalloc(tmsize_t s)
+void *_TIFFmalloc(tmsize_t s)
 {
-        if (s == 0)
-                return ((void *) NULL);
+    if (s == 0)
+        return ((void *)NULL);
 
-	return (malloc((size_t) s));
+    return (malloc((size_t)s));
 }
 
-void* _TIFFcalloc(tmsize_t nmemb, tmsize_t siz)
+void *_TIFFcalloc(tmsize_t nmemb, tmsize_t siz)
 {
-    if( nmemb == 0 || siz == 0 )
-        return ((void *) NULL);
+    if (nmemb == 0 || siz == 0)
+        return ((void *)NULL);
 
-    return calloc((size_t) nmemb, (size_t)siz);
+    return calloc((size_t)nmemb, (size_t)siz);
 }
 
-void
-_TIFFfree(void* p)
-{
-	free(p);
-}
+void _TIFFfree(void *p) { free(p); }
 
-void*
-_TIFFrealloc(void* p, tmsize_t s)
-{
-	return (realloc(p, (size_t) s));
-}
+void *_TIFFrealloc(void *p, tmsize_t s) { return (realloc(p, (size_t)s)); }
 
-void
-_TIFFmemset(void* p, int v, tmsize_t c)
-{
-	memset(p, v, (size_t) c);
-}
+void _TIFFmemset(void *p, int v, tmsize_t c) { memset(p, v, (size_t)c); }
 
-void
-_TIFFmemcpy(void* d, const void* s, tmsize_t c)
+void _TIFFmemcpy(void *d, const void *s, tmsize_t c)
 {
-	memcpy(d, s, (size_t) c);
+    memcpy(d, s, (size_t)c);
 }
 
-int
-_TIFFmemcmp(const void* p1, const void* p2, tmsize_t c)
+int _TIFFmemcmp(const void *p1, const void *p2, tmsize_t c)
 {
-	return (memcmp(p1, p2, (size_t) c));
+    return (memcmp(p1, p2, (size_t)c));
 }
 
 #ifndef _WIN32_WCE
 
-#if (_MSC_VER < 1500)
-#  define vsnprintf _vsnprintf
-#endif
-
-static void
-Win32WarningHandler(const char* module, const char* fmt, va_list ap)
+static void Win32WarningHandler(const char *module, const char *fmt, va_list ap)
 {
-	if (module != NULL)
-		fprintf(stderr, "%s: ", module);
-	fprintf(stderr, "Warning, ");
-	vfprintf(stderr, fmt, ap);
-	fprintf(stderr, ".\n");
+    if (module != NULL)
+        fprintf(stderr, "%s: ", module);
+    fprintf(stderr, "Warning, ");
+    vfprintf(stderr, fmt, ap);
+    fprintf(stderr, ".\n");
 }
 TIFFErrorHandler _TIFFwarningHandler = Win32WarningHandler;
 
-static void
-Win32ErrorHandler(const char* module, const char* fmt, va_list ap)
+static void Win32ErrorHandler(const char *module, const char *fmt, va_list ap)
 {
-	if (module != NULL)
-		fprintf(stderr, "%s: ", module);
-	vfprintf(stderr, fmt, ap);
-	fprintf(stderr, ".\n");
+    if (module != NULL)
+        fprintf(stderr, "%s: ", module);
+    vfprintf(stderr, fmt, ap);
+    fprintf(stderr, ".\n");
 }
 TIFFErrorHandler _TIFFerrorHandler = Win32ErrorHandler;
 
 #endif /* ndef _WIN32_WCE */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_write.c b/3rdparty/libtiff/tif_write.c
index 3af69ab4e747..6631a782fd3c 100644
--- a/3rdparty/libtiff/tif_write.c
+++ b/3rdparty/libtiff/tif_write.c
@@ -30,174 +30,178 @@
 #include "tiffiop.h"
 #include <stdio.h>
 
-#define STRIPINCR	20		/* expansion factor on strip array */
+#define STRIPINCR 20 /* expansion factor on strip array */
 
-#define WRITECHECKSTRIPS(tif, module)				\
-	(((tif)->tif_flags&TIFF_BEENWRITING) || TIFFWriteCheck((tif),0,module))
-#define WRITECHECKTILES(tif, module)				\
-	(((tif)->tif_flags&TIFF_BEENWRITING) || TIFFWriteCheck((tif),1,module))
-#define BUFFERCHECK(tif)					\
-	((((tif)->tif_flags & TIFF_BUFFERSETUP) && tif->tif_rawdata) ||	\
-	    TIFFWriteBufferSetup((tif), NULL, (tmsize_t) -1))
+#define WRITECHECKSTRIPS(tif, module)                                          \
+    (((tif)->tif_flags & TIFF_BEENWRITING) || TIFFWriteCheck((tif), 0, module))
+#define WRITECHECKTILES(tif, module)                                           \
+    (((tif)->tif_flags & TIFF_BEENWRITING) || TIFFWriteCheck((tif), 1, module))
+#define BUFFERCHECK(tif)                                                       \
+    ((((tif)->tif_flags & TIFF_BUFFERSETUP) && tif->tif_rawdata) ||            \
+     TIFFWriteBufferSetup((tif), NULL, (tmsize_t)-1))
 
-static int TIFFGrowStrips(TIFF* tif, uint32 delta, const char* module);
-static int TIFFAppendToStrip(TIFF* tif, uint32 strip, uint8* data, tmsize_t cc);
+static int TIFFGrowStrips(TIFF *tif, uint32_t delta, const char *module);
+static int TIFFAppendToStrip(TIFF *tif, uint32_t strip, uint8_t *data,
+                             tmsize_t cc);
 
-int
-TIFFWriteScanline(TIFF* tif, void* buf, uint32 row, uint16 sample)
+int TIFFWriteScanline(TIFF *tif, void *buf, uint32_t row, uint16_t sample)
 {
-	static const char module[] = "TIFFWriteScanline";
-	register TIFFDirectory *td;
-	int status, imagegrew = 0;
-	uint32 strip;
-
-	if (!WRITECHECKSTRIPS(tif, module))
-		return (-1);
-	/*
-	 * Handle delayed allocation of data buffer.  This
-	 * permits it to be sized more intelligently (using
-	 * directory information).
-	 */
-	if (!BUFFERCHECK(tif))
-		return (-1);
-        tif->tif_flags |= TIFF_BUF4WRITE; /* not strictly sure this is right*/
-
-	td = &tif->tif_dir;
-	/*
-	 * Extend image length if needed
-	 * (but only for PlanarConfig=1).
-	 */
-	if (row >= td->td_imagelength) {	/* extend image */
-		if (td->td_planarconfig == PLANARCONFIG_SEPARATE) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Can not change \"ImageLength\" when using separate planes");
-			return (-1);
-		}
-		td->td_imagelength = row+1;
-		imagegrew = 1;
-	}
-	/*
-	 * Calculate strip and check for crossings.
-	 */
-	if (td->td_planarconfig == PLANARCONFIG_SEPARATE) {
-		if (sample >= td->td_samplesperpixel) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "%lu: Sample out of range, max %lu",
-			    (unsigned long) sample, (unsigned long) td->td_samplesperpixel);
-			return (-1);
-		}
-		strip = sample*td->td_stripsperimage + row/td->td_rowsperstrip;
-	} else
-		strip = row / td->td_rowsperstrip;
-	/*
-	 * Check strip array to make sure there's space. We don't support
-	 * dynamically growing files that have data organized in separate
-	 * bitplanes because it's too painful.  In that case we require that
-	 * the imagelength be set properly before the first write (so that the
-	 * strips array will be fully allocated above).
-	 */
-	if (strip >= td->td_nstrips && !TIFFGrowStrips(tif, 1, module))
-		return (-1);
-	if (strip != tif->tif_curstrip) {
-		/*
-		 * Changing strips -- flush any data present.
-		 */
-		if (!TIFFFlushData(tif))
-			return (-1);
-		tif->tif_curstrip = strip;
-		/*
-		 * Watch out for a growing image.  The value of strips/image
-		 * will initially be 1 (since it can't be deduced until the
-		 * imagelength is known).
-		 */
-		if (strip >= td->td_stripsperimage && imagegrew)
-			td->td_stripsperimage =
-			    TIFFhowmany_32(td->td_imagelength,td->td_rowsperstrip);
-                if (td->td_stripsperimage == 0) {
-                        TIFFErrorExt(tif->tif_clientdata, module, "Zero strips per image");
-                        return (-1);
-                }
-		tif->tif_row =
-		    (strip % td->td_stripsperimage) * td->td_rowsperstrip;
-		if ((tif->tif_flags & TIFF_CODERSETUP) == 0) {
-			if (!(*tif->tif_setupencode)(tif))
-				return (-1);
-			tif->tif_flags |= TIFF_CODERSETUP;
-		}
-        
-		tif->tif_rawcc = 0;
-		tif->tif_rawcp = tif->tif_rawdata;
-
-		if( td->td_stripbytecount_p[strip] > 0 )
-		{
-			/* if we are writing over existing tiles, zero length */
-			td->td_stripbytecount_p[strip] = 0;
-
-			/* this forces TIFFAppendToStrip() to do a seek */
-			tif->tif_curoff = 0;
-		}
-
-		if (!(*tif->tif_preencode)(tif, sample))
-			return (-1);
-		tif->tif_flags |= TIFF_POSTENCODE;
-	}
-	/*
-	 * Ensure the write is either sequential or at the
-	 * beginning of a strip (or that we can randomly
-	 * access the data -- i.e. no encoding).
-	 */
-	if (row != tif->tif_row) {
-		if (row < tif->tif_row) {
-			/*
-			 * Moving backwards within the same strip:
-			 * backup to the start and then decode
-			 * forward (below).
-			 */
-			tif->tif_row = (strip % td->td_stripsperimage) *
-			    td->td_rowsperstrip;
-			tif->tif_rawcp = tif->tif_rawdata;
-		}
-		/*
-		 * Seek forward to the desired row.
-		 */
-		if (!(*tif->tif_seek)(tif, row - tif->tif_row))
-			return (-1);
-		tif->tif_row = row;
-	}
-
-	/* swab if needed - note that source buffer will be altered */
-	tif->tif_postdecode( tif, (uint8*) buf, tif->tif_scanlinesize );
-
-	status = (*tif->tif_encoderow)(tif, (uint8*) buf,
-	    tif->tif_scanlinesize, sample);
-
-        /* we are now poised at the beginning of the next row */
-	tif->tif_row = row + 1;
-	return (status);
+    static const char module[] = "TIFFWriteScanline";
+    register TIFFDirectory *td;
+    int status, imagegrew = 0;
+    uint32_t strip;
+
+    if (!WRITECHECKSTRIPS(tif, module))
+        return (-1);
+    /*
+     * Handle delayed allocation of data buffer.  This
+     * permits it to be sized more intelligently (using
+     * directory information).
+     */
+    if (!BUFFERCHECK(tif))
+        return (-1);
+    tif->tif_flags |= TIFF_BUF4WRITE; /* not strictly sure this is right*/
+
+    td = &tif->tif_dir;
+    /*
+     * Extend image length if needed
+     * (but only for PlanarConfig=1).
+     */
+    if (row >= td->td_imagelength)
+    { /* extend image */
+        if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
+        {
+            TIFFErrorExtR(
+                tif, module,
+                "Can not change \"ImageLength\" when using separate planes");
+            return (-1);
+        }
+        td->td_imagelength = row + 1;
+        imagegrew = 1;
+    }
+    /*
+     * Calculate strip and check for crossings.
+     */
+    if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
+    {
+        if (sample >= td->td_samplesperpixel)
+        {
+            TIFFErrorExtR(tif, module, "%lu: Sample out of range, max %lu",
+                          (unsigned long)sample,
+                          (unsigned long)td->td_samplesperpixel);
+            return (-1);
+        }
+        strip = sample * td->td_stripsperimage + row / td->td_rowsperstrip;
+    }
+    else
+        strip = row / td->td_rowsperstrip;
+    /*
+     * Check strip array to make sure there's space. We don't support
+     * dynamically growing files that have data organized in separate
+     * bitplanes because it's too painful.  In that case we require that
+     * the imagelength be set properly before the first write (so that the
+     * strips array will be fully allocated above).
+     */
+    if (strip >= td->td_nstrips && !TIFFGrowStrips(tif, 1, module))
+        return (-1);
+    if (strip != tif->tif_curstrip)
+    {
+        /*
+         * Changing strips -- flush any data present.
+         */
+        if (!TIFFFlushData(tif))
+            return (-1);
+        tif->tif_curstrip = strip;
+        /*
+         * Watch out for a growing image.  The value of strips/image
+         * will initially be 1 (since it can't be deduced until the
+         * imagelength is known).
+         */
+        if (strip >= td->td_stripsperimage && imagegrew)
+            td->td_stripsperimage =
+                TIFFhowmany_32(td->td_imagelength, td->td_rowsperstrip);
+        if (td->td_stripsperimage == 0)
+        {
+            TIFFErrorExtR(tif, module, "Zero strips per image");
+            return (-1);
+        }
+        tif->tif_row = (strip % td->td_stripsperimage) * td->td_rowsperstrip;
+        if ((tif->tif_flags & TIFF_CODERSETUP) == 0)
+        {
+            if (!(*tif->tif_setupencode)(tif))
+                return (-1);
+            tif->tif_flags |= TIFF_CODERSETUP;
+        }
+
+        tif->tif_rawcc = 0;
+        tif->tif_rawcp = tif->tif_rawdata;
+
+        /* this informs TIFFAppendToStrip() we have changed strip */
+        tif->tif_curoff = 0;
+
+        if (!(*tif->tif_preencode)(tif, sample))
+            return (-1);
+        tif->tif_flags |= TIFF_POSTENCODE;
+    }
+    /*
+     * Ensure the write is either sequential or at the
+     * beginning of a strip (or that we can randomly
+     * access the data -- i.e. no encoding).
+     */
+    if (row != tif->tif_row)
+    {
+        if (row < tif->tif_row)
+        {
+            /*
+             * Moving backwards within the same strip:
+             * backup to the start and then decode
+             * forward (below).
+             */
+            tif->tif_row =
+                (strip % td->td_stripsperimage) * td->td_rowsperstrip;
+            tif->tif_rawcp = tif->tif_rawdata;
+        }
+        /*
+         * Seek forward to the desired row.
+         */
+        if (!(*tif->tif_seek)(tif, row - tif->tif_row))
+            return (-1);
+        tif->tif_row = row;
+    }
+
+    /* swab if needed - note that source buffer will be altered */
+    tif->tif_postdecode(tif, (uint8_t *)buf, tif->tif_scanlinesize);
+
+    status = (*tif->tif_encoderow)(tif, (uint8_t *)buf, tif->tif_scanlinesize,
+                                   sample);
+
+    /* we are now poised at the beginning of the next row */
+    tif->tif_row = row + 1;
+    return (status);
 }
 
-/* Make sure that at the first attempt of rewriting a tile/strip, we will have */
+/* Make sure that at the first attempt of rewriting a tile/strip, we will have
+ */
 /* more bytes available in the output buffer than the previous byte count, */
-/* so that TIFFAppendToStrip() will detect the overflow when it is called the first */
+/* so that TIFFAppendToStrip() will detect the overflow when it is called the
+ * first */
 /* time if the new compressed tile is bigger than the older one. (GDAL #4771) */
-static int _TIFFReserveLargeEnoughWriteBuffer(TIFF* tif, uint32 strip_or_tile)
+static int _TIFFReserveLargeEnoughWriteBuffer(TIFF *tif, uint32_t strip_or_tile)
 {
     TIFFDirectory *td = &tif->tif_dir;
-    if( td->td_stripbytecount_p[strip_or_tile] > 0 )
+    if (td->td_stripbytecount_p[strip_or_tile] > 0)
     {
         /* The +1 is to ensure at least one extra bytes */
         /* The +4 is because the LZW encoder flushes 4 bytes before the limit */
-        uint64 safe_buffer_size = (uint64)(td->td_stripbytecount_p[strip_or_tile] + 1 + 4);
-        if( tif->tif_rawdatasize <= (tmsize_t)safe_buffer_size )
+        uint64_t safe_buffer_size =
+            (uint64_t)(td->td_stripbytecount_p[strip_or_tile] + 1 + 4);
+        if (tif->tif_rawdatasize <= (tmsize_t)safe_buffer_size)
         {
-            if( !(TIFFWriteBufferSetup(tif, NULL,
-                (tmsize_t)TIFFroundup_64(safe_buffer_size, 1024))) )
+            if (!(TIFFWriteBufferSetup(
+                    tif, NULL,
+                    (tmsize_t)TIFFroundup_64(safe_buffer_size, 1024))))
                 return 0;
         }
-
-        /* Force TIFFAppendToStrip() to consider placing data at end
-            of file. */
-        tif->tif_curoff = 0;
     }
     return 1;
 }
@@ -208,103 +212,112 @@ static int _TIFFReserveLargeEnoughWriteBuffer(TIFF* tif, uint32 strip_or_tile)
  *
  * NB: Image length must be setup before writing.
  */
-tmsize_t
-TIFFWriteEncodedStrip(TIFF* tif, uint32 strip, void* data, tmsize_t cc)
+tmsize_t TIFFWriteEncodedStrip(TIFF *tif, uint32_t strip, void *data,
+                               tmsize_t cc)
 {
-	static const char module[] = "TIFFWriteEncodedStrip";
-	TIFFDirectory *td = &tif->tif_dir;
-	uint16 sample;
-
-	if (!WRITECHECKSTRIPS(tif, module))
-		return ((tmsize_t) -1);
-	/*
-	 * Check strip array to make sure there's space.
-	 * We don't support dynamically growing files that
-	 * have data organized in separate bitplanes because
-	 * it's too painful.  In that case we require that
-	 * the imagelength be set properly before the first
-	 * write (so that the strips array will be fully
-	 * allocated above).
-	 */
-	if (strip >= td->td_nstrips) {
-		if (td->td_planarconfig == PLANARCONFIG_SEPARATE) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Can not grow image by strips when using separate planes");
-			return ((tmsize_t) -1);
-		}
-		if (!TIFFGrowStrips(tif, 1, module))
-			return ((tmsize_t) -1);
-		td->td_stripsperimage =
-		    TIFFhowmany_32(td->td_imagelength, td->td_rowsperstrip);  
-	}
-	/*
-	 * Handle delayed allocation of data buffer.  This
-	 * permits it to be sized according to the directory
-	 * info.
-	 */
-	if (!BUFFERCHECK(tif))
-		return ((tmsize_t) -1);
-
-        tif->tif_flags |= TIFF_BUF4WRITE;
-	tif->tif_curstrip = strip;
-
-	if( !_TIFFReserveLargeEnoughWriteBuffer(tif, strip) ) {
-            return ((tmsize_t)(-1));
+    static const char module[] = "TIFFWriteEncodedStrip";
+    TIFFDirectory *td = &tif->tif_dir;
+    uint16_t sample;
+
+    if (!WRITECHECKSTRIPS(tif, module))
+        return ((tmsize_t)-1);
+    /*
+     * Check strip array to make sure there's space.
+     * We don't support dynamically growing files that
+     * have data organized in separate bitplanes because
+     * it's too painful.  In that case we require that
+     * the imagelength be set properly before the first
+     * write (so that the strips array will be fully
+     * allocated above).
+     */
+    if (strip >= td->td_nstrips)
+    {
+        if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
+        {
+            TIFFErrorExtR(
+                tif, module,
+                "Can not grow image by strips when using separate planes");
+            return ((tmsize_t)-1);
         }
+        if (!TIFFGrowStrips(tif, 1, module))
+            return ((tmsize_t)-1);
+        td->td_stripsperimage =
+            TIFFhowmany_32(td->td_imagelength, td->td_rowsperstrip);
+    }
+    /*
+     * Handle delayed allocation of data buffer.  This
+     * permits it to be sized according to the directory
+     * info.
+     */
+    if (!BUFFERCHECK(tif))
+        return ((tmsize_t)-1);
 
-        tif->tif_rawcc = 0;
-        tif->tif_rawcp = tif->tif_rawdata;
+    tif->tif_flags |= TIFF_BUF4WRITE;
 
-        if (td->td_stripsperimage == 0) {
-                TIFFErrorExt(tif->tif_clientdata, module, "Zero strips per image");
-                return ((tmsize_t) -1);
-        }
+    tif->tif_curstrip = strip;
 
-	tif->tif_row = (strip % td->td_stripsperimage) * td->td_rowsperstrip;
-	if ((tif->tif_flags & TIFF_CODERSETUP) == 0) {
-		if (!(*tif->tif_setupencode)(tif))
-			return ((tmsize_t) -1);
-		tif->tif_flags |= TIFF_CODERSETUP;
-	}
+    /* this informs TIFFAppendToStrip() we have changed or reset strip */
+    tif->tif_curoff = 0;
 
-	tif->tif_flags &= ~TIFF_POSTENCODE;
+    if (!_TIFFReserveLargeEnoughWriteBuffer(tif, strip))
+    {
+        return ((tmsize_t)(-1));
+    }
+
+    tif->tif_rawcc = 0;
+    tif->tif_rawcp = tif->tif_rawdata;
+
+    if (td->td_stripsperimage == 0)
+    {
+        TIFFErrorExtR(tif, module, "Zero strips per image");
+        return ((tmsize_t)-1);
+    }
+
+    tif->tif_row = (strip % td->td_stripsperimage) * td->td_rowsperstrip;
+    if ((tif->tif_flags & TIFF_CODERSETUP) == 0)
+    {
+        if (!(*tif->tif_setupencode)(tif))
+            return ((tmsize_t)-1);
+        tif->tif_flags |= TIFF_CODERSETUP;
+    }
+
+    tif->tif_flags &= ~TIFF_POSTENCODE;
 
     /* shortcut to avoid an extra memcpy() */
-    if( td->td_compression == COMPRESSION_NONE )
+    if (td->td_compression == COMPRESSION_NONE)
     {
         /* swab if needed - note that source buffer will be altered */
-        tif->tif_postdecode( tif, (uint8*) data, cc );
+        tif->tif_postdecode(tif, (uint8_t *)data, cc);
 
         if (!isFillOrder(tif, td->td_fillorder) &&
             (tif->tif_flags & TIFF_NOBITREV) == 0)
-            TIFFReverseBits((uint8*) data, cc);
+            TIFFReverseBits((uint8_t *)data, cc);
 
-        if (cc > 0 &&
-            !TIFFAppendToStrip(tif, strip, (uint8*) data, cc))
-            return ((tmsize_t) -1);
+        if (cc > 0 && !TIFFAppendToStrip(tif, strip, (uint8_t *)data, cc))
+            return ((tmsize_t)-1);
         return (cc);
     }
 
-	sample = (uint16)(strip / td->td_stripsperimage);
-	if (!(*tif->tif_preencode)(tif, sample))
-		return ((tmsize_t) -1);
+    sample = (uint16_t)(strip / td->td_stripsperimage);
+    if (!(*tif->tif_preencode)(tif, sample))
+        return ((tmsize_t)-1);
 
-        /* swab if needed - note that source buffer will be altered */
-	tif->tif_postdecode( tif, (uint8*) data, cc );
-
-	if (!(*tif->tif_encodestrip)(tif, (uint8*) data, cc, sample))
-		return ((tmsize_t) -1);
-	if (!(*tif->tif_postencode)(tif))
-		return ((tmsize_t) -1);
-	if (!isFillOrder(tif, td->td_fillorder) &&
-	    (tif->tif_flags & TIFF_NOBITREV) == 0)
-		TIFFReverseBits(tif->tif_rawdata, tif->tif_rawcc);
-	if (tif->tif_rawcc > 0 &&
-	    !TIFFAppendToStrip(tif, strip, tif->tif_rawdata, tif->tif_rawcc))
-		return ((tmsize_t) -1);
-	tif->tif_rawcc = 0;
-	tif->tif_rawcp = tif->tif_rawdata;
-	return (cc);
+    /* swab if needed - note that source buffer will be altered */
+    tif->tif_postdecode(tif, (uint8_t *)data, cc);
+
+    if (!(*tif->tif_encodestrip)(tif, (uint8_t *)data, cc, sample))
+        return ((tmsize_t)-1);
+    if (!(*tif->tif_postencode)(tif))
+        return ((tmsize_t)-1);
+    if (!isFillOrder(tif, td->td_fillorder) &&
+        (tif->tif_flags & TIFF_NOBITREV) == 0)
+        TIFFReverseBits(tif->tif_rawdata, tif->tif_rawcc);
+    if (tif->tif_rawcc > 0 &&
+        !TIFFAppendToStrip(tif, strip, tif->tif_rawdata, tif->tif_rawcc))
+        return ((tmsize_t)-1);
+    tif->tif_rawcc = 0;
+    tif->tif_rawcp = tif->tif_rawdata;
+    return (cc);
 }
 
 /*
@@ -312,67 +325,78 @@ TIFFWriteEncodedStrip(TIFF* tif, uint32 strip, void* data, tmsize_t cc)
  *
  * NB: Image length must be setup before writing.
  */
-tmsize_t
-TIFFWriteRawStrip(TIFF* tif, uint32 strip, void* data, tmsize_t cc)
+tmsize_t TIFFWriteRawStrip(TIFF *tif, uint32_t strip, void *data, tmsize_t cc)
 {
-	static const char module[] = "TIFFWriteRawStrip";
-	TIFFDirectory *td = &tif->tif_dir;
-
-	if (!WRITECHECKSTRIPS(tif, module))
-		return ((tmsize_t) -1);
-	/*
-	 * Check strip array to make sure there's space.
-	 * We don't support dynamically growing files that
-	 * have data organized in separate bitplanes because
-	 * it's too painful.  In that case we require that
-	 * the imagelength be set properly before the first
-	 * write (so that the strips array will be fully
-	 * allocated above).
-	 */
-	if (strip >= td->td_nstrips) {
-		if (td->td_planarconfig == PLANARCONFIG_SEPARATE) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Can not grow image by strips when using separate planes");
-			return ((tmsize_t) -1);
-		}
-		/*
-		 * Watch out for a growing image.  The value of
-		 * strips/image will initially be 1 (since it
-		 * can't be deduced until the imagelength is known).
-		 */
-		if (strip >= td->td_stripsperimage)
-			td->td_stripsperimage =
-			    TIFFhowmany_32(td->td_imagelength,td->td_rowsperstrip);
-		if (!TIFFGrowStrips(tif, 1, module))
-			return ((tmsize_t) -1);
-	}
-	tif->tif_curstrip = strip;
-        if (td->td_stripsperimage == 0) {
-                TIFFErrorExt(tif->tif_clientdata, module,"Zero strips per image");
-                return ((tmsize_t) -1);
+    static const char module[] = "TIFFWriteRawStrip";
+    TIFFDirectory *td = &tif->tif_dir;
+
+    if (!WRITECHECKSTRIPS(tif, module))
+        return ((tmsize_t)-1);
+    /*
+     * Check strip array to make sure there's space.
+     * We don't support dynamically growing files that
+     * have data organized in separate bitplanes because
+     * it's too painful.  In that case we require that
+     * the imagelength be set properly before the first
+     * write (so that the strips array will be fully
+     * allocated above).
+     */
+    if (strip >= td->td_nstrips)
+    {
+        if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
+        {
+            TIFFErrorExtR(
+                tif, module,
+                "Can not grow image by strips when using separate planes");
+            return ((tmsize_t)-1);
         }
-	tif->tif_row = (strip % td->td_stripsperimage) * td->td_rowsperstrip;
-	return (TIFFAppendToStrip(tif, strip, (uint8*) data, cc) ?
-	    cc : (tmsize_t) -1);
+        /*
+         * Watch out for a growing image.  The value of
+         * strips/image will initially be 1 (since it
+         * can't be deduced until the imagelength is known).
+         */
+        if (strip >= td->td_stripsperimage)
+            td->td_stripsperimage =
+                TIFFhowmany_32(td->td_imagelength, td->td_rowsperstrip);
+        if (!TIFFGrowStrips(tif, 1, module))
+            return ((tmsize_t)-1);
+    }
+
+    if (tif->tif_curstrip != strip)
+    {
+        tif->tif_curstrip = strip;
+
+        /* this informs TIFFAppendToStrip() we have changed or reset strip */
+        tif->tif_curoff = 0;
+    }
+
+    if (td->td_stripsperimage == 0)
+    {
+        TIFFErrorExtR(tif, module, "Zero strips per image");
+        return ((tmsize_t)-1);
+    }
+    tif->tif_row = (strip % td->td_stripsperimage) * td->td_rowsperstrip;
+    return (TIFFAppendToStrip(tif, strip, (uint8_t *)data, cc) ? cc
+                                                               : (tmsize_t)-1);
 }
 
 /*
  * Write and compress a tile of data.  The
  * tile is selected by the (x,y,z,s) coordinates.
  */
-tmsize_t
-TIFFWriteTile(TIFF* tif, void* buf, uint32 x, uint32 y, uint32 z, uint16 s)
+tmsize_t TIFFWriteTile(TIFF *tif, void *buf, uint32_t x, uint32_t y, uint32_t z,
+                       uint16_t s)
 {
-	if (!TIFFCheckTile(tif, x, y, z, s))
-		return ((tmsize_t)(-1));
-	/*
-	 * NB: A tile size of -1 is used instead of tif_tilesize knowing
-	 *     that TIFFWriteEncodedTile will clamp this to the tile size.
-	 *     This is done because the tile size may not be defined until
-	 *     after the output buffer is setup in TIFFWriteBufferSetup.
-	 */
-	return (TIFFWriteEncodedTile(tif,
-	    TIFFComputeTile(tif, x, y, z, s), buf, (tmsize_t)(-1)));
+    if (!TIFFCheckTile(tif, x, y, z, s))
+        return ((tmsize_t)(-1));
+    /*
+     * NB: A tile size of -1 is used instead of tif_tilesize knowing
+     *     that TIFFWriteEncodedTile will clamp this to the tile size.
+     *     This is done because the tile size may not be defined until
+     *     after the output buffer is setup in TIFFWriteBufferSetup.
+     */
+    return (TIFFWriteEncodedTile(tif, TIFFComputeTile(tif, x, y, z, s), buf,
+                                 (tmsize_t)(-1)));
 }
 
 /*
@@ -387,104 +411,111 @@ TIFFWriteTile(TIFF* tif, void* buf, uint32 x, uint32 y, uint32 z, uint16 s)
  *     interface does not support automatically growing
  *     the image on each write (as TIFFWriteScanline does).
  */
-tmsize_t
-TIFFWriteEncodedTile(TIFF* tif, uint32 tile, void* data, tmsize_t cc)
+tmsize_t TIFFWriteEncodedTile(TIFF *tif, uint32_t tile, void *data, tmsize_t cc)
 {
-	static const char module[] = "TIFFWriteEncodedTile";
-	TIFFDirectory *td;
-	uint16 sample;
-        uint32 howmany32;
-
-	if (!WRITECHECKTILES(tif, module))
-		return ((tmsize_t)(-1));
-	td = &tif->tif_dir;
-	if (tile >= td->td_nstrips) {
-		TIFFErrorExt(tif->tif_clientdata, module, "Tile %lu out of range, max %lu",
-		    (unsigned long) tile, (unsigned long) td->td_nstrips);
-		return ((tmsize_t)(-1));
-	}
-	/*
-	 * Handle delayed allocation of data buffer.  This
-	 * permits it to be sized more intelligently (using
-	 * directory information).
-	 */
-	if (!BUFFERCHECK(tif))
-		return ((tmsize_t)(-1));
-
-        tif->tif_flags |= TIFF_BUF4WRITE;
-	tif->tif_curtile = tile;
-
-        if( !_TIFFReserveLargeEnoughWriteBuffer(tif, tile) ) {
+    static const char module[] = "TIFFWriteEncodedTile";
+    TIFFDirectory *td;
+    uint16_t sample;
+    uint32_t howmany32;
+
+    if (!WRITECHECKTILES(tif, module))
+        return ((tmsize_t)(-1));
+    td = &tif->tif_dir;
+    if (tile >= td->td_nstrips)
+    {
+        TIFFErrorExtR(tif, module, "Tile %lu out of range, max %lu",
+                      (unsigned long)tile, (unsigned long)td->td_nstrips);
+        return ((tmsize_t)(-1));
+    }
+    /*
+     * Handle delayed allocation of data buffer.  This
+     * permits it to be sized more intelligently (using
+     * directory information).
+     */
+    if (!BUFFERCHECK(tif))
+        return ((tmsize_t)(-1));
+
+    tif->tif_flags |= TIFF_BUF4WRITE;
+
+    tif->tif_curtile = tile;
+
+    /* this informs TIFFAppendToStrip() we have changed or reset tile */
+    tif->tif_curoff = 0;
+
+    if (!_TIFFReserveLargeEnoughWriteBuffer(tif, tile))
+    {
+        return ((tmsize_t)(-1));
+    }
+
+    tif->tif_rawcc = 0;
+    tif->tif_rawcp = tif->tif_rawdata;
+
+    /*
+     * Compute tiles per row & per column to compute
+     * current row and column
+     */
+    howmany32 = TIFFhowmany_32(td->td_imagelength, td->td_tilelength);
+    if (howmany32 == 0)
+    {
+        TIFFErrorExtR(tif, module, "Zero tiles");
+        return ((tmsize_t)(-1));
+    }
+    tif->tif_row = (tile % howmany32) * td->td_tilelength;
+    howmany32 = TIFFhowmany_32(td->td_imagewidth, td->td_tilewidth);
+    if (howmany32 == 0)
+    {
+        TIFFErrorExtR(tif, module, "Zero tiles");
+        return ((tmsize_t)(-1));
+    }
+    tif->tif_col = (tile % howmany32) * td->td_tilewidth;
+
+    if ((tif->tif_flags & TIFF_CODERSETUP) == 0)
+    {
+        if (!(*tif->tif_setupencode)(tif))
             return ((tmsize_t)(-1));
-        }
+        tif->tif_flags |= TIFF_CODERSETUP;
+    }
+    tif->tif_flags &= ~TIFF_POSTENCODE;
 
-	tif->tif_rawcc = 0;
-	tif->tif_rawcp = tif->tif_rawdata;
-
-	/* 
-	 * Compute tiles per row & per column to compute
-	 * current row and column
-	 */
-        howmany32=TIFFhowmany_32(td->td_imagelength, td->td_tilelength);
-        if (howmany32 == 0) {
-                 TIFFErrorExt(tif->tif_clientdata,module,"Zero tiles");
-                return ((tmsize_t)(-1));
-        }
-	tif->tif_row = (tile % howmany32) * td->td_tilelength;
-        howmany32=TIFFhowmany_32(td->td_imagewidth, td->td_tilewidth);
-        if (howmany32 == 0) {
-                 TIFFErrorExt(tif->tif_clientdata,module,"Zero tiles");
-                return ((tmsize_t)(-1));
-        }
-	tif->tif_col = (tile % howmany32) * td->td_tilewidth;
-
-	if ((tif->tif_flags & TIFF_CODERSETUP) == 0) {
-		if (!(*tif->tif_setupencode)(tif))
-			return ((tmsize_t)(-1));
-		tif->tif_flags |= TIFF_CODERSETUP;
-	}
-	tif->tif_flags &= ~TIFF_POSTENCODE;
-
-	/*
-	 * Clamp write amount to the tile size.  This is mostly
-	 * done so that callers can pass in some large number
-	 * (e.g. -1) and have the tile size used instead.
-	 */
-	if ( cc < 1 || cc > tif->tif_tilesize)
-		cc = tif->tif_tilesize;
+    /*
+     * Clamp write amount to the tile size.  This is mostly
+     * done so that callers can pass in some large number
+     * (e.g. -1) and have the tile size used instead.
+     */
+    if (cc < 1 || cc > tif->tif_tilesize)
+        cc = tif->tif_tilesize;
 
     /* shortcut to avoid an extra memcpy() */
-    if( td->td_compression == COMPRESSION_NONE )
+    if (td->td_compression == COMPRESSION_NONE)
     {
         /* swab if needed - note that source buffer will be altered */
-        tif->tif_postdecode( tif, (uint8*) data, cc );
+        tif->tif_postdecode(tif, (uint8_t *)data, cc);
 
         if (!isFillOrder(tif, td->td_fillorder) &&
             (tif->tif_flags & TIFF_NOBITREV) == 0)
-            TIFFReverseBits((uint8*) data, cc);
+            TIFFReverseBits((uint8_t *)data, cc);
 
-        if (cc > 0 &&
-            !TIFFAppendToStrip(tif, tile, (uint8*) data, cc))
-            return ((tmsize_t) -1);
+        if (cc > 0 && !TIFFAppendToStrip(tif, tile, (uint8_t *)data, cc))
+            return ((tmsize_t)-1);
         return (cc);
     }
 
-    sample = (uint16)(tile/td->td_stripsperimage);
+    sample = (uint16_t)(tile / td->td_stripsperimage);
     if (!(*tif->tif_preencode)(tif, sample))
         return ((tmsize_t)(-1));
     /* swab if needed - note that source buffer will be altered */
-    tif->tif_postdecode( tif, (uint8*) data, cc );
+    tif->tif_postdecode(tif, (uint8_t *)data, cc);
 
-    if (!(*tif->tif_encodetile)(tif, (uint8*) data, cc, sample))
-            return ((tmsize_t) -1);
+    if (!(*tif->tif_encodetile)(tif, (uint8_t *)data, cc, sample))
+        return ((tmsize_t)-1);
     if (!(*tif->tif_postencode)(tif))
-            return ((tmsize_t)(-1));
+        return ((tmsize_t)(-1));
     if (!isFillOrder(tif, td->td_fillorder) &&
         (tif->tif_flags & TIFF_NOBITREV) == 0)
-            TIFFReverseBits((uint8*)tif->tif_rawdata, tif->tif_rawcc);
-    if (tif->tif_rawcc > 0 && !TIFFAppendToStrip(tif, tile,
-        tif->tif_rawdata, tif->tif_rawcc))
-            return ((tmsize_t)(-1));
+        TIFFReverseBits((uint8_t *)tif->tif_rawdata, tif->tif_rawcc);
+    if (tif->tif_rawcc > 0 &&
+        !TIFFAppendToStrip(tif, tile, tif->tif_rawdata, tif->tif_rawcc))
+        return ((tmsize_t)(-1));
     tif->tif_rawcc = 0;
     tif->tif_rawcp = tif->tif_rawdata;
     return (cc);
@@ -499,66 +530,64 @@ TIFFWriteEncodedTile(TIFF* tif, uint32 tile, void* data, tmsize_t cc)
  *     interface does not support automatically growing
  *     the image on each write (as TIFFWriteScanline does).
  */
-tmsize_t
-TIFFWriteRawTile(TIFF* tif, uint32 tile, void* data, tmsize_t cc)
+tmsize_t TIFFWriteRawTile(TIFF *tif, uint32_t tile, void *data, tmsize_t cc)
 {
-	static const char module[] = "TIFFWriteRawTile";
-
-	if (!WRITECHECKTILES(tif, module))
-		return ((tmsize_t)(-1));
-	if (tile >= tif->tif_dir.td_nstrips) {
-		TIFFErrorExt(tif->tif_clientdata, module, "Tile %lu out of range, max %lu",
-		    (unsigned long) tile,
-		    (unsigned long) tif->tif_dir.td_nstrips);
-		return ((tmsize_t)(-1));
-	}
-	return (TIFFAppendToStrip(tif, tile, (uint8*) data, cc) ?
-	    cc : (tmsize_t)(-1));
+    static const char module[] = "TIFFWriteRawTile";
+
+    if (!WRITECHECKTILES(tif, module))
+        return ((tmsize_t)(-1));
+    if (tile >= tif->tif_dir.td_nstrips)
+    {
+        TIFFErrorExtR(tif, module, "Tile %lu out of range, max %lu",
+                      (unsigned long)tile,
+                      (unsigned long)tif->tif_dir.td_nstrips);
+        return ((tmsize_t)(-1));
+    }
+    return (TIFFAppendToStrip(tif, tile, (uint8_t *)data, cc) ? cc
+                                                              : (tmsize_t)(-1));
 }
 
-#define	isUnspecified(tif, f) \
-    (TIFFFieldSet(tif,f) && (tif)->tif_dir.td_imagelength == 0)
+#define isUnspecified(tif, f)                                                  \
+    (TIFFFieldSet(tif, f) && (tif)->tif_dir.td_imagelength == 0)
 
-int
-TIFFSetupStrips(TIFF* tif)
+int TIFFSetupStrips(TIFF *tif)
 {
-	TIFFDirectory* td = &tif->tif_dir;
-
-	if (isTiled(tif))
-		td->td_stripsperimage =
-		    isUnspecified(tif, FIELD_TILEDIMENSIONS) ?
-			td->td_samplesperpixel : TIFFNumberOfTiles(tif);
-	else
-		td->td_stripsperimage =
-		    isUnspecified(tif, FIELD_ROWSPERSTRIP) ?
-			td->td_samplesperpixel : TIFFNumberOfStrips(tif);
-	td->td_nstrips = td->td_stripsperimage;
-        /* TIFFWriteDirectoryTagData has a limitation to 0x80000000U bytes */
-        if( td->td_nstrips >= 0x80000000U / ((tif->tif_flags&TIFF_BIGTIFF)?0x8U:0x4U) )
-        {
-            TIFFErrorExt(tif->tif_clientdata, "TIFFSetupStrips",
-                         "Too large Strip/Tile Offsets/ByteCounts arrays");
-            return 0;
-        }
-	if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
-		td->td_stripsperimage /= td->td_samplesperpixel;
-	td->td_stripoffset_p = (uint64 *)
-            _TIFFCheckMalloc(tif, td->td_nstrips, sizeof (uint64),
-                             "for \"StripOffsets\" array");
-	td->td_stripbytecount_p = (uint64 *)
-            _TIFFCheckMalloc(tif, td->td_nstrips, sizeof (uint64),
-                             "for \"StripByteCounts\" array");
-	if (td->td_stripoffset_p == NULL || td->td_stripbytecount_p == NULL)
-		return (0);
-	/*
-	 * Place data at the end-of-file
-	 * (by setting offsets to zero).
-	 */
-	_TIFFmemset(td->td_stripoffset_p, 0, td->td_nstrips*sizeof (uint64));
-	_TIFFmemset(td->td_stripbytecount_p, 0, td->td_nstrips*sizeof (uint64));
-	TIFFSetFieldBit(tif, FIELD_STRIPOFFSETS);
-	TIFFSetFieldBit(tif, FIELD_STRIPBYTECOUNTS);
-	return (1);
+    TIFFDirectory *td = &tif->tif_dir;
+
+    if (isTiled(tif))
+        td->td_stripsperimage = isUnspecified(tif, FIELD_TILEDIMENSIONS)
+                                    ? td->td_samplesperpixel
+                                    : TIFFNumberOfTiles(tif);
+    else
+        td->td_stripsperimage = isUnspecified(tif, FIELD_ROWSPERSTRIP)
+                                    ? td->td_samplesperpixel
+                                    : TIFFNumberOfStrips(tif);
+    td->td_nstrips = td->td_stripsperimage;
+    /* TIFFWriteDirectoryTagData has a limitation to 0x80000000U bytes */
+    if (td->td_nstrips >=
+        0x80000000U / ((tif->tif_flags & TIFF_BIGTIFF) ? 0x8U : 0x4U))
+    {
+        TIFFErrorExtR(tif, "TIFFSetupStrips",
+                      "Too large Strip/Tile Offsets/ByteCounts arrays");
+        return 0;
+    }
+    if (td->td_planarconfig == PLANARCONFIG_SEPARATE)
+        td->td_stripsperimage /= td->td_samplesperpixel;
+    td->td_stripoffset_p = (uint64_t *)_TIFFCheckMalloc(
+        tif, td->td_nstrips, sizeof(uint64_t), "for \"StripOffsets\" array");
+    td->td_stripbytecount_p = (uint64_t *)_TIFFCheckMalloc(
+        tif, td->td_nstrips, sizeof(uint64_t), "for \"StripByteCounts\" array");
+    if (td->td_stripoffset_p == NULL || td->td_stripbytecount_p == NULL)
+        return (0);
+    /*
+     * Place data at the end-of-file
+     * (by setting offsets to zero).
+     */
+    _TIFFmemset(td->td_stripoffset_p, 0, td->td_nstrips * sizeof(uint64_t));
+    _TIFFmemset(td->td_stripbytecount_p, 0, td->td_nstrips * sizeof(uint64_t));
+    TIFFSetFieldBit(tif, FIELD_STRIPOFFSETS);
+    TIFFSetFieldBit(tif, FIELD_STRIPBYTECOUNTS);
+    return (1);
 }
 #undef isUnspecified
 
@@ -568,241 +597,325 @@ TIFFSetupStrips(TIFF* tif)
  * we also "freeze" the state of the directory so
  * that important information is not changed.
  */
-int
-TIFFWriteCheck(TIFF* tif, int tiles, const char* module)
+int TIFFWriteCheck(TIFF *tif, int tiles, const char *module)
 {
-	if (tif->tif_mode == O_RDONLY) {
-		TIFFErrorExt(tif->tif_clientdata, module, "File not open for writing");
-		return (0);
-	}
-	if (tiles ^ isTiled(tif)) {
-		TIFFErrorExt(tif->tif_clientdata, module, tiles ?
-		    "Can not write tiles to a striped image" :
-		    "Can not write scanlines to a tiled image");
-		return (0);
-	}
-
-        _TIFFFillStriles( tif );
-        
-	/*
-	 * On the first write verify all the required information
-	 * has been setup and initialize any data structures that
-	 * had to wait until directory information was set.
-	 * Note that a lot of our work is assumed to remain valid
-	 * because we disallow any of the important parameters
-	 * from changing after we start writing (i.e. once
-	 * TIFF_BEENWRITING is set, TIFFSetField will only allow
-	 * the image's length to be changed).
-	 */
-	if (!TIFFFieldSet(tif, FIELD_IMAGEDIMENSIONS)) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Must set \"ImageWidth\" before writing data");
-		return (0);
-	}
-	if (tif->tif_dir.td_samplesperpixel == 1) {
-		/* 
-		 * Planarconfiguration is irrelevant in case of single band
-		 * images and need not be included. We will set it anyway,
-		 * because this field is used in other parts of library even
-		 * in the single band case.
-		 */
-		if (!TIFFFieldSet(tif, FIELD_PLANARCONFIG))
-                    tif->tif_dir.td_planarconfig = PLANARCONFIG_CONTIG;
-	} else {
-		if (!TIFFFieldSet(tif, FIELD_PLANARCONFIG)) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Must set \"PlanarConfiguration\" before writing data");
-			return (0);
-		}
-	}
-	if (tif->tif_dir.td_stripoffset_p == NULL && !TIFFSetupStrips(tif)) {
-		tif->tif_dir.td_nstrips = 0;
-		TIFFErrorExt(tif->tif_clientdata, module, "No space for %s arrays",
-		    isTiled(tif) ? "tile" : "strip");
-		return (0);
-	}
-	if (isTiled(tif))
-	{
-		tif->tif_tilesize = TIFFTileSize(tif);
-		if (tif->tif_tilesize == 0)
-			return (0);
-	}
-	else
-		tif->tif_tilesize = (tmsize_t)(-1);
-	tif->tif_scanlinesize = TIFFScanlineSize(tif);
-	if (tif->tif_scanlinesize == 0)
-		return (0);
-	tif->tif_flags |= TIFF_BEENWRITING;
-
-        if( tif->tif_dir.td_stripoffset_entry.tdir_tag != 0 &&
-            tif->tif_dir.td_stripoffset_entry.tdir_count == 0 &&
-            tif->tif_dir.td_stripoffset_entry.tdir_type == 0 &&
-            tif->tif_dir.td_stripoffset_entry.tdir_offset.toff_long8 == 0 &&
-            tif->tif_dir.td_stripbytecount_entry.tdir_tag != 0 &&
-            tif->tif_dir.td_stripbytecount_entry.tdir_count == 0 &&
-            tif->tif_dir.td_stripbytecount_entry.tdir_type == 0 &&
-            tif->tif_dir.td_stripbytecount_entry.tdir_offset.toff_long8 == 0 &&
-            !(tif->tif_flags & TIFF_DIRTYDIRECT)  )
-        {
-            TIFFForceStrileArrayWriting(tif);
-        }
+    if (tif->tif_mode == O_RDONLY)
+    {
+        TIFFErrorExtR(tif, module, "File not open for writing");
+        return (0);
+    }
+    if (tiles ^ isTiled(tif))
+    {
+        TIFFErrorExtR(tif, module,
+                      tiles ? "Can not write tiles to a striped image"
+                            : "Can not write scanlines to a tiled image");
+        return (0);
+    }
+
+    _TIFFFillStriles(tif);
+
+    /*
+     * On the first write verify all the required information
+     * has been setup and initialize any data structures that
+     * had to wait until directory information was set.
+     * Note that a lot of our work is assumed to remain valid
+     * because we disallow any of the important parameters
+     * from changing after we start writing (i.e. once
+     * TIFF_BEENWRITING is set, TIFFSetField will only allow
+     * the image's length to be changed).
+     */
+    if (!TIFFFieldSet(tif, FIELD_IMAGEDIMENSIONS))
+    {
+        TIFFErrorExtR(tif, module,
+                      "Must set \"ImageWidth\" before writing data");
+        return (0);
+    }
+    if (tif->tif_dir.td_stripoffset_p == NULL && !TIFFSetupStrips(tif))
+    {
+        tif->tif_dir.td_nstrips = 0;
+        TIFFErrorExtR(tif, module, "No space for %s arrays",
+                      isTiled(tif) ? "tile" : "strip");
+        return (0);
+    }
+    if (isTiled(tif))
+    {
+        tif->tif_tilesize = TIFFTileSize(tif);
+        if (tif->tif_tilesize == 0)
+            return (0);
+    }
+    else
+        tif->tif_tilesize = (tmsize_t)(-1);
+    tif->tif_scanlinesize = TIFFScanlineSize(tif);
+    if (tif->tif_scanlinesize == 0)
+        return (0);
+    tif->tif_flags |= TIFF_BEENWRITING;
+
+    if (tif->tif_dir.td_stripoffset_entry.tdir_tag != 0 &&
+        tif->tif_dir.td_stripoffset_entry.tdir_count == 0 &&
+        tif->tif_dir.td_stripoffset_entry.tdir_type == 0 &&
+        tif->tif_dir.td_stripoffset_entry.tdir_offset.toff_long8 == 0 &&
+        tif->tif_dir.td_stripbytecount_entry.tdir_tag != 0 &&
+        tif->tif_dir.td_stripbytecount_entry.tdir_count == 0 &&
+        tif->tif_dir.td_stripbytecount_entry.tdir_type == 0 &&
+        tif->tif_dir.td_stripbytecount_entry.tdir_offset.toff_long8 == 0 &&
+        !(tif->tif_flags & TIFF_DIRTYDIRECT))
+    {
+        TIFFForceStrileArrayWriting(tif);
+    }
 
-	return (1);
+    return (1);
 }
 
 /*
  * Setup the raw data buffer used for encoding.
  */
-int
-TIFFWriteBufferSetup(TIFF* tif, void* bp, tmsize_t size)
+int TIFFWriteBufferSetup(TIFF *tif, void *bp, tmsize_t size)
 {
-	static const char module[] = "TIFFWriteBufferSetup";
-
-	if (tif->tif_rawdata) {
-		if (tif->tif_flags & TIFF_MYBUFFER) {
-			_TIFFfree(tif->tif_rawdata);
-			tif->tif_flags &= ~TIFF_MYBUFFER;
-		}
-		tif->tif_rawdata = NULL;
-	}
-	if (size == (tmsize_t)(-1)) {
-		size = (isTiled(tif) ?
-		    tif->tif_tilesize : TIFFStripSize(tif));
-
-                /* Adds 10% margin for cases where compression would expand a bit */
-                if( size < TIFF_TMSIZE_T_MAX - size / 10 )
-                    size += size / 10;
-		/*
-		 * Make raw data buffer at least 8K
-		 */
-		if (size < 8*1024)
-			size = 8*1024;
-		bp = NULL;			/* NB: force malloc */
-	}
-	if (bp == NULL) {
-		bp = _TIFFmalloc(size);
-		if (bp == NULL) {
-			TIFFErrorExt(tif->tif_clientdata, module, "No space for output buffer");
-			return (0);
-		}
-		tif->tif_flags |= TIFF_MYBUFFER;
-	} else
-		tif->tif_flags &= ~TIFF_MYBUFFER;
-	tif->tif_rawdata = (uint8*) bp;
-	tif->tif_rawdatasize = size;
-	tif->tif_rawcc = 0;
-	tif->tif_rawcp = tif->tif_rawdata;
-	tif->tif_flags |= TIFF_BUFFERSETUP;
-	return (1);
+    static const char module[] = "TIFFWriteBufferSetup";
+
+    if (tif->tif_rawdata)
+    {
+        if (tif->tif_flags & TIFF_MYBUFFER)
+        {
+            _TIFFfreeExt(tif, tif->tif_rawdata);
+            tif->tif_flags &= ~TIFF_MYBUFFER;
+        }
+        tif->tif_rawdata = NULL;
+    }
+    if (size == (tmsize_t)(-1))
+    {
+        size = (isTiled(tif) ? tif->tif_tilesize : TIFFStripSize(tif));
+
+        /* Adds 10% margin for cases where compression would expand a bit */
+        if (size < TIFF_TMSIZE_T_MAX - size / 10)
+            size += size / 10;
+        /*
+         * Make raw data buffer at least 8K
+         */
+        if (size < 8 * 1024)
+            size = 8 * 1024;
+        bp = NULL; /* NB: force malloc */
+    }
+    if (bp == NULL)
+    {
+        bp = _TIFFmallocExt(tif, size);
+        if (bp == NULL)
+        {
+            TIFFErrorExtR(tif, module, "No space for output buffer");
+            return (0);
+        }
+        tif->tif_flags |= TIFF_MYBUFFER;
+    }
+    else
+        tif->tif_flags &= ~TIFF_MYBUFFER;
+    tif->tif_rawdata = (uint8_t *)bp;
+    tif->tif_rawdatasize = size;
+    tif->tif_rawcc = 0;
+    tif->tif_rawcp = tif->tif_rawdata;
+    tif->tif_flags |= TIFF_BUFFERSETUP;
+    return (1);
 }
 
 /*
  * Grow the strip data structures by delta strips.
  */
-static int
-TIFFGrowStrips(TIFF* tif, uint32 delta, const char* module)
+static int TIFFGrowStrips(TIFF *tif, uint32_t delta, const char *module)
 {
-	TIFFDirectory *td = &tif->tif_dir;
-	uint64* new_stripoffset;
-	uint64* new_stripbytecount;
-
-	assert(td->td_planarconfig == PLANARCONFIG_CONTIG);
-	new_stripoffset = (uint64*)_TIFFrealloc(td->td_stripoffset_p,
-		(td->td_nstrips + delta) * sizeof (uint64));
-	new_stripbytecount = (uint64*)_TIFFrealloc(td->td_stripbytecount_p,
-		(td->td_nstrips + delta) * sizeof (uint64));
-	if (new_stripoffset == NULL || new_stripbytecount == NULL) {
-		if (new_stripoffset)
-			_TIFFfree(new_stripoffset);
-		if (new_stripbytecount)
-			_TIFFfree(new_stripbytecount);
-		td->td_nstrips = 0;
-		TIFFErrorExt(tif->tif_clientdata, module, "No space to expand strip arrays");
-		return (0);
-	}
-	td->td_stripoffset_p = new_stripoffset;
-	td->td_stripbytecount_p = new_stripbytecount;
-	_TIFFmemset(td->td_stripoffset_p + td->td_nstrips,
-		    0, delta*sizeof (uint64));
-	_TIFFmemset(td->td_stripbytecount_p + td->td_nstrips,
-		    0, delta*sizeof (uint64));
-	td->td_nstrips += delta;
-        tif->tif_flags |= TIFF_DIRTYDIRECT;
-
-	return (1);
+    TIFFDirectory *td = &tif->tif_dir;
+    uint64_t *new_stripoffset;
+    uint64_t *new_stripbytecount;
+
+    assert(td->td_planarconfig == PLANARCONFIG_CONTIG);
+    new_stripoffset = (uint64_t *)_TIFFreallocExt(
+        tif, td->td_stripoffset_p, (td->td_nstrips + delta) * sizeof(uint64_t));
+    new_stripbytecount = (uint64_t *)_TIFFreallocExt(
+        tif, td->td_stripbytecount_p,
+        (td->td_nstrips + delta) * sizeof(uint64_t));
+    if (new_stripoffset == NULL || new_stripbytecount == NULL)
+    {
+        if (new_stripoffset)
+            _TIFFfreeExt(tif, new_stripoffset);
+        if (new_stripbytecount)
+            _TIFFfreeExt(tif, new_stripbytecount);
+        td->td_nstrips = 0;
+        TIFFErrorExtR(tif, module, "No space to expand strip arrays");
+        return (0);
+    }
+    td->td_stripoffset_p = new_stripoffset;
+    td->td_stripbytecount_p = new_stripbytecount;
+    _TIFFmemset(td->td_stripoffset_p + td->td_nstrips, 0,
+                delta * sizeof(uint64_t));
+    _TIFFmemset(td->td_stripbytecount_p + td->td_nstrips, 0,
+                delta * sizeof(uint64_t));
+    td->td_nstrips += delta;
+    tif->tif_flags |= TIFF_DIRTYDIRECT;
+
+    return (1);
 }
 
 /*
  * Append the data to the specified strip.
  */
-static int
-TIFFAppendToStrip(TIFF* tif, uint32 strip, uint8* data, tmsize_t cc)
+static int TIFFAppendToStrip(TIFF *tif, uint32_t strip, uint8_t *data,
+                             tmsize_t cc)
 {
-	static const char module[] = "TIFFAppendToStrip";
-	TIFFDirectory *td = &tif->tif_dir;
-	uint64 m;
-        int64 old_byte_count = -1;
+    static const char module[] = "TIFFAppendToStrip";
+    TIFFDirectory *td = &tif->tif_dir;
+    uint64_t m;
+    int64_t old_byte_count = -1;
 
-	if (td->td_stripoffset_p[strip] == 0 || tif->tif_curoff == 0) {
-            assert(td->td_nstrips > 0);
+    if (tif->tif_curoff == 0)
+        tif->tif_lastvalidoff = 0;
 
-            if( td->td_stripbytecount_p[strip] != 0 
-                && td->td_stripoffset_p[strip] != 0 
-                && td->td_stripbytecount_p[strip] >= (uint64) cc )
-            {
-                /* 
-                 * There is already tile data on disk, and the new tile
-                 * data we have will fit in the same space.  The only 
-                 * aspect of this that is risky is that there could be
-                 * more data to append to this strip before we are done
-                 * depending on how we are getting called.
-                 */
-                if (!SeekOK(tif, td->td_stripoffset_p[strip])) {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                                 "Seek error at scanline %lu",
-                                 (unsigned long)tif->tif_row);
-                    return (0);
-                }
-            }
-            else
+    if (td->td_stripoffset_p[strip] == 0 || tif->tif_curoff == 0)
+    {
+        assert(td->td_nstrips > 0);
+
+        if (td->td_stripbytecount_p[strip] != 0 &&
+            td->td_stripoffset_p[strip] != 0 &&
+            td->td_stripbytecount_p[strip] >= (uint64_t)cc)
+        {
+            /*
+             * There is already tile data on disk, and the new tile
+             * data we have will fit in the same space.  The only
+             * aspect of this that is risky is that there could be
+             * more data to append to this strip before we are done
+             * depending on how we are getting called.
+             */
+            if (!SeekOK(tif, td->td_stripoffset_p[strip]))
             {
-                /* 
-                 * Seek to end of file, and set that as our location to 
-                 * write this strip.
-                 */
-                td->td_stripoffset_p[strip] = TIFFSeekFile(tif, 0, SEEK_END);
-                tif->tif_flags |= TIFF_DIRTYSTRIP;
+                TIFFErrorExtR(tif, module, "Seek error at scanline %lu",
+                              (unsigned long)tif->tif_row);
+                return (0);
             }
 
-            tif->tif_curoff = td->td_stripoffset_p[strip];
-
+            tif->tif_lastvalidoff =
+                td->td_stripoffset_p[strip] + td->td_stripbytecount_p[strip];
+        }
+        else
+        {
             /*
-             * We are starting a fresh strip/tile, so set the size to zero.
+             * Seek to end of file, and set that as our location to
+             * write this strip.
              */
-            old_byte_count = td->td_stripbytecount_p[strip];
-            td->td_stripbytecount_p[strip] = 0;
-	}
-
-	m = tif->tif_curoff+cc;
-	if (!(tif->tif_flags&TIFF_BIGTIFF))
-		m = (uint32)m;
-	if ((m<tif->tif_curoff)||(m<(uint64)cc))
-	{
-		TIFFErrorExt(tif->tif_clientdata, module, "Maximum TIFF file size exceeded");
-		return (0);
-	}
-	if (!WriteOK(tif, data, cc)) {
-		TIFFErrorExt(tif->tif_clientdata, module, "Write error at scanline %lu",
-		    (unsigned long) tif->tif_row);
-		    return (0);
-	}
-	tif->tif_curoff = m;
-	td->td_stripbytecount_p[strip] += cc;
-
-        if( (int64) td->td_stripbytecount_p[strip] != old_byte_count )
+            td->td_stripoffset_p[strip] = TIFFSeekFile(tif, 0, SEEK_END);
             tif->tif_flags |= TIFF_DIRTYSTRIP;
-            
-	return (1);
+        }
+
+        tif->tif_curoff = td->td_stripoffset_p[strip];
+
+        /*
+         * We are starting a fresh strip/tile, so set the size to zero.
+         */
+        old_byte_count = td->td_stripbytecount_p[strip];
+        td->td_stripbytecount_p[strip] = 0;
+    }
+
+    m = tif->tif_curoff + cc;
+    if (!(tif->tif_flags & TIFF_BIGTIFF))
+        m = (uint32_t)m;
+    if ((m < tif->tif_curoff) || (m < (uint64_t)cc))
+    {
+        TIFFErrorExtR(tif, module, "Maximum TIFF file size exceeded");
+        return (0);
+    }
+
+    if (tif->tif_lastvalidoff != 0 && m > tif->tif_lastvalidoff &&
+        td->td_stripbytecount_p[strip] > 0)
+    {
+        /* Ouch: we have detected that we are rewriting in place a strip/tile */
+        /* with several calls to TIFFAppendToStrip(). The first call was with */
+        /* a size smaller than the previous size of the strip/tile, so we */
+        /* opted to rewrite in place, but a following call causes us to go */
+        /* outsize of the strip/tile area, so we have to finally go for a */
+        /* append-at-end-of-file strategy, and start by moving what we already
+         */
+        /* wrote. */
+        tmsize_t tempSize;
+        void *temp;
+        uint64_t offsetRead;
+        uint64_t offsetWrite;
+        uint64_t toCopy = td->td_stripbytecount_p[strip];
+
+        if (toCopy < 1024 * 1024)
+            tempSize = (tmsize_t)toCopy;
+        else
+            tempSize = 1024 * 1024;
+
+        offsetRead = td->td_stripoffset_p[strip];
+        offsetWrite = TIFFSeekFile(tif, 0, SEEK_END);
+
+        m = offsetWrite + toCopy + cc;
+        if (!(tif->tif_flags & TIFF_BIGTIFF) && m != (uint32_t)m)
+        {
+            TIFFErrorExtR(tif, module, "Maximum TIFF file size exceeded");
+            return (0);
+        }
+
+        temp = _TIFFmallocExt(tif, tempSize);
+        if (temp == NULL)
+        {
+            TIFFErrorExtR(tif, module, "No space for output buffer");
+            return (0);
+        }
+
+        tif->tif_flags |= TIFF_DIRTYSTRIP;
+
+        td->td_stripoffset_p[strip] = offsetWrite;
+        td->td_stripbytecount_p[strip] = 0;
+
+        /* Move data written by previous calls to us at end of file */
+        while (toCopy > 0)
+        {
+            if (!SeekOK(tif, offsetRead))
+            {
+                TIFFErrorExtR(tif, module, "Seek error");
+                _TIFFfreeExt(tif, temp);
+                return (0);
+            }
+            if (!ReadOK(tif, temp, tempSize))
+            {
+                TIFFErrorExtR(tif, module, "Cannot read");
+                _TIFFfreeExt(tif, temp);
+                return (0);
+            }
+            if (!SeekOK(tif, offsetWrite))
+            {
+                TIFFErrorExtR(tif, module, "Seek error");
+                _TIFFfreeExt(tif, temp);
+                return (0);
+            }
+            if (!WriteOK(tif, temp, tempSize))
+            {
+                TIFFErrorExtR(tif, module, "Cannot write");
+                _TIFFfreeExt(tif, temp);
+                return (0);
+            }
+            offsetRead += tempSize;
+            offsetWrite += tempSize;
+            td->td_stripbytecount_p[strip] += tempSize;
+            toCopy -= tempSize;
+        }
+        _TIFFfreeExt(tif, temp);
+
+        /* Append the data of this call */
+        offsetWrite += cc;
+        m = offsetWrite;
+    }
+
+    if (!WriteOK(tif, data, cc))
+    {
+        TIFFErrorExtR(tif, module, "Write error at scanline %lu",
+                      (unsigned long)tif->tif_row);
+        return (0);
+    }
+    tif->tif_curoff = m;
+    td->td_stripbytecount_p[strip] += cc;
+
+    if ((int64_t)td->td_stripbytecount_p[strip] != old_byte_count)
+        tif->tif_flags |= TIFF_DIRTYSTRIP;
+
+    return (1);
 }
 
 /*
@@ -810,29 +923,28 @@ TIFFAppendToStrip(TIFF* tif, uint32 strip, uint8* data, tmsize_t cc)
  * called by ``encodestrip routines'' w/o concern
  * for infinite recursion.
  */
-int
-TIFFFlushData1(TIFF* tif)
+int TIFFFlushData1(TIFF *tif)
 {
-	if (tif->tif_rawcc > 0 && tif->tif_flags & TIFF_BUF4WRITE ) {
-		if (!isFillOrder(tif, tif->tif_dir.td_fillorder) &&
-		    (tif->tif_flags & TIFF_NOBITREV) == 0)
-			TIFFReverseBits((uint8*)tif->tif_rawdata,
-			    tif->tif_rawcc);
-		if (!TIFFAppendToStrip(tif,
-		    isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip,
-		    tif->tif_rawdata, tif->tif_rawcc))
+    if (tif->tif_rawcc > 0 && tif->tif_flags & TIFF_BUF4WRITE)
+    {
+        if (!isFillOrder(tif, tif->tif_dir.td_fillorder) &&
+            (tif->tif_flags & TIFF_NOBITREV) == 0)
+            TIFFReverseBits((uint8_t *)tif->tif_rawdata, tif->tif_rawcc);
+        if (!TIFFAppendToStrip(
+                tif, isTiled(tif) ? tif->tif_curtile : tif->tif_curstrip,
+                tif->tif_rawdata, tif->tif_rawcc))
         {
             /* We update those variables even in case of error since there's */
             /* code that doesn't really check the return code of this */
             /* function */
             tif->tif_rawcc = 0;
             tif->tif_rawcp = tif->tif_rawdata;
-			return (0);
+            return (0);
         }
-		tif->tif_rawcc = 0;
-		tif->tif_rawcp = tif->tif_rawdata;
-	}
-	return (1);
+        tif->tif_rawcc = 0;
+        tif->tif_rawcp = tif->tif_rawdata;
+    }
+    return (1);
 }
 
 /*
@@ -841,17 +953,8 @@ TIFFFlushData1(TIFF* tif)
  * (very carefully), or to 0 so that the next write gets
  * appended to the end of the file.
  */
-void
-TIFFSetWriteOffset(TIFF* tif, toff_t off)
+void TIFFSetWriteOffset(TIFF *tif, toff_t off)
 {
-	tif->tif_curoff = off;
+    tif->tif_curoff = off;
+    tif->tif_lastvalidoff = 0;
 }
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_zip.c b/3rdparty/libtiff/tif_zip.c
index e71c312c80f1..fcf510044c50 100644
--- a/3rdparty/libtiff/tif_zip.c
+++ b/3rdparty/libtiff/tif_zip.c
@@ -2,23 +2,23 @@
  * Copyright (c) 1995-1997 Sam Leffler
  * Copyright (c) 1995-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
@@ -57,644 +57,675 @@
 #error "Antiquated ZLIB software; you must use version 1.0 or later"
 #endif
 
-#define SAFE_MSG(sp)   ((sp)->stream.msg == NULL ? "" : (sp)->stream.msg)
+#define SAFE_MSG(sp) ((sp)->stream.msg == NULL ? "" : (sp)->stream.msg)
 
 /*
  * State block for each open TIFF
  * file using ZIP compression/decompression.
  */
-typedef struct {
-	TIFFPredictorState predict;
-        z_stream        stream;
-	int             zipquality;            /* compression level */
-	int             state;                 /* state flags */
-	int             subcodec;              /* DEFLATE_SUBCODEC_ZLIB or DEFLATE_SUBCODEC_LIBDEFLATE */
+typedef struct
+{
+    TIFFPredictorState predict;
+    z_stream stream;
+    int zipquality; /* compression level */
+    int state;      /* state flags */
+    int subcodec;   /* DEFLATE_SUBCODEC_ZLIB or DEFLATE_SUBCODEC_LIBDEFLATE */
 #if LIBDEFLATE_SUPPORT
-	int             libdeflate_state;       /* -1 = until first time ZIPEncode() / ZIPDecode() is called, 0 = use zlib, 1 = use libdeflate */
-	struct libdeflate_decompressor* libdeflate_dec;
-	struct libdeflate_compressor*   libdeflate_enc;
+    int libdeflate_state; /* -1 = until first time ZIPEncode() / ZIPDecode() is
+                             called, 0 = use zlib, 1 = use libdeflate */
+    struct libdeflate_decompressor *libdeflate_dec;
+    struct libdeflate_compressor *libdeflate_enc;
 #endif
 #define ZSTATE_INIT_DECODE 0x01
 #define ZSTATE_INIT_ENCODE 0x02
 
-	TIFFVGetMethod  vgetparent;            /* super-class method */
-	TIFFVSetMethod  vsetparent;            /* super-class method */
+    TIFFVGetMethod vgetparent; /* super-class method */
+    TIFFVSetMethod vsetparent; /* super-class method */
 } ZIPState;
 
-#define ZState(tif)             ((ZIPState*) (tif)->tif_data)
-#define DecoderState(tif)       ZState(tif)
-#define EncoderState(tif)       ZState(tif)
+#define ZState(tif) ((ZIPState *)(tif)->tif_data)
+#define DecoderState(tif) ZState(tif)
+#define EncoderState(tif) ZState(tif)
 
-static int ZIPEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s);
-static int ZIPDecode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s);
+static int ZIPEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s);
+static int ZIPDecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s);
 
-static int
-ZIPFixupTags(TIFF* tif)
+static int ZIPFixupTags(TIFF *tif)
 {
-	(void) tif;
-	return (1);
+    (void)tif;
+    return (1);
 }
 
-static int
-ZIPSetupDecode(TIFF* tif)
+static int ZIPSetupDecode(TIFF *tif)
 {
-	static const char module[] = "ZIPSetupDecode";
-	ZIPState* sp = DecoderState(tif);
-
-	assert(sp != NULL);
-        
-        /* if we were last encoding, terminate this mode */
-	if (sp->state & ZSTATE_INIT_ENCODE) {
-	    deflateEnd(&sp->stream);
-	    sp->state = 0;
-	}
-
-	/* This function can possibly be called several times by */
-	/* PredictorSetupDecode() if this function succeeds but */
-	/* PredictorSetup() fails */
-	if ((sp->state & ZSTATE_INIT_DECODE) == 0 &&
-	    inflateInit(&sp->stream) != Z_OK) {
-		TIFFErrorExt(tif->tif_clientdata, module, "%s", SAFE_MSG(sp));
-		return (0);
-	} else {
-		sp->state |= ZSTATE_INIT_DECODE;
-		return (1);
-	}
+    static const char module[] = "ZIPSetupDecode";
+    ZIPState *sp = DecoderState(tif);
+
+    assert(sp != NULL);
+
+    /* if we were last encoding, terminate this mode */
+    if (sp->state & ZSTATE_INIT_ENCODE)
+    {
+        deflateEnd(&sp->stream);
+        sp->state = 0;
+    }
+
+    /* This function can possibly be called several times by */
+    /* PredictorSetupDecode() if this function succeeds but */
+    /* PredictorSetup() fails */
+    if ((sp->state & ZSTATE_INIT_DECODE) == 0 &&
+        inflateInit(&sp->stream) != Z_OK)
+    {
+        TIFFErrorExtR(tif, module, "%s", SAFE_MSG(sp));
+        return (0);
+    }
+    else
+    {
+        sp->state |= ZSTATE_INIT_DECODE;
+        return (1);
+    }
 }
 
 /*
  * Setup state for decoding a strip.
  */
-static int
-ZIPPreDecode(TIFF* tif, uint16 s)
+static int ZIPPreDecode(TIFF *tif, uint16_t s)
 {
-	ZIPState* sp = DecoderState(tif);
+    ZIPState *sp = DecoderState(tif);
 
-	(void) s;
-	assert(sp != NULL);
+    (void)s;
+    assert(sp != NULL);
 
-	if( (sp->state & ZSTATE_INIT_DECODE) == 0 )
-            tif->tif_setupdecode( tif );
+    if ((sp->state & ZSTATE_INIT_DECODE) == 0)
+        tif->tif_setupdecode(tif);
 
 #if LIBDEFLATE_SUPPORT
-        sp->libdeflate_state = -1;
+    sp->libdeflate_state = -1;
 #endif
-	sp->stream.next_in = tif->tif_rawdata;
-	assert(sizeof(sp->stream.avail_in)==4);  /* if this assert gets raised,
-	    we need to simplify this code to reflect a ZLib that is likely updated
-	    to deal with 8byte memory sizes, though this code will respond
-	    appropriately even before we simplify it */
-	sp->stream.avail_in = (uint64)tif->tif_rawcc < 0xFFFFFFFFU ? (uInt) tif->tif_rawcc : 0xFFFFFFFFU;
-	return (inflateReset(&sp->stream) == Z_OK);
+    sp->stream.next_in = tif->tif_rawdata;
+    assert(sizeof(sp->stream.avail_in) == 4); /* if this assert gets raised,
+         we need to simplify this code to reflect a ZLib that is likely updated
+         to deal with 8byte memory sizes, though this code will respond
+         appropriately even before we simplify it */
+    sp->stream.avail_in = (uint64_t)tif->tif_rawcc < 0xFFFFFFFFU
+                              ? (uInt)tif->tif_rawcc
+                              : 0xFFFFFFFFU;
+    return (inflateReset(&sp->stream) == Z_OK);
 }
 
-static int
-ZIPDecode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s)
+static int ZIPDecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s)
 {
-	static const char module[] = "ZIPDecode";
-	ZIPState* sp = DecoderState(tif);
+    static const char module[] = "ZIPDecode";
+    ZIPState *sp = DecoderState(tif);
 
-	(void) s;
-	assert(sp != NULL);
-	assert(sp->state == ZSTATE_INIT_DECODE);
+    (void)s;
+    assert(sp != NULL);
+    assert(sp->state == ZSTATE_INIT_DECODE);
 
 #if LIBDEFLATE_SUPPORT
-        if( sp->libdeflate_state == 1 )
-            return 0;
-
-        /* If we have libdeflate support and we are asked to read a whole */
-        /* strip/tile, then go for using it */
-        do {
-            TIFFDirectory *td = &tif->tif_dir;
-
-            if( sp->libdeflate_state == 0 )
+    if (sp->libdeflate_state == 1)
+        return 0;
+
+    /* If we have libdeflate support and we are asked to read a whole */
+    /* strip/tile, then go for using it */
+    do
+    {
+        TIFFDirectory *td = &tif->tif_dir;
+
+        if (sp->libdeflate_state == 0)
+            break;
+        if (sp->subcodec == DEFLATE_SUBCODEC_ZLIB)
+            break;
+
+        /* Check if we are in the situation where we can use libdeflate */
+        if (isTiled(tif))
+        {
+            if (TIFFTileSize64(tif) != (uint64_t)occ)
                 break;
-            if( sp->subcodec == DEFLATE_SUBCODEC_ZLIB )
+        }
+        else
+        {
+            uint32_t strip_height = td->td_imagelength - tif->tif_row;
+            if (strip_height > td->td_rowsperstrip)
+                strip_height = td->td_rowsperstrip;
+            if (TIFFVStripSize64(tif, strip_height) != (uint64_t)occ)
                 break;
-
-            /* Check if we are in the situation where we can use libdeflate */
-            if (isTiled(tif)) {
-                if( TIFFTileSize64(tif) != (uint64)occ )
-                    break;
-            } else {
-                uint32 strip_height = td->td_imagelength - tif->tif_row;
-                if (strip_height > td->td_rowsperstrip)
-                    strip_height = td->td_rowsperstrip;
-                if( TIFFVStripSize64(tif, strip_height) != (uint64)occ )
-                    break;
-            }
-
-            /* Check for overflow */
-            if( (size_t)tif->tif_rawcc != (uint64)tif->tif_rawcc )
-                break;
-            if( (size_t)occ != (uint64)occ )
-                break;
-
-            /* Go for decompression using libdeflate */
+        }
+
+        /* Check for overflow */
+        if ((size_t)tif->tif_rawcc != (uint64_t)tif->tif_rawcc)
+            break;
+        if ((size_t)occ != (uint64_t)occ)
+            break;
+
+        /* Go for decompression using libdeflate */
+        {
+            enum libdeflate_result res;
+            if (sp->libdeflate_dec == NULL)
             {
-                enum libdeflate_result res;
-                if( sp->libdeflate_dec == NULL )
+                sp->libdeflate_dec = libdeflate_alloc_decompressor();
+                if (sp->libdeflate_dec == NULL)
                 {
-                    sp->libdeflate_dec = libdeflate_alloc_decompressor();
-                    if( sp->libdeflate_dec == NULL )
-                    {
-                        break;
-                    }
+                    break;
                 }
+            }
 
-                sp->libdeflate_state = 1;
-
-                res = libdeflate_zlib_decompress(
-                    sp->libdeflate_dec, tif->tif_rawcp, (size_t)tif->tif_rawcc, op, (size_t)occ, NULL);
+            sp->libdeflate_state = 1;
 
-                tif->tif_rawcp += tif->tif_rawcc;
-                tif->tif_rawcc = 0;
+            res = libdeflate_zlib_decompress(sp->libdeflate_dec, tif->tif_rawcp,
+                                             (size_t)tif->tif_rawcc, op,
+                                             (size_t)occ, NULL);
 
-                /* We accept LIBDEFLATE_INSUFFICIENT_SPACE has a return */
-                /* There are odd files in the wild where the last strip, when */
-                /* it is smaller in height than td_rowsperstrip, actually contains */
-                /* data for td_rowsperstrip lines. Just ignore that silently. */
-                if( res != LIBDEFLATE_SUCCESS &&
-                    res != LIBDEFLATE_INSUFFICIENT_SPACE )
-                {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                                 "Decoding error at scanline %lu",
-                                 (unsigned long) tif->tif_row);
-                    return 0;
-                }
+            tif->tif_rawcp += tif->tif_rawcc;
+            tif->tif_rawcc = 0;
 
-                return 1;
+            /* We accept LIBDEFLATE_INSUFFICIENT_SPACE has a return */
+            /* There are odd files in the wild where the last strip, when */
+            /* it is smaller in height than td_rowsperstrip, actually contains
+             */
+            /* data for td_rowsperstrip lines. Just ignore that silently. */
+            if (res != LIBDEFLATE_SUCCESS &&
+                res != LIBDEFLATE_INSUFFICIENT_SPACE)
+            {
+                TIFFErrorExtR(tif, module, "Decoding error at scanline %lu",
+                              (unsigned long)tif->tif_row);
+                return 0;
             }
-        } while(0);
-        sp->libdeflate_state = 0;
+
+            return 1;
+        }
+    } while (0);
+    sp->libdeflate_state = 0;
 #endif /* LIBDEFLATE_SUPPORT */
 
-        sp->stream.next_in = tif->tif_rawcp;
-        
-	sp->stream.next_out = op;
-	assert(sizeof(sp->stream.avail_out)==4);  /* if this assert gets raised,
-	    we need to simplify this code to reflect a ZLib that is likely updated
-	    to deal with 8byte memory sizes, though this code will respond
-	    appropriately even before we simplify it */
-	do {
-                int state;
-                uInt avail_in_before = (uint64)tif->tif_rawcc <= 0xFFFFFFFFU ? (uInt)tif->tif_rawcc : 0xFFFFFFFFU;
-                uInt avail_out_before = (uint64)occ < 0xFFFFFFFFU ? (uInt) occ : 0xFFFFFFFFU;
-                sp->stream.avail_in = avail_in_before;
-                sp->stream.avail_out = avail_out_before;
-		state = inflate(&sp->stream, Z_PARTIAL_FLUSH);
-		tif->tif_rawcc -= (avail_in_before - sp->stream.avail_in);
-                occ -= (avail_out_before - sp->stream.avail_out);
-		if (state == Z_STREAM_END)
-			break;
-		if (state == Z_DATA_ERROR) {
-			TIFFErrorExt(tif->tif_clientdata, module,
-			    "Decoding error at scanline %lu, %s",
-			     (unsigned long) tif->tif_row, SAFE_MSG(sp));
-			return (0);
-		}
-		if (state != Z_OK) {
-			TIFFErrorExt(tif->tif_clientdata, module, 
-				     "ZLib error: %s", SAFE_MSG(sp));
-			return (0);
-		}
-	} while (occ > 0);
-	if (occ != 0) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-		    "Not enough data at scanline %lu (short " TIFF_UINT64_FORMAT " bytes)",
-		    (unsigned long) tif->tif_row, (TIFF_UINT64_T) occ);
-		return (0);
-	}
-
-        tif->tif_rawcp = sp->stream.next_in;
-
-	return (1);
+    sp->stream.next_in = tif->tif_rawcp;
+
+    sp->stream.next_out = op;
+    assert(sizeof(sp->stream.avail_out) == 4); /* if this assert gets raised,
+         we need to simplify this code to reflect a ZLib that is likely updated
+         to deal with 8byte memory sizes, though this code will respond
+         appropriately even before we simplify it */
+    do
+    {
+        int state;
+        uInt avail_in_before = (uint64_t)tif->tif_rawcc <= 0xFFFFFFFFU
+                                   ? (uInt)tif->tif_rawcc
+                                   : 0xFFFFFFFFU;
+        uInt avail_out_before =
+            (uint64_t)occ < 0xFFFFFFFFU ? (uInt)occ : 0xFFFFFFFFU;
+        sp->stream.avail_in = avail_in_before;
+        sp->stream.avail_out = avail_out_before;
+        /* coverity[overrun-buffer-arg] */
+        state = inflate(&sp->stream, Z_PARTIAL_FLUSH);
+        tif->tif_rawcc -= (avail_in_before - sp->stream.avail_in);
+        occ -= (avail_out_before - sp->stream.avail_out);
+        if (state == Z_STREAM_END)
+            break;
+        if (state == Z_DATA_ERROR)
+        {
+            TIFFErrorExtR(tif, module, "Decoding error at scanline %lu, %s",
+                          (unsigned long)tif->tif_row, SAFE_MSG(sp));
+            return (0);
+        }
+        if (state != Z_OK)
+        {
+            TIFFErrorExtR(tif, module, "ZLib error: %s", SAFE_MSG(sp));
+            return (0);
+        }
+    } while (occ > 0);
+    if (occ != 0)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Not enough data at scanline %lu (short %" PRIu64
+                      " bytes)",
+                      (unsigned long)tif->tif_row, (uint64_t)occ);
+        return (0);
+    }
+
+    tif->tif_rawcp = sp->stream.next_in;
+
+    return (1);
 }
 
-static int
-ZIPSetupEncode(TIFF* tif)
+static int ZIPSetupEncode(TIFF *tif)
 {
-	static const char module[] = "ZIPSetupEncode";
-	ZIPState* sp = EncoderState(tif);
-        int cappedQuality;
-
-	assert(sp != NULL);
-	if (sp->state & ZSTATE_INIT_DECODE) {
-		inflateEnd(&sp->stream);
-		sp->state = 0;
-	}
-
-        cappedQuality = sp->zipquality;
-        if( cappedQuality > Z_BEST_COMPRESSION )
-            cappedQuality = Z_BEST_COMPRESSION;
-
-	if (deflateInit(&sp->stream, cappedQuality) != Z_OK) {
-		TIFFErrorExt(tif->tif_clientdata, module, "%s", SAFE_MSG(sp));
-		return (0);
-	} else {
-		sp->state |= ZSTATE_INIT_ENCODE;
-		return (1);
-	}
+    static const char module[] = "ZIPSetupEncode";
+    ZIPState *sp = EncoderState(tif);
+    int cappedQuality;
+
+    assert(sp != NULL);
+    if (sp->state & ZSTATE_INIT_DECODE)
+    {
+        inflateEnd(&sp->stream);
+        sp->state = 0;
+    }
+
+    cappedQuality = sp->zipquality;
+    if (cappedQuality > Z_BEST_COMPRESSION)
+        cappedQuality = Z_BEST_COMPRESSION;
+
+    if (deflateInit(&sp->stream, cappedQuality) != Z_OK)
+    {
+        TIFFErrorExtR(tif, module, "%s", SAFE_MSG(sp));
+        return (0);
+    }
+    else
+    {
+        sp->state |= ZSTATE_INIT_ENCODE;
+        return (1);
+    }
 }
 
 /*
  * Reset encoding state at the start of a strip.
  */
-static int
-ZIPPreEncode(TIFF* tif, uint16 s)
+static int ZIPPreEncode(TIFF *tif, uint16_t s)
 {
-	ZIPState *sp = EncoderState(tif);
+    ZIPState *sp = EncoderState(tif);
 
-	(void) s;
-	assert(sp != NULL);
-	if( sp->state != ZSTATE_INIT_ENCODE )
-            tif->tif_setupencode( tif );
+    (void)s;
+    assert(sp != NULL);
+    if (sp->state != ZSTATE_INIT_ENCODE)
+        tif->tif_setupencode(tif);
 
 #if LIBDEFLATE_SUPPORT
-        sp->libdeflate_state = -1;
+    sp->libdeflate_state = -1;
 #endif
-	sp->stream.next_out = tif->tif_rawdata;
-	assert(sizeof(sp->stream.avail_out)==4);  /* if this assert gets raised,
-	    we need to simplify this code to reflect a ZLib that is likely updated
-	    to deal with 8byte memory sizes, though this code will respond
-	    appropriately even before we simplify it */
-	sp->stream.avail_out = (uint64)tif->tif_rawdatasize <= 0xFFFFFFFFU ? (uInt)tif->tif_rawdatasize : 0xFFFFFFFFU;
-	return (deflateReset(&sp->stream) == Z_OK);
+    sp->stream.next_out = tif->tif_rawdata;
+    assert(sizeof(sp->stream.avail_out) == 4); /* if this assert gets raised,
+         we need to simplify this code to reflect a ZLib that is likely updated
+         to deal with 8byte memory sizes, though this code will respond
+         appropriately even before we simplify it */
+    sp->stream.avail_out = (uint64_t)tif->tif_rawdatasize <= 0xFFFFFFFFU
+                               ? (uInt)tif->tif_rawdatasize
+                               : 0xFFFFFFFFU;
+    return (deflateReset(&sp->stream) == Z_OK);
 }
 
 /*
  * Encode a chunk of pixels.
  */
-static int
-ZIPEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+static int ZIPEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-	static const char module[] = "ZIPEncode";
-	ZIPState *sp = EncoderState(tif);
+    static const char module[] = "ZIPEncode";
+    ZIPState *sp = EncoderState(tif);
 
-	assert(sp != NULL);
-	assert(sp->state == ZSTATE_INIT_ENCODE);
+    assert(sp != NULL);
+    assert(sp->state == ZSTATE_INIT_ENCODE);
 
-	(void) s;
+    (void)s;
 
 #if LIBDEFLATE_SUPPORT
-        if( sp->libdeflate_state == 1 )
-            return 0;
-
-        /* If we have libdeflate support and we are asked to write a whole */
-        /* strip/tile, then go for using it */
-        do {
-            TIFFDirectory *td = &tif->tif_dir;
-
-            if( sp->libdeflate_state == 0 )
-                break;
-            if( sp->subcodec == DEFLATE_SUBCODEC_ZLIB )
-                break;
-
-            /* Libdeflate does not support the 0-compression level */
-            if( sp->zipquality == Z_NO_COMPRESSION )
-                break;
-
-            /* Check if we are in the situation where we can use libdeflate */
-            if (isTiled(tif)) {
-                if( TIFFTileSize64(tif) != (uint64)cc )
-                    break;
-            } else {
-                uint32 strip_height = td->td_imagelength - tif->tif_row;
-                if (strip_height > td->td_rowsperstrip)
-                    strip_height = td->td_rowsperstrip;
-                if( TIFFVStripSize64(tif, strip_height) != (uint64)cc )
-                    break;
-            }
-
-            /* Check for overflow */
-            if( (size_t)tif->tif_rawdatasize != (uint64)tif->tif_rawdatasize )
+    if (sp->libdeflate_state == 1)
+        return 0;
+
+    /* If we have libdeflate support and we are asked to write a whole */
+    /* strip/tile, then go for using it */
+    do
+    {
+        TIFFDirectory *td = &tif->tif_dir;
+
+        if (sp->libdeflate_state == 0)
+            break;
+        if (sp->subcodec == DEFLATE_SUBCODEC_ZLIB)
+            break;
+
+        /* Libdeflate does not support the 0-compression level */
+        if (sp->zipquality == Z_NO_COMPRESSION)
+            break;
+
+        /* Check if we are in the situation where we can use libdeflate */
+        if (isTiled(tif))
+        {
+            if (TIFFTileSize64(tif) != (uint64_t)cc)
                 break;
-            if( (size_t)cc != (uint64)cc )
+        }
+        else
+        {
+            uint32_t strip_height = td->td_imagelength - tif->tif_row;
+            if (strip_height > td->td_rowsperstrip)
+                strip_height = td->td_rowsperstrip;
+            if (TIFFVStripSize64(tif, strip_height) != (uint64_t)cc)
                 break;
-
-            /* Go for compression using libdeflate */
+        }
+
+        /* Check for overflow */
+        if ((size_t)tif->tif_rawdatasize != (uint64_t)tif->tif_rawdatasize)
+            break;
+        if ((size_t)cc != (uint64_t)cc)
+            break;
+
+        /* Go for compression using libdeflate */
+        {
+            size_t nCompressedBytes;
+            if (sp->libdeflate_enc == NULL)
             {
-                size_t nCompressedBytes;
-                if( sp->libdeflate_enc == NULL )
-                {
-                    /* To get results as good as zlib, we asked for an extra */
-                    /* level of compression */
-                    sp->libdeflate_enc = libdeflate_alloc_compressor(
-                        sp->zipquality == Z_DEFAULT_COMPRESSION ? 7 :
-                        sp->zipquality >= 6 && sp->zipquality <= 9 ? sp->zipquality + 1 :
-                        sp->zipquality);
-                    if( sp->libdeflate_enc == NULL )
-                    {
-                        TIFFErrorExt(tif->tif_clientdata, module,
-                                    "Cannot allocate compressor");
-                        break;
-                    }
-                }
-
-                /* Make sure the output buffer is large enough for the worse case. */
-                /* In TIFFWriteBufferSetup(), when libtiff allocates the buffer */
-                /* we've taken a 10% margin over the uncompressed size, which should */
-                /* be large enough even for the the worse case scenario. */
-                if( libdeflate_zlib_compress_bound(sp->libdeflate_enc, (size_t)cc) >
-                        (size_t)tif->tif_rawdatasize)
+                /* To get results as good as zlib, we asked for an extra */
+                /* level of compression */
+                sp->libdeflate_enc = libdeflate_alloc_compressor(
+                    sp->zipquality == Z_DEFAULT_COMPRESSION ? 7
+                    : sp->zipquality >= 6 && sp->zipquality <= 9
+                        ? sp->zipquality + 1
+                        : sp->zipquality);
+                if (sp->libdeflate_enc == NULL)
                 {
+                    TIFFErrorExtR(tif, module, "Cannot allocate compressor");
                     break;
                 }
+            }
 
-                sp->libdeflate_state = 1;
-                nCompressedBytes = libdeflate_zlib_compress(
-                    sp->libdeflate_enc, bp, (size_t)cc, tif->tif_rawdata, (size_t)tif->tif_rawdatasize);
+            /* Make sure the output buffer is large enough for the worse case.
+             */
+            /* In TIFFWriteBufferSetup(), when libtiff allocates the buffer */
+            /* we've taken a 10% margin over the uncompressed size, which should
+             */
+            /* be large enough even for the the worse case scenario. */
+            if (libdeflate_zlib_compress_bound(sp->libdeflate_enc, (size_t)cc) >
+                (size_t)tif->tif_rawdatasize)
+            {
+                break;
+            }
 
-                if( nCompressedBytes == 0 )
-                {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                                 "Encoder error at scanline %lu",
-                                 (unsigned long) tif->tif_row);
-                    return 0;
-                }
+            sp->libdeflate_state = 1;
+            nCompressedBytes = libdeflate_zlib_compress(
+                sp->libdeflate_enc, bp, (size_t)cc, tif->tif_rawdata,
+                (size_t)tif->tif_rawdatasize);
 
-                tif->tif_rawcc = nCompressedBytes;
+            if (nCompressedBytes == 0)
+            {
+                TIFFErrorExtR(tif, module, "Encoder error at scanline %lu",
+                              (unsigned long)tif->tif_row);
+                return 0;
+            }
 
-                if( !TIFFFlushData1(tif) )
-                    return 0;
+            tif->tif_rawcc = nCompressedBytes;
 
-                return 1;
-            }
-        } while(0);
-        sp->libdeflate_state = 0;
+            if (!TIFFFlushData1(tif))
+                return 0;
+
+            return 1;
+        }
+    } while (0);
+    sp->libdeflate_state = 0;
 #endif /* LIBDEFLATE_SUPPORT */
 
-	sp->stream.next_in = bp;
-	assert(sizeof(sp->stream.avail_in)==4);  /* if this assert gets raised,
-	    we need to simplify this code to reflect a ZLib that is likely updated
-	    to deal with 8byte memory sizes, though this code will respond
-	    appropriately even before we simplify it */
-	do {
-                uInt avail_in_before = (uint64)cc <= 0xFFFFFFFFU ? (uInt)cc : 0xFFFFFFFFU;
-                sp->stream.avail_in = avail_in_before;
-		if (deflate(&sp->stream, Z_NO_FLUSH) != Z_OK) {
-			TIFFErrorExt(tif->tif_clientdata, module, 
-				     "Encoder error: %s",
-				     SAFE_MSG(sp));
-			return (0);
-		}
-		if (sp->stream.avail_out == 0) {
-			tif->tif_rawcc = tif->tif_rawdatasize;
-			if (!TIFFFlushData1(tif))
-				return 0;
-			sp->stream.next_out = tif->tif_rawdata;
-			sp->stream.avail_out = (uint64)tif->tif_rawdatasize <= 0xFFFFFFFFU ? (uInt)tif->tif_rawdatasize : 0xFFFFFFFFU;
-		}
-		cc -= (avail_in_before - sp->stream.avail_in);
-	} while (cc > 0);
-	return (1);
+    sp->stream.next_in = bp;
+    assert(sizeof(sp->stream.avail_in) == 4); /* if this assert gets raised,
+         we need to simplify this code to reflect a ZLib that is likely updated
+         to deal with 8byte memory sizes, though this code will respond
+         appropriately even before we simplify it */
+    do
+    {
+        uInt avail_in_before =
+            (uint64_t)cc <= 0xFFFFFFFFU ? (uInt)cc : 0xFFFFFFFFU;
+        sp->stream.avail_in = avail_in_before;
+        /* coverity[overrun-buffer-arg] */
+        if (deflate(&sp->stream, Z_NO_FLUSH) != Z_OK)
+        {
+            TIFFErrorExtR(tif, module, "Encoder error: %s", SAFE_MSG(sp));
+            return (0);
+        }
+        if (sp->stream.avail_out == 0)
+        {
+            tif->tif_rawcc = tif->tif_rawdatasize;
+            if (!TIFFFlushData1(tif))
+                return 0;
+            sp->stream.next_out = tif->tif_rawdata;
+            sp->stream.avail_out = (uint64_t)tif->tif_rawdatasize <= 0xFFFFFFFFU
+                                       ? (uInt)tif->tif_rawdatasize
+                                       : 0xFFFFFFFFU;
+        }
+        cc -= (avail_in_before - sp->stream.avail_in);
+    } while (cc > 0);
+    return (1);
 }
 
 /*
  * Finish off an encoded strip by flushing the last
  * string and tacking on an End Of Information code.
  */
-static int
-ZIPPostEncode(TIFF* tif)
+static int ZIPPostEncode(TIFF *tif)
 {
-	static const char module[] = "ZIPPostEncode";
-	ZIPState *sp = EncoderState(tif);
-	int state;
+    static const char module[] = "ZIPPostEncode";
+    ZIPState *sp = EncoderState(tif);
+    int state;
 
 #if LIBDEFLATE_SUPPORT
-        if( sp->libdeflate_state == 1 )
-            return 1;
+    if (sp->libdeflate_state == 1)
+        return 1;
 #endif
 
-	sp->stream.avail_in = 0;
-	do {
-		state = deflate(&sp->stream, Z_FINISH);
-		switch (state) {
-		case Z_STREAM_END:
-		case Z_OK:
-			if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize)
-			{
-				tif->tif_rawcc =  tif->tif_rawdatasize - sp->stream.avail_out;
-				if (!TIFFFlushData1(tif))
-					return 0;
-				sp->stream.next_out = tif->tif_rawdata;
-				sp->stream.avail_out = (uint64)tif->tif_rawdatasize <= 0xFFFFFFFFU ? (uInt)tif->tif_rawdatasize : 0xFFFFFFFFU;
-			}
-			break;
-		default:
-			TIFFErrorExt(tif->tif_clientdata, module, 
-				     "ZLib error: %s", SAFE_MSG(sp));
-			return (0);
-		}
-	} while (state != Z_STREAM_END);
-	return (1);
+    sp->stream.avail_in = 0;
+    do
+    {
+        state = deflate(&sp->stream, Z_FINISH);
+        switch (state)
+        {
+            case Z_STREAM_END:
+            case Z_OK:
+                if ((tmsize_t)sp->stream.avail_out != tif->tif_rawdatasize)
+                {
+                    tif->tif_rawcc =
+                        tif->tif_rawdatasize - sp->stream.avail_out;
+                    if (!TIFFFlushData1(tif))
+                        return 0;
+                    sp->stream.next_out = tif->tif_rawdata;
+                    sp->stream.avail_out =
+                        (uint64_t)tif->tif_rawdatasize <= 0xFFFFFFFFU
+                            ? (uInt)tif->tif_rawdatasize
+                            : 0xFFFFFFFFU;
+                }
+                break;
+            default:
+                TIFFErrorExtR(tif, module, "ZLib error: %s", SAFE_MSG(sp));
+                return (0);
+        }
+    } while (state != Z_STREAM_END);
+    return (1);
 }
 
-static void
-ZIPCleanup(TIFF* tif)
+static void ZIPCleanup(TIFF *tif)
 {
-	ZIPState* sp = ZState(tif);
+    ZIPState *sp = ZState(tif);
 
-	assert(sp != 0);
+    assert(sp != 0);
 
-	(void)TIFFPredictorCleanup(tif);
+    (void)TIFFPredictorCleanup(tif);
 
-	tif->tif_tagmethods.vgetfield = sp->vgetparent;
-	tif->tif_tagmethods.vsetfield = sp->vsetparent;
+    tif->tif_tagmethods.vgetfield = sp->vgetparent;
+    tif->tif_tagmethods.vsetfield = sp->vsetparent;
 
-	if (sp->state & ZSTATE_INIT_ENCODE) {
-		deflateEnd(&sp->stream);
-		sp->state = 0;
-	} else if( sp->state & ZSTATE_INIT_DECODE) {
-		inflateEnd(&sp->stream);
-		sp->state = 0;
-	}
+    if (sp->state & ZSTATE_INIT_ENCODE)
+    {
+        deflateEnd(&sp->stream);
+        sp->state = 0;
+    }
+    else if (sp->state & ZSTATE_INIT_DECODE)
+    {
+        inflateEnd(&sp->stream);
+        sp->state = 0;
+    }
 
 #if LIBDEFLATE_SUPPORT
-        if( sp->libdeflate_dec )
-            libdeflate_free_decompressor(sp->libdeflate_dec);
-        if( sp->libdeflate_enc )
-            libdeflate_free_compressor(sp->libdeflate_enc);
+    if (sp->libdeflate_dec)
+        libdeflate_free_decompressor(sp->libdeflate_dec);
+    if (sp->libdeflate_enc)
+        libdeflate_free_compressor(sp->libdeflate_enc);
 #endif
 
-	_TIFFfree(sp);
-	tif->tif_data = NULL;
+    _TIFFfreeExt(tif, sp);
+    tif->tif_data = NULL;
 
-	_TIFFSetDefaultCompressionState(tif);
+    _TIFFSetDefaultCompressionState(tif);
 }
 
-static int
-ZIPVSetField(TIFF* tif, uint32 tag, va_list ap)
+static int ZIPVSetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	static const char module[] = "ZIPVSetField";
-	ZIPState* sp = ZState(tif);
-
-	switch (tag) {
-	case TIFFTAG_ZIPQUALITY:
-		sp->zipquality = (int) va_arg(ap, int);
-                if( sp->zipquality < Z_DEFAULT_COMPRESSION ||
-                    sp->zipquality > LIBDEFLATE_MAX_COMPRESSION_LEVEL ) {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                                 "Invalid ZipQuality value. Should be in [-1,%d] range",
-                                 LIBDEFLATE_MAX_COMPRESSION_LEVEL);
-                    return 0;
-                }
-
-                if ( sp->state&ZSTATE_INIT_ENCODE ) {
-                        int cappedQuality = sp->zipquality;
-                        if( cappedQuality > Z_BEST_COMPRESSION )
-                            cappedQuality = Z_BEST_COMPRESSION;
-			if (deflateParams(&sp->stream,
-			    cappedQuality, Z_DEFAULT_STRATEGY) != Z_OK) {
-				TIFFErrorExt(tif->tif_clientdata, module, "ZLib error: %s",
-					     SAFE_MSG(sp));
-				return (0);
-			}
-		}
+    static const char module[] = "ZIPVSetField";
+    ZIPState *sp = ZState(tif);
+
+    switch (tag)
+    {
+        case TIFFTAG_ZIPQUALITY:
+            sp->zipquality = (int)va_arg(ap, int);
+            if (sp->zipquality < Z_DEFAULT_COMPRESSION ||
+                sp->zipquality > LIBDEFLATE_MAX_COMPRESSION_LEVEL)
+            {
+                TIFFErrorExtR(
+                    tif, module,
+                    "Invalid ZipQuality value. Should be in [-1,%d] range",
+                    LIBDEFLATE_MAX_COMPRESSION_LEVEL);
+                return 0;
+            }
 
-#if LIBDEFLATE_SUPPORT
-                if( sp->libdeflate_enc )
+            if (sp->state & ZSTATE_INIT_ENCODE)
+            {
+                int cappedQuality = sp->zipquality;
+                if (cappedQuality > Z_BEST_COMPRESSION)
+                    cappedQuality = Z_BEST_COMPRESSION;
+                if (deflateParams(&sp->stream, cappedQuality,
+                                  Z_DEFAULT_STRATEGY) != Z_OK)
                 {
-                    libdeflate_free_compressor(sp->libdeflate_enc);
-                    sp->libdeflate_enc = NULL;
+                    TIFFErrorExtR(tif, module, "ZLib error: %s", SAFE_MSG(sp));
+                    return (0);
                 }
+            }
+
+#if LIBDEFLATE_SUPPORT
+            if (sp->libdeflate_enc)
+            {
+                libdeflate_free_compressor(sp->libdeflate_enc);
+                sp->libdeflate_enc = NULL;
+            }
 #endif
 
-		return (1);
+            return (1);
 
         case TIFFTAG_DEFLATE_SUBCODEC:
-                sp->subcodec = (int) va_arg(ap, int);
-                if( sp->subcodec != DEFLATE_SUBCODEC_ZLIB &&
-                    sp->subcodec != DEFLATE_SUBCODEC_LIBDEFLATE )
-                {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                                 "Invalid DeflateCodec value.");
-                    return 0;
-                }
+            sp->subcodec = (int)va_arg(ap, int);
+            if (sp->subcodec != DEFLATE_SUBCODEC_ZLIB &&
+                sp->subcodec != DEFLATE_SUBCODEC_LIBDEFLATE)
+            {
+                TIFFErrorExtR(tif, module, "Invalid DeflateCodec value.");
+                return 0;
+            }
 #if !LIBDEFLATE_SUPPORT
-                if( sp->subcodec == DEFLATE_SUBCODEC_LIBDEFLATE )
-                {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                                 "DeflateCodec = DEFLATE_SUBCODEC_LIBDEFLATE unsupported in this build");
-                    return 0;
-                }
+            if (sp->subcodec == DEFLATE_SUBCODEC_LIBDEFLATE)
+            {
+                TIFFErrorExtR(tif, module,
+                              "DeflateCodec = DEFLATE_SUBCODEC_LIBDEFLATE "
+                              "unsupported in this build");
+                return 0;
+            }
 #endif
-                return 1;
+            return 1;
 
-	default:
-		return (*sp->vsetparent)(tif, tag, ap);
-	}
-	/*NOTREACHED*/
+        default:
+            return (*sp->vsetparent)(tif, tag, ap);
+    }
+    /*NOTREACHED*/
 }
 
-static int
-ZIPVGetField(TIFF* tif, uint32 tag, va_list ap)
+static int ZIPVGetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	ZIPState* sp = ZState(tif);
+    ZIPState *sp = ZState(tif);
 
-	switch (tag) {
-	case TIFFTAG_ZIPQUALITY:
-		*va_arg(ap, int*) = sp->zipquality;
-		break;
+    switch (tag)
+    {
+        case TIFFTAG_ZIPQUALITY:
+            *va_arg(ap, int *) = sp->zipquality;
+            break;
 
         case TIFFTAG_DEFLATE_SUBCODEC:
-		*va_arg(ap, int*) = sp->subcodec;
-		break;
+            *va_arg(ap, int *) = sp->subcodec;
+            break;
 
-	default:
-		return (*sp->vgetparent)(tif, tag, ap);
-	}
-	return (1);
+        default:
+            return (*sp->vgetparent)(tif, tag, ap);
+    }
+    return (1);
 }
 
 static const TIFFField zipFields[] = {
-    { TIFFTAG_ZIPQUALITY, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "", NULL },
-    { TIFFTAG_DEFLATE_SUBCODEC, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT, TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "", NULL },
+    {TIFFTAG_ZIPQUALITY, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "", NULL},
+    {TIFFTAG_DEFLATE_SUBCODEC, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "", NULL},
 };
 
-int
-TIFFInitZIP(TIFF* tif, int scheme)
+int TIFFInitZIP(TIFF *tif, int scheme)
 {
-	static const char module[] = "TIFFInitZIP";
-	ZIPState* sp;
+    static const char module[] = "TIFFInitZIP";
+    ZIPState *sp;
 
-	assert( (scheme == COMPRESSION_DEFLATE)
-		|| (scheme == COMPRESSION_ADOBE_DEFLATE));
+    assert((scheme == COMPRESSION_DEFLATE) ||
+           (scheme == COMPRESSION_ADOBE_DEFLATE));
 #ifdef NDEBUG
-	(void)scheme;
+    (void)scheme;
 #endif
 
-	/*
-	 * Merge codec-specific tag information.
-	 */
-	if (!_TIFFMergeFields(tif, zipFields, TIFFArrayCount(zipFields))) {
-		TIFFErrorExt(tif->tif_clientdata, module,
-			     "Merging Deflate codec-specific tags failed");
-		return 0;
-	}
-
-	/*
-	 * Allocate state block so tag methods have storage to record values.
-	 */
-	tif->tif_data = (uint8*) _TIFFcalloc(sizeof (ZIPState), 1);
-	if (tif->tif_data == NULL)
-		goto bad;
-	sp = ZState(tif);
-	sp->stream.zalloc = NULL;
-	sp->stream.zfree = NULL;
-	sp->stream.opaque = NULL;
-	sp->stream.data_type = Z_BINARY;
-
-	/*
-	 * Override parent get/set field methods.
-	 */
-	sp->vgetparent = tif->tif_tagmethods.vgetfield;
-	tif->tif_tagmethods.vgetfield = ZIPVGetField; /* hook for codec tags */
-	sp->vsetparent = tif->tif_tagmethods.vsetfield;
-	tif->tif_tagmethods.vsetfield = ZIPVSetField; /* hook for codec tags */
-
-	/* Default values for codec-specific fields */
-	sp->zipquality = Z_DEFAULT_COMPRESSION;	/* default comp. level */
-	sp->state = 0;
+    /*
+     * Merge codec-specific tag information.
+     */
+    if (!_TIFFMergeFields(tif, zipFields, TIFFArrayCount(zipFields)))
+    {
+        TIFFErrorExtR(tif, module,
+                      "Merging Deflate codec-specific tags failed");
+        return 0;
+    }
+
+    /*
+     * Allocate state block so tag methods have storage to record values.
+     */
+    tif->tif_data = (uint8_t *)_TIFFcallocExt(tif, sizeof(ZIPState), 1);
+    if (tif->tif_data == NULL)
+        goto bad;
+    sp = ZState(tif);
+    sp->stream.zalloc = NULL;
+    sp->stream.zfree = NULL;
+    sp->stream.opaque = NULL;
+    sp->stream.data_type = Z_BINARY;
+
+    /*
+     * Override parent get/set field methods.
+     */
+    sp->vgetparent = tif->tif_tagmethods.vgetfield;
+    tif->tif_tagmethods.vgetfield = ZIPVGetField; /* hook for codec tags */
+    sp->vsetparent = tif->tif_tagmethods.vsetfield;
+    tif->tif_tagmethods.vsetfield = ZIPVSetField; /* hook for codec tags */
+
+    /* Default values for codec-specific fields */
+    sp->zipquality = Z_DEFAULT_COMPRESSION; /* default comp. level */
+    sp->state = 0;
 #if LIBDEFLATE_SUPPORT
-        sp->subcodec = DEFLATE_SUBCODEC_LIBDEFLATE;
+    sp->subcodec = DEFLATE_SUBCODEC_LIBDEFLATE;
 #else
-        sp->subcodec = DEFLATE_SUBCODEC_ZLIB;
+    sp->subcodec = DEFLATE_SUBCODEC_ZLIB;
 #endif
 
-	/*
-	 * Install codec methods.
-	 */
-	tif->tif_fixuptags = ZIPFixupTags; 
-	tif->tif_setupdecode = ZIPSetupDecode;
-	tif->tif_predecode = ZIPPreDecode;
-	tif->tif_decoderow = ZIPDecode;
-	tif->tif_decodestrip = ZIPDecode;
-	tif->tif_decodetile = ZIPDecode;  
-	tif->tif_setupencode = ZIPSetupEncode;
-	tif->tif_preencode = ZIPPreEncode;
-	tif->tif_postencode = ZIPPostEncode;
-	tif->tif_encoderow = ZIPEncode;
-	tif->tif_encodestrip = ZIPEncode;
-	tif->tif_encodetile = ZIPEncode;
-	tif->tif_cleanup = ZIPCleanup;
-	/*
-	 * Setup predictor setup.
-	 */
-	(void) TIFFPredictorInit(tif);
-	return (1);
+    /*
+     * Install codec methods.
+     */
+    tif->tif_fixuptags = ZIPFixupTags;
+    tif->tif_setupdecode = ZIPSetupDecode;
+    tif->tif_predecode = ZIPPreDecode;
+    tif->tif_decoderow = ZIPDecode;
+    tif->tif_decodestrip = ZIPDecode;
+    tif->tif_decodetile = ZIPDecode;
+    tif->tif_setupencode = ZIPSetupEncode;
+    tif->tif_preencode = ZIPPreEncode;
+    tif->tif_postencode = ZIPPostEncode;
+    tif->tif_encoderow = ZIPEncode;
+    tif->tif_encodestrip = ZIPEncode;
+    tif->tif_encodetile = ZIPEncode;
+    tif->tif_cleanup = ZIPCleanup;
+    /*
+     * Setup predictor setup.
+     */
+    (void)TIFFPredictorInit(tif);
+    return (1);
 bad:
-	TIFFErrorExt(tif->tif_clientdata, module,
-		     "No space for ZIP state block");
-	return (0);
+    TIFFErrorExtR(tif, module, "No space for ZIP state block");
+    return (0);
 }
 #endif /* ZIP_SUPPORT */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tif_zstd.c b/3rdparty/libtiff/tif_zstd.c
index 66135e03c1ab..646993103d25 100644
--- a/3rdparty/libtiff/tif_zstd.c
+++ b/3rdparty/libtiff/tif_zstd.c
@@ -1,35 +1,35 @@
 /*
-* Copyright (c) 2017, Planet Labs
-* Author: <even.rouault at spatialys.com>
-*
-* Permission to use, copy, modify, distribute, and sell this software and
-* its documentation for any purpose is hereby granted without fee, provided
-* that (i) the above copyright notices and this permission notice appear in
-* all copies of the software and related documentation, and (ii) the names of
-* Sam Leffler and Silicon Graphics may not be used in any advertising or
-* publicity relating to the software without the specific, prior written
-* permission of Sam Leffler and Silicon Graphics.
-*
-* THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
-* WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
-*
-* IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
-* ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
-* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
-* WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
-* LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
-* OF THIS SOFTWARE.
-*/
+ * Copyright (c) 2017, Planet Labs
+ * Author: <even.rouault at spatialys.com>
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and
+ * its documentation for any purpose is hereby granted without fee, provided
+ * that (i) the above copyright notices and this permission notice appear in
+ * all copies of the software and related documentation, and (ii) the names of
+ * Sam Leffler and Silicon Graphics may not be used in any advertising or
+ * publicity relating to the software without the specific, prior written
+ * permission of Sam Leffler and Silicon Graphics.
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
+ * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
+ * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
 
 #include "tiffiop.h"
 #ifdef ZSTD_SUPPORT
 /*
-* TIFF Library.
-*
-* ZSTD Compression Support
-*
-*/
+ * TIFF Library.
+ *
+ * ZSTD Compression Support
+ *
+ */
 
 #include "tif_predict.h"
 #include "zstd.h"
@@ -37,406 +37,400 @@
 #include <stdio.h>
 
 /*
-* State block for each open TIFF file using ZSTD compression/decompression.
-*/
-typedef struct {
-        TIFFPredictorState predict;
-        ZSTD_DStream*   dstream;
-        ZSTD_CStream*   cstream;
-        int             compression_level;      /* compression level */
-        ZSTD_outBuffer  out_buffer;
-        int             state;                  /* state flags */
+ * State block for each open TIFF file using ZSTD compression/decompression.
+ */
+typedef struct
+{
+    TIFFPredictorState predict;
+    ZSTD_DStream *dstream;
+    ZSTD_CStream *cstream;
+    int compression_level; /* compression level */
+    ZSTD_outBuffer out_buffer;
+    int state; /* state flags */
 #define LSTATE_INIT_DECODE 0x01
 #define LSTATE_INIT_ENCODE 0x02
 
-        TIFFVGetMethod  vgetparent;            /* super-class method */
-        TIFFVSetMethod  vsetparent;            /* super-class method */
+    TIFFVGetMethod vgetparent; /* super-class method */
+    TIFFVSetMethod vsetparent; /* super-class method */
 } ZSTDState;
 
-#define LState(tif)             ((ZSTDState*) (tif)->tif_data)
-#define DecoderState(tif)       LState(tif)
-#define EncoderState(tif)       LState(tif)
+#define LState(tif) ((ZSTDState *)(tif)->tif_data)
+#define DecoderState(tif) LState(tif)
+#define EncoderState(tif) LState(tif)
 
-static int ZSTDEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s);
-static int ZSTDDecode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s);
+static int ZSTDEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s);
+static int ZSTDDecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s);
 
-static int
-ZSTDFixupTags(TIFF* tif)
+static int ZSTDFixupTags(TIFF *tif)
 {
-        (void) tif;
-        return 1;
+    (void)tif;
+    return 1;
 }
 
-static int
-ZSTDSetupDecode(TIFF* tif)
+static int ZSTDSetupDecode(TIFF *tif)
 {
-        ZSTDState* sp = DecoderState(tif);
+    ZSTDState *sp = DecoderState(tif);
 
-        assert(sp != NULL);
+    assert(sp != NULL);
 
-        /* if we were last encoding, terminate this mode */
-        if (sp->state & LSTATE_INIT_ENCODE) {
-            ZSTD_freeCStream(sp->cstream);
-            sp->cstream = NULL;
-            sp->state = 0;
-        }
+    /* if we were last encoding, terminate this mode */
+    if (sp->state & LSTATE_INIT_ENCODE)
+    {
+        ZSTD_freeCStream(sp->cstream);
+        sp->cstream = NULL;
+        sp->state = 0;
+    }
 
-        sp->state |= LSTATE_INIT_DECODE;
-        return 1;
+    sp->state |= LSTATE_INIT_DECODE;
+    return 1;
 }
 
 /*
-* Setup state for decoding a strip.
-*/
-static int
-ZSTDPreDecode(TIFF* tif, uint16 s)
+ * Setup state for decoding a strip.
+ */
+static int ZSTDPreDecode(TIFF *tif, uint16_t s)
 {
-        static const char module[] = "ZSTDPreDecode";
-        ZSTDState* sp = DecoderState(tif);
-        size_t zstd_ret;
+    static const char module[] = "ZSTDPreDecode";
+    ZSTDState *sp = DecoderState(tif);
+    size_t zstd_ret;
 
-        (void) s;
-        assert(sp != NULL);
+    (void)s;
+    assert(sp != NULL);
 
-        if( (sp->state & LSTATE_INIT_DECODE) == 0 )
-            tif->tif_setupdecode(tif);
-
-        if( sp->dstream )
-        {
-            ZSTD_freeDStream(sp->dstream);
-            sp->dstream = NULL;
-        }
+    if ((sp->state & LSTATE_INIT_DECODE) == 0)
+        tif->tif_setupdecode(tif);
 
+    if (sp->dstream == NULL)
+    {
         sp->dstream = ZSTD_createDStream();
-        if( sp->dstream == NULL ) {
-            TIFFErrorExt(tif->tif_clientdata, module,
-                         "Cannot allocate decompression stream");
-            return 0;
-        }
-        zstd_ret = ZSTD_initDStream(sp->dstream);
-        if( ZSTD_isError(zstd_ret) ) {
-            TIFFErrorExt(tif->tif_clientdata, module,
-                         "Error in ZSTD_initDStream(): %s",
-                         ZSTD_getErrorName(zstd_ret));
+        if (sp->dstream == NULL)
+        {
+            TIFFErrorExtR(tif, module, "Cannot allocate decompression stream");
             return 0;
         }
+    }
+
+    zstd_ret = ZSTD_initDStream(sp->dstream);
+    if (ZSTD_isError(zstd_ret))
+    {
+        TIFFErrorExtR(tif, module, "Error in ZSTD_initDStream(): %s",
+                      ZSTD_getErrorName(zstd_ret));
+        return 0;
+    }
 
-        return 1;
+    return 1;
 }
 
-static int
-ZSTDDecode(TIFF* tif, uint8* op, tmsize_t occ, uint16 s)
+static int ZSTDDecode(TIFF *tif, uint8_t *op, tmsize_t occ, uint16_t s)
 {
-        static const char module[] = "ZSTDDecode";
-        ZSTDState* sp = DecoderState(tif);
-        ZSTD_inBuffer   in_buffer;
-        ZSTD_outBuffer  out_buffer;
-        size_t          zstd_ret;
-
-        (void) s;
-        assert(sp != NULL);
-        assert(sp->state == LSTATE_INIT_DECODE);
-
-        in_buffer.src = tif->tif_rawcp;
-        in_buffer.size = (size_t) tif->tif_rawcc;
-        in_buffer.pos = 0;
-
-        out_buffer.dst = op;
-        out_buffer.size = (size_t) occ;
-        out_buffer.pos = 0;
-
-        do {
-                zstd_ret = ZSTD_decompressStream(sp->dstream, &out_buffer,
-                                                 &in_buffer);
-                if( ZSTD_isError(zstd_ret) ) {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                                "Error in ZSTD_decompressStream(): %s",
-                                ZSTD_getErrorName(zstd_ret));
-                    return 0;
-                }
-        } while( zstd_ret != 0 &&
-                 in_buffer.pos < in_buffer.size &&
-                 out_buffer.pos < out_buffer.size );
-
-        if (out_buffer.pos < (size_t)occ) {
-                TIFFErrorExt(tif->tif_clientdata, module,
-                    "Not enough data at scanline %lu (short %lu bytes)",
-                    (unsigned long) tif->tif_row,
-                    (unsigned long) (size_t)occ - out_buffer.pos);
-                return 0;
+    static const char module[] = "ZSTDDecode";
+    ZSTDState *sp = DecoderState(tif);
+    ZSTD_inBuffer in_buffer;
+    ZSTD_outBuffer out_buffer;
+    size_t zstd_ret;
+
+    (void)s;
+    assert(sp != NULL);
+    assert(sp->state == LSTATE_INIT_DECODE);
+
+    in_buffer.src = tif->tif_rawcp;
+    in_buffer.size = (size_t)tif->tif_rawcc;
+    in_buffer.pos = 0;
+
+    out_buffer.dst = op;
+    out_buffer.size = (size_t)occ;
+    out_buffer.pos = 0;
+
+    do
+    {
+        zstd_ret = ZSTD_decompressStream(sp->dstream, &out_buffer, &in_buffer);
+        if (ZSTD_isError(zstd_ret))
+        {
+            TIFFErrorExtR(tif, module, "Error in ZSTD_decompressStream(): %s",
+                          ZSTD_getErrorName(zstd_ret));
+            return 0;
         }
+    } while (zstd_ret != 0 && in_buffer.pos < in_buffer.size &&
+             out_buffer.pos < out_buffer.size);
+
+    if (out_buffer.pos < (size_t)occ)
+    {
+        TIFFErrorExtR(tif, module,
+                      "Not enough data at scanline %lu (short %lu bytes)",
+                      (unsigned long)tif->tif_row,
+                      (unsigned long)((size_t)occ - out_buffer.pos));
+        return 0;
+    }
 
-        tif->tif_rawcp += in_buffer.pos;
-        tif->tif_rawcc -= in_buffer.pos;
+    tif->tif_rawcp += in_buffer.pos;
+    tif->tif_rawcc -= in_buffer.pos;
 
-        return 1;
+    return 1;
 }
 
-static int
-ZSTDSetupEncode(TIFF* tif)
+static int ZSTDSetupEncode(TIFF *tif)
 {
-        ZSTDState* sp = EncoderState(tif);
+    ZSTDState *sp = EncoderState(tif);
 
-        assert(sp != NULL);
-        if (sp->state & LSTATE_INIT_DECODE) {
-                ZSTD_freeDStream(sp->dstream);
-                sp->dstream = NULL;
-                sp->state = 0;
-        }
+    assert(sp != NULL);
+    if (sp->state & LSTATE_INIT_DECODE)
+    {
+        ZSTD_freeDStream(sp->dstream);
+        sp->dstream = NULL;
+        sp->state = 0;
+    }
 
-        sp->state |= LSTATE_INIT_ENCODE;
-        return 1;
+    sp->state |= LSTATE_INIT_ENCODE;
+    return 1;
 }
 
 /*
-* Reset encoding state at the start of a strip.
-*/
-static int
-ZSTDPreEncode(TIFF* tif, uint16 s)
+ * Reset encoding state at the start of a strip.
+ */
+static int ZSTDPreEncode(TIFF *tif, uint16_t s)
 {
-        static const char module[] = "ZSTDPreEncode";
-        ZSTDState *sp = EncoderState(tif);
-        size_t zstd_ret;
-
-        (void) s;
-        assert(sp != NULL);
-        if( sp->state != LSTATE_INIT_ENCODE )
-            tif->tif_setupencode(tif);
-
-        if (sp->cstream) {
-            ZSTD_freeCStream(sp->cstream);
-            sp->cstream = NULL;
-        }
+    static const char module[] = "ZSTDPreEncode";
+    ZSTDState *sp = EncoderState(tif);
+    size_t zstd_ret;
+
+    (void)s;
+    assert(sp != NULL);
+    if (sp->state != LSTATE_INIT_ENCODE)
+        tif->tif_setupencode(tif);
+
+    if (sp->cstream == NULL)
+    {
         sp->cstream = ZSTD_createCStream();
-        if( sp->cstream == NULL ) {
-            TIFFErrorExt(tif->tif_clientdata, module,
-                         "Cannot allocate compression stream");
+        if (sp->cstream == NULL)
+        {
+            TIFFErrorExtR(tif, module, "Cannot allocate compression stream");
             return 0;
         }
+    }
 
-        zstd_ret = ZSTD_initCStream(sp->cstream, sp->compression_level);
-        if( ZSTD_isError(zstd_ret) ) {
-            TIFFErrorExt(tif->tif_clientdata, module,
-                         "Error in ZSTD_initCStream(): %s",
-                         ZSTD_getErrorName(zstd_ret));
-            return 0;
-        }
+    zstd_ret = ZSTD_initCStream(sp->cstream, sp->compression_level);
+    if (ZSTD_isError(zstd_ret))
+    {
+        TIFFErrorExtR(tif, module, "Error in ZSTD_initCStream(): %s",
+                      ZSTD_getErrorName(zstd_ret));
+        return 0;
+    }
 
-        sp->out_buffer.dst = tif->tif_rawdata;
-        sp->out_buffer.size = (size_t)tif->tif_rawdatasize;
-        sp->out_buffer.pos = 0;
+    sp->out_buffer.dst = tif->tif_rawdata;
+    sp->out_buffer.size = (size_t)tif->tif_rawdatasize;
+    sp->out_buffer.pos = 0;
 
-        return 1;
+    return 1;
 }
 
 /*
-* Encode a chunk of pixels.
-*/
-static int
-ZSTDEncode(TIFF* tif, uint8* bp, tmsize_t cc, uint16 s)
+ * Encode a chunk of pixels.
+ */
+static int ZSTDEncode(TIFF *tif, uint8_t *bp, tmsize_t cc, uint16_t s)
 {
-        static const char module[] = "ZSTDEncode";
-        ZSTDState *sp = EncoderState(tif);
-        ZSTD_inBuffer   in_buffer;
-        size_t          zstd_ret;
-
-        assert(sp != NULL);
-        assert(sp->state == LSTATE_INIT_ENCODE);
-
-        (void) s;
-
-        in_buffer.src = bp;
-        in_buffer.size = (size_t)cc;
-        in_buffer.pos = 0;
-
-        do {
-                zstd_ret = ZSTD_compressStream(sp->cstream, &sp->out_buffer,
-                                               &in_buffer);
-                if( ZSTD_isError(zstd_ret) ) {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                                "Error in ZSTD_compressStream(): %s",
-                                ZSTD_getErrorName(zstd_ret));
-                    return 0;
-                }
-                if( sp->out_buffer.pos == sp->out_buffer.size ) {
-                        tif->tif_rawcc = tif->tif_rawdatasize;
-                        if (!TIFFFlushData1(tif))
-                                return 0;
-                        sp->out_buffer.dst = tif->tif_rawcp;
-                        sp->out_buffer.pos = 0;
-                }
-        } while( in_buffer.pos < in_buffer.size );
-
-        return 1;
+    static const char module[] = "ZSTDEncode";
+    ZSTDState *sp = EncoderState(tif);
+    ZSTD_inBuffer in_buffer;
+    size_t zstd_ret;
+
+    assert(sp != NULL);
+    assert(sp->state == LSTATE_INIT_ENCODE);
+
+    (void)s;
+
+    in_buffer.src = bp;
+    in_buffer.size = (size_t)cc;
+    in_buffer.pos = 0;
+
+    do
+    {
+        zstd_ret =
+            ZSTD_compressStream(sp->cstream, &sp->out_buffer, &in_buffer);
+        if (ZSTD_isError(zstd_ret))
+        {
+            TIFFErrorExtR(tif, module, "Error in ZSTD_compressStream(): %s",
+                          ZSTD_getErrorName(zstd_ret));
+            return 0;
+        }
+        if (sp->out_buffer.pos == sp->out_buffer.size)
+        {
+            tif->tif_rawcc = tif->tif_rawdatasize;
+            if (!TIFFFlushData1(tif))
+                return 0;
+            sp->out_buffer.dst = tif->tif_rawcp;
+            sp->out_buffer.pos = 0;
+        }
+    } while (in_buffer.pos < in_buffer.size);
+
+    return 1;
 }
 
 /*
-* Finish off an encoded strip by flushing it.
-*/
-static int
-ZSTDPostEncode(TIFF* tif)
+ * Finish off an encoded strip by flushing it.
+ */
+static int ZSTDPostEncode(TIFF *tif)
 {
-        static const char module[] = "ZSTDPostEncode";
-        ZSTDState *sp = EncoderState(tif);
-        size_t          zstd_ret;
-
-        do {
-                zstd_ret = ZSTD_endStream(sp->cstream, &sp->out_buffer);
-                if( ZSTD_isError(zstd_ret) ) {
-                    TIFFErrorExt(tif->tif_clientdata, module,
-                                "Error in ZSTD_endStream(): %s",
-                                ZSTD_getErrorName(zstd_ret));
-                    return 0;
-                }
-                if( sp->out_buffer.pos > 0 ) {
-                        tif->tif_rawcc = sp->out_buffer.pos;
-                        if (!TIFFFlushData1(tif))
-                                return 0;
-                        sp->out_buffer.dst = tif->tif_rawcp;
-                        sp->out_buffer.pos = 0;
-                }
-        } while (zstd_ret != 0);
-        return 1;
+    static const char module[] = "ZSTDPostEncode";
+    ZSTDState *sp = EncoderState(tif);
+    size_t zstd_ret;
+
+    do
+    {
+        zstd_ret = ZSTD_endStream(sp->cstream, &sp->out_buffer);
+        if (ZSTD_isError(zstd_ret))
+        {
+            TIFFErrorExtR(tif, module, "Error in ZSTD_endStream(): %s",
+                          ZSTD_getErrorName(zstd_ret));
+            return 0;
+        }
+        if (sp->out_buffer.pos > 0)
+        {
+            tif->tif_rawcc = sp->out_buffer.pos;
+            if (!TIFFFlushData1(tif))
+                return 0;
+            sp->out_buffer.dst = tif->tif_rawcp;
+            sp->out_buffer.pos = 0;
+        }
+    } while (zstd_ret != 0);
+    return 1;
 }
 
-static void
-ZSTDCleanup(TIFF* tif)
+static void ZSTDCleanup(TIFF *tif)
 {
-        ZSTDState* sp = LState(tif);
+    ZSTDState *sp = LState(tif);
 
-        assert(sp != 0);
+    assert(sp != 0);
 
-        (void)TIFFPredictorCleanup(tif);
+    (void)TIFFPredictorCleanup(tif);
 
-        tif->tif_tagmethods.vgetfield = sp->vgetparent;
-        tif->tif_tagmethods.vsetfield = sp->vsetparent;
+    tif->tif_tagmethods.vgetfield = sp->vgetparent;
+    tif->tif_tagmethods.vsetfield = sp->vsetparent;
 
-        if (sp->dstream) {
-            ZSTD_freeDStream(sp->dstream);
-            sp->dstream = NULL;
-        }
-        if (sp->cstream) {
-            ZSTD_freeCStream(sp->cstream);
-            sp->cstream = NULL;
-        }
-        _TIFFfree(sp);
-        tif->tif_data = NULL;
+    if (sp->dstream)
+    {
+        ZSTD_freeDStream(sp->dstream);
+        sp->dstream = NULL;
+    }
+    if (sp->cstream)
+    {
+        ZSTD_freeCStream(sp->cstream);
+        sp->cstream = NULL;
+    }
+    _TIFFfreeExt(tif, sp);
+    tif->tif_data = NULL;
 
-        _TIFFSetDefaultCompressionState(tif);
+    _TIFFSetDefaultCompressionState(tif);
 }
 
-static int
-ZSTDVSetField(TIFF* tif, uint32 tag, va_list ap)
+static int ZSTDVSetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-	static const char module[] = "ZSTDVSetField";
-        ZSTDState* sp = LState(tif);
+    static const char module[] = "ZSTDVSetField";
+    ZSTDState *sp = LState(tif);
 
-        switch (tag) {
+    switch (tag)
+    {
         case TIFFTAG_ZSTD_LEVEL:
-                sp->compression_level = (int) va_arg(ap, int);
-                if( sp->compression_level <= 0 ||
-                    sp->compression_level > ZSTD_maxCLevel() )
-                {
-                    TIFFWarningExt(tif->tif_clientdata, module,
-                                   "ZSTD_LEVEL should be between 1 and %d",
-                                   ZSTD_maxCLevel());
-                }
-                return 1;
+            sp->compression_level = (int)va_arg(ap, int);
+            if (sp->compression_level <= 0 ||
+                sp->compression_level > ZSTD_maxCLevel())
+            {
+                TIFFWarningExtR(tif, module,
+                                "ZSTD_LEVEL should be between 1 and %d",
+                                ZSTD_maxCLevel());
+            }
+            return 1;
         default:
-                return (*sp->vsetparent)(tif, tag, ap);
-        }
-        /*NOTREACHED*/
+            return (*sp->vsetparent)(tif, tag, ap);
+    }
+    /*NOTREACHED*/
 }
 
-static int
-ZSTDVGetField(TIFF* tif, uint32 tag, va_list ap)
+static int ZSTDVGetField(TIFF *tif, uint32_t tag, va_list ap)
 {
-        ZSTDState* sp = LState(tif);
+    ZSTDState *sp = LState(tif);
 
-        switch (tag) {
+    switch (tag)
+    {
         case TIFFTAG_ZSTD_LEVEL:
-                *va_arg(ap, int*) = sp->compression_level;
-                break;
+            *va_arg(ap, int *) = sp->compression_level;
+            break;
         default:
-                return (*sp->vgetparent)(tif, tag, ap);
-        }
-        return 1;
+            return (*sp->vgetparent)(tif, tag, ap);
+    }
+    return 1;
 }
 
 static const TIFFField ZSTDFields[] = {
-        { TIFFTAG_ZSTD_LEVEL, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
-          TIFF_SETGET_UNDEFINED,
-          FIELD_PSEUDO, TRUE, FALSE, "ZSTD compression_level", NULL },
+    {TIFFTAG_ZSTD_LEVEL, 0, 0, TIFF_ANY, 0, TIFF_SETGET_INT,
+     TIFF_SETGET_UNDEFINED, FIELD_PSEUDO, TRUE, FALSE, "ZSTD compression_level",
+     NULL},
 };
 
-int
-TIFFInitZSTD(TIFF* tif, int scheme)
+int TIFFInitZSTD(TIFF *tif, int scheme)
 {
-        static const char module[] = "TIFFInitZSTD";
-        ZSTDState* sp;
-
-        assert( scheme == COMPRESSION_ZSTD );
-
-        /*
-        * Merge codec-specific tag information.
-        */
-        if (!_TIFFMergeFields(tif, ZSTDFields, TIFFArrayCount(ZSTDFields))) {
-                TIFFErrorExt(tif->tif_clientdata, module,
-                            "Merging ZSTD codec-specific tags failed");
-                return 0;
-        }
-
-        /*
-        * Allocate state block so tag methods have storage to record values.
-        */
-        tif->tif_data = (uint8*) _TIFFmalloc(sizeof(ZSTDState));
-        if (tif->tif_data == NULL)
-                goto bad;
-        sp = LState(tif);
-
-        /*
-        * Override parent get/set field methods.
-        */
-        sp->vgetparent = tif->tif_tagmethods.vgetfield;
-        tif->tif_tagmethods.vgetfield = ZSTDVGetField;	/* hook for codec tags */
-        sp->vsetparent = tif->tif_tagmethods.vsetfield;
-        tif->tif_tagmethods.vsetfield = ZSTDVSetField;	/* hook for codec tags */
-
-        /* Default values for codec-specific fields */
-        sp->compression_level = 9;		/* default comp. level */
-        sp->state = 0;
-        sp->dstream = 0;
-        sp->cstream = 0;
-        sp->out_buffer.dst = NULL;
-        sp->out_buffer.size = 0;
-        sp->out_buffer.pos = 0;
-
-        /*
-        * Install codec methods.
-        */
-        tif->tif_fixuptags = ZSTDFixupTags;
-        tif->tif_setupdecode = ZSTDSetupDecode;
-        tif->tif_predecode = ZSTDPreDecode;
-        tif->tif_decoderow = ZSTDDecode;
-        tif->tif_decodestrip = ZSTDDecode;
-        tif->tif_decodetile = ZSTDDecode;
-        tif->tif_setupencode = ZSTDSetupEncode;
-        tif->tif_preencode = ZSTDPreEncode;
-        tif->tif_postencode = ZSTDPostEncode;
-        tif->tif_encoderow = ZSTDEncode;
-        tif->tif_encodestrip = ZSTDEncode;
-        tif->tif_encodetile = ZSTDEncode;
-        tif->tif_cleanup = ZSTDCleanup;
-        /*
-        * Setup predictor setup.
-        */
-        (void) TIFFPredictorInit(tif);
-        return 1;
-bad:
-        TIFFErrorExt(tif->tif_clientdata, module,
-                    "No space for ZSTD state block");
+    static const char module[] = "TIFFInitZSTD";
+    ZSTDState *sp;
+
+    (void)scheme;
+    assert(scheme == COMPRESSION_ZSTD);
+
+    /*
+     * Merge codec-specific tag information.
+     */
+    if (!_TIFFMergeFields(tif, ZSTDFields, TIFFArrayCount(ZSTDFields)))
+    {
+        TIFFErrorExtR(tif, module, "Merging ZSTD codec-specific tags failed");
         return 0;
+    }
+
+    /*
+     * Allocate state block so tag methods have storage to record values.
+     */
+    tif->tif_data = (uint8_t *)_TIFFmallocExt(tif, sizeof(ZSTDState));
+    if (tif->tif_data == NULL)
+        goto bad;
+    sp = LState(tif);
+
+    /*
+     * Override parent get/set field methods.
+     */
+    sp->vgetparent = tif->tif_tagmethods.vgetfield;
+    tif->tif_tagmethods.vgetfield = ZSTDVGetField; /* hook for codec tags */
+    sp->vsetparent = tif->tif_tagmethods.vsetfield;
+    tif->tif_tagmethods.vsetfield = ZSTDVSetField; /* hook for codec tags */
+
+    /* Default values for codec-specific fields */
+    sp->compression_level = 9; /* default comp. level */
+    sp->state = 0;
+    sp->dstream = 0;
+    sp->cstream = 0;
+    sp->out_buffer.dst = NULL;
+    sp->out_buffer.size = 0;
+    sp->out_buffer.pos = 0;
+
+    /*
+     * Install codec methods.
+     */
+    tif->tif_fixuptags = ZSTDFixupTags;
+    tif->tif_setupdecode = ZSTDSetupDecode;
+    tif->tif_predecode = ZSTDPreDecode;
+    tif->tif_decoderow = ZSTDDecode;
+    tif->tif_decodestrip = ZSTDDecode;
+    tif->tif_decodetile = ZSTDDecode;
+    tif->tif_setupencode = ZSTDSetupEncode;
+    tif->tif_preencode = ZSTDPreEncode;
+    tif->tif_postencode = ZSTDPostEncode;
+    tif->tif_encoderow = ZSTDEncode;
+    tif->tif_encodestrip = ZSTDEncode;
+    tif->tif_encodetile = ZSTDEncode;
+    tif->tif_cleanup = ZSTDCleanup;
+    /*
+     * Setup predictor setup.
+     */
+    (void)TIFFPredictorInit(tif);
+    return 1;
+bad:
+    TIFFErrorExtR(tif, module, "No space for ZSTD state block");
+    return 0;
 }
 #endif /* ZSTD_SUPPORT */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
diff --git a/3rdparty/libtiff/tiff.h b/3rdparty/libtiff/tiff.h
index 2d4a47679d26..d8da33dc3837 100644
--- a/3rdparty/libtiff/tiff.h
+++ b/3rdparty/libtiff/tiff.h
@@ -2,28 +2,28 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
 #ifndef _TIFF_
-#define	_TIFF_
+#define _TIFF_
 
 #include "tiffconf.h"
 
@@ -48,32 +48,46 @@
 #define TIFF_VERSION_CLASSIC 42
 #define TIFF_VERSION_BIG 43
 
-#define TIFF_BIGENDIAN      0x4d4d
-#define TIFF_LITTLEENDIAN   0x4949
-#define MDI_LITTLEENDIAN    0x5045
-#define MDI_BIGENDIAN       0x4550
+#define TIFF_BIGENDIAN 0x4d4d
+#define TIFF_LITTLEENDIAN 0x4949
+#define MDI_LITTLEENDIAN 0x5045
+#define MDI_BIGENDIAN 0x4550
 
 /*
  * Intrinsic data types required by the file format:
  *
- * 8-bit quantities     int8/uint8
- * 16-bit quantities    int16/uint16
- * 32-bit quantities    int32/uint32
- * 64-bit quantities    int64/uint64
+ * 8-bit quantities     int8_t/uint_8_t
+ * 16-bit quantities    int16_t/uint_16_t
+ * 32-bit quantities    int32_t/uint_32_t
+ * 64-bit quantities    int64_t/uint_64_t
  * strings              unsigned char*
  */
+#ifdef __GNUC__
+#define TIFF_GCC_DEPRECATED __attribute__((deprecated))
+#else
+#define TIFF_GCC_DEPRECATED
+#endif
+#ifdef _MSC_VER
+#define TIFF_MSC_DEPRECATED                                                    \
+    __declspec(deprecated("libtiff type deprecated; please use corresponding " \
+                          "C99 stdint.h type"))
+#else
+#define TIFF_MSC_DEPRECATED
+#endif
 
-typedef TIFF_INT8_T   int8;
-typedef TIFF_UINT8_T  uint8;
+#ifndef TIFF_DISABLE_DEPRECATED
+typedef TIFF_MSC_DEPRECATED int8_t int8 TIFF_GCC_DEPRECATED;
+typedef TIFF_MSC_DEPRECATED uint8_t uint8 TIFF_GCC_DEPRECATED;
 
-typedef TIFF_INT16_T  int16;
-typedef TIFF_UINT16_T uint16;
+typedef TIFF_MSC_DEPRECATED int16_t int16 TIFF_GCC_DEPRECATED;
+typedef TIFF_MSC_DEPRECATED uint16_t uint16 TIFF_GCC_DEPRECATED;
 
-typedef TIFF_INT32_T  int32;
-typedef TIFF_UINT32_T uint32;
+typedef TIFF_MSC_DEPRECATED int32_t int32 TIFF_GCC_DEPRECATED;
+typedef TIFF_MSC_DEPRECATED uint32_t uint32 TIFF_GCC_DEPRECATED;
 
-typedef TIFF_INT64_T  int64;
-typedef TIFF_UINT64_T uint64;
+typedef TIFF_MSC_DEPRECATED int64_t int64 TIFF_GCC_DEPRECATED;
+typedef TIFF_MSC_DEPRECATED uint64_t uint64 TIFF_GCC_DEPRECATED;
+#endif /* TIFF_DISABLE_DEPRECATED */
 
 /*
  * Some types as promoted in a variable argument list
@@ -88,24 +102,26 @@ typedef int uint16_vap;
 /*
  * TIFF header.
  */
-typedef struct {
-	uint16 tiff_magic;      /* magic number (defines byte order) */
-	uint16 tiff_version;    /* TIFF version number */
+typedef struct
+{
+    uint16_t tiff_magic;   /* magic number (defines byte order) */
+    uint16_t tiff_version; /* TIFF version number */
 } TIFFHeaderCommon;
-typedef struct {
-	uint16 tiff_magic;      /* magic number (defines byte order) */
-	uint16 tiff_version;    /* TIFF version number */
-	uint32 tiff_diroff;     /* byte offset to first directory */
+typedef struct
+{
+    uint16_t tiff_magic;   /* magic number (defines byte order) */
+    uint16_t tiff_version; /* TIFF version number */
+    uint32_t tiff_diroff;  /* byte offset to first directory */
 } TIFFHeaderClassic;
-typedef struct {
-	uint16 tiff_magic;      /* magic number (defines byte order) */
-	uint16 tiff_version;    /* TIFF version number */
-	uint16 tiff_offsetsize; /* size of offsets, should be 8 */
-	uint16 tiff_unused;     /* unused word, should be 0 */
-	uint64 tiff_diroff;     /* byte offset to first directory */
+typedef struct
+{
+    uint16_t tiff_magic;      /* magic number (defines byte order) */
+    uint16_t tiff_version;    /* TIFF version number */
+    uint16_t tiff_offsetsize; /* size of offsets, should be 8 */
+    uint16_t tiff_unused;     /* unused word, should be 0 */
+    uint64_t tiff_diroff;     /* byte offset to first directory */
 } TIFFHeaderBig;
 
-
 /*
  * NB: In the comments below,
  *  - items marked with a + are obsoleted by revision 5.0,
@@ -120,464 +136,551 @@ typedef struct {
  *
  * Note: RATIONALs are the ratio of two 32-bit integer values.
  *--:
- * Note2: TIFF_IFD8 data type is used in tiffFields[]-tag definition in order to distinguish the write-handling 
-          of those tags between ClassicTIFF and BigTiff:
-		  For ClassicTIFF libtiff writes a 32-bit value and the TIFF_IFD type-id into the file
-		  For BigTIFF     libtiff writes a 64-bit value and the TIFF_IFD8 type-id into the file
+ * Note2: TIFF_IFD8 data type is used in tiffFields[]-tag definition in order to
+ distinguish the write-handling of those tags between ClassicTIFF and BigTiff:
+                  For ClassicTIFF libtiff writes a 32-bit value and the TIFF_IFD
+ type-id into the file For BigTIFF     libtiff writes a 64-bit value and the
+ TIFF_IFD8 type-id into the file
  */
-typedef enum {
-	TIFF_NOTYPE = 0,      /* placeholder */
-	TIFF_BYTE = 1,        /* 8-bit unsigned integer */
-	TIFF_ASCII = 2,       /* 8-bit bytes w/ last byte null */
-	TIFF_SHORT = 3,       /* 16-bit unsigned integer */
-	TIFF_LONG = 4,        /* 32-bit unsigned integer */
-	TIFF_RATIONAL = 5,    /* 64-bit unsigned fraction */
-	TIFF_SBYTE = 6,       /* !8-bit signed integer */
-	TIFF_UNDEFINED = 7,   /* !8-bit untyped data */
-	TIFF_SSHORT = 8,      /* !16-bit signed integer */
-	TIFF_SLONG = 9,       /* !32-bit signed integer */
-	TIFF_SRATIONAL = 10,  /* !64-bit signed fraction */
-	TIFF_FLOAT = 11,      /* !32-bit IEEE floating point */
-	TIFF_DOUBLE = 12,     /* !64-bit IEEE floating point */
-	TIFF_IFD = 13,        /* %32-bit unsigned integer (offset) */
-	TIFF_LONG8 = 16,      /* BigTIFF 64-bit unsigned integer */
-	TIFF_SLONG8 = 17,     /* BigTIFF 64-bit signed integer */
-	TIFF_IFD8 = 18        /* BigTIFF 64-bit unsigned integer (offset) */
+typedef enum
+{
+    TIFF_NOTYPE = 0,     /* placeholder */
+    TIFF_BYTE = 1,       /* 8-bit unsigned integer */
+    TIFF_ASCII = 2,      /* 8-bit bytes w/ last byte null */
+    TIFF_SHORT = 3,      /* 16-bit unsigned integer */
+    TIFF_LONG = 4,       /* 32-bit unsigned integer */
+    TIFF_RATIONAL = 5,   /* 64-bit unsigned fraction */
+    TIFF_SBYTE = 6,      /* !8-bit signed integer */
+    TIFF_UNDEFINED = 7,  /* !8-bit untyped data */
+    TIFF_SSHORT = 8,     /* !16-bit signed integer */
+    TIFF_SLONG = 9,      /* !32-bit signed integer */
+    TIFF_SRATIONAL = 10, /* !64-bit signed fraction */
+    TIFF_FLOAT = 11,     /* !32-bit IEEE floating point */
+    TIFF_DOUBLE = 12,    /* !64-bit IEEE floating point */
+    TIFF_IFD = 13,       /* %32-bit unsigned integer (offset) */
+    TIFF_LONG8 = 16,     /* BigTIFF 64-bit unsigned integer */
+    TIFF_SLONG8 = 17,    /* BigTIFF 64-bit signed integer */
+    TIFF_IFD8 = 18       /* BigTIFF 64-bit unsigned integer (offset) */
 } TIFFDataType;
 
 /*
  * TIFF Tag Definitions.
  */
-#define	TIFFTAG_SUBFILETYPE		254	/* subfile data descriptor */
-#define	    FILETYPE_REDUCEDIMAGE	0x1	/* reduced resolution version */
-#define	    FILETYPE_PAGE		0x2	/* one page of many */
-#define	    FILETYPE_MASK		0x4	/* transparency mask */
-#define	TIFFTAG_OSUBFILETYPE		255	/* +kind of data in subfile */
-#define	    OFILETYPE_IMAGE		1	/* full resolution image data */
-#define	    OFILETYPE_REDUCEDIMAGE	2	/* reduced size image data */
-#define	    OFILETYPE_PAGE		3	/* one page of many */
-#define	TIFFTAG_IMAGEWIDTH		256	/* image width in pixels */
-#define	TIFFTAG_IMAGELENGTH		257	/* image height in pixels */
-#define	TIFFTAG_BITSPERSAMPLE		258	/* bits per channel (sample) */
-#define	TIFFTAG_COMPRESSION		259	/* data compression technique */
-#define	    COMPRESSION_NONE		1	/* dump mode */
-#define	    COMPRESSION_CCITTRLE	2	/* CCITT modified Huffman RLE */
-#define	    COMPRESSION_CCITTFAX3	3	/* CCITT Group 3 fax encoding */
-#define     COMPRESSION_CCITT_T4        3       /* CCITT T.4 (TIFF 6 name) */
-#define	    COMPRESSION_CCITTFAX4	4	/* CCITT Group 4 fax encoding */
-#define     COMPRESSION_CCITT_T6        4       /* CCITT T.6 (TIFF 6 name) */
-#define	    COMPRESSION_LZW		5       /* Lempel-Ziv  & Welch */
-#define	    COMPRESSION_OJPEG		6	/* !6.0 JPEG */
-#define	    COMPRESSION_JPEG		7	/* %JPEG DCT compression */
-#define     COMPRESSION_T85			9	/* !TIFF/FX T.85 JBIG compression */
-#define     COMPRESSION_T43			10	/* !TIFF/FX T.43 colour by layered JBIG compression */
-#define	    COMPRESSION_NEXT		32766	/* NeXT 2-bit RLE */
-#define	    COMPRESSION_CCITTRLEW	32771	/* #1 w/ word alignment */
-#define	    COMPRESSION_PACKBITS	32773	/* Macintosh RLE */
-#define	    COMPRESSION_THUNDERSCAN	32809	/* ThunderScan RLE */
+/* clang-format off */   /* for better readability of tag comments */
+#define TIFFTAG_SUBFILETYPE 254       /* subfile data descriptor */
+#define FILETYPE_REDUCEDIMAGE 0x1     /* reduced resolution version */
+#define FILETYPE_PAGE 0x2             /* one page of many */
+#define FILETYPE_MASK 0x4             /* transparency mask */
+#define TIFFTAG_OSUBFILETYPE 255      /* +kind of data in subfile */
+#define OFILETYPE_IMAGE 1             /* full resolution image data */
+#define OFILETYPE_REDUCEDIMAGE 2      /* reduced size image data */
+#define OFILETYPE_PAGE 3              /* one page of many */
+#define TIFFTAG_IMAGEWIDTH 256        /* image width in pixels */
+#define TIFFTAG_IMAGELENGTH 257       /* image height in pixels */
+#define TIFFTAG_BITSPERSAMPLE 258     /* bits per channel (sample) */
+#define TIFFTAG_COMPRESSION 259       /* data compression technique */
+#define COMPRESSION_NONE 1            /* dump mode */
+#define COMPRESSION_CCITTRLE 2        /* CCITT modified Huffman RLE */
+#define COMPRESSION_CCITTFAX3 3       /* CCITT Group 3 fax encoding */
+#define COMPRESSION_CCITT_T4 3        /* CCITT T.4 (TIFF 6 name) */
+#define COMPRESSION_CCITTFAX4 4       /* CCITT Group 4 fax encoding */
+#define COMPRESSION_CCITT_T6 4        /* CCITT T.6 (TIFF 6 name) */
+#define COMPRESSION_LZW 5             /* Lempel-Ziv  & Welch */
+#define COMPRESSION_OJPEG 6           /* !6.0 JPEG */
+#define COMPRESSION_JPEG 7            /* %JPEG DCT compression */
+#define COMPRESSION_T85 9             /* !TIFF/FX T.85 JBIG compression */
+#define COMPRESSION_T43 10            /* !TIFF/FX T.43 colour by layered JBIG compression */
+#define COMPRESSION_NEXT 32766        /* NeXT 2-bit RLE */
+#define COMPRESSION_CCITTRLEW 32771   /* #1 w/ word alignment */
+#define COMPRESSION_PACKBITS 32773    /* Macintosh RLE */
+#define COMPRESSION_THUNDERSCAN 32809 /* ThunderScan RLE */
 /* codes 32895-32898 are reserved for ANSI IT8 TIFF/IT <dkelly@apago.com) */
-#define	    COMPRESSION_IT8CTPAD	32895   /* IT8 CT w/padding */
-#define	    COMPRESSION_IT8LW		32896   /* IT8 Linework RLE */
-#define	    COMPRESSION_IT8MP		32897   /* IT8 Monochrome picture */
-#define	    COMPRESSION_IT8BL		32898   /* IT8 Binary line art */
+#define COMPRESSION_IT8CTPAD 32895 /* IT8 CT w/padding */
+#define COMPRESSION_IT8LW 32896    /* IT8 Linework RLE */
+#define COMPRESSION_IT8MP 32897    /* IT8 Monochrome picture */
+#define COMPRESSION_IT8BL 32898    /* IT8 Binary line art */
 /* compression codes 32908-32911 are reserved for Pixar */
-#define     COMPRESSION_PIXARFILM	32908   /* Pixar companded 10bit LZW */
-#define	    COMPRESSION_PIXARLOG	32909   /* Pixar companded 11bit ZIP */
-#define	    COMPRESSION_DEFLATE		32946	/* Deflate compression */
-#define     COMPRESSION_ADOBE_DEFLATE   8       /* Deflate compression,
-						   as recognized by Adobe */
+#define COMPRESSION_PIXARFILM 32908 /* Pixar companded 10bit LZW */
+#define COMPRESSION_PIXARLOG 32909  /* Pixar companded 11bit ZIP */
+#define COMPRESSION_DEFLATE 32946   /* Deflate compression, legacy tag */
+#define COMPRESSION_ADOBE_DEFLATE 8 /* Deflate compression, as recognized by Adobe */
 /* compression code 32947 is reserved for Oceana Matrix <dev@oceana.com> */
-#define     COMPRESSION_DCS             32947   /* Kodak DCS encoding */
-#define	    COMPRESSION_JBIG		34661	/* ISO JBIG */
-#define     COMPRESSION_SGILOG		34676	/* SGI Log Luminance RLE */
-#define     COMPRESSION_SGILOG24	34677	/* SGI Log 24-bit packed */
-#define     COMPRESSION_JP2000          34712   /* Leadtools JPEG2000 */
-#define     COMPRESSION_LERC            34887   /* ESRI Lerc codec: https://github.com/Esri/lerc */
+#define COMPRESSION_DCS 32947      /* Kodak DCS encoding */
+#define COMPRESSION_JBIG 34661     /* ISO JBIG */
+#define COMPRESSION_SGILOG 34676   /* SGI Log Luminance RLE */
+#define COMPRESSION_SGILOG24 34677 /* SGI Log 24-bit packed */
+#define COMPRESSION_JP2000 34712   /* Leadtools JPEG2000 */
+#define COMPRESSION_LERC 34887     /* ESRI Lerc codec: https://github.com/Esri/lerc */
 /* compression codes 34887-34889 are reserved for ESRI */
-#define	    COMPRESSION_LZMA		34925	/* LZMA2 */
-#define	    COMPRESSION_ZSTD		50000	/* ZSTD: WARNING not registered in Adobe-maintained registry */
-#define	    COMPRESSION_WEBP		50001	/* WEBP: WARNING not registered in Adobe-maintained registry */
-#define	TIFFTAG_PHOTOMETRIC		262	/* photometric interpretation */
-#define	    PHOTOMETRIC_MINISWHITE	0	/* min value is white */
-#define	    PHOTOMETRIC_MINISBLACK	1	/* min value is black */
-#define	    PHOTOMETRIC_RGB		2	/* RGB color model */
-#define	    PHOTOMETRIC_PALETTE		3	/* color map indexed */
-#define	    PHOTOMETRIC_MASK		4	/* $holdout mask */
-#define	    PHOTOMETRIC_SEPARATED	5	/* !color separations */
-#define	    PHOTOMETRIC_YCBCR		6	/* !CCIR 601 */
-#define	    PHOTOMETRIC_CIELAB		8	/* !1976 CIE L*a*b* */
-#define	    PHOTOMETRIC_ICCLAB		9	/* ICC L*a*b* [Adobe TIFF Technote 4] */
-#define	    PHOTOMETRIC_ITULAB		10	/* ITU L*a*b* */
-#define	    PHOTOMETRIC_CFA		32803	/* color filter array */
-#define     PHOTOMETRIC_LOGL		32844	/* CIE Log2(L) */
-#define     PHOTOMETRIC_LOGLUV		32845	/* CIE Log2(L) (u',v') */
-#define	TIFFTAG_THRESHHOLDING		263	/* +thresholding used on data */
-#define	    THRESHHOLD_BILEVEL		1	/* b&w art scan */
-#define	    THRESHHOLD_HALFTONE		2	/* or dithered scan */
-#define	    THRESHHOLD_ERRORDIFFUSE	3	/* usually floyd-steinberg */
-#define	TIFFTAG_CELLWIDTH		264	/* +dithering matrix width */
-#define	TIFFTAG_CELLLENGTH		265	/* +dithering matrix height */
-#define	TIFFTAG_FILLORDER		266	/* data order within a byte */
-#define	    FILLORDER_MSB2LSB		1	/* most significant -> least */
-#define	    FILLORDER_LSB2MSB		2	/* least significant -> most */
-#define	TIFFTAG_DOCUMENTNAME		269	/* name of doc. image is from */
-#define	TIFFTAG_IMAGEDESCRIPTION	270	/* info about image */
-#define	TIFFTAG_MAKE			271	/* scanner manufacturer name */
-#define	TIFFTAG_MODEL			272	/* scanner model name/number */
-#define	TIFFTAG_STRIPOFFSETS		273	/* offsets to data strips */
-#define	TIFFTAG_ORIENTATION		274	/* +image orientation */
-#define	    ORIENTATION_TOPLEFT		1	/* row 0 top, col 0 lhs */
-#define	    ORIENTATION_TOPRIGHT	2	/* row 0 top, col 0 rhs */
-#define	    ORIENTATION_BOTRIGHT	3	/* row 0 bottom, col 0 rhs */
-#define	    ORIENTATION_BOTLEFT		4	/* row 0 bottom, col 0 lhs */
-#define	    ORIENTATION_LEFTTOP		5	/* row 0 lhs, col 0 top */
-#define	    ORIENTATION_RIGHTTOP	6	/* row 0 rhs, col 0 top */
-#define	    ORIENTATION_RIGHTBOT	7	/* row 0 rhs, col 0 bottom */
-#define	    ORIENTATION_LEFTBOT		8	/* row 0 lhs, col 0 bottom */
-#define	TIFFTAG_SAMPLESPERPIXEL		277	/* samples per pixel */
-#define	TIFFTAG_ROWSPERSTRIP		278	/* rows per strip of data */
-#define	TIFFTAG_STRIPBYTECOUNTS		279	/* bytes counts for strips */
-#define	TIFFTAG_MINSAMPLEVALUE		280	/* +minimum sample value */
-#define	TIFFTAG_MAXSAMPLEVALUE		281	/* +maximum sample value */
-#define	TIFFTAG_XRESOLUTION		282	/* pixels/resolution in x */
-#define	TIFFTAG_YRESOLUTION		283	/* pixels/resolution in y */
-#define	TIFFTAG_PLANARCONFIG		284	/* storage organization */
-#define	    PLANARCONFIG_CONTIG		1	/* single image plane */
-#define	    PLANARCONFIG_SEPARATE	2	/* separate planes of data */
-#define	TIFFTAG_PAGENAME		285	/* page name image is from */
-#define	TIFFTAG_XPOSITION		286	/* x page offset of image lhs */
-#define	TIFFTAG_YPOSITION		287	/* y page offset of image lhs */
-#define	TIFFTAG_FREEOFFSETS		288	/* +byte offset to free block */
-#define	TIFFTAG_FREEBYTECOUNTS		289	/* +sizes of free blocks */
-#define	TIFFTAG_GRAYRESPONSEUNIT	290	/* $gray scale curve accuracy */
-#define	    GRAYRESPONSEUNIT_10S	1	/* tenths of a unit */
-#define	    GRAYRESPONSEUNIT_100S	2	/* hundredths of a unit */
-#define	    GRAYRESPONSEUNIT_1000S	3	/* thousandths of a unit */
-#define	    GRAYRESPONSEUNIT_10000S	4	/* ten-thousandths of a unit */
-#define	    GRAYRESPONSEUNIT_100000S	5	/* hundred-thousandths */
-#define	TIFFTAG_GRAYRESPONSECURVE	291	/* $gray scale response curve */
-#define	TIFFTAG_GROUP3OPTIONS		292	/* 32 flag bits */
-#define	TIFFTAG_T4OPTIONS		292	/* TIFF 6.0 proper name alias */
-#define	    GROUP3OPT_2DENCODING	0x1	/* 2-dimensional coding */
-#define	    GROUP3OPT_UNCOMPRESSED	0x2	/* data not compressed */
-#define	    GROUP3OPT_FILLBITS		0x4	/* fill to byte boundary */
-#define	TIFFTAG_GROUP4OPTIONS		293	/* 32 flag bits */
-#define TIFFTAG_T6OPTIONS               293     /* TIFF 6.0 proper name */
-#define	    GROUP4OPT_UNCOMPRESSED	0x2	/* data not compressed */
-#define	TIFFTAG_RESOLUTIONUNIT		296	/* units of resolutions */
-#define	    RESUNIT_NONE		1	/* no meaningful units */
-#define	    RESUNIT_INCH		2	/* english */
-#define	    RESUNIT_CENTIMETER		3	/* metric */
-#define	TIFFTAG_PAGENUMBER		297	/* page numbers of multi-page */
-#define	TIFFTAG_COLORRESPONSEUNIT	300	/* $color curve accuracy */
-#define	    COLORRESPONSEUNIT_10S	1	/* tenths of a unit */
-#define	    COLORRESPONSEUNIT_100S	2	/* hundredths of a unit */
-#define	    COLORRESPONSEUNIT_1000S	3	/* thousandths of a unit */
-#define	    COLORRESPONSEUNIT_10000S	4	/* ten-thousandths of a unit */
-#define	    COLORRESPONSEUNIT_100000S	5	/* hundred-thousandths */
-#define	TIFFTAG_TRANSFERFUNCTION	301	/* !colorimetry info */
-#define	TIFFTAG_SOFTWARE		305	/* name & release */
-#define	TIFFTAG_DATETIME		306	/* creation date and time */
-#define	TIFFTAG_ARTIST			315	/* creator of image */
-#define	TIFFTAG_HOSTCOMPUTER		316	/* machine where created */
-#define	TIFFTAG_PREDICTOR		317	/* prediction scheme w/ LZW */
-#define     PREDICTOR_NONE		1	/* no prediction scheme used */
-#define     PREDICTOR_HORIZONTAL	2	/* horizontal differencing */
-#define     PREDICTOR_FLOATINGPOINT	3	/* floating point predictor */
-#define	TIFFTAG_WHITEPOINT		318	/* image white point */
-#define	TIFFTAG_PRIMARYCHROMATICITIES	319	/* !primary chromaticities */
-#define	TIFFTAG_COLORMAP		320	/* RGB map for palette image */
-#define	TIFFTAG_HALFTONEHINTS		321	/* !highlight+shadow info */
-#define	TIFFTAG_TILEWIDTH		322	/* !tile width in pixels */
-#define	TIFFTAG_TILELENGTH		323	/* !tile height in pixels */
-#define TIFFTAG_TILEOFFSETS		324	/* !offsets to data tiles */
-#define TIFFTAG_TILEBYTECOUNTS		325	/* !byte counts for tiles */
-#define	TIFFTAG_BADFAXLINES		326	/* lines w/ wrong pixel count */
-#define	TIFFTAG_CLEANFAXDATA		327	/* regenerated line info */
-#define	    CLEANFAXDATA_CLEAN		0	/* no errors detected */
-#define	    CLEANFAXDATA_REGENERATED	1	/* receiver regenerated lines */
-#define	    CLEANFAXDATA_UNCLEAN	2	/* uncorrected errors exist */
-#define	TIFFTAG_CONSECUTIVEBADFAXLINES	328	/* max consecutive bad lines */
-#define	TIFFTAG_SUBIFD			330	/* subimage descriptors */
-#define	TIFFTAG_INKSET			332	/* !inks in separated image */
-#define	    INKSET_CMYK			1	/* !cyan-magenta-yellow-black color */
-#define	    INKSET_MULTIINK		2	/* !multi-ink or hi-fi color */
-#define	TIFFTAG_INKNAMES		333	/* !ascii names of inks */
-#define	TIFFTAG_NUMBEROFINKS		334	/* !number of inks */
-#define	TIFFTAG_DOTRANGE		336	/* !0% and 100% dot codes */
-#define	TIFFTAG_TARGETPRINTER		337	/* !separation target */
-#define	TIFFTAG_EXTRASAMPLES		338	/* !info about extra samples */
-#define	    EXTRASAMPLE_UNSPECIFIED	0	/* !unspecified data */
-#define	    EXTRASAMPLE_ASSOCALPHA	1	/* !associated alpha data */
-#define	    EXTRASAMPLE_UNASSALPHA	2	/* !unassociated alpha data */
-#define	TIFFTAG_SAMPLEFORMAT		339	/* !data sample format */
-#define	    SAMPLEFORMAT_UINT		1	/* !unsigned integer data */
-#define	    SAMPLEFORMAT_INT		2	/* !signed integer data */
-#define	    SAMPLEFORMAT_IEEEFP		3	/* !IEEE floating point data */
-#define	    SAMPLEFORMAT_VOID		4	/* !untyped data */
-#define	    SAMPLEFORMAT_COMPLEXINT	5	/* !complex signed int */
-#define	    SAMPLEFORMAT_COMPLEXIEEEFP	6	/* !complex ieee floating */
-#define	TIFFTAG_SMINSAMPLEVALUE		340	/* !variable MinSampleValue */
-#define	TIFFTAG_SMAXSAMPLEVALUE		341	/* !variable MaxSampleValue */
-#define	TIFFTAG_CLIPPATH		343	/* %ClipPath
-						   [Adobe TIFF technote 2] */
-#define	TIFFTAG_XCLIPPATHUNITS		344	/* %XClipPathUnits
-						   [Adobe TIFF technote 2] */
-#define	TIFFTAG_YCLIPPATHUNITS		345	/* %YClipPathUnits
-						   [Adobe TIFF technote 2] */
-#define	TIFFTAG_INDEXED			346	/* %Indexed
-						   [Adobe TIFF Technote 3] */
-#define	TIFFTAG_JPEGTABLES		347	/* %JPEG table stream */
-#define	TIFFTAG_OPIPROXY		351	/* %OPI Proxy [Adobe TIFF technote] */
+#define COMPRESSION_LZMA 34925             /* LZMA2 */
+#define COMPRESSION_ZSTD 50000             /* ZSTD: WARNING not registered in Adobe-maintained registry */
+#define COMPRESSION_WEBP 50001             /* WEBP: WARNING not registered in Adobe-maintained registry */
+#define COMPRESSION_JXL 50002              /* JPEGXL: WARNING not registered in Adobe-maintained registry */
+#define TIFFTAG_PHOTOMETRIC 262            /* photometric interpretation */
+#define PHOTOMETRIC_MINISWHITE 0           /* min value is white */
+#define PHOTOMETRIC_MINISBLACK 1           /* min value is black */
+#define PHOTOMETRIC_RGB 2                  /* RGB color model */
+#define PHOTOMETRIC_PALETTE 3              /* color map indexed */
+#define PHOTOMETRIC_MASK 4                 /* $holdout mask */
+#define PHOTOMETRIC_SEPARATED 5            /* !color separations */
+#define PHOTOMETRIC_YCBCR 6                /* !CCIR 601 */
+#define PHOTOMETRIC_CIELAB 8               /* !1976 CIE L*a*b* */
+#define PHOTOMETRIC_ICCLAB 9               /* ICC L*a*b* [Adobe TIFF Technote 4] */
+#define PHOTOMETRIC_ITULAB 10              /* ITU L*a*b* */
+#define PHOTOMETRIC_CFA 32803              /* color filter array */
+#define PHOTOMETRIC_LOGL 32844             /* CIE Log2(L) */
+#define PHOTOMETRIC_LOGLUV 32845           /* CIE Log2(L) (u',v') */
+#define TIFFTAG_THRESHHOLDING 263          /* +thresholding used on data */
+#define THRESHHOLD_BILEVEL 1               /* b&w art scan */
+#define THRESHHOLD_HALFTONE 2              /* or dithered scan */
+#define THRESHHOLD_ERRORDIFFUSE 3          /* usually floyd-steinberg */
+#define TIFFTAG_CELLWIDTH 264              /* +dithering matrix width */
+#define TIFFTAG_CELLLENGTH 265             /* +dithering matrix height */
+#define TIFFTAG_FILLORDER 266              /* data order within a byte */
+#define FILLORDER_MSB2LSB 1                /* most significant -> least */
+#define FILLORDER_LSB2MSB 2                /* least significant -> most */
+#define TIFFTAG_DOCUMENTNAME 269           /* name of doc. image is from */
+#define TIFFTAG_IMAGEDESCRIPTION 270       /* info about image */
+#define TIFFTAG_MAKE 271                   /* scanner manufacturer name */
+#define TIFFTAG_MODEL 272                  /* scanner model name/number */
+#define TIFFTAG_STRIPOFFSETS 273           /* offsets to data strips */
+#define TIFFTAG_ORIENTATION 274            /* +image orientation */
+#define ORIENTATION_TOPLEFT 1              /* row 0 top, col 0 lhs */
+#define ORIENTATION_TOPRIGHT 2             /* row 0 top, col 0 rhs */
+#define ORIENTATION_BOTRIGHT 3             /* row 0 bottom, col 0 rhs */
+#define ORIENTATION_BOTLEFT 4              /* row 0 bottom, col 0 lhs */
+#define ORIENTATION_LEFTTOP 5              /* row 0 lhs, col 0 top */
+#define ORIENTATION_RIGHTTOP 6             /* row 0 rhs, col 0 top */
+#define ORIENTATION_RIGHTBOT 7             /* row 0 rhs, col 0 bottom */
+#define ORIENTATION_LEFTBOT 8              /* row 0 lhs, col 0 bottom */
+#define TIFFTAG_SAMPLESPERPIXEL 277        /* samples per pixel */
+#define TIFFTAG_ROWSPERSTRIP 278           /* rows per strip of data */
+#define TIFFTAG_STRIPBYTECOUNTS 279        /* bytes counts for strips */
+#define TIFFTAG_MINSAMPLEVALUE 280         /* +minimum sample value */
+#define TIFFTAG_MAXSAMPLEVALUE 281         /* +maximum sample value */
+#define TIFFTAG_XRESOLUTION 282            /* pixels/resolution in x */
+#define TIFFTAG_YRESOLUTION 283            /* pixels/resolution in y */
+#define TIFFTAG_PLANARCONFIG 284           /* storage organization */
+#define PLANARCONFIG_CONTIG 1              /* single image plane */
+#define PLANARCONFIG_SEPARATE 2            /* separate planes of data */
+#define TIFFTAG_PAGENAME 285               /* page name image is from */
+#define TIFFTAG_XPOSITION 286              /* x page offset of image lhs */
+#define TIFFTAG_YPOSITION 287              /* y page offset of image lhs */
+#define TIFFTAG_FREEOFFSETS 288            /* +byte offset to free block */
+#define TIFFTAG_FREEBYTECOUNTS 289         /* +sizes of free blocks */
+#define TIFFTAG_GRAYRESPONSEUNIT 290       /* $gray scale curve accuracy */
+#define GRAYRESPONSEUNIT_10S 1             /* tenths of a unit */
+#define GRAYRESPONSEUNIT_100S 2            /* hundredths of a unit */
+#define GRAYRESPONSEUNIT_1000S 3           /* thousandths of a unit */
+#define GRAYRESPONSEUNIT_10000S 4          /* ten-thousandths of a unit */
+#define GRAYRESPONSEUNIT_100000S 5         /* hundred-thousandths */
+#define TIFFTAG_GRAYRESPONSECURVE 291      /* $gray scale response curve */
+#define TIFFTAG_GROUP3OPTIONS 292          /* 32 flag bits */
+#define TIFFTAG_T4OPTIONS 292              /* TIFF 6.0 proper name alias */
+#define GROUP3OPT_2DENCODING 0x1           /* 2-dimensional coding */
+#define GROUP3OPT_UNCOMPRESSED 0x2         /* data not compressed */
+#define GROUP3OPT_FILLBITS 0x4             /* fill to byte boundary */
+#define TIFFTAG_GROUP4OPTIONS 293          /* 32 flag bits */
+#define TIFFTAG_T6OPTIONS 293              /* TIFF 6.0 proper name */
+#define GROUP4OPT_UNCOMPRESSED 0x2         /* data not compressed */
+#define TIFFTAG_RESOLUTIONUNIT 296         /* units of resolutions */
+#define RESUNIT_NONE 1                     /* no meaningful units */
+#define RESUNIT_INCH 2                     /* english */
+#define RESUNIT_CENTIMETER 3               /* metric */
+#define TIFFTAG_PAGENUMBER 297             /* page numbers of multi-page */
+#define TIFFTAG_COLORRESPONSEUNIT 300      /* $color curve accuracy */
+#define COLORRESPONSEUNIT_10S 1            /* tenths of a unit */
+#define COLORRESPONSEUNIT_100S 2           /* hundredths of a unit */
+#define COLORRESPONSEUNIT_1000S 3          /* thousandths of a unit */
+#define COLORRESPONSEUNIT_10000S 4         /* ten-thousandths of a unit */
+#define COLORRESPONSEUNIT_100000S 5        /* hundred-thousandths */
+#define TIFFTAG_TRANSFERFUNCTION 301       /* !colorimetry info */
+#define TIFFTAG_SOFTWARE 305               /* name & release */
+#define TIFFTAG_DATETIME 306               /* creation date and time */
+#define TIFFTAG_ARTIST 315                 /* creator of image */
+#define TIFFTAG_HOSTCOMPUTER 316           /* machine where created */
+#define TIFFTAG_PREDICTOR 317              /* prediction scheme w/ LZW */
+#define PREDICTOR_NONE 1                   /* no prediction scheme used */
+#define PREDICTOR_HORIZONTAL 2             /* horizontal differencing */
+#define PREDICTOR_FLOATINGPOINT 3          /* floating point predictor */
+#define TIFFTAG_WHITEPOINT 318             /* image white point */
+#define TIFFTAG_PRIMARYCHROMATICITIES 319  /* !primary chromaticities */
+#define TIFFTAG_COLORMAP 320               /* RGB map for palette image */
+#define TIFFTAG_HALFTONEHINTS 321          /* !highlight+shadow info */
+#define TIFFTAG_TILEWIDTH 322              /* !tile width in pixels */
+#define TIFFTAG_TILELENGTH 323             /* !tile height in pixels */
+#define TIFFTAG_TILEOFFSETS 324            /* !offsets to data tiles */
+#define TIFFTAG_TILEBYTECOUNTS 325         /* !byte counts for tiles */
+#define TIFFTAG_BADFAXLINES 326            /* lines w/ wrong pixel count */
+#define TIFFTAG_CLEANFAXDATA 327           /* regenerated line info */
+#define CLEANFAXDATA_CLEAN 0               /* no errors detected */
+#define CLEANFAXDATA_REGENERATED 1         /* receiver regenerated lines */
+#define CLEANFAXDATA_UNCLEAN 2             /* uncorrected errors exist */
+#define TIFFTAG_CONSECUTIVEBADFAXLINES 328 /* max consecutive bad lines */
+#define TIFFTAG_SUBIFD 330                 /* subimage descriptors */
+#define TIFFTAG_INKSET 332                 /* !inks in separated image */
+#define INKSET_CMYK 1                      /* !cyan-magenta-yellow-black color */
+#define INKSET_MULTIINK 2                  /* !multi-ink or hi-fi color */
+#define TIFFTAG_INKNAMES 333               /* !ascii names of inks */
+#define TIFFTAG_NUMBEROFINKS 334           /* !number of inks */
+#define TIFFTAG_DOTRANGE 336               /* !0% and 100% dot codes */
+#define TIFFTAG_TARGETPRINTER 337          /* !separation target */
+#define TIFFTAG_EXTRASAMPLES 338           /* !info about extra samples */
+#define EXTRASAMPLE_UNSPECIFIED 0          /* !unspecified data */
+#define EXTRASAMPLE_ASSOCALPHA 1           /* !associated alpha data */
+#define EXTRASAMPLE_UNASSALPHA 2           /* !unassociated alpha data */
+#define TIFFTAG_SAMPLEFORMAT 339           /* !data sample format */
+#define SAMPLEFORMAT_UINT 1                /* !unsigned integer data */
+#define SAMPLEFORMAT_INT 2                 /* !signed integer data */
+#define SAMPLEFORMAT_IEEEFP 3              /* !IEEE floating point data */
+#define SAMPLEFORMAT_VOID 4                /* !untyped data */
+#define SAMPLEFORMAT_COMPLEXINT 5          /* !complex signed int */
+#define SAMPLEFORMAT_COMPLEXIEEEFP 6       /* !complex ieee floating */
+#define TIFFTAG_SMINSAMPLEVALUE 340        /* !variable MinSampleValue */
+#define TIFFTAG_SMAXSAMPLEVALUE 341        /* !variable MaxSampleValue */
+#define TIFFTAG_CLIPPATH 343               /* %ClipPath [Adobe TIFF technote 2] */
+#define TIFFTAG_XCLIPPATHUNITS 344         /* %XClipPathUnits [Adobe TIFF technote 2] */
+#define TIFFTAG_YCLIPPATHUNITS 345         /* %YClipPathUnits [Adobe TIFF technote 2] */
+#define TIFFTAG_INDEXED 346                /* %Indexed [Adobe TIFF Technote 3] */
+#define TIFFTAG_JPEGTABLES 347             /* %JPEG table stream */
+#define TIFFTAG_OPIPROXY 351               /* %OPI Proxy [Adobe TIFF technote] */
 /* Tags 400-435 are from the TIFF/FX spec */
-#define TIFFTAG_GLOBALPARAMETERSIFD	400	/* ! */
-#define TIFFTAG_PROFILETYPE			401	/* ! */
-#define     PROFILETYPE_UNSPECIFIED	0	/* ! */
-#define     PROFILETYPE_G3_FAX		1	/* ! */
-#define TIFFTAG_FAXPROFILE			402	/* ! */
-#define     FAXPROFILE_S			1	/* !TIFF/FX FAX profile S */
-#define     FAXPROFILE_F			2	/* !TIFF/FX FAX profile F */
-#define     FAXPROFILE_J			3	/* !TIFF/FX FAX profile J */
-#define     FAXPROFILE_C			4	/* !TIFF/FX FAX profile C */
-#define     FAXPROFILE_L			5	/* !TIFF/FX FAX profile L */
-#define     FAXPROFILE_M			6	/* !TIFF/FX FAX profile LM */
-#define TIFFTAG_CODINGMETHODS		403	/* !TIFF/FX coding methods */
-#define     CODINGMETHODS_T4_1D		(1 << 1)	/* !T.4 1D */
-#define     CODINGMETHODS_T4_2D		(1 << 2)	/* !T.4 2D */
-#define     CODINGMETHODS_T6		(1 << 3)	/* !T.6 */
-#define     CODINGMETHODS_T85 		(1 << 4)	/* !T.85 JBIG */
-#define     CODINGMETHODS_T42 		(1 << 5)	/* !T.42 JPEG */
-#define     CODINGMETHODS_T43		(1 << 6)	/* !T.43 colour by layered JBIG */
-#define TIFFTAG_VERSIONYEAR			404	/* !TIFF/FX version year */
-#define TIFFTAG_MODENUMBER			405	/* !TIFF/FX mode number */
-#define TIFFTAG_DECODE				433	/* !TIFF/FX decode */
-#define TIFFTAG_IMAGEBASECOLOR		434	/* !TIFF/FX image base colour */
-#define TIFFTAG_T82OPTIONS			435	/* !TIFF/FX T.82 options */
+#define TIFFTAG_GLOBALPARAMETERSIFD 400 /* ! */
+#define TIFFTAG_PROFILETYPE 401         /* ! */
+#define PROFILETYPE_UNSPECIFIED 0       /* ! */
+#define PROFILETYPE_G3_FAX 1            /* ! */
+#define TIFFTAG_FAXPROFILE 402          /* ! */
+#define FAXPROFILE_S 1                  /* !TIFF/FX FAX profile S */
+#define FAXPROFILE_F 2                  /* !TIFF/FX FAX profile F */
+#define FAXPROFILE_J 3                  /* !TIFF/FX FAX profile J */
+#define FAXPROFILE_C 4                  /* !TIFF/FX FAX profile C */
+#define FAXPROFILE_L 5                  /* !TIFF/FX FAX profile L */
+#define FAXPROFILE_M 6                  /* !TIFF/FX FAX profile LM */
+#define TIFFTAG_CODINGMETHODS 403       /* !TIFF/FX coding methods */
+#define CODINGMETHODS_T4_1D (1 << 1)    /* !T.4 1D */
+#define CODINGMETHODS_T4_2D (1 << 2)    /* !T.4 2D */
+#define CODINGMETHODS_T6 (1 << 3)       /* !T.6 */
+#define CODINGMETHODS_T85 (1 << 4)      /* !T.85 JBIG */
+#define CODINGMETHODS_T42 (1 << 5)      /* !T.42 JPEG */
+#define CODINGMETHODS_T43 (1 << 6)      /* !T.43 colour by layered JBIG */
+#define TIFFTAG_VERSIONYEAR 404         /* !TIFF/FX version year */
+#define TIFFTAG_MODENUMBER 405          /* !TIFF/FX mode number */
+#define TIFFTAG_DECODE 433              /* !TIFF/FX decode */
+#define TIFFTAG_IMAGEBASECOLOR 434      /* !TIFF/FX image base colour */
+#define TIFFTAG_T82OPTIONS 435          /* !TIFF/FX T.82 options */
 /*
  * Tags 512-521 are obsoleted by Technical Note #2 which specifies a
  * revised JPEG-in-TIFF scheme.
  */
-#define	TIFFTAG_JPEGPROC		512	/* !JPEG processing algorithm */
-#define	    JPEGPROC_BASELINE		1	/* !baseline sequential */
-#define	    JPEGPROC_LOSSLESS		14	/* !Huffman coded lossless */
-#define	TIFFTAG_JPEGIFOFFSET		513	/* !pointer to SOI marker */
-#define	TIFFTAG_JPEGIFBYTECOUNT		514	/* !JFIF stream length */
-#define	TIFFTAG_JPEGRESTARTINTERVAL	515	/* !restart interval length */
-#define	TIFFTAG_JPEGLOSSLESSPREDICTORS	517	/* !lossless proc predictor */
-#define	TIFFTAG_JPEGPOINTTRANSFORM	518	/* !lossless point transform */
-#define	TIFFTAG_JPEGQTABLES		519	/* !Q matrix offsets */
-#define	TIFFTAG_JPEGDCTABLES		520	/* !DCT table offsets */
-#define	TIFFTAG_JPEGACTABLES		521	/* !AC coefficient offsets */
-#define	TIFFTAG_YCBCRCOEFFICIENTS	529	/* !RGB -> YCbCr transform */
-#define	TIFFTAG_YCBCRSUBSAMPLING	530	/* !YCbCr subsampling factors */
-#define	TIFFTAG_YCBCRPOSITIONING	531	/* !subsample positioning */
-#define	    YCBCRPOSITION_CENTERED	1	/* !as in PostScript Level 2 */
-#define	    YCBCRPOSITION_COSITED	2	/* !as in CCIR 601-1 */
-#define	TIFFTAG_REFERENCEBLACKWHITE	532	/* !colorimetry info */
-#define TIFFTAG_STRIPROWCOUNTS		559 /* !TIFF/FX strip row counts */
-#define	TIFFTAG_XMLPACKET		700	/* %XML packet
-						   [Adobe XMP Specification,
-						   January 2004 */
-#define TIFFTAG_OPIIMAGEID		32781	/* %OPI ImageID
-						   [Adobe TIFF technote] */
-#define	TIFFTAG_TIFFANNOTATIONDATA	32932	/* http://web.archive.org/web/20050309141348/http://www.kofile.com/support%20pro/faqs/annospec.htm */
+#define TIFFTAG_JPEGPROC 512               /* !JPEG processing algorithm */
+#define JPEGPROC_BASELINE 1                /* !baseline sequential */
+#define JPEGPROC_LOSSLESS 14               /* !Huffman coded lossless */
+#define TIFFTAG_JPEGIFOFFSET 513           /* !pointer to SOI marker */
+#define TIFFTAG_JPEGIFBYTECOUNT 514        /* !JFIF stream length */
+#define TIFFTAG_JPEGRESTARTINTERVAL 515    /* !restart interval length */
+#define TIFFTAG_JPEGLOSSLESSPREDICTORS 517 /* !lossless proc predictor */
+#define TIFFTAG_JPEGPOINTTRANSFORM 518     /* !lossless point transform */
+#define TIFFTAG_JPEGQTABLES 519            /* !Q matrix offsets */
+#define TIFFTAG_JPEGDCTABLES 520           /* !DCT table offsets */
+#define TIFFTAG_JPEGACTABLES 521           /* !AC coefficient offsets */
+#define TIFFTAG_YCBCRCOEFFICIENTS 529      /* !RGB -> YCbCr transform */
+#define TIFFTAG_YCBCRSUBSAMPLING 530       /* !YCbCr subsampling factors */
+#define TIFFTAG_YCBCRPOSITIONING 531       /* !subsample positioning */
+#define YCBCRPOSITION_CENTERED 1           /* !as in PostScript Level 2 */
+#define YCBCRPOSITION_COSITED 2            /* !as in CCIR 601-1 */
+#define TIFFTAG_REFERENCEBLACKWHITE 532    /* !colorimetry info */
+#define TIFFTAG_STRIPROWCOUNTS 559         /* !TIFF/FX strip row counts */
+#define TIFFTAG_XMLPACKET 700              /* %XML packet [Adobe XMP Specification, January 2004 */
+#define TIFFTAG_OPIIMAGEID 32781           /* %OPI ImageID [Adobe TIFF technote] */
+/* For eiStream Annotation Specification, Version 1.00.06 see
+ * http://web.archive.org/web/20050309141348/http://www.kofile.com/support%20pro/faqs/annospec.htm */
+#define TIFFTAG_TIFFANNOTATIONDATA 32932
 /* tags 32952-32956 are private tags registered to Island Graphics */
-#define TIFFTAG_REFPTS			32953	/* image reference points */
-#define TIFFTAG_REGIONTACKPOINT		32954	/* region-xform tack point */
-#define TIFFTAG_REGIONWARPCORNERS	32955	/* warp quadrilateral */
-#define TIFFTAG_REGIONAFFINE		32956	/* affine transformation mat */
+#define TIFFTAG_REFPTS 32953            /* image reference points */
+#define TIFFTAG_REGIONTACKPOINT 32954   /* region-xform tack point */
+#define TIFFTAG_REGIONWARPCORNERS 32955 /* warp quadrilateral */
+#define TIFFTAG_REGIONAFFINE 32956      /* affine transformation mat */
 /* tags 32995-32999 are private tags registered to SGI */
-#define	TIFFTAG_MATTEING		32995	/* $use ExtraSamples */
-#define	TIFFTAG_DATATYPE		32996	/* $use SampleFormat */
-#define	TIFFTAG_IMAGEDEPTH		32997	/* z depth of image */
-#define	TIFFTAG_TILEDEPTH		32998	/* z depth/data tile */
+#define TIFFTAG_MATTEING 32995   /* $use ExtraSamples */
+#define TIFFTAG_DATATYPE 32996   /* $use SampleFormat */
+#define TIFFTAG_IMAGEDEPTH 32997 /* z depth of image */
+#define TIFFTAG_TILEDEPTH 32998  /* z depth/data tile */
 /* tags 33300-33309 are private tags registered to Pixar */
 /*
  * TIFFTAG_PIXAR_IMAGEFULLWIDTH and TIFFTAG_PIXAR_IMAGEFULLLENGTH
- * are set when an image has been cropped out of a larger image.  
+ * are set when an image has been cropped out of a larger image.
  * They reflect the size of the original uncropped image.
  * The TIFFTAG_XPOSITION and TIFFTAG_YPOSITION can be used
  * to determine the position of the smaller image in the larger one.
  */
-#define TIFFTAG_PIXAR_IMAGEFULLWIDTH    33300   /* full image size in x */
-#define TIFFTAG_PIXAR_IMAGEFULLLENGTH   33301   /* full image size in y */
- /* Tags 33302-33306 are used to identify special image modes and data
-  * used by Pixar's texture formats.
-  */
-#define TIFFTAG_PIXAR_TEXTUREFORMAT	33302	/* texture map format */
-#define TIFFTAG_PIXAR_WRAPMODES		33303	/* s & t wrap modes */
-#define TIFFTAG_PIXAR_FOVCOT		33304	/* cotan(fov) for env. maps */
+#define TIFFTAG_PIXAR_IMAGEFULLWIDTH 33300  /* full image size in x */
+#define TIFFTAG_PIXAR_IMAGEFULLLENGTH 33301 /* full image size in y */
+/* Tags 33302-33306 are used to identify special image modes and data
+ * used by Pixar's texture formats.
+ */
+#define TIFFTAG_PIXAR_TEXTUREFORMAT 33302 /* texture map format */
+#define TIFFTAG_PIXAR_WRAPMODES 33303     /* s & t wrap modes */
+#define TIFFTAG_PIXAR_FOVCOT 33304        /* cotan(fov) for env. maps */
 #define TIFFTAG_PIXAR_MATRIX_WORLDTOSCREEN 33305
 #define TIFFTAG_PIXAR_MATRIX_WORLDTOCAMERA 33306
 /* tag 33405 is a private tag registered to Eastman Kodak */
-#define TIFFTAG_WRITERSERIALNUMBER      33405   /* device serial number */
-#define TIFFTAG_CFAREPEATPATTERNDIM	33421	/* dimensions of CFA pattern */
-#define TIFFTAG_CFAPATTERN		33422	/* color filter array pattern */
+#define TIFFTAG_WRITERSERIALNUMBER 33405  /* device serial number */
+#define TIFFTAG_CFAREPEATPATTERNDIM 33421 /* (alias for TIFFTAG_EP_CFAREPEATPATTERNDIM)*/
+#define TIFFTAG_CFAPATTERN 33422          /* (alias for TIFFTAG_EP_CFAPATTERN) */
+#define TIFFTAG_BATTERYLEVEL 33423        /* (alias for TIFFTAG_EP_BATTERYLEVEL) */
 /* tag 33432 is listed in the 6.0 spec w/ unknown ownership */
-#define	TIFFTAG_COPYRIGHT		33432	/* copyright string */
-/* Tags 33445-33452 are used for GEL fileformat, see
- * http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf
+#define TIFFTAG_COPYRIGHT 33432 /* copyright string */
+/* Tags 33445-33452 are used for Molecular Dynamics GEL fileformat,
+ * see http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf
+ * (2023: the above web site is unavailable but tags are explained briefly at
+ * https://www.awaresystems.be/imaging/tiff/tifftags/docs/gel.html
  */
-#define	TIFFTAG_MD_FILETAG		33445	/* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */
-#define	TIFFTAG_MD_SCALEPIXEL	33446	/* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */
-#define	TIFFTAG_MD_COLORTABLE	33447	/* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */
-#define	TIFFTAG_MD_LABNAME	33448	/* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */
-#define	TIFFTAG_MD_SAMPLEINFO	33449	/* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */
-#define	TIFFTAG_MD_PREPDATE	33450	/* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */
-#define	TIFFTAG_MD_PREPTIME	33451	/* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */
-#define	TIFFTAG_MD_FILEUNITS	33452	/* http://research.stowers-institute.org/mcm/efg/ScientificSoftware/Utility/TiffTags/GEL-FileFormat.pdf */
+#define TIFFTAG_MD_FILETAG 33445    /* Specifies the pixel data format encoding in the GEL file format. */
+#define TIFFTAG_MD_SCALEPIXEL 33446 /* scale factor */
+#define TIFFTAG_MD_COLORTABLE 33447 /* conversion from 16bit to 8bit */
+#define TIFFTAG_MD_LABNAME 33448    /* name of the lab that scanned this file. */
+#define TIFFTAG_MD_SAMPLEINFO 33449 /* information about the scanned GEL sample */
+#define TIFFTAG_MD_PREPDATE 33450   /* information about the date the sample was prepared YY/MM/DD */
+#define TIFFTAG_MD_PREPTIME 33451   /* information about the time the sample was prepared HH:MM*/
+#define TIFFTAG_MD_FILEUNITS 33452  /* Units for data in this file, as used in the GEL file format. */
 /* IPTC TAG from RichTIFF specifications */
-#define TIFFTAG_RICHTIFFIPTC		33723
-#define	TIFFTAG_INGR_PACKET_DATA_TAG	33918	/* Intergraph Application specific storage. */
-#define	TIFFTAG_INGR_FLAG_REGISTERS	33919	/* Intergraph Application specific flags. */
-#define	TIFFTAG_IRASB_TRANSORMATION_MATRIX	33920	/* Originally part of Intergraph's GeoTIFF tags, but likely understood by IrasB only. */
-#define	TIFFTAG_MODELTIEPOINTTAG	33922	/* GeoTIFF */
+#define TIFFTAG_RICHTIFFIPTC 33723
+#define TIFFTAG_INGR_PACKET_DATA_TAG 33918       /* Intergraph Application specific storage. */
+#define TIFFTAG_INGR_FLAG_REGISTERS 33919        /* Intergraph Application specific flags. */
+#define TIFFTAG_IRASB_TRANSORMATION_MATRIX 33920 /* Originally part of Intergraph's GeoTIFF tags, but likely understood by IrasB only. */
+#define TIFFTAG_MODELTIEPOINTTAG 33922           /* GeoTIFF */
 /* 34016-34029 are reserved for ANSI IT8 TIFF/IT <dkelly@apago.com) */
-#define TIFFTAG_IT8SITE			34016	/* site name */
-#define TIFFTAG_IT8COLORSEQUENCE	34017	/* color seq. [RGB,CMYK,etc] */
-#define TIFFTAG_IT8HEADER		34018	/* DDES Header */
-#define TIFFTAG_IT8RASTERPADDING	34019	/* raster scanline padding */
-#define TIFFTAG_IT8BITSPERRUNLENGTH	34020	/* # of bits in short run */
-#define TIFFTAG_IT8BITSPEREXTENDEDRUNLENGTH 34021/* # of bits in long run */
-#define TIFFTAG_IT8COLORTABLE		34022	/* LW colortable */
-#define TIFFTAG_IT8IMAGECOLORINDICATOR	34023	/* BP/BL image color switch */
-#define TIFFTAG_IT8BKGCOLORINDICATOR	34024	/* BP/BL bg color switch */
-#define TIFFTAG_IT8IMAGECOLORVALUE	34025	/* BP/BL image color value */
-#define TIFFTAG_IT8BKGCOLORVALUE	34026	/* BP/BL bg color value */
-#define TIFFTAG_IT8PIXELINTENSITYRANGE	34027	/* MP pixel intensity value */
-#define TIFFTAG_IT8TRANSPARENCYINDICATOR 34028	/* HC transparency switch */
-#define TIFFTAG_IT8COLORCHARACTERIZATION 34029	/* color character. table */
-#define TIFFTAG_IT8HCUSAGE		34030	/* HC usage indicator */
-#define TIFFTAG_IT8TRAPINDICATOR	34031	/* Trapping indicator
-						   (untrapped=0, trapped=1) */
-#define TIFFTAG_IT8CMYKEQUIVALENT	34032	/* CMYK color equivalents */
+#define TIFFTAG_IT8SITE 34016                     /* site name */
+#define TIFFTAG_IT8COLORSEQUENCE 34017            /* color seq. [RGB,CMYK,etc] */
+#define TIFFTAG_IT8HEADER 34018                   /* DDES Header */
+#define TIFFTAG_IT8RASTERPADDING 34019            /* raster scanline padding */
+#define TIFFTAG_IT8BITSPERRUNLENGTH 34020         /* # of bits in short run */
+#define TIFFTAG_IT8BITSPEREXTENDEDRUNLENGTH 34021 /* # of bits in long run */
+#define TIFFTAG_IT8COLORTABLE 34022               /* LW colortable */
+#define TIFFTAG_IT8IMAGECOLORINDICATOR 34023      /* BP/BL image color switch */
+#define TIFFTAG_IT8BKGCOLORINDICATOR 34024        /* BP/BL bg color switch */
+#define TIFFTAG_IT8IMAGECOLORVALUE 34025          /* BP/BL image color value */
+#define TIFFTAG_IT8BKGCOLORVALUE 34026            /* BP/BL bg color value */
+#define TIFFTAG_IT8PIXELINTENSITYRANGE 34027      /* MP pixel intensity value */
+#define TIFFTAG_IT8TRANSPARENCYINDICATOR 34028    /* HC transparency switch */
+#define TIFFTAG_IT8COLORCHARACTERIZATION 34029    /* color character. table */
+#define TIFFTAG_IT8HCUSAGE 34030                  /* HC usage indicator */
+#define TIFFTAG_IT8TRAPINDICATOR 34031            /* Trapping indicator (untrapped=0, trapped=1) */
+#define TIFFTAG_IT8CMYKEQUIVALENT 34032           /* CMYK color equivalents */
 /* tags 34232-34236 are private tags registered to Texas Instruments */
-#define TIFFTAG_FRAMECOUNT              34232   /* Sequence Frame Count */
-#define TIFFTAG_MODELTRANSFORMATIONTAG	34264	/* Used in interchangeable GeoTIFF files */
+#define TIFFTAG_FRAMECOUNT 34232             /* Sequence Frame Count */
+#define TIFFTAG_MODELTRANSFORMATIONTAG 34264 /* Used in interchangeable GeoTIFF files */
 /* tag 34377 is private tag registered to Adobe for PhotoShop */
-#define TIFFTAG_PHOTOSHOP		34377 
+#define TIFFTAG_PHOTOSHOP 34377
 /* tags 34665, 34853 and 40965 are documented in EXIF specification */
-#define TIFFTAG_EXIFIFD			34665	/* Pointer to EXIF private directory */
+#define TIFFTAG_EXIFIFD 34665 /* Pointer to EXIF private directory */
 /* tag 34750 is a private tag registered to Adobe? */
-#define TIFFTAG_ICCPROFILE		34675	/* ICC profile data */
-#define TIFFTAG_IMAGELAYER		34732	/* !TIFF/FX image layer information */
+#define TIFFTAG_ICCPROFILE 34675 /* ICC profile data */
+#define TIFFTAG_IMAGELAYER 34732 /* !TIFF/FX image layer information */
 /* tag 34750 is a private tag registered to Pixel Magic */
-#define	TIFFTAG_JBIGOPTIONS		34750	/* JBIG options */
-#define TIFFTAG_GPSIFD			34853	/* Pointer to GPS private directory */
+#define TIFFTAG_JBIGOPTIONS 34750 /* JBIG options */
+#define TIFFTAG_GPSIFD 34853      /* Pointer to EXIF GPS private directory */
 /* tags 34908-34914 are private tags registered to SGI */
-#define	TIFFTAG_FAXRECVPARAMS		34908	/* encoded Class 2 ses. parms */
-#define	TIFFTAG_FAXSUBADDRESS		34909	/* received SubAddr string */
-#define	TIFFTAG_FAXRECVTIME		34910	/* receive time (secs) */
-#define	TIFFTAG_FAXDCS			34911	/* encoded fax ses. params, Table 2/T.30 */
+#define TIFFTAG_FAXRECVPARAMS 34908 /* encoded Class 2 ses. params */
+#define TIFFTAG_FAXSUBADDRESS 34909 /* received SubAddr string */
+#define TIFFTAG_FAXRECVTIME 34910   /* receive time (secs) */
+#define TIFFTAG_FAXDCS 34911        /* encoded fax ses. params, Table 2/T.30 */
 /* tags 37439-37443 are registered to SGI <gregl@sgi.com> */
-#define TIFFTAG_STONITS			37439	/* Sample value to Nits */
+#define TIFFTAG_STONITS 37439 /* Sample value to Nits */
 /* tag 34929 is a private tag registered to FedEx */
-#define	TIFFTAG_FEDEX_EDR		34929	/* unknown use */
-#define	TIFFTAG_IMAGESOURCEDATA		37724	/* http://justsolve.archiveteam.org/wiki/PSD, http://www.adobe.com/devnet-apps/photoshop/fileformatashtml/ */
-#define TIFFTAG_INTEROPERABILITYIFD	40965	/* Pointer to Interoperability private directory */
-#define	TIFFTAG_GDAL_METADATA		42112	/* Used by the GDAL library */
-#define	TIFFTAG_GDAL_NODATA		42113	/* Used by the GDAL library */
-#define	TIFFTAG_OCE_SCANJOB_DESCRIPTION	50215	/* Used in the Oce scanning process */
-#define	TIFFTAG_OCE_APPLICATION_SELECTOR	50216	/* Used in the Oce scanning process. */
-#define	TIFFTAG_OCE_IDENTIFICATION_NUMBER	50217
-#define	TIFFTAG_OCE_IMAGELOGIC_CHARACTERISTICS	50218
-
+#define TIFFTAG_FEDEX_EDR 34929                /* unknown use */
+#define TIFFTAG_IMAGESOURCEDATA 37724          /* http://justsolve.archiveteam.org/wiki/PSD, http://www.adobe.com/devnet-apps/photoshop/fileformatashtml/ */
+#define TIFFTAG_INTEROPERABILITYIFD 40965      /* Pointer to EXIF Interoperability private directory */
+#define TIFFTAG_GDAL_METADATA 42112            /* Used by the GDAL library */
+#define TIFFTAG_GDAL_NODATA 42113              /* Used by the GDAL library */
+#define TIFFTAG_OCE_SCANJOB_DESCRIPTION 50215  /* Used in the Oce scanning process */
+#define TIFFTAG_OCE_APPLICATION_SELECTOR 50216 /* Used in the Oce scanning process. */
+#define TIFFTAG_OCE_IDENTIFICATION_NUMBER 50217
+#define TIFFTAG_OCE_IMAGELOGIC_CHARACTERISTICS 50218
 /* tags 50674 to 50677 are reserved for ESRI */
-#define TIFFTAG_LERC_PARAMETERS         50674   /* Stores LERC version and additional compression method */
+#define TIFFTAG_LERC_PARAMETERS 50674 /* Stores LERC version and additional compression method */
+
 /* Adobe Digital Negative (DNG) format tags */
-#define TIFFTAG_DNGVERSION		50706	/* &DNG version number */
-#define TIFFTAG_DNGBACKWARDVERSION	50707	/* &DNG compatibility version */
-#define TIFFTAG_UNIQUECAMERAMODEL	50708	/* &name for the camera model */
-#define TIFFTAG_LOCALIZEDCAMERAMODEL	50709	/* &localized camera model
-						   name */
-#define TIFFTAG_CFAPLANECOLOR		50710	/* &CFAPattern->LinearRaw space
-						   mapping */
-#define TIFFTAG_CFALAYOUT		50711	/* &spatial layout of the CFA */
-#define TIFFTAG_LINEARIZATIONTABLE	50712	/* &lookup table description */
-#define TIFFTAG_BLACKLEVELREPEATDIM	50713	/* &repeat pattern size for
-						   the BlackLevel tag */
-#define TIFFTAG_BLACKLEVEL		50714	/* &zero light encoding level */
-#define TIFFTAG_BLACKLEVELDELTAH	50715	/* &zero light encoding level
-						   differences (columns) */
-#define TIFFTAG_BLACKLEVELDELTAV	50716	/* &zero light encoding level
-						   differences (rows) */
-#define TIFFTAG_WHITELEVEL		50717	/* &fully saturated encoding
-						   level */
-#define TIFFTAG_DEFAULTSCALE		50718	/* &default scale factors */
-#define TIFFTAG_DEFAULTCROPORIGIN	50719	/* &origin of the final image
-						   area */
-#define TIFFTAG_DEFAULTCROPSIZE		50720	/* &size of the final image 
-						   area */
-#define TIFFTAG_COLORMATRIX1		50721	/* &XYZ->reference color space
-						   transformation matrix 1 */
-#define TIFFTAG_COLORMATRIX2		50722	/* &XYZ->reference color space
-						   transformation matrix 2 */
-#define TIFFTAG_CAMERACALIBRATION1	50723	/* &calibration matrix 1 */
-#define TIFFTAG_CAMERACALIBRATION2	50724	/* &calibration matrix 2 */
-#define TIFFTAG_REDUCTIONMATRIX1	50725	/* &dimensionality reduction
-						   matrix 1 */
-#define TIFFTAG_REDUCTIONMATRIX2	50726	/* &dimensionality reduction
-						   matrix 2 */
-#define TIFFTAG_ANALOGBALANCE		50727	/* &gain applied the stored raw
-						   values*/
-#define TIFFTAG_ASSHOTNEUTRAL		50728	/* &selected white balance in
-						   linear reference space */
-#define TIFFTAG_ASSHOTWHITEXY		50729	/* &selected white balance in
-						   x-y chromaticity
-						   coordinates */
-#define TIFFTAG_BASELINEEXPOSURE	50730	/* &how much to move the zero
-						   point */
-#define TIFFTAG_BASELINENOISE		50731	/* &relative noise level */
-#define TIFFTAG_BASELINESHARPNESS	50732	/* &relative amount of
-						   sharpening */
-#define TIFFTAG_BAYERGREENSPLIT		50733	/* &how closely the values of
-						   the green pixels in the
-						   blue/green rows track the
-						   values of the green pixels
-						   in the red/green rows */
-#define TIFFTAG_LINEARRESPONSELIMIT	50734	/* &non-linear encoding range */
-#define TIFFTAG_CAMERASERIALNUMBER	50735	/* &camera's serial number */
-#define TIFFTAG_LENSINFO		50736	/* info about the lens */
-#define TIFFTAG_CHROMABLURRADIUS	50737	/* &chroma blur radius */
-#define TIFFTAG_ANTIALIASSTRENGTH	50738	/* &relative strength of the
-						   camera's anti-alias filter */
-#define TIFFTAG_SHADOWSCALE		50739	/* &used by Adobe Camera Raw */
-#define TIFFTAG_DNGPRIVATEDATA		50740	/* &manufacturer's private data */
-#define TIFFTAG_MAKERNOTESAFETY		50741	/* &whether the EXIF MakerNote
-						   tag is safe to preserve
-						   along with the rest of the
-						   EXIF data */
-#define	TIFFTAG_CALIBRATIONILLUMINANT1	50778	/* &illuminant 1 */
-#define TIFFTAG_CALIBRATIONILLUMINANT2	50779	/* &illuminant 2 */
-#define TIFFTAG_BESTQUALITYSCALE	50780	/* &best quality multiplier */
-#define TIFFTAG_RAWDATAUNIQUEID		50781	/* &unique identifier for
-						   the raw image data */
-#define TIFFTAG_ORIGINALRAWFILENAME	50827	/* &file name of the original
-						   raw file */
-#define TIFFTAG_ORIGINALRAWFILEDATA	50828	/* &contents of the original
-						   raw file */
-#define TIFFTAG_ACTIVEAREA		50829	/* &active (non-masked) pixels
-						   of the sensor */
-#define TIFFTAG_MASKEDAREAS		50830	/* &list of coordinates
-						   of fully masked pixels */
-#define TIFFTAG_ASSHOTICCPROFILE	50831	/* &these two tags used to */
-#define TIFFTAG_ASSHOTPREPROFILEMATRIX	50832	/* map cameras's color space
-						   into ICC profile space */
-#define TIFFTAG_CURRENTICCPROFILE	50833	/* & */
-#define TIFFTAG_CURRENTPREPROFILEMATRIX	50834	/* & */
+#define TIFFTAG_DNGVERSION 50706           /* &DNG version number */
+#define TIFFTAG_DNGBACKWARDVERSION 50707   /* &DNG compatibility version */
+#define TIFFTAG_UNIQUECAMERAMODEL 50708    /* &name for the camera model */
+#define TIFFTAG_LOCALIZEDCAMERAMODEL 50709 /* &localized camera model name (UTF-8) */
+#define TIFFTAG_CFAPLANECOLOR 50710        /* &CFAPattern->LinearRaw space mapping */
+#define TIFFTAG_CFALAYOUT 50711            /* &spatial layout of the CFA */
+#define TIFFTAG_LINEARIZATIONTABLE 50712   /* &lookup table description */
+#define TIFFTAG_BLACKLEVELREPEATDIM 50713  /* &repeat pattern size for the BlackLevel tag */
+#define TIFFTAG_BLACKLEVEL 50714           /* &zero light encoding level */
+#define TIFFTAG_BLACKLEVELDELTAH 50715     /* &zero light encoding level differences (columns) */
+#define TIFFTAG_BLACKLEVELDELTAV 50716     /* &zero light encoding level differences (rows) */
+#define TIFFTAG_WHITELEVEL 50717           /* &fully saturated encoding level */
+#define TIFFTAG_DEFAULTSCALE 50718         /* &default scale factors */
+#define TIFFTAG_DEFAULTCROPORIGIN 50719    /* &origin of the final image area */
+#define TIFFTAG_DEFAULTCROPSIZE 50720      /* &size of the final image area */
+#define TIFFTAG_COLORMATRIX1 50721         /* &XYZ->reference color space transformation matrix 1 */
+#define TIFFTAG_COLORMATRIX2 50722         /* &XYZ->reference color space transformation matrix 2 */
+#define TIFFTAG_CAMERACALIBRATION1 50723   /* &calibration matrix 1 */
+#define TIFFTAG_CAMERACALIBRATION2 50724   /* &calibration matrix 2 */
+#define TIFFTAG_REDUCTIONMATRIX1 50725     /* &dimensionality reduction matrix 1 */
+#define TIFFTAG_REDUCTIONMATRIX2 50726     /* &dimensionality reduction matrix 2 */
+#define TIFFTAG_ANALOGBALANCE 50727        /* &gain applied the stored raw values*/
+#define TIFFTAG_ASSHOTNEUTRAL 50728        /* &selected white balance in linear reference space */
+#define TIFFTAG_ASSHOTWHITEXY 50729        /* &selected white balance in x-y chromaticity coordinates */
+#define TIFFTAG_BASELINEEXPOSURE 50730     /* &how much to move the zero point */
+#define TIFFTAG_BASELINENOISE 50731        /* &relative noise level */
+#define TIFFTAG_BASELINESHARPNESS 50732    /* &relative amount of sharpening */
+/* TIFFTAG_BAYERGREENSPLIT: &how closely the values of the green pixels in the blue/green rows
+ * track the values of the green pixels in the red/green rows */
+#define TIFFTAG_BAYERGREENSPLIT 50733
+#define TIFFTAG_LINEARRESPONSELIMIT 50734     /* &non-linear encoding range */
+#define TIFFTAG_CAMERASERIALNUMBER 50735      /* &camera's serial number */
+#define TIFFTAG_LENSINFO 50736                /* info about the lens */
+#define TIFFTAG_CHROMABLURRADIUS 50737        /* &chroma blur radius */
+#define TIFFTAG_ANTIALIASSTRENGTH 50738       /* &relative strength of the camera's anti-alias filter */
+#define TIFFTAG_SHADOWSCALE 50739             /* &used by Adobe Camera Raw */
+#define TIFFTAG_DNGPRIVATEDATA 50740          /* &manufacturer's private data */
+#define TIFFTAG_MAKERNOTESAFETY 50741         /* &whether the EXIF MakerNote tag is safe to preserve along with the rest of the EXIF data */
+#define TIFFTAG_CALIBRATIONILLUMINANT1 50778  /* &illuminant 1 */
+#define TIFFTAG_CALIBRATIONILLUMINANT2 50779  /* &illuminant 2 */
+#define TIFFTAG_BESTQUALITYSCALE 50780        /* &best quality multiplier */
+#define TIFFTAG_RAWDATAUNIQUEID 50781         /* &unique identifier for the raw image data */
+#define TIFFTAG_ORIGINALRAWFILENAME 50827     /* &file name of the original raw file (UTF-8) */
+#define TIFFTAG_ORIGINALRAWFILEDATA 50828     /* &contents of the original raw file */
+#define TIFFTAG_ACTIVEAREA 50829              /* &active (non-masked) pixels of the sensor */
+#define TIFFTAG_MASKEDAREAS 50830             /* &list of coordinates of fully masked pixels */
+#define TIFFTAG_ASSHOTICCPROFILE 50831        /* &these two tags used to */
+#define TIFFTAG_ASSHOTPREPROFILEMATRIX 50832  /* map cameras's color space  into ICC profile space */
+#define TIFFTAG_CURRENTICCPROFILE 50833       /* & */
+#define TIFFTAG_CURRENTPREPROFILEMATRIX 50834 /* & */
 
-#define TIFFTAG_RPCCOEFFICIENT          50844   /* Define by GDAL for geospatial georeferencing through RPC: http://geotiff.maptools.org/rpc_prop.html */
+/* DNG 1.2.0.0 */
+#define TIFFTAG_COLORIMETRICREFERENCE 50879       /* &colorimetric reference */
+#define TIFFTAG_CAMERACALIBRATIONSIGNATURE 50931  /* &camera calibration signature (UTF-8) */
+#define TIFFTAG_PROFILECALIBRATIONSIGNATURE 50932 /* &profile calibration signature (UTF-8) */
+/* TIFFTAG_EXTRACAMERAPROFILES 50933 &extra camera profiles : is already defined for GeoTIFF DGIWG */
+#define TIFFTAG_ASSHOTPROFILENAME 50934           /* &as shot profile name (UTF-8) */
+#define TIFFTAG_NOISEREDUCTIONAPPLIED 50935       /* &amount of applied noise reduction */
+#define TIFFTAG_PROFILENAME 50936                 /* &camera profile name (UTF-8) */
+#define TIFFTAG_PROFILEHUESATMAPDIMS 50937        /* &dimensions of HSV mapping */
+#define TIFFTAG_PROFILEHUESATMAPDATA1 50938       /* &first HSV mapping table */
+#define TIFFTAG_PROFILEHUESATMAPDATA2 50939       /* &second HSV mapping table */
+#define TIFFTAG_PROFILETONECURVE 50940            /* &default tone curve */
+#define TIFFTAG_PROFILEEMBEDPOLICY 50941          /* &profile embedding policy */
+#define TIFFTAG_PROFILECOPYRIGHT 50942            /* &profile copyright information (UTF-8) */
+#define TIFFTAG_FORWARDMATRIX1 50964              /* &matrix for mapping white balanced camera colors to XYZ D50 */
+#define TIFFTAG_FORWARDMATRIX2 50965              /* &matrix for mapping white balanced camera colors to XYZ D50 */
+#define TIFFTAG_PREVIEWAPPLICATIONNAME 50966      /* &name of application that created preview (UTF-8) */
+#define TIFFTAG_PREVIEWAPPLICATIONVERSION 50967   /* &version of application that created preview (UTF-8) */
+#define TIFFTAG_PREVIEWSETTINGSNAME 50968         /* &name of conversion settings (UTF-8) */
+#define TIFFTAG_PREVIEWSETTINGSDIGEST 50969       /* &unique id of conversion settings */
+#define TIFFTAG_PREVIEWCOLORSPACE 50970           /* &preview color space */
+#define TIFFTAG_PREVIEWDATETIME 50971             /* &date/time preview was rendered */
+#define TIFFTAG_RAWIMAGEDIGEST 50972              /* &md5 of raw image data */
+#define TIFFTAG_ORIGINALRAWFILEDIGEST 50973       /* &md5 of the data stored in the OriginalRawFileData tag */
+#define TIFFTAG_SUBTILEBLOCKSIZE 50974            /* &subtile block size */
+#define TIFFTAG_ROWINTERLEAVEFACTOR 50975         /* &number of interleaved fields */
+#define TIFFTAG_PROFILELOOKTABLEDIMS 50981        /* &num of input samples in each dim of default "look" table */
+#define TIFFTAG_PROFILELOOKTABLEDATA 50982        /* &default "look" table for use as starting point */
 
-#define	TIFFTAG_ALIAS_LAYER_METADATA	50784	/* Alias Sketchbook Pro layer usage description. */
+/* DNG 1.3.0.0 */
+#define TIFFTAG_OPCODELIST1 51008  /* &opcodes that should be applied to raw image after reading */
+#define TIFFTAG_OPCODELIST2 51009  /* &opcodes that should be applied after mapping to linear reference */
+#define TIFFTAG_OPCODELIST3 51022  /* &opcodes that should be applied after demosaicing */
+#define TIFFTAG_NOISEPROFILE 51041 /* &noise profile */
 
-/* GeoTIFF DGIWG */
-#define TIFFTAG_TIFF_RSID               50908   /* https://www.awaresystems.be/imaging/tiff/tifftags/tiff_rsid.html */
-#define TIFFTAG_GEO_METADATA            50909   /* https://www.awaresystems.be/imaging/tiff/tifftags/geo_metadata.html */
+/* DNG 1.4.0.0 */
+#define TIFFTAG_DEFAULTUSERCROP 51125              /* &default user crop rectangle in relative coords */
+#define TIFFTAG_DEFAULTBLACKRENDER 51110           /* &black rendering hint */
+#define TIFFTAG_BASELINEEXPOSUREOFFSET 51109       /* &baseline exposure offset */
+#define TIFFTAG_PROFILELOOKTABLEENCODING 51108     /* &3D LookTable indexing conversion */
+#define TIFFTAG_PROFILEHUESATMAPENCODING 51107     /* &3D HueSatMap indexing conversion */
+#define TIFFTAG_ORIGINALDEFAULTFINALSIZE 51089     /* &default final size of larger original file for this proxy */
+#define TIFFTAG_ORIGINALBESTQUALITYFINALSIZE 51090 /* &best quality final size of larger original file for this proxy */
+#define TIFFTAG_ORIGINALDEFAULTCROPSIZE 51091      /* &the default crop size of larger original file for this proxy */
+#define TIFFTAG_NEWRAWIMAGEDIGEST 51111            /* &modified MD5 digest of the raw image data */
+#define TIFFTAG_RAWTOPREVIEWGAIN 51112             /* &The gain between the main raw FD and the preview IFD containing this tag */
 
-#define TIFFTAG_EXTRACAMERAPROFILES 50933  /* http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/products/photoshop/pdfs/dng_spec_1.4.0.0.pdf */
+/* DNG 1.5.0.0 */
+#define TIFFTAG_DEPTHFORMAT 51177      /* &encoding of the depth data in the file */
+#define TIFFTAG_DEPTHNEAR 51178        /* &distance from the camera represented by value 0 in the depth map */
+#define TIFFTAG_DEPTHFAR 51179         /* &distance from the camera represented by the maximum value in the depth map */
+#define TIFFTAG_DEPTHUNITS 51180       /* &measurement units for DepthNear and DepthFar */
+#define TIFFTAG_DEPTHMEASURETYPE 51181 /* &measurement geometry for the depth map */
+#define TIFFTAG_ENHANCEPARAMS 51182    /* &a string that documents how the enhanced image data was processed. */
+
+/* DNG 1.6.0.0 */
+#define TIFFTAG_PROFILEGAINTABLEMAP 52525    /* &spatially varying gain tables that can be applied as starting point */
+#define TIFFTAG_SEMANTICNAME 52526           /* &a string that identifies the semantic mask */
+#define TIFFTAG_SEMANTICINSTANCEID 52528     /* &a string that identifies a specific instance in a semantic mask */
+#define TIFFTAG_MASKSUBAREA 52536            /* &the crop rectangle of this IFD's mask, relative to the main image */
+#define TIFFTAG_RGBTABLES 52543              /* &color transforms to apply to masked image regions */
+#define TIFFTAG_CALIBRATIONILLUMINANT3 52529 /* &the illuminant used for the third set of color calibration tags */
+#define TIFFTAG_COLORMATRIX3 52531           /* &matrix to convert XYZ values to reference camera native color space under CalibrationIlluminant3 */
+#define TIFFTAG_CAMERACALIBRATION3 52530     /* &matrix to transform reference camera native space values to individual camera native space values under CalibrationIlluminant3 */
+#define TIFFTAG_REDUCTIONMATRIX3 52538       /* &dimensionality reduction matrix for use in color conversion to XYZ under CalibrationIlluminant3 */
+#define TIFFTAG_PROFILEHUESATMAPDATA3 52537  /* &the data for the third HSV table */
+#define TIFFTAG_FORWARDMATRIX3 52532         /* &matrix to map white balanced camera colors to XYZ D50 */
+#define TIFFTAG_ILLUMINANTDATA1 52533        /* &data for the first calibration illuminant */
+#define TIFFTAG_ILLUMINANTDATA2 52534        /* &data for the second calibration illuminant */
+#define TIFFTAG_ILLUMINANTDATA3 53535        /* &data for the third calibration illuminant */
+
+/* TIFF/EP */
+#define TIFFTAG_EP_CFAREPEATPATTERNDIM 33421      /* dimensions of CFA pattern */
+#define TIFFTAG_EP_CFAPATTERN 33422               /* color filter array pattern */
+#define TIFFTAG_EP_BATTERYLEVEL 33423             /* battery level (rational or ASCII) */
+#define TIFFTAG_EP_INTERLACE 34857                /* Number of multi-field images */
+/* TIFFTAG_EP_IPTC_NAA and TIFFTAG_RICHTIFFIPTC share the same tag number (33723)
+ *   LibTIFF type is UNDEFINED or BYTE, but often times incorrectly specified as LONG,
+ *   because TIFF/EP (ISO/DIS 12234-2) specifies type LONG or ASCII. */
+#define TIFFTAG_EP_IPTC_NAA 33723                 /* Alias IPTC/NAA Newspaper Association RichTIFF */
+#define TIFFTAG_EP_TIMEZONEOFFSET 34858           /* Time zone offset relative to UTC */
+#define TIFFTAG_EP_SELFTIMERMODE 34859            /* Number of seconds capture was delayed from button press */
+#define TIFFTAG_EP_FLASHENERGY 37387              /* Flash energy, or range if there is uncertainty */
+#define TIFFTAG_EP_SPATIALFREQUENCYRESPONSE 37388 /* Spatial frequency response */
+#define TIFFTAG_EP_NOISE 37389                    /* Camera noise measurement values */
+#define TIFFTAG_EP_FOCALPLANEXRESOLUTION 37390    /* Focal plane X resolution */
+#define TIFFTAG_EP_FOCALPLANEYRESOLUTION 37391    /* Focal plane Y resolution */
+#define TIFFTAG_EP_FOCALPLANERESOLUTIONUNIT 37392 /* Focal plane resolution unit */
+#define TIFFTAG_EP_IMAGENUMBER 37393              /* Number of image when several of burst shot stored in same TIFF/EP */
+#define TIFFTAG_EP_SECURITYCLASSIFICATION 37394   /* Security classification */
+#define TIFFTAG_EP_IMAGEHISTORY 37395             /* Record of what has been done to the image */
+#define TIFFTAG_EP_EXPOSUREINDEX 37397            /* Exposure index */
+#define TIFFTAG_EP_STANDARDID 37398               /* TIFF/EP standard version, n.n.n.n */
+#define TIFFTAG_EP_SENSINGMETHOD 37399            /* Type of image sensor */
+/*
+ * TIFF/EP tags equivalent to EXIF tags
+ *     Note that TIFF-EP and EXIF use nearly the same metadata tag set, but TIFF-EP stores the tags in IFD 0,
+ *     while EXIF store the tags in a separate IFD. Either location is allowed by DNG, but the EXIF location is preferred.
+ */
+#define TIFFTAG_EP_EXPOSURETIME 33434             /* Exposure time */
+#define TIFFTAG_EP_FNUMBER 33437                  /* F number */
+#define TIFFTAG_EP_EXPOSUREPROGRAM 34850          /* Exposure program */
+#define TIFFTAG_EP_SPECTRALSENSITIVITY 34852      /* Spectral sensitivity */
+#define TIFFTAG_EP_ISOSPEEDRATINGS 34855          /* ISO speed rating */
+#define TIFFTAG_EP_OECF 34856                     /* Optoelectric conversion factor */
+#define TIFFTAG_EP_DATETIMEORIGINAL 36867         /* Date and time of original data generation */
+#define TIFFTAG_EP_COMPRESSEDBITSPERPIXEL 37122   /* Image compression mode */
+#define TIFFTAG_EP_SHUTTERSPEEDVALUE 37377        /* Shutter speed */
+#define TIFFTAG_EP_APERTUREVALUE 37378            /* Aperture */
+#define TIFFTAG_EP_BRIGHTNESSVALUE 37379          /* Brightness */
+#define TIFFTAG_EP_EXPOSUREBIASVALUE 37380        /* Exposure bias */
+#define TIFFTAG_EP_MAXAPERTUREVALUE 37381         /* Maximum lens aperture */
+#define TIFFTAG_EP_SUBJECTDISTANCE 37382          /* Subject distance */
+#define TIFFTAG_EP_METERINGMODE 37383             /* Metering mode */
+#define TIFFTAG_EP_LIGHTSOURCE 37384              /* Light source */
+#define TIFFTAG_EP_FLASH 37385                    /* Flash */
+#define TIFFTAG_EP_FOCALLENGTH 37386              /* Lens focal length */
+#define TIFFTAG_EP_SUBJECTLOCATION 37396          /* Subject location (area) */
+
+#define TIFFTAG_RPCCOEFFICIENT 50844       /* Define by GDAL for geospatial georeferencing through RPC: http://geotiff.maptools.org/rpc_prop.html */
+#define TIFFTAG_ALIAS_LAYER_METADATA 50784 /* Alias Sketchbook Pro layer usage description. */
+
+/* GeoTIFF DGIWG */
+#define TIFFTAG_TIFF_RSID 50908           /* https://www.awaresystems.be/imaging/tiff/tifftags/tiff_rsid.html */
+#define TIFFTAG_GEO_METADATA 50909        /* https://www.awaresystems.be/imaging/tiff/tifftags/geo_metadata.html */
+#define TIFFTAG_EXTRACAMERAPROFILES 50933 /* http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/products/photoshop/pdfs/dng_spec_1.4.0.0.pdf */
 
 /* tag 65535 is an undefined tag used by Eastman Kodak */
-#define TIFFTAG_DCSHUESHIFTVALUES       65535   /* hue shift correction data */
+#define TIFFTAG_DCSHUESHIFTVALUES 65535 /* hue shift correction data */
 
 /*
  * The following are ``pseudo tags'' that can be used to control
@@ -590,211 +693,207 @@ typedef enum {
  * http://www.remotesensing.org/libtiff/bugs.html with the appropriate
  * C definitions to add.
  */
-#define	TIFFTAG_FAXMODE			65536	/* Group 3/4 format control */
-#define	    FAXMODE_CLASSIC	0x0000		/* default, include RTC */
-#define	    FAXMODE_NORTC	0x0001		/* no RTC at end of data */
-#define	    FAXMODE_NOEOL	0x0002		/* no EOL code at end of row */
-#define	    FAXMODE_BYTEALIGN	0x0004		/* byte align row */
-#define	    FAXMODE_WORDALIGN	0x0008		/* word align row */
-#define	    FAXMODE_CLASSF	FAXMODE_NORTC	/* TIFF Class F */
-#define	TIFFTAG_JPEGQUALITY		65537	/* Compression quality level */
+#define TIFFTAG_FAXMODE 65536        /* Group 3/4 format control */
+#define FAXMODE_CLASSIC 0x0000       /* default, include RTC */
+#define FAXMODE_NORTC 0x0001         /* no RTC at end of data */
+#define FAXMODE_NOEOL 0x0002         /* no EOL code at end of row */
+#define FAXMODE_BYTEALIGN 0x0004     /* byte align row */
+#define FAXMODE_WORDALIGN 0x0008     /* word align row */
+#define FAXMODE_CLASSF FAXMODE_NORTC /* TIFF Class F */
+#define TIFFTAG_JPEGQUALITY 65537    /* Compression quality level */
 /* Note: quality level is on the IJG 0-100 scale.  Default value is 75 */
-#define	TIFFTAG_JPEGCOLORMODE		65538	/* Auto RGB<=>YCbCr convert? */
-#define	    JPEGCOLORMODE_RAW	0x0000		/* no conversion (default) */
-#define	    JPEGCOLORMODE_RGB	0x0001		/* do auto conversion */
-#define	TIFFTAG_JPEGTABLESMODE		65539	/* What to put in JPEGTables */
-#define	    JPEGTABLESMODE_QUANT 0x0001		/* include quantization tbls */
-#define	    JPEGTABLESMODE_HUFF	0x0002		/* include Huffman tbls */
+#define TIFFTAG_JPEGCOLORMODE 65538  /* Auto RGB<=>YCbCr convert? */
+#define JPEGCOLORMODE_RAW 0x0000     /* no conversion (default) */
+#define JPEGCOLORMODE_RGB 0x0001     /* do auto conversion */
+#define TIFFTAG_JPEGTABLESMODE 65539 /* What to put in JPEGTables */
+#define JPEGTABLESMODE_QUANT 0x0001  /* include quantization tbls */
+#define JPEGTABLESMODE_HUFF 0x0002   /* include Huffman tbls */
 /* Note: default is JPEGTABLESMODE_QUANT | JPEGTABLESMODE_HUFF */
-#define	TIFFTAG_FAXFILLFUNC		65540	/* G3/G4 fill function */
-#define	TIFFTAG_PIXARLOGDATAFMT		65549	/* PixarLogCodec I/O data sz */
-#define	    PIXARLOGDATAFMT_8BIT	0	/* regular u_char samples */
-#define	    PIXARLOGDATAFMT_8BITABGR	1	/* ABGR-order u_chars */
-#define	    PIXARLOGDATAFMT_11BITLOG	2	/* 11-bit log-encoded (raw) */
-#define	    PIXARLOGDATAFMT_12BITPICIO	3	/* as per PICIO (1.0==2048) */
-#define	    PIXARLOGDATAFMT_16BIT	4	/* signed short samples */
-#define	    PIXARLOGDATAFMT_FLOAT	5	/* IEEE float samples */
+#define TIFFTAG_FAXFILLFUNC 65540     /* G3/G4 fill function */
+#define TIFFTAG_PIXARLOGDATAFMT 65549 /* PixarLogCodec I/O data sz */
+#define PIXARLOGDATAFMT_8BIT 0        /* regular u_char samples */
+#define PIXARLOGDATAFMT_8BITABGR 1    /* ABGR-order u_chars */
+#define PIXARLOGDATAFMT_11BITLOG 2    /* 11-bit log-encoded (raw) */
+#define PIXARLOGDATAFMT_12BITPICIO 3  /* as per PICIO (1.0==2048) */
+#define PIXARLOGDATAFMT_16BIT 4       /* signed short samples */
+#define PIXARLOGDATAFMT_FLOAT 5       /* IEEE float samples */
 /* 65550-65556 are allocated to Oceana Matrix <dev@oceana.com> */
-#define TIFFTAG_DCSIMAGERTYPE           65550   /* imager model & filter */
-#define     DCSIMAGERMODEL_M3           0       /* M3 chip (1280 x 1024) */
-#define     DCSIMAGERMODEL_M5           1       /* M5 chip (1536 x 1024) */
-#define     DCSIMAGERMODEL_M6           2       /* M6 chip (3072 x 2048) */
-#define     DCSIMAGERFILTER_IR          0       /* infrared filter */
-#define     DCSIMAGERFILTER_MONO        1       /* monochrome filter */
-#define     DCSIMAGERFILTER_CFA         2       /* color filter array */
-#define     DCSIMAGERFILTER_OTHER       3       /* other filter */
-#define TIFFTAG_DCSINTERPMODE           65551   /* interpolation mode */
-#define     DCSINTERPMODE_NORMAL        0x0     /* whole image, default */
-#define     DCSINTERPMODE_PREVIEW       0x1     /* preview of image (384x256) */
-#define TIFFTAG_DCSBALANCEARRAY         65552   /* color balance values */
-#define TIFFTAG_DCSCORRECTMATRIX        65553   /* color correction values */
-#define TIFFTAG_DCSGAMMA                65554   /* gamma value */
-#define TIFFTAG_DCSTOESHOULDERPTS       65555   /* toe & shoulder points */
-#define TIFFTAG_DCSCALIBRATIONFD        65556   /* calibration file desc */
+#define TIFFTAG_DCSIMAGERTYPE 65550     /* imager model & filter */
+#define DCSIMAGERMODEL_M3 0             /* M3 chip (1280 x 1024) */
+#define DCSIMAGERMODEL_M5 1             /* M5 chip (1536 x 1024) */
+#define DCSIMAGERMODEL_M6 2             /* M6 chip (3072 x 2048) */
+#define DCSIMAGERFILTER_IR 0            /* infrared filter */
+#define DCSIMAGERFILTER_MONO 1          /* monochrome filter */
+#define DCSIMAGERFILTER_CFA 2           /* color filter array */
+#define DCSIMAGERFILTER_OTHER 3         /* other filter */
+#define TIFFTAG_DCSINTERPMODE 65551     /* interpolation mode */
+#define DCSINTERPMODE_NORMAL 0x0        /* whole image, default */
+#define DCSINTERPMODE_PREVIEW 0x1       /* preview of image (384x256) */
+#define TIFFTAG_DCSBALANCEARRAY 65552   /* color balance values */
+#define TIFFTAG_DCSCORRECTMATRIX 65553  /* color correction values */
+#define TIFFTAG_DCSGAMMA 65554          /* gamma value */
+#define TIFFTAG_DCSTOESHOULDERPTS 65555 /* toe & shoulder points */
+#define TIFFTAG_DCSCALIBRATIONFD 65556  /* calibration file desc */
 /* Note: quality level is on the ZLIB 1-9 scale. Default value is -1 */
-#define	TIFFTAG_ZIPQUALITY		65557	/* compression quality level */
-#define	TIFFTAG_PIXARLOGQUALITY		65558	/* PixarLog uses same scale */
+#define TIFFTAG_ZIPQUALITY 65557      /* compression quality level */
+#define TIFFTAG_PIXARLOGQUALITY 65558 /* PixarLog uses same scale */
 /* 65559 is allocated to Oceana Matrix <dev@oceana.com> */
-#define TIFFTAG_DCSCLIPRECTANGLE	65559	/* area of image to acquire */
-#define TIFFTAG_SGILOGDATAFMT		65560	/* SGILog user data format */
-#define     SGILOGDATAFMT_FLOAT		0	/* IEEE float samples */
-#define     SGILOGDATAFMT_16BIT		1	/* 16-bit samples */
-#define     SGILOGDATAFMT_RAW		2	/* uninterpreted data */
-#define     SGILOGDATAFMT_8BIT		3	/* 8-bit RGB monitor values */
-#define TIFFTAG_SGILOGENCODE		65561 /* SGILog data encoding control*/
-#define     SGILOGENCODE_NODITHER	0     /* do not dither encoded values*/
-#define     SGILOGENCODE_RANDITHER	1     /* randomly dither encd values */
-#define	TIFFTAG_LZMAPRESET		65562	/* LZMA2 preset (compression level) */
-#define TIFFTAG_PERSAMPLE       65563	/* interface for per sample tags */
-#define     PERSAMPLE_MERGED        0	/* present as a single value */
-#define     PERSAMPLE_MULTI         1	/* present as multiple values */
-#define TIFFTAG_ZSTD_LEVEL      65564    /* ZSTD compression level */
-#define TIFFTAG_LERC_VERSION            65565 /* LERC version */
-#define     LERC_VERSION_2_4            4
-#define TIFFTAG_LERC_ADD_COMPRESSION    65566 /* LERC additional compression */
-#define     LERC_ADD_COMPRESSION_NONE    0
-#define     LERC_ADD_COMPRESSION_DEFLATE 1
-#define     LERC_ADD_COMPRESSION_ZSTD    2
-#define TIFFTAG_LERC_MAXZERROR          65567    /* LERC maximum error */
-#define TIFFTAG_WEBP_LEVEL		  65568	/* WebP compression level */
-#define TIFFTAG_WEBP_LOSSLESS		65569	/* WebP lossless/lossy */
-#define	TIFFTAG_DEFLATE_SUBCODEC	65570	/* ZIP codec: to get/set the sub-codec to use. Will default to libdeflate when available */
-#define     DEFLATE_SUBCODEC_ZLIB       0
-#define     DEFLATE_SUBCODEC_LIBDEFLATE 1
+#define TIFFTAG_DCSCLIPRECTANGLE 65559 /* area of image to acquire */
+#define TIFFTAG_SGILOGDATAFMT 65560    /* SGILog user data format */
+#define SGILOGDATAFMT_FLOAT 0          /* IEEE float samples */
+#define SGILOGDATAFMT_16BIT 1          /* 16-bit samples */
+#define SGILOGDATAFMT_RAW 2            /* uninterpreted data */
+#define SGILOGDATAFMT_8BIT 3           /* 8-bit RGB monitor values */
+#define TIFFTAG_SGILOGENCODE 65561     /* SGILog data encoding control*/
+#define SGILOGENCODE_NODITHER 0        /* do not dither encoded values*/
+#define SGILOGENCODE_RANDITHER 1       /* randomly dither encd values */
+#define TIFFTAG_LZMAPRESET 65562       /* LZMA2 preset (compression level) */
+#define TIFFTAG_PERSAMPLE 65563        /* interface for per sample tags */
+#define PERSAMPLE_MERGED 0             /* present as a single value */
+#define PERSAMPLE_MULTI 1              /* present as multiple values */
+#define TIFFTAG_ZSTD_LEVEL 65564       /* ZSTD compression level */
+#define TIFFTAG_LERC_VERSION 65565     /* LERC version */
+#define LERC_VERSION_2_4 4
+#define TIFFTAG_LERC_ADD_COMPRESSION 65566 /* LERC additional compression */
+#define LERC_ADD_COMPRESSION_NONE 0
+#define LERC_ADD_COMPRESSION_DEFLATE 1
+#define LERC_ADD_COMPRESSION_ZSTD 2
+#define TIFFTAG_LERC_MAXZERROR 65567   /* LERC maximum error */
+#define TIFFTAG_WEBP_LEVEL 65568       /* WebP compression level */
+#define TIFFTAG_WEBP_LOSSLESS 65569    /* WebP lossless/lossy */
+#define TIFFTAG_WEBP_LOSSLESS_EXACT 65571  /* WebP lossless exact mode. Set-only mode. Default is 1. Can be set to 0 to increase compression rate, but R,G,B in areas where alpha = 0 will not be preserved */
+#define TIFFTAG_DEFLATE_SUBCODEC 65570 /* ZIP codec: to get/set the sub-codec to use. Will default to libdeflate when available */
+#define DEFLATE_SUBCODEC_ZLIB 0
+#define DEFLATE_SUBCODEC_LIBDEFLATE 1
 
 /*
  * EXIF tags
  */
-#define EXIFTAG_EXPOSURETIME		33434	/* Exposure time */
-#define EXIFTAG_FNUMBER			33437	/* F number */
-#define EXIFTAG_EXPOSUREPROGRAM		34850	/* Exposure program */
-#define EXIFTAG_SPECTRALSENSITIVITY	34852	/* Spectral sensitivity */
-#define EXIFTAG_ISOSPEEDRATINGS		34855	/* ISO speed rating */
-#define EXIFTAG_PHOTOGRAPHICSENSITIVITY	34855 /* Photographic Sensitivity (new name for tag 34855) */
-#define EXIFTAG_OECF			34856	/* Optoelectric conversion factor */
-#define EXIFTAG_EXIFVERSION		36864	/* Exif version */
-#define EXIFTAG_DATETIMEORIGINAL	36867	/* Date and time of original
-						   data generation */
-#define EXIFTAG_DATETIMEDIGITIZED	36868	/* Date and time of digital
-						   data generation */
-#define EXIFTAG_COMPONENTSCONFIGURATION	37121	/* Meaning of each component */
-#define EXIFTAG_COMPRESSEDBITSPERPIXEL	37122	/* Image compression mode */
-#define EXIFTAG_SHUTTERSPEEDVALUE	37377	/* Shutter speed */
-#define EXIFTAG_APERTUREVALUE		37378	/* Aperture */
-#define EXIFTAG_BRIGHTNESSVALUE		37379	/* Brightness */
-#define EXIFTAG_EXPOSUREBIASVALUE	37380	/* Exposure bias */
-#define EXIFTAG_MAXAPERTUREVALUE	37381	/* Maximum lens aperture */
-#define EXIFTAG_SUBJECTDISTANCE		37382	/* Subject distance */
-#define EXIFTAG_METERINGMODE		37383	/* Metering mode */
-#define EXIFTAG_LIGHTSOURCE		37384	/* Light source */
-#define EXIFTAG_FLASH			37385	/* Flash */
-#define EXIFTAG_FOCALLENGTH		37386	/* Lens focal length */
-#define EXIFTAG_SUBJECTAREA		37396	/* Subject area */
-#define EXIFTAG_MAKERNOTE		37500	/* Manufacturer notes */
-#define EXIFTAG_USERCOMMENT		37510	/* User comments */
-#define EXIFTAG_SUBSECTIME		37520	/* DateTime subseconds */
-#define EXIFTAG_SUBSECTIMEORIGINAL	37521	/* DateTimeOriginal subseconds */
-#define EXIFTAG_SUBSECTIMEDIGITIZED	37522	/* DateTimeDigitized subseconds */
-#define EXIFTAG_FLASHPIXVERSION		40960	/* Supported Flashpix version */
-#define EXIFTAG_COLORSPACE		40961	/* Color space information */
-#define EXIFTAG_PIXELXDIMENSION		40962	/* Valid image width */
-#define EXIFTAG_PIXELYDIMENSION		40963	/* Valid image height */
-#define EXIFTAG_RELATEDSOUNDFILE	40964	/* Related audio file */
-#define EXIFTAG_FLASHENERGY		41483	/* Flash energy */
-#define EXIFTAG_SPATIALFREQUENCYRESPONSE 41484	/* Spatial frequency response */
-#define EXIFTAG_FOCALPLANEXRESOLUTION	41486	/* Focal plane X resolution */
-#define EXIFTAG_FOCALPLANEYRESOLUTION	41487	/* Focal plane Y resolution */
-#define EXIFTAG_FOCALPLANERESOLUTIONUNIT 41488	/* Focal plane resolution unit */
-#define EXIFTAG_SUBJECTLOCATION		41492	/* Subject location */
-#define EXIFTAG_EXPOSUREINDEX		41493	/* Exposure index */
-#define EXIFTAG_SENSINGMETHOD		41495	/* Sensing method */
-#define EXIFTAG_FILESOURCE		41728	/* File source */
-#define EXIFTAG_SCENETYPE		41729	/* Scene type */
-#define EXIFTAG_CFAPATTERN		41730	/* CFA pattern */
-#define EXIFTAG_CUSTOMRENDERED		41985	/* Custom image processing */
-#define EXIFTAG_EXPOSUREMODE		41986	/* Exposure mode */
-#define EXIFTAG_WHITEBALANCE		41987	/* White balance */
-#define EXIFTAG_DIGITALZOOMRATIO	41988	/* Digital zoom ratio */
-#define EXIFTAG_FOCALLENGTHIN35MMFILM	41989	/* Focal length in 35 mm film */
-#define EXIFTAG_SCENECAPTURETYPE	41990	/* Scene capture type */
-#define EXIFTAG_GAINCONTROL		41991	/* Gain control */
-#define EXIFTAG_CONTRAST		41992	/* Contrast */
-#define EXIFTAG_SATURATION		41993	/* Saturation */
-#define EXIFTAG_SHARPNESS		41994	/* Sharpness */
-#define EXIFTAG_DEVICESETTINGDESCRIPTION 41995	/* Device settings description */
-#define EXIFTAG_SUBJECTDISTANCERANGE	41996	/* Subject distance range */
-#define EXIFTAG_IMAGEUNIQUEID		42016	/* Unique image ID */
+#define EXIFTAG_EXPOSURETIME 33434             /* Exposure time */
+#define EXIFTAG_FNUMBER 33437                  /* F number */
+#define EXIFTAG_EXPOSUREPROGRAM 34850          /* Exposure program */
+#define EXIFTAG_SPECTRALSENSITIVITY 34852      /* Spectral sensitivity */
+/* After EXIF 2.2.1 ISOSpeedRatings is named PhotographicSensitivity.
+   In addition, while "Count=Any", only 1 count should be used. */
+#define EXIFTAG_ISOSPEEDRATINGS 34855          /* ISO speed rating */
+#define EXIFTAG_PHOTOGRAPHICSENSITIVITY 34855  /* Photographic Sensitivity (new name for tag 34855) */
+#define EXIFTAG_OECF 34856                     /* Optoelectric conversion factor */
+#define EXIFTAG_EXIFVERSION 36864              /* Exif version */
+#define EXIFTAG_DATETIMEORIGINAL 36867         /* Date and time of original data generation */
+#define EXIFTAG_DATETIMEDIGITIZED 36868        /* Date and time of digital data generation */
+#define EXIFTAG_COMPONENTSCONFIGURATION 37121  /* Meaning of each component */
+#define EXIFTAG_COMPRESSEDBITSPERPIXEL 37122   /* Image compression mode */
+#define EXIFTAG_SHUTTERSPEEDVALUE 37377        /* Shutter speed */
+#define EXIFTAG_APERTUREVALUE 37378            /* Aperture */
+#define EXIFTAG_BRIGHTNESSVALUE 37379          /* Brightness */
+#define EXIFTAG_EXPOSUREBIASVALUE 37380        /* Exposure bias */
+#define EXIFTAG_MAXAPERTUREVALUE 37381         /* Maximum lens aperture */
+#define EXIFTAG_SUBJECTDISTANCE 37382          /* Subject distance */
+#define EXIFTAG_METERINGMODE 37383             /* Metering mode */
+#define EXIFTAG_LIGHTSOURCE 37384              /* Light source */
+#define EXIFTAG_FLASH 37385                    /* Flash */
+#define EXIFTAG_FOCALLENGTH 37386              /* Lens focal length */
+#define EXIFTAG_SUBJECTAREA 37396              /* Subject area */
+#define EXIFTAG_MAKERNOTE 37500                /* Manufacturer notes */
+#define EXIFTAG_USERCOMMENT 37510              /* User comments */
+#define EXIFTAG_SUBSECTIME 37520               /* DateTime subseconds */
+#define EXIFTAG_SUBSECTIMEORIGINAL 37521       /* DateTimeOriginal subseconds */
+#define EXIFTAG_SUBSECTIMEDIGITIZED 37522      /* DateTimeDigitized subseconds */
+#define EXIFTAG_FLASHPIXVERSION 40960          /* Supported Flashpix version */
+#define EXIFTAG_COLORSPACE 40961               /* Color space information */
+#define EXIFTAG_PIXELXDIMENSION 40962          /* Valid image width */
+#define EXIFTAG_PIXELYDIMENSION 40963          /* Valid image height */
+#define EXIFTAG_RELATEDSOUNDFILE 40964         /* Related audio file */
+#define EXIFTAG_FLASHENERGY 41483              /* Flash energy */
+#define EXIFTAG_SPATIALFREQUENCYRESPONSE 41484 /* Spatial frequency response */
+#define EXIFTAG_FOCALPLANEXRESOLUTION 41486    /* Focal plane X resolution */
+#define EXIFTAG_FOCALPLANEYRESOLUTION 41487    /* Focal plane Y resolution */
+#define EXIFTAG_FOCALPLANERESOLUTIONUNIT 41488 /* Focal plane resolution unit */
+#define EXIFTAG_SUBJECTLOCATION 41492          /* Subject location */
+#define EXIFTAG_EXPOSUREINDEX 41493            /* Exposure index */
+#define EXIFTAG_SENSINGMETHOD 41495            /* Sensing method */
+#define EXIFTAG_FILESOURCE 41728               /* File source */
+#define EXIFTAG_SCENETYPE 41729                /* Scene type */
+#define EXIFTAG_CFAPATTERN 41730               /* CFA pattern */
+#define EXIFTAG_CUSTOMRENDERED 41985           /* Custom image processing */
+#define EXIFTAG_EXPOSUREMODE 41986             /* Exposure mode */
+#define EXIFTAG_WHITEBALANCE 41987             /* White balance */
+#define EXIFTAG_DIGITALZOOMRATIO 41988         /* Digital zoom ratio */
+#define EXIFTAG_FOCALLENGTHIN35MMFILM 41989    /* Focal length in 35 mm film */
+#define EXIFTAG_SCENECAPTURETYPE 41990         /* Scene capture type */
+#define EXIFTAG_GAINCONTROL 41991              /* Gain control */
+#define EXIFTAG_CONTRAST 41992                 /* Contrast */
+#define EXIFTAG_SATURATION 41993               /* Saturation */
+#define EXIFTAG_SHARPNESS 41994                /* Sharpness */
+#define EXIFTAG_DEVICESETTINGDESCRIPTION 41995 /* Device settings description */
+#define EXIFTAG_SUBJECTDISTANCERANGE 41996     /* Subject distance range */
+#define EXIFTAG_IMAGEUNIQUEID 42016            /* Unique image ID */
 
 /*--: New for EXIF-Version 2.32, May 2019 ... */
-#define EXIFTAG_SENSITIVITYTYPE		34864		/* The SensitivityType tag indicates which one of the parameters of ISO12232 is the PhotographicSensitivity tag. */
-#define EXIFTAG_STANDARDOUTPUTSENSITIVITY		34865		/* This tag indicates the standard output sensitivity value of a camera or input device defined in ISO 12232. */
-#define EXIFTAG_RECOMMENDEDEXPOSUREINDEX		34866		/* recommended exposure index */
-#define EXIFTAG_ISOSPEED		34867		/* ISO speed value */
-#define EXIFTAG_ISOSPEEDLATITUDEYYY		34868		/* ISO speed latitude yyy */
-#define EXIFTAG_ISOSPEEDLATITUDEZZZ		34869		/* ISO speed latitude zzz */
-#define EXIFTAG_OFFSETTIME		36880		/* offset from UTC of the time of DateTime tag. */
-#define EXIFTAG_OFFSETTIMEORIGINAL		36881		/* offset from UTC of the time of DateTimeOriginal tag. */
-#define EXIFTAG_OFFSETTIMEDIGITIZED		36882		/* offset from UTC of the time of DateTimeDigitized tag. */
-#define EXIFTAG_TEMPERATURE		37888		/* Temperature as the ambient situation at the shot in dergee Celsius */
-#define EXIFTAG_HUMIDITY		37889		/* Humidity as the ambient situation at the shot in percent */
-#define EXIFTAG_PRESSURE		37890		/* Pressure as the ambient situation at the shot hecto-Pascal (hPa) */
-#define EXIFTAG_WATERDEPTH		37891		/* WaterDepth as the ambient situation at the shot in meter (m) */
-#define EXIFTAG_ACCELERATION		37892		/* Acceleration (a scalar regardless of direction) as the ambient situation at the shot in units of mGal (10-5 m/s^2) */
-#define EXIFTAG_CAMERAELEVATIONANGLE		37893		/* Elevation/depression. angle of the orientation of the camera(imaging optical axis) as the ambient situation at the shot in degree from -180deg to +180deg. */
-#define EXIFTAG_CAMERAOWNERNAME		42032		/* owner of a camera */
-#define EXIFTAG_BODYSERIALNUMBER		42033		/* serial number of the body of the camera */
-#define EXIFTAG_LENSSPECIFICATION		42034		/* minimum focal length (in mm), maximum focal length (in mm), minimum F number in the minimum focal length, and minimum F number in the maximum focal length, */
-#define EXIFTAG_LENSMAKE		42035		/* the lens manufacturer */
-#define EXIFTAG_LENSMODEL		42036		/* the lens model name and model number */
-#define EXIFTAG_LENSSERIALNUMBER		42037		/* the serial number of the interchangeable lens */
-#define EXIFTAG_GAMMA		42240		/* value of coefficient gamma */
-#define EXIFTAG_COMPOSITEIMAGE		42080	/* composite image */
-#define EXIFTAG_SOURCEIMAGENUMBEROFCOMPOSITEIMAGE		42081	/* source image number of composite image */
-#define EXIFTAG_SOURCEEXPOSURETIMESOFCOMPOSITEIMAGE		42082	/* source exposure times of composite image */
+#define EXIFTAG_SENSITIVITYTYPE 34864           /* The SensitivityType tag indicates which one of the parameters of ISO12232 is the PhotographicSensitivity tag. */
+#define EXIFTAG_STANDARDOUTPUTSENSITIVITY 34865 /* This tag indicates the standard output sensitivity value of a camera or input device defined in ISO 12232. */
+#define EXIFTAG_RECOMMENDEDEXPOSUREINDEX 34866  /* recommended exposure index   */
+#define EXIFTAG_ISOSPEED 34867                  /* ISO speed value */
+#define EXIFTAG_ISOSPEEDLATITUDEYYY 34868       /* ISO speed latitude yyy */
+#define EXIFTAG_ISOSPEEDLATITUDEZZZ 34869       /* ISO speed latitude zzz */
+#define EXIFTAG_OFFSETTIME 36880                /* offset from UTC of the time of DateTime tag. */
+#define EXIFTAG_OFFSETTIMEORIGINAL 36881        /* offset from UTC of the time of DateTimeOriginal tag. */
+#define EXIFTAG_OFFSETTIMEDIGITIZED 36882       /* offset from UTC of the time of DateTimeDigitized tag. */
+#define EXIFTAG_TEMPERATURE 37888               /* Temperature as the ambient situation at the shot in dergee Celsius */
+#define EXIFTAG_HUMIDITY 37889                  /* Humidity as the ambient situation at the shot in percent */
+#define EXIFTAG_PRESSURE 37890                  /* Pressure as the ambient situation at the shot hecto-Pascal (hPa) */
+#define EXIFTAG_WATERDEPTH 37891                /* WaterDepth as the ambient situation at the shot in meter (m) */
+#define EXIFTAG_ACCELERATION 37892              /* Acceleration (a scalar regardless of direction) as the ambientsituation at the shot in units of mGal (10-5 m/s^2) */
+/* EXIFTAG_CAMERAELEVATIONANGLE: Elevation/depression. angle of the orientation of the  camera(imaging optical axis)
+ *                               as the ambient situation at the shot in degree from -180deg to +180deg. */
+#define EXIFTAG_CAMERAELEVATIONANGLE 37893
+#define EXIFTAG_CAMERAOWNERNAME 42032  /* owner of a camera */
+#define EXIFTAG_BODYSERIALNUMBER 42033 /* serial number of the body of the camera */
+/* EXIFTAG_LENSSPECIFICATION: minimum focal length (in mm), maximum focal length (in mm),minimum F number in the minimum focal length,
+ *                            and minimum F number in the maximum focal length, */
+#define EXIFTAG_LENSSPECIFICATION 42034
+#define EXIFTAG_LENSMAKE 42035                            /* the lens manufacturer */
+#define EXIFTAG_LENSMODEL 42036                           /* the lens model name and model number */
+#define EXIFTAG_LENSSERIALNUMBER 42037                    /* the serial number of the interchangeable lens */
+#define EXIFTAG_GAMMA 42240                               /* value of coefficient gamma */
+#define EXIFTAG_COMPOSITEIMAGE 42080                      /* composite image */
+#define EXIFTAG_SOURCEIMAGENUMBEROFCOMPOSITEIMAGE 42081   /* source image number of composite image */
+#define EXIFTAG_SOURCEEXPOSURETIMESOFCOMPOSITEIMAGE 42082 /* source exposure times of composite image */
 
 /*
  * EXIF-GPS tags  (Version 2.31, July 2016)
  */
-#define		GPSTAG_VERSIONID		0			/* 	Indicates the version of GPSInfoIFD.	 */
-#define		GPSTAG_LATITUDEREF		1			/* 	Indicates whether the latitude is north or south latitude.	 */
-#define		GPSTAG_LATITUDE		2			/* 	Indicates the latitude.	 */
-#define		GPSTAG_LONGITUDEREF		3			/* 	Indicates whether the longitude is east or west longitude.	 */
-#define		GPSTAG_LONGITUDE		4			/* 	Indicates the longitude.	 */
-#define		GPSTAG_ALTITUDEREF		5			/* 	Indicates the altitude used as the reference altitude.	 */
-#define		GPSTAG_ALTITUDE		6			/* 	Indicates the altitude based on the reference in GPSAltitudeRef.	 */
-#define		GPSTAG_TIMESTAMP		7			/* 	Indicates the time as UTC (Coordinated Universal Time).	 */
-#define		GPSTAG_SATELLITES		8			/* 	Indicates the GPS satellites used for measurements.	 */
-#define		GPSTAG_STATUS		9			/* 	Indicates the status of the GPS receiver when the image is recorded.	 */
-#define		GPSTAG_MEASUREMODE		10			/* 	Indicates the GPS measurement mode.	 */
-#define		GPSTAG_DOP		11			/* 	Indicates the GPS DOP (data degree of precision).	 */
-#define		GPSTAG_SPEEDREF		12			/* 	Indicates the unit used to express the GPS receiver speed of movement.	 */
-#define		GPSTAG_SPEED		13			/* 	Indicates the speed of GPS receiver movement.	 */
-#define		GPSTAG_TRACKREF		14			/* 	Indicates the reference for giving the direction of GPS receiver movement.	 */
-#define		GPSTAG_TRACK		15			/* 	Indicates the direction of GPS receiver movement.	 */
-#define		GPSTAG_IMGDIRECTIONREF		16			/* 	Indicates the reference for giving the direction of the image when it is captured.	 */
-#define		GPSTAG_IMGDIRECTION		17			/* 	Indicates the direction of the image when it was captured.	 */
-#define		GPSTAG_MAPDATUM		18			/* 	Indicates the geodetic survey data used by the GPS receiver. (e.g. WGS-84)	 */
-#define		GPSTAG_DESTLATITUDEREF		19			/* 	Indicates whether the latitude of the destination point is north or south latitude.	 */
-#define		GPSTAG_DESTLATITUDE		20			/* 	Indicates the latitude of the destination point.	 */
-#define		GPSTAG_DESTLONGITUDEREF		21			/* 	Indicates whether the longitude of the destination point is east or west longitude.	 */
-#define		GPSTAG_DESTLONGITUDE		22			/* 	Indicates the longitude of the destination point.	 */
-#define		GPSTAG_DESTBEARINGREF		23			/* 	Indicates the reference used for giving the bearing to the destination point.	 */
-#define		GPSTAG_DESTBEARING		24			/* 	Indicates the bearing to the destination point.	 */
-#define		GPSTAG_DESTDISTANCEREF		25			/* 	Indicates the unit used to express the distance to the destination point.	 */
-#define		GPSTAG_DESTDISTANCE		26			/* 	Indicates the distance to the destination point.	 */
-#define		GPSTAG_PROCESSINGMETHOD		27			/* 	A character string recording the name of the method used for location finding.	 */
-#define		GPSTAG_AREAINFORMATION		28			/* 	A character string recording the name of the GPS area.	 */
-#define		GPSTAG_DATESTAMP		29			/* 	A character string recording date and time information relative to UTC (Coordinated Universal Time).	 */
-#define		GPSTAG_DIFFERENTIAL		30			/* 	Indicates whether differential correction is applied to the GPS receiver.	 */
-#define		GPSTAG_GPSHPOSITIONINGERROR		31			/* Indicates horizontal positioning errors in meters.		 */
+#define GPSTAG_VERSIONID 0             /* Indicates the version of GPSInfoIFD. */
+#define GPSTAG_LATITUDEREF 1           /* Indicates whether the latitude is north or south latitude. */
+#define GPSTAG_LATITUDE 2              /* Indicates the latitude. */
+#define GPSTAG_LONGITUDEREF 3          /* Indicates whether the longitude is east or west longitude. */
+#define GPSTAG_LONGITUDE 4             /* Indicates the longitude. */
+#define GPSTAG_ALTITUDEREF 5           /* Indicates the altitude used as the reference altitude. */
+#define GPSTAG_ALTITUDE 6              /* Indicates the altitude based on the reference in GPSAltitudeRef. */
+#define GPSTAG_TIMESTAMP 7             /*Indicates the time as UTC (Coordinated Universal Time). */
+#define GPSTAG_SATELLITES 8            /*Indicates the GPS satellites used for measurements. */
+#define GPSTAG_STATUS 9                /* Indicates the status of the GPS receiver when the image is  recorded. */
+#define GPSTAG_MEASUREMODE 10          /* Indicates the GPS measurement mode. */
+#define GPSTAG_DOP 11                  /* Indicates the GPS DOP (data degree of precision). */
+#define GPSTAG_SPEEDREF 12             /* Indicates the unit used to express the GPS receiver speed of movement. */
+#define GPSTAG_SPEED 13                /* Indicates the speed of GPS receiver movement. */
+#define GPSTAG_TRACKREF 14             /* Indicates the reference for giving the direction of GPS receiver movement. */
+#define GPSTAG_TRACK 15                /* Indicates the direction of GPS receiver movement. */
+#define GPSTAG_IMGDIRECTIONREF 16      /* Indicates the reference for giving the direction of the image when it is captured. */
+#define GPSTAG_IMGDIRECTION 17         /* Indicates the direction of the image when it was captured. */
+#define GPSTAG_MAPDATUM 18             /* Indicates the geodetic survey data used by the GPS receiver. (e.g. WGS-84) */
+#define GPSTAG_DESTLATITUDEREF 19      /* Indicates whether the latitude of the destination point is north or south latitude. */
+#define GPSTAG_DESTLATITUDE 20         /* Indicates the latitude of the destination point. */
+#define GPSTAG_DESTLONGITUDEREF 21     /* Indicates whether the longitude of the destination point is east or west longitude. */
+#define GPSTAG_DESTLONGITUDE 22        /* Indicates the longitude of the destination point. */
+#define GPSTAG_DESTBEARINGREF 23       /* Indicates the reference used for giving the bearing to the destination point. */
+#define GPSTAG_DESTBEARING 24          /* Indicates the bearing to the destination point. */
+#define GPSTAG_DESTDISTANCEREF 25      /* Indicates the unit used to express the distance to the destination point. */
+#define GPSTAG_DESTDISTANCE 26         /* Indicates the distance to the destination point. */
+#define GPSTAG_PROCESSINGMETHOD 27     /* A character string recording the name of the method used for location finding. */
+#define GPSTAG_AREAINFORMATION 28      /* A character string recording the name of the GPS area. */
+#define GPSTAG_DATESTAMP 29            /* A character string recording date and time information relative to UTC (Coordinated Universal Time). */
+#define GPSTAG_DIFFERENTIAL 30         /* Indicates whether differential correction is applied to the GPS receiver. */
+#define GPSTAG_GPSHPOSITIONINGERROR 31 /* Indicates horizontal positioning errors in meters. */
 
 #endif /* _TIFF_ */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tiffconf.h.cmake.in b/3rdparty/libtiff/tiffconf.h.cmake.in
index 9b4b03282031..306874f5a7fc 100644
--- a/3rdparty/libtiff/tiffconf.h.cmake.in
+++ b/3rdparty/libtiff/tiffconf.h.cmake.in
@@ -4,9 +4,21 @@
   from this file in your programs.
 */
 
+/* clang-format off */
+/* clang-format disabled because CMake scripts are very sensitive to the
+ * formatting of this file. configure_file variables of type "@VAR@" are
+ * modified by clang-format and won't be substituted.
+ */
+
 #ifndef _TIFFCONF_
 #define _TIFFCONF_
 
+
+#include <stddef.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+
 /* Signed 16-bit type */
 #define TIFF_INT16_T @TIFF_INT16_T@
 
@@ -31,23 +43,26 @@
 /* Unsigned 8-bit type */
 #define TIFF_UINT8_T @TIFF_UINT8_T@
 
-/* Unsigned size type */
-#define TIFF_SIZE_T @TIFF_SIZE_T@
-
 /* Signed size type */
 #define TIFF_SSIZE_T @TIFF_SSIZE_T@
 
-/* Pointer difference type */
-#define TIFF_PTRDIFF_T @TIFF_PTRDIFF_T@
-
 /* Compatibility stuff. */
 
-/* Define as 0 or 1 according to the floating point format suported by the
+/* Define as 0 or 1 according to the floating point format supported by the
    machine */
 #cmakedefine HAVE_IEEEFP 1
 
-/* Set the native cpu bit order (FILLORDER_LSB2MSB or FILLORDER_MSB2LSB) */
-#define HOST_FILLORDER @HOST_FILLORDER@
+/* The concept of HOST_FILLORDER is broken. Since libtiff 4.5.1
+ * this macro will always be hardcoded to FILLORDER_LSB2MSB on all
+ * architectures, to reflect past long behavior of doing so on x86 architecture.
+ * Note however that the default FillOrder used by libtiff is FILLORDER_MSB2LSB,
+ * as mandated per the TIFF specification.
+ * The influence of HOST_FILLORDER is only when passing the 'H' mode in
+ * TIFFOpen().
+ * You should NOT rely on this macro to decide the CPU endianness!
+ * This macro will be removed in libtiff 4.6
+ */
+#define HOST_FILLORDER FILLORDER_LSB2MSB
 
 /* Native cpu byte order: 1 if big-endian (Motorola) or 0 if little-endian
    (Intel) */
@@ -62,6 +77,9 @@
 /* Support JBIG compression (requires JBIG-KIT library) */
 #cmakedefine JBIG_SUPPORT
 
+/* Support LERC compression */
+#cmakedefine LERC_SUPPORT 1
+
 /* Support LogLuv high dynamic range encoding */
 #cmakedefine LOGLUV_SUPPORT 1
 
@@ -91,8 +109,8 @@
 #cmakedefine LIBDEFLATE_SUPPORT 1
 
 /* Support strip chopping (whether or not to convert single-strip uncompressed
-   images to mutiple strips of ~8Kb to reduce memory usage) */
-#cmakedefine STRIPCHOP_DEFAULT 1
+   images to multiple strips of ~8Kb to reduce memory usage) */
+#cmakedefine STRIPCHOP_DEFAULT TIFF_STRIPCHOP
 
 /* Enable SubIFD tag (330) support */
 #cmakedefine SUBIFD_SUPPORT 1
@@ -122,3 +140,5 @@
 #define IPTC_SUPPORT
 
 #endif /* _TIFFCONF_ */
+
+/* clang-format on */
diff --git a/3rdparty/libtiff/tiffio.h b/3rdparty/libtiff/tiffio.h
index 6274f0989e81..20460542f668 100644
--- a/3rdparty/libtiff/tiffio.h
+++ b/3rdparty/libtiff/tiffio.h
@@ -23,7 +23,7 @@
  */
 
 #ifndef _TIFFIO_
-#define	_TIFFIO_
+#define _TIFFIO_
 
 /*
  * TIFF I/O Library Definitions.
@@ -60,20 +60,22 @@ typedef struct tiff TIFF;
  */
 /*
  * this is the machine addressing size type, only it's signed, so make it
- * int32 on 32bit machines, int64 on 64bit machines
+ * int32_t on 32bit machines, int64_t on 64bit machines
  */
 typedef TIFF_SSIZE_T tmsize_t;
-typedef uint64 toff_t;          /* file offset */
+#define TIFF_TMSIZE_T_MAX (tmsize_t)(SIZE_MAX >> 1)
+
+typedef uint64_t toff_t; /* file offset */
 /* the following are deprecated and should be replaced by their defining
    counterparts */
-typedef uint32 ttag_t;          /* directory tag */
-typedef uint16 tdir_t;          /* directory index */
-typedef uint16 tsample_t;       /* sample number */
-typedef uint32 tstrile_t;       /* strip or tile number */
-typedef tstrile_t tstrip_t;     /* strip number */
-typedef tstrile_t ttile_t;      /* tile number */
-typedef tmsize_t tsize_t;       /* i/o size in bytes */
-typedef void* tdata_t;          /* image data ref */
+typedef uint32_t ttag_t;    /* directory tag */
+typedef uint32_t tdir_t;    /* directory index */
+typedef uint16_t tsample_t; /* sample number */
+typedef uint32_t tstrile_t; /* strip or tile number */
+typedef tstrile_t tstrip_t; /* strip number */
+typedef tstrile_t ttile_t;  /* tile number */
+typedef tmsize_t tsize_t;   /* i/o size in bytes */
+typedef void *tdata_t;      /* image data ref */
 
 #if !defined(__WIN32__) && (defined(_WIN32) || defined(WIN32))
 #define __WIN32__
@@ -87,21 +89,22 @@ typedef void* tdata_t;          /* image data ref */
  */
 
 #if defined(_WINDOWS) || defined(__WIN32__) || defined(_Windows)
-#  if !defined(__CYGWIN) && !defined(AVOID_WIN32_FILEIO) && !defined(USE_WIN32_FILEIO)
-#    define AVOID_WIN32_FILEIO
-#  endif
+#if !defined(__CYGWIN) && !defined(AVOID_WIN32_FILEIO) &&                      \
+    !defined(USE_WIN32_FILEIO)
+#define AVOID_WIN32_FILEIO
+#endif
 #endif
 
 #if defined(USE_WIN32_FILEIO)
-# define VC_EXTRALEAN
-# include <windows.h>
-# ifdef __WIN32__
-DECLARE_HANDLE(thandle_t);     /* Win32 file handle */
-# else
-typedef HFILE thandle_t;       /* client data handle */
-# endif /* __WIN32__ */
+#define VC_EXTRALEAN
+#include <windows.h>
+#ifdef __WIN32__
+DECLARE_HANDLE(thandle_t); /* Win32 file handle */
+#else
+typedef HFILE thandle_t; /* client data handle */
+#endif /* __WIN32__ */
 #else
-typedef void* thandle_t;       /* client data handle */
+typedef void *thandle_t; /* client data handle */
 #endif /* USE_WIN32_FILEIO */
 
 /*
@@ -110,15 +113,15 @@ typedef void* thandle_t;       /* client data handle */
  * very large.   Bit-or these flags to enable printing
  * multiple items.
  */
-#define TIFFPRINT_NONE	       0x0    /* no extra info */
-#define TIFFPRINT_STRIPS       0x1    /* strips/tiles info */
-#define TIFFPRINT_CURVES       0x2    /* color/gray response curves */
-#define TIFFPRINT_COLORMAP     0x4    /* colormap */
-#define TIFFPRINT_JPEGQTABLES  0x100  /* JPEG Q matrices */
-#define TIFFPRINT_JPEGACTABLES 0x200  /* JPEG AC tables */
-#define TIFFPRINT_JPEGDCTABLES 0x200  /* JPEG DC tables */
-
-/* 
+#define TIFFPRINT_NONE 0x0           /* no extra info */
+#define TIFFPRINT_STRIPS 0x1         /* strips/tiles info */
+#define TIFFPRINT_CURVES 0x2         /* color/gray response curves */
+#define TIFFPRINT_COLORMAP 0x4       /* colormap */
+#define TIFFPRINT_JPEGQTABLES 0x100  /* JPEG Q matrices */
+#define TIFFPRINT_JPEGACTABLES 0x200 /* JPEG AC tables */
+#define TIFFPRINT_JPEGDCTABLES 0x200 /* JPEG DC tables */
+
+/*
  * Colour conversion stuff
  */
 
@@ -133,42 +136,45 @@ typedef void* thandle_t;       /* client data handle */
 
 /* Structure for holding information about a display device. */
 
-typedef unsigned char TIFFRGBValue;               /* 8-bit samples */
-
-typedef struct {
-	float d_mat[3][3];                        /* XYZ -> luminance matrix */
-	float d_YCR;                              /* Light o/p for reference white */
-	float d_YCG;
-	float d_YCB;
-	uint32 d_Vrwr;                            /* Pixel values for ref. white */
-	uint32 d_Vrwg;
-	uint32 d_Vrwb;
-	float d_Y0R;                              /* Residual light for black pixel */
-	float d_Y0G;
-	float d_Y0B;
-	float d_gammaR;                           /* Gamma values for the three guns */
-	float d_gammaG;
-	float d_gammaB;
+typedef unsigned char TIFFRGBValue; /* 8-bit samples */
+
+typedef struct
+{
+    float d_mat[3][3]; /* XYZ -> luminance matrix */
+    float d_YCR;       /* Light o/p for reference white */
+    float d_YCG;
+    float d_YCB;
+    uint32_t d_Vrwr; /* Pixel values for ref. white */
+    uint32_t d_Vrwg;
+    uint32_t d_Vrwb;
+    float d_Y0R; /* Residual light for black pixel */
+    float d_Y0G;
+    float d_Y0B;
+    float d_gammaR; /* Gamma values for the three guns */
+    float d_gammaG;
+    float d_gammaB;
 } TIFFDisplay;
 
-typedef struct {                                  /* YCbCr->RGB support */
-	TIFFRGBValue* clamptab;                   /* range clamping table */
-	int* Cr_r_tab;
-	int* Cb_b_tab;
-	int32* Cr_g_tab;
-	int32* Cb_g_tab;
-	int32* Y_tab;
+typedef struct
+{                           /* YCbCr->RGB support */
+    TIFFRGBValue *clamptab; /* range clamping table */
+    int *Cr_r_tab;
+    int *Cb_b_tab;
+    int32_t *Cr_g_tab;
+    int32_t *Cb_g_tab;
+    int32_t *Y_tab;
 } TIFFYCbCrToRGB;
 
-typedef struct {                                  /* CIE Lab 1976->RGB support */
-	int range;                                /* Size of conversion table */
+typedef struct
+{              /* CIE Lab 1976->RGB support */
+    int range; /* Size of conversion table */
 #define CIELABTORGB_TABLE_RANGE 1500
-	float rstep, gstep, bstep;
-	float X0, Y0, Z0;                         /* Reference white point */
-	TIFFDisplay display;
-	float Yr2r[CIELABTORGB_TABLE_RANGE + 1];  /* Conversion of Yr to r */
-	float Yg2g[CIELABTORGB_TABLE_RANGE + 1];  /* Conversion of Yg to g */
-	float Yb2b[CIELABTORGB_TABLE_RANGE + 1];  /* Conversion of Yb to b */
+    float rstep, gstep, bstep;
+    float X0, Y0, Z0; /* Reference white point */
+    TIFFDisplay display;
+    float Yr2r[CIELABTORGB_TABLE_RANGE + 1]; /* Conversion of Yr to r */
+    float Yg2g[CIELABTORGB_TABLE_RANGE + 1]; /* Conversion of Yg to g */
+    float Yb2b[CIELABTORGB_TABLE_RANGE + 1]; /* Conversion of Yb to b */
 } TIFFCIELabToRGB;
 
 /*
@@ -178,63 +184,66 @@ typedef struct _TIFFRGBAImage TIFFRGBAImage;
 /*
  * The image reading and conversion routines invoke
  * ``put routines'' to copy/image/whatever tiles of
- * raw image data.  A default set of routines are 
+ * raw image data.  A default set of routines are
  * provided to convert/copy raw image data to 8-bit
  * packed ABGR format rasters.  Applications can supply
  * alternate routines that unpack the data into a
  * different format or, for example, unpack the data
  * and draw the unpacked raster on the display.
  */
-typedef void (*tileContigRoutine)
-    (TIFFRGBAImage*, uint32*, uint32, uint32, uint32, uint32, int32, int32,
-	unsigned char*);
-typedef void (*tileSeparateRoutine)
-    (TIFFRGBAImage*, uint32*, uint32, uint32, uint32, uint32, int32, int32,
-	unsigned char*, unsigned char*, unsigned char*, unsigned char*);
+typedef void (*tileContigRoutine)(TIFFRGBAImage *, uint32_t *, uint32_t,
+                                  uint32_t, uint32_t, uint32_t, int32_t,
+                                  int32_t, unsigned char *);
+typedef void (*tileSeparateRoutine)(TIFFRGBAImage *, uint32_t *, uint32_t,
+                                    uint32_t, uint32_t, uint32_t, int32_t,
+                                    int32_t, unsigned char *, unsigned char *,
+                                    unsigned char *, unsigned char *);
 /*
  * RGBA-reader state.
  */
-struct _TIFFRGBAImage {
-	TIFF* tif;                              /* image handle */
-	int stoponerr;                          /* stop on read error */
-	int isContig;                           /* data is packed/separate */
-	int alpha;                              /* type of alpha data present */
-	uint32 width;                           /* image width */
-	uint32 height;                          /* image height */
-	uint16 bitspersample;                   /* image bits/sample */
-	uint16 samplesperpixel;                 /* image samples/pixel */
-	uint16 orientation;                     /* image orientation */
-	uint16 req_orientation;                 /* requested orientation */
-	uint16 photometric;                     /* image photometric interp */
-	uint16* redcmap;                        /* colormap palette */
-	uint16* greencmap;
-	uint16* bluecmap;
-	/* get image data routine */
-	int (*get)(TIFFRGBAImage*, uint32*, uint32, uint32);
-	/* put decoded strip/tile */
-	union {
-	    void (*any)(TIFFRGBAImage*);
-	    tileContigRoutine contig;
-	    tileSeparateRoutine separate;
-	} put;
-	TIFFRGBValue* Map;                      /* sample mapping array */
-	uint32** BWmap;                         /* black&white map */
-	uint32** PALmap;                        /* palette image map */
-	TIFFYCbCrToRGB* ycbcr;                  /* YCbCr conversion state */
-	TIFFCIELabToRGB* cielab;                /* CIE L*a*b conversion state */
-
-	uint8* UaToAa;                          /* Unassociated alpha to associated alpha conversion LUT */
-	uint8* Bitdepth16To8;                   /* LUT for conversion from 16bit to 8bit values */
-
-	int row_offset;
-	int col_offset;
+struct _TIFFRGBAImage
+{
+    TIFF *tif;                /* image handle */
+    int stoponerr;            /* stop on read error */
+    int isContig;             /* data is packed/separate */
+    int alpha;                /* type of alpha data present */
+    uint32_t width;           /* image width */
+    uint32_t height;          /* image height */
+    uint16_t bitspersample;   /* image bits/sample */
+    uint16_t samplesperpixel; /* image samples/pixel */
+    uint16_t orientation;     /* image orientation */
+    uint16_t req_orientation; /* requested orientation */
+    uint16_t photometric;     /* image photometric interp */
+    uint16_t *redcmap;        /* colormap palette */
+    uint16_t *greencmap;
+    uint16_t *bluecmap;
+    /* get image data routine */
+    int (*get)(TIFFRGBAImage *, uint32_t *, uint32_t, uint32_t);
+    /* put decoded strip/tile */
+    union
+    {
+        void (*any)(TIFFRGBAImage *);
+        tileContigRoutine contig;
+        tileSeparateRoutine separate;
+    } put;
+    TIFFRGBValue *Map;       /* sample mapping array */
+    uint32_t **BWmap;        /* black&white map */
+    uint32_t **PALmap;       /* palette image map */
+    TIFFYCbCrToRGB *ycbcr;   /* YCbCr conversion state */
+    TIFFCIELabToRGB *cielab; /* CIE L*a*b conversion state */
+
+    uint8_t *UaToAa; /* Unassociated alpha to associated alpha conversion LUT */
+    uint8_t *Bitdepth16To8; /* LUT for conversion from 16bit to 8bit values */
+
+    int row_offset;
+    int col_offset;
 };
 
 /*
  * Macros for extracting components from the
  * packed ABGR form returned by TIFFReadRGBAImage.
  */
-#define TIFFGetR(abgr) ((abgr) & 0xff)
+#define TIFFGetR(abgr) ((abgr)&0xff)
 #define TIFFGetG(abgr) (((abgr) >> 8) & 0xff)
 #define TIFFGetB(abgr) (((abgr) >> 16) & 0xff)
 #define TIFFGetA(abgr) (((abgr) >> 24) & 0xff)
@@ -246,327 +255,399 @@ struct _TIFFRGBAImage {
  * More codecs may be registered through calls to the library
  * and/or the builtin implementations may be overridden.
  */
-typedef int (*TIFFInitMethod)(TIFF*, int);
-typedef struct {
-	char* name;
-	uint16 scheme;
-	TIFFInitMethod init;
+typedef int (*TIFFInitMethod)(TIFF *, int);
+typedef struct
+{
+    char *name;
+    uint16_t scheme;
+    TIFFInitMethod init;
 } TIFFCodec;
 
-#include <stdio.h>
+typedef struct
+{
+    uint32_t uNum;
+    uint32_t uDenom;
+} TIFFRational_t;
+
 #include <stdarg.h>
+#include <stdio.h>
 
 /* share internal LogLuv conversion routines? */
 #ifndef LOGLUV_PUBLIC
 #define LOGLUV_PUBLIC 1
 #endif
 
-#if defined(__GNUC__) || defined(__attribute__)
-#  define TIFF_ATTRIBUTE(x)    __attribute__(x)
+#if defined(__GNUC__) || defined(__clang__) || defined(__attribute__)
+#define TIFF_ATTRIBUTE(x) __attribute__(x)
 #else
-#  define TIFF_ATTRIBUTE(x) /*nothing*/
+#define TIFF_ATTRIBUTE(x) /*nothing*/
 #endif
 
 #if defined(c_plusplus) || defined(__cplusplus)
-extern "C" {
+extern "C"
+{
 #endif
-typedef void (*TIFFErrorHandler)(const char*, const char*, va_list);
-typedef void (*TIFFErrorHandlerExt)(thandle_t, const char*, const char*, va_list);
-typedef tmsize_t (*TIFFReadWriteProc)(thandle_t, void*, tmsize_t);
-typedef toff_t (*TIFFSeekProc)(thandle_t, toff_t, int);
-typedef int (*TIFFCloseProc)(thandle_t);
-typedef toff_t (*TIFFSizeProc)(thandle_t);
-typedef int (*TIFFMapFileProc)(thandle_t, void** base, toff_t* size);
-typedef void (*TIFFUnmapFileProc)(thandle_t, void* base, toff_t size);
-typedef void (*TIFFExtendProc)(TIFF*);
-
-extern const char* TIFFGetVersion(void);
-
-extern const TIFFCodec* TIFFFindCODEC(uint16);
-extern TIFFCodec* TIFFRegisterCODEC(uint16, const char*, TIFFInitMethod);
-extern void TIFFUnRegisterCODEC(TIFFCodec*);
-extern int TIFFIsCODECConfigured(uint16);
-extern TIFFCodec* TIFFGetConfiguredCODECs(void);
-
-/*
- * Auxiliary functions.
- */
-
-extern void* _TIFFmalloc(tmsize_t s);
-extern void* _TIFFcalloc(tmsize_t nmemb, tmsize_t siz);
-extern void* _TIFFrealloc(void* p, tmsize_t s);
-extern void _TIFFmemset(void* p, int v, tmsize_t c);
-extern void _TIFFmemcpy(void* d, const void* s, tmsize_t c);
-extern int _TIFFmemcmp(const void* p1, const void* p2, tmsize_t c);
-extern void _TIFFfree(void* p);
-
-/*
-** Stuff, related to tag handling and creating custom tags.
-*/
-extern int TIFFGetTagListCount( TIFF * );
-extern uint32 TIFFGetTagListEntry( TIFF *, int tag_index );
-    
-#define TIFF_ANY       TIFF_NOTYPE     /* for field descriptor searching */
-#define TIFF_VARIABLE  -1              /* marker for variable length tags */
-#define TIFF_SPP       -2              /* marker for SamplesPerPixel tags */
-#define TIFF_VARIABLE2 -3              /* marker for uint32 var-length tags */
-
-#define FIELD_CUSTOM    65
-
-typedef struct _TIFFField TIFFField;
-typedef struct _TIFFFieldArray TIFFFieldArray;
-
-extern const TIFFField* TIFFFindField(TIFF *, uint32, TIFFDataType);
-extern const TIFFField* TIFFFieldWithTag(TIFF*, uint32);
-extern const TIFFField* TIFFFieldWithName(TIFF*, const char *);
-
-extern uint32 TIFFFieldTag(const TIFFField*);
-extern const char* TIFFFieldName(const TIFFField*);
-extern TIFFDataType TIFFFieldDataType(const TIFFField*);
-extern int TIFFFieldPassCount(const TIFFField*);
-extern int TIFFFieldReadCount(const TIFFField*);
-extern int TIFFFieldWriteCount(const TIFFField*);
-
-typedef int (*TIFFVSetMethod)(TIFF*, uint32, va_list);
-typedef int (*TIFFVGetMethod)(TIFF*, uint32, va_list);
-typedef void (*TIFFPrintMethod)(TIFF*, FILE*, long);
-
-typedef struct {
-    TIFFVSetMethod vsetfield; /* tag set routine */
-    TIFFVGetMethod vgetfield; /* tag get routine */
-    TIFFPrintMethod printdir; /* directory print routine */
-} TIFFTagMethods;
-
-extern  TIFFTagMethods *TIFFAccessTagMethods(TIFF *);
-extern  void *TIFFGetClientInfo(TIFF *, const char *);
-extern  void TIFFSetClientInfo(TIFF *, void *, const char *);
-
-extern void TIFFCleanup(TIFF* tif);
-extern void TIFFClose(TIFF* tif);
-extern int TIFFFlush(TIFF* tif);
-extern int TIFFFlushData(TIFF* tif);
-extern int TIFFGetField(TIFF* tif, uint32 tag, ...);
-extern int TIFFVGetField(TIFF* tif, uint32 tag, va_list ap);
-extern int TIFFGetFieldDefaulted(TIFF* tif, uint32 tag, ...);
-extern int TIFFVGetFieldDefaulted(TIFF* tif, uint32 tag, va_list ap);
-extern int TIFFReadDirectory(TIFF* tif);
-extern int TIFFReadCustomDirectory(TIFF* tif, toff_t diroff, const TIFFFieldArray* infoarray);
-extern int TIFFReadEXIFDirectory(TIFF* tif, toff_t diroff);
-extern int TIFFReadGPSDirectory(TIFF* tif, toff_t diroff);
-extern uint64 TIFFScanlineSize64(TIFF* tif);
-extern tmsize_t TIFFScanlineSize(TIFF* tif);
-extern uint64 TIFFRasterScanlineSize64(TIFF* tif);
-extern tmsize_t TIFFRasterScanlineSize(TIFF* tif);
-extern uint64 TIFFStripSize64(TIFF* tif);
-extern tmsize_t TIFFStripSize(TIFF* tif);
-extern uint64 TIFFRawStripSize64(TIFF* tif, uint32 strip);
-extern tmsize_t TIFFRawStripSize(TIFF* tif, uint32 strip);
-extern uint64 TIFFVStripSize64(TIFF* tif, uint32 nrows);
-extern tmsize_t TIFFVStripSize(TIFF* tif, uint32 nrows);
-extern uint64 TIFFTileRowSize64(TIFF* tif);
-extern tmsize_t TIFFTileRowSize(TIFF* tif);
-extern uint64 TIFFTileSize64(TIFF* tif);
-extern tmsize_t TIFFTileSize(TIFF* tif);
-extern uint64 TIFFVTileSize64(TIFF* tif, uint32 nrows);
-extern tmsize_t TIFFVTileSize(TIFF* tif, uint32 nrows);
-extern uint32 TIFFDefaultStripSize(TIFF* tif, uint32 request);
-extern void TIFFDefaultTileSize(TIFF*, uint32*, uint32*);
-extern int TIFFFileno(TIFF*);
-extern int TIFFSetFileno(TIFF*, int);
-extern thandle_t TIFFClientdata(TIFF*);
-extern thandle_t TIFFSetClientdata(TIFF*, thandle_t);
-extern int TIFFGetMode(TIFF*);
-extern int TIFFSetMode(TIFF*, int);
-extern int TIFFIsTiled(TIFF*);
-extern int TIFFIsByteSwapped(TIFF*);
-extern int TIFFIsUpSampled(TIFF*);
-extern int TIFFIsMSB2LSB(TIFF*);
-extern int TIFFIsBigEndian(TIFF*);
-extern TIFFReadWriteProc TIFFGetReadProc(TIFF*);
-extern TIFFReadWriteProc TIFFGetWriteProc(TIFF*);
-extern TIFFSeekProc TIFFGetSeekProc(TIFF*);                                                          
-extern TIFFCloseProc TIFFGetCloseProc(TIFF*);
-extern TIFFSizeProc TIFFGetSizeProc(TIFF*);
-extern TIFFMapFileProc TIFFGetMapFileProc(TIFF*);
-extern TIFFUnmapFileProc TIFFGetUnmapFileProc(TIFF*);
-extern uint32 TIFFCurrentRow(TIFF*);
-extern uint16 TIFFCurrentDirectory(TIFF*);
-extern uint16 TIFFNumberOfDirectories(TIFF*);
-extern uint64 TIFFCurrentDirOffset(TIFF*);
-extern uint32 TIFFCurrentStrip(TIFF*);
-extern uint32 TIFFCurrentTile(TIFF* tif);
-extern int TIFFReadBufferSetup(TIFF* tif, void* bp, tmsize_t size);
-extern int TIFFWriteBufferSetup(TIFF* tif, void* bp, tmsize_t size);  
-extern int TIFFSetupStrips(TIFF *);
-extern int TIFFWriteCheck(TIFF*, int, const char *);
-extern void TIFFFreeDirectory(TIFF*);
-extern int TIFFCreateDirectory(TIFF*);
-extern int TIFFCreateCustomDirectory(TIFF*,const TIFFFieldArray*);
-extern int TIFFCreateEXIFDirectory(TIFF*);
-extern int TIFFCreateGPSDirectory(TIFF*);
-extern int TIFFLastDirectory(TIFF*);
-extern int TIFFSetDirectory(TIFF*, uint16);
-extern int TIFFSetSubDirectory(TIFF*, uint64);
-extern int TIFFUnlinkDirectory(TIFF*, uint16);
-extern int TIFFSetField(TIFF*, uint32, ...);
-extern int TIFFVSetField(TIFF*, uint32, va_list);
-extern int TIFFUnsetField(TIFF*, uint32);
-extern int TIFFWriteDirectory(TIFF *);
-extern int TIFFWriteCustomDirectory(TIFF *, uint64 *);
-extern int TIFFCheckpointDirectory(TIFF *);
-extern int TIFFRewriteDirectory(TIFF *);
-extern int TIFFDeferStrileArrayWriting(TIFF *);
-extern int TIFFForceStrileArrayWriting(TIFF* );
+    typedef void (*TIFFErrorHandler)(const char *, const char *, va_list);
+    typedef void (*TIFFErrorHandlerExt)(thandle_t, const char *, const char *,
+                                        va_list);
+    typedef int (*TIFFErrorHandlerExtR)(TIFF *, void *user_data, const char *,
+                                        const char *, va_list);
+    typedef tmsize_t (*TIFFReadWriteProc)(thandle_t, void *, tmsize_t);
+    typedef toff_t (*TIFFSeekProc)(thandle_t, toff_t, int);
+    typedef int (*TIFFCloseProc)(thandle_t);
+    typedef toff_t (*TIFFSizeProc)(thandle_t);
+    typedef int (*TIFFMapFileProc)(thandle_t, void **base, toff_t *size);
+    typedef void (*TIFFUnmapFileProc)(thandle_t, void *base, toff_t size);
+    typedef void (*TIFFExtendProc)(TIFF *);
+
+    extern const char *TIFFGetVersion(void);
+
+    extern const TIFFCodec *TIFFFindCODEC(uint16_t);
+    extern TIFFCodec *TIFFRegisterCODEC(uint16_t, const char *, TIFFInitMethod);
+    extern void TIFFUnRegisterCODEC(TIFFCodec *);
+    extern int TIFFIsCODECConfigured(uint16_t);
+    extern TIFFCodec *TIFFGetConfiguredCODECs(void);
+
+    /*
+     * Auxiliary functions.
+     */
+
+    extern void *_TIFFmalloc(tmsize_t s);
+    extern void *_TIFFcalloc(tmsize_t nmemb, tmsize_t siz);
+    extern void *_TIFFrealloc(void *p, tmsize_t s);
+    extern void _TIFFmemset(void *p, int v, tmsize_t c);
+    extern void _TIFFmemcpy(void *d, const void *s, tmsize_t c);
+    extern int _TIFFmemcmp(const void *p1, const void *p2, tmsize_t c);
+    extern void _TIFFfree(void *p);
+
+    /*
+    ** Stuff, related to tag handling and creating custom tags.
+    */
+    extern int TIFFGetTagListCount(TIFF *);
+    extern uint32_t TIFFGetTagListEntry(TIFF *, int tag_index);
+
+#define TIFF_ANY TIFF_NOTYPE /* for field descriptor searching */
+#define TIFF_VARIABLE -1     /* marker for variable length tags */
+#define TIFF_SPP -2          /* marker for SamplesPerPixel tags */
+#define TIFF_VARIABLE2 -3    /* marker for uint32_t var-length tags */
+
+#define FIELD_CUSTOM 65
+
+    typedef struct _TIFFField TIFFField;
+    typedef struct _TIFFFieldArray TIFFFieldArray;
+
+    extern const TIFFField *TIFFFindField(TIFF *, uint32_t, TIFFDataType);
+    extern const TIFFField *TIFFFieldWithTag(TIFF *, uint32_t);
+    extern const TIFFField *TIFFFieldWithName(TIFF *, const char *);
+
+    extern uint32_t TIFFFieldTag(const TIFFField *);
+    extern const char *TIFFFieldName(const TIFFField *);
+    extern TIFFDataType TIFFFieldDataType(const TIFFField *);
+    extern int TIFFFieldPassCount(const TIFFField *);
+    extern int TIFFFieldReadCount(const TIFFField *);
+    extern int TIFFFieldWriteCount(const TIFFField *);
+    extern int
+    TIFFFieldSetGetSize(const TIFFField *); /* returns internal storage size of
+                                               TIFFSetGetFieldType in bytes. */
+    extern int TIFFFieldSetGetCountSize(
+        const TIFFField *); /* returns size of count parameter 0=none,
+                               2=uint16_t, 4=uint32_t */
+    extern int TIFFFieldIsAnonymous(const TIFFField *);
+
+    typedef int (*TIFFVSetMethod)(TIFF *, uint32_t, va_list);
+    typedef int (*TIFFVGetMethod)(TIFF *, uint32_t, va_list);
+    typedef void (*TIFFPrintMethod)(TIFF *, FILE *, long);
+
+    typedef struct
+    {
+        TIFFVSetMethod vsetfield; /* tag set routine */
+        TIFFVGetMethod vgetfield; /* tag get routine */
+        TIFFPrintMethod printdir; /* directory print routine */
+    } TIFFTagMethods;
+
+    extern TIFFTagMethods *TIFFAccessTagMethods(TIFF *);
+    extern void *TIFFGetClientInfo(TIFF *, const char *);
+    extern void TIFFSetClientInfo(TIFF *, void *, const char *);
+
+    extern void TIFFCleanup(TIFF *tif);
+    extern void TIFFClose(TIFF *tif);
+    extern int TIFFFlush(TIFF *tif);
+    extern int TIFFFlushData(TIFF *tif);
+    extern int TIFFGetField(TIFF *tif, uint32_t tag, ...);
+    extern int TIFFVGetField(TIFF *tif, uint32_t tag, va_list ap);
+    extern int TIFFGetFieldDefaulted(TIFF *tif, uint32_t tag, ...);
+    extern int TIFFVGetFieldDefaulted(TIFF *tif, uint32_t tag, va_list ap);
+    extern int TIFFReadDirectory(TIFF *tif);
+    extern int TIFFReadCustomDirectory(TIFF *tif, toff_t diroff,
+                                       const TIFFFieldArray *infoarray);
+    extern int TIFFReadEXIFDirectory(TIFF *tif, toff_t diroff);
+    extern int TIFFReadGPSDirectory(TIFF *tif, toff_t diroff);
+    extern uint64_t TIFFScanlineSize64(TIFF *tif);
+    extern tmsize_t TIFFScanlineSize(TIFF *tif);
+    extern uint64_t TIFFRasterScanlineSize64(TIFF *tif);
+    extern tmsize_t TIFFRasterScanlineSize(TIFF *tif);
+    extern uint64_t TIFFStripSize64(TIFF *tif);
+    extern tmsize_t TIFFStripSize(TIFF *tif);
+    extern uint64_t TIFFRawStripSize64(TIFF *tif, uint32_t strip);
+    extern tmsize_t TIFFRawStripSize(TIFF *tif, uint32_t strip);
+    extern uint64_t TIFFVStripSize64(TIFF *tif, uint32_t nrows);
+    extern tmsize_t TIFFVStripSize(TIFF *tif, uint32_t nrows);
+    extern uint64_t TIFFTileRowSize64(TIFF *tif);
+    extern tmsize_t TIFFTileRowSize(TIFF *tif);
+    extern uint64_t TIFFTileSize64(TIFF *tif);
+    extern tmsize_t TIFFTileSize(TIFF *tif);
+    extern uint64_t TIFFVTileSize64(TIFF *tif, uint32_t nrows);
+    extern tmsize_t TIFFVTileSize(TIFF *tif, uint32_t nrows);
+    extern uint32_t TIFFDefaultStripSize(TIFF *tif, uint32_t request);
+    extern void TIFFDefaultTileSize(TIFF *, uint32_t *, uint32_t *);
+    extern int TIFFFileno(TIFF *);
+    extern int TIFFSetFileno(TIFF *, int);
+    extern thandle_t TIFFClientdata(TIFF *);
+    extern thandle_t TIFFSetClientdata(TIFF *, thandle_t);
+    extern int TIFFGetMode(TIFF *);
+    extern int TIFFSetMode(TIFF *, int);
+    extern int TIFFIsTiled(TIFF *);
+    extern int TIFFIsByteSwapped(TIFF *);
+    extern int TIFFIsUpSampled(TIFF *);
+    extern int TIFFIsMSB2LSB(TIFF *);
+    extern int TIFFIsBigEndian(TIFF *);
+    extern int TIFFIsBigTIFF(TIFF *);
+    extern TIFFReadWriteProc TIFFGetReadProc(TIFF *);
+    extern TIFFReadWriteProc TIFFGetWriteProc(TIFF *);
+    extern TIFFSeekProc TIFFGetSeekProc(TIFF *);
+    extern TIFFCloseProc TIFFGetCloseProc(TIFF *);
+    extern TIFFSizeProc TIFFGetSizeProc(TIFF *);
+    extern TIFFMapFileProc TIFFGetMapFileProc(TIFF *);
+    extern TIFFUnmapFileProc TIFFGetUnmapFileProc(TIFF *);
+    extern uint32_t TIFFCurrentRow(TIFF *);
+    extern tdir_t TIFFCurrentDirectory(TIFF *);
+    extern tdir_t TIFFNumberOfDirectories(TIFF *);
+    extern uint64_t TIFFCurrentDirOffset(TIFF *);
+    extern uint32_t TIFFCurrentStrip(TIFF *);
+    extern uint32_t TIFFCurrentTile(TIFF *tif);
+    extern int TIFFReadBufferSetup(TIFF *tif, void *bp, tmsize_t size);
+    extern int TIFFWriteBufferSetup(TIFF *tif, void *bp, tmsize_t size);
+    extern int TIFFSetupStrips(TIFF *);
+    extern int TIFFWriteCheck(TIFF *, int, const char *);
+    extern void TIFFFreeDirectory(TIFF *);
+    extern int TIFFCreateDirectory(TIFF *);
+    extern int TIFFCreateCustomDirectory(TIFF *, const TIFFFieldArray *);
+    extern int TIFFCreateEXIFDirectory(TIFF *);
+    extern int TIFFCreateGPSDirectory(TIFF *);
+    extern int TIFFLastDirectory(TIFF *);
+    extern int TIFFSetDirectory(TIFF *, tdir_t);
+    extern int TIFFSetSubDirectory(TIFF *, uint64_t);
+    extern int TIFFUnlinkDirectory(TIFF *, tdir_t);
+    extern int TIFFSetField(TIFF *, uint32_t, ...);
+    extern int TIFFVSetField(TIFF *, uint32_t, va_list);
+    extern int TIFFUnsetField(TIFF *, uint32_t);
+    extern int TIFFWriteDirectory(TIFF *);
+    extern int TIFFWriteCustomDirectory(TIFF *, uint64_t *);
+    extern int TIFFCheckpointDirectory(TIFF *);
+    extern int TIFFRewriteDirectory(TIFF *);
+    extern int TIFFDeferStrileArrayWriting(TIFF *);
+    extern int TIFFForceStrileArrayWriting(TIFF *);
 
 #if defined(c_plusplus) || defined(__cplusplus)
-extern void TIFFPrintDirectory(TIFF*, FILE*, long = 0);
-extern int TIFFReadScanline(TIFF* tif, void* buf, uint32 row, uint16 sample = 0);
-extern int TIFFWriteScanline(TIFF* tif, void* buf, uint32 row, uint16 sample = 0);
-extern int TIFFReadRGBAImage(TIFF*, uint32, uint32, uint32*, int = 0);
-extern int TIFFReadRGBAImageOriented(TIFF*, uint32, uint32, uint32*,
-    int = ORIENTATION_BOTLEFT, int = 0);
+    extern void TIFFPrintDirectory(TIFF *, FILE *, long = 0);
+    extern int TIFFReadScanline(TIFF *tif, void *buf, uint32_t row,
+                                uint16_t sample = 0);
+    extern int TIFFWriteScanline(TIFF *tif, void *buf, uint32_t row,
+                                 uint16_t sample = 0);
+    extern int TIFFReadRGBAImage(TIFF *, uint32_t, uint32_t, uint32_t *,
+                                 int = 0);
+    extern int TIFFReadRGBAImageOriented(TIFF *, uint32_t, uint32_t, uint32_t *,
+                                         int = ORIENTATION_BOTLEFT, int = 0);
 #else
-extern void TIFFPrintDirectory(TIFF*, FILE*, long);
-extern int TIFFReadScanline(TIFF* tif, void* buf, uint32 row, uint16 sample);
-extern int TIFFWriteScanline(TIFF* tif, void* buf, uint32 row, uint16 sample);
-extern int TIFFReadRGBAImage(TIFF*, uint32, uint32, uint32*, int);
-extern int TIFFReadRGBAImageOriented(TIFF*, uint32, uint32, uint32*, int, int);
+extern void TIFFPrintDirectory(TIFF *, FILE *, long);
+extern int TIFFReadScanline(TIFF *tif, void *buf, uint32_t row,
+                            uint16_t sample);
+extern int TIFFWriteScanline(TIFF *tif, void *buf, uint32_t row,
+                             uint16_t sample);
+extern int TIFFReadRGBAImage(TIFF *, uint32_t, uint32_t, uint32_t *, int);
+extern int TIFFReadRGBAImageOriented(TIFF *, uint32_t, uint32_t, uint32_t *,
+                                     int, int);
 #endif
 
-extern int TIFFReadRGBAStrip(TIFF*, uint32, uint32 * );
-extern int TIFFReadRGBATile(TIFF*, uint32, uint32, uint32 * );
-extern int TIFFReadRGBAStripExt(TIFF*, uint32, uint32 *, int stop_on_error );
-extern int TIFFReadRGBATileExt(TIFF*, uint32, uint32, uint32 *, int stop_on_error );
-extern int TIFFRGBAImageOK(TIFF*, char [1024]);
-extern int TIFFRGBAImageBegin(TIFFRGBAImage*, TIFF*, int, char [1024]);
-extern int TIFFRGBAImageGet(TIFFRGBAImage*, uint32*, uint32, uint32);
-extern void TIFFRGBAImageEnd(TIFFRGBAImage*);
-extern TIFF* TIFFOpen(const char*, const char*);
-# ifdef __WIN32__
-extern TIFF* TIFFOpenW(const wchar_t*, const char*);
-# endif /* __WIN32__ */
-extern TIFF* TIFFFdOpen(int, const char*, const char*);
-extern TIFF* TIFFClientOpen(const char*, const char*,
-	    thandle_t,
-	    TIFFReadWriteProc, TIFFReadWriteProc,
-	    TIFFSeekProc, TIFFCloseProc,
-	    TIFFSizeProc,
-	    TIFFMapFileProc, TIFFUnmapFileProc);
-extern const char* TIFFFileName(TIFF*);
-extern const char* TIFFSetFileName(TIFF*, const char *);
-extern void TIFFError(const char*, const char*, ...) TIFF_ATTRIBUTE((__format__ (__printf__,2,3)));
-extern void TIFFErrorExt(thandle_t, const char*, const char*, ...) TIFF_ATTRIBUTE((__format__ (__printf__,3,4)));
-extern void TIFFWarning(const char*, const char*, ...) TIFF_ATTRIBUTE((__format__ (__printf__,2,3)));
-extern void TIFFWarningExt(thandle_t, const char*, const char*, ...) TIFF_ATTRIBUTE((__format__ (__printf__,3,4)));
-extern TIFFErrorHandler TIFFSetErrorHandler(TIFFErrorHandler);
-extern TIFFErrorHandlerExt TIFFSetErrorHandlerExt(TIFFErrorHandlerExt);
-extern TIFFErrorHandler TIFFSetWarningHandler(TIFFErrorHandler);
-extern TIFFErrorHandlerExt TIFFSetWarningHandlerExt(TIFFErrorHandlerExt);
-extern TIFFExtendProc TIFFSetTagExtender(TIFFExtendProc);
-extern uint32 TIFFComputeTile(TIFF* tif, uint32 x, uint32 y, uint32 z, uint16 s);
-extern int TIFFCheckTile(TIFF* tif, uint32 x, uint32 y, uint32 z, uint16 s);
-extern uint32 TIFFNumberOfTiles(TIFF*);
-extern tmsize_t TIFFReadTile(TIFF* tif, void* buf, uint32 x, uint32 y, uint32 z, uint16 s);  
-extern tmsize_t TIFFWriteTile(TIFF* tif, void* buf, uint32 x, uint32 y, uint32 z, uint16 s);
-extern uint32 TIFFComputeStrip(TIFF*, uint32, uint16);
-extern uint32 TIFFNumberOfStrips(TIFF*);
-extern tmsize_t TIFFReadEncodedStrip(TIFF* tif, uint32 strip, void* buf, tmsize_t size);
-extern tmsize_t TIFFReadRawStrip(TIFF* tif, uint32 strip, void* buf, tmsize_t size);  
-extern tmsize_t TIFFReadEncodedTile(TIFF* tif, uint32 tile, void* buf, tmsize_t size);  
-extern tmsize_t TIFFReadRawTile(TIFF* tif, uint32 tile, void* buf, tmsize_t size);  
-extern int      TIFFReadFromUserBuffer(TIFF* tif, uint32 strile,
-                                       void* inbuf, tmsize_t insize,
-                                       void* outbuf, tmsize_t outsize);
-extern tmsize_t TIFFWriteEncodedStrip(TIFF* tif, uint32 strip, void* data, tmsize_t cc);
-extern tmsize_t TIFFWriteRawStrip(TIFF* tif, uint32 strip, void* data, tmsize_t cc);  
-extern tmsize_t TIFFWriteEncodedTile(TIFF* tif, uint32 tile, void* data, tmsize_t cc);  
-extern tmsize_t TIFFWriteRawTile(TIFF* tif, uint32 tile, void* data, tmsize_t cc);  
-extern int TIFFDataWidth(TIFFDataType);    /* table of tag datatype widths */
-extern void TIFFSetWriteOffset(TIFF* tif, toff_t off);
-extern void TIFFSwabShort(uint16*);
-extern void TIFFSwabLong(uint32*);
-extern void TIFFSwabLong8(uint64*);
-extern void TIFFSwabFloat(float*);
-extern void TIFFSwabDouble(double*);
-extern void TIFFSwabArrayOfShort(uint16* wp, tmsize_t n);
-extern void TIFFSwabArrayOfTriples(uint8* tp, tmsize_t n);
-extern void TIFFSwabArrayOfLong(uint32* lp, tmsize_t n);
-extern void TIFFSwabArrayOfLong8(uint64* lp, tmsize_t n);
-extern void TIFFSwabArrayOfFloat(float* fp, tmsize_t n);
-extern void TIFFSwabArrayOfDouble(double* dp, tmsize_t n);
-extern void TIFFReverseBits(uint8* cp, tmsize_t n);
-extern const unsigned char* TIFFGetBitRevTable(int);
-
-extern uint64 TIFFGetStrileOffset(TIFF *tif, uint32 strile);
-extern uint64 TIFFGetStrileByteCount(TIFF *tif, uint32 strile);
-extern uint64 TIFFGetStrileOffsetWithErr(TIFF *tif, uint32 strile, int *pbErr);
-extern uint64 TIFFGetStrileByteCountWithErr(TIFF *tif, uint32 strile, int *pbErr);
+    extern int TIFFReadRGBAStrip(TIFF *, uint32_t, uint32_t *);
+    extern int TIFFReadRGBATile(TIFF *, uint32_t, uint32_t, uint32_t *);
+    extern int TIFFReadRGBAStripExt(TIFF *, uint32_t, uint32_t *,
+                                    int stop_on_error);
+    extern int TIFFReadRGBATileExt(TIFF *, uint32_t, uint32_t, uint32_t *,
+                                   int stop_on_error);
+    extern int TIFFRGBAImageOK(TIFF *, char[1024]);
+    extern int TIFFRGBAImageBegin(TIFFRGBAImage *, TIFF *, int, char[1024]);
+    extern int TIFFRGBAImageGet(TIFFRGBAImage *, uint32_t *, uint32_t,
+                                uint32_t);
+    extern void TIFFRGBAImageEnd(TIFFRGBAImage *);
+
+    extern const char *TIFFFileName(TIFF *);
+    extern const char *TIFFSetFileName(TIFF *, const char *);
+    extern void TIFFError(const char *, const char *, ...)
+        TIFF_ATTRIBUTE((__format__(__printf__, 2, 3)));
+    extern void TIFFErrorExt(thandle_t, const char *, const char *, ...)
+        TIFF_ATTRIBUTE((__format__(__printf__, 3, 4)));
+    extern void TIFFWarning(const char *, const char *, ...)
+        TIFF_ATTRIBUTE((__format__(__printf__, 2, 3)));
+    extern void TIFFWarningExt(thandle_t, const char *, const char *, ...)
+        TIFF_ATTRIBUTE((__format__(__printf__, 3, 4)));
+    extern TIFFErrorHandler TIFFSetErrorHandler(TIFFErrorHandler);
+    extern TIFFErrorHandlerExt TIFFSetErrorHandlerExt(TIFFErrorHandlerExt);
+    extern TIFFErrorHandler TIFFSetWarningHandler(TIFFErrorHandler);
+    extern TIFFErrorHandlerExt TIFFSetWarningHandlerExt(TIFFErrorHandlerExt);
+
+    extern void TIFFWarningExtR(TIFF *, const char *, const char *, ...)
+        TIFF_ATTRIBUTE((__format__(__printf__, 3, 4)));
+    extern void TIFFErrorExtR(TIFF *, const char *, const char *, ...)
+        TIFF_ATTRIBUTE((__format__(__printf__, 3, 4)));
+
+    typedef struct TIFFOpenOptions TIFFOpenOptions;
+    extern TIFFOpenOptions *TIFFOpenOptionsAlloc(void);
+    extern void TIFFOpenOptionsFree(TIFFOpenOptions *);
+    extern void
+    TIFFOpenOptionsSetMaxSingleMemAlloc(TIFFOpenOptions *opts,
+                                        tmsize_t max_single_mem_alloc);
+    extern void
+    TIFFOpenOptionsSetErrorHandlerExtR(TIFFOpenOptions *opts,
+                                       TIFFErrorHandlerExtR handler,
+                                       void *errorhandler_user_data);
+    extern void
+    TIFFOpenOptionsSetWarningHandlerExtR(TIFFOpenOptions *opts,
+                                         TIFFErrorHandlerExtR handler,
+                                         void *warnhandler_user_data);
+
+    extern TIFF *TIFFOpen(const char *, const char *);
+    extern TIFF *TIFFOpenExt(const char *, const char *, TIFFOpenOptions *opts);
+#ifdef __WIN32__
+    extern TIFF *TIFFOpenW(const wchar_t *, const char *);
+    extern TIFF *TIFFOpenWExt(const wchar_t *, const char *,
+                              TIFFOpenOptions *opts);
+#endif /* __WIN32__ */
+    extern TIFF *TIFFFdOpen(int, const char *, const char *);
+    extern TIFF *TIFFFdOpenExt(int, const char *, const char *,
+                               TIFFOpenOptions *opts);
+    extern TIFF *TIFFClientOpen(const char *, const char *, thandle_t,
+                                TIFFReadWriteProc, TIFFReadWriteProc,
+                                TIFFSeekProc, TIFFCloseProc, TIFFSizeProc,
+                                TIFFMapFileProc, TIFFUnmapFileProc);
+    extern TIFF *TIFFClientOpenExt(const char *, const char *, thandle_t,
+                                   TIFFReadWriteProc, TIFFReadWriteProc,
+                                   TIFFSeekProc, TIFFCloseProc, TIFFSizeProc,
+                                   TIFFMapFileProc, TIFFUnmapFileProc,
+                                   TIFFOpenOptions *opts);
+    extern TIFFExtendProc TIFFSetTagExtender(TIFFExtendProc);
+    extern uint32_t TIFFComputeTile(TIFF *tif, uint32_t x, uint32_t y,
+                                    uint32_t z, uint16_t s);
+    extern int TIFFCheckTile(TIFF *tif, uint32_t x, uint32_t y, uint32_t z,
+                             uint16_t s);
+    extern uint32_t TIFFNumberOfTiles(TIFF *);
+    extern tmsize_t TIFFReadTile(TIFF *tif, void *buf, uint32_t x, uint32_t y,
+                                 uint32_t z, uint16_t s);
+    extern tmsize_t TIFFWriteTile(TIFF *tif, void *buf, uint32_t x, uint32_t y,
+                                  uint32_t z, uint16_t s);
+    extern uint32_t TIFFComputeStrip(TIFF *, uint32_t, uint16_t);
+    extern uint32_t TIFFNumberOfStrips(TIFF *);
+    extern tmsize_t TIFFReadEncodedStrip(TIFF *tif, uint32_t strip, void *buf,
+                                         tmsize_t size);
+    extern tmsize_t TIFFReadRawStrip(TIFF *tif, uint32_t strip, void *buf,
+                                     tmsize_t size);
+    extern tmsize_t TIFFReadEncodedTile(TIFF *tif, uint32_t tile, void *buf,
+                                        tmsize_t size);
+    extern tmsize_t TIFFReadRawTile(TIFF *tif, uint32_t tile, void *buf,
+                                    tmsize_t size);
+    extern int TIFFReadFromUserBuffer(TIFF *tif, uint32_t strile, void *inbuf,
+                                      tmsize_t insize, void *outbuf,
+                                      tmsize_t outsize);
+    extern tmsize_t TIFFWriteEncodedStrip(TIFF *tif, uint32_t strip, void *data,
+                                          tmsize_t cc);
+    extern tmsize_t TIFFWriteRawStrip(TIFF *tif, uint32_t strip, void *data,
+                                      tmsize_t cc);
+    extern tmsize_t TIFFWriteEncodedTile(TIFF *tif, uint32_t tile, void *data,
+                                         tmsize_t cc);
+    extern tmsize_t TIFFWriteRawTile(TIFF *tif, uint32_t tile, void *data,
+                                     tmsize_t cc);
+    extern int TIFFDataWidth(
+        TIFFDataType); /* table of tag datatype widths within TIFF file. */
+    extern void TIFFSetWriteOffset(TIFF *tif, toff_t off);
+    extern void TIFFSwabShort(uint16_t *);
+    extern void TIFFSwabLong(uint32_t *);
+    extern void TIFFSwabLong8(uint64_t *);
+    extern void TIFFSwabFloat(float *);
+    extern void TIFFSwabDouble(double *);
+    extern void TIFFSwabArrayOfShort(uint16_t *wp, tmsize_t n);
+    extern void TIFFSwabArrayOfTriples(uint8_t *tp, tmsize_t n);
+    extern void TIFFSwabArrayOfLong(uint32_t *lp, tmsize_t n);
+    extern void TIFFSwabArrayOfLong8(uint64_t *lp, tmsize_t n);
+    extern void TIFFSwabArrayOfFloat(float *fp, tmsize_t n);
+    extern void TIFFSwabArrayOfDouble(double *dp, tmsize_t n);
+    extern void TIFFReverseBits(uint8_t *cp, tmsize_t n);
+    extern const unsigned char *TIFFGetBitRevTable(int);
+
+    extern uint64_t TIFFGetStrileOffset(TIFF *tif, uint32_t strile);
+    extern uint64_t TIFFGetStrileByteCount(TIFF *tif, uint32_t strile);
+    extern uint64_t TIFFGetStrileOffsetWithErr(TIFF *tif, uint32_t strile,
+                                               int *pbErr);
+    extern uint64_t TIFFGetStrileByteCountWithErr(TIFF *tif, uint32_t strile,
+                                                  int *pbErr);
 
 #ifdef LOGLUV_PUBLIC
-#define U_NEU		0.210526316
-#define V_NEU		0.473684211
-#define UVSCALE		410.
-extern double LogL16toY(int);
-extern double LogL10toY(int);
-extern void XYZtoRGB24(float*, uint8*);
-extern int uv_decode(double*, double*, int);
-extern void LogLuv24toXYZ(uint32, float*);
-extern void LogLuv32toXYZ(uint32, float*);
+#define U_NEU 0.210526316
+#define V_NEU 0.473684211
+#define UVSCALE 410.
+    extern double LogL16toY(int);
+    extern double LogL10toY(int);
+    extern void XYZtoRGB24(float *, uint8_t *);
+    extern int uv_decode(double *, double *, int);
+    extern void LogLuv24toXYZ(uint32_t, float *);
+    extern void LogLuv32toXYZ(uint32_t, float *);
 #if defined(c_plusplus) || defined(__cplusplus)
-extern int LogL16fromY(double, int = SGILOGENCODE_NODITHER);
-extern int LogL10fromY(double, int = SGILOGENCODE_NODITHER);
-extern int uv_encode(double, double, int = SGILOGENCODE_NODITHER);
-extern uint32 LogLuv24fromXYZ(float*, int = SGILOGENCODE_NODITHER);
-extern uint32 LogLuv32fromXYZ(float*, int = SGILOGENCODE_NODITHER);
+    extern int LogL16fromY(double, int = SGILOGENCODE_NODITHER);
+    extern int LogL10fromY(double, int = SGILOGENCODE_NODITHER);
+    extern int uv_encode(double, double, int = SGILOGENCODE_NODITHER);
+    extern uint32_t LogLuv24fromXYZ(float *, int = SGILOGENCODE_NODITHER);
+    extern uint32_t LogLuv32fromXYZ(float *, int = SGILOGENCODE_NODITHER);
 #else
-extern int LogL16fromY(double, int);
-extern int LogL10fromY(double, int);
-extern int uv_encode(double, double, int);
-extern uint32 LogLuv24fromXYZ(float*, int);
-extern uint32 LogLuv32fromXYZ(float*, int);
+    extern int LogL16fromY(double, int);
+    extern int LogL10fromY(double, int);
+    extern int uv_encode(double, double, int);
+    extern uint32_t LogLuv24fromXYZ(float *, int);
+    extern uint32_t LogLuv32fromXYZ(float *, int);
 #endif
 #endif /* LOGLUV_PUBLIC */
 
-extern int TIFFCIELabToRGBInit(TIFFCIELabToRGB*, const TIFFDisplay *, float*);
-extern void TIFFCIELabToXYZ(TIFFCIELabToRGB *, uint32, int32, int32,
-    float *, float *, float *);
-extern void TIFFXYZToRGB(TIFFCIELabToRGB *, float, float, float,
-    uint32 *, uint32 *, uint32 *);
-
-extern int TIFFYCbCrToRGBInit(TIFFYCbCrToRGB*, float*, float*);
-extern void TIFFYCbCrtoRGB(TIFFYCbCrToRGB *, uint32, int32, int32,
-    uint32 *, uint32 *, uint32 *);
+    extern int TIFFCIELabToRGBInit(TIFFCIELabToRGB *, const TIFFDisplay *,
+                                   float *);
+    extern void TIFFCIELabToXYZ(TIFFCIELabToRGB *, uint32_t, int32_t, int32_t,
+                                float *, float *, float *);
+    extern void TIFFXYZToRGB(TIFFCIELabToRGB *, float, float, float, uint32_t *,
+                             uint32_t *, uint32_t *);
+
+    extern int TIFFYCbCrToRGBInit(TIFFYCbCrToRGB *, float *, float *);
+    extern void TIFFYCbCrtoRGB(TIFFYCbCrToRGB *, uint32_t, int32_t, int32_t,
+                               uint32_t *, uint32_t *, uint32_t *);
+
+    /****************************************************************************
+     *               O B S O L E T E D    I N T E R F A C E S
+     *
+     * Don't use this stuff in your applications, it may be removed in the
+     *future libtiff versions.
+     ****************************************************************************/
+    typedef struct
+    {
+        ttag_t field_tag;               /* field's tag */
+        short field_readcount;          /* read count/TIFF_VARIABLE/TIFF_SPP */
+        short field_writecount;         /* write count/TIFF_VARIABLE */
+        TIFFDataType field_type;        /* type of associated data */
+        unsigned short field_bit;       /* bit in fieldsset bit vector */
+        unsigned char field_oktochange; /* if true, can change while writing */
+        unsigned char field_passcount;  /* if true, pass dir count on set */
+        char *field_name;               /* ASCII name */
+    } TIFFFieldInfo;
+
+    extern int TIFFMergeFieldInfo(TIFF *, const TIFFFieldInfo[], uint32_t);
 
-/****************************************************************************
- *               O B S O L E T E D    I N T E R F A C E S
- *
- * Don't use this stuff in your applications, it may be removed in the future
- * libtiff versions.
- ****************************************************************************/
-typedef	struct {
-	ttag_t	field_tag;		/* field's tag */
-	short	field_readcount;	/* read count/TIFF_VARIABLE/TIFF_SPP */
-	short	field_writecount;	/* write count/TIFF_VARIABLE */
-	TIFFDataType field_type;	/* type of associated data */
-        unsigned short field_bit;	/* bit in fieldsset bit vector */
-	unsigned char field_oktochange;	/* if true, can change while writing */
-	unsigned char field_passcount;	/* if true, pass dir count on set */
-	char	*field_name;		/* ASCII name */
-} TIFFFieldInfo;
-
-extern int TIFFMergeFieldInfo(TIFF*, const TIFFFieldInfo[], uint32);
-        
 #if defined(c_plusplus) || defined(__cplusplus)
 }
 #endif
 
 #endif /* _TIFFIO_ */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tiffio.hxx b/3rdparty/libtiff/tiffio.hxx
index df2cbbceb734..6182449b1384 100644
--- a/3rdparty/libtiff/tiffio.hxx
+++ b/3rdparty/libtiff/tiffio.hxx
@@ -2,47 +2,38 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
 #ifndef _TIFFIO_HXX_
-#define	_TIFFIO_HXX_
+#define _TIFFIO_HXX_
 
 /*
  * TIFF I/O library definitions which provide C++ streams API.
  */
 
-#include <iostream>
 #include "tiff.h"
 #include "tiffio.h"
+#include <iostream>
 
-extern TIFF* TIFFStreamOpen(const char*, std::ostream *);
-extern TIFF* TIFFStreamOpen(const char*, std::istream *);
+extern TIFF *TIFFStreamOpen(const char *, std::ostream *);
+extern TIFF *TIFFStreamOpen(const char *, std::istream *);
 
 #endif /* _TIFFIO_HXX_ */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c++
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tiffiop.h b/3rdparty/libtiff/tiffiop.h
index 39b54c896670..fbf7b0700bf6 100644
--- a/3rdparty/libtiff/tiffiop.h
+++ b/3rdparty/libtiff/tiffiop.h
@@ -2,28 +2,28 @@
  * Copyright (c) 1988-1997 Sam Leffler
  * Copyright (c) 1991-1997 Silicon Graphics, Inc.
  *
- * Permission to use, copy, modify, distribute, and sell this software and 
+ * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
  * that (i) the above copyright notices and this permission notice appear in
  * all copies of the software and related documentation, and (ii) the names of
  * Sam Leffler and Silicon Graphics may not be used in any advertising or
  * publicity relating to the software without the specific, prior written
  * permission of Sam Leffler and Silicon Graphics.
- * 
- * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, 
- * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY 
- * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  
- * 
+ *
+ * THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
+ * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+ *
  * IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR
  * ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND,
  * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF 
- * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 
+ * WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF
+ * LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THIS SOFTWARE.
  */
 
 #ifndef _TIFFIOP_
-#define	_TIFFIOP_
+#define _TIFFIOP_
 /*
  * ``Library-private'' definitions.
  */
@@ -31,66 +31,48 @@
 #include "tif_config.h"
 
 #ifdef HAVE_FCNTL_H
-# include <fcntl.h>
+#include <fcntl.h>
 #endif
 
 #ifdef HAVE_SYS_TYPES_H
-# include <sys/types.h>
+#include <sys/types.h>
 #endif
 
-#ifdef HAVE_STRING_H
-# include <string.h>
-#endif
+#include <string.h>
 
 #ifdef HAVE_ASSERT_H
-# include <assert.h>
-#else
-# define assert(x) 
-#endif
-
-#ifdef HAVE_SEARCH_H
-# include <search.h>
+#include <assert.h>
 #else
-extern void *lfind(const void *, const void *, size_t *, size_t,
-		   int (*)(const void *, const void *));
-#endif
-
-#if !defined(HAVE_SNPRINTF) && !defined(HAVE__SNPRINTF)
-#undef snprintf
-#define snprintf _TIFF_snprintf_f
-extern int snprintf(char* str, size_t size, const char* format, ...);
+#define assert(x)
 #endif
 
+#include "tif_hash_set.h"
 #include "tiffio.h"
 
 #include "tif_dir.h"
 
+#include <limits.h>
+
 #ifndef STRIP_SIZE_DEFAULT
-# define STRIP_SIZE_DEFAULT 8192
+#define STRIP_SIZE_DEFAULT 8192
 #endif
 
-#define    streq(a,b)      (strcmp(a,b) == 0)
-#define    strneq(a,b,n)   (strncmp(a,b,n) == 0)
-
-#ifndef TRUE
-#define	TRUE	1
-#define	FALSE	0
+#ifndef TIFF_MAX_DIR_COUNT
+#define TIFF_MAX_DIR_COUNT 1048576
 #endif
 
-#define TIFF_SIZE_T_MAX ((size_t) ~ ((size_t)0))
-#define TIFF_TMSIZE_T_MAX (tmsize_t)(TIFF_SIZE_T_MAX >> 1)
+#define TIFF_NON_EXISTENT_DIR_NUMBER UINT_MAX
 
-/*
- * Largest 32-bit unsigned integer value.
- */
-#define TIFF_UINT32_MAX 0xFFFFFFFFU
+#define streq(a, b) (strcmp(a, b) == 0)
+#define strneq(a, b, n) (strncmp(a, b, n) == 0)
 
-/*
- * Largest 64-bit unsigned integer value.
- */
-#define TIFF_UINT64_MAX (((uint64)(TIFF_UINT32_MAX)) << 32 | TIFF_UINT32_MAX)
+#ifndef TRUE
+#define TRUE 1
+#define FALSE 0
+#endif
 
-typedef struct client_info {
+typedef struct client_info
+{
     struct client_info *next;
     void *data;
     char *name;
@@ -100,187 +82,231 @@ typedef struct client_info {
  * Typedefs for ``method pointers'' used internally.
  * these are deprecated and provided only for backwards compatibility.
  */
-typedef unsigned char tidataval_t;    /* internal image data value type */
-typedef tidataval_t* tidata_t;        /* reference to internal image data */
-
-typedef void (*TIFFVoidMethod)(TIFF*);
-typedef int (*TIFFBoolMethod)(TIFF*);
-typedef int (*TIFFPreMethod)(TIFF*, uint16);
-typedef int (*TIFFCodeMethod)(TIFF* tif, uint8* buf, tmsize_t size, uint16 sample);
-typedef int (*TIFFSeekMethod)(TIFF*, uint32);
-typedef void (*TIFFPostMethod)(TIFF* tif, uint8* buf, tmsize_t size);
-typedef uint32 (*TIFFStripMethod)(TIFF*, uint32);
-typedef void (*TIFFTileMethod)(TIFF*, uint32*, uint32*);
-
-struct tiff {
-	char*                tif_name;         /* name of open file */
-	int                  tif_fd;           /* open file descriptor */
-	int                  tif_mode;         /* open mode (O_*) */
-	uint32               tif_flags;
-	#define TIFF_FILLORDER   0x00003U /* natural bit fill order for machine */
-	#define TIFF_DIRTYHEADER 0x00004U /* header must be written on close */
-	#define TIFF_DIRTYDIRECT 0x00008U /* current directory must be written */
-	#define TIFF_BUFFERSETUP 0x00010U /* data buffers setup */
-	#define TIFF_CODERSETUP  0x00020U /* encoder/decoder setup done */
-	#define TIFF_BEENWRITING 0x00040U /* written 1+ scanlines to file */
-	#define TIFF_SWAB        0x00080U /* byte swap file information */
-	#define TIFF_NOBITREV    0x00100U /* inhibit bit reversal logic */
-	#define TIFF_MYBUFFER    0x00200U /* my raw data buffer; free on close */
-	#define TIFF_ISTILED     0x00400U /* file is tile, not strip- based */
-	#define TIFF_MAPPED      0x00800U /* file is mapped into memory */
-	#define TIFF_POSTENCODE  0x01000U /* need call to postencode routine */
-	#define TIFF_INSUBIFD    0x02000U /* currently writing a subifd */
-	#define TIFF_UPSAMPLED   0x04000U /* library is doing data up-sampling */
-	#define TIFF_STRIPCHOP   0x08000U /* enable strip chopping support */
-	#define TIFF_HEADERONLY  0x10000U /* read header only, do not process the first directory */
-	#define TIFF_NOREADRAW   0x20000U /* skip reading of raw uncompressed image data */
-	#define TIFF_INCUSTOMIFD 0x40000U /* currently writing a custom IFD */
-	#define TIFF_BIGTIFF     0x80000U /* read/write bigtiff */
-        #define TIFF_BUF4WRITE  0x100000U /* rawcc bytes are for writing */
-        #define TIFF_DIRTYSTRIP 0x200000U /* stripoffsets/stripbytecount dirty*/
-        #define TIFF_PERSAMPLE  0x400000U /* get/set per sample tags as arrays */
-        #define TIFF_BUFFERMMAP 0x800000U /* read buffer (tif_rawdata) points into mmap() memory */
-        #define TIFF_DEFERSTRILELOAD 0x1000000U /* defer strip/tile offset/bytecount array loading. */
-        #define TIFF_LAZYSTRILELOAD  0x2000000U /* lazy/ondemand loading of strip/tile offset/bytecount values. Only used if TIFF_DEFERSTRILELOAD is set and in read-only mode */
-        #define TIFF_CHOPPEDUPARRAYS 0x4000000U /* set when allocChoppedUpStripArrays() has modified strip array */
-	uint64               tif_diroff;       /* file offset of current directory */
-	uint64               tif_nextdiroff;   /* file offset of following directory */
-	uint64*              tif_dirlist;      /* list of offsets to already seen directories to prevent IFD looping */
-	uint16               tif_dirlistsize;  /* number of entries in offset list */
-	uint16               tif_dirnumber;    /* number of already seen directories */
-	TIFFDirectory        tif_dir;          /* internal rep of current directory */
-	TIFFDirectory        tif_customdir;    /* custom IFDs are separated from the main ones */
-	union {
-		TIFFHeaderCommon common;
-		TIFFHeaderClassic classic;
-		TIFFHeaderBig big;
-	} tif_header;
-	uint16               tif_header_size;  /* file's header block and its length */
-	uint32               tif_row;          /* current scanline */
-	uint16               tif_curdir;       /* current directory (index) */
-	uint32               tif_curstrip;     /* current strip for read/write */
-	uint64               tif_curoff;       /* current offset for read/write */
-	uint64               tif_dataoff;      /* current offset for writing dir */
-	/* SubIFD support */
-	uint16               tif_nsubifd;      /* remaining subifds to write */
-	uint64               tif_subifdoff;    /* offset for patching SubIFD link */
-	/* tiling support */
-	uint32               tif_col;          /* current column (offset by row too) */
-	uint32               tif_curtile;      /* current tile for read/write */
-	tmsize_t             tif_tilesize;     /* # of bytes in a tile */
-	/* compression scheme hooks */
-	int                  tif_decodestatus;
-	TIFFBoolMethod       tif_fixuptags;    /* called in TIFFReadDirectory */
-	TIFFBoolMethod       tif_setupdecode;  /* called once before predecode */
-	TIFFPreMethod        tif_predecode;    /* pre- row/strip/tile decoding */
-	TIFFBoolMethod       tif_setupencode;  /* called once before preencode */
-	int                  tif_encodestatus;
-	TIFFPreMethod        tif_preencode;    /* pre- row/strip/tile encoding */
-	TIFFBoolMethod       tif_postencode;   /* post- row/strip/tile encoding */
-	TIFFCodeMethod       tif_decoderow;    /* scanline decoding routine */
-	TIFFCodeMethod       tif_encoderow;    /* scanline encoding routine */
-	TIFFCodeMethod       tif_decodestrip;  /* strip decoding routine */
-	TIFFCodeMethod       tif_encodestrip;  /* strip encoding routine */
-	TIFFCodeMethod       tif_decodetile;   /* tile decoding routine */
-	TIFFCodeMethod       tif_encodetile;   /* tile encoding routine */
-	TIFFVoidMethod       tif_close;        /* cleanup-on-close routine */
-	TIFFSeekMethod       tif_seek;         /* position within a strip routine */
-	TIFFVoidMethod       tif_cleanup;      /* cleanup state routine */
-	TIFFStripMethod      tif_defstripsize; /* calculate/constrain strip size */
-	TIFFTileMethod       tif_deftilesize;  /* calculate/constrain tile size */
-	uint8*               tif_data;         /* compression scheme private data */
-	/* input/output buffering */
-	tmsize_t             tif_scanlinesize; /* # of bytes in a scanline */
-	tmsize_t             tif_scanlineskew; /* scanline skew for reading strips */
-	uint8*               tif_rawdata;      /* raw data buffer */
-	tmsize_t             tif_rawdatasize;  /* # of bytes in raw data buffer */
-        tmsize_t             tif_rawdataoff;   /* rawdata offset within strip */
-        tmsize_t             tif_rawdataloaded;/* amount of data in rawdata */
-	uint8*               tif_rawcp;        /* current spot in raw buffer */
-	tmsize_t             tif_rawcc;        /* bytes unread from raw buffer */
-	/* memory-mapped file support */
-	uint8*               tif_base;         /* base of mapped file */
-	tmsize_t             tif_size;         /* size of mapped file region (bytes, thus tmsize_t) */
-	TIFFMapFileProc      tif_mapproc;      /* map file method */
-	TIFFUnmapFileProc    tif_unmapproc;    /* unmap file method */
-	/* input/output callback methods */
-	thandle_t            tif_clientdata;   /* callback parameter */
-	TIFFReadWriteProc    tif_readproc;     /* read method */
-	TIFFReadWriteProc    tif_writeproc;    /* write method */
-	TIFFSeekProc         tif_seekproc;     /* lseek method */
-	TIFFCloseProc        tif_closeproc;    /* close method */
-	TIFFSizeProc         tif_sizeproc;     /* filesize method */
-	/* post-decoding support */
-	TIFFPostMethod       tif_postdecode;   /* post decoding routine */
-	/* tag support */
-	TIFFField**          tif_fields;       /* sorted table of registered tags */
-	size_t               tif_nfields;      /* # entries in registered tag table */
-	const TIFFField*     tif_foundfield;   /* cached pointer to already found tag */
-	TIFFTagMethods       tif_tagmethods;   /* tag get/set/print routines */
-	TIFFClientInfoLink*  tif_clientinfo;   /* extra client information. */
-	/* Backward compatibility stuff. We need these two fields for
-	 * setting up an old tag extension scheme. */
-	TIFFFieldArray*      tif_fieldscompat;
-	size_t               tif_nfieldscompat;
+typedef unsigned char tidataval_t; /* internal image data value type */
+typedef tidataval_t *tidata_t;     /* reference to internal image data */
+
+typedef void (*TIFFVoidMethod)(TIFF *);
+typedef int (*TIFFBoolMethod)(TIFF *);
+typedef int (*TIFFPreMethod)(TIFF *, uint16_t);
+typedef int (*TIFFCodeMethod)(TIFF *tif, uint8_t *buf, tmsize_t size,
+                              uint16_t sample);
+typedef int (*TIFFSeekMethod)(TIFF *, uint32_t);
+typedef void (*TIFFPostMethod)(TIFF *tif, uint8_t *buf, tmsize_t size);
+typedef uint32_t (*TIFFStripMethod)(TIFF *, uint32_t);
+typedef void (*TIFFTileMethod)(TIFF *, uint32_t *, uint32_t *);
+
+struct TIFFOffsetAndDirNumber
+{
+    uint64_t offset;
+    tdir_t dirNumber;
+};
+typedef struct TIFFOffsetAndDirNumber TIFFOffsetAndDirNumber;
+
+struct tiff
+{
+    char *tif_name; /* name of open file */
+    int tif_fd;     /* open file descriptor */
+    int tif_mode;   /* open mode (O_*) */
+    uint32_t tif_flags;
+#define TIFF_FILLORDER 0x00003U   /* natural bit fill order for machine */
+#define TIFF_DIRTYHEADER 0x00004U /* header must be written on close */
+#define TIFF_DIRTYDIRECT 0x00008U /* current directory must be written */
+#define TIFF_BUFFERSETUP 0x00010U /* data buffers setup */
+#define TIFF_CODERSETUP 0x00020U  /* encoder/decoder setup done */
+#define TIFF_BEENWRITING 0x00040U /* written 1+ scanlines to file */
+#define TIFF_SWAB 0x00080U        /* byte swap file information */
+#define TIFF_NOBITREV 0x00100U    /* inhibit bit reversal logic */
+#define TIFF_MYBUFFER 0x00200U    /* my raw data buffer; free on close */
+#define TIFF_ISTILED 0x00400U     /* file is tile, not strip- based */
+#define TIFF_MAPPED 0x00800U      /* file is mapped into memory */
+#define TIFF_POSTENCODE 0x01000U  /* need call to postencode routine */
+#define TIFF_INSUBIFD 0x02000U    /* currently writing a subifd */
+#define TIFF_UPSAMPLED 0x04000U   /* library is doing data up-sampling */
+#define TIFF_STRIPCHOP 0x08000U   /* enable strip chopping support */
+#define TIFF_HEADERONLY                                                        \
+    0x10000U /* read header only, do not process the first directory */
+#define TIFF_NOREADRAW                                                         \
+    0x20000U /* skip reading of raw uncompressed image data */
+#define TIFF_INCUSTOMIFD 0x40000U /* currently writing a custom IFD */
+#define TIFF_BIGTIFF 0x80000U     /* read/write bigtiff */
+#define TIFF_BUF4WRITE 0x100000U  /* rawcc bytes are for writing */
+#define TIFF_DIRTYSTRIP 0x200000U /* stripoffsets/stripbytecount dirty*/
+#define TIFF_PERSAMPLE 0x400000U  /* get/set per sample tags as arrays */
+#define TIFF_BUFFERMMAP                                                        \
+    0x800000U /* read buffer (tif_rawdata) points into mmap() memory */
+#define TIFF_DEFERSTRILELOAD                                                   \
+    0x1000000U /* defer strip/tile offset/bytecount array loading. */
+#define TIFF_LAZYSTRILELOAD                                                    \
+    0x2000000U /* lazy/ondemand loading of strip/tile offset/bytecount values. \
+                  Only used if TIFF_DEFERSTRILELOAD is set and in read-only    \
+                  mode */
+#define TIFF_CHOPPEDUPARRAYS                                                   \
+    0x4000000U /* set when allocChoppedUpStripArrays() has modified strip      \
+                  array */
+    uint64_t tif_diroff;     /* file offset of current directory */
+    uint64_t tif_nextdiroff; /* file offset of following directory */
+    uint64_t tif_lastdiroff; /* file offset of last directory written so far */
+    TIFFHashSet *tif_map_dir_offset_to_number;
+    TIFFHashSet *tif_map_dir_number_to_offset;
+    int tif_setdirectory_force_absolute; /* switch between relative and absolute
+                                            stepping in TIFFSetDirectory() */
+    TIFFDirectory tif_dir;               /* internal rep of current directory */
+    TIFFDirectory
+        tif_customdir; /* custom IFDs are separated from the main ones */
+    union
+    {
+        TIFFHeaderCommon common;
+        TIFFHeaderClassic classic;
+        TIFFHeaderBig big;
+    } tif_header;
+    uint16_t tif_header_size;  /* file's header block and its length */
+    uint32_t tif_row;          /* current scanline */
+    tdir_t tif_curdir;         /* current directory (index) */
+    uint32_t tif_curstrip;     /* current strip for read/write */
+    uint64_t tif_curoff;       /* current offset for read/write */
+    uint64_t tif_lastvalidoff; /* last valid offset allowed for rewrite in
+                                  place. Used only by TIFFAppendToStrip() */
+    uint64_t tif_dataoff;      /* current offset for writing dir */
+    /* SubIFD support */
+    uint16_t tif_nsubifd;   /* remaining subifds to write */
+    uint64_t tif_subifdoff; /* offset for patching SubIFD link */
+    /* tiling support */
+    uint32_t tif_col;      /* current column (offset by row too) */
+    uint32_t tif_curtile;  /* current tile for read/write */
+    tmsize_t tif_tilesize; /* # of bytes in a tile */
+    /* compression scheme hooks */
+    int tif_decodestatus;
+    TIFFBoolMethod tif_fixuptags;   /* called in TIFFReadDirectory */
+    TIFFBoolMethod tif_setupdecode; /* called once before predecode */
+    TIFFPreMethod tif_predecode;    /* pre- row/strip/tile decoding */
+    TIFFBoolMethod tif_setupencode; /* called once before preencode */
+    int tif_encodestatus;
+    TIFFPreMethod tif_preencode;      /* pre- row/strip/tile encoding */
+    TIFFBoolMethod tif_postencode;    /* post- row/strip/tile encoding */
+    TIFFCodeMethod tif_decoderow;     /* scanline decoding routine */
+    TIFFCodeMethod tif_encoderow;     /* scanline encoding routine */
+    TIFFCodeMethod tif_decodestrip;   /* strip decoding routine */
+    TIFFCodeMethod tif_encodestrip;   /* strip encoding routine */
+    TIFFCodeMethod tif_decodetile;    /* tile decoding routine */
+    TIFFCodeMethod tif_encodetile;    /* tile encoding routine */
+    TIFFVoidMethod tif_close;         /* cleanup-on-close routine */
+    TIFFSeekMethod tif_seek;          /* position within a strip routine */
+    TIFFVoidMethod tif_cleanup;       /* cleanup state routine */
+    TIFFStripMethod tif_defstripsize; /* calculate/constrain strip size */
+    TIFFTileMethod tif_deftilesize;   /* calculate/constrain tile size */
+    uint8_t *tif_data;                /* compression scheme private data */
+    /* input/output buffering */
+    tmsize_t tif_scanlinesize;  /* # of bytes in a scanline */
+    tmsize_t tif_scanlineskew;  /* scanline skew for reading strips */
+    uint8_t *tif_rawdata;       /* raw data buffer */
+    tmsize_t tif_rawdatasize;   /* # of bytes in raw data buffer */
+    tmsize_t tif_rawdataoff;    /* rawdata offset within strip */
+    tmsize_t tif_rawdataloaded; /* amount of data in rawdata */
+    uint8_t *tif_rawcp;         /* current spot in raw buffer */
+    tmsize_t tif_rawcc;         /* bytes unread from raw buffer */
+    /* memory-mapped file support */
+    uint8_t *tif_base; /* base of mapped file */
+    tmsize_t tif_size; /* size of mapped file region (bytes, thus tmsize_t) */
+    TIFFMapFileProc tif_mapproc;     /* map file method */
+    TIFFUnmapFileProc tif_unmapproc; /* unmap file method */
+    /* input/output callback methods */
+    thandle_t tif_clientdata;        /* callback parameter */
+    TIFFReadWriteProc tif_readproc;  /* read method */
+    TIFFReadWriteProc tif_writeproc; /* write method */
+    TIFFSeekProc tif_seekproc;       /* lseek method */
+    TIFFCloseProc tif_closeproc;     /* close method */
+    TIFFSizeProc tif_sizeproc;       /* filesize method */
+    /* post-decoding support */
+    TIFFPostMethod tif_postdecode; /* post decoding routine */
+    /* tag support */
+    TIFFField **tif_fields;          /* sorted table of registered tags */
+    size_t tif_nfields;              /* # entries in registered tag table */
+    const TIFFField *tif_foundfield; /* cached pointer to already found tag */
+    TIFFTagMethods tif_tagmethods;   /* tag get/set/print routines */
+    TIFFClientInfoLink *tif_clientinfo; /* extra client information. */
+    /* Backward compatibility stuff. We need these two fields for
+     * setting up an old tag extension scheme. */
+    TIFFFieldArray *tif_fieldscompat;
+    size_t tif_nfieldscompat;
+    /* Error handler support */
+    TIFFErrorHandlerExtR tif_errorhandler;
+    void *tif_errorhandler_user_data;
+    TIFFErrorHandlerExtR tif_warnhandler;
+    void *tif_warnhandler_user_data;
+    tmsize_t tif_max_single_mem_alloc; /* in bytes. 0 for unlimited */
+};
+
+struct TIFFOpenOptions
+{
+    TIFFErrorHandlerExtR errorhandler; /* may be NULL */
+    void *errorhandler_user_data;      /* may be NULL */
+    TIFFErrorHandlerExtR warnhandler;  /* may be NULL */
+    void *warnhandler_user_data;       /* may be NULL */
+    tmsize_t max_single_mem_alloc;     /* in bytes. 0 for unlimited */
 };
 
-#define isPseudoTag(t) (t > 0xffff)            /* is tag value normal or pseudo */
+#define isPseudoTag(t) (t > 0xffff) /* is tag value normal or pseudo */
 
 #define isTiled(tif) (((tif)->tif_flags & TIFF_ISTILED) != 0)
 #define isMapped(tif) (((tif)->tif_flags & TIFF_MAPPED) != 0)
 #define isFillOrder(tif, o) (((tif)->tif_flags & (o)) != 0)
 #define isUpSampled(tif) (((tif)->tif_flags & TIFF_UPSAMPLED) != 0)
-#define TIFFReadFile(tif, buf, size) \
-	((*(tif)->tif_readproc)((tif)->tif_clientdata,(buf),(size)))
-#define TIFFWriteFile(tif, buf, size) \
-	((*(tif)->tif_writeproc)((tif)->tif_clientdata,(buf),(size)))
-#define TIFFSeekFile(tif, off, whence) \
-	((*(tif)->tif_seekproc)((tif)->tif_clientdata,(off),(whence)))
-#define TIFFCloseFile(tif) \
-	((*(tif)->tif_closeproc)((tif)->tif_clientdata))
-#define TIFFGetFileSize(tif) \
-	((*(tif)->tif_sizeproc)((tif)->tif_clientdata))
-#define TIFFMapFileContents(tif, paddr, psize) \
-	((*(tif)->tif_mapproc)((tif)->tif_clientdata,(paddr),(psize)))
-#define TIFFUnmapFileContents(tif, addr, size) \
-	((*(tif)->tif_unmapproc)((tif)->tif_clientdata,(addr),(size)))
+#define TIFFReadFile(tif, buf, size)                                           \
+    ((*(tif)->tif_readproc)((tif)->tif_clientdata, (buf), (size)))
+#define TIFFWriteFile(tif, buf, size)                                          \
+    ((*(tif)->tif_writeproc)((tif)->tif_clientdata, (buf), (size)))
+#define TIFFSeekFile(tif, off, whence)                                         \
+    ((*(tif)->tif_seekproc)((tif)->tif_clientdata, (off), (whence)))
+#define TIFFCloseFile(tif) ((*(tif)->tif_closeproc)((tif)->tif_clientdata))
+#define TIFFGetFileSize(tif) ((*(tif)->tif_sizeproc)((tif)->tif_clientdata))
+#define TIFFMapFileContents(tif, paddr, psize)                                 \
+    ((*(tif)->tif_mapproc)((tif)->tif_clientdata, (paddr), (psize)))
+#define TIFFUnmapFileContents(tif, addr, size)                                 \
+    ((*(tif)->tif_unmapproc)((tif)->tif_clientdata, (addr), (size)))
 
 /*
  * Default Read/Seek/Write definitions.
  */
 #ifndef ReadOK
-#define ReadOK(tif, buf, size) \
-	(TIFFReadFile((tif),(buf),(size))==(size))
+#define ReadOK(tif, buf, size) (TIFFReadFile((tif), (buf), (size)) == (size))
 #endif
 #ifndef SeekOK
 #define SeekOK(tif, off) _TIFFSeekOK(tif, off)
 #endif
 #ifndef WriteOK
-#define WriteOK(tif, buf, size) \
-	(TIFFWriteFile((tif),(buf),(size))==(size))
+#define WriteOK(tif, buf, size) (TIFFWriteFile((tif), (buf), (size)) == (size))
 #endif
 
-/* NB: the uint32 casts are to silence certain ANSI-C compilers */
-#define TIFFhowmany_32(x, y) (((uint32)x < (0xffffffff - (uint32)(y-1))) ? \
-			   ((((uint32)(x))+(((uint32)(y))-1))/((uint32)(y))) : \
-			   0U)
+/* NB: the uint32_t casts are to silence certain ANSI-C compilers */
+#define TIFFhowmany_32(x, y)                                                   \
+    (((uint32_t)x < (0xffffffff - (uint32_t)(y - 1)))                          \
+         ? ((((uint32_t)(x)) + (((uint32_t)(y)) - 1)) / ((uint32_t)(y)))       \
+         : 0U)
 /* Variant of TIFFhowmany_32() that doesn't return 0 if x close to MAXUINT. */
 /* Caution: TIFFhowmany_32_maxuint_compat(x,y)*y might overflow */
-#define TIFFhowmany_32_maxuint_compat(x, y) \
-			   (((uint32)(x) / (uint32)(y)) + ((((uint32)(x) % (uint32)(y)) != 0) ? 1 : 0))
-#define TIFFhowmany8_32(x) (((x)&0x07)?((uint32)(x)>>3)+1:(uint32)(x)>>3)
-#define TIFFroundup_32(x, y) (TIFFhowmany_32(x,y)*(y))
-#define TIFFhowmany_64(x, y) ((((uint64)(x))+(((uint64)(y))-1))/((uint64)(y)))
-#define TIFFhowmany8_64(x) (((x)&0x07)?((uint64)(x)>>3)+1:(uint64)(x)>>3)
-#define TIFFroundup_64(x, y) (TIFFhowmany_64(x,y)*(y))
-
-/* Safe multiply which returns zero if there is an *unsigned* integer overflow. This macro is not safe for *signed* integer types */
-#define TIFFSafeMultiply(t,v,m) ((((t)(m) != (t)0) && (((t)(((v)*(m))/(m))) == (t)(v))) ? (t)((v)*(m)) : (t)0)
-
-#define TIFFmax(A,B) ((A)>(B)?(A):(B))
-#define TIFFmin(A,B) ((A)<(B)?(A):(B))
-
-#define TIFFArrayCount(a) (sizeof (a) / sizeof ((a)[0]))
+#define TIFFhowmany_32_maxuint_compat(x, y)                                    \
+    (((uint32_t)(x) / (uint32_t)(y)) +                                         \
+     ((((uint32_t)(x) % (uint32_t)(y)) != 0) ? 1 : 0))
+#define TIFFhowmany8_32(x)                                                     \
+    (((x)&0x07) ? ((uint32_t)(x) >> 3) + 1 : (uint32_t)(x) >> 3)
+#define TIFFroundup_32(x, y) (TIFFhowmany_32(x, y) * (y))
+#define TIFFhowmany_64(x, y)                                                   \
+    ((((uint64_t)(x)) + (((uint64_t)(y)) - 1)) / ((uint64_t)(y)))
+#define TIFFhowmany8_64(x)                                                     \
+    (((x)&0x07) ? ((uint64_t)(x) >> 3) + 1 : (uint64_t)(x) >> 3)
+#define TIFFroundup_64(x, y) (TIFFhowmany_64(x, y) * (y))
+
+/* Safe multiply which returns zero if there is an *unsigned* integer overflow.
+ * This macro is not safe for *signed* integer types */
+#define TIFFSafeMultiply(t, v, m)                                              \
+    ((((t)(m) != (t)0) && (((t)(((v) * (m)) / (m))) == (t)(v)))                \
+         ? (t)((v) * (m))                                                      \
+         : (t)0)
+
+#define TIFFmax(A, B) ((A) > (B) ? (A) : (B))
+#define TIFFmin(A, B) ((A) < (B) ? (A) : (B))
+
+#define TIFFArrayCount(a) (sizeof(a) / sizeof((a)[0]))
 
 /*
   Support for large files.
@@ -301,28 +327,31 @@ struct tiff {
   must be available on the target computer in order for the program to run.
 */
 #if defined(HAVE_FSEEKO)
-#  define fseek(stream,offset,whence)  fseeko(stream,offset,whence)
-#  define ftell(stream,offset,whence)  ftello(stream,offset,whence)
+#define fseek(stream, offset, whence) fseeko(stream, offset, whence)
+#define ftell(stream, offset, whence) ftello(stream, offset, whence)
 #endif
 #endif
-#if defined(__WIN32__) && \
-        !(defined(_MSC_VER) && _MSC_VER < 1400) && \
-        !(defined(__MSVCRT_VERSION__) && __MSVCRT_VERSION__ < 0x800)
+#if defined(__WIN32__) && !(defined(_MSC_VER) && _MSC_VER < 1400) &&           \
+    !(defined(__MSVCRT_VERSION__) && __MSVCRT_VERSION__ < 0x800)
 typedef unsigned int TIFFIOSize_t;
-#define _TIFF_lseek_f(fildes,offset,whence)  _lseeki64(fildes,/* __int64 */ offset,whence)
+#define _TIFF_lseek_f(fildes, offset, whence)                                  \
+    _lseeki64(fildes, /* __int64 */ offset, whence)
 /* #define _TIFF_tell_f(fildes) /\* __int64 *\/ _telli64(fildes) */
-#define _TIFF_fseek_f(stream,offset,whence) _fseeki64(stream,/* __int64 */ offset,whence)
-#define _TIFF_fstat_f(fildes,stat_buff) _fstati64(fildes,/* struct _stati64 */ stat_buff)
+#define _TIFF_fseek_f(stream, offset, whence)                                  \
+    _fseeki64(stream, /* __int64 */ offset, whence)
+#define _TIFF_fstat_f(fildes, stat_buff)                                       \
+    _fstati64(fildes, /* struct _stati64 */ stat_buff)
 /* #define _TIFF_ftell_f(stream) /\* __int64 *\/ _ftelli64(stream) */
-/* #define _TIFF_stat_f(path,stat_buff) _stati64(path,/\* struct _stati64 *\/ stat_buff) */
+/* #define _TIFF_stat_f(path,stat_buff) _stati64(path,/\* struct _stati64 *\/
+ * stat_buff) */
 #define _TIFF_stat_s struct _stati64
 #define _TIFF_off_t __int64
 #else
 typedef size_t TIFFIOSize_t;
-#define _TIFF_lseek_f(fildes,offset,whence) lseek(fildes,offset,whence)
+#define _TIFF_lseek_f(fildes, offset, whence) lseek(fildes, offset, whence)
 /* #define _TIFF_tell_f(fildes) (_TIFF_lseek_f(fildes,0,SEEK_CUR)) */
-#define _TIFF_fseek_f(stream,offset,whence) fseek(stream,offset,whence)
-#define _TIFF_fstat_f(fildes,stat_buff) fstat(fildes,stat_buff)
+#define _TIFF_fseek_f(stream, offset, whence) fseek(stream, offset, whence)
+#define _TIFF_fstat_f(fildes, stat_buff) fstat(fildes, stat_buff)
 /* #define _TIFF_ftell_f(stream) ftell(stream) */
 /* #define _TIFF_stat_f(path,stat_buff) stat(path,stat_buff) */
 #define _TIFF_stat_s struct stat
@@ -331,7 +360,8 @@ typedef size_t TIFFIOSize_t;
 
 #if defined(__has_attribute) && defined(__clang__)
 #if __has_attribute(no_sanitize)
-#define TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW __attribute__((no_sanitize("unsigned-integer-overflow")))
+#define TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW                                  \
+    __attribute__((no_sanitize("unsigned-integer-overflow")))
 #else
 #define TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
 #endif
@@ -339,139 +369,155 @@ typedef size_t TIFFIOSize_t;
 #define TIFF_NOSANITIZE_UNSIGNED_INT_OVERFLOW
 #endif
 
-
 #if defined(__cplusplus)
-extern "C" {
-#endif
-extern int _TIFFgetMode(const char* mode, const char* module);
-extern int _TIFFNoRowEncode(TIFF* tif, uint8* pp, tmsize_t cc, uint16 s);
-extern int _TIFFNoStripEncode(TIFF* tif, uint8* pp, tmsize_t cc, uint16 s);
-extern int _TIFFNoTileEncode(TIFF*, uint8* pp, tmsize_t cc, uint16 s);
-extern int _TIFFNoRowDecode(TIFF* tif, uint8* pp, tmsize_t cc, uint16 s);
-extern int _TIFFNoStripDecode(TIFF* tif, uint8* pp, tmsize_t cc, uint16 s);
-extern int _TIFFNoTileDecode(TIFF*, uint8* pp, tmsize_t cc, uint16 s);
-extern void _TIFFNoPostDecode(TIFF* tif, uint8* buf, tmsize_t cc);
-extern int _TIFFNoPreCode(TIFF* tif, uint16 s);
-extern int _TIFFNoSeek(TIFF* tif, uint32 off);
-extern void _TIFFSwab16BitData(TIFF* tif, uint8* buf, tmsize_t cc);
-extern void _TIFFSwab24BitData(TIFF* tif, uint8* buf, tmsize_t cc);
-extern void _TIFFSwab32BitData(TIFF* tif, uint8* buf, tmsize_t cc);
-extern void _TIFFSwab64BitData(TIFF* tif, uint8* buf, tmsize_t cc);
-extern int TIFFFlushData1(TIFF* tif);
-extern int TIFFDefaultDirectory(TIFF* tif);
-extern void _TIFFSetDefaultCompressionState(TIFF* tif);
-extern int _TIFFRewriteField(TIFF *, uint16, TIFFDataType, tmsize_t, void *);
-extern int TIFFSetCompressionScheme(TIFF* tif, int scheme);
-extern int TIFFSetDefaultCompressionState(TIFF* tif);
-extern uint32 _TIFFDefaultStripSize(TIFF* tif, uint32 s);
-extern void _TIFFDefaultTileSize(TIFF* tif, uint32* tw, uint32* th);
-extern int _TIFFDataSize(TIFFDataType type);
-
-/*--: Rational2Double: Return size of TIFFSetGetFieldType in bytes. */
-extern int _TIFFSetGetFieldSize(TIFFSetGetFieldType setgettype);
-
-extern void _TIFFsetByteArray(void**, void*, uint32);
-extern void _TIFFsetString(char**, char*);
-extern void _TIFFsetShortArray(uint16**, uint16*, uint32);
-extern void _TIFFsetLongArray(uint32**, uint32*, uint32);
-extern void _TIFFsetFloatArray(float**, float*, uint32);
-extern void _TIFFsetDoubleArray(double**, double*, uint32);
-
-extern void _TIFFprintAscii(FILE*, const char*);
-extern void _TIFFprintAsciiTag(FILE*, const char*, const char*);
-
-extern TIFFErrorHandler _TIFFwarningHandler;
-extern TIFFErrorHandler _TIFFerrorHandler;
-extern TIFFErrorHandlerExt _TIFFwarningHandlerExt;
-extern TIFFErrorHandlerExt _TIFFerrorHandlerExt;
-
-extern uint32 _TIFFMultiply32(TIFF*, uint32, uint32, const char*);
-extern uint64 _TIFFMultiply64(TIFF*, uint64, uint64, const char*);
-extern tmsize_t _TIFFMultiplySSize(TIFF*, tmsize_t, tmsize_t, const char*);
-extern tmsize_t _TIFFCastUInt64ToSSize(TIFF*, uint64, const char*);
-extern void* _TIFFCheckMalloc(TIFF*, tmsize_t, tmsize_t, const char*);
-extern void* _TIFFCheckRealloc(TIFF*, void*, tmsize_t, tmsize_t, const char*);
-
-extern double _TIFFUInt64ToDouble(uint64);
-extern float _TIFFUInt64ToFloat(uint64);
-
-extern float _TIFFClampDoubleToFloat(double);
-
-extern tmsize_t
-_TIFFReadEncodedStripAndAllocBuffer(TIFF* tif, uint32 strip,
-                                    void **buf, tmsize_t bufsizetoalloc,
-                                    tmsize_t size_to_read);
-extern tmsize_t
-_TIFFReadEncodedTileAndAllocBuffer(TIFF* tif, uint32 tile,
-                                    void **buf, tmsize_t bufsizetoalloc,
-                                    tmsize_t size_to_read);
-extern tmsize_t
-_TIFFReadTileAndAllocBuffer(TIFF* tif,
-                            void **buf, tmsize_t bufsizetoalloc,
-                            uint32 x, uint32 y, uint32 z, uint16 s);
-extern int _TIFFSeekOK(TIFF* tif, toff_t off);
-
-extern int TIFFInitDumpMode(TIFF*, int);
+extern "C"
+{
+#endif
+    extern int _TIFFgetMode(TIFFOpenOptions *opts, thandle_t clientdata,
+                            const char *mode, const char *module);
+    extern int _TIFFNoRowEncode(TIFF *tif, uint8_t *pp, tmsize_t cc,
+                                uint16_t s);
+    extern int _TIFFNoStripEncode(TIFF *tif, uint8_t *pp, tmsize_t cc,
+                                  uint16_t s);
+    extern int _TIFFNoTileEncode(TIFF *, uint8_t *pp, tmsize_t cc, uint16_t s);
+    extern int _TIFFNoRowDecode(TIFF *tif, uint8_t *pp, tmsize_t cc,
+                                uint16_t s);
+    extern int _TIFFNoStripDecode(TIFF *tif, uint8_t *pp, tmsize_t cc,
+                                  uint16_t s);
+    extern int _TIFFNoTileDecode(TIFF *, uint8_t *pp, tmsize_t cc, uint16_t s);
+    extern void _TIFFNoPostDecode(TIFF *tif, uint8_t *buf, tmsize_t cc);
+    extern int _TIFFNoPreCode(TIFF *tif, uint16_t s);
+    extern int _TIFFNoSeek(TIFF *tif, uint32_t off);
+    extern void _TIFFSwab16BitData(TIFF *tif, uint8_t *buf, tmsize_t cc);
+    extern void _TIFFSwab24BitData(TIFF *tif, uint8_t *buf, tmsize_t cc);
+    extern void _TIFFSwab32BitData(TIFF *tif, uint8_t *buf, tmsize_t cc);
+    extern void _TIFFSwab64BitData(TIFF *tif, uint8_t *buf, tmsize_t cc);
+    extern int TIFFFlushData1(TIFF *tif);
+    extern int TIFFDefaultDirectory(TIFF *tif);
+    extern void _TIFFSetDefaultCompressionState(TIFF *tif);
+    extern int _TIFFRewriteField(TIFF *, uint16_t, TIFFDataType, tmsize_t,
+                                 void *);
+    extern int TIFFSetCompressionScheme(TIFF *tif, int scheme);
+    extern int TIFFSetDefaultCompressionState(TIFF *tif);
+    extern uint32_t _TIFFDefaultStripSize(TIFF *tif, uint32_t s);
+    extern void _TIFFDefaultTileSize(TIFF *tif, uint32_t *tw, uint32_t *th);
+
+    extern void _TIFFsetByteArray(void **, const void *, uint32_t);
+    extern void _TIFFsetByteArrayExt(TIFF *, void **, const void *, uint32_t);
+    extern void _TIFFsetShortArray(uint16_t **, const uint16_t *, uint32_t);
+    extern void _TIFFsetShortArrayExt(TIFF *, uint16_t **, const uint16_t *,
+                                      uint32_t);
+    extern void _TIFFsetLongArray(uint32_t **, const uint32_t *, uint32_t);
+    extern void _TIFFsetLongArrayExt(TIFF *, uint32_t **, const uint32_t *,
+                                     uint32_t);
+    extern void _TIFFsetFloatArray(float **, const float *, uint32_t);
+    extern void _TIFFsetFloatArrayExt(TIFF *, float **, const float *,
+                                      uint32_t);
+    extern void _TIFFsetDoubleArray(double **, const double *, uint32_t);
+    extern void _TIFFsetDoubleArrayExt(TIFF *, double **, const double *,
+                                       uint32_t);
+
+    extern void _TIFFprintAscii(FILE *, const char *);
+    extern void _TIFFprintAsciiTag(FILE *, const char *, const char *);
+
+    extern TIFFErrorHandler _TIFFwarningHandler;
+    extern TIFFErrorHandler _TIFFerrorHandler;
+    extern TIFFErrorHandlerExt _TIFFwarningHandlerExt;
+    extern TIFFErrorHandlerExt _TIFFerrorHandlerExt;
+    void _TIFFErrorEarly(TIFFOpenOptions *opts, thandle_t clientdata,
+                         const char *module, const char *fmt, ...)
+        TIFF_ATTRIBUTE((__format__(__printf__, 4, 5)));
+
+    extern uint32_t _TIFFMultiply32(TIFF *, uint32_t, uint32_t, const char *);
+    extern uint64_t _TIFFMultiply64(TIFF *, uint64_t, uint64_t, const char *);
+    extern tmsize_t _TIFFMultiplySSize(TIFF *, tmsize_t, tmsize_t,
+                                       const char *);
+    extern tmsize_t _TIFFCastUInt64ToSSize(TIFF *, uint64_t, const char *);
+    extern void *_TIFFCheckMalloc(TIFF *, tmsize_t, tmsize_t, const char *);
+    extern void *_TIFFCheckRealloc(TIFF *, void *, tmsize_t, tmsize_t,
+                                   const char *);
+
+    extern double _TIFFUInt64ToDouble(uint64_t);
+    extern float _TIFFUInt64ToFloat(uint64_t);
+
+    extern float _TIFFClampDoubleToFloat(double);
+    extern uint32_t _TIFFClampDoubleToUInt32(double);
+
+    extern void _TIFFCleanupIFDOffsetAndNumberMaps(TIFF *tif);
+
+    extern tmsize_t _TIFFReadEncodedStripAndAllocBuffer(TIFF *tif,
+                                                        uint32_t strip,
+                                                        void **buf,
+                                                        tmsize_t bufsizetoalloc,
+                                                        tmsize_t size_to_read);
+    extern tmsize_t _TIFFReadEncodedTileAndAllocBuffer(TIFF *tif, uint32_t tile,
+                                                       void **buf,
+                                                       tmsize_t bufsizetoalloc,
+                                                       tmsize_t size_to_read);
+    extern tmsize_t _TIFFReadTileAndAllocBuffer(TIFF *tif, void **buf,
+                                                tmsize_t bufsizetoalloc,
+                                                uint32_t x, uint32_t y,
+                                                uint32_t z, uint16_t s);
+    extern int _TIFFSeekOK(TIFF *tif, toff_t off);
+
+    extern int TIFFInitDumpMode(TIFF *, int);
 #ifdef PACKBITS_SUPPORT
-extern int TIFFInitPackBits(TIFF*, int);
+    extern int TIFFInitPackBits(TIFF *, int);
 #endif
 #ifdef CCITT_SUPPORT
-extern int TIFFInitCCITTRLE(TIFF*, int), TIFFInitCCITTRLEW(TIFF*, int);
-extern int TIFFInitCCITTFax3(TIFF*, int), TIFFInitCCITTFax4(TIFF*, int);
+    extern int TIFFInitCCITTRLE(TIFF *, int), TIFFInitCCITTRLEW(TIFF *, int);
+    extern int TIFFInitCCITTFax3(TIFF *, int), TIFFInitCCITTFax4(TIFF *, int);
 #endif
 #ifdef THUNDER_SUPPORT
-extern int TIFFInitThunderScan(TIFF*, int);
+    extern int TIFFInitThunderScan(TIFF *, int);
 #endif
 #ifdef NEXT_SUPPORT
-extern int TIFFInitNeXT(TIFF*, int);
+    extern int TIFFInitNeXT(TIFF *, int);
 #endif
 #ifdef LZW_SUPPORT
-extern int TIFFInitLZW(TIFF*, int);
+    extern int TIFFInitLZW(TIFF *, int);
 #endif
 #ifdef OJPEG_SUPPORT
-extern int TIFFInitOJPEG(TIFF*, int);
+    extern int TIFFInitOJPEG(TIFF *, int);
 #endif
 #ifdef JPEG_SUPPORT
-extern int TIFFInitJPEG(TIFF*, int);
-extern int TIFFJPEGIsFullStripRequired(TIFF*);
+    extern int TIFFInitJPEG(TIFF *, int);
+    extern int TIFFJPEGIsFullStripRequired(TIFF *);
 #endif
 #ifdef JBIG_SUPPORT
-extern int TIFFInitJBIG(TIFF*, int);
+    extern int TIFFInitJBIG(TIFF *, int);
 #endif
 #ifdef ZIP_SUPPORT
-extern int TIFFInitZIP(TIFF*, int);
+    extern int TIFFInitZIP(TIFF *, int);
 #endif
 #ifdef PIXARLOG_SUPPORT
-extern int TIFFInitPixarLog(TIFF*, int);
+    extern int TIFFInitPixarLog(TIFF *, int);
 #endif
 #ifdef LOGLUV_SUPPORT
-extern int TIFFInitSGILog(TIFF*, int);
+    extern int TIFFInitSGILog(TIFF *, int);
+#endif
+#ifdef LERC_SUPPORT
+    extern int TIFFInitLERC(TIFF *tif, int);
 #endif
 #ifdef LZMA_SUPPORT
-extern int TIFFInitLZMA(TIFF*, int);
+    extern int TIFFInitLZMA(TIFF *, int);
 #endif
 #ifdef ZSTD_SUPPORT
-extern int TIFFInitZSTD(TIFF*, int);
+    extern int TIFFInitZSTD(TIFF *, int);
 #endif
 #ifdef WEBP_SUPPORT
-extern int TIFFInitWebP(TIFF*, int);
-#endif
-#ifdef VMS
-extern const TIFFCodec _TIFFBuiltinCODECS[];
-#else
-extern TIFFCodec _TIFFBuiltinCODECS[];
+    extern int TIFFInitWebP(TIFF *, int);
 #endif
+    extern const TIFFCodec _TIFFBuiltinCODECS[];
+    extern void TIFFCIELab16ToXYZ(TIFFCIELabToRGB *, uint32_t l, int32_t a,
+                                  int32_t b, float *, float *, float *);
+
+    extern void *_TIFFmallocExt(TIFF *tif, tmsize_t s);
+    extern void *_TIFFcallocExt(TIFF *tif, tmsize_t nmemb, tmsize_t siz);
+    extern void *_TIFFreallocExt(TIFF *tif, void *p, tmsize_t s);
+    extern void _TIFFfreeExt(TIFF *tif, void *p);
 
 #if defined(__cplusplus)
 }
 #endif
 #endif /* _TIFFIOP_ */
-
-/* vim: set ts=8 sts=8 sw=8 noet: */
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libtiff/tiffvers.h b/3rdparty/libtiff/tiffvers.h
deleted file mode 100644
index 0cce798b8378..000000000000
--- a/3rdparty/libtiff/tiffvers.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#define TIFFLIB_VERSION_STR "LIBTIFF, Version 4.2.0\nCopyright (c) 1988-1996 Sam Leffler\nCopyright (c) 1991-1996 Silicon Graphics, Inc."
-/*
- * This define can be used in code that requires
- * compilation-related definitions specific to a
- * version or versions of the library.  Runtime
- * version checking should be done based on the
- * string returned by TIFFGetVersion.
- */
-#define TIFFLIB_VERSION 20201219
diff --git a/3rdparty/libtiff/tiffvers.h.cmake.in b/3rdparty/libtiff/tiffvers.h.cmake.in
new file mode 100644
index 000000000000..28984998072c
--- /dev/null
+++ b/3rdparty/libtiff/tiffvers.h.cmake.in
@@ -0,0 +1,36 @@
+/* tiffvers.h version information is updated according to version information
+ * in configure.ac */
+
+/* clang-format off */
+
+/* clang-format disabled because FindTIFF.cmake is very sensitive to the
+ * formatting of below line being a single line.
+ * Furthermore, configure_file variables of type "@VAR@" are
+ * modified by clang-format and won't be substituted by CMake.
+ */
+#define TIFFLIB_VERSION_STR "LIBTIFF, Version @LIBTIFF_VERSION@\nCopyright (c) 1988-1996 Sam Leffler\nCopyright (c) 1991-1996 Silicon Graphics, Inc."
+/*
+ * This define can be used in code that requires
+ * compilation-related definitions specific to a
+ * version or versions of the library.  Runtime
+ * version checking should be done based on the
+ * string returned by TIFFGetVersion.
+ */
+#define TIFFLIB_VERSION @LIBTIFF_RELEASE_DATE@
+
+/* The following defines have been added in 4.5.0 */
+#define TIFFLIB_MAJOR_VERSION @LIBTIFF_MAJOR_VERSION@
+#define TIFFLIB_MINOR_VERSION @LIBTIFF_MINOR_VERSION@
+#define TIFFLIB_MICRO_VERSION @LIBTIFF_MICRO_VERSION@
+#define TIFFLIB_VERSION_STR_MAJ_MIN_MIC "@LIBTIFF_VERSION@"
+
+/* Macro added in 4.5.0. Returns TRUE if the current libtiff version is
+ * greater or equal to major.minor.micro
+ */
+#define TIFFLIB_AT_LEAST(major, minor, micro) \
+    (TIFFLIB_MAJOR_VERSION > (major) || \
+     (TIFFLIB_MAJOR_VERSION == (major) && TIFFLIB_MINOR_VERSION > (minor)) || \
+     (TIFFLIB_MAJOR_VERSION == (major) && TIFFLIB_MINOR_VERSION == (minor) && \
+      TIFFLIB_MICRO_VERSION >= (micro)))
+
+/* clang-format on */
diff --git a/3rdparty/libtiff/uvcode.h b/3rdparty/libtiff/uvcode.h
index 6286cfbb0c17..fc877292449d 100644
--- a/3rdparty/libtiff/uvcode.h
+++ b/3rdparty/libtiff/uvcode.h
@@ -1,180 +1,93 @@
 /* Version 1.0 generated April 7, 1997 by Greg Ward Larson, SGI */
-#define UV_SQSIZ	(float)0.003500
-#define UV_NDIVS	16289
-#define UV_VSTART	(float)0.016940
-#define UV_NVS		163
-static const struct {
-	float	ustart;
-	short	nus, ncum;
-}	uv_row[UV_NVS] = {
-	{ (float)0.247663,	4,	0 },
-	{ (float)0.243779,	6,	4 },
-	{ (float)0.241684,	7,	10 },
-	{ (float)0.237874,	9,	17 },
-	{ (float)0.235906,	10,	26 },
-	{ (float)0.232153,	12,	36 },
-	{ (float)0.228352,	14,	48 },
-	{ (float)0.226259,	15,	62 },
-	{ (float)0.222371,	17,	77 },
-	{ (float)0.220410,	18,	94 },
-	{ (float)0.214710,	21,	112 },
-	{ (float)0.212714,	22,	133 },
-	{ (float)0.210721,	23,	155 },
-	{ (float)0.204976,	26,	178 },
-	{ (float)0.202986,	27,	204 },
-	{ (float)0.199245,	29,	231 },
-	{ (float)0.195525,	31,	260 },
-	{ (float)0.193560,	32,	291 },
-	{ (float)0.189878,	34,	323 },
-	{ (float)0.186216,	36,	357 },
-	{ (float)0.186216,	36,	393 },
-	{ (float)0.182592,	38,	429 },
-	{ (float)0.179003,	40,	467 },
-	{ (float)0.175466,	42,	507 },
-	{ (float)0.172001,	44,	549 },
-	{ (float)0.172001,	44,	593 },
-	{ (float)0.168612,	46,	637 },
-	{ (float)0.168612,	46,	683 },
-	{ (float)0.163575,	49,	729 },
-	{ (float)0.158642,	52,	778 },
-	{ (float)0.158642,	52,	830 },
-	{ (float)0.158642,	52,	882 },
-	{ (float)0.153815,	55,	934 },
-	{ (float)0.153815,	55,	989 },
-	{ (float)0.149097,	58,	1044 },
-	{ (float)0.149097,	58,	1102 },
-	{ (float)0.142746,	62,	1160 },
-	{ (float)0.142746,	62,	1222 },
-	{ (float)0.142746,	62,	1284 },
-	{ (float)0.138270,	65,	1346 },
-	{ (float)0.138270,	65,	1411 },
-	{ (float)0.138270,	65,	1476 },
-	{ (float)0.132166,	69,	1541 },
-	{ (float)0.132166,	69,	1610 },
-	{ (float)0.126204,	73,	1679 },
-	{ (float)0.126204,	73,	1752 },
-	{ (float)0.126204,	73,	1825 },
-	{ (float)0.120381,	77,	1898 },
-	{ (float)0.120381,	77,	1975 },
-	{ (float)0.120381,	77,	2052 },
-	{ (float)0.120381,	77,	2129 },
-	{ (float)0.112962,	82,	2206 },
-	{ (float)0.112962,	82,	2288 },
-	{ (float)0.112962,	82,	2370 },
-	{ (float)0.107450,	86,	2452 },
-	{ (float)0.107450,	86,	2538 },
-	{ (float)0.107450,	86,	2624 },
-	{ (float)0.107450,	86,	2710 },
-	{ (float)0.100343,	91,	2796 },
-	{ (float)0.100343,	91,	2887 },
-	{ (float)0.100343,	91,	2978 },
-	{ (float)0.095126,	95,	3069 },
-	{ (float)0.095126,	95,	3164 },
-	{ (float)0.095126,	95,	3259 },
-	{ (float)0.095126,	95,	3354 },
-	{ (float)0.088276,	100,	3449 },
-	{ (float)0.088276,	100,	3549 },
-	{ (float)0.088276,	100,	3649 },
-	{ (float)0.088276,	100,	3749 },
-	{ (float)0.081523,	105,	3849 },
-	{ (float)0.081523,	105,	3954 },
-	{ (float)0.081523,	105,	4059 },
-	{ (float)0.081523,	105,	4164 },
-	{ (float)0.074861,	110,	4269 },
-	{ (float)0.074861,	110,	4379 },
-	{ (float)0.074861,	110,	4489 },
-	{ (float)0.074861,	110,	4599 },
-	{ (float)0.068290,	115,	4709 },
-	{ (float)0.068290,	115,	4824 },
-	{ (float)0.068290,	115,	4939 },
-	{ (float)0.068290,	115,	5054 },
-	{ (float)0.063573,	119,	5169 },
-	{ (float)0.063573,	119,	5288 },
-	{ (float)0.063573,	119,	5407 },
-	{ (float)0.063573,	119,	5526 },
-	{ (float)0.057219,	124,	5645 },
-	{ (float)0.057219,	124,	5769 },
-	{ (float)0.057219,	124,	5893 },
-	{ (float)0.057219,	124,	6017 },
-	{ (float)0.050985,	129,	6141 },
-	{ (float)0.050985,	129,	6270 },
-	{ (float)0.050985,	129,	6399 },
-	{ (float)0.050985,	129,	6528 },
-	{ (float)0.050985,	129,	6657 },
-	{ (float)0.044859,	134,	6786 },
-	{ (float)0.044859,	134,	6920 },
-	{ (float)0.044859,	134,	7054 },
-	{ (float)0.044859,	134,	7188 },
-	{ (float)0.040571,	138,	7322 },
-	{ (float)0.040571,	138,	7460 },
-	{ (float)0.040571,	138,	7598 },
-	{ (float)0.040571,	138,	7736 },
-	{ (float)0.036339,	142,	7874 },
-	{ (float)0.036339,	142,	8016 },
-	{ (float)0.036339,	142,	8158 },
-	{ (float)0.036339,	142,	8300 },
-	{ (float)0.032139,	146,	8442 },
-	{ (float)0.032139,	146,	8588 },
-	{ (float)0.032139,	146,	8734 },
-	{ (float)0.032139,	146,	8880 },
-	{ (float)0.027947,	150,	9026 },
-	{ (float)0.027947,	150,	9176 },
-	{ (float)0.027947,	150,	9326 },
-	{ (float)0.023739,	154,	9476 },
-	{ (float)0.023739,	154,	9630 },
-	{ (float)0.023739,	154,	9784 },
-	{ (float)0.023739,	154,	9938 },
-	{ (float)0.019504,	158,	10092 },
-	{ (float)0.019504,	158,	10250 },
-	{ (float)0.019504,	158,	10408 },
-	{ (float)0.016976,	161,	10566 },
-	{ (float)0.016976,	161,	10727 },
-	{ (float)0.016976,	161,	10888 },
-	{ (float)0.016976,	161,	11049 },
-	{ (float)0.012639,	165,	11210 },
-	{ (float)0.012639,	165,	11375 },
-	{ (float)0.012639,	165,	11540 },
-	{ (float)0.009991,	168,	11705 },
-	{ (float)0.009991,	168,	11873 },
-	{ (float)0.009991,	168,	12041 },
-	{ (float)0.009016,	170,	12209 },
-	{ (float)0.009016,	170,	12379 },
-	{ (float)0.009016,	170,	12549 },
-	{ (float)0.006217,	173,	12719 },
-	{ (float)0.006217,	173,	12892 },
-	{ (float)0.005097,	175,	13065 },
-	{ (float)0.005097,	175,	13240 },
-	{ (float)0.005097,	175,	13415 },
-	{ (float)0.003909,	177,	13590 },
-	{ (float)0.003909,	177,	13767 },
-	{ (float)0.002340,	177,	13944 },
-	{ (float)0.002389,	170,	14121 },
-	{ (float)0.001068,	164,	14291 },
-	{ (float)0.001653,	157,	14455 },
-	{ (float)0.000717,	150,	14612 },
-	{ (float)0.001614,	143,	14762 },
-	{ (float)0.000270,	136,	14905 },
-	{ (float)0.000484,	129,	15041 },
-	{ (float)0.001103,	123,	15170 },
-	{ (float)0.001242,	115,	15293 },
-	{ (float)0.001188,	109,	15408 },
-	{ (float)0.001011,	103,	15517 },
-	{ (float)0.000709,	97,	15620 },
-	{ (float)0.000301,	89,	15717 },
-	{ (float)0.002416,	82,	15806 },
-	{ (float)0.003251,	76,	15888 },
-	{ (float)0.003246,	69,	15964 },
-	{ (float)0.004141,	62,	16033 },
-	{ (float)0.005963,	55,	16095 },
-	{ (float)0.008839,	47,	16150 },
-	{ (float)0.010490,	40,	16197 },
-	{ (float)0.016994,	31,	16237 },
-	{ (float)0.023659,	21,	16268 },
+#define UV_SQSIZ (float)0.003500
+#define UV_NDIVS 16289
+#define UV_VSTART (float)0.016940
+#define UV_NVS 163
+static const struct
+{
+    float ustart;
+    short nus, ncum;
+} uv_row[UV_NVS] = {
+    {(float)0.247663, 4, 0},       {(float)0.243779, 6, 4},
+    {(float)0.241684, 7, 10},      {(float)0.237874, 9, 17},
+    {(float)0.235906, 10, 26},     {(float)0.232153, 12, 36},
+    {(float)0.228352, 14, 48},     {(float)0.226259, 15, 62},
+    {(float)0.222371, 17, 77},     {(float)0.220410, 18, 94},
+    {(float)0.214710, 21, 112},    {(float)0.212714, 22, 133},
+    {(float)0.210721, 23, 155},    {(float)0.204976, 26, 178},
+    {(float)0.202986, 27, 204},    {(float)0.199245, 29, 231},
+    {(float)0.195525, 31, 260},    {(float)0.193560, 32, 291},
+    {(float)0.189878, 34, 323},    {(float)0.186216, 36, 357},
+    {(float)0.186216, 36, 393},    {(float)0.182592, 38, 429},
+    {(float)0.179003, 40, 467},    {(float)0.175466, 42, 507},
+    {(float)0.172001, 44, 549},    {(float)0.172001, 44, 593},
+    {(float)0.168612, 46, 637},    {(float)0.168612, 46, 683},
+    {(float)0.163575, 49, 729},    {(float)0.158642, 52, 778},
+    {(float)0.158642, 52, 830},    {(float)0.158642, 52, 882},
+    {(float)0.153815, 55, 934},    {(float)0.153815, 55, 989},
+    {(float)0.149097, 58, 1044},   {(float)0.149097, 58, 1102},
+    {(float)0.142746, 62, 1160},   {(float)0.142746, 62, 1222},
+    {(float)0.142746, 62, 1284},   {(float)0.138270, 65, 1346},
+    {(float)0.138270, 65, 1411},   {(float)0.138270, 65, 1476},
+    {(float)0.132166, 69, 1541},   {(float)0.132166, 69, 1610},
+    {(float)0.126204, 73, 1679},   {(float)0.126204, 73, 1752},
+    {(float)0.126204, 73, 1825},   {(float)0.120381, 77, 1898},
+    {(float)0.120381, 77, 1975},   {(float)0.120381, 77, 2052},
+    {(float)0.120381, 77, 2129},   {(float)0.112962, 82, 2206},
+    {(float)0.112962, 82, 2288},   {(float)0.112962, 82, 2370},
+    {(float)0.107450, 86, 2452},   {(float)0.107450, 86, 2538},
+    {(float)0.107450, 86, 2624},   {(float)0.107450, 86, 2710},
+    {(float)0.100343, 91, 2796},   {(float)0.100343, 91, 2887},
+    {(float)0.100343, 91, 2978},   {(float)0.095126, 95, 3069},
+    {(float)0.095126, 95, 3164},   {(float)0.095126, 95, 3259},
+    {(float)0.095126, 95, 3354},   {(float)0.088276, 100, 3449},
+    {(float)0.088276, 100, 3549},  {(float)0.088276, 100, 3649},
+    {(float)0.088276, 100, 3749},  {(float)0.081523, 105, 3849},
+    {(float)0.081523, 105, 3954},  {(float)0.081523, 105, 4059},
+    {(float)0.081523, 105, 4164},  {(float)0.074861, 110, 4269},
+    {(float)0.074861, 110, 4379},  {(float)0.074861, 110, 4489},
+    {(float)0.074861, 110, 4599},  {(float)0.068290, 115, 4709},
+    {(float)0.068290, 115, 4824},  {(float)0.068290, 115, 4939},
+    {(float)0.068290, 115, 5054},  {(float)0.063573, 119, 5169},
+    {(float)0.063573, 119, 5288},  {(float)0.063573, 119, 5407},
+    {(float)0.063573, 119, 5526},  {(float)0.057219, 124, 5645},
+    {(float)0.057219, 124, 5769},  {(float)0.057219, 124, 5893},
+    {(float)0.057219, 124, 6017},  {(float)0.050985, 129, 6141},
+    {(float)0.050985, 129, 6270},  {(float)0.050985, 129, 6399},
+    {(float)0.050985, 129, 6528},  {(float)0.050985, 129, 6657},
+    {(float)0.044859, 134, 6786},  {(float)0.044859, 134, 6920},
+    {(float)0.044859, 134, 7054},  {(float)0.044859, 134, 7188},
+    {(float)0.040571, 138, 7322},  {(float)0.040571, 138, 7460},
+    {(float)0.040571, 138, 7598},  {(float)0.040571, 138, 7736},
+    {(float)0.036339, 142, 7874},  {(float)0.036339, 142, 8016},
+    {(float)0.036339, 142, 8158},  {(float)0.036339, 142, 8300},
+    {(float)0.032139, 146, 8442},  {(float)0.032139, 146, 8588},
+    {(float)0.032139, 146, 8734},  {(float)0.032139, 146, 8880},
+    {(float)0.027947, 150, 9026},  {(float)0.027947, 150, 9176},
+    {(float)0.027947, 150, 9326},  {(float)0.023739, 154, 9476},
+    {(float)0.023739, 154, 9630},  {(float)0.023739, 154, 9784},
+    {(float)0.023739, 154, 9938},  {(float)0.019504, 158, 10092},
+    {(float)0.019504, 158, 10250}, {(float)0.019504, 158, 10408},
+    {(float)0.016976, 161, 10566}, {(float)0.016976, 161, 10727},
+    {(float)0.016976, 161, 10888}, {(float)0.016976, 161, 11049},
+    {(float)0.012639, 165, 11210}, {(float)0.012639, 165, 11375},
+    {(float)0.012639, 165, 11540}, {(float)0.009991, 168, 11705},
+    {(float)0.009991, 168, 11873}, {(float)0.009991, 168, 12041},
+    {(float)0.009016, 170, 12209}, {(float)0.009016, 170, 12379},
+    {(float)0.009016, 170, 12549}, {(float)0.006217, 173, 12719},
+    {(float)0.006217, 173, 12892}, {(float)0.005097, 175, 13065},
+    {(float)0.005097, 175, 13240}, {(float)0.005097, 175, 13415},
+    {(float)0.003909, 177, 13590}, {(float)0.003909, 177, 13767},
+    {(float)0.002340, 177, 13944}, {(float)0.002389, 170, 14121},
+    {(float)0.001068, 164, 14291}, {(float)0.001653, 157, 14455},
+    {(float)0.000717, 150, 14612}, {(float)0.001614, 143, 14762},
+    {(float)0.000270, 136, 14905}, {(float)0.000484, 129, 15041},
+    {(float)0.001103, 123, 15170}, {(float)0.001242, 115, 15293},
+    {(float)0.001188, 109, 15408}, {(float)0.001011, 103, 15517},
+    {(float)0.000709, 97, 15620},  {(float)0.000301, 89, 15717},
+    {(float)0.002416, 82, 15806},  {(float)0.003251, 76, 15888},
+    {(float)0.003246, 69, 15964},  {(float)0.004141, 62, 16033},
+    {(float)0.005963, 55, 16095},  {(float)0.008839, 47, 16150},
+    {(float)0.010490, 40, 16197},  {(float)0.016994, 31, 16237},
+    {(float)0.023659, 21, 16268},
 };
-/*
- * Local Variables:
- * mode: c
- * c-basic-offset: 8
- * fill-column: 78
- * End:
- */
diff --git a/3rdparty/libwebp/CMakeLists.txt b/3rdparty/libwebp/CMakeLists.txt
index 723575c8db3d..f3b6ebd0d620 100644
--- a/3rdparty/libwebp/CMakeLists.txt
+++ b/3rdparty/libwebp/CMakeLists.txt
@@ -9,8 +9,8 @@ if(ANDROID)
   ocv_include_directories(${CPUFEATURES_INCLUDE_DIRS})
 endif()
 
-file(GLOB lib_srcs src/dec/*.c src/demux/*.c src/dsp/*.c src/enc/*.c src/mux/*.c src/utils/*.c src/webp/*.c)
-file(GLOB lib_hdrs src/dec/*.h src/demux/*.h src/dsp/*.h src/enc/*.h src/mux/*.h src/utils/*.h src/webp/*.h)
+file(GLOB lib_srcs sharpyuv/*.c src/dec/*.c src/demux/*.c src/dsp/*.c src/enc/*.c src/mux/*.c src/utils/*.c src/webp/*.c)
+file(GLOB lib_hdrs sharpyuv/*.h src/dec/*.h src/demux/*.h src/dsp/*.h src/enc/*.h src/mux/*.h src/utils/*.h src/webp/*.h)
 
 # FIXIT
 if(ANDROID AND ARMEABI_V7A AND NOT NEON)
@@ -21,12 +21,6 @@ if(ANDROID AND ARMEABI_V7A AND NOT NEON)
   endforeach()
 endif()
 
-# FIX for quant.h - requires C99 for() loops
-ocv_check_flag_support(C "-std=c99" _varname "${CMAKE_C_FLAGS}")
-if(${_varname})
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
-endif()
-
 
 # ----------------------------------------------------------------------------------
 #         Define the library target:
diff --git a/3rdparty/libwebp/patches/20190910-msa-asm-patch.diff b/3rdparty/libwebp/patches/20190910-msa-asm-patch.diff
deleted file mode 100644
index 1be213520312..000000000000
--- a/3rdparty/libwebp/patches/20190910-msa-asm-patch.diff
+++ /dev/null
@@ -1,22 +0,0 @@
-diff --git a/3rdparty/libwebp/src/dsp/msa_macro.h b/3rdparty/libwebp/src/dsp/msa_macro.h
-index de026a1d9e..a16c0bb300 100644
---- a/3rdparty/libwebp/src/dsp/msa_macro.h
-+++ b/3rdparty/libwebp/src/dsp/msa_macro.h
-@@ -73,7 +73,7 @@
-   static inline TYPE FUNC_NAME(const void* const psrc) {  \
-     const uint8_t* const psrc_m = (const uint8_t*)psrc;   \
-     TYPE val_m;                                           \
--    asm volatile (                                        \
-+    __asm__ volatile (                                        \
-       "" #INSTR " %[val_m], %[psrc_m]  \n\t"              \
-       : [val_m] "=r" (val_m)                              \
-       : [psrc_m] "m" (*psrc_m));                          \
-@@ -86,7 +86,7 @@
-   static inline void FUNC_NAME(TYPE val, void* const pdst) { \
-     uint8_t* const pdst_m = (uint8_t*)pdst;                  \
-     TYPE val_m = val;                                        \
--    asm volatile (                                           \
-+    __asm__ volatile (                                           \
-       " " #INSTR "  %[val_m],  %[pdst_m]  \n\t"              \
-       : [pdst_m] "=m" (*pdst_m)                              \
-       : [val_m] "r" (val_m));                                \
diff --git a/3rdparty/libwebp/sharpyuv/sharpyuv.c b/3rdparty/libwebp/sharpyuv/sharpyuv.c
new file mode 100644
index 000000000000..7cbf668fbbb5
--- /dev/null
+++ b/3rdparty/libwebp/sharpyuv/sharpyuv.c
@@ -0,0 +1,574 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Sharp RGB to YUV conversion.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "src/webp/types.h"
+#include "sharpyuv/sharpyuv_cpu.h"
+#include "sharpyuv/sharpyuv_dsp.h"
+#include "sharpyuv/sharpyuv_gamma.h"
+
+//------------------------------------------------------------------------------
+
+int SharpYuvGetVersion(void) {
+  return SHARPYUV_VERSION;
+}
+
+//------------------------------------------------------------------------------
+// Sharp RGB->YUV conversion
+
+static const int kNumIterations = 4;
+
+#define YUV_FIX 16  // fixed-point precision for RGB->YUV
+static const int kYuvHalf = 1 << (YUV_FIX - 1);
+
+// Max bit depth so that intermediate calculations fit in 16 bits.
+static const int kMaxBitDepth = 14;
+
+// Returns the precision shift to use based on the input rgb_bit_depth.
+static int GetPrecisionShift(int rgb_bit_depth) {
+  // Try to add 2 bits of precision if it fits in kMaxBitDepth. Otherwise remove
+  // bits if needed.
+  return ((rgb_bit_depth + 2) <= kMaxBitDepth) ? 2
+                                               : (kMaxBitDepth - rgb_bit_depth);
+}
+
+typedef int16_t fixed_t;      // signed type with extra precision for UV
+typedef uint16_t fixed_y_t;   // unsigned type with extra precision for W
+
+//------------------------------------------------------------------------------
+
+static uint8_t clip_8b(fixed_t v) {
+  return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
+}
+
+static uint16_t clip(fixed_t v, int max) {
+  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static fixed_y_t clip_bit_depth(int y, int bit_depth) {
+  const int max = (1 << bit_depth) - 1;
+  return (!(y & ~max)) ? (fixed_y_t)y : (y < 0) ? 0 : max;
+}
+
+//------------------------------------------------------------------------------
+
+static int RGBToGray(int64_t r, int64_t g, int64_t b) {
+  const int64_t luma = 13933 * r + 46871 * g + 4732 * b + kYuvHalf;
+  return (int)(luma >> YUV_FIX);
+}
+
+static uint32_t ScaleDown(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
+                          int rgb_bit_depth,
+                          SharpYuvTransferFunctionType transfer_type) {
+  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+  const uint32_t A = SharpYuvGammaToLinear(a, bit_depth, transfer_type);
+  const uint32_t B = SharpYuvGammaToLinear(b, bit_depth, transfer_type);
+  const uint32_t C = SharpYuvGammaToLinear(c, bit_depth, transfer_type);
+  const uint32_t D = SharpYuvGammaToLinear(d, bit_depth, transfer_type);
+  return SharpYuvLinearToGamma((A + B + C + D + 2) >> 2, bit_depth,
+                               transfer_type);
+}
+
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w,
+                                int rgb_bit_depth,
+                                SharpYuvTransferFunctionType transfer_type) {
+  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+  int i = 0;
+  do {
+    const uint32_t R =
+        SharpYuvGammaToLinear(src[0 * w + i], bit_depth, transfer_type);
+    const uint32_t G =
+        SharpYuvGammaToLinear(src[1 * w + i], bit_depth, transfer_type);
+    const uint32_t B =
+        SharpYuvGammaToLinear(src[2 * w + i], bit_depth, transfer_type);
+    const uint32_t Y = RGBToGray(R, G, B);
+    dst[i] = (fixed_y_t)SharpYuvLinearToGamma(Y, bit_depth, transfer_type);
+  } while (++i < w);
+}
+
+static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
+                         fixed_t* dst, int uv_w, int rgb_bit_depth,
+                         SharpYuvTransferFunctionType transfer_type) {
+  int i = 0;
+  do {
+    const int r =
+        ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1], src2[0 * uv_w + 0],
+                  src2[0 * uv_w + 1], rgb_bit_depth, transfer_type);
+    const int g =
+        ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1], src2[2 * uv_w + 0],
+                  src2[2 * uv_w + 1], rgb_bit_depth, transfer_type);
+    const int b =
+        ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1], src2[4 * uv_w + 0],
+                  src2[4 * uv_w + 1], rgb_bit_depth, transfer_type);
+    const int W = RGBToGray(r, g, b);
+    dst[0 * uv_w] = (fixed_t)(r - W);
+    dst[1 * uv_w] = (fixed_t)(g - W);
+    dst[2 * uv_w] = (fixed_t)(b - W);
+    dst  += 1;
+    src1 += 2;
+    src2 += 2;
+  } while (++i < uv_w);
+}
+
+static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
+  int i = 0;
+  assert(w > 0);
+  do {
+    y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
+  } while (++i < w);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0, int bit_depth) {
+  const int v0 = (A * 3 + B + 2) >> 2;
+  return clip_bit_depth(v0 + W0, bit_depth);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int Shift(int v, int shift) {
+  return (shift >= 0) ? (v << shift) : (v >> -shift);
+}
+
+static void ImportOneRow(const uint8_t* const r_ptr,
+                         const uint8_t* const g_ptr,
+                         const uint8_t* const b_ptr,
+                         int rgb_step,
+                         int rgb_bit_depth,
+                         int pic_width,
+                         fixed_y_t* const dst) {
+  // Convert the rgb_step from a number of bytes to a number of uint8_t or
+  // uint16_t values depending the bit depth.
+  const int step = (rgb_bit_depth > 8) ? rgb_step / 2 : rgb_step;
+  int i = 0;
+  const int w = (pic_width + 1) & ~1;
+  do {
+    const int off = i * step;
+    const int shift = GetPrecisionShift(rgb_bit_depth);
+    if (rgb_bit_depth == 8) {
+      dst[i + 0 * w] = Shift(r_ptr[off], shift);
+      dst[i + 1 * w] = Shift(g_ptr[off], shift);
+      dst[i + 2 * w] = Shift(b_ptr[off], shift);
+    } else {
+      dst[i + 0 * w] = Shift(((uint16_t*)r_ptr)[off], shift);
+      dst[i + 1 * w] = Shift(((uint16_t*)g_ptr)[off], shift);
+      dst[i + 2 * w] = Shift(((uint16_t*)b_ptr)[off], shift);
+    }
+  } while (++i < pic_width);
+  if (pic_width & 1) {  // replicate rightmost pixel
+    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
+    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
+    dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
+  }
+}
+
+static void InterpolateTwoRows(const fixed_y_t* const best_y,
+                               const fixed_t* prev_uv,
+                               const fixed_t* cur_uv,
+                               const fixed_t* next_uv,
+                               int w,
+                               fixed_y_t* out1,
+                               fixed_y_t* out2,
+                               int rgb_bit_depth) {
+  const int uv_w = w >> 1;
+  const int len = (w - 1) >> 1;   // length to filter
+  int k = 3;
+  const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+  while (k-- > 0) {   // process each R/G/B segments in turn
+    // special boundary case for i==0
+    out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0], bit_depth);
+    out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w], bit_depth);
+
+    SharpYuvFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1,
+                      bit_depth);
+    SharpYuvFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1,
+                      bit_depth);
+
+    // special boundary case for i == w - 1 when w is even
+    if (!(w & 1)) {
+      out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
+                            best_y[w - 1 + 0], bit_depth);
+      out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
+                            best_y[w - 1 + w], bit_depth);
+    }
+    out1 += w;
+    out2 += w;
+    prev_uv += uv_w;
+    cur_uv  += uv_w;
+    next_uv += uv_w;
+  }
+}
+
+static WEBP_INLINE int RGBToYUVComponent(int r, int g, int b,
+                                         const int coeffs[4], int sfix) {
+  const int srounder = 1 << (YUV_FIX + sfix - 1);
+  const int luma = coeffs[0] * r + coeffs[1] * g + coeffs[2] * b +
+                   coeffs[3] + srounder;
+  return (luma >> (YUV_FIX + sfix));
+}
+
+static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
+                            uint8_t* y_ptr, int y_stride, uint8_t* u_ptr,
+                            int u_stride, uint8_t* v_ptr, int v_stride,
+                            int rgb_bit_depth,
+                            int yuv_bit_depth, int width, int height,
+                            const SharpYuvConversionMatrix* yuv_matrix) {
+  int i, j;
+  const fixed_t* const best_uv_base = best_uv;
+  const int w = (width + 1) & ~1;
+  const int h = (height + 1) & ~1;
+  const int uv_w = w >> 1;
+  const int uv_h = h >> 1;
+  const int sfix = GetPrecisionShift(rgb_bit_depth);
+  const int yuv_max = (1 << yuv_bit_depth) - 1;
+
+  best_uv = best_uv_base;
+  j = 0;
+  do {
+    i = 0;
+    do {
+      const int off = (i >> 1);
+      const int W = best_y[i];
+      const int r = best_uv[off + 0 * uv_w] + W;
+      const int g = best_uv[off + 1 * uv_w] + W;
+      const int b = best_uv[off + 2 * uv_w] + W;
+      const int y = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_y, sfix);
+      if (yuv_bit_depth <= 8) {
+        y_ptr[i] = clip_8b(y);
+      } else {
+        ((uint16_t*)y_ptr)[i] = clip(y, yuv_max);
+      }
+    } while (++i < width);
+    best_y += w;
+    best_uv += (j & 1) * 3 * uv_w;
+    y_ptr += y_stride;
+  } while (++j < height);
+
+  best_uv = best_uv_base;
+  j = 0;
+  do {
+    i = 0;
+    do {
+      // Note r, g and b values here are off by W, but a constant offset on all
+      // 3 components doesn't change the value of u and v with a YCbCr matrix.
+      const int r = best_uv[i + 0 * uv_w];
+      const int g = best_uv[i + 1 * uv_w];
+      const int b = best_uv[i + 2 * uv_w];
+      const int u = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u, sfix);
+      const int v = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v, sfix);
+      if (yuv_bit_depth <= 8) {
+        u_ptr[i] = clip_8b(u);
+        v_ptr[i] = clip_8b(v);
+      } else {
+        ((uint16_t*)u_ptr)[i] = clip(u, yuv_max);
+        ((uint16_t*)v_ptr)[i] = clip(v, yuv_max);
+      }
+    } while (++i < uv_w);
+    best_uv += 3 * uv_w;
+    u_ptr += u_stride;
+    v_ptr += v_stride;
+  } while (++j < uv_h);
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main function
+
+static void* SafeMalloc(uint64_t nmemb, size_t size) {
+  const uint64_t total_size = nmemb * (uint64_t)size;
+  if (total_size != (size_t)total_size) return NULL;
+  return malloc((size_t)total_size);
+}
+
+#define SAFE_ALLOC(W, H, T) ((T*)SafeMalloc((uint64_t)(W) * (H), sizeof(T)))
+
+static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
+                            const uint8_t* b_ptr, int rgb_step, int rgb_stride,
+                            int rgb_bit_depth, uint8_t* y_ptr, int y_stride,
+                            uint8_t* u_ptr, int u_stride, uint8_t* v_ptr,
+                            int v_stride, int yuv_bit_depth, int width,
+                            int height,
+                            const SharpYuvConversionMatrix* yuv_matrix,
+                            SharpYuvTransferFunctionType transfer_type) {
+  // we expand the right/bottom border if needed
+  const int w = (width + 1) & ~1;
+  const int h = (height + 1) & ~1;
+  const int uv_w = w >> 1;
+  const int uv_h = h >> 1;
+  const int y_bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+  uint64_t prev_diff_y_sum = ~0;
+  int j, iter;
+
+  // TODO(skal): allocate one big memory chunk. But for now, it's easier
+  // for valgrind debugging to have several chunks.
+  fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t);   // scratch
+  fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
+  fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+  fixed_y_t* best_y = best_y_base;
+  fixed_y_t* target_y = target_y_base;
+  fixed_t* best_uv = best_uv_base;
+  fixed_t* target_uv = target_uv_base;
+  const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
+  int ok;
+  assert(w > 0);
+  assert(h > 0);
+
+  if (best_y_base == NULL || best_uv_base == NULL ||
+      target_y_base == NULL || target_uv_base == NULL ||
+      best_rgb_y == NULL || best_rgb_uv == NULL ||
+      tmp_buffer == NULL) {
+    ok = 0;
+    goto End;
+  }
+
+  // Import RGB samples to W/RGB representation.
+  for (j = 0; j < height; j += 2) {
+    const int is_last_row = (j == height - 1);
+    fixed_y_t* const src1 = tmp_buffer + 0 * w;
+    fixed_y_t* const src2 = tmp_buffer + 3 * w;
+
+    // prepare two rows of input
+    ImportOneRow(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth, width,
+                 src1);
+    if (!is_last_row) {
+      ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
+                   rgb_step, rgb_bit_depth, width, src2);
+    } else {
+      memcpy(src2, src1, 3 * w * sizeof(*src2));
+    }
+    StoreGray(src1, best_y + 0, w);
+    StoreGray(src2, best_y + w, w);
+
+    UpdateW(src1, target_y, w, rgb_bit_depth, transfer_type);
+    UpdateW(src2, target_y + w, w, rgb_bit_depth, transfer_type);
+    UpdateChroma(src1, src2, target_uv, uv_w, rgb_bit_depth, transfer_type);
+    memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
+    best_y += 2 * w;
+    best_uv += 3 * uv_w;
+    target_y += 2 * w;
+    target_uv += 3 * uv_w;
+    r_ptr += 2 * rgb_stride;
+    g_ptr += 2 * rgb_stride;
+    b_ptr += 2 * rgb_stride;
+  }
+
+  // Iterate and resolve clipping conflicts.
+  for (iter = 0; iter < kNumIterations; ++iter) {
+    const fixed_t* cur_uv = best_uv_base;
+    const fixed_t* prev_uv = best_uv_base;
+    uint64_t diff_y_sum = 0;
+
+    best_y = best_y_base;
+    best_uv = best_uv_base;
+    target_y = target_y_base;
+    target_uv = target_uv_base;
+    j = 0;
+    do {
+      fixed_y_t* const src1 = tmp_buffer + 0 * w;
+      fixed_y_t* const src2 = tmp_buffer + 3 * w;
+      {
+        const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
+        InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w,
+                           src1, src2, rgb_bit_depth);
+        prev_uv = cur_uv;
+        cur_uv = next_uv;
+      }
+
+      UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth, transfer_type);
+      UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth, transfer_type);
+      UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth, transfer_type);
+
+      // update two rows of Y and one row of RGB
+      diff_y_sum +=
+          SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w, y_bit_depth);
+      SharpYuvUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
+
+      best_y += 2 * w;
+      best_uv += 3 * uv_w;
+      target_y += 2 * w;
+      target_uv += 3 * uv_w;
+      j += 2;
+    } while (j < h);
+    // test exit condition
+    if (iter > 0) {
+      if (diff_y_sum < diff_y_threshold) break;
+      if (diff_y_sum > prev_diff_y_sum) break;
+    }
+    prev_diff_y_sum = diff_y_sum;
+  }
+
+  // final reconstruction
+  ok = ConvertWRGBToYUV(best_y_base, best_uv_base, y_ptr, y_stride, u_ptr,
+                        u_stride, v_ptr, v_stride, rgb_bit_depth, yuv_bit_depth,
+                        width, height, yuv_matrix);
+
+ End:
+  free(best_y_base);
+  free(best_uv_base);
+  free(target_y_base);
+  free(target_uv_base);
+  free(best_rgb_y);
+  free(best_rgb_uv);
+  free(tmp_buffer);
+  return ok;
+}
+
+#undef SAFE_ALLOC
+
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h>  // NOLINT
+
+#define LOCK_ACCESS \
+    static pthread_mutex_t sharpyuv_lock = PTHREAD_MUTEX_INITIALIZER; \
+    if (pthread_mutex_lock(&sharpyuv_lock)) return
+#define UNLOCK_ACCESS_AND_RETURN                  \
+    do {                                          \
+      (void)pthread_mutex_unlock(&sharpyuv_lock); \
+      return;                                     \
+    } while (0)
+#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
+#define LOCK_ACCESS do {} while (0)
+#define UNLOCK_ACCESS_AND_RETURN return
+#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
+
+// Hidden exported init function.
+// By default SharpYuvConvert calls it with SharpYuvGetCPUInfo. If needed,
+// users can declare it as extern and call it with an alternate VP8CPUInfo
+// function.
+extern VP8CPUInfo SharpYuvGetCPUInfo;
+SHARPYUV_EXTERN void SharpYuvInit(VP8CPUInfo cpu_info_func);
+void SharpYuvInit(VP8CPUInfo cpu_info_func) {
+  static volatile VP8CPUInfo sharpyuv_last_cpuinfo_used =
+      (VP8CPUInfo)&sharpyuv_last_cpuinfo_used;
+  LOCK_ACCESS;
+  // Only update SharpYuvGetCPUInfo when called from external code to avoid a
+  // race on reading the value in SharpYuvConvert().
+  if (cpu_info_func != (VP8CPUInfo)&SharpYuvGetCPUInfo) {
+    SharpYuvGetCPUInfo = cpu_info_func;
+  }
+  if (sharpyuv_last_cpuinfo_used == SharpYuvGetCPUInfo) {
+    UNLOCK_ACCESS_AND_RETURN;
+  }
+
+  SharpYuvInitDsp();
+  SharpYuvInitGammaTables();
+
+  sharpyuv_last_cpuinfo_used = SharpYuvGetCPUInfo;
+  UNLOCK_ACCESS_AND_RETURN;
+}
+
+int SharpYuvConvert(const void* r_ptr, const void* g_ptr, const void* b_ptr,
+                    int rgb_step, int rgb_stride, int rgb_bit_depth,
+                    void* y_ptr, int y_stride, void* u_ptr, int u_stride,
+                    void* v_ptr, int v_stride, int yuv_bit_depth, int width,
+                    int height, const SharpYuvConversionMatrix* yuv_matrix) {
+  SharpYuvOptions options;
+  options.yuv_matrix = yuv_matrix;
+  options.transfer_type = kSharpYuvTransferFunctionSrgb;
+  return SharpYuvConvertWithOptions(
+      r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride, rgb_bit_depth, y_ptr, y_stride,
+      u_ptr, u_stride, v_ptr, v_stride, yuv_bit_depth, width, height, &options);
+}
+
+int SharpYuvOptionsInitInternal(const SharpYuvConversionMatrix* yuv_matrix,
+                                SharpYuvOptions* options, int version) {
+  const int major = (version >> 24);
+  const int minor = (version >> 16) & 0xff;
+  if (options == NULL || yuv_matrix == NULL ||
+      (major == SHARPYUV_VERSION_MAJOR && major == 0 &&
+       minor != SHARPYUV_VERSION_MINOR) ||
+      (major != SHARPYUV_VERSION_MAJOR)) {
+    return 0;
+  }
+  options->yuv_matrix = yuv_matrix;
+  options->transfer_type = kSharpYuvTransferFunctionSrgb;
+  return 1;
+}
+
+int SharpYuvConvertWithOptions(const void* r_ptr, const void* g_ptr,
+                               const void* b_ptr, int rgb_step, int rgb_stride,
+                               int rgb_bit_depth, void* y_ptr, int y_stride,
+                               void* u_ptr, int u_stride, void* v_ptr,
+                               int v_stride, int yuv_bit_depth, int width,
+                               int height, const SharpYuvOptions* options) {
+  const SharpYuvConversionMatrix* yuv_matrix = options->yuv_matrix;
+  SharpYuvTransferFunctionType transfer_type = options->transfer_type;
+  SharpYuvConversionMatrix scaled_matrix;
+  const int rgb_max = (1 << rgb_bit_depth) - 1;
+  const int rgb_round = 1 << (rgb_bit_depth - 1);
+  const int yuv_max = (1 << yuv_bit_depth) - 1;
+  const int sfix = GetPrecisionShift(rgb_bit_depth);
+
+  if (width < 1 || height < 1 || width == INT_MAX || height == INT_MAX ||
+      r_ptr == NULL || g_ptr == NULL || b_ptr == NULL || y_ptr == NULL ||
+      u_ptr == NULL || v_ptr == NULL) {
+    return 0;
+  }
+  if (rgb_bit_depth != 8 && rgb_bit_depth != 10 && rgb_bit_depth != 12 &&
+      rgb_bit_depth != 16) {
+    return 0;
+  }
+  if (yuv_bit_depth != 8 && yuv_bit_depth != 10 && yuv_bit_depth != 12) {
+    return 0;
+  }
+  if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride % 2 != 0)) {
+    // Step/stride should be even for uint16_t buffers.
+    return 0;
+  }
+  if (yuv_bit_depth > 8 &&
+      (y_stride % 2 != 0 || u_stride % 2 != 0 || v_stride % 2 != 0)) {
+    // Stride should be even for uint16_t buffers.
+    return 0;
+  }
+  // The address of the function pointer is used to avoid a read race.
+  SharpYuvInit((VP8CPUInfo)&SharpYuvGetCPUInfo);
+
+  // Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the
+  // rgb->yuv conversion matrix.
+  if (rgb_bit_depth == yuv_bit_depth) {
+    memcpy(&scaled_matrix, yuv_matrix, sizeof(scaled_matrix));
+  } else {
+    int i;
+    for (i = 0; i < 3; ++i) {
+      scaled_matrix.rgb_to_y[i] =
+          (yuv_matrix->rgb_to_y[i] * yuv_max + rgb_round) / rgb_max;
+      scaled_matrix.rgb_to_u[i] =
+          (yuv_matrix->rgb_to_u[i] * yuv_max + rgb_round) / rgb_max;
+      scaled_matrix.rgb_to_v[i] =
+          (yuv_matrix->rgb_to_v[i] * yuv_max + rgb_round) / rgb_max;
+    }
+  }
+  // Also incorporate precision change scaling.
+  scaled_matrix.rgb_to_y[3] = Shift(yuv_matrix->rgb_to_y[3], sfix);
+  scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
+  scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);
+
+  return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride,
+                          rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride,
+                          v_ptr, v_stride, yuv_bit_depth, width, height,
+                          &scaled_matrix, transfer_type);
+}
+
+//------------------------------------------------------------------------------
diff --git a/3rdparty/libwebp/sharpyuv/sharpyuv.h b/3rdparty/libwebp/sharpyuv/sharpyuv.h
new file mode 100644
index 000000000000..fe95891599eb
--- /dev/null
+++ b/3rdparty/libwebp/sharpyuv/sharpyuv.h
@@ -0,0 +1,172 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Sharp RGB to YUV conversion.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_H_
+#define WEBP_SHARPYUV_SHARPYUV_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef SHARPYUV_EXTERN
+#ifdef WEBP_EXTERN
+#define SHARPYUV_EXTERN WEBP_EXTERN
+#else
+// This explicitly marks library functions and allows for changing the
+// signature for e.g., Windows DLL builds.
+#if defined(_WIN32) && defined(WEBP_DLL)
+#define SHARPYUV_EXTERN __declspec(dllexport)
+#elif defined(__GNUC__) && __GNUC__ >= 4
+#define SHARPYUV_EXTERN extern __attribute__((visibility("default")))
+#else
+#define SHARPYUV_EXTERN extern
+#endif /* defined(_WIN32) && defined(WEBP_DLL) */
+#endif /* WEBP_EXTERN */
+#endif /* SHARPYUV_EXTERN */
+
+#ifndef SHARPYUV_INLINE
+#ifdef WEBP_INLINE
+#define SHARPYUV_INLINE WEBP_INLINE
+#else
+#ifndef _MSC_VER
+#if defined(__cplusplus) || !defined(__STRICT_ANSI__) || \
+    (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
+#define SHARPYUV_INLINE inline
+#else
+#define SHARPYUV_INLINE
+#endif
+#else
+#define SHARPYUV_INLINE __forceinline
+#endif /* _MSC_VER */
+#endif /* WEBP_INLINE */
+#endif /* SHARPYUV_INLINE */
+
+// SharpYUV API version following the convention from semver.org
+#define SHARPYUV_VERSION_MAJOR 0
+#define SHARPYUV_VERSION_MINOR 4
+#define SHARPYUV_VERSION_PATCH 0
+// Version as a uint32_t. The major number is the high 8 bits.
+// The minor number is the middle 8 bits. The patch number is the low 16 bits.
+#define SHARPYUV_MAKE_VERSION(MAJOR, MINOR, PATCH) \
+  (((MAJOR) << 24) | ((MINOR) << 16) | (PATCH))
+#define SHARPYUV_VERSION                                                \
+  SHARPYUV_MAKE_VERSION(SHARPYUV_VERSION_MAJOR, SHARPYUV_VERSION_MINOR, \
+                        SHARPYUV_VERSION_PATCH)
+
+// Returns the library's version number, packed in hexadecimal. See
+// SHARPYUV_VERSION.
+SHARPYUV_EXTERN int SharpYuvGetVersion(void);
+
+// RGB to YUV conversion matrix, in 16 bit fixed point.
+// y = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
+// u = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
+// v = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
+// Then y, u and v values are divided by 1<<16 and rounded.
+typedef struct {
+  int rgb_to_y[4];
+  int rgb_to_u[4];
+  int rgb_to_v[4];
+} SharpYuvConversionMatrix;
+
+typedef struct SharpYuvOptions SharpYuvOptions;
+
+// Enums for transfer functions, as defined in H.273,
+// https://www.itu.int/rec/T-REC-H.273-202107-I/en
+typedef enum SharpYuvTransferFunctionType {
+  // 0 is reserved
+  kSharpYuvTransferFunctionBt709 = 1,
+  // 2 is unspecified
+  // 3 is reserved
+  kSharpYuvTransferFunctionBt470M = 4,
+  kSharpYuvTransferFunctionBt470Bg = 5,
+  kSharpYuvTransferFunctionBt601 = 6,
+  kSharpYuvTransferFunctionSmpte240 = 7,
+  kSharpYuvTransferFunctionLinear = 8,
+  kSharpYuvTransferFunctionLog100 = 9,
+  kSharpYuvTransferFunctionLog100_Sqrt10 = 10,
+  kSharpYuvTransferFunctionIec61966 = 11,
+  kSharpYuvTransferFunctionBt1361 = 12,
+  kSharpYuvTransferFunctionSrgb = 13,
+  kSharpYuvTransferFunctionBt2020_10Bit = 14,
+  kSharpYuvTransferFunctionBt2020_12Bit = 15,
+  kSharpYuvTransferFunctionSmpte2084 = 16,  // PQ
+  kSharpYuvTransferFunctionSmpte428 = 17,
+  kSharpYuvTransferFunctionHlg = 18,
+  kSharpYuvTransferFunctionNum
+} SharpYuvTransferFunctionType;
+
+// Converts RGB to YUV420 using a downsampling algorithm that minimizes
+// artefacts caused by chroma subsampling.
+// This is slower than standard downsampling (averaging of 4 UV values).
+// Assumes that the image will be upsampled using a bilinear filter. If nearest
+// neighbor is used instead, the upsampled image might look worse than with
+// standard downsampling.
+// r_ptr, g_ptr, b_ptr: pointers to the source r, g and b channels. Should point
+//     to uint8_t buffers if rgb_bit_depth is 8, or uint16_t buffers otherwise.
+// rgb_step: distance in bytes between two horizontally adjacent pixels on the
+//     r, g and b channels. If rgb_bit_depth is > 8, it should be a
+//     multiple of 2.
+// rgb_stride: distance in bytes between two vertically adjacent pixels on the
+//     r, g, and b channels. If rgb_bit_depth is > 8, it should be a
+//     multiple of 2.
+// rgb_bit_depth: number of bits for each r/g/b value. One of: 8, 10, 12, 16.
+//     Note: 16 bit input is truncated to 14 bits before conversion to yuv.
+// yuv_bit_depth: number of bits for each y/u/v value. One of: 8, 10, 12.
+// y_ptr, u_ptr, v_ptr: pointers to the destination y, u and v channels.  Should
+//     point to uint8_t buffers if yuv_bit_depth is 8, or uint16_t buffers
+//     otherwise.
+// y_stride, u_stride, v_stride: distance in bytes between two vertically
+//     adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they
+//     should be multiples of 2.
+// width, height: width and height of the image in pixels
+// This function calls SharpYuvConvertWithOptions with a default transfer
+// function of kSharpYuvTransferFunctionSrgb.
+SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
+                                    const void* b_ptr, int rgb_step,
+                                    int rgb_stride, int rgb_bit_depth,
+                                    void* y_ptr, int y_stride, void* u_ptr,
+                                    int u_stride, void* v_ptr, int v_stride,
+                                    int yuv_bit_depth, int width, int height,
+                                    const SharpYuvConversionMatrix* yuv_matrix);
+
+struct SharpYuvOptions {
+  // This matrix cannot be NULL and can be initialized by
+  // SharpYuvComputeConversionMatrix.
+  const SharpYuvConversionMatrix* yuv_matrix;
+  SharpYuvTransferFunctionType transfer_type;
+};
+
+// Internal, version-checked, entry point
+SHARPYUV_EXTERN int SharpYuvOptionsInitInternal(const SharpYuvConversionMatrix*,
+                                                SharpYuvOptions*, int);
+
+// Should always be called, to initialize a fresh SharpYuvOptions
+// structure before modification. SharpYuvOptionsInit() must have succeeded
+// before using the 'options' object.
+static SHARPYUV_INLINE int SharpYuvOptionsInit(
+    const SharpYuvConversionMatrix* yuv_matrix, SharpYuvOptions* options) {
+  return SharpYuvOptionsInitInternal(yuv_matrix, options, SHARPYUV_VERSION);
+}
+
+SHARPYUV_EXTERN int SharpYuvConvertWithOptions(
+    const void* r_ptr, const void* g_ptr, const void* b_ptr, int rgb_step,
+    int rgb_stride, int rgb_bit_depth, void* y_ptr, int y_stride, void* u_ptr,
+    int u_stride, void* v_ptr, int v_stride, int yuv_bit_depth, int width,
+    int height, const SharpYuvOptions* options);
+
+// TODO(b/194336375): Add YUV444 to YUV420 conversion. Maybe also add 422
+// support (it's rarely used in practice, especially for images).
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_H_
diff --git a/3rdparty/libwebp/sharpyuv/sharpyuv_cpu.c b/3rdparty/libwebp/sharpyuv/sharpyuv_cpu.c
new file mode 100644
index 000000000000..29425a0c4918
--- /dev/null
+++ b/3rdparty/libwebp/sharpyuv/sharpyuv_cpu.c
@@ -0,0 +1,14 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+#include "sharpyuv/sharpyuv_cpu.h"
+
+// Include src/dsp/cpu.c to create SharpYuvGetCPUInfo from VP8GetCPUInfo. The
+// function pointer is renamed in sharpyuv_cpu.h.
+#include "src/dsp/cpu.c"
diff --git a/3rdparty/libwebp/sharpyuv/sharpyuv_cpu.h b/3rdparty/libwebp/sharpyuv/sharpyuv_cpu.h
new file mode 100644
index 000000000000..176ca3eb1682
--- /dev/null
+++ b/3rdparty/libwebp/sharpyuv/sharpyuv_cpu.h
@@ -0,0 +1,22 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+#ifndef WEBP_SHARPYUV_SHARPYUV_CPU_H_
+#define WEBP_SHARPYUV_SHARPYUV_CPU_H_
+
+#include "sharpyuv/sharpyuv.h"
+
+// Avoid exporting SharpYuvGetCPUInfo in shared object / DLL builds.
+// SharpYuvInit() replaces the use of the function pointer.
+#undef WEBP_EXTERN
+#define WEBP_EXTERN extern
+#define VP8GetCPUInfo SharpYuvGetCPUInfo
+#include "src/dsp/cpu.h"
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_CPU_H_
diff --git a/3rdparty/libwebp/sharpyuv/sharpyuv_csp.c b/3rdparty/libwebp/sharpyuv/sharpyuv_csp.c
new file mode 100644
index 000000000000..0ad22be9458c
--- /dev/null
+++ b/3rdparty/libwebp/sharpyuv/sharpyuv_csp.c
@@ -0,0 +1,110 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Colorspace utilities.
+
+#include "sharpyuv/sharpyuv_csp.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stddef.h>
+
+static int ToFixed16(float f) { return (int)floor(f * (1 << 16) + 0.5f); }
+
+void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
+                                     SharpYuvConversionMatrix* matrix) {
+  const float kr = yuv_color_space->kr;
+  const float kb = yuv_color_space->kb;
+  const float kg = 1.0f - kr - kb;
+  const float cr = 0.5f / (1.0f - kb);
+  const float cb = 0.5f / (1.0f - kr);
+
+  const int shift = yuv_color_space->bit_depth - 8;
+
+  const float denom = (float)((1 << yuv_color_space->bit_depth) - 1);
+  float scale_y = 1.0f;
+  float add_y = 0.0f;
+  float scale_u = cr;
+  float scale_v = cb;
+  float add_uv = (float)(128 << shift);
+  assert(yuv_color_space->bit_depth >= 8);
+
+  if (yuv_color_space->range == kSharpYuvRangeLimited) {
+    scale_y *= (219 << shift) / denom;
+    scale_u *= (224 << shift) / denom;
+    scale_v *= (224 << shift) / denom;
+    add_y = (float)(16 << shift);
+  }
+
+  matrix->rgb_to_y[0] = ToFixed16(kr * scale_y);
+  matrix->rgb_to_y[1] = ToFixed16(kg * scale_y);
+  matrix->rgb_to_y[2] = ToFixed16(kb * scale_y);
+  matrix->rgb_to_y[3] = ToFixed16(add_y);
+
+  matrix->rgb_to_u[0] = ToFixed16(-kr * scale_u);
+  matrix->rgb_to_u[1] = ToFixed16(-kg * scale_u);
+  matrix->rgb_to_u[2] = ToFixed16((1 - kb) * scale_u);
+  matrix->rgb_to_u[3] = ToFixed16(add_uv);
+
+  matrix->rgb_to_v[0] = ToFixed16((1 - kr) * scale_v);
+  matrix->rgb_to_v[1] = ToFixed16(-kg * scale_v);
+  matrix->rgb_to_v[2] = ToFixed16(-kb * scale_v);
+  matrix->rgb_to_v[3] = ToFixed16(add_uv);
+}
+
+// Matrices are in YUV_FIX fixed point precision.
+// WebP's matrix, similar but not identical to kRec601LimitedMatrix.
+static const SharpYuvConversionMatrix kWebpMatrix = {
+  {16839, 33059, 6420, 16 << 16},
+  {-9719, -19081, 28800, 128 << 16},
+  {28800, -24116, -4684, 128 << 16},
+};
+// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeLimited
+static const SharpYuvConversionMatrix kRec601LimitedMatrix = {
+  {16829, 33039, 6416, 16 << 16},
+  {-9714, -19071, 28784, 128 << 16},
+  {28784, -24103, -4681, 128 << 16},
+};
+// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeFull
+static const SharpYuvConversionMatrix kRec601FullMatrix = {
+  {19595, 38470, 7471, 0},
+  {-11058, -21710, 32768, 128 << 16},
+  {32768, -27439, -5329, 128 << 16},
+};
+// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeLimited
+static const SharpYuvConversionMatrix kRec709LimitedMatrix = {
+  {11966, 40254, 4064, 16 << 16},
+  {-6596, -22189, 28784, 128 << 16},
+  {28784, -26145, -2639, 128 << 16},
+};
+// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeFull
+static const SharpYuvConversionMatrix kRec709FullMatrix = {
+  {13933, 46871, 4732, 0},
+  {-7509, -25259, 32768, 128 << 16},
+  {32768, -29763, -3005, 128 << 16},
+};
+
+const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix(
+    SharpYuvMatrixType matrix_type) {
+  switch (matrix_type) {
+    case kSharpYuvMatrixWebp:
+      return &kWebpMatrix;
+    case kSharpYuvMatrixRec601Limited:
+      return &kRec601LimitedMatrix;
+    case kSharpYuvMatrixRec601Full:
+      return &kRec601FullMatrix;
+    case kSharpYuvMatrixRec709Limited:
+      return &kRec709LimitedMatrix;
+    case kSharpYuvMatrixRec709Full:
+      return &kRec709FullMatrix;
+    case kSharpYuvMatrixNum:
+      return NULL;
+  }
+  return NULL;
+}
diff --git a/3rdparty/libwebp/sharpyuv/sharpyuv_csp.h b/3rdparty/libwebp/sharpyuv/sharpyuv_csp.h
new file mode 100644
index 000000000000..3214e3ac6075
--- /dev/null
+++ b/3rdparty/libwebp/sharpyuv/sharpyuv_csp.h
@@ -0,0 +1,60 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Colorspace utilities.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_CSP_H_
+#define WEBP_SHARPYUV_SHARPYUV_CSP_H_
+
+#include "sharpyuv/sharpyuv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Range of YUV values.
+typedef enum {
+  kSharpYuvRangeFull,     // YUV values between [0;255] (for 8 bit)
+  kSharpYuvRangeLimited   // Y in [16;235], YUV in [16;240] (for 8 bit)
+} SharpYuvRange;
+
+// Constants that define a YUV color space.
+typedef struct {
+  // Kr and Kb are defined such that:
+  // Y = Kr * r + Kg * g + Kb * b where Kg = 1 - Kr - Kb.
+  float kr;
+  float kb;
+  int bit_depth;  // 8, 10 or 12
+  SharpYuvRange range;
+} SharpYuvColorSpace;
+
+// Fills in 'matrix' for the given YUVColorSpace.
+SHARPYUV_EXTERN void SharpYuvComputeConversionMatrix(
+    const SharpYuvColorSpace* yuv_color_space,
+    SharpYuvConversionMatrix* matrix);
+
+// Enums for precomputed conversion matrices.
+typedef enum {
+  kSharpYuvMatrixWebp = 0,
+  kSharpYuvMatrixRec601Limited,
+  kSharpYuvMatrixRec601Full,
+  kSharpYuvMatrixRec709Limited,
+  kSharpYuvMatrixRec709Full,
+  kSharpYuvMatrixNum
+} SharpYuvMatrixType;
+
+// Returns a pointer to a matrix for one of the predefined colorspaces.
+SHARPYUV_EXTERN const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix(
+    SharpYuvMatrixType matrix_type);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_CSP_H_
diff --git a/3rdparty/libwebp/sharpyuv/sharpyuv_dsp.c b/3rdparty/libwebp/sharpyuv/sharpyuv_dsp.c
new file mode 100644
index 000000000000..94a40ec68645
--- /dev/null
+++ b/3rdparty/libwebp/sharpyuv/sharpyuv_dsp.c
@@ -0,0 +1,104 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "sharpyuv/sharpyuv_cpu.h"
+#include "src/webp/types.h"
+
+//-----------------------------------------------------------------------------
+
+#if !WEBP_NEON_OMIT_C_CODE
+static uint16_t clip(int v, int max) {
+  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static uint64_t SharpYuvUpdateY_C(const uint16_t* ref, const uint16_t* src,
+                                  uint16_t* dst, int len, int bit_depth) {
+  uint64_t diff = 0;
+  int i;
+  const int max_y = (1 << bit_depth) - 1;
+  for (i = 0; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)dst[i] + diff_y;
+    dst[i] = clip(new_y, max_y);
+    diff += (uint64_t)abs(diff_y);
+  }
+  return diff;
+}
+
+static void SharpYuvUpdateRGB_C(const int16_t* ref, const int16_t* src,
+                                int16_t* dst, int len) {
+  int i;
+  for (i = 0; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYuvFilterRow_C(const int16_t* A, const int16_t* B, int len,
+                                const uint16_t* best_y, uint16_t* out,
+                                int bit_depth) {
+  int i;
+  const int max_y = (1 << bit_depth) - 1;
+  for (i = 0; i < len; ++i, ++A, ++B) {
+    const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
+    const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
+    out[2 * i + 0] = clip(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+//-----------------------------------------------------------------------------
+
+uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref,
+                            uint16_t* dst, int len, int bit_depth);
+void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref, int16_t* dst,
+                          int len);
+void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
+                          const uint16_t* best_y, uint16_t* out, int bit_depth);
+
+extern VP8CPUInfo SharpYuvGetCPUInfo;
+extern void InitSharpYuvSSE2(void);
+extern void InitSharpYuvNEON(void);
+
+void SharpYuvInitDsp(void) {
+#if !WEBP_NEON_OMIT_C_CODE
+  SharpYuvUpdateY = SharpYuvUpdateY_C;
+  SharpYuvUpdateRGB = SharpYuvUpdateRGB_C;
+  SharpYuvFilterRow = SharpYuvFilterRow_C;
+#endif
+
+  if (SharpYuvGetCPUInfo != NULL) {
+#if defined(WEBP_HAVE_SSE2)
+    if (SharpYuvGetCPUInfo(kSSE2)) {
+      InitSharpYuvSSE2();
+    }
+#endif  // WEBP_HAVE_SSE2
+  }
+
+#if defined(WEBP_HAVE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (SharpYuvGetCPUInfo != NULL && SharpYuvGetCPUInfo(kNEON))) {
+    InitSharpYuvNEON();
+  }
+#endif  // WEBP_HAVE_NEON
+
+  assert(SharpYuvUpdateY != NULL);
+  assert(SharpYuvUpdateRGB != NULL);
+  assert(SharpYuvFilterRow != NULL);
+}
diff --git a/3rdparty/libwebp/sharpyuv/sharpyuv_dsp.h b/3rdparty/libwebp/sharpyuv/sharpyuv_dsp.h
new file mode 100644
index 000000000000..805fbadbf657
--- /dev/null
+++ b/3rdparty/libwebp/sharpyuv/sharpyuv_dsp.h
@@ -0,0 +1,28 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_DSP_H_
+#define WEBP_SHARPYUV_SHARPYUV_DSP_H_
+
+#include "sharpyuv/sharpyuv_cpu.h"
+#include "src/webp/types.h"
+
+extern uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref,
+                                   uint16_t* dst, int len, int bit_depth);
+extern void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref,
+                                 int16_t* dst, int len);
+extern void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
+                                 const uint16_t* best_y, uint16_t* out,
+                                 int bit_depth);
+
+void SharpYuvInitDsp(void);
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_DSP_H_
diff --git a/3rdparty/libwebp/sharpyuv/sharpyuv_gamma.c b/3rdparty/libwebp/sharpyuv/sharpyuv_gamma.c
new file mode 100644
index 000000000000..09028428aced
--- /dev/null
+++ b/3rdparty/libwebp/sharpyuv/sharpyuv_gamma.c
@@ -0,0 +1,419 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Gamma correction utilities.
+
+#include "sharpyuv/sharpyuv_gamma.h"
+
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+
+#include "src/webp/types.h"
+
+// Gamma correction compensates loss of resolution during chroma subsampling.
+// Size of pre-computed table for converting from gamma to linear.
+#define GAMMA_TO_LINEAR_TAB_BITS 10
+#define GAMMA_TO_LINEAR_TAB_SIZE (1 << GAMMA_TO_LINEAR_TAB_BITS)
+static uint32_t kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 2];
+#define LINEAR_TO_GAMMA_TAB_BITS 9
+#define LINEAR_TO_GAMMA_TAB_SIZE (1 << LINEAR_TO_GAMMA_TAB_BITS)
+static uint32_t kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 2];
+
+static const double kGammaF = 1. / 0.45;
+#define GAMMA_TO_LINEAR_BITS 16
+
+static volatile int kGammaTablesSOk = 0;
+void SharpYuvInitGammaTables(void) {
+  assert(GAMMA_TO_LINEAR_BITS <= 16);
+  if (!kGammaTablesSOk) {
+    int v;
+    const double a = 0.09929682680944;
+    const double thresh = 0.018053968510807;
+    const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
+    // Precompute gamma to linear table.
+    {
+      const double norm = 1. / GAMMA_TO_LINEAR_TAB_SIZE;
+      const double a_rec = 1. / (1. + a);
+      for (v = 0; v <= GAMMA_TO_LINEAR_TAB_SIZE; ++v) {
+        const double g = norm * v;
+        double value;
+        if (g <= thresh * 4.5) {
+          value = g / 4.5;
+        } else {
+          value = pow(a_rec * (g + a), kGammaF);
+        }
+        kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
+      }
+      // to prevent small rounding errors to cause read-overflow:
+      kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 1] =
+          kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE];
+    }
+    // Precompute linear to gamma table.
+    {
+      const double scale = 1. / LINEAR_TO_GAMMA_TAB_SIZE;
+      for (v = 0; v <= LINEAR_TO_GAMMA_TAB_SIZE; ++v) {
+        const double g = scale * v;
+        double value;
+        if (g <= thresh) {
+          value = 4.5 * g;
+        } else {
+          value = (1. + a) * pow(g, 1. / kGammaF) - a;
+        }
+        kLinearToGammaTabS[v] =
+            (uint32_t)(final_scale * value + 0.5);
+      }
+      // to prevent small rounding errors to cause read-overflow:
+      kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 1] =
+          kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE];
+    }
+    kGammaTablesSOk = 1;
+  }
+}
+
+static WEBP_INLINE int Shift(int v, int shift) {
+  return (shift >= 0) ? (v << shift) : (v >> -shift);
+}
+
+static WEBP_INLINE uint32_t FixedPointInterpolation(int v, uint32_t* tab,
+                                                    int tab_pos_shift_right,
+                                                    int tab_value_shift) {
+  const uint32_t tab_pos = Shift(v, -tab_pos_shift_right);
+  // fractional part, in 'tab_pos_shift' fixed-point precision
+  const uint32_t x = v - (tab_pos << tab_pos_shift_right);  // fractional part
+  // v0 / v1 are in kGammaToLinearBits fixed-point precision (range [0..1])
+  const uint32_t v0 = Shift(tab[tab_pos + 0], tab_value_shift);
+  const uint32_t v1 = Shift(tab[tab_pos + 1], tab_value_shift);
+  // Final interpolation.
+  const uint32_t v2 = (v1 - v0) * x;  // note: v1 >= v0.
+  const int half =
+      (tab_pos_shift_right > 0) ? 1 << (tab_pos_shift_right - 1) : 0;
+  const uint32_t result = v0 + ((v2 + half) >> tab_pos_shift_right);
+  return result;
+}
+
+static uint32_t ToLinearSrgb(uint16_t v, int bit_depth) {
+  const int shift = GAMMA_TO_LINEAR_TAB_BITS - bit_depth;
+  if (shift > 0) {
+    return kGammaToLinearTabS[v << shift];
+  }
+  return FixedPointInterpolation(v, kGammaToLinearTabS, -shift, 0);
+}
+
+static uint16_t FromLinearSrgb(uint32_t value, int bit_depth) {
+  return FixedPointInterpolation(
+      value, kLinearToGammaTabS,
+      (GAMMA_TO_LINEAR_BITS - LINEAR_TO_GAMMA_TAB_BITS),
+      bit_depth - GAMMA_TO_LINEAR_BITS);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#define CLAMP(x, low, high) \
+  (((x) < (low)) ? (low) : (((high) < (x)) ? (high) : (x)))
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+static WEBP_INLINE float Roundf(float x) {
+  if (x < 0)
+    return (float)ceil((double)(x - 0.5f));
+  else
+    return (float)floor((double)(x + 0.5f));
+}
+
+static WEBP_INLINE float Powf(float base, float exp) {
+  return (float)pow((double)base, (double)exp);
+}
+
+static WEBP_INLINE float Log10f(float x) { return (float)log10((double)x); }
+
+static float ToLinear709(float gamma) {
+  if (gamma < 0.f) {
+    return 0.f;
+  } else if (gamma < 4.5f * 0.018053968510807f) {
+    return gamma / 4.5f;
+  } else if (gamma < 1.f) {
+    return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
+  }
+  return 1.f;
+}
+
+static float FromLinear709(float linear) {
+  if (linear < 0.f) {
+    return 0.f;
+  } else if (linear < 0.018053968510807f) {
+    return linear * 4.5f;
+  } else if (linear < 1.f) {
+    return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
+  }
+  return 1.f;
+}
+
+static float ToLinear470M(float gamma) {
+  return Powf(CLAMP(gamma, 0.f, 1.f), 2.2f);
+}
+
+static float FromLinear470M(float linear) {
+  return Powf(CLAMP(linear, 0.f, 1.f), 1.f / 2.2f);
+}
+
+static float ToLinear470Bg(float gamma) {
+  return Powf(CLAMP(gamma, 0.f, 1.f), 2.8f);
+}
+
+static float FromLinear470Bg(float linear) {
+  return Powf(CLAMP(linear, 0.f, 1.f), 1.f / 2.8f);
+}
+
+static float ToLinearSmpte240(float gamma) {
+  if (gamma < 0.f) {
+    return 0.f;
+  } else if (gamma < 4.f * 0.022821585529445f) {
+    return gamma / 4.f;
+  } else if (gamma < 1.f) {
+    return Powf((gamma + 0.111572195921731f) / 1.111572195921731f, 1.f / 0.45f);
+  }
+  return 1.f;
+}
+
+static float FromLinearSmpte240(float linear) {
+  if (linear < 0.f) {
+    return 0.f;
+  } else if (linear < 0.022821585529445f) {
+    return linear * 4.f;
+  } else if (linear < 1.f) {
+    return 1.111572195921731f * Powf(linear, 0.45f) - 0.111572195921731f;
+  }
+  return 1.f;
+}
+
+static float ToLinearLog100(float gamma) {
+  // The function is non-bijective so choose the middle of [0, 0.01].
+  const float mid_interval = 0.01f / 2.f;
+  return (gamma <= 0.0f) ? mid_interval
+                          : Powf(10.0f, 2.f * (MIN(gamma, 1.f) - 1.0f));
+}
+
+static float FromLinearLog100(float linear) {
+  return (linear < 0.01f) ? 0.0f : 1.0f + Log10f(MIN(linear, 1.f)) / 2.0f;
+}
+
+static float ToLinearLog100Sqrt10(float gamma) {
+  // The function is non-bijective so choose the middle of [0, 0.00316227766f[.
+  const float mid_interval = 0.00316227766f / 2.f;
+  return (gamma <= 0.0f) ? mid_interval
+                          : Powf(10.0f, 2.5f * (MIN(gamma, 1.f) - 1.0f));
+}
+
+static float FromLinearLog100Sqrt10(float linear) {
+  return (linear < 0.00316227766f) ? 0.0f
+                                  : 1.0f + Log10f(MIN(linear, 1.f)) / 2.5f;
+}
+
+static float ToLinearIec61966(float gamma) {
+  if (gamma <= -4.5f * 0.018053968510807f) {
+    return Powf((-gamma + 0.09929682680944f) / -1.09929682680944f, 1.f / 0.45f);
+  } else if (gamma < 4.5f * 0.018053968510807f) {
+    return gamma / 4.5f;
+  }
+  return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
+}
+
+static float FromLinearIec61966(float linear) {
+  if (linear <= -0.018053968510807f) {
+    return -1.09929682680944f * Powf(-linear, 0.45f) + 0.09929682680944f;
+  } else if (linear < 0.018053968510807f) {
+    return linear * 4.5f;
+  }
+  return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
+}
+
+static float ToLinearBt1361(float gamma) {
+  if (gamma < -0.25f) {
+    return -0.25f;
+  } else if (gamma < 0.f) {
+    return Powf((gamma - 0.02482420670236f) / -0.27482420670236f, 1.f / 0.45f) /
+           -4.f;
+  } else if (gamma < 4.5f * 0.018053968510807f) {
+    return gamma / 4.5f;
+  } else if (gamma < 1.f) {
+    return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
+  }
+  return 1.f;
+}
+
+static float FromLinearBt1361(float linear) {
+  if (linear < -0.25f) {
+    return -0.25f;
+  } else if (linear < 0.f) {
+    return -0.27482420670236f * Powf(-4.f * linear, 0.45f) + 0.02482420670236f;
+  } else if (linear < 0.018053968510807f) {
+    return linear * 4.5f;
+  } else if (linear < 1.f) {
+    return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
+  }
+  return 1.f;
+}
+
+static float ToLinearPq(float gamma) {
+  if (gamma > 0.f) {
+    const float pow_gamma = Powf(gamma, 32.f / 2523.f);
+    const float num = MAX(pow_gamma - 107.f / 128.f, 0.0f);
+    const float den = MAX(2413.f / 128.f - 2392.f / 128.f * pow_gamma, FLT_MIN);
+    return Powf(num / den, 4096.f / 653.f);
+  }
+  return 0.f;
+}
+
+static float FromLinearPq(float linear) {
+  if (linear > 0.f) {
+    const float pow_linear = Powf(linear, 653.f / 4096.f);
+    const float num = 107.f / 128.f + 2413.f / 128.f * pow_linear;
+    const float den = 1.0f + 2392.f / 128.f * pow_linear;
+    return Powf(num / den, 2523.f / 32.f);
+  }
+  return 0.f;
+}
+
+static float ToLinearSmpte428(float gamma) {
+  return Powf(MAX(gamma, 0.f), 2.6f) / 0.91655527974030934f;
+}
+
+static float FromLinearSmpte428(float linear) {
+  return Powf(0.91655527974030934f * MAX(linear, 0.f), 1.f / 2.6f);
+}
+
+// Conversion in BT.2100 requires RGB info. Simplify to gamma correction here.
+static float ToLinearHlg(float gamma) {
+  if (gamma < 0.f) {
+    return 0.f;
+  } else if (gamma <= 0.5f) {
+    return Powf((gamma * gamma) * (1.f / 3.f), 1.2f);
+  }
+  return Powf((expf((gamma - 0.55991073f) / 0.17883277f) + 0.28466892f) / 12.0f,
+              1.2f);
+}
+
+static float FromLinearHlg(float linear) {
+  linear = Powf(linear, 1.f / 1.2f);
+  if (linear < 0.f) {
+    return 0.f;
+  } else if (linear <= (1.f / 12.f)) {
+    return sqrtf(3.f * linear);
+  }
+  return 0.17883277f * logf(12.f * linear - 0.28466892f) + 0.55991073f;
+}
+
+uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth,
+                               SharpYuvTransferFunctionType transfer_type) {
+  float v_float, linear;
+  if (transfer_type == kSharpYuvTransferFunctionSrgb) {
+    return ToLinearSrgb(v, bit_depth);
+  }
+  v_float = (float)v / ((1 << bit_depth) - 1);
+  switch (transfer_type) {
+    case kSharpYuvTransferFunctionBt709:
+    case kSharpYuvTransferFunctionBt601:
+    case kSharpYuvTransferFunctionBt2020_10Bit:
+    case kSharpYuvTransferFunctionBt2020_12Bit:
+      linear = ToLinear709(v_float);
+      break;
+    case kSharpYuvTransferFunctionBt470M:
+      linear = ToLinear470M(v_float);
+      break;
+    case kSharpYuvTransferFunctionBt470Bg:
+      linear = ToLinear470Bg(v_float);
+      break;
+    case kSharpYuvTransferFunctionSmpte240:
+      linear = ToLinearSmpte240(v_float);
+      break;
+    case kSharpYuvTransferFunctionLinear:
+      return v;
+    case kSharpYuvTransferFunctionLog100:
+      linear = ToLinearLog100(v_float);
+      break;
+    case kSharpYuvTransferFunctionLog100_Sqrt10:
+      linear = ToLinearLog100Sqrt10(v_float);
+      break;
+    case kSharpYuvTransferFunctionIec61966:
+      linear = ToLinearIec61966(v_float);
+      break;
+    case kSharpYuvTransferFunctionBt1361:
+      linear = ToLinearBt1361(v_float);
+      break;
+    case kSharpYuvTransferFunctionSmpte2084:
+      linear = ToLinearPq(v_float);
+      break;
+    case kSharpYuvTransferFunctionSmpte428:
+      linear = ToLinearSmpte428(v_float);
+      break;
+    case kSharpYuvTransferFunctionHlg:
+      linear = ToLinearHlg(v_float);
+      break;
+    default:
+      assert(0);
+      linear = 0;
+      break;
+  }
+  return (uint32_t)Roundf(linear * ((1 << 16) - 1));
+}
+
+uint16_t SharpYuvLinearToGamma(uint32_t v, int bit_depth,
+                               SharpYuvTransferFunctionType transfer_type) {
+  float v_float, linear;
+  if (transfer_type == kSharpYuvTransferFunctionSrgb) {
+    return FromLinearSrgb(v, bit_depth);
+  }
+  v_float = (float)v / ((1 << 16) - 1);
+  switch (transfer_type) {
+    case kSharpYuvTransferFunctionBt709:
+    case kSharpYuvTransferFunctionBt601:
+    case kSharpYuvTransferFunctionBt2020_10Bit:
+    case kSharpYuvTransferFunctionBt2020_12Bit:
+      linear = FromLinear709(v_float);
+      break;
+    case kSharpYuvTransferFunctionBt470M:
+      linear = FromLinear470M(v_float);
+      break;
+    case kSharpYuvTransferFunctionBt470Bg:
+      linear = FromLinear470Bg(v_float);
+      break;
+    case kSharpYuvTransferFunctionSmpte240:
+      linear = FromLinearSmpte240(v_float);
+      break;
+    case kSharpYuvTransferFunctionLinear:
+      return v;
+    case kSharpYuvTransferFunctionLog100:
+      linear = FromLinearLog100(v_float);
+      break;
+    case kSharpYuvTransferFunctionLog100_Sqrt10:
+      linear = FromLinearLog100Sqrt10(v_float);
+      break;
+    case kSharpYuvTransferFunctionIec61966:
+      linear = FromLinearIec61966(v_float);
+      break;
+    case kSharpYuvTransferFunctionBt1361:
+      linear = FromLinearBt1361(v_float);
+      break;
+    case kSharpYuvTransferFunctionSmpte2084:
+      linear = FromLinearPq(v_float);
+      break;
+    case kSharpYuvTransferFunctionSmpte428:
+      linear = FromLinearSmpte428(v_float);
+      break;
+    case kSharpYuvTransferFunctionHlg:
+      linear = FromLinearHlg(v_float);
+      break;
+    default:
+      assert(0);
+      linear = 0;
+      break;
+  }
+  return (uint16_t)Roundf(linear * ((1 << bit_depth) - 1));
+}
diff --git a/3rdparty/libwebp/sharpyuv/sharpyuv_gamma.h b/3rdparty/libwebp/sharpyuv/sharpyuv_gamma.h
new file mode 100644
index 000000000000..b8ba7e98705e
--- /dev/null
+++ b/3rdparty/libwebp/sharpyuv/sharpyuv_gamma.h
@@ -0,0 +1,38 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Gamma correction utilities.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
+#define WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
+
+#include "sharpyuv/sharpyuv.h"
+#include "src/webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Initializes precomputed tables. Must be called once before calling
+// SharpYuvGammaToLinear or SharpYuvLinearToGamma.
+void SharpYuvInitGammaTables(void);
+
+// Converts a 'bit_depth'-bit gamma color value to a 16-bit linear value.
+uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth,
+                               SharpYuvTransferFunctionType transfer_type);
+
+// Converts a 16-bit linear color value to a 'bit_depth'-bit gamma value.
+uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth,
+                               SharpYuvTransferFunctionType transfer_type);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
diff --git a/3rdparty/libwebp/sharpyuv/sharpyuv_neon.c b/3rdparty/libwebp/sharpyuv/sharpyuv_neon.c
new file mode 100644
index 000000000000..5840914865e0
--- /dev/null
+++ b/3rdparty/libwebp/sharpyuv/sharpyuv_neon.c
@@ -0,0 +1,181 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#if defined(WEBP_USE_NEON)
+#include <assert.h>
+#include <stdlib.h>
+#include <arm_neon.h>
+
+static uint16_t clip_NEON(int v, int max) {
+  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static uint64_t SharpYuvUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
+                                     uint16_t* dst, int len, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const int16x8_t zero = vdupq_n_s16(0);
+  const int16x8_t max = vdupq_n_s16(max_y);
+  uint64x2_t sum = vdupq_n_u64(0);
+  uint64_t diff;
+
+  for (i = 0; i + 8 <= len; i += 8) {
+    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
+    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
+    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
+    const int16x8_t D = vsubq_s16(A, B);       // diff_y
+    const int16x8_t F = vaddq_s16(C, D);       // new_y
+    const uint16x8_t H =
+        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
+    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
+    vst1q_u16(dst + i, H);
+    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
+  }
+  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
+  for (; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)(dst[i]) + diff_y;
+    dst[i] = clip_NEON(new_y, max_y);
+    diff += (uint64_t)(abs(diff_y));
+  }
+  return diff;
+}
+
+static void SharpYuvUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
+                                   int16_t* dst, int len) {
+  int i;
+  for (i = 0; i + 8 <= len; i += 8) {
+    const int16x8_t A = vld1q_s16(ref + i);
+    const int16x8_t B = vld1q_s16(src + i);
+    const int16x8_t C = vld1q_s16(dst + i);
+    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
+    const int16x8_t E = vaddq_s16(C, D);   // new_uv
+    vst1q_s16(dst + i, E);
+  }
+  for (; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYuvFilterRow16_NEON(const int16_t* A, const int16_t* B,
+                                     int len, const uint16_t* best_y,
+                                     uint16_t* out, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const int16x8_t max = vdupq_n_s16(max_y);
+  const int16x8_t zero = vdupq_n_s16(0);
+  for (i = 0; i + 8 <= len; i += 8) {
+    const int16x8_t a0 = vld1q_s16(A + i + 0);
+    const int16x8_t a1 = vld1q_s16(A + i + 1);
+    const int16x8_t b0 = vld1q_s16(B + i + 0);
+    const int16x8_t b1 = vld1q_s16(B + i + 1);
+    const int16x8_t a0b1 = vaddq_s16(a0, b1);
+    const int16x8_t a1b0 = vaddq_s16(a1, b0);
+    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
+    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
+    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
+    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
+    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
+    const int16x8_t e0 = vrhaddq_s16(c1, a0);
+    const int16x8_t e1 = vrhaddq_s16(c0, a1);
+    const int16x8x2_t f = vzipq_s16(e0, e1);
+    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
+    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
+    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
+    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
+    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
+    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
+    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
+    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
+  }
+  for (; i < len; ++i) {
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+
+static void SharpYuvFilterRow32_NEON(const int16_t* A, const int16_t* B,
+                                     int len, const uint16_t* best_y,
+                                     uint16_t* out, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const uint16x8_t max = vdupq_n_u16(max_y);
+  for (i = 0; i + 4 <= len; i += 4) {
+    const int16x4_t a0 = vld1_s16(A + i + 0);
+    const int16x4_t a1 = vld1_s16(A + i + 1);
+    const int16x4_t b0 = vld1_s16(B + i + 0);
+    const int16x4_t b1 = vld1_s16(B + i + 1);
+    const int32x4_t a0b1 = vaddl_s16(a0, b1);
+    const int32x4_t a1b0 = vaddl_s16(a1, b0);
+    const int32x4_t a0a1b0b1 = vaddq_s32(a0b1, a1b0);  // A0+A1+B0+B1
+    const int32x4_t a0b1_2 = vaddq_s32(a0b1, a0b1);    // 2*(A0+B1)
+    const int32x4_t a1b0_2 = vaddq_s32(a1b0, a1b0);    // 2*(A1+B0)
+    const int32x4_t c0 = vshrq_n_s32(vaddq_s32(a0b1_2, a0a1b0b1), 3);
+    const int32x4_t c1 = vshrq_n_s32(vaddq_s32(a1b0_2, a0a1b0b1), 3);
+    const int32x4_t e0 = vrhaddq_s32(c1, vmovl_s16(a0));
+    const int32x4_t e1 = vrhaddq_s32(c0, vmovl_s16(a1));
+    const int32x4x2_t f = vzipq_s32(e0, e1);
+
+    const int16x8_t g = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i));
+    const int32x4_t h0 = vaddw_s16(f.val[0], vget_low_s16(g));
+    const int32x4_t h1 = vaddw_s16(f.val[1], vget_high_s16(g));
+    const uint16x8_t i_16 = vcombine_u16(vqmovun_s32(h0), vqmovun_s32(h1));
+    const uint16x8_t i_clamped = vminq_u16(i_16, max);
+    vst1q_u16(out + 2 * i + 0, i_clamped);
+  }
+  for (; i < len; ++i) {
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+
+static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
+                                   const uint16_t* best_y, uint16_t* out,
+                                   int bit_depth) {
+  if (bit_depth <= 10) {
+    SharpYuvFilterRow16_NEON(A, B, len, best_y, out, bit_depth);
+  } else {
+    SharpYuvFilterRow32_NEON(A, B, len, best_y, out, bit_depth);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+extern void InitSharpYuvNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) {
+  SharpYuvUpdateY = SharpYuvUpdateY_NEON;
+  SharpYuvUpdateRGB = SharpYuvUpdateRGB_NEON;
+  SharpYuvFilterRow = SharpYuvFilterRow_NEON;
+}
+
+#else  // !WEBP_USE_NEON
+
+extern void InitSharpYuvNEON(void);
+
+void InitSharpYuvNEON(void) {}
+
+#endif  // WEBP_USE_NEON
diff --git a/3rdparty/libwebp/sharpyuv/sharpyuv_sse2.c b/3rdparty/libwebp/sharpyuv/sharpyuv_sse2.c
new file mode 100644
index 000000000000..9744d1bb6cfe
--- /dev/null
+++ b/3rdparty/libwebp/sharpyuv/sharpyuv_sse2.c
@@ -0,0 +1,201 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <stdlib.h>
+#include <emmintrin.h>
+
+static uint16_t clip_SSE2(int v, int max) {
+  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static uint64_t SharpYuvUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
+                                     uint16_t* dst, int len, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  uint64_t diff = 0;
+  uint32_t tmp[4];
+  int i;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i max = _mm_set1_epi16(max_y);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i sum = zero;
+
+  for (i = 0; i + 8 <= len; i += 8) {
+    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+    const __m128i D = _mm_sub_epi16(A, B);       // diff_y
+    const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
+    const __m128i F = _mm_add_epi16(C, D);       // new_y
+    const __m128i G = _mm_or_si128(E, one);      // -1 or 1
+    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
+    const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
+    _mm_storeu_si128((__m128i*)(dst + i), H);
+    sum = _mm_add_epi32(sum, I);
+  }
+  _mm_storeu_si128((__m128i*)tmp, sum);
+  diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+  for (; i < len; ++i) {
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)dst[i] + diff_y;
+    dst[i] = clip_SSE2(new_y, max_y);
+    diff += (uint64_t)abs(diff_y);
+  }
+  return diff;
+}
+
+static void SharpYuvUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
+                                   int16_t* dst, int len) {
+  int i = 0;
+  for (i = 0; i + 8 <= len; i += 8) {
+    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
+    const __m128i E = _mm_add_epi16(C, D);   // new_uv
+    _mm_storeu_si128((__m128i*)(dst + i), E);
+  }
+  for (; i < len; ++i) {
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYuvFilterRow16_SSE2(const int16_t* A, const int16_t* B,
+                                     int len, const uint16_t* best_y,
+                                     uint16_t* out, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const __m128i kCst8 = _mm_set1_epi16(8);
+  const __m128i max = _mm_set1_epi16(max_y);
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i + 8 <= len; i += 8) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
+    const __m128i a0b1 = _mm_add_epi16(a0, b1);
+    const __m128i a1b0 = _mm_add_epi16(a1, b0);
+    const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
+    const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
+    const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
+    const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
+    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
+    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
+    const __m128i d0 = _mm_add_epi16(c1, a0);
+    const __m128i d1 = _mm_add_epi16(c0, a1);
+    const __m128i e0 = _mm_srai_epi16(d0, 1);
+    const __m128i e1 = _mm_srai_epi16(d1, 1);
+    const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
+    const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
+    const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
+    const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
+    const __m128i h0 = _mm_add_epi16(g0, f0);
+    const __m128i h1 = _mm_add_epi16(g1, f1);
+    const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
+    const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
+  }
+  for (; i < len; ++i) {
+    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
+    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
+    // We reuse the common sub-expressions.
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+
+static WEBP_INLINE __m128i s16_to_s32(__m128i in) {
+  return _mm_srai_epi32(_mm_unpacklo_epi16(in, in), 16);
+}
+
+static void SharpYuvFilterRow32_SSE2(const int16_t* A, const int16_t* B,
+                                     int len, const uint16_t* best_y,
+                                     uint16_t* out, int bit_depth) {
+  const int max_y = (1 << bit_depth) - 1;
+  int i;
+  const __m128i kCst8 = _mm_set1_epi32(8);
+  const __m128i max = _mm_set1_epi16(max_y);
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i + 4 <= len; i += 4) {
+    const __m128i a0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 0)));
+    const __m128i a1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 1)));
+    const __m128i b0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 0)));
+    const __m128i b1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 1)));
+    const __m128i a0b1 = _mm_add_epi32(a0, b1);
+    const __m128i a1b0 = _mm_add_epi32(a1, b0);
+    const __m128i a0a1b0b1 = _mm_add_epi32(a0b1, a1b0);  // A0+A1+B0+B1
+    const __m128i a0a1b0b1_8 = _mm_add_epi32(a0a1b0b1, kCst8);
+    const __m128i a0b1_2 = _mm_add_epi32(a0b1, a0b1);  // 2*(A0+B1)
+    const __m128i a1b0_2 = _mm_add_epi32(a1b0, a1b0);  // 2*(A1+B0)
+    const __m128i c0 = _mm_srai_epi32(_mm_add_epi32(a0b1_2, a0a1b0b1_8), 3);
+    const __m128i c1 = _mm_srai_epi32(_mm_add_epi32(a1b0_2, a0a1b0b1_8), 3);
+    const __m128i d0 = _mm_add_epi32(c1, a0);
+    const __m128i d1 = _mm_add_epi32(c0, a1);
+    const __m128i e0 = _mm_srai_epi32(d0, 1);
+    const __m128i e1 = _mm_srai_epi32(d1, 1);
+    const __m128i f0 = _mm_unpacklo_epi32(e0, e1);
+    const __m128i f1 = _mm_unpackhi_epi32(e0, e1);
+    const __m128i g = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
+    const __m128i h_16 = _mm_add_epi16(g, _mm_packs_epi32(f0, f1));
+    const __m128i final = _mm_max_epi16(_mm_min_epi16(h_16, max), zero);
+    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), final);
+  }
+  for (; i < len; ++i) {
+    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
+    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
+    // We reuse the common sub-expressions.
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
+    out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
+  }
+}
+
+static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
+                                   const uint16_t* best_y, uint16_t* out,
+                                   int bit_depth) {
+  if (bit_depth <= 10) {
+    SharpYuvFilterRow16_SSE2(A, B, len, best_y, out, bit_depth);
+  } else {
+    SharpYuvFilterRow32_SSE2(A, B, len, best_y, out, bit_depth);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+extern void InitSharpYuvSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvSSE2(void) {
+  SharpYuvUpdateY = SharpYuvUpdateY_SSE2;
+  SharpYuvUpdateRGB = SharpYuvUpdateRGB_SSE2;
+  SharpYuvFilterRow = SharpYuvFilterRow_SSE2;
+}
+#else  // !WEBP_USE_SSE2
+
+extern void InitSharpYuvSSE2(void);
+
+void InitSharpYuvSSE2(void) {}
+
+#endif  // WEBP_USE_SSE2
diff --git a/3rdparty/libwebp/src/dec/alpha_dec.c b/3rdparty/libwebp/src/dec/alpha_dec.c
index bce735bfc248..b6c874fb84dc 100644
--- a/3rdparty/libwebp/src/dec/alpha_dec.c
+++ b/3rdparty/libwebp/src/dec/alpha_dec.c
@@ -13,18 +13,20 @@
 
 #include <stdlib.h>
 #include "src/dec/alphai_dec.h"
+#include "src/dec/vp8_dec.h"
 #include "src/dec/vp8i_dec.h"
 #include "src/dec/vp8li_dec.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/quant_levels_dec_utils.h"
 #include "src/utils/utils.h"
 #include "src/webp/format_constants.h"
+#include "src/webp/types.h"
 
 //------------------------------------------------------------------------------
 // ALPHDecoder object.
 
 // Allocates a new alpha decoder instance.
-static ALPHDecoder* ALPHNew(void) {
+WEBP_NODISCARD static ALPHDecoder* ALPHNew(void) {
   ALPHDecoder* const dec = (ALPHDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
   return dec;
 }
@@ -45,9 +47,9 @@ static void ALPHDelete(ALPHDecoder* const dec) {
 // header for alpha data stored using lossless compression.
 // Returns false in case of error in alpha header (data too short, invalid
 // compression method or filter, error in lossless header data etc).
-static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
-                    size_t data_size, const VP8Io* const src_io,
-                    uint8_t* output) {
+WEBP_NODISCARD static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
+                                   size_t data_size, const VP8Io* const src_io,
+                                   uint8_t* output) {
   int ok = 0;
   const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
   const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
@@ -79,7 +81,9 @@ static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
   }
 
   // Copy the necessary parameters from src_io to io
-  VP8InitIo(io);
+  if (!VP8InitIo(io)) {
+    return 0;
+  }
   WebPInitCustomIo(NULL, io);
   io->opaque = dec;
   io->width = src_io->width;
@@ -107,7 +111,8 @@ static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
 // starting from row number 'row'. It assumes that rows up to (row - 1) have
 // already been decoded.
 // Returns false in case of bitstream error.
-static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
+WEBP_NODISCARD static int ALPHDecode(VP8Decoder* const dec, int row,
+                                     int num_rows) {
   ALPHDecoder* const alph_dec = dec->alph_dec_;
   const int width = alph_dec->width_;
   const int height = alph_dec->io_.crop_bottom;
@@ -117,21 +122,12 @@ static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
     const uint8_t* deltas = dec->alpha_data_ + ALPHA_HEADER_LEN + row * width;
     uint8_t* dst = dec->alpha_plane_ + row * width;
     assert(deltas <= &dec->alpha_data_[dec->alpha_data_size_]);
-    if (alph_dec->filter_ != WEBP_FILTER_NONE) {
-      assert(WebPUnfilters[alph_dec->filter_] != NULL);
-      for (y = 0; y < num_rows; ++y) {
-        WebPUnfilters[alph_dec->filter_](prev_line, deltas, dst, width);
-        prev_line = dst;
-        dst += width;
-        deltas += width;
-      }
-    } else {
-      for (y = 0; y < num_rows; ++y) {
-        memcpy(dst, deltas, width * sizeof(*dst));
-        prev_line = dst;
-        dst += width;
-        deltas += width;
-      }
+    assert(WebPUnfilters[alph_dec->filter_] != NULL);
+    for (y = 0; y < num_rows; ++y) {
+      WebPUnfilters[alph_dec->filter_](prev_line, deltas, dst, width);
+      prev_line = dst;
+      dst += width;
+      deltas += width;
     }
     dec->alpha_prev_line_ = prev_line;
   } else {  // alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION
@@ -147,7 +143,8 @@ static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
   return 1;
 }
 
-static int AllocateAlphaPlane(VP8Decoder* const dec, const VP8Io* const io) {
+WEBP_NODISCARD static int AllocateAlphaPlane(VP8Decoder* const dec,
+                                             const VP8Io* const io) {
   const int stride = io->width;
   const int height = io->crop_bottom;
   const uint64_t alpha_size = (uint64_t)stride * height;
@@ -155,7 +152,8 @@ static int AllocateAlphaPlane(VP8Decoder* const dec, const VP8Io* const io) {
   dec->alpha_plane_mem_ =
       (uint8_t*)WebPSafeMalloc(alpha_size, sizeof(*dec->alpha_plane_));
   if (dec->alpha_plane_mem_ == NULL) {
-    return 0;
+    return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
+                       "Alpha decoder initialization failed.");
   }
   dec->alpha_plane_ = dec->alpha_plane_mem_;
   dec->alpha_prev_line_ = NULL;
@@ -174,25 +172,34 @@ void WebPDeallocateAlphaMemory(VP8Decoder* const dec) {
 //------------------------------------------------------------------------------
 // Main entry point.
 
-const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
-                                      const VP8Io* const io,
-                                      int row, int num_rows) {
+WEBP_NODISCARD const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
+                                                     const VP8Io* const io,
+                                                     int row, int num_rows) {
   const int width = io->width;
   const int height = io->crop_bottom;
 
   assert(dec != NULL && io != NULL);
 
   if (row < 0 || num_rows <= 0 || row + num_rows > height) {
-    return NULL;    // sanity check.
+    return NULL;
   }
 
   if (!dec->is_alpha_decoded_) {
     if (dec->alph_dec_ == NULL) {    // Initialize decoder.
       dec->alph_dec_ = ALPHNew();
-      if (dec->alph_dec_ == NULL) return NULL;
+      if (dec->alph_dec_ == NULL) {
+        VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
+                    "Alpha decoder initialization failed.");
+        return NULL;
+      }
       if (!AllocateAlphaPlane(dec, io)) goto Error;
       if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_,
                     io, dec->alpha_plane_)) {
+        VP8LDecoder* const vp8l_dec = dec->alph_dec_->vp8l_dec_;
+        VP8SetError(dec,
+                    (vp8l_dec == NULL) ? VP8_STATUS_OUT_OF_MEMORY
+                                       : vp8l_dec->status_,
+                    "Alpha decoder initialization failed.");
         goto Error;
       }
       // if we allowed use of alpha dithering, check whether it's needed at all
diff --git a/3rdparty/libwebp/src/dec/buffer_dec.c b/3rdparty/libwebp/src/dec/buffer_dec.c
index 3cd94eb4d930..11ce76f19e2b 100644
--- a/3rdparty/libwebp/src/dec/buffer_dec.c
+++ b/3rdparty/libwebp/src/dec/buffer_dec.c
@@ -75,7 +75,7 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
     const WebPRGBABuffer* const buf = &buffer->u.RGBA;
     const int stride = abs(buf->stride);
     const uint64_t size =
-        MIN_BUFFER_SIZE(width * kModeBpp[mode], height, stride);
+        MIN_BUFFER_SIZE((uint64_t)width * kModeBpp[mode], height, stride);
     ok &= (size <= buf->size);
     ok &= (stride >= width * kModeBpp[mode]);
     ok &= (buf->rgba != NULL);
@@ -102,7 +102,7 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
     int stride;
     uint64_t size;
 
-    if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) {
+    if ((uint64_t)w * kModeBpp[mode] >= (1ull << 31)) {
       return VP8_STATUS_INVALID_PARAM;
     }
     stride = w * kModeBpp[mode];
@@ -117,7 +117,6 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
     }
     total_size = size + 2 * uv_size + a_size;
 
-    // Security/sanity checks
     output = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*output));
     if (output == NULL) {
       return VP8_STATUS_OUT_OF_MEMORY;
@@ -156,11 +155,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
   }
   if (WebPIsRGBMode(buffer->colorspace)) {
     WebPRGBABuffer* const buf = &buffer->u.RGBA;
-    buf->rgba += (buffer->height - 1) * buf->stride;
+    buf->rgba += (int64_t)(buffer->height - 1) * buf->stride;
     buf->stride = -buf->stride;
   } else {
     WebPYUVABuffer* const buf = &buffer->u.YUVA;
-    const int H = buffer->height;
+    const int64_t H = buffer->height;
     buf->y += (H - 1) * buf->y_stride;
     buf->y_stride = -buf->y_stride;
     buf->u += ((H - 1) >> 1) * buf->u_stride;
@@ -188,8 +187,7 @@ VP8StatusCode WebPAllocateDecBuffer(int width, int height,
       const int ch = options->crop_height;
       const int x = options->crop_left & ~1;
       const int y = options->crop_top & ~1;
-      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
-          x + cw > width || y + ch > height) {
+      if (!WebPCheckCropDimensions(width, height, x, y, cw, ch)) {
         return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
       }
       width = cw;
diff --git a/3rdparty/libwebp/src/dec/frame_dec.c b/3rdparty/libwebp/src/dec/frame_dec.c
index 04609a8e56be..91ca1f8609a9 100644
--- a/3rdparty/libwebp/src/dec/frame_dec.c
+++ b/3rdparty/libwebp/src/dec/frame_dec.c
@@ -705,7 +705,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
                         + cache_size + alpha_size + WEBP_ALIGN_CST;
   uint8_t* mem;
 
-  if (needed != (size_t)needed) return 0;  // check for overflow
+  if (!CheckSizeOverflow(needed)) return 0;  // check for overflow
   if (needed > dec->mem_size_) {
     WebPSafeFree(dec->mem_);
     dec->mem_size_ = 0;
diff --git a/3rdparty/libwebp/src/dec/idec_dec.c b/3rdparty/libwebp/src/dec/idec_dec.c
index 9035df5659da..ad042a1ffcbe 100644
--- a/3rdparty/libwebp/src/dec/idec_dec.c
+++ b/3rdparty/libwebp/src/dec/idec_dec.c
@@ -17,8 +17,10 @@
 
 #include "src/dec/alphai_dec.h"
 #include "src/dec/webpi_dec.h"
+#include "src/dec/vp8_dec.h"
 #include "src/dec/vp8i_dec.h"
 #include "src/utils/utils.h"
+#include "src/webp/decode.h"
 
 // In append mode, buffer allocations increase as multiples of this value.
 // Needs to be a power of 2.
@@ -161,8 +163,9 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
 
 // Appends data to the end of MemBuffer->buf_. It expands the allocated memory
 // size if required and also updates VP8BitReader's if new memory is allocated.
-static int AppendToMemBuffer(WebPIDecoder* const idec,
-                             const uint8_t* const data, size_t data_size) {
+WEBP_NODISCARD static int AppendToMemBuffer(WebPIDecoder* const idec,
+                                            const uint8_t* const data,
+                                            size_t data_size) {
   VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
   MemBuffer* const mem = &idec->mem_;
   const int need_compressed_alpha = NeedCompressedAlpha(idec);
@@ -203,8 +206,9 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
   return 1;
 }
 
-static int RemapMemBuffer(WebPIDecoder* const idec,
-                          const uint8_t* const data, size_t data_size) {
+WEBP_NODISCARD static int RemapMemBuffer(WebPIDecoder* const idec,
+                                         const uint8_t* const data,
+                                         size_t data_size) {
   MemBuffer* const mem = &idec->mem_;
   const uint8_t* const old_buf = mem->buf_;
   const uint8_t* const old_start =
@@ -237,7 +241,8 @@ static void ClearMemBuffer(MemBuffer* const mem) {
   }
 }
 
-static int CheckMemBufferMode(MemBuffer* const mem, MemBufferMode expected) {
+WEBP_NODISCARD static int CheckMemBufferMode(MemBuffer* const mem,
+                                             MemBufferMode expected) {
   if (mem->mode_ == MEM_MODE_NONE) {
     mem->mode_ = expected;    // switch to the expected mode
   } else if (mem->mode_ != expected) {
@@ -248,7 +253,7 @@ static int CheckMemBufferMode(MemBuffer* const mem, MemBufferMode expected) {
 }
 
 // To be called last.
-static VP8StatusCode FinishDecoding(WebPIDecoder* const idec) {
+WEBP_NODISCARD static VP8StatusCode FinishDecoding(WebPIDecoder* const idec) {
   const WebPDecoderOptions* const options = idec->params_.options;
   WebPDecBuffer* const output = idec->params_.output;
 
@@ -258,8 +263,10 @@ static VP8StatusCode FinishDecoding(WebPIDecoder* const idec) {
     if (status != VP8_STATUS_OK) return status;
   }
   if (idec->final_output_ != NULL) {
-    WebPCopyDecBufferPixels(output, idec->final_output_);  // do the slow-copy
+    const VP8StatusCode status = WebPCopyDecBufferPixels(
+        output, idec->final_output_);  // do the slow-copy
     WebPFreeDecBuffer(&idec->output_);
+    if (status != VP8_STATUS_OK) return status;
     *output = *idec->final_output_;
     idec->final_output_ = NULL;
   }
@@ -288,7 +295,7 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec,
 static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
   if (idec->state_ == STATE_VP8_DATA) {
     // Synchronize the thread, clean-up and check for errors.
-    VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
+    (void)VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
   }
   idec->state_ = STATE_ERROR;
   return error;
@@ -329,6 +336,7 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
     if (dec == NULL) {
       return VP8_STATUS_OUT_OF_MEMORY;
     }
+    dec->incremental_ = 1;
     idec->dec_ = dec;
     dec->alpha_data_ = headers.alpha_data;
     dec->alpha_data_size_ = headers.alpha_data_size;
@@ -601,8 +609,9 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
 //------------------------------------------------------------------------------
 // Internal constructor
 
-static WebPIDecoder* NewDecoder(WebPDecBuffer* const output_buffer,
-                                const WebPBitstreamFeatures* const features) {
+WEBP_NODISCARD static WebPIDecoder* NewDecoder(
+    WebPDecBuffer* const output_buffer,
+    const WebPBitstreamFeatures* const features) {
   WebPIDecoder* idec = (WebPIDecoder*)WebPSafeCalloc(1ULL, sizeof(*idec));
   if (idec == NULL) {
     return NULL;
@@ -614,8 +623,10 @@ static WebPIDecoder* NewDecoder(WebPDecBuffer* const output_buffer,
   idec->last_mb_y_ = -1;
 
   InitMemBuffer(&idec->mem_);
-  WebPInitDecBuffer(&idec->output_);
-  VP8InitIo(&idec->io_);
+  if (!WebPInitDecBuffer(&idec->output_) || !VP8InitIo(&idec->io_)) {
+    WebPSafeFree(idec);
+    return NULL;
+  }
 
   WebPResetDecParams(&idec->params_);
   if (output_buffer == NULL || WebPAvoidSlowMemory(output_buffer, features)) {
@@ -674,7 +685,8 @@ void WebPIDelete(WebPIDecoder* idec) {
     if (!idec->is_lossless_) {
       if (idec->state_ == STATE_VP8_DATA) {
         // Synchronize the thread, clean-up and check for errors.
-        VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
+        // TODO(vrabaud) do we care about the return result?
+        (void)VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
       }
       VP8Delete((VP8Decoder*)idec->dec_);
     } else {
@@ -851,8 +863,8 @@ const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* idec,
   return src;
 }
 
-uint8_t* WebPIDecGetRGB(const WebPIDecoder* idec, int* last_y,
-                        int* width, int* height, int* stride) {
+WEBP_NODISCARD uint8_t* WebPIDecGetRGB(const WebPIDecoder* idec, int* last_y,
+                                       int* width, int* height, int* stride) {
   const WebPDecBuffer* const src = GetOutputBuffer(idec);
   if (src == NULL) return NULL;
   if (src->colorspace >= MODE_YUV) {
@@ -867,10 +879,10 @@ uint8_t* WebPIDecGetRGB(const WebPIDecoder* idec, int* last_y,
   return src->u.RGBA.rgba;
 }
 
-uint8_t* WebPIDecGetYUVA(const WebPIDecoder* idec, int* last_y,
-                         uint8_t** u, uint8_t** v, uint8_t** a,
-                         int* width, int* height,
-                         int* stride, int* uv_stride, int* a_stride) {
+WEBP_NODISCARD uint8_t* WebPIDecGetYUVA(const WebPIDecoder* idec, int* last_y,
+                                        uint8_t** u, uint8_t** v, uint8_t** a,
+                                        int* width, int* height, int* stride,
+                                        int* uv_stride, int* a_stride) {
   const WebPDecBuffer* const src = GetOutputBuffer(idec);
   if (src == NULL) return NULL;
   if (src->colorspace < MODE_YUV) {
diff --git a/3rdparty/libwebp/src/dec/io_dec.c b/3rdparty/libwebp/src/dec/io_dec.c
index 29dc6345dfd1..5ef6298886eb 100644
--- a/3rdparty/libwebp/src/dec/io_dec.c
+++ b/3rdparty/libwebp/src/dec/io_dec.c
@@ -298,46 +298,57 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
   const int uv_out_height = (out_height + 1) >> 1;
   const int uv_in_width  = (io->mb_w + 1) >> 1;
   const int uv_in_height = (io->mb_h + 1) >> 1;
-  const size_t work_size = 2 * out_width;   // scratch memory for luma rescaler
+  // scratch memory for luma rescaler
+  const size_t work_size = 2 * (size_t)out_width;
   const size_t uv_work_size = 2 * uv_out_width;  // and for each u/v ones
-  size_t tmp_size, rescaler_size;
+  uint64_t total_size;
+  size_t rescaler_size;
   rescaler_t* work;
   WebPRescaler* scalers;
   const int num_rescalers = has_alpha ? 4 : 3;
 
-  tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
+  total_size = ((uint64_t)work_size + 2 * uv_work_size) * sizeof(*work);
   if (has_alpha) {
-    tmp_size += work_size * sizeof(*work);
+    total_size += (uint64_t)work_size * sizeof(*work);
   }
   rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+  total_size += rescaler_size;
+  if (!CheckSizeOverflow(total_size)) {
+    return 0;
+  }
 
-  p->memory = WebPSafeMalloc(1ULL, tmp_size + rescaler_size);
+  p->memory = WebPSafeMalloc(1ULL, (size_t)total_size);
   if (p->memory == NULL) {
     return 0;   // memory error
   }
   work = (rescaler_t*)p->memory;
 
-  scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + tmp_size);
+  scalers = (WebPRescaler*)WEBP_ALIGN(
+      (const uint8_t*)work + total_size - rescaler_size);
   p->scaler_y = &scalers[0];
   p->scaler_u = &scalers[1];
   p->scaler_v = &scalers[2];
   p->scaler_a = has_alpha ? &scalers[3] : NULL;
 
-  WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
-                   buf->y, out_width, out_height, buf->y_stride, 1,
-                   work);
-  WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
-                   buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
-                   work + work_size);
-  WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
-                   buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
-                   work + work_size + uv_work_size);
+  if (!WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
+                        buf->y, out_width, out_height, buf->y_stride, 1,
+                        work) ||
+      !WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
+                        buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
+                        work + work_size) ||
+      !WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
+                        buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
+                        work + work_size + uv_work_size)) {
+    return 0;
+  }
   p->emit = EmitRescaledYUV;
 
   if (has_alpha) {
-    WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
-                     buf->a, out_width, out_height, buf->a_stride, 1,
-                     work + work_size + 2 * uv_work_size);
+    if (!WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
+                          buf->a, out_width, out_height, buf->a_stride, 1,
+                          work + work_size + 2 * uv_work_size)) {
+      return 0;
+    }
     p->emit_alpha = EmitRescaledAlphaYUV;
     WebPInitAlphaProcessing();
   }
@@ -480,51 +491,58 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
   const int out_height = io->scaled_height;
   const int uv_in_width  = (io->mb_w + 1) >> 1;
   const int uv_in_height = (io->mb_h + 1) >> 1;
-  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
+  // scratch memory for one rescaler
+  const size_t work_size = 2 * (size_t)out_width;
   rescaler_t* work;  // rescalers work area
   uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
-  size_t tmp_size1, tmp_size2, total_size, rescaler_size;
+  uint64_t tmp_size1, tmp_size2, total_size;
+  size_t rescaler_size;
   WebPRescaler* scalers;
   const int num_rescalers = has_alpha ? 4 : 3;
 
-  tmp_size1 = 3 * work_size;
-  tmp_size2 = 3 * out_width;
-  if (has_alpha) {
-    tmp_size1 += work_size;
-    tmp_size2 += out_width;
-  }
+  tmp_size1 = (uint64_t)num_rescalers * work_size;
+  tmp_size2 = (uint64_t)num_rescalers * out_width;
   total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
   rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+  total_size += rescaler_size;
+  if (!CheckSizeOverflow(total_size)) {
+    return 0;
+  }
 
-  p->memory = WebPSafeMalloc(1ULL, total_size + rescaler_size);
+  p->memory = WebPSafeMalloc(1ULL, (size_t)total_size);
   if (p->memory == NULL) {
     return 0;   // memory error
   }
   work = (rescaler_t*)p->memory;
   tmp = (uint8_t*)(work + tmp_size1);
 
-  scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + total_size);
+  scalers = (WebPRescaler*)WEBP_ALIGN(
+      (const uint8_t*)work + total_size - rescaler_size);
   p->scaler_y = &scalers[0];
   p->scaler_u = &scalers[1];
   p->scaler_v = &scalers[2];
   p->scaler_a = has_alpha ? &scalers[3] : NULL;
 
-  WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
-                   tmp + 0 * out_width, out_width, out_height, 0, 1,
-                   work + 0 * work_size);
-  WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
-                   tmp + 1 * out_width, out_width, out_height, 0, 1,
-                   work + 1 * work_size);
-  WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
-                   tmp + 2 * out_width, out_width, out_height, 0, 1,
-                   work + 2 * work_size);
+  if (!WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
+                        tmp + 0 * out_width, out_width, out_height, 0, 1,
+                        work + 0 * work_size) ||
+      !WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
+                        tmp + 1 * out_width, out_width, out_height, 0, 1,
+                        work + 1 * work_size) ||
+      !WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
+                        tmp + 2 * out_width, out_width, out_height, 0, 1,
+                        work + 2 * work_size)) {
+    return 0;
+  }
   p->emit = EmitRescaledRGB;
   WebPInitYUV444Converters();
 
   if (has_alpha) {
-    WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
-                     tmp + 3 * out_width, out_width, out_height, 0, 1,
-                     work + 3 * work_size);
+    if (!WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
+                          tmp + 3 * out_width, out_width, out_height, 0, 1,
+                          work + 3 * work_size)) {
+      return 0;
+    }
     p->emit_alpha = EmitRescaledAlphaRGB;
     if (p->output->colorspace == MODE_RGBA_4444 ||
         p->output->colorspace == MODE_rgbA_4444) {
diff --git a/3rdparty/libwebp/src/dec/tree_dec.c b/3rdparty/libwebp/src/dec/tree_dec.c
index 1c6fdea27cc6..243460595329 100644
--- a/3rdparty/libwebp/src/dec/tree_dec.c
+++ b/3rdparty/libwebp/src/dec/tree_dec.c
@@ -12,10 +12,11 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include "src/dec/vp8i_dec.h"
+#include "src/dsp/cpu.h"
 #include "src/utils/bit_reader_inl_utils.h"
 
 #if !defined(USE_GENERIC_TREE)
-#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
+#if !defined(__arm__) && !defined(_M_ARM) && !WEBP_AARCH64
 // using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
 #define USE_GENERIC_TREE 1   // ALTERNATE_CODE
 #else
diff --git a/3rdparty/libwebp/src/dec/vp8_dec.c b/3rdparty/libwebp/src/dec/vp8_dec.c
index 8f736974784e..2ee890060599 100644
--- a/3rdparty/libwebp/src/dec/vp8_dec.c
+++ b/3rdparty/libwebp/src/dec/vp8_dec.c
@@ -86,6 +86,8 @@ void VP8Delete(VP8Decoder* const dec) {
 
 int VP8SetError(VP8Decoder* const dec,
                 VP8StatusCode error, const char* const msg) {
+  // VP8_STATUS_SUSPENDED is only meaningful in incremental decoding.
+  assert(dec->incremental_ || error != VP8_STATUS_SUSPENDED);
   // The oldest error reported takes precedence over the new one.
   if (dec->status_ == VP8_STATUS_OK) {
     dec->status_ = error;
@@ -190,12 +192,12 @@ static int ParseSegmentHeader(VP8BitReader* br,
 }
 
 // Paragraph 9.5
-// This function returns VP8_STATUS_SUSPENDED if we don't have all the
-// necessary data in 'buf'.
-// This case is not necessarily an error (for incremental decoding).
-// Still, no bitreader is ever initialized to make it possible to read
-// unavailable memory.
-// If we don't even have the partitions' sizes, than VP8_STATUS_NOT_ENOUGH_DATA
+// If we don't have all the necessary data in 'buf', this function returns
+// VP8_STATUS_SUSPENDED in incremental decoding, VP8_STATUS_NOT_ENOUGH_DATA
+// otherwise.
+// In incremental decoding, this case is not necessarily an error. Still, no
+// bitreader is ever initialized to make it possible to read unavailable memory.
+// If we don't even have the partitions' sizes, then VP8_STATUS_NOT_ENOUGH_DATA
 // is returned, and this is an unrecoverable error.
 // If the partitions were positioned ok, VP8_STATUS_OK is returned.
 static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
@@ -225,8 +227,10 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
     sz += 3;
   }
   VP8InitBitReader(dec->parts_ + last_part, part_start, size_left);
-  return (part_start < buf_end) ? VP8_STATUS_OK :
-           VP8_STATUS_SUSPENDED;   // Init is ok, but there's not enough data
+  if (part_start < buf_end) return VP8_STATUS_OK;
+  return dec->incremental_
+             ? VP8_STATUS_SUSPENDED  // Init is ok, but there's not enough data
+             : VP8_STATUS_NOT_ENOUGH_DATA;
 }
 
 // Paragraph 9.4
@@ -335,7 +339,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
     io->scaled_width = io->width;
     io->scaled_height = io->height;
 
-    io->mb_w = io->width;   // sanity check
+    io->mb_w = io->width;   // for soundness
     io->mb_h = io->height;  // ditto
 
     VP8ResetProba(&dec->proba_);
@@ -403,7 +407,7 @@ static const uint8_t kZigzag[16] = {
   0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };
 
-// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
+// See section 13-2: https://datatracker.ietf.org/doc/html/rfc6386#section-13.2
 static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
   int v;
   if (!VP8GetBit(br, p[3], "coeffs")) {
@@ -494,6 +498,8 @@ static int GetCoeffsAlt(VP8BitReader* const br,
   return 16;
 }
 
+extern VP8CPUInfo VP8GetCPUInfo;
+
 WEBP_DSP_INIT_FUNC(InitGetCoeffs) {
   if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
     GetCoeffs = GetCoeffsAlt;
diff --git a/3rdparty/libwebp/src/dec/vp8_dec.h b/3rdparty/libwebp/src/dec/vp8_dec.h
index a05405df72ee..91fe10409312 100644
--- a/3rdparty/libwebp/src/dec/vp8_dec.h
+++ b/3rdparty/libwebp/src/dec/vp8_dec.h
@@ -15,6 +15,7 @@
 #define WEBP_DEC_VP8_DEC_H_
 
 #include "src/webp/decode.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -108,16 +109,14 @@ struct VP8Io {
 };
 
 // Internal, version-checked, entry point
-int VP8InitIoInternal(VP8Io* const, int);
+WEBP_NODISCARD int VP8InitIoInternal(VP8Io* const, int);
 
 // Set the custom IO function pointers and user-data. The setter for IO hooks
 // should be called before initiating incremental decoding. Returns true if
 // WebPIDecoder object is successfully modified, false otherwise.
-int WebPISetIOHooks(WebPIDecoder* const idec,
-                    VP8IoPutHook put,
-                    VP8IoSetupHook setup,
-                    VP8IoTeardownHook teardown,
-                    void* user_data);
+WEBP_NODISCARD int WebPISetIOHooks(WebPIDecoder* const idec, VP8IoPutHook put,
+                                   VP8IoSetupHook setup,
+                                   VP8IoTeardownHook teardown, void* user_data);
 
 // Main decoding object. This is an opaque structure.
 typedef struct VP8Decoder VP8Decoder;
@@ -128,17 +127,17 @@ VP8Decoder* VP8New(void);
 // Must be called to make sure 'io' is initialized properly.
 // Returns false in case of version mismatch. Upon such failure, no other
 // decoding function should be called (VP8Decode, VP8GetHeaders, ...)
-static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
+WEBP_NODISCARD static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
   return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
 }
 
 // Decode the VP8 frame header. Returns true if ok.
 // Note: 'io->data' must be pointing to the start of the VP8 frame header.
-int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
+WEBP_NODISCARD int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
 
 // Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
 // Returns false in case of error.
-int VP8Decode(VP8Decoder* const dec, VP8Io* const io);
+WEBP_NODISCARD int VP8Decode(VP8Decoder* const dec, VP8Io* const io);
 
 // Return current status of the decoder:
 VP8StatusCode VP8Status(VP8Decoder* const dec);
diff --git a/3rdparty/libwebp/src/dec/vp8i_dec.h b/3rdparty/libwebp/src/dec/vp8i_dec.h
index a0c0af15799e..cb21d475ae03 100644
--- a/3rdparty/libwebp/src/dec/vp8i_dec.h
+++ b/3rdparty/libwebp/src/dec/vp8i_dec.h
@@ -21,6 +21,7 @@
 #include "src/utils/random_utils.h"
 #include "src/utils/thread_utils.h"
 #include "src/dsp/dsp.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -31,7 +32,7 @@ extern "C" {
 
 // version numbers
 #define DEC_MAJ_VERSION 1
-#define DEC_MIN_VERSION 2
+#define DEC_MIN_VERSION 4
 #define DEC_REV_VERSION 0
 
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
@@ -186,6 +187,7 @@ struct VP8Decoder {
 
   // Main data source
   VP8BitReader br_;
+  int incremental_;  // if true, incremental decoding is expected
 
   // headers
   VP8FrameHeader   frm_hdr_;
@@ -281,7 +283,7 @@ int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec);
 void VP8ParseQuant(VP8Decoder* const dec);
 
 // in frame.c
-int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io);
+WEBP_NODISCARD int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io);
 // Call io->setup() and finish setting up scan parameters.
 // After this call returns, one must always call VP8ExitCritical() with the
 // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
@@ -289,7 +291,7 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io);
 VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
 // Must always be called in pair with VP8EnterCritical().
 // Returns false in case of error.
-int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
+WEBP_NODISCARD int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
 // Return the multi-threading method to use (0=off), depending
 // on options and bitstream size. Only for lossy decoding.
 int VP8GetThreadMethod(const WebPDecoderOptions* const options,
@@ -299,11 +301,12 @@ int VP8GetThreadMethod(const WebPDecoderOptions* const options,
 void VP8InitDithering(const WebPDecoderOptions* const options,
                       VP8Decoder* const dec);
 // Process the last decoded row (filtering + output).
-int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
+WEBP_NODISCARD int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
 // To be called at the start of a new scanline, to initialize predictors.
 void VP8InitScanline(VP8Decoder* const dec);
 // Decode one macroblock. Returns false if there is not enough data.
-int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);
+WEBP_NODISCARD int VP8DecodeMB(VP8Decoder* const dec,
+                               VP8BitReader* const token_br);
 
 // in alpha.c
 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
diff --git a/3rdparty/libwebp/src/dec/vp8l_dec.c b/3rdparty/libwebp/src/dec/vp8l_dec.c
index 2d603b437974..11c00ea964a9 100644
--- a/3rdparty/libwebp/src/dec/vp8l_dec.c
+++ b/3rdparty/libwebp/src/dec/vp8l_dec.c
@@ -12,6 +12,7 @@
 // Authors: Vikas Arora (vikaas.arora@gmail.com)
 //          Jyrki Alakuijala (jyrki@google.com)
 
+#include <assert.h>
 #include <stdlib.h>
 
 #include "src/dec/alphai_dec.h"
@@ -84,7 +85,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
 // to 256 (green component values) + 24 (length prefix values)
 // + color_cache_size (between 0 and 2048).
 // All values computed for 8-bit first level lookup with Mark Adler's tool:
-// http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
+// https://github.com/madler/zlib/blob/v1.2.5/examples/enough.c
 #define FIXED_TABLE_SIZE (630 * 3 + 410)
 static const uint16_t kTableSize[12] = {
   FIXED_TABLE_SIZE + 654,
@@ -101,6 +102,14 @@ static const uint16_t kTableSize[12] = {
   FIXED_TABLE_SIZE + 2704
 };
 
+static int VP8LSetError(VP8LDecoder* const dec, VP8StatusCode error) {
+  // The oldest error reported takes precedence over the new one.
+  if (dec->status_ == VP8_STATUS_OK || dec->status_ == VP8_STATUS_SUSPENDED) {
+    dec->status_ = error;
+  }
+  return 0;
+}
+
 static int DecodeImageStream(int xsize, int ysize,
                              int is_level0,
                              VP8LDecoder* const dec,
@@ -178,7 +187,7 @@ static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) {
 
 //------------------------------------------------------------------------------
 // Decodes the next Huffman code from bit-stream.
-// FillBitWindow(br) needs to be called at minimum every second call
+// VP8LFillBitWindow(br) needs to be called at minimum every second call
 // to ReadSymbol, in order to pre-fetch enough bits.
 static WEBP_INLINE int ReadSymbol(const HuffmanCode* table,
                                   VP8LBitReader* const br) {
@@ -253,11 +262,11 @@ static int ReadHuffmanCodeLengths(
   int symbol;
   int max_symbol;
   int prev_code_len = DEFAULT_CODE_LENGTH;
-  HuffmanCode table[1 << LENGTHS_TABLE_BITS];
+  HuffmanTables tables;
 
-  if (!VP8LBuildHuffmanTable(table, LENGTHS_TABLE_BITS,
-                             code_length_code_lengths,
-                             NUM_CODE_LENGTH_CODES)) {
+  if (!VP8LHuffmanTablesAllocate(1 << LENGTHS_TABLE_BITS, &tables) ||
+      !VP8LBuildHuffmanTable(&tables, LENGTHS_TABLE_BITS,
+                             code_length_code_lengths, NUM_CODE_LENGTH_CODES)) {
     goto End;
   }
 
@@ -277,7 +286,7 @@ static int ReadHuffmanCodeLengths(
     int code_len;
     if (max_symbol-- == 0) break;
     VP8LFillBitWindow(br);
-    p = &table[VP8LPrefetchBits(br) & LENGTHS_TABLE_MASK];
+    p = &tables.curr_segment->start[VP8LPrefetchBits(br) & LENGTHS_TABLE_MASK];
     VP8LSetBitPos(br, br->bit_pos_ + p->bits);
     code_len = p->value;
     if (code_len < kCodeLengthLiterals) {
@@ -300,14 +309,16 @@ static int ReadHuffmanCodeLengths(
   ok = 1;
 
  End:
-  if (!ok) dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+  VP8LHuffmanTablesDeallocate(&tables);
+  if (!ok) return VP8LSetError(dec, VP8_STATUS_BITSTREAM_ERROR);
   return ok;
 }
 
 // 'code_lengths' is pre-allocated temporary buffer, used for creating Huffman
 // tree.
 static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
-                           int* const code_lengths, HuffmanCode* const table) {
+                           int* const code_lengths,
+                           HuffmanTables* const table) {
   int ok = 0;
   int size = 0;
   VP8LBitReader* const br = &dec->br_;
@@ -321,7 +332,7 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
     // The first code is either 1 bit or 8 bit code.
     int symbol = VP8LReadBits(br, (first_symbol_len_code == 0) ? 1 : 8);
     code_lengths[symbol] = 1;
-    // The second code (if present), is always 8 bit long.
+    // The second code (if present), is always 8 bits long.
     if (num_symbols == 2) {
       symbol = VP8LReadBits(br, 8);
       code_lengths[symbol] = 1;
@@ -331,10 +342,7 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
     int i;
     int code_length_code_lengths[NUM_CODE_LENGTH_CODES] = { 0 };
     const int num_codes = VP8LReadBits(br, 4) + 4;
-    if (num_codes > NUM_CODE_LENGTH_CODES) {
-      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
-      return 0;
-    }
+    assert(num_codes <= NUM_CODE_LENGTH_CODES);
 
     for (i = 0; i < num_codes; ++i) {
       code_length_code_lengths[kCodeLengthCodeOrder[i]] = VP8LReadBits(br, 3);
@@ -349,36 +357,35 @@ static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
                                  code_lengths, alphabet_size);
   }
   if (!ok || size == 0) {
-    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
-    return 0;
+    return VP8LSetError(dec, VP8_STATUS_BITSTREAM_ERROR);
   }
   return size;
 }
 
 static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
                             int color_cache_bits, int allow_recursion) {
-  int i, j;
+  int i;
   VP8LBitReader* const br = &dec->br_;
   VP8LMetadata* const hdr = &dec->hdr_;
   uint32_t* huffman_image = NULL;
   HTreeGroup* htree_groups = NULL;
-  HuffmanCode* huffman_tables = NULL;
-  HuffmanCode* huffman_table = NULL;
+  HuffmanTables* huffman_tables = &hdr->huffman_tables_;
   int num_htree_groups = 1;
   int num_htree_groups_max = 1;
-  int max_alphabet_size = 0;
-  int* code_lengths = NULL;
-  const int table_size = kTableSize[color_cache_bits];
   int* mapping = NULL;
   int ok = 0;
 
+  // Check the table has been 0 initialized (through InitMetadata).
+  assert(huffman_tables->root.start == NULL);
+  assert(huffman_tables->curr_segment == NULL);
+
   if (allow_recursion && VP8LReadBits(br, 1)) {
     // use meta Huffman codes.
     const int huffman_precision = VP8LReadBits(br, 3) + 2;
     const int huffman_xsize = VP8LSubSampleSize(xsize, huffman_precision);
     const int huffman_ysize = VP8LSubSampleSize(ysize, huffman_precision);
     const int huffman_pixs = huffman_xsize * huffman_ysize;
-    if (!DecodeImageStream(huffman_xsize, huffman_ysize, 0, dec,
+    if (!DecodeImageStream(huffman_xsize, huffman_ysize, /*is_level0=*/0, dec,
                            &huffman_image)) {
       goto Error;
     }
@@ -402,7 +409,7 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
       // values [0, num_htree_groups)
       mapping = (int*)WebPSafeMalloc(num_htree_groups_max, sizeof(*mapping));
       if (mapping == NULL) {
-        dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+        VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
         goto Error;
       }
       // -1 means a value is unmapped, and therefore unused in the Huffman
@@ -421,29 +428,55 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
 
   if (br->eos_) goto Error;
 
-  // Find maximum alphabet size for the htree group.
-  for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
-    int alphabet_size = kAlphabetSize[j];
-    if (j == 0 && color_cache_bits > 0) {
-      alphabet_size += 1 << color_cache_bits;
-    }
-    if (max_alphabet_size < alphabet_size) {
-      max_alphabet_size = alphabet_size;
-    }
+  if (!ReadHuffmanCodesHelper(color_cache_bits, num_htree_groups,
+                              num_htree_groups_max, mapping, dec,
+                              huffman_tables, &htree_groups)) {
+    goto Error;
   }
+  ok = 1;
 
-  code_lengths = (int*)WebPSafeCalloc((uint64_t)max_alphabet_size,
-                                      sizeof(*code_lengths));
-  huffman_tables = (HuffmanCode*)WebPSafeMalloc(num_htree_groups * table_size,
-                                                sizeof(*huffman_tables));
-  htree_groups = VP8LHtreeGroupsNew(num_htree_groups);
+  // All OK. Finalize pointers.
+  hdr->huffman_image_ = huffman_image;
+  hdr->num_htree_groups_ = num_htree_groups;
+  hdr->htree_groups_ = htree_groups;
+
+ Error:
+  WebPSafeFree(mapping);
+  if (!ok) {
+    WebPSafeFree(huffman_image);
+    VP8LHuffmanTablesDeallocate(huffman_tables);
+    VP8LHtreeGroupsFree(htree_groups);
+  }
+  return ok;
+}
 
-  if (htree_groups == NULL || code_lengths == NULL || huffman_tables == NULL) {
-    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+int ReadHuffmanCodesHelper(int color_cache_bits, int num_htree_groups,
+                           int num_htree_groups_max, const int* const mapping,
+                           VP8LDecoder* const dec,
+                           HuffmanTables* const huffman_tables,
+                           HTreeGroup** const htree_groups) {
+  int i, j, ok = 0;
+  const int max_alphabet_size =
+      kAlphabetSize[0] + ((color_cache_bits > 0) ? 1 << color_cache_bits : 0);
+  const int table_size = kTableSize[color_cache_bits];
+  int* code_lengths = NULL;
+
+  if ((mapping == NULL && num_htree_groups != num_htree_groups_max) ||
+      num_htree_groups > num_htree_groups_max) {
+    goto Error;
+  }
+
+  code_lengths =
+      (int*)WebPSafeCalloc((uint64_t)max_alphabet_size, sizeof(*code_lengths));
+  *htree_groups = VP8LHtreeGroupsNew(num_htree_groups);
+
+  if (*htree_groups == NULL || code_lengths == NULL ||
+      !VP8LHuffmanTablesAllocate(num_htree_groups * table_size,
+                                 huffman_tables)) {
+    VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
     goto Error;
   }
 
-  huffman_table = huffman_tables;
   for (i = 0; i < num_htree_groups_max; ++i) {
     // If the index "i" is unused in the Huffman image, just make sure the
     // coefficients are valid but do not store them.
@@ -460,7 +493,7 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
       }
     } else {
       HTreeGroup* const htree_group =
-          &htree_groups[(mapping == NULL) ? i : mapping[i]];
+          &(*htree_groups)[(mapping == NULL) ? i : mapping[i]];
       HuffmanCode** const htrees = htree_group->htrees;
       int size;
       int total_size = 0;
@@ -468,19 +501,20 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
       int max_bits = 0;
       for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
         int alphabet_size = kAlphabetSize[j];
-        htrees[j] = huffman_table;
         if (j == 0 && color_cache_bits > 0) {
           alphabet_size += (1 << color_cache_bits);
         }
-        size = ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_table);
+        size =
+            ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_tables);
+        htrees[j] = huffman_tables->curr_segment->curr_table;
         if (size == 0) {
           goto Error;
         }
         if (is_trivial_literal && kLiteralMap[j] == 1) {
-          is_trivial_literal = (huffman_table->bits == 0);
+          is_trivial_literal = (htrees[j]->bits == 0);
         }
-        total_size += huffman_table->bits;
-        huffman_table += size;
+        total_size += htrees[j]->bits;
+        huffman_tables->curr_segment->curr_table += size;
         if (j <= ALPHA) {
           int local_max_bits = code_lengths[0];
           int k;
@@ -511,19 +545,12 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
   }
   ok = 1;
 
-  // All OK. Finalize pointers.
-  hdr->huffman_image_ = huffman_image;
-  hdr->num_htree_groups_ = num_htree_groups;
-  hdr->htree_groups_ = htree_groups;
-  hdr->huffman_tables_ = huffman_tables;
-
  Error:
   WebPSafeFree(code_lengths);
-  WebPSafeFree(mapping);
   if (!ok) {
-    WebPSafeFree(huffman_image);
-    WebPSafeFree(huffman_tables);
-    VP8LHtreeGroupsFree(htree_groups);
+    VP8LHuffmanTablesDeallocate(huffman_tables);
+    VP8LHtreeGroupsFree(*htree_groups);
+    *htree_groups = NULL;
   }
   return ok;
 }
@@ -547,8 +574,7 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
                                scaled_data_size * sizeof(*scaled_data);
   uint8_t* memory = (uint8_t*)WebPSafeMalloc(memory_size, sizeof(*memory));
   if (memory == NULL) {
-    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-    return 0;
+    return VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
   }
   assert(dec->rescaler_memory == NULL);
   dec->rescaler_memory = memory;
@@ -559,8 +585,11 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
   memory += work_size * sizeof(*work);
   scaled_data = (uint32_t*)memory;
 
-  WebPRescalerInit(dec->rescaler, in_width, in_height, (uint8_t*)scaled_data,
-                   out_width, out_height, 0, num_channels, work);
+  if (!WebPRescalerInit(dec->rescaler, in_width, in_height,
+                        (uint8_t*)scaled_data, out_width, out_height,
+                        0, num_channels, work)) {
+    return 0;
+  }
   return 1;
 }
 #endif   // WEBP_REDUCE_SIZE
@@ -574,13 +603,14 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
 static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
                   int rgba_stride, uint8_t* const rgba) {
   uint32_t* const src = (uint32_t*)rescaler->dst;
+  uint8_t* dst = rgba;
   const int dst_width = rescaler->dst_width;
   int num_lines_out = 0;
   while (WebPRescalerHasPendingOutput(rescaler)) {
-    uint8_t* const dst = rgba + num_lines_out * rgba_stride;
     WebPRescalerExportRow(rescaler);
     WebPMultARGBRow(src, dst_width, 1);
     VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
+    dst += rgba_stride;
     ++num_lines_out;
   }
   return num_lines_out;
@@ -594,8 +624,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
   int num_lines_in = 0;
   int num_lines_out = 0;
   while (num_lines_in < mb_h) {
-    uint8_t* const row_in = in + num_lines_in * in_stride;
-    uint8_t* const row_out = out + num_lines_out * out_stride;
+    uint8_t* const row_in = in + (uint64_t)num_lines_in * in_stride;
+    uint8_t* const row_out = out + (uint64_t)num_lines_out * out_stride;
     const int lines_left = mb_h - num_lines_in;
     const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
     int lines_imported;
@@ -796,7 +826,8 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
       const WebPDecBuffer* const output = dec->output_;
       if (WebPIsRGBMode(output->colorspace)) {  // convert to RGBA
         const WebPRGBABuffer* const buf = &output->u.RGBA;
-        uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
+        uint8_t* const rgba =
+            buf->rgba + (int64_t)dec->last_out_row_ * buf->stride;
         const int num_rows_out =
 #if !defined(WEBP_REDUCE_SIZE)
          io->use_scaling ?
@@ -1077,12 +1108,10 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
  End:
   br->eos_ = VP8LIsEndOfStream(br);
   if (!ok || (br->eos_ && pos < end)) {
-    ok = 0;
-    dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
-                            : VP8_STATUS_BITSTREAM_ERROR;
-  } else {
-    dec->last_pixel_ = pos;
+    return VP8LSetError(
+        dec, br->eos_ ? VP8_STATUS_SUSPENDED : VP8_STATUS_BITSTREAM_ERROR);
   }
+  dec->last_pixel_ = pos;
   return ok;
 }
 
@@ -1232,9 +1261,20 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
   }
 
   br->eos_ = VP8LIsEndOfStream(br);
-  if (dec->incremental_ && br->eos_ && src < src_end) {
+  // In incremental decoding:
+  // br->eos_ && src < src_last: if 'br' reached the end of the buffer and
+  // 'src_last' has not been reached yet, there is not enough data. 'dec' has to
+  // be reset until there is more data.
+  // !br->eos_ && src < src_last: this cannot happen as either the buffer is
+  // fully read, either enough has been read to reach 'src_last'.
+  // src >= src_last: 'src_last' is reached, all is fine. 'src' can actually go
+  // beyond 'src_last' in case the image is cropped and an LZ77 goes further.
+  // The buffer might have been enough or there is some left. 'br->eos_' does
+  // not matter.
+  assert(!dec->incremental_ || (br->eos_ && src < src_last) || src >= src_last);
+  if (dec->incremental_ && br->eos_ && src < src_last) {
     RestoreState(dec);
-  } else if (!br->eos_) {
+  } else if ((dec->incremental_ && src >= src_last) || !br->eos_) {
     // Process the remaining rows corresponding to last row-block.
     if (process_func != NULL) {
       process_func(dec, row > last_row ? last_row : row);
@@ -1249,8 +1289,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
   return 1;
 
  Error:
-  dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
-  return 0;
+  return VP8LSetError(dec, VP8_STATUS_BITSTREAM_ERROR);
 }
 
 // -----------------------------------------------------------------------------
@@ -1276,7 +1315,7 @@ static int ExpandColorMap(int num_colors, VP8LTransform* const transform) {
     uint8_t* const new_data = (uint8_t*)new_color_map;
     new_color_map[0] = transform->data_[0];
     for (i = 4; i < 4 * num_colors; ++i) {
-      // Equivalent to AddPixelEq(), on a byte-basis.
+      // Equivalent to VP8LAddPixels(), on a byte-basis.
       new_data[i] = (data[i] + new_data[i - 4]) & 0xff;
     }
     for (; i < 4 * final_num_colors; ++i) {
@@ -1317,7 +1356,7 @@ static int ReadTransform(int* const xsize, int const* ysize,
                                                transform->bits_),
                              VP8LSubSampleSize(transform->ysize_,
                                                transform->bits_),
-                             0, dec, &transform->data_);
+                             /*is_level0=*/0, dec, &transform->data_);
       break;
     case COLOR_INDEXING_TRANSFORM: {
        const int num_colors = VP8LReadBits(br, 8) + 1;
@@ -1327,11 +1366,14 @@ static int ReadTransform(int* const xsize, int const* ysize,
                       : 3;
        *xsize = VP8LSubSampleSize(transform->xsize_, bits);
        transform->bits_ = bits;
-       ok = DecodeImageStream(num_colors, 1, 0, dec, &transform->data_);
-       ok = ok && ExpandColorMap(num_colors, transform);
+       ok = DecodeImageStream(num_colors, /*ysize=*/1, /*is_level0=*/0, dec,
+                              &transform->data_);
+       if (ok && !ExpandColorMap(num_colors, transform)) {
+         return VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
+       }
       break;
     }
-    case SUBTRACT_GREEN:
+    case SUBTRACT_GREEN_TRANSFORM:
       break;
     default:
       assert(0);    // can't happen
@@ -1353,7 +1395,7 @@ static void ClearMetadata(VP8LMetadata* const hdr) {
   assert(hdr != NULL);
 
   WebPSafeFree(hdr->huffman_image_);
-  WebPSafeFree(hdr->huffman_tables_);
+  VP8LHuffmanTablesDeallocate(&hdr->huffman_tables_);
   VP8LHtreeGroupsFree(hdr->htree_groups_);
   VP8LColorCacheClear(&hdr->color_cache_);
   VP8LColorCacheClear(&hdr->saved_color_cache_);
@@ -1434,7 +1476,7 @@ static int DecodeImageStream(int xsize, int ysize,
     color_cache_bits = VP8LReadBits(br, 4);
     ok = (color_cache_bits >= 1 && color_cache_bits <= MAX_CACHE_BITS);
     if (!ok) {
-      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+      VP8LSetError(dec, VP8_STATUS_BITSTREAM_ERROR);
       goto End;
     }
   }
@@ -1443,7 +1485,7 @@ static int DecodeImageStream(int xsize, int ysize,
   ok = ok && ReadHuffmanCodes(dec, transform_xsize, transform_ysize,
                               color_cache_bits, is_level0);
   if (!ok) {
-    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+    VP8LSetError(dec, VP8_STATUS_BITSTREAM_ERROR);
     goto End;
   }
 
@@ -1451,8 +1493,7 @@ static int DecodeImageStream(int xsize, int ysize,
   if (color_cache_bits > 0) {
     hdr->color_cache_size_ = 1 << color_cache_bits;
     if (!VP8LColorCacheInit(&hdr->color_cache_, color_cache_bits)) {
-      dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-      ok = 0;
+      ok = VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
       goto End;
     }
   } else {
@@ -1469,8 +1510,7 @@ static int DecodeImageStream(int xsize, int ysize,
     const uint64_t total_size = (uint64_t)transform_xsize * transform_ysize;
     data = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*data));
     if (data == NULL) {
-      dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-      ok = 0;
+      ok = VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
       goto End;
     }
   }
@@ -1514,9 +1554,8 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
   assert(dec->width_ <= final_width);
   dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint32_t));
   if (dec->pixels_ == NULL) {
-    dec->argb_cache_ = NULL;    // for sanity check
-    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-    return 0;
+    dec->argb_cache_ = NULL;    // for soundness
+    return VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
   }
   dec->argb_cache_ = dec->pixels_ + num_pixels + cache_top_pixels;
   return 1;
@@ -1524,11 +1563,10 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
 
 static int AllocateInternalBuffers8b(VP8LDecoder* const dec) {
   const uint64_t total_num_pixels = (uint64_t)dec->width_ * dec->height_;
-  dec->argb_cache_ = NULL;    // for sanity check
+  dec->argb_cache_ = NULL;    // for soundness
   dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint8_t));
   if (dec->pixels_ == NULL) {
-    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-    return 0;
+    return VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
   }
   return 1;
 }
@@ -1583,7 +1621,8 @@ int VP8LDecodeAlphaHeader(ALPHDecoder* const alph_dec,
   dec->status_ = VP8_STATUS_OK;
   VP8LInitBitReader(&dec->br_, data, data_size);
 
-  if (!DecodeImageStream(alph_dec->width_, alph_dec->height_, 1, dec, NULL)) {
+  if (!DecodeImageStream(alph_dec->width_, alph_dec->height_, /*is_level0=*/1,
+                         dec, /*decoded_data=*/NULL)) {
     goto Err;
   }
 
@@ -1638,22 +1677,24 @@ int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io) {
 
   if (dec == NULL) return 0;
   if (io == NULL) {
-    dec->status_ = VP8_STATUS_INVALID_PARAM;
-    return 0;
+    return VP8LSetError(dec, VP8_STATUS_INVALID_PARAM);
   }
 
   dec->io_ = io;
   dec->status_ = VP8_STATUS_OK;
   VP8LInitBitReader(&dec->br_, io->data, io->data_size);
   if (!ReadImageInfo(&dec->br_, &width, &height, &has_alpha)) {
-    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+    VP8LSetError(dec, VP8_STATUS_BITSTREAM_ERROR);
     goto Error;
   }
   dec->state_ = READ_DIM;
   io->width = width;
   io->height = height;
 
-  if (!DecodeImageStream(width, height, 1, dec, NULL)) goto Error;
+  if (!DecodeImageStream(width, height, /*is_level0=*/1, dec,
+                         /*decoded_data=*/NULL)) {
+    goto Error;
+  }
   return 1;
 
  Error:
@@ -1666,10 +1707,9 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
   VP8Io* io = NULL;
   WebPDecParams* params = NULL;
 
-  // Sanity checks.
   if (dec == NULL) return 0;
 
-  assert(dec->hdr_.huffman_tables_ != NULL);
+  assert(dec->hdr_.huffman_tables_.root.start != NULL);
   assert(dec->hdr_.htree_groups_ != NULL);
   assert(dec->hdr_.num_htree_groups_ > 0);
 
@@ -1684,7 +1724,7 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
     assert(dec->output_ != NULL);
 
     if (!WebPIoInitFromOptions(params->options, io, MODE_BGRA)) {
-      dec->status_ = VP8_STATUS_INVALID_PARAM;
+      VP8LSetError(dec, VP8_STATUS_INVALID_PARAM);
       goto Err;
     }
 
@@ -1694,7 +1734,7 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
     if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
 #else
     if (io->use_scaling) {
-      dec->status_ = VP8_STATUS_INVALID_PARAM;
+      VP8LSetError(dec, VP8_STATUS_INVALID_PARAM);
       goto Err;
     }
 #endif
@@ -1712,7 +1752,7 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
           dec->hdr_.saved_color_cache_.colors_ == NULL) {
         if (!VP8LColorCacheInit(&dec->hdr_.saved_color_cache_,
                                 dec->hdr_.color_cache_.hash_bits_)) {
-          dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+          VP8LSetError(dec, VP8_STATUS_OUT_OF_MEMORY);
           goto Err;
         }
       }
diff --git a/3rdparty/libwebp/src/dec/vp8li_dec.h b/3rdparty/libwebp/src/dec/vp8li_dec.h
index 72b2e8612084..9a13bcc98d25 100644
--- a/3rdparty/libwebp/src/dec/vp8li_dec.h
+++ b/3rdparty/libwebp/src/dec/vp8li_dec.h
@@ -20,6 +20,7 @@
 #include "src/utils/bit_reader_utils.h"
 #include "src/utils/color_cache_utils.h"
 #include "src/utils/huffman_utils.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -51,7 +52,7 @@ typedef struct {
   uint32_t*       huffman_image_;
   int             num_htree_groups_;
   HTreeGroup*     htree_groups_;
-  HuffmanCode*    huffman_tables_;
+  HuffmanTables   huffman_tables_;
 } VP8LMetadata;
 
 typedef struct VP8LDecoder VP8LDecoder;
@@ -99,25 +100,26 @@ struct ALPHDecoder;  // Defined in dec/alphai.h.
 
 // Decodes image header for alpha data stored using lossless compression.
 // Returns false in case of error.
-int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
-                          const uint8_t* const data, size_t data_size);
+WEBP_NODISCARD int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
+                                         const uint8_t* const data,
+                                         size_t data_size);
 
 // Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are
 // already decoded in previous call(s), it will resume decoding from where it
 // was paused.
 // Returns false in case of bitstream error.
-int VP8LDecodeAlphaImageStream(struct ALPHDecoder* const alph_dec,
-                               int last_row);
+WEBP_NODISCARD int VP8LDecodeAlphaImageStream(
+    struct ALPHDecoder* const alph_dec, int last_row);
 
 // Allocates and initialize a new lossless decoder instance.
-VP8LDecoder* VP8LNew(void);
+WEBP_NODISCARD VP8LDecoder* VP8LNew(void);
 
 // Decodes the image header. Returns false in case of error.
-int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
+WEBP_NODISCARD int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
 
 // Decodes an image. It's required to decode the lossless header before calling
 // this function. Returns false in case of error, with updated dec->status_.
-int VP8LDecodeImage(VP8LDecoder* const dec);
+WEBP_NODISCARD int VP8LDecodeImage(VP8LDecoder* const dec);
 
 // Resets the decoder in its initial state, reclaiming memory.
 // Preserves the dec->status_ value.
@@ -126,6 +128,18 @@ void VP8LClear(VP8LDecoder* const dec);
 // Clears and deallocate a lossless decoder instance.
 void VP8LDelete(VP8LDecoder* const dec);
 
+// Helper function for reading the different Huffman codes and storing them in
+// 'huffman_tables' and 'htree_groups'.
+// If mapping is NULL 'num_htree_groups_max' must equal 'num_htree_groups'.
+// If it is not NULL, it maps 'num_htree_groups_max' indices to the
+// 'num_htree_groups' groups. If 'num_htree_groups_max' > 'num_htree_groups',
+// some of those indices map to -1. This is used for non-balanced codes to
+// limit memory usage.
+WEBP_NODISCARD int ReadHuffmanCodesHelper(
+    int color_cache_bits, int num_htree_groups, int num_htree_groups_max,
+    const int* const mapping, VP8LDecoder* const dec,
+    HuffmanTables* const huffman_tables, HTreeGroup** const htree_groups);
+
 //------------------------------------------------------------------------------
 
 #ifdef __cplusplus
diff --git a/3rdparty/libwebp/src/dec/webp_dec.c b/3rdparty/libwebp/src/dec/webp_dec.c
index 42d098874d07..49ef205c8b60 100644
--- a/3rdparty/libwebp/src/dec/webp_dec.c
+++ b/3rdparty/libwebp/src/dec/webp_dec.c
@@ -13,11 +13,14 @@
 
 #include <stdlib.h>
 
+#include "src/dec/vp8_dec.h"
 #include "src/dec/vp8i_dec.h"
 #include "src/dec/vp8li_dec.h"
 #include "src/dec/webpi_dec.h"
 #include "src/utils/utils.h"
 #include "src/webp/mux_types.h"  // ALPHA_FLAG
+#include "src/webp/decode.h"
+#include "src/webp/types.h"
 
 //------------------------------------------------------------------------------
 // RIFF layout is:
@@ -179,7 +182,7 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
       return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
     }
     // For odd-sized chunk-payload, there's one byte padding at the end.
-    disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1;
+    disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1u;
     total_size += disk_chunk_size;
 
     // Check that total bytes skipped so far does not exceed riff_size.
@@ -444,8 +447,9 @@ void WebPResetDecParams(WebPDecParams* const params) {
 // "Into" decoding variants
 
 // Main flow
-static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
-                                WebPDecParams* const params) {
+WEBP_NODISCARD static VP8StatusCode DecodeInto(const uint8_t* const data,
+                                               size_t data_size,
+                                               WebPDecParams* const params) {
   VP8StatusCode status;
   VP8Io io;
   WebPHeaderStructure headers;
@@ -459,7 +463,9 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
   }
 
   assert(params != NULL);
-  VP8InitIo(&io);
+  if (!VP8InitIo(&io)) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
   io.data = headers.data + headers.offset;
   io.data_size = headers.data_size - headers.offset;
   WebPInitCustomIo(params, &io);  // Plug the I/O functions.
@@ -523,17 +529,16 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
 }
 
 // Helpers
-static uint8_t* DecodeIntoRGBABuffer(WEBP_CSP_MODE colorspace,
-                                     const uint8_t* const data,
-                                     size_t data_size,
-                                     uint8_t* const rgba,
-                                     int stride, size_t size) {
+WEBP_NODISCARD static uint8_t* DecodeIntoRGBABuffer(WEBP_CSP_MODE colorspace,
+                                                    const uint8_t* const data,
+                                                    size_t data_size,
+                                                    uint8_t* const rgba,
+                                                    int stride, size_t size) {
   WebPDecParams params;
   WebPDecBuffer buf;
-  if (rgba == NULL) {
+  if (rgba == NULL || !WebPInitDecBuffer(&buf)) {
     return NULL;
   }
-  WebPInitDecBuffer(&buf);
   WebPResetDecParams(&params);
   params.output = &buf;
   buf.colorspace    = colorspace;
@@ -578,8 +583,7 @@ uint8_t* WebPDecodeYUVInto(const uint8_t* data, size_t data_size,
                            uint8_t* v, size_t v_size, int v_stride) {
   WebPDecParams params;
   WebPDecBuffer output;
-  if (luma == NULL) return NULL;
-  WebPInitDecBuffer(&output);
+  if (luma == NULL || !WebPInitDecBuffer(&output)) return NULL;
   WebPResetDecParams(&params);
   params.output = &output;
   output.colorspace      = MODE_YUV;
@@ -601,13 +605,17 @@ uint8_t* WebPDecodeYUVInto(const uint8_t* data, size_t data_size,
 
 //------------------------------------------------------------------------------
 
-static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* const data,
-                       size_t data_size, int* const width, int* const height,
-                       WebPDecBuffer* const keep_info) {
+WEBP_NODISCARD static uint8_t* Decode(WEBP_CSP_MODE mode,
+                                      const uint8_t* const data,
+                                      size_t data_size, int* const width,
+                                      int* const height,
+                                      WebPDecBuffer* const keep_info) {
   WebPDecParams params;
   WebPDecBuffer output;
 
-  WebPInitDecBuffer(&output);
+  if (!WebPInitDecBuffer(&output)) {
+    return NULL;
+  }
   WebPResetDecParams(&params);
   params.output = &output;
   output.colorspace = mode;
@@ -658,19 +666,26 @@ uint8_t* WebPDecodeBGRA(const uint8_t* data, size_t data_size,
 uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
                        int* width, int* height, uint8_t** u, uint8_t** v,
                        int* stride, int* uv_stride) {
-  WebPDecBuffer output;   // only to preserve the side-infos
-  uint8_t* const out = Decode(MODE_YUV, data, data_size,
-                              width, height, &output);
-
-  if (out != NULL) {
-    const WebPYUVABuffer* const buf = &output.u.YUVA;
-    *u = buf->u;
-    *v = buf->v;
-    *stride = buf->y_stride;
-    *uv_stride = buf->u_stride;
-    assert(buf->u_stride == buf->v_stride);
-  }
-  return out;
+  // data, width and height are checked by Decode().
+  if (u == NULL || v == NULL || stride == NULL || uv_stride == NULL) {
+    return NULL;
+  }
+
+  {
+    WebPDecBuffer output;   // only to preserve the side-infos
+    uint8_t* const out = Decode(MODE_YUV, data, data_size,
+                                width, height, &output);
+
+    if (out != NULL) {
+      const WebPYUVABuffer* const buf = &output.u.YUVA;
+      *u = buf->u;
+      *v = buf->v;
+      *stride = buf->y_stride;
+      *uv_stride = buf->u_stride;
+      assert(buf->u_stride == buf->v_stride);
+    }
+    return out;
+  }
 }
 
 static void DefaultFeatures(WebPBitstreamFeatures* const features) {
@@ -726,7 +741,9 @@ int WebPInitDecoderConfigInternal(WebPDecoderConfig* config,
   }
   memset(config, 0, sizeof(*config));
   DefaultFeatures(&config->input);
-  WebPInitDecBuffer(&config->output);
+  if (!WebPInitDecBuffer(&config->output)) {
+    return 0;
+  }
   return 1;
 }
 
@@ -765,7 +782,9 @@ VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
   if (WebPAvoidSlowMemory(params.output, &config->input)) {
     // decoding to slow memory: use a temporary in-mem buffer to decode into.
     WebPDecBuffer in_mem_buffer;
-    WebPInitDecBuffer(&in_mem_buffer);
+    if (!WebPInitDecBuffer(&in_mem_buffer)) {
+      return VP8_STATUS_INVALID_PARAM;
+    }
     in_mem_buffer.colorspace = config->output.colorspace;
     in_mem_buffer.width = config->input.width;
     in_mem_buffer.height = config->input.height;
@@ -785,6 +804,13 @@ VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
 //------------------------------------------------------------------------------
 // Cropping and rescaling.
 
+int WebPCheckCropDimensions(int image_width, int image_height,
+                            int x, int y, int w, int h) {
+  return !(x < 0 || y < 0 || w <= 0 || h <= 0 ||
+           x >= image_width || w > image_width || w > image_width - x ||
+           y >= image_height || h > image_height || h > image_height - y);
+}
+
 int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
                           VP8Io* const io, WEBP_CSP_MODE src_colorspace) {
   const int W = io->width;
@@ -792,7 +818,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
   int x = 0, y = 0, w = W, h = H;
 
   // Cropping
-  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
+  io->use_cropping = (options != NULL) && options->use_cropping;
   if (io->use_cropping) {
     w = options->crop_width;
     h = options->crop_height;
@@ -802,7 +828,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
       x &= ~1;
       y &= ~1;
     }
-    if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
+    if (!WebPCheckCropDimensions(W, H, x, y, w, h)) {
       return 0;  // out of frame boundary error
     }
   }
@@ -814,7 +840,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
   io->mb_h = h;
 
   // Scaling
-  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
+  io->use_scaling = (options != NULL) && options->use_scaling;
   if (io->use_scaling) {
     int scaled_width = options->scaled_width;
     int scaled_height = options->scaled_height;
@@ -835,8 +861,8 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
 
   if (io->use_scaling) {
     // disable filter (only for large downscaling ratio).
-    io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
-                           (io->scaled_height < H * 3 / 4);
+    io->bypass_filtering |= (io->scaled_width < W * 3 / 4) &&
+                            (io->scaled_height < H * 3 / 4);
     io->fancy_upsampling = 0;
   }
   return 1;
diff --git a/3rdparty/libwebp/src/dec/webpi_dec.h b/3rdparty/libwebp/src/dec/webpi_dec.h
index 24baff5d27a8..77bf5264b779 100644
--- a/3rdparty/libwebp/src/dec/webpi_dec.h
+++ b/3rdparty/libwebp/src/dec/webpi_dec.h
@@ -20,6 +20,7 @@ extern "C" {
 
 #include "src/utils/rescaler_utils.h"
 #include "src/dec/vp8_dec.h"
+#include "src/webp/decode.h"
 
 //------------------------------------------------------------------------------
 // WebPDecParams: Decoding output parameters. Transient internal object.
@@ -77,14 +78,19 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers);
 //------------------------------------------------------------------------------
 // Misc utils
 
+// Returns true if crop dimensions are within image bounds.
+int WebPCheckCropDimensions(int image_width, int image_height,
+                            int x, int y, int w, int h);
+
 // Initializes VP8Io with custom setup, io and teardown functions. The default
 // hooks will use the supplied 'params' as io->opaque handle.
 void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
 
 // Setup crop_xxx fields, mb_w and mb_h in io. 'src_colorspace' refers
 // to the *compressed* format, not the output one.
-int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
-                          VP8Io* const io, WEBP_CSP_MODE src_colorspace);
+WEBP_NODISCARD int WebPIoInitFromOptions(
+    const WebPDecoderOptions* const options, VP8Io* const io,
+    WEBP_CSP_MODE src_colorspace);
 
 //------------------------------------------------------------------------------
 // Internal functions regarding WebPDecBuffer memory (in buffer.c).
diff --git a/3rdparty/libwebp/src/demux/anim_decode.c b/3rdparty/libwebp/src/demux/anim_decode.c
index 3dcacc35d675..27f0e2b0bb3d 100644
--- a/3rdparty/libwebp/src/demux/anim_decode.c
+++ b/3rdparty/libwebp/src/demux/anim_decode.c
@@ -20,9 +20,18 @@
 #include "src/utils/utils.h"
 #include "src/webp/decode.h"
 #include "src/webp/demux.h"
+#include "src/webp/types.h"
 
 #define NUM_CHANNELS 4
 
+// Channel extraction from a uint32_t representation of a uint8_t RGBA/BGRA
+// buffer.
+#ifdef WORDS_BIGENDIAN
+#define CHANNEL_SHIFT(i) (24 - (i) * 8)
+#else
+#define CHANNEL_SHIFT(i) ((i) * 8)
+#endif
+
 typedef void (*BlendRowFunc)(uint32_t* const, const uint32_t* const, int);
 static void BlendPixelRowNonPremult(uint32_t* const src,
                                     const uint32_t* const dst, int num_pixels);
@@ -60,8 +69,9 @@ int WebPAnimDecoderOptionsInitInternal(WebPAnimDecoderOptions* dec_options,
   return 1;
 }
 
-static int ApplyDecoderOptions(const WebPAnimDecoderOptions* const dec_options,
-                               WebPAnimDecoder* const dec) {
+WEBP_NODISCARD static int ApplyDecoderOptions(
+    const WebPAnimDecoderOptions* const dec_options,
+    WebPAnimDecoder* const dec) {
   WEBP_CSP_MODE mode;
   WebPDecoderConfig* config = &dec->config_;
   assert(dec_options != NULL);
@@ -74,7 +84,9 @@ static int ApplyDecoderOptions(const WebPAnimDecoderOptions* const dec_options,
   dec->blend_func_ = (mode == MODE_RGBA || mode == MODE_BGRA)
                          ? &BlendPixelRowNonPremult
                          : &BlendPixelRowPremult;
-  WebPInitDecoderConfig(config);
+  if (!WebPInitDecoderConfig(config)) {
+    return 0;
+  }
   config->output.colorspace = mode;
   config->output.is_external_memory = 1;
   config->options.use_threads = dec_options->use_threads;
@@ -87,11 +99,19 @@ WebPAnimDecoder* WebPAnimDecoderNewInternal(
     int abi_version) {
   WebPAnimDecoderOptions options;
   WebPAnimDecoder* dec = NULL;
+  WebPBitstreamFeatures features;
   if (webp_data == NULL ||
       WEBP_ABI_IS_INCOMPATIBLE(abi_version, WEBP_DEMUX_ABI_VERSION)) {
     return NULL;
   }
 
+  // Validate the bitstream before doing expensive allocations. The demuxer may
+  // be more tolerant than the decoder.
+  if (WebPGetFeatures(webp_data->bytes, webp_data->size, &features) !=
+      VP8_STATUS_OK) {
+    return NULL;
+  }
+
   // Note: calloc() so that the pointer members are initialized to NULL.
   dec = (WebPAnimDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
   if (dec == NULL) goto Error;
@@ -141,11 +161,11 @@ static int IsFullFrame(int width, int height, int canvas_width,
 }
 
 // Clear the canvas to transparent.
-static int ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
-                          uint32_t canvas_height) {
+WEBP_NODISCARD static int ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
+                                         uint32_t canvas_height) {
   const uint64_t size =
       (uint64_t)canvas_width * canvas_height * NUM_CHANNELS * sizeof(*buf);
-  if (size != (size_t)size) return 0;
+  if (!CheckSizeOverflow(size)) return 0;
   memset(buf, 0, (size_t)size);
   return 1;
 }
@@ -163,10 +183,10 @@ static void ZeroFillFrameRect(uint8_t* buf, int buf_stride, int x_offset,
 }
 
 // Copy width * height pixels from 'src' to 'dst'.
-static int CopyCanvas(const uint8_t* src, uint8_t* dst,
-                      uint32_t width, uint32_t height) {
+WEBP_NODISCARD static int CopyCanvas(const uint8_t* src, uint8_t* dst,
+                                     uint32_t width, uint32_t height) {
   const uint64_t size = (uint64_t)width * height * NUM_CHANNELS;
-  if (size != (size_t)size) return 0;
+  if (!CheckSizeOverflow(size)) return 0;
   assert(src != NULL && dst != NULL);
   memcpy(dst, src, (size_t)size);
   return 1;
@@ -201,35 +221,35 @@ static uint8_t BlendChannelNonPremult(uint32_t src, uint8_t src_a,
   const uint8_t dst_channel = (dst >> shift) & 0xff;
   const uint32_t blend_unscaled = src_channel * src_a + dst_channel * dst_a;
   assert(blend_unscaled < (1ULL << 32) / scale);
-  return (blend_unscaled * scale) >> 24;
+  return (blend_unscaled * scale) >> CHANNEL_SHIFT(3);
 }
 
 // Blend 'src' over 'dst' assuming they are NOT pre-multiplied by alpha.
 static uint32_t BlendPixelNonPremult(uint32_t src, uint32_t dst) {
-  const uint8_t src_a = (src >> 24) & 0xff;
+  const uint8_t src_a = (src >> CHANNEL_SHIFT(3)) & 0xff;
 
   if (src_a == 0) {
     return dst;
   } else {
-    const uint8_t dst_a = (dst >> 24) & 0xff;
+    const uint8_t dst_a = (dst >> CHANNEL_SHIFT(3)) & 0xff;
     // This is the approximate integer arithmetic for the actual formula:
     // dst_factor_a = (dst_a * (255 - src_a)) / 255.
     const uint8_t dst_factor_a = (dst_a * (256 - src_a)) >> 8;
     const uint8_t blend_a = src_a + dst_factor_a;
     const uint32_t scale = (1UL << 24) / blend_a;
 
-    const uint8_t blend_r =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 0);
-    const uint8_t blend_g =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 8);
-    const uint8_t blend_b =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 16);
+    const uint8_t blend_r = BlendChannelNonPremult(
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(0));
+    const uint8_t blend_g = BlendChannelNonPremult(
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(1));
+    const uint8_t blend_b = BlendChannelNonPremult(
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(2));
     assert(src_a + dst_factor_a < 256);
 
-    return (blend_r << 0) |
-           (blend_g << 8) |
-           (blend_b << 16) |
-           ((uint32_t)blend_a << 24);
+    return ((uint32_t)blend_r << CHANNEL_SHIFT(0)) |
+           ((uint32_t)blend_g << CHANNEL_SHIFT(1)) |
+           ((uint32_t)blend_b << CHANNEL_SHIFT(2)) |
+           ((uint32_t)blend_a << CHANNEL_SHIFT(3));
   }
 }
 
@@ -239,7 +259,7 @@ static void BlendPixelRowNonPremult(uint32_t* const src,
                                     const uint32_t* const dst, int num_pixels) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const uint8_t src_alpha = (src[i] >> 24) & 0xff;
+    const uint8_t src_alpha = (src[i] >> CHANNEL_SHIFT(3)) & 0xff;
     if (src_alpha != 0xff) {
       src[i] = BlendPixelNonPremult(src[i], dst[i]);
     }
@@ -256,7 +276,7 @@ static WEBP_INLINE uint32_t ChannelwiseMultiply(uint32_t pix, uint32_t scale) {
 
 // Blend 'src' over 'dst' assuming they are pre-multiplied by alpha.
 static uint32_t BlendPixelPremult(uint32_t src, uint32_t dst) {
-  const uint8_t src_a = (src >> 24) & 0xff;
+  const uint8_t src_a = (src >> CHANNEL_SHIFT(3)) & 0xff;
   return src + ChannelwiseMultiply(dst, 256 - src_a);
 }
 
@@ -266,7 +286,7 @@ static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
                                  int num_pixels) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const uint8_t src_alpha = (src[i] >> 24) & 0xff;
+    const uint8_t src_alpha = (src[i] >> CHANNEL_SHIFT(3)) & 0xff;
     if (src_alpha != 0xff) {
       src[i] = BlendPixelPremult(src[i], dst[i]);
     }
@@ -408,7 +428,9 @@ int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
   WebPDemuxReleaseIterator(&dec->prev_iter_);
   dec->prev_iter_ = iter;
   dec->prev_frame_was_keyframe_ = is_key_frame;
-  CopyCanvas(dec->curr_frame_, dec->prev_frame_disposed_, width, height);
+  if (!CopyCanvas(dec->curr_frame_, dec->prev_frame_disposed_, width, height)) {
+    goto Error;
+  }
   if (dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
     ZeroFillFrameRect(dec->prev_frame_disposed_, width * NUM_CHANNELS,
                       dec->prev_iter_.x_offset, dec->prev_iter_.y_offset,
diff --git a/3rdparty/libwebp/src/demux/demux.c b/3rdparty/libwebp/src/demux/demux.c
index 860e2ce7615e..d01c6a746456 100644
--- a/3rdparty/libwebp/src/demux/demux.c
+++ b/3rdparty/libwebp/src/demux/demux.c
@@ -24,7 +24,7 @@
 #include "src/webp/format_constants.h"
 
 #define DMUX_MAJ_VERSION 1
-#define DMUX_MIN_VERSION 2
+#define DMUX_MIN_VERSION 4
 #define DMUX_REV_VERSION 0
 
 typedef struct {
@@ -221,12 +221,16 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
     const size_t chunk_start_offset = mem->start_;
     const uint32_t fourcc = ReadLE32(mem);
     const uint32_t payload_size = ReadLE32(mem);
-    const uint32_t payload_size_padded = payload_size + (payload_size & 1);
-    const size_t payload_available = (payload_size_padded > MemDataSize(mem))
-                                   ? MemDataSize(mem) : payload_size_padded;
-    const size_t chunk_size = CHUNK_HEADER_SIZE + payload_available;
+    uint32_t payload_size_padded;
+    size_t payload_available;
+    size_t chunk_size;
 
     if (payload_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+
+    payload_size_padded = payload_size + (payload_size & 1);
+    payload_available = (payload_size_padded > MemDataSize(mem))
+                      ? MemDataSize(mem) : payload_size_padded;
+    chunk_size = CHUNK_HEADER_SIZE + payload_available;
     if (SizeIsInvalid(mem, payload_size_padded)) return PARSE_ERROR;
     if (payload_size_padded > MemDataSize(mem)) status = PARSE_NEED_MORE_DATA;
 
@@ -451,9 +455,11 @@ static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
     const size_t chunk_start_offset = mem->start_;
     const uint32_t fourcc = ReadLE32(mem);
     const uint32_t chunk_size = ReadLE32(mem);
-    const uint32_t chunk_size_padded = chunk_size + (chunk_size & 1);
+    uint32_t chunk_size_padded;
 
     if (chunk_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+
+    chunk_size_padded = chunk_size + (chunk_size & 1);
     if (SizeIsInvalid(mem, chunk_size_padded)) return PARSE_ERROR;
 
     switch (fourcc) {
@@ -608,7 +614,6 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
 
   while (f != NULL) {
     const int cur_frame_set = f->frame_num_;
-    int frame_count = 0;
 
     // Check frame properties.
     for (; f != NULL && f->frame_num_ == cur_frame_set; f = f->next_) {
@@ -643,8 +648,6 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
                             dmux->canvas_width_, dmux->canvas_height_)) {
         return 0;
       }
-
-      ++frame_count;
     }
   }
   return 1;
diff --git a/3rdparty/libwebp/src/dsp/alpha_processing.c b/3rdparty/libwebp/src/dsp/alpha_processing.c
index 3a27990ddc57..1d152f24dada 100644
--- a/3rdparty/libwebp/src/dsp/alpha_processing.c
+++ b/3rdparty/libwebp/src/dsp/alpha_processing.c
@@ -157,7 +157,8 @@ void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
   }
 }
 
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
+                   const uint8_t* WEBP_RESTRICT const alpha,
                    int width, int inverse) {
   int x;
   for (x = 0; x < width; ++x) {
@@ -178,7 +179,8 @@ void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
 #undef MFIX
 
 void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
-void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
+                    const uint8_t* WEBP_RESTRICT const alpha,
                     int width, int inverse);
 
 //------------------------------------------------------------------------------
@@ -193,8 +195,8 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
   }
 }
 
-void WebPMultRows(uint8_t* ptr, int stride,
-                  const uint8_t* alpha, int alpha_stride,
+void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
+                  const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
                   int width, int num_rows, int inverse) {
   int n;
   for (n = 0; n < num_rows; ++n) {
@@ -290,9 +292,9 @@ static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
 }
 
 #if !WEBP_NEON_OMIT_C_CODE
-static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
+static int DispatchAlpha_C(const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
                            int width, int height,
-                           uint8_t* dst, int dst_stride) {
+                           uint8_t* WEBP_RESTRICT dst, int dst_stride) {
   uint32_t alpha_mask = 0xff;
   int i, j;
 
@@ -309,9 +311,10 @@ static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
   return (alpha_mask != 0xff);
 }
 
-static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
-                                   int width, int height,
-                                   uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_C(const uint8_t* WEBP_RESTRICT alpha,
+                                   int alpha_stride, int width, int height,
+                                   uint32_t* WEBP_RESTRICT dst,
+                                   int dst_stride) {
   int i, j;
   for (j = 0; j < height; ++j) {
     for (i = 0; i < width; ++i) {
@@ -322,9 +325,9 @@ static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_C(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
                           int width, int height,
-                          uint8_t* alpha, int alpha_stride) {
+                          uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
   uint8_t alpha_mask = 0xff;
   int i, j;
 
@@ -340,7 +343,8 @@ static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
   return (alpha_mask == 0xff);
 }
 
-static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
+static void ExtractGreen_C(const uint32_t* WEBP_RESTRICT argb,
+                           uint8_t* WEBP_RESTRICT alpha, int size) {
   int i;
   for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
 }
@@ -372,8 +376,11 @@ static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
 }
 
 #ifdef WORDS_BIGENDIAN
-static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                       const uint8_t* b, int len, uint32_t* out) {
+static void PackARGB_C(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT r,
+                       const uint8_t* WEBP_RESTRICT g,
+                       const uint8_t* WEBP_RESTRICT b,
+                       int len, uint32_t* WEBP_RESTRICT out) {
   int i;
   for (i = 0; i < len; ++i) {
     out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
@@ -381,8 +388,10 @@ static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
 }
 #endif
 
-static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                      int len, int step, uint32_t* out) {
+static void PackRGB_C(const uint8_t* WEBP_RESTRICT r,
+                      const uint8_t* WEBP_RESTRICT g,
+                      const uint8_t* WEBP_RESTRICT b,
+                      int len, int step, uint32_t* WEBP_RESTRICT out) {
   int i, offset = 0;
   for (i = 0; i < len; ++i) {
     out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
@@ -392,16 +401,22 @@ static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
 
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
-int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
-void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
-int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
-void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
+                         uint8_t* WEBP_RESTRICT, int);
+void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT, int, int, int,
+                                 uint32_t* WEBP_RESTRICT, int);
+int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
+                        uint8_t* WEBP_RESTRICT, int);
+void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
+                         uint8_t* WEBP_RESTRICT alpha, int size);
 #ifdef WORDS_BIGENDIAN
 void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
                      const uint8_t* b, int, uint32_t*);
 #endif
-void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                    int len, int step, uint32_t* out);
+void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
+                    const uint8_t* WEBP_RESTRICT g,
+                    const uint8_t* WEBP_RESTRICT b,
+                    int len, int step, uint32_t* WEBP_RESTRICT out);
 
 int (*WebPHasAlpha8b)(const uint8_t* src, int length);
 int (*WebPHasAlpha32b)(const uint8_t* src, int length);
@@ -410,6 +425,7 @@ void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
 //------------------------------------------------------------------------------
 // Init function
 
+extern VP8CPUInfo VP8GetCPUInfo;
 extern void WebPInitAlphaProcessingMIPSdspR2(void);
 extern void WebPInitAlphaProcessingSSE2(void);
 extern void WebPInitAlphaProcessingSSE41(void);
@@ -438,10 +454,10 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitAlphaProcessingSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
       if (VP8GetCPUInfo(kSSE4_1)) {
         WebPInitAlphaProcessingSSE41();
       }
@@ -455,7 +471,7 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     WebPInitAlphaProcessingNEON();
diff --git a/3rdparty/libwebp/src/dsp/alpha_processing_neon.c b/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
index 9d55421704cc..6716fb77f0d8 100644
--- a/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
+++ b/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
@@ -80,10 +80,10 @@ static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
 
 //------------------------------------------------------------------------------
 
-static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
-                              int width, int height,
-                              uint8_t* dst, int dst_stride) {
-  uint32_t alpha_mask = 0xffffffffu;
+static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha,
+                              int alpha_stride, int width, int height,
+                              uint8_t* WEBP_RESTRICT dst, int dst_stride) {
+  uint32_t alpha_mask = 0xffu;
   uint8x8_t mask8 = vdup_n_u8(0xff);
   uint32_t tmp[2];
   int i, j;
@@ -107,14 +107,16 @@ static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
     dst += dst_stride;
   }
   vst1_u8((uint8_t*)tmp, mask8);
+  alpha_mask *= 0x01010101;
   alpha_mask &= tmp[0];
   alpha_mask &= tmp[1];
   return (alpha_mask != 0xffffffffu);
 }
 
-static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
-                                      int width, int height,
-                                      uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_NEON(const uint8_t* WEBP_RESTRICT alpha,
+                                      int alpha_stride, int width, int height,
+                                      uint32_t* WEBP_RESTRICT dst,
+                                      int dst_stride) {
   int i, j;
   uint8x8x4_t greens;   // leave A/R/B channels zero'd.
   greens.val[0] = vdup_n_u8(0);
@@ -131,10 +133,10 @@ static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
                              int width, int height,
-                             uint8_t* alpha, int alpha_stride) {
-  uint32_t alpha_mask = 0xffffffffu;
+                             uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
+  uint32_t alpha_mask = 0xffu;
   uint8x8_t mask8 = vdup_n_u8(0xff);
   uint32_t tmp[2];
   int i, j;
@@ -156,13 +158,14 @@ static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
     alpha += alpha_stride;
   }
   vst1_u8((uint8_t*)tmp, mask8);
+  alpha_mask *= 0x01010101;
   alpha_mask &= tmp[0];
   alpha_mask &= tmp[1];
   return (alpha_mask == 0xffffffffu);
 }
 
-static void ExtractGreen_NEON(const uint32_t* argb,
-                              uint8_t* alpha, int size) {
+static void ExtractGreen_NEON(const uint32_t* WEBP_RESTRICT argb,
+                              uint8_t* WEBP_RESTRICT alpha, int size) {
   int i;
   for (i = 0; i + 16 <= size; i += 16) {
     const uint8x16x4_t rgbX = vld4q_u8((const uint8_t*)(argb + i));
diff --git a/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c b/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
index f6c6e0fb1a6d..aa0cc2848ae9 100644
--- a/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
+++ b/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
@@ -18,16 +18,16 @@
 
 //------------------------------------------------------------------------------
 
-static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
-                              int width, int height,
-                              uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
+                              int alpha_stride, int width, int height,
+                              uint8_t* WEBP_RESTRICT dst, int dst_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
   int i, j;
   const __m128i zero = _mm_setzero_si128();
-  const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // to preserve RGB
-  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
+  const __m128i rgb_mask = _mm_set1_epi32((int)0xffffff00);  // to preserve RGB
+  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0);
   __m128i all_alphas = all_0xff;
 
   // We must be able to access 3 extra bytes after the last written byte
@@ -72,9 +72,10 @@ static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
   return (alpha_and != 0xff);
 }
 
-static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
-                                      int width, int height,
-                                      uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
+                                      int alpha_stride, int width, int height,
+                                      uint32_t* WEBP_RESTRICT dst,
+                                      int dst_stride) {
   int i, j;
   const __m128i zero = _mm_setzero_si128();
   const int limit = width & ~15;
@@ -98,15 +99,15 @@ static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
                              int width, int height,
-                             uint8_t* alpha, int alpha_stride) {
+                             uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
   int i, j;
-  const __m128i a_mask = _mm_set1_epi32(0xffu);  // to preserve alpha
-  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
+  const __m128i a_mask = _mm_set1_epi32(0xff);  // to preserve alpha
+  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0);
   __m128i all_alphas = all_0xff;
 
   // We must be able to access 3 extra bytes after the last written byte
@@ -143,6 +144,46 @@ static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
   return (alpha_and == 0xff);
 }
 
+static void ExtractGreen_SSE2(const uint32_t* WEBP_RESTRICT argb,
+                              uint8_t* WEBP_RESTRICT alpha, int size) {
+  int i;
+  const __m128i mask = _mm_set1_epi32(0xff);
+  const __m128i* src = (const __m128i*)argb;
+
+  for (i = 0; i + 16 <= size; i += 16, src += 4) {
+    const __m128i a0 = _mm_loadu_si128(src + 0);
+    const __m128i a1 = _mm_loadu_si128(src + 1);
+    const __m128i a2 = _mm_loadu_si128(src + 2);
+    const __m128i a3 = _mm_loadu_si128(src + 3);
+    const __m128i b0 = _mm_srli_epi32(a0, 8);
+    const __m128i b1 = _mm_srli_epi32(a1, 8);
+    const __m128i b2 = _mm_srli_epi32(a2, 8);
+    const __m128i b3 = _mm_srli_epi32(a3, 8);
+    const __m128i c0 = _mm_and_si128(b0, mask);
+    const __m128i c1 = _mm_and_si128(b1, mask);
+    const __m128i c2 = _mm_and_si128(b2, mask);
+    const __m128i c3 = _mm_and_si128(b3, mask);
+    const __m128i d0 = _mm_packs_epi32(c0, c1);
+    const __m128i d1 = _mm_packs_epi32(c2, c3);
+    const __m128i e = _mm_packus_epi16(d0, d1);
+    // store
+    _mm_storeu_si128((__m128i*)&alpha[i], e);
+  }
+  if (i + 8 <= size) {
+    const __m128i a0 = _mm_loadu_si128(src + 0);
+    const __m128i a1 = _mm_loadu_si128(src + 1);
+    const __m128i b0 = _mm_srli_epi32(a0, 8);
+    const __m128i b1 = _mm_srli_epi32(a1, 8);
+    const __m128i c0 = _mm_and_si128(b0, mask);
+    const __m128i c1 = _mm_and_si128(b1, mask);
+    const __m128i d = _mm_packs_epi32(c0, c1);
+    const __m128i e = _mm_packus_epi16(d, d);
+    _mm_storel_epi64((__m128i*)&alpha[i], e);
+    i += 8;
+  }
+  for (; i < size; ++i) alpha[i] = argb[i] >> 8;
+}
+
 //------------------------------------------------------------------------------
 // Non-dither premultiplied modes
 
@@ -177,7 +218,7 @@ static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
 static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
                                     int w, int h, int stride) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i kMult = _mm_set1_epi16(0x8081u);
+  const __m128i kMult = _mm_set1_epi16((short)0x8081);
   const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);
   const int kSpan = 4;
   while (h-- > 0) {
@@ -266,7 +307,7 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
 }
 
 static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) {
-  const __m128i m_color = _mm_set1_epi32(color);
+  const __m128i m_color = _mm_set1_epi32((int)color);
   const __m128i zero = _mm_setzero_si128();
   int i = 0;
   for (; i + 8 <= length; i += 8) {
@@ -317,7 +358,8 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
   if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
 }
 
-static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
+static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr,
+                         const uint8_t* WEBP_RESTRICT const alpha,
                          int width, int inverse) {
   int x = 0;
   if (!inverse) {
@@ -352,6 +394,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
   WebPDispatchAlpha = DispatchAlpha_SSE2;
   WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
   WebPExtractAlpha = ExtractAlpha_SSE2;
+  WebPExtractGreen = ExtractGreen_SSE2;
 
   WebPHasAlpha8b = HasAlpha8b_SSE2;
   WebPHasAlpha32b = HasAlpha32b_SSE2;
diff --git a/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c b/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
index 56040f9c8801..1156ac3417b2 100644
--- a/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
+++ b/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
@@ -19,14 +19,14 @@
 
 //------------------------------------------------------------------------------
 
-static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
-                              int width, int height,
-                              uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb,
+                              int argb_stride, int width, int height,
+                              uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
   int i, j;
-  const __m128i all_0xff = _mm_set1_epi32(~0u);
+  const __m128i all_0xff = _mm_set1_epi32(~0);
   __m128i all_alphas = all_0xff;
 
   // We must be able to access 3 extra bytes after the last written byte
diff --git a/3rdparty/libwebp/src/dsp/cost.c b/3rdparty/libwebp/src/dsp/cost.c
index cc681cdd4bf1..73d2140177cb 100644
--- a/3rdparty/libwebp/src/dsp/cost.c
+++ b/3rdparty/libwebp/src/dsp/cost.c
@@ -374,6 +374,7 @@ static void SetResidualCoeffs_C(const int16_t* const coeffs,
 VP8GetResidualCostFunc VP8GetResidualCost;
 VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
 
+extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8EncDspCostInitMIPS32(void);
 extern void VP8EncDspCostInitMIPSdspR2(void);
 extern void VP8EncDspCostInitSSE2(void);
@@ -395,12 +396,12 @@ WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
       VP8EncDspCostInitMIPSdspR2();
     }
 #endif
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8EncDspCostInitSSE2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
     if (VP8GetCPUInfo(kNEON)) {
       VP8EncDspCostInitNEON();
     }
diff --git a/3rdparty/libwebp/src/dsp/cost_neon.c b/3rdparty/libwebp/src/dsp/cost_neon.c
index 8cc8ce58aa14..6582669cb3f9 100644
--- a/3rdparty/libwebp/src/dsp/cost_neon.c
+++ b/3rdparty/libwebp/src/dsp/cost_neon.c
@@ -29,7 +29,7 @@ static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
   const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1));
   const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position));
 
-#ifdef __aarch64__
+#if WEBP_AARCH64
   res->last = vmaxvq_u8(masked) - 1;
 #else
   const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked));
@@ -43,7 +43,7 @@ static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
 
   vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0);
   --res->last;
-#endif  // __aarch64__
+#endif  // WEBP_AARCH64
 
   res->coeffs = coeffs;
 }
diff --git a/3rdparty/libwebp/src/dsp/cpu.c b/3rdparty/libwebp/src/dsp/cpu.c
index 4ca90d88bf8c..8ba8f683357b 100644
--- a/3rdparty/libwebp/src/dsp/cpu.c
+++ b/3rdparty/libwebp/src/dsp/cpu.c
@@ -11,7 +11,7 @@
 //
 // Author: Christian Duvivier (cduvivier@google.com)
 
-#include "src/dsp/dsp.h"
+#include "src/dsp/cpu.h"
 
 #if defined(WEBP_HAVE_NEON_RTCD)
 #include <stdio.h>
@@ -36,18 +36,6 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
     : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
     : "a"(info_type), "c"(0));
 }
-#elif defined(__x86_64__) && \
-      (defined(__code_model_medium__) || defined(__code_model_large__)) && \
-      defined(__PIC__)
-static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
-  __asm__ volatile (
-    "xchg{q}\t{%%rbx}, %q1\n"
-    "cpuid\n"
-    "xchg{q}\t{%%rbx}, %q1\n"
-    : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), "=c"(cpu_info[2]),
-      "=d"(cpu_info[3])
-    : "a"(info_type), "c"(0));
-}
 #elif defined(__i386__) || defined(__x86_64__)
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
   __asm__ volatile (
@@ -173,6 +161,7 @@ static int x86CPUInfo(CPUFeature feature) {
   }
   return 0;
 }
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
 VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
 #elif defined(WEBP_ANDROID_NEON)  // NB: needs to be before generic NEON test.
 static int AndroidCPUInfo(CPUFeature feature) {
@@ -184,22 +173,23 @@ static int AndroidCPUInfo(CPUFeature feature) {
   }
   return 0;
 }
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
 VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
 #elif defined(EMSCRIPTEN) // also needs to be before generic NEON test
 // Use compile flags as an indicator of SIMD support instead of a runtime check.
 static int wasmCPUInfo(CPUFeature feature) {
   switch (feature) {
-#ifdef WEBP_USE_SSE2
+#ifdef WEBP_HAVE_SSE2
     case kSSE2:
       return 1;
 #endif
-#ifdef WEBP_USE_SSE41
+#ifdef WEBP_HAVE_SSE41
     case kSSE3:
     case kSlowSSSE3:
     case kSSE4_1:
       return 1;
 #endif
-#ifdef WEBP_USE_NEON
+#ifdef WEBP_HAVE_NEON
     case kNEON:
       return 1;
 #endif
@@ -208,10 +198,12 @@ static int wasmCPUInfo(CPUFeature feature) {
   }
   return 0;
 }
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
 VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
-#elif defined(WEBP_USE_NEON)
-// define a dummy function to enable turning off NEON at runtime by setting
-// VP8DecGetCPUInfo = NULL
+#elif defined(WEBP_HAVE_NEON)
+// In most cases this function doesn't check for NEON support (it's assumed by
+// the configuration), but enables turning off NEON at runtime, for testing
+// purposes, by setting VP8GetCPUInfo = NULL.
 static int armCPUInfo(CPUFeature feature) {
   if (feature != kNEON) return 0;
 #if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD)
@@ -235,6 +227,7 @@ static int armCPUInfo(CPUFeature feature) {
   return 1;
 #endif
 }
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
 VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
 #elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2) || \
       defined(WEBP_USE_MSA)
@@ -246,7 +239,9 @@ static int mipsCPUInfo(CPUFeature feature) {
   }
 
 }
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
 VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
 #else
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
 VP8CPUInfo VP8GetCPUInfo = NULL;
 #endif
diff --git a/3rdparty/libwebp/src/dsp/cpu.h b/3rdparty/libwebp/src/dsp/cpu.h
new file mode 100644
index 000000000000..c86540f28013
--- /dev/null
+++ b/3rdparty/libwebp/src/dsp/cpu.h
@@ -0,0 +1,266 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   CPU detection functions and macros.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DSP_CPU_H_
+#define WEBP_DSP_CPU_H_
+
+#include <stddef.h>
+
+#ifdef HAVE_CONFIG_H
+#include "src/webp/config.h"
+#endif
+
+#include "src/webp/types.h"
+
+#if defined(__GNUC__)
+#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_GCC_VERSION 0
+#define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+#if defined(__clang__)
+#define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+#define LOCAL_CLANG_PREREQ(maj, min) \
+  (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_CLANG_VERSION 0
+#define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+//------------------------------------------------------------------------------
+// x86 defines.
+
+#if !defined(HAVE_CONFIG_H)
+#if defined(_MSC_VER) && _MSC_VER > 1310 && \
+    (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
+    (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
+#endif
+#endif
+
+// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
+// files without intrinsics, allowing the corresponding Init() to be called.
+// Files containing intrinsics will need to be built targeting the instruction
+// set so should succeed on one of the earlier tests.
+#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
+    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
+#define WEBP_USE_SSE2
+#endif
+
+#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
+#define WEBP_HAVE_SSE2
+#endif
+
+#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
+    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
+#define WEBP_USE_SSE41
+#endif
+
+#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
+#define WEBP_HAVE_SSE41
+#endif
+
+#undef WEBP_MSC_SSE41
+#undef WEBP_MSC_SSE2
+
+//------------------------------------------------------------------------------
+// Arm defines.
+
+// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
+// inline assembly would need to be modified for use with Native Client.
+#if ((defined(__ARM_NEON__) || defined(__aarch64__)) &&       \
+     (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
+    !defined(__native_client__)
+#define WEBP_USE_NEON
+#endif
+
+#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
+    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
+#define WEBP_ANDROID_NEON  // Android targets that may have NEON
+#define WEBP_USE_NEON
+#endif
+
+// Note: ARM64 is supported in Visual Studio 2017, but requires the direct
+// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in
+// arm_neon.h. Compile errors were seen with Visual Studio 2019 16.4 with
+// vtbl4_u8(); a fix was made in 16.6.
+#if defined(_MSC_VER) && \
+    ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
+     (_MSC_VER >= 1926 && (defined(_M_ARM64) || defined(_M_ARM64EC))))
+#define WEBP_USE_NEON
+#define WEBP_USE_INTRINSICS
+#endif
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#define WEBP_AARCH64 1
+#else
+#define WEBP_AARCH64 0
+#endif
+
+#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
+#define WEBP_HAVE_NEON
+#endif
+
+//------------------------------------------------------------------------------
+// MIPS defines.
+
+#if defined(__mips__) && !defined(__mips64) && defined(__mips_isa_rev) && \
+    (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
+#define WEBP_USE_MIPS32
+#if (__mips_isa_rev >= 2)
+#define WEBP_USE_MIPS32_R2
+#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
+#define WEBP_USE_MIPS_DSP_R2
+#endif
+#endif
+#endif
+
+#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
+#define WEBP_USE_MSA
+#endif
+
+//------------------------------------------------------------------------------
+
+#ifndef WEBP_DSP_OMIT_C_CODE
+#define WEBP_DSP_OMIT_C_CODE 1
+#endif
+
+#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
+#define WEBP_NEON_OMIT_C_CODE 1
+#else
+#define WEBP_NEON_OMIT_C_CODE 0
+#endif
+
+#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || WEBP_AARCH64)
+#define WEBP_NEON_WORK_AROUND_GCC 1
+#else
+#define WEBP_NEON_WORK_AROUND_GCC 0
+#endif
+
+//------------------------------------------------------------------------------
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#define WEBP_TSAN_IGNORE_FUNCTION
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#undef WEBP_TSAN_IGNORE_FUNCTION
+#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
+#endif
+#endif
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define WEBP_MSAN
+#endif
+#endif
+
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h>  // NOLINT
+
+#define WEBP_DSP_INIT(func)                                         \
+  do {                                                              \
+    static volatile VP8CPUInfo func##_last_cpuinfo_used =           \
+        (VP8CPUInfo)&func##_last_cpuinfo_used;                      \
+    static pthread_mutex_t func##_lock = PTHREAD_MUTEX_INITIALIZER; \
+    if (pthread_mutex_lock(&func##_lock)) break;                    \
+    if (func##_last_cpuinfo_used != VP8GetCPUInfo) func();          \
+    func##_last_cpuinfo_used = VP8GetCPUInfo;                       \
+    (void)pthread_mutex_unlock(&func##_lock);                       \
+  } while (0)
+#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
+#define WEBP_DSP_INIT(func)                               \
+  do {                                                    \
+    static volatile VP8CPUInfo func##_last_cpuinfo_used = \
+        (VP8CPUInfo)&func##_last_cpuinfo_used;            \
+    if (func##_last_cpuinfo_used == VP8GetCPUInfo) break; \
+    func();                                               \
+    func##_last_cpuinfo_used = VP8GetCPUInfo;             \
+  } while (0)
+#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
+
+// Defines an Init + helper function that control multiple initialization of
+// function pointers / tables.
+/* Usage:
+   WEBP_DSP_INIT_FUNC(InitFunc) {
+     ...function body
+   }
+*/
+#define WEBP_DSP_INIT_FUNC(name)                                            \
+  static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void);                  \
+  WEBP_TSAN_IGNORE_FUNCTION void name(void) { WEBP_DSP_INIT(name##_body); } \
+  static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void)
+
+#define WEBP_UBSAN_IGNORE_UNDEF
+#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+// This macro prevents the undefined behavior sanitizer from reporting
+// failures. This is only meant to silence unaligned loads on platforms that
+// are known to support them.
+#undef WEBP_UBSAN_IGNORE_UNDEF
+#define WEBP_UBSAN_IGNORE_UNDEF __attribute__((no_sanitize("undefined")))
+
+// This macro prevents the undefined behavior sanitizer from reporting
+// failures related to unsigned integer overflows. This is only meant to
+// silence cases where this well defined behavior is expected.
+#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
+#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \
+  __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+
+// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
+// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
+#if !defined(WEBP_OFFSET_PTR)
+#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
+#endif
+
+// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
+#if !defined(WEBP_SWAP_16BIT_CSP)
+#define WEBP_SWAP_16BIT_CSP 0
+#endif
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) &&                   \
+    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+typedef enum {
+  kSSE2,
+  kSSE3,
+  kSlowSSSE3,  // special feature for slow SSSE3 architectures
+  kSSE4_1,
+  kAVX,
+  kAVX2,
+  kNEON,
+  kMIPS32,
+  kMIPSdspR2,
+  kMSA
+} CPUFeature;
+
+// returns true if the CPU supports the feature.
+typedef int (*VP8CPUInfo)(CPUFeature feature);
+
+#endif  // WEBP_DSP_CPU_H_
diff --git a/3rdparty/libwebp/src/dsp/dec.c b/3rdparty/libwebp/src/dsp/dec.c
index 1119842dd3de..451d649d58e4 100644
--- a/3rdparty/libwebp/src/dsp/dec.c
+++ b/3rdparty/libwebp/src/dsp/dec.c
@@ -37,9 +37,6 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
   STORE(3, y, DC - (d));            \
 } while (0)
 
-#define MUL1(a) ((((a) * 20091) >> 16) + (a))
-#define MUL2(a) (((a) * 35468) >> 16)
-
 #if !WEBP_NEON_OMIT_C_CODE
 static void TransformOne_C(const int16_t* in, uint8_t* dst) {
   int C[4 * 4], *tmp;
@@ -48,8 +45,10 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
   for (i = 0; i < 4; ++i) {    // vertical pass
     const int a = in[0] + in[8];    // [-4096, 4094]
     const int b = in[0] - in[8];    // [-4095, 4095]
-    const int c = MUL2(in[4]) - MUL1(in[12]);   // [-3783, 3783]
-    const int d = MUL1(in[4]) + MUL2(in[12]);   // [-3785, 3781]
+    const int c = WEBP_TRANSFORM_AC3_MUL2(in[4]) -
+                  WEBP_TRANSFORM_AC3_MUL1(in[12]);  // [-3783, 3783]
+    const int d = WEBP_TRANSFORM_AC3_MUL1(in[4]) +
+                  WEBP_TRANSFORM_AC3_MUL2(in[12]);  // [-3785, 3781]
     tmp[0] = a + d;   // [-7881, 7875]
     tmp[1] = b + c;   // [-7878, 7878]
     tmp[2] = b - c;   // [-7878, 7878]
@@ -69,8 +68,10 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
     const int dc = tmp[0] + 4;
     const int a =  dc +  tmp[8];
     const int b =  dc -  tmp[8];
-    const int c = MUL2(tmp[4]) - MUL1(tmp[12]);
-    const int d = MUL1(tmp[4]) + MUL2(tmp[12]);
+    const int c =
+        WEBP_TRANSFORM_AC3_MUL2(tmp[4]) - WEBP_TRANSFORM_AC3_MUL1(tmp[12]);
+    const int d =
+        WEBP_TRANSFORM_AC3_MUL1(tmp[4]) + WEBP_TRANSFORM_AC3_MUL2(tmp[12]);
     STORE(0, 0, a + d);
     STORE(1, 0, b + c);
     STORE(2, 0, b - c);
@@ -83,17 +84,15 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
 // Simplified transform when only in[0], in[1] and in[4] are non-zero
 static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
   const int a = in[0] + 4;
-  const int c4 = MUL2(in[4]);
-  const int d4 = MUL1(in[4]);
-  const int c1 = MUL2(in[1]);
-  const int d1 = MUL1(in[1]);
+  const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
+  const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
+  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
+  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
   STORE2(0, a + d4, d1, c1);
   STORE2(1, a + c4, d1, c1);
   STORE2(2, a - c4, d1, c1);
   STORE2(3, a - d4, d1, c1);
 }
-#undef MUL1
-#undef MUL2
 #undef STORE2
 
 static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
@@ -734,6 +733,7 @@ VP8SimpleFilterFunc VP8SimpleHFilter16i;
 void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
                             int dst_stride);
 
+extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8DspInitSSE2(void);
 extern void VP8DspInitSSE41(void);
 extern void VP8DspInitNEON(void);
@@ -807,10 +807,10 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8DspInitSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
       if (VP8GetCPUInfo(kSSE4_1)) {
         VP8DspInitSSE41();
       }
@@ -834,7 +834,7 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8DspInitNEON();
diff --git a/3rdparty/libwebp/src/dsp/dec_mips32.c b/3rdparty/libwebp/src/dsp/dec_mips32.c
index e4e70966d24f..f0e7de4ac4fc 100644
--- a/3rdparty/libwebp/src/dsp/dec_mips32.c
+++ b/3rdparty/libwebp/src/dsp/dec_mips32.c
@@ -18,8 +18,8 @@
 
 #include "src/dsp/mips_macro.h"
 
-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
+static const int kC1 = WEBP_TRANSFORM_AC3_C1;
+static const int kC2 = WEBP_TRANSFORM_AC3_C2;
 
 static WEBP_INLINE int abs_mips32(int x) {
   const int sign = x >> 31;
@@ -219,7 +219,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
   int temp0, temp1, temp2, temp3, temp4;
   int temp5, temp6, temp7, temp8, temp9;
   int temp10, temp11, temp12, temp13, temp14;
-  int temp15, temp16, temp17, temp18;
+  int temp15, temp16, temp17, temp18, temp19;
   int16_t* p_in = (int16_t*)in;
 
   // loops unrolled and merged to avoid usage of tmp buffer
@@ -233,16 +233,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "addu     %[temp16], %[temp0],  %[temp8]           \n\t"
     "subu     %[temp0],  %[temp0],  %[temp8]           \n\t"
     "mul      %[temp8],  %[temp4],  %[kC2]             \n\t"
-    "mul      %[temp17], %[temp12], %[kC1]             \n\t"
-    "mul      %[temp4],  %[temp4],  %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp17, temp12)
+    MUL_SHIFT_C1_IO(temp4, temp19)
     "mul      %[temp12], %[temp12], %[kC2]             \n\t"
     "lh       %[temp1],  2(%[in])                      \n\t"
     "lh       %[temp5],  10(%[in])                     \n\t"
     "lh       %[temp9],  18(%[in])                     \n\t"
     "lh       %[temp13], 26(%[in])                     \n\t"
     "sra      %[temp8],  %[temp8],  16                 \n\t"
-    "sra      %[temp17], %[temp17], 16                 \n\t"
-    "sra      %[temp4],  %[temp4],  16                 \n\t"
     "sra      %[temp12], %[temp12], 16                 \n\t"
     "lh       %[temp2],  4(%[in])                      \n\t"
     "lh       %[temp6],  12(%[in])                     \n\t"
@@ -261,49 +259,43 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "addu     %[temp12], %[temp0],  %[temp17]          \n\t"
     "subu     %[temp0],  %[temp0],  %[temp17]          \n\t"
     "mul      %[temp9],  %[temp5],  %[kC2]             \n\t"
-    "mul      %[temp17], %[temp13], %[kC1]             \n\t"
-    "mul      %[temp5],  %[temp5],  %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp17, temp13)
+    MUL_SHIFT_C1_IO(temp5, temp19)
     "mul      %[temp13], %[temp13], %[kC2]             \n\t"
     "sra      %[temp9],  %[temp9],  16                 \n\t"
-    "sra      %[temp17], %[temp17], 16                 \n\t"
     "subu     %[temp17], %[temp9],  %[temp17]          \n\t"
-    "sra      %[temp5],  %[temp5],  16                 \n\t"
     "sra      %[temp13], %[temp13], 16                 \n\t"
     "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
     "addu     %[temp13], %[temp1],  %[temp17]          \n\t"
     "subu     %[temp1],  %[temp1],  %[temp17]          \n\t"
-    "mul      %[temp17], %[temp14], %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp17, temp14)
     "mul      %[temp14], %[temp14], %[kC2]             \n\t"
     "addu     %[temp9],  %[temp16], %[temp5]           \n\t"
     "subu     %[temp5],  %[temp16], %[temp5]           \n\t"
     "addu     %[temp16], %[temp2],  %[temp10]          \n\t"
     "subu     %[temp2],  %[temp2],  %[temp10]          \n\t"
     "mul      %[temp10], %[temp6],  %[kC2]             \n\t"
-    "mul      %[temp6],  %[temp6],  %[kC1]             \n\t"
-    "sra      %[temp17], %[temp17], 16                 \n\t"
+    MUL_SHIFT_C1_IO(temp6, temp19)
     "sra      %[temp14], %[temp14], 16                 \n\t"
     "sra      %[temp10], %[temp10], 16                 \n\t"
-    "sra      %[temp6],  %[temp6],  16                 \n\t"
     "subu     %[temp17], %[temp10], %[temp17]          \n\t"
     "addu     %[temp6],  %[temp6],  %[temp14]          \n\t"
     "addu     %[temp10], %[temp16], %[temp6]           \n\t"
     "subu     %[temp6],  %[temp16], %[temp6]           \n\t"
     "addu     %[temp14], %[temp2],  %[temp17]          \n\t"
     "subu     %[temp2],  %[temp2],  %[temp17]          \n\t"
-    "mul      %[temp17], %[temp15], %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp17, temp15)
     "mul      %[temp15], %[temp15], %[kC2]             \n\t"
     "addu     %[temp16], %[temp3],  %[temp11]          \n\t"
     "subu     %[temp3],  %[temp3],  %[temp11]          \n\t"
     "mul      %[temp11], %[temp7],  %[kC2]             \n\t"
-    "mul      %[temp7],  %[temp7],  %[kC1]             \n\t"
+    MUL_SHIFT_C1_IO(temp7, temp19)
     "addiu    %[temp8],  %[temp8],  4                  \n\t"
     "addiu    %[temp12], %[temp12], 4                  \n\t"
     "addiu    %[temp0],  %[temp0],  4                  \n\t"
     "addiu    %[temp4],  %[temp4],  4                  \n\t"
-    "sra      %[temp17], %[temp17], 16                 \n\t"
     "sra      %[temp15], %[temp15], 16                 \n\t"
     "sra      %[temp11], %[temp11], 16                 \n\t"
-    "sra      %[temp7],  %[temp7],  16                 \n\t"
     "subu     %[temp17], %[temp11], %[temp17]          \n\t"
     "addu     %[temp7],  %[temp7],  %[temp15]          \n\t"
     "addu     %[temp15], %[temp3],  %[temp17]          \n\t"
@@ -313,48 +305,40 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     "addu     %[temp16], %[temp8],  %[temp10]          \n\t"
     "subu     %[temp8],  %[temp8],  %[temp10]          \n\t"
     "mul      %[temp10], %[temp9],  %[kC2]             \n\t"
-    "mul      %[temp17], %[temp11], %[kC1]             \n\t"
-    "mul      %[temp9],  %[temp9],  %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp17, temp11)
+    MUL_SHIFT_C1_IO(temp9, temp19)
     "mul      %[temp11], %[temp11], %[kC2]             \n\t"
     "sra      %[temp10], %[temp10], 16                 \n\t"
-    "sra      %[temp17], %[temp17], 16                 \n\t"
-    "sra      %[temp9],  %[temp9],  16                 \n\t"
     "sra      %[temp11], %[temp11], 16                 \n\t"
     "subu     %[temp17], %[temp10], %[temp17]          \n\t"
     "addu     %[temp11], %[temp9],  %[temp11]          \n\t"
     "addu     %[temp10], %[temp12], %[temp14]          \n\t"
     "subu     %[temp12], %[temp12], %[temp14]          \n\t"
     "mul      %[temp14], %[temp13], %[kC2]             \n\t"
-    "mul      %[temp9],  %[temp15], %[kC1]             \n\t"
-    "mul      %[temp13], %[temp13], %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp9, temp15)
+    MUL_SHIFT_C1_IO(temp13, temp19)
     "mul      %[temp15], %[temp15], %[kC2]             \n\t"
     "sra      %[temp14], %[temp14], 16                 \n\t"
-    "sra      %[temp9],  %[temp9],  16                 \n\t"
-    "sra      %[temp13], %[temp13], 16                 \n\t"
     "sra      %[temp15], %[temp15], 16                 \n\t"
     "subu     %[temp9],  %[temp14], %[temp9]           \n\t"
     "addu     %[temp15], %[temp13], %[temp15]          \n\t"
     "addu     %[temp14], %[temp0],  %[temp2]           \n\t"
     "subu     %[temp0],  %[temp0],  %[temp2]           \n\t"
     "mul      %[temp2],  %[temp1],  %[kC2]             \n\t"
-    "mul      %[temp13], %[temp3],  %[kC1]             \n\t"
-    "mul      %[temp1],  %[temp1],  %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp13, temp3)
+    MUL_SHIFT_C1_IO(temp1, temp19)
     "mul      %[temp3],  %[temp3],  %[kC2]             \n\t"
     "sra      %[temp2],  %[temp2],  16                 \n\t"
-    "sra      %[temp13], %[temp13], 16                 \n\t"
-    "sra      %[temp1],  %[temp1],  16                 \n\t"
     "sra      %[temp3],  %[temp3],  16                 \n\t"
     "subu     %[temp13], %[temp2],  %[temp13]          \n\t"
     "addu     %[temp3],  %[temp1],  %[temp3]           \n\t"
     "addu     %[temp2],  %[temp4],  %[temp6]           \n\t"
     "subu     %[temp4],  %[temp4],  %[temp6]           \n\t"
     "mul      %[temp6],  %[temp5],  %[kC2]             \n\t"
-    "mul      %[temp1],  %[temp7],  %[kC1]             \n\t"
-    "mul      %[temp5],  %[temp5],  %[kC1]             \n\t"
+    MUL_SHIFT_C1(temp1, temp7)
+    MUL_SHIFT_C1_IO(temp5, temp19)
     "mul      %[temp7],  %[temp7],  %[kC2]             \n\t"
     "sra      %[temp6],  %[temp6],  16                 \n\t"
-    "sra      %[temp1],  %[temp1],  16                 \n\t"
-    "sra      %[temp5],  %[temp5],  16                 \n\t"
     "sra      %[temp7],  %[temp7],  16                 \n\t"
     "subu     %[temp1],  %[temp6],  %[temp1]           \n\t"
     "addu     %[temp7],  %[temp5],  %[temp7]           \n\t"
@@ -542,7 +526,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
       [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
       [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
       [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
-      [temp18]"=&r"(temp18)
+      [temp18]"=&r"(temp18), [temp19]"=&r"(temp19)
     : [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
     : "memory", "hi", "lo"
   );
diff --git a/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c b/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
index b0936bc46e97..0ba706a2ef82 100644
--- a/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
+++ b/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
@@ -18,10 +18,8 @@
 
 #include "src/dsp/mips_macro.h"
 
-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
-
-#define MUL(a, b) (((a) * (b)) >> 16)
+static const int kC1 = WEBP_TRANSFORM_AC3_C1;
+static const int kC2 = WEBP_TRANSFORM_AC3_C2;
 
 static void TransformDC(const int16_t* in, uint8_t* dst) {
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
@@ -49,10 +47,10 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
 
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
   const int a = in[0] + 4;
-  int c4 = MUL(in[4], kC2);
-  const int d4 = MUL(in[4], kC1);
-  const int c1 = MUL(in[1], kC2);
-  const int d1 = MUL(in[1], kC1);
+  int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
+  const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
+  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
+  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
 
@@ -479,8 +477,6 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
   FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 
-#undef MUL
-
 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 
diff --git a/3rdparty/libwebp/src/dsp/dec_msa.c b/3rdparty/libwebp/src/dsp/dec_msa.c
index 8090622b7b86..58d173019218 100644
--- a/3rdparty/libwebp/src/dsp/dec_msa.c
+++ b/3rdparty/libwebp/src/dsp/dec_msa.c
@@ -37,8 +37,6 @@
   d1_m = d_tmp1_m + d_tmp2_m;                                    \
   BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
 }
-#define MULT1(a) ((((a) * 20091) >> 16) + (a))
-#define MULT2(a) (((a) * 35468) >> 16)
 
 static void TransformOne(const int16_t* in, uint8_t* dst) {
   v8i16 input0, input1;
@@ -124,10 +122,10 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
 
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
   const int a = in[0] + 4;
-  const int c4 = MULT2(in[4]);
-  const int d4 = MULT1(in[4]);
-  const int in2 = MULT2(in[1]);
-  const int in3 = MULT1(in[1]);
+  const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
+  const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
+  const int in2 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
+  const int in3 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
   v4i32 tmp0 = { 0 };
   v4i32 out0 = __msa_fill_w(a + d4);
   v4i32 out1 = __msa_fill_w(a + c4);
diff --git a/3rdparty/libwebp/src/dsp/dec_neon.c b/3rdparty/libwebp/src/dsp/dec_neon.c
index fa851707e265..83b3a1f970a3 100644
--- a/3rdparty/libwebp/src/dsp/dec_neon.c
+++ b/3rdparty/libwebp/src/dsp/dec_neon.c
@@ -1000,8 +1000,9 @@ static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
 // same issue with kC1 and vqdmulh that we work around by down shifting kC2
 
-static const int16_t kC1 = 20091;
-static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
+static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
+static const int16_t kC2 =
+    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.
 
 #if defined(WEBP_USE_INTRINSICS)
 static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
@@ -1255,15 +1256,12 @@ static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
 
 //------------------------------------------------------------------------------
 
-#define MUL(a, b) (((a) * (b)) >> 16)
 static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
-  static const int kC1_full = 20091 + (1 << 16);
-  static const int kC2_full = 35468;
   const int16x4_t A = vld1_dup_s16(in);
-  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
-  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
-  const int c1 = MUL(in[1], kC2_full);
-  const int d1 = MUL(in[1], kC1_full);
+  const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
+  const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
+  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
+  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
   const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
                       (uint64_t)( c1 & 0xffff) << 16 |
                       (uint64_t)(-c1 & 0xffff) << 32 |
@@ -1274,7 +1272,6 @@ static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
   const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
   Add4x4_NEON(m0_m1, m2_m3, dst);
 }
-#undef MUL
 
 //------------------------------------------------------------------------------
 // 4x4
@@ -1428,7 +1425,7 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
 
   if (do_top) {
     const uint8x8_t A = vld1_u8(dst - BPS);  // top row
-#if defined(__aarch64__)
+#if WEBP_AARCH64
     const uint16_t p2 = vaddlv_u8(A);
     sum_top = vdupq_n_u16(p2);
 #else
@@ -1511,7 +1508,7 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
 
   if (do_top) {
     const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
-#if defined(__aarch64__)
+#if WEBP_AARCH64
     const uint16_t p3 = vaddlvq_u8(A);
     sum_top = vdupq_n_u16(p3);
 #else
diff --git a/3rdparty/libwebp/src/dsp/dec_sse2.c b/3rdparty/libwebp/src/dsp/dec_sse2.c
index 873aa59e8a97..ff3a28555b45 100644
--- a/3rdparty/libwebp/src/dsp/dec_sse2.c
+++ b/3rdparty/libwebp/src/dsp/dec_sse2.c
@@ -158,10 +158,10 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
       dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
     } else {
       // Load four bytes/pixels per line.
-      dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
-      dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
-      dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
-      dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
+      dst0 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 0 * BPS));
+      dst1 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 1 * BPS));
+      dst2 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 2 * BPS));
+      dst3 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 3 * BPS));
     }
     // Convert to 16b.
     dst0 = _mm_unpacklo_epi8(dst0, zero);
@@ -187,24 +187,22 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
       _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
     } else {
       // Store four bytes/pixels per line.
-      WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
-      WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
-      WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
-      WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
+      WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
+      WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
+      WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
+      WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
     }
   }
 }
 
 #if (USE_TRANSFORM_AC3 == 1)
-#define MUL(a, b) (((a) * (b)) >> 16)
+
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
-  static const int kC1 = 20091 + (1 << 16);
-  static const int kC2 = 35468;
   const __m128i A = _mm_set1_epi16(in[0] + 4);
-  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
-  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
-  const int c1 = MUL(in[1], kC2);
-  const int d1 = MUL(in[1], kC1);
+  const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
+  const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
+  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
+  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
   const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
   const __m128i B = _mm_adds_epi16(A, CD);
   const __m128i m0 = _mm_adds_epi16(B, d4);
@@ -213,10 +211,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   const __m128i m3 = _mm_subs_epi16(B, d4);
   const __m128i zero = _mm_setzero_si128();
   // Load the source pixels.
-  __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
-  __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
-  __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
-  __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
+  __m128i dst0 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 0 * BPS));
+  __m128i dst1 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 1 * BPS));
+  __m128i dst2 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 2 * BPS));
+  __m128i dst3 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 3 * BPS));
   // Convert to 16b.
   dst0 = _mm_unpacklo_epi8(dst0, zero);
   dst1 = _mm_unpacklo_epi8(dst1, zero);
@@ -233,12 +231,12 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   dst2 = _mm_packus_epi16(dst2, dst2);
   dst3 = _mm_packus_epi16(dst3, dst3);
   // Store the results.
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
 }
-#undef MUL
+
 #endif   // USE_TRANSFORM_AC3
 
 //------------------------------------------------------------------------------
@@ -259,15 +257,15 @@ static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) {
   *x = _mm_packs_epi16(lo_1, hi_1);
 }
 
-#define FLIP_SIGN_BIT2(a, b) {                                                 \
+#define FLIP_SIGN_BIT2(a, b) do {                                              \
   (a) = _mm_xor_si128(a, sign_bit);                                            \
   (b) = _mm_xor_si128(b, sign_bit);                                            \
-}
+} while (0)
 
-#define FLIP_SIGN_BIT4(a, b, c, d) {                                           \
+#define FLIP_SIGN_BIT4(a, b, c, d) do {                                        \
   FLIP_SIGN_BIT2(a, b);                                                        \
   FLIP_SIGN_BIT2(c, d);                                                        \
-}
+} while (0)
 
 // input/output is uint8_t
 static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1,
@@ -477,11 +475,11 @@ static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride,
   // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
   // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
   const __m128i A0 = _mm_set_epi32(
-      WebPMemToUint32(&b[6 * stride]), WebPMemToUint32(&b[2 * stride]),
-      WebPMemToUint32(&b[4 * stride]), WebPMemToUint32(&b[0 * stride]));
+      WebPMemToInt32(&b[6 * stride]), WebPMemToInt32(&b[2 * stride]),
+      WebPMemToInt32(&b[4 * stride]), WebPMemToInt32(&b[0 * stride]));
   const __m128i A1 = _mm_set_epi32(
-      WebPMemToUint32(&b[7 * stride]), WebPMemToUint32(&b[3 * stride]),
-      WebPMemToUint32(&b[5 * stride]), WebPMemToUint32(&b[1 * stride]));
+      WebPMemToInt32(&b[7 * stride]), WebPMemToInt32(&b[3 * stride]),
+      WebPMemToInt32(&b[5 * stride]), WebPMemToInt32(&b[1 * stride]));
 
   // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
   // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
@@ -540,7 +538,7 @@ static WEBP_INLINE void Store4x4_SSE2(__m128i* const x,
                                       uint8_t* dst, int stride) {
   int i;
   for (i = 0; i < 4; ++i, dst += stride) {
-    WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
+    WebPInt32ToMem(dst, _mm_cvtsi128_si32(*x));
     *x = _mm_srli_si128(*x, 4);
   }
 }
@@ -645,12 +643,12 @@ static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
   (m) = _mm_max_epu8(m, MM_ABS(p2, p1));                                       \
 } while (0)
 
-#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
+#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) do {                          \
   (e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]);                        \
   (e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]);                        \
   (e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]);                        \
   (e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]);                        \
-}
+} while (0)
 
 #define LOADUV_H_EDGE(p, u, v, stride) do {                                    \
   const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                 \
@@ -658,18 +656,18 @@ static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
   (p) = _mm_unpacklo_epi64(U, V);                                              \
 } while (0)
 
-#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
+#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) do {                     \
   LOADUV_H_EDGE(e1, u, v, 0 * (stride));                                       \
   LOADUV_H_EDGE(e2, u, v, 1 * (stride));                                       \
   LOADUV_H_EDGE(e3, u, v, 2 * (stride));                                       \
   LOADUV_H_EDGE(e4, u, v, 3 * (stride));                                       \
-}
+} while (0)
 
-#define STOREUV(p, u, v, stride) {                                             \
+#define STOREUV(p, u, v, stride) do {                                          \
   _mm_storel_epi64((__m128i*)&(u)[(stride)], p);                               \
   (p) = _mm_srli_si128(p, 8);                                                  \
   _mm_storel_epi64((__m128i*)&(v)[(stride)], p);                               \
-}
+} while (0)
 
 static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1,
                                          const __m128i* const p0,
@@ -908,10 +906,10 @@ static void VE4_SSE2(uint8_t* dst) {    // vertical
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
   const __m128i b = _mm_subs_epu8(a, lsb);
   const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
-  const uint32_t vals = _mm_cvtsi128_si32(avg);
+  const int vals = _mm_cvtsi128_si32(avg);
   int i;
   for (i = 0; i < 4; ++i) {
-    WebPUint32ToMem(dst + i * BPS, vals);
+    WebPInt32ToMem(dst + i * BPS, vals);
   }
 }
 
@@ -925,10 +923,10 @@ static void LD4_SSE2(uint8_t* dst) {   // Down-Left
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
   const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
 static void VR4_SSE2(uint8_t* dst) {   // Vertical-Right
@@ -946,10 +944,10 @@ static void VR4_SSE2(uint8_t* dst) {   // Vertical-Right
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
   const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
 
   // these two are hard to implement in SSE2, so we keep the C-version:
   DST(0, 2) = AVG3(J, I, X);
@@ -970,11 +968,12 @@ static void VL4_SSE2(uint8_t* dst) {   // Vertical-Left
   const __m128i abbc = _mm_or_si128(ab, bc);
   const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
   const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
-  const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
+  const uint32_t extra_out =
+      (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
 
   // these two are hard to get and irregular
   DST(3, 2) = (extra_out >> 0) & 0xff;
@@ -990,7 +989,7 @@ static void RD4_SSE2(uint8_t* dst) {   // Down-right
   const uint32_t K = dst[-1 + 2 * BPS];
   const uint32_t L = dst[-1 + 3 * BPS];
   const __m128i LKJI_____ =
-      _mm_cvtsi32_si128(L | (K << 8) | (J << 16) | (I << 24));
+      _mm_cvtsi32_si128((int)(L | (K << 8) | (J << 16) | (I << 24)));
   const __m128i LKJIXABCD = _mm_or_si128(LKJI_____, ____XABCD);
   const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
   const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
@@ -998,10 +997,10 @@ static void RD4_SSE2(uint8_t* dst) {   // Down-right
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
   const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
 #undef DST
@@ -1015,13 +1014,13 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) {
   const __m128i zero = _mm_setzero_si128();
   int y;
   if (size == 4) {
-    const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
+    const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top));
     const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
     for (y = 0; y < 4; ++y, dst += BPS) {
       const int val = dst[-1] - top[-1];
       const __m128i base = _mm_set1_epi16(val);
       const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
-      WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
+      WebPInt32ToMem(dst, _mm_cvtsi128_si32(out));
     }
   } else if (size == 8) {
     const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
@@ -1062,7 +1061,7 @@ static void VE16_SSE2(uint8_t* dst) {
 static void HE16_SSE2(uint8_t* dst) {     // horizontal
   int j;
   for (j = 16; j > 0; --j) {
-    const __m128i values = _mm_set1_epi8(dst[-1]);
+    const __m128i values = _mm_set1_epi8((char)dst[-1]);
     _mm_storeu_si128((__m128i*)dst, values);
     dst += BPS;
   }
@@ -1070,7 +1069,7 @@ static void HE16_SSE2(uint8_t* dst) {     // horizontal
 
 static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
   int j;
-  const __m128i values = _mm_set1_epi8(v);
+  const __m128i values = _mm_set1_epi8((char)v);
   for (j = 0; j < 16; ++j) {
     _mm_storeu_si128((__m128i*)(dst + j * BPS), values);
   }
@@ -1130,7 +1129,7 @@ static void VE8uv_SSE2(uint8_t* dst) {    // vertical
 // helper for chroma-DC predictions
 static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
   int j;
-  const __m128i values = _mm_set1_epi8(v);
+  const __m128i values = _mm_set1_epi8((char)v);
   for (j = 0; j < 8; ++j) {
     _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
   }
diff --git a/3rdparty/libwebp/src/dsp/dec_sse41.c b/3rdparty/libwebp/src/dsp/dec_sse41.c
index 8f18506d54bf..08a363027226 100644
--- a/3rdparty/libwebp/src/dsp/dec_sse41.c
+++ b/3rdparty/libwebp/src/dsp/dec_sse41.c
@@ -23,7 +23,7 @@ static void HE16_SSE41(uint8_t* dst) {     // horizontal
   int j;
   const __m128i kShuffle3 = _mm_set1_epi8(3);
   for (j = 16; j > 0; --j) {
-    const __m128i in = _mm_cvtsi32_si128(WebPMemToUint32(dst - 4));
+    const __m128i in = _mm_cvtsi32_si128(WebPMemToInt32(dst - 4));
     const __m128i values = _mm_shuffle_epi8(in, kShuffle3);
     _mm_storeu_si128((__m128i*)dst, values);
     dst += BPS;
diff --git a/3rdparty/libwebp/src/dsp/dsp.h b/3rdparty/libwebp/src/dsp/dsp.h
index 298c721ae2d1..23bc29651452 100644
--- a/3rdparty/libwebp/src/dsp/dsp.h
+++ b/3rdparty/libwebp/src/dsp/dsp.h
@@ -18,6 +18,7 @@
 #include "src/webp/config.h"
 #endif
 
+#include "src/dsp/cpu.h"
 #include "src/webp/types.h"
 
 #ifdef __cplusplus
@@ -27,205 +28,22 @@ extern "C" {
 #define BPS 32   // this is the common stride for enc/dec
 
 //------------------------------------------------------------------------------
-// CPU detection
-
+// WEBP_RESTRICT
+
+// Declares a pointer with the restrict type qualifier if available.
+// This allows code to hint to the compiler that only this pointer references a
+// particular object or memory region within the scope of the block in which it
+// is declared. This may allow for improved optimizations due to the lack of
+// pointer aliasing. See also:
+// https://en.cppreference.com/w/c/language/restrict
 #if defined(__GNUC__)
-# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
-# define LOCAL_GCC_PREREQ(maj, min) \
-    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_GCC_VERSION 0
-# define LOCAL_GCC_PREREQ(maj, min) 0
-#endif
-
-#if defined(__clang__)
-# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
-# define LOCAL_CLANG_PREREQ(maj, min) \
-    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_CLANG_VERSION 0
-# define LOCAL_CLANG_PREREQ(maj, min) 0
-#endif
-
-#ifndef __has_builtin
-# define __has_builtin(x) 0
-#endif
-
-#if !defined(HAVE_CONFIG_H)
-#if defined(_MSC_VER) && _MSC_VER > 1310 && \
-    (defined(_M_X64) || defined(_M_IX86))
-#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
-    (defined(_M_X64) || defined(_M_IX86))
-#define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
-#endif
-#endif
-
-// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
-// files without intrinsics, allowing the corresponding Init() to be called.
-// Files containing intrinsics will need to be built targeting the instruction
-// set so should succeed on one of the earlier tests.
-#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
-#define WEBP_USE_SSE2
-#endif
-
-#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
-#define WEBP_USE_SSE41
-#endif
-
-#undef WEBP_MSC_SSE41
-#undef WEBP_MSC_SSE2
-
-// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
-// inline assembly would need to be modified for use with Native Client.
-#if (defined(__ARM_NEON__) || \
-     defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
-    !defined(__native_client__)
-#define WEBP_USE_NEON
-#endif
-
-#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
-    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
-#define WEBP_ANDROID_NEON  // Android targets that may have NEON
-#define WEBP_USE_NEON
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
-#define WEBP_USE_NEON
-#define WEBP_USE_INTRINSICS
-#endif
-
-#if defined(__mips__) && !defined(__mips64) && \
-    defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
-#define WEBP_USE_MIPS32
-#if (__mips_isa_rev >= 2)
-#define WEBP_USE_MIPS32_R2
-#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
-#define WEBP_USE_MIPS_DSP_R2
-#endif
-#endif
-#endif
-
-#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
-#define WEBP_USE_MSA
-#endif
-
-#ifndef WEBP_DSP_OMIT_C_CODE
-#define WEBP_DSP_OMIT_C_CODE 1
-#endif
-
-#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
-#define WEBP_NEON_OMIT_C_CODE 1
-#else
-#define WEBP_NEON_OMIT_C_CODE 0
-#endif
-
-#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
-#define WEBP_NEON_WORK_AROUND_GCC 1
+#define WEBP_RESTRICT __restrict__
+#elif defined(_MSC_VER)
+#define WEBP_RESTRICT __restrict
 #else
-#define WEBP_NEON_WORK_AROUND_GCC 0
+#define WEBP_RESTRICT
 #endif
 
-// This macro prevents thread_sanitizer from reporting known concurrent writes.
-#define WEBP_TSAN_IGNORE_FUNCTION
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#undef WEBP_TSAN_IGNORE_FUNCTION
-#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
-#endif
-#endif
-
-#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
-#include <pthread.h>  // NOLINT
-
-#define WEBP_DSP_INIT(func) do {                                    \
-  static volatile VP8CPUInfo func ## _last_cpuinfo_used =           \
-      (VP8CPUInfo)&func ## _last_cpuinfo_used;                      \
-  static pthread_mutex_t func ## _lock = PTHREAD_MUTEX_INITIALIZER; \
-  if (pthread_mutex_lock(&func ## _lock)) break;                    \
-  if (func ## _last_cpuinfo_used != VP8GetCPUInfo) func();          \
-  func ## _last_cpuinfo_used = VP8GetCPUInfo;                       \
-  (void)pthread_mutex_unlock(&func ## _lock);                       \
-} while (0)
-#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
-#define WEBP_DSP_INIT(func) do {                                    \
-  static volatile VP8CPUInfo func ## _last_cpuinfo_used =           \
-      (VP8CPUInfo)&func ## _last_cpuinfo_used;                      \
-  if (func ## _last_cpuinfo_used == VP8GetCPUInfo) break;           \
-  func();                                                           \
-  func ## _last_cpuinfo_used = VP8GetCPUInfo;                       \
-} while (0)
-#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
-
-// Defines an Init + helper function that control multiple initialization of
-// function pointers / tables.
-/* Usage:
-   WEBP_DSP_INIT_FUNC(InitFunc) {
-     ...function body
-   }
-*/
-#define WEBP_DSP_INIT_FUNC(name)                             \
-  static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void); \
-  WEBP_TSAN_IGNORE_FUNCTION void name(void) {                \
-    WEBP_DSP_INIT(name ## _body);                            \
-  }                                                          \
-  static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void)
-
-#define WEBP_UBSAN_IGNORE_UNDEF
-#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
-#if defined(__clang__) && defined(__has_attribute)
-#if __has_attribute(no_sanitize)
-// This macro prevents the undefined behavior sanitizer from reporting
-// failures. This is only meant to silence unaligned loads on platforms that
-// are known to support them.
-#undef WEBP_UBSAN_IGNORE_UNDEF
-#define WEBP_UBSAN_IGNORE_UNDEF \
-  __attribute__((no_sanitize("undefined")))
-
-// This macro prevents the undefined behavior sanitizer from reporting
-// failures related to unsigned integer overflows. This is only meant to
-// silence cases where this well defined behavior is expected.
-#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
-#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \
-  __attribute__((no_sanitize("unsigned-integer-overflow")))
-#endif
-#endif
-
-// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
-// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
-#if !defined(WEBP_OFFSET_PTR)
-#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
-#endif
-
-// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
-#if !defined(WEBP_SWAP_16BIT_CSP)
-#define WEBP_SWAP_16BIT_CSP 0
-#endif
-
-// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
-#if !defined(WORDS_BIGENDIAN) && \
-    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
-     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
-#define WORDS_BIGENDIAN
-#endif
-
-typedef enum {
-  kSSE2,
-  kSSE3,
-  kSlowSSSE3,  // special feature for slow SSSE3 architectures
-  kSSE4_1,
-  kAVX,
-  kAVX2,
-  kNEON,
-  kMIPS32,
-  kMIPSdspR2,
-  kMSA
-} CPUFeature;
-// returns true if the CPU supports the feature.
-typedef int (*VP8CPUInfo)(CPUFeature feature);
-WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
 
 //------------------------------------------------------------------------------
 // Init stub generator
@@ -385,6 +203,11 @@ extern VP8DecIdct VP8TransformDC;
 extern VP8DecIdct VP8TransformDCUV;
 extern VP8WHT VP8TransformWHT;
 
+#define WEBP_TRANSFORM_AC3_C1 20091
+#define WEBP_TRANSFORM_AC3_C2 35468
+#define WEBP_TRANSFORM_AC3_MUL1(a) ((((a) * WEBP_TRANSFORM_AC3_C1) >> 16) + (a))
+#define WEBP_TRANSFORM_AC3_MUL2(a) (((a) * WEBP_TRANSFORM_AC3_C2) >> 16)
+
 // *dst is the destination block, with stride BPS. Boundary samples are
 // assumed accessible when needed.
 typedef void (*VP8PredFunc)(uint8_t* dst);
@@ -514,15 +337,6 @@ extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
 extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
                                     uint8_t* u, uint8_t* v, int width);
 
-// utilities for accurate RGB->YUV conversion
-extern uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref,
-                                       uint16_t* dst, int len);
-extern void (*WebPSharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref,
-                                     int16_t* dst, int len);
-extern void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B,
-                                     int len,
-                                     const uint16_t* best_y, uint16_t* out);
-
 // Must be called before using the above.
 void WebPInitConvertARGBToYUV(void);
 
@@ -578,26 +392,29 @@ extern void (*WebPApplyAlphaMultiply4444)(
 
 // Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
 // Returns true if alpha[] plane has non-trivial values different from 0xff.
-extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
-                                int width, int height,
-                                uint8_t* dst, int dst_stride);
+extern int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT alpha,
+                                int alpha_stride, int width, int height,
+                                uint8_t* WEBP_RESTRICT dst, int dst_stride);
 
 // Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
 // A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
-extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
-                                        int width, int height,
-                                        uint32_t* dst, int dst_stride);
+extern void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT alpha,
+                                        int alpha_stride, int width, int height,
+                                        uint32_t* WEBP_RESTRICT dst,
+                                        int dst_stride);
 
 // Extract the alpha values from 32b values in argb[] and pack them into alpha[]
 // (this is the opposite of WebPDispatchAlpha).
 // Returns true if there's only trivial 0xff alpha values.
-extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
-                               int width, int height,
-                               uint8_t* alpha, int alpha_stride);
+extern int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT argb,
+                               int argb_stride, int width, int height,
+                               uint8_t* WEBP_RESTRICT alpha,
+                               int alpha_stride);
 
 // Extract the green values from 32b values in argb[] and pack them into alpha[]
 // (this is the opposite of WebPDispatchAlphaToGreen).
-extern void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+extern void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
+                                uint8_t* WEBP_RESTRICT alpha, int size);
 
 // Pre-Multiply operation transforms x into x * A / 255  (where x=Y,R,G or B).
 // Un-Multiply operation transforms x into x * 255 / A.
@@ -610,29 +427,35 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
                       int inverse);
 
 // Same for a row of single values, with side alpha values.
-extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+extern void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
+                           const uint8_t* WEBP_RESTRICT const alpha,
                            int width, int inverse);
 
 // Same a WebPMultRow(), but for several 'num_rows' rows.
-void WebPMultRows(uint8_t* ptr, int stride,
-                  const uint8_t* alpha, int alpha_stride,
+void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
+                  const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
                   int width, int num_rows, int inverse);
 
 // Plain-C versions, used as fallback by some implementations.
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
+                   const uint8_t* WEBP_RESTRICT const alpha,
                    int width, int inverse);
 void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
 
 #ifdef WORDS_BIGENDIAN
 // ARGB packing function: a/r/g/b input is rgba or bgra order.
-extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
-                            const uint8_t* g, const uint8_t* b, int len,
-                            uint32_t* out);
+extern void (*WebPPackARGB)(const uint8_t* WEBP_RESTRICT a,
+                            const uint8_t* WEBP_RESTRICT r,
+                            const uint8_t* WEBP_RESTRICT g,
+                            const uint8_t* WEBP_RESTRICT b,
+                            int len, uint32_t* WEBP_RESTRICT out);
 #endif
 
 // RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
-extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                           int len, int step, uint32_t* out);
+extern void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
+                           const uint8_t* WEBP_RESTRICT g,
+                           const uint8_t* WEBP_RESTRICT b,
+                           int len, int step, uint32_t* WEBP_RESTRICT out);
 
 // This function returns true if src[i] contains a value different from 0xff.
 extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
diff --git a/3rdparty/libwebp/src/dsp/enc.c b/3rdparty/libwebp/src/dsp/enc.c
index 2fddbc4c5247..395ad05b0bcb 100644
--- a/3rdparty/libwebp/src/dsp/enc.c
+++ b/3rdparty/libwebp/src/dsp/enc.c
@@ -109,10 +109,6 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
 #define STORE(x, y, v) \
   dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
 
-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
-#define MUL(a, b) (((a) * (b)) >> 16)
-
 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
                                       uint8_t* dst) {
   int C[4 * 4], *tmp;
@@ -121,8 +117,10 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   for (i = 0; i < 4; ++i) {    // vertical pass
     const int a = in[0] + in[8];
     const int b = in[0] - in[8];
-    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
-    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
+    const int c =
+        WEBP_TRANSFORM_AC3_MUL2(in[4]) - WEBP_TRANSFORM_AC3_MUL1(in[12]);
+    const int d =
+        WEBP_TRANSFORM_AC3_MUL1(in[4]) + WEBP_TRANSFORM_AC3_MUL2(in[12]);
     tmp[0] = a + d;
     tmp[1] = b + c;
     tmp[2] = b - c;
@@ -134,10 +132,12 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   tmp = C;
   for (i = 0; i < 4; ++i) {    // horizontal pass
     const int dc = tmp[0] + 4;
-    const int a =  dc +  tmp[8];
-    const int b =  dc -  tmp[8];
-    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
-    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+    const int a = dc + tmp[8];
+    const int b = dc - tmp[8];
+    const int c =
+        WEBP_TRANSFORM_AC3_MUL2(tmp[4]) - WEBP_TRANSFORM_AC3_MUL1(tmp[12]);
+    const int d =
+        WEBP_TRANSFORM_AC3_MUL1(tmp[4]) + WEBP_TRANSFORM_AC3_MUL2(tmp[12]);
     STORE(0, i, a + d);
     STORE(1, i, b + c);
     STORE(2, i, b - c);
@@ -222,7 +222,6 @@ static void FTransformWHT_C(const int16_t* in, int16_t* out) {
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 
-#undef MUL
 #undef STORE
 
 //------------------------------------------------------------------------------
@@ -732,6 +731,7 @@ VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 VP8BlockCopy VP8Copy4x4;
 VP8BlockCopy VP8Copy16x8;
 
+extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8EncDspInitSSE2(void);
 extern void VP8EncDspInitSSE41(void);
 extern void VP8EncDspInitNEON(void);
@@ -773,10 +773,10 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8EncDspInitSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
       if (VP8GetCPUInfo(kSSE4_1)) {
         VP8EncDspInitSSE41();
       }
@@ -800,7 +800,7 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8EncDspInitNEON();
diff --git a/3rdparty/libwebp/src/dsp/enc_mips32.c b/3rdparty/libwebp/src/dsp/enc_mips32.c
index 618f0fc0ee8b..50518a5f1aed 100644
--- a/3rdparty/libwebp/src/dsp/enc_mips32.c
+++ b/3rdparty/libwebp/src/dsp/enc_mips32.c
@@ -21,8 +21,8 @@
 #include "src/enc/vp8i_enc.h"
 #include "src/enc/cost_enc.h"
 
-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
+static const int kC1 = WEBP_TRANSFORM_AC3_C1;
+static const int kC2 = WEBP_TRANSFORM_AC3_C2;
 
 // macro for one vertical pass in ITransformOne
 // MUL macro inlined
@@ -30,7 +30,7 @@ static const int kC2 = 35468;
 // A..D - offsets in bytes to load from in buffer
 // TEMP0..TEMP3 - registers for corresponding tmp elements
 // TEMP4..TEMP5 - temporary registers
-#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3)        \
+#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \
   "lh      %[temp16],      " #A "(%[temp20])                 \n\t"          \
   "lh      %[temp18],      " #B "(%[temp20])                 \n\t"          \
   "lh      %[temp17],      " #C "(%[temp20])                 \n\t"          \
@@ -38,12 +38,10 @@ static const int kC2 = 35468;
   "addu    %[" #TEMP4 "],    %[temp16],      %[temp18]       \n\t"          \
   "subu    %[temp16],      %[temp16],      %[temp18]         \n\t"          \
   "mul     %[" #TEMP0 "],    %[temp17],      %[kC2]          \n\t"          \
-  "mul     %[temp18],      %[temp19],      %[kC1]            \n\t"          \
-  "mul     %[temp17],      %[temp17],      %[kC1]            \n\t"          \
+  MUL_SHIFT_C1_IO(temp17, temp18)                                           \
+  MUL_SHIFT_C1(temp18, temp19)                                              \
   "mul     %[temp19],      %[temp19],      %[kC2]            \n\t"          \
   "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\n"          \
-  "sra     %[temp18],      %[temp18],      16                \n\n"          \
-  "sra     %[temp17],      %[temp17],      16                \n\n"          \
   "sra     %[temp19],      %[temp19],      16                \n\n"          \
   "subu    %[" #TEMP2 "],    %[" #TEMP0 "],    %[temp18]     \n\t"          \
   "addu    %[" #TEMP3 "],    %[temp17],      %[temp19]       \n\t"          \
@@ -58,17 +56,15 @@ static const int kC2 = 35468;
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A - offset in bytes to load from ref and store to dst buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12)                       \
+#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
   "addiu   %[" #TEMP0 "],    %[" #TEMP0 "],    4               \n\t"          \
   "addu    %[temp16],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
   "subu    %[temp17],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
   "mul     %[" #TEMP0 "],    %[" #TEMP4 "],    %[kC2]          \n\t"          \
-  "mul     %[" #TEMP8 "],    %[" #TEMP12 "],   %[kC1]          \n\t"          \
-  "mul     %[" #TEMP4 "],    %[" #TEMP4 "],    %[kC1]          \n\t"          \
+  MUL_SHIFT_C1_IO(TEMP4, TEMP8)                                               \
+  MUL_SHIFT_C1(TEMP8, TEMP12)                                                 \
   "mul     %[" #TEMP12 "],   %[" #TEMP12 "],   %[kC2]          \n\t"          \
   "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16              \n\t"          \
-  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    16              \n\t"          \
-  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    16              \n\t"          \
   "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   16              \n\t"          \
   "subu    %[temp18],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
   "addu    %[temp19],      %[" #TEMP4 "],    %[" #TEMP12 "]    \n\t"          \
diff --git a/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c b/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
index 9ddd895086ff..e1431f3bef2c 100644
--- a/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
+++ b/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
@@ -20,8 +20,8 @@
 #include "src/enc/cost_enc.h"
 #include "src/enc/vp8i_enc.h"
 
-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
+static const int kC1 = WEBP_TRANSFORM_AC3_C1;
+static const int kC2 = WEBP_TRANSFORM_AC3_C2;
 
 // O - output
 // I - input (macro doesn't change it)
diff --git a/3rdparty/libwebp/src/dsp/enc_neon.c b/3rdparty/libwebp/src/dsp/enc_neon.c
index 43bf1245c536..6f641c9a7618 100644
--- a/3rdparty/libwebp/src/dsp/enc_neon.c
+++ b/3rdparty/libwebp/src/dsp/enc_neon.c
@@ -9,7 +9,7 @@
 //
 // ARM NEON version of speed-critical encoding functions.
 //
-// adapted from libvpx (http://www.webmproject.org/code/)
+// adapted from libvpx (https://www.webmproject.org/code/)
 
 #include "src/dsp/dsp.h"
 
@@ -27,8 +27,9 @@
 // This code is pretty much the same as TransformOne in the dec_neon.c, except
 // for subtraction to *ref. See the comments there for algorithmic explanations.
 
-static const int16_t kC1 = 20091;
-static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
+static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
+static const int16_t kC2 =
+    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.
 
 // This code works but is *slower* than the inlined-asm version below
 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
@@ -764,9 +765,14 @@ static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
 
 // Horizontal sum of all four uint32_t values in 'sum'.
 static int SumToInt_NEON(uint32x4_t sum) {
+#if WEBP_AARCH64
+  return (int)vaddvq_u32(sum);
+#else
   const uint64x2_t sum2 = vpaddlq_u32(sum);
-  const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
-  return (int)sum3;
+  const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
+                                   vreinterpret_u32_u64(vget_high_u64(sum2)));
+  return (int)vget_lane_u32(sum3, 0);
+#endif
 }
 
 static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
@@ -860,7 +866,7 @@ static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
   uint8x8x4_t shuffles;
   // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
   // non-standard versions there.
-#if defined(__APPLE__) && defined(__aarch64__) && \
+#if defined(__APPLE__) && WEBP_AARCH64 && \
     defined(__apple_build_version__) && (__apple_build_version__< 6020037)
   uint8x16x2_t all_out;
   INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
diff --git a/3rdparty/libwebp/src/dsp/enc_sse2.c b/3rdparty/libwebp/src/dsp/enc_sse2.c
index b2e78ed9411f..010624a2f712 100644
--- a/3rdparty/libwebp/src/dsp/enc_sse2.c
+++ b/3rdparty/libwebp/src/dsp/enc_sse2.c
@@ -25,9 +25,160 @@
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
-// Does one or two inverse transforms.
-static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                            int do_two) {
+// Does one inverse transform.
+static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in,
+                                uint8_t* dst) {
+  // This implementation makes use of 16-bit fixed point versions of two
+  // multiply constants:
+  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
+  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
+  //
+  // To be able to use signed 16-bit integers, we use the following trick to
+  // have constants within range:
+  // - Associated constants are obtained by subtracting the 16-bit fixed point
+  //   version of one:
+  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
+  //      K1 = 85267  =>  k1 =  20091
+  //      K2 = 35468  =>  k2 = -30068
+  // - The multiplication of a variable by a constant become the sum of the
+  //   variable and the multiplication of that variable by the associated
+  //   constant:
+  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
+  const __m128i k1k2 = _mm_set_epi16(-30068, -30068, -30068, -30068,
+                                     20091, 20091, 20091, 20091);
+  const __m128i k2k1 = _mm_set_epi16(20091, 20091, 20091, 20091,
+                                     -30068, -30068, -30068, -30068);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i zero_four = _mm_set_epi16(0, 0, 0, 0, 4, 4, 4, 4);
+  __m128i T01, T23;
+
+  // Load and concatenate the transform coefficients.
+  const __m128i in01 = _mm_loadu_si128((const __m128i*)&in[0]);
+  const __m128i in23 = _mm_loadu_si128((const __m128i*)&in[8]);
+  // a00 a10 a20 a30   a01 a11 a21 a31
+  // a02 a12 a22 a32   a03 a13 a23 a33
+
+  // Vertical pass and subsequent transpose.
+  {
+    const __m128i in1 = _mm_unpackhi_epi64(in01, in01);
+    const __m128i in3 = _mm_unpackhi_epi64(in23, in23);
+
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+    const __m128i a_d3 = _mm_add_epi16(in01, in23);
+    const __m128i b_c3 = _mm_sub_epi16(in01, in23);
+    const __m128i c1d1 = _mm_mulhi_epi16(in1, k2k1);
+    const __m128i c2d2 = _mm_mulhi_epi16(in3, k1k2);
+    const __m128i c3 = _mm_unpackhi_epi64(b_c3, b_c3);
+    const __m128i c4 = _mm_sub_epi16(c1d1, c2d2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    const __m128i d4u = _mm_add_epi16(c1d1, c2d2);
+    const __m128i du = _mm_add_epi16(a_d3, d4u);
+    const __m128i d = _mm_unpackhi_epi64(du, du);
+
+    // Second pass.
+    const __m128i comb_ab = _mm_unpacklo_epi64(a_d3, b_c3);
+    const __m128i comb_dc = _mm_unpacklo_epi64(d, c);
+
+    const __m128i tmp01 = _mm_add_epi16(comb_ab, comb_dc);
+    const __m128i tmp32 = _mm_sub_epi16(comb_ab, comb_dc);
+    const __m128i tmp23 = _mm_shuffle_epi32(tmp32, _MM_SHUFFLE(1, 0, 3, 2));
+
+    const __m128i transpose_0 = _mm_unpacklo_epi16(tmp01, tmp23);
+    const __m128i transpose_1 = _mm_unpackhi_epi16(tmp01, tmp23);
+    // a00 a20 a01 a21   a02 a22 a03 a23
+    // a10 a30 a11 a31   a12 a32 a13 a33
+
+    T01 = _mm_unpacklo_epi16(transpose_0, transpose_1);
+    T23 = _mm_unpackhi_epi16(transpose_0, transpose_1);
+    // a00 a10 a20 a30   a01 a11 a21 a31
+    // a02 a12 a22 a32   a03 a13 a23 a33
+  }
+
+  // Horizontal pass and subsequent transpose.
+  {
+    const __m128i T1 = _mm_unpackhi_epi64(T01, T01);
+    const __m128i T3 = _mm_unpackhi_epi64(T23, T23);
+
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i dc = _mm_add_epi16(T01, zero_four);
+
+    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+    const __m128i a_d3 = _mm_add_epi16(dc, T23);
+    const __m128i b_c3 = _mm_sub_epi16(dc, T23);
+    const __m128i c1d1 = _mm_mulhi_epi16(T1, k2k1);
+    const __m128i c2d2 = _mm_mulhi_epi16(T3, k1k2);
+    const __m128i c3 = _mm_unpackhi_epi64(b_c3, b_c3);
+    const __m128i c4 = _mm_sub_epi16(c1d1, c2d2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    const __m128i d4u = _mm_add_epi16(c1d1, c2d2);
+    const __m128i du = _mm_add_epi16(a_d3, d4u);
+    const __m128i d = _mm_unpackhi_epi64(du, du);
+
+    // Second pass.
+    const __m128i comb_ab = _mm_unpacklo_epi64(a_d3, b_c3);
+    const __m128i comb_dc = _mm_unpacklo_epi64(d, c);
+
+    const __m128i tmp01 = _mm_add_epi16(comb_ab, comb_dc);
+    const __m128i tmp32 = _mm_sub_epi16(comb_ab, comb_dc);
+    const __m128i tmp23 = _mm_shuffle_epi32(tmp32, _MM_SHUFFLE(1, 0, 3, 2));
+
+    const __m128i shifted01 = _mm_srai_epi16(tmp01, 3);
+    const __m128i shifted23 = _mm_srai_epi16(tmp23, 3);
+    // a00 a01 a02 a03   a10 a11 a12 a13
+    // a20 a21 a22 a23   a30 a31 a32 a33
+
+    const __m128i transpose_0 = _mm_unpacklo_epi16(shifted01, shifted23);
+    const __m128i transpose_1 = _mm_unpackhi_epi16(shifted01, shifted23);
+    // a00 a20 a01 a21   a02 a22 a03 a23
+    // a10 a30 a11 a31   a12 a32 a13 a33
+
+    T01 = _mm_unpacklo_epi16(transpose_0, transpose_1);
+    T23 = _mm_unpackhi_epi16(transpose_0, transpose_1);
+    // a00 a10 a20 a30   a01 a11 a21 a31
+    // a02 a12 a22 a32   a03 a13 a23 a33
+  }
+
+  // Add inverse transform to 'ref' and store.
+  {
+    // Load the reference(s).
+    __m128i ref01, ref23, ref0123;
+    int32_t buf[4];
+
+    // Load four bytes/pixels per line.
+    const __m128i ref0 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[0 * BPS]));
+    const __m128i ref1 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[1 * BPS]));
+    const __m128i ref2 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[2 * BPS]));
+    const __m128i ref3 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[3 * BPS]));
+    ref01 = _mm_unpacklo_epi32(ref0, ref1);
+    ref23 = _mm_unpacklo_epi32(ref2, ref3);
+
+    // Convert to 16b.
+    ref01 = _mm_unpacklo_epi8(ref01, zero);
+    ref23 = _mm_unpacklo_epi8(ref23, zero);
+    // Add the inverse transform(s).
+    ref01 = _mm_add_epi16(ref01, T01);
+    ref23 = _mm_add_epi16(ref23, T23);
+    // Unsigned saturate to 8b.
+    ref0123 = _mm_packus_epi16(ref01, ref23);
+
+    _mm_storeu_si128((__m128i *)buf, ref0123);
+
+    // Store four bytes/pixels per line.
+    WebPInt32ToMem(&dst[0 * BPS], buf[0]);
+    WebPInt32ToMem(&dst[1 * BPS], buf[1]);
+    WebPInt32ToMem(&dst[2 * BPS], buf[2]);
+    WebPInt32ToMem(&dst[3 * BPS], buf[3]);
+  }
+}
+
+// Does two inverse transforms.
+static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in,
+                                uint8_t* dst) {
   // This implementation makes use of 16-bit fixed point versions of two
   // multiply constants:
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -49,33 +200,21 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
   __m128i T0, T1, T2, T3;
 
   // Load and concatenate the transform coefficients (we'll do two inverse
-  // transforms in parallel). In the case of only one inverse transform, the
-  // second half of the vectors will just contain random value we'll never
-  // use nor store.
+  // transforms in parallel).
   __m128i in0, in1, in2, in3;
   {
-    in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
-    in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
-    in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
-    in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
-    // a00 a10 a20 a30   x x x x
-    // a01 a11 a21 a31   x x x x
-    // a02 a12 a22 a32   x x x x
-    // a03 a13 a23 a33   x x x x
-    if (do_two) {
-      const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
-      const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
-      const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
-      const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
-      in0 = _mm_unpacklo_epi64(in0, inB0);
-      in1 = _mm_unpacklo_epi64(in1, inB1);
-      in2 = _mm_unpacklo_epi64(in2, inB2);
-      in3 = _mm_unpacklo_epi64(in3, inB3);
-      // a00 a10 a20 a30   b00 b10 b20 b30
-      // a01 a11 a21 a31   b01 b11 b21 b31
-      // a02 a12 a22 a32   b02 b12 b22 b32
-      // a03 a13 a23 a33   b03 b13 b23 b33
-    }
+    const __m128i tmp0 = _mm_loadu_si128((const __m128i*)&in[0]);
+    const __m128i tmp1 = _mm_loadu_si128((const __m128i*)&in[8]);
+    const __m128i tmp2 = _mm_loadu_si128((const __m128i*)&in[16]);
+    const __m128i tmp3 = _mm_loadu_si128((const __m128i*)&in[24]);
+    in0 = _mm_unpacklo_epi64(tmp0, tmp2);
+    in1 = _mm_unpackhi_epi64(tmp0, tmp2);
+    in2 = _mm_unpacklo_epi64(tmp1, tmp3);
+    in3 = _mm_unpackhi_epi64(tmp1, tmp3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
   }
 
   // Vertical pass and subsequent transpose.
@@ -148,19 +287,11 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
     const __m128i zero = _mm_setzero_si128();
     // Load the reference(s).
     __m128i ref0, ref1, ref2, ref3;
-    if (do_two) {
-      // Load eight bytes/pixels per line.
-      ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
-      ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
-      ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
-      ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
-    } else {
-      // Load four bytes/pixels per line.
-      ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS]));
-      ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS]));
-      ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS]));
-      ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS]));
-    }
+    // Load eight bytes/pixels per line.
+    ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+    ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+    ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+    ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
     // Convert to 16b.
     ref0 = _mm_unpacklo_epi8(ref0, zero);
     ref1 = _mm_unpacklo_epi8(ref1, zero);
@@ -176,20 +307,21 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
     ref1 = _mm_packus_epi16(ref1, ref1);
     ref2 = _mm_packus_epi16(ref2, ref2);
     ref3 = _mm_packus_epi16(ref3, ref3);
-    // Store the results.
-    if (do_two) {
-      // Store eight bytes/pixels per line.
-      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
-      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
-      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
-      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
-    } else {
-      // Store four bytes/pixels per line.
-      WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0));
-      WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1));
-      WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2));
-      WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3));
-    }
+    // Store eight bytes/pixels per line.
+    _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
+    _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
+    _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
+    _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
+  }
+}
+
+// Does one or two inverse transforms.
+static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                            int do_two) {
+  if (do_two) {
+    ITransform_Two_SSE2(ref, in, dst);
+  } else {
+    ITransform_One_SSE2(ref, in, dst);
   }
 }
 
@@ -481,7 +613,7 @@ static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
 // helper for chroma-DC predictions
 static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
   int j;
-  const __m128i values = _mm_set1_epi8(v);
+  const __m128i values = _mm_set1_epi8((char)v);
   for (j = 0; j < 8; ++j) {
     _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
   }
@@ -489,7 +621,7 @@ static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
 
 static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
   int j;
-  const __m128i values = _mm_set1_epi8(v);
+  const __m128i values = _mm_set1_epi8((char)v);
   for (j = 0; j < 16; ++j) {
     _mm_store_si128((__m128i*)(dst + j * BPS), values);
   }
@@ -540,7 +672,7 @@ static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
 static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
   int j;
   for (j = 0; j < 8; ++j) {
-    const __m128i values = _mm_set1_epi8(left[j]);
+    const __m128i values = _mm_set1_epi8((char)left[j]);
     _mm_storel_epi64((__m128i*)dst, values);
     dst += BPS;
   }
@@ -549,7 +681,7 @@ static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
 static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
   int j;
   for (j = 0; j < 16; ++j) {
-    const __m128i values = _mm_set1_epi8(left[j]);
+    const __m128i values = _mm_set1_epi8((char)left[j]);
     _mm_store_si128((__m128i*)dst, values);
     dst += BPS;
   }
@@ -722,10 +854,10 @@ static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
   const __m128i b = _mm_subs_epu8(a, lsb);
   const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
-  const uint32_t vals = _mm_cvtsi128_si32(avg);
+  const int vals = _mm_cvtsi128_si32(avg);
   int i;
   for (i = 0; i < 4; ++i) {
-    WebPUint32ToMem(dst + i * BPS, vals);
+    WebPInt32ToMem(dst + i * BPS, vals);
   }
 }
 
@@ -760,10 +892,10 @@ static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
   const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
 static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
@@ -782,10 +914,10 @@ static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
   const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
 
   // these two are hard to implement in SSE2, so we keep the C-version:
   DST(0, 2) = AVG3(J, I, X);
@@ -807,11 +939,12 @@ static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
   const __m128i abbc = _mm_or_si128(ab, bc);
   const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
   const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
-  const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
+  const uint32_t extra_out =
+      (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
 
   // these two are hard to get and irregular
   DST(3, 2) = (extra_out >> 0) & 0xff;
@@ -829,10 +962,10 @@ static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
   const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
-  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
-  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
-  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
-  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+  WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
 static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
@@ -875,14 +1008,14 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
 
 static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
+  const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top));
   const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
   int y;
   for (y = 0; y < 4; ++y, dst += BPS) {
     const int val = top[-2 - y] - top[-1];
     const __m128i base = _mm_set1_epi16(val);
     const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
-    WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
+    WebPInt32ToMem(dst, _mm_cvtsi128_si32(out));
   }
 }
 
diff --git a/3rdparty/libwebp/src/dsp/filters.c b/3rdparty/libwebp/src/dsp/filters.c
index 9e910d99c92a..c9232ff16a6e 100644
--- a/3rdparty/libwebp/src/dsp/filters.c
+++ b/3rdparty/libwebp/src/dsp/filters.c
@@ -19,14 +19,16 @@
 //------------------------------------------------------------------------------
 // Helpful macro.
 
-# define SANITY_CHECK(in, out)                                                 \
-  assert((in) != NULL);                                                        \
-  assert((out) != NULL);                                                       \
-  assert(width > 0);                                                           \
-  assert(height > 0);                                                          \
-  assert(stride >= width);                                                     \
-  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
-  (void)height;  // Silence unused warning.
+#define DCHECK(in, out)                                                        \
+  do {                                                                         \
+    assert((in) != NULL);                                                      \
+    assert((out) != NULL);                                                     \
+    assert(width > 0);                                                         \
+    assert(height > 0);                                                        \
+    assert(stride >= width);                                                   \
+    assert(row >= 0 && num_rows > 0 && row + num_rows <= height);              \
+    (void)height;  /* Silence unused warning. */                               \
+  } while (0)
 
 #if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
@@ -49,7 +51,7 @@ static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
   in += start_offset;
   out += start_offset;
   preds = inverse ? out : in;
@@ -86,7 +88,7 @@ static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
   in += start_offset;
   out += start_offset;
   preds = inverse ? out : in;
@@ -131,7 +133,7 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
   in += start_offset;
   out += start_offset;
   preds = inverse ? out : in;
@@ -165,7 +167,7 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 
-#undef SANITY_CHECK
+#undef DCHECK
 
 //------------------------------------------------------------------------------
 
@@ -189,6 +191,12 @@ static void GradientFilter_C(const uint8_t* data, int width, int height,
 
 //------------------------------------------------------------------------------
 
+static void NoneUnfilter_C(const uint8_t* prev, const uint8_t* in,
+                           uint8_t* out, int width) {
+  (void)prev;
+  if (out != in) memcpy(out, in, width * sizeof(*out));
+}
+
 static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
                                  uint8_t* out, int width) {
   uint8_t pred = (prev == NULL) ? 0 : prev[0];
@@ -233,13 +241,14 @@ static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
 WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
 WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
 
+extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8FiltersInitMIPSdspR2(void);
 extern void VP8FiltersInitMSA(void);
 extern void VP8FiltersInitNEON(void);
 extern void VP8FiltersInitSSE2(void);
 
 WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
-  WebPUnfilters[WEBP_FILTER_NONE] = NULL;
+  WebPUnfilters[WEBP_FILTER_NONE] = NoneUnfilter_C;
 #if !WEBP_NEON_OMIT_C_CODE
   WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
   WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C;
@@ -254,7 +263,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
 #endif
 
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8FiltersInitSSE2();
     }
@@ -271,13 +280,14 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8FiltersInitNEON();
   }
 #endif
 
+  assert(WebPUnfilters[WEBP_FILTER_NONE] != NULL);
   assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL);
   assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL);
   assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL);
diff --git a/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c b/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
index 9382b12823d7..eca866f57894 100644
--- a/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
+++ b/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
@@ -24,14 +24,16 @@
 //------------------------------------------------------------------------------
 // Helpful macro.
 
-# define SANITY_CHECK(in, out)                                                 \
-  assert(in != NULL);                                                          \
-  assert(out != NULL);                                                         \
-  assert(width > 0);                                                           \
-  assert(height > 0);                                                          \
-  assert(stride >= width);                                                     \
-  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
-  (void)height;  // Silence unused warning.
+#define DCHECK(in, out)                                                        \
+  do {                                                                         \
+    assert(in != NULL);                                                        \
+    assert(out != NULL);                                                       \
+    assert(width > 0);                                                         \
+    assert(height > 0);                                                        \
+    assert(stride >= width);                                                   \
+    assert(row >= 0 && num_rows > 0 && row + num_rows <= height);              \
+    (void)height;  /* Silence unused warning. */                               \
+  } while (0)
 
 #define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do {                        \
     const uint8_t* psrc = (uint8_t*)(SRC);                                     \
@@ -200,7 +202,7 @@ static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
   in += start_offset;
   out += start_offset;
   preds = in;
@@ -248,7 +250,7 @@ static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
   in += start_offset;
   out += start_offset;
   preds = in;
@@ -316,7 +318,7 @@ static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
   in += start_offset;
   out += start_offset;
   preds = in;
@@ -378,7 +380,7 @@ static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
 #undef DO_PREDICT_LINE_VERTICAL
 #undef PREDICT_LINE_ONE_PASS
 #undef DO_PREDICT_LINE
-#undef SANITY_CHECK
+#undef DCHECK
 
 //------------------------------------------------------------------------------
 // Entry point
diff --git a/3rdparty/libwebp/src/dsp/filters_msa.c b/3rdparty/libwebp/src/dsp/filters_msa.c
index 14c437d141b3..33a1b20b70d6 100644
--- a/3rdparty/libwebp/src/dsp/filters_msa.c
+++ b/3rdparty/libwebp/src/dsp/filters_msa.c
@@ -56,12 +56,14 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
 //------------------------------------------------------------------------------
 // Helpful macro.
 
-#define SANITY_CHECK(in, out)  \
-  assert(in != NULL);          \
-  assert(out != NULL);         \
-  assert(width > 0);           \
-  assert(height > 0);          \
-  assert(stride >= width);
+#define DCHECK(in, out)        \
+  do {                         \
+    assert(in != NULL);        \
+    assert(out != NULL);       \
+    assert(width > 0);         \
+    assert(height > 0);        \
+    assert(stride >= width);   \
+  } while (0)
 
 //------------------------------------------------------------------------------
 // Horrizontal filter
@@ -72,7 +74,7 @@ static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
   const uint8_t* in = data;
   uint8_t* out = filtered_data;
   int row = 1;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
 
   // Leftmost pixel is the same as input for topmost scanline.
   out[0] = in[0];
@@ -135,7 +137,7 @@ static void GradientFilter_MSA(const uint8_t* data, int width, int height,
   const uint8_t* preds = data;
   uint8_t* out = filtered_data;
   int row = 1;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
 
   // left prediction for top scan-line
   out[0] = in[0];
@@ -163,7 +165,7 @@ static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
   const uint8_t* preds = data;
   uint8_t* out = filtered_data;
   int row = 1;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
 
   // Very first top-left pixel is copied.
   out[0] = in[0];
@@ -182,7 +184,7 @@ static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
   }
 }
 
-#undef SANITY_CHECK
+#undef DCHECK
 
 //------------------------------------------------------------------------------
 // Entry point
diff --git a/3rdparty/libwebp/src/dsp/filters_neon.c b/3rdparty/libwebp/src/dsp/filters_neon.c
index 3e6a578ea7b1..b49e515af1f0 100644
--- a/3rdparty/libwebp/src/dsp/filters_neon.c
+++ b/3rdparty/libwebp/src/dsp/filters_neon.c
@@ -21,14 +21,16 @@
 //------------------------------------------------------------------------------
 // Helpful macros.
 
-# define SANITY_CHECK(in, out)                                                 \
-  assert(in != NULL);                                                          \
-  assert(out != NULL);                                                         \
-  assert(width > 0);                                                           \
-  assert(height > 0);                                                          \
-  assert(stride >= width);                                                     \
-  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
-  (void)height;  // Silence unused warning.
+#define DCHECK(in, out)                                                        \
+  do {                                                                         \
+    assert(in != NULL);                                                        \
+    assert(out != NULL);                                                       \
+    assert(width > 0);                                                         \
+    assert(height > 0);                                                        \
+    assert(stride >= width);                                                   \
+    assert(row >= 0 && num_rows > 0 && row + num_rows <= height);              \
+    (void)height;  /* Silence unused warning. */                               \
+  } while (0)
 
 // load eight u8 and widen to s16
 #define U8_TO_S16(A) vreinterpretq_s16_u16(vmovl_u8(A))
@@ -71,7 +73,7 @@ static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in,
                                                 uint8_t* out) {
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
   in += start_offset;
   out += start_offset;
 
@@ -110,7 +112,7 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
                                               uint8_t* out) {
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
   in += start_offset;
   out += start_offset;
 
@@ -172,7 +174,7 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
                                               uint8_t* out) {
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
   in += start_offset;
   out += start_offset;
 
@@ -201,7 +203,7 @@ static void GradientFilter_NEON(const uint8_t* data, int width, int height,
                         filtered_data);
 }
 
-#undef SANITY_CHECK
+#undef DCHECK
 
 //------------------------------------------------------------------------------
 // Inverse transforms
diff --git a/3rdparty/libwebp/src/dsp/filters_sse2.c b/3rdparty/libwebp/src/dsp/filters_sse2.c
index 4b3f2d020f40..bb4b5d587483 100644
--- a/3rdparty/libwebp/src/dsp/filters_sse2.c
+++ b/3rdparty/libwebp/src/dsp/filters_sse2.c
@@ -23,14 +23,16 @@
 //------------------------------------------------------------------------------
 // Helpful macro.
 
-# define SANITY_CHECK(in, out)                                                 \
-  assert((in) != NULL);                                                        \
-  assert((out) != NULL);                                                       \
-  assert(width > 0);                                                           \
-  assert(height > 0);                                                          \
-  assert(stride >= width);                                                     \
-  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
-  (void)height;  // Silence unused warning.
+#define DCHECK(in, out)                                                        \
+  do {                                                                         \
+    assert((in) != NULL);                                                      \
+    assert((out) != NULL);                                                     \
+    assert(width > 0);                                                         \
+    assert(height > 0);                                                        \
+    assert(stride >= width);                                                   \
+    assert(row >= 0 && num_rows > 0 && row + num_rows <= height);              \
+    (void)height;  /* Silence unused warning. */                               \
+  } while (0)
 
 static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
                                 uint8_t* dst, int length) {
@@ -78,7 +80,7 @@ static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
                                                 uint8_t* out) {
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
   in += start_offset;
   out += start_offset;
 
@@ -111,7 +113,7 @@ static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
                                               uint8_t* out) {
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
   in += start_offset;
   out += start_offset;
 
@@ -174,7 +176,7 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
                                               uint8_t* out) {
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
-  SANITY_CHECK(in, out);
+  DCHECK(in, out);
   in += start_offset;
   out += start_offset;
 
@@ -197,7 +199,7 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
   }
 }
 
-#undef SANITY_CHECK
+#undef DCHECK
 
 //------------------------------------------------------------------------------
 
@@ -320,7 +322,12 @@ extern void VP8FiltersInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
   WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
+#if defined(CHROMIUM)
+  // TODO(crbug.com/654974)
+  (void)VerticalUnfilter_SSE2;
+#else
   WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
+#endif
   WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
 
   WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
diff --git a/3rdparty/libwebp/src/dsp/lossless.c b/3rdparty/libwebp/src/dsp/lossless.c
index 46b220e2edc9..9f8120945397 100644
--- a/3rdparty/libwebp/src/dsp/lossless.c
+++ b/3rdparty/libwebp/src/dsp/lossless.c
@@ -49,7 +49,7 @@ static WEBP_INLINE uint32_t Clip255(uint32_t a) {
 }
 
 static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
-  return Clip255(a + b - c);
+  return Clip255((uint32_t)(a + b - c));
 }
 
 static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
@@ -66,7 +66,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
 }
 
 static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
-  return Clip255(a + (a - b) / 2);
+  return Clip255((uint32_t)(a + (a - b) / 2));
 }
 
 static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
@@ -107,63 +107,77 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 //------------------------------------------------------------------------------
 // Predictors
 
-uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)top;
   (void)left;
   return ARGB_BLACK;
 }
-uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)top;
-  return left;
+  return *left;
 }
-uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[0];
 }
-uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[1];
 }
-uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[-1];
 }
-uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3(left, top[0], top[1]);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average3(*left, top[0], top[1]);
   return pred;
 }
-uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[-1]);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average2(*left, top[-1]);
   return pred;
 }
-uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[0]);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average2(*left, top[0]);
   return pred;
 }
-uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   const uint32_t pred = Average2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   const uint32_t pred = Average2(top[0], top[1]);
   (void)left;
   return pred;
 }
-uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = Average4(*left, top[-1], top[0], top[1]);
   return pred;
 }
-uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select(top[0], left, top[-1]);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = Select(top[0], *left, top[-1]);
   return pred;
 }
-uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull(*left, top[0], top[-1]);
   return pred;
 }
-uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf(*left, top[0], top[-1]);
   return pred;
 }
 
@@ -279,10 +293,10 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
     const uint32_t red = argb >> 16;
     int new_red = red & 0xff;
     int new_blue = argb & 0xff;
-    new_red += ColorTransformDelta(m->green_to_red_, green);
+    new_red += ColorTransformDelta((int8_t)m->green_to_red_, green);
     new_red &= 0xff;
-    new_blue += ColorTransformDelta(m->green_to_blue_, green);
-    new_blue += ColorTransformDelta(m->red_to_blue_, (int8_t)new_red);
+    new_blue += ColorTransformDelta((int8_t)m->green_to_blue_, green);
+    new_blue += ColorTransformDelta((int8_t)m->red_to_blue_, (int8_t)new_red);
     new_blue &= 0xff;
     dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
   }
@@ -381,7 +395,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
   assert(row_start < row_end);
   assert(row_end <= transform->ysize_);
   switch (transform->type_) {
-    case SUBTRACT_GREEN:
+    case SUBTRACT_GREEN_TRANSFORM:
       VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
       break;
     case PREDICTOR_TRANSFORM:
@@ -574,7 +588,9 @@ VP8LConvertFunc VP8LConvertBGRAToBGR;
 VP8LMapARGBFunc VP8LMapColor32b;
 VP8LMapAlphaFunc VP8LMapColor8b;
 
+extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8LDspInitSSE2(void);
+extern void VP8LDspInitSSE41(void);
 extern void VP8LDspInitNEON(void);
 extern void VP8LDspInitMIPSdspR2(void);
 extern void VP8LDspInitMSA(void);
@@ -621,9 +637,14 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8LDspInitSSE2();
+#if defined(WEBP_HAVE_SSE41)
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        VP8LDspInitSSE41();
+      }
+#endif
     }
 #endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
@@ -638,7 +659,7 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8LDspInitNEON();
diff --git a/3rdparty/libwebp/src/dsp/lossless.h b/3rdparty/libwebp/src/dsp/lossless.h
index ebd316d1ed7b..0bf10a1a3dab 100644
--- a/3rdparty/libwebp/src/dsp/lossless.h
+++ b/3rdparty/libwebp/src/dsp/lossless.h
@@ -28,23 +28,38 @@ extern "C" {
 //------------------------------------------------------------------------------
 // Decoding
 
-typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
+typedef uint32_t (*VP8LPredictorFunc)(const uint32_t* const left,
+                                      const uint32_t* const top);
 extern VP8LPredictorFunc VP8LPredictors[16];
 
-uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+                           const uint32_t* const top);
 
 // These Add/Sub function expects upper[-1] and out[-1] to be readable.
 typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
@@ -167,9 +182,9 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
 // -----------------------------------------------------------------------------
 // Huffman-cost related functions.
 
-typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
-typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
-                                       int length);
+typedef uint32_t (*VP8LCostFunc)(const uint32_t* population, int length);
+typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
+                                         int length);
 typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
                                                 const int Y[256]);
 
@@ -183,7 +198,7 @@ typedef struct {        // small struct to hold counters
 } VP8LStreaks;
 
 typedef struct {            // small struct to hold bit entropy results
-  double entropy;           // entropy
+  float entropy;            // entropy
   uint32_t sum;             // sum of the population
   int nonzeros;             // number of non-zero elements in the population
   uint32_t max_val;         // maximum value in the population
diff --git a/3rdparty/libwebp/src/dsp/lossless_common.h b/3rdparty/libwebp/src/dsp/lossless_common.h
index 96a106f9eebc..d6139b2b577d 100644
--- a/3rdparty/libwebp/src/dsp/lossless_common.h
+++ b/3rdparty/libwebp/src/dsp/lossless_common.h
@@ -16,9 +16,9 @@
 #ifndef WEBP_DSP_LOSSLESS_COMMON_H_
 #define WEBP_DSP_LOSSLESS_COMMON_H_
 
-#include "src/webp/types.h"
-
+#include "src/dsp/cpu.h"
 #include "src/utils/utils.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -166,7 +166,7 @@ uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
 }
 
 //------------------------------------------------------------------------------
-// Transform-related functions use din both encoding and decoding.
+// Transform-related functions used in both encoding and decoding.
 
 // Macros used to create a batch predictor that iteratively uses a
 // one-pixel predictor.
@@ -179,7 +179,7 @@ static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
   int x;                                                             \
   assert(upper != NULL);                                             \
   for (x = 0; x < num_pixels; ++x) {                                 \
-    const uint32_t pred = (PREDICTOR)(out[x - 1], upper + x);        \
+    const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x);       \
     out[x] = VP8LAddPixels(in[x], pred);                             \
   }                                                                  \
 }
diff --git a/3rdparty/libwebp/src/dsp/lossless_enc.c b/3rdparty/libwebp/src/dsp/lossless_enc.c
index a0c7ab911798..997d56c2ad30 100644
--- a/3rdparty/libwebp/src/dsp/lossless_enc.c
+++ b/3rdparty/libwebp/src/dsp/lossless_enc.c
@@ -329,6 +329,15 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
 static float FastSLog2Slow_C(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+    // use clz if available
+    const int log_cnt = BitsLog2Floor(v) - 7;
+    const uint32_t y = 1 << log_cnt;
+    int correction = 0;
+    const float v_f = (float)v;
+    const uint32_t orig_v = v;
+    v >>= log_cnt;
+#else
     int log_cnt = 0;
     uint32_t y = 1;
     int correction = 0;
@@ -339,6 +348,7 @@ static float FastSLog2Slow_C(uint32_t v) {
       v = v >> 1;
       y = y << 1;
     } while (v >= LOG_LOOKUP_IDX_MAX);
+#endif
     // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
     // Xf = floor(Xf) * (1 + (v % y) / v)
     // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
@@ -355,6 +365,14 @@ static float FastSLog2Slow_C(uint32_t v) {
 static float FastLog2Slow_C(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+    // use clz if available
+    const int log_cnt = BitsLog2Floor(v) - 7;
+    const uint32_t y = 1 << log_cnt;
+    const uint32_t orig_v = v;
+    double log_2;
+    v >>= log_cnt;
+#else
     int log_cnt = 0;
     uint32_t y = 1;
     const uint32_t orig_v = v;
@@ -364,6 +382,7 @@ static float FastLog2Slow_C(uint32_t v) {
       v = v >> 1;
       y = y << 1;
     } while (v >= LOG_LOOKUP_IDX_MAX);
+#endif
     log_2 = kLog2Table[v] + log_cnt;
     if (orig_v >= APPROX_LOG_MAX) {
       // Since the division is still expensive, add this correction factor only
@@ -383,7 +402,7 @@ static float FastLog2Slow_C(uint32_t v) {
 // Compute the combined Shanon's entropy for distribution {X} and {X+Y}
 static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
   int i;
-  double retval = 0.;
+  float retval = 0.f;
   int sumX = 0, sumXY = 0;
   for (i = 0; i < 256; ++i) {
     const int x = X[i];
@@ -399,7 +418,7 @@ static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
     }
   }
   retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
-  return (float)retval;
+  return retval;
 }
 
 void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
@@ -503,11 +522,11 @@ static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
 void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const int argb = argb_data[i];
+    const int argb = (int)argb_data[i];
     const int green = (argb >> 8) & 0xff;
     const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
     const uint32_t new_b = (((argb >>  0) & 0xff) - green) & 0xff;
-    argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
+    argb_data[i] = ((uint32_t)argb & 0xff00ff00u) | (new_r << 16) | new_b;
   }
 }
 
@@ -528,10 +547,10 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
     const int8_t red   = U32ToS8(argb >> 16);
     int new_red = red & 0xff;
     int new_blue = argb & 0xff;
-    new_red -= ColorTransformDelta(m->green_to_red_, green);
+    new_red -= ColorTransformDelta((int8_t)m->green_to_red_, green);
     new_red &= 0xff;
-    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
-    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+    new_blue -= ColorTransformDelta((int8_t)m->green_to_blue_, green);
+    new_blue -= ColorTransformDelta((int8_t)m->red_to_blue_, red);
     new_blue &= 0xff;
     data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
   }
@@ -541,7 +560,7 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
                                              uint32_t argb) {
   const int8_t green = U32ToS8(argb >> 8);
   int new_red = argb >> 16;
-  new_red -= ColorTransformDelta(green_to_red, green);
+  new_red -= ColorTransformDelta((int8_t)green_to_red, green);
   return (new_red & 0xff);
 }
 
@@ -550,9 +569,9 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
                                               uint32_t argb) {
   const int8_t green = U32ToS8(argb >>  8);
   const int8_t red   = U32ToS8(argb >> 16);
-  uint8_t new_blue = argb & 0xff;
-  new_blue -= ColorTransformDelta(green_to_blue, green);
-  new_blue -= ColorTransformDelta(red_to_blue, red);
+  int new_blue = argb & 0xff;
+  new_blue -= ColorTransformDelta((int8_t)green_to_blue, green);
+  new_blue -= ColorTransformDelta((int8_t)red_to_blue, red);
   return (new_blue & 0xff);
 }
 
@@ -617,20 +636,25 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
 
 //------------------------------------------------------------------------------
 
-static double ExtraCost_C(const uint32_t* population, int length) {
+static uint32_t ExtraCost_C(const uint32_t* population, int length) {
   int i;
-  double cost = 0.;
-  for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
+  uint32_t cost = population[4] + population[5];
+  assert(length % 2 == 0);
+  for (i = 2; i < length / 2 - 1; ++i) {
+    cost += i * (population[2 * i + 2] + population[2 * i + 3]);
+  }
   return cost;
 }
 
-static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
-                                  int length) {
+static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
+                                    int length) {
   int i;
-  double cost = 0.;
-  for (i = 2; i < length - 2; ++i) {
-    const int xy = X[i + 2] + Y[i + 2];
-    cost += (i >> 1) * xy;
+  uint32_t cost = X[4] + Y[4] + X[5] + Y[5];
+  assert(length % 2 == 0);
+  for (i = 2; i < length / 2 - 1; ++i) {
+    const int xy0 = X[2 * i + 2] + Y[2 * i + 2];
+    const int xy1 = X[2 * i + 3] + Y[2 * i + 3];
+    cost += i * (xy0 + xy1);
   }
   return cost;
 }
@@ -726,7 +750,7 @@ static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in,              \
   assert(upper != NULL);                                                   \
   for (x = 0; x < num_pixels; ++x) {                                       \
     const uint32_t pred =                                                  \
-        VP8LPredictor##PREDICTOR_I##_C(in[x - 1], upper + x);              \
+        VP8LPredictor##PREDICTOR_I##_C(&in[x - 1], upper + x);             \
     out[x] = VP8LSubPixels(in[x], pred);                                   \
   }                                                                        \
 }
@@ -772,6 +796,7 @@ VP8LBundleColorMapFunc VP8LBundleColorMap;
 VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
 VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
 
+extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8LEncDspInitSSE2(void);
 extern void VP8LEncDspInitSSE41(void);
 extern void VP8LEncDspInitNEON(void);
@@ -843,10 +868,10 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8LEncDspInitSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
       if (VP8GetCPUInfo(kSSE4_1)) {
         VP8LEncDspInitSSE41();
       }
@@ -870,7 +895,7 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8LEncDspInitNEON();
diff --git a/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c b/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
index 0412a093cf9a..e10f12da9d58 100644
--- a/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
+++ b/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
@@ -103,8 +103,8 @@ static float FastLog2Slow_MIPS32(uint32_t v) {
 //     cost += i * *(pop + 1);
 //     pop += 2;
 //   }
-//   return (double)cost;
-static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
+//   return cost;
+static uint32_t ExtraCost_MIPS32(const uint32_t* const population, int length) {
   int i, temp0, temp1;
   const uint32_t* pop = &population[4];
   const uint32_t* const LoopEnd = &population[length];
@@ -130,7 +130,7 @@ static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
     : "memory", "hi", "lo"
   );
 
-  return (double)((int64_t)temp0 << 32 | temp1);
+  return ((int64_t)temp0 << 32 | temp1);
 }
 
 // C version of this function:
@@ -148,9 +148,9 @@ static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
 //     pX += 2;
 //     pY += 2;
 //   }
-//   return (double)cost;
-static double ExtraCostCombined_MIPS32(const uint32_t* const X,
-                                       const uint32_t* const Y, int length) {
+//   return cost;
+static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X,
+                                         const uint32_t* const Y, int length) {
   int i, temp0, temp1, temp2, temp3;
   const uint32_t* pX = &X[4];
   const uint32_t* pY = &Y[4];
@@ -183,7 +183,7 @@ static double ExtraCostCombined_MIPS32(const uint32_t* const X,
     : "memory", "hi", "lo"
   );
 
-  return (double)((int64_t)temp0 << 32 | temp1);
+  return ((int64_t)temp0 << 32 | temp1);
 }
 
 #define HUFFMAN_COST_PASS                                 \
@@ -347,24 +347,24 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
 static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
                              uint32_t* pout, int size) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-  const uint32_t end = ((size) / 4) * 4;
+  const int end = ((size) / 4) * 4;
   const uint32_t* const LoopEnd = pa + end;
   int i;
   ASM_START
   ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)
   ASM_END_0
-  for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i];
+  for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i];
 }
 
 static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
-  const uint32_t end = ((size) / 4) * 4;
+  const int end = ((size) / 4) * 4;
   const uint32_t* const LoopEnd = pa + end;
   int i;
   ASM_START
   ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)
   ASM_END_1
-  for (i = end; i < size; ++i) pout[i] += pa[i];
+  for (i = 0; i < size - end; ++i) pout[i] += pa[i];
 }
 
 #undef ASM_END_1
diff --git a/3rdparty/libwebp/src/dsp/lossless_enc_neon.c b/3rdparty/libwebp/src/dsp/lossless_enc_neon.c
index 7c7b73f8b692..e32c7961a239 100644
--- a/3rdparty/libwebp/src/dsp/lossless_enc_neon.c
+++ b/3rdparty/libwebp/src/dsp/lossless_enc_neon.c
@@ -25,7 +25,7 @@
 
 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
 // non-standard versions there.
-#if defined(__APPLE__) && defined(__aarch64__) && \
+#if defined(__APPLE__) && WEBP_AARCH64 && \
     defined(__apple_build_version__) && (__apple_build_version__< 6020037)
 #define USE_VTBLQ
 #endif
diff --git a/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
index 90c263735f58..66cbaab7720a 100644
--- a/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
+++ b/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
@@ -54,8 +54,8 @@ static void TransformColor_SSE2(const VP8LMultipliers* const m,
   const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
                                      CST_5b(m->green_to_blue_));
   const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
-  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
-  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
+  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);  // alpha-green masks
+  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);       // red-blue masks
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
@@ -232,79 +232,55 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
 //------------------------------------------------------------------------------
 // Entropy
 
-// Checks whether the X or Y contribution is worth computing and adding.
-// Used in loop unrolling.
-#define ANALYZE_X_OR_Y(x_or_y, j)                                           \
-  do {                                                                      \
-    if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
-  } while (0)
-
-// Checks whether the X + Y contribution is worth computing and adding.
-// Used in loop unrolling.
-#define ANALYZE_XY(j)                  \
-  do {                                 \
-    if (tmp[j] != 0) {                 \
-      retval -= VP8LFastSLog2(tmp[j]); \
-      ANALYZE_X_OR_Y(X, j);            \
-    }                                  \
-  } while (0)
-
-#if !(defined(__i386__) || defined(_M_IX86))
+// TODO(https://crbug.com/webp/499): this function produces different results
+// from the C code due to use of double/float resulting in output differences
+// when compared to -noasm.
+#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
+
 static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
   int i;
-  double retval = 0.;
-  int sumX, sumXY;
-  int32_t tmp[4];
-  __m128i zero = _mm_setzero_si128();
-  // Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
-  __m128i sumXY_128 = zero;
-  __m128i sumX_128 = zero;
-
-  for (i = 0; i < 256; i += 4) {
-    const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
-    const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));
-
-    // Check if any X is non-zero: this actually provides a speedup as X is
-    // usually sparse.
-    if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
-      const __m128i xy_128 = _mm_add_epi32(x, y);
-      sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);
-
-      sumX_128 = _mm_add_epi32(sumX_128, x);
-
-      // Analyze the different X + Y.
-      _mm_storeu_si128((__m128i*)tmp, xy_128);
-
-      ANALYZE_XY(0);
-      ANALYZE_XY(1);
-      ANALYZE_XY(2);
-      ANALYZE_XY(3);
-    } else {
-      // X is fully 0, so only deal with Y.
-      sumXY_128 = _mm_add_epi32(sumXY_128, y);
-
-      ANALYZE_X_OR_Y(Y, 0);
-      ANALYZE_X_OR_Y(Y, 1);
-      ANALYZE_X_OR_Y(Y, 2);
-      ANALYZE_X_OR_Y(Y, 3);
+  float retval = 0.f;
+  int sumX = 0, sumXY = 0;
+  const __m128i zero = _mm_setzero_si128();
+
+  for (i = 0; i < 256; i += 16) {
+    const __m128i x0 = _mm_loadu_si128((const __m128i*)(X + i +  0));
+    const __m128i y0 = _mm_loadu_si128((const __m128i*)(Y + i +  0));
+    const __m128i x1 = _mm_loadu_si128((const __m128i*)(X + i +  4));
+    const __m128i y1 = _mm_loadu_si128((const __m128i*)(Y + i +  4));
+    const __m128i x2 = _mm_loadu_si128((const __m128i*)(X + i +  8));
+    const __m128i y2 = _mm_loadu_si128((const __m128i*)(Y + i +  8));
+    const __m128i x3 = _mm_loadu_si128((const __m128i*)(X + i + 12));
+    const __m128i y3 = _mm_loadu_si128((const __m128i*)(Y + i + 12));
+    const __m128i x4 = _mm_packs_epi16(_mm_packs_epi32(x0, x1),
+                                       _mm_packs_epi32(x2, x3));
+    const __m128i y4 = _mm_packs_epi16(_mm_packs_epi32(y0, y1),
+                                       _mm_packs_epi32(y2, y3));
+    const int32_t mx = _mm_movemask_epi8(_mm_cmpgt_epi8(x4, zero));
+    int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
+    while (my) {
+      const int32_t j = BitsCtz(my);
+      int xy;
+      if ((mx >> j) & 1) {
+        const int x = X[i + j];
+        sumXY += x;
+        retval -= VP8LFastSLog2(x);
+      }
+      xy = X[i + j] + Y[i + j];
+      sumX += xy;
+      retval -= VP8LFastSLog2(xy);
+      my &= my - 1;
     }
   }
-
-  // Sum up sumX_128 to get sumX.
-  _mm_storeu_si128((__m128i*)tmp, sumX_128);
-  sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];
-
-  // Sum up sumXY_128 to get sumXY.
-  _mm_storeu_si128((__m128i*)tmp, sumXY_128);
-  sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];
-
   retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
-  return (float)retval;
+  return retval;
 }
-#endif  // !(defined(__i386__) || defined(_M_IX86))
 
-#undef ANALYZE_X_OR_Y
-#undef ANALYZE_XY
+#else
+
+#define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC   // won't be faster
+
+#endif
 
 //------------------------------------------------------------------------------
 
@@ -400,7 +376,7 @@ static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
       break;
     }
     case 2: {
-      const __m128i mask_or = _mm_set1_epi32(0xff000000);
+      const __m128i mask_or = _mm_set1_epi32((int)0xff000000);
       const __m128i mul_cst = _mm_set1_epi16(0x0104);
       const __m128i mask_mul = _mm_set1_epi16(0x0f00);
       for (x = 0; x + 16 <= width; x += 16, dst += 4) {
@@ -451,7 +427,7 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
 static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
   int i;
-  const __m128i black = _mm_set1_epi32(ARGB_BLACK);
+  const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
     const __m128i res = _mm_sub_epi8(src, black);
@@ -662,10 +638,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
   VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
   VP8LAddVector = AddVector_SSE2;
   VP8LAddVectorEq = AddVectorEq_SSE2;
-  // TODO(https://crbug.com/webp/499): this function produces different results
-  // from the C code due to use of double/float resulting in output differences
-  // when compared to -noasm.
-#if !(defined(__i386__) || defined(_M_IX86))
+#if !defined(DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC)
   VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
 #endif
   VP8LVectorMismatch = VectorMismatch_SSE2;
diff --git a/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c b/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
index 719d8ed25e15..7ab83c2604b4 100644
--- a/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
+++ b/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
@@ -18,8 +18,53 @@
 #include <smmintrin.h>
 #include "src/dsp/lossless.h"
 
-// For sign-extended multiplying constants, pre-shifted by 5:
-#define CST_5b(X)  (((int16_t)((uint16_t)(X) << 8)) >> 5)
+//------------------------------------------------------------------------------
+// Cost operations.
+
+static WEBP_INLINE uint32_t HorizontalSum_SSE41(__m128i cost) {
+  cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 8));
+  cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 4));
+  return _mm_cvtsi128_si32(cost);
+}
+
+static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) {
+  int i;
+  __m128i cost = _mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]);
+  assert(length % 8 == 0);
+
+  for (i = 8; i + 8 <= length; i += 8) {
+    const int j = (i - 2) >> 1;
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+    const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j);
+    const __m128i a2 = _mm_hadd_epi32(a0, a1);
+    const __m128i mul = _mm_mullo_epi32(a2, w);
+    cost = _mm_add_epi32(mul, cost);
+  }
+  return HorizontalSum_SSE41(cost);
+}
+
+static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a,
+                                        const uint32_t* const b, int length) {
+  int i;
+  __m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]),
+                               _mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4]));
+  assert(length % 8 == 0);
+
+  for (i = 8; i + 8 <= length; i += 8) {
+    const int j = (i - 2) >> 1;
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]);
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
+    const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j);
+    const __m128i a2 = _mm_hadd_epi32(a0, a1);
+    const __m128i b2 = _mm_hadd_epi32(b0, b1);
+    const __m128i mul = _mm_mullo_epi32(_mm_add_epi32(a2, b2), w);
+    cost = _mm_add_epi32(mul, cost);
+  }
+  return HorizontalSum_SSE41(cost);
+}
 
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
@@ -44,46 +89,50 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
 //------------------------------------------------------------------------------
 // Color Transform
 
-#define SPAN 8
+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
+
+#define MK_CST_16(HI, LO) \
+  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+
 static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
                                              int tile_width, int tile_height,
                                              int green_to_blue, int red_to_blue,
                                              int histo[]) {
-  const __m128i mults_r = _mm_set1_epi16(CST_5b(red_to_blue));
-  const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_blue));
-  const __m128i mask_g = _mm_set1_epi16((short)0xff00);   // green mask
-  const __m128i mask_gb = _mm_set1_epi32(0xffff);         // green/blue mask
-  const __m128i mask_b = _mm_set1_epi16(0x00ff);          // blue mask
-  const __m128i shuffler_lo = _mm_setr_epi8(-1, 2, -1, 6, -1, 10, -1, 14, -1,
-                                            -1, -1, -1, -1, -1, -1, -1);
-  const __m128i shuffler_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
-                                            2, -1, 6, -1, 10, -1, 14);
-  int y;
-  for (y = 0; y < tile_height; ++y) {
-    const uint32_t* const src = argb + y * stride;
-    int i, x;
-    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
-      uint16_t values[SPAN];
-      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
-      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
-      const __m128i r0 = _mm_shuffle_epi8(in0, shuffler_lo);
-      const __m128i r1 = _mm_shuffle_epi8(in1, shuffler_hi);
-      const __m128i r = _mm_or_si128(r0, r1);         // r 0
-      const __m128i gb0 = _mm_and_si128(in0, mask_gb);
-      const __m128i gb1 = _mm_and_si128(in1, mask_gb);
-      const __m128i gb = _mm_packus_epi32(gb0, gb1);  // g b
-      const __m128i g = _mm_and_si128(gb, mask_g);    // g 0
-      const __m128i A = _mm_mulhi_epi16(r, mults_r);  // x dbr
-      const __m128i B = _mm_mulhi_epi16(g, mults_g);  // x dbg
-      const __m128i C = _mm_sub_epi8(gb, B);          // x b'
-      const __m128i D = _mm_sub_epi8(C, A);           // x b''
-      const __m128i E = _mm_and_si128(D, mask_b);     // 0 b''
-      _mm_storeu_si128((__m128i*)values, E);
-      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+  const __m128i mult =
+      MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue));
+  const __m128i perm =
+      _mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14);
+  if (tile_width >= 4) {
+    int y;
+    for (y = 0; y < tile_height; ++y) {
+      const uint32_t* const src = argb + y * stride;
+      const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
+      const __m128i B1 = _mm_shuffle_epi8(A1, perm);
+      const __m128i C1 = _mm_mulhi_epi16(B1, mult);
+      const __m128i D1 = _mm_sub_epi16(A1, C1);
+      __m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1);
+      int x;
+      for (x = 4; x + 4 <= tile_width; x += 4) {
+        const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
+        __m128i B2, C2, D2;
+        ++histo[_mm_extract_epi8(E,  0)];
+        B2 = _mm_shuffle_epi8(A2, perm);
+        ++histo[_mm_extract_epi8(E,  4)];
+        C2 = _mm_mulhi_epi16(B2, mult);
+        ++histo[_mm_extract_epi8(E,  8)];
+        D2 = _mm_sub_epi16(A2, C2);
+        ++histo[_mm_extract_epi8(E, 12)];
+        E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2);
+      }
+      ++histo[_mm_extract_epi8(E,  0)];
+      ++histo[_mm_extract_epi8(E,  4)];
+      ++histo[_mm_extract_epi8(E,  8)];
+      ++histo[_mm_extract_epi8(E, 12)];
     }
   }
   {
-    const int left_over = tile_width & (SPAN - 1);
+    const int left_over = tile_width & 3;
     if (left_over > 0) {
       VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
                                        left_over, tile_height,
@@ -95,33 +144,37 @@ static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
 static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
                                             int tile_width, int tile_height,
                                             int green_to_red, int histo[]) {
-  const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_red));
-  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
-  const __m128i mask = _mm_set1_epi16(0xff);
-
-  int y;
-  for (y = 0; y < tile_height; ++y) {
-    const uint32_t* const src = argb + y * stride;
-    int i, x;
-    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
-      uint16_t values[SPAN];
-      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
-      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
-      const __m128i g0 = _mm_and_si128(in0, mask_g);  // 0 0  | g 0
-      const __m128i g1 = _mm_and_si128(in1, mask_g);
-      const __m128i g = _mm_packus_epi32(g0, g1);     // g 0
-      const __m128i A0 = _mm_srli_epi32(in0, 16);     // 0 0  | x r
-      const __m128i A1 = _mm_srli_epi32(in1, 16);
-      const __m128i A = _mm_packus_epi32(A0, A1);     // x r
-      const __m128i B = _mm_mulhi_epi16(g, mults_g);  // x dr
-      const __m128i C = _mm_sub_epi8(A, B);           // x r'
-      const __m128i D = _mm_and_si128(C, mask);       // 0 r'
-      _mm_storeu_si128((__m128i*)values, D);
-      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+
+  const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
+  const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
+  if (tile_width >= 4) {
+    int y;
+    for (y = 0; y < tile_height; ++y) {
+      const uint32_t* const src = argb + y * stride;
+      const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
+      const __m128i B1 = _mm_and_si128(A1, mask_g);
+      const __m128i C1 = _mm_madd_epi16(B1, mult);
+      __m128i D = _mm_sub_epi16(A1, C1);
+      int x;
+      for (x = 4; x + 4 <= tile_width; x += 4) {
+        const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
+        __m128i B2, C2;
+        ++histo[_mm_extract_epi8(D,  2)];
+        B2 = _mm_and_si128(A2, mask_g);
+        ++histo[_mm_extract_epi8(D,  6)];
+        C2 = _mm_madd_epi16(B2, mult);
+        ++histo[_mm_extract_epi8(D, 10)];
+        ++histo[_mm_extract_epi8(D, 14)];
+        D = _mm_sub_epi16(A2, C2);
+      }
+      ++histo[_mm_extract_epi8(D,  2)];
+      ++histo[_mm_extract_epi8(D,  6)];
+      ++histo[_mm_extract_epi8(D, 10)];
+      ++histo[_mm_extract_epi8(D, 14)];
     }
   }
   {
-    const int left_over = tile_width & (SPAN - 1);
+    const int left_over = tile_width & 3;
     if (left_over > 0) {
       VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
                                       left_over, tile_height, green_to_red,
@@ -130,12 +183,16 @@ static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
   }
 }
 
+#undef MK_CST_16
+
 //------------------------------------------------------------------------------
 // Entry point
 
 extern void VP8LEncDspInitSSE41(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
+  VP8LExtraCost = ExtraCost_SSE41;
+  VP8LExtraCostCombined = ExtraCostCombined_SSE41;
   VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
   VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
   VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
diff --git a/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c b/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
index 9888854d5719..bfe5ea6b3865 100644
--- a/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
+++ b/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
@@ -188,46 +188,51 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
   return Average2(Average2(a0, a1), Average2(a2, a3));
 }
 
-static uint32_t Predictor5_MIPSdspR2(uint32_t left, const uint32_t* const top) {
-  return Average3(left, top[0], top[1]);
+static uint32_t Predictor5_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average3(*left, top[0], top[1]);
 }
 
-static uint32_t Predictor6_MIPSdspR2(uint32_t left, const uint32_t* const top) {
-  return Average2(left, top[-1]);
+static uint32_t Predictor6_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average2(*left, top[-1]);
 }
 
-static uint32_t Predictor7_MIPSdspR2(uint32_t left, const uint32_t* const top) {
-  return Average2(left, top[0]);
+static uint32_t Predictor7_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average2(*left, top[0]);
 }
 
-static uint32_t Predictor8_MIPSdspR2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
   (void)left;
   return Average2(top[-1], top[0]);
 }
 
-static uint32_t Predictor9_MIPSdspR2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
   (void)left;
   return Average2(top[0], top[1]);
 }
 
-static uint32_t Predictor10_MIPSdspR2(uint32_t left,
+static uint32_t Predictor10_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return Average4(left, top[-1], top[0], top[1]);
+  return Average4(*left, top[-1], top[0], top[1]);
 }
 
-static uint32_t Predictor11_MIPSdspR2(uint32_t left,
+static uint32_t Predictor11_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return Select(top[0], left, top[-1]);
+  return Select(top[0], *left, top[-1]);
 }
 
-static uint32_t Predictor12_MIPSdspR2(uint32_t left,
+static uint32_t Predictor12_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return ClampedAddSubtractFull(left, top[0], top[-1]);
+  return ClampedAddSubtractFull(*left, top[0], top[-1]);
 }
 
-static uint32_t Predictor13_MIPSdspR2(uint32_t left,
+static uint32_t Predictor13_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return ClampedAddSubtractHalf(left, top[0], top[-1]);
+  return ClampedAddSubtractHalf(*left, top[0], top[-1]);
 }
 
 // Add green to blue and red channels (i.e. perform the inverse transform of
diff --git a/3rdparty/libwebp/src/dsp/lossless_neon.c b/3rdparty/libwebp/src/dsp/lossless_neon.c
index 76a1b6f8732c..e9960db38a79 100644
--- a/3rdparty/libwebp/src/dsp/lossless_neon.c
+++ b/3rdparty/libwebp/src/dsp/lossless_neon.c
@@ -146,9 +146,9 @@ static void ConvertBGRAToRGB_NEON(const uint32_t* src,
 #define LOAD_U32P_AS_U8(IN) vreinterpret_u8_u32(vld1_u32((IN)))
 #define LOADQ_U32_AS_U8(IN) vreinterpretq_u8_u32(vdupq_n_u32((IN)))
 #define LOADQ_U32P_AS_U8(IN) vreinterpretq_u8_u32(vld1q_u32((IN)))
-#define GET_U8_AS_U32(IN) vget_lane_u32(vreinterpret_u32_u8((IN)), 0);
-#define GETQ_U8_AS_U32(IN) vgetq_lane_u32(vreinterpretq_u32_u8((IN)), 0);
-#define STOREQ_U8_AS_U32P(OUT, IN) vst1q_u32((OUT), vreinterpretq_u32_u8((IN)));
+#define GET_U8_AS_U32(IN) vget_lane_u32(vreinterpret_u32_u8((IN)), 0)
+#define GETQ_U8_AS_U32(IN) vgetq_lane_u32(vreinterpretq_u32_u8((IN)), 0)
+#define STOREQ_U8_AS_U32P(OUT, IN) vst1q_u32((OUT), vreinterpretq_u32_u8((IN)))
 #define ROTATE32_LEFT(L) vextq_u8((L), (L), 12)    // D|C|B|A -> C|B|A|D
 
 static WEBP_INLINE uint8x8_t Average2_u8_NEON(uint32_t a0, uint32_t a1) {
@@ -188,17 +188,21 @@ static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
   return avg;
 }
 
-static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) {
-  return Average3_NEON(left, top[0], top[1]);
+static uint32_t Predictor5_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average3_NEON(*left, top[0], top[1]);
 }
-static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) {
-  return Average2_NEON(left, top[-1]);
+static uint32_t Predictor6_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average2_NEON(*left, top[-1]);
 }
-static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) {
-  return Average2_NEON(left, top[0]);
+static uint32_t Predictor7_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average2_NEON(*left, top[0]);
 }
-static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) {
-  return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]);
+static uint32_t Predictor13_NEON(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  return ClampedAddSubtractHalf_NEON(*left, top[0], top[-1]);
 }
 
 // Batch versions of those functions.
@@ -494,7 +498,7 @@ static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
 
 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
 // non-standard versions there.
-#if defined(__APPLE__) && defined(__aarch64__) && \
+#if defined(__APPLE__) && WEBP_AARCH64 && \
     defined(__apple_build_version__) && (__apple_build_version__< 6020037)
 #define USE_VTBLQ
 #endif
diff --git a/3rdparty/libwebp/src/dsp/lossless_sse2.c b/3rdparty/libwebp/src/dsp/lossless_sse2.c
index aef0cee1b370..4b6a532c239c 100644
--- a/3rdparty/libwebp/src/dsp/lossless_sse2.c
+++ b/3rdparty/libwebp/src/dsp/lossless_sse2.c
@@ -18,7 +18,6 @@
 #include "src/dsp/common_sse2.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
-#include <assert.h>
 #include <emmintrin.h>
 
 //------------------------------------------------------------------------------
@@ -28,23 +27,22 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
                                                         uint32_t c1,
                                                         uint32_t c2) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
-  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
-  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
+  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
+  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
+  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
   const __m128i V1 = _mm_add_epi16(C0, C1);
   const __m128i V2 = _mm_sub_epi16(V1, C2);
   const __m128i b = _mm_packus_epi16(V2, V2);
-  const uint32_t output = _mm_cvtsi128_si32(b);
-  return output;
+  return (uint32_t)_mm_cvtsi128_si32(b);
 }
 
 static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
                                                         uint32_t c1,
                                                         uint32_t c2) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
-  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
-  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
+  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero);
+  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero);
+  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero);
   const __m128i avg = _mm_add_epi16(C1, C0);
   const __m128i A0 = _mm_srli_epi16(avg, 1);
   const __m128i A1 = _mm_sub_epi16(A0, B0);
@@ -53,16 +51,15 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
   const __m128i A3 = _mm_srai_epi16(A2, 1);
   const __m128i A4 = _mm_add_epi16(A0, A3);
   const __m128i A5 = _mm_packus_epi16(A4, A4);
-  const uint32_t output = _mm_cvtsi128_si32(A5);
-  return output;
+  return (uint32_t)_mm_cvtsi128_si32(A5);
 }
 
 static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
   int pa_minus_pb;
   const __m128i zero = _mm_setzero_si128();
-  const __m128i A0 = _mm_cvtsi32_si128(a);
-  const __m128i B0 = _mm_cvtsi32_si128(b);
-  const __m128i C0 = _mm_cvtsi32_si128(c);
+  const __m128i A0 = _mm_cvtsi32_si128((int)a);
+  const __m128i B0 = _mm_cvtsi32_si128((int)b);
+  const __m128i C0 = _mm_cvtsi32_si128((int)c);
   const __m128i AC0 = _mm_subs_epu8(A0, C0);
   const __m128i CA0 = _mm_subs_epu8(C0, A0);
   const __m128i BC0 = _mm_subs_epu8(B0, C0);
@@ -95,8 +92,8 @@ static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
                                              __m128i* const avg) {
   // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
   const __m128i ones = _mm_set1_epi8(1);
-  const __m128i A0 = _mm_cvtsi32_si128(a0);
-  const __m128i A1 = _mm_cvtsi32_si128(a1);
+  const __m128i A0 = _mm_cvtsi32_si128((int)a0);
+  const __m128i A1 = _mm_cvtsi32_si128((int)a1);
   const __m128i avg1 = _mm_avg_epu8(A0, A1);
   const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
   *avg = _mm_sub_epi8(avg1, one);
@@ -104,8 +101,8 @@ static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
 
 static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
-  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
+  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a0), zero);
+  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
   const __m128i sum = _mm_add_epi16(A1, A0);
   return _mm_srli_epi16(sum, 1);
 }
@@ -113,19 +110,18 @@ static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
 static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
   __m128i output;
   Average2_uint32_SSE2(a0, a1, &output);
-  return _mm_cvtsi128_si32(output);
+  return (uint32_t)_mm_cvtsi128_si32(output);
 }
 
 static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
                                           uint32_t a2) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
-  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
+  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero);
   const __m128i sum = _mm_add_epi16(avg1, A1);
   const __m128i avg2 = _mm_srli_epi16(sum, 1);
   const __m128i A2 = _mm_packus_epi16(avg2, avg2);
-  const uint32_t output = _mm_cvtsi128_si32(A2);
-  return output;
+  return (uint32_t)_mm_cvtsi128_si32(A2);
 }
 
 static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
@@ -135,46 +131,54 @@ static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
   const __m128i sum = _mm_add_epi16(avg2, avg1);
   const __m128i avg3 = _mm_srli_epi16(sum, 1);
   const __m128i A0 = _mm_packus_epi16(avg3, avg3);
-  const uint32_t output = _mm_cvtsi128_si32(A0);
-  return output;
+  return (uint32_t)_mm_cvtsi128_si32(A0);
 }
 
-static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
+static uint32_t Predictor5_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average3_SSE2(*left, top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2_SSE2(left, top[-1]);
+static uint32_t Predictor6_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average2_SSE2(*left, top[-1]);
   return pred;
 }
-static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2_SSE2(left, top[0]);
+static uint32_t Predictor7_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average2_SSE2(*left, top[0]);
   return pred;
 }
-static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
   const uint32_t pred = Average2_SSE2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
   const uint32_t pred = Average2_SSE2(top[0], top[1]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
+static uint32_t Predictor10_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = Average4_SSE2(*left, top[-1], top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
+static uint32_t Predictor11_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = Select_SSE2(top[0], *left, top[-1]);
   return pred;
 }
-static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor12_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull_SSE2(*left, top[0], top[-1]);
   return pred;
 }
-static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor13_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf_SSE2(*left, top[0], top[-1]);
   return pred;
 }
 
@@ -184,7 +188,7 @@ static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
 static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
   int i;
-  const __m128i black = _mm_set1_epi32(ARGB_BLACK);
+  const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
     const __m128i res = _mm_add_epi8(src, black);
@@ -200,7 +204,7 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
   int i;
-  __m128i prev = _mm_set1_epi32(out[-1]);
+  __m128i prev = _mm_set1_epi32((int)out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     // a | b | c | d
     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
@@ -277,12 +281,12 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 #undef GENERATE_PREDICTOR_2
 
 // Predictor10: average of (average of (L,TL), average of (T, TR)).
-#define DO_PRED10(OUT) do {               \
-  __m128i avgLTL, avg;                    \
-  Average2_m128i(&L, &TL, &avgLTL);       \
-  Average2_m128i(&avgTTR, &avgLTL, &avg); \
-  L = _mm_add_epi8(avg, src);             \
-  out[i + (OUT)] = _mm_cvtsi128_si32(L);  \
+#define DO_PRED10(OUT) do {                         \
+  __m128i avgLTL, avg;                              \
+  Average2_m128i(&L, &TL, &avgLTL);                 \
+  Average2_m128i(&avgTTR, &avgLTL, &avg);           \
+  L = _mm_add_epi8(avg, src);                       \
+  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);  \
 } while (0)
 
 #define DO_PRED10_SHIFT do {                                  \
@@ -295,7 +299,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                 int num_pixels, uint32_t* out) {
   int i;
-  __m128i L = _mm_cvtsi32_si128(out[-1]);
+  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
     __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
@@ -328,7 +332,7 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
   const __m128i B = _mm_andnot_si128(mask, T);                         \
   const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \
   L = _mm_add_epi8(src, pred);                                         \
-  out[i + (OUT)] = _mm_cvtsi128_si32(L);                               \
+  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);                     \
 } while (0)
 
 #define DO_PRED11_SHIFT do {                                \
@@ -343,7 +347,7 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                 int num_pixels, uint32_t* out) {
   int i;
   __m128i pa;
-  __m128i L = _mm_cvtsi32_si128(out[-1]);
+  __m128i L = _mm_cvtsi32_si128((int)out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
     __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
@@ -376,12 +380,12 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
 #undef DO_PRED11_SHIFT
 
 // Predictor12: ClampedAddSubtractFull.
-#define DO_PRED12(DIFF, LANE, OUT) do {            \
-  const __m128i all = _mm_add_epi16(L, (DIFF));    \
-  const __m128i alls = _mm_packus_epi16(all, all); \
-  const __m128i res = _mm_add_epi8(src, alls);     \
-  out[i + (OUT)] = _mm_cvtsi128_si32(res);         \
-  L = _mm_unpacklo_epi8(res, zero);                \
+#define DO_PRED12(DIFF, LANE, OUT) do {              \
+  const __m128i all = _mm_add_epi16(L, (DIFF));      \
+  const __m128i alls = _mm_packus_epi16(all, all);   \
+  const __m128i res = _mm_add_epi8(src, alls);       \
+  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res); \
+  L = _mm_unpacklo_epi8(res, zero);                  \
 } while (0)
 
 #define DO_PRED12_SHIFT(DIFF, LANE) do {                    \
@@ -394,7 +398,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
                                 int num_pixels, uint32_t* out) {
   int i;
   const __m128i zero = _mm_setzero_si128();
-  const __m128i L8 = _mm_cvtsi32_si128(out[-1]);
+  const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);
   __m128i L = _mm_unpacklo_epi8(L8, zero);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     // Load 4 pixels at a time.
@@ -460,7 +464,7 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
   const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
 #undef MK_CST_16
 #undef CST
-  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
+  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);  // alpha-green masks
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
@@ -524,7 +528,7 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
 
 static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
                                    int num_pixels, uint8_t* dst) {
-  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
+  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff);
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
   while (num_pixels >= 8) {
@@ -553,7 +557,7 @@ static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
 static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
                                        int num_pixels, uint8_t* dst) {
   const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
-  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
+  const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
   while (num_pixels >= 8) {
@@ -588,8 +592,8 @@ static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
 
 static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
                                      int num_pixels, uint8_t* dst) {
-  const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
-  const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
+  const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0);
+  const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8);
   const __m128i mask_0x07 = _mm_set1_epi8(0x07);
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
diff --git a/3rdparty/libwebp/src/dsp/lossless_sse41.c b/3rdparty/libwebp/src/dsp/lossless_sse41.c
new file mode 100644
index 000000000000..bb7ce7611fa9
--- /dev/null
+++ b/3rdparty/libwebp/src/dsp/lossless_sse41.c
@@ -0,0 +1,133 @@
+// Copyright 2021 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE41 variant of methods for lossless decoder
+
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include "src/dsp/common_sse41.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+
+//------------------------------------------------------------------------------
+// Color-space conversion functions
+
+static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
+                                        const uint32_t* const src,
+                                        int num_pixels, uint32_t* dst) {
+// sign-extended multiplying constants, pre-shifted by 5.
+#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
+  const __m128i mults_rb =
+      _mm_set1_epi32((int)((uint32_t)CST(green_to_red_) << 16 |
+                           (CST(green_to_blue_) & 0xffff)));
+  const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_));
+#undef CST
+  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);
+  const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
+                                      -1, 9, -1, 9, -1, 13, -1, 13);
+  const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
+                                      -1, 10, -1, -1, -1, 14, -1, -1);
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0
+    const __m128i C = _mm_mulhi_epi16(B, mults_rb);
+    const __m128i D = _mm_add_epi8(A, C);
+    const __m128i E = _mm_shuffle_epi8(D, perm2);
+    const __m128i F = _mm_mulhi_epi16(E, mults_b2);
+    const __m128i G = _mm_add_epi8(D, F);
+    const __m128i out = _mm_blendv_epi8(G, A, mask_ag);
+    _mm_storeu_si128((__m128i*)&dst[i], out);
+  }
+  // Fall-back to C-version for left-overs.
+  if (i != num_pixels) {
+    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+#define ARGB_TO_RGB_SSE41 do {                        \
+  while (num_pixels >= 16) {                          \
+    const __m128i in0 = _mm_loadu_si128(in + 0);      \
+    const __m128i in1 = _mm_loadu_si128(in + 1);      \
+    const __m128i in2 = _mm_loadu_si128(in + 2);      \
+    const __m128i in3 = _mm_loadu_si128(in + 3);      \
+    const __m128i a0 = _mm_shuffle_epi8(in0, perm0);  \
+    const __m128i a1 = _mm_shuffle_epi8(in1, perm1);  \
+    const __m128i a2 = _mm_shuffle_epi8(in2, perm2);  \
+    const __m128i a3 = _mm_shuffle_epi8(in3, perm3);  \
+    const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \
+    const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \
+    const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \
+    _mm_storeu_si128(out + 0, b0);                    \
+    _mm_storeu_si128(out + 1, b1);                    \
+    _mm_storeu_si128(out + 2, b2);                    \
+    in += 4;                                          \
+    out += 3;                                         \
+    num_pixels -= 16;                                 \
+  }                                                   \
+} while (0)
+
+static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
+                                   uint8_t* dst) {
+  const __m128i* in = (const __m128i*)src;
+  __m128i* out = (__m128i*)dst;
+  const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
+                                      8, 14, 13, 12, -1, -1, -1, -1);
+  const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
+  const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
+  const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
+
+  ARGB_TO_RGB_SSE41;
+
+  // left-overs
+  if (num_pixels > 0) {
+    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  }
+}
+
+static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
+                                   int num_pixels, uint8_t* dst) {
+  const __m128i* in = (const __m128i*)src;
+  __m128i* out = (__m128i*)dst;
+  const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
+                                      12, 13, 14, -1, -1, -1, -1);
+  const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
+  const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
+  const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
+
+  ARGB_TO_RGB_SSE41;
+
+  // left-overs
+  if (num_pixels > 0) {
+    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  }
+}
+
+#undef ARGB_TO_RGB_SSE41
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
+  VP8LTransformColorInverse = TransformColorInverse_SSE41;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
+
+#endif  // WEBP_USE_SSE41
diff --git a/3rdparty/libwebp/src/dsp/mips_macro.h b/3rdparty/libwebp/src/dsp/mips_macro.h
index 44aba9b71d88..e810d3d38244 100644
--- a/3rdparty/libwebp/src/dsp/mips_macro.h
+++ b/3rdparty/libwebp/src/dsp/mips_macro.h
@@ -45,28 +45,38 @@
   "ulw    %[" #O2 "],    " #I3 "+" XSTR(I9) "*" #I7 "(%[" #I0 "])       \n\t"  \
   "ulw    %[" #O3 "],    " #I4 "+" XSTR(I9) "*" #I8 "(%[" #I0 "])       \n\t"
 
+
+// O - output
+// I - input (macro doesn't change it so it should be different from I)
+#define MUL_SHIFT_C1(O, I)                                                     \
+  "mul              %[" #O "],    %[" #I "],    %[kC1]        \n\t"            \
+  "sra              %[" #O "],    %[" #O "],    16            \n\t"            \
+  "addu             %[" #O "],    %[" #O "],    %[" #I "]     \n\t"
+#define MUL_SHIFT_C2(O, I) \
+  "mul              %[" #O "],    %[" #I "],    %[kC2]        \n\t"            \
+  "sra              %[" #O "],    %[" #O "],    16            \n\t"
+
+// Same as #define MUL_SHIFT_C1 but I and O are the same. It stores the
+// intermediary result in TMP.
+#define MUL_SHIFT_C1_IO(IO, TMP)                                               \
+  "mul              %[" #TMP "],  %[" #IO  "], %[kC1]     \n\t"                \
+  "sra              %[" #TMP "],  %[" #TMP "], 16         \n\t"                \
+  "addu             %[" #IO  "],  %[" #TMP "], %[" #IO "] \n\t"
+
 // O - output
 // IO - input/output
 // I - input (macro doesn't change it)
 #define MUL_SHIFT_SUM(O0, O1, O2, O3, O4, O5, O6, O7,                          \
                       IO0, IO1, IO2, IO3,                                      \
                       I0, I1, I2, I3, I4, I5, I6, I7)                          \
-  "mul              %[" #O0 "],   %[" #I0 "],   %[kC2]        \n\t"            \
-  "mul              %[" #O1 "],   %[" #I0 "],   %[kC1]        \n\t"            \
-  "mul              %[" #O2 "],   %[" #I1 "],   %[kC2]        \n\t"            \
-  "mul              %[" #O3 "],   %[" #I1 "],   %[kC1]        \n\t"            \
-  "mul              %[" #O4 "],   %[" #I2 "],   %[kC2]        \n\t"            \
-  "mul              %[" #O5 "],   %[" #I2 "],   %[kC1]        \n\t"            \
-  "mul              %[" #O6 "],   %[" #I3 "],   %[kC2]        \n\t"            \
-  "mul              %[" #O7 "],   %[" #I3 "],   %[kC1]        \n\t"            \
-  "sra              %[" #O0 "],   %[" #O0 "],   16            \n\t"            \
-  "sra              %[" #O1 "],   %[" #O1 "],   16            \n\t"            \
-  "sra              %[" #O2 "],   %[" #O2 "],   16            \n\t"            \
-  "sra              %[" #O3 "],   %[" #O3 "],   16            \n\t"            \
-  "sra              %[" #O4 "],   %[" #O4 "],   16            \n\t"            \
-  "sra              %[" #O5 "],   %[" #O5 "],   16            \n\t"            \
-  "sra              %[" #O6 "],   %[" #O6 "],   16            \n\t"            \
-  "sra              %[" #O7 "],   %[" #O7 "],   16            \n\t"            \
+  MUL_SHIFT_C2(O0, I0)                                                         \
+  MUL_SHIFT_C1(O1, I0)                                                         \
+  MUL_SHIFT_C2(O2, I1)                                                         \
+  MUL_SHIFT_C1(O3, I1)                                                         \
+  MUL_SHIFT_C2(O4, I2)                                                         \
+  MUL_SHIFT_C1(O5, I2)                                                         \
+  MUL_SHIFT_C2(O6, I3)                                                         \
+  MUL_SHIFT_C1(O7, I3)                                                         \
   "addu             %[" #IO0 "],  %[" #IO0 "],  %[" #I4 "]    \n\t"            \
   "addu             %[" #IO1 "],  %[" #IO1 "],  %[" #I5 "]    \n\t"            \
   "subu             %[" #IO2 "],  %[" #IO2 "],  %[" #I6 "]    \n\t"            \
diff --git a/3rdparty/libwebp/src/dsp/msa_macro.h b/3rdparty/libwebp/src/dsp/msa_macro.h
index a16c0bb3009b..90adbbc3197e 100644
--- a/3rdparty/libwebp/src/dsp/msa_macro.h
+++ b/3rdparty/libwebp/src/dsp/msa_macro.h
@@ -14,6 +14,10 @@
 #ifndef WEBP_DSP_MSA_MACRO_H_
 #define WEBP_DSP_MSA_MACRO_H_
 
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
 #include <stdint.h>
 #include <msa.h>
 
@@ -69,27 +73,25 @@
 #define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
 
-#define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME)             \
-  static inline TYPE FUNC_NAME(const void* const psrc) {  \
-    const uint8_t* const psrc_m = (const uint8_t*)psrc;   \
-    TYPE val_m;                                           \
-    __asm__ volatile (                                        \
-      "" #INSTR " %[val_m], %[psrc_m]  \n\t"              \
-      : [val_m] "=r" (val_m)                              \
-      : [psrc_m] "m" (*psrc_m));                          \
-    return val_m;                                         \
+#define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME)               \
+  static inline TYPE FUNC_NAME(const void* const psrc) {    \
+    const uint8_t* const psrc_m = (const uint8_t*)psrc;     \
+    TYPE val_m;                                             \
+    __asm__ volatile("" #INSTR " %[val_m], %[psrc_m]  \n\t" \
+                     : [val_m] "=r"(val_m)                  \
+                     : [psrc_m] "m"(*psrc_m));              \
+    return val_m;                                           \
   }
 
 #define MSA_LOAD(psrc, FUNC_NAME)  FUNC_NAME(psrc)
 
-#define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME)               \
-  static inline void FUNC_NAME(TYPE val, void* const pdst) { \
-    uint8_t* const pdst_m = (uint8_t*)pdst;                  \
-    TYPE val_m = val;                                        \
-    __asm__ volatile (                                           \
-      " " #INSTR "  %[val_m],  %[pdst_m]  \n\t"              \
-      : [pdst_m] "=m" (*pdst_m)                              \
-      : [val_m] "r" (val_m));                                \
+#define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME)                 \
+  static inline void FUNC_NAME(TYPE val, void* const pdst) {   \
+    uint8_t* const pdst_m = (uint8_t*)pdst;                    \
+    TYPE val_m = val;                                          \
+    __asm__ volatile(" " #INSTR "  %[val_m],  %[pdst_m]  \n\t" \
+                     : [pdst_m] "=m"(*pdst_m)                  \
+                     : [val_m] "r"(val_m));                    \
   }
 
 #define MSA_STORE(val, pdst, FUNC_NAME)  FUNC_NAME(val, pdst)
@@ -1389,4 +1391,5 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
 } while (0)
 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
 
+#endif  // WEBP_USE_MSA
 #endif  // WEBP_DSP_MSA_MACRO_H_
diff --git a/3rdparty/libwebp/src/dsp/neon.h b/3rdparty/libwebp/src/dsp/neon.h
index aa1dea130106..14acb4044ba6 100644
--- a/3rdparty/libwebp/src/dsp/neon.h
+++ b/3rdparty/libwebp/src/dsp/neon.h
@@ -12,14 +12,16 @@
 #ifndef WEBP_DSP_NEON_H_
 #define WEBP_DSP_NEON_H_
 
-#include <arm_neon.h>
-
 #include "src/dsp/dsp.h"
 
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
 // Right now, some intrinsics functions seem slower, so we disable them
 // everywhere except newer clang/gcc or aarch64 where the inline assembly is
 // incompatible.
-#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
+#if LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 9) || WEBP_AARCH64
 #define WEBP_USE_INTRINSICS   // use intrinsics when possible
 #endif
 
@@ -44,7 +46,7 @@
 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
-#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || WEBP_AARCH64)
 #define WORK_AROUND_GCC
 #endif
 
@@ -98,4 +100,5 @@ static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
 } while (0)
 #endif
 
+#endif  // WEBP_USE_NEON
 #endif  // WEBP_DSP_NEON_H_
diff --git a/3rdparty/libwebp/src/dsp/quant.h b/3rdparty/libwebp/src/dsp/quant.h
index 5e8dba8d19e8..dcbc11c77c59 100644
--- a/3rdparty/libwebp/src/dsp/quant.h
+++ b/3rdparty/libwebp/src/dsp/quant.h
@@ -21,18 +21,24 @@
 
 #define IsFlat IsFlat_NEON
 
-static uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) {
+static uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
+#if WEBP_AARCH64
+  return vaddvq_u32(a);
+#else
   const uint64x2_t b = vpaddlq_u32(a);
-  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                  vreinterpret_u32_u64(vget_high_u64(b)));
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+#endif
 }
 
 static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks,
                               int thresh) {
   const int16x8_t tst_ones = vdupq_n_s16(-1);
   uint32x4_t sum = vdupq_n_u32(0);
+  int i;
 
-  for (int i = 0; i < num_blocks; ++i) {
+  for (i = 0; i < num_blocks; ++i) {
     // Set DC to zero.
     const int16x8_t a_0 = vsetq_lane_s16(0, vld1q_s16(levels), 0);
     const int16x8_t a_1 = vld1q_s16(levels + 8);
@@ -45,7 +51,7 @@ static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks,
 
     levels += 16;
   }
-  return thresh >= (int32_t)vget_lane_u32(horizontal_add_uint32x4(sum), 0);
+  return thresh >= (int)horizontal_add_uint32x4(sum);
 }
 
 #else
diff --git a/3rdparty/libwebp/src/dsp/rescaler.c b/3rdparty/libwebp/src/dsp/rescaler.c
index c5a01e82df5c..325d8be1808b 100644
--- a/3rdparty/libwebp/src/dsp/rescaler.c
+++ b/3rdparty/libwebp/src/dsp/rescaler.c
@@ -38,8 +38,9 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
     int x_out = channel;
     // simple bilinear interpolation
     int accum = wrk->x_add;
-    int left = src[x_in];
-    int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
+    rescaler_t left = (rescaler_t)src[x_in];
+    rescaler_t right =
+        (wrk->src_width > 1) ? (rescaler_t)src[x_in + x_stride] : left;
     x_in += x_stride;
     while (1) {
       wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
@@ -50,7 +51,7 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
         left = right;
         x_in += x_stride;
         assert(x_in < wrk->src_width * x_stride);
-        right = src[x_in];
+        right = (rescaler_t)src[x_in];
         accum += wrk->x_add;
       }
     }
@@ -196,6 +197,7 @@ WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
 WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
 WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
 
+extern VP8CPUInfo VP8GetCPUInfo;
 extern void WebPRescalerDspInitSSE2(void);
 extern void WebPRescalerDspInitMIPS32(void);
 extern void WebPRescalerDspInitMIPSdspR2(void);
@@ -213,7 +215,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
   WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
 
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPRescalerDspInitSSE2();
     }
@@ -235,7 +237,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     WebPRescalerDspInitNEON();
diff --git a/3rdparty/libwebp/src/dsp/rescaler_neon.c b/3rdparty/libwebp/src/dsp/rescaler_neon.c
index b976a852cfcd..957a92dbc9fc 100644
--- a/3rdparty/libwebp/src/dsp/rescaler_neon.c
+++ b/3rdparty/libwebp/src/dsp/rescaler_neon.c
@@ -32,7 +32,7 @@
 #define STORE_32x8(SRC0, SRC1, DST) do {                              \
     vst1q_u32((DST) + 0, SRC0);                                       \
     vst1q_u32((DST) + 4, SRC1);                                       \
-} while (0);
+} while (0)
 
 #if (WEBP_RESCALER_RFIX == 32)
 #define MAKE_HALF_CST(C) vdupq_n_s32((int32_t)((C) >> 1))
diff --git a/3rdparty/libwebp/src/dsp/rescaler_sse2.c b/3rdparty/libwebp/src/dsp/rescaler_sse2.c
index d7effea16ea2..3f18e94e9359 100644
--- a/3rdparty/libwebp/src/dsp/rescaler_sse2.c
+++ b/3rdparty/libwebp/src/dsp/rescaler_sse2.c
@@ -85,7 +85,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
       const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum);
       const __m128i out = _mm_madd_epi16(cur_pixels, mult);
       assert(sizeof(*frow) == sizeof(uint32_t));
-      WebPUint32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out));
+      WebPInt32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out));
       frow += 1;
       if (frow >= frow_end) break;
       accum -= wrk->x_sub;
@@ -132,7 +132,7 @@ static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
     __m128i base = zero;
     accum += wrk->x_add;
     while (accum > 0) {
-      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
+      const __m128i A = _mm_cvtsi32_si128(WebPMemToInt32(src));
       src += 4;
       base = _mm_unpacklo_epi8(A, zero);
       // To avoid overflow, we need: base * x_add / x_sub < 32768
@@ -198,7 +198,7 @@ static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0,
                                         const __m128i* const mult,
                                         uint8_t* const dst) {
   const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
-  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
+  const __m128i mask = _mm_set_epi32(~0, 0, ~0, 0);
   const __m128i B0 = _mm_mul_epu32(*A0, *mult);
   const __m128i B1 = _mm_mul_epu32(*A1, *mult);
   const __m128i B2 = _mm_mul_epu32(*A2, *mult);
diff --git a/3rdparty/libwebp/src/dsp/ssim.c b/3rdparty/libwebp/src/dsp/ssim.c
index 989ce8254c9f..9a1341ed9585 100644
--- a/3rdparty/libwebp/src/dsp/ssim.c
+++ b/3rdparty/libwebp/src/dsp/ssim.c
@@ -137,6 +137,7 @@ VP8SSIMGetClippedFunc VP8SSIMGetClipped;
 VP8AccumulateSSEFunc VP8AccumulateSSE;
 #endif
 
+extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8SSIMDspInitSSE2(void);
 
 WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
@@ -150,7 +151,7 @@ WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
 #endif
 
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8SSIMDspInitSSE2();
     }
diff --git a/3rdparty/libwebp/src/dsp/upsampling.c b/3rdparty/libwebp/src/dsp/upsampling.c
index 9b60da5bbb2a..983b9c42d36c 100644
--- a/3rdparty/libwebp/src/dsp/upsampling.c
+++ b/3rdparty/libwebp/src/dsp/upsampling.c
@@ -215,6 +215,7 @@ static void EmptyYuv444Func(const uint8_t* y,
 
 WebPYUV444Converter WebPYUV444Converters[MODE_LAST];
 
+extern VP8CPUInfo VP8GetCPUInfo;
 extern void WebPInitYUV444ConvertersMIPSdspR2(void);
 extern void WebPInitYUV444ConvertersSSE2(void);
 extern void WebPInitYUV444ConvertersSSE41(void);
@@ -233,12 +234,12 @@ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
   WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
 
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitYUV444ConvertersSSE2();
     }
 #endif
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
     if (VP8GetCPUInfo(kSSE4_1)) {
       WebPInitYUV444ConvertersSSE41();
     }
@@ -278,12 +279,12 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitUpsamplersSSE2();
     }
 #endif
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
     if (VP8GetCPUInfo(kSSE4_1)) {
       WebPInitUpsamplersSSE41();
     }
@@ -300,7 +301,7 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     WebPInitUpsamplersNEON();
diff --git a/3rdparty/libwebp/src/dsp/upsampling_neon.c b/3rdparty/libwebp/src/dsp/upsampling_neon.c
index 6ba71a7de537..bbc000ca2d38 100644
--- a/3rdparty/libwebp/src/dsp/upsampling_neon.c
+++ b/3rdparty/libwebp/src/dsp/upsampling_neon.c
@@ -111,7 +111,7 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
   vst4_u8(out, v255_r_g_b);                                             \
 } while (0)
 
-#if !defined(WEBP_SWAP_16BIT_CSP)
+#if (WEBP_SWAP_16BIT_CSP == 0)
 #define ZIP_U8(lo, hi) vzip_u8((lo), (hi))
 #else
 #define ZIP_U8(lo, hi) vzip_u8((hi), (lo))
diff --git a/3rdparty/libwebp/src/dsp/upsampling_sse2.c b/3rdparty/libwebp/src/dsp/upsampling_sse2.c
index 340f1e2ac238..77b4f7221ebd 100644
--- a/3rdparty/libwebp/src/dsp/upsampling_sse2.c
+++ b/3rdparty/libwebp/src/dsp/upsampling_sse2.c
@@ -58,7 +58,7 @@
 } while (0)
 
 // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
-#define UPSAMPLE_32PIXELS(r1, r2, out) {                                       \
+#define UPSAMPLE_32PIXELS(r1, r2, out) do {                                    \
   const __m128i one = _mm_set1_epi8(1);                                        \
   const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]);                 \
   const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]);                 \
@@ -85,7 +85,7 @@
   /* pack the alternate pixels */                                              \
   PACK_AND_STORE(a, b, diag1, diag2, (out) +      0);  /* store top */         \
   PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32);  /* store bottom */      \
-}
+} while (0)
 
 // Turn the macro into a function for reducing code-size when non-critical
 static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
@@ -121,7 +121,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
   int uv_pos, pos;                                                             \
   /* 16byte-aligned array to cache reconstructed u and v */                    \
   uint8_t uv_buf[14 * 32 + 15] = { 0 };                                        \
-  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
+  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15);  \
   uint8_t* const r_v = r_u + 32;                                               \
                                                                                \
   assert(top_y != NULL);                                                       \
@@ -229,11 +229,11 @@ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
   }                                                                            \
 }
 
-YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4);
-YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4);
+YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4)
+YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4)
 #if !defined(WEBP_REDUCE_CSP)
-YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3);
-YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3);
+YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3)
+YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3)
 YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4)
 YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \
             WebPYuv444ToRgba4444_C, 2)
diff --git a/3rdparty/libwebp/src/dsp/upsampling_sse41.c b/3rdparty/libwebp/src/dsp/upsampling_sse41.c
index 648d4560279c..e38c88d5e670 100644
--- a/3rdparty/libwebp/src/dsp/upsampling_sse41.c
+++ b/3rdparty/libwebp/src/dsp/upsampling_sse41.c
@@ -60,7 +60,7 @@
 } while (0)
 
 // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
-#define UPSAMPLE_32PIXELS(r1, r2, out) {                                       \
+#define UPSAMPLE_32PIXELS(r1, r2, out) do {                                    \
   const __m128i one = _mm_set1_epi8(1);                                        \
   const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]);                 \
   const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]);                 \
@@ -87,7 +87,7 @@
   /* pack the alternate pixels */                                              \
   PACK_AND_STORE(a, b, diag1, diag2, (out) +      0);  /* store top */         \
   PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32);  /* store bottom */      \
-}
+} while (0)
 
 // Turn the macro into a function for reducing code-size when non-critical
 static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
@@ -217,8 +217,8 @@ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
 }
 
 #if !defined(WEBP_REDUCE_CSP)
-YUV444_FUNC(Yuv444ToRgb_SSE41, VP8YuvToRgb32_SSE41, WebPYuv444ToRgb_C, 3);
-YUV444_FUNC(Yuv444ToBgr_SSE41, VP8YuvToBgr32_SSE41, WebPYuv444ToBgr_C, 3);
+YUV444_FUNC(Yuv444ToRgb_SSE41, VP8YuvToRgb32_SSE41, WebPYuv444ToRgb_C, 3)
+YUV444_FUNC(Yuv444ToBgr_SSE41, VP8YuvToBgr32_SSE41, WebPYuv444ToBgr_C, 3)
 #endif  // WEBP_REDUCE_CSP
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE41(void) {
diff --git a/3rdparty/libwebp/src/dsp/yuv.c b/3rdparty/libwebp/src/dsp/yuv.c
index 14e67fc28ef8..8a04b85d82dd 100644
--- a/3rdparty/libwebp/src/dsp/yuv.c
+++ b/3rdparty/libwebp/src/dsp/yuv.c
@@ -70,6 +70,7 @@ void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
 
 WebPSamplerRowFunc WebPSamplers[MODE_LAST];
 
+extern VP8CPUInfo VP8GetCPUInfo;
 extern void WebPInitSamplersSSE2(void);
 extern void WebPInitSamplersSSE41(void);
 extern void WebPInitSamplersMIPS32(void);
@@ -90,16 +91,16 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitSamplersSSE2();
     }
-#endif  // WEBP_USE_SSE2
-#if defined(WEBP_USE_SSE41)
+#endif  // WEBP_HAVE_SSE2
+#if defined(WEBP_HAVE_SSE41)
     if (VP8GetCPUInfo(kSSE4_1)) {
       WebPInitSamplersSSE41();
     }
-#endif  // WEBP_USE_SSE41
+#endif  // WEBP_HAVE_SSE41
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       WebPInitSamplersMIPS32();
@@ -194,50 +195,6 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
 
 //-----------------------------------------------------------------------------
 
-#if !WEBP_NEON_OMIT_C_CODE
-#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
-static uint16_t clip_y(int v) {
-  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
-}
-
-static uint64_t SharpYUVUpdateY_C(const uint16_t* ref, const uint16_t* src,
-                                  uint16_t* dst, int len) {
-  uint64_t diff = 0;
-  int i;
-  for (i = 0; i < len; ++i) {
-    const int diff_y = ref[i] - src[i];
-    const int new_y = (int)dst[i] + diff_y;
-    dst[i] = clip_y(new_y);
-    diff += (uint64_t)abs(diff_y);
-  }
-  return diff;
-}
-
-static void SharpYUVUpdateRGB_C(const int16_t* ref, const int16_t* src,
-                                int16_t* dst, int len) {
-  int i;
-  for (i = 0; i < len; ++i) {
-    const int diff_uv = ref[i] - src[i];
-    dst[i] += diff_uv;
-  }
-}
-
-static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
-                                const uint16_t* best_y, uint16_t* out) {
-  int i;
-  for (i = 0; i < len; ++i, ++A, ++B) {
-    const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
-    const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
-    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
-    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
-  }
-}
-#endif  // !WEBP_NEON_OMIT_C_CODE
-
-#undef MAX_Y
-
-//-----------------------------------------------------------------------------
-
 void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
 void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
 void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
@@ -247,18 +204,9 @@ void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
 void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
                             int src_width, int do_store);
 
-uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* ref, const uint16_t* src,
-                                uint16_t* dst, int len);
-void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src,
-                              int16_t* dst, int len);
-void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
-                              const uint16_t* best_y, uint16_t* out);
-
 extern void WebPInitConvertARGBToYUVSSE2(void);
 extern void WebPInitConvertARGBToYUVSSE41(void);
 extern void WebPInitConvertARGBToYUVNEON(void);
-extern void WebPInitSharpYUVSSE2(void);
-extern void WebPInitSharpYUVNEON(void);
 
 WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
   WebPConvertARGBToY = ConvertARGBToY_C;
@@ -269,40 +217,29 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
 
   WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
 
-#if !WEBP_NEON_OMIT_C_CODE
-  WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
-  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
-  WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
-#endif
-
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitConvertARGBToYUVSSE2();
-      WebPInitSharpYUVSSE2();
     }
-#endif  // WEBP_USE_SSE2
-#if defined(WEBP_USE_SSE41)
+#endif  // WEBP_HAVE_SSE2
+#if defined(WEBP_HAVE_SSE41)
     if (VP8GetCPUInfo(kSSE4_1)) {
       WebPInitConvertARGBToYUVSSE41();
     }
-#endif  // WEBP_USE_SSE41
+#endif  // WEBP_HAVE_SSE41
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     WebPInitConvertARGBToYUVNEON();
-    WebPInitSharpYUVNEON();
   }
-#endif  // WEBP_USE_NEON
+#endif  // WEBP_HAVE_NEON
 
   assert(WebPConvertARGBToY != NULL);
   assert(WebPConvertARGBToUV != NULL);
   assert(WebPConvertRGB24ToY != NULL);
   assert(WebPConvertBGR24ToY != NULL);
   assert(WebPConvertRGBA32ToUV != NULL);
-  assert(WebPSharpYUVUpdateY != NULL);
-  assert(WebPSharpYUVUpdateRGB != NULL);
-  assert(WebPSharpYUVFilterRow != NULL);
 }
diff --git a/3rdparty/libwebp/src/dsp/yuv.h b/3rdparty/libwebp/src/dsp/yuv.h
index c12be1d094b6..66a397d117b4 100644
--- a/3rdparty/libwebp/src/dsp/yuv.h
+++ b/3rdparty/libwebp/src/dsp/yuv.h
@@ -10,7 +10,7 @@
 // inline YUV<->RGB conversion function
 //
 // The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
-// More information at: http://en.wikipedia.org/wiki/YCbCr
+// More information at: https://en.wikipedia.org/wiki/YCbCr
 // Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
 // U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
 // V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
diff --git a/3rdparty/libwebp/src/dsp/yuv_neon.c b/3rdparty/libwebp/src/dsp/yuv_neon.c
index a34d60248f6a..ff77b009801d 100644
--- a/3rdparty/libwebp/src/dsp/yuv_neon.c
+++ b/3rdparty/libwebp/src/dsp/yuv_neon.c
@@ -173,116 +173,8 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
   WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
 }
 
-//------------------------------------------------------------------------------
-
-#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
-static uint16_t clip_y_NEON(int v) {
-  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
-}
-
-static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
-                                     uint16_t* dst, int len) {
-  int i;
-  const int16x8_t zero = vdupq_n_s16(0);
-  const int16x8_t max = vdupq_n_s16(MAX_Y);
-  uint64x2_t sum = vdupq_n_u64(0);
-  uint64_t diff;
-
-  for (i = 0; i + 8 <= len; i += 8) {
-    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
-    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
-    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
-    const int16x8_t D = vsubq_s16(A, B);       // diff_y
-    const int16x8_t F = vaddq_s16(C, D);       // new_y
-    const uint16x8_t H =
-        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
-    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
-    vst1q_u16(dst + i, H);
-    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
-  }
-  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
-  for (; i < len; ++i) {
-    const int diff_y = ref[i] - src[i];
-    const int new_y = (int)(dst[i]) + diff_y;
-    dst[i] = clip_y_NEON(new_y);
-    diff += (uint64_t)(abs(diff_y));
-  }
-  return diff;
-}
-
-static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
-                                   int16_t* dst, int len) {
-  int i;
-  for (i = 0; i + 8 <= len; i += 8) {
-    const int16x8_t A = vld1q_s16(ref + i);
-    const int16x8_t B = vld1q_s16(src + i);
-    const int16x8_t C = vld1q_s16(dst + i);
-    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
-    const int16x8_t E = vaddq_s16(C, D);   // new_uv
-    vst1q_s16(dst + i, E);
-  }
-  for (; i < len; ++i) {
-    const int diff_uv = ref[i] - src[i];
-    dst[i] += diff_uv;
-  }
-}
-
-static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
-                                   const uint16_t* best_y, uint16_t* out) {
-  int i;
-  const int16x8_t max = vdupq_n_s16(MAX_Y);
-  const int16x8_t zero = vdupq_n_s16(0);
-  for (i = 0; i + 8 <= len; i += 8) {
-    const int16x8_t a0 = vld1q_s16(A + i + 0);
-    const int16x8_t a1 = vld1q_s16(A + i + 1);
-    const int16x8_t b0 = vld1q_s16(B + i + 0);
-    const int16x8_t b1 = vld1q_s16(B + i + 1);
-    const int16x8_t a0b1 = vaddq_s16(a0, b1);
-    const int16x8_t a1b0 = vaddq_s16(a1, b0);
-    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
-    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
-    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
-    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
-    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
-    const int16x8_t d0 = vaddq_s16(c1, a0);
-    const int16x8_t d1 = vaddq_s16(c0, a1);
-    const int16x8_t e0 = vrshrq_n_s16(d0, 1);
-    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
-    const int16x8x2_t f = vzipq_s16(e0, e1);
-    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
-    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
-    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
-    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
-    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
-    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
-    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
-    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
-  }
-  for (; i < len; ++i) {
-    const int a0b1 = A[i + 0] + B[i + 1];
-    const int a1b0 = A[i + 1] + B[i + 0];
-    const int a0a1b0b1 = a0b1 + a1b0 + 8;
-    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
-    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
-    out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
-    out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
-  }
-}
-#undef MAX_Y
-
-//------------------------------------------------------------------------------
-
-extern void WebPInitSharpYUVNEON(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
-  WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
-  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
-  WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
-}
-
 #else  // !WEBP_USE_NEON
 
 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
-WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
 
 #endif  // WEBP_USE_NEON
diff --git a/3rdparty/libwebp/src/dsp/yuv_sse2.c b/3rdparty/libwebp/src/dsp/yuv_sse2.c
index baa48d537175..01a48f9af2c6 100644
--- a/3rdparty/libwebp/src/dsp/yuv_sse2.c
+++ b/3rdparty/libwebp/src/dsp/yuv_sse2.c
@@ -15,10 +15,12 @@
 
 #if defined(WEBP_USE_SSE2)
 
-#include "src/dsp/common_sse2.h"
 #include <stdlib.h>
 #include <emmintrin.h>
 
+#include "src/dsp/common_sse2.h"
+#include "src/utils/utils.h"
+
 //-----------------------------------------------------------------------------
 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
 
@@ -74,7 +76,7 @@ static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
 // Load and replicate the U/V samples
 static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
+  const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src));
   const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
   return _mm_unpacklo_epi16(tmp1, tmp1);   // replicate samples
 }
@@ -130,7 +132,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
   const __m128i rg0 = _mm_packus_epi16(*B, *A);
   const __m128i ba0 = _mm_packus_epi16(*R, *G);
 #endif
-  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
+  const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
   const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0);  // rbrbrbrbrb...
   const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0);  // gagagagaga...
   const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
@@ -147,9 +149,10 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
   const __m128i r0 = _mm_packus_epi16(*R, *R);
   const __m128i g0 = _mm_packus_epi16(*G, *G);
   const __m128i b0 = _mm_packus_epi16(*B, *B);
-  const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8));
+  const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8((char)0xf8));
   const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f));
-  const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5);
+  const __m128i g1 =
+      _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8((char)0xe0)), 5);
   const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
   const __m128i rg = _mm_or_si128(r1, g1);
   const __m128i gb = _mm_or_si128(g2, b1);
@@ -747,128 +750,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
   WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
 }
 
-//------------------------------------------------------------------------------
-
-#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
-static uint16_t clip_y(int v) {
-  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
-}
-
-static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
-                                     uint16_t* dst, int len) {
-  uint64_t diff = 0;
-  uint32_t tmp[4];
-  int i;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i max = _mm_set1_epi16(MAX_Y);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i sum = zero;
-
-  for (i = 0; i + 8 <= len; i += 8) {
-    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
-    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
-    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
-    const __m128i D = _mm_sub_epi16(A, B);       // diff_y
-    const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
-    const __m128i F = _mm_add_epi16(C, D);       // new_y
-    const __m128i G = _mm_or_si128(E, one);      // -1 or 1
-    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
-    const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
-    _mm_storeu_si128((__m128i*)(dst + i), H);
-    sum = _mm_add_epi32(sum, I);
-  }
-  _mm_storeu_si128((__m128i*)tmp, sum);
-  diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
-  for (; i < len; ++i) {
-    const int diff_y = ref[i] - src[i];
-    const int new_y = (int)dst[i] + diff_y;
-    dst[i] = clip_y(new_y);
-    diff += (uint64_t)abs(diff_y);
-  }
-  return diff;
-}
-
-static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
-                                   int16_t* dst, int len) {
-  int i = 0;
-  for (i = 0; i + 8 <= len; i += 8) {
-    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
-    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
-    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
-    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
-    const __m128i E = _mm_add_epi16(C, D);   // new_uv
-    _mm_storeu_si128((__m128i*)(dst + i), E);
-  }
-  for (; i < len; ++i) {
-    const int diff_uv = ref[i] - src[i];
-    dst[i] += diff_uv;
-  }
-}
-
-static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
-                                   const uint16_t* best_y, uint16_t* out) {
-  int i;
-  const __m128i kCst8 = _mm_set1_epi16(8);
-  const __m128i max = _mm_set1_epi16(MAX_Y);
-  const __m128i zero = _mm_setzero_si128();
-  for (i = 0; i + 8 <= len; i += 8) {
-    const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
-    const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
-    const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
-    const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
-    const __m128i a0b1 = _mm_add_epi16(a0, b1);
-    const __m128i a1b0 = _mm_add_epi16(a1, b0);
-    const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
-    const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
-    const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
-    const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
-    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
-    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
-    const __m128i d0 = _mm_add_epi16(c1, a0);
-    const __m128i d1 = _mm_add_epi16(c0, a1);
-    const __m128i e0 = _mm_srai_epi16(d0, 1);
-    const __m128i e1 = _mm_srai_epi16(d1, 1);
-    const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
-    const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
-    const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
-    const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
-    const __m128i h0 = _mm_add_epi16(g0, f0);
-    const __m128i h1 = _mm_add_epi16(g1, f1);
-    const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
-    const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
-    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
-    _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
-  }
-  for (; i < len; ++i) {
-    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
-    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
-    // We reuse the common sub-expressions.
-    const int a0b1 = A[i + 0] + B[i + 1];
-    const int a1b0 = A[i + 1] + B[i + 0];
-    const int a0a1b0b1 = a0b1 + a1b0 + 8;
-    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
-    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
-    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
-    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
-  }
-}
-
-#undef MAX_Y
-
-//------------------------------------------------------------------------------
-
-extern void WebPInitSharpYUVSSE2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
-  WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
-  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
-  WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
-}
-
 #else  // !WEBP_USE_SSE2
 
 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
-WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
 
 #endif  // WEBP_USE_SSE2
diff --git a/3rdparty/libwebp/src/dsp/yuv_sse41.c b/3rdparty/libwebp/src/dsp/yuv_sse41.c
index 579d1f7402c2..f79b802e4712 100644
--- a/3rdparty/libwebp/src/dsp/yuv_sse41.c
+++ b/3rdparty/libwebp/src/dsp/yuv_sse41.c
@@ -15,10 +15,12 @@
 
 #if defined(WEBP_USE_SSE41)
 
-#include "src/dsp/common_sse41.h"
 #include <stdlib.h>
 #include <smmintrin.h>
 
+#include "src/dsp/common_sse41.h"
+#include "src/utils/utils.h"
+
 //-----------------------------------------------------------------------------
 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
 
@@ -74,7 +76,7 @@ static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) {
 // Load and replicate the U/V samples
 static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
+  const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src));
   const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
   return _mm_unpacklo_epi16(tmp1, tmp1);   // replicate samples
 }
diff --git a/3rdparty/libwebp/src/enc/alpha_enc.c b/3rdparty/libwebp/src/enc/alpha_enc.c
index dce9ca957d3a..c11a261c8a4d 100644
--- a/3rdparty/libwebp/src/enc/alpha_enc.c
+++ b/3rdparty/libwebp/src/enc/alpha_enc.c
@@ -13,12 +13,14 @@
 
 #include <assert.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "src/enc/vp8i_enc.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/filters_utils.h"
 #include "src/utils/quant_levels_utils.h"
 #include "src/utils/utils.h"
+#include "src/webp/encode.h"
 #include "src/webp/format_constants.h"
 
 // -----------------------------------------------------------------------------
@@ -54,7 +56,7 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   WebPConfig config;
   WebPPicture picture;
 
-  WebPPictureInit(&picture);
+  if (!WebPPictureInit(&picture)) return 0;
   picture.width = width;
   picture.height = height;
   picture.use_argb = 1;
@@ -65,7 +67,7 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   WebPDispatchAlphaToGreen(data, width, picture.width, picture.height,
                            picture.argb, picture.argb_stride);
 
-  WebPConfigInit(&config);
+  if (!WebPConfigInit(&config)) return 0;
   config.lossless = 1;
   // Enable exact, or it would alter RGB values of transparent alpha, which is
   // normally OK but not here since we are not encoding the input image but  an
@@ -82,11 +84,7 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
       (use_quality_100 && effort_level == 6) ? 100 : 8.f * effort_level;
   assert(config.quality >= 0 && config.quality <= 100.f);
 
-  // TODO(urvang): Temporary fix to avoid generating images that trigger
-  // a decoder bug related to alpha with color cache.
-  // See: https://code.google.com/p/webp/issues/detail?id=239
-  // Need to re-enable this later.
-  ok = (VP8LEncodeStream(&config, &picture, bw, 0 /*use_cache*/) == VP8_ENC_OK);
+  ok = VP8LEncodeStream(&config, &picture, bw);
   WebPPictureFree(&picture);
   ok = ok && !bw->error_;
   if (!ok) {
@@ -140,6 +138,11 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
                               !reduce_levels, &tmp_bw, &result->stats);
     if (ok) {
       output = VP8LBitWriterFinish(&tmp_bw);
+      if (tmp_bw.error_) {
+        VP8LBitWriterWipeOut(&tmp_bw);
+        memset(&result->bw, 0, sizeof(result->bw));
+        return 0;
+      }
       output_size = VP8LBitWriterNumBytes(&tmp_bw);
       if (output_size > data_size) {
         // compressed size is larger than source! Revert to uncompressed mode.
@@ -148,6 +151,7 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
       }
     } else {
       VP8LBitWriterWipeOut(&tmp_bw);
+      memset(&result->bw, 0, sizeof(result->bw));
       return 0;
     }
   }
@@ -162,7 +166,7 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
   header = method | (filter << 2);
   if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
 
-  VP8BitWriterInit(&result->bw, ALPHA_HEADER_LEN + output_size);
+  if (!VP8BitWriterInit(&result->bw, ALPHA_HEADER_LEN + output_size)) ok = 0;
   ok = ok && VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
   ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
 
@@ -303,7 +307,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
   int ok = 1;
   const int reduce_levels = (quality < 100);
 
-  // quick sanity checks
+  // quick correctness checks
   assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
   assert(enc != NULL && pic != NULL && pic->a != NULL);
   assert(output != NULL && output_size != NULL);
@@ -312,11 +316,11 @@ static int EncodeAlpha(VP8Encoder* const enc,
   assert(filter >= WEBP_FILTER_NONE && filter <= WEBP_FILTER_FAST);
 
   if (quality < 0 || quality > 100) {
-    return 0;
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
   }
 
   if (method < ALPHA_NO_COMPRESSION || method > ALPHA_LOSSLESS_COMPRESSION) {
-    return 0;
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
   }
 
   if (method == ALPHA_NO_COMPRESSION) {
@@ -326,7 +330,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
 
   quant_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
   if (quant_alpha == NULL) {
-    return 0;
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
 
   // Extract alpha data (width x height) from raw_data (stride x height).
@@ -346,6 +350,9 @@ static int EncodeAlpha(VP8Encoder* const enc,
     ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
                                filter, reduce_levels, effort_level, output,
                                output_size, pic->stats);
+    if (!ok) {
+      WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);  // imprecise
+    }
 #if !defined(WEBP_DISABLE_STATS)
     if (pic->stats != NULL) {  // need stats?
       pic->stats->coded_size += (int)(*output_size);
@@ -361,7 +368,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
 //------------------------------------------------------------------------------
 // Main calls
 
-static int CompressAlphaJob(void* arg1, void* dummy) {
+static int CompressAlphaJob(void* arg1, void* unused) {
   VP8Encoder* const enc = (VP8Encoder*)arg1;
   const WebPConfig* config = enc->config_;
   uint8_t* alpha_data = NULL;
@@ -375,13 +382,13 @@ static int CompressAlphaJob(void* arg1, void* dummy) {
                    filter, effort_level, &alpha_data, &alpha_size)) {
     return 0;
   }
-  if (alpha_size != (uint32_t)alpha_size) {  // Sanity check.
+  if (alpha_size != (uint32_t)alpha_size) {  // Soundness check.
     WebPSafeFree(alpha_data);
     return 0;
   }
   enc->alpha_data_size_ = (uint32_t)alpha_size;
   enc->alpha_data_ = alpha_data;
-  (void)dummy;
+  (void)unused;
   return 1;
 }
 
@@ -405,7 +412,7 @@ int VP8EncStartAlpha(VP8Encoder* const enc) {
       WebPWorker* const worker = &enc->alpha_worker_;
       // Makes sure worker is good to go.
       if (!WebPGetWorkerInterface()->Reset(worker)) {
-        return 0;
+        return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
       }
       WebPGetWorkerInterface()->Launch(worker);
       return 1;
diff --git a/3rdparty/libwebp/src/enc/analysis_enc.c b/3rdparty/libwebp/src/enc/analysis_enc.c
index ebb784261c63..962eaa998f87 100644
--- a/3rdparty/libwebp/src/enc/analysis_enc.c
+++ b/3rdparty/libwebp/src/enc/analysis_enc.c
@@ -391,12 +391,14 @@ static int DoSegmentsJob(void* arg1, void* arg2) {
   return ok;
 }
 
+#ifdef WEBP_USE_THREAD
 static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
   int i;
   for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i];
   dst->alpha += src->alpha;
   dst->uv_alpha += src->uv_alpha;
 }
+#endif
 
 // initialize the job struct with some tasks to perform
 static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
@@ -425,10 +427,10 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
       (enc->method_ <= 1);  // for method 0 - 1, we need preds_[] to be filled.
   if (do_segments) {
     const int last_row = enc->mb_h_;
-    // We give a little more than a half work to the main thread.
-    const int split_row = (9 * last_row + 15) >> 4;
     const int total_mb = last_row * enc->mb_w_;
 #ifdef WEBP_USE_THREAD
+    // We give a little more than a half work to the main thread.
+    const int split_row = (9 * last_row + 15) >> 4;
     const int kMinSplitRow = 2;  // minimal rows needed for mt to be worth it
     const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
 #else
@@ -438,6 +440,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
         WebPGetWorkerInterface();
     SegmentJob main_job;
     if (do_mt) {
+#ifdef WEBP_USE_THREAD
       SegmentJob side_job;
       // Note the use of '&' instead of '&&' because we must call the functions
       // no matter what.
@@ -455,6 +458,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
       }
       worker_interface->End(&side_job.worker);
       if (ok) MergeJobs(&side_job, &main_job);  // merge results together
+#endif  // WEBP_USE_THREAD
     } else {
       // Even for single-thread case, we use the generic Worker tools.
       InitSegmentJob(enc, &main_job, 0, last_row);
@@ -470,6 +474,10 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
   } else {   // Use only one default segment.
     ResetAllMBInfo(enc);
   }
+  if (!ok) {
+    return WebPEncodingSetError(enc->pic_,
+                                VP8_ENC_ERROR_OUT_OF_MEMORY);  // imprecise
+  }
   return ok;
 }
 
diff --git a/3rdparty/libwebp/src/enc/backward_references_cost_enc.c b/3rdparty/libwebp/src/enc/backward_references_cost_enc.c
index 516abd73eb45..6968ef3c9f3e 100644
--- a/3rdparty/libwebp/src/enc/backward_references_cost_enc.c
+++ b/3rdparty/libwebp/src/enc/backward_references_cost_enc.c
@@ -15,10 +15,11 @@
 //
 
 #include <assert.h>
+#include <float.h>
 
+#include "src/dsp/lossless_common.h"
 #include "src/enc/backward_references_enc.h"
 #include "src/enc/histogram_enc.h"
-#include "src/dsp/lossless_common.h"
 #include "src/utils/color_cache_utils.h"
 #include "src/utils/utils.h"
 
@@ -30,15 +31,15 @@ extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
                                       const PixOrCopy v);
 
 typedef struct {
-  double alpha_[VALUES_IN_BYTE];
-  double red_[VALUES_IN_BYTE];
-  double blue_[VALUES_IN_BYTE];
-  double distance_[NUM_DISTANCE_CODES];
-  double* literal_;
+  float alpha_[VALUES_IN_BYTE];
+  float red_[VALUES_IN_BYTE];
+  float blue_[VALUES_IN_BYTE];
+  float distance_[NUM_DISTANCE_CODES];
+  float* literal_;
 } CostModel;
 
 static void ConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const uint32_t population_counts[], double output[]) {
+    int num_symbols, const uint32_t population_counts[], float output[]) {
   uint32_t sum = 0;
   int nonzeros = 0;
   int i;
@@ -51,7 +52,7 @@ static void ConvertPopulationCountTableToBitEstimates(
   if (nonzeros <= 1) {
     memset(output, 0, num_symbols * sizeof(*output));
   } else {
-    const double logsum = VP8LFastLog2(sum);
+    const float logsum = VP8LFastLog2(sum);
     for (i = 0; i < num_symbols; ++i) {
       output[i] = logsum - VP8LFastLog2(population_counts[i]);
     }
@@ -75,8 +76,8 @@ static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
   }
 
   ConvertPopulationCountTableToBitEstimates(
-      VP8LHistogramNumCodes(histo->palette_code_bits_),
-      histo->literal_, m->literal_);
+      VP8LHistogramNumCodes(histo->palette_code_bits_), histo->literal_,
+      m->literal_);
   ConvertPopulationCountTableToBitEstimates(
       VALUES_IN_BYTE, histo->red_, m->red_);
   ConvertPopulationCountTableToBitEstimates(
@@ -92,27 +93,27 @@ static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
   return ok;
 }
 
-static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+static WEBP_INLINE float GetLiteralCost(const CostModel* const m, uint32_t v) {
   return m->alpha_[v >> 24] +
          m->red_[(v >> 16) & 0xff] +
          m->literal_[(v >> 8) & 0xff] +
          m->blue_[v & 0xff];
 }
 
-static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
+static WEBP_INLINE float GetCacheCost(const CostModel* const m, uint32_t idx) {
   const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
   return m->literal_[literal_idx];
 }
 
-static WEBP_INLINE double GetLengthCost(const CostModel* const m,
-                                        uint32_t length) {
+static WEBP_INLINE float GetLengthCost(const CostModel* const m,
+                                       uint32_t length) {
   int code, extra_bits;
   VP8LPrefixEncodeBits(length, &code, &extra_bits);
   return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
 }
 
-static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
-                                          uint32_t distance) {
+static WEBP_INLINE float GetDistanceCost(const CostModel* const m,
+                                         uint32_t distance) {
   int code, extra_bits;
   VP8LPrefixEncodeBits(distance, &code, &extra_bits);
   return m->distance_[code] + extra_bits;
@@ -122,20 +123,20 @@ static WEBP_INLINE void AddSingleLiteralWithCostModel(
     const uint32_t* const argb, VP8LColorCache* const hashers,
     const CostModel* const cost_model, int idx, int use_color_cache,
     float prev_cost, float* const cost, uint16_t* const dist_array) {
-  double cost_val = prev_cost;
+  float cost_val = prev_cost;
   const uint32_t color = argb[idx];
   const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
   if (ix >= 0) {
     // use_color_cache is true and hashers contains color
-    const double mul0 = 0.68;
+    const float mul0 = 0.68f;
     cost_val += GetCacheCost(cost_model, ix) * mul0;
   } else {
-    const double mul1 = 0.82;
+    const float mul1 = 0.82f;
     if (use_color_cache) VP8LColorCacheInsert(hashers, color);
     cost_val += GetLiteralCost(cost_model, color) * mul1;
   }
   if (cost[idx] > cost_val) {
-    cost[idx] = (float)cost_val;
+    cost[idx] = cost_val;
     dist_array[idx] = 1;  // only one is inserted.
   }
 }
@@ -172,7 +173,7 @@ struct CostInterval {
 
 // The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
 typedef struct {
-  double cost_;
+  float cost_;
   int start_;
   int end_;       // Exclusive.
 } CostCacheInterval;
@@ -187,7 +188,7 @@ typedef struct {
   int count_;  // The number of stored intervals.
   CostCacheInterval* cache_intervals_;
   size_t cache_intervals_size_;
-  double cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
+  float cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
   float* costs_;
   uint16_t* dist_array_;
   // Most of the time, we only need few intervals -> use a free-list, to avoid
@@ -262,10 +263,13 @@ static int CostManagerInit(CostManager* const manager,
   CostManagerInitFreeList(manager);
 
   // Fill in the cost_cache_.
+  // Has to be done in two passes due to a GCC bug on i686
+  // related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
+  for (i = 0; i < cost_cache_size; ++i) {
+    manager->cost_cache_[i] = GetLengthCost(cost_model, i);
+  }
   manager->cache_intervals_size_ = 1;
-  manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
   for (i = 1; i < cost_cache_size; ++i) {
-    manager->cost_cache_[i] = GetLengthCost(cost_model, i);
     // Get the number of bound intervals.
     if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
       ++manager->cache_intervals_size_;
@@ -294,7 +298,7 @@ static int CostManagerInit(CostManager* const manager,
     cur->end_ = 1;
     cur->cost_ = manager->cost_cache_[0];
     for (i = 1; i < cost_cache_size; ++i) {
-      const double cost_val = manager->cost_cache_[i];
+      const float cost_val = manager->cost_cache_[i];
       if (cost_val != cur->cost_) {
         ++cur;
         // Initialize an interval.
@@ -303,6 +307,8 @@ static int CostManagerInit(CostManager* const manager,
       }
       cur->end_ = i + 1;
     }
+    assert((size_t)(cur - manager->cache_intervals_) + 1 ==
+           manager->cache_intervals_size_);
   }
 
   manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
@@ -311,7 +317,7 @@ static int CostManagerInit(CostManager* const manager,
     return 0;
   }
   // Set the initial costs_ high for every pixel as we will keep the minimum.
-  for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
+  for (i = 0; i < pix_count; ++i) manager->costs_[i] = FLT_MAX;
 
   return 1;
 }
@@ -457,7 +463,7 @@ static WEBP_INLINE void InsertInterval(CostManager* const manager,
 // If handling the interval or one of its subintervals becomes to heavy, its
 // contribution is added to the costs right away.
 static WEBP_INLINE void PushInterval(CostManager* const manager,
-                                     double distance_cost, int position,
+                                     float distance_cost, int position,
                                      int len) {
   size_t i;
   CostInterval* interval = manager->head_;
@@ -474,7 +480,7 @@ static WEBP_INLINE void PushInterval(CostManager* const manager,
       const int k = j - position;
       float cost_tmp;
       assert(k >= 0 && k < MAX_LENGTH);
-      cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
+      cost_tmp = distance_cost + manager->cost_cache_[k];
 
       if (manager->costs_[j] > cost_tmp) {
         manager->costs_[j] = cost_tmp;
@@ -492,7 +498,7 @@ static WEBP_INLINE void PushInterval(CostManager* const manager,
     const int end = position + (cost_cache_intervals[i].end_ > len
                                  ? len
                                  : cost_cache_intervals[i].end_);
-    const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
+    const float cost = distance_cost + cost_cache_intervals[i].cost_;
 
     for (; interval != NULL && interval->start_ < end;
          interval = interval_next) {
@@ -570,22 +576,21 @@ static int BackwardReferencesHashChainDistanceOnly(
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
   const size_t literal_array_size =
-      sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
-                        ((cache_bits > 0) ? (1 << cache_bits) : 0));
+      sizeof(float) * (VP8LHistogramNumCodes(cache_bits));
   const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
   CostModel* const cost_model =
       (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
   VP8LColorCache hashers;
   CostManager* cost_manager =
-      (CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
+      (CostManager*)WebPSafeCalloc(1ULL, sizeof(*cost_manager));
   int offset_prev = -1, len_prev = -1;
-  double offset_cost = -1;
+  float offset_cost = -1.f;
   int first_offset_is_constant = -1;  // initialized with 'impossible' value
   int reach = 0;
 
   if (cost_model == NULL || cost_manager == NULL) goto Error;
 
-  cost_model->literal_ = (double*)(cost_model + 1);
+  cost_model->literal_ = (float*)(cost_model + 1);
   if (use_color_cache) {
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     if (!cc_init) goto Error;
@@ -675,7 +680,7 @@ static int BackwardReferencesHashChainDistanceOnly(
   }
 
   ok = !refs->error_;
-Error:
+ Error:
   if (cc_init) VP8LColorCacheClear(&hashers);
   CostManagerClear(cost_manager);
   WebPSafeFree(cost_model);
diff --git a/3rdparty/libwebp/src/enc/backward_references_enc.c b/3rdparty/libwebp/src/enc/backward_references_enc.c
index 519b36a09153..dc98bf171943 100644
--- a/3rdparty/libwebp/src/enc/backward_references_enc.c
+++ b/3rdparty/libwebp/src/enc/backward_references_enc.c
@@ -10,6 +10,8 @@
 // Author: Jyrki Alakuijala (jyrki@google.com)
 //
 
+#include "src/enc/backward_references_enc.h"
+
 #include <assert.h>
 #include <float.h>
 #include <math.h>
@@ -17,10 +19,11 @@
 #include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
-#include "src/enc/backward_references_enc.h"
 #include "src/enc/histogram_enc.h"
+#include "src/enc/vp8i_enc.h"
 #include "src/utils/color_cache_utils.h"
 #include "src/utils/utils.h"
+#include "src/webp/encode.h"
 
 #define MIN_BLOCK_SIZE 256  // minimum block size for backward references
 
@@ -255,10 +258,13 @@ static WEBP_INLINE int MaxFindCopyLength(int len) {
 
 int VP8LHashChainFill(VP8LHashChain* const p, int quality,
                       const uint32_t* const argb, int xsize, int ysize,
-                      int low_effort) {
+                      int low_effort, const WebPPicture* const pic,
+                      int percent_range, int* const percent) {
   const int size = xsize * ysize;
   const int iter_max = GetMaxItersForQuality(quality);
   const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
+  int remaining_percent = percent_range;
+  int percent_start = *percent;
   int pos;
   int argb_comp;
   uint32_t base_position;
@@ -276,7 +282,12 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
 
   hash_to_first_index =
       (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
-  if (hash_to_first_index == NULL) return 0;
+  if (hash_to_first_index == NULL) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
+
+  percent_range = remaining_percent / 2;
+  remaining_percent -= percent_range;
 
   // Set the int32_t array to -1.
   memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
@@ -323,12 +334,22 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
       hash_to_first_index[hash_code] = pos++;
       argb_comp = argb_comp_next;
     }
+
+    if (!WebPReportProgress(
+            pic, percent_start + percent_range * pos / (size - 2), percent)) {
+      WebPSafeFree(hash_to_first_index);
+      return 0;
+    }
   }
   // Process the penultimate pixel.
   chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
 
   WebPSafeFree(hash_to_first_index);
 
+  percent_start += percent_range;
+  if (!WebPReportProgress(pic, percent_start, percent)) return 0;
+  percent_range = remaining_percent;
+
   // Find the best match interval at each pixel, defined by an offset to the
   // pixel and a length. The right-most pixel cannot match anything to the right
   // (hence a best length of 0) and the left-most pixel nothing to the left
@@ -417,8 +438,17 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
         max_base_position = base_position;
       }
     }
+
+    if (!WebPReportProgress(pic,
+                            percent_start + percent_range *
+                                                (size - 2 - base_position) /
+                                                (size - 2),
+                            percent)) {
+      return 0;
+    }
   }
-  return 1;
+
+  return WebPReportProgress(pic, percent_start + percent_range, percent);
 }
 
 static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
@@ -728,7 +758,7 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
                                   int* const best_cache_bits) {
   int i;
   const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
-  double entropy_min = MAX_ENTROPY;
+  float entropy_min = MAX_ENTROPY;
   int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
   VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
@@ -813,14 +843,14 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
   }
 
   for (i = 0; i <= cache_bits_max; ++i) {
-    const double entropy = VP8LHistogramEstimateBits(histos[i]);
+    const float entropy = VP8LHistogramEstimateBits(histos[i]);
     if (i == 0 || entropy < entropy_min) {
       entropy_min = entropy;
       *best_cache_bits = i;
     }
   }
   ok = 1;
-Error:
+ Error:
   for (i = 0; i <= cache_bits_max; ++i) {
     if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
     VP8LFreeHistogram(histos[i]);
@@ -890,7 +920,7 @@ static int GetBackwardReferences(int width, int height,
   int i, lz77_type;
   // Index 0 is for a color cache, index 1 for no cache (if needed).
   int lz77_types_best[2] = {0, 0};
-  double bit_costs_best[2] = {DBL_MAX, DBL_MAX};
+  float bit_costs_best[2] = {FLT_MAX, FLT_MAX};
   VP8LHashChain hash_chain_box;
   VP8LBackwardRefs* const refs_tmp = &refs[do_no_cache ? 2 : 1];
   int status = 0;
@@ -902,7 +932,7 @@ static int GetBackwardReferences(int width, int height,
   for (lz77_type = 1; lz77_types_to_try;
        lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
     int res = 0;
-    double bit_cost = 0.;
+    float bit_cost = 0.f;
     if ((lz77_types_to_try & lz77_type) == 0) continue;
     switch (lz77_type) {
       case kLZ77RLE:
@@ -976,15 +1006,16 @@ static int GetBackwardReferences(int width, int height,
       const VP8LHashChain* const hash_chain_tmp =
           (lz77_types_best[i] == kLZ77Standard) ? hash_chain : &hash_chain_box;
       const int cache_bits = (i == 1) ? 0 : *cache_bits_best;
-      if (VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
-                                               hash_chain_tmp, &refs[i],
-                                               refs_tmp)) {
-        double bit_cost_trace;
-        VP8LHistogramCreate(histo, refs_tmp, cache_bits);
-        bit_cost_trace = VP8LHistogramEstimateBits(histo);
-        if (bit_cost_trace < bit_costs_best[i]) {
-          BackwardRefsSwap(refs_tmp, &refs[i]);
-        }
+      float bit_cost_trace;
+      if (!VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
+                                                hash_chain_tmp, &refs[i],
+                                                refs_tmp)) {
+        goto Error;
+      }
+      VP8LHistogramCreate(histo, refs_tmp, cache_bits);
+      bit_cost_trace = VP8LHistogramEstimateBits(histo);
+      if (bit_cost_trace < bit_costs_best[i]) {
+        BackwardRefsSwap(refs_tmp, &refs[i]);
       }
     }
 
@@ -1000,31 +1031,35 @@ static int GetBackwardReferences(int width, int height,
   }
   status = 1;
 
-Error:
+ Error:
   VP8LHashChainClear(&hash_chain_box);
   VP8LFreeHistogram(histo);
   return status;
 }
 
-WebPEncodingError VP8LGetBackwardReferences(
+int VP8LGetBackwardReferences(
     int width, int height, const uint32_t* const argb, int quality,
     int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
     const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
-    int* const cache_bits_best) {
+    int* const cache_bits_best, const WebPPicture* const pic, int percent_range,
+    int* const percent) {
   if (low_effort) {
     VP8LBackwardRefs* refs_best;
     *cache_bits_best = cache_bits_max;
     refs_best = GetBackwardReferencesLowEffort(
         width, height, argb, cache_bits_best, hash_chain, refs);
-    if (refs_best == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+    if (refs_best == NULL) {
+      return WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    }
     // Set it in first position.
     BackwardRefsSwap(refs_best, &refs[0]);
   } else {
     if (!GetBackwardReferences(width, height, argb, quality, lz77_types_to_try,
                                cache_bits_max, do_no_cache, hash_chain, refs,
                                cache_bits_best)) {
-      return VP8_ENC_ERROR_OUT_OF_MEMORY;
+      return WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     }
   }
-  return VP8_ENC_OK;
+
+  return WebPReportProgress(pic, *percent + percent_range, percent);
 }
diff --git a/3rdparty/libwebp/src/enc/backward_references_enc.h b/3rdparty/libwebp/src/enc/backward_references_enc.h
index 4c0267b41e90..4dff1c27b57c 100644
--- a/3rdparty/libwebp/src/enc/backward_references_enc.h
+++ b/3rdparty/libwebp/src/enc/backward_references_enc.h
@@ -134,10 +134,11 @@ struct VP8LHashChain {
 
 // Must be called first, to set size.
 int VP8LHashChainInit(VP8LHashChain* const p, int size);
-// Pre-compute the best matches for argb.
+// Pre-compute the best matches for argb. pic and percent are for progress.
 int VP8LHashChainFill(VP8LHashChain* const p, int quality,
                       const uint32_t* const argb, int xsize, int ysize,
-                      int low_effort);
+                      int low_effort, const WebPPicture* const pic,
+                      int percent_range, int* const percent);
 void VP8LHashChainClear(VP8LHashChain* const p);  // release memory
 
 static WEBP_INLINE int VP8LHashChainFindOffset(const VP8LHashChain* const p,
@@ -227,11 +228,14 @@ enum VP8LLZ77Type {
 // VP8LBackwardRefs is put in the first element, the best value with no-cache in
 // the second element.
 // In both cases, the last element is used as temporary internally.
-WebPEncodingError VP8LGetBackwardReferences(
+// pic and percent are for progress.
+// Returns false in case of error (stored in pic->error_code).
+int VP8LGetBackwardReferences(
     int width, int height, const uint32_t* const argb, int quality,
     int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
     const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
-    int* const cache_bits_best);
+    int* const cache_bits_best, const WebPPicture* const pic, int percent_range,
+    int* const percent);
 
 #ifdef __cplusplus
 }
diff --git a/3rdparty/libwebp/src/enc/frame_enc.c b/3rdparty/libwebp/src/enc/frame_enc.c
index af538d83bacd..01860ca757e6 100644
--- a/3rdparty/libwebp/src/enc/frame_enc.c
+++ b/3rdparty/libwebp/src/enc/frame_enc.c
@@ -578,7 +578,7 @@ static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
   uint64_t size = 0;
   uint64_t size_p0 = 0;
   uint64_t distortion = 0;
-  const uint64_t pixel_count = nb_mbs * 384;
+  const uint64_t pixel_count = (uint64_t)nb_mbs * 384;
 
   VP8IteratorInit(enc, &it);
   SetLoopParams(enc, s->q);
@@ -689,7 +689,7 @@ static int PreLoopInitialize(VP8Encoder* const enc) {
   }
   if (!ok) {
     VP8EncFreeBitWriters(enc);  // malloc error occurred
-    WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
   return ok;
 }
@@ -719,6 +719,7 @@ static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
   } else {
     // Something bad happened -> need to do some memory cleanup.
     VP8EncFreeBitWriters(enc);
+    return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
   return ok;
 }
@@ -754,6 +755,11 @@ int VP8EncLoop(VP8Encoder* const enc) {
     // *then* decide how to code the skip decision if there's one.
     if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
       CodeResiduals(it.bw_, &it, &info);
+      if (it.bw_->error_) {
+        // enc->pic_->error_code is set in PostLoopFinalize().
+        ok = 0;
+        break;
+      }
     } else {   // reset predictors after a skip
       ResetAfterSkip(&it);
     }
@@ -778,11 +784,12 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
   // Roughly refresh the proba eight times per pass
   int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
   int num_pass_left = enc->config_->pass;
+  int remaining_progress = 40;  // percents
   const int do_search = enc->do_search_;
   VP8EncIterator it;
   VP8EncProba* const proba = &enc->proba_;
   const VP8RDLevel rd_opt = enc->rd_opt_level_;
-  const uint64_t pixel_count = enc->mb_w_ * enc->mb_h_ * 384;
+  const uint64_t pixel_count = (uint64_t)enc->mb_w_ * enc->mb_h_ * 384;
   PassStats stats;
   int ok;
 
@@ -805,6 +812,9 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
     uint64_t size_p0 = 0;
     uint64_t distortion = 0;
     int cnt = max_count;
+    // The final number of passes is not trivial to know in advance.
+    const int pass_progress = remaining_progress / (2 + num_pass_left);
+    remaining_progress -= pass_progress;
     VP8IteratorInit(enc, &it);
     SetLoopParams(enc, stats.q);
     if (is_last_pass) {
@@ -832,7 +842,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
         StoreSideInfo(&it);
         VP8StoreFilterStats(&it);
         VP8IteratorExport(&it);
-        ok = VP8IteratorProgress(&it, 20);
+        ok = VP8IteratorProgress(&it, pass_progress);
       }
       VP8IteratorSaveBoundary(&it);
     } while (ok && VP8IteratorNext(&it));
@@ -878,7 +888,8 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
     ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
                        (const uint8_t*)proba->coeffs_, 1);
   }
-  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + remaining_progress,
+                                &enc->percent_);
   return PostLoopFinalize(&it, ok);
 }
 
diff --git a/3rdparty/libwebp/src/enc/histogram_enc.c b/3rdparty/libwebp/src/enc/histogram_enc.c
index edc6e4faa43b..3ca67b3ad09b 100644
--- a/3rdparty/libwebp/src/enc/histogram_enc.c
+++ b/3rdparty/libwebp/src/enc/histogram_enc.c
@@ -13,15 +13,17 @@
 #include "src/webp/config.h"
 #endif
 
+#include <float.h>
 #include <math.h>
 
-#include "src/enc/backward_references_enc.h"
-#include "src/enc/histogram_enc.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/enc/vp8i_enc.h"
 #include "src/utils/utils.h"
 
-#define MAX_COST 1.e38
+#define MAX_BIT_COST FLT_MAX
 
 // Number of partitions for the three dominant (literal, red and blue) symbol
 // costs.
@@ -228,8 +230,8 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
 // -----------------------------------------------------------------------------
 // Entropy-related functions.
 
-static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
-  double mix;
+static WEBP_INLINE float BitsEntropyRefine(const VP8LBitEntropy* entropy) {
+  float mix;
   if (entropy->nonzeros < 5) {
     if (entropy->nonzeros <= 1) {
       return 0;
@@ -238,67 +240,67 @@ static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
     // Let's mix in a bit of entropy to favor good clustering when
     // distributions of these are combined.
     if (entropy->nonzeros == 2) {
-      return 0.99 * entropy->sum + 0.01 * entropy->entropy;
+      return 0.99f * entropy->sum + 0.01f * entropy->entropy;
     }
     // No matter what the entropy says, we cannot be better than min_limit
     // with Huffman coding. I am mixing a bit of entropy into the
     // min_limit since it produces much better (~0.5 %) compression results
     // perhaps because of better entropy clustering.
     if (entropy->nonzeros == 3) {
-      mix = 0.95;
+      mix = 0.95f;
     } else {
-      mix = 0.7;  // nonzeros == 4.
+      mix = 0.7f;  // nonzeros == 4.
     }
   } else {
-    mix = 0.627;
+    mix = 0.627f;
   }
 
   {
-    double min_limit = 2 * entropy->sum - entropy->max_val;
-    min_limit = mix * min_limit + (1.0 - mix) * entropy->entropy;
+    float min_limit = 2.f * entropy->sum - entropy->max_val;
+    min_limit = mix * min_limit + (1.f - mix) * entropy->entropy;
     return (entropy->entropy < min_limit) ? min_limit : entropy->entropy;
   }
 }
 
-double VP8LBitsEntropy(const uint32_t* const array, int n) {
+float VP8LBitsEntropy(const uint32_t* const array, int n) {
   VP8LBitEntropy entropy;
   VP8LBitsEntropyUnrefined(array, n, &entropy);
 
   return BitsEntropyRefine(&entropy);
 }
 
-static double InitialHuffmanCost(void) {
+static float InitialHuffmanCost(void) {
   // Small bias because Huffman code length is typically not stored in
   // full length.
   static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
-  static const double kSmallBias = 9.1;
+  static const float kSmallBias = 9.1f;
   return kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
 }
 
 // Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
-static double FinalHuffmanCost(const VP8LStreaks* const stats) {
+static float FinalHuffmanCost(const VP8LStreaks* const stats) {
   // The constants in this function are experimental and got rounded from
   // their original values in 1/8 when switched to 1/1024.
-  double retval = InitialHuffmanCost();
+  float retval = InitialHuffmanCost();
   // Second coefficient: Many zeros in the histogram are covered efficiently
   // by a run-length encode. Originally 2/8.
-  retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1];
+  retval += stats->counts[0] * 1.5625f + 0.234375f * stats->streaks[0][1];
   // Second coefficient: Constant values are encoded less efficiently, but still
   // RLE'ed. Originally 6/8.
-  retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1];
+  retval += stats->counts[1] * 2.578125f + 0.703125f * stats->streaks[1][1];
   // 0s are usually encoded more efficiently than non-0s.
   // Originally 15/8.
-  retval += 1.796875 * stats->streaks[0][0];
+  retval += 1.796875f * stats->streaks[0][0];
   // Originally 26/8.
-  retval += 3.28125 * stats->streaks[1][0];
+  retval += 3.28125f * stats->streaks[1][0];
   return retval;
 }
 
 // Get the symbol entropy for the distribution 'population'.
 // Set 'trivial_sym', if there's only one symbol present in the distribution.
-static double PopulationCost(const uint32_t* const population, int length,
-                             uint32_t* const trivial_sym,
-                             uint8_t* const is_used) {
+static float PopulationCost(const uint32_t* const population, int length,
+                            uint32_t* const trivial_sym,
+                            uint8_t* const is_used) {
   VP8LBitEntropy bit_entropy;
   VP8LStreaks stats;
   VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats);
@@ -314,11 +316,10 @@ static double PopulationCost(const uint32_t* const population, int length,
 
 // trivial_at_end is 1 if the two histograms only have one element that is
 // non-zero: both the zero-th one, or both the last one.
-static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
-                                             const uint32_t* const Y,
-                                             int length, int is_X_used,
-                                             int is_Y_used,
-                                             int trivial_at_end) {
+static WEBP_INLINE float GetCombinedEntropy(const uint32_t* const X,
+                                            const uint32_t* const Y, int length,
+                                            int is_X_used, int is_Y_used,
+                                            int trivial_at_end) {
   VP8LStreaks stats;
   if (trivial_at_end) {
     // This configuration is due to palettization that transforms an indexed
@@ -356,16 +357,18 @@ static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
 }
 
 // Estimates the Entropy + Huffman + other block overhead size cost.
-double VP8LHistogramEstimateBits(VP8LHistogram* const p) {
-  return
-      PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_),
-                     NULL, &p->is_used_[0])
-      + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1])
-      + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2])
-      + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3])
-      + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL, &p->is_used_[4])
-      + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
-      + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
+float VP8LHistogramEstimateBits(VP8LHistogram* const p) {
+  return PopulationCost(p->literal_,
+                        VP8LHistogramNumCodes(p->palette_code_bits_), NULL,
+                        &p->is_used_[0]) +
+         PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1]) +
+         PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2]) +
+         PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3]) +
+         PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL,
+                        &p->is_used_[4]) +
+         (float)VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES,
+                              NUM_LENGTH_CODES) +
+         (float)VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
 }
 
 // -----------------------------------------------------------------------------
@@ -373,17 +376,16 @@ double VP8LHistogramEstimateBits(VP8LHistogram* const p) {
 
 static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
                                        const VP8LHistogram* const b,
-                                       double cost_threshold,
-                                       double* cost) {
+                                       float cost_threshold, float* cost) {
   const int palette_code_bits = a->palette_code_bits_;
   int trivial_at_end = 0;
   assert(a->palette_code_bits_ == b->palette_code_bits_);
   *cost += GetCombinedEntropy(a->literal_, b->literal_,
                               VP8LHistogramNumCodes(palette_code_bits),
                               a->is_used_[0], b->is_used_[0], 0);
-  *cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
-                                 b->literal_ + NUM_LITERAL_CODES,
-                                 NUM_LENGTH_CODES);
+  *cost += (float)VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
+                                        b->literal_ + NUM_LITERAL_CODES,
+                                        NUM_LENGTH_CODES);
   if (*cost > cost_threshold) return 0;
 
   if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
@@ -417,8 +419,8 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
   *cost +=
       GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
                          a->is_used_[4], b->is_used_[4], 0);
-  *cost +=
-      VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES);
+  *cost += (float)VP8LExtraCostCombined(a->distance_, b->distance_,
+                                        NUM_DISTANCE_CODES);
   if (*cost > cost_threshold) return 0;
 
   return 1;
@@ -439,12 +441,11 @@ static WEBP_INLINE void HistogramAdd(const VP8LHistogram* const a,
 // Since the previous score passed is 'cost_threshold', we only need to compare
 // the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail-out
 // early.
-static double HistogramAddEval(const VP8LHistogram* const a,
-                               const VP8LHistogram* const b,
-                               VP8LHistogram* const out,
-                               double cost_threshold) {
-  double cost = 0;
-  const double sum_cost = a->bit_cost_ + b->bit_cost_;
+static float HistogramAddEval(const VP8LHistogram* const a,
+                              const VP8LHistogram* const b,
+                              VP8LHistogram* const out, float cost_threshold) {
+  float cost = 0;
+  const float sum_cost = a->bit_cost_ + b->bit_cost_;
   cost_threshold += sum_cost;
 
   if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
@@ -459,10 +460,10 @@ static double HistogramAddEval(const VP8LHistogram* const a,
 // Same as HistogramAddEval(), except that the resulting histogram
 // is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
 // the term C(b) which is constant over all the evaluations.
-static double HistogramAddThresh(const VP8LHistogram* const a,
-                                 const VP8LHistogram* const b,
-                                 double cost_threshold) {
-  double cost;
+static float HistogramAddThresh(const VP8LHistogram* const a,
+                                const VP8LHistogram* const b,
+                                float cost_threshold) {
+  float cost;
   assert(a != NULL && b != NULL);
   cost = -a->bit_cost_;
   GetCombinedHistogramEntropy(a, b, cost_threshold, &cost);
@@ -473,24 +474,22 @@ static double HistogramAddThresh(const VP8LHistogram* const a,
 
 // The structure to keep track of cost range for the three dominant entropy
 // symbols.
-// TODO(skal): Evaluate if float can be used here instead of double for
-// representing the entropy costs.
 typedef struct {
-  double literal_max_;
-  double literal_min_;
-  double red_max_;
-  double red_min_;
-  double blue_max_;
-  double blue_min_;
+  float literal_max_;
+  float literal_min_;
+  float red_max_;
+  float red_min_;
+  float blue_max_;
+  float blue_min_;
 } DominantCostRange;
 
 static void DominantCostRangeInit(DominantCostRange* const c) {
   c->literal_max_ = 0.;
-  c->literal_min_ = MAX_COST;
+  c->literal_min_ = MAX_BIT_COST;
   c->red_max_ = 0.;
-  c->red_min_ = MAX_COST;
+  c->red_min_ = MAX_BIT_COST;
   c->blue_max_ = 0.;
-  c->blue_min_ = MAX_COST;
+  c->blue_min_ = MAX_BIT_COST;
 }
 
 static void UpdateDominantCostRange(
@@ -505,16 +504,15 @@ static void UpdateDominantCostRange(
 
 static void UpdateHistogramCost(VP8LHistogram* const h) {
   uint32_t alpha_sym, red_sym, blue_sym;
-  const double alpha_cost =
-      PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym,
-                     &h->is_used_[3]);
-  const double distance_cost =
+  const float alpha_cost =
+      PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]);
+  const float distance_cost =
       PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
-      VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
+      (float)VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
   const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
   h->literal_cost_ =
       PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
-          VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES);
+      (float)VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES);
   h->red_cost_ =
       PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
   h->blue_cost_ =
@@ -529,10 +527,10 @@ static void UpdateHistogramCost(VP8LHistogram* const h) {
   }
 }
 
-static int GetBinIdForEntropy(double min, double max, double val) {
-  const double range = max - min;
+static int GetBinIdForEntropy(float min, float max, float val) {
+  const float range = max - min;
   if (range > 0.) {
-    const double delta = val - min;
+    const float delta = val - min;
     return (int)((NUM_PARTITIONS - 1e-6) * delta / range);
   } else {
     return 0;
@@ -641,15 +639,11 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
 
 // Merges some histograms with same bin_id together if it's advantageous.
 // Sets the remaining histograms to NULL.
-static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
-                                       int* num_used,
-                                       const uint16_t* const clusters,
-                                       uint16_t* const cluster_mappings,
-                                       VP8LHistogram* cur_combo,
-                                       const uint16_t* const bin_map,
-                                       int num_bins,
-                                       double combine_cost_factor,
-                                       int low_effort) {
+static void HistogramCombineEntropyBin(
+    VP8LHistogramSet* const image_histo, int* num_used,
+    const uint16_t* const clusters, uint16_t* const cluster_mappings,
+    VP8LHistogram* cur_combo, const uint16_t* const bin_map, int num_bins,
+    float combine_cost_factor, int low_effort) {
   VP8LHistogram** const histograms = image_histo->histograms;
   int idx;
   struct {
@@ -679,11 +673,10 @@ static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
       cluster_mappings[clusters[idx]] = clusters[first];
     } else {
       // try to merge #idx into #first (both share the same bin_id)
-      const double bit_cost = histograms[idx]->bit_cost_;
-      const double bit_cost_thresh = -bit_cost * combine_cost_factor;
-      const double curr_cost_diff =
-          HistogramAddEval(histograms[first], histograms[idx],
-                           cur_combo, bit_cost_thresh);
+      const float bit_cost = histograms[idx]->bit_cost_;
+      const float bit_cost_thresh = -bit_cost * combine_cost_factor;
+      const float curr_cost_diff = HistogramAddEval(
+          histograms[first], histograms[idx], cur_combo, bit_cost_thresh);
       if (curr_cost_diff < bit_cost_thresh) {
         // Try to merge two histograms only if the combo is a trivial one or
         // the two candidate histograms are already non-trivial.
@@ -731,8 +724,8 @@ static uint32_t MyRand(uint32_t* const seed) {
 typedef struct {
   int idx1;
   int idx2;
-  double cost_diff;
-  double cost_combo;
+  float cost_diff;
+  float cost_combo;
 } HistogramPair;
 
 typedef struct {
@@ -787,10 +780,9 @@ static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
 // Update the cost diff and combo of a pair of histograms. This needs to be
 // called when the the histograms have been merged with a third one.
 static void HistoQueueUpdatePair(const VP8LHistogram* const h1,
-                                 const VP8LHistogram* const h2,
-                                 double threshold,
+                                 const VP8LHistogram* const h2, float threshold,
                                  HistogramPair* const pair) {
-  const double sum_cost = h1->bit_cost_ + h2->bit_cost_;
+  const float sum_cost = h1->bit_cost_ + h2->bit_cost_;
   pair->cost_combo = 0.;
   GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair->cost_combo);
   pair->cost_diff = pair->cost_combo - sum_cost;
@@ -799,9 +791,9 @@ static void HistoQueueUpdatePair(const VP8LHistogram* const h1,
 // Create a pair from indices "idx1" and "idx2" provided its cost
 // is inferior to "threshold", a negative entropy.
 // It returns the cost of the pair, or 0. if it superior to threshold.
-static double HistoQueuePush(HistoQueue* const histo_queue,
-                             VP8LHistogram** const histograms, int idx1,
-                             int idx2, double threshold) {
+static float HistoQueuePush(HistoQueue* const histo_queue,
+                            VP8LHistogram** const histograms, int idx1,
+                            int idx2, float threshold) {
   const VP8LHistogram* h1;
   const VP8LHistogram* h2;
   HistogramPair pair;
@@ -945,8 +937,8 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
            ++tries_with_no_success < num_tries_no_success;
        ++iter) {
     int* mapping_index;
-    double best_cost =
-        (histo_queue.size == 0) ? 0. : histo_queue.queue[0].cost_diff;
+    float best_cost =
+        (histo_queue.size == 0) ? 0.f : histo_queue.queue[0].cost_diff;
     int best_idx1 = -1, best_idx2 = 1;
     const uint32_t rand_range = (*num_used - 1) * (*num_used);
     // (*num_used) / 2 was chosen empirically. Less means faster but worse
@@ -955,7 +947,7 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
 
     // Pick random samples.
     for (j = 0; *num_used >= 2 && j < num_tries; ++j) {
-      double curr_cost;
+      float curr_cost;
       // Choose two different histograms at random and try to combine them.
       const uint32_t tmp = MyRand(&seed) % rand_range;
       uint32_t idx1 = tmp / (*num_used - 1);
@@ -1034,7 +1026,7 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
   *do_greedy = (*num_used <= min_cluster_size);
   ok = 1;
 
-End:
+ End:
   HistoQueueClear(&histo_queue);
   WebPSafeFree(mappings);
   return ok;
@@ -1057,7 +1049,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
   if (out_size > 1) {
     for (i = 0; i < in_size; ++i) {
       int best_out = 0;
-      double best_bits = MAX_COST;
+      float best_bits = MAX_BIT_COST;
       int k;
       if (in_histo[i] == NULL) {
         // Arbitrarily set to the previous value if unused to help future LZ77.
@@ -1065,7 +1057,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
         continue;
       }
       for (k = 0; k < out_size; ++k) {
-        double cur_bits;
+        float cur_bits;
         cur_bits = HistogramAddThresh(out_histo[k], in_histo[i], best_bits);
         if (k == 0 || cur_bits < best_bits) {
           best_bits = cur_bits;
@@ -1093,13 +1085,13 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
   }
 }
 
-static double GetCombineCostFactor(int histo_size, int quality) {
-  double combine_cost_factor = 0.16;
+static float GetCombineCostFactor(int histo_size, int quality) {
+  float combine_cost_factor = 0.16f;
   if (quality < 90) {
-    if (histo_size > 256) combine_cost_factor /= 2.;
-    if (histo_size > 512) combine_cost_factor /= 2.;
-    if (histo_size > 1024) combine_cost_factor /= 2.;
-    if (quality <= 50) combine_cost_factor /= 2.;
+    if (histo_size > 256) combine_cost_factor /= 2.f;
+    if (histo_size > 512) combine_cost_factor /= 2.f;
+    if (histo_size > 1024) combine_cost_factor /= 2.f;
+    if (quality <= 50) combine_cost_factor /= 2.f;
   }
   return combine_cost_factor;
 }
@@ -1169,15 +1161,17 @@ static void RemoveEmptyHistograms(VP8LHistogramSet* const image_histo) {
 }
 
 int VP8LGetHistoImageSymbols(int xsize, int ysize,
-                             const VP8LBackwardRefs* const refs,
-                             int quality, int low_effort,
-                             int histo_bits, int cache_bits,
+                             const VP8LBackwardRefs* const refs, int quality,
+                             int low_effort, int histogram_bits, int cache_bits,
                              VP8LHistogramSet* const image_histo,
                              VP8LHistogram* const tmp_histo,
-                             uint16_t* const histogram_symbols) {
-  int ok = 0;
-  const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
-  const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
+                             uint16_t* const histogram_symbols,
+                             const WebPPicture* const pic, int percent_range,
+                             int* const percent) {
+  const int histo_xsize =
+      histogram_bits ? VP8LSubSampleSize(xsize, histogram_bits) : 1;
+  const int histo_ysize =
+      histogram_bits ? VP8LSubSampleSize(ysize, histogram_bits) : 1;
   const int image_histo_raw_size = histo_xsize * histo_ysize;
   VP8LHistogramSet* const orig_histo =
       VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
@@ -1187,13 +1181,16 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
   int entropy_combine;
   uint16_t* const map_tmp =
-      WebPSafeMalloc(2 * image_histo_raw_size, sizeof(map_tmp));
+      WebPSafeMalloc(2 * image_histo_raw_size, sizeof(*map_tmp));
   uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size;
   int num_used = image_histo_raw_size;
-  if (orig_histo == NULL || map_tmp == NULL) goto Error;
+  if (orig_histo == NULL || map_tmp == NULL) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    goto Error;
+  }
 
   // Construct the histograms from backward references.
-  HistogramBuild(xsize, histo_bits, refs, orig_histo);
+  HistogramBuild(xsize, histogram_bits, refs, orig_histo);
   // Copies the histograms and computes its bit_cost.
   // histogram_symbols is optimized
   HistogramCopyAndAnalyze(orig_histo, image_histo, &num_used,
@@ -1204,16 +1201,15 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
 
   if (entropy_combine) {
     uint16_t* const bin_map = map_tmp;
-    const double combine_cost_factor =
+    const float combine_cost_factor =
         GetCombineCostFactor(image_histo_raw_size, quality);
     const uint32_t num_clusters = num_used;
 
     HistogramAnalyzeEntropyBin(image_histo, bin_map, low_effort);
     // Collapse histograms with similar entropy.
-    HistogramCombineEntropyBin(image_histo, &num_used, histogram_symbols,
-                               cluster_mappings, tmp_histo, bin_map,
-                               entropy_combine_num_bins, combine_cost_factor,
-                               low_effort);
+    HistogramCombineEntropyBin(
+        image_histo, &num_used, histogram_symbols, cluster_mappings, tmp_histo,
+        bin_map, entropy_combine_num_bins, combine_cost_factor, low_effort);
     OptimizeHistogramSymbols(image_histo, cluster_mappings, num_clusters,
                              map_tmp, histogram_symbols);
   }
@@ -1227,11 +1223,13 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
     int do_greedy;
     if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size,
                                     &do_greedy)) {
+      WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
       goto Error;
     }
     if (do_greedy) {
       RemoveEmptyHistograms(image_histo);
       if (!HistogramCombineGreedy(image_histo, &num_used)) {
+        WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
         goto Error;
       }
     }
@@ -1241,10 +1239,12 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   RemoveEmptyHistograms(image_histo);
   HistogramRemap(orig_histo, image_histo, histogram_symbols);
 
-  ok = 1;
+  if (!WebPReportProgress(pic, *percent + percent_range, percent)) {
+    goto Error;
+  }
 
  Error:
   VP8LFreeHistogramSet(orig_histo);
   WebPSafeFree(map_tmp);
-  return ok;
+  return (pic->error_code == VP8_ENC_OK);
 }
diff --git a/3rdparty/libwebp/src/enc/histogram_enc.h b/3rdparty/libwebp/src/enc/histogram_enc.h
index 54c2d2178393..4c0bb97464de 100644
--- a/3rdparty/libwebp/src/enc/histogram_enc.h
+++ b/3rdparty/libwebp/src/enc/histogram_enc.h
@@ -40,10 +40,10 @@ typedef struct {
   int palette_code_bits_;
   uint32_t trivial_symbol_;  // True, if histograms for Red, Blue & Alpha
                              // literal symbols are single valued.
-  double bit_cost_;          // cached value of bit cost.
-  double literal_cost_;      // Cached values of dominant entropy costs:
-  double red_cost_;          // literal, red & blue.
-  double blue_cost_;
+  float bit_cost_;           // cached value of bit cost.
+  float literal_cost_;       // Cached values of dominant entropy costs:
+  float red_cost_;           // literal, red & blue.
+  float blue_cost_;
   uint8_t is_used_[5];       // 5 for literal, red, blue, alpha, distance
 } VP8LHistogram;
 
@@ -64,8 +64,8 @@ void VP8LHistogramCreate(VP8LHistogram* const p,
                          const VP8LBackwardRefs* const refs,
                          int palette_code_bits);
 
-// Return the size of the histogram for a given palette_code_bits.
-int VP8LGetHistogramSize(int palette_code_bits);
+// Return the size of the histogram for a given cache_bits.
+int VP8LGetHistogramSize(int cache_bits);
 
 // Set the palette_code_bits and reset the stats.
 // If init_arrays is true, the arrays are also filled with 0's.
@@ -105,21 +105,23 @@ static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
       ((palette_code_bits > 0) ? (1 << palette_code_bits) : 0);
 }
 
-// Builds the histogram image.
+// Builds the histogram image. pic and percent are for progress.
+// Returns false in case of error (stored in pic->error_code).
 int VP8LGetHistoImageSymbols(int xsize, int ysize,
-                             const VP8LBackwardRefs* const refs,
-                             int quality, int low_effort,
-                             int histogram_bits, int cache_bits,
-                             VP8LHistogramSet* const image_in,
+                             const VP8LBackwardRefs* const refs, int quality,
+                             int low_effort, int histogram_bits, int cache_bits,
+                             VP8LHistogramSet* const image_histo,
                              VP8LHistogram* const tmp_histo,
-                             uint16_t* const histogram_symbols);
+                             uint16_t* const histogram_symbols,
+                             const WebPPicture* const pic, int percent_range,
+                             int* const percent);
 
 // Returns the entropy for the symbols in the input array.
-double VP8LBitsEntropy(const uint32_t* const array, int n);
+float VP8LBitsEntropy(const uint32_t* const array, int n);
 
 // Estimate how many bits the combined entropy of literals and distance
 // approximately maps to.
-double VP8LHistogramEstimateBits(VP8LHistogram* const p);
+float VP8LHistogramEstimateBits(VP8LHistogram* const p);
 
 #ifdef __cplusplus
 }
diff --git a/3rdparty/libwebp/src/enc/picture_csp_enc.c b/3rdparty/libwebp/src/enc/picture_csp_enc.c
index 35eede96355b..a9280e6c3050 100644
--- a/3rdparty/libwebp/src/enc/picture_csp_enc.c
+++ b/3rdparty/libwebp/src/enc/picture_csp_enc.c
@@ -15,12 +15,19 @@
 #include <stdlib.h>
 #include <math.h>
 
+#include "sharpyuv/sharpyuv.h"
+#include "sharpyuv/sharpyuv_csp.h"
 #include "src/enc/vp8i_enc.h"
 #include "src/utils/random_utils.h"
 #include "src/utils/utils.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/yuv.h"
+#include "src/dsp/cpu.h"
+
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h>
+#endif
 
 // Uncomment to disable gamma-compression during RGB->U/V averaging
 #define USE_GAMMA_COMPRESSION
@@ -62,10 +69,12 @@ static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
 int WebPPictureHasTransparency(const WebPPicture* picture) {
   if (picture == NULL) return 0;
   if (picture->use_argb) {
-    const int alpha_offset = ALPHA_OFFSET;
-    return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset,
-                          picture->width, picture->height,
-                          4, picture->argb_stride * sizeof(*picture->argb));
+    if (picture->argb != NULL) {
+      return CheckNonOpaque((const uint8_t*)picture->argb + ALPHA_OFFSET,
+                            picture->width, picture->height,
+                            4, picture->argb_stride * sizeof(*picture->argb));
+    }
+    return 0;
   }
   return CheckNonOpaque(picture->a, picture->width, picture->height,
                         1, picture->a_stride);
@@ -76,30 +85,31 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
 
 #if defined(USE_GAMMA_COMPRESSION)
 
-// gamma-compensates loss of resolution during chroma subsampling
-#define kGamma 0.80      // for now we use a different gamma value than kGammaF
-#define kGammaFix 12     // fixed-point precision for linear values
-#define kGammaScale ((1 << kGammaFix) - 1)
-#define kGammaTabFix 7   // fixed-point fractional bits precision
-#define kGammaTabScale (1 << kGammaTabFix)
-#define kGammaTabRounder (kGammaTabScale >> 1)
-#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
+// Gamma correction compensates loss of resolution during chroma subsampling.
+#define GAMMA_FIX 12      // fixed-point precision for linear values
+#define GAMMA_TAB_FIX 7   // fixed-point fractional bits precision
+#define GAMMA_TAB_SIZE (1 << (GAMMA_FIX - GAMMA_TAB_FIX))
+static const double kGamma = 0.80;
+static const int kGammaScale = ((1 << GAMMA_FIX) - 1);
+static const int kGammaTabScale = (1 << GAMMA_TAB_FIX);
+static const int kGammaTabRounder = (1 << GAMMA_TAB_FIX >> 1);
 
-static int kLinearToGammaTab[kGammaTabSize + 1];
+static int kLinearToGammaTab[GAMMA_TAB_SIZE + 1];
 static uint16_t kGammaToLinearTab[256];
 static volatile int kGammaTablesOk = 0;
 static void InitGammaTables(void);
+extern VP8CPUInfo VP8GetCPUInfo;
 
 WEBP_DSP_INIT_FUNC(InitGammaTables) {
   if (!kGammaTablesOk) {
     int v;
-    const double scale = (double)(1 << kGammaTabFix) / kGammaScale;
+    const double scale = (double)(1 << GAMMA_TAB_FIX) / kGammaScale;
     const double norm = 1. / 255.;
     for (v = 0; v <= 255; ++v) {
       kGammaToLinearTab[v] =
           (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5);
     }
-    for (v = 0; v <= kGammaTabSize; ++v) {
+    for (v = 0; v <= GAMMA_TAB_SIZE; ++v) {
       kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5);
     }
     kGammaTablesOk = 1;
@@ -111,12 +121,12 @@ static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
 }
 
 static WEBP_INLINE int Interpolate(int v) {
-  const int tab_pos = v >> (kGammaTabFix + 2);    // integer part
+  const int tab_pos = v >> (GAMMA_TAB_FIX + 2);    // integer part
   const int x = v & ((kGammaTabScale << 2) - 1);  // fractional part
   const int v0 = kLinearToGammaTab[tab_pos];
   const int v1 = kLinearToGammaTab[tab_pos + 1];
   const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x);   // interpolate
-  assert(tab_pos + 1 < kGammaTabSize + 1);
+  assert(tab_pos + 1 < GAMMA_TAB_SIZE + 1);
   return y;
 }
 
@@ -124,7 +134,7 @@ static WEBP_INLINE int Interpolate(int v) {
 // U/V value, suitable for RGBToU/V calls.
 static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
   const int y = Interpolate(base_value << shift);   // final uplifted value
-  return (y + kGammaTabRounder) >> kGammaTabFix;    // descale
+  return (y + kGammaTabRounder) >> GAMMA_TAB_FIX;    // descale
 }
 
 #else
@@ -158,415 +168,26 @@ static int RGBToV(int r, int g, int b, VP8Random* const rg) {
 //------------------------------------------------------------------------------
 // Sharp RGB->YUV conversion
 
-static const int kNumIterations = 4;
 static const int kMinDimensionIterativeConversion = 4;
 
-// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
-// banding sometimes. Better use extra precision.
-#define SFIX 2                // fixed-point precision of RGB and Y/W
-typedef int16_t fixed_t;      // signed type with extra SFIX precision for UV
-typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
-
-#define SHALF (1 << SFIX >> 1)
-#define MAX_Y_T ((256 << SFIX) - 1)
-#define SROUNDER (1 << (YUV_FIX + SFIX - 1))
-
-#if defined(USE_GAMMA_COMPRESSION)
-
-// We use tables of different size and precision for the Rec709 / BT2020
-// transfer function.
-#define kGammaF (1./0.45)
-static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
-#define GAMMA_TO_LINEAR_BITS 14
-static uint32_t kGammaToLinearTabS[MAX_Y_T + 1];   // size scales with Y_FIX
-static volatile int kGammaTablesSOk = 0;
-static void InitGammaTablesS(void);
-
-WEBP_DSP_INIT_FUNC(InitGammaTablesS) {
-  assert(2 * GAMMA_TO_LINEAR_BITS < 32);  // we use uint32_t intermediate values
-  if (!kGammaTablesSOk) {
-    int v;
-    const double norm = 1. / MAX_Y_T;
-    const double scale = 1. / kGammaTabSize;
-    const double a = 0.09929682680944;
-    const double thresh = 0.018053968510807;
-    const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
-    for (v = 0; v <= MAX_Y_T; ++v) {
-      const double g = norm * v;
-      double value;
-      if (g <= thresh * 4.5) {
-        value = g / 4.5;
-      } else {
-        const double a_rec = 1. / (1. + a);
-        value = pow(a_rec * (g + a), kGammaF);
-      }
-      kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
-    }
-    for (v = 0; v <= kGammaTabSize; ++v) {
-      const double g = scale * v;
-      double value;
-      if (g <= thresh) {
-        value = 4.5 * g;
-      } else {
-        value = (1. + a) * pow(g, 1. / kGammaF) - a;
-      }
-      // we already incorporate the 1/2 rounding constant here
-      kLinearToGammaTabS[v] =
-          (uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
-    }
-    // to prevent small rounding errors to cause read-overflow:
-    kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
-    kGammaTablesSOk = 1;
-  }
-}
-
-// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
-static WEBP_INLINE uint32_t GammaToLinearS(int v) {
-  return kGammaToLinearTabS[v];
-}
-
-static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
-  // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
-  const uint32_t v = value * kGammaTabSize;
-  const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
-  // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
-  const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS);  // fractional part
-  // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
-  const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
-  const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
-  // Final interpolation. Note that rounding is already included.
-  const uint32_t v2 = (v1 - v0) * x;    // note: v1 >= v0.
-  const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
-  return result;
-}
-
-#else
-
-static void InitGammaTablesS(void) {}
-static WEBP_INLINE uint32_t GammaToLinearS(int v) {
-  return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T;
-}
-static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
-  return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS;
-}
-
-#endif    // USE_GAMMA_COMPRESSION
-
-//------------------------------------------------------------------------------
-
-static uint8_t clip_8b(fixed_t v) {
-  return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
-}
-
-static fixed_y_t clip_y(int y) {
-  return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
-}
-
-//------------------------------------------------------------------------------
-
-static int RGBToGray(int r, int g, int b) {
-  const int luma = 13933 * r + 46871 * g + 4732 * b + YUV_HALF;
-  return (luma >> YUV_FIX);
-}
-
-static uint32_t ScaleDown(int a, int b, int c, int d) {
-  const uint32_t A = GammaToLinearS(a);
-  const uint32_t B = GammaToLinearS(b);
-  const uint32_t C = GammaToLinearS(c);
-  const uint32_t D = GammaToLinearS(d);
-  return LinearToGammaS((A + B + C + D + 2) >> 2);
-}
-
-static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
-  int i;
-  for (i = 0; i < w; ++i) {
-    const uint32_t R = GammaToLinearS(src[0 * w + i]);
-    const uint32_t G = GammaToLinearS(src[1 * w + i]);
-    const uint32_t B = GammaToLinearS(src[2 * w + i]);
-    const uint32_t Y = RGBToGray(R, G, B);
-    dst[i] = (fixed_y_t)LinearToGammaS(Y);
-  }
-}
-
-static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
-                         fixed_t* dst, int uv_w) {
-  int i;
-  for (i = 0; i < uv_w; ++i) {
-    const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
-                            src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
-    const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
-                            src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
-    const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
-                            src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
-    const int W = RGBToGray(r, g, b);
-    dst[0 * uv_w] = (fixed_t)(r - W);
-    dst[1 * uv_w] = (fixed_t)(g - W);
-    dst[2 * uv_w] = (fixed_t)(b - W);
-    dst  += 1;
-    src1 += 2;
-    src2 += 2;
-  }
-}
-
-static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
-  int i;
-  for (i = 0; i < w; ++i) {
-    y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
-  }
-}
-
-//------------------------------------------------------------------------------
-
-static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) {
-  const int v0 = (A * 3 + B + 2) >> 2;
-  return clip_y(v0 + W0);
-}
-
-//------------------------------------------------------------------------------
-
-static WEBP_INLINE fixed_y_t UpLift(uint8_t a) {  // 8bit -> SFIX
-  return ((fixed_y_t)a << SFIX) | SHALF;
-}
-
-static void ImportOneRow(const uint8_t* const r_ptr,
-                         const uint8_t* const g_ptr,
-                         const uint8_t* const b_ptr,
-                         int step,
-                         int pic_width,
-                         fixed_y_t* const dst) {
-  int i;
-  const int w = (pic_width + 1) & ~1;
-  for (i = 0; i < pic_width; ++i) {
-    const int off = i * step;
-    dst[i + 0 * w] = UpLift(r_ptr[off]);
-    dst[i + 1 * w] = UpLift(g_ptr[off]);
-    dst[i + 2 * w] = UpLift(b_ptr[off]);
-  }
-  if (pic_width & 1) {  // replicate rightmost pixel
-    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
-    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
-    dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
-  }
-}
-
-static void InterpolateTwoRows(const fixed_y_t* const best_y,
-                               const fixed_t* prev_uv,
-                               const fixed_t* cur_uv,
-                               const fixed_t* next_uv,
-                               int w,
-                               fixed_y_t* out1,
-                               fixed_y_t* out2) {
-  const int uv_w = w >> 1;
-  const int len = (w - 1) >> 1;   // length to filter
-  int k = 3;
-  while (k-- > 0) {   // process each R/G/B segments in turn
-    // special boundary case for i==0
-    out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
-    out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
-
-    WebPSharpYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
-    WebPSharpYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
-
-    // special boundary case for i == w - 1 when w is even
-    if (!(w & 1)) {
-      out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
-                            best_y[w - 1 + 0]);
-      out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
-                            best_y[w - 1 + w]);
-    }
-    out1 += w;
-    out2 += w;
-    prev_uv += uv_w;
-    cur_uv  += uv_w;
-    next_uv += uv_w;
-  }
-}
-
-static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {
-  const int luma = 16839 * r + 33059 * g + 6420 * b + SROUNDER;
-  return clip_8b(16 + (luma >> (YUV_FIX + SFIX)));
-}
-
-static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {
-  const int u =  -9719 * r - 19081 * g + 28800 * b + SROUNDER;
-  return clip_8b(128 + (u >> (YUV_FIX + SFIX)));
-}
-
-static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
-  const int v = +28800 * r - 24116 * g -  4684 * b + SROUNDER;
-  return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
-}
-
-static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
-                            WebPPicture* const picture) {
-  int i, j;
-  uint8_t* dst_y = picture->y;
-  uint8_t* dst_u = picture->u;
-  uint8_t* dst_v = picture->v;
-  const fixed_t* const best_uv_base = best_uv;
-  const int w = (picture->width + 1) & ~1;
-  const int h = (picture->height + 1) & ~1;
-  const int uv_w = w >> 1;
-  const int uv_h = h >> 1;
-  for (best_uv = best_uv_base, j = 0; j < picture->height; ++j) {
-    for (i = 0; i < picture->width; ++i) {
-      const int off = (i >> 1);
-      const int W = best_y[i];
-      const int r = best_uv[off + 0 * uv_w] + W;
-      const int g = best_uv[off + 1 * uv_w] + W;
-      const int b = best_uv[off + 2 * uv_w] + W;
-      dst_y[i] = ConvertRGBToY(r, g, b);
-    }
-    best_y += w;
-    best_uv += (j & 1) * 3 * uv_w;
-    dst_y += picture->y_stride;
-  }
-  for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
-    for (i = 0; i < uv_w; ++i) {
-      const int off = i;
-      const int r = best_uv[off + 0 * uv_w];
-      const int g = best_uv[off + 1 * uv_w];
-      const int b = best_uv[off + 2 * uv_w];
-      dst_u[i] = ConvertRGBToU(r, g, b);
-      dst_v[i] = ConvertRGBToV(r, g, b);
-    }
-    best_uv += 3 * uv_w;
-    dst_u += picture->uv_stride;
-    dst_v += picture->uv_stride;
-  }
-  return 1;
-}
-
 //------------------------------------------------------------------------------
 // Main function
 
-#define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))
-
 static int PreprocessARGB(const uint8_t* r_ptr,
                           const uint8_t* g_ptr,
                           const uint8_t* b_ptr,
                           int step, int rgb_stride,
                           WebPPicture* const picture) {
-  // we expand the right/bottom border if needed
-  const int w = (picture->width + 1) & ~1;
-  const int h = (picture->height + 1) & ~1;
-  const int uv_w = w >> 1;
-  const int uv_h = h >> 1;
-  uint64_t prev_diff_y_sum = ~0;
-  int j, iter;
-
-  // TODO(skal): allocate one big memory chunk. But for now, it's easier
-  // for valgrind debugging to have several chunks.
-  fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t);   // scratch
-  fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
-  fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
-  fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
-  fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
-  fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
-  fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
-  fixed_y_t* best_y = best_y_base;
-  fixed_y_t* target_y = target_y_base;
-  fixed_t* best_uv = best_uv_base;
-  fixed_t* target_uv = target_uv_base;
-  const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
-  int ok;
-
-  if (best_y_base == NULL || best_uv_base == NULL ||
-      target_y_base == NULL || target_uv_base == NULL ||
-      best_rgb_y == NULL || best_rgb_uv == NULL ||
-      tmp_buffer == NULL) {
-    ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
-    goto End;
-  }
-  assert(picture->width >= kMinDimensionIterativeConversion);
-  assert(picture->height >= kMinDimensionIterativeConversion);
-
-  WebPInitConvertARGBToYUV();
-
-  // Import RGB samples to W/RGB representation.
-  for (j = 0; j < picture->height; j += 2) {
-    const int is_last_row = (j == picture->height - 1);
-    fixed_y_t* const src1 = tmp_buffer + 0 * w;
-    fixed_y_t* const src2 = tmp_buffer + 3 * w;
-
-    // prepare two rows of input
-    ImportOneRow(r_ptr, g_ptr, b_ptr, step, picture->width, src1);
-    if (!is_last_row) {
-      ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
-                   step, picture->width, src2);
-    } else {
-      memcpy(src2, src1, 3 * w * sizeof(*src2));
-    }
-    StoreGray(src1, best_y + 0, w);
-    StoreGray(src2, best_y + w, w);
-
-    UpdateW(src1, target_y, w);
-    UpdateW(src2, target_y + w, w);
-    UpdateChroma(src1, src2, target_uv, uv_w);
-    memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
-    best_y += 2 * w;
-    best_uv += 3 * uv_w;
-    target_y += 2 * w;
-    target_uv += 3 * uv_w;
-    r_ptr += 2 * rgb_stride;
-    g_ptr += 2 * rgb_stride;
-    b_ptr += 2 * rgb_stride;
-  }
-
-  // Iterate and resolve clipping conflicts.
-  for (iter = 0; iter < kNumIterations; ++iter) {
-    const fixed_t* cur_uv = best_uv_base;
-    const fixed_t* prev_uv = best_uv_base;
-    uint64_t diff_y_sum = 0;
-
-    best_y = best_y_base;
-    best_uv = best_uv_base;
-    target_y = target_y_base;
-    target_uv = target_uv_base;
-    for (j = 0; j < h; j += 2) {
-      fixed_y_t* const src1 = tmp_buffer + 0 * w;
-      fixed_y_t* const src2 = tmp_buffer + 3 * w;
-      {
-        const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
-        InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2);
-        prev_uv = cur_uv;
-        cur_uv = next_uv;
-      }
-
-      UpdateW(src1, best_rgb_y + 0 * w, w);
-      UpdateW(src2, best_rgb_y + 1 * w, w);
-      UpdateChroma(src1, src2, best_rgb_uv, uv_w);
-
-      // update two rows of Y and one row of RGB
-      diff_y_sum += WebPSharpYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w);
-      WebPSharpYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
-
-      best_y += 2 * w;
-      best_uv += 3 * uv_w;
-      target_y += 2 * w;
-      target_uv += 3 * uv_w;
-    }
-    // test exit condition
-    if (iter > 0) {
-      if (diff_y_sum < diff_y_threshold) break;
-      if (diff_y_sum > prev_diff_y_sum) break;
-    }
-    prev_diff_y_sum = diff_y_sum;
+  const int ok = SharpYuvConvert(
+      r_ptr, g_ptr, b_ptr, step, rgb_stride, /*rgb_bit_depth=*/8,
+      picture->y, picture->y_stride, picture->u, picture->uv_stride, picture->v,
+      picture->uv_stride, /*yuv_bit_depth=*/8, picture->width,
+      picture->height, SharpYuvGetConversionMatrix(kSharpYuvMatrixWebp));
+  if (!ok) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
-  // final reconstruction
-  ok = ConvertWRGBToYUV(best_y_base, best_uv_base, picture);
-
- End:
-  WebPSafeFree(best_y_base);
-  WebPSafeFree(best_uv_base);
-  WebPSafeFree(target_y_base);
-  WebPSafeFree(target_uv_base);
-  WebPSafeFree(best_rgb_y);
-  WebPSafeFree(best_rgb_uv);
-  WebPSafeFree(tmp_buffer);
   return ok;
 }
-#undef SAFE_ALLOC
 
 //------------------------------------------------------------------------------
 // "Fast" regular RGB->YUV
@@ -591,8 +212,8 @@ static const int kAlphaFix = 19;
 // and constant are adjusted very tightly to fit 32b arithmetic.
 // In particular, they use the fact that the operands for 'v / a' are actually
 // derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3
-// with ai in [0..255] and pi in [0..1<<kGammaFix). The constraint to avoid
-// overflow is: kGammaFix + kAlphaFix <= 31.
+// with ai in [0..255] and pi in [0..1<<GAMMA_FIX). The constraint to avoid
+// overflow is: GAMMA_FIX + kAlphaFix <= 31.
 static const uint32_t kInvAlpha[4 * 0xff + 1] = {
   0,  /* alpha = 0 */
   524288, 262144, 174762, 131072, 104857, 87381, 74898, 65536,
@@ -818,11 +439,20 @@ static WEBP_INLINE void AccumulateRGB(const uint8_t* const r_ptr,
     dst[0] = SUM4(r_ptr + j, step);
     dst[1] = SUM4(g_ptr + j, step);
     dst[2] = SUM4(b_ptr + j, step);
+    // MemorySanitizer may raise false positives with data that passes through
+    // RGBA32PackedToPlanar_16b_SSE41() due to incorrect modeling of shuffles.
+    // See https://crbug.com/webp/573.
+#ifdef WEBP_MSAN
+    dst[3] = 0;
+#endif
   }
   if (width & 1) {
     dst[0] = SUM2(r_ptr + j);
     dst[1] = SUM2(g_ptr + j);
     dst[2] = SUM2(b_ptr + j);
+#ifdef WEBP_MSAN
+    dst[3] = 0;
+#endif
   }
 }
 
@@ -839,6 +469,8 @@ static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb,
   }
 }
 
+extern void SharpYuvInit(VP8CPUInfo cpu_info_func);
+
 static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
                               const uint8_t* g_ptr,
                               const uint8_t* b_ptr,
@@ -863,18 +495,18 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
     use_iterative_conversion = 0;
   }
 
-  if (!WebPPictureAllocYUVA(picture, width, height)) {
+  if (!WebPPictureAllocYUVA(picture)) {
     return 0;
   }
   if (has_alpha) {
     assert(step == 4);
 #if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE)
-    assert(kAlphaFix + kGammaFix <= 31);
+    assert(kAlphaFix + GAMMA_FIX <= 31);
 #endif
   }
 
   if (use_iterative_conversion) {
-    InitGammaTablesS();
+    SharpYuvInit(VP8GetCPUInfo);
     if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
       return 0;
     }
@@ -903,7 +535,9 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
     WebPInitConvertARGBToYUV();
     InitGammaTables();
 
-    if (tmp_rgb == NULL) return 0;  // malloc error
+    if (tmp_rgb == NULL) {
+      return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    }
 
     // Downsample Y/U/V planes, two rows at a time
     for (y = 0; y < (height >> 1); ++y) {
@@ -1044,7 +678,7 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
   }
   // Allocate a new argb buffer (discarding the previous one).
-  if (!WebPPictureAllocARGB(picture, picture->width, picture->height)) return 0;
+  if (!WebPPictureAllocARGB(picture)) return 0;
   picture->use_argb = 1;
 
   // Convert
@@ -1106,6 +740,8 @@ static int Import(WebPPicture* const picture,
   const int width = picture->width;
   const int height = picture->height;
 
+  if (abs(rgb_stride) < (import_alpha ? 4 : 3) * width) return 0;
+
   if (!picture->use_argb) {
     const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
     return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
@@ -1163,24 +799,24 @@ static int Import(WebPPicture* const picture,
 #if !defined(WEBP_REDUCE_CSP)
 
 int WebPPictureImportBGR(WebPPicture* picture,
-                         const uint8_t* rgb, int rgb_stride) {
-  return (picture != NULL && rgb != NULL)
-             ? Import(picture, rgb, rgb_stride, 3, 1, 0)
+                         const uint8_t* bgr, int bgr_stride) {
+  return (picture != NULL && bgr != NULL)
+             ? Import(picture, bgr, bgr_stride, 3, 1, 0)
              : 0;
 }
 
 int WebPPictureImportBGRA(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 1, 1)
+                          const uint8_t* bgra, int bgra_stride) {
+  return (picture != NULL && bgra != NULL)
+             ? Import(picture, bgra, bgra_stride, 4, 1, 1)
              : 0;
 }
 
 
 int WebPPictureImportBGRX(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 1, 0)
+                          const uint8_t* bgrx, int bgrx_stride) {
+  return (picture != NULL && bgrx != NULL)
+             ? Import(picture, bgrx, bgrx_stride, 4, 1, 0)
              : 0;
 }
 
@@ -1201,9 +837,9 @@ int WebPPictureImportRGBA(WebPPicture* picture,
 }
 
 int WebPPictureImportRGBX(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 0, 0)
+                          const uint8_t* rgbx, int rgbx_stride) {
+  return (picture != NULL && rgbx != NULL)
+             ? Import(picture, rgbx, rgbx_stride, 4, 0, 0)
              : 0;
 }
 
diff --git a/3rdparty/libwebp/src/enc/picture_enc.c b/3rdparty/libwebp/src/enc/picture_enc.c
index c691622d03cd..5a2703541f2d 100644
--- a/3rdparty/libwebp/src/enc/picture_enc.c
+++ b/3rdparty/libwebp/src/enc/picture_enc.c
@@ -12,10 +12,10 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <assert.h>
+#include <limits.h>
 #include <stdlib.h>
 
 #include "src/enc/vp8i_enc.h"
-#include "src/dsp/dsp.h"
 #include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
@@ -45,6 +45,22 @@ int WebPPictureInitInternal(WebPPicture* picture, int version) {
 
 //------------------------------------------------------------------------------
 
+int WebPValidatePicture(const WebPPicture* const picture) {
+  if (picture == NULL) return 0;
+  if (picture->width <= 0 || picture->height <= 0) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+  }
+  if (picture->width <= 0 || picture->width / 4 > INT_MAX / 4 ||
+      picture->height <= 0 || picture->height / 4 > INT_MAX / 4) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+  }
+  if (picture->colorspace != WEBP_YUV420 &&
+      picture->colorspace != WEBP_YUV420A) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+  }
+  return 1;
+}
+
 static void WebPPictureResetBufferARGB(WebPPicture* const picture) {
   picture->memory_argb_ = NULL;
   picture->argb = NULL;
@@ -63,18 +79,17 @@ void WebPPictureResetBuffers(WebPPicture* const picture) {
   WebPPictureResetBufferYUVA(picture);
 }
 
-int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
+int WebPPictureAllocARGB(WebPPicture* const picture) {
   void* memory;
+  const int width = picture->width;
+  const int height = picture->height;
   const uint64_t argb_size = (uint64_t)width * height;
 
-  assert(picture != NULL);
+  if (!WebPValidatePicture(picture)) return 0;
 
   WebPSafeFree(picture->memory_argb_);
   WebPPictureResetBufferARGB(picture);
 
-  if (width <= 0 || height <= 0) {
-    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
-  }
   // allocate a new buffer.
   memory = WebPSafeMalloc(argb_size + WEBP_ALIGN_CST, sizeof(*picture->argb));
   if (memory == NULL) {
@@ -86,10 +101,10 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
   return 1;
 }
 
-int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
-  const WebPEncCSP uv_csp =
-      (WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK);
+int WebPPictureAllocYUVA(WebPPicture* const picture) {
   const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT;
+  const int width = picture->width;
+  const int height = picture->height;
   const int y_stride = width;
   const int uv_width = (int)(((int64_t)width + 1) >> 1);
   const int uv_height = (int)(((int64_t)height + 1) >> 1);
@@ -98,15 +113,11 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
   uint64_t y_size, uv_size, a_size, total_size;
   uint8_t* mem;
 
-  assert(picture != NULL);
+  if (!WebPValidatePicture(picture)) return 0;
 
   WebPSafeFree(picture->memory_);
   WebPPictureResetBufferYUVA(picture);
 
-  if (uv_csp != WEBP_YUV420) {
-    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
-  }
-
   // alpha
   a_width = has_alpha ? width : 0;
   a_stride = a_width;
@@ -152,15 +163,12 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
 
 int WebPPictureAlloc(WebPPicture* picture) {
   if (picture != NULL) {
-    const int width = picture->width;
-    const int height = picture->height;
-
     WebPPictureFree(picture);   // erase previous buffer
 
     if (!picture->use_argb) {
-      return WebPPictureAllocYUVA(picture, width, height);
+      return WebPPictureAllocYUVA(picture);
     } else {
-      return WebPPictureAllocARGB(picture, width, height);
+      return WebPPictureAllocARGB(picture);
     }
   }
   return 1;
diff --git a/3rdparty/libwebp/src/enc/picture_rescale_enc.c b/3rdparty/libwebp/src/enc/picture_rescale_enc.c
index 58a6ae7b9de8..ea90d825484e 100644
--- a/3rdparty/libwebp/src/enc/picture_rescale_enc.c
+++ b/3rdparty/libwebp/src/enc/picture_rescale_enc.c
@@ -13,14 +13,15 @@
 
 #include "src/webp/encode.h"
 
-#if !defined(WEBP_REDUCE_SIZE)
-
 #include <assert.h>
 #include <stdlib.h>
 
 #include "src/enc/vp8i_enc.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
 #include "src/utils/rescaler_utils.h"
 #include "src/utils/utils.h"
+#endif  // !defined(WEBP_REDUCE_SIZE)
 
 #define HALVE(x) (((x) + 1) >> 1)
 
@@ -56,6 +57,7 @@ static int AdjustAndCheckRectangle(const WebPPicture* const pic,
   return 1;
 }
 
+#if !defined(WEBP_REDUCE_SIZE)
 int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
   if (src == NULL || dst == NULL) return 0;
   if (src == dst) return 1;
@@ -81,6 +83,7 @@ int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
   }
   return 1;
 }
+#endif  // !defined(WEBP_REDUCE_SIZE)
 
 int WebPPictureIsView(const WebPPicture* picture) {
   if (picture == NULL) return 0;
@@ -120,6 +123,7 @@ int WebPPictureView(const WebPPicture* src,
   return 1;
 }
 
+#if !defined(WEBP_REDUCE_SIZE)
 //------------------------------------------------------------------------------
 // Picture cropping
 
@@ -133,7 +137,9 @@ int WebPPictureCrop(WebPPicture* pic,
   PictureGrabSpecs(pic, &tmp);
   tmp.width = width;
   tmp.height = height;
-  if (!WebPPictureAlloc(&tmp)) return 0;
+  if (!WebPPictureAlloc(&tmp)) {
+    return WebPEncodingSetError(pic, tmp.error_code);
+  }
 
   if (!pic->use_argb) {
     const int y_offset = top * pic->y_stride + left;
@@ -164,22 +170,25 @@ int WebPPictureCrop(WebPPicture* pic,
 //------------------------------------------------------------------------------
 // Simple picture rescaler
 
-static void RescalePlane(const uint8_t* src,
-                         int src_width, int src_height, int src_stride,
-                         uint8_t* dst,
-                         int dst_width, int dst_height, int dst_stride,
-                         rescaler_t* const work,
-                         int num_channels) {
+static int RescalePlane(const uint8_t* src,
+                        int src_width, int src_height, int src_stride,
+                        uint8_t* dst,
+                        int dst_width, int dst_height, int dst_stride,
+                        rescaler_t* const work,
+                        int num_channels) {
   WebPRescaler rescaler;
   int y = 0;
-  WebPRescalerInit(&rescaler, src_width, src_height,
-                   dst, dst_width, dst_height, dst_stride,
-                   num_channels, work);
+  if (!WebPRescalerInit(&rescaler, src_width, src_height,
+                        dst, dst_width, dst_height, dst_stride,
+                        num_channels, work)) {
+    return 0;
+  }
   while (y < src_height) {
     y += WebPRescalerImport(&rescaler, src_height - y,
                             src + y * src_stride, src_stride);
     WebPRescalerExport(&rescaler);
   }
+  return 1;
 }
 
 static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
@@ -195,73 +204,76 @@ static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
   }
 }
 
-int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+int WebPPictureRescale(WebPPicture* picture, int width, int height) {
   WebPPicture tmp;
   int prev_width, prev_height;
   rescaler_t* work;
 
-  if (pic == NULL) return 0;
-  prev_width = pic->width;
-  prev_height = pic->height;
+  if (picture == NULL) return 0;
+  prev_width = picture->width;
+  prev_height = picture->height;
   if (!WebPRescalerGetScaledDimensions(
           prev_width, prev_height, &width, &height)) {
-    return 0;
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
   }
 
-  PictureGrabSpecs(pic, &tmp);
+  PictureGrabSpecs(picture, &tmp);
   tmp.width = width;
   tmp.height = height;
-  if (!WebPPictureAlloc(&tmp)) return 0;
+  if (!WebPPictureAlloc(&tmp)) {
+    return WebPEncodingSetError(picture, tmp.error_code);
+  }
 
-  if (!pic->use_argb) {
+  if (!picture->use_argb) {
     work = (rescaler_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
     if (work == NULL) {
       WebPPictureFree(&tmp);
-      return 0;
+      return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     }
     // If present, we need to rescale alpha first (for AlphaMultiplyY).
-    if (pic->a != NULL) {
+    if (picture->a != NULL) {
       WebPInitAlphaProcessing();
-      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
-                   tmp.a, width, height, tmp.a_stride, work, 1);
+      if (!RescalePlane(picture->a, prev_width, prev_height, picture->a_stride,
+                        tmp.a, width, height, tmp.a_stride, work, 1)) {
+        return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+      }
     }
 
     // We take transparency into account on the luma plane only. That's not
     // totally exact blending, but still is a good approximation.
-    AlphaMultiplyY(pic, 0);
-    RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
-                 tmp.y, width, height, tmp.y_stride, work, 1);
+    AlphaMultiplyY(picture, 0);
+    if (!RescalePlane(picture->y, prev_width, prev_height, picture->y_stride,
+                      tmp.y, width, height, tmp.y_stride, work, 1) ||
+        !RescalePlane(picture->u, HALVE(prev_width), HALVE(prev_height),
+                      picture->uv_stride, tmp.u, HALVE(width), HALVE(height),
+                      tmp.uv_stride, work, 1) ||
+        !RescalePlane(picture->v, HALVE(prev_width), HALVE(prev_height),
+                      picture->uv_stride, tmp.v, HALVE(width), HALVE(height),
+                      tmp.uv_stride, work, 1)) {
+      return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+    }
     AlphaMultiplyY(&tmp, 1);
-
-    RescalePlane(pic->u,
-                 HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
-                 tmp.u,
-                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
-    RescalePlane(pic->v,
-                 HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
-                 tmp.v,
-                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
   } else {
     work = (rescaler_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
     if (work == NULL) {
       WebPPictureFree(&tmp);
-      return 0;
+      return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     }
     // In order to correctly interpolate colors, we need to apply the alpha
     // weighting first (black-matting), scale the RGB values, and remove
     // the premultiplication afterward (while preserving the alpha channel).
     WebPInitAlphaProcessing();
-    AlphaMultiplyARGB(pic, 0);
-    RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
-                 pic->argb_stride * 4,
-                 (uint8_t*)tmp.argb, width, height,
-                 tmp.argb_stride * 4,
-                 work, 4);
+    AlphaMultiplyARGB(picture, 0);
+    if (!RescalePlane((const uint8_t*)picture->argb, prev_width, prev_height,
+                      picture->argb_stride * 4, (uint8_t*)tmp.argb, width,
+                      height, tmp.argb_stride * 4, work, 4)) {
+      return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+    }
     AlphaMultiplyARGB(&tmp, 1);
   }
-  WebPPictureFree(pic);
+  WebPPictureFree(picture);
   WebPSafeFree(work);
-  *pic = tmp;
+  *picture = tmp;
   return 1;
 }
 
@@ -273,23 +285,6 @@ int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
   return 0;
 }
 
-int WebPPictureIsView(const WebPPicture* picture) {
-  (void)picture;
-  return 0;
-}
-
-int WebPPictureView(const WebPPicture* src,
-                    int left, int top, int width, int height,
-                    WebPPicture* dst) {
-  (void)src;
-  (void)left;
-  (void)top;
-  (void)width;
-  (void)height;
-  (void)dst;
-  return 0;
-}
-
 int WebPPictureCrop(WebPPicture* pic,
                     int left, int top, int width, int height) {
   (void)pic;
diff --git a/3rdparty/libwebp/src/enc/picture_tools_enc.c b/3rdparty/libwebp/src/enc/picture_tools_enc.c
index 38cb01534a3f..147cc18608c4 100644
--- a/3rdparty/libwebp/src/enc/picture_tools_enc.c
+++ b/3rdparty/libwebp/src/enc/picture_tools_enc.c
@@ -190,27 +190,28 @@ static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
   return (0xff000000u | (r << 16) | (g << 8) | b);
 }
 
-void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
+void WebPBlendAlpha(WebPPicture* picture, uint32_t background_rgb) {
   const int red = (background_rgb >> 16) & 0xff;
   const int green = (background_rgb >> 8) & 0xff;
   const int blue = (background_rgb >> 0) & 0xff;
   int x, y;
-  if (pic == NULL) return;
-  if (!pic->use_argb) {
-    const int uv_width = (pic->width >> 1);  // omit last pixel during u/v loop
+  if (picture == NULL) return;
+  if (!picture->use_argb) {
+    // omit last pixel during u/v loop
+    const int uv_width = (picture->width >> 1);
     const int Y0 = VP8RGBToY(red, green, blue, YUV_HALF);
     // VP8RGBToU/V expects the u/v values summed over four pixels
     const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
     const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
-    const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
-    uint8_t* y_ptr = pic->y;
-    uint8_t* u_ptr = pic->u;
-    uint8_t* v_ptr = pic->v;
-    uint8_t* a_ptr = pic->a;
+    const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
+    uint8_t* y_ptr = picture->y;
+    uint8_t* u_ptr = picture->u;
+    uint8_t* v_ptr = picture->v;
+    uint8_t* a_ptr = picture->a;
     if (!has_alpha || a_ptr == NULL) return;    // nothing to do
-    for (y = 0; y < pic->height; ++y) {
+    for (y = 0; y < picture->height; ++y) {
       // Luma blending
-      for (x = 0; x < pic->width; ++x) {
+      for (x = 0; x < picture->width; ++x) {
         const uint8_t alpha = a_ptr[x];
         if (alpha < 0xff) {
           y_ptr[x] = BLEND(Y0, y_ptr[x], alpha);
@@ -219,7 +220,7 @@ void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
       // Chroma blending every even line
       if ((y & 1) == 0) {
         uint8_t* const a_ptr2 =
-            (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
+            (y + 1 == picture->height) ? a_ptr : a_ptr + picture->a_stride;
         for (x = 0; x < uv_width; ++x) {
           // Average four alpha values into a single blending weight.
           // TODO(skal): might lead to visible contouring. Can we do better?
@@ -229,24 +230,24 @@ void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
           u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
           v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
         }
-        if (pic->width & 1) {   // rightmost pixel
+        if (picture->width & 1) {  // rightmost pixel
           const uint32_t alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
           u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
           v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
         }
       } else {
-        u_ptr += pic->uv_stride;
-        v_ptr += pic->uv_stride;
+        u_ptr += picture->uv_stride;
+        v_ptr += picture->uv_stride;
       }
-      memset(a_ptr, 0xff, pic->width);  // reset alpha value to opaque
-      a_ptr += pic->a_stride;
-      y_ptr += pic->y_stride;
+      memset(a_ptr, 0xff, picture->width);  // reset alpha value to opaque
+      a_ptr += picture->a_stride;
+      y_ptr += picture->y_stride;
     }
   } else {
-    uint32_t* argb = pic->argb;
+    uint32_t* argb = picture->argb;
     const uint32_t background = MakeARGB32(red, green, blue);
-    for (y = 0; y < pic->height; ++y) {
-      for (x = 0; x < pic->width; ++x) {
+    for (y = 0; y < picture->height; ++y) {
+      for (x = 0; x < picture->width; ++x) {
         const int alpha = (argb[x] >> 24) & 0xff;
         if (alpha != 0xff) {
           if (alpha > 0) {
@@ -262,7 +263,7 @@ void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
           }
         }
       }
-      argb += pic->argb_stride;
+      argb += picture->argb_stride;
     }
   }
 }
diff --git a/3rdparty/libwebp/src/enc/predictor_enc.c b/3rdparty/libwebp/src/enc/predictor_enc.c
index 2e6762ea0dd2..b3d44b59d506 100644
--- a/3rdparty/libwebp/src/enc/predictor_enc.c
+++ b/3rdparty/libwebp/src/enc/predictor_enc.c
@@ -16,6 +16,7 @@
 
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
+#include "src/enc/vp8i_enc.h"
 #include "src/enc/vp8li_enc.h"
 
 #define MAX_DIFF_COST (1e30f)
@@ -31,10 +32,10 @@ static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
 // Methods to calculate Entropy (Shannon).
 
 static float PredictionCostSpatial(const int counts[256], int weight_0,
-                                   double exp_val) {
+                                   float exp_val) {
   const int significant_symbols = 256 >> 4;
-  const double exp_decay_factor = 0.6;
-  double bits = weight_0 * counts[0];
+  const float exp_decay_factor = 0.6f;
+  float bits = (float)weight_0 * counts[0];
   int i;
   for (i = 1; i < significant_symbols; ++i) {
     bits += exp_val * (counts[i] + counts[256 - i]);
@@ -46,9 +47,9 @@ static float PredictionCostSpatial(const int counts[256], int weight_0,
 static float PredictionCostSpatialHistogram(const int accumulated[4][256],
                                             const int tile[4][256]) {
   int i;
-  double retval = 0;
+  float retval = 0.f;
   for (i = 0; i < 4; ++i) {
-    const double kExpValue = 0.94;
+    const float kExpValue = 0.94f;
     retval += PredictionCostSpatial(tile[i], 1, kExpValue);
     retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
   }
@@ -249,7 +250,7 @@ static WEBP_INLINE void GetResidual(
       } else if (x == 0) {
         predict = upper_row[x];  // Top.
       } else {
-        predict = pred_func(current_row[x - 1], upper_row + x);
+        predict = pred_func(&current_row[x - 1], upper_row + x);
       }
 #if (WEBP_NEAR_LOSSLESS == 1)
       if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
@@ -472,12 +473,15 @@ static void CopyImageWithPrediction(int width, int height,
 // with respect to predictions. If near_lossless_quality < 100, applies
 // near lossless processing, shaving off more bits of residuals for lower
 // qualities.
-void VP8LResidualImage(int width, int height, int bits, int low_effort,
-                       uint32_t* const argb, uint32_t* const argb_scratch,
-                       uint32_t* const image, int near_lossless_quality,
-                       int exact, int used_subtract_green) {
+int VP8LResidualImage(int width, int height, int bits, int low_effort,
+                      uint32_t* const argb, uint32_t* const argb_scratch,
+                      uint32_t* const image, int near_lossless_quality,
+                      int exact, int used_subtract_green,
+                      const WebPPicture* const pic, int percent_range,
+                      int* const percent) {
   const int tiles_per_row = VP8LSubSampleSize(width, bits);
   const int tiles_per_col = VP8LSubSampleSize(height, bits);
+  int percent_start = *percent;
   int tile_y;
   int histo[4][256];
   const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
@@ -491,17 +495,24 @@ void VP8LResidualImage(int width, int height, int bits, int low_effort,
     for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
       int tile_x;
       for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
-        const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
-            bits, histo, argb_scratch, argb, max_quantization, exact,
-            used_subtract_green, image);
+        const int pred = GetBestPredictorForTile(
+            width, height, tile_x, tile_y, bits, histo, argb_scratch, argb,
+            max_quantization, exact, used_subtract_green, image);
         image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
       }
+
+      if (!WebPReportProgress(
+              pic, percent_start + percent_range * tile_y / tiles_per_col,
+              percent)) {
+        return 0;
+      }
     }
   }
 
   CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
                           low_effort, max_quantization, exact,
                           used_subtract_green);
+  return WebPReportProgress(pic, percent_start + percent_range, percent);
 }
 
 //------------------------------------------------------------------------------
@@ -532,7 +543,7 @@ static float PredictionCostCrossColor(const int accumulated[256],
                                       const int counts[256]) {
   // Favor low entropy, locally and globally.
   // Favor small absolute values for PredictionCostSpatial
-  static const double kExpValue = 2.4;
+  static const float kExpValue = 2.4f;
   return VP8LCombinedShannonEntropy(counts, accumulated) +
          PredictionCostSpatial(counts, 3, kExpValue);
 }
@@ -714,11 +725,14 @@ static void CopyTileWithColorTransform(int xsize, int ysize,
   }
 }
 
-void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
-                             uint32_t* const argb, uint32_t* image) {
+int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+                            uint32_t* const argb, uint32_t* image,
+                            const WebPPicture* const pic, int percent_range,
+                            int* const percent) {
   const int max_tile_size = 1 << bits;
   const int tile_xsize = VP8LSubSampleSize(width, bits);
   const int tile_ysize = VP8LSubSampleSize(height, bits);
+  int percent_start = *percent;
   int accumulated_red_histo[256] = { 0 };
   int accumulated_blue_histo[256] = { 0 };
   int tile_x, tile_y;
@@ -768,5 +782,11 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
         }
       }
     }
+    if (!WebPReportProgress(
+            pic, percent_start + percent_range * tile_y / tile_ysize,
+            percent)) {
+      return 0;
+    }
   }
+  return 1;
 }
diff --git a/3rdparty/libwebp/src/enc/quant_enc.c b/3rdparty/libwebp/src/enc/quant_enc.c
index 01eb565c7f9c..6d8202d27714 100644
--- a/3rdparty/libwebp/src/enc/quant_enc.c
+++ b/3rdparty/libwebp/src/enc/quant_enc.c
@@ -533,7 +533,8 @@ static void InitScore(VP8ModeScore* const rd) {
   rd->score = MAX_COST;
 }
 
-static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
+static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
+                      const VP8ModeScore* WEBP_RESTRICT const src) {
   dst->D  = src->D;
   dst->SD = src->SD;
   dst->R  = src->R;
@@ -542,7 +543,8 @@ static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
   dst->score = src->score;
 }
 
-static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
+static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
+                     const VP8ModeScore* WEBP_RESTRICT const src) {
   dst->D  += src->D;
   dst->SD += src->SD;
   dst->R  += src->R;
@@ -585,15 +587,18 @@ static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
   return rate * lambda + RD_DISTO_MULT * distortion;
 }
 
-static int TrellisQuantizeBlock(const VP8Encoder* const enc,
+// Coefficient type.
+enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
+
+static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
                                 int16_t in[16], int16_t out[16],
                                 int ctx0, int coeff_type,
-                                const VP8Matrix* const mtx,
+                                const VP8Matrix* WEBP_RESTRICT const mtx,
                                 int lambda) {
   const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
   CostArrayPtr const costs =
       (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
-  const int first = (coeff_type == 0) ? 1 : 0;
+  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
   Node nodes[16][NUM_NODES];
   ScoreState score_states[2][NUM_NODES];
   ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
@@ -657,16 +662,17 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
     // test all alternate level values around level0.
     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
       Node* const cur = &NODE(n, m);
-      int level = level0 + m;
+      const int level = level0 + m;
       const int ctx = (level > 2) ? 2 : level;
       const int band = VP8EncBands[n + 1];
       score_t base_score;
-      score_t best_cur_score = MAX_COST;
-      int best_prev = 0;   // default, in case
+      score_t best_cur_score;
+      int best_prev;
+      score_t cost, score;
 
-      ss_cur[m].score = MAX_COST;
       ss_cur[m].costs = costs[n + 1][ctx];
       if (level < 0 || level > thresh_level) {
+        ss_cur[m].score = MAX_COST;
         // Node is dead.
         continue;
       }
@@ -682,18 +688,24 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
       }
 
       // Inspect all possible non-dead predecessors. Retain only the best one.
-      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
+      // The base_score is added to all scores so it is only added for the final
+      // value after the loop.
+      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
+      best_cur_score =
+          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
+      best_prev = -MIN_DELTA;
+      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
         // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
         // eliminated since their score can't be better than the current best.
-        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
+        cost = VP8LevelCost(ss_prev[p].costs, level);
         // Examine node assuming it's a non-terminal one.
-        const score_t score =
-            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
+        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
         if (score < best_cur_score) {
           best_cur_score = score;
           best_prev = p;
         }
       }
+      best_cur_score += base_score;
       // Store best finding in current node.
       cur->sign = sign;
       cur->level = level;
@@ -701,11 +713,11 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
       ss_cur[m].score = best_cur_score;
 
       // Now, record best terminal node (and thus best entry in the graph).
-      if (level != 0) {
+      if (level != 0 && best_cur_score < best_score) {
         const score_t last_pos_cost =
             (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
         const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
-        const score_t score = best_cur_score + last_pos_score;
+        score = best_cur_score + last_pos_score;
         if (score < best_score) {
           best_score = score;
           best_path[0] = n;                     // best eob position
@@ -717,10 +729,16 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
   }
 
   // Fresh start
-  memset(in + first, 0, (16 - first) * sizeof(*in));
-  memset(out + first, 0, (16 - first) * sizeof(*out));
+  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
+  if (coeff_type == TYPE_I16_AC) {
+    memset(in + 1, 0, 15 * sizeof(*in));
+    memset(out + 1, 0, 15 * sizeof(*out));
+  } else {
+    memset(in, 0, 16 * sizeof(*in));
+    memset(out, 0, 16 * sizeof(*out));
+  }
   if (best_path[0] == -1) {
-    return 0;   // skip!
+    return 0;  // skip!
   }
 
   {
@@ -751,9 +769,9 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
 // all at once. Output is the reconstructed block in *yuv_out, and the
 // quantized levels in *levels.
 
-static int ReconstructIntra16(VP8EncIterator* const it,
-                              VP8ModeScore* const rd,
-                              uint8_t* const yuv_out,
+static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
+                              VP8ModeScore* WEBP_RESTRICT const rd,
+                              uint8_t* WEBP_RESTRICT const yuv_out,
                               int mode) {
   const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
@@ -775,9 +793,9 @@ static int ReconstructIntra16(VP8EncIterator* const it,
     for (y = 0, n = 0; y < 4; ++y) {
       for (x = 0; x < 4; ++x, ++n) {
         const int ctx = it->top_nz_[x] + it->left_nz_[y];
-        const int non_zero =
-            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
-                                 &dqm->y1_, dqm->lambda_trellis_i16_);
+        const int non_zero = TrellisQuantizeBlock(
+            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
+            dqm->lambda_trellis_i16_);
         it->top_nz_[x] = it->left_nz_[y] = non_zero;
         rd->y_ac_levels[n][0] = 0;
         nz |= non_zero << n;
@@ -803,10 +821,10 @@ static int ReconstructIntra16(VP8EncIterator* const it,
   return nz;
 }
 
-static int ReconstructIntra4(VP8EncIterator* const it,
+static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                              int16_t levels[16],
-                             const uint8_t* const src,
-                             uint8_t* const yuv_out,
+                             const uint8_t* WEBP_RESTRICT const src,
+                             uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
   const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
@@ -818,7 +836,7 @@ static int ReconstructIntra4(VP8EncIterator* const it,
   if (DO_TRELLIS_I4 && it->do_trellis_) {
     const int x = it->i4_ & 3, y = it->i4_ >> 2;
     const int ctx = it->top_nz_[x] + it->left_nz_[y];
-    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
+    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
                               dqm->lambda_trellis_i4_);
   } else {
     nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
@@ -839,7 +857,8 @@ static int ReconstructIntra4(VP8EncIterator* const it,
 
 // Quantize as usual, but also compute and return the quantization error.
 // Error is already divided by DSHIFT.
-static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
+static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
+                          const VP8Matrix* WEBP_RESTRICT const mtx) {
   int V = *v;
   const int sign = (V < 0);
   if (sign) V = -V;
@@ -853,9 +872,10 @@ static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
   return (sign ? -V : V) >> DSCALE;
 }
 
-static void CorrectDCValues(const VP8EncIterator* const it,
-                            const VP8Matrix* const mtx,
-                            int16_t tmp[][16], VP8ModeScore* const rd) {
+static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
+                            const VP8Matrix* WEBP_RESTRICT const mtx,
+                            int16_t tmp[][16],
+                            VP8ModeScore* WEBP_RESTRICT const rd) {
   //         | top[0] | top[1]
   // --------+--------+---------
   // left[0] | tmp[0]   tmp[1]  <->   err0 err1
@@ -886,8 +906,8 @@ static void CorrectDCValues(const VP8EncIterator* const it,
   }
 }
 
-static void StoreDiffusionErrors(VP8EncIterator* const it,
-                                 const VP8ModeScore* const rd) {
+static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
+                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
   int ch;
   for (ch = 0; ch <= 1; ++ch) {
     int8_t* const top = it->top_derr_[it->x_][ch];
@@ -906,8 +926,9 @@ static void StoreDiffusionErrors(VP8EncIterator* const it,
 
 //------------------------------------------------------------------------------
 
-static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
-                         uint8_t* const yuv_out, int mode) {
+static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
+                         VP8ModeScore* WEBP_RESTRICT const rd,
+                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
   const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
   const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
@@ -927,9 +948,9 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
       for (y = 0; y < 2; ++y) {
         for (x = 0; x < 2; ++x, ++n) {
           const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
-          const int non_zero =
-              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
-                                   &dqm->uv_, dqm->lambda_trellis_uv_);
+          const int non_zero = TrellisQuantizeBlock(
+              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
+              dqm->lambda_trellis_uv_);
           it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
           nz |= non_zero << n;
         }
@@ -978,7 +999,8 @@ static void SwapOut(VP8EncIterator* const it) {
   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
 }
 
-static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
+static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
+                            VP8ModeScore* WEBP_RESTRICT rd) {
   const int kNumBlocks = 16;
   VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i16_;
@@ -1038,7 +1060,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
 //------------------------------------------------------------------------------
 
 // return the cost array corresponding to the surrounding prediction modes.
-static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
+static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
                                      const uint8_t modes[16]) {
   const int preds_w = it->enc_->preds_w_;
   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
@@ -1047,7 +1069,8 @@ static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
   return VP8FixedCostsI4[top][left];
 }
 
-static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
+static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+                          VP8ModeScore* WEBP_RESTRICT const rd) {
   const VP8Encoder* const enc = it->enc_;
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i4_;
@@ -1143,7 +1166,8 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
 
 //------------------------------------------------------------------------------
 
-static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
+static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
+                       VP8ModeScore* WEBP_RESTRICT const rd) {
   const int kNumBlocks = 8;
   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_uv_;
@@ -1195,7 +1219,8 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
 //------------------------------------------------------------------------------
 // Final reconstruction and quantization.
 
-static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
+static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
+                           VP8ModeScore* WEBP_RESTRICT const rd) {
   const VP8Encoder* const enc = it->enc_;
   const int is_i16 = (it->mb_->type_ == 1);
   int nz = 0;
@@ -1220,9 +1245,9 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
 }
 
 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
-static void RefineUsingDistortion(VP8EncIterator* const it,
+static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                   int try_both_modes, int refine_uv_mode,
-                                  VP8ModeScore* const rd) {
+                                  VP8ModeScore* WEBP_RESTRICT const rd) {
   score_t best_score = MAX_COST;
   int nz = 0;
   int mode;
@@ -1336,7 +1361,8 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
 //------------------------------------------------------------------------------
 // Entry point
 
-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
+int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
+                VP8ModeScore* WEBP_RESTRICT const rd,
                 VP8RDLevel rd_opt) {
   int is_skipped;
   const int method = it->enc_->method_;
diff --git a/3rdparty/libwebp/src/enc/syntax_enc.c b/3rdparty/libwebp/src/enc/syntax_enc.c
index a9e5a6cf0fec..9b8f524d6981 100644
--- a/3rdparty/libwebp/src/enc/syntax_enc.c
+++ b/3rdparty/libwebp/src/enc/syntax_enc.c
@@ -258,7 +258,10 @@ static int EmitPartitionsSize(const VP8Encoder* const enc,
     buf[3 * p + 1] = (part_size >>  8) & 0xff;
     buf[3 * p + 2] = (part_size >> 16) & 0xff;
   }
-  return p ? pic->writer(buf, 3 * p, pic) : 1;
+  if (p && !pic->writer(buf, 3 * p, pic)) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_WRITE);
+  }
+  return 1;
 }
 
 //------------------------------------------------------------------------------
@@ -349,7 +352,7 @@ int VP8EncWrite(VP8Encoder* const enc) {
                                        (enc->alpha_data_size_ & 1);
     riff_size += CHUNK_HEADER_SIZE + padded_alpha_size;
   }
-  // Sanity check.
+  // RIFF size should fit in 32-bits.
   if (riff_size > 0xfffffffeU) {
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_FILE_TOO_BIG);
   }
@@ -381,6 +384,7 @@ int VP8EncWrite(VP8Encoder* const enc) {
 
   enc->coded_size_ = (int)(CHUNK_HEADER_SIZE + riff_size);
   ok = ok && WebPReportProgress(pic, final_percent, &enc->percent_);
+  if (!ok) WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_WRITE);
   return ok;
 }
 
diff --git a/3rdparty/libwebp/src/enc/vp8i_enc.h b/3rdparty/libwebp/src/enc/vp8i_enc.h
index 0e35562a8c9a..00ff1be79515 100644
--- a/3rdparty/libwebp/src/enc/vp8i_enc.h
+++ b/3rdparty/libwebp/src/enc/vp8i_enc.h
@@ -31,7 +31,7 @@ extern "C" {
 
 // version numbers
 #define ENC_MAJ_VERSION 1
-#define ENC_MIN_VERSION 2
+#define ENC_MIN_VERSION 4
 #define ENC_REV_VERSION 0
 
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
@@ -286,8 +286,7 @@ int VP8IteratorNext(VP8EncIterator* const it);
 // save the yuv_out_ boundary values to top_/left_ arrays for next iterations.
 void VP8IteratorSaveBoundary(VP8EncIterator* const it);
 // Report progression based on macroblock rows. Return 0 for user-abort request.
-int VP8IteratorProgress(const VP8EncIterator* const it,
-                        int final_delta_percent);
+int VP8IteratorProgress(const VP8EncIterator* const it, int delta);
 // Intra4x4 iterations
 void VP8IteratorStartI4(VP8EncIterator* const it);
 // returns true if not done.
@@ -471,7 +470,8 @@ int VP8EncAnalyze(VP8Encoder* const enc);
 // Sets up segment's quantization values, base_quant_ and filter strengths.
 void VP8SetSegmentParams(VP8Encoder* const enc, float quality);
 // Pick best modes and fills the levels. Returns true if skipped.
-int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
+int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
+                VP8ModeScore* WEBP_RESTRICT const rd,
                 VP8RDLevel rd_opt);
 
   // in alpha.c
@@ -491,19 +491,24 @@ int VP8FilterStrengthFromDelta(int sharpness, int delta);
 
   // misc utils for picture_*.c:
 
+// Returns true if 'picture' is non-NULL and dimensions/colorspace are within
+// their valid ranges. If returning false, the 'error_code' in 'picture' is
+// updated.
+int WebPValidatePicture(const WebPPicture* const picture);
+
 // Remove reference to the ARGB/YUVA buffer (doesn't free anything).
 void WebPPictureResetBuffers(WebPPicture* const picture);
 
-// Allocates ARGB buffer of given dimension (previous one is always free'd).
-// Preserves the YUV(A) buffer. Returns false in case of error (invalid param,
-// out-of-memory).
-int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height);
+// Allocates ARGB buffer according to set width/height (previous one is
+// always free'd). Preserves the YUV(A) buffer. Returns false in case of error
+// (invalid param, out-of-memory).
+int WebPPictureAllocARGB(WebPPicture* const picture);
 
-// Allocates YUVA buffer of given dimension (previous one is always free'd).
-// Uses picture->csp to determine whether an alpha buffer is needed.
+// Allocates YUVA buffer according to set width/height (previous one is always
+// free'd). Uses picture->csp to determine whether an alpha buffer is needed.
 // Preserves the ARGB buffer.
 // Returns false in case of error (invalid param, out-of-memory).
-int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
+int WebPPictureAllocYUVA(WebPPicture* const picture);
 
 // Replace samples that are fully transparent by 'color' to help compressibility
 // (no guarantee, though). Assumes pic->use_argb is true.
diff --git a/3rdparty/libwebp/src/enc/vp8l_enc.c b/3rdparty/libwebp/src/enc/vp8l_enc.c
index 0b44ebe46ec5..40eafa41698f 100644
--- a/3rdparty/libwebp/src/enc/vp8l_enc.c
+++ b/3rdparty/libwebp/src/enc/vp8l_enc.c
@@ -15,128 +15,25 @@
 #include <assert.h>
 #include <stdlib.h>
 
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
 #include "src/enc/backward_references_enc.h"
 #include "src/enc/histogram_enc.h"
 #include "src/enc/vp8i_enc.h"
 #include "src/enc/vp8li_enc.h"
-#include "src/dsp/lossless.h"
-#include "src/dsp/lossless_common.h"
 #include "src/utils/bit_writer_utils.h"
 #include "src/utils/huffman_encode_utils.h"
+#include "src/utils/palette.h"
 #include "src/utils/utils.h"
+#include "src/webp/encode.h"
 #include "src/webp/format_constants.h"
 
 // Maximum number of histogram images (sub-blocks).
 #define MAX_HUFF_IMAGE_SIZE       2600
 
-// Palette reordering for smaller sum of deltas (and for smaller storage).
-
-static int PaletteCompareColorsForQsort(const void* p1, const void* p2) {
-  const uint32_t a = WebPMemToUint32((uint8_t*)p1);
-  const uint32_t b = WebPMemToUint32((uint8_t*)p2);
-  assert(a != b);
-  return (a < b) ? -1 : 1;
-}
-
-static WEBP_INLINE uint32_t PaletteComponentDistance(uint32_t v) {
-  return (v <= 128) ? v : (256 - v);
-}
-
-// Computes a value that is related to the entropy created by the
-// palette entry diff.
-//
-// Note that the last & 0xff is a no-operation in the next statement, but
-// removed by most compilers and is here only for regularity of the code.
-static WEBP_INLINE uint32_t PaletteColorDistance(uint32_t col1, uint32_t col2) {
-  const uint32_t diff = VP8LSubPixels(col1, col2);
-  const int kMoreWeightForRGBThanForAlpha = 9;
-  uint32_t score;
-  score =  PaletteComponentDistance((diff >>  0) & 0xff);
-  score += PaletteComponentDistance((diff >>  8) & 0xff);
-  score += PaletteComponentDistance((diff >> 16) & 0xff);
-  score *= kMoreWeightForRGBThanForAlpha;
-  score += PaletteComponentDistance((diff >> 24) & 0xff);
-  return score;
-}
-
-static WEBP_INLINE void SwapColor(uint32_t* const col1, uint32_t* const col2) {
-  const uint32_t tmp = *col1;
-  *col1 = *col2;
-  *col2 = tmp;
-}
-
-static void GreedyMinimizeDeltas(uint32_t palette[], int num_colors) {
-  // Find greedily always the closest color of the predicted color to minimize
-  // deltas in the palette. This reduces storage needs since the
-  // palette is stored with delta encoding.
-  uint32_t predict = 0x00000000;
-  int i, k;
-  for (i = 0; i < num_colors; ++i) {
-    int best_ix = i;
-    uint32_t best_score = ~0U;
-    for (k = i; k < num_colors; ++k) {
-      const uint32_t cur_score = PaletteColorDistance(palette[k], predict);
-      if (best_score > cur_score) {
-        best_score = cur_score;
-        best_ix = k;
-      }
-    }
-    SwapColor(&palette[best_ix], &palette[i]);
-    predict = palette[i];
-  }
-}
-
-// The palette has been sorted by alpha. This function checks if the other
-// components of the palette have a monotonic development with regards to
-// position in the palette. If all have monotonic development, there is
-// no benefit to re-organize them greedily. A monotonic development
-// would be spotted in green-only situations (like lossy alpha) or gray-scale
-// images.
-static int PaletteHasNonMonotonousDeltas(uint32_t palette[], int num_colors) {
-  uint32_t predict = 0x000000;
-  int i;
-  uint8_t sign_found = 0x00;
-  for (i = 0; i < num_colors; ++i) {
-    const uint32_t diff = VP8LSubPixels(palette[i], predict);
-    const uint8_t rd = (diff >> 16) & 0xff;
-    const uint8_t gd = (diff >>  8) & 0xff;
-    const uint8_t bd = (diff >>  0) & 0xff;
-    if (rd != 0x00) {
-      sign_found |= (rd < 0x80) ? 1 : 2;
-    }
-    if (gd != 0x00) {
-      sign_found |= (gd < 0x80) ? 8 : 16;
-    }
-    if (bd != 0x00) {
-      sign_found |= (bd < 0x80) ? 64 : 128;
-    }
-    predict = palette[i];
-  }
-  return (sign_found & (sign_found << 1)) != 0;  // two consequent signs.
-}
-
 // -----------------------------------------------------------------------------
 // Palette
 
-// If number of colors in the image is less than or equal to MAX_PALETTE_SIZE,
-// creates a palette and returns true, else returns false.
-static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
-                                   int low_effort,
-                                   uint32_t palette[MAX_PALETTE_SIZE],
-                                   int* const palette_size) {
-  const int num_colors = WebPGetColorPalette(pic, palette);
-  if (num_colors > MAX_PALETTE_SIZE) {
-    *palette_size = 0;
-    return 0;
-  }
-  *palette_size = num_colors;
-  qsort(palette, num_colors, sizeof(*palette), PaletteCompareColorsForQsort);
-  if (!low_effort && PaletteHasNonMonotonousDeltas(palette, num_colors)) {
-    GreedyMinimizeDeltas(palette, num_colors);
-  }
-  return 1;
-}
-
 // These five modes are evaluated and their respective entropy is computed.
 typedef enum {
   kDirect = 0,
@@ -165,10 +62,11 @@ typedef enum {
   kHistoTotal  // Must be last.
 } HistoIx;
 
-static void AddSingleSubGreen(int p, uint32_t* const r, uint32_t* const b) {
-  const int green = p >> 8;  // The upper bits are masked away later.
-  ++r[((p >> 16) - green) & 0xff];
-  ++b[((p >>  0) - green) & 0xff];
+static void AddSingleSubGreen(uint32_t p,
+                              uint32_t* const r, uint32_t* const b) {
+  const int green = (int)p >> 8;  // The upper bits are masked away later.
+  ++r[(((int)p >> 16) - green) & 0xff];
+  ++b[(((int)p >>  0) - green) & 0xff];
 }
 
 static void AddSingle(uint32_t p,
@@ -242,8 +140,8 @@ static int AnalyzeEntropy(const uint32_t* argb,
       curr_row += argb_stride;
     }
     {
-      double entropy_comp[kHistoTotal];
-      double entropy[kNumEntropyIx];
+      float entropy_comp[kHistoTotal];
+      float entropy[kNumEntropyIx];
       int k;
       int last_mode_to_analyze = use_palette ? kPalette : kSpatialSubGreen;
       int j;
@@ -362,11 +260,14 @@ typedef struct {
 } CrunchSubConfig;
 typedef struct {
   int entropy_idx_;
+  PaletteSorting palette_sorting_type_;
   CrunchSubConfig sub_configs_[CRUNCH_SUBCONFIGS_MAX];
   int sub_configs_size_;
 } CrunchConfig;
 
-#define CRUNCH_CONFIGS_MAX kNumEntropyIx
+// +2 because we add a palette sorting configuration for kPalette and
+// kPaletteAndSpatial.
+#define CRUNCH_CONFIGS_MAX (kNumEntropyIx + 2 * kPaletteSortingNum)
 
 static int EncoderAnalyze(VP8LEncoder* const enc,
                           CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX],
@@ -386,9 +287,12 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
   int do_no_cache = 0;
   assert(pic != NULL && pic->argb != NULL);
 
-  use_palette =
-      AnalyzeAndCreatePalette(pic, low_effort,
-                              enc->palette_, &enc->palette_size_);
+  // Check whether a palette is possible.
+  enc->palette_size_ = GetColorPalette(pic, enc->palette_sorted_);
+  use_palette = (enc->palette_size_ <= MAX_PALETTE_SIZE);
+  if (!use_palette) {
+    enc->palette_size_ = 0;
+  }
 
   // Empirical bit sizes.
   enc->histo_bits_ = GetHistoBits(method, use_palette,
@@ -398,6 +302,8 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
   if (low_effort) {
     // AnalyzeEntropy is somewhat slow.
     crunch_configs[0].entropy_idx_ = use_palette ? kPalette : kSpatialSubGreen;
+    crunch_configs[0].palette_sorting_type_ =
+        use_palette ? kSortedDefault : kUnusedPalette;
     n_lz77s = 1;
     *crunch_configs_size = 1;
   } else {
@@ -418,13 +324,37 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
         // a palette.
         if ((i != kPalette && i != kPaletteAndSpatial) || use_palette) {
           assert(*crunch_configs_size < CRUNCH_CONFIGS_MAX);
-          crunch_configs[(*crunch_configs_size)++].entropy_idx_ = i;
+          if (use_palette && (i == kPalette || i == kPaletteAndSpatial)) {
+            int sorting_method;
+            for (sorting_method = 0; sorting_method < kPaletteSortingNum;
+                 ++sorting_method) {
+              const PaletteSorting typed_sorting_method =
+                  (PaletteSorting)sorting_method;
+              // TODO(vrabaud) kSortedDefault should be tested. It is omitted
+              // for now for backward compatibility.
+              if (typed_sorting_method == kUnusedPalette ||
+                  typed_sorting_method == kSortedDefault) {
+                continue;
+              }
+              crunch_configs[(*crunch_configs_size)].entropy_idx_ = i;
+              crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+                  typed_sorting_method;
+              ++*crunch_configs_size;
+            }
+          } else {
+            crunch_configs[(*crunch_configs_size)].entropy_idx_ = i;
+            crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+                kUnusedPalette;
+            ++*crunch_configs_size;
+          }
         }
       }
     } else {
       // Only choose the guessed best transform.
       *crunch_configs_size = 1;
       crunch_configs[0].entropy_idx_ = min_entropy_ix;
+      crunch_configs[0].palette_sorting_type_ =
+          use_palette ? kMinimizeDelta : kUnusedPalette;
       if (config->quality >= 75 && method == 5) {
         // Test with and without color cache.
         do_no_cache = 1;
@@ -432,6 +362,7 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
         if (min_entropy_ix == kPalette) {
           *crunch_configs_size = 2;
           crunch_configs[1].entropy_idx_ = kPaletteAndSpatial;
+          crunch_configs[1].palette_sorting_type_ = kMinimizeDelta;
         }
       }
     }
@@ -730,11 +661,11 @@ static WEBP_INLINE void WriteHuffmanCodeWithExtraBits(
   VP8LPutBits(bw, (bits << depth) | symbol, depth + n_bits);
 }
 
-static WebPEncodingError StoreImageToBitMask(
+static int StoreImageToBitMask(
     VP8LBitWriter* const bw, int width, int histo_bits,
     const VP8LBackwardRefs* const refs,
     const uint16_t* histogram_symbols,
-    const HuffmanTreeCode* const huffman_codes) {
+    const HuffmanTreeCode* const huffman_codes, const WebPPicture* const pic) {
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
   const int tile_mask = (histo_bits == 0) ? 0 : -(1 << histo_bits);
   // x and y trace the position in the image.
@@ -787,44 +718,52 @@ static WebPEncodingError StoreImageToBitMask(
     }
     VP8LRefsCursorNext(&c);
   }
-  return bw->error_ ? VP8_ENC_ERROR_OUT_OF_MEMORY : VP8_ENC_OK;
+  if (bw->error_) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
+  return 1;
 }
 
-// Special case of EncodeImageInternal() for cache-bits=0, histo_bits=31
-static WebPEncodingError EncodeImageNoHuffman(
-    VP8LBitWriter* const bw, const uint32_t* const argb,
-    VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_array,
-    int width, int height, int quality, int low_effort) {
+// Special case of EncodeImageInternal() for cache-bits=0, histo_bits=31.
+// pic and percent are for progress.
+static int EncodeImageNoHuffman(VP8LBitWriter* const bw,
+                                const uint32_t* const argb,
+                                VP8LHashChain* const hash_chain,
+                                VP8LBackwardRefs* const refs_array, int width,
+                                int height, int quality, int low_effort,
+                                const WebPPicture* const pic, int percent_range,
+                                int* const percent) {
   int i;
   int max_tokens = 0;
-  WebPEncodingError err = VP8_ENC_OK;
   VP8LBackwardRefs* refs;
   HuffmanTreeToken* tokens = NULL;
-  HuffmanTreeCode huffman_codes[5] = { { 0, NULL, NULL } };
-  const uint16_t histogram_symbols[1] = { 0 };    // only one tree, one symbol
+  HuffmanTreeCode huffman_codes[5] = {{0, NULL, NULL}};
+  const uint16_t histogram_symbols[1] = {0};  // only one tree, one symbol
   int cache_bits = 0;
   VP8LHistogramSet* histogram_image = NULL;
   HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
-        3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
+      3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
   if (huff_tree == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
   // Calculate backward references from ARGB image.
-  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
-                         low_effort)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height, low_effort,
+                         pic, percent_range / 2, percent)) {
+    goto Error;
+  }
+  if (!VP8LGetBackwardReferences(width, height, argb, quality, /*low_effort=*/0,
+                                 kLZ77Standard | kLZ77RLE, cache_bits,
+                                 /*do_no_cache=*/0, hash_chain, refs_array,
+                                 &cache_bits, pic,
+                                 percent_range - percent_range / 2, percent)) {
     goto Error;
   }
-  err = VP8LGetBackwardReferences(
-      width, height, argb, quality, /*low_effort=*/0, kLZ77Standard | kLZ77RLE,
-      cache_bits, /*do_no_cache=*/0, hash_chain, refs_array, &cache_bits);
-  if (err != VP8_ENC_OK) goto Error;
   refs = &refs_array[0];
   histogram_image = VP8LAllocateHistogramSet(1, cache_bits);
   if (histogram_image == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
   VP8LHistogramSetClear(histogram_image);
@@ -835,7 +774,7 @@ static WebPEncodingError EncodeImageNoHuffman(
   // Create Huffman bit lengths and codes for each histogram image.
   assert(histogram_image->size == 1);
   if (!GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
@@ -852,7 +791,7 @@ static WebPEncodingError EncodeImageNoHuffman(
 
   tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
   if (tokens == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
@@ -864,27 +803,32 @@ static WebPEncodingError EncodeImageNoHuffman(
   }
 
   // Store actual literals.
-  err = StoreImageToBitMask(bw, width, 0, refs, histogram_symbols,
-                            huffman_codes);
+  if (!StoreImageToBitMask(bw, width, 0, refs, histogram_symbols, huffman_codes,
+                           pic)) {
+    goto Error;
+  }
 
  Error:
   WebPSafeFree(tokens);
   WebPSafeFree(huff_tree);
   VP8LFreeHistogramSet(histogram_image);
   WebPSafeFree(huffman_codes[0].codes);
-  return err;
+  return (pic->error_code == VP8_ENC_OK);
 }
 
-static WebPEncodingError EncodeImageInternal(
+// pic and percent are for progress.
+static int EncodeImageInternal(
     VP8LBitWriter* const bw, const uint32_t* const argb,
     VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[4], int width,
-    int height, int quality, int low_effort, int use_cache,
-    const CrunchConfig* const config, int* cache_bits, int histogram_bits,
-    size_t init_byte_position, int* const hdr_size, int* const data_size) {
-  WebPEncodingError err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    int height, int quality, int low_effort, const CrunchConfig* const config,
+    int* cache_bits, int histogram_bits, size_t init_byte_position,
+    int* const hdr_size, int* const data_size, const WebPPicture* const pic,
+    int percent_range, int* const percent) {
   const uint32_t histogram_image_xysize =
       VP8LSubSampleSize(width, histogram_bits) *
       VP8LSubSampleSize(height, histogram_bits);
+  int remaining_percent = percent_range;
+  int percent_start = *percent;
   VP8LHistogramSet* histogram_image = NULL;
   VP8LHistogram* tmp_histo = NULL;
   int histogram_image_size = 0;
@@ -893,9 +837,8 @@ static WebPEncodingError EncodeImageInternal(
       3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
   HuffmanTreeToken* tokens = NULL;
   HuffmanTreeCode* huffman_codes = NULL;
-  uint16_t* const histogram_symbols =
-      (uint16_t*)WebPSafeMalloc(histogram_image_xysize,
-                                sizeof(*histogram_symbols));
+  uint16_t* const histogram_symbols = (uint16_t*)WebPSafeMalloc(
+      histogram_image_xysize, sizeof(*histogram_symbols));
   int sub_configs_idx;
   int cache_bits_init, write_histogram_image;
   VP8LBitWriter bw_init = *bw, bw_best;
@@ -907,38 +850,52 @@ static WebPEncodingError EncodeImageInternal(
   assert(hdr_size != NULL);
   assert(data_size != NULL);
 
-  // Make sure we can allocate the different objects.
   memset(&hash_chain_histogram, 0, sizeof(hash_chain_histogram));
+  if (!VP8LBitWriterInit(&bw_best, 0)) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    goto Error;
+  }
+
+  // Make sure we can allocate the different objects.
   if (huff_tree == NULL || histogram_symbols == NULL ||
-      !VP8LHashChainInit(&hash_chain_histogram, histogram_image_xysize) ||
-      !VP8LHashChainFill(hash_chain, quality, argb, width, height,
-                         low_effort)) {
+      !VP8LHashChainInit(&hash_chain_histogram, histogram_image_xysize)) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
-  if (use_cache) {
-    // If the value is different from zero, it has been set during the
-    // palette analysis.
-    cache_bits_init = (*cache_bits == 0) ? MAX_COLOR_CACHE_BITS : *cache_bits;
-  } else {
-    cache_bits_init = 0;
+
+  percent_range = remaining_percent / 5;
+  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
+                         low_effort, pic, percent_range, percent)) {
+    goto Error;
   }
+  percent_start += percent_range;
+  remaining_percent -= percent_range;
+
+  // If the value is different from zero, it has been set during the palette
+  // analysis.
+  cache_bits_init = (*cache_bits == 0) ? MAX_COLOR_CACHE_BITS : *cache_bits;
   // If several iterations will happen, clone into bw_best.
-  if (!VP8LBitWriterInit(&bw_best, 0) ||
-      ((config->sub_configs_size_ > 1 ||
-        config->sub_configs_[0].do_no_cache_) &&
-       !VP8LBitWriterClone(bw, &bw_best))) {
+  if ((config->sub_configs_size_ > 1 || config->sub_configs_[0].do_no_cache_) &&
+      !VP8LBitWriterClone(bw, &bw_best)) {
+    WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
+
   for (sub_configs_idx = 0; sub_configs_idx < config->sub_configs_size_;
        ++sub_configs_idx) {
     const CrunchSubConfig* const sub_config =
         &config->sub_configs_[sub_configs_idx];
     int cache_bits_best, i_cache;
-    err = VP8LGetBackwardReferences(width, height, argb, quality, low_effort,
-                                    sub_config->lz77_, cache_bits_init,
-                                    sub_config->do_no_cache_, hash_chain,
-                                    &refs_array[0], &cache_bits_best);
-    if (err != VP8_ENC_OK) goto Error;
+    int i_remaining_percent = remaining_percent / config->sub_configs_size_;
+    int i_percent_range = i_remaining_percent / 4;
+    i_remaining_percent -= i_percent_range;
+
+    if (!VP8LGetBackwardReferences(
+            width, height, argb, quality, low_effort, sub_config->lz77_,
+            cache_bits_init, sub_config->do_no_cache_, hash_chain,
+            &refs_array[0], &cache_bits_best, pic, i_percent_range, percent)) {
+      goto Error;
+    }
 
     for (i_cache = 0; i_cache < (sub_config->do_no_cache_ ? 2 : 1); ++i_cache) {
       const int cache_bits_tmp = (i_cache == 0) ? cache_bits_best : 0;
@@ -953,11 +910,17 @@ static WebPEncodingError EncodeImageInternal(
       histogram_image =
           VP8LAllocateHistogramSet(histogram_image_xysize, cache_bits_tmp);
       tmp_histo = VP8LAllocateHistogram(cache_bits_tmp);
-      if (histogram_image == NULL || tmp_histo == NULL ||
-          !VP8LGetHistoImageSymbols(width, height, &refs_array[i_cache],
-                                    quality, low_effort, histogram_bits,
-                                    cache_bits_tmp, histogram_image, tmp_histo,
-                                    histogram_symbols)) {
+      if (histogram_image == NULL || tmp_histo == NULL) {
+        WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+        goto Error;
+      }
+
+      i_percent_range = i_remaining_percent / 3;
+      i_remaining_percent -= i_percent_range;
+      if (!VP8LGetHistoImageSymbols(
+              width, height, &refs_array[i_cache], quality, low_effort,
+              histogram_bits, cache_bits_tmp, histogram_image, tmp_histo,
+              histogram_symbols, pic, i_percent_range, percent)) {
         goto Error;
       }
       // Create Huffman bit lengths and codes for each histogram image.
@@ -970,6 +933,7 @@ static WebPEncodingError EncodeImageInternal(
       // GetHuffBitLengthsAndCodes().
       if (huffman_codes == NULL ||
           !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+        WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
         goto Error;
       }
       // Free combined histograms.
@@ -992,12 +956,14 @@ static WebPEncodingError EncodeImageInternal(
       write_histogram_image = (histogram_image_size > 1);
       VP8LPutBits(bw, write_histogram_image, 1);
       if (write_histogram_image) {
-        uint32_t* const histogram_argb =
-            (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
-                                      sizeof(*histogram_argb));
+        uint32_t* const histogram_argb = (uint32_t*)WebPSafeMalloc(
+            histogram_image_xysize, sizeof(*histogram_argb));
         int max_index = 0;
         uint32_t i;
-        if (histogram_argb == NULL) goto Error;
+        if (histogram_argb == NULL) {
+          WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+          goto Error;
+        }
         for (i = 0; i < histogram_image_xysize; ++i) {
           const int symbol_index = histogram_symbols[i] & 0xffff;
           histogram_argb[i] = (symbol_index << 8);
@@ -1008,12 +974,17 @@ static WebPEncodingError EncodeImageInternal(
         histogram_image_size = max_index;
 
         VP8LPutBits(bw, histogram_bits - 2, 3);
-        err = EncodeImageNoHuffman(
-            bw, histogram_argb, &hash_chain_histogram, &refs_array[2],
-            VP8LSubSampleSize(width, histogram_bits),
-            VP8LSubSampleSize(height, histogram_bits), quality, low_effort);
+        i_percent_range = i_remaining_percent / 2;
+        i_remaining_percent -= i_percent_range;
+        if (!EncodeImageNoHuffman(
+                bw, histogram_argb, &hash_chain_histogram, &refs_array[2],
+                VP8LSubSampleSize(width, histogram_bits),
+                VP8LSubSampleSize(height, histogram_bits), quality, low_effort,
+                pic, i_percent_range, percent)) {
+          WebPSafeFree(histogram_argb);
+          goto Error;
+        }
         WebPSafeFree(histogram_argb);
-        if (err != VP8_ENC_OK) goto Error;
       }
 
       // Store Huffman codes.
@@ -1028,7 +999,10 @@ static WebPEncodingError EncodeImageInternal(
           }
         }
         tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
-        if (tokens == NULL) goto Error;
+        if (tokens == NULL) {
+          WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+          goto Error;
+        }
         for (i = 0; i < 5 * histogram_image_size; ++i) {
           HuffmanTreeCode* const codes = &huffman_codes[i];
           StoreHuffmanCode(bw, huff_tree, tokens, codes);
@@ -1037,9 +1011,10 @@ static WebPEncodingError EncodeImageInternal(
       }
       // Store actual literals.
       hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
-      err = StoreImageToBitMask(bw, width, histogram_bits, &refs_array[i_cache],
-                                histogram_symbols, huffman_codes);
-      if (err != VP8_ENC_OK) goto Error;
+      if (!StoreImageToBitMask(bw, width, histogram_bits, &refs_array[i_cache],
+                               histogram_symbols, huffman_codes, pic)) {
+        goto Error;
+      }
       // Keep track of the smallest image so far.
       if (VP8LBitWriterNumBytes(bw) < bw_size_best) {
         bw_size_best = VP8LBitWriterNumBytes(bw);
@@ -1059,7 +1034,10 @@ static WebPEncodingError EncodeImageInternal(
     }
   }
   VP8LBitWriterSwap(bw, &bw_best);
-  err = VP8_ENC_OK;
+
+  if (!WebPReportProgress(pic, percent_start + remaining_percent, percent)) {
+    goto Error;
+  }
 
  Error:
   WebPSafeFree(tokens);
@@ -1073,7 +1051,7 @@ static WebPEncodingError EncodeImageInternal(
   }
   WebPSafeFree(histogram_symbols);
   VP8LBitWriterWipeOut(&bw_best);
-  return err;
+  return (pic->error_code == VP8_ENC_OK);
 }
 
 // -----------------------------------------------------------------------------
@@ -1082,26 +1060,27 @@ static WebPEncodingError EncodeImageInternal(
 static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height,
                                VP8LBitWriter* const bw) {
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
-  VP8LPutBits(bw, SUBTRACT_GREEN, 2);
+  VP8LPutBits(bw, SUBTRACT_GREEN_TRANSFORM, 2);
   VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
 }
 
-static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
-                                            int width, int height,
-                                            int quality, int low_effort,
-                                            int used_subtract_green,
-                                            VP8LBitWriter* const bw) {
+static int ApplyPredictFilter(const VP8LEncoder* const enc, int width,
+                              int height, int quality, int low_effort,
+                              int used_subtract_green, VP8LBitWriter* const bw,
+                              int percent_range, int* const percent) {
   const int pred_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, pred_bits);
   const int transform_height = VP8LSubSampleSize(height, pred_bits);
   // we disable near-lossless quantization if palette is used.
-  const int near_lossless_strength = enc->use_palette_ ? 100
-                                   : enc->config_->near_lossless;
+  const int near_lossless_strength =
+      enc->use_palette_ ? 100 : enc->config_->near_lossless;
 
-  VP8LResidualImage(width, height, pred_bits, low_effort, enc->argb_,
-                    enc->argb_scratch_, enc->transform_data_,
-                    near_lossless_strength, enc->config_->exact,
-                    used_subtract_green);
+  if (!VP8LResidualImage(
+          width, height, pred_bits, low_effort, enc->argb_, enc->argb_scratch_,
+          enc->transform_data_, near_lossless_strength, enc->config_->exact,
+          used_subtract_green, enc->pic_, percent_range / 2, percent)) {
+    return 0;
+  }
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
   VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
   assert(pred_bits >= 2);
@@ -1109,19 +1088,23 @@ static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
   return EncodeImageNoHuffman(
       bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
       (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
-      quality, low_effort);
+      quality, low_effort, enc->pic_, percent_range - percent_range / 2,
+      percent);
 }
 
-static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
-                                               int width, int height,
-                                               int quality, int low_effort,
-                                               VP8LBitWriter* const bw) {
+static int ApplyCrossColorFilter(const VP8LEncoder* const enc, int width,
+                                 int height, int quality, int low_effort,
+                                 VP8LBitWriter* const bw, int percent_range,
+                                 int* const percent) {
   const int ccolor_transform_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
   const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
 
-  VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
-                          enc->argb_, enc->transform_data_);
+  if (!VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
+                               enc->argb_, enc->transform_data_, enc->pic_,
+                               percent_range / 2, percent)) {
+    return 0;
+  }
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
   VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
   assert(ccolor_transform_bits >= 2);
@@ -1129,23 +1112,21 @@ static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
   return EncodeImageNoHuffman(
       bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
       (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
-      quality, low_effort);
+      quality, low_effort, enc->pic_, percent_range - percent_range / 2,
+      percent);
 }
 
 // -----------------------------------------------------------------------------
 
-static WebPEncodingError WriteRiffHeader(const WebPPicture* const pic,
-                                         size_t riff_size, size_t vp8l_size) {
+static int WriteRiffHeader(const WebPPicture* const pic, size_t riff_size,
+                           size_t vp8l_size) {
   uint8_t riff[RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + VP8L_SIGNATURE_SIZE] = {
     'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P',
     'V', 'P', '8', 'L', 0, 0, 0, 0, VP8L_MAGIC_BYTE,
   };
   PutLE32(riff + TAG_SIZE, (uint32_t)riff_size);
   PutLE32(riff + RIFF_HEADER_SIZE + TAG_SIZE, (uint32_t)vp8l_size);
-  if (!pic->writer(riff, sizeof(riff), pic)) {
-    return VP8_ENC_ERROR_BAD_WRITE;
-  }
-  return VP8_ENC_OK;
+  return pic->writer(riff, sizeof(riff), pic);
 }
 
 static int WriteImageSize(const WebPPicture* const pic,
@@ -1165,36 +1146,32 @@ static int WriteRealAlphaAndVersion(VP8LBitWriter* const bw, int has_alpha) {
   return !bw->error_;
 }
 
-static WebPEncodingError WriteImage(const WebPPicture* const pic,
-                                    VP8LBitWriter* const bw,
-                                    size_t* const coded_size) {
-  WebPEncodingError err = VP8_ENC_OK;
+static int WriteImage(const WebPPicture* const pic, VP8LBitWriter* const bw,
+                      size_t* const coded_size) {
   const uint8_t* const webpll_data = VP8LBitWriterFinish(bw);
   const size_t webpll_size = VP8LBitWriterNumBytes(bw);
   const size_t vp8l_size = VP8L_SIGNATURE_SIZE + webpll_size;
   const size_t pad = vp8l_size & 1;
   const size_t riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8l_size + pad;
+  *coded_size = 0;
 
-  err = WriteRiffHeader(pic, riff_size, vp8l_size);
-  if (err != VP8_ENC_OK) goto Error;
+  if (bw->error_) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
 
-  if (!pic->writer(webpll_data, webpll_size, pic)) {
-    err = VP8_ENC_ERROR_BAD_WRITE;
-    goto Error;
+  if (!WriteRiffHeader(pic, riff_size, vp8l_size) ||
+      !pic->writer(webpll_data, webpll_size, pic)) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_WRITE);
   }
 
   if (pad) {
     const uint8_t pad_byte[1] = { 0 };
     if (!pic->writer(pad_byte, 1, pic)) {
-      err = VP8_ENC_ERROR_BAD_WRITE;
-      goto Error;
+      return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_WRITE);
     }
   }
   *coded_size = CHUNK_HEADER_SIZE + riff_size;
-  return VP8_ENC_OK;
-
- Error:
-  return err;
+  return 1;
 }
 
 // -----------------------------------------------------------------------------
@@ -1210,36 +1187,32 @@ static void ClearTransformBuffer(VP8LEncoder* const enc) {
 // Flags influencing the memory allocated:
 //  enc->transform_bits_
 //  enc->use_predict_, enc->use_cross_color_
-static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
-                                                 int width, int height) {
-  WebPEncodingError err = VP8_ENC_OK;
-  const uint64_t image_size = width * height;
+static int AllocateTransformBuffer(VP8LEncoder* const enc, int width,
+                                   int height) {
+  const uint64_t image_size = (uint64_t)width * height;
   // VP8LResidualImage needs room for 2 scanlines of uint32 pixels with an extra
   // pixel in each, plus 2 regular scanlines of bytes.
   // TODO(skal): Clean up by using arithmetic in bytes instead of words.
   const uint64_t argb_scratch_size =
-      enc->use_predict_
-          ? (width + 1) * 2 +
-            (width * 2 + sizeof(uint32_t) - 1) / sizeof(uint32_t)
-          : 0;
+      enc->use_predict_ ? (width + 1) * 2 + (width * 2 + sizeof(uint32_t) - 1) /
+                                                sizeof(uint32_t)
+                        : 0;
   const uint64_t transform_data_size =
       (enc->use_predict_ || enc->use_cross_color_)
-          ? VP8LSubSampleSize(width, enc->transform_bits_) *
+          ? (uint64_t)VP8LSubSampleSize(width, enc->transform_bits_) *
                 VP8LSubSampleSize(height, enc->transform_bits_)
           : 0;
   const uint64_t max_alignment_in_words =
       (WEBP_ALIGN_CST + sizeof(uint32_t) - 1) / sizeof(uint32_t);
-  const uint64_t mem_size =
-      image_size + max_alignment_in_words +
-      argb_scratch_size + max_alignment_in_words +
-      transform_data_size;
+  const uint64_t mem_size = image_size + max_alignment_in_words +
+                            argb_scratch_size + max_alignment_in_words +
+                            transform_data_size;
   uint32_t* mem = enc->transform_mem_;
   if (mem == NULL || mem_size > enc->transform_mem_size_) {
     ClearTransformBuffer(enc);
     mem = (uint32_t*)WebPSafeMalloc(mem_size, sizeof(*mem));
     if (mem == NULL) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
+      return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
     }
     enc->transform_mem_ = mem;
     enc->transform_mem_size_ = (size_t)mem_size;
@@ -1252,19 +1225,16 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
   enc->transform_data_ = mem;
 
   enc->current_width_ = width;
- Error:
-  return err;
+  return 1;
 }
 
-static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
-  WebPEncodingError err = VP8_ENC_OK;
+static int MakeInputImageCopy(VP8LEncoder* const enc) {
   const WebPPicture* const picture = enc->pic_;
   const int width = picture->width;
   const int height = picture->height;
 
-  err = AllocateTransformBuffer(enc, width, height);
-  if (err != VP8_ENC_OK) return err;
-  if (enc->argb_content_ == kEncoderARGB) return VP8_ENC_OK;
+  if (!AllocateTransformBuffer(enc, width, height)) return 0;
+  if (enc->argb_content_ == kEncoderARGB) return 1;
 
   {
     uint32_t* dst = enc->argb_;
@@ -1278,27 +1248,11 @@ static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
   }
   enc->argb_content_ = kEncoderARGB;
   assert(enc->current_width_ == width);
-  return VP8_ENC_OK;
+  return 1;
 }
 
 // -----------------------------------------------------------------------------
 
-static WEBP_INLINE int SearchColorNoIdx(const uint32_t sorted[], uint32_t color,
-                                        int hi) {
-  int low = 0;
-  if (sorted[low] == color) return low;  // loop invariant: sorted[low] != color
-  while (1) {
-    const int mid = (low + hi) >> 1;
-    if (sorted[mid] == color) {
-      return mid;
-    } else if (sorted[mid] < color) {
-      low = mid;
-    } else {
-      hi = mid;
-    }
-  }
-}
-
 #define APPLY_PALETTE_GREEDY_MAX 4
 
 static WEBP_INLINE uint32_t SearchColorGreedy(const uint32_t palette[],
@@ -1333,17 +1287,6 @@ static WEBP_INLINE uint32_t ApplyPaletteHash2(uint32_t color) {
          (32 - PALETTE_INV_SIZE_BITS);
 }
 
-// Sort palette in increasing order and prepare an inverse mapping array.
-static void PrepareMapToPalette(const uint32_t palette[], int num_colors,
-                                uint32_t sorted[], uint32_t idx_map[]) {
-  int i;
-  memcpy(sorted, palette, num_colors * sizeof(*sorted));
-  qsort(sorted, num_colors, sizeof(*sorted), PaletteCompareColorsForQsort);
-  for (i = 0; i < num_colors; ++i) {
-    idx_map[SearchColorNoIdx(sorted, palette[i], num_colors)] = i;
-  }
-}
-
 // Use 1 pixel cache for ARGB pixels.
 #define APPLY_PALETTE_FOR(COLOR_INDEX) do {         \
   uint32_t prev_pix = palette[0];                   \
@@ -1367,16 +1310,18 @@ static void PrepareMapToPalette(const uint32_t palette[], int num_colors,
 // using 'row' as a temporary buffer of size 'width'.
 // We assume that all src[] values have a corresponding entry in the palette.
 // Note: src[] can be the same as dst[]
-static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
-                                      uint32_t* dst, uint32_t dst_stride,
-                                      const uint32_t* palette, int palette_size,
-                                      int width, int height, int xbits) {
+static int ApplyPalette(const uint32_t* src, uint32_t src_stride, uint32_t* dst,
+                        uint32_t dst_stride, const uint32_t* palette,
+                        int palette_size, int width, int height, int xbits,
+                        const WebPPicture* const pic) {
   // TODO(skal): this tmp buffer is not needed if VP8LBundleColorMap() can be
   // made to work in-place.
   uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
   int x, y;
 
-  if (tmp_row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+  if (tmp_row == NULL) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
 
   if (palette_size < APPLY_PALETTE_GREEDY_MAX) {
     APPLY_PALETTE_FOR(SearchColorGreedy(palette, palette_size, pix));
@@ -1421,7 +1366,7 @@ static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
     }
   }
   WebPSafeFree(tmp_row);
-  return VP8_ENC_OK;
+  return 1;
 }
 #undef APPLY_PALETTE_FOR
 #undef PALETTE_INV_SIZE_BITS
@@ -1429,9 +1374,7 @@ static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
 #undef APPLY_PALETTE_GREEDY_MAX
 
 // Note: Expects "enc->palette_" to be set properly.
-static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
-                                             int in_place) {
-  WebPEncodingError err = VP8_ENC_OK;
+static int MapImageFromPalette(VP8LEncoder* const enc, int in_place) {
   const WebPPicture* const pic = enc->pic_;
   const int width = pic->width;
   const int height = pic->height;
@@ -1449,19 +1392,22 @@ static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
     xbits = (palette_size <= 16) ? 1 : 0;
   }
 
-  err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
-  if (err != VP8_ENC_OK) return err;
-
-  err = ApplyPalette(src, src_stride,
+  if (!AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height)) {
+    return 0;
+  }
+  if (!ApplyPalette(src, src_stride,
                      enc->argb_, enc->current_width_,
-                     palette, palette_size, width, height, xbits);
+                     palette, palette_size, width, height, xbits, pic)) {
+    return 0;
+  }
   enc->argb_content_ = kEncoderPalette;
-  return err;
+  return 1;
 }
 
 // Save palette_[] to bitstream.
 static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
-                                       VP8LEncoder* const enc) {
+                                       VP8LEncoder* const enc,
+                                       int percent_range, int* const percent) {
   int i;
   uint32_t tmp_palette[MAX_PALETTE_SIZE];
   const int palette_size = enc->palette_size_;
@@ -1476,7 +1422,7 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
   tmp_palette[0] = palette[0];
   return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_,
                               &enc->refs_[0], palette_size, 1, /*quality=*/20,
-                              low_effort);
+                              low_effort, enc->pic_, percent_range, percent);
 }
 
 // -----------------------------------------------------------------------------
@@ -1516,11 +1462,9 @@ typedef struct {
   const WebPPicture* picture_;
   VP8LBitWriter* bw_;
   VP8LEncoder* enc_;
-  int use_cache_;
   CrunchConfig crunch_configs_[CRUNCH_CONFIGS_MAX];
   int num_crunch_configs_;
   int red_and_blue_always_zero_;
-  WebPEncodingError err_;
   WebPAuxStats* stats_;
 } StreamEncodeContext;
 
@@ -1530,14 +1474,12 @@ static int EncodeStreamHook(void* input, void* data2) {
   const WebPPicture* const picture = params->picture_;
   VP8LBitWriter* const bw = params->bw_;
   VP8LEncoder* const enc = params->enc_;
-  const int use_cache = params->use_cache_;
   const CrunchConfig* const crunch_configs = params->crunch_configs_;
   const int num_crunch_configs = params->num_crunch_configs_;
   const int red_and_blue_always_zero = params->red_and_blue_always_zero_;
 #if !defined(WEBP_DISABLE_STATS)
   WebPAuxStats* const stats = params->stats_;
 #endif
-  WebPEncodingError err = VP8_ENC_OK;
   const int quality = (int)config->quality;
   const int low_effort = (config->method == 0);
 #if (WEBP_NEAR_LOSSLESS == 1)
@@ -1545,6 +1487,7 @@ static int EncodeStreamHook(void* input, void* data2) {
 #endif
   const int height = picture->height;
   const size_t byte_position = VP8LBitWriterNumBytes(bw);
+  int percent = 2;  // for WebPProgressHook
 #if (WEBP_NEAR_LOSSLESS == 1)
   int use_near_lossless = 0;
 #endif
@@ -1558,12 +1501,13 @@ static int EncodeStreamHook(void* input, void* data2) {
 
   if (!VP8LBitWriterInit(&bw_best, 0) ||
       (num_crunch_configs > 1 && !VP8LBitWriterClone(bw, &bw_best))) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
   for (idx = 0; idx < num_crunch_configs; ++idx) {
     const int entropy_idx = crunch_configs[idx].entropy_idx_;
+    int remaining_percent = 97 / num_crunch_configs, percent_range;
     enc->use_palette_ =
         (entropy_idx == kPalette) || (entropy_idx == kPaletteAndSpatial);
     enc->use_subtract_green_ =
@@ -1571,7 +1515,8 @@ static int EncodeStreamHook(void* input, void* data2) {
     enc->use_predict_ = (entropy_idx == kSpatial) ||
                         (entropy_idx == kSpatialSubGreen) ||
                         (entropy_idx == kPaletteAndSpatial);
-    if (low_effort) {
+    // When using a palette, R/B==0, hence no need to test for cross-color.
+    if (low_effort || enc->use_palette_) {
       enc->use_cross_color_ = 0;
     } else {
       enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_;
@@ -1586,11 +1531,10 @@ static int EncodeStreamHook(void* input, void* data2) {
     use_near_lossless = (config->near_lossless < 100) && !enc->use_palette_ &&
                         !enc->use_predict_;
     if (use_near_lossless) {
-      err = AllocateTransformBuffer(enc, width, height);
-      if (err != VP8_ENC_OK) goto Error;
+      if (!AllocateTransformBuffer(enc, width, height)) goto Error;
       if ((enc->argb_content_ != kEncoderNearLossless) &&
           !VP8ApplyNearLossless(picture, config->near_lossless, enc->argb_)) {
-        err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+        WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
         goto Error;
       }
       enc->argb_content_ = kEncoderNearLossless;
@@ -1603,13 +1547,21 @@ static int EncodeStreamHook(void* input, void* data2) {
 
     // Encode palette
     if (enc->use_palette_) {
-      err = EncodePalette(bw, low_effort, enc);
-      if (err != VP8_ENC_OK) goto Error;
-      err = MapImageFromPalette(enc, use_delta_palette);
-      if (err != VP8_ENC_OK) goto Error;
+      if (!PaletteSort(crunch_configs[idx].palette_sorting_type_, enc->pic_,
+                       enc->palette_sorted_, enc->palette_size_,
+                       enc->palette_)) {
+        WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+        goto Error;
+      }
+      percent_range = remaining_percent / 4;
+      if (!EncodePalette(bw, low_effort, enc, percent_range, &percent)) {
+        goto Error;
+      }
+      remaining_percent -= percent_range;
+      if (!MapImageFromPalette(enc, use_delta_palette)) goto Error;
       // If using a color cache, do not have it bigger than the number of
       // colors.
-      if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
+      if (enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
         enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
       }
     }
@@ -1617,8 +1569,7 @@ static int EncodeStreamHook(void* input, void* data2) {
       // In case image is not packed.
       if (enc->argb_content_ != kEncoderNearLossless &&
           enc->argb_content_ != kEncoderPalette) {
-        err = MakeInputImageCopy(enc);
-        if (err != VP8_ENC_OK) goto Error;
+        if (!MakeInputImageCopy(enc)) goto Error;
       }
 
       // -----------------------------------------------------------------------
@@ -1629,15 +1580,22 @@ static int EncodeStreamHook(void* input, void* data2) {
       }
 
       if (enc->use_predict_) {
-        err = ApplyPredictFilter(enc, enc->current_width_, height, quality,
-                                 low_effort, enc->use_subtract_green_, bw);
-        if (err != VP8_ENC_OK) goto Error;
+        percent_range = remaining_percent / 3;
+        if (!ApplyPredictFilter(enc, enc->current_width_, height, quality,
+                                low_effort, enc->use_subtract_green_, bw,
+                                percent_range, &percent)) {
+          goto Error;
+        }
+        remaining_percent -= percent_range;
       }
 
       if (enc->use_cross_color_) {
-        err = ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
-                                    low_effort, bw);
-        if (err != VP8_ENC_OK) goto Error;
+        percent_range = remaining_percent / 2;
+        if (!ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
+                                   low_effort, bw, percent_range, &percent)) {
+          goto Error;
+        }
+        remaining_percent -= percent_range;
       }
     }
 
@@ -1645,12 +1603,13 @@ static int EncodeStreamHook(void* input, void* data2) {
 
     // -------------------------------------------------------------------------
     // Encode and write the transformed image.
-    err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
-                              enc->current_width_, height, quality, low_effort,
-                              use_cache, &crunch_configs[idx],
-                              &enc->cache_bits_, enc->histo_bits_,
-                              byte_position, &hdr_size, &data_size);
-    if (err != VP8_ENC_OK) goto Error;
+    if (!EncodeImageInternal(
+            bw, enc->argb_, &enc->hash_chain_, enc->refs_, enc->current_width_,
+            height, quality, low_effort, &crunch_configs[idx],
+            &enc->cache_bits_, enc->histo_bits_, byte_position, &hdr_size,
+            &data_size, picture, remaining_percent, &percent)) {
+      goto Error;
+    }
 
     // If we are better than what we already have.
     if (VP8LBitWriterNumBytes(bw) < best_size) {
@@ -1680,18 +1639,15 @@ static int EncodeStreamHook(void* input, void* data2) {
   }
   VP8LBitWriterSwap(&bw_best, bw);
 
-Error:
+ Error:
   VP8LBitWriterWipeOut(&bw_best);
-  params->err_ = err;
   // The hook should return false in case of error.
-  return (err == VP8_ENC_OK);
+  return (params->picture_->error_code == VP8_ENC_OK);
 }
 
-WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
-                                   const WebPPicture* const picture,
-                                   VP8LBitWriter* const bw_main,
-                                   int use_cache) {
-  WebPEncodingError err = VP8_ENC_OK;
+int VP8LEncodeStream(const WebPConfig* const config,
+                     const WebPPicture* const picture,
+                     VP8LBitWriter* const bw_main) {
   VP8LEncoder* const enc_main = VP8LEncoderNew(config, picture);
   VP8LEncoder* enc_side = NULL;
   CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX];
@@ -1703,15 +1659,25 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   // The main thread uses picture->stats, the side thread uses stats_side.
   WebPAuxStats stats_side;
   VP8LBitWriter bw_side;
+  WebPPicture picture_side;
   const WebPWorkerInterface* const worker_interface = WebPGetWorkerInterface();
   int ok_main;
 
+  if (enc_main == NULL || !VP8LBitWriterInit(&bw_side, 0)) {
+    VP8LEncoderDelete(enc_main);
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
+
+  // Avoid "garbage value" error from Clang's static analysis tool.
+  if (!WebPPictureInit(&picture_side)) {
+    goto Error;
+  }
+
   // Analyze image (entropy, num_palettes etc)
-  if (enc_main == NULL ||
-      !EncoderAnalyze(enc_main, crunch_configs, &num_crunch_configs_main,
+  if (!EncoderAnalyze(enc_main, crunch_configs, &num_crunch_configs_main,
                       &red_and_blue_always_zero) ||
-      !EncoderInit(enc_main) || !VP8LBitWriterInit(&bw_side, 0)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      !EncoderInit(enc_main)) {
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
@@ -1740,25 +1706,31 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
       StreamEncodeContext* const param =
           (idx == 0) ? &params_main : &params_side;
       param->config_ = config;
-      param->picture_ = picture;
-      param->use_cache_ = use_cache;
       param->red_and_blue_always_zero_ = red_and_blue_always_zero;
       if (idx == 0) {
+        param->picture_ = picture;
         param->stats_ = picture->stats;
         param->bw_ = bw_main;
         param->enc_ = enc_main;
       } else {
+        // Create a side picture (error_code is not thread-safe).
+        if (!WebPPictureView(picture, /*left=*/0, /*top=*/0, picture->width,
+                             picture->height, &picture_side)) {
+          assert(0);
+        }
+        picture_side.progress_hook = NULL;  // Progress hook is not thread-safe.
+        param->picture_ = &picture_side;  // No need to free a view afterwards.
         param->stats_ = (picture->stats == NULL) ? NULL : &stats_side;
         // Create a side bit writer.
         if (!VP8LBitWriterClone(bw_main, &bw_side)) {
-          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
           goto Error;
         }
         param->bw_ = &bw_side;
         // Create a side encoder.
-        enc_side = VP8LEncoderNew(config, picture);
+        enc_side = VP8LEncoderNew(config, &picture_side);
         if (enc_side == NULL || !EncoderInit(enc_side)) {
-          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
           goto Error;
         }
         // Copy the values that were computed for the main encoder.
@@ -1767,6 +1739,8 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
         enc_side->palette_size_ = enc_main->palette_size_;
         memcpy(enc_side->palette_, enc_main->palette_,
                sizeof(enc_main->palette_));
+        memcpy(enc_side->palette_sorted_, enc_main->palette_sorted_,
+               sizeof(enc_main->palette_sorted_));
         param->enc_ = enc_side;
       }
       // Create the workers.
@@ -1780,7 +1754,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   // Start the second thread if needed.
   if (num_crunch_configs_side != 0) {
     if (!worker_interface->Reset(&worker_side)) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
       goto Error;
     }
 #if !defined(WEBP_DISABLE_STATS)
@@ -1790,8 +1764,6 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
       memcpy(&stats_side, picture->stats, sizeof(stats_side));
     }
 #endif
-    // This line is only useful to remove a Clang static analyzer warning.
-    params_side.err_ = VP8_ENC_OK;
     worker_interface->Launch(&worker_side);
   }
   // Execute the main thread.
@@ -1803,7 +1775,10 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     const int ok_side = worker_interface->Sync(&worker_side);
     worker_interface->End(&worker_side);
     if (!ok_main || !ok_side) {
-      err = ok_main ? params_side.err_ : params_main.err_;
+      if (picture->error_code == VP8_ENC_OK) {
+        assert(picture_side.error_code != VP8_ENC_OK);
+        WebPEncodingSetError(picture, picture_side.error_code);
+      }
       goto Error;
     }
     if (VP8LBitWriterNumBytes(&bw_side) < VP8LBitWriterNumBytes(bw_main)) {
@@ -1814,18 +1789,13 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
       }
 #endif
     }
-  } else {
-    if (!ok_main) {
-      err = params_main.err_;
-      goto Error;
-    }
   }
 
-Error:
+ Error:
   VP8LBitWriterWipeOut(&bw_side);
   VP8LEncoderDelete(enc_main);
   VP8LEncoderDelete(enc_side);
-  return err;
+  return (picture->error_code == VP8_ENC_OK);
 }
 
 #undef CRUNCH_CONFIGS_MAX
@@ -1838,15 +1808,12 @@ int VP8LEncodeImage(const WebPConfig* const config,
   size_t coded_size;
   int percent = 0;
   int initial_size;
-  WebPEncodingError err = VP8_ENC_OK;
   VP8LBitWriter bw;
 
   if (picture == NULL) return 0;
 
   if (config == NULL || picture->argb == NULL) {
-    err = VP8_ENC_ERROR_NULL_PARAMETER;
-    WebPEncodingSetError(picture, err);
-    return 0;
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
   }
 
   width = picture->width;
@@ -1856,13 +1823,13 @@ int VP8LEncodeImage(const WebPConfig* const config,
   initial_size = (config->image_hint == WEBP_HINT_GRAPH) ?
       width * height : width * height * 2;
   if (!VP8LBitWriterInit(&bw, initial_size)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
   if (!WebPReportProgress(picture, 1, &percent)) {
  UserAbort:
-    err = VP8_ENC_ERROR_USER_ABORT;
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_USER_ABORT);
     goto Error;
   }
   // Reset stats (for pure lossless coding)
@@ -1878,28 +1845,26 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
   // Write image size.
   if (!WriteImageSize(picture, &bw)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
   has_alpha = WebPPictureHasTransparency(picture);
   // Write the non-trivial Alpha flag and lossless version.
   if (!WriteRealAlphaAndVersion(&bw, has_alpha)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
   }
 
-  if (!WebPReportProgress(picture, 5, &percent)) goto UserAbort;
+  if (!WebPReportProgress(picture, 2, &percent)) goto UserAbort;
 
   // Encode main image stream.
-  err = VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/);
-  if (err != VP8_ENC_OK) goto Error;
+  if (!VP8LEncodeStream(config, picture, &bw)) goto Error;
 
-  if (!WebPReportProgress(picture, 90, &percent)) goto UserAbort;
+  if (!WebPReportProgress(picture, 99, &percent)) goto UserAbort;
 
   // Finish the RIFF chunk.
-  err = WriteImage(picture, &bw, &coded_size);
-  if (err != VP8_ENC_OK) goto Error;
+  if (!WriteImage(picture, &bw, &coded_size)) goto Error;
 
   if (!WebPReportProgress(picture, 100, &percent)) goto UserAbort;
 
@@ -1918,13 +1883,11 @@ int VP8LEncodeImage(const WebPConfig* const config,
   }
 
  Error:
-  if (bw.error_) err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-  VP8LBitWriterWipeOut(&bw);
-  if (err != VP8_ENC_OK) {
-    WebPEncodingSetError(picture, err);
-    return 0;
+  if (bw.error_) {
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
-  return 1;
+  VP8LBitWriterWipeOut(&bw);
+  return (picture->error_code == VP8_ENC_OK);
 }
 
 //------------------------------------------------------------------------------
diff --git a/3rdparty/libwebp/src/enc/vp8li_enc.h b/3rdparty/libwebp/src/enc/vp8li_enc.h
index 94210ce9f3bd..c5b60dcb394d 100644
--- a/3rdparty/libwebp/src/enc/vp8li_enc.h
+++ b/3rdparty/libwebp/src/enc/vp8li_enc.h
@@ -69,6 +69,8 @@ typedef struct {
   int use_palette_;
   int palette_size_;
   uint32_t palette_[MAX_PALETTE_SIZE];
+  // Sorted version of palette_ for cache purposes.
+  uint32_t palette_sorted_[MAX_PALETTE_SIZE];
 
   // Some 'scratch' (potentially large) objects.
   struct VP8LBackwardRefs refs_[4];  // Backward Refs array for temporaries.
@@ -86,10 +88,9 @@ int VP8LEncodeImage(const WebPConfig* const config,
                     const WebPPicture* const picture);
 
 // Encodes the main image stream using the supplied bit writer.
-// If 'use_cache' is false, disables the use of color cache.
-WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
-                                   const WebPPicture* const picture,
-                                   VP8LBitWriter* const bw, int use_cache);
+// Returns false in case of error (stored in picture->error_code).
+int VP8LEncodeStream(const WebPConfig* const config,
+                     const WebPPicture* const picture, VP8LBitWriter* const bw);
 
 #if (WEBP_NEAR_LOSSLESS == 1)
 // in near_lossless.c
@@ -101,13 +102,18 @@ int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
 //------------------------------------------------------------------------------
 // Image transforms in predictor.c.
 
-void VP8LResidualImage(int width, int height, int bits, int low_effort,
-                       uint32_t* const argb, uint32_t* const argb_scratch,
-                       uint32_t* const image, int near_lossless, int exact,
-                       int used_subtract_green);
-
-void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
-                             uint32_t* const argb, uint32_t* image);
+// pic and percent are for progress.
+// Returns false in case of error (stored in pic->error_code).
+int VP8LResidualImage(int width, int height, int bits, int low_effort,
+                      uint32_t* const argb, uint32_t* const argb_scratch,
+                      uint32_t* const image, int near_lossless, int exact,
+                      int used_subtract_green, const WebPPicture* const pic,
+                      int percent_range, int* const percent);
+
+int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+                            uint32_t* const argb, uint32_t* image,
+                            const WebPPicture* const pic, int percent_range,
+                            int* const percent);
 
 //------------------------------------------------------------------------------
 
diff --git a/3rdparty/libwebp/src/enc/webp_enc.c b/3rdparty/libwebp/src/enc/webp_enc.c
index ce2db2e94bcf..583fe6a8bbd6 100644
--- a/3rdparty/libwebp/src/enc/webp_enc.c
+++ b/3rdparty/libwebp/src/enc/webp_enc.c
@@ -307,7 +307,10 @@ int WebPEncodingSetError(const WebPPicture* const pic,
                          WebPEncodingError error) {
   assert((int)error < VP8_ENC_ERROR_LAST);
   assert((int)error >= VP8_ENC_OK);
-  ((WebPPicture*)pic)->error_code = error;
+  // The oldest error reported takes precedence over the new one.
+  if (pic->error_code == VP8_ENC_OK) {
+    ((WebPPicture*)pic)->error_code = error;
+  }
   return 0;
 }
 
@@ -317,8 +320,7 @@ int WebPReportProgress(const WebPPicture* const pic,
     *percent_store = percent;
     if (pic->progress_hook && !pic->progress_hook(percent, pic)) {
       // user abort requested
-      WebPEncodingSetError(pic, VP8_ENC_ERROR_USER_ABORT);
-      return 0;
+      return WebPEncodingSetError(pic, VP8_ENC_ERROR_USER_ABORT);
     }
   }
   return 1;  // ok
@@ -329,16 +331,14 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   int ok = 0;
   if (pic == NULL) return 0;
 
-  WebPEncodingSetError(pic, VP8_ENC_OK);  // all ok so far
+  pic->error_code = VP8_ENC_OK;  // all ok so far
   if (config == NULL) {  // bad params
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
   }
   if (!WebPValidateConfig(config)) {
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
   }
-  if (pic->width <= 0 || pic->height <= 0) {
-    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
-  }
+  if (!WebPValidatePicture(pic)) return 0;
   if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) {
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
   }
diff --git a/3rdparty/libwebp/src/mux/anim_encode.c b/3rdparty/libwebp/src/mux/anim_encode.c
index 7be99068f687..31bd0457bf7d 100644
--- a/3rdparty/libwebp/src/mux/anim_encode.c
+++ b/3rdparty/libwebp/src/mux/anim_encode.c
@@ -22,6 +22,7 @@
 #include "src/webp/encode.h"
 #include "src/webp/format_constants.h"
 #include "src/webp/mux.h"
+#include "src/webp/types.h"
 
 #if defined(_MSC_VER) && _MSC_VER < 1900
 #define snprintf _snprintf
@@ -248,9 +249,6 @@ WebPAnimEncoder* WebPAnimEncoderNewInternal(
 
   enc = (WebPAnimEncoder*)WebPSafeCalloc(1, sizeof(*enc));
   if (enc == NULL) return NULL;
-  // sanity inits, so we can call WebPAnimEncoderDelete():
-  enc->encoded_frames_ = NULL;
-  enc->mux_ = NULL;
   MarkNoError(enc);
 
   // Dimensions and options.
@@ -421,7 +419,7 @@ static void MinimizeChangeRectangle(const WebPPicture* const src,
   const int max_allowed_diff_lossy = QualityToMaxDiff(quality);
   const int max_allowed_diff = is_lossless ? 0 : max_allowed_diff_lossy;
 
-  // Sanity checks.
+  // Assumption/correctness checks.
   assert(src->width == dst->width && src->height == dst->height);
   assert(rect->x_offset_ + rect->width_ <= dst->width);
   assert(rect->y_offset_ + rect->height_ <= dst->height);
@@ -596,16 +594,17 @@ int WebPAnimEncoderRefineRect(
     int is_lossless, float quality, int* const x_offset, int* const y_offset,
     int* const width, int* const height) {
   FrameRectangle rect;
-  const int right = clip(*x_offset + *width, 0, curr_canvas->width);
-  const int left = clip(*x_offset, 0, curr_canvas->width - 1);
-  const int bottom = clip(*y_offset + *height, 0, curr_canvas->height);
-  const int top = clip(*y_offset, 0, curr_canvas->height - 1);
+  int right, left, bottom, top;
   if (prev_canvas == NULL || curr_canvas == NULL ||
       prev_canvas->width != curr_canvas->width ||
       prev_canvas->height != curr_canvas->height ||
       !prev_canvas->use_argb || !curr_canvas->use_argb) {
     return 0;
   }
+  right = clip(*x_offset + *width, 0, curr_canvas->width);
+  left = clip(*x_offset, 0, curr_canvas->width - 1);
+  bottom = clip(*y_offset + *height, 0, curr_canvas->height);
+  top = clip(*y_offset, 0, curr_canvas->height - 1);
   rect.x_offset_ = left;
   rect.y_offset_ = top;
   rect.width_ = clip(right - left, 0, curr_canvas->width - rect.x_offset_);
@@ -949,7 +948,8 @@ static int IncreasePreviousDuration(WebPAnimEncoder* const enc, int duration) {
   int new_duration;
 
   assert(enc->count_ >= 1);
-  assert(prev_enc_frame->sub_frame_.duration ==
+  assert(!prev_enc_frame->is_key_frame_ ||
+         prev_enc_frame->sub_frame_.duration ==
          prev_enc_frame->key_frame_.duration);
   assert(prev_enc_frame->sub_frame_.duration ==
          (prev_enc_frame->sub_frame_.duration & (MAX_DURATION - 1)));
@@ -966,7 +966,7 @@ static int IncreasePreviousDuration(WebPAnimEncoder* const enc, int duration) {
       0x10, 0x88, 0x88, 0x08
     };
     const WebPData lossless_1x1 = {
-        lossless_1x1_bytes, sizeof(lossless_1x1_bytes)
+      lossless_1x1_bytes, sizeof(lossless_1x1_bytes)
     };
     const uint8_t lossy_1x1_bytes[] = {
       0x52, 0x49, 0x46, 0x46, 0x40, 0x00, 0x00, 0x00, 0x57, 0x45, 0x42, 0x50,
@@ -1358,6 +1358,12 @@ int WebPAnimEncoderAdd(WebPAnimEncoder* enc, WebPPicture* frame, int timestamp,
     if (!IncreasePreviousDuration(enc, (int)prev_frame_duration)) {
       return 0;
     }
+    // IncreasePreviousDuration() may add a frame to avoid exceeding
+    // MAX_DURATION which could cause CacheFrame() to over read encoded_frames_
+    // before the next flush.
+    if (enc->count_ == enc->size_ && !FlushFrames(enc)) {
+      return 0;
+    }
   } else {
     enc->first_timestamp_ = timestamp;
   }
@@ -1393,7 +1399,10 @@ int WebPAnimEncoderAdd(WebPAnimEncoder* enc, WebPPicture* frame, int timestamp,
     }
     config = *encoder_config;
   } else {
-    WebPConfigInit(&config);
+    if (!WebPConfigInit(&config)) {
+      MarkError(enc, "Cannot Init config");
+      return 0;
+    }
     config.lossless = 1;
   }
   assert(enc->curr_canvas_ == NULL);
@@ -1414,12 +1423,14 @@ int WebPAnimEncoderAdd(WebPAnimEncoder* enc, WebPPicture* frame, int timestamp,
 // -----------------------------------------------------------------------------
 // Bitstream assembly.
 
-static int DecodeFrameOntoCanvas(const WebPMuxFrameInfo* const frame,
-                                 WebPPicture* const canvas) {
+WEBP_NODISCARD static int DecodeFrameOntoCanvas(
+    const WebPMuxFrameInfo* const frame, WebPPicture* const canvas) {
   const WebPData* const image = &frame->bitstream;
   WebPPicture sub_image;
   WebPDecoderConfig config;
-  WebPInitDecoderConfig(&config);
+  if (!WebPInitDecoderConfig(&config)) {
+    return 0;
+  }
   WebPUtilClearPic(canvas, NULL);
   if (WebPGetFeatures(image->bytes, image->size, &config.input) !=
       VP8_STATUS_OK) {
@@ -1578,4 +1589,23 @@ const char* WebPAnimEncoderGetError(WebPAnimEncoder* enc) {
   return enc->error_str_;
 }
 
+WebPMuxError WebPAnimEncoderSetChunk(
+    WebPAnimEncoder* enc, const char fourcc[4], const WebPData* chunk_data,
+    int copy_data) {
+  if (enc == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+  return WebPMuxSetChunk(enc->mux_, fourcc, chunk_data, copy_data);
+}
+
+WebPMuxError WebPAnimEncoderGetChunk(
+    const WebPAnimEncoder* enc, const char fourcc[4], WebPData* chunk_data) {
+  if (enc == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+  return WebPMuxGetChunk(enc->mux_, fourcc, chunk_data);
+}
+
+WebPMuxError WebPAnimEncoderDeleteChunk(
+    WebPAnimEncoder* enc, const char fourcc[4]) {
+  if (enc == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+  return WebPMuxDeleteChunk(enc->mux_, fourcc);
+}
+
 // -----------------------------------------------------------------------------
diff --git a/3rdparty/libwebp/src/mux/muxedit.c b/3rdparty/libwebp/src/mux/muxedit.c
index ccf14b2a0c51..48c6834a4dd6 100644
--- a/3rdparty/libwebp/src/mux/muxedit.c
+++ b/3rdparty/libwebp/src/mux/muxedit.c
@@ -66,13 +66,16 @@ void WebPMuxDelete(WebPMux* mux) {
 
 // Handy MACRO, makes MuxSet() very symmetric to MuxGet().
 #define SWITCH_ID_LIST(INDEX, LIST)                                            \
-  if (idx == (INDEX)) {                                                        \
-    err = ChunkAssignData(&chunk, data, copy_data, tag);                       \
-    if (err == WEBP_MUX_OK) {                                                  \
-      err = ChunkSetHead(&chunk, (LIST));                                      \
+  do {                                                                         \
+    if (idx == (INDEX)) {                                                      \
+      err = ChunkAssignData(&chunk, data, copy_data, tag);                     \
+      if (err == WEBP_MUX_OK) {                                                \
+        err = ChunkSetHead(&chunk, (LIST));                                    \
+        if (err != WEBP_MUX_OK) ChunkRelease(&chunk);                          \
+      }                                                                        \
+      return err;                                                              \
     }                                                                          \
-    return err;                                                                \
-  }
+  } while (0)
 
 static WebPMuxError MuxSet(WebPMux* const mux, uint32_t tag,
                            const WebPData* const data, int copy_data) {
@@ -235,7 +238,6 @@ WebPMuxError WebPMuxSetImage(WebPMux* mux, const WebPData* bitstream,
   WebPMuxImage wpi;
   WebPMuxError err;
 
-  // Sanity checks.
   if (mux == NULL || bitstream == NULL || bitstream->bytes == NULL ||
       bitstream->size > MAX_CHUNK_PAYLOAD) {
     return WEBP_MUX_INVALID_ARGUMENT;
@@ -267,7 +269,6 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* info,
   WebPMuxImage wpi;
   WebPMuxError err;
 
-  // Sanity checks.
   if (mux == NULL || info == NULL) return WEBP_MUX_INVALID_ARGUMENT;
 
   if (info->id != WEBP_CHUNK_ANMF) return WEBP_MUX_INVALID_ARGUMENT;
@@ -556,7 +557,8 @@ static WebPMuxError MuxCleanup(WebPMux* const mux) {
   if (num_frames == 1) {
     WebPMuxImage* frame = NULL;
     err = MuxImageGetNth((const WebPMuxImage**)&mux->images_, 1, &frame);
-    assert(err == WEBP_MUX_OK);  // We know that one frame does exist.
+    if (err != WEBP_MUX_OK) return err;
+    // We know that one frame does exist.
     assert(frame != NULL);
     if (frame->header_ != NULL &&
         ((mux->canvas_width_ == 0 && mux->canvas_height_ == 0) ||
diff --git a/3rdparty/libwebp/src/mux/muxi.h b/3rdparty/libwebp/src/mux/muxi.h
index 2289822e8f88..74ae3fac12bb 100644
--- a/3rdparty/libwebp/src/mux/muxi.h
+++ b/3rdparty/libwebp/src/mux/muxi.h
@@ -28,7 +28,7 @@ extern "C" {
 // Defines and constants.
 
 #define MUX_MAJ_VERSION 1
-#define MUX_MIN_VERSION 2
+#define MUX_MIN_VERSION 4
 #define MUX_REV_VERSION 0
 
 // Chunk object.
diff --git a/3rdparty/libwebp/src/mux/muxinternal.c b/3rdparty/libwebp/src/mux/muxinternal.c
index b9ee6717d3a4..75b6b416b993 100644
--- a/3rdparty/libwebp/src/mux/muxinternal.c
+++ b/3rdparty/libwebp/src/mux/muxinternal.c
@@ -155,17 +155,18 @@ WebPMuxError ChunkSetHead(WebPChunk* const chunk,
 
 WebPMuxError ChunkAppend(WebPChunk* const chunk,
                          WebPChunk*** const chunk_list) {
+  WebPMuxError err;
   assert(chunk_list != NULL && *chunk_list != NULL);
 
   if (**chunk_list == NULL) {
-    ChunkSetHead(chunk, *chunk_list);
+    err = ChunkSetHead(chunk, *chunk_list);
   } else {
     WebPChunk* last_chunk = **chunk_list;
     while (last_chunk->next_ != NULL) last_chunk = last_chunk->next_;
-    ChunkSetHead(chunk, &last_chunk->next_);
-    *chunk_list = &last_chunk->next_;
+    err = ChunkSetHead(chunk, &last_chunk->next_);
+    if (err == WEBP_MUX_OK) *chunk_list = &last_chunk->next_;
   }
-  return WEBP_MUX_OK;
+  return err;
 }
 
 //------------------------------------------------------------------------------
diff --git a/3rdparty/libwebp/src/mux/muxread.c b/3rdparty/libwebp/src/mux/muxread.c
index 0101fde15da0..afd3542e1241 100644
--- a/3rdparty/libwebp/src/mux/muxread.c
+++ b/3rdparty/libwebp/src/mux/muxread.c
@@ -21,20 +21,23 @@
 
 // Handy MACRO.
 #define SWITCH_ID_LIST(INDEX, LIST)                                           \
-  if (idx == (INDEX)) {                                                       \
-    const WebPChunk* const chunk = ChunkSearchList((LIST), nth,               \
-                                                   kChunks[(INDEX)].tag);     \
-    if (chunk) {                                                              \
-      *data = chunk->data_;                                                   \
-      return WEBP_MUX_OK;                                                     \
-    } else {                                                                  \
-      return WEBP_MUX_NOT_FOUND;                                              \
+  do {                                                                        \
+    if (idx == (INDEX)) {                                                     \
+      const WebPChunk* const chunk = ChunkSearchList((LIST), nth,             \
+                                                     kChunks[(INDEX)].tag);   \
+      if (chunk) {                                                            \
+        *data = chunk->data_;                                                 \
+        return WEBP_MUX_OK;                                                   \
+      } else {                                                                \
+        return WEBP_MUX_NOT_FOUND;                                            \
+      }                                                                       \
     }                                                                         \
-  }
+  } while (0)
 
 static WebPMuxError MuxGet(const WebPMux* const mux, CHUNK_INDEX idx,
                            uint32_t nth, WebPData* const data) {
   assert(mux != NULL);
+  assert(idx != IDX_LAST_CHUNK);
   assert(!IsWPI(kChunks[idx].id));
   WebPDataInit(data);
 
@@ -56,7 +59,7 @@ static WebPMuxError ChunkVerifyAndAssign(WebPChunk* chunk,
   uint32_t chunk_size;
   WebPData chunk_data;
 
-  // Sanity checks.
+  // Correctness checks.
   if (data_size < CHUNK_HEADER_SIZE) return WEBP_MUX_NOT_ENOUGH_DATA;
   chunk_size = GetLE32(data + TAG_SIZE);
   if (chunk_size > MAX_CHUNK_PAYLOAD) return WEBP_MUX_BAD_DATA;
@@ -116,9 +119,12 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data,
     // Each of ANMF chunk contain a header at the beginning. So, its size should
     // be at least 'hdr_size'.
     if (size < hdr_size) goto Fail;
-    ChunkAssignData(&subchunk, &temp, copy_data, chunk->tag_);
+    if (ChunkAssignData(&subchunk, &temp, copy_data,
+                        chunk->tag_) != WEBP_MUX_OK) {
+      goto Fail;
+    }
   }
-  ChunkSetHead(&subchunk, &wpi->header_);
+  if (ChunkSetHead(&subchunk, &wpi->header_) != WEBP_MUX_OK) goto Fail;
   wpi->is_partial_ = 1;  // Waiting for ALPH and/or VP8/VP8L chunks.
 
   // Rest of the chunks.
@@ -186,7 +192,6 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
   WebPChunk** chunk_list_ends[WEBP_CHUNK_NIL + 1] = { NULL };
   ChunkInit(&chunk);
 
-  // Sanity checks.
   if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_MUX_ABI_VERSION)) {
     return NULL;  // version mismatch
   }
@@ -427,6 +432,7 @@ WebPMuxError WebPMuxGetChunk(const WebPMux* mux, const char fourcc[4],
     return WEBP_MUX_INVALID_ARGUMENT;
   }
   idx = ChunkGetIndexFromFourCC(fourcc);
+  assert(idx != IDX_LAST_CHUNK);
   if (IsWPI(kChunks[idx].id)) {     // An image chunk.
     return WEBP_MUX_INVALID_ARGUMENT;
   } else if (idx != IDX_UNKNOWN) {  // A known chunk type.
@@ -481,7 +487,6 @@ WebPMuxError WebPMuxGetFrame(
   WebPMuxError err;
   WebPMuxImage* wpi;
 
-  // Sanity checks.
   if (mux == NULL || frame == NULL) {
     return WEBP_MUX_INVALID_ARGUMENT;
   }
diff --git a/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h b/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h
index 46b38807062c..24f3af7b5454 100644
--- a/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h
+++ b/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h
@@ -55,7 +55,7 @@ void VP8LoadFinalBytes(VP8BitReader* const br);
 
 // makes sure br->value_ has at least BITS bits worth of data
 static WEBP_UBSAN_IGNORE_UNDEF WEBP_INLINE
-void VP8LoadNewBytes(VP8BitReader* const br) {
+void VP8LoadNewBytes(VP8BitReader* WEBP_RESTRICT const br) {
   assert(br != NULL && br->buf_ != NULL);
   // Read 'BITS' bits at a time if possible.
   if (br->buf_ < br->buf_max_) {
@@ -104,7 +104,7 @@ void VP8LoadNewBytes(VP8BitReader* const br) {
 }
 
 // Read a bit with proba 'prob'. Speed-critical function!
-static WEBP_INLINE int VP8GetBit(VP8BitReader* const br,
+static WEBP_INLINE int VP8GetBit(VP8BitReader* WEBP_RESTRICT const br,
                                  int prob, const char label[]) {
   // Don't move this declaration! It makes a big speed difference to store
   // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
@@ -137,7 +137,8 @@ static WEBP_INLINE int VP8GetBit(VP8BitReader* const br,
 
 // simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here)
 static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
-int VP8GetSigned(VP8BitReader* const br, int v, const char label[]) {
+int VP8GetSigned(VP8BitReader* WEBP_RESTRICT const br, int v,
+                 const char label[]) {
   if (br->bits_ < 0) {
     VP8LoadNewBytes(br);
   }
@@ -147,15 +148,15 @@ int VP8GetSigned(VP8BitReader* const br, int v, const char label[]) {
     const range_t value = (range_t)(br->value_ >> pos);
     const int32_t mask = (int32_t)(split - value) >> 31;  // -1 or 0
     br->bits_ -= 1;
-    br->range_ += mask;
+    br->range_ += (range_t)mask;
     br->range_ |= 1;
-    br->value_ -= (bit_t)((split + 1) & mask) << pos;
+    br->value_ -= (bit_t)((split + 1) & (uint32_t)mask) << pos;
     BT_TRACK(br);
     return (v ^ mask) - mask;
   }
 }
 
-static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br,
+static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* WEBP_RESTRICT const br,
                                     int prob, const char label[]) {
   // Don't move this declaration! It makes a big speed difference to store
   // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
diff --git a/3rdparty/libwebp/src/utils/bit_reader_utils.c b/3rdparty/libwebp/src/utils/bit_reader_utils.c
index 857cd6098882..a26557aa49f9 100644
--- a/3rdparty/libwebp/src/utils/bit_reader_utils.c
+++ b/3rdparty/libwebp/src/utils/bit_reader_utils.c
@@ -15,6 +15,7 @@
 #include "src/webp/config.h"
 #endif
 
+#include "src/dsp/cpu.h"
 #include "src/utils/bit_reader_inl_utils.h"
 #include "src/utils/utils.h"
 
@@ -121,7 +122,7 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits,
 
 #define VP8L_LOG8_WBITS 4  // Number of bytes needed to store VP8L_WBITS bits.
 
-#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
+#if defined(__arm__) || defined(_M_ARM) || WEBP_AARCH64 || \
     defined(__i386__) || defined(_M_IX86) || \
     defined(__x86_64__) || defined(_M_X64)
 #define VP8L_USE_FAST_LOAD
diff --git a/3rdparty/libwebp/src/utils/bit_reader_utils.h b/3rdparty/libwebp/src/utils/bit_reader_utils.h
index e64156e31817..25ff31e5d97a 100644
--- a/3rdparty/libwebp/src/utils/bit_reader_utils.h
+++ b/3rdparty/libwebp/src/utils/bit_reader_utils.h
@@ -19,6 +19,7 @@
 #ifdef _MSC_VER
 #include <stdlib.h>  // _byteswap_ulong
 #endif
+#include "src/dsp/cpu.h"
 #include "src/webp/types.h"
 
 // Warning! This macro triggers quite some MACRO wizardry around func signature!
@@ -64,7 +65,7 @@ extern "C" {
 #define BITS 56
 #elif defined(__arm__) || defined(_M_ARM)      // ARM
 #define BITS 24
-#elif defined(__aarch64__)                     // ARM 64bit
+#elif WEBP_AARCH64                             // ARM 64bit
 #define BITS 56
 #elif defined(__mips__)                        // MIPS
 #define BITS 24
diff --git a/3rdparty/libwebp/src/utils/bit_writer_utils.c b/3rdparty/libwebp/src/utils/bit_writer_utils.c
index bef0e31ca5ea..2f408508f114 100644
--- a/3rdparty/libwebp/src/utils/bit_writer_utils.c
+++ b/3rdparty/libwebp/src/utils/bit_writer_utils.c
@@ -278,7 +278,7 @@ void VP8LPutBitsFlushBits(VP8LBitWriter* const bw) {
   // If needed, make some room by flushing some bits out.
   if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
     const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
-    if (extra_size != (size_t)extra_size ||
+    if (!CheckSizeOverflow(extra_size) ||
         !VP8LBitWriterResize(bw, (size_t)extra_size)) {
       bw->cur_ = bw->buf_;
       bw->error_ = 1;
@@ -314,7 +314,7 @@ void VP8LPutBitsInternal(VP8LBitWriter* const bw, uint32_t bits, int n_bits) {
     while (used >= VP8L_WRITER_BITS) {
       if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
         const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
-        if (extra_size != (size_t)extra_size ||
+        if (!CheckSizeOverflow(extra_size) ||
             !VP8LBitWriterResize(bw, (size_t)extra_size)) {
           bw->cur_ = bw->buf_;
           bw->error_ = 1;
diff --git a/3rdparty/libwebp/src/utils/color_cache_utils.c b/3rdparty/libwebp/src/utils/color_cache_utils.c
index b09f538e8be6..7b5222b6e554 100644
--- a/3rdparty/libwebp/src/utils/color_cache_utils.c
+++ b/3rdparty/libwebp/src/utils/color_cache_utils.c
@@ -20,22 +20,22 @@
 //------------------------------------------------------------------------------
 // VP8LColorCache.
 
-int VP8LColorCacheInit(VP8LColorCache* const cc, int hash_bits) {
+int VP8LColorCacheInit(VP8LColorCache* const color_cache, int hash_bits) {
   const int hash_size = 1 << hash_bits;
-  assert(cc != NULL);
+  assert(color_cache != NULL);
   assert(hash_bits > 0);
-  cc->colors_ = (uint32_t*)WebPSafeCalloc((uint64_t)hash_size,
-                                          sizeof(*cc->colors_));
-  if (cc->colors_ == NULL) return 0;
-  cc->hash_shift_ = 32 - hash_bits;
-  cc->hash_bits_ = hash_bits;
+  color_cache->colors_ = (uint32_t*)WebPSafeCalloc(
+      (uint64_t)hash_size, sizeof(*color_cache->colors_));
+  if (color_cache->colors_ == NULL) return 0;
+  color_cache->hash_shift_ = 32 - hash_bits;
+  color_cache->hash_bits_ = hash_bits;
   return 1;
 }
 
-void VP8LColorCacheClear(VP8LColorCache* const cc) {
-  if (cc != NULL) {
-    WebPSafeFree(cc->colors_);
-    cc->colors_ = NULL;
+void VP8LColorCacheClear(VP8LColorCache* const color_cache) {
+  if (color_cache != NULL) {
+    WebPSafeFree(color_cache->colors_);
+    color_cache->colors_ = NULL;
   }
 }
 
diff --git a/3rdparty/libwebp/src/utils/huffman_encode_utils.c b/3rdparty/libwebp/src/utils/huffman_encode_utils.c
index 6f3b1bbe020f..585db9195184 100644
--- a/3rdparty/libwebp/src/utils/huffman_encode_utils.c
+++ b/3rdparty/libwebp/src/utils/huffman_encode_utils.c
@@ -161,7 +161,7 @@ static void SetBitDepths(const HuffmanTree* const tree,
 // especially when population counts are longer than 2**tree_limit, but
 // we are not planning to use this with extremely long blocks.
 //
-// See http://en.wikipedia.org/wiki/Huffman_coding
+// See https://en.wikipedia.org/wiki/Huffman_coding
 static void GenerateOptimalTree(const uint32_t* const histogram,
                                 int histogram_size,
                                 HuffmanTree* tree, int tree_depth_limit,
@@ -404,8 +404,7 @@ static void ConvertBitDepthsToSymbols(HuffmanTreeCode* const tree) {
 // Main entry point
 
 void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
-                           uint8_t* const buf_rle,
-                           HuffmanTree* const huff_tree,
+                           uint8_t* const buf_rle, HuffmanTree* const huff_tree,
                            HuffmanTreeCode* const huff_code) {
   const int num_symbols = huff_code->num_symbols;
   memset(buf_rle, 0, num_symbols * sizeof(*buf_rle));
diff --git a/3rdparty/libwebp/src/utils/huffman_encode_utils.h b/3rdparty/libwebp/src/utils/huffman_encode_utils.h
index 3e6763ce49db..3f7f1d8074c2 100644
--- a/3rdparty/libwebp/src/utils/huffman_encode_utils.h
+++ b/3rdparty/libwebp/src/utils/huffman_encode_utils.h
@@ -51,7 +51,7 @@ int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
 // huffman code tree.
 void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
                            uint8_t* const buf_rle, HuffmanTree* const huff_tree,
-                           HuffmanTreeCode* const tree);
+                           HuffmanTreeCode* const huff_code);
 
 #ifdef __cplusplus
 }
diff --git a/3rdparty/libwebp/src/utils/huffman_utils.c b/3rdparty/libwebp/src/utils/huffman_utils.c
index 0cba0fbb7d4f..16f9faaa9a07 100644
--- a/3rdparty/libwebp/src/utils/huffman_utils.c
+++ b/3rdparty/libwebp/src/utils/huffman_utils.c
@@ -122,6 +122,9 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
     const int symbol_code_length = code_lengths[symbol];
     if (code_lengths[symbol] > 0) {
       if (sorted != NULL) {
+        if(offset[symbol_code_length] >= code_lengths_size) {
+            return 0;
+        }
         sorted[offset[symbol_code_length]++] = symbol;
       } else {
         offset[symbol_code_length]++;
@@ -142,7 +145,7 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
 
   {
     int step;              // step size to replicate values in current table
-    uint32_t low = -1;     // low bits for current root entry
+    uint32_t low = 0xffffffffu;        // low bits for current root entry
     uint32_t mask = total_size - 1;    // mask for low bits
     uint32_t key = 0;      // reversed prefix code
     int num_nodes = 1;     // number of Huffman tree nodes
@@ -177,21 +180,24 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
       if (num_open < 0) {
         return 0;
       }
-      if (root_table == NULL) continue;
       for (; count[len] > 0; --count[len]) {
         HuffmanCode code;
         if ((key & mask) != low) {
-          table += table_size;
+          if (root_table != NULL) table += table_size;
           table_bits = NextTableBitSize(count, len, root_bits);
           table_size = 1 << table_bits;
           total_size += table_size;
           low = key & mask;
-          root_table[low].bits = (uint8_t)(table_bits + root_bits);
-          root_table[low].value = (uint16_t)((table - root_table) - low);
+          if (root_table != NULL) {
+            root_table[low].bits = (uint8_t)(table_bits + root_bits);
+            root_table[low].value = (uint16_t)((table - root_table) - low);
+          }
+        }
+        if (root_table != NULL) {
+          code.bits = (uint8_t)(len - root_bits);
+          code.value = (uint16_t)sorted[symbol++];
+          ReplicateValue(&table[key >> root_bits], step, table_size, code);
         }
-        code.bits = (uint8_t)(len - root_bits);
-        code.value = (uint16_t)sorted[symbol++];
-        ReplicateValue(&table[key >> root_bits], step, table_size, code);
         key = GetNextKey(key, len);
       }
     }
@@ -211,25 +217,83 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
   ((1 << MAX_CACHE_BITS) + NUM_LITERAL_CODES + NUM_LENGTH_CODES)
 // Cut-off value for switching between heap and stack allocation.
 #define SORTED_SIZE_CUTOFF 512
-int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
+int VP8LBuildHuffmanTable(HuffmanTables* const root_table, int root_bits,
                           const int code_lengths[], int code_lengths_size) {
-  int total_size;
+  const int total_size =
+      BuildHuffmanTable(NULL, root_bits, code_lengths, code_lengths_size, NULL);
   assert(code_lengths_size <= MAX_CODE_LENGTHS_SIZE);
-  if (root_table == NULL) {
-    total_size = BuildHuffmanTable(NULL, root_bits,
-                                   code_lengths, code_lengths_size, NULL);
-  } else if (code_lengths_size <= SORTED_SIZE_CUTOFF) {
+  if (total_size == 0 || root_table == NULL) return total_size;
+
+  if (root_table->curr_segment->curr_table + total_size >=
+      root_table->curr_segment->start + root_table->curr_segment->size) {
+    // If 'root_table' does not have enough memory, allocate a new segment.
+    // The available part of root_table->curr_segment is left unused because we
+    // need a contiguous buffer.
+    const int segment_size = root_table->curr_segment->size;
+    struct HuffmanTablesSegment* next =
+        (HuffmanTablesSegment*)WebPSafeMalloc(1, sizeof(*next));
+    if (next == NULL) return 0;
+    // Fill the new segment.
+    // We need at least 'total_size' but if that value is small, it is better to
+    // allocate a big chunk to prevent more allocations later. 'segment_size' is
+    // therefore chosen (any other arbitrary value could be chosen).
+    next->size = total_size > segment_size ? total_size : segment_size;
+    next->start =
+        (HuffmanCode*)WebPSafeMalloc(next->size, sizeof(*next->start));
+    if (next->start == NULL) {
+      WebPSafeFree(next);
+      return 0;
+    }
+    next->curr_table = next->start;
+    next->next = NULL;
+    // Point to the new segment.
+    root_table->curr_segment->next = next;
+    root_table->curr_segment = next;
+  }
+  if (code_lengths_size <= SORTED_SIZE_CUTOFF) {
     // use local stack-allocated array.
     uint16_t sorted[SORTED_SIZE_CUTOFF];
-    total_size = BuildHuffmanTable(root_table, root_bits,
-                                   code_lengths, code_lengths_size, sorted);
-  } else {   // rare case. Use heap allocation.
+    BuildHuffmanTable(root_table->curr_segment->curr_table, root_bits,
+                      code_lengths, code_lengths_size, sorted);
+  } else {  // rare case. Use heap allocation.
     uint16_t* const sorted =
         (uint16_t*)WebPSafeMalloc(code_lengths_size, sizeof(*sorted));
     if (sorted == NULL) return 0;
-    total_size = BuildHuffmanTable(root_table, root_bits,
-                                   code_lengths, code_lengths_size, sorted);
+    BuildHuffmanTable(root_table->curr_segment->curr_table, root_bits,
+                      code_lengths, code_lengths_size, sorted);
     WebPSafeFree(sorted);
   }
   return total_size;
 }
+
+int VP8LHuffmanTablesAllocate(int size, HuffmanTables* huffman_tables) {
+  // Have 'segment' point to the first segment for now, 'root'.
+  HuffmanTablesSegment* const root = &huffman_tables->root;
+  huffman_tables->curr_segment = root;
+  root->next = NULL;
+  // Allocate root.
+  root->start = (HuffmanCode*)WebPSafeMalloc(size, sizeof(*root->start));
+  if (root->start == NULL) return 0;
+  root->curr_table = root->start;
+  root->size = size;
+  return 1;
+}
+
+void VP8LHuffmanTablesDeallocate(HuffmanTables* const huffman_tables) {
+  HuffmanTablesSegment *current, *next;
+  if (huffman_tables == NULL) return;
+  // Free the root node.
+  current = &huffman_tables->root;
+  next = current->next;
+  WebPSafeFree(current->start);
+  current->start = NULL;
+  current->next = NULL;
+  current = next;
+  // Free the following nodes.
+  while (current != NULL) {
+    next = current->next;
+    WebPSafeFree(current->start);
+    WebPSafeFree(current);
+    current = next;
+  }
+}
diff --git a/3rdparty/libwebp/src/utils/huffman_utils.h b/3rdparty/libwebp/src/utils/huffman_utils.h
index 13b7ad1ac40c..d511dc052c43 100644
--- a/3rdparty/libwebp/src/utils/huffman_utils.h
+++ b/3rdparty/libwebp/src/utils/huffman_utils.h
@@ -43,6 +43,30 @@ typedef struct {
                     // or non-literal symbol otherwise
 } HuffmanCode32;
 
+// Contiguous memory segment of HuffmanCodes.
+typedef struct HuffmanTablesSegment {
+  HuffmanCode* start;
+  // Pointer to where we are writing into the segment. Starts at 'start' and
+  // cannot go beyond 'start' + 'size'.
+  HuffmanCode* curr_table;
+  // Pointer to the next segment in the chain.
+  struct HuffmanTablesSegment* next;
+  int size;
+} HuffmanTablesSegment;
+
+// Chained memory segments of HuffmanCodes.
+typedef struct HuffmanTables {
+  HuffmanTablesSegment root;
+  // Currently processed segment. At first, this is 'root'.
+  HuffmanTablesSegment* curr_segment;
+} HuffmanTables;
+
+// Allocates a HuffmanTables with 'size' contiguous HuffmanCodes. Returns 0 on
+// memory allocation error, 1 otherwise.
+WEBP_NODISCARD int VP8LHuffmanTablesAllocate(int size,
+                                             HuffmanTables* huffman_tables);
+void VP8LHuffmanTablesDeallocate(HuffmanTables* const huffman_tables);
+
 #define HUFFMAN_PACKED_BITS 6
 #define HUFFMAN_PACKED_TABLE_SIZE (1u << HUFFMAN_PACKED_BITS)
 
@@ -68,7 +92,7 @@ struct HTreeGroup {
 };
 
 // Creates the instance of HTreeGroup with specified number of tree-groups.
-HTreeGroup* VP8LHtreeGroupsNew(int num_htree_groups);
+WEBP_NODISCARD HTreeGroup* VP8LHtreeGroupsNew(int num_htree_groups);
 
 // Releases the memory allocated for HTreeGroup.
 void VP8LHtreeGroupsFree(HTreeGroup* const htree_groups);
@@ -78,10 +102,10 @@ void VP8LHtreeGroupsFree(HTreeGroup* const htree_groups);
 // the huffman table.
 // Returns built table size or 0 in case of error (invalid tree or
 // memory error).
-// If root_table is NULL, it returns 0 if a lookup cannot be built, something
-// > 0 otherwise (but not the table size).
-int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
-                          const int code_lengths[], int code_lengths_size);
+WEBP_NODISCARD int VP8LBuildHuffmanTable(HuffmanTables* const root_table,
+                                         int root_bits,
+                                         const int code_lengths[],
+                                         int code_lengths_size);
 
 #ifdef __cplusplus
 }    // extern "C"
diff --git a/3rdparty/libwebp/src/utils/palette.c b/3rdparty/libwebp/src/utils/palette.c
new file mode 100644
index 000000000000..515da2101950
--- /dev/null
+++ b/3rdparty/libwebp/src/utils/palette.c
@@ -0,0 +1,402 @@
+// Copyright 2023 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for palette analysis.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+
+#include "src/utils/palette.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "src/dsp/lossless_common.h"
+#include "src/utils/color_cache_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/encode.h"
+#include "src/webp/format_constants.h"
+
+// -----------------------------------------------------------------------------
+
+// Palette reordering for smaller sum of deltas (and for smaller storage).
+
+static int PaletteCompareColorsForQsort(const void* p1, const void* p2) {
+  const uint32_t a = WebPMemToUint32((uint8_t*)p1);
+  const uint32_t b = WebPMemToUint32((uint8_t*)p2);
+  assert(a != b);
+  return (a < b) ? -1 : 1;
+}
+
+static WEBP_INLINE uint32_t PaletteComponentDistance(uint32_t v) {
+  return (v <= 128) ? v : (256 - v);
+}
+
+// Computes a value that is related to the entropy created by the
+// palette entry diff.
+//
+// Note that the last & 0xff is a no-operation in the next statement, but
+// removed by most compilers and is here only for regularity of the code.
+static WEBP_INLINE uint32_t PaletteColorDistance(uint32_t col1, uint32_t col2) {
+  const uint32_t diff = VP8LSubPixels(col1, col2);
+  const int kMoreWeightForRGBThanForAlpha = 9;
+  uint32_t score;
+  score = PaletteComponentDistance((diff >> 0) & 0xff);
+  score += PaletteComponentDistance((diff >> 8) & 0xff);
+  score += PaletteComponentDistance((diff >> 16) & 0xff);
+  score *= kMoreWeightForRGBThanForAlpha;
+  score += PaletteComponentDistance((diff >> 24) & 0xff);
+  return score;
+}
+
+static WEBP_INLINE void SwapColor(uint32_t* const col1, uint32_t* const col2) {
+  const uint32_t tmp = *col1;
+  *col1 = *col2;
+  *col2 = tmp;
+}
+
+int SearchColorNoIdx(const uint32_t sorted[], uint32_t color, int num_colors) {
+  int low = 0, hi = num_colors;
+  if (sorted[low] == color) return low;  // loop invariant: sorted[low] != color
+  while (1) {
+    const int mid = (low + hi) >> 1;
+    if (sorted[mid] == color) {
+      return mid;
+    } else if (sorted[mid] < color) {
+      low = mid;
+    } else {
+      hi = mid;
+    }
+  }
+  assert(0);
+  return 0;
+}
+
+void PrepareMapToPalette(const uint32_t palette[], uint32_t num_colors,
+                         uint32_t sorted[], uint32_t idx_map[]) {
+  uint32_t i;
+  memcpy(sorted, palette, num_colors * sizeof(*sorted));
+  qsort(sorted, num_colors, sizeof(*sorted), PaletteCompareColorsForQsort);
+  for (i = 0; i < num_colors; ++i) {
+    idx_map[SearchColorNoIdx(sorted, palette[i], num_colors)] = i;
+  }
+}
+
+//------------------------------------------------------------------------------
+
+#define COLOR_HASH_SIZE (MAX_PALETTE_SIZE * 4)
+#define COLOR_HASH_RIGHT_SHIFT 22  // 32 - log2(COLOR_HASH_SIZE).
+
+int GetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
+  int i;
+  int x, y;
+  int num_colors = 0;
+  uint8_t in_use[COLOR_HASH_SIZE] = {0};
+  uint32_t colors[COLOR_HASH_SIZE] = {0};
+  const uint32_t* argb = pic->argb;
+  const int width = pic->width;
+  const int height = pic->height;
+  uint32_t last_pix = ~argb[0];  // so we're sure that last_pix != argb[0]
+  assert(pic != NULL);
+  assert(pic->use_argb);
+
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      int key;
+      if (argb[x] == last_pix) {
+        continue;
+      }
+      last_pix = argb[x];
+      key = VP8LHashPix(last_pix, COLOR_HASH_RIGHT_SHIFT);
+      while (1) {
+        if (!in_use[key]) {
+          colors[key] = last_pix;
+          in_use[key] = 1;
+          ++num_colors;
+          if (num_colors > MAX_PALETTE_SIZE) {
+            return MAX_PALETTE_SIZE + 1;  // Exact count not needed.
+          }
+          break;
+        } else if (colors[key] == last_pix) {
+          break;  // The color is already there.
+        } else {
+          // Some other color sits here, so do linear conflict resolution.
+          ++key;
+          key &= (COLOR_HASH_SIZE - 1);  // Key mask.
+        }
+      }
+    }
+    argb += pic->argb_stride;
+  }
+
+  if (palette != NULL) {  // Fill the colors into palette.
+    num_colors = 0;
+    for (i = 0; i < COLOR_HASH_SIZE; ++i) {
+      if (in_use[i]) {
+        palette[num_colors] = colors[i];
+        ++num_colors;
+      }
+    }
+    qsort(palette, num_colors, sizeof(*palette), PaletteCompareColorsForQsort);
+  }
+  return num_colors;
+}
+
+#undef COLOR_HASH_SIZE
+#undef COLOR_HASH_RIGHT_SHIFT
+
+// -----------------------------------------------------------------------------
+
+// The palette has been sorted by alpha. This function checks if the other
+// components of the palette have a monotonic development with regards to
+// position in the palette. If all have monotonic development, there is
+// no benefit to re-organize them greedily. A monotonic development
+// would be spotted in green-only situations (like lossy alpha) or gray-scale
+// images.
+static int PaletteHasNonMonotonousDeltas(const uint32_t* const palette,
+                                         int num_colors) {
+  uint32_t predict = 0x000000;
+  int i;
+  uint8_t sign_found = 0x00;
+  for (i = 0; i < num_colors; ++i) {
+    const uint32_t diff = VP8LSubPixels(palette[i], predict);
+    const uint8_t rd = (diff >> 16) & 0xff;
+    const uint8_t gd = (diff >> 8) & 0xff;
+    const uint8_t bd = (diff >> 0) & 0xff;
+    if (rd != 0x00) {
+      sign_found |= (rd < 0x80) ? 1 : 2;
+    }
+    if (gd != 0x00) {
+      sign_found |= (gd < 0x80) ? 8 : 16;
+    }
+    if (bd != 0x00) {
+      sign_found |= (bd < 0x80) ? 64 : 128;
+    }
+    predict = palette[i];
+  }
+  return (sign_found & (sign_found << 1)) != 0;  // two consequent signs.
+}
+
+static void PaletteSortMinimizeDeltas(const uint32_t* const palette_sorted,
+                                      int num_colors, uint32_t* const palette) {
+  uint32_t predict = 0x00000000;
+  int i, k;
+  memcpy(palette, palette_sorted, num_colors * sizeof(*palette));
+  if (!PaletteHasNonMonotonousDeltas(palette_sorted, num_colors)) return;
+  // Find greedily always the closest color of the predicted color to minimize
+  // deltas in the palette. This reduces storage needs since the
+  // palette is stored with delta encoding.
+  for (i = 0; i < num_colors; ++i) {
+    int best_ix = i;
+    uint32_t best_score = ~0U;
+    for (k = i; k < num_colors; ++k) {
+      const uint32_t cur_score = PaletteColorDistance(palette[k], predict);
+      if (best_score > cur_score) {
+        best_score = cur_score;
+        best_ix = k;
+      }
+    }
+    SwapColor(&palette[best_ix], &palette[i]);
+    predict = palette[i];
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Modified Zeng method from "A Survey on Palette Reordering
+// Methods for Improving the Compression of Color-Indexed Images" by Armando J.
+// Pinho and Antonio J. R. Neves.
+
+// Finds the biggest cooccurrence in the matrix.
+static void CoOccurrenceFindMax(const uint32_t* const cooccurrence,
+                                uint32_t num_colors, uint8_t* const c1,
+                                uint8_t* const c2) {
+  // Find the index that is most frequently located adjacent to other
+  // (different) indexes.
+  uint32_t best_sum = 0u;
+  uint32_t i, j, best_cooccurrence;
+  *c1 = 0u;
+  for (i = 0; i < num_colors; ++i) {
+    uint32_t sum = 0;
+    for (j = 0; j < num_colors; ++j) sum += cooccurrence[i * num_colors + j];
+    if (sum > best_sum) {
+      best_sum = sum;
+      *c1 = i;
+    }
+  }
+  // Find the index that is most frequently found adjacent to *c1.
+  *c2 = 0u;
+  best_cooccurrence = 0u;
+  for (i = 0; i < num_colors; ++i) {
+    if (cooccurrence[*c1 * num_colors + i] > best_cooccurrence) {
+      best_cooccurrence = cooccurrence[*c1 * num_colors + i];
+      *c2 = i;
+    }
+  }
+  assert(*c1 != *c2);
+}
+
+// Builds the cooccurrence matrix
+static int CoOccurrenceBuild(const WebPPicture* const pic,
+                             const uint32_t* const palette, uint32_t num_colors,
+                             uint32_t* cooccurrence) {
+  uint32_t *lines, *line_top, *line_current, *line_tmp;
+  int x, y;
+  const uint32_t* src = pic->argb;
+  uint32_t prev_pix = ~src[0];
+  uint32_t prev_idx = 0u;
+  uint32_t idx_map[MAX_PALETTE_SIZE] = {0};
+  uint32_t palette_sorted[MAX_PALETTE_SIZE];
+  lines = (uint32_t*)WebPSafeMalloc(2 * pic->width, sizeof(*lines));
+  if (lines == NULL) {
+    return 0;
+  }
+  line_top = &lines[0];
+  line_current = &lines[pic->width];
+  PrepareMapToPalette(palette, num_colors, palette_sorted, idx_map);
+  for (y = 0; y < pic->height; ++y) {
+    for (x = 0; x < pic->width; ++x) {
+      const uint32_t pix = src[x];
+      if (pix != prev_pix) {
+        prev_idx = idx_map[SearchColorNoIdx(palette_sorted, pix, num_colors)];
+        prev_pix = pix;
+      }
+      line_current[x] = prev_idx;
+      // 4-connectivity is what works best as mentioned in "On the relation
+      // between Memon's and the modified Zeng's palette reordering methods".
+      if (x > 0 && prev_idx != line_current[x - 1]) {
+        const uint32_t left_idx = line_current[x - 1];
+        ++cooccurrence[prev_idx * num_colors + left_idx];
+        ++cooccurrence[left_idx * num_colors + prev_idx];
+      }
+      if (y > 0 && prev_idx != line_top[x]) {
+        const uint32_t top_idx = line_top[x];
+        ++cooccurrence[prev_idx * num_colors + top_idx];
+        ++cooccurrence[top_idx * num_colors + prev_idx];
+      }
+    }
+    line_tmp = line_top;
+    line_top = line_current;
+    line_current = line_tmp;
+    src += pic->argb_stride;
+  }
+  WebPSafeFree(lines);
+  return 1;
+}
+
+struct Sum {
+  uint8_t index;
+  uint32_t sum;
+};
+
+static int PaletteSortModifiedZeng(const WebPPicture* const pic,
+                                   const uint32_t* const palette_in,
+                                   uint32_t num_colors,
+                                   uint32_t* const palette) {
+  uint32_t i, j, ind;
+  uint8_t remapping[MAX_PALETTE_SIZE];
+  uint32_t* cooccurrence;
+  struct Sum sums[MAX_PALETTE_SIZE];
+  uint32_t first, last;
+  uint32_t num_sums;
+  // TODO(vrabaud) check whether one color images should use palette or not.
+  if (num_colors <= 1) return 1;
+  // Build the co-occurrence matrix.
+  cooccurrence =
+      (uint32_t*)WebPSafeCalloc(num_colors * num_colors, sizeof(*cooccurrence));
+  if (cooccurrence == NULL) {
+    return 0;
+  }
+  if (!CoOccurrenceBuild(pic, palette_in, num_colors, cooccurrence)) {
+    WebPSafeFree(cooccurrence);
+    return 0;
+  }
+
+  // Initialize the mapping list with the two best indices.
+  CoOccurrenceFindMax(cooccurrence, num_colors, &remapping[0], &remapping[1]);
+
+  // We need to append and prepend to the list of remapping. To this end, we
+  // actually define the next start/end of the list as indices in a vector (with
+  // a wrap around when the end is reached).
+  first = 0;
+  last = 1;
+  num_sums = num_colors - 2;  // -2 because we know the first two values
+  if (num_sums > 0) {
+    // Initialize the sums with the first two remappings and find the best one
+    struct Sum* best_sum = &sums[0];
+    best_sum->index = 0u;
+    best_sum->sum = 0u;
+    for (i = 0, j = 0; i < num_colors; ++i) {
+      if (i == remapping[0] || i == remapping[1]) continue;
+      sums[j].index = i;
+      sums[j].sum = cooccurrence[i * num_colors + remapping[0]] +
+                    cooccurrence[i * num_colors + remapping[1]];
+      if (sums[j].sum > best_sum->sum) best_sum = &sums[j];
+      ++j;
+    }
+
+    while (num_sums > 0) {
+      const uint8_t best_index = best_sum->index;
+      // Compute delta to know if we need to prepend or append the best index.
+      int32_t delta = 0;
+      const int32_t n = num_colors - num_sums;
+      for (ind = first, j = 0; (ind + j) % num_colors != last + 1; ++j) {
+        const uint16_t l_j = remapping[(ind + j) % num_colors];
+        delta += (n - 1 - 2 * (int32_t)j) *
+                 (int32_t)cooccurrence[best_index * num_colors + l_j];
+      }
+      if (delta > 0) {
+        first = (first == 0) ? num_colors - 1 : first - 1;
+        remapping[first] = best_index;
+      } else {
+        ++last;
+        remapping[last] = best_index;
+      }
+      // Remove best_sum from sums.
+      *best_sum = sums[num_sums - 1];
+      --num_sums;
+      // Update all the sums and find the best one.
+      best_sum = &sums[0];
+      for (i = 0; i < num_sums; ++i) {
+        sums[i].sum += cooccurrence[best_index * num_colors + sums[i].index];
+        if (sums[i].sum > best_sum->sum) best_sum = &sums[i];
+      }
+    }
+  }
+  assert((last + 1) % num_colors == first);
+  WebPSafeFree(cooccurrence);
+
+  // Re-map the palette.
+  for (i = 0; i < num_colors; ++i) {
+    palette[i] = palette_in[remapping[(first + i) % num_colors]];
+  }
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
+
+int PaletteSort(PaletteSorting method, const struct WebPPicture* const pic,
+                const uint32_t* const palette_sorted, uint32_t num_colors,
+                uint32_t* const palette) {
+  switch (method) {
+    case kSortedDefault:
+      // Nothing to do, we have already sorted the palette.
+      memcpy(palette, palette_sorted, num_colors * sizeof(*palette));
+      return 1;
+    case kMinimizeDelta:
+      PaletteSortMinimizeDeltas(palette_sorted, num_colors, palette);
+      return 1;
+    case kModifiedZeng:
+      return PaletteSortModifiedZeng(pic, palette_sorted, num_colors, palette);
+    case kUnusedPalette:
+    case kPaletteSortingNum:
+      break;
+  }
+
+  assert(0);
+  return 0;
+}
diff --git a/3rdparty/libwebp/src/utils/palette.h b/3rdparty/libwebp/src/utils/palette.h
new file mode 100644
index 000000000000..34479e463fe3
--- /dev/null
+++ b/3rdparty/libwebp/src/utils/palette.h
@@ -0,0 +1,60 @@
+// Copyright 2023 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for palette analysis.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+
+#ifndef WEBP_UTILS_PALETTE_H_
+#define WEBP_UTILS_PALETTE_H_
+
+#include "src/webp/types.h"
+
+struct WebPPicture;
+
+// The different ways a palette can be sorted.
+typedef enum PaletteSorting {
+  kSortedDefault = 0,
+  // Sorts by minimizing L1 deltas between consecutive colors, giving more
+  // weight to RGB colors.
+  kMinimizeDelta = 1,
+  // Implements the modified Zeng method from "A Survey on Palette Reordering
+  // Methods for Improving the Compression of Color-Indexed Images" by Armando
+  // J. Pinho and Antonio J. R. Neves.
+  kModifiedZeng = 2,
+  kUnusedPalette = 3,
+  kPaletteSortingNum = 4
+} PaletteSorting;
+
+// Returns the index of 'color' in the sorted palette 'sorted' of size
+// 'num_colors'.
+int SearchColorNoIdx(const uint32_t sorted[], uint32_t color, int num_colors);
+
+// Sort palette in increasing order and prepare an inverse mapping array.
+void PrepareMapToPalette(const uint32_t palette[], uint32_t num_colors,
+                         uint32_t sorted[], uint32_t idx_map[]);
+
+// Returns count of unique colors in 'pic', assuming pic->use_argb is true.
+// If the unique color count is more than MAX_PALETTE_SIZE, returns
+// MAX_PALETTE_SIZE+1.
+// If 'palette' is not NULL and the number of unique colors is less than or
+// equal to MAX_PALETTE_SIZE, also outputs the actual unique colors into
+// 'palette' in a sorted order. Note: 'palette' is assumed to be an array
+// already allocated with at least MAX_PALETTE_SIZE elements.
+int GetColorPalette(const struct WebPPicture* const pic,
+                    uint32_t* const palette);
+
+// Sorts the palette according to the criterion defined by 'method'.
+// 'palette_sorted' is the input palette sorted lexicographically, as done in
+// PrepareMapToPalette. Returns 0 on memory allocation error.
+int PaletteSort(PaletteSorting method, const struct WebPPicture* const pic,
+                const uint32_t* const palette_sorted, uint32_t num_colors,
+                uint32_t* const palette);
+
+#endif  // WEBP_UTILS_PALETTE_H_
diff --git a/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c b/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c
index f65b6cdbb696..97e78937043e 100644
--- a/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c
+++ b/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c
@@ -30,7 +30,7 @@
 
 #define DFIX 4           // extra precision for ordered dithering
 #define DSIZE 4          // dithering size (must be a power of two)
-// cf. http://en.wikipedia.org/wiki/Ordered_dithering
+// cf. https://en.wikipedia.org/wiki/Ordered_dithering
 static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
   {  0,  8,  2, 10 },     // coefficients are in DFIX fixed-point precision
   { 12,  4, 14,  6 },
diff --git a/3rdparty/libwebp/src/utils/rescaler_utils.c b/3rdparty/libwebp/src/utils/rescaler_utils.c
index 4bcae24af54a..a0581a14b1a8 100644
--- a/3rdparty/libwebp/src/utils/rescaler_utils.c
+++ b/3rdparty/libwebp/src/utils/rescaler_utils.c
@@ -12,66 +12,74 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <assert.h>
+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 #include "src/dsp/dsp.h"
 #include "src/utils/rescaler_utils.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 
-void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
-                      uint8_t* const dst,
-                      int dst_width, int dst_height, int dst_stride,
-                      int num_channels, rescaler_t* const work) {
+int WebPRescalerInit(WebPRescaler* const rescaler,
+                     int src_width, int src_height,
+                     uint8_t* const dst,
+                     int dst_width, int dst_height, int dst_stride,
+                     int num_channels, rescaler_t* const work) {
   const int x_add = src_width, x_sub = dst_width;
   const int y_add = src_height, y_sub = dst_height;
-  wrk->x_expand = (src_width < dst_width);
-  wrk->y_expand = (src_height < dst_height);
-  wrk->src_width = src_width;
-  wrk->src_height = src_height;
-  wrk->dst_width = dst_width;
-  wrk->dst_height = dst_height;
-  wrk->src_y = 0;
-  wrk->dst_y = 0;
-  wrk->dst = dst;
-  wrk->dst_stride = dst_stride;
-  wrk->num_channels = num_channels;
+  const uint64_t total_size = 2ull * dst_width * num_channels * sizeof(*work);
+  if (!CheckSizeOverflow(total_size)) return 0;
+
+  rescaler->x_expand = (src_width < dst_width);
+  rescaler->y_expand = (src_height < dst_height);
+  rescaler->src_width = src_width;
+  rescaler->src_height = src_height;
+  rescaler->dst_width = dst_width;
+  rescaler->dst_height = dst_height;
+  rescaler->src_y = 0;
+  rescaler->dst_y = 0;
+  rescaler->dst = dst;
+  rescaler->dst_stride = dst_stride;
+  rescaler->num_channels = num_channels;
 
   // for 'x_expand', we use bilinear interpolation
-  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add;
-  wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
-  if (!wrk->x_expand) {  // fx_scale is not used otherwise
-    wrk->fx_scale = WEBP_RESCALER_FRAC(1, wrk->x_sub);
+  rescaler->x_add = rescaler->x_expand ? (x_sub - 1) : x_add;
+  rescaler->x_sub = rescaler->x_expand ? (x_add - 1) : x_sub;
+  if (!rescaler->x_expand) {  // fx_scale is not used otherwise
+    rescaler->fx_scale = WEBP_RESCALER_FRAC(1, rescaler->x_sub);
   }
   // vertical scaling parameters
-  wrk->y_add = wrk->y_expand ? y_add - 1 : y_add;
-  wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
-  wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
-  if (!wrk->y_expand) {
+  rescaler->y_add = rescaler->y_expand ? y_add - 1 : y_add;
+  rescaler->y_sub = rescaler->y_expand ? y_sub - 1 : y_sub;
+  rescaler->y_accum = rescaler->y_expand ? rescaler->y_sub : rescaler->y_add;
+  if (!rescaler->y_expand) {
     // This is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
-    // Its value is <= WEBP_RESCALER_ONE, because dst_height <= wrk->y_add, and
-    // wrk->x_add >= 1;
-    const uint64_t ratio =
-        (uint64_t)dst_height * WEBP_RESCALER_ONE / (wrk->x_add * wrk->y_add);
+    // Its value is <= WEBP_RESCALER_ONE, because dst_height <= rescaler->y_add
+    // and rescaler->x_add >= 1;
+    const uint64_t num = (uint64_t)dst_height * WEBP_RESCALER_ONE;
+    const uint64_t den = (uint64_t)rescaler->x_add * rescaler->y_add;
+    const uint64_t ratio = num / den;
     if (ratio != (uint32_t)ratio) {
       // When ratio == WEBP_RESCALER_ONE, we can't represent the ratio with the
       // current fixed-point precision. This happens when src_height ==
-      // wrk->y_add (which == src_height), and wrk->x_add == 1.
+      // rescaler->y_add (which == src_height), and rescaler->x_add == 1.
       // => We special-case fxy_scale = 0, in WebPRescalerExportRow().
-      wrk->fxy_scale = 0;
+      rescaler->fxy_scale = 0;
     } else {
-      wrk->fxy_scale = (uint32_t)ratio;
+      rescaler->fxy_scale = (uint32_t)ratio;
     }
-    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->y_sub);
+    rescaler->fy_scale = WEBP_RESCALER_FRAC(1, rescaler->y_sub);
   } else {
-    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->x_add);
-    // wrk->fxy_scale is unused here.
+    rescaler->fy_scale = WEBP_RESCALER_FRAC(1, rescaler->x_add);
+    // rescaler->fxy_scale is unused here.
   }
-  wrk->irow = work;
-  wrk->frow = work + num_channels * dst_width;
-  memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
+  rescaler->irow = work;
+  rescaler->frow = work + num_channels * dst_width;
+  memset(work, 0, (size_t)total_size);
 
   WebPRescalerDspInit();
+  return 1;
 }
 
 int WebPRescalerGetScaledDimensions(int src_width, int src_height,
@@ -82,6 +90,7 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
   {
     int width = *scaled_width;
     int height = *scaled_height;
+    const int max_size = INT_MAX / 2;
 
     // if width is unspecified, scale original proportionally to height ratio.
     if (width == 0 && src_height > 0) {
@@ -94,7 +103,7 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
           (int)(((uint64_t)src_height * width + src_width - 1) / src_width);
     }
     // Check if the overall dimensions still make sense.
-    if (width <= 0 || height <= 0) {
+    if (width <= 0 || height <= 0 || width > max_size || height > max_size) {
       return 0;
     }
 
@@ -107,31 +116,34 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
 //------------------------------------------------------------------------------
 // all-in-one calls
 
-int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
-  const int num_lines = (wrk->y_accum + wrk->y_sub - 1) / wrk->y_sub;
+int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
+                           int max_num_lines) {
+  const int num_lines =
+      (rescaler->y_accum + rescaler->y_sub - 1) / rescaler->y_sub;
   return (num_lines > max_num_lines) ? max_num_lines : num_lines;
 }
 
-int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
+int WebPRescalerImport(WebPRescaler* const rescaler, int num_lines,
                        const uint8_t* src, int src_stride) {
   int total_imported = 0;
-  while (total_imported < num_lines && !WebPRescalerHasPendingOutput(wrk)) {
-    if (wrk->y_expand) {
-      rescaler_t* const tmp = wrk->irow;
-      wrk->irow = wrk->frow;
-      wrk->frow = tmp;
+  while (total_imported < num_lines &&
+         !WebPRescalerHasPendingOutput(rescaler)) {
+    if (rescaler->y_expand) {
+      rescaler_t* const tmp = rescaler->irow;
+      rescaler->irow = rescaler->frow;
+      rescaler->frow = tmp;
     }
-    WebPRescalerImportRow(wrk, src);
-    if (!wrk->y_expand) {     // Accumulate the contribution of the new row.
+    WebPRescalerImportRow(rescaler, src);
+    if (!rescaler->y_expand) {    // Accumulate the contribution of the new row.
       int x;
-      for (x = 0; x < wrk->num_channels * wrk->dst_width; ++x) {
-        wrk->irow[x] += wrk->frow[x];
+      for (x = 0; x < rescaler->num_channels * rescaler->dst_width; ++x) {
+        rescaler->irow[x] += rescaler->frow[x];
       }
     }
-    ++wrk->src_y;
+    ++rescaler->src_y;
     src += src_stride;
     ++total_imported;
-    wrk->y_accum -= wrk->y_sub;
+    rescaler->y_accum -= rescaler->y_sub;
   }
   return total_imported;
 }
diff --git a/3rdparty/libwebp/src/utils/rescaler_utils.h b/3rdparty/libwebp/src/utils/rescaler_utils.h
index ca41e42c4a53..ef201ef86c19 100644
--- a/3rdparty/libwebp/src/utils/rescaler_utils.h
+++ b/3rdparty/libwebp/src/utils/rescaler_utils.h
@@ -47,12 +47,13 @@ struct WebPRescaler {
 };
 
 // Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
-void WebPRescalerInit(WebPRescaler* const rescaler,
-                      int src_width, int src_height,
-                      uint8_t* const dst,
-                      int dst_width, int dst_height, int dst_stride,
-                      int num_channels,
-                      rescaler_t* const work);
+// Returns false in case of error.
+int WebPRescalerInit(WebPRescaler* const rescaler,
+                     int src_width, int src_height,
+                     uint8_t* const dst,
+                     int dst_width, int dst_height, int dst_stride,
+                     int num_channels,
+                     rescaler_t* const work);
 
 // If either 'scaled_width' or 'scaled_height' (but not both) is 0 the value
 // will be calculated preserving the aspect ratio, otherwise the values are
diff --git a/3rdparty/libwebp/src/utils/utils.c b/3rdparty/libwebp/src/utils/utils.c
index 6080e19e2176..408ce88f67f6 100644
--- a/3rdparty/libwebp/src/utils/utils.c
+++ b/3rdparty/libwebp/src/utils/utils.c
@@ -11,19 +11,19 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
+#include "src/utils/utils.h"
+
 #include <stdlib.h>
 #include <string.h>  // for memcpy()
-#include "src/webp/decode.h"
+
+#include "src/utils/palette.h"
 #include "src/webp/encode.h"
-#include "src/webp/format_constants.h"  // for MAX_PALETTE_SIZE
-#include "src/utils/color_cache_utils.h"
-#include "src/utils/utils.h"
 
 // If PRINT_MEM_INFO is defined, extra info (like total memory used, number of
 // alloc/free etc) is printed. For debugging/tuning purpose only (it's slow,
 // and not multi-thread safe!).
 // An interesting alternative is valgrind's 'massif' tool:
-//    http://valgrind.org/docs/manual/ms-manual.html
+//    https://valgrind.org/docs/manual/ms-manual.html
 // Here is an example command line:
 /*    valgrind --tool=massif --massif-out-file=massif.out \
                --stacks=yes --alloc-fn=WebPSafeMalloc --alloc-fn=WebPSafeCalloc
@@ -101,6 +101,9 @@ static void Increment(int* const v) {
 #if defined(MALLOC_LIMIT)
     {
       const char* const malloc_limit_str = getenv("MALLOC_LIMIT");
+#if MALLOC_LIMIT > 1
+      mem_limit = (size_t)MALLOC_LIMIT;
+#endif
       if (malloc_limit_str != NULL) {
         mem_limit = atoi(malloc_limit_str);
       }
@@ -169,16 +172,16 @@ static int CheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
   const uint64_t total_size = nmemb * size;
   if (nmemb == 0) return 1;
   if ((uint64_t)size > WEBP_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
-  if (total_size != (size_t)total_size) return 0;
+  if (!CheckSizeOverflow(total_size)) return 0;
 #if defined(PRINT_MEM_INFO) && defined(MALLOC_FAIL_AT)
   if (countdown_to_fail > 0 && --countdown_to_fail == 0) {
     return 0;    // fake fail!
   }
 #endif
-#if defined(MALLOC_LIMIT)
+#if defined(PRINT_MEM_INFO) && defined(MALLOC_LIMIT)
   if (mem_limit > 0) {
     const uint64_t new_total_mem = (uint64_t)total_mem + total_size;
-    if (new_total_mem != (size_t)new_total_mem ||
+    if (!CheckSizeOverflow(new_total_mem) ||
         new_total_mem > mem_limit) {
       return 0;   // fake fail!
     }
@@ -249,66 +252,10 @@ void WebPCopyPixels(const WebPPicture* const src, WebPPicture* const dst) {
 
 //------------------------------------------------------------------------------
 
-#define COLOR_HASH_SIZE         (MAX_PALETTE_SIZE * 4)
-#define COLOR_HASH_RIGHT_SHIFT  22  // 32 - log2(COLOR_HASH_SIZE).
-
 int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
-  int i;
-  int x, y;
-  int num_colors = 0;
-  uint8_t in_use[COLOR_HASH_SIZE] = { 0 };
-  uint32_t colors[COLOR_HASH_SIZE];
-  const uint32_t* argb = pic->argb;
-  const int width = pic->width;
-  const int height = pic->height;
-  uint32_t last_pix = ~argb[0];   // so we're sure that last_pix != argb[0]
-  assert(pic != NULL);
-  assert(pic->use_argb);
-
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      int key;
-      if (argb[x] == last_pix) {
-        continue;
-      }
-      last_pix = argb[x];
-      key = VP8LHashPix(last_pix, COLOR_HASH_RIGHT_SHIFT);
-      while (1) {
-        if (!in_use[key]) {
-          colors[key] = last_pix;
-          in_use[key] = 1;
-          ++num_colors;
-          if (num_colors > MAX_PALETTE_SIZE) {
-            return MAX_PALETTE_SIZE + 1;  // Exact count not needed.
-          }
-          break;
-        } else if (colors[key] == last_pix) {
-          break;  // The color is already there.
-        } else {
-          // Some other color sits here, so do linear conflict resolution.
-          ++key;
-          key &= (COLOR_HASH_SIZE - 1);  // Key mask.
-        }
-      }
-    }
-    argb += pic->argb_stride;
-  }
-
-  if (palette != NULL) {  // Fill the colors into palette.
-    num_colors = 0;
-    for (i = 0; i < COLOR_HASH_SIZE; ++i) {
-      if (in_use[i]) {
-        palette[num_colors] = colors[i];
-        ++num_colors;
-      }
-    }
-  }
-  return num_colors;
+  return GetColorPalette(pic, palette);
 }
 
-#undef COLOR_HASH_SIZE
-#undef COLOR_HASH_RIGHT_SHIFT
-
 //------------------------------------------------------------------------------
 
 #if defined(WEBP_NEED_LOG_TABLE_8BIT)
diff --git a/3rdparty/libwebp/src/utils/utils.h b/3rdparty/libwebp/src/utils/utils.h
index 2a3ec926784e..b2241fbf9bf7 100644
--- a/3rdparty/libwebp/src/utils/utils.h
+++ b/3rdparty/libwebp/src/utils/utils.h
@@ -20,9 +20,7 @@
 #endif
 
 #include <assert.h>
-#include <limits.h>
 
-#include "src/dsp/dsp.h"
 #include "src/webp/types.h"
 
 #ifdef __cplusplus
@@ -42,6 +40,10 @@ extern "C" {
 #endif
 #endif  // WEBP_MAX_ALLOCABLE_MEMORY
 
+static WEBP_INLINE int CheckSizeOverflow(uint64_t size) {
+  return size == (size_t)size;
+}
+
 // size-checking safe malloc/calloc: verify that the requested size is not too
 // large, or return NULL. You don't need to call these for constructs like
 // malloc(sizeof(foo)), but only if there's picture-dependent size involved
@@ -60,7 +62,8 @@ WEBP_EXTERN void WebPSafeFree(void* const ptr);
 // Alignment
 
 #define WEBP_ALIGN_CST 31
-#define WEBP_ALIGN(PTR) (((uintptr_t)(PTR) + WEBP_ALIGN_CST) & ~WEBP_ALIGN_CST)
+#define WEBP_ALIGN(PTR) (((uintptr_t)(PTR) + WEBP_ALIGN_CST) & \
+                         ~(uintptr_t)WEBP_ALIGN_CST)
 
 #include <string.h>
 // memcpy() is the safe way of moving potentially unaligned 32b memory.
@@ -69,10 +72,19 @@ static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
   memcpy(&A, ptr, sizeof(A));
   return A;
 }
+
+static WEBP_INLINE int32_t WebPMemToInt32(const uint8_t* const ptr) {
+  return (int32_t)WebPMemToUint32(ptr);
+}
+
 static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
   memcpy(ptr, &val, sizeof(val));
 }
 
+static WEBP_INLINE void WebPInt32ToMem(uint8_t* const ptr, int val) {
+  WebPUint32ToMem(ptr, (uint32_t)val);
+}
+
 //------------------------------------------------------------------------------
 // Reading/writing data.
 
@@ -107,24 +119,33 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
   PutLE16(data + 2, (int)(val >> 16));
 }
 
-// Returns (int)floor(log2(n)). n must be > 0.
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+// Returns (int)floor(log2(n)). n must be > 0.
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
   return 31 ^ __builtin_clz(n);
 }
+// counts the number of trailing zero
+static WEBP_INLINE int BitsCtz(uint32_t n) { return __builtin_ctz(n); }
 #elif defined(_MSC_VER) && _MSC_VER > 1310 && \
       (defined(_M_X64) || defined(_M_IX86))
 #include <intrin.h>
 #pragma intrinsic(_BitScanReverse)
+#pragma intrinsic(_BitScanForward)
 
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  unsigned long first_set_bit;
+  unsigned long first_set_bit;  // NOLINT (runtime/int)
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }
-#else   // default: use the C-version.
+static WEBP_INLINE int BitsCtz(uint32_t n) {
+  unsigned long first_set_bit;  // NOLINT (runtime/int)
+  _BitScanForward(&first_set_bit, n);
+  return first_set_bit;
+}
+#else   // default: use the (slow) C-version.
+#define WEBP_HAVE_SLOW_CLZ_CTZ   // signal that the Clz/Ctz function are slow
 // Returns 31 ^ clz(n) = log2(n). This is the default C-implementation, either
 // based on table or not. Can be used as fallback if clz() is not available.
 #define WEBP_NEED_LOG_TABLE_8BIT
@@ -139,6 +160,15 @@ static WEBP_INLINE int WebPLog2FloorC(uint32_t n) {
 }
 
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); }
+
+static WEBP_INLINE int BitsCtz(uint32_t n) {
+  int i;
+  for (i = 0; i < 32; ++i, n >>= 1) {
+    if (n & 1) return i;
+  }
+  return 32;
+}
+
 #endif
 
 //------------------------------------------------------------------------------
@@ -166,6 +196,7 @@ WEBP_EXTERN void WebPCopyPixels(const struct WebPPicture* const src,
 // MAX_PALETTE_SIZE, also outputs the actual unique colors into 'palette'.
 // Note: 'palette' is assumed to be an array already allocated with at least
 // MAX_PALETTE_SIZE elements.
+// TODO(vrabaud) remove whenever we can break the ABI.
 WEBP_EXTERN int WebPGetColorPalette(const struct WebPPicture* const pic,
                                     uint32_t* const palette);
 
diff --git a/3rdparty/libwebp/src/webp/decode.h b/3rdparty/libwebp/src/webp/decode.h
index 44fcd64a84d4..d6895f5c5550 100644
--- a/3rdparty/libwebp/src/webp/decode.h
+++ b/3rdparty/libwebp/src/webp/decode.h
@@ -48,48 +48,47 @@ WEBP_EXTERN int WebPGetDecoderVersion(void);
 // RIFF + VP8X + (optional chunks) + VP8(L)
 // ALPH + VP8 <-- Not a valid WebP format: only allowed for internal purpose.
 // VP8(L)     <-- Not a valid WebP format: only allowed for internal purpose.
-WEBP_EXTERN int WebPGetInfo(const uint8_t* data, size_t data_size,
-                            int* width, int* height);
+WEBP_NODISCARD WEBP_EXTERN int WebPGetInfo(
+    const uint8_t* data, size_t data_size, int* width, int* height);
 
 // Decodes WebP images pointed to by 'data' and returns RGBA samples, along
 // with the dimensions in *width and *height. The ordering of samples in
 // memory is R, G, B, A, R, G, B, A... in scan order (endian-independent).
 // The returned pointer should be deleted calling WebPFree().
 // Returns NULL in case of error.
-WEBP_EXTERN uint8_t* WebPDecodeRGBA(const uint8_t* data, size_t data_size,
-                                    int* width, int* height);
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPDecodeRGBA(
+    const uint8_t* data, size_t data_size, int* width, int* height);
 
 // Same as WebPDecodeRGBA, but returning A, R, G, B, A, R, G, B... ordered data.
-WEBP_EXTERN uint8_t* WebPDecodeARGB(const uint8_t* data, size_t data_size,
-                                    int* width, int* height);
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPDecodeARGB(
+    const uint8_t* data, size_t data_size, int* width, int* height);
 
 // Same as WebPDecodeRGBA, but returning B, G, R, A, B, G, R, A... ordered data.
-WEBP_EXTERN uint8_t* WebPDecodeBGRA(const uint8_t* data, size_t data_size,
-                                    int* width, int* height);
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPDecodeBGRA(
+    const uint8_t* data, size_t data_size, int* width, int* height);
 
 // Same as WebPDecodeRGBA, but returning R, G, B, R, G, B... ordered data.
 // If the bitstream contains transparency, it is ignored.
-WEBP_EXTERN uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size,
-                                   int* width, int* height);
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPDecodeRGB(
+    const uint8_t* data, size_t data_size, int* width, int* height);
 
 // Same as WebPDecodeRGB, but returning B, G, R, B, G, R... ordered data.
-WEBP_EXTERN uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
-                                   int* width, int* height);
-
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPDecodeBGR(
+    const uint8_t* data, size_t data_size, int* width, int* height);
 
 // Decode WebP images pointed to by 'data' to Y'UV format(*). The pointer
 // returned is the Y samples buffer. Upon return, *u and *v will point to
 // the U and V chroma data. These U and V buffers need NOT be passed to
 // WebPFree(), unlike the returned Y luma one. The dimension of the U and V
-// planes are both (*width + 1) / 2 and (*height + 1)/ 2.
+// planes are both (*width + 1) / 2 and (*height + 1) / 2.
 // Upon return, the Y buffer has a stride returned as '*stride', while U and V
 // have a common stride returned as '*uv_stride'.
-// Return NULL in case of error.
-// (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
-WEBP_EXTERN uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
-                                   int* width, int* height,
-                                   uint8_t** u, uint8_t** v,
-                                   int* stride, int* uv_stride);
+// 'width' and 'height' may be NULL, the other pointers must not be.
+// Returns NULL in case of error.
+// (*) Also named Y'CbCr. See: https://en.wikipedia.org/wiki/YCbCr
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPDecodeYUV(
+    const uint8_t* data, size_t data_size, int* width, int* height,
+    uint8_t** u, uint8_t** v, int* stride, int* uv_stride);
 
 // These five functions are variants of the above ones, that decode the image
 // directly into a pre-allocated buffer 'output_buffer'. The maximum storage
@@ -99,22 +98,22 @@ WEBP_EXTERN uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
 // The parameter 'output_stride' specifies the distance (in bytes)
 // between scanlines. Hence, output_buffer_size is expected to be at least
 // output_stride x picture-height.
-WEBP_EXTERN uint8_t* WebPDecodeRGBAInto(
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPDecodeRGBAInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
-WEBP_EXTERN uint8_t* WebPDecodeARGBInto(
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPDecodeARGBInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
-WEBP_EXTERN uint8_t* WebPDecodeBGRAInto(
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPDecodeBGRAInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 
 // RGB and BGR variants. Here too the transparency information, if present,
 // will be dropped and ignored.
-WEBP_EXTERN uint8_t* WebPDecodeRGBInto(
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPDecodeRGBInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
-WEBP_EXTERN uint8_t* WebPDecodeBGRInto(
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPDecodeBGRInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 
@@ -125,7 +124,7 @@ WEBP_EXTERN uint8_t* WebPDecodeBGRInto(
 // 'u_size' and 'v_size' respectively.
 // Pointer to the luma plane ('*luma') is returned or NULL if an error occurred
 // during decoding (or because some buffers were found to be too small).
-WEBP_EXTERN uint8_t* WebPDecodeYUVInto(
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPDecodeYUVInto(
     const uint8_t* data, size_t data_size,
     uint8_t* luma, size_t luma_size, int luma_stride,
     uint8_t* u, size_t u_size, int u_stride,
@@ -216,11 +215,11 @@ struct WebPDecBuffer {
 };
 
 // Internal, version-checked, entry point
-WEBP_EXTERN int WebPInitDecBufferInternal(WebPDecBuffer*, int);
+WEBP_NODISCARD WEBP_EXTERN int WebPInitDecBufferInternal(WebPDecBuffer*, int);
 
 // Initialize the structure as empty. Must be called before any other use.
 // Returns false in case of version mismatch
-static WEBP_INLINE int WebPInitDecBuffer(WebPDecBuffer* buffer) {
+WEBP_NODISCARD static WEBP_INLINE int WebPInitDecBuffer(WebPDecBuffer* buffer) {
   return WebPInitDecBufferInternal(buffer, WEBP_DECODER_ABI_VERSION);
 }
 
@@ -231,7 +230,7 @@ WEBP_EXTERN void WebPFreeDecBuffer(WebPDecBuffer* buffer);
 //------------------------------------------------------------------------------
 // Enumeration of the status codes
 
-typedef enum VP8StatusCode {
+typedef enum WEBP_NODISCARD VP8StatusCode {
   VP8_STATUS_OK = 0,
   VP8_STATUS_OUT_OF_MEMORY,
   VP8_STATUS_INVALID_PARAM,
@@ -250,23 +249,24 @@ typedef enum VP8StatusCode {
 // WebPIDecoder object. This object can be left in a SUSPENDED state if the
 // picture is only partially decoded, pending additional input.
 // Code example:
-//
-//   WebPInitDecBuffer(&output_buffer);
-//   output_buffer.colorspace = mode;
-//   ...
-//   WebPIDecoder* idec = WebPINewDecoder(&output_buffer);
-//   while (additional_data_is_available) {
-//     // ... (get additional data in some new_data[] buffer)
-//     status = WebPIAppend(idec, new_data, new_data_size);
-//     if (status != VP8_STATUS_OK && status != VP8_STATUS_SUSPENDED) {
-//       break;    // an error occurred.
-//     }
-//
-//     // The above call decodes the current available buffer.
-//     // Part of the image can now be refreshed by calling
-//     // WebPIDecGetRGB()/WebPIDecGetYUVA() etc.
-//   }
-//   WebPIDelete(idec);
+/*
+     WebPInitDecBuffer(&output_buffer);
+     output_buffer.colorspace = mode;
+     ...
+     WebPIDecoder* idec = WebPINewDecoder(&output_buffer);
+     while (additional_data_is_available) {
+       // ... (get additional data in some new_data[] buffer)
+       status = WebPIAppend(idec, new_data, new_data_size);
+       if (status != VP8_STATUS_OK && status != VP8_STATUS_SUSPENDED) {
+         break;    // an error occurred.
+       }
+
+       // The above call decodes the current available buffer.
+       // Part of the image can now be refreshed by calling
+       // WebPIDecGetRGB()/WebPIDecGetYUVA() etc.
+     }
+     WebPIDelete(idec);
+*/
 
 // Creates a new incremental decoder with the supplied buffer parameter.
 // This output_buffer can be passed NULL, in which case a default output buffer
@@ -280,7 +280,8 @@ typedef enum VP8StatusCode {
 // within valid bounds.
 // All other fields of WebPDecBuffer MUST remain constant between calls.
 // Returns NULL if the allocation failed.
-WEBP_EXTERN WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer);
+WEBP_NODISCARD WEBP_EXTERN WebPIDecoder* WebPINewDecoder(
+    WebPDecBuffer* output_buffer);
 
 // This function allocates and initializes an incremental-decoder object, which
 // will output the RGB/A samples specified by 'csp' into a preallocated
@@ -292,7 +293,7 @@ WEBP_EXTERN WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer);
 // colorspace 'csp' is taken into account for allocating this buffer. All other
 // parameters are ignored.
 // Returns NULL if the allocation failed, or if some parameters are invalid.
-WEBP_EXTERN WebPIDecoder* WebPINewRGB(
+WEBP_NODISCARD WEBP_EXTERN WebPIDecoder* WebPINewRGB(
     WEBP_CSP_MODE csp,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 
@@ -307,7 +308,7 @@ WEBP_EXTERN WebPIDecoder* WebPINewRGB(
 // In this case, the output buffer will be automatically allocated (using
 // MODE_YUVA) when decoding starts. All parameters are then ignored.
 // Returns NULL if the allocation failed or if a parameter is invalid.
-WEBP_EXTERN WebPIDecoder* WebPINewYUVA(
+WEBP_NODISCARD WEBP_EXTERN WebPIDecoder* WebPINewYUVA(
     uint8_t* luma, size_t luma_size, int luma_stride,
     uint8_t* u, size_t u_size, int u_stride,
     uint8_t* v, size_t v_size, int v_stride,
@@ -315,7 +316,7 @@ WEBP_EXTERN WebPIDecoder* WebPINewYUVA(
 
 // Deprecated version of the above, without the alpha plane.
 // Kept for backward compatibility.
-WEBP_EXTERN WebPIDecoder* WebPINewYUV(
+WEBP_NODISCARD WEBP_EXTERN WebPIDecoder* WebPINewYUV(
     uint8_t* luma, size_t luma_size, int luma_stride,
     uint8_t* u, size_t u_size, int u_stride,
     uint8_t* v, size_t v_size, int v_stride);
@@ -345,21 +346,21 @@ WEBP_EXTERN VP8StatusCode WebPIUpdate(
 // (*last_y, *width etc.) can be NULL if corresponding information is not
 // needed. The values in these pointers are only valid on successful (non-NULL)
 // return.
-WEBP_EXTERN uint8_t* WebPIDecGetRGB(
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPIDecGetRGB(
     const WebPIDecoder* idec, int* last_y,
     int* width, int* height, int* stride);
 
 // Same as above function to get a YUVA image. Returns pointer to the luma
 // plane or NULL in case of error. If there is no alpha information
 // the alpha pointer '*a' will be returned NULL.
-WEBP_EXTERN uint8_t* WebPIDecGetYUVA(
+WEBP_NODISCARD WEBP_EXTERN uint8_t* WebPIDecGetYUVA(
     const WebPIDecoder* idec, int* last_y,
     uint8_t** u, uint8_t** v, uint8_t** a,
     int* width, int* height, int* stride, int* uv_stride, int* a_stride);
 
 // Deprecated alpha-less version of WebPIDecGetYUVA(): it will ignore the
 // alpha information (if present). Kept for backward compatibility.
-static WEBP_INLINE uint8_t* WebPIDecGetYUV(
+WEBP_NODISCARD static WEBP_INLINE uint8_t* WebPIDecGetYUV(
     const WebPIDecoder* idec, int* last_y, uint8_t** u, uint8_t** v,
     int* width, int* height, int* stride, int* uv_stride) {
   return WebPIDecGetYUVA(idec, last_y, u, v, NULL, width, height,
@@ -372,7 +373,7 @@ static WEBP_INLINE uint8_t* WebPIDecGetYUV(
 // Returns NULL in case the incremental decoder object is in an invalid state.
 // Otherwise returns the pointer to the internal representation. This structure
 // is read-only, tied to WebPIDecoder's lifespan and should not be modified.
-WEBP_EXTERN const WebPDecBuffer* WebPIDecodedArea(
+WEBP_NODISCARD WEBP_EXTERN const WebPDecBuffer* WebPIDecodedArea(
     const WebPIDecoder* idec, int* left, int* top, int* width, int* height);
 
 //------------------------------------------------------------------------------
@@ -388,7 +389,7 @@ WEBP_EXTERN const WebPDecBuffer* WebPIDecodedArea(
      CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
 
      // C) Adjust 'config', if needed
-     config.no_fancy_upsampling = 1;
+     config.options.no_fancy_upsampling = 1;
      config.output.colorspace = MODE_BGRA;
      // etc.
 
@@ -467,12 +468,14 @@ struct WebPDecoderConfig {
 };
 
 // Internal, version-checked, entry point
-WEBP_EXTERN int WebPInitDecoderConfigInternal(WebPDecoderConfig*, int);
+WEBP_NODISCARD WEBP_EXTERN int WebPInitDecoderConfigInternal(WebPDecoderConfig*,
+                                                             int);
 
 // Initialize the configuration as empty. This function must always be
 // called first, unless WebPGetFeatures() is to be called.
 // Returns false in case of mismatched version.
-static WEBP_INLINE int WebPInitDecoderConfig(WebPDecoderConfig* config) {
+WEBP_NODISCARD static WEBP_INLINE int WebPInitDecoderConfig(
+    WebPDecoderConfig* config) {
   return WebPInitDecoderConfigInternal(config, WEBP_DECODER_ABI_VERSION);
 }
 
@@ -487,8 +490,8 @@ static WEBP_INLINE int WebPInitDecoderConfig(WebPDecoderConfig* config) {
 // The return WebPIDecoder object must always be deleted calling WebPIDelete().
 // Returns NULL in case of error (and config->status will then reflect
 // the error condition, if available).
-WEBP_EXTERN WebPIDecoder* WebPIDecode(const uint8_t* data, size_t data_size,
-                                      WebPDecoderConfig* config);
+WEBP_NODISCARD WEBP_EXTERN WebPIDecoder* WebPIDecode(
+    const uint8_t* data, size_t data_size, WebPDecoderConfig* config);
 
 // Non-incremental version. This version decodes the full data at once, taking
 // 'config' into account. Returns decoding status (which should be VP8_STATUS_OK
diff --git a/3rdparty/libwebp/src/webp/demux.h b/3rdparty/libwebp/src/webp/demux.h
index 846eeb15a965..8d246550ca6e 100644
--- a/3rdparty/libwebp/src/webp/demux.h
+++ b/3rdparty/libwebp/src/webp/demux.h
@@ -50,6 +50,7 @@
 
 #include "./decode.h"     // for WEBP_CSP_MODE
 #include "./mux_types.h"
+#include "./types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -85,13 +86,13 @@ typedef enum WebPDemuxState {
 } WebPDemuxState;
 
 // Internal, version-checked, entry point
-WEBP_EXTERN WebPDemuxer* WebPDemuxInternal(
+WEBP_NODISCARD WEBP_EXTERN WebPDemuxer* WebPDemuxInternal(
     const WebPData*, int, WebPDemuxState*, int);
 
 // Parses the full WebP file given by 'data'. For single images the WebP file
 // header alone or the file header and the chunk header may be absent.
 // Returns a WebPDemuxer object on successful parse, NULL otherwise.
-static WEBP_INLINE WebPDemuxer* WebPDemux(const WebPData* data) {
+WEBP_NODISCARD static WEBP_INLINE WebPDemuxer* WebPDemux(const WebPData* data) {
   return WebPDemuxInternal(data, 0, NULL, WEBP_DEMUX_ABI_VERSION);
 }
 
@@ -103,7 +104,7 @@ static WEBP_INLINE WebPDemuxer* WebPDemux(const WebPData* data) {
 // If this data is volatile, the demuxer object should be deleted (by calling
 // WebPDemuxDelete()) and WebPDemuxPartial() called again on the new data.
 // This is usually an inexpensive operation.
-static WEBP_INLINE WebPDemuxer* WebPDemuxPartial(
+WEBP_NODISCARD static WEBP_INLINE WebPDemuxer* WebPDemuxPartial(
     const WebPData* data, WebPDemuxState* state) {
   return WebPDemuxInternal(data, 1, state, WEBP_DEMUX_ABI_VERSION);
 }
@@ -164,14 +165,14 @@ struct WebPIterator {
 // Returns false if 'dmux' is NULL or frame 'frame_number' is not present.
 // Call WebPDemuxReleaseIterator() when use of the iterator is complete.
 // NOTE: 'dmux' must persist for the lifetime of 'iter'.
-WEBP_EXTERN int WebPDemuxGetFrame(
+WEBP_NODISCARD WEBP_EXTERN int WebPDemuxGetFrame(
     const WebPDemuxer* dmux, int frame_number, WebPIterator* iter);
 
 // Sets 'iter->fragment' to point to the next ('iter->frame_num' + 1) or
 // previous ('iter->frame_num' - 1) frame. These functions do not loop.
 // Returns true on success, false otherwise.
-WEBP_EXTERN int WebPDemuxNextFrame(WebPIterator* iter);
-WEBP_EXTERN int WebPDemuxPrevFrame(WebPIterator* iter);
+WEBP_NODISCARD WEBP_EXTERN int WebPDemuxNextFrame(WebPIterator* iter);
+WEBP_NODISCARD WEBP_EXTERN int WebPDemuxPrevFrame(WebPIterator* iter);
 
 // Releases any memory associated with 'iter'.
 // Must be called before any subsequent calls to WebPDemuxGetChunk() on the same
@@ -202,15 +203,16 @@ struct WebPChunkIterator {
 // payloads are accessed through WebPDemuxGetFrame() and related functions.
 // Call WebPDemuxReleaseChunkIterator() when use of the iterator is complete.
 // NOTE: 'dmux' must persist for the lifetime of the iterator.
-WEBP_EXTERN int WebPDemuxGetChunk(const WebPDemuxer* dmux,
-                                  const char fourcc[4], int chunk_number,
-                                  WebPChunkIterator* iter);
+WEBP_NODISCARD WEBP_EXTERN int WebPDemuxGetChunk(const WebPDemuxer* dmux,
+                                                 const char fourcc[4],
+                                                 int chunk_number,
+                                                 WebPChunkIterator* iter);
 
 // Sets 'iter->chunk' to point to the next ('iter->chunk_num' + 1) or previous
 // ('iter->chunk_num' - 1) chunk. These functions do not loop.
 // Returns true on success, false otherwise.
-WEBP_EXTERN int WebPDemuxNextChunk(WebPChunkIterator* iter);
-WEBP_EXTERN int WebPDemuxPrevChunk(WebPChunkIterator* iter);
+WEBP_NODISCARD WEBP_EXTERN int WebPDemuxNextChunk(WebPChunkIterator* iter);
+WEBP_NODISCARD WEBP_EXTERN int WebPDemuxPrevChunk(WebPChunkIterator* iter);
 
 // Releases any memory associated with 'iter'.
 // Must be called before destroying the associated WebPDemuxer with
@@ -257,21 +259,21 @@ struct WebPAnimDecoderOptions {
 };
 
 // Internal, version-checked, entry point.
-WEBP_EXTERN int WebPAnimDecoderOptionsInitInternal(
+WEBP_NODISCARD WEBP_EXTERN int WebPAnimDecoderOptionsInitInternal(
     WebPAnimDecoderOptions*, int);
 
 // Should always be called, to initialize a fresh WebPAnimDecoderOptions
 // structure before modification. Returns false in case of version mismatch.
 // WebPAnimDecoderOptionsInit() must have succeeded before using the
 // 'dec_options' object.
-static WEBP_INLINE int WebPAnimDecoderOptionsInit(
+WEBP_NODISCARD static WEBP_INLINE int WebPAnimDecoderOptionsInit(
     WebPAnimDecoderOptions* dec_options) {
   return WebPAnimDecoderOptionsInitInternal(dec_options,
                                             WEBP_DEMUX_ABI_VERSION);
 }
 
 // Internal, version-checked, entry point.
-WEBP_EXTERN WebPAnimDecoder* WebPAnimDecoderNewInternal(
+WEBP_NODISCARD WEBP_EXTERN WebPAnimDecoder* WebPAnimDecoderNewInternal(
     const WebPData*, const WebPAnimDecoderOptions*, int);
 
 // Creates and initializes a WebPAnimDecoder object.
@@ -284,7 +286,7 @@ WEBP_EXTERN WebPAnimDecoder* WebPAnimDecoderNewInternal(
 // Returns:
 //   A pointer to the newly created WebPAnimDecoder object, or NULL in case of
 //   parsing error, invalid option or memory error.
-static WEBP_INLINE WebPAnimDecoder* WebPAnimDecoderNew(
+WEBP_NODISCARD static WEBP_INLINE WebPAnimDecoder* WebPAnimDecoderNew(
     const WebPData* webp_data, const WebPAnimDecoderOptions* dec_options) {
   return WebPAnimDecoderNewInternal(webp_data, dec_options,
                                     WEBP_DEMUX_ABI_VERSION);
@@ -306,8 +308,8 @@ struct WebPAnimInfo {
 //   info - (out) global information fetched from the animation.
 // Returns:
 //   True on success.
-WEBP_EXTERN int WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec,
-                                       WebPAnimInfo* info);
+WEBP_NODISCARD WEBP_EXTERN int WebPAnimDecoderGetInfo(
+    const WebPAnimDecoder* dec, WebPAnimInfo* info);
 
 // Fetch the next frame from 'dec' based on options supplied to
 // WebPAnimDecoderNew(). This will be a fully reconstructed canvas of size
@@ -321,8 +323,9 @@ WEBP_EXTERN int WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec,
 // Returns:
 //   False if any of the arguments are NULL, or if there is a parsing or
 //   decoding error, or if there are no more frames. Otherwise, returns true.
-WEBP_EXTERN int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
-                                       uint8_t** buf, int* timestamp);
+WEBP_NODISCARD WEBP_EXTERN int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
+                                                      uint8_t** buf,
+                                                      int* timestamp);
 
 // Check if there are more frames left to decode.
 // Parameters:
@@ -330,7 +333,8 @@ WEBP_EXTERN int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
 // Returns:
 //   True if 'dec' is not NULL and some frames are yet to be decoded.
 //   Otherwise, returns false.
-WEBP_EXTERN int WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec);
+WEBP_NODISCARD WEBP_EXTERN int WebPAnimDecoderHasMoreFrames(
+    const WebPAnimDecoder* dec);
 
 // Resets the WebPAnimDecoder object, so that next call to
 // WebPAnimDecoderGetNext() will restart decoding from 1st frame. This would be
@@ -348,7 +352,7 @@ WEBP_EXTERN void WebPAnimDecoderReset(WebPAnimDecoder* dec);
 //
 // Parameters:
 //   dec - (in) decoder instance from which the demuxer object is to be fetched.
-WEBP_EXTERN const WebPDemuxer* WebPAnimDecoderGetDemuxer(
+WEBP_NODISCARD WEBP_EXTERN const WebPDemuxer* WebPAnimDecoderGetDemuxer(
     const WebPAnimDecoder* dec);
 
 // Deletes the WebPAnimDecoder object.
diff --git a/3rdparty/libwebp/src/webp/encode.h b/3rdparty/libwebp/src/webp/encode.h
index b4c599df8765..f3d59297c8c5 100644
--- a/3rdparty/libwebp/src/webp/encode.h
+++ b/3rdparty/libwebp/src/webp/encode.h
@@ -164,13 +164,14 @@ typedef enum WebPPreset {
 } WebPPreset;
 
 // Internal, version-checked, entry point
-WEBP_EXTERN int WebPConfigInitInternal(WebPConfig*, WebPPreset, float, int);
+WEBP_NODISCARD WEBP_EXTERN int WebPConfigInitInternal(WebPConfig*, WebPPreset,
+                                                      float, int);
 
 // Should always be called, to initialize a fresh WebPConfig structure before
 // modification. Returns false in case of version mismatch. WebPConfigInit()
 // must have succeeded before using the 'config' object.
 // Note that the default values are lossless=0 and quality=75.
-static WEBP_INLINE int WebPConfigInit(WebPConfig* config) {
+WEBP_NODISCARD static WEBP_INLINE int WebPConfigInit(WebPConfig* config) {
   return WebPConfigInitInternal(config, WEBP_PRESET_DEFAULT, 75.f,
                                 WEBP_ENCODER_ABI_VERSION);
 }
@@ -179,8 +180,9 @@ static WEBP_INLINE int WebPConfigInit(WebPConfig* config) {
 // set of parameters (referred to by 'preset') and a given quality factor.
 // This function can be called as a replacement to WebPConfigInit(). Will
 // return false in case of error.
-static WEBP_INLINE int WebPConfigPreset(WebPConfig* config,
-                                        WebPPreset preset, float quality) {
+WEBP_NODISCARD static WEBP_INLINE int WebPConfigPreset(WebPConfig* config,
+                                                       WebPPreset preset,
+                                                       float quality) {
   return WebPConfigInitInternal(config, preset, quality,
                                 WEBP_ENCODER_ABI_VERSION);
 }
@@ -191,11 +193,12 @@ static WEBP_INLINE int WebPConfigPreset(WebPConfig* config,
 // speed and final compressed size.
 // This function will overwrite several fields from config: 'method', 'quality'
 // and 'lossless'. Returns false in case of parameter error.
-WEBP_EXTERN int WebPConfigLosslessPreset(WebPConfig* config, int level);
+WEBP_NODISCARD WEBP_EXTERN int WebPConfigLosslessPreset(WebPConfig* config,
+                                                        int level);
 
 // Returns true if 'config' is non-NULL and all configuration parameters are
 // within their valid ranges.
-WEBP_EXTERN int WebPValidateConfig(const WebPConfig* config);
+WEBP_NODISCARD WEBP_EXTERN int WebPValidateConfig(const WebPConfig* config);
 
 //------------------------------------------------------------------------------
 // Input / Output
@@ -255,8 +258,8 @@ WEBP_EXTERN void WebPMemoryWriterClear(WebPMemoryWriter* writer);
 // The custom writer to be used with WebPMemoryWriter as custom_ptr. Upon
 // completion, writer.mem and writer.size will hold the coded data.
 // writer.mem must be freed by calling WebPMemoryWriterClear.
-WEBP_EXTERN int WebPMemoryWrite(const uint8_t* data, size_t data_size,
-                                const WebPPicture* picture);
+WEBP_NODISCARD WEBP_EXTERN int WebPMemoryWrite(
+    const uint8_t* data, size_t data_size, const WebPPicture* picture);
 
 // Progress hook, called from time to time to report progress. It can return
 // false to request an abort of the encoding process, or true otherwise if
@@ -364,13 +367,13 @@ struct WebPPicture {
 };
 
 // Internal, version-checked, entry point
-WEBP_EXTERN int WebPPictureInitInternal(WebPPicture*, int);
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureInitInternal(WebPPicture*, int);
 
 // Should always be called, to initialize the structure. Returns false in case
 // of version mismatch. WebPPictureInit() must have succeeded before using the
 // 'picture' object.
 // Note that, by default, use_argb is false and colorspace is WEBP_YUV420.
-static WEBP_INLINE int WebPPictureInit(WebPPicture* picture) {
+WEBP_NODISCARD static WEBP_INLINE int WebPPictureInit(WebPPicture* picture) {
   return WebPPictureInitInternal(picture, WEBP_ENCODER_ABI_VERSION);
 }
 
@@ -381,7 +384,7 @@ static WEBP_INLINE int WebPPictureInit(WebPPicture* picture) {
 // Allocate y/u/v buffers as per colorspace/width/height specification.
 // Note! This function will free the previous buffer if needed.
 // Returns false in case of memory error.
-WEBP_EXTERN int WebPPictureAlloc(WebPPicture* picture);
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureAlloc(WebPPicture* picture);
 
 // Release the memory allocated by WebPPictureAlloc() or WebPPictureImport*().
 // Note that this function does _not_ free the memory used by the 'picture'
@@ -394,7 +397,8 @@ WEBP_EXTERN void WebPPictureFree(WebPPicture* picture);
 // will fully own the copied pixels (this is not a view). The 'dst' picture need
 // not be initialized as its content is overwritten.
 // Returns false in case of memory allocation error.
-WEBP_EXTERN int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst);
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureCopy(const WebPPicture* src,
+                                               WebPPicture* dst);
 
 // Compute the single distortion for packed planes of samples.
 // 'src' will be compared to 'ref', and the raw distortion stored into
@@ -403,19 +407,18 @@ WEBP_EXTERN int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst);
 // 'x_step' is the horizontal stride (in bytes) between samples.
 // 'src/ref_stride' is the byte distance between rows.
 // Returns false in case of error (bad parameter, memory allocation error, ...).
-WEBP_EXTERN int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
-                                    const uint8_t* ref, size_t ref_stride,
-                                    int width, int height,
-                                    size_t x_step,
-                                    int type,   // 0 = PSNR, 1 = SSIM, 2 = LSIM
-                                    float* distortion, float* result);
+WEBP_NODISCARD WEBP_EXTERN int WebPPlaneDistortion(
+    const uint8_t* src, size_t src_stride,
+    const uint8_t* ref, size_t ref_stride, int width, int height, size_t x_step,
+    int type,  // 0 = PSNR, 1 = SSIM, 2 = LSIM
+    float* distortion, float* result);
 
 // Compute PSNR, SSIM or LSIM distortion metric between two pictures. Results
 // are in dB, stored in result[] in the B/G/R/A/All order. The distortion is
 // always performed using ARGB samples. Hence if the input is YUV(A), the
 // picture will be internally converted to ARGB (just for the measurement).
 // Warning: this function is rather CPU-intensive.
-WEBP_EXTERN int WebPPictureDistortion(
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureDistortion(
     const WebPPicture* src, const WebPPicture* ref,
     int metric_type,           // 0 = PSNR, 1 = SSIM, 2 = LSIM
     float result[5]);
@@ -428,8 +431,8 @@ WEBP_EXTERN int WebPPictureDistortion(
 // must be fully be comprised inside the 'src' source picture. If the source
 // picture uses the YUV420 colorspace, the top and left coordinates will be
 // snapped to even values.
-WEBP_EXTERN int WebPPictureCrop(WebPPicture* picture,
-                                int left, int top, int width, int height);
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureCrop(
+    WebPPicture* picture, int left, int top, int width, int height);
 
 // Extracts a view from 'src' picture into 'dst'. The rectangle for the view
 // is defined by the top-left corner pixel coordinates (left, top) as well
@@ -441,10 +444,10 @@ WEBP_EXTERN int WebPPictureCrop(WebPPicture* picture,
 // the original dimension will be lost). Picture 'dst' need not be initialized
 // with WebPPictureInit() if it is different from 'src', since its content will
 // be overwritten.
-// Returns false in case of memory allocation error or invalid parameters.
-WEBP_EXTERN int WebPPictureView(const WebPPicture* src,
-                                int left, int top, int width, int height,
-                                WebPPicture* dst);
+// Returns false in case of invalid parameters.
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureView(
+    const WebPPicture* src, int left, int top, int width, int height,
+    WebPPicture* dst);
 
 // Returns true if the 'picture' is actually a view and therefore does
 // not own the memory for pixels.
@@ -455,29 +458,30 @@ WEBP_EXTERN int WebPPictureIsView(const WebPPicture* picture);
 // dimension will be calculated preserving the aspect ratio.
 // No gamma correction is applied.
 // Returns false in case of error (invalid parameter or insufficient memory).
-WEBP_EXTERN int WebPPictureRescale(WebPPicture* pic, int width, int height);
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureRescale(WebPPicture* picture,
+                                                  int width, int height);
 
 // Colorspace conversion function to import RGB samples.
 // Previous buffer will be free'd, if any.
 // *rgb buffer should have a size of at least height * rgb_stride.
 // Returns false in case of memory error.
-WEBP_EXTERN int WebPPictureImportRGB(
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureImportRGB(
     WebPPicture* picture, const uint8_t* rgb, int rgb_stride);
 // Same, but for RGBA buffer.
-WEBP_EXTERN int WebPPictureImportRGBA(
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureImportRGBA(
     WebPPicture* picture, const uint8_t* rgba, int rgba_stride);
 // Same, but for RGBA buffer. Imports the RGB direct from the 32-bit format
 // input buffer ignoring the alpha channel. Avoids needing to copy the data
 // to a temporary 24-bit RGB buffer to import the RGB only.
-WEBP_EXTERN int WebPPictureImportRGBX(
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureImportRGBX(
     WebPPicture* picture, const uint8_t* rgbx, int rgbx_stride);
 
 // Variants of the above, but taking BGR(A|X) input.
-WEBP_EXTERN int WebPPictureImportBGR(
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureImportBGR(
     WebPPicture* picture, const uint8_t* bgr, int bgr_stride);
-WEBP_EXTERN int WebPPictureImportBGRA(
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureImportBGRA(
     WebPPicture* picture, const uint8_t* bgra, int bgra_stride);
-WEBP_EXTERN int WebPPictureImportBGRX(
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureImportBGRX(
     WebPPicture* picture, const uint8_t* bgrx, int bgrx_stride);
 
 // Converts picture->argb data to the YUV420A format. The 'colorspace'
@@ -486,24 +490,24 @@ WEBP_EXTERN int WebPPictureImportBGRX(
 // non-opaque transparent values is detected, and 'colorspace' will be
 // adjusted accordingly. Note that this method is lossy.
 // Returns false in case of error.
-WEBP_EXTERN int WebPPictureARGBToYUVA(WebPPicture* picture,
-                                      WebPEncCSP /*colorspace = WEBP_YUV420*/);
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureARGBToYUVA(
+    WebPPicture* picture, WebPEncCSP /*colorspace = WEBP_YUV420*/);
 
 // Same as WebPPictureARGBToYUVA(), but the conversion is done using
 // pseudo-random dithering with a strength 'dithering' between
 // 0.0 (no dithering) and 1.0 (maximum dithering). This is useful
 // for photographic picture.
-WEBP_EXTERN int WebPPictureARGBToYUVADithered(
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureARGBToYUVADithered(
     WebPPicture* picture, WebPEncCSP colorspace, float dithering);
 
-// Performs 'sharp' RGBA->YUVA420 downsampling and colorspace conversion.
+// Performs 'sharp' RGBA->YUVA420 downsampling and colorspace conversion
 // Downsampling is handled with extra care in case of color clipping. This
 // method is roughly 2x slower than WebPPictureARGBToYUVA() but produces better
 // and sharper YUV representation.
 // Returns false in case of error.
-WEBP_EXTERN int WebPPictureSharpARGBToYUVA(WebPPicture* picture);
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureSharpARGBToYUVA(WebPPicture* picture);
 // kept for backward compatibility:
-WEBP_EXTERN int WebPPictureSmartARGBToYUVA(WebPPicture* picture);
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureSmartARGBToYUVA(WebPPicture* picture);
 
 // Converts picture->yuv to picture->argb and sets picture->use_argb to true.
 // The input format must be YUV_420 or YUV_420A. The conversion from YUV420 to
@@ -511,7 +515,7 @@ WEBP_EXTERN int WebPPictureSmartARGBToYUVA(WebPPicture* picture);
 // Note that the use of this colorspace is discouraged if one has access to the
 // raw ARGB samples, since using YUV420 is comparatively lossy.
 // Returns false in case of error.
-WEBP_EXTERN int WebPPictureYUVAToARGB(WebPPicture* picture);
+WEBP_NODISCARD WEBP_EXTERN int WebPPictureYUVAToARGB(WebPPicture* picture);
 
 // Helper function: given a width x height plane of RGBA or YUV(A) samples
 // clean-up or smoothen the YUV or RGB samples under fully transparent area,
@@ -526,7 +530,7 @@ WEBP_EXTERN int WebPPictureHasTransparency(const WebPPicture* picture);
 // Remove the transparency information (if present) by blending the color with
 // the background color 'background_rgb' (specified as 24bit RGB triplet).
 // After this call, all alpha values are reset to 0xff.
-WEBP_EXTERN void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb);
+WEBP_EXTERN void WebPBlendAlpha(WebPPicture* picture, uint32_t background_rgb);
 
 //------------------------------------------------------------------------------
 // Main call
@@ -541,7 +545,8 @@ WEBP_EXTERN void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb);
 // the former for lossy encoding, and the latter for lossless encoding
 // (when config.lossless is true). Automatic conversion from one format to
 // another is provided but they both incur some loss.
-WEBP_EXTERN int WebPEncode(const WebPConfig* config, WebPPicture* picture);
+WEBP_NODISCARD WEBP_EXTERN int WebPEncode(const WebPConfig* config,
+                                          WebPPicture* picture);
 
 //------------------------------------------------------------------------------
 
diff --git a/3rdparty/libwebp/src/webp/format_constants.h b/3rdparty/libwebp/src/webp/format_constants.h
index eca6981a47d0..999035c5d265 100644
--- a/3rdparty/libwebp/src/webp/format_constants.h
+++ b/3rdparty/libwebp/src/webp/format_constants.h
@@ -55,7 +55,7 @@
 typedef enum {
   PREDICTOR_TRANSFORM      = 0,
   CROSS_COLOR_TRANSFORM    = 1,
-  SUBTRACT_GREEN           = 2,
+  SUBTRACT_GREEN_TRANSFORM = 2,
   COLOR_INDEXING_TRANSFORM = 3
 } VP8LImageTransformType;
 
diff --git a/3rdparty/libwebp/src/webp/mux.h b/3rdparty/libwebp/src/webp/mux.h
index 7d27489a4027..8fb067e43545 100644
--- a/3rdparty/libwebp/src/webp/mux.h
+++ b/3rdparty/libwebp/src/webp/mux.h
@@ -16,12 +16,13 @@
 #define WEBP_WEBP_MUX_H_
 
 #include "./mux_types.h"
+#include "./types.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define WEBP_MUX_ABI_VERSION 0x0108        // MAJOR(8b) + MINOR(8b)
+#define WEBP_MUX_ABI_VERSION 0x0109        // MAJOR(8b) + MINOR(8b)
 
 //------------------------------------------------------------------------------
 // Mux API
@@ -70,7 +71,7 @@ typedef struct WebPMuxAnimParams WebPMuxAnimParams;
 typedef struct WebPAnimEncoderOptions WebPAnimEncoderOptions;
 
 // Error codes
-typedef enum WebPMuxError {
+typedef enum WEBP_NODISCARD WebPMuxError {
   WEBP_MUX_OK                 =  1,
   WEBP_MUX_NOT_FOUND          =  0,
   WEBP_MUX_INVALID_ARGUMENT   = -1,
@@ -104,13 +105,13 @@ WEBP_EXTERN int WebPGetMuxVersion(void);
 // Life of a Mux object
 
 // Internal, version-checked, entry point
-WEBP_EXTERN WebPMux* WebPNewInternal(int);
+WEBP_NODISCARD WEBP_EXTERN WebPMux* WebPNewInternal(int);
 
 // Creates an empty mux object.
 // Returns:
 //   A pointer to the newly created empty mux object.
 //   Or NULL in case of memory error.
-static WEBP_INLINE WebPMux* WebPMuxNew(void) {
+WEBP_NODISCARD static WEBP_INLINE WebPMux* WebPMuxNew(void) {
   return WebPNewInternal(WEBP_MUX_ABI_VERSION);
 }
 
@@ -123,18 +124,21 @@ WEBP_EXTERN void WebPMuxDelete(WebPMux* mux);
 // Mux creation.
 
 // Internal, version-checked, entry point
-WEBP_EXTERN WebPMux* WebPMuxCreateInternal(const WebPData*, int, int);
+WEBP_NODISCARD WEBP_EXTERN WebPMux* WebPMuxCreateInternal(const WebPData*, int,
+                                                          int);
 
 // Creates a mux object from raw data given in WebP RIFF format.
 // Parameters:
 //   bitstream - (in) the bitstream data in WebP RIFF format
 //   copy_data - (in) value 1 indicates given data WILL be copied to the mux
-//               object and value 0 indicates data will NOT be copied.
+//               object and value 0 indicates data will NOT be copied. If the
+//               data is not copied, it must exist for the lifetime of the
+//               mux object.
 // Returns:
 //   A pointer to the mux object created from given data - on success.
 //   NULL - In case of invalid data or memory error.
-static WEBP_INLINE WebPMux* WebPMuxCreate(const WebPData* bitstream,
-                                          int copy_data) {
+WEBP_NODISCARD static WEBP_INLINE WebPMux* WebPMuxCreate(
+    const WebPData* bitstream, int copy_data) {
   return WebPMuxCreateInternal(bitstream, copy_data, WEBP_MUX_ABI_VERSION);
 }
 
@@ -154,7 +158,9 @@ static WEBP_INLINE WebPMux* WebPMuxCreate(const WebPData* bitstream,
 //                 e.g., "ICCP", "XMP ", "EXIF" etc.
 //   chunk_data - (in) the chunk data to be added
 //   copy_data - (in) value 1 indicates given data WILL be copied to the mux
-//               object and value 0 indicates data will NOT be copied.
+//               object and value 0 indicates data will NOT be copied. If the
+//               data is not copied, it must exist until a call to
+//               WebPMuxAssemble() is made.
 // Returns:
 //   WEBP_MUX_INVALID_ARGUMENT - if mux, fourcc or chunk_data is NULL
 //                               or if fourcc corresponds to an image chunk.
@@ -217,7 +223,9 @@ struct WebPMuxFrameInfo {
 //   bitstream - (in) can be a raw VP8/VP8L bitstream or a single-image
 //               WebP file (non-animated)
 //   copy_data - (in) value 1 indicates given data WILL be copied to the mux
-//               object and value 0 indicates data will NOT be copied.
+//               object and value 0 indicates data will NOT be copied. If the
+//               data is not copied, it must exist until a call to
+//               WebPMuxAssemble() is made.
 // Returns:
 //   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL or bitstream is NULL.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
@@ -235,7 +243,9 @@ WEBP_EXTERN WebPMuxError WebPMuxSetImage(
 //   mux - (in/out) object to which the frame is to be added
 //   frame - (in) frame data.
 //   copy_data - (in) value 1 indicates given data WILL be copied to the mux
-//               object and value 0 indicates data will NOT be copied.
+//               object and value 0 indicates data will NOT be copied. If the
+//               data is not copied, it must exist until a call to
+//               WebPMuxAssemble() is made.
 // Returns:
 //   WEBP_MUX_INVALID_ARGUMENT - if mux or frame is NULL
 //                               or if content of 'frame' is invalid.
@@ -449,7 +459,7 @@ WEBP_EXTERN int WebPAnimEncoderOptionsInitInternal(
 // structure before modification. Returns false in case of version mismatch.
 // WebPAnimEncoderOptionsInit() must have succeeded before using the
 // 'enc_options' object.
-static WEBP_INLINE int WebPAnimEncoderOptionsInit(
+WEBP_NODISCARD static WEBP_INLINE int WebPAnimEncoderOptionsInit(
     WebPAnimEncoderOptions* enc_options) {
   return WebPAnimEncoderOptionsInitInternal(enc_options, WEBP_MUX_ABI_VERSION);
 }
@@ -490,7 +500,7 @@ static WEBP_INLINE WebPAnimEncoder* WebPAnimEncoderNew(
 // Returns:
 //   On error, returns false and frame->error_code is set appropriately.
 //   Otherwise, returns true.
-WEBP_EXTERN int WebPAnimEncoderAdd(
+WEBP_NODISCARD WEBP_EXTERN int WebPAnimEncoderAdd(
     WebPAnimEncoder* enc, struct WebPPicture* frame, int timestamp_ms,
     const struct WebPConfig* config);
 
@@ -503,8 +513,8 @@ WEBP_EXTERN int WebPAnimEncoderAdd(
 //   webp_data - (out) generated WebP bitstream.
 // Returns:
 //   True on success.
-WEBP_EXTERN int WebPAnimEncoderAssemble(WebPAnimEncoder* enc,
-                                        WebPData* webp_data);
+WEBP_NODISCARD WEBP_EXTERN int WebPAnimEncoderAssemble(WebPAnimEncoder* enc,
+                                                       WebPData* webp_data);
 
 // Get error string corresponding to the most recent call using 'enc'. The
 // returned string is owned by 'enc' and is valid only until the next call to
@@ -521,6 +531,57 @@ WEBP_EXTERN const char* WebPAnimEncoderGetError(WebPAnimEncoder* enc);
 //   enc - (in/out) object to be deleted
 WEBP_EXTERN void WebPAnimEncoderDelete(WebPAnimEncoder* enc);
 
+//------------------------------------------------------------------------------
+// Non-image chunks.
+
+// Note: Only non-image related chunks should be managed through chunk APIs.
+// (Image related chunks are: "ANMF", "VP8 ", "VP8L" and "ALPH").
+
+// Adds a chunk with id 'fourcc' and data 'chunk_data' in the enc object.
+// Any existing chunk(s) with the same id will be removed.
+// Parameters:
+//   enc - (in/out) object to which the chunk is to be added
+//   fourcc - (in) a character array containing the fourcc of the given chunk;
+//                 e.g., "ICCP", "XMP ", "EXIF", etc.
+//   chunk_data - (in) the chunk data to be added
+//   copy_data - (in) value 1 indicates given data WILL be copied to the enc
+//               object and value 0 indicates data will NOT be copied. If the
+//               data is not copied, it must exist until a call to
+//               WebPAnimEncoderAssemble() is made.
+// Returns:
+//   WEBP_MUX_INVALID_ARGUMENT - if enc, fourcc or chunk_data is NULL.
+//   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
+//   WEBP_MUX_OK - on success.
+WEBP_EXTERN WebPMuxError WebPAnimEncoderSetChunk(
+    WebPAnimEncoder* enc, const char fourcc[4], const WebPData* chunk_data,
+    int copy_data);
+
+// Gets a reference to the data of the chunk with id 'fourcc' in the enc object.
+// The caller should NOT free the returned data.
+// Parameters:
+//   enc - (in) object from which the chunk data is to be fetched
+//   fourcc - (in) a character array containing the fourcc of the chunk;
+//                 e.g., "ICCP", "XMP ", "EXIF", etc.
+//   chunk_data - (out) returned chunk data
+// Returns:
+//   WEBP_MUX_INVALID_ARGUMENT - if enc, fourcc or chunk_data is NULL.
+//   WEBP_MUX_NOT_FOUND - If enc does not contain a chunk with the given id.
+//   WEBP_MUX_OK - on success.
+WEBP_EXTERN WebPMuxError WebPAnimEncoderGetChunk(
+    const WebPAnimEncoder* enc, const char fourcc[4], WebPData* chunk_data);
+
+// Deletes the chunk with the given 'fourcc' from the enc object.
+// Parameters:
+//   enc - (in/out) object from which the chunk is to be deleted
+//   fourcc - (in) a character array containing the fourcc of the chunk;
+//                 e.g., "ICCP", "XMP ", "EXIF", etc.
+// Returns:
+//   WEBP_MUX_INVALID_ARGUMENT - if enc or fourcc is NULL.
+//   WEBP_MUX_NOT_FOUND - If enc does not contain a chunk with the given fourcc.
+//   WEBP_MUX_OK - on success.
+WEBP_EXTERN WebPMuxError WebPAnimEncoderDeleteChunk(
+    WebPAnimEncoder* enc, const char fourcc[4]);
+
 //------------------------------------------------------------------------------
 
 #ifdef __cplusplus
diff --git a/3rdparty/libwebp/src/webp/mux_types.h b/3rdparty/libwebp/src/webp/mux_types.h
index 2fe819583917..c585d2082f70 100644
--- a/3rdparty/libwebp/src/webp/mux_types.h
+++ b/3rdparty/libwebp/src/webp/mux_types.h
@@ -79,7 +79,8 @@ static WEBP_INLINE void WebPDataClear(WebPData* webp_data) {
 
 // Allocates necessary storage for 'dst' and copies the contents of 'src'.
 // Returns true on success.
-static WEBP_INLINE int WebPDataCopy(const WebPData* src, WebPData* dst) {
+WEBP_NODISCARD static WEBP_INLINE int WebPDataCopy(const WebPData* src,
+                                                   WebPData* dst) {
   if (src == NULL || dst == NULL) return 0;
   WebPDataInit(dst);
   if (src->bytes != NULL && src->size != 0) {
diff --git a/3rdparty/libwebp/src/webp/types.h b/3rdparty/libwebp/src/webp/types.h
index 47f7f2b00706..9c17edec45db 100644
--- a/3rdparty/libwebp/src/webp/types.h
+++ b/3rdparty/libwebp/src/webp/types.h
@@ -36,14 +36,39 @@ typedef long long int int64_t;
 #define WEBP_INLINE __forceinline
 #endif  /* _MSC_VER */
 
+#ifndef WEBP_NODISCARD
+#if defined(WEBP_ENABLE_NODISCARD) && WEBP_ENABLE_NODISCARD
+#if (defined(__cplusplus) && __cplusplus >= 201700L) || \
+    (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
+#define WEBP_NODISCARD [[nodiscard]]
+#else
+// gcc's __has_attribute does not work for enums.
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(warn_unused_result)
+#define WEBP_NODISCARD __attribute__((warn_unused_result))
+#else
+#define WEBP_NODISCARD
+#endif  /* __has_attribute(warn_unused_result) */
+#else
+#define WEBP_NODISCARD
+#endif  /* defined(__clang__) && defined(__has_attribute) */
+#endif  /* (defined(__cplusplus) && __cplusplus >= 201700L) ||
+           (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) */
+#else
+#define WEBP_NODISCARD
+#endif  /* defined(WEBP_ENABLE_NODISCARD) && WEBP_ENABLE_NODISCARD */
+#endif  /* WEBP_NODISCARD */
+
 #ifndef WEBP_EXTERN
 // This explicitly marks library functions and allows for changing the
 // signature for e.g., Windows DLL builds.
-# if defined(__GNUC__) && __GNUC__ >= 4
+# if defined(_WIN32) && defined(WEBP_DLL)
+#  define WEBP_EXTERN __declspec(dllexport)
+# elif defined(__GNUC__) && __GNUC__ >= 4
 #  define WEBP_EXTERN extern __attribute__ ((visibility ("default")))
 # else
 #  define WEBP_EXTERN extern
-# endif  /* __GNUC__ >= 4 */
+# endif  /* defined(_WIN32) && defined(WEBP_DLL) */
 #endif  /* WEBP_EXTERN */
 
 // Macro to check ABI compatibility (same major revision number)
@@ -56,7 +81,7 @@ extern "C" {
 // Allocates 'size' bytes of memory. Returns NULL upon error. Memory
 // must be deallocated by calling WebPFree(). This function is made available
 // by the core 'libwebp' library.
-WEBP_EXTERN void* WebPMalloc(size_t size);
+WEBP_NODISCARD WEBP_EXTERN void* WebPMalloc(size_t size);
 
 // Releases memory returned by the WebPDecode*() functions (from decode.h).
 WEBP_EXTERN void WebPFree(void* ptr);
diff --git a/3rdparty/ndsrvp/CMakeLists.txt b/3rdparty/ndsrvp/CMakeLists.txt
new file mode 100644
index 000000000000..bc9a3a26dc8e
--- /dev/null
+++ b/3rdparty/ndsrvp/CMakeLists.txt
@@ -0,0 +1,34 @@
+message(STATUS "##########")
+message(STATUS "# NDSRVP #")
+message(STATUS "##########")
+
+cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR)
+
+# project setup
+
+set(NDSRVP_INCLUDE_DIR include)
+set(NDSRVP_SOURCE_DIR src)
+
+file(GLOB ndsrvp_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${NDSRVP_INCLUDE_DIR}/*.hpp")
+file(GLOB ndsrvp_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${NDSRVP_SOURCE_DIR}/*.cpp")
+
+add_library(ndsrvp_hal STATIC)
+target_sources(ndsrvp_hal PRIVATE ${ndsrvp_headers} ${ndsrvp_sources})
+
+set_target_properties(ndsrvp_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})
+if(NOT BUILD_SHARED_LIBS)
+  ocv_install_target(ndsrvp_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
+endif()
+target_include_directories(ndsrvp_hal PRIVATE
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_SOURCE_DIR}/modules/core/include
+  ${CMAKE_SOURCE_DIR}/modules/imgproc/include
+  ${CMAKE_SOURCE_DIR}/modules/features2d/include)
+
+# project info
+
+set(NDSRVP_HAL_FOUND TRUE CACHE INTERNAL "")
+set(NDSRVP_HAL_VERSION "0.0.1" CACHE INTERNAL "")
+set(NDSRVP_HAL_LIBRARIES "ndsrvp_hal" CACHE INTERNAL "")
+set(NDSRVP_HAL_HEADERS "ndsrvp_hal.hpp" CACHE INTERNAL "")
+set(NDSRVP_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}" CACHE INTERNAL "")
diff --git a/3rdparty/ndsrvp/include/core.hpp b/3rdparty/ndsrvp/include/core.hpp
new file mode 100644
index 000000000000..190a1b926b60
--- /dev/null
+++ b/3rdparty/ndsrvp/include/core.hpp
@@ -0,0 +1,532 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#ifndef OPENCV_NDSRVP_CORE_HPP
+#define OPENCV_NDSRVP_CORE_HPP
+
+namespace cv {
+
+namespace ndsrvp {
+
+template <typename srctype, typename dsttype,
+    typename vsrctype, typename vdsttype, int nlane,
+    template <typename src, typename dst> typename operators_t,
+    typename... params_t>
+int elemwise_binop(const srctype* src1_data, size_t src1_step,
+    const srctype* src2_data, size_t src2_step,
+    dsttype* dst_data, size_t dst_step,
+    int width, int height, params_t... params)
+{
+    src1_step /= sizeof(srctype);
+    src2_step /= sizeof(srctype);
+    dst_step /= sizeof(dsttype);
+
+    operators_t<srctype, dsttype> operators;
+
+    int i, j;
+    for (i = 0; i < height; ++i) {
+        const srctype* src1_row = src1_data + (src1_step * i);
+        const srctype* src2_row = src2_data + (src2_step * i);
+        dsttype* dst_row = dst_data + (dst_step * i);
+
+        j = 0;
+        for (; j + nlane <= width; j += nlane) {
+            register vsrctype vs1 = *(vsrctype*)(src1_row + j);
+            register vsrctype vs2 = *(vsrctype*)(src2_row + j);
+
+            *(vdsttype*)(dst_row + j) = operators.vector(vs1, vs2, params...);
+        }
+        for (; j < width; j++)
+            dst_row[j] = operators.scalar(src1_row[j], src2_row[j], params...);
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+template <typename srctype, typename dsttype,
+    typename vsrctype, typename vdsttype, int nlane,
+    template <typename src, typename dst> typename operators_t,
+    typename... params_t>
+int elemwise_unop(const srctype* src_data, size_t src_step,
+    dsttype* dst_data, size_t dst_step,
+    int width, int height, params_t... params)
+{
+    src_step /= sizeof(srctype);
+    dst_step /= sizeof(dsttype);
+
+    operators_t<srctype, dsttype> operators;
+
+    int i, j;
+    for (i = 0; i < height; ++i) {
+        const srctype* src_row = src_data + (src_step * i);
+        dsttype* dst_row = dst_data + (dst_step * i);
+
+        j = 0;
+        for (; j + nlane <= width; j += nlane) {
+            register vsrctype vs = *(vsrctype*)(src_row + j);
+
+            *(vdsttype*)(dst_row + j) = operators.vector(vs, params...);
+        }
+        for (; j < width; j++)
+            dst_row[j] = operators.scalar(src_row[j], params...);
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// ################ add ################
+
+template <typename src, typename dst>
+struct operators_add_t {
+    inline uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return __nds__v_ukadd8(a, b); }
+    inline uchar scalar(uchar a, uchar b) { return __nds__ukadd8(a, b); }
+
+    inline int8x8_t vector(int8x8_t a, int8x8_t b) { return __nds__v_kadd8(a, b); }
+    inline schar scalar(schar a, schar b) { return __nds__kadd8(a, b); }
+
+    inline uint16x4_t vector(uint16x4_t a, uint16x4_t b) { return __nds__v_ukadd16(a, b); }
+    inline ushort scalar(ushort a, ushort b) { return __nds__ukadd16(a, b); }
+
+    inline int16x4_t vector(int16x4_t a, int16x4_t b) { return __nds__v_kadd16(a, b); }
+    inline short scalar(short a, short b) { return __nds__kadd16(a, b); }
+
+    inline int32x2_t vector(int32x2_t a, int32x2_t b) { return __nds__v_kadd32(a, b); }
+    inline int scalar(int a, int b) { return __nds__kadd32(a, b); }
+};
+
+#undef cv_hal_add8u
+#define cv_hal_add8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_add_t>)
+
+#undef cv_hal_add8s
+#define cv_hal_add8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_add_t>)
+
+#undef cv_hal_add16u
+#define cv_hal_add16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_add_t>)
+
+#undef cv_hal_add16s
+#define cv_hal_add16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_add_t>)
+
+#undef cv_hal_add32s
+#define cv_hal_add32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_add_t>)
+
+// ################ sub ################
+
+template <typename src, typename dst>
+struct operators_sub_t {
+    inline uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return __nds__v_uksub8(a, b); }
+    inline uchar scalar(uchar a, uchar b) { return __nds__uksub8(a, b); }
+
+    inline int8x8_t vector(int8x8_t a, int8x8_t b) { return __nds__v_ksub8(a, b); }
+    inline schar scalar(schar a, schar b) { return __nds__ksub8(a, b); }
+
+    inline uint16x4_t vector(uint16x4_t a, uint16x4_t b) { return __nds__v_uksub16(a, b); }
+    inline ushort scalar(ushort a, ushort b) { return __nds__uksub16(a, b); }
+
+    inline int16x4_t vector(int16x4_t a, int16x4_t b) { return __nds__v_ksub16(a, b); }
+    inline short scalar(short a, short b) { return __nds__ksub16(a, b); }
+
+    inline int32x2_t vector(int32x2_t a, int32x2_t b) { return __nds__v_ksub32(a, b); }
+    inline int scalar(int a, int b) { return __nds__ksub32(a, b); }
+};
+
+#undef cv_hal_sub8u
+#define cv_hal_sub8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_sub_t>)
+
+#undef cv_hal_sub8s
+#define cv_hal_sub8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_sub_t>)
+
+#undef cv_hal_sub16u
+#define cv_hal_sub16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_sub_t>)
+
+#undef cv_hal_sub16s
+#define cv_hal_sub16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_sub_t>)
+
+#undef cv_hal_sub32s
+#define cv_hal_sub32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_sub_t>)
+
+// ################ max ################
+
+template <typename src, typename dst>
+struct operators_max_t {
+    inline uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return __nds__v_umax8(a, b); }
+    inline uchar scalar(uchar a, uchar b) { return __nds__umax8(a, b); }
+
+    inline int8x8_t vector(int8x8_t a, int8x8_t b) { return __nds__v_smax8(a, b); }
+    inline schar scalar(schar a, schar b) { return __nds__smax8(a, b); }
+
+    inline uint16x4_t vector(uint16x4_t a, uint16x4_t b) { return __nds__v_umax16(a, b); }
+    inline ushort scalar(ushort a, ushort b) { return __nds__umax16(a, b); }
+
+    inline int16x4_t vector(int16x4_t a, int16x4_t b) { return __nds__v_smax16(a, b); }
+    inline short scalar(short a, short b) { return __nds__smax16(a, b); }
+
+    inline int32x2_t vector(int32x2_t a, int32x2_t b) { return __nds__v_smax32(a, b); }
+    inline int scalar(int a, int b) { return __nds__smax32(a, b); }
+};
+
+#undef cv_hal_max8u
+#define cv_hal_max8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_max_t>)
+
+#undef cv_hal_max8s
+#define cv_hal_max8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_max_t>)
+
+#undef cv_hal_max16u
+#define cv_hal_max16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_max_t>)
+
+#undef cv_hal_max16s
+#define cv_hal_max16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_max_t>)
+
+#undef cv_hal_max32s
+#define cv_hal_max32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_max_t>)
+
+// ################ min ################
+
+template <typename src, typename dst>
+struct operators_min_t {
+    inline uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return __nds__v_umin8(a, b); }
+    inline uchar scalar(uchar a, uchar b) { return __nds__umin8(a, b); }
+
+    inline int8x8_t vector(int8x8_t a, int8x8_t b) { return __nds__v_smin8(a, b); }
+    inline schar scalar(schar a, schar b) { return __nds__smin8(a, b); }
+
+    inline uint16x4_t vector(uint16x4_t a, uint16x4_t b) { return __nds__v_umin16(a, b); }
+    inline ushort scalar(ushort a, ushort b) { return __nds__umin16(a, b); }
+
+    inline int16x4_t vector(int16x4_t a, int16x4_t b) { return __nds__v_smin16(a, b); }
+    inline short scalar(short a, short b) { return __nds__smin16(a, b); }
+
+    inline int32x2_t vector(int32x2_t a, int32x2_t b) { return __nds__v_smin32(a, b); }
+    inline int scalar(int a, int b) { return __nds__smin32(a, b); }
+};
+
+#undef cv_hal_min8u
+#define cv_hal_min8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_min_t>)
+
+#undef cv_hal_min8s
+#define cv_hal_min8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_min_t>)
+
+#undef cv_hal_min16u
+#define cv_hal_min16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_min_t>)
+
+#undef cv_hal_min16s
+#define cv_hal_min16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_min_t>)
+
+#undef cv_hal_min32s
+#define cv_hal_min32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_min_t>)
+
+// ################ absdiff ################
+
+template <typename src, typename dst>
+struct operators_absdiff_t {
+    inline uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return __nds__v_uksub8(__nds__v_umax8(a, b), __nds__v_umin8(a, b)); }
+    inline uchar scalar(uchar a, uchar b) { return __nds__uksub8(__nds__umax8(a, b), __nds__umin8(a, b)); }
+
+    inline int8x8_t vector(int8x8_t a, int8x8_t b) { return __nds__v_ksub8(__nds__v_smax8(a, b), __nds__v_smin8(a, b)); }
+    inline schar scalar(schar a, schar b) { return __nds__ksub8(__nds__smax8(a, b), __nds__smin8(a, b)); }
+
+    inline uint16x4_t vector(uint16x4_t a, uint16x4_t b) { return __nds__v_uksub16(__nds__v_umax16(a, b), __nds__v_umin16(a, b)); }
+    inline ushort scalar(ushort a, ushort b) { return __nds__uksub16(__nds__umax16(a, b), __nds__umin16(a, b)); }
+
+    inline int16x4_t vector(int16x4_t a, int16x4_t b) { return __nds__v_ksub16(__nds__v_smax16(a, b), __nds__v_smin16(a, b)); }
+    inline short scalar(short a, short b) { return __nds__ksub16(__nds__smax16(a, b), __nds__smin16(a, b)); }
+
+    inline int32x2_t vector(int32x2_t a, int32x2_t b) { return __nds__v_ksub32(__nds__v_smax32(a, b), __nds__v_smin32(a, b)); }
+    inline int scalar(int a, int b) { return __nds__ksub32(__nds__smax32(a, b), __nds__smin32(a, b)); }
+};
+
+#undef cv_hal_absdiff8u
+#define cv_hal_absdiff8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_absdiff_t>)
+
+#undef cv_hal_absdiff8s
+#define cv_hal_absdiff8s (cv::ndsrvp::elemwise_binop<schar, schar, int8x8_t, int8x8_t, 8, cv::ndsrvp::operators_absdiff_t>)
+
+#undef cv_hal_absdiff16u
+#define cv_hal_absdiff16u (cv::ndsrvp::elemwise_binop<ushort, ushort, uint16x4_t, uint16x4_t, 4, cv::ndsrvp::operators_absdiff_t>)
+
+#undef cv_hal_absdiff16s
+#define cv_hal_absdiff16s (cv::ndsrvp::elemwise_binop<short, short, int16x4_t, int16x4_t, 4, cv::ndsrvp::operators_absdiff_t>)
+
+#undef cv_hal_absdiff32s
+#define cv_hal_absdiff32s (cv::ndsrvp::elemwise_binop<int, int, int32x2_t, int32x2_t, 2, cv::ndsrvp::operators_absdiff_t>)
+
+// ################ bitwise ################
+
+template <typename src, typename dst>
+struct operators_and_t {
+    inline uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return a & b; }
+    inline uchar scalar(uchar a, uchar b) { return a & b; }
+};
+
+#undef cv_hal_and8u
+#define cv_hal_and8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_and_t>)
+
+template <typename src, typename dst>
+struct operators_or_t {
+    inline uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return a | b; }
+    inline uchar scalar(uchar a, uchar b) { return a | b; }
+};
+
+#undef cv_hal_or8u
+#define cv_hal_or8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_or_t>)
+
+template <typename src, typename dst>
+struct operators_xor_t {
+    inline uint8x8_t vector(uint8x8_t a, uint8x8_t b) { return a ^ b; }
+    inline uchar scalar(uchar a, uchar b) { return a ^ b; }
+};
+
+#undef cv_hal_xor8u
+#define cv_hal_xor8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_xor_t>)
+
+template <typename src, typename dst>
+struct operators_not_t {
+    inline uint8x8_t vector(uint8x8_t a) { return ~a; }
+    inline uchar scalar(uchar a) { return ~a; }
+};
+
+#undef cv_hal_not8u
+#define cv_hal_not8u (cv::ndsrvp::elemwise_unop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_not_t>)
+
+// ################ cmp ################
+
+template <typename src, typename dst>
+struct operators_cmp_t {
+    inline uint8x8_t vector(uint8x8_t a, uint8x8_t b, int operation)
+    {
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            return __nds__v_ucmpeq8(a, b);
+        case CV_HAL_CMP_GT:
+            return __nds__v_ucmplt8(b, a);
+        case CV_HAL_CMP_GE:
+            return __nds__v_ucmple8(b, a);
+        case CV_HAL_CMP_LT:
+            return __nds__v_ucmplt8(a, b);
+        case CV_HAL_CMP_LE:
+            return __nds__v_ucmple8(a, b);
+        case CV_HAL_CMP_NE:
+            return ~__nds__v_ucmpeq8(a, b);
+        default:
+            return uint8x8_t();
+        }
+    }
+    inline uchar scalar(uchar a, uchar b, int operation)
+    {
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            return __nds__cmpeq8(a, b);
+        case CV_HAL_CMP_GT:
+            return __nds__ucmplt8(b, a);
+        case CV_HAL_CMP_GE:
+            return __nds__ucmple8(b, a);
+        case CV_HAL_CMP_LT:
+            return __nds__ucmplt8(a, b);
+        case CV_HAL_CMP_LE:
+            return __nds__ucmple8(a, b);
+        case CV_HAL_CMP_NE:
+            return ~__nds__cmpeq8(a, b);
+        default:
+            return 0;
+        }
+    }
+
+    inline uint8x8_t vector(int8x8_t a, int8x8_t b, int operation)
+    {
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            return __nds__v_scmpeq8(a, b);
+        case CV_HAL_CMP_GT:
+            return __nds__v_scmplt8(b, a);
+        case CV_HAL_CMP_GE:
+            return __nds__v_scmple8(b, a);
+        case CV_HAL_CMP_LT:
+            return __nds__v_scmplt8(a, b);
+        case CV_HAL_CMP_LE:
+            return __nds__v_scmple8(a, b);
+        case CV_HAL_CMP_NE:
+            return ~__nds__v_scmpeq8(a, b);
+        default:
+            return uint8x8_t();
+        }
+    }
+    inline uchar scalar(schar a, schar b, int operation)
+    {
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            return __nds__cmpeq8(a, b);
+        case CV_HAL_CMP_GT:
+            return __nds__scmplt8(b, a);
+        case CV_HAL_CMP_GE:
+            return __nds__scmple8(b, a);
+        case CV_HAL_CMP_LT:
+            return __nds__scmplt8(a, b);
+        case CV_HAL_CMP_LE:
+            return __nds__scmple8(a, b);
+        case CV_HAL_CMP_NE:
+            return ~__nds__cmpeq8(a, b);
+        default:
+            return 0;
+        }
+    }
+
+    inline uint8x4_t vector(uint16x4_t a, uint16x4_t b, int operation)
+    {
+        register unsigned long cmp;
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            cmp = (unsigned long)__nds__v_ucmpeq16(a, b) >> 8;
+            break;
+        case CV_HAL_CMP_GT:
+            cmp = (unsigned long)__nds__v_ucmplt16(b, a) >> 8;
+            break;
+        case CV_HAL_CMP_GE:
+            cmp = (unsigned long)__nds__v_ucmple16(b, a) >> 8;
+            break;
+        case CV_HAL_CMP_LT:
+            cmp = (unsigned long)__nds__v_ucmplt16(a, b) >> 8;
+            break;
+        case CV_HAL_CMP_LE:
+            cmp = (unsigned long)__nds__v_ucmple16(a, b) >> 8;
+            break;
+        case CV_HAL_CMP_NE:
+            cmp = ~(unsigned long)__nds__v_ucmpeq16(a, b) >> 8;
+            break;
+        default:
+            return uint8x4_t();
+        }
+        return (uint8x4_t)(unsigned int)__nds__pkbb16(cmp >> 32, cmp);
+    }
+    inline uchar scalar(ushort a, ushort b, int operation)
+    {
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            return __nds__cmpeq16(a, b);
+        case CV_HAL_CMP_GT:
+            return __nds__ucmplt16(b, a);
+        case CV_HAL_CMP_GE:
+            return __nds__ucmple16(b, a);
+        case CV_HAL_CMP_LT:
+            return __nds__ucmplt16(a, b);
+        case CV_HAL_CMP_LE:
+            return __nds__ucmple16(a, b);
+        case CV_HAL_CMP_NE:
+            return ~__nds__cmpeq16(a, b);
+        default:
+            return 0;
+        }
+    }
+
+    inline uint8x4_t vector(int16x4_t a, int16x4_t b, int operation)
+    {
+        register unsigned long cmp;
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            cmp = (unsigned long)__nds__v_scmpeq16(a, b) >> 8;
+            break;
+        case CV_HAL_CMP_GT:
+            cmp = (unsigned long)__nds__v_scmplt16(b, a) >> 8;
+            break;
+        case CV_HAL_CMP_GE:
+            cmp = (unsigned long)__nds__v_scmple16(b, a) >> 8;
+            break;
+        case CV_HAL_CMP_LT:
+            cmp = (unsigned long)__nds__v_scmplt16(a, b) >> 8;
+            break;
+        case CV_HAL_CMP_LE:
+            cmp = (unsigned long)__nds__v_scmple16(a, b) >> 8;
+            break;
+        case CV_HAL_CMP_NE:
+            cmp = ~(unsigned long)__nds__v_scmpeq16(a, b) >> 8;
+            break;
+        default:
+            return uint8x4_t();
+        }
+        return (uint8x4_t)(unsigned int)__nds__pkbb16(cmp >> 32, cmp);
+    }
+    inline uchar scalar(short a, short b, int operation)
+    {
+        switch (operation) {
+        case CV_HAL_CMP_EQ:
+            return __nds__cmpeq16(a, b);
+        case CV_HAL_CMP_GT:
+            return __nds__scmplt16(b, a);
+        case CV_HAL_CMP_GE:
+            return __nds__scmple16(b, a);
+        case CV_HAL_CMP_LT:
+            return __nds__scmplt16(a, b);
+        case CV_HAL_CMP_LE:
+            return __nds__scmple16(a, b);
+        case CV_HAL_CMP_NE:
+            return ~__nds__cmpeq16(a, b);
+        default:
+            return 0;
+        }
+    }
+};
+
+#undef cv_hal_cmp8u
+#define cv_hal_cmp8u (cv::ndsrvp::elemwise_binop<uchar, uchar, uint8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_cmp_t>)
+
+#undef cv_hal_cmp8s
+#define cv_hal_cmp8s (cv::ndsrvp::elemwise_binop<schar, uchar, int8x8_t, uint8x8_t, 8, cv::ndsrvp::operators_cmp_t>)
+
+#undef cv_hal_cmp16u
+#define cv_hal_cmp16u (cv::ndsrvp::elemwise_binop<ushort, uchar, uint16x4_t, uint8x4_t, 4, cv::ndsrvp::operators_cmp_t>)
+
+#undef cv_hal_cmp16s
+#define cv_hal_cmp16s (cv::ndsrvp::elemwise_binop<short, uchar, int16x4_t, uint8x4_t, 4, cv::ndsrvp::operators_cmp_t>)
+
+// ################ split ################
+
+/*template <typename srctype, typename vsrctype, int nlane>
+int split(const srctype* src_data, srctype** dst_data, int len, int cn)
+{
+    int i, j;
+    for (i = 0; i < len; i++) {
+        for (j = 0; j < cn; j++) {
+            dst_data[j][i] = src_data[i * cn + j];
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+#undef cv_hal_split8u
+#define cv_hal_split8u (cv::ndsrvp::split<uchar, uint8x8_t, 8>)
+
+#undef cv_hal_split16u
+#define cv_hal_split16u (cv::ndsrvp::split<ushort, uint16x4_t, 4>)
+
+#undef cv_hal_split32s
+#define cv_hal_split32s (cv::ndsrvp::split<int, int32x2_t, 2>)*/
+
+// ################ merge ################
+
+/*template <typename srctype, typename vsrctype, int nlane>
+int merge(const srctype** src_data, srctype* dst_data, int len, int cn)
+{
+    int i, j;
+    for (i = 0; i < len; i++) {
+        for (j = 0; j < cn; j++) {
+            dst_data[i * cn + j] = src_data[j][i];
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+#undef cv_hal_merge8u
+#define cv_hal_merge8u (cv::ndsrvp::merge<uchar, uint8x8_t, 8>)
+
+#undef cv_hal_merge16u
+#define cv_hal_merge16u (cv::ndsrvp::merge<ushort, uint16x4_t, 4>)
+
+#undef cv_hal_merge32s
+#define cv_hal_merge32s (cv::ndsrvp::merge<int, int32x2_t, 2>)*/
+
+} // namespace ndsrvp
+
+} // namespace cv
+
+#endif
diff --git a/3rdparty/ndsrvp/include/features2d.hpp b/3rdparty/ndsrvp/include/features2d.hpp
new file mode 100644
index 000000000000..1f6180a7958f
--- /dev/null
+++ b/3rdparty/ndsrvp/include/features2d.hpp
@@ -0,0 +1,8 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#ifndef OPENCV_NDSRVP_FEATURES2D_HPP
+#define OPENCV_NDSRVP_FEATURES2D_HPP
+
+#endif
diff --git a/3rdparty/ndsrvp/include/imgproc.hpp b/3rdparty/ndsrvp/include/imgproc.hpp
new file mode 100644
index 000000000000..3a572172a831
--- /dev/null
+++ b/3rdparty/ndsrvp/include/imgproc.hpp
@@ -0,0 +1,71 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#ifndef OPENCV_NDSRVP_IMGPROC_HPP
+#define OPENCV_NDSRVP_IMGPROC_HPP
+
+namespace cv {
+
+// ################ remap ################
+
+void remap(InputArray _src, OutputArray _dst,
+    InputArray _map1, InputArray _map2,
+    int interpolation, int borderType, const Scalar& borderValue);
+
+namespace ndsrvp {
+
+enum InterpolationMasks {
+    INTER_BITS = 5,
+    INTER_BITS2 = INTER_BITS * 2,
+    INTER_TAB_SIZE = 1 << INTER_BITS,
+    INTER_TAB_SIZE2 = INTER_TAB_SIZE * INTER_TAB_SIZE
+};
+
+// ################ integral ################
+
+int integral(int depth, int sdepth, int sqdepth,
+    const uchar* src, size_t _srcstep,
+    uchar* sum, size_t _sumstep,
+    uchar* sqsum, size_t,
+    uchar* tilted, size_t,
+    int width, int height, int cn);
+
+#undef cv_hal_integral
+#define cv_hal_integral (cv::ndsrvp::integral)
+
+// ################ warpAffine ################
+
+int warpAffine(int src_type,
+    const uchar* src_data, size_t src_step, int src_width, int src_height,
+    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
+    const double M[6], int interpolation, int borderType, const double borderValue[4]);
+
+#undef cv_hal_warpAffine
+#define cv_hal_warpAffine (cv::ndsrvp::warpAffine)
+
+// ################ warpPerspective ################
+
+int warpPerspective(int src_type,
+    const uchar* src_data, size_t src_step, int src_width, int src_height,
+    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
+    const double M[9], int interpolation, int borderType, const double borderValue[4]);
+
+#undef cv_hal_warpPerspective
+#define cv_hal_warpPerspective (cv::ndsrvp::warpPerspective)
+
+// ################ threshold ################
+
+int threshold(const uchar* src_data, size_t src_step,
+    uchar* dst_data, size_t dst_step,
+    int width, int height, int depth, int cn,
+    double thresh, double maxValue, int thresholdType);
+
+#undef cv_hal_threshold
+#define cv_hal_threshold (cv::ndsrvp::threshold)
+
+} // namespace ndsrvp
+
+} // namespace cv
+
+#endif
diff --git a/3rdparty/ndsrvp/ndsrvp_hal.hpp b/3rdparty/ndsrvp/ndsrvp_hal.hpp
new file mode 100644
index 000000000000..7f126365205a
--- /dev/null
+++ b/3rdparty/ndsrvp/ndsrvp_hal.hpp
@@ -0,0 +1,15 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#ifndef OPENCV_NDSRVP_HAL_HPP
+#define OPENCV_NDSRVP_HAL_HPP
+
+#include "opencv2/core/mat.hpp"
+#include <nds_intrinsic.h>
+
+#include "include/core.hpp"
+#include "include/imgproc.hpp"
+#include "include/features2d.hpp"
+
+#endif
diff --git a/3rdparty/ndsrvp/src/integral.cpp b/3rdparty/ndsrvp/src/integral.cpp
new file mode 100644
index 000000000000..37030a8d4c95
--- /dev/null
+++ b/3rdparty/ndsrvp/src/integral.cpp
@@ -0,0 +1,210 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#include "ndsrvp_hal.hpp"
+
+namespace cv {
+
+namespace ndsrvp {
+
+int integral(int depth, int sdepth, int sqdepth,
+    const uchar* src, size_t _srcstep,
+    uchar* _sum, size_t _sumstep,
+    uchar* _sqsum, size_t,
+    uchar* _tilted, size_t,
+    int width, int height, int cn)
+{
+    // 8-bit unsigned integer, 32-bit signed integer only
+    if (!(depth == CV_8U && sdepth == CV_32S))
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    // too small image
+    if (!(width >> 8 || height >> 8 || cn == 4))
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    int* sum = (int*)_sum;
+    double* sqsum = (double*)_sqsum;
+    int* tilted = (int*)_tilted;
+
+    if (sqsum || tilted || cn > 4)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    sqdepth = sqdepth;
+    width *= cn;
+
+    memset(sum, 0, (width + cn) * sizeof(int));
+
+    if (cn == 1) {
+        for (int i = 0; i < height; ++i) {
+            const uchar* src_row = src + _srcstep * i;
+            int* prev_sum_row = (int*)((uchar*)sum + _sumstep * i) + cn;
+            int* sum_row = (int*)((uchar*)sum + _sumstep * (i + 1)) + cn;
+
+            sum_row[-1] = 0;
+
+            int32x2_t prev = { 0, 0 };
+            int j = 0;
+
+            for (; j + 8 <= width; j += 8) {
+                unsigned long vs8x8 = *(unsigned long*)(src_row + j); 
+
+                unsigned long vs810 = __nds__zunpkd810(vs8x8);
+                unsigned long vs832 = __nds__zunpkd832(vs8x8);
+
+                int16x4_t vs16x4 = (int16x4_t)__nds__pkbb32(vs832, vs810);
+
+                vs16x4 += (int16x4_t)((unsigned long)vs16x4 << 16); // gcc vector extension
+                vs16x4 += (int16x4_t)((unsigned long)vs16x4 << 32); // '+' is add16
+
+                //*(int32x2_t*)(sum_row + j) = (int32x2_t) { vs16x4[0], vs16x4[1] } + *(int32x2_t*)(prev_sum_row + j) + prev;
+                //*(int32x2_t*)(sum_row + j + 2) = (int32x2_t) { vs16x4[2], vs16x4[3] } + *(int32x2_t*)(prev_sum_row + j + 2) + prev;
+                // performance loss for unknown reason, commented out, use the following code instead
+
+                sum_row[j] = prev_sum_row[j] + prev[0] + vs16x4[0];
+                sum_row[j + 1] = prev_sum_row[j + 1] + prev[1] + vs16x4[1];
+                sum_row[j + 2] = prev_sum_row[j + 2] + prev[0] + vs16x4[2];
+                sum_row[j + 3] = prev_sum_row[j + 3] + prev[1] + vs16x4[3];
+
+                prev += vs16x4[3]; // prev += (int32x2_t){vs16x4[3], vs16x4[3]};
+
+                vs16x4 = (int16x4_t)__nds__pktt32(vs832, vs810);
+
+                vs16x4 += (int16x4_t)((unsigned long)vs16x4 << 16);
+                vs16x4 += (int16x4_t)((unsigned long)vs16x4 << 32);
+
+                //*(int32x2_t*)(sum_row + j + 4) = (int32x2_t) { vs16x4[0], vs16x4[1] } + *(int32x2_t*)(prev_sum_row + j + 4) + prev;
+                //*(int32x2_t*)(sum_row + j + 6) = (int32x2_t) { vs16x4[2], vs16x4[3] } + *(int32x2_t*)(prev_sum_row + j + 6) + prev;
+                // performance loss for unknown reason, commented out, use the following code instead
+
+                sum_row[j + 4] = prev_sum_row[j + 4] + prev[0] + vs16x4[0];
+                sum_row[j + 5] = prev_sum_row[j + 5] + prev[1] + vs16x4[1];
+                sum_row[j + 6] = prev_sum_row[j + 6] + prev[0] + vs16x4[2];
+                sum_row[j + 7] = prev_sum_row[j + 7] + prev[1] + vs16x4[3];
+
+                prev += vs16x4[3];
+            }
+
+            for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
+                sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
+        }
+    } else if (cn == 2) {
+        for (int i = 0; i < height; ++i) {
+            const uchar* src_row = src + _srcstep * i;
+            int* prev_sum_row = (int*)((uchar*)sum + _sumstep * i) + cn;
+            int* sum_row = (int*)((uchar*)sum + _sumstep * (i + 1)) + cn;
+
+            sum_row[-1] = sum_row[-2] = 0;
+
+            int32x2_t prev = { 0, 0 };
+            int j = 0;
+            for (; j + 8 <= width; j += 8) {
+                uint8x8_t vs8x8 = *(uint8x8_t*)(src_row + j);
+
+                uint16x4_t vs16x4_1 = __nds__v_zunpkd820(vs8x8);
+                uint16x4_t vs16x4_2 = __nds__v_zunpkd831(vs8x8);
+
+                vs16x4_1 += (int16x4_t)((unsigned long)vs16x4_1 << 16);
+                vs16x4_1 += (int16x4_t)((unsigned long)vs16x4_1 << 32);
+
+                vs16x4_2 += (int16x4_t)((unsigned long)vs16x4_2 << 16);
+                vs16x4_2 += (int16x4_t)((unsigned long)vs16x4_2 << 32);
+
+                *(int32x2_t*)(sum_row + j) = (int32x2_t) { vs16x4_1[0], vs16x4_2[0] } + *(int32x2_t*)(prev_sum_row + j) + prev;
+                *(int32x2_t*)(sum_row + j + 2) = (int32x2_t) { vs16x4_1[1], vs16x4_2[1] } + *(int32x2_t*)(prev_sum_row + j + 2) + prev;
+                *(int32x2_t*)(sum_row + j + 2 * 2) = (int32x2_t) { vs16x4_1[2], vs16x4_2[2] } + *(int32x2_t*)(prev_sum_row + j + 2 * 2) + prev;
+                *(int32x2_t*)(sum_row + j + 2 * 3) = (int32x2_t) { vs16x4_1[3], vs16x4_2[3] } + *(int32x2_t*)(prev_sum_row + j + 2 * 3) + prev;
+
+                prev += (int32x2_t) { vs16x4_1[3], vs16x4_2[3] };
+            }
+
+            for (int v2 = sum_row[j - 1] - prev_sum_row[j - 1],
+                     v1 = sum_row[j - 2] - prev_sum_row[j - 2];
+                 j < width; j += 2) {
+                sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
+                sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
+            }
+        }
+    } else if (cn == 3) {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+        /* disabled because of unaligned memory access, difficulty in vectorization, etc.
+        for (int i = 0; i < height; ++i) {
+            const uchar* src_row = src + _srcstep * i;
+            int* prev_sum_row = (int*)((uchar*)sum + _sumstep * i) + cn;
+            int* sum_row = (int*)((uchar*)sum + _sumstep * (i + 1)) + cn;
+
+            sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
+
+            int32x2_t prev_ptr[2] = { { 0, 0 }, { 0, 0 } };
+            int j = 0;
+            for (; j + 3 <= width; j += 3) {
+                //uint8x4_t vs8x4 = *(uint8x4_t*)(src_row + j);
+                // performance loss for unknown reason, commented out, use the following code instead
+
+                uint8x4_t vs8x4 = (uint8x4_t){ src_row[j], src_row[j + 1], src_row[j + 2], 0};
+
+                // [ 0 | 2 | 1 | 3 ]
+                int16x4_t vs16x4 = (int16x4_t)__nds__pkbb32(__nds__zunpkd831((unsigned int)vs8x4), __nds__zunpkd820((unsigned int)vs8x4));
+
+                // [ b | t | b | t ]
+                prev_ptr[0] += (int32x2_t)__nds__pkbb16(0, (unsigned long)vs16x4);
+                prev_ptr[1] += (int32x2_t)__nds__pktt16(0, (unsigned long)vs16x4);
+
+                //*(int32x4_t*)(sum_row + j) = *(int32x4_t*)(prev_sum_row + j) + *(int32x4_t*)prev_ptr;
+                // performance loss for unknown reason, commented out, use the following code instead
+
+                sum_row[j] = prev_sum_row[j] + prev_ptr[0][0];
+                sum_row[j + 1] = prev_sum_row[j + 1] + prev_ptr[0][1];
+                sum_row[j + 2] = prev_sum_row[j + 2] + prev_ptr[1][0];
+            }
+
+            for (int v3 = sum_row[j - 1] - prev_sum_row[j - 1],
+                     v2 = sum_row[j - 2] - prev_sum_row[j - 2],
+                     v1 = sum_row[j - 3] - prev_sum_row[j - 3];
+                 j < width; j += 3) {
+                sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
+                sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
+                sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
+            }
+        }*/
+    } else if (cn == 4) {
+        for (int i = 0; i < height; ++i) {
+            const uchar* src_row = src + _srcstep * i;
+            int* prev_sum_row = (int*)((uchar*)sum + _sumstep * i) + cn;
+            int* sum_row = (int*)((uchar*)sum + _sumstep * (i + 1)) + cn;
+
+            sum_row[-1] = sum_row[-2] = sum_row[-3] = sum_row[-4] = 0;
+
+            int32x2_t prev_ptr[2] = { { 0, 0 }, { 0, 0 } };
+            int j = 0;
+            for (; j + 4 <= width; j += 4) {
+                uint8x4_t vs8x4 = *(uint8x4_t*)(src_row + j);
+
+                // [ 0 | 2 | 1 | 3 ]
+                int16x4_t vs16x4 = (int16x4_t)__nds__pkbb32(__nds__zunpkd831((unsigned int)vs8x4), __nds__zunpkd820((unsigned int)vs8x4));
+
+                // [ b | t | b | t ]
+                prev_ptr[0] += (int32x2_t)__nds__pkbb16(0, (unsigned long)vs16x4);
+                prev_ptr[1] += (int32x2_t)__nds__pktt16(0, (unsigned long)vs16x4);
+
+                *(int32x4_t*)(sum_row + j) = *(int32x4_t*)(prev_sum_row + j) + *(int32x4_t*)prev_ptr;
+            }
+
+            for (int v4 = sum_row[j - 1] - prev_sum_row[j - 1],
+                     v3 = sum_row[j - 2] - prev_sum_row[j - 2],
+                     v2 = sum_row[j - 3] - prev_sum_row[j - 3],
+                     v1 = sum_row[j - 4] - prev_sum_row[j - 4];
+                 j < width; j += 4) {
+                sum_row[j] = (v1 += src_row[j]) + prev_sum_row[j];
+                sum_row[j + 1] = (v2 += src_row[j + 1]) + prev_sum_row[j + 1];
+                sum_row[j + 2] = (v3 += src_row[j + 2]) + prev_sum_row[j + 2];
+                sum_row[j + 3] = (v4 += src_row[j + 3]) + prev_sum_row[j + 3];
+            }
+        }
+    }
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
diff --git a/3rdparty/ndsrvp/src/threshold.cpp b/3rdparty/ndsrvp/src/threshold.cpp
new file mode 100644
index 000000000000..06de591feff8
--- /dev/null
+++ b/3rdparty/ndsrvp/src/threshold.cpp
@@ -0,0 +1,177 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+
+namespace cv {
+
+namespace ndsrvp {
+
+template <typename type, typename vtype>
+class operators_threshold_t {
+public:
+    virtual ~operators_threshold_t() {};
+    virtual inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval)
+    {
+        (void)src;
+        (void)thresh;
+        (void)maxval;
+        CV_Error(cv::Error::StsBadArg, "");
+        return vtype();
+    }
+    virtual inline type scalar(const type& src, const type& thresh, const type& maxval)
+    {
+        (void)src;
+        (void)thresh;
+        (void)maxval;
+        CV_Error(cv::Error::StsBadArg, "");
+        return type();
+    }
+};
+
+template <typename type, typename vtype>
+class opThreshBinary : public operators_threshold_t<type, vtype> {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+    {
+        return (vtype)__nds__bpick((long)maxval, (long)0, (long)(src > thresh));
+    }
+    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    {
+        return src > thresh ? maxval : 0;
+    }
+};
+
+template <typename type, typename vtype>
+class opThreshBinaryInv : public operators_threshold_t<type, vtype> {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+    {
+        return (vtype)__nds__bpick((long)0, (long)maxval, (long)(src > thresh));
+    }
+    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    {
+        return src > thresh ? 0 : maxval;
+    }
+};
+
+template <typename type, typename vtype>
+class opThreshTrunc : public operators_threshold_t<type, vtype> {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+    {
+        (void)maxval;
+        return (vtype)__nds__bpick((long)thresh, (long)src, (long)(src > thresh));
+    }
+    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    {
+        (void)maxval;
+        return src > thresh ? thresh : src;
+    }
+};
+
+template <typename type, typename vtype>
+class opThreshToZero : public operators_threshold_t<type, vtype> {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+    {
+        (void)maxval;
+        return (vtype)__nds__bpick((long)src, (long)0, (long)(src > thresh));
+    }
+    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    {
+        (void)maxval;
+        return src > thresh ? src : 0;
+    }
+};
+
+template <typename type, typename vtype>
+class opThreshToZeroInv : public operators_threshold_t<type, vtype> {
+    inline vtype vector(const vtype& src, const vtype& thresh, const vtype& maxval) override
+    {
+        (void)maxval;
+        return (vtype)__nds__bpick((long)0, (long)src, (long)(src > thresh));
+    }
+    inline type scalar(const type& src, const type& thresh, const type& maxval) override
+    {
+        (void)maxval;
+        return src > thresh ? 0 : src;
+    }
+};
+
+template <typename type, typename vtype, int nlane>
+static void threshold_op(const type* src_data, size_t src_step,
+    type* dst_data, size_t dst_step,
+    int width, int height, int cn,
+    type thresh, type maxval, int thtype)
+{
+    int i, j;
+    width *= cn;
+    src_step /= sizeof(type);
+    dst_step /= sizeof(type);
+    vtype vthresh;
+    vtype vmaxval;
+    for (i = 0; i < nlane; i++) {
+        vthresh[i] = thresh;
+        vmaxval[i] = maxval;
+    }
+
+    operators_threshold_t<type, vtype>* op;
+    switch (thtype) {
+    case CV_HAL_THRESH_BINARY:
+        op = new opThreshBinary<type, vtype>();
+        break;
+    case CV_HAL_THRESH_BINARY_INV:
+        op = new opThreshBinaryInv<type, vtype>();
+        break;
+    case CV_HAL_THRESH_TRUNC:
+        op = new opThreshTrunc<type, vtype>();
+        break;
+    case CV_HAL_THRESH_TOZERO:
+        op = new opThreshToZero<type, vtype>();
+        break;
+    case CV_HAL_THRESH_TOZERO_INV:
+        op = new opThreshToZeroInv<type, vtype>();
+        break;
+    default:
+        CV_Error(cv::Error::StsBadArg, "");
+        return;
+    }
+
+    for (i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) {
+        for (j = 0; j <= width - nlane; j += nlane) {
+            vtype vs = *(vtype*)(src_data + j);
+            *(vtype*)(dst_data + j) = op->vector(vs, vthresh, vmaxval);
+        }
+        for (; j < width; j++) {
+            dst_data[j] = op->scalar(src_data[j], thresh, maxval);
+        }
+    }
+
+    delete op;
+    return;
+}
+
+int threshold(const uchar* src_data, size_t src_step,
+    uchar* dst_data, size_t dst_step,
+    int width, int height, int depth, int cn,
+    double thresh, double maxValue, int thresholdType)
+{
+    if (width <= 255 && height <= 255) // slower at small size
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    if (depth == CV_8U) {
+        threshold_op<uchar, uint8x8_t, 8>((uchar*)src_data, src_step, (uchar*)dst_data, dst_step, width, height, cn, (uchar)thresh, (uchar)maxValue, thresholdType);
+        return CV_HAL_ERROR_OK;
+    } else if (depth == CV_16S) {
+        threshold_op<short, int16x4_t, 4>((short*)src_data, src_step, (short*)dst_data, dst_step, width, height, cn, (short)thresh, (short)maxValue, thresholdType);
+        return CV_HAL_ERROR_OK;
+    } else if (depth == CV_16U) {
+        threshold_op<ushort, uint16x4_t, 4>((ushort*)src_data, src_step, (ushort*)dst_data, dst_step, width, height, cn, (ushort)thresh, (ushort)maxValue, thresholdType);
+        return CV_HAL_ERROR_OK;
+    } else {
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
diff --git a/3rdparty/ndsrvp/src/warpAffine.cpp b/3rdparty/ndsrvp/src/warpAffine.cpp
new file mode 100644
index 000000000000..d54e4dc23700
--- /dev/null
+++ b/3rdparty/ndsrvp/src/warpAffine.cpp
@@ -0,0 +1,153 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+
+namespace cv {
+
+namespace ndsrvp {
+
+class WarpAffineInvoker : public ParallelLoopBody {
+public:
+    WarpAffineInvoker(const Mat& _src, Mat& _dst, int _interpolation, int _borderType,
+        const Scalar& _borderValue, int* _adelta, int* _bdelta, const double* _M)
+        : ParallelLoopBody()
+        , src(_src)
+        , dst(_dst)
+        , interpolation(_interpolation)
+        , borderType(_borderType)
+        , borderValue(_borderValue)
+        , adelta(_adelta)
+        , bdelta(_bdelta)
+        , M(_M)
+    {
+    }
+
+    virtual void operator()(const Range& range) const CV_OVERRIDE
+    {
+        const int BLOCK_SZ = 64;
+        AutoBuffer<short, 0> __XY(BLOCK_SZ * BLOCK_SZ * 2), __A(BLOCK_SZ * BLOCK_SZ);
+        short *XY = __XY.data(), *A = __A.data();
+        const int AB_BITS = MAX(10, (int)INTER_BITS);
+        const int AB_SCALE = 1 << AB_BITS;
+        int round_delta = interpolation == CV_HAL_INTER_NEAREST ? AB_SCALE / 2 : AB_SCALE / INTER_TAB_SIZE / 2, x, y, x1, y1;
+
+        int bh0 = std::min(BLOCK_SZ / 2, dst.rows);
+        int bw0 = std::min(BLOCK_SZ * BLOCK_SZ / bh0, dst.cols);
+        bh0 = std::min(BLOCK_SZ * BLOCK_SZ / bw0, dst.rows);
+
+        for (y = range.start; y < range.end; y += bh0) {
+            for (x = 0; x < dst.cols; x += bw0) {
+                int bw = std::min(bw0, dst.cols - x);
+                int bh = std::min(bh0, range.end - y);
+
+                Mat _XY(bh, bw, CV_16SC2, XY);
+                Mat dpart(dst, Rect(x, y, bw, bh));
+
+                for (y1 = 0; y1 < bh; y1++) {
+                    short* xy = XY + y1 * bw * 2;
+                    int X0 = saturate_cast<int>((M[1] * (y + y1) + M[2]) * AB_SCALE) + round_delta;
+                    int Y0 = saturate_cast<int>((M[4] * (y + y1) + M[5]) * AB_SCALE) + round_delta;
+
+                    if (interpolation == CV_HAL_INTER_NEAREST) {
+                        x1 = 0;
+
+                        for (; x1 < bw; x1 += 2) {
+                            int32x2_t vX = { X0 + adelta[x + x1], X0 + adelta[x + x1 + 1] };
+                            int32x2_t vY = { Y0 + bdelta[x + x1], Y0 + bdelta[x + x1 + 1] };
+
+                            vX = __nds__v_sclip32(__nds__v_sra32(vX, AB_BITS), 15);
+                            vY = __nds__v_sclip32(__nds__v_sra32(vY, AB_BITS), 15);
+
+                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
+                        }
+
+                        for (; x1 < bw; x1++) {
+                            int X = (X0 + adelta[x + x1]) >> AB_BITS;
+                            int Y = (Y0 + bdelta[x + x1]) >> AB_BITS;
+                            xy[x1 * 2] = saturate_cast<short>(X);
+                            xy[x1 * 2 + 1] = saturate_cast<short>(Y);
+                        }
+                    } else {
+                        short* alpha = A + y1 * bw;
+                        x1 = 0;
+
+                        const int INTER_MASK = INTER_TAB_SIZE - 1;
+                        const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
+                        for (; x1 < bw; x1 += 2) {
+                            int32x2_t vX = { X0 + adelta[x + x1], X0 + adelta[x + x1 + 1] };
+                            int32x2_t vY = { Y0 + bdelta[x + x1], Y0 + bdelta[x + x1 + 1] };
+                            vX = __nds__v_sra32(vX, (AB_BITS - INTER_BITS));
+                            vY = __nds__v_sra32(vY, (AB_BITS - INTER_BITS));
+
+                            int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
+                            int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
+
+                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
+
+                            uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
+                            *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
+                        }
+
+                        for (; x1 < bw; x1++) {
+                            int X = (X0 + adelta[x + x1]) >> (AB_BITS - INTER_BITS);
+                            int Y = (Y0 + bdelta[x + x1]) >> (AB_BITS - INTER_BITS);
+                            xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
+                            xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
+                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE + (X & (INTER_TAB_SIZE - 1)));
+                        }
+                    }
+                }
+
+                if (interpolation == CV_HAL_INTER_NEAREST)
+                    remap(src, dpart, _XY, Mat(), interpolation, borderType, borderValue);
+                else {
+                    Mat _matA(bh, bw, CV_16U, A);
+                    remap(src, dpart, _XY, _matA, interpolation, borderType, borderValue);
+                }
+            }
+        }
+    }
+
+private:
+    Mat src;
+    Mat dst;
+    int interpolation, borderType;
+    Scalar borderValue;
+    int *adelta, *bdelta;
+    const double* M;
+};
+
+int warpAffine(int src_type,
+    const uchar* src_data, size_t src_step, int src_width, int src_height,
+    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
+    const double M[6], int interpolation, int borderType, const double borderValue[4])
+{
+    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
+    Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
+
+    int x;
+    AutoBuffer<int> _abdelta(dst.cols * 2);
+    int *adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
+    const int AB_BITS = MAX(10, (int)INTER_BITS);
+    const int AB_SCALE = 1 << AB_BITS;
+
+    for (x = 0; x < dst.cols; x++) {
+        adelta[x] = saturate_cast<int>(M[0] * x * AB_SCALE);
+        bdelta[x] = saturate_cast<int>(M[3] * x * AB_SCALE);
+    }
+
+    Range range(0, dst.rows);
+    WarpAffineInvoker invoker(src, dst, interpolation, borderType,
+        Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]),
+        adelta, bdelta, M);
+    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
diff --git a/3rdparty/ndsrvp/src/warpPerspective.cpp b/3rdparty/ndsrvp/src/warpPerspective.cpp
new file mode 100644
index 000000000000..b4fa423ed7ee
--- /dev/null
+++ b/3rdparty/ndsrvp/src/warpPerspective.cpp
@@ -0,0 +1,159 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.	
+
+#include "ndsrvp_hal.hpp"
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc/hal/interface.h"
+
+namespace cv {
+
+namespace ndsrvp {
+
+class WarpPerspectiveInvoker : public ParallelLoopBody {
+public:
+    WarpPerspectiveInvoker(const Mat& _src, Mat& _dst, const double* _M, int _interpolation,
+        int _borderType, const Scalar& _borderValue)
+        : ParallelLoopBody()
+        , src(_src)
+        , dst(_dst)
+        , M(_M)
+        , interpolation(_interpolation)
+        , borderType(_borderType)
+        , borderValue(_borderValue)
+    {
+    }
+
+    virtual void operator()(const Range& range) const CV_OVERRIDE
+    {
+        const int BLOCK_SZ = 32;
+        short XY[BLOCK_SZ * BLOCK_SZ * 2], A[BLOCK_SZ * BLOCK_SZ];
+        int x, y, y1, width = dst.cols, height = dst.rows;
+
+        int bh0 = std::min(BLOCK_SZ / 2, height);
+        int bw0 = std::min(BLOCK_SZ * BLOCK_SZ / bh0, width);
+        bh0 = std::min(BLOCK_SZ * BLOCK_SZ / bw0, height);
+
+        for (y = range.start; y < range.end; y += bh0) {
+            for (x = 0; x < width; x += bw0) {
+                int bw = std::min(bw0, width - x);
+                int bh = std::min(bh0, range.end - y); // height
+
+                Mat _XY(bh, bw, CV_16SC2, XY);
+                Mat dpart(dst, Rect(x, y, bw, bh));
+
+                for (y1 = 0; y1 < bh; y1++) {
+                    short* xy = XY + y1 * bw * 2;
+                    double X0 = M[0] * x + M[1] * (y + y1) + M[2];
+                    double Y0 = M[3] * x + M[4] * (y + y1) + M[5];
+                    double W0 = M[6] * x + M[7] * (y + y1) + M[8];
+
+                    if (interpolation == CV_HAL_INTER_NEAREST) {
+                        int x1 = 0;
+
+                        for (; x1 < bw; x1 += 2) {
+                            double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
+                            W1 = W1 ? 1. / W1 : 0;
+                            W2 = W2 ? 1. / W2 : 0;
+                            double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
+                            double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
+                            double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
+                            double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
+
+                            int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
+                            int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
+
+                            vX = __nds__v_sclip32(vX, 15);
+                            vY = __nds__v_sclip32(vY, 15);
+
+                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vY, (unsigned long)vX);
+                        }
+
+                        for (; x1 < bw; x1++) {
+                            double W = W0 + M[6] * x1;
+                            W = W ? 1. / W : 0;
+                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
+                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
+                            int X = saturate_cast<int>(fX);
+                            int Y = saturate_cast<int>(fY);
+
+                            xy[x1 * 2] = saturate_cast<short>(X);
+                            xy[x1 * 2 + 1] = saturate_cast<short>(Y);
+                        }
+                    } else {
+                        short* alpha = A + y1 * bw;
+                        int x1 = 0;
+
+                        const int INTER_MASK = INTER_TAB_SIZE - 1;
+                        const uint32x2_t vmask = { INTER_MASK, INTER_MASK };
+                        for (; x1 < bw; x1 += 2) {
+                            double W1 = W0 + M[6] * x1, W2 = W1 + M[6];
+                            W1 = W1 ? INTER_TAB_SIZE / W1 : 0;
+                            W2 = W2 ? INTER_TAB_SIZE / W2 : 0;
+                            double fX1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W1));
+                            double fX2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * (x1 + 1)) * W2));
+                            double fY1 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W1));
+                            double fY2 = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * (x1 + 1)) * W2));
+
+                            int32x2_t vX = {saturate_cast<int>(fX1), saturate_cast<int>(fX2)};
+                            int32x2_t vY = {saturate_cast<int>(fY1), saturate_cast<int>(fY2)};
+
+                            int32x2_t vx = __nds__v_sclip32(__nds__v_sra32(vX, INTER_BITS), 15);
+                            int32x2_t vy = __nds__v_sclip32(__nds__v_sra32(vY, INTER_BITS), 15);
+
+                            *(uint16x4_t*)(xy + x1 * 2) = (uint16x4_t)__nds__pkbb16((unsigned long)vy, (unsigned long)vx);
+
+                            uint32x2_t valpha = __nds__v_uadd32(__nds__v_sll32((uint32x2_t)(vY & vmask), INTER_BITS), (uint32x2_t)(vX & vmask));
+                            *(int16x2_t*)(alpha + x1) = (int16x2_t) { (short)(valpha[0]), (short)(valpha[1]) };
+                        }
+
+                        for (; x1 < bw; x1++) {
+                            double W = W0 + M[6] * x1;
+                            W = W ? INTER_TAB_SIZE / W : 0;
+                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0] * x1) * W));
+                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3] * x1) * W));
+                            int X = saturate_cast<int>(fX);
+                            int Y = saturate_cast<int>(fY);
+
+                            xy[x1 * 2] = saturate_cast<short>(X >> INTER_BITS);
+                            xy[x1 * 2 + 1] = saturate_cast<short>(Y >> INTER_BITS);
+                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE + (X & (INTER_TAB_SIZE - 1)));
+                        }
+                    }
+                }
+
+                if (interpolation == CV_HAL_INTER_NEAREST)
+                    remap(src, dpart, _XY, Mat(), interpolation, borderType, borderValue);
+                else {
+                    Mat _matA(bh, bw, CV_16U, A);
+                    remap(src, dpart, _XY, _matA, interpolation, borderType, borderValue);
+                }
+            }
+        }
+    }
+
+private:
+    Mat src;
+    Mat dst;
+    const double* M;
+    int interpolation, borderType;
+    Scalar borderValue;
+};
+
+int warpPerspective(int src_type,
+    const uchar* src_data, size_t src_step, int src_width, int src_height,
+    uchar* dst_data, size_t dst_step, int dst_width, int dst_height,
+    const double M[9], int interpolation, int borderType, const double borderValue[4])
+{
+    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
+    Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
+
+    Range range(0, dst.rows);
+    WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]));
+    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
+    return CV_HAL_ERROR_OK;
+}
+
+} // namespace ndsrvp
+
+} // namespace cv
diff --git a/3rdparty/openexr/IlmImf/ImfConvert.cpp b/3rdparty/openexr/IlmImf/ImfConvert.cpp
index cce7163c196b..62c0305935ce 100644
--- a/3rdparty/openexr/IlmImf/ImfConvert.cpp
+++ b/3rdparty/openexr/IlmImf/ImfConvert.cpp
@@ -107,7 +107,7 @@ floatToUint (float f)
     if (isNegative (f) || isNan (f))
 	return 0;
 
-    if (isInfinity (f) || f > UINT_MAX)
+    if (isInfinity (f) || f > (float)UINT_MAX)
 	return UINT_MAX;
 
     return (unsigned int) f;
diff --git a/3rdparty/openjpeg/CMakeLists.txt b/3rdparty/openjpeg/CMakeLists.txt
index d3db9e8c47ea..188381f1e282 100644
--- a/3rdparty/openjpeg/CMakeLists.txt
+++ b/3rdparty/openjpeg/CMakeLists.txt
@@ -16,6 +16,7 @@ ocv_warnings_disable(CMAKE_C_FLAGS
     -Wunused-but-set-variable # clang15
     -Wmissing-prototypes # clang, function opj_t1_ht_decode_cblk
     -Wmissing-declarations # gcc, function opj_t1_ht_decode_cblk
+    -Wdocumentation # clang
 )
 
 #-----------------------------------------------------------------------------
diff --git a/3rdparty/openjpeg/openjp2/bio.c b/3rdparty/openjpeg/openjp2/bio.c
index 09dcd7f52492..8106df754edb 100644
--- a/3rdparty/openjpeg/openjp2/bio.c
+++ b/3rdparty/openjpeg/openjp2/bio.c
@@ -43,12 +43,6 @@
 /** @name Local static functions */
 /*@{*/
 
-/**
-Write a bit
-@param bio BIO handle
-@param b Bit to write (0 or 1)
-*/
-static void opj_bio_putbit(opj_bio_t *bio, OPJ_UINT32 b);
 /**
 Read a bit
 @param bio BIO handle
@@ -100,16 +94,6 @@ static OPJ_BOOL opj_bio_bytein(opj_bio_t *bio)
     return OPJ_TRUE;
 }
 
-static void opj_bio_putbit(opj_bio_t *bio, OPJ_UINT32 b)
-{
-    if (bio->ct == 0) {
-        opj_bio_byteout(
-            bio); /* MSD: why not check the return value of this function ? */
-    }
-    bio->ct--;
-    bio->buf |= b << bio->ct;
-}
-
 static OPJ_UINT32 opj_bio_getbit(opj_bio_t *bio)
 {
     if (bio->ct == 0) {
@@ -162,6 +146,16 @@ void opj_bio_init_dec(opj_bio_t *bio, OPJ_BYTE *bp, OPJ_UINT32 len)
     bio->ct = 0;
 }
 
+void opj_bio_putbit(opj_bio_t *bio, OPJ_UINT32 b)
+{
+    if (bio->ct == 0) {
+        opj_bio_byteout(
+            bio); /* MSD: why not check the return value of this function ? */
+    }
+    bio->ct--;
+    bio->buf |= b << bio->ct;
+}
+
 void opj_bio_write(opj_bio_t *bio, OPJ_UINT32 v, OPJ_UINT32 n)
 {
     OPJ_INT32 i;
diff --git a/3rdparty/openjpeg/openjp2/bio.h b/3rdparty/openjpeg/openjp2/bio.h
index 448fdda2190c..d482f9ead5a7 100644
--- a/3rdparty/openjpeg/openjp2/bio.h
+++ b/3rdparty/openjpeg/openjp2/bio.h
@@ -106,6 +106,14 @@ Write bits
 @param n Number of bits to write
 */
 void opj_bio_write(opj_bio_t *bio, OPJ_UINT32 v, OPJ_UINT32 n);
+
+/**
+Write a bit
+@param bio BIO handle
+@param b Bit to write (0 or 1)
+*/
+void opj_bio_putbit(opj_bio_t *bio, OPJ_UINT32 b);
+
 /**
 Read bits
 @param bio BIO handle
diff --git a/3rdparty/openjpeg/openjp2/dwt.c b/3rdparty/openjpeg/openjp2/dwt.c
index abc500eca6bb..6b18c5dd6e9d 100644
--- a/3rdparty/openjpeg/openjp2/dwt.c
+++ b/3rdparty/openjpeg/openjp2/dwt.c
@@ -2083,7 +2083,9 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
     OPJ_SIZE_T h_mem_size;
     int num_threads;
 
-    if (numres == 1U) {
+    /* Not entirely sure for the return code of w == 0 which is triggered per */
+    /* https://github.com/uclouvain/openjpeg/issues/1505 */
+    if (numres == 1U || w == 0) {
         return OPJ_TRUE;
     }
     num_threads = opj_thread_pool_get_thread_count(tp);
diff --git a/3rdparty/openjpeg/openjp2/ht_dec.c b/3rdparty/openjpeg/openjp2/ht_dec.c
index 1eb4d525f115..a554b24a6a2c 100644
--- a/3rdparty/openjpeg/openjp2/ht_dec.c
+++ b/3rdparty/openjpeg/openjp2/ht_dec.c
@@ -55,6 +55,16 @@
 #define OPJ_COMPILER_GNUC
 #endif
 
+#if defined(OPJ_COMPILER_MSVC) && defined(_M_ARM64) \
+    && !defined(_M_ARM64EC) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
+    && !defined(__INTEL_COMPILER) && !defined(__clang__)
+#define MSVC_NEON_INTRINSICS
+#endif
+
+#ifdef MSVC_NEON_INTRINSICS
+#include <arm64_neon.h>
+#endif
+
 //************************************************************************/
 /** @brief Displays the error message for disabling the decoding of SPP and
   * MRP passes
@@ -69,8 +79,11 @@ static OPJ_BOOL only_cleanup_pass_is_decoded = OPJ_FALSE;
 static INLINE
 OPJ_UINT32 population_count(OPJ_UINT32 val)
 {
-#ifdef OPJ_COMPILER_MSVC
+#if defined(OPJ_COMPILER_MSVC) && (defined(_M_IX86) || defined(_M_AMD64))
     return (OPJ_UINT32)__popcnt(val);
+#elif defined(OPJ_COMPILER_MSVC) && defined(MSVC_NEON_INTRINSICS)
+    const __n64 temp = neon_cnt(__uint64ToN64_v(val));
+    return neon_addv8(temp).n8_i8[0];
 #elif (defined OPJ_COMPILER_GNUC)
     return (OPJ_UINT32)__builtin_popcount(val);
 #else
@@ -294,7 +307,7 @@ void mel_decode(dec_mel_t *melp)
   *  @param [in]  scup is the length of MEL+VLC segments
   */
 static INLINE
-void mel_init(dec_mel_t *melp, OPJ_UINT8* bbuf, int lcup, int scup)
+OPJ_BOOL mel_init(dec_mel_t *melp, OPJ_UINT8* bbuf, int lcup, int scup)
 {
     int num;
     int i;
@@ -316,7 +329,9 @@ void mel_init(dec_mel_t *melp, OPJ_UINT8* bbuf, int lcup, int scup)
         OPJ_UINT64 d;
         int d_bits;
 
-        assert(melp->unstuff == OPJ_FALSE || melp->data[0] <= 0x8F);
+        if (melp->unstuff == OPJ_TRUE && melp->data[0] > 0x8F) {
+            return OPJ_FALSE;
+        }
         d = (melp->size > 0) ? *melp->data : 0xFF; // if buffer is consumed
         // set data to 0xFF
         if (melp->size == 1) {
@@ -332,6 +347,7 @@ void mel_init(dec_mel_t *melp, OPJ_UINT8* bbuf, int lcup, int scup)
     }
     melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
     // is the MSB
+    return OPJ_TRUE;
 }
 
 //************************************************************************/
@@ -1063,7 +1079,7 @@ static OPJ_BOOL opj_t1_allocate_buffers(
         if (flagssize > t1->flagssize) {
 
             opj_aligned_free(t1->flags);
-            t1->flags = (opj_flag_t*) opj_aligned_malloc(flagssize);
+            t1->flags = (opj_flag_t*) opj_aligned_malloc(flagssize * sizeof(opj_flag_t));
             if (!t1->flags) {
                 /* FIXME event manager error callback */
                 return OPJ_FALSE;
@@ -1071,7 +1087,7 @@ static OPJ_BOOL opj_t1_allocate_buffers(
         }
         t1->flagssize = flagssize;
 
-        memset(t1->flags, 0, flagssize);
+        memset(t1->flags, 0, flagssize * sizeof(opj_flag_t));
     }
 
     t1->w = w;
@@ -1080,6 +1096,26 @@ static OPJ_BOOL opj_t1_allocate_buffers(
     return OPJ_TRUE;
 }
 
+/**
+Decode 1 HT code-block
+@param t1 T1 handle
+@param cblk Code-block coding parameters
+@param orient
+@param roishift Region of interest shifting value
+@param cblksty Code-block style
+@param p_manager the event manager
+@param p_manager_mutex mutex for the event manager
+@param check_pterm whether PTERM correct termination should be checked
+*/
+OPJ_BOOL opj_t1_ht_decode_cblk(opj_t1_t *t1,
+                               opj_tcd_cblk_dec_t* cblk,
+                               OPJ_UINT32 orient,
+                               OPJ_UINT32 roishift,
+                               OPJ_UINT32 cblksty,
+                               opj_event_mgr_t *p_manager,
+                               opj_mutex_t* p_manager_mutex,
+                               OPJ_BOOL check_pterm);
+
 //************************************************************************/
 /** @brief Decodes one codeblock, processing the cleanup, siginificance
   *         propagation, and magnitude refinement pass
@@ -1187,6 +1223,9 @@ OPJ_BOOL opj_t1_ht_decode_cblk(opj_t1_t *t1,
 
         /* Concatenate all chunks */
         cblkdata = t1->cblkdatabuffer;
+        if (cblkdata == NULL) {
+            return OPJ_FALSE;
+        }
         cblk_len = 0;
         for (i = 0; i < cblk->numchunks; i++) {
             memcpy(cblkdata + cblk_len, cblk->chunks[i].data, cblk->chunks[i].len);
@@ -1374,7 +1413,17 @@ OPJ_BOOL opj_t1_ht_decode_cblk(opj_t1_t *t1,
     }
 
     // init structures
-    mel_init(&mel, coded_data, lcup, scup);
+    if (mel_init(&mel, coded_data, lcup, scup) == OPJ_FALSE) {
+        if (p_manager_mutex) {
+            opj_mutex_lock(p_manager_mutex);
+        }
+        opj_event_msg(p_manager, EVT_ERROR, "Malformed HT codeblock. "
+                      "Incorrect MEL segment sequence.\n");
+        if (p_manager_mutex) {
+            opj_mutex_unlock(p_manager_mutex);
+        }
+        return OPJ_FALSE;
+    }
     rev_init(&vlc, coded_data, lcup, scup);
     frwd_init(&magsgn, coded_data, lcup - scup, 0xFF);
     if (num_passes > 1) { // needs to be tested
diff --git a/3rdparty/openjpeg/openjp2/j2k.c b/3rdparty/openjpeg/openjp2/j2k.c
index bcce3165686e..c0551870b2bd 100644
--- a/3rdparty/openjpeg/openjp2/j2k.c
+++ b/3rdparty/openjpeg/openjp2/j2k.c
@@ -2333,10 +2333,8 @@ static OPJ_BOOL opj_j2k_read_siz(opj_j2k_t *p_j2k,
     }
 
     /* Compute the number of tiles */
-    l_cp->tw = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)(l_image->x1 - l_cp->tx0),
-                                           (OPJ_INT32)l_cp->tdx);
-    l_cp->th = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)(l_image->y1 - l_cp->ty0),
-                                           (OPJ_INT32)l_cp->tdy);
+    l_cp->tw = opj_uint_ceildiv(l_image->x1 - l_cp->tx0, l_cp->tdx);
+    l_cp->th = opj_uint_ceildiv(l_image->y1 - l_cp->ty0, l_cp->tdy);
 
     /* Check that the number of tiles is valid */
     if (l_cp->tw == 0 || l_cp->th == 0 || l_cp->tw > 65535 / l_cp->th) {
@@ -2353,12 +2351,12 @@ static OPJ_BOOL opj_j2k_read_siz(opj_j2k_t *p_j2k,
             (p_j2k->m_specific_param.m_decoder.m_start_tile_x - l_cp->tx0) / l_cp->tdx;
         p_j2k->m_specific_param.m_decoder.m_start_tile_y =
             (p_j2k->m_specific_param.m_decoder.m_start_tile_y - l_cp->ty0) / l_cp->tdy;
-        p_j2k->m_specific_param.m_decoder.m_end_tile_x = (OPJ_UINT32)opj_int_ceildiv((
-                    OPJ_INT32)(p_j2k->m_specific_param.m_decoder.m_end_tile_x - l_cp->tx0),
-                (OPJ_INT32)l_cp->tdx);
-        p_j2k->m_specific_param.m_decoder.m_end_tile_y = (OPJ_UINT32)opj_int_ceildiv((
-                    OPJ_INT32)(p_j2k->m_specific_param.m_decoder.m_end_tile_y - l_cp->ty0),
-                (OPJ_INT32)l_cp->tdy);
+        p_j2k->m_specific_param.m_decoder.m_end_tile_x = opj_uint_ceildiv(
+                    p_j2k->m_specific_param.m_decoder.m_end_tile_x - l_cp->tx0,
+                    l_cp->tdx);
+        p_j2k->m_specific_param.m_decoder.m_end_tile_y = opj_uint_ceildiv(
+                    p_j2k->m_specific_param.m_decoder.m_end_tile_y - l_cp->ty0,
+                    l_cp->tdy);
     } else {
         p_j2k->m_specific_param.m_decoder.m_start_tile_x = 0;
         p_j2k->m_specific_param.m_decoder.m_start_tile_y = 0;
@@ -3961,9 +3959,12 @@ static OPJ_BOOL opj_j2k_merge_ppm(opj_cp_t *p_cp, opj_event_mgr_t * p_manager)
                     opj_read_bytes(l_data, &l_N_ppm, 4);
                     l_data += 4;
                     l_data_size -= 4;
-                    l_ppm_data_size +=
-                        l_N_ppm; /* can't overflow, max 256 markers of max 65536 bytes, that is when PPM markers are not corrupted which is checked elsewhere */
 
+                    if (l_ppm_data_size > UINT_MAX - l_N_ppm) {
+                        opj_event_msg(p_manager, EVT_ERROR, "Too large value for Nppm\n");
+                        return OPJ_FALSE;
+                    }
+                    l_ppm_data_size += l_N_ppm;
                     if (l_data_size >= l_N_ppm) {
                         l_data_size -= l_N_ppm;
                         l_data += l_N_ppm;
@@ -6726,7 +6727,7 @@ OPJ_BOOL opj_j2k_set_threads(opj_j2k_t *j2k, OPJ_UINT32 num_threads)
     return OPJ_FALSE;
 }
 
-static int opj_j2k_get_default_thread_count()
+static int opj_j2k_get_default_thread_count(void)
 {
     const char* num_threads_str = getenv("OPJ_NUM_THREADS");
     int num_cpus;
@@ -7666,6 +7667,27 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k,
         return OPJ_FALSE;
     }
 
+    if (parameters->cp_fixed_alloc) {
+        if (parameters->cp_matrice == NULL) {
+            opj_event_msg(p_manager, EVT_ERROR,
+                          "cp_fixed_alloc set, but cp_matrice missing\n");
+            return OPJ_FALSE;
+        }
+
+        if (parameters->tcp_numlayers > J2K_TCD_MATRIX_MAX_LAYER_COUNT) {
+            opj_event_msg(p_manager, EVT_ERROR,
+                          "tcp_numlayers when cp_fixed_alloc set should not exceed %d\n",
+                          J2K_TCD_MATRIX_MAX_LAYER_COUNT);
+            return OPJ_FALSE;
+        }
+        if (parameters->numresolution > J2K_TCD_MATRIX_MAX_RESOLUTION_COUNT) {
+            opj_event_msg(p_manager, EVT_ERROR,
+                          "numresolution when cp_fixed_alloc set should not exceed %d\n",
+                          J2K_TCD_MATRIX_MAX_RESOLUTION_COUNT);
+            return OPJ_FALSE;
+        }
+    }
+
     p_j2k->m_specific_param.m_encoder.m_nb_comps = image->numcomps;
 
     /* keep a link to cp so that we can destroy it later in j2k_destroy_compress */
@@ -7796,7 +7818,7 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k,
                                        image->comps[0].h * image->comps[0].prec) /
                                       ((double)parameters->tcp_rates[parameters->tcp_numlayers - 1] * 8 *
                                        image->comps[0].dx * image->comps[0].dy));
-            if (temp_size > INT_MAX) {
+            if (temp_size > (OPJ_FLOAT32)INT_MAX) {
                 parameters->max_cs_size = INT_MAX;
             } else {
                 parameters->max_cs_size = (int) floor(temp_size);
@@ -7885,15 +7907,17 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k,
     cp->m_specific_param.m_enc.m_max_comp_size = (OPJ_UINT32)
             parameters->max_comp_size;
     cp->rsiz = parameters->rsiz;
-    cp->m_specific_param.m_enc.m_disto_alloc = (OPJ_UINT32)
-            parameters->cp_disto_alloc & 1u;
-    cp->m_specific_param.m_enc.m_fixed_alloc = (OPJ_UINT32)
-            parameters->cp_fixed_alloc & 1u;
-    cp->m_specific_param.m_enc.m_fixed_quality = (OPJ_UINT32)
-            parameters->cp_fixed_quality & 1u;
-
-    /* mod fixed_quality */
-    if (parameters->cp_fixed_alloc && parameters->cp_matrice) {
+    if (parameters->cp_fixed_alloc) {
+        cp->m_specific_param.m_enc.m_quality_layer_alloc_strategy = FIXED_LAYER;
+    } else if (parameters->cp_fixed_quality) {
+        cp->m_specific_param.m_enc.m_quality_layer_alloc_strategy =
+            FIXED_DISTORTION_RATIO;
+    } else {
+        cp->m_specific_param.m_enc.m_quality_layer_alloc_strategy =
+            RATE_DISTORTION_RATIO;
+    }
+
+    if (parameters->cp_fixed_alloc) {
         size_t array_size = (size_t)parameters->tcp_numlayers *
                             (size_t)parameters->numresolution * 3 * sizeof(OPJ_INT32);
         cp->m_specific_param.m_enc.m_matrice = (OPJ_INT32 *) opj_malloc(array_size);
@@ -7931,21 +7955,24 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k,
 
         /* UniPG>> */
 #ifdef USE_JPWL
-        cp->comment = (char*)opj_malloc(clen + strlen(version) + 11);
+        const size_t cp_comment_buf_size = clen + strlen(version) + 11;
+        cp->comment = (char*)opj_malloc(cp_comment_buf_size);
         if (!cp->comment) {
             opj_event_msg(p_manager, EVT_ERROR,
                           "Not enough memory to allocate comment string\n");
             return OPJ_FALSE;
         }
-        sprintf(cp->comment, "%s%s with JPWL", comment, version);
+        snprintf(cp->comment, cp_comment_buf_size, "%s%s with JPWL",
+                 comment, version);
 #else
-        cp->comment = (char*)opj_malloc(clen + strlen(version) + 1);
+        const size_t cp_comment_buf_size = clen + strlen(version) + 1;
+        cp->comment = (char*)opj_malloc(cp_comment_buf_size);
         if (!cp->comment) {
             opj_event_msg(p_manager, EVT_ERROR,
                           "Not enough memory to allocate comment string\n");
             return OPJ_FALSE;
         }
-        sprintf(cp->comment, "%s%s", comment, version);
+        snprintf(cp->comment, cp_comment_buf_size, "%s%s", comment, version);
 #endif
         /* <<UniPG */
     }
@@ -7963,10 +7990,8 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k,
             opj_event_msg(p_manager, EVT_ERROR, "Invalid tile height\n");
             return OPJ_FALSE;
         }
-        cp->tw = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)(image->x1 - cp->tx0),
-                                             (OPJ_INT32)cp->tdx);
-        cp->th = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)(image->y1 - cp->ty0),
-                                             (OPJ_INT32)cp->tdy);
+        cp->tw = opj_uint_ceildiv(image->x1 - cp->tx0, cp->tdx);
+        cp->th = opj_uint_ceildiv(image->y1 - cp->ty0, cp->tdy);
         /* Check that the number of tiles is valid */
         if (cp->tw > 65535 / cp->th) {
             opj_event_msg(p_manager, EVT_ERROR,
@@ -8051,22 +8076,25 @@ OPJ_BOOL opj_j2k_setup_encoder(opj_j2k_t *p_j2k,
 
     for (tileno = 0; tileno < cp->tw * cp->th; tileno++) {
         opj_tcp_t *tcp = &cp->tcps[tileno];
+        const OPJ_BOOL fixed_distoratio =
+            cp->m_specific_param.m_enc.m_quality_layer_alloc_strategy ==
+            FIXED_DISTORTION_RATIO;
         tcp->numlayers = (OPJ_UINT32)parameters->tcp_numlayers;
 
         for (j = 0; j < tcp->numlayers; j++) {
             if (OPJ_IS_CINEMA(cp->rsiz) || OPJ_IS_IMF(cp->rsiz)) {
-                if (cp->m_specific_param.m_enc.m_fixed_quality) {
+                if (fixed_distoratio) {
                     tcp->distoratio[j] = parameters->tcp_distoratio[j];
                 }
                 tcp->rates[j] = parameters->tcp_rates[j];
             } else {
-                if (cp->m_specific_param.m_enc.m_fixed_quality) {       /* add fixed_quality */
+                if (fixed_distoratio) {
                     tcp->distoratio[j] = parameters->tcp_distoratio[j];
                 } else {
                     tcp->rates[j] = parameters->tcp_rates[j];
                 }
             }
-            if (!cp->m_specific_param.m_enc.m_fixed_quality &&
+            if (!fixed_distoratio &&
                     tcp->rates[j] <= 1.0) {
                 tcp->rates[j] = 0.0;    /* force lossless */
             }
@@ -10160,10 +10188,8 @@ static OPJ_BOOL opj_j2k_update_image_dimensions(opj_image_t* p_image,
             return OPJ_FALSE;
         }
 
-        l_img_comp->x0 = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)p_image->x0,
-                         (OPJ_INT32)l_img_comp->dx);
-        l_img_comp->y0 = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)p_image->y0,
-                         (OPJ_INT32)l_img_comp->dy);
+        l_img_comp->x0 = opj_uint_ceildiv(p_image->x0, l_img_comp->dx);
+        l_img_comp->y0 = opj_uint_ceildiv(p_image->y0, l_img_comp->dy);
         l_comp_x1 = opj_int_ceildiv((OPJ_INT32)p_image->x1, (OPJ_INT32)l_img_comp->dx);
         l_comp_y1 = opj_int_ceildiv((OPJ_INT32)p_image->y1, (OPJ_INT32)l_img_comp->dy);
 
@@ -10366,8 +10392,8 @@ OPJ_BOOL opj_j2k_set_decode_area(opj_j2k_t *p_j2k,
         p_j2k->m_specific_param.m_decoder.m_end_tile_x = l_cp->tw;
         p_image->x1 = l_image->x1;
     } else {
-        p_j2k->m_specific_param.m_decoder.m_end_tile_x = (OPJ_UINT32)opj_int_ceildiv(
-                    p_end_x - (OPJ_INT32)l_cp->tx0, (OPJ_INT32)l_cp->tdx);
+        p_j2k->m_specific_param.m_decoder.m_end_tile_x = opj_uint_ceildiv((
+                    OPJ_UINT32)p_end_x - l_cp->tx0, l_cp->tdx);
         p_image->x1 = (OPJ_UINT32)p_end_x;
     }
 
@@ -10390,8 +10416,8 @@ OPJ_BOOL opj_j2k_set_decode_area(opj_j2k_t *p_j2k,
         p_j2k->m_specific_param.m_decoder.m_end_tile_y = l_cp->th;
         p_image->y1 = l_image->y1;
     } else {
-        p_j2k->m_specific_param.m_decoder.m_end_tile_y = (OPJ_UINT32)opj_int_ceildiv(
-                    p_end_y - (OPJ_INT32)l_cp->ty0, (OPJ_INT32)l_cp->tdy);
+        p_j2k->m_specific_param.m_decoder.m_end_tile_y = opj_uint_ceildiv((
+                    OPJ_UINT32)p_end_y - l_cp->ty0, l_cp->tdy);
         p_image->y1 = (OPJ_UINT32)p_end_y;
     }
     /* ----- */
@@ -11078,6 +11104,10 @@ static OPJ_BOOL opj_j2k_read_SQcd_SQcc(opj_j2k_t *p_j2k,
                 l_tccp->stepsizes[l_band_no].mant = 0;
             }
         }
+
+        if (*p_header_size < l_num_band) {
+            return OPJ_FALSE;
+        }
         *p_header_size = *p_header_size - l_num_band;
     } else {
         for (l_band_no = 0; l_band_no < l_num_band; l_band_no++) {
@@ -11088,6 +11118,10 @@ static OPJ_BOOL opj_j2k_read_SQcd_SQcc(opj_j2k_t *p_j2k,
                 l_tccp->stepsizes[l_band_no].mant = l_tmp & 0x7ff;
             }
         }
+
+        if (*p_header_size < 2 * l_num_band) {
+            return OPJ_FALSE;
+        }
         *p_header_size = *p_header_size - 2 * l_num_band;
     }
 
@@ -11315,9 +11349,12 @@ static void opj_j2k_dump_MH_info(opj_j2k_t* p_j2k, FILE* out_stream)
 
     fprintf(out_stream, "Codestream info from main header: {\n");
 
-    fprintf(out_stream, "\t tx0=%d, ty0=%d\n", p_j2k->m_cp.tx0, p_j2k->m_cp.ty0);
-    fprintf(out_stream, "\t tdx=%d, tdy=%d\n", p_j2k->m_cp.tdx, p_j2k->m_cp.tdy);
-    fprintf(out_stream, "\t tw=%d, th=%d\n", p_j2k->m_cp.tw, p_j2k->m_cp.th);
+    fprintf(out_stream, "\t tx0=%" PRIu32 ", ty0=%" PRIu32 "\n", p_j2k->m_cp.tx0,
+            p_j2k->m_cp.ty0);
+    fprintf(out_stream, "\t tdx=%" PRIu32 ", tdy=%" PRIu32 "\n", p_j2k->m_cp.tdx,
+            p_j2k->m_cp.tdy);
+    fprintf(out_stream, "\t tw=%" PRIu32 ", th=%" PRIu32 "\n", p_j2k->m_cp.tw,
+            p_j2k->m_cp.th);
     opj_j2k_dump_tile_info(p_j2k->m_specific_param.m_decoder.m_default_tcp,
                            (OPJ_INT32)p_j2k->m_private_image->numcomps, out_stream);
     fprintf(out_stream, "}\n");
@@ -11947,7 +11984,7 @@ static OPJ_BOOL opj_j2k_move_data_from_codec_to_output_image(opj_j2k_t * p_j2k,
             p_image->comps[compno].data = p_j2k->m_output_image->comps[compno].data;
 #if 0
             char fn[256];
-            sprintf(fn, "/tmp/%d.raw", compno);
+            snprintf(fn, sizeof fn, "/tmp/%d.raw", compno);
             FILE *debug = fopen(fn, "wb");
             fwrite(p_image->comps[compno].data, sizeof(OPJ_INT32),
                    p_image->comps[compno].w * p_image->comps[compno].h, debug);
@@ -12073,10 +12110,8 @@ OPJ_BOOL opj_j2k_get_tile(opj_j2k_t *p_j2k,
 
         l_img_comp->factor = p_j2k->m_private_image->comps[compno].factor;
 
-        l_img_comp->x0 = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)p_image->x0,
-                         (OPJ_INT32)l_img_comp->dx);
-        l_img_comp->y0 = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)p_image->y0,
-                         (OPJ_INT32)l_img_comp->dy);
+        l_img_comp->x0 = opj_uint_ceildiv(p_image->x0, l_img_comp->dx);
+        l_img_comp->y0 = opj_uint_ceildiv(p_image->y0, l_img_comp->dy);
         l_comp_x1 = opj_int_ceildiv((OPJ_INT32)p_image->x1, (OPJ_INT32)l_img_comp->dx);
         l_comp_y1 = opj_int_ceildiv((OPJ_INT32)p_image->y1, (OPJ_INT32)l_img_comp->dy);
 
@@ -12456,12 +12491,9 @@ static void opj_get_tile_dimensions(opj_image_t * l_image,
 
     *l_width  = (OPJ_UINT32)(l_tilec->x1 - l_tilec->x0);
     *l_height = (OPJ_UINT32)(l_tilec->y1 - l_tilec->y0);
-    *l_offset_x = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)l_image->x0,
-                  (OPJ_INT32)l_img_comp->dx);
-    *l_offset_y = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)l_image->y0,
-                  (OPJ_INT32)l_img_comp->dy);
-    *l_image_width = (OPJ_UINT32)opj_int_ceildiv((OPJ_INT32)l_image->x1 -
-                     (OPJ_INT32)l_image->x0, (OPJ_INT32)l_img_comp->dx);
+    *l_offset_x = opj_uint_ceildiv(l_image->x0, l_img_comp->dx);
+    *l_offset_y = opj_uint_ceildiv(l_image->y0, l_img_comp->dy);
+    *l_image_width = opj_uint_ceildiv(l_image->x1 - l_image->x0, l_img_comp->dx);
     *l_stride = *l_image_width - *l_width;
     *l_tile_offset = ((OPJ_UINT32)l_tilec->x0 - *l_offset_x) + ((
                          OPJ_UINT32)l_tilec->y0 - *l_offset_y) * *l_image_width;
diff --git a/3rdparty/openjpeg/openjp2/j2k.h b/3rdparty/openjpeg/openjp2/j2k.h
index 04fba645affe..e0b9688a353f 100644
--- a/3rdparty/openjpeg/openjp2/j2k.h
+++ b/3rdparty/openjpeg/openjp2/j2k.h
@@ -113,6 +113,9 @@ The functions in J2K.C have for goal to read/write the several parts of the code
 
 #define J2K_MAX_POCS    32      /**< Maximum number of POCs */
 
+#define J2K_TCD_MATRIX_MAX_LAYER_COUNT 10
+#define J2K_TCD_MATRIX_MAX_RESOLUTION_COUNT 10
+
 /* ----------------------------------------------------------------------- */
 
 /**
@@ -272,7 +275,7 @@ typedef struct opj_tcp {
     OPJ_UINT32 ppt_data_size;
     /** size of ppt_data*/
     OPJ_UINT32 ppt_len;
-    /** add fixed_quality */
+    /** PSNR values */
     OPJ_FLOAT32 distoratio[100];
     /** tile-component coding parameters */
     opj_tccp_t *tccps;
@@ -314,6 +317,14 @@ typedef struct opj_tcp {
 } opj_tcp_t;
 
 
+/**
+Rate allocation strategy
+*/
+typedef enum {
+    RATE_DISTORTION_RATIO = 0,    /** allocation by rate/distortion */
+    FIXED_DISTORTION_RATIO = 1,   /** allocation by fixed distortion ratio (PSNR) (fixed quality) */
+    FIXED_LAYER = 2,              /** allocation by fixed layer (number of passes per layer / resolution / subband) */
+} J2K_QUALITY_LAYER_ALLOCATION_STRATEGY;
 
 
 typedef struct opj_encoding_param {
@@ -325,12 +336,8 @@ typedef struct opj_encoding_param {
     OPJ_INT32 *m_matrice;
     /** Flag determining tile part generation*/
     OPJ_BYTE m_tp_flag;
-    /** allocation by rate/distortion */
-    OPJ_BITFIELD m_disto_alloc : 1;
-    /** allocation by fixed layer */
-    OPJ_BITFIELD m_fixed_alloc : 1;
-    /** add fixed_quality */
-    OPJ_BITFIELD m_fixed_quality : 1;
+    /** Quality layer allocation strategy */
+    J2K_QUALITY_LAYER_ALLOCATION_STRATEGY m_quality_layer_alloc_strategy;
     /** Enabling Tile part generation*/
     OPJ_BITFIELD m_tp_on : 1;
 }
diff --git a/3rdparty/openjpeg/openjp2/jp2.c b/3rdparty/openjpeg/openjp2/jp2.c
index 17572195e391..6015190e1f5b 100644
--- a/3rdparty/openjpeg/openjp2/jp2.c
+++ b/3rdparty/openjpeg/openjp2/jp2.c
@@ -1108,7 +1108,7 @@ static OPJ_BOOL opj_jp2_apply_pclr(opj_image_t *image,
         pcol = cmap[i].pcol;
         src = old_comps[cmp].data;
         assert(src); /* verified above */
-        max = new_comps[pcol].w * new_comps[pcol].h;
+        max = new_comps[i].w * new_comps[i].h;
 
         /* Direct use: */
         if (cmap[i].mtyp == 0) {
@@ -1594,22 +1594,10 @@ static OPJ_BOOL opj_jp2_read_colr(opj_jp2_t *jp2,
     return OPJ_TRUE;
 }
 
-OPJ_BOOL opj_jp2_decode(opj_jp2_t *jp2,
-                        opj_stream_private_t *p_stream,
-                        opj_image_t* p_image,
-                        opj_event_mgr_t * p_manager)
+static OPJ_BOOL opj_jp2_apply_color_postprocessing(opj_jp2_t *jp2,
+        opj_image_t* p_image,
+        opj_event_mgr_t * p_manager)
 {
-    if (!p_image) {
-        return OPJ_FALSE;
-    }
-
-    /* J2K decoding */
-    if (! opj_j2k_decode(jp2->j2k, p_stream, p_image, p_manager)) {
-        opj_event_msg(p_manager, EVT_ERROR,
-                      "Failed to decode the codestream in the JP2 file\n");
-        return OPJ_FALSE;
-    }
-
     if (jp2->j2k->m_specific_param.m_decoder.m_numcomps_to_decode) {
         /* Bypass all JP2 component transforms */
         return OPJ_TRUE;
@@ -1620,21 +1608,6 @@ OPJ_BOOL opj_jp2_decode(opj_jp2_t *jp2,
             return OPJ_FALSE;
         }
 
-        /* Set Image Color Space */
-        if (jp2->enumcs == 16) {
-            p_image->color_space = OPJ_CLRSPC_SRGB;
-        } else if (jp2->enumcs == 17) {
-            p_image->color_space = OPJ_CLRSPC_GRAY;
-        } else if (jp2->enumcs == 18) {
-            p_image->color_space = OPJ_CLRSPC_SYCC;
-        } else if (jp2->enumcs == 24) {
-            p_image->color_space = OPJ_CLRSPC_EYCC;
-        } else if (jp2->enumcs == 12) {
-            p_image->color_space = OPJ_CLRSPC_CMYK;
-        } else {
-            p_image->color_space = OPJ_CLRSPC_UNKNOWN;
-        }
-
         if (jp2->color.jp2_pclr) {
             /* Part 1, I.5.3.4: Either both or none : */
             if (!jp2->color.jp2_pclr->cmap) {
@@ -1650,17 +1623,30 @@ OPJ_BOOL opj_jp2_decode(opj_jp2_t *jp2,
         if (jp2->color.jp2_cdef) {
             opj_jp2_apply_cdef(p_image, &(jp2->color), p_manager);
         }
-
-        if (jp2->color.icc_profile_buf) {
-            p_image->icc_profile_buf = jp2->color.icc_profile_buf;
-            p_image->icc_profile_len = jp2->color.icc_profile_len;
-            jp2->color.icc_profile_buf = NULL;
-        }
     }
 
     return OPJ_TRUE;
 }
 
+OPJ_BOOL opj_jp2_decode(opj_jp2_t *jp2,
+                        opj_stream_private_t *p_stream,
+                        opj_image_t* p_image,
+                        opj_event_mgr_t * p_manager)
+{
+    if (!p_image) {
+        return OPJ_FALSE;
+    }
+
+    /* J2K decoding */
+    if (! opj_j2k_decode(jp2->j2k, p_stream, p_image, p_manager)) {
+        opj_event_msg(p_manager, EVT_ERROR,
+                      "Failed to decode the codestream in the JP2 file\n");
+        return OPJ_FALSE;
+    }
+
+    return opj_jp2_apply_color_postprocessing(jp2, p_image, p_manager);
+}
+
 static OPJ_BOOL opj_jp2_write_jp2h(opj_jp2_t *jp2,
                                    opj_stream_private_t *stream,
                                    opj_event_mgr_t * p_manager
@@ -2843,6 +2829,8 @@ OPJ_BOOL opj_jp2_read_header(opj_stream_private_t *p_stream,
                              opj_event_mgr_t * p_manager
                             )
 {
+    int ret;
+
     /* preconditions */
     assert(jp2 != 00);
     assert(p_stream != 00);
@@ -2876,10 +2864,34 @@ OPJ_BOOL opj_jp2_read_header(opj_stream_private_t *p_stream,
         return OPJ_FALSE;
     }
 
-    return opj_j2k_read_header(p_stream,
-                               jp2->j2k,
-                               p_image,
-                               p_manager);
+    ret = opj_j2k_read_header(p_stream,
+                              jp2->j2k,
+                              p_image,
+                              p_manager);
+
+    if (p_image && *p_image) {
+        /* Set Image Color Space */
+        if (jp2->enumcs == 16) {
+            (*p_image)->color_space = OPJ_CLRSPC_SRGB;
+        } else if (jp2->enumcs == 17) {
+            (*p_image)->color_space = OPJ_CLRSPC_GRAY;
+        } else if (jp2->enumcs == 18) {
+            (*p_image)->color_space = OPJ_CLRSPC_SYCC;
+        } else if (jp2->enumcs == 24) {
+            (*p_image)->color_space = OPJ_CLRSPC_EYCC;
+        } else if (jp2->enumcs == 12) {
+            (*p_image)->color_space = OPJ_CLRSPC_CMYK;
+        } else {
+            (*p_image)->color_space = OPJ_CLRSPC_UNKNOWN;
+        }
+
+        if (jp2->color.icc_profile_buf) {
+            (*p_image)->icc_profile_buf = jp2->color.icc_profile_buf;
+            (*p_image)->icc_profile_len = jp2->color.icc_profile_len;
+            jp2->color.icc_profile_buf = NULL;
+        }
+    }
+    return ret;
 }
 
 static OPJ_BOOL opj_jp2_setup_encoding_validation(opj_jp2_t *jp2,
@@ -3123,53 +3135,7 @@ OPJ_BOOL opj_jp2_get_tile(opj_jp2_t *p_jp2,
         return OPJ_FALSE;
     }
 
-    if (p_jp2->j2k->m_specific_param.m_decoder.m_numcomps_to_decode) {
-        /* Bypass all JP2 component transforms */
-        return OPJ_TRUE;
-    }
-
-    if (!opj_jp2_check_color(p_image, &(p_jp2->color), p_manager)) {
-        return OPJ_FALSE;
-    }
-
-    /* Set Image Color Space */
-    if (p_jp2->enumcs == 16) {
-        p_image->color_space = OPJ_CLRSPC_SRGB;
-    } else if (p_jp2->enumcs == 17) {
-        p_image->color_space = OPJ_CLRSPC_GRAY;
-    } else if (p_jp2->enumcs == 18) {
-        p_image->color_space = OPJ_CLRSPC_SYCC;
-    } else if (p_jp2->enumcs == 24) {
-        p_image->color_space = OPJ_CLRSPC_EYCC;
-    } else if (p_jp2->enumcs == 12) {
-        p_image->color_space = OPJ_CLRSPC_CMYK;
-    } else {
-        p_image->color_space = OPJ_CLRSPC_UNKNOWN;
-    }
-
-    if (p_jp2->color.jp2_pclr) {
-        /* Part 1, I.5.3.4: Either both or none : */
-        if (!p_jp2->color.jp2_pclr->cmap) {
-            opj_jp2_free_pclr(&(p_jp2->color));
-        } else {
-            if (!opj_jp2_apply_pclr(p_image, &(p_jp2->color), p_manager)) {
-                return OPJ_FALSE;
-            }
-        }
-    }
-
-    /* Apply the color space if needed */
-    if (p_jp2->color.jp2_cdef) {
-        opj_jp2_apply_cdef(p_image, &(p_jp2->color), p_manager);
-    }
-
-    if (p_jp2->color.icc_profile_buf) {
-        p_image->icc_profile_buf = p_jp2->color.icc_profile_buf;
-        p_image->icc_profile_len = p_jp2->color.icc_profile_len;
-        p_jp2->color.icc_profile_buf = NULL;
-    }
-
-    return OPJ_TRUE;
+    return opj_jp2_apply_color_postprocessing(p_jp2, p_image, p_manager);
 }
 
 /* ----------------------------------------------------------------------- */
diff --git a/3rdparty/openjpeg/openjp2/libopenjp2.pc.cmake.in b/3rdparty/openjpeg/openjp2/libopenjp2.pc.cmake.in
index 62159b00a4b1..2ade312b2948 100644
--- a/3rdparty/openjpeg/openjp2/libopenjp2.pc.cmake.in
+++ b/3rdparty/openjpeg/openjp2/libopenjp2.pc.cmake.in
@@ -1,9 +1,9 @@
 prefix=@CMAKE_INSTALL_PREFIX@
-bindir=${prefix}/@OPENJPEG_INSTALL_BIN_DIR@
-mandir=${prefix}/@OPENJPEG_INSTALL_MAN_DIR@
-docdir=${prefix}/@OPENJPEG_INSTALL_DOC_DIR@
-libdir=${prefix}/@OPENJPEG_INSTALL_LIB_DIR@
-includedir=${prefix}/@OPENJPEG_INSTALL_INCLUDE_DIR@
+bindir=@bindir@
+mandir=@mandir@
+docdir=@docdir@
+libdir=@libdir@
+includedir=@includedir@
 
 Name: openjp2
 Description: JPEG2000 library (Part 1 and 2)
@@ -12,3 +12,4 @@ Version: @OPENJPEG_VERSION@
 Libs: -L${libdir} -lopenjp2
 Libs.private: -lm
 Cflags: -I${includedir}
+Cflags.private: -DOPJ_STATIC
diff --git a/3rdparty/openjpeg/openjp2/openjpeg.c b/3rdparty/openjpeg/openjp2/openjpeg.c
index 29d3ee528ccc..382d8f4f0f12 100644
--- a/3rdparty/openjpeg/openjp2/openjpeg.c
+++ b/3rdparty/openjpeg/openjp2/openjpeg.c
@@ -144,6 +144,11 @@ static void opj_close_from_file(void* p_user_data)
 /* ---------------------------------------------------------------------- */
 #ifdef _WIN32
 #ifndef OPJ_STATIC
+
+/* declaration to avoid warning: no previous prototype for 'DllMain' */
+BOOL APIENTRY
+DllMain(HINSTANCE hModule, DWORD ul_reason_for_call, LPVOID lpReserved);
+
 BOOL APIENTRY
 DllMain(HINSTANCE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
 {
@@ -433,7 +438,7 @@ OPJ_BOOL OPJ_CALLCONV opj_setup_decoder(opj_codec_t *p_codec,
     return OPJ_FALSE;
 }
 
-OPJ_API OPJ_BOOL OPJ_CALLCONV opj_decoder_set_strict_mode(opj_codec_t *p_codec,
+OPJ_BOOL OPJ_CALLCONV opj_decoder_set_strict_mode(opj_codec_t *p_codec,
         OPJ_BOOL strict)
 {
     if (p_codec) {
diff --git a/3rdparty/openjpeg/openjp2/openjpeg.h b/3rdparty/openjpeg/openjp2/openjpeg.h
index ebce53db0d82..67d168bb5785 100644
--- a/3rdparty/openjpeg/openjp2/openjpeg.h
+++ b/3rdparty/openjpeg/openjp2/openjpeg.h
@@ -122,7 +122,7 @@ typedef float         OPJ_FLOAT32;
 typedef double        OPJ_FLOAT64;
 typedef unsigned char OPJ_BYTE;
 
-#include "opj_stdint.h"
+#include <stdint.h>
 
 typedef int8_t   OPJ_INT8;
 typedef uint8_t  OPJ_UINT8;
@@ -138,6 +138,8 @@ typedef int64_t  OPJ_OFF_T; /* 64-bit file offset type */
 #include <stdio.h>
 typedef size_t   OPJ_SIZE_T;
 
+#include "opj_config.h"
+
 /* Avoid compile-time warning because parameter is not used */
 #define OPJ_ARG_NOT_USED(x) (void)(x)
 
@@ -405,7 +407,7 @@ typedef struct opj_cparameters {
     int cp_disto_alloc;
     /** allocation by fixed layer */
     int cp_fixed_alloc;
-    /** add fixed_quality */
+    /** allocation by fixed quality (PSNR) */
     int cp_fixed_quality;
     /** fixed layer */
     int *cp_matrice;
@@ -829,9 +831,9 @@ typedef struct opj_tile_info {
     int pdy[33];
     /** information concerning packets inside tile */
     opj_packet_info_t *packet;
-    /** add fixed_quality */
+    /** number of pixels of the tile */
     int numpix;
-    /** add fixed_quality */
+    /** distortion of the tile */
     double distotile;
     /** number of markers */
     int marknum;
diff --git a/3rdparty/openjpeg/openjp2/opj_config.h.cmake.in b/3rdparty/openjpeg/openjp2/opj_config.h.cmake.in
index 5f762ca3daa9..64884b65248d 100644
--- a/3rdparty/openjpeg/openjp2/opj_config.h.cmake.in
+++ b/3rdparty/openjpeg/openjp2/opj_config.h.cmake.in
@@ -1,5 +1,7 @@
+#ifndef OPJ_CONFIG_H_INCLUDED
+#define OPJ_CONFIG_H_INCLUDED
+
 /* create opj_config.h for CMake */
-#cmakedefine OPJ_HAVE_STDINT_H 		@OPJ_HAVE_STDINT_H@
 
 /*--------------------------------------------------------------------------*/
 /* OpenJPEG Versioning                                                      */
@@ -8,3 +10,5 @@
 #define OPJ_VERSION_MAJOR @OPENJPEG_VERSION_MAJOR@
 #define OPJ_VERSION_MINOR @OPENJPEG_VERSION_MINOR@
 #define OPJ_VERSION_BUILD @OPENJPEG_VERSION_BUILD@
+
+#endif
diff --git a/3rdparty/openjpeg/openjp2/opj_config_private.h.cmake.in b/3rdparty/openjpeg/openjp2/opj_config_private.h.cmake.in
index c41f9066242d..c559282c578b 100644
--- a/3rdparty/openjpeg/openjp2/opj_config_private.h.cmake.in
+++ b/3rdparty/openjpeg/openjp2/opj_config_private.h.cmake.in
@@ -1,5 +1,4 @@
 /* create opj_config_private.h for CMake */
-#cmakedefine OPJ_HAVE_INTTYPES_H 	@OPJ_HAVE_INTTYPES_H@
 
 #define OPJ_PACKAGE_VERSION "@PACKAGE_VERSION@"
 
@@ -11,6 +10,8 @@
 /*#cmakedefine HAVE_SYS_STAT_H @HAVE_SYS_STAT_H@*/
 /*#cmakedefine HAVE_SYS_TYPES_H @HAVE_SYS_TYPES_H@ */
 /*#cmakedefine HAVE_UNISTD_H @HAVE_UNISTD_H@*/
+/*#cmakedefine HAVE_INTTYPES_H @HAVE_INTTYPES_H@ */
+/*#cmakedefine HAVE_STDINT_H @HAVE_STDINT_H@ */
 
 #cmakedefine _LARGEFILE_SOURCE
 #cmakedefine _LARGE_FILES
diff --git a/3rdparty/openjpeg/openjp2/opj_includes.h b/3rdparty/openjpeg/openjp2/opj_includes.h
index 0a8628c96b30..13613ce521c1 100644
--- a/3rdparty/openjpeg/openjp2/opj_includes.h
+++ b/3rdparty/openjpeg/openjp2/opj_includes.h
@@ -55,6 +55,8 @@
 #include <ctype.h>
 #include <assert.h>
 #include <limits.h>
+#include <stdint.h>
+#include <inttypes.h>
 
 /*
   Use fseeko() and ftello() if they are available since they use
@@ -218,7 +220,6 @@ typedef unsigned int OPJ_BITFIELD;
 
 #define OPJ_UNUSED(x) (void)x
 
-#include "opj_inttypes.h"
 #include "opj_clock.h"
 #include "opj_malloc.h"
 #include "event.h"
diff --git a/3rdparty/openjpeg/openjp2/opj_intmath.h b/3rdparty/openjpeg/openjp2/opj_intmath.h
index 1b0c9d033283..cce7a3cafa72 100644
--- a/3rdparty/openjpeg/openjp2/opj_intmath.h
+++ b/3rdparty/openjpeg/openjp2/opj_intmath.h
@@ -173,6 +173,17 @@ static INLINE OPJ_UINT32  opj_uint_ceildiv(OPJ_UINT32  a, OPJ_UINT32  b)
     return (OPJ_UINT32)(((OPJ_UINT64)a + b - 1) / b);
 }
 
+/**
+Divide an integer and round upwards
+@return Returns a divided by b
+*/
+static INLINE OPJ_UINT32  opj_uint64_ceildiv_res_uint32(OPJ_UINT64 a,
+        OPJ_UINT64 b)
+{
+    assert(b);
+    return (OPJ_UINT32)((a + b - 1) / b);
+}
+
 /**
 Divide an integer by a power of 2 and round upwards
 @return Returns a divided by 2^b
diff --git a/3rdparty/openjpeg/openjp2/pi.c b/3rdparty/openjpeg/openjp2/pi.c
index 38f1ba5a70f3..15ac331425d9 100644
--- a/3rdparty/openjpeg/openjp2/pi.c
+++ b/3rdparty/openjpeg/openjp2/pi.c
@@ -411,41 +411,37 @@ static OPJ_BOOL opj_pi_next_rpcl(opj_pi_iterator_t * pi)
                     }
                     res = &comp->resolutions[pi->resno];
                     levelno = comp->numresolutions - 1 - pi->resno;
-                    /* Avoids division by zero */
-                    /* Relates to id_000004,sig_06,src_000679,op_arith8,pos_49,val_-17 */
-                    /* of  https://github.com/uclouvain/openjpeg/issues/938 */
-                    if (levelno >= 32 ||
-                            ((comp->dx << levelno) >> levelno) != comp->dx ||
-                            ((comp->dy << levelno) >> levelno) != comp->dy) {
-                        continue;
-                    }
-                    if ((comp->dx << levelno) > INT_MAX ||
-                            (comp->dy << levelno) > INT_MAX) {
+
+                    if ((OPJ_UINT32)(((OPJ_UINT64)comp->dx << levelno) >> levelno) != comp->dx ||
+                            (OPJ_UINT32)(((OPJ_UINT64)comp->dy << levelno) >> levelno) != comp->dy) {
                         continue;
                     }
-                    trx0 = opj_uint_ceildiv(pi->tx0, (comp->dx << levelno));
-                    try0 = opj_uint_ceildiv(pi->ty0, (comp->dy << levelno));
-                    trx1 = opj_uint_ceildiv(pi->tx1, (comp->dx << levelno));
-                    try1 = opj_uint_ceildiv(pi->ty1, (comp->dy << levelno));
+
+                    trx0 = opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->tx0,
+                                                         ((OPJ_UINT64)comp->dx << levelno));
+                    try0 = opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->ty0,
+                                                         ((OPJ_UINT64)comp->dy << levelno));
+                    trx1 = opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->tx1,
+                                                         ((OPJ_UINT64)comp->dx << levelno));
+                    try1 = opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->ty1,
+                                                         ((OPJ_UINT64)comp->dy << levelno));
                     rpx = res->pdx + levelno;
                     rpy = res->pdy + levelno;
 
-                    /* To avoid divisions by zero / undefined behaviour on shift */
-                    /* in below tests */
-                    /* Fixes reading id:000026,sig:08,src:002419,op:int32,pos:60,val:+32 */
-                    /* of https://github.com/uclouvain/openjpeg/issues/938 */
-                    if (rpx >= 31 || ((comp->dx << rpx) >> rpx) != comp->dx ||
-                            rpy >= 31 || ((comp->dy << rpy) >> rpy) != comp->dy) {
+                    if ((OPJ_UINT32)(((OPJ_UINT64)comp->dx << rpx) >> rpx) != comp->dx ||
+                            (OPJ_UINT32)(((OPJ_UINT64)comp->dy << rpy) >> rpy) != comp->dy) {
                         continue;
                     }
 
                     /* See ISO-15441. B.12.1.3 Resolution level-position-component-layer progression */
-                    if (!((pi->y % (comp->dy << rpy) == 0) || ((pi->y == pi->ty0) &&
-                            ((try0 << levelno) % (1U << rpy))))) {
+                    if (!(((OPJ_UINT64)pi->y % ((OPJ_UINT64)comp->dy << rpy) == 0) ||
+                            ((pi->y == pi->ty0) &&
+                             (((OPJ_UINT64)try0 << levelno) % ((OPJ_UINT64)1U << rpy))))) {
                         continue;
                     }
-                    if (!((pi->x % (comp->dx << rpx) == 0) || ((pi->x == pi->tx0) &&
-                            ((trx0 << levelno) % (1U << rpx))))) {
+                    if (!(((OPJ_UINT64)pi->x % ((OPJ_UINT64)comp->dx << rpx) == 0) ||
+                            ((pi->x == pi->tx0) &&
+                             (((OPJ_UINT64)trx0 << levelno) % ((OPJ_UINT64)1U << rpx))))) {
                         continue;
                     }
 
@@ -457,11 +453,11 @@ static OPJ_BOOL opj_pi_next_rpcl(opj_pi_iterator_t * pi)
                         continue;
                     }
 
-                    prci = opj_uint_floordivpow2(opj_uint_ceildiv(pi->x,
-                                                 (comp->dx << levelno)), res->pdx)
+                    prci = opj_uint_floordivpow2(opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->x,
+                                                 ((OPJ_UINT64)comp->dx << levelno)), res->pdx)
                            - opj_uint_floordivpow2(trx0, res->pdx);
-                    prcj = opj_uint_floordivpow2(opj_uint_ceildiv(pi->y,
-                                                 (comp->dy << levelno)), res->pdy)
+                    prcj = opj_uint_floordivpow2(opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->y,
+                                                 ((OPJ_UINT64)comp->dy << levelno)), res->pdy)
                            - opj_uint_floordivpow2(try0, res->pdy);
                     pi->precno = prci + prcj * res->pw;
                     for (pi->layno = pi->poc.layno0; pi->layno < pi->poc.layno1; pi->layno++) {
@@ -549,41 +545,37 @@ static OPJ_BOOL opj_pi_next_pcrl(opj_pi_iterator_t * pi)
                     OPJ_UINT32 prci, prcj;
                     res = &comp->resolutions[pi->resno];
                     levelno = comp->numresolutions - 1 - pi->resno;
-                    /* Avoids division by zero */
-                    /* Relates to id_000004,sig_06,src_000679,op_arith8,pos_49,val_-17 */
-                    /* of  https://github.com/uclouvain/openjpeg/issues/938 */
-                    if (levelno >= 32 ||
-                            ((comp->dx << levelno) >> levelno) != comp->dx ||
-                            ((comp->dy << levelno) >> levelno) != comp->dy) {
-                        continue;
-                    }
-                    if ((comp->dx << levelno) > INT_MAX ||
-                            (comp->dy << levelno) > INT_MAX) {
+
+                    if ((OPJ_UINT32)(((OPJ_UINT64)comp->dx << levelno) >> levelno) != comp->dx ||
+                            (OPJ_UINT32)(((OPJ_UINT64)comp->dy << levelno) >> levelno) != comp->dy) {
                         continue;
                     }
-                    trx0 = opj_uint_ceildiv(pi->tx0, (comp->dx << levelno));
-                    try0 = opj_uint_ceildiv(pi->ty0, (comp->dy << levelno));
-                    trx1 = opj_uint_ceildiv(pi->tx1, (comp->dx << levelno));
-                    try1 = opj_uint_ceildiv(pi->ty1, (comp->dy << levelno));
+
+                    trx0 = opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->tx0,
+                                                         ((OPJ_UINT64)comp->dx << levelno));
+                    try0 = opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->ty0,
+                                                         ((OPJ_UINT64)comp->dy << levelno));
+                    trx1 = opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->tx1,
+                                                         ((OPJ_UINT64)comp->dx << levelno));
+                    try1 = opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->ty1,
+                                                         ((OPJ_UINT64)comp->dy << levelno));
                     rpx = res->pdx + levelno;
                     rpy = res->pdy + levelno;
 
-                    /* To avoid divisions by zero / undefined behaviour on shift */
-                    /* in below tests */
-                    /* Relates to id:000019,sig:08,src:001098,op:flip1,pos:49 */
-                    /* of https://github.com/uclouvain/openjpeg/issues/938 */
-                    if (rpx >= 31 || ((comp->dx << rpx) >> rpx) != comp->dx ||
-                            rpy >= 31 || ((comp->dy << rpy) >> rpy) != comp->dy) {
+                    if ((OPJ_UINT32)(((OPJ_UINT64)comp->dx << rpx) >> rpx) != comp->dx ||
+                            (OPJ_UINT32)(((OPJ_UINT64)comp->dy << rpy) >> rpy) != comp->dy) {
                         continue;
                     }
 
                     /* See ISO-15441. B.12.1.4 Position-component-resolution level-layer progression */
-                    if (!((pi->y % (comp->dy << rpy) == 0) || ((pi->y == pi->ty0) &&
-                            ((try0 << levelno) % (1U << rpy))))) {
+                    if (!(((OPJ_UINT64)pi->y % ((OPJ_UINT64)comp->dy << rpy) == 0) ||
+                            ((pi->y == pi->ty0) &&
+                             (((OPJ_UINT64)try0 << levelno) % ((OPJ_UINT64)1U << rpy))))) {
                         continue;
                     }
-                    if (!((pi->x % (comp->dx << rpx) == 0) || ((pi->x == pi->tx0) &&
-                            ((trx0 << levelno) % (1U << rpx))))) {
+                    if (!(((OPJ_UINT64)pi->x % ((OPJ_UINT64)comp->dx << rpx) == 0) ||
+                            ((pi->x == pi->tx0) &&
+                             (((OPJ_UINT64)trx0 << levelno) % ((OPJ_UINT64)1U << rpx))))) {
                         continue;
                     }
 
@@ -595,11 +587,11 @@ static OPJ_BOOL opj_pi_next_pcrl(opj_pi_iterator_t * pi)
                         continue;
                     }
 
-                    prci = opj_uint_floordivpow2(opj_uint_ceildiv(pi->x,
-                                                 (comp->dx << levelno)), res->pdx)
+                    prci = opj_uint_floordivpow2(opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->x,
+                                                 ((OPJ_UINT64)comp->dx << levelno)), res->pdx)
                            - opj_uint_floordivpow2(trx0, res->pdx);
-                    prcj = opj_uint_floordivpow2(opj_uint_ceildiv(pi->y,
-                                                 (comp->dy << levelno)), res->pdy)
+                    prcj = opj_uint_floordivpow2(opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->y,
+                                                 ((OPJ_UINT64)comp->dy << levelno)), res->pdy)
                            - opj_uint_floordivpow2(try0, res->pdy);
                     pi->precno = prci + prcj * res->pw;
                     for (pi->layno = pi->poc.layno0; pi->layno < pi->poc.layno1; pi->layno++) {
@@ -685,40 +677,37 @@ static OPJ_BOOL opj_pi_next_cprl(opj_pi_iterator_t * pi)
                     OPJ_UINT32 prci, prcj;
                     res = &comp->resolutions[pi->resno];
                     levelno = comp->numresolutions - 1 - pi->resno;
-                    /* Avoids division by zero on id_000004,sig_06,src_000679,op_arith8,pos_49,val_-17 */
-                    /* of  https://github.com/uclouvain/openjpeg/issues/938 */
-                    if (levelno >= 32 ||
-                            ((comp->dx << levelno) >> levelno) != comp->dx ||
-                            ((comp->dy << levelno) >> levelno) != comp->dy) {
-                        continue;
-                    }
-                    if ((comp->dx << levelno) > INT_MAX ||
-                            (comp->dy << levelno) > INT_MAX) {
+
+                    if ((OPJ_UINT32)(((OPJ_UINT64)comp->dx << levelno) >> levelno) != comp->dx ||
+                            (OPJ_UINT32)(((OPJ_UINT64)comp->dy << levelno) >> levelno) != comp->dy) {
                         continue;
                     }
-                    trx0 = opj_uint_ceildiv(pi->tx0, (comp->dx << levelno));
-                    try0 = opj_uint_ceildiv(pi->ty0, (comp->dy << levelno));
-                    trx1 = opj_uint_ceildiv(pi->tx1, (comp->dx << levelno));
-                    try1 = opj_uint_ceildiv(pi->ty1, (comp->dy << levelno));
+
+                    trx0 = opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->tx0,
+                                                         ((OPJ_UINT64)comp->dx << levelno));
+                    try0 = opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->ty0,
+                                                         ((OPJ_UINT64)comp->dy << levelno));
+                    trx1 = opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->tx1,
+                                                         ((OPJ_UINT64)comp->dx << levelno));
+                    try1 = opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->ty1,
+                                                         ((OPJ_UINT64)comp->dy << levelno));
                     rpx = res->pdx + levelno;
                     rpy = res->pdy + levelno;
 
-                    /* To avoid divisions by zero / undefined behaviour on shift */
-                    /* in below tests */
-                    /* Fixes reading id:000019,sig:08,src:001098,op:flip1,pos:49 */
-                    /* of https://github.com/uclouvain/openjpeg/issues/938 */
-                    if (rpx >= 31 || ((comp->dx << rpx) >> rpx) != comp->dx ||
-                            rpy >= 31 || ((comp->dy << rpy) >> rpy) != comp->dy) {
+                    if ((OPJ_UINT32)(((OPJ_UINT64)comp->dx << rpx) >> rpx) != comp->dx ||
+                            (OPJ_UINT32)(((OPJ_UINT64)comp->dy << rpy) >> rpy) != comp->dy) {
                         continue;
                     }
 
                     /* See ISO-15441. B.12.1.5 Component-position-resolution level-layer progression */
-                    if (!((pi->y % (comp->dy << rpy) == 0) || ((pi->y == pi->ty0) &&
-                            ((try0 << levelno) % (1U << rpy))))) {
+                    if (!(((OPJ_UINT64)pi->y % ((OPJ_UINT64)comp->dy << rpy) == 0) ||
+                            ((pi->y == pi->ty0) &&
+                             (((OPJ_UINT64)try0 << levelno) % ((OPJ_UINT64)1U << rpy))))) {
                         continue;
                     }
-                    if (!((pi->x % (comp->dx << rpx) == 0) || ((pi->x == pi->tx0) &&
-                            ((trx0 << levelno) % (1U << rpx))))) {
+                    if (!(((OPJ_UINT64)pi->x % ((OPJ_UINT64)comp->dx << rpx) == 0) ||
+                            ((pi->x == pi->tx0) &&
+                             (((OPJ_UINT64)trx0 << levelno) % ((OPJ_UINT64)1U << rpx))))) {
                         continue;
                     }
 
@@ -730,11 +719,11 @@ static OPJ_BOOL opj_pi_next_cprl(opj_pi_iterator_t * pi)
                         continue;
                     }
 
-                    prci = opj_uint_floordivpow2(opj_uint_ceildiv(pi->x,
-                                                 (comp->dx << levelno)), res->pdx)
+                    prci = opj_uint_floordivpow2(opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->x,
+                                                 ((OPJ_UINT64)comp->dx << levelno)), res->pdx)
                            - opj_uint_floordivpow2(trx0, res->pdx);
-                    prcj = opj_uint_floordivpow2(opj_uint_ceildiv(pi->y,
-                                                 (comp->dy << levelno)), res->pdy)
+                    prcj = opj_uint_floordivpow2(opj_uint64_ceildiv_res_uint32((OPJ_UINT64)pi->y,
+                                                 ((OPJ_UINT64)comp->dy << levelno)), res->pdy)
                            - opj_uint_floordivpow2(try0, res->pdy);
                     pi->precno = (OPJ_UINT32)(prci + prcj * res->pw);
                     for (pi->layno = pi->poc.layno0; pi->layno < pi->poc.layno1; pi->layno++) {
@@ -837,18 +826,24 @@ static void opj_get_encoding_parameters(const opj_image_t *p_image,
 
         /* use custom size for precincts */
         for (resno = 0; resno < l_tccp->numresolutions; ++resno) {
-            OPJ_UINT32 l_dx, l_dy;
+            OPJ_UINT64 l_dx, l_dy;
 
             /* precinct width and height */
             l_pdx = l_tccp->prcw[resno];
             l_pdy = l_tccp->prch[resno];
 
-            l_dx = l_img_comp->dx * (1u << (l_pdx + l_tccp->numresolutions - 1 - resno));
-            l_dy = l_img_comp->dy * (1u << (l_pdy + l_tccp->numresolutions - 1 - resno));
+            l_dx = l_img_comp->dx * ((OPJ_UINT64)1u << (l_pdx + l_tccp->numresolutions - 1 -
+                                     resno));
+            l_dy = l_img_comp->dy * ((OPJ_UINT64)1u << (l_pdy + l_tccp->numresolutions - 1 -
+                                     resno));
 
             /* take the minimum size for dx for each comp and resolution */
-            *p_dx_min = opj_uint_min(*p_dx_min, l_dx);
-            *p_dy_min = opj_uint_min(*p_dy_min, l_dy);
+            if (l_dx <= UINT_MAX) {
+                *p_dx_min = opj_uint_min(*p_dx_min, (OPJ_UINT32)l_dx);
+            }
+            if (l_dy <= UINT_MAX) {
+                *p_dy_min = opj_uint_min(*p_dy_min, (OPJ_UINT32)l_dy);
+            }
 
             /* various calculations of extents */
             l_level_no = l_tccp->numresolutions - 1 - resno;
diff --git a/3rdparty/openjpeg/openjp2/t1.c b/3rdparty/openjpeg/openjp2/t1.c
index f5fd233917d2..52e466eb974a 100644
--- a/3rdparty/openjpeg/openjp2/t1.c
+++ b/3rdparty/openjpeg/openjp2/t1.c
@@ -1410,7 +1410,6 @@ static void opj_t1_dec_clnpass(
 }
 
 
-/** mod fixed_quality */
 static OPJ_FLOAT64 opj_t1_getwmsedec(
     OPJ_INT32 nmsedec,
     OPJ_UINT32 compno,
@@ -2313,7 +2312,7 @@ OPJ_BOOL opj_t1_encode_cblks(opj_tcd_t* tcd,
     OPJ_UINT32 compno, resno, bandno, precno, cblkno;
     opj_mutex_t* mutex = opj_mutex_create();
 
-    tile->distotile = 0;        /* fixed_quality */
+    tile->distotile = 0;
 
     for (compno = 0; compno < tile->numcomps; ++compno) {
         opj_tcd_tilecomp_t* tilec = &tile->comps[compno];
@@ -2401,7 +2400,6 @@ static int opj_t1_enc_is_term_pass(opj_tcd_cblk_enc_t* cblk,
 }
 
 
-/** mod fixed_quality */
 static OPJ_FLOAT64 opj_t1_encode_cblk(opj_t1_t *t1,
                                       opj_tcd_cblk_enc_t* cblk,
                                       OPJ_UINT32 orient,
@@ -2443,6 +2441,13 @@ static OPJ_FLOAT64 opj_t1_encode_cblk(opj_t1_t *t1,
             OPJ_INT32 tmp = *datap;
             if (tmp < 0) {
                 OPJ_UINT32 tmp_unsigned;
+                if (tmp == INT_MIN) {
+                    /* To avoid undefined behaviour when negating INT_MIN */
+                    /* but if we go here, it means we have supplied an input */
+                    /* with more bit depth than we we can really support. */
+                    /* Cf https://github.com/uclouvain/openjpeg/issues/1432 */
+                    tmp = INT_MIN + 1;
+                }
                 max = opj_int_max(max, -tmp);
                 tmp_unsigned = opj_to_smr(tmp);
                 memcpy(datap, &tmp_unsigned, sizeof(OPJ_INT32));
@@ -2498,7 +2503,6 @@ static OPJ_FLOAT64 opj_t1_encode_cblk(opj_t1_t *t1,
             break;
         }
 
-        /* fixed_quality */
         tempwmsedec = opj_t1_getwmsedec(nmsedec, compno, level, orient, bpno, qmfbid,
                                         stepsize, numcomps, mct_norms, mct_numcomps) ;
         cumwmsedec += tempwmsedec;
diff --git a/3rdparty/openjpeg/openjp2/t1_ht_generate_luts.c b/3rdparty/openjpeg/openjp2/t1_ht_generate_luts.c
index 6876e3fd7f0a..22382a5a4af5 100644
--- a/3rdparty/openjpeg/openjp2/t1_ht_generate_luts.c
+++ b/3rdparty/openjpeg/openjp2/t1_ht_generate_luts.c
@@ -38,12 +38,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
-
-typedef int OPJ_BOOL;
-#define OPJ_TRUE 1
-#define OPJ_FALSE 0
-
-#include "opj_stdint.h"
+#include <stdint.h>
 
 typedef int8_t   OPJ_INT8;
 typedef uint8_t  OPJ_UINT8;
@@ -53,6 +48,9 @@ typedef int32_t  OPJ_INT32;
 typedef uint32_t OPJ_UINT32;
 typedef int64_t  OPJ_INT64;
 typedef uint64_t OPJ_UINT64;
+typedef int OPJ_BOOL;
+#define OPJ_TRUE 1
+#define OPJ_FALSE 0
 
 //************************************************************************/
 /** @brief HT decoding tables, as given in the standard
diff --git a/3rdparty/openjpeg/openjp2/t2.c b/3rdparty/openjpeg/openjp2/t2.c
index ebda005267e6..781a6a59a165 100644
--- a/3rdparty/openjpeg/openjp2/t2.c
+++ b/3rdparty/openjpeg/openjp2/t2.c
@@ -167,9 +167,9 @@ static OPJ_BOOL opj_t2_init_seg(opj_tcd_cblk_dec_t* cblk,
 static void opj_t2_putcommacode(opj_bio_t *bio, OPJ_INT32 n)
 {
     while (--n >= 0) {
-        opj_bio_write(bio, 1, 1);
+        opj_bio_putbit(bio, 1);
     }
-    opj_bio_write(bio, 0, 1);
+    opj_bio_putbit(bio, 0);
 }
 
 static OPJ_UINT32 opj_t2_getcommacode(opj_bio_t *bio)
@@ -184,7 +184,7 @@ static OPJ_UINT32 opj_t2_getcommacode(opj_bio_t *bio)
 static void opj_t2_putnumpasses(opj_bio_t *bio, OPJ_UINT32 n)
 {
     if (n == 1) {
-        opj_bio_write(bio, 0, 1);
+        opj_bio_putbit(bio, 0);
     } else if (n == 2) {
         opj_bio_write(bio, 2, 2);
     } else if (n <= 5) {
@@ -801,7 +801,7 @@ static OPJ_BOOL opj_t2_encode_packet(OPJ_UINT32 tileno,
         }
     }
 #endif
-    opj_bio_write(bio, packet_empty ? 0 : 1, 1);           /* Empty header bit */
+    opj_bio_putbit(bio, packet_empty ? 0 : 1);           /* Empty header bit */
 
     /* Writing Packet header */
     band = res->bands;
@@ -849,7 +849,7 @@ static OPJ_BOOL opj_t2_encode_packet(OPJ_UINT32 tileno,
             if (!cblk->numpasses) {
                 opj_tgt_encode(bio, prc->incltree, cblkno, (OPJ_INT32)(layno + 1));
             } else {
-                opj_bio_write(bio, layer->numpasses != 0, 1);
+                opj_bio_putbit(bio, layer->numpasses != 0);
             }
 
             /* if cblk not included, go to the next cblk  */
@@ -978,7 +978,9 @@ static OPJ_BOOL opj_t2_encode_packet(OPJ_UINT32 tileno,
                 return OPJ_FALSE;
             }
 
-            memcpy(c, layer->data, layer->len);
+            if (p_t2_mode == FINAL_PASS) {
+                memcpy(c, layer->data, layer->len);
+            }
             cblk->numpasses += layer->numpasses;
             c += layer->len;
             length -= layer->len;
@@ -1227,9 +1229,17 @@ static OPJ_BOOL opj_t2_read_packet_header(opj_t2_t* p_t2,
                 while (!opj_tgt_decode(l_bio, l_prc->imsbtree, cblkno, (OPJ_INT32)i)) {
                     ++i;
                 }
-
                 l_cblk->Mb = (OPJ_UINT32)l_band->numbps;
-                l_cblk->numbps = (OPJ_UINT32)l_band->numbps + 1 - i;
+                if ((OPJ_UINT32)l_band->numbps + 1 < i) {
+                    /* Not totally sure what we should do in that situation,
+                     * but that avoids the integer overflow of
+                     * https://github.com/uclouvain/openjpeg/pull/1488
+                     * while keeping the regression test suite happy.
+                     */
+                    l_cblk->numbps = (OPJ_UINT32)(l_band->numbps + 1 - (int)i);
+                } else {
+                    l_cblk->numbps = (OPJ_UINT32)l_band->numbps + 1 - i;
+                }
                 l_cblk->numlenbits = 3;
             }
 
@@ -1590,6 +1600,7 @@ static OPJ_BOOL opj_t2_skip_packet_data(opj_t2_t* p_t2,
                                       "skip: segment too long (%d) with max (%d) for codeblock %d (p=%d, b=%d, r=%d, c=%d)\n",
                                       l_seg->newlen, p_max_length, cblkno, p_pi->precno, bandno, p_pi->resno,
                                       p_pi->compno);
+                        return OPJ_TRUE;
                     }
                 }
 
diff --git a/3rdparty/openjpeg/openjp2/tcd.c b/3rdparty/openjpeg/openjp2/tcd.c
index 6442669d60a8..687aa61bb094 100644
--- a/3rdparty/openjpeg/openjp2/tcd.c
+++ b/3rdparty/openjpeg/openjp2/tcd.c
@@ -42,6 +42,8 @@
 #include "opj_includes.h"
 #include "opj_common.h"
 
+// #define DEBUG_RATE_ALLOC
+
 /* ----------------------------------------------------------------------- */
 
 /* TODO MSD: */
@@ -143,6 +145,9 @@ static OPJ_BOOL opj_tcd_code_block_enc_allocate_data(opj_tcd_cblk_enc_t *
  */
 static void opj_tcd_code_block_enc_deallocate(opj_tcd_precinct_t * p_precinct);
 
+static
+void opj_tcd_makelayer_fixed(opj_tcd_t *tcd, OPJ_UINT32 layno,
+                             OPJ_UINT32 final);
 
 /**
 Free the memory allocated for encoding
@@ -224,6 +229,7 @@ opj_tcd_t* opj_tcd_create(OPJ_BOOL p_is_decoder)
 
 /* ----------------------------------------------------------------------- */
 
+static
 void opj_tcd_rateallocate_fixed(opj_tcd_t *tcd)
 {
     OPJ_UINT32 layno;
@@ -234,17 +240,23 @@ void opj_tcd_rateallocate_fixed(opj_tcd_t *tcd)
 }
 
 
-void opj_tcd_makelayer(opj_tcd_t *tcd,
-                       OPJ_UINT32 layno,
-                       OPJ_FLOAT64 thresh,
-                       OPJ_UINT32 final)
+/* ----------------------------------------------------------------------- */
+
+/** Returns OPJ_TRUE if the layer allocation is unchanged w.r.t to the previous
+ * invokation with a different threshold */
+static
+OPJ_BOOL opj_tcd_makelayer(opj_tcd_t *tcd,
+                           OPJ_UINT32 layno,
+                           OPJ_FLOAT64 thresh,
+                           OPJ_UINT32 final)
 {
     OPJ_UINT32 compno, resno, bandno, precno, cblkno;
     OPJ_UINT32 passno;
 
     opj_tcd_tile_t *tcd_tile = tcd->tcd_image->tiles;
+    OPJ_BOOL layer_allocation_is_same = OPJ_TRUE;
 
-    tcd_tile->distolayer[layno] = 0;        /* fixed_quality */
+    tcd_tile->distolayer[layno] = 0;
 
     for (compno = 0; compno < tcd_tile->numcomps; compno++) {
         opj_tcd_tilecomp_t *tilec = &tcd_tile->comps[compno];
@@ -304,7 +316,10 @@ void opj_tcd_makelayer(opj_tcd_t *tcd,
                             }
                         }
 
-                        layer->numpasses = n - cblk->numpassesinlayers;
+                        if (layer->numpasses != n - cblk->numpassesinlayers) {
+                            layer_allocation_is_same = OPJ_FALSE;
+                            layer->numpasses = n - cblk->numpassesinlayers;
+                        }
 
                         if (!layer->numpasses) {
                             layer->disto = 0;
@@ -323,7 +338,7 @@ void opj_tcd_makelayer(opj_tcd_t *tcd,
                                            cblk->passes[cblk->numpassesinlayers - 1].distortiondec;
                         }
 
-                        tcd_tile->distolayer[layno] += layer->disto;    /* fixed_quality */
+                        tcd_tile->distolayer[layno] += layer->disto;
 
                         if (final) {
                             cblk->numpassesinlayers = n;
@@ -333,14 +348,17 @@ void opj_tcd_makelayer(opj_tcd_t *tcd,
             }
         }
     }
+    return layer_allocation_is_same;
 }
 
+/** For m_quality_layer_alloc_strategy == FIXED_LAYER */
+static
 void opj_tcd_makelayer_fixed(opj_tcd_t *tcd, OPJ_UINT32 layno,
                              OPJ_UINT32 final)
 {
     OPJ_UINT32 compno, resno, bandno, precno, cblkno;
     OPJ_INT32 value;                        /*, matrice[tcd_tcp->numlayers][tcd_tile->comps[0].numresolutions][3]; */
-    OPJ_INT32 matrice[10][10][3];
+    OPJ_INT32 matrice[J2K_TCD_MATRIX_MAX_LAYER_COUNT][J2K_TCD_MATRIX_MAX_RESOLUTION_COUNT][3];
     OPJ_UINT32 i, j, k;
 
     opj_cp_t *cp = tcd->cp;
@@ -440,6 +458,11 @@ void opj_tcd_makelayer_fixed(opj_tcd_t *tcd, OPJ_UINT32 layno,
     }
 }
 
+/** Rate allocation for the following methods:
+ * - allocation by rate/distortio (m_quality_layer_alloc_strategy == RATE_DISTORTION_RATIO)
+ * - allocation by fixed quality  (m_quality_layer_alloc_strategy == FIXED_DISTORTION_RATIO)
+ */
+static
 OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd,
                               OPJ_BYTE *dest,
                               OPJ_UINT32 * p_data_written,
@@ -450,8 +473,8 @@ OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd,
     OPJ_UINT32 compno, resno, bandno, precno, cblkno, layno;
     OPJ_UINT32 passno;
     OPJ_FLOAT64 min, max;
-    OPJ_FLOAT64 cumdisto[100];      /* fixed_quality */
-    const OPJ_FLOAT64 K = 1;                /* 1.1; fixed_quality */
+    OPJ_FLOAT64 cumdisto[100];
+    const OPJ_FLOAT64 K = 1;
     OPJ_FLOAT64 maxSE = 0;
 
     opj_cp_t *cp = tcd->cp;
@@ -461,7 +484,7 @@ OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd,
     min = DBL_MAX;
     max = 0;
 
-    tcd_tile->numpix = 0;           /* fixed_quality */
+    tcd_tile->numpix = 0;
 
     for (compno = 0; compno < tcd_tile->numcomps; compno++) {
         opj_tcd_tilecomp_t *tilec = &tcd_tile->comps[compno];
@@ -511,9 +534,12 @@ OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd,
                             }
                         } /* passno */
 
-                        /* fixed_quality */
-                        tcd_tile->numpix += ((cblk->x1 - cblk->x0) * (cblk->y1 - cblk->y0));
-                        tilec->numpix += ((cblk->x1 - cblk->x0) * (cblk->y1 - cblk->y0));
+                        {
+                            const OPJ_SIZE_T cblk_pix_count = (OPJ_SIZE_T)((cblk->x1 - cblk->x0) *
+                                                              (cblk->y1 - cblk->y0));
+                            tcd_tile->numpix += cblk_pix_count;
+                            tilec->numpix += cblk_pix_count;
+                        }
                     } /* cbklno */
                 } /* precno */
             } /* bandno */
@@ -527,8 +553,8 @@ OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd,
     /* index file */
     if (cstr_info) {
         opj_tile_info_t *tile_info = &cstr_info->tile[tcd->tcd_tileno];
-        tile_info->numpix = tcd_tile->numpix;
-        tile_info->distotile = tcd_tile->distotile;
+        tile_info->numpix = (int)tcd_tile->numpix;
+        tile_info->distotile = (int)tcd_tile->distotile;
         tile_info->thresh = (OPJ_FLOAT64 *) opj_malloc(tcd_tcp->numlayers * sizeof(
                                 OPJ_FLOAT64));
         if (!tile_info->thresh) {
@@ -545,35 +571,54 @@ OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd,
         OPJ_FLOAT64 goodthresh = 0;
         OPJ_FLOAT64 stable_thresh = 0;
         OPJ_UINT32 i;
-        OPJ_FLOAT64 distotarget;                /* fixed_quality */
+        OPJ_FLOAT64 distotarget;
 
-        /* fixed_quality */
         distotarget = tcd_tile->distotile - ((K * maxSE) / pow((OPJ_FLOAT32)10,
                                              tcd_tcp->distoratio[layno] / 10));
 
         /* Don't try to find an optimal threshold but rather take everything not included yet, if
-          -r xx,yy,zz,0   (disto_alloc == 1 and rates == 0)
-          -q xx,yy,zz,0   (fixed_quality == 1 and distoratio == 0)
+          -r xx,yy,zz,0   (m_quality_layer_alloc_strategy == RATE_DISTORTION_RATIO and rates == NULL)
+          -q xx,yy,zz,0   (m_quality_layer_alloc_strategy == FIXED_DISTORTION_RATIO and distoratio == NULL)
           ==> possible to have some lossy layers and the last layer for sure lossless */
-        if (((cp->m_specific_param.m_enc.m_disto_alloc == 1) &&
+        if (((cp->m_specific_param.m_enc.m_quality_layer_alloc_strategy ==
+                RATE_DISTORTION_RATIO) &&
                 (tcd_tcp->rates[layno] > 0.0f)) ||
-                ((cp->m_specific_param.m_enc.m_fixed_quality == 1) &&
+                ((cp->m_specific_param.m_enc.m_quality_layer_alloc_strategy ==
+                  FIXED_DISTORTION_RATIO) &&
                  (tcd_tcp->distoratio[layno] > 0.0))) {
             opj_t2_t*t2 = opj_t2_create(tcd->image, cp);
             OPJ_FLOAT64 thresh = 0;
+            OPJ_BOOL last_layer_allocation_ok = OPJ_FALSE;
 
             if (t2 == 00) {
                 return OPJ_FALSE;
             }
 
             for (i = 0; i < 128; ++i) {
-                OPJ_FLOAT64 distoachieved = 0;  /* fixed_quality */
-
-                thresh = (lo + hi) / 2;
-
-                opj_tcd_makelayer(tcd, layno, thresh, 0);
+                OPJ_FLOAT64 distoachieved = 0;
+                OPJ_BOOL layer_allocation_is_same;
+
+                OPJ_FLOAT64 new_thresh = (lo + hi) / 2;
+                /* Stop iterating when the threshold has stabilized enough */
+                /* 0.5 * 1e-5 is somewhat arbitrary, but has been selected */
+                /* so that this doesn't change the results of the regression */
+                /* test suite. */
+                if (fabs(new_thresh - thresh) <= 0.5 * 1e-5 * thresh) {
+                    break;
+                }
+                thresh = new_thresh;
+#ifdef DEBUG_RATE_ALLOC
+                opj_event_msg(p_manager, EVT_INFO, "layno=%u, iter=%u, thresh=%g",
+                              layno, i, new_thresh);
+#endif
 
-                if (cp->m_specific_param.m_enc.m_fixed_quality) {       /* fixed_quality */
+                layer_allocation_is_same = opj_tcd_makelayer(tcd, layno, thresh, 0) && i != 0;
+#ifdef DEBUG_RATE_ALLOC
+                opj_event_msg(p_manager, EVT_INFO, "--> layer_allocation_is_same = %d",
+                              layer_allocation_is_same);
+#endif
+                if (cp->m_specific_param.m_enc.m_quality_layer_alloc_strategy ==
+                        FIXED_DISTORTION_RATIO) {
                     if (OPJ_IS_CINEMA(cp->rsiz) || OPJ_IS_IMF(cp->rsiz)) {
                         if (! opj_t2_encode_packets(t2, tcd->tcd_tileno, tcd_tile, layno + 1, dest,
                                                     p_data_written, maxlen, cstr_info, NULL, tcd->cur_tp_num, tcd->tp_pos,
@@ -605,17 +650,41 @@ OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd,
                         }
                         lo = thresh;
                     }
-                } else {
-                    if (! opj_t2_encode_packets(t2, tcd->tcd_tileno, tcd_tile, layno + 1, dest,
-                                                p_data_written, maxlen, cstr_info, NULL, tcd->cur_tp_num, tcd->tp_pos,
-                                                tcd->cur_pino,
-                                                THRESH_CALC, p_manager)) {
-                        /* TODO: what to do with l ??? seek / tell ??? */
-                        /* opj_event_msg(tcd->cinfo, EVT_INFO, "rate alloc: len=%d, max=%d\n", l, maxlen); */
+                } else { /* Disto/rate based optimization */
+                    /* Check if the layer allocation done by opj_tcd_makelayer()
+                     * is compatible of the maximum rate allocation. If not,
+                     * retry with a higher threshold.
+                     * If OK, try with a lower threshold.
+                     * Call opj_t2_encode_packets() only if opj_tcd_makelayer()
+                     * has resulted in different truncation points since its last
+                     * call. */
+                    if ((layer_allocation_is_same && !last_layer_allocation_ok) ||
+                            (!layer_allocation_is_same &&
+                             ! opj_t2_encode_packets(t2, tcd->tcd_tileno, tcd_tile, layno + 1, dest,
+                                                     p_data_written, maxlen, cstr_info, NULL, tcd->cur_tp_num, tcd->tp_pos,
+                                                     tcd->cur_pino,
+                                                     THRESH_CALC, p_manager))) {
+
+#ifdef DEBUG_RATE_ALLOC
+                        if (!layer_allocation_is_same) {
+                            opj_event_msg(p_manager, EVT_INFO,
+                                          "--> check rate alloc failed (> maxlen=%u)\n", maxlen);
+                        }
+#endif
+                        last_layer_allocation_ok = OPJ_FALSE;
                         lo = thresh;
                         continue;
                     }
 
+#ifdef DEBUG_RATE_ALLOC
+                    if (!layer_allocation_is_same) {
+                        opj_event_msg(p_manager, EVT_INFO,
+                                      "--> check rate alloc success (len=%u <= maxlen=%u)\n", *p_data_written,
+                                      maxlen);
+                    }
+#endif
+
+                    last_layer_allocation_ok = OPJ_TRUE;
                     hi = thresh;
                     stable_thresh = thresh;
                 }
@@ -635,7 +704,6 @@ OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd,
 
         opj_tcd_makelayer(tcd, layno, goodthresh, 1);
 
-        /* fixed_quality */
         cumdisto[layno] = (layno == 0) ? tcd_tile->distolayer[0] :
                           (cumdisto[layno - 1] + tcd_tile->distolayer[layno]);
     }
@@ -2247,6 +2315,9 @@ static OPJ_BOOL opj_tcd_dc_level_shift_decode(opj_tcd_t *p_tcd)
             l_max = (OPJ_INT32)((1U << l_img_comp->prec) - 1);
         }
 
+        if (l_width == 0 || l_height == 0) {
+            continue;
+        }
 
         if (l_tccp->qmfbid == 1) {
             for (j = 0; j < l_height; ++j) {
@@ -2262,7 +2333,7 @@ static OPJ_BOOL opj_tcd_dc_level_shift_decode(opj_tcd_t *p_tcd)
             for (j = 0; j < l_height; ++j) {
                 for (i = 0; i < l_width; ++i) {
                     OPJ_FLOAT32 l_value = *((OPJ_FLOAT32 *) l_current_ptr);
-                    if (l_value > INT_MAX) {
+                    if (l_value > (OPJ_FLOAT32)INT_MAX) {
                         *l_current_ptr = l_max;
                     } else if (l_value < INT_MIN) {
                         *l_current_ptr = l_min;
@@ -2599,10 +2670,10 @@ static OPJ_BOOL opj_tcd_rate_allocate_encode(opj_tcd_t *p_tcd,
         p_cstr_info->index_write = 0;
     }
 
-    if (l_cp->m_specific_param.m_enc.m_disto_alloc ||
-            l_cp->m_specific_param.m_enc.m_fixed_quality)  {
-        /* fixed_quality */
-        /* Normal Rate/distortion allocation */
+    if (l_cp->m_specific_param.m_enc.m_quality_layer_alloc_strategy ==
+            RATE_DISTORTION_RATIO ||
+            l_cp->m_specific_param.m_enc.m_quality_layer_alloc_strategy ==
+            FIXED_DISTORTION_RATIO)  {
         if (! opj_tcd_rateallocate(p_tcd, p_dest_data, &l_nb_written, p_max_dest_size,
                                    p_cstr_info, p_manager)) {
             return OPJ_FALSE;
diff --git a/3rdparty/openjpeg/openjp2/tcd.h b/3rdparty/openjpeg/openjp2/tcd.h
index 340c2bf8a646..f659869a1344 100644
--- a/3rdparty/openjpeg/openjp2/tcd.h
+++ b/3rdparty/openjpeg/openjp2/tcd.h
@@ -222,8 +222,8 @@ typedef struct opj_tcd_tilecomp {
     OPJ_UINT32 win_x1;
     OPJ_UINT32 win_y1;
 
-    /* add fixed_quality */
-    OPJ_INT32 numpix;
+    /* number of pixels */
+    OPJ_SIZE_T numpix;
 } opj_tcd_tilecomp_t;
 
 
@@ -235,9 +235,9 @@ typedef struct opj_tcd_tile {
     OPJ_INT32 x0, y0, x1, y1;
     OPJ_UINT32 numcomps;            /* number of components in tile */
     opj_tcd_tilecomp_t *comps;  /* Components information */
-    OPJ_INT32 numpix;               /* add fixed_quality */
-    OPJ_FLOAT64 distotile;          /* add fixed_quality */
-    OPJ_FLOAT64 distolayer[100];    /* add fixed_quality */
+    OPJ_SIZE_T numpix;               /* number of pixels */
+    OPJ_FLOAT64 distotile;          /* distortion of the tile */
+    OPJ_FLOAT64 distolayer[100];    /* distortion per layer */
     OPJ_UINT32 packno;              /* packet number */
 } opj_tcd_tile_t;
 
@@ -369,23 +369,6 @@ OPJ_BOOL opj_tcd_init(opj_tcd_t *p_tcd,
 OPJ_BOOL opj_tcd_init_decode_tile(opj_tcd_t *p_tcd, OPJ_UINT32 p_tile_no,
                                   opj_event_mgr_t* p_manager);
 
-void opj_tcd_makelayer_fixed(opj_tcd_t *tcd, OPJ_UINT32 layno,
-                             OPJ_UINT32 final);
-
-void opj_tcd_rateallocate_fixed(opj_tcd_t *tcd);
-
-void opj_tcd_makelayer(opj_tcd_t *tcd,
-                       OPJ_UINT32 layno,
-                       OPJ_FLOAT64 thresh,
-                       OPJ_UINT32 final);
-
-OPJ_BOOL opj_tcd_rateallocate(opj_tcd_t *tcd,
-                              OPJ_BYTE *dest,
-                              OPJ_UINT32 * p_data_written,
-                              OPJ_UINT32 len,
-                              opj_codestream_info_t *cstr_info,
-                              opj_event_mgr_t *p_manager);
-
 /**
  * Gets the maximum tile size that will be taken by the tile once decoded.
  */
diff --git a/3rdparty/openjpeg/openjp2/tgt.c b/3rdparty/openjpeg/openjp2/tgt.c
index 0cbad12c42ef..711d753f46c1 100644
--- a/3rdparty/openjpeg/openjp2/tgt.c
+++ b/3rdparty/openjpeg/openjp2/tgt.c
@@ -287,12 +287,12 @@ void opj_tgt_encode(opj_bio_t *bio, opj_tgt_tree_t *tree, OPJ_UINT32 leafno,
         while (low < threshold) {
             if (low >= node->value) {
                 if (!node->known) {
-                    opj_bio_write(bio, 1, 1);
+                    opj_bio_putbit(bio, 1);
                     node->known = 1;
                 }
                 break;
             }
-            opj_bio_write(bio, 0, 1);
+            opj_bio_putbit(bio, 0);
             ++low;
         }
 
diff --git a/3rdparty/openjpeg/openjp2/thread.c b/3rdparty/openjpeg/openjp2/thread.c
index f2fca2ee4af8..240810b1c44b 100644
--- a/3rdparty/openjpeg/openjp2/thread.c
+++ b/3rdparty/openjpeg/openjp2/thread.c
@@ -221,7 +221,7 @@ struct opj_thread_t {
     HANDLE hThread;
 };
 
-unsigned int __stdcall opj_thread_callback_adapter(void *info)
+static unsigned int __stdcall opj_thread_callback_adapter(void *info)
 {
     opj_thread_t* thread = (opj_thread_t*) info;
     HANDLE hEvent = NULL;
diff --git a/3rdparty/orbbecsdk/orbbecsdk.cmake b/3rdparty/orbbecsdk/orbbecsdk.cmake
new file mode 100644
index 000000000000..db51aee9c4ed
--- /dev/null
+++ b/3rdparty/orbbecsdk/orbbecsdk.cmake
@@ -0,0 +1,18 @@
+function(download_orbbec_sdk root_var)
+    set(ORBBECSDK_DOWNLOAD_DIR "${OpenCV_BINARY_DIR}/3rdparty/orbbecsdk")
+    set(ORBBECSDK_FILE_HASH_CMAKE "e7566fa915a1b0c02640df41891916fe")
+    ocv_download(FILENAME "v1.9.4.tar.gz"
+                HASH ${ORBBECSDK_FILE_HASH_CMAKE}
+                URL "https://github.com/orbbec/OrbbecSDK/archive/refs/tags/v1.9.4/"
+                DESTINATION_DIR ${ORBBECSDK_DOWNLOAD_DIR}
+                ID OrbbecSDK
+                STATUS res
+                UNPACK RELATIVE_URL
+                )
+    if(${res})
+        message(STATUS "orbbec sdk downloaded to: ${ORBBECSDK_DOWNLOAD_DIR}")
+        set(${root_var} "${ORBBECSDK_DOWNLOAD_DIR}/OrbbecSDK-1.9.4" PARENT_SCOPE)
+    else()
+        message(FATAL_ERROR "Failed to download orbbec sdk")
+    endif()
+endfunction()
\ No newline at end of file
diff --git a/3rdparty/protobuf/CMakeLists.txt b/3rdparty/protobuf/CMakeLists.txt
index e39de9823ae2..7df035cac9cf 100644
--- a/3rdparty/protobuf/CMakeLists.txt
+++ b/3rdparty/protobuf/CMakeLists.txt
@@ -26,6 +26,9 @@ else()
                                        -Wsuggest-override -Winconsistent-missing-override
                                        -Wimplicit-fallthrough
                                        -Warray-bounds  # GCC 9+
+                                       -Wstringop-overflow -Wstringop-overread # GCC 11-12
+                                       -Wextra-semi # clang
+                                       -Wcomma # clang
   )
 endif()
 if(CV_ICC)
diff --git a/3rdparty/readme.txt b/3rdparty/readme.txt
index c3068521e3af..4ef1966f7a4c 100644
--- a/3rdparty/readme.txt
+++ b/3rdparty/readme.txt
@@ -31,7 +31,7 @@ libpng                Portable Network Graphics library.
                       
 libspng               Portable Network Graphics library.
                       The license and copyright notes can be found in libspng/LICENSE.
-                      See libspng home page https://www.libspng.org
+                      See libspng home page https://libspng.org
                       for details and links to the source code
                       
                       WITH_SPNG CMake option must be ON to add libspng support to imgcodecs
@@ -39,7 +39,8 @@ libspng               Portable Network Graphics library.
 libtiff               Tag Image File Format (TIFF) Software
                       Copyright (c) 1988-1997 Sam Leffler
                       Copyright (c) 1991-1997 Silicon Graphics, Inc.
-                      See libtiff home page http://www.libtiff.org/
+                      See libtiff home page #1 http://www.simplesystems.org/libtiff/
+                                            #2 https://libtiff.gitlab.io/libtiff/
                       for details and links to the source code
 
                       WITH_TIFF CMake option must be ON to add libtiff & zlib support to imgcodecs.
@@ -48,6 +49,14 @@ zlib                  General purpose LZ77 compression library
                       Copyright (C) 1995-2022 Jean-loup Gailly and Mark Adler.
                       See zlib home page http://www.zlib.net
                       for details and links to the source code
+
+zlib-ng               zlib data compression library for the next generation systems
+                      (C) 1995-2013 Jean-loup Gailly and Mark Adler
+                      See zlib-ng official GitHub repository
+                      https://github.com/zlib-ng/zlib-ng.git
+                      for details and links to source code
+
+                      WITH_ZLIB_NG CMake option must be ON to use zlib-ng as the zlib implementation.
 ------------------------------------------------------------------------------------
 jasper                JasPer is a collection of software
                       (i.e., a library and application programs) for the coding
diff --git a/3rdparty/tbb/CMakeLists.txt b/3rdparty/tbb/CMakeLists.txt
index 50f3e6ccf150..2083415c61f7 100644
--- a/3rdparty/tbb/CMakeLists.txt
+++ b/3rdparty/tbb/CMakeLists.txt
@@ -5,8 +5,8 @@ if (WIN32 AND NOT ARM)
   message(FATAL_ERROR "BUILD_TBB option supports Windows on ARM only!\nUse regular official TBB build instead of the BUILD_TBB option!")
 endif()
 
-ocv_update(OPENCV_TBB_RELEASE "v2020.2")
-ocv_update(OPENCV_TBB_RELEASE_MD5 "5af6f6c2a24c2043e62e47205e273b1f")
+ocv_update(OPENCV_TBB_RELEASE "v2021.11.0")
+ocv_update(OPENCV_TBB_RELEASE_MD5 "b301151120b08a17e98dcdda6e4f6011")
 ocv_update(OPENCV_TBB_FILENAME "${OPENCV_TBB_RELEASE}.tar.gz")
 string(REGEX REPLACE "^v" "" OPENCV_TBB_RELEASE_ "${OPENCV_TBB_RELEASE}")
 #ocv_update(OPENCV_TBB_SUBDIR ...)
@@ -17,7 +17,7 @@ ocv_download(FILENAME ${OPENCV_TBB_FILENAME}
              URL
                "${OPENCV_TBB_URL}"
                "$ENV{OPENCV_TBB_URL}"
-               "https://github.com/01org/tbb/archive/"
+               "https://github.com/oneapi-src/oneTBB/archive/refs/tags/"
              DESTINATION_DIR ${tbb_src_dir}
              ID TBB
              STATUS res
@@ -44,7 +44,6 @@ ocv_include_directories("${tbb_src_dir}/include"
 
 file(GLOB lib_srcs "${tbb_src_dir}/src/tbb/*.cpp")
 file(GLOB lib_hdrs "${tbb_src_dir}/src/tbb/*.h")
-list(APPEND lib_srcs "${tbb_src_dir}/src/rml/client/rml_tbb.cpp")
 ocv_list_filterout(lib_srcs "${tbb_src_dir}/src/tbb/tbbbind.cpp")  # hwloc.h requirement
 ocv_list_filterout(lib_srcs "${tbb_src_dir}/src/tbb/tbb_bind.cpp")  # hwloc.h requirement 2020.1+
 
@@ -100,6 +99,8 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS
     -Wimplicit-fallthrough             # TBB 2018 under GCC 7+
     -Wmissing-prototypes               # MacOSX, Android/Clang
     -Wundef -Wmissing-declarations     # TBB 2019
+    -Wnon-virtual-dtor                 # oneTBB-2020.2 Android
+    -Wunused-but-set-variable          # oneTBB-2020.2 Android
 )
 
 set(TBB_SOURCE_FILES ${lib_srcs} ${lib_hdrs})
@@ -168,6 +169,6 @@ ocv_install_target(tbb EXPORT OpenCVModules
     OPTIONAL
     )
 
-ocv_install_3rdparty_licenses(tbb "${tbb_src_dir}/LICENSE" "${tbb_src_dir}/README")
+ocv_install_3rdparty_licenses(tbb "${tbb_src_dir}/LICENSE.txt" "${tbb_src_dir}/README.md")
 
 ocv_tbb_read_version("${tbb_src_dir}/include" tbb)
diff --git a/3rdparty/zlib-ng/CMakeLists.txt b/3rdparty/zlib-ng/CMakeLists.txt
new file mode 100644
index 000000000000..c05511ca8718
--- /dev/null
+++ b/3rdparty/zlib-ng/CMakeLists.txt
@@ -0,0 +1,796 @@
+project(${ZLIB_LIBRARY} LANGUAGES C)
+
+if("c_std_11" IN_LIST CMAKE_C_COMPILE_FEATURES)
+  set(CMAKE_C_STANDARD 11)          # The C standard whose features are requested to build this target
+else()
+  set(CMAKE_C_STANDARD 99)
+endif()
+set(CMAKE_C_STANDARD_REQUIRED ON) # Boolean describing whether the value of C_STANDARD is a requirement
+set(CMAKE_C_EXTENSIONS OFF)       # Boolean specifying whether compiler specific extensions are requested
+
+include(CheckTypeSize)
+include(CheckSymbolExists)
+include(CheckFunctionExists)
+include(CheckIncludeFile)
+include(CheckCSourceCompiles)
+include(CheckCSourceRuns)
+include(CheckCCompilerFlag)
+include(CMakeDependentOption)
+
+if(X86_64 OR X86)
+  set(BASEARCH_X86_FOUND TRUE)
+endif()
+if(AARCH64 OR ARM)
+  set(BASEARCH_ARM_FOUND TRUE)
+endif()
+if(PPC64LE OR PPC64)
+  set(BASEARCH_PPC_FOUND TRUE)
+endif()
+if(RISCV)
+  set(BASEARCH_RISCV_FOUND TRUE)
+endif()
+
+include(cmake/detect-intrinsics.cmake)
+include(cmake/fallback-macros.cmake)
+
+set(ZLIB_SYMBOL_PREFIX "")
+
+if(BASEARCH_X86_FOUND)
+  set(WITH_AVX2 ON)
+  set(WITH_AVX512 ON)
+  set(WITH_AVX512VNNI ON)
+  set(WITH_SSE2 ON)
+  set(WITH_SSSE3 ON)
+  set(WITH_SSE42 ON)
+  set(WITH_PCLMULQDQ ON)
+  set(WITH_VPCLMULQDQ ON)
+endif()
+if(BASEARCH_ARM_FOUND)
+  set(WITH_ACLE ON)
+  set(WITH_NEON ON)
+  if(ARM)
+    set(WITH_ARMV6 ON)
+  else()
+    set(WITH_ARMV6 OFF)
+  endif()
+endif()
+if(BASEARCH_PPC_FOUND)
+  set(WITH_ALTIVEC ON)
+  set(WITH_POWER8 ON)
+  set(WITH_POWER9 ON)
+endif()
+if(BASEARCH_RISCV_FOUND)
+  set(WITH_RVV ON)
+endif()
+
+
+add_definitions(-DZLIB_COMPAT)
+
+add_definitions(-DWITH_GZFILEOP)
+
+if(CMAKE_C_COMPILER_ID MATCHES "^Intel")
+  set(WARNFLAGS_DISABLE)
+elseif(MSVC)
+  # Minimum supported MSVC version is 1800 = Visual Studio 12.0/2013
+  # See also https://cmake.org/cmake/help/latest/variable/MSVC_VERSION.html
+  if(MSVC_VERSION VERSION_LESS 1800)
+    message(SEND_ERROR "Unsupported Visual Studio compiler version (requires 2013 or later).")
+  endif()
+  # TODO. ICC can be used through MSVC. I'm not sure if we'd ever see that combination
+  # (who'd use cmake from an IDE...) but checking for ICC before checking for MSVC should
+  # avoid mistakes.
+  # /Oi ?
+  set(WARNFLAGS_DISABLE)
+  if(BASEARCH_ARM_FOUND)
+      add_definitions(-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE)
+      if(NOT "${ARCH}" MATCHES "aarch64")
+          set(NEONFLAG "/arch:VFPv4")
+      endif()
+  endif()
+elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+  set(WARNFLAGS_DISABLE)
+  # Check whether -fno-lto is available
+  set(CMAKE_REQUIRED_FLAGS "-fno-lto")
+  check_c_source_compiles(
+    "int main() { return 0; }"
+    FNO_LTO_AVAILABLE FAIL_REGEX "not supported")
+  set(CMAKE_REQUIRED_FLAGS)
+  if(FNO_LTO_AVAILABLE)
+    set(ZNOLTOFLAG "-fno-lto")
+  endif()
+  if(BASEARCH_ARM_FOUND)
+    if(ARM AND NOT CMAKE_C_FLAGS MATCHES "-mfloat-abi")
+      # Auto-detect support for ARM floating point ABI
+      check_include_file(features.h HAVE_FEATURES_H)
+      if(HAVE_FEATURES_H)
+        set(CMAKE_REQUIRED_FLAGS -mfloat-abi=softfp)
+        check_c_source_compiles(
+          "#include <features.h>
+          int main() { return 0; }"
+          HAVE_FLOATABI_SOFTFP)
+        if(HAVE_FLOATABI_SOFTFP)
+          set(FLOATABI -mfloat-abi=softfp)
+        else()
+          set(CMAKE_REQUIRED_FLAGS -mfloat-abi=hard)
+          check_c_source_compiles(
+            "#include <features.h>
+            int main() { return 0; }"
+            HAVE_FLOATABI_HARD)
+          if(HAVE_FLOATABI_HARD)
+            set(FLOATABI -mfloat-abi=hard)
+          endif()
+        endif()
+        set(CMAKE_REQUIRED_FLAGS)
+      endif()
+      if(FLOATABI)
+        message(STATUS "${ZLIB_LIBRARY} ARM floating point arch: ${FLOATABI}")
+        add_compile_options(${FLOATABI})
+      else()
+        message(STATUS "${ZLIB_LIBRARY} ARM floating point arch not auto-detected")
+      endif()
+    endif()
+  endif()
+  if(FNO_LTO_AVAILABLE)
+    set(NOLTOFLAG ${ZNOLTOFLAG})
+  endif()
+  if(MINGW)
+    # Add `-Wno-pedantic-ms-format` only if the toolchain supports it
+    check_c_compiler_flag(-Wno-pedantic-ms-format HAVE_NO_PEDANTIC_MS_FORMAT)
+    if(HAVE_NO_PEDANTIC_MS_FORMAT)
+      list(APPEND WARNFLAGS_DISABLE -Wno-pedantic-ms-format)
+    endif()
+  endif()
+endif()
+
+# Force disable LTO
+set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF)
+
+# Apply warning compiler flags
+add_compile_options(${WARNFLAGS_DISABLE})
+
+# Replace optimization level 3 added by default with level 2
+if(NOT MSVC AND NOT CMAKE_C_FLAGS MATCHES "([\\/\\-]O)3")
+  string(REGEX REPLACE "([\\/\\-]O)3" "\\12"
+    CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
+endif()
+
+#
+# Check for standard/system includes
+#
+check_include_file(arm_acle.h  HAVE_ARM_ACLE_H)
+if(HAVE_ARM_ACLE_H)
+  add_definitions(-DHAVE_ARM_ACLE_H)
+endif()
+check_include_file(sys/auxv.h  HAVE_SYS_AUXV_H)
+if(HAVE_SYS_AUXV_H)
+  add_definitions(-DHAVE_SYS_AUXV_H)
+endif()
+check_include_file(sys/sdt.h   HAVE_SYS_SDT_H)
+if(HAVE_SYS_SDT_H)
+  add_definitions(-DHAVE_SYS_SDT_H)
+endif()
+check_include_file(unistd.h    HAVE_UNISTD_H)
+
+#
+# Check to see if we have large file support
+#
+set(CMAKE_REQUIRED_DEFINITIONS -D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64)
+check_type_size(off64_t OFF64_T)
+if(HAVE_OFF64_T)
+  add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64)
+else()
+  check_type_size(_off64_t _OFF64_T)
+  if(HAVE__OFF64_T)
+    add_definitions(-D_LARGEFILE64_SOURCE=1 -D__USE_LARGEFILE64)
+  else()
+    check_type_size(__off64_t __OFF64_T)
+  endif()
+endif()
+set(CMAKE_REQUIRED_DEFINITIONS) # clear variable
+
+#
+# Check for fseeko and other optional functions
+#
+check_function_exists(fseeko HAVE_FSEEKO)
+if(NOT HAVE_FSEEKO)
+  add_definitions(-DNO_FSEEKO)
+endif()
+
+check_function_exists(strerror HAVE_STRERROR)
+if(NOT HAVE_STRERROR)
+  add_definitions(-DNO_STRERROR)
+endif()
+
+set(CMAKE_REQUIRED_DEFINITIONS -D_POSIX_C_SOURCE=200112L)
+check_symbol_exists(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN)
+if(HAVE_POSIX_MEMALIGN)
+  add_definitions(-DHAVE_POSIX_MEMALIGN)
+endif()
+set(CMAKE_REQUIRED_DEFINITIONS)
+
+set(CMAKE_REQUIRED_DEFINITIONS -D_ISOC11_SOURCE=1)
+check_symbol_exists(aligned_alloc stdlib.h HAVE_ALIGNED_ALLOC)
+if(HAVE_ALIGNED_ALLOC)
+  add_definitions(-DHAVE_ALIGNED_ALLOC)
+endif()
+set(CMAKE_REQUIRED_DEFINITIONS)
+
+#
+# Check if we can hide zlib internal symbols that are linked between separate source files using hidden
+#
+check_c_source_compiles(
+  "#define Z_INTERNAL __attribute__((visibility (\"hidden\")))
+  int Z_INTERNAL foo;
+  int main() {
+      return 0;
+  }"
+  HAVE_ATTRIBUTE_VISIBILITY_HIDDEN FAIL_REGEX "visibility")
+if(HAVE_ATTRIBUTE_VISIBILITY_HIDDEN)
+  add_definitions(-DHAVE_VISIBILITY_HIDDEN)
+endif()
+
+#
+# Check if we can hide zlib internal symbols that are linked between separate source files using internal
+#
+check_c_source_compiles(
+  "#define Z_INTERNAL __attribute__((visibility (\"internal\")))
+  int Z_INTERNAL foo;
+  int main() {
+      return 0;
+  }"
+  HAVE_ATTRIBUTE_VISIBILITY_INTERNAL FAIL_REGEX "visibility")
+if(HAVE_ATTRIBUTE_VISIBILITY_INTERNAL)
+  add_definitions(-DHAVE_VISIBILITY_INTERNAL)
+endif()
+
+#
+# Check for __attribute__((aligned(x))) support in the compiler
+#
+check_c_source_compiles(
+  "int main(void) {
+      __attribute__((aligned(8))) int test = 0;
+      (void)test;
+      return 0;
+  }"
+  HAVE_ATTRIBUTE_ALIGNED FAIL_REGEX "aligned")
+if(HAVE_ATTRIBUTE_ALIGNED)
+  add_definitions(-DHAVE_ATTRIBUTE_ALIGNED)
+endif()
+
+#
+# check for __builtin_ctz() support in the compiler
+#
+check_c_source_compiles(
+  "int main(void) {
+      unsigned int zero = 0;
+      long test = __builtin_ctz(zero);
+      (void)test;
+      return 0;
+  }"
+  HAVE_BUILTIN_CTZ
+)
+if(HAVE_BUILTIN_CTZ)
+  add_definitions(-DHAVE_BUILTIN_CTZ)
+endif()
+
+#
+# check for __builtin_ctzll() support in the compiler
+#
+check_c_source_compiles(
+  "int main(void) {
+      unsigned int zero = 0;
+      long test = __builtin_ctzll(zero);
+      (void)test;
+      return 0;
+  }"
+  HAVE_BUILTIN_CTZLL
+)
+if(HAVE_BUILTIN_CTZLL)
+  add_definitions(-DHAVE_BUILTIN_CTZLL)
+endif()
+
+#
+# check for ptrdiff_t support
+#
+check_c_source_compiles(
+  "#include <stddef.h>
+    int main() {
+        ptrdiff_t *a;
+        (void)a;
+        return 0;
+  }"
+  HAVE_PTRDIFF_T
+)
+if(NOT HAVE_PTRDIFF_T)
+  set(NEED_PTRDIFF_T 1)
+
+  check_type_size("void *" SIZEOF_DATA_PTR)
+  message(STATUS "sizeof(void *) is ${SIZEOF_DATA_PTR} bytes")
+
+  if(${SIZEOF_DATA_PTR} MATCHES "4")
+    set(PTRDIFF_TYPE "uint32_t")
+  elseif(${SIZEOF_DATA_PTR} MATCHES "8")
+    set(PTRDIFF_TYPE "uint64_t")
+  else()
+    message(FATAL_ERROR "sizeof(void *) is neither 32 nor 64 bit")
+  endif()
+endif()
+
+if(MSVC)
+  add_definitions(-D_CRT_SECURE_NO_DEPRECATE)
+  add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE)
+endif()
+
+set(ZLIB_ARCH_SRCS)
+set(ZLIB_ARCH_HDRS)
+set(ARCHDIR "arch/generic")
+if(BASEARCH_X86_FOUND)
+  set(ARCHDIR "arch/x86")
+endif()
+if(BASEARCH_ARM_FOUND)
+  set(ARCHDIR "arch/arm")
+endif()
+if(BASEARCH_PPC_FOUND)
+  set(ARCHDIR "arch/power")
+endif()
+if(BASEARCH_RISCV_FOUND)
+  set(ARCHDIR "arch/riscv")
+endif()
+
+if(NOT CV_DISABLE_OPTIMIZATION)
+  if(BASEARCH_ARM_FOUND)
+    add_definitions(-DARM_FEATURES)
+    if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
+      if("${ARCH}" MATCHES "aarch64")
+        check_c_source_compiles(
+          "#include <sys/auxv.h>
+          int main() {
+              return (getauxval(AT_HWCAP) & HWCAP_CRC32);
+          }"
+          ARM_AUXV_HAS_CRC32
+        )
+        if(ARM_AUXV_HAS_CRC32)
+          add_definitions(-DARM_AUXV_HAS_CRC32)
+        else()
+          message(STATUS "HWCAP_CRC32 not present in sys/auxv.h; cannot detect support at runtime.")
+        endif()
+      else()
+        check_c_source_compiles(
+          "#include <sys/auxv.h>
+          int main() {
+              return (getauxval(AT_HWCAP2) & HWCAP2_CRC32);
+          }"
+          ARM_AUXV_HAS_CRC32
+        )
+        if(ARM_AUXV_HAS_CRC32)
+          add_definitions(-DARM_AUXV_HAS_CRC32)
+        else()
+          check_c_source_compiles(
+            "#include <sys/auxv.h>
+            #include <asm/hwcap.h>
+            int main() {
+                return (getauxval(AT_HWCAP2) & HWCAP2_CRC32);
+            }"
+            ARM_HWCAP_HAS_CRC32
+          )
+          if(ARM_HWCAP_HAS_CRC32)
+            add_definitions(-DARM_AUXV_HAS_CRC32 -DARM_ASM_HWCAP)
+          else()
+            message(STATUS "HWCAP2_CRC32 not present in sys/auxv.h; cannot detect support at runtime.")
+          endif()
+        endif()
+        check_c_source_compiles(
+          "#include <sys/auxv.h>
+          int main() {
+            return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON);
+          }"
+          ARM_AUXV_HAS_NEON
+        )
+        if(ARM_AUXV_HAS_NEON)
+          add_definitions(-DARM_AUXV_HAS_NEON)
+        else()
+          check_c_source_compiles(
+            "#include <sys/auxv.h>
+            int main() {
+              return (getauxval(AT_HWCAP) & HWCAP_NEON);
+            }"
+            ARM_AUXV_HAS_NEON
+          )
+          if (ARM_AUXV_HAS_NEON)
+            add_definitions(-DARM_AUXV_HAS_NEON)
+          else()
+            message(STATUS "Neither HWCAP_ARM_NEON or HWCAP_NEON present in sys/auxv.h; cannot detect support at runtime.")
+          endif()
+        endif()
+      endif()
+    endif()
+    list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm_features.h)
+    list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/arm_features.c)
+    if(WITH_ACLE)
+      check_acle_compiler_flag()
+      if(HAVE_ACLE_FLAG)
+        add_definitions(-DARM_ACLE)
+        set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c)
+        set_property(SOURCE ${ACLE_SRCS} PROPERTY COMPILE_FLAGS "${ACLEFLAG} ${NOLTOFLAG}")
+        list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS})
+      else()
+        set(WITH_ACLE OFF)
+      endif()
+    else()
+      set(WITH_ACLE OFF)
+    endif()
+    if(WITH_NEON)
+      check_neon_compiler_flag()
+      if(NEON_AVAILABLE)
+        add_definitions(-DARM_NEON)
+        set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c
+          ${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c)
+        list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS})
+        set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}")
+        if(MSVC)
+          add_definitions(-D__ARM_NEON__)
+        endif()
+        check_neon_ld4_intrinsics()
+        if(NEON_HAS_LD4)
+          add_definitions(-DARM_NEON_HASLD4)
+        endif()
+      else()
+        set(WITH_NEON OFF)
+      endif()
+    endif()
+    if(WITH_ARMV6)
+      check_armv6_compiler_flag()
+      if(HAVE_ARMV6_INLINE_ASM OR HAVE_ARMV6_INTRIN)
+        add_definitions(-DARM_SIMD)
+        set(ARMV6_SRCS ${ARCHDIR}/slide_hash_armv6.c)
+        set_property(SOURCE ${ARMV6_SRCS} PROPERTY COMPILE_FLAGS "${ARMV6FLAG} ${NOLTOFLAG}")
+        list(APPEND ZLIB_ARCH_SRCS ${ARMV6_SRCS})
+        if(HAVE_ARMV6_INTRIN)
+          add_definitions(-DARM_SIMD_INTRIN)
+        endif()
+      else()
+        set(WITH_ARMV6 OFF)
+      endif()
+    else()
+      set(WITH_ARMV6 OFF)
+    endif()
+  endif()
+  if(BASEARCH_PPC_FOUND)
+    # Common arch detection code
+    if(WITH_ALTIVEC)
+      check_ppc_intrinsics()
+    endif()
+    if(WITH_POWER8)
+      check_power8_intrinsics()
+    endif()
+    if(WITH_POWER9)
+      check_power9_intrinsics()
+    endif()
+    if(HAVE_VMX OR HAVE_POWER8_INTRIN OR HAVE_POWER9_INTRIN)
+      add_definitions(-DPOWER_FEATURES)
+      list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power_features.h)
+      list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power_features.c)
+    endif()
+    # VMX specific options and files
+    if(WITH_ALTIVEC)
+      if(HAVE_VMX)
+        add_definitions(-DPPC_FEATURES)
+        if(HAVE_ALTIVEC)
+          add_definitions(-DPPC_VMX)
+          set(PPC_SRCS ${ARCHDIR}/adler32_vmx.c ${ARCHDIR}/slide_hash_vmx.c)
+          list(APPEND ZLIB_ARCH_SRCS ${PPC_SRCS})
+          set_property(SOURCE ${PPC_SRCS} PROPERTY COMPILE_FLAGS "${PPCFLAGS}")
+        else()
+          set(WITH_ALTIVEC OFF)
+        endif()
+      endif()
+    endif()
+    # Power8 specific options and files
+    if(WITH_POWER8)
+      if(HAVE_POWER8_INTRIN)
+        add_definitions(-DPOWER8_VSX)
+        set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_hash_power8.c)
+        if("${ARCH}" MATCHES "powerpc64(le)?")
+          add_definitions(-DPOWER8_VSX_CRC32)
+          list(APPEND POWER8_SRCS ${ARCHDIR}/crc32_power8.c)
+        endif()
+        list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS})
+        set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}")
+      else()
+        set(WITH_POWER8 OFF)
+      endif()
+    endif()
+    # Power9 specific options and files
+    if(WITH_POWER9)
+      if(HAVE_POWER9_INTRIN)
+        add_definitions(-DPOWER9)
+        set(POWER9_SRCS ${ARCHDIR}/compare256_power9.c)
+        list(APPEND ZLIB_ARCH_SRCS ${POWER9_SRCS})
+        set_property(SOURCE ${POWER9_SRCS} PROPERTY COMPILE_FLAGS "${POWER9FLAG} ${NOLTOFLAG}")
+      else()
+        set(WITH_POWER9 OFF)
+      endif()
+    endif()
+  endif()
+  if(BASEARCH_RISCV_FOUND)
+    if(WITH_RVV)
+      check_rvv_intrinsics()
+      if(HAVE_RVV_INTRIN)
+        add_definitions(-DRISCV_FEATURES)
+        add_definitions(-DRISCV_RVV)
+        list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_features.h)
+        list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c)
+        # FIXME: we will not set compile flags for riscv_features.c when
+        # the kernels update hwcap or hwprobe for riscv
+        set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/chunkset_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c)
+        list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS})
+        set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}")
+      else()
+        set(WITH_RVV OFF)
+      endif()
+    endif()
+  endif()
+  if(BASEARCH_X86_FOUND)
+    add_definitions(-DX86_FEATURES)
+    list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86_features.h)
+    list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86_features.c)
+    if(MSVC)
+      list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
+    endif()
+    if(WITH_AVX2)
+      check_avx2_intrinsics()
+      if(HAVE_AVX2_INTRIN)
+        add_definitions(-DX86_AVX2)
+        set(AVX2_SRCS ${ARCHDIR}/slide_hash_avx2.c)
+        list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx2.c)
+        list(APPEND AVX2_SRCS ${ARCHDIR}/compare256_avx2.c)
+        list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx2.c)
+        list(APPEND ZLIB_ARCH_SRCS ${AVX2_SRCS})
+        set_property(SOURCE ${AVX2_SRCS} PROPERTY COMPILE_FLAGS "${AVX2FLAG} ${NOLTOFLAG}")
+      else()
+        set(WITH_AVX2 OFF)
+      endif()
+    endif()
+    if(WITH_AVX512)
+      check_avx512_intrinsics()
+      if(HAVE_AVX512_INTRIN)
+        add_definitions(-DX86_AVX512)
+        list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
+        list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
+        list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
+        if(HAVE_MASK_INTRIN)
+          add_definitions(-DX86_MASK_INTRIN)
+        endif()
+        set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
+      else()
+        set(WITH_AVX512 OFF)
+      endif()
+    endif()
+    if(WITH_AVX512VNNI)
+      check_avx512vnni_intrinsics()
+      if(HAVE_AVX512VNNI_INTRIN)
+        add_definitions(-DX86_AVX512VNNI)
+        list(APPEND AVX512VNNI_SRCS ${ARCHDIR}/adler32_avx512_vnni.c)
+        list(APPEND ZLIB_ARCH_SRCS ${AVX512VNNI_SRCS})
+        set_property(SOURCE ${AVX512VNNI_SRCS} PROPERTY COMPILE_FLAGS "${AVX512VNNIFLAG} ${NOLTOFLAG}")
+      else()
+        set(WITH_AVX512VNNI OFF)
+      endif()
+    endif()
+    if(WITH_SSE42)
+      check_sse42_intrinsics()
+      if(HAVE_SSE42_INTRIN)
+        add_definitions(-DX86_SSE42)
+        set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c)
+        list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
+        set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
+      else()
+        set(WITH_SSE42 OFF)
+      endif()
+    endif()
+    if(WITH_SSE2)
+      check_sse2_intrinsics()
+      if(HAVE_SSE2_INTRIN)
+        add_definitions(-DX86_SSE2)
+        set(SSE2_SRCS ${ARCHDIR}/chunkset_sse2.c ${ARCHDIR}/compare256_sse2.c ${ARCHDIR}/slide_hash_sse2.c)
+        list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS})
+        if(NOT ${ARCH} MATCHES "x86_64")
+          set_property(SOURCE ${SSE2_SRCS} PROPERTY COMPILE_FLAGS "${SSE2FLAG} ${NOLTOFLAG}")
+          add_definitions(-DX86_NOCHECK_SSE2)
+        endif()
+      else()
+        set(WITH_SSE2 OFF)
+      endif()
+    endif()
+    if(WITH_SSSE3)
+      check_ssse3_intrinsics()
+      if(HAVE_SSSE3_INTRIN)
+        add_definitions(-DX86_SSSE3)
+        set(SSSE3_SRCS ${ARCHDIR}/adler32_ssse3.c ${ARCHDIR}/chunkset_ssse3.c)
+        list(APPEND ZLIB_ARCH_SRCS ${SSSE3_SRCS})
+        set_property(SOURCE ${SSSE3_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${NOLTOFLAG}")
+      else()
+        set(WITH_SSSE3 OFF)
+      endif()
+    endif()
+    if(WITH_PCLMULQDQ AND WITH_SSSE3 AND WITH_SSE42)
+      check_pclmulqdq_intrinsics()
+      if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN)
+        add_definitions(-DX86_PCLMULQDQ_CRC)
+        set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c)
+        list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
+        set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
+
+        if(WITH_VPCLMULQDQ AND WITH_AVX512)
+          check_vpclmulqdq_intrinsics()
+          if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN)
+            add_definitions(-DX86_VPCLMULQDQ_CRC)
+            set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c)
+            list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS})
+            set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
+          else()
+            set(WITH_VPCLMULQDQ OFF)
+          endif()
+        else()
+          set(WITH_VPCLMULQDQ OFF)
+        endif()
+      else()
+        set(WITH_PCLMULQDQ OFF)
+        set(WITH_VPCLMULQDQ OFF)
+      endif()
+    else()
+      set(WITH_PCLMULQDQ OFF)
+      set(WITH_VPCLMULQDQ OFF)
+    endif()
+    check_xsave_intrinsics()
+    if(HAVE_XSAVE_INTRIN)
+      set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}")
+    endif()
+  endif()
+endif()
+
+#============================================================================
+# zconf.h
+#============================================================================
+
+macro(generate_cmakein input output)
+  file(REMOVE ${output})
+  file(STRINGS ${input} _lines)
+  foreach(_line IN LISTS _lines)
+    string(REGEX REPLACE "#ifdef HAVE_UNISTD_H.*" "@ZCONF_UNISTD_LINE@" _line "${_line}")
+    string(REGEX REPLACE "#ifdef NEED_PTRDIFF_T.*" "@ZCONF_PTRDIFF_LINE@" _line "${_line}")
+    if(NEED_PTRDIFF_T)
+      string(REGEX REPLACE "typedef PTRDIFF_TYPE" "typedef @PTRDIFF_TYPE@" _line "${_line}")
+    endif()
+    file(APPEND ${output} "${_line}\n")
+  endforeach()
+endmacro(generate_cmakein)
+
+generate_cmakein( ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h.in ${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein )
+
+#============================================================================
+# zlib
+#============================================================================
+
+set(ZLIB_PUBLIC_HDRS
+    ${CMAKE_CURRENT_BINARY_DIR}/zconf.h
+    ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling.h
+    ${CMAKE_CURRENT_BINARY_DIR}/zlib.h
+)
+set(ZLIB_PRIVATE_HDRS
+    adler32_p.h
+    chunkset_tpl.h
+    compare256_rle.h
+    cpu_features.h
+    crc32_braid_p.h
+    crc32_braid_comb_p.h
+    crc32_braid_tbl.h
+    crc32_fold.h
+    deflate.h
+    deflate_p.h
+    functable.h
+    inffast_tpl.h
+    inffixed_tbl.h
+    inflate.h
+    inflate_p.h
+    inftrees.h
+    insert_string_tpl.h
+    match_tpl.h
+    trees.h
+    trees_emit.h
+    trees_tbl.h
+    zbuild.h
+    zendian.h
+    zutil.h
+)
+set(ZLIB_SRCS
+    adler32.c
+    adler32_fold.c
+    chunkset.c
+    compare256.c
+    compress.c
+    cpu_features.c
+    crc32_braid.c
+    crc32_braid_comb.c
+    crc32_fold.c
+    deflate.c
+    deflate_fast.c
+    deflate_huff.c
+    deflate_medium.c
+    deflate_quick.c
+    deflate_rle.c
+    deflate_slow.c
+    deflate_stored.c
+    functable.c
+    infback.c
+    inflate.c
+    inftrees.c
+    insert_string.c
+    insert_string_roll.c
+    slide_hash.c
+    trees.c
+    uncompr.c
+    zutil.c
+)
+
+set(ZLIB_GZFILE_PRIVATE_HDRS
+    gzguts.h
+)
+set(ZLIB_GZFILE_SRCS
+    gzlib.c
+    ${CMAKE_CURRENT_BINARY_DIR}/gzread.c
+    gzwrite.c
+)
+
+set(ZLIB_ALL_SRCS ${ZLIB_SRCS} ${ZLIB_ARCH_HDRS} ${ZLIB_ARCH_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS})
+list(APPEND ZLIB_ALL_SRCS ${ZLIB_GZFILE_PRIVATE_HDRS} ${ZLIB_GZFILE_SRCS})
+
+add_library(zlib STATIC ${ZLIB_ALL_SRCS})
+
+target_include_directories(zlib PUBLIC
+  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR};${CMAKE_CURRENT_SOURCE_DIR}>"
+  "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
+
+if(HAVE_UNISTD_H)
+  SET(ZCONF_UNISTD_LINE "#if 1    /* was set to #if 1 by configure/cmake/etc */")
+else()
+  SET(ZCONF_UNISTD_LINE "#if 0    /* was set to #if 0 by configure/cmake/etc */")
+endif()
+if(NEED_PTRDIFF_T)
+  SET(ZCONF_PTRDIFF_LINE "#if 1    /* was set to #if 1 by configure/cmake/etc */")
+else()
+  SET(ZCONF_PTRDIFF_LINE "#ifdef NEED_PTRDIFF_T    /* may be set to #if 1 by configure/cmake/etc */")
+endif()
+
+configure_file(${CMAKE_CURRENT_BINARY_DIR}/zconf.h.cmakein
+  ${CMAKE_CURRENT_BINARY_DIR}/zconf.h @ONLY)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib.h.in
+  ${CMAKE_CURRENT_BINARY_DIR}/zlib.h @ONLY)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gzread.c.in
+  ${CMAKE_CURRENT_BINARY_DIR}/gzread.c @ONLY)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib_name_mangling.h.empty
+  ${CMAKE_CURRENT_BINARY_DIR}/zlib_name_mangling${SUFFIX}.h COPYONLY)
+
+ocv_warnings_disable(CMAKE_C_FLAGS -Wmissing-prototypes
+  -Wundef
+  -Wmissing-declarations
+)
+
+set_target_properties(${ZLIB_LIBRARY} PROPERTIES
+  OUTPUT_NAME ${ZLIB_LIBRARY}
+  DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
+  COMPILE_PDB_NAME ${ZLIB_LIBRARY}
+  COMPILE_PDB_NAME_DEBUG "${ZLIB_LIBRARY}${OPENCV_DEBUG_POSTFIX}"
+  ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH}
+)
+
+if(ENABLE_SOLUTION_FOLDERS)
+  set_target_properties(${ZLIB_LIBRARY} PROPERTIES FOLDER "3rdparty")
+endif()
+
+if(NOT BUILD_SHARED_LIBS)
+  ocv_install_target(${ZLIB_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
+endif()
+
+ocv_install_3rdparty_licenses(${ZLIB_LIBRARY} LICENSE.md)
diff --git a/3rdparty/zlib-ng/LICENSE.md b/3rdparty/zlib-ng/LICENSE.md
new file mode 100644
index 000000000000..adb48d47296b
--- /dev/null
+++ b/3rdparty/zlib-ng/LICENSE.md
@@ -0,0 +1,19 @@
+(C) 1995-2013 Jean-loup Gailly and Mark Adler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source distribution.
diff --git a/3rdparty/zlib-ng/README.md b/3rdparty/zlib-ng/README.md
new file mode 100644
index 000000000000..4f9fe09c6911
--- /dev/null
+++ b/3rdparty/zlib-ng/README.md
@@ -0,0 +1,229 @@
+| CI | Stable | Develop |
+|:---|:-------|:--------|
+| GitHub Actions | [![Stable CMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml?query=branch%3Astable) <br> [![Stable Configure](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml?query=branch%3Astable) <br> [![Stable NMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml/badge.svg?branch=stable)](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml?query=branch%3Astable) | [![Develop CMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/cmake.yml?query=branch%3Adevelop) <br> [![Develop Configure](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/configure.yml?query=branch%3Adevelop) <br> [![Develop NMake](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml/badge.svg?branch=develop)](https://github.com/zlib-ng/zlib-ng/actions/workflows/nmake.yml?query=branch%3Adevelop) |
+| CodeFactor     | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/stable)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/stable) | [![CodeFactor](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/badge/develop)](https://www.codefactor.io/repository/github/zlib-ng/zlib-ng/overview/develop) |
+| OSS-Fuzz       | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) | [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/zlib-ng.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:zlib-ng) |
+| Codecov        | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/stable/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/stable) | [![codecov](https://codecov.io/github/zlib-ng/zlib-ng/branch/develop/graph/badge.svg?token=uKsgK9LIuC)](https://codecov.io/github/zlib-ng/zlib-ng/tree/develop) |
+
+## zlib-ng
+*zlib data compression library for the next generation systems*
+
+Maintained by Hans Kristian Rosbach
+          aka Dead2 (zlib-ng àt circlestorm dót org)
+
+Features
+--------
+
+* Zlib compatible API with support for dual-linking
+* Modernized native API based on zlib API for ease of porting
+* Modern C11 syntax and a clean code layout
+* Deflate medium and quick algorithms based on Intel’s zlib fork
+* Support for CPU intrinsics when available
+  * Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
+  * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
+  * Hash table implementation using CRC32-C intrinsics on x86 and ARM
+  * Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX
+  * Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV
+  * Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
+  * Support for hardware-accelerated deflate using IBM Z DFLTCC
+* Unaligned memory read/writes and large bit buffer improvements
+* Includes improvements from Cloudflare and Intel forks
+* Configure, CMake, and NMake build system support
+* Comprehensive set of CMake unit tests
+* Code sanitizers, fuzzing, and coverage
+* GitHub Actions continuous integration on Windows, macOS, and Linux
+  * Emulated CI for ARM, AARCH64, PPC, PPC64, RISCV, SPARC64, S390x using qemu
+
+
+History
+-------
+
+The motivation for this fork was seeing several 3rd party contributions with new optimizations not getting
+implemented into the official zlib repository.
+
+Mark Adler has been maintaining zlib for a very long time, and he has done a great job and hopefully he will continue
+for a long time yet. The idea of zlib-ng is not to replace zlib, but to co-exist as a drop-in replacement with a
+lower threshold for code change.
+
+zlib has a long history and is incredibly portable, even supporting many systems that predate the Internet.<br>
+That is great, but it can complicate further development and maintainability. The zlib code contains many workarounds
+for really old compilers or to accommodate systems with limitations such as operating in a 16-bit environment.
+
+Many of these workarounds are only maintenance burdens, some of them are pretty huge code-wise. With many workarounds
+cluttered throughout the code, it makes it harder for new programmers with an idea/interest for zlib to contribute.
+
+I decided to make a fork, merge all the Intel optimizations, some of the Cloudflare optimizations, plus a couple other
+smaller patches. Then started cleaning out workarounds, various dead code, all contrib and example code.<br>
+The result is a better performing and easier to maintain zlib-ng.
+
+A lot of improvements have gone into zlib-ng since its start, and numerous people and companies have contributed both
+small and big improvements, or valuable testing.
+
+
+Build
+-----
+<sup>Please read LICENSE.md, it is very simple and very liberal.</sup>
+
+There are two ways to build zlib-ng:
+
+### Cmake
+
+To build zlib-ng using the cross-platform makefile generator cmake.
+
+```
+cmake .
+cmake --build . --config Release
+ctest --verbose -C Release
+```
+
+Alternatively, you can use the cmake configuration GUI tool ccmake:
+
+```
+ccmake .
+```
+
+### Configure
+
+To build zlib-ng using the bash configure script:
+
+```
+./configure
+make
+make test
+```
+
+Build Options
+-------------
+
+| CMake                    | configure                | Description                                                                           | Default |
+|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------|
+| ZLIB_COMPAT              | --zlib-compat            | Compile with zlib compatible API                                                      | OFF     |
+| ZLIB_ENABLE_TESTS        |                          | Build test binaries                                                                   | ON      |
+| WITH_GZFILEOP            | --without-gzfileops      | Compile with support for gzFile related functions                                     | ON      |
+| WITH_OPTIM               | --without-optimizations  | Build with optimisations                                                              | ON      |
+| WITH_NEW_STRATEGIES      | --without-new-strategies | Use new strategies                                                                    | ON      |
+| WITH_NATIVE_INSTRUCTIONS |                          | Compiles with full instruction set supported on this host (gcc/clang -march=native)   | OFF     |
+| WITH_SANITIZER           |                          | Build with sanitizer (memory, address, undefined)                                     | OFF     |
+| WITH_GTEST               |                          | Build gtest_zlib                                                                      | ON      |
+| WITH_FUZZERS             |                          | Build test/fuzz                                                                       | OFF     |
+| WITH_BENCHMARKS          |                          | Build test/benchmarks                                                                 | OFF     |
+| WITH_MAINTAINER_WARNINGS |                          | Build with project maintainer warnings                                                | OFF     |
+| WITH_CODE_COVERAGE       |                          | Enable code coverage reporting                                                        | OFF     |
+
+
+Install
+-------
+
+WARNING: We do not recommend manually installing unless you really know what you are doing, because this can
+potentially override the system default zlib library, and any incompatibility or wrong configuration of zlib-ng
+can make the whole system unusable, requiring recovery or reinstall.
+If you still want a manual install, we recommend using the /opt/ path prefix.
+
+For Linux distros, an alternative way to use zlib-ng (if compiled in zlib-compat mode) instead of zlib, is through
+the use of the _LD_PRELOAD_ environment variable. If the program is dynamically linked with zlib, then the program
+will temporarily attempt to use zlib-ng instead, without risking system-wide instability.
+
+```
+LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.13.zlib-ng /usr/bin/program
+```
+
+### Cmake
+
+To install zlib-ng system-wide using cmake:
+
+```sh or powershell
+cmake --build . --target install
+```
+
+### Configure
+
+To install zlib-ng system-wide using the configure script:
+
+```sh
+make install
+```
+
+### CPack
+
+After building with cmake, an installation package can be created using cpack. By default a tgz package is created,
+but you can append `-G <format>` to each command to generate alternative packages types (TGZ, ZIP, RPM, DEB). To easily
+create a rpm or deb package, you would use `-G RPM` or `-G DEB` respectively.
+
+```sh or powershell
+cd build
+cpack --config CPackConfig.cmake
+cpack --config CPackSourceConfig.cmake
+```
+
+### Vcpkg
+
+Alternatively, you can build and install zlib-ng using the [vcpkg](https://github.com/Microsoft/vcpkg/) dependency manager:
+
+```sh or powershell
+git clone https://github.com/Microsoft/vcpkg.git
+cd vcpkg
+./bootstrap-vcpkg.sh # "./bootstrap-vcpkg.bat" for powershell
+./vcpkg integrate install
+./vcpkg install zlib-ng
+```
+
+The zlib-ng port in vcpkg is kept up to date by Microsoft team members and community contributors.
+If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
+
+Contributing
+------------
+
+Zlib-ng is aiming to be open to contributions, and we would be delighted to receive pull requests on github.
+Help with testing and reviewing pull requests etc is also very much appreciated.
+
+Please check the Wiki for more info: [Contributing](https://github.com/zlib-ng/zlib-ng/wiki/Contributing)
+
+Acknowledgments
+----------------
+
+Thanks go out to all the people and companies who have taken the time to contribute
+code reviews, testing and/or patches. Zlib-ng would not have been nearly as good without you.
+
+The deflate format used by zlib was defined by Phil Katz.<br>
+The deflate and zlib specifications were written by L. Peter Deutsch.
+
+zlib was originally created by Jean-loup Gailly (compression) and Mark Adler (decompression).
+
+
+Advanced Build Options
+----------------------
+
+| CMake                           | configure             | Description                                                         | Default                |
+|:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------|
+| FORCE_SSE2                      | --force-sse2          | Skip runtime check for SSE2 instructions (Always on for x86_64)     | OFF (x86)              |
+| WITH_AVX2                       |                       | Build with AVX2 intrinsics                                          | ON                     |
+| WITH_AVX512                     |                       | Build with AVX512 intrinsics                                        | ON                     |
+| WITH_AVX512VNNI                 |                       | Build with AVX512VNNI intrinsics                                    | ON                     |
+| WITH_SSE2                       |                       | Build with SSE2 intrinsics                                          | ON                     |
+| WITH_SSSE3                      |                       | Build with SSSE3 intrinsics                                         | ON                     |
+| WITH_SSE42                      |                       | Build with SSE42 intrinsics                                         | ON                     |
+| WITH_PCLMULQDQ                  |                       | Build with PCLMULQDQ intrinsics                                     | ON                     |
+| WITH_VPCLMULQDQ                 | --without-vpclmulqdq  | Build with VPCLMULQDQ intrinsics                                    | ON                     |
+| WITH_ACLE                       | --without-acle        | Build with ACLE intrinsics                                          | ON                     |
+| WITH_NEON                       | --without-neon        | Build with NEON intrinsics                                          | ON                     |
+| WITH_ARMV6                      | --without-armv6       | Build with ARMv6 intrinsics                                         | ON                     |
+| WITH_ALTIVEC                    | --without-altivec     | Build with AltiVec (VMX) intrinsics                                 | ON                     |
+| WITH_POWER8                     | --without-power8      | Build with POWER8 optimisations                                     | ON                     |
+| WITH_RVV                        |                       | Build with RVV intrinsics                                           | ON                     |
+| WITH_CRC32_VX                   | --without-crc32-vx    | Build with vectorized CRC32 on IBM Z                                | ON                     |
+| WITH_DFLTCC_DEFLATE             | --with-dfltcc-deflate | Build with DFLTCC intrinsics for compression on IBM Z               | OFF                    |
+| WITH_DFLTCC_INFLATE             | --with-dfltcc-inflate | Build with DFLTCC intrinsics for decompression on IBM Z             | OFF                    |
+| WITH_UNALIGNED                  | --without-unaligned   | Allow optimizations that use unaligned reads if safe on current arch| ON                     |
+| WITH_INFLATE_STRICT             |                       | Build with strict inflate distance checking                         | OFF                    |
+| WITH_INFLATE_ALLOW_INVALID_DIST |                       | Build with zero fill for inflate invalid distances                  | OFF                    |
+| INSTALL_UTILS                   |                       | Copy minigzip and minideflate during install                        | OFF                    |
+| ZLIBNG_ENABLE_TESTS             |                       | Test zlib-ng specific API                                           | ON                     |
+
+
+Related Projects
+----------------
+
+* Fork of the popular minizip                   https://github.com/zlib-ng/minizip-ng
+* Python tool to benchmark minigzip/minideflate https://github.com/zlib-ng/deflatebench
+* Python tool to benchmark pigz                 https://github.com/zlib-ng/pigzbench
+* 3rd party patches for zlib-ng compatibility   https://github.com/zlib-ng/patches
diff --git a/3rdparty/zlib-ng/adler32.c b/3rdparty/zlib-ng/adler32.c
new file mode 100644
index 000000000000..95ac13c3046b
--- /dev/null
+++ b/3rdparty/zlib-ng/adler32.c
@@ -0,0 +1,115 @@
+/* adler32.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "functable.h"
+#include "adler32_p.h"
+
+/* ========================================================================= */
+Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t sum2;
+    unsigned n;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(buf == NULL))
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_len_16(adler, buf, len, sum2);
+
+    /* do length NMAX blocks -- requires just one modulo operation */
+    while (len >= NMAX) {
+        len -= NMAX;
+#ifdef UNROLL_MORE
+        n = NMAX / 16;          /* NMAX is divisible by 16 */
+#else
+        n = NMAX / 8;           /* NMAX is divisible by 8 */
+#endif
+        do {
+#ifdef UNROLL_MORE
+            DO16(adler, sum2, buf);          /* 16 sums unrolled */
+            buf += 16;
+#else
+            DO8(adler, sum2, buf, 0);         /* 8 sums unrolled */
+            buf += 8;
+#endif
+        } while (--n);
+        adler %= BASE;
+        sum2 %= BASE;
+    }
+
+    /* do remaining bytes (less than NMAX, still just one modulo) */
+    return adler32_len_64(adler, buf, len, sum2);
+}
+
+#ifdef ZLIB_COMPAT
+unsigned long Z_EXPORT PREFIX(adler32_z)(unsigned long adler, const unsigned char *buf, size_t len) {
+    return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
+}
+#else
+uint32_t Z_EXPORT PREFIX(adler32_z)(uint32_t adler, const unsigned char *buf, size_t len) {
+    return functable.adler32(adler, buf, len);
+}
+#endif
+
+/* ========================================================================= */
+#ifdef ZLIB_COMPAT
+unsigned long Z_EXPORT PREFIX(adler32)(unsigned long adler, const unsigned char *buf, unsigned int len) {
+    return (unsigned long)functable.adler32((uint32_t)adler, buf, len);
+}
+#else
+uint32_t Z_EXPORT PREFIX(adler32)(uint32_t adler, const unsigned char *buf, uint32_t len) {
+    return functable.adler32(adler, buf, len);
+}
+#endif
+
+/* ========================================================================= */
+static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2) {
+    uint32_t sum1;
+    uint32_t sum2;
+    unsigned rem;
+
+    /* for negative len, return invalid adler32 as a clue for debugging */
+    if (len2 < 0)
+        return 0xffffffff;
+
+    /* the derivation of this formula is left as an exercise for the reader */
+    len2 %= BASE;                 /* assumes len2 >= 0 */
+    rem = (unsigned)len2;
+    sum1 = adler1 & 0xffff;
+    sum2 = rem * sum1;
+    sum2 %= BASE;
+    sum1 += (adler2 & 0xffff) + BASE - 1;
+    sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
+    if (sum1 >= BASE) sum1 -= BASE;
+    if (sum1 >= BASE) sum1 -= BASE;
+    if (sum2 >= ((unsigned long)BASE << 1)) sum2 -= ((unsigned long)BASE << 1);
+    if (sum2 >= BASE) sum2 -= BASE;
+    return sum1 | (sum2 << 16);
+}
+
+/* ========================================================================= */
+#ifdef ZLIB_COMPAT
+unsigned long Z_EXPORT PREFIX(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off_t len2) {
+    return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2);
+}
+
+unsigned long Z_EXPORT PREFIX4(adler32_combine)(unsigned long adler1, unsigned long adler2, z_off64_t len2) {
+    return (unsigned long)adler32_combine_((uint32_t)adler1, (uint32_t)adler2, len2);
+}
+#else
+uint32_t Z_EXPORT PREFIX4(adler32_combine)(uint32_t adler1, uint32_t adler2, z_off64_t len2) {
+    return adler32_combine_(adler1, adler2, len2);
+}
+#endif
diff --git a/3rdparty/zlib-ng/adler32_fold.c b/3rdparty/zlib-ng/adler32_fold.c
new file mode 100644
index 000000000000..e2f6f9ac7dd2
--- /dev/null
+++ b/3rdparty/zlib-ng/adler32_fold.c
@@ -0,0 +1,16 @@
+/* adler32_fold.c -- adler32 folding interface
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "functable.h"
+#include "adler32_fold.h"
+
+#include <limits.h>
+
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    adler = functable.adler32(adler, src, len);
+    memcpy(dst, src, len);
+    return adler;
+}
diff --git a/3rdparty/zlib-ng/adler32_fold.h b/3rdparty/zlib-ng/adler32_fold.h
new file mode 100644
index 000000000000..20aa1c7400b7
--- /dev/null
+++ b/3rdparty/zlib-ng/adler32_fold.h
@@ -0,0 +1,11 @@
+/* adler32_fold.h -- adler32 folding interface
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_FOLD_H_
+#define ADLER32_FOLD_H_
+
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+
+#endif
diff --git a/3rdparty/zlib-ng/adler32_p.h b/3rdparty/zlib-ng/adler32_p.h
new file mode 100644
index 000000000000..38ba2ad72149
--- /dev/null
+++ b/3rdparty/zlib-ng/adler32_p.h
@@ -0,0 +1,70 @@
+/* adler32_p.h -- Private inline functions and macros shared with
+ *                different computation of the Adler-32 checksum
+ *                of a data stream.
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_P_H
+#define ADLER32_P_H
+
+#define BASE 65521U     /* largest prime smaller than 65536 */
+#define NMAX 5552
+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+
+#define DO1(sum1, sum2, buf, i)  {(sum1) += buf[(i)]; (sum2) += (sum1);}
+#define DO2(sum1, sum2, buf, i)  {DO1(sum1, sum2, buf, i); DO1(sum1, sum2, buf, i+1);}
+#define DO4(sum1, sum2, buf, i)  {DO2(sum1, sum2, buf, i); DO2(sum1, sum2, buf, i+2);}
+#define DO8(sum1, sum2, buf, i)  {DO4(sum1, sum2, buf, i); DO4(sum1, sum2, buf, i+4);}
+#define DO16(sum1, sum2, buf)    {DO8(sum1, sum2, buf, 0); DO8(sum1, sum2, buf, 8);}
+
+static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_t sum2) {
+    adler += buf[0];
+    adler %= BASE;
+    sum2 += adler;
+    sum2 %= BASE;
+    return adler | (sum2 << 16);
+}
+
+static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
+    while (len) {
+        --len;
+        adler += *buf++;
+        sum2 += adler;
+    }
+    adler %= BASE;
+    sum2 %= BASE;            /* only added so many BASE's */
+    /* return recombined sums */
+    return adler | (sum2 << 16);
+}
+
+static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) {
+    while (len--) {
+        *dst = *buf++;
+        adler += *dst++;
+        sum2 += adler;
+    }
+    adler %= BASE;
+    sum2 %= BASE;            /* only added so many BASE's */
+    /* return recombined sums */
+    return adler | (sum2 << 16);
+}
+
+static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
+#ifdef UNROLL_MORE
+    while (len >= 16) {
+        len -= 16;
+        DO16(adler, sum2, buf);
+        buf += 16;
+#else
+    while (len >= 8) {
+        len -= 8;
+        DO8(adler, sum2, buf, 0);
+        buf += 8;
+#endif
+    }
+    /* Process tail (len < 16).  */
+    return adler32_len_16(adler, buf, len, sum2);
+}
+
+#endif /* ADLER32_P_H */
diff --git a/3rdparty/zlib-ng/arch/.gitignore b/3rdparty/zlib-ng/arch/.gitignore
new file mode 100644
index 000000000000..2c3af0a08cbd
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/.gitignore
@@ -0,0 +1,2 @@
+# ignore Makefiles; they're all automatically generated
+Makefile
diff --git a/3rdparty/zlib-ng/arch/arm/Makefile.in b/3rdparty/zlib-ng/arch/arm/Makefile.in
new file mode 100644
index 000000000000..9d05b00b54ed
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/arm/Makefile.in
@@ -0,0 +1,85 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+ACLEFLAG=
+NEONFLAG=
+ARMV6FLAG=
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: \
+	adler32_neon.o adler32_neon.lo \
+	arm_features.o arm_features.lo \
+	chunkset_neon.o chunkset_neon.lo \
+	compare256_neon.o compare256_neon.lo \
+	crc32_acle.o crc32_acle.lo \
+	slide_hash_neon.o slide_hash_neon.lo \
+	slide_hash_armv6.o slide_hash_armv6.lo \
+	insert_string_acle.o insert_string_acle.lo
+
+adler32_neon.o:
+	$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+adler32_neon.lo:
+	$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+arm_features.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
+
+arm_features.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
+
+chunkset_neon.o:
+	$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+chunkset_neon.lo:
+	$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+
+compare256_neon.o:
+	$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+compare256_neon.lo:
+	$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+crc32_acle.o:
+	$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
+
+crc32_acle.lo:
+	$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
+
+slide_hash_neon.o:
+	$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+
+slide_hash_neon.lo:
+	$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+
+slide_hash_armv6.o:
+	$(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
+slide_hash_armv6.lo:
+	$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
+insert_string_acle.o:
+	$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
+
+insert_string_acle.lo:
+	$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
diff --git a/3rdparty/zlib-ng/arch/arm/acle_intrins.h b/3rdparty/zlib-ng/arch/arm/acle_intrins.h
new file mode 100644
index 000000000000..a67cf3aabdf0
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/arm/acle_intrins.h
@@ -0,0 +1,35 @@
+#ifndef ARM_ACLE_INTRINS_H
+#define ARM_ACLE_INTRINS_H
+
+#include <stdint.h>
+#ifdef _MSC_VER
+#  include <intrin.h>
+#elif defined(HAVE_ARM_ACLE_H)
+#  include <arm_acle.h>
+#endif
+
+#ifdef ARM_ACLE
+#if defined(__aarch64__)
+#  define Z_TARGET_CRC Z_TARGET("+crc")
+#else
+#  define Z_TARGET_CRC
+#endif
+#endif
+
+#ifdef ARM_SIMD
+#ifdef _MSC_VER
+typedef uint32_t uint16x2_t;
+
+#define __uqsub16 _arm_uqsub16
+#elif !defined(ARM_SIMD_INTRIN)
+typedef uint32_t uint16x2_t;
+
+static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) {
+    uint16x2_t __c;
+    __asm__ __volatile__("uqsub16 %0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+    return __c;
+}
+#endif
+#endif
+
+#endif // include guard ARM_ACLE_INTRINS_H
diff --git a/3rdparty/zlib-ng/arch/arm/adler32_neon.c b/3rdparty/zlib-ng/arch/arm/adler32_neon.c
new file mode 100644
index 000000000000..f1c43ff04749
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/arm/adler32_neon.c
@@ -0,0 +1,215 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Authors:
+ *   Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifdef ARM_NEON
+#include "neon_intrins.h"
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+
+static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
+    static const uint16_t ALIGNED_(16) taps[64] = {
+        64, 63, 62, 61, 60, 59, 58, 57,
+        56, 55, 54, 53, 52, 51, 50, 49,
+        48, 47, 46, 45, 44, 43, 42, 41,
+        40, 39, 38, 37, 36, 35, 34, 33,
+        32, 31, 30, 29, 28, 27, 26, 25,
+        24, 23, 22, 21, 20, 19, 18, 17,
+        16, 15, 14, 13, 12, 11, 10, 9,
+        8, 7, 6, 5, 4, 3, 2, 1 };
+
+    uint32x4_t adacc = vdupq_n_u32(0);
+    uint32x4_t s2acc = vdupq_n_u32(0);
+    uint32x4_t s2acc_0 = vdupq_n_u32(0);
+    uint32x4_t s2acc_1 = vdupq_n_u32(0);
+    uint32x4_t s2acc_2 = vdupq_n_u32(0);
+
+    adacc = vsetq_lane_u32(s[0], adacc, 0);
+    s2acc = vsetq_lane_u32(s[1], s2acc, 0);
+
+    uint32x4_t s3acc = vdupq_n_u32(0);
+    uint32x4_t adacc_prev = adacc;
+
+    uint16x8_t s2_0, s2_1, s2_2, s2_3;
+    s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+    uint16x8_t s2_4, s2_5, s2_6, s2_7;
+    s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+    size_t num_iter = len >> 2;
+    int rem = len & 3;
+
+    for (size_t i = 0; i < num_iter; ++i) {
+        uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
+
+        /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+         * bit instruction, we'll have to make due summing to 16 bits first */
+        uint16x8x2_t hsum, hsum_fold;
+        hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
+        hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
+
+        hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
+        hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
+
+        adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+        s3acc = vaddq_u32(s3acc, adacc_prev);
+        adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+        /* If we do straight widening additions to the 16 bit values, we don't incur
+         * the usual penalties of a pairwise add. We can defer the multiplications
+         * until the very end. These will not overflow because we are incurring at
+         * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+         * summed into once. This means for the maximum input size, the largest value
+         * we will see is 255 * 102 = 26010, safely under uint16 max */
+        s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
+        s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
+        s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
+        s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
+        s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
+        s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
+        s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
+        s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
+
+        adacc_prev = adacc;
+        buf += 64;
+    }
+
+    s3acc = vshlq_n_u32(s3acc, 6);
+
+    if (rem) {
+        uint32x4_t s3acc_0 = vdupq_n_u32(0);
+        while (rem--) {
+            uint8x16_t d0 = vld1q_u8(buf);
+            uint16x8_t adler;
+            adler = vpaddlq_u8(d0);
+            s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+            s2_7 = vaddw_high_u8(s2_7, d0);
+            adacc = vpadalq_u16(adacc, adler);
+            s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+            adacc_prev = adacc;
+            buf += 16;
+        }
+
+        s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+        s3acc = vaddq_u32(s3acc_0, s3acc);
+    }
+
+    uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
+    uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+    s2acc = vaddq_u32(s2acc_0, s2acc);
+    s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+    s2acc = vaddq_u32(s2acc, s2acc_2);
+
+    uint32x2_t adacc2, s2acc2, as;
+    s2acc = vaddq_u32(s2acc, s3acc);
+    adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
+    s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
+    as = vpadd_u32(adacc2, s2acc2);
+    s[0] = vget_lane_u32(as, 0);
+    s[1] = vget_lane_u32(as, 1);
+}
+
+static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
+    unsigned int i;
+    for (i = 0; i < len; ++i) {
+        pair[0] += buf[i];
+        pair[1] += pair[0];
+    }
+}
+
+Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) {
+    /* split Adler-32 into component sums */
+    uint32_t sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (len == 1)
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (buf == NULL)
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (len < 16)
+        return adler32_len_16(adler, buf, len, sum2);
+
+    uint32_t pair[2];
+    int n = NMAX;
+    unsigned int done = 0;
+
+    /* Split Adler-32 into component sums, it can be supplied by
+     * the caller sites (e.g. in a PNG file).
+     */
+    pair[0] = adler;
+    pair[1] = sum2;
+
+    /* If memory is not SIMD aligned, do scalar sums to an aligned
+     * offset, provided that doing so doesn't completely eliminate
+     * SIMD operation. Aligned loads are still faster on ARM, even
+     * though there's no explicit aligned load instruction */
+    unsigned int align_offset = ((uintptr_t)buf & 15);
+    unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;
+
+    if (align_offset && len >= (16 + align_adj)) {
+        NEON_handle_tail(pair, buf, align_adj);
+        n -= align_adj;
+        done += align_adj;
+
+    } else {
+        /* If here, we failed the len criteria test, it wouldn't be
+         * worthwhile to do scalar aligning sums */
+        align_adj = 0;
+    }
+
+    while (done < len) {
+        int remaining = (int)(len - done);
+        n = MIN(remaining, (done == align_adj) ? n : NMAX);
+
+        if (n < 16)
+            break;
+
+        NEON_accum32(pair, buf + done, n >> 4);
+        pair[0] %= BASE;
+        pair[1] %= BASE;
+
+        int actual_nsums = (n >> 4) << 4;
+        done += actual_nsums;
+    }
+
+    /* Handle the tail elements. */
+    if (done < len) {
+        NEON_handle_tail(pair, (buf + done), len - done);
+        pair[0] %= BASE;
+        pair[1] %= BASE;
+    }
+
+    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
+    return (pair[1] << 16) | pair[0];
+}
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/arm/arm_features.c b/3rdparty/zlib-ng/arch/arm/arm_features.c
new file mode 100644
index 000000000000..a0e070ba9561
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/arm/arm_features.c
@@ -0,0 +1,100 @@
+#include "../../zbuild.h"
+#include "arm_features.h"
+
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+#  include <sys/auxv.h>
+#  ifdef ARM_ASM_HWCAP
+#    include <asm/hwcap.h>
+#  endif
+#elif defined(__FreeBSD__) && defined(__aarch64__)
+#  include <machine/armreg.h>
+#  ifndef ID_AA64ISAR0_CRC32_VAL
+#    define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
+#  endif
+#elif defined(__APPLE__)
+#  if !defined(_DARWIN_C_SOURCE)
+#    define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
+#  endif
+#  include <sys/sysctl.h>
+#elif defined(_WIN32)
+#  include <windows.h>
+#endif
+
+static int arm_has_crc32() {
+#if defined(__linux__) && defined(ARM_AUXV_HAS_CRC32)
+#  ifdef HWCAP_CRC32
+    return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0 ? 1 : 0;
+#  else
+    return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0 ? 1 : 0;
+#  endif
+#elif defined(__FreeBSD__) && defined(__aarch64__)
+    return getenv("QEMU_EMULATING") == NULL
+      && ID_AA64ISAR0_CRC32_VAL(READ_SPECIALREG(id_aa64isar0_el1)) >= ID_AA64ISAR0_CRC32_BASE;
+#elif defined(__APPLE__)
+    int hascrc32;
+    size_t size = sizeof(hascrc32);
+    return sysctlbyname("hw.optional.armv8_crc32", &hascrc32, &size, NULL, 0) == 0
+      && hascrc32 == 1;
+#elif defined(_WIN32)
+    return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
+#elif defined(ARM_NOCHECK_ACLE)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+/* AArch64 has neon. */
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+static inline int arm_has_neon() {
+#if defined(__linux__) && defined(ARM_AUXV_HAS_NEON)
+#  ifdef HWCAP_ARM_NEON
+    return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0 ? 1 : 0;
+#  else
+    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0 ? 1 : 0;
+#  endif
+#elif defined(__APPLE__)
+    int hasneon;
+    size_t size = sizeof(hasneon);
+    return sysctlbyname("hw.optional.neon", &hasneon, &size, NULL, 0) == 0
+      && hasneon == 1;
+#elif defined(_M_ARM) && defined(WINAPI_FAMILY_PARTITION)
+#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+    return 1; /* Always supported */
+#  endif
+#endif
+
+#if defined(ARM_NOCHECK_NEON)
+    return 1;
+#else
+    return 0;
+#endif
+}
+#endif
+
+/* AArch64 does not have ARMv6 SIMD. */
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+static inline int arm_has_simd() {
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+    const char *platform = (const char *)getauxval(AT_PLATFORM);
+    return strncmp(platform, "v6l", 3) == 0
+        || strncmp(platform, "v7l", 3) == 0
+        || strncmp(platform, "v8l", 3) == 0;
+#elif defined(ARM_NOCHECK_SIMD)
+    return 1;
+#else
+    return 0;
+#endif
+}
+#endif
+
+void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+    features->has_simd = 0; /* never available */
+    features->has_neon = 1; /* always available */
+#else
+    features->has_simd = arm_has_simd();
+    features->has_neon = arm_has_neon();
+#endif
+    features->has_crc32 = arm_has_crc32();
+}
diff --git a/3rdparty/zlib-ng/arch/arm/arm_features.h b/3rdparty/zlib-ng/arch/arm/arm_features.h
new file mode 100644
index 000000000000..eca078e310ea
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/arm/arm_features.h
@@ -0,0 +1,16 @@
+/* arm_features.h -- check for ARM features.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ARM_H_
+#define ARM_H_
+
+struct arm_cpu_features {
+    int has_simd;
+    int has_neon;
+    int has_crc32;
+};
+
+void Z_INTERNAL arm_check_features(struct arm_cpu_features *features);
+
+#endif /* ARM_H_ */
diff --git a/3rdparty/zlib-ng/arch/arm/chunkset_neon.c b/3rdparty/zlib-ng/arch/arm/chunkset_neon.c
new file mode 100644
index 000000000000..f9a444b0681f
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/arm/chunkset_neon.c
@@ -0,0 +1,99 @@
+/* chunkset_neon.c -- NEON inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+#include "neon_intrins.h"
+#include "../../zbuild.h"
+#include "../generic/chunk_permute_table.h"
+
+typedef uint8x16_t chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+static const lut_rem_pair perm_idx_lut[13] = {
+    {0, 1},      /* 3 */
+    {0, 0},      /* don't care */
+    {1 * 32, 1}, /* 5 */
+    {2 * 32, 4}, /* 6 */
+    {3 * 32, 2}, /* 7 */
+    {0 * 32, 0}, /* don't care */
+    {4 * 32, 7}, /* 9 */
+    {5 * 32, 6}, /* 10 */
+    {6 * 32, 5}, /* 11 */
+    {7 * 32, 4}, /* 12 */
+    {8 * 32, 3}, /* 13 */
+    {9 * 32, 2}, /* 14 */
+    {10 * 32, 1},/* 15 */
+};
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    uint16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    uint32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    uint64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
+}
+
+#define CHUNKSIZE        chunksize_neon
+#define CHUNKCOPY        chunkcopy_neon
+#define CHUNKUNROLL      chunkunroll_neon
+#define CHUNKMEMSET      chunkmemset_neon
+#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = vld1q_u8(s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    vst1q_u8(out, *chunk);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    *chunk_rem = lut_rem.remval;
+
+    /* See note in chunkset_ssse3.c for why this is ok */
+    __msan_unpoison(buf + dist, 16 - dist);
+
+    /* This version of table is only available on aarch64 */
+#if defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
+    uint8x16_t ret_vec = vld1q_u8(buf);
+
+    uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
+    return vqtbl1q_u8(ret_vec, perm_vec);
+#else
+    uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
+    perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
+    perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
+    a = vld1_u8(buf);
+    b = vld1_u8(buf + 8);
+    ret0 = vtbl1_u8(a, perm_vec0);
+    uint8x8x2_t ab = {{a, b}};
+    ret1 = vtbl2_u8(ab, perm_vec1);
+    return vcombine_u8(ret0, ret1);
+#endif
+}
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_neon
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/arm/compare256_neon.c b/3rdparty/zlib-ng/arch/arm/compare256_neon.c
new file mode 100644
index 000000000000..7daeba411ece
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/arm/compare256_neon.c
@@ -0,0 +1,59 @@
+/* compare256_neon.c - NEON version of compare256
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+
+#include "fallback_builtins.h"
+
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+#include "neon_intrins.h"
+
+static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        uint8x16_t a, b, cmp;
+        uint64_t lane;
+
+        a = vld1q_u8(src0);
+        b = vld1q_u8(src1);
+
+        cmp = veorq_u8(a, b);
+
+        lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
+        if (lane) {
+            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
+            return len + match_byte;
+        }
+        len += 8;
+        lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
+        if (lane) {
+            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
+            return len + match_byte;
+        }
+        len += 8;
+
+        src0 += 16, src1 += 16;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_neon_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_neon
+#define COMPARE256          compare256_neon_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_neon
+#define COMPARE256          compare256_neon_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/arm/crc32_acle.c b/3rdparty/zlib-ng/arch/arm/crc32_acle.c
new file mode 100644
index 000000000000..ac7d6ff66b3e
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/arm/crc32_acle.c
@@ -0,0 +1,78 @@
+/* crc32_acle.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
+ * Copyright (C) 2016 Yang Zhang
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+*/
+
+#ifdef ARM_ACLE
+#include "acle_intrins.h"
+#include "../../zbuild.h"
+
+Z_INTERNAL Z_TARGET_CRC uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
+    Z_REGISTER uint32_t c;
+    Z_REGISTER const uint16_t *buf2;
+    Z_REGISTER const uint32_t *buf4;
+    Z_REGISTER const uint64_t *buf8;
+
+    c = ~crc;
+
+    if (UNLIKELY(len == 1)) {
+        c = __crc32b(c, *buf);
+        c = ~c;
+        return c;
+    }
+
+    if ((ptrdiff_t)buf & (sizeof(uint64_t) - 1)) {
+        if (len && ((ptrdiff_t)buf & 1)) {
+            c = __crc32b(c, *buf++);
+            len--;
+        }
+
+        if ((len >= sizeof(uint16_t)) && ((ptrdiff_t)buf & sizeof(uint16_t))) {
+            buf2 = (const uint16_t *) buf;
+            c = __crc32h(c, *buf2++);
+            len -= sizeof(uint16_t);
+            buf4 = (const uint32_t *) buf2;
+        } else {
+            buf4 = (const uint32_t *) buf;
+        }
+
+        if ((len >= sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
+            c = __crc32w(c, *buf4++);
+            len -= sizeof(uint32_t);
+        }
+
+        buf8 = (const uint64_t *) buf4;
+    } else {
+        buf8 = (const uint64_t *) buf;
+    }
+
+    while (len >= sizeof(uint64_t)) {
+        c = __crc32d(c, *buf8++);
+        len -= sizeof(uint64_t);
+    }
+
+    if (len >= sizeof(uint32_t)) {
+        buf4 = (const uint32_t *) buf8;
+        c = __crc32w(c, *buf4++);
+        len -= sizeof(uint32_t);
+        buf2 = (const uint16_t *) buf4;
+    } else {
+        buf2 = (const uint16_t *) buf8;
+    }
+
+    if (len >= sizeof(uint16_t)) {
+        c = __crc32h(c, *buf2++);
+        len -= sizeof(uint16_t);
+    }
+
+    buf = (const unsigned char *) buf2;
+    if (len) {
+        c = __crc32b(c, *buf);
+    }
+
+    c = ~c;
+    return c;
+}
+#endif
diff --git a/3rdparty/zlib-ng/arch/arm/insert_string_acle.c b/3rdparty/zlib-ng/arch/arm/insert_string_acle.c
new file mode 100644
index 000000000000..aa8385c712e6
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/arm/insert_string_acle.c
@@ -0,0 +1,24 @@
+/* insert_string_acle.c -- insert_string integer hash variant using ACLE's CRC instructions
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ */
+
+#ifdef ARM_ACLE
+#include "acle_intrins.h"
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#define HASH_CALC(s, h, val) \
+    h = __crc32w(0, val)
+
+#define HASH_CALC_VAR       h
+#define HASH_CALC_VAR_INIT  uint32_t h = 0
+
+#define UPDATE_HASH         Z_TARGET_CRC update_hash_acle
+#define INSERT_STRING       Z_TARGET_CRC insert_string_acle
+#define QUICK_INSERT_STRING Z_TARGET_CRC quick_insert_string_acle
+
+#include "../../insert_string_tpl.h"
+#endif
diff --git a/3rdparty/zlib-ng/arch/arm/neon_intrins.h b/3rdparty/zlib-ng/arch/arm/neon_intrins.h
new file mode 100644
index 000000000000..51df77dbe685
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/arm/neon_intrins.h
@@ -0,0 +1,58 @@
+#ifndef ARM_NEON_INTRINS_H
+#define ARM_NEON_INTRINS_H
+
+#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
+/* arm64_neon.h is MSVC specific */
+#  include <arm64_neon.h>
+#else
+#  include <arm_neon.h>
+#endif
+
+#if defined(ARM_NEON) && !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+/* Compatibility shim for the _high family of functions */
+#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
+#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
+#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
+#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
+#endif
+
+#ifdef ARM_NEON
+
+#define vqsubq_u16_x4_x1(out, a, b) do { \
+    out.val[0] = vqsubq_u16(a.val[0], b); \
+    out.val[1] = vqsubq_u16(a.val[1], b); \
+    out.val[2] = vqsubq_u16(a.val[2], b); \
+    out.val[3] = vqsubq_u16(a.val[3], b); \
+} while (0)
+
+
+#  ifndef ARM_NEON_HASLD4
+
+static inline uint16x8x4_t vld1q_u16_x4(uint16_t const *a) {
+    uint16x8x4_t ret = (uint16x8x4_t) {{
+                          vld1q_u16(a),
+                          vld1q_u16(a+8),
+                          vld1q_u16(a+16),
+                          vld1q_u16(a+24)}};
+    return ret;
+}
+
+static inline uint8x16x4_t vld1q_u8_x4(uint8_t const *a) {
+    uint8x16x4_t ret = (uint8x16x4_t) {{
+                          vld1q_u8(a),
+                          vld1q_u8(a+16),
+                          vld1q_u8(a+32),
+                          vld1q_u8(a+48)}};
+    return ret;
+}
+
+static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
+    vst1q_u16(p, a.val[0]);
+    vst1q_u16(p + 8, a.val[1]);
+    vst1q_u16(p + 16, a.val[2]);
+    vst1q_u16(p + 24, a.val[3]);
+}
+#  endif // HASLD4 check
+#endif
+
+#endif // include guard ARM_NEON_INTRINS_H
diff --git a/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c b/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c
new file mode 100644
index 000000000000..0a2eeccf9269
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/arm/slide_hash_armv6.c
@@ -0,0 +1,47 @@
+/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions
+ * Copyright (C) 2023 Cameron Cawley
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(ARM_SIMD)
+#include "acle_intrins.h"
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+/* SIMD version of hash_chain rebase */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    Z_REGISTER uint16x2_t v;
+    uint16x2_t p0, p1, p2, p3;
+    Z_REGISTER size_t n;
+
+    size_t size = entries*sizeof(table[0]);
+    Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err");
+
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
+    v = wsize | (wsize << 16);
+
+    n = size / (sizeof(uint16x2_t) * 4);
+    do {
+        p0 = *((const uint16x2_t *)(table));
+        p1 = *((const uint16x2_t *)(table+2));
+        p2 = *((const uint16x2_t *)(table+4));
+        p3 = *((const uint16x2_t *)(table+6));
+        p0 = __uqsub16(p0, v);
+        p1 = __uqsub16(p1, v);
+        p2 = __uqsub16(p2, v);
+        p3 = __uqsub16(p3, v);
+        *((uint16x2_t *)(table)) = p0;
+        *((uint16x2_t *)(table+2)) = p1;
+        *((uint16x2_t *)(table+4)) = p2;
+        *((uint16x2_t *)(table+6)) = p3;
+        table += 8;
+    } while (--n);
+}
+
+Z_INTERNAL void slide_hash_armv6(deflate_state *s) {
+    unsigned int wsize = s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
diff --git a/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c b/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c
new file mode 100644
index 000000000000..a96ca11799b5
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/arm/slide_hash_neon.c
@@ -0,0 +1,46 @@
+/* slide_hash_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
+ * Copyright (C) 2017-2020 Mika T. Lindqvist
+ *
+ * Authors:
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ * Jun He <jun.he@arm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef ARM_NEON
+#include "neon_intrins.h"
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+/* SIMD version of hash_chain rebase */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    Z_REGISTER uint16x8_t v;
+    uint16x8x4_t p0, p1;
+    Z_REGISTER size_t n;
+
+    size_t size = entries*sizeof(table[0]);
+    Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
+
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
+    v = vdupq_n_u16(wsize);
+
+    n = size / (sizeof(uint16x8_t) * 8);
+    do {
+        p0 = vld1q_u16_x4(table);
+        p1 = vld1q_u16_x4(table+32);
+        vqsubq_u16_x4_x1(p0, p0, v);
+        vqsubq_u16_x4_x1(p1, p1, v);
+        vst1q_u16_x4(table, p0);
+        vst1q_u16_x4(table+32, p1);
+        table += 64;
+    } while (--n);
+}
+
+Z_INTERNAL void slide_hash_neon(deflate_state *s) {
+    unsigned int wsize = s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
diff --git a/3rdparty/zlib-ng/arch/generic/Makefile.in b/3rdparty/zlib-ng/arch/generic/Makefile.in
new file mode 100644
index 000000000000..c717026f86e4
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/generic/Makefile.in
@@ -0,0 +1,24 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all:
+
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~ \
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
diff --git a/3rdparty/zlib-ng/arch/generic/chunk_permute_table.h b/3rdparty/zlib-ng/arch/generic/chunk_permute_table.h
new file mode 100644
index 000000000000..bad66ccc774b
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/generic/chunk_permute_table.h
@@ -0,0 +1,53 @@
+/* chunk_permute_table.h - shared AVX/SSSE3 permutation table for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNK_PERMUTE_TABLE_H_
+#define CHUNK_PERMUTE_TABLE_H_
+
+#include "zbuild.h"
+
+/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */
+static const ALIGNED_(32) uint8_t permute_table[26*32] = {
+    0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
+    0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
+    0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
+    0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */
+
+    /* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute
+     * beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual
+     * blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity,
+     * we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but,
+     * this is what we're dealt.
+     */
+
+    16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
+    16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
+    16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
+    16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
+    16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
+    16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
+    16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
+    16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
+};
+
+typedef struct lut_rem_pair_s {
+    uint16_t idx;
+    uint16_t remval;
+} lut_rem_pair;
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/power/Makefile.in b/3rdparty/zlib-ng/arch/power/Makefile.in
new file mode 100644
index 000000000000..e2bec5e510e7
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/Makefile.in
@@ -0,0 +1,93 @@
+# Makefile for POWER-specific files
+# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+P8FLAGS=-mcpu=power8
+P9FLAGS=-mcpu=power9
+PPCFLAGS=-maltivec
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: power_features.o \
+     power_features.lo \
+     adler32_power8.o \
+     adler32_power8.lo \
+     adler32_vmx.o \
+     adler32_vmx.lo \
+     chunkset_power8.o \
+     chunkset_power8.lo \
+     compare256_power9.o \
+     compare256_power9.lo \
+     crc32_power8.o \
+     crc32_power8.lo \
+     slide_hash_power8.o \
+     slide_hash_power8.lo \
+     slide_hash_vmx.o \
+     slide_hash_vmx.lo
+
+power_features.o:
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
+
+power_features.lo:
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power_features.c
+
+adler32_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+adler32_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+
+adler32_vmx.o:
+	$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
+adler32_vmx.lo:
+	$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
+chunkset_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+chunkset_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
+
+compare256_power9.o:
+	$(CC) $(CFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
+compare256_power9.lo:
+	$(CC) $(SFLAGS) $(P9FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_power9.c
+
+crc32_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
+
+crc32_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
+
+slide_hash_power8.o:
+	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_power8.lo:
+	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+
+slide_hash_vmx.o:
+	$(CC) $(CFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
+slide_hash_vmx.lo:
+	$(CC) $(SFLAGS) ${PPCFLAGS} $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
diff --git a/3rdparty/zlib-ng/arch/power/adler32_power8.c b/3rdparty/zlib-ng/arch/power/adler32_power8.c
new file mode 100644
index 000000000000..4aaea9f50b3a
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/adler32_power8.c
@@ -0,0 +1,153 @@
+/* Adler32 for POWER8 using VSX instructions.
+ * Copyright (C) 2020 IBM Corporation
+ * Author: Rogerio Alves <rcardoso@linux.ibm.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
+ * instructions.
+ *
+ * If adler32 do 1 byte at time on the first iteration s1 is s1_0 (_n means
+ * iteration n) is the initial value of adler - at start  _0 is 1 unless
+ * adler initial value is different than 1. So s1_1 = s1_0 + c[0] after
+ * the first calculation. For the iteration s1_2 = s1_1 + c[1] and so on.
+ * Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 on
+ * after iteration N.
+ *
+ * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_N + N*c[0] +
+ * N-1*c[1] + ... + c[N]
+ *
+ * In a more general way:
+ *
+ * s1_N = s1_0 + sum(i=1 to N)c[i]
+ * s2_N = s2_0 + N*s1 + sum (i=1 to N)(N-i+1)*c[i]
+ *
+ * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
+ * can process N-bit at time we can do this at once.
+ *
+ * Since VSX can support 16-bit vector instructions, we can process
+ * 16-bit at time using N = 16 we have:
+ *
+ * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
+ * s2 = s2_16 = s2_0 + 16*s1 + sum(i=1 to 16)(16-i+1)*c[i]
+ *
+ * After the first iteration we calculate the adler32 checksum for 16 bytes.
+ *
+ * For more background about adler32 please check the RFC:
+ * https://www.ietf.org/rfc/rfc1950.txt
+ */
+
+#ifdef POWER8_VSX
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "adler32_p.h"
+
+/* Vector across sum unsigned int (saturate).  */
+static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
+    __b = vec_sld(__a, __a, 8);
+    __b = vec_add(__b, __a);
+    __a = vec_sld(__b, __b, 4);
+    __a = vec_add(__a, __b);
+
+    return __a;
+}
+
+Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t s1 = adler & 0xffff;
+    uint32_t s2 = (adler >> 16) & 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(s1, buf, s2);
+
+    /* If buffer is empty or len=0 we need to return adler initial value.  */
+    if (UNLIKELY(buf == NULL))
+        return 1;
+
+    /* This is faster than VSX code for len < 64.  */
+    if (len < 64)
+        return adler32_len_64(s1, buf, len, s2);
+
+    /* Use POWER VSX instructions for len >= 64. */
+    const vector unsigned int v_zeros = { 0 };
+    const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
+         6, 5, 4, 3, 2, 1};
+    const vector unsigned char vsh = vec_splat_u8(4);
+    const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
+    vector unsigned int vs1 = { 0 };
+    vector unsigned int vs2 = { 0 };
+    vector unsigned int vs1_save = { 0 };
+    vector unsigned int vsum1, vsum2;
+    vector unsigned char vbuf;
+    int n;
+
+    vs1[0] = s1;
+    vs2[0] = s2;
+
+    /* Do length bigger than NMAX in blocks of NMAX size.  */
+    while (len >= NMAX) {
+        len -= NMAX;
+        n = NMAX / 16;
+        do {
+            vbuf = vec_xl(0, (unsigned char *) buf);
+            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i].  */
+            /* sum(i=1 to 16) buf[i]*(16-i+1).  */
+            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+            /* Save vs1.  */
+            vs1_save = vec_add(vs1_save, vs1);
+            /* Accumulate the sums.  */
+            vs1 = vec_add(vsum1, vs1);
+            vs2 = vec_add(vsum2, vs2);
+
+            buf += 16;
+        } while (--n);
+        /* Once each block of NMAX size.  */
+        vs1 = vec_sumsu(vs1, vsum1);
+        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save.  */
+        vs2 = vec_add(vs1_save, vs2);
+        vs2 = vec_sumsu(vs2, vsum2);
+
+        /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521.  */
+        vs1[0] = vs1[0] % BASE;
+        /* vs2[0] = s2_i + 16*s1_save +
+           sum(i=1 to 16)(16-i+1)*buf[i] mod 65521.  */
+        vs2[0] = vs2[0] % BASE;
+
+        vs1 = vec_and(vs1, vmask);
+        vs2 = vec_and(vs2, vmask);
+        vs1_save = v_zeros;
+    }
+
+    /* len is less than NMAX one modulo is needed.  */
+    if (len >= 16) {
+        while (len >= 16) {
+            len -= 16;
+
+            vbuf = vec_xl(0, (unsigned char *) buf);
+
+            vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i].  */
+            /* sum(i=1 to 16) buf[i]*(16-i+1).  */
+            vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+            /* Save vs1.  */
+            vs1_save = vec_add(vs1_save, vs1);
+            /* Accumulate the sums.  */
+            vs1 = vec_add(vsum1, vs1);
+            vs2 = vec_add(vsum2, vs2);
+
+            buf += 16;
+        }
+        /* Since the size will be always less than NMAX we do this once.  */
+        vs1 = vec_sumsu(vs1, vsum1);
+        vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save.  */
+        vs2 = vec_add(vs1_save, vs2);
+        vs2 = vec_sumsu(vs2, vsum2);
+    }
+    /* Copy result back to s1, s2 (mod 65521).  */
+    s1 = vs1[0] % BASE;
+    s2 = vs2[0] % BASE;
+
+    /* Process tail (len < 16).  */
+    return adler32_len_16(s1, buf, len, s2);
+}
+
+#endif /* POWER8_VSX */
diff --git a/3rdparty/zlib-ng/arch/power/adler32_vmx.c b/3rdparty/zlib-ng/arch/power/adler32_vmx.c
new file mode 100644
index 000000000000..3470c28a12f7
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/adler32_vmx.c
@@ -0,0 +1,186 @@
+/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2017-2023 Mika T. Lindqvist <postmaster@raasu.org>
+ * Copyright (C) 2021 Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef PPC_VMX
+#include <altivec.h>
+#include "zbuild.h"
+#include "zendian.h"
+#include "adler32_p.h"
+
+#define vmx_zero()  (vec_splat_u32(0))
+
+static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
+    unsigned int i;
+    for (i = 0; i < len; ++i) {
+        pair[0] += buf[i];
+        pair[1] += pair[0];
+    }
+}
+
+static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
+    /* Different taps for the separable components of sums */
+    const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
+    const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
+    const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
+    const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+    /* As silly and inefficient as it seems, creating 1 permutation vector to permute
+     * a 2 element vector from a single load + a subsequent shift is just barely faster
+     * than doing 2 indexed insertions into zero initialized vectors from unaligned memory. */
+    const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+    const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
+    vector unsigned int  adacc, s2acc;
+    vector unsigned int pair_vec = vec_ld(0, s);
+    adacc = vec_perm(pair_vec, pair_vec, s0_perm);
+#if BYTE_ORDER == LITTLE_ENDIAN
+    s2acc = vec_sro(pair_vec, shift_vec);
+#else
+    s2acc = vec_slo(pair_vec, shift_vec);
+#endif
+
+    vector unsigned int zero = vmx_zero();
+    vector unsigned int s3acc = zero;
+    vector unsigned int s3acc_0 = zero;
+    vector unsigned int adacc_prev = adacc;
+    vector unsigned int adacc_prev_0 = zero;
+
+    vector unsigned int s2acc_0 = zero;
+    vector unsigned int s2acc_1 = zero;
+    vector unsigned int s2acc_2 = zero;
+
+    /* Maintain a running sum of a second half, this might help use break yet another
+     * data dependency bubble in the sum */
+    vector unsigned int adacc_0 = zero;
+
+    int num_iter = len / 4;
+    int rem = len & 3;
+
+    for (int i = 0; i < num_iter; ++i) {
+        vector unsigned char d0 = vec_ld(0, buf);
+        vector unsigned char d1 = vec_ld(16, buf);
+        vector unsigned char d2 = vec_ld(32, buf);
+        vector unsigned char d3 = vec_ld(48, buf);
+
+        /* The core operation of the loop, basically
+         * what is being unrolled below */
+        adacc = vec_sum4s(d0, adacc);
+        s3acc = vec_add(s3acc, adacc_prev);
+        s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
+        s2acc = vec_msum(t0, d0, s2acc);
+
+        /* interleave dependent sums in here */
+        adacc_0 = vec_sum4s(d1, adacc_0);
+        s2acc_0 = vec_msum(t1, d1, s2acc_0);
+        adacc = vec_sum4s(d2, adacc);
+        s2acc_1 = vec_msum(t2, d2, s2acc_1);
+        s2acc_2 = vec_msum(t3, d3, s2acc_2);
+        adacc_0 = vec_sum4s(d3, adacc_0);
+
+        adacc_prev = adacc;
+        adacc_prev_0 = adacc_0;
+        buf += 64;
+    }
+
+    adacc = vec_add(adacc, adacc_0);
+    s3acc = vec_add(s3acc, s3acc_0);
+    s3acc = vec_sl(s3acc, vec_splat_u32(6));
+
+    if (rem) {
+        adacc_prev = vec_add(adacc_prev_0, adacc_prev);
+        adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
+        while (rem--) {
+            vector unsigned char d0 = vec_ld(0, buf);
+            adacc = vec_sum4s(d0, adacc);
+            s3acc = vec_add(s3acc, adacc_prev);
+            s2acc = vec_msum(t3, d0, s2acc);
+            adacc_prev = vec_sl(adacc, vec_splat_u32(4));
+            buf += 16;
+        }
+    }
+
+
+    /* Sum up independent second sums */
+    s2acc = vec_add(s2acc, s2acc_0);
+    s2acc_2 = vec_add(s2acc_1, s2acc_2);
+    s2acc = vec_add(s2acc, s2acc_2);
+
+    s2acc = vec_add(s2acc, s3acc);
+
+    adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
+    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
+    adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
+    s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
+
+    vec_ste(adacc, 0, s);
+    vec_ste(s2acc, 0, s+1);
+}
+
+Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t sum2;
+    uint32_t pair[16] ALIGNED_(16);
+    memset(&pair[2], 0, 14);
+    int n = NMAX;
+    unsigned int done = 0, i;
+
+    /* Split Adler-32 into component sums, it can be supplied by
+     * the caller sites (e.g. in a PNG file).
+     */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+    pair[0] = adler;
+    pair[1] = sum2;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(buf == NULL))
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_len_16(adler, buf, len, sum2);
+
+    // Align buffer
+    unsigned int al = 0;
+    if ((uintptr_t)buf & 0xf) {
+        al = 16-((uintptr_t)buf & 0xf);
+        if (al > len) {
+            al=len;
+        }
+        vmx_handle_head_or_tail(pair, buf, al);
+
+        done += al;
+        /* Rather than rebasing, we can reduce the max sums for the
+         * first round only */
+        n -= al;
+    }
+    for (i = al; i < len; i += n) {
+        int remaining = (int)(len-i);
+        n = MIN(remaining, (i == al) ? n : NMAX);
+
+        if (n < 16)
+            break;
+
+        vmx_accum32(pair, buf + i, n / 16);
+        pair[0] %= BASE;
+        pair[1] %= BASE;
+
+        done += (n / 16) * 16;
+    }
+
+    /* Handle the tail elements. */
+    if (done < len) {
+        vmx_handle_head_or_tail(pair, (buf + done), len - done);
+        pair[0] %= BASE;
+        pair[1] %= BASE;
+    }
+
+    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
+    return (pair[1] << 16) | pair[0];
+}
+#endif
diff --git a/3rdparty/zlib-ng/arch/power/chunkset_power8.c b/3rdparty/zlib-ng/arch/power/chunkset_power8.c
new file mode 100644
index 000000000000..7cbb8029b3b1
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/chunkset_power8.c
@@ -0,0 +1,55 @@
+/* chunkset_power8.c -- VSX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX
+#include <altivec.h>
+#include "../../zbuild.h"
+
+typedef vector unsigned char chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    uint16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    uint32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = (vector unsigned char)vec_splats(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    uint64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = (vector unsigned char)vec_splats((unsigned long long)tmp);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = vec_xl(0, s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    vec_xst(*chunk, 0, out);
+}
+
+#define CHUNKSIZE        chunksize_power8
+#define CHUNKCOPY        chunkcopy_power8
+#define CHUNKUNROLL      chunkunroll_power8
+#define CHUNKMEMSET      chunkmemset_power8
+#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_power8
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/power/compare256_power9.c b/3rdparty/zlib-ng/arch/power/compare256_power9.c
new file mode 100644
index 000000000000..9b0ddaf80045
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/compare256_power9.c
@@ -0,0 +1,64 @@
+/* compare256_power9.c - Power9 version of compare256
+ * Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER9
+#include <altivec.h>
+#include "../../zbuild.h"
+#include "../../zendian.h"
+
+/* Older versions of GCC misimplemented semantics for these bit counting builtins.
+ * https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3f30f2d1dbb3228b8468b26239fe60c2974ce2ac */
+#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 12)
+#if BYTE_ORDER == LITTLE_ENDIAN
+#  define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vctzlsbb(vc)
+#else
+#  define zng_vec_vctzlsbb(vc, len) len = __builtin_vec_vclzlsbb(vc)
+#endif
+#else
+#  define zng_vec_vctzlsbb(vc, len) len = vec_cntlz_lsbb(vc)
+#endif
+
+static inline uint32_t compare256_power9_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0, cmplen;
+
+    do {
+        vector unsigned char vsrc0, vsrc1, vc;
+
+        vsrc0 = *((vector unsigned char *)src0);
+        vsrc1 = *((vector unsigned char *)src1);
+
+        /* Compare 16 bytes at a time. Each byte of vc will be either
+         * all ones or all zeroes, depending on the result of the comparison. */
+        vc = (vector unsigned char)vec_cmpne(vsrc0, vsrc1);
+
+        /* Since the index of matching bytes will contain only zeroes
+         * on vc (since we used cmpne), counting the number of consecutive
+         * bytes where LSB == 0 is the same as counting the length of the match. */
+        zng_vec_vctzlsbb(vc, cmplen);
+        if (cmplen != 16)
+            return len + cmplen;
+
+        src0 += 16, src1 += 16, len += 16;
+    } while (len < 256);
+
+   return 256;
+}
+
+Z_INTERNAL uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_power9_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_power9
+#define COMPARE256          compare256_power9_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_power9
+#define COMPARE256          compare256_power9_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/power/crc32_constants.h b/3rdparty/zlib-ng/arch/power/crc32_constants.h
new file mode 100644
index 000000000000..8c8f2153b60e
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/crc32_constants.h
@@ -0,0 +1,1123 @@
+/* Constants table used by crc32_power8.c
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * This file was automatically generated, DO NOT EDIT IT MANUALLY.
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zendian.h"
+#include "zbuild.h"
+
+/* Reduce 262144 kbits to 1024 bits */
+static const __vector unsigned long long vcrc_const[255] ALIGNED_(16) = {
+#if BYTE_ORDER == LITTLE_ENDIAN
+    /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+    { 0x0000000099ea94a8, 0x00000001651797d2 },
+    /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+    { 0x00000000945a8420, 0x0000000021e0d56c },
+    /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+    { 0x0000000030762706, 0x000000000f95ecaa },
+    /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+    { 0x00000001a52fc582, 0x00000001ebd224ac },
+    /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+    { 0x00000001a4a7167a, 0x000000000ccb97ca },
+    /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+    { 0x000000000c18249a, 0x00000001006ec8a8 },
+    /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+    { 0x00000000a924ae7c, 0x000000014f58f196 },
+    /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+    { 0x00000001e12ccc12, 0x00000001a7192ca6 },
+    /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+    { 0x00000000a0b9d4ac, 0x000000019a64bab2 },
+    /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+    { 0x0000000095e8ddfe, 0x0000000014f4ed2e },
+    /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+    { 0x00000000233fddc4, 0x000000011092b6a2 },
+    /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+    { 0x00000001b4529b62, 0x00000000c8a1629c },
+    /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+    { 0x00000001a7fa0e64, 0x000000017bf32e8e },
+    /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+    { 0x00000001b5334592, 0x00000001f8cc6582 },
+    /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+    { 0x000000011f8ee1b4, 0x000000008631ddf0 },
+    /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+    { 0x000000006252e632, 0x000000007e5a76d0 },
+    /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+    { 0x00000000ab973e84, 0x000000002b09b31c },
+    /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+    { 0x000000007734f5ec, 0x00000001b2df1f84 },
+    /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+    { 0x000000007c547798, 0x00000001d6f56afc },
+    /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+    { 0x000000007ec40210, 0x00000001b9b5e70c },
+    /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+    { 0x00000001ab1695a8, 0x0000000034b626d2 },
+    /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+    { 0x0000000090494bba, 0x000000014c53479a },
+    /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+    { 0x00000001123fb816, 0x00000001a6d179a4 },
+    /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+    { 0x00000001e188c74c, 0x000000015abd16b4 },
+    /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+    { 0x00000001c2d3451c, 0x00000000018f9852 },
+    /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+    { 0x00000000f55cf1ca, 0x000000001fb3084a },
+    /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+    { 0x00000001a0531540, 0x00000000c53dfb04 },
+    /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+    { 0x0000000132cd7ebc, 0x00000000e10c9ad6 },
+    /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+    { 0x0000000073ab7f36, 0x0000000025aa994a },
+    /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+    { 0x0000000041aed1c2, 0x00000000fa3a74c4 },
+    /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+    { 0x0000000136c53800, 0x0000000033eb3f40 },
+    /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+    { 0x0000000126835a30, 0x000000017193f296 },
+    /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+    { 0x000000006241b502, 0x0000000043f6c86a },
+    /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+    { 0x00000000d5196ad4, 0x000000016b513ec6 },
+    /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+    { 0x000000009cfa769a, 0x00000000c8f25b4e },
+    /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+    { 0x00000000920e5df4, 0x00000001a45048ec },
+    /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+    { 0x0000000169dc310e, 0x000000000c441004 },
+    /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+    { 0x0000000009fc331c, 0x000000000e17cad6 },
+    /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+    { 0x000000010d94a81e, 0x00000001253ae964 },
+    /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+    { 0x0000000027a20ab2, 0x00000001d7c88ebc },
+    /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+    { 0x0000000114f87504, 0x00000001e7ca913a },
+    /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+    { 0x000000004b076d96, 0x0000000033ed078a },
+    /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+    { 0x00000000da4d1e74, 0x00000000e1839c78 },
+    /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+    { 0x000000001b81f672, 0x00000001322b267e },
+    /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+    { 0x000000009367c988, 0x00000000638231b6 },
+    /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+    { 0x00000001717214ca, 0x00000001ee7f16f4 },
+    /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+    { 0x000000009f47d820, 0x0000000117d9924a },
+    /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+    { 0x000000010d9a47d2, 0x00000000e1a9e0c4 },
+    /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+    { 0x00000000a696c58c, 0x00000001403731dc },
+    /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+    { 0x000000002aa28ec6, 0x00000001a5ea9682 },
+    /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+    { 0x00000001fe18fd9a, 0x0000000101c5c578 },
+    /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+    { 0x000000019d4fc1ae, 0x00000000dddf6494 },
+    /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+    { 0x00000001ba0e3dea, 0x00000000f1c3db28 },
+    /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+    { 0x0000000074b59a5e, 0x000000013112fb9c },
+    /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+    { 0x00000000f2b5ea98, 0x00000000b680b906 },
+    /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+    { 0x0000000187132676, 0x000000001a282932 },
+    /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+    { 0x000000010a8c6ad4, 0x0000000089406e7e },
+    /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+    { 0x00000001e21dfe70, 0x00000001def6be8c },
+    /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+    { 0x00000001da0050e4, 0x0000000075258728 },
+    /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+    { 0x00000000772172ae, 0x000000019536090a },
+    /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+    { 0x00000000e47724aa, 0x00000000f2455bfc },
+    /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+    { 0x000000003cd63ac4, 0x000000018c40baf4 },
+    /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+    { 0x00000001bf47d352, 0x000000004cd390d4 },
+    /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+    { 0x000000018dc1d708, 0x00000001e4ece95a },
+    /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+    { 0x000000002d4620a4, 0x000000001a3ee918 },
+    /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+    { 0x0000000058fd1740, 0x000000007c652fb8 },
+    /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+    { 0x00000000dadd9bfc, 0x000000011c67842c },
+    /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+    { 0x00000001ea2140be, 0x00000000254f759c },
+    /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+    { 0x000000009de128ba, 0x000000007ece94ca },
+    /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+    { 0x000000013ac3aa8e, 0x0000000038f258c2 },
+    /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+    { 0x0000000099980562, 0x00000001cdf17b00 },
+    /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+    { 0x00000001c1579c86, 0x000000011f882c16 },
+    /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+    { 0x0000000068dbbf94, 0x0000000100093fc8 },
+    /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+    { 0x000000004509fb04, 0x00000001cd684f16 },
+    /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+    { 0x00000001202f6398, 0x000000004bc6a70a },
+    /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+    { 0x000000013aea243e, 0x000000004fc7e8e4 },
+    /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+    { 0x00000001b4052ae6, 0x0000000130103f1c },
+    /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+    { 0x00000001cd2a0ae8, 0x0000000111b0024c },
+    /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+    { 0x00000001fe4aa8b4, 0x000000010b3079da },
+    /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+    { 0x00000001d1559a42, 0x000000010192bcc2 },
+    /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+    { 0x00000001f3e05ecc, 0x0000000074838d50 },
+    /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+    { 0x0000000104ddd2cc, 0x000000001b20f520 },
+    /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+    { 0x000000015393153c, 0x0000000050c3590a },
+    /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+    { 0x0000000057e942c6, 0x00000000b41cac8e },
+    /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+    { 0x000000012c633850, 0x000000000c72cc78 },
+    /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+    { 0x00000000ebcaae4c, 0x0000000030cdb032 },
+    /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+    { 0x000000013ee532a6, 0x000000013e09fc32 },
+    /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+    { 0x00000001bf0cbc7e, 0x000000001ed624d2 },
+    /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+    { 0x00000000d50b7a5a, 0x00000000781aee1a },
+    /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+    { 0x0000000002fca6e8, 0x00000001c4d8348c },
+    /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+    { 0x000000007af40044, 0x0000000057a40336 },
+    /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+    { 0x0000000016178744, 0x0000000085544940 },
+    /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+    { 0x000000014c177458, 0x000000019cd21e80 },
+    /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+    { 0x000000011b6ddf04, 0x000000013eb95bc0 },
+    /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+    { 0x00000001f3e29ccc, 0x00000001dfc9fdfc },
+    /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+    { 0x0000000135ae7562, 0x00000000cd028bc2 },
+    /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+    { 0x0000000190ef812c, 0x0000000090db8c44 },
+    /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+    { 0x0000000067a2c786, 0x000000010010a4ce },
+    /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+    { 0x0000000048b9496c, 0x00000001c8f4c72c },
+    /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+    { 0x000000015a422de6, 0x000000001c26170c },
+    /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+    { 0x00000001ef0e3640, 0x00000000e3fccf68 },
+    /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+    { 0x00000001006d2d26, 0x00000000d513ed24 },
+    /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+    { 0x00000001170d56d6, 0x00000000141beada },
+    /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+    { 0x00000000a5fb613c, 0x000000011071aea0 },
+    /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+    { 0x0000000040bbf7fc, 0x000000012e19080a },
+    /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+    { 0x000000016ac3a5b2, 0x0000000100ecf826 },
+    /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+    { 0x00000000abf16230, 0x0000000069b09412 },
+    /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+    { 0x00000001ebe23fac, 0x0000000122297bac },
+    /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+    { 0x000000008b6a0894, 0x00000000e9e4b068 },
+    /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+    { 0x00000001288ea478, 0x000000004b38651a },
+    /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+    { 0x000000016619c442, 0x00000001468360e2 },
+    /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+    { 0x0000000086230038, 0x00000000121c2408 },
+    /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+    { 0x000000017746a756, 0x00000000da7e7d08 },
+    /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+    { 0x0000000191b8f8f8, 0x00000001058d7652 },
+    /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+    { 0x000000008e167708, 0x000000014a098a90 },
+    /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+    { 0x0000000148b22d54, 0x0000000020dbe72e },
+    /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+    { 0x0000000044ba2c3c, 0x000000011e7323e8 },
+    /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+    { 0x00000000b54d2b52, 0x00000000d5d4bf94 },
+    /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+    { 0x0000000005a4fd8a, 0x0000000199d8746c },
+    /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+    { 0x0000000139f9fc46, 0x00000000ce9ca8a0 },
+    /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+    { 0x000000015a1fa824, 0x00000000136edece },
+    /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+    { 0x000000000a61ae4c, 0x000000019b92a068 },
+    /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+    { 0x0000000145e9113e, 0x0000000071d62206 },
+    /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+    { 0x000000006a348448, 0x00000000dfc50158 },
+    /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+    { 0x000000004d80a08c, 0x00000001517626bc },
+    /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+    { 0x000000014b6837a0, 0x0000000148d1e4fa },
+    /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+    { 0x000000016896a7fc, 0x0000000094d8266e },
+    /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+    { 0x000000014f187140, 0x00000000606c5e34 },
+    /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+    { 0x000000019581b9da, 0x000000019766beaa },
+    /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+    { 0x00000001091bc984, 0x00000001d80c506c },
+    /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+    { 0x000000001067223c, 0x000000001e73837c },
+    /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+    { 0x00000001ab16ea02, 0x0000000064d587de },
+    /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+    { 0x000000013c4598a8, 0x00000000f4a507b0 },
+    /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+    { 0x00000000b3735430, 0x0000000040e342fc },
+    /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+    { 0x00000001bb3fc0c0, 0x00000001d5ad9c3a },
+    /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+    { 0x00000001570ae19c, 0x0000000094a691a4 },
+    /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+    { 0x00000001ea910712, 0x00000001271ecdfa },
+    /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+    { 0x0000000167127128, 0x000000009e54475a },
+    /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+    { 0x0000000019e790a2, 0x00000000c9c099ee },
+    /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+    { 0x000000003788f710, 0x000000009a2f736c },
+    /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+    { 0x00000001682a160e, 0x00000000bb9f4996 },
+    /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+    { 0x000000007f0ebd2e, 0x00000001db688050 },
+    /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+    { 0x000000002b032080, 0x00000000e9b10af4 },
+    /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+    { 0x00000000cfd1664a, 0x000000012d4545e4 },
+    /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+    { 0x00000000aa1181c2, 0x000000000361139c },
+    /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+    { 0x00000000ddd08002, 0x00000001a5a1a3a8 },
+    /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+    { 0x00000000e8dd0446, 0x000000006844e0b0 },
+    /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+    { 0x00000001bbd94a00, 0x00000000c3762f28 },
+    /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+    { 0x00000000ab6cd180, 0x00000001d26287a2 },
+    /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+    { 0x0000000031803ce2, 0x00000001f6f0bba8 },
+    /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+    { 0x0000000024f40b0c, 0x000000002ffabd62 },
+    /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+    { 0x00000001ba1d9834, 0x00000000fb4516b8 },
+    /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+    { 0x0000000104de61aa, 0x000000018cfa961c },
+    /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+    { 0x0000000113e40d46, 0x000000019e588d52 },
+    /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+    { 0x00000001415598a0, 0x00000001180f0bbc },
+    /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+    { 0x00000000bf6c8c90, 0x00000000e1d9177a },
+    /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+    { 0x00000001788b0504, 0x0000000105abc27c },
+    /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+    { 0x0000000038385d02, 0x00000000972e4a58 },
+    /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+    { 0x00000001b6c83844, 0x0000000183499a5e },
+    /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+    { 0x0000000051061a8a, 0x00000001c96a8cca },
+    /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+    { 0x000000017351388a, 0x00000001a1a5b60c },
+    /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+    { 0x0000000132928f92, 0x00000000e4b6ac9c },
+    /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+    { 0x00000000e6b4f48a, 0x00000001807e7f5a },
+    /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+    { 0x0000000039d15e90, 0x000000017a7e3bc8 },
+    /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+    { 0x00000000312d6074, 0x00000000d73975da },
+    /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+    { 0x000000017bbb2cc4, 0x000000017375d038 },
+    /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+    { 0x000000016ded3e18, 0x00000000193680bc },
+    /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+    { 0x00000000f1638b16, 0x00000000999b06f6 },
+    /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+    { 0x00000001d38b9ecc, 0x00000001f685d2b8 },
+    /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+    { 0x000000018b8d09dc, 0x00000001f4ecbed2 },
+    /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+    { 0x00000000e7bc27d2, 0x00000000ba16f1a0 },
+    /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+    { 0x00000000275e1e96, 0x0000000115aceac4 },
+    /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+    { 0x00000000e2e3031e, 0x00000001aeff6292 },
+    /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+    { 0x00000001041c84d8, 0x000000009640124c },
+    /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+    { 0x00000000706ce672, 0x0000000114f41f02 },
+    /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+    { 0x000000015d5070da, 0x000000009c5f3586 },
+    /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+    { 0x0000000038f9493a, 0x00000001878275fa },
+    /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+    { 0x00000000a3348a76, 0x00000000ddc42ce8 },
+    /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+    { 0x00000001ad0aab92, 0x0000000181d2c73a },
+    /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+    { 0x000000019e85f712, 0x0000000141c9320a },
+    /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+    { 0x000000005a871e76, 0x000000015235719a },
+    /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+    { 0x000000017249c662, 0x00000000be27d804 },
+    /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+    { 0x000000003a084712, 0x000000006242d45a },
+    /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+    { 0x00000000ed438478, 0x000000009a53638e },
+    /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+    { 0x00000000abac34cc, 0x00000001001ecfb6 },
+    /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+    { 0x000000005f35ef3e, 0x000000016d7c2d64 },
+    /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+    { 0x0000000047d6608c, 0x00000001d0ce46c0 },
+    /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+    { 0x000000002d01470e, 0x0000000124c907b4 },
+    /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+    { 0x0000000158bbc7b0, 0x0000000018a555ca },
+    /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+    { 0x00000000c0a23e8e, 0x000000006b0980bc },
+    /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+    { 0x00000001ebd85c88, 0x000000008bbba964 },
+    /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+    { 0x000000019ee20bb2, 0x00000001070a5a1e },
+    /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+    { 0x00000001acabf2d6, 0x000000002204322a },
+    /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+    { 0x00000001b7963d56, 0x00000000a27524d0 },
+    /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+    { 0x000000017bffa1fe, 0x0000000020b1e4ba },
+    /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+    { 0x000000001f15333e, 0x0000000032cc27fc },
+    /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+    { 0x000000018593129e, 0x0000000044dd22b8 },
+    /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+    { 0x000000019cb32602, 0x00000000dffc9e0a },
+    /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+    { 0x0000000142b05cc8, 0x00000001b7a0ed14 },
+    /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+    { 0x00000001be49e7a4, 0x00000000c7842488 },
+    /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+    { 0x0000000108f69d6c, 0x00000001c02a4fee },
+    /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+    { 0x000000006c0971f0, 0x000000003c273778 },
+    /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+    { 0x000000005b16467a, 0x00000001d63f8894 },
+    /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+    { 0x00000001551a628e, 0x000000006be557d6 },
+    /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+    { 0x000000019e42ea92, 0x000000006a7806ea },
+    /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+    { 0x000000012fa83ff2, 0x000000016155aa0c },
+    /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+    { 0x000000011ca9cde0, 0x00000000908650ac },
+    /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+    { 0x00000000c8e5cd74, 0x00000000aa5a8084 },
+    /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+    { 0x0000000096c27f0c, 0x0000000191bb500a },
+    /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+    { 0x000000002baed926, 0x0000000064e9bed0 },
+    /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+    { 0x000000017c8de8d2, 0x000000009444f302 },
+    /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+    { 0x00000000d43d6068, 0x000000019db07d3c },
+    /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+    { 0x00000000cb2c4b26, 0x00000001359e3e6e },
+    /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+    { 0x0000000145b8da26, 0x00000001e4f10dd2 },
+    /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+    { 0x000000018fff4b08, 0x0000000124f5735e },
+    /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+    { 0x0000000150b58ed0, 0x0000000124760a4c },
+    /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+    { 0x00000001549f39bc, 0x000000000f1fc186 },
+    /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+    { 0x00000000ef4d2f42, 0x00000000150e4cc4 },
+    /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+    { 0x00000001b1468572, 0x000000002a6204e8 },
+    /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+    { 0x000000013d7403b2, 0x00000000beb1d432 },
+    /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+    { 0x00000001a4681842, 0x0000000135f3f1f0 },
+    /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+    { 0x0000000167714492, 0x0000000074fe2232 },
+    /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+    { 0x00000001e599099a, 0x000000001ac6e2ba },
+    /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+    { 0x00000000fe128194, 0x0000000013fca91e },
+    /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+    { 0x0000000077e8b990, 0x0000000183f4931e },
+    /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+    { 0x00000001a267f63a, 0x00000000b6d9b4e4 },
+    /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+    { 0x00000001945c245a, 0x00000000b5188656 },
+    /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+    { 0x0000000149002e76, 0x0000000027a81a84 },
+    /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+    { 0x00000001bb8310a4, 0x0000000125699258 },
+    /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+    { 0x000000019ec60bcc, 0x00000001b23de796 },
+    /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+    { 0x000000012d8590ae, 0x00000000fe4365dc },
+    /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+    { 0x0000000065b00684, 0x00000000c68f497a },
+    /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+    { 0x000000015e5aeadc, 0x00000000fbf521ee },
+    /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+    { 0x00000000b77ff2b0, 0x000000015eac3378 },
+    /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+    { 0x0000000188da2ff6, 0x0000000134914b90 },
+    /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+    { 0x0000000063da929a, 0x0000000016335cfe },
+    /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+    { 0x00000001389caa80, 0x000000010372d10c },
+    /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+    { 0x000000013db599d2, 0x000000015097b908 },
+    /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+    { 0x0000000122505a86, 0x00000001227a7572 },
+    /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+    { 0x000000016bd72746, 0x000000009a8f75c0 },
+    /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+    { 0x00000001c3faf1d4, 0x00000000682c77a2 },
+    /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+    { 0x00000001111c826c, 0x00000000231f091c },
+    /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+    { 0x00000000153e9fb2, 0x000000007d4439f2 },
+    /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+    { 0x000000002b1f7b60, 0x000000017e221efc },
+    /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+    { 0x00000000b1dba570, 0x0000000167457c38 },
+    /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+    { 0x00000001f6397b76, 0x00000000bdf081c4 },
+    /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+    { 0x0000000156335214, 0x000000016286d6b0 },
+    /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+    { 0x00000001d70e3986, 0x00000000c84f001c },
+    /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+    { 0x000000003701a774, 0x0000000064efe7c0 },
+    /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+    { 0x00000000ac81ef72, 0x000000000ac2d904 },
+    /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+    { 0x0000000133212464, 0x00000000fd226d14 },
+    /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+    { 0x00000000e4e45610, 0x000000011cfd42e0 },
+    /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+    { 0x000000000c1bd370, 0x000000016e5a5678 },
+    /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+    { 0x00000001a7b9e7a6, 0x00000001d888fe22 },
+    /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+    { 0x000000007d657a10, 0x00000001af77fcd4 }
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+    /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+    { 0x00000001651797d2, 0x0000000099ea94a8 },
+    /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+    { 0x0000000021e0d56c, 0x00000000945a8420 },
+    /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+    { 0x000000000f95ecaa, 0x0000000030762706 },
+    /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+    { 0x00000001ebd224ac, 0x00000001a52fc582 },
+    /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+    { 0x000000000ccb97ca, 0x00000001a4a7167a },
+    /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+    { 0x00000001006ec8a8, 0x000000000c18249a },
+    /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+    { 0x000000014f58f196, 0x00000000a924ae7c },
+    /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+    { 0x00000001a7192ca6, 0x00000001e12ccc12 },
+    /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+    { 0x000000019a64bab2, 0x00000000a0b9d4ac },
+    /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+    { 0x0000000014f4ed2e, 0x0000000095e8ddfe },
+    /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+    { 0x000000011092b6a2, 0x00000000233fddc4 },
+    /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+    { 0x00000000c8a1629c, 0x00000001b4529b62 },
+    /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+    { 0x000000017bf32e8e, 0x00000001a7fa0e64 },
+    /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+    { 0x00000001f8cc6582, 0x00000001b5334592 },
+    /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+    { 0x000000008631ddf0, 0x000000011f8ee1b4 },
+    /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+    { 0x000000007e5a76d0, 0x000000006252e632 },
+    /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+    { 0x000000002b09b31c, 0x00000000ab973e84 },
+    /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+    { 0x00000001b2df1f84, 0x000000007734f5ec },
+    /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+    { 0x00000001d6f56afc, 0x000000007c547798 },
+    /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+    { 0x00000001b9b5e70c, 0x000000007ec40210 },
+    /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+    { 0x0000000034b626d2, 0x00000001ab1695a8 },
+    /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+    { 0x000000014c53479a, 0x0000000090494bba },
+    /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+    { 0x00000001a6d179a4, 0x00000001123fb816 },
+    /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+    { 0x000000015abd16b4, 0x00000001e188c74c },
+    /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+    { 0x00000000018f9852, 0x00000001c2d3451c },
+    /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+    { 0x000000001fb3084a, 0x00000000f55cf1ca },
+    /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+    { 0x00000000c53dfb04, 0x00000001a0531540 },
+    /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+    { 0x00000000e10c9ad6, 0x0000000132cd7ebc },
+    /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+    { 0x0000000025aa994a, 0x0000000073ab7f36 },
+    /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+    { 0x00000000fa3a74c4, 0x0000000041aed1c2 },
+    /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+    { 0x0000000033eb3f40, 0x0000000136c53800 },
+    /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+    { 0x000000017193f296, 0x0000000126835a30 },
+    /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+    { 0x0000000043f6c86a, 0x000000006241b502 },
+    /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+    { 0x000000016b513ec6, 0x00000000d5196ad4 },
+    /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+    { 0x00000000c8f25b4e, 0x000000009cfa769a },
+    /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+    { 0x00000001a45048ec, 0x00000000920e5df4 },
+    /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+    { 0x000000000c441004, 0x0000000169dc310e },
+    /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+    { 0x000000000e17cad6, 0x0000000009fc331c },
+    /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+    { 0x00000001253ae964, 0x000000010d94a81e },
+    /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+    { 0x00000001d7c88ebc, 0x0000000027a20ab2 },
+    /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+    { 0x00000001e7ca913a, 0x0000000114f87504 },
+    /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+    { 0x0000000033ed078a, 0x000000004b076d96 },
+    /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+    { 0x00000000e1839c78, 0x00000000da4d1e74 },
+    /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+    { 0x00000001322b267e, 0x000000001b81f672 },
+    /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+    { 0x00000000638231b6, 0x000000009367c988 },
+    /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+    { 0x00000001ee7f16f4, 0x00000001717214ca },
+    /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+    { 0x0000000117d9924a, 0x000000009f47d820 },
+    /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+    { 0x00000000e1a9e0c4, 0x000000010d9a47d2 },
+    /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+    { 0x00000001403731dc, 0x00000000a696c58c },
+    /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+    { 0x00000001a5ea9682, 0x000000002aa28ec6 },
+    /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+    { 0x0000000101c5c578, 0x00000001fe18fd9a },
+    /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+    { 0x00000000dddf6494, 0x000000019d4fc1ae },
+    /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+    { 0x00000000f1c3db28, 0x00000001ba0e3dea },
+    /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+    { 0x000000013112fb9c, 0x0000000074b59a5e },
+    /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+    { 0x00000000b680b906, 0x00000000f2b5ea98 },
+    /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+    { 0x000000001a282932, 0x0000000187132676 },
+    /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+    { 0x0000000089406e7e, 0x000000010a8c6ad4 },
+    /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+    { 0x00000001def6be8c, 0x00000001e21dfe70 },
+    /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+    { 0x0000000075258728, 0x00000001da0050e4 },
+    /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+    { 0x000000019536090a, 0x00000000772172ae },
+    /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+    { 0x00000000f2455bfc, 0x00000000e47724aa },
+    /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+    { 0x000000018c40baf4, 0x000000003cd63ac4 },
+    /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+    { 0x000000004cd390d4, 0x00000001bf47d352 },
+    /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+    { 0x00000001e4ece95a, 0x000000018dc1d708 },
+    /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+    { 0x000000001a3ee918, 0x000000002d4620a4 },
+    /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+    { 0x000000007c652fb8, 0x0000000058fd1740 },
+    /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+    { 0x000000011c67842c, 0x00000000dadd9bfc },
+    /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+    { 0x00000000254f759c, 0x00000001ea2140be },
+    /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+    { 0x000000007ece94ca, 0x000000009de128ba },
+    /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+    { 0x0000000038f258c2, 0x000000013ac3aa8e },
+    /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+    { 0x00000001cdf17b00, 0x0000000099980562 },
+    /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+    { 0x000000011f882c16, 0x00000001c1579c86 },
+    /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+    { 0x0000000100093fc8, 0x0000000068dbbf94 },
+    /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+    { 0x00000001cd684f16, 0x000000004509fb04 },
+    /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+    { 0x000000004bc6a70a, 0x00000001202f6398 },
+    /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+    { 0x000000004fc7e8e4, 0x000000013aea243e },
+    /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+    { 0x0000000130103f1c, 0x00000001b4052ae6 },
+    /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+    { 0x0000000111b0024c, 0x00000001cd2a0ae8 },
+    /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+    { 0x000000010b3079da, 0x00000001fe4aa8b4 },
+    /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+    { 0x000000010192bcc2, 0x00000001d1559a42 },
+    /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+    { 0x0000000074838d50, 0x00000001f3e05ecc },
+    /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+    { 0x000000001b20f520, 0x0000000104ddd2cc },
+    /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+    { 0x0000000050c3590a, 0x000000015393153c },
+    /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+    { 0x00000000b41cac8e, 0x0000000057e942c6 },
+    /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+    { 0x000000000c72cc78, 0x000000012c633850 },
+    /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+    { 0x0000000030cdb032, 0x00000000ebcaae4c },
+    /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+    { 0x000000013e09fc32, 0x000000013ee532a6 },
+    /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+    { 0x000000001ed624d2, 0x00000001bf0cbc7e },
+    /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+    { 0x00000000781aee1a, 0x00000000d50b7a5a },
+    /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+    { 0x00000001c4d8348c, 0x0000000002fca6e8 },
+    /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+    { 0x0000000057a40336, 0x000000007af40044 },
+    /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+    { 0x0000000085544940, 0x0000000016178744 },
+    /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+    { 0x000000019cd21e80, 0x000000014c177458 },
+    /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+    { 0x000000013eb95bc0, 0x000000011b6ddf04 },
+    /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+    { 0x00000001dfc9fdfc, 0x00000001f3e29ccc },
+    /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+    { 0x00000000cd028bc2, 0x0000000135ae7562 },
+    /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+    { 0x0000000090db8c44, 0x0000000190ef812c },
+    /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+    { 0x000000010010a4ce, 0x0000000067a2c786 },
+    /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+    { 0x00000001c8f4c72c, 0x0000000048b9496c },
+    /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+    { 0x000000001c26170c, 0x000000015a422de6 },
+    /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+    { 0x00000000e3fccf68, 0x00000001ef0e3640 },
+    /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+    { 0x00000000d513ed24, 0x00000001006d2d26 },
+    /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+    { 0x00000000141beada, 0x00000001170d56d6 },
+    /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+    { 0x000000011071aea0, 0x00000000a5fb613c },
+    /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+    { 0x000000012e19080a, 0x0000000040bbf7fc },
+    /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+    { 0x0000000100ecf826, 0x000000016ac3a5b2 },
+    /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+    { 0x0000000069b09412, 0x00000000abf16230 },
+    /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+    { 0x0000000122297bac, 0x00000001ebe23fac },
+    /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+    { 0x00000000e9e4b068, 0x000000008b6a0894 },
+    /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+    { 0x000000004b38651a, 0x00000001288ea478 },
+    /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+    { 0x00000001468360e2, 0x000000016619c442 },
+    /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+    { 0x00000000121c2408, 0x0000000086230038 },
+    /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+    { 0x00000000da7e7d08, 0x000000017746a756 },
+    /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+    { 0x00000001058d7652, 0x0000000191b8f8f8 },
+    /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+    { 0x000000014a098a90, 0x000000008e167708 },
+    /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+    { 0x0000000020dbe72e, 0x0000000148b22d54 },
+    /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+    { 0x000000011e7323e8, 0x0000000044ba2c3c },
+    /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+    { 0x00000000d5d4bf94, 0x00000000b54d2b52 },
+    /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+    { 0x0000000199d8746c, 0x0000000005a4fd8a },
+    /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+    { 0x00000000ce9ca8a0, 0x0000000139f9fc46 },
+    /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+    { 0x00000000136edece, 0x000000015a1fa824 },
+    /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+    { 0x000000019b92a068, 0x000000000a61ae4c },
+    /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+    { 0x0000000071d62206, 0x0000000145e9113e },
+    /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+    { 0x00000000dfc50158, 0x000000006a348448 },
+    /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+    { 0x00000001517626bc, 0x000000004d80a08c },
+    /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+    { 0x0000000148d1e4fa, 0x000000014b6837a0 },
+    /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+    { 0x0000000094d8266e, 0x000000016896a7fc },
+    /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+    { 0x00000000606c5e34, 0x000000014f187140 },
+    /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+    { 0x000000019766beaa, 0x000000019581b9da },
+    /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+    { 0x00000001d80c506c, 0x00000001091bc984 },
+    /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+    { 0x000000001e73837c, 0x000000001067223c },
+    /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+    { 0x0000000064d587de, 0x00000001ab16ea02 },
+    /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+    { 0x00000000f4a507b0, 0x000000013c4598a8 },
+    /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+    { 0x0000000040e342fc, 0x00000000b3735430 },
+    /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+    { 0x00000001d5ad9c3a, 0x00000001bb3fc0c0 },
+    /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+    { 0x0000000094a691a4, 0x00000001570ae19c },
+    /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+    { 0x00000001271ecdfa, 0x00000001ea910712 },
+    /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+    { 0x000000009e54475a, 0x0000000167127128 },
+    /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+    { 0x00000000c9c099ee, 0x0000000019e790a2 },
+    /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+    { 0x000000009a2f736c, 0x000000003788f710 },
+    /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+    { 0x00000000bb9f4996, 0x00000001682a160e },
+    /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+    { 0x00000001db688050, 0x000000007f0ebd2e },
+    /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+    { 0x00000000e9b10af4, 0x000000002b032080 },
+    /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+    { 0x000000012d4545e4, 0x00000000cfd1664a },
+    /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+    { 0x000000000361139c, 0x00000000aa1181c2 },
+    /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+    { 0x00000001a5a1a3a8, 0x00000000ddd08002 },
+    /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+    { 0x000000006844e0b0, 0x00000000e8dd0446 },
+    /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+    { 0x00000000c3762f28, 0x00000001bbd94a00 },
+    /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+    { 0x00000001d26287a2, 0x00000000ab6cd180 },
+    /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+    { 0x00000001f6f0bba8, 0x0000000031803ce2 },
+    /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+    { 0x000000002ffabd62, 0x0000000024f40b0c },
+    /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+    { 0x00000000fb4516b8, 0x00000001ba1d9834 },
+    /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+    { 0x000000018cfa961c, 0x0000000104de61aa },
+    /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+    { 0x000000019e588d52, 0x0000000113e40d46 },
+    /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+    { 0x00000001180f0bbc, 0x00000001415598a0 },
+    /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+    { 0x00000000e1d9177a, 0x00000000bf6c8c90 },
+    /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+    { 0x0000000105abc27c, 0x00000001788b0504 },
+    /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+    { 0x00000000972e4a58, 0x0000000038385d02 },
+    /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+    { 0x0000000183499a5e, 0x00000001b6c83844 },
+    /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+    { 0x00000001c96a8cca, 0x0000000051061a8a },
+    /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+    { 0x00000001a1a5b60c, 0x000000017351388a },
+    /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+    { 0x00000000e4b6ac9c, 0x0000000132928f92 },
+    /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+    { 0x00000001807e7f5a, 0x00000000e6b4f48a },
+    /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+    { 0x000000017a7e3bc8, 0x0000000039d15e90 },
+    /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+    { 0x00000000d73975da, 0x00000000312d6074 },
+    /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+    { 0x000000017375d038, 0x000000017bbb2cc4 },
+    /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+    { 0x00000000193680bc, 0x000000016ded3e18 },
+    /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+    { 0x00000000999b06f6, 0x00000000f1638b16 },
+    /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+    { 0x00000001f685d2b8, 0x00000001d38b9ecc },
+    /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+    { 0x00000001f4ecbed2, 0x000000018b8d09dc },
+    /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+    { 0x00000000ba16f1a0, 0x00000000e7bc27d2 },
+    /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+    { 0x0000000115aceac4, 0x00000000275e1e96 },
+    /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+    { 0x00000001aeff6292, 0x00000000e2e3031e },
+    /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+    { 0x000000009640124c, 0x00000001041c84d8 },
+    /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+    { 0x0000000114f41f02, 0x00000000706ce672 },
+    /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+    { 0x000000009c5f3586, 0x000000015d5070da },
+    /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+    { 0x00000001878275fa, 0x0000000038f9493a },
+    /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+    { 0x00000000ddc42ce8, 0x00000000a3348a76 },
+    /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+    { 0x0000000181d2c73a, 0x00000001ad0aab92 },
+    /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+    { 0x0000000141c9320a, 0x000000019e85f712 },
+    /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+    { 0x000000015235719a, 0x000000005a871e76 },
+    /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+    { 0x00000000be27d804, 0x000000017249c662 },
+    /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+    { 0x000000006242d45a, 0x000000003a084712 },
+    /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+    { 0x000000009a53638e, 0x00000000ed438478 },
+    /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+    { 0x00000001001ecfb6, 0x00000000abac34cc },
+    /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+    { 0x000000016d7c2d64, 0x000000005f35ef3e },
+    /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+    { 0x00000001d0ce46c0, 0x0000000047d6608c },
+    /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+    { 0x0000000124c907b4, 0x000000002d01470e },
+    /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+    { 0x0000000018a555ca, 0x0000000158bbc7b0 },
+    /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+    { 0x000000006b0980bc, 0x00000000c0a23e8e },
+    /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+    { 0x000000008bbba964, 0x00000001ebd85c88 },
+    /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+    { 0x00000001070a5a1e, 0x000000019ee20bb2 },
+    /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+    { 0x000000002204322a, 0x00000001acabf2d6 },
+    /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+    { 0x00000000a27524d0, 0x00000001b7963d56 },
+    /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+    { 0x0000000020b1e4ba, 0x000000017bffa1fe },
+    /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+    { 0x0000000032cc27fc, 0x000000001f15333e },
+    /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+    { 0x0000000044dd22b8, 0x000000018593129e },
+    /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+    { 0x00000000dffc9e0a, 0x000000019cb32602 },
+    /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+    { 0x00000001b7a0ed14, 0x0000000142b05cc8 },
+    /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+    { 0x00000000c7842488, 0x00000001be49e7a4 },
+    /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+    { 0x00000001c02a4fee, 0x0000000108f69d6c },
+    /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+    { 0x000000003c273778, 0x000000006c0971f0 },
+    /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+    { 0x00000001d63f8894, 0x000000005b16467a },
+    /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+    { 0x000000006be557d6, 0x00000001551a628e },
+    /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+    { 0x000000006a7806ea, 0x000000019e42ea92 },
+    /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+    { 0x000000016155aa0c, 0x000000012fa83ff2 },
+    /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+    { 0x00000000908650ac, 0x000000011ca9cde0 },
+    /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+    { 0x00000000aa5a8084, 0x00000000c8e5cd74 },
+    /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+    { 0x0000000191bb500a, 0x0000000096c27f0c },
+    /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+    { 0x0000000064e9bed0, 0x000000002baed926 },
+    /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+    { 0x000000009444f302, 0x000000017c8de8d2 },
+    /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+    { 0x000000019db07d3c, 0x00000000d43d6068 },
+    /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+    { 0x00000001359e3e6e, 0x00000000cb2c4b26 },
+    /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+    { 0x00000001e4f10dd2, 0x0000000145b8da26 },
+    /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+    { 0x0000000124f5735e, 0x000000018fff4b08 },
+    /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+    { 0x0000000124760a4c, 0x0000000150b58ed0 },
+    /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+    { 0x000000000f1fc186, 0x00000001549f39bc },
+    /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+    { 0x00000000150e4cc4, 0x00000000ef4d2f42 },
+    /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+    { 0x000000002a6204e8, 0x00000001b1468572 },
+    /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+    { 0x00000000beb1d432, 0x000000013d7403b2 },
+    /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+    { 0x0000000135f3f1f0, 0x00000001a4681842 },
+    /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+    { 0x0000000074fe2232, 0x0000000167714492 },
+    /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+    { 0x000000001ac6e2ba, 0x00000001e599099a },
+    /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+    { 0x0000000013fca91e, 0x00000000fe128194 },
+    /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+    { 0x0000000183f4931e, 0x0000000077e8b990 },
+    /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+    { 0x00000000b6d9b4e4, 0x00000001a267f63a },
+    /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+    { 0x00000000b5188656, 0x00000001945c245a },
+    /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+    { 0x0000000027a81a84, 0x0000000149002e76 },
+    /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+    { 0x0000000125699258, 0x00000001bb8310a4 },
+    /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+    { 0x00000001b23de796, 0x000000019ec60bcc },
+    /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+    { 0x00000000fe4365dc, 0x000000012d8590ae },
+    /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+    { 0x00000000c68f497a, 0x0000000065b00684 },
+    /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+    { 0x00000000fbf521ee, 0x000000015e5aeadc },
+    /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+    { 0x000000015eac3378, 0x00000000b77ff2b0 },
+    /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+    { 0x0000000134914b90, 0x0000000188da2ff6 },
+    /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+    { 0x0000000016335cfe, 0x0000000063da929a },
+    /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+    { 0x000000010372d10c, 0x00000001389caa80 },
+    /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+    { 0x000000015097b908, 0x000000013db599d2 },
+    /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+    { 0x00000001227a7572, 0x0000000122505a86 },
+    /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+    { 0x000000009a8f75c0, 0x000000016bd72746 },
+    /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+    { 0x00000000682c77a2, 0x00000001c3faf1d4 },
+    /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+    { 0x00000000231f091c, 0x00000001111c826c },
+    /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+    { 0x000000007d4439f2, 0x00000000153e9fb2 },
+    /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+    { 0x000000017e221efc, 0x000000002b1f7b60 },
+    /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+    { 0x0000000167457c38, 0x00000000b1dba570 },
+    /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+    { 0x00000000bdf081c4, 0x00000001f6397b76 },
+    /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+    { 0x000000016286d6b0, 0x0000000156335214 },
+    /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+    { 0x00000000c84f001c, 0x00000001d70e3986 },
+    /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+    { 0x0000000064efe7c0, 0x000000003701a774 },
+    /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+    { 0x000000000ac2d904, 0x00000000ac81ef72 },
+    /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+    { 0x00000000fd226d14, 0x0000000133212464 },
+    /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+    { 0x000000011cfd42e0, 0x00000000e4e45610 },
+    /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+    { 0x000000016e5a5678, 0x000000000c1bd370 },
+    /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+    { 0x00000001d888fe22, 0x00000001a7b9e7a6 },
+    /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+    { 0x00000001af77fcd4, 0x000000007d657a10 }
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+};
+
+/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+
+static const __vector unsigned long long vcrc_short_const[16] ALIGNED_(16) = {
+#if BYTE_ORDER == LITTLE_ENDIAN
+    /* x^1952 mod p(x) , x^1984 mod p(x) , x^2016 mod p(x) , x^2048 mod p(x)  */
+    { 0x99168a18ec447f11, 0xed837b2613e8221e },
+    /* x^1824 mod p(x) , x^1856 mod p(x) , x^1888 mod p(x) , x^1920 mod p(x)  */
+    { 0xe23e954e8fd2cd3c, 0xc8acdd8147b9ce5a },
+    /* x^1696 mod p(x) , x^1728 mod p(x) , x^1760 mod p(x) , x^1792 mod p(x)  */
+    { 0x92f8befe6b1d2b53, 0xd9ad6d87d4277e25 },
+    /* x^1568 mod p(x) , x^1600 mod p(x) , x^1632 mod p(x) , x^1664 mod p(x)  */
+    { 0xf38a3556291ea462, 0xc10ec5e033fbca3b },
+    /* x^1440 mod p(x) , x^1472 mod p(x) , x^1504 mod p(x) , x^1536 mod p(x)  */
+    { 0x974ac56262b6ca4b, 0xc0b55b0e82e02e2f },
+    /* x^1312 mod p(x) , x^1344 mod p(x) , x^1376 mod p(x) , x^1408 mod p(x)  */
+    { 0x855712b3784d2a56, 0x71aa1df0e172334d },
+    /* x^1184 mod p(x) , x^1216 mod p(x) , x^1248 mod p(x) , x^1280 mod p(x)  */
+    { 0xa5abe9f80eaee722, 0xfee3053e3969324d },
+    /* x^1056 mod p(x) , x^1088 mod p(x) , x^1120 mod p(x) , x^1152 mod p(x)  */
+    { 0x1fa0943ddb54814c, 0xf44779b93eb2bd08 },
+    /* x^928 mod p(x) , x^960 mod p(x) , x^992 mod p(x) , x^1024 mod p(x)  */
+    { 0xa53ff440d7bbfe6a, 0xf5449b3f00cc3374 },
+    /* x^800 mod p(x) , x^832 mod p(x) , x^864 mod p(x) , x^896 mod p(x)  */
+    { 0xebe7e3566325605c, 0x6f8346e1d777606e },
+    /* x^672 mod p(x) , x^704 mod p(x) , x^736 mod p(x) , x^768 mod p(x)  */
+    { 0xc65a272ce5b592b8, 0xe3ab4f2ac0b95347 },
+    /* x^544 mod p(x) , x^576 mod p(x) , x^608 mod p(x) , x^640 mod p(x)  */
+    { 0x5705a9ca4721589f, 0xaa2215ea329ecc11 },
+    /* x^416 mod p(x) , x^448 mod p(x) , x^480 mod p(x) , x^512 mod p(x)  */
+    { 0xe3720acb88d14467, 0x1ed8f66ed95efd26 },
+    /* x^288 mod p(x) , x^320 mod p(x) , x^352 mod p(x) , x^384 mod p(x)  */
+    { 0xba1aca0315141c31, 0x78ed02d5a700e96a },
+    /* x^160 mod p(x) , x^192 mod p(x) , x^224 mod p(x) , x^256 mod p(x)  */
+    { 0xad2a31b3ed627dae, 0xba8ccbe832b39da3 },
+    /* x^32 mod p(x) , x^64 mod p(x) , x^96 mod p(x) , x^128 mod p(x)  */
+    { 0x6655004fa06a2517, 0xedb88320b1e6b092 }
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+    /* x^1952 mod p(x) , x^1984 mod p(x) , x^2016 mod p(x) , x^2048 mod p(x)  */
+    { 0xed837b2613e8221e, 0x99168a18ec447f11 },
+    /* x^1824 mod p(x) , x^1856 mod p(x) , x^1888 mod p(x) , x^1920 mod p(x)  */
+    { 0xc8acdd8147b9ce5a, 0xe23e954e8fd2cd3c },
+    /* x^1696 mod p(x) , x^1728 mod p(x) , x^1760 mod p(x) , x^1792 mod p(x)  */
+    { 0xd9ad6d87d4277e25, 0x92f8befe6b1d2b53 },
+    /* x^1568 mod p(x) , x^1600 mod p(x) , x^1632 mod p(x) , x^1664 mod p(x)  */
+    { 0xc10ec5e033fbca3b, 0xf38a3556291ea462 },
+    /* x^1440 mod p(x) , x^1472 mod p(x) , x^1504 mod p(x) , x^1536 mod p(x)  */
+    { 0xc0b55b0e82e02e2f, 0x974ac56262b6ca4b },
+    /* x^1312 mod p(x) , x^1344 mod p(x) , x^1376 mod p(x) , x^1408 mod p(x)  */
+    { 0x71aa1df0e172334d, 0x855712b3784d2a56 },
+    /* x^1184 mod p(x) , x^1216 mod p(x) , x^1248 mod p(x) , x^1280 mod p(x)  */
+    { 0xfee3053e3969324d, 0xa5abe9f80eaee722 },
+    /* x^1056 mod p(x) , x^1088 mod p(x) , x^1120 mod p(x) , x^1152 mod p(x)  */
+    { 0xf44779b93eb2bd08, 0x1fa0943ddb54814c },
+    /* x^928 mod p(x) , x^960 mod p(x) , x^992 mod p(x) , x^1024 mod p(x)  */
+    { 0xf5449b3f00cc3374, 0xa53ff440d7bbfe6a },
+    /* x^800 mod p(x) , x^832 mod p(x) , x^864 mod p(x) , x^896 mod p(x)  */
+    { 0x6f8346e1d777606e, 0xebe7e3566325605c },
+    /* x^672 mod p(x) , x^704 mod p(x) , x^736 mod p(x) , x^768 mod p(x)  */
+    { 0xe3ab4f2ac0b95347, 0xc65a272ce5b592b8 },
+    /* x^544 mod p(x) , x^576 mod p(x) , x^608 mod p(x) , x^640 mod p(x)  */
+    { 0xaa2215ea329ecc11, 0x5705a9ca4721589f },
+    /* x^416 mod p(x) , x^448 mod p(x) , x^480 mod p(x) , x^512 mod p(x)  */
+    { 0x1ed8f66ed95efd26, 0xe3720acb88d14467 },
+    /* x^288 mod p(x) , x^320 mod p(x) , x^352 mod p(x) , x^384 mod p(x)  */
+    { 0x78ed02d5a700e96a, 0xba1aca0315141c31 },
+    /* x^160 mod p(x) , x^192 mod p(x) , x^224 mod p(x) , x^256 mod p(x)  */
+    { 0xba8ccbe832b39da3, 0xad2a31b3ed627dae },
+    /* x^32 mod p(x) , x^64 mod p(x) , x^96 mod p(x) , x^128 mod p(x)  */
+    { 0xedb88320b1e6b092, 0x6655004fa06a2517 }
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+};
+
+/* Barrett constants */
+/* 33 bit reflected Barrett constant m - (4^32)/n */
+
+static const __vector unsigned long long v_Barrett_const[2] ALIGNED_(16) = {
+    /* x^64 div p(x)  */
+#if BYTE_ORDER == LITTLE_ENDIAN
+    { 0x00000001f7011641, 0x0000000000000000 },
+    { 0x00000001db710641, 0x0000000000000000 }
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+    { 0x0000000000000000, 0x00000001f7011641 },
+    { 0x0000000000000000, 0x00000001db710641 }
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
+};
diff --git a/3rdparty/zlib-ng/arch/power/crc32_power8.c b/3rdparty/zlib-ng/arch/power/crc32_power8.c
new file mode 100644
index 000000000000..1cb5f299f3d9
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/crc32_power8.c
@@ -0,0 +1,589 @@
+/* crc32 for POWER8 using VSX instructions
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * Author: Rogerio Alves <rogealve@br.ibm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * This code uses gcc vector builtins instead using assembly directly.
+ */
+
+#include <altivec.h>
+#include "zendian.h"
+#include "zbuild.h"
+
+#include "crc32_constants.h"
+#include "crc32_braid_tbl.h"
+
+#if defined (__clang__)
+#include "fallback_builtins.h"
+#endif
+
+#define MAX_SIZE    32768
+#define VMX_ALIGN	16
+#define VMX_ALIGN_MASK	(VMX_ALIGN-1)
+
+static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
+    while (len--)
+        crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
+    return crc;
+}
+
+static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
+
+Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
+    unsigned int prealign;
+    unsigned int tail;
+
+    unsigned long len = (unsigned long) _len;
+
+    if (p == (const unsigned char *) 0x0)
+        return 0;
+
+    crc ^= 0xffffffff;
+
+    if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
+        crc = crc32_align(crc, p, len);
+        goto out;
+    }
+
+    if ((unsigned long)p & VMX_ALIGN_MASK) {
+        prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+        crc = crc32_align(crc, p, prealign);
+        len -= prealign;
+        p += prealign;
+    }
+
+    crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+
+    tail = len & VMX_ALIGN_MASK;
+    if (tail) {
+        p += len & ~VMX_ALIGN_MASK;
+        crc = crc32_align(crc, p, tail);
+    }
+
+out:
+    crc ^= 0xffffffff;
+
+    return crc;
+}
+
+/* When we have a load-store in a single-dispatch group and address overlap
+ * such that forward is not allowed (load-hit-store) the group must be flushed.
+ * A group ending NOP prevents the flush.
+ */
+#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")
+
+#if BYTE_ORDER == BIG_ENDIAN
+#define BYTESWAP_DATA
+#endif
+
+#ifdef BYTESWAP_DATA
+#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
+#if BYTE_ORDER == LITTLE_ENDIAN
+/* Byte reverse permute constant LE. */
+static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
+#else
+static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL };
+#endif
+#else
+#define VEC_PERM(vr, va, vb, vc)
+#endif
+
+static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
+
+    const __vector unsigned long long vzero = {0,0};
+    const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};
+
+    const __vector unsigned long long vmask_32bit =
+        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);
+
+    const __vector unsigned long long vmask_64bit =
+        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);
+
+    __vector unsigned long long vcrc;
+
+    __vector unsigned long long vconst1, vconst2;
+
+    /* vdata0-vdata7 will contain our data (p). */
+    __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;
+
+    /* v0-v7 will contain our checksums */
+    __vector unsigned long long v0 = {0,0};
+    __vector unsigned long long v1 = {0,0};
+    __vector unsigned long long v2 = {0,0};
+    __vector unsigned long long v3 = {0,0};
+    __vector unsigned long long v4 = {0,0};
+    __vector unsigned long long v5 = {0,0};
+    __vector unsigned long long v6 = {0,0};
+    __vector unsigned long long v7 = {0,0};
+
+
+    /* Vector auxiliary variables. */
+    __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
+
+    unsigned int offset; /* Constant table offset. */
+
+    unsigned long i; /* Counter. */
+    unsigned long chunks;
+
+    unsigned long block_size;
+    int next_block = 0;
+
+    /* Align by 128 bits. The last 128 bit block will be processed at end. */
+    unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
+
+    vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
+
+    /* Short version. */
+    if (len < 256) {
+        /* Calculate where in the constant table we need to start. */
+        offset = 256 - len;
+
+        vconst1 = vec_ld(offset, vcrc_short_const);
+        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+        VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
+
+        /* xor initial value */
+        vdata0 = vec_xor(vdata0, vcrc);
+
+        vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
+        v0 = vec_xor(v0, vdata0);
+
+        for (i = 16; i < len; i += 16) {
+            vconst1 = vec_ld(offset + i, vcrc_short_const);
+            vdata0 = vec_ld(i, (__vector unsigned long long*) p);
+            VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
+            vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
+                (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
+            v0 = vec_xor(v0, vdata0);
+        }
+    } else {
+
+        /* Load initial values. */
+        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+        vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+
+        VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+        VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+
+        vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+        vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+
+        VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+        VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+        vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+        vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+
+        VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+        VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+
+        vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+        vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+
+        VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+        VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+        /* xor in initial value */
+        vdata0 = vec_xor(vdata0, vcrc);
+
+        p = (char *)p + 128;
+
+        do {
+            /* Checksum in blocks of MAX_SIZE. */
+            block_size = length;
+            if (block_size > MAX_SIZE) {
+                block_size = MAX_SIZE;
+            }
+
+            length = length - block_size;
+
+            /*
+             * Work out the offset into the constants table to start at. Each
+             * constant is 16 bytes, and it is used against 128 bytes of input
+             * data - 128 / 16 = 8
+             */
+            offset = (MAX_SIZE/8) - (block_size/8);
+            /* We reduce our final 128 bytes in a separate step */
+            chunks = (block_size/128)-1;
+
+            vconst1 = vec_ld(offset, vcrc_const);
+
+            va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+                                           (__vector unsigned long long)vconst1);
+            va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+                                           (__vector unsigned long long)vconst1);
+            va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
+                                           (__vector unsigned long long)vconst1);
+            va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+                                           (__vector unsigned long long)vconst1);
+            va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+                                           (__vector unsigned long long)vconst1);
+            va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+                                           (__vector unsigned long long)vconst1);
+            va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+                                           (__vector unsigned long long)vconst1);
+            va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+                                           (__vector unsigned long long)vconst1);
+
+            if (chunks > 1) {
+                offset += 16;
+                vconst2 = vec_ld(offset, vcrc_const);
+                GROUP_ENDING_NOP;
+
+                vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+                VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+
+                vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+                VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+
+                vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+                VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+
+                vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+                VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+                vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+                VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+
+                vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+                VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+
+                vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+                VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+
+                vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+                VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+                p = (char *)p + 128;
+
+                /*
+                 * main loop. Each iteration calculates the CRC for a 128-byte
+                 * block.
+                 */
+                for (i = 0; i < chunks-2; i++) {
+                    vconst1 = vec_ld(offset, vcrc_const);
+                    offset += 16;
+                    GROUP_ENDING_NOP;
+
+                    v0 = vec_xor(v0, va0);
+                    va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+                                                   (__vector unsigned long long)vconst2);
+                    vdata0 = vec_ld(0, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v1 = vec_xor(v1, va1);
+                    va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+                                                   (__vector unsigned long long)vconst2);
+                    vdata1 = vec_ld(16, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v2 = vec_xor(v2, va2);
+                    va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)
+                                                   vdata2, (__vector unsigned long long)vconst2);
+                    vdata2 = vec_ld(32, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v3 = vec_xor(v3, va3);
+                    va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+                                                   (__vector unsigned long long)vconst2);
+                    vdata3 = vec_ld(48, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
+
+                    vconst2 = vec_ld(offset, vcrc_const);
+                    GROUP_ENDING_NOP;
+
+                    v4 = vec_xor(v4, va4);
+                    va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+                                                   (__vector unsigned long long)vconst1);
+                    vdata4 = vec_ld(64, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v5 = vec_xor(v5, va5);
+                    va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+                                                   (__vector unsigned long long)vconst1);
+                    vdata5 = vec_ld(80, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v6 = vec_xor(v6, va6);
+                    va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+                                                   (__vector unsigned long long)vconst1);
+                    vdata6 = vec_ld(96, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
+                    GROUP_ENDING_NOP;
+
+                    v7 = vec_xor(v7, va7);
+                    va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+                                                   (__vector unsigned long long)vconst1);
+                    vdata7 = vec_ld(112, (__vector unsigned long long*) p);
+                    VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
+
+                    p = (char *)p + 128;
+                }
+
+                /* First cool down */
+                vconst1 = vec_ld(offset, vcrc_const);
+                offset += 16;
+
+                v0 = vec_xor(v0, va0);
+                va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v1 = vec_xor(v1, va1);
+                va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v2 = vec_xor(v2, va2);
+                va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v3 = vec_xor(v3, va3);
+                va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v4 = vec_xor(v4, va4);
+                va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v5 = vec_xor(v5, va5);
+                va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v6 = vec_xor(v6, va6);
+                va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
+                                               (__vector unsigned long long)vconst1);
+                GROUP_ENDING_NOP;
+
+                v7 = vec_xor(v7, va7);
+                va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
+                                               (__vector unsigned long long)vconst1);
+            }/* else */
+
+            /* Second cool down. */
+            v0 = vec_xor(v0, va0);
+            v1 = vec_xor(v1, va1);
+            v2 = vec_xor(v2, va2);
+            v3 = vec_xor(v3, va3);
+            v4 = vec_xor(v4, va4);
+            v5 = vec_xor(v5, va5);
+            v6 = vec_xor(v6, va6);
+            v7 = vec_xor(v7, va7);
+
+            /*
+             * vpmsumd produces a 96 bit result in the least significant bits
+             * of the register. Since we are bit reflected we have to shift it
+             * left 32 bits so it occupies the least significant bits in the
+             * bit reflected domain.
+             */
+            v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+                                                      (__vector unsigned char)vzero, 4);
+            v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
+                                                      (__vector unsigned char)vzero, 4);
+            v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
+                                                      (__vector unsigned char)vzero, 4);
+            v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
+                                                      (__vector unsigned char)vzero, 4);
+            v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
+                                                      (__vector unsigned char)vzero, 4);
+            v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
+                                                      (__vector unsigned char)vzero, 4);
+            v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
+                                                      (__vector unsigned char)vzero, 4);
+            v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
+                                                      (__vector unsigned char)vzero, 4);
+
+            /* xor with the last 1024 bits. */
+            va0 = vec_ld(0, (__vector unsigned long long*) p);
+            VEC_PERM(va0, va0, va0, vperm_const);
+
+            va1 = vec_ld(16, (__vector unsigned long long*) p);
+            VEC_PERM(va1, va1, va1, vperm_const);
+
+            va2 = vec_ld(32, (__vector unsigned long long*) p);
+            VEC_PERM(va2, va2, va2, vperm_const);
+
+            va3 = vec_ld(48, (__vector unsigned long long*) p);
+            VEC_PERM(va3, va3, va3, vperm_const);
+
+            va4 = vec_ld(64, (__vector unsigned long long*) p);
+            VEC_PERM(va4, va4, va4, vperm_const);
+
+            va5 = vec_ld(80, (__vector unsigned long long*) p);
+            VEC_PERM(va5, va5, va5, vperm_const);
+
+            va6 = vec_ld(96, (__vector unsigned long long*) p);
+            VEC_PERM(va6, va6, va6, vperm_const);
+
+            va7 = vec_ld(112, (__vector unsigned long long*) p);
+            VEC_PERM(va7, va7, va7, vperm_const);
+
+            p = (char *)p + 128;
+
+            vdata0 = vec_xor(v0, va0);
+            vdata1 = vec_xor(v1, va1);
+            vdata2 = vec_xor(v2, va2);
+            vdata3 = vec_xor(v3, va3);
+            vdata4 = vec_xor(v4, va4);
+            vdata5 = vec_xor(v5, va5);
+            vdata6 = vec_xor(v6, va6);
+            vdata7 = vec_xor(v7, va7);
+
+            /* Check if we have more blocks to process */
+            next_block = 0;
+            if (length != 0) {
+                next_block = 1;
+
+                /* zero v0-v7 */
+                v0 = vec_xor(v0, v0);
+                v1 = vec_xor(v1, v1);
+                v2 = vec_xor(v2, v2);
+                v3 = vec_xor(v3, v3);
+                v4 = vec_xor(v4, v4);
+                v5 = vec_xor(v5, v5);
+                v6 = vec_xor(v6, v6);
+                v7 = vec_xor(v7, v7);
+            }
+            length = length + 128;
+
+        } while (next_block);
+
+        /* Calculate how many bytes we have left. */
+        length = (len & 127);
+
+        /* Calculate where in (short) constant table we need to start. */
+        offset = 128 - length;
+
+        v0 = vec_ld(offset, vcrc_short_const);
+        v1 = vec_ld(offset + 16, vcrc_short_const);
+        v2 = vec_ld(offset + 32, vcrc_short_const);
+        v3 = vec_ld(offset + 48, vcrc_short_const);
+        v4 = vec_ld(offset + 64, vcrc_short_const);
+        v5 = vec_ld(offset + 80, vcrc_short_const);
+        v6 = vec_ld(offset + 96, vcrc_short_const);
+        v7 = vec_ld(offset + 112, vcrc_short_const);
+
+        offset += 128;
+
+        v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata0, (__vector unsigned int)v0);
+        v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata1, (__vector unsigned int)v1);
+        v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata2, (__vector unsigned int)v2);
+        v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata3, (__vector unsigned int)v3);
+        v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata4, (__vector unsigned int)v4);
+        v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata5, (__vector unsigned int)v5);
+        v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata6, (__vector unsigned int)v6);
+        v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+            (__vector unsigned int)vdata7, (__vector unsigned int)v7);
+
+        /* Now reduce the tail (0-112 bytes). */
+        for (i = 0; i < length; i+=16) {
+            vdata0 = vec_ld(i,(__vector unsigned long long*)p);
+            VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
+            va0 = vec_ld(offset + i,vcrc_short_const);
+            va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
+                (__vector unsigned int)vdata0, (__vector unsigned int)va0);
+            v0 = vec_xor(v0, va0);
+        }
+
+        /* xor all parallel chunks together. */
+        v0 = vec_xor(v0, v1);
+        v2 = vec_xor(v2, v3);
+        v4 = vec_xor(v4, v5);
+        v6 = vec_xor(v6, v7);
+
+        v0 = vec_xor(v0, v2);
+        v4 = vec_xor(v4, v6);
+
+        v0 = vec_xor(v0, v4);
+    }
+
+    /* Barrett Reduction */
+    vconst1 = vec_ld(0, v_Barrett_const);
+    vconst2 = vec_ld(16, v_Barrett_const);
+
+    v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+                                              (__vector unsigned char)v0, 8);
+    v0 = vec_xor(v1,v0);
+
+    /* shift left one bit */
+    __vector unsigned char vsht_splat = vec_splat_u8 (1);
+    v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);
+
+    v0 = vec_and(v0, vmask_64bit);
+
+    /*
+     * The reflected version of Barrett reduction. Instead of bit
+     * reflecting our data (which is expensive to do), we bit reflect our
+     * constants and our algorithm, which means the intermediate data in
+     * our vector registers goes from 0-63 instead of 63-0. We can reflect
+     * the algorithm because we don't carry in mod 2 arithmetic.
+     */
+
+    /* bottom 32 bits of a */
+    v1 = vec_and(v0, vmask_32bit);
+
+    /* ma */
+    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
+                                  (__vector unsigned long long)vconst1);
+
+    /* bottom 32bits of ma */
+    v1 = vec_and(v1, vmask_32bit);
+    /* qn */
+    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
+                                  (__vector unsigned long long)vconst2);
+    /* a - qn, subtraction is xor in GF(2) */
+    v0 = vec_xor (v0, v1);
+
+    /*
+     * Since we are bit reflected, the result (ie the low 32 bits) is in
+     * the high 32 bits. We just need to shift it left 4 bytes
+     * V0 [ 0 1 X 3 ]
+     * V0 [ 0 X 2 3 ]
+     */
+
+    /* shift result into top 64 bits of */
+    v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
+                                              (__vector unsigned char)vzero, 4);
+
+#if BYTE_ORDER == BIG_ENDIAN
+    return v0[0];
+#else
+    return v0[1];
+#endif
+}
diff --git a/3rdparty/zlib-ng/arch/power/fallback_builtins.h b/3rdparty/zlib-ng/arch/power/fallback_builtins.h
new file mode 100644
index 000000000000..ed9584617b15
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/fallback_builtins.h
@@ -0,0 +1,31 @@
+/* Helper functions to work around issues with clang builtins
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * Authors:
+ *   Daniel Black <daniel@linux.vnet.ibm.com>
+ *   Rogerio Alves <rogealve@br.ibm.com>
+ *   Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_BUILTINS_H
+#define POWER_BUILTINS_H
+
+/*
+ * These stubs fix clang incompatibilities with GCC builtins.
+ */
+
+#ifndef __builtin_crypto_vpmsumw
+#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
+#endif
+#ifndef __builtin_crypto_vpmsumd
+#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
+#endif
+
+static inline __vector unsigned long long __attribute__((overloadable))
+vec_ld(int __a, const __vector unsigned long long* __b) {
+    return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
+}
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/power/power_features.c b/3rdparty/zlib-ng/arch/power/power_features.c
new file mode 100644
index 000000000000..f73503734b13
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/power_features.c
@@ -0,0 +1,46 @@
+/* power_features.c - POWER feature check
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021-2022 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef HAVE_SYS_AUXV_H
+#  include <sys/auxv.h>
+#endif
+#ifdef __FreeBSD__
+#  include <machine/cpu.h>
+#endif
+#include "../../zbuild.h"
+#include "power_features.h"
+
+void Z_INTERNAL power_check_features(struct power_cpu_features *features) {
+#ifdef PPC_FEATURES
+    unsigned long hwcap;
+#ifdef __FreeBSD__
+    elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+#else
+    hwcap = getauxval(AT_HWCAP);
+#endif
+
+    if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
+        features->has_altivec = 1;
+#endif
+
+#ifdef POWER_FEATURES
+    unsigned long hwcap2;
+#ifdef __FreeBSD__
+    elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+#else
+    hwcap2 = getauxval(AT_HWCAP2);
+#endif
+
+#ifdef POWER8_VSX
+    if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+        features->has_arch_2_07 = 1;
+#endif
+#ifdef POWER9
+    if (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+        features->has_arch_3_00 = 1;
+#endif
+#endif
+}
diff --git a/3rdparty/zlib-ng/arch/power/power_features.h b/3rdparty/zlib-ng/arch/power/power_features.h
new file mode 100644
index 000000000000..9252364cc48d
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/power_features.h
@@ -0,0 +1,18 @@
+/* power_features.h -- check for POWER CPU features
+ * Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef POWER_H_
+#define POWER_H_
+
+struct power_cpu_features {
+    int has_altivec;
+    int has_arch_2_07;
+    int has_arch_3_00;
+};
+
+void Z_INTERNAL power_check_features(struct power_cpu_features *features);
+
+#endif /* POWER_H_ */
diff --git a/3rdparty/zlib-ng/arch/power/slide_hash_power8.c b/3rdparty/zlib-ng/arch/power/slide_hash_power8.c
new file mode 100644
index 000000000000..d01e0acd5661
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/slide_hash_power8.c
@@ -0,0 +1,12 @@
+/* Optimized slide_hash for POWER processors
+ * Copyright (C) 2019-2020 IBM Corporation
+ * Author: Matheus Castanho <msc@linux.ibm.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef POWER8_VSX
+
+#define SLIDE_PPC slide_hash_power8
+#include "slide_ppc_tpl.h"
+
+#endif /* POWER8_VSX */
diff --git a/3rdparty/zlib-ng/arch/power/slide_hash_vmx.c b/3rdparty/zlib-ng/arch/power/slide_hash_vmx.c
new file mode 100644
index 000000000000..5a87ef7d9aa0
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/slide_hash_vmx.c
@@ -0,0 +1,10 @@
+/* Optimized slide_hash for PowerPC processors with VMX instructions
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifdef PPC_VMX
+
+#define SLIDE_PPC slide_hash_vmx
+#include "slide_ppc_tpl.h"
+
+#endif /* PPC_VMX */
diff --git a/3rdparty/zlib-ng/arch/power/slide_ppc_tpl.h b/3rdparty/zlib-ng/arch/power/slide_ppc_tpl.h
new file mode 100644
index 000000000000..5c17e38fb310
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/power/slide_ppc_tpl.h
@@ -0,0 +1,31 @@
+/* Optimized slide_hash for PowerPC processors
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "deflate.h"
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    const vector unsigned short vmx_wsize = vec_splats(wsize);
+    Pos *p = table;
+
+    do {
+        vector unsigned short value, result;
+
+        value = vec_ld(0, p);
+        result = vec_subs(value, vmx_wsize);
+        vec_st(result, 0, p);
+
+        p += 8;
+        entries -= 8;
+   } while (entries > 0);
+}
+
+void Z_INTERNAL SLIDE_PPC(deflate_state *s) {
+    uint16_t wsize = s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
diff --git a/3rdparty/zlib-ng/arch/riscv/README.md b/3rdparty/zlib-ng/arch/riscv/README.md
new file mode 100644
index 000000000000..013095c3732f
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/riscv/README.md
@@ -0,0 +1,45 @@
+# Building RISC-V Target with Cmake #
+
+> **Warning**
+> Runtime rvv detection (using `hwcap`) requires linux kernel 6.5 or newer.
+>
+> When running on older kernels, we fall back to compile-time detection, potentially this can cause crashes if rvv is enabled at compile but not supported by the target cpu.
+> Therefore if older kernel support is needed, rvv should be disabled if the target cpu does not support it.
+## Prerequisite: Build RISC-V Clang Toolchain and QEMU ##
+
+If you don't have prebuilt clang and riscv64 qemu, you can refer to the [script](https://github.com/sifive/prepare-riscv-toolchain-qemu/blob/main/prepare_riscv_toolchain_qemu.sh) to get the source. Copy the script to the zlib-ng root directory, and run it to download the source and build them. Modify the content according to your conditions (e.g., toolchain version).
+
+```bash
+./prepare_riscv_toolchain_qemu.sh
+```
+
+After running script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`.
+
+`build-toolchain-qemu/riscv-clang/` is your `TOOLCHAIN_PATH`.
+`build-toolchain-qemu/riscv-qemu/bin/qemu-riscv64` is your `QEMU_PATH`.
+
+You can also download the prebuilt toolchain & qemu from [the release page](https://github.com/sifive/prepare-riscv-toolchain-qemu/releases), and enjoy using them.
+
+## Cross-Compile for RISC-V Target ##
+
+```bash
+cmake -G Ninja -B ./build-riscv \
+  -D CMAKE_TOOLCHAIN_FILE=./cmake/toolchain-riscv.cmake \
+  -D CMAKE_INSTALL_PREFIX=./build-riscv/install \
+  -D TOOLCHAIN_PATH={TOOLCHAIN_PATH} \
+  -D QEMU_PATH={QEMU_PATH} \
+  .
+
+cmake --build ./build-riscv
+```
+
+Disable the option if there is no RVV support:
+```
+-D WITH_RVV=OFF
+```
+
+## Run Unittests on User Mode QEMU ##
+
+```bash
+cd ./build-riscv && ctest --verbose
+```
diff --git a/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c b/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c
new file mode 100644
index 000000000000..da46f37e73c1
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/riscv/adler32_rvv.c
@@ -0,0 +1,132 @@
+/* adler32_rvv.c - RVV version of adler32
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+
+static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
+    /* split Adler-32 into component sums */
+    uint32_t sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (len == 1) {
+        if (COPY) memcpy(dst, src, 1);
+        return adler32_len_1(adler, src, sum2);
+    }
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (src == NULL)
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (len < 16) {
+        if (COPY) memcpy(dst, src, len);
+        return adler32_len_16(adler, src, len, sum2);
+    }
+
+    size_t left = len;
+    size_t vl = __riscv_vsetvlmax_e8m1();
+    vl = vl > 256 ? 256 : vl;
+    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
+    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
+    vuint16m2_t v_buf16_accu;
+
+    /*
+     * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
+     * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
+     * accumulators to boost performance.
+     *
+     * The block_size is the largest multiple of vl that <= 256, because overflow would occur when
+     * vl > 256 (255 * 256 <= UINT16_MAX).
+     *
+     * We accumulate 8-bit data into a 16-bit accumulator and then
+     * move the data into the 32-bit accumulator at the last iteration.
+     */
+    size_t block_size = (256 / vl) * vl;
+    size_t nmax_limit = (NMAX / block_size);
+    size_t cnt = 0;
+    while (left >= block_size) {
+        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+        size_t subprob = block_size;
+        while (subprob > 0) {
+            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
+            if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
+            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+            src += vl;
+            if (COPY) dst += vl;
+            subprob -= vl;
+        }
+        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
+        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+        left -= block_size;
+        /* do modulo once each block of NMAX size */
+        if (++cnt >= nmax_limit) {
+            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+            cnt = 0;
+        }
+    }
+    /* the left len <= 256 now, we can use 16-bit accum safely */
+    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+    size_t res = left;
+    while (left >= vl) {
+        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
+        if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
+        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+        src += vl;
+        if (COPY) dst += vl;
+        left -= vl;
+    }
+    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
+    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+
+    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
+    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
+    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
+
+    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
+
+    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
+    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
+    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
+
+    sum2 += (sum2_sum + adler * (len - left));
+
+    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
+    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
+    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
+
+    adler += adler_sum;
+
+    while (left--) {
+        if (COPY) *dst++ = *src;
+        adler += *src++;
+        sum2 += adler;
+    }
+
+    sum2 %= BASE;
+    adler %= BASE;
+
+    return adler | (sum2 << 16);
+}
+
+Z_INTERNAL uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    return adler32_rvv_impl(adler, dst, src, len, 1);
+}
+
+Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
+    return adler32_rvv_impl(adler, NULL, buf, len, 0);
+}
+
+#endif // RISCV_RVV
diff --git a/3rdparty/zlib-ng/arch/riscv/chunkset_rvv.c b/3rdparty/zlib-ng/arch/riscv/chunkset_rvv.c
new file mode 100644
index 000000000000..ee43bde2f71d
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/riscv/chunkset_rvv.c
@@ -0,0 +1,121 @@
+/* chunkset_rvv.c - RVV version of chunkset
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include <riscv_vector.h>
+#include "zbuild.h"
+
+/*
+ * RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC,
+ * so we prefer using large size chunk and copy memory as much as possible.
+ */
+#define CHUNK_SIZE 32
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+#define CHUNK_MEMSET_RVV_IMPL(elen)                                     \
+do {                                                                    \
+    size_t vl, len = CHUNK_SIZE / sizeof(uint##elen##_t);               \
+    uint##elen##_t val = *(uint##elen##_t*)from;                        \
+    uint##elen##_t* chunk_p = (uint##elen##_t*)chunk;                   \
+    do {                                                                \
+        vl = __riscv_vsetvl_e##elen##m4(len);                           \
+        vuint##elen##m4_t v_val = __riscv_vmv_v_x_u##elen##m4(val, vl); \
+        __riscv_vse##elen##_v_u##elen##m4(chunk_p, v_val, vl);          \
+        len -= vl; chunk_p += vl;                                       \
+    } while (len > 0);                                                  \
+} while (0)
+
+/* We don't have a 32-byte datatype for RISC-V arch. */
+typedef struct chunk_s {
+    uint64_t data[4];
+} chunk_t;
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    CHUNK_MEMSET_RVV_IMPL(16);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    CHUNK_MEMSET_RVV_IMPL(32);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    CHUNK_MEMSET_RVV_IMPL(64);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    memcpy(chunk->data, (uint8_t *)s, CHUNK_SIZE);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    memcpy(out, chunk->data, CHUNK_SIZE);
+}
+
+#define CHUNKSIZE        chunksize_rvv
+#define CHUNKCOPY        chunkcopy_rvv
+#define CHUNKUNROLL      chunkunroll_rvv
+#define CHUNKMEMSET      chunkmemset_rvv
+#define CHUNKMEMSET_SAFE chunkmemset_safe_rvv
+
+#define HAVE_CHUNKCOPY
+
+/*
+ * Assuming that the length is non-zero, and that `from` lags `out` by at least
+ * sizeof chunk_t bytes, please see the comments in chunkset_tpl.h.
+ *
+ * We load/store a single chunk once in the `CHUNKCOPY`.
+ * However, RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC,
+ * such that, we prefer copy large memory size once to make good use of the the RVV advance.
+ * 
+ * To be aligned to the other platforms, we didn't modify `CHUNKCOPY` method a lot,
+ * but we still copy as much memory as possible for some conditions.
+ * 
+ * case 1: out - from >= len (no overlap)
+ *         We can use memcpy to copy `len` size once
+ *         because the memory layout would be the same.
+ *
+ * case 2: overlap
+ *         We copy N chunks using memcpy at once, aiming to achieve our goal: 
+ *         to copy as much memory as possible.
+ * 
+ *         After using a single memcpy to copy N chunks, we have to use series of
+ *         loadchunk and storechunk to ensure the result is correct.
+ */
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+    Assert(len > 0, "chunkcopy should never have a length 0");
+    int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
+    memcpy(out, from, sizeof(chunk_t));
+    out += align;
+    from += align;
+    len -= align;
+    ptrdiff_t dist = out - from;
+    if (dist >= len) {
+        memcpy(out, from, len);
+        out += len;
+        from += len;
+        return out;
+    }
+    if (dist >= sizeof(chunk_t)) {
+        dist = (dist / sizeof(chunk_t)) * sizeof(chunk_t);
+        memcpy(out, from, dist);
+        out += dist;
+        from += dist;
+        len -= dist;
+    }
+    while (len > 0) {
+        memcpy(out, from, sizeof(chunk_t));
+        out += sizeof(chunk_t);
+        from += sizeof(chunk_t);
+        len -= sizeof(chunk_t);
+    }
+    return out;
+}
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_rvv
+
+#include "inffast_tpl.h"
diff --git a/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c b/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c
new file mode 100644
index 000000000000..0fd6082c44d0
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/riscv/compare256_rvv.c
@@ -0,0 +1,47 @@
+/* compare256_rvv.c - RVV version of compare256
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "../../zbuild.h"
+#include "fallback_builtins.h"
+
+#include <riscv_vector.h>
+
+static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+    size_t vl;
+    long found_diff;
+    do {
+        vl = __riscv_vsetvl_e8m4(256 - len);
+        vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
+        vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
+        vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl);
+        found_diff = __riscv_vfirst_m_b2(v_mask, vl);
+        if (found_diff >= 0)
+            return len + (uint32_t)found_diff;
+        src0 += vl, src1 += vl, len += vl;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_rvv_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_rvv
+#define COMPARE256          compare256_rvv_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_rvv
+#define COMPARE256          compare256_rvv_static
+
+#include "match_tpl.h"
+
+#endif // RISCV_RVV
diff --git a/3rdparty/zlib-ng/arch/riscv/riscv_features.c b/3rdparty/zlib-ng/arch/riscv/riscv_features.c
new file mode 100644
index 000000000000..b066f427e0fc
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/riscv/riscv_features.c
@@ -0,0 +1,45 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/auxv.h>
+#include <sys/utsname.h>
+
+#include "../../zbuild.h"
+#include "riscv_features.h"
+
+#define ISA_V_HWCAP (1 << ('v' - 'a'))
+
+int Z_INTERNAL is_kernel_version_greater_or_equal_to_6_5() {
+    struct utsname buffer;
+    uname(&buffer);
+
+    int major, minor;
+    if (sscanf(buffer.release, "%d.%d", &major, &minor) != 2) {
+        // Something bad with uname()
+        return 0;
+    }
+
+    if (major > 6 || major == 6 && minor >= 5)
+        return 1;
+    return 0;
+}
+
+void Z_INTERNAL riscv_check_features_compile_time(struct riscv_cpu_features *features) {
+#if defined(__riscv_v) && defined(__linux__)
+    features->has_rvv = 1;
+#else
+    features->has_rvv = 0;
+#endif
+}
+
+void Z_INTERNAL riscv_check_features_runtime(struct riscv_cpu_features *features) {
+    unsigned long hw_cap = getauxval(AT_HWCAP);
+    features->has_rvv = hw_cap & ISA_V_HWCAP;
+}
+
+void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features) {
+    if (is_kernel_version_greater_or_equal_to_6_5())
+        riscv_check_features_runtime(features);
+    else
+        riscv_check_features_compile_time(features);
+}
diff --git a/3rdparty/zlib-ng/arch/riscv/riscv_features.h b/3rdparty/zlib-ng/arch/riscv/riscv_features.h
new file mode 100644
index 000000000000..c76e967c36ce
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/riscv/riscv_features.h
@@ -0,0 +1,18 @@
+/* riscv_features.h -- check for riscv features.
+ *
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef RISCV_H_
+#define RISCV_H_
+
+struct riscv_cpu_features {
+    int has_rvv;
+};
+
+void Z_INTERNAL riscv_check_features(struct riscv_cpu_features *features);
+
+#endif /* RISCV_H_ */
diff --git a/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c b/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c
new file mode 100644
index 000000000000..1164e89ba250
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/riscv/slide_hash_rvv.c
@@ -0,0 +1,34 @@
+/* slide_hash_rvv.c - RVV version of slide_hash
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include <riscv_vector.h>
+
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+    size_t vl;
+    while (entries > 0) {
+        vl = __riscv_vsetvl_e16m4(entries);
+        vuint16m4_t v_tab = __riscv_vle16_v_u16m4(table, vl);
+        vuint16m4_t v_diff = __riscv_vsub_vx_u16m4(v_tab, wsize, vl);
+        vbool4_t mask = __riscv_vmsltu_vx_u16m4_b4(v_tab, wsize, vl);
+        v_tab = __riscv_vmerge_vxm_u16m4(v_diff, 0, mask, vl);
+        __riscv_vse16_v_u16m4(table, v_tab, vl);
+        table += vl, entries -= vl;
+    }
+}
+
+Z_INTERNAL void slide_hash_rvv(deflate_state *s) {
+    uint16_t wsize = (uint16_t)s->w_size;
+
+    slide_hash_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
+
+#endif // RISCV_RVV
diff --git a/3rdparty/zlib-ng/arch/x86/Makefile.in b/3rdparty/zlib-ng/arch/x86/Makefile.in
new file mode 100644
index 000000000000..7c052469b298
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/Makefile.in
@@ -0,0 +1,147 @@
+# Makefile for zlib
+# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler
+# For conditions of distribution and use, see copyright notice in zlib.h
+
+CC=
+CFLAGS=
+SFLAGS=
+INCLUDES=
+SUFFIX=
+
+AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
+AVX512VNNIFLAG=-mavx512vnni
+AVX2FLAG=-mavx2
+SSE2FLAG=-msse2
+SSSE3FLAG=-mssse3
+SSE42FLAG=-msse4.2
+PCLMULFLAG=-mpclmul
+VPCLMULFLAG=-mvpclmulqdq
+XSAVEFLAG=-mxsave
+NOLTOFLAG=
+
+SRCDIR=.
+SRCTOP=../..
+TOPDIR=$(SRCTOP)
+
+all: \
+	x86_features.o x86_features.lo \
+	adler32_avx2.o adler32_avx2.lo \
+	adler32_avx512.o adler32_avx512.lo \
+	adler32_avx512_vnni.o adler32_avx512_vnni.lo \
+	adler32_sse42.o adler32_sse42.lo \
+	adler32_ssse3.o adler32_ssse3.lo \
+	chunkset_avx2.o chunkset_avx2.lo \
+	chunkset_sse2.o chunkset_sse2.lo \
+	chunkset_ssse3.o chunkset_ssse3.lo \
+	compare256_avx2.o compare256_avx2.lo \
+	compare256_sse2.o compare256_sse2.lo \
+	insert_string_sse42.o insert_string_sse42.lo \
+	crc32_pclmulqdq.o crc32_pclmulqdq.lo \
+	crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
+	slide_hash_avx2.o slide_hash_avx2.lo \
+	slide_hash_sse2.o slide_hash_sse2.lo
+
+x86_features.o:
+	$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+
+x86_features.lo:
+	$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+
+chunkset_avx2.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
+
+chunkset_avx2.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
+
+chunkset_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
+
+chunkset_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
+
+chunkset_ssse3.o:
+	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
+
+chunkset_ssse3.lo:
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_ssse3.c
+
+compare256_avx2.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
+
+compare256_avx2.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
+
+compare256_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+compare256_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
+
+insert_string_sse42.o:
+	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
+
+insert_string_sse42.lo:
+	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
+
+crc32_pclmulqdq.o:
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
+
+crc32_pclmulqdq.lo:
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
+
+crc32_vpclmulqdq.o:
+	$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
+
+crc32_vpclmulqdq.lo:
+	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
+
+slide_hash_avx2.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
+
+slide_hash_avx2.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
+
+slide_hash_sse2.o:
+	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
+
+slide_hash_sse2.lo:
+	$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
+
+adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+
+adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
+	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+
+adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
+	$(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
+adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
+	$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
+adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
+	$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
+	$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+mostlyclean: clean
+clean:
+	rm -f *.o *.lo *~
+	rm -rf objs
+	rm -f *.gcda *.gcno *.gcov
+
+distclean: clean
+	rm -f Makefile
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_avx2.c b/3rdparty/zlib-ng/arch/x86/adler32_avx2.c
new file mode 100644
index 000000000000..e3ac6705cef3
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/adler32_avx2.c
@@ -0,0 +1,154 @@
+/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2022 Adam Stylinski
+ * Authors:
+ *   Brian Bockelman <bockelman@gmail.com>
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX2
+
+#include "../../zbuild.h"
+#include <immintrin.h>
+#include "../../adler32_fold.h"
+#include "../../adler32_p.h"
+#include "adler32_avx2_p.h"
+#include "x86_intrins.h"
+
+#ifdef X86_SSE42
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
+
+#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
+#define sub32(a, b, c) adler32_ssse3(a, b, c)
+#else
+#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
+#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
+#endif
+
+static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    if (len < 16) {
+        if (COPY) {
+            return adler32_copy_len_16(adler0, src, dst, len, adler1);
+        } else {
+            return adler32_len_16(adler0, src, len, adler1);
+        }
+    } else if (len < 32) {
+        if (COPY) {
+            return copy_sub32(adler, dst, src, len);
+        } else {
+            return sub32(adler, src, len);
+        }
+    }
+
+    __m256i vs1, vs2;
+
+    const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+                                           14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m256i dot3v = _mm256_set1_epi16(1);
+    const __m256i zero = _mm256_setzero_si256();
+
+    while (len >= 32) {
+        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+        __m256i vs1_0 = vs1;
+        __m256i vs3 = _mm256_setzero_si256();
+
+        size_t k = MIN(len, NMAX);
+        k -= k % 32;
+        len -= k;
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+            */
+            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
+            src += 32;
+            k -= 32;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
+
+            if (COPY) {
+                _mm256_storeu_si256((__m256i*)dst, vbuf);
+                dst += 32;
+            }
+ 
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
+            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+            vs2 = _mm256_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        /* Defer the multiplication with 32 to outside of the loop */
+        vs3 = _mm256_slli_epi32(vs3, 5);
+        vs2 = _mm256_add_epi32(vs2, vs3);
+
+        /* The compiler is generating the following sequence for this integer modulus
+         * when done the scalar way, in GPRs:
+
+         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
+                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
+
+         mov    $0x80078071,%edi // move magic constant into 32 bit register %edi
+         ...
+         vmovd  %xmm1,%esi // move vector lane 0 to 32 bit register %esi
+         mov    %rsi,%rax  // zero-extend this value to 64 bit precision in %rax
+         imul   %rdi,%rsi // do a signed multiplication with magic constant and vector element
+         shr    $0x2f,%rsi // shift right by 47
+         imul   $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
+         sub    %esi,%eax // subtract lower 32 bits of original vector value from modified one above
+         ...
+         // repeats for each element with vpextract instructions
+
+         This is tricky with AVX2 for a number of reasons:
+             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
+             2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
+                 back down to 32 bit precision later (there is in AVX512)
+             3.) Full width integer multiplications aren't cheap
+
+         We can, however, do a relatively cheap sequence for horizontal sums.
+         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
+         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
+         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
+         performed on the maximum possible inputs before overflow
+         */
+
+
+         /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
+          * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
+          * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
+          * what the compiler is doing to avoid integer divisions. */
+         adler0 = partial_hsum256(vs1) % BASE;
+         adler1 = hsum256(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
+    return adler32_fold_copy_impl(adler, NULL, src, len, 0);
+}
+
+Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    return adler32_fold_copy_impl(adler, dst, src, len, 1);
+}
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_avx2_p.h b/3rdparty/zlib-ng/arch/x86/adler32_avx2_p.h
new file mode 100644
index 000000000000..f0f8a4a887b1
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/adler32_avx2_p.h
@@ -0,0 +1,32 @@
+/* adler32_avx2_p.h -- adler32 avx2 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_AVX2_P_H_
+#define ADLER32_AVX2_P_H_
+
+#if defined(X86_AVX2) || defined(X86_AVX512VNNI)
+
+/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
+static inline uint32_t hsum256(__m256i x) {
+    __m128i sum1  = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
+                                  _mm256_castsi256_si128(x));
+    __m128i sum2  = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
+    __m128i sum3  = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
+    return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+static inline uint32_t partial_hsum256(__m256i x) {
+    /* We need a permutation vector to extract every other integer. The
+     * rest are going to be zeros */
+    const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
+    __m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
+    __m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
+    __m128i sum2  = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
+    __m128i sum3  = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
+    return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+#endif
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_avx512.c b/3rdparty/zlib-ng/arch/x86/adler32_avx512.c
new file mode 100644
index 000000000000..aa6cc170185b
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/adler32_avx512.c
@@ -0,0 +1,115 @@
+/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../adler32_fold.h"
+#include "../../cpu_features.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+#include "adler32_avx512_p.h"
+
+static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    if (len < 64) {
+        /* This handles the remaining copies, just call normal adler checksum after this */
+        if (COPY) {
+            __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
+            __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
+            _mm512_mask_storeu_epi8(dst, storemask, copy_vec);
+        }
+
+#ifdef X86_AVX2
+        return adler32_avx2(adler, src, len);
+#elif defined(X86_SSSE3)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1);
+#endif
+    }
+
+    __m512i vbuf, vs1_0, vs3;
+
+    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+                                          56, 57, 58, 59, 60, 61, 62, 63, 64);
+    const __m512i dot3v = _mm512_set1_epi16(1);
+    const __m512i zero = _mm512_setzero_si512();
+    size_t k;
+
+    while (len >= 64) {
+        __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+        __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+        vs1_0 = vs1;
+        vs3 = _mm512_setzero_si512();
+
+        k = MIN(len, NMAX);
+        k -= k % 64;
+        len -= k;
+
+        while (k >= 64) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+            */
+            vbuf = _mm512_loadu_si512(src);
+
+            if (COPY) {
+                _mm512_storeu_si512(dst, vbuf);
+                dst += 64;
+            }
+
+            src += 64;
+            k -= 64;
+
+            __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
+            __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
+            vs1 = _mm512_add_epi32(vs1_sad, vs1);
+            vs3 = _mm512_add_epi32(vs3, vs1_0);
+            __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm512_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm512_slli_epi32(vs3, 6);
+        vs2 = _mm512_add_epi32(vs2, vs3);
+
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 64). */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    return adler32_fold_copy_impl(adler, dst, src, len, 1);
+}
+
+Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
+    return adler32_fold_copy_impl(adler, NULL, src, len, 0);
+}
+
+#endif
+
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_avx512_p.h b/3rdparty/zlib-ng/arch/x86/adler32_avx512_p.h
new file mode 100644
index 000000000000..5b79d2ab6ee7
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/adler32_avx512_p.h
@@ -0,0 +1,46 @@
+#ifndef AVX512_FUNCS_H
+#define AVX512_FUNCS_H
+
+#include <immintrin.h>
+#include <stdint.h>
+/* Written because *_add_epi32(a) sets off ubsan */
+static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
+    __m256i a = _mm512_extracti64x4_epi64(x, 1);
+    __m256i b = _mm512_extracti64x4_epi64(x, 0);
+
+    __m256i a_plus_b = _mm256_add_epi32(a, b);
+    __m128i c = _mm256_extracti128_si256(a_plus_b, 1);
+    __m128i d = _mm256_extracti128_si256(a_plus_b, 0);
+    __m128i c_plus_d = _mm_add_epi32(c, d);
+
+    __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
+    __m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
+    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+    __m128i sum4 = _mm_add_epi32(sum2, sum3);
+
+    return _mm_cvtsi128_si32(sum4);
+}
+
+static inline uint32_t partial_hsum(__m512i x) {
+    /* We need a permutation vector to extract every other integer. The
+     * rest are going to be zeros. Marking this const so the compiler stands
+     * a better chance of keeping this resident in a register through entire
+     * loop execution. We certainly have enough zmm registers (32) */
+    const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
+                                               1, 1, 1, 1, 1,  1,  1,  1);
+
+    __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
+
+    /* From here, it's a simple 256 bit wide reduction sum */
+    __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
+
+    /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
+     * pretty slow, much slower than the longer instruction sequence below */
+    __m128i sum1  = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
+                                  _mm256_castsi256_si128(non_zero_avx));
+    __m128i sum2  = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
+    __m128i sum3  = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
+    return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c b/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
new file mode 100644
index 000000000000..771f7ebe043f
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/adler32_avx512_vnni.c
@@ -0,0 +1,225 @@
+/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
+ * Based on Brian Bockelman's AVX2 version
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512VNNI
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../cpu_features.h"
+#include <immintrin.h>
+#include "../../adler32_fold.h"
+#include "x86_intrins.h"
+#include "adler32_avx512_p.h"
+#include "adler32_avx2_p.h"
+
+Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    if (len < 32)
+#if defined(X86_SSSE3)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1);
+#endif
+
+    if (len < 64)
+#ifdef X86_AVX2
+        return adler32_avx2(adler, src, len);
+#elif defined(X86_SSE3)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1);
+#endif
+
+    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+                                          56, 57, 58, 59, 60, 61, 62, 63, 64);
+
+    const __m512i zero = _mm512_setzero_si512();
+    __m512i vs1, vs2;
+
+    while (len >= 64) {
+        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+        size_t k = MIN(len, NMAX);
+        k -= k % 64;
+        len -= k;
+        __m512i vs1_0 = vs1;
+        __m512i vs3 = _mm512_setzero_si512();
+        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+        __m512i vs2_1 = _mm512_setzero_si512();
+        __m512i vbuf0, vbuf1;
+
+        /* Remainder peeling */
+        if (k % 128) {
+            vbuf1 = _mm512_loadu_si512((__m512i*)src);
+
+            src += 64;
+            k -= 64;
+
+            __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+            vs1 = _mm512_add_epi32(vs1, vs1_sad);
+            vs3 = _mm512_add_epi32(vs3, vs1_0);
+            vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        /* Manually unrolled this loop by 2 for an decent amount of ILP */
+        while (k >= 128) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+            */
+            vbuf0 = _mm512_loadu_si512((__m512i*)src);
+            vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
+            src += 128;
+            k -= 128;
+
+            __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
+            vs1 = _mm512_add_epi32(vs1, vs1_sad);
+            vs3 = _mm512_add_epi32(vs3, vs1_0);
+            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+             * instructions to eliminate them */
+            vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+            vs3 = _mm512_add_epi32(vs3, vs1);
+            vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+            vs1 = _mm512_add_epi32(vs1, vs1_sad);
+            vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm512_slli_epi32(vs3, 6);
+        vs2 = _mm512_add_epi32(vs2, vs3);
+        vs2 = _mm512_add_epi32(vs2, vs2_1);
+
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 64). */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler;
+}
+
+Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    if (src == NULL) return 1L;
+    if (len == 0) return adler;
+
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel_copy:
+    if (len < 32) {
+        /* This handles the remaining copies, just call normal adler checksum after this */
+        __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
+        __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
+        _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
+
+#if defined(X86_SSSE3)
+        return adler32_ssse3(adler, src, len);
+#else
+        return adler32_len_16(adler0, src, len, adler1);
+#endif
+    }
+
+    const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+
+    const __m256i zero = _mm256_setzero_si256();
+    __m256i vs1, vs2;
+
+    while (len >= 32) {
+        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+        size_t k = MIN(len, NMAX);
+        k -= k % 32;
+        len -= k;
+        __m256i vs1_0 = vs1;
+        __m256i vs3 = _mm256_setzero_si256();
+        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+        __m256i vs2_1 = _mm256_setzero_si256();
+        __m256i vbuf0, vbuf1;
+
+        /* Remainder peeling */
+        if (k % 64) {
+            vbuf1 = _mm256_loadu_si256((__m256i*)src);
+            _mm256_storeu_si256((__m256i*)dst, vbuf1);
+            dst += 32;
+
+            src += 32;
+            k -= 32;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        /* Manually unrolled this loop by 2 for an decent amount of ILP */
+        while (k >= 64) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+            */
+            vbuf0 = _mm256_loadu_si256((__m256i*)src);
+            vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
+            _mm256_storeu_si256((__m256i*)dst, vbuf0);
+            _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
+            dst += 64;
+            src += 64;
+            k -= 64;
+
+            __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs3 = _mm256_add_epi32(vs3, vs1_0);
+            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+             * instructions to eliminate them */
+            vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+            vs3 = _mm256_add_epi32(vs3, vs1);
+            vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+            vs1 = _mm256_add_epi32(vs1, vs1_sad);
+            vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm256_slli_epi32(vs3, 5);
+        vs2 = _mm256_add_epi32(vs2, vs3);
+        vs2 = _mm256_add_epi32(vs2, vs2_1);
+
+        adler0 = partial_hsum256(vs1) % BASE;
+        adler1 = hsum256(vs2) % BASE;
+    }
+
+    adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 64). */
+    if (len) {
+        goto rem_peel_copy;
+    }
+
+    return adler;
+}
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_sse42.c b/3rdparty/zlib-ng/arch/x86/adler32_sse42.c
new file mode 100644
index 000000000000..257a360982ed
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/adler32_sse42.c
@@ -0,0 +1,121 @@
+/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../adler32_fold.h"
+#include "adler32_ssse3_p.h"
+#include <immintrin.h>
+
+#ifdef X86_SSE42
+
+Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    uint32_t adler0, adler1;
+    adler1 = (adler >> 16) & 0xffff;
+    adler0 = adler & 0xffff;
+
+rem_peel:
+    if (len < 16) {
+       return adler32_copy_len_16(adler0, src, dst, len, adler1);
+    }
+
+    __m128i vbuf, vbuf_0;
+    __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+            v_sad_sum2, vsum2, vsum2_0;
+    __m128i zero = _mm_setzero_si128();
+    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m128i dot3v = _mm_set1_epi16(1);
+    size_t k;
+
+    while (len >= 16) {
+
+        k = MIN(len, NMAX);
+        k -= k % 16;
+        len -= k;
+
+        vs1 = _mm_cvtsi32_si128(adler0);
+        vs2 = _mm_cvtsi32_si128(adler1);
+
+        vs3 = _mm_setzero_si128();
+        vs2_0 = _mm_setzero_si128();
+        vs1_0 = vs1;
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_loadu_si128((__m128i*)src);
+            vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
+            src += 32;
+            k -= 32;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+            dst += 32;
+
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        vs2 = _mm_add_epi32(vs2_0, vs2);
+        vs3 = _mm_slli_epi32(vs3, 5);
+        vs2 = _mm_add_epi32(vs3, vs2);
+        vs3 = _mm_setzero_si128();
+
+        while (k >= 16) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_loadu_si128((__m128i*)src);
+            src += 16;
+            k -= 16;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+
+            _mm_storeu_si128((__m128i*)dst, vbuf);
+            dst += 16;
+        }
+
+        vs3 = _mm_slli_epi32(vs3, 4);
+        vs2 = _mm_add_epi32(vs2, vs3);
+
+        adler0 = partial_hsum(vs1) % BASE;
+        adler1 = hsum(vs2) % BASE;
+    }
+
+    /* If this is true, there's fewer than 16 elements remaining */
+    if (len) {
+        goto rem_peel;
+    }
+
+    return adler0 | (adler1 << 16);
+}
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c b/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c
new file mode 100644
index 000000000000..ae819d632e53
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/adler32_ssse3.c
@@ -0,0 +1,156 @@
+/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ *   Adam Stylinski <kungfujesus06@gmail.com>
+ *   Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"
+
+#ifdef X86_SSSE3
+
+#include <immintrin.h>
+
+Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
+    uint32_t sum2;
+
+     /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (UNLIKELY(len == 1))
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (UNLIKELY(buf == NULL))
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (UNLIKELY(len < 16))
+        return adler32_len_16(adler, buf, len, sum2);
+
+    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m128i dot3v = _mm_set1_epi16(1);
+    const __m128i zero = _mm_setzero_si128();
+
+    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+            vbuf_0, v_sad_sum2, vsum2, vsum2_0;
+
+    /* If our buffer is unaligned (likely), make the determination whether
+     * or not there's enough of a buffer to consume to make the scalar, aligning
+     * additions worthwhile or if it's worth it to just eat the cost of an unaligned
+     * load. This is a pretty simple test, just test if 16 - the remainder + len is
+     * < 16 */
+    size_t max_iters = NMAX;
+    size_t rem = (uintptr_t)buf & 15;
+    size_t align_offset = 16 - rem;
+    size_t k = 0;
+    if (rem) {
+        if (len < 16 + align_offset) {
+            /* Let's eat the cost of this one unaligned load so that
+             * we don't completely skip over the vectorization. Doing
+             * 16 bytes at a time unaligned is better than 16 + <= 15
+             * sums */
+            vbuf = _mm_loadu_si128((__m128i*)buf);
+            len -= 16;
+            buf += 16;
+            vs1 = _mm_cvtsi32_si128(adler);
+            vs2 = _mm_cvtsi32_si128(sum2);
+            vs3 = _mm_setzero_si128();
+            vs1_0 = vs1;
+            goto unaligned_jmp;
+        }
+
+        for (size_t i = 0; i < align_offset; ++i) {
+            adler += *(buf++);
+            sum2 += adler;
+        }
+
+        /* lop off the max number of sums based on the scalar sums done
+         * above */
+        len -= align_offset;
+        max_iters -= align_offset;
+    }
+
+
+    while (len >= 16) {
+        vs1 = _mm_cvtsi32_si128(adler);
+        vs2 = _mm_cvtsi32_si128(sum2);
+        vs3 = _mm_setzero_si128();
+        vs2_0 = _mm_setzero_si128();
+        vs1_0 = vs1;
+
+        k = (len < max_iters ? len : max_iters);
+        k -= k % 16;
+        len -= k;
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)buf);
+            vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
+            buf += 32;
+            k -= 32;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+
+            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        vs2 = _mm_add_epi32(vs2_0, vs2);
+        vs3 = _mm_slli_epi32(vs3, 5);
+        vs2 = _mm_add_epi32(vs3, vs2);
+        vs3 = _mm_setzero_si128();
+
+        while (k >= 16) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)buf);
+            buf += 16;
+            k -= 16;
+
+unaligned_jmp:
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm_slli_epi32(vs3, 4);
+        vs2 = _mm_add_epi32(vs2, vs3);
+
+        /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
+         * a partial reduction sum implicitly and only summing to integers in vector positions
+         * 0 and 2. This saves us some contention on the shuffle port(s) */
+        adler = partial_hsum(vs1) % BASE;
+        sum2 = hsum(vs2) % BASE;
+        max_iters = NMAX;
+    }
+
+    /* Process tail (len < 16).  */
+    return adler32_len_16(adler, buf, len, sum2);
+}
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/adler32_ssse3_p.h b/3rdparty/zlib-ng/arch/x86/adler32_ssse3_p.h
new file mode 100644
index 000000000000..d7ec3fe0d5a3
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/adler32_ssse3_p.h
@@ -0,0 +1,29 @@
+/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_P_H_
+#define ADLER32_SSSE3_P_H_
+
+#ifdef X86_SSSE3
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static inline uint32_t partial_hsum(__m128i x) {
+    __m128i second_int = _mm_srli_si128(x, 8);
+    __m128i sum = _mm_add_epi32(x, second_int);
+    return _mm_cvtsi128_si32(sum);
+}
+
+static inline uint32_t hsum(__m128i x) {
+    __m128i sum1 = _mm_unpackhi_epi64(x, x);
+    __m128i sum2 = _mm_add_epi32(x, sum1);
+    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+    __m128i sum4 = _mm_add_epi32(sum2, sum3);
+    return _mm_cvtsi128_si32(sum4);
+}
+#endif
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/chunkset_avx2.c b/3rdparty/zlib-ng/arch/x86/chunkset_avx2.c
new file mode 100644
index 000000000000..70620b91542e
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/chunkset_avx2.c
@@ -0,0 +1,133 @@
+/* chunkset_avx2.c -- AVX2 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+
+#ifdef X86_AVX2
+#include <immintrin.h>
+#include "../generic/chunk_permute_table.h"
+
+typedef __m256i chunk_t;
+
+#define CHUNK_SIZE 32
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
+ * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
+static const lut_rem_pair perm_idx_lut[29] = {
+    { 0, 2},                /* 3 */
+    { 0, 0},                /* don't care */
+    { 1 * 32, 2},           /* 5 */
+    { 2 * 32, 2},           /* 6 */
+    { 3 * 32, 4},           /* 7 */
+    { 0 * 32, 0},           /* don't care */
+    { 4 * 32, 5},           /* 9 */
+    { 5 * 32, 22},          /* 10 */
+    { 6 * 32, 21},          /* 11 */
+    { 7 * 32, 20},          /* 12 */
+    { 8 * 32, 6},           /* 13 */
+    { 9 * 32, 4},           /* 14 */
+    {10 * 32, 2},           /* 15 */
+    { 0 * 32, 0},           /* don't care */
+    {11 * 32, 15},          /* 17 */
+    {11 * 32 + 16, 14},     /* 18 */
+    {11 * 32 + 16 * 2, 13}, /* 19 */
+    {11 * 32 + 16 * 3, 12}, /* 20 */
+    {11 * 32 + 16 * 4, 11}, /* 21 */
+    {11 * 32 + 16 * 5, 10}, /* 22 */
+    {11 * 32 + 16 * 6,  9}, /* 23 */
+    {11 * 32 + 16 * 7,  8}, /* 24 */
+    {11 * 32 + 16 * 8,  7}, /* 25 */
+    {11 * 32 + 16 * 9,  6}, /* 26 */
+    {11 * 32 + 16 * 10, 5}, /* 27 */
+    {11 * 32 + 16 * 11, 4}, /* 28 */
+    {11 * 32 + 16 * 12, 3}, /* 29 */
+    {11 * 32 + 16 * 13, 2}, /* 30 */
+    {11 * 32 + 16 * 14, 1}  /* 31 */
+};
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    int16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi16(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    int32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi32(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    int64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi64x(tmp);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    __m256i ret_vec;
+    /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
+     * compiling this to a shared load for all branches, preferring the simpler code.  Given that the buf value isn't in
+     * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */
+    *chunk_rem = lut_rem.remval;
+
+    /* See note in chunkset_ssse3.c for why this is ok */
+    __msan_unpoison(buf + dist, 32 - dist);
+
+    if (dist < 16) {
+        /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
+         * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
+         * shuffles and combining the halves later */
+        const __m256i permute_xform =
+            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
+        __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
+        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+        ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+    } else if (dist == 16) {
+        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+        return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+    } else {
+        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+        __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
+        /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
+        __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+        __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
+        __m128i xlane_res  = _mm_shuffle_epi8(ret_vec0, perm_vec1);
+        /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
+         * shuffle those values */
+        __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
+        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+    }
+
+    return ret_vec;
+}
+
+#define CHUNKSIZE        chunksize_avx2
+#define CHUNKCOPY        chunkcopy_avx2
+#define CHUNKUNROLL      chunkunroll_avx2
+#define CHUNKMEMSET      chunkmemset_avx2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_avx2
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/chunkset_sse2.c b/3rdparty/zlib-ng/arch/x86/chunkset_sse2.c
new file mode 100644
index 000000000000..c402c0ee18f6
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/chunkset_sse2.c
@@ -0,0 +1,56 @@
+/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+
+#ifdef X86_SSE2
+#include <immintrin.h>
+
+typedef __m128i chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    int16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi16(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    int32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi32(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    int64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi64x(tmp);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+#define CHUNKSIZE        chunksize_sse2
+#define CHUNKCOPY        chunkcopy_sse2
+#define CHUNKUNROLL      chunkunroll_sse2
+#define CHUNKMEMSET      chunkmemset_sse2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_sse2
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c b/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c
new file mode 100644
index 000000000000..c06d1b37bd7e
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/chunkset_ssse3.c
@@ -0,0 +1,101 @@
+/* chunkset_ssse3.c -- SSSE3 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+
+/* This requires SSE2 support. While it's implicit with SSSE3, we can minimize
+ * code size by sharing the chunkcopy functions, which will certainly compile
+ * to identical machine code */
+#if defined(X86_SSSE3) && defined(X86_SSE2)
+#include <immintrin.h>
+#include "../generic/chunk_permute_table.h"
+
+typedef __m128i chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+#define HAVE_CHUNKCOPY
+#define HAVE_CHUNKUNROLL
+
+static const lut_rem_pair perm_idx_lut[13] = {
+    {0, 1},      /* 3 */
+    {0, 0},      /* don't care */
+    {1 * 32, 1}, /* 5 */
+    {2 * 32, 4}, /* 6 */
+    {3 * 32, 2}, /* 7 */
+    {0 * 32, 0}, /* don't care */
+    {4 * 32, 7}, /* 9 */
+    {5 * 32, 6}, /* 10 */
+    {6 * 32, 5}, /* 11 */
+    {7 * 32, 4}, /* 12 */
+    {8 * 32, 3}, /* 13 */
+    {9 * 32, 2}, /* 14 */
+    {10 * 32, 1},/* 15 */
+};
+
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    int16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi16(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    int32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi32(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    int64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm_set1_epi64x(tmp);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    __m128i perm_vec, ret_vec;
+    /* Important to note:
+     * This is _not_ to subvert the memory sanitizer but to instead unpoison some
+     * bytes we willingly and purposefully load uninitialized that we swizzle over
+     * in a vector register, anyway.  If what we assume is wrong about what is used,
+     * the memory sanitizer will still usefully flag it */
+    __msan_unpoison(buf + dist, 16 - dist);
+    ret_vec = _mm_loadu_si128((__m128i*)buf);
+    *chunk_rem = lut_rem.remval;
+
+    perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+    ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+    return ret_vec;
+}
+
+extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
+
+#define CHUNKSIZE        chunksize_ssse3
+#define CHUNKMEMSET      chunkmemset_ssse3
+#define CHUNKMEMSET_SAFE chunkmemset_safe_ssse3
+#define CHUNKCOPY        chunkcopy_sse2
+#define CHUNKUNROLL      chunkunroll_sse2
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_ssse3
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/compare256_avx2.c b/3rdparty/zlib-ng/arch/x86/compare256_avx2.c
new file mode 100644
index 000000000000..1318a0e333a4
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/compare256_avx2.c
@@ -0,0 +1,63 @@
+/* compare256_avx2.c -- AVX2 version of compare256
+ * Copyright Mika T. Lindqvist  <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+
+#include "fallback_builtins.h"
+
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+#  include <nmmintrin.h>
+#endif
+
+static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        __m256i ymm_src0, ymm_src1, ymm_cmp;
+        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+        unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
+        if (mask != 0xFFFFFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
+            return len + match_byte;
+        }
+
+        src0 += 32, src1 += 32, len += 32;
+
+        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
+        mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
+        if (mask != 0xFFFFFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        src0 += 32, src1 += 32, len += 32;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_avx2_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_avx2
+#define COMPARE256          compare256_avx2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_avx2
+#define COMPARE256          compare256_avx2_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/compare256_sse2.c b/3rdparty/zlib-ng/arch/x86/compare256_sse2.c
new file mode 100644
index 000000000000..aad4bd240d20
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/compare256_sse2.c
@@ -0,0 +1,96 @@
+/* compare256_sse2.c -- SSE2 version of compare256
+ * Copyright Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+
+#include "fallback_builtins.h"
+
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+
+#include <emmintrin.h>
+
+static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+    int align_offset = ((uintptr_t)src0) & 15;
+    const uint8_t *end0 = src0 + 256;
+    const uint8_t *end1 = src1 + 256;
+    __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+    /* Do the first load unaligned, than all subsequent ones we have at least
+     * one aligned load. Sadly aligning both loads is probably unrealistic */
+    xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+    xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+    xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+    unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+    /* Compiler _may_ turn this branch into a ptest + movemask,
+     * since a lot of those uops are shared and fused */
+    if (mask != 0xFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+        return len + match_byte;
+    }
+
+    int align_adv = 16 - align_offset;
+    len += align_adv;
+    src0 += align_adv;
+    src1 += align_adv;
+
+    /* Do a flooring division (should just be a shift right) */
+    int num_iter = (256 - len) / 16;
+
+    for (int i = 0; i < num_iter; ++i) {
+        xmm_src0 = _mm_load_si128((__m128i*)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+        /* Compiler _may_ turn this branch into a ptest + movemask,
+         * since a lot of those uops are shared and fused */
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        len += 16, src0 += 16, src1 += 16;
+    }
+
+    if (align_offset) {
+        src0 = end0 - 16;
+        src1 = end1 - 16;
+        len = 256 - 16;
+
+        xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+    }
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_sse2_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_sse2
+#define COMPARE256          compare256_sse2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_sse2
+#define COMPARE256          compare256_sse2_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h b/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
new file mode 100644
index 000000000000..3e799283173c
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
@@ -0,0 +1,186 @@
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ *     doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *     Jim Guilford    <james.guilford@intel.com>
+ *     Vinodh Gopal    <vinodh.gopal@intel.com>
+ *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef COPY
+Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
+#endif
+    unsigned long algn_diff;
+    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
+    __m128i xmm_crc_part = _mm_setzero_si128();
+#ifdef COPY
+    char ALIGNED_(16) partial_buf[16] = { 0 };
+#else
+    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
+    int32_t first = init_crc != 0;
+
+    /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
+     * bytes of input is needed for the aligning load that occurs.  If there's an initial CRC, to
+     * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
+     * by definition can be up to 15 bytes + one full vector load. */
+    assert(len >= 31 || first == 0);
+#endif
+    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+    if (len < 16) {
+#ifdef COPY
+        if (len == 0)
+            return;
+
+        memcpy(partial_buf, src, len);
+        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
+        memcpy(dst, partial_buf, len);
+#endif
+        goto partial;
+    }
+
+    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
+    if (algn_diff) {
+        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
+#ifdef COPY
+        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
+        dst += algn_diff;
+#else
+        XOR_INITIAL128(xmm_crc_part);
+
+        if (algn_diff < 4 && init_crc != 0) {
+            xmm_t0 = xmm_crc_part;
+            xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
+            fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+            xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+            src += 16;
+            len -= 16;
+        }
+#endif
+
+        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+
+        src += algn_diff;
+        len -= algn_diff;
+    }
+
+#ifdef X86_VPCLMULQDQ
+    if (len >= 256) {
+#ifdef COPY
+        size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
+        dst += n;
+#else
+        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
+            xmm_initial, first);
+        first = 0;
+#endif
+        len -= n;
+        src += n;
+    }
+#endif
+
+    while (len >= 64) {
+        len -= 64;
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
+        src += 64;
+
+        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+#ifdef COPY
+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
+        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+        dst += 64;
+#else
+        XOR_INITIAL128(xmm_t0);
+#endif
+
+        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
+        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
+    }
+
+    /*
+     * len = num bytes left - 64
+     */
+    if (len >= 48) {
+        len -= 48;
+
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+        src += 48;
+#ifdef COPY
+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
+        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+        dst += 48;
+#else
+        XOR_INITIAL128(xmm_t0);
+#endif
+        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
+    } else if (len >= 32) {
+        len -= 32;
+
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+        src += 32;
+#ifdef COPY
+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
+        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+        dst += 32;
+#else
+        XOR_INITIAL128(xmm_t0);
+#endif
+        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
+    } else if (len >= 16) {
+        len -= 16;
+        xmm_t0 = _mm_load_si128((__m128i *)src);
+        src += 16;
+#ifdef COPY
+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
+        dst += 16;
+#else
+        XOR_INITIAL128(xmm_t0);
+#endif
+        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+    }
+
+partial:
+    if (len) {
+        memcpy(&xmm_crc_part, src, len);
+#ifdef COPY
+        _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
+        memcpy(dst, partial_buf, len);
+#endif
+        partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+    }
+
+    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+}
diff --git a/3rdparty/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h b/3rdparty/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h
new file mode 100644
index 000000000000..3ea5c33055b0
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h
@@ -0,0 +1,107 @@
+/* crc32_fold_vpclmulqdq_tpl.h -- VPCMULQDQ-based CRC32 folding template.
+ * Copyright Wangyang Guo (wangyang.guo@intel.com)
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef COPY
+static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
+    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
+    __m128i init_crc, int32_t first) {
+    __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
+#endif
+    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
+    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
+    __m512i z0, z1, z2, z3;
+    size_t len_tmp = len;
+    const __m512i zmm_fold4 = _mm512_set4_epi32(
+        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+    const __m512i zmm_fold16 = _mm512_set4_epi32(
+        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
+
+    // zmm register init
+    zmm_crc0 = _mm512_setzero_si512();
+    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+#ifndef COPY
+    XOR_INITIAL512(zmm_t0);
+#endif
+    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
+    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
+    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+    /* already have intermediate CRC in xmm registers
+        * fold4 with 4 xmm_crc to get zmm_crc0
+    */
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
+
+#ifdef COPY
+    _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+    _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
+    _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
+    _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
+    dst += 256;
+#endif
+    len -= 256;
+    src += 256;
+
+    // fold-16 loops
+    while (len >= 256) {
+        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
+        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
+        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
+        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
+        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
+        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
+
+        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
+        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
+        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
+        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
+
+        zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
+        zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
+        zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
+        zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);
+
+#ifdef COPY
+        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+        _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
+        _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
+        _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
+        dst += 256;
+#endif
+        len -= 256;
+        src += 256;
+    }
+    // zmm_crc[0,1,2,3] -> zmm_crc0
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);
+
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);
+
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);
+
+    // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
+    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
+    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
+    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
+    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
+
+    return (len_tmp - len);  // return n bytes processed
+}
diff --git a/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq.c b/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq.c
new file mode 100644
index 000000000000..9383b7a2ba00
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq.c
@@ -0,0 +1,30 @@
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ *     doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *     Jim Guilford    <james.guilford@intel.com>
+ *     Vinodh Gopal    <vinodh.gopal@intel.com>
+ *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_PCLMULQDQ_CRC
+
+#define CRC32_FOLD_COPY  crc32_fold_pclmulqdq_copy
+#define CRC32_FOLD       crc32_fold_pclmulqdq
+#define CRC32_FOLD_RESET crc32_fold_pclmulqdq_reset
+#define CRC32_FOLD_FINAL crc32_fold_pclmulqdq_final
+#define CRC32            crc32_pclmulqdq
+
+#include "crc32_pclmulqdq_tpl.h"
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h b/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
new file mode 100644
index 000000000000..05d3b15257f7
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
@@ -0,0 +1,363 @@
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ *     doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *     Jim Guilford    <james.guilford@intel.com>
+ *     Vinodh Gopal    <vinodh.gopal@intel.com>
+ *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+
+#include <immintrin.h>
+#include <wmmintrin.h>
+#include <smmintrin.h> // _mm_extract_epi32
+#ifdef X86_VPCLMULQDQ
+#  include <immintrin.h>
+#endif
+
+#include "../../crc32_fold.h"
+#include "../../crc32_braid_p.h"
+#include "x86_intrins.h"
+#include <assert.h>
+
+#ifdef X86_VPCLMULQDQ
+static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
+    int32_t first);
+static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
+    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
+                                             0x00000001, 0xc6e41596);
+    __m128i x_tmp3;
+    __m128 ps_crc0, ps_crc3, ps_res;
+
+    x_tmp3 = *xmm_crc3;
+
+    *xmm_crc3 = *xmm_crc0;
+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+    ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
+
+    *xmm_crc0 = *xmm_crc1;
+    *xmm_crc1 = *xmm_crc2;
+    *xmm_crc2 = x_tmp3;
+    *xmm_crc3 = _mm_castps_si128(ps_res);
+}
+
+static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
+                                             0x00000001, 0xc6e41596);
+    __m128i x_tmp3, x_tmp2;
+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
+
+    x_tmp3 = *xmm_crc3;
+    x_tmp2 = *xmm_crc2;
+
+    *xmm_crc3 = *xmm_crc1;
+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+    ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
+
+    *xmm_crc2 = *xmm_crc0;
+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+    ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
+
+    *xmm_crc0 = x_tmp2;
+    *xmm_crc1 = x_tmp3;
+    *xmm_crc2 = _mm_castps_si128(ps_res20);
+    *xmm_crc3 = _mm_castps_si128(ps_res31);
+}
+
+static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
+                                             0x00000001, 0xc6e41596);
+    __m128i x_tmp3;
+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
+
+    x_tmp3 = *xmm_crc3;
+
+    *xmm_crc3 = *xmm_crc2;
+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+    ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
+
+    *xmm_crc2 = *xmm_crc1;
+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+    ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
+
+    *xmm_crc1 = *xmm_crc0;
+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+    ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
+
+    *xmm_crc0 = x_tmp3;
+    *xmm_crc1 = _mm_castps_si128(ps_res10);
+    *xmm_crc2 = _mm_castps_si128(ps_res21);
+    *xmm_crc3 = _mm_castps_si128(ps_res32);
+}
+
+static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
+                                             0x00000001, 0xc6e41596);
+    __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
+    __m128 ps_t0, ps_t1, ps_t2, ps_t3;
+    __m128 ps_res0, ps_res1, ps_res2, ps_res3;
+
+    x_tmp0 = *xmm_crc0;
+    x_tmp1 = *xmm_crc1;
+    x_tmp2 = *xmm_crc2;
+    x_tmp3 = *xmm_crc3;
+
+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+    ps_t0 = _mm_castsi128_ps(x_tmp0);
+    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
+
+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+    ps_t1 = _mm_castsi128_ps(x_tmp1);
+    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
+
+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+    ps_t2 = _mm_castsi128_ps(x_tmp2);
+    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
+
+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
+    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+    ps_t3 = _mm_castsi128_ps(x_tmp3);
+    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
+
+    *xmm_crc0 = _mm_castps_si128(ps_res0);
+    *xmm_crc1 = _mm_castps_si128(ps_res1);
+    *xmm_crc2 = _mm_castps_si128(ps_res2);
+    *xmm_crc3 = _mm_castps_si128(ps_res3);
+}
+
+static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
+    0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
+    0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
+    0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */
+    0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
+    0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
+    0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
+    0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl  9 (16 - 7)/shr7 */
+    0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl  8 (16 - 8)/shr8 */
+    0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl  7 (16 - 9)/shr9 */
+    0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl  6 (16 -10)/shr10*/
+    0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl  5 (16 -11)/shr11*/
+    0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl  4 (16 -12)/shr12*/
+    0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl  3 (16 -13)/shr13*/
+    0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl  2 (16 -14)/shr14*/
+    0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b  /* shl  1 (16 -15)/shr15*/
+};
+
+static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
+                         __m128i *xmm_crc3, __m128i *xmm_crc_part) {
+    const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
+                                             0x00000001, 0xc6e41596);
+    const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
+
+    __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
+    __m128i xmm_a0_0, xmm_a0_1;
+    __m128 ps_crc3, psa0_0, psa0_1, ps_res;
+
+    xmm_shl = _mm_load_si128((__m128i *)(pshufb_shf_table + (4 * (len - 1))));
+    xmm_shr = xmm_shl;
+    xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
+
+    xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
+
+    *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
+    xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
+    *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
+
+    *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
+    xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
+    *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
+
+    *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
+    xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
+    *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
+
+    *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
+    *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
+    *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
+
+    xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
+    xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
+
+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+    psa0_0 = _mm_castsi128_ps(xmm_a0_0);
+    psa0_1 = _mm_castsi128_ps(xmm_a0_1);
+
+    ps_res = _mm_xor_ps(ps_crc3, psa0_0);
+    ps_res = _mm_xor_ps(ps_res, psa0_1);
+
+    *xmm_crc3 = _mm_castps_si128(ps_res);
+}
+
+static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
+    *fold0 = _mm_load_si128(fold + 0);
+    *fold1 = _mm_load_si128(fold + 1);
+    *fold2 = _mm_load_si128(fold + 2);
+    *fold3 = _mm_load_si128(fold + 3);
+}
+
+static inline void crc32_fold_save(__m128i *fold, const __m128i *fold0, const __m128i *fold1,
+                                   const __m128i *fold2, const __m128i *fold3) {
+    _mm_storeu_si128(fold + 0, *fold0);
+    _mm_storeu_si128(fold + 1, *fold1);
+    _mm_storeu_si128(fold + 2, *fold2);
+    _mm_storeu_si128(fold + 3, *fold3);
+}
+
+Z_INTERNAL uint32_t CRC32_FOLD_RESET(crc32_fold *crc) {
+    __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
+    __m128i xmm_zero = _mm_setzero_si128();
+    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_zero, &xmm_zero, &xmm_zero);
+    return 0;
+}
+
+#define ONCE(op)                 if (first) { first = 0; op; }
+#define XOR_INITIAL128(where)    ONCE(where = _mm_xor_si128(where, xmm_initial))
+#ifdef X86_VPCLMULQDQ
+#  define XOR_INITIAL512(where)  ONCE(where = _mm512_xor_si512(where, zmm_initial))
+#endif
+
+#ifdef X86_VPCLMULQDQ
+#  include "crc32_fold_vpclmulqdq_tpl.h"
+#endif
+#include "crc32_fold_pclmulqdq_tpl.h"
+#define COPY
+#ifdef X86_VPCLMULQDQ
+#  include "crc32_fold_vpclmulqdq_tpl.h"
+#endif
+#include "crc32_fold_pclmulqdq_tpl.h"
+
+static const unsigned ALIGNED_(16) crc_k[] = {
+    0xccaa009e, 0x00000000, /* rk1 */
+    0x751997d0, 0x00000001, /* rk2 */
+    0xccaa009e, 0x00000000, /* rk5 */
+    0x63cd6124, 0x00000001, /* rk6 */
+    0xf7011640, 0x00000001, /* rk7 */
+    0xdb710640, 0x00000001  /* rk8 */
+};
+
+static const unsigned ALIGNED_(16) crc_mask[4] = {
+    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
+};
+
+static const unsigned ALIGNED_(16) crc_mask2[4] = {
+    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
+};
+
+Z_INTERNAL uint32_t CRC32_FOLD_FINAL(crc32_fold *crc) {
+    const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
+    const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
+    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
+    __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
+
+    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+    /*
+     * k1
+     */
+    crc_fold = _mm_load_si128((__m128i *)crc_k);
+
+    x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
+    xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
+    xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
+    xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
+
+    x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
+    xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
+    xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
+    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
+
+    x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
+    xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+
+    /*
+     * k5
+     */
+    crc_fold = _mm_load_si128((__m128i *)(crc_k + 4));
+
+    xmm_crc0 = xmm_crc3;
+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
+    xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
+
+    xmm_crc0 = xmm_crc3;
+    xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
+    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
+
+    /*
+     * k7
+     */
+    xmm_crc1 = xmm_crc3;
+    xmm_crc2 = xmm_crc3;
+    crc_fold = _mm_load_si128((__m128i *)(crc_k + 8));
+
+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
+
+    xmm_crc2 = xmm_crc3;
+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
+
+    crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2));
+
+    return crc->value;
+}
+
+Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
+    /* For lens < 64, crc32_braid method is faster. The CRC32 instruction for
+     * these short lengths might also prove to be effective */
+    if (len < 64)
+        return PREFIX(crc32_braid)(crc32, buf, len);
+
+    crc32_fold ALIGNED_(16) crc_state;
+    CRC32_FOLD_RESET(&crc_state);
+    CRC32_FOLD(&crc_state, buf, len, crc32);
+    return CRC32_FOLD_FINAL(&crc_state);
+}
diff --git a/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c b/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
new file mode 100644
index 000000000000..ec641b43263b
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/crc32_vpclmulqdq.c
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq.c -- VPCMULQDQ-based CRC32 folding implementation.
+ * Copyright Wangyang Guo (wangyang.guo@intel.com)
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+
+#define X86_VPCLMULQDQ
+#define CRC32_FOLD_COPY  crc32_fold_vpclmulqdq_copy
+#define CRC32_FOLD       crc32_fold_vpclmulqdq
+#define CRC32_FOLD_RESET crc32_fold_vpclmulqdq_reset
+#define CRC32_FOLD_FINAL crc32_fold_vpclmulqdq_final
+#define CRC32            crc32_vpclmulqdq
+
+#include "crc32_pclmulqdq_tpl.h"
+
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/insert_string_sse42.c b/3rdparty/zlib-ng/arch/x86/insert_string_sse42.c
new file mode 100644
index 000000000000..ae092a7e477f
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/insert_string_sse42.c
@@ -0,0 +1,24 @@
+/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ */
+
+#ifdef X86_SSE42
+#include "../../zbuild.h"
+#include <nmmintrin.h>
+#include "../../deflate.h"
+
+#define HASH_CALC(s, h, val)\
+    h = _mm_crc32_u32(h, val)
+
+#define HASH_CALC_VAR       h
+#define HASH_CALC_VAR_INIT  uint32_t h = 0
+
+#define UPDATE_HASH         update_hash_sse42
+#define INSERT_STRING       insert_string_sse42
+#define QUICK_INSERT_STRING quick_insert_string_sse42
+
+#include "../../insert_string_tpl.h"
+#endif
diff --git a/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c b/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
new file mode 100644
index 000000000000..94fe10c7bf4a
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/slide_hash_avx2.c
@@ -0,0 +1,39 @@
+/*
+ * AVX2 optimized hash slide, based on Intel's slide_sse implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven   <arjan@linux.intel.com>
+ *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
+ *   Mika T. Lindqvist  <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#include <immintrin.h>
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+    table += entries;
+    table -= 16;
+
+    do {
+        __m256i value, result;
+
+        value = _mm256_loadu_si256((__m256i *)table);
+        result = _mm256_subs_epu16(value, wsize);
+        _mm256_storeu_si256((__m256i *)table, result);
+
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+}
+
+Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
+
+    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
+    slide_hash_chain(s->prev, wsize, ymm_wsize);
+}
diff --git a/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c b/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
new file mode 100644
index 000000000000..5daac4a73981
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/slide_hash_sse2.c
@@ -0,0 +1,62 @@
+/*
+ * SSE optimized hash slide
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven   <arjan@linux.intel.com>
+ *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#include <immintrin.h>
+#include <assert.h>
+
+static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
+                                    uint32_t entries1, const __m128i wsize) {
+    uint32_t entries;
+    Pos *table;
+    __m128i value0, value1, result0, result1;
+
+    int on_chain = 0;
+
+next_chain:
+    table = (on_chain) ? table1 : table0;
+    entries = (on_chain) ? entries1 : entries0;
+
+    table += entries;
+    table -= 16;
+
+    /* ZALLOC allocates this pointer unless the user chose a custom allocator.
+     * Our alloc function is aligned to 64 byte boundaries */
+    do {
+        value0 = _mm_load_si128((__m128i *)table);
+        value1 = _mm_load_si128((__m128i *)(table + 8));
+        result0 = _mm_subs_epu16(value0, wsize);
+        result1 = _mm_subs_epu16(value1, wsize);
+        _mm_store_si128((__m128i *)table, result0);
+        _mm_store_si128((__m128i *)(table + 8), result1);
+
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+
+    ++on_chain;
+    if (on_chain > 1) {
+        return;
+    } else {
+        goto next_chain;
+    }
+}
+
+Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
+
+    assert(((uintptr_t)s->head & 15) == 0);
+    assert(((uintptr_t)s->prev & 15) == 0);
+
+    slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
+}
diff --git a/3rdparty/zlib-ng/arch/x86/x86_features.c b/3rdparty/zlib-ng/arch/x86/x86_features.c
new file mode 100644
index 000000000000..8d11564c24f9
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/x86_features.c
@@ -0,0 +1,97 @@
+/* x86_features.c - x86 feature check
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Author:
+ *  Jim Kukunas
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "x86_features.h"
+
+#ifdef _MSC_VER
+#  include <intrin.h>
+#else
+// Newer versions of GCC and clang come with cpuid.h
+#  include <cpuid.h>
+#endif
+
+#include <string.h>
+
+static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
+#ifdef _MSC_VER
+    unsigned int registers[4];
+    __cpuid((int *)registers, info);
+
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
+#else
+    __cpuid(info, *eax, *ebx, *ecx, *edx);
+#endif
+}
+
+static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
+#ifdef _MSC_VER
+    unsigned int registers[4];
+    __cpuidex((int *)registers, info, subinfo);
+
+    *eax = registers[0];
+    *ebx = registers[1];
+    *ecx = registers[2];
+    *edx = registers[3];
+#else
+    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
+#endif
+}
+
+static inline uint64_t xgetbv(unsigned int xcr) {
+#ifdef _MSC_VER
+    return _xgetbv(xcr);
+#else
+    uint32_t eax, edx;
+    __asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
+    return (uint64_t)(edx) << 32 | eax;
+#endif
+}
+
+void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
+    unsigned eax, ebx, ecx, edx;
+    unsigned maxbasic;
+
+    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
+    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
+
+    features->has_sse2 = edx & 0x4000000;
+    features->has_ssse3 = ecx & 0x200;
+    features->has_sse42 = ecx & 0x100000;
+    features->has_pclmulqdq = ecx & 0x2;
+
+    if (ecx & 0x08000000) {
+        uint64_t xfeature = xgetbv(0);
+
+        features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);
+        features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);
+    }
+
+    if (maxbasic >= 7) {
+        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);
+
+        // check BMI1 bit
+        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
+        features->has_vpclmulqdq = ecx & 0x400;
+
+        // check AVX2 bit if the OS supports saving YMM registers
+        if (features->has_os_save_ymm) {
+            features->has_avx2 = ebx & 0x20;
+        }
+
+        // check AVX512 bits if the OS supports saving ZMM registers
+        if (features->has_os_save_zmm) {
+            features->has_avx512 = ebx & 0x00010000;
+            features->has_avx512vnni = ecx & 0x800;
+        }
+    }
+}
diff --git a/3rdparty/zlib-ng/arch/x86/x86_features.h b/3rdparty/zlib-ng/arch/x86/x86_features.h
new file mode 100644
index 000000000000..4a36bde835d3
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/x86_features.h
@@ -0,0 +1,24 @@
+/* x86_features.h -- check for CPU features
+* Copyright (C) 2013 Intel Corporation Jim Kukunas
+* For conditions of distribution and use, see copyright notice in zlib.h
+*/
+
+#ifndef X86_FEATURES_H_
+#define X86_FEATURES_H_
+
+struct x86_cpu_features {
+    int has_avx2;
+    int has_avx512;
+    int has_avx512vnni;
+    int has_sse2;
+    int has_ssse3;
+    int has_sse42;
+    int has_pclmulqdq;
+    int has_vpclmulqdq;
+    int has_os_save_ymm;
+    int has_os_save_zmm;
+};
+
+void Z_INTERNAL x86_check_features(struct x86_cpu_features *features);
+
+#endif /* CPU_H_ */
diff --git a/3rdparty/zlib-ng/arch/x86/x86_intrins.h b/3rdparty/zlib-ng/arch/x86/x86_intrins.h
new file mode 100644
index 000000000000..52e1085d66f9
--- /dev/null
+++ b/3rdparty/zlib-ng/arch/x86/x86_intrins.h
@@ -0,0 +1,87 @@
+#ifndef X86_INTRINS_H
+#define X86_INTRINS_H
+
+/* Unfortunately GCC didn't support these things until version 10.
+ * Similarly, AppleClang didn't support them in Xcode 9.2 but did in 9.3.
+ */
+#ifdef __AVX2__
+#include <immintrin.h>
+
+#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \
+    || (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
+static inline __m256i _mm256_zextsi128_si256(__m128i a) {
+    __m128i r;
+    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
+    return _mm256_castsi128_si256(r);
+}
+
+#ifdef __AVX512F__
+static inline __m512i _mm512_zextsi128_si512(__m128i a) {
+    __m128i r;
+    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
+    return _mm512_castsi128_si512(r);
+}
+#endif // __AVX512F__
+#endif // gcc/AppleClang version test
+
+#endif // __AVX2__
+
+/* GCC <9 is missing some AVX512 intrinsics.
+ */
+#ifdef __AVX512F__
+#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9)
+#include <immintrin.h>
+
+#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
+                              ((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3)))
+
+static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60,
+                                      char __q59, char __q58, char __q57, char __q56,
+                                      char __q55, char __q54, char __q53, char __q52,
+                                      char __q51, char __q50, char __q49, char __q48,
+                                      char __q47, char __q46, char __q45, char __q44,
+                                      char __q43, char __q42, char __q41, char __q40,
+                                      char __q39, char __q38, char __q37, char __q36,
+                                      char __q35, char __q34, char __q33, char __q32,
+                                      char __q31, char __q30, char __q29, char __q28,
+                                      char __q27, char __q26, char __q25, char __q24,
+                                      char __q23, char __q22, char __q21, char __q20,
+                                      char __q19, char __q18, char __q17, char __q16,
+                                      char __q15, char __q14, char __q13, char __q12,
+                                      char __q11, char __q10, char __q09, char __q08,
+                                      char __q07, char __q06, char __q05, char __q04,
+                                      char __q03, char __q02, char __q01, char __q00) {
+    return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56),
+                            PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48),
+                            PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40),
+                            PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32),
+                            PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24),
+                            PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16),
+                            PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08),
+                            PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00));
+}
+
+#undef PACK
+
+#endif // gcc version test
+#endif // __AVX512F__
+
+/* Missing zero-extension AVX and AVX512 intrinsics.
+ * Fixed in Microsoft Visual Studio 2017 version 15.7
+ * https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737
+ */
+#if defined(_MSC_VER) && _MSC_VER < 1914
+#ifdef __AVX2__
+static inline __m256i _mm256_zextsi128_si256(__m128i a) {
+    return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0);
+}
+#endif // __AVX2__
+
+#ifdef __AVX512F__
+static inline __m512i _mm512_zextsi128_si512(__m128i a) {
+    return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0);
+}
+#endif // __AVX512F__
+#endif // defined(_MSC_VER) && _MSC_VER < 1914
+
+#endif // include guard X86_INTRINS_H
diff --git a/3rdparty/zlib-ng/chunkset.c b/3rdparty/zlib-ng/chunkset.c
new file mode 100644
index 000000000000..7b2bb7ba3676
--- /dev/null
+++ b/3rdparty/zlib-ng/chunkset.c
@@ -0,0 +1,42 @@
+/* chunkset.c -- inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+
+typedef uint64_t chunk_t;
+
+#define CHUNK_SIZE 8
+
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    uint8_t *dest = (uint8_t *)chunk;
+    memcpy(dest, from, sizeof(uint32_t));
+    memcpy(dest+4, from, sizeof(uint32_t));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    memcpy(chunk, from, sizeof(uint64_t));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    memcpy(chunk, (uint8_t *)s, sizeof(uint64_t));
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    memcpy(out, chunk, sizeof(uint64_t));
+}
+
+#define CHUNKSIZE        chunksize_c
+#define CHUNKCOPY        chunkcopy_c
+#define CHUNKUNROLL      chunkunroll_c
+#define CHUNKMEMSET      chunkmemset_c
+#define CHUNKMEMSET_SAFE chunkmemset_safe_c
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST     inflate_fast_c
+
+#include "inffast_tpl.h"
diff --git a/3rdparty/zlib-ng/chunkset_tpl.h b/3rdparty/zlib-ng/chunkset_tpl.h
new file mode 100644
index 000000000000..f909a12557f0
--- /dev/null
+++ b/3rdparty/zlib-ng/chunkset_tpl.h
@@ -0,0 +1,200 @@
+/* chunkset_tpl.h -- inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include <stdlib.h>
+
+#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
+extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
+#endif
+
+/* Returns the chunk size */
+Z_INTERNAL uint32_t CHUNKSIZE(void) {
+    return sizeof(chunk_t);
+}
+
+/* Behave like memcpy, but assume that it's OK to overwrite at least
+   chunk_t bytes of output even if the length is shorter than this,
+   that the length is non-zero, and that `from` lags `out` by at least
+   sizeof chunk_t bytes (or that they don't overlap at all or simply that
+   the distance is less than the length of the copy).
+
+   Aside from better memory bus utilisation, this means that short copies
+   (chunk_t bytes or fewer) will fall straight through the loop
+   without iteration, which will hopefully make the branch prediction more
+   reliable. */
+#ifndef HAVE_CHUNKCOPY
+Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+    Assert(len > 0, "chunkcopy should never have a length 0");
+    chunk_t chunk;
+    int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
+    loadchunk(from, &chunk);
+    storechunk(out, &chunk);
+    out += align;
+    from += align;
+    len -= align;
+    while (len > 0) {
+        loadchunk(from, &chunk);
+        storechunk(out, &chunk);
+        out += sizeof(chunk_t);
+        from += sizeof(chunk_t);
+        len -= sizeof(chunk_t);
+    }
+    return out;
+}
+#endif
+
+/* Perform short copies until distance can be rewritten as being at least
+   sizeof chunk_t.
+
+   This assumes that it's OK to overwrite at least the first
+   2*sizeof(chunk_t) bytes of output even if the copy is shorter than this.
+   This assumption holds because inflate_fast() starts every iteration with at
+   least 258 bytes of output space available (258 being the maximum length
+   output from a single token; see inflate_fast()'s assumptions below). */
+#ifndef HAVE_CHUNKUNROLL
+Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
+    unsigned char const *from = out - *dist;
+    chunk_t chunk;
+    while (*dist < *len && *dist < sizeof(chunk_t)) {
+        loadchunk(from, &chunk);
+        storechunk(out, &chunk);
+        out += *dist;
+        *len -= *dist;
+        *dist += *dist;
+    }
+    return out;
+}
+#endif
+
+#ifndef HAVE_CHUNK_MAG
+/* Loads a magazine to feed into memory of the pattern */
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+        /* This code takes string of length dist from "from" and repeats
+         * it for as many times as can fit in a chunk_t (vector register) */
+        uint32_t cpy_dist;
+        uint32_t bytes_remaining = sizeof(chunk_t);
+        chunk_t chunk_load;
+        uint8_t *cur_chunk = (uint8_t *)&chunk_load;
+        while (bytes_remaining) {
+            cpy_dist = MIN(dist, bytes_remaining);
+            memcpy(cur_chunk, buf, cpy_dist);
+            bytes_remaining -= cpy_dist;
+            cur_chunk += cpy_dist;
+            /* This allows us to bypass an expensive integer division since we're effectively
+             * counting in this loop, anyway */
+            *chunk_rem = cpy_dist;
+        }
+
+        return chunk_load;
+}
+#endif
+
+/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
+   Return OUT + LEN. */
+Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
+    /* Debug performance related issues when len < sizeof(uint64_t):
+       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
+    Assert(dist > 0, "chunkmemset cannot have a distance 0");
+    /* Only AVX2 */
+#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
+    if (len <= 16) {
+        return chunkmemset_ssse3(out, dist, len);
+    }
+#endif
+
+    uint8_t *from = out - dist;
+
+    if (dist == 1) {
+        memset(out, *from, len);
+        return out + len;
+    } else if (dist > sizeof(chunk_t)) {
+        return CHUNKCOPY(out, out - dist, len);
+    }
+
+    chunk_t chunk_load;
+    uint32_t chunk_mod = 0;
+
+    /* TODO: possibly build up a permutation table for this if not an even modulus */
+#ifdef HAVE_CHUNKMEMSET_2
+    if (dist == 2) {
+        chunkmemset_2(from, &chunk_load);
+    } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_4
+    if (dist == 4) {
+        chunkmemset_4(from, &chunk_load);
+    } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_8
+    if (dist == 8) {
+        chunkmemset_8(from, &chunk_load);
+    } else if (dist == sizeof(chunk_t)) {
+        loadchunk(from, &chunk_load);
+    } else
+#endif
+    {
+        chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
+    }
+
+    /* If we're lucky enough and dist happens to be an even modulus of our vector length,
+     * we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
+    if (chunk_mod == 0) {
+        while (len >= (2 * sizeof(chunk_t))) {
+            storechunk(out, &chunk_load);
+            storechunk(out + sizeof(chunk_t), &chunk_load);
+            out += 2 * sizeof(chunk_t);
+            len -= 2 * sizeof(chunk_t);
+        }
+    }
+
+    /* If we don't have a "dist" length that divides evenly into a vector
+     * register, we can write the whole vector register but we need only
+     * advance by the amount of the whole string that fits in our chunk_t.
+     * If we do divide evenly into the vector length, adv_amount = chunk_t size*/
+    uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
+    while (len >= sizeof(chunk_t)) {
+        storechunk(out, &chunk_load);
+        len -= adv_amount;
+        out += adv_amount;
+    }
+
+    if (len) {
+        memcpy(out, &chunk_load, len);
+        out += len;
+    }
+
+    return out;
+}
+
+Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
+#if !defined(UNALIGNED64_OK)
+#  if !defined(UNALIGNED_OK)
+    static const uint32_t align_mask = 7;
+#  else
+    static const uint32_t align_mask = 3;
+#  endif
+#endif
+
+    len = MIN(len, left);
+    uint8_t *from = out - dist;
+#if !defined(UNALIGNED64_OK)
+    while (((uintptr_t)out & align_mask) && (len > 0)) {
+        *out++ = *from++;
+        --len;
+        --left;
+    }
+#endif
+    if (left < (unsigned)(3 * sizeof(chunk_t))) {
+        while (len > 0) {
+            *out++ = *from++;
+            --len;
+        }
+        return out;
+    }
+    if (len)
+        return CHUNKMEMSET(out, dist, len);
+
+    return out;
+}
diff --git a/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake b/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake
new file mode 100644
index 000000000000..74ac3910b8f4
--- /dev/null
+++ b/3rdparty/zlib-ng/cmake/detect-intrinsics.cmake
@@ -0,0 +1,543 @@
+# detect-intrinsics.cmake -- Detect compiler intrinsics support
+# Licensed under the Zlib license, see LICENSE.md for details
+
+macro(check_acle_compiler_flag)
+    if(MSVC)
+        # Both ARM and ARM64-targeting msvc support intrinsics, but
+        # ARM msvc is missing some intrinsics introduced with ARMv8, e.g. crc32
+        if(MSVC_C_ARCHITECTURE_ID STREQUAL "ARM64")
+            set(HAVE_ACLE_FLAG TRUE)
+        endif()
+    else()
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+            if(NOT NATIVEFLAG)
+                set(ACLEFLAG "-march=armv8-a+crc" CACHE INTERNAL "Compiler option to enable ACLE support")
+            endif()
+        endif()
+        # Check whether compiler supports ACLE flag
+        set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+        check_c_source_compiles(
+            "int main() { return 0; }"
+            HAVE_ACLE_FLAG FAIL_REGEX "not supported")
+        if(NOT NATIVEFLAG AND NOT HAVE_ACLE_FLAG)
+            set(ACLEFLAG "-march=armv8-a+crc+simd" CACHE INTERNAL "Compiler option to enable ACLE support" FORCE)
+            # Check whether compiler supports ACLE flag
+            set(CMAKE_REQUIRED_FLAGS "${ACLEFLAG}")
+            check_c_source_compiles(
+                "int main() { return 0; }"
+                HAVE_ACLE_FLAG2 FAIL_REGEX "not supported")
+            set(HAVE_ACLE_FLAG ${HAVE_ACLE_FLAG2} CACHE INTERNAL "Have compiler option to enable ACLE intrinsics" FORCE)
+            unset(HAVE_ACLE_FLAG2 CACHE) # Don't cache this internal variable
+        endif()
+        set(CMAKE_REQUIRED_FLAGS)
+    endif()
+endmacro()
+
+macro(check_armv6_compiler_flag)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            check_c_compiler_flag("-march=armv6" HAVE_MARCH_ARMV6)
+            if(HAVE_MARCH_ARMV6)
+                set(ARMV6FLAG "-march=armv6" CACHE INTERNAL "Compiler option to enable ARMv6 support")
+            endif()
+        endif()
+    endif()
+    # Check whether compiler supports ARMv6 inline asm
+    set(CMAKE_REQUIRED_FLAGS "${ARMV6FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "unsigned int f(unsigned int a, unsigned int b) {
+            unsigned int c;
+            __asm__ __volatile__ ( \"uqsub16 %0, %1, %2\" : \"=r\" (c) : \"r\" (a), \"r\" (b) );
+            return (int)c;
+        }
+        int main(void) { return f(1,2); }"
+        HAVE_ARMV6_INLINE_ASM
+    )
+    # Check whether compiler supports ARMv6 intrinsics
+    check_c_source_compiles(
+        "#if defined(_MSC_VER)
+        #include <intrin.h>
+        #else
+        #include <arm_acle.h>
+        #endif
+        unsigned int f(unsigned int a, unsigned int b) {
+        #if defined(_MSC_VER)
+            return _arm_uqsub16(a, b);
+        #else
+            return __uqsub16(a, b);
+        #endif
+        }
+        int main(void) { return 0; }"
+        HAVE_ARMV6_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_avx512_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+        if(CMAKE_HOST_UNIX OR APPLE)
+            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
+        else()
+            set(AVX512FLAG "/arch:AVX512")
+        endif()
+    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
+            # instruction scheduling unless you specify a reasonable -mtune= target
+            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
+            if(NOT MSVC)
+                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
+                if(HAVE_CASCADE_LAKE)
+                    set(AVX512FLAG "${AVX512FLAG} -mtune=cascadelake")
+                else()
+                    set(AVX512FLAG "${AVX512FLAG} -mtune=skylake-avx512")
+                endif()
+                unset(HAVE_CASCADE_LAKE)
+            endif()
+        endif()
+    elseif(MSVC)
+        set(AVX512FLAG "/arch:AVX512")
+    endif()
+    # Check whether compiler supports AVX512 intrinsics
+    set(CMAKE_REQUIRED_FLAGS "${AVX512FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <immintrin.h>
+        __m512i f(__m512i y) {
+          __m512i x = _mm512_set1_epi8(2);
+          return _mm512_sub_epi8(x, y);
+        }
+        int main(void) { return 0; }"
+        HAVE_AVX512_INTRIN
+    )
+
+    # Evidently both GCC and clang were late to implementing these
+    check_c_source_compiles(
+        "#include <immintrin.h>
+        __mmask16 f(__mmask16 x) { return _knot_mask16(x); }
+        int main(void) { return 0; }"
+        HAVE_MASK_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_avx512vnni_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+        if(CMAKE_HOST_UNIX OR APPLE)
+            set(AVX512VNNIFLAG "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
+        else()
+            set(AVX512VNNIFLAG "/arch:AVX512")
+        endif()
+    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
+            if(NOT MSVC)
+                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
+                if(HAVE_CASCADE_LAKE)
+                    set(AVX512VNNIFLAG "${AVX512VNNIFLAG} -mtune=cascadelake")
+                else()
+                    set(AVX512VNNIFLAG "${AVX512VNNIFLAG} -mtune=skylake-avx512")
+                endif()
+                unset(HAVE_CASCADE_LAKE)
+            endif()
+        endif()
+    elseif(MSVC)
+        set(AVX512VNNIFLAG "/arch:AVX512")
+    endif()
+
+    # Check whether compiler supports AVX512vnni intrinsics
+    set(CMAKE_REQUIRED_FLAGS "${AVX512VNNIFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <immintrin.h>
+        __m512i f(__m512i x, __m512i y) {
+            __m512i z = _mm512_setzero_epi32();
+            return _mm512_dpbusd_epi32(z, x, y);
+        }
+        int main(void) { return 0; }"
+        HAVE_AVX512VNNI_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_avx2_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+        if(CMAKE_HOST_UNIX OR APPLE)
+            set(AVX2FLAG "-mavx2")
+        else()
+            set(AVX2FLAG "/arch:AVX2")
+        endif()
+    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(AVX2FLAG "-mavx2")
+        endif()
+    elseif(MSVC)
+        set(AVX2FLAG "/arch:AVX2")
+    endif()
+    # Check whether compiler supports AVX2 intrinics
+    set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <immintrin.h>
+        __m256i f(__m256i x) {
+            const __m256i y = _mm256_set1_epi16(1);
+            return _mm256_subs_epu16(x, y);
+        }
+        int main(void) { return 0; }"
+        HAVE_AVX2_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_neon_compiler_flag)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            if("${ARCH}" MATCHES "aarch64")
+                set(NEONFLAG "-march=armv8-a+simd")
+            else()
+                set(NEONFLAG "-mfpu=neon")
+            endif()
+        endif()
+    endif()
+    # Check whether compiler supports NEON flag
+    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#if defined(_M_ARM64) || defined(_M_ARM64EC)
+        #  include <arm64_neon.h>
+        #else
+        #  include <arm_neon.h>
+        #endif
+        int main() { return 0; }"
+        NEON_AVAILABLE FAIL_REGEX "not supported")
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_neon_ld4_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            if("${ARCH}" MATCHES "aarch64")
+                set(NEONFLAG "-march=armv8-a+simd")
+            else()
+                set(NEONFLAG "-mfpu=neon")
+            endif()
+        endif()
+    endif()
+    # Check whether compiler supports loading 4 neon vecs into a register range
+    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
+        #  include <arm64_neon.h>
+        #else
+        #  include <arm_neon.h>
+        #endif
+        int32x4x4_t f(int var[16]) { return vld1q_s32_x4(var); }
+        int main(void) { return 0; }"
+        NEON_HAS_LD4)
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_pclmulqdq_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(PCLMULFLAG "-mpclmul")
+        endif()
+    endif()
+    # Check whether compiler supports PCLMULQDQ intrinsics
+    if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
+        # The pclmul code currently crashes on Mac in 32bit mode. Avoid for now.
+        set(CMAKE_REQUIRED_FLAGS "${PCLMULFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+        check_c_source_compiles(
+            "#include <immintrin.h>
+            #include <wmmintrin.h>
+            __m128i f(__m128i a, __m128i b) { return _mm_clmulepi64_si128(a, b, 0x10); }
+            int main(void) { return 0; }"
+            HAVE_PCLMULQDQ_INTRIN
+        )
+        set(CMAKE_REQUIRED_FLAGS)
+    else()
+        set(HAVE_PCLMULQDQ_INTRIN OFF)
+    endif()
+endmacro()
+
+macro(check_vpclmulqdq_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(VPCLMULFLAG "-mvpclmulqdq -mavx512f")
+        endif()
+    endif()
+    # Check whether compiler supports VPCLMULQDQ intrinsics
+    if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
+        set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+        check_c_source_compiles(
+            "#include <immintrin.h>
+            #include <wmmintrin.h>
+            __m512i f(__m512i a) {
+                __m512i b = _mm512_setzero_si512();
+                return _mm512_clmulepi64_epi128(a, b, 0x10);
+            }
+            int main(void) { return 0; }"
+            HAVE_VPCLMULQDQ_INTRIN
+        )
+        set(CMAKE_REQUIRED_FLAGS)
+    else()
+        set(HAVE_VPCLMULQDQ_INTRIN OFF)
+    endif()
+endmacro()
+
+macro(check_ppc_intrinsics)
+    # Check if compiler supports AltiVec
+    set(CMAKE_REQUIRED_FLAGS "-maltivec ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <altivec.h>
+        int main(void)
+        {
+            vector int a = vec_splats(0);
+            vector int b = vec_splats(0);
+            a = vec_add(a, b);
+            return 0;
+        }"
+        HAVE_ALTIVEC
+        )
+    set(CMAKE_REQUIRED_FLAGS)
+
+    if(HAVE_ALTIVEC)
+        set(PPCFLAGS "-maltivec")
+    endif()
+
+    set(CMAKE_REQUIRED_FLAGS "-maltivec -mno-vsx ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <altivec.h>
+        int main(void)
+        {
+            vector int a = vec_splats(0);
+            vector int b = vec_splats(0);
+            a = vec_add(a, b);
+            return 0;
+        }"
+        HAVE_NOVSX
+        )
+    set(CMAKE_REQUIRED_FLAGS)
+
+    if(HAVE_NOVSX)
+        set(PPCFLAGS "${PPCFLAGS} -mno-vsx")
+    endif()
+
+    # Check if we have what we need for AltiVec optimizations
+    set(CMAKE_REQUIRED_FLAGS "${PPCFLAGS} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <sys/auxv.h>
+        #ifdef __FreeBSD__
+        #include <machine/cpu.h>
+        #endif
+        int main() {
+        #ifdef __FreeBSD__
+            unsigned long hwcap;
+            elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+            return (hwcap & PPC_FEATURE_HAS_ALTIVEC);
+        #else
+            return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC);
+        #endif
+        }"
+        HAVE_VMX
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_power8_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(POWER8FLAG "-mcpu=power8")
+        endif()
+    endif()
+    # Check if we have what we need for POWER8 optimizations
+    set(CMAKE_REQUIRED_FLAGS "${POWER8FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <sys/auxv.h>
+        #ifdef __FreeBSD__
+        #include <machine/cpu.h>
+        #endif
+        int main() {
+        #ifdef __FreeBSD__
+            unsigned long hwcap;
+            elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
+            return (hwcap & PPC_FEATURE2_ARCH_2_07);
+        #else
+            return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07);
+        #endif
+        }"
+        HAVE_POWER8_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_rvv_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(RISCVFLAG "-march=rv64gcv")
+        endif()
+    endif()
+    # Check whether compiler supports RVV
+    set(CMAKE_REQUIRED_FLAGS "${RISCVFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <riscv_vector.h>
+        int main() {
+            return 0;
+        }"
+        HAVE_RVV_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_s390_intrinsics)
+    check_c_source_compiles(
+        "#include <sys/auxv.h>
+        #ifndef HWCAP_S390_VXRS
+        #define HWCAP_S390_VXRS HWCAP_S390_VX
+        #endif
+        int main() {
+            return (getauxval(AT_HWCAP) & HWCAP_S390_VXRS);
+        }"
+        HAVE_S390_INTRIN
+    )
+endmacro()
+
+macro(check_power9_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(POWER9FLAG "-mcpu=power9")
+        endif()
+    endif()
+    # Check if we have what we need for POWER9 optimizations
+    set(CMAKE_REQUIRED_FLAGS "${POWER9FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <sys/auxv.h>
+        #ifdef __FreeBSD__
+        #include <machine/cpu.h>
+        #endif
+        int main() {
+        #ifdef __FreeBSD__
+            unsigned long hwcap;
+            elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap));
+            return (hwcap & PPC_FEATURE2_ARCH_3_00);
+        #else
+            return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_3_00);
+        #endif
+        }"
+        HAVE_POWER9_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_sse2_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+        if(CMAKE_HOST_UNIX OR APPLE)
+            set(SSE2FLAG "-msse2")
+        else()
+            set(SSE2FLAG "/arch:SSE2")
+        endif()
+    elseif(MSVC)
+        if(NOT "${ARCH}" MATCHES "x86_64")
+            set(SSE2FLAG "/arch:SSE2")
+        endif()
+    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(SSE2FLAG "-msse2")
+        endif()
+    endif()
+    # Check whether compiler supports SSE2 intrinsics
+    set(CMAKE_REQUIRED_FLAGS "${SSE2FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <immintrin.h>
+        __m128i f(__m128i x, __m128i y) { return _mm_sad_epu8(x, y); }
+        int main(void) { return 0; }"
+        HAVE_SSE2_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_ssse3_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+        if(CMAKE_HOST_UNIX OR APPLE)
+            set(SSSE3FLAG "-mssse3")
+        else()
+            set(SSSE3FLAG "/arch:SSSE3")
+        endif()
+    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(SSSE3FLAG "-mssse3")
+        endif()
+    endif()
+    # Check whether compiler supports SSSE3 intrinsics
+    set(CMAKE_REQUIRED_FLAGS "${SSSE3FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <immintrin.h>
+        __m128i f(__m128i u) {
+          __m128i v = _mm_set1_epi32(1);
+          return _mm_hadd_epi32(u, v);
+        }
+        int main(void) { return 0; }"
+        HAVE_SSSE3_INTRIN
+    )
+endmacro()
+
+macro(check_sse42_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "Intel")
+        if(CMAKE_HOST_UNIX OR APPLE)
+            set(SSE42FLAG "-msse4.2")
+        else()
+            set(SSE42FLAG "/arch:SSE4.2")
+        endif()
+    elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(SSE42FLAG "-msse4.2")
+        endif()
+    endif()
+    # Check whether compiler supports SSE4.2 intrinsics
+    set(CMAKE_REQUIRED_FLAGS "${SSE42FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <nmmintrin.h>
+        unsigned int f(unsigned int a, unsigned int b) { return _mm_crc32_u32(a, b); }
+        int main(void) { return 0; }"
+        HAVE_SSE42_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_vgfma_intrinsics)
+    if(NOT NATIVEFLAG)
+        set(VGFMAFLAG "-march=z13")
+        if(CMAKE_C_COMPILER_ID MATCHES "GNU")
+            set(VGFMAFLAG "${VGFMAFLAG} -mzarch")
+        endif()
+        if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+            set(VGFMAFLAG "${VGFMAFLAG} -fzvector")
+        endif()
+    endif()
+    # Check whether compiler supports "VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE" intrinsic
+    set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#include <vecintrin.h>
+        int main(void) {
+            unsigned long long a __attribute__((vector_size(16))) = { 0 };
+            unsigned long long b __attribute__((vector_size(16))) = { 0 };
+            unsigned char c __attribute__((vector_size(16))) = { 0 };
+            c = vec_gfmsum_accum_128(a, b, c);
+            return c[0];
+        }"
+        HAVE_VGFMA_INTRIN FAIL_REGEX "not supported")
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
+macro(check_xsave_intrinsics)
+    if(NOT NATIVEFLAG AND NOT MSVC)
+        set(XSAVEFLAG "-mxsave")
+    endif()
+    set(CMAKE_REQUIRED_FLAGS "${XSAVEFLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
+    check_c_source_compiles(
+        "#ifdef _MSC_VER
+        #  include <intrin.h>
+        #else
+        #  include <x86gprintrin.h>
+        #endif
+        unsigned int f(unsigned int a) { return (int) _xgetbv(a); }
+        int main(void) { return 0; }"
+        HAVE_XSAVE_INTRIN FAIL_REGEX "not supported")
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
diff --git a/3rdparty/zlib-ng/cmake/fallback-macros.cmake b/3rdparty/zlib-ng/cmake/fallback-macros.cmake
new file mode 100644
index 000000000000..8bc6cf25be92
--- /dev/null
+++ b/3rdparty/zlib-ng/cmake/fallback-macros.cmake
@@ -0,0 +1,19 @@
+# fallback-macros.cmake -- CMake fallback macros
+# Copyright (C) 2022 Nathan Moinvaziri
+# Licensed under the Zlib license, see LICENSE.md for details
+
+# CMake less than version 3.5.2
+if(NOT COMMAND add_compile_options)
+    macro(add_compile_options options)
+        string(APPEND CMAKE_C_FLAGS ${options})
+        string(APPEND CMAKE_CXX_FLAGS ${options})
+    endmacro()
+endif()
+
+# CMake less than version 3.14
+if(NOT COMMAND add_link_options)
+    macro(add_link_options options)
+        string(APPEND CMAKE_EXE_LINKER_FLAGS ${options})
+        string(APPEND CMAKE_SHARED_LINKER_FLAGS ${options})
+    endmacro()
+endif()
diff --git a/3rdparty/zlib-ng/compare256.c b/3rdparty/zlib-ng/compare256.c
new file mode 100644
index 000000000000..82551cdd579e
--- /dev/null
+++ b/3rdparty/zlib-ng/compare256.c
@@ -0,0 +1,180 @@
+/* compare256.c -- 256 byte memory comparison with match length return
+ * Copyright (C) 2020 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "fallback_builtins.h"
+
+/* ALIGNED, byte comparison */
+static inline uint32_t compare256_c_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src0 += 1, src1 += 1, len += 1;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_c_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_c
+#define COMPARE256          compare256_c_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_c
+#define COMPARE256          compare256_c_static
+
+#include "match_tpl.h"
+
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+/* 16-bit unaligned integer comparison */
+static inline uint32_t compare256_unaligned_16_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        if (zng_memcmp_2(src0, src1) != 0)
+            return len + (*src0 == *src1);
+        src0 += 2, src1 += 2, len += 2;
+
+        if (zng_memcmp_2(src0, src1) != 0)
+            return len + (*src0 == *src1);
+        src0 += 2, src1 += 2, len += 2;
+
+        if (zng_memcmp_2(src0, src1) != 0)
+            return len + (*src0 == *src1);
+        src0 += 2, src1 += 2, len += 2;
+
+        if (zng_memcmp_2(src0, src1) != 0)
+            return len + (*src0 == *src1);
+        src0 += 2, src1 += 2, len += 2;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_unaligned_16_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_unaligned_16
+#define COMPARE256          compare256_unaligned_16_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_unaligned_16
+#define COMPARE256          compare256_unaligned_16_static
+
+#include "match_tpl.h"
+
+#ifdef HAVE_BUILTIN_CTZ
+/* 32-bit unaligned integer comparison */
+static inline uint32_t compare256_unaligned_32_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        uint32_t sv, mv, diff;
+
+        memcpy(&sv, src0, sizeof(sv));
+        memcpy(&mv, src1, sizeof(mv));
+
+        diff = sv ^ mv;
+        if (diff) {
+            uint32_t match_byte = __builtin_ctz(diff) / 8;
+            return len + match_byte;
+        }
+
+        src0 += 4, src1 += 4, len += 4;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_unaligned_32_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_unaligned_32
+#define COMPARE256          compare256_unaligned_32_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_unaligned_32
+#define COMPARE256          compare256_unaligned_32_static
+
+#include "match_tpl.h"
+
+#endif
+
+#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+/* UNALIGNED64_OK, 64-bit integer comparison */
+static inline uint32_t compare256_unaligned_64_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        uint64_t sv, mv, diff;
+
+        memcpy(&sv, src0, sizeof(sv));
+        memcpy(&mv, src1, sizeof(mv));
+
+        diff = sv ^ mv;
+        if (diff) {
+            uint64_t match_byte = __builtin_ctzll(diff) / 8;
+            return len + (uint32_t)match_byte;
+        }
+
+        src0 += 8, src1 += 8, len += 8;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_unaligned_64_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_unaligned_64
+#define COMPARE256          compare256_unaligned_64_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_unaligned_64
+#define COMPARE256          compare256_unaligned_64_static
+
+#include "match_tpl.h"
+
+#endif
+
+#endif
diff --git a/3rdparty/zlib-ng/compare256_rle.h b/3rdparty/zlib-ng/compare256_rle.h
new file mode 100644
index 000000000000..0f3998d4a3f3
--- /dev/null
+++ b/3rdparty/zlib-ng/compare256_rle.h
@@ -0,0 +1,134 @@
+/* compare256_rle.h -- 256 byte run-length encoding comparison
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "fallback_builtins.h"
+
+typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);
+
+/* ALIGNED, byte comparison */
+static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        if (*src0 != *src1)
+            return len;
+        src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src1 += 1, len += 1;
+        if (*src0 != *src1)
+            return len;
+        src1 += 1, len += 1;
+    } while (len < 256);
+
+    return 256;
+}
+
+#ifdef UNALIGNED_OK
+/* 16-bit unaligned integer comparison */
+static inline uint32_t compare256_rle_unaligned_16(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+    uint16_t src0_cmp, src1_cmp;
+
+    memcpy(&src0_cmp, src0, sizeof(src0_cmp));
+
+    do {
+        memcpy(&src1_cmp, src1, sizeof(src1_cmp));
+        if (src0_cmp != src1_cmp)
+            return len + (*src0 == *src1);
+        src1 += 2, len += 2;
+        memcpy(&src1_cmp, src1, sizeof(src1_cmp));
+        if (src0_cmp != src1_cmp)
+            return len + (*src0 == *src1);
+        src1 += 2, len += 2;
+        memcpy(&src1_cmp, src1, sizeof(src1_cmp));
+        if (src0_cmp != src1_cmp)
+            return len + (*src0 == *src1);
+        src1 += 2, len += 2;
+        memcpy(&src1_cmp, src1, sizeof(src1_cmp));
+        if (src0_cmp != src1_cmp)
+            return len + (*src0 == *src1);
+        src1 += 2, len += 2;
+    } while (len < 256);
+
+    return 256;
+}
+
+#ifdef HAVE_BUILTIN_CTZ
+/* 32-bit unaligned integer comparison */
+static inline uint32_t compare256_rle_unaligned_32(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t sv, len = 0;
+    uint16_t src0_cmp;
+
+    memcpy(&src0_cmp, src0, sizeof(src0_cmp));
+    sv = ((uint32_t)src0_cmp << 16) | src0_cmp;
+
+    do {
+        uint32_t mv, diff;
+
+        memcpy(&mv, src1, sizeof(mv));
+
+        diff = sv ^ mv;
+        if (diff) {
+            uint32_t match_byte = __builtin_ctz(diff) / 8;
+            return len + match_byte;
+        }
+
+        src1 += 4, len += 4;
+    } while (len < 256);
+
+    return 256;
+}
+
+#endif
+
+#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+/* 64-bit unaligned integer comparison */
+static inline uint32_t compare256_rle_unaligned_64(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t src0_cmp32, len = 0;
+    uint16_t src0_cmp;
+    uint64_t sv;
+
+    memcpy(&src0_cmp, src0, sizeof(src0_cmp));
+    src0_cmp32 = ((uint32_t)src0_cmp << 16) | src0_cmp;
+    sv = ((uint64_t)src0_cmp32 << 32) | src0_cmp32;
+
+    do {
+        uint64_t mv, diff;
+
+        memcpy(&mv, src1, sizeof(mv));
+
+        diff = sv ^ mv;
+        if (diff) {
+            uint64_t match_byte = __builtin_ctzll(diff) / 8;
+            return len + (uint32_t)match_byte;
+        }
+
+        src1 += 8, len += 8;
+    } while (len < 256);
+
+    return 256;
+}
+
+#endif
+
+#endif
+
diff --git a/3rdparty/zlib-ng/compress.c b/3rdparty/zlib-ng/compress.c
new file mode 100644
index 000000000000..66118e4f4b71
--- /dev/null
+++ b/3rdparty/zlib-ng/compress.c
@@ -0,0 +1,98 @@
+/* compress.c -- compress a memory buffer
+ * Copyright (C) 1995-2005, 2014, 2016 Jean-loup Gailly, Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil.h"
+
+/* ===========================================================================
+ *  Architecture-specific hooks.
+ */
+#ifdef S390_DFLTCC_DEFLATE
+#  include "arch/s390/dfltcc_common.h"
+#else
+/* Returns the upper bound on compressed data length based on uncompressed data length, assuming default settings.
+ * Zero means that arch-specific deflation code behaves identically to the regular zlib-ng algorithms. */
+#  define DEFLATE_BOUND_COMPLEN(source_len) 0
+#endif
+
+/* ===========================================================================
+     Compresses the source buffer into the destination buffer. The level
+   parameter has the same meaning as in deflateInit.  sourceLen is the byte
+   length of the source buffer. Upon entry, destLen is the total size of the
+   destination buffer, which must be at least 0.1% larger than sourceLen plus
+   12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+
+     compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+   Z_STREAM_ERROR if the level parameter is invalid.
+*/
+int Z_EXPORT PREFIX(compress2)(unsigned char *dest, z_uintmax_t *destLen, const unsigned char *source,
+                        z_uintmax_t sourceLen, int level) {
+    PREFIX3(stream) stream;
+    int err;
+    const unsigned int max = (unsigned int)-1;
+    z_size_t left;
+
+    left = *destLen;
+    *destLen = 0;
+
+    stream.zalloc = NULL;
+    stream.zfree = NULL;
+    stream.opaque = NULL;
+
+    err = PREFIX(deflateInit)(&stream, level);
+    if (err != Z_OK)
+        return err;
+
+    stream.next_out = dest;
+    stream.avail_out = 0;
+    stream.next_in = (z_const unsigned char *)source;
+    stream.avail_in = 0;
+
+    do {
+        if (stream.avail_out == 0) {
+            stream.avail_out = left > (unsigned long)max ? max : (unsigned int)left;
+            left -= stream.avail_out;
+        }
+        if (stream.avail_in == 0) {
+            stream.avail_in = sourceLen > (unsigned long)max ? max : (unsigned int)sourceLen;
+            sourceLen -= stream.avail_in;
+        }
+        err = PREFIX(deflate)(&stream, sourceLen ? Z_NO_FLUSH : Z_FINISH);
+    } while (err == Z_OK);
+
+    *destLen = stream.total_out;
+    PREFIX(deflateEnd)(&stream);
+    return err == Z_STREAM_END ? Z_OK : err;
+}
+
+/* ===========================================================================
+ */
+int Z_EXPORT PREFIX(compress)(unsigned char *dest, z_uintmax_t *destLen, const unsigned char *source, z_uintmax_t sourceLen) {
+    return PREFIX(compress2)(dest, destLen, source, sourceLen, Z_DEFAULT_COMPRESSION);
+}
+
+/* ===========================================================================
+   If the default memLevel or windowBits for deflateInit() is changed, then
+   this function needs to be updated.
+ */
+z_uintmax_t Z_EXPORT PREFIX(compressBound)(z_uintmax_t sourceLen) {
+    z_uintmax_t complen = DEFLATE_BOUND_COMPLEN(sourceLen);
+
+    if (complen > 0)
+        /* Architecture-specific code provided an upper bound. */
+        return complen + ZLIB_WRAPLEN;
+
+#ifndef NO_QUICK_STRATEGY
+    return sourceLen                       /* The source size itself */
+      + (sourceLen == 0 ? 1 : 0)           /* Always at least one byte for any input */
+      + (sourceLen < 9 ? 1 : 0)            /* One extra byte for lengths less than 9 */
+      + DEFLATE_QUICK_OVERHEAD(sourceLen)  /* Source encoding overhead, padded to next full byte */
+      + DEFLATE_BLOCK_OVERHEAD             /* Deflate block overhead bytes */
+      + ZLIB_WRAPLEN;                      /* zlib wrapper */
+#else
+    return sourceLen + (sourceLen >> 4) + 7 + ZLIB_WRAPLEN;
+#endif
+}
diff --git a/3rdparty/zlib-ng/cpu_features.c b/3rdparty/zlib-ng/cpu_features.c
new file mode 100644
index 000000000000..3585172e5d20
--- /dev/null
+++ b/3rdparty/zlib-ng/cpu_features.c
@@ -0,0 +1,23 @@
+/* cpu_features.c -- CPU architecture feature check
+ * Copyright (C) 2017 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "cpu_features.h"
+#include <string.h>
+
+Z_INTERNAL void cpu_check_features(struct cpu_features *features) {
+    memset(features, 0, sizeof(struct cpu_features));
+#if defined(X86_FEATURES)
+    x86_check_features(&features->x86);
+#elif defined(ARM_FEATURES)
+    arm_check_features(&features->arm);
+#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
+    power_check_features(&features->power);
+#elif defined(S390_FEATURES)
+    s390_check_features(&features->s390);
+#elif defined(RISCV_FEATURES)
+    riscv_check_features(&features->riscv);
+#endif
+}
diff --git a/3rdparty/zlib-ng/cpu_features.h b/3rdparty/zlib-ng/cpu_features.h
new file mode 100644
index 000000000000..00fa6c747c5f
--- /dev/null
+++ b/3rdparty/zlib-ng/cpu_features.h
@@ -0,0 +1,303 @@
+/* cpu_features.h -- CPU architecture feature check
+ * Copyright (C) 2017 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CPU_FEATURES_H_
+#define CPU_FEATURES_H_
+
+#include "adler32_fold.h"
+#include "crc32_fold.h"
+
+#if defined(X86_FEATURES)
+#  include "arch/x86/x86_features.h"
+#  include "fallback_builtins.h"
+#elif defined(ARM_FEATURES)
+#  include "arch/arm/arm_features.h"
+#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
+#  include "arch/power/power_features.h"
+#elif defined(S390_FEATURES)
+#  include "arch/s390/s390_features.h"
+#elif defined(RISCV_FEATURES)
+#  include "arch/riscv/riscv_features.h"
+#endif
+
+struct cpu_features {
+#if defined(X86_FEATURES)
+    struct x86_cpu_features x86;
+#elif defined(ARM_FEATURES)
+    struct arm_cpu_features arm;
+#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
+    struct power_cpu_features power;
+#elif defined(S390_FEATURES)
+    struct s390_cpu_features s390;
+#elif defined(RISCV_FEATURES)
+    struct riscv_cpu_features riscv;
+#else
+    char empty;
+#endif
+};
+
+extern void cpu_check_features(struct cpu_features *features);
+
+/* adler32 */
+typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
+
+extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
+#ifdef ARM_NEON
+extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
+#endif
+#ifdef PPC_VMX
+extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
+#endif
+#ifdef RISCV_RVV
+extern uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
+#endif
+#ifdef X86_SSSE3
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
+#endif
+#ifdef X86_AVX2
+extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
+#endif
+#ifdef X86_AVX512
+extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
+#endif
+#ifdef X86_AVX512VNNI
+extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
+#endif
+#ifdef POWER8_VSX
+extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
+#endif
+
+/* adler32 folding */
+#ifdef RISCV_RVV
+extern uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_SSE42
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX2
+extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX512
+extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX512VNNI
+extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
+/* CRC32 folding */
+#ifdef X86_PCLMULQDQ_CRC
+extern uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
+extern void     crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+extern void     crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
+extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
+extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
+#endif
+#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+extern uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
+extern void     crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+extern void     crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
+extern uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
+extern uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
+#endif
+
+/* memory chunking */
+extern uint32_t chunksize_c(void);
+extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#ifdef X86_SSE2
+extern uint32_t chunksize_sse2(void);
+extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef X86_SSSE3
+extern uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef X86_AVX2
+extern uint32_t chunksize_avx2(void);
+extern uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef ARM_NEON
+extern uint32_t chunksize_neon(void);
+extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef POWER8_VSX
+extern uint32_t chunksize_power8(void);
+extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef RISCV_RVV
+extern uint32_t chunksize_rvv(void);
+extern uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+
+#ifdef ZLIB_COMPAT
+typedef struct z_stream_s z_stream;
+#else
+typedef struct zng_stream_s zng_stream;
+#endif
+
+/* inflate fast loop */
+extern void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
+#ifdef X86_SSE2
+extern void inflate_fast_sse2(PREFIX3(stream) *strm, uint32_t start);
+#endif
+#ifdef X86_SSSE3
+extern void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
+#endif
+#ifdef X86_AVX2
+extern void inflate_fast_avx2(PREFIX3(stream) *strm, uint32_t start);
+#endif
+#ifdef ARM_NEON
+extern void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
+#endif
+#ifdef POWER8_VSX
+extern void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
+#endif
+#ifdef RISCV_RVV
+extern void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start);
+#endif
+
+/* CRC32 */
+typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
+
+extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
+#ifdef ARM_ACLE
+extern uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);
+#elif defined(POWER8_VSX)
+extern uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
+#elif defined(S390_CRC32_VX)
+extern uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len);
+#endif
+
+/* compare256 */
+typedef uint32_t (*compare256_func)(const uint8_t *src0, const uint8_t *src1);
+
+extern uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+extern uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);
+#ifdef HAVE_BUILTIN_CTZ
+extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1);
+#endif
+#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
+#endif
+#endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
+#endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
+#endif
+#ifdef POWER9
+extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
+#endif
+#ifdef RISCV_RVV
+extern uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
+#endif
+
+#ifdef DEFLATE_H_
+/* insert_string */
+extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
+#ifdef X86_SSE42
+extern void insert_string_sse42(deflate_state *const s, const uint32_t str, uint32_t count);
+#elif defined(ARM_ACLE)
+extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
+#endif
+
+/* longest_match */
+extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
+#ifdef HAVE_BUILTIN_CTZ
+extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
+#endif
+#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
+#endif
+#endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
+#endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
+#endif
+#ifdef POWER9
+extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
+#endif
+#ifdef RISCV_RVV
+extern uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
+#endif
+
+/* longest_match_slow */
+extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
+extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
+#ifdef UNALIGNED64_OK
+extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
+#endif
+#endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
+#endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
+#endif
+#ifdef POWER9
+extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
+#endif
+#ifdef RISCV_RVV
+extern uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
+#endif
+
+/* quick_insert_string */
+extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
+#ifdef X86_SSE42
+extern Pos quick_insert_string_sse42(deflate_state *const s, const uint32_t str);
+#elif defined(ARM_ACLE)
+extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
+#endif
+
+/* slide_hash */
+typedef void (*slide_hash_func)(deflate_state *s);
+
+#ifdef X86_SSE2
+extern void slide_hash_sse2(deflate_state *s);
+#endif
+#if defined(ARM_SIMD)
+extern void slide_hash_armv6(deflate_state *s);
+#endif
+#if defined(ARM_NEON)
+extern void slide_hash_neon(deflate_state *s);
+#endif
+#if defined(PPC_VMX)
+extern void slide_hash_vmx(deflate_state *s);
+#endif
+#if defined(POWER8_VSX)
+extern void slide_hash_power8(deflate_state *s);
+#endif
+#if defined(RISCV_RVV)
+extern void slide_hash_rvv(deflate_state *s);
+#endif
+#ifdef X86_AVX2
+extern void slide_hash_avx2(deflate_state *s);
+#endif
+
+/* update_hash */
+extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val);
+#ifdef X86_SSE42
+extern uint32_t update_hash_sse42(deflate_state *const s, uint32_t h, uint32_t val);
+#elif defined(ARM_ACLE)
+extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val);
+#endif
+#endif
+
+#endif
diff --git a/3rdparty/zlib-ng/crc32_braid.c b/3rdparty/zlib-ng/crc32_braid.c
new file mode 100644
index 000000000000..96754b53dff9
--- /dev/null
+++ b/3rdparty/zlib-ng/crc32_braid.c
@@ -0,0 +1,267 @@
+/* crc32_braid.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2022 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * This interleaved implementation of a CRC makes use of pipelined multiple
+ * arithmetic-logic units, commonly found in modern CPU cores. It is due to
+ * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
+ */
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "functable.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+
+/* ========================================================================= */
+
+const uint32_t * Z_EXPORT PREFIX(get_crc_table)(void) {
+    return (const uint32_t *)crc_table;
+}
+
+#ifdef ZLIB_COMPAT
+unsigned long Z_EXPORT PREFIX(crc32_z)(unsigned long crc, const unsigned char *buf, size_t len) {
+    if (buf == NULL) return 0;
+
+    return (unsigned long)functable.crc32((uint32_t)crc, buf, len);
+}
+#else
+uint32_t Z_EXPORT PREFIX(crc32_z)(uint32_t crc, const unsigned char *buf, size_t len) {
+    if (buf == NULL) return 0;
+
+    return functable.crc32(crc, buf, len);
+}
+#endif
+
+#ifdef ZLIB_COMPAT
+unsigned long Z_EXPORT PREFIX(crc32)(unsigned long crc, const unsigned char *buf, unsigned int len) {
+    return (unsigned long)PREFIX(crc32_z)((uint32_t)crc, buf, len);
+}
+#else
+uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t len) {
+    return PREFIX(crc32_z)(crc, buf, len);
+}
+#endif
+
+/* ========================================================================= */
+
+/*
+  A CRC of a message is computed on N braids of words in the message, where
+  each word consists of W bytes (4 or 8). If N is 3, for example, then three
+  running sparse CRCs are calculated respectively on each braid, at these
+  indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
+  This is done starting at a word boundary, and continues until as many blocks
+  of N * W bytes as are available have been processed. The results are combined
+  into a single CRC at the end. For this code, N must be in the range 1..6 and
+  W must be 4 or 8. The upper limit on N can be increased if desired by adding
+  more #if blocks, extending the patterns apparent in the code. In addition,
+  crc32 tables would need to be regenerated, if the maximum N value is increased.
+
+  N and W are chosen empirically by benchmarking the execution time on a given
+  processor. The choices for N and W below were based on testing on Intel Kaby
+  Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
+  Octeon II processors. The Intel, AMD, and ARM processors were all fastest
+  with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
+  They were all tested with either gcc or clang, all using the -O3 optimization
+  level. Your mileage may vary.
+*/
+
+/* ========================================================================= */
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#  define ZSWAPWORD(word) (word)
+#  define BRAID_TABLE crc_braid_table
+#elif BYTE_ORDER == BIG_ENDIAN
+#  if W == 8
+#    define ZSWAPWORD(word) ZSWAP64(word)
+#  elif W == 4
+#    define ZSWAPWORD(word) ZSWAP32(word)
+#  endif
+#  define BRAID_TABLE crc_braid_big_table
+#else
+#  error "No endian defined"
+#endif
+#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
+#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+
+/* ========================================================================= */
+#ifdef W
+/*
+  Return the CRC of the W bytes in the word_t data, taking the
+  least-significant byte of the word as the first byte of data, without any pre
+  or post conditioning. This is used to combine the CRCs of each braid.
+ */
+#if BYTE_ORDER == LITTLE_ENDIAN
+static uint32_t crc_word(z_word_t data) {
+    int k;
+    for (k = 0; k < W; k++)
+        data = (data >> 8) ^ crc_table[data & 0xff];
+    return (uint32_t)data;
+}
+#elif BYTE_ORDER == BIG_ENDIAN
+static z_word_t crc_word(z_word_t data) {
+    int k;
+    for (k = 0; k < W; k++)
+        data = (data << 8) ^
+            crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
+    return data;
+}
+#endif /* BYTE_ORDER */
+
+#endif /* W */
+
+/* ========================================================================= */
+Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
+    Z_REGISTER uint32_t c;
+
+    /* Pre-condition the CRC */
+    c = (~crc) & 0xffffffff;
+
+#ifdef W
+    /* If provided enough bytes, do a braided CRC calculation. */
+    if (len >= N * W + W - 1) {
+        size_t blks;
+        z_word_t const *words;
+        int k;
+
+        /* Compute the CRC up to a z_word_t boundary. */
+        while (len && ((uintptr_t)buf & (W - 1)) != 0) {
+            len--;
+            DO1;
+        }
+
+        /* Compute the CRC on as many N z_word_t blocks as are available. */
+        blks = len / (N * W);
+        len -= blks * N * W;
+        words = (z_word_t const *)buf;
+
+        z_word_t crc0, word0, comb;
+#if N > 1
+        z_word_t crc1, word1;
+#if N > 2
+        z_word_t crc2, word2;
+#if N > 3
+        z_word_t crc3, word3;
+#if N > 4
+        z_word_t crc4, word4;
+#if N > 5
+        z_word_t crc5, word5;
+#endif
+#endif
+#endif
+#endif
+#endif
+        /* Initialize the CRC for each braid. */
+        crc0 = ZSWAPWORD(c);
+#if N > 1
+        crc1 = 0;
+#if N > 2
+        crc2 = 0;
+#if N > 3
+        crc3 = 0;
+#if N > 4
+        crc4 = 0;
+#if N > 5
+        crc5 = 0;
+#endif
+#endif
+#endif
+#endif
+#endif
+        /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
+        while (--blks) {
+            /* Load the word for each braid into registers. */
+            word0 = crc0 ^ words[0];
+#if N > 1
+            word1 = crc1 ^ words[1];
+#if N > 2
+            word2 = crc2 ^ words[2];
+#if N > 3
+            word3 = crc3 ^ words[3];
+#if N > 4
+            word4 = crc4 ^ words[4];
+#if N > 5
+            word5 = crc5 ^ words[5];
+#endif
+#endif
+#endif
+#endif
+#endif
+            words += N;
+
+            /* Compute and update the CRC for each word. The loop should get unrolled. */
+            crc0 = BRAID_TABLE[0][word0 & 0xff];
+#if N > 1
+            crc1 = BRAID_TABLE[0][word1 & 0xff];
+#if N > 2
+            crc2 = BRAID_TABLE[0][word2 & 0xff];
+#if N > 3
+            crc3 = BRAID_TABLE[0][word3 & 0xff];
+#if N > 4
+            crc4 = BRAID_TABLE[0][word4 & 0xff];
+#if N > 5
+            crc5 = BRAID_TABLE[0][word5 & 0xff];
+#endif
+#endif
+#endif
+#endif
+#endif
+            for (k = 1; k < W; k++) {
+                crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
+#if N > 1
+                crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
+#if N > 2
+                crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
+#if N > 3
+                crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
+#if N > 4
+                crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
+#if N > 5
+                crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
+#endif
+#endif
+#endif
+#endif
+#endif
+            }
+        }
+
+        /* Process the last block, combining the CRCs of the N braids at the same time. */
+        comb = crc_word(crc0 ^ words[0]);
+#if N > 1
+        comb = crc_word(crc1 ^ words[1] ^ comb);
+#if N > 2
+        comb = crc_word(crc2 ^ words[2] ^ comb);
+#if N > 3
+        comb = crc_word(crc3 ^ words[3] ^ comb);
+#if N > 4
+        comb = crc_word(crc4 ^ words[4] ^ comb);
+#if N > 5
+        comb = crc_word(crc5 ^ words[5] ^ comb);
+#endif
+#endif
+#endif
+#endif
+#endif
+        words += N;
+        c = ZSWAPWORD(comb);
+
+        /* Update the pointer to the remaining bytes to process. */
+        buf = (const unsigned char *)words;
+    }
+
+#endif /* W */
+
+    /* Complete the computation of the CRC on any remaining bytes. */
+    while (len >= 8) {
+        len -= 8;
+        DO8;
+    }
+    while (len) {
+        len--;
+        DO1;
+    }
+
+    /* Return the CRC, post-conditioned. */
+    return c ^ 0xffffffff;
+}
diff --git a/3rdparty/zlib-ng/crc32_braid_comb.c b/3rdparty/zlib-ng/crc32_braid_comb.c
new file mode 100644
index 000000000000..75fb47425873
--- /dev/null
+++ b/3rdparty/zlib-ng/crc32_braid_comb.c
@@ -0,0 +1,57 @@
+/* crc32_braid_comb.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2022 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * This interleaved implementation of a CRC makes use of pipelined multiple
+ * arithmetic-logic units, commonly found in modern CPU cores. It is due to
+ * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
+ */
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+#include "crc32_braid_comb_p.h"
+
+/* ========================================================================= */
+static uint32_t crc32_combine_(uint32_t crc1, uint32_t crc2, z_off64_t len2) {
+    return multmodp(x2nmodp(len2, 3), crc1) ^ crc2;
+}
+static uint32_t crc32_combine_gen_(z_off64_t len2) {
+     return x2nmodp(len2, 3);
+}
+static uint32_t crc32_combine_op_(uint32_t crc1, uint32_t crc2, const uint32_t op) {
+    return multmodp(op, crc1) ^ crc2;
+}
+
+/* ========================================================================= */
+
+#ifdef ZLIB_COMPAT
+unsigned long Z_EXPORT PREFIX(crc32_combine)(unsigned long crc1, unsigned long crc2, z_off_t len2) {
+    return (unsigned long)crc32_combine_((uint32_t)crc1, (uint32_t)crc2, len2);
+}
+unsigned long Z_EXPORT PREFIX4(crc32_combine)(unsigned long crc1, unsigned long crc2, z_off64_t len2) {
+    return (unsigned long)crc32_combine_((uint32_t)crc1, (uint32_t)crc2, len2);
+}
+unsigned long Z_EXPORT PREFIX(crc32_combine_gen)(z_off_t len2) {
+    return crc32_combine_gen_(len2);
+}
+unsigned long Z_EXPORT PREFIX4(crc32_combine_gen)(z_off64_t len2) {
+    return crc32_combine_gen_(len2);
+}
+unsigned long Z_EXPORT PREFIX(crc32_combine_op)(unsigned long crc1, unsigned long crc2, const unsigned long op) {
+    return (unsigned long)crc32_combine_op_((uint32_t)crc1, (uint32_t)crc2, (uint32_t)op);
+}
+#else
+uint32_t Z_EXPORT PREFIX4(crc32_combine)(uint32_t crc1, uint32_t crc2, z_off64_t len2) {
+    return crc32_combine_(crc1, crc2, len2);
+}
+uint32_t Z_EXPORT PREFIX(crc32_combine_gen)(z_off64_t len2) {
+    return crc32_combine_gen_(len2);
+}
+uint32_t Z_EXPORT PREFIX(crc32_combine_op)(uint32_t crc1, uint32_t crc2, const uint32_t op) {
+    return crc32_combine_op_(crc1, crc2, op);
+}
+#endif
+
+/* ========================================================================= */
diff --git a/3rdparty/zlib-ng/crc32_braid_comb_p.h b/3rdparty/zlib-ng/crc32_braid_comb_p.h
new file mode 100644
index 000000000000..a269e7f5b79f
--- /dev/null
+++ b/3rdparty/zlib-ng/crc32_braid_comb_p.h
@@ -0,0 +1,42 @@
+#ifndef CRC32_BRAID_COMB_P_H_
+#define CRC32_BRAID_COMB_P_H_
+
+/*
+  Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
+  reflected. For speed, this requires that a not be zero.
+ */
+static uint32_t multmodp(uint32_t a, uint32_t b) {
+    uint32_t m, p;
+
+    m = (uint32_t)1 << 31;
+    p = 0;
+    for (;;) {
+        if (a & m) {
+            p ^= b;
+            if ((a & (m - 1)) == 0)
+                break;
+        }
+        m >>= 1;
+        b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
+    }
+    return p;
+}
+
+/*
+  Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
+  initialized.
+ */
+static uint32_t x2nmodp(z_off64_t n, unsigned k) {
+    uint32_t p;
+
+    p = (uint32_t)1 << 31;           /* x^0 == 1 */
+    while (n) {
+        if (n & 1)
+            p = multmodp(x2n_table[k & 31], p);
+        n >>= 1;
+        k++;
+    }
+    return p;
+}
+
+#endif /* CRC32_BRAID_COMB_P_H_ */
diff --git a/3rdparty/zlib-ng/crc32_braid_p.h b/3rdparty/zlib-ng/crc32_braid_p.h
new file mode 100644
index 000000000000..1d8a07068a4c
--- /dev/null
+++ b/3rdparty/zlib-ng/crc32_braid_p.h
@@ -0,0 +1,50 @@
+#ifndef CRC32_BRAID_P_H_
+#define CRC32_BRAID_P_H_
+
+#include "zbuild.h"
+#include "zendian.h"
+
+/* Define N */
+#ifdef Z_TESTN
+#  define N Z_TESTN
+#else
+#  define N 5
+#endif
+#if N < 1 || N > 6
+#  error N must be in 1..6
+#endif
+
+/*
+  Define W and the associated z_word_t type. If W is not defined, then a
+  braided calculation is not used, and the associated tables and code are not
+  compiled.
+ */
+#ifdef Z_TESTW
+#  if Z_TESTW-1 != -1
+#    define W Z_TESTW
+#  endif
+#else
+#  ifndef W
+#    if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__)
+#      define W 8
+#    else
+#      define W 4
+#    endif
+#  endif
+#endif
+#ifdef W
+#  if W == 8
+     typedef uint64_t z_word_t;
+#  else
+#    undef W
+#    define W 4
+     typedef uint32_t z_word_t;
+#  endif
+#endif
+
+/* CRC polynomial. */
+#define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
+
+extern uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
+
+#endif /* CRC32_BRAID_P_H_ */
diff --git a/3rdparty/zlib-ng/crc32_braid_tbl.h b/3rdparty/zlib-ng/crc32_braid_tbl.h
new file mode 100644
index 000000000000..84d79a69e7dc
--- /dev/null
+++ b/3rdparty/zlib-ng/crc32_braid_tbl.h
@@ -0,0 +1,9446 @@
+#ifndef CRC32_BRAID_TBL_H_
+#define CRC32_BRAID_TBL_H_
+
+/* crc32_braid_tbl.h -- tables for braided CRC calculation
+ * Generated automatically by makecrct.c
+ */
+
+static const uint32_t crc_table[] = {
+    0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419,
+    0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4,
+    0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07,
+    0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+    0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856,
+    0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
+    0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4,
+    0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+    0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3,
+    0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a,
+    0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599,
+    0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+    0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190,
+    0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
+    0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e,
+    0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+    0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed,
+    0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+    0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3,
+    0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+    0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a,
+    0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
+    0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010,
+    0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+    0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17,
+    0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6,
+    0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615,
+    0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+    0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344,
+    0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+    0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a,
+    0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+    0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1,
+    0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c,
+    0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef,
+    0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+    0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe,
+    0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
+    0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c,
+    0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+    0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b,
+    0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+    0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1,
+    0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+    0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278,
+    0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
+    0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66,
+    0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+    0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605,
+    0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8,
+    0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b,
+    0x2d02ef8d};
+
+#ifdef W
+
+#if W == 8
+
+static const z_word_t crc_big_table[] = {
+    0x0000000000000000, 0x9630077700000000, 0x2c610eee00000000,
+    0xba51099900000000, 0x19c46d0700000000, 0x8ff46a7000000000,
+    0x35a563e900000000, 0xa395649e00000000, 0x3288db0e00000000,
+    0xa4b8dc7900000000, 0x1ee9d5e000000000, 0x88d9d29700000000,
+    0x2b4cb60900000000, 0xbd7cb17e00000000, 0x072db8e700000000,
+    0x911dbf9000000000, 0x6410b71d00000000, 0xf220b06a00000000,
+    0x4871b9f300000000, 0xde41be8400000000, 0x7dd4da1a00000000,
+    0xebe4dd6d00000000, 0x51b5d4f400000000, 0xc785d38300000000,
+    0x56986c1300000000, 0xc0a86b6400000000, 0x7af962fd00000000,
+    0xecc9658a00000000, 0x4f5c011400000000, 0xd96c066300000000,
+    0x633d0ffa00000000, 0xf50d088d00000000, 0xc8206e3b00000000,
+    0x5e10694c00000000, 0xe44160d500000000, 0x727167a200000000,
+    0xd1e4033c00000000, 0x47d4044b00000000, 0xfd850dd200000000,
+    0x6bb50aa500000000, 0xfaa8b53500000000, 0x6c98b24200000000,
+    0xd6c9bbdb00000000, 0x40f9bcac00000000, 0xe36cd83200000000,
+    0x755cdf4500000000, 0xcf0dd6dc00000000, 0x593dd1ab00000000,
+    0xac30d92600000000, 0x3a00de5100000000, 0x8051d7c800000000,
+    0x1661d0bf00000000, 0xb5f4b42100000000, 0x23c4b35600000000,
+    0x9995bacf00000000, 0x0fa5bdb800000000, 0x9eb8022800000000,
+    0x0888055f00000000, 0xb2d90cc600000000, 0x24e90bb100000000,
+    0x877c6f2f00000000, 0x114c685800000000, 0xab1d61c100000000,
+    0x3d2d66b600000000, 0x9041dc7600000000, 0x0671db0100000000,
+    0xbc20d29800000000, 0x2a10d5ef00000000, 0x8985b17100000000,
+    0x1fb5b60600000000, 0xa5e4bf9f00000000, 0x33d4b8e800000000,
+    0xa2c9077800000000, 0x34f9000f00000000, 0x8ea8099600000000,
+    0x18980ee100000000, 0xbb0d6a7f00000000, 0x2d3d6d0800000000,
+    0x976c649100000000, 0x015c63e600000000, 0xf4516b6b00000000,
+    0x62616c1c00000000, 0xd830658500000000, 0x4e0062f200000000,
+    0xed95066c00000000, 0x7ba5011b00000000, 0xc1f4088200000000,
+    0x57c40ff500000000, 0xc6d9b06500000000, 0x50e9b71200000000,
+    0xeab8be8b00000000, 0x7c88b9fc00000000, 0xdf1ddd6200000000,
+    0x492dda1500000000, 0xf37cd38c00000000, 0x654cd4fb00000000,
+    0x5861b24d00000000, 0xce51b53a00000000, 0x7400bca300000000,
+    0xe230bbd400000000, 0x41a5df4a00000000, 0xd795d83d00000000,
+    0x6dc4d1a400000000, 0xfbf4d6d300000000, 0x6ae9694300000000,
+    0xfcd96e3400000000, 0x468867ad00000000, 0xd0b860da00000000,
+    0x732d044400000000, 0xe51d033300000000, 0x5f4c0aaa00000000,
+    0xc97c0ddd00000000, 0x3c71055000000000, 0xaa41022700000000,
+    0x10100bbe00000000, 0x86200cc900000000, 0x25b5685700000000,
+    0xb3856f2000000000, 0x09d466b900000000, 0x9fe461ce00000000,
+    0x0ef9de5e00000000, 0x98c9d92900000000, 0x2298d0b000000000,
+    0xb4a8d7c700000000, 0x173db35900000000, 0x810db42e00000000,
+    0x3b5cbdb700000000, 0xad6cbac000000000, 0x2083b8ed00000000,
+    0xb6b3bf9a00000000, 0x0ce2b60300000000, 0x9ad2b17400000000,
+    0x3947d5ea00000000, 0xaf77d29d00000000, 0x1526db0400000000,
+    0x8316dc7300000000, 0x120b63e300000000, 0x843b649400000000,
+    0x3e6a6d0d00000000, 0xa85a6a7a00000000, 0x0bcf0ee400000000,
+    0x9dff099300000000, 0x27ae000a00000000, 0xb19e077d00000000,
+    0x44930ff000000000, 0xd2a3088700000000, 0x68f2011e00000000,
+    0xfec2066900000000, 0x5d5762f700000000, 0xcb67658000000000,
+    0x71366c1900000000, 0xe7066b6e00000000, 0x761bd4fe00000000,
+    0xe02bd38900000000, 0x5a7ada1000000000, 0xcc4add6700000000,
+    0x6fdfb9f900000000, 0xf9efbe8e00000000, 0x43beb71700000000,
+    0xd58eb06000000000, 0xe8a3d6d600000000, 0x7e93d1a100000000,
+    0xc4c2d83800000000, 0x52f2df4f00000000, 0xf167bbd100000000,
+    0x6757bca600000000, 0xdd06b53f00000000, 0x4b36b24800000000,
+    0xda2b0dd800000000, 0x4c1b0aaf00000000, 0xf64a033600000000,
+    0x607a044100000000, 0xc3ef60df00000000, 0x55df67a800000000,
+    0xef8e6e3100000000, 0x79be694600000000, 0x8cb361cb00000000,
+    0x1a8366bc00000000, 0xa0d26f2500000000, 0x36e2685200000000,
+    0x95770ccc00000000, 0x03470bbb00000000, 0xb916022200000000,
+    0x2f26055500000000, 0xbe3bbac500000000, 0x280bbdb200000000,
+    0x925ab42b00000000, 0x046ab35c00000000, 0xa7ffd7c200000000,
+    0x31cfd0b500000000, 0x8b9ed92c00000000, 0x1daede5b00000000,
+    0xb0c2649b00000000, 0x26f263ec00000000, 0x9ca36a7500000000,
+    0x0a936d0200000000, 0xa906099c00000000, 0x3f360eeb00000000,
+    0x8567077200000000, 0x1357000500000000, 0x824abf9500000000,
+    0x147ab8e200000000, 0xae2bb17b00000000, 0x381bb60c00000000,
+    0x9b8ed29200000000, 0x0dbed5e500000000, 0xb7efdc7c00000000,
+    0x21dfdb0b00000000, 0xd4d2d38600000000, 0x42e2d4f100000000,
+    0xf8b3dd6800000000, 0x6e83da1f00000000, 0xcd16be8100000000,
+    0x5b26b9f600000000, 0xe177b06f00000000, 0x7747b71800000000,
+    0xe65a088800000000, 0x706a0fff00000000, 0xca3b066600000000,
+    0x5c0b011100000000, 0xff9e658f00000000, 0x69ae62f800000000,
+    0xd3ff6b6100000000, 0x45cf6c1600000000, 0x78e20aa000000000,
+    0xeed20dd700000000, 0x5483044e00000000, 0xc2b3033900000000,
+    0x612667a700000000, 0xf71660d000000000, 0x4d47694900000000,
+    0xdb776e3e00000000, 0x4a6ad1ae00000000, 0xdc5ad6d900000000,
+    0x660bdf4000000000, 0xf03bd83700000000, 0x53aebca900000000,
+    0xc59ebbde00000000, 0x7fcfb24700000000, 0xe9ffb53000000000,
+    0x1cf2bdbd00000000, 0x8ac2baca00000000, 0x3093b35300000000,
+    0xa6a3b42400000000, 0x0536d0ba00000000, 0x9306d7cd00000000,
+    0x2957de5400000000, 0xbf67d92300000000, 0x2e7a66b300000000,
+    0xb84a61c400000000, 0x021b685d00000000, 0x942b6f2a00000000,
+    0x37be0bb400000000, 0xa18e0cc300000000, 0x1bdf055a00000000,
+    0x8def022d00000000};
+
+#else /* W == 4 */
+
+static const z_word_t crc_big_table[] = {
+    0x00000000, 0x96300777, 0x2c610eee, 0xba510999, 0x19c46d07,
+    0x8ff46a70, 0x35a563e9, 0xa395649e, 0x3288db0e, 0xa4b8dc79,
+    0x1ee9d5e0, 0x88d9d297, 0x2b4cb609, 0xbd7cb17e, 0x072db8e7,
+    0x911dbf90, 0x6410b71d, 0xf220b06a, 0x4871b9f3, 0xde41be84,
+    0x7dd4da1a, 0xebe4dd6d, 0x51b5d4f4, 0xc785d383, 0x56986c13,
+    0xc0a86b64, 0x7af962fd, 0xecc9658a, 0x4f5c0114, 0xd96c0663,
+    0x633d0ffa, 0xf50d088d, 0xc8206e3b, 0x5e10694c, 0xe44160d5,
+    0x727167a2, 0xd1e4033c, 0x47d4044b, 0xfd850dd2, 0x6bb50aa5,
+    0xfaa8b535, 0x6c98b242, 0xd6c9bbdb, 0x40f9bcac, 0xe36cd832,
+    0x755cdf45, 0xcf0dd6dc, 0x593dd1ab, 0xac30d926, 0x3a00de51,
+    0x8051d7c8, 0x1661d0bf, 0xb5f4b421, 0x23c4b356, 0x9995bacf,
+    0x0fa5bdb8, 0x9eb80228, 0x0888055f, 0xb2d90cc6, 0x24e90bb1,
+    0x877c6f2f, 0x114c6858, 0xab1d61c1, 0x3d2d66b6, 0x9041dc76,
+    0x0671db01, 0xbc20d298, 0x2a10d5ef, 0x8985b171, 0x1fb5b606,
+    0xa5e4bf9f, 0x33d4b8e8, 0xa2c90778, 0x34f9000f, 0x8ea80996,
+    0x18980ee1, 0xbb0d6a7f, 0x2d3d6d08, 0x976c6491, 0x015c63e6,
+    0xf4516b6b, 0x62616c1c, 0xd8306585, 0x4e0062f2, 0xed95066c,
+    0x7ba5011b, 0xc1f40882, 0x57c40ff5, 0xc6d9b065, 0x50e9b712,
+    0xeab8be8b, 0x7c88b9fc, 0xdf1ddd62, 0x492dda15, 0xf37cd38c,
+    0x654cd4fb, 0x5861b24d, 0xce51b53a, 0x7400bca3, 0xe230bbd4,
+    0x41a5df4a, 0xd795d83d, 0x6dc4d1a4, 0xfbf4d6d3, 0x6ae96943,
+    0xfcd96e34, 0x468867ad, 0xd0b860da, 0x732d0444, 0xe51d0333,
+    0x5f4c0aaa, 0xc97c0ddd, 0x3c710550, 0xaa410227, 0x10100bbe,
+    0x86200cc9, 0x25b56857, 0xb3856f20, 0x09d466b9, 0x9fe461ce,
+    0x0ef9de5e, 0x98c9d929, 0x2298d0b0, 0xb4a8d7c7, 0x173db359,
+    0x810db42e, 0x3b5cbdb7, 0xad6cbac0, 0x2083b8ed, 0xb6b3bf9a,
+    0x0ce2b603, 0x9ad2b174, 0x3947d5ea, 0xaf77d29d, 0x1526db04,
+    0x8316dc73, 0x120b63e3, 0x843b6494, 0x3e6a6d0d, 0xa85a6a7a,
+    0x0bcf0ee4, 0x9dff0993, 0x27ae000a, 0xb19e077d, 0x44930ff0,
+    0xd2a30887, 0x68f2011e, 0xfec20669, 0x5d5762f7, 0xcb676580,
+    0x71366c19, 0xe7066b6e, 0x761bd4fe, 0xe02bd389, 0x5a7ada10,
+    0xcc4add67, 0x6fdfb9f9, 0xf9efbe8e, 0x43beb717, 0xd58eb060,
+    0xe8a3d6d6, 0x7e93d1a1, 0xc4c2d838, 0x52f2df4f, 0xf167bbd1,
+    0x6757bca6, 0xdd06b53f, 0x4b36b248, 0xda2b0dd8, 0x4c1b0aaf,
+    0xf64a0336, 0x607a0441, 0xc3ef60df, 0x55df67a8, 0xef8e6e31,
+    0x79be6946, 0x8cb361cb, 0x1a8366bc, 0xa0d26f25, 0x36e26852,
+    0x95770ccc, 0x03470bbb, 0xb9160222, 0x2f260555, 0xbe3bbac5,
+    0x280bbdb2, 0x925ab42b, 0x046ab35c, 0xa7ffd7c2, 0x31cfd0b5,
+    0x8b9ed92c, 0x1daede5b, 0xb0c2649b, 0x26f263ec, 0x9ca36a75,
+    0x0a936d02, 0xa906099c, 0x3f360eeb, 0x85670772, 0x13570005,
+    0x824abf95, 0x147ab8e2, 0xae2bb17b, 0x381bb60c, 0x9b8ed292,
+    0x0dbed5e5, 0xb7efdc7c, 0x21dfdb0b, 0xd4d2d386, 0x42e2d4f1,
+    0xf8b3dd68, 0x6e83da1f, 0xcd16be81, 0x5b26b9f6, 0xe177b06f,
+    0x7747b718, 0xe65a0888, 0x706a0fff, 0xca3b0666, 0x5c0b0111,
+    0xff9e658f, 0x69ae62f8, 0xd3ff6b61, 0x45cf6c16, 0x78e20aa0,
+    0xeed20dd7, 0x5483044e, 0xc2b30339, 0x612667a7, 0xf71660d0,
+    0x4d476949, 0xdb776e3e, 0x4a6ad1ae, 0xdc5ad6d9, 0x660bdf40,
+    0xf03bd837, 0x53aebca9, 0xc59ebbde, 0x7fcfb247, 0xe9ffb530,
+    0x1cf2bdbd, 0x8ac2baca, 0x3093b353, 0xa6a3b424, 0x0536d0ba,
+    0x9306d7cd, 0x2957de54, 0xbf67d923, 0x2e7a66b3, 0xb84a61c4,
+    0x021b685d, 0x942b6f2a, 0x37be0bb4, 0xa18e0cc3, 0x1bdf055a,
+    0x8def022d};
+
+#endif
+
+#endif /* W */
+
+#if N == 1
+
+#if W == 8
+
+static const uint32_t crc_braid_table[][256] = {
+   {0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, 0x844a0efa,
+    0x48e00e64, 0xc66f0987, 0x0ac50919, 0xd3e51bb5, 0x1f4f1b2b,
+    0x91c01cc8, 0x5d6a1c56, 0x57af154f, 0x9b0515d1, 0x158a1232,
+    0xd92012ac, 0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8,
+    0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832, 0xaf5e2a9e,
+    0x63f42a00, 0xed7b2de3, 0x21d12d7d, 0x2b142464, 0xe7be24fa,
+    0x69312319, 0xa59b2387, 0xf9766256, 0x35dc62c8, 0xbb53652b,
+    0x77f965b5, 0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f,
+    0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00, 0xaed97719,
+    0x62737787, 0xecfc7064, 0x205670fa, 0x85cd537d, 0x496753e3,
+    0xc7e85400, 0x0b42549e, 0x01875d87, 0xcd2d5d19, 0x43a25afa,
+    0x8f085a64, 0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b,
+    0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1, 0x299dc2ed,
+    0xe537c273, 0x6bb8c590, 0xa712c50e, 0xadd7cc17, 0x617dcc89,
+    0xeff2cb6a, 0x2358cbf4, 0xfa78d958, 0x36d2d9c6, 0xb85dde25,
+    0x74f7debb, 0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041,
+    0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425, 0xd16cfd3c,
+    0x1dc6fda2, 0x9349fa41, 0x5fe3fadf, 0x86c3e873, 0x4a69e8ed,
+    0xc4e6ef0e, 0x084cef90, 0x0289e689, 0xce23e617, 0x40ace1f4,
+    0x8c06e16a, 0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758,
+    0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2, 0x030ebb0e,
+    0xcfa4bb90, 0x412bbc73, 0x8d81bced, 0x8744b5f4, 0x4beeb56a,
+    0xc561b289, 0x09cbb217, 0xac509190, 0x60fa910e, 0xee7596ed,
+    0x22df9673, 0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889,
+    0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6, 0xfbff84df,
+    0x37558441, 0xb9da83a2, 0x7570833c, 0x533b85da, 0x9f918544,
+    0x111e82a7, 0xddb48239, 0xd7718b20, 0x1bdb8bbe, 0x95548c5d,
+    0x59fe8cc3, 0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c,
+    0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776, 0x2f80b4f1,
+    0xe32ab46f, 0x6da5b38c, 0xa10fb312, 0xabcaba0b, 0x6760ba95,
+    0xe9efbd76, 0x2545bde8, 0xfc65af44, 0x30cfafda, 0xbe40a839,
+    0x72eaa8a7, 0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d,
+    0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f, 0x2e07e976,
+    0xe2ade9e8, 0x6c22ee0b, 0xa088ee95, 0x79a8fc39, 0xb502fca7,
+    0x3b8dfb44, 0xf727fbda, 0xfde2f2c3, 0x3148f25d, 0xbfc7f5be,
+    0x736df520, 0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144,
+    0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe, 0x0513cd12,
+    0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1, 0x8159c3e8, 0x4df3c376,
+    0xc37cc495, 0x0fd6c40b, 0x7aa64737, 0xb60c47a9, 0x3883404a,
+    0xf42940d4, 0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e,
+    0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61, 0x2d095278,
+    0xe1a352e6, 0x6f2c5505, 0xa386559b, 0x061d761c, 0xcab77682,
+    0x44387161, 0x889271ff, 0x825778e6, 0x4efd7878, 0xc0727f9b,
+    0x0cd87f05, 0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a,
+    0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0, 0x83d02561,
+    0x4f7a25ff, 0xc1f5221c, 0x0d5f2282, 0x079a2b9b, 0xcb302b05,
+    0x45bf2ce6, 0x89152c78, 0x50353ed4, 0x9c9f3e4a, 0x121039a9,
+    0xdeba3937, 0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd,
+    0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9, 0x7b211ab0,
+    0xb78b1a2e, 0x39041dcd, 0xf5ae1d53, 0x2c8e0fff, 0xe0240f61,
+    0x6eab0882, 0xa201081c, 0xa8c40105, 0x646e019b, 0xeae10678,
+    0x264b06e6},
+   {0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d, 0xf44f2413,
+    0x52382fa7, 0x63d0353a, 0xc5a73e8e, 0x33ef4e67, 0x959845d3,
+    0xa4705f4e, 0x020754fa, 0xc7a06a74, 0x61d761c0, 0x503f7b5d,
+    0xf64870e9, 0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653,
+    0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240, 0x5431d2a9,
+    0xf246d91d, 0xc3aec380, 0x65d9c834, 0xa07ef6ba, 0x0609fd0e,
+    0x37e1e793, 0x9196ec27, 0xcfbd399c, 0x69ca3228, 0x582228b5,
+    0xfe552301, 0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712,
+    0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66, 0x081d53e8,
+    0xae6a585c, 0x9f8242c1, 0x39f54975, 0xa863a552, 0x0e14aee6,
+    0x3ffcb47b, 0x998bbfcf, 0x5c2c8141, 0xfa5b8af5, 0xcbb39068,
+    0x6dc49bdc, 0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8,
+    0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb, 0x440b7579,
+    0xe27c7ecd, 0xd3946450, 0x75e36fe4, 0xb044516a, 0x16335ade,
+    0x27db4043, 0x81ac4bf7, 0x77e43b1e, 0xd19330aa, 0xe07b2a37,
+    0x460c2183, 0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590,
+    0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a, 0xd79acda4,
+    0x71edc610, 0x4005dc8d, 0xe672d739, 0x103aa7d0, 0xb64dac64,
+    0x87a5b6f9, 0x21d2bd4d, 0xe47583c3, 0x42028877, 0x73ea92ea,
+    0xd59d995e, 0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678,
+    0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b, 0xb8590282,
+    0x1e2e0936, 0x2fc613ab, 0x89b1181f, 0x4c162691, 0xea612d25,
+    0xdb8937b8, 0x7dfe3c0c, 0xec68d02b, 0x4a1fdb9f, 0x7bf7c102,
+    0xdd80cab6, 0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5,
+    0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1, 0x2bc8ba5f,
+    0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2, 0x8816eaf2, 0x2e61e146,
+    0x1f89fbdb, 0xb9fef06f, 0x7c59cee1, 0xda2ec555, 0xebc6dfc8,
+    0x4db1d47c, 0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08,
+    0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b, 0xefc8763c,
+    0x49bf7d88, 0x78576715, 0xde206ca1, 0x1b87522f, 0xbdf0599b,
+    0x8c184306, 0x2a6f48b2, 0xdc27385b, 0x7a5033ef, 0x4bb82972,
+    0xedcf22c6, 0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5,
+    0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3, 0xb3e4f77d,
+    0x1593fcc9, 0x247be654, 0x820cede0, 0x74449d09, 0xd23396bd,
+    0xe3db8c20, 0x45ac8794, 0x800bb91a, 0x267cb2ae, 0x1794a833,
+    0xb1e3a387, 0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d,
+    0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e, 0x139a01c7,
+    0xb5ed0a73, 0x840510ee, 0x22721b5a, 0xe7d525d4, 0x41a22e60,
+    0x704a34fd, 0xd63d3f49, 0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2,
+    0xfdf58516, 0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105,
+    0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71, 0x0bbdf5ff,
+    0xadcafe4b, 0x9c22e4d6, 0x3a55ef62, 0xabc30345, 0x0db408f1,
+    0x3c5c126c, 0x9a2b19d8, 0x5f8c2756, 0xf9fb2ce2, 0xc813367f,
+    0x6e643dcb, 0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf,
+    0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac, 0x03a0a617,
+    0xa5d7ada3, 0x943fb73e, 0x3248bc8a, 0xf7ef8204, 0x519889b0,
+    0x6070932d, 0xc6079899, 0x304fe870, 0x9638e3c4, 0xa7d0f959,
+    0x01a7f2ed, 0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe,
+    0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044, 0x90311eca,
+    0x3646157e, 0x07ae0fe3, 0xa1d90457, 0x579174be, 0xf1e67f0a,
+    0xc00e6597, 0x66796e23, 0xa3de50ad, 0x05a95b19, 0x34414184,
+    0x92364a30},
+   {0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae, 0x9b914216,
+    0x50cd91b3, 0xd659e31d, 0x1d0530b8, 0xec53826d, 0x270f51c8,
+    0xa19b2366, 0x6ac7f0c3, 0x77c2c07b, 0xbc9e13de, 0x3a0a6170,
+    0xf156b2d5, 0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035,
+    0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223, 0xef8580f6,
+    0x24d95353, 0xa24d21fd, 0x6911f258, 0x7414c2e0, 0xbf481145,
+    0x39dc63eb, 0xf280b04e, 0x07ac0536, 0xccf0d693, 0x4a64a43d,
+    0x81387798, 0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e,
+    0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5, 0x706ec54d,
+    0xbb3216e8, 0x3da66446, 0xf6fab7e3, 0x047a07ad, 0xcf26d408,
+    0x49b2a6a6, 0x82ee7503, 0x9feb45bb, 0x54b7961e, 0xd223e4b0,
+    0x197f3715, 0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e,
+    0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578, 0x0f580a6c,
+    0xc404d9c9, 0x4290ab67, 0x89cc78c2, 0x94c9487a, 0x5f959bdf,
+    0xd901e971, 0x125d3ad4, 0xe30b8801, 0x28575ba4, 0xaec3290a,
+    0x659ffaaf, 0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9,
+    0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59, 0x971f4ae1,
+    0x5c439944, 0xdad7ebea, 0x118b384f, 0xe0dd8a9a, 0x2b81593f,
+    0xad152b91, 0x6649f834, 0x7b4cc88c, 0xb0101b29, 0x36846987,
+    0xfdd8ba22, 0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4,
+    0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2, 0xe4a78d37,
+    0x2ffb5e92, 0xa96f2c3c, 0x6233ff99, 0x7f36cf21, 0xb46a1c84,
+    0x32fe6e2a, 0xf9a2bd8f, 0x0b220dc1, 0xc07ede64, 0x46eaacca,
+    0x8db67f6f, 0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79,
+    0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02, 0x7ce0cdba,
+    0xb7bc1e1f, 0x31286cb1, 0xfa74bf14, 0x1eb014d8, 0xd5ecc77d,
+    0x5378b5d3, 0x98246676, 0x852156ce, 0x4e7d856b, 0xc8e9f7c5,
+    0x03b52460, 0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b,
+    0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d, 0x1d661643,
+    0xd63ac5e6, 0x50aeb748, 0x9bf264ed, 0x86f75455, 0x4dab87f0,
+    0xcb3ff55e, 0x006326fb, 0xf135942e, 0x3a69478b, 0xbcfd3525,
+    0x77a1e680, 0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496,
+    0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340, 0x828d53f8,
+    0x49d1805d, 0xcf45f2f3, 0x04192156, 0xf54f9383, 0x3e134026,
+    0xb8873288, 0x73dbe12d, 0x6eded195, 0xa5820230, 0x2316709e,
+    0xe84aa33b, 0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db,
+    0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd, 0xf6999118,
+    0x3dc542bd, 0xbb513013, 0x700de3b6, 0x6d08d30e, 0xa65400ab,
+    0x20c07205, 0xeb9ca1a0, 0x11e81eb4, 0xdab4cd11, 0x5c20bfbf,
+    0x977c6c1a, 0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c,
+    0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77, 0x662adecf,
+    0xad760d6a, 0x2be27fc4, 0xe0beac61, 0x123e1c2f, 0xd962cf8a,
+    0x5ff6bd24, 0x94aa6e81, 0x89af5e39, 0x42f38d9c, 0xc467ff32,
+    0x0f3b2c97, 0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec,
+    0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa, 0x16441b82,
+    0xdd18c827, 0x5b8cba89, 0x90d0692c, 0x8dd55994, 0x46898a31,
+    0xc01df89f, 0x0b412b3a, 0xfa1799ef, 0x314b4a4a, 0xb7df38e4,
+    0x7c83eb41, 0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957,
+    0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7, 0x8e035b0f,
+    0x455f88aa, 0xc3cbfa04, 0x089729a1, 0xf9c19b74, 0x329d48d1,
+    0xb4093a7f, 0x7f55e9da, 0x6250d962, 0xa90c0ac7, 0x2f987869,
+    0xe4c4abcc},
+   {0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0, 0xf580a6c0,
+    0xc8e08f70, 0x8f40f5a0, 0xb220dc10, 0x30704bc1, 0x0d106271,
+    0x4ab018a1, 0x77d03111, 0xc5f0ed01, 0xf890c4b1, 0xbf30be61,
+    0x825097d1, 0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52,
+    0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92, 0x5090dc43,
+    0x6df0f5f3, 0x2a508f23, 0x1730a693, 0xa5107a83, 0x98705333,
+    0xdfd029e3, 0xe2b00053, 0xc1c12f04, 0xfca106b4, 0xbb017c64,
+    0x866155d4, 0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314,
+    0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15, 0x0431c205,
+    0x3951ebb5, 0x7ef19165, 0x4391b8d5, 0xa121b886, 0x9c419136,
+    0xdbe1ebe6, 0xe681c256, 0x54a11e46, 0x69c137f6, 0x2e614d26,
+    0x13016496, 0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997,
+    0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57, 0x58f35849,
+    0x659371f9, 0x22330b29, 0x1f532299, 0xad73fe89, 0x9013d739,
+    0xd7b3ade9, 0xead38459, 0x68831388, 0x55e33a38, 0x124340e8,
+    0x2f236958, 0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98,
+    0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b, 0xcd93690b,
+    0xf0f340bb, 0xb7533a6b, 0x8a3313db, 0x0863840a, 0x3503adba,
+    0x72a3d76a, 0x4fc3feda, 0xfde322ca, 0xc0830b7a, 0x872371aa,
+    0xba43581a, 0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d,
+    0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d, 0xa9423c8c,
+    0x9422153c, 0xd3826fec, 0xeee2465c, 0x5cc29a4c, 0x61a2b3fc,
+    0x2602c92c, 0x1b62e09c, 0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af,
+    0xbe729a1f, 0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf,
+    0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de, 0x3c220dce,
+    0x0142247e, 0x46e25eae, 0x7b82771e, 0xb1e6b092, 0x8c869922,
+    0xcb26e3f2, 0xf646ca42, 0x44661652, 0x79063fe2, 0x3ea64532,
+    0x03c66c82, 0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183,
+    0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743, 0xd1062710,
+    0xec660ea0, 0xabc67470, 0x96a65dc0, 0x248681d0, 0x19e6a860,
+    0x5e46d2b0, 0x6326fb00, 0xe1766cd1, 0xdc164561, 0x9bb63fb1,
+    0xa6d61601, 0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1,
+    0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546, 0x85a73956,
+    0xb8c710e6, 0xff676a36, 0xc2074386, 0x4057d457, 0x7d37fde7,
+    0x3a978737, 0x07f7ae87, 0xb5d77297, 0x88b75b27, 0xcf1721f7,
+    0xf2770847, 0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4,
+    0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404, 0x20b743d5,
+    0x1dd76a65, 0x5a7710b5, 0x67173905, 0xd537e515, 0xe857cca5,
+    0xaff7b675, 0x92979fc5, 0xe915e8db, 0xd475c16b, 0x93d5bbbb,
+    0xaeb5920b, 0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb,
+    0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca, 0x2ce505da,
+    0x11852c6a, 0x562556ba, 0x6b457f0a, 0x89f57f59, 0xb49556e9,
+    0xf3352c39, 0xce550589, 0x7c75d999, 0x4115f029, 0x06b58af9,
+    0x3bd5a349, 0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48,
+    0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888, 0x28d4c7df,
+    0x15b4ee6f, 0x521494bf, 0x6f74bd0f, 0xdd54611f, 0xe03448af,
+    0xa794327f, 0x9af41bcf, 0x18a48c1e, 0x25c4a5ae, 0x6264df7e,
+    0x5f04f6ce, 0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e,
+    0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d, 0xbdb4f69d,
+    0x80d4df2d, 0xc774a5fd, 0xfa148c4d, 0x78441b9c, 0x4524322c,
+    0x028448fc, 0x3fe4614c, 0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c,
+    0xca64c78c},
+   {0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, 0x8f629757,
+    0x37def032, 0x256b5fdc, 0x9dd738b9, 0xc5b428ef, 0x7d084f8a,
+    0x6fbde064, 0xd7018701, 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733,
+    0x58631056, 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871,
+    0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26, 0x95ad7f70,
+    0x2d111815, 0x3fa4b7fb, 0x8718d09e, 0x1acfe827, 0xa2738f42,
+    0xb0c620ac, 0x087a47c9, 0xa032af3e, 0x188ec85b, 0x0a3b67b5,
+    0xb28700d0, 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787,
+    0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f, 0xeae41086,
+    0x525877e3, 0x40edd80d, 0xf851bf68, 0xf02bf8a1, 0x48979fc4,
+    0x5a22302a, 0xe29e574f, 0x7f496ff6, 0xc7f50893, 0xd540a77d,
+    0x6dfcc018, 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0,
+    0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7, 0x9b14583d,
+    0x23a83f58, 0x311d90b6, 0x89a1f7d3, 0x1476cf6a, 0xaccaa80f,
+    0xbe7f07e1, 0x06c36084, 0x5ea070d2, 0xe61c17b7, 0xf4a9b859,
+    0x4c15df3c, 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b,
+    0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c, 0x446f98f5,
+    0xfcd3ff90, 0xee66507e, 0x56da371b, 0x0eb9274d, 0xb6054028,
+    0xa4b0efc6, 0x1c0c88a3, 0x81dbb01a, 0x3967d77f, 0x2bd27891,
+    0x936e1ff4, 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed,
+    0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba, 0xfe92dfec,
+    0x462eb889, 0x549b1767, 0xec277002, 0x71f048bb, 0xc94c2fde,
+    0xdbf98030, 0x6345e755, 0x6b3fa09c, 0xd383c7f9, 0xc1366817,
+    0x798a0f72, 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825,
+    0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d, 0x21e91f24,
+    0x99557841, 0x8be0d7af, 0x335cb0ca, 0xed59b63b, 0x55e5d15e,
+    0x47507eb0, 0xffec19d5, 0x623b216c, 0xda874609, 0xc832e9e7,
+    0x708e8e82, 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a,
+    0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d, 0xbd40e1a4,
+    0x05fc86c1, 0x1749292f, 0xaff54e4a, 0x322276f3, 0x8a9e1196,
+    0x982bbe78, 0x2097d91d, 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0,
+    0x6a4166a5, 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2,
+    0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb, 0xc2098e52,
+    0x7ab5e937, 0x680046d9, 0xd0bc21bc, 0x88df31ea, 0x3063568f,
+    0x22d6f961, 0x9a6a9e04, 0x07bda6bd, 0xbf01c1d8, 0xadb46e36,
+    0x15080953, 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174,
+    0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623, 0xd8c66675,
+    0x607a0110, 0x72cfaefe, 0xca73c99b, 0x57a4f122, 0xef189647,
+    0xfdad39a9, 0x45115ecc, 0x764dee06, 0xcef18963, 0xdc44268d,
+    0x64f841e8, 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf,
+    0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907, 0x3c9b51be,
+    0x842736db, 0x96929935, 0x2e2efe50, 0x2654b999, 0x9ee8defc,
+    0x8c5d7112, 0x34e11677, 0xa9362ece, 0x118a49ab, 0x033fe645,
+    0xbb838120, 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98,
+    0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf, 0xd67f4138,
+    0x6ec3265d, 0x7c7689b3, 0xc4caeed6, 0x591dd66f, 0xe1a1b10a,
+    0xf3141ee4, 0x4ba87981, 0x13cb69d7, 0xab770eb2, 0xb9c2a15c,
+    0x017ec639, 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e,
+    0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949, 0x090481f0,
+    0xb1b8e695, 0xa30d497b, 0x1bb12e1e, 0x43d23e48, 0xfb6e592d,
+    0xe9dbf6c3, 0x516791a6, 0xccb0a91f, 0x740cce7a, 0x66b96194,
+    0xde0506f1},
+   {0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59, 0x0709a8dc,
+    0x06cbc2eb, 0x048d7cb2, 0x054f1685, 0x0e1351b8, 0x0fd13b8f,
+    0x0d9785d6, 0x0c55efe1, 0x091af964, 0x08d89353, 0x0a9e2d0a,
+    0x0b5c473d, 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29,
+    0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5, 0x1235f2c8,
+    0x13f798ff, 0x11b126a6, 0x10734c91, 0x153c5a14, 0x14fe3023,
+    0x16b88e7a, 0x177ae44d, 0x384d46e0, 0x398f2cd7, 0x3bc9928e,
+    0x3a0bf8b9, 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065,
+    0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901, 0x3157bf84,
+    0x3095d5b3, 0x32d36bea, 0x331101dd, 0x246be590, 0x25a98fa7,
+    0x27ef31fe, 0x262d5bc9, 0x23624d4c, 0x22a0277b, 0x20e69922,
+    0x2124f315, 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71,
+    0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad, 0x709a8dc0,
+    0x7158e7f7, 0x731e59ae, 0x72dc3399, 0x7793251c, 0x76514f2b,
+    0x7417f172, 0x75d59b45, 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816,
+    0x7ccf6221, 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd,
+    0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9, 0x6bb5866c,
+    0x6a77ec5b, 0x68315202, 0x69f33835, 0x62af7f08, 0x636d153f,
+    0x612bab66, 0x60e9c151, 0x65a6d7d4, 0x6464bde3, 0x662203ba,
+    0x67e0698d, 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579,
+    0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5, 0x46c49a98,
+    0x4706f0af, 0x45404ef6, 0x448224c1, 0x41cd3244, 0x400f5873,
+    0x4249e62a, 0x438b8c1d, 0x54f16850, 0x55330267, 0x5775bc3e,
+    0x56b7d609, 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5,
+    0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1, 0x5deb9134,
+    0x5c29fb03, 0x5e6f455a, 0x5fad2f6d, 0xe1351b80, 0xe0f771b7,
+    0xe2b1cfee, 0xe373a5d9, 0xe63cb35c, 0xe7fed96b, 0xe5b86732,
+    0xe47a0d05, 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461,
+    0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd, 0xfd13b8f0,
+    0xfcd1d2c7, 0xfe976c9e, 0xff5506a9, 0xfa1a102c, 0xfbd87a1b,
+    0xf99ec442, 0xf85cae75, 0xf300e948, 0xf2c2837f, 0xf0843d26,
+    0xf1465711, 0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd,
+    0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339, 0xde71f5bc,
+    0xdfb39f8b, 0xddf521d2, 0xdc374be5, 0xd76b0cd8, 0xd6a966ef,
+    0xd4efd8b6, 0xd52db281, 0xd062a404, 0xd1a0ce33, 0xd3e6706a,
+    0xd2241a5d, 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049,
+    0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895, 0xcb4dafa8,
+    0xca8fc59f, 0xc8c97bc6, 0xc90b11f1, 0xcc440774, 0xcd866d43,
+    0xcfc0d31a, 0xce02b92d, 0x91af9640, 0x906dfc77, 0x922b422e,
+    0x93e92819, 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5,
+    0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1, 0x98b56f24,
+    0x99770513, 0x9b31bb4a, 0x9af3d17d, 0x8d893530, 0x8c4b5f07,
+    0x8e0de15e, 0x8fcf8b69, 0x8a809dec, 0x8b42f7db, 0x89044982,
+    0x88c623b5, 0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1,
+    0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d, 0xa9e2d0a0,
+    0xa820ba97, 0xaa6604ce, 0xaba46ef9, 0xaeeb787c, 0xaf29124b,
+    0xad6fac12, 0xacadc625, 0xa7f18118, 0xa633eb2f, 0xa4755576,
+    0xa5b73f41, 0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d,
+    0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89, 0xb2cddb0c,
+    0xb30fb13b, 0xb1490f62, 0xb08b6555, 0xbbd72268, 0xba15485f,
+    0xb853f606, 0xb9919c31, 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda,
+    0xbe9834ed},
+   {0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3, 0x646cc504,
+    0x7d77f445, 0x565aa786, 0x4f4196c7, 0xc8d98a08, 0xd1c2bb49,
+    0xfaefe88a, 0xe3f4d9cb, 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e,
+    0x87981ccf, 0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192,
+    0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496, 0x821b9859,
+    0x9b00a918, 0xb02dfadb, 0xa936cb9a, 0xe6775d5d, 0xff6c6c1c,
+    0xd4413fdf, 0xcd5a0e9e, 0x958424a2, 0x8c9f15e3, 0xa7b24620,
+    0xbea97761, 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265,
+    0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69, 0x39316bae,
+    0x202a5aef, 0x0b07092c, 0x121c386d, 0xdf4636f3, 0xc65d07b2,
+    0xed705471, 0xf46b6530, 0xbb2af3f7, 0xa231c2b6, 0x891c9175,
+    0x9007a034, 0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38,
+    0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c, 0xf0794f05,
+    0xe9627e44, 0xc24f2d87, 0xdb541cc6, 0x94158a01, 0x8d0ebb40,
+    0xa623e883, 0xbf38d9c2, 0x38a0c50d, 0x21bbf44c, 0x0a96a78f,
+    0x138d96ce, 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca,
+    0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97, 0xded79850,
+    0xc7cca911, 0xece1fad2, 0xf5facb93, 0x7262d75c, 0x6b79e61d,
+    0x4054b5de, 0x594f849f, 0x160e1258, 0x0f152319, 0x243870da,
+    0x3d23419b, 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864,
+    0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60, 0xad24e1af,
+    0xb43fd0ee, 0x9f12832d, 0x8609b26c, 0xc94824ab, 0xd05315ea,
+    0xfb7e4629, 0xe2657768, 0x2f3f79f6, 0x362448b7, 0x1d091b74,
+    0x04122a35, 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31,
+    0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d, 0x838a36fa,
+    0x9a9107bb, 0xb1bc5478, 0xa8a76539, 0x3b83984b, 0x2298a90a,
+    0x09b5fac9, 0x10aecb88, 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd,
+    0x74c20e8c, 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180,
+    0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484, 0x71418a1a,
+    0x685abb5b, 0x4377e898, 0x5a6cd9d9, 0x152d4f1e, 0x0c367e5f,
+    0x271b2d9c, 0x3e001cdd, 0xb9980012, 0xa0833153, 0x8bae6290,
+    0x92b553d1, 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5,
+    0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a, 0xca6b79ed,
+    0xd37048ac, 0xf85d1b6f, 0xe1462a2e, 0x66de36e1, 0x7fc507a0,
+    0x54e85463, 0x4df36522, 0x02b2f3e5, 0x1ba9c2a4, 0x30849167,
+    0x299fa026, 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b,
+    0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f, 0x2c1c24b0,
+    0x350715f1, 0x1e2a4632, 0x07317773, 0x4870e1b4, 0x516bd0f5,
+    0x7a468336, 0x635db277, 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc,
+    0xe0d7848d, 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189,
+    0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85, 0x674f9842,
+    0x7e54a903, 0x5579fac0, 0x4c62cb81, 0x8138c51f, 0x9823f45e,
+    0xb30ea79d, 0xaa1596dc, 0xe554001b, 0xfc4f315a, 0xd7626299,
+    0xce7953d8, 0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4,
+    0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0, 0x5e7ef3ec,
+    0x4765c2ad, 0x6c48916e, 0x7553a02f, 0x3a1236e8, 0x230907a9,
+    0x0824546a, 0x113f652b, 0x96a779e4, 0x8fbc48a5, 0xa4911b66,
+    0xbd8a2a27, 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23,
+    0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e, 0x70d024b9,
+    0x69cb15f8, 0x42e6463b, 0x5bfd777a, 0xdc656bb5, 0xc57e5af4,
+    0xee530937, 0xf7483876, 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33,
+    0x9324fd72},
+   {0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419,
+    0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4,
+    0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07,
+    0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+    0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856,
+    0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
+    0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4,
+    0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+    0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3,
+    0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a,
+    0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599,
+    0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+    0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190,
+    0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
+    0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e,
+    0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+    0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed,
+    0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+    0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3,
+    0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+    0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a,
+    0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
+    0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010,
+    0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+    0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17,
+    0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6,
+    0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615,
+    0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+    0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344,
+    0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+    0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a,
+    0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+    0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1,
+    0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c,
+    0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef,
+    0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+    0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe,
+    0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
+    0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c,
+    0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+    0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b,
+    0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+    0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1,
+    0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+    0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278,
+    0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
+    0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66,
+    0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+    0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605,
+    0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8,
+    0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b,
+    0x2d02ef8d}};
+
+static const z_word_t crc_braid_big_table[][256] = {
+   {0x0000000000000000, 0x9630077700000000, 0x2c610eee00000000,
+    0xba51099900000000, 0x19c46d0700000000, 0x8ff46a7000000000,
+    0x35a563e900000000, 0xa395649e00000000, 0x3288db0e00000000,
+    0xa4b8dc7900000000, 0x1ee9d5e000000000, 0x88d9d29700000000,
+    0x2b4cb60900000000, 0xbd7cb17e00000000, 0x072db8e700000000,
+    0x911dbf9000000000, 0x6410b71d00000000, 0xf220b06a00000000,
+    0x4871b9f300000000, 0xde41be8400000000, 0x7dd4da1a00000000,
+    0xebe4dd6d00000000, 0x51b5d4f400000000, 0xc785d38300000000,
+    0x56986c1300000000, 0xc0a86b6400000000, 0x7af962fd00000000,
+    0xecc9658a00000000, 0x4f5c011400000000, 0xd96c066300000000,
+    0x633d0ffa00000000, 0xf50d088d00000000, 0xc8206e3b00000000,
+    0x5e10694c00000000, 0xe44160d500000000, 0x727167a200000000,
+    0xd1e4033c00000000, 0x47d4044b00000000, 0xfd850dd200000000,
+    0x6bb50aa500000000, 0xfaa8b53500000000, 0x6c98b24200000000,
+    0xd6c9bbdb00000000, 0x40f9bcac00000000, 0xe36cd83200000000,
+    0x755cdf4500000000, 0xcf0dd6dc00000000, 0x593dd1ab00000000,
+    0xac30d92600000000, 0x3a00de5100000000, 0x8051d7c800000000,
+    0x1661d0bf00000000, 0xb5f4b42100000000, 0x23c4b35600000000,
+    0x9995bacf00000000, 0x0fa5bdb800000000, 0x9eb8022800000000,
+    0x0888055f00000000, 0xb2d90cc600000000, 0x24e90bb100000000,
+    0x877c6f2f00000000, 0x114c685800000000, 0xab1d61c100000000,
+    0x3d2d66b600000000, 0x9041dc7600000000, 0x0671db0100000000,
+    0xbc20d29800000000, 0x2a10d5ef00000000, 0x8985b17100000000,
+    0x1fb5b60600000000, 0xa5e4bf9f00000000, 0x33d4b8e800000000,
+    0xa2c9077800000000, 0x34f9000f00000000, 0x8ea8099600000000,
+    0x18980ee100000000, 0xbb0d6a7f00000000, 0x2d3d6d0800000000,
+    0x976c649100000000, 0x015c63e600000000, 0xf4516b6b00000000,
+    0x62616c1c00000000, 0xd830658500000000, 0x4e0062f200000000,
+    0xed95066c00000000, 0x7ba5011b00000000, 0xc1f4088200000000,
+    0x57c40ff500000000, 0xc6d9b06500000000, 0x50e9b71200000000,
+    0xeab8be8b00000000, 0x7c88b9fc00000000, 0xdf1ddd6200000000,
+    0x492dda1500000000, 0xf37cd38c00000000, 0x654cd4fb00000000,
+    0x5861b24d00000000, 0xce51b53a00000000, 0x7400bca300000000,
+    0xe230bbd400000000, 0x41a5df4a00000000, 0xd795d83d00000000,
+    0x6dc4d1a400000000, 0xfbf4d6d300000000, 0x6ae9694300000000,
+    0xfcd96e3400000000, 0x468867ad00000000, 0xd0b860da00000000,
+    0x732d044400000000, 0xe51d033300000000, 0x5f4c0aaa00000000,
+    0xc97c0ddd00000000, 0x3c71055000000000, 0xaa41022700000000,
+    0x10100bbe00000000, 0x86200cc900000000, 0x25b5685700000000,
+    0xb3856f2000000000, 0x09d466b900000000, 0x9fe461ce00000000,
+    0x0ef9de5e00000000, 0x98c9d92900000000, 0x2298d0b000000000,
+    0xb4a8d7c700000000, 0x173db35900000000, 0x810db42e00000000,
+    0x3b5cbdb700000000, 0xad6cbac000000000, 0x2083b8ed00000000,
+    0xb6b3bf9a00000000, 0x0ce2b60300000000, 0x9ad2b17400000000,
+    0x3947d5ea00000000, 0xaf77d29d00000000, 0x1526db0400000000,
+    0x8316dc7300000000, 0x120b63e300000000, 0x843b649400000000,
+    0x3e6a6d0d00000000, 0xa85a6a7a00000000, 0x0bcf0ee400000000,
+    0x9dff099300000000, 0x27ae000a00000000, 0xb19e077d00000000,
+    0x44930ff000000000, 0xd2a3088700000000, 0x68f2011e00000000,
+    0xfec2066900000000, 0x5d5762f700000000, 0xcb67658000000000,
+    0x71366c1900000000, 0xe7066b6e00000000, 0x761bd4fe00000000,
+    0xe02bd38900000000, 0x5a7ada1000000000, 0xcc4add6700000000,
+    0x6fdfb9f900000000, 0xf9efbe8e00000000, 0x43beb71700000000,
+    0xd58eb06000000000, 0xe8a3d6d600000000, 0x7e93d1a100000000,
+    0xc4c2d83800000000, 0x52f2df4f00000000, 0xf167bbd100000000,
+    0x6757bca600000000, 0xdd06b53f00000000, 0x4b36b24800000000,
+    0xda2b0dd800000000, 0x4c1b0aaf00000000, 0xf64a033600000000,
+    0x607a044100000000, 0xc3ef60df00000000, 0x55df67a800000000,
+    0xef8e6e3100000000, 0x79be694600000000, 0x8cb361cb00000000,
+    0x1a8366bc00000000, 0xa0d26f2500000000, 0x36e2685200000000,
+    0x95770ccc00000000, 0x03470bbb00000000, 0xb916022200000000,
+    0x2f26055500000000, 0xbe3bbac500000000, 0x280bbdb200000000,
+    0x925ab42b00000000, 0x046ab35c00000000, 0xa7ffd7c200000000,
+    0x31cfd0b500000000, 0x8b9ed92c00000000, 0x1daede5b00000000,
+    0xb0c2649b00000000, 0x26f263ec00000000, 0x9ca36a7500000000,
+    0x0a936d0200000000, 0xa906099c00000000, 0x3f360eeb00000000,
+    0x8567077200000000, 0x1357000500000000, 0x824abf9500000000,
+    0x147ab8e200000000, 0xae2bb17b00000000, 0x381bb60c00000000,
+    0x9b8ed29200000000, 0x0dbed5e500000000, 0xb7efdc7c00000000,
+    0x21dfdb0b00000000, 0xd4d2d38600000000, 0x42e2d4f100000000,
+    0xf8b3dd6800000000, 0x6e83da1f00000000, 0xcd16be8100000000,
+    0x5b26b9f600000000, 0xe177b06f00000000, 0x7747b71800000000,
+    0xe65a088800000000, 0x706a0fff00000000, 0xca3b066600000000,
+    0x5c0b011100000000, 0xff9e658f00000000, 0x69ae62f800000000,
+    0xd3ff6b6100000000, 0x45cf6c1600000000, 0x78e20aa000000000,
+    0xeed20dd700000000, 0x5483044e00000000, 0xc2b3033900000000,
+    0x612667a700000000, 0xf71660d000000000, 0x4d47694900000000,
+    0xdb776e3e00000000, 0x4a6ad1ae00000000, 0xdc5ad6d900000000,
+    0x660bdf4000000000, 0xf03bd83700000000, 0x53aebca900000000,
+    0xc59ebbde00000000, 0x7fcfb24700000000, 0xe9ffb53000000000,
+    0x1cf2bdbd00000000, 0x8ac2baca00000000, 0x3093b35300000000,
+    0xa6a3b42400000000, 0x0536d0ba00000000, 0x9306d7cd00000000,
+    0x2957de5400000000, 0xbf67d92300000000, 0x2e7a66b300000000,
+    0xb84a61c400000000, 0x021b685d00000000, 0x942b6f2a00000000,
+    0x37be0bb400000000, 0xa18e0cc300000000, 0x1bdf055a00000000,
+    0x8def022d00000000},
+   {0x0000000000000000, 0x41311b1900000000, 0x8262363200000000,
+    0xc3532d2b00000000, 0x04c56c6400000000, 0x45f4777d00000000,
+    0x86a75a5600000000, 0xc796414f00000000, 0x088ad9c800000000,
+    0x49bbc2d100000000, 0x8ae8effa00000000, 0xcbd9f4e300000000,
+    0x0c4fb5ac00000000, 0x4d7eaeb500000000, 0x8e2d839e00000000,
+    0xcf1c988700000000, 0x5112c24a00000000, 0x1023d95300000000,
+    0xd370f47800000000, 0x9241ef6100000000, 0x55d7ae2e00000000,
+    0x14e6b53700000000, 0xd7b5981c00000000, 0x9684830500000000,
+    0x59981b8200000000, 0x18a9009b00000000, 0xdbfa2db000000000,
+    0x9acb36a900000000, 0x5d5d77e600000000, 0x1c6c6cff00000000,
+    0xdf3f41d400000000, 0x9e0e5acd00000000, 0xa224849500000000,
+    0xe3159f8c00000000, 0x2046b2a700000000, 0x6177a9be00000000,
+    0xa6e1e8f100000000, 0xe7d0f3e800000000, 0x2483dec300000000,
+    0x65b2c5da00000000, 0xaaae5d5d00000000, 0xeb9f464400000000,
+    0x28cc6b6f00000000, 0x69fd707600000000, 0xae6b313900000000,
+    0xef5a2a2000000000, 0x2c09070b00000000, 0x6d381c1200000000,
+    0xf33646df00000000, 0xb2075dc600000000, 0x715470ed00000000,
+    0x30656bf400000000, 0xf7f32abb00000000, 0xb6c231a200000000,
+    0x75911c8900000000, 0x34a0079000000000, 0xfbbc9f1700000000,
+    0xba8d840e00000000, 0x79dea92500000000, 0x38efb23c00000000,
+    0xff79f37300000000, 0xbe48e86a00000000, 0x7d1bc54100000000,
+    0x3c2ade5800000000, 0x054f79f000000000, 0x447e62e900000000,
+    0x872d4fc200000000, 0xc61c54db00000000, 0x018a159400000000,
+    0x40bb0e8d00000000, 0x83e823a600000000, 0xc2d938bf00000000,
+    0x0dc5a03800000000, 0x4cf4bb2100000000, 0x8fa7960a00000000,
+    0xce968d1300000000, 0x0900cc5c00000000, 0x4831d74500000000,
+    0x8b62fa6e00000000, 0xca53e17700000000, 0x545dbbba00000000,
+    0x156ca0a300000000, 0xd63f8d8800000000, 0x970e969100000000,
+    0x5098d7de00000000, 0x11a9ccc700000000, 0xd2fae1ec00000000,
+    0x93cbfaf500000000, 0x5cd7627200000000, 0x1de6796b00000000,
+    0xdeb5544000000000, 0x9f844f5900000000, 0x58120e1600000000,
+    0x1923150f00000000, 0xda70382400000000, 0x9b41233d00000000,
+    0xa76bfd6500000000, 0xe65ae67c00000000, 0x2509cb5700000000,
+    0x6438d04e00000000, 0xa3ae910100000000, 0xe29f8a1800000000,
+    0x21cca73300000000, 0x60fdbc2a00000000, 0xafe124ad00000000,
+    0xeed03fb400000000, 0x2d83129f00000000, 0x6cb2098600000000,
+    0xab2448c900000000, 0xea1553d000000000, 0x29467efb00000000,
+    0x687765e200000000, 0xf6793f2f00000000, 0xb748243600000000,
+    0x741b091d00000000, 0x352a120400000000, 0xf2bc534b00000000,
+    0xb38d485200000000, 0x70de657900000000, 0x31ef7e6000000000,
+    0xfef3e6e700000000, 0xbfc2fdfe00000000, 0x7c91d0d500000000,
+    0x3da0cbcc00000000, 0xfa368a8300000000, 0xbb07919a00000000,
+    0x7854bcb100000000, 0x3965a7a800000000, 0x4b98833b00000000,
+    0x0aa9982200000000, 0xc9fab50900000000, 0x88cbae1000000000,
+    0x4f5def5f00000000, 0x0e6cf44600000000, 0xcd3fd96d00000000,
+    0x8c0ec27400000000, 0x43125af300000000, 0x022341ea00000000,
+    0xc1706cc100000000, 0x804177d800000000, 0x47d7369700000000,
+    0x06e62d8e00000000, 0xc5b500a500000000, 0x84841bbc00000000,
+    0x1a8a417100000000, 0x5bbb5a6800000000, 0x98e8774300000000,
+    0xd9d96c5a00000000, 0x1e4f2d1500000000, 0x5f7e360c00000000,
+    0x9c2d1b2700000000, 0xdd1c003e00000000, 0x120098b900000000,
+    0x533183a000000000, 0x9062ae8b00000000, 0xd153b59200000000,
+    0x16c5f4dd00000000, 0x57f4efc400000000, 0x94a7c2ef00000000,
+    0xd596d9f600000000, 0xe9bc07ae00000000, 0xa88d1cb700000000,
+    0x6bde319c00000000, 0x2aef2a8500000000, 0xed796bca00000000,
+    0xac4870d300000000, 0x6f1b5df800000000, 0x2e2a46e100000000,
+    0xe136de6600000000, 0xa007c57f00000000, 0x6354e85400000000,
+    0x2265f34d00000000, 0xe5f3b20200000000, 0xa4c2a91b00000000,
+    0x6791843000000000, 0x26a09f2900000000, 0xb8aec5e400000000,
+    0xf99fdefd00000000, 0x3accf3d600000000, 0x7bfde8cf00000000,
+    0xbc6ba98000000000, 0xfd5ab29900000000, 0x3e099fb200000000,
+    0x7f3884ab00000000, 0xb0241c2c00000000, 0xf115073500000000,
+    0x32462a1e00000000, 0x7377310700000000, 0xb4e1704800000000,
+    0xf5d06b5100000000, 0x3683467a00000000, 0x77b25d6300000000,
+    0x4ed7facb00000000, 0x0fe6e1d200000000, 0xccb5ccf900000000,
+    0x8d84d7e000000000, 0x4a1296af00000000, 0x0b238db600000000,
+    0xc870a09d00000000, 0x8941bb8400000000, 0x465d230300000000,
+    0x076c381a00000000, 0xc43f153100000000, 0x850e0e2800000000,
+    0x42984f6700000000, 0x03a9547e00000000, 0xc0fa795500000000,
+    0x81cb624c00000000, 0x1fc5388100000000, 0x5ef4239800000000,
+    0x9da70eb300000000, 0xdc9615aa00000000, 0x1b0054e500000000,
+    0x5a314ffc00000000, 0x996262d700000000, 0xd85379ce00000000,
+    0x174fe14900000000, 0x567efa5000000000, 0x952dd77b00000000,
+    0xd41ccc6200000000, 0x138a8d2d00000000, 0x52bb963400000000,
+    0x91e8bb1f00000000, 0xd0d9a00600000000, 0xecf37e5e00000000,
+    0xadc2654700000000, 0x6e91486c00000000, 0x2fa0537500000000,
+    0xe836123a00000000, 0xa907092300000000, 0x6a54240800000000,
+    0x2b653f1100000000, 0xe479a79600000000, 0xa548bc8f00000000,
+    0x661b91a400000000, 0x272a8abd00000000, 0xe0bccbf200000000,
+    0xa18dd0eb00000000, 0x62defdc000000000, 0x23efe6d900000000,
+    0xbde1bc1400000000, 0xfcd0a70d00000000, 0x3f838a2600000000,
+    0x7eb2913f00000000, 0xb924d07000000000, 0xf815cb6900000000,
+    0x3b46e64200000000, 0x7a77fd5b00000000, 0xb56b65dc00000000,
+    0xf45a7ec500000000, 0x370953ee00000000, 0x763848f700000000,
+    0xb1ae09b800000000, 0xf09f12a100000000, 0x33cc3f8a00000000,
+    0x72fd249300000000},
+   {0x0000000000000000, 0x376ac20100000000, 0x6ed4840300000000,
+    0x59be460200000000, 0xdca8090700000000, 0xebc2cb0600000000,
+    0xb27c8d0400000000, 0x85164f0500000000, 0xb851130e00000000,
+    0x8f3bd10f00000000, 0xd685970d00000000, 0xe1ef550c00000000,
+    0x64f91a0900000000, 0x5393d80800000000, 0x0a2d9e0a00000000,
+    0x3d475c0b00000000, 0x70a3261c00000000, 0x47c9e41d00000000,
+    0x1e77a21f00000000, 0x291d601e00000000, 0xac0b2f1b00000000,
+    0x9b61ed1a00000000, 0xc2dfab1800000000, 0xf5b5691900000000,
+    0xc8f2351200000000, 0xff98f71300000000, 0xa626b11100000000,
+    0x914c731000000000, 0x145a3c1500000000, 0x2330fe1400000000,
+    0x7a8eb81600000000, 0x4de47a1700000000, 0xe0464d3800000000,
+    0xd72c8f3900000000, 0x8e92c93b00000000, 0xb9f80b3a00000000,
+    0x3cee443f00000000, 0x0b84863e00000000, 0x523ac03c00000000,
+    0x6550023d00000000, 0x58175e3600000000, 0x6f7d9c3700000000,
+    0x36c3da3500000000, 0x01a9183400000000, 0x84bf573100000000,
+    0xb3d5953000000000, 0xea6bd33200000000, 0xdd01113300000000,
+    0x90e56b2400000000, 0xa78fa92500000000, 0xfe31ef2700000000,
+    0xc95b2d2600000000, 0x4c4d622300000000, 0x7b27a02200000000,
+    0x2299e62000000000, 0x15f3242100000000, 0x28b4782a00000000,
+    0x1fdeba2b00000000, 0x4660fc2900000000, 0x710a3e2800000000,
+    0xf41c712d00000000, 0xc376b32c00000000, 0x9ac8f52e00000000,
+    0xada2372f00000000, 0xc08d9a7000000000, 0xf7e7587100000000,
+    0xae591e7300000000, 0x9933dc7200000000, 0x1c25937700000000,
+    0x2b4f517600000000, 0x72f1177400000000, 0x459bd57500000000,
+    0x78dc897e00000000, 0x4fb64b7f00000000, 0x16080d7d00000000,
+    0x2162cf7c00000000, 0xa474807900000000, 0x931e427800000000,
+    0xcaa0047a00000000, 0xfdcac67b00000000, 0xb02ebc6c00000000,
+    0x87447e6d00000000, 0xdefa386f00000000, 0xe990fa6e00000000,
+    0x6c86b56b00000000, 0x5bec776a00000000, 0x0252316800000000,
+    0x3538f36900000000, 0x087faf6200000000, 0x3f156d6300000000,
+    0x66ab2b6100000000, 0x51c1e96000000000, 0xd4d7a66500000000,
+    0xe3bd646400000000, 0xba03226600000000, 0x8d69e06700000000,
+    0x20cbd74800000000, 0x17a1154900000000, 0x4e1f534b00000000,
+    0x7975914a00000000, 0xfc63de4f00000000, 0xcb091c4e00000000,
+    0x92b75a4c00000000, 0xa5dd984d00000000, 0x989ac44600000000,
+    0xaff0064700000000, 0xf64e404500000000, 0xc124824400000000,
+    0x4432cd4100000000, 0x73580f4000000000, 0x2ae6494200000000,
+    0x1d8c8b4300000000, 0x5068f15400000000, 0x6702335500000000,
+    0x3ebc755700000000, 0x09d6b75600000000, 0x8cc0f85300000000,
+    0xbbaa3a5200000000, 0xe2147c5000000000, 0xd57ebe5100000000,
+    0xe839e25a00000000, 0xdf53205b00000000, 0x86ed665900000000,
+    0xb187a45800000000, 0x3491eb5d00000000, 0x03fb295c00000000,
+    0x5a456f5e00000000, 0x6d2fad5f00000000, 0x801b35e100000000,
+    0xb771f7e000000000, 0xeecfb1e200000000, 0xd9a573e300000000,
+    0x5cb33ce600000000, 0x6bd9fee700000000, 0x3267b8e500000000,
+    0x050d7ae400000000, 0x384a26ef00000000, 0x0f20e4ee00000000,
+    0x569ea2ec00000000, 0x61f460ed00000000, 0xe4e22fe800000000,
+    0xd388ede900000000, 0x8a36abeb00000000, 0xbd5c69ea00000000,
+    0xf0b813fd00000000, 0xc7d2d1fc00000000, 0x9e6c97fe00000000,
+    0xa90655ff00000000, 0x2c101afa00000000, 0x1b7ad8fb00000000,
+    0x42c49ef900000000, 0x75ae5cf800000000, 0x48e900f300000000,
+    0x7f83c2f200000000, 0x263d84f000000000, 0x115746f100000000,
+    0x944109f400000000, 0xa32bcbf500000000, 0xfa958df700000000,
+    0xcdff4ff600000000, 0x605d78d900000000, 0x5737bad800000000,
+    0x0e89fcda00000000, 0x39e33edb00000000, 0xbcf571de00000000,
+    0x8b9fb3df00000000, 0xd221f5dd00000000, 0xe54b37dc00000000,
+    0xd80c6bd700000000, 0xef66a9d600000000, 0xb6d8efd400000000,
+    0x81b22dd500000000, 0x04a462d000000000, 0x33cea0d100000000,
+    0x6a70e6d300000000, 0x5d1a24d200000000, 0x10fe5ec500000000,
+    0x27949cc400000000, 0x7e2adac600000000, 0x494018c700000000,
+    0xcc5657c200000000, 0xfb3c95c300000000, 0xa282d3c100000000,
+    0x95e811c000000000, 0xa8af4dcb00000000, 0x9fc58fca00000000,
+    0xc67bc9c800000000, 0xf1110bc900000000, 0x740744cc00000000,
+    0x436d86cd00000000, 0x1ad3c0cf00000000, 0x2db902ce00000000,
+    0x4096af9100000000, 0x77fc6d9000000000, 0x2e422b9200000000,
+    0x1928e99300000000, 0x9c3ea69600000000, 0xab54649700000000,
+    0xf2ea229500000000, 0xc580e09400000000, 0xf8c7bc9f00000000,
+    0xcfad7e9e00000000, 0x9613389c00000000, 0xa179fa9d00000000,
+    0x246fb59800000000, 0x1305779900000000, 0x4abb319b00000000,
+    0x7dd1f39a00000000, 0x3035898d00000000, 0x075f4b8c00000000,
+    0x5ee10d8e00000000, 0x698bcf8f00000000, 0xec9d808a00000000,
+    0xdbf7428b00000000, 0x8249048900000000, 0xb523c68800000000,
+    0x88649a8300000000, 0xbf0e588200000000, 0xe6b01e8000000000,
+    0xd1dadc8100000000, 0x54cc938400000000, 0x63a6518500000000,
+    0x3a18178700000000, 0x0d72d58600000000, 0xa0d0e2a900000000,
+    0x97ba20a800000000, 0xce0466aa00000000, 0xf96ea4ab00000000,
+    0x7c78ebae00000000, 0x4b1229af00000000, 0x12ac6fad00000000,
+    0x25c6adac00000000, 0x1881f1a700000000, 0x2feb33a600000000,
+    0x765575a400000000, 0x413fb7a500000000, 0xc429f8a000000000,
+    0xf3433aa100000000, 0xaafd7ca300000000, 0x9d97bea200000000,
+    0xd073c4b500000000, 0xe71906b400000000, 0xbea740b600000000,
+    0x89cd82b700000000, 0x0cdbcdb200000000, 0x3bb10fb300000000,
+    0x620f49b100000000, 0x55658bb000000000, 0x6822d7bb00000000,
+    0x5f4815ba00000000, 0x06f653b800000000, 0x319c91b900000000,
+    0xb48adebc00000000, 0x83e01cbd00000000, 0xda5e5abf00000000,
+    0xed3498be00000000},
+   {0x0000000000000000, 0x6567bcb800000000, 0x8bc809aa00000000,
+    0xeeafb51200000000, 0x5797628f00000000, 0x32f0de3700000000,
+    0xdc5f6b2500000000, 0xb938d79d00000000, 0xef28b4c500000000,
+    0x8a4f087d00000000, 0x64e0bd6f00000000, 0x018701d700000000,
+    0xb8bfd64a00000000, 0xddd86af200000000, 0x3377dfe000000000,
+    0x5610635800000000, 0x9f57195000000000, 0xfa30a5e800000000,
+    0x149f10fa00000000, 0x71f8ac4200000000, 0xc8c07bdf00000000,
+    0xada7c76700000000, 0x4308727500000000, 0x266fcecd00000000,
+    0x707fad9500000000, 0x1518112d00000000, 0xfbb7a43f00000000,
+    0x9ed0188700000000, 0x27e8cf1a00000000, 0x428f73a200000000,
+    0xac20c6b000000000, 0xc9477a0800000000, 0x3eaf32a000000000,
+    0x5bc88e1800000000, 0xb5673b0a00000000, 0xd00087b200000000,
+    0x6938502f00000000, 0x0c5fec9700000000, 0xe2f0598500000000,
+    0x8797e53d00000000, 0xd187866500000000, 0xb4e03add00000000,
+    0x5a4f8fcf00000000, 0x3f28337700000000, 0x8610e4ea00000000,
+    0xe377585200000000, 0x0dd8ed4000000000, 0x68bf51f800000000,
+    0xa1f82bf000000000, 0xc49f974800000000, 0x2a30225a00000000,
+    0x4f579ee200000000, 0xf66f497f00000000, 0x9308f5c700000000,
+    0x7da740d500000000, 0x18c0fc6d00000000, 0x4ed09f3500000000,
+    0x2bb7238d00000000, 0xc518969f00000000, 0xa07f2a2700000000,
+    0x1947fdba00000000, 0x7c20410200000000, 0x928ff41000000000,
+    0xf7e848a800000000, 0x3d58149b00000000, 0x583fa82300000000,
+    0xb6901d3100000000, 0xd3f7a18900000000, 0x6acf761400000000,
+    0x0fa8caac00000000, 0xe1077fbe00000000, 0x8460c30600000000,
+    0xd270a05e00000000, 0xb7171ce600000000, 0x59b8a9f400000000,
+    0x3cdf154c00000000, 0x85e7c2d100000000, 0xe0807e6900000000,
+    0x0e2fcb7b00000000, 0x6b4877c300000000, 0xa20f0dcb00000000,
+    0xc768b17300000000, 0x29c7046100000000, 0x4ca0b8d900000000,
+    0xf5986f4400000000, 0x90ffd3fc00000000, 0x7e5066ee00000000,
+    0x1b37da5600000000, 0x4d27b90e00000000, 0x284005b600000000,
+    0xc6efb0a400000000, 0xa3880c1c00000000, 0x1ab0db8100000000,
+    0x7fd7673900000000, 0x9178d22b00000000, 0xf41f6e9300000000,
+    0x03f7263b00000000, 0x66909a8300000000, 0x883f2f9100000000,
+    0xed58932900000000, 0x546044b400000000, 0x3107f80c00000000,
+    0xdfa84d1e00000000, 0xbacff1a600000000, 0xecdf92fe00000000,
+    0x89b82e4600000000, 0x67179b5400000000, 0x027027ec00000000,
+    0xbb48f07100000000, 0xde2f4cc900000000, 0x3080f9db00000000,
+    0x55e7456300000000, 0x9ca03f6b00000000, 0xf9c783d300000000,
+    0x176836c100000000, 0x720f8a7900000000, 0xcb375de400000000,
+    0xae50e15c00000000, 0x40ff544e00000000, 0x2598e8f600000000,
+    0x73888bae00000000, 0x16ef371600000000, 0xf840820400000000,
+    0x9d273ebc00000000, 0x241fe92100000000, 0x4178559900000000,
+    0xafd7e08b00000000, 0xcab05c3300000000, 0x3bb659ed00000000,
+    0x5ed1e55500000000, 0xb07e504700000000, 0xd519ecff00000000,
+    0x6c213b6200000000, 0x094687da00000000, 0xe7e932c800000000,
+    0x828e8e7000000000, 0xd49eed2800000000, 0xb1f9519000000000,
+    0x5f56e48200000000, 0x3a31583a00000000, 0x83098fa700000000,
+    0xe66e331f00000000, 0x08c1860d00000000, 0x6da63ab500000000,
+    0xa4e140bd00000000, 0xc186fc0500000000, 0x2f29491700000000,
+    0x4a4ef5af00000000, 0xf376223200000000, 0x96119e8a00000000,
+    0x78be2b9800000000, 0x1dd9972000000000, 0x4bc9f47800000000,
+    0x2eae48c000000000, 0xc001fdd200000000, 0xa566416a00000000,
+    0x1c5e96f700000000, 0x79392a4f00000000, 0x97969f5d00000000,
+    0xf2f123e500000000, 0x05196b4d00000000, 0x607ed7f500000000,
+    0x8ed162e700000000, 0xebb6de5f00000000, 0x528e09c200000000,
+    0x37e9b57a00000000, 0xd946006800000000, 0xbc21bcd000000000,
+    0xea31df8800000000, 0x8f56633000000000, 0x61f9d62200000000,
+    0x049e6a9a00000000, 0xbda6bd0700000000, 0xd8c101bf00000000,
+    0x366eb4ad00000000, 0x5309081500000000, 0x9a4e721d00000000,
+    0xff29cea500000000, 0x11867bb700000000, 0x74e1c70f00000000,
+    0xcdd9109200000000, 0xa8beac2a00000000, 0x4611193800000000,
+    0x2376a58000000000, 0x7566c6d800000000, 0x10017a6000000000,
+    0xfeaecf7200000000, 0x9bc973ca00000000, 0x22f1a45700000000,
+    0x479618ef00000000, 0xa939adfd00000000, 0xcc5e114500000000,
+    0x06ee4d7600000000, 0x6389f1ce00000000, 0x8d2644dc00000000,
+    0xe841f86400000000, 0x51792ff900000000, 0x341e934100000000,
+    0xdab1265300000000, 0xbfd69aeb00000000, 0xe9c6f9b300000000,
+    0x8ca1450b00000000, 0x620ef01900000000, 0x07694ca100000000,
+    0xbe519b3c00000000, 0xdb36278400000000, 0x3599929600000000,
+    0x50fe2e2e00000000, 0x99b9542600000000, 0xfcdee89e00000000,
+    0x12715d8c00000000, 0x7716e13400000000, 0xce2e36a900000000,
+    0xab498a1100000000, 0x45e63f0300000000, 0x208183bb00000000,
+    0x7691e0e300000000, 0x13f65c5b00000000, 0xfd59e94900000000,
+    0x983e55f100000000, 0x2106826c00000000, 0x44613ed400000000,
+    0xaace8bc600000000, 0xcfa9377e00000000, 0x38417fd600000000,
+    0x5d26c36e00000000, 0xb389767c00000000, 0xd6eecac400000000,
+    0x6fd61d5900000000, 0x0ab1a1e100000000, 0xe41e14f300000000,
+    0x8179a84b00000000, 0xd769cb1300000000, 0xb20e77ab00000000,
+    0x5ca1c2b900000000, 0x39c67e0100000000, 0x80fea99c00000000,
+    0xe599152400000000, 0x0b36a03600000000, 0x6e511c8e00000000,
+    0xa716668600000000, 0xc271da3e00000000, 0x2cde6f2c00000000,
+    0x49b9d39400000000, 0xf081040900000000, 0x95e6b8b100000000,
+    0x7b490da300000000, 0x1e2eb11b00000000, 0x483ed24300000000,
+    0x2d596efb00000000, 0xc3f6dbe900000000, 0xa691675100000000,
+    0x1fa9b0cc00000000, 0x7ace0c7400000000, 0x9461b96600000000,
+    0xf10605de00000000},
+   {0x0000000000000000, 0xb029603d00000000, 0x6053c07a00000000,
+    0xd07aa04700000000, 0xc0a680f500000000, 0x708fe0c800000000,
+    0xa0f5408f00000000, 0x10dc20b200000000, 0xc14b703000000000,
+    0x7162100d00000000, 0xa118b04a00000000, 0x1131d07700000000,
+    0x01edf0c500000000, 0xb1c490f800000000, 0x61be30bf00000000,
+    0xd197508200000000, 0x8297e06000000000, 0x32be805d00000000,
+    0xe2c4201a00000000, 0x52ed402700000000, 0x4231609500000000,
+    0xf21800a800000000, 0x2262a0ef00000000, 0x924bc0d200000000,
+    0x43dc905000000000, 0xf3f5f06d00000000, 0x238f502a00000000,
+    0x93a6301700000000, 0x837a10a500000000, 0x3353709800000000,
+    0xe329d0df00000000, 0x5300b0e200000000, 0x042fc1c100000000,
+    0xb406a1fc00000000, 0x647c01bb00000000, 0xd455618600000000,
+    0xc489413400000000, 0x74a0210900000000, 0xa4da814e00000000,
+    0x14f3e17300000000, 0xc564b1f100000000, 0x754dd1cc00000000,
+    0xa537718b00000000, 0x151e11b600000000, 0x05c2310400000000,
+    0xb5eb513900000000, 0x6591f17e00000000, 0xd5b8914300000000,
+    0x86b821a100000000, 0x3691419c00000000, 0xe6ebe1db00000000,
+    0x56c281e600000000, 0x461ea15400000000, 0xf637c16900000000,
+    0x264d612e00000000, 0x9664011300000000, 0x47f3519100000000,
+    0xf7da31ac00000000, 0x27a091eb00000000, 0x9789f1d600000000,
+    0x8755d16400000000, 0x377cb15900000000, 0xe706111e00000000,
+    0x572f712300000000, 0x4958f35800000000, 0xf971936500000000,
+    0x290b332200000000, 0x9922531f00000000, 0x89fe73ad00000000,
+    0x39d7139000000000, 0xe9adb3d700000000, 0x5984d3ea00000000,
+    0x8813836800000000, 0x383ae35500000000, 0xe840431200000000,
+    0x5869232f00000000, 0x48b5039d00000000, 0xf89c63a000000000,
+    0x28e6c3e700000000, 0x98cfa3da00000000, 0xcbcf133800000000,
+    0x7be6730500000000, 0xab9cd34200000000, 0x1bb5b37f00000000,
+    0x0b6993cd00000000, 0xbb40f3f000000000, 0x6b3a53b700000000,
+    0xdb13338a00000000, 0x0a84630800000000, 0xbaad033500000000,
+    0x6ad7a37200000000, 0xdafec34f00000000, 0xca22e3fd00000000,
+    0x7a0b83c000000000, 0xaa71238700000000, 0x1a5843ba00000000,
+    0x4d77329900000000, 0xfd5e52a400000000, 0x2d24f2e300000000,
+    0x9d0d92de00000000, 0x8dd1b26c00000000, 0x3df8d25100000000,
+    0xed82721600000000, 0x5dab122b00000000, 0x8c3c42a900000000,
+    0x3c15229400000000, 0xec6f82d300000000, 0x5c46e2ee00000000,
+    0x4c9ac25c00000000, 0xfcb3a26100000000, 0x2cc9022600000000,
+    0x9ce0621b00000000, 0xcfe0d2f900000000, 0x7fc9b2c400000000,
+    0xafb3128300000000, 0x1f9a72be00000000, 0x0f46520c00000000,
+    0xbf6f323100000000, 0x6f15927600000000, 0xdf3cf24b00000000,
+    0x0eaba2c900000000, 0xbe82c2f400000000, 0x6ef862b300000000,
+    0xded1028e00000000, 0xce0d223c00000000, 0x7e24420100000000,
+    0xae5ee24600000000, 0x1e77827b00000000, 0x92b0e6b100000000,
+    0x2299868c00000000, 0xf2e326cb00000000, 0x42ca46f600000000,
+    0x5216664400000000, 0xe23f067900000000, 0x3245a63e00000000,
+    0x826cc60300000000, 0x53fb968100000000, 0xe3d2f6bc00000000,
+    0x33a856fb00000000, 0x838136c600000000, 0x935d167400000000,
+    0x2374764900000000, 0xf30ed60e00000000, 0x4327b63300000000,
+    0x102706d100000000, 0xa00e66ec00000000, 0x7074c6ab00000000,
+    0xc05da69600000000, 0xd081862400000000, 0x60a8e61900000000,
+    0xb0d2465e00000000, 0x00fb266300000000, 0xd16c76e100000000,
+    0x614516dc00000000, 0xb13fb69b00000000, 0x0116d6a600000000,
+    0x11caf61400000000, 0xa1e3962900000000, 0x7199366e00000000,
+    0xc1b0565300000000, 0x969f277000000000, 0x26b6474d00000000,
+    0xf6cce70a00000000, 0x46e5873700000000, 0x5639a78500000000,
+    0xe610c7b800000000, 0x366a67ff00000000, 0x864307c200000000,
+    0x57d4574000000000, 0xe7fd377d00000000, 0x3787973a00000000,
+    0x87aef70700000000, 0x9772d7b500000000, 0x275bb78800000000,
+    0xf72117cf00000000, 0x470877f200000000, 0x1408c71000000000,
+    0xa421a72d00000000, 0x745b076a00000000, 0xc472675700000000,
+    0xd4ae47e500000000, 0x648727d800000000, 0xb4fd879f00000000,
+    0x04d4e7a200000000, 0xd543b72000000000, 0x656ad71d00000000,
+    0xb510775a00000000, 0x0539176700000000, 0x15e537d500000000,
+    0xa5cc57e800000000, 0x75b6f7af00000000, 0xc59f979200000000,
+    0xdbe815e900000000, 0x6bc175d400000000, 0xbbbbd59300000000,
+    0x0b92b5ae00000000, 0x1b4e951c00000000, 0xab67f52100000000,
+    0x7b1d556600000000, 0xcb34355b00000000, 0x1aa365d900000000,
+    0xaa8a05e400000000, 0x7af0a5a300000000, 0xcad9c59e00000000,
+    0xda05e52c00000000, 0x6a2c851100000000, 0xba56255600000000,
+    0x0a7f456b00000000, 0x597ff58900000000, 0xe95695b400000000,
+    0x392c35f300000000, 0x890555ce00000000, 0x99d9757c00000000,
+    0x29f0154100000000, 0xf98ab50600000000, 0x49a3d53b00000000,
+    0x983485b900000000, 0x281de58400000000, 0xf86745c300000000,
+    0x484e25fe00000000, 0x5892054c00000000, 0xe8bb657100000000,
+    0x38c1c53600000000, 0x88e8a50b00000000, 0xdfc7d42800000000,
+    0x6feeb41500000000, 0xbf94145200000000, 0x0fbd746f00000000,
+    0x1f6154dd00000000, 0xaf4834e000000000, 0x7f3294a700000000,
+    0xcf1bf49a00000000, 0x1e8ca41800000000, 0xaea5c42500000000,
+    0x7edf646200000000, 0xcef6045f00000000, 0xde2a24ed00000000,
+    0x6e0344d000000000, 0xbe79e49700000000, 0x0e5084aa00000000,
+    0x5d50344800000000, 0xed79547500000000, 0x3d03f43200000000,
+    0x8d2a940f00000000, 0x9df6b4bd00000000, 0x2ddfd48000000000,
+    0xfda574c700000000, 0x4d8c14fa00000000, 0x9c1b447800000000,
+    0x2c32244500000000, 0xfc48840200000000, 0x4c61e43f00000000,
+    0x5cbdc48d00000000, 0xec94a4b000000000, 0x3cee04f700000000,
+    0x8cc764ca00000000},
+   {0x0000000000000000, 0xa5d35ccb00000000, 0x0ba1c84d00000000,
+    0xae72948600000000, 0x1642919b00000000, 0xb391cd5000000000,
+    0x1de359d600000000, 0xb830051d00000000, 0x6d8253ec00000000,
+    0xc8510f2700000000, 0x66239ba100000000, 0xc3f0c76a00000000,
+    0x7bc0c27700000000, 0xde139ebc00000000, 0x70610a3a00000000,
+    0xd5b256f100000000, 0x9b02d60300000000, 0x3ed18ac800000000,
+    0x90a31e4e00000000, 0x3570428500000000, 0x8d40479800000000,
+    0x28931b5300000000, 0x86e18fd500000000, 0x2332d31e00000000,
+    0xf68085ef00000000, 0x5353d92400000000, 0xfd214da200000000,
+    0x58f2116900000000, 0xe0c2147400000000, 0x451148bf00000000,
+    0xeb63dc3900000000, 0x4eb080f200000000, 0x3605ac0700000000,
+    0x93d6f0cc00000000, 0x3da4644a00000000, 0x9877388100000000,
+    0x20473d9c00000000, 0x8594615700000000, 0x2be6f5d100000000,
+    0x8e35a91a00000000, 0x5b87ffeb00000000, 0xfe54a32000000000,
+    0x502637a600000000, 0xf5f56b6d00000000, 0x4dc56e7000000000,
+    0xe81632bb00000000, 0x4664a63d00000000, 0xe3b7faf600000000,
+    0xad077a0400000000, 0x08d426cf00000000, 0xa6a6b24900000000,
+    0x0375ee8200000000, 0xbb45eb9f00000000, 0x1e96b75400000000,
+    0xb0e423d200000000, 0x15377f1900000000, 0xc08529e800000000,
+    0x6556752300000000, 0xcb24e1a500000000, 0x6ef7bd6e00000000,
+    0xd6c7b87300000000, 0x7314e4b800000000, 0xdd66703e00000000,
+    0x78b52cf500000000, 0x6c0a580f00000000, 0xc9d904c400000000,
+    0x67ab904200000000, 0xc278cc8900000000, 0x7a48c99400000000,
+    0xdf9b955f00000000, 0x71e901d900000000, 0xd43a5d1200000000,
+    0x01880be300000000, 0xa45b572800000000, 0x0a29c3ae00000000,
+    0xaffa9f6500000000, 0x17ca9a7800000000, 0xb219c6b300000000,
+    0x1c6b523500000000, 0xb9b80efe00000000, 0xf7088e0c00000000,
+    0x52dbd2c700000000, 0xfca9464100000000, 0x597a1a8a00000000,
+    0xe14a1f9700000000, 0x4499435c00000000, 0xeaebd7da00000000,
+    0x4f388b1100000000, 0x9a8adde000000000, 0x3f59812b00000000,
+    0x912b15ad00000000, 0x34f8496600000000, 0x8cc84c7b00000000,
+    0x291b10b000000000, 0x8769843600000000, 0x22bad8fd00000000,
+    0x5a0ff40800000000, 0xffdca8c300000000, 0x51ae3c4500000000,
+    0xf47d608e00000000, 0x4c4d659300000000, 0xe99e395800000000,
+    0x47ecadde00000000, 0xe23ff11500000000, 0x378da7e400000000,
+    0x925efb2f00000000, 0x3c2c6fa900000000, 0x99ff336200000000,
+    0x21cf367f00000000, 0x841c6ab400000000, 0x2a6efe3200000000,
+    0x8fbda2f900000000, 0xc10d220b00000000, 0x64de7ec000000000,
+    0xcaacea4600000000, 0x6f7fb68d00000000, 0xd74fb39000000000,
+    0x729cef5b00000000, 0xdcee7bdd00000000, 0x793d271600000000,
+    0xac8f71e700000000, 0x095c2d2c00000000, 0xa72eb9aa00000000,
+    0x02fde56100000000, 0xbacde07c00000000, 0x1f1ebcb700000000,
+    0xb16c283100000000, 0x14bf74fa00000000, 0xd814b01e00000000,
+    0x7dc7ecd500000000, 0xd3b5785300000000, 0x7666249800000000,
+    0xce56218500000000, 0x6b857d4e00000000, 0xc5f7e9c800000000,
+    0x6024b50300000000, 0xb596e3f200000000, 0x1045bf3900000000,
+    0xbe372bbf00000000, 0x1be4777400000000, 0xa3d4726900000000,
+    0x06072ea200000000, 0xa875ba2400000000, 0x0da6e6ef00000000,
+    0x4316661d00000000, 0xe6c53ad600000000, 0x48b7ae5000000000,
+    0xed64f29b00000000, 0x5554f78600000000, 0xf087ab4d00000000,
+    0x5ef53fcb00000000, 0xfb26630000000000, 0x2e9435f100000000,
+    0x8b47693a00000000, 0x2535fdbc00000000, 0x80e6a17700000000,
+    0x38d6a46a00000000, 0x9d05f8a100000000, 0x33776c2700000000,
+    0x96a430ec00000000, 0xee111c1900000000, 0x4bc240d200000000,
+    0xe5b0d45400000000, 0x4063889f00000000, 0xf8538d8200000000,
+    0x5d80d14900000000, 0xf3f245cf00000000, 0x5621190400000000,
+    0x83934ff500000000, 0x2640133e00000000, 0x883287b800000000,
+    0x2de1db7300000000, 0x95d1de6e00000000, 0x300282a500000000,
+    0x9e70162300000000, 0x3ba34ae800000000, 0x7513ca1a00000000,
+    0xd0c096d100000000, 0x7eb2025700000000, 0xdb615e9c00000000,
+    0x63515b8100000000, 0xc682074a00000000, 0x68f093cc00000000,
+    0xcd23cf0700000000, 0x189199f600000000, 0xbd42c53d00000000,
+    0x133051bb00000000, 0xb6e30d7000000000, 0x0ed3086d00000000,
+    0xab0054a600000000, 0x0572c02000000000, 0xa0a19ceb00000000,
+    0xb41ee81100000000, 0x11cdb4da00000000, 0xbfbf205c00000000,
+    0x1a6c7c9700000000, 0xa25c798a00000000, 0x078f254100000000,
+    0xa9fdb1c700000000, 0x0c2eed0c00000000, 0xd99cbbfd00000000,
+    0x7c4fe73600000000, 0xd23d73b000000000, 0x77ee2f7b00000000,
+    0xcfde2a6600000000, 0x6a0d76ad00000000, 0xc47fe22b00000000,
+    0x61acbee000000000, 0x2f1c3e1200000000, 0x8acf62d900000000,
+    0x24bdf65f00000000, 0x816eaa9400000000, 0x395eaf8900000000,
+    0x9c8df34200000000, 0x32ff67c400000000, 0x972c3b0f00000000,
+    0x429e6dfe00000000, 0xe74d313500000000, 0x493fa5b300000000,
+    0xececf97800000000, 0x54dcfc6500000000, 0xf10fa0ae00000000,
+    0x5f7d342800000000, 0xfaae68e300000000, 0x821b441600000000,
+    0x27c818dd00000000, 0x89ba8c5b00000000, 0x2c69d09000000000,
+    0x9459d58d00000000, 0x318a894600000000, 0x9ff81dc000000000,
+    0x3a2b410b00000000, 0xef9917fa00000000, 0x4a4a4b3100000000,
+    0xe438dfb700000000, 0x41eb837c00000000, 0xf9db866100000000,
+    0x5c08daaa00000000, 0xf27a4e2c00000000, 0x57a912e700000000,
+    0x1919921500000000, 0xbccacede00000000, 0x12b85a5800000000,
+    0xb76b069300000000, 0x0f5b038e00000000, 0xaa885f4500000000,
+    0x04facbc300000000, 0xa129970800000000, 0x749bc1f900000000,
+    0xd1489d3200000000, 0x7f3a09b400000000, 0xdae9557f00000000,
+    0x62d9506200000000, 0xc70a0ca900000000, 0x6978982f00000000,
+    0xccabc4e400000000},
+   {0x0000000000000000, 0xb40b77a600000000, 0x29119f9700000000,
+    0x9d1ae83100000000, 0x13244ff400000000, 0xa72f385200000000,
+    0x3a35d06300000000, 0x8e3ea7c500000000, 0x674eef3300000000,
+    0xd345989500000000, 0x4e5f70a400000000, 0xfa54070200000000,
+    0x746aa0c700000000, 0xc061d76100000000, 0x5d7b3f5000000000,
+    0xe97048f600000000, 0xce9cde6700000000, 0x7a97a9c100000000,
+    0xe78d41f000000000, 0x5386365600000000, 0xddb8919300000000,
+    0x69b3e63500000000, 0xf4a90e0400000000, 0x40a279a200000000,
+    0xa9d2315400000000, 0x1dd946f200000000, 0x80c3aec300000000,
+    0x34c8d96500000000, 0xbaf67ea000000000, 0x0efd090600000000,
+    0x93e7e13700000000, 0x27ec969100000000, 0x9c39bdcf00000000,
+    0x2832ca6900000000, 0xb528225800000000, 0x012355fe00000000,
+    0x8f1df23b00000000, 0x3b16859d00000000, 0xa60c6dac00000000,
+    0x12071a0a00000000, 0xfb7752fc00000000, 0x4f7c255a00000000,
+    0xd266cd6b00000000, 0x666dbacd00000000, 0xe8531d0800000000,
+    0x5c586aae00000000, 0xc142829f00000000, 0x7549f53900000000,
+    0x52a563a800000000, 0xe6ae140e00000000, 0x7bb4fc3f00000000,
+    0xcfbf8b9900000000, 0x41812c5c00000000, 0xf58a5bfa00000000,
+    0x6890b3cb00000000, 0xdc9bc46d00000000, 0x35eb8c9b00000000,
+    0x81e0fb3d00000000, 0x1cfa130c00000000, 0xa8f164aa00000000,
+    0x26cfc36f00000000, 0x92c4b4c900000000, 0x0fde5cf800000000,
+    0xbbd52b5e00000000, 0x79750b4400000000, 0xcd7e7ce200000000,
+    0x506494d300000000, 0xe46fe37500000000, 0x6a5144b000000000,
+    0xde5a331600000000, 0x4340db2700000000, 0xf74bac8100000000,
+    0x1e3be47700000000, 0xaa3093d100000000, 0x372a7be000000000,
+    0x83210c4600000000, 0x0d1fab8300000000, 0xb914dc2500000000,
+    0x240e341400000000, 0x900543b200000000, 0xb7e9d52300000000,
+    0x03e2a28500000000, 0x9ef84ab400000000, 0x2af33d1200000000,
+    0xa4cd9ad700000000, 0x10c6ed7100000000, 0x8ddc054000000000,
+    0x39d772e600000000, 0xd0a73a1000000000, 0x64ac4db600000000,
+    0xf9b6a58700000000, 0x4dbdd22100000000, 0xc38375e400000000,
+    0x7788024200000000, 0xea92ea7300000000, 0x5e999dd500000000,
+    0xe54cb68b00000000, 0x5147c12d00000000, 0xcc5d291c00000000,
+    0x78565eba00000000, 0xf668f97f00000000, 0x42638ed900000000,
+    0xdf7966e800000000, 0x6b72114e00000000, 0x820259b800000000,
+    0x36092e1e00000000, 0xab13c62f00000000, 0x1f18b18900000000,
+    0x9126164c00000000, 0x252d61ea00000000, 0xb83789db00000000,
+    0x0c3cfe7d00000000, 0x2bd068ec00000000, 0x9fdb1f4a00000000,
+    0x02c1f77b00000000, 0xb6ca80dd00000000, 0x38f4271800000000,
+    0x8cff50be00000000, 0x11e5b88f00000000, 0xa5eecf2900000000,
+    0x4c9e87df00000000, 0xf895f07900000000, 0x658f184800000000,
+    0xd1846fee00000000, 0x5fbac82b00000000, 0xebb1bf8d00000000,
+    0x76ab57bc00000000, 0xc2a0201a00000000, 0xf2ea168800000000,
+    0x46e1612e00000000, 0xdbfb891f00000000, 0x6ff0feb900000000,
+    0xe1ce597c00000000, 0x55c52eda00000000, 0xc8dfc6eb00000000,
+    0x7cd4b14d00000000, 0x95a4f9bb00000000, 0x21af8e1d00000000,
+    0xbcb5662c00000000, 0x08be118a00000000, 0x8680b64f00000000,
+    0x328bc1e900000000, 0xaf9129d800000000, 0x1b9a5e7e00000000,
+    0x3c76c8ef00000000, 0x887dbf4900000000, 0x1567577800000000,
+    0xa16c20de00000000, 0x2f52871b00000000, 0x9b59f0bd00000000,
+    0x0643188c00000000, 0xb2486f2a00000000, 0x5b3827dc00000000,
+    0xef33507a00000000, 0x7229b84b00000000, 0xc622cfed00000000,
+    0x481c682800000000, 0xfc171f8e00000000, 0x610df7bf00000000,
+    0xd506801900000000, 0x6ed3ab4700000000, 0xdad8dce100000000,
+    0x47c234d000000000, 0xf3c9437600000000, 0x7df7e4b300000000,
+    0xc9fc931500000000, 0x54e67b2400000000, 0xe0ed0c8200000000,
+    0x099d447400000000, 0xbd9633d200000000, 0x208cdbe300000000,
+    0x9487ac4500000000, 0x1ab90b8000000000, 0xaeb27c2600000000,
+    0x33a8941700000000, 0x87a3e3b100000000, 0xa04f752000000000,
+    0x1444028600000000, 0x895eeab700000000, 0x3d559d1100000000,
+    0xb36b3ad400000000, 0x07604d7200000000, 0x9a7aa54300000000,
+    0x2e71d2e500000000, 0xc7019a1300000000, 0x730aedb500000000,
+    0xee10058400000000, 0x5a1b722200000000, 0xd425d5e700000000,
+    0x602ea24100000000, 0xfd344a7000000000, 0x493f3dd600000000,
+    0x8b9f1dcc00000000, 0x3f946a6a00000000, 0xa28e825b00000000,
+    0x1685f5fd00000000, 0x98bb523800000000, 0x2cb0259e00000000,
+    0xb1aacdaf00000000, 0x05a1ba0900000000, 0xecd1f2ff00000000,
+    0x58da855900000000, 0xc5c06d6800000000, 0x71cb1ace00000000,
+    0xfff5bd0b00000000, 0x4bfecaad00000000, 0xd6e4229c00000000,
+    0x62ef553a00000000, 0x4503c3ab00000000, 0xf108b40d00000000,
+    0x6c125c3c00000000, 0xd8192b9a00000000, 0x56278c5f00000000,
+    0xe22cfbf900000000, 0x7f3613c800000000, 0xcb3d646e00000000,
+    0x224d2c9800000000, 0x96465b3e00000000, 0x0b5cb30f00000000,
+    0xbf57c4a900000000, 0x3169636c00000000, 0x856214ca00000000,
+    0x1878fcfb00000000, 0xac738b5d00000000, 0x17a6a00300000000,
+    0xa3add7a500000000, 0x3eb73f9400000000, 0x8abc483200000000,
+    0x0482eff700000000, 0xb089985100000000, 0x2d93706000000000,
+    0x999807c600000000, 0x70e84f3000000000, 0xc4e3389600000000,
+    0x59f9d0a700000000, 0xedf2a70100000000, 0x63cc00c400000000,
+    0xd7c7776200000000, 0x4add9f5300000000, 0xfed6e8f500000000,
+    0xd93a7e6400000000, 0x6d3109c200000000, 0xf02be1f300000000,
+    0x4420965500000000, 0xca1e319000000000, 0x7e15463600000000,
+    0xe30fae0700000000, 0x5704d9a100000000, 0xbe74915700000000,
+    0x0a7fe6f100000000, 0x97650ec000000000, 0x236e796600000000,
+    0xad50dea300000000, 0x195ba90500000000, 0x8441413400000000,
+    0x304a369200000000},
+   {0x0000000000000000, 0x9e00aacc00000000, 0x7d07254200000000,
+    0xe3078f8e00000000, 0xfa0e4a8400000000, 0x640ee04800000000,
+    0x87096fc600000000, 0x1909c50a00000000, 0xb51be5d300000000,
+    0x2b1b4f1f00000000, 0xc81cc09100000000, 0x561c6a5d00000000,
+    0x4f15af5700000000, 0xd115059b00000000, 0x32128a1500000000,
+    0xac1220d900000000, 0x2b31bb7c00000000, 0xb53111b000000000,
+    0x56369e3e00000000, 0xc83634f200000000, 0xd13ff1f800000000,
+    0x4f3f5b3400000000, 0xac38d4ba00000000, 0x32387e7600000000,
+    0x9e2a5eaf00000000, 0x002af46300000000, 0xe32d7bed00000000,
+    0x7d2dd12100000000, 0x6424142b00000000, 0xfa24bee700000000,
+    0x1923316900000000, 0x87239ba500000000, 0x566276f900000000,
+    0xc862dc3500000000, 0x2b6553bb00000000, 0xb565f97700000000,
+    0xac6c3c7d00000000, 0x326c96b100000000, 0xd16b193f00000000,
+    0x4f6bb3f300000000, 0xe379932a00000000, 0x7d7939e600000000,
+    0x9e7eb66800000000, 0x007e1ca400000000, 0x1977d9ae00000000,
+    0x8777736200000000, 0x6470fcec00000000, 0xfa70562000000000,
+    0x7d53cd8500000000, 0xe353674900000000, 0x0054e8c700000000,
+    0x9e54420b00000000, 0x875d870100000000, 0x195d2dcd00000000,
+    0xfa5aa24300000000, 0x645a088f00000000, 0xc848285600000000,
+    0x5648829a00000000, 0xb54f0d1400000000, 0x2b4fa7d800000000,
+    0x324662d200000000, 0xac46c81e00000000, 0x4f41479000000000,
+    0xd141ed5c00000000, 0xedc29d2900000000, 0x73c237e500000000,
+    0x90c5b86b00000000, 0x0ec512a700000000, 0x17ccd7ad00000000,
+    0x89cc7d6100000000, 0x6acbf2ef00000000, 0xf4cb582300000000,
+    0x58d978fa00000000, 0xc6d9d23600000000, 0x25de5db800000000,
+    0xbbdef77400000000, 0xa2d7327e00000000, 0x3cd798b200000000,
+    0xdfd0173c00000000, 0x41d0bdf000000000, 0xc6f3265500000000,
+    0x58f38c9900000000, 0xbbf4031700000000, 0x25f4a9db00000000,
+    0x3cfd6cd100000000, 0xa2fdc61d00000000, 0x41fa499300000000,
+    0xdffae35f00000000, 0x73e8c38600000000, 0xede8694a00000000,
+    0x0eefe6c400000000, 0x90ef4c0800000000, 0x89e6890200000000,
+    0x17e623ce00000000, 0xf4e1ac4000000000, 0x6ae1068c00000000,
+    0xbba0ebd000000000, 0x25a0411c00000000, 0xc6a7ce9200000000,
+    0x58a7645e00000000, 0x41aea15400000000, 0xdfae0b9800000000,
+    0x3ca9841600000000, 0xa2a92eda00000000, 0x0ebb0e0300000000,
+    0x90bba4cf00000000, 0x73bc2b4100000000, 0xedbc818d00000000,
+    0xf4b5448700000000, 0x6ab5ee4b00000000, 0x89b261c500000000,
+    0x17b2cb0900000000, 0x909150ac00000000, 0x0e91fa6000000000,
+    0xed9675ee00000000, 0x7396df2200000000, 0x6a9f1a2800000000,
+    0xf49fb0e400000000, 0x17983f6a00000000, 0x899895a600000000,
+    0x258ab57f00000000, 0xbb8a1fb300000000, 0x588d903d00000000,
+    0xc68d3af100000000, 0xdf84fffb00000000, 0x4184553700000000,
+    0xa283dab900000000, 0x3c83707500000000, 0xda853b5300000000,
+    0x4485919f00000000, 0xa7821e1100000000, 0x3982b4dd00000000,
+    0x208b71d700000000, 0xbe8bdb1b00000000, 0x5d8c549500000000,
+    0xc38cfe5900000000, 0x6f9ede8000000000, 0xf19e744c00000000,
+    0x1299fbc200000000, 0x8c99510e00000000, 0x9590940400000000,
+    0x0b903ec800000000, 0xe897b14600000000, 0x76971b8a00000000,
+    0xf1b4802f00000000, 0x6fb42ae300000000, 0x8cb3a56d00000000,
+    0x12b30fa100000000, 0x0bbacaab00000000, 0x95ba606700000000,
+    0x76bdefe900000000, 0xe8bd452500000000, 0x44af65fc00000000,
+    0xdaafcf3000000000, 0x39a840be00000000, 0xa7a8ea7200000000,
+    0xbea12f7800000000, 0x20a185b400000000, 0xc3a60a3a00000000,
+    0x5da6a0f600000000, 0x8ce74daa00000000, 0x12e7e76600000000,
+    0xf1e068e800000000, 0x6fe0c22400000000, 0x76e9072e00000000,
+    0xe8e9ade200000000, 0x0bee226c00000000, 0x95ee88a000000000,
+    0x39fca87900000000, 0xa7fc02b500000000, 0x44fb8d3b00000000,
+    0xdafb27f700000000, 0xc3f2e2fd00000000, 0x5df2483100000000,
+    0xbef5c7bf00000000, 0x20f56d7300000000, 0xa7d6f6d600000000,
+    0x39d65c1a00000000, 0xdad1d39400000000, 0x44d1795800000000,
+    0x5dd8bc5200000000, 0xc3d8169e00000000, 0x20df991000000000,
+    0xbedf33dc00000000, 0x12cd130500000000, 0x8ccdb9c900000000,
+    0x6fca364700000000, 0xf1ca9c8b00000000, 0xe8c3598100000000,
+    0x76c3f34d00000000, 0x95c47cc300000000, 0x0bc4d60f00000000,
+    0x3747a67a00000000, 0xa9470cb600000000, 0x4a40833800000000,
+    0xd44029f400000000, 0xcd49ecfe00000000, 0x5349463200000000,
+    0xb04ec9bc00000000, 0x2e4e637000000000, 0x825c43a900000000,
+    0x1c5ce96500000000, 0xff5b66eb00000000, 0x615bcc2700000000,
+    0x7852092d00000000, 0xe652a3e100000000, 0x05552c6f00000000,
+    0x9b5586a300000000, 0x1c761d0600000000, 0x8276b7ca00000000,
+    0x6171384400000000, 0xff71928800000000, 0xe678578200000000,
+    0x7878fd4e00000000, 0x9b7f72c000000000, 0x057fd80c00000000,
+    0xa96df8d500000000, 0x376d521900000000, 0xd46add9700000000,
+    0x4a6a775b00000000, 0x5363b25100000000, 0xcd63189d00000000,
+    0x2e64971300000000, 0xb0643ddf00000000, 0x6125d08300000000,
+    0xff257a4f00000000, 0x1c22f5c100000000, 0x82225f0d00000000,
+    0x9b2b9a0700000000, 0x052b30cb00000000, 0xe62cbf4500000000,
+    0x782c158900000000, 0xd43e355000000000, 0x4a3e9f9c00000000,
+    0xa939101200000000, 0x3739bade00000000, 0x2e307fd400000000,
+    0xb030d51800000000, 0x53375a9600000000, 0xcd37f05a00000000,
+    0x4a146bff00000000, 0xd414c13300000000, 0x37134ebd00000000,
+    0xa913e47100000000, 0xb01a217b00000000, 0x2e1a8bb700000000,
+    0xcd1d043900000000, 0x531daef500000000, 0xff0f8e2c00000000,
+    0x610f24e000000000, 0x8208ab6e00000000, 0x1c0801a200000000,
+    0x0501c4a800000000, 0x9b016e6400000000, 0x7806e1ea00000000,
+    0xe6064b2600000000}};
+
+#else /* W == 4 */
+
+static const uint32_t crc_braid_table[][256] = {
+   {0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, 0x8f629757,
+    0x37def032, 0x256b5fdc, 0x9dd738b9, 0xc5b428ef, 0x7d084f8a,
+    0x6fbde064, 0xd7018701, 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733,
+    0x58631056, 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871,
+    0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26, 0x95ad7f70,
+    0x2d111815, 0x3fa4b7fb, 0x8718d09e, 0x1acfe827, 0xa2738f42,
+    0xb0c620ac, 0x087a47c9, 0xa032af3e, 0x188ec85b, 0x0a3b67b5,
+    0xb28700d0, 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787,
+    0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f, 0xeae41086,
+    0x525877e3, 0x40edd80d, 0xf851bf68, 0xf02bf8a1, 0x48979fc4,
+    0x5a22302a, 0xe29e574f, 0x7f496ff6, 0xc7f50893, 0xd540a77d,
+    0x6dfcc018, 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0,
+    0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7, 0x9b14583d,
+    0x23a83f58, 0x311d90b6, 0x89a1f7d3, 0x1476cf6a, 0xaccaa80f,
+    0xbe7f07e1, 0x06c36084, 0x5ea070d2, 0xe61c17b7, 0xf4a9b859,
+    0x4c15df3c, 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b,
+    0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c, 0x446f98f5,
+    0xfcd3ff90, 0xee66507e, 0x56da371b, 0x0eb9274d, 0xb6054028,
+    0xa4b0efc6, 0x1c0c88a3, 0x81dbb01a, 0x3967d77f, 0x2bd27891,
+    0x936e1ff4, 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed,
+    0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba, 0xfe92dfec,
+    0x462eb889, 0x549b1767, 0xec277002, 0x71f048bb, 0xc94c2fde,
+    0xdbf98030, 0x6345e755, 0x6b3fa09c, 0xd383c7f9, 0xc1366817,
+    0x798a0f72, 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825,
+    0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d, 0x21e91f24,
+    0x99557841, 0x8be0d7af, 0x335cb0ca, 0xed59b63b, 0x55e5d15e,
+    0x47507eb0, 0xffec19d5, 0x623b216c, 0xda874609, 0xc832e9e7,
+    0x708e8e82, 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a,
+    0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d, 0xbd40e1a4,
+    0x05fc86c1, 0x1749292f, 0xaff54e4a, 0x322276f3, 0x8a9e1196,
+    0x982bbe78, 0x2097d91d, 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0,
+    0x6a4166a5, 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2,
+    0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb, 0xc2098e52,
+    0x7ab5e937, 0x680046d9, 0xd0bc21bc, 0x88df31ea, 0x3063568f,
+    0x22d6f961, 0x9a6a9e04, 0x07bda6bd, 0xbf01c1d8, 0xadb46e36,
+    0x15080953, 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174,
+    0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623, 0xd8c66675,
+    0x607a0110, 0x72cfaefe, 0xca73c99b, 0x57a4f122, 0xef189647,
+    0xfdad39a9, 0x45115ecc, 0x764dee06, 0xcef18963, 0xdc44268d,
+    0x64f841e8, 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf,
+    0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907, 0x3c9b51be,
+    0x842736db, 0x96929935, 0x2e2efe50, 0x2654b999, 0x9ee8defc,
+    0x8c5d7112, 0x34e11677, 0xa9362ece, 0x118a49ab, 0x033fe645,
+    0xbb838120, 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98,
+    0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf, 0xd67f4138,
+    0x6ec3265d, 0x7c7689b3, 0xc4caeed6, 0x591dd66f, 0xe1a1b10a,
+    0xf3141ee4, 0x4ba87981, 0x13cb69d7, 0xab770eb2, 0xb9c2a15c,
+    0x017ec639, 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e,
+    0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949, 0x090481f0,
+    0xb1b8e695, 0xa30d497b, 0x1bb12e1e, 0x43d23e48, 0xfb6e592d,
+    0xe9dbf6c3, 0x516791a6, 0xccb0a91f, 0x740cce7a, 0x66b96194,
+    0xde0506f1},
+   {0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59, 0x0709a8dc,
+    0x06cbc2eb, 0x048d7cb2, 0x054f1685, 0x0e1351b8, 0x0fd13b8f,
+    0x0d9785d6, 0x0c55efe1, 0x091af964, 0x08d89353, 0x0a9e2d0a,
+    0x0b5c473d, 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29,
+    0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5, 0x1235f2c8,
+    0x13f798ff, 0x11b126a6, 0x10734c91, 0x153c5a14, 0x14fe3023,
+    0x16b88e7a, 0x177ae44d, 0x384d46e0, 0x398f2cd7, 0x3bc9928e,
+    0x3a0bf8b9, 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065,
+    0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901, 0x3157bf84,
+    0x3095d5b3, 0x32d36bea, 0x331101dd, 0x246be590, 0x25a98fa7,
+    0x27ef31fe, 0x262d5bc9, 0x23624d4c, 0x22a0277b, 0x20e69922,
+    0x2124f315, 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71,
+    0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad, 0x709a8dc0,
+    0x7158e7f7, 0x731e59ae, 0x72dc3399, 0x7793251c, 0x76514f2b,
+    0x7417f172, 0x75d59b45, 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816,
+    0x7ccf6221, 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd,
+    0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9, 0x6bb5866c,
+    0x6a77ec5b, 0x68315202, 0x69f33835, 0x62af7f08, 0x636d153f,
+    0x612bab66, 0x60e9c151, 0x65a6d7d4, 0x6464bde3, 0x662203ba,
+    0x67e0698d, 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579,
+    0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5, 0x46c49a98,
+    0x4706f0af, 0x45404ef6, 0x448224c1, 0x41cd3244, 0x400f5873,
+    0x4249e62a, 0x438b8c1d, 0x54f16850, 0x55330267, 0x5775bc3e,
+    0x56b7d609, 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5,
+    0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1, 0x5deb9134,
+    0x5c29fb03, 0x5e6f455a, 0x5fad2f6d, 0xe1351b80, 0xe0f771b7,
+    0xe2b1cfee, 0xe373a5d9, 0xe63cb35c, 0xe7fed96b, 0xe5b86732,
+    0xe47a0d05, 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461,
+    0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd, 0xfd13b8f0,
+    0xfcd1d2c7, 0xfe976c9e, 0xff5506a9, 0xfa1a102c, 0xfbd87a1b,
+    0xf99ec442, 0xf85cae75, 0xf300e948, 0xf2c2837f, 0xf0843d26,
+    0xf1465711, 0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd,
+    0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339, 0xde71f5bc,
+    0xdfb39f8b, 0xddf521d2, 0xdc374be5, 0xd76b0cd8, 0xd6a966ef,
+    0xd4efd8b6, 0xd52db281, 0xd062a404, 0xd1a0ce33, 0xd3e6706a,
+    0xd2241a5d, 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049,
+    0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895, 0xcb4dafa8,
+    0xca8fc59f, 0xc8c97bc6, 0xc90b11f1, 0xcc440774, 0xcd866d43,
+    0xcfc0d31a, 0xce02b92d, 0x91af9640, 0x906dfc77, 0x922b422e,
+    0x93e92819, 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5,
+    0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1, 0x98b56f24,
+    0x99770513, 0x9b31bb4a, 0x9af3d17d, 0x8d893530, 0x8c4b5f07,
+    0x8e0de15e, 0x8fcf8b69, 0x8a809dec, 0x8b42f7db, 0x89044982,
+    0x88c623b5, 0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1,
+    0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d, 0xa9e2d0a0,
+    0xa820ba97, 0xaa6604ce, 0xaba46ef9, 0xaeeb787c, 0xaf29124b,
+    0xad6fac12, 0xacadc625, 0xa7f18118, 0xa633eb2f, 0xa4755576,
+    0xa5b73f41, 0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d,
+    0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89, 0xb2cddb0c,
+    0xb30fb13b, 0xb1490f62, 0xb08b6555, 0xbbd72268, 0xba15485f,
+    0xb853f606, 0xb9919c31, 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda,
+    0xbe9834ed},
+   {0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3, 0x646cc504,
+    0x7d77f445, 0x565aa786, 0x4f4196c7, 0xc8d98a08, 0xd1c2bb49,
+    0xfaefe88a, 0xe3f4d9cb, 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e,
+    0x87981ccf, 0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192,
+    0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496, 0x821b9859,
+    0x9b00a918, 0xb02dfadb, 0xa936cb9a, 0xe6775d5d, 0xff6c6c1c,
+    0xd4413fdf, 0xcd5a0e9e, 0x958424a2, 0x8c9f15e3, 0xa7b24620,
+    0xbea97761, 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265,
+    0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69, 0x39316bae,
+    0x202a5aef, 0x0b07092c, 0x121c386d, 0xdf4636f3, 0xc65d07b2,
+    0xed705471, 0xf46b6530, 0xbb2af3f7, 0xa231c2b6, 0x891c9175,
+    0x9007a034, 0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38,
+    0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c, 0xf0794f05,
+    0xe9627e44, 0xc24f2d87, 0xdb541cc6, 0x94158a01, 0x8d0ebb40,
+    0xa623e883, 0xbf38d9c2, 0x38a0c50d, 0x21bbf44c, 0x0a96a78f,
+    0x138d96ce, 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca,
+    0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97, 0xded79850,
+    0xc7cca911, 0xece1fad2, 0xf5facb93, 0x7262d75c, 0x6b79e61d,
+    0x4054b5de, 0x594f849f, 0x160e1258, 0x0f152319, 0x243870da,
+    0x3d23419b, 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864,
+    0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60, 0xad24e1af,
+    0xb43fd0ee, 0x9f12832d, 0x8609b26c, 0xc94824ab, 0xd05315ea,
+    0xfb7e4629, 0xe2657768, 0x2f3f79f6, 0x362448b7, 0x1d091b74,
+    0x04122a35, 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31,
+    0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d, 0x838a36fa,
+    0x9a9107bb, 0xb1bc5478, 0xa8a76539, 0x3b83984b, 0x2298a90a,
+    0x09b5fac9, 0x10aecb88, 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd,
+    0x74c20e8c, 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180,
+    0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484, 0x71418a1a,
+    0x685abb5b, 0x4377e898, 0x5a6cd9d9, 0x152d4f1e, 0x0c367e5f,
+    0x271b2d9c, 0x3e001cdd, 0xb9980012, 0xa0833153, 0x8bae6290,
+    0x92b553d1, 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5,
+    0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a, 0xca6b79ed,
+    0xd37048ac, 0xf85d1b6f, 0xe1462a2e, 0x66de36e1, 0x7fc507a0,
+    0x54e85463, 0x4df36522, 0x02b2f3e5, 0x1ba9c2a4, 0x30849167,
+    0x299fa026, 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b,
+    0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f, 0x2c1c24b0,
+    0x350715f1, 0x1e2a4632, 0x07317773, 0x4870e1b4, 0x516bd0f5,
+    0x7a468336, 0x635db277, 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc,
+    0xe0d7848d, 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189,
+    0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85, 0x674f9842,
+    0x7e54a903, 0x5579fac0, 0x4c62cb81, 0x8138c51f, 0x9823f45e,
+    0xb30ea79d, 0xaa1596dc, 0xe554001b, 0xfc4f315a, 0xd7626299,
+    0xce7953d8, 0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4,
+    0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0, 0x5e7ef3ec,
+    0x4765c2ad, 0x6c48916e, 0x7553a02f, 0x3a1236e8, 0x230907a9,
+    0x0824546a, 0x113f652b, 0x96a779e4, 0x8fbc48a5, 0xa4911b66,
+    0xbd8a2a27, 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23,
+    0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e, 0x70d024b9,
+    0x69cb15f8, 0x42e6463b, 0x5bfd777a, 0xdc656bb5, 0xc57e5af4,
+    0xee530937, 0xf7483876, 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33,
+    0x9324fd72},
+   {0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419,
+    0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4,
+    0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07,
+    0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+    0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856,
+    0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
+    0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4,
+    0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+    0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3,
+    0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a,
+    0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599,
+    0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+    0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190,
+    0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
+    0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e,
+    0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+    0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed,
+    0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+    0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3,
+    0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+    0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a,
+    0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
+    0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010,
+    0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+    0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17,
+    0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6,
+    0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615,
+    0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+    0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344,
+    0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+    0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a,
+    0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+    0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1,
+    0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c,
+    0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef,
+    0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+    0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe,
+    0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
+    0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c,
+    0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+    0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b,
+    0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+    0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1,
+    0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+    0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278,
+    0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
+    0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66,
+    0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+    0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605,
+    0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8,
+    0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b,
+    0x2d02ef8d}};
+
+static const z_word_t crc_braid_big_table[][256] = {
+   {0x00000000, 0x96300777, 0x2c610eee, 0xba510999, 0x19c46d07,
+    0x8ff46a70, 0x35a563e9, 0xa395649e, 0x3288db0e, 0xa4b8dc79,
+    0x1ee9d5e0, 0x88d9d297, 0x2b4cb609, 0xbd7cb17e, 0x072db8e7,
+    0x911dbf90, 0x6410b71d, 0xf220b06a, 0x4871b9f3, 0xde41be84,
+    0x7dd4da1a, 0xebe4dd6d, 0x51b5d4f4, 0xc785d383, 0x56986c13,
+    0xc0a86b64, 0x7af962fd, 0xecc9658a, 0x4f5c0114, 0xd96c0663,
+    0x633d0ffa, 0xf50d088d, 0xc8206e3b, 0x5e10694c, 0xe44160d5,
+    0x727167a2, 0xd1e4033c, 0x47d4044b, 0xfd850dd2, 0x6bb50aa5,
+    0xfaa8b535, 0x6c98b242, 0xd6c9bbdb, 0x40f9bcac, 0xe36cd832,
+    0x755cdf45, 0xcf0dd6dc, 0x593dd1ab, 0xac30d926, 0x3a00de51,
+    0x8051d7c8, 0x1661d0bf, 0xb5f4b421, 0x23c4b356, 0x9995bacf,
+    0x0fa5bdb8, 0x9eb80228, 0x0888055f, 0xb2d90cc6, 0x24e90bb1,
+    0x877c6f2f, 0x114c6858, 0xab1d61c1, 0x3d2d66b6, 0x9041dc76,
+    0x0671db01, 0xbc20d298, 0x2a10d5ef, 0x8985b171, 0x1fb5b606,
+    0xa5e4bf9f, 0x33d4b8e8, 0xa2c90778, 0x34f9000f, 0x8ea80996,
+    0x18980ee1, 0xbb0d6a7f, 0x2d3d6d08, 0x976c6491, 0x015c63e6,
+    0xf4516b6b, 0x62616c1c, 0xd8306585, 0x4e0062f2, 0xed95066c,
+    0x7ba5011b, 0xc1f40882, 0x57c40ff5, 0xc6d9b065, 0x50e9b712,
+    0xeab8be8b, 0x7c88b9fc, 0xdf1ddd62, 0x492dda15, 0xf37cd38c,
+    0x654cd4fb, 0x5861b24d, 0xce51b53a, 0x7400bca3, 0xe230bbd4,
+    0x41a5df4a, 0xd795d83d, 0x6dc4d1a4, 0xfbf4d6d3, 0x6ae96943,
+    0xfcd96e34, 0x468867ad, 0xd0b860da, 0x732d0444, 0xe51d0333,
+    0x5f4c0aaa, 0xc97c0ddd, 0x3c710550, 0xaa410227, 0x10100bbe,
+    0x86200cc9, 0x25b56857, 0xb3856f20, 0x09d466b9, 0x9fe461ce,
+    0x0ef9de5e, 0x98c9d929, 0x2298d0b0, 0xb4a8d7c7, 0x173db359,
+    0x810db42e, 0x3b5cbdb7, 0xad6cbac0, 0x2083b8ed, 0xb6b3bf9a,
+    0x0ce2b603, 0x9ad2b174, 0x3947d5ea, 0xaf77d29d, 0x1526db04,
+    0x8316dc73, 0x120b63e3, 0x843b6494, 0x3e6a6d0d, 0xa85a6a7a,
+    0x0bcf0ee4, 0x9dff0993, 0x27ae000a, 0xb19e077d, 0x44930ff0,
+    0xd2a30887, 0x68f2011e, 0xfec20669, 0x5d5762f7, 0xcb676580,
+    0x71366c19, 0xe7066b6e, 0x761bd4fe, 0xe02bd389, 0x5a7ada10,
+    0xcc4add67, 0x6fdfb9f9, 0xf9efbe8e, 0x43beb717, 0xd58eb060,
+    0xe8a3d6d6, 0x7e93d1a1, 0xc4c2d838, 0x52f2df4f, 0xf167bbd1,
+    0x6757bca6, 0xdd06b53f, 0x4b36b248, 0xda2b0dd8, 0x4c1b0aaf,
+    0xf64a0336, 0x607a0441, 0xc3ef60df, 0x55df67a8, 0xef8e6e31,
+    0x79be6946, 0x8cb361cb, 0x1a8366bc, 0xa0d26f25, 0x36e26852,
+    0x95770ccc, 0x03470bbb, 0xb9160222, 0x2f260555, 0xbe3bbac5,
+    0x280bbdb2, 0x925ab42b, 0x046ab35c, 0xa7ffd7c2, 0x31cfd0b5,
+    0x8b9ed92c, 0x1daede5b, 0xb0c2649b, 0x26f263ec, 0x9ca36a75,
+    0x0a936d02, 0xa906099c, 0x3f360eeb, 0x85670772, 0x13570005,
+    0x824abf95, 0x147ab8e2, 0xae2bb17b, 0x381bb60c, 0x9b8ed292,
+    0x0dbed5e5, 0xb7efdc7c, 0x21dfdb0b, 0xd4d2d386, 0x42e2d4f1,
+    0xf8b3dd68, 0x6e83da1f, 0xcd16be81, 0x5b26b9f6, 0xe177b06f,
+    0x7747b718, 0xe65a0888, 0x706a0fff, 0xca3b0666, 0x5c0b0111,
+    0xff9e658f, 0x69ae62f8, 0xd3ff6b61, 0x45cf6c16, 0x78e20aa0,
+    0xeed20dd7, 0x5483044e, 0xc2b30339, 0x612667a7, 0xf71660d0,
+    0x4d476949, 0xdb776e3e, 0x4a6ad1ae, 0xdc5ad6d9, 0x660bdf40,
+    0xf03bd837, 0x53aebca9, 0xc59ebbde, 0x7fcfb247, 0xe9ffb530,
+    0x1cf2bdbd, 0x8ac2baca, 0x3093b353, 0xa6a3b424, 0x0536d0ba,
+    0x9306d7cd, 0x2957de54, 0xbf67d923, 0x2e7a66b3, 0xb84a61c4,
+    0x021b685d, 0x942b6f2a, 0x37be0bb4, 0xa18e0cc3, 0x1bdf055a,
+    0x8def022d},
+   {0x00000000, 0x41311b19, 0x82623632, 0xc3532d2b, 0x04c56c64,
+    0x45f4777d, 0x86a75a56, 0xc796414f, 0x088ad9c8, 0x49bbc2d1,
+    0x8ae8effa, 0xcbd9f4e3, 0x0c4fb5ac, 0x4d7eaeb5, 0x8e2d839e,
+    0xcf1c9887, 0x5112c24a, 0x1023d953, 0xd370f478, 0x9241ef61,
+    0x55d7ae2e, 0x14e6b537, 0xd7b5981c, 0x96848305, 0x59981b82,
+    0x18a9009b, 0xdbfa2db0, 0x9acb36a9, 0x5d5d77e6, 0x1c6c6cff,
+    0xdf3f41d4, 0x9e0e5acd, 0xa2248495, 0xe3159f8c, 0x2046b2a7,
+    0x6177a9be, 0xa6e1e8f1, 0xe7d0f3e8, 0x2483dec3, 0x65b2c5da,
+    0xaaae5d5d, 0xeb9f4644, 0x28cc6b6f, 0x69fd7076, 0xae6b3139,
+    0xef5a2a20, 0x2c09070b, 0x6d381c12, 0xf33646df, 0xb2075dc6,
+    0x715470ed, 0x30656bf4, 0xf7f32abb, 0xb6c231a2, 0x75911c89,
+    0x34a00790, 0xfbbc9f17, 0xba8d840e, 0x79dea925, 0x38efb23c,
+    0xff79f373, 0xbe48e86a, 0x7d1bc541, 0x3c2ade58, 0x054f79f0,
+    0x447e62e9, 0x872d4fc2, 0xc61c54db, 0x018a1594, 0x40bb0e8d,
+    0x83e823a6, 0xc2d938bf, 0x0dc5a038, 0x4cf4bb21, 0x8fa7960a,
+    0xce968d13, 0x0900cc5c, 0x4831d745, 0x8b62fa6e, 0xca53e177,
+    0x545dbbba, 0x156ca0a3, 0xd63f8d88, 0x970e9691, 0x5098d7de,
+    0x11a9ccc7, 0xd2fae1ec, 0x93cbfaf5, 0x5cd76272, 0x1de6796b,
+    0xdeb55440, 0x9f844f59, 0x58120e16, 0x1923150f, 0xda703824,
+    0x9b41233d, 0xa76bfd65, 0xe65ae67c, 0x2509cb57, 0x6438d04e,
+    0xa3ae9101, 0xe29f8a18, 0x21cca733, 0x60fdbc2a, 0xafe124ad,
+    0xeed03fb4, 0x2d83129f, 0x6cb20986, 0xab2448c9, 0xea1553d0,
+    0x29467efb, 0x687765e2, 0xf6793f2f, 0xb7482436, 0x741b091d,
+    0x352a1204, 0xf2bc534b, 0xb38d4852, 0x70de6579, 0x31ef7e60,
+    0xfef3e6e7, 0xbfc2fdfe, 0x7c91d0d5, 0x3da0cbcc, 0xfa368a83,
+    0xbb07919a, 0x7854bcb1, 0x3965a7a8, 0x4b98833b, 0x0aa99822,
+    0xc9fab509, 0x88cbae10, 0x4f5def5f, 0x0e6cf446, 0xcd3fd96d,
+    0x8c0ec274, 0x43125af3, 0x022341ea, 0xc1706cc1, 0x804177d8,
+    0x47d73697, 0x06e62d8e, 0xc5b500a5, 0x84841bbc, 0x1a8a4171,
+    0x5bbb5a68, 0x98e87743, 0xd9d96c5a, 0x1e4f2d15, 0x5f7e360c,
+    0x9c2d1b27, 0xdd1c003e, 0x120098b9, 0x533183a0, 0x9062ae8b,
+    0xd153b592, 0x16c5f4dd, 0x57f4efc4, 0x94a7c2ef, 0xd596d9f6,
+    0xe9bc07ae, 0xa88d1cb7, 0x6bde319c, 0x2aef2a85, 0xed796bca,
+    0xac4870d3, 0x6f1b5df8, 0x2e2a46e1, 0xe136de66, 0xa007c57f,
+    0x6354e854, 0x2265f34d, 0xe5f3b202, 0xa4c2a91b, 0x67918430,
+    0x26a09f29, 0xb8aec5e4, 0xf99fdefd, 0x3accf3d6, 0x7bfde8cf,
+    0xbc6ba980, 0xfd5ab299, 0x3e099fb2, 0x7f3884ab, 0xb0241c2c,
+    0xf1150735, 0x32462a1e, 0x73773107, 0xb4e17048, 0xf5d06b51,
+    0x3683467a, 0x77b25d63, 0x4ed7facb, 0x0fe6e1d2, 0xccb5ccf9,
+    0x8d84d7e0, 0x4a1296af, 0x0b238db6, 0xc870a09d, 0x8941bb84,
+    0x465d2303, 0x076c381a, 0xc43f1531, 0x850e0e28, 0x42984f67,
+    0x03a9547e, 0xc0fa7955, 0x81cb624c, 0x1fc53881, 0x5ef42398,
+    0x9da70eb3, 0xdc9615aa, 0x1b0054e5, 0x5a314ffc, 0x996262d7,
+    0xd85379ce, 0x174fe149, 0x567efa50, 0x952dd77b, 0xd41ccc62,
+    0x138a8d2d, 0x52bb9634, 0x91e8bb1f, 0xd0d9a006, 0xecf37e5e,
+    0xadc26547, 0x6e91486c, 0x2fa05375, 0xe836123a, 0xa9070923,
+    0x6a542408, 0x2b653f11, 0xe479a796, 0xa548bc8f, 0x661b91a4,
+    0x272a8abd, 0xe0bccbf2, 0xa18dd0eb, 0x62defdc0, 0x23efe6d9,
+    0xbde1bc14, 0xfcd0a70d, 0x3f838a26, 0x7eb2913f, 0xb924d070,
+    0xf815cb69, 0x3b46e642, 0x7a77fd5b, 0xb56b65dc, 0xf45a7ec5,
+    0x370953ee, 0x763848f7, 0xb1ae09b8, 0xf09f12a1, 0x33cc3f8a,
+    0x72fd2493},
+   {0x00000000, 0x376ac201, 0x6ed48403, 0x59be4602, 0xdca80907,
+    0xebc2cb06, 0xb27c8d04, 0x85164f05, 0xb851130e, 0x8f3bd10f,
+    0xd685970d, 0xe1ef550c, 0x64f91a09, 0x5393d808, 0x0a2d9e0a,
+    0x3d475c0b, 0x70a3261c, 0x47c9e41d, 0x1e77a21f, 0x291d601e,
+    0xac0b2f1b, 0x9b61ed1a, 0xc2dfab18, 0xf5b56919, 0xc8f23512,
+    0xff98f713, 0xa626b111, 0x914c7310, 0x145a3c15, 0x2330fe14,
+    0x7a8eb816, 0x4de47a17, 0xe0464d38, 0xd72c8f39, 0x8e92c93b,
+    0xb9f80b3a, 0x3cee443f, 0x0b84863e, 0x523ac03c, 0x6550023d,
+    0x58175e36, 0x6f7d9c37, 0x36c3da35, 0x01a91834, 0x84bf5731,
+    0xb3d59530, 0xea6bd332, 0xdd011133, 0x90e56b24, 0xa78fa925,
+    0xfe31ef27, 0xc95b2d26, 0x4c4d6223, 0x7b27a022, 0x2299e620,
+    0x15f32421, 0x28b4782a, 0x1fdeba2b, 0x4660fc29, 0x710a3e28,
+    0xf41c712d, 0xc376b32c, 0x9ac8f52e, 0xada2372f, 0xc08d9a70,
+    0xf7e75871, 0xae591e73, 0x9933dc72, 0x1c259377, 0x2b4f5176,
+    0x72f11774, 0x459bd575, 0x78dc897e, 0x4fb64b7f, 0x16080d7d,
+    0x2162cf7c, 0xa4748079, 0x931e4278, 0xcaa0047a, 0xfdcac67b,
+    0xb02ebc6c, 0x87447e6d, 0xdefa386f, 0xe990fa6e, 0x6c86b56b,
+    0x5bec776a, 0x02523168, 0x3538f369, 0x087faf62, 0x3f156d63,
+    0x66ab2b61, 0x51c1e960, 0xd4d7a665, 0xe3bd6464, 0xba032266,
+    0x8d69e067, 0x20cbd748, 0x17a11549, 0x4e1f534b, 0x7975914a,
+    0xfc63de4f, 0xcb091c4e, 0x92b75a4c, 0xa5dd984d, 0x989ac446,
+    0xaff00647, 0xf64e4045, 0xc1248244, 0x4432cd41, 0x73580f40,
+    0x2ae64942, 0x1d8c8b43, 0x5068f154, 0x67023355, 0x3ebc7557,
+    0x09d6b756, 0x8cc0f853, 0xbbaa3a52, 0xe2147c50, 0xd57ebe51,
+    0xe839e25a, 0xdf53205b, 0x86ed6659, 0xb187a458, 0x3491eb5d,
+    0x03fb295c, 0x5a456f5e, 0x6d2fad5f, 0x801b35e1, 0xb771f7e0,
+    0xeecfb1e2, 0xd9a573e3, 0x5cb33ce6, 0x6bd9fee7, 0x3267b8e5,
+    0x050d7ae4, 0x384a26ef, 0x0f20e4ee, 0x569ea2ec, 0x61f460ed,
+    0xe4e22fe8, 0xd388ede9, 0x8a36abeb, 0xbd5c69ea, 0xf0b813fd,
+    0xc7d2d1fc, 0x9e6c97fe, 0xa90655ff, 0x2c101afa, 0x1b7ad8fb,
+    0x42c49ef9, 0x75ae5cf8, 0x48e900f3, 0x7f83c2f2, 0x263d84f0,
+    0x115746f1, 0x944109f4, 0xa32bcbf5, 0xfa958df7, 0xcdff4ff6,
+    0x605d78d9, 0x5737bad8, 0x0e89fcda, 0x39e33edb, 0xbcf571de,
+    0x8b9fb3df, 0xd221f5dd, 0xe54b37dc, 0xd80c6bd7, 0xef66a9d6,
+    0xb6d8efd4, 0x81b22dd5, 0x04a462d0, 0x33cea0d1, 0x6a70e6d3,
+    0x5d1a24d2, 0x10fe5ec5, 0x27949cc4, 0x7e2adac6, 0x494018c7,
+    0xcc5657c2, 0xfb3c95c3, 0xa282d3c1, 0x95e811c0, 0xa8af4dcb,
+    0x9fc58fca, 0xc67bc9c8, 0xf1110bc9, 0x740744cc, 0x436d86cd,
+    0x1ad3c0cf, 0x2db902ce, 0x4096af91, 0x77fc6d90, 0x2e422b92,
+    0x1928e993, 0x9c3ea696, 0xab546497, 0xf2ea2295, 0xc580e094,
+    0xf8c7bc9f, 0xcfad7e9e, 0x9613389c, 0xa179fa9d, 0x246fb598,
+    0x13057799, 0x4abb319b, 0x7dd1f39a, 0x3035898d, 0x075f4b8c,
+    0x5ee10d8e, 0x698bcf8f, 0xec9d808a, 0xdbf7428b, 0x82490489,
+    0xb523c688, 0x88649a83, 0xbf0e5882, 0xe6b01e80, 0xd1dadc81,
+    0x54cc9384, 0x63a65185, 0x3a181787, 0x0d72d586, 0xa0d0e2a9,
+    0x97ba20a8, 0xce0466aa, 0xf96ea4ab, 0x7c78ebae, 0x4b1229af,
+    0x12ac6fad, 0x25c6adac, 0x1881f1a7, 0x2feb33a6, 0x765575a4,
+    0x413fb7a5, 0xc429f8a0, 0xf3433aa1, 0xaafd7ca3, 0x9d97bea2,
+    0xd073c4b5, 0xe71906b4, 0xbea740b6, 0x89cd82b7, 0x0cdbcdb2,
+    0x3bb10fb3, 0x620f49b1, 0x55658bb0, 0x6822d7bb, 0x5f4815ba,
+    0x06f653b8, 0x319c91b9, 0xb48adebc, 0x83e01cbd, 0xda5e5abf,
+    0xed3498be},
+   {0x00000000, 0x6567bcb8, 0x8bc809aa, 0xeeafb512, 0x5797628f,
+    0x32f0de37, 0xdc5f6b25, 0xb938d79d, 0xef28b4c5, 0x8a4f087d,
+    0x64e0bd6f, 0x018701d7, 0xb8bfd64a, 0xddd86af2, 0x3377dfe0,
+    0x56106358, 0x9f571950, 0xfa30a5e8, 0x149f10fa, 0x71f8ac42,
+    0xc8c07bdf, 0xada7c767, 0x43087275, 0x266fcecd, 0x707fad95,
+    0x1518112d, 0xfbb7a43f, 0x9ed01887, 0x27e8cf1a, 0x428f73a2,
+    0xac20c6b0, 0xc9477a08, 0x3eaf32a0, 0x5bc88e18, 0xb5673b0a,
+    0xd00087b2, 0x6938502f, 0x0c5fec97, 0xe2f05985, 0x8797e53d,
+    0xd1878665, 0xb4e03add, 0x5a4f8fcf, 0x3f283377, 0x8610e4ea,
+    0xe3775852, 0x0dd8ed40, 0x68bf51f8, 0xa1f82bf0, 0xc49f9748,
+    0x2a30225a, 0x4f579ee2, 0xf66f497f, 0x9308f5c7, 0x7da740d5,
+    0x18c0fc6d, 0x4ed09f35, 0x2bb7238d, 0xc518969f, 0xa07f2a27,
+    0x1947fdba, 0x7c204102, 0x928ff410, 0xf7e848a8, 0x3d58149b,
+    0x583fa823, 0xb6901d31, 0xd3f7a189, 0x6acf7614, 0x0fa8caac,
+    0xe1077fbe, 0x8460c306, 0xd270a05e, 0xb7171ce6, 0x59b8a9f4,
+    0x3cdf154c, 0x85e7c2d1, 0xe0807e69, 0x0e2fcb7b, 0x6b4877c3,
+    0xa20f0dcb, 0xc768b173, 0x29c70461, 0x4ca0b8d9, 0xf5986f44,
+    0x90ffd3fc, 0x7e5066ee, 0x1b37da56, 0x4d27b90e, 0x284005b6,
+    0xc6efb0a4, 0xa3880c1c, 0x1ab0db81, 0x7fd76739, 0x9178d22b,
+    0xf41f6e93, 0x03f7263b, 0x66909a83, 0x883f2f91, 0xed589329,
+    0x546044b4, 0x3107f80c, 0xdfa84d1e, 0xbacff1a6, 0xecdf92fe,
+    0x89b82e46, 0x67179b54, 0x027027ec, 0xbb48f071, 0xde2f4cc9,
+    0x3080f9db, 0x55e74563, 0x9ca03f6b, 0xf9c783d3, 0x176836c1,
+    0x720f8a79, 0xcb375de4, 0xae50e15c, 0x40ff544e, 0x2598e8f6,
+    0x73888bae, 0x16ef3716, 0xf8408204, 0x9d273ebc, 0x241fe921,
+    0x41785599, 0xafd7e08b, 0xcab05c33, 0x3bb659ed, 0x5ed1e555,
+    0xb07e5047, 0xd519ecff, 0x6c213b62, 0x094687da, 0xe7e932c8,
+    0x828e8e70, 0xd49eed28, 0xb1f95190, 0x5f56e482, 0x3a31583a,
+    0x83098fa7, 0xe66e331f, 0x08c1860d, 0x6da63ab5, 0xa4e140bd,
+    0xc186fc05, 0x2f294917, 0x4a4ef5af, 0xf3762232, 0x96119e8a,
+    0x78be2b98, 0x1dd99720, 0x4bc9f478, 0x2eae48c0, 0xc001fdd2,
+    0xa566416a, 0x1c5e96f7, 0x79392a4f, 0x97969f5d, 0xf2f123e5,
+    0x05196b4d, 0x607ed7f5, 0x8ed162e7, 0xebb6de5f, 0x528e09c2,
+    0x37e9b57a, 0xd9460068, 0xbc21bcd0, 0xea31df88, 0x8f566330,
+    0x61f9d622, 0x049e6a9a, 0xbda6bd07, 0xd8c101bf, 0x366eb4ad,
+    0x53090815, 0x9a4e721d, 0xff29cea5, 0x11867bb7, 0x74e1c70f,
+    0xcdd91092, 0xa8beac2a, 0x46111938, 0x2376a580, 0x7566c6d8,
+    0x10017a60, 0xfeaecf72, 0x9bc973ca, 0x22f1a457, 0x479618ef,
+    0xa939adfd, 0xcc5e1145, 0x06ee4d76, 0x6389f1ce, 0x8d2644dc,
+    0xe841f864, 0x51792ff9, 0x341e9341, 0xdab12653, 0xbfd69aeb,
+    0xe9c6f9b3, 0x8ca1450b, 0x620ef019, 0x07694ca1, 0xbe519b3c,
+    0xdb362784, 0x35999296, 0x50fe2e2e, 0x99b95426, 0xfcdee89e,
+    0x12715d8c, 0x7716e134, 0xce2e36a9, 0xab498a11, 0x45e63f03,
+    0x208183bb, 0x7691e0e3, 0x13f65c5b, 0xfd59e949, 0x983e55f1,
+    0x2106826c, 0x44613ed4, 0xaace8bc6, 0xcfa9377e, 0x38417fd6,
+    0x5d26c36e, 0xb389767c, 0xd6eecac4, 0x6fd61d59, 0x0ab1a1e1,
+    0xe41e14f3, 0x8179a84b, 0xd769cb13, 0xb20e77ab, 0x5ca1c2b9,
+    0x39c67e01, 0x80fea99c, 0xe5991524, 0x0b36a036, 0x6e511c8e,
+    0xa7166686, 0xc271da3e, 0x2cde6f2c, 0x49b9d394, 0xf0810409,
+    0x95e6b8b1, 0x7b490da3, 0x1e2eb11b, 0x483ed243, 0x2d596efb,
+    0xc3f6dbe9, 0xa6916751, 0x1fa9b0cc, 0x7ace0c74, 0x9461b966,
+    0xf10605de}};
+
+#endif /* W */
+
+#endif /* N == 1 */
+#if N == 2
+
+#if W == 8
+
+static const uint32_t crc_braid_table[][256] = {
+   {0x00000000, 0xae689191, 0x87a02563, 0x29c8b4f2, 0xd4314c87,
+    0x7a59dd16, 0x539169e4, 0xfdf9f875, 0x73139f4f, 0xdd7b0ede,
+    0xf4b3ba2c, 0x5adb2bbd, 0xa722d3c8, 0x094a4259, 0x2082f6ab,
+    0x8eea673a, 0xe6273e9e, 0x484faf0f, 0x61871bfd, 0xcfef8a6c,
+    0x32167219, 0x9c7ee388, 0xb5b6577a, 0x1bdec6eb, 0x9534a1d1,
+    0x3b5c3040, 0x129484b2, 0xbcfc1523, 0x4105ed56, 0xef6d7cc7,
+    0xc6a5c835, 0x68cd59a4, 0x173f7b7d, 0xb957eaec, 0x909f5e1e,
+    0x3ef7cf8f, 0xc30e37fa, 0x6d66a66b, 0x44ae1299, 0xeac68308,
+    0x642ce432, 0xca4475a3, 0xe38cc151, 0x4de450c0, 0xb01da8b5,
+    0x1e753924, 0x37bd8dd6, 0x99d51c47, 0xf11845e3, 0x5f70d472,
+    0x76b86080, 0xd8d0f111, 0x25290964, 0x8b4198f5, 0xa2892c07,
+    0x0ce1bd96, 0x820bdaac, 0x2c634b3d, 0x05abffcf, 0xabc36e5e,
+    0x563a962b, 0xf85207ba, 0xd19ab348, 0x7ff222d9, 0x2e7ef6fa,
+    0x8016676b, 0xa9ded399, 0x07b64208, 0xfa4fba7d, 0x54272bec,
+    0x7def9f1e, 0xd3870e8f, 0x5d6d69b5, 0xf305f824, 0xdacd4cd6,
+    0x74a5dd47, 0x895c2532, 0x2734b4a3, 0x0efc0051, 0xa09491c0,
+    0xc859c864, 0x663159f5, 0x4ff9ed07, 0xe1917c96, 0x1c6884e3,
+    0xb2001572, 0x9bc8a180, 0x35a03011, 0xbb4a572b, 0x1522c6ba,
+    0x3cea7248, 0x9282e3d9, 0x6f7b1bac, 0xc1138a3d, 0xe8db3ecf,
+    0x46b3af5e, 0x39418d87, 0x97291c16, 0xbee1a8e4, 0x10893975,
+    0xed70c100, 0x43185091, 0x6ad0e463, 0xc4b875f2, 0x4a5212c8,
+    0xe43a8359, 0xcdf237ab, 0x639aa63a, 0x9e635e4f, 0x300bcfde,
+    0x19c37b2c, 0xb7abeabd, 0xdf66b319, 0x710e2288, 0x58c6967a,
+    0xf6ae07eb, 0x0b57ff9e, 0xa53f6e0f, 0x8cf7dafd, 0x229f4b6c,
+    0xac752c56, 0x021dbdc7, 0x2bd50935, 0x85bd98a4, 0x784460d1,
+    0xd62cf140, 0xffe445b2, 0x518cd423, 0x5cfdedf4, 0xf2957c65,
+    0xdb5dc897, 0x75355906, 0x88cca173, 0x26a430e2, 0x0f6c8410,
+    0xa1041581, 0x2fee72bb, 0x8186e32a, 0xa84e57d8, 0x0626c649,
+    0xfbdf3e3c, 0x55b7afad, 0x7c7f1b5f, 0xd2178ace, 0xbadad36a,
+    0x14b242fb, 0x3d7af609, 0x93126798, 0x6eeb9fed, 0xc0830e7c,
+    0xe94bba8e, 0x47232b1f, 0xc9c94c25, 0x67a1ddb4, 0x4e696946,
+    0xe001f8d7, 0x1df800a2, 0xb3909133, 0x9a5825c1, 0x3430b450,
+    0x4bc29689, 0xe5aa0718, 0xcc62b3ea, 0x620a227b, 0x9ff3da0e,
+    0x319b4b9f, 0x1853ff6d, 0xb63b6efc, 0x38d109c6, 0x96b99857,
+    0xbf712ca5, 0x1119bd34, 0xece04541, 0x4288d4d0, 0x6b406022,
+    0xc528f1b3, 0xade5a817, 0x038d3986, 0x2a458d74, 0x842d1ce5,
+    0x79d4e490, 0xd7bc7501, 0xfe74c1f3, 0x501c5062, 0xdef63758,
+    0x709ea6c9, 0x5956123b, 0xf73e83aa, 0x0ac77bdf, 0xa4afea4e,
+    0x8d675ebc, 0x230fcf2d, 0x72831b0e, 0xdceb8a9f, 0xf5233e6d,
+    0x5b4baffc, 0xa6b25789, 0x08dac618, 0x211272ea, 0x8f7ae37b,
+    0x01908441, 0xaff815d0, 0x8630a122, 0x285830b3, 0xd5a1c8c6,
+    0x7bc95957, 0x5201eda5, 0xfc697c34, 0x94a42590, 0x3accb401,
+    0x130400f3, 0xbd6c9162, 0x40956917, 0xeefdf886, 0xc7354c74,
+    0x695ddde5, 0xe7b7badf, 0x49df2b4e, 0x60179fbc, 0xce7f0e2d,
+    0x3386f658, 0x9dee67c9, 0xb426d33b, 0x1a4e42aa, 0x65bc6073,
+    0xcbd4f1e2, 0xe21c4510, 0x4c74d481, 0xb18d2cf4, 0x1fe5bd65,
+    0x362d0997, 0x98459806, 0x16afff3c, 0xb8c76ead, 0x910fda5f,
+    0x3f674bce, 0xc29eb3bb, 0x6cf6222a, 0x453e96d8, 0xeb560749,
+    0x839b5eed, 0x2df3cf7c, 0x043b7b8e, 0xaa53ea1f, 0x57aa126a,
+    0xf9c283fb, 0xd00a3709, 0x7e62a698, 0xf088c1a2, 0x5ee05033,
+    0x7728e4c1, 0xd9407550, 0x24b98d25, 0x8ad11cb4, 0xa319a846,
+    0x0d7139d7},
+   {0x00000000, 0xb9fbdbe8, 0xa886b191, 0x117d6a79, 0x8a7c6563,
+    0x3387be8b, 0x22fad4f2, 0x9b010f1a, 0xcf89cc87, 0x7672176f,
+    0x670f7d16, 0xdef4a6fe, 0x45f5a9e4, 0xfc0e720c, 0xed731875,
+    0x5488c39d, 0x44629f4f, 0xfd9944a7, 0xece42ede, 0x551ff536,
+    0xce1efa2c, 0x77e521c4, 0x66984bbd, 0xdf639055, 0x8beb53c8,
+    0x32108820, 0x236de259, 0x9a9639b1, 0x019736ab, 0xb86ced43,
+    0xa911873a, 0x10ea5cd2, 0x88c53e9e, 0x313ee576, 0x20438f0f,
+    0x99b854e7, 0x02b95bfd, 0xbb428015, 0xaa3fea6c, 0x13c43184,
+    0x474cf219, 0xfeb729f1, 0xefca4388, 0x56319860, 0xcd30977a,
+    0x74cb4c92, 0x65b626eb, 0xdc4dfd03, 0xcca7a1d1, 0x755c7a39,
+    0x64211040, 0xdddacba8, 0x46dbc4b2, 0xff201f5a, 0xee5d7523,
+    0x57a6aecb, 0x032e6d56, 0xbad5b6be, 0xaba8dcc7, 0x1253072f,
+    0x89520835, 0x30a9d3dd, 0x21d4b9a4, 0x982f624c, 0xcafb7b7d,
+    0x7300a095, 0x627dcaec, 0xdb861104, 0x40871e1e, 0xf97cc5f6,
+    0xe801af8f, 0x51fa7467, 0x0572b7fa, 0xbc896c12, 0xadf4066b,
+    0x140fdd83, 0x8f0ed299, 0x36f50971, 0x27886308, 0x9e73b8e0,
+    0x8e99e432, 0x37623fda, 0x261f55a3, 0x9fe48e4b, 0x04e58151,
+    0xbd1e5ab9, 0xac6330c0, 0x1598eb28, 0x411028b5, 0xf8ebf35d,
+    0xe9969924, 0x506d42cc, 0xcb6c4dd6, 0x7297963e, 0x63eafc47,
+    0xda1127af, 0x423e45e3, 0xfbc59e0b, 0xeab8f472, 0x53432f9a,
+    0xc8422080, 0x71b9fb68, 0x60c49111, 0xd93f4af9, 0x8db78964,
+    0x344c528c, 0x253138f5, 0x9ccae31d, 0x07cbec07, 0xbe3037ef,
+    0xaf4d5d96, 0x16b6867e, 0x065cdaac, 0xbfa70144, 0xaeda6b3d,
+    0x1721b0d5, 0x8c20bfcf, 0x35db6427, 0x24a60e5e, 0x9d5dd5b6,
+    0xc9d5162b, 0x702ecdc3, 0x6153a7ba, 0xd8a87c52, 0x43a97348,
+    0xfa52a8a0, 0xeb2fc2d9, 0x52d41931, 0x4e87f0bb, 0xf77c2b53,
+    0xe601412a, 0x5ffa9ac2, 0xc4fb95d8, 0x7d004e30, 0x6c7d2449,
+    0xd586ffa1, 0x810e3c3c, 0x38f5e7d4, 0x29888dad, 0x90735645,
+    0x0b72595f, 0xb28982b7, 0xa3f4e8ce, 0x1a0f3326, 0x0ae56ff4,
+    0xb31eb41c, 0xa263de65, 0x1b98058d, 0x80990a97, 0x3962d17f,
+    0x281fbb06, 0x91e460ee, 0xc56ca373, 0x7c97789b, 0x6dea12e2,
+    0xd411c90a, 0x4f10c610, 0xf6eb1df8, 0xe7967781, 0x5e6dac69,
+    0xc642ce25, 0x7fb915cd, 0x6ec47fb4, 0xd73fa45c, 0x4c3eab46,
+    0xf5c570ae, 0xe4b81ad7, 0x5d43c13f, 0x09cb02a2, 0xb030d94a,
+    0xa14db333, 0x18b668db, 0x83b767c1, 0x3a4cbc29, 0x2b31d650,
+    0x92ca0db8, 0x8220516a, 0x3bdb8a82, 0x2aa6e0fb, 0x935d3b13,
+    0x085c3409, 0xb1a7efe1, 0xa0da8598, 0x19215e70, 0x4da99ded,
+    0xf4524605, 0xe52f2c7c, 0x5cd4f794, 0xc7d5f88e, 0x7e2e2366,
+    0x6f53491f, 0xd6a892f7, 0x847c8bc6, 0x3d87502e, 0x2cfa3a57,
+    0x9501e1bf, 0x0e00eea5, 0xb7fb354d, 0xa6865f34, 0x1f7d84dc,
+    0x4bf54741, 0xf20e9ca9, 0xe373f6d0, 0x5a882d38, 0xc1892222,
+    0x7872f9ca, 0x690f93b3, 0xd0f4485b, 0xc01e1489, 0x79e5cf61,
+    0x6898a518, 0xd1637ef0, 0x4a6271ea, 0xf399aa02, 0xe2e4c07b,
+    0x5b1f1b93, 0x0f97d80e, 0xb66c03e6, 0xa711699f, 0x1eeab277,
+    0x85ebbd6d, 0x3c106685, 0x2d6d0cfc, 0x9496d714, 0x0cb9b558,
+    0xb5426eb0, 0xa43f04c9, 0x1dc4df21, 0x86c5d03b, 0x3f3e0bd3,
+    0x2e4361aa, 0x97b8ba42, 0xc33079df, 0x7acba237, 0x6bb6c84e,
+    0xd24d13a6, 0x494c1cbc, 0xf0b7c754, 0xe1caad2d, 0x583176c5,
+    0x48db2a17, 0xf120f1ff, 0xe05d9b86, 0x59a6406e, 0xc2a74f74,
+    0x7b5c949c, 0x6a21fee5, 0xd3da250d, 0x8752e690, 0x3ea93d78,
+    0x2fd45701, 0x962f8ce9, 0x0d2e83f3, 0xb4d5581b, 0xa5a83262,
+    0x1c53e98a},
+   {0x00000000, 0x9d0fe176, 0xe16ec4ad, 0x7c6125db, 0x19ac8f1b,
+    0x84a36e6d, 0xf8c24bb6, 0x65cdaac0, 0x33591e36, 0xae56ff40,
+    0xd237da9b, 0x4f383bed, 0x2af5912d, 0xb7fa705b, 0xcb9b5580,
+    0x5694b4f6, 0x66b23c6c, 0xfbbddd1a, 0x87dcf8c1, 0x1ad319b7,
+    0x7f1eb377, 0xe2115201, 0x9e7077da, 0x037f96ac, 0x55eb225a,
+    0xc8e4c32c, 0xb485e6f7, 0x298a0781, 0x4c47ad41, 0xd1484c37,
+    0xad2969ec, 0x3026889a, 0xcd6478d8, 0x506b99ae, 0x2c0abc75,
+    0xb1055d03, 0xd4c8f7c3, 0x49c716b5, 0x35a6336e, 0xa8a9d218,
+    0xfe3d66ee, 0x63328798, 0x1f53a243, 0x825c4335, 0xe791e9f5,
+    0x7a9e0883, 0x06ff2d58, 0x9bf0cc2e, 0xabd644b4, 0x36d9a5c2,
+    0x4ab88019, 0xd7b7616f, 0xb27acbaf, 0x2f752ad9, 0x53140f02,
+    0xce1bee74, 0x988f5a82, 0x0580bbf4, 0x79e19e2f, 0xe4ee7f59,
+    0x8123d599, 0x1c2c34ef, 0x604d1134, 0xfd42f042, 0x41b9f7f1,
+    0xdcb61687, 0xa0d7335c, 0x3dd8d22a, 0x581578ea, 0xc51a999c,
+    0xb97bbc47, 0x24745d31, 0x72e0e9c7, 0xefef08b1, 0x938e2d6a,
+    0x0e81cc1c, 0x6b4c66dc, 0xf64387aa, 0x8a22a271, 0x172d4307,
+    0x270bcb9d, 0xba042aeb, 0xc6650f30, 0x5b6aee46, 0x3ea74486,
+    0xa3a8a5f0, 0xdfc9802b, 0x42c6615d, 0x1452d5ab, 0x895d34dd,
+    0xf53c1106, 0x6833f070, 0x0dfe5ab0, 0x90f1bbc6, 0xec909e1d,
+    0x719f7f6b, 0x8cdd8f29, 0x11d26e5f, 0x6db34b84, 0xf0bcaaf2,
+    0x95710032, 0x087ee144, 0x741fc49f, 0xe91025e9, 0xbf84911f,
+    0x228b7069, 0x5eea55b2, 0xc3e5b4c4, 0xa6281e04, 0x3b27ff72,
+    0x4746daa9, 0xda493bdf, 0xea6fb345, 0x77605233, 0x0b0177e8,
+    0x960e969e, 0xf3c33c5e, 0x6eccdd28, 0x12adf8f3, 0x8fa21985,
+    0xd936ad73, 0x44394c05, 0x385869de, 0xa55788a8, 0xc09a2268,
+    0x5d95c31e, 0x21f4e6c5, 0xbcfb07b3, 0x8373efe2, 0x1e7c0e94,
+    0x621d2b4f, 0xff12ca39, 0x9adf60f9, 0x07d0818f, 0x7bb1a454,
+    0xe6be4522, 0xb02af1d4, 0x2d2510a2, 0x51443579, 0xcc4bd40f,
+    0xa9867ecf, 0x34899fb9, 0x48e8ba62, 0xd5e75b14, 0xe5c1d38e,
+    0x78ce32f8, 0x04af1723, 0x99a0f655, 0xfc6d5c95, 0x6162bde3,
+    0x1d039838, 0x800c794e, 0xd698cdb8, 0x4b972cce, 0x37f60915,
+    0xaaf9e863, 0xcf3442a3, 0x523ba3d5, 0x2e5a860e, 0xb3556778,
+    0x4e17973a, 0xd318764c, 0xaf795397, 0x3276b2e1, 0x57bb1821,
+    0xcab4f957, 0xb6d5dc8c, 0x2bda3dfa, 0x7d4e890c, 0xe041687a,
+    0x9c204da1, 0x012facd7, 0x64e20617, 0xf9ede761, 0x858cc2ba,
+    0x188323cc, 0x28a5ab56, 0xb5aa4a20, 0xc9cb6ffb, 0x54c48e8d,
+    0x3109244d, 0xac06c53b, 0xd067e0e0, 0x4d680196, 0x1bfcb560,
+    0x86f35416, 0xfa9271cd, 0x679d90bb, 0x02503a7b, 0x9f5fdb0d,
+    0xe33efed6, 0x7e311fa0, 0xc2ca1813, 0x5fc5f965, 0x23a4dcbe,
+    0xbeab3dc8, 0xdb669708, 0x4669767e, 0x3a0853a5, 0xa707b2d3,
+    0xf1930625, 0x6c9ce753, 0x10fdc288, 0x8df223fe, 0xe83f893e,
+    0x75306848, 0x09514d93, 0x945eace5, 0xa478247f, 0x3977c509,
+    0x4516e0d2, 0xd81901a4, 0xbdd4ab64, 0x20db4a12, 0x5cba6fc9,
+    0xc1b58ebf, 0x97213a49, 0x0a2edb3f, 0x764ffee4, 0xeb401f92,
+    0x8e8db552, 0x13825424, 0x6fe371ff, 0xf2ec9089, 0x0fae60cb,
+    0x92a181bd, 0xeec0a466, 0x73cf4510, 0x1602efd0, 0x8b0d0ea6,
+    0xf76c2b7d, 0x6a63ca0b, 0x3cf77efd, 0xa1f89f8b, 0xdd99ba50,
+    0x40965b26, 0x255bf1e6, 0xb8541090, 0xc435354b, 0x593ad43d,
+    0x691c5ca7, 0xf413bdd1, 0x8872980a, 0x157d797c, 0x70b0d3bc,
+    0xedbf32ca, 0x91de1711, 0x0cd1f667, 0x5a454291, 0xc74aa3e7,
+    0xbb2b863c, 0x2624674a, 0x43e9cd8a, 0xdee62cfc, 0xa2870927,
+    0x3f88e851},
+   {0x00000000, 0xdd96d985, 0x605cb54b, 0xbdca6cce, 0xc0b96a96,
+    0x1d2fb313, 0xa0e5dfdd, 0x7d730658, 0x5a03d36d, 0x87950ae8,
+    0x3a5f6626, 0xe7c9bfa3, 0x9abab9fb, 0x472c607e, 0xfae60cb0,
+    0x2770d535, 0xb407a6da, 0x69917f5f, 0xd45b1391, 0x09cdca14,
+    0x74becc4c, 0xa92815c9, 0x14e27907, 0xc974a082, 0xee0475b7,
+    0x3392ac32, 0x8e58c0fc, 0x53ce1979, 0x2ebd1f21, 0xf32bc6a4,
+    0x4ee1aa6a, 0x937773ef, 0xb37e4bf5, 0x6ee89270, 0xd322febe,
+    0x0eb4273b, 0x73c72163, 0xae51f8e6, 0x139b9428, 0xce0d4dad,
+    0xe97d9898, 0x34eb411d, 0x89212dd3, 0x54b7f456, 0x29c4f20e,
+    0xf4522b8b, 0x49984745, 0x940e9ec0, 0x0779ed2f, 0xdaef34aa,
+    0x67255864, 0xbab381e1, 0xc7c087b9, 0x1a565e3c, 0xa79c32f2,
+    0x7a0aeb77, 0x5d7a3e42, 0x80ece7c7, 0x3d268b09, 0xe0b0528c,
+    0x9dc354d4, 0x40558d51, 0xfd9fe19f, 0x2009381a, 0xbd8d91ab,
+    0x601b482e, 0xddd124e0, 0x0047fd65, 0x7d34fb3d, 0xa0a222b8,
+    0x1d684e76, 0xc0fe97f3, 0xe78e42c6, 0x3a189b43, 0x87d2f78d,
+    0x5a442e08, 0x27372850, 0xfaa1f1d5, 0x476b9d1b, 0x9afd449e,
+    0x098a3771, 0xd41ceef4, 0x69d6823a, 0xb4405bbf, 0xc9335de7,
+    0x14a58462, 0xa96fe8ac, 0x74f93129, 0x5389e41c, 0x8e1f3d99,
+    0x33d55157, 0xee4388d2, 0x93308e8a, 0x4ea6570f, 0xf36c3bc1,
+    0x2efae244, 0x0ef3da5e, 0xd36503db, 0x6eaf6f15, 0xb339b690,
+    0xce4ab0c8, 0x13dc694d, 0xae160583, 0x7380dc06, 0x54f00933,
+    0x8966d0b6, 0x34acbc78, 0xe93a65fd, 0x944963a5, 0x49dfba20,
+    0xf415d6ee, 0x29830f6b, 0xbaf47c84, 0x6762a501, 0xdaa8c9cf,
+    0x073e104a, 0x7a4d1612, 0xa7dbcf97, 0x1a11a359, 0xc7877adc,
+    0xe0f7afe9, 0x3d61766c, 0x80ab1aa2, 0x5d3dc327, 0x204ec57f,
+    0xfdd81cfa, 0x40127034, 0x9d84a9b1, 0xa06a2517, 0x7dfcfc92,
+    0xc036905c, 0x1da049d9, 0x60d34f81, 0xbd459604, 0x008ffaca,
+    0xdd19234f, 0xfa69f67a, 0x27ff2fff, 0x9a354331, 0x47a39ab4,
+    0x3ad09cec, 0xe7464569, 0x5a8c29a7, 0x871af022, 0x146d83cd,
+    0xc9fb5a48, 0x74313686, 0xa9a7ef03, 0xd4d4e95b, 0x094230de,
+    0xb4885c10, 0x691e8595, 0x4e6e50a0, 0x93f88925, 0x2e32e5eb,
+    0xf3a43c6e, 0x8ed73a36, 0x5341e3b3, 0xee8b8f7d, 0x331d56f8,
+    0x13146ee2, 0xce82b767, 0x7348dba9, 0xaede022c, 0xd3ad0474,
+    0x0e3bddf1, 0xb3f1b13f, 0x6e6768ba, 0x4917bd8f, 0x9481640a,
+    0x294b08c4, 0xf4ddd141, 0x89aed719, 0x54380e9c, 0xe9f26252,
+    0x3464bbd7, 0xa713c838, 0x7a8511bd, 0xc74f7d73, 0x1ad9a4f6,
+    0x67aaa2ae, 0xba3c7b2b, 0x07f617e5, 0xda60ce60, 0xfd101b55,
+    0x2086c2d0, 0x9d4cae1e, 0x40da779b, 0x3da971c3, 0xe03fa846,
+    0x5df5c488, 0x80631d0d, 0x1de7b4bc, 0xc0716d39, 0x7dbb01f7,
+    0xa02dd872, 0xdd5ede2a, 0x00c807af, 0xbd026b61, 0x6094b2e4,
+    0x47e467d1, 0x9a72be54, 0x27b8d29a, 0xfa2e0b1f, 0x875d0d47,
+    0x5acbd4c2, 0xe701b80c, 0x3a976189, 0xa9e01266, 0x7476cbe3,
+    0xc9bca72d, 0x142a7ea8, 0x695978f0, 0xb4cfa175, 0x0905cdbb,
+    0xd493143e, 0xf3e3c10b, 0x2e75188e, 0x93bf7440, 0x4e29adc5,
+    0x335aab9d, 0xeecc7218, 0x53061ed6, 0x8e90c753, 0xae99ff49,
+    0x730f26cc, 0xcec54a02, 0x13539387, 0x6e2095df, 0xb3b64c5a,
+    0x0e7c2094, 0xd3eaf911, 0xf49a2c24, 0x290cf5a1, 0x94c6996f,
+    0x495040ea, 0x342346b2, 0xe9b59f37, 0x547ff3f9, 0x89e92a7c,
+    0x1a9e5993, 0xc7088016, 0x7ac2ecd8, 0xa754355d, 0xda273305,
+    0x07b1ea80, 0xba7b864e, 0x67ed5fcb, 0x409d8afe, 0x9d0b537b,
+    0x20c13fb5, 0xfd57e630, 0x8024e068, 0x5db239ed, 0xe0785523,
+    0x3dee8ca6},
+   {0x00000000, 0x9ba54c6f, 0xec3b9e9f, 0x779ed2f0, 0x03063b7f,
+    0x98a37710, 0xef3da5e0, 0x7498e98f, 0x060c76fe, 0x9da93a91,
+    0xea37e861, 0x7192a40e, 0x050a4d81, 0x9eaf01ee, 0xe931d31e,
+    0x72949f71, 0x0c18edfc, 0x97bda193, 0xe0237363, 0x7b863f0c,
+    0x0f1ed683, 0x94bb9aec, 0xe325481c, 0x78800473, 0x0a149b02,
+    0x91b1d76d, 0xe62f059d, 0x7d8a49f2, 0x0912a07d, 0x92b7ec12,
+    0xe5293ee2, 0x7e8c728d, 0x1831dbf8, 0x83949797, 0xf40a4567,
+    0x6faf0908, 0x1b37e087, 0x8092ace8, 0xf70c7e18, 0x6ca93277,
+    0x1e3dad06, 0x8598e169, 0xf2063399, 0x69a37ff6, 0x1d3b9679,
+    0x869eda16, 0xf10008e6, 0x6aa54489, 0x14293604, 0x8f8c7a6b,
+    0xf812a89b, 0x63b7e4f4, 0x172f0d7b, 0x8c8a4114, 0xfb1493e4,
+    0x60b1df8b, 0x122540fa, 0x89800c95, 0xfe1ede65, 0x65bb920a,
+    0x11237b85, 0x8a8637ea, 0xfd18e51a, 0x66bda975, 0x3063b7f0,
+    0xabc6fb9f, 0xdc58296f, 0x47fd6500, 0x33658c8f, 0xa8c0c0e0,
+    0xdf5e1210, 0x44fb5e7f, 0x366fc10e, 0xadca8d61, 0xda545f91,
+    0x41f113fe, 0x3569fa71, 0xaeccb61e, 0xd95264ee, 0x42f72881,
+    0x3c7b5a0c, 0xa7de1663, 0xd040c493, 0x4be588fc, 0x3f7d6173,
+    0xa4d82d1c, 0xd346ffec, 0x48e3b383, 0x3a772cf2, 0xa1d2609d,
+    0xd64cb26d, 0x4de9fe02, 0x3971178d, 0xa2d45be2, 0xd54a8912,
+    0x4eefc57d, 0x28526c08, 0xb3f72067, 0xc469f297, 0x5fccbef8,
+    0x2b545777, 0xb0f11b18, 0xc76fc9e8, 0x5cca8587, 0x2e5e1af6,
+    0xb5fb5699, 0xc2658469, 0x59c0c806, 0x2d582189, 0xb6fd6de6,
+    0xc163bf16, 0x5ac6f379, 0x244a81f4, 0xbfefcd9b, 0xc8711f6b,
+    0x53d45304, 0x274cba8b, 0xbce9f6e4, 0xcb772414, 0x50d2687b,
+    0x2246f70a, 0xb9e3bb65, 0xce7d6995, 0x55d825fa, 0x2140cc75,
+    0xbae5801a, 0xcd7b52ea, 0x56de1e85, 0x60c76fe0, 0xfb62238f,
+    0x8cfcf17f, 0x1759bd10, 0x63c1549f, 0xf86418f0, 0x8ffaca00,
+    0x145f866f, 0x66cb191e, 0xfd6e5571, 0x8af08781, 0x1155cbee,
+    0x65cd2261, 0xfe686e0e, 0x89f6bcfe, 0x1253f091, 0x6cdf821c,
+    0xf77ace73, 0x80e41c83, 0x1b4150ec, 0x6fd9b963, 0xf47cf50c,
+    0x83e227fc, 0x18476b93, 0x6ad3f4e2, 0xf176b88d, 0x86e86a7d,
+    0x1d4d2612, 0x69d5cf9d, 0xf27083f2, 0x85ee5102, 0x1e4b1d6d,
+    0x78f6b418, 0xe353f877, 0x94cd2a87, 0x0f6866e8, 0x7bf08f67,
+    0xe055c308, 0x97cb11f8, 0x0c6e5d97, 0x7efac2e6, 0xe55f8e89,
+    0x92c15c79, 0x09641016, 0x7dfcf999, 0xe659b5f6, 0x91c76706,
+    0x0a622b69, 0x74ee59e4, 0xef4b158b, 0x98d5c77b, 0x03708b14,
+    0x77e8629b, 0xec4d2ef4, 0x9bd3fc04, 0x0076b06b, 0x72e22f1a,
+    0xe9476375, 0x9ed9b185, 0x057cfdea, 0x71e41465, 0xea41580a,
+    0x9ddf8afa, 0x067ac695, 0x50a4d810, 0xcb01947f, 0xbc9f468f,
+    0x273a0ae0, 0x53a2e36f, 0xc807af00, 0xbf997df0, 0x243c319f,
+    0x56a8aeee, 0xcd0de281, 0xba933071, 0x21367c1e, 0x55ae9591,
+    0xce0bd9fe, 0xb9950b0e, 0x22304761, 0x5cbc35ec, 0xc7197983,
+    0xb087ab73, 0x2b22e71c, 0x5fba0e93, 0xc41f42fc, 0xb381900c,
+    0x2824dc63, 0x5ab04312, 0xc1150f7d, 0xb68bdd8d, 0x2d2e91e2,
+    0x59b6786d, 0xc2133402, 0xb58de6f2, 0x2e28aa9d, 0x489503e8,
+    0xd3304f87, 0xa4ae9d77, 0x3f0bd118, 0x4b933897, 0xd03674f8,
+    0xa7a8a608, 0x3c0dea67, 0x4e997516, 0xd53c3979, 0xa2a2eb89,
+    0x3907a7e6, 0x4d9f4e69, 0xd63a0206, 0xa1a4d0f6, 0x3a019c99,
+    0x448dee14, 0xdf28a27b, 0xa8b6708b, 0x33133ce4, 0x478bd56b,
+    0xdc2e9904, 0xabb04bf4, 0x3015079b, 0x428198ea, 0xd924d485,
+    0xaeba0675, 0x351f4a1a, 0x4187a395, 0xda22effa, 0xadbc3d0a,
+    0x36197165},
+   {0x00000000, 0xc18edfc0, 0x586cb9c1, 0x99e26601, 0xb0d97382,
+    0x7157ac42, 0xe8b5ca43, 0x293b1583, 0xbac3e145, 0x7b4d3e85,
+    0xe2af5884, 0x23218744, 0x0a1a92c7, 0xcb944d07, 0x52762b06,
+    0x93f8f4c6, 0xaef6c4cb, 0x6f781b0b, 0xf69a7d0a, 0x3714a2ca,
+    0x1e2fb749, 0xdfa16889, 0x46430e88, 0x87cdd148, 0x1435258e,
+    0xd5bbfa4e, 0x4c599c4f, 0x8dd7438f, 0xa4ec560c, 0x656289cc,
+    0xfc80efcd, 0x3d0e300d, 0x869c8fd7, 0x47125017, 0xdef03616,
+    0x1f7ee9d6, 0x3645fc55, 0xf7cb2395, 0x6e294594, 0xafa79a54,
+    0x3c5f6e92, 0xfdd1b152, 0x6433d753, 0xa5bd0893, 0x8c861d10,
+    0x4d08c2d0, 0xd4eaa4d1, 0x15647b11, 0x286a4b1c, 0xe9e494dc,
+    0x7006f2dd, 0xb1882d1d, 0x98b3389e, 0x593de75e, 0xc0df815f,
+    0x01515e9f, 0x92a9aa59, 0x53277599, 0xcac51398, 0x0b4bcc58,
+    0x2270d9db, 0xe3fe061b, 0x7a1c601a, 0xbb92bfda, 0xd64819ef,
+    0x17c6c62f, 0x8e24a02e, 0x4faa7fee, 0x66916a6d, 0xa71fb5ad,
+    0x3efdd3ac, 0xff730c6c, 0x6c8bf8aa, 0xad05276a, 0x34e7416b,
+    0xf5699eab, 0xdc528b28, 0x1ddc54e8, 0x843e32e9, 0x45b0ed29,
+    0x78bedd24, 0xb93002e4, 0x20d264e5, 0xe15cbb25, 0xc867aea6,
+    0x09e97166, 0x900b1767, 0x5185c8a7, 0xc27d3c61, 0x03f3e3a1,
+    0x9a1185a0, 0x5b9f5a60, 0x72a44fe3, 0xb32a9023, 0x2ac8f622,
+    0xeb4629e2, 0x50d49638, 0x915a49f8, 0x08b82ff9, 0xc936f039,
+    0xe00de5ba, 0x21833a7a, 0xb8615c7b, 0x79ef83bb, 0xea17777d,
+    0x2b99a8bd, 0xb27bcebc, 0x73f5117c, 0x5ace04ff, 0x9b40db3f,
+    0x02a2bd3e, 0xc32c62fe, 0xfe2252f3, 0x3fac8d33, 0xa64eeb32,
+    0x67c034f2, 0x4efb2171, 0x8f75feb1, 0x169798b0, 0xd7194770,
+    0x44e1b3b6, 0x856f6c76, 0x1c8d0a77, 0xdd03d5b7, 0xf438c034,
+    0x35b61ff4, 0xac5479f5, 0x6ddaa635, 0x77e1359f, 0xb66fea5f,
+    0x2f8d8c5e, 0xee03539e, 0xc738461d, 0x06b699dd, 0x9f54ffdc,
+    0x5eda201c, 0xcd22d4da, 0x0cac0b1a, 0x954e6d1b, 0x54c0b2db,
+    0x7dfba758, 0xbc757898, 0x25971e99, 0xe419c159, 0xd917f154,
+    0x18992e94, 0x817b4895, 0x40f59755, 0x69ce82d6, 0xa8405d16,
+    0x31a23b17, 0xf02ce4d7, 0x63d41011, 0xa25acfd1, 0x3bb8a9d0,
+    0xfa367610, 0xd30d6393, 0x1283bc53, 0x8b61da52, 0x4aef0592,
+    0xf17dba48, 0x30f36588, 0xa9110389, 0x689fdc49, 0x41a4c9ca,
+    0x802a160a, 0x19c8700b, 0xd846afcb, 0x4bbe5b0d, 0x8a3084cd,
+    0x13d2e2cc, 0xd25c3d0c, 0xfb67288f, 0x3ae9f74f, 0xa30b914e,
+    0x62854e8e, 0x5f8b7e83, 0x9e05a143, 0x07e7c742, 0xc6691882,
+    0xef520d01, 0x2edcd2c1, 0xb73eb4c0, 0x76b06b00, 0xe5489fc6,
+    0x24c64006, 0xbd242607, 0x7caaf9c7, 0x5591ec44, 0x941f3384,
+    0x0dfd5585, 0xcc738a45, 0xa1a92c70, 0x6027f3b0, 0xf9c595b1,
+    0x384b4a71, 0x11705ff2, 0xd0fe8032, 0x491ce633, 0x889239f3,
+    0x1b6acd35, 0xdae412f5, 0x430674f4, 0x8288ab34, 0xabb3beb7,
+    0x6a3d6177, 0xf3df0776, 0x3251d8b6, 0x0f5fe8bb, 0xced1377b,
+    0x5733517a, 0x96bd8eba, 0xbf869b39, 0x7e0844f9, 0xe7ea22f8,
+    0x2664fd38, 0xb59c09fe, 0x7412d63e, 0xedf0b03f, 0x2c7e6fff,
+    0x05457a7c, 0xc4cba5bc, 0x5d29c3bd, 0x9ca71c7d, 0x2735a3a7,
+    0xe6bb7c67, 0x7f591a66, 0xbed7c5a6, 0x97ecd025, 0x56620fe5,
+    0xcf8069e4, 0x0e0eb624, 0x9df642e2, 0x5c789d22, 0xc59afb23,
+    0x041424e3, 0x2d2f3160, 0xeca1eea0, 0x754388a1, 0xb4cd5761,
+    0x89c3676c, 0x484db8ac, 0xd1afdead, 0x1021016d, 0x391a14ee,
+    0xf894cb2e, 0x6176ad2f, 0xa0f872ef, 0x33008629, 0xf28e59e9,
+    0x6b6c3fe8, 0xaae2e028, 0x83d9f5ab, 0x42572a6b, 0xdbb54c6a,
+    0x1a3b93aa},
+   {0x00000000, 0xefc26b3e, 0x04f5d03d, 0xeb37bb03, 0x09eba07a,
+    0xe629cb44, 0x0d1e7047, 0xe2dc1b79, 0x13d740f4, 0xfc152bca,
+    0x172290c9, 0xf8e0fbf7, 0x1a3ce08e, 0xf5fe8bb0, 0x1ec930b3,
+    0xf10b5b8d, 0x27ae81e8, 0xc86cead6, 0x235b51d5, 0xcc993aeb,
+    0x2e452192, 0xc1874aac, 0x2ab0f1af, 0xc5729a91, 0x3479c11c,
+    0xdbbbaa22, 0x308c1121, 0xdf4e7a1f, 0x3d926166, 0xd2500a58,
+    0x3967b15b, 0xd6a5da65, 0x4f5d03d0, 0xa09f68ee, 0x4ba8d3ed,
+    0xa46ab8d3, 0x46b6a3aa, 0xa974c894, 0x42437397, 0xad8118a9,
+    0x5c8a4324, 0xb348281a, 0x587f9319, 0xb7bdf827, 0x5561e35e,
+    0xbaa38860, 0x51943363, 0xbe56585d, 0x68f38238, 0x8731e906,
+    0x6c065205, 0x83c4393b, 0x61182242, 0x8eda497c, 0x65edf27f,
+    0x8a2f9941, 0x7b24c2cc, 0x94e6a9f2, 0x7fd112f1, 0x901379cf,
+    0x72cf62b6, 0x9d0d0988, 0x763ab28b, 0x99f8d9b5, 0x9eba07a0,
+    0x71786c9e, 0x9a4fd79d, 0x758dbca3, 0x9751a7da, 0x7893cce4,
+    0x93a477e7, 0x7c661cd9, 0x8d6d4754, 0x62af2c6a, 0x89989769,
+    0x665afc57, 0x8486e72e, 0x6b448c10, 0x80733713, 0x6fb15c2d,
+    0xb9148648, 0x56d6ed76, 0xbde15675, 0x52233d4b, 0xb0ff2632,
+    0x5f3d4d0c, 0xb40af60f, 0x5bc89d31, 0xaac3c6bc, 0x4501ad82,
+    0xae361681, 0x41f47dbf, 0xa32866c6, 0x4cea0df8, 0xa7ddb6fb,
+    0x481fddc5, 0xd1e70470, 0x3e256f4e, 0xd512d44d, 0x3ad0bf73,
+    0xd80ca40a, 0x37cecf34, 0xdcf97437, 0x333b1f09, 0xc2304484,
+    0x2df22fba, 0xc6c594b9, 0x2907ff87, 0xcbdbe4fe, 0x24198fc0,
+    0xcf2e34c3, 0x20ec5ffd, 0xf6498598, 0x198beea6, 0xf2bc55a5,
+    0x1d7e3e9b, 0xffa225e2, 0x10604edc, 0xfb57f5df, 0x14959ee1,
+    0xe59ec56c, 0x0a5cae52, 0xe16b1551, 0x0ea97e6f, 0xec756516,
+    0x03b70e28, 0xe880b52b, 0x0742de15, 0xe6050901, 0x09c7623f,
+    0xe2f0d93c, 0x0d32b202, 0xefeea97b, 0x002cc245, 0xeb1b7946,
+    0x04d91278, 0xf5d249f5, 0x1a1022cb, 0xf12799c8, 0x1ee5f2f6,
+    0xfc39e98f, 0x13fb82b1, 0xf8cc39b2, 0x170e528c, 0xc1ab88e9,
+    0x2e69e3d7, 0xc55e58d4, 0x2a9c33ea, 0xc8402893, 0x278243ad,
+    0xccb5f8ae, 0x23779390, 0xd27cc81d, 0x3dbea323, 0xd6891820,
+    0x394b731e, 0xdb976867, 0x34550359, 0xdf62b85a, 0x30a0d364,
+    0xa9580ad1, 0x469a61ef, 0xadaddaec, 0x426fb1d2, 0xa0b3aaab,
+    0x4f71c195, 0xa4467a96, 0x4b8411a8, 0xba8f4a25, 0x554d211b,
+    0xbe7a9a18, 0x51b8f126, 0xb364ea5f, 0x5ca68161, 0xb7913a62,
+    0x5853515c, 0x8ef68b39, 0x6134e007, 0x8a035b04, 0x65c1303a,
+    0x871d2b43, 0x68df407d, 0x83e8fb7e, 0x6c2a9040, 0x9d21cbcd,
+    0x72e3a0f3, 0x99d41bf0, 0x761670ce, 0x94ca6bb7, 0x7b080089,
+    0x903fbb8a, 0x7ffdd0b4, 0x78bf0ea1, 0x977d659f, 0x7c4ade9c,
+    0x9388b5a2, 0x7154aedb, 0x9e96c5e5, 0x75a17ee6, 0x9a6315d8,
+    0x6b684e55, 0x84aa256b, 0x6f9d9e68, 0x805ff556, 0x6283ee2f,
+    0x8d418511, 0x66763e12, 0x89b4552c, 0x5f118f49, 0xb0d3e477,
+    0x5be45f74, 0xb426344a, 0x56fa2f33, 0xb938440d, 0x520fff0e,
+    0xbdcd9430, 0x4cc6cfbd, 0xa304a483, 0x48331f80, 0xa7f174be,
+    0x452d6fc7, 0xaaef04f9, 0x41d8bffa, 0xae1ad4c4, 0x37e20d71,
+    0xd820664f, 0x3317dd4c, 0xdcd5b672, 0x3e09ad0b, 0xd1cbc635,
+    0x3afc7d36, 0xd53e1608, 0x24354d85, 0xcbf726bb, 0x20c09db8,
+    0xcf02f686, 0x2ddeedff, 0xc21c86c1, 0x292b3dc2, 0xc6e956fc,
+    0x104c8c99, 0xff8ee7a7, 0x14b95ca4, 0xfb7b379a, 0x19a72ce3,
+    0xf66547dd, 0x1d52fcde, 0xf29097e0, 0x039bcc6d, 0xec59a753,
+    0x076e1c50, 0xe8ac776e, 0x0a706c17, 0xe5b20729, 0x0e85bc2a,
+    0xe147d714},
+   {0x00000000, 0x177b1443, 0x2ef62886, 0x398d3cc5, 0x5dec510c,
+    0x4a97454f, 0x731a798a, 0x64616dc9, 0xbbd8a218, 0xaca3b65b,
+    0x952e8a9e, 0x82559edd, 0xe634f314, 0xf14fe757, 0xc8c2db92,
+    0xdfb9cfd1, 0xacc04271, 0xbbbb5632, 0x82366af7, 0x954d7eb4,
+    0xf12c137d, 0xe657073e, 0xdfda3bfb, 0xc8a12fb8, 0x1718e069,
+    0x0063f42a, 0x39eec8ef, 0x2e95dcac, 0x4af4b165, 0x5d8fa526,
+    0x640299e3, 0x73798da0, 0x82f182a3, 0x958a96e0, 0xac07aa25,
+    0xbb7cbe66, 0xdf1dd3af, 0xc866c7ec, 0xf1ebfb29, 0xe690ef6a,
+    0x392920bb, 0x2e5234f8, 0x17df083d, 0x00a41c7e, 0x64c571b7,
+    0x73be65f4, 0x4a335931, 0x5d484d72, 0x2e31c0d2, 0x394ad491,
+    0x00c7e854, 0x17bcfc17, 0x73dd91de, 0x64a6859d, 0x5d2bb958,
+    0x4a50ad1b, 0x95e962ca, 0x82927689, 0xbb1f4a4c, 0xac645e0f,
+    0xc80533c6, 0xdf7e2785, 0xe6f31b40, 0xf1880f03, 0xde920307,
+    0xc9e91744, 0xf0642b81, 0xe71f3fc2, 0x837e520b, 0x94054648,
+    0xad887a8d, 0xbaf36ece, 0x654aa11f, 0x7231b55c, 0x4bbc8999,
+    0x5cc79dda, 0x38a6f013, 0x2fdde450, 0x1650d895, 0x012bccd6,
+    0x72524176, 0x65295535, 0x5ca469f0, 0x4bdf7db3, 0x2fbe107a,
+    0x38c50439, 0x014838fc, 0x16332cbf, 0xc98ae36e, 0xdef1f72d,
+    0xe77ccbe8, 0xf007dfab, 0x9466b262, 0x831da621, 0xba909ae4,
+    0xadeb8ea7, 0x5c6381a4, 0x4b1895e7, 0x7295a922, 0x65eebd61,
+    0x018fd0a8, 0x16f4c4eb, 0x2f79f82e, 0x3802ec6d, 0xe7bb23bc,
+    0xf0c037ff, 0xc94d0b3a, 0xde361f79, 0xba5772b0, 0xad2c66f3,
+    0x94a15a36, 0x83da4e75, 0xf0a3c3d5, 0xe7d8d796, 0xde55eb53,
+    0xc92eff10, 0xad4f92d9, 0xba34869a, 0x83b9ba5f, 0x94c2ae1c,
+    0x4b7b61cd, 0x5c00758e, 0x658d494b, 0x72f65d08, 0x169730c1,
+    0x01ec2482, 0x38611847, 0x2f1a0c04, 0x6655004f, 0x712e140c,
+    0x48a328c9, 0x5fd83c8a, 0x3bb95143, 0x2cc24500, 0x154f79c5,
+    0x02346d86, 0xdd8da257, 0xcaf6b614, 0xf37b8ad1, 0xe4009e92,
+    0x8061f35b, 0x971ae718, 0xae97dbdd, 0xb9eccf9e, 0xca95423e,
+    0xddee567d, 0xe4636ab8, 0xf3187efb, 0x97791332, 0x80020771,
+    0xb98f3bb4, 0xaef42ff7, 0x714de026, 0x6636f465, 0x5fbbc8a0,
+    0x48c0dce3, 0x2ca1b12a, 0x3bdaa569, 0x025799ac, 0x152c8def,
+    0xe4a482ec, 0xf3df96af, 0xca52aa6a, 0xdd29be29, 0xb948d3e0,
+    0xae33c7a3, 0x97befb66, 0x80c5ef25, 0x5f7c20f4, 0x480734b7,
+    0x718a0872, 0x66f11c31, 0x029071f8, 0x15eb65bb, 0x2c66597e,
+    0x3b1d4d3d, 0x4864c09d, 0x5f1fd4de, 0x6692e81b, 0x71e9fc58,
+    0x15889191, 0x02f385d2, 0x3b7eb917, 0x2c05ad54, 0xf3bc6285,
+    0xe4c776c6, 0xdd4a4a03, 0xca315e40, 0xae503389, 0xb92b27ca,
+    0x80a61b0f, 0x97dd0f4c, 0xb8c70348, 0xafbc170b, 0x96312bce,
+    0x814a3f8d, 0xe52b5244, 0xf2504607, 0xcbdd7ac2, 0xdca66e81,
+    0x031fa150, 0x1464b513, 0x2de989d6, 0x3a929d95, 0x5ef3f05c,
+    0x4988e41f, 0x7005d8da, 0x677ecc99, 0x14074139, 0x037c557a,
+    0x3af169bf, 0x2d8a7dfc, 0x49eb1035, 0x5e900476, 0x671d38b3,
+    0x70662cf0, 0xafdfe321, 0xb8a4f762, 0x8129cba7, 0x9652dfe4,
+    0xf233b22d, 0xe548a66e, 0xdcc59aab, 0xcbbe8ee8, 0x3a3681eb,
+    0x2d4d95a8, 0x14c0a96d, 0x03bbbd2e, 0x67dad0e7, 0x70a1c4a4,
+    0x492cf861, 0x5e57ec22, 0x81ee23f3, 0x969537b0, 0xaf180b75,
+    0xb8631f36, 0xdc0272ff, 0xcb7966bc, 0xf2f45a79, 0xe58f4e3a,
+    0x96f6c39a, 0x818dd7d9, 0xb800eb1c, 0xaf7bff5f, 0xcb1a9296,
+    0xdc6186d5, 0xe5ecba10, 0xf297ae53, 0x2d2e6182, 0x3a5575c1,
+    0x03d84904, 0x14a35d47, 0x70c2308e, 0x67b924cd, 0x5e341808,
+    0x494f0c4b}};
+
+static const z_word_t crc_braid_big_table[][256] = {
+   {0x0000000000000000, 0x43147b1700000000, 0x8628f62e00000000,
+    0xc53c8d3900000000, 0x0c51ec5d00000000, 0x4f45974a00000000,
+    0x8a791a7300000000, 0xc96d616400000000, 0x18a2d8bb00000000,
+    0x5bb6a3ac00000000, 0x9e8a2e9500000000, 0xdd9e558200000000,
+    0x14f334e600000000, 0x57e74ff100000000, 0x92dbc2c800000000,
+    0xd1cfb9df00000000, 0x7142c0ac00000000, 0x3256bbbb00000000,
+    0xf76a368200000000, 0xb47e4d9500000000, 0x7d132cf100000000,
+    0x3e0757e600000000, 0xfb3bdadf00000000, 0xb82fa1c800000000,
+    0x69e0181700000000, 0x2af4630000000000, 0xefc8ee3900000000,
+    0xacdc952e00000000, 0x65b1f44a00000000, 0x26a58f5d00000000,
+    0xe399026400000000, 0xa08d797300000000, 0xa382f18200000000,
+    0xe0968a9500000000, 0x25aa07ac00000000, 0x66be7cbb00000000,
+    0xafd31ddf00000000, 0xecc766c800000000, 0x29fbebf100000000,
+    0x6aef90e600000000, 0xbb20293900000000, 0xf834522e00000000,
+    0x3d08df1700000000, 0x7e1ca40000000000, 0xb771c56400000000,
+    0xf465be7300000000, 0x3159334a00000000, 0x724d485d00000000,
+    0xd2c0312e00000000, 0x91d44a3900000000, 0x54e8c70000000000,
+    0x17fcbc1700000000, 0xde91dd7300000000, 0x9d85a66400000000,
+    0x58b92b5d00000000, 0x1bad504a00000000, 0xca62e99500000000,
+    0x8976928200000000, 0x4c4a1fbb00000000, 0x0f5e64ac00000000,
+    0xc63305c800000000, 0x85277edf00000000, 0x401bf3e600000000,
+    0x030f88f100000000, 0x070392de00000000, 0x4417e9c900000000,
+    0x812b64f000000000, 0xc23f1fe700000000, 0x0b527e8300000000,
+    0x4846059400000000, 0x8d7a88ad00000000, 0xce6ef3ba00000000,
+    0x1fa14a6500000000, 0x5cb5317200000000, 0x9989bc4b00000000,
+    0xda9dc75c00000000, 0x13f0a63800000000, 0x50e4dd2f00000000,
+    0x95d8501600000000, 0xd6cc2b0100000000, 0x7641527200000000,
+    0x3555296500000000, 0xf069a45c00000000, 0xb37ddf4b00000000,
+    0x7a10be2f00000000, 0x3904c53800000000, 0xfc38480100000000,
+    0xbf2c331600000000, 0x6ee38ac900000000, 0x2df7f1de00000000,
+    0xe8cb7ce700000000, 0xabdf07f000000000, 0x62b2669400000000,
+    0x21a61d8300000000, 0xe49a90ba00000000, 0xa78eebad00000000,
+    0xa481635c00000000, 0xe795184b00000000, 0x22a9957200000000,
+    0x61bdee6500000000, 0xa8d08f0100000000, 0xebc4f41600000000,
+    0x2ef8792f00000000, 0x6dec023800000000, 0xbc23bbe700000000,
+    0xff37c0f000000000, 0x3a0b4dc900000000, 0x791f36de00000000,
+    0xb07257ba00000000, 0xf3662cad00000000, 0x365aa19400000000,
+    0x754eda8300000000, 0xd5c3a3f000000000, 0x96d7d8e700000000,
+    0x53eb55de00000000, 0x10ff2ec900000000, 0xd9924fad00000000,
+    0x9a8634ba00000000, 0x5fbab98300000000, 0x1caec29400000000,
+    0xcd617b4b00000000, 0x8e75005c00000000, 0x4b498d6500000000,
+    0x085df67200000000, 0xc130971600000000, 0x8224ec0100000000,
+    0x4718613800000000, 0x040c1a2f00000000, 0x4f00556600000000,
+    0x0c142e7100000000, 0xc928a34800000000, 0x8a3cd85f00000000,
+    0x4351b93b00000000, 0x0045c22c00000000, 0xc5794f1500000000,
+    0x866d340200000000, 0x57a28ddd00000000, 0x14b6f6ca00000000,
+    0xd18a7bf300000000, 0x929e00e400000000, 0x5bf3618000000000,
+    0x18e71a9700000000, 0xdddb97ae00000000, 0x9ecfecb900000000,
+    0x3e4295ca00000000, 0x7d56eedd00000000, 0xb86a63e400000000,
+    0xfb7e18f300000000, 0x3213799700000000, 0x7107028000000000,
+    0xb43b8fb900000000, 0xf72ff4ae00000000, 0x26e04d7100000000,
+    0x65f4366600000000, 0xa0c8bb5f00000000, 0xe3dcc04800000000,
+    0x2ab1a12c00000000, 0x69a5da3b00000000, 0xac99570200000000,
+    0xef8d2c1500000000, 0xec82a4e400000000, 0xaf96dff300000000,
+    0x6aaa52ca00000000, 0x29be29dd00000000, 0xe0d348b900000000,
+    0xa3c733ae00000000, 0x66fbbe9700000000, 0x25efc58000000000,
+    0xf4207c5f00000000, 0xb734074800000000, 0x72088a7100000000,
+    0x311cf16600000000, 0xf871900200000000, 0xbb65eb1500000000,
+    0x7e59662c00000000, 0x3d4d1d3b00000000, 0x9dc0644800000000,
+    0xded41f5f00000000, 0x1be8926600000000, 0x58fce97100000000,
+    0x9191881500000000, 0xd285f30200000000, 0x17b97e3b00000000,
+    0x54ad052c00000000, 0x8562bcf300000000, 0xc676c7e400000000,
+    0x034a4add00000000, 0x405e31ca00000000, 0x893350ae00000000,
+    0xca272bb900000000, 0x0f1ba68000000000, 0x4c0fdd9700000000,
+    0x4803c7b800000000, 0x0b17bcaf00000000, 0xce2b319600000000,
+    0x8d3f4a8100000000, 0x44522be500000000, 0x074650f200000000,
+    0xc27addcb00000000, 0x816ea6dc00000000, 0x50a11f0300000000,
+    0x13b5641400000000, 0xd689e92d00000000, 0x959d923a00000000,
+    0x5cf0f35e00000000, 0x1fe4884900000000, 0xdad8057000000000,
+    0x99cc7e6700000000, 0x3941071400000000, 0x7a557c0300000000,
+    0xbf69f13a00000000, 0xfc7d8a2d00000000, 0x3510eb4900000000,
+    0x7604905e00000000, 0xb3381d6700000000, 0xf02c667000000000,
+    0x21e3dfaf00000000, 0x62f7a4b800000000, 0xa7cb298100000000,
+    0xe4df529600000000, 0x2db233f200000000, 0x6ea648e500000000,
+    0xab9ac5dc00000000, 0xe88ebecb00000000, 0xeb81363a00000000,
+    0xa8954d2d00000000, 0x6da9c01400000000, 0x2ebdbb0300000000,
+    0xe7d0da6700000000, 0xa4c4a17000000000, 0x61f82c4900000000,
+    0x22ec575e00000000, 0xf323ee8100000000, 0xb037959600000000,
+    0x750b18af00000000, 0x361f63b800000000, 0xff7202dc00000000,
+    0xbc6679cb00000000, 0x795af4f200000000, 0x3a4e8fe500000000,
+    0x9ac3f69600000000, 0xd9d78d8100000000, 0x1ceb00b800000000,
+    0x5fff7baf00000000, 0x96921acb00000000, 0xd58661dc00000000,
+    0x10baece500000000, 0x53ae97f200000000, 0x82612e2d00000000,
+    0xc175553a00000000, 0x0449d80300000000, 0x475da31400000000,
+    0x8e30c27000000000, 0xcd24b96700000000, 0x0818345e00000000,
+    0x4b0c4f4900000000},
+   {0x0000000000000000, 0x3e6bc2ef00000000, 0x3dd0f50400000000,
+    0x03bb37eb00000000, 0x7aa0eb0900000000, 0x44cb29e600000000,
+    0x47701e0d00000000, 0x791bdce200000000, 0xf440d71300000000,
+    0xca2b15fc00000000, 0xc990221700000000, 0xf7fbe0f800000000,
+    0x8ee03c1a00000000, 0xb08bfef500000000, 0xb330c91e00000000,
+    0x8d5b0bf100000000, 0xe881ae2700000000, 0xd6ea6cc800000000,
+    0xd5515b2300000000, 0xeb3a99cc00000000, 0x9221452e00000000,
+    0xac4a87c100000000, 0xaff1b02a00000000, 0x919a72c500000000,
+    0x1cc1793400000000, 0x22aabbdb00000000, 0x21118c3000000000,
+    0x1f7a4edf00000000, 0x6661923d00000000, 0x580a50d200000000,
+    0x5bb1673900000000, 0x65daa5d600000000, 0xd0035d4f00000000,
+    0xee689fa000000000, 0xedd3a84b00000000, 0xd3b86aa400000000,
+    0xaaa3b64600000000, 0x94c874a900000000, 0x9773434200000000,
+    0xa91881ad00000000, 0x24438a5c00000000, 0x1a2848b300000000,
+    0x19937f5800000000, 0x27f8bdb700000000, 0x5ee3615500000000,
+    0x6088a3ba00000000, 0x6333945100000000, 0x5d5856be00000000,
+    0x3882f36800000000, 0x06e9318700000000, 0x0552066c00000000,
+    0x3b39c48300000000, 0x4222186100000000, 0x7c49da8e00000000,
+    0x7ff2ed6500000000, 0x41992f8a00000000, 0xccc2247b00000000,
+    0xf2a9e69400000000, 0xf112d17f00000000, 0xcf79139000000000,
+    0xb662cf7200000000, 0x88090d9d00000000, 0x8bb23a7600000000,
+    0xb5d9f89900000000, 0xa007ba9e00000000, 0x9e6c787100000000,
+    0x9dd74f9a00000000, 0xa3bc8d7500000000, 0xdaa7519700000000,
+    0xe4cc937800000000, 0xe777a49300000000, 0xd91c667c00000000,
+    0x54476d8d00000000, 0x6a2caf6200000000, 0x6997988900000000,
+    0x57fc5a6600000000, 0x2ee7868400000000, 0x108c446b00000000,
+    0x1337738000000000, 0x2d5cb16f00000000, 0x488614b900000000,
+    0x76edd65600000000, 0x7556e1bd00000000, 0x4b3d235200000000,
+    0x3226ffb000000000, 0x0c4d3d5f00000000, 0x0ff60ab400000000,
+    0x319dc85b00000000, 0xbcc6c3aa00000000, 0x82ad014500000000,
+    0x811636ae00000000, 0xbf7df44100000000, 0xc66628a300000000,
+    0xf80dea4c00000000, 0xfbb6dda700000000, 0xc5dd1f4800000000,
+    0x7004e7d100000000, 0x4e6f253e00000000, 0x4dd412d500000000,
+    0x73bfd03a00000000, 0x0aa40cd800000000, 0x34cfce3700000000,
+    0x3774f9dc00000000, 0x091f3b3300000000, 0x844430c200000000,
+    0xba2ff22d00000000, 0xb994c5c600000000, 0x87ff072900000000,
+    0xfee4dbcb00000000, 0xc08f192400000000, 0xc3342ecf00000000,
+    0xfd5fec2000000000, 0x988549f600000000, 0xa6ee8b1900000000,
+    0xa555bcf200000000, 0x9b3e7e1d00000000, 0xe225a2ff00000000,
+    0xdc4e601000000000, 0xdff557fb00000000, 0xe19e951400000000,
+    0x6cc59ee500000000, 0x52ae5c0a00000000, 0x51156be100000000,
+    0x6f7ea90e00000000, 0x166575ec00000000, 0x280eb70300000000,
+    0x2bb580e800000000, 0x15de420700000000, 0x010905e600000000,
+    0x3f62c70900000000, 0x3cd9f0e200000000, 0x02b2320d00000000,
+    0x7ba9eeef00000000, 0x45c22c0000000000, 0x46791beb00000000,
+    0x7812d90400000000, 0xf549d2f500000000, 0xcb22101a00000000,
+    0xc89927f100000000, 0xf6f2e51e00000000, 0x8fe939fc00000000,
+    0xb182fb1300000000, 0xb239ccf800000000, 0x8c520e1700000000,
+    0xe988abc100000000, 0xd7e3692e00000000, 0xd4585ec500000000,
+    0xea339c2a00000000, 0x932840c800000000, 0xad43822700000000,
+    0xaef8b5cc00000000, 0x9093772300000000, 0x1dc87cd200000000,
+    0x23a3be3d00000000, 0x201889d600000000, 0x1e734b3900000000,
+    0x676897db00000000, 0x5903553400000000, 0x5ab862df00000000,
+    0x64d3a03000000000, 0xd10a58a900000000, 0xef619a4600000000,
+    0xecdaadad00000000, 0xd2b16f4200000000, 0xabaab3a000000000,
+    0x95c1714f00000000, 0x967a46a400000000, 0xa811844b00000000,
+    0x254a8fba00000000, 0x1b214d5500000000, 0x189a7abe00000000,
+    0x26f1b85100000000, 0x5fea64b300000000, 0x6181a65c00000000,
+    0x623a91b700000000, 0x5c51535800000000, 0x398bf68e00000000,
+    0x07e0346100000000, 0x045b038a00000000, 0x3a30c16500000000,
+    0x432b1d8700000000, 0x7d40df6800000000, 0x7efbe88300000000,
+    0x40902a6c00000000, 0xcdcb219d00000000, 0xf3a0e37200000000,
+    0xf01bd49900000000, 0xce70167600000000, 0xb76bca9400000000,
+    0x8900087b00000000, 0x8abb3f9000000000, 0xb4d0fd7f00000000,
+    0xa10ebf7800000000, 0x9f657d9700000000, 0x9cde4a7c00000000,
+    0xa2b5889300000000, 0xdbae547100000000, 0xe5c5969e00000000,
+    0xe67ea17500000000, 0xd815639a00000000, 0x554e686b00000000,
+    0x6b25aa8400000000, 0x689e9d6f00000000, 0x56f55f8000000000,
+    0x2fee836200000000, 0x1185418d00000000, 0x123e766600000000,
+    0x2c55b48900000000, 0x498f115f00000000, 0x77e4d3b000000000,
+    0x745fe45b00000000, 0x4a3426b400000000, 0x332ffa5600000000,
+    0x0d4438b900000000, 0x0eff0f5200000000, 0x3094cdbd00000000,
+    0xbdcfc64c00000000, 0x83a404a300000000, 0x801f334800000000,
+    0xbe74f1a700000000, 0xc76f2d4500000000, 0xf904efaa00000000,
+    0xfabfd84100000000, 0xc4d41aae00000000, 0x710de23700000000,
+    0x4f6620d800000000, 0x4cdd173300000000, 0x72b6d5dc00000000,
+    0x0bad093e00000000, 0x35c6cbd100000000, 0x367dfc3a00000000,
+    0x08163ed500000000, 0x854d352400000000, 0xbb26f7cb00000000,
+    0xb89dc02000000000, 0x86f602cf00000000, 0xffedde2d00000000,
+    0xc1861cc200000000, 0xc23d2b2900000000, 0xfc56e9c600000000,
+    0x998c4c1000000000, 0xa7e78eff00000000, 0xa45cb91400000000,
+    0x9a377bfb00000000, 0xe32ca71900000000, 0xdd4765f600000000,
+    0xdefc521d00000000, 0xe09790f200000000, 0x6dcc9b0300000000,
+    0x53a759ec00000000, 0x501c6e0700000000, 0x6e77ace800000000,
+    0x176c700a00000000, 0x2907b2e500000000, 0x2abc850e00000000,
+    0x14d747e100000000},
+   {0x0000000000000000, 0xc0df8ec100000000, 0xc1b96c5800000000,
+    0x0166e29900000000, 0x8273d9b000000000, 0x42ac577100000000,
+    0x43cab5e800000000, 0x83153b2900000000, 0x45e1c3ba00000000,
+    0x853e4d7b00000000, 0x8458afe200000000, 0x4487212300000000,
+    0xc7921a0a00000000, 0x074d94cb00000000, 0x062b765200000000,
+    0xc6f4f89300000000, 0xcbc4f6ae00000000, 0x0b1b786f00000000,
+    0x0a7d9af600000000, 0xcaa2143700000000, 0x49b72f1e00000000,
+    0x8968a1df00000000, 0x880e434600000000, 0x48d1cd8700000000,
+    0x8e25351400000000, 0x4efabbd500000000, 0x4f9c594c00000000,
+    0x8f43d78d00000000, 0x0c56eca400000000, 0xcc89626500000000,
+    0xcdef80fc00000000, 0x0d300e3d00000000, 0xd78f9c8600000000,
+    0x1750124700000000, 0x1636f0de00000000, 0xd6e97e1f00000000,
+    0x55fc453600000000, 0x9523cbf700000000, 0x9445296e00000000,
+    0x549aa7af00000000, 0x926e5f3c00000000, 0x52b1d1fd00000000,
+    0x53d7336400000000, 0x9308bda500000000, 0x101d868c00000000,
+    0xd0c2084d00000000, 0xd1a4ead400000000, 0x117b641500000000,
+    0x1c4b6a2800000000, 0xdc94e4e900000000, 0xddf2067000000000,
+    0x1d2d88b100000000, 0x9e38b39800000000, 0x5ee73d5900000000,
+    0x5f81dfc000000000, 0x9f5e510100000000, 0x59aaa99200000000,
+    0x9975275300000000, 0x9813c5ca00000000, 0x58cc4b0b00000000,
+    0xdbd9702200000000, 0x1b06fee300000000, 0x1a601c7a00000000,
+    0xdabf92bb00000000, 0xef1948d600000000, 0x2fc6c61700000000,
+    0x2ea0248e00000000, 0xee7faa4f00000000, 0x6d6a916600000000,
+    0xadb51fa700000000, 0xacd3fd3e00000000, 0x6c0c73ff00000000,
+    0xaaf88b6c00000000, 0x6a2705ad00000000, 0x6b41e73400000000,
+    0xab9e69f500000000, 0x288b52dc00000000, 0xe854dc1d00000000,
+    0xe9323e8400000000, 0x29edb04500000000, 0x24ddbe7800000000,
+    0xe40230b900000000, 0xe564d22000000000, 0x25bb5ce100000000,
+    0xa6ae67c800000000, 0x6671e90900000000, 0x67170b9000000000,
+    0xa7c8855100000000, 0x613c7dc200000000, 0xa1e3f30300000000,
+    0xa085119a00000000, 0x605a9f5b00000000, 0xe34fa47200000000,
+    0x23902ab300000000, 0x22f6c82a00000000, 0xe22946eb00000000,
+    0x3896d45000000000, 0xf8495a9100000000, 0xf92fb80800000000,
+    0x39f036c900000000, 0xbae50de000000000, 0x7a3a832100000000,
+    0x7b5c61b800000000, 0xbb83ef7900000000, 0x7d7717ea00000000,
+    0xbda8992b00000000, 0xbcce7bb200000000, 0x7c11f57300000000,
+    0xff04ce5a00000000, 0x3fdb409b00000000, 0x3ebda20200000000,
+    0xfe622cc300000000, 0xf35222fe00000000, 0x338dac3f00000000,
+    0x32eb4ea600000000, 0xf234c06700000000, 0x7121fb4e00000000,
+    0xb1fe758f00000000, 0xb098971600000000, 0x704719d700000000,
+    0xb6b3e14400000000, 0x766c6f8500000000, 0x770a8d1c00000000,
+    0xb7d503dd00000000, 0x34c038f400000000, 0xf41fb63500000000,
+    0xf57954ac00000000, 0x35a6da6d00000000, 0x9f35e17700000000,
+    0x5fea6fb600000000, 0x5e8c8d2f00000000, 0x9e5303ee00000000,
+    0x1d4638c700000000, 0xdd99b60600000000, 0xdcff549f00000000,
+    0x1c20da5e00000000, 0xdad422cd00000000, 0x1a0bac0c00000000,
+    0x1b6d4e9500000000, 0xdbb2c05400000000, 0x58a7fb7d00000000,
+    0x987875bc00000000, 0x991e972500000000, 0x59c119e400000000,
+    0x54f117d900000000, 0x942e991800000000, 0x95487b8100000000,
+    0x5597f54000000000, 0xd682ce6900000000, 0x165d40a800000000,
+    0x173ba23100000000, 0xd7e42cf000000000, 0x1110d46300000000,
+    0xd1cf5aa200000000, 0xd0a9b83b00000000, 0x107636fa00000000,
+    0x93630dd300000000, 0x53bc831200000000, 0x52da618b00000000,
+    0x9205ef4a00000000, 0x48ba7df100000000, 0x8865f33000000000,
+    0x890311a900000000, 0x49dc9f6800000000, 0xcac9a44100000000,
+    0x0a162a8000000000, 0x0b70c81900000000, 0xcbaf46d800000000,
+    0x0d5bbe4b00000000, 0xcd84308a00000000, 0xcce2d21300000000,
+    0x0c3d5cd200000000, 0x8f2867fb00000000, 0x4ff7e93a00000000,
+    0x4e910ba300000000, 0x8e4e856200000000, 0x837e8b5f00000000,
+    0x43a1059e00000000, 0x42c7e70700000000, 0x821869c600000000,
+    0x010d52ef00000000, 0xc1d2dc2e00000000, 0xc0b43eb700000000,
+    0x006bb07600000000, 0xc69f48e500000000, 0x0640c62400000000,
+    0x072624bd00000000, 0xc7f9aa7c00000000, 0x44ec915500000000,
+    0x84331f9400000000, 0x8555fd0d00000000, 0x458a73cc00000000,
+    0x702ca9a100000000, 0xb0f3276000000000, 0xb195c5f900000000,
+    0x714a4b3800000000, 0xf25f701100000000, 0x3280fed000000000,
+    0x33e61c4900000000, 0xf339928800000000, 0x35cd6a1b00000000,
+    0xf512e4da00000000, 0xf474064300000000, 0x34ab888200000000,
+    0xb7beb3ab00000000, 0x77613d6a00000000, 0x7607dff300000000,
+    0xb6d8513200000000, 0xbbe85f0f00000000, 0x7b37d1ce00000000,
+    0x7a51335700000000, 0xba8ebd9600000000, 0x399b86bf00000000,
+    0xf944087e00000000, 0xf822eae700000000, 0x38fd642600000000,
+    0xfe099cb500000000, 0x3ed6127400000000, 0x3fb0f0ed00000000,
+    0xff6f7e2c00000000, 0x7c7a450500000000, 0xbca5cbc400000000,
+    0xbdc3295d00000000, 0x7d1ca79c00000000, 0xa7a3352700000000,
+    0x677cbbe600000000, 0x661a597f00000000, 0xa6c5d7be00000000,
+    0x25d0ec9700000000, 0xe50f625600000000, 0xe46980cf00000000,
+    0x24b60e0e00000000, 0xe242f69d00000000, 0x229d785c00000000,
+    0x23fb9ac500000000, 0xe324140400000000, 0x60312f2d00000000,
+    0xa0eea1ec00000000, 0xa188437500000000, 0x6157cdb400000000,
+    0x6c67c38900000000, 0xacb84d4800000000, 0xaddeafd100000000,
+    0x6d01211000000000, 0xee141a3900000000, 0x2ecb94f800000000,
+    0x2fad766100000000, 0xef72f8a000000000, 0x2986003300000000,
+    0xe9598ef200000000, 0xe83f6c6b00000000, 0x28e0e2aa00000000,
+    0xabf5d98300000000, 0x6b2a574200000000, 0x6a4cb5db00000000,
+    0xaa933b1a00000000},
+   {0x0000000000000000, 0x6f4ca59b00000000, 0x9f9e3bec00000000,
+    0xf0d29e7700000000, 0x7f3b060300000000, 0x1077a39800000000,
+    0xe0a53def00000000, 0x8fe9987400000000, 0xfe760c0600000000,
+    0x913aa99d00000000, 0x61e837ea00000000, 0x0ea4927100000000,
+    0x814d0a0500000000, 0xee01af9e00000000, 0x1ed331e900000000,
+    0x719f947200000000, 0xfced180c00000000, 0x93a1bd9700000000,
+    0x637323e000000000, 0x0c3f867b00000000, 0x83d61e0f00000000,
+    0xec9abb9400000000, 0x1c4825e300000000, 0x7304807800000000,
+    0x029b140a00000000, 0x6dd7b19100000000, 0x9d052fe600000000,
+    0xf2498a7d00000000, 0x7da0120900000000, 0x12ecb79200000000,
+    0xe23e29e500000000, 0x8d728c7e00000000, 0xf8db311800000000,
+    0x9797948300000000, 0x67450af400000000, 0x0809af6f00000000,
+    0x87e0371b00000000, 0xe8ac928000000000, 0x187e0cf700000000,
+    0x7732a96c00000000, 0x06ad3d1e00000000, 0x69e1988500000000,
+    0x993306f200000000, 0xf67fa36900000000, 0x79963b1d00000000,
+    0x16da9e8600000000, 0xe60800f100000000, 0x8944a56a00000000,
+    0x0436291400000000, 0x6b7a8c8f00000000, 0x9ba812f800000000,
+    0xf4e4b76300000000, 0x7b0d2f1700000000, 0x14418a8c00000000,
+    0xe49314fb00000000, 0x8bdfb16000000000, 0xfa40251200000000,
+    0x950c808900000000, 0x65de1efe00000000, 0x0a92bb6500000000,
+    0x857b231100000000, 0xea37868a00000000, 0x1ae518fd00000000,
+    0x75a9bd6600000000, 0xf0b7633000000000, 0x9ffbc6ab00000000,
+    0x6f2958dc00000000, 0x0065fd4700000000, 0x8f8c653300000000,
+    0xe0c0c0a800000000, 0x10125edf00000000, 0x7f5efb4400000000,
+    0x0ec16f3600000000, 0x618dcaad00000000, 0x915f54da00000000,
+    0xfe13f14100000000, 0x71fa693500000000, 0x1eb6ccae00000000,
+    0xee6452d900000000, 0x8128f74200000000, 0x0c5a7b3c00000000,
+    0x6316dea700000000, 0x93c440d000000000, 0xfc88e54b00000000,
+    0x73617d3f00000000, 0x1c2dd8a400000000, 0xecff46d300000000,
+    0x83b3e34800000000, 0xf22c773a00000000, 0x9d60d2a100000000,
+    0x6db24cd600000000, 0x02fee94d00000000, 0x8d17713900000000,
+    0xe25bd4a200000000, 0x12894ad500000000, 0x7dc5ef4e00000000,
+    0x086c522800000000, 0x6720f7b300000000, 0x97f269c400000000,
+    0xf8becc5f00000000, 0x7757542b00000000, 0x181bf1b000000000,
+    0xe8c96fc700000000, 0x8785ca5c00000000, 0xf61a5e2e00000000,
+    0x9956fbb500000000, 0x698465c200000000, 0x06c8c05900000000,
+    0x8921582d00000000, 0xe66dfdb600000000, 0x16bf63c100000000,
+    0x79f3c65a00000000, 0xf4814a2400000000, 0x9bcdefbf00000000,
+    0x6b1f71c800000000, 0x0453d45300000000, 0x8bba4c2700000000,
+    0xe4f6e9bc00000000, 0x142477cb00000000, 0x7b68d25000000000,
+    0x0af7462200000000, 0x65bbe3b900000000, 0x95697dce00000000,
+    0xfa25d85500000000, 0x75cc402100000000, 0x1a80e5ba00000000,
+    0xea527bcd00000000, 0x851ede5600000000, 0xe06fc76000000000,
+    0x8f2362fb00000000, 0x7ff1fc8c00000000, 0x10bd591700000000,
+    0x9f54c16300000000, 0xf01864f800000000, 0x00cafa8f00000000,
+    0x6f865f1400000000, 0x1e19cb6600000000, 0x71556efd00000000,
+    0x8187f08a00000000, 0xeecb551100000000, 0x6122cd6500000000,
+    0x0e6e68fe00000000, 0xfebcf68900000000, 0x91f0531200000000,
+    0x1c82df6c00000000, 0x73ce7af700000000, 0x831ce48000000000,
+    0xec50411b00000000, 0x63b9d96f00000000, 0x0cf57cf400000000,
+    0xfc27e28300000000, 0x936b471800000000, 0xe2f4d36a00000000,
+    0x8db876f100000000, 0x7d6ae88600000000, 0x12264d1d00000000,
+    0x9dcfd56900000000, 0xf28370f200000000, 0x0251ee8500000000,
+    0x6d1d4b1e00000000, 0x18b4f67800000000, 0x77f853e300000000,
+    0x872acd9400000000, 0xe866680f00000000, 0x678ff07b00000000,
+    0x08c355e000000000, 0xf811cb9700000000, 0x975d6e0c00000000,
+    0xe6c2fa7e00000000, 0x898e5fe500000000, 0x795cc19200000000,
+    0x1610640900000000, 0x99f9fc7d00000000, 0xf6b559e600000000,
+    0x0667c79100000000, 0x692b620a00000000, 0xe459ee7400000000,
+    0x8b154bef00000000, 0x7bc7d59800000000, 0x148b700300000000,
+    0x9b62e87700000000, 0xf42e4dec00000000, 0x04fcd39b00000000,
+    0x6bb0760000000000, 0x1a2fe27200000000, 0x756347e900000000,
+    0x85b1d99e00000000, 0xeafd7c0500000000, 0x6514e47100000000,
+    0x0a5841ea00000000, 0xfa8adf9d00000000, 0x95c67a0600000000,
+    0x10d8a45000000000, 0x7f9401cb00000000, 0x8f469fbc00000000,
+    0xe00a3a2700000000, 0x6fe3a25300000000, 0x00af07c800000000,
+    0xf07d99bf00000000, 0x9f313c2400000000, 0xeeaea85600000000,
+    0x81e20dcd00000000, 0x713093ba00000000, 0x1e7c362100000000,
+    0x9195ae5500000000, 0xfed90bce00000000, 0x0e0b95b900000000,
+    0x6147302200000000, 0xec35bc5c00000000, 0x837919c700000000,
+    0x73ab87b000000000, 0x1ce7222b00000000, 0x930eba5f00000000,
+    0xfc421fc400000000, 0x0c9081b300000000, 0x63dc242800000000,
+    0x1243b05a00000000, 0x7d0f15c100000000, 0x8ddd8bb600000000,
+    0xe2912e2d00000000, 0x6d78b65900000000, 0x023413c200000000,
+    0xf2e68db500000000, 0x9daa282e00000000, 0xe803954800000000,
+    0x874f30d300000000, 0x779daea400000000, 0x18d10b3f00000000,
+    0x9738934b00000000, 0xf87436d000000000, 0x08a6a8a700000000,
+    0x67ea0d3c00000000, 0x1675994e00000000, 0x79393cd500000000,
+    0x89eba2a200000000, 0xe6a7073900000000, 0x694e9f4d00000000,
+    0x06023ad600000000, 0xf6d0a4a100000000, 0x999c013a00000000,
+    0x14ee8d4400000000, 0x7ba228df00000000, 0x8b70b6a800000000,
+    0xe43c133300000000, 0x6bd58b4700000000, 0x04992edc00000000,
+    0xf44bb0ab00000000, 0x9b07153000000000, 0xea98814200000000,
+    0x85d424d900000000, 0x7506baae00000000, 0x1a4a1f3500000000,
+    0x95a3874100000000, 0xfaef22da00000000, 0x0a3dbcad00000000,
+    0x6571193600000000},
+   {0x0000000000000000, 0x85d996dd00000000, 0x4bb55c6000000000,
+    0xce6ccabd00000000, 0x966ab9c000000000, 0x13b32f1d00000000,
+    0xdddfe5a000000000, 0x5806737d00000000, 0x6dd3035a00000000,
+    0xe80a958700000000, 0x26665f3a00000000, 0xa3bfc9e700000000,
+    0xfbb9ba9a00000000, 0x7e602c4700000000, 0xb00ce6fa00000000,
+    0x35d5702700000000, 0xdaa607b400000000, 0x5f7f916900000000,
+    0x91135bd400000000, 0x14cacd0900000000, 0x4cccbe7400000000,
+    0xc91528a900000000, 0x0779e21400000000, 0x82a074c900000000,
+    0xb77504ee00000000, 0x32ac923300000000, 0xfcc0588e00000000,
+    0x7919ce5300000000, 0x211fbd2e00000000, 0xa4c62bf300000000,
+    0x6aaae14e00000000, 0xef73779300000000, 0xf54b7eb300000000,
+    0x7092e86e00000000, 0xbefe22d300000000, 0x3b27b40e00000000,
+    0x6321c77300000000, 0xe6f851ae00000000, 0x28949b1300000000,
+    0xad4d0dce00000000, 0x98987de900000000, 0x1d41eb3400000000,
+    0xd32d218900000000, 0x56f4b75400000000, 0x0ef2c42900000000,
+    0x8b2b52f400000000, 0x4547984900000000, 0xc09e0e9400000000,
+    0x2fed790700000000, 0xaa34efda00000000, 0x6458256700000000,
+    0xe181b3ba00000000, 0xb987c0c700000000, 0x3c5e561a00000000,
+    0xf2329ca700000000, 0x77eb0a7a00000000, 0x423e7a5d00000000,
+    0xc7e7ec8000000000, 0x098b263d00000000, 0x8c52b0e000000000,
+    0xd454c39d00000000, 0x518d554000000000, 0x9fe19ffd00000000,
+    0x1a38092000000000, 0xab918dbd00000000, 0x2e481b6000000000,
+    0xe024d1dd00000000, 0x65fd470000000000, 0x3dfb347d00000000,
+    0xb822a2a000000000, 0x764e681d00000000, 0xf397fec000000000,
+    0xc6428ee700000000, 0x439b183a00000000, 0x8df7d28700000000,
+    0x082e445a00000000, 0x5028372700000000, 0xd5f1a1fa00000000,
+    0x1b9d6b4700000000, 0x9e44fd9a00000000, 0x71378a0900000000,
+    0xf4ee1cd400000000, 0x3a82d66900000000, 0xbf5b40b400000000,
+    0xe75d33c900000000, 0x6284a51400000000, 0xace86fa900000000,
+    0x2931f97400000000, 0x1ce4895300000000, 0x993d1f8e00000000,
+    0x5751d53300000000, 0xd28843ee00000000, 0x8a8e309300000000,
+    0x0f57a64e00000000, 0xc13b6cf300000000, 0x44e2fa2e00000000,
+    0x5edaf30e00000000, 0xdb0365d300000000, 0x156faf6e00000000,
+    0x90b639b300000000, 0xc8b04ace00000000, 0x4d69dc1300000000,
+    0x830516ae00000000, 0x06dc807300000000, 0x3309f05400000000,
+    0xb6d0668900000000, 0x78bcac3400000000, 0xfd653ae900000000,
+    0xa563499400000000, 0x20badf4900000000, 0xeed615f400000000,
+    0x6b0f832900000000, 0x847cf4ba00000000, 0x01a5626700000000,
+    0xcfc9a8da00000000, 0x4a103e0700000000, 0x12164d7a00000000,
+    0x97cfdba700000000, 0x59a3111a00000000, 0xdc7a87c700000000,
+    0xe9aff7e000000000, 0x6c76613d00000000, 0xa21aab8000000000,
+    0x27c33d5d00000000, 0x7fc54e2000000000, 0xfa1cd8fd00000000,
+    0x3470124000000000, 0xb1a9849d00000000, 0x17256aa000000000,
+    0x92fcfc7d00000000, 0x5c9036c000000000, 0xd949a01d00000000,
+    0x814fd36000000000, 0x049645bd00000000, 0xcafa8f0000000000,
+    0x4f2319dd00000000, 0x7af669fa00000000, 0xff2fff2700000000,
+    0x3143359a00000000, 0xb49aa34700000000, 0xec9cd03a00000000,
+    0x694546e700000000, 0xa7298c5a00000000, 0x22f01a8700000000,
+    0xcd836d1400000000, 0x485afbc900000000, 0x8636317400000000,
+    0x03efa7a900000000, 0x5be9d4d400000000, 0xde30420900000000,
+    0x105c88b400000000, 0x95851e6900000000, 0xa0506e4e00000000,
+    0x2589f89300000000, 0xebe5322e00000000, 0x6e3ca4f300000000,
+    0x363ad78e00000000, 0xb3e3415300000000, 0x7d8f8bee00000000,
+    0xf8561d3300000000, 0xe26e141300000000, 0x67b782ce00000000,
+    0xa9db487300000000, 0x2c02deae00000000, 0x7404add300000000,
+    0xf1dd3b0e00000000, 0x3fb1f1b300000000, 0xba68676e00000000,
+    0x8fbd174900000000, 0x0a64819400000000, 0xc4084b2900000000,
+    0x41d1ddf400000000, 0x19d7ae8900000000, 0x9c0e385400000000,
+    0x5262f2e900000000, 0xd7bb643400000000, 0x38c813a700000000,
+    0xbd11857a00000000, 0x737d4fc700000000, 0xf6a4d91a00000000,
+    0xaea2aa6700000000, 0x2b7b3cba00000000, 0xe517f60700000000,
+    0x60ce60da00000000, 0x551b10fd00000000, 0xd0c2862000000000,
+    0x1eae4c9d00000000, 0x9b77da4000000000, 0xc371a93d00000000,
+    0x46a83fe000000000, 0x88c4f55d00000000, 0x0d1d638000000000,
+    0xbcb4e71d00000000, 0x396d71c000000000, 0xf701bb7d00000000,
+    0x72d82da000000000, 0x2ade5edd00000000, 0xaf07c80000000000,
+    0x616b02bd00000000, 0xe4b2946000000000, 0xd167e44700000000,
+    0x54be729a00000000, 0x9ad2b82700000000, 0x1f0b2efa00000000,
+    0x470d5d8700000000, 0xc2d4cb5a00000000, 0x0cb801e700000000,
+    0x8961973a00000000, 0x6612e0a900000000, 0xe3cb767400000000,
+    0x2da7bcc900000000, 0xa87e2a1400000000, 0xf078596900000000,
+    0x75a1cfb400000000, 0xbbcd050900000000, 0x3e1493d400000000,
+    0x0bc1e3f300000000, 0x8e18752e00000000, 0x4074bf9300000000,
+    0xc5ad294e00000000, 0x9dab5a3300000000, 0x1872ccee00000000,
+    0xd61e065300000000, 0x53c7908e00000000, 0x49ff99ae00000000,
+    0xcc260f7300000000, 0x024ac5ce00000000, 0x8793531300000000,
+    0xdf95206e00000000, 0x5a4cb6b300000000, 0x94207c0e00000000,
+    0x11f9ead300000000, 0x242c9af400000000, 0xa1f50c2900000000,
+    0x6f99c69400000000, 0xea40504900000000, 0xb246233400000000,
+    0x379fb5e900000000, 0xf9f37f5400000000, 0x7c2ae98900000000,
+    0x93599e1a00000000, 0x168008c700000000, 0xd8ecc27a00000000,
+    0x5d3554a700000000, 0x053327da00000000, 0x80eab10700000000,
+    0x4e867bba00000000, 0xcb5fed6700000000, 0xfe8a9d4000000000,
+    0x7b530b9d00000000, 0xb53fc12000000000, 0x30e657fd00000000,
+    0x68e0248000000000, 0xed39b25d00000000, 0x235578e000000000,
+    0xa68cee3d00000000},
+   {0x0000000000000000, 0x76e10f9d00000000, 0xadc46ee100000000,
+    0xdb25617c00000000, 0x1b8fac1900000000, 0x6d6ea38400000000,
+    0xb64bc2f800000000, 0xc0aacd6500000000, 0x361e593300000000,
+    0x40ff56ae00000000, 0x9bda37d200000000, 0xed3b384f00000000,
+    0x2d91f52a00000000, 0x5b70fab700000000, 0x80559bcb00000000,
+    0xf6b4945600000000, 0x6c3cb26600000000, 0x1addbdfb00000000,
+    0xc1f8dc8700000000, 0xb719d31a00000000, 0x77b31e7f00000000,
+    0x015211e200000000, 0xda77709e00000000, 0xac967f0300000000,
+    0x5a22eb5500000000, 0x2cc3e4c800000000, 0xf7e685b400000000,
+    0x81078a2900000000, 0x41ad474c00000000, 0x374c48d100000000,
+    0xec6929ad00000000, 0x9a88263000000000, 0xd87864cd00000000,
+    0xae996b5000000000, 0x75bc0a2c00000000, 0x035d05b100000000,
+    0xc3f7c8d400000000, 0xb516c74900000000, 0x6e33a63500000000,
+    0x18d2a9a800000000, 0xee663dfe00000000, 0x9887326300000000,
+    0x43a2531f00000000, 0x35435c8200000000, 0xf5e991e700000000,
+    0x83089e7a00000000, 0x582dff0600000000, 0x2eccf09b00000000,
+    0xb444d6ab00000000, 0xc2a5d93600000000, 0x1980b84a00000000,
+    0x6f61b7d700000000, 0xafcb7ab200000000, 0xd92a752f00000000,
+    0x020f145300000000, 0x74ee1bce00000000, 0x825a8f9800000000,
+    0xf4bb800500000000, 0x2f9ee17900000000, 0x597feee400000000,
+    0x99d5238100000000, 0xef342c1c00000000, 0x34114d6000000000,
+    0x42f042fd00000000, 0xf1f7b94100000000, 0x8716b6dc00000000,
+    0x5c33d7a000000000, 0x2ad2d83d00000000, 0xea78155800000000,
+    0x9c991ac500000000, 0x47bc7bb900000000, 0x315d742400000000,
+    0xc7e9e07200000000, 0xb108efef00000000, 0x6a2d8e9300000000,
+    0x1ccc810e00000000, 0xdc664c6b00000000, 0xaa8743f600000000,
+    0x71a2228a00000000, 0x07432d1700000000, 0x9dcb0b2700000000,
+    0xeb2a04ba00000000, 0x300f65c600000000, 0x46ee6a5b00000000,
+    0x8644a73e00000000, 0xf0a5a8a300000000, 0x2b80c9df00000000,
+    0x5d61c64200000000, 0xabd5521400000000, 0xdd345d8900000000,
+    0x06113cf500000000, 0x70f0336800000000, 0xb05afe0d00000000,
+    0xc6bbf19000000000, 0x1d9e90ec00000000, 0x6b7f9f7100000000,
+    0x298fdd8c00000000, 0x5f6ed21100000000, 0x844bb36d00000000,
+    0xf2aabcf000000000, 0x3200719500000000, 0x44e17e0800000000,
+    0x9fc41f7400000000, 0xe92510e900000000, 0x1f9184bf00000000,
+    0x69708b2200000000, 0xb255ea5e00000000, 0xc4b4e5c300000000,
+    0x041e28a600000000, 0x72ff273b00000000, 0xa9da464700000000,
+    0xdf3b49da00000000, 0x45b36fea00000000, 0x3352607700000000,
+    0xe877010b00000000, 0x9e960e9600000000, 0x5e3cc3f300000000,
+    0x28ddcc6e00000000, 0xf3f8ad1200000000, 0x8519a28f00000000,
+    0x73ad36d900000000, 0x054c394400000000, 0xde69583800000000,
+    0xa88857a500000000, 0x68229ac000000000, 0x1ec3955d00000000,
+    0xc5e6f42100000000, 0xb307fbbc00000000, 0xe2ef738300000000,
+    0x940e7c1e00000000, 0x4f2b1d6200000000, 0x39ca12ff00000000,
+    0xf960df9a00000000, 0x8f81d00700000000, 0x54a4b17b00000000,
+    0x2245bee600000000, 0xd4f12ab000000000, 0xa210252d00000000,
+    0x7935445100000000, 0x0fd44bcc00000000, 0xcf7e86a900000000,
+    0xb99f893400000000, 0x62bae84800000000, 0x145be7d500000000,
+    0x8ed3c1e500000000, 0xf832ce7800000000, 0x2317af0400000000,
+    0x55f6a09900000000, 0x955c6dfc00000000, 0xe3bd626100000000,
+    0x3898031d00000000, 0x4e790c8000000000, 0xb8cd98d600000000,
+    0xce2c974b00000000, 0x1509f63700000000, 0x63e8f9aa00000000,
+    0xa34234cf00000000, 0xd5a33b5200000000, 0x0e865a2e00000000,
+    0x786755b300000000, 0x3a97174e00000000, 0x4c7618d300000000,
+    0x975379af00000000, 0xe1b2763200000000, 0x2118bb5700000000,
+    0x57f9b4ca00000000, 0x8cdcd5b600000000, 0xfa3dda2b00000000,
+    0x0c894e7d00000000, 0x7a6841e000000000, 0xa14d209c00000000,
+    0xd7ac2f0100000000, 0x1706e26400000000, 0x61e7edf900000000,
+    0xbac28c8500000000, 0xcc23831800000000, 0x56aba52800000000,
+    0x204aaab500000000, 0xfb6fcbc900000000, 0x8d8ec45400000000,
+    0x4d24093100000000, 0x3bc506ac00000000, 0xe0e067d000000000,
+    0x9601684d00000000, 0x60b5fc1b00000000, 0x1654f38600000000,
+    0xcd7192fa00000000, 0xbb909d6700000000, 0x7b3a500200000000,
+    0x0ddb5f9f00000000, 0xd6fe3ee300000000, 0xa01f317e00000000,
+    0x1318cac200000000, 0x65f9c55f00000000, 0xbedca42300000000,
+    0xc83dabbe00000000, 0x089766db00000000, 0x7e76694600000000,
+    0xa553083a00000000, 0xd3b207a700000000, 0x250693f100000000,
+    0x53e79c6c00000000, 0x88c2fd1000000000, 0xfe23f28d00000000,
+    0x3e893fe800000000, 0x4868307500000000, 0x934d510900000000,
+    0xe5ac5e9400000000, 0x7f2478a400000000, 0x09c5773900000000,
+    0xd2e0164500000000, 0xa40119d800000000, 0x64abd4bd00000000,
+    0x124adb2000000000, 0xc96fba5c00000000, 0xbf8eb5c100000000,
+    0x493a219700000000, 0x3fdb2e0a00000000, 0xe4fe4f7600000000,
+    0x921f40eb00000000, 0x52b58d8e00000000, 0x2454821300000000,
+    0xff71e36f00000000, 0x8990ecf200000000, 0xcb60ae0f00000000,
+    0xbd81a19200000000, 0x66a4c0ee00000000, 0x1045cf7300000000,
+    0xd0ef021600000000, 0xa60e0d8b00000000, 0x7d2b6cf700000000,
+    0x0bca636a00000000, 0xfd7ef73c00000000, 0x8b9ff8a100000000,
+    0x50ba99dd00000000, 0x265b964000000000, 0xe6f15b2500000000,
+    0x901054b800000000, 0x4b3535c400000000, 0x3dd43a5900000000,
+    0xa75c1c6900000000, 0xd1bd13f400000000, 0x0a98728800000000,
+    0x7c797d1500000000, 0xbcd3b07000000000, 0xca32bfed00000000,
+    0x1117de9100000000, 0x67f6d10c00000000, 0x9142455a00000000,
+    0xe7a34ac700000000, 0x3c862bbb00000000, 0x4a67242600000000,
+    0x8acde94300000000, 0xfc2ce6de00000000, 0x270987a200000000,
+    0x51e8883f00000000},
+   {0x0000000000000000, 0xe8dbfbb900000000, 0x91b186a800000000,
+    0x796a7d1100000000, 0x63657c8a00000000, 0x8bbe873300000000,
+    0xf2d4fa2200000000, 0x1a0f019b00000000, 0x87cc89cf00000000,
+    0x6f17727600000000, 0x167d0f6700000000, 0xfea6f4de00000000,
+    0xe4a9f54500000000, 0x0c720efc00000000, 0x751873ed00000000,
+    0x9dc3885400000000, 0x4f9f624400000000, 0xa74499fd00000000,
+    0xde2ee4ec00000000, 0x36f51f5500000000, 0x2cfa1ece00000000,
+    0xc421e57700000000, 0xbd4b986600000000, 0x559063df00000000,
+    0xc853eb8b00000000, 0x2088103200000000, 0x59e26d2300000000,
+    0xb139969a00000000, 0xab36970100000000, 0x43ed6cb800000000,
+    0x3a8711a900000000, 0xd25cea1000000000, 0x9e3ec58800000000,
+    0x76e53e3100000000, 0x0f8f432000000000, 0xe754b89900000000,
+    0xfd5bb90200000000, 0x158042bb00000000, 0x6cea3faa00000000,
+    0x8431c41300000000, 0x19f24c4700000000, 0xf129b7fe00000000,
+    0x8843caef00000000, 0x6098315600000000, 0x7a9730cd00000000,
+    0x924ccb7400000000, 0xeb26b66500000000, 0x03fd4ddc00000000,
+    0xd1a1a7cc00000000, 0x397a5c7500000000, 0x4010216400000000,
+    0xa8cbdadd00000000, 0xb2c4db4600000000, 0x5a1f20ff00000000,
+    0x23755dee00000000, 0xcbaea65700000000, 0x566d2e0300000000,
+    0xbeb6d5ba00000000, 0xc7dca8ab00000000, 0x2f07531200000000,
+    0x3508528900000000, 0xddd3a93000000000, 0xa4b9d42100000000,
+    0x4c622f9800000000, 0x7d7bfbca00000000, 0x95a0007300000000,
+    0xecca7d6200000000, 0x041186db00000000, 0x1e1e874000000000,
+    0xf6c57cf900000000, 0x8faf01e800000000, 0x6774fa5100000000,
+    0xfab7720500000000, 0x126c89bc00000000, 0x6b06f4ad00000000,
+    0x83dd0f1400000000, 0x99d20e8f00000000, 0x7109f53600000000,
+    0x0863882700000000, 0xe0b8739e00000000, 0x32e4998e00000000,
+    0xda3f623700000000, 0xa3551f2600000000, 0x4b8ee49f00000000,
+    0x5181e50400000000, 0xb95a1ebd00000000, 0xc03063ac00000000,
+    0x28eb981500000000, 0xb528104100000000, 0x5df3ebf800000000,
+    0x249996e900000000, 0xcc426d5000000000, 0xd64d6ccb00000000,
+    0x3e96977200000000, 0x47fcea6300000000, 0xaf2711da00000000,
+    0xe3453e4200000000, 0x0b9ec5fb00000000, 0x72f4b8ea00000000,
+    0x9a2f435300000000, 0x802042c800000000, 0x68fbb97100000000,
+    0x1191c46000000000, 0xf94a3fd900000000, 0x6489b78d00000000,
+    0x8c524c3400000000, 0xf538312500000000, 0x1de3ca9c00000000,
+    0x07eccb0700000000, 0xef3730be00000000, 0x965d4daf00000000,
+    0x7e86b61600000000, 0xacda5c0600000000, 0x4401a7bf00000000,
+    0x3d6bdaae00000000, 0xd5b0211700000000, 0xcfbf208c00000000,
+    0x2764db3500000000, 0x5e0ea62400000000, 0xb6d55d9d00000000,
+    0x2b16d5c900000000, 0xc3cd2e7000000000, 0xbaa7536100000000,
+    0x527ca8d800000000, 0x4873a94300000000, 0xa0a852fa00000000,
+    0xd9c22feb00000000, 0x3119d45200000000, 0xbbf0874e00000000,
+    0x532b7cf700000000, 0x2a4101e600000000, 0xc29afa5f00000000,
+    0xd895fbc400000000, 0x304e007d00000000, 0x49247d6c00000000,
+    0xa1ff86d500000000, 0x3c3c0e8100000000, 0xd4e7f53800000000,
+    0xad8d882900000000, 0x4556739000000000, 0x5f59720b00000000,
+    0xb78289b200000000, 0xcee8f4a300000000, 0x26330f1a00000000,
+    0xf46fe50a00000000, 0x1cb41eb300000000, 0x65de63a200000000,
+    0x8d05981b00000000, 0x970a998000000000, 0x7fd1623900000000,
+    0x06bb1f2800000000, 0xee60e49100000000, 0x73a36cc500000000,
+    0x9b78977c00000000, 0xe212ea6d00000000, 0x0ac911d400000000,
+    0x10c6104f00000000, 0xf81debf600000000, 0x817796e700000000,
+    0x69ac6d5e00000000, 0x25ce42c600000000, 0xcd15b97f00000000,
+    0xb47fc46e00000000, 0x5ca43fd700000000, 0x46ab3e4c00000000,
+    0xae70c5f500000000, 0xd71ab8e400000000, 0x3fc1435d00000000,
+    0xa202cb0900000000, 0x4ad930b000000000, 0x33b34da100000000,
+    0xdb68b61800000000, 0xc167b78300000000, 0x29bc4c3a00000000,
+    0x50d6312b00000000, 0xb80dca9200000000, 0x6a51208200000000,
+    0x828adb3b00000000, 0xfbe0a62a00000000, 0x133b5d9300000000,
+    0x09345c0800000000, 0xe1efa7b100000000, 0x9885daa000000000,
+    0x705e211900000000, 0xed9da94d00000000, 0x054652f400000000,
+    0x7c2c2fe500000000, 0x94f7d45c00000000, 0x8ef8d5c700000000,
+    0x66232e7e00000000, 0x1f49536f00000000, 0xf792a8d600000000,
+    0xc68b7c8400000000, 0x2e50873d00000000, 0x573afa2c00000000,
+    0xbfe1019500000000, 0xa5ee000e00000000, 0x4d35fbb700000000,
+    0x345f86a600000000, 0xdc847d1f00000000, 0x4147f54b00000000,
+    0xa99c0ef200000000, 0xd0f673e300000000, 0x382d885a00000000,
+    0x222289c100000000, 0xcaf9727800000000, 0xb3930f6900000000,
+    0x5b48f4d000000000, 0x89141ec000000000, 0x61cfe57900000000,
+    0x18a5986800000000, 0xf07e63d100000000, 0xea71624a00000000,
+    0x02aa99f300000000, 0x7bc0e4e200000000, 0x931b1f5b00000000,
+    0x0ed8970f00000000, 0xe6036cb600000000, 0x9f6911a700000000,
+    0x77b2ea1e00000000, 0x6dbdeb8500000000, 0x8566103c00000000,
+    0xfc0c6d2d00000000, 0x14d7969400000000, 0x58b5b90c00000000,
+    0xb06e42b500000000, 0xc9043fa400000000, 0x21dfc41d00000000,
+    0x3bd0c58600000000, 0xd30b3e3f00000000, 0xaa61432e00000000,
+    0x42bab89700000000, 0xdf7930c300000000, 0x37a2cb7a00000000,
+    0x4ec8b66b00000000, 0xa6134dd200000000, 0xbc1c4c4900000000,
+    0x54c7b7f000000000, 0x2dadcae100000000, 0xc576315800000000,
+    0x172adb4800000000, 0xfff120f100000000, 0x869b5de000000000,
+    0x6e40a65900000000, 0x744fa7c200000000, 0x9c945c7b00000000,
+    0xe5fe216a00000000, 0x0d25dad300000000, 0x90e6528700000000,
+    0x783da93e00000000, 0x0157d42f00000000, 0xe98c2f9600000000,
+    0xf3832e0d00000000, 0x1b58d5b400000000, 0x6232a8a500000000,
+    0x8ae9531c00000000},
+   {0x0000000000000000, 0x919168ae00000000, 0x6325a08700000000,
+    0xf2b4c82900000000, 0x874c31d400000000, 0x16dd597a00000000,
+    0xe469915300000000, 0x75f8f9fd00000000, 0x4f9f137300000000,
+    0xde0e7bdd00000000, 0x2cbab3f400000000, 0xbd2bdb5a00000000,
+    0xc8d322a700000000, 0x59424a0900000000, 0xabf6822000000000,
+    0x3a67ea8e00000000, 0x9e3e27e600000000, 0x0faf4f4800000000,
+    0xfd1b876100000000, 0x6c8aefcf00000000, 0x1972163200000000,
+    0x88e37e9c00000000, 0x7a57b6b500000000, 0xebc6de1b00000000,
+    0xd1a1349500000000, 0x40305c3b00000000, 0xb284941200000000,
+    0x2315fcbc00000000, 0x56ed054100000000, 0xc77c6def00000000,
+    0x35c8a5c600000000, 0xa459cd6800000000, 0x7d7b3f1700000000,
+    0xecea57b900000000, 0x1e5e9f9000000000, 0x8fcff73e00000000,
+    0xfa370ec300000000, 0x6ba6666d00000000, 0x9912ae4400000000,
+    0x0883c6ea00000000, 0x32e42c6400000000, 0xa37544ca00000000,
+    0x51c18ce300000000, 0xc050e44d00000000, 0xb5a81db000000000,
+    0x2439751e00000000, 0xd68dbd3700000000, 0x471cd59900000000,
+    0xe34518f100000000, 0x72d4705f00000000, 0x8060b87600000000,
+    0x11f1d0d800000000, 0x6409292500000000, 0xf598418b00000000,
+    0x072c89a200000000, 0x96bde10c00000000, 0xacda0b8200000000,
+    0x3d4b632c00000000, 0xcfffab0500000000, 0x5e6ec3ab00000000,
+    0x2b963a5600000000, 0xba0752f800000000, 0x48b39ad100000000,
+    0xd922f27f00000000, 0xfaf67e2e00000000, 0x6b67168000000000,
+    0x99d3dea900000000, 0x0842b60700000000, 0x7dba4ffa00000000,
+    0xec2b275400000000, 0x1e9fef7d00000000, 0x8f0e87d300000000,
+    0xb5696d5d00000000, 0x24f805f300000000, 0xd64ccdda00000000,
+    0x47dda57400000000, 0x32255c8900000000, 0xa3b4342700000000,
+    0x5100fc0e00000000, 0xc09194a000000000, 0x64c859c800000000,
+    0xf559316600000000, 0x07edf94f00000000, 0x967c91e100000000,
+    0xe384681c00000000, 0x721500b200000000, 0x80a1c89b00000000,
+    0x1130a03500000000, 0x2b574abb00000000, 0xbac6221500000000,
+    0x4872ea3c00000000, 0xd9e3829200000000, 0xac1b7b6f00000000,
+    0x3d8a13c100000000, 0xcf3edbe800000000, 0x5eafb34600000000,
+    0x878d413900000000, 0x161c299700000000, 0xe4a8e1be00000000,
+    0x7539891000000000, 0x00c170ed00000000, 0x9150184300000000,
+    0x63e4d06a00000000, 0xf275b8c400000000, 0xc812524a00000000,
+    0x59833ae400000000, 0xab37f2cd00000000, 0x3aa69a6300000000,
+    0x4f5e639e00000000, 0xdecf0b3000000000, 0x2c7bc31900000000,
+    0xbdeaabb700000000, 0x19b366df00000000, 0x88220e7100000000,
+    0x7a96c65800000000, 0xeb07aef600000000, 0x9eff570b00000000,
+    0x0f6e3fa500000000, 0xfddaf78c00000000, 0x6c4b9f2200000000,
+    0x562c75ac00000000, 0xc7bd1d0200000000, 0x3509d52b00000000,
+    0xa498bd8500000000, 0xd160447800000000, 0x40f12cd600000000,
+    0xb245e4ff00000000, 0x23d48c5100000000, 0xf4edfd5c00000000,
+    0x657c95f200000000, 0x97c85ddb00000000, 0x0659357500000000,
+    0x73a1cc8800000000, 0xe230a42600000000, 0x10846c0f00000000,
+    0x811504a100000000, 0xbb72ee2f00000000, 0x2ae3868100000000,
+    0xd8574ea800000000, 0x49c6260600000000, 0x3c3edffb00000000,
+    0xadafb75500000000, 0x5f1b7f7c00000000, 0xce8a17d200000000,
+    0x6ad3daba00000000, 0xfb42b21400000000, 0x09f67a3d00000000,
+    0x9867129300000000, 0xed9feb6e00000000, 0x7c0e83c000000000,
+    0x8eba4be900000000, 0x1f2b234700000000, 0x254cc9c900000000,
+    0xb4dda16700000000, 0x4669694e00000000, 0xd7f801e000000000,
+    0xa200f81d00000000, 0x339190b300000000, 0xc125589a00000000,
+    0x50b4303400000000, 0x8996c24b00000000, 0x1807aae500000000,
+    0xeab362cc00000000, 0x7b220a6200000000, 0x0edaf39f00000000,
+    0x9f4b9b3100000000, 0x6dff531800000000, 0xfc6e3bb600000000,
+    0xc609d13800000000, 0x5798b99600000000, 0xa52c71bf00000000,
+    0x34bd191100000000, 0x4145e0ec00000000, 0xd0d4884200000000,
+    0x2260406b00000000, 0xb3f128c500000000, 0x17a8e5ad00000000,
+    0x86398d0300000000, 0x748d452a00000000, 0xe51c2d8400000000,
+    0x90e4d47900000000, 0x0175bcd700000000, 0xf3c174fe00000000,
+    0x62501c5000000000, 0x5837f6de00000000, 0xc9a69e7000000000,
+    0x3b12565900000000, 0xaa833ef700000000, 0xdf7bc70a00000000,
+    0x4eeaafa400000000, 0xbc5e678d00000000, 0x2dcf0f2300000000,
+    0x0e1b837200000000, 0x9f8aebdc00000000, 0x6d3e23f500000000,
+    0xfcaf4b5b00000000, 0x8957b2a600000000, 0x18c6da0800000000,
+    0xea72122100000000, 0x7be37a8f00000000, 0x4184900100000000,
+    0xd015f8af00000000, 0x22a1308600000000, 0xb330582800000000,
+    0xc6c8a1d500000000, 0x5759c97b00000000, 0xa5ed015200000000,
+    0x347c69fc00000000, 0x9025a49400000000, 0x01b4cc3a00000000,
+    0xf300041300000000, 0x62916cbd00000000, 0x1769954000000000,
+    0x86f8fdee00000000, 0x744c35c700000000, 0xe5dd5d6900000000,
+    0xdfbab7e700000000, 0x4e2bdf4900000000, 0xbc9f176000000000,
+    0x2d0e7fce00000000, 0x58f6863300000000, 0xc967ee9d00000000,
+    0x3bd326b400000000, 0xaa424e1a00000000, 0x7360bc6500000000,
+    0xe2f1d4cb00000000, 0x10451ce200000000, 0x81d4744c00000000,
+    0xf42c8db100000000, 0x65bde51f00000000, 0x97092d3600000000,
+    0x0698459800000000, 0x3cffaf1600000000, 0xad6ec7b800000000,
+    0x5fda0f9100000000, 0xce4b673f00000000, 0xbbb39ec200000000,
+    0x2a22f66c00000000, 0xd8963e4500000000, 0x490756eb00000000,
+    0xed5e9b8300000000, 0x7ccff32d00000000, 0x8e7b3b0400000000,
+    0x1fea53aa00000000, 0x6a12aa5700000000, 0xfb83c2f900000000,
+    0x09370ad000000000, 0x98a6627e00000000, 0xa2c188f000000000,
+    0x3350e05e00000000, 0xc1e4287700000000, 0x507540d900000000,
+    0x258db92400000000, 0xb41cd18a00000000, 0x46a819a300000000,
+    0xd739710d00000000}};
+
+#else /* W == 4 */
+
+static const uint32_t crc_braid_table[][256] = {
+   {0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, 0x844a0efa,
+    0x48e00e64, 0xc66f0987, 0x0ac50919, 0xd3e51bb5, 0x1f4f1b2b,
+    0x91c01cc8, 0x5d6a1c56, 0x57af154f, 0x9b0515d1, 0x158a1232,
+    0xd92012ac, 0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8,
+    0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832, 0xaf5e2a9e,
+    0x63f42a00, 0xed7b2de3, 0x21d12d7d, 0x2b142464, 0xe7be24fa,
+    0x69312319, 0xa59b2387, 0xf9766256, 0x35dc62c8, 0xbb53652b,
+    0x77f965b5, 0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f,
+    0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00, 0xaed97719,
+    0x62737787, 0xecfc7064, 0x205670fa, 0x85cd537d, 0x496753e3,
+    0xc7e85400, 0x0b42549e, 0x01875d87, 0xcd2d5d19, 0x43a25afa,
+    0x8f085a64, 0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b,
+    0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1, 0x299dc2ed,
+    0xe537c273, 0x6bb8c590, 0xa712c50e, 0xadd7cc17, 0x617dcc89,
+    0xeff2cb6a, 0x2358cbf4, 0xfa78d958, 0x36d2d9c6, 0xb85dde25,
+    0x74f7debb, 0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041,
+    0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425, 0xd16cfd3c,
+    0x1dc6fda2, 0x9349fa41, 0x5fe3fadf, 0x86c3e873, 0x4a69e8ed,
+    0xc4e6ef0e, 0x084cef90, 0x0289e689, 0xce23e617, 0x40ace1f4,
+    0x8c06e16a, 0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758,
+    0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2, 0x030ebb0e,
+    0xcfa4bb90, 0x412bbc73, 0x8d81bced, 0x8744b5f4, 0x4beeb56a,
+    0xc561b289, 0x09cbb217, 0xac509190, 0x60fa910e, 0xee7596ed,
+    0x22df9673, 0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889,
+    0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6, 0xfbff84df,
+    0x37558441, 0xb9da83a2, 0x7570833c, 0x533b85da, 0x9f918544,
+    0x111e82a7, 0xddb48239, 0xd7718b20, 0x1bdb8bbe, 0x95548c5d,
+    0x59fe8cc3, 0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c,
+    0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776, 0x2f80b4f1,
+    0xe32ab46f, 0x6da5b38c, 0xa10fb312, 0xabcaba0b, 0x6760ba95,
+    0xe9efbd76, 0x2545bde8, 0xfc65af44, 0x30cfafda, 0xbe40a839,
+    0x72eaa8a7, 0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d,
+    0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f, 0x2e07e976,
+    0xe2ade9e8, 0x6c22ee0b, 0xa088ee95, 0x79a8fc39, 0xb502fca7,
+    0x3b8dfb44, 0xf727fbda, 0xfde2f2c3, 0x3148f25d, 0xbfc7f5be,
+    0x736df520, 0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144,
+    0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe, 0x0513cd12,
+    0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1, 0x8159c3e8, 0x4df3c376,
+    0xc37cc495, 0x0fd6c40b, 0x7aa64737, 0xb60c47a9, 0x3883404a,
+    0xf42940d4, 0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e,
+    0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61, 0x2d095278,
+    0xe1a352e6, 0x6f2c5505, 0xa386559b, 0x061d761c, 0xcab77682,
+    0x44387161, 0x889271ff, 0x825778e6, 0x4efd7878, 0xc0727f9b,
+    0x0cd87f05, 0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a,
+    0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0, 0x83d02561,
+    0x4f7a25ff, 0xc1f5221c, 0x0d5f2282, 0x079a2b9b, 0xcb302b05,
+    0x45bf2ce6, 0x89152c78, 0x50353ed4, 0x9c9f3e4a, 0x121039a9,
+    0xdeba3937, 0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd,
+    0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9, 0x7b211ab0,
+    0xb78b1a2e, 0x39041dcd, 0xf5ae1d53, 0x2c8e0fff, 0xe0240f61,
+    0x6eab0882, 0xa201081c, 0xa8c40105, 0x646e019b, 0xeae10678,
+    0x264b06e6},
+   {0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d, 0xf44f2413,
+    0x52382fa7, 0x63d0353a, 0xc5a73e8e, 0x33ef4e67, 0x959845d3,
+    0xa4705f4e, 0x020754fa, 0xc7a06a74, 0x61d761c0, 0x503f7b5d,
+    0xf64870e9, 0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653,
+    0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240, 0x5431d2a9,
+    0xf246d91d, 0xc3aec380, 0x65d9c834, 0xa07ef6ba, 0x0609fd0e,
+    0x37e1e793, 0x9196ec27, 0xcfbd399c, 0x69ca3228, 0x582228b5,
+    0xfe552301, 0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712,
+    0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66, 0x081d53e8,
+    0xae6a585c, 0x9f8242c1, 0x39f54975, 0xa863a552, 0x0e14aee6,
+    0x3ffcb47b, 0x998bbfcf, 0x5c2c8141, 0xfa5b8af5, 0xcbb39068,
+    0x6dc49bdc, 0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8,
+    0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb, 0x440b7579,
+    0xe27c7ecd, 0xd3946450, 0x75e36fe4, 0xb044516a, 0x16335ade,
+    0x27db4043, 0x81ac4bf7, 0x77e43b1e, 0xd19330aa, 0xe07b2a37,
+    0x460c2183, 0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590,
+    0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a, 0xd79acda4,
+    0x71edc610, 0x4005dc8d, 0xe672d739, 0x103aa7d0, 0xb64dac64,
+    0x87a5b6f9, 0x21d2bd4d, 0xe47583c3, 0x42028877, 0x73ea92ea,
+    0xd59d995e, 0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678,
+    0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b, 0xb8590282,
+    0x1e2e0936, 0x2fc613ab, 0x89b1181f, 0x4c162691, 0xea612d25,
+    0xdb8937b8, 0x7dfe3c0c, 0xec68d02b, 0x4a1fdb9f, 0x7bf7c102,
+    0xdd80cab6, 0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5,
+    0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1, 0x2bc8ba5f,
+    0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2, 0x8816eaf2, 0x2e61e146,
+    0x1f89fbdb, 0xb9fef06f, 0x7c59cee1, 0xda2ec555, 0xebc6dfc8,
+    0x4db1d47c, 0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08,
+    0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b, 0xefc8763c,
+    0x49bf7d88, 0x78576715, 0xde206ca1, 0x1b87522f, 0xbdf0599b,
+    0x8c184306, 0x2a6f48b2, 0xdc27385b, 0x7a5033ef, 0x4bb82972,
+    0xedcf22c6, 0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5,
+    0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3, 0xb3e4f77d,
+    0x1593fcc9, 0x247be654, 0x820cede0, 0x74449d09, 0xd23396bd,
+    0xe3db8c20, 0x45ac8794, 0x800bb91a, 0x267cb2ae, 0x1794a833,
+    0xb1e3a387, 0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d,
+    0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e, 0x139a01c7,
+    0xb5ed0a73, 0x840510ee, 0x22721b5a, 0xe7d525d4, 0x41a22e60,
+    0x704a34fd, 0xd63d3f49, 0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2,
+    0xfdf58516, 0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105,
+    0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71, 0x0bbdf5ff,
+    0xadcafe4b, 0x9c22e4d6, 0x3a55ef62, 0xabc30345, 0x0db408f1,
+    0x3c5c126c, 0x9a2b19d8, 0x5f8c2756, 0xf9fb2ce2, 0xc813367f,
+    0x6e643dcb, 0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf,
+    0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac, 0x03a0a617,
+    0xa5d7ada3, 0x943fb73e, 0x3248bc8a, 0xf7ef8204, 0x519889b0,
+    0x6070932d, 0xc6079899, 0x304fe870, 0x9638e3c4, 0xa7d0f959,
+    0x01a7f2ed, 0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe,
+    0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044, 0x90311eca,
+    0x3646157e, 0x07ae0fe3, 0xa1d90457, 0x579174be, 0xf1e67f0a,
+    0xc00e6597, 0x66796e23, 0xa3de50ad, 0x05a95b19, 0x34414184,
+    0x92364a30},
+   {0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae, 0x9b914216,
+    0x50cd91b3, 0xd659e31d, 0x1d0530b8, 0xec53826d, 0x270f51c8,
+    0xa19b2366, 0x6ac7f0c3, 0x77c2c07b, 0xbc9e13de, 0x3a0a6170,
+    0xf156b2d5, 0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035,
+    0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223, 0xef8580f6,
+    0x24d95353, 0xa24d21fd, 0x6911f258, 0x7414c2e0, 0xbf481145,
+    0x39dc63eb, 0xf280b04e, 0x07ac0536, 0xccf0d693, 0x4a64a43d,
+    0x81387798, 0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e,
+    0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5, 0x706ec54d,
+    0xbb3216e8, 0x3da66446, 0xf6fab7e3, 0x047a07ad, 0xcf26d408,
+    0x49b2a6a6, 0x82ee7503, 0x9feb45bb, 0x54b7961e, 0xd223e4b0,
+    0x197f3715, 0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e,
+    0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578, 0x0f580a6c,
+    0xc404d9c9, 0x4290ab67, 0x89cc78c2, 0x94c9487a, 0x5f959bdf,
+    0xd901e971, 0x125d3ad4, 0xe30b8801, 0x28575ba4, 0xaec3290a,
+    0x659ffaaf, 0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9,
+    0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59, 0x971f4ae1,
+    0x5c439944, 0xdad7ebea, 0x118b384f, 0xe0dd8a9a, 0x2b81593f,
+    0xad152b91, 0x6649f834, 0x7b4cc88c, 0xb0101b29, 0x36846987,
+    0xfdd8ba22, 0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4,
+    0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2, 0xe4a78d37,
+    0x2ffb5e92, 0xa96f2c3c, 0x6233ff99, 0x7f36cf21, 0xb46a1c84,
+    0x32fe6e2a, 0xf9a2bd8f, 0x0b220dc1, 0xc07ede64, 0x46eaacca,
+    0x8db67f6f, 0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79,
+    0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02, 0x7ce0cdba,
+    0xb7bc1e1f, 0x31286cb1, 0xfa74bf14, 0x1eb014d8, 0xd5ecc77d,
+    0x5378b5d3, 0x98246676, 0x852156ce, 0x4e7d856b, 0xc8e9f7c5,
+    0x03b52460, 0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b,
+    0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d, 0x1d661643,
+    0xd63ac5e6, 0x50aeb748, 0x9bf264ed, 0x86f75455, 0x4dab87f0,
+    0xcb3ff55e, 0x006326fb, 0xf135942e, 0x3a69478b, 0xbcfd3525,
+    0x77a1e680, 0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496,
+    0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340, 0x828d53f8,
+    0x49d1805d, 0xcf45f2f3, 0x04192156, 0xf54f9383, 0x3e134026,
+    0xb8873288, 0x73dbe12d, 0x6eded195, 0xa5820230, 0x2316709e,
+    0xe84aa33b, 0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db,
+    0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd, 0xf6999118,
+    0x3dc542bd, 0xbb513013, 0x700de3b6, 0x6d08d30e, 0xa65400ab,
+    0x20c07205, 0xeb9ca1a0, 0x11e81eb4, 0xdab4cd11, 0x5c20bfbf,
+    0x977c6c1a, 0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c,
+    0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77, 0x662adecf,
+    0xad760d6a, 0x2be27fc4, 0xe0beac61, 0x123e1c2f, 0xd962cf8a,
+    0x5ff6bd24, 0x94aa6e81, 0x89af5e39, 0x42f38d9c, 0xc467ff32,
+    0x0f3b2c97, 0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec,
+    0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa, 0x16441b82,
+    0xdd18c827, 0x5b8cba89, 0x90d0692c, 0x8dd55994, 0x46898a31,
+    0xc01df89f, 0x0b412b3a, 0xfa1799ef, 0x314b4a4a, 0xb7df38e4,
+    0x7c83eb41, 0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957,
+    0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7, 0x8e035b0f,
+    0x455f88aa, 0xc3cbfa04, 0x089729a1, 0xf9c19b74, 0x329d48d1,
+    0xb4093a7f, 0x7f55e9da, 0x6250d962, 0xa90c0ac7, 0x2f987869,
+    0xe4c4abcc},
+   {0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0, 0xf580a6c0,
+    0xc8e08f70, 0x8f40f5a0, 0xb220dc10, 0x30704bc1, 0x0d106271,
+    0x4ab018a1, 0x77d03111, 0xc5f0ed01, 0xf890c4b1, 0xbf30be61,
+    0x825097d1, 0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52,
+    0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92, 0x5090dc43,
+    0x6df0f5f3, 0x2a508f23, 0x1730a693, 0xa5107a83, 0x98705333,
+    0xdfd029e3, 0xe2b00053, 0xc1c12f04, 0xfca106b4, 0xbb017c64,
+    0x866155d4, 0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314,
+    0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15, 0x0431c205,
+    0x3951ebb5, 0x7ef19165, 0x4391b8d5, 0xa121b886, 0x9c419136,
+    0xdbe1ebe6, 0xe681c256, 0x54a11e46, 0x69c137f6, 0x2e614d26,
+    0x13016496, 0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997,
+    0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57, 0x58f35849,
+    0x659371f9, 0x22330b29, 0x1f532299, 0xad73fe89, 0x9013d739,
+    0xd7b3ade9, 0xead38459, 0x68831388, 0x55e33a38, 0x124340e8,
+    0x2f236958, 0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98,
+    0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b, 0xcd93690b,
+    0xf0f340bb, 0xb7533a6b, 0x8a3313db, 0x0863840a, 0x3503adba,
+    0x72a3d76a, 0x4fc3feda, 0xfde322ca, 0xc0830b7a, 0x872371aa,
+    0xba43581a, 0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d,
+    0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d, 0xa9423c8c,
+    0x9422153c, 0xd3826fec, 0xeee2465c, 0x5cc29a4c, 0x61a2b3fc,
+    0x2602c92c, 0x1b62e09c, 0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af,
+    0xbe729a1f, 0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf,
+    0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de, 0x3c220dce,
+    0x0142247e, 0x46e25eae, 0x7b82771e, 0xb1e6b092, 0x8c869922,
+    0xcb26e3f2, 0xf646ca42, 0x44661652, 0x79063fe2, 0x3ea64532,
+    0x03c66c82, 0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183,
+    0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743, 0xd1062710,
+    0xec660ea0, 0xabc67470, 0x96a65dc0, 0x248681d0, 0x19e6a860,
+    0x5e46d2b0, 0x6326fb00, 0xe1766cd1, 0xdc164561, 0x9bb63fb1,
+    0xa6d61601, 0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1,
+    0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546, 0x85a73956,
+    0xb8c710e6, 0xff676a36, 0xc2074386, 0x4057d457, 0x7d37fde7,
+    0x3a978737, 0x07f7ae87, 0xb5d77297, 0x88b75b27, 0xcf1721f7,
+    0xf2770847, 0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4,
+    0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404, 0x20b743d5,
+    0x1dd76a65, 0x5a7710b5, 0x67173905, 0xd537e515, 0xe857cca5,
+    0xaff7b675, 0x92979fc5, 0xe915e8db, 0xd475c16b, 0x93d5bbbb,
+    0xaeb5920b, 0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb,
+    0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca, 0x2ce505da,
+    0x11852c6a, 0x562556ba, 0x6b457f0a, 0x89f57f59, 0xb49556e9,
+    0xf3352c39, 0xce550589, 0x7c75d999, 0x4115f029, 0x06b58af9,
+    0x3bd5a349, 0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48,
+    0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888, 0x28d4c7df,
+    0x15b4ee6f, 0x521494bf, 0x6f74bd0f, 0xdd54611f, 0xe03448af,
+    0xa794327f, 0x9af41bcf, 0x18a48c1e, 0x25c4a5ae, 0x6264df7e,
+    0x5f04f6ce, 0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e,
+    0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d, 0xbdb4f69d,
+    0x80d4df2d, 0xc774a5fd, 0xfa148c4d, 0x78441b9c, 0x4524322c,
+    0x028448fc, 0x3fe4614c, 0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c,
+    0xca64c78c}};
+
+static const z_word_t crc_braid_big_table[][256] = {
+   {0x00000000, 0xb029603d, 0x6053c07a, 0xd07aa047, 0xc0a680f5,
+    0x708fe0c8, 0xa0f5408f, 0x10dc20b2, 0xc14b7030, 0x7162100d,
+    0xa118b04a, 0x1131d077, 0x01edf0c5, 0xb1c490f8, 0x61be30bf,
+    0xd1975082, 0x8297e060, 0x32be805d, 0xe2c4201a, 0x52ed4027,
+    0x42316095, 0xf21800a8, 0x2262a0ef, 0x924bc0d2, 0x43dc9050,
+    0xf3f5f06d, 0x238f502a, 0x93a63017, 0x837a10a5, 0x33537098,
+    0xe329d0df, 0x5300b0e2, 0x042fc1c1, 0xb406a1fc, 0x647c01bb,
+    0xd4556186, 0xc4894134, 0x74a02109, 0xa4da814e, 0x14f3e173,
+    0xc564b1f1, 0x754dd1cc, 0xa537718b, 0x151e11b6, 0x05c23104,
+    0xb5eb5139, 0x6591f17e, 0xd5b89143, 0x86b821a1, 0x3691419c,
+    0xe6ebe1db, 0x56c281e6, 0x461ea154, 0xf637c169, 0x264d612e,
+    0x96640113, 0x47f35191, 0xf7da31ac, 0x27a091eb, 0x9789f1d6,
+    0x8755d164, 0x377cb159, 0xe706111e, 0x572f7123, 0x4958f358,
+    0xf9719365, 0x290b3322, 0x9922531f, 0x89fe73ad, 0x39d71390,
+    0xe9adb3d7, 0x5984d3ea, 0x88138368, 0x383ae355, 0xe8404312,
+    0x5869232f, 0x48b5039d, 0xf89c63a0, 0x28e6c3e7, 0x98cfa3da,
+    0xcbcf1338, 0x7be67305, 0xab9cd342, 0x1bb5b37f, 0x0b6993cd,
+    0xbb40f3f0, 0x6b3a53b7, 0xdb13338a, 0x0a846308, 0xbaad0335,
+    0x6ad7a372, 0xdafec34f, 0xca22e3fd, 0x7a0b83c0, 0xaa712387,
+    0x1a5843ba, 0x4d773299, 0xfd5e52a4, 0x2d24f2e3, 0x9d0d92de,
+    0x8dd1b26c, 0x3df8d251, 0xed827216, 0x5dab122b, 0x8c3c42a9,
+    0x3c152294, 0xec6f82d3, 0x5c46e2ee, 0x4c9ac25c, 0xfcb3a261,
+    0x2cc90226, 0x9ce0621b, 0xcfe0d2f9, 0x7fc9b2c4, 0xafb31283,
+    0x1f9a72be, 0x0f46520c, 0xbf6f3231, 0x6f159276, 0xdf3cf24b,
+    0x0eaba2c9, 0xbe82c2f4, 0x6ef862b3, 0xded1028e, 0xce0d223c,
+    0x7e244201, 0xae5ee246, 0x1e77827b, 0x92b0e6b1, 0x2299868c,
+    0xf2e326cb, 0x42ca46f6, 0x52166644, 0xe23f0679, 0x3245a63e,
+    0x826cc603, 0x53fb9681, 0xe3d2f6bc, 0x33a856fb, 0x838136c6,
+    0x935d1674, 0x23747649, 0xf30ed60e, 0x4327b633, 0x102706d1,
+    0xa00e66ec, 0x7074c6ab, 0xc05da696, 0xd0818624, 0x60a8e619,
+    0xb0d2465e, 0x00fb2663, 0xd16c76e1, 0x614516dc, 0xb13fb69b,
+    0x0116d6a6, 0x11caf614, 0xa1e39629, 0x7199366e, 0xc1b05653,
+    0x969f2770, 0x26b6474d, 0xf6cce70a, 0x46e58737, 0x5639a785,
+    0xe610c7b8, 0x366a67ff, 0x864307c2, 0x57d45740, 0xe7fd377d,
+    0x3787973a, 0x87aef707, 0x9772d7b5, 0x275bb788, 0xf72117cf,
+    0x470877f2, 0x1408c710, 0xa421a72d, 0x745b076a, 0xc4726757,
+    0xd4ae47e5, 0x648727d8, 0xb4fd879f, 0x04d4e7a2, 0xd543b720,
+    0x656ad71d, 0xb510775a, 0x05391767, 0x15e537d5, 0xa5cc57e8,
+    0x75b6f7af, 0xc59f9792, 0xdbe815e9, 0x6bc175d4, 0xbbbbd593,
+    0x0b92b5ae, 0x1b4e951c, 0xab67f521, 0x7b1d5566, 0xcb34355b,
+    0x1aa365d9, 0xaa8a05e4, 0x7af0a5a3, 0xcad9c59e, 0xda05e52c,
+    0x6a2c8511, 0xba562556, 0x0a7f456b, 0x597ff589, 0xe95695b4,
+    0x392c35f3, 0x890555ce, 0x99d9757c, 0x29f01541, 0xf98ab506,
+    0x49a3d53b, 0x983485b9, 0x281de584, 0xf86745c3, 0x484e25fe,
+    0x5892054c, 0xe8bb6571, 0x38c1c536, 0x88e8a50b, 0xdfc7d428,
+    0x6feeb415, 0xbf941452, 0x0fbd746f, 0x1f6154dd, 0xaf4834e0,
+    0x7f3294a7, 0xcf1bf49a, 0x1e8ca418, 0xaea5c425, 0x7edf6462,
+    0xcef6045f, 0xde2a24ed, 0x6e0344d0, 0xbe79e497, 0x0e5084aa,
+    0x5d503448, 0xed795475, 0x3d03f432, 0x8d2a940f, 0x9df6b4bd,
+    0x2ddfd480, 0xfda574c7, 0x4d8c14fa, 0x9c1b4478, 0x2c322445,
+    0xfc488402, 0x4c61e43f, 0x5cbdc48d, 0xec94a4b0, 0x3cee04f7,
+    0x8cc764ca},
+   {0x00000000, 0xa5d35ccb, 0x0ba1c84d, 0xae729486, 0x1642919b,
+    0xb391cd50, 0x1de359d6, 0xb830051d, 0x6d8253ec, 0xc8510f27,
+    0x66239ba1, 0xc3f0c76a, 0x7bc0c277, 0xde139ebc, 0x70610a3a,
+    0xd5b256f1, 0x9b02d603, 0x3ed18ac8, 0x90a31e4e, 0x35704285,
+    0x8d404798, 0x28931b53, 0x86e18fd5, 0x2332d31e, 0xf68085ef,
+    0x5353d924, 0xfd214da2, 0x58f21169, 0xe0c21474, 0x451148bf,
+    0xeb63dc39, 0x4eb080f2, 0x3605ac07, 0x93d6f0cc, 0x3da4644a,
+    0x98773881, 0x20473d9c, 0x85946157, 0x2be6f5d1, 0x8e35a91a,
+    0x5b87ffeb, 0xfe54a320, 0x502637a6, 0xf5f56b6d, 0x4dc56e70,
+    0xe81632bb, 0x4664a63d, 0xe3b7faf6, 0xad077a04, 0x08d426cf,
+    0xa6a6b249, 0x0375ee82, 0xbb45eb9f, 0x1e96b754, 0xb0e423d2,
+    0x15377f19, 0xc08529e8, 0x65567523, 0xcb24e1a5, 0x6ef7bd6e,
+    0xd6c7b873, 0x7314e4b8, 0xdd66703e, 0x78b52cf5, 0x6c0a580f,
+    0xc9d904c4, 0x67ab9042, 0xc278cc89, 0x7a48c994, 0xdf9b955f,
+    0x71e901d9, 0xd43a5d12, 0x01880be3, 0xa45b5728, 0x0a29c3ae,
+    0xaffa9f65, 0x17ca9a78, 0xb219c6b3, 0x1c6b5235, 0xb9b80efe,
+    0xf7088e0c, 0x52dbd2c7, 0xfca94641, 0x597a1a8a, 0xe14a1f97,
+    0x4499435c, 0xeaebd7da, 0x4f388b11, 0x9a8adde0, 0x3f59812b,
+    0x912b15ad, 0x34f84966, 0x8cc84c7b, 0x291b10b0, 0x87698436,
+    0x22bad8fd, 0x5a0ff408, 0xffdca8c3, 0x51ae3c45, 0xf47d608e,
+    0x4c4d6593, 0xe99e3958, 0x47ecadde, 0xe23ff115, 0x378da7e4,
+    0x925efb2f, 0x3c2c6fa9, 0x99ff3362, 0x21cf367f, 0x841c6ab4,
+    0x2a6efe32, 0x8fbda2f9, 0xc10d220b, 0x64de7ec0, 0xcaacea46,
+    0x6f7fb68d, 0xd74fb390, 0x729cef5b, 0xdcee7bdd, 0x793d2716,
+    0xac8f71e7, 0x095c2d2c, 0xa72eb9aa, 0x02fde561, 0xbacde07c,
+    0x1f1ebcb7, 0xb16c2831, 0x14bf74fa, 0xd814b01e, 0x7dc7ecd5,
+    0xd3b57853, 0x76662498, 0xce562185, 0x6b857d4e, 0xc5f7e9c8,
+    0x6024b503, 0xb596e3f2, 0x1045bf39, 0xbe372bbf, 0x1be47774,
+    0xa3d47269, 0x06072ea2, 0xa875ba24, 0x0da6e6ef, 0x4316661d,
+    0xe6c53ad6, 0x48b7ae50, 0xed64f29b, 0x5554f786, 0xf087ab4d,
+    0x5ef53fcb, 0xfb266300, 0x2e9435f1, 0x8b47693a, 0x2535fdbc,
+    0x80e6a177, 0x38d6a46a, 0x9d05f8a1, 0x33776c27, 0x96a430ec,
+    0xee111c19, 0x4bc240d2, 0xe5b0d454, 0x4063889f, 0xf8538d82,
+    0x5d80d149, 0xf3f245cf, 0x56211904, 0x83934ff5, 0x2640133e,
+    0x883287b8, 0x2de1db73, 0x95d1de6e, 0x300282a5, 0x9e701623,
+    0x3ba34ae8, 0x7513ca1a, 0xd0c096d1, 0x7eb20257, 0xdb615e9c,
+    0x63515b81, 0xc682074a, 0x68f093cc, 0xcd23cf07, 0x189199f6,
+    0xbd42c53d, 0x133051bb, 0xb6e30d70, 0x0ed3086d, 0xab0054a6,
+    0x0572c020, 0xa0a19ceb, 0xb41ee811, 0x11cdb4da, 0xbfbf205c,
+    0x1a6c7c97, 0xa25c798a, 0x078f2541, 0xa9fdb1c7, 0x0c2eed0c,
+    0xd99cbbfd, 0x7c4fe736, 0xd23d73b0, 0x77ee2f7b, 0xcfde2a66,
+    0x6a0d76ad, 0xc47fe22b, 0x61acbee0, 0x2f1c3e12, 0x8acf62d9,
+    0x24bdf65f, 0x816eaa94, 0x395eaf89, 0x9c8df342, 0x32ff67c4,
+    0x972c3b0f, 0x429e6dfe, 0xe74d3135, 0x493fa5b3, 0xececf978,
+    0x54dcfc65, 0xf10fa0ae, 0x5f7d3428, 0xfaae68e3, 0x821b4416,
+    0x27c818dd, 0x89ba8c5b, 0x2c69d090, 0x9459d58d, 0x318a8946,
+    0x9ff81dc0, 0x3a2b410b, 0xef9917fa, 0x4a4a4b31, 0xe438dfb7,
+    0x41eb837c, 0xf9db8661, 0x5c08daaa, 0xf27a4e2c, 0x57a912e7,
+    0x19199215, 0xbccacede, 0x12b85a58, 0xb76b0693, 0x0f5b038e,
+    0xaa885f45, 0x04facbc3, 0xa1299708, 0x749bc1f9, 0xd1489d32,
+    0x7f3a09b4, 0xdae9557f, 0x62d95062, 0xc70a0ca9, 0x6978982f,
+    0xccabc4e4},
+   {0x00000000, 0xb40b77a6, 0x29119f97, 0x9d1ae831, 0x13244ff4,
+    0xa72f3852, 0x3a35d063, 0x8e3ea7c5, 0x674eef33, 0xd3459895,
+    0x4e5f70a4, 0xfa540702, 0x746aa0c7, 0xc061d761, 0x5d7b3f50,
+    0xe97048f6, 0xce9cde67, 0x7a97a9c1, 0xe78d41f0, 0x53863656,
+    0xddb89193, 0x69b3e635, 0xf4a90e04, 0x40a279a2, 0xa9d23154,
+    0x1dd946f2, 0x80c3aec3, 0x34c8d965, 0xbaf67ea0, 0x0efd0906,
+    0x93e7e137, 0x27ec9691, 0x9c39bdcf, 0x2832ca69, 0xb5282258,
+    0x012355fe, 0x8f1df23b, 0x3b16859d, 0xa60c6dac, 0x12071a0a,
+    0xfb7752fc, 0x4f7c255a, 0xd266cd6b, 0x666dbacd, 0xe8531d08,
+    0x5c586aae, 0xc142829f, 0x7549f539, 0x52a563a8, 0xe6ae140e,
+    0x7bb4fc3f, 0xcfbf8b99, 0x41812c5c, 0xf58a5bfa, 0x6890b3cb,
+    0xdc9bc46d, 0x35eb8c9b, 0x81e0fb3d, 0x1cfa130c, 0xa8f164aa,
+    0x26cfc36f, 0x92c4b4c9, 0x0fde5cf8, 0xbbd52b5e, 0x79750b44,
+    0xcd7e7ce2, 0x506494d3, 0xe46fe375, 0x6a5144b0, 0xde5a3316,
+    0x4340db27, 0xf74bac81, 0x1e3be477, 0xaa3093d1, 0x372a7be0,
+    0x83210c46, 0x0d1fab83, 0xb914dc25, 0x240e3414, 0x900543b2,
+    0xb7e9d523, 0x03e2a285, 0x9ef84ab4, 0x2af33d12, 0xa4cd9ad7,
+    0x10c6ed71, 0x8ddc0540, 0x39d772e6, 0xd0a73a10, 0x64ac4db6,
+    0xf9b6a587, 0x4dbdd221, 0xc38375e4, 0x77880242, 0xea92ea73,
+    0x5e999dd5, 0xe54cb68b, 0x5147c12d, 0xcc5d291c, 0x78565eba,
+    0xf668f97f, 0x42638ed9, 0xdf7966e8, 0x6b72114e, 0x820259b8,
+    0x36092e1e, 0xab13c62f, 0x1f18b189, 0x9126164c, 0x252d61ea,
+    0xb83789db, 0x0c3cfe7d, 0x2bd068ec, 0x9fdb1f4a, 0x02c1f77b,
+    0xb6ca80dd, 0x38f42718, 0x8cff50be, 0x11e5b88f, 0xa5eecf29,
+    0x4c9e87df, 0xf895f079, 0x658f1848, 0xd1846fee, 0x5fbac82b,
+    0xebb1bf8d, 0x76ab57bc, 0xc2a0201a, 0xf2ea1688, 0x46e1612e,
+    0xdbfb891f, 0x6ff0feb9, 0xe1ce597c, 0x55c52eda, 0xc8dfc6eb,
+    0x7cd4b14d, 0x95a4f9bb, 0x21af8e1d, 0xbcb5662c, 0x08be118a,
+    0x8680b64f, 0x328bc1e9, 0xaf9129d8, 0x1b9a5e7e, 0x3c76c8ef,
+    0x887dbf49, 0x15675778, 0xa16c20de, 0x2f52871b, 0x9b59f0bd,
+    0x0643188c, 0xb2486f2a, 0x5b3827dc, 0xef33507a, 0x7229b84b,
+    0xc622cfed, 0x481c6828, 0xfc171f8e, 0x610df7bf, 0xd5068019,
+    0x6ed3ab47, 0xdad8dce1, 0x47c234d0, 0xf3c94376, 0x7df7e4b3,
+    0xc9fc9315, 0x54e67b24, 0xe0ed0c82, 0x099d4474, 0xbd9633d2,
+    0x208cdbe3, 0x9487ac45, 0x1ab90b80, 0xaeb27c26, 0x33a89417,
+    0x87a3e3b1, 0xa04f7520, 0x14440286, 0x895eeab7, 0x3d559d11,
+    0xb36b3ad4, 0x07604d72, 0x9a7aa543, 0x2e71d2e5, 0xc7019a13,
+    0x730aedb5, 0xee100584, 0x5a1b7222, 0xd425d5e7, 0x602ea241,
+    0xfd344a70, 0x493f3dd6, 0x8b9f1dcc, 0x3f946a6a, 0xa28e825b,
+    0x1685f5fd, 0x98bb5238, 0x2cb0259e, 0xb1aacdaf, 0x05a1ba09,
+    0xecd1f2ff, 0x58da8559, 0xc5c06d68, 0x71cb1ace, 0xfff5bd0b,
+    0x4bfecaad, 0xd6e4229c, 0x62ef553a, 0x4503c3ab, 0xf108b40d,
+    0x6c125c3c, 0xd8192b9a, 0x56278c5f, 0xe22cfbf9, 0x7f3613c8,
+    0xcb3d646e, 0x224d2c98, 0x96465b3e, 0x0b5cb30f, 0xbf57c4a9,
+    0x3169636c, 0x856214ca, 0x1878fcfb, 0xac738b5d, 0x17a6a003,
+    0xa3add7a5, 0x3eb73f94, 0x8abc4832, 0x0482eff7, 0xb0899851,
+    0x2d937060, 0x999807c6, 0x70e84f30, 0xc4e33896, 0x59f9d0a7,
+    0xedf2a701, 0x63cc00c4, 0xd7c77762, 0x4add9f53, 0xfed6e8f5,
+    0xd93a7e64, 0x6d3109c2, 0xf02be1f3, 0x44209655, 0xca1e3190,
+    0x7e154636, 0xe30fae07, 0x5704d9a1, 0xbe749157, 0x0a7fe6f1,
+    0x97650ec0, 0x236e7966, 0xad50dea3, 0x195ba905, 0x84414134,
+    0x304a3692},
+   {0x00000000, 0x9e00aacc, 0x7d072542, 0xe3078f8e, 0xfa0e4a84,
+    0x640ee048, 0x87096fc6, 0x1909c50a, 0xb51be5d3, 0x2b1b4f1f,
+    0xc81cc091, 0x561c6a5d, 0x4f15af57, 0xd115059b, 0x32128a15,
+    0xac1220d9, 0x2b31bb7c, 0xb53111b0, 0x56369e3e, 0xc83634f2,
+    0xd13ff1f8, 0x4f3f5b34, 0xac38d4ba, 0x32387e76, 0x9e2a5eaf,
+    0x002af463, 0xe32d7bed, 0x7d2dd121, 0x6424142b, 0xfa24bee7,
+    0x19233169, 0x87239ba5, 0x566276f9, 0xc862dc35, 0x2b6553bb,
+    0xb565f977, 0xac6c3c7d, 0x326c96b1, 0xd16b193f, 0x4f6bb3f3,
+    0xe379932a, 0x7d7939e6, 0x9e7eb668, 0x007e1ca4, 0x1977d9ae,
+    0x87777362, 0x6470fcec, 0xfa705620, 0x7d53cd85, 0xe3536749,
+    0x0054e8c7, 0x9e54420b, 0x875d8701, 0x195d2dcd, 0xfa5aa243,
+    0x645a088f, 0xc8482856, 0x5648829a, 0xb54f0d14, 0x2b4fa7d8,
+    0x324662d2, 0xac46c81e, 0x4f414790, 0xd141ed5c, 0xedc29d29,
+    0x73c237e5, 0x90c5b86b, 0x0ec512a7, 0x17ccd7ad, 0x89cc7d61,
+    0x6acbf2ef, 0xf4cb5823, 0x58d978fa, 0xc6d9d236, 0x25de5db8,
+    0xbbdef774, 0xa2d7327e, 0x3cd798b2, 0xdfd0173c, 0x41d0bdf0,
+    0xc6f32655, 0x58f38c99, 0xbbf40317, 0x25f4a9db, 0x3cfd6cd1,
+    0xa2fdc61d, 0x41fa4993, 0xdffae35f, 0x73e8c386, 0xede8694a,
+    0x0eefe6c4, 0x90ef4c08, 0x89e68902, 0x17e623ce, 0xf4e1ac40,
+    0x6ae1068c, 0xbba0ebd0, 0x25a0411c, 0xc6a7ce92, 0x58a7645e,
+    0x41aea154, 0xdfae0b98, 0x3ca98416, 0xa2a92eda, 0x0ebb0e03,
+    0x90bba4cf, 0x73bc2b41, 0xedbc818d, 0xf4b54487, 0x6ab5ee4b,
+    0x89b261c5, 0x17b2cb09, 0x909150ac, 0x0e91fa60, 0xed9675ee,
+    0x7396df22, 0x6a9f1a28, 0xf49fb0e4, 0x17983f6a, 0x899895a6,
+    0x258ab57f, 0xbb8a1fb3, 0x588d903d, 0xc68d3af1, 0xdf84fffb,
+    0x41845537, 0xa283dab9, 0x3c837075, 0xda853b53, 0x4485919f,
+    0xa7821e11, 0x3982b4dd, 0x208b71d7, 0xbe8bdb1b, 0x5d8c5495,
+    0xc38cfe59, 0x6f9ede80, 0xf19e744c, 0x1299fbc2, 0x8c99510e,
+    0x95909404, 0x0b903ec8, 0xe897b146, 0x76971b8a, 0xf1b4802f,
+    0x6fb42ae3, 0x8cb3a56d, 0x12b30fa1, 0x0bbacaab, 0x95ba6067,
+    0x76bdefe9, 0xe8bd4525, 0x44af65fc, 0xdaafcf30, 0x39a840be,
+    0xa7a8ea72, 0xbea12f78, 0x20a185b4, 0xc3a60a3a, 0x5da6a0f6,
+    0x8ce74daa, 0x12e7e766, 0xf1e068e8, 0x6fe0c224, 0x76e9072e,
+    0xe8e9ade2, 0x0bee226c, 0x95ee88a0, 0x39fca879, 0xa7fc02b5,
+    0x44fb8d3b, 0xdafb27f7, 0xc3f2e2fd, 0x5df24831, 0xbef5c7bf,
+    0x20f56d73, 0xa7d6f6d6, 0x39d65c1a, 0xdad1d394, 0x44d17958,
+    0x5dd8bc52, 0xc3d8169e, 0x20df9910, 0xbedf33dc, 0x12cd1305,
+    0x8ccdb9c9, 0x6fca3647, 0xf1ca9c8b, 0xe8c35981, 0x76c3f34d,
+    0x95c47cc3, 0x0bc4d60f, 0x3747a67a, 0xa9470cb6, 0x4a408338,
+    0xd44029f4, 0xcd49ecfe, 0x53494632, 0xb04ec9bc, 0x2e4e6370,
+    0x825c43a9, 0x1c5ce965, 0xff5b66eb, 0x615bcc27, 0x7852092d,
+    0xe652a3e1, 0x05552c6f, 0x9b5586a3, 0x1c761d06, 0x8276b7ca,
+    0x61713844, 0xff719288, 0xe6785782, 0x7878fd4e, 0x9b7f72c0,
+    0x057fd80c, 0xa96df8d5, 0x376d5219, 0xd46add97, 0x4a6a775b,
+    0x5363b251, 0xcd63189d, 0x2e649713, 0xb0643ddf, 0x6125d083,
+    0xff257a4f, 0x1c22f5c1, 0x82225f0d, 0x9b2b9a07, 0x052b30cb,
+    0xe62cbf45, 0x782c1589, 0xd43e3550, 0x4a3e9f9c, 0xa9391012,
+    0x3739bade, 0x2e307fd4, 0xb030d518, 0x53375a96, 0xcd37f05a,
+    0x4a146bff, 0xd414c133, 0x37134ebd, 0xa913e471, 0xb01a217b,
+    0x2e1a8bb7, 0xcd1d0439, 0x531daef5, 0xff0f8e2c, 0x610f24e0,
+    0x8208ab6e, 0x1c0801a2, 0x0501c4a8, 0x9b016e64, 0x7806e1ea,
+    0xe6064b26}};
+
+#endif /* W */
+
+#endif /* N == 2 */
+#if N == 3
+
+#if W == 8
+
+static const uint32_t crc_braid_table[][256] = {
+   {0x00000000, 0x81256527, 0xd93bcc0f, 0x581ea928, 0x69069e5f,
+    0xe823fb78, 0xb03d5250, 0x31183777, 0xd20d3cbe, 0x53285999,
+    0x0b36f0b1, 0x8a139596, 0xbb0ba2e1, 0x3a2ec7c6, 0x62306eee,
+    0xe3150bc9, 0x7f6b7f3d, 0xfe4e1a1a, 0xa650b332, 0x2775d615,
+    0x166de162, 0x97488445, 0xcf562d6d, 0x4e73484a, 0xad664383,
+    0x2c4326a4, 0x745d8f8c, 0xf578eaab, 0xc460dddc, 0x4545b8fb,
+    0x1d5b11d3, 0x9c7e74f4, 0xfed6fe7a, 0x7ff39b5d, 0x27ed3275,
+    0xa6c85752, 0x97d06025, 0x16f50502, 0x4eebac2a, 0xcfcec90d,
+    0x2cdbc2c4, 0xadfea7e3, 0xf5e00ecb, 0x74c56bec, 0x45dd5c9b,
+    0xc4f839bc, 0x9ce69094, 0x1dc3f5b3, 0x81bd8147, 0x0098e460,
+    0x58864d48, 0xd9a3286f, 0xe8bb1f18, 0x699e7a3f, 0x3180d317,
+    0xb0a5b630, 0x53b0bdf9, 0xd295d8de, 0x8a8b71f6, 0x0bae14d1,
+    0x3ab623a6, 0xbb934681, 0xe38defa9, 0x62a88a8e, 0x26dcfab5,
+    0xa7f99f92, 0xffe736ba, 0x7ec2539d, 0x4fda64ea, 0xceff01cd,
+    0x96e1a8e5, 0x17c4cdc2, 0xf4d1c60b, 0x75f4a32c, 0x2dea0a04,
+    0xaccf6f23, 0x9dd75854, 0x1cf23d73, 0x44ec945b, 0xc5c9f17c,
+    0x59b78588, 0xd892e0af, 0x808c4987, 0x01a92ca0, 0x30b11bd7,
+    0xb1947ef0, 0xe98ad7d8, 0x68afb2ff, 0x8bbab936, 0x0a9fdc11,
+    0x52817539, 0xd3a4101e, 0xe2bc2769, 0x6399424e, 0x3b87eb66,
+    0xbaa28e41, 0xd80a04cf, 0x592f61e8, 0x0131c8c0, 0x8014ade7,
+    0xb10c9a90, 0x3029ffb7, 0x6837569f, 0xe91233b8, 0x0a073871,
+    0x8b225d56, 0xd33cf47e, 0x52199159, 0x6301a62e, 0xe224c309,
+    0xba3a6a21, 0x3b1f0f06, 0xa7617bf2, 0x26441ed5, 0x7e5ab7fd,
+    0xff7fd2da, 0xce67e5ad, 0x4f42808a, 0x175c29a2, 0x96794c85,
+    0x756c474c, 0xf449226b, 0xac578b43, 0x2d72ee64, 0x1c6ad913,
+    0x9d4fbc34, 0xc551151c, 0x4474703b, 0x4db9f56a, 0xcc9c904d,
+    0x94823965, 0x15a75c42, 0x24bf6b35, 0xa59a0e12, 0xfd84a73a,
+    0x7ca1c21d, 0x9fb4c9d4, 0x1e91acf3, 0x468f05db, 0xc7aa60fc,
+    0xf6b2578b, 0x779732ac, 0x2f899b84, 0xaeacfea3, 0x32d28a57,
+    0xb3f7ef70, 0xebe94658, 0x6acc237f, 0x5bd41408, 0xdaf1712f,
+    0x82efd807, 0x03cabd20, 0xe0dfb6e9, 0x61fad3ce, 0x39e47ae6,
+    0xb8c11fc1, 0x89d928b6, 0x08fc4d91, 0x50e2e4b9, 0xd1c7819e,
+    0xb36f0b10, 0x324a6e37, 0x6a54c71f, 0xeb71a238, 0xda69954f,
+    0x5b4cf068, 0x03525940, 0x82773c67, 0x616237ae, 0xe0475289,
+    0xb859fba1, 0x397c9e86, 0x0864a9f1, 0x8941ccd6, 0xd15f65fe,
+    0x507a00d9, 0xcc04742d, 0x4d21110a, 0x153fb822, 0x941add05,
+    0xa502ea72, 0x24278f55, 0x7c39267d, 0xfd1c435a, 0x1e094893,
+    0x9f2c2db4, 0xc732849c, 0x4617e1bb, 0x770fd6cc, 0xf62ab3eb,
+    0xae341ac3, 0x2f117fe4, 0x6b650fdf, 0xea406af8, 0xb25ec3d0,
+    0x337ba6f7, 0x02639180, 0x8346f4a7, 0xdb585d8f, 0x5a7d38a8,
+    0xb9683361, 0x384d5646, 0x6053ff6e, 0xe1769a49, 0xd06ead3e,
+    0x514bc819, 0x09556131, 0x88700416, 0x140e70e2, 0x952b15c5,
+    0xcd35bced, 0x4c10d9ca, 0x7d08eebd, 0xfc2d8b9a, 0xa43322b2,
+    0x25164795, 0xc6034c5c, 0x4726297b, 0x1f388053, 0x9e1de574,
+    0xaf05d203, 0x2e20b724, 0x763e1e0c, 0xf71b7b2b, 0x95b3f1a5,
+    0x14969482, 0x4c883daa, 0xcdad588d, 0xfcb56ffa, 0x7d900add,
+    0x258ea3f5, 0xa4abc6d2, 0x47becd1b, 0xc69ba83c, 0x9e850114,
+    0x1fa06433, 0x2eb85344, 0xaf9d3663, 0xf7839f4b, 0x76a6fa6c,
+    0xead88e98, 0x6bfdebbf, 0x33e34297, 0xb2c627b0, 0x83de10c7,
+    0x02fb75e0, 0x5ae5dcc8, 0xdbc0b9ef, 0x38d5b226, 0xb9f0d701,
+    0xe1ee7e29, 0x60cb1b0e, 0x51d32c79, 0xd0f6495e, 0x88e8e076,
+    0x09cd8551},
+   {0x00000000, 0x9b73ead4, 0xed96d3e9, 0x76e5393d, 0x005ca193,
+    0x9b2f4b47, 0xedca727a, 0x76b998ae, 0x00b94326, 0x9bcaa9f2,
+    0xed2f90cf, 0x765c7a1b, 0x00e5e2b5, 0x9b960861, 0xed73315c,
+    0x7600db88, 0x0172864c, 0x9a016c98, 0xece455a5, 0x7797bf71,
+    0x012e27df, 0x9a5dcd0b, 0xecb8f436, 0x77cb1ee2, 0x01cbc56a,
+    0x9ab82fbe, 0xec5d1683, 0x772efc57, 0x019764f9, 0x9ae48e2d,
+    0xec01b710, 0x77725dc4, 0x02e50c98, 0x9996e64c, 0xef73df71,
+    0x740035a5, 0x02b9ad0b, 0x99ca47df, 0xef2f7ee2, 0x745c9436,
+    0x025c4fbe, 0x992fa56a, 0xefca9c57, 0x74b97683, 0x0200ee2d,
+    0x997304f9, 0xef963dc4, 0x74e5d710, 0x03978ad4, 0x98e46000,
+    0xee01593d, 0x7572b3e9, 0x03cb2b47, 0x98b8c193, 0xee5df8ae,
+    0x752e127a, 0x032ec9f2, 0x985d2326, 0xeeb81a1b, 0x75cbf0cf,
+    0x03726861, 0x980182b5, 0xeee4bb88, 0x7597515c, 0x05ca1930,
+    0x9eb9f3e4, 0xe85ccad9, 0x732f200d, 0x0596b8a3, 0x9ee55277,
+    0xe8006b4a, 0x7373819e, 0x05735a16, 0x9e00b0c2, 0xe8e589ff,
+    0x7396632b, 0x052ffb85, 0x9e5c1151, 0xe8b9286c, 0x73cac2b8,
+    0x04b89f7c, 0x9fcb75a8, 0xe92e4c95, 0x725da641, 0x04e43eef,
+    0x9f97d43b, 0xe972ed06, 0x720107d2, 0x0401dc5a, 0x9f72368e,
+    0xe9970fb3, 0x72e4e567, 0x045d7dc9, 0x9f2e971d, 0xe9cbae20,
+    0x72b844f4, 0x072f15a8, 0x9c5cff7c, 0xeab9c641, 0x71ca2c95,
+    0x0773b43b, 0x9c005eef, 0xeae567d2, 0x71968d06, 0x0796568e,
+    0x9ce5bc5a, 0xea008567, 0x71736fb3, 0x07caf71d, 0x9cb91dc9,
+    0xea5c24f4, 0x712fce20, 0x065d93e4, 0x9d2e7930, 0xebcb400d,
+    0x70b8aad9, 0x06013277, 0x9d72d8a3, 0xeb97e19e, 0x70e40b4a,
+    0x06e4d0c2, 0x9d973a16, 0xeb72032b, 0x7001e9ff, 0x06b87151,
+    0x9dcb9b85, 0xeb2ea2b8, 0x705d486c, 0x0b943260, 0x90e7d8b4,
+    0xe602e189, 0x7d710b5d, 0x0bc893f3, 0x90bb7927, 0xe65e401a,
+    0x7d2daace, 0x0b2d7146, 0x905e9b92, 0xe6bba2af, 0x7dc8487b,
+    0x0b71d0d5, 0x90023a01, 0xe6e7033c, 0x7d94e9e8, 0x0ae6b42c,
+    0x91955ef8, 0xe77067c5, 0x7c038d11, 0x0aba15bf, 0x91c9ff6b,
+    0xe72cc656, 0x7c5f2c82, 0x0a5ff70a, 0x912c1dde, 0xe7c924e3,
+    0x7cbace37, 0x0a035699, 0x9170bc4d, 0xe7958570, 0x7ce66fa4,
+    0x09713ef8, 0x9202d42c, 0xe4e7ed11, 0x7f9407c5, 0x092d9f6b,
+    0x925e75bf, 0xe4bb4c82, 0x7fc8a656, 0x09c87dde, 0x92bb970a,
+    0xe45eae37, 0x7f2d44e3, 0x0994dc4d, 0x92e73699, 0xe4020fa4,
+    0x7f71e570, 0x0803b8b4, 0x93705260, 0xe5956b5d, 0x7ee68189,
+    0x085f1927, 0x932cf3f3, 0xe5c9cace, 0x7eba201a, 0x08bafb92,
+    0x93c91146, 0xe52c287b, 0x7e5fc2af, 0x08e65a01, 0x9395b0d5,
+    0xe57089e8, 0x7e03633c, 0x0e5e2b50, 0x952dc184, 0xe3c8f8b9,
+    0x78bb126d, 0x0e028ac3, 0x95716017, 0xe394592a, 0x78e7b3fe,
+    0x0ee76876, 0x959482a2, 0xe371bb9f, 0x7802514b, 0x0ebbc9e5,
+    0x95c82331, 0xe32d1a0c, 0x785ef0d8, 0x0f2cad1c, 0x945f47c8,
+    0xe2ba7ef5, 0x79c99421, 0x0f700c8f, 0x9403e65b, 0xe2e6df66,
+    0x799535b2, 0x0f95ee3a, 0x94e604ee, 0xe2033dd3, 0x7970d707,
+    0x0fc94fa9, 0x94baa57d, 0xe25f9c40, 0x792c7694, 0x0cbb27c8,
+    0x97c8cd1c, 0xe12df421, 0x7a5e1ef5, 0x0ce7865b, 0x97946c8f,
+    0xe17155b2, 0x7a02bf66, 0x0c0264ee, 0x97718e3a, 0xe194b707,
+    0x7ae75dd3, 0x0c5ec57d, 0x972d2fa9, 0xe1c81694, 0x7abbfc40,
+    0x0dc9a184, 0x96ba4b50, 0xe05f726d, 0x7b2c98b9, 0x0d950017,
+    0x96e6eac3, 0xe003d3fe, 0x7b70392a, 0x0d70e2a2, 0x96030876,
+    0xe0e6314b, 0x7b95db9f, 0x0d2c4331, 0x965fa9e5, 0xe0ba90d8,
+    0x7bc97a0c},
+   {0x00000000, 0x172864c0, 0x2e50c980, 0x3978ad40, 0x5ca19300,
+    0x4b89f7c0, 0x72f15a80, 0x65d93e40, 0xb9432600, 0xae6b42c0,
+    0x9713ef80, 0x803b8b40, 0xe5e2b500, 0xf2cad1c0, 0xcbb27c80,
+    0xdc9a1840, 0xa9f74a41, 0xbedf2e81, 0x87a783c1, 0x908fe701,
+    0xf556d941, 0xe27ebd81, 0xdb0610c1, 0xcc2e7401, 0x10b46c41,
+    0x079c0881, 0x3ee4a5c1, 0x29ccc101, 0x4c15ff41, 0x5b3d9b81,
+    0x624536c1, 0x756d5201, 0x889f92c3, 0x9fb7f603, 0xa6cf5b43,
+    0xb1e73f83, 0xd43e01c3, 0xc3166503, 0xfa6ec843, 0xed46ac83,
+    0x31dcb4c3, 0x26f4d003, 0x1f8c7d43, 0x08a41983, 0x6d7d27c3,
+    0x7a554303, 0x432dee43, 0x54058a83, 0x2168d882, 0x3640bc42,
+    0x0f381102, 0x181075c2, 0x7dc94b82, 0x6ae12f42, 0x53998202,
+    0x44b1e6c2, 0x982bfe82, 0x8f039a42, 0xb67b3702, 0xa15353c2,
+    0xc48a6d82, 0xd3a20942, 0xeadaa402, 0xfdf2c0c2, 0xca4e23c7,
+    0xdd664707, 0xe41eea47, 0xf3368e87, 0x96efb0c7, 0x81c7d407,
+    0xb8bf7947, 0xaf971d87, 0x730d05c7, 0x64256107, 0x5d5dcc47,
+    0x4a75a887, 0x2fac96c7, 0x3884f207, 0x01fc5f47, 0x16d43b87,
+    0x63b96986, 0x74910d46, 0x4de9a006, 0x5ac1c4c6, 0x3f18fa86,
+    0x28309e46, 0x11483306, 0x066057c6, 0xdafa4f86, 0xcdd22b46,
+    0xf4aa8606, 0xe382e2c6, 0x865bdc86, 0x9173b846, 0xa80b1506,
+    0xbf2371c6, 0x42d1b104, 0x55f9d5c4, 0x6c817884, 0x7ba91c44,
+    0x1e702204, 0x095846c4, 0x3020eb84, 0x27088f44, 0xfb929704,
+    0xecbaf3c4, 0xd5c25e84, 0xc2ea3a44, 0xa7330404, 0xb01b60c4,
+    0x8963cd84, 0x9e4ba944, 0xeb26fb45, 0xfc0e9f85, 0xc57632c5,
+    0xd25e5605, 0xb7876845, 0xa0af0c85, 0x99d7a1c5, 0x8effc505,
+    0x5265dd45, 0x454db985, 0x7c3514c5, 0x6b1d7005, 0x0ec44e45,
+    0x19ec2a85, 0x209487c5, 0x37bce305, 0x4fed41cf, 0x58c5250f,
+    0x61bd884f, 0x7695ec8f, 0x134cd2cf, 0x0464b60f, 0x3d1c1b4f,
+    0x2a347f8f, 0xf6ae67cf, 0xe186030f, 0xd8feae4f, 0xcfd6ca8f,
+    0xaa0ff4cf, 0xbd27900f, 0x845f3d4f, 0x9377598f, 0xe61a0b8e,
+    0xf1326f4e, 0xc84ac20e, 0xdf62a6ce, 0xbabb988e, 0xad93fc4e,
+    0x94eb510e, 0x83c335ce, 0x5f592d8e, 0x4871494e, 0x7109e40e,
+    0x662180ce, 0x03f8be8e, 0x14d0da4e, 0x2da8770e, 0x3a8013ce,
+    0xc772d30c, 0xd05ab7cc, 0xe9221a8c, 0xfe0a7e4c, 0x9bd3400c,
+    0x8cfb24cc, 0xb583898c, 0xa2abed4c, 0x7e31f50c, 0x691991cc,
+    0x50613c8c, 0x4749584c, 0x2290660c, 0x35b802cc, 0x0cc0af8c,
+    0x1be8cb4c, 0x6e85994d, 0x79adfd8d, 0x40d550cd, 0x57fd340d,
+    0x32240a4d, 0x250c6e8d, 0x1c74c3cd, 0x0b5ca70d, 0xd7c6bf4d,
+    0xc0eedb8d, 0xf99676cd, 0xeebe120d, 0x8b672c4d, 0x9c4f488d,
+    0xa537e5cd, 0xb21f810d, 0x85a36208, 0x928b06c8, 0xabf3ab88,
+    0xbcdbcf48, 0xd902f108, 0xce2a95c8, 0xf7523888, 0xe07a5c48,
+    0x3ce04408, 0x2bc820c8, 0x12b08d88, 0x0598e948, 0x6041d708,
+    0x7769b3c8, 0x4e111e88, 0x59397a48, 0x2c542849, 0x3b7c4c89,
+    0x0204e1c9, 0x152c8509, 0x70f5bb49, 0x67dddf89, 0x5ea572c9,
+    0x498d1609, 0x95170e49, 0x823f6a89, 0xbb47c7c9, 0xac6fa309,
+    0xc9b69d49, 0xde9ef989, 0xe7e654c9, 0xf0ce3009, 0x0d3cf0cb,
+    0x1a14940b, 0x236c394b, 0x34445d8b, 0x519d63cb, 0x46b5070b,
+    0x7fcdaa4b, 0x68e5ce8b, 0xb47fd6cb, 0xa357b20b, 0x9a2f1f4b,
+    0x8d077b8b, 0xe8de45cb, 0xfff6210b, 0xc68e8c4b, 0xd1a6e88b,
+    0xa4cbba8a, 0xb3e3de4a, 0x8a9b730a, 0x9db317ca, 0xf86a298a,
+    0xef424d4a, 0xd63ae00a, 0xc11284ca, 0x1d889c8a, 0x0aa0f84a,
+    0x33d8550a, 0x24f031ca, 0x41290f8a, 0x56016b4a, 0x6f79c60a,
+    0x7851a2ca},
+   {0x00000000, 0x9fda839e, 0xe4c4017d, 0x7b1e82e3, 0x12f904bb,
+    0x8d238725, 0xf63d05c6, 0x69e78658, 0x25f20976, 0xba288ae8,
+    0xc136080b, 0x5eec8b95, 0x370b0dcd, 0xa8d18e53, 0xd3cf0cb0,
+    0x4c158f2e, 0x4be412ec, 0xd43e9172, 0xaf201391, 0x30fa900f,
+    0x591d1657, 0xc6c795c9, 0xbdd9172a, 0x220394b4, 0x6e161b9a,
+    0xf1cc9804, 0x8ad21ae7, 0x15089979, 0x7cef1f21, 0xe3359cbf,
+    0x982b1e5c, 0x07f19dc2, 0x97c825d8, 0x0812a646, 0x730c24a5,
+    0xecd6a73b, 0x85312163, 0x1aeba2fd, 0x61f5201e, 0xfe2fa380,
+    0xb23a2cae, 0x2de0af30, 0x56fe2dd3, 0xc924ae4d, 0xa0c32815,
+    0x3f19ab8b, 0x44072968, 0xdbddaaf6, 0xdc2c3734, 0x43f6b4aa,
+    0x38e83649, 0xa732b5d7, 0xced5338f, 0x510fb011, 0x2a1132f2,
+    0xb5cbb16c, 0xf9de3e42, 0x6604bddc, 0x1d1a3f3f, 0x82c0bca1,
+    0xeb273af9, 0x74fdb967, 0x0fe33b84, 0x9039b81a, 0xf4e14df1,
+    0x6b3bce6f, 0x10254c8c, 0x8fffcf12, 0xe618494a, 0x79c2cad4,
+    0x02dc4837, 0x9d06cba9, 0xd1134487, 0x4ec9c719, 0x35d745fa,
+    0xaa0dc664, 0xc3ea403c, 0x5c30c3a2, 0x272e4141, 0xb8f4c2df,
+    0xbf055f1d, 0x20dfdc83, 0x5bc15e60, 0xc41bddfe, 0xadfc5ba6,
+    0x3226d838, 0x49385adb, 0xd6e2d945, 0x9af7566b, 0x052dd5f5,
+    0x7e335716, 0xe1e9d488, 0x880e52d0, 0x17d4d14e, 0x6cca53ad,
+    0xf310d033, 0x63296829, 0xfcf3ebb7, 0x87ed6954, 0x1837eaca,
+    0x71d06c92, 0xee0aef0c, 0x95146def, 0x0aceee71, 0x46db615f,
+    0xd901e2c1, 0xa21f6022, 0x3dc5e3bc, 0x542265e4, 0xcbf8e67a,
+    0xb0e66499, 0x2f3ce707, 0x28cd7ac5, 0xb717f95b, 0xcc097bb8,
+    0x53d3f826, 0x3a347e7e, 0xa5eefde0, 0xdef07f03, 0x412afc9d,
+    0x0d3f73b3, 0x92e5f02d, 0xe9fb72ce, 0x7621f150, 0x1fc67708,
+    0x801cf496, 0xfb027675, 0x64d8f5eb, 0x32b39da3, 0xad691e3d,
+    0xd6779cde, 0x49ad1f40, 0x204a9918, 0xbf901a86, 0xc48e9865,
+    0x5b541bfb, 0x174194d5, 0x889b174b, 0xf38595a8, 0x6c5f1636,
+    0x05b8906e, 0x9a6213f0, 0xe17c9113, 0x7ea6128d, 0x79578f4f,
+    0xe68d0cd1, 0x9d938e32, 0x02490dac, 0x6bae8bf4, 0xf474086a,
+    0x8f6a8a89, 0x10b00917, 0x5ca58639, 0xc37f05a7, 0xb8618744,
+    0x27bb04da, 0x4e5c8282, 0xd186011c, 0xaa9883ff, 0x35420061,
+    0xa57bb87b, 0x3aa13be5, 0x41bfb906, 0xde653a98, 0xb782bcc0,
+    0x28583f5e, 0x5346bdbd, 0xcc9c3e23, 0x8089b10d, 0x1f533293,
+    0x644db070, 0xfb9733ee, 0x9270b5b6, 0x0daa3628, 0x76b4b4cb,
+    0xe96e3755, 0xee9faa97, 0x71452909, 0x0a5babea, 0x95812874,
+    0xfc66ae2c, 0x63bc2db2, 0x18a2af51, 0x87782ccf, 0xcb6da3e1,
+    0x54b7207f, 0x2fa9a29c, 0xb0732102, 0xd994a75a, 0x464e24c4,
+    0x3d50a627, 0xa28a25b9, 0xc652d052, 0x598853cc, 0x2296d12f,
+    0xbd4c52b1, 0xd4abd4e9, 0x4b715777, 0x306fd594, 0xafb5560a,
+    0xe3a0d924, 0x7c7a5aba, 0x0764d859, 0x98be5bc7, 0xf159dd9f,
+    0x6e835e01, 0x159ddce2, 0x8a475f7c, 0x8db6c2be, 0x126c4120,
+    0x6972c3c3, 0xf6a8405d, 0x9f4fc605, 0x0095459b, 0x7b8bc778,
+    0xe45144e6, 0xa844cbc8, 0x379e4856, 0x4c80cab5, 0xd35a492b,
+    0xbabdcf73, 0x25674ced, 0x5e79ce0e, 0xc1a34d90, 0x519af58a,
+    0xce407614, 0xb55ef4f7, 0x2a847769, 0x4363f131, 0xdcb972af,
+    0xa7a7f04c, 0x387d73d2, 0x7468fcfc, 0xebb27f62, 0x90acfd81,
+    0x0f767e1f, 0x6691f847, 0xf94b7bd9, 0x8255f93a, 0x1d8f7aa4,
+    0x1a7ee766, 0x85a464f8, 0xfebae61b, 0x61606585, 0x0887e3dd,
+    0x975d6043, 0xec43e2a0, 0x7399613e, 0x3f8cee10, 0xa0566d8e,
+    0xdb48ef6d, 0x44926cf3, 0x2d75eaab, 0xb2af6935, 0xc9b1ebd6,
+    0x566b6848},
+   {0x00000000, 0x65673b46, 0xcace768c, 0xafa94dca, 0x4eedeb59,
+    0x2b8ad01f, 0x84239dd5, 0xe144a693, 0x9ddbd6b2, 0xf8bcedf4,
+    0x5715a03e, 0x32729b78, 0xd3363deb, 0xb65106ad, 0x19f84b67,
+    0x7c9f7021, 0xe0c6ab25, 0x85a19063, 0x2a08dda9, 0x4f6fe6ef,
+    0xae2b407c, 0xcb4c7b3a, 0x64e536f0, 0x01820db6, 0x7d1d7d97,
+    0x187a46d1, 0xb7d30b1b, 0xd2b4305d, 0x33f096ce, 0x5697ad88,
+    0xf93ee042, 0x9c59db04, 0x1afc500b, 0x7f9b6b4d, 0xd0322687,
+    0xb5551dc1, 0x5411bb52, 0x31768014, 0x9edfcdde, 0xfbb8f698,
+    0x872786b9, 0xe240bdff, 0x4de9f035, 0x288ecb73, 0xc9ca6de0,
+    0xacad56a6, 0x03041b6c, 0x6663202a, 0xfa3afb2e, 0x9f5dc068,
+    0x30f48da2, 0x5593b6e4, 0xb4d71077, 0xd1b02b31, 0x7e1966fb,
+    0x1b7e5dbd, 0x67e12d9c, 0x028616da, 0xad2f5b10, 0xc8486056,
+    0x290cc6c5, 0x4c6bfd83, 0xe3c2b049, 0x86a58b0f, 0x35f8a016,
+    0x509f9b50, 0xff36d69a, 0x9a51eddc, 0x7b154b4f, 0x1e727009,
+    0xb1db3dc3, 0xd4bc0685, 0xa82376a4, 0xcd444de2, 0x62ed0028,
+    0x078a3b6e, 0xe6ce9dfd, 0x83a9a6bb, 0x2c00eb71, 0x4967d037,
+    0xd53e0b33, 0xb0593075, 0x1ff07dbf, 0x7a9746f9, 0x9bd3e06a,
+    0xfeb4db2c, 0x511d96e6, 0x347aada0, 0x48e5dd81, 0x2d82e6c7,
+    0x822bab0d, 0xe74c904b, 0x060836d8, 0x636f0d9e, 0xccc64054,
+    0xa9a17b12, 0x2f04f01d, 0x4a63cb5b, 0xe5ca8691, 0x80adbdd7,
+    0x61e91b44, 0x048e2002, 0xab276dc8, 0xce40568e, 0xb2df26af,
+    0xd7b81de9, 0x78115023, 0x1d766b65, 0xfc32cdf6, 0x9955f6b0,
+    0x36fcbb7a, 0x539b803c, 0xcfc25b38, 0xaaa5607e, 0x050c2db4,
+    0x606b16f2, 0x812fb061, 0xe4488b27, 0x4be1c6ed, 0x2e86fdab,
+    0x52198d8a, 0x377eb6cc, 0x98d7fb06, 0xfdb0c040, 0x1cf466d3,
+    0x79935d95, 0xd63a105f, 0xb35d2b19, 0x6bf1402c, 0x0e967b6a,
+    0xa13f36a0, 0xc4580de6, 0x251cab75, 0x407b9033, 0xefd2ddf9,
+    0x8ab5e6bf, 0xf62a969e, 0x934dadd8, 0x3ce4e012, 0x5983db54,
+    0xb8c77dc7, 0xdda04681, 0x72090b4b, 0x176e300d, 0x8b37eb09,
+    0xee50d04f, 0x41f99d85, 0x249ea6c3, 0xc5da0050, 0xa0bd3b16,
+    0x0f1476dc, 0x6a734d9a, 0x16ec3dbb, 0x738b06fd, 0xdc224b37,
+    0xb9457071, 0x5801d6e2, 0x3d66eda4, 0x92cfa06e, 0xf7a89b28,
+    0x710d1027, 0x146a2b61, 0xbbc366ab, 0xdea45ded, 0x3fe0fb7e,
+    0x5a87c038, 0xf52e8df2, 0x9049b6b4, 0xecd6c695, 0x89b1fdd3,
+    0x2618b019, 0x437f8b5f, 0xa23b2dcc, 0xc75c168a, 0x68f55b40,
+    0x0d926006, 0x91cbbb02, 0xf4ac8044, 0x5b05cd8e, 0x3e62f6c8,
+    0xdf26505b, 0xba416b1d, 0x15e826d7, 0x708f1d91, 0x0c106db0,
+    0x697756f6, 0xc6de1b3c, 0xa3b9207a, 0x42fd86e9, 0x279abdaf,
+    0x8833f065, 0xed54cb23, 0x5e09e03a, 0x3b6edb7c, 0x94c796b6,
+    0xf1a0adf0, 0x10e40b63, 0x75833025, 0xda2a7def, 0xbf4d46a9,
+    0xc3d23688, 0xa6b50dce, 0x091c4004, 0x6c7b7b42, 0x8d3fddd1,
+    0xe858e697, 0x47f1ab5d, 0x2296901b, 0xbecf4b1f, 0xdba87059,
+    0x74013d93, 0x116606d5, 0xf022a046, 0x95459b00, 0x3aecd6ca,
+    0x5f8bed8c, 0x23149dad, 0x4673a6eb, 0xe9daeb21, 0x8cbdd067,
+    0x6df976f4, 0x089e4db2, 0xa7370078, 0xc2503b3e, 0x44f5b031,
+    0x21928b77, 0x8e3bc6bd, 0xeb5cfdfb, 0x0a185b68, 0x6f7f602e,
+    0xc0d62de4, 0xa5b116a2, 0xd92e6683, 0xbc495dc5, 0x13e0100f,
+    0x76872b49, 0x97c38dda, 0xf2a4b69c, 0x5d0dfb56, 0x386ac010,
+    0xa4331b14, 0xc1542052, 0x6efd6d98, 0x0b9a56de, 0xeadef04d,
+    0x8fb9cb0b, 0x201086c1, 0x4577bd87, 0x39e8cda6, 0x5c8ff6e0,
+    0xf326bb2a, 0x9641806c, 0x770526ff, 0x12621db9, 0xbdcb5073,
+    0xd8ac6b35},
+   {0x00000000, 0xd7e28058, 0x74b406f1, 0xa35686a9, 0xe9680de2,
+    0x3e8a8dba, 0x9ddc0b13, 0x4a3e8b4b, 0x09a11d85, 0xde439ddd,
+    0x7d151b74, 0xaaf79b2c, 0xe0c91067, 0x372b903f, 0x947d1696,
+    0x439f96ce, 0x13423b0a, 0xc4a0bb52, 0x67f63dfb, 0xb014bda3,
+    0xfa2a36e8, 0x2dc8b6b0, 0x8e9e3019, 0x597cb041, 0x1ae3268f,
+    0xcd01a6d7, 0x6e57207e, 0xb9b5a026, 0xf38b2b6d, 0x2469ab35,
+    0x873f2d9c, 0x50ddadc4, 0x26847614, 0xf166f64c, 0x523070e5,
+    0x85d2f0bd, 0xcfec7bf6, 0x180efbae, 0xbb587d07, 0x6cbafd5f,
+    0x2f256b91, 0xf8c7ebc9, 0x5b916d60, 0x8c73ed38, 0xc64d6673,
+    0x11afe62b, 0xb2f96082, 0x651be0da, 0x35c64d1e, 0xe224cd46,
+    0x41724bef, 0x9690cbb7, 0xdcae40fc, 0x0b4cc0a4, 0xa81a460d,
+    0x7ff8c655, 0x3c67509b, 0xeb85d0c3, 0x48d3566a, 0x9f31d632,
+    0xd50f5d79, 0x02eddd21, 0xa1bb5b88, 0x7659dbd0, 0x4d08ec28,
+    0x9aea6c70, 0x39bcead9, 0xee5e6a81, 0xa460e1ca, 0x73826192,
+    0xd0d4e73b, 0x07366763, 0x44a9f1ad, 0x934b71f5, 0x301df75c,
+    0xe7ff7704, 0xadc1fc4f, 0x7a237c17, 0xd975fabe, 0x0e977ae6,
+    0x5e4ad722, 0x89a8577a, 0x2afed1d3, 0xfd1c518b, 0xb722dac0,
+    0x60c05a98, 0xc396dc31, 0x14745c69, 0x57ebcaa7, 0x80094aff,
+    0x235fcc56, 0xf4bd4c0e, 0xbe83c745, 0x6961471d, 0xca37c1b4,
+    0x1dd541ec, 0x6b8c9a3c, 0xbc6e1a64, 0x1f389ccd, 0xc8da1c95,
+    0x82e497de, 0x55061786, 0xf650912f, 0x21b21177, 0x622d87b9,
+    0xb5cf07e1, 0x16998148, 0xc17b0110, 0x8b458a5b, 0x5ca70a03,
+    0xfff18caa, 0x28130cf2, 0x78cea136, 0xaf2c216e, 0x0c7aa7c7,
+    0xdb98279f, 0x91a6acd4, 0x46442c8c, 0xe512aa25, 0x32f02a7d,
+    0x716fbcb3, 0xa68d3ceb, 0x05dbba42, 0xd2393a1a, 0x9807b151,
+    0x4fe53109, 0xecb3b7a0, 0x3b5137f8, 0x9a11d850, 0x4df35808,
+    0xeea5dea1, 0x39475ef9, 0x7379d5b2, 0xa49b55ea, 0x07cdd343,
+    0xd02f531b, 0x93b0c5d5, 0x4452458d, 0xe704c324, 0x30e6437c,
+    0x7ad8c837, 0xad3a486f, 0x0e6ccec6, 0xd98e4e9e, 0x8953e35a,
+    0x5eb16302, 0xfde7e5ab, 0x2a0565f3, 0x603beeb8, 0xb7d96ee0,
+    0x148fe849, 0xc36d6811, 0x80f2fedf, 0x57107e87, 0xf446f82e,
+    0x23a47876, 0x699af33d, 0xbe787365, 0x1d2ef5cc, 0xcacc7594,
+    0xbc95ae44, 0x6b772e1c, 0xc821a8b5, 0x1fc328ed, 0x55fda3a6,
+    0x821f23fe, 0x2149a557, 0xf6ab250f, 0xb534b3c1, 0x62d63399,
+    0xc180b530, 0x16623568, 0x5c5cbe23, 0x8bbe3e7b, 0x28e8b8d2,
+    0xff0a388a, 0xafd7954e, 0x78351516, 0xdb6393bf, 0x0c8113e7,
+    0x46bf98ac, 0x915d18f4, 0x320b9e5d, 0xe5e91e05, 0xa67688cb,
+    0x71940893, 0xd2c28e3a, 0x05200e62, 0x4f1e8529, 0x98fc0571,
+    0x3baa83d8, 0xec480380, 0xd7193478, 0x00fbb420, 0xa3ad3289,
+    0x744fb2d1, 0x3e71399a, 0xe993b9c2, 0x4ac53f6b, 0x9d27bf33,
+    0xdeb829fd, 0x095aa9a5, 0xaa0c2f0c, 0x7deeaf54, 0x37d0241f,
+    0xe032a447, 0x436422ee, 0x9486a2b6, 0xc45b0f72, 0x13b98f2a,
+    0xb0ef0983, 0x670d89db, 0x2d330290, 0xfad182c8, 0x59870461,
+    0x8e658439, 0xcdfa12f7, 0x1a1892af, 0xb94e1406, 0x6eac945e,
+    0x24921f15, 0xf3709f4d, 0x502619e4, 0x87c499bc, 0xf19d426c,
+    0x267fc234, 0x8529449d, 0x52cbc4c5, 0x18f54f8e, 0xcf17cfd6,
+    0x6c41497f, 0xbba3c927, 0xf83c5fe9, 0x2fdedfb1, 0x8c885918,
+    0x5b6ad940, 0x1154520b, 0xc6b6d253, 0x65e054fa, 0xb202d4a2,
+    0xe2df7966, 0x353df93e, 0x966b7f97, 0x4189ffcf, 0x0bb77484,
+    0xdc55f4dc, 0x7f037275, 0xa8e1f22d, 0xeb7e64e3, 0x3c9ce4bb,
+    0x9fca6212, 0x4828e24a, 0x02166901, 0xd5f4e959, 0x76a26ff0,
+    0xa140efa8},
+   {0x00000000, 0xef52b6e1, 0x05d46b83, 0xea86dd62, 0x0ba8d706,
+    0xe4fa61e7, 0x0e7cbc85, 0xe12e0a64, 0x1751ae0c, 0xf80318ed,
+    0x1285c58f, 0xfdd7736e, 0x1cf9790a, 0xf3abcfeb, 0x192d1289,
+    0xf67fa468, 0x2ea35c18, 0xc1f1eaf9, 0x2b77379b, 0xc425817a,
+    0x250b8b1e, 0xca593dff, 0x20dfe09d, 0xcf8d567c, 0x39f2f214,
+    0xd6a044f5, 0x3c269997, 0xd3742f76, 0x325a2512, 0xdd0893f3,
+    0x378e4e91, 0xd8dcf870, 0x5d46b830, 0xb2140ed1, 0x5892d3b3,
+    0xb7c06552, 0x56ee6f36, 0xb9bcd9d7, 0x533a04b5, 0xbc68b254,
+    0x4a17163c, 0xa545a0dd, 0x4fc37dbf, 0xa091cb5e, 0x41bfc13a,
+    0xaeed77db, 0x446baab9, 0xab391c58, 0x73e5e428, 0x9cb752c9,
+    0x76318fab, 0x9963394a, 0x784d332e, 0x971f85cf, 0x7d9958ad,
+    0x92cbee4c, 0x64b44a24, 0x8be6fcc5, 0x616021a7, 0x8e329746,
+    0x6f1c9d22, 0x804e2bc3, 0x6ac8f6a1, 0x859a4040, 0xba8d7060,
+    0x55dfc681, 0xbf591be3, 0x500bad02, 0xb125a766, 0x5e771187,
+    0xb4f1cce5, 0x5ba37a04, 0xaddcde6c, 0x428e688d, 0xa808b5ef,
+    0x475a030e, 0xa674096a, 0x4926bf8b, 0xa3a062e9, 0x4cf2d408,
+    0x942e2c78, 0x7b7c9a99, 0x91fa47fb, 0x7ea8f11a, 0x9f86fb7e,
+    0x70d44d9f, 0x9a5290fd, 0x7500261c, 0x837f8274, 0x6c2d3495,
+    0x86abe9f7, 0x69f95f16, 0x88d75572, 0x6785e393, 0x8d033ef1,
+    0x62518810, 0xe7cbc850, 0x08997eb1, 0xe21fa3d3, 0x0d4d1532,
+    0xec631f56, 0x0331a9b7, 0xe9b774d5, 0x06e5c234, 0xf09a665c,
+    0x1fc8d0bd, 0xf54e0ddf, 0x1a1cbb3e, 0xfb32b15a, 0x146007bb,
+    0xfee6dad9, 0x11b46c38, 0xc9689448, 0x263a22a9, 0xccbcffcb,
+    0x23ee492a, 0xc2c0434e, 0x2d92f5af, 0xc71428cd, 0x28469e2c,
+    0xde393a44, 0x316b8ca5, 0xdbed51c7, 0x34bfe726, 0xd591ed42,
+    0x3ac35ba3, 0xd04586c1, 0x3f173020, 0xae6be681, 0x41395060,
+    0xabbf8d02, 0x44ed3be3, 0xa5c33187, 0x4a918766, 0xa0175a04,
+    0x4f45ece5, 0xb93a488d, 0x5668fe6c, 0xbcee230e, 0x53bc95ef,
+    0xb2929f8b, 0x5dc0296a, 0xb746f408, 0x581442e9, 0x80c8ba99,
+    0x6f9a0c78, 0x851cd11a, 0x6a4e67fb, 0x8b606d9f, 0x6432db7e,
+    0x8eb4061c, 0x61e6b0fd, 0x97991495, 0x78cba274, 0x924d7f16,
+    0x7d1fc9f7, 0x9c31c393, 0x73637572, 0x99e5a810, 0x76b71ef1,
+    0xf32d5eb1, 0x1c7fe850, 0xf6f93532, 0x19ab83d3, 0xf88589b7,
+    0x17d73f56, 0xfd51e234, 0x120354d5, 0xe47cf0bd, 0x0b2e465c,
+    0xe1a89b3e, 0x0efa2ddf, 0xefd427bb, 0x0086915a, 0xea004c38,
+    0x0552fad9, 0xdd8e02a9, 0x32dcb448, 0xd85a692a, 0x3708dfcb,
+    0xd626d5af, 0x3974634e, 0xd3f2be2c, 0x3ca008cd, 0xcadfaca5,
+    0x258d1a44, 0xcf0bc726, 0x205971c7, 0xc1777ba3, 0x2e25cd42,
+    0xc4a31020, 0x2bf1a6c1, 0x14e696e1, 0xfbb42000, 0x1132fd62,
+    0xfe604b83, 0x1f4e41e7, 0xf01cf706, 0x1a9a2a64, 0xf5c89c85,
+    0x03b738ed, 0xece58e0c, 0x0663536e, 0xe931e58f, 0x081fefeb,
+    0xe74d590a, 0x0dcb8468, 0xe2993289, 0x3a45caf9, 0xd5177c18,
+    0x3f91a17a, 0xd0c3179b, 0x31ed1dff, 0xdebfab1e, 0x3439767c,
+    0xdb6bc09d, 0x2d1464f5, 0xc246d214, 0x28c00f76, 0xc792b997,
+    0x26bcb3f3, 0xc9ee0512, 0x2368d870, 0xcc3a6e91, 0x49a02ed1,
+    0xa6f29830, 0x4c744552, 0xa326f3b3, 0x4208f9d7, 0xad5a4f36,
+    0x47dc9254, 0xa88e24b5, 0x5ef180dd, 0xb1a3363c, 0x5b25eb5e,
+    0xb4775dbf, 0x555957db, 0xba0be13a, 0x508d3c58, 0xbfdf8ab9,
+    0x670372c9, 0x8851c428, 0x62d7194a, 0x8d85afab, 0x6caba5cf,
+    0x83f9132e, 0x697fce4c, 0x862d78ad, 0x7052dcc5, 0x9f006a24,
+    0x7586b746, 0x9ad401a7, 0x7bfa0bc3, 0x94a8bd22, 0x7e2e6040,
+    0x917cd6a1},
+   {0x00000000, 0x87a6cb43, 0xd43c90c7, 0x539a5b84, 0x730827cf,
+    0xf4aeec8c, 0xa734b708, 0x20927c4b, 0xe6104f9e, 0x61b684dd,
+    0x322cdf59, 0xb58a141a, 0x95186851, 0x12bea312, 0x4124f896,
+    0xc68233d5, 0x1751997d, 0x90f7523e, 0xc36d09ba, 0x44cbc2f9,
+    0x6459beb2, 0xe3ff75f1, 0xb0652e75, 0x37c3e536, 0xf141d6e3,
+    0x76e71da0, 0x257d4624, 0xa2db8d67, 0x8249f12c, 0x05ef3a6f,
+    0x567561eb, 0xd1d3aaa8, 0x2ea332fa, 0xa905f9b9, 0xfa9fa23d,
+    0x7d39697e, 0x5dab1535, 0xda0dde76, 0x899785f2, 0x0e314eb1,
+    0xc8b37d64, 0x4f15b627, 0x1c8feda3, 0x9b2926e0, 0xbbbb5aab,
+    0x3c1d91e8, 0x6f87ca6c, 0xe821012f, 0x39f2ab87, 0xbe5460c4,
+    0xedce3b40, 0x6a68f003, 0x4afa8c48, 0xcd5c470b, 0x9ec61c8f,
+    0x1960d7cc, 0xdfe2e419, 0x58442f5a, 0x0bde74de, 0x8c78bf9d,
+    0xaceac3d6, 0x2b4c0895, 0x78d65311, 0xff709852, 0x5d4665f4,
+    0xdae0aeb7, 0x897af533, 0x0edc3e70, 0x2e4e423b, 0xa9e88978,
+    0xfa72d2fc, 0x7dd419bf, 0xbb562a6a, 0x3cf0e129, 0x6f6abaad,
+    0xe8cc71ee, 0xc85e0da5, 0x4ff8c6e6, 0x1c629d62, 0x9bc45621,
+    0x4a17fc89, 0xcdb137ca, 0x9e2b6c4e, 0x198da70d, 0x391fdb46,
+    0xbeb91005, 0xed234b81, 0x6a8580c2, 0xac07b317, 0x2ba17854,
+    0x783b23d0, 0xff9de893, 0xdf0f94d8, 0x58a95f9b, 0x0b33041f,
+    0x8c95cf5c, 0x73e5570e, 0xf4439c4d, 0xa7d9c7c9, 0x207f0c8a,
+    0x00ed70c1, 0x874bbb82, 0xd4d1e006, 0x53772b45, 0x95f51890,
+    0x1253d3d3, 0x41c98857, 0xc66f4314, 0xe6fd3f5f, 0x615bf41c,
+    0x32c1af98, 0xb56764db, 0x64b4ce73, 0xe3120530, 0xb0885eb4,
+    0x372e95f7, 0x17bce9bc, 0x901a22ff, 0xc380797b, 0x4426b238,
+    0x82a481ed, 0x05024aae, 0x5698112a, 0xd13eda69, 0xf1aca622,
+    0x760a6d61, 0x259036e5, 0xa236fda6, 0xba8ccbe8, 0x3d2a00ab,
+    0x6eb05b2f, 0xe916906c, 0xc984ec27, 0x4e222764, 0x1db87ce0,
+    0x9a1eb7a3, 0x5c9c8476, 0xdb3a4f35, 0x88a014b1, 0x0f06dff2,
+    0x2f94a3b9, 0xa83268fa, 0xfba8337e, 0x7c0ef83d, 0xaddd5295,
+    0x2a7b99d6, 0x79e1c252, 0xfe470911, 0xded5755a, 0x5973be19,
+    0x0ae9e59d, 0x8d4f2ede, 0x4bcd1d0b, 0xcc6bd648, 0x9ff18dcc,
+    0x1857468f, 0x38c53ac4, 0xbf63f187, 0xecf9aa03, 0x6b5f6140,
+    0x942ff912, 0x13893251, 0x401369d5, 0xc7b5a296, 0xe727dedd,
+    0x6081159e, 0x331b4e1a, 0xb4bd8559, 0x723fb68c, 0xf5997dcf,
+    0xa603264b, 0x21a5ed08, 0x01379143, 0x86915a00, 0xd50b0184,
+    0x52adcac7, 0x837e606f, 0x04d8ab2c, 0x5742f0a8, 0xd0e43beb,
+    0xf07647a0, 0x77d08ce3, 0x244ad767, 0xa3ec1c24, 0x656e2ff1,
+    0xe2c8e4b2, 0xb152bf36, 0x36f47475, 0x1666083e, 0x91c0c37d,
+    0xc25a98f9, 0x45fc53ba, 0xe7caae1c, 0x606c655f, 0x33f63edb,
+    0xb450f598, 0x94c289d3, 0x13644290, 0x40fe1914, 0xc758d257,
+    0x01dae182, 0x867c2ac1, 0xd5e67145, 0x5240ba06, 0x72d2c64d,
+    0xf5740d0e, 0xa6ee568a, 0x21489dc9, 0xf09b3761, 0x773dfc22,
+    0x24a7a7a6, 0xa3016ce5, 0x839310ae, 0x0435dbed, 0x57af8069,
+    0xd0094b2a, 0x168b78ff, 0x912db3bc, 0xc2b7e838, 0x4511237b,
+    0x65835f30, 0xe2259473, 0xb1bfcff7, 0x361904b4, 0xc9699ce6,
+    0x4ecf57a5, 0x1d550c21, 0x9af3c762, 0xba61bb29, 0x3dc7706a,
+    0x6e5d2bee, 0xe9fbe0ad, 0x2f79d378, 0xa8df183b, 0xfb4543bf,
+    0x7ce388fc, 0x5c71f4b7, 0xdbd73ff4, 0x884d6470, 0x0febaf33,
+    0xde38059b, 0x599eced8, 0x0a04955c, 0x8da25e1f, 0xad302254,
+    0x2a96e917, 0x790cb293, 0xfeaa79d0, 0x38284a05, 0xbf8e8146,
+    0xec14dac2, 0x6bb21181, 0x4b206dca, 0xcc86a689, 0x9f1cfd0d,
+    0x18ba364e}};
+
+static const z_word_t crc_braid_big_table[][256] = {
+   {0x0000000000000000, 0x43cba68700000000, 0xc7903cd400000000,
+    0x845b9a5300000000, 0xcf27087300000000, 0x8cecaef400000000,
+    0x08b734a700000000, 0x4b7c922000000000, 0x9e4f10e600000000,
+    0xdd84b66100000000, 0x59df2c3200000000, 0x1a148ab500000000,
+    0x5168189500000000, 0x12a3be1200000000, 0x96f8244100000000,
+    0xd53382c600000000, 0x7d99511700000000, 0x3e52f79000000000,
+    0xba096dc300000000, 0xf9c2cb4400000000, 0xb2be596400000000,
+    0xf175ffe300000000, 0x752e65b000000000, 0x36e5c33700000000,
+    0xe3d641f100000000, 0xa01de77600000000, 0x24467d2500000000,
+    0x678ddba200000000, 0x2cf1498200000000, 0x6f3aef0500000000,
+    0xeb61755600000000, 0xa8aad3d100000000, 0xfa32a32e00000000,
+    0xb9f905a900000000, 0x3da29ffa00000000, 0x7e69397d00000000,
+    0x3515ab5d00000000, 0x76de0dda00000000, 0xf285978900000000,
+    0xb14e310e00000000, 0x647db3c800000000, 0x27b6154f00000000,
+    0xa3ed8f1c00000000, 0xe026299b00000000, 0xab5abbbb00000000,
+    0xe8911d3c00000000, 0x6cca876f00000000, 0x2f0121e800000000,
+    0x87abf23900000000, 0xc46054be00000000, 0x403bceed00000000,
+    0x03f0686a00000000, 0x488cfa4a00000000, 0x0b475ccd00000000,
+    0x8f1cc69e00000000, 0xccd7601900000000, 0x19e4e2df00000000,
+    0x5a2f445800000000, 0xde74de0b00000000, 0x9dbf788c00000000,
+    0xd6c3eaac00000000, 0x95084c2b00000000, 0x1153d67800000000,
+    0x529870ff00000000, 0xf465465d00000000, 0xb7aee0da00000000,
+    0x33f57a8900000000, 0x703edc0e00000000, 0x3b424e2e00000000,
+    0x7889e8a900000000, 0xfcd272fa00000000, 0xbf19d47d00000000,
+    0x6a2a56bb00000000, 0x29e1f03c00000000, 0xadba6a6f00000000,
+    0xee71cce800000000, 0xa50d5ec800000000, 0xe6c6f84f00000000,
+    0x629d621c00000000, 0x2156c49b00000000, 0x89fc174a00000000,
+    0xca37b1cd00000000, 0x4e6c2b9e00000000, 0x0da78d1900000000,
+    0x46db1f3900000000, 0x0510b9be00000000, 0x814b23ed00000000,
+    0xc280856a00000000, 0x17b307ac00000000, 0x5478a12b00000000,
+    0xd0233b7800000000, 0x93e89dff00000000, 0xd8940fdf00000000,
+    0x9b5fa95800000000, 0x1f04330b00000000, 0x5ccf958c00000000,
+    0x0e57e57300000000, 0x4d9c43f400000000, 0xc9c7d9a700000000,
+    0x8a0c7f2000000000, 0xc170ed0000000000, 0x82bb4b8700000000,
+    0x06e0d1d400000000, 0x452b775300000000, 0x9018f59500000000,
+    0xd3d3531200000000, 0x5788c94100000000, 0x14436fc600000000,
+    0x5f3ffde600000000, 0x1cf45b6100000000, 0x98afc13200000000,
+    0xdb6467b500000000, 0x73ceb46400000000, 0x300512e300000000,
+    0xb45e88b000000000, 0xf7952e3700000000, 0xbce9bc1700000000,
+    0xff221a9000000000, 0x7b7980c300000000, 0x38b2264400000000,
+    0xed81a48200000000, 0xae4a020500000000, 0x2a11985600000000,
+    0x69da3ed100000000, 0x22a6acf100000000, 0x616d0a7600000000,
+    0xe536902500000000, 0xa6fd36a200000000, 0xe8cb8cba00000000,
+    0xab002a3d00000000, 0x2f5bb06e00000000, 0x6c9016e900000000,
+    0x27ec84c900000000, 0x6427224e00000000, 0xe07cb81d00000000,
+    0xa3b71e9a00000000, 0x76849c5c00000000, 0x354f3adb00000000,
+    0xb114a08800000000, 0xf2df060f00000000, 0xb9a3942f00000000,
+    0xfa6832a800000000, 0x7e33a8fb00000000, 0x3df80e7c00000000,
+    0x9552ddad00000000, 0xd6997b2a00000000, 0x52c2e17900000000,
+    0x110947fe00000000, 0x5a75d5de00000000, 0x19be735900000000,
+    0x9de5e90a00000000, 0xde2e4f8d00000000, 0x0b1dcd4b00000000,
+    0x48d66bcc00000000, 0xcc8df19f00000000, 0x8f46571800000000,
+    0xc43ac53800000000, 0x87f163bf00000000, 0x03aaf9ec00000000,
+    0x40615f6b00000000, 0x12f92f9400000000, 0x5132891300000000,
+    0xd569134000000000, 0x96a2b5c700000000, 0xddde27e700000000,
+    0x9e15816000000000, 0x1a4e1b3300000000, 0x5985bdb400000000,
+    0x8cb63f7200000000, 0xcf7d99f500000000, 0x4b2603a600000000,
+    0x08eda52100000000, 0x4391370100000000, 0x005a918600000000,
+    0x84010bd500000000, 0xc7caad5200000000, 0x6f607e8300000000,
+    0x2cabd80400000000, 0xa8f0425700000000, 0xeb3be4d000000000,
+    0xa04776f000000000, 0xe38cd07700000000, 0x67d74a2400000000,
+    0x241ceca300000000, 0xf12f6e6500000000, 0xb2e4c8e200000000,
+    0x36bf52b100000000, 0x7574f43600000000, 0x3e08661600000000,
+    0x7dc3c09100000000, 0xf9985ac200000000, 0xba53fc4500000000,
+    0x1caecae700000000, 0x5f656c6000000000, 0xdb3ef63300000000,
+    0x98f550b400000000, 0xd389c29400000000, 0x9042641300000000,
+    0x1419fe4000000000, 0x57d258c700000000, 0x82e1da0100000000,
+    0xc12a7c8600000000, 0x4571e6d500000000, 0x06ba405200000000,
+    0x4dc6d27200000000, 0x0e0d74f500000000, 0x8a56eea600000000,
+    0xc99d482100000000, 0x61379bf000000000, 0x22fc3d7700000000,
+    0xa6a7a72400000000, 0xe56c01a300000000, 0xae10938300000000,
+    0xeddb350400000000, 0x6980af5700000000, 0x2a4b09d000000000,
+    0xff788b1600000000, 0xbcb32d9100000000, 0x38e8b7c200000000,
+    0x7b23114500000000, 0x305f836500000000, 0x739425e200000000,
+    0xf7cfbfb100000000, 0xb404193600000000, 0xe69c69c900000000,
+    0xa557cf4e00000000, 0x210c551d00000000, 0x62c7f39a00000000,
+    0x29bb61ba00000000, 0x6a70c73d00000000, 0xee2b5d6e00000000,
+    0xade0fbe900000000, 0x78d3792f00000000, 0x3b18dfa800000000,
+    0xbf4345fb00000000, 0xfc88e37c00000000, 0xb7f4715c00000000,
+    0xf43fd7db00000000, 0x70644d8800000000, 0x33afeb0f00000000,
+    0x9b0538de00000000, 0xd8ce9e5900000000, 0x5c95040a00000000,
+    0x1f5ea28d00000000, 0x542230ad00000000, 0x17e9962a00000000,
+    0x93b20c7900000000, 0xd079aafe00000000, 0x054a283800000000,
+    0x46818ebf00000000, 0xc2da14ec00000000, 0x8111b26b00000000,
+    0xca6d204b00000000, 0x89a686cc00000000, 0x0dfd1c9f00000000,
+    0x4e36ba1800000000},
+   {0x0000000000000000, 0xe1b652ef00000000, 0x836bd40500000000,
+    0x62dd86ea00000000, 0x06d7a80b00000000, 0xe761fae400000000,
+    0x85bc7c0e00000000, 0x640a2ee100000000, 0x0cae511700000000,
+    0xed1803f800000000, 0x8fc5851200000000, 0x6e73d7fd00000000,
+    0x0a79f91c00000000, 0xebcfabf300000000, 0x89122d1900000000,
+    0x68a47ff600000000, 0x185ca32e00000000, 0xf9eaf1c100000000,
+    0x9b37772b00000000, 0x7a8125c400000000, 0x1e8b0b2500000000,
+    0xff3d59ca00000000, 0x9de0df2000000000, 0x7c568dcf00000000,
+    0x14f2f23900000000, 0xf544a0d600000000, 0x9799263c00000000,
+    0x762f74d300000000, 0x12255a3200000000, 0xf39308dd00000000,
+    0x914e8e3700000000, 0x70f8dcd800000000, 0x30b8465d00000000,
+    0xd10e14b200000000, 0xb3d3925800000000, 0x5265c0b700000000,
+    0x366fee5600000000, 0xd7d9bcb900000000, 0xb5043a5300000000,
+    0x54b268bc00000000, 0x3c16174a00000000, 0xdda045a500000000,
+    0xbf7dc34f00000000, 0x5ecb91a000000000, 0x3ac1bf4100000000,
+    0xdb77edae00000000, 0xb9aa6b4400000000, 0x581c39ab00000000,
+    0x28e4e57300000000, 0xc952b79c00000000, 0xab8f317600000000,
+    0x4a39639900000000, 0x2e334d7800000000, 0xcf851f9700000000,
+    0xad58997d00000000, 0x4ceecb9200000000, 0x244ab46400000000,
+    0xc5fce68b00000000, 0xa721606100000000, 0x4697328e00000000,
+    0x229d1c6f00000000, 0xc32b4e8000000000, 0xa1f6c86a00000000,
+    0x40409a8500000000, 0x60708dba00000000, 0x81c6df5500000000,
+    0xe31b59bf00000000, 0x02ad0b5000000000, 0x66a725b100000000,
+    0x8711775e00000000, 0xe5ccf1b400000000, 0x047aa35b00000000,
+    0x6cdedcad00000000, 0x8d688e4200000000, 0xefb508a800000000,
+    0x0e035a4700000000, 0x6a0974a600000000, 0x8bbf264900000000,
+    0xe962a0a300000000, 0x08d4f24c00000000, 0x782c2e9400000000,
+    0x999a7c7b00000000, 0xfb47fa9100000000, 0x1af1a87e00000000,
+    0x7efb869f00000000, 0x9f4dd47000000000, 0xfd90529a00000000,
+    0x1c26007500000000, 0x74827f8300000000, 0x95342d6c00000000,
+    0xf7e9ab8600000000, 0x165ff96900000000, 0x7255d78800000000,
+    0x93e3856700000000, 0xf13e038d00000000, 0x1088516200000000,
+    0x50c8cbe700000000, 0xb17e990800000000, 0xd3a31fe200000000,
+    0x32154d0d00000000, 0x561f63ec00000000, 0xb7a9310300000000,
+    0xd574b7e900000000, 0x34c2e50600000000, 0x5c669af000000000,
+    0xbdd0c81f00000000, 0xdf0d4ef500000000, 0x3ebb1c1a00000000,
+    0x5ab132fb00000000, 0xbb07601400000000, 0xd9dae6fe00000000,
+    0x386cb41100000000, 0x489468c900000000, 0xa9223a2600000000,
+    0xcbffbccc00000000, 0x2a49ee2300000000, 0x4e43c0c200000000,
+    0xaff5922d00000000, 0xcd2814c700000000, 0x2c9e462800000000,
+    0x443a39de00000000, 0xa58c6b3100000000, 0xc751eddb00000000,
+    0x26e7bf3400000000, 0x42ed91d500000000, 0xa35bc33a00000000,
+    0xc18645d000000000, 0x2030173f00000000, 0x81e66bae00000000,
+    0x6050394100000000, 0x028dbfab00000000, 0xe33bed4400000000,
+    0x8731c3a500000000, 0x6687914a00000000, 0x045a17a000000000,
+    0xe5ec454f00000000, 0x8d483ab900000000, 0x6cfe685600000000,
+    0x0e23eebc00000000, 0xef95bc5300000000, 0x8b9f92b200000000,
+    0x6a29c05d00000000, 0x08f446b700000000, 0xe942145800000000,
+    0x99bac88000000000, 0x780c9a6f00000000, 0x1ad11c8500000000,
+    0xfb674e6a00000000, 0x9f6d608b00000000, 0x7edb326400000000,
+    0x1c06b48e00000000, 0xfdb0e66100000000, 0x9514999700000000,
+    0x74a2cb7800000000, 0x167f4d9200000000, 0xf7c91f7d00000000,
+    0x93c3319c00000000, 0x7275637300000000, 0x10a8e59900000000,
+    0xf11eb77600000000, 0xb15e2df300000000, 0x50e87f1c00000000,
+    0x3235f9f600000000, 0xd383ab1900000000, 0xb78985f800000000,
+    0x563fd71700000000, 0x34e251fd00000000, 0xd554031200000000,
+    0xbdf07ce400000000, 0x5c462e0b00000000, 0x3e9ba8e100000000,
+    0xdf2dfa0e00000000, 0xbb27d4ef00000000, 0x5a91860000000000,
+    0x384c00ea00000000, 0xd9fa520500000000, 0xa9028edd00000000,
+    0x48b4dc3200000000, 0x2a695ad800000000, 0xcbdf083700000000,
+    0xafd526d600000000, 0x4e63743900000000, 0x2cbef2d300000000,
+    0xcd08a03c00000000, 0xa5acdfca00000000, 0x441a8d2500000000,
+    0x26c70bcf00000000, 0xc771592000000000, 0xa37b77c100000000,
+    0x42cd252e00000000, 0x2010a3c400000000, 0xc1a6f12b00000000,
+    0xe196e61400000000, 0x0020b4fb00000000, 0x62fd321100000000,
+    0x834b60fe00000000, 0xe7414e1f00000000, 0x06f71cf000000000,
+    0x642a9a1a00000000, 0x859cc8f500000000, 0xed38b70300000000,
+    0x0c8ee5ec00000000, 0x6e53630600000000, 0x8fe531e900000000,
+    0xebef1f0800000000, 0x0a594de700000000, 0x6884cb0d00000000,
+    0x893299e200000000, 0xf9ca453a00000000, 0x187c17d500000000,
+    0x7aa1913f00000000, 0x9b17c3d000000000, 0xff1ded3100000000,
+    0x1eabbfde00000000, 0x7c76393400000000, 0x9dc06bdb00000000,
+    0xf564142d00000000, 0x14d246c200000000, 0x760fc02800000000,
+    0x97b992c700000000, 0xf3b3bc2600000000, 0x1205eec900000000,
+    0x70d8682300000000, 0x916e3acc00000000, 0xd12ea04900000000,
+    0x3098f2a600000000, 0x5245744c00000000, 0xb3f326a300000000,
+    0xd7f9084200000000, 0x364f5aad00000000, 0x5492dc4700000000,
+    0xb5248ea800000000, 0xdd80f15e00000000, 0x3c36a3b100000000,
+    0x5eeb255b00000000, 0xbf5d77b400000000, 0xdb57595500000000,
+    0x3ae10bba00000000, 0x583c8d5000000000, 0xb98adfbf00000000,
+    0xc972036700000000, 0x28c4518800000000, 0x4a19d76200000000,
+    0xabaf858d00000000, 0xcfa5ab6c00000000, 0x2e13f98300000000,
+    0x4cce7f6900000000, 0xad782d8600000000, 0xc5dc527000000000,
+    0x246a009f00000000, 0x46b7867500000000, 0xa701d49a00000000,
+    0xc30bfa7b00000000, 0x22bda89400000000, 0x40602e7e00000000,
+    0xa1d67c9100000000},
+   {0x0000000000000000, 0x5880e2d700000000, 0xf106b47400000000,
+    0xa98656a300000000, 0xe20d68e900000000, 0xba8d8a3e00000000,
+    0x130bdc9d00000000, 0x4b8b3e4a00000000, 0x851da10900000000,
+    0xdd9d43de00000000, 0x741b157d00000000, 0x2c9bf7aa00000000,
+    0x6710c9e000000000, 0x3f902b3700000000, 0x96167d9400000000,
+    0xce969f4300000000, 0x0a3b421300000000, 0x52bba0c400000000,
+    0xfb3df66700000000, 0xa3bd14b000000000, 0xe8362afa00000000,
+    0xb0b6c82d00000000, 0x19309e8e00000000, 0x41b07c5900000000,
+    0x8f26e31a00000000, 0xd7a601cd00000000, 0x7e20576e00000000,
+    0x26a0b5b900000000, 0x6d2b8bf300000000, 0x35ab692400000000,
+    0x9c2d3f8700000000, 0xc4addd5000000000, 0x1476842600000000,
+    0x4cf666f100000000, 0xe570305200000000, 0xbdf0d28500000000,
+    0xf67beccf00000000, 0xaefb0e1800000000, 0x077d58bb00000000,
+    0x5ffdba6c00000000, 0x916b252f00000000, 0xc9ebc7f800000000,
+    0x606d915b00000000, 0x38ed738c00000000, 0x73664dc600000000,
+    0x2be6af1100000000, 0x8260f9b200000000, 0xdae01b6500000000,
+    0x1e4dc63500000000, 0x46cd24e200000000, 0xef4b724100000000,
+    0xb7cb909600000000, 0xfc40aedc00000000, 0xa4c04c0b00000000,
+    0x0d461aa800000000, 0x55c6f87f00000000, 0x9b50673c00000000,
+    0xc3d085eb00000000, 0x6a56d34800000000, 0x32d6319f00000000,
+    0x795d0fd500000000, 0x21dded0200000000, 0x885bbba100000000,
+    0xd0db597600000000, 0x28ec084d00000000, 0x706cea9a00000000,
+    0xd9eabc3900000000, 0x816a5eee00000000, 0xcae160a400000000,
+    0x9261827300000000, 0x3be7d4d000000000, 0x6367360700000000,
+    0xadf1a94400000000, 0xf5714b9300000000, 0x5cf71d3000000000,
+    0x0477ffe700000000, 0x4ffcc1ad00000000, 0x177c237a00000000,
+    0xbefa75d900000000, 0xe67a970e00000000, 0x22d74a5e00000000,
+    0x7a57a88900000000, 0xd3d1fe2a00000000, 0x8b511cfd00000000,
+    0xc0da22b700000000, 0x985ac06000000000, 0x31dc96c300000000,
+    0x695c741400000000, 0xa7caeb5700000000, 0xff4a098000000000,
+    0x56cc5f2300000000, 0x0e4cbdf400000000, 0x45c783be00000000,
+    0x1d47616900000000, 0xb4c137ca00000000, 0xec41d51d00000000,
+    0x3c9a8c6b00000000, 0x641a6ebc00000000, 0xcd9c381f00000000,
+    0x951cdac800000000, 0xde97e48200000000, 0x8617065500000000,
+    0x2f9150f600000000, 0x7711b22100000000, 0xb9872d6200000000,
+    0xe107cfb500000000, 0x4881991600000000, 0x10017bc100000000,
+    0x5b8a458b00000000, 0x030aa75c00000000, 0xaa8cf1ff00000000,
+    0xf20c132800000000, 0x36a1ce7800000000, 0x6e212caf00000000,
+    0xc7a77a0c00000000, 0x9f2798db00000000, 0xd4aca69100000000,
+    0x8c2c444600000000, 0x25aa12e500000000, 0x7d2af03200000000,
+    0xb3bc6f7100000000, 0xeb3c8da600000000, 0x42badb0500000000,
+    0x1a3a39d200000000, 0x51b1079800000000, 0x0931e54f00000000,
+    0xa0b7b3ec00000000, 0xf837513b00000000, 0x50d8119a00000000,
+    0x0858f34d00000000, 0xa1dea5ee00000000, 0xf95e473900000000,
+    0xb2d5797300000000, 0xea559ba400000000, 0x43d3cd0700000000,
+    0x1b532fd000000000, 0xd5c5b09300000000, 0x8d45524400000000,
+    0x24c304e700000000, 0x7c43e63000000000, 0x37c8d87a00000000,
+    0x6f483aad00000000, 0xc6ce6c0e00000000, 0x9e4e8ed900000000,
+    0x5ae3538900000000, 0x0263b15e00000000, 0xabe5e7fd00000000,
+    0xf365052a00000000, 0xb8ee3b6000000000, 0xe06ed9b700000000,
+    0x49e88f1400000000, 0x11686dc300000000, 0xdffef28000000000,
+    0x877e105700000000, 0x2ef846f400000000, 0x7678a42300000000,
+    0x3df39a6900000000, 0x657378be00000000, 0xccf52e1d00000000,
+    0x9475ccca00000000, 0x44ae95bc00000000, 0x1c2e776b00000000,
+    0xb5a821c800000000, 0xed28c31f00000000, 0xa6a3fd5500000000,
+    0xfe231f8200000000, 0x57a5492100000000, 0x0f25abf600000000,
+    0xc1b334b500000000, 0x9933d66200000000, 0x30b580c100000000,
+    0x6835621600000000, 0x23be5c5c00000000, 0x7b3ebe8b00000000,
+    0xd2b8e82800000000, 0x8a380aff00000000, 0x4e95d7af00000000,
+    0x1615357800000000, 0xbf9363db00000000, 0xe713810c00000000,
+    0xac98bf4600000000, 0xf4185d9100000000, 0x5d9e0b3200000000,
+    0x051ee9e500000000, 0xcb8876a600000000, 0x9308947100000000,
+    0x3a8ec2d200000000, 0x620e200500000000, 0x29851e4f00000000,
+    0x7105fc9800000000, 0xd883aa3b00000000, 0x800348ec00000000,
+    0x783419d700000000, 0x20b4fb0000000000, 0x8932ada300000000,
+    0xd1b24f7400000000, 0x9a39713e00000000, 0xc2b993e900000000,
+    0x6b3fc54a00000000, 0x33bf279d00000000, 0xfd29b8de00000000,
+    0xa5a95a0900000000, 0x0c2f0caa00000000, 0x54afee7d00000000,
+    0x1f24d03700000000, 0x47a432e000000000, 0xee22644300000000,
+    0xb6a2869400000000, 0x720f5bc400000000, 0x2a8fb91300000000,
+    0x8309efb000000000, 0xdb890d6700000000, 0x9002332d00000000,
+    0xc882d1fa00000000, 0x6104875900000000, 0x3984658e00000000,
+    0xf712facd00000000, 0xaf92181a00000000, 0x06144eb900000000,
+    0x5e94ac6e00000000, 0x151f922400000000, 0x4d9f70f300000000,
+    0xe419265000000000, 0xbc99c48700000000, 0x6c429df100000000,
+    0x34c27f2600000000, 0x9d44298500000000, 0xc5c4cb5200000000,
+    0x8e4ff51800000000, 0xd6cf17cf00000000, 0x7f49416c00000000,
+    0x27c9a3bb00000000, 0xe95f3cf800000000, 0xb1dfde2f00000000,
+    0x1859888c00000000, 0x40d96a5b00000000, 0x0b52541100000000,
+    0x53d2b6c600000000, 0xfa54e06500000000, 0xa2d402b200000000,
+    0x6679dfe200000000, 0x3ef93d3500000000, 0x977f6b9600000000,
+    0xcfff894100000000, 0x8474b70b00000000, 0xdcf455dc00000000,
+    0x7572037f00000000, 0x2df2e1a800000000, 0xe3647eeb00000000,
+    0xbbe49c3c00000000, 0x1262ca9f00000000, 0x4ae2284800000000,
+    0x0169160200000000, 0x59e9f4d500000000, 0xf06fa27600000000,
+    0xa8ef40a100000000},
+   {0x0000000000000000, 0x463b676500000000, 0x8c76ceca00000000,
+    0xca4da9af00000000, 0x59ebed4e00000000, 0x1fd08a2b00000000,
+    0xd59d238400000000, 0x93a644e100000000, 0xb2d6db9d00000000,
+    0xf4edbcf800000000, 0x3ea0155700000000, 0x789b723200000000,
+    0xeb3d36d300000000, 0xad0651b600000000, 0x674bf81900000000,
+    0x21709f7c00000000, 0x25abc6e000000000, 0x6390a18500000000,
+    0xa9dd082a00000000, 0xefe66f4f00000000, 0x7c402bae00000000,
+    0x3a7b4ccb00000000, 0xf036e56400000000, 0xb60d820100000000,
+    0x977d1d7d00000000, 0xd1467a1800000000, 0x1b0bd3b700000000,
+    0x5d30b4d200000000, 0xce96f03300000000, 0x88ad975600000000,
+    0x42e03ef900000000, 0x04db599c00000000, 0x0b50fc1a00000000,
+    0x4d6b9b7f00000000, 0x872632d000000000, 0xc11d55b500000000,
+    0x52bb115400000000, 0x1480763100000000, 0xdecddf9e00000000,
+    0x98f6b8fb00000000, 0xb986278700000000, 0xffbd40e200000000,
+    0x35f0e94d00000000, 0x73cb8e2800000000, 0xe06dcac900000000,
+    0xa656adac00000000, 0x6c1b040300000000, 0x2a20636600000000,
+    0x2efb3afa00000000, 0x68c05d9f00000000, 0xa28df43000000000,
+    0xe4b6935500000000, 0x7710d7b400000000, 0x312bb0d100000000,
+    0xfb66197e00000000, 0xbd5d7e1b00000000, 0x9c2de16700000000,
+    0xda16860200000000, 0x105b2fad00000000, 0x566048c800000000,
+    0xc5c60c2900000000, 0x83fd6b4c00000000, 0x49b0c2e300000000,
+    0x0f8ba58600000000, 0x16a0f83500000000, 0x509b9f5000000000,
+    0x9ad636ff00000000, 0xdced519a00000000, 0x4f4b157b00000000,
+    0x0970721e00000000, 0xc33ddbb100000000, 0x8506bcd400000000,
+    0xa47623a800000000, 0xe24d44cd00000000, 0x2800ed6200000000,
+    0x6e3b8a0700000000, 0xfd9dcee600000000, 0xbba6a98300000000,
+    0x71eb002c00000000, 0x37d0674900000000, 0x330b3ed500000000,
+    0x753059b000000000, 0xbf7df01f00000000, 0xf946977a00000000,
+    0x6ae0d39b00000000, 0x2cdbb4fe00000000, 0xe6961d5100000000,
+    0xa0ad7a3400000000, 0x81dde54800000000, 0xc7e6822d00000000,
+    0x0dab2b8200000000, 0x4b904ce700000000, 0xd836080600000000,
+    0x9e0d6f6300000000, 0x5440c6cc00000000, 0x127ba1a900000000,
+    0x1df0042f00000000, 0x5bcb634a00000000, 0x9186cae500000000,
+    0xd7bdad8000000000, 0x441be96100000000, 0x02208e0400000000,
+    0xc86d27ab00000000, 0x8e5640ce00000000, 0xaf26dfb200000000,
+    0xe91db8d700000000, 0x2350117800000000, 0x656b761d00000000,
+    0xf6cd32fc00000000, 0xb0f6559900000000, 0x7abbfc3600000000,
+    0x3c809b5300000000, 0x385bc2cf00000000, 0x7e60a5aa00000000,
+    0xb42d0c0500000000, 0xf2166b6000000000, 0x61b02f8100000000,
+    0x278b48e400000000, 0xedc6e14b00000000, 0xabfd862e00000000,
+    0x8a8d195200000000, 0xccb67e3700000000, 0x06fbd79800000000,
+    0x40c0b0fd00000000, 0xd366f41c00000000, 0x955d937900000000,
+    0x5f103ad600000000, 0x192b5db300000000, 0x2c40f16b00000000,
+    0x6a7b960e00000000, 0xa0363fa100000000, 0xe60d58c400000000,
+    0x75ab1c2500000000, 0x33907b4000000000, 0xf9ddd2ef00000000,
+    0xbfe6b58a00000000, 0x9e962af600000000, 0xd8ad4d9300000000,
+    0x12e0e43c00000000, 0x54db835900000000, 0xc77dc7b800000000,
+    0x8146a0dd00000000, 0x4b0b097200000000, 0x0d306e1700000000,
+    0x09eb378b00000000, 0x4fd050ee00000000, 0x859df94100000000,
+    0xc3a69e2400000000, 0x5000dac500000000, 0x163bbda000000000,
+    0xdc76140f00000000, 0x9a4d736a00000000, 0xbb3dec1600000000,
+    0xfd068b7300000000, 0x374b22dc00000000, 0x717045b900000000,
+    0xe2d6015800000000, 0xa4ed663d00000000, 0x6ea0cf9200000000,
+    0x289ba8f700000000, 0x27100d7100000000, 0x612b6a1400000000,
+    0xab66c3bb00000000, 0xed5da4de00000000, 0x7efbe03f00000000,
+    0x38c0875a00000000, 0xf28d2ef500000000, 0xb4b6499000000000,
+    0x95c6d6ec00000000, 0xd3fdb18900000000, 0x19b0182600000000,
+    0x5f8b7f4300000000, 0xcc2d3ba200000000, 0x8a165cc700000000,
+    0x405bf56800000000, 0x0660920d00000000, 0x02bbcb9100000000,
+    0x4480acf400000000, 0x8ecd055b00000000, 0xc8f6623e00000000,
+    0x5b5026df00000000, 0x1d6b41ba00000000, 0xd726e81500000000,
+    0x911d8f7000000000, 0xb06d100c00000000, 0xf656776900000000,
+    0x3c1bdec600000000, 0x7a20b9a300000000, 0xe986fd4200000000,
+    0xafbd9a2700000000, 0x65f0338800000000, 0x23cb54ed00000000,
+    0x3ae0095e00000000, 0x7cdb6e3b00000000, 0xb696c79400000000,
+    0xf0ada0f100000000, 0x630be41000000000, 0x2530837500000000,
+    0xef7d2ada00000000, 0xa9464dbf00000000, 0x8836d2c300000000,
+    0xce0db5a600000000, 0x04401c0900000000, 0x427b7b6c00000000,
+    0xd1dd3f8d00000000, 0x97e658e800000000, 0x5dabf14700000000,
+    0x1b90962200000000, 0x1f4bcfbe00000000, 0x5970a8db00000000,
+    0x933d017400000000, 0xd506661100000000, 0x46a022f000000000,
+    0x009b459500000000, 0xcad6ec3a00000000, 0x8ced8b5f00000000,
+    0xad9d142300000000, 0xeba6734600000000, 0x21ebdae900000000,
+    0x67d0bd8c00000000, 0xf476f96d00000000, 0xb24d9e0800000000,
+    0x780037a700000000, 0x3e3b50c200000000, 0x31b0f54400000000,
+    0x778b922100000000, 0xbdc63b8e00000000, 0xfbfd5ceb00000000,
+    0x685b180a00000000, 0x2e607f6f00000000, 0xe42dd6c000000000,
+    0xa216b1a500000000, 0x83662ed900000000, 0xc55d49bc00000000,
+    0x0f10e01300000000, 0x492b877600000000, 0xda8dc39700000000,
+    0x9cb6a4f200000000, 0x56fb0d5d00000000, 0x10c06a3800000000,
+    0x141b33a400000000, 0x522054c100000000, 0x986dfd6e00000000,
+    0xde569a0b00000000, 0x4df0deea00000000, 0x0bcbb98f00000000,
+    0xc186102000000000, 0x87bd774500000000, 0xa6cde83900000000,
+    0xe0f68f5c00000000, 0x2abb26f300000000, 0x6c80419600000000,
+    0xff26057700000000, 0xb91d621200000000, 0x7350cbbd00000000,
+    0x356bacd800000000},
+   {0x0000000000000000, 0x9e83da9f00000000, 0x7d01c4e400000000,
+    0xe3821e7b00000000, 0xbb04f91200000000, 0x2587238d00000000,
+    0xc6053df600000000, 0x5886e76900000000, 0x7609f22500000000,
+    0xe88a28ba00000000, 0x0b0836c100000000, 0x958bec5e00000000,
+    0xcd0d0b3700000000, 0x538ed1a800000000, 0xb00ccfd300000000,
+    0x2e8f154c00000000, 0xec12e44b00000000, 0x72913ed400000000,
+    0x911320af00000000, 0x0f90fa3000000000, 0x57161d5900000000,
+    0xc995c7c600000000, 0x2a17d9bd00000000, 0xb494032200000000,
+    0x9a1b166e00000000, 0x0498ccf100000000, 0xe71ad28a00000000,
+    0x7999081500000000, 0x211fef7c00000000, 0xbf9c35e300000000,
+    0x5c1e2b9800000000, 0xc29df10700000000, 0xd825c89700000000,
+    0x46a6120800000000, 0xa5240c7300000000, 0x3ba7d6ec00000000,
+    0x6321318500000000, 0xfda2eb1a00000000, 0x1e20f56100000000,
+    0x80a32ffe00000000, 0xae2c3ab200000000, 0x30afe02d00000000,
+    0xd32dfe5600000000, 0x4dae24c900000000, 0x1528c3a000000000,
+    0x8bab193f00000000, 0x6829074400000000, 0xf6aadddb00000000,
+    0x34372cdc00000000, 0xaab4f64300000000, 0x4936e83800000000,
+    0xd7b532a700000000, 0x8f33d5ce00000000, 0x11b00f5100000000,
+    0xf232112a00000000, 0x6cb1cbb500000000, 0x423edef900000000,
+    0xdcbd046600000000, 0x3f3f1a1d00000000, 0xa1bcc08200000000,
+    0xf93a27eb00000000, 0x67b9fd7400000000, 0x843be30f00000000,
+    0x1ab8399000000000, 0xf14de1f400000000, 0x6fce3b6b00000000,
+    0x8c4c251000000000, 0x12cfff8f00000000, 0x4a4918e600000000,
+    0xd4cac27900000000, 0x3748dc0200000000, 0xa9cb069d00000000,
+    0x874413d100000000, 0x19c7c94e00000000, 0xfa45d73500000000,
+    0x64c60daa00000000, 0x3c40eac300000000, 0xa2c3305c00000000,
+    0x41412e2700000000, 0xdfc2f4b800000000, 0x1d5f05bf00000000,
+    0x83dcdf2000000000, 0x605ec15b00000000, 0xfedd1bc400000000,
+    0xa65bfcad00000000, 0x38d8263200000000, 0xdb5a384900000000,
+    0x45d9e2d600000000, 0x6b56f79a00000000, 0xf5d52d0500000000,
+    0x1657337e00000000, 0x88d4e9e100000000, 0xd0520e8800000000,
+    0x4ed1d41700000000, 0xad53ca6c00000000, 0x33d010f300000000,
+    0x2968296300000000, 0xb7ebf3fc00000000, 0x5469ed8700000000,
+    0xcaea371800000000, 0x926cd07100000000, 0x0cef0aee00000000,
+    0xef6d149500000000, 0x71eece0a00000000, 0x5f61db4600000000,
+    0xc1e201d900000000, 0x22601fa200000000, 0xbce3c53d00000000,
+    0xe465225400000000, 0x7ae6f8cb00000000, 0x9964e6b000000000,
+    0x07e73c2f00000000, 0xc57acd2800000000, 0x5bf917b700000000,
+    0xb87b09cc00000000, 0x26f8d35300000000, 0x7e7e343a00000000,
+    0xe0fdeea500000000, 0x037ff0de00000000, 0x9dfc2a4100000000,
+    0xb3733f0d00000000, 0x2df0e59200000000, 0xce72fbe900000000,
+    0x50f1217600000000, 0x0877c61f00000000, 0x96f41c8000000000,
+    0x757602fb00000000, 0xebf5d86400000000, 0xa39db33200000000,
+    0x3d1e69ad00000000, 0xde9c77d600000000, 0x401fad4900000000,
+    0x18994a2000000000, 0x861a90bf00000000, 0x65988ec400000000,
+    0xfb1b545b00000000, 0xd594411700000000, 0x4b179b8800000000,
+    0xa89585f300000000, 0x36165f6c00000000, 0x6e90b80500000000,
+    0xf013629a00000000, 0x13917ce100000000, 0x8d12a67e00000000,
+    0x4f8f577900000000, 0xd10c8de600000000, 0x328e939d00000000,
+    0xac0d490200000000, 0xf48bae6b00000000, 0x6a0874f400000000,
+    0x898a6a8f00000000, 0x1709b01000000000, 0x3986a55c00000000,
+    0xa7057fc300000000, 0x448761b800000000, 0xda04bb2700000000,
+    0x82825c4e00000000, 0x1c0186d100000000, 0xff8398aa00000000,
+    0x6100423500000000, 0x7bb87ba500000000, 0xe53ba13a00000000,
+    0x06b9bf4100000000, 0x983a65de00000000, 0xc0bc82b700000000,
+    0x5e3f582800000000, 0xbdbd465300000000, 0x233e9ccc00000000,
+    0x0db1898000000000, 0x9332531f00000000, 0x70b04d6400000000,
+    0xee3397fb00000000, 0xb6b5709200000000, 0x2836aa0d00000000,
+    0xcbb4b47600000000, 0x55376ee900000000, 0x97aa9fee00000000,
+    0x0929457100000000, 0xeaab5b0a00000000, 0x7428819500000000,
+    0x2cae66fc00000000, 0xb22dbc6300000000, 0x51afa21800000000,
+    0xcf2c788700000000, 0xe1a36dcb00000000, 0x7f20b75400000000,
+    0x9ca2a92f00000000, 0x022173b000000000, 0x5aa794d900000000,
+    0xc4244e4600000000, 0x27a6503d00000000, 0xb9258aa200000000,
+    0x52d052c600000000, 0xcc53885900000000, 0x2fd1962200000000,
+    0xb1524cbd00000000, 0xe9d4abd400000000, 0x7757714b00000000,
+    0x94d56f3000000000, 0x0a56b5af00000000, 0x24d9a0e300000000,
+    0xba5a7a7c00000000, 0x59d8640700000000, 0xc75bbe9800000000,
+    0x9fdd59f100000000, 0x015e836e00000000, 0xe2dc9d1500000000,
+    0x7c5f478a00000000, 0xbec2b68d00000000, 0x20416c1200000000,
+    0xc3c3726900000000, 0x5d40a8f600000000, 0x05c64f9f00000000,
+    0x9b45950000000000, 0x78c78b7b00000000, 0xe64451e400000000,
+    0xc8cb44a800000000, 0x56489e3700000000, 0xb5ca804c00000000,
+    0x2b495ad300000000, 0x73cfbdba00000000, 0xed4c672500000000,
+    0x0ece795e00000000, 0x904da3c100000000, 0x8af59a5100000000,
+    0x147640ce00000000, 0xf7f45eb500000000, 0x6977842a00000000,
+    0x31f1634300000000, 0xaf72b9dc00000000, 0x4cf0a7a700000000,
+    0xd2737d3800000000, 0xfcfc687400000000, 0x627fb2eb00000000,
+    0x81fdac9000000000, 0x1f7e760f00000000, 0x47f8916600000000,
+    0xd97b4bf900000000, 0x3af9558200000000, 0xa47a8f1d00000000,
+    0x66e77e1a00000000, 0xf864a48500000000, 0x1be6bafe00000000,
+    0x8565606100000000, 0xdde3870800000000, 0x43605d9700000000,
+    0xa0e243ec00000000, 0x3e61997300000000, 0x10ee8c3f00000000,
+    0x8e6d56a000000000, 0x6def48db00000000, 0xf36c924400000000,
+    0xabea752d00000000, 0x3569afb200000000, 0xd6ebb1c900000000,
+    0x48686b5600000000},
+   {0x0000000000000000, 0xc064281700000000, 0x80c9502e00000000,
+    0x40ad783900000000, 0x0093a15c00000000, 0xc0f7894b00000000,
+    0x805af17200000000, 0x403ed96500000000, 0x002643b900000000,
+    0xc0426bae00000000, 0x80ef139700000000, 0x408b3b8000000000,
+    0x00b5e2e500000000, 0xc0d1caf200000000, 0x807cb2cb00000000,
+    0x40189adc00000000, 0x414af7a900000000, 0x812edfbe00000000,
+    0xc183a78700000000, 0x01e78f9000000000, 0x41d956f500000000,
+    0x81bd7ee200000000, 0xc11006db00000000, 0x01742ecc00000000,
+    0x416cb41000000000, 0x81089c0700000000, 0xc1a5e43e00000000,
+    0x01c1cc2900000000, 0x41ff154c00000000, 0x819b3d5b00000000,
+    0xc136456200000000, 0x01526d7500000000, 0xc3929f8800000000,
+    0x03f6b79f00000000, 0x435bcfa600000000, 0x833fe7b100000000,
+    0xc3013ed400000000, 0x036516c300000000, 0x43c86efa00000000,
+    0x83ac46ed00000000, 0xc3b4dc3100000000, 0x03d0f42600000000,
+    0x437d8c1f00000000, 0x8319a40800000000, 0xc3277d6d00000000,
+    0x0343557a00000000, 0x43ee2d4300000000, 0x838a055400000000,
+    0x82d8682100000000, 0x42bc403600000000, 0x0211380f00000000,
+    0xc275101800000000, 0x824bc97d00000000, 0x422fe16a00000000,
+    0x0282995300000000, 0xc2e6b14400000000, 0x82fe2b9800000000,
+    0x429a038f00000000, 0x02377bb600000000, 0xc25353a100000000,
+    0x826d8ac400000000, 0x4209a2d300000000, 0x02a4daea00000000,
+    0xc2c0f2fd00000000, 0xc7234eca00000000, 0x074766dd00000000,
+    0x47ea1ee400000000, 0x878e36f300000000, 0xc7b0ef9600000000,
+    0x07d4c78100000000, 0x4779bfb800000000, 0x871d97af00000000,
+    0xc7050d7300000000, 0x0761256400000000, 0x47cc5d5d00000000,
+    0x87a8754a00000000, 0xc796ac2f00000000, 0x07f2843800000000,
+    0x475ffc0100000000, 0x873bd41600000000, 0x8669b96300000000,
+    0x460d917400000000, 0x06a0e94d00000000, 0xc6c4c15a00000000,
+    0x86fa183f00000000, 0x469e302800000000, 0x0633481100000000,
+    0xc657600600000000, 0x864ffada00000000, 0x462bd2cd00000000,
+    0x0686aaf400000000, 0xc6e282e300000000, 0x86dc5b8600000000,
+    0x46b8739100000000, 0x06150ba800000000, 0xc67123bf00000000,
+    0x04b1d14200000000, 0xc4d5f95500000000, 0x8478816c00000000,
+    0x441ca97b00000000, 0x0422701e00000000, 0xc446580900000000,
+    0x84eb203000000000, 0x448f082700000000, 0x049792fb00000000,
+    0xc4f3baec00000000, 0x845ec2d500000000, 0x443aeac200000000,
+    0x040433a700000000, 0xc4601bb000000000, 0x84cd638900000000,
+    0x44a94b9e00000000, 0x45fb26eb00000000, 0x859f0efc00000000,
+    0xc53276c500000000, 0x05565ed200000000, 0x456887b700000000,
+    0x850cafa000000000, 0xc5a1d79900000000, 0x05c5ff8e00000000,
+    0x45dd655200000000, 0x85b94d4500000000, 0xc514357c00000000,
+    0x05701d6b00000000, 0x454ec40e00000000, 0x852aec1900000000,
+    0xc587942000000000, 0x05e3bc3700000000, 0xcf41ed4f00000000,
+    0x0f25c55800000000, 0x4f88bd6100000000, 0x8fec957600000000,
+    0xcfd24c1300000000, 0x0fb6640400000000, 0x4f1b1c3d00000000,
+    0x8f7f342a00000000, 0xcf67aef600000000, 0x0f0386e100000000,
+    0x4faefed800000000, 0x8fcad6cf00000000, 0xcff40faa00000000,
+    0x0f9027bd00000000, 0x4f3d5f8400000000, 0x8f59779300000000,
+    0x8e0b1ae600000000, 0x4e6f32f100000000, 0x0ec24ac800000000,
+    0xcea662df00000000, 0x8e98bbba00000000, 0x4efc93ad00000000,
+    0x0e51eb9400000000, 0xce35c38300000000, 0x8e2d595f00000000,
+    0x4e49714800000000, 0x0ee4097100000000, 0xce80216600000000,
+    0x8ebef80300000000, 0x4edad01400000000, 0x0e77a82d00000000,
+    0xce13803a00000000, 0x0cd372c700000000, 0xccb75ad000000000,
+    0x8c1a22e900000000, 0x4c7e0afe00000000, 0x0c40d39b00000000,
+    0xcc24fb8c00000000, 0x8c8983b500000000, 0x4cedaba200000000,
+    0x0cf5317e00000000, 0xcc91196900000000, 0x8c3c615000000000,
+    0x4c58494700000000, 0x0c66902200000000, 0xcc02b83500000000,
+    0x8cafc00c00000000, 0x4ccbe81b00000000, 0x4d99856e00000000,
+    0x8dfdad7900000000, 0xcd50d54000000000, 0x0d34fd5700000000,
+    0x4d0a243200000000, 0x8d6e0c2500000000, 0xcdc3741c00000000,
+    0x0da75c0b00000000, 0x4dbfc6d700000000, 0x8ddbeec000000000,
+    0xcd7696f900000000, 0x0d12beee00000000, 0x4d2c678b00000000,
+    0x8d484f9c00000000, 0xcde537a500000000, 0x0d811fb200000000,
+    0x0862a38500000000, 0xc8068b9200000000, 0x88abf3ab00000000,
+    0x48cfdbbc00000000, 0x08f102d900000000, 0xc8952ace00000000,
+    0x883852f700000000, 0x485c7ae000000000, 0x0844e03c00000000,
+    0xc820c82b00000000, 0x888db01200000000, 0x48e9980500000000,
+    0x08d7416000000000, 0xc8b3697700000000, 0x881e114e00000000,
+    0x487a395900000000, 0x4928542c00000000, 0x894c7c3b00000000,
+    0xc9e1040200000000, 0x09852c1500000000, 0x49bbf57000000000,
+    0x89dfdd6700000000, 0xc972a55e00000000, 0x09168d4900000000,
+    0x490e179500000000, 0x896a3f8200000000, 0xc9c747bb00000000,
+    0x09a36fac00000000, 0x499db6c900000000, 0x89f99ede00000000,
+    0xc954e6e700000000, 0x0930cef000000000, 0xcbf03c0d00000000,
+    0x0b94141a00000000, 0x4b396c2300000000, 0x8b5d443400000000,
+    0xcb639d5100000000, 0x0b07b54600000000, 0x4baacd7f00000000,
+    0x8bcee56800000000, 0xcbd67fb400000000, 0x0bb257a300000000,
+    0x4b1f2f9a00000000, 0x8b7b078d00000000, 0xcb45dee800000000,
+    0x0b21f6ff00000000, 0x4b8c8ec600000000, 0x8be8a6d100000000,
+    0x8abacba400000000, 0x4adee3b300000000, 0x0a739b8a00000000,
+    0xca17b39d00000000, 0x8a296af800000000, 0x4a4d42ef00000000,
+    0x0ae03ad600000000, 0xca8412c100000000, 0x8a9c881d00000000,
+    0x4af8a00a00000000, 0x0a55d83300000000, 0xca31f02400000000,
+    0x8a0f294100000000, 0x4a6b015600000000, 0x0ac6796f00000000,
+    0xcaa2517800000000},
+   {0x0000000000000000, 0xd4ea739b00000000, 0xe9d396ed00000000,
+    0x3d39e57600000000, 0x93a15c0000000000, 0x474b2f9b00000000,
+    0x7a72caed00000000, 0xae98b97600000000, 0x2643b90000000000,
+    0xf2a9ca9b00000000, 0xcf902fed00000000, 0x1b7a5c7600000000,
+    0xb5e2e50000000000, 0x6108969b00000000, 0x5c3173ed00000000,
+    0x88db007600000000, 0x4c86720100000000, 0x986c019a00000000,
+    0xa555e4ec00000000, 0x71bf977700000000, 0xdf272e0100000000,
+    0x0bcd5d9a00000000, 0x36f4b8ec00000000, 0xe21ecb7700000000,
+    0x6ac5cb0100000000, 0xbe2fb89a00000000, 0x83165dec00000000,
+    0x57fc2e7700000000, 0xf964970100000000, 0x2d8ee49a00000000,
+    0x10b701ec00000000, 0xc45d727700000000, 0x980ce50200000000,
+    0x4ce6969900000000, 0x71df73ef00000000, 0xa535007400000000,
+    0x0badb90200000000, 0xdf47ca9900000000, 0xe27e2fef00000000,
+    0x36945c7400000000, 0xbe4f5c0200000000, 0x6aa52f9900000000,
+    0x579ccaef00000000, 0x8376b97400000000, 0x2dee000200000000,
+    0xf904739900000000, 0xc43d96ef00000000, 0x10d7e57400000000,
+    0xd48a970300000000, 0x0060e49800000000, 0x3d5901ee00000000,
+    0xe9b3727500000000, 0x472bcb0300000000, 0x93c1b89800000000,
+    0xaef85dee00000000, 0x7a122e7500000000, 0xf2c92e0300000000,
+    0x26235d9800000000, 0x1b1ab8ee00000000, 0xcff0cb7500000000,
+    0x6168720300000000, 0xb582019800000000, 0x88bbe4ee00000000,
+    0x5c51977500000000, 0x3019ca0500000000, 0xe4f3b99e00000000,
+    0xd9ca5ce800000000, 0x0d202f7300000000, 0xa3b8960500000000,
+    0x7752e59e00000000, 0x4a6b00e800000000, 0x9e81737300000000,
+    0x165a730500000000, 0xc2b0009e00000000, 0xff89e5e800000000,
+    0x2b63967300000000, 0x85fb2f0500000000, 0x51115c9e00000000,
+    0x6c28b9e800000000, 0xb8c2ca7300000000, 0x7c9fb80400000000,
+    0xa875cb9f00000000, 0x954c2ee900000000, 0x41a65d7200000000,
+    0xef3ee40400000000, 0x3bd4979f00000000, 0x06ed72e900000000,
+    0xd207017200000000, 0x5adc010400000000, 0x8e36729f00000000,
+    0xb30f97e900000000, 0x67e5e47200000000, 0xc97d5d0400000000,
+    0x1d972e9f00000000, 0x20aecbe900000000, 0xf444b87200000000,
+    0xa8152f0700000000, 0x7cff5c9c00000000, 0x41c6b9ea00000000,
+    0x952cca7100000000, 0x3bb4730700000000, 0xef5e009c00000000,
+    0xd267e5ea00000000, 0x068d967100000000, 0x8e56960700000000,
+    0x5abce59c00000000, 0x678500ea00000000, 0xb36f737100000000,
+    0x1df7ca0700000000, 0xc91db99c00000000, 0xf4245cea00000000,
+    0x20ce2f7100000000, 0xe4935d0600000000, 0x30792e9d00000000,
+    0x0d40cbeb00000000, 0xd9aab87000000000, 0x7732010600000000,
+    0xa3d8729d00000000, 0x9ee197eb00000000, 0x4a0be47000000000,
+    0xc2d0e40600000000, 0x163a979d00000000, 0x2b0372eb00000000,
+    0xffe9017000000000, 0x5171b80600000000, 0x859bcb9d00000000,
+    0xb8a22eeb00000000, 0x6c485d7000000000, 0x6032940b00000000,
+    0xb4d8e79000000000, 0x89e102e600000000, 0x5d0b717d00000000,
+    0xf393c80b00000000, 0x2779bb9000000000, 0x1a405ee600000000,
+    0xceaa2d7d00000000, 0x46712d0b00000000, 0x929b5e9000000000,
+    0xafa2bbe600000000, 0x7b48c87d00000000, 0xd5d0710b00000000,
+    0x013a029000000000, 0x3c03e7e600000000, 0xe8e9947d00000000,
+    0x2cb4e60a00000000, 0xf85e959100000000, 0xc56770e700000000,
+    0x118d037c00000000, 0xbf15ba0a00000000, 0x6bffc99100000000,
+    0x56c62ce700000000, 0x822c5f7c00000000, 0x0af75f0a00000000,
+    0xde1d2c9100000000, 0xe324c9e700000000, 0x37ceba7c00000000,
+    0x9956030a00000000, 0x4dbc709100000000, 0x708595e700000000,
+    0xa46fe67c00000000, 0xf83e710900000000, 0x2cd4029200000000,
+    0x11ede7e400000000, 0xc507947f00000000, 0x6b9f2d0900000000,
+    0xbf755e9200000000, 0x824cbbe400000000, 0x56a6c87f00000000,
+    0xde7dc80900000000, 0x0a97bb9200000000, 0x37ae5ee400000000,
+    0xe3442d7f00000000, 0x4ddc940900000000, 0x9936e79200000000,
+    0xa40f02e400000000, 0x70e5717f00000000, 0xb4b8030800000000,
+    0x6052709300000000, 0x5d6b95e500000000, 0x8981e67e00000000,
+    0x27195f0800000000, 0xf3f32c9300000000, 0xcecac9e500000000,
+    0x1a20ba7e00000000, 0x92fbba0800000000, 0x4611c99300000000,
+    0x7b282ce500000000, 0xafc25f7e00000000, 0x015ae60800000000,
+    0xd5b0959300000000, 0xe88970e500000000, 0x3c63037e00000000,
+    0x502b5e0e00000000, 0x84c12d9500000000, 0xb9f8c8e300000000,
+    0x6d12bb7800000000, 0xc38a020e00000000, 0x1760719500000000,
+    0x2a5994e300000000, 0xfeb3e77800000000, 0x7668e70e00000000,
+    0xa282949500000000, 0x9fbb71e300000000, 0x4b51027800000000,
+    0xe5c9bb0e00000000, 0x3123c89500000000, 0x0c1a2de300000000,
+    0xd8f05e7800000000, 0x1cad2c0f00000000, 0xc8475f9400000000,
+    0xf57ebae200000000, 0x2194c97900000000, 0x8f0c700f00000000,
+    0x5be6039400000000, 0x66dfe6e200000000, 0xb235957900000000,
+    0x3aee950f00000000, 0xee04e69400000000, 0xd33d03e200000000,
+    0x07d7707900000000, 0xa94fc90f00000000, 0x7da5ba9400000000,
+    0x409c5fe200000000, 0x94762c7900000000, 0xc827bb0c00000000,
+    0x1ccdc89700000000, 0x21f42de100000000, 0xf51e5e7a00000000,
+    0x5b86e70c00000000, 0x8f6c949700000000, 0xb25571e100000000,
+    0x66bf027a00000000, 0xee64020c00000000, 0x3a8e719700000000,
+    0x07b794e100000000, 0xd35de77a00000000, 0x7dc55e0c00000000,
+    0xa92f2d9700000000, 0x9416c8e100000000, 0x40fcbb7a00000000,
+    0x84a1c90d00000000, 0x504bba9600000000, 0x6d725fe000000000,
+    0xb9982c7b00000000, 0x1700950d00000000, 0xc3eae69600000000,
+    0xfed303e000000000, 0x2a39707b00000000, 0xa2e2700d00000000,
+    0x7608039600000000, 0x4b31e6e000000000, 0x9fdb957b00000000,
+    0x31432c0d00000000, 0xe5a95f9600000000, 0xd890bae000000000,
+    0x0c7ac97b00000000},
+   {0x0000000000000000, 0x2765258100000000, 0x0fcc3bd900000000,
+    0x28a91e5800000000, 0x5f9e066900000000, 0x78fb23e800000000,
+    0x50523db000000000, 0x7737183100000000, 0xbe3c0dd200000000,
+    0x9959285300000000, 0xb1f0360b00000000, 0x9695138a00000000,
+    0xe1a20bbb00000000, 0xc6c72e3a00000000, 0xee6e306200000000,
+    0xc90b15e300000000, 0x3d7f6b7f00000000, 0x1a1a4efe00000000,
+    0x32b350a600000000, 0x15d6752700000000, 0x62e16d1600000000,
+    0x4584489700000000, 0x6d2d56cf00000000, 0x4a48734e00000000,
+    0x834366ad00000000, 0xa426432c00000000, 0x8c8f5d7400000000,
+    0xabea78f500000000, 0xdcdd60c400000000, 0xfbb8454500000000,
+    0xd3115b1d00000000, 0xf4747e9c00000000, 0x7afed6fe00000000,
+    0x5d9bf37f00000000, 0x7532ed2700000000, 0x5257c8a600000000,
+    0x2560d09700000000, 0x0205f51600000000, 0x2aaceb4e00000000,
+    0x0dc9cecf00000000, 0xc4c2db2c00000000, 0xe3a7fead00000000,
+    0xcb0ee0f500000000, 0xec6bc57400000000, 0x9b5cdd4500000000,
+    0xbc39f8c400000000, 0x9490e69c00000000, 0xb3f5c31d00000000,
+    0x4781bd8100000000, 0x60e4980000000000, 0x484d865800000000,
+    0x6f28a3d900000000, 0x181fbbe800000000, 0x3f7a9e6900000000,
+    0x17d3803100000000, 0x30b6a5b000000000, 0xf9bdb05300000000,
+    0xded895d200000000, 0xf6718b8a00000000, 0xd114ae0b00000000,
+    0xa623b63a00000000, 0x814693bb00000000, 0xa9ef8de300000000,
+    0x8e8aa86200000000, 0xb5fadc2600000000, 0x929ff9a700000000,
+    0xba36e7ff00000000, 0x9d53c27e00000000, 0xea64da4f00000000,
+    0xcd01ffce00000000, 0xe5a8e19600000000, 0xc2cdc41700000000,
+    0x0bc6d1f400000000, 0x2ca3f47500000000, 0x040aea2d00000000,
+    0x236fcfac00000000, 0x5458d79d00000000, 0x733df21c00000000,
+    0x5b94ec4400000000, 0x7cf1c9c500000000, 0x8885b75900000000,
+    0xafe092d800000000, 0x87498c8000000000, 0xa02ca90100000000,
+    0xd71bb13000000000, 0xf07e94b100000000, 0xd8d78ae900000000,
+    0xffb2af6800000000, 0x36b9ba8b00000000, 0x11dc9f0a00000000,
+    0x3975815200000000, 0x1e10a4d300000000, 0x6927bce200000000,
+    0x4e42996300000000, 0x66eb873b00000000, 0x418ea2ba00000000,
+    0xcf040ad800000000, 0xe8612f5900000000, 0xc0c8310100000000,
+    0xe7ad148000000000, 0x909a0cb100000000, 0xb7ff293000000000,
+    0x9f56376800000000, 0xb83312e900000000, 0x7138070a00000000,
+    0x565d228b00000000, 0x7ef43cd300000000, 0x5991195200000000,
+    0x2ea6016300000000, 0x09c324e200000000, 0x216a3aba00000000,
+    0x060f1f3b00000000, 0xf27b61a700000000, 0xd51e442600000000,
+    0xfdb75a7e00000000, 0xdad27fff00000000, 0xade567ce00000000,
+    0x8a80424f00000000, 0xa2295c1700000000, 0x854c799600000000,
+    0x4c476c7500000000, 0x6b2249f400000000, 0x438b57ac00000000,
+    0x64ee722d00000000, 0x13d96a1c00000000, 0x34bc4f9d00000000,
+    0x1c1551c500000000, 0x3b70744400000000, 0x6af5b94d00000000,
+    0x4d909ccc00000000, 0x6539829400000000, 0x425ca71500000000,
+    0x356bbf2400000000, 0x120e9aa500000000, 0x3aa784fd00000000,
+    0x1dc2a17c00000000, 0xd4c9b49f00000000, 0xf3ac911e00000000,
+    0xdb058f4600000000, 0xfc60aac700000000, 0x8b57b2f600000000,
+    0xac32977700000000, 0x849b892f00000000, 0xa3feacae00000000,
+    0x578ad23200000000, 0x70eff7b300000000, 0x5846e9eb00000000,
+    0x7f23cc6a00000000, 0x0814d45b00000000, 0x2f71f1da00000000,
+    0x07d8ef8200000000, 0x20bdca0300000000, 0xe9b6dfe000000000,
+    0xced3fa6100000000, 0xe67ae43900000000, 0xc11fc1b800000000,
+    0xb628d98900000000, 0x914dfc0800000000, 0xb9e4e25000000000,
+    0x9e81c7d100000000, 0x100b6fb300000000, 0x376e4a3200000000,
+    0x1fc7546a00000000, 0x38a271eb00000000, 0x4f9569da00000000,
+    0x68f04c5b00000000, 0x4059520300000000, 0x673c778200000000,
+    0xae37626100000000, 0x895247e000000000, 0xa1fb59b800000000,
+    0x869e7c3900000000, 0xf1a9640800000000, 0xd6cc418900000000,
+    0xfe655fd100000000, 0xd9007a5000000000, 0x2d7404cc00000000,
+    0x0a11214d00000000, 0x22b83f1500000000, 0x05dd1a9400000000,
+    0x72ea02a500000000, 0x558f272400000000, 0x7d26397c00000000,
+    0x5a431cfd00000000, 0x9348091e00000000, 0xb42d2c9f00000000,
+    0x9c8432c700000000, 0xbbe1174600000000, 0xccd60f7700000000,
+    0xebb32af600000000, 0xc31a34ae00000000, 0xe47f112f00000000,
+    0xdf0f656b00000000, 0xf86a40ea00000000, 0xd0c35eb200000000,
+    0xf7a67b3300000000, 0x8091630200000000, 0xa7f4468300000000,
+    0x8f5d58db00000000, 0xa8387d5a00000000, 0x613368b900000000,
+    0x46564d3800000000, 0x6eff536000000000, 0x499a76e100000000,
+    0x3ead6ed000000000, 0x19c84b5100000000, 0x3161550900000000,
+    0x1604708800000000, 0xe2700e1400000000, 0xc5152b9500000000,
+    0xedbc35cd00000000, 0xcad9104c00000000, 0xbdee087d00000000,
+    0x9a8b2dfc00000000, 0xb22233a400000000, 0x9547162500000000,
+    0x5c4c03c600000000, 0x7b29264700000000, 0x5380381f00000000,
+    0x74e51d9e00000000, 0x03d205af00000000, 0x24b7202e00000000,
+    0x0c1e3e7600000000, 0x2b7b1bf700000000, 0xa5f1b39500000000,
+    0x8294961400000000, 0xaa3d884c00000000, 0x8d58adcd00000000,
+    0xfa6fb5fc00000000, 0xdd0a907d00000000, 0xf5a38e2500000000,
+    0xd2c6aba400000000, 0x1bcdbe4700000000, 0x3ca89bc600000000,
+    0x1401859e00000000, 0x3364a01f00000000, 0x4453b82e00000000,
+    0x63369daf00000000, 0x4b9f83f700000000, 0x6cfaa67600000000,
+    0x988ed8ea00000000, 0xbfebfd6b00000000, 0x9742e33300000000,
+    0xb027c6b200000000, 0xc710de8300000000, 0xe075fb0200000000,
+    0xc8dce55a00000000, 0xefb9c0db00000000, 0x26b2d53800000000,
+    0x01d7f0b900000000, 0x297eeee100000000, 0x0e1bcb6000000000,
+    0x792cd35100000000, 0x5e49f6d000000000, 0x76e0e88800000000,
+    0x5185cd0900000000}};
+
+#else /* W == 4 */
+
+static const uint32_t crc_braid_table[][256] = {
+   {0x00000000, 0x9ba54c6f, 0xec3b9e9f, 0x779ed2f0, 0x03063b7f,
+    0x98a37710, 0xef3da5e0, 0x7498e98f, 0x060c76fe, 0x9da93a91,
+    0xea37e861, 0x7192a40e, 0x050a4d81, 0x9eaf01ee, 0xe931d31e,
+    0x72949f71, 0x0c18edfc, 0x97bda193, 0xe0237363, 0x7b863f0c,
+    0x0f1ed683, 0x94bb9aec, 0xe325481c, 0x78800473, 0x0a149b02,
+    0x91b1d76d, 0xe62f059d, 0x7d8a49f2, 0x0912a07d, 0x92b7ec12,
+    0xe5293ee2, 0x7e8c728d, 0x1831dbf8, 0x83949797, 0xf40a4567,
+    0x6faf0908, 0x1b37e087, 0x8092ace8, 0xf70c7e18, 0x6ca93277,
+    0x1e3dad06, 0x8598e169, 0xf2063399, 0x69a37ff6, 0x1d3b9679,
+    0x869eda16, 0xf10008e6, 0x6aa54489, 0x14293604, 0x8f8c7a6b,
+    0xf812a89b, 0x63b7e4f4, 0x172f0d7b, 0x8c8a4114, 0xfb1493e4,
+    0x60b1df8b, 0x122540fa, 0x89800c95, 0xfe1ede65, 0x65bb920a,
+    0x11237b85, 0x8a8637ea, 0xfd18e51a, 0x66bda975, 0x3063b7f0,
+    0xabc6fb9f, 0xdc58296f, 0x47fd6500, 0x33658c8f, 0xa8c0c0e0,
+    0xdf5e1210, 0x44fb5e7f, 0x366fc10e, 0xadca8d61, 0xda545f91,
+    0x41f113fe, 0x3569fa71, 0xaeccb61e, 0xd95264ee, 0x42f72881,
+    0x3c7b5a0c, 0xa7de1663, 0xd040c493, 0x4be588fc, 0x3f7d6173,
+    0xa4d82d1c, 0xd346ffec, 0x48e3b383, 0x3a772cf2, 0xa1d2609d,
+    0xd64cb26d, 0x4de9fe02, 0x3971178d, 0xa2d45be2, 0xd54a8912,
+    0x4eefc57d, 0x28526c08, 0xb3f72067, 0xc469f297, 0x5fccbef8,
+    0x2b545777, 0xb0f11b18, 0xc76fc9e8, 0x5cca8587, 0x2e5e1af6,
+    0xb5fb5699, 0xc2658469, 0x59c0c806, 0x2d582189, 0xb6fd6de6,
+    0xc163bf16, 0x5ac6f379, 0x244a81f4, 0xbfefcd9b, 0xc8711f6b,
+    0x53d45304, 0x274cba8b, 0xbce9f6e4, 0xcb772414, 0x50d2687b,
+    0x2246f70a, 0xb9e3bb65, 0xce7d6995, 0x55d825fa, 0x2140cc75,
+    0xbae5801a, 0xcd7b52ea, 0x56de1e85, 0x60c76fe0, 0xfb62238f,
+    0x8cfcf17f, 0x1759bd10, 0x63c1549f, 0xf86418f0, 0x8ffaca00,
+    0x145f866f, 0x66cb191e, 0xfd6e5571, 0x8af08781, 0x1155cbee,
+    0x65cd2261, 0xfe686e0e, 0x89f6bcfe, 0x1253f091, 0x6cdf821c,
+    0xf77ace73, 0x80e41c83, 0x1b4150ec, 0x6fd9b963, 0xf47cf50c,
+    0x83e227fc, 0x18476b93, 0x6ad3f4e2, 0xf176b88d, 0x86e86a7d,
+    0x1d4d2612, 0x69d5cf9d, 0xf27083f2, 0x85ee5102, 0x1e4b1d6d,
+    0x78f6b418, 0xe353f877, 0x94cd2a87, 0x0f6866e8, 0x7bf08f67,
+    0xe055c308, 0x97cb11f8, 0x0c6e5d97, 0x7efac2e6, 0xe55f8e89,
+    0x92c15c79, 0x09641016, 0x7dfcf999, 0xe659b5f6, 0x91c76706,
+    0x0a622b69, 0x74ee59e4, 0xef4b158b, 0x98d5c77b, 0x03708b14,
+    0x77e8629b, 0xec4d2ef4, 0x9bd3fc04, 0x0076b06b, 0x72e22f1a,
+    0xe9476375, 0x9ed9b185, 0x057cfdea, 0x71e41465, 0xea41580a,
+    0x9ddf8afa, 0x067ac695, 0x50a4d810, 0xcb01947f, 0xbc9f468f,
+    0x273a0ae0, 0x53a2e36f, 0xc807af00, 0xbf997df0, 0x243c319f,
+    0x56a8aeee, 0xcd0de281, 0xba933071, 0x21367c1e, 0x55ae9591,
+    0xce0bd9fe, 0xb9950b0e, 0x22304761, 0x5cbc35ec, 0xc7197983,
+    0xb087ab73, 0x2b22e71c, 0x5fba0e93, 0xc41f42fc, 0xb381900c,
+    0x2824dc63, 0x5ab04312, 0xc1150f7d, 0xb68bdd8d, 0x2d2e91e2,
+    0x59b6786d, 0xc2133402, 0xb58de6f2, 0x2e28aa9d, 0x489503e8,
+    0xd3304f87, 0xa4ae9d77, 0x3f0bd118, 0x4b933897, 0xd03674f8,
+    0xa7a8a608, 0x3c0dea67, 0x4e997516, 0xd53c3979, 0xa2a2eb89,
+    0x3907a7e6, 0x4d9f4e69, 0xd63a0206, 0xa1a4d0f6, 0x3a019c99,
+    0x448dee14, 0xdf28a27b, 0xa8b6708b, 0x33133ce4, 0x478bd56b,
+    0xdc2e9904, 0xabb04bf4, 0x3015079b, 0x428198ea, 0xd924d485,
+    0xaeba0675, 0x351f4a1a, 0x4187a395, 0xda22effa, 0xadbc3d0a,
+    0x36197165},
+   {0x00000000, 0xc18edfc0, 0x586cb9c1, 0x99e26601, 0xb0d97382,
+    0x7157ac42, 0xe8b5ca43, 0x293b1583, 0xbac3e145, 0x7b4d3e85,
+    0xe2af5884, 0x23218744, 0x0a1a92c7, 0xcb944d07, 0x52762b06,
+    0x93f8f4c6, 0xaef6c4cb, 0x6f781b0b, 0xf69a7d0a, 0x3714a2ca,
+    0x1e2fb749, 0xdfa16889, 0x46430e88, 0x87cdd148, 0x1435258e,
+    0xd5bbfa4e, 0x4c599c4f, 0x8dd7438f, 0xa4ec560c, 0x656289cc,
+    0xfc80efcd, 0x3d0e300d, 0x869c8fd7, 0x47125017, 0xdef03616,
+    0x1f7ee9d6, 0x3645fc55, 0xf7cb2395, 0x6e294594, 0xafa79a54,
+    0x3c5f6e92, 0xfdd1b152, 0x6433d753, 0xa5bd0893, 0x8c861d10,
+    0x4d08c2d0, 0xd4eaa4d1, 0x15647b11, 0x286a4b1c, 0xe9e494dc,
+    0x7006f2dd, 0xb1882d1d, 0x98b3389e, 0x593de75e, 0xc0df815f,
+    0x01515e9f, 0x92a9aa59, 0x53277599, 0xcac51398, 0x0b4bcc58,
+    0x2270d9db, 0xe3fe061b, 0x7a1c601a, 0xbb92bfda, 0xd64819ef,
+    0x17c6c62f, 0x8e24a02e, 0x4faa7fee, 0x66916a6d, 0xa71fb5ad,
+    0x3efdd3ac, 0xff730c6c, 0x6c8bf8aa, 0xad05276a, 0x34e7416b,
+    0xf5699eab, 0xdc528b28, 0x1ddc54e8, 0x843e32e9, 0x45b0ed29,
+    0x78bedd24, 0xb93002e4, 0x20d264e5, 0xe15cbb25, 0xc867aea6,
+    0x09e97166, 0x900b1767, 0x5185c8a7, 0xc27d3c61, 0x03f3e3a1,
+    0x9a1185a0, 0x5b9f5a60, 0x72a44fe3, 0xb32a9023, 0x2ac8f622,
+    0xeb4629e2, 0x50d49638, 0x915a49f8, 0x08b82ff9, 0xc936f039,
+    0xe00de5ba, 0x21833a7a, 0xb8615c7b, 0x79ef83bb, 0xea17777d,
+    0x2b99a8bd, 0xb27bcebc, 0x73f5117c, 0x5ace04ff, 0x9b40db3f,
+    0x02a2bd3e, 0xc32c62fe, 0xfe2252f3, 0x3fac8d33, 0xa64eeb32,
+    0x67c034f2, 0x4efb2171, 0x8f75feb1, 0x169798b0, 0xd7194770,
+    0x44e1b3b6, 0x856f6c76, 0x1c8d0a77, 0xdd03d5b7, 0xf438c034,
+    0x35b61ff4, 0xac5479f5, 0x6ddaa635, 0x77e1359f, 0xb66fea5f,
+    0x2f8d8c5e, 0xee03539e, 0xc738461d, 0x06b699dd, 0x9f54ffdc,
+    0x5eda201c, 0xcd22d4da, 0x0cac0b1a, 0x954e6d1b, 0x54c0b2db,
+    0x7dfba758, 0xbc757898, 0x25971e99, 0xe419c159, 0xd917f154,
+    0x18992e94, 0x817b4895, 0x40f59755, 0x69ce82d6, 0xa8405d16,
+    0x31a23b17, 0xf02ce4d7, 0x63d41011, 0xa25acfd1, 0x3bb8a9d0,
+    0xfa367610, 0xd30d6393, 0x1283bc53, 0x8b61da52, 0x4aef0592,
+    0xf17dba48, 0x30f36588, 0xa9110389, 0x689fdc49, 0x41a4c9ca,
+    0x802a160a, 0x19c8700b, 0xd846afcb, 0x4bbe5b0d, 0x8a3084cd,
+    0x13d2e2cc, 0xd25c3d0c, 0xfb67288f, 0x3ae9f74f, 0xa30b914e,
+    0x62854e8e, 0x5f8b7e83, 0x9e05a143, 0x07e7c742, 0xc6691882,
+    0xef520d01, 0x2edcd2c1, 0xb73eb4c0, 0x76b06b00, 0xe5489fc6,
+    0x24c64006, 0xbd242607, 0x7caaf9c7, 0x5591ec44, 0x941f3384,
+    0x0dfd5585, 0xcc738a45, 0xa1a92c70, 0x6027f3b0, 0xf9c595b1,
+    0x384b4a71, 0x11705ff2, 0xd0fe8032, 0x491ce633, 0x889239f3,
+    0x1b6acd35, 0xdae412f5, 0x430674f4, 0x8288ab34, 0xabb3beb7,
+    0x6a3d6177, 0xf3df0776, 0x3251d8b6, 0x0f5fe8bb, 0xced1377b,
+    0x5733517a, 0x96bd8eba, 0xbf869b39, 0x7e0844f9, 0xe7ea22f8,
+    0x2664fd38, 0xb59c09fe, 0x7412d63e, 0xedf0b03f, 0x2c7e6fff,
+    0x05457a7c, 0xc4cba5bc, 0x5d29c3bd, 0x9ca71c7d, 0x2735a3a7,
+    0xe6bb7c67, 0x7f591a66, 0xbed7c5a6, 0x97ecd025, 0x56620fe5,
+    0xcf8069e4, 0x0e0eb624, 0x9df642e2, 0x5c789d22, 0xc59afb23,
+    0x041424e3, 0x2d2f3160, 0xeca1eea0, 0x754388a1, 0xb4cd5761,
+    0x89c3676c, 0x484db8ac, 0xd1afdead, 0x1021016d, 0x391a14ee,
+    0xf894cb2e, 0x6176ad2f, 0xa0f872ef, 0x33008629, 0xf28e59e9,
+    0x6b6c3fe8, 0xaae2e028, 0x83d9f5ab, 0x42572a6b, 0xdbb54c6a,
+    0x1a3b93aa},
+   {0x00000000, 0xefc26b3e, 0x04f5d03d, 0xeb37bb03, 0x09eba07a,
+    0xe629cb44, 0x0d1e7047, 0xe2dc1b79, 0x13d740f4, 0xfc152bca,
+    0x172290c9, 0xf8e0fbf7, 0x1a3ce08e, 0xf5fe8bb0, 0x1ec930b3,
+    0xf10b5b8d, 0x27ae81e8, 0xc86cead6, 0x235b51d5, 0xcc993aeb,
+    0x2e452192, 0xc1874aac, 0x2ab0f1af, 0xc5729a91, 0x3479c11c,
+    0xdbbbaa22, 0x308c1121, 0xdf4e7a1f, 0x3d926166, 0xd2500a58,
+    0x3967b15b, 0xd6a5da65, 0x4f5d03d0, 0xa09f68ee, 0x4ba8d3ed,
+    0xa46ab8d3, 0x46b6a3aa, 0xa974c894, 0x42437397, 0xad8118a9,
+    0x5c8a4324, 0xb348281a, 0x587f9319, 0xb7bdf827, 0x5561e35e,
+    0xbaa38860, 0x51943363, 0xbe56585d, 0x68f38238, 0x8731e906,
+    0x6c065205, 0x83c4393b, 0x61182242, 0x8eda497c, 0x65edf27f,
+    0x8a2f9941, 0x7b24c2cc, 0x94e6a9f2, 0x7fd112f1, 0x901379cf,
+    0x72cf62b6, 0x9d0d0988, 0x763ab28b, 0x99f8d9b5, 0x9eba07a0,
+    0x71786c9e, 0x9a4fd79d, 0x758dbca3, 0x9751a7da, 0x7893cce4,
+    0x93a477e7, 0x7c661cd9, 0x8d6d4754, 0x62af2c6a, 0x89989769,
+    0x665afc57, 0x8486e72e, 0x6b448c10, 0x80733713, 0x6fb15c2d,
+    0xb9148648, 0x56d6ed76, 0xbde15675, 0x52233d4b, 0xb0ff2632,
+    0x5f3d4d0c, 0xb40af60f, 0x5bc89d31, 0xaac3c6bc, 0x4501ad82,
+    0xae361681, 0x41f47dbf, 0xa32866c6, 0x4cea0df8, 0xa7ddb6fb,
+    0x481fddc5, 0xd1e70470, 0x3e256f4e, 0xd512d44d, 0x3ad0bf73,
+    0xd80ca40a, 0x37cecf34, 0xdcf97437, 0x333b1f09, 0xc2304484,
+    0x2df22fba, 0xc6c594b9, 0x2907ff87, 0xcbdbe4fe, 0x24198fc0,
+    0xcf2e34c3, 0x20ec5ffd, 0xf6498598, 0x198beea6, 0xf2bc55a5,
+    0x1d7e3e9b, 0xffa225e2, 0x10604edc, 0xfb57f5df, 0x14959ee1,
+    0xe59ec56c, 0x0a5cae52, 0xe16b1551, 0x0ea97e6f, 0xec756516,
+    0x03b70e28, 0xe880b52b, 0x0742de15, 0xe6050901, 0x09c7623f,
+    0xe2f0d93c, 0x0d32b202, 0xefeea97b, 0x002cc245, 0xeb1b7946,
+    0x04d91278, 0xf5d249f5, 0x1a1022cb, 0xf12799c8, 0x1ee5f2f6,
+    0xfc39e98f, 0x13fb82b1, 0xf8cc39b2, 0x170e528c, 0xc1ab88e9,
+    0x2e69e3d7, 0xc55e58d4, 0x2a9c33ea, 0xc8402893, 0x278243ad,
+    0xccb5f8ae, 0x23779390, 0xd27cc81d, 0x3dbea323, 0xd6891820,
+    0x394b731e, 0xdb976867, 0x34550359, 0xdf62b85a, 0x30a0d364,
+    0xa9580ad1, 0x469a61ef, 0xadaddaec, 0x426fb1d2, 0xa0b3aaab,
+    0x4f71c195, 0xa4467a96, 0x4b8411a8, 0xba8f4a25, 0x554d211b,
+    0xbe7a9a18, 0x51b8f126, 0xb364ea5f, 0x5ca68161, 0xb7913a62,
+    0x5853515c, 0x8ef68b39, 0x6134e007, 0x8a035b04, 0x65c1303a,
+    0x871d2b43, 0x68df407d, 0x83e8fb7e, 0x6c2a9040, 0x9d21cbcd,
+    0x72e3a0f3, 0x99d41bf0, 0x761670ce, 0x94ca6bb7, 0x7b080089,
+    0x903fbb8a, 0x7ffdd0b4, 0x78bf0ea1, 0x977d659f, 0x7c4ade9c,
+    0x9388b5a2, 0x7154aedb, 0x9e96c5e5, 0x75a17ee6, 0x9a6315d8,
+    0x6b684e55, 0x84aa256b, 0x6f9d9e68, 0x805ff556, 0x6283ee2f,
+    0x8d418511, 0x66763e12, 0x89b4552c, 0x5f118f49, 0xb0d3e477,
+    0x5be45f74, 0xb426344a, 0x56fa2f33, 0xb938440d, 0x520fff0e,
+    0xbdcd9430, 0x4cc6cfbd, 0xa304a483, 0x48331f80, 0xa7f174be,
+    0x452d6fc7, 0xaaef04f9, 0x41d8bffa, 0xae1ad4c4, 0x37e20d71,
+    0xd820664f, 0x3317dd4c, 0xdcd5b672, 0x3e09ad0b, 0xd1cbc635,
+    0x3afc7d36, 0xd53e1608, 0x24354d85, 0xcbf726bb, 0x20c09db8,
+    0xcf02f686, 0x2ddeedff, 0xc21c86c1, 0x292b3dc2, 0xc6e956fc,
+    0x104c8c99, 0xff8ee7a7, 0x14b95ca4, 0xfb7b379a, 0x19a72ce3,
+    0xf66547dd, 0x1d52fcde, 0xf29097e0, 0x039bcc6d, 0xec59a753,
+    0x076e1c50, 0xe8ac776e, 0x0a706c17, 0xe5b20729, 0x0e85bc2a,
+    0xe147d714},
+   {0x00000000, 0x177b1443, 0x2ef62886, 0x398d3cc5, 0x5dec510c,
+    0x4a97454f, 0x731a798a, 0x64616dc9, 0xbbd8a218, 0xaca3b65b,
+    0x952e8a9e, 0x82559edd, 0xe634f314, 0xf14fe757, 0xc8c2db92,
+    0xdfb9cfd1, 0xacc04271, 0xbbbb5632, 0x82366af7, 0x954d7eb4,
+    0xf12c137d, 0xe657073e, 0xdfda3bfb, 0xc8a12fb8, 0x1718e069,
+    0x0063f42a, 0x39eec8ef, 0x2e95dcac, 0x4af4b165, 0x5d8fa526,
+    0x640299e3, 0x73798da0, 0x82f182a3, 0x958a96e0, 0xac07aa25,
+    0xbb7cbe66, 0xdf1dd3af, 0xc866c7ec, 0xf1ebfb29, 0xe690ef6a,
+    0x392920bb, 0x2e5234f8, 0x17df083d, 0x00a41c7e, 0x64c571b7,
+    0x73be65f4, 0x4a335931, 0x5d484d72, 0x2e31c0d2, 0x394ad491,
+    0x00c7e854, 0x17bcfc17, 0x73dd91de, 0x64a6859d, 0x5d2bb958,
+    0x4a50ad1b, 0x95e962ca, 0x82927689, 0xbb1f4a4c, 0xac645e0f,
+    0xc80533c6, 0xdf7e2785, 0xe6f31b40, 0xf1880f03, 0xde920307,
+    0xc9e91744, 0xf0642b81, 0xe71f3fc2, 0x837e520b, 0x94054648,
+    0xad887a8d, 0xbaf36ece, 0x654aa11f, 0x7231b55c, 0x4bbc8999,
+    0x5cc79dda, 0x38a6f013, 0x2fdde450, 0x1650d895, 0x012bccd6,
+    0x72524176, 0x65295535, 0x5ca469f0, 0x4bdf7db3, 0x2fbe107a,
+    0x38c50439, 0x014838fc, 0x16332cbf, 0xc98ae36e, 0xdef1f72d,
+    0xe77ccbe8, 0xf007dfab, 0x9466b262, 0x831da621, 0xba909ae4,
+    0xadeb8ea7, 0x5c6381a4, 0x4b1895e7, 0x7295a922, 0x65eebd61,
+    0x018fd0a8, 0x16f4c4eb, 0x2f79f82e, 0x3802ec6d, 0xe7bb23bc,
+    0xf0c037ff, 0xc94d0b3a, 0xde361f79, 0xba5772b0, 0xad2c66f3,
+    0x94a15a36, 0x83da4e75, 0xf0a3c3d5, 0xe7d8d796, 0xde55eb53,
+    0xc92eff10, 0xad4f92d9, 0xba34869a, 0x83b9ba5f, 0x94c2ae1c,
+    0x4b7b61cd, 0x5c00758e, 0x658d494b, 0x72f65d08, 0x169730c1,
+    0x01ec2482, 0x38611847, 0x2f1a0c04, 0x6655004f, 0x712e140c,
+    0x48a328c9, 0x5fd83c8a, 0x3bb95143, 0x2cc24500, 0x154f79c5,
+    0x02346d86, 0xdd8da257, 0xcaf6b614, 0xf37b8ad1, 0xe4009e92,
+    0x8061f35b, 0x971ae718, 0xae97dbdd, 0xb9eccf9e, 0xca95423e,
+    0xddee567d, 0xe4636ab8, 0xf3187efb, 0x97791332, 0x80020771,
+    0xb98f3bb4, 0xaef42ff7, 0x714de026, 0x6636f465, 0x5fbbc8a0,
+    0x48c0dce3, 0x2ca1b12a, 0x3bdaa569, 0x025799ac, 0x152c8def,
+    0xe4a482ec, 0xf3df96af, 0xca52aa6a, 0xdd29be29, 0xb948d3e0,
+    0xae33c7a3, 0x97befb66, 0x80c5ef25, 0x5f7c20f4, 0x480734b7,
+    0x718a0872, 0x66f11c31, 0x029071f8, 0x15eb65bb, 0x2c66597e,
+    0x3b1d4d3d, 0x4864c09d, 0x5f1fd4de, 0x6692e81b, 0x71e9fc58,
+    0x15889191, 0x02f385d2, 0x3b7eb917, 0x2c05ad54, 0xf3bc6285,
+    0xe4c776c6, 0xdd4a4a03, 0xca315e40, 0xae503389, 0xb92b27ca,
+    0x80a61b0f, 0x97dd0f4c, 0xb8c70348, 0xafbc170b, 0x96312bce,
+    0x814a3f8d, 0xe52b5244, 0xf2504607, 0xcbdd7ac2, 0xdca66e81,
+    0x031fa150, 0x1464b513, 0x2de989d6, 0x3a929d95, 0x5ef3f05c,
+    0x4988e41f, 0x7005d8da, 0x677ecc99, 0x14074139, 0x037c557a,
+    0x3af169bf, 0x2d8a7dfc, 0x49eb1035, 0x5e900476, 0x671d38b3,
+    0x70662cf0, 0xafdfe321, 0xb8a4f762, 0x8129cba7, 0x9652dfe4,
+    0xf233b22d, 0xe548a66e, 0xdcc59aab, 0xcbbe8ee8, 0x3a3681eb,
+    0x2d4d95a8, 0x14c0a96d, 0x03bbbd2e, 0x67dad0e7, 0x70a1c4a4,
+    0x492cf861, 0x5e57ec22, 0x81ee23f3, 0x969537b0, 0xaf180b75,
+    0xb8631f36, 0xdc0272ff, 0xcb7966bc, 0xf2f45a79, 0xe58f4e3a,
+    0x96f6c39a, 0x818dd7d9, 0xb800eb1c, 0xaf7bff5f, 0xcb1a9296,
+    0xdc6186d5, 0xe5ecba10, 0xf297ae53, 0x2d2e6182, 0x3a5575c1,
+    0x03d84904, 0x14a35d47, 0x70c2308e, 0x67b924cd, 0x5e341808,
+    0x494f0c4b}};
+
+static const z_word_t crc_braid_big_table[][256] = {
+   {0x00000000, 0x43147b17, 0x8628f62e, 0xc53c8d39, 0x0c51ec5d,
+    0x4f45974a, 0x8a791a73, 0xc96d6164, 0x18a2d8bb, 0x5bb6a3ac,
+    0x9e8a2e95, 0xdd9e5582, 0x14f334e6, 0x57e74ff1, 0x92dbc2c8,
+    0xd1cfb9df, 0x7142c0ac, 0x3256bbbb, 0xf76a3682, 0xb47e4d95,
+    0x7d132cf1, 0x3e0757e6, 0xfb3bdadf, 0xb82fa1c8, 0x69e01817,
+    0x2af46300, 0xefc8ee39, 0xacdc952e, 0x65b1f44a, 0x26a58f5d,
+    0xe3990264, 0xa08d7973, 0xa382f182, 0xe0968a95, 0x25aa07ac,
+    0x66be7cbb, 0xafd31ddf, 0xecc766c8, 0x29fbebf1, 0x6aef90e6,
+    0xbb202939, 0xf834522e, 0x3d08df17, 0x7e1ca400, 0xb771c564,
+    0xf465be73, 0x3159334a, 0x724d485d, 0xd2c0312e, 0x91d44a39,
+    0x54e8c700, 0x17fcbc17, 0xde91dd73, 0x9d85a664, 0x58b92b5d,
+    0x1bad504a, 0xca62e995, 0x89769282, 0x4c4a1fbb, 0x0f5e64ac,
+    0xc63305c8, 0x85277edf, 0x401bf3e6, 0x030f88f1, 0x070392de,
+    0x4417e9c9, 0x812b64f0, 0xc23f1fe7, 0x0b527e83, 0x48460594,
+    0x8d7a88ad, 0xce6ef3ba, 0x1fa14a65, 0x5cb53172, 0x9989bc4b,
+    0xda9dc75c, 0x13f0a638, 0x50e4dd2f, 0x95d85016, 0xd6cc2b01,
+    0x76415272, 0x35552965, 0xf069a45c, 0xb37ddf4b, 0x7a10be2f,
+    0x3904c538, 0xfc384801, 0xbf2c3316, 0x6ee38ac9, 0x2df7f1de,
+    0xe8cb7ce7, 0xabdf07f0, 0x62b26694, 0x21a61d83, 0xe49a90ba,
+    0xa78eebad, 0xa481635c, 0xe795184b, 0x22a99572, 0x61bdee65,
+    0xa8d08f01, 0xebc4f416, 0x2ef8792f, 0x6dec0238, 0xbc23bbe7,
+    0xff37c0f0, 0x3a0b4dc9, 0x791f36de, 0xb07257ba, 0xf3662cad,
+    0x365aa194, 0x754eda83, 0xd5c3a3f0, 0x96d7d8e7, 0x53eb55de,
+    0x10ff2ec9, 0xd9924fad, 0x9a8634ba, 0x5fbab983, 0x1caec294,
+    0xcd617b4b, 0x8e75005c, 0x4b498d65, 0x085df672, 0xc1309716,
+    0x8224ec01, 0x47186138, 0x040c1a2f, 0x4f005566, 0x0c142e71,
+    0xc928a348, 0x8a3cd85f, 0x4351b93b, 0x0045c22c, 0xc5794f15,
+    0x866d3402, 0x57a28ddd, 0x14b6f6ca, 0xd18a7bf3, 0x929e00e4,
+    0x5bf36180, 0x18e71a97, 0xdddb97ae, 0x9ecfecb9, 0x3e4295ca,
+    0x7d56eedd, 0xb86a63e4, 0xfb7e18f3, 0x32137997, 0x71070280,
+    0xb43b8fb9, 0xf72ff4ae, 0x26e04d71, 0x65f43666, 0xa0c8bb5f,
+    0xe3dcc048, 0x2ab1a12c, 0x69a5da3b, 0xac995702, 0xef8d2c15,
+    0xec82a4e4, 0xaf96dff3, 0x6aaa52ca, 0x29be29dd, 0xe0d348b9,
+    0xa3c733ae, 0x66fbbe97, 0x25efc580, 0xf4207c5f, 0xb7340748,
+    0x72088a71, 0x311cf166, 0xf8719002, 0xbb65eb15, 0x7e59662c,
+    0x3d4d1d3b, 0x9dc06448, 0xded41f5f, 0x1be89266, 0x58fce971,
+    0x91918815, 0xd285f302, 0x17b97e3b, 0x54ad052c, 0x8562bcf3,
+    0xc676c7e4, 0x034a4add, 0x405e31ca, 0x893350ae, 0xca272bb9,
+    0x0f1ba680, 0x4c0fdd97, 0x4803c7b8, 0x0b17bcaf, 0xce2b3196,
+    0x8d3f4a81, 0x44522be5, 0x074650f2, 0xc27addcb, 0x816ea6dc,
+    0x50a11f03, 0x13b56414, 0xd689e92d, 0x959d923a, 0x5cf0f35e,
+    0x1fe48849, 0xdad80570, 0x99cc7e67, 0x39410714, 0x7a557c03,
+    0xbf69f13a, 0xfc7d8a2d, 0x3510eb49, 0x7604905e, 0xb3381d67,
+    0xf02c6670, 0x21e3dfaf, 0x62f7a4b8, 0xa7cb2981, 0xe4df5296,
+    0x2db233f2, 0x6ea648e5, 0xab9ac5dc, 0xe88ebecb, 0xeb81363a,
+    0xa8954d2d, 0x6da9c014, 0x2ebdbb03, 0xe7d0da67, 0xa4c4a170,
+    0x61f82c49, 0x22ec575e, 0xf323ee81, 0xb0379596, 0x750b18af,
+    0x361f63b8, 0xff7202dc, 0xbc6679cb, 0x795af4f2, 0x3a4e8fe5,
+    0x9ac3f696, 0xd9d78d81, 0x1ceb00b8, 0x5fff7baf, 0x96921acb,
+    0xd58661dc, 0x10baece5, 0x53ae97f2, 0x82612e2d, 0xc175553a,
+    0x0449d803, 0x475da314, 0x8e30c270, 0xcd24b967, 0x0818345e,
+    0x4b0c4f49},
+   {0x00000000, 0x3e6bc2ef, 0x3dd0f504, 0x03bb37eb, 0x7aa0eb09,
+    0x44cb29e6, 0x47701e0d, 0x791bdce2, 0xf440d713, 0xca2b15fc,
+    0xc9902217, 0xf7fbe0f8, 0x8ee03c1a, 0xb08bfef5, 0xb330c91e,
+    0x8d5b0bf1, 0xe881ae27, 0xd6ea6cc8, 0xd5515b23, 0xeb3a99cc,
+    0x9221452e, 0xac4a87c1, 0xaff1b02a, 0x919a72c5, 0x1cc17934,
+    0x22aabbdb, 0x21118c30, 0x1f7a4edf, 0x6661923d, 0x580a50d2,
+    0x5bb16739, 0x65daa5d6, 0xd0035d4f, 0xee689fa0, 0xedd3a84b,
+    0xd3b86aa4, 0xaaa3b646, 0x94c874a9, 0x97734342, 0xa91881ad,
+    0x24438a5c, 0x1a2848b3, 0x19937f58, 0x27f8bdb7, 0x5ee36155,
+    0x6088a3ba, 0x63339451, 0x5d5856be, 0x3882f368, 0x06e93187,
+    0x0552066c, 0x3b39c483, 0x42221861, 0x7c49da8e, 0x7ff2ed65,
+    0x41992f8a, 0xccc2247b, 0xf2a9e694, 0xf112d17f, 0xcf791390,
+    0xb662cf72, 0x88090d9d, 0x8bb23a76, 0xb5d9f899, 0xa007ba9e,
+    0x9e6c7871, 0x9dd74f9a, 0xa3bc8d75, 0xdaa75197, 0xe4cc9378,
+    0xe777a493, 0xd91c667c, 0x54476d8d, 0x6a2caf62, 0x69979889,
+    0x57fc5a66, 0x2ee78684, 0x108c446b, 0x13377380, 0x2d5cb16f,
+    0x488614b9, 0x76edd656, 0x7556e1bd, 0x4b3d2352, 0x3226ffb0,
+    0x0c4d3d5f, 0x0ff60ab4, 0x319dc85b, 0xbcc6c3aa, 0x82ad0145,
+    0x811636ae, 0xbf7df441, 0xc66628a3, 0xf80dea4c, 0xfbb6dda7,
+    0xc5dd1f48, 0x7004e7d1, 0x4e6f253e, 0x4dd412d5, 0x73bfd03a,
+    0x0aa40cd8, 0x34cfce37, 0x3774f9dc, 0x091f3b33, 0x844430c2,
+    0xba2ff22d, 0xb994c5c6, 0x87ff0729, 0xfee4dbcb, 0xc08f1924,
+    0xc3342ecf, 0xfd5fec20, 0x988549f6, 0xa6ee8b19, 0xa555bcf2,
+    0x9b3e7e1d, 0xe225a2ff, 0xdc4e6010, 0xdff557fb, 0xe19e9514,
+    0x6cc59ee5, 0x52ae5c0a, 0x51156be1, 0x6f7ea90e, 0x166575ec,
+    0x280eb703, 0x2bb580e8, 0x15de4207, 0x010905e6, 0x3f62c709,
+    0x3cd9f0e2, 0x02b2320d, 0x7ba9eeef, 0x45c22c00, 0x46791beb,
+    0x7812d904, 0xf549d2f5, 0xcb22101a, 0xc89927f1, 0xf6f2e51e,
+    0x8fe939fc, 0xb182fb13, 0xb239ccf8, 0x8c520e17, 0xe988abc1,
+    0xd7e3692e, 0xd4585ec5, 0xea339c2a, 0x932840c8, 0xad438227,
+    0xaef8b5cc, 0x90937723, 0x1dc87cd2, 0x23a3be3d, 0x201889d6,
+    0x1e734b39, 0x676897db, 0x59035534, 0x5ab862df, 0x64d3a030,
+    0xd10a58a9, 0xef619a46, 0xecdaadad, 0xd2b16f42, 0xabaab3a0,
+    0x95c1714f, 0x967a46a4, 0xa811844b, 0x254a8fba, 0x1b214d55,
+    0x189a7abe, 0x26f1b851, 0x5fea64b3, 0x6181a65c, 0x623a91b7,
+    0x5c515358, 0x398bf68e, 0x07e03461, 0x045b038a, 0x3a30c165,
+    0x432b1d87, 0x7d40df68, 0x7efbe883, 0x40902a6c, 0xcdcb219d,
+    0xf3a0e372, 0xf01bd499, 0xce701676, 0xb76bca94, 0x8900087b,
+    0x8abb3f90, 0xb4d0fd7f, 0xa10ebf78, 0x9f657d97, 0x9cde4a7c,
+    0xa2b58893, 0xdbae5471, 0xe5c5969e, 0xe67ea175, 0xd815639a,
+    0x554e686b, 0x6b25aa84, 0x689e9d6f, 0x56f55f80, 0x2fee8362,
+    0x1185418d, 0x123e7666, 0x2c55b489, 0x498f115f, 0x77e4d3b0,
+    0x745fe45b, 0x4a3426b4, 0x332ffa56, 0x0d4438b9, 0x0eff0f52,
+    0x3094cdbd, 0xbdcfc64c, 0x83a404a3, 0x801f3348, 0xbe74f1a7,
+    0xc76f2d45, 0xf904efaa, 0xfabfd841, 0xc4d41aae, 0x710de237,
+    0x4f6620d8, 0x4cdd1733, 0x72b6d5dc, 0x0bad093e, 0x35c6cbd1,
+    0x367dfc3a, 0x08163ed5, 0x854d3524, 0xbb26f7cb, 0xb89dc020,
+    0x86f602cf, 0xffedde2d, 0xc1861cc2, 0xc23d2b29, 0xfc56e9c6,
+    0x998c4c10, 0xa7e78eff, 0xa45cb914, 0x9a377bfb, 0xe32ca719,
+    0xdd4765f6, 0xdefc521d, 0xe09790f2, 0x6dcc9b03, 0x53a759ec,
+    0x501c6e07, 0x6e77ace8, 0x176c700a, 0x2907b2e5, 0x2abc850e,
+    0x14d747e1},
+   {0x00000000, 0xc0df8ec1, 0xc1b96c58, 0x0166e299, 0x8273d9b0,
+    0x42ac5771, 0x43cab5e8, 0x83153b29, 0x45e1c3ba, 0x853e4d7b,
+    0x8458afe2, 0x44872123, 0xc7921a0a, 0x074d94cb, 0x062b7652,
+    0xc6f4f893, 0xcbc4f6ae, 0x0b1b786f, 0x0a7d9af6, 0xcaa21437,
+    0x49b72f1e, 0x8968a1df, 0x880e4346, 0x48d1cd87, 0x8e253514,
+    0x4efabbd5, 0x4f9c594c, 0x8f43d78d, 0x0c56eca4, 0xcc896265,
+    0xcdef80fc, 0x0d300e3d, 0xd78f9c86, 0x17501247, 0x1636f0de,
+    0xd6e97e1f, 0x55fc4536, 0x9523cbf7, 0x9445296e, 0x549aa7af,
+    0x926e5f3c, 0x52b1d1fd, 0x53d73364, 0x9308bda5, 0x101d868c,
+    0xd0c2084d, 0xd1a4ead4, 0x117b6415, 0x1c4b6a28, 0xdc94e4e9,
+    0xddf20670, 0x1d2d88b1, 0x9e38b398, 0x5ee73d59, 0x5f81dfc0,
+    0x9f5e5101, 0x59aaa992, 0x99752753, 0x9813c5ca, 0x58cc4b0b,
+    0xdbd97022, 0x1b06fee3, 0x1a601c7a, 0xdabf92bb, 0xef1948d6,
+    0x2fc6c617, 0x2ea0248e, 0xee7faa4f, 0x6d6a9166, 0xadb51fa7,
+    0xacd3fd3e, 0x6c0c73ff, 0xaaf88b6c, 0x6a2705ad, 0x6b41e734,
+    0xab9e69f5, 0x288b52dc, 0xe854dc1d, 0xe9323e84, 0x29edb045,
+    0x24ddbe78, 0xe40230b9, 0xe564d220, 0x25bb5ce1, 0xa6ae67c8,
+    0x6671e909, 0x67170b90, 0xa7c88551, 0x613c7dc2, 0xa1e3f303,
+    0xa085119a, 0x605a9f5b, 0xe34fa472, 0x23902ab3, 0x22f6c82a,
+    0xe22946eb, 0x3896d450, 0xf8495a91, 0xf92fb808, 0x39f036c9,
+    0xbae50de0, 0x7a3a8321, 0x7b5c61b8, 0xbb83ef79, 0x7d7717ea,
+    0xbda8992b, 0xbcce7bb2, 0x7c11f573, 0xff04ce5a, 0x3fdb409b,
+    0x3ebda202, 0xfe622cc3, 0xf35222fe, 0x338dac3f, 0x32eb4ea6,
+    0xf234c067, 0x7121fb4e, 0xb1fe758f, 0xb0989716, 0x704719d7,
+    0xb6b3e144, 0x766c6f85, 0x770a8d1c, 0xb7d503dd, 0x34c038f4,
+    0xf41fb635, 0xf57954ac, 0x35a6da6d, 0x9f35e177, 0x5fea6fb6,
+    0x5e8c8d2f, 0x9e5303ee, 0x1d4638c7, 0xdd99b606, 0xdcff549f,
+    0x1c20da5e, 0xdad422cd, 0x1a0bac0c, 0x1b6d4e95, 0xdbb2c054,
+    0x58a7fb7d, 0x987875bc, 0x991e9725, 0x59c119e4, 0x54f117d9,
+    0x942e9918, 0x95487b81, 0x5597f540, 0xd682ce69, 0x165d40a8,
+    0x173ba231, 0xd7e42cf0, 0x1110d463, 0xd1cf5aa2, 0xd0a9b83b,
+    0x107636fa, 0x93630dd3, 0x53bc8312, 0x52da618b, 0x9205ef4a,
+    0x48ba7df1, 0x8865f330, 0x890311a9, 0x49dc9f68, 0xcac9a441,
+    0x0a162a80, 0x0b70c819, 0xcbaf46d8, 0x0d5bbe4b, 0xcd84308a,
+    0xcce2d213, 0x0c3d5cd2, 0x8f2867fb, 0x4ff7e93a, 0x4e910ba3,
+    0x8e4e8562, 0x837e8b5f, 0x43a1059e, 0x42c7e707, 0x821869c6,
+    0x010d52ef, 0xc1d2dc2e, 0xc0b43eb7, 0x006bb076, 0xc69f48e5,
+    0x0640c624, 0x072624bd, 0xc7f9aa7c, 0x44ec9155, 0x84331f94,
+    0x8555fd0d, 0x458a73cc, 0x702ca9a1, 0xb0f32760, 0xb195c5f9,
+    0x714a4b38, 0xf25f7011, 0x3280fed0, 0x33e61c49, 0xf3399288,
+    0x35cd6a1b, 0xf512e4da, 0xf4740643, 0x34ab8882, 0xb7beb3ab,
+    0x77613d6a, 0x7607dff3, 0xb6d85132, 0xbbe85f0f, 0x7b37d1ce,
+    0x7a513357, 0xba8ebd96, 0x399b86bf, 0xf944087e, 0xf822eae7,
+    0x38fd6426, 0xfe099cb5, 0x3ed61274, 0x3fb0f0ed, 0xff6f7e2c,
+    0x7c7a4505, 0xbca5cbc4, 0xbdc3295d, 0x7d1ca79c, 0xa7a33527,
+    0x677cbbe6, 0x661a597f, 0xa6c5d7be, 0x25d0ec97, 0xe50f6256,
+    0xe46980cf, 0x24b60e0e, 0xe242f69d, 0x229d785c, 0x23fb9ac5,
+    0xe3241404, 0x60312f2d, 0xa0eea1ec, 0xa1884375, 0x6157cdb4,
+    0x6c67c389, 0xacb84d48, 0xaddeafd1, 0x6d012110, 0xee141a39,
+    0x2ecb94f8, 0x2fad7661, 0xef72f8a0, 0x29860033, 0xe9598ef2,
+    0xe83f6c6b, 0x28e0e2aa, 0xabf5d983, 0x6b2a5742, 0x6a4cb5db,
+    0xaa933b1a},
+   {0x00000000, 0x6f4ca59b, 0x9f9e3bec, 0xf0d29e77, 0x7f3b0603,
+    0x1077a398, 0xe0a53def, 0x8fe99874, 0xfe760c06, 0x913aa99d,
+    0x61e837ea, 0x0ea49271, 0x814d0a05, 0xee01af9e, 0x1ed331e9,
+    0x719f9472, 0xfced180c, 0x93a1bd97, 0x637323e0, 0x0c3f867b,
+    0x83d61e0f, 0xec9abb94, 0x1c4825e3, 0x73048078, 0x029b140a,
+    0x6dd7b191, 0x9d052fe6, 0xf2498a7d, 0x7da01209, 0x12ecb792,
+    0xe23e29e5, 0x8d728c7e, 0xf8db3118, 0x97979483, 0x67450af4,
+    0x0809af6f, 0x87e0371b, 0xe8ac9280, 0x187e0cf7, 0x7732a96c,
+    0x06ad3d1e, 0x69e19885, 0x993306f2, 0xf67fa369, 0x79963b1d,
+    0x16da9e86, 0xe60800f1, 0x8944a56a, 0x04362914, 0x6b7a8c8f,
+    0x9ba812f8, 0xf4e4b763, 0x7b0d2f17, 0x14418a8c, 0xe49314fb,
+    0x8bdfb160, 0xfa402512, 0x950c8089, 0x65de1efe, 0x0a92bb65,
+    0x857b2311, 0xea37868a, 0x1ae518fd, 0x75a9bd66, 0xf0b76330,
+    0x9ffbc6ab, 0x6f2958dc, 0x0065fd47, 0x8f8c6533, 0xe0c0c0a8,
+    0x10125edf, 0x7f5efb44, 0x0ec16f36, 0x618dcaad, 0x915f54da,
+    0xfe13f141, 0x71fa6935, 0x1eb6ccae, 0xee6452d9, 0x8128f742,
+    0x0c5a7b3c, 0x6316dea7, 0x93c440d0, 0xfc88e54b, 0x73617d3f,
+    0x1c2dd8a4, 0xecff46d3, 0x83b3e348, 0xf22c773a, 0x9d60d2a1,
+    0x6db24cd6, 0x02fee94d, 0x8d177139, 0xe25bd4a2, 0x12894ad5,
+    0x7dc5ef4e, 0x086c5228, 0x6720f7b3, 0x97f269c4, 0xf8becc5f,
+    0x7757542b, 0x181bf1b0, 0xe8c96fc7, 0x8785ca5c, 0xf61a5e2e,
+    0x9956fbb5, 0x698465c2, 0x06c8c059, 0x8921582d, 0xe66dfdb6,
+    0x16bf63c1, 0x79f3c65a, 0xf4814a24, 0x9bcdefbf, 0x6b1f71c8,
+    0x0453d453, 0x8bba4c27, 0xe4f6e9bc, 0x142477cb, 0x7b68d250,
+    0x0af74622, 0x65bbe3b9, 0x95697dce, 0xfa25d855, 0x75cc4021,
+    0x1a80e5ba, 0xea527bcd, 0x851ede56, 0xe06fc760, 0x8f2362fb,
+    0x7ff1fc8c, 0x10bd5917, 0x9f54c163, 0xf01864f8, 0x00cafa8f,
+    0x6f865f14, 0x1e19cb66, 0x71556efd, 0x8187f08a, 0xeecb5511,
+    0x6122cd65, 0x0e6e68fe, 0xfebcf689, 0x91f05312, 0x1c82df6c,
+    0x73ce7af7, 0x831ce480, 0xec50411b, 0x63b9d96f, 0x0cf57cf4,
+    0xfc27e283, 0x936b4718, 0xe2f4d36a, 0x8db876f1, 0x7d6ae886,
+    0x12264d1d, 0x9dcfd569, 0xf28370f2, 0x0251ee85, 0x6d1d4b1e,
+    0x18b4f678, 0x77f853e3, 0x872acd94, 0xe866680f, 0x678ff07b,
+    0x08c355e0, 0xf811cb97, 0x975d6e0c, 0xe6c2fa7e, 0x898e5fe5,
+    0x795cc192, 0x16106409, 0x99f9fc7d, 0xf6b559e6, 0x0667c791,
+    0x692b620a, 0xe459ee74, 0x8b154bef, 0x7bc7d598, 0x148b7003,
+    0x9b62e877, 0xf42e4dec, 0x04fcd39b, 0x6bb07600, 0x1a2fe272,
+    0x756347e9, 0x85b1d99e, 0xeafd7c05, 0x6514e471, 0x0a5841ea,
+    0xfa8adf9d, 0x95c67a06, 0x10d8a450, 0x7f9401cb, 0x8f469fbc,
+    0xe00a3a27, 0x6fe3a253, 0x00af07c8, 0xf07d99bf, 0x9f313c24,
+    0xeeaea856, 0x81e20dcd, 0x713093ba, 0x1e7c3621, 0x9195ae55,
+    0xfed90bce, 0x0e0b95b9, 0x61473022, 0xec35bc5c, 0x837919c7,
+    0x73ab87b0, 0x1ce7222b, 0x930eba5f, 0xfc421fc4, 0x0c9081b3,
+    0x63dc2428, 0x1243b05a, 0x7d0f15c1, 0x8ddd8bb6, 0xe2912e2d,
+    0x6d78b659, 0x023413c2, 0xf2e68db5, 0x9daa282e, 0xe8039548,
+    0x874f30d3, 0x779daea4, 0x18d10b3f, 0x9738934b, 0xf87436d0,
+    0x08a6a8a7, 0x67ea0d3c, 0x1675994e, 0x79393cd5, 0x89eba2a2,
+    0xe6a70739, 0x694e9f4d, 0x06023ad6, 0xf6d0a4a1, 0x999c013a,
+    0x14ee8d44, 0x7ba228df, 0x8b70b6a8, 0xe43c1333, 0x6bd58b47,
+    0x04992edc, 0xf44bb0ab, 0x9b071530, 0xea988142, 0x85d424d9,
+    0x7506baae, 0x1a4a1f35, 0x95a38741, 0xfaef22da, 0x0a3dbcad,
+    0x65711936}};
+
+#endif /* W */
+
+#endif /* N == 3 */
+#if N == 4
+
+#if W == 8
+
+static const uint32_t crc_braid_table[][256] = {
+   {0x00000000, 0xf1da05aa, 0x38c50d15, 0xc91f08bf, 0x718a1a2a,
+    0x80501f80, 0x494f173f, 0xb8951295, 0xe3143454, 0x12ce31fe,
+    0xdbd13941, 0x2a0b3ceb, 0x929e2e7e, 0x63442bd4, 0xaa5b236b,
+    0x5b8126c1, 0x1d596ee9, 0xec836b43, 0x259c63fc, 0xd4466656,
+    0x6cd374c3, 0x9d097169, 0x541679d6, 0xa5cc7c7c, 0xfe4d5abd,
+    0x0f975f17, 0xc68857a8, 0x37525202, 0x8fc74097, 0x7e1d453d,
+    0xb7024d82, 0x46d84828, 0x3ab2ddd2, 0xcb68d878, 0x0277d0c7,
+    0xf3add56d, 0x4b38c7f8, 0xbae2c252, 0x73fdcaed, 0x8227cf47,
+    0xd9a6e986, 0x287cec2c, 0xe163e493, 0x10b9e139, 0xa82cf3ac,
+    0x59f6f606, 0x90e9feb9, 0x6133fb13, 0x27ebb33b, 0xd631b691,
+    0x1f2ebe2e, 0xeef4bb84, 0x5661a911, 0xa7bbacbb, 0x6ea4a404,
+    0x9f7ea1ae, 0xc4ff876f, 0x352582c5, 0xfc3a8a7a, 0x0de08fd0,
+    0xb5759d45, 0x44af98ef, 0x8db09050, 0x7c6a95fa, 0x7565bba4,
+    0x84bfbe0e, 0x4da0b6b1, 0xbc7ab31b, 0x04efa18e, 0xf535a424,
+    0x3c2aac9b, 0xcdf0a931, 0x96718ff0, 0x67ab8a5a, 0xaeb482e5,
+    0x5f6e874f, 0xe7fb95da, 0x16219070, 0xdf3e98cf, 0x2ee49d65,
+    0x683cd54d, 0x99e6d0e7, 0x50f9d858, 0xa123ddf2, 0x19b6cf67,
+    0xe86ccacd, 0x2173c272, 0xd0a9c7d8, 0x8b28e119, 0x7af2e4b3,
+    0xb3edec0c, 0x4237e9a6, 0xfaa2fb33, 0x0b78fe99, 0xc267f626,
+    0x33bdf38c, 0x4fd76676, 0xbe0d63dc, 0x77126b63, 0x86c86ec9,
+    0x3e5d7c5c, 0xcf8779f6, 0x06987149, 0xf74274e3, 0xacc35222,
+    0x5d195788, 0x94065f37, 0x65dc5a9d, 0xdd494808, 0x2c934da2,
+    0xe58c451d, 0x145640b7, 0x528e089f, 0xa3540d35, 0x6a4b058a,
+    0x9b910020, 0x230412b5, 0xd2de171f, 0x1bc11fa0, 0xea1b1a0a,
+    0xb19a3ccb, 0x40403961, 0x895f31de, 0x78853474, 0xc01026e1,
+    0x31ca234b, 0xf8d52bf4, 0x090f2e5e, 0xeacb7748, 0x1b1172e2,
+    0xd20e7a5d, 0x23d47ff7, 0x9b416d62, 0x6a9b68c8, 0xa3846077,
+    0x525e65dd, 0x09df431c, 0xf80546b6, 0x311a4e09, 0xc0c04ba3,
+    0x78555936, 0x898f5c9c, 0x40905423, 0xb14a5189, 0xf79219a1,
+    0x06481c0b, 0xcf5714b4, 0x3e8d111e, 0x8618038b, 0x77c20621,
+    0xbedd0e9e, 0x4f070b34, 0x14862df5, 0xe55c285f, 0x2c4320e0,
+    0xdd99254a, 0x650c37df, 0x94d63275, 0x5dc93aca, 0xac133f60,
+    0xd079aa9a, 0x21a3af30, 0xe8bca78f, 0x1966a225, 0xa1f3b0b0,
+    0x5029b51a, 0x9936bda5, 0x68ecb80f, 0x336d9ece, 0xc2b79b64,
+    0x0ba893db, 0xfa729671, 0x42e784e4, 0xb33d814e, 0x7a2289f1,
+    0x8bf88c5b, 0xcd20c473, 0x3cfac1d9, 0xf5e5c966, 0x043fcccc,
+    0xbcaade59, 0x4d70dbf3, 0x846fd34c, 0x75b5d6e6, 0x2e34f027,
+    0xdfeef58d, 0x16f1fd32, 0xe72bf898, 0x5fbeea0d, 0xae64efa7,
+    0x677be718, 0x96a1e2b2, 0x9faeccec, 0x6e74c946, 0xa76bc1f9,
+    0x56b1c453, 0xee24d6c6, 0x1ffed36c, 0xd6e1dbd3, 0x273bde79,
+    0x7cbaf8b8, 0x8d60fd12, 0x447ff5ad, 0xb5a5f007, 0x0d30e292,
+    0xfceae738, 0x35f5ef87, 0xc42fea2d, 0x82f7a205, 0x732da7af,
+    0xba32af10, 0x4be8aaba, 0xf37db82f, 0x02a7bd85, 0xcbb8b53a,
+    0x3a62b090, 0x61e39651, 0x903993fb, 0x59269b44, 0xa8fc9eee,
+    0x10698c7b, 0xe1b389d1, 0x28ac816e, 0xd97684c4, 0xa51c113e,
+    0x54c61494, 0x9dd91c2b, 0x6c031981, 0xd4960b14, 0x254c0ebe,
+    0xec530601, 0x1d8903ab, 0x4608256a, 0xb7d220c0, 0x7ecd287f,
+    0x8f172dd5, 0x37823f40, 0xc6583aea, 0x0f473255, 0xfe9d37ff,
+    0xb8457fd7, 0x499f7a7d, 0x808072c2, 0x715a7768, 0xc9cf65fd,
+    0x38156057, 0xf10a68e8, 0x00d06d42, 0x5b514b83, 0xaa8b4e29,
+    0x63944696, 0x924e433c, 0x2adb51a9, 0xdb015403, 0x121e5cbc,
+    0xe3c45916},
+   {0x00000000, 0x0ee7e8d1, 0x1dcfd1a2, 0x13283973, 0x3b9fa344,
+    0x35784b95, 0x265072e6, 0x28b79a37, 0x773f4688, 0x79d8ae59,
+    0x6af0972a, 0x64177ffb, 0x4ca0e5cc, 0x42470d1d, 0x516f346e,
+    0x5f88dcbf, 0xee7e8d10, 0xe09965c1, 0xf3b15cb2, 0xfd56b463,
+    0xd5e12e54, 0xdb06c685, 0xc82efff6, 0xc6c91727, 0x9941cb98,
+    0x97a62349, 0x848e1a3a, 0x8a69f2eb, 0xa2de68dc, 0xac39800d,
+    0xbf11b97e, 0xb1f651af, 0x078c1c61, 0x096bf4b0, 0x1a43cdc3,
+    0x14a42512, 0x3c13bf25, 0x32f457f4, 0x21dc6e87, 0x2f3b8656,
+    0x70b35ae9, 0x7e54b238, 0x6d7c8b4b, 0x639b639a, 0x4b2cf9ad,
+    0x45cb117c, 0x56e3280f, 0x5804c0de, 0xe9f29171, 0xe71579a0,
+    0xf43d40d3, 0xfadaa802, 0xd26d3235, 0xdc8adae4, 0xcfa2e397,
+    0xc1450b46, 0x9ecdd7f9, 0x902a3f28, 0x8302065b, 0x8de5ee8a,
+    0xa55274bd, 0xabb59c6c, 0xb89da51f, 0xb67a4dce, 0x0f1838c2,
+    0x01ffd013, 0x12d7e960, 0x1c3001b1, 0x34879b86, 0x3a607357,
+    0x29484a24, 0x27afa2f5, 0x78277e4a, 0x76c0969b, 0x65e8afe8,
+    0x6b0f4739, 0x43b8dd0e, 0x4d5f35df, 0x5e770cac, 0x5090e47d,
+    0xe166b5d2, 0xef815d03, 0xfca96470, 0xf24e8ca1, 0xdaf91696,
+    0xd41efe47, 0xc736c734, 0xc9d12fe5, 0x9659f35a, 0x98be1b8b,
+    0x8b9622f8, 0x8571ca29, 0xadc6501e, 0xa321b8cf, 0xb00981bc,
+    0xbeee696d, 0x089424a3, 0x0673cc72, 0x155bf501, 0x1bbc1dd0,
+    0x330b87e7, 0x3dec6f36, 0x2ec45645, 0x2023be94, 0x7fab622b,
+    0x714c8afa, 0x6264b389, 0x6c835b58, 0x4434c16f, 0x4ad329be,
+    0x59fb10cd, 0x571cf81c, 0xe6eaa9b3, 0xe80d4162, 0xfb257811,
+    0xf5c290c0, 0xdd750af7, 0xd392e226, 0xc0badb55, 0xce5d3384,
+    0x91d5ef3b, 0x9f3207ea, 0x8c1a3e99, 0x82fdd648, 0xaa4a4c7f,
+    0xa4ada4ae, 0xb7859ddd, 0xb962750c, 0x1e307184, 0x10d79955,
+    0x03ffa026, 0x0d1848f7, 0x25afd2c0, 0x2b483a11, 0x38600362,
+    0x3687ebb3, 0x690f370c, 0x67e8dfdd, 0x74c0e6ae, 0x7a270e7f,
+    0x52909448, 0x5c777c99, 0x4f5f45ea, 0x41b8ad3b, 0xf04efc94,
+    0xfea91445, 0xed812d36, 0xe366c5e7, 0xcbd15fd0, 0xc536b701,
+    0xd61e8e72, 0xd8f966a3, 0x8771ba1c, 0x899652cd, 0x9abe6bbe,
+    0x9459836f, 0xbcee1958, 0xb209f189, 0xa121c8fa, 0xafc6202b,
+    0x19bc6de5, 0x175b8534, 0x0473bc47, 0x0a945496, 0x2223cea1,
+    0x2cc42670, 0x3fec1f03, 0x310bf7d2, 0x6e832b6d, 0x6064c3bc,
+    0x734cfacf, 0x7dab121e, 0x551c8829, 0x5bfb60f8, 0x48d3598b,
+    0x4634b15a, 0xf7c2e0f5, 0xf9250824, 0xea0d3157, 0xe4ead986,
+    0xcc5d43b1, 0xc2baab60, 0xd1929213, 0xdf757ac2, 0x80fda67d,
+    0x8e1a4eac, 0x9d3277df, 0x93d59f0e, 0xbb620539, 0xb585ede8,
+    0xa6add49b, 0xa84a3c4a, 0x11284946, 0x1fcfa197, 0x0ce798e4,
+    0x02007035, 0x2ab7ea02, 0x245002d3, 0x37783ba0, 0x399fd371,
+    0x66170fce, 0x68f0e71f, 0x7bd8de6c, 0x753f36bd, 0x5d88ac8a,
+    0x536f445b, 0x40477d28, 0x4ea095f9, 0xff56c456, 0xf1b12c87,
+    0xe29915f4, 0xec7efd25, 0xc4c96712, 0xca2e8fc3, 0xd906b6b0,
+    0xd7e15e61, 0x886982de, 0x868e6a0f, 0x95a6537c, 0x9b41bbad,
+    0xb3f6219a, 0xbd11c94b, 0xae39f038, 0xa0de18e9, 0x16a45527,
+    0x1843bdf6, 0x0b6b8485, 0x058c6c54, 0x2d3bf663, 0x23dc1eb2,
+    0x30f427c1, 0x3e13cf10, 0x619b13af, 0x6f7cfb7e, 0x7c54c20d,
+    0x72b32adc, 0x5a04b0eb, 0x54e3583a, 0x47cb6149, 0x492c8998,
+    0xf8dad837, 0xf63d30e6, 0xe5150995, 0xebf2e144, 0xc3457b73,
+    0xcda293a2, 0xde8aaad1, 0xd06d4200, 0x8fe59ebf, 0x8102766e,
+    0x922a4f1d, 0x9ccda7cc, 0xb47a3dfb, 0xba9dd52a, 0xa9b5ec59,
+    0xa7520488},
+   {0x00000000, 0x3c60e308, 0x78c1c610, 0x44a12518, 0xf1838c20,
+    0xcde36f28, 0x89424a30, 0xb522a938, 0x38761e01, 0x0416fd09,
+    0x40b7d811, 0x7cd73b19, 0xc9f59221, 0xf5957129, 0xb1345431,
+    0x8d54b739, 0x70ec3c02, 0x4c8cdf0a, 0x082dfa12, 0x344d191a,
+    0x816fb022, 0xbd0f532a, 0xf9ae7632, 0xc5ce953a, 0x489a2203,
+    0x74fac10b, 0x305be413, 0x0c3b071b, 0xb919ae23, 0x85794d2b,
+    0xc1d86833, 0xfdb88b3b, 0xe1d87804, 0xddb89b0c, 0x9919be14,
+    0xa5795d1c, 0x105bf424, 0x2c3b172c, 0x689a3234, 0x54fad13c,
+    0xd9ae6605, 0xe5ce850d, 0xa16fa015, 0x9d0f431d, 0x282dea25,
+    0x144d092d, 0x50ec2c35, 0x6c8ccf3d, 0x91344406, 0xad54a70e,
+    0xe9f58216, 0xd595611e, 0x60b7c826, 0x5cd72b2e, 0x18760e36,
+    0x2416ed3e, 0xa9425a07, 0x9522b90f, 0xd1839c17, 0xede37f1f,
+    0x58c1d627, 0x64a1352f, 0x20001037, 0x1c60f33f, 0x18c1f649,
+    0x24a11541, 0x60003059, 0x5c60d351, 0xe9427a69, 0xd5229961,
+    0x9183bc79, 0xade35f71, 0x20b7e848, 0x1cd70b40, 0x58762e58,
+    0x6416cd50, 0xd1346468, 0xed548760, 0xa9f5a278, 0x95954170,
+    0x682dca4b, 0x544d2943, 0x10ec0c5b, 0x2c8cef53, 0x99ae466b,
+    0xa5cea563, 0xe16f807b, 0xdd0f6373, 0x505bd44a, 0x6c3b3742,
+    0x289a125a, 0x14faf152, 0xa1d8586a, 0x9db8bb62, 0xd9199e7a,
+    0xe5797d72, 0xf9198e4d, 0xc5796d45, 0x81d8485d, 0xbdb8ab55,
+    0x089a026d, 0x34fae165, 0x705bc47d, 0x4c3b2775, 0xc16f904c,
+    0xfd0f7344, 0xb9ae565c, 0x85ceb554, 0x30ec1c6c, 0x0c8cff64,
+    0x482dda7c, 0x744d3974, 0x89f5b24f, 0xb5955147, 0xf134745f,
+    0xcd549757, 0x78763e6f, 0x4416dd67, 0x00b7f87f, 0x3cd71b77,
+    0xb183ac4e, 0x8de34f46, 0xc9426a5e, 0xf5228956, 0x4000206e,
+    0x7c60c366, 0x38c1e67e, 0x04a10576, 0x3183ec92, 0x0de30f9a,
+    0x49422a82, 0x7522c98a, 0xc00060b2, 0xfc6083ba, 0xb8c1a6a2,
+    0x84a145aa, 0x09f5f293, 0x3595119b, 0x71343483, 0x4d54d78b,
+    0xf8767eb3, 0xc4169dbb, 0x80b7b8a3, 0xbcd75bab, 0x416fd090,
+    0x7d0f3398, 0x39ae1680, 0x05cef588, 0xb0ec5cb0, 0x8c8cbfb8,
+    0xc82d9aa0, 0xf44d79a8, 0x7919ce91, 0x45792d99, 0x01d80881,
+    0x3db8eb89, 0x889a42b1, 0xb4faa1b9, 0xf05b84a1, 0xcc3b67a9,
+    0xd05b9496, 0xec3b779e, 0xa89a5286, 0x94fab18e, 0x21d818b6,
+    0x1db8fbbe, 0x5919dea6, 0x65793dae, 0xe82d8a97, 0xd44d699f,
+    0x90ec4c87, 0xac8caf8f, 0x19ae06b7, 0x25cee5bf, 0x616fc0a7,
+    0x5d0f23af, 0xa0b7a894, 0x9cd74b9c, 0xd8766e84, 0xe4168d8c,
+    0x513424b4, 0x6d54c7bc, 0x29f5e2a4, 0x159501ac, 0x98c1b695,
+    0xa4a1559d, 0xe0007085, 0xdc60938d, 0x69423ab5, 0x5522d9bd,
+    0x1183fca5, 0x2de31fad, 0x29421adb, 0x1522f9d3, 0x5183dccb,
+    0x6de33fc3, 0xd8c196fb, 0xe4a175f3, 0xa00050eb, 0x9c60b3e3,
+    0x113404da, 0x2d54e7d2, 0x69f5c2ca, 0x559521c2, 0xe0b788fa,
+    0xdcd76bf2, 0x98764eea, 0xa416ade2, 0x59ae26d9, 0x65cec5d1,
+    0x216fe0c9, 0x1d0f03c1, 0xa82daaf9, 0x944d49f1, 0xd0ec6ce9,
+    0xec8c8fe1, 0x61d838d8, 0x5db8dbd0, 0x1919fec8, 0x25791dc0,
+    0x905bb4f8, 0xac3b57f0, 0xe89a72e8, 0xd4fa91e0, 0xc89a62df,
+    0xf4fa81d7, 0xb05ba4cf, 0x8c3b47c7, 0x3919eeff, 0x05790df7,
+    0x41d828ef, 0x7db8cbe7, 0xf0ec7cde, 0xcc8c9fd6, 0x882dbace,
+    0xb44d59c6, 0x016ff0fe, 0x3d0f13f6, 0x79ae36ee, 0x45ced5e6,
+    0xb8765edd, 0x8416bdd5, 0xc0b798cd, 0xfcd77bc5, 0x49f5d2fd,
+    0x759531f5, 0x313414ed, 0x0d54f7e5, 0x800040dc, 0xbc60a3d4,
+    0xf8c186cc, 0xc4a165c4, 0x7183ccfc, 0x4de32ff4, 0x09420aec,
+    0x3522e9e4},
+   {0x00000000, 0x6307d924, 0xc60fb248, 0xa5086b6c, 0x576e62d1,
+    0x3469bbf5, 0x9161d099, 0xf26609bd, 0xaedcc5a2, 0xcddb1c86,
+    0x68d377ea, 0x0bd4aece, 0xf9b2a773, 0x9ab57e57, 0x3fbd153b,
+    0x5cbacc1f, 0x86c88d05, 0xe5cf5421, 0x40c73f4d, 0x23c0e669,
+    0xd1a6efd4, 0xb2a136f0, 0x17a95d9c, 0x74ae84b8, 0x281448a7,
+    0x4b139183, 0xee1bfaef, 0x8d1c23cb, 0x7f7a2a76, 0x1c7df352,
+    0xb975983e, 0xda72411a, 0xd6e01c4b, 0xb5e7c56f, 0x10efae03,
+    0x73e87727, 0x818e7e9a, 0xe289a7be, 0x4781ccd2, 0x248615f6,
+    0x783cd9e9, 0x1b3b00cd, 0xbe336ba1, 0xdd34b285, 0x2f52bb38,
+    0x4c55621c, 0xe95d0970, 0x8a5ad054, 0x5028914e, 0x332f486a,
+    0x96272306, 0xf520fa22, 0x0746f39f, 0x64412abb, 0xc14941d7,
+    0xa24e98f3, 0xfef454ec, 0x9df38dc8, 0x38fbe6a4, 0x5bfc3f80,
+    0xa99a363d, 0xca9def19, 0x6f958475, 0x0c925d51, 0x76b13ed7,
+    0x15b6e7f3, 0xb0be8c9f, 0xd3b955bb, 0x21df5c06, 0x42d88522,
+    0xe7d0ee4e, 0x84d7376a, 0xd86dfb75, 0xbb6a2251, 0x1e62493d,
+    0x7d659019, 0x8f0399a4, 0xec044080, 0x490c2bec, 0x2a0bf2c8,
+    0xf079b3d2, 0x937e6af6, 0x3676019a, 0x5571d8be, 0xa717d103,
+    0xc4100827, 0x6118634b, 0x021fba6f, 0x5ea57670, 0x3da2af54,
+    0x98aac438, 0xfbad1d1c, 0x09cb14a1, 0x6acccd85, 0xcfc4a6e9,
+    0xacc37fcd, 0xa051229c, 0xc356fbb8, 0x665e90d4, 0x055949f0,
+    0xf73f404d, 0x94389969, 0x3130f205, 0x52372b21, 0x0e8de73e,
+    0x6d8a3e1a, 0xc8825576, 0xab858c52, 0x59e385ef, 0x3ae45ccb,
+    0x9fec37a7, 0xfcebee83, 0x2699af99, 0x459e76bd, 0xe0961dd1,
+    0x8391c4f5, 0x71f7cd48, 0x12f0146c, 0xb7f87f00, 0xd4ffa624,
+    0x88456a3b, 0xeb42b31f, 0x4e4ad873, 0x2d4d0157, 0xdf2b08ea,
+    0xbc2cd1ce, 0x1924baa2, 0x7a236386, 0xed627dae, 0x8e65a48a,
+    0x2b6dcfe6, 0x486a16c2, 0xba0c1f7f, 0xd90bc65b, 0x7c03ad37,
+    0x1f047413, 0x43beb80c, 0x20b96128, 0x85b10a44, 0xe6b6d360,
+    0x14d0dadd, 0x77d703f9, 0xd2df6895, 0xb1d8b1b1, 0x6baaf0ab,
+    0x08ad298f, 0xada542e3, 0xcea29bc7, 0x3cc4927a, 0x5fc34b5e,
+    0xfacb2032, 0x99ccf916, 0xc5763509, 0xa671ec2d, 0x03798741,
+    0x607e5e65, 0x921857d8, 0xf11f8efc, 0x5417e590, 0x37103cb4,
+    0x3b8261e5, 0x5885b8c1, 0xfd8dd3ad, 0x9e8a0a89, 0x6cec0334,
+    0x0febda10, 0xaae3b17c, 0xc9e46858, 0x955ea447, 0xf6597d63,
+    0x5351160f, 0x3056cf2b, 0xc230c696, 0xa1371fb2, 0x043f74de,
+    0x6738adfa, 0xbd4aece0, 0xde4d35c4, 0x7b455ea8, 0x1842878c,
+    0xea248e31, 0x89235715, 0x2c2b3c79, 0x4f2ce55d, 0x13962942,
+    0x7091f066, 0xd5999b0a, 0xb69e422e, 0x44f84b93, 0x27ff92b7,
+    0x82f7f9db, 0xe1f020ff, 0x9bd34379, 0xf8d49a5d, 0x5ddcf131,
+    0x3edb2815, 0xccbd21a8, 0xafbaf88c, 0x0ab293e0, 0x69b54ac4,
+    0x350f86db, 0x56085fff, 0xf3003493, 0x9007edb7, 0x6261e40a,
+    0x01663d2e, 0xa46e5642, 0xc7698f66, 0x1d1bce7c, 0x7e1c1758,
+    0xdb147c34, 0xb813a510, 0x4a75acad, 0x29727589, 0x8c7a1ee5,
+    0xef7dc7c1, 0xb3c70bde, 0xd0c0d2fa, 0x75c8b996, 0x16cf60b2,
+    0xe4a9690f, 0x87aeb02b, 0x22a6db47, 0x41a10263, 0x4d335f32,
+    0x2e348616, 0x8b3ced7a, 0xe83b345e, 0x1a5d3de3, 0x795ae4c7,
+    0xdc528fab, 0xbf55568f, 0xe3ef9a90, 0x80e843b4, 0x25e028d8,
+    0x46e7f1fc, 0xb481f841, 0xd7862165, 0x728e4a09, 0x1189932d,
+    0xcbfbd237, 0xa8fc0b13, 0x0df4607f, 0x6ef3b95b, 0x9c95b0e6,
+    0xff9269c2, 0x5a9a02ae, 0x399ddb8a, 0x65271795, 0x0620ceb1,
+    0xa328a5dd, 0xc02f7cf9, 0x32497544, 0x514eac60, 0xf446c70c,
+    0x97411e28},
+   {0x00000000, 0x01b5fd1d, 0x036bfa3a, 0x02de0727, 0x06d7f474,
+    0x07620969, 0x05bc0e4e, 0x0409f353, 0x0dafe8e8, 0x0c1a15f5,
+    0x0ec412d2, 0x0f71efcf, 0x0b781c9c, 0x0acde181, 0x0813e6a6,
+    0x09a61bbb, 0x1b5fd1d0, 0x1aea2ccd, 0x18342bea, 0x1981d6f7,
+    0x1d8825a4, 0x1c3dd8b9, 0x1ee3df9e, 0x1f562283, 0x16f03938,
+    0x1745c425, 0x159bc302, 0x142e3e1f, 0x1027cd4c, 0x11923051,
+    0x134c3776, 0x12f9ca6b, 0x36bfa3a0, 0x370a5ebd, 0x35d4599a,
+    0x3461a487, 0x306857d4, 0x31ddaac9, 0x3303adee, 0x32b650f3,
+    0x3b104b48, 0x3aa5b655, 0x387bb172, 0x39ce4c6f, 0x3dc7bf3c,
+    0x3c724221, 0x3eac4506, 0x3f19b81b, 0x2de07270, 0x2c558f6d,
+    0x2e8b884a, 0x2f3e7557, 0x2b378604, 0x2a827b19, 0x285c7c3e,
+    0x29e98123, 0x204f9a98, 0x21fa6785, 0x232460a2, 0x22919dbf,
+    0x26986eec, 0x272d93f1, 0x25f394d6, 0x244669cb, 0x6d7f4740,
+    0x6ccaba5d, 0x6e14bd7a, 0x6fa14067, 0x6ba8b334, 0x6a1d4e29,
+    0x68c3490e, 0x6976b413, 0x60d0afa8, 0x616552b5, 0x63bb5592,
+    0x620ea88f, 0x66075bdc, 0x67b2a6c1, 0x656ca1e6, 0x64d95cfb,
+    0x76209690, 0x77956b8d, 0x754b6caa, 0x74fe91b7, 0x70f762e4,
+    0x71429ff9, 0x739c98de, 0x722965c3, 0x7b8f7e78, 0x7a3a8365,
+    0x78e48442, 0x7951795f, 0x7d588a0c, 0x7ced7711, 0x7e337036,
+    0x7f868d2b, 0x5bc0e4e0, 0x5a7519fd, 0x58ab1eda, 0x591ee3c7,
+    0x5d171094, 0x5ca2ed89, 0x5e7ceaae, 0x5fc917b3, 0x566f0c08,
+    0x57daf115, 0x5504f632, 0x54b10b2f, 0x50b8f87c, 0x510d0561,
+    0x53d30246, 0x5266ff5b, 0x409f3530, 0x412ac82d, 0x43f4cf0a,
+    0x42413217, 0x4648c144, 0x47fd3c59, 0x45233b7e, 0x4496c663,
+    0x4d30ddd8, 0x4c8520c5, 0x4e5b27e2, 0x4feedaff, 0x4be729ac,
+    0x4a52d4b1, 0x488cd396, 0x49392e8b, 0xdafe8e80, 0xdb4b739d,
+    0xd99574ba, 0xd82089a7, 0xdc297af4, 0xdd9c87e9, 0xdf4280ce,
+    0xdef77dd3, 0xd7516668, 0xd6e49b75, 0xd43a9c52, 0xd58f614f,
+    0xd186921c, 0xd0336f01, 0xd2ed6826, 0xd358953b, 0xc1a15f50,
+    0xc014a24d, 0xc2caa56a, 0xc37f5877, 0xc776ab24, 0xc6c35639,
+    0xc41d511e, 0xc5a8ac03, 0xcc0eb7b8, 0xcdbb4aa5, 0xcf654d82,
+    0xced0b09f, 0xcad943cc, 0xcb6cbed1, 0xc9b2b9f6, 0xc80744eb,
+    0xec412d20, 0xedf4d03d, 0xef2ad71a, 0xee9f2a07, 0xea96d954,
+    0xeb232449, 0xe9fd236e, 0xe848de73, 0xe1eec5c8, 0xe05b38d5,
+    0xe2853ff2, 0xe330c2ef, 0xe73931bc, 0xe68ccca1, 0xe452cb86,
+    0xe5e7369b, 0xf71efcf0, 0xf6ab01ed, 0xf47506ca, 0xf5c0fbd7,
+    0xf1c90884, 0xf07cf599, 0xf2a2f2be, 0xf3170fa3, 0xfab11418,
+    0xfb04e905, 0xf9daee22, 0xf86f133f, 0xfc66e06c, 0xfdd31d71,
+    0xff0d1a56, 0xfeb8e74b, 0xb781c9c0, 0xb63434dd, 0xb4ea33fa,
+    0xb55fcee7, 0xb1563db4, 0xb0e3c0a9, 0xb23dc78e, 0xb3883a93,
+    0xba2e2128, 0xbb9bdc35, 0xb945db12, 0xb8f0260f, 0xbcf9d55c,
+    0xbd4c2841, 0xbf922f66, 0xbe27d27b, 0xacde1810, 0xad6be50d,
+    0xafb5e22a, 0xae001f37, 0xaa09ec64, 0xabbc1179, 0xa962165e,
+    0xa8d7eb43, 0xa171f0f8, 0xa0c40de5, 0xa21a0ac2, 0xa3aff7df,
+    0xa7a6048c, 0xa613f991, 0xa4cdfeb6, 0xa57803ab, 0x813e6a60,
+    0x808b977d, 0x8255905a, 0x83e06d47, 0x87e99e14, 0x865c6309,
+    0x8482642e, 0x85379933, 0x8c918288, 0x8d247f95, 0x8ffa78b2,
+    0x8e4f85af, 0x8a4676fc, 0x8bf38be1, 0x892d8cc6, 0x889871db,
+    0x9a61bbb0, 0x9bd446ad, 0x990a418a, 0x98bfbc97, 0x9cb64fc4,
+    0x9d03b2d9, 0x9fddb5fe, 0x9e6848e3, 0x97ce5358, 0x967bae45,
+    0x94a5a962, 0x9510547f, 0x9119a72c, 0x90ac5a31, 0x92725d16,
+    0x93c7a00b},
+   {0x00000000, 0x6e8c1b41, 0xdd183682, 0xb3942dc3, 0x61416b45,
+    0x0fcd7004, 0xbc595dc7, 0xd2d54686, 0xc282d68a, 0xac0ecdcb,
+    0x1f9ae008, 0x7116fb49, 0xa3c3bdcf, 0xcd4fa68e, 0x7edb8b4d,
+    0x1057900c, 0x5e74ab55, 0x30f8b014, 0x836c9dd7, 0xede08696,
+    0x3f35c010, 0x51b9db51, 0xe22df692, 0x8ca1edd3, 0x9cf67ddf,
+    0xf27a669e, 0x41ee4b5d, 0x2f62501c, 0xfdb7169a, 0x933b0ddb,
+    0x20af2018, 0x4e233b59, 0xbce956aa, 0xd2654deb, 0x61f16028,
+    0x0f7d7b69, 0xdda83def, 0xb32426ae, 0x00b00b6d, 0x6e3c102c,
+    0x7e6b8020, 0x10e79b61, 0xa373b6a2, 0xcdffade3, 0x1f2aeb65,
+    0x71a6f024, 0xc232dde7, 0xacbec6a6, 0xe29dfdff, 0x8c11e6be,
+    0x3f85cb7d, 0x5109d03c, 0x83dc96ba, 0xed508dfb, 0x5ec4a038,
+    0x3048bb79, 0x201f2b75, 0x4e933034, 0xfd071df7, 0x938b06b6,
+    0x415e4030, 0x2fd25b71, 0x9c4676b2, 0xf2ca6df3, 0xa2a3ab15,
+    0xcc2fb054, 0x7fbb9d97, 0x113786d6, 0xc3e2c050, 0xad6edb11,
+    0x1efaf6d2, 0x7076ed93, 0x60217d9f, 0x0ead66de, 0xbd394b1d,
+    0xd3b5505c, 0x016016da, 0x6fec0d9b, 0xdc782058, 0xb2f43b19,
+    0xfcd70040, 0x925b1b01, 0x21cf36c2, 0x4f432d83, 0x9d966b05,
+    0xf31a7044, 0x408e5d87, 0x2e0246c6, 0x3e55d6ca, 0x50d9cd8b,
+    0xe34de048, 0x8dc1fb09, 0x5f14bd8f, 0x3198a6ce, 0x820c8b0d,
+    0xec80904c, 0x1e4afdbf, 0x70c6e6fe, 0xc352cb3d, 0xadded07c,
+    0x7f0b96fa, 0x11878dbb, 0xa213a078, 0xcc9fbb39, 0xdcc82b35,
+    0xb2443074, 0x01d01db7, 0x6f5c06f6, 0xbd894070, 0xd3055b31,
+    0x609176f2, 0x0e1d6db3, 0x403e56ea, 0x2eb24dab, 0x9d266068,
+    0xf3aa7b29, 0x217f3daf, 0x4ff326ee, 0xfc670b2d, 0x92eb106c,
+    0x82bc8060, 0xec309b21, 0x5fa4b6e2, 0x3128ada3, 0xe3fdeb25,
+    0x8d71f064, 0x3ee5dda7, 0x5069c6e6, 0x9e36506b, 0xf0ba4b2a,
+    0x432e66e9, 0x2da27da8, 0xff773b2e, 0x91fb206f, 0x226f0dac,
+    0x4ce316ed, 0x5cb486e1, 0x32389da0, 0x81acb063, 0xef20ab22,
+    0x3df5eda4, 0x5379f6e5, 0xe0eddb26, 0x8e61c067, 0xc042fb3e,
+    0xaecee07f, 0x1d5acdbc, 0x73d6d6fd, 0xa103907b, 0xcf8f8b3a,
+    0x7c1ba6f9, 0x1297bdb8, 0x02c02db4, 0x6c4c36f5, 0xdfd81b36,
+    0xb1540077, 0x638146f1, 0x0d0d5db0, 0xbe997073, 0xd0156b32,
+    0x22df06c1, 0x4c531d80, 0xffc73043, 0x914b2b02, 0x439e6d84,
+    0x2d1276c5, 0x9e865b06, 0xf00a4047, 0xe05dd04b, 0x8ed1cb0a,
+    0x3d45e6c9, 0x53c9fd88, 0x811cbb0e, 0xef90a04f, 0x5c048d8c,
+    0x328896cd, 0x7cabad94, 0x1227b6d5, 0xa1b39b16, 0xcf3f8057,
+    0x1deac6d1, 0x7366dd90, 0xc0f2f053, 0xae7eeb12, 0xbe297b1e,
+    0xd0a5605f, 0x63314d9c, 0x0dbd56dd, 0xdf68105b, 0xb1e40b1a,
+    0x027026d9, 0x6cfc3d98, 0x3c95fb7e, 0x5219e03f, 0xe18dcdfc,
+    0x8f01d6bd, 0x5dd4903b, 0x33588b7a, 0x80cca6b9, 0xee40bdf8,
+    0xfe172df4, 0x909b36b5, 0x230f1b76, 0x4d830037, 0x9f5646b1,
+    0xf1da5df0, 0x424e7033, 0x2cc26b72, 0x62e1502b, 0x0c6d4b6a,
+    0xbff966a9, 0xd1757de8, 0x03a03b6e, 0x6d2c202f, 0xdeb80dec,
+    0xb03416ad, 0xa06386a1, 0xceef9de0, 0x7d7bb023, 0x13f7ab62,
+    0xc122ede4, 0xafaef6a5, 0x1c3adb66, 0x72b6c027, 0x807cadd4,
+    0xeef0b695, 0x5d649b56, 0x33e88017, 0xe13dc691, 0x8fb1ddd0,
+    0x3c25f013, 0x52a9eb52, 0x42fe7b5e, 0x2c72601f, 0x9fe64ddc,
+    0xf16a569d, 0x23bf101b, 0x4d330b5a, 0xfea72699, 0x902b3dd8,
+    0xde080681, 0xb0841dc0, 0x03103003, 0x6d9c2b42, 0xbf496dc4,
+    0xd1c57685, 0x62515b46, 0x0cdd4007, 0x1c8ad00b, 0x7206cb4a,
+    0xc192e689, 0xaf1efdc8, 0x7dcbbb4e, 0x1347a00f, 0xa0d38dcc,
+    0xce5f968d},
+   {0x00000000, 0xe71da697, 0x154a4b6f, 0xf257edf8, 0x2a9496de,
+    0xcd893049, 0x3fdeddb1, 0xd8c37b26, 0x55292dbc, 0xb2348b2b,
+    0x406366d3, 0xa77ec044, 0x7fbdbb62, 0x98a01df5, 0x6af7f00d,
+    0x8dea569a, 0xaa525b78, 0x4d4ffdef, 0xbf181017, 0x5805b680,
+    0x80c6cda6, 0x67db6b31, 0x958c86c9, 0x7291205e, 0xff7b76c4,
+    0x1866d053, 0xea313dab, 0x0d2c9b3c, 0xd5efe01a, 0x32f2468d,
+    0xc0a5ab75, 0x27b80de2, 0x8fd5b0b1, 0x68c81626, 0x9a9ffbde,
+    0x7d825d49, 0xa541266f, 0x425c80f8, 0xb00b6d00, 0x5716cb97,
+    0xdafc9d0d, 0x3de13b9a, 0xcfb6d662, 0x28ab70f5, 0xf0680bd3,
+    0x1775ad44, 0xe52240bc, 0x023fe62b, 0x2587ebc9, 0xc29a4d5e,
+    0x30cda0a6, 0xd7d00631, 0x0f137d17, 0xe80edb80, 0x1a593678,
+    0xfd4490ef, 0x70aec675, 0x97b360e2, 0x65e48d1a, 0x82f92b8d,
+    0x5a3a50ab, 0xbd27f63c, 0x4f701bc4, 0xa86dbd53, 0xc4da6723,
+    0x23c7c1b4, 0xd1902c4c, 0x368d8adb, 0xee4ef1fd, 0x0953576a,
+    0xfb04ba92, 0x1c191c05, 0x91f34a9f, 0x76eeec08, 0x84b901f0,
+    0x63a4a767, 0xbb67dc41, 0x5c7a7ad6, 0xae2d972e, 0x493031b9,
+    0x6e883c5b, 0x89959acc, 0x7bc27734, 0x9cdfd1a3, 0x441caa85,
+    0xa3010c12, 0x5156e1ea, 0xb64b477d, 0x3ba111e7, 0xdcbcb770,
+    0x2eeb5a88, 0xc9f6fc1f, 0x11358739, 0xf62821ae, 0x047fcc56,
+    0xe3626ac1, 0x4b0fd792, 0xac127105, 0x5e459cfd, 0xb9583a6a,
+    0x619b414c, 0x8686e7db, 0x74d10a23, 0x93ccacb4, 0x1e26fa2e,
+    0xf93b5cb9, 0x0b6cb141, 0xec7117d6, 0x34b26cf0, 0xd3afca67,
+    0x21f8279f, 0xc6e58108, 0xe15d8cea, 0x06402a7d, 0xf417c785,
+    0x130a6112, 0xcbc91a34, 0x2cd4bca3, 0xde83515b, 0x399ef7cc,
+    0xb474a156, 0x536907c1, 0xa13eea39, 0x46234cae, 0x9ee03788,
+    0x79fd911f, 0x8baa7ce7, 0x6cb7da70, 0x52c5c807, 0xb5d86e90,
+    0x478f8368, 0xa09225ff, 0x78515ed9, 0x9f4cf84e, 0x6d1b15b6,
+    0x8a06b321, 0x07ece5bb, 0xe0f1432c, 0x12a6aed4, 0xf5bb0843,
+    0x2d787365, 0xca65d5f2, 0x3832380a, 0xdf2f9e9d, 0xf897937f,
+    0x1f8a35e8, 0xedddd810, 0x0ac07e87, 0xd20305a1, 0x351ea336,
+    0xc7494ece, 0x2054e859, 0xadbebec3, 0x4aa31854, 0xb8f4f5ac,
+    0x5fe9533b, 0x872a281d, 0x60378e8a, 0x92606372, 0x757dc5e5,
+    0xdd1078b6, 0x3a0dde21, 0xc85a33d9, 0x2f47954e, 0xf784ee68,
+    0x109948ff, 0xe2cea507, 0x05d30390, 0x8839550a, 0x6f24f39d,
+    0x9d731e65, 0x7a6eb8f2, 0xa2adc3d4, 0x45b06543, 0xb7e788bb,
+    0x50fa2e2c, 0x774223ce, 0x905f8559, 0x620868a1, 0x8515ce36,
+    0x5dd6b510, 0xbacb1387, 0x489cfe7f, 0xaf8158e8, 0x226b0e72,
+    0xc576a8e5, 0x3721451d, 0xd03ce38a, 0x08ff98ac, 0xefe23e3b,
+    0x1db5d3c3, 0xfaa87554, 0x961faf24, 0x710209b3, 0x8355e44b,
+    0x644842dc, 0xbc8b39fa, 0x5b969f6d, 0xa9c17295, 0x4edcd402,
+    0xc3368298, 0x242b240f, 0xd67cc9f7, 0x31616f60, 0xe9a21446,
+    0x0ebfb2d1, 0xfce85f29, 0x1bf5f9be, 0x3c4df45c, 0xdb5052cb,
+    0x2907bf33, 0xce1a19a4, 0x16d96282, 0xf1c4c415, 0x039329ed,
+    0xe48e8f7a, 0x6964d9e0, 0x8e797f77, 0x7c2e928f, 0x9b333418,
+    0x43f04f3e, 0xa4ede9a9, 0x56ba0451, 0xb1a7a2c6, 0x19ca1f95,
+    0xfed7b902, 0x0c8054fa, 0xeb9df26d, 0x335e894b, 0xd4432fdc,
+    0x2614c224, 0xc10964b3, 0x4ce33229, 0xabfe94be, 0x59a97946,
+    0xbeb4dfd1, 0x6677a4f7, 0x816a0260, 0x733def98, 0x9420490f,
+    0xb39844ed, 0x5485e27a, 0xa6d20f82, 0x41cfa915, 0x990cd233,
+    0x7e1174a4, 0x8c46995c, 0x6b5b3fcb, 0xe6b16951, 0x01accfc6,
+    0xf3fb223e, 0x14e684a9, 0xcc25ff8f, 0x2b385918, 0xd96fb4e0,
+    0x3e721277},
+   {0x00000000, 0xa58b900e, 0x9066265d, 0x35edb653, 0xfbbd4afb,
+    0x5e36daf5, 0x6bdb6ca6, 0xce50fca8, 0x2c0b93b7, 0x898003b9,
+    0xbc6db5ea, 0x19e625e4, 0xd7b6d94c, 0x723d4942, 0x47d0ff11,
+    0xe25b6f1f, 0x5817276e, 0xfd9cb760, 0xc8710133, 0x6dfa913d,
+    0xa3aa6d95, 0x0621fd9b, 0x33cc4bc8, 0x9647dbc6, 0x741cb4d9,
+    0xd19724d7, 0xe47a9284, 0x41f1028a, 0x8fa1fe22, 0x2a2a6e2c,
+    0x1fc7d87f, 0xba4c4871, 0xb02e4edc, 0x15a5ded2, 0x20486881,
+    0x85c3f88f, 0x4b930427, 0xee189429, 0xdbf5227a, 0x7e7eb274,
+    0x9c25dd6b, 0x39ae4d65, 0x0c43fb36, 0xa9c86b38, 0x67989790,
+    0xc213079e, 0xf7feb1cd, 0x527521c3, 0xe83969b2, 0x4db2f9bc,
+    0x785f4fef, 0xddd4dfe1, 0x13842349, 0xb60fb347, 0x83e20514,
+    0x2669951a, 0xc432fa05, 0x61b96a0b, 0x5454dc58, 0xf1df4c56,
+    0x3f8fb0fe, 0x9a0420f0, 0xafe996a3, 0x0a6206ad, 0xbb2d9bf9,
+    0x1ea60bf7, 0x2b4bbda4, 0x8ec02daa, 0x4090d102, 0xe51b410c,
+    0xd0f6f75f, 0x757d6751, 0x9726084e, 0x32ad9840, 0x07402e13,
+    0xa2cbbe1d, 0x6c9b42b5, 0xc910d2bb, 0xfcfd64e8, 0x5976f4e6,
+    0xe33abc97, 0x46b12c99, 0x735c9aca, 0xd6d70ac4, 0x1887f66c,
+    0xbd0c6662, 0x88e1d031, 0x2d6a403f, 0xcf312f20, 0x6ababf2e,
+    0x5f57097d, 0xfadc9973, 0x348c65db, 0x9107f5d5, 0xa4ea4386,
+    0x0161d388, 0x0b03d525, 0xae88452b, 0x9b65f378, 0x3eee6376,
+    0xf0be9fde, 0x55350fd0, 0x60d8b983, 0xc553298d, 0x27084692,
+    0x8283d69c, 0xb76e60cf, 0x12e5f0c1, 0xdcb50c69, 0x793e9c67,
+    0x4cd32a34, 0xe958ba3a, 0x5314f24b, 0xf69f6245, 0xc372d416,
+    0x66f94418, 0xa8a9b8b0, 0x0d2228be, 0x38cf9eed, 0x9d440ee3,
+    0x7f1f61fc, 0xda94f1f2, 0xef7947a1, 0x4af2d7af, 0x84a22b07,
+    0x2129bb09, 0x14c40d5a, 0xb14f9d54, 0xad2a31b3, 0x08a1a1bd,
+    0x3d4c17ee, 0x98c787e0, 0x56977b48, 0xf31ceb46, 0xc6f15d15,
+    0x637acd1b, 0x8121a204, 0x24aa320a, 0x11478459, 0xb4cc1457,
+    0x7a9ce8ff, 0xdf1778f1, 0xeafacea2, 0x4f715eac, 0xf53d16dd,
+    0x50b686d3, 0x655b3080, 0xc0d0a08e, 0x0e805c26, 0xab0bcc28,
+    0x9ee67a7b, 0x3b6dea75, 0xd936856a, 0x7cbd1564, 0x4950a337,
+    0xecdb3339, 0x228bcf91, 0x87005f9f, 0xb2ede9cc, 0x176679c2,
+    0x1d047f6f, 0xb88fef61, 0x8d625932, 0x28e9c93c, 0xe6b93594,
+    0x4332a59a, 0x76df13c9, 0xd35483c7, 0x310fecd8, 0x94847cd6,
+    0xa169ca85, 0x04e25a8b, 0xcab2a623, 0x6f39362d, 0x5ad4807e,
+    0xff5f1070, 0x45135801, 0xe098c80f, 0xd5757e5c, 0x70feee52,
+    0xbeae12fa, 0x1b2582f4, 0x2ec834a7, 0x8b43a4a9, 0x6918cbb6,
+    0xcc935bb8, 0xf97eedeb, 0x5cf57de5, 0x92a5814d, 0x372e1143,
+    0x02c3a710, 0xa748371e, 0x1607aa4a, 0xb38c3a44, 0x86618c17,
+    0x23ea1c19, 0xedbae0b1, 0x483170bf, 0x7ddcc6ec, 0xd85756e2,
+    0x3a0c39fd, 0x9f87a9f3, 0xaa6a1fa0, 0x0fe18fae, 0xc1b17306,
+    0x643ae308, 0x51d7555b, 0xf45cc555, 0x4e108d24, 0xeb9b1d2a,
+    0xde76ab79, 0x7bfd3b77, 0xb5adc7df, 0x102657d1, 0x25cbe182,
+    0x8040718c, 0x621b1e93, 0xc7908e9d, 0xf27d38ce, 0x57f6a8c0,
+    0x99a65468, 0x3c2dc466, 0x09c07235, 0xac4be23b, 0xa629e496,
+    0x03a27498, 0x364fc2cb, 0x93c452c5, 0x5d94ae6d, 0xf81f3e63,
+    0xcdf28830, 0x6879183e, 0x8a227721, 0x2fa9e72f, 0x1a44517c,
+    0xbfcfc172, 0x719f3dda, 0xd414add4, 0xe1f91b87, 0x44728b89,
+    0xfe3ec3f8, 0x5bb553f6, 0x6e58e5a5, 0xcbd375ab, 0x05838903,
+    0xa008190d, 0x95e5af5e, 0x306e3f50, 0xd235504f, 0x77bec041,
+    0x42537612, 0xe7d8e61c, 0x29881ab4, 0x8c038aba, 0xb9ee3ce9,
+    0x1c65ace7}};
+
+static const z_word_t crc_braid_big_table[][256] = {
+   {0x0000000000000000, 0x0e908ba500000000, 0x5d26669000000000,
+    0x53b6ed3500000000, 0xfb4abdfb00000000, 0xf5da365e00000000,
+    0xa66cdb6b00000000, 0xa8fc50ce00000000, 0xb7930b2c00000000,
+    0xb903808900000000, 0xeab56dbc00000000, 0xe425e61900000000,
+    0x4cd9b6d700000000, 0x42493d7200000000, 0x11ffd04700000000,
+    0x1f6f5be200000000, 0x6e27175800000000, 0x60b79cfd00000000,
+    0x330171c800000000, 0x3d91fa6d00000000, 0x956daaa300000000,
+    0x9bfd210600000000, 0xc84bcc3300000000, 0xc6db479600000000,
+    0xd9b41c7400000000, 0xd72497d100000000, 0x84927ae400000000,
+    0x8a02f14100000000, 0x22fea18f00000000, 0x2c6e2a2a00000000,
+    0x7fd8c71f00000000, 0x71484cba00000000, 0xdc4e2eb000000000,
+    0xd2dea51500000000, 0x8168482000000000, 0x8ff8c38500000000,
+    0x2704934b00000000, 0x299418ee00000000, 0x7a22f5db00000000,
+    0x74b27e7e00000000, 0x6bdd259c00000000, 0x654dae3900000000,
+    0x36fb430c00000000, 0x386bc8a900000000, 0x9097986700000000,
+    0x9e0713c200000000, 0xcdb1fef700000000, 0xc321755200000000,
+    0xb26939e800000000, 0xbcf9b24d00000000, 0xef4f5f7800000000,
+    0xe1dfd4dd00000000, 0x4923841300000000, 0x47b30fb600000000,
+    0x1405e28300000000, 0x1a95692600000000, 0x05fa32c400000000,
+    0x0b6ab96100000000, 0x58dc545400000000, 0x564cdff100000000,
+    0xfeb08f3f00000000, 0xf020049a00000000, 0xa396e9af00000000,
+    0xad06620a00000000, 0xf99b2dbb00000000, 0xf70ba61e00000000,
+    0xa4bd4b2b00000000, 0xaa2dc08e00000000, 0x02d1904000000000,
+    0x0c411be500000000, 0x5ff7f6d000000000, 0x51677d7500000000,
+    0x4e08269700000000, 0x4098ad3200000000, 0x132e400700000000,
+    0x1dbecba200000000, 0xb5429b6c00000000, 0xbbd210c900000000,
+    0xe864fdfc00000000, 0xe6f4765900000000, 0x97bc3ae300000000,
+    0x992cb14600000000, 0xca9a5c7300000000, 0xc40ad7d600000000,
+    0x6cf6871800000000, 0x62660cbd00000000, 0x31d0e18800000000,
+    0x3f406a2d00000000, 0x202f31cf00000000, 0x2ebfba6a00000000,
+    0x7d09575f00000000, 0x7399dcfa00000000, 0xdb658c3400000000,
+    0xd5f5079100000000, 0x8643eaa400000000, 0x88d3610100000000,
+    0x25d5030b00000000, 0x2b4588ae00000000, 0x78f3659b00000000,
+    0x7663ee3e00000000, 0xde9fbef000000000, 0xd00f355500000000,
+    0x83b9d86000000000, 0x8d2953c500000000, 0x9246082700000000,
+    0x9cd6838200000000, 0xcf606eb700000000, 0xc1f0e51200000000,
+    0x690cb5dc00000000, 0x679c3e7900000000, 0x342ad34c00000000,
+    0x3aba58e900000000, 0x4bf2145300000000, 0x45629ff600000000,
+    0x16d472c300000000, 0x1844f96600000000, 0xb0b8a9a800000000,
+    0xbe28220d00000000, 0xed9ecf3800000000, 0xe30e449d00000000,
+    0xfc611f7f00000000, 0xf2f194da00000000, 0xa14779ef00000000,
+    0xafd7f24a00000000, 0x072ba28400000000, 0x09bb292100000000,
+    0x5a0dc41400000000, 0x549d4fb100000000, 0xb3312aad00000000,
+    0xbda1a10800000000, 0xee174c3d00000000, 0xe087c79800000000,
+    0x487b975600000000, 0x46eb1cf300000000, 0x155df1c600000000,
+    0x1bcd7a6300000000, 0x04a2218100000000, 0x0a32aa2400000000,
+    0x5984471100000000, 0x5714ccb400000000, 0xffe89c7a00000000,
+    0xf17817df00000000, 0xa2cefaea00000000, 0xac5e714f00000000,
+    0xdd163df500000000, 0xd386b65000000000, 0x80305b6500000000,
+    0x8ea0d0c000000000, 0x265c800e00000000, 0x28cc0bab00000000,
+    0x7b7ae69e00000000, 0x75ea6d3b00000000, 0x6a8536d900000000,
+    0x6415bd7c00000000, 0x37a3504900000000, 0x3933dbec00000000,
+    0x91cf8b2200000000, 0x9f5f008700000000, 0xcce9edb200000000,
+    0xc279661700000000, 0x6f7f041d00000000, 0x61ef8fb800000000,
+    0x3259628d00000000, 0x3cc9e92800000000, 0x9435b9e600000000,
+    0x9aa5324300000000, 0xc913df7600000000, 0xc78354d300000000,
+    0xd8ec0f3100000000, 0xd67c849400000000, 0x85ca69a100000000,
+    0x8b5ae20400000000, 0x23a6b2ca00000000, 0x2d36396f00000000,
+    0x7e80d45a00000000, 0x70105fff00000000, 0x0158134500000000,
+    0x0fc898e000000000, 0x5c7e75d500000000, 0x52eefe7000000000,
+    0xfa12aebe00000000, 0xf482251b00000000, 0xa734c82e00000000,
+    0xa9a4438b00000000, 0xb6cb186900000000, 0xb85b93cc00000000,
+    0xebed7ef900000000, 0xe57df55c00000000, 0x4d81a59200000000,
+    0x43112e3700000000, 0x10a7c30200000000, 0x1e3748a700000000,
+    0x4aaa071600000000, 0x443a8cb300000000, 0x178c618600000000,
+    0x191cea2300000000, 0xb1e0baed00000000, 0xbf70314800000000,
+    0xecc6dc7d00000000, 0xe25657d800000000, 0xfd390c3a00000000,
+    0xf3a9879f00000000, 0xa01f6aaa00000000, 0xae8fe10f00000000,
+    0x0673b1c100000000, 0x08e33a6400000000, 0x5b55d75100000000,
+    0x55c55cf400000000, 0x248d104e00000000, 0x2a1d9beb00000000,
+    0x79ab76de00000000, 0x773bfd7b00000000, 0xdfc7adb500000000,
+    0xd157261000000000, 0x82e1cb2500000000, 0x8c71408000000000,
+    0x931e1b6200000000, 0x9d8e90c700000000, 0xce387df200000000,
+    0xc0a8f65700000000, 0x6854a69900000000, 0x66c42d3c00000000,
+    0x3572c00900000000, 0x3be24bac00000000, 0x96e429a600000000,
+    0x9874a20300000000, 0xcbc24f3600000000, 0xc552c49300000000,
+    0x6dae945d00000000, 0x633e1ff800000000, 0x3088f2cd00000000,
+    0x3e18796800000000, 0x2177228a00000000, 0x2fe7a92f00000000,
+    0x7c51441a00000000, 0x72c1cfbf00000000, 0xda3d9f7100000000,
+    0xd4ad14d400000000, 0x871bf9e100000000, 0x898b724400000000,
+    0xf8c33efe00000000, 0xf653b55b00000000, 0xa5e5586e00000000,
+    0xab75d3cb00000000, 0x0389830500000000, 0x0d1908a000000000,
+    0x5eafe59500000000, 0x503f6e3000000000, 0x4f5035d200000000,
+    0x41c0be7700000000, 0x1276534200000000, 0x1ce6d8e700000000,
+    0xb41a882900000000, 0xba8a038c00000000, 0xe93ceeb900000000,
+    0xe7ac651c00000000},
+   {0x0000000000000000, 0x97a61de700000000, 0x6f4b4a1500000000,
+    0xf8ed57f200000000, 0xde96942a00000000, 0x493089cd00000000,
+    0xb1ddde3f00000000, 0x267bc3d800000000, 0xbc2d295500000000,
+    0x2b8b34b200000000, 0xd366634000000000, 0x44c07ea700000000,
+    0x62bbbd7f00000000, 0xf51da09800000000, 0x0df0f76a00000000,
+    0x9a56ea8d00000000, 0x785b52aa00000000, 0xeffd4f4d00000000,
+    0x171018bf00000000, 0x80b6055800000000, 0xa6cdc68000000000,
+    0x316bdb6700000000, 0xc9868c9500000000, 0x5e20917200000000,
+    0xc4767bff00000000, 0x53d0661800000000, 0xab3d31ea00000000,
+    0x3c9b2c0d00000000, 0x1ae0efd500000000, 0x8d46f23200000000,
+    0x75aba5c000000000, 0xe20db82700000000, 0xb1b0d58f00000000,
+    0x2616c86800000000, 0xdefb9f9a00000000, 0x495d827d00000000,
+    0x6f2641a500000000, 0xf8805c4200000000, 0x006d0bb000000000,
+    0x97cb165700000000, 0x0d9dfcda00000000, 0x9a3be13d00000000,
+    0x62d6b6cf00000000, 0xf570ab2800000000, 0xd30b68f000000000,
+    0x44ad751700000000, 0xbc4022e500000000, 0x2be63f0200000000,
+    0xc9eb872500000000, 0x5e4d9ac200000000, 0xa6a0cd3000000000,
+    0x3106d0d700000000, 0x177d130f00000000, 0x80db0ee800000000,
+    0x7836591a00000000, 0xef9044fd00000000, 0x75c6ae7000000000,
+    0xe260b39700000000, 0x1a8de46500000000, 0x8d2bf98200000000,
+    0xab503a5a00000000, 0x3cf627bd00000000, 0xc41b704f00000000,
+    0x53bd6da800000000, 0x2367dac400000000, 0xb4c1c72300000000,
+    0x4c2c90d100000000, 0xdb8a8d3600000000, 0xfdf14eee00000000,
+    0x6a57530900000000, 0x92ba04fb00000000, 0x051c191c00000000,
+    0x9f4af39100000000, 0x08ecee7600000000, 0xf001b98400000000,
+    0x67a7a46300000000, 0x41dc67bb00000000, 0xd67a7a5c00000000,
+    0x2e972dae00000000, 0xb931304900000000, 0x5b3c886e00000000,
+    0xcc9a958900000000, 0x3477c27b00000000, 0xa3d1df9c00000000,
+    0x85aa1c4400000000, 0x120c01a300000000, 0xeae1565100000000,
+    0x7d474bb600000000, 0xe711a13b00000000, 0x70b7bcdc00000000,
+    0x885aeb2e00000000, 0x1ffcf6c900000000, 0x3987351100000000,
+    0xae2128f600000000, 0x56cc7f0400000000, 0xc16a62e300000000,
+    0x92d70f4b00000000, 0x057112ac00000000, 0xfd9c455e00000000,
+    0x6a3a58b900000000, 0x4c419b6100000000, 0xdbe7868600000000,
+    0x230ad17400000000, 0xb4accc9300000000, 0x2efa261e00000000,
+    0xb95c3bf900000000, 0x41b16c0b00000000, 0xd61771ec00000000,
+    0xf06cb23400000000, 0x67caafd300000000, 0x9f27f82100000000,
+    0x0881e5c600000000, 0xea8c5de100000000, 0x7d2a400600000000,
+    0x85c717f400000000, 0x12610a1300000000, 0x341ac9cb00000000,
+    0xa3bcd42c00000000, 0x5b5183de00000000, 0xccf79e3900000000,
+    0x56a174b400000000, 0xc107695300000000, 0x39ea3ea100000000,
+    0xae4c234600000000, 0x8837e09e00000000, 0x1f91fd7900000000,
+    0xe77caa8b00000000, 0x70dab76c00000000, 0x07c8c55200000000,
+    0x906ed8b500000000, 0x68838f4700000000, 0xff2592a000000000,
+    0xd95e517800000000, 0x4ef84c9f00000000, 0xb6151b6d00000000,
+    0x21b3068a00000000, 0xbbe5ec0700000000, 0x2c43f1e000000000,
+    0xd4aea61200000000, 0x4308bbf500000000, 0x6573782d00000000,
+    0xf2d565ca00000000, 0x0a38323800000000, 0x9d9e2fdf00000000,
+    0x7f9397f800000000, 0xe8358a1f00000000, 0x10d8dded00000000,
+    0x877ec00a00000000, 0xa10503d200000000, 0x36a31e3500000000,
+    0xce4e49c700000000, 0x59e8542000000000, 0xc3bebead00000000,
+    0x5418a34a00000000, 0xacf5f4b800000000, 0x3b53e95f00000000,
+    0x1d282a8700000000, 0x8a8e376000000000, 0x7263609200000000,
+    0xe5c57d7500000000, 0xb67810dd00000000, 0x21de0d3a00000000,
+    0xd9335ac800000000, 0x4e95472f00000000, 0x68ee84f700000000,
+    0xff48991000000000, 0x07a5cee200000000, 0x9003d30500000000,
+    0x0a55398800000000, 0x9df3246f00000000, 0x651e739d00000000,
+    0xf2b86e7a00000000, 0xd4c3ada200000000, 0x4365b04500000000,
+    0xbb88e7b700000000, 0x2c2efa5000000000, 0xce23427700000000,
+    0x59855f9000000000, 0xa168086200000000, 0x36ce158500000000,
+    0x10b5d65d00000000, 0x8713cbba00000000, 0x7ffe9c4800000000,
+    0xe85881af00000000, 0x720e6b2200000000, 0xe5a876c500000000,
+    0x1d45213700000000, 0x8ae33cd000000000, 0xac98ff0800000000,
+    0x3b3ee2ef00000000, 0xc3d3b51d00000000, 0x5475a8fa00000000,
+    0x24af1f9600000000, 0xb309027100000000, 0x4be4558300000000,
+    0xdc42486400000000, 0xfa398bbc00000000, 0x6d9f965b00000000,
+    0x9572c1a900000000, 0x02d4dc4e00000000, 0x988236c300000000,
+    0x0f242b2400000000, 0xf7c97cd600000000, 0x606f613100000000,
+    0x4614a2e900000000, 0xd1b2bf0e00000000, 0x295fe8fc00000000,
+    0xbef9f51b00000000, 0x5cf44d3c00000000, 0xcb5250db00000000,
+    0x33bf072900000000, 0xa4191ace00000000, 0x8262d91600000000,
+    0x15c4c4f100000000, 0xed29930300000000, 0x7a8f8ee400000000,
+    0xe0d9646900000000, 0x777f798e00000000, 0x8f922e7c00000000,
+    0x1834339b00000000, 0x3e4ff04300000000, 0xa9e9eda400000000,
+    0x5104ba5600000000, 0xc6a2a7b100000000, 0x951fca1900000000,
+    0x02b9d7fe00000000, 0xfa54800c00000000, 0x6df29deb00000000,
+    0x4b895e3300000000, 0xdc2f43d400000000, 0x24c2142600000000,
+    0xb36409c100000000, 0x2932e34c00000000, 0xbe94feab00000000,
+    0x4679a95900000000, 0xd1dfb4be00000000, 0xf7a4776600000000,
+    0x60026a8100000000, 0x98ef3d7300000000, 0x0f49209400000000,
+    0xed4498b300000000, 0x7ae2855400000000, 0x820fd2a600000000,
+    0x15a9cf4100000000, 0x33d20c9900000000, 0xa474117e00000000,
+    0x5c99468c00000000, 0xcb3f5b6b00000000, 0x5169b1e600000000,
+    0xc6cfac0100000000, 0x3e22fbf300000000, 0xa984e61400000000,
+    0x8fff25cc00000000, 0x1859382b00000000, 0xe0b46fd900000000,
+    0x7712723e00000000},
+   {0x0000000000000000, 0x411b8c6e00000000, 0x823618dd00000000,
+    0xc32d94b300000000, 0x456b416100000000, 0x0470cd0f00000000,
+    0xc75d59bc00000000, 0x8646d5d200000000, 0x8ad682c200000000,
+    0xcbcd0eac00000000, 0x08e09a1f00000000, 0x49fb167100000000,
+    0xcfbdc3a300000000, 0x8ea64fcd00000000, 0x4d8bdb7e00000000,
+    0x0c90571000000000, 0x55ab745e00000000, 0x14b0f83000000000,
+    0xd79d6c8300000000, 0x9686e0ed00000000, 0x10c0353f00000000,
+    0x51dbb95100000000, 0x92f62de200000000, 0xd3eda18c00000000,
+    0xdf7df69c00000000, 0x9e667af200000000, 0x5d4bee4100000000,
+    0x1c50622f00000000, 0x9a16b7fd00000000, 0xdb0d3b9300000000,
+    0x1820af2000000000, 0x593b234e00000000, 0xaa56e9bc00000000,
+    0xeb4d65d200000000, 0x2860f16100000000, 0x697b7d0f00000000,
+    0xef3da8dd00000000, 0xae2624b300000000, 0x6d0bb00000000000,
+    0x2c103c6e00000000, 0x20806b7e00000000, 0x619be71000000000,
+    0xa2b673a300000000, 0xe3adffcd00000000, 0x65eb2a1f00000000,
+    0x24f0a67100000000, 0xe7dd32c200000000, 0xa6c6beac00000000,
+    0xfffd9de200000000, 0xbee6118c00000000, 0x7dcb853f00000000,
+    0x3cd0095100000000, 0xba96dc8300000000, 0xfb8d50ed00000000,
+    0x38a0c45e00000000, 0x79bb483000000000, 0x752b1f2000000000,
+    0x3430934e00000000, 0xf71d07fd00000000, 0xb6068b9300000000,
+    0x30405e4100000000, 0x715bd22f00000000, 0xb276469c00000000,
+    0xf36dcaf200000000, 0x15aba3a200000000, 0x54b02fcc00000000,
+    0x979dbb7f00000000, 0xd686371100000000, 0x50c0e2c300000000,
+    0x11db6ead00000000, 0xd2f6fa1e00000000, 0x93ed767000000000,
+    0x9f7d216000000000, 0xde66ad0e00000000, 0x1d4b39bd00000000,
+    0x5c50b5d300000000, 0xda16600100000000, 0x9b0dec6f00000000,
+    0x582078dc00000000, 0x193bf4b200000000, 0x4000d7fc00000000,
+    0x011b5b9200000000, 0xc236cf2100000000, 0x832d434f00000000,
+    0x056b969d00000000, 0x44701af300000000, 0x875d8e4000000000,
+    0xc646022e00000000, 0xcad6553e00000000, 0x8bcdd95000000000,
+    0x48e04de300000000, 0x09fbc18d00000000, 0x8fbd145f00000000,
+    0xcea6983100000000, 0x0d8b0c8200000000, 0x4c9080ec00000000,
+    0xbffd4a1e00000000, 0xfee6c67000000000, 0x3dcb52c300000000,
+    0x7cd0dead00000000, 0xfa960b7f00000000, 0xbb8d871100000000,
+    0x78a013a200000000, 0x39bb9fcc00000000, 0x352bc8dc00000000,
+    0x743044b200000000, 0xb71dd00100000000, 0xf6065c6f00000000,
+    0x704089bd00000000, 0x315b05d300000000, 0xf276916000000000,
+    0xb36d1d0e00000000, 0xea563e4000000000, 0xab4db22e00000000,
+    0x6860269d00000000, 0x297baaf300000000, 0xaf3d7f2100000000,
+    0xee26f34f00000000, 0x2d0b67fc00000000, 0x6c10eb9200000000,
+    0x6080bc8200000000, 0x219b30ec00000000, 0xe2b6a45f00000000,
+    0xa3ad283100000000, 0x25ebfde300000000, 0x64f0718d00000000,
+    0xa7dde53e00000000, 0xe6c6695000000000, 0x6b50369e00000000,
+    0x2a4bbaf000000000, 0xe9662e4300000000, 0xa87da22d00000000,
+    0x2e3b77ff00000000, 0x6f20fb9100000000, 0xac0d6f2200000000,
+    0xed16e34c00000000, 0xe186b45c00000000, 0xa09d383200000000,
+    0x63b0ac8100000000, 0x22ab20ef00000000, 0xa4edf53d00000000,
+    0xe5f6795300000000, 0x26dbede000000000, 0x67c0618e00000000,
+    0x3efb42c000000000, 0x7fe0ceae00000000, 0xbccd5a1d00000000,
+    0xfdd6d67300000000, 0x7b9003a100000000, 0x3a8b8fcf00000000,
+    0xf9a61b7c00000000, 0xb8bd971200000000, 0xb42dc00200000000,
+    0xf5364c6c00000000, 0x361bd8df00000000, 0x770054b100000000,
+    0xf146816300000000, 0xb05d0d0d00000000, 0x737099be00000000,
+    0x326b15d000000000, 0xc106df2200000000, 0x801d534c00000000,
+    0x4330c7ff00000000, 0x022b4b9100000000, 0x846d9e4300000000,
+    0xc576122d00000000, 0x065b869e00000000, 0x47400af000000000,
+    0x4bd05de000000000, 0x0acbd18e00000000, 0xc9e6453d00000000,
+    0x88fdc95300000000, 0x0ebb1c8100000000, 0x4fa090ef00000000,
+    0x8c8d045c00000000, 0xcd96883200000000, 0x94adab7c00000000,
+    0xd5b6271200000000, 0x169bb3a100000000, 0x57803fcf00000000,
+    0xd1c6ea1d00000000, 0x90dd667300000000, 0x53f0f2c000000000,
+    0x12eb7eae00000000, 0x1e7b29be00000000, 0x5f60a5d000000000,
+    0x9c4d316300000000, 0xdd56bd0d00000000, 0x5b1068df00000000,
+    0x1a0be4b100000000, 0xd926700200000000, 0x983dfc6c00000000,
+    0x7efb953c00000000, 0x3fe0195200000000, 0xfccd8de100000000,
+    0xbdd6018f00000000, 0x3b90d45d00000000, 0x7a8b583300000000,
+    0xb9a6cc8000000000, 0xf8bd40ee00000000, 0xf42d17fe00000000,
+    0xb5369b9000000000, 0x761b0f2300000000, 0x3700834d00000000,
+    0xb146569f00000000, 0xf05ddaf100000000, 0x33704e4200000000,
+    0x726bc22c00000000, 0x2b50e16200000000, 0x6a4b6d0c00000000,
+    0xa966f9bf00000000, 0xe87d75d100000000, 0x6e3ba00300000000,
+    0x2f202c6d00000000, 0xec0db8de00000000, 0xad1634b000000000,
+    0xa18663a000000000, 0xe09defce00000000, 0x23b07b7d00000000,
+    0x62abf71300000000, 0xe4ed22c100000000, 0xa5f6aeaf00000000,
+    0x66db3a1c00000000, 0x27c0b67200000000, 0xd4ad7c8000000000,
+    0x95b6f0ee00000000, 0x569b645d00000000, 0x1780e83300000000,
+    0x91c63de100000000, 0xd0ddb18f00000000, 0x13f0253c00000000,
+    0x52eba95200000000, 0x5e7bfe4200000000, 0x1f60722c00000000,
+    0xdc4de69f00000000, 0x9d566af100000000, 0x1b10bf2300000000,
+    0x5a0b334d00000000, 0x9926a7fe00000000, 0xd83d2b9000000000,
+    0x810608de00000000, 0xc01d84b000000000, 0x0330100300000000,
+    0x422b9c6d00000000, 0xc46d49bf00000000, 0x8576c5d100000000,
+    0x465b516200000000, 0x0740dd0c00000000, 0x0bd08a1c00000000,
+    0x4acb067200000000, 0x89e692c100000000, 0xc8fd1eaf00000000,
+    0x4ebbcb7d00000000, 0x0fa0471300000000, 0xcc8dd3a000000000,
+    0x8d965fce00000000},
+   {0x0000000000000000, 0x1dfdb50100000000, 0x3afa6b0300000000,
+    0x2707de0200000000, 0x74f4d70600000000, 0x6909620700000000,
+    0x4e0ebc0500000000, 0x53f3090400000000, 0xe8e8af0d00000000,
+    0xf5151a0c00000000, 0xd212c40e00000000, 0xcfef710f00000000,
+    0x9c1c780b00000000, 0x81e1cd0a00000000, 0xa6e6130800000000,
+    0xbb1ba60900000000, 0xd0d15f1b00000000, 0xcd2cea1a00000000,
+    0xea2b341800000000, 0xf7d6811900000000, 0xa425881d00000000,
+    0xb9d83d1c00000000, 0x9edfe31e00000000, 0x8322561f00000000,
+    0x3839f01600000000, 0x25c4451700000000, 0x02c39b1500000000,
+    0x1f3e2e1400000000, 0x4ccd271000000000, 0x5130921100000000,
+    0x76374c1300000000, 0x6bcaf91200000000, 0xa0a3bf3600000000,
+    0xbd5e0a3700000000, 0x9a59d43500000000, 0x87a4613400000000,
+    0xd457683000000000, 0xc9aadd3100000000, 0xeead033300000000,
+    0xf350b63200000000, 0x484b103b00000000, 0x55b6a53a00000000,
+    0x72b17b3800000000, 0x6f4cce3900000000, 0x3cbfc73d00000000,
+    0x2142723c00000000, 0x0645ac3e00000000, 0x1bb8193f00000000,
+    0x7072e02d00000000, 0x6d8f552c00000000, 0x4a888b2e00000000,
+    0x57753e2f00000000, 0x0486372b00000000, 0x197b822a00000000,
+    0x3e7c5c2800000000, 0x2381e92900000000, 0x989a4f2000000000,
+    0x8567fa2100000000, 0xa260242300000000, 0xbf9d912200000000,
+    0xec6e982600000000, 0xf1932d2700000000, 0xd694f32500000000,
+    0xcb69462400000000, 0x40477f6d00000000, 0x5dbaca6c00000000,
+    0x7abd146e00000000, 0x6740a16f00000000, 0x34b3a86b00000000,
+    0x294e1d6a00000000, 0x0e49c36800000000, 0x13b4766900000000,
+    0xa8afd06000000000, 0xb552656100000000, 0x9255bb6300000000,
+    0x8fa80e6200000000, 0xdc5b076600000000, 0xc1a6b26700000000,
+    0xe6a16c6500000000, 0xfb5cd96400000000, 0x9096207600000000,
+    0x8d6b957700000000, 0xaa6c4b7500000000, 0xb791fe7400000000,
+    0xe462f77000000000, 0xf99f427100000000, 0xde989c7300000000,
+    0xc365297200000000, 0x787e8f7b00000000, 0x65833a7a00000000,
+    0x4284e47800000000, 0x5f79517900000000, 0x0c8a587d00000000,
+    0x1177ed7c00000000, 0x3670337e00000000, 0x2b8d867f00000000,
+    0xe0e4c05b00000000, 0xfd19755a00000000, 0xda1eab5800000000,
+    0xc7e31e5900000000, 0x9410175d00000000, 0x89eda25c00000000,
+    0xaeea7c5e00000000, 0xb317c95f00000000, 0x080c6f5600000000,
+    0x15f1da5700000000, 0x32f6045500000000, 0x2f0bb15400000000,
+    0x7cf8b85000000000, 0x61050d5100000000, 0x4602d35300000000,
+    0x5bff665200000000, 0x30359f4000000000, 0x2dc82a4100000000,
+    0x0acff44300000000, 0x1732414200000000, 0x44c1484600000000,
+    0x593cfd4700000000, 0x7e3b234500000000, 0x63c6964400000000,
+    0xd8dd304d00000000, 0xc520854c00000000, 0xe2275b4e00000000,
+    0xffdaee4f00000000, 0xac29e74b00000000, 0xb1d4524a00000000,
+    0x96d38c4800000000, 0x8b2e394900000000, 0x808efeda00000000,
+    0x9d734bdb00000000, 0xba7495d900000000, 0xa78920d800000000,
+    0xf47a29dc00000000, 0xe9879cdd00000000, 0xce8042df00000000,
+    0xd37df7de00000000, 0x686651d700000000, 0x759be4d600000000,
+    0x529c3ad400000000, 0x4f618fd500000000, 0x1c9286d100000000,
+    0x016f33d000000000, 0x2668edd200000000, 0x3b9558d300000000,
+    0x505fa1c100000000, 0x4da214c000000000, 0x6aa5cac200000000,
+    0x77587fc300000000, 0x24ab76c700000000, 0x3956c3c600000000,
+    0x1e511dc400000000, 0x03aca8c500000000, 0xb8b70ecc00000000,
+    0xa54abbcd00000000, 0x824d65cf00000000, 0x9fb0d0ce00000000,
+    0xcc43d9ca00000000, 0xd1be6ccb00000000, 0xf6b9b2c900000000,
+    0xeb4407c800000000, 0x202d41ec00000000, 0x3dd0f4ed00000000,
+    0x1ad72aef00000000, 0x072a9fee00000000, 0x54d996ea00000000,
+    0x492423eb00000000, 0x6e23fde900000000, 0x73de48e800000000,
+    0xc8c5eee100000000, 0xd5385be000000000, 0xf23f85e200000000,
+    0xefc230e300000000, 0xbc3139e700000000, 0xa1cc8ce600000000,
+    0x86cb52e400000000, 0x9b36e7e500000000, 0xf0fc1ef700000000,
+    0xed01abf600000000, 0xca0675f400000000, 0xd7fbc0f500000000,
+    0x8408c9f100000000, 0x99f57cf000000000, 0xbef2a2f200000000,
+    0xa30f17f300000000, 0x1814b1fa00000000, 0x05e904fb00000000,
+    0x22eedaf900000000, 0x3f136ff800000000, 0x6ce066fc00000000,
+    0x711dd3fd00000000, 0x561a0dff00000000, 0x4be7b8fe00000000,
+    0xc0c981b700000000, 0xdd3434b600000000, 0xfa33eab400000000,
+    0xe7ce5fb500000000, 0xb43d56b100000000, 0xa9c0e3b000000000,
+    0x8ec73db200000000, 0x933a88b300000000, 0x28212eba00000000,
+    0x35dc9bbb00000000, 0x12db45b900000000, 0x0f26f0b800000000,
+    0x5cd5f9bc00000000, 0x41284cbd00000000, 0x662f92bf00000000,
+    0x7bd227be00000000, 0x1018deac00000000, 0x0de56bad00000000,
+    0x2ae2b5af00000000, 0x371f00ae00000000, 0x64ec09aa00000000,
+    0x7911bcab00000000, 0x5e1662a900000000, 0x43ebd7a800000000,
+    0xf8f071a100000000, 0xe50dc4a000000000, 0xc20a1aa200000000,
+    0xdff7afa300000000, 0x8c04a6a700000000, 0x91f913a600000000,
+    0xb6fecda400000000, 0xab0378a500000000, 0x606a3e8100000000,
+    0x7d978b8000000000, 0x5a90558200000000, 0x476de08300000000,
+    0x149ee98700000000, 0x09635c8600000000, 0x2e64828400000000,
+    0x3399378500000000, 0x8882918c00000000, 0x957f248d00000000,
+    0xb278fa8f00000000, 0xaf854f8e00000000, 0xfc76468a00000000,
+    0xe18bf38b00000000, 0xc68c2d8900000000, 0xdb71988800000000,
+    0xb0bb619a00000000, 0xad46d49b00000000, 0x8a410a9900000000,
+    0x97bcbf9800000000, 0xc44fb69c00000000, 0xd9b2039d00000000,
+    0xfeb5dd9f00000000, 0xe348689e00000000, 0x5853ce9700000000,
+    0x45ae7b9600000000, 0x62a9a59400000000, 0x7f54109500000000,
+    0x2ca7199100000000, 0x315aac9000000000, 0x165d729200000000,
+    0x0ba0c79300000000},
+   {0x0000000000000000, 0x24d9076300000000, 0x48b20fc600000000,
+    0x6c6b08a500000000, 0xd1626e5700000000, 0xf5bb693400000000,
+    0x99d0619100000000, 0xbd0966f200000000, 0xa2c5dcae00000000,
+    0x861cdbcd00000000, 0xea77d36800000000, 0xceaed40b00000000,
+    0x73a7b2f900000000, 0x577eb59a00000000, 0x3b15bd3f00000000,
+    0x1fccba5c00000000, 0x058dc88600000000, 0x2154cfe500000000,
+    0x4d3fc74000000000, 0x69e6c02300000000, 0xd4efa6d100000000,
+    0xf036a1b200000000, 0x9c5da91700000000, 0xb884ae7400000000,
+    0xa748142800000000, 0x8391134b00000000, 0xeffa1bee00000000,
+    0xcb231c8d00000000, 0x762a7a7f00000000, 0x52f37d1c00000000,
+    0x3e9875b900000000, 0x1a4172da00000000, 0x4b1ce0d600000000,
+    0x6fc5e7b500000000, 0x03aeef1000000000, 0x2777e87300000000,
+    0x9a7e8e8100000000, 0xbea789e200000000, 0xd2cc814700000000,
+    0xf615862400000000, 0xe9d93c7800000000, 0xcd003b1b00000000,
+    0xa16b33be00000000, 0x85b234dd00000000, 0x38bb522f00000000,
+    0x1c62554c00000000, 0x70095de900000000, 0x54d05a8a00000000,
+    0x4e91285000000000, 0x6a482f3300000000, 0x0623279600000000,
+    0x22fa20f500000000, 0x9ff3460700000000, 0xbb2a416400000000,
+    0xd74149c100000000, 0xf3984ea200000000, 0xec54f4fe00000000,
+    0xc88df39d00000000, 0xa4e6fb3800000000, 0x803ffc5b00000000,
+    0x3d369aa900000000, 0x19ef9dca00000000, 0x7584956f00000000,
+    0x515d920c00000000, 0xd73eb17600000000, 0xf3e7b61500000000,
+    0x9f8cbeb000000000, 0xbb55b9d300000000, 0x065cdf2100000000,
+    0x2285d84200000000, 0x4eeed0e700000000, 0x6a37d78400000000,
+    0x75fb6dd800000000, 0x51226abb00000000, 0x3d49621e00000000,
+    0x1990657d00000000, 0xa499038f00000000, 0x804004ec00000000,
+    0xec2b0c4900000000, 0xc8f20b2a00000000, 0xd2b379f000000000,
+    0xf66a7e9300000000, 0x9a01763600000000, 0xbed8715500000000,
+    0x03d117a700000000, 0x270810c400000000, 0x4b63186100000000,
+    0x6fba1f0200000000, 0x7076a55e00000000, 0x54afa23d00000000,
+    0x38c4aa9800000000, 0x1c1dadfb00000000, 0xa114cb0900000000,
+    0x85cdcc6a00000000, 0xe9a6c4cf00000000, 0xcd7fc3ac00000000,
+    0x9c2251a000000000, 0xb8fb56c300000000, 0xd4905e6600000000,
+    0xf049590500000000, 0x4d403ff700000000, 0x6999389400000000,
+    0x05f2303100000000, 0x212b375200000000, 0x3ee78d0e00000000,
+    0x1a3e8a6d00000000, 0x765582c800000000, 0x528c85ab00000000,
+    0xef85e35900000000, 0xcb5ce43a00000000, 0xa737ec9f00000000,
+    0x83eeebfc00000000, 0x99af992600000000, 0xbd769e4500000000,
+    0xd11d96e000000000, 0xf5c4918300000000, 0x48cdf77100000000,
+    0x6c14f01200000000, 0x007ff8b700000000, 0x24a6ffd400000000,
+    0x3b6a458800000000, 0x1fb342eb00000000, 0x73d84a4e00000000,
+    0x57014d2d00000000, 0xea082bdf00000000, 0xced12cbc00000000,
+    0xa2ba241900000000, 0x8663237a00000000, 0xae7d62ed00000000,
+    0x8aa4658e00000000, 0xe6cf6d2b00000000, 0xc2166a4800000000,
+    0x7f1f0cba00000000, 0x5bc60bd900000000, 0x37ad037c00000000,
+    0x1374041f00000000, 0x0cb8be4300000000, 0x2861b92000000000,
+    0x440ab18500000000, 0x60d3b6e600000000, 0xdddad01400000000,
+    0xf903d77700000000, 0x9568dfd200000000, 0xb1b1d8b100000000,
+    0xabf0aa6b00000000, 0x8f29ad0800000000, 0xe342a5ad00000000,
+    0xc79ba2ce00000000, 0x7a92c43c00000000, 0x5e4bc35f00000000,
+    0x3220cbfa00000000, 0x16f9cc9900000000, 0x093576c500000000,
+    0x2dec71a600000000, 0x4187790300000000, 0x655e7e6000000000,
+    0xd857189200000000, 0xfc8e1ff100000000, 0x90e5175400000000,
+    0xb43c103700000000, 0xe561823b00000000, 0xc1b8855800000000,
+    0xadd38dfd00000000, 0x890a8a9e00000000, 0x3403ec6c00000000,
+    0x10daeb0f00000000, 0x7cb1e3aa00000000, 0x5868e4c900000000,
+    0x47a45e9500000000, 0x637d59f600000000, 0x0f16515300000000,
+    0x2bcf563000000000, 0x96c630c200000000, 0xb21f37a100000000,
+    0xde743f0400000000, 0xfaad386700000000, 0xe0ec4abd00000000,
+    0xc4354dde00000000, 0xa85e457b00000000, 0x8c87421800000000,
+    0x318e24ea00000000, 0x1557238900000000, 0x793c2b2c00000000,
+    0x5de52c4f00000000, 0x4229961300000000, 0x66f0917000000000,
+    0x0a9b99d500000000, 0x2e429eb600000000, 0x934bf84400000000,
+    0xb792ff2700000000, 0xdbf9f78200000000, 0xff20f0e100000000,
+    0x7943d39b00000000, 0x5d9ad4f800000000, 0x31f1dc5d00000000,
+    0x1528db3e00000000, 0xa821bdcc00000000, 0x8cf8baaf00000000,
+    0xe093b20a00000000, 0xc44ab56900000000, 0xdb860f3500000000,
+    0xff5f085600000000, 0x933400f300000000, 0xb7ed079000000000,
+    0x0ae4616200000000, 0x2e3d660100000000, 0x42566ea400000000,
+    0x668f69c700000000, 0x7cce1b1d00000000, 0x58171c7e00000000,
+    0x347c14db00000000, 0x10a513b800000000, 0xadac754a00000000,
+    0x8975722900000000, 0xe51e7a8c00000000, 0xc1c77def00000000,
+    0xde0bc7b300000000, 0xfad2c0d000000000, 0x96b9c87500000000,
+    0xb260cf1600000000, 0x0f69a9e400000000, 0x2bb0ae8700000000,
+    0x47dba62200000000, 0x6302a14100000000, 0x325f334d00000000,
+    0x1686342e00000000, 0x7aed3c8b00000000, 0x5e343be800000000,
+    0xe33d5d1a00000000, 0xc7e45a7900000000, 0xab8f52dc00000000,
+    0x8f5655bf00000000, 0x909aefe300000000, 0xb443e88000000000,
+    0xd828e02500000000, 0xfcf1e74600000000, 0x41f881b400000000,
+    0x652186d700000000, 0x094a8e7200000000, 0x2d93891100000000,
+    0x37d2fbcb00000000, 0x130bfca800000000, 0x7f60f40d00000000,
+    0x5bb9f36e00000000, 0xe6b0959c00000000, 0xc26992ff00000000,
+    0xae029a5a00000000, 0x8adb9d3900000000, 0x9517276500000000,
+    0xb1ce200600000000, 0xdda528a300000000, 0xf97c2fc000000000,
+    0x4475493200000000, 0x60ac4e5100000000, 0x0cc746f400000000,
+    0x281e419700000000},
+   {0x0000000000000000, 0x08e3603c00000000, 0x10c6c17800000000,
+    0x1825a14400000000, 0x208c83f100000000, 0x286fe3cd00000000,
+    0x304a428900000000, 0x38a922b500000000, 0x011e763800000000,
+    0x09fd160400000000, 0x11d8b74000000000, 0x193bd77c00000000,
+    0x2192f5c900000000, 0x297195f500000000, 0x315434b100000000,
+    0x39b7548d00000000, 0x023cec7000000000, 0x0adf8c4c00000000,
+    0x12fa2d0800000000, 0x1a194d3400000000, 0x22b06f8100000000,
+    0x2a530fbd00000000, 0x3276aef900000000, 0x3a95cec500000000,
+    0x03229a4800000000, 0x0bc1fa7400000000, 0x13e45b3000000000,
+    0x1b073b0c00000000, 0x23ae19b900000000, 0x2b4d798500000000,
+    0x3368d8c100000000, 0x3b8bb8fd00000000, 0x0478d8e100000000,
+    0x0c9bb8dd00000000, 0x14be199900000000, 0x1c5d79a500000000,
+    0x24f45b1000000000, 0x2c173b2c00000000, 0x34329a6800000000,
+    0x3cd1fa5400000000, 0x0566aed900000000, 0x0d85cee500000000,
+    0x15a06fa100000000, 0x1d430f9d00000000, 0x25ea2d2800000000,
+    0x2d094d1400000000, 0x352cec5000000000, 0x3dcf8c6c00000000,
+    0x0644349100000000, 0x0ea754ad00000000, 0x1682f5e900000000,
+    0x1e6195d500000000, 0x26c8b76000000000, 0x2e2bd75c00000000,
+    0x360e761800000000, 0x3eed162400000000, 0x075a42a900000000,
+    0x0fb9229500000000, 0x179c83d100000000, 0x1f7fe3ed00000000,
+    0x27d6c15800000000, 0x2f35a16400000000, 0x3710002000000000,
+    0x3ff3601c00000000, 0x49f6c11800000000, 0x4115a12400000000,
+    0x5930006000000000, 0x51d3605c00000000, 0x697a42e900000000,
+    0x619922d500000000, 0x79bc839100000000, 0x715fe3ad00000000,
+    0x48e8b72000000000, 0x400bd71c00000000, 0x582e765800000000,
+    0x50cd166400000000, 0x686434d100000000, 0x608754ed00000000,
+    0x78a2f5a900000000, 0x7041959500000000, 0x4bca2d6800000000,
+    0x43294d5400000000, 0x5b0cec1000000000, 0x53ef8c2c00000000,
+    0x6b46ae9900000000, 0x63a5cea500000000, 0x7b806fe100000000,
+    0x73630fdd00000000, 0x4ad45b5000000000, 0x42373b6c00000000,
+    0x5a129a2800000000, 0x52f1fa1400000000, 0x6a58d8a100000000,
+    0x62bbb89d00000000, 0x7a9e19d900000000, 0x727d79e500000000,
+    0x4d8e19f900000000, 0x456d79c500000000, 0x5d48d88100000000,
+    0x55abb8bd00000000, 0x6d029a0800000000, 0x65e1fa3400000000,
+    0x7dc45b7000000000, 0x75273b4c00000000, 0x4c906fc100000000,
+    0x44730ffd00000000, 0x5c56aeb900000000, 0x54b5ce8500000000,
+    0x6c1cec3000000000, 0x64ff8c0c00000000, 0x7cda2d4800000000,
+    0x74394d7400000000, 0x4fb2f58900000000, 0x475195b500000000,
+    0x5f7434f100000000, 0x579754cd00000000, 0x6f3e767800000000,
+    0x67dd164400000000, 0x7ff8b70000000000, 0x771bd73c00000000,
+    0x4eac83b100000000, 0x464fe38d00000000, 0x5e6a42c900000000,
+    0x568922f500000000, 0x6e20004000000000, 0x66c3607c00000000,
+    0x7ee6c13800000000, 0x7605a10400000000, 0x92ec833100000000,
+    0x9a0fe30d00000000, 0x822a424900000000, 0x8ac9227500000000,
+    0xb26000c000000000, 0xba8360fc00000000, 0xa2a6c1b800000000,
+    0xaa45a18400000000, 0x93f2f50900000000, 0x9b11953500000000,
+    0x8334347100000000, 0x8bd7544d00000000, 0xb37e76f800000000,
+    0xbb9d16c400000000, 0xa3b8b78000000000, 0xab5bd7bc00000000,
+    0x90d06f4100000000, 0x98330f7d00000000, 0x8016ae3900000000,
+    0x88f5ce0500000000, 0xb05cecb000000000, 0xb8bf8c8c00000000,
+    0xa09a2dc800000000, 0xa8794df400000000, 0x91ce197900000000,
+    0x992d794500000000, 0x8108d80100000000, 0x89ebb83d00000000,
+    0xb1429a8800000000, 0xb9a1fab400000000, 0xa1845bf000000000,
+    0xa9673bcc00000000, 0x96945bd000000000, 0x9e773bec00000000,
+    0x86529aa800000000, 0x8eb1fa9400000000, 0xb618d82100000000,
+    0xbefbb81d00000000, 0xa6de195900000000, 0xae3d796500000000,
+    0x978a2de800000000, 0x9f694dd400000000, 0x874cec9000000000,
+    0x8faf8cac00000000, 0xb706ae1900000000, 0xbfe5ce2500000000,
+    0xa7c06f6100000000, 0xaf230f5d00000000, 0x94a8b7a000000000,
+    0x9c4bd79c00000000, 0x846e76d800000000, 0x8c8d16e400000000,
+    0xb424345100000000, 0xbcc7546d00000000, 0xa4e2f52900000000,
+    0xac01951500000000, 0x95b6c19800000000, 0x9d55a1a400000000,
+    0x857000e000000000, 0x8d9360dc00000000, 0xb53a426900000000,
+    0xbdd9225500000000, 0xa5fc831100000000, 0xad1fe32d00000000,
+    0xdb1a422900000000, 0xd3f9221500000000, 0xcbdc835100000000,
+    0xc33fe36d00000000, 0xfb96c1d800000000, 0xf375a1e400000000,
+    0xeb5000a000000000, 0xe3b3609c00000000, 0xda04341100000000,
+    0xd2e7542d00000000, 0xcac2f56900000000, 0xc221955500000000,
+    0xfa88b7e000000000, 0xf26bd7dc00000000, 0xea4e769800000000,
+    0xe2ad16a400000000, 0xd926ae5900000000, 0xd1c5ce6500000000,
+    0xc9e06f2100000000, 0xc1030f1d00000000, 0xf9aa2da800000000,
+    0xf1494d9400000000, 0xe96cecd000000000, 0xe18f8cec00000000,
+    0xd838d86100000000, 0xd0dbb85d00000000, 0xc8fe191900000000,
+    0xc01d792500000000, 0xf8b45b9000000000, 0xf0573bac00000000,
+    0xe8729ae800000000, 0xe091fad400000000, 0xdf629ac800000000,
+    0xd781faf400000000, 0xcfa45bb000000000, 0xc7473b8c00000000,
+    0xffee193900000000, 0xf70d790500000000, 0xef28d84100000000,
+    0xe7cbb87d00000000, 0xde7cecf000000000, 0xd69f8ccc00000000,
+    0xceba2d8800000000, 0xc6594db400000000, 0xfef06f0100000000,
+    0xf6130f3d00000000, 0xee36ae7900000000, 0xe6d5ce4500000000,
+    0xdd5e76b800000000, 0xd5bd168400000000, 0xcd98b7c000000000,
+    0xc57bd7fc00000000, 0xfdd2f54900000000, 0xf531957500000000,
+    0xed14343100000000, 0xe5f7540d00000000, 0xdc40008000000000,
+    0xd4a360bc00000000, 0xcc86c1f800000000, 0xc465a1c400000000,
+    0xfccc837100000000, 0xf42fe34d00000000, 0xec0a420900000000,
+    0xe4e9223500000000},
+   {0x0000000000000000, 0xd1e8e70e00000000, 0xa2d1cf1d00000000,
+    0x7339281300000000, 0x44a39f3b00000000, 0x954b783500000000,
+    0xe672502600000000, 0x379ab72800000000, 0x88463f7700000000,
+    0x59aed87900000000, 0x2a97f06a00000000, 0xfb7f176400000000,
+    0xcce5a04c00000000, 0x1d0d474200000000, 0x6e346f5100000000,
+    0xbfdc885f00000000, 0x108d7eee00000000, 0xc16599e000000000,
+    0xb25cb1f300000000, 0x63b456fd00000000, 0x542ee1d500000000,
+    0x85c606db00000000, 0xf6ff2ec800000000, 0x2717c9c600000000,
+    0x98cb419900000000, 0x4923a69700000000, 0x3a1a8e8400000000,
+    0xebf2698a00000000, 0xdc68dea200000000, 0x0d8039ac00000000,
+    0x7eb911bf00000000, 0xaf51f6b100000000, 0x611c8c0700000000,
+    0xb0f46b0900000000, 0xc3cd431a00000000, 0x1225a41400000000,
+    0x25bf133c00000000, 0xf457f43200000000, 0x876edc2100000000,
+    0x56863b2f00000000, 0xe95ab37000000000, 0x38b2547e00000000,
+    0x4b8b7c6d00000000, 0x9a639b6300000000, 0xadf92c4b00000000,
+    0x7c11cb4500000000, 0x0f28e35600000000, 0xdec0045800000000,
+    0x7191f2e900000000, 0xa07915e700000000, 0xd3403df400000000,
+    0x02a8dafa00000000, 0x35326dd200000000, 0xe4da8adc00000000,
+    0x97e3a2cf00000000, 0x460b45c100000000, 0xf9d7cd9e00000000,
+    0x283f2a9000000000, 0x5b06028300000000, 0x8aeee58d00000000,
+    0xbd7452a500000000, 0x6c9cb5ab00000000, 0x1fa59db800000000,
+    0xce4d7ab600000000, 0xc238180f00000000, 0x13d0ff0100000000,
+    0x60e9d71200000000, 0xb101301c00000000, 0x869b873400000000,
+    0x5773603a00000000, 0x244a482900000000, 0xf5a2af2700000000,
+    0x4a7e277800000000, 0x9b96c07600000000, 0xe8afe86500000000,
+    0x39470f6b00000000, 0x0eddb84300000000, 0xdf355f4d00000000,
+    0xac0c775e00000000, 0x7de4905000000000, 0xd2b566e100000000,
+    0x035d81ef00000000, 0x7064a9fc00000000, 0xa18c4ef200000000,
+    0x9616f9da00000000, 0x47fe1ed400000000, 0x34c736c700000000,
+    0xe52fd1c900000000, 0x5af3599600000000, 0x8b1bbe9800000000,
+    0xf822968b00000000, 0x29ca718500000000, 0x1e50c6ad00000000,
+    0xcfb821a300000000, 0xbc8109b000000000, 0x6d69eebe00000000,
+    0xa324940800000000, 0x72cc730600000000, 0x01f55b1500000000,
+    0xd01dbc1b00000000, 0xe7870b3300000000, 0x366fec3d00000000,
+    0x4556c42e00000000, 0x94be232000000000, 0x2b62ab7f00000000,
+    0xfa8a4c7100000000, 0x89b3646200000000, 0x585b836c00000000,
+    0x6fc1344400000000, 0xbe29d34a00000000, 0xcd10fb5900000000,
+    0x1cf81c5700000000, 0xb3a9eae600000000, 0x62410de800000000,
+    0x117825fb00000000, 0xc090c2f500000000, 0xf70a75dd00000000,
+    0x26e292d300000000, 0x55dbbac000000000, 0x84335dce00000000,
+    0x3befd59100000000, 0xea07329f00000000, 0x993e1a8c00000000,
+    0x48d6fd8200000000, 0x7f4c4aaa00000000, 0xaea4ada400000000,
+    0xdd9d85b700000000, 0x0c7562b900000000, 0x8471301e00000000,
+    0x5599d71000000000, 0x26a0ff0300000000, 0xf748180d00000000,
+    0xc0d2af2500000000, 0x113a482b00000000, 0x6203603800000000,
+    0xb3eb873600000000, 0x0c370f6900000000, 0xdddfe86700000000,
+    0xaee6c07400000000, 0x7f0e277a00000000, 0x4894905200000000,
+    0x997c775c00000000, 0xea455f4f00000000, 0x3badb84100000000,
+    0x94fc4ef000000000, 0x4514a9fe00000000, 0x362d81ed00000000,
+    0xe7c566e300000000, 0xd05fd1cb00000000, 0x01b736c500000000,
+    0x728e1ed600000000, 0xa366f9d800000000, 0x1cba718700000000,
+    0xcd52968900000000, 0xbe6bbe9a00000000, 0x6f83599400000000,
+    0x5819eebc00000000, 0x89f109b200000000, 0xfac821a100000000,
+    0x2b20c6af00000000, 0xe56dbc1900000000, 0x34855b1700000000,
+    0x47bc730400000000, 0x9654940a00000000, 0xa1ce232200000000,
+    0x7026c42c00000000, 0x031fec3f00000000, 0xd2f70b3100000000,
+    0x6d2b836e00000000, 0xbcc3646000000000, 0xcffa4c7300000000,
+    0x1e12ab7d00000000, 0x29881c5500000000, 0xf860fb5b00000000,
+    0x8b59d34800000000, 0x5ab1344600000000, 0xf5e0c2f700000000,
+    0x240825f900000000, 0x57310dea00000000, 0x86d9eae400000000,
+    0xb1435dcc00000000, 0x60abbac200000000, 0x139292d100000000,
+    0xc27a75df00000000, 0x7da6fd8000000000, 0xac4e1a8e00000000,
+    0xdf77329d00000000, 0x0e9fd59300000000, 0x390562bb00000000,
+    0xe8ed85b500000000, 0x9bd4ada600000000, 0x4a3c4aa800000000,
+    0x4649281100000000, 0x97a1cf1f00000000, 0xe498e70c00000000,
+    0x3570000200000000, 0x02eab72a00000000, 0xd302502400000000,
+    0xa03b783700000000, 0x71d39f3900000000, 0xce0f176600000000,
+    0x1fe7f06800000000, 0x6cded87b00000000, 0xbd363f7500000000,
+    0x8aac885d00000000, 0x5b446f5300000000, 0x287d474000000000,
+    0xf995a04e00000000, 0x56c456ff00000000, 0x872cb1f100000000,
+    0xf41599e200000000, 0x25fd7eec00000000, 0x1267c9c400000000,
+    0xc38f2eca00000000, 0xb0b606d900000000, 0x615ee1d700000000,
+    0xde82698800000000, 0x0f6a8e8600000000, 0x7c53a69500000000,
+    0xadbb419b00000000, 0x9a21f6b300000000, 0x4bc911bd00000000,
+    0x38f039ae00000000, 0xe918dea000000000, 0x2755a41600000000,
+    0xf6bd431800000000, 0x85846b0b00000000, 0x546c8c0500000000,
+    0x63f63b2d00000000, 0xb21edc2300000000, 0xc127f43000000000,
+    0x10cf133e00000000, 0xaf139b6100000000, 0x7efb7c6f00000000,
+    0x0dc2547c00000000, 0xdc2ab37200000000, 0xebb0045a00000000,
+    0x3a58e35400000000, 0x4961cb4700000000, 0x98892c4900000000,
+    0x37d8daf800000000, 0xe6303df600000000, 0x950915e500000000,
+    0x44e1f2eb00000000, 0x737b45c300000000, 0xa293a2cd00000000,
+    0xd1aa8ade00000000, 0x00426dd000000000, 0xbf9ee58f00000000,
+    0x6e76028100000000, 0x1d4f2a9200000000, 0xcca7cd9c00000000,
+    0xfb3d7ab400000000, 0x2ad59dba00000000, 0x59ecb5a900000000,
+    0x880452a700000000},
+   {0x0000000000000000, 0xaa05daf100000000, 0x150dc53800000000,
+    0xbf081fc900000000, 0x2a1a8a7100000000, 0x801f508000000000,
+    0x3f174f4900000000, 0x951295b800000000, 0x543414e300000000,
+    0xfe31ce1200000000, 0x4139d1db00000000, 0xeb3c0b2a00000000,
+    0x7e2e9e9200000000, 0xd42b446300000000, 0x6b235baa00000000,
+    0xc126815b00000000, 0xe96e591d00000000, 0x436b83ec00000000,
+    0xfc639c2500000000, 0x566646d400000000, 0xc374d36c00000000,
+    0x6971099d00000000, 0xd679165400000000, 0x7c7ccca500000000,
+    0xbd5a4dfe00000000, 0x175f970f00000000, 0xa85788c600000000,
+    0x0252523700000000, 0x9740c78f00000000, 0x3d451d7e00000000,
+    0x824d02b700000000, 0x2848d84600000000, 0xd2ddb23a00000000,
+    0x78d868cb00000000, 0xc7d0770200000000, 0x6dd5adf300000000,
+    0xf8c7384b00000000, 0x52c2e2ba00000000, 0xedcafd7300000000,
+    0x47cf278200000000, 0x86e9a6d900000000, 0x2cec7c2800000000,
+    0x93e463e100000000, 0x39e1b91000000000, 0xacf32ca800000000,
+    0x06f6f65900000000, 0xb9fee99000000000, 0x13fb336100000000,
+    0x3bb3eb2700000000, 0x91b631d600000000, 0x2ebe2e1f00000000,
+    0x84bbf4ee00000000, 0x11a9615600000000, 0xbbacbba700000000,
+    0x04a4a46e00000000, 0xaea17e9f00000000, 0x6f87ffc400000000,
+    0xc582253500000000, 0x7a8a3afc00000000, 0xd08fe00d00000000,
+    0x459d75b500000000, 0xef98af4400000000, 0x5090b08d00000000,
+    0xfa956a7c00000000, 0xa4bb657500000000, 0x0ebebf8400000000,
+    0xb1b6a04d00000000, 0x1bb37abc00000000, 0x8ea1ef0400000000,
+    0x24a435f500000000, 0x9bac2a3c00000000, 0x31a9f0cd00000000,
+    0xf08f719600000000, 0x5a8aab6700000000, 0xe582b4ae00000000,
+    0x4f876e5f00000000, 0xda95fbe700000000, 0x7090211600000000,
+    0xcf983edf00000000, 0x659de42e00000000, 0x4dd53c6800000000,
+    0xe7d0e69900000000, 0x58d8f95000000000, 0xf2dd23a100000000,
+    0x67cfb61900000000, 0xcdca6ce800000000, 0x72c2732100000000,
+    0xd8c7a9d000000000, 0x19e1288b00000000, 0xb3e4f27a00000000,
+    0x0cecedb300000000, 0xa6e9374200000000, 0x33fba2fa00000000,
+    0x99fe780b00000000, 0x26f667c200000000, 0x8cf3bd3300000000,
+    0x7666d74f00000000, 0xdc630dbe00000000, 0x636b127700000000,
+    0xc96ec88600000000, 0x5c7c5d3e00000000, 0xf67987cf00000000,
+    0x4971980600000000, 0xe37442f700000000, 0x2252c3ac00000000,
+    0x8857195d00000000, 0x375f069400000000, 0x9d5adc6500000000,
+    0x084849dd00000000, 0xa24d932c00000000, 0x1d458ce500000000,
+    0xb740561400000000, 0x9f088e5200000000, 0x350d54a300000000,
+    0x8a054b6a00000000, 0x2000919b00000000, 0xb512042300000000,
+    0x1f17ded200000000, 0xa01fc11b00000000, 0x0a1a1bea00000000,
+    0xcb3c9ab100000000, 0x6139404000000000, 0xde315f8900000000,
+    0x7434857800000000, 0xe12610c000000000, 0x4b23ca3100000000,
+    0xf42bd5f800000000, 0x5e2e0f0900000000, 0x4877cbea00000000,
+    0xe272111b00000000, 0x5d7a0ed200000000, 0xf77fd42300000000,
+    0x626d419b00000000, 0xc8689b6a00000000, 0x776084a300000000,
+    0xdd655e5200000000, 0x1c43df0900000000, 0xb64605f800000000,
+    0x094e1a3100000000, 0xa34bc0c000000000, 0x3659557800000000,
+    0x9c5c8f8900000000, 0x2354904000000000, 0x89514ab100000000,
+    0xa11992f700000000, 0x0b1c480600000000, 0xb41457cf00000000,
+    0x1e118d3e00000000, 0x8b03188600000000, 0x2106c27700000000,
+    0x9e0eddbe00000000, 0x340b074f00000000, 0xf52d861400000000,
+    0x5f285ce500000000, 0xe020432c00000000, 0x4a2599dd00000000,
+    0xdf370c6500000000, 0x7532d69400000000, 0xca3ac95d00000000,
+    0x603f13ac00000000, 0x9aaa79d000000000, 0x30afa32100000000,
+    0x8fa7bce800000000, 0x25a2661900000000, 0xb0b0f3a100000000,
+    0x1ab5295000000000, 0xa5bd369900000000, 0x0fb8ec6800000000,
+    0xce9e6d3300000000, 0x649bb7c200000000, 0xdb93a80b00000000,
+    0x719672fa00000000, 0xe484e74200000000, 0x4e813db300000000,
+    0xf189227a00000000, 0x5b8cf88b00000000, 0x73c420cd00000000,
+    0xd9c1fa3c00000000, 0x66c9e5f500000000, 0xcccc3f0400000000,
+    0x59deaabc00000000, 0xf3db704d00000000, 0x4cd36f8400000000,
+    0xe6d6b57500000000, 0x27f0342e00000000, 0x8df5eedf00000000,
+    0x32fdf11600000000, 0x98f82be700000000, 0x0deabe5f00000000,
+    0xa7ef64ae00000000, 0x18e77b6700000000, 0xb2e2a19600000000,
+    0xecccae9f00000000, 0x46c9746e00000000, 0xf9c16ba700000000,
+    0x53c4b15600000000, 0xc6d624ee00000000, 0x6cd3fe1f00000000,
+    0xd3dbe1d600000000, 0x79de3b2700000000, 0xb8f8ba7c00000000,
+    0x12fd608d00000000, 0xadf57f4400000000, 0x07f0a5b500000000,
+    0x92e2300d00000000, 0x38e7eafc00000000, 0x87eff53500000000,
+    0x2dea2fc400000000, 0x05a2f78200000000, 0xafa72d7300000000,
+    0x10af32ba00000000, 0xbaaae84b00000000, 0x2fb87df300000000,
+    0x85bda70200000000, 0x3ab5b8cb00000000, 0x90b0623a00000000,
+    0x5196e36100000000, 0xfb93399000000000, 0x449b265900000000,
+    0xee9efca800000000, 0x7b8c691000000000, 0xd189b3e100000000,
+    0x6e81ac2800000000, 0xc48476d900000000, 0x3e111ca500000000,
+    0x9414c65400000000, 0x2b1cd99d00000000, 0x8119036c00000000,
+    0x140b96d400000000, 0xbe0e4c2500000000, 0x010653ec00000000,
+    0xab03891d00000000, 0x6a25084600000000, 0xc020d2b700000000,
+    0x7f28cd7e00000000, 0xd52d178f00000000, 0x403f823700000000,
+    0xea3a58c600000000, 0x5532470f00000000, 0xff379dfe00000000,
+    0xd77f45b800000000, 0x7d7a9f4900000000, 0xc272808000000000,
+    0x68775a7100000000, 0xfd65cfc900000000, 0x5760153800000000,
+    0xe8680af100000000, 0x426dd00000000000, 0x834b515b00000000,
+    0x294e8baa00000000, 0x9646946300000000, 0x3c434e9200000000,
+    0xa951db2a00000000, 0x035401db00000000, 0xbc5c1e1200000000,
+    0x1659c4e300000000}};
+
+#else /* W == 4 */
+
+static const uint32_t crc_braid_table[][256] = {
+   {0x00000000, 0xae689191, 0x87a02563, 0x29c8b4f2, 0xd4314c87,
+    0x7a59dd16, 0x539169e4, 0xfdf9f875, 0x73139f4f, 0xdd7b0ede,
+    0xf4b3ba2c, 0x5adb2bbd, 0xa722d3c8, 0x094a4259, 0x2082f6ab,
+    0x8eea673a, 0xe6273e9e, 0x484faf0f, 0x61871bfd, 0xcfef8a6c,
+    0x32167219, 0x9c7ee388, 0xb5b6577a, 0x1bdec6eb, 0x9534a1d1,
+    0x3b5c3040, 0x129484b2, 0xbcfc1523, 0x4105ed56, 0xef6d7cc7,
+    0xc6a5c835, 0x68cd59a4, 0x173f7b7d, 0xb957eaec, 0x909f5e1e,
+    0x3ef7cf8f, 0xc30e37fa, 0x6d66a66b, 0x44ae1299, 0xeac68308,
+    0x642ce432, 0xca4475a3, 0xe38cc151, 0x4de450c0, 0xb01da8b5,
+    0x1e753924, 0x37bd8dd6, 0x99d51c47, 0xf11845e3, 0x5f70d472,
+    0x76b86080, 0xd8d0f111, 0x25290964, 0x8b4198f5, 0xa2892c07,
+    0x0ce1bd96, 0x820bdaac, 0x2c634b3d, 0x05abffcf, 0xabc36e5e,
+    0x563a962b, 0xf85207ba, 0xd19ab348, 0x7ff222d9, 0x2e7ef6fa,
+    0x8016676b, 0xa9ded399, 0x07b64208, 0xfa4fba7d, 0x54272bec,
+    0x7def9f1e, 0xd3870e8f, 0x5d6d69b5, 0xf305f824, 0xdacd4cd6,
+    0x74a5dd47, 0x895c2532, 0x2734b4a3, 0x0efc0051, 0xa09491c0,
+    0xc859c864, 0x663159f5, 0x4ff9ed07, 0xe1917c96, 0x1c6884e3,
+    0xb2001572, 0x9bc8a180, 0x35a03011, 0xbb4a572b, 0x1522c6ba,
+    0x3cea7248, 0x9282e3d9, 0x6f7b1bac, 0xc1138a3d, 0xe8db3ecf,
+    0x46b3af5e, 0x39418d87, 0x97291c16, 0xbee1a8e4, 0x10893975,
+    0xed70c100, 0x43185091, 0x6ad0e463, 0xc4b875f2, 0x4a5212c8,
+    0xe43a8359, 0xcdf237ab, 0x639aa63a, 0x9e635e4f, 0x300bcfde,
+    0x19c37b2c, 0xb7abeabd, 0xdf66b319, 0x710e2288, 0x58c6967a,
+    0xf6ae07eb, 0x0b57ff9e, 0xa53f6e0f, 0x8cf7dafd, 0x229f4b6c,
+    0xac752c56, 0x021dbdc7, 0x2bd50935, 0x85bd98a4, 0x784460d1,
+    0xd62cf140, 0xffe445b2, 0x518cd423, 0x5cfdedf4, 0xf2957c65,
+    0xdb5dc897, 0x75355906, 0x88cca173, 0x26a430e2, 0x0f6c8410,
+    0xa1041581, 0x2fee72bb, 0x8186e32a, 0xa84e57d8, 0x0626c649,
+    0xfbdf3e3c, 0x55b7afad, 0x7c7f1b5f, 0xd2178ace, 0xbadad36a,
+    0x14b242fb, 0x3d7af609, 0x93126798, 0x6eeb9fed, 0xc0830e7c,
+    0xe94bba8e, 0x47232b1f, 0xc9c94c25, 0x67a1ddb4, 0x4e696946,
+    0xe001f8d7, 0x1df800a2, 0xb3909133, 0x9a5825c1, 0x3430b450,
+    0x4bc29689, 0xe5aa0718, 0xcc62b3ea, 0x620a227b, 0x9ff3da0e,
+    0x319b4b9f, 0x1853ff6d, 0xb63b6efc, 0x38d109c6, 0x96b99857,
+    0xbf712ca5, 0x1119bd34, 0xece04541, 0x4288d4d0, 0x6b406022,
+    0xc528f1b3, 0xade5a817, 0x038d3986, 0x2a458d74, 0x842d1ce5,
+    0x79d4e490, 0xd7bc7501, 0xfe74c1f3, 0x501c5062, 0xdef63758,
+    0x709ea6c9, 0x5956123b, 0xf73e83aa, 0x0ac77bdf, 0xa4afea4e,
+    0x8d675ebc, 0x230fcf2d, 0x72831b0e, 0xdceb8a9f, 0xf5233e6d,
+    0x5b4baffc, 0xa6b25789, 0x08dac618, 0x211272ea, 0x8f7ae37b,
+    0x01908441, 0xaff815d0, 0x8630a122, 0x285830b3, 0xd5a1c8c6,
+    0x7bc95957, 0x5201eda5, 0xfc697c34, 0x94a42590, 0x3accb401,
+    0x130400f3, 0xbd6c9162, 0x40956917, 0xeefdf886, 0xc7354c74,
+    0x695ddde5, 0xe7b7badf, 0x49df2b4e, 0x60179fbc, 0xce7f0e2d,
+    0x3386f658, 0x9dee67c9, 0xb426d33b, 0x1a4e42aa, 0x65bc6073,
+    0xcbd4f1e2, 0xe21c4510, 0x4c74d481, 0xb18d2cf4, 0x1fe5bd65,
+    0x362d0997, 0x98459806, 0x16afff3c, 0xb8c76ead, 0x910fda5f,
+    0x3f674bce, 0xc29eb3bb, 0x6cf6222a, 0x453e96d8, 0xeb560749,
+    0x839b5eed, 0x2df3cf7c, 0x043b7b8e, 0xaa53ea1f, 0x57aa126a,
+    0xf9c283fb, 0xd00a3709, 0x7e62a698, 0xf088c1a2, 0x5ee05033,
+    0x7728e4c1, 0xd9407550, 0x24b98d25, 0x8ad11cb4, 0xa319a846,
+    0x0d7139d7},
+   {0x00000000, 0xb9fbdbe8, 0xa886b191, 0x117d6a79, 0x8a7c6563,
+    0x3387be8b, 0x22fad4f2, 0x9b010f1a, 0xcf89cc87, 0x7672176f,
+    0x670f7d16, 0xdef4a6fe, 0x45f5a9e4, 0xfc0e720c, 0xed731875,
+    0x5488c39d, 0x44629f4f, 0xfd9944a7, 0xece42ede, 0x551ff536,
+    0xce1efa2c, 0x77e521c4, 0x66984bbd, 0xdf639055, 0x8beb53c8,
+    0x32108820, 0x236de259, 0x9a9639b1, 0x019736ab, 0xb86ced43,
+    0xa911873a, 0x10ea5cd2, 0x88c53e9e, 0x313ee576, 0x20438f0f,
+    0x99b854e7, 0x02b95bfd, 0xbb428015, 0xaa3fea6c, 0x13c43184,
+    0x474cf219, 0xfeb729f1, 0xefca4388, 0x56319860, 0xcd30977a,
+    0x74cb4c92, 0x65b626eb, 0xdc4dfd03, 0xcca7a1d1, 0x755c7a39,
+    0x64211040, 0xdddacba8, 0x46dbc4b2, 0xff201f5a, 0xee5d7523,
+    0x57a6aecb, 0x032e6d56, 0xbad5b6be, 0xaba8dcc7, 0x1253072f,
+    0x89520835, 0x30a9d3dd, 0x21d4b9a4, 0x982f624c, 0xcafb7b7d,
+    0x7300a095, 0x627dcaec, 0xdb861104, 0x40871e1e, 0xf97cc5f6,
+    0xe801af8f, 0x51fa7467, 0x0572b7fa, 0xbc896c12, 0xadf4066b,
+    0x140fdd83, 0x8f0ed299, 0x36f50971, 0x27886308, 0x9e73b8e0,
+    0x8e99e432, 0x37623fda, 0x261f55a3, 0x9fe48e4b, 0x04e58151,
+    0xbd1e5ab9, 0xac6330c0, 0x1598eb28, 0x411028b5, 0xf8ebf35d,
+    0xe9969924, 0x506d42cc, 0xcb6c4dd6, 0x7297963e, 0x63eafc47,
+    0xda1127af, 0x423e45e3, 0xfbc59e0b, 0xeab8f472, 0x53432f9a,
+    0xc8422080, 0x71b9fb68, 0x60c49111, 0xd93f4af9, 0x8db78964,
+    0x344c528c, 0x253138f5, 0x9ccae31d, 0x07cbec07, 0xbe3037ef,
+    0xaf4d5d96, 0x16b6867e, 0x065cdaac, 0xbfa70144, 0xaeda6b3d,
+    0x1721b0d5, 0x8c20bfcf, 0x35db6427, 0x24a60e5e, 0x9d5dd5b6,
+    0xc9d5162b, 0x702ecdc3, 0x6153a7ba, 0xd8a87c52, 0x43a97348,
+    0xfa52a8a0, 0xeb2fc2d9, 0x52d41931, 0x4e87f0bb, 0xf77c2b53,
+    0xe601412a, 0x5ffa9ac2, 0xc4fb95d8, 0x7d004e30, 0x6c7d2449,
+    0xd586ffa1, 0x810e3c3c, 0x38f5e7d4, 0x29888dad, 0x90735645,
+    0x0b72595f, 0xb28982b7, 0xa3f4e8ce, 0x1a0f3326, 0x0ae56ff4,
+    0xb31eb41c, 0xa263de65, 0x1b98058d, 0x80990a97, 0x3962d17f,
+    0x281fbb06, 0x91e460ee, 0xc56ca373, 0x7c97789b, 0x6dea12e2,
+    0xd411c90a, 0x4f10c610, 0xf6eb1df8, 0xe7967781, 0x5e6dac69,
+    0xc642ce25, 0x7fb915cd, 0x6ec47fb4, 0xd73fa45c, 0x4c3eab46,
+    0xf5c570ae, 0xe4b81ad7, 0x5d43c13f, 0x09cb02a2, 0xb030d94a,
+    0xa14db333, 0x18b668db, 0x83b767c1, 0x3a4cbc29, 0x2b31d650,
+    0x92ca0db8, 0x8220516a, 0x3bdb8a82, 0x2aa6e0fb, 0x935d3b13,
+    0x085c3409, 0xb1a7efe1, 0xa0da8598, 0x19215e70, 0x4da99ded,
+    0xf4524605, 0xe52f2c7c, 0x5cd4f794, 0xc7d5f88e, 0x7e2e2366,
+    0x6f53491f, 0xd6a892f7, 0x847c8bc6, 0x3d87502e, 0x2cfa3a57,
+    0x9501e1bf, 0x0e00eea5, 0xb7fb354d, 0xa6865f34, 0x1f7d84dc,
+    0x4bf54741, 0xf20e9ca9, 0xe373f6d0, 0x5a882d38, 0xc1892222,
+    0x7872f9ca, 0x690f93b3, 0xd0f4485b, 0xc01e1489, 0x79e5cf61,
+    0x6898a518, 0xd1637ef0, 0x4a6271ea, 0xf399aa02, 0xe2e4c07b,
+    0x5b1f1b93, 0x0f97d80e, 0xb66c03e6, 0xa711699f, 0x1eeab277,
+    0x85ebbd6d, 0x3c106685, 0x2d6d0cfc, 0x9496d714, 0x0cb9b558,
+    0xb5426eb0, 0xa43f04c9, 0x1dc4df21, 0x86c5d03b, 0x3f3e0bd3,
+    0x2e4361aa, 0x97b8ba42, 0xc33079df, 0x7acba237, 0x6bb6c84e,
+    0xd24d13a6, 0x494c1cbc, 0xf0b7c754, 0xe1caad2d, 0x583176c5,
+    0x48db2a17, 0xf120f1ff, 0xe05d9b86, 0x59a6406e, 0xc2a74f74,
+    0x7b5c949c, 0x6a21fee5, 0xd3da250d, 0x8752e690, 0x3ea93d78,
+    0x2fd45701, 0x962f8ce9, 0x0d2e83f3, 0xb4d5581b, 0xa5a83262,
+    0x1c53e98a},
+   {0x00000000, 0x9d0fe176, 0xe16ec4ad, 0x7c6125db, 0x19ac8f1b,
+    0x84a36e6d, 0xf8c24bb6, 0x65cdaac0, 0x33591e36, 0xae56ff40,
+    0xd237da9b, 0x4f383bed, 0x2af5912d, 0xb7fa705b, 0xcb9b5580,
+    0x5694b4f6, 0x66b23c6c, 0xfbbddd1a, 0x87dcf8c1, 0x1ad319b7,
+    0x7f1eb377, 0xe2115201, 0x9e7077da, 0x037f96ac, 0x55eb225a,
+    0xc8e4c32c, 0xb485e6f7, 0x298a0781, 0x4c47ad41, 0xd1484c37,
+    0xad2969ec, 0x3026889a, 0xcd6478d8, 0x506b99ae, 0x2c0abc75,
+    0xb1055d03, 0xd4c8f7c3, 0x49c716b5, 0x35a6336e, 0xa8a9d218,
+    0xfe3d66ee, 0x63328798, 0x1f53a243, 0x825c4335, 0xe791e9f5,
+    0x7a9e0883, 0x06ff2d58, 0x9bf0cc2e, 0xabd644b4, 0x36d9a5c2,
+    0x4ab88019, 0xd7b7616f, 0xb27acbaf, 0x2f752ad9, 0x53140f02,
+    0xce1bee74, 0x988f5a82, 0x0580bbf4, 0x79e19e2f, 0xe4ee7f59,
+    0x8123d599, 0x1c2c34ef, 0x604d1134, 0xfd42f042, 0x41b9f7f1,
+    0xdcb61687, 0xa0d7335c, 0x3dd8d22a, 0x581578ea, 0xc51a999c,
+    0xb97bbc47, 0x24745d31, 0x72e0e9c7, 0xefef08b1, 0x938e2d6a,
+    0x0e81cc1c, 0x6b4c66dc, 0xf64387aa, 0x8a22a271, 0x172d4307,
+    0x270bcb9d, 0xba042aeb, 0xc6650f30, 0x5b6aee46, 0x3ea74486,
+    0xa3a8a5f0, 0xdfc9802b, 0x42c6615d, 0x1452d5ab, 0x895d34dd,
+    0xf53c1106, 0x6833f070, 0x0dfe5ab0, 0x90f1bbc6, 0xec909e1d,
+    0x719f7f6b, 0x8cdd8f29, 0x11d26e5f, 0x6db34b84, 0xf0bcaaf2,
+    0x95710032, 0x087ee144, 0x741fc49f, 0xe91025e9, 0xbf84911f,
+    0x228b7069, 0x5eea55b2, 0xc3e5b4c4, 0xa6281e04, 0x3b27ff72,
+    0x4746daa9, 0xda493bdf, 0xea6fb345, 0x77605233, 0x0b0177e8,
+    0x960e969e, 0xf3c33c5e, 0x6eccdd28, 0x12adf8f3, 0x8fa21985,
+    0xd936ad73, 0x44394c05, 0x385869de, 0xa55788a8, 0xc09a2268,
+    0x5d95c31e, 0x21f4e6c5, 0xbcfb07b3, 0x8373efe2, 0x1e7c0e94,
+    0x621d2b4f, 0xff12ca39, 0x9adf60f9, 0x07d0818f, 0x7bb1a454,
+    0xe6be4522, 0xb02af1d4, 0x2d2510a2, 0x51443579, 0xcc4bd40f,
+    0xa9867ecf, 0x34899fb9, 0x48e8ba62, 0xd5e75b14, 0xe5c1d38e,
+    0x78ce32f8, 0x04af1723, 0x99a0f655, 0xfc6d5c95, 0x6162bde3,
+    0x1d039838, 0x800c794e, 0xd698cdb8, 0x4b972cce, 0x37f60915,
+    0xaaf9e863, 0xcf3442a3, 0x523ba3d5, 0x2e5a860e, 0xb3556778,
+    0x4e17973a, 0xd318764c, 0xaf795397, 0x3276b2e1, 0x57bb1821,
+    0xcab4f957, 0xb6d5dc8c, 0x2bda3dfa, 0x7d4e890c, 0xe041687a,
+    0x9c204da1, 0x012facd7, 0x64e20617, 0xf9ede761, 0x858cc2ba,
+    0x188323cc, 0x28a5ab56, 0xb5aa4a20, 0xc9cb6ffb, 0x54c48e8d,
+    0x3109244d, 0xac06c53b, 0xd067e0e0, 0x4d680196, 0x1bfcb560,
+    0x86f35416, 0xfa9271cd, 0x679d90bb, 0x02503a7b, 0x9f5fdb0d,
+    0xe33efed6, 0x7e311fa0, 0xc2ca1813, 0x5fc5f965, 0x23a4dcbe,
+    0xbeab3dc8, 0xdb669708, 0x4669767e, 0x3a0853a5, 0xa707b2d3,
+    0xf1930625, 0x6c9ce753, 0x10fdc288, 0x8df223fe, 0xe83f893e,
+    0x75306848, 0x09514d93, 0x945eace5, 0xa478247f, 0x3977c509,
+    0x4516e0d2, 0xd81901a4, 0xbdd4ab64, 0x20db4a12, 0x5cba6fc9,
+    0xc1b58ebf, 0x97213a49, 0x0a2edb3f, 0x764ffee4, 0xeb401f92,
+    0x8e8db552, 0x13825424, 0x6fe371ff, 0xf2ec9089, 0x0fae60cb,
+    0x92a181bd, 0xeec0a466, 0x73cf4510, 0x1602efd0, 0x8b0d0ea6,
+    0xf76c2b7d, 0x6a63ca0b, 0x3cf77efd, 0xa1f89f8b, 0xdd99ba50,
+    0x40965b26, 0x255bf1e6, 0xb8541090, 0xc435354b, 0x593ad43d,
+    0x691c5ca7, 0xf413bdd1, 0x8872980a, 0x157d797c, 0x70b0d3bc,
+    0xedbf32ca, 0x91de1711, 0x0cd1f667, 0x5a454291, 0xc74aa3e7,
+    0xbb2b863c, 0x2624674a, 0x43e9cd8a, 0xdee62cfc, 0xa2870927,
+    0x3f88e851},
+   {0x00000000, 0xdd96d985, 0x605cb54b, 0xbdca6cce, 0xc0b96a96,
+    0x1d2fb313, 0xa0e5dfdd, 0x7d730658, 0x5a03d36d, 0x87950ae8,
+    0x3a5f6626, 0xe7c9bfa3, 0x9abab9fb, 0x472c607e, 0xfae60cb0,
+    0x2770d535, 0xb407a6da, 0x69917f5f, 0xd45b1391, 0x09cdca14,
+    0x74becc4c, 0xa92815c9, 0x14e27907, 0xc974a082, 0xee0475b7,
+    0x3392ac32, 0x8e58c0fc, 0x53ce1979, 0x2ebd1f21, 0xf32bc6a4,
+    0x4ee1aa6a, 0x937773ef, 0xb37e4bf5, 0x6ee89270, 0xd322febe,
+    0x0eb4273b, 0x73c72163, 0xae51f8e6, 0x139b9428, 0xce0d4dad,
+    0xe97d9898, 0x34eb411d, 0x89212dd3, 0x54b7f456, 0x29c4f20e,
+    0xf4522b8b, 0x49984745, 0x940e9ec0, 0x0779ed2f, 0xdaef34aa,
+    0x67255864, 0xbab381e1, 0xc7c087b9, 0x1a565e3c, 0xa79c32f2,
+    0x7a0aeb77, 0x5d7a3e42, 0x80ece7c7, 0x3d268b09, 0xe0b0528c,
+    0x9dc354d4, 0x40558d51, 0xfd9fe19f, 0x2009381a, 0xbd8d91ab,
+    0x601b482e, 0xddd124e0, 0x0047fd65, 0x7d34fb3d, 0xa0a222b8,
+    0x1d684e76, 0xc0fe97f3, 0xe78e42c6, 0x3a189b43, 0x87d2f78d,
+    0x5a442e08, 0x27372850, 0xfaa1f1d5, 0x476b9d1b, 0x9afd449e,
+    0x098a3771, 0xd41ceef4, 0x69d6823a, 0xb4405bbf, 0xc9335de7,
+    0x14a58462, 0xa96fe8ac, 0x74f93129, 0x5389e41c, 0x8e1f3d99,
+    0x33d55157, 0xee4388d2, 0x93308e8a, 0x4ea6570f, 0xf36c3bc1,
+    0x2efae244, 0x0ef3da5e, 0xd36503db, 0x6eaf6f15, 0xb339b690,
+    0xce4ab0c8, 0x13dc694d, 0xae160583, 0x7380dc06, 0x54f00933,
+    0x8966d0b6, 0x34acbc78, 0xe93a65fd, 0x944963a5, 0x49dfba20,
+    0xf415d6ee, 0x29830f6b, 0xbaf47c84, 0x6762a501, 0xdaa8c9cf,
+    0x073e104a, 0x7a4d1612, 0xa7dbcf97, 0x1a11a359, 0xc7877adc,
+    0xe0f7afe9, 0x3d61766c, 0x80ab1aa2, 0x5d3dc327, 0x204ec57f,
+    0xfdd81cfa, 0x40127034, 0x9d84a9b1, 0xa06a2517, 0x7dfcfc92,
+    0xc036905c, 0x1da049d9, 0x60d34f81, 0xbd459604, 0x008ffaca,
+    0xdd19234f, 0xfa69f67a, 0x27ff2fff, 0x9a354331, 0x47a39ab4,
+    0x3ad09cec, 0xe7464569, 0x5a8c29a7, 0x871af022, 0x146d83cd,
+    0xc9fb5a48, 0x74313686, 0xa9a7ef03, 0xd4d4e95b, 0x094230de,
+    0xb4885c10, 0x691e8595, 0x4e6e50a0, 0x93f88925, 0x2e32e5eb,
+    0xf3a43c6e, 0x8ed73a36, 0x5341e3b3, 0xee8b8f7d, 0x331d56f8,
+    0x13146ee2, 0xce82b767, 0x7348dba9, 0xaede022c, 0xd3ad0474,
+    0x0e3bddf1, 0xb3f1b13f, 0x6e6768ba, 0x4917bd8f, 0x9481640a,
+    0x294b08c4, 0xf4ddd141, 0x89aed719, 0x54380e9c, 0xe9f26252,
+    0x3464bbd7, 0xa713c838, 0x7a8511bd, 0xc74f7d73, 0x1ad9a4f6,
+    0x67aaa2ae, 0xba3c7b2b, 0x07f617e5, 0xda60ce60, 0xfd101b55,
+    0x2086c2d0, 0x9d4cae1e, 0x40da779b, 0x3da971c3, 0xe03fa846,
+    0x5df5c488, 0x80631d0d, 0x1de7b4bc, 0xc0716d39, 0x7dbb01f7,
+    0xa02dd872, 0xdd5ede2a, 0x00c807af, 0xbd026b61, 0x6094b2e4,
+    0x47e467d1, 0x9a72be54, 0x27b8d29a, 0xfa2e0b1f, 0x875d0d47,
+    0x5acbd4c2, 0xe701b80c, 0x3a976189, 0xa9e01266, 0x7476cbe3,
+    0xc9bca72d, 0x142a7ea8, 0x695978f0, 0xb4cfa175, 0x0905cdbb,
+    0xd493143e, 0xf3e3c10b, 0x2e75188e, 0x93bf7440, 0x4e29adc5,
+    0x335aab9d, 0xeecc7218, 0x53061ed6, 0x8e90c753, 0xae99ff49,
+    0x730f26cc, 0xcec54a02, 0x13539387, 0x6e2095df, 0xb3b64c5a,
+    0x0e7c2094, 0xd3eaf911, 0xf49a2c24, 0x290cf5a1, 0x94c6996f,
+    0x495040ea, 0x342346b2, 0xe9b59f37, 0x547ff3f9, 0x89e92a7c,
+    0x1a9e5993, 0xc7088016, 0x7ac2ecd8, 0xa754355d, 0xda273305,
+    0x07b1ea80, 0xba7b864e, 0x67ed5fcb, 0x409d8afe, 0x9d0b537b,
+    0x20c13fb5, 0xfd57e630, 0x8024e068, 0x5db239ed, 0xe0785523,
+    0x3dee8ca6}};
+
+static const z_word_t crc_braid_big_table[][256] = {
+   {0x00000000, 0x85d996dd, 0x4bb55c60, 0xce6ccabd, 0x966ab9c0,
+    0x13b32f1d, 0xdddfe5a0, 0x5806737d, 0x6dd3035a, 0xe80a9587,
+    0x26665f3a, 0xa3bfc9e7, 0xfbb9ba9a, 0x7e602c47, 0xb00ce6fa,
+    0x35d57027, 0xdaa607b4, 0x5f7f9169, 0x91135bd4, 0x14cacd09,
+    0x4cccbe74, 0xc91528a9, 0x0779e214, 0x82a074c9, 0xb77504ee,
+    0x32ac9233, 0xfcc0588e, 0x7919ce53, 0x211fbd2e, 0xa4c62bf3,
+    0x6aaae14e, 0xef737793, 0xf54b7eb3, 0x7092e86e, 0xbefe22d3,
+    0x3b27b40e, 0x6321c773, 0xe6f851ae, 0x28949b13, 0xad4d0dce,
+    0x98987de9, 0x1d41eb34, 0xd32d2189, 0x56f4b754, 0x0ef2c429,
+    0x8b2b52f4, 0x45479849, 0xc09e0e94, 0x2fed7907, 0xaa34efda,
+    0x64582567, 0xe181b3ba, 0xb987c0c7, 0x3c5e561a, 0xf2329ca7,
+    0x77eb0a7a, 0x423e7a5d, 0xc7e7ec80, 0x098b263d, 0x8c52b0e0,
+    0xd454c39d, 0x518d5540, 0x9fe19ffd, 0x1a380920, 0xab918dbd,
+    0x2e481b60, 0xe024d1dd, 0x65fd4700, 0x3dfb347d, 0xb822a2a0,
+    0x764e681d, 0xf397fec0, 0xc6428ee7, 0x439b183a, 0x8df7d287,
+    0x082e445a, 0x50283727, 0xd5f1a1fa, 0x1b9d6b47, 0x9e44fd9a,
+    0x71378a09, 0xf4ee1cd4, 0x3a82d669, 0xbf5b40b4, 0xe75d33c9,
+    0x6284a514, 0xace86fa9, 0x2931f974, 0x1ce48953, 0x993d1f8e,
+    0x5751d533, 0xd28843ee, 0x8a8e3093, 0x0f57a64e, 0xc13b6cf3,
+    0x44e2fa2e, 0x5edaf30e, 0xdb0365d3, 0x156faf6e, 0x90b639b3,
+    0xc8b04ace, 0x4d69dc13, 0x830516ae, 0x06dc8073, 0x3309f054,
+    0xb6d06689, 0x78bcac34, 0xfd653ae9, 0xa5634994, 0x20badf49,
+    0xeed615f4, 0x6b0f8329, 0x847cf4ba, 0x01a56267, 0xcfc9a8da,
+    0x4a103e07, 0x12164d7a, 0x97cfdba7, 0x59a3111a, 0xdc7a87c7,
+    0xe9aff7e0, 0x6c76613d, 0xa21aab80, 0x27c33d5d, 0x7fc54e20,
+    0xfa1cd8fd, 0x34701240, 0xb1a9849d, 0x17256aa0, 0x92fcfc7d,
+    0x5c9036c0, 0xd949a01d, 0x814fd360, 0x049645bd, 0xcafa8f00,
+    0x4f2319dd, 0x7af669fa, 0xff2fff27, 0x3143359a, 0xb49aa347,
+    0xec9cd03a, 0x694546e7, 0xa7298c5a, 0x22f01a87, 0xcd836d14,
+    0x485afbc9, 0x86363174, 0x03efa7a9, 0x5be9d4d4, 0xde304209,
+    0x105c88b4, 0x95851e69, 0xa0506e4e, 0x2589f893, 0xebe5322e,
+    0x6e3ca4f3, 0x363ad78e, 0xb3e34153, 0x7d8f8bee, 0xf8561d33,
+    0xe26e1413, 0x67b782ce, 0xa9db4873, 0x2c02deae, 0x7404add3,
+    0xf1dd3b0e, 0x3fb1f1b3, 0xba68676e, 0x8fbd1749, 0x0a648194,
+    0xc4084b29, 0x41d1ddf4, 0x19d7ae89, 0x9c0e3854, 0x5262f2e9,
+    0xd7bb6434, 0x38c813a7, 0xbd11857a, 0x737d4fc7, 0xf6a4d91a,
+    0xaea2aa67, 0x2b7b3cba, 0xe517f607, 0x60ce60da, 0x551b10fd,
+    0xd0c28620, 0x1eae4c9d, 0x9b77da40, 0xc371a93d, 0x46a83fe0,
+    0x88c4f55d, 0x0d1d6380, 0xbcb4e71d, 0x396d71c0, 0xf701bb7d,
+    0x72d82da0, 0x2ade5edd, 0xaf07c800, 0x616b02bd, 0xe4b29460,
+    0xd167e447, 0x54be729a, 0x9ad2b827, 0x1f0b2efa, 0x470d5d87,
+    0xc2d4cb5a, 0x0cb801e7, 0x8961973a, 0x6612e0a9, 0xe3cb7674,
+    0x2da7bcc9, 0xa87e2a14, 0xf0785969, 0x75a1cfb4, 0xbbcd0509,
+    0x3e1493d4, 0x0bc1e3f3, 0x8e18752e, 0x4074bf93, 0xc5ad294e,
+    0x9dab5a33, 0x1872ccee, 0xd61e0653, 0x53c7908e, 0x49ff99ae,
+    0xcc260f73, 0x024ac5ce, 0x87935313, 0xdf95206e, 0x5a4cb6b3,
+    0x94207c0e, 0x11f9ead3, 0x242c9af4, 0xa1f50c29, 0x6f99c694,
+    0xea405049, 0xb2462334, 0x379fb5e9, 0xf9f37f54, 0x7c2ae989,
+    0x93599e1a, 0x168008c7, 0xd8ecc27a, 0x5d3554a7, 0x053327da,
+    0x80eab107, 0x4e867bba, 0xcb5fed67, 0xfe8a9d40, 0x7b530b9d,
+    0xb53fc120, 0x30e657fd, 0x68e02480, 0xed39b25d, 0x235578e0,
+    0xa68cee3d},
+   {0x00000000, 0x76e10f9d, 0xadc46ee1, 0xdb25617c, 0x1b8fac19,
+    0x6d6ea384, 0xb64bc2f8, 0xc0aacd65, 0x361e5933, 0x40ff56ae,
+    0x9bda37d2, 0xed3b384f, 0x2d91f52a, 0x5b70fab7, 0x80559bcb,
+    0xf6b49456, 0x6c3cb266, 0x1addbdfb, 0xc1f8dc87, 0xb719d31a,
+    0x77b31e7f, 0x015211e2, 0xda77709e, 0xac967f03, 0x5a22eb55,
+    0x2cc3e4c8, 0xf7e685b4, 0x81078a29, 0x41ad474c, 0x374c48d1,
+    0xec6929ad, 0x9a882630, 0xd87864cd, 0xae996b50, 0x75bc0a2c,
+    0x035d05b1, 0xc3f7c8d4, 0xb516c749, 0x6e33a635, 0x18d2a9a8,
+    0xee663dfe, 0x98873263, 0x43a2531f, 0x35435c82, 0xf5e991e7,
+    0x83089e7a, 0x582dff06, 0x2eccf09b, 0xb444d6ab, 0xc2a5d936,
+    0x1980b84a, 0x6f61b7d7, 0xafcb7ab2, 0xd92a752f, 0x020f1453,
+    0x74ee1bce, 0x825a8f98, 0xf4bb8005, 0x2f9ee179, 0x597feee4,
+    0x99d52381, 0xef342c1c, 0x34114d60, 0x42f042fd, 0xf1f7b941,
+    0x8716b6dc, 0x5c33d7a0, 0x2ad2d83d, 0xea781558, 0x9c991ac5,
+    0x47bc7bb9, 0x315d7424, 0xc7e9e072, 0xb108efef, 0x6a2d8e93,
+    0x1ccc810e, 0xdc664c6b, 0xaa8743f6, 0x71a2228a, 0x07432d17,
+    0x9dcb0b27, 0xeb2a04ba, 0x300f65c6, 0x46ee6a5b, 0x8644a73e,
+    0xf0a5a8a3, 0x2b80c9df, 0x5d61c642, 0xabd55214, 0xdd345d89,
+    0x06113cf5, 0x70f03368, 0xb05afe0d, 0xc6bbf190, 0x1d9e90ec,
+    0x6b7f9f71, 0x298fdd8c, 0x5f6ed211, 0x844bb36d, 0xf2aabcf0,
+    0x32007195, 0x44e17e08, 0x9fc41f74, 0xe92510e9, 0x1f9184bf,
+    0x69708b22, 0xb255ea5e, 0xc4b4e5c3, 0x041e28a6, 0x72ff273b,
+    0xa9da4647, 0xdf3b49da, 0x45b36fea, 0x33526077, 0xe877010b,
+    0x9e960e96, 0x5e3cc3f3, 0x28ddcc6e, 0xf3f8ad12, 0x8519a28f,
+    0x73ad36d9, 0x054c3944, 0xde695838, 0xa88857a5, 0x68229ac0,
+    0x1ec3955d, 0xc5e6f421, 0xb307fbbc, 0xe2ef7383, 0x940e7c1e,
+    0x4f2b1d62, 0x39ca12ff, 0xf960df9a, 0x8f81d007, 0x54a4b17b,
+    0x2245bee6, 0xd4f12ab0, 0xa210252d, 0x79354451, 0x0fd44bcc,
+    0xcf7e86a9, 0xb99f8934, 0x62bae848, 0x145be7d5, 0x8ed3c1e5,
+    0xf832ce78, 0x2317af04, 0x55f6a099, 0x955c6dfc, 0xe3bd6261,
+    0x3898031d, 0x4e790c80, 0xb8cd98d6, 0xce2c974b, 0x1509f637,
+    0x63e8f9aa, 0xa34234cf, 0xd5a33b52, 0x0e865a2e, 0x786755b3,
+    0x3a97174e, 0x4c7618d3, 0x975379af, 0xe1b27632, 0x2118bb57,
+    0x57f9b4ca, 0x8cdcd5b6, 0xfa3dda2b, 0x0c894e7d, 0x7a6841e0,
+    0xa14d209c, 0xd7ac2f01, 0x1706e264, 0x61e7edf9, 0xbac28c85,
+    0xcc238318, 0x56aba528, 0x204aaab5, 0xfb6fcbc9, 0x8d8ec454,
+    0x4d240931, 0x3bc506ac, 0xe0e067d0, 0x9601684d, 0x60b5fc1b,
+    0x1654f386, 0xcd7192fa, 0xbb909d67, 0x7b3a5002, 0x0ddb5f9f,
+    0xd6fe3ee3, 0xa01f317e, 0x1318cac2, 0x65f9c55f, 0xbedca423,
+    0xc83dabbe, 0x089766db, 0x7e766946, 0xa553083a, 0xd3b207a7,
+    0x250693f1, 0x53e79c6c, 0x88c2fd10, 0xfe23f28d, 0x3e893fe8,
+    0x48683075, 0x934d5109, 0xe5ac5e94, 0x7f2478a4, 0x09c57739,
+    0xd2e01645, 0xa40119d8, 0x64abd4bd, 0x124adb20, 0xc96fba5c,
+    0xbf8eb5c1, 0x493a2197, 0x3fdb2e0a, 0xe4fe4f76, 0x921f40eb,
+    0x52b58d8e, 0x24548213, 0xff71e36f, 0x8990ecf2, 0xcb60ae0f,
+    0xbd81a192, 0x66a4c0ee, 0x1045cf73, 0xd0ef0216, 0xa60e0d8b,
+    0x7d2b6cf7, 0x0bca636a, 0xfd7ef73c, 0x8b9ff8a1, 0x50ba99dd,
+    0x265b9640, 0xe6f15b25, 0x901054b8, 0x4b3535c4, 0x3dd43a59,
+    0xa75c1c69, 0xd1bd13f4, 0x0a987288, 0x7c797d15, 0xbcd3b070,
+    0xca32bfed, 0x1117de91, 0x67f6d10c, 0x9142455a, 0xe7a34ac7,
+    0x3c862bbb, 0x4a672426, 0x8acde943, 0xfc2ce6de, 0x270987a2,
+    0x51e8883f},
+   {0x00000000, 0xe8dbfbb9, 0x91b186a8, 0x796a7d11, 0x63657c8a,
+    0x8bbe8733, 0xf2d4fa22, 0x1a0f019b, 0x87cc89cf, 0x6f177276,
+    0x167d0f67, 0xfea6f4de, 0xe4a9f545, 0x0c720efc, 0x751873ed,
+    0x9dc38854, 0x4f9f6244, 0xa74499fd, 0xde2ee4ec, 0x36f51f55,
+    0x2cfa1ece, 0xc421e577, 0xbd4b9866, 0x559063df, 0xc853eb8b,
+    0x20881032, 0x59e26d23, 0xb139969a, 0xab369701, 0x43ed6cb8,
+    0x3a8711a9, 0xd25cea10, 0x9e3ec588, 0x76e53e31, 0x0f8f4320,
+    0xe754b899, 0xfd5bb902, 0x158042bb, 0x6cea3faa, 0x8431c413,
+    0x19f24c47, 0xf129b7fe, 0x8843caef, 0x60983156, 0x7a9730cd,
+    0x924ccb74, 0xeb26b665, 0x03fd4ddc, 0xd1a1a7cc, 0x397a5c75,
+    0x40102164, 0xa8cbdadd, 0xb2c4db46, 0x5a1f20ff, 0x23755dee,
+    0xcbaea657, 0x566d2e03, 0xbeb6d5ba, 0xc7dca8ab, 0x2f075312,
+    0x35085289, 0xddd3a930, 0xa4b9d421, 0x4c622f98, 0x7d7bfbca,
+    0x95a00073, 0xecca7d62, 0x041186db, 0x1e1e8740, 0xf6c57cf9,
+    0x8faf01e8, 0x6774fa51, 0xfab77205, 0x126c89bc, 0x6b06f4ad,
+    0x83dd0f14, 0x99d20e8f, 0x7109f536, 0x08638827, 0xe0b8739e,
+    0x32e4998e, 0xda3f6237, 0xa3551f26, 0x4b8ee49f, 0x5181e504,
+    0xb95a1ebd, 0xc03063ac, 0x28eb9815, 0xb5281041, 0x5df3ebf8,
+    0x249996e9, 0xcc426d50, 0xd64d6ccb, 0x3e969772, 0x47fcea63,
+    0xaf2711da, 0xe3453e42, 0x0b9ec5fb, 0x72f4b8ea, 0x9a2f4353,
+    0x802042c8, 0x68fbb971, 0x1191c460, 0xf94a3fd9, 0x6489b78d,
+    0x8c524c34, 0xf5383125, 0x1de3ca9c, 0x07eccb07, 0xef3730be,
+    0x965d4daf, 0x7e86b616, 0xacda5c06, 0x4401a7bf, 0x3d6bdaae,
+    0xd5b02117, 0xcfbf208c, 0x2764db35, 0x5e0ea624, 0xb6d55d9d,
+    0x2b16d5c9, 0xc3cd2e70, 0xbaa75361, 0x527ca8d8, 0x4873a943,
+    0xa0a852fa, 0xd9c22feb, 0x3119d452, 0xbbf0874e, 0x532b7cf7,
+    0x2a4101e6, 0xc29afa5f, 0xd895fbc4, 0x304e007d, 0x49247d6c,
+    0xa1ff86d5, 0x3c3c0e81, 0xd4e7f538, 0xad8d8829, 0x45567390,
+    0x5f59720b, 0xb78289b2, 0xcee8f4a3, 0x26330f1a, 0xf46fe50a,
+    0x1cb41eb3, 0x65de63a2, 0x8d05981b, 0x970a9980, 0x7fd16239,
+    0x06bb1f28, 0xee60e491, 0x73a36cc5, 0x9b78977c, 0xe212ea6d,
+    0x0ac911d4, 0x10c6104f, 0xf81debf6, 0x817796e7, 0x69ac6d5e,
+    0x25ce42c6, 0xcd15b97f, 0xb47fc46e, 0x5ca43fd7, 0x46ab3e4c,
+    0xae70c5f5, 0xd71ab8e4, 0x3fc1435d, 0xa202cb09, 0x4ad930b0,
+    0x33b34da1, 0xdb68b618, 0xc167b783, 0x29bc4c3a, 0x50d6312b,
+    0xb80dca92, 0x6a512082, 0x828adb3b, 0xfbe0a62a, 0x133b5d93,
+    0x09345c08, 0xe1efa7b1, 0x9885daa0, 0x705e2119, 0xed9da94d,
+    0x054652f4, 0x7c2c2fe5, 0x94f7d45c, 0x8ef8d5c7, 0x66232e7e,
+    0x1f49536f, 0xf792a8d6, 0xc68b7c84, 0x2e50873d, 0x573afa2c,
+    0xbfe10195, 0xa5ee000e, 0x4d35fbb7, 0x345f86a6, 0xdc847d1f,
+    0x4147f54b, 0xa99c0ef2, 0xd0f673e3, 0x382d885a, 0x222289c1,
+    0xcaf97278, 0xb3930f69, 0x5b48f4d0, 0x89141ec0, 0x61cfe579,
+    0x18a59868, 0xf07e63d1, 0xea71624a, 0x02aa99f3, 0x7bc0e4e2,
+    0x931b1f5b, 0x0ed8970f, 0xe6036cb6, 0x9f6911a7, 0x77b2ea1e,
+    0x6dbdeb85, 0x8566103c, 0xfc0c6d2d, 0x14d79694, 0x58b5b90c,
+    0xb06e42b5, 0xc9043fa4, 0x21dfc41d, 0x3bd0c586, 0xd30b3e3f,
+    0xaa61432e, 0x42bab897, 0xdf7930c3, 0x37a2cb7a, 0x4ec8b66b,
+    0xa6134dd2, 0xbc1c4c49, 0x54c7b7f0, 0x2dadcae1, 0xc5763158,
+    0x172adb48, 0xfff120f1, 0x869b5de0, 0x6e40a659, 0x744fa7c2,
+    0x9c945c7b, 0xe5fe216a, 0x0d25dad3, 0x90e65287, 0x783da93e,
+    0x0157d42f, 0xe98c2f96, 0xf3832e0d, 0x1b58d5b4, 0x6232a8a5,
+    0x8ae9531c},
+   {0x00000000, 0x919168ae, 0x6325a087, 0xf2b4c829, 0x874c31d4,
+    0x16dd597a, 0xe4699153, 0x75f8f9fd, 0x4f9f1373, 0xde0e7bdd,
+    0x2cbab3f4, 0xbd2bdb5a, 0xc8d322a7, 0x59424a09, 0xabf68220,
+    0x3a67ea8e, 0x9e3e27e6, 0x0faf4f48, 0xfd1b8761, 0x6c8aefcf,
+    0x19721632, 0x88e37e9c, 0x7a57b6b5, 0xebc6de1b, 0xd1a13495,
+    0x40305c3b, 0xb2849412, 0x2315fcbc, 0x56ed0541, 0xc77c6def,
+    0x35c8a5c6, 0xa459cd68, 0x7d7b3f17, 0xecea57b9, 0x1e5e9f90,
+    0x8fcff73e, 0xfa370ec3, 0x6ba6666d, 0x9912ae44, 0x0883c6ea,
+    0x32e42c64, 0xa37544ca, 0x51c18ce3, 0xc050e44d, 0xb5a81db0,
+    0x2439751e, 0xd68dbd37, 0x471cd599, 0xe34518f1, 0x72d4705f,
+    0x8060b876, 0x11f1d0d8, 0x64092925, 0xf598418b, 0x072c89a2,
+    0x96bde10c, 0xacda0b82, 0x3d4b632c, 0xcfffab05, 0x5e6ec3ab,
+    0x2b963a56, 0xba0752f8, 0x48b39ad1, 0xd922f27f, 0xfaf67e2e,
+    0x6b671680, 0x99d3dea9, 0x0842b607, 0x7dba4ffa, 0xec2b2754,
+    0x1e9fef7d, 0x8f0e87d3, 0xb5696d5d, 0x24f805f3, 0xd64ccdda,
+    0x47dda574, 0x32255c89, 0xa3b43427, 0x5100fc0e, 0xc09194a0,
+    0x64c859c8, 0xf5593166, 0x07edf94f, 0x967c91e1, 0xe384681c,
+    0x721500b2, 0x80a1c89b, 0x1130a035, 0x2b574abb, 0xbac62215,
+    0x4872ea3c, 0xd9e38292, 0xac1b7b6f, 0x3d8a13c1, 0xcf3edbe8,
+    0x5eafb346, 0x878d4139, 0x161c2997, 0xe4a8e1be, 0x75398910,
+    0x00c170ed, 0x91501843, 0x63e4d06a, 0xf275b8c4, 0xc812524a,
+    0x59833ae4, 0xab37f2cd, 0x3aa69a63, 0x4f5e639e, 0xdecf0b30,
+    0x2c7bc319, 0xbdeaabb7, 0x19b366df, 0x88220e71, 0x7a96c658,
+    0xeb07aef6, 0x9eff570b, 0x0f6e3fa5, 0xfddaf78c, 0x6c4b9f22,
+    0x562c75ac, 0xc7bd1d02, 0x3509d52b, 0xa498bd85, 0xd1604478,
+    0x40f12cd6, 0xb245e4ff, 0x23d48c51, 0xf4edfd5c, 0x657c95f2,
+    0x97c85ddb, 0x06593575, 0x73a1cc88, 0xe230a426, 0x10846c0f,
+    0x811504a1, 0xbb72ee2f, 0x2ae38681, 0xd8574ea8, 0x49c62606,
+    0x3c3edffb, 0xadafb755, 0x5f1b7f7c, 0xce8a17d2, 0x6ad3daba,
+    0xfb42b214, 0x09f67a3d, 0x98671293, 0xed9feb6e, 0x7c0e83c0,
+    0x8eba4be9, 0x1f2b2347, 0x254cc9c9, 0xb4dda167, 0x4669694e,
+    0xd7f801e0, 0xa200f81d, 0x339190b3, 0xc125589a, 0x50b43034,
+    0x8996c24b, 0x1807aae5, 0xeab362cc, 0x7b220a62, 0x0edaf39f,
+    0x9f4b9b31, 0x6dff5318, 0xfc6e3bb6, 0xc609d138, 0x5798b996,
+    0xa52c71bf, 0x34bd1911, 0x4145e0ec, 0xd0d48842, 0x2260406b,
+    0xb3f128c5, 0x17a8e5ad, 0x86398d03, 0x748d452a, 0xe51c2d84,
+    0x90e4d479, 0x0175bcd7, 0xf3c174fe, 0x62501c50, 0x5837f6de,
+    0xc9a69e70, 0x3b125659, 0xaa833ef7, 0xdf7bc70a, 0x4eeaafa4,
+    0xbc5e678d, 0x2dcf0f23, 0x0e1b8372, 0x9f8aebdc, 0x6d3e23f5,
+    0xfcaf4b5b, 0x8957b2a6, 0x18c6da08, 0xea721221, 0x7be37a8f,
+    0x41849001, 0xd015f8af, 0x22a13086, 0xb3305828, 0xc6c8a1d5,
+    0x5759c97b, 0xa5ed0152, 0x347c69fc, 0x9025a494, 0x01b4cc3a,
+    0xf3000413, 0x62916cbd, 0x17699540, 0x86f8fdee, 0x744c35c7,
+    0xe5dd5d69, 0xdfbab7e7, 0x4e2bdf49, 0xbc9f1760, 0x2d0e7fce,
+    0x58f68633, 0xc967ee9d, 0x3bd326b4, 0xaa424e1a, 0x7360bc65,
+    0xe2f1d4cb, 0x10451ce2, 0x81d4744c, 0xf42c8db1, 0x65bde51f,
+    0x97092d36, 0x06984598, 0x3cffaf16, 0xad6ec7b8, 0x5fda0f91,
+    0xce4b673f, 0xbbb39ec2, 0x2a22f66c, 0xd8963e45, 0x490756eb,
+    0xed5e9b83, 0x7ccff32d, 0x8e7b3b04, 0x1fea53aa, 0x6a12aa57,
+    0xfb83c2f9, 0x09370ad0, 0x98a6627e, 0xa2c188f0, 0x3350e05e,
+    0xc1e42877, 0x507540d9, 0x258db924, 0xb41cd18a, 0x46a819a3,
+    0xd739710d}};
+
+#endif /* W */
+
+#endif /* N == 4 */
+#if N == 5
+
+#if W == 8
+
+static const uint32_t crc_braid_table[][256] = {
+   {0x00000000, 0xaf449247, 0x85f822cf, 0x2abcb088, 0xd08143df,
+    0x7fc5d198, 0x55796110, 0xfa3df357, 0x7a7381ff, 0xd53713b8,
+    0xff8ba330, 0x50cf3177, 0xaaf2c220, 0x05b65067, 0x2f0ae0ef,
+    0x804e72a8, 0xf4e703fe, 0x5ba391b9, 0x711f2131, 0xde5bb376,
+    0x24664021, 0x8b22d266, 0xa19e62ee, 0x0edaf0a9, 0x8e948201,
+    0x21d01046, 0x0b6ca0ce, 0xa4283289, 0x5e15c1de, 0xf1515399,
+    0xdbede311, 0x74a97156, 0x32bf01bd, 0x9dfb93fa, 0xb7472372,
+    0x1803b135, 0xe23e4262, 0x4d7ad025, 0x67c660ad, 0xc882f2ea,
+    0x48cc8042, 0xe7881205, 0xcd34a28d, 0x627030ca, 0x984dc39d,
+    0x370951da, 0x1db5e152, 0xb2f17315, 0xc6580243, 0x691c9004,
+    0x43a0208c, 0xece4b2cb, 0x16d9419c, 0xb99dd3db, 0x93216353,
+    0x3c65f114, 0xbc2b83bc, 0x136f11fb, 0x39d3a173, 0x96973334,
+    0x6caac063, 0xc3ee5224, 0xe952e2ac, 0x461670eb, 0x657e037a,
+    0xca3a913d, 0xe08621b5, 0x4fc2b3f2, 0xb5ff40a5, 0x1abbd2e2,
+    0x3007626a, 0x9f43f02d, 0x1f0d8285, 0xb04910c2, 0x9af5a04a,
+    0x35b1320d, 0xcf8cc15a, 0x60c8531d, 0x4a74e395, 0xe53071d2,
+    0x91990084, 0x3edd92c3, 0x1461224b, 0xbb25b00c, 0x4118435b,
+    0xee5cd11c, 0xc4e06194, 0x6ba4f3d3, 0xebea817b, 0x44ae133c,
+    0x6e12a3b4, 0xc15631f3, 0x3b6bc2a4, 0x942f50e3, 0xbe93e06b,
+    0x11d7722c, 0x57c102c7, 0xf8859080, 0xd2392008, 0x7d7db24f,
+    0x87404118, 0x2804d35f, 0x02b863d7, 0xadfcf190, 0x2db28338,
+    0x82f6117f, 0xa84aa1f7, 0x070e33b0, 0xfd33c0e7, 0x527752a0,
+    0x78cbe228, 0xd78f706f, 0xa3260139, 0x0c62937e, 0x26de23f6,
+    0x899ab1b1, 0x73a742e6, 0xdce3d0a1, 0xf65f6029, 0x591bf26e,
+    0xd95580c6, 0x76111281, 0x5cada209, 0xf3e9304e, 0x09d4c319,
+    0xa690515e, 0x8c2ce1d6, 0x23687391, 0xcafc06f4, 0x65b894b3,
+    0x4f04243b, 0xe040b67c, 0x1a7d452b, 0xb539d76c, 0x9f8567e4,
+    0x30c1f5a3, 0xb08f870b, 0x1fcb154c, 0x3577a5c4, 0x9a333783,
+    0x600ec4d4, 0xcf4a5693, 0xe5f6e61b, 0x4ab2745c, 0x3e1b050a,
+    0x915f974d, 0xbbe327c5, 0x14a7b582, 0xee9a46d5, 0x41ded492,
+    0x6b62641a, 0xc426f65d, 0x446884f5, 0xeb2c16b2, 0xc190a63a,
+    0x6ed4347d, 0x94e9c72a, 0x3bad556d, 0x1111e5e5, 0xbe5577a2,
+    0xf8430749, 0x5707950e, 0x7dbb2586, 0xd2ffb7c1, 0x28c24496,
+    0x8786d6d1, 0xad3a6659, 0x027ef41e, 0x823086b6, 0x2d7414f1,
+    0x07c8a479, 0xa88c363e, 0x52b1c569, 0xfdf5572e, 0xd749e7a6,
+    0x780d75e1, 0x0ca404b7, 0xa3e096f0, 0x895c2678, 0x2618b43f,
+    0xdc254768, 0x7361d52f, 0x59dd65a7, 0xf699f7e0, 0x76d78548,
+    0xd993170f, 0xf32fa787, 0x5c6b35c0, 0xa656c697, 0x091254d0,
+    0x23aee458, 0x8cea761f, 0xaf82058e, 0x00c697c9, 0x2a7a2741,
+    0x853eb506, 0x7f034651, 0xd047d416, 0xfafb649e, 0x55bff6d9,
+    0xd5f18471, 0x7ab51636, 0x5009a6be, 0xff4d34f9, 0x0570c7ae,
+    0xaa3455e9, 0x8088e561, 0x2fcc7726, 0x5b650670, 0xf4219437,
+    0xde9d24bf, 0x71d9b6f8, 0x8be445af, 0x24a0d7e8, 0x0e1c6760,
+    0xa158f527, 0x2116878f, 0x8e5215c8, 0xa4eea540, 0x0baa3707,
+    0xf197c450, 0x5ed35617, 0x746fe69f, 0xdb2b74d8, 0x9d3d0433,
+    0x32799674, 0x18c526fc, 0xb781b4bb, 0x4dbc47ec, 0xe2f8d5ab,
+    0xc8446523, 0x6700f764, 0xe74e85cc, 0x480a178b, 0x62b6a703,
+    0xcdf23544, 0x37cfc613, 0x988b5454, 0xb237e4dc, 0x1d73769b,
+    0x69da07cd, 0xc69e958a, 0xec222502, 0x4366b745, 0xb95b4412,
+    0x161fd655, 0x3ca366dd, 0x93e7f49a, 0x13a98632, 0xbced1475,
+    0x9651a4fd, 0x391536ba, 0xc328c5ed, 0x6c6c57aa, 0x46d0e722,
+    0xe9947565},
+   {0x00000000, 0x4e890ba9, 0x9d121752, 0xd39b1cfb, 0xe15528e5,
+    0xafdc234c, 0x7c473fb7, 0x32ce341e, 0x19db578b, 0x57525c22,
+    0x84c940d9, 0xca404b70, 0xf88e7f6e, 0xb60774c7, 0x659c683c,
+    0x2b156395, 0x33b6af16, 0x7d3fa4bf, 0xaea4b844, 0xe02db3ed,
+    0xd2e387f3, 0x9c6a8c5a, 0x4ff190a1, 0x01789b08, 0x2a6df89d,
+    0x64e4f334, 0xb77fefcf, 0xf9f6e466, 0xcb38d078, 0x85b1dbd1,
+    0x562ac72a, 0x18a3cc83, 0x676d5e2c, 0x29e45585, 0xfa7f497e,
+    0xb4f642d7, 0x863876c9, 0xc8b17d60, 0x1b2a619b, 0x55a36a32,
+    0x7eb609a7, 0x303f020e, 0xe3a41ef5, 0xad2d155c, 0x9fe32142,
+    0xd16a2aeb, 0x02f13610, 0x4c783db9, 0x54dbf13a, 0x1a52fa93,
+    0xc9c9e668, 0x8740edc1, 0xb58ed9df, 0xfb07d276, 0x289cce8d,
+    0x6615c524, 0x4d00a6b1, 0x0389ad18, 0xd012b1e3, 0x9e9bba4a,
+    0xac558e54, 0xe2dc85fd, 0x31479906, 0x7fce92af, 0xcedabc58,
+    0x8053b7f1, 0x53c8ab0a, 0x1d41a0a3, 0x2f8f94bd, 0x61069f14,
+    0xb29d83ef, 0xfc148846, 0xd701ebd3, 0x9988e07a, 0x4a13fc81,
+    0x049af728, 0x3654c336, 0x78ddc89f, 0xab46d464, 0xe5cfdfcd,
+    0xfd6c134e, 0xb3e518e7, 0x607e041c, 0x2ef70fb5, 0x1c393bab,
+    0x52b03002, 0x812b2cf9, 0xcfa22750, 0xe4b744c5, 0xaa3e4f6c,
+    0x79a55397, 0x372c583e, 0x05e26c20, 0x4b6b6789, 0x98f07b72,
+    0xd67970db, 0xa9b7e274, 0xe73ee9dd, 0x34a5f526, 0x7a2cfe8f,
+    0x48e2ca91, 0x066bc138, 0xd5f0ddc3, 0x9b79d66a, 0xb06cb5ff,
+    0xfee5be56, 0x2d7ea2ad, 0x63f7a904, 0x51399d1a, 0x1fb096b3,
+    0xcc2b8a48, 0x82a281e1, 0x9a014d62, 0xd48846cb, 0x07135a30,
+    0x499a5199, 0x7b546587, 0x35dd6e2e, 0xe64672d5, 0xa8cf797c,
+    0x83da1ae9, 0xcd531140, 0x1ec80dbb, 0x50410612, 0x628f320c,
+    0x2c0639a5, 0xff9d255e, 0xb1142ef7, 0x46c47ef1, 0x084d7558,
+    0xdbd669a3, 0x955f620a, 0xa7915614, 0xe9185dbd, 0x3a834146,
+    0x740a4aef, 0x5f1f297a, 0x119622d3, 0xc20d3e28, 0x8c843581,
+    0xbe4a019f, 0xf0c30a36, 0x235816cd, 0x6dd11d64, 0x7572d1e7,
+    0x3bfbda4e, 0xe860c6b5, 0xa6e9cd1c, 0x9427f902, 0xdaaef2ab,
+    0x0935ee50, 0x47bce5f9, 0x6ca9866c, 0x22208dc5, 0xf1bb913e,
+    0xbf329a97, 0x8dfcae89, 0xc375a520, 0x10eeb9db, 0x5e67b272,
+    0x21a920dd, 0x6f202b74, 0xbcbb378f, 0xf2323c26, 0xc0fc0838,
+    0x8e750391, 0x5dee1f6a, 0x136714c3, 0x38727756, 0x76fb7cff,
+    0xa5606004, 0xebe96bad, 0xd9275fb3, 0x97ae541a, 0x443548e1,
+    0x0abc4348, 0x121f8fcb, 0x5c968462, 0x8f0d9899, 0xc1849330,
+    0xf34aa72e, 0xbdc3ac87, 0x6e58b07c, 0x20d1bbd5, 0x0bc4d840,
+    0x454dd3e9, 0x96d6cf12, 0xd85fc4bb, 0xea91f0a5, 0xa418fb0c,
+    0x7783e7f7, 0x390aec5e, 0x881ec2a9, 0xc697c900, 0x150cd5fb,
+    0x5b85de52, 0x694bea4c, 0x27c2e1e5, 0xf459fd1e, 0xbad0f6b7,
+    0x91c59522, 0xdf4c9e8b, 0x0cd78270, 0x425e89d9, 0x7090bdc7,
+    0x3e19b66e, 0xed82aa95, 0xa30ba13c, 0xbba86dbf, 0xf5216616,
+    0x26ba7aed, 0x68337144, 0x5afd455a, 0x14744ef3, 0xc7ef5208,
+    0x896659a1, 0xa2733a34, 0xecfa319d, 0x3f612d66, 0x71e826cf,
+    0x432612d1, 0x0daf1978, 0xde340583, 0x90bd0e2a, 0xef739c85,
+    0xa1fa972c, 0x72618bd7, 0x3ce8807e, 0x0e26b460, 0x40afbfc9,
+    0x9334a332, 0xddbda89b, 0xf6a8cb0e, 0xb821c0a7, 0x6bbadc5c,
+    0x2533d7f5, 0x17fde3eb, 0x5974e842, 0x8aeff4b9, 0xc466ff10,
+    0xdcc53393, 0x924c383a, 0x41d724c1, 0x0f5e2f68, 0x3d901b76,
+    0x731910df, 0xa0820c24, 0xee0b078d, 0xc51e6418, 0x8b976fb1,
+    0x580c734a, 0x168578e3, 0x244b4cfd, 0x6ac24754, 0xb9595baf,
+    0xf7d05006},
+   {0x00000000, 0x8d88fde2, 0xc060fd85, 0x4de80067, 0x5bb0fd4b,
+    0xd63800a9, 0x9bd000ce, 0x1658fd2c, 0xb761fa96, 0x3ae90774,
+    0x77010713, 0xfa89faf1, 0xecd107dd, 0x6159fa3f, 0x2cb1fa58,
+    0xa13907ba, 0xb5b2f36d, 0x383a0e8f, 0x75d20ee8, 0xf85af30a,
+    0xee020e26, 0x638af3c4, 0x2e62f3a3, 0xa3ea0e41, 0x02d309fb,
+    0x8f5bf419, 0xc2b3f47e, 0x4f3b099c, 0x5963f4b0, 0xd4eb0952,
+    0x99030935, 0x148bf4d7, 0xb014e09b, 0x3d9c1d79, 0x70741d1e,
+    0xfdfce0fc, 0xeba41dd0, 0x662ce032, 0x2bc4e055, 0xa64c1db7,
+    0x07751a0d, 0x8afde7ef, 0xc715e788, 0x4a9d1a6a, 0x5cc5e746,
+    0xd14d1aa4, 0x9ca51ac3, 0x112de721, 0x05a613f6, 0x882eee14,
+    0xc5c6ee73, 0x484e1391, 0x5e16eebd, 0xd39e135f, 0x9e761338,
+    0x13feeeda, 0xb2c7e960, 0x3f4f1482, 0x72a714e5, 0xff2fe907,
+    0xe977142b, 0x64ffe9c9, 0x2917e9ae, 0xa49f144c, 0xbb58c777,
+    0x36d03a95, 0x7b383af2, 0xf6b0c710, 0xe0e83a3c, 0x6d60c7de,
+    0x2088c7b9, 0xad003a5b, 0x0c393de1, 0x81b1c003, 0xcc59c064,
+    0x41d13d86, 0x5789c0aa, 0xda013d48, 0x97e93d2f, 0x1a61c0cd,
+    0x0eea341a, 0x8362c9f8, 0xce8ac99f, 0x4302347d, 0x555ac951,
+    0xd8d234b3, 0x953a34d4, 0x18b2c936, 0xb98bce8c, 0x3403336e,
+    0x79eb3309, 0xf463ceeb, 0xe23b33c7, 0x6fb3ce25, 0x225bce42,
+    0xafd333a0, 0x0b4c27ec, 0x86c4da0e, 0xcb2cda69, 0x46a4278b,
+    0x50fcdaa7, 0xdd742745, 0x909c2722, 0x1d14dac0, 0xbc2ddd7a,
+    0x31a52098, 0x7c4d20ff, 0xf1c5dd1d, 0xe79d2031, 0x6a15ddd3,
+    0x27fdddb4, 0xaa752056, 0xbefed481, 0x33762963, 0x7e9e2904,
+    0xf316d4e6, 0xe54e29ca, 0x68c6d428, 0x252ed44f, 0xa8a629ad,
+    0x099f2e17, 0x8417d3f5, 0xc9ffd392, 0x44772e70, 0x522fd35c,
+    0xdfa72ebe, 0x924f2ed9, 0x1fc7d33b, 0xadc088af, 0x2048754d,
+    0x6da0752a, 0xe02888c8, 0xf67075e4, 0x7bf88806, 0x36108861,
+    0xbb987583, 0x1aa17239, 0x97298fdb, 0xdac18fbc, 0x5749725e,
+    0x41118f72, 0xcc997290, 0x817172f7, 0x0cf98f15, 0x18727bc2,
+    0x95fa8620, 0xd8128647, 0x559a7ba5, 0x43c28689, 0xce4a7b6b,
+    0x83a27b0c, 0x0e2a86ee, 0xaf138154, 0x229b7cb6, 0x6f737cd1,
+    0xe2fb8133, 0xf4a37c1f, 0x792b81fd, 0x34c3819a, 0xb94b7c78,
+    0x1dd46834, 0x905c95d6, 0xddb495b1, 0x503c6853, 0x4664957f,
+    0xcbec689d, 0x860468fa, 0x0b8c9518, 0xaab592a2, 0x273d6f40,
+    0x6ad56f27, 0xe75d92c5, 0xf1056fe9, 0x7c8d920b, 0x3165926c,
+    0xbced6f8e, 0xa8669b59, 0x25ee66bb, 0x680666dc, 0xe58e9b3e,
+    0xf3d66612, 0x7e5e9bf0, 0x33b69b97, 0xbe3e6675, 0x1f0761cf,
+    0x928f9c2d, 0xdf679c4a, 0x52ef61a8, 0x44b79c84, 0xc93f6166,
+    0x84d76101, 0x095f9ce3, 0x16984fd8, 0x9b10b23a, 0xd6f8b25d,
+    0x5b704fbf, 0x4d28b293, 0xc0a04f71, 0x8d484f16, 0x00c0b2f4,
+    0xa1f9b54e, 0x2c7148ac, 0x619948cb, 0xec11b529, 0xfa494805,
+    0x77c1b5e7, 0x3a29b580, 0xb7a14862, 0xa32abcb5, 0x2ea24157,
+    0x634a4130, 0xeec2bcd2, 0xf89a41fe, 0x7512bc1c, 0x38fabc7b,
+    0xb5724199, 0x144b4623, 0x99c3bbc1, 0xd42bbba6, 0x59a34644,
+    0x4ffbbb68, 0xc273468a, 0x8f9b46ed, 0x0213bb0f, 0xa68caf43,
+    0x2b0452a1, 0x66ec52c6, 0xeb64af24, 0xfd3c5208, 0x70b4afea,
+    0x3d5caf8d, 0xb0d4526f, 0x11ed55d5, 0x9c65a837, 0xd18da850,
+    0x5c0555b2, 0x4a5da89e, 0xc7d5557c, 0x8a3d551b, 0x07b5a8f9,
+    0x133e5c2e, 0x9eb6a1cc, 0xd35ea1ab, 0x5ed65c49, 0x488ea165,
+    0xc5065c87, 0x88ee5ce0, 0x0566a102, 0xa45fa6b8, 0x29d75b5a,
+    0x643f5b3d, 0xe9b7a6df, 0xffef5bf3, 0x7267a611, 0x3f8fa676,
+    0xb2075b94},
+   {0x00000000, 0x80f0171f, 0xda91287f, 0x5a613f60, 0x6e5356bf,
+    0xeea341a0, 0xb4c27ec0, 0x343269df, 0xdca6ad7e, 0x5c56ba61,
+    0x06378501, 0x86c7921e, 0xb2f5fbc1, 0x3205ecde, 0x6864d3be,
+    0xe894c4a1, 0x623c5cbd, 0xe2cc4ba2, 0xb8ad74c2, 0x385d63dd,
+    0x0c6f0a02, 0x8c9f1d1d, 0xd6fe227d, 0x560e3562, 0xbe9af1c3,
+    0x3e6ae6dc, 0x640bd9bc, 0xe4fbcea3, 0xd0c9a77c, 0x5039b063,
+    0x0a588f03, 0x8aa8981c, 0xc478b97a, 0x4488ae65, 0x1ee99105,
+    0x9e19861a, 0xaa2befc5, 0x2adbf8da, 0x70bac7ba, 0xf04ad0a5,
+    0x18de1404, 0x982e031b, 0xc24f3c7b, 0x42bf2b64, 0x768d42bb,
+    0xf67d55a4, 0xac1c6ac4, 0x2cec7ddb, 0xa644e5c7, 0x26b4f2d8,
+    0x7cd5cdb8, 0xfc25daa7, 0xc817b378, 0x48e7a467, 0x12869b07,
+    0x92768c18, 0x7ae248b9, 0xfa125fa6, 0xa07360c6, 0x208377d9,
+    0x14b11e06, 0x94410919, 0xce203679, 0x4ed02166, 0x538074b5,
+    0xd37063aa, 0x89115cca, 0x09e14bd5, 0x3dd3220a, 0xbd233515,
+    0xe7420a75, 0x67b21d6a, 0x8f26d9cb, 0x0fd6ced4, 0x55b7f1b4,
+    0xd547e6ab, 0xe1758f74, 0x6185986b, 0x3be4a70b, 0xbb14b014,
+    0x31bc2808, 0xb14c3f17, 0xeb2d0077, 0x6bdd1768, 0x5fef7eb7,
+    0xdf1f69a8, 0x857e56c8, 0x058e41d7, 0xed1a8576, 0x6dea9269,
+    0x378bad09, 0xb77bba16, 0x8349d3c9, 0x03b9c4d6, 0x59d8fbb6,
+    0xd928eca9, 0x97f8cdcf, 0x1708dad0, 0x4d69e5b0, 0xcd99f2af,
+    0xf9ab9b70, 0x795b8c6f, 0x233ab30f, 0xa3caa410, 0x4b5e60b1,
+    0xcbae77ae, 0x91cf48ce, 0x113f5fd1, 0x250d360e, 0xa5fd2111,
+    0xff9c1e71, 0x7f6c096e, 0xf5c49172, 0x7534866d, 0x2f55b90d,
+    0xafa5ae12, 0x9b97c7cd, 0x1b67d0d2, 0x4106efb2, 0xc1f6f8ad,
+    0x29623c0c, 0xa9922b13, 0xf3f31473, 0x7303036c, 0x47316ab3,
+    0xc7c17dac, 0x9da042cc, 0x1d5055d3, 0xa700e96a, 0x27f0fe75,
+    0x7d91c115, 0xfd61d60a, 0xc953bfd5, 0x49a3a8ca, 0x13c297aa,
+    0x933280b5, 0x7ba64414, 0xfb56530b, 0xa1376c6b, 0x21c77b74,
+    0x15f512ab, 0x950505b4, 0xcf643ad4, 0x4f942dcb, 0xc53cb5d7,
+    0x45cca2c8, 0x1fad9da8, 0x9f5d8ab7, 0xab6fe368, 0x2b9ff477,
+    0x71fecb17, 0xf10edc08, 0x199a18a9, 0x996a0fb6, 0xc30b30d6,
+    0x43fb27c9, 0x77c94e16, 0xf7395909, 0xad586669, 0x2da87176,
+    0x63785010, 0xe388470f, 0xb9e9786f, 0x39196f70, 0x0d2b06af,
+    0x8ddb11b0, 0xd7ba2ed0, 0x574a39cf, 0xbfdefd6e, 0x3f2eea71,
+    0x654fd511, 0xe5bfc20e, 0xd18dabd1, 0x517dbcce, 0x0b1c83ae,
+    0x8bec94b1, 0x01440cad, 0x81b41bb2, 0xdbd524d2, 0x5b2533cd,
+    0x6f175a12, 0xefe74d0d, 0xb586726d, 0x35766572, 0xdde2a1d3,
+    0x5d12b6cc, 0x077389ac, 0x87839eb3, 0xb3b1f76c, 0x3341e073,
+    0x6920df13, 0xe9d0c80c, 0xf4809ddf, 0x74708ac0, 0x2e11b5a0,
+    0xaee1a2bf, 0x9ad3cb60, 0x1a23dc7f, 0x4042e31f, 0xc0b2f400,
+    0x282630a1, 0xa8d627be, 0xf2b718de, 0x72470fc1, 0x4675661e,
+    0xc6857101, 0x9ce44e61, 0x1c14597e, 0x96bcc162, 0x164cd67d,
+    0x4c2de91d, 0xccddfe02, 0xf8ef97dd, 0x781f80c2, 0x227ebfa2,
+    0xa28ea8bd, 0x4a1a6c1c, 0xcaea7b03, 0x908b4463, 0x107b537c,
+    0x24493aa3, 0xa4b92dbc, 0xfed812dc, 0x7e2805c3, 0x30f824a5,
+    0xb00833ba, 0xea690cda, 0x6a991bc5, 0x5eab721a, 0xde5b6505,
+    0x843a5a65, 0x04ca4d7a, 0xec5e89db, 0x6cae9ec4, 0x36cfa1a4,
+    0xb63fb6bb, 0x820ddf64, 0x02fdc87b, 0x589cf71b, 0xd86ce004,
+    0x52c47818, 0xd2346f07, 0x88555067, 0x08a54778, 0x3c972ea7,
+    0xbc6739b8, 0xe60606d8, 0x66f611c7, 0x8e62d566, 0x0e92c279,
+    0x54f3fd19, 0xd403ea06, 0xe03183d9, 0x60c194c6, 0x3aa0aba6,
+    0xba50bcb9},
+   {0x00000000, 0x9570d495, 0xf190af6b, 0x64e07bfe, 0x38505897,
+    0xad208c02, 0xc9c0f7fc, 0x5cb02369, 0x70a0b12e, 0xe5d065bb,
+    0x81301e45, 0x1440cad0, 0x48f0e9b9, 0xdd803d2c, 0xb96046d2,
+    0x2c109247, 0xe141625c, 0x7431b6c9, 0x10d1cd37, 0x85a119a2,
+    0xd9113acb, 0x4c61ee5e, 0x288195a0, 0xbdf14135, 0x91e1d372,
+    0x049107e7, 0x60717c19, 0xf501a88c, 0xa9b18be5, 0x3cc15f70,
+    0x5821248e, 0xcd51f01b, 0x19f3c2f9, 0x8c83166c, 0xe8636d92,
+    0x7d13b907, 0x21a39a6e, 0xb4d34efb, 0xd0333505, 0x4543e190,
+    0x695373d7, 0xfc23a742, 0x98c3dcbc, 0x0db30829, 0x51032b40,
+    0xc473ffd5, 0xa093842b, 0x35e350be, 0xf8b2a0a5, 0x6dc27430,
+    0x09220fce, 0x9c52db5b, 0xc0e2f832, 0x55922ca7, 0x31725759,
+    0xa40283cc, 0x8812118b, 0x1d62c51e, 0x7982bee0, 0xecf26a75,
+    0xb042491c, 0x25329d89, 0x41d2e677, 0xd4a232e2, 0x33e785f2,
+    0xa6975167, 0xc2772a99, 0x5707fe0c, 0x0bb7dd65, 0x9ec709f0,
+    0xfa27720e, 0x6f57a69b, 0x434734dc, 0xd637e049, 0xb2d79bb7,
+    0x27a74f22, 0x7b176c4b, 0xee67b8de, 0x8a87c320, 0x1ff717b5,
+    0xd2a6e7ae, 0x47d6333b, 0x233648c5, 0xb6469c50, 0xeaf6bf39,
+    0x7f866bac, 0x1b661052, 0x8e16c4c7, 0xa2065680, 0x37768215,
+    0x5396f9eb, 0xc6e62d7e, 0x9a560e17, 0x0f26da82, 0x6bc6a17c,
+    0xfeb675e9, 0x2a14470b, 0xbf64939e, 0xdb84e860, 0x4ef43cf5,
+    0x12441f9c, 0x8734cb09, 0xe3d4b0f7, 0x76a46462, 0x5ab4f625,
+    0xcfc422b0, 0xab24594e, 0x3e548ddb, 0x62e4aeb2, 0xf7947a27,
+    0x937401d9, 0x0604d54c, 0xcb552557, 0x5e25f1c2, 0x3ac58a3c,
+    0xafb55ea9, 0xf3057dc0, 0x6675a955, 0x0295d2ab, 0x97e5063e,
+    0xbbf59479, 0x2e8540ec, 0x4a653b12, 0xdf15ef87, 0x83a5ccee,
+    0x16d5187b, 0x72356385, 0xe745b710, 0x67cf0be4, 0xf2bfdf71,
+    0x965fa48f, 0x032f701a, 0x5f9f5373, 0xcaef87e6, 0xae0ffc18,
+    0x3b7f288d, 0x176fbaca, 0x821f6e5f, 0xe6ff15a1, 0x738fc134,
+    0x2f3fe25d, 0xba4f36c8, 0xdeaf4d36, 0x4bdf99a3, 0x868e69b8,
+    0x13febd2d, 0x771ec6d3, 0xe26e1246, 0xbede312f, 0x2baee5ba,
+    0x4f4e9e44, 0xda3e4ad1, 0xf62ed896, 0x635e0c03, 0x07be77fd,
+    0x92cea368, 0xce7e8001, 0x5b0e5494, 0x3fee2f6a, 0xaa9efbff,
+    0x7e3cc91d, 0xeb4c1d88, 0x8fac6676, 0x1adcb2e3, 0x466c918a,
+    0xd31c451f, 0xb7fc3ee1, 0x228cea74, 0x0e9c7833, 0x9becaca6,
+    0xff0cd758, 0x6a7c03cd, 0x36cc20a4, 0xa3bcf431, 0xc75c8fcf,
+    0x522c5b5a, 0x9f7dab41, 0x0a0d7fd4, 0x6eed042a, 0xfb9dd0bf,
+    0xa72df3d6, 0x325d2743, 0x56bd5cbd, 0xc3cd8828, 0xefdd1a6f,
+    0x7aadcefa, 0x1e4db504, 0x8b3d6191, 0xd78d42f8, 0x42fd966d,
+    0x261ded93, 0xb36d3906, 0x54288e16, 0xc1585a83, 0xa5b8217d,
+    0x30c8f5e8, 0x6c78d681, 0xf9080214, 0x9de879ea, 0x0898ad7f,
+    0x24883f38, 0xb1f8ebad, 0xd5189053, 0x406844c6, 0x1cd867af,
+    0x89a8b33a, 0xed48c8c4, 0x78381c51, 0xb569ec4a, 0x201938df,
+    0x44f94321, 0xd18997b4, 0x8d39b4dd, 0x18496048, 0x7ca91bb6,
+    0xe9d9cf23, 0xc5c95d64, 0x50b989f1, 0x3459f20f, 0xa129269a,
+    0xfd9905f3, 0x68e9d166, 0x0c09aa98, 0x99797e0d, 0x4ddb4cef,
+    0xd8ab987a, 0xbc4be384, 0x293b3711, 0x758b1478, 0xe0fbc0ed,
+    0x841bbb13, 0x116b6f86, 0x3d7bfdc1, 0xa80b2954, 0xcceb52aa,
+    0x599b863f, 0x052ba556, 0x905b71c3, 0xf4bb0a3d, 0x61cbdea8,
+    0xac9a2eb3, 0x39eafa26, 0x5d0a81d8, 0xc87a554d, 0x94ca7624,
+    0x01baa2b1, 0x655ad94f, 0xf02a0dda, 0xdc3a9f9d, 0x494a4b08,
+    0x2daa30f6, 0xb8dae463, 0xe46ac70a, 0x711a139f, 0x15fa6861,
+    0x808abcf4},
+   {0x00000000, 0xcf9e17c8, 0x444d29d1, 0x8bd33e19, 0x889a53a2,
+    0x4704446a, 0xccd77a73, 0x03496dbb, 0xca45a105, 0x05dbb6cd,
+    0x8e0888d4, 0x41969f1c, 0x42dff2a7, 0x8d41e56f, 0x0692db76,
+    0xc90cccbe, 0x4ffa444b, 0x80645383, 0x0bb76d9a, 0xc4297a52,
+    0xc76017e9, 0x08fe0021, 0x832d3e38, 0x4cb329f0, 0x85bfe54e,
+    0x4a21f286, 0xc1f2cc9f, 0x0e6cdb57, 0x0d25b6ec, 0xc2bba124,
+    0x49689f3d, 0x86f688f5, 0x9ff48896, 0x506a9f5e, 0xdbb9a147,
+    0x1427b68f, 0x176edb34, 0xd8f0ccfc, 0x5323f2e5, 0x9cbde52d,
+    0x55b12993, 0x9a2f3e5b, 0x11fc0042, 0xde62178a, 0xdd2b7a31,
+    0x12b56df9, 0x996653e0, 0x56f84428, 0xd00eccdd, 0x1f90db15,
+    0x9443e50c, 0x5bddf2c4, 0x58949f7f, 0x970a88b7, 0x1cd9b6ae,
+    0xd347a166, 0x1a4b6dd8, 0xd5d57a10, 0x5e064409, 0x919853c1,
+    0x92d13e7a, 0x5d4f29b2, 0xd69c17ab, 0x19020063, 0xe498176d,
+    0x2b0600a5, 0xa0d53ebc, 0x6f4b2974, 0x6c0244cf, 0xa39c5307,
+    0x284f6d1e, 0xe7d17ad6, 0x2eddb668, 0xe143a1a0, 0x6a909fb9,
+    0xa50e8871, 0xa647e5ca, 0x69d9f202, 0xe20acc1b, 0x2d94dbd3,
+    0xab625326, 0x64fc44ee, 0xef2f7af7, 0x20b16d3f, 0x23f80084,
+    0xec66174c, 0x67b52955, 0xa82b3e9d, 0x6127f223, 0xaeb9e5eb,
+    0x256adbf2, 0xeaf4cc3a, 0xe9bda181, 0x2623b649, 0xadf08850,
+    0x626e9f98, 0x7b6c9ffb, 0xb4f28833, 0x3f21b62a, 0xf0bfa1e2,
+    0xf3f6cc59, 0x3c68db91, 0xb7bbe588, 0x7825f240, 0xb1293efe,
+    0x7eb72936, 0xf564172f, 0x3afa00e7, 0x39b36d5c, 0xf62d7a94,
+    0x7dfe448d, 0xb2605345, 0x3496dbb0, 0xfb08cc78, 0x70dbf261,
+    0xbf45e5a9, 0xbc0c8812, 0x73929fda, 0xf841a1c3, 0x37dfb60b,
+    0xfed37ab5, 0x314d6d7d, 0xba9e5364, 0x750044ac, 0x76492917,
+    0xb9d73edf, 0x320400c6, 0xfd9a170e, 0x1241289b, 0xdddf3f53,
+    0x560c014a, 0x99921682, 0x9adb7b39, 0x55456cf1, 0xde9652e8,
+    0x11084520, 0xd804899e, 0x179a9e56, 0x9c49a04f, 0x53d7b787,
+    0x509eda3c, 0x9f00cdf4, 0x14d3f3ed, 0xdb4de425, 0x5dbb6cd0,
+    0x92257b18, 0x19f64501, 0xd66852c9, 0xd5213f72, 0x1abf28ba,
+    0x916c16a3, 0x5ef2016b, 0x97fecdd5, 0x5860da1d, 0xd3b3e404,
+    0x1c2df3cc, 0x1f649e77, 0xd0fa89bf, 0x5b29b7a6, 0x94b7a06e,
+    0x8db5a00d, 0x422bb7c5, 0xc9f889dc, 0x06669e14, 0x052ff3af,
+    0xcab1e467, 0x4162da7e, 0x8efccdb6, 0x47f00108, 0x886e16c0,
+    0x03bd28d9, 0xcc233f11, 0xcf6a52aa, 0x00f44562, 0x8b277b7b,
+    0x44b96cb3, 0xc24fe446, 0x0dd1f38e, 0x8602cd97, 0x499cda5f,
+    0x4ad5b7e4, 0x854ba02c, 0x0e989e35, 0xc10689fd, 0x080a4543,
+    0xc794528b, 0x4c476c92, 0x83d97b5a, 0x809016e1, 0x4f0e0129,
+    0xc4dd3f30, 0x0b4328f8, 0xf6d93ff6, 0x3947283e, 0xb2941627,
+    0x7d0a01ef, 0x7e436c54, 0xb1dd7b9c, 0x3a0e4585, 0xf590524d,
+    0x3c9c9ef3, 0xf302893b, 0x78d1b722, 0xb74fa0ea, 0xb406cd51,
+    0x7b98da99, 0xf04be480, 0x3fd5f348, 0xb9237bbd, 0x76bd6c75,
+    0xfd6e526c, 0x32f045a4, 0x31b9281f, 0xfe273fd7, 0x75f401ce,
+    0xba6a1606, 0x7366dab8, 0xbcf8cd70, 0x372bf369, 0xf8b5e4a1,
+    0xfbfc891a, 0x34629ed2, 0xbfb1a0cb, 0x702fb703, 0x692db760,
+    0xa6b3a0a8, 0x2d609eb1, 0xe2fe8979, 0xe1b7e4c2, 0x2e29f30a,
+    0xa5facd13, 0x6a64dadb, 0xa3681665, 0x6cf601ad, 0xe7253fb4,
+    0x28bb287c, 0x2bf245c7, 0xe46c520f, 0x6fbf6c16, 0xa0217bde,
+    0x26d7f32b, 0xe949e4e3, 0x629adafa, 0xad04cd32, 0xae4da089,
+    0x61d3b741, 0xea008958, 0x259e9e90, 0xec92522e, 0x230c45e6,
+    0xa8df7bff, 0x67416c37, 0x6408018c, 0xab961644, 0x2045285d,
+    0xefdb3f95},
+   {0x00000000, 0x24825136, 0x4904a26c, 0x6d86f35a, 0x920944d8,
+    0xb68b15ee, 0xdb0de6b4, 0xff8fb782, 0xff638ff1, 0xdbe1dec7,
+    0xb6672d9d, 0x92e57cab, 0x6d6acb29, 0x49e89a1f, 0x246e6945,
+    0x00ec3873, 0x25b619a3, 0x01344895, 0x6cb2bbcf, 0x4830eaf9,
+    0xb7bf5d7b, 0x933d0c4d, 0xfebbff17, 0xda39ae21, 0xdad59652,
+    0xfe57c764, 0x93d1343e, 0xb7536508, 0x48dcd28a, 0x6c5e83bc,
+    0x01d870e6, 0x255a21d0, 0x4b6c3346, 0x6fee6270, 0x0268912a,
+    0x26eac01c, 0xd965779e, 0xfde726a8, 0x9061d5f2, 0xb4e384c4,
+    0xb40fbcb7, 0x908ded81, 0xfd0b1edb, 0xd9894fed, 0x2606f86f,
+    0x0284a959, 0x6f025a03, 0x4b800b35, 0x6eda2ae5, 0x4a587bd3,
+    0x27de8889, 0x035cd9bf, 0xfcd36e3d, 0xd8513f0b, 0xb5d7cc51,
+    0x91559d67, 0x91b9a514, 0xb53bf422, 0xd8bd0778, 0xfc3f564e,
+    0x03b0e1cc, 0x2732b0fa, 0x4ab443a0, 0x6e361296, 0x96d8668c,
+    0xb25a37ba, 0xdfdcc4e0, 0xfb5e95d6, 0x04d12254, 0x20537362,
+    0x4dd58038, 0x6957d10e, 0x69bbe97d, 0x4d39b84b, 0x20bf4b11,
+    0x043d1a27, 0xfbb2ada5, 0xdf30fc93, 0xb2b60fc9, 0x96345eff,
+    0xb36e7f2f, 0x97ec2e19, 0xfa6add43, 0xdee88c75, 0x21673bf7,
+    0x05e56ac1, 0x6863999b, 0x4ce1c8ad, 0x4c0df0de, 0x688fa1e8,
+    0x050952b2, 0x218b0384, 0xde04b406, 0xfa86e530, 0x9700166a,
+    0xb382475c, 0xddb455ca, 0xf93604fc, 0x94b0f7a6, 0xb032a690,
+    0x4fbd1112, 0x6b3f4024, 0x06b9b37e, 0x223be248, 0x22d7da3b,
+    0x06558b0d, 0x6bd37857, 0x4f512961, 0xb0de9ee3, 0x945ccfd5,
+    0xf9da3c8f, 0xdd586db9, 0xf8024c69, 0xdc801d5f, 0xb106ee05,
+    0x9584bf33, 0x6a0b08b1, 0x4e895987, 0x230faadd, 0x078dfbeb,
+    0x0761c398, 0x23e392ae, 0x4e6561f4, 0x6ae730c2, 0x95688740,
+    0xb1ead676, 0xdc6c252c, 0xf8ee741a, 0xf6c1cb59, 0xd2439a6f,
+    0xbfc56935, 0x9b473803, 0x64c88f81, 0x404adeb7, 0x2dcc2ded,
+    0x094e7cdb, 0x09a244a8, 0x2d20159e, 0x40a6e6c4, 0x6424b7f2,
+    0x9bab0070, 0xbf295146, 0xd2afa21c, 0xf62df32a, 0xd377d2fa,
+    0xf7f583cc, 0x9a737096, 0xbef121a0, 0x417e9622, 0x65fcc714,
+    0x087a344e, 0x2cf86578, 0x2c145d0b, 0x08960c3d, 0x6510ff67,
+    0x4192ae51, 0xbe1d19d3, 0x9a9f48e5, 0xf719bbbf, 0xd39bea89,
+    0xbdadf81f, 0x992fa929, 0xf4a95a73, 0xd02b0b45, 0x2fa4bcc7,
+    0x0b26edf1, 0x66a01eab, 0x42224f9d, 0x42ce77ee, 0x664c26d8,
+    0x0bcad582, 0x2f4884b4, 0xd0c73336, 0xf4456200, 0x99c3915a,
+    0xbd41c06c, 0x981be1bc, 0xbc99b08a, 0xd11f43d0, 0xf59d12e6,
+    0x0a12a564, 0x2e90f452, 0x43160708, 0x6794563e, 0x67786e4d,
+    0x43fa3f7b, 0x2e7ccc21, 0x0afe9d17, 0xf5712a95, 0xd1f37ba3,
+    0xbc7588f9, 0x98f7d9cf, 0x6019add5, 0x449bfce3, 0x291d0fb9,
+    0x0d9f5e8f, 0xf210e90d, 0xd692b83b, 0xbb144b61, 0x9f961a57,
+    0x9f7a2224, 0xbbf87312, 0xd67e8048, 0xf2fcd17e, 0x0d7366fc,
+    0x29f137ca, 0x4477c490, 0x60f595a6, 0x45afb476, 0x612de540,
+    0x0cab161a, 0x2829472c, 0xd7a6f0ae, 0xf324a198, 0x9ea252c2,
+    0xba2003f4, 0xbacc3b87, 0x9e4e6ab1, 0xf3c899eb, 0xd74ac8dd,
+    0x28c57f5f, 0x0c472e69, 0x61c1dd33, 0x45438c05, 0x2b759e93,
+    0x0ff7cfa5, 0x62713cff, 0x46f36dc9, 0xb97cda4b, 0x9dfe8b7d,
+    0xf0787827, 0xd4fa2911, 0xd4161162, 0xf0944054, 0x9d12b30e,
+    0xb990e238, 0x461f55ba, 0x629d048c, 0x0f1bf7d6, 0x2b99a6e0,
+    0x0ec38730, 0x2a41d606, 0x47c7255c, 0x6345746a, 0x9ccac3e8,
+    0xb84892de, 0xd5ce6184, 0xf14c30b2, 0xf1a008c1, 0xd52259f7,
+    0xb8a4aaad, 0x9c26fb9b, 0x63a94c19, 0x472b1d2f, 0x2aadee75,
+    0x0e2fbf43},
+   {0x00000000, 0x36f290f3, 0x6de521e6, 0x5b17b115, 0xdbca43cc,
+    0xed38d33f, 0xb62f622a, 0x80ddf2d9, 0x6ce581d9, 0x5a17112a,
+    0x0100a03f, 0x37f230cc, 0xb72fc215, 0x81dd52e6, 0xdacae3f3,
+    0xec387300, 0xd9cb03b2, 0xef399341, 0xb42e2254, 0x82dcb2a7,
+    0x0201407e, 0x34f3d08d, 0x6fe46198, 0x5916f16b, 0xb52e826b,
+    0x83dc1298, 0xd8cba38d, 0xee39337e, 0x6ee4c1a7, 0x58165154,
+    0x0301e041, 0x35f370b2, 0x68e70125, 0x5e1591d6, 0x050220c3,
+    0x33f0b030, 0xb32d42e9, 0x85dfd21a, 0xdec8630f, 0xe83af3fc,
+    0x040280fc, 0x32f0100f, 0x69e7a11a, 0x5f1531e9, 0xdfc8c330,
+    0xe93a53c3, 0xb22de2d6, 0x84df7225, 0xb12c0297, 0x87de9264,
+    0xdcc92371, 0xea3bb382, 0x6ae6415b, 0x5c14d1a8, 0x070360bd,
+    0x31f1f04e, 0xddc9834e, 0xeb3b13bd, 0xb02ca2a8, 0x86de325b,
+    0x0603c082, 0x30f15071, 0x6be6e164, 0x5d147197, 0xd1ce024a,
+    0xe73c92b9, 0xbc2b23ac, 0x8ad9b35f, 0x0a044186, 0x3cf6d175,
+    0x67e16060, 0x5113f093, 0xbd2b8393, 0x8bd91360, 0xd0cea275,
+    0xe63c3286, 0x66e1c05f, 0x501350ac, 0x0b04e1b9, 0x3df6714a,
+    0x080501f8, 0x3ef7910b, 0x65e0201e, 0x5312b0ed, 0xd3cf4234,
+    0xe53dd2c7, 0xbe2a63d2, 0x88d8f321, 0x64e08021, 0x521210d2,
+    0x0905a1c7, 0x3ff73134, 0xbf2ac3ed, 0x89d8531e, 0xd2cfe20b,
+    0xe43d72f8, 0xb929036f, 0x8fdb939c, 0xd4cc2289, 0xe23eb27a,
+    0x62e340a3, 0x5411d050, 0x0f066145, 0x39f4f1b6, 0xd5cc82b6,
+    0xe33e1245, 0xb829a350, 0x8edb33a3, 0x0e06c17a, 0x38f45189,
+    0x63e3e09c, 0x5511706f, 0x60e200dd, 0x5610902e, 0x0d07213b,
+    0x3bf5b1c8, 0xbb284311, 0x8ddad3e2, 0xd6cd62f7, 0xe03ff204,
+    0x0c078104, 0x3af511f7, 0x61e2a0e2, 0x57103011, 0xd7cdc2c8,
+    0xe13f523b, 0xba28e32e, 0x8cda73dd, 0x78ed02d5, 0x4e1f9226,
+    0x15082333, 0x23fab3c0, 0xa3274119, 0x95d5d1ea, 0xcec260ff,
+    0xf830f00c, 0x1408830c, 0x22fa13ff, 0x79eda2ea, 0x4f1f3219,
+    0xcfc2c0c0, 0xf9305033, 0xa227e126, 0x94d571d5, 0xa1260167,
+    0x97d49194, 0xccc32081, 0xfa31b072, 0x7aec42ab, 0x4c1ed258,
+    0x1709634d, 0x21fbf3be, 0xcdc380be, 0xfb31104d, 0xa026a158,
+    0x96d431ab, 0x1609c372, 0x20fb5381, 0x7bece294, 0x4d1e7267,
+    0x100a03f0, 0x26f89303, 0x7def2216, 0x4b1db2e5, 0xcbc0403c,
+    0xfd32d0cf, 0xa62561da, 0x90d7f129, 0x7cef8229, 0x4a1d12da,
+    0x110aa3cf, 0x27f8333c, 0xa725c1e5, 0x91d75116, 0xcac0e003,
+    0xfc3270f0, 0xc9c10042, 0xff3390b1, 0xa42421a4, 0x92d6b157,
+    0x120b438e, 0x24f9d37d, 0x7fee6268, 0x491cf29b, 0xa524819b,
+    0x93d61168, 0xc8c1a07d, 0xfe33308e, 0x7eeec257, 0x481c52a4,
+    0x130be3b1, 0x25f97342, 0xa923009f, 0x9fd1906c, 0xc4c62179,
+    0xf234b18a, 0x72e94353, 0x441bd3a0, 0x1f0c62b5, 0x29fef246,
+    0xc5c68146, 0xf33411b5, 0xa823a0a0, 0x9ed13053, 0x1e0cc28a,
+    0x28fe5279, 0x73e9e36c, 0x451b739f, 0x70e8032d, 0x461a93de,
+    0x1d0d22cb, 0x2bffb238, 0xab2240e1, 0x9dd0d012, 0xc6c76107,
+    0xf035f1f4, 0x1c0d82f4, 0x2aff1207, 0x71e8a312, 0x471a33e1,
+    0xc7c7c138, 0xf13551cb, 0xaa22e0de, 0x9cd0702d, 0xc1c401ba,
+    0xf7369149, 0xac21205c, 0x9ad3b0af, 0x1a0e4276, 0x2cfcd285,
+    0x77eb6390, 0x4119f363, 0xad218063, 0x9bd31090, 0xc0c4a185,
+    0xf6363176, 0x76ebc3af, 0x4019535c, 0x1b0ee249, 0x2dfc72ba,
+    0x180f0208, 0x2efd92fb, 0x75ea23ee, 0x4318b31d, 0xc3c541c4,
+    0xf537d137, 0xae206022, 0x98d2f0d1, 0x74ea83d1, 0x42181322,
+    0x190fa237, 0x2ffd32c4, 0xaf20c01d, 0x99d250ee, 0xc2c5e1fb,
+    0xf4377108}};
+
+static const z_word_t crc_braid_big_table[][256] = {
+   {0x0000000000000000, 0xf390f23600000000, 0xe621e56d00000000,
+    0x15b1175b00000000, 0xcc43cadb00000000, 0x3fd338ed00000000,
+    0x2a622fb600000000, 0xd9f2dd8000000000, 0xd981e56c00000000,
+    0x2a11175a00000000, 0x3fa0000100000000, 0xcc30f23700000000,
+    0x15c22fb700000000, 0xe652dd8100000000, 0xf3e3cada00000000,
+    0x007338ec00000000, 0xb203cbd900000000, 0x419339ef00000000,
+    0x54222eb400000000, 0xa7b2dc8200000000, 0x7e40010200000000,
+    0x8dd0f33400000000, 0x9861e46f00000000, 0x6bf1165900000000,
+    0x6b822eb500000000, 0x9812dc8300000000, 0x8da3cbd800000000,
+    0x7e3339ee00000000, 0xa7c1e46e00000000, 0x5451165800000000,
+    0x41e0010300000000, 0xb270f33500000000, 0x2501e76800000000,
+    0xd691155e00000000, 0xc320020500000000, 0x30b0f03300000000,
+    0xe9422db300000000, 0x1ad2df8500000000, 0x0f63c8de00000000,
+    0xfcf33ae800000000, 0xfc80020400000000, 0x0f10f03200000000,
+    0x1aa1e76900000000, 0xe931155f00000000, 0x30c3c8df00000000,
+    0xc3533ae900000000, 0xd6e22db200000000, 0x2572df8400000000,
+    0x97022cb100000000, 0x6492de8700000000, 0x7123c9dc00000000,
+    0x82b33bea00000000, 0x5b41e66a00000000, 0xa8d1145c00000000,
+    0xbd60030700000000, 0x4ef0f13100000000, 0x4e83c9dd00000000,
+    0xbd133beb00000000, 0xa8a22cb000000000, 0x5b32de8600000000,
+    0x82c0030600000000, 0x7150f13000000000, 0x64e1e66b00000000,
+    0x9771145d00000000, 0x4a02ced100000000, 0xb9923ce700000000,
+    0xac232bbc00000000, 0x5fb3d98a00000000, 0x8641040a00000000,
+    0x75d1f63c00000000, 0x6060e16700000000, 0x93f0135100000000,
+    0x93832bbd00000000, 0x6013d98b00000000, 0x75a2ced000000000,
+    0x86323ce600000000, 0x5fc0e16600000000, 0xac50135000000000,
+    0xb9e1040b00000000, 0x4a71f63d00000000, 0xf801050800000000,
+    0x0b91f73e00000000, 0x1e20e06500000000, 0xedb0125300000000,
+    0x3442cfd300000000, 0xc7d23de500000000, 0xd2632abe00000000,
+    0x21f3d88800000000, 0x2180e06400000000, 0xd210125200000000,
+    0xc7a1050900000000, 0x3431f73f00000000, 0xedc32abf00000000,
+    0x1e53d88900000000, 0x0be2cfd200000000, 0xf8723de400000000,
+    0x6f0329b900000000, 0x9c93db8f00000000, 0x8922ccd400000000,
+    0x7ab23ee200000000, 0xa340e36200000000, 0x50d0115400000000,
+    0x4561060f00000000, 0xb6f1f43900000000, 0xb682ccd500000000,
+    0x45123ee300000000, 0x50a329b800000000, 0xa333db8e00000000,
+    0x7ac1060e00000000, 0x8951f43800000000, 0x9ce0e36300000000,
+    0x6f70115500000000, 0xdd00e26000000000, 0x2e90105600000000,
+    0x3b21070d00000000, 0xc8b1f53b00000000, 0x114328bb00000000,
+    0xe2d3da8d00000000, 0xf762cdd600000000, 0x04f23fe000000000,
+    0x0481070c00000000, 0xf711f53a00000000, 0xe2a0e26100000000,
+    0x1130105700000000, 0xc8c2cdd700000000, 0x3b523fe100000000,
+    0x2ee328ba00000000, 0xdd73da8c00000000, 0xd502ed7800000000,
+    0x26921f4e00000000, 0x3323081500000000, 0xc0b3fa2300000000,
+    0x194127a300000000, 0xead1d59500000000, 0xff60c2ce00000000,
+    0x0cf030f800000000, 0x0c83081400000000, 0xff13fa2200000000,
+    0xeaa2ed7900000000, 0x19321f4f00000000, 0xc0c0c2cf00000000,
+    0x335030f900000000, 0x26e127a200000000, 0xd571d59400000000,
+    0x670126a100000000, 0x9491d49700000000, 0x8120c3cc00000000,
+    0x72b031fa00000000, 0xab42ec7a00000000, 0x58d21e4c00000000,
+    0x4d63091700000000, 0xbef3fb2100000000, 0xbe80c3cd00000000,
+    0x4d1031fb00000000, 0x58a126a000000000, 0xab31d49600000000,
+    0x72c3091600000000, 0x8153fb2000000000, 0x94e2ec7b00000000,
+    0x67721e4d00000000, 0xf0030a1000000000, 0x0393f82600000000,
+    0x1622ef7d00000000, 0xe5b21d4b00000000, 0x3c40c0cb00000000,
+    0xcfd032fd00000000, 0xda6125a600000000, 0x29f1d79000000000,
+    0x2982ef7c00000000, 0xda121d4a00000000, 0xcfa30a1100000000,
+    0x3c33f82700000000, 0xe5c125a700000000, 0x1651d79100000000,
+    0x03e0c0ca00000000, 0xf07032fc00000000, 0x4200c1c900000000,
+    0xb19033ff00000000, 0xa42124a400000000, 0x57b1d69200000000,
+    0x8e430b1200000000, 0x7dd3f92400000000, 0x6862ee7f00000000,
+    0x9bf21c4900000000, 0x9b8124a500000000, 0x6811d69300000000,
+    0x7da0c1c800000000, 0x8e3033fe00000000, 0x57c2ee7e00000000,
+    0xa4521c4800000000, 0xb1e30b1300000000, 0x4273f92500000000,
+    0x9f0023a900000000, 0x6c90d19f00000000, 0x7921c6c400000000,
+    0x8ab134f200000000, 0x5343e97200000000, 0xa0d31b4400000000,
+    0xb5620c1f00000000, 0x46f2fe2900000000, 0x4681c6c500000000,
+    0xb51134f300000000, 0xa0a023a800000000, 0x5330d19e00000000,
+    0x8ac20c1e00000000, 0x7952fe2800000000, 0x6ce3e97300000000,
+    0x9f731b4500000000, 0x2d03e87000000000, 0xde931a4600000000,
+    0xcb220d1d00000000, 0x38b2ff2b00000000, 0xe14022ab00000000,
+    0x12d0d09d00000000, 0x0761c7c600000000, 0xf4f135f000000000,
+    0xf4820d1c00000000, 0x0712ff2a00000000, 0x12a3e87100000000,
+    0xe1331a4700000000, 0x38c1c7c700000000, 0xcb5135f100000000,
+    0xdee022aa00000000, 0x2d70d09c00000000, 0xba01c4c100000000,
+    0x499136f700000000, 0x5c2021ac00000000, 0xafb0d39a00000000,
+    0x76420e1a00000000, 0x85d2fc2c00000000, 0x9063eb7700000000,
+    0x63f3194100000000, 0x638021ad00000000, 0x9010d39b00000000,
+    0x85a1c4c000000000, 0x763136f600000000, 0xafc3eb7600000000,
+    0x5c53194000000000, 0x49e20e1b00000000, 0xba72fc2d00000000,
+    0x08020f1800000000, 0xfb92fd2e00000000, 0xee23ea7500000000,
+    0x1db3184300000000, 0xc441c5c300000000, 0x37d137f500000000,
+    0x226020ae00000000, 0xd1f0d29800000000, 0xd183ea7400000000,
+    0x2213184200000000, 0x37a20f1900000000, 0xc432fd2f00000000,
+    0x1dc020af00000000, 0xee50d29900000000, 0xfbe1c5c200000000,
+    0x087137f400000000},
+   {0x0000000000000000, 0x3651822400000000, 0x6ca2044900000000,
+    0x5af3866d00000000, 0xd844099200000000, 0xee158bb600000000,
+    0xb4e60ddb00000000, 0x82b78fff00000000, 0xf18f63ff00000000,
+    0xc7dee1db00000000, 0x9d2d67b600000000, 0xab7ce59200000000,
+    0x29cb6a6d00000000, 0x1f9ae84900000000, 0x45696e2400000000,
+    0x7338ec0000000000, 0xa319b62500000000, 0x9548340100000000,
+    0xcfbbb26c00000000, 0xf9ea304800000000, 0x7b5dbfb700000000,
+    0x4d0c3d9300000000, 0x17ffbbfe00000000, 0x21ae39da00000000,
+    0x5296d5da00000000, 0x64c757fe00000000, 0x3e34d19300000000,
+    0x086553b700000000, 0x8ad2dc4800000000, 0xbc835e6c00000000,
+    0xe670d80100000000, 0xd0215a2500000000, 0x46336c4b00000000,
+    0x7062ee6f00000000, 0x2a91680200000000, 0x1cc0ea2600000000,
+    0x9e7765d900000000, 0xa826e7fd00000000, 0xf2d5619000000000,
+    0xc484e3b400000000, 0xb7bc0fb400000000, 0x81ed8d9000000000,
+    0xdb1e0bfd00000000, 0xed4f89d900000000, 0x6ff8062600000000,
+    0x59a9840200000000, 0x035a026f00000000, 0x350b804b00000000,
+    0xe52ada6e00000000, 0xd37b584a00000000, 0x8988de2700000000,
+    0xbfd95c0300000000, 0x3d6ed3fc00000000, 0x0b3f51d800000000,
+    0x51ccd7b500000000, 0x679d559100000000, 0x14a5b99100000000,
+    0x22f43bb500000000, 0x7807bdd800000000, 0x4e563ffc00000000,
+    0xcce1b00300000000, 0xfab0322700000000, 0xa043b44a00000000,
+    0x9612366e00000000, 0x8c66d89600000000, 0xba375ab200000000,
+    0xe0c4dcdf00000000, 0xd6955efb00000000, 0x5422d10400000000,
+    0x6273532000000000, 0x3880d54d00000000, 0x0ed1576900000000,
+    0x7de9bb6900000000, 0x4bb8394d00000000, 0x114bbf2000000000,
+    0x271a3d0400000000, 0xa5adb2fb00000000, 0x93fc30df00000000,
+    0xc90fb6b200000000, 0xff5e349600000000, 0x2f7f6eb300000000,
+    0x192eec9700000000, 0x43dd6afa00000000, 0x758ce8de00000000,
+    0xf73b672100000000, 0xc16ae50500000000, 0x9b99636800000000,
+    0xadc8e14c00000000, 0xdef00d4c00000000, 0xe8a18f6800000000,
+    0xb252090500000000, 0x84038b2100000000, 0x06b404de00000000,
+    0x30e586fa00000000, 0x6a16009700000000, 0x5c4782b300000000,
+    0xca55b4dd00000000, 0xfc0436f900000000, 0xa6f7b09400000000,
+    0x90a632b000000000, 0x1211bd4f00000000, 0x24403f6b00000000,
+    0x7eb3b90600000000, 0x48e23b2200000000, 0x3bdad72200000000,
+    0x0d8b550600000000, 0x5778d36b00000000, 0x6129514f00000000,
+    0xe39edeb000000000, 0xd5cf5c9400000000, 0x8f3cdaf900000000,
+    0xb96d58dd00000000, 0x694c02f800000000, 0x5f1d80dc00000000,
+    0x05ee06b100000000, 0x33bf849500000000, 0xb1080b6a00000000,
+    0x8759894e00000000, 0xddaa0f2300000000, 0xebfb8d0700000000,
+    0x98c3610700000000, 0xae92e32300000000, 0xf461654e00000000,
+    0xc230e76a00000000, 0x4087689500000000, 0x76d6eab100000000,
+    0x2c256cdc00000000, 0x1a74eef800000000, 0x59cbc1f600000000,
+    0x6f9a43d200000000, 0x3569c5bf00000000, 0x0338479b00000000,
+    0x818fc86400000000, 0xb7de4a4000000000, 0xed2dcc2d00000000,
+    0xdb7c4e0900000000, 0xa844a20900000000, 0x9e15202d00000000,
+    0xc4e6a64000000000, 0xf2b7246400000000, 0x7000ab9b00000000,
+    0x465129bf00000000, 0x1ca2afd200000000, 0x2af32df600000000,
+    0xfad277d300000000, 0xcc83f5f700000000, 0x9670739a00000000,
+    0xa021f1be00000000, 0x22967e4100000000, 0x14c7fc6500000000,
+    0x4e347a0800000000, 0x7865f82c00000000, 0x0b5d142c00000000,
+    0x3d0c960800000000, 0x67ff106500000000, 0x51ae924100000000,
+    0xd3191dbe00000000, 0xe5489f9a00000000, 0xbfbb19f700000000,
+    0x89ea9bd300000000, 0x1ff8adbd00000000, 0x29a92f9900000000,
+    0x735aa9f400000000, 0x450b2bd000000000, 0xc7bca42f00000000,
+    0xf1ed260b00000000, 0xab1ea06600000000, 0x9d4f224200000000,
+    0xee77ce4200000000, 0xd8264c6600000000, 0x82d5ca0b00000000,
+    0xb484482f00000000, 0x3633c7d000000000, 0x006245f400000000,
+    0x5a91c39900000000, 0x6cc041bd00000000, 0xbce11b9800000000,
+    0x8ab099bc00000000, 0xd0431fd100000000, 0xe6129df500000000,
+    0x64a5120a00000000, 0x52f4902e00000000, 0x0807164300000000,
+    0x3e56946700000000, 0x4d6e786700000000, 0x7b3ffa4300000000,
+    0x21cc7c2e00000000, 0x179dfe0a00000000, 0x952a71f500000000,
+    0xa37bf3d100000000, 0xf98875bc00000000, 0xcfd9f79800000000,
+    0xd5ad196000000000, 0xe3fc9b4400000000, 0xb90f1d2900000000,
+    0x8f5e9f0d00000000, 0x0de910f200000000, 0x3bb892d600000000,
+    0x614b14bb00000000, 0x571a969f00000000, 0x24227a9f00000000,
+    0x1273f8bb00000000, 0x48807ed600000000, 0x7ed1fcf200000000,
+    0xfc66730d00000000, 0xca37f12900000000, 0x90c4774400000000,
+    0xa695f56000000000, 0x76b4af4500000000, 0x40e52d6100000000,
+    0x1a16ab0c00000000, 0x2c47292800000000, 0xaef0a6d700000000,
+    0x98a124f300000000, 0xc252a29e00000000, 0xf40320ba00000000,
+    0x873bccba00000000, 0xb16a4e9e00000000, 0xeb99c8f300000000,
+    0xddc84ad700000000, 0x5f7fc52800000000, 0x692e470c00000000,
+    0x33ddc16100000000, 0x058c434500000000, 0x939e752b00000000,
+    0xa5cff70f00000000, 0xff3c716200000000, 0xc96df34600000000,
+    0x4bda7cb900000000, 0x7d8bfe9d00000000, 0x277878f000000000,
+    0x1129fad400000000, 0x621116d400000000, 0x544094f000000000,
+    0x0eb3129d00000000, 0x38e290b900000000, 0xba551f4600000000,
+    0x8c049d6200000000, 0xd6f71b0f00000000, 0xe0a6992b00000000,
+    0x3087c30e00000000, 0x06d6412a00000000, 0x5c25c74700000000,
+    0x6a74456300000000, 0xe8c3ca9c00000000, 0xde9248b800000000,
+    0x8461ced500000000, 0xb2304cf100000000, 0xc108a0f100000000,
+    0xf75922d500000000, 0xadaaa4b800000000, 0x9bfb269c00000000,
+    0x194ca96300000000, 0x2f1d2b4700000000, 0x75eead2a00000000,
+    0x43bf2f0e00000000},
+   {0x0000000000000000, 0xc8179ecf00000000, 0xd1294d4400000000,
+    0x193ed38b00000000, 0xa2539a8800000000, 0x6a44044700000000,
+    0x737ad7cc00000000, 0xbb6d490300000000, 0x05a145ca00000000,
+    0xcdb6db0500000000, 0xd488088e00000000, 0x1c9f964100000000,
+    0xa7f2df4200000000, 0x6fe5418d00000000, 0x76db920600000000,
+    0xbecc0cc900000000, 0x4b44fa4f00000000, 0x8353648000000000,
+    0x9a6db70b00000000, 0x527a29c400000000, 0xe91760c700000000,
+    0x2100fe0800000000, 0x383e2d8300000000, 0xf029b34c00000000,
+    0x4ee5bf8500000000, 0x86f2214a00000000, 0x9fccf2c100000000,
+    0x57db6c0e00000000, 0xecb6250d00000000, 0x24a1bbc200000000,
+    0x3d9f684900000000, 0xf588f68600000000, 0x9688f49f00000000,
+    0x5e9f6a5000000000, 0x47a1b9db00000000, 0x8fb6271400000000,
+    0x34db6e1700000000, 0xfcccf0d800000000, 0xe5f2235300000000,
+    0x2de5bd9c00000000, 0x9329b15500000000, 0x5b3e2f9a00000000,
+    0x4200fc1100000000, 0x8a1762de00000000, 0x317a2bdd00000000,
+    0xf96db51200000000, 0xe053669900000000, 0x2844f85600000000,
+    0xddcc0ed000000000, 0x15db901f00000000, 0x0ce5439400000000,
+    0xc4f2dd5b00000000, 0x7f9f945800000000, 0xb7880a9700000000,
+    0xaeb6d91c00000000, 0x66a147d300000000, 0xd86d4b1a00000000,
+    0x107ad5d500000000, 0x0944065e00000000, 0xc153989100000000,
+    0x7a3ed19200000000, 0xb2294f5d00000000, 0xab179cd600000000,
+    0x6300021900000000, 0x6d1798e400000000, 0xa500062b00000000,
+    0xbc3ed5a000000000, 0x74294b6f00000000, 0xcf44026c00000000,
+    0x07539ca300000000, 0x1e6d4f2800000000, 0xd67ad1e700000000,
+    0x68b6dd2e00000000, 0xa0a143e100000000, 0xb99f906a00000000,
+    0x71880ea500000000, 0xcae547a600000000, 0x02f2d96900000000,
+    0x1bcc0ae200000000, 0xd3db942d00000000, 0x265362ab00000000,
+    0xee44fc6400000000, 0xf77a2fef00000000, 0x3f6db12000000000,
+    0x8400f82300000000, 0x4c1766ec00000000, 0x5529b56700000000,
+    0x9d3e2ba800000000, 0x23f2276100000000, 0xebe5b9ae00000000,
+    0xf2db6a2500000000, 0x3accf4ea00000000, 0x81a1bde900000000,
+    0x49b6232600000000, 0x5088f0ad00000000, 0x989f6e6200000000,
+    0xfb9f6c7b00000000, 0x3388f2b400000000, 0x2ab6213f00000000,
+    0xe2a1bff000000000, 0x59ccf6f300000000, 0x91db683c00000000,
+    0x88e5bbb700000000, 0x40f2257800000000, 0xfe3e29b100000000,
+    0x3629b77e00000000, 0x2f1764f500000000, 0xe700fa3a00000000,
+    0x5c6db33900000000, 0x947a2df600000000, 0x8d44fe7d00000000,
+    0x455360b200000000, 0xb0db963400000000, 0x78cc08fb00000000,
+    0x61f2db7000000000, 0xa9e545bf00000000, 0x12880cbc00000000,
+    0xda9f927300000000, 0xc3a141f800000000, 0x0bb6df3700000000,
+    0xb57ad3fe00000000, 0x7d6d4d3100000000, 0x64539eba00000000,
+    0xac44007500000000, 0x1729497600000000, 0xdf3ed7b900000000,
+    0xc600043200000000, 0x0e179afd00000000, 0x9b28411200000000,
+    0x533fdfdd00000000, 0x4a010c5600000000, 0x8216929900000000,
+    0x397bdb9a00000000, 0xf16c455500000000, 0xe85296de00000000,
+    0x2045081100000000, 0x9e8904d800000000, 0x569e9a1700000000,
+    0x4fa0499c00000000, 0x87b7d75300000000, 0x3cda9e5000000000,
+    0xf4cd009f00000000, 0xedf3d31400000000, 0x25e44ddb00000000,
+    0xd06cbb5d00000000, 0x187b259200000000, 0x0145f61900000000,
+    0xc95268d600000000, 0x723f21d500000000, 0xba28bf1a00000000,
+    0xa3166c9100000000, 0x6b01f25e00000000, 0xd5cdfe9700000000,
+    0x1dda605800000000, 0x04e4b3d300000000, 0xccf32d1c00000000,
+    0x779e641f00000000, 0xbf89fad000000000, 0xa6b7295b00000000,
+    0x6ea0b79400000000, 0x0da0b58d00000000, 0xc5b72b4200000000,
+    0xdc89f8c900000000, 0x149e660600000000, 0xaff32f0500000000,
+    0x67e4b1ca00000000, 0x7eda624100000000, 0xb6cdfc8e00000000,
+    0x0801f04700000000, 0xc0166e8800000000, 0xd928bd0300000000,
+    0x113f23cc00000000, 0xaa526acf00000000, 0x6245f40000000000,
+    0x7b7b278b00000000, 0xb36cb94400000000, 0x46e44fc200000000,
+    0x8ef3d10d00000000, 0x97cd028600000000, 0x5fda9c4900000000,
+    0xe4b7d54a00000000, 0x2ca04b8500000000, 0x359e980e00000000,
+    0xfd8906c100000000, 0x43450a0800000000, 0x8b5294c700000000,
+    0x926c474c00000000, 0x5a7bd98300000000, 0xe116908000000000,
+    0x29010e4f00000000, 0x303fddc400000000, 0xf828430b00000000,
+    0xf63fd9f600000000, 0x3e28473900000000, 0x271694b200000000,
+    0xef010a7d00000000, 0x546c437e00000000, 0x9c7bddb100000000,
+    0x85450e3a00000000, 0x4d5290f500000000, 0xf39e9c3c00000000,
+    0x3b8902f300000000, 0x22b7d17800000000, 0xeaa04fb700000000,
+    0x51cd06b400000000, 0x99da987b00000000, 0x80e44bf000000000,
+    0x48f3d53f00000000, 0xbd7b23b900000000, 0x756cbd7600000000,
+    0x6c526efd00000000, 0xa445f03200000000, 0x1f28b93100000000,
+    0xd73f27fe00000000, 0xce01f47500000000, 0x06166aba00000000,
+    0xb8da667300000000, 0x70cdf8bc00000000, 0x69f32b3700000000,
+    0xa1e4b5f800000000, 0x1a89fcfb00000000, 0xd29e623400000000,
+    0xcba0b1bf00000000, 0x03b72f7000000000, 0x60b72d6900000000,
+    0xa8a0b3a600000000, 0xb19e602d00000000, 0x7989fee200000000,
+    0xc2e4b7e100000000, 0x0af3292e00000000, 0x13cdfaa500000000,
+    0xdbda646a00000000, 0x651668a300000000, 0xad01f66c00000000,
+    0xb43f25e700000000, 0x7c28bb2800000000, 0xc745f22b00000000,
+    0x0f526ce400000000, 0x166cbf6f00000000, 0xde7b21a000000000,
+    0x2bf3d72600000000, 0xe3e449e900000000, 0xfada9a6200000000,
+    0x32cd04ad00000000, 0x89a04dae00000000, 0x41b7d36100000000,
+    0x588900ea00000000, 0x909e9e2500000000, 0x2e5292ec00000000,
+    0xe6450c2300000000, 0xff7bdfa800000000, 0x376c416700000000,
+    0x8c01086400000000, 0x441696ab00000000, 0x5d28452000000000,
+    0x953fdbef00000000},
+   {0x0000000000000000, 0x95d4709500000000, 0x6baf90f100000000,
+    0xfe7be06400000000, 0x9758503800000000, 0x028c20ad00000000,
+    0xfcf7c0c900000000, 0x6923b05c00000000, 0x2eb1a07000000000,
+    0xbb65d0e500000000, 0x451e308100000000, 0xd0ca401400000000,
+    0xb9e9f04800000000, 0x2c3d80dd00000000, 0xd24660b900000000,
+    0x4792102c00000000, 0x5c6241e100000000, 0xc9b6317400000000,
+    0x37cdd11000000000, 0xa219a18500000000, 0xcb3a11d900000000,
+    0x5eee614c00000000, 0xa095812800000000, 0x3541f1bd00000000,
+    0x72d3e19100000000, 0xe707910400000000, 0x197c716000000000,
+    0x8ca801f500000000, 0xe58bb1a900000000, 0x705fc13c00000000,
+    0x8e24215800000000, 0x1bf051cd00000000, 0xf9c2f31900000000,
+    0x6c16838c00000000, 0x926d63e800000000, 0x07b9137d00000000,
+    0x6e9aa32100000000, 0xfb4ed3b400000000, 0x053533d000000000,
+    0x90e1434500000000, 0xd773536900000000, 0x42a723fc00000000,
+    0xbcdcc39800000000, 0x2908b30d00000000, 0x402b035100000000,
+    0xd5ff73c400000000, 0x2b8493a000000000, 0xbe50e33500000000,
+    0xa5a0b2f800000000, 0x3074c26d00000000, 0xce0f220900000000,
+    0x5bdb529c00000000, 0x32f8e2c000000000, 0xa72c925500000000,
+    0x5957723100000000, 0xcc8302a400000000, 0x8b11128800000000,
+    0x1ec5621d00000000, 0xe0be827900000000, 0x756af2ec00000000,
+    0x1c4942b000000000, 0x899d322500000000, 0x77e6d24100000000,
+    0xe232a2d400000000, 0xf285e73300000000, 0x675197a600000000,
+    0x992a77c200000000, 0x0cfe075700000000, 0x65ddb70b00000000,
+    0xf009c79e00000000, 0x0e7227fa00000000, 0x9ba6576f00000000,
+    0xdc34474300000000, 0x49e037d600000000, 0xb79bd7b200000000,
+    0x224fa72700000000, 0x4b6c177b00000000, 0xdeb867ee00000000,
+    0x20c3878a00000000, 0xb517f71f00000000, 0xaee7a6d200000000,
+    0x3b33d64700000000, 0xc548362300000000, 0x509c46b600000000,
+    0x39bff6ea00000000, 0xac6b867f00000000, 0x5210661b00000000,
+    0xc7c4168e00000000, 0x805606a200000000, 0x1582763700000000,
+    0xebf9965300000000, 0x7e2de6c600000000, 0x170e569a00000000,
+    0x82da260f00000000, 0x7ca1c66b00000000, 0xe975b6fe00000000,
+    0x0b47142a00000000, 0x9e9364bf00000000, 0x60e884db00000000,
+    0xf53cf44e00000000, 0x9c1f441200000000, 0x09cb348700000000,
+    0xf7b0d4e300000000, 0x6264a47600000000, 0x25f6b45a00000000,
+    0xb022c4cf00000000, 0x4e5924ab00000000, 0xdb8d543e00000000,
+    0xb2aee46200000000, 0x277a94f700000000, 0xd901749300000000,
+    0x4cd5040600000000, 0x572555cb00000000, 0xc2f1255e00000000,
+    0x3c8ac53a00000000, 0xa95eb5af00000000, 0xc07d05f300000000,
+    0x55a9756600000000, 0xabd2950200000000, 0x3e06e59700000000,
+    0x7994f5bb00000000, 0xec40852e00000000, 0x123b654a00000000,
+    0x87ef15df00000000, 0xeecca58300000000, 0x7b18d51600000000,
+    0x8563357200000000, 0x10b745e700000000, 0xe40bcf6700000000,
+    0x71dfbff200000000, 0x8fa45f9600000000, 0x1a702f0300000000,
+    0x73539f5f00000000, 0xe687efca00000000, 0x18fc0fae00000000,
+    0x8d287f3b00000000, 0xcaba6f1700000000, 0x5f6e1f8200000000,
+    0xa115ffe600000000, 0x34c18f7300000000, 0x5de23f2f00000000,
+    0xc8364fba00000000, 0x364dafde00000000, 0xa399df4b00000000,
+    0xb8698e8600000000, 0x2dbdfe1300000000, 0xd3c61e7700000000,
+    0x46126ee200000000, 0x2f31debe00000000, 0xbae5ae2b00000000,
+    0x449e4e4f00000000, 0xd14a3eda00000000, 0x96d82ef600000000,
+    0x030c5e6300000000, 0xfd77be0700000000, 0x68a3ce9200000000,
+    0x01807ece00000000, 0x94540e5b00000000, 0x6a2fee3f00000000,
+    0xfffb9eaa00000000, 0x1dc93c7e00000000, 0x881d4ceb00000000,
+    0x7666ac8f00000000, 0xe3b2dc1a00000000, 0x8a916c4600000000,
+    0x1f451cd300000000, 0xe13efcb700000000, 0x74ea8c2200000000,
+    0x33789c0e00000000, 0xa6acec9b00000000, 0x58d70cff00000000,
+    0xcd037c6a00000000, 0xa420cc3600000000, 0x31f4bca300000000,
+    0xcf8f5cc700000000, 0x5a5b2c5200000000, 0x41ab7d9f00000000,
+    0xd47f0d0a00000000, 0x2a04ed6e00000000, 0xbfd09dfb00000000,
+    0xd6f32da700000000, 0x43275d3200000000, 0xbd5cbd5600000000,
+    0x2888cdc300000000, 0x6f1addef00000000, 0xfacead7a00000000,
+    0x04b54d1e00000000, 0x91613d8b00000000, 0xf8428dd700000000,
+    0x6d96fd4200000000, 0x93ed1d2600000000, 0x06396db300000000,
+    0x168e285400000000, 0x835a58c100000000, 0x7d21b8a500000000,
+    0xe8f5c83000000000, 0x81d6786c00000000, 0x140208f900000000,
+    0xea79e89d00000000, 0x7fad980800000000, 0x383f882400000000,
+    0xadebf8b100000000, 0x539018d500000000, 0xc644684000000000,
+    0xaf67d81c00000000, 0x3ab3a88900000000, 0xc4c848ed00000000,
+    0x511c387800000000, 0x4aec69b500000000, 0xdf38192000000000,
+    0x2143f94400000000, 0xb49789d100000000, 0xddb4398d00000000,
+    0x4860491800000000, 0xb61ba97c00000000, 0x23cfd9e900000000,
+    0x645dc9c500000000, 0xf189b95000000000, 0x0ff2593400000000,
+    0x9a2629a100000000, 0xf30599fd00000000, 0x66d1e96800000000,
+    0x98aa090c00000000, 0x0d7e799900000000, 0xef4cdb4d00000000,
+    0x7a98abd800000000, 0x84e34bbc00000000, 0x11373b2900000000,
+    0x78148b7500000000, 0xedc0fbe000000000, 0x13bb1b8400000000,
+    0x866f6b1100000000, 0xc1fd7b3d00000000, 0x54290ba800000000,
+    0xaa52ebcc00000000, 0x3f869b5900000000, 0x56a52b0500000000,
+    0xc3715b9000000000, 0x3d0abbf400000000, 0xa8decb6100000000,
+    0xb32e9aac00000000, 0x26faea3900000000, 0xd8810a5d00000000,
+    0x4d557ac800000000, 0x2476ca9400000000, 0xb1a2ba0100000000,
+    0x4fd95a6500000000, 0xda0d2af000000000, 0x9d9f3adc00000000,
+    0x084b4a4900000000, 0xf630aa2d00000000, 0x63e4dab800000000,
+    0x0ac76ae400000000, 0x9f131a7100000000, 0x6168fa1500000000,
+    0xf4bc8a8000000000},
+   {0x0000000000000000, 0x1f17f08000000000, 0x7f2891da00000000,
+    0x603f615a00000000, 0xbf56536e00000000, 0xa041a3ee00000000,
+    0xc07ec2b400000000, 0xdf69323400000000, 0x7eada6dc00000000,
+    0x61ba565c00000000, 0x0185370600000000, 0x1e92c78600000000,
+    0xc1fbf5b200000000, 0xdeec053200000000, 0xbed3646800000000,
+    0xa1c494e800000000, 0xbd5c3c6200000000, 0xa24bcce200000000,
+    0xc274adb800000000, 0xdd635d3800000000, 0x020a6f0c00000000,
+    0x1d1d9f8c00000000, 0x7d22fed600000000, 0x62350e5600000000,
+    0xc3f19abe00000000, 0xdce66a3e00000000, 0xbcd90b6400000000,
+    0xa3cefbe400000000, 0x7ca7c9d000000000, 0x63b0395000000000,
+    0x038f580a00000000, 0x1c98a88a00000000, 0x7ab978c400000000,
+    0x65ae884400000000, 0x0591e91e00000000, 0x1a86199e00000000,
+    0xc5ef2baa00000000, 0xdaf8db2a00000000, 0xbac7ba7000000000,
+    0xa5d04af000000000, 0x0414de1800000000, 0x1b032e9800000000,
+    0x7b3c4fc200000000, 0x642bbf4200000000, 0xbb428d7600000000,
+    0xa4557df600000000, 0xc46a1cac00000000, 0xdb7dec2c00000000,
+    0xc7e544a600000000, 0xd8f2b42600000000, 0xb8cdd57c00000000,
+    0xa7da25fc00000000, 0x78b317c800000000, 0x67a4e74800000000,
+    0x079b861200000000, 0x188c769200000000, 0xb948e27a00000000,
+    0xa65f12fa00000000, 0xc66073a000000000, 0xd977832000000000,
+    0x061eb11400000000, 0x1909419400000000, 0x793620ce00000000,
+    0x6621d04e00000000, 0xb574805300000000, 0xaa6370d300000000,
+    0xca5c118900000000, 0xd54be10900000000, 0x0a22d33d00000000,
+    0x153523bd00000000, 0x750a42e700000000, 0x6a1db26700000000,
+    0xcbd9268f00000000, 0xd4ced60f00000000, 0xb4f1b75500000000,
+    0xabe647d500000000, 0x748f75e100000000, 0x6b98856100000000,
+    0x0ba7e43b00000000, 0x14b014bb00000000, 0x0828bc3100000000,
+    0x173f4cb100000000, 0x77002deb00000000, 0x6817dd6b00000000,
+    0xb77eef5f00000000, 0xa8691fdf00000000, 0xc8567e8500000000,
+    0xd7418e0500000000, 0x76851aed00000000, 0x6992ea6d00000000,
+    0x09ad8b3700000000, 0x16ba7bb700000000, 0xc9d3498300000000,
+    0xd6c4b90300000000, 0xb6fbd85900000000, 0xa9ec28d900000000,
+    0xcfcdf89700000000, 0xd0da081700000000, 0xb0e5694d00000000,
+    0xaff299cd00000000, 0x709babf900000000, 0x6f8c5b7900000000,
+    0x0fb33a2300000000, 0x10a4caa300000000, 0xb1605e4b00000000,
+    0xae77aecb00000000, 0xce48cf9100000000, 0xd15f3f1100000000,
+    0x0e360d2500000000, 0x1121fda500000000, 0x711e9cff00000000,
+    0x6e096c7f00000000, 0x7291c4f500000000, 0x6d86347500000000,
+    0x0db9552f00000000, 0x12aea5af00000000, 0xcdc7979b00000000,
+    0xd2d0671b00000000, 0xb2ef064100000000, 0xadf8f6c100000000,
+    0x0c3c622900000000, 0x132b92a900000000, 0x7314f3f300000000,
+    0x6c03037300000000, 0xb36a314700000000, 0xac7dc1c700000000,
+    0xcc42a09d00000000, 0xd355501d00000000, 0x6ae900a700000000,
+    0x75fef02700000000, 0x15c1917d00000000, 0x0ad661fd00000000,
+    0xd5bf53c900000000, 0xcaa8a34900000000, 0xaa97c21300000000,
+    0xb580329300000000, 0x1444a67b00000000, 0x0b5356fb00000000,
+    0x6b6c37a100000000, 0x747bc72100000000, 0xab12f51500000000,
+    0xb405059500000000, 0xd43a64cf00000000, 0xcb2d944f00000000,
+    0xd7b53cc500000000, 0xc8a2cc4500000000, 0xa89dad1f00000000,
+    0xb78a5d9f00000000, 0x68e36fab00000000, 0x77f49f2b00000000,
+    0x17cbfe7100000000, 0x08dc0ef100000000, 0xa9189a1900000000,
+    0xb60f6a9900000000, 0xd6300bc300000000, 0xc927fb4300000000,
+    0x164ec97700000000, 0x095939f700000000, 0x696658ad00000000,
+    0x7671a82d00000000, 0x1050786300000000, 0x0f4788e300000000,
+    0x6f78e9b900000000, 0x706f193900000000, 0xaf062b0d00000000,
+    0xb011db8d00000000, 0xd02ebad700000000, 0xcf394a5700000000,
+    0x6efddebf00000000, 0x71ea2e3f00000000, 0x11d54f6500000000,
+    0x0ec2bfe500000000, 0xd1ab8dd100000000, 0xcebc7d5100000000,
+    0xae831c0b00000000, 0xb194ec8b00000000, 0xad0c440100000000,
+    0xb21bb48100000000, 0xd224d5db00000000, 0xcd33255b00000000,
+    0x125a176f00000000, 0x0d4de7ef00000000, 0x6d7286b500000000,
+    0x7265763500000000, 0xd3a1e2dd00000000, 0xccb6125d00000000,
+    0xac89730700000000, 0xb39e838700000000, 0x6cf7b1b300000000,
+    0x73e0413300000000, 0x13df206900000000, 0x0cc8d0e900000000,
+    0xdf9d80f400000000, 0xc08a707400000000, 0xa0b5112e00000000,
+    0xbfa2e1ae00000000, 0x60cbd39a00000000, 0x7fdc231a00000000,
+    0x1fe3424000000000, 0x00f4b2c000000000, 0xa130262800000000,
+    0xbe27d6a800000000, 0xde18b7f200000000, 0xc10f477200000000,
+    0x1e66754600000000, 0x017185c600000000, 0x614ee49c00000000,
+    0x7e59141c00000000, 0x62c1bc9600000000, 0x7dd64c1600000000,
+    0x1de92d4c00000000, 0x02feddcc00000000, 0xdd97eff800000000,
+    0xc2801f7800000000, 0xa2bf7e2200000000, 0xbda88ea200000000,
+    0x1c6c1a4a00000000, 0x037beaca00000000, 0x63448b9000000000,
+    0x7c537b1000000000, 0xa33a492400000000, 0xbc2db9a400000000,
+    0xdc12d8fe00000000, 0xc305287e00000000, 0xa524f83000000000,
+    0xba3308b000000000, 0xda0c69ea00000000, 0xc51b996a00000000,
+    0x1a72ab5e00000000, 0x05655bde00000000, 0x655a3a8400000000,
+    0x7a4dca0400000000, 0xdb895eec00000000, 0xc49eae6c00000000,
+    0xa4a1cf3600000000, 0xbbb63fb600000000, 0x64df0d8200000000,
+    0x7bc8fd0200000000, 0x1bf79c5800000000, 0x04e06cd800000000,
+    0x1878c45200000000, 0x076f34d200000000, 0x6750558800000000,
+    0x7847a50800000000, 0xa72e973c00000000, 0xb83967bc00000000,
+    0xd80606e600000000, 0xc711f66600000000, 0x66d5628e00000000,
+    0x79c2920e00000000, 0x19fdf35400000000, 0x06ea03d400000000,
+    0xd98331e000000000, 0xc694c16000000000, 0xa6aba03a00000000,
+    0xb9bc50ba00000000},
+   {0x0000000000000000, 0xe2fd888d00000000, 0x85fd60c000000000,
+    0x6700e84d00000000, 0x4bfdb05b00000000, 0xa90038d600000000,
+    0xce00d09b00000000, 0x2cfd581600000000, 0x96fa61b700000000,
+    0x7407e93a00000000, 0x1307017700000000, 0xf1fa89fa00000000,
+    0xdd07d1ec00000000, 0x3ffa596100000000, 0x58fab12c00000000,
+    0xba0739a100000000, 0x6df3b2b500000000, 0x8f0e3a3800000000,
+    0xe80ed27500000000, 0x0af35af800000000, 0x260e02ee00000000,
+    0xc4f38a6300000000, 0xa3f3622e00000000, 0x410eeaa300000000,
+    0xfb09d30200000000, 0x19f45b8f00000000, 0x7ef4b3c200000000,
+    0x9c093b4f00000000, 0xb0f4635900000000, 0x5209ebd400000000,
+    0x3509039900000000, 0xd7f48b1400000000, 0x9be014b000000000,
+    0x791d9c3d00000000, 0x1e1d747000000000, 0xfce0fcfd00000000,
+    0xd01da4eb00000000, 0x32e02c6600000000, 0x55e0c42b00000000,
+    0xb71d4ca600000000, 0x0d1a750700000000, 0xefe7fd8a00000000,
+    0x88e715c700000000, 0x6a1a9d4a00000000, 0x46e7c55c00000000,
+    0xa41a4dd100000000, 0xc31aa59c00000000, 0x21e72d1100000000,
+    0xf613a60500000000, 0x14ee2e8800000000, 0x73eec6c500000000,
+    0x91134e4800000000, 0xbdee165e00000000, 0x5f139ed300000000,
+    0x3813769e00000000, 0xdaeefe1300000000, 0x60e9c7b200000000,
+    0x82144f3f00000000, 0xe514a77200000000, 0x07e92fff00000000,
+    0x2b1477e900000000, 0xc9e9ff6400000000, 0xaee9172900000000,
+    0x4c149fa400000000, 0x77c758bb00000000, 0x953ad03600000000,
+    0xf23a387b00000000, 0x10c7b0f600000000, 0x3c3ae8e000000000,
+    0xdec7606d00000000, 0xb9c7882000000000, 0x5b3a00ad00000000,
+    0xe13d390c00000000, 0x03c0b18100000000, 0x64c059cc00000000,
+    0x863dd14100000000, 0xaac0895700000000, 0x483d01da00000000,
+    0x2f3de99700000000, 0xcdc0611a00000000, 0x1a34ea0e00000000,
+    0xf8c9628300000000, 0x9fc98ace00000000, 0x7d34024300000000,
+    0x51c95a5500000000, 0xb334d2d800000000, 0xd4343a9500000000,
+    0x36c9b21800000000, 0x8cce8bb900000000, 0x6e33033400000000,
+    0x0933eb7900000000, 0xebce63f400000000, 0xc7333be200000000,
+    0x25ceb36f00000000, 0x42ce5b2200000000, 0xa033d3af00000000,
+    0xec274c0b00000000, 0x0edac48600000000, 0x69da2ccb00000000,
+    0x8b27a44600000000, 0xa7dafc5000000000, 0x452774dd00000000,
+    0x22279c9000000000, 0xc0da141d00000000, 0x7add2dbc00000000,
+    0x9820a53100000000, 0xff204d7c00000000, 0x1dddc5f100000000,
+    0x31209de700000000, 0xd3dd156a00000000, 0xb4ddfd2700000000,
+    0x562075aa00000000, 0x81d4febe00000000, 0x6329763300000000,
+    0x04299e7e00000000, 0xe6d416f300000000, 0xca294ee500000000,
+    0x28d4c66800000000, 0x4fd42e2500000000, 0xad29a6a800000000,
+    0x172e9f0900000000, 0xf5d3178400000000, 0x92d3ffc900000000,
+    0x702e774400000000, 0x5cd32f5200000000, 0xbe2ea7df00000000,
+    0xd92e4f9200000000, 0x3bd3c71f00000000, 0xaf88c0ad00000000,
+    0x4d75482000000000, 0x2a75a06d00000000, 0xc88828e000000000,
+    0xe47570f600000000, 0x0688f87b00000000, 0x6188103600000000,
+    0x837598bb00000000, 0x3972a11a00000000, 0xdb8f299700000000,
+    0xbc8fc1da00000000, 0x5e72495700000000, 0x728f114100000000,
+    0x907299cc00000000, 0xf772718100000000, 0x158ff90c00000000,
+    0xc27b721800000000, 0x2086fa9500000000, 0x478612d800000000,
+    0xa57b9a5500000000, 0x8986c24300000000, 0x6b7b4ace00000000,
+    0x0c7ba28300000000, 0xee862a0e00000000, 0x548113af00000000,
+    0xb67c9b2200000000, 0xd17c736f00000000, 0x3381fbe200000000,
+    0x1f7ca3f400000000, 0xfd812b7900000000, 0x9a81c33400000000,
+    0x787c4bb900000000, 0x3468d41d00000000, 0xd6955c9000000000,
+    0xb195b4dd00000000, 0x53683c5000000000, 0x7f95644600000000,
+    0x9d68eccb00000000, 0xfa68048600000000, 0x18958c0b00000000,
+    0xa292b5aa00000000, 0x406f3d2700000000, 0x276fd56a00000000,
+    0xc5925de700000000, 0xe96f05f100000000, 0x0b928d7c00000000,
+    0x6c92653100000000, 0x8e6fedbc00000000, 0x599b66a800000000,
+    0xbb66ee2500000000, 0xdc66066800000000, 0x3e9b8ee500000000,
+    0x1266d6f300000000, 0xf09b5e7e00000000, 0x979bb63300000000,
+    0x75663ebe00000000, 0xcf61071f00000000, 0x2d9c8f9200000000,
+    0x4a9c67df00000000, 0xa861ef5200000000, 0x849cb74400000000,
+    0x66613fc900000000, 0x0161d78400000000, 0xe39c5f0900000000,
+    0xd84f981600000000, 0x3ab2109b00000000, 0x5db2f8d600000000,
+    0xbf4f705b00000000, 0x93b2284d00000000, 0x714fa0c000000000,
+    0x164f488d00000000, 0xf4b2c00000000000, 0x4eb5f9a100000000,
+    0xac48712c00000000, 0xcb48996100000000, 0x29b511ec00000000,
+    0x054849fa00000000, 0xe7b5c17700000000, 0x80b5293a00000000,
+    0x6248a1b700000000, 0xb5bc2aa300000000, 0x5741a22e00000000,
+    0x30414a6300000000, 0xd2bcc2ee00000000, 0xfe419af800000000,
+    0x1cbc127500000000, 0x7bbcfa3800000000, 0x994172b500000000,
+    0x23464b1400000000, 0xc1bbc39900000000, 0xa6bb2bd400000000,
+    0x4446a35900000000, 0x68bbfb4f00000000, 0x8a4673c200000000,
+    0xed469b8f00000000, 0x0fbb130200000000, 0x43af8ca600000000,
+    0xa152042b00000000, 0xc652ec6600000000, 0x24af64eb00000000,
+    0x08523cfd00000000, 0xeaafb47000000000, 0x8daf5c3d00000000,
+    0x6f52d4b000000000, 0xd555ed1100000000, 0x37a8659c00000000,
+    0x50a88dd100000000, 0xb255055c00000000, 0x9ea85d4a00000000,
+    0x7c55d5c700000000, 0x1b553d8a00000000, 0xf9a8b50700000000,
+    0x2e5c3e1300000000, 0xcca1b69e00000000, 0xaba15ed300000000,
+    0x495cd65e00000000, 0x65a18e4800000000, 0x875c06c500000000,
+    0xe05cee8800000000, 0x02a1660500000000, 0xb8a65fa400000000,
+    0x5a5bd72900000000, 0x3d5b3f6400000000, 0xdfa6b7e900000000,
+    0xf35befff00000000, 0x11a6677200000000, 0x76a68f3f00000000,
+    0x945b07b200000000},
+   {0x0000000000000000, 0xa90b894e00000000, 0x5217129d00000000,
+    0xfb1c9bd300000000, 0xe52855e100000000, 0x4c23dcaf00000000,
+    0xb73f477c00000000, 0x1e34ce3200000000, 0x8b57db1900000000,
+    0x225c525700000000, 0xd940c98400000000, 0x704b40ca00000000,
+    0x6e7f8ef800000000, 0xc77407b600000000, 0x3c689c6500000000,
+    0x9563152b00000000, 0x16afb63300000000, 0xbfa43f7d00000000,
+    0x44b8a4ae00000000, 0xedb32de000000000, 0xf387e3d200000000,
+    0x5a8c6a9c00000000, 0xa190f14f00000000, 0x089b780100000000,
+    0x9df86d2a00000000, 0x34f3e46400000000, 0xcfef7fb700000000,
+    0x66e4f6f900000000, 0x78d038cb00000000, 0xd1dbb18500000000,
+    0x2ac72a5600000000, 0x83cca31800000000, 0x2c5e6d6700000000,
+    0x8555e42900000000, 0x7e497ffa00000000, 0xd742f6b400000000,
+    0xc976388600000000, 0x607db1c800000000, 0x9b612a1b00000000,
+    0x326aa35500000000, 0xa709b67e00000000, 0x0e023f3000000000,
+    0xf51ea4e300000000, 0x5c152dad00000000, 0x4221e39f00000000,
+    0xeb2a6ad100000000, 0x1036f10200000000, 0xb93d784c00000000,
+    0x3af1db5400000000, 0x93fa521a00000000, 0x68e6c9c900000000,
+    0xc1ed408700000000, 0xdfd98eb500000000, 0x76d207fb00000000,
+    0x8dce9c2800000000, 0x24c5156600000000, 0xb1a6004d00000000,
+    0x18ad890300000000, 0xe3b112d000000000, 0x4aba9b9e00000000,
+    0x548e55ac00000000, 0xfd85dce200000000, 0x0699473100000000,
+    0xaf92ce7f00000000, 0x58bcdace00000000, 0xf1b7538000000000,
+    0x0aabc85300000000, 0xa3a0411d00000000, 0xbd948f2f00000000,
+    0x149f066100000000, 0xef839db200000000, 0x468814fc00000000,
+    0xd3eb01d700000000, 0x7ae0889900000000, 0x81fc134a00000000,
+    0x28f79a0400000000, 0x36c3543600000000, 0x9fc8dd7800000000,
+    0x64d446ab00000000, 0xcddfcfe500000000, 0x4e136cfd00000000,
+    0xe718e5b300000000, 0x1c047e6000000000, 0xb50ff72e00000000,
+    0xab3b391c00000000, 0x0230b05200000000, 0xf92c2b8100000000,
+    0x5027a2cf00000000, 0xc544b7e400000000, 0x6c4f3eaa00000000,
+    0x9753a57900000000, 0x3e582c3700000000, 0x206ce20500000000,
+    0x89676b4b00000000, 0x727bf09800000000, 0xdb7079d600000000,
+    0x74e2b7a900000000, 0xdde93ee700000000, 0x26f5a53400000000,
+    0x8ffe2c7a00000000, 0x91cae24800000000, 0x38c16b0600000000,
+    0xc3ddf0d500000000, 0x6ad6799b00000000, 0xffb56cb000000000,
+    0x56bee5fe00000000, 0xada27e2d00000000, 0x04a9f76300000000,
+    0x1a9d395100000000, 0xb396b01f00000000, 0x488a2bcc00000000,
+    0xe181a28200000000, 0x624d019a00000000, 0xcb4688d400000000,
+    0x305a130700000000, 0x99519a4900000000, 0x8765547b00000000,
+    0x2e6edd3500000000, 0xd57246e600000000, 0x7c79cfa800000000,
+    0xe91ada8300000000, 0x401153cd00000000, 0xbb0dc81e00000000,
+    0x1206415000000000, 0x0c328f6200000000, 0xa539062c00000000,
+    0x5e259dff00000000, 0xf72e14b100000000, 0xf17ec44600000000,
+    0x58754d0800000000, 0xa369d6db00000000, 0x0a625f9500000000,
+    0x145691a700000000, 0xbd5d18e900000000, 0x4641833a00000000,
+    0xef4a0a7400000000, 0x7a291f5f00000000, 0xd322961100000000,
+    0x283e0dc200000000, 0x8135848c00000000, 0x9f014abe00000000,
+    0x360ac3f000000000, 0xcd16582300000000, 0x641dd16d00000000,
+    0xe7d1727500000000, 0x4edafb3b00000000, 0xb5c660e800000000,
+    0x1ccde9a600000000, 0x02f9279400000000, 0xabf2aeda00000000,
+    0x50ee350900000000, 0xf9e5bc4700000000, 0x6c86a96c00000000,
+    0xc58d202200000000, 0x3e91bbf100000000, 0x979a32bf00000000,
+    0x89aefc8d00000000, 0x20a575c300000000, 0xdbb9ee1000000000,
+    0x72b2675e00000000, 0xdd20a92100000000, 0x742b206f00000000,
+    0x8f37bbbc00000000, 0x263c32f200000000, 0x3808fcc000000000,
+    0x9103758e00000000, 0x6a1fee5d00000000, 0xc314671300000000,
+    0x5677723800000000, 0xff7cfb7600000000, 0x046060a500000000,
+    0xad6be9eb00000000, 0xb35f27d900000000, 0x1a54ae9700000000,
+    0xe148354400000000, 0x4843bc0a00000000, 0xcb8f1f1200000000,
+    0x6284965c00000000, 0x99980d8f00000000, 0x309384c100000000,
+    0x2ea74af300000000, 0x87acc3bd00000000, 0x7cb0586e00000000,
+    0xd5bbd12000000000, 0x40d8c40b00000000, 0xe9d34d4500000000,
+    0x12cfd69600000000, 0xbbc45fd800000000, 0xa5f091ea00000000,
+    0x0cfb18a400000000, 0xf7e7837700000000, 0x5eec0a3900000000,
+    0xa9c21e8800000000, 0x00c997c600000000, 0xfbd50c1500000000,
+    0x52de855b00000000, 0x4cea4b6900000000, 0xe5e1c22700000000,
+    0x1efd59f400000000, 0xb7f6d0ba00000000, 0x2295c59100000000,
+    0x8b9e4cdf00000000, 0x7082d70c00000000, 0xd9895e4200000000,
+    0xc7bd907000000000, 0x6eb6193e00000000, 0x95aa82ed00000000,
+    0x3ca10ba300000000, 0xbf6da8bb00000000, 0x166621f500000000,
+    0xed7aba2600000000, 0x4471336800000000, 0x5a45fd5a00000000,
+    0xf34e741400000000, 0x0852efc700000000, 0xa159668900000000,
+    0x343a73a200000000, 0x9d31faec00000000, 0x662d613f00000000,
+    0xcf26e87100000000, 0xd112264300000000, 0x7819af0d00000000,
+    0x830534de00000000, 0x2a0ebd9000000000, 0x859c73ef00000000,
+    0x2c97faa100000000, 0xd78b617200000000, 0x7e80e83c00000000,
+    0x60b4260e00000000, 0xc9bfaf4000000000, 0x32a3349300000000,
+    0x9ba8bddd00000000, 0x0ecba8f600000000, 0xa7c021b800000000,
+    0x5cdcba6b00000000, 0xf5d7332500000000, 0xebe3fd1700000000,
+    0x42e8745900000000, 0xb9f4ef8a00000000, 0x10ff66c400000000,
+    0x9333c5dc00000000, 0x3a384c9200000000, 0xc124d74100000000,
+    0x682f5e0f00000000, 0x761b903d00000000, 0xdf10197300000000,
+    0x240c82a000000000, 0x8d070bee00000000, 0x18641ec500000000,
+    0xb16f978b00000000, 0x4a730c5800000000, 0xe378851600000000,
+    0xfd4c4b2400000000, 0x5447c26a00000000, 0xaf5b59b900000000,
+    0x0650d0f700000000},
+   {0x0000000000000000, 0x479244af00000000, 0xcf22f88500000000,
+    0x88b0bc2a00000000, 0xdf4381d000000000, 0x98d1c57f00000000,
+    0x1061795500000000, 0x57f33dfa00000000, 0xff81737a00000000,
+    0xb81337d500000000, 0x30a38bff00000000, 0x7731cf5000000000,
+    0x20c2f2aa00000000, 0x6750b60500000000, 0xefe00a2f00000000,
+    0xa8724e8000000000, 0xfe03e7f400000000, 0xb991a35b00000000,
+    0x31211f7100000000, 0x76b35bde00000000, 0x2140662400000000,
+    0x66d2228b00000000, 0xee629ea100000000, 0xa9f0da0e00000000,
+    0x0182948e00000000, 0x4610d02100000000, 0xcea06c0b00000000,
+    0x893228a400000000, 0xdec1155e00000000, 0x995351f100000000,
+    0x11e3eddb00000000, 0x5671a97400000000, 0xbd01bf3200000000,
+    0xfa93fb9d00000000, 0x722347b700000000, 0x35b1031800000000,
+    0x62423ee200000000, 0x25d07a4d00000000, 0xad60c66700000000,
+    0xeaf282c800000000, 0x4280cc4800000000, 0x051288e700000000,
+    0x8da234cd00000000, 0xca30706200000000, 0x9dc34d9800000000,
+    0xda51093700000000, 0x52e1b51d00000000, 0x1573f1b200000000,
+    0x430258c600000000, 0x04901c6900000000, 0x8c20a04300000000,
+    0xcbb2e4ec00000000, 0x9c41d91600000000, 0xdbd39db900000000,
+    0x5363219300000000, 0x14f1653c00000000, 0xbc832bbc00000000,
+    0xfb116f1300000000, 0x73a1d33900000000, 0x3433979600000000,
+    0x63c0aa6c00000000, 0x2452eec300000000, 0xace252e900000000,
+    0xeb70164600000000, 0x7a037e6500000000, 0x3d913aca00000000,
+    0xb52186e000000000, 0xf2b3c24f00000000, 0xa540ffb500000000,
+    0xe2d2bb1a00000000, 0x6a62073000000000, 0x2df0439f00000000,
+    0x85820d1f00000000, 0xc21049b000000000, 0x4aa0f59a00000000,
+    0x0d32b13500000000, 0x5ac18ccf00000000, 0x1d53c86000000000,
+    0x95e3744a00000000, 0xd27130e500000000, 0x8400999100000000,
+    0xc392dd3e00000000, 0x4b22611400000000, 0x0cb025bb00000000,
+    0x5b43184100000000, 0x1cd15cee00000000, 0x9461e0c400000000,
+    0xd3f3a46b00000000, 0x7b81eaeb00000000, 0x3c13ae4400000000,
+    0xb4a3126e00000000, 0xf33156c100000000, 0xa4c26b3b00000000,
+    0xe3502f9400000000, 0x6be093be00000000, 0x2c72d71100000000,
+    0xc702c15700000000, 0x809085f800000000, 0x082039d200000000,
+    0x4fb27d7d00000000, 0x1841408700000000, 0x5fd3042800000000,
+    0xd763b80200000000, 0x90f1fcad00000000, 0x3883b22d00000000,
+    0x7f11f68200000000, 0xf7a14aa800000000, 0xb0330e0700000000,
+    0xe7c033fd00000000, 0xa052775200000000, 0x28e2cb7800000000,
+    0x6f708fd700000000, 0x390126a300000000, 0x7e93620c00000000,
+    0xf623de2600000000, 0xb1b19a8900000000, 0xe642a77300000000,
+    0xa1d0e3dc00000000, 0x29605ff600000000, 0x6ef21b5900000000,
+    0xc68055d900000000, 0x8112117600000000, 0x09a2ad5c00000000,
+    0x4e30e9f300000000, 0x19c3d40900000000, 0x5e5190a600000000,
+    0xd6e12c8c00000000, 0x9173682300000000, 0xf406fcca00000000,
+    0xb394b86500000000, 0x3b24044f00000000, 0x7cb640e000000000,
+    0x2b457d1a00000000, 0x6cd739b500000000, 0xe467859f00000000,
+    0xa3f5c13000000000, 0x0b878fb000000000, 0x4c15cb1f00000000,
+    0xc4a5773500000000, 0x8337339a00000000, 0xd4c40e6000000000,
+    0x93564acf00000000, 0x1be6f6e500000000, 0x5c74b24a00000000,
+    0x0a051b3e00000000, 0x4d975f9100000000, 0xc527e3bb00000000,
+    0x82b5a71400000000, 0xd5469aee00000000, 0x92d4de4100000000,
+    0x1a64626b00000000, 0x5df626c400000000, 0xf584684400000000,
+    0xb2162ceb00000000, 0x3aa690c100000000, 0x7d34d46e00000000,
+    0x2ac7e99400000000, 0x6d55ad3b00000000, 0xe5e5111100000000,
+    0xa27755be00000000, 0x490743f800000000, 0x0e95075700000000,
+    0x8625bb7d00000000, 0xc1b7ffd200000000, 0x9644c22800000000,
+    0xd1d6868700000000, 0x59663aad00000000, 0x1ef47e0200000000,
+    0xb686308200000000, 0xf114742d00000000, 0x79a4c80700000000,
+    0x3e368ca800000000, 0x69c5b15200000000, 0x2e57f5fd00000000,
+    0xa6e749d700000000, 0xe1750d7800000000, 0xb704a40c00000000,
+    0xf096e0a300000000, 0x78265c8900000000, 0x3fb4182600000000,
+    0x684725dc00000000, 0x2fd5617300000000, 0xa765dd5900000000,
+    0xe0f799f600000000, 0x4885d77600000000, 0x0f1793d900000000,
+    0x87a72ff300000000, 0xc0356b5c00000000, 0x97c656a600000000,
+    0xd054120900000000, 0x58e4ae2300000000, 0x1f76ea8c00000000,
+    0x8e0582af00000000, 0xc997c60000000000, 0x41277a2a00000000,
+    0x06b53e8500000000, 0x5146037f00000000, 0x16d447d000000000,
+    0x9e64fbfa00000000, 0xd9f6bf5500000000, 0x7184f1d500000000,
+    0x3616b57a00000000, 0xbea6095000000000, 0xf9344dff00000000,
+    0xaec7700500000000, 0xe95534aa00000000, 0x61e5888000000000,
+    0x2677cc2f00000000, 0x7006655b00000000, 0x379421f400000000,
+    0xbf249dde00000000, 0xf8b6d97100000000, 0xaf45e48b00000000,
+    0xe8d7a02400000000, 0x60671c0e00000000, 0x27f558a100000000,
+    0x8f87162100000000, 0xc815528e00000000, 0x40a5eea400000000,
+    0x0737aa0b00000000, 0x50c497f100000000, 0x1756d35e00000000,
+    0x9fe66f7400000000, 0xd8742bdb00000000, 0x33043d9d00000000,
+    0x7496793200000000, 0xfc26c51800000000, 0xbbb481b700000000,
+    0xec47bc4d00000000, 0xabd5f8e200000000, 0x236544c800000000,
+    0x64f7006700000000, 0xcc854ee700000000, 0x8b170a4800000000,
+    0x03a7b66200000000, 0x4435f2cd00000000, 0x13c6cf3700000000,
+    0x54548b9800000000, 0xdce437b200000000, 0x9b76731d00000000,
+    0xcd07da6900000000, 0x8a959ec600000000, 0x022522ec00000000,
+    0x45b7664300000000, 0x12445bb900000000, 0x55d61f1600000000,
+    0xdd66a33c00000000, 0x9af4e79300000000, 0x3286a91300000000,
+    0x7514edbc00000000, 0xfda4519600000000, 0xba36153900000000,
+    0xedc528c300000000, 0xaa576c6c00000000, 0x22e7d04600000000,
+    0x657594e900000000}};
+
+#else /* W == 4 */
+
+static const uint32_t crc_braid_table[][256] = {
+   {0x00000000, 0x65673b46, 0xcace768c, 0xafa94dca, 0x4eedeb59,
+    0x2b8ad01f, 0x84239dd5, 0xe144a693, 0x9ddbd6b2, 0xf8bcedf4,
+    0x5715a03e, 0x32729b78, 0xd3363deb, 0xb65106ad, 0x19f84b67,
+    0x7c9f7021, 0xe0c6ab25, 0x85a19063, 0x2a08dda9, 0x4f6fe6ef,
+    0xae2b407c, 0xcb4c7b3a, 0x64e536f0, 0x01820db6, 0x7d1d7d97,
+    0x187a46d1, 0xb7d30b1b, 0xd2b4305d, 0x33f096ce, 0x5697ad88,
+    0xf93ee042, 0x9c59db04, 0x1afc500b, 0x7f9b6b4d, 0xd0322687,
+    0xb5551dc1, 0x5411bb52, 0x31768014, 0x9edfcdde, 0xfbb8f698,
+    0x872786b9, 0xe240bdff, 0x4de9f035, 0x288ecb73, 0xc9ca6de0,
+    0xacad56a6, 0x03041b6c, 0x6663202a, 0xfa3afb2e, 0x9f5dc068,
+    0x30f48da2, 0x5593b6e4, 0xb4d71077, 0xd1b02b31, 0x7e1966fb,
+    0x1b7e5dbd, 0x67e12d9c, 0x028616da, 0xad2f5b10, 0xc8486056,
+    0x290cc6c5, 0x4c6bfd83, 0xe3c2b049, 0x86a58b0f, 0x35f8a016,
+    0x509f9b50, 0xff36d69a, 0x9a51eddc, 0x7b154b4f, 0x1e727009,
+    0xb1db3dc3, 0xd4bc0685, 0xa82376a4, 0xcd444de2, 0x62ed0028,
+    0x078a3b6e, 0xe6ce9dfd, 0x83a9a6bb, 0x2c00eb71, 0x4967d037,
+    0xd53e0b33, 0xb0593075, 0x1ff07dbf, 0x7a9746f9, 0x9bd3e06a,
+    0xfeb4db2c, 0x511d96e6, 0x347aada0, 0x48e5dd81, 0x2d82e6c7,
+    0x822bab0d, 0xe74c904b, 0x060836d8, 0x636f0d9e, 0xccc64054,
+    0xa9a17b12, 0x2f04f01d, 0x4a63cb5b, 0xe5ca8691, 0x80adbdd7,
+    0x61e91b44, 0x048e2002, 0xab276dc8, 0xce40568e, 0xb2df26af,
+    0xd7b81de9, 0x78115023, 0x1d766b65, 0xfc32cdf6, 0x9955f6b0,
+    0x36fcbb7a, 0x539b803c, 0xcfc25b38, 0xaaa5607e, 0x050c2db4,
+    0x606b16f2, 0x812fb061, 0xe4488b27, 0x4be1c6ed, 0x2e86fdab,
+    0x52198d8a, 0x377eb6cc, 0x98d7fb06, 0xfdb0c040, 0x1cf466d3,
+    0x79935d95, 0xd63a105f, 0xb35d2b19, 0x6bf1402c, 0x0e967b6a,
+    0xa13f36a0, 0xc4580de6, 0x251cab75, 0x407b9033, 0xefd2ddf9,
+    0x8ab5e6bf, 0xf62a969e, 0x934dadd8, 0x3ce4e012, 0x5983db54,
+    0xb8c77dc7, 0xdda04681, 0x72090b4b, 0x176e300d, 0x8b37eb09,
+    0xee50d04f, 0x41f99d85, 0x249ea6c3, 0xc5da0050, 0xa0bd3b16,
+    0x0f1476dc, 0x6a734d9a, 0x16ec3dbb, 0x738b06fd, 0xdc224b37,
+    0xb9457071, 0x5801d6e2, 0x3d66eda4, 0x92cfa06e, 0xf7a89b28,
+    0x710d1027, 0x146a2b61, 0xbbc366ab, 0xdea45ded, 0x3fe0fb7e,
+    0x5a87c038, 0xf52e8df2, 0x9049b6b4, 0xecd6c695, 0x89b1fdd3,
+    0x2618b019, 0x437f8b5f, 0xa23b2dcc, 0xc75c168a, 0x68f55b40,
+    0x0d926006, 0x91cbbb02, 0xf4ac8044, 0x5b05cd8e, 0x3e62f6c8,
+    0xdf26505b, 0xba416b1d, 0x15e826d7, 0x708f1d91, 0x0c106db0,
+    0x697756f6, 0xc6de1b3c, 0xa3b9207a, 0x42fd86e9, 0x279abdaf,
+    0x8833f065, 0xed54cb23, 0x5e09e03a, 0x3b6edb7c, 0x94c796b6,
+    0xf1a0adf0, 0x10e40b63, 0x75833025, 0xda2a7def, 0xbf4d46a9,
+    0xc3d23688, 0xa6b50dce, 0x091c4004, 0x6c7b7b42, 0x8d3fddd1,
+    0xe858e697, 0x47f1ab5d, 0x2296901b, 0xbecf4b1f, 0xdba87059,
+    0x74013d93, 0x116606d5, 0xf022a046, 0x95459b00, 0x3aecd6ca,
+    0x5f8bed8c, 0x23149dad, 0x4673a6eb, 0xe9daeb21, 0x8cbdd067,
+    0x6df976f4, 0x089e4db2, 0xa7370078, 0xc2503b3e, 0x44f5b031,
+    0x21928b77, 0x8e3bc6bd, 0xeb5cfdfb, 0x0a185b68, 0x6f7f602e,
+    0xc0d62de4, 0xa5b116a2, 0xd92e6683, 0xbc495dc5, 0x13e0100f,
+    0x76872b49, 0x97c38dda, 0xf2a4b69c, 0x5d0dfb56, 0x386ac010,
+    0xa4331b14, 0xc1542052, 0x6efd6d98, 0x0b9a56de, 0xeadef04d,
+    0x8fb9cb0b, 0x201086c1, 0x4577bd87, 0x39e8cda6, 0x5c8ff6e0,
+    0xf326bb2a, 0x9641806c, 0x770526ff, 0x12621db9, 0xbdcb5073,
+    0xd8ac6b35},
+   {0x00000000, 0xd7e28058, 0x74b406f1, 0xa35686a9, 0xe9680de2,
+    0x3e8a8dba, 0x9ddc0b13, 0x4a3e8b4b, 0x09a11d85, 0xde439ddd,
+    0x7d151b74, 0xaaf79b2c, 0xe0c91067, 0x372b903f, 0x947d1696,
+    0x439f96ce, 0x13423b0a, 0xc4a0bb52, 0x67f63dfb, 0xb014bda3,
+    0xfa2a36e8, 0x2dc8b6b0, 0x8e9e3019, 0x597cb041, 0x1ae3268f,
+    0xcd01a6d7, 0x6e57207e, 0xb9b5a026, 0xf38b2b6d, 0x2469ab35,
+    0x873f2d9c, 0x50ddadc4, 0x26847614, 0xf166f64c, 0x523070e5,
+    0x85d2f0bd, 0xcfec7bf6, 0x180efbae, 0xbb587d07, 0x6cbafd5f,
+    0x2f256b91, 0xf8c7ebc9, 0x5b916d60, 0x8c73ed38, 0xc64d6673,
+    0x11afe62b, 0xb2f96082, 0x651be0da, 0x35c64d1e, 0xe224cd46,
+    0x41724bef, 0x9690cbb7, 0xdcae40fc, 0x0b4cc0a4, 0xa81a460d,
+    0x7ff8c655, 0x3c67509b, 0xeb85d0c3, 0x48d3566a, 0x9f31d632,
+    0xd50f5d79, 0x02eddd21, 0xa1bb5b88, 0x7659dbd0, 0x4d08ec28,
+    0x9aea6c70, 0x39bcead9, 0xee5e6a81, 0xa460e1ca, 0x73826192,
+    0xd0d4e73b, 0x07366763, 0x44a9f1ad, 0x934b71f5, 0x301df75c,
+    0xe7ff7704, 0xadc1fc4f, 0x7a237c17, 0xd975fabe, 0x0e977ae6,
+    0x5e4ad722, 0x89a8577a, 0x2afed1d3, 0xfd1c518b, 0xb722dac0,
+    0x60c05a98, 0xc396dc31, 0x14745c69, 0x57ebcaa7, 0x80094aff,
+    0x235fcc56, 0xf4bd4c0e, 0xbe83c745, 0x6961471d, 0xca37c1b4,
+    0x1dd541ec, 0x6b8c9a3c, 0xbc6e1a64, 0x1f389ccd, 0xc8da1c95,
+    0x82e497de, 0x55061786, 0xf650912f, 0x21b21177, 0x622d87b9,
+    0xb5cf07e1, 0x16998148, 0xc17b0110, 0x8b458a5b, 0x5ca70a03,
+    0xfff18caa, 0x28130cf2, 0x78cea136, 0xaf2c216e, 0x0c7aa7c7,
+    0xdb98279f, 0x91a6acd4, 0x46442c8c, 0xe512aa25, 0x32f02a7d,
+    0x716fbcb3, 0xa68d3ceb, 0x05dbba42, 0xd2393a1a, 0x9807b151,
+    0x4fe53109, 0xecb3b7a0, 0x3b5137f8, 0x9a11d850, 0x4df35808,
+    0xeea5dea1, 0x39475ef9, 0x7379d5b2, 0xa49b55ea, 0x07cdd343,
+    0xd02f531b, 0x93b0c5d5, 0x4452458d, 0xe704c324, 0x30e6437c,
+    0x7ad8c837, 0xad3a486f, 0x0e6ccec6, 0xd98e4e9e, 0x8953e35a,
+    0x5eb16302, 0xfde7e5ab, 0x2a0565f3, 0x603beeb8, 0xb7d96ee0,
+    0x148fe849, 0xc36d6811, 0x80f2fedf, 0x57107e87, 0xf446f82e,
+    0x23a47876, 0x699af33d, 0xbe787365, 0x1d2ef5cc, 0xcacc7594,
+    0xbc95ae44, 0x6b772e1c, 0xc821a8b5, 0x1fc328ed, 0x55fda3a6,
+    0x821f23fe, 0x2149a557, 0xf6ab250f, 0xb534b3c1, 0x62d63399,
+    0xc180b530, 0x16623568, 0x5c5cbe23, 0x8bbe3e7b, 0x28e8b8d2,
+    0xff0a388a, 0xafd7954e, 0x78351516, 0xdb6393bf, 0x0c8113e7,
+    0x46bf98ac, 0x915d18f4, 0x320b9e5d, 0xe5e91e05, 0xa67688cb,
+    0x71940893, 0xd2c28e3a, 0x05200e62, 0x4f1e8529, 0x98fc0571,
+    0x3baa83d8, 0xec480380, 0xd7193478, 0x00fbb420, 0xa3ad3289,
+    0x744fb2d1, 0x3e71399a, 0xe993b9c2, 0x4ac53f6b, 0x9d27bf33,
+    0xdeb829fd, 0x095aa9a5, 0xaa0c2f0c, 0x7deeaf54, 0x37d0241f,
+    0xe032a447, 0x436422ee, 0x9486a2b6, 0xc45b0f72, 0x13b98f2a,
+    0xb0ef0983, 0x670d89db, 0x2d330290, 0xfad182c8, 0x59870461,
+    0x8e658439, 0xcdfa12f7, 0x1a1892af, 0xb94e1406, 0x6eac945e,
+    0x24921f15, 0xf3709f4d, 0x502619e4, 0x87c499bc, 0xf19d426c,
+    0x267fc234, 0x8529449d, 0x52cbc4c5, 0x18f54f8e, 0xcf17cfd6,
+    0x6c41497f, 0xbba3c927, 0xf83c5fe9, 0x2fdedfb1, 0x8c885918,
+    0x5b6ad940, 0x1154520b, 0xc6b6d253, 0x65e054fa, 0xb202d4a2,
+    0xe2df7966, 0x353df93e, 0x966b7f97, 0x4189ffcf, 0x0bb77484,
+    0xdc55f4dc, 0x7f037275, 0xa8e1f22d, 0xeb7e64e3, 0x3c9ce4bb,
+    0x9fca6212, 0x4828e24a, 0x02166901, 0xd5f4e959, 0x76a26ff0,
+    0xa140efa8},
+   {0x00000000, 0xef52b6e1, 0x05d46b83, 0xea86dd62, 0x0ba8d706,
+    0xe4fa61e7, 0x0e7cbc85, 0xe12e0a64, 0x1751ae0c, 0xf80318ed,
+    0x1285c58f, 0xfdd7736e, 0x1cf9790a, 0xf3abcfeb, 0x192d1289,
+    0xf67fa468, 0x2ea35c18, 0xc1f1eaf9, 0x2b77379b, 0xc425817a,
+    0x250b8b1e, 0xca593dff, 0x20dfe09d, 0xcf8d567c, 0x39f2f214,
+    0xd6a044f5, 0x3c269997, 0xd3742f76, 0x325a2512, 0xdd0893f3,
+    0x378e4e91, 0xd8dcf870, 0x5d46b830, 0xb2140ed1, 0x5892d3b3,
+    0xb7c06552, 0x56ee6f36, 0xb9bcd9d7, 0x533a04b5, 0xbc68b254,
+    0x4a17163c, 0xa545a0dd, 0x4fc37dbf, 0xa091cb5e, 0x41bfc13a,
+    0xaeed77db, 0x446baab9, 0xab391c58, 0x73e5e428, 0x9cb752c9,
+    0x76318fab, 0x9963394a, 0x784d332e, 0x971f85cf, 0x7d9958ad,
+    0x92cbee4c, 0x64b44a24, 0x8be6fcc5, 0x616021a7, 0x8e329746,
+    0x6f1c9d22, 0x804e2bc3, 0x6ac8f6a1, 0x859a4040, 0xba8d7060,
+    0x55dfc681, 0xbf591be3, 0x500bad02, 0xb125a766, 0x5e771187,
+    0xb4f1cce5, 0x5ba37a04, 0xaddcde6c, 0x428e688d, 0xa808b5ef,
+    0x475a030e, 0xa674096a, 0x4926bf8b, 0xa3a062e9, 0x4cf2d408,
+    0x942e2c78, 0x7b7c9a99, 0x91fa47fb, 0x7ea8f11a, 0x9f86fb7e,
+    0x70d44d9f, 0x9a5290fd, 0x7500261c, 0x837f8274, 0x6c2d3495,
+    0x86abe9f7, 0x69f95f16, 0x88d75572, 0x6785e393, 0x8d033ef1,
+    0x62518810, 0xe7cbc850, 0x08997eb1, 0xe21fa3d3, 0x0d4d1532,
+    0xec631f56, 0x0331a9b7, 0xe9b774d5, 0x06e5c234, 0xf09a665c,
+    0x1fc8d0bd, 0xf54e0ddf, 0x1a1cbb3e, 0xfb32b15a, 0x146007bb,
+    0xfee6dad9, 0x11b46c38, 0xc9689448, 0x263a22a9, 0xccbcffcb,
+    0x23ee492a, 0xc2c0434e, 0x2d92f5af, 0xc71428cd, 0x28469e2c,
+    0xde393a44, 0x316b8ca5, 0xdbed51c7, 0x34bfe726, 0xd591ed42,
+    0x3ac35ba3, 0xd04586c1, 0x3f173020, 0xae6be681, 0x41395060,
+    0xabbf8d02, 0x44ed3be3, 0xa5c33187, 0x4a918766, 0xa0175a04,
+    0x4f45ece5, 0xb93a488d, 0x5668fe6c, 0xbcee230e, 0x53bc95ef,
+    0xb2929f8b, 0x5dc0296a, 0xb746f408, 0x581442e9, 0x80c8ba99,
+    0x6f9a0c78, 0x851cd11a, 0x6a4e67fb, 0x8b606d9f, 0x6432db7e,
+    0x8eb4061c, 0x61e6b0fd, 0x97991495, 0x78cba274, 0x924d7f16,
+    0x7d1fc9f7, 0x9c31c393, 0x73637572, 0x99e5a810, 0x76b71ef1,
+    0xf32d5eb1, 0x1c7fe850, 0xf6f93532, 0x19ab83d3, 0xf88589b7,
+    0x17d73f56, 0xfd51e234, 0x120354d5, 0xe47cf0bd, 0x0b2e465c,
+    0xe1a89b3e, 0x0efa2ddf, 0xefd427bb, 0x0086915a, 0xea004c38,
+    0x0552fad9, 0xdd8e02a9, 0x32dcb448, 0xd85a692a, 0x3708dfcb,
+    0xd626d5af, 0x3974634e, 0xd3f2be2c, 0x3ca008cd, 0xcadfaca5,
+    0x258d1a44, 0xcf0bc726, 0x205971c7, 0xc1777ba3, 0x2e25cd42,
+    0xc4a31020, 0x2bf1a6c1, 0x14e696e1, 0xfbb42000, 0x1132fd62,
+    0xfe604b83, 0x1f4e41e7, 0xf01cf706, 0x1a9a2a64, 0xf5c89c85,
+    0x03b738ed, 0xece58e0c, 0x0663536e, 0xe931e58f, 0x081fefeb,
+    0xe74d590a, 0x0dcb8468, 0xe2993289, 0x3a45caf9, 0xd5177c18,
+    0x3f91a17a, 0xd0c3179b, 0x31ed1dff, 0xdebfab1e, 0x3439767c,
+    0xdb6bc09d, 0x2d1464f5, 0xc246d214, 0x28c00f76, 0xc792b997,
+    0x26bcb3f3, 0xc9ee0512, 0x2368d870, 0xcc3a6e91, 0x49a02ed1,
+    0xa6f29830, 0x4c744552, 0xa326f3b3, 0x4208f9d7, 0xad5a4f36,
+    0x47dc9254, 0xa88e24b5, 0x5ef180dd, 0xb1a3363c, 0x5b25eb5e,
+    0xb4775dbf, 0x555957db, 0xba0be13a, 0x508d3c58, 0xbfdf8ab9,
+    0x670372c9, 0x8851c428, 0x62d7194a, 0x8d85afab, 0x6caba5cf,
+    0x83f9132e, 0x697fce4c, 0x862d78ad, 0x7052dcc5, 0x9f006a24,
+    0x7586b746, 0x9ad401a7, 0x7bfa0bc3, 0x94a8bd22, 0x7e2e6040,
+    0x917cd6a1},
+   {0x00000000, 0x87a6cb43, 0xd43c90c7, 0x539a5b84, 0x730827cf,
+    0xf4aeec8c, 0xa734b708, 0x20927c4b, 0xe6104f9e, 0x61b684dd,
+    0x322cdf59, 0xb58a141a, 0x95186851, 0x12bea312, 0x4124f896,
+    0xc68233d5, 0x1751997d, 0x90f7523e, 0xc36d09ba, 0x44cbc2f9,
+    0x6459beb2, 0xe3ff75f1, 0xb0652e75, 0x37c3e536, 0xf141d6e3,
+    0x76e71da0, 0x257d4624, 0xa2db8d67, 0x8249f12c, 0x05ef3a6f,
+    0x567561eb, 0xd1d3aaa8, 0x2ea332fa, 0xa905f9b9, 0xfa9fa23d,
+    0x7d39697e, 0x5dab1535, 0xda0dde76, 0x899785f2, 0x0e314eb1,
+    0xc8b37d64, 0x4f15b627, 0x1c8feda3, 0x9b2926e0, 0xbbbb5aab,
+    0x3c1d91e8, 0x6f87ca6c, 0xe821012f, 0x39f2ab87, 0xbe5460c4,
+    0xedce3b40, 0x6a68f003, 0x4afa8c48, 0xcd5c470b, 0x9ec61c8f,
+    0x1960d7cc, 0xdfe2e419, 0x58442f5a, 0x0bde74de, 0x8c78bf9d,
+    0xaceac3d6, 0x2b4c0895, 0x78d65311, 0xff709852, 0x5d4665f4,
+    0xdae0aeb7, 0x897af533, 0x0edc3e70, 0x2e4e423b, 0xa9e88978,
+    0xfa72d2fc, 0x7dd419bf, 0xbb562a6a, 0x3cf0e129, 0x6f6abaad,
+    0xe8cc71ee, 0xc85e0da5, 0x4ff8c6e6, 0x1c629d62, 0x9bc45621,
+    0x4a17fc89, 0xcdb137ca, 0x9e2b6c4e, 0x198da70d, 0x391fdb46,
+    0xbeb91005, 0xed234b81, 0x6a8580c2, 0xac07b317, 0x2ba17854,
+    0x783b23d0, 0xff9de893, 0xdf0f94d8, 0x58a95f9b, 0x0b33041f,
+    0x8c95cf5c, 0x73e5570e, 0xf4439c4d, 0xa7d9c7c9, 0x207f0c8a,
+    0x00ed70c1, 0x874bbb82, 0xd4d1e006, 0x53772b45, 0x95f51890,
+    0x1253d3d3, 0x41c98857, 0xc66f4314, 0xe6fd3f5f, 0x615bf41c,
+    0x32c1af98, 0xb56764db, 0x64b4ce73, 0xe3120530, 0xb0885eb4,
+    0x372e95f7, 0x17bce9bc, 0x901a22ff, 0xc380797b, 0x4426b238,
+    0x82a481ed, 0x05024aae, 0x5698112a, 0xd13eda69, 0xf1aca622,
+    0x760a6d61, 0x259036e5, 0xa236fda6, 0xba8ccbe8, 0x3d2a00ab,
+    0x6eb05b2f, 0xe916906c, 0xc984ec27, 0x4e222764, 0x1db87ce0,
+    0x9a1eb7a3, 0x5c9c8476, 0xdb3a4f35, 0x88a014b1, 0x0f06dff2,
+    0x2f94a3b9, 0xa83268fa, 0xfba8337e, 0x7c0ef83d, 0xaddd5295,
+    0x2a7b99d6, 0x79e1c252, 0xfe470911, 0xded5755a, 0x5973be19,
+    0x0ae9e59d, 0x8d4f2ede, 0x4bcd1d0b, 0xcc6bd648, 0x9ff18dcc,
+    0x1857468f, 0x38c53ac4, 0xbf63f187, 0xecf9aa03, 0x6b5f6140,
+    0x942ff912, 0x13893251, 0x401369d5, 0xc7b5a296, 0xe727dedd,
+    0x6081159e, 0x331b4e1a, 0xb4bd8559, 0x723fb68c, 0xf5997dcf,
+    0xa603264b, 0x21a5ed08, 0x01379143, 0x86915a00, 0xd50b0184,
+    0x52adcac7, 0x837e606f, 0x04d8ab2c, 0x5742f0a8, 0xd0e43beb,
+    0xf07647a0, 0x77d08ce3, 0x244ad767, 0xa3ec1c24, 0x656e2ff1,
+    0xe2c8e4b2, 0xb152bf36, 0x36f47475, 0x1666083e, 0x91c0c37d,
+    0xc25a98f9, 0x45fc53ba, 0xe7caae1c, 0x606c655f, 0x33f63edb,
+    0xb450f598, 0x94c289d3, 0x13644290, 0x40fe1914, 0xc758d257,
+    0x01dae182, 0x867c2ac1, 0xd5e67145, 0x5240ba06, 0x72d2c64d,
+    0xf5740d0e, 0xa6ee568a, 0x21489dc9, 0xf09b3761, 0x773dfc22,
+    0x24a7a7a6, 0xa3016ce5, 0x839310ae, 0x0435dbed, 0x57af8069,
+    0xd0094b2a, 0x168b78ff, 0x912db3bc, 0xc2b7e838, 0x4511237b,
+    0x65835f30, 0xe2259473, 0xb1bfcff7, 0x361904b4, 0xc9699ce6,
+    0x4ecf57a5, 0x1d550c21, 0x9af3c762, 0xba61bb29, 0x3dc7706a,
+    0x6e5d2bee, 0xe9fbe0ad, 0x2f79d378, 0xa8df183b, 0xfb4543bf,
+    0x7ce388fc, 0x5c71f4b7, 0xdbd73ff4, 0x884d6470, 0x0febaf33,
+    0xde38059b, 0x599eced8, 0x0a04955c, 0x8da25e1f, 0xad302254,
+    0x2a96e917, 0x790cb293, 0xfeaa79d0, 0x38284a05, 0xbf8e8146,
+    0xec14dac2, 0x6bb21181, 0x4b206dca, 0xcc86a689, 0x9f1cfd0d,
+    0x18ba364e}};
+
+static const z_word_t crc_braid_big_table[][256] = {
+   {0x00000000, 0x43cba687, 0xc7903cd4, 0x845b9a53, 0xcf270873,
+    0x8cecaef4, 0x08b734a7, 0x4b7c9220, 0x9e4f10e6, 0xdd84b661,
+    0x59df2c32, 0x1a148ab5, 0x51681895, 0x12a3be12, 0x96f82441,
+    0xd53382c6, 0x7d995117, 0x3e52f790, 0xba096dc3, 0xf9c2cb44,
+    0xb2be5964, 0xf175ffe3, 0x752e65b0, 0x36e5c337, 0xe3d641f1,
+    0xa01de776, 0x24467d25, 0x678ddba2, 0x2cf14982, 0x6f3aef05,
+    0xeb617556, 0xa8aad3d1, 0xfa32a32e, 0xb9f905a9, 0x3da29ffa,
+    0x7e69397d, 0x3515ab5d, 0x76de0dda, 0xf2859789, 0xb14e310e,
+    0x647db3c8, 0x27b6154f, 0xa3ed8f1c, 0xe026299b, 0xab5abbbb,
+    0xe8911d3c, 0x6cca876f, 0x2f0121e8, 0x87abf239, 0xc46054be,
+    0x403bceed, 0x03f0686a, 0x488cfa4a, 0x0b475ccd, 0x8f1cc69e,
+    0xccd76019, 0x19e4e2df, 0x5a2f4458, 0xde74de0b, 0x9dbf788c,
+    0xd6c3eaac, 0x95084c2b, 0x1153d678, 0x529870ff, 0xf465465d,
+    0xb7aee0da, 0x33f57a89, 0x703edc0e, 0x3b424e2e, 0x7889e8a9,
+    0xfcd272fa, 0xbf19d47d, 0x6a2a56bb, 0x29e1f03c, 0xadba6a6f,
+    0xee71cce8, 0xa50d5ec8, 0xe6c6f84f, 0x629d621c, 0x2156c49b,
+    0x89fc174a, 0xca37b1cd, 0x4e6c2b9e, 0x0da78d19, 0x46db1f39,
+    0x0510b9be, 0x814b23ed, 0xc280856a, 0x17b307ac, 0x5478a12b,
+    0xd0233b78, 0x93e89dff, 0xd8940fdf, 0x9b5fa958, 0x1f04330b,
+    0x5ccf958c, 0x0e57e573, 0x4d9c43f4, 0xc9c7d9a7, 0x8a0c7f20,
+    0xc170ed00, 0x82bb4b87, 0x06e0d1d4, 0x452b7753, 0x9018f595,
+    0xd3d35312, 0x5788c941, 0x14436fc6, 0x5f3ffde6, 0x1cf45b61,
+    0x98afc132, 0xdb6467b5, 0x73ceb464, 0x300512e3, 0xb45e88b0,
+    0xf7952e37, 0xbce9bc17, 0xff221a90, 0x7b7980c3, 0x38b22644,
+    0xed81a482, 0xae4a0205, 0x2a119856, 0x69da3ed1, 0x22a6acf1,
+    0x616d0a76, 0xe5369025, 0xa6fd36a2, 0xe8cb8cba, 0xab002a3d,
+    0x2f5bb06e, 0x6c9016e9, 0x27ec84c9, 0x6427224e, 0xe07cb81d,
+    0xa3b71e9a, 0x76849c5c, 0x354f3adb, 0xb114a088, 0xf2df060f,
+    0xb9a3942f, 0xfa6832a8, 0x7e33a8fb, 0x3df80e7c, 0x9552ddad,
+    0xd6997b2a, 0x52c2e179, 0x110947fe, 0x5a75d5de, 0x19be7359,
+    0x9de5e90a, 0xde2e4f8d, 0x0b1dcd4b, 0x48d66bcc, 0xcc8df19f,
+    0x8f465718, 0xc43ac538, 0x87f163bf, 0x03aaf9ec, 0x40615f6b,
+    0x12f92f94, 0x51328913, 0xd5691340, 0x96a2b5c7, 0xddde27e7,
+    0x9e158160, 0x1a4e1b33, 0x5985bdb4, 0x8cb63f72, 0xcf7d99f5,
+    0x4b2603a6, 0x08eda521, 0x43913701, 0x005a9186, 0x84010bd5,
+    0xc7caad52, 0x6f607e83, 0x2cabd804, 0xa8f04257, 0xeb3be4d0,
+    0xa04776f0, 0xe38cd077, 0x67d74a24, 0x241ceca3, 0xf12f6e65,
+    0xb2e4c8e2, 0x36bf52b1, 0x7574f436, 0x3e086616, 0x7dc3c091,
+    0xf9985ac2, 0xba53fc45, 0x1caecae7, 0x5f656c60, 0xdb3ef633,
+    0x98f550b4, 0xd389c294, 0x90426413, 0x1419fe40, 0x57d258c7,
+    0x82e1da01, 0xc12a7c86, 0x4571e6d5, 0x06ba4052, 0x4dc6d272,
+    0x0e0d74f5, 0x8a56eea6, 0xc99d4821, 0x61379bf0, 0x22fc3d77,
+    0xa6a7a724, 0xe56c01a3, 0xae109383, 0xeddb3504, 0x6980af57,
+    0x2a4b09d0, 0xff788b16, 0xbcb32d91, 0x38e8b7c2, 0x7b231145,
+    0x305f8365, 0x739425e2, 0xf7cfbfb1, 0xb4041936, 0xe69c69c9,
+    0xa557cf4e, 0x210c551d, 0x62c7f39a, 0x29bb61ba, 0x6a70c73d,
+    0xee2b5d6e, 0xade0fbe9, 0x78d3792f, 0x3b18dfa8, 0xbf4345fb,
+    0xfc88e37c, 0xb7f4715c, 0xf43fd7db, 0x70644d88, 0x33afeb0f,
+    0x9b0538de, 0xd8ce9e59, 0x5c95040a, 0x1f5ea28d, 0x542230ad,
+    0x17e9962a, 0x93b20c79, 0xd079aafe, 0x054a2838, 0x46818ebf,
+    0xc2da14ec, 0x8111b26b, 0xca6d204b, 0x89a686cc, 0x0dfd1c9f,
+    0x4e36ba18},
+   {0x00000000, 0xe1b652ef, 0x836bd405, 0x62dd86ea, 0x06d7a80b,
+    0xe761fae4, 0x85bc7c0e, 0x640a2ee1, 0x0cae5117, 0xed1803f8,
+    0x8fc58512, 0x6e73d7fd, 0x0a79f91c, 0xebcfabf3, 0x89122d19,
+    0x68a47ff6, 0x185ca32e, 0xf9eaf1c1, 0x9b37772b, 0x7a8125c4,
+    0x1e8b0b25, 0xff3d59ca, 0x9de0df20, 0x7c568dcf, 0x14f2f239,
+    0xf544a0d6, 0x9799263c, 0x762f74d3, 0x12255a32, 0xf39308dd,
+    0x914e8e37, 0x70f8dcd8, 0x30b8465d, 0xd10e14b2, 0xb3d39258,
+    0x5265c0b7, 0x366fee56, 0xd7d9bcb9, 0xb5043a53, 0x54b268bc,
+    0x3c16174a, 0xdda045a5, 0xbf7dc34f, 0x5ecb91a0, 0x3ac1bf41,
+    0xdb77edae, 0xb9aa6b44, 0x581c39ab, 0x28e4e573, 0xc952b79c,
+    0xab8f3176, 0x4a396399, 0x2e334d78, 0xcf851f97, 0xad58997d,
+    0x4ceecb92, 0x244ab464, 0xc5fce68b, 0xa7216061, 0x4697328e,
+    0x229d1c6f, 0xc32b4e80, 0xa1f6c86a, 0x40409a85, 0x60708dba,
+    0x81c6df55, 0xe31b59bf, 0x02ad0b50, 0x66a725b1, 0x8711775e,
+    0xe5ccf1b4, 0x047aa35b, 0x6cdedcad, 0x8d688e42, 0xefb508a8,
+    0x0e035a47, 0x6a0974a6, 0x8bbf2649, 0xe962a0a3, 0x08d4f24c,
+    0x782c2e94, 0x999a7c7b, 0xfb47fa91, 0x1af1a87e, 0x7efb869f,
+    0x9f4dd470, 0xfd90529a, 0x1c260075, 0x74827f83, 0x95342d6c,
+    0xf7e9ab86, 0x165ff969, 0x7255d788, 0x93e38567, 0xf13e038d,
+    0x10885162, 0x50c8cbe7, 0xb17e9908, 0xd3a31fe2, 0x32154d0d,
+    0x561f63ec, 0xb7a93103, 0xd574b7e9, 0x34c2e506, 0x5c669af0,
+    0xbdd0c81f, 0xdf0d4ef5, 0x3ebb1c1a, 0x5ab132fb, 0xbb076014,
+    0xd9dae6fe, 0x386cb411, 0x489468c9, 0xa9223a26, 0xcbffbccc,
+    0x2a49ee23, 0x4e43c0c2, 0xaff5922d, 0xcd2814c7, 0x2c9e4628,
+    0x443a39de, 0xa58c6b31, 0xc751eddb, 0x26e7bf34, 0x42ed91d5,
+    0xa35bc33a, 0xc18645d0, 0x2030173f, 0x81e66bae, 0x60503941,
+    0x028dbfab, 0xe33bed44, 0x8731c3a5, 0x6687914a, 0x045a17a0,
+    0xe5ec454f, 0x8d483ab9, 0x6cfe6856, 0x0e23eebc, 0xef95bc53,
+    0x8b9f92b2, 0x6a29c05d, 0x08f446b7, 0xe9421458, 0x99bac880,
+    0x780c9a6f, 0x1ad11c85, 0xfb674e6a, 0x9f6d608b, 0x7edb3264,
+    0x1c06b48e, 0xfdb0e661, 0x95149997, 0x74a2cb78, 0x167f4d92,
+    0xf7c91f7d, 0x93c3319c, 0x72756373, 0x10a8e599, 0xf11eb776,
+    0xb15e2df3, 0x50e87f1c, 0x3235f9f6, 0xd383ab19, 0xb78985f8,
+    0x563fd717, 0x34e251fd, 0xd5540312, 0xbdf07ce4, 0x5c462e0b,
+    0x3e9ba8e1, 0xdf2dfa0e, 0xbb27d4ef, 0x5a918600, 0x384c00ea,
+    0xd9fa5205, 0xa9028edd, 0x48b4dc32, 0x2a695ad8, 0xcbdf0837,
+    0xafd526d6, 0x4e637439, 0x2cbef2d3, 0xcd08a03c, 0xa5acdfca,
+    0x441a8d25, 0x26c70bcf, 0xc7715920, 0xa37b77c1, 0x42cd252e,
+    0x2010a3c4, 0xc1a6f12b, 0xe196e614, 0x0020b4fb, 0x62fd3211,
+    0x834b60fe, 0xe7414e1f, 0x06f71cf0, 0x642a9a1a, 0x859cc8f5,
+    0xed38b703, 0x0c8ee5ec, 0x6e536306, 0x8fe531e9, 0xebef1f08,
+    0x0a594de7, 0x6884cb0d, 0x893299e2, 0xf9ca453a, 0x187c17d5,
+    0x7aa1913f, 0x9b17c3d0, 0xff1ded31, 0x1eabbfde, 0x7c763934,
+    0x9dc06bdb, 0xf564142d, 0x14d246c2, 0x760fc028, 0x97b992c7,
+    0xf3b3bc26, 0x1205eec9, 0x70d86823, 0x916e3acc, 0xd12ea049,
+    0x3098f2a6, 0x5245744c, 0xb3f326a3, 0xd7f90842, 0x364f5aad,
+    0x5492dc47, 0xb5248ea8, 0xdd80f15e, 0x3c36a3b1, 0x5eeb255b,
+    0xbf5d77b4, 0xdb575955, 0x3ae10bba, 0x583c8d50, 0xb98adfbf,
+    0xc9720367, 0x28c45188, 0x4a19d762, 0xabaf858d, 0xcfa5ab6c,
+    0x2e13f983, 0x4cce7f69, 0xad782d86, 0xc5dc5270, 0x246a009f,
+    0x46b78675, 0xa701d49a, 0xc30bfa7b, 0x22bda894, 0x40602e7e,
+    0xa1d67c91},
+   {0x00000000, 0x5880e2d7, 0xf106b474, 0xa98656a3, 0xe20d68e9,
+    0xba8d8a3e, 0x130bdc9d, 0x4b8b3e4a, 0x851da109, 0xdd9d43de,
+    0x741b157d, 0x2c9bf7aa, 0x6710c9e0, 0x3f902b37, 0x96167d94,
+    0xce969f43, 0x0a3b4213, 0x52bba0c4, 0xfb3df667, 0xa3bd14b0,
+    0xe8362afa, 0xb0b6c82d, 0x19309e8e, 0x41b07c59, 0x8f26e31a,
+    0xd7a601cd, 0x7e20576e, 0x26a0b5b9, 0x6d2b8bf3, 0x35ab6924,
+    0x9c2d3f87, 0xc4addd50, 0x14768426, 0x4cf666f1, 0xe5703052,
+    0xbdf0d285, 0xf67beccf, 0xaefb0e18, 0x077d58bb, 0x5ffdba6c,
+    0x916b252f, 0xc9ebc7f8, 0x606d915b, 0x38ed738c, 0x73664dc6,
+    0x2be6af11, 0x8260f9b2, 0xdae01b65, 0x1e4dc635, 0x46cd24e2,
+    0xef4b7241, 0xb7cb9096, 0xfc40aedc, 0xa4c04c0b, 0x0d461aa8,
+    0x55c6f87f, 0x9b50673c, 0xc3d085eb, 0x6a56d348, 0x32d6319f,
+    0x795d0fd5, 0x21dded02, 0x885bbba1, 0xd0db5976, 0x28ec084d,
+    0x706cea9a, 0xd9eabc39, 0x816a5eee, 0xcae160a4, 0x92618273,
+    0x3be7d4d0, 0x63673607, 0xadf1a944, 0xf5714b93, 0x5cf71d30,
+    0x0477ffe7, 0x4ffcc1ad, 0x177c237a, 0xbefa75d9, 0xe67a970e,
+    0x22d74a5e, 0x7a57a889, 0xd3d1fe2a, 0x8b511cfd, 0xc0da22b7,
+    0x985ac060, 0x31dc96c3, 0x695c7414, 0xa7caeb57, 0xff4a0980,
+    0x56cc5f23, 0x0e4cbdf4, 0x45c783be, 0x1d476169, 0xb4c137ca,
+    0xec41d51d, 0x3c9a8c6b, 0x641a6ebc, 0xcd9c381f, 0x951cdac8,
+    0xde97e482, 0x86170655, 0x2f9150f6, 0x7711b221, 0xb9872d62,
+    0xe107cfb5, 0x48819916, 0x10017bc1, 0x5b8a458b, 0x030aa75c,
+    0xaa8cf1ff, 0xf20c1328, 0x36a1ce78, 0x6e212caf, 0xc7a77a0c,
+    0x9f2798db, 0xd4aca691, 0x8c2c4446, 0x25aa12e5, 0x7d2af032,
+    0xb3bc6f71, 0xeb3c8da6, 0x42badb05, 0x1a3a39d2, 0x51b10798,
+    0x0931e54f, 0xa0b7b3ec, 0xf837513b, 0x50d8119a, 0x0858f34d,
+    0xa1dea5ee, 0xf95e4739, 0xb2d57973, 0xea559ba4, 0x43d3cd07,
+    0x1b532fd0, 0xd5c5b093, 0x8d455244, 0x24c304e7, 0x7c43e630,
+    0x37c8d87a, 0x6f483aad, 0xc6ce6c0e, 0x9e4e8ed9, 0x5ae35389,
+    0x0263b15e, 0xabe5e7fd, 0xf365052a, 0xb8ee3b60, 0xe06ed9b7,
+    0x49e88f14, 0x11686dc3, 0xdffef280, 0x877e1057, 0x2ef846f4,
+    0x7678a423, 0x3df39a69, 0x657378be, 0xccf52e1d, 0x9475ccca,
+    0x44ae95bc, 0x1c2e776b, 0xb5a821c8, 0xed28c31f, 0xa6a3fd55,
+    0xfe231f82, 0x57a54921, 0x0f25abf6, 0xc1b334b5, 0x9933d662,
+    0x30b580c1, 0x68356216, 0x23be5c5c, 0x7b3ebe8b, 0xd2b8e828,
+    0x8a380aff, 0x4e95d7af, 0x16153578, 0xbf9363db, 0xe713810c,
+    0xac98bf46, 0xf4185d91, 0x5d9e0b32, 0x051ee9e5, 0xcb8876a6,
+    0x93089471, 0x3a8ec2d2, 0x620e2005, 0x29851e4f, 0x7105fc98,
+    0xd883aa3b, 0x800348ec, 0x783419d7, 0x20b4fb00, 0x8932ada3,
+    0xd1b24f74, 0x9a39713e, 0xc2b993e9, 0x6b3fc54a, 0x33bf279d,
+    0xfd29b8de, 0xa5a95a09, 0x0c2f0caa, 0x54afee7d, 0x1f24d037,
+    0x47a432e0, 0xee226443, 0xb6a28694, 0x720f5bc4, 0x2a8fb913,
+    0x8309efb0, 0xdb890d67, 0x9002332d, 0xc882d1fa, 0x61048759,
+    0x3984658e, 0xf712facd, 0xaf92181a, 0x06144eb9, 0x5e94ac6e,
+    0x151f9224, 0x4d9f70f3, 0xe4192650, 0xbc99c487, 0x6c429df1,
+    0x34c27f26, 0x9d442985, 0xc5c4cb52, 0x8e4ff518, 0xd6cf17cf,
+    0x7f49416c, 0x27c9a3bb, 0xe95f3cf8, 0xb1dfde2f, 0x1859888c,
+    0x40d96a5b, 0x0b525411, 0x53d2b6c6, 0xfa54e065, 0xa2d402b2,
+    0x6679dfe2, 0x3ef93d35, 0x977f6b96, 0xcfff8941, 0x8474b70b,
+    0xdcf455dc, 0x7572037f, 0x2df2e1a8, 0xe3647eeb, 0xbbe49c3c,
+    0x1262ca9f, 0x4ae22848, 0x01691602, 0x59e9f4d5, 0xf06fa276,
+    0xa8ef40a1},
+   {0x00000000, 0x463b6765, 0x8c76ceca, 0xca4da9af, 0x59ebed4e,
+    0x1fd08a2b, 0xd59d2384, 0x93a644e1, 0xb2d6db9d, 0xf4edbcf8,
+    0x3ea01557, 0x789b7232, 0xeb3d36d3, 0xad0651b6, 0x674bf819,
+    0x21709f7c, 0x25abc6e0, 0x6390a185, 0xa9dd082a, 0xefe66f4f,
+    0x7c402bae, 0x3a7b4ccb, 0xf036e564, 0xb60d8201, 0x977d1d7d,
+    0xd1467a18, 0x1b0bd3b7, 0x5d30b4d2, 0xce96f033, 0x88ad9756,
+    0x42e03ef9, 0x04db599c, 0x0b50fc1a, 0x4d6b9b7f, 0x872632d0,
+    0xc11d55b5, 0x52bb1154, 0x14807631, 0xdecddf9e, 0x98f6b8fb,
+    0xb9862787, 0xffbd40e2, 0x35f0e94d, 0x73cb8e28, 0xe06dcac9,
+    0xa656adac, 0x6c1b0403, 0x2a206366, 0x2efb3afa, 0x68c05d9f,
+    0xa28df430, 0xe4b69355, 0x7710d7b4, 0x312bb0d1, 0xfb66197e,
+    0xbd5d7e1b, 0x9c2de167, 0xda168602, 0x105b2fad, 0x566048c8,
+    0xc5c60c29, 0x83fd6b4c, 0x49b0c2e3, 0x0f8ba586, 0x16a0f835,
+    0x509b9f50, 0x9ad636ff, 0xdced519a, 0x4f4b157b, 0x0970721e,
+    0xc33ddbb1, 0x8506bcd4, 0xa47623a8, 0xe24d44cd, 0x2800ed62,
+    0x6e3b8a07, 0xfd9dcee6, 0xbba6a983, 0x71eb002c, 0x37d06749,
+    0x330b3ed5, 0x753059b0, 0xbf7df01f, 0xf946977a, 0x6ae0d39b,
+    0x2cdbb4fe, 0xe6961d51, 0xa0ad7a34, 0x81dde548, 0xc7e6822d,
+    0x0dab2b82, 0x4b904ce7, 0xd8360806, 0x9e0d6f63, 0x5440c6cc,
+    0x127ba1a9, 0x1df0042f, 0x5bcb634a, 0x9186cae5, 0xd7bdad80,
+    0x441be961, 0x02208e04, 0xc86d27ab, 0x8e5640ce, 0xaf26dfb2,
+    0xe91db8d7, 0x23501178, 0x656b761d, 0xf6cd32fc, 0xb0f65599,
+    0x7abbfc36, 0x3c809b53, 0x385bc2cf, 0x7e60a5aa, 0xb42d0c05,
+    0xf2166b60, 0x61b02f81, 0x278b48e4, 0xedc6e14b, 0xabfd862e,
+    0x8a8d1952, 0xccb67e37, 0x06fbd798, 0x40c0b0fd, 0xd366f41c,
+    0x955d9379, 0x5f103ad6, 0x192b5db3, 0x2c40f16b, 0x6a7b960e,
+    0xa0363fa1, 0xe60d58c4, 0x75ab1c25, 0x33907b40, 0xf9ddd2ef,
+    0xbfe6b58a, 0x9e962af6, 0xd8ad4d93, 0x12e0e43c, 0x54db8359,
+    0xc77dc7b8, 0x8146a0dd, 0x4b0b0972, 0x0d306e17, 0x09eb378b,
+    0x4fd050ee, 0x859df941, 0xc3a69e24, 0x5000dac5, 0x163bbda0,
+    0xdc76140f, 0x9a4d736a, 0xbb3dec16, 0xfd068b73, 0x374b22dc,
+    0x717045b9, 0xe2d60158, 0xa4ed663d, 0x6ea0cf92, 0x289ba8f7,
+    0x27100d71, 0x612b6a14, 0xab66c3bb, 0xed5da4de, 0x7efbe03f,
+    0x38c0875a, 0xf28d2ef5, 0xb4b64990, 0x95c6d6ec, 0xd3fdb189,
+    0x19b01826, 0x5f8b7f43, 0xcc2d3ba2, 0x8a165cc7, 0x405bf568,
+    0x0660920d, 0x02bbcb91, 0x4480acf4, 0x8ecd055b, 0xc8f6623e,
+    0x5b5026df, 0x1d6b41ba, 0xd726e815, 0x911d8f70, 0xb06d100c,
+    0xf6567769, 0x3c1bdec6, 0x7a20b9a3, 0xe986fd42, 0xafbd9a27,
+    0x65f03388, 0x23cb54ed, 0x3ae0095e, 0x7cdb6e3b, 0xb696c794,
+    0xf0ada0f1, 0x630be410, 0x25308375, 0xef7d2ada, 0xa9464dbf,
+    0x8836d2c3, 0xce0db5a6, 0x04401c09, 0x427b7b6c, 0xd1dd3f8d,
+    0x97e658e8, 0x5dabf147, 0x1b909622, 0x1f4bcfbe, 0x5970a8db,
+    0x933d0174, 0xd5066611, 0x46a022f0, 0x009b4595, 0xcad6ec3a,
+    0x8ced8b5f, 0xad9d1423, 0xeba67346, 0x21ebdae9, 0x67d0bd8c,
+    0xf476f96d, 0xb24d9e08, 0x780037a7, 0x3e3b50c2, 0x31b0f544,
+    0x778b9221, 0xbdc63b8e, 0xfbfd5ceb, 0x685b180a, 0x2e607f6f,
+    0xe42dd6c0, 0xa216b1a5, 0x83662ed9, 0xc55d49bc, 0x0f10e013,
+    0x492b8776, 0xda8dc397, 0x9cb6a4f2, 0x56fb0d5d, 0x10c06a38,
+    0x141b33a4, 0x522054c1, 0x986dfd6e, 0xde569a0b, 0x4df0deea,
+    0x0bcbb98f, 0xc1861020, 0x87bd7745, 0xa6cde839, 0xe0f68f5c,
+    0x2abb26f3, 0x6c804196, 0xff260577, 0xb91d6212, 0x7350cbbd,
+    0x356bacd8}};
+
+#endif /* W */
+
+#endif /* N == 5 */
+#if N == 6
+
+#if W == 8
+
+static const uint32_t crc_braid_table[][256] = {
+   {0x00000000, 0x3db1ecdc, 0x7b63d9b8, 0x46d23564, 0xf6c7b370,
+    0xcb765fac, 0x8da46ac8, 0xb0158614, 0x36fe60a1, 0x0b4f8c7d,
+    0x4d9db919, 0x702c55c5, 0xc039d3d1, 0xfd883f0d, 0xbb5a0a69,
+    0x86ebe6b5, 0x6dfcc142, 0x504d2d9e, 0x169f18fa, 0x2b2ef426,
+    0x9b3b7232, 0xa68a9eee, 0xe058ab8a, 0xdde94756, 0x5b02a1e3,
+    0x66b34d3f, 0x2061785b, 0x1dd09487, 0xadc51293, 0x9074fe4f,
+    0xd6a6cb2b, 0xeb1727f7, 0xdbf98284, 0xe6486e58, 0xa09a5b3c,
+    0x9d2bb7e0, 0x2d3e31f4, 0x108fdd28, 0x565de84c, 0x6bec0490,
+    0xed07e225, 0xd0b60ef9, 0x96643b9d, 0xabd5d741, 0x1bc05155,
+    0x2671bd89, 0x60a388ed, 0x5d126431, 0xb60543c6, 0x8bb4af1a,
+    0xcd669a7e, 0xf0d776a2, 0x40c2f0b6, 0x7d731c6a, 0x3ba1290e,
+    0x0610c5d2, 0x80fb2367, 0xbd4acfbb, 0xfb98fadf, 0xc6291603,
+    0x763c9017, 0x4b8d7ccb, 0x0d5f49af, 0x30eea573, 0x6c820349,
+    0x5133ef95, 0x17e1daf1, 0x2a50362d, 0x9a45b039, 0xa7f45ce5,
+    0xe1266981, 0xdc97855d, 0x5a7c63e8, 0x67cd8f34, 0x211fba50,
+    0x1cae568c, 0xacbbd098, 0x910a3c44, 0xd7d80920, 0xea69e5fc,
+    0x017ec20b, 0x3ccf2ed7, 0x7a1d1bb3, 0x47acf76f, 0xf7b9717b,
+    0xca089da7, 0x8cdaa8c3, 0xb16b441f, 0x3780a2aa, 0x0a314e76,
+    0x4ce37b12, 0x715297ce, 0xc14711da, 0xfcf6fd06, 0xba24c862,
+    0x879524be, 0xb77b81cd, 0x8aca6d11, 0xcc185875, 0xf1a9b4a9,
+    0x41bc32bd, 0x7c0dde61, 0x3adfeb05, 0x076e07d9, 0x8185e16c,
+    0xbc340db0, 0xfae638d4, 0xc757d408, 0x7742521c, 0x4af3bec0,
+    0x0c218ba4, 0x31906778, 0xda87408f, 0xe736ac53, 0xa1e49937,
+    0x9c5575eb, 0x2c40f3ff, 0x11f11f23, 0x57232a47, 0x6a92c69b,
+    0xec79202e, 0xd1c8ccf2, 0x971af996, 0xaaab154a, 0x1abe935e,
+    0x270f7f82, 0x61dd4ae6, 0x5c6ca63a, 0xd9040692, 0xe4b5ea4e,
+    0xa267df2a, 0x9fd633f6, 0x2fc3b5e2, 0x1272593e, 0x54a06c5a,
+    0x69118086, 0xeffa6633, 0xd24b8aef, 0x9499bf8b, 0xa9285357,
+    0x193dd543, 0x248c399f, 0x625e0cfb, 0x5fefe027, 0xb4f8c7d0,
+    0x89492b0c, 0xcf9b1e68, 0xf22af2b4, 0x423f74a0, 0x7f8e987c,
+    0x395cad18, 0x04ed41c4, 0x8206a771, 0xbfb74bad, 0xf9657ec9,
+    0xc4d49215, 0x74c11401, 0x4970f8dd, 0x0fa2cdb9, 0x32132165,
+    0x02fd8416, 0x3f4c68ca, 0x799e5dae, 0x442fb172, 0xf43a3766,
+    0xc98bdbba, 0x8f59eede, 0xb2e80202, 0x3403e4b7, 0x09b2086b,
+    0x4f603d0f, 0x72d1d1d3, 0xc2c457c7, 0xff75bb1b, 0xb9a78e7f,
+    0x841662a3, 0x6f014554, 0x52b0a988, 0x14629cec, 0x29d37030,
+    0x99c6f624, 0xa4771af8, 0xe2a52f9c, 0xdf14c340, 0x59ff25f5,
+    0x644ec929, 0x229cfc4d, 0x1f2d1091, 0xaf389685, 0x92897a59,
+    0xd45b4f3d, 0xe9eaa3e1, 0xb58605db, 0x8837e907, 0xcee5dc63,
+    0xf35430bf, 0x4341b6ab, 0x7ef05a77, 0x38226f13, 0x059383cf,
+    0x8378657a, 0xbec989a6, 0xf81bbcc2, 0xc5aa501e, 0x75bfd60a,
+    0x480e3ad6, 0x0edc0fb2, 0x336de36e, 0xd87ac499, 0xe5cb2845,
+    0xa3191d21, 0x9ea8f1fd, 0x2ebd77e9, 0x130c9b35, 0x55deae51,
+    0x686f428d, 0xee84a438, 0xd33548e4, 0x95e77d80, 0xa856915c,
+    0x18431748, 0x25f2fb94, 0x6320cef0, 0x5e91222c, 0x6e7f875f,
+    0x53ce6b83, 0x151c5ee7, 0x28adb23b, 0x98b8342f, 0xa509d8f3,
+    0xe3dbed97, 0xde6a014b, 0x5881e7fe, 0x65300b22, 0x23e23e46,
+    0x1e53d29a, 0xae46548e, 0x93f7b852, 0xd5258d36, 0xe89461ea,
+    0x0383461d, 0x3e32aac1, 0x78e09fa5, 0x45517379, 0xf544f56d,
+    0xc8f519b1, 0x8e272cd5, 0xb396c009, 0x357d26bc, 0x08ccca60,
+    0x4e1eff04, 0x73af13d8, 0xc3ba95cc, 0xfe0b7910, 0xb8d94c74,
+    0x8568a0a8},
+   {0x00000000, 0x69790b65, 0xd2f216ca, 0xbb8b1daf, 0x7e952bd5,
+    0x17ec20b0, 0xac673d1f, 0xc51e367a, 0xfd2a57aa, 0x94535ccf,
+    0x2fd84160, 0x46a14a05, 0x83bf7c7f, 0xeac6771a, 0x514d6ab5,
+    0x383461d0, 0x2125a915, 0x485ca270, 0xf3d7bfdf, 0x9aaeb4ba,
+    0x5fb082c0, 0x36c989a5, 0x8d42940a, 0xe43b9f6f, 0xdc0ffebf,
+    0xb576f5da, 0x0efde875, 0x6784e310, 0xa29ad56a, 0xcbe3de0f,
+    0x7068c3a0, 0x1911c8c5, 0x424b522a, 0x2b32594f, 0x90b944e0,
+    0xf9c04f85, 0x3cde79ff, 0x55a7729a, 0xee2c6f35, 0x87556450,
+    0xbf610580, 0xd6180ee5, 0x6d93134a, 0x04ea182f, 0xc1f42e55,
+    0xa88d2530, 0x1306389f, 0x7a7f33fa, 0x636efb3f, 0x0a17f05a,
+    0xb19cedf5, 0xd8e5e690, 0x1dfbd0ea, 0x7482db8f, 0xcf09c620,
+    0xa670cd45, 0x9e44ac95, 0xf73da7f0, 0x4cb6ba5f, 0x25cfb13a,
+    0xe0d18740, 0x89a88c25, 0x3223918a, 0x5b5a9aef, 0x8496a454,
+    0xedefaf31, 0x5664b29e, 0x3f1db9fb, 0xfa038f81, 0x937a84e4,
+    0x28f1994b, 0x4188922e, 0x79bcf3fe, 0x10c5f89b, 0xab4ee534,
+    0xc237ee51, 0x0729d82b, 0x6e50d34e, 0xd5dbcee1, 0xbca2c584,
+    0xa5b30d41, 0xccca0624, 0x77411b8b, 0x1e3810ee, 0xdb262694,
+    0xb25f2df1, 0x09d4305e, 0x60ad3b3b, 0x58995aeb, 0x31e0518e,
+    0x8a6b4c21, 0xe3124744, 0x260c713e, 0x4f757a5b, 0xf4fe67f4,
+    0x9d876c91, 0xc6ddf67e, 0xafa4fd1b, 0x142fe0b4, 0x7d56ebd1,
+    0xb848ddab, 0xd131d6ce, 0x6abacb61, 0x03c3c004, 0x3bf7a1d4,
+    0x528eaab1, 0xe905b71e, 0x807cbc7b, 0x45628a01, 0x2c1b8164,
+    0x97909ccb, 0xfee997ae, 0xe7f85f6b, 0x8e81540e, 0x350a49a1,
+    0x5c7342c4, 0x996d74be, 0xf0147fdb, 0x4b9f6274, 0x22e66911,
+    0x1ad208c1, 0x73ab03a4, 0xc8201e0b, 0xa159156e, 0x64472314,
+    0x0d3e2871, 0xb6b535de, 0xdfcc3ebb, 0xd25c4ee9, 0xbb25458c,
+    0x00ae5823, 0x69d75346, 0xacc9653c, 0xc5b06e59, 0x7e3b73f6,
+    0x17427893, 0x2f761943, 0x460f1226, 0xfd840f89, 0x94fd04ec,
+    0x51e33296, 0x389a39f3, 0x8311245c, 0xea682f39, 0xf379e7fc,
+    0x9a00ec99, 0x218bf136, 0x48f2fa53, 0x8deccc29, 0xe495c74c,
+    0x5f1edae3, 0x3667d186, 0x0e53b056, 0x672abb33, 0xdca1a69c,
+    0xb5d8adf9, 0x70c69b83, 0x19bf90e6, 0xa2348d49, 0xcb4d862c,
+    0x90171cc3, 0xf96e17a6, 0x42e50a09, 0x2b9c016c, 0xee823716,
+    0x87fb3c73, 0x3c7021dc, 0x55092ab9, 0x6d3d4b69, 0x0444400c,
+    0xbfcf5da3, 0xd6b656c6, 0x13a860bc, 0x7ad16bd9, 0xc15a7676,
+    0xa8237d13, 0xb132b5d6, 0xd84bbeb3, 0x63c0a31c, 0x0ab9a879,
+    0xcfa79e03, 0xa6de9566, 0x1d5588c9, 0x742c83ac, 0x4c18e27c,
+    0x2561e919, 0x9eeaf4b6, 0xf793ffd3, 0x328dc9a9, 0x5bf4c2cc,
+    0xe07fdf63, 0x8906d406, 0x56caeabd, 0x3fb3e1d8, 0x8438fc77,
+    0xed41f712, 0x285fc168, 0x4126ca0d, 0xfaadd7a2, 0x93d4dcc7,
+    0xabe0bd17, 0xc299b672, 0x7912abdd, 0x106ba0b8, 0xd57596c2,
+    0xbc0c9da7, 0x07878008, 0x6efe8b6d, 0x77ef43a8, 0x1e9648cd,
+    0xa51d5562, 0xcc645e07, 0x097a687d, 0x60036318, 0xdb887eb7,
+    0xb2f175d2, 0x8ac51402, 0xe3bc1f67, 0x583702c8, 0x314e09ad,
+    0xf4503fd7, 0x9d2934b2, 0x26a2291d, 0x4fdb2278, 0x1481b897,
+    0x7df8b3f2, 0xc673ae5d, 0xaf0aa538, 0x6a149342, 0x036d9827,
+    0xb8e68588, 0xd19f8eed, 0xe9abef3d, 0x80d2e458, 0x3b59f9f7,
+    0x5220f292, 0x973ec4e8, 0xfe47cf8d, 0x45ccd222, 0x2cb5d947,
+    0x35a41182, 0x5cdd1ae7, 0xe7560748, 0x8e2f0c2d, 0x4b313a57,
+    0x22483132, 0x99c32c9d, 0xf0ba27f8, 0xc88e4628, 0xa1f74d4d,
+    0x1a7c50e2, 0x73055b87, 0xb61b6dfd, 0xdf626698, 0x64e97b37,
+    0x0d907052},
+   {0x00000000, 0x7fc99b93, 0xff933726, 0x805aacb5, 0x2457680d,
+    0x5b9ef39e, 0xdbc45f2b, 0xa40dc4b8, 0x48aed01a, 0x37674b89,
+    0xb73de73c, 0xc8f47caf, 0x6cf9b817, 0x13302384, 0x936a8f31,
+    0xeca314a2, 0x915da034, 0xee943ba7, 0x6ece9712, 0x11070c81,
+    0xb50ac839, 0xcac353aa, 0x4a99ff1f, 0x3550648c, 0xd9f3702e,
+    0xa63aebbd, 0x26604708, 0x59a9dc9b, 0xfda41823, 0x826d83b0,
+    0x02372f05, 0x7dfeb496, 0xf9ca4629, 0x8603ddba, 0x0659710f,
+    0x7990ea9c, 0xdd9d2e24, 0xa254b5b7, 0x220e1902, 0x5dc78291,
+    0xb1649633, 0xcead0da0, 0x4ef7a115, 0x313e3a86, 0x9533fe3e,
+    0xeafa65ad, 0x6aa0c918, 0x1569528b, 0x6897e61d, 0x175e7d8e,
+    0x9704d13b, 0xe8cd4aa8, 0x4cc08e10, 0x33091583, 0xb353b936,
+    0xcc9a22a5, 0x20393607, 0x5ff0ad94, 0xdfaa0121, 0xa0639ab2,
+    0x046e5e0a, 0x7ba7c599, 0xfbfd692c, 0x8434f2bf, 0x28e58a13,
+    0x572c1180, 0xd776bd35, 0xa8bf26a6, 0x0cb2e21e, 0x737b798d,
+    0xf321d538, 0x8ce84eab, 0x604b5a09, 0x1f82c19a, 0x9fd86d2f,
+    0xe011f6bc, 0x441c3204, 0x3bd5a997, 0xbb8f0522, 0xc4469eb1,
+    0xb9b82a27, 0xc671b1b4, 0x462b1d01, 0x39e28692, 0x9def422a,
+    0xe226d9b9, 0x627c750c, 0x1db5ee9f, 0xf116fa3d, 0x8edf61ae,
+    0x0e85cd1b, 0x714c5688, 0xd5419230, 0xaa8809a3, 0x2ad2a516,
+    0x551b3e85, 0xd12fcc3a, 0xaee657a9, 0x2ebcfb1c, 0x5175608f,
+    0xf578a437, 0x8ab13fa4, 0x0aeb9311, 0x75220882, 0x99811c20,
+    0xe64887b3, 0x66122b06, 0x19dbb095, 0xbdd6742d, 0xc21fefbe,
+    0x4245430b, 0x3d8cd898, 0x40726c0e, 0x3fbbf79d, 0xbfe15b28,
+    0xc028c0bb, 0x64250403, 0x1bec9f90, 0x9bb63325, 0xe47fa8b6,
+    0x08dcbc14, 0x77152787, 0xf74f8b32, 0x888610a1, 0x2c8bd419,
+    0x53424f8a, 0xd318e33f, 0xacd178ac, 0x51cb1426, 0x2e028fb5,
+    0xae582300, 0xd191b893, 0x759c7c2b, 0x0a55e7b8, 0x8a0f4b0d,
+    0xf5c6d09e, 0x1965c43c, 0x66ac5faf, 0xe6f6f31a, 0x993f6889,
+    0x3d32ac31, 0x42fb37a2, 0xc2a19b17, 0xbd680084, 0xc096b412,
+    0xbf5f2f81, 0x3f058334, 0x40cc18a7, 0xe4c1dc1f, 0x9b08478c,
+    0x1b52eb39, 0x649b70aa, 0x88386408, 0xf7f1ff9b, 0x77ab532e,
+    0x0862c8bd, 0xac6f0c05, 0xd3a69796, 0x53fc3b23, 0x2c35a0b0,
+    0xa801520f, 0xd7c8c99c, 0x57926529, 0x285bfeba, 0x8c563a02,
+    0xf39fa191, 0x73c50d24, 0x0c0c96b7, 0xe0af8215, 0x9f661986,
+    0x1f3cb533, 0x60f52ea0, 0xc4f8ea18, 0xbb31718b, 0x3b6bdd3e,
+    0x44a246ad, 0x395cf23b, 0x469569a8, 0xc6cfc51d, 0xb9065e8e,
+    0x1d0b9a36, 0x62c201a5, 0xe298ad10, 0x9d513683, 0x71f22221,
+    0x0e3bb9b2, 0x8e611507, 0xf1a88e94, 0x55a54a2c, 0x2a6cd1bf,
+    0xaa367d0a, 0xd5ffe699, 0x792e9e35, 0x06e705a6, 0x86bda913,
+    0xf9743280, 0x5d79f638, 0x22b06dab, 0xa2eac11e, 0xdd235a8d,
+    0x31804e2f, 0x4e49d5bc, 0xce137909, 0xb1dae29a, 0x15d72622,
+    0x6a1ebdb1, 0xea441104, 0x958d8a97, 0xe8733e01, 0x97baa592,
+    0x17e00927, 0x682992b4, 0xcc24560c, 0xb3edcd9f, 0x33b7612a,
+    0x4c7efab9, 0xa0ddee1b, 0xdf147588, 0x5f4ed93d, 0x208742ae,
+    0x848a8616, 0xfb431d85, 0x7b19b130, 0x04d02aa3, 0x80e4d81c,
+    0xff2d438f, 0x7f77ef3a, 0x00be74a9, 0xa4b3b011, 0xdb7a2b82,
+    0x5b208737, 0x24e91ca4, 0xc84a0806, 0xb7839395, 0x37d93f20,
+    0x4810a4b3, 0xec1d600b, 0x93d4fb98, 0x138e572d, 0x6c47ccbe,
+    0x11b97828, 0x6e70e3bb, 0xee2a4f0e, 0x91e3d49d, 0x35ee1025,
+    0x4a278bb6, 0xca7d2703, 0xb5b4bc90, 0x5917a832, 0x26de33a1,
+    0xa6849f14, 0xd94d0487, 0x7d40c03f, 0x02895bac, 0x82d3f719,
+    0xfd1a6c8a},
+   {0x00000000, 0xa396284c, 0x9c5d56d9, 0x3fcb7e95, 0xe3cbabf3,
+    0x405d83bf, 0x7f96fd2a, 0xdc00d566, 0x1ce651a7, 0xbf7079eb,
+    0x80bb077e, 0x232d2f32, 0xff2dfa54, 0x5cbbd218, 0x6370ac8d,
+    0xc0e684c1, 0x39cca34e, 0x9a5a8b02, 0xa591f597, 0x0607dddb,
+    0xda0708bd, 0x799120f1, 0x465a5e64, 0xe5cc7628, 0x252af2e9,
+    0x86bcdaa5, 0xb977a430, 0x1ae18c7c, 0xc6e1591a, 0x65777156,
+    0x5abc0fc3, 0xf92a278f, 0x7399469c, 0xd00f6ed0, 0xefc41045,
+    0x4c523809, 0x9052ed6f, 0x33c4c523, 0x0c0fbbb6, 0xaf9993fa,
+    0x6f7f173b, 0xcce93f77, 0xf32241e2, 0x50b469ae, 0x8cb4bcc8,
+    0x2f229484, 0x10e9ea11, 0xb37fc25d, 0x4a55e5d2, 0xe9c3cd9e,
+    0xd608b30b, 0x759e9b47, 0xa99e4e21, 0x0a08666d, 0x35c318f8,
+    0x965530b4, 0x56b3b475, 0xf5259c39, 0xcaeee2ac, 0x6978cae0,
+    0xb5781f86, 0x16ee37ca, 0x2925495f, 0x8ab36113, 0xe7328d38,
+    0x44a4a574, 0x7b6fdbe1, 0xd8f9f3ad, 0x04f926cb, 0xa76f0e87,
+    0x98a47012, 0x3b32585e, 0xfbd4dc9f, 0x5842f4d3, 0x67898a46,
+    0xc41fa20a, 0x181f776c, 0xbb895f20, 0x844221b5, 0x27d409f9,
+    0xdefe2e76, 0x7d68063a, 0x42a378af, 0xe13550e3, 0x3d358585,
+    0x9ea3adc9, 0xa168d35c, 0x02fefb10, 0xc2187fd1, 0x618e579d,
+    0x5e452908, 0xfdd30144, 0x21d3d422, 0x8245fc6e, 0xbd8e82fb,
+    0x1e18aab7, 0x94abcba4, 0x373de3e8, 0x08f69d7d, 0xab60b531,
+    0x77606057, 0xd4f6481b, 0xeb3d368e, 0x48ab1ec2, 0x884d9a03,
+    0x2bdbb24f, 0x1410ccda, 0xb786e496, 0x6b8631f0, 0xc81019bc,
+    0xf7db6729, 0x544d4f65, 0xad6768ea, 0x0ef140a6, 0x313a3e33,
+    0x92ac167f, 0x4eacc319, 0xed3aeb55, 0xd2f195c0, 0x7167bd8c,
+    0xb181394d, 0x12171101, 0x2ddc6f94, 0x8e4a47d8, 0x524a92be,
+    0xf1dcbaf2, 0xce17c467, 0x6d81ec2b, 0x15141c31, 0xb682347d,
+    0x89494ae8, 0x2adf62a4, 0xf6dfb7c2, 0x55499f8e, 0x6a82e11b,
+    0xc914c957, 0x09f24d96, 0xaa6465da, 0x95af1b4f, 0x36393303,
+    0xea39e665, 0x49afce29, 0x7664b0bc, 0xd5f298f0, 0x2cd8bf7f,
+    0x8f4e9733, 0xb085e9a6, 0x1313c1ea, 0xcf13148c, 0x6c853cc0,
+    0x534e4255, 0xf0d86a19, 0x303eeed8, 0x93a8c694, 0xac63b801,
+    0x0ff5904d, 0xd3f5452b, 0x70636d67, 0x4fa813f2, 0xec3e3bbe,
+    0x668d5aad, 0xc51b72e1, 0xfad00c74, 0x59462438, 0x8546f15e,
+    0x26d0d912, 0x191ba787, 0xba8d8fcb, 0x7a6b0b0a, 0xd9fd2346,
+    0xe6365dd3, 0x45a0759f, 0x99a0a0f9, 0x3a3688b5, 0x05fdf620,
+    0xa66bde6c, 0x5f41f9e3, 0xfcd7d1af, 0xc31caf3a, 0x608a8776,
+    0xbc8a5210, 0x1f1c7a5c, 0x20d704c9, 0x83412c85, 0x43a7a844,
+    0xe0318008, 0xdffafe9d, 0x7c6cd6d1, 0xa06c03b7, 0x03fa2bfb,
+    0x3c31556e, 0x9fa77d22, 0xf2269109, 0x51b0b945, 0x6e7bc7d0,
+    0xcdedef9c, 0x11ed3afa, 0xb27b12b6, 0x8db06c23, 0x2e26446f,
+    0xeec0c0ae, 0x4d56e8e2, 0x729d9677, 0xd10bbe3b, 0x0d0b6b5d,
+    0xae9d4311, 0x91563d84, 0x32c015c8, 0xcbea3247, 0x687c1a0b,
+    0x57b7649e, 0xf4214cd2, 0x282199b4, 0x8bb7b1f8, 0xb47ccf6d,
+    0x17eae721, 0xd70c63e0, 0x749a4bac, 0x4b513539, 0xe8c71d75,
+    0x34c7c813, 0x9751e05f, 0xa89a9eca, 0x0b0cb686, 0x81bfd795,
+    0x2229ffd9, 0x1de2814c, 0xbe74a900, 0x62747c66, 0xc1e2542a,
+    0xfe292abf, 0x5dbf02f3, 0x9d598632, 0x3ecfae7e, 0x0104d0eb,
+    0xa292f8a7, 0x7e922dc1, 0xdd04058d, 0xe2cf7b18, 0x41595354,
+    0xb87374db, 0x1be55c97, 0x242e2202, 0x87b80a4e, 0x5bb8df28,
+    0xf82ef764, 0xc7e589f1, 0x6473a1bd, 0xa495257c, 0x07030d30,
+    0x38c873a5, 0x9b5e5be9, 0x475e8e8f, 0xe4c8a6c3, 0xdb03d856,
+    0x7895f01a},
+   {0x00000000, 0x2a283862, 0x545070c4, 0x7e7848a6, 0xa8a0e188,
+    0x8288d9ea, 0xfcf0914c, 0xd6d8a92e, 0x8a30c551, 0xa018fd33,
+    0xde60b595, 0xf4488df7, 0x229024d9, 0x08b81cbb, 0x76c0541d,
+    0x5ce86c7f, 0xcf108ce3, 0xe538b481, 0x9b40fc27, 0xb168c445,
+    0x67b06d6b, 0x4d985509, 0x33e01daf, 0x19c825cd, 0x452049b2,
+    0x6f0871d0, 0x11703976, 0x3b580114, 0xed80a83a, 0xc7a89058,
+    0xb9d0d8fe, 0x93f8e09c, 0x45501f87, 0x6f7827e5, 0x11006f43,
+    0x3b285721, 0xedf0fe0f, 0xc7d8c66d, 0xb9a08ecb, 0x9388b6a9,
+    0xcf60dad6, 0xe548e2b4, 0x9b30aa12, 0xb1189270, 0x67c03b5e,
+    0x4de8033c, 0x33904b9a, 0x19b873f8, 0x8a409364, 0xa068ab06,
+    0xde10e3a0, 0xf438dbc2, 0x22e072ec, 0x08c84a8e, 0x76b00228,
+    0x5c983a4a, 0x00705635, 0x2a586e57, 0x542026f1, 0x7e081e93,
+    0xa8d0b7bd, 0x82f88fdf, 0xfc80c779, 0xd6a8ff1b, 0x8aa03f0e,
+    0xa088076c, 0xdef04fca, 0xf4d877a8, 0x2200de86, 0x0828e6e4,
+    0x7650ae42, 0x5c789620, 0x0090fa5f, 0x2ab8c23d, 0x54c08a9b,
+    0x7ee8b2f9, 0xa8301bd7, 0x821823b5, 0xfc606b13, 0xd6485371,
+    0x45b0b3ed, 0x6f988b8f, 0x11e0c329, 0x3bc8fb4b, 0xed105265,
+    0xc7386a07, 0xb94022a1, 0x93681ac3, 0xcf8076bc, 0xe5a84ede,
+    0x9bd00678, 0xb1f83e1a, 0x67209734, 0x4d08af56, 0x3370e7f0,
+    0x1958df92, 0xcff02089, 0xe5d818eb, 0x9ba0504d, 0xb188682f,
+    0x6750c101, 0x4d78f963, 0x3300b1c5, 0x192889a7, 0x45c0e5d8,
+    0x6fe8ddba, 0x1190951c, 0x3bb8ad7e, 0xed600450, 0xc7483c32,
+    0xb9307494, 0x93184cf6, 0x00e0ac6a, 0x2ac89408, 0x54b0dcae,
+    0x7e98e4cc, 0xa8404de2, 0x82687580, 0xfc103d26, 0xd6380544,
+    0x8ad0693b, 0xa0f85159, 0xde8019ff, 0xf4a8219d, 0x227088b3,
+    0x0858b0d1, 0x7620f877, 0x5c08c015, 0xce31785d, 0xe419403f,
+    0x9a610899, 0xb04930fb, 0x669199d5, 0x4cb9a1b7, 0x32c1e911,
+    0x18e9d173, 0x4401bd0c, 0x6e29856e, 0x1051cdc8, 0x3a79f5aa,
+    0xeca15c84, 0xc68964e6, 0xb8f12c40, 0x92d91422, 0x0121f4be,
+    0x2b09ccdc, 0x5571847a, 0x7f59bc18, 0xa9811536, 0x83a92d54,
+    0xfdd165f2, 0xd7f95d90, 0x8b1131ef, 0xa139098d, 0xdf41412b,
+    0xf5697949, 0x23b1d067, 0x0999e805, 0x77e1a0a3, 0x5dc998c1,
+    0x8b6167da, 0xa1495fb8, 0xdf31171e, 0xf5192f7c, 0x23c18652,
+    0x09e9be30, 0x7791f696, 0x5db9cef4, 0x0151a28b, 0x2b799ae9,
+    0x5501d24f, 0x7f29ea2d, 0xa9f14303, 0x83d97b61, 0xfda133c7,
+    0xd7890ba5, 0x4471eb39, 0x6e59d35b, 0x10219bfd, 0x3a09a39f,
+    0xecd10ab1, 0xc6f932d3, 0xb8817a75, 0x92a94217, 0xce412e68,
+    0xe469160a, 0x9a115eac, 0xb03966ce, 0x66e1cfe0, 0x4cc9f782,
+    0x32b1bf24, 0x18998746, 0x44914753, 0x6eb97f31, 0x10c13797,
+    0x3ae90ff5, 0xec31a6db, 0xc6199eb9, 0xb861d61f, 0x9249ee7d,
+    0xcea18202, 0xe489ba60, 0x9af1f2c6, 0xb0d9caa4, 0x6601638a,
+    0x4c295be8, 0x3251134e, 0x18792b2c, 0x8b81cbb0, 0xa1a9f3d2,
+    0xdfd1bb74, 0xf5f98316, 0x23212a38, 0x0909125a, 0x77715afc,
+    0x5d59629e, 0x01b10ee1, 0x2b993683, 0x55e17e25, 0x7fc94647,
+    0xa911ef69, 0x8339d70b, 0xfd419fad, 0xd769a7cf, 0x01c158d4,
+    0x2be960b6, 0x55912810, 0x7fb91072, 0xa961b95c, 0x8349813e,
+    0xfd31c998, 0xd719f1fa, 0x8bf19d85, 0xa1d9a5e7, 0xdfa1ed41,
+    0xf589d523, 0x23517c0d, 0x0979446f, 0x77010cc9, 0x5d2934ab,
+    0xced1d437, 0xe4f9ec55, 0x9a81a4f3, 0xb0a99c91, 0x667135bf,
+    0x4c590ddd, 0x3221457b, 0x18097d19, 0x44e11166, 0x6ec92904,
+    0x10b161a2, 0x3a9959c0, 0xec41f0ee, 0xc669c88c, 0xb811802a,
+    0x9239b848},
+   {0x00000000, 0x4713f6fb, 0x8e27edf6, 0xc9341b0d, 0xc73eddad,
+    0x802d2b56, 0x4919305b, 0x0e0ac6a0, 0x550cbd1b, 0x121f4be0,
+    0xdb2b50ed, 0x9c38a616, 0x923260b6, 0xd521964d, 0x1c158d40,
+    0x5b067bbb, 0xaa197a36, 0xed0a8ccd, 0x243e97c0, 0x632d613b,
+    0x6d27a79b, 0x2a345160, 0xe3004a6d, 0xa413bc96, 0xff15c72d,
+    0xb80631d6, 0x71322adb, 0x3621dc20, 0x382b1a80, 0x7f38ec7b,
+    0xb60cf776, 0xf11f018d, 0x8f43f22d, 0xc85004d6, 0x01641fdb,
+    0x4677e920, 0x487d2f80, 0x0f6ed97b, 0xc65ac276, 0x8149348d,
+    0xda4f4f36, 0x9d5cb9cd, 0x5468a2c0, 0x137b543b, 0x1d71929b,
+    0x5a626460, 0x93567f6d, 0xd4458996, 0x255a881b, 0x62497ee0,
+    0xab7d65ed, 0xec6e9316, 0xe26455b6, 0xa577a34d, 0x6c43b840,
+    0x2b504ebb, 0x70563500, 0x3745c3fb, 0xfe71d8f6, 0xb9622e0d,
+    0xb768e8ad, 0xf07b1e56, 0x394f055b, 0x7e5cf3a0, 0xc5f6e21b,
+    0x82e514e0, 0x4bd10fed, 0x0cc2f916, 0x02c83fb6, 0x45dbc94d,
+    0x8cefd240, 0xcbfc24bb, 0x90fa5f00, 0xd7e9a9fb, 0x1eddb2f6,
+    0x59ce440d, 0x57c482ad, 0x10d77456, 0xd9e36f5b, 0x9ef099a0,
+    0x6fef982d, 0x28fc6ed6, 0xe1c875db, 0xa6db8320, 0xa8d14580,
+    0xefc2b37b, 0x26f6a876, 0x61e55e8d, 0x3ae32536, 0x7df0d3cd,
+    0xb4c4c8c0, 0xf3d73e3b, 0xfdddf89b, 0xbace0e60, 0x73fa156d,
+    0x34e9e396, 0x4ab51036, 0x0da6e6cd, 0xc492fdc0, 0x83810b3b,
+    0x8d8bcd9b, 0xca983b60, 0x03ac206d, 0x44bfd696, 0x1fb9ad2d,
+    0x58aa5bd6, 0x919e40db, 0xd68db620, 0xd8877080, 0x9f94867b,
+    0x56a09d76, 0x11b36b8d, 0xe0ac6a00, 0xa7bf9cfb, 0x6e8b87f6,
+    0x2998710d, 0x2792b7ad, 0x60814156, 0xa9b55a5b, 0xeea6aca0,
+    0xb5a0d71b, 0xf2b321e0, 0x3b873aed, 0x7c94cc16, 0x729e0ab6,
+    0x358dfc4d, 0xfcb9e740, 0xbbaa11bb, 0x509cc277, 0x178f348c,
+    0xdebb2f81, 0x99a8d97a, 0x97a21fda, 0xd0b1e921, 0x1985f22c,
+    0x5e9604d7, 0x05907f6c, 0x42838997, 0x8bb7929a, 0xcca46461,
+    0xc2aea2c1, 0x85bd543a, 0x4c894f37, 0x0b9ab9cc, 0xfa85b841,
+    0xbd964eba, 0x74a255b7, 0x33b1a34c, 0x3dbb65ec, 0x7aa89317,
+    0xb39c881a, 0xf48f7ee1, 0xaf89055a, 0xe89af3a1, 0x21aee8ac,
+    0x66bd1e57, 0x68b7d8f7, 0x2fa42e0c, 0xe6903501, 0xa183c3fa,
+    0xdfdf305a, 0x98ccc6a1, 0x51f8ddac, 0x16eb2b57, 0x18e1edf7,
+    0x5ff21b0c, 0x96c60001, 0xd1d5f6fa, 0x8ad38d41, 0xcdc07bba,
+    0x04f460b7, 0x43e7964c, 0x4ded50ec, 0x0afea617, 0xc3cabd1a,
+    0x84d94be1, 0x75c64a6c, 0x32d5bc97, 0xfbe1a79a, 0xbcf25161,
+    0xb2f897c1, 0xf5eb613a, 0x3cdf7a37, 0x7bcc8ccc, 0x20caf777,
+    0x67d9018c, 0xaeed1a81, 0xe9feec7a, 0xe7f42ada, 0xa0e7dc21,
+    0x69d3c72c, 0x2ec031d7, 0x956a206c, 0xd279d697, 0x1b4dcd9a,
+    0x5c5e3b61, 0x5254fdc1, 0x15470b3a, 0xdc731037, 0x9b60e6cc,
+    0xc0669d77, 0x87756b8c, 0x4e417081, 0x0952867a, 0x075840da,
+    0x404bb621, 0x897fad2c, 0xce6c5bd7, 0x3f735a5a, 0x7860aca1,
+    0xb154b7ac, 0xf6474157, 0xf84d87f7, 0xbf5e710c, 0x766a6a01,
+    0x31799cfa, 0x6a7fe741, 0x2d6c11ba, 0xe4580ab7, 0xa34bfc4c,
+    0xad413aec, 0xea52cc17, 0x2366d71a, 0x647521e1, 0x1a29d241,
+    0x5d3a24ba, 0x940e3fb7, 0xd31dc94c, 0xdd170fec, 0x9a04f917,
+    0x5330e21a, 0x142314e1, 0x4f256f5a, 0x083699a1, 0xc10282ac,
+    0x86117457, 0x881bb2f7, 0xcf08440c, 0x063c5f01, 0x412fa9fa,
+    0xb030a877, 0xf7235e8c, 0x3e174581, 0x7904b37a, 0x770e75da,
+    0x301d8321, 0xf929982c, 0xbe3a6ed7, 0xe53c156c, 0xa22fe397,
+    0x6b1bf89a, 0x2c080e61, 0x2202c8c1, 0x65113e3a, 0xac252537,
+    0xeb36d3cc},
+   {0x00000000, 0xa13984ee, 0x99020f9d, 0x383b8b73, 0xe975197b,
+    0x484c9d95, 0x707716e6, 0xd14e9208, 0x099b34b7, 0xa8a2b059,
+    0x90993b2a, 0x31a0bfc4, 0xe0ee2dcc, 0x41d7a922, 0x79ec2251,
+    0xd8d5a6bf, 0x1336696e, 0xb20fed80, 0x8a3466f3, 0x2b0de21d,
+    0xfa437015, 0x5b7af4fb, 0x63417f88, 0xc278fb66, 0x1aad5dd9,
+    0xbb94d937, 0x83af5244, 0x2296d6aa, 0xf3d844a2, 0x52e1c04c,
+    0x6ada4b3f, 0xcbe3cfd1, 0x266cd2dc, 0x87555632, 0xbf6edd41,
+    0x1e5759af, 0xcf19cba7, 0x6e204f49, 0x561bc43a, 0xf72240d4,
+    0x2ff7e66b, 0x8ece6285, 0xb6f5e9f6, 0x17cc6d18, 0xc682ff10,
+    0x67bb7bfe, 0x5f80f08d, 0xfeb97463, 0x355abbb2, 0x94633f5c,
+    0xac58b42f, 0x0d6130c1, 0xdc2fa2c9, 0x7d162627, 0x452dad54,
+    0xe41429ba, 0x3cc18f05, 0x9df80beb, 0xa5c38098, 0x04fa0476,
+    0xd5b4967e, 0x748d1290, 0x4cb699e3, 0xed8f1d0d, 0x4cd9a5b8,
+    0xede02156, 0xd5dbaa25, 0x74e22ecb, 0xa5acbcc3, 0x0495382d,
+    0x3caeb35e, 0x9d9737b0, 0x4542910f, 0xe47b15e1, 0xdc409e92,
+    0x7d791a7c, 0xac378874, 0x0d0e0c9a, 0x353587e9, 0x940c0307,
+    0x5fefccd6, 0xfed64838, 0xc6edc34b, 0x67d447a5, 0xb69ad5ad,
+    0x17a35143, 0x2f98da30, 0x8ea15ede, 0x5674f861, 0xf74d7c8f,
+    0xcf76f7fc, 0x6e4f7312, 0xbf01e11a, 0x1e3865f4, 0x2603ee87,
+    0x873a6a69, 0x6ab57764, 0xcb8cf38a, 0xf3b778f9, 0x528efc17,
+    0x83c06e1f, 0x22f9eaf1, 0x1ac26182, 0xbbfbe56c, 0x632e43d3,
+    0xc217c73d, 0xfa2c4c4e, 0x5b15c8a0, 0x8a5b5aa8, 0x2b62de46,
+    0x13595535, 0xb260d1db, 0x79831e0a, 0xd8ba9ae4, 0xe0811197,
+    0x41b89579, 0x90f60771, 0x31cf839f, 0x09f408ec, 0xa8cd8c02,
+    0x70182abd, 0xd121ae53, 0xe91a2520, 0x4823a1ce, 0x996d33c6,
+    0x3854b728, 0x006f3c5b, 0xa156b8b5, 0x99b34b70, 0x388acf9e,
+    0x00b144ed, 0xa188c003, 0x70c6520b, 0xd1ffd6e5, 0xe9c45d96,
+    0x48fdd978, 0x90287fc7, 0x3111fb29, 0x092a705a, 0xa813f4b4,
+    0x795d66bc, 0xd864e252, 0xe05f6921, 0x4166edcf, 0x8a85221e,
+    0x2bbca6f0, 0x13872d83, 0xb2bea96d, 0x63f03b65, 0xc2c9bf8b,
+    0xfaf234f8, 0x5bcbb016, 0x831e16a9, 0x22279247, 0x1a1c1934,
+    0xbb259dda, 0x6a6b0fd2, 0xcb528b3c, 0xf369004f, 0x525084a1,
+    0xbfdf99ac, 0x1ee61d42, 0x26dd9631, 0x87e412df, 0x56aa80d7,
+    0xf7930439, 0xcfa88f4a, 0x6e910ba4, 0xb644ad1b, 0x177d29f5,
+    0x2f46a286, 0x8e7f2668, 0x5f31b460, 0xfe08308e, 0xc633bbfd,
+    0x670a3f13, 0xace9f0c2, 0x0dd0742c, 0x35ebff5f, 0x94d27bb1,
+    0x459ce9b9, 0xe4a56d57, 0xdc9ee624, 0x7da762ca, 0xa572c475,
+    0x044b409b, 0x3c70cbe8, 0x9d494f06, 0x4c07dd0e, 0xed3e59e0,
+    0xd505d293, 0x743c567d, 0xd56aeec8, 0x74536a26, 0x4c68e155,
+    0xed5165bb, 0x3c1ff7b3, 0x9d26735d, 0xa51df82e, 0x04247cc0,
+    0xdcf1da7f, 0x7dc85e91, 0x45f3d5e2, 0xe4ca510c, 0x3584c304,
+    0x94bd47ea, 0xac86cc99, 0x0dbf4877, 0xc65c87a6, 0x67650348,
+    0x5f5e883b, 0xfe670cd5, 0x2f299edd, 0x8e101a33, 0xb62b9140,
+    0x171215ae, 0xcfc7b311, 0x6efe37ff, 0x56c5bc8c, 0xf7fc3862,
+    0x26b2aa6a, 0x878b2e84, 0xbfb0a5f7, 0x1e892119, 0xf3063c14,
+    0x523fb8fa, 0x6a043389, 0xcb3db767, 0x1a73256f, 0xbb4aa181,
+    0x83712af2, 0x2248ae1c, 0xfa9d08a3, 0x5ba48c4d, 0x639f073e,
+    0xc2a683d0, 0x13e811d8, 0xb2d19536, 0x8aea1e45, 0x2bd39aab,
+    0xe030557a, 0x4109d194, 0x79325ae7, 0xd80bde09, 0x09454c01,
+    0xa87cc8ef, 0x9047439c, 0x317ec772, 0xe9ab61cd, 0x4892e523,
+    0x70a96e50, 0xd190eabe, 0x00de78b6, 0xa1e7fc58, 0x99dc772b,
+    0x38e5f3c5},
+   {0x00000000, 0xe81790a1, 0x0b5e2703, 0xe349b7a2, 0x16bc4e06,
+    0xfeabdea7, 0x1de26905, 0xf5f5f9a4, 0x2d789c0c, 0xc56f0cad,
+    0x2626bb0f, 0xce312bae, 0x3bc4d20a, 0xd3d342ab, 0x309af509,
+    0xd88d65a8, 0x5af13818, 0xb2e6a8b9, 0x51af1f1b, 0xb9b88fba,
+    0x4c4d761e, 0xa45ae6bf, 0x4713511d, 0xaf04c1bc, 0x7789a414,
+    0x9f9e34b5, 0x7cd78317, 0x94c013b6, 0x6135ea12, 0x89227ab3,
+    0x6a6bcd11, 0x827c5db0, 0xb5e27030, 0x5df5e091, 0xbebc5733,
+    0x56abc792, 0xa35e3e36, 0x4b49ae97, 0xa8001935, 0x40178994,
+    0x989aec3c, 0x708d7c9d, 0x93c4cb3f, 0x7bd35b9e, 0x8e26a23a,
+    0x6631329b, 0x85788539, 0x6d6f1598, 0xef134828, 0x0704d889,
+    0xe44d6f2b, 0x0c5aff8a, 0xf9af062e, 0x11b8968f, 0xf2f1212d,
+    0x1ae6b18c, 0xc26bd424, 0x2a7c4485, 0xc935f327, 0x21226386,
+    0xd4d79a22, 0x3cc00a83, 0xdf89bd21, 0x379e2d80, 0xb0b5e621,
+    0x58a27680, 0xbbebc122, 0x53fc5183, 0xa609a827, 0x4e1e3886,
+    0xad578f24, 0x45401f85, 0x9dcd7a2d, 0x75daea8c, 0x96935d2e,
+    0x7e84cd8f, 0x8b71342b, 0x6366a48a, 0x802f1328, 0x68388389,
+    0xea44de39, 0x02534e98, 0xe11af93a, 0x090d699b, 0xfcf8903f,
+    0x14ef009e, 0xf7a6b73c, 0x1fb1279d, 0xc73c4235, 0x2f2bd294,
+    0xcc626536, 0x2475f597, 0xd1800c33, 0x39979c92, 0xdade2b30,
+    0x32c9bb91, 0x05579611, 0xed4006b0, 0x0e09b112, 0xe61e21b3,
+    0x13ebd817, 0xfbfc48b6, 0x18b5ff14, 0xf0a26fb5, 0x282f0a1d,
+    0xc0389abc, 0x23712d1e, 0xcb66bdbf, 0x3e93441b, 0xd684d4ba,
+    0x35cd6318, 0xdddaf3b9, 0x5fa6ae09, 0xb7b13ea8, 0x54f8890a,
+    0xbcef19ab, 0x491ae00f, 0xa10d70ae, 0x4244c70c, 0xaa5357ad,
+    0x72de3205, 0x9ac9a2a4, 0x79801506, 0x919785a7, 0x64627c03,
+    0x8c75eca2, 0x6f3c5b00, 0x872bcba1, 0xba1aca03, 0x520d5aa2,
+    0xb144ed00, 0x59537da1, 0xaca68405, 0x44b114a4, 0xa7f8a306,
+    0x4fef33a7, 0x9762560f, 0x7f75c6ae, 0x9c3c710c, 0x742be1ad,
+    0x81de1809, 0x69c988a8, 0x8a803f0a, 0x6297afab, 0xe0ebf21b,
+    0x08fc62ba, 0xebb5d518, 0x03a245b9, 0xf657bc1d, 0x1e402cbc,
+    0xfd099b1e, 0x151e0bbf, 0xcd936e17, 0x2584feb6, 0xc6cd4914,
+    0x2edad9b5, 0xdb2f2011, 0x3338b0b0, 0xd0710712, 0x386697b3,
+    0x0ff8ba33, 0xe7ef2a92, 0x04a69d30, 0xecb10d91, 0x1944f435,
+    0xf1536494, 0x121ad336, 0xfa0d4397, 0x2280263f, 0xca97b69e,
+    0x29de013c, 0xc1c9919d, 0x343c6839, 0xdc2bf898, 0x3f624f3a,
+    0xd775df9b, 0x5509822b, 0xbd1e128a, 0x5e57a528, 0xb6403589,
+    0x43b5cc2d, 0xaba25c8c, 0x48ebeb2e, 0xa0fc7b8f, 0x78711e27,
+    0x90668e86, 0x732f3924, 0x9b38a985, 0x6ecd5021, 0x86dac080,
+    0x65937722, 0x8d84e783, 0x0aaf2c22, 0xe2b8bc83, 0x01f10b21,
+    0xe9e69b80, 0x1c136224, 0xf404f285, 0x174d4527, 0xff5ad586,
+    0x27d7b02e, 0xcfc0208f, 0x2c89972d, 0xc49e078c, 0x316bfe28,
+    0xd97c6e89, 0x3a35d92b, 0xd222498a, 0x505e143a, 0xb849849b,
+    0x5b003339, 0xb317a398, 0x46e25a3c, 0xaef5ca9d, 0x4dbc7d3f,
+    0xa5abed9e, 0x7d268836, 0x95311897, 0x7678af35, 0x9e6f3f94,
+    0x6b9ac630, 0x838d5691, 0x60c4e133, 0x88d37192, 0xbf4d5c12,
+    0x575accb3, 0xb4137b11, 0x5c04ebb0, 0xa9f11214, 0x41e682b5,
+    0xa2af3517, 0x4ab8a5b6, 0x9235c01e, 0x7a2250bf, 0x996be71d,
+    0x717c77bc, 0x84898e18, 0x6c9e1eb9, 0x8fd7a91b, 0x67c039ba,
+    0xe5bc640a, 0x0dabf4ab, 0xeee24309, 0x06f5d3a8, 0xf3002a0c,
+    0x1b17baad, 0xf85e0d0f, 0x10499dae, 0xc8c4f806, 0x20d368a7,
+    0xc39adf05, 0x2b8d4fa4, 0xde78b600, 0x366f26a1, 0xd5269103,
+    0x3d3101a2}};
+
+static const z_word_t crc_braid_big_table[][256] = {
+   {0x0000000000000000, 0xa19017e800000000, 0x03275e0b00000000,
+    0xa2b749e300000000, 0x064ebc1600000000, 0xa7deabfe00000000,
+    0x0569e21d00000000, 0xa4f9f5f500000000, 0x0c9c782d00000000,
+    0xad0c6fc500000000, 0x0fbb262600000000, 0xae2b31ce00000000,
+    0x0ad2c43b00000000, 0xab42d3d300000000, 0x09f59a3000000000,
+    0xa8658dd800000000, 0x1838f15a00000000, 0xb9a8e6b200000000,
+    0x1b1faf5100000000, 0xba8fb8b900000000, 0x1e764d4c00000000,
+    0xbfe65aa400000000, 0x1d51134700000000, 0xbcc104af00000000,
+    0x14a4897700000000, 0xb5349e9f00000000, 0x1783d77c00000000,
+    0xb613c09400000000, 0x12ea356100000000, 0xb37a228900000000,
+    0x11cd6b6a00000000, 0xb05d7c8200000000, 0x3070e2b500000000,
+    0x91e0f55d00000000, 0x3357bcbe00000000, 0x92c7ab5600000000,
+    0x363e5ea300000000, 0x97ae494b00000000, 0x351900a800000000,
+    0x9489174000000000, 0x3cec9a9800000000, 0x9d7c8d7000000000,
+    0x3fcbc49300000000, 0x9e5bd37b00000000, 0x3aa2268e00000000,
+    0x9b32316600000000, 0x3985788500000000, 0x98156f6d00000000,
+    0x284813ef00000000, 0x89d8040700000000, 0x2b6f4de400000000,
+    0x8aff5a0c00000000, 0x2e06aff900000000, 0x8f96b81100000000,
+    0x2d21f1f200000000, 0x8cb1e61a00000000, 0x24d46bc200000000,
+    0x85447c2a00000000, 0x27f335c900000000, 0x8663222100000000,
+    0x229ad7d400000000, 0x830ac03c00000000, 0x21bd89df00000000,
+    0x802d9e3700000000, 0x21e6b5b000000000, 0x8076a25800000000,
+    0x22c1ebbb00000000, 0x8351fc5300000000, 0x27a809a600000000,
+    0x86381e4e00000000, 0x248f57ad00000000, 0x851f404500000000,
+    0x2d7acd9d00000000, 0x8ceada7500000000, 0x2e5d939600000000,
+    0x8fcd847e00000000, 0x2b34718b00000000, 0x8aa4666300000000,
+    0x28132f8000000000, 0x8983386800000000, 0x39de44ea00000000,
+    0x984e530200000000, 0x3af91ae100000000, 0x9b690d0900000000,
+    0x3f90f8fc00000000, 0x9e00ef1400000000, 0x3cb7a6f700000000,
+    0x9d27b11f00000000, 0x35423cc700000000, 0x94d22b2f00000000,
+    0x366562cc00000000, 0x97f5752400000000, 0x330c80d100000000,
+    0x929c973900000000, 0x302bdeda00000000, 0x91bbc93200000000,
+    0x1196570500000000, 0xb00640ed00000000, 0x12b1090e00000000,
+    0xb3211ee600000000, 0x17d8eb1300000000, 0xb648fcfb00000000,
+    0x14ffb51800000000, 0xb56fa2f000000000, 0x1d0a2f2800000000,
+    0xbc9a38c000000000, 0x1e2d712300000000, 0xbfbd66cb00000000,
+    0x1b44933e00000000, 0xbad484d600000000, 0x1863cd3500000000,
+    0xb9f3dadd00000000, 0x09aea65f00000000, 0xa83eb1b700000000,
+    0x0a89f85400000000, 0xab19efbc00000000, 0x0fe01a4900000000,
+    0xae700da100000000, 0x0cc7444200000000, 0xad5753aa00000000,
+    0x0532de7200000000, 0xa4a2c99a00000000, 0x0615807900000000,
+    0xa785979100000000, 0x037c626400000000, 0xa2ec758c00000000,
+    0x005b3c6f00000000, 0xa1cb2b8700000000, 0x03ca1aba00000000,
+    0xa25a0d5200000000, 0x00ed44b100000000, 0xa17d535900000000,
+    0x0584a6ac00000000, 0xa414b14400000000, 0x06a3f8a700000000,
+    0xa733ef4f00000000, 0x0f56629700000000, 0xaec6757f00000000,
+    0x0c713c9c00000000, 0xade12b7400000000, 0x0918de8100000000,
+    0xa888c96900000000, 0x0a3f808a00000000, 0xabaf976200000000,
+    0x1bf2ebe000000000, 0xba62fc0800000000, 0x18d5b5eb00000000,
+    0xb945a20300000000, 0x1dbc57f600000000, 0xbc2c401e00000000,
+    0x1e9b09fd00000000, 0xbf0b1e1500000000, 0x176e93cd00000000,
+    0xb6fe842500000000, 0x1449cdc600000000, 0xb5d9da2e00000000,
+    0x11202fdb00000000, 0xb0b0383300000000, 0x120771d000000000,
+    0xb397663800000000, 0x33baf80f00000000, 0x922aefe700000000,
+    0x309da60400000000, 0x910db1ec00000000, 0x35f4441900000000,
+    0x946453f100000000, 0x36d31a1200000000, 0x97430dfa00000000,
+    0x3f26802200000000, 0x9eb697ca00000000, 0x3c01de2900000000,
+    0x9d91c9c100000000, 0x39683c3400000000, 0x98f82bdc00000000,
+    0x3a4f623f00000000, 0x9bdf75d700000000, 0x2b82095500000000,
+    0x8a121ebd00000000, 0x28a5575e00000000, 0x893540b600000000,
+    0x2dccb54300000000, 0x8c5ca2ab00000000, 0x2eebeb4800000000,
+    0x8f7bfca000000000, 0x271e717800000000, 0x868e669000000000,
+    0x24392f7300000000, 0x85a9389b00000000, 0x2150cd6e00000000,
+    0x80c0da8600000000, 0x2277936500000000, 0x83e7848d00000000,
+    0x222caf0a00000000, 0x83bcb8e200000000, 0x210bf10100000000,
+    0x809be6e900000000, 0x2462131c00000000, 0x85f204f400000000,
+    0x27454d1700000000, 0x86d55aff00000000, 0x2eb0d72700000000,
+    0x8f20c0cf00000000, 0x2d97892c00000000, 0x8c079ec400000000,
+    0x28fe6b3100000000, 0x896e7cd900000000, 0x2bd9353a00000000,
+    0x8a4922d200000000, 0x3a145e5000000000, 0x9b8449b800000000,
+    0x3933005b00000000, 0x98a317b300000000, 0x3c5ae24600000000,
+    0x9dcaf5ae00000000, 0x3f7dbc4d00000000, 0x9eedaba500000000,
+    0x3688267d00000000, 0x9718319500000000, 0x35af787600000000,
+    0x943f6f9e00000000, 0x30c69a6b00000000, 0x91568d8300000000,
+    0x33e1c46000000000, 0x9271d38800000000, 0x125c4dbf00000000,
+    0xb3cc5a5700000000, 0x117b13b400000000, 0xb0eb045c00000000,
+    0x1412f1a900000000, 0xb582e64100000000, 0x1735afa200000000,
+    0xb6a5b84a00000000, 0x1ec0359200000000, 0xbf50227a00000000,
+    0x1de76b9900000000, 0xbc777c7100000000, 0x188e898400000000,
+    0xb91e9e6c00000000, 0x1ba9d78f00000000, 0xba39c06700000000,
+    0x0a64bce500000000, 0xabf4ab0d00000000, 0x0943e2ee00000000,
+    0xa8d3f50600000000, 0x0c2a00f300000000, 0xadba171b00000000,
+    0x0f0d5ef800000000, 0xae9d491000000000, 0x06f8c4c800000000,
+    0xa768d32000000000, 0x05df9ac300000000, 0xa44f8d2b00000000,
+    0x00b678de00000000, 0xa1266f3600000000, 0x039126d500000000,
+    0xa201313d00000000},
+   {0x0000000000000000, 0xee8439a100000000, 0x9d0f029900000000,
+    0x738b3b3800000000, 0x7b1975e900000000, 0x959d4c4800000000,
+    0xe616777000000000, 0x08924ed100000000, 0xb7349b0900000000,
+    0x59b0a2a800000000, 0x2a3b999000000000, 0xc4bfa03100000000,
+    0xcc2deee000000000, 0x22a9d74100000000, 0x5122ec7900000000,
+    0xbfa6d5d800000000, 0x6e69361300000000, 0x80ed0fb200000000,
+    0xf366348a00000000, 0x1de20d2b00000000, 0x157043fa00000000,
+    0xfbf47a5b00000000, 0x887f416300000000, 0x66fb78c200000000,
+    0xd95dad1a00000000, 0x37d994bb00000000, 0x4452af8300000000,
+    0xaad6962200000000, 0xa244d8f300000000, 0x4cc0e15200000000,
+    0x3f4bda6a00000000, 0xd1cfe3cb00000000, 0xdcd26c2600000000,
+    0x3256558700000000, 0x41dd6ebf00000000, 0xaf59571e00000000,
+    0xa7cb19cf00000000, 0x494f206e00000000, 0x3ac41b5600000000,
+    0xd44022f700000000, 0x6be6f72f00000000, 0x8562ce8e00000000,
+    0xf6e9f5b600000000, 0x186dcc1700000000, 0x10ff82c600000000,
+    0xfe7bbb6700000000, 0x8df0805f00000000, 0x6374b9fe00000000,
+    0xb2bb5a3500000000, 0x5c3f639400000000, 0x2fb458ac00000000,
+    0xc130610d00000000, 0xc9a22fdc00000000, 0x2726167d00000000,
+    0x54ad2d4500000000, 0xba2914e400000000, 0x058fc13c00000000,
+    0xeb0bf89d00000000, 0x9880c3a500000000, 0x7604fa0400000000,
+    0x7e96b4d500000000, 0x90128d7400000000, 0xe399b64c00000000,
+    0x0d1d8fed00000000, 0xb8a5d94c00000000, 0x5621e0ed00000000,
+    0x25aadbd500000000, 0xcb2ee27400000000, 0xc3bcaca500000000,
+    0x2d38950400000000, 0x5eb3ae3c00000000, 0xb037979d00000000,
+    0x0f91424500000000, 0xe1157be400000000, 0x929e40dc00000000,
+    0x7c1a797d00000000, 0x748837ac00000000, 0x9a0c0e0d00000000,
+    0xe987353500000000, 0x07030c9400000000, 0xd6ccef5f00000000,
+    0x3848d6fe00000000, 0x4bc3edc600000000, 0xa547d46700000000,
+    0xadd59ab600000000, 0x4351a31700000000, 0x30da982f00000000,
+    0xde5ea18e00000000, 0x61f8745600000000, 0x8f7c4df700000000,
+    0xfcf776cf00000000, 0x12734f6e00000000, 0x1ae101bf00000000,
+    0xf465381e00000000, 0x87ee032600000000, 0x696a3a8700000000,
+    0x6477b56a00000000, 0x8af38ccb00000000, 0xf978b7f300000000,
+    0x17fc8e5200000000, 0x1f6ec08300000000, 0xf1eaf92200000000,
+    0x8261c21a00000000, 0x6ce5fbbb00000000, 0xd3432e6300000000,
+    0x3dc717c200000000, 0x4e4c2cfa00000000, 0xa0c8155b00000000,
+    0xa85a5b8a00000000, 0x46de622b00000000, 0x3555591300000000,
+    0xdbd160b200000000, 0x0a1e837900000000, 0xe49abad800000000,
+    0x971181e000000000, 0x7995b84100000000, 0x7107f69000000000,
+    0x9f83cf3100000000, 0xec08f40900000000, 0x028ccda800000000,
+    0xbd2a187000000000, 0x53ae21d100000000, 0x20251ae900000000,
+    0xcea1234800000000, 0xc6336d9900000000, 0x28b7543800000000,
+    0x5b3c6f0000000000, 0xb5b856a100000000, 0x704bb39900000000,
+    0x9ecf8a3800000000, 0xed44b10000000000, 0x03c088a100000000,
+    0x0b52c67000000000, 0xe5d6ffd100000000, 0x965dc4e900000000,
+    0x78d9fd4800000000, 0xc77f289000000000, 0x29fb113100000000,
+    0x5a702a0900000000, 0xb4f413a800000000, 0xbc665d7900000000,
+    0x52e264d800000000, 0x21695fe000000000, 0xcfed664100000000,
+    0x1e22858a00000000, 0xf0a6bc2b00000000, 0x832d871300000000,
+    0x6da9beb200000000, 0x653bf06300000000, 0x8bbfc9c200000000,
+    0xf834f2fa00000000, 0x16b0cb5b00000000, 0xa9161e8300000000,
+    0x4792272200000000, 0x34191c1a00000000, 0xda9d25bb00000000,
+    0xd20f6b6a00000000, 0x3c8b52cb00000000, 0x4f0069f300000000,
+    0xa184505200000000, 0xac99dfbf00000000, 0x421de61e00000000,
+    0x3196dd2600000000, 0xdf12e48700000000, 0xd780aa5600000000,
+    0x390493f700000000, 0x4a8fa8cf00000000, 0xa40b916e00000000,
+    0x1bad44b600000000, 0xf5297d1700000000, 0x86a2462f00000000,
+    0x68267f8e00000000, 0x60b4315f00000000, 0x8e3008fe00000000,
+    0xfdbb33c600000000, 0x133f0a6700000000, 0xc2f0e9ac00000000,
+    0x2c74d00d00000000, 0x5fffeb3500000000, 0xb17bd29400000000,
+    0xb9e99c4500000000, 0x576da5e400000000, 0x24e69edc00000000,
+    0xca62a77d00000000, 0x75c472a500000000, 0x9b404b0400000000,
+    0xe8cb703c00000000, 0x064f499d00000000, 0x0edd074c00000000,
+    0xe0593eed00000000, 0x93d205d500000000, 0x7d563c7400000000,
+    0xc8ee6ad500000000, 0x266a537400000000, 0x55e1684c00000000,
+    0xbb6551ed00000000, 0xb3f71f3c00000000, 0x5d73269d00000000,
+    0x2ef81da500000000, 0xc07c240400000000, 0x7fdaf1dc00000000,
+    0x915ec87d00000000, 0xe2d5f34500000000, 0x0c51cae400000000,
+    0x04c3843500000000, 0xea47bd9400000000, 0x99cc86ac00000000,
+    0x7748bf0d00000000, 0xa6875cc600000000, 0x4803656700000000,
+    0x3b885e5f00000000, 0xd50c67fe00000000, 0xdd9e292f00000000,
+    0x331a108e00000000, 0x40912bb600000000, 0xae15121700000000,
+    0x11b3c7cf00000000, 0xff37fe6e00000000, 0x8cbcc55600000000,
+    0x6238fcf700000000, 0x6aaab22600000000, 0x842e8b8700000000,
+    0xf7a5b0bf00000000, 0x1921891e00000000, 0x143c06f300000000,
+    0xfab83f5200000000, 0x8933046a00000000, 0x67b73dcb00000000,
+    0x6f25731a00000000, 0x81a14abb00000000, 0xf22a718300000000,
+    0x1cae482200000000, 0xa3089dfa00000000, 0x4d8ca45b00000000,
+    0x3e079f6300000000, 0xd083a6c200000000, 0xd811e81300000000,
+    0x3695d1b200000000, 0x451eea8a00000000, 0xab9ad32b00000000,
+    0x7a5530e000000000, 0x94d1094100000000, 0xe75a327900000000,
+    0x09de0bd800000000, 0x014c450900000000, 0xefc87ca800000000,
+    0x9c43479000000000, 0x72c77e3100000000, 0xcd61abe900000000,
+    0x23e5924800000000, 0x506ea97000000000, 0xbeea90d100000000,
+    0xb678de0000000000, 0x58fce7a100000000, 0x2b77dc9900000000,
+    0xc5f3e53800000000},
+   {0x0000000000000000, 0xfbf6134700000000, 0xf6ed278e00000000,
+    0x0d1b34c900000000, 0xaddd3ec700000000, 0x562b2d8000000000,
+    0x5b30194900000000, 0xa0c60a0e00000000, 0x1bbd0c5500000000,
+    0xe04b1f1200000000, 0xed502bdb00000000, 0x16a6389c00000000,
+    0xb660329200000000, 0x4d9621d500000000, 0x408d151c00000000,
+    0xbb7b065b00000000, 0x367a19aa00000000, 0xcd8c0aed00000000,
+    0xc0973e2400000000, 0x3b612d6300000000, 0x9ba7276d00000000,
+    0x6051342a00000000, 0x6d4a00e300000000, 0x96bc13a400000000,
+    0x2dc715ff00000000, 0xd63106b800000000, 0xdb2a327100000000,
+    0x20dc213600000000, 0x801a2b3800000000, 0x7bec387f00000000,
+    0x76f70cb600000000, 0x8d011ff100000000, 0x2df2438f00000000,
+    0xd60450c800000000, 0xdb1f640100000000, 0x20e9774600000000,
+    0x802f7d4800000000, 0x7bd96e0f00000000, 0x76c25ac600000000,
+    0x8d34498100000000, 0x364f4fda00000000, 0xcdb95c9d00000000,
+    0xc0a2685400000000, 0x3b547b1300000000, 0x9b92711d00000000,
+    0x6064625a00000000, 0x6d7f569300000000, 0x968945d400000000,
+    0x1b885a2500000000, 0xe07e496200000000, 0xed657dab00000000,
+    0x16936eec00000000, 0xb65564e200000000, 0x4da377a500000000,
+    0x40b8436c00000000, 0xbb4e502b00000000, 0x0035567000000000,
+    0xfbc3453700000000, 0xf6d871fe00000000, 0x0d2e62b900000000,
+    0xade868b700000000, 0x561e7bf000000000, 0x5b054f3900000000,
+    0xa0f35c7e00000000, 0x1be2f6c500000000, 0xe014e58200000000,
+    0xed0fd14b00000000, 0x16f9c20c00000000, 0xb63fc80200000000,
+    0x4dc9db4500000000, 0x40d2ef8c00000000, 0xbb24fccb00000000,
+    0x005ffa9000000000, 0xfba9e9d700000000, 0xf6b2dd1e00000000,
+    0x0d44ce5900000000, 0xad82c45700000000, 0x5674d71000000000,
+    0x5b6fe3d900000000, 0xa099f09e00000000, 0x2d98ef6f00000000,
+    0xd66efc2800000000, 0xdb75c8e100000000, 0x2083dba600000000,
+    0x8045d1a800000000, 0x7bb3c2ef00000000, 0x76a8f62600000000,
+    0x8d5ee56100000000, 0x3625e33a00000000, 0xcdd3f07d00000000,
+    0xc0c8c4b400000000, 0x3b3ed7f300000000, 0x9bf8ddfd00000000,
+    0x600eceba00000000, 0x6d15fa7300000000, 0x96e3e93400000000,
+    0x3610b54a00000000, 0xcde6a60d00000000, 0xc0fd92c400000000,
+    0x3b0b818300000000, 0x9bcd8b8d00000000, 0x603b98ca00000000,
+    0x6d20ac0300000000, 0x96d6bf4400000000, 0x2dadb91f00000000,
+    0xd65baa5800000000, 0xdb409e9100000000, 0x20b68dd600000000,
+    0x807087d800000000, 0x7b86949f00000000, 0x769da05600000000,
+    0x8d6bb31100000000, 0x006aace000000000, 0xfb9cbfa700000000,
+    0xf6878b6e00000000, 0x0d71982900000000, 0xadb7922700000000,
+    0x5641816000000000, 0x5b5ab5a900000000, 0xa0aca6ee00000000,
+    0x1bd7a0b500000000, 0xe021b3f200000000, 0xed3a873b00000000,
+    0x16cc947c00000000, 0xb60a9e7200000000, 0x4dfc8d3500000000,
+    0x40e7b9fc00000000, 0xbb11aabb00000000, 0x77c29c5000000000,
+    0x8c348f1700000000, 0x812fbbde00000000, 0x7ad9a89900000000,
+    0xda1fa29700000000, 0x21e9b1d000000000, 0x2cf2851900000000,
+    0xd704965e00000000, 0x6c7f900500000000, 0x9789834200000000,
+    0x9a92b78b00000000, 0x6164a4cc00000000, 0xc1a2aec200000000,
+    0x3a54bd8500000000, 0x374f894c00000000, 0xccb99a0b00000000,
+    0x41b885fa00000000, 0xba4e96bd00000000, 0xb755a27400000000,
+    0x4ca3b13300000000, 0xec65bb3d00000000, 0x1793a87a00000000,
+    0x1a889cb300000000, 0xe17e8ff400000000, 0x5a0589af00000000,
+    0xa1f39ae800000000, 0xace8ae2100000000, 0x571ebd6600000000,
+    0xf7d8b76800000000, 0x0c2ea42f00000000, 0x013590e600000000,
+    0xfac383a100000000, 0x5a30dfdf00000000, 0xa1c6cc9800000000,
+    0xacddf85100000000, 0x572beb1600000000, 0xf7ede11800000000,
+    0x0c1bf25f00000000, 0x0100c69600000000, 0xfaf6d5d100000000,
+    0x418dd38a00000000, 0xba7bc0cd00000000, 0xb760f40400000000,
+    0x4c96e74300000000, 0xec50ed4d00000000, 0x17a6fe0a00000000,
+    0x1abdcac300000000, 0xe14bd98400000000, 0x6c4ac67500000000,
+    0x97bcd53200000000, 0x9aa7e1fb00000000, 0x6151f2bc00000000,
+    0xc197f8b200000000, 0x3a61ebf500000000, 0x377adf3c00000000,
+    0xcc8ccc7b00000000, 0x77f7ca2000000000, 0x8c01d96700000000,
+    0x811aedae00000000, 0x7aecfee900000000, 0xda2af4e700000000,
+    0x21dce7a000000000, 0x2cc7d36900000000, 0xd731c02e00000000,
+    0x6c206a9500000000, 0x97d679d200000000, 0x9acd4d1b00000000,
+    0x613b5e5c00000000, 0xc1fd545200000000, 0x3a0b471500000000,
+    0x371073dc00000000, 0xcce6609b00000000, 0x779d66c000000000,
+    0x8c6b758700000000, 0x8170414e00000000, 0x7a86520900000000,
+    0xda40580700000000, 0x21b64b4000000000, 0x2cad7f8900000000,
+    0xd75b6cce00000000, 0x5a5a733f00000000, 0xa1ac607800000000,
+    0xacb754b100000000, 0x574147f600000000, 0xf7874df800000000,
+    0x0c715ebf00000000, 0x016a6a7600000000, 0xfa9c793100000000,
+    0x41e77f6a00000000, 0xba116c2d00000000, 0xb70a58e400000000,
+    0x4cfc4ba300000000, 0xec3a41ad00000000, 0x17cc52ea00000000,
+    0x1ad7662300000000, 0xe121756400000000, 0x41d2291a00000000,
+    0xba243a5d00000000, 0xb73f0e9400000000, 0x4cc91dd300000000,
+    0xec0f17dd00000000, 0x17f9049a00000000, 0x1ae2305300000000,
+    0xe114231400000000, 0x5a6f254f00000000, 0xa199360800000000,
+    0xac8202c100000000, 0x5774118600000000, 0xf7b21b8800000000,
+    0x0c4408cf00000000, 0x015f3c0600000000, 0xfaa92f4100000000,
+    0x77a830b000000000, 0x8c5e23f700000000, 0x8145173e00000000,
+    0x7ab3047900000000, 0xda750e7700000000, 0x21831d3000000000,
+    0x2c9829f900000000, 0xd76e3abe00000000, 0x6c153ce500000000,
+    0x97e32fa200000000, 0x9af81b6b00000000, 0x610e082c00000000,
+    0xc1c8022200000000, 0x3a3e116500000000, 0x372525ac00000000,
+    0xccd336eb00000000},
+   {0x0000000000000000, 0x6238282a00000000, 0xc470505400000000,
+    0xa648787e00000000, 0x88e1a0a800000000, 0xead9888200000000,
+    0x4c91f0fc00000000, 0x2ea9d8d600000000, 0x51c5308a00000000,
+    0x33fd18a000000000, 0x95b560de00000000, 0xf78d48f400000000,
+    0xd924902200000000, 0xbb1cb80800000000, 0x1d54c07600000000,
+    0x7f6ce85c00000000, 0xe38c10cf00000000, 0x81b438e500000000,
+    0x27fc409b00000000, 0x45c468b100000000, 0x6b6db06700000000,
+    0x0955984d00000000, 0xaf1de03300000000, 0xcd25c81900000000,
+    0xb249204500000000, 0xd071086f00000000, 0x7639701100000000,
+    0x1401583b00000000, 0x3aa880ed00000000, 0x5890a8c700000000,
+    0xfed8d0b900000000, 0x9ce0f89300000000, 0x871f504500000000,
+    0xe527786f00000000, 0x436f001100000000, 0x2157283b00000000,
+    0x0ffef0ed00000000, 0x6dc6d8c700000000, 0xcb8ea0b900000000,
+    0xa9b6889300000000, 0xd6da60cf00000000, 0xb4e248e500000000,
+    0x12aa309b00000000, 0x709218b100000000, 0x5e3bc06700000000,
+    0x3c03e84d00000000, 0x9a4b903300000000, 0xf873b81900000000,
+    0x6493408a00000000, 0x06ab68a000000000, 0xa0e310de00000000,
+    0xc2db38f400000000, 0xec72e02200000000, 0x8e4ac80800000000,
+    0x2802b07600000000, 0x4a3a985c00000000, 0x3556700000000000,
+    0x576e582a00000000, 0xf126205400000000, 0x931e087e00000000,
+    0xbdb7d0a800000000, 0xdf8ff88200000000, 0x79c780fc00000000,
+    0x1bffa8d600000000, 0x0e3fa08a00000000, 0x6c0788a000000000,
+    0xca4ff0de00000000, 0xa877d8f400000000, 0x86de002200000000,
+    0xe4e6280800000000, 0x42ae507600000000, 0x2096785c00000000,
+    0x5ffa900000000000, 0x3dc2b82a00000000, 0x9b8ac05400000000,
+    0xf9b2e87e00000000, 0xd71b30a800000000, 0xb523188200000000,
+    0x136b60fc00000000, 0x715348d600000000, 0xedb3b04500000000,
+    0x8f8b986f00000000, 0x29c3e01100000000, 0x4bfbc83b00000000,
+    0x655210ed00000000, 0x076a38c700000000, 0xa12240b900000000,
+    0xc31a689300000000, 0xbc7680cf00000000, 0xde4ea8e500000000,
+    0x7806d09b00000000, 0x1a3ef8b100000000, 0x3497206700000000,
+    0x56af084d00000000, 0xf0e7703300000000, 0x92df581900000000,
+    0x8920f0cf00000000, 0xeb18d8e500000000, 0x4d50a09b00000000,
+    0x2f6888b100000000, 0x01c1506700000000, 0x63f9784d00000000,
+    0xc5b1003300000000, 0xa789281900000000, 0xd8e5c04500000000,
+    0xbadde86f00000000, 0x1c95901100000000, 0x7eadb83b00000000,
+    0x500460ed00000000, 0x323c48c700000000, 0x947430b900000000,
+    0xf64c189300000000, 0x6aace00000000000, 0x0894c82a00000000,
+    0xaedcb05400000000, 0xcce4987e00000000, 0xe24d40a800000000,
+    0x8075688200000000, 0x263d10fc00000000, 0x440538d600000000,
+    0x3b69d08a00000000, 0x5951f8a000000000, 0xff1980de00000000,
+    0x9d21a8f400000000, 0xb388702200000000, 0xd1b0580800000000,
+    0x77f8207600000000, 0x15c0085c00000000, 0x5d7831ce00000000,
+    0x3f4019e400000000, 0x9908619a00000000, 0xfb3049b000000000,
+    0xd599916600000000, 0xb7a1b94c00000000, 0x11e9c13200000000,
+    0x73d1e91800000000, 0x0cbd014400000000, 0x6e85296e00000000,
+    0xc8cd511000000000, 0xaaf5793a00000000, 0x845ca1ec00000000,
+    0xe66489c600000000, 0x402cf1b800000000, 0x2214d99200000000,
+    0xbef4210100000000, 0xdccc092b00000000, 0x7a84715500000000,
+    0x18bc597f00000000, 0x361581a900000000, 0x542da98300000000,
+    0xf265d1fd00000000, 0x905df9d700000000, 0xef31118b00000000,
+    0x8d0939a100000000, 0x2b4141df00000000, 0x497969f500000000,
+    0x67d0b12300000000, 0x05e8990900000000, 0xa3a0e17700000000,
+    0xc198c95d00000000, 0xda67618b00000000, 0xb85f49a100000000,
+    0x1e1731df00000000, 0x7c2f19f500000000, 0x5286c12300000000,
+    0x30bee90900000000, 0x96f6917700000000, 0xf4ceb95d00000000,
+    0x8ba2510100000000, 0xe99a792b00000000, 0x4fd2015500000000,
+    0x2dea297f00000000, 0x0343f1a900000000, 0x617bd98300000000,
+    0xc733a1fd00000000, 0xa50b89d700000000, 0x39eb714400000000,
+    0x5bd3596e00000000, 0xfd9b211000000000, 0x9fa3093a00000000,
+    0xb10ad1ec00000000, 0xd332f9c600000000, 0x757a81b800000000,
+    0x1742a99200000000, 0x682e41ce00000000, 0x0a1669e400000000,
+    0xac5e119a00000000, 0xce6639b000000000, 0xe0cfe16600000000,
+    0x82f7c94c00000000, 0x24bfb13200000000, 0x4687991800000000,
+    0x5347914400000000, 0x317fb96e00000000, 0x9737c11000000000,
+    0xf50fe93a00000000, 0xdba631ec00000000, 0xb99e19c600000000,
+    0x1fd661b800000000, 0x7dee499200000000, 0x0282a1ce00000000,
+    0x60ba89e400000000, 0xc6f2f19a00000000, 0xa4cad9b000000000,
+    0x8a63016600000000, 0xe85b294c00000000, 0x4e13513200000000,
+    0x2c2b791800000000, 0xb0cb818b00000000, 0xd2f3a9a100000000,
+    0x74bbd1df00000000, 0x1683f9f500000000, 0x382a212300000000,
+    0x5a12090900000000, 0xfc5a717700000000, 0x9e62595d00000000,
+    0xe10eb10100000000, 0x8336992b00000000, 0x257ee15500000000,
+    0x4746c97f00000000, 0x69ef11a900000000, 0x0bd7398300000000,
+    0xad9f41fd00000000, 0xcfa769d700000000, 0xd458c10100000000,
+    0xb660e92b00000000, 0x1028915500000000, 0x7210b97f00000000,
+    0x5cb961a900000000, 0x3e81498300000000, 0x98c931fd00000000,
+    0xfaf119d700000000, 0x859df18b00000000, 0xe7a5d9a100000000,
+    0x41eda1df00000000, 0x23d589f500000000, 0x0d7c512300000000,
+    0x6f44790900000000, 0xc90c017700000000, 0xab34295d00000000,
+    0x37d4d1ce00000000, 0x55ecf9e400000000, 0xf3a4819a00000000,
+    0x919ca9b000000000, 0xbf35716600000000, 0xdd0d594c00000000,
+    0x7b45213200000000, 0x197d091800000000, 0x6611e14400000000,
+    0x0429c96e00000000, 0xa261b11000000000, 0xc059993a00000000,
+    0xeef041ec00000000, 0x8cc869c600000000, 0x2a8011b800000000,
+    0x48b8399200000000},
+   {0x0000000000000000, 0x4c2896a300000000, 0xd9565d9c00000000,
+    0x957ecb3f00000000, 0xf3abcbe300000000, 0xbf835d4000000000,
+    0x2afd967f00000000, 0x66d500dc00000000, 0xa751e61c00000000,
+    0xeb7970bf00000000, 0x7e07bb8000000000, 0x322f2d2300000000,
+    0x54fa2dff00000000, 0x18d2bb5c00000000, 0x8dac706300000000,
+    0xc184e6c000000000, 0x4ea3cc3900000000, 0x028b5a9a00000000,
+    0x97f591a500000000, 0xdbdd070600000000, 0xbd0807da00000000,
+    0xf120917900000000, 0x645e5a4600000000, 0x2876cce500000000,
+    0xe9f22a2500000000, 0xa5dabc8600000000, 0x30a477b900000000,
+    0x7c8ce11a00000000, 0x1a59e1c600000000, 0x5671776500000000,
+    0xc30fbc5a00000000, 0x8f272af900000000, 0x9c46997300000000,
+    0xd06e0fd000000000, 0x4510c4ef00000000, 0x0938524c00000000,
+    0x6fed529000000000, 0x23c5c43300000000, 0xb6bb0f0c00000000,
+    0xfa9399af00000000, 0x3b177f6f00000000, 0x773fe9cc00000000,
+    0xe24122f300000000, 0xae69b45000000000, 0xc8bcb48c00000000,
+    0x8494222f00000000, 0x11eae91000000000, 0x5dc27fb300000000,
+    0xd2e5554a00000000, 0x9ecdc3e900000000, 0x0bb308d600000000,
+    0x479b9e7500000000, 0x214e9ea900000000, 0x6d66080a00000000,
+    0xf818c33500000000, 0xb430559600000000, 0x75b4b35600000000,
+    0x399c25f500000000, 0xace2eeca00000000, 0xe0ca786900000000,
+    0x861f78b500000000, 0xca37ee1600000000, 0x5f49252900000000,
+    0x1361b38a00000000, 0x388d32e700000000, 0x74a5a44400000000,
+    0xe1db6f7b00000000, 0xadf3f9d800000000, 0xcb26f90400000000,
+    0x870e6fa700000000, 0x1270a49800000000, 0x5e58323b00000000,
+    0x9fdcd4fb00000000, 0xd3f4425800000000, 0x468a896700000000,
+    0x0aa21fc400000000, 0x6c771f1800000000, 0x205f89bb00000000,
+    0xb521428400000000, 0xf909d42700000000, 0x762efede00000000,
+    0x3a06687d00000000, 0xaf78a34200000000, 0xe35035e100000000,
+    0x8585353d00000000, 0xc9ada39e00000000, 0x5cd368a100000000,
+    0x10fbfe0200000000, 0xd17f18c200000000, 0x9d578e6100000000,
+    0x0829455e00000000, 0x4401d3fd00000000, 0x22d4d32100000000,
+    0x6efc458200000000, 0xfb828ebd00000000, 0xb7aa181e00000000,
+    0xa4cbab9400000000, 0xe8e33d3700000000, 0x7d9df60800000000,
+    0x31b560ab00000000, 0x5760607700000000, 0x1b48f6d400000000,
+    0x8e363deb00000000, 0xc21eab4800000000, 0x039a4d8800000000,
+    0x4fb2db2b00000000, 0xdacc101400000000, 0x96e486b700000000,
+    0xf031866b00000000, 0xbc1910c800000000, 0x2967dbf700000000,
+    0x654f4d5400000000, 0xea6867ad00000000, 0xa640f10e00000000,
+    0x333e3a3100000000, 0x7f16ac9200000000, 0x19c3ac4e00000000,
+    0x55eb3aed00000000, 0xc095f1d200000000, 0x8cbd677100000000,
+    0x4d3981b100000000, 0x0111171200000000, 0x946fdc2d00000000,
+    0xd8474a8e00000000, 0xbe924a5200000000, 0xf2badcf100000000,
+    0x67c417ce00000000, 0x2bec816d00000000, 0x311c141500000000,
+    0x7d3482b600000000, 0xe84a498900000000, 0xa462df2a00000000,
+    0xc2b7dff600000000, 0x8e9f495500000000, 0x1be1826a00000000,
+    0x57c914c900000000, 0x964df20900000000, 0xda6564aa00000000,
+    0x4f1baf9500000000, 0x0333393600000000, 0x65e639ea00000000,
+    0x29ceaf4900000000, 0xbcb0647600000000, 0xf098f2d500000000,
+    0x7fbfd82c00000000, 0x33974e8f00000000, 0xa6e985b000000000,
+    0xeac1131300000000, 0x8c1413cf00000000, 0xc03c856c00000000,
+    0x55424e5300000000, 0x196ad8f000000000, 0xd8ee3e3000000000,
+    0x94c6a89300000000, 0x01b863ac00000000, 0x4d90f50f00000000,
+    0x2b45f5d300000000, 0x676d637000000000, 0xf213a84f00000000,
+    0xbe3b3eec00000000, 0xad5a8d6600000000, 0xe1721bc500000000,
+    0x740cd0fa00000000, 0x3824465900000000, 0x5ef1468500000000,
+    0x12d9d02600000000, 0x87a71b1900000000, 0xcb8f8dba00000000,
+    0x0a0b6b7a00000000, 0x4623fdd900000000, 0xd35d36e600000000,
+    0x9f75a04500000000, 0xf9a0a09900000000, 0xb588363a00000000,
+    0x20f6fd0500000000, 0x6cde6ba600000000, 0xe3f9415f00000000,
+    0xafd1d7fc00000000, 0x3aaf1cc300000000, 0x76878a6000000000,
+    0x10528abc00000000, 0x5c7a1c1f00000000, 0xc904d72000000000,
+    0x852c418300000000, 0x44a8a74300000000, 0x088031e000000000,
+    0x9dfefadf00000000, 0xd1d66c7c00000000, 0xb7036ca000000000,
+    0xfb2bfa0300000000, 0x6e55313c00000000, 0x227da79f00000000,
+    0x099126f200000000, 0x45b9b05100000000, 0xd0c77b6e00000000,
+    0x9cefedcd00000000, 0xfa3aed1100000000, 0xb6127bb200000000,
+    0x236cb08d00000000, 0x6f44262e00000000, 0xaec0c0ee00000000,
+    0xe2e8564d00000000, 0x77969d7200000000, 0x3bbe0bd100000000,
+    0x5d6b0b0d00000000, 0x11439dae00000000, 0x843d569100000000,
+    0xc815c03200000000, 0x4732eacb00000000, 0x0b1a7c6800000000,
+    0x9e64b75700000000, 0xd24c21f400000000, 0xb499212800000000,
+    0xf8b1b78b00000000, 0x6dcf7cb400000000, 0x21e7ea1700000000,
+    0xe0630cd700000000, 0xac4b9a7400000000, 0x3935514b00000000,
+    0x751dc7e800000000, 0x13c8c73400000000, 0x5fe0519700000000,
+    0xca9e9aa800000000, 0x86b60c0b00000000, 0x95d7bf8100000000,
+    0xd9ff292200000000, 0x4c81e21d00000000, 0x00a974be00000000,
+    0x667c746200000000, 0x2a54e2c100000000, 0xbf2a29fe00000000,
+    0xf302bf5d00000000, 0x3286599d00000000, 0x7eaecf3e00000000,
+    0xebd0040100000000, 0xa7f892a200000000, 0xc12d927e00000000,
+    0x8d0504dd00000000, 0x187bcfe200000000, 0x5453594100000000,
+    0xdb7473b800000000, 0x975ce51b00000000, 0x02222e2400000000,
+    0x4e0ab88700000000, 0x28dfb85b00000000, 0x64f72ef800000000,
+    0xf189e5c700000000, 0xbda1736400000000, 0x7c2595a400000000,
+    0x300d030700000000, 0xa573c83800000000, 0xe95b5e9b00000000,
+    0x8f8e5e4700000000, 0xc3a6c8e400000000, 0x56d803db00000000,
+    0x1af0957800000000},
+   {0x0000000000000000, 0x939bc97f00000000, 0x263793ff00000000,
+    0xb5ac5a8000000000, 0x0d68572400000000, 0x9ef39e5b00000000,
+    0x2b5fc4db00000000, 0xb8c40da400000000, 0x1ad0ae4800000000,
+    0x894b673700000000, 0x3ce73db700000000, 0xaf7cf4c800000000,
+    0x17b8f96c00000000, 0x8423301300000000, 0x318f6a9300000000,
+    0xa214a3ec00000000, 0x34a05d9100000000, 0xa73b94ee00000000,
+    0x1297ce6e00000000, 0x810c071100000000, 0x39c80ab500000000,
+    0xaa53c3ca00000000, 0x1fff994a00000000, 0x8c64503500000000,
+    0x2e70f3d900000000, 0xbdeb3aa600000000, 0x0847602600000000,
+    0x9bdca95900000000, 0x2318a4fd00000000, 0xb0836d8200000000,
+    0x052f370200000000, 0x96b4fe7d00000000, 0x2946caf900000000,
+    0xbadd038600000000, 0x0f71590600000000, 0x9cea907900000000,
+    0x242e9ddd00000000, 0xb7b554a200000000, 0x02190e2200000000,
+    0x9182c75d00000000, 0x339664b100000000, 0xa00dadce00000000,
+    0x15a1f74e00000000, 0x863a3e3100000000, 0x3efe339500000000,
+    0xad65faea00000000, 0x18c9a06a00000000, 0x8b52691500000000,
+    0x1de6976800000000, 0x8e7d5e1700000000, 0x3bd1049700000000,
+    0xa84acde800000000, 0x108ec04c00000000, 0x8315093300000000,
+    0x36b953b300000000, 0xa5229acc00000000, 0x0736392000000000,
+    0x94adf05f00000000, 0x2101aadf00000000, 0xb29a63a000000000,
+    0x0a5e6e0400000000, 0x99c5a77b00000000, 0x2c69fdfb00000000,
+    0xbff2348400000000, 0x138ae52800000000, 0x80112c5700000000,
+    0x35bd76d700000000, 0xa626bfa800000000, 0x1ee2b20c00000000,
+    0x8d797b7300000000, 0x38d521f300000000, 0xab4ee88c00000000,
+    0x095a4b6000000000, 0x9ac1821f00000000, 0x2f6dd89f00000000,
+    0xbcf611e000000000, 0x04321c4400000000, 0x97a9d53b00000000,
+    0x22058fbb00000000, 0xb19e46c400000000, 0x272ab8b900000000,
+    0xb4b171c600000000, 0x011d2b4600000000, 0x9286e23900000000,
+    0x2a42ef9d00000000, 0xb9d926e200000000, 0x0c757c6200000000,
+    0x9feeb51d00000000, 0x3dfa16f100000000, 0xae61df8e00000000,
+    0x1bcd850e00000000, 0x88564c7100000000, 0x309241d500000000,
+    0xa30988aa00000000, 0x16a5d22a00000000, 0x853e1b5500000000,
+    0x3acc2fd100000000, 0xa957e6ae00000000, 0x1cfbbc2e00000000,
+    0x8f60755100000000, 0x37a478f500000000, 0xa43fb18a00000000,
+    0x1193eb0a00000000, 0x8208227500000000, 0x201c819900000000,
+    0xb38748e600000000, 0x062b126600000000, 0x95b0db1900000000,
+    0x2d74d6bd00000000, 0xbeef1fc200000000, 0x0b43454200000000,
+    0x98d88c3d00000000, 0x0e6c724000000000, 0x9df7bb3f00000000,
+    0x285be1bf00000000, 0xbbc028c000000000, 0x0304256400000000,
+    0x909fec1b00000000, 0x2533b69b00000000, 0xb6a87fe400000000,
+    0x14bcdc0800000000, 0x8727157700000000, 0x328b4ff700000000,
+    0xa110868800000000, 0x19d48b2c00000000, 0x8a4f425300000000,
+    0x3fe318d300000000, 0xac78d1ac00000000, 0x2614cb5100000000,
+    0xb58f022e00000000, 0x002358ae00000000, 0x93b891d100000000,
+    0x2b7c9c7500000000, 0xb8e7550a00000000, 0x0d4b0f8a00000000,
+    0x9ed0c6f500000000, 0x3cc4651900000000, 0xaf5fac6600000000,
+    0x1af3f6e600000000, 0x89683f9900000000, 0x31ac323d00000000,
+    0xa237fb4200000000, 0x179ba1c200000000, 0x840068bd00000000,
+    0x12b496c000000000, 0x812f5fbf00000000, 0x3483053f00000000,
+    0xa718cc4000000000, 0x1fdcc1e400000000, 0x8c47089b00000000,
+    0x39eb521b00000000, 0xaa709b6400000000, 0x0864388800000000,
+    0x9bfff1f700000000, 0x2e53ab7700000000, 0xbdc8620800000000,
+    0x050c6fac00000000, 0x9697a6d300000000, 0x233bfc5300000000,
+    0xb0a0352c00000000, 0x0f5201a800000000, 0x9cc9c8d700000000,
+    0x2965925700000000, 0xbafe5b2800000000, 0x023a568c00000000,
+    0x91a19ff300000000, 0x240dc57300000000, 0xb7960c0c00000000,
+    0x1582afe000000000, 0x8619669f00000000, 0x33b53c1f00000000,
+    0xa02ef56000000000, 0x18eaf8c400000000, 0x8b7131bb00000000,
+    0x3edd6b3b00000000, 0xad46a24400000000, 0x3bf25c3900000000,
+    0xa869954600000000, 0x1dc5cfc600000000, 0x8e5e06b900000000,
+    0x369a0b1d00000000, 0xa501c26200000000, 0x10ad98e200000000,
+    0x8336519d00000000, 0x2122f27100000000, 0xb2b93b0e00000000,
+    0x0715618e00000000, 0x948ea8f100000000, 0x2c4aa55500000000,
+    0xbfd16c2a00000000, 0x0a7d36aa00000000, 0x99e6ffd500000000,
+    0x359e2e7900000000, 0xa605e70600000000, 0x13a9bd8600000000,
+    0x803274f900000000, 0x38f6795d00000000, 0xab6db02200000000,
+    0x1ec1eaa200000000, 0x8d5a23dd00000000, 0x2f4e803100000000,
+    0xbcd5494e00000000, 0x097913ce00000000, 0x9ae2dab100000000,
+    0x2226d71500000000, 0xb1bd1e6a00000000, 0x041144ea00000000,
+    0x978a8d9500000000, 0x013e73e800000000, 0x92a5ba9700000000,
+    0x2709e01700000000, 0xb492296800000000, 0x0c5624cc00000000,
+    0x9fcdedb300000000, 0x2a61b73300000000, 0xb9fa7e4c00000000,
+    0x1beedda000000000, 0x887514df00000000, 0x3dd94e5f00000000,
+    0xae42872000000000, 0x16868a8400000000, 0x851d43fb00000000,
+    0x30b1197b00000000, 0xa32ad00400000000, 0x1cd8e48000000000,
+    0x8f432dff00000000, 0x3aef777f00000000, 0xa974be0000000000,
+    0x11b0b3a400000000, 0x822b7adb00000000, 0x3787205b00000000,
+    0xa41ce92400000000, 0x06084ac800000000, 0x959383b700000000,
+    0x203fd93700000000, 0xb3a4104800000000, 0x0b601dec00000000,
+    0x98fbd49300000000, 0x2d578e1300000000, 0xbecc476c00000000,
+    0x2878b91100000000, 0xbbe3706e00000000, 0x0e4f2aee00000000,
+    0x9dd4e39100000000, 0x2510ee3500000000, 0xb68b274a00000000,
+    0x03277dca00000000, 0x90bcb4b500000000, 0x32a8175900000000,
+    0xa133de2600000000, 0x149f84a600000000, 0x87044dd900000000,
+    0x3fc0407d00000000, 0xac5b890200000000, 0x19f7d38200000000,
+    0x8a6c1afd00000000},
+   {0x0000000000000000, 0x650b796900000000, 0xca16f2d200000000,
+    0xaf1d8bbb00000000, 0xd52b957e00000000, 0xb020ec1700000000,
+    0x1f3d67ac00000000, 0x7a361ec500000000, 0xaa572afd00000000,
+    0xcf5c539400000000, 0x6041d82f00000000, 0x054aa14600000000,
+    0x7f7cbf8300000000, 0x1a77c6ea00000000, 0xb56a4d5100000000,
+    0xd061343800000000, 0x15a9252100000000, 0x70a25c4800000000,
+    0xdfbfd7f300000000, 0xbab4ae9a00000000, 0xc082b05f00000000,
+    0xa589c93600000000, 0x0a94428d00000000, 0x6f9f3be400000000,
+    0xbffe0fdc00000000, 0xdaf576b500000000, 0x75e8fd0e00000000,
+    0x10e3846700000000, 0x6ad59aa200000000, 0x0fdee3cb00000000,
+    0xa0c3687000000000, 0xc5c8111900000000, 0x2a524b4200000000,
+    0x4f59322b00000000, 0xe044b99000000000, 0x854fc0f900000000,
+    0xff79de3c00000000, 0x9a72a75500000000, 0x356f2cee00000000,
+    0x5064558700000000, 0x800561bf00000000, 0xe50e18d600000000,
+    0x4a13936d00000000, 0x2f18ea0400000000, 0x552ef4c100000000,
+    0x30258da800000000, 0x9f38061300000000, 0xfa337f7a00000000,
+    0x3ffb6e6300000000, 0x5af0170a00000000, 0xf5ed9cb100000000,
+    0x90e6e5d800000000, 0xead0fb1d00000000, 0x8fdb827400000000,
+    0x20c609cf00000000, 0x45cd70a600000000, 0x95ac449e00000000,
+    0xf0a73df700000000, 0x5fbab64c00000000, 0x3ab1cf2500000000,
+    0x4087d1e000000000, 0x258ca88900000000, 0x8a91233200000000,
+    0xef9a5a5b00000000, 0x54a4968400000000, 0x31afefed00000000,
+    0x9eb2645600000000, 0xfbb91d3f00000000, 0x818f03fa00000000,
+    0xe4847a9300000000, 0x4b99f12800000000, 0x2e92884100000000,
+    0xfef3bc7900000000, 0x9bf8c51000000000, 0x34e54eab00000000,
+    0x51ee37c200000000, 0x2bd8290700000000, 0x4ed3506e00000000,
+    0xe1cedbd500000000, 0x84c5a2bc00000000, 0x410db3a500000000,
+    0x2406cacc00000000, 0x8b1b417700000000, 0xee10381e00000000,
+    0x942626db00000000, 0xf12d5fb200000000, 0x5e30d40900000000,
+    0x3b3bad6000000000, 0xeb5a995800000000, 0x8e51e03100000000,
+    0x214c6b8a00000000, 0x444712e300000000, 0x3e710c2600000000,
+    0x5b7a754f00000000, 0xf467fef400000000, 0x916c879d00000000,
+    0x7ef6ddc600000000, 0x1bfda4af00000000, 0xb4e02f1400000000,
+    0xd1eb567d00000000, 0xabdd48b800000000, 0xced631d100000000,
+    0x61cbba6a00000000, 0x04c0c30300000000, 0xd4a1f73b00000000,
+    0xb1aa8e5200000000, 0x1eb705e900000000, 0x7bbc7c8000000000,
+    0x018a624500000000, 0x64811b2c00000000, 0xcb9c909700000000,
+    0xae97e9fe00000000, 0x6b5ff8e700000000, 0x0e54818e00000000,
+    0xa1490a3500000000, 0xc442735c00000000, 0xbe746d9900000000,
+    0xdb7f14f000000000, 0x74629f4b00000000, 0x1169e62200000000,
+    0xc108d21a00000000, 0xa403ab7300000000, 0x0b1e20c800000000,
+    0x6e1559a100000000, 0x1423476400000000, 0x71283e0d00000000,
+    0xde35b5b600000000, 0xbb3eccdf00000000, 0xe94e5cd200000000,
+    0x8c4525bb00000000, 0x2358ae0000000000, 0x4653d76900000000,
+    0x3c65c9ac00000000, 0x596eb0c500000000, 0xf6733b7e00000000,
+    0x9378421700000000, 0x4319762f00000000, 0x26120f4600000000,
+    0x890f84fd00000000, 0xec04fd9400000000, 0x9632e35100000000,
+    0xf3399a3800000000, 0x5c24118300000000, 0x392f68ea00000000,
+    0xfce779f300000000, 0x99ec009a00000000, 0x36f18b2100000000,
+    0x53faf24800000000, 0x29ccec8d00000000, 0x4cc795e400000000,
+    0xe3da1e5f00000000, 0x86d1673600000000, 0x56b0530e00000000,
+    0x33bb2a6700000000, 0x9ca6a1dc00000000, 0xf9add8b500000000,
+    0x839bc67000000000, 0xe690bf1900000000, 0x498d34a200000000,
+    0x2c864dcb00000000, 0xc31c179000000000, 0xa6176ef900000000,
+    0x090ae54200000000, 0x6c019c2b00000000, 0x163782ee00000000,
+    0x733cfb8700000000, 0xdc21703c00000000, 0xb92a095500000000,
+    0x694b3d6d00000000, 0x0c40440400000000, 0xa35dcfbf00000000,
+    0xc656b6d600000000, 0xbc60a81300000000, 0xd96bd17a00000000,
+    0x76765ac100000000, 0x137d23a800000000, 0xd6b532b100000000,
+    0xb3be4bd800000000, 0x1ca3c06300000000, 0x79a8b90a00000000,
+    0x039ea7cf00000000, 0x6695dea600000000, 0xc988551d00000000,
+    0xac832c7400000000, 0x7ce2184c00000000, 0x19e9612500000000,
+    0xb6f4ea9e00000000, 0xd3ff93f700000000, 0xa9c98d3200000000,
+    0xccc2f45b00000000, 0x63df7fe000000000, 0x06d4068900000000,
+    0xbdeaca5600000000, 0xd8e1b33f00000000, 0x77fc388400000000,
+    0x12f741ed00000000, 0x68c15f2800000000, 0x0dca264100000000,
+    0xa2d7adfa00000000, 0xc7dcd49300000000, 0x17bde0ab00000000,
+    0x72b699c200000000, 0xddab127900000000, 0xb8a06b1000000000,
+    0xc29675d500000000, 0xa79d0cbc00000000, 0x0880870700000000,
+    0x6d8bfe6e00000000, 0xa843ef7700000000, 0xcd48961e00000000,
+    0x62551da500000000, 0x075e64cc00000000, 0x7d687a0900000000,
+    0x1863036000000000, 0xb77e88db00000000, 0xd275f1b200000000,
+    0x0214c58a00000000, 0x671fbce300000000, 0xc802375800000000,
+    0xad094e3100000000, 0xd73f50f400000000, 0xb234299d00000000,
+    0x1d29a22600000000, 0x7822db4f00000000, 0x97b8811400000000,
+    0xf2b3f87d00000000, 0x5dae73c600000000, 0x38a50aaf00000000,
+    0x4293146a00000000, 0x27986d0300000000, 0x8885e6b800000000,
+    0xed8e9fd100000000, 0x3defabe900000000, 0x58e4d28000000000,
+    0xf7f9593b00000000, 0x92f2205200000000, 0xe8c43e9700000000,
+    0x8dcf47fe00000000, 0x22d2cc4500000000, 0x47d9b52c00000000,
+    0x8211a43500000000, 0xe71add5c00000000, 0x480756e700000000,
+    0x2d0c2f8e00000000, 0x573a314b00000000, 0x3231482200000000,
+    0x9d2cc39900000000, 0xf827baf000000000, 0x28468ec800000000,
+    0x4d4df7a100000000, 0xe2507c1a00000000, 0x875b057300000000,
+    0xfd6d1bb600000000, 0x986662df00000000, 0x377be96400000000,
+    0x5270900d00000000},
+   {0x0000000000000000, 0xdcecb13d00000000, 0xb8d9637b00000000,
+    0x6435d24600000000, 0x70b3c7f600000000, 0xac5f76cb00000000,
+    0xc86aa48d00000000, 0x148615b000000000, 0xa160fe3600000000,
+    0x7d8c4f0b00000000, 0x19b99d4d00000000, 0xc5552c7000000000,
+    0xd1d339c000000000, 0x0d3f88fd00000000, 0x690a5abb00000000,
+    0xb5e6eb8600000000, 0x42c1fc6d00000000, 0x9e2d4d5000000000,
+    0xfa189f1600000000, 0x26f42e2b00000000, 0x32723b9b00000000,
+    0xee9e8aa600000000, 0x8aab58e000000000, 0x5647e9dd00000000,
+    0xe3a1025b00000000, 0x3f4db36600000000, 0x5b78612000000000,
+    0x8794d01d00000000, 0x9312c5ad00000000, 0x4ffe749000000000,
+    0x2bcba6d600000000, 0xf72717eb00000000, 0x8482f9db00000000,
+    0x586e48e600000000, 0x3c5b9aa000000000, 0xe0b72b9d00000000,
+    0xf4313e2d00000000, 0x28dd8f1000000000, 0x4ce85d5600000000,
+    0x9004ec6b00000000, 0x25e207ed00000000, 0xf90eb6d000000000,
+    0x9d3b649600000000, 0x41d7d5ab00000000, 0x5551c01b00000000,
+    0x89bd712600000000, 0xed88a36000000000, 0x3164125d00000000,
+    0xc64305b600000000, 0x1aafb48b00000000, 0x7e9a66cd00000000,
+    0xa276d7f000000000, 0xb6f0c24000000000, 0x6a1c737d00000000,
+    0x0e29a13b00000000, 0xd2c5100600000000, 0x6723fb8000000000,
+    0xbbcf4abd00000000, 0xdffa98fb00000000, 0x031629c600000000,
+    0x17903c7600000000, 0xcb7c8d4b00000000, 0xaf495f0d00000000,
+    0x73a5ee3000000000, 0x4903826c00000000, 0x95ef335100000000,
+    0xf1dae11700000000, 0x2d36502a00000000, 0x39b0459a00000000,
+    0xe55cf4a700000000, 0x816926e100000000, 0x5d8597dc00000000,
+    0xe8637c5a00000000, 0x348fcd6700000000, 0x50ba1f2100000000,
+    0x8c56ae1c00000000, 0x98d0bbac00000000, 0x443c0a9100000000,
+    0x2009d8d700000000, 0xfce569ea00000000, 0x0bc27e0100000000,
+    0xd72ecf3c00000000, 0xb31b1d7a00000000, 0x6ff7ac4700000000,
+    0x7b71b9f700000000, 0xa79d08ca00000000, 0xc3a8da8c00000000,
+    0x1f446bb100000000, 0xaaa2803700000000, 0x764e310a00000000,
+    0x127be34c00000000, 0xce97527100000000, 0xda1147c100000000,
+    0x06fdf6fc00000000, 0x62c824ba00000000, 0xbe24958700000000,
+    0xcd817bb700000000, 0x116dca8a00000000, 0x755818cc00000000,
+    0xa9b4a9f100000000, 0xbd32bc4100000000, 0x61de0d7c00000000,
+    0x05ebdf3a00000000, 0xd9076e0700000000, 0x6ce1858100000000,
+    0xb00d34bc00000000, 0xd438e6fa00000000, 0x08d457c700000000,
+    0x1c52427700000000, 0xc0bef34a00000000, 0xa48b210c00000000,
+    0x7867903100000000, 0x8f4087da00000000, 0x53ac36e700000000,
+    0x3799e4a100000000, 0xeb75559c00000000, 0xfff3402c00000000,
+    0x231ff11100000000, 0x472a235700000000, 0x9bc6926a00000000,
+    0x2e2079ec00000000, 0xf2ccc8d100000000, 0x96f91a9700000000,
+    0x4a15abaa00000000, 0x5e93be1a00000000, 0x827f0f2700000000,
+    0xe64add6100000000, 0x3aa66c5c00000000, 0x920604d900000000,
+    0x4eeab5e400000000, 0x2adf67a200000000, 0xf633d69f00000000,
+    0xe2b5c32f00000000, 0x3e59721200000000, 0x5a6ca05400000000,
+    0x8680116900000000, 0x3366faef00000000, 0xef8a4bd200000000,
+    0x8bbf999400000000, 0x575328a900000000, 0x43d53d1900000000,
+    0x9f398c2400000000, 0xfb0c5e6200000000, 0x27e0ef5f00000000,
+    0xd0c7f8b400000000, 0x0c2b498900000000, 0x681e9bcf00000000,
+    0xb4f22af200000000, 0xa0743f4200000000, 0x7c988e7f00000000,
+    0x18ad5c3900000000, 0xc441ed0400000000, 0x71a7068200000000,
+    0xad4bb7bf00000000, 0xc97e65f900000000, 0x1592d4c400000000,
+    0x0114c17400000000, 0xddf8704900000000, 0xb9cda20f00000000,
+    0x6521133200000000, 0x1684fd0200000000, 0xca684c3f00000000,
+    0xae5d9e7900000000, 0x72b12f4400000000, 0x66373af400000000,
+    0xbadb8bc900000000, 0xdeee598f00000000, 0x0202e8b200000000,
+    0xb7e4033400000000, 0x6b08b20900000000, 0x0f3d604f00000000,
+    0xd3d1d17200000000, 0xc757c4c200000000, 0x1bbb75ff00000000,
+    0x7f8ea7b900000000, 0xa362168400000000, 0x5445016f00000000,
+    0x88a9b05200000000, 0xec9c621400000000, 0x3070d32900000000,
+    0x24f6c69900000000, 0xf81a77a400000000, 0x9c2fa5e200000000,
+    0x40c314df00000000, 0xf525ff5900000000, 0x29c94e6400000000,
+    0x4dfc9c2200000000, 0x91102d1f00000000, 0x859638af00000000,
+    0x597a899200000000, 0x3d4f5bd400000000, 0xe1a3eae900000000,
+    0xdb0586b500000000, 0x07e9378800000000, 0x63dce5ce00000000,
+    0xbf3054f300000000, 0xabb6414300000000, 0x775af07e00000000,
+    0x136f223800000000, 0xcf83930500000000, 0x7a65788300000000,
+    0xa689c9be00000000, 0xc2bc1bf800000000, 0x1e50aac500000000,
+    0x0ad6bf7500000000, 0xd63a0e4800000000, 0xb20fdc0e00000000,
+    0x6ee36d3300000000, 0x99c47ad800000000, 0x4528cbe500000000,
+    0x211d19a300000000, 0xfdf1a89e00000000, 0xe977bd2e00000000,
+    0x359b0c1300000000, 0x51aede5500000000, 0x8d426f6800000000,
+    0x38a484ee00000000, 0xe44835d300000000, 0x807de79500000000,
+    0x5c9156a800000000, 0x4817431800000000, 0x94fbf22500000000,
+    0xf0ce206300000000, 0x2c22915e00000000, 0x5f877f6e00000000,
+    0x836bce5300000000, 0xe75e1c1500000000, 0x3bb2ad2800000000,
+    0x2f34b89800000000, 0xf3d809a500000000, 0x97eddbe300000000,
+    0x4b016ade00000000, 0xfee7815800000000, 0x220b306500000000,
+    0x463ee22300000000, 0x9ad2531e00000000, 0x8e5446ae00000000,
+    0x52b8f79300000000, 0x368d25d500000000, 0xea6194e800000000,
+    0x1d46830300000000, 0xc1aa323e00000000, 0xa59fe07800000000,
+    0x7973514500000000, 0x6df544f500000000, 0xb119f5c800000000,
+    0xd52c278e00000000, 0x09c096b300000000, 0xbc267d3500000000,
+    0x60cacc0800000000, 0x04ff1e4e00000000, 0xd813af7300000000,
+    0xcc95bac300000000, 0x10790bfe00000000, 0x744cd9b800000000,
+    0xa8a0688500000000}};
+
+#else /* W == 4 */
+
+static const uint32_t crc_braid_table[][256] = {
+   {0x00000000, 0x81256527, 0xd93bcc0f, 0x581ea928, 0x69069e5f,
+    0xe823fb78, 0xb03d5250, 0x31183777, 0xd20d3cbe, 0x53285999,
+    0x0b36f0b1, 0x8a139596, 0xbb0ba2e1, 0x3a2ec7c6, 0x62306eee,
+    0xe3150bc9, 0x7f6b7f3d, 0xfe4e1a1a, 0xa650b332, 0x2775d615,
+    0x166de162, 0x97488445, 0xcf562d6d, 0x4e73484a, 0xad664383,
+    0x2c4326a4, 0x745d8f8c, 0xf578eaab, 0xc460dddc, 0x4545b8fb,
+    0x1d5b11d3, 0x9c7e74f4, 0xfed6fe7a, 0x7ff39b5d, 0x27ed3275,
+    0xa6c85752, 0x97d06025, 0x16f50502, 0x4eebac2a, 0xcfcec90d,
+    0x2cdbc2c4, 0xadfea7e3, 0xf5e00ecb, 0x74c56bec, 0x45dd5c9b,
+    0xc4f839bc, 0x9ce69094, 0x1dc3f5b3, 0x81bd8147, 0x0098e460,
+    0x58864d48, 0xd9a3286f, 0xe8bb1f18, 0x699e7a3f, 0x3180d317,
+    0xb0a5b630, 0x53b0bdf9, 0xd295d8de, 0x8a8b71f6, 0x0bae14d1,
+    0x3ab623a6, 0xbb934681, 0xe38defa9, 0x62a88a8e, 0x26dcfab5,
+    0xa7f99f92, 0xffe736ba, 0x7ec2539d, 0x4fda64ea, 0xceff01cd,
+    0x96e1a8e5, 0x17c4cdc2, 0xf4d1c60b, 0x75f4a32c, 0x2dea0a04,
+    0xaccf6f23, 0x9dd75854, 0x1cf23d73, 0x44ec945b, 0xc5c9f17c,
+    0x59b78588, 0xd892e0af, 0x808c4987, 0x01a92ca0, 0x30b11bd7,
+    0xb1947ef0, 0xe98ad7d8, 0x68afb2ff, 0x8bbab936, 0x0a9fdc11,
+    0x52817539, 0xd3a4101e, 0xe2bc2769, 0x6399424e, 0x3b87eb66,
+    0xbaa28e41, 0xd80a04cf, 0x592f61e8, 0x0131c8c0, 0x8014ade7,
+    0xb10c9a90, 0x3029ffb7, 0x6837569f, 0xe91233b8, 0x0a073871,
+    0x8b225d56, 0xd33cf47e, 0x52199159, 0x6301a62e, 0xe224c309,
+    0xba3a6a21, 0x3b1f0f06, 0xa7617bf2, 0x26441ed5, 0x7e5ab7fd,
+    0xff7fd2da, 0xce67e5ad, 0x4f42808a, 0x175c29a2, 0x96794c85,
+    0x756c474c, 0xf449226b, 0xac578b43, 0x2d72ee64, 0x1c6ad913,
+    0x9d4fbc34, 0xc551151c, 0x4474703b, 0x4db9f56a, 0xcc9c904d,
+    0x94823965, 0x15a75c42, 0x24bf6b35, 0xa59a0e12, 0xfd84a73a,
+    0x7ca1c21d, 0x9fb4c9d4, 0x1e91acf3, 0x468f05db, 0xc7aa60fc,
+    0xf6b2578b, 0x779732ac, 0x2f899b84, 0xaeacfea3, 0x32d28a57,
+    0xb3f7ef70, 0xebe94658, 0x6acc237f, 0x5bd41408, 0xdaf1712f,
+    0x82efd807, 0x03cabd20, 0xe0dfb6e9, 0x61fad3ce, 0x39e47ae6,
+    0xb8c11fc1, 0x89d928b6, 0x08fc4d91, 0x50e2e4b9, 0xd1c7819e,
+    0xb36f0b10, 0x324a6e37, 0x6a54c71f, 0xeb71a238, 0xda69954f,
+    0x5b4cf068, 0x03525940, 0x82773c67, 0x616237ae, 0xe0475289,
+    0xb859fba1, 0x397c9e86, 0x0864a9f1, 0x8941ccd6, 0xd15f65fe,
+    0x507a00d9, 0xcc04742d, 0x4d21110a, 0x153fb822, 0x941add05,
+    0xa502ea72, 0x24278f55, 0x7c39267d, 0xfd1c435a, 0x1e094893,
+    0x9f2c2db4, 0xc732849c, 0x4617e1bb, 0x770fd6cc, 0xf62ab3eb,
+    0xae341ac3, 0x2f117fe4, 0x6b650fdf, 0xea406af8, 0xb25ec3d0,
+    0x337ba6f7, 0x02639180, 0x8346f4a7, 0xdb585d8f, 0x5a7d38a8,
+    0xb9683361, 0x384d5646, 0x6053ff6e, 0xe1769a49, 0xd06ead3e,
+    0x514bc819, 0x09556131, 0x88700416, 0x140e70e2, 0x952b15c5,
+    0xcd35bced, 0x4c10d9ca, 0x7d08eebd, 0xfc2d8b9a, 0xa43322b2,
+    0x25164795, 0xc6034c5c, 0x4726297b, 0x1f388053, 0x9e1de574,
+    0xaf05d203, 0x2e20b724, 0x763e1e0c, 0xf71b7b2b, 0x95b3f1a5,
+    0x14969482, 0x4c883daa, 0xcdad588d, 0xfcb56ffa, 0x7d900add,
+    0x258ea3f5, 0xa4abc6d2, 0x47becd1b, 0xc69ba83c, 0x9e850114,
+    0x1fa06433, 0x2eb85344, 0xaf9d3663, 0xf7839f4b, 0x76a6fa6c,
+    0xead88e98, 0x6bfdebbf, 0x33e34297, 0xb2c627b0, 0x83de10c7,
+    0x02fb75e0, 0x5ae5dcc8, 0xdbc0b9ef, 0x38d5b226, 0xb9f0d701,
+    0xe1ee7e29, 0x60cb1b0e, 0x51d32c79, 0xd0f6495e, 0x88e8e076,
+    0x09cd8551},
+   {0x00000000, 0x9b73ead4, 0xed96d3e9, 0x76e5393d, 0x005ca193,
+    0x9b2f4b47, 0xedca727a, 0x76b998ae, 0x00b94326, 0x9bcaa9f2,
+    0xed2f90cf, 0x765c7a1b, 0x00e5e2b5, 0x9b960861, 0xed73315c,
+    0x7600db88, 0x0172864c, 0x9a016c98, 0xece455a5, 0x7797bf71,
+    0x012e27df, 0x9a5dcd0b, 0xecb8f436, 0x77cb1ee2, 0x01cbc56a,
+    0x9ab82fbe, 0xec5d1683, 0x772efc57, 0x019764f9, 0x9ae48e2d,
+    0xec01b710, 0x77725dc4, 0x02e50c98, 0x9996e64c, 0xef73df71,
+    0x740035a5, 0x02b9ad0b, 0x99ca47df, 0xef2f7ee2, 0x745c9436,
+    0x025c4fbe, 0x992fa56a, 0xefca9c57, 0x74b97683, 0x0200ee2d,
+    0x997304f9, 0xef963dc4, 0x74e5d710, 0x03978ad4, 0x98e46000,
+    0xee01593d, 0x7572b3e9, 0x03cb2b47, 0x98b8c193, 0xee5df8ae,
+    0x752e127a, 0x032ec9f2, 0x985d2326, 0xeeb81a1b, 0x75cbf0cf,
+    0x03726861, 0x980182b5, 0xeee4bb88, 0x7597515c, 0x05ca1930,
+    0x9eb9f3e4, 0xe85ccad9, 0x732f200d, 0x0596b8a3, 0x9ee55277,
+    0xe8006b4a, 0x7373819e, 0x05735a16, 0x9e00b0c2, 0xe8e589ff,
+    0x7396632b, 0x052ffb85, 0x9e5c1151, 0xe8b9286c, 0x73cac2b8,
+    0x04b89f7c, 0x9fcb75a8, 0xe92e4c95, 0x725da641, 0x04e43eef,
+    0x9f97d43b, 0xe972ed06, 0x720107d2, 0x0401dc5a, 0x9f72368e,
+    0xe9970fb3, 0x72e4e567, 0x045d7dc9, 0x9f2e971d, 0xe9cbae20,
+    0x72b844f4, 0x072f15a8, 0x9c5cff7c, 0xeab9c641, 0x71ca2c95,
+    0x0773b43b, 0x9c005eef, 0xeae567d2, 0x71968d06, 0x0796568e,
+    0x9ce5bc5a, 0xea008567, 0x71736fb3, 0x07caf71d, 0x9cb91dc9,
+    0xea5c24f4, 0x712fce20, 0x065d93e4, 0x9d2e7930, 0xebcb400d,
+    0x70b8aad9, 0x06013277, 0x9d72d8a3, 0xeb97e19e, 0x70e40b4a,
+    0x06e4d0c2, 0x9d973a16, 0xeb72032b, 0x7001e9ff, 0x06b87151,
+    0x9dcb9b85, 0xeb2ea2b8, 0x705d486c, 0x0b943260, 0x90e7d8b4,
+    0xe602e189, 0x7d710b5d, 0x0bc893f3, 0x90bb7927, 0xe65e401a,
+    0x7d2daace, 0x0b2d7146, 0x905e9b92, 0xe6bba2af, 0x7dc8487b,
+    0x0b71d0d5, 0x90023a01, 0xe6e7033c, 0x7d94e9e8, 0x0ae6b42c,
+    0x91955ef8, 0xe77067c5, 0x7c038d11, 0x0aba15bf, 0x91c9ff6b,
+    0xe72cc656, 0x7c5f2c82, 0x0a5ff70a, 0x912c1dde, 0xe7c924e3,
+    0x7cbace37, 0x0a035699, 0x9170bc4d, 0xe7958570, 0x7ce66fa4,
+    0x09713ef8, 0x9202d42c, 0xe4e7ed11, 0x7f9407c5, 0x092d9f6b,
+    0x925e75bf, 0xe4bb4c82, 0x7fc8a656, 0x09c87dde, 0x92bb970a,
+    0xe45eae37, 0x7f2d44e3, 0x0994dc4d, 0x92e73699, 0xe4020fa4,
+    0x7f71e570, 0x0803b8b4, 0x93705260, 0xe5956b5d, 0x7ee68189,
+    0x085f1927, 0x932cf3f3, 0xe5c9cace, 0x7eba201a, 0x08bafb92,
+    0x93c91146, 0xe52c287b, 0x7e5fc2af, 0x08e65a01, 0x9395b0d5,
+    0xe57089e8, 0x7e03633c, 0x0e5e2b50, 0x952dc184, 0xe3c8f8b9,
+    0x78bb126d, 0x0e028ac3, 0x95716017, 0xe394592a, 0x78e7b3fe,
+    0x0ee76876, 0x959482a2, 0xe371bb9f, 0x7802514b, 0x0ebbc9e5,
+    0x95c82331, 0xe32d1a0c, 0x785ef0d8, 0x0f2cad1c, 0x945f47c8,
+    0xe2ba7ef5, 0x79c99421, 0x0f700c8f, 0x9403e65b, 0xe2e6df66,
+    0x799535b2, 0x0f95ee3a, 0x94e604ee, 0xe2033dd3, 0x7970d707,
+    0x0fc94fa9, 0x94baa57d, 0xe25f9c40, 0x792c7694, 0x0cbb27c8,
+    0x97c8cd1c, 0xe12df421, 0x7a5e1ef5, 0x0ce7865b, 0x97946c8f,
+    0xe17155b2, 0x7a02bf66, 0x0c0264ee, 0x97718e3a, 0xe194b707,
+    0x7ae75dd3, 0x0c5ec57d, 0x972d2fa9, 0xe1c81694, 0x7abbfc40,
+    0x0dc9a184, 0x96ba4b50, 0xe05f726d, 0x7b2c98b9, 0x0d950017,
+    0x96e6eac3, 0xe003d3fe, 0x7b70392a, 0x0d70e2a2, 0x96030876,
+    0xe0e6314b, 0x7b95db9f, 0x0d2c4331, 0x965fa9e5, 0xe0ba90d8,
+    0x7bc97a0c},
+   {0x00000000, 0x172864c0, 0x2e50c980, 0x3978ad40, 0x5ca19300,
+    0x4b89f7c0, 0x72f15a80, 0x65d93e40, 0xb9432600, 0xae6b42c0,
+    0x9713ef80, 0x803b8b40, 0xe5e2b500, 0xf2cad1c0, 0xcbb27c80,
+    0xdc9a1840, 0xa9f74a41, 0xbedf2e81, 0x87a783c1, 0x908fe701,
+    0xf556d941, 0xe27ebd81, 0xdb0610c1, 0xcc2e7401, 0x10b46c41,
+    0x079c0881, 0x3ee4a5c1, 0x29ccc101, 0x4c15ff41, 0x5b3d9b81,
+    0x624536c1, 0x756d5201, 0x889f92c3, 0x9fb7f603, 0xa6cf5b43,
+    0xb1e73f83, 0xd43e01c3, 0xc3166503, 0xfa6ec843, 0xed46ac83,
+    0x31dcb4c3, 0x26f4d003, 0x1f8c7d43, 0x08a41983, 0x6d7d27c3,
+    0x7a554303, 0x432dee43, 0x54058a83, 0x2168d882, 0x3640bc42,
+    0x0f381102, 0x181075c2, 0x7dc94b82, 0x6ae12f42, 0x53998202,
+    0x44b1e6c2, 0x982bfe82, 0x8f039a42, 0xb67b3702, 0xa15353c2,
+    0xc48a6d82, 0xd3a20942, 0xeadaa402, 0xfdf2c0c2, 0xca4e23c7,
+    0xdd664707, 0xe41eea47, 0xf3368e87, 0x96efb0c7, 0x81c7d407,
+    0xb8bf7947, 0xaf971d87, 0x730d05c7, 0x64256107, 0x5d5dcc47,
+    0x4a75a887, 0x2fac96c7, 0x3884f207, 0x01fc5f47, 0x16d43b87,
+    0x63b96986, 0x74910d46, 0x4de9a006, 0x5ac1c4c6, 0x3f18fa86,
+    0x28309e46, 0x11483306, 0x066057c6, 0xdafa4f86, 0xcdd22b46,
+    0xf4aa8606, 0xe382e2c6, 0x865bdc86, 0x9173b846, 0xa80b1506,
+    0xbf2371c6, 0x42d1b104, 0x55f9d5c4, 0x6c817884, 0x7ba91c44,
+    0x1e702204, 0x095846c4, 0x3020eb84, 0x27088f44, 0xfb929704,
+    0xecbaf3c4, 0xd5c25e84, 0xc2ea3a44, 0xa7330404, 0xb01b60c4,
+    0x8963cd84, 0x9e4ba944, 0xeb26fb45, 0xfc0e9f85, 0xc57632c5,
+    0xd25e5605, 0xb7876845, 0xa0af0c85, 0x99d7a1c5, 0x8effc505,
+    0x5265dd45, 0x454db985, 0x7c3514c5, 0x6b1d7005, 0x0ec44e45,
+    0x19ec2a85, 0x209487c5, 0x37bce305, 0x4fed41cf, 0x58c5250f,
+    0x61bd884f, 0x7695ec8f, 0x134cd2cf, 0x0464b60f, 0x3d1c1b4f,
+    0x2a347f8f, 0xf6ae67cf, 0xe186030f, 0xd8feae4f, 0xcfd6ca8f,
+    0xaa0ff4cf, 0xbd27900f, 0x845f3d4f, 0x9377598f, 0xe61a0b8e,
+    0xf1326f4e, 0xc84ac20e, 0xdf62a6ce, 0xbabb988e, 0xad93fc4e,
+    0x94eb510e, 0x83c335ce, 0x5f592d8e, 0x4871494e, 0x7109e40e,
+    0x662180ce, 0x03f8be8e, 0x14d0da4e, 0x2da8770e, 0x3a8013ce,
+    0xc772d30c, 0xd05ab7cc, 0xe9221a8c, 0xfe0a7e4c, 0x9bd3400c,
+    0x8cfb24cc, 0xb583898c, 0xa2abed4c, 0x7e31f50c, 0x691991cc,
+    0x50613c8c, 0x4749584c, 0x2290660c, 0x35b802cc, 0x0cc0af8c,
+    0x1be8cb4c, 0x6e85994d, 0x79adfd8d, 0x40d550cd, 0x57fd340d,
+    0x32240a4d, 0x250c6e8d, 0x1c74c3cd, 0x0b5ca70d, 0xd7c6bf4d,
+    0xc0eedb8d, 0xf99676cd, 0xeebe120d, 0x8b672c4d, 0x9c4f488d,
+    0xa537e5cd, 0xb21f810d, 0x85a36208, 0x928b06c8, 0xabf3ab88,
+    0xbcdbcf48, 0xd902f108, 0xce2a95c8, 0xf7523888, 0xe07a5c48,
+    0x3ce04408, 0x2bc820c8, 0x12b08d88, 0x0598e948, 0x6041d708,
+    0x7769b3c8, 0x4e111e88, 0x59397a48, 0x2c542849, 0x3b7c4c89,
+    0x0204e1c9, 0x152c8509, 0x70f5bb49, 0x67dddf89, 0x5ea572c9,
+    0x498d1609, 0x95170e49, 0x823f6a89, 0xbb47c7c9, 0xac6fa309,
+    0xc9b69d49, 0xde9ef989, 0xe7e654c9, 0xf0ce3009, 0x0d3cf0cb,
+    0x1a14940b, 0x236c394b, 0x34445d8b, 0x519d63cb, 0x46b5070b,
+    0x7fcdaa4b, 0x68e5ce8b, 0xb47fd6cb, 0xa357b20b, 0x9a2f1f4b,
+    0x8d077b8b, 0xe8de45cb, 0xfff6210b, 0xc68e8c4b, 0xd1a6e88b,
+    0xa4cbba8a, 0xb3e3de4a, 0x8a9b730a, 0x9db317ca, 0xf86a298a,
+    0xef424d4a, 0xd63ae00a, 0xc11284ca, 0x1d889c8a, 0x0aa0f84a,
+    0x33d8550a, 0x24f031ca, 0x41290f8a, 0x56016b4a, 0x6f79c60a,
+    0x7851a2ca},
+   {0x00000000, 0x9fda839e, 0xe4c4017d, 0x7b1e82e3, 0x12f904bb,
+    0x8d238725, 0xf63d05c6, 0x69e78658, 0x25f20976, 0xba288ae8,
+    0xc136080b, 0x5eec8b95, 0x370b0dcd, 0xa8d18e53, 0xd3cf0cb0,
+    0x4c158f2e, 0x4be412ec, 0xd43e9172, 0xaf201391, 0x30fa900f,
+    0x591d1657, 0xc6c795c9, 0xbdd9172a, 0x220394b4, 0x6e161b9a,
+    0xf1cc9804, 0x8ad21ae7, 0x15089979, 0x7cef1f21, 0xe3359cbf,
+    0x982b1e5c, 0x07f19dc2, 0x97c825d8, 0x0812a646, 0x730c24a5,
+    0xecd6a73b, 0x85312163, 0x1aeba2fd, 0x61f5201e, 0xfe2fa380,
+    0xb23a2cae, 0x2de0af30, 0x56fe2dd3, 0xc924ae4d, 0xa0c32815,
+    0x3f19ab8b, 0x44072968, 0xdbddaaf6, 0xdc2c3734, 0x43f6b4aa,
+    0x38e83649, 0xa732b5d7, 0xced5338f, 0x510fb011, 0x2a1132f2,
+    0xb5cbb16c, 0xf9de3e42, 0x6604bddc, 0x1d1a3f3f, 0x82c0bca1,
+    0xeb273af9, 0x74fdb967, 0x0fe33b84, 0x9039b81a, 0xf4e14df1,
+    0x6b3bce6f, 0x10254c8c, 0x8fffcf12, 0xe618494a, 0x79c2cad4,
+    0x02dc4837, 0x9d06cba9, 0xd1134487, 0x4ec9c719, 0x35d745fa,
+    0xaa0dc664, 0xc3ea403c, 0x5c30c3a2, 0x272e4141, 0xb8f4c2df,
+    0xbf055f1d, 0x20dfdc83, 0x5bc15e60, 0xc41bddfe, 0xadfc5ba6,
+    0x3226d838, 0x49385adb, 0xd6e2d945, 0x9af7566b, 0x052dd5f5,
+    0x7e335716, 0xe1e9d488, 0x880e52d0, 0x17d4d14e, 0x6cca53ad,
+    0xf310d033, 0x63296829, 0xfcf3ebb7, 0x87ed6954, 0x1837eaca,
+    0x71d06c92, 0xee0aef0c, 0x95146def, 0x0aceee71, 0x46db615f,
+    0xd901e2c1, 0xa21f6022, 0x3dc5e3bc, 0x542265e4, 0xcbf8e67a,
+    0xb0e66499, 0x2f3ce707, 0x28cd7ac5, 0xb717f95b, 0xcc097bb8,
+    0x53d3f826, 0x3a347e7e, 0xa5eefde0, 0xdef07f03, 0x412afc9d,
+    0x0d3f73b3, 0x92e5f02d, 0xe9fb72ce, 0x7621f150, 0x1fc67708,
+    0x801cf496, 0xfb027675, 0x64d8f5eb, 0x32b39da3, 0xad691e3d,
+    0xd6779cde, 0x49ad1f40, 0x204a9918, 0xbf901a86, 0xc48e9865,
+    0x5b541bfb, 0x174194d5, 0x889b174b, 0xf38595a8, 0x6c5f1636,
+    0x05b8906e, 0x9a6213f0, 0xe17c9113, 0x7ea6128d, 0x79578f4f,
+    0xe68d0cd1, 0x9d938e32, 0x02490dac, 0x6bae8bf4, 0xf474086a,
+    0x8f6a8a89, 0x10b00917, 0x5ca58639, 0xc37f05a7, 0xb8618744,
+    0x27bb04da, 0x4e5c8282, 0xd186011c, 0xaa9883ff, 0x35420061,
+    0xa57bb87b, 0x3aa13be5, 0x41bfb906, 0xde653a98, 0xb782bcc0,
+    0x28583f5e, 0x5346bdbd, 0xcc9c3e23, 0x8089b10d, 0x1f533293,
+    0x644db070, 0xfb9733ee, 0x9270b5b6, 0x0daa3628, 0x76b4b4cb,
+    0xe96e3755, 0xee9faa97, 0x71452909, 0x0a5babea, 0x95812874,
+    0xfc66ae2c, 0x63bc2db2, 0x18a2af51, 0x87782ccf, 0xcb6da3e1,
+    0x54b7207f, 0x2fa9a29c, 0xb0732102, 0xd994a75a, 0x464e24c4,
+    0x3d50a627, 0xa28a25b9, 0xc652d052, 0x598853cc, 0x2296d12f,
+    0xbd4c52b1, 0xd4abd4e9, 0x4b715777, 0x306fd594, 0xafb5560a,
+    0xe3a0d924, 0x7c7a5aba, 0x0764d859, 0x98be5bc7, 0xf159dd9f,
+    0x6e835e01, 0x159ddce2, 0x8a475f7c, 0x8db6c2be, 0x126c4120,
+    0x6972c3c3, 0xf6a8405d, 0x9f4fc605, 0x0095459b, 0x7b8bc778,
+    0xe45144e6, 0xa844cbc8, 0x379e4856, 0x4c80cab5, 0xd35a492b,
+    0xbabdcf73, 0x25674ced, 0x5e79ce0e, 0xc1a34d90, 0x519af58a,
+    0xce407614, 0xb55ef4f7, 0x2a847769, 0x4363f131, 0xdcb972af,
+    0xa7a7f04c, 0x387d73d2, 0x7468fcfc, 0xebb27f62, 0x90acfd81,
+    0x0f767e1f, 0x6691f847, 0xf94b7bd9, 0x8255f93a, 0x1d8f7aa4,
+    0x1a7ee766, 0x85a464f8, 0xfebae61b, 0x61606585, 0x0887e3dd,
+    0x975d6043, 0xec43e2a0, 0x7399613e, 0x3f8cee10, 0xa0566d8e,
+    0xdb48ef6d, 0x44926cf3, 0x2d75eaab, 0xb2af6935, 0xc9b1ebd6,
+    0x566b6848}};
+
+static const z_word_t crc_braid_big_table[][256] = {
+   {0x00000000, 0x9e83da9f, 0x7d01c4e4, 0xe3821e7b, 0xbb04f912,
+    0x2587238d, 0xc6053df6, 0x5886e769, 0x7609f225, 0xe88a28ba,
+    0x0b0836c1, 0x958bec5e, 0xcd0d0b37, 0x538ed1a8, 0xb00ccfd3,
+    0x2e8f154c, 0xec12e44b, 0x72913ed4, 0x911320af, 0x0f90fa30,
+    0x57161d59, 0xc995c7c6, 0x2a17d9bd, 0xb4940322, 0x9a1b166e,
+    0x0498ccf1, 0xe71ad28a, 0x79990815, 0x211fef7c, 0xbf9c35e3,
+    0x5c1e2b98, 0xc29df107, 0xd825c897, 0x46a61208, 0xa5240c73,
+    0x3ba7d6ec, 0x63213185, 0xfda2eb1a, 0x1e20f561, 0x80a32ffe,
+    0xae2c3ab2, 0x30afe02d, 0xd32dfe56, 0x4dae24c9, 0x1528c3a0,
+    0x8bab193f, 0x68290744, 0xf6aadddb, 0x34372cdc, 0xaab4f643,
+    0x4936e838, 0xd7b532a7, 0x8f33d5ce, 0x11b00f51, 0xf232112a,
+    0x6cb1cbb5, 0x423edef9, 0xdcbd0466, 0x3f3f1a1d, 0xa1bcc082,
+    0xf93a27eb, 0x67b9fd74, 0x843be30f, 0x1ab83990, 0xf14de1f4,
+    0x6fce3b6b, 0x8c4c2510, 0x12cfff8f, 0x4a4918e6, 0xd4cac279,
+    0x3748dc02, 0xa9cb069d, 0x874413d1, 0x19c7c94e, 0xfa45d735,
+    0x64c60daa, 0x3c40eac3, 0xa2c3305c, 0x41412e27, 0xdfc2f4b8,
+    0x1d5f05bf, 0x83dcdf20, 0x605ec15b, 0xfedd1bc4, 0xa65bfcad,
+    0x38d82632, 0xdb5a3849, 0x45d9e2d6, 0x6b56f79a, 0xf5d52d05,
+    0x1657337e, 0x88d4e9e1, 0xd0520e88, 0x4ed1d417, 0xad53ca6c,
+    0x33d010f3, 0x29682963, 0xb7ebf3fc, 0x5469ed87, 0xcaea3718,
+    0x926cd071, 0x0cef0aee, 0xef6d1495, 0x71eece0a, 0x5f61db46,
+    0xc1e201d9, 0x22601fa2, 0xbce3c53d, 0xe4652254, 0x7ae6f8cb,
+    0x9964e6b0, 0x07e73c2f, 0xc57acd28, 0x5bf917b7, 0xb87b09cc,
+    0x26f8d353, 0x7e7e343a, 0xe0fdeea5, 0x037ff0de, 0x9dfc2a41,
+    0xb3733f0d, 0x2df0e592, 0xce72fbe9, 0x50f12176, 0x0877c61f,
+    0x96f41c80, 0x757602fb, 0xebf5d864, 0xa39db332, 0x3d1e69ad,
+    0xde9c77d6, 0x401fad49, 0x18994a20, 0x861a90bf, 0x65988ec4,
+    0xfb1b545b, 0xd5944117, 0x4b179b88, 0xa89585f3, 0x36165f6c,
+    0x6e90b805, 0xf013629a, 0x13917ce1, 0x8d12a67e, 0x4f8f5779,
+    0xd10c8de6, 0x328e939d, 0xac0d4902, 0xf48bae6b, 0x6a0874f4,
+    0x898a6a8f, 0x1709b010, 0x3986a55c, 0xa7057fc3, 0x448761b8,
+    0xda04bb27, 0x82825c4e, 0x1c0186d1, 0xff8398aa, 0x61004235,
+    0x7bb87ba5, 0xe53ba13a, 0x06b9bf41, 0x983a65de, 0xc0bc82b7,
+    0x5e3f5828, 0xbdbd4653, 0x233e9ccc, 0x0db18980, 0x9332531f,
+    0x70b04d64, 0xee3397fb, 0xb6b57092, 0x2836aa0d, 0xcbb4b476,
+    0x55376ee9, 0x97aa9fee, 0x09294571, 0xeaab5b0a, 0x74288195,
+    0x2cae66fc, 0xb22dbc63, 0x51afa218, 0xcf2c7887, 0xe1a36dcb,
+    0x7f20b754, 0x9ca2a92f, 0x022173b0, 0x5aa794d9, 0xc4244e46,
+    0x27a6503d, 0xb9258aa2, 0x52d052c6, 0xcc538859, 0x2fd19622,
+    0xb1524cbd, 0xe9d4abd4, 0x7757714b, 0x94d56f30, 0x0a56b5af,
+    0x24d9a0e3, 0xba5a7a7c, 0x59d86407, 0xc75bbe98, 0x9fdd59f1,
+    0x015e836e, 0xe2dc9d15, 0x7c5f478a, 0xbec2b68d, 0x20416c12,
+    0xc3c37269, 0x5d40a8f6, 0x05c64f9f, 0x9b459500, 0x78c78b7b,
+    0xe64451e4, 0xc8cb44a8, 0x56489e37, 0xb5ca804c, 0x2b495ad3,
+    0x73cfbdba, 0xed4c6725, 0x0ece795e, 0x904da3c1, 0x8af59a51,
+    0x147640ce, 0xf7f45eb5, 0x6977842a, 0x31f16343, 0xaf72b9dc,
+    0x4cf0a7a7, 0xd2737d38, 0xfcfc6874, 0x627fb2eb, 0x81fdac90,
+    0x1f7e760f, 0x47f89166, 0xd97b4bf9, 0x3af95582, 0xa47a8f1d,
+    0x66e77e1a, 0xf864a485, 0x1be6bafe, 0x85656061, 0xdde38708,
+    0x43605d97, 0xa0e243ec, 0x3e619973, 0x10ee8c3f, 0x8e6d56a0,
+    0x6def48db, 0xf36c9244, 0xabea752d, 0x3569afb2, 0xd6ebb1c9,
+    0x48686b56},
+   {0x00000000, 0xc0642817, 0x80c9502e, 0x40ad7839, 0x0093a15c,
+    0xc0f7894b, 0x805af172, 0x403ed965, 0x002643b9, 0xc0426bae,
+    0x80ef1397, 0x408b3b80, 0x00b5e2e5, 0xc0d1caf2, 0x807cb2cb,
+    0x40189adc, 0x414af7a9, 0x812edfbe, 0xc183a787, 0x01e78f90,
+    0x41d956f5, 0x81bd7ee2, 0xc11006db, 0x01742ecc, 0x416cb410,
+    0x81089c07, 0xc1a5e43e, 0x01c1cc29, 0x41ff154c, 0x819b3d5b,
+    0xc1364562, 0x01526d75, 0xc3929f88, 0x03f6b79f, 0x435bcfa6,
+    0x833fe7b1, 0xc3013ed4, 0x036516c3, 0x43c86efa, 0x83ac46ed,
+    0xc3b4dc31, 0x03d0f426, 0x437d8c1f, 0x8319a408, 0xc3277d6d,
+    0x0343557a, 0x43ee2d43, 0x838a0554, 0x82d86821, 0x42bc4036,
+    0x0211380f, 0xc2751018, 0x824bc97d, 0x422fe16a, 0x02829953,
+    0xc2e6b144, 0x82fe2b98, 0x429a038f, 0x02377bb6, 0xc25353a1,
+    0x826d8ac4, 0x4209a2d3, 0x02a4daea, 0xc2c0f2fd, 0xc7234eca,
+    0x074766dd, 0x47ea1ee4, 0x878e36f3, 0xc7b0ef96, 0x07d4c781,
+    0x4779bfb8, 0x871d97af, 0xc7050d73, 0x07612564, 0x47cc5d5d,
+    0x87a8754a, 0xc796ac2f, 0x07f28438, 0x475ffc01, 0x873bd416,
+    0x8669b963, 0x460d9174, 0x06a0e94d, 0xc6c4c15a, 0x86fa183f,
+    0x469e3028, 0x06334811, 0xc6576006, 0x864ffada, 0x462bd2cd,
+    0x0686aaf4, 0xc6e282e3, 0x86dc5b86, 0x46b87391, 0x06150ba8,
+    0xc67123bf, 0x04b1d142, 0xc4d5f955, 0x8478816c, 0x441ca97b,
+    0x0422701e, 0xc4465809, 0x84eb2030, 0x448f0827, 0x049792fb,
+    0xc4f3baec, 0x845ec2d5, 0x443aeac2, 0x040433a7, 0xc4601bb0,
+    0x84cd6389, 0x44a94b9e, 0x45fb26eb, 0x859f0efc, 0xc53276c5,
+    0x05565ed2, 0x456887b7, 0x850cafa0, 0xc5a1d799, 0x05c5ff8e,
+    0x45dd6552, 0x85b94d45, 0xc514357c, 0x05701d6b, 0x454ec40e,
+    0x852aec19, 0xc5879420, 0x05e3bc37, 0xcf41ed4f, 0x0f25c558,
+    0x4f88bd61, 0x8fec9576, 0xcfd24c13, 0x0fb66404, 0x4f1b1c3d,
+    0x8f7f342a, 0xcf67aef6, 0x0f0386e1, 0x4faefed8, 0x8fcad6cf,
+    0xcff40faa, 0x0f9027bd, 0x4f3d5f84, 0x8f597793, 0x8e0b1ae6,
+    0x4e6f32f1, 0x0ec24ac8, 0xcea662df, 0x8e98bbba, 0x4efc93ad,
+    0x0e51eb94, 0xce35c383, 0x8e2d595f, 0x4e497148, 0x0ee40971,
+    0xce802166, 0x8ebef803, 0x4edad014, 0x0e77a82d, 0xce13803a,
+    0x0cd372c7, 0xccb75ad0, 0x8c1a22e9, 0x4c7e0afe, 0x0c40d39b,
+    0xcc24fb8c, 0x8c8983b5, 0x4cedaba2, 0x0cf5317e, 0xcc911969,
+    0x8c3c6150, 0x4c584947, 0x0c669022, 0xcc02b835, 0x8cafc00c,
+    0x4ccbe81b, 0x4d99856e, 0x8dfdad79, 0xcd50d540, 0x0d34fd57,
+    0x4d0a2432, 0x8d6e0c25, 0xcdc3741c, 0x0da75c0b, 0x4dbfc6d7,
+    0x8ddbeec0, 0xcd7696f9, 0x0d12beee, 0x4d2c678b, 0x8d484f9c,
+    0xcde537a5, 0x0d811fb2, 0x0862a385, 0xc8068b92, 0x88abf3ab,
+    0x48cfdbbc, 0x08f102d9, 0xc8952ace, 0x883852f7, 0x485c7ae0,
+    0x0844e03c, 0xc820c82b, 0x888db012, 0x48e99805, 0x08d74160,
+    0xc8b36977, 0x881e114e, 0x487a3959, 0x4928542c, 0x894c7c3b,
+    0xc9e10402, 0x09852c15, 0x49bbf570, 0x89dfdd67, 0xc972a55e,
+    0x09168d49, 0x490e1795, 0x896a3f82, 0xc9c747bb, 0x09a36fac,
+    0x499db6c9, 0x89f99ede, 0xc954e6e7, 0x0930cef0, 0xcbf03c0d,
+    0x0b94141a, 0x4b396c23, 0x8b5d4434, 0xcb639d51, 0x0b07b546,
+    0x4baacd7f, 0x8bcee568, 0xcbd67fb4, 0x0bb257a3, 0x4b1f2f9a,
+    0x8b7b078d, 0xcb45dee8, 0x0b21f6ff, 0x4b8c8ec6, 0x8be8a6d1,
+    0x8abacba4, 0x4adee3b3, 0x0a739b8a, 0xca17b39d, 0x8a296af8,
+    0x4a4d42ef, 0x0ae03ad6, 0xca8412c1, 0x8a9c881d, 0x4af8a00a,
+    0x0a55d833, 0xca31f024, 0x8a0f2941, 0x4a6b0156, 0x0ac6796f,
+    0xcaa25178},
+   {0x00000000, 0xd4ea739b, 0xe9d396ed, 0x3d39e576, 0x93a15c00,
+    0x474b2f9b, 0x7a72caed, 0xae98b976, 0x2643b900, 0xf2a9ca9b,
+    0xcf902fed, 0x1b7a5c76, 0xb5e2e500, 0x6108969b, 0x5c3173ed,
+    0x88db0076, 0x4c867201, 0x986c019a, 0xa555e4ec, 0x71bf9777,
+    0xdf272e01, 0x0bcd5d9a, 0x36f4b8ec, 0xe21ecb77, 0x6ac5cb01,
+    0xbe2fb89a, 0x83165dec, 0x57fc2e77, 0xf9649701, 0x2d8ee49a,
+    0x10b701ec, 0xc45d7277, 0x980ce502, 0x4ce69699, 0x71df73ef,
+    0xa5350074, 0x0badb902, 0xdf47ca99, 0xe27e2fef, 0x36945c74,
+    0xbe4f5c02, 0x6aa52f99, 0x579ccaef, 0x8376b974, 0x2dee0002,
+    0xf9047399, 0xc43d96ef, 0x10d7e574, 0xd48a9703, 0x0060e498,
+    0x3d5901ee, 0xe9b37275, 0x472bcb03, 0x93c1b898, 0xaef85dee,
+    0x7a122e75, 0xf2c92e03, 0x26235d98, 0x1b1ab8ee, 0xcff0cb75,
+    0x61687203, 0xb5820198, 0x88bbe4ee, 0x5c519775, 0x3019ca05,
+    0xe4f3b99e, 0xd9ca5ce8, 0x0d202f73, 0xa3b89605, 0x7752e59e,
+    0x4a6b00e8, 0x9e817373, 0x165a7305, 0xc2b0009e, 0xff89e5e8,
+    0x2b639673, 0x85fb2f05, 0x51115c9e, 0x6c28b9e8, 0xb8c2ca73,
+    0x7c9fb804, 0xa875cb9f, 0x954c2ee9, 0x41a65d72, 0xef3ee404,
+    0x3bd4979f, 0x06ed72e9, 0xd2070172, 0x5adc0104, 0x8e36729f,
+    0xb30f97e9, 0x67e5e472, 0xc97d5d04, 0x1d972e9f, 0x20aecbe9,
+    0xf444b872, 0xa8152f07, 0x7cff5c9c, 0x41c6b9ea, 0x952cca71,
+    0x3bb47307, 0xef5e009c, 0xd267e5ea, 0x068d9671, 0x8e569607,
+    0x5abce59c, 0x678500ea, 0xb36f7371, 0x1df7ca07, 0xc91db99c,
+    0xf4245cea, 0x20ce2f71, 0xe4935d06, 0x30792e9d, 0x0d40cbeb,
+    0xd9aab870, 0x77320106, 0xa3d8729d, 0x9ee197eb, 0x4a0be470,
+    0xc2d0e406, 0x163a979d, 0x2b0372eb, 0xffe90170, 0x5171b806,
+    0x859bcb9d, 0xb8a22eeb, 0x6c485d70, 0x6032940b, 0xb4d8e790,
+    0x89e102e6, 0x5d0b717d, 0xf393c80b, 0x2779bb90, 0x1a405ee6,
+    0xceaa2d7d, 0x46712d0b, 0x929b5e90, 0xafa2bbe6, 0x7b48c87d,
+    0xd5d0710b, 0x013a0290, 0x3c03e7e6, 0xe8e9947d, 0x2cb4e60a,
+    0xf85e9591, 0xc56770e7, 0x118d037c, 0xbf15ba0a, 0x6bffc991,
+    0x56c62ce7, 0x822c5f7c, 0x0af75f0a, 0xde1d2c91, 0xe324c9e7,
+    0x37ceba7c, 0x9956030a, 0x4dbc7091, 0x708595e7, 0xa46fe67c,
+    0xf83e7109, 0x2cd40292, 0x11ede7e4, 0xc507947f, 0x6b9f2d09,
+    0xbf755e92, 0x824cbbe4, 0x56a6c87f, 0xde7dc809, 0x0a97bb92,
+    0x37ae5ee4, 0xe3442d7f, 0x4ddc9409, 0x9936e792, 0xa40f02e4,
+    0x70e5717f, 0xb4b80308, 0x60527093, 0x5d6b95e5, 0x8981e67e,
+    0x27195f08, 0xf3f32c93, 0xcecac9e5, 0x1a20ba7e, 0x92fbba08,
+    0x4611c993, 0x7b282ce5, 0xafc25f7e, 0x015ae608, 0xd5b09593,
+    0xe88970e5, 0x3c63037e, 0x502b5e0e, 0x84c12d95, 0xb9f8c8e3,
+    0x6d12bb78, 0xc38a020e, 0x17607195, 0x2a5994e3, 0xfeb3e778,
+    0x7668e70e, 0xa2829495, 0x9fbb71e3, 0x4b510278, 0xe5c9bb0e,
+    0x3123c895, 0x0c1a2de3, 0xd8f05e78, 0x1cad2c0f, 0xc8475f94,
+    0xf57ebae2, 0x2194c979, 0x8f0c700f, 0x5be60394, 0x66dfe6e2,
+    0xb2359579, 0x3aee950f, 0xee04e694, 0xd33d03e2, 0x07d77079,
+    0xa94fc90f, 0x7da5ba94, 0x409c5fe2, 0x94762c79, 0xc827bb0c,
+    0x1ccdc897, 0x21f42de1, 0xf51e5e7a, 0x5b86e70c, 0x8f6c9497,
+    0xb25571e1, 0x66bf027a, 0xee64020c, 0x3a8e7197, 0x07b794e1,
+    0xd35de77a, 0x7dc55e0c, 0xa92f2d97, 0x9416c8e1, 0x40fcbb7a,
+    0x84a1c90d, 0x504bba96, 0x6d725fe0, 0xb9982c7b, 0x1700950d,
+    0xc3eae696, 0xfed303e0, 0x2a39707b, 0xa2e2700d, 0x76080396,
+    0x4b31e6e0, 0x9fdb957b, 0x31432c0d, 0xe5a95f96, 0xd890bae0,
+    0x0c7ac97b},
+   {0x00000000, 0x27652581, 0x0fcc3bd9, 0x28a91e58, 0x5f9e0669,
+    0x78fb23e8, 0x50523db0, 0x77371831, 0xbe3c0dd2, 0x99592853,
+    0xb1f0360b, 0x9695138a, 0xe1a20bbb, 0xc6c72e3a, 0xee6e3062,
+    0xc90b15e3, 0x3d7f6b7f, 0x1a1a4efe, 0x32b350a6, 0x15d67527,
+    0x62e16d16, 0x45844897, 0x6d2d56cf, 0x4a48734e, 0x834366ad,
+    0xa426432c, 0x8c8f5d74, 0xabea78f5, 0xdcdd60c4, 0xfbb84545,
+    0xd3115b1d, 0xf4747e9c, 0x7afed6fe, 0x5d9bf37f, 0x7532ed27,
+    0x5257c8a6, 0x2560d097, 0x0205f516, 0x2aaceb4e, 0x0dc9cecf,
+    0xc4c2db2c, 0xe3a7fead, 0xcb0ee0f5, 0xec6bc574, 0x9b5cdd45,
+    0xbc39f8c4, 0x9490e69c, 0xb3f5c31d, 0x4781bd81, 0x60e49800,
+    0x484d8658, 0x6f28a3d9, 0x181fbbe8, 0x3f7a9e69, 0x17d38031,
+    0x30b6a5b0, 0xf9bdb053, 0xded895d2, 0xf6718b8a, 0xd114ae0b,
+    0xa623b63a, 0x814693bb, 0xa9ef8de3, 0x8e8aa862, 0xb5fadc26,
+    0x929ff9a7, 0xba36e7ff, 0x9d53c27e, 0xea64da4f, 0xcd01ffce,
+    0xe5a8e196, 0xc2cdc417, 0x0bc6d1f4, 0x2ca3f475, 0x040aea2d,
+    0x236fcfac, 0x5458d79d, 0x733df21c, 0x5b94ec44, 0x7cf1c9c5,
+    0x8885b759, 0xafe092d8, 0x87498c80, 0xa02ca901, 0xd71bb130,
+    0xf07e94b1, 0xd8d78ae9, 0xffb2af68, 0x36b9ba8b, 0x11dc9f0a,
+    0x39758152, 0x1e10a4d3, 0x6927bce2, 0x4e429963, 0x66eb873b,
+    0x418ea2ba, 0xcf040ad8, 0xe8612f59, 0xc0c83101, 0xe7ad1480,
+    0x909a0cb1, 0xb7ff2930, 0x9f563768, 0xb83312e9, 0x7138070a,
+    0x565d228b, 0x7ef43cd3, 0x59911952, 0x2ea60163, 0x09c324e2,
+    0x216a3aba, 0x060f1f3b, 0xf27b61a7, 0xd51e4426, 0xfdb75a7e,
+    0xdad27fff, 0xade567ce, 0x8a80424f, 0xa2295c17, 0x854c7996,
+    0x4c476c75, 0x6b2249f4, 0x438b57ac, 0x64ee722d, 0x13d96a1c,
+    0x34bc4f9d, 0x1c1551c5, 0x3b707444, 0x6af5b94d, 0x4d909ccc,
+    0x65398294, 0x425ca715, 0x356bbf24, 0x120e9aa5, 0x3aa784fd,
+    0x1dc2a17c, 0xd4c9b49f, 0xf3ac911e, 0xdb058f46, 0xfc60aac7,
+    0x8b57b2f6, 0xac329777, 0x849b892f, 0xa3feacae, 0x578ad232,
+    0x70eff7b3, 0x5846e9eb, 0x7f23cc6a, 0x0814d45b, 0x2f71f1da,
+    0x07d8ef82, 0x20bdca03, 0xe9b6dfe0, 0xced3fa61, 0xe67ae439,
+    0xc11fc1b8, 0xb628d989, 0x914dfc08, 0xb9e4e250, 0x9e81c7d1,
+    0x100b6fb3, 0x376e4a32, 0x1fc7546a, 0x38a271eb, 0x4f9569da,
+    0x68f04c5b, 0x40595203, 0x673c7782, 0xae376261, 0x895247e0,
+    0xa1fb59b8, 0x869e7c39, 0xf1a96408, 0xd6cc4189, 0xfe655fd1,
+    0xd9007a50, 0x2d7404cc, 0x0a11214d, 0x22b83f15, 0x05dd1a94,
+    0x72ea02a5, 0x558f2724, 0x7d26397c, 0x5a431cfd, 0x9348091e,
+    0xb42d2c9f, 0x9c8432c7, 0xbbe11746, 0xccd60f77, 0xebb32af6,
+    0xc31a34ae, 0xe47f112f, 0xdf0f656b, 0xf86a40ea, 0xd0c35eb2,
+    0xf7a67b33, 0x80916302, 0xa7f44683, 0x8f5d58db, 0xa8387d5a,
+    0x613368b9, 0x46564d38, 0x6eff5360, 0x499a76e1, 0x3ead6ed0,
+    0x19c84b51, 0x31615509, 0x16047088, 0xe2700e14, 0xc5152b95,
+    0xedbc35cd, 0xcad9104c, 0xbdee087d, 0x9a8b2dfc, 0xb22233a4,
+    0x95471625, 0x5c4c03c6, 0x7b292647, 0x5380381f, 0x74e51d9e,
+    0x03d205af, 0x24b7202e, 0x0c1e3e76, 0x2b7b1bf7, 0xa5f1b395,
+    0x82949614, 0xaa3d884c, 0x8d58adcd, 0xfa6fb5fc, 0xdd0a907d,
+    0xf5a38e25, 0xd2c6aba4, 0x1bcdbe47, 0x3ca89bc6, 0x1401859e,
+    0x3364a01f, 0x4453b82e, 0x63369daf, 0x4b9f83f7, 0x6cfaa676,
+    0x988ed8ea, 0xbfebfd6b, 0x9742e333, 0xb027c6b2, 0xc710de83,
+    0xe075fb02, 0xc8dce55a, 0xefb9c0db, 0x26b2d538, 0x01d7f0b9,
+    0x297eeee1, 0x0e1bcb60, 0x792cd351, 0x5e49f6d0, 0x76e0e888,
+    0x5185cd09}};
+
+#endif /* W */
+
+#endif /* N == 6 */
+
+static const uint32_t x2n_table[] = {
+    0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000,
+    0xedb88320, 0xb1e6b092, 0xa06a2517, 0xed627dae, 0x88d14467,
+    0xd7bbfe6a, 0xec447f11, 0x8e7ea170, 0x6427800e, 0x4d47bae0,
+    0x09fe548f, 0x83852d0f, 0x30362f1a, 0x7b5a9cc3, 0x31fec169,
+    0x9fec022a, 0x6c8dedc4, 0x15d6874d, 0x5fde7a4e, 0xbad90e37,
+    0x2e4e5eef, 0x4eaba214, 0xa8a472c0, 0x429a969e, 0x148d302a,
+    0xc40ba6d0, 0xc4e22c3c};
+
+#endif /* CRC32_BRAID_TBL_H_ */
diff --git a/3rdparty/zlib-ng/crc32_fold.c b/3rdparty/zlib-ng/crc32_fold.c
new file mode 100644
index 000000000000..5b3c7c459fd3
--- /dev/null
+++ b/3rdparty/zlib-ng/crc32_fold.c
@@ -0,0 +1,33 @@
+/* crc32_fold.c -- crc32 folding interface
+ * Copyright (C) 2021 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "functable.h"
+
+#include "crc32_fold.h"
+
+#include <limits.h>
+
+Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
+    crc->value = CRC32_INITIAL_VALUE;
+    return crc->value;
+}
+
+Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    crc->value = functable.crc32(crc->value, src, len);
+    memcpy(dst, src, len);
+}
+
+Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
+    /* Note: while this is basically the same thing as the vanilla CRC function, we still need
+     * a functable entry for it so that we can generically dispatch to this function with the
+     * same arguments for the versions that _do_ do a folding CRC but we don't want a copy. The
+     * init_crc is an unused argument in this context */
+    Z_UNUSED(init_crc);
+    crc->value = functable.crc32(crc->value, src, len);
+}
+
+Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {
+    return crc->value;
+}
diff --git a/3rdparty/zlib-ng/crc32_fold.h b/3rdparty/zlib-ng/crc32_fold.h
new file mode 100644
index 000000000000..0d2ff66967de
--- /dev/null
+++ b/3rdparty/zlib-ng/crc32_fold.h
@@ -0,0 +1,21 @@
+/* crc32_fold.h -- crc32 folding interface
+ * Copyright (C) 2021 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef CRC32_FOLD_H_
+#define CRC32_FOLD_H_
+
+#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
+/* sizeof(__m128i) * (4 folds) */
+
+typedef struct crc32_fold_s {
+    uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
+    uint32_t value;
+} crc32_fold;
+
+Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
+Z_INTERNAL void     crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+Z_INTERNAL void     crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
+Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
+
+#endif
diff --git a/3rdparty/zlib-ng/deflate.c b/3rdparty/zlib-ng/deflate.c
new file mode 100644
index 000000000000..2a0a20e5d29a
--- /dev/null
+++ b/3rdparty/zlib-ng/deflate.c
@@ -0,0 +1,1410 @@
+/* deflate.c -- compress data using the deflation algorithm
+ * Copyright (C) 1995-2023 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/*
+ *  ALGORITHM
+ *
+ *      The "deflation" process depends on being able to identify portions
+ *      of the input text which are identical to earlier input (within a
+ *      sliding window trailing behind the input currently being processed).
+ *
+ *      The most straightforward technique turns out to be the fastest for
+ *      most input files: try all possible matches and select the longest.
+ *      The key feature of this algorithm is that insertions into the string
+ *      dictionary are very simple and thus fast, and deletions are avoided
+ *      completely. Insertions are performed at each input character, whereas
+ *      string matches are performed only when the previous match ends. So it
+ *      is preferable to spend more time in matches to allow very fast string
+ *      insertions and avoid deletions. The matching algorithm for small
+ *      strings is inspired from that of Rabin & Karp. A brute force approach
+ *      is used to find longer strings when a small match has been found.
+ *      A similar algorithm is used in comic (by Jan-Mark Wams) and freeze
+ *      (by Leonid Broukhis).
+ *         A previous version of this file used a more sophisticated algorithm
+ *      (by Fiala and Greene) which is guaranteed to run in linear amortized
+ *      time, but has a larger average cost, uses more memory and is patented.
+ *      However the F&G algorithm may be faster for some highly redundant
+ *      files if the parameter max_chain_length (described below) is too large.
+ *
+ *  ACKNOWLEDGEMENTS
+ *
+ *      The idea of lazy evaluation of matches is due to Jan-Mark Wams, and
+ *      I found it in 'freeze' written by Leonid Broukhis.
+ *      Thanks to many people for bug reports and testing.
+ *
+ *  REFERENCES
+ *
+ *      Deutsch, L.P.,"DEFLATE Compressed Data Format Specification".
+ *      Available in https://tools.ietf.org/html/rfc1951
+ *
+ *      A description of the Rabin and Karp algorithm is given in the book
+ *         "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
+ *
+ *      Fiala,E.R., and Greene,D.H.
+ *         Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-595
+ *
+ */
+
+#include "zbuild.h"
+#include "deflate.h"
+#include "deflate_p.h"
+#include "functable.h"
+
+/* Avoid conflicts with zlib.h macros */
+#ifdef ZLIB_COMPAT
+# undef deflateInit
+# undef deflateInit2
+#endif
+
+const char PREFIX(deflate_copyright)[] = " deflate 1.3.0 Copyright 1995-2023 Jean-loup Gailly and Mark Adler ";
+/*
+  If you use the zlib library in a product, an acknowledgment is welcome
+  in the documentation of your product. If for some reason you cannot
+  include such an acknowledgment, I would appreciate that you keep this
+  copyright string in the executable of your product.
+ */
+
+/* ===========================================================================
+ *  Architecture-specific hooks.
+ */
+#ifdef S390_DFLTCC_DEFLATE
+#  include "arch/s390/dfltcc_deflate.h"
+#else
+/* Memory management for the deflate state. Useful for allocating arch-specific extension blocks. */
+#  define ZALLOC_DEFLATE_STATE(strm) ((deflate_state *)ZALLOC(strm, 1, sizeof(deflate_state)))
+#  define ZFREE_STATE(strm, addr) ZFREE(strm, addr)
+#  define ZCOPY_DEFLATE_STATE(dst, src) memcpy(dst, src, sizeof(deflate_state))
+/* Memory management for the window. Useful for allocation the aligned window. */
+#  define ZALLOC_WINDOW(strm, items, size) ZALLOC(strm, items, size)
+#  define TRY_FREE_WINDOW(strm, addr) TRY_FREE(strm, addr)
+/* Invoked at the beginning of deflateSetDictionary(). Useful for checking arch-specific window data. */
+#  define DEFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) do {} while (0)
+/* Invoked at the beginning of deflateGetDictionary(). Useful for adjusting arch-specific window data. */
+#  define DEFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) do {} while (0)
+/* Invoked at the end of deflateResetKeep(). Useful for initializing arch-specific extension blocks. */
+#  define DEFLATE_RESET_KEEP_HOOK(strm) do {} while (0)
+/* Invoked at the beginning of deflateParams(). Useful for updating arch-specific compression parameters. */
+#  define DEFLATE_PARAMS_HOOK(strm, level, strategy, hook_flush) do {} while (0)
+/* Returns whether the last deflate(flush) operation did everything it's supposed to do. */
+#  define DEFLATE_DONE(strm, flush) 1
+/* Adjusts the upper bound on compressed data length based on compression parameters and uncompressed data length.
+ * Useful when arch-specific deflation code behaves differently than regular zlib-ng algorithms. */
+#  define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, sourceLen) do {} while (0)
+/* Returns whether an optimistic upper bound on compressed data length should *not* be used.
+ * Useful when arch-specific deflation code behaves differently than regular zlib-ng algorithms. */
+#  define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) 0
+/* Invoked for each deflate() call. Useful for plugging arch-specific deflation code. */
+#  define DEFLATE_HOOK(strm, flush, bstate) 0
+/* Returns whether zlib-ng should compute a checksum. Set to 0 if arch-specific deflation code already does that. */
+#  define DEFLATE_NEED_CHECKSUM(strm) 1
+/* Returns whether reproducibility parameter can be set to a given value. */
+#  define DEFLATE_CAN_SET_REPRODUCIBLE(strm, reproducible) 1
+#endif
+
+/* ===========================================================================
+ *  Function prototypes.
+ */
+static int deflateStateCheck      (PREFIX3(stream) *strm);
+Z_INTERNAL block_state deflate_stored(deflate_state *s, int flush);
+Z_INTERNAL block_state deflate_fast  (deflate_state *s, int flush);
+Z_INTERNAL block_state deflate_quick (deflate_state *s, int flush);
+#ifndef NO_MEDIUM_STRATEGY
+Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush);
+#endif
+Z_INTERNAL block_state deflate_slow  (deflate_state *s, int flush);
+Z_INTERNAL block_state deflate_rle   (deflate_state *s, int flush);
+Z_INTERNAL block_state deflate_huff  (deflate_state *s, int flush);
+static void lm_set_level         (deflate_state *s, int level);
+static void lm_init              (deflate_state *s);
+Z_INTERNAL unsigned read_buf  (PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
+
+extern uint32_t update_hash_roll        (deflate_state *const s, uint32_t h, uint32_t val);
+extern void     insert_string_roll      (deflate_state *const s, uint32_t str, uint32_t count);
+extern Pos      quick_insert_string_roll(deflate_state *const s, uint32_t str);
+
+/* ===========================================================================
+ * Local data
+ */
+
+/* Values for max_lazy_match, good_match and max_chain_length, depending on
+ * the desired pack level (0..9). The values given below have been tuned to
+ * exclude worst case performance for pathological files. Better values may be
+ * found for specific files.
+ */
+typedef struct config_s {
+    uint16_t good_length; /* reduce lazy search above this match length */
+    uint16_t max_lazy;    /* do not perform lazy search above this match length */
+    uint16_t nice_length; /* quit search above this match length */
+    uint16_t max_chain;
+    compress_func func;
+} config;
+
+static const config configuration_table[10] = {
+/*      good lazy nice chain */
+/* 0 */ {0,    0,  0,    0, deflate_stored},  /* store only */
+
+#ifdef NO_QUICK_STRATEGY
+/* 1 */ {4,    4,  8,    4, deflate_fast}, /* max speed, no lazy matches */
+/* 2 */ {4,    5, 16,    8, deflate_fast},
+#else
+/* 1 */ {0,    0,  0,    0, deflate_quick},
+/* 2 */ {4,    4,  8,    4, deflate_fast}, /* max speed, no lazy matches */
+#endif
+
+#ifdef NO_MEDIUM_STRATEGY
+/* 3 */ {4,    6, 32,   32, deflate_fast},
+/* 4 */ {4,    4, 16,   16, deflate_slow},  /* lazy matches */
+/* 5 */ {8,   16, 32,   32, deflate_slow},
+/* 6 */ {8,   16, 128, 128, deflate_slow},
+#else
+/* 3 */ {4,    6, 16,    6, deflate_medium},
+/* 4 */ {4,   12, 32,   24, deflate_medium},  /* lazy matches */
+/* 5 */ {8,   16, 32,   32, deflate_medium},
+/* 6 */ {8,   16, 128, 128, deflate_medium},
+#endif
+
+/* 7 */ {8,   32, 128,  256, deflate_slow},
+/* 8 */ {32, 128, 258, 1024, deflate_slow},
+/* 9 */ {32, 258, 258, 4096, deflate_slow}}; /* max compression */
+
+/* Note: the deflate() code requires max_lazy >= STD_MIN_MATCH and max_chain >= 4
+ * For deflate_fast() (levels <= 3) good is ignored and lazy has a different
+ * meaning.
+ */
+
+/* rank Z_BLOCK between Z_NO_FLUSH and Z_PARTIAL_FLUSH */
+#define RANK(f) (((f) * 2) - ((f) > 4 ? 9 : 0))
+
+
+/* ===========================================================================
+ * Initialize the hash table. prev[] will be initialized on the fly.
+ */
+#define CLEAR_HASH(s) do { \
+    memset((unsigned char *)s->head, 0, HASH_SIZE * sizeof(*s->head)); \
+  } while (0)
+
+/* ========================================================================= */
+/* This function is hidden in ZLIB_COMPAT builds. */
+int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level, int32_t method, int32_t windowBits,
+                                            int32_t memLevel, int32_t strategy) {
+    /* Todo: ignore strm->next_in if we use it as window */
+    uint32_t window_padding = 0;
+    deflate_state *s;
+    int wrap = 1;
+
+    /* Force initialization functable, because deflate captures function pointers from functable. */
+    functable.force_init();
+
+    if (strm == NULL)
+        return Z_STREAM_ERROR;
+
+    strm->msg = NULL;
+    if (strm->zalloc == NULL) {
+        strm->zalloc = PREFIX(zcalloc);
+        strm->opaque = NULL;
+    }
+    if (strm->zfree == NULL)
+        strm->zfree = PREFIX(zcfree);
+
+    if (level == Z_DEFAULT_COMPRESSION)
+        level = 6;
+
+    if (windowBits < 0) { /* suppress zlib wrapper */
+        wrap = 0;
+        if (windowBits < -MAX_WBITS)
+            return Z_STREAM_ERROR;
+        windowBits = -windowBits;
+#ifdef GZIP
+    } else if (windowBits > MAX_WBITS) {
+        wrap = 2;       /* write gzip wrapper instead */
+        windowBits -= 16;
+#endif
+    }
+    if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED || windowBits < MIN_WBITS ||
+        windowBits > MAX_WBITS || level < 0 || level > 9 || strategy < 0 || strategy > Z_FIXED ||
+        (windowBits == 8 && wrap != 1)) {
+        return Z_STREAM_ERROR;
+    }
+    if (windowBits == 8)
+        windowBits = 9;  /* until 256-byte window bug fixed */
+
+    s = ZALLOC_DEFLATE_STATE(strm);
+    if (s == NULL)
+        return Z_MEM_ERROR;
+    strm->state = (struct internal_state *)s;
+    s->strm = strm;
+    s->status = INIT_STATE;     /* to pass state test in deflateReset() */
+
+    s->wrap = wrap;
+    s->gzhead = NULL;
+    s->w_bits = (unsigned int)windowBits;
+    s->w_size = 1 << s->w_bits;
+    s->w_mask = s->w_size - 1;
+
+#ifdef X86_PCLMULQDQ_CRC
+    window_padding = 8;
+#endif
+
+    s->window = (unsigned char *) ZALLOC_WINDOW(strm, s->w_size + window_padding, 2*sizeof(unsigned char));
+    s->prev   = (Pos *)  ZALLOC(strm, s->w_size, sizeof(Pos));
+    memset(s->prev, 0, s->w_size * sizeof(Pos));
+    s->head   = (Pos *)  ZALLOC(strm, HASH_SIZE, sizeof(Pos));
+
+    s->high_water = 0;      /* nothing written to s->window yet */
+
+    s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */
+
+    /* We overlay pending_buf and sym_buf. This works since the average size
+     * for length/distance pairs over any compressed block is assured to be 31
+     * bits or less.
+     *
+     * Analysis: The longest fixed codes are a length code of 8 bits plus 5
+     * extra bits, for lengths 131 to 257. The longest fixed distance codes are
+     * 5 bits plus 13 extra bits, for distances 16385 to 32768. The longest
+     * possible fixed-codes length/distance pair is then 31 bits total.
+     *
+     * sym_buf starts one-fourth of the way into pending_buf. So there are
+     * three bytes in sym_buf for every four bytes in pending_buf. Each symbol
+     * in sym_buf is three bytes -- two for the distance and one for the
+     * literal/length. As each symbol is consumed, the pointer to the next
+     * sym_buf value to read moves forward three bytes. From that symbol, up to
+     * 31 bits are written to pending_buf. The closest the written pending_buf
+     * bits gets to the next sym_buf symbol to read is just before the last
+     * code is written. At that time, 31*(n-2) bits have been written, just
+     * after 24*(n-2) bits have been consumed from sym_buf. sym_buf starts at
+     * 8*n bits into pending_buf. (Note that the symbol buffer fills when n-1
+     * symbols are written.) The closest the writing gets to what is unread is
+     * then n+14 bits. Here n is lit_bufsize, which is 16384 by default, and
+     * can range from 128 to 32768.
+     *
+     * Therefore, at a minimum, there are 142 bits of space between what is
+     * written and what is read in the overlain buffers, so the symbols cannot
+     * be overwritten by the compressed data. That space is actually 139 bits,
+     * due to the three-bit fixed-code block header.
+     *
+     * That covers the case where either Z_FIXED is specified, forcing fixed
+     * codes, or when the use of fixed codes is chosen, because that choice
+     * results in a smaller compressed block than dynamic codes. That latter
+     * condition then assures that the above analysis also covers all dynamic
+     * blocks. A dynamic-code block will only be chosen to be emitted if it has
+     * fewer bits than a fixed-code block would for the same set of symbols.
+     * Therefore its average symbol length is assured to be less than 31. So
+     * the compressed data for a dynamic block also cannot overwrite the
+     * symbols from which it is being constructed.
+     */
+
+    s->pending_buf = (unsigned char *) ZALLOC(strm, s->lit_bufsize, 4);
+    s->pending_buf_size = s->lit_bufsize * 4;
+
+    if (s->window == NULL || s->prev == NULL || s->head == NULL || s->pending_buf == NULL) {
+        s->status = FINISH_STATE;
+        strm->msg = ERR_MSG(Z_MEM_ERROR);
+        PREFIX(deflateEnd)(strm);
+        return Z_MEM_ERROR;
+    }
+    s->sym_buf = s->pending_buf + s->lit_bufsize;
+    s->sym_end = (s->lit_bufsize - 1) * 3;
+    /* We avoid equality with lit_bufsize*3 because of wraparound at 64K
+     * on 16 bit machines and because stored blocks are restricted to
+     * 64K-1 bytes.
+     */
+
+    s->level = level;
+    s->strategy = strategy;
+    s->block_open = 0;
+    s->reproducible = 0;
+
+    return PREFIX(deflateReset)(strm);
+}
+
+#ifndef ZLIB_COMPAT
+int32_t Z_EXPORT PREFIX(deflateInit)(PREFIX3(stream) *strm, int32_t level) {
+    return PREFIX(deflateInit2)(strm, level, Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+}
+#endif
+
+/* Function used by zlib.h and zlib-ng version 2.0 macros */
+int32_t Z_EXPORT PREFIX(deflateInit_)(PREFIX3(stream) *strm, int32_t level, const char *version, int32_t stream_size) {
+    if (CHECK_VER_STSIZE(version, stream_size))
+        return Z_VERSION_ERROR;
+    return PREFIX(deflateInit2)(strm, level, Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+}
+
+/* Function used by zlib.h and zlib-ng version 2.0 macros */
+int32_t Z_EXPORT PREFIX(deflateInit2_)(PREFIX3(stream) *strm, int32_t level, int32_t method, int32_t windowBits,
+                           int32_t memLevel, int32_t strategy, const char *version, int32_t stream_size) {
+    if (CHECK_VER_STSIZE(version, stream_size))
+        return Z_VERSION_ERROR;
+    return PREFIX(deflateInit2)(strm, level, method, windowBits, memLevel, strategy);
+}
+
+/* =========================================================================
+ * Check for a valid deflate stream state. Return 0 if ok, 1 if not.
+ */
+static int deflateStateCheck(PREFIX3(stream) *strm) {
+    deflate_state *s;
+    if (strm == NULL || strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0)
+        return 1;
+    s = strm->state;
+    if (s == NULL || s->strm != strm || (s->status < INIT_STATE || s->status > MAX_STATE))
+        return 1;
+    return 0;
+}
+
+/* ========================================================================= */
+int32_t Z_EXPORT PREFIX(deflateSetDictionary)(PREFIX3(stream) *strm, const uint8_t *dictionary, uint32_t dictLength) {
+    deflate_state *s;
+    unsigned int str, n;
+    int wrap;
+    uint32_t avail;
+    const unsigned char *next;
+
+    if (deflateStateCheck(strm) || dictionary == NULL)
+        return Z_STREAM_ERROR;
+    s = strm->state;
+    wrap = s->wrap;
+    if (wrap == 2 || (wrap == 1 && s->status != INIT_STATE) || s->lookahead)
+        return Z_STREAM_ERROR;
+
+    /* when using zlib wrappers, compute Adler-32 for provided dictionary */
+    if (wrap == 1)
+        strm->adler = functable.adler32(strm->adler, dictionary, dictLength);
+    DEFLATE_SET_DICTIONARY_HOOK(strm, dictionary, dictLength);  /* hook for IBM Z DFLTCC */
+    s->wrap = 0;                    /* avoid computing Adler-32 in read_buf */
+
+    /* if dictionary would fill window, just replace the history */
+    if (dictLength >= s->w_size) {
+        if (wrap == 0) {            /* already empty otherwise */
+            CLEAR_HASH(s);
+            s->strstart = 0;
+            s->block_start = 0;
+            s->insert = 0;
+        }
+        dictionary += dictLength - s->w_size;  /* use the tail */
+        dictLength = s->w_size;
+    }
+
+    /* insert dictionary into window and hash */
+    avail = strm->avail_in;
+    next = strm->next_in;
+    strm->avail_in = dictLength;
+    strm->next_in = (z_const unsigned char *)dictionary;
+    PREFIX(fill_window)(s);
+    while (s->lookahead >= STD_MIN_MATCH) {
+        str = s->strstart;
+        n = s->lookahead - (STD_MIN_MATCH - 1);
+        s->insert_string(s, str, n);
+        s->strstart = str + n;
+        s->lookahead = STD_MIN_MATCH - 1;
+        PREFIX(fill_window)(s);
+    }
+    s->strstart += s->lookahead;
+    s->block_start = (int)s->strstart;
+    s->insert = s->lookahead;
+    s->lookahead = 0;
+    s->prev_length = 0;
+    s->match_available = 0;
+    strm->next_in = (z_const unsigned char *)next;
+    strm->avail_in = avail;
+    s->wrap = wrap;
+    return Z_OK;
+}
+
+/* ========================================================================= */
+int32_t Z_EXPORT PREFIX(deflateGetDictionary)(PREFIX3(stream) *strm, uint8_t *dictionary, uint32_t *dictLength) {
+    deflate_state *s;
+    unsigned int len;
+
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    DEFLATE_GET_DICTIONARY_HOOK(strm, dictionary, dictLength);  /* hook for IBM Z DFLTCC */
+    s = strm->state;
+    len = s->strstart + s->lookahead;
+    if (len > s->w_size)
+        len = s->w_size;
+    if (dictionary != NULL && len)
+        memcpy(dictionary, s->window + s->strstart + s->lookahead - len, len);
+    if (dictLength != NULL)
+        *dictLength = len;
+    return Z_OK;
+}
+
+/* ========================================================================= */
+int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) {
+    deflate_state *s;
+
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+
+    strm->total_in = strm->total_out = 0;
+    strm->msg = NULL; /* use zfree if we ever allocate msg dynamically */
+    strm->data_type = Z_UNKNOWN;
+
+    s = (deflate_state *)strm->state;
+    s->pending = 0;
+    s->pending_out = s->pending_buf;
+
+    if (s->wrap < 0)
+        s->wrap = -s->wrap; /* was made negative by deflate(..., Z_FINISH); */
+
+    s->status =
+#ifdef GZIP
+        s->wrap == 2 ? GZIP_STATE :
+#endif
+        INIT_STATE;
+
+#ifdef GZIP
+    if (s->wrap == 2) {
+        strm->adler = functable.crc32_fold_reset(&s->crc_fold);
+    } else
+#endif
+        strm->adler = ADLER32_INITIAL_VALUE;
+    s->last_flush = -2;
+
+    zng_tr_init(s);
+
+    DEFLATE_RESET_KEEP_HOOK(strm);  /* hook for IBM Z DFLTCC */
+
+    return Z_OK;
+}
+
+/* ========================================================================= */
+int32_t Z_EXPORT PREFIX(deflateReset)(PREFIX3(stream) *strm) {
+    int ret = PREFIX(deflateResetKeep)(strm);
+    if (ret == Z_OK)
+        lm_init(strm->state);
+    return ret;
+}
+
+/* ========================================================================= */
+int32_t Z_EXPORT PREFIX(deflateSetHeader)(PREFIX3(stream) *strm, PREFIX(gz_headerp) head) {
+    if (deflateStateCheck(strm) || strm->state->wrap != 2)
+        return Z_STREAM_ERROR;
+    strm->state->gzhead = head;
+    return Z_OK;
+}
+
+/* ========================================================================= */
+int32_t Z_EXPORT PREFIX(deflatePending)(PREFIX3(stream) *strm, uint32_t *pending, int32_t *bits) {
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    if (pending != NULL)
+        *pending = strm->state->pending;
+    if (bits != NULL)
+        *bits = strm->state->bi_valid;
+    return Z_OK;
+}
+
+/* ========================================================================= */
+int32_t Z_EXPORT PREFIX(deflatePrime)(PREFIX3(stream) *strm, int32_t bits, int32_t value) {
+    deflate_state *s;
+    uint64_t value64 = (uint64_t)value;
+    int32_t put;
+
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    s = strm->state;
+    if (bits < 0 || bits > BIT_BUF_SIZE || bits > (int32_t)(sizeof(value) << 3) ||
+        s->sym_buf < s->pending_out + ((BIT_BUF_SIZE + 7) >> 3))
+        return Z_BUF_ERROR;
+    do {
+        put = BIT_BUF_SIZE - s->bi_valid;
+        put = MIN(put, bits);
+
+        if (s->bi_valid == 0)
+            s->bi_buf = value64;
+        else
+            s->bi_buf |= (value64 & ((UINT64_C(1) << put) - 1)) << s->bi_valid;
+        s->bi_valid += put;
+        zng_tr_flush_bits(s);
+        value64 >>= put;
+        bits -= put;
+    } while (bits);
+    return Z_OK;
+}
+
+/* ========================================================================= */
+int32_t Z_EXPORT PREFIX(deflateParams)(PREFIX3(stream) *strm, int32_t level, int32_t strategy) {
+    deflate_state *s;
+    compress_func func;
+    int hook_flush = Z_NO_FLUSH;
+
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    s = strm->state;
+
+    if (level == Z_DEFAULT_COMPRESSION)
+        level = 6;
+    if (level < 0 || level > 9 || strategy < 0 || strategy > Z_FIXED)
+        return Z_STREAM_ERROR;
+    DEFLATE_PARAMS_HOOK(strm, level, strategy, &hook_flush);  /* hook for IBM Z DFLTCC */
+    func = configuration_table[s->level].func;
+
+    if (((strategy != s->strategy || func != configuration_table[level].func) && s->last_flush != -2)
+        || hook_flush != Z_NO_FLUSH) {
+        /* Flush the last buffer. Use Z_BLOCK mode, unless the hook requests a "stronger" one. */
+        int flush = RANK(hook_flush) > RANK(Z_BLOCK) ? hook_flush : Z_BLOCK;
+        int err = PREFIX(deflate)(strm, flush);
+        if (err == Z_STREAM_ERROR)
+            return err;
+        if (strm->avail_in || ((int)s->strstart - s->block_start) + s->lookahead || !DEFLATE_DONE(strm, flush))
+            return Z_BUF_ERROR;
+    }
+    if (s->level != level) {
+        if (s->level == 0 && s->matches != 0) {
+            if (s->matches == 1) {
+                functable.slide_hash(s);
+            } else {
+                CLEAR_HASH(s);
+            }
+            s->matches = 0;
+        }
+
+        lm_set_level(s, level);
+    }
+    s->strategy = strategy;
+    return Z_OK;
+}
+
+/* ========================================================================= */
+int32_t Z_EXPORT PREFIX(deflateTune)(PREFIX3(stream) *strm, int32_t good_length, int32_t max_lazy, int32_t nice_length, int32_t max_chain) {
+    deflate_state *s;
+
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    s = strm->state;
+    s->good_match = (unsigned int)good_length;
+    s->max_lazy_match = (unsigned int)max_lazy;
+    s->nice_match = nice_length;
+    s->max_chain_length = (unsigned int)max_chain;
+    return Z_OK;
+}
+
+/* =========================================================================
+ * For the default windowBits of 15 and memLevel of 8, this function returns
+ * a close to exact, as well as small, upper bound on the compressed size.
+ * They are coded as constants here for a reason--if the #define's are
+ * changed, then this function needs to be changed as well.  The return
+ * value for 15 and 8 only works for those exact settings.
+ *
+ * For any setting other than those defaults for windowBits and memLevel,
+ * the value returned is a conservative worst case for the maximum expansion
+ * resulting from using fixed blocks instead of stored blocks, which deflate
+ * can emit on compressed data for some combinations of the parameters.
+ *
+ * This function could be more sophisticated to provide closer upper bounds for
+ * every combination of windowBits and memLevel.  But even the conservative
+ * upper bound of about 14% expansion does not seem onerous for output buffer
+ * allocation.
+ */
+unsigned long Z_EXPORT PREFIX(deflateBound)(PREFIX3(stream) *strm, unsigned long sourceLen) {
+    deflate_state *s;
+    unsigned long complen, wraplen;
+
+    /* conservative upper bound for compressed data */
+    complen = sourceLen + ((sourceLen + 7) >> 3) + ((sourceLen + 63) >> 6) + 5;
+    DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, sourceLen);  /* hook for IBM Z DFLTCC */
+
+    /* if can't get parameters, return conservative bound plus zlib wrapper */
+    if (deflateStateCheck(strm))
+        return complen + 6;
+
+    /* compute wrapper length */
+    s = strm->state;
+    switch (s->wrap) {
+    case 0:                                 /* raw deflate */
+        wraplen = 0;
+        break;
+    case 1:                                 /* zlib wrapper */
+        wraplen = ZLIB_WRAPLEN + (s->strstart ? 4 : 0);
+        break;
+#ifdef GZIP
+    case 2:                                 /* gzip wrapper */
+        wraplen = GZIP_WRAPLEN;
+        if (s->gzhead != NULL) {            /* user-supplied gzip header */
+            unsigned char *str;
+            if (s->gzhead->extra != NULL) {
+                wraplen += 2 + s->gzhead->extra_len;
+            }
+            str = s->gzhead->name;
+            if (str != NULL) {
+                do {
+                    wraplen++;
+                } while (*str++);
+            }
+            str = s->gzhead->comment;
+            if (str != NULL) {
+                do {
+                    wraplen++;
+                } while (*str++);
+            }
+            if (s->gzhead->hcrc)
+                wraplen += 2;
+        }
+        break;
+#endif
+    default:                                /* for compiler happiness */
+        wraplen = ZLIB_WRAPLEN;
+    }
+
+    /* if not default parameters, return conservative bound */
+    if (DEFLATE_NEED_CONSERVATIVE_BOUND(strm) ||  /* hook for IBM Z DFLTCC */
+            s->w_bits != MAX_WBITS || HASH_BITS < 15) {
+        if (s->level == 0) {
+            /* upper bound for stored blocks with length 127 (memLevel == 1) --
+               ~4% overhead plus a small constant */
+            complen = sourceLen + (sourceLen >> 5) + (sourceLen >> 7) + (sourceLen >> 11) + 7;
+        }
+
+        return complen + wraplen;
+    }
+
+#ifndef NO_QUICK_STRATEGY
+    return sourceLen                       /* The source size itself */
+      + (sourceLen == 0 ? 1 : 0)           /* Always at least one byte for any input */
+      + (sourceLen < 9 ? 1 : 0)            /* One extra byte for lengths less than 9 */
+      + DEFLATE_QUICK_OVERHEAD(sourceLen)  /* Source encoding overhead, padded to next full byte */
+      + DEFLATE_BLOCK_OVERHEAD             /* Deflate block overhead bytes */
+      + wraplen;                           /* none, zlib or gzip wrapper */
+#else
+    return sourceLen + (sourceLen >> 4) + 7 + wraplen;
+#endif
+}
+
+/* =========================================================================
+ * Flush as much pending output as possible. All deflate() output, except for
+ * some deflate_stored() output, goes through this function so some
+ * applications may wish to modify it to avoid allocating a large
+ * strm->next_out buffer and copying into it. (See also read_buf()).
+ */
+Z_INTERNAL void PREFIX(flush_pending)(PREFIX3(stream) *strm) {
+    uint32_t len;
+    deflate_state *s = strm->state;
+
+    zng_tr_flush_bits(s);
+    len = MIN(s->pending, strm->avail_out);
+    if (len == 0)
+        return;
+
+    Tracev((stderr, "[FLUSH]"));
+    memcpy(strm->next_out, s->pending_out, len);
+    strm->next_out  += len;
+    s->pending_out  += len;
+    strm->total_out += len;
+    strm->avail_out -= len;
+    s->pending      -= len;
+    if (s->pending == 0)
+        s->pending_out = s->pending_buf;
+}
+
+/* ===========================================================================
+ * Update the header CRC with the bytes s->pending_buf[beg..s->pending - 1].
+ */
+#define HCRC_UPDATE(beg) \
+    do { \
+        if (s->gzhead->hcrc && s->pending > (beg)) \
+            strm->adler = PREFIX(crc32)(strm->adler, s->pending_buf + (beg), s->pending - (beg)); \
+    } while (0)
+
+/* ========================================================================= */
+int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
+    int32_t old_flush; /* value of flush param for previous deflate call */
+    deflate_state *s;
+
+    if (deflateStateCheck(strm) || flush > Z_BLOCK || flush < 0)
+        return Z_STREAM_ERROR;
+    s = strm->state;
+
+    if (strm->next_out == NULL || (strm->avail_in != 0 && strm->next_in == NULL)
+        || (s->status == FINISH_STATE && flush != Z_FINISH)) {
+        ERR_RETURN(strm, Z_STREAM_ERROR);
+    }
+    if (strm->avail_out == 0) {
+        ERR_RETURN(strm, Z_BUF_ERROR);
+    }
+
+    old_flush = s->last_flush;
+    s->last_flush = flush;
+
+    /* Flush as much pending output as possible */
+    if (s->pending != 0) {
+        PREFIX(flush_pending)(strm);
+        if (strm->avail_out == 0) {
+            /* Since avail_out is 0, deflate will be called again with
+             * more output space, but possibly with both pending and
+             * avail_in equal to zero. There won't be anything to do,
+             * but this is not an error situation so make sure we
+             * return OK instead of BUF_ERROR at next call of deflate:
+             */
+            s->last_flush = -1;
+            return Z_OK;
+        }
+
+        /* Make sure there is something to do and avoid duplicate consecutive
+         * flushes. For repeated and useless calls with Z_FINISH, we keep
+         * returning Z_STREAM_END instead of Z_BUF_ERROR.
+         */
+    } else if (strm->avail_in == 0 && RANK(flush) <= RANK(old_flush) && flush != Z_FINISH) {
+        ERR_RETURN(strm, Z_BUF_ERROR);
+    }
+
+    /* User must not provide more input after the first FINISH: */
+    if (s->status == FINISH_STATE && strm->avail_in != 0)   {
+        ERR_RETURN(strm, Z_BUF_ERROR);
+    }
+
+    /* Write the header */
+    if (s->status == INIT_STATE && s->wrap == 0)
+        s->status = BUSY_STATE;
+    if (s->status == INIT_STATE) {
+        /* zlib header */
+        unsigned int header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8;
+        unsigned int level_flags;
+
+        if (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2)
+            level_flags = 0;
+        else if (s->level < 6)
+            level_flags = 1;
+        else if (s->level == 6)
+            level_flags = 2;
+        else
+            level_flags = 3;
+        header |= (level_flags << 6);
+        if (s->strstart != 0)
+            header |= PRESET_DICT;
+        header += 31 - (header % 31);
+
+        put_short_msb(s, (uint16_t)header);
+
+        /* Save the adler32 of the preset dictionary: */
+        if (s->strstart != 0)
+            put_uint32_msb(s, strm->adler);
+        strm->adler = ADLER32_INITIAL_VALUE;
+        s->status = BUSY_STATE;
+
+        /* Compression must start with an empty pending buffer */
+        PREFIX(flush_pending)(strm);
+        if (s->pending != 0) {
+            s->last_flush = -1;
+            return Z_OK;
+        }
+    }
+#ifdef GZIP
+    if (s->status == GZIP_STATE) {
+        /* gzip header */
+        functable.crc32_fold_reset(&s->crc_fold);
+        put_byte(s, 31);
+        put_byte(s, 139);
+        put_byte(s, 8);
+        if (s->gzhead == NULL) {
+            put_uint32(s, 0);
+            put_byte(s, 0);
+            put_byte(s, s->level == 9 ? 2 :
+                     (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ? 4 : 0));
+            put_byte(s, OS_CODE);
+            s->status = BUSY_STATE;
+
+            /* Compression must start with an empty pending buffer */
+            PREFIX(flush_pending)(strm);
+            if (s->pending != 0) {
+                s->last_flush = -1;
+                return Z_OK;
+            }
+        } else {
+            put_byte(s, (s->gzhead->text ? 1 : 0) +
+                     (s->gzhead->hcrc ? 2 : 0) +
+                     (s->gzhead->extra == NULL ? 0 : 4) +
+                     (s->gzhead->name == NULL ? 0 : 8) +
+                     (s->gzhead->comment == NULL ? 0 : 16)
+                     );
+            put_uint32(s, s->gzhead->time);
+            put_byte(s, s->level == 9 ? 2 : (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ? 4 : 0));
+            put_byte(s, s->gzhead->os & 0xff);
+            if (s->gzhead->extra != NULL)
+                put_short(s, (uint16_t)s->gzhead->extra_len);
+            if (s->gzhead->hcrc)
+                strm->adler = PREFIX(crc32)(strm->adler, s->pending_buf, s->pending);
+            s->gzindex = 0;
+            s->status = EXTRA_STATE;
+        }
+    }
+    if (s->status == EXTRA_STATE) {
+        if (s->gzhead->extra != NULL) {
+            uint32_t beg = s->pending;   /* start of bytes to update crc */
+            uint32_t left = (s->gzhead->extra_len & 0xffff) - s->gzindex;
+
+            while (s->pending + left > s->pending_buf_size) {
+                uint32_t copy = s->pending_buf_size - s->pending;
+                memcpy(s->pending_buf + s->pending, s->gzhead->extra + s->gzindex, copy);
+                s->pending = s->pending_buf_size;
+                HCRC_UPDATE(beg);
+                s->gzindex += copy;
+                PREFIX(flush_pending)(strm);
+                if (s->pending != 0) {
+                    s->last_flush = -1;
+                    return Z_OK;
+                }
+                beg = 0;
+                left -= copy;
+            }
+            memcpy(s->pending_buf + s->pending, s->gzhead->extra + s->gzindex, left);
+            s->pending += left;
+            HCRC_UPDATE(beg);
+            s->gzindex = 0;
+        }
+        s->status = NAME_STATE;
+    }
+    if (s->status == NAME_STATE) {
+        if (s->gzhead->name != NULL) {
+            uint32_t beg = s->pending;   /* start of bytes to update crc */
+            unsigned char val;
+
+            do {
+                if (s->pending == s->pending_buf_size) {
+                    HCRC_UPDATE(beg);
+                    PREFIX(flush_pending)(strm);
+                    if (s->pending != 0) {
+                        s->last_flush = -1;
+                        return Z_OK;
+                    }
+                    beg = 0;
+                }
+                val = s->gzhead->name[s->gzindex++];
+                put_byte(s, val);
+            } while (val != 0);
+            HCRC_UPDATE(beg);
+            s->gzindex = 0;
+        }
+        s->status = COMMENT_STATE;
+    }
+    if (s->status == COMMENT_STATE) {
+        if (s->gzhead->comment != NULL) {
+            uint32_t beg = s->pending;  /* start of bytes to update crc */
+            unsigned char val;
+
+            do {
+                if (s->pending == s->pending_buf_size) {
+                    HCRC_UPDATE(beg);
+                    PREFIX(flush_pending)(strm);
+                    if (s->pending != 0) {
+                        s->last_flush = -1;
+                        return Z_OK;
+                    }
+                    beg = 0;
+                }
+                val = s->gzhead->comment[s->gzindex++];
+                put_byte(s, val);
+            } while (val != 0);
+            HCRC_UPDATE(beg);
+        }
+        s->status = HCRC_STATE;
+    }
+    if (s->status == HCRC_STATE) {
+        if (s->gzhead->hcrc) {
+            if (s->pending + 2 > s->pending_buf_size) {
+                PREFIX(flush_pending)(strm);
+                if (s->pending != 0) {
+                    s->last_flush = -1;
+                    return Z_OK;
+                }
+            }
+            put_short(s, (uint16_t)strm->adler);
+            functable.crc32_fold_reset(&s->crc_fold);
+        }
+        s->status = BUSY_STATE;
+
+        /* Compression must start with an empty pending buffer */
+        PREFIX(flush_pending)(strm);
+        if (s->pending != 0) {
+            s->last_flush = -1;
+            return Z_OK;
+        }
+    }
+#endif
+
+    /* Start a new block or continue the current one.
+     */
+    if (strm->avail_in != 0 || s->lookahead != 0 || (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) {
+        block_state bstate;
+
+        bstate = DEFLATE_HOOK(strm, flush, &bstate) ? bstate :  /* hook for IBM Z DFLTCC */
+                 s->level == 0 ? deflate_stored(s, flush) :
+                 s->strategy == Z_HUFFMAN_ONLY ? deflate_huff(s, flush) :
+                 s->strategy == Z_RLE ? deflate_rle(s, flush) :
+                 (*(configuration_table[s->level].func))(s, flush);
+
+        if (bstate == finish_started || bstate == finish_done) {
+            s->status = FINISH_STATE;
+        }
+        if (bstate == need_more || bstate == finish_started) {
+            if (strm->avail_out == 0) {
+                s->last_flush = -1; /* avoid BUF_ERROR next call, see above */
+            }
+            return Z_OK;
+            /* If flush != Z_NO_FLUSH && avail_out == 0, the next call
+             * of deflate should use the same flush parameter to make sure
+             * that the flush is complete. So we don't have to output an
+             * empty block here, this will be done at next call. This also
+             * ensures that for a very small output buffer, we emit at most
+             * one empty block.
+             */
+        }
+        if (bstate == block_done) {
+            if (flush == Z_PARTIAL_FLUSH) {
+                zng_tr_align(s);
+            } else if (flush != Z_BLOCK) { /* FULL_FLUSH or SYNC_FLUSH */
+                zng_tr_stored_block(s, (char*)0, 0L, 0);
+                /* For a full flush, this empty block will be recognized
+                 * as a special marker by inflate_sync().
+                 */
+                if (flush == Z_FULL_FLUSH) {
+                    CLEAR_HASH(s);             /* forget history */
+                    if (s->lookahead == 0) {
+                        s->strstart = 0;
+                        s->block_start = 0;
+                        s->insert = 0;
+                    }
+                }
+            }
+            PREFIX(flush_pending)(strm);
+            if (strm->avail_out == 0) {
+                s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */
+                return Z_OK;
+            }
+        }
+    }
+
+    if (flush != Z_FINISH)
+        return Z_OK;
+
+    /* Write the trailer */
+#ifdef GZIP
+    if (s->wrap == 2) {
+        strm->adler = functable.crc32_fold_final(&s->crc_fold);
+
+        put_uint32(s, strm->adler);
+        put_uint32(s, (uint32_t)strm->total_in);
+    } else
+#endif
+    {
+        if (s->wrap == 1)
+            put_uint32_msb(s, strm->adler);
+    }
+    PREFIX(flush_pending)(strm);
+    /* If avail_out is zero, the application will call deflate again
+     * to flush the rest.
+     */
+    if (s->wrap > 0)
+        s->wrap = -s->wrap; /* write the trailer only once! */
+    if (s->pending == 0) {
+        Assert(s->bi_valid == 0, "bi_buf not flushed");
+        return Z_STREAM_END;
+    }
+    return Z_OK;
+}
+
+/* ========================================================================= */
+int32_t Z_EXPORT PREFIX(deflateEnd)(PREFIX3(stream) *strm) {
+    int32_t status;
+
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+
+    status = strm->state->status;
+
+    /* Deallocate in reverse order of allocations: */
+    TRY_FREE(strm, strm->state->pending_buf);
+    TRY_FREE(strm, strm->state->head);
+    TRY_FREE(strm, strm->state->prev);
+    TRY_FREE_WINDOW(strm, strm->state->window);
+
+    ZFREE_STATE(strm, strm->state);
+    strm->state = NULL;
+
+    return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK;
+}
+
+/* =========================================================================
+ * Copy the source state to the destination state.
+ */
+int32_t Z_EXPORT PREFIX(deflateCopy)(PREFIX3(stream) *dest, PREFIX3(stream) *source) {
+    deflate_state *ds;
+    deflate_state *ss;
+    uint32_t window_padding = 0;
+
+    if (deflateStateCheck(source) || dest == NULL)
+        return Z_STREAM_ERROR;
+
+    ss = source->state;
+
+    memcpy((void *)dest, (void *)source, sizeof(PREFIX3(stream)));
+
+    ds = ZALLOC_DEFLATE_STATE(dest);
+    if (ds == NULL)
+        return Z_MEM_ERROR;
+    dest->state = (struct internal_state *) ds;
+    ZCOPY_DEFLATE_STATE(ds, ss);
+    ds->strm = dest;
+
+#ifdef X86_PCLMULQDQ_CRC
+    window_padding = 8;
+#endif
+
+    ds->window = (unsigned char *) ZALLOC_WINDOW(dest, ds->w_size + window_padding, 2*sizeof(unsigned char));
+    ds->prev   = (Pos *)  ZALLOC(dest, ds->w_size, sizeof(Pos));
+    ds->head   = (Pos *)  ZALLOC(dest, HASH_SIZE, sizeof(Pos));
+    ds->pending_buf = (unsigned char *) ZALLOC(dest, ds->lit_bufsize, 4);
+
+    if (ds->window == NULL || ds->prev == NULL || ds->head == NULL || ds->pending_buf == NULL) {
+        PREFIX(deflateEnd)(dest);
+        return Z_MEM_ERROR;
+    }
+
+    memcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(unsigned char));
+    memcpy((void *)ds->prev, (void *)ss->prev, ds->w_size * sizeof(Pos));
+    memcpy((void *)ds->head, (void *)ss->head, HASH_SIZE * sizeof(Pos));
+    memcpy(ds->pending_buf, ss->pending_buf, ds->pending_buf_size);
+
+    ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf);
+    ds->sym_buf = ds->pending_buf + ds->lit_bufsize;
+
+    ds->l_desc.dyn_tree = ds->dyn_ltree;
+    ds->d_desc.dyn_tree = ds->dyn_dtree;
+    ds->bl_desc.dyn_tree = ds->bl_tree;
+
+    return Z_OK;
+}
+
+/* ===========================================================================
+ * Read a new buffer from the current input stream, update the adler32
+ * and total number of bytes read.  All deflate() input goes through
+ * this function so some applications may wish to modify it to avoid
+ * allocating a large strm->next_in buffer and copying from it.
+ * (See also flush_pending()).
+ */
+Z_INTERNAL unsigned PREFIX(read_buf)(PREFIX3(stream) *strm, unsigned char *buf, unsigned size) {
+    uint32_t len = MIN(strm->avail_in, size);
+    if (len == 0)
+        return 0;
+
+    strm->avail_in  -= len;
+
+    if (!DEFLATE_NEED_CHECKSUM(strm)) {
+        memcpy(buf, strm->next_in, len);
+#ifdef GZIP
+    } else if (strm->state->wrap == 2) {
+        functable.crc32_fold_copy(&strm->state->crc_fold, buf, strm->next_in, len);
+#endif
+    } else if (strm->state->wrap == 1) {
+        strm->adler = functable.adler32_fold_copy(strm->adler, buf, strm->next_in, len);
+    } else {
+        memcpy(buf, strm->next_in, len);
+    }
+    strm->next_in  += len;
+    strm->total_in += len;
+
+    return len;
+}
+
+/* ===========================================================================
+ * Set longest match variables based on level configuration
+ */
+static void lm_set_level(deflate_state *s, int level) {
+    s->max_lazy_match   = configuration_table[level].max_lazy;
+    s->good_match       = configuration_table[level].good_length;
+    s->nice_match       = configuration_table[level].nice_length;
+    s->max_chain_length = configuration_table[level].max_chain;
+
+    /* Use rolling hash for deflate_slow algorithm with level 9. It allows us to
+     * properly lookup different hash chains to speed up longest_match search. Since hashing
+     * method changes depending on the level we cannot put this into functable. */
+    if (s->max_chain_length > 1024) {
+        s->update_hash = &update_hash_roll;
+        s->insert_string = &insert_string_roll;
+        s->quick_insert_string = &quick_insert_string_roll;
+    } else {
+        s->update_hash = functable.update_hash;
+        s->insert_string = functable.insert_string;
+        s->quick_insert_string = functable.quick_insert_string;
+    }
+
+    s->level = level;
+}
+
+/* ===========================================================================
+ * Initialize the "longest match" routines for a new zlib stream
+ */
+static void lm_init(deflate_state *s) {
+    s->window_size = 2 * s->w_size;
+
+    CLEAR_HASH(s);
+
+    /* Set the default configuration parameters:
+     */
+    lm_set_level(s, s->level);
+
+    s->strstart = 0;
+    s->block_start = 0;
+    s->lookahead = 0;
+    s->insert = 0;
+    s->prev_length = 0;
+    s->match_available = 0;
+    s->match_start = 0;
+    s->ins_h = 0;
+}
+
+/* ===========================================================================
+ * Fill the window when the lookahead becomes insufficient.
+ * Updates strstart and lookahead.
+ *
+ * IN assertion: lookahead < MIN_LOOKAHEAD
+ * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
+ *    At least one byte has been read, or avail_in == 0; reads are
+ *    performed for at least two bytes (required for the zip translate_eol
+ *    option -- not supported here).
+ */
+
+void Z_INTERNAL PREFIX(fill_window)(deflate_state *s) {
+    unsigned n;
+    unsigned int more;    /* Amount of free space at the end of the window. */
+    unsigned int wsize = s->w_size;
+
+    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
+
+    do {
+        more = s->window_size - s->lookahead - s->strstart;
+
+        /* If the window is almost full and there is insufficient lookahead,
+         * move the upper half to the lower one to make room in the upper half.
+         */
+        if (s->strstart >= wsize+MAX_DIST(s)) {
+            memcpy(s->window, s->window+wsize, (unsigned)wsize);
+            if (s->match_start >= wsize) {
+                s->match_start -= wsize;
+            } else {
+                s->match_start = 0;
+                s->prev_length = 0;
+            }
+            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
+            s->block_start -= (int)wsize;
+            if (s->insert > s->strstart)
+                s->insert = s->strstart;
+            functable.slide_hash(s);
+            more += wsize;
+        }
+        if (s->strm->avail_in == 0)
+            break;
+
+        /* If there was no sliding:
+         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
+         *    more == window_size - lookahead - strstart
+         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
+         * => more >= window_size - 2*WSIZE + 2
+         * In the BIG_MEM or MMAP case (not yet supported),
+         *   window_size == input_size + MIN_LOOKAHEAD  &&
+         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
+         * Otherwise, window_size == 2*WSIZE so more >= 2.
+         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
+         */
+        Assert(more >= 2, "more < 2");
+
+        n = PREFIX(read_buf)(s->strm, s->window + s->strstart + s->lookahead, more);
+        s->lookahead += n;
+
+        /* Initialize the hash value now that we have some input: */
+        if (s->lookahead + s->insert >= STD_MIN_MATCH) {
+            unsigned int str = s->strstart - s->insert;
+            if (UNLIKELY(s->max_chain_length > 1024)) {
+                s->ins_h = s->update_hash(s, s->window[str], s->window[str+1]);
+            } else if (str >= 1) {
+                s->quick_insert_string(s, str + 2 - STD_MIN_MATCH);
+            }
+            unsigned int count = s->insert;
+            if (UNLIKELY(s->lookahead == 1)) {
+                count -= 1;
+            }
+            if (count > 0) {
+                s->insert_string(s, str, count);
+                s->insert -= count;
+            }
+        }
+        /* If the whole input has less than STD_MIN_MATCH bytes, ins_h is garbage,
+         * but this is not important since only literal bytes will be emitted.
+         */
+    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
+
+    /* If the WIN_INIT bytes after the end of the current data have never been
+     * written, then zero those bytes in order to avoid memory check reports of
+     * the use of uninitialized (or uninitialised as Julian writes) bytes by
+     * the longest match routines.  Update the high water mark for the next
+     * time through here.  WIN_INIT is set to STD_MAX_MATCH since the longest match
+     * routines allow scanning to strstart + STD_MAX_MATCH, ignoring lookahead.
+     */
+    if (s->high_water < s->window_size) {
+        unsigned int curr = s->strstart + s->lookahead;
+        unsigned int init;
+
+        if (s->high_water < curr) {
+            /* Previous high water mark below current data -- zero WIN_INIT
+             * bytes or up to end of window, whichever is less.
+             */
+            init = s->window_size - curr;
+            if (init > WIN_INIT)
+                init = WIN_INIT;
+            memset(s->window + curr, 0, init);
+            s->high_water = curr + init;
+        } else if (s->high_water < curr + WIN_INIT) {
+            /* High water mark at or above current data, but below current data
+             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
+             * to end of window, whichever is less.
+             */
+            init = curr + WIN_INIT - s->high_water;
+            if (init > s->window_size - s->high_water)
+                init = s->window_size - s->high_water;
+            memset(s->window + s->high_water, 0, init);
+            s->high_water += init;
+        }
+    }
+
+    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+           "not enough room for search");
+}
+
+#ifndef ZLIB_COMPAT
+/* =========================================================================
+ * Checks whether buffer size is sufficient and whether this parameter is a duplicate.
+ */
+static int32_t deflateSetParamPre(zng_deflate_param_value **out, size_t min_size, zng_deflate_param_value *param) {
+    int32_t buf_error = param->size < min_size;
+
+    if (*out != NULL) {
+        (*out)->status = Z_BUF_ERROR;
+        buf_error = 1;
+    }
+    *out = param;
+    return buf_error;
+}
+
+/* ========================================================================= */
+int32_t Z_EXPORT zng_deflateSetParams(zng_stream *strm, zng_deflate_param_value *params, size_t count) {
+    size_t i;
+    deflate_state *s;
+    zng_deflate_param_value *new_level = NULL;
+    zng_deflate_param_value *new_strategy = NULL;
+    zng_deflate_param_value *new_reproducible = NULL;
+    int param_buf_error;
+    int version_error = 0;
+    int buf_error = 0;
+    int stream_error = 0;
+
+    /* Initialize the statuses. */
+    for (i = 0; i < count; i++)
+        params[i].status = Z_OK;
+
+    /* Check whether the stream state is consistent. */
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    s = strm->state;
+
+    /* Check buffer sizes and detect duplicates. */
+    for (i = 0; i < count; i++) {
+        switch (params[i].param) {
+            case Z_DEFLATE_LEVEL:
+                param_buf_error = deflateSetParamPre(&new_level, sizeof(int), &params[i]);
+                break;
+            case Z_DEFLATE_STRATEGY:
+                param_buf_error = deflateSetParamPre(&new_strategy, sizeof(int), &params[i]);
+                break;
+            case Z_DEFLATE_REPRODUCIBLE:
+                param_buf_error = deflateSetParamPre(&new_reproducible, sizeof(int), &params[i]);
+                break;
+            default:
+                params[i].status = Z_VERSION_ERROR;
+                version_error = 1;
+                param_buf_error = 0;
+                break;
+        }
+        if (param_buf_error) {
+            params[i].status = Z_BUF_ERROR;
+            buf_error = 1;
+        }
+    }
+    /* Exit early if small buffers or duplicates are detected. */
+    if (buf_error)
+        return Z_BUF_ERROR;
+
+    /* Apply changes, remember if there were errors. */
+    if (new_level != NULL || new_strategy != NULL) {
+        int ret = PREFIX(deflateParams)(strm, new_level == NULL ? s->level : *(int *)new_level->buf,
+                                        new_strategy == NULL ? s->strategy : *(int *)new_strategy->buf);
+        if (ret != Z_OK) {
+            if (new_level != NULL)
+                new_level->status = Z_STREAM_ERROR;
+            if (new_strategy != NULL)
+                new_strategy->status = Z_STREAM_ERROR;
+            stream_error = 1;
+        }
+    }
+    if (new_reproducible != NULL) {
+        int val = *(int *)new_reproducible->buf;
+        if (DEFLATE_CAN_SET_REPRODUCIBLE(strm, val)) {
+            s->reproducible = val;
+        } else {
+            new_reproducible->status = Z_STREAM_ERROR;
+            stream_error = 1;
+        }
+    }
+
+    /* Report version errors only if there are no real errors. */
+    return stream_error ? Z_STREAM_ERROR : (version_error ? Z_VERSION_ERROR : Z_OK);
+}
+
+/* ========================================================================= */
+int32_t Z_EXPORT zng_deflateGetParams(zng_stream *strm, zng_deflate_param_value *params, size_t count) {
+    deflate_state *s;
+    size_t i;
+    int32_t buf_error = 0;
+    int32_t version_error = 0;
+
+    /* Initialize the statuses. */
+    for (i = 0; i < count; i++)
+        params[i].status = Z_OK;
+
+    /* Check whether the stream state is consistent. */
+    if (deflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    s = strm->state;
+
+    for (i = 0; i < count; i++) {
+        switch (params[i].param) {
+            case Z_DEFLATE_LEVEL:
+                if (params[i].size < sizeof(int))
+                    params[i].status = Z_BUF_ERROR;
+                else
+                    *(int *)params[i].buf = s->level;
+                break;
+            case Z_DEFLATE_STRATEGY:
+                if (params[i].size < sizeof(int))
+                    params[i].status = Z_BUF_ERROR;
+                else
+                    *(int *)params[i].buf = s->strategy;
+                break;
+            case Z_DEFLATE_REPRODUCIBLE:
+                if (params[i].size < sizeof(int))
+                    params[i].status = Z_BUF_ERROR;
+                else
+                    *(int *)params[i].buf = s->reproducible;
+                break;
+            default:
+                params[i].status = Z_VERSION_ERROR;
+                version_error = 1;
+                break;
+        }
+        if (params[i].status == Z_BUF_ERROR)
+            buf_error = 1;
+    }
+    return buf_error ? Z_BUF_ERROR : (version_error ? Z_VERSION_ERROR : Z_OK);
+}
+#endif
diff --git a/3rdparty/zlib-ng/deflate.h b/3rdparty/zlib-ng/deflate.h
new file mode 100644
index 000000000000..8001b47c999d
--- /dev/null
+++ b/3rdparty/zlib-ng/deflate.h
@@ -0,0 +1,408 @@
+#ifndef DEFLATE_H_
+#define DEFLATE_H_
+/* deflate.h -- internal compression state
+ * Copyright (C) 1995-2016 Jean-loup Gailly
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+#include "zutil.h"
+#include "zendian.h"
+#include "adler32_fold.h"
+#include "crc32_fold.h"
+
+/* define NO_GZIP when compiling if you want to disable gzip header and
+   trailer creation by deflate().  NO_GZIP would be used to avoid linking in
+   the crc code when it is not needed.  For shared libraries, gzip encoding
+   should be left enabled. */
+#ifndef NO_GZIP
+#  define GZIP
+#endif
+
+/* ===========================================================================
+ * Internal compression state.
+ */
+
+#define LENGTH_CODES 29
+/* number of length codes, not counting the special END_BLOCK code */
+
+#define LITERALS  256
+/* number of literal bytes 0..255 */
+
+#define L_CODES (LITERALS+1+LENGTH_CODES)
+/* number of Literal or Length codes, including the END_BLOCK code */
+
+#define D_CODES   30
+/* number of distance codes */
+
+#define BL_CODES  19
+/* number of codes used to transfer the bit lengths */
+
+#define HEAP_SIZE (2*L_CODES+1)
+/* maximum heap size */
+
+#define BIT_BUF_SIZE 64
+/* size of bit buffer in bi_buf */
+
+#define END_BLOCK 256
+/* end of block literal code */
+
+#define INIT_STATE      1    /* zlib header -> BUSY_STATE */
+#ifdef GZIP
+#  define GZIP_STATE    4    /* gzip header -> BUSY_STATE | EXTRA_STATE */
+#  define EXTRA_STATE   5    /* gzip extra block -> NAME_STATE */
+#  define NAME_STATE    6    /* gzip file name -> COMMENT_STATE */
+#  define COMMENT_STATE 7    /* gzip comment -> HCRC_STATE */
+#  define HCRC_STATE    8    /* gzip header CRC -> BUSY_STATE */
+#endif
+#define BUSY_STATE      2    /* deflate -> FINISH_STATE */
+#define FINISH_STATE    3    /* stream complete */
+#ifdef GZIP
+#  define MAX_STATE     HCRC_STATE
+#else
+#  define MAX_STATE     FINISH_STATE
+#endif
+/* Stream status */
+
+#define HASH_BITS    16u           /* log2(HASH_SIZE) */
+#ifndef HASH_SIZE
+#  define HASH_SIZE 65536u         /* number of elements in hash table */
+#endif
+#define HASH_MASK (HASH_SIZE - 1u) /* HASH_SIZE-1 */
+
+
+/* Data structure describing a single value and its code string. */
+typedef struct ct_data_s {
+    union {
+        uint16_t  freq;       /* frequency count */
+        uint16_t  code;       /* bit string */
+    } fc;
+    union {
+        uint16_t  dad;        /* father node in Huffman tree */
+        uint16_t  len;        /* length of bit string */
+    } dl;
+} ct_data;
+
+#define Freq fc.freq
+#define Code fc.code
+#define Dad  dl.dad
+#define Len  dl.len
+
+typedef struct static_tree_desc_s  static_tree_desc;
+
+typedef struct tree_desc_s {
+    ct_data                *dyn_tree;  /* the dynamic tree */
+    int                    max_code;   /* largest code with non zero frequency */
+    const static_tree_desc *stat_desc; /* the corresponding static tree */
+} tree_desc;
+
+typedef uint16_t Pos;
+
+/* A Pos is an index in the character window. We use short instead of int to
+ * save space in the various tables.
+ */
+/* Type definitions for hash callbacks */
+typedef struct internal_state deflate_state;
+
+typedef uint32_t (* update_hash_cb)        (deflate_state *const s, uint32_t h, uint32_t val);
+typedef void     (* insert_string_cb)      (deflate_state *const s, uint32_t str, uint32_t count);
+typedef Pos      (* quick_insert_string_cb)(deflate_state *const s, uint32_t str);
+
+struct internal_state {
+    PREFIX3(stream)      *strm;            /* pointer back to this zlib stream */
+    unsigned char        *pending_buf;     /* output still pending */
+    unsigned char        *pending_out;     /* next pending byte to output to the stream */
+    uint32_t             pending_buf_size; /* size of pending_buf */
+    uint32_t             pending;          /* nb of bytes in the pending buffer */
+    int                  wrap;             /* bit 0 true for zlib, bit 1 true for gzip */
+    uint32_t             gzindex;          /* where in extra, name, or comment */
+    PREFIX(gz_headerp)   gzhead;           /* gzip header information to write */
+    int                  status;           /* as the name implies */
+    int                  last_flush;       /* value of flush param for previous deflate call */
+    int                  reproducible;     /* Whether reproducible compression results are required. */
+
+    int block_open;
+    /* Whether or not a block is currently open for the QUICK deflation scheme.
+     * This is set to 1 if there is an active block, or 0 if the block was just closed.
+     */
+
+                /* used by deflate.c: */
+
+    unsigned int  w_size;            /* LZ77 window size (32K by default) */
+    unsigned int  w_bits;            /* log2(w_size)  (8..16) */
+    unsigned int  w_mask;            /* w_size - 1 */
+    unsigned int  lookahead;         /* number of valid bytes ahead in window */
+
+    unsigned int high_water;
+    /* High water mark offset in window for initialized bytes -- bytes above
+     * this are set to zero in order to avoid memory check warnings when
+     * longest match routines access bytes past the input.  This is then
+     * updated to the new high water mark.
+     */
+
+    unsigned int window_size;
+    /* Actual size of window: 2*wSize, except when the user input buffer
+     * is directly used as sliding window.
+     */
+
+    unsigned char *window;
+    /* Sliding window. Input bytes are read into the second half of the window,
+     * and move to the first half later to keep a dictionary of at least wSize
+     * bytes. With this organization, matches are limited to a distance of
+     * wSize-STD_MAX_MATCH bytes, but this ensures that IO is always
+     * performed with a length multiple of the block size. Also, it limits
+     * the window size to 64K, which is quite useful on MSDOS.
+     * To do: use the user input buffer as sliding window.
+     */
+
+    Pos *prev;
+    /* Link to older string with same hash index. To limit the size of this
+     * array to 64K, this link is maintained only for the last 32K strings.
+     * An index in this array is thus a window index modulo 32K.
+     */
+
+    Pos *head; /* Heads of the hash chains or 0. */
+
+    uint32_t ins_h; /* hash index of string to be inserted */
+
+    int block_start;
+    /* Window position at the beginning of the current output block. Gets
+     * negative when the window is moved backwards.
+     */
+
+    unsigned int match_length;       /* length of best match */
+    Pos          prev_match;         /* previous match */
+    int          match_available;    /* set if previous match exists */
+    unsigned int strstart;           /* start of string to insert */
+    unsigned int match_start;        /* start of matching string */
+
+    unsigned int prev_length;
+    /* Length of the best match at previous step. Matches not greater than this
+     * are discarded. This is used in the lazy match evaluation.
+     */
+
+    unsigned int max_chain_length;
+    /* To speed up deflation, hash chains are never searched beyond this length.
+     * A higher limit improves compression ratio but degrades the speed.
+     */
+
+    unsigned int max_lazy_match;
+    /* Attempt to find a better match only when the current match is strictly smaller
+     * than this value. This mechanism is used only for compression levels >= 4.
+     */
+#   define max_insert_length  max_lazy_match
+    /* Insert new strings in the hash table only if the match length is not
+     * greater than this length. This saves time but degrades compression.
+     * max_insert_length is used only for compression levels <= 3.
+     */
+
+    update_hash_cb          update_hash;
+    insert_string_cb        insert_string;
+    quick_insert_string_cb  quick_insert_string;
+    /* Hash function callbacks that can be configured depending on the deflate
+     * algorithm being used */
+
+    int level;    /* compression level (1..9) */
+    int strategy; /* favor or force Huffman coding*/
+
+    unsigned int good_match;
+    /* Use a faster search when the previous match is longer than this */
+
+    int nice_match; /* Stop searching when current match exceeds this */
+
+    struct crc32_fold_s ALIGNED_(16) crc_fold;
+
+                /* used by trees.c: */
+    /* Didn't use ct_data typedef below to suppress compiler warning */
+    struct ct_data_s dyn_ltree[HEAP_SIZE];   /* literal and length tree */
+    struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
+    struct ct_data_s bl_tree[2*BL_CODES+1];  /* Huffman tree for bit lengths */
+
+    struct tree_desc_s l_desc;               /* desc. for literal tree */
+    struct tree_desc_s d_desc;               /* desc. for distance tree */
+    struct tree_desc_s bl_desc;              /* desc. for bit length tree */
+
+    uint16_t bl_count[MAX_BITS+1];
+    /* number of codes at each bit length for an optimal tree */
+
+    int heap[2*L_CODES+1];      /* heap used to build the Huffman trees */
+    int heap_len;               /* number of elements in the heap */
+    int heap_max;               /* element of largest frequency */
+    /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
+     * The same heap array is used to build all trees.
+     */
+
+    unsigned char depth[2*L_CODES+1];
+    /* Depth of each subtree used as tie breaker for trees of equal frequency
+     */
+
+    unsigned int  lit_bufsize;
+    /* Size of match buffer for literals/lengths.  There are 4 reasons for
+     * limiting lit_bufsize to 64K:
+     *   - frequencies can be kept in 16 bit counters
+     *   - if compression is not successful for the first block, all input
+     *     data is still in the window so we can still emit a stored block even
+     *     when input comes from standard input.  (This can also be done for
+     *     all blocks if lit_bufsize is not greater than 32K.)
+     *   - if compression is not successful for a file smaller than 64K, we can
+     *     even emit a stored file instead of a stored block (saving 5 bytes).
+     *     This is applicable only for zip (not gzip or zlib).
+     *   - creating new Huffman trees less frequently may not provide fast
+     *     adaptation to changes in the input data statistics. (Take for
+     *     example a binary file with poorly compressible code followed by
+     *     a highly compressible string table.) Smaller buffer sizes give
+     *     fast adaptation but have of course the overhead of transmitting
+     *     trees more frequently.
+     *   - I can't count above 4
+     */
+
+    unsigned char *sym_buf;       /* buffer for distances and literals/lengths */
+    unsigned int sym_next;        /* running index in sym_buf */
+    unsigned int sym_end;         /* symbol table full when sym_next reaches this */
+
+    unsigned long opt_len;        /* bit length of current block with optimal trees */
+    unsigned long static_len;     /* bit length of current block with static trees */
+    unsigned int matches;         /* number of string matches in current block */
+    unsigned int insert;          /* bytes at end of window left to insert */
+
+    /* compressed_len and bits_sent are only used if ZLIB_DEBUG is defined */
+    unsigned long compressed_len; /* total bit length of compressed file mod 2^32 */
+    unsigned long bits_sent;      /* bit length of compressed data sent mod 2^32 */
+
+    /* Reserved for future use and alignment purposes */
+    char *reserved_p;
+
+    uint64_t bi_buf;
+    /* Output buffer. bits are inserted starting at the bottom (least significant bits). */
+
+    int32_t bi_valid;
+    /* Number of valid bits in bi_buf.  All bits above the last valid bit are always zero. */
+
+    /* Reserved for future use and alignment purposes */
+    int32_t reserved[11];
+} ALIGNED_(8);
+
+typedef enum {
+    need_more,      /* block not completed, need more input or more output */
+    block_done,     /* block flush performed */
+    finish_started, /* finish started, need only more output at next deflate */
+    finish_done     /* finish done, accept no more input or output */
+} block_state;
+
+/* Output a byte on the stream.
+ * IN assertion: there is enough room in pending_buf.
+ */
+#define put_byte(s, c) { \
+    s->pending_buf[s->pending++] = (unsigned char)(c); \
+}
+
+/* ===========================================================================
+ * Output a short LSB first on the stream.
+ * IN assertion: there is enough room in pending_buf.
+ */
+static inline void put_short(deflate_state *s, uint16_t w) {
+#if BYTE_ORDER == BIG_ENDIAN
+    w = ZSWAP16(w);
+#endif
+    memcpy(&s->pending_buf[s->pending], &w, sizeof(w));
+    s->pending += 2;
+}
+
+/* ===========================================================================
+ * Output a short MSB first on the stream.
+ * IN assertion: there is enough room in pending_buf.
+ */
+static inline void put_short_msb(deflate_state *s, uint16_t w) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+    w = ZSWAP16(w);
+#endif
+    memcpy(&s->pending_buf[s->pending], &w, sizeof(w));
+    s->pending += 2;
+}
+
+/* ===========================================================================
+ * Output a 32-bit unsigned int LSB first on the stream.
+ * IN assertion: there is enough room in pending_buf.
+ */
+static inline void put_uint32(deflate_state *s, uint32_t dw) {
+#if BYTE_ORDER == BIG_ENDIAN
+    dw = ZSWAP32(dw);
+#endif
+    memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw));
+    s->pending += 4;
+}
+
+/* ===========================================================================
+ * Output a 32-bit unsigned int MSB first on the stream.
+ * IN assertion: there is enough room in pending_buf.
+ */
+static inline void put_uint32_msb(deflate_state *s, uint32_t dw) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+    dw = ZSWAP32(dw);
+#endif
+    memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw));
+    s->pending += 4;
+}
+
+/* ===========================================================================
+ * Output a 64-bit unsigned int LSB first on the stream.
+ * IN assertion: there is enough room in pending_buf.
+ */
+static inline void put_uint64(deflate_state *s, uint64_t lld) {
+#if BYTE_ORDER == BIG_ENDIAN
+    lld = ZSWAP64(lld);
+#endif
+    memcpy(&s->pending_buf[s->pending], &lld, sizeof(lld));
+    s->pending += 8;
+}
+
+#define MIN_LOOKAHEAD (STD_MAX_MATCH + STD_MIN_MATCH + 1)
+/* Minimum amount of lookahead, except at the end of the input file.
+ * See deflate.c for comments about the STD_MIN_MATCH+1.
+ */
+
+#define MAX_DIST(s)  ((s)->w_size - MIN_LOOKAHEAD)
+/* In order to simplify the code, particularly on 16 bit machines, match
+ * distances are limited to MAX_DIST instead of WSIZE.
+ */
+
+#define WIN_INIT STD_MAX_MATCH
+/* Number of bytes after end of data in window to initialize in order to avoid
+   memory checker errors from longest match routines */
+
+
+void Z_INTERNAL PREFIX(fill_window)(deflate_state *s);
+void Z_INTERNAL slide_hash_c(deflate_state *s);
+
+        /* in trees.c */
+void Z_INTERNAL zng_tr_init(deflate_state *s);
+void Z_INTERNAL zng_tr_flush_block(deflate_state *s, char *buf, uint32_t stored_len, int last);
+void Z_INTERNAL zng_tr_flush_bits(deflate_state *s);
+void Z_INTERNAL zng_tr_align(deflate_state *s);
+void Z_INTERNAL zng_tr_stored_block(deflate_state *s, char *buf, uint32_t stored_len, int last);
+uint16_t Z_INTERNAL PREFIX(bi_reverse)(unsigned code, int len);
+void Z_INTERNAL PREFIX(flush_pending)(PREFIX3(streamp) strm);
+#define d_code(dist) ((dist) < 256 ? zng_dist_code[dist] : zng_dist_code[256+((dist)>>7)])
+/* Mapping from a distance to a distance code. dist is the distance - 1 and
+ * must not have side effects. zng_dist_code[256] and zng_dist_code[257] are never
+ * used.
+ */
+
+/* Bit buffer and compress bits calculation debugging */
+#ifdef ZLIB_DEBUG
+#  define cmpr_bits_add(s, len)     s->compressed_len += (len)
+#  define cmpr_bits_align(s)        s->compressed_len = (s->compressed_len + 7) & ~7L
+#  define sent_bits_add(s, bits)    s->bits_sent += (bits)
+#  define sent_bits_align(s)        s->bits_sent = (s->bits_sent + 7) & ~7L
+#else
+#  define cmpr_bits_add(s, len)     Z_UNUSED(len)
+#  define cmpr_bits_align(s)
+#  define sent_bits_add(s, bits)    Z_UNUSED(bits)
+#  define sent_bits_align(s)
+#endif
+
+#endif /* DEFLATE_H_ */
diff --git a/3rdparty/zlib-ng/deflate_fast.c b/3rdparty/zlib-ng/deflate_fast.c
new file mode 100644
index 000000000000..3184aa718c7e
--- /dev/null
+++ b/3rdparty/zlib-ng/deflate_fast.c
@@ -0,0 +1,102 @@
+/* deflate_fast.c -- compress data using the fast strategy of deflation algorithm
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "deflate.h"
+#include "deflate_p.h"
+#include "functable.h"
+
+/* ===========================================================================
+ * Compress as much as possible from the input stream, return the current
+ * block state.
+ * This function does not perform lazy evaluation of matches and inserts
+ * new strings in the dictionary only for unmatched strings or for short
+ * matches. It is used only for the fast compression options.
+ */
+Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
+    Pos hash_head;        /* head of the hash chain */
+    int bflush = 0;       /* set if current block must be flushed */
+    int64_t dist;
+    uint32_t match_len = 0;
+
+    for (;;) {
+        /* Make sure that we always have enough lookahead, except
+         * at the end of the input file. We need STD_MAX_MATCH bytes
+         * for the next match, plus WANT_MIN_MATCH bytes to insert the
+         * string following the next match.
+         */
+        if (s->lookahead < MIN_LOOKAHEAD) {
+            PREFIX(fill_window)(s);
+            if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
+                return need_more;
+            }
+            if (UNLIKELY(s->lookahead == 0))
+                break; /* flush the current block */
+        }
+
+        /* Insert the string window[strstart .. strstart+2] in the
+         * dictionary, and set hash_head to the head of the hash chain:
+         */
+        if (s->lookahead >= WANT_MIN_MATCH) {
+            hash_head = functable.quick_insert_string(s, s->strstart);
+            dist = (int64_t)s->strstart - hash_head;
+
+            /* Find the longest match, discarding those <= prev_length.
+             * At this point we have always match length < WANT_MIN_MATCH
+             */
+            if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) {
+                /* To simplify the code, we prevent matches with the string
+                 * of window index 0 (in particular we have to avoid a match
+                 * of the string with itself at the start of the input file).
+                 */
+                match_len = functable.longest_match(s, hash_head);
+                /* longest_match() sets match_start */
+            }
+        }
+
+        if (match_len >= WANT_MIN_MATCH) {
+            check_match(s, s->strstart, s->match_start, match_len);
+
+            bflush = zng_tr_tally_dist(s, s->strstart - s->match_start, match_len - STD_MIN_MATCH);
+
+            s->lookahead -= match_len;
+
+            /* Insert new strings in the hash table only if the match length
+             * is not too large. This saves time but degrades compression.
+             */
+            if (match_len <= s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) {
+                match_len--; /* string at strstart already in table */
+                s->strstart++;
+
+                functable.insert_string(s, s->strstart, match_len);
+                s->strstart += match_len;
+            } else {
+                s->strstart += match_len;
+                functable.quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH);
+
+                /* If lookahead < STD_MIN_MATCH, ins_h is garbage, but it does not
+                 * matter since it will be recomputed at next deflate call.
+                 */
+            }
+            match_len = 0;
+        } else {
+            /* No match, output a literal byte */
+            bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
+            s->lookahead--;
+            s->strstart++;
+        }
+        if (UNLIKELY(bflush))
+            FLUSH_BLOCK(s, 0);
+    }
+    s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
+    if (UNLIKELY(flush == Z_FINISH)) {
+        FLUSH_BLOCK(s, 1);
+        return finish_done;
+    }
+    if (UNLIKELY(s->sym_next))
+        FLUSH_BLOCK(s, 0);
+    return block_done;
+}
diff --git a/3rdparty/zlib-ng/deflate_huff.c b/3rdparty/zlib-ng/deflate_huff.c
new file mode 100644
index 000000000000..b197e24d7c38
--- /dev/null
+++ b/3rdparty/zlib-ng/deflate_huff.c
@@ -0,0 +1,45 @@
+/* deflate_huff.c -- compress data using huffman encoding only strategy
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "deflate.h"
+#include "deflate_p.h"
+#include "functable.h"
+
+/* ===========================================================================
+ * For Z_HUFFMAN_ONLY, do not look for matches.  Do not maintain a hash table.
+ * (It will be regenerated if this run of deflate switches away from Huffman.)
+ */
+Z_INTERNAL block_state deflate_huff(deflate_state *s, int flush) {
+    int bflush = 0;         /* set if current block must be flushed */
+
+    for (;;) {
+        /* Make sure that we have a literal to write. */
+        if (s->lookahead == 0) {
+            PREFIX(fill_window)(s);
+            if (s->lookahead == 0) {
+                if (flush == Z_NO_FLUSH)
+                    return need_more;
+                break;      /* flush the current block */
+            }
+        }
+
+        /* Output a literal byte */
+        bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
+        s->lookahead--;
+        s->strstart++;
+        if (bflush)
+            FLUSH_BLOCK(s, 0);
+    }
+    s->insert = 0;
+    if (flush == Z_FINISH) {
+        FLUSH_BLOCK(s, 1);
+        return finish_done;
+    }
+    if (s->sym_next)
+        FLUSH_BLOCK(s, 0);
+    return block_done;
+}
diff --git a/3rdparty/zlib-ng/deflate_medium.c b/3rdparty/zlib-ng/deflate_medium.c
new file mode 100644
index 000000000000..47796e32217a
--- /dev/null
+++ b/3rdparty/zlib-ng/deflate_medium.c
@@ -0,0 +1,293 @@
+/* deflate_medium.c -- The deflate_medium deflate strategy
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Authors:
+ *  Arjan van de Ven    <arjan@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef NO_MEDIUM_STRATEGY
+#include "zbuild.h"
+#include "deflate.h"
+#include "deflate_p.h"
+#include "functable.h"
+
+struct match {
+    uint16_t match_start;
+    uint16_t match_length;
+    uint16_t strstart;
+    uint16_t orgstart;
+};
+
+static int emit_match(deflate_state *s, struct match match) {
+    int bflush = 0;
+
+    /* matches that are not long enough we need to emit as literals */
+    if (match.match_length < WANT_MIN_MATCH) {
+        while (match.match_length) {
+            bflush += zng_tr_tally_lit(s, s->window[match.strstart]);
+            s->lookahead--;
+            match.strstart++;
+            match.match_length--;
+        }
+        return bflush;
+    }
+
+    check_match(s, match.strstart, match.match_start, match.match_length);
+
+    bflush += zng_tr_tally_dist(s, match.strstart - match.match_start, match.match_length - STD_MIN_MATCH);
+
+    s->lookahead -= match.match_length;
+    return bflush;
+}
+
+static void insert_match(deflate_state *s, struct match match) {
+    if (UNLIKELY(s->lookahead <= (unsigned int)(match.match_length + WANT_MIN_MATCH)))
+        return;
+
+    /* matches that are not long enough we need to emit as literals */
+    if (LIKELY(match.match_length < WANT_MIN_MATCH)) {
+        match.strstart++;
+        match.match_length--;
+        if (UNLIKELY(match.match_length > 0)) {
+            if (match.strstart >= match.orgstart) {
+                if (match.strstart + match.match_length - 1 >= match.orgstart) {
+                    functable.insert_string(s, match.strstart, match.match_length);
+                } else {
+                    functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
+                }
+                match.strstart += match.match_length;
+                match.match_length = 0;
+            }
+        }
+        return;
+    }
+
+    /* Insert new strings in the hash table only if the match length
+     * is not too large. This saves time but degrades compression.
+     */
+    if (match.match_length <= 16 * s->max_insert_length && s->lookahead >= WANT_MIN_MATCH) {
+        match.match_length--; /* string at strstart already in table */
+        match.strstart++;
+
+        if (LIKELY(match.strstart >= match.orgstart)) {
+            if (LIKELY(match.strstart + match.match_length - 1 >= match.orgstart)) {
+                functable.insert_string(s, match.strstart, match.match_length);
+            } else {
+                functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
+            }
+        } else if (match.orgstart < match.strstart + match.match_length) {
+            functable.insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart);
+        }
+        match.strstart += match.match_length;
+        match.match_length = 0;
+    } else {
+        match.strstart += match.match_length;
+        match.match_length = 0;
+
+        if (match.strstart >= (STD_MIN_MATCH - 2))
+            functable.quick_insert_string(s, match.strstart + 2 - STD_MIN_MATCH);
+
+        /* If lookahead < WANT_MIN_MATCH, ins_h is garbage, but it does not
+         * matter since it will be recomputed at next deflate call.
+         */
+    }
+}
+
+static void fizzle_matches(deflate_state *s, struct match *current, struct match *next) {
+    Pos limit;
+    unsigned char *match, *orig;
+    int changed = 0;
+    struct match c, n;
+    /* step zero: sanity checks */
+
+    if (current->match_length <= 1)
+        return;
+
+    if (UNLIKELY(current->match_length > 1 + next->match_start))
+        return;
+
+    if (UNLIKELY(current->match_length > 1 + next->strstart))
+        return;
+
+    match = s->window - current->match_length + 1 + next->match_start;
+    orig  = s->window - current->match_length + 1 + next->strstart;
+
+    /* quick exit check.. if this fails then don't bother with anything else */
+    if (LIKELY(*match != *orig))
+        return;
+
+    c = *current;
+    n = *next;
+
+    /* step one: try to move the "next" match to the left as much as possible */
+    limit = next->strstart > MAX_DIST(s) ? next->strstart - (Pos)MAX_DIST(s) : 0;
+
+    match = s->window + n.match_start - 1;
+    orig = s->window + n.strstart - 1;
+
+    while (*match == *orig) {
+        if (UNLIKELY(c.match_length < 1))
+            break;
+        if (UNLIKELY(n.strstart <= limit))
+            break;
+        if (UNLIKELY(n.match_length >= 256))
+            break;
+        if (UNLIKELY(n.match_start <= 1))
+            break;
+
+        n.strstart--;
+        n.match_start--;
+        n.match_length++;
+        c.match_length--;
+        match--;
+        orig--;
+        changed++;
+    }
+
+    if (!changed)
+        return;
+
+    if (c.match_length <= 1 && n.match_length != 2) {
+        n.orgstart++;
+        *current = c;
+        *next = n;
+    } else {
+        return;
+    }
+}
+
+Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
+    /* Align the first struct to start on a new cacheline, this allows us to fit both structs in one cacheline */
+    ALIGNED_(16) struct match current_match;
+                 struct match next_match;
+
+    /* For levels below 5, don't check the next position for a better match */
+    int early_exit = s->level < 5;
+
+    memset(&current_match, 0, sizeof(struct match));
+    memset(&next_match, 0, sizeof(struct match));
+
+    for (;;) {
+        Pos hash_head = 0;    /* head of the hash chain */
+        int bflush = 0;       /* set if current block must be flushed */
+        int64_t dist;
+
+        /* Make sure that we always have enough lookahead, except
+         * at the end of the input file. We need STD_MAX_MATCH bytes
+         * for the next match, plus WANT_MIN_MATCH bytes to insert the
+         * string following the next current_match.
+         */
+        if (s->lookahead < MIN_LOOKAHEAD) {
+            PREFIX(fill_window)(s);
+            if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
+                return need_more;
+            }
+            if (UNLIKELY(s->lookahead == 0))
+                break; /* flush the current block */
+            next_match.match_length = 0;
+        }
+
+        /* Insert the string window[strstart .. strstart+2] in the
+         * dictionary, and set hash_head to the head of the hash chain:
+         */
+
+        /* If we already have a future match from a previous round, just use that */
+        if (!early_exit && next_match.match_length > 0) {
+            current_match = next_match;
+            next_match.match_length = 0;
+        } else {
+            hash_head = 0;
+            if (s->lookahead >= WANT_MIN_MATCH) {
+                hash_head = functable.quick_insert_string(s, s->strstart);
+            }
+
+            current_match.strstart = (uint16_t)s->strstart;
+            current_match.orgstart = current_match.strstart;
+
+            /* Find the longest match, discarding those <= prev_length.
+             * At this point we have always match_length < WANT_MIN_MATCH
+             */
+
+            dist = (int64_t)s->strstart - hash_head;
+            if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) {
+                /* To simplify the code, we prevent matches with the string
+                 * of window index 0 (in particular we have to avoid a match
+                 * of the string with itself at the start of the input file).
+                 */
+                current_match.match_length = (uint16_t)functable.longest_match(s, hash_head);
+                current_match.match_start = (uint16_t)s->match_start;
+                if (UNLIKELY(current_match.match_length < WANT_MIN_MATCH))
+                    current_match.match_length = 1;
+                if (UNLIKELY(current_match.match_start >= current_match.strstart)) {
+                    /* this can happen due to some restarts */
+                    current_match.match_length = 1;
+                }
+            } else {
+                /* Set up the match to be a 1 byte literal */
+                current_match.match_start = 0;
+                current_match.match_length = 1;
+            }
+        }
+
+        insert_match(s, current_match);
+
+        /* now, look ahead one */
+        if (LIKELY(!early_exit && s->lookahead > MIN_LOOKAHEAD && (uint32_t)(current_match.strstart + current_match.match_length) < (s->window_size - MIN_LOOKAHEAD))) {
+            s->strstart = current_match.strstart + current_match.match_length;
+            hash_head = functable.quick_insert_string(s, s->strstart);
+
+            next_match.strstart = (uint16_t)s->strstart;
+            next_match.orgstart = next_match.strstart;
+
+            /* Find the longest match, discarding those <= prev_length.
+             * At this point we have always match_length < WANT_MIN_MATCH
+             */
+
+            dist = (int64_t)s->strstart - hash_head;
+            if (dist <= MAX_DIST(s) && dist > 0 && hash_head != 0) {
+                /* To simplify the code, we prevent matches with the string
+                 * of window index 0 (in particular we have to avoid a match
+                 * of the string with itself at the start of the input file).
+                 */
+                next_match.match_length = (uint16_t)functable.longest_match(s, hash_head);
+                next_match.match_start = (uint16_t)s->match_start;
+                if (UNLIKELY(next_match.match_start >= next_match.strstart)) {
+                    /* this can happen due to some restarts */
+                    next_match.match_length = 1;
+                }
+                if (next_match.match_length < WANT_MIN_MATCH)
+                    next_match.match_length = 1;
+                else
+                    fizzle_matches(s, &current_match, &next_match);
+            } else {
+                /* Set up the match to be a 1 byte literal */
+                next_match.match_start = 0;
+                next_match.match_length = 1;
+            }
+
+            s->strstart = current_match.strstart;
+        } else {
+            next_match.match_length = 0;
+        }
+
+        /* now emit the current match */
+        bflush = emit_match(s, current_match);
+
+        /* move the "cursor" forward */
+        s->strstart += current_match.match_length;
+
+        if (UNLIKELY(bflush))
+            FLUSH_BLOCK(s, 0);
+    }
+    s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
+    if (flush == Z_FINISH) {
+        FLUSH_BLOCK(s, 1);
+        return finish_done;
+    }
+    if (UNLIKELY(s->sym_next))
+        FLUSH_BLOCK(s, 0);
+
+    return block_done;
+}
+#endif
diff --git a/3rdparty/zlib-ng/deflate_p.h b/3rdparty/zlib-ng/deflate_p.h
new file mode 100644
index 000000000000..dd2021a0f59a
--- /dev/null
+++ b/3rdparty/zlib-ng/deflate_p.h
@@ -0,0 +1,116 @@
+/* deflate_p.h -- Private inline functions and macros shared with more than
+ *                one deflate method
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ */
+
+#ifndef DEFLATE_P_H
+#define DEFLATE_P_H
+
+/* Forward declare common non-inlined functions declared in deflate.c */
+
+#ifdef ZLIB_DEBUG
+/* ===========================================================================
+ * Check that the match at match_start is indeed a match.
+ */
+static inline void check_match(deflate_state *s, Pos start, Pos match, int length) {
+    /* check that the match length is valid*/
+    if (length < STD_MIN_MATCH || length > STD_MAX_MATCH) {
+        fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
+        z_error("invalid match length");
+    }
+    /* check that the match isn't at the same position as the start string */
+    if (match == start) {
+        fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
+        z_error("invalid match position");
+    }
+    /* check that the match is indeed a match */
+    if (memcmp(s->window + match, s->window + start, length) != 0) {
+        int32_t i = 0;
+        fprintf(stderr, " start %u, match %u, length %d\n", start, match, length);
+        do {
+            fprintf(stderr, "  %03d: match [%02x] start [%02x]\n", i++,
+                s->window[match++], s->window[start++]);
+        } while (--length != 0);
+        z_error("invalid match");
+    }
+    if (z_verbose > 1) {
+        fprintf(stderr, "\\[%u,%d]", start-match, length);
+        do {
+            putc(s->window[start++], stderr);
+        } while (--length != 0);
+    }
+}
+#else
+#define check_match(s, start, match, length)
+#endif
+
+Z_INTERNAL void PREFIX(flush_pending)(PREFIX3(stream) *strm);
+Z_INTERNAL unsigned PREFIX(read_buf)(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
+
+/* ===========================================================================
+ * Save the match info and tally the frequency counts. Return true if
+ * the current block must be flushed.
+ */
+
+extern const unsigned char Z_INTERNAL zng_length_code[];
+extern const unsigned char Z_INTERNAL zng_dist_code[];
+
+static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) {
+    /* c is the unmatched char */
+    s->sym_buf[s->sym_next++] = 0;
+    s->sym_buf[s->sym_next++] = 0;
+    s->sym_buf[s->sym_next++] = c;
+    s->dyn_ltree[c].Freq++;
+    Tracevv((stderr, "%c", c));
+    Assert(c <= (STD_MAX_MATCH-STD_MIN_MATCH), "zng_tr_tally: bad literal");
+    return (s->sym_next == s->sym_end);
+}
+
+static inline int zng_tr_tally_dist(deflate_state *s, uint32_t dist, uint32_t len) {
+    /* dist: distance of matched string */
+    /* len: match length-STD_MIN_MATCH */
+    s->sym_buf[s->sym_next++] = (uint8_t)(dist);
+    s->sym_buf[s->sym_next++] = (uint8_t)(dist >> 8);
+    s->sym_buf[s->sym_next++] = (uint8_t)len;
+    s->matches++;
+    dist--;
+    Assert(dist < MAX_DIST(s) && (uint16_t)d_code(dist) < (uint16_t)D_CODES,
+        "zng_tr_tally: bad match");
+
+    s->dyn_ltree[zng_length_code[len]+LITERALS+1].Freq++;
+    s->dyn_dtree[d_code(dist)].Freq++;
+    return (s->sym_next == s->sym_end);
+}
+
+/* ===========================================================================
+ * Flush the current block, with given end-of-file flag.
+ * IN assertion: strstart is set to the end of the current match.
+ */
+#define FLUSH_BLOCK_ONLY(s, last) { \
+    zng_tr_flush_block(s, (s->block_start >= 0 ? \
+                   (char *)&s->window[(unsigned)s->block_start] : \
+                   NULL), \
+                   (uint32_t)((int)s->strstart - s->block_start), \
+                   (last)); \
+    s->block_start = (int)s->strstart; \
+    PREFIX(flush_pending)(s->strm); \
+}
+
+/* Same but force premature exit if necessary. */
+#define FLUSH_BLOCK(s, last) { \
+    FLUSH_BLOCK_ONLY(s, last); \
+    if (s->strm->avail_out == 0) return (last) ? finish_started : need_more; \
+}
+
+/* Maximum stored block length in deflate format (not including header). */
+#define MAX_STORED 65535
+
+/* Compression function. Returns the block state after the call. */
+typedef block_state (*compress_func) (deflate_state *s, int flush);
+/* Match function. Returns the longest match. */
+typedef uint32_t    (*match_func)    (deflate_state *const s, Pos cur_match);
+
+#endif
diff --git a/3rdparty/zlib-ng/deflate_quick.c b/3rdparty/zlib-ng/deflate_quick.c
new file mode 100644
index 000000000000..df5a17b9e662
--- /dev/null
+++ b/3rdparty/zlib-ng/deflate_quick.c
@@ -0,0 +1,129 @@
+/*
+ * The deflate_quick deflate strategy, designed to be used when cycles are
+ * at a premium.
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Authors:
+ *  Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *  Jim Guilford    <james.guilford@intel.com>
+ *  Vinodh Gopal    <vinodh.gopal@intel.com>
+ *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *  Jim Kukunas     <james.t.kukunas@linux.intel.com>
+ *
+ * Portions are Copyright (C) 2016 12Sided Technology, LLC.
+ * Author:
+ *  Phil Vachon     <pvachon@12sidedtech.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "deflate.h"
+#include "deflate_p.h"
+#include "functable.h"
+#include "trees_emit.h"
+
+extern const ct_data static_ltree[L_CODES+2];
+extern const ct_data static_dtree[D_CODES];
+
+#define QUICK_START_BLOCK(s, last) { \
+    zng_tr_emit_tree(s, STATIC_TREES, last); \
+    s->block_open = 1 + (int)last; \
+    s->block_start = (int)s->strstart; \
+}
+
+#define QUICK_END_BLOCK(s, last) { \
+    if (s->block_open) { \
+        zng_tr_emit_end_block(s, static_ltree, last); \
+        s->block_open = 0; \
+        s->block_start = (int)s->strstart; \
+        PREFIX(flush_pending)(s->strm); \
+        if (s->strm->avail_out == 0) \
+            return (last) ? finish_started : need_more; \
+    } \
+}
+
+Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
+    Pos hash_head;
+    int64_t dist;
+    unsigned match_len, last;
+
+
+    last = (flush == Z_FINISH) ? 1 : 0;
+    if (UNLIKELY(last && s->block_open != 2)) {
+        /* Emit end of previous block */
+        QUICK_END_BLOCK(s, 0);
+        /* Emit start of last block */
+        QUICK_START_BLOCK(s, last);
+    } else if (UNLIKELY(s->block_open == 0 && s->lookahead > 0)) {
+        /* Start new block only when we have lookahead data, so that if no
+           input data is given an empty block will not be written */
+        QUICK_START_BLOCK(s, last);
+    }
+
+    for (;;) {
+        if (UNLIKELY(s->pending + ((BIT_BUF_SIZE + 7) >> 3) >= s->pending_buf_size)) {
+            PREFIX(flush_pending)(s->strm);
+            if (s->strm->avail_out == 0) {
+                return (last && s->strm->avail_in == 0 && s->bi_valid == 0 && s->block_open == 0) ? finish_started : need_more;
+            }
+        }
+
+        if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD)) {
+            PREFIX(fill_window)(s);
+            if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
+                return need_more;
+            }
+            if (UNLIKELY(s->lookahead == 0))
+                break;
+
+            if (UNLIKELY(s->block_open == 0)) {
+                /* Start new block when we have lookahead data, so that if no
+                   input data is given an empty block will not be written */
+                QUICK_START_BLOCK(s, last);
+            }
+        }
+
+        if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
+            hash_head = functable.quick_insert_string(s, s->strstart);
+            dist = (int64_t)s->strstart - hash_head;
+
+            if (dist <= MAX_DIST(s) && dist > 0) {
+                const uint8_t *str_start = s->window + s->strstart;
+                const uint8_t *match_start = s->window + hash_head;
+
+                if (zng_memcmp_2(str_start, match_start) == 0) {
+                    match_len = functable.compare256(str_start+2, match_start+2) + 2;
+
+                    if (match_len >= WANT_MIN_MATCH) {
+                        if (UNLIKELY(match_len > s->lookahead))
+                            match_len = s->lookahead;
+                        if (UNLIKELY(match_len > STD_MAX_MATCH))
+                            match_len = STD_MAX_MATCH;
+
+                        check_match(s, s->strstart, hash_head, match_len);
+
+                        zng_tr_emit_dist(s, static_ltree, static_dtree, match_len - STD_MIN_MATCH, (uint32_t)dist);
+                        s->lookahead -= match_len;
+                        s->strstart += match_len;
+                        continue;
+                    }
+                }
+            }
+        }
+
+        zng_tr_emit_lit(s, static_ltree, s->window[s->strstart]);
+        s->strstart++;
+        s->lookahead--;
+    }
+
+    s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
+    if (UNLIKELY(last)) {
+        QUICK_END_BLOCK(s, 1);
+        return finish_done;
+    }
+
+    QUICK_END_BLOCK(s, 0);
+    return block_done;
+}
diff --git a/3rdparty/zlib-ng/deflate_rle.c b/3rdparty/zlib-ng/deflate_rle.c
new file mode 100644
index 000000000000..cd0850994606
--- /dev/null
+++ b/3rdparty/zlib-ng/deflate_rle.c
@@ -0,0 +1,85 @@
+/* deflate_rle.c -- compress data using RLE strategy of deflation algorithm
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "compare256_rle.h"
+#include "deflate.h"
+#include "deflate_p.h"
+#include "functable.h"
+
+#ifdef UNALIGNED_OK
+#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+#    define compare256_rle compare256_rle_unaligned_64
+#  elif defined(HAVE_BUILTIN_CTZ)
+#    define compare256_rle compare256_rle_unaligned_32
+#  else
+#    define compare256_rle compare256_rle_unaligned_16
+#  endif
+#else
+#  define compare256_rle compare256_rle_c
+#endif
+
+/* ===========================================================================
+ * For Z_RLE, simply look for runs of bytes, generate matches only of distance
+ * one.  Do not maintain a hash table.  (It will be regenerated if this run of
+ * deflate switches away from Z_RLE.)
+ */
+Z_INTERNAL block_state deflate_rle(deflate_state *s, int flush) {
+    int bflush = 0;                 /* set if current block must be flushed */
+    unsigned char *scan;            /* scan goes up to strend for length of run */
+    uint32_t match_len = 0;
+
+    for (;;) {
+        /* Make sure that we always have enough lookahead, except
+         * at the end of the input file. We need STD_MAX_MATCH bytes
+         * for the longest run, plus one for the unrolled loop.
+         */
+        if (s->lookahead <= STD_MAX_MATCH) {
+            PREFIX(fill_window)(s);
+            if (s->lookahead <= STD_MAX_MATCH && flush == Z_NO_FLUSH)
+                return need_more;
+            if (s->lookahead == 0)
+                break; /* flush the current block */
+        }
+
+        /* See how many times the previous byte repeats */
+        if (s->lookahead >= STD_MIN_MATCH && s->strstart > 0) {
+            scan = s->window + s->strstart - 1;
+            if (scan[0] == scan[1] && scan[1] == scan[2]) {
+                match_len = compare256_rle(scan, scan+3)+2;
+                match_len = MIN(match_len, s->lookahead);
+                match_len = MIN(match_len, STD_MAX_MATCH);
+            }
+            Assert(scan+match_len <= s->window + s->window_size - 1, "wild scan");
+        }
+
+        /* Emit match if have run of STD_MIN_MATCH or longer, else emit literal */
+        if (match_len >= STD_MIN_MATCH) {
+            check_match(s, s->strstart, s->strstart - 1, match_len);
+
+            bflush = zng_tr_tally_dist(s, 1, match_len - STD_MIN_MATCH);
+
+            s->lookahead -= match_len;
+            s->strstart += match_len;
+            match_len = 0;
+        } else {
+            /* No match, output a literal byte */
+            bflush = zng_tr_tally_lit(s, s->window[s->strstart]);
+            s->lookahead--;
+            s->strstart++;
+        }
+        if (bflush)
+            FLUSH_BLOCK(s, 0);
+    }
+    s->insert = 0;
+    if (flush == Z_FINISH) {
+        FLUSH_BLOCK(s, 1);
+        return finish_done;
+    }
+    if (s->sym_next)
+        FLUSH_BLOCK(s, 0);
+    return block_done;
+}
diff --git a/3rdparty/zlib-ng/deflate_slow.c b/3rdparty/zlib-ng/deflate_slow.c
new file mode 100644
index 000000000000..9f1c913467b7
--- /dev/null
+++ b/3rdparty/zlib-ng/deflate_slow.c
@@ -0,0 +1,143 @@
+/* deflate_slow.c -- compress data using the slow strategy of deflation algorithm
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "deflate.h"
+#include "deflate_p.h"
+#include "functable.h"
+
+/* ===========================================================================
+ * Same as deflate_medium, but achieves better compression. We use a lazy
+ * evaluation for matches: a match is finally adopted only if there is
+ * no better match at the next window position.
+ */
+Z_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
+    Pos hash_head;           /* head of hash chain */
+    int bflush;              /* set if current block must be flushed */
+    int64_t dist;
+    uint32_t match_len;
+    match_func *longest_match;
+
+    if (s->max_chain_length <= 1024)
+        longest_match = &functable.longest_match;
+    else
+        longest_match = &functable.longest_match_slow;
+
+    /* Process the input block. */
+    for (;;) {
+        /* Make sure that we always have enough lookahead, except
+         * at the end of the input file. We need STD_MAX_MATCH bytes
+         * for the next match, plus WANT_MIN_MATCH bytes to insert the
+         * string following the next match.
+         */
+        if (s->lookahead < MIN_LOOKAHEAD) {
+            PREFIX(fill_window)(s);
+            if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
+                return need_more;
+            }
+            if (UNLIKELY(s->lookahead == 0))
+                break; /* flush the current block */
+        }
+
+        /* Insert the string window[strstart .. strstart+2] in the
+         * dictionary, and set hash_head to the head of the hash chain:
+         */
+        hash_head = 0;
+        if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
+            hash_head = s->quick_insert_string(s, s->strstart);
+        }
+
+        /* Find the longest match, discarding those <= prev_length.
+         */
+        s->prev_match = (Pos)s->match_start;
+        match_len = STD_MIN_MATCH - 1;
+        dist = (int64_t)s->strstart - hash_head;
+
+        if (dist <= MAX_DIST(s) && dist > 0 && s->prev_length < s->max_lazy_match && hash_head != 0) {
+            /* To simplify the code, we prevent matches with the string
+             * of window index 0 (in particular we have to avoid a match
+             * of the string with itself at the start of the input file).
+             */
+            match_len = (*longest_match)(s, hash_head);
+            /* longest_match() sets match_start */
+
+            if (match_len <= 5 && (s->strategy == Z_FILTERED)) {
+                /* If prev_match is also WANT_MIN_MATCH, match_start is garbage
+                 * but we will ignore the current match anyway.
+                 */
+                match_len = STD_MIN_MATCH - 1;
+            }
+        }
+        /* If there was a match at the previous step and the current
+         * match is not better, output the previous match:
+         */
+        if (s->prev_length >= STD_MIN_MATCH && match_len <= s->prev_length) {
+            unsigned int max_insert = s->strstart + s->lookahead - STD_MIN_MATCH;
+            /* Do not insert strings in hash table beyond this. */
+
+            check_match(s, s->strstart-1, s->prev_match, s->prev_length);
+
+            bflush = zng_tr_tally_dist(s, s->strstart -1 - s->prev_match, s->prev_length - STD_MIN_MATCH);
+
+            /* Insert in hash table all strings up to the end of the match.
+             * strstart-1 and strstart are already inserted. If there is not
+             * enough lookahead, the last two strings are not inserted in
+             * the hash table.
+             */
+            s->prev_length -= 1;
+            s->lookahead -= s->prev_length;
+
+            unsigned int mov_fwd = s->prev_length - 1;
+            if (max_insert > s->strstart) {
+                unsigned int insert_cnt = mov_fwd;
+                if (UNLIKELY(insert_cnt > max_insert - s->strstart))
+                    insert_cnt = max_insert - s->strstart;
+                s->insert_string(s, s->strstart + 1, insert_cnt);
+            }
+            s->prev_length = 0;
+            s->match_available = 0;
+            s->strstart += mov_fwd + 1;
+
+            if (UNLIKELY(bflush))
+                FLUSH_BLOCK(s, 0);
+
+        } else if (s->match_available) {
+            /* If there was no match at the previous position, output a
+             * single literal. If there was a match but the current match
+             * is longer, truncate the previous match to a single literal.
+             */
+            bflush = zng_tr_tally_lit(s, s->window[s->strstart-1]);
+            if (UNLIKELY(bflush))
+                FLUSH_BLOCK_ONLY(s, 0);
+            s->prev_length = match_len;
+            s->strstart++;
+            s->lookahead--;
+            if (UNLIKELY(s->strm->avail_out == 0))
+                return need_more;
+        } else {
+            /* There is no previous match to compare with, wait for
+             * the next step to decide.
+             */
+            s->prev_length = match_len;
+            s->match_available = 1;
+            s->strstart++;
+            s->lookahead--;
+        }
+    }
+    Assert(flush != Z_NO_FLUSH, "no flush?");
+    if (UNLIKELY(s->match_available)) {
+        (void) zng_tr_tally_lit(s, s->window[s->strstart-1]);
+        s->match_available = 0;
+    }
+    s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
+    if (UNLIKELY(flush == Z_FINISH)) {
+        FLUSH_BLOCK(s, 1);
+        return finish_done;
+    }
+    if (UNLIKELY(s->sym_next))
+        FLUSH_BLOCK(s, 0);
+    return block_done;
+}
diff --git a/3rdparty/zlib-ng/deflate_stored.c b/3rdparty/zlib-ng/deflate_stored.c
new file mode 100644
index 000000000000..6160896b3fed
--- /dev/null
+++ b/3rdparty/zlib-ng/deflate_stored.c
@@ -0,0 +1,186 @@
+/* deflate_stored.c -- store data without compression using deflation algorithm
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "deflate.h"
+#include "deflate_p.h"
+#include "functable.h"
+
+/* ===========================================================================
+ * Copy without compression as much as possible from the input stream, return
+ * the current block state.
+ *
+ * In case deflateParams() is used to later switch to a non-zero compression
+ * level, s->matches (otherwise unused when storing) keeps track of the number
+ * of hash table slides to perform. If s->matches is 1, then one hash table
+ * slide will be done when switching. If s->matches is 2, the maximum value
+ * allowed here, then the hash table will be cleared, since two or more slides
+ * is the same as a clear.
+ *
+ * deflate_stored() is written to minimize the number of times an input byte is
+ * copied. It is most efficient with large input and output buffers, which
+ * maximizes the opportunites to have a single copy from next_in to next_out.
+ */
+Z_INTERNAL block_state deflate_stored(deflate_state *s, int flush) {
+    /* Smallest worthy block size when not flushing or finishing. By default
+     * this is 32K. This can be as small as 507 bytes for memLevel == 1. For
+     * large input and output buffers, the stored block size will be larger.
+     */
+    unsigned min_block = MIN(s->pending_buf_size - 5, s->w_size);
+
+    /* Copy as many min_block or larger stored blocks directly to next_out as
+     * possible. If flushing, copy the remaining available input to next_out as
+     * stored blocks, if there is enough space.
+     */
+    unsigned len, left, have, last = 0;
+    unsigned used = s->strm->avail_in;
+    do {
+        /* Set len to the maximum size block that we can copy directly with the
+         * available input data and output space. Set left to how much of that
+         * would be copied from what's left in the window.
+         */
+        len = MAX_STORED;       /* maximum deflate stored block length */
+        have = (s->bi_valid + 42) >> 3;         /* number of header bytes */
+        if (s->strm->avail_out < have)          /* need room for header */
+            break;
+            /* maximum stored block length that will fit in avail_out: */
+        have = s->strm->avail_out - have;
+        left = (int)s->strstart - s->block_start;    /* bytes left in window */
+        if (len > (unsigned long)left + s->strm->avail_in)
+            len = left + s->strm->avail_in;     /* limit len to the input */
+        len = MIN(len, have);                   /* limit len to the output */
+
+        /* If the stored block would be less than min_block in length, or if
+         * unable to copy all of the available input when flushing, then try
+         * copying to the window and the pending buffer instead. Also don't
+         * write an empty block when flushing -- deflate() does that.
+         */
+        if (len < min_block && ((len == 0 && flush != Z_FINISH) || flush == Z_NO_FLUSH || len != left + s->strm->avail_in))
+            break;
+
+        /* Make a dummy stored block in pending to get the header bytes,
+         * including any pending bits. This also updates the debugging counts.
+         */
+        last = flush == Z_FINISH && len == left + s->strm->avail_in ? 1 : 0;
+        zng_tr_stored_block(s, (char *)0, 0L, last);
+
+        /* Replace the lengths in the dummy stored block with len. */
+        s->pending -= 4;
+        put_short(s, (uint16_t)len);
+        put_short(s, (uint16_t)~len);
+
+        /* Write the stored block header bytes. */
+        PREFIX(flush_pending)(s->strm);
+
+        /* Update debugging counts for the data about to be copied. */
+        cmpr_bits_add(s, len << 3);
+        sent_bits_add(s, len << 3);
+
+        /* Copy uncompressed bytes from the window to next_out. */
+        if (left) {
+            left = MIN(left, len);
+            memcpy(s->strm->next_out, s->window + s->block_start, left);
+            s->strm->next_out += left;
+            s->strm->avail_out -= left;
+            s->strm->total_out += left;
+            s->block_start += (int)left;
+            len -= left;
+        }
+
+        /* Copy uncompressed bytes directly from next_in to next_out, updating
+         * the check value.
+         */
+        if (len) {
+            PREFIX(read_buf)(s->strm, s->strm->next_out, len);
+            s->strm->next_out += len;
+            s->strm->avail_out -= len;
+            s->strm->total_out += len;
+        }
+    } while (last == 0);
+
+    /* Update the sliding window with the last s->w_size bytes of the copied
+     * data, or append all of the copied data to the existing window if less
+     * than s->w_size bytes were copied. Also update the number of bytes to
+     * insert in the hash tables, in the event that deflateParams() switches to
+     * a non-zero compression level.
+     */
+    used -= s->strm->avail_in;      /* number of input bytes directly copied */
+    if (used) {
+        /* If any input was used, then no unused input remains in the window,
+         * therefore s->block_start == s->strstart.
+         */
+        if (used >= s->w_size) {    /* supplant the previous history */
+            s->matches = 2;         /* clear hash */
+            memcpy(s->window, s->strm->next_in - s->w_size, s->w_size);
+            s->strstart = s->w_size;
+            s->insert = s->strstart;
+        } else {
+            if (s->window_size - s->strstart <= used) {
+                /* Slide the window down. */
+                s->strstart -= s->w_size;
+                memcpy(s->window, s->window + s->w_size, s->strstart);
+                if (s->matches < 2)
+                    s->matches++;   /* add a pending slide_hash() */
+                s->insert = MIN(s->insert, s->strstart);
+            }
+            memcpy(s->window + s->strstart, s->strm->next_in - used, used);
+            s->strstart += used;
+            s->insert += MIN(used, s->w_size - s->insert);
+        }
+        s->block_start = (int)s->strstart;
+    }
+    s->high_water = MAX(s->high_water, s->strstart);
+
+    /* If the last block was written to next_out, then done. */
+    if (last)
+        return finish_done;
+
+    /* If flushing and all input has been consumed, then done. */
+    if (flush != Z_NO_FLUSH && flush != Z_FINISH && s->strm->avail_in == 0 && (int)s->strstart == s->block_start)
+        return block_done;
+
+    /* Fill the window with any remaining input. */
+    have = s->window_size - s->strstart;
+    if (s->strm->avail_in > have && s->block_start >= (int)s->w_size) {
+        /* Slide the window down. */
+        s->block_start -= (int)s->w_size;
+        s->strstart -= s->w_size;
+        memcpy(s->window, s->window + s->w_size, s->strstart);
+        if (s->matches < 2)
+            s->matches++;           /* add a pending slide_hash() */
+        have += s->w_size;          /* more space now */
+        s->insert = MIN(s->insert, s->strstart);
+    }
+
+    have = MIN(have, s->strm->avail_in);
+    if (have) {
+        PREFIX(read_buf)(s->strm, s->window + s->strstart, have);
+        s->strstart += have;
+        s->insert += MIN(have, s->w_size - s->insert);
+    }
+    s->high_water = MAX(s->high_water, s->strstart);
+
+    /* There was not enough avail_out to write a complete worthy or flushed
+     * stored block to next_out. Write a stored block to pending instead, if we
+     * have enough input for a worthy block, or if flushing and there is enough
+     * room for the remaining input as a stored block in the pending buffer.
+     */
+    have = (s->bi_valid + 42) >> 3;         /* number of header bytes */
+        /* maximum stored block length that will fit in pending: */
+    have = MIN(s->pending_buf_size - have, MAX_STORED);
+    min_block = MIN(have, s->w_size);
+    left = (int)s->strstart - s->block_start;
+    if (left >= min_block || ((left || flush == Z_FINISH) && flush != Z_NO_FLUSH && s->strm->avail_in == 0 && left <= have)) {
+        len = MIN(left, have);
+        last = flush == Z_FINISH && s->strm->avail_in == 0 && len == left ? 1 : 0;
+        zng_tr_stored_block(s, (char *)s->window + s->block_start, len, last);
+        s->block_start += (int)len;
+        PREFIX(flush_pending)(s->strm);
+    }
+
+    /* We've done all we can with the available input and output. */
+    return last ? finish_started : need_more;
+}
diff --git a/3rdparty/zlib-ng/fallback_builtins.h b/3rdparty/zlib-ng/fallback_builtins.h
new file mode 100644
index 000000000000..79072a1028ec
--- /dev/null
+++ b/3rdparty/zlib-ng/fallback_builtins.h
@@ -0,0 +1,50 @@
+#ifndef FALLBACK_BUILTINS_H
+#define FALLBACK_BUILTINS_H
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) ||  defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
+
+#include <intrin.h>
+#ifdef X86_FEATURES
+#  include "arch/x86/x86_features.h"
+#endif
+
+/* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0.
+ * Because of that assumption trailing_zero is not initialized and the return value is not checked.
+ * Tzcnt and bsf give identical results except when input value is 0, therefore this can not be allowed.
+ * If tzcnt instruction is not supported, the cpu will itself execute bsf instead.
+ * Performance tzcnt/bsf is identical on Intel cpu, tzcnt is faster than bsf on AMD cpu.
+ */
+static __forceinline int __builtin_ctz(unsigned int value) {
+    Assert(value != 0, "Invalid input value: 0");
+# if defined(X86_FEATURES) && !(_MSC_VER < 1700)
+    return (int)_tzcnt_u32(value);
+# else
+    unsigned long trailing_zero;
+    _BitScanForward(&trailing_zero, value);
+    return (int)trailing_zero;
+# endif
+}
+#define HAVE_BUILTIN_CTZ
+
+#ifdef _M_AMD64
+/* This is not a general purpose replacement for __builtin_ctzll. The function expects that value is != 0.
+ * Because of that assumption trailing_zero is not initialized and the return value is not checked.
+ */
+static __forceinline int __builtin_ctzll(unsigned long long value) {
+    Assert(value != 0, "Invalid input value: 0");
+# if defined(X86_FEATURES) && !(_MSC_VER < 1700)
+    return (int)_tzcnt_u64(value);
+# else
+    unsigned long trailing_zero;
+    _BitScanForward64(&trailing_zero, value);
+    return (int)trailing_zero;
+# endif
+}
+#define HAVE_BUILTIN_CTZLL
+#endif // Microsoft AMD64
+
+#endif // Microsoft AMD64/IA64/x86/ARM/ARM64 test
+#endif // _MSC_VER & !clang
+
+#endif // include guard FALLBACK_BUILTINS_H
diff --git a/3rdparty/zlib-ng/functable.c b/3rdparty/zlib-ng/functable.c
new file mode 100644
index 000000000000..37c4aeef7d0e
--- /dev/null
+++ b/3rdparty/zlib-ng/functable.c
@@ -0,0 +1,403 @@
+/* functable.c -- Choose relevant optimized functions at runtime
+ * Copyright (C) 2017 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "crc32_braid_p.h"
+#include "deflate.h"
+#include "deflate_p.h"
+#include "functable.h"
+#include "cpu_features.h"
+
+#if defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+/* Platform has pointer size atomic store */
+#if defined(__GNUC__) || defined(__clang__)
+#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
+    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
+#  define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
+#elif defined(_MSC_VER)
+#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
+    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
+#  if defined(_M_ARM) || defined(_M_ARM64)
+#    define FUNCTABLE_BARRIER() do { \
+    _ReadWriteBarrier();  \
+    __dmb(0xB); /* _ARM_BARRIER_ISH */ \
+    _ReadWriteBarrier(); \
+} while (0)
+#  else
+#    define FUNCTABLE_BARRIER() _ReadWriteBarrier()
+#  endif
+#else
+#  warning Unable to detect atomic intrinsic support.
+#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
+    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
+#  define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
+#endif
+
+static void force_init_empty(void) {
+    // empty
+}
+
+static void init_functable(void) {
+    struct functable_s ft;
+    struct cpu_features cf;
+
+    cpu_check_features(&cf);
+
+    // Generic code
+    ft.force_init = &force_init_empty;
+    ft.adler32 = &adler32_c;
+    ft.adler32_fold_copy = &adler32_fold_copy_c;
+    ft.chunkmemset_safe = &chunkmemset_safe_c;
+    ft.chunksize = &chunksize_c;
+    ft.crc32 = &PREFIX(crc32_braid);
+    ft.crc32_fold = &crc32_fold_c;
+    ft.crc32_fold_copy = &crc32_fold_copy_c;
+    ft.crc32_fold_final = &crc32_fold_final_c;
+    ft.crc32_fold_reset = &crc32_fold_reset_c;
+    ft.inflate_fast = &inflate_fast_c;
+    ft.insert_string = &insert_string_c;
+    ft.quick_insert_string = &quick_insert_string_c;
+    ft.slide_hash = &slide_hash_c;
+    ft.update_hash = &update_hash_c;
+
+#if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
+#  if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+    ft.longest_match = &longest_match_unaligned_64;
+    ft.longest_match_slow = &longest_match_slow_unaligned_64;
+    ft.compare256 = &compare256_unaligned_64;
+#  elif defined(HAVE_BUILTIN_CTZ)
+    ft.longest_match = &longest_match_unaligned_32;
+    ft.longest_match_slow = &longest_match_slow_unaligned_32;
+    ft.compare256 = &compare256_unaligned_32;
+#  else
+    ft.longest_match = &longest_match_unaligned_16;
+    ft.longest_match_slow = &longest_match_slow_unaligned_16;
+    ft.compare256 = &compare256_unaligned_16;
+#  endif
+#else
+    ft.longest_match = &longest_match_c;
+    ft.longest_match_slow = &longest_match_slow_c;
+    ft.compare256 = &compare256_c;
+#endif
+
+
+    // Select arch-optimized functions
+
+    // X86 - SSE2
+#ifdef X86_SSE2
+#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+    if (cf.x86.has_sse2)
+#  endif
+    {
+        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
+        ft.chunksize = &chunksize_sse2;
+        ft.inflate_fast = &inflate_fast_sse2;
+        ft.slide_hash = &slide_hash_sse2;
+#  ifdef HAVE_BUILTIN_CTZ
+        ft.compare256 = &compare256_sse2;
+        ft.longest_match = &longest_match_sse2;
+        ft.longest_match_slow = &longest_match_slow_sse2;
+#  endif
+    }
+#endif
+    // X86 - SSSE3
+#ifdef X86_SSSE3
+    if (cf.x86.has_ssse3) {
+        ft.adler32 = &adler32_ssse3;
+#  ifdef X86_SSE2
+        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
+        ft.inflate_fast = &inflate_fast_ssse3;
+#  endif
+    }
+#endif
+    // X86 - SSE4.2
+#ifdef X86_SSE42
+    if (cf.x86.has_sse42) {
+        ft.adler32_fold_copy = &adler32_fold_copy_sse42;
+        ft.insert_string = &insert_string_sse42;
+        ft.quick_insert_string = &quick_insert_string_sse42;
+        ft.update_hash = &update_hash_sse42;
+    }
+#endif
+    // X86 - PCLMUL
+#ifdef X86_PCLMULQDQ_CRC
+    if (cf.x86.has_pclmulqdq) {
+        ft.crc32 = &crc32_pclmulqdq;
+        ft.crc32_fold = &crc32_fold_pclmulqdq;
+        ft.crc32_fold_copy = &crc32_fold_pclmulqdq_copy;
+        ft.crc32_fold_final = &crc32_fold_pclmulqdq_final;
+        ft.crc32_fold_reset = &crc32_fold_pclmulqdq_reset;
+    }
+#endif
+    // X86 - AVX
+#ifdef X86_AVX2
+    if (cf.x86.has_avx2) {
+        ft.adler32 = &adler32_avx2;
+        ft.adler32_fold_copy = &adler32_fold_copy_avx2;
+        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
+        ft.chunksize = &chunksize_avx2;
+        ft.inflate_fast = &inflate_fast_avx2;
+        ft.slide_hash = &slide_hash_avx2;
+#  ifdef HAVE_BUILTIN_CTZ
+        ft.compare256 = &compare256_avx2;
+        ft.longest_match = &longest_match_avx2;
+        ft.longest_match_slow = &longest_match_slow_avx2;
+#  endif
+    }
+#endif
+#ifdef X86_AVX512
+    if (cf.x86.has_avx512) {
+        ft.adler32 = &adler32_avx512;
+        ft.adler32_fold_copy = &adler32_fold_copy_avx512;
+    }
+#endif
+#ifdef X86_AVX512VNNI
+    if (cf.x86.has_avx512vnni) {
+        ft.adler32 = &adler32_avx512_vnni;
+        ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
+    }
+#endif
+    // X86 - VPCLMULQDQ
+#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512 && cf.x86.has_vpclmulqdq) {
+        ft.crc32 = &crc32_vpclmulqdq;
+        ft.crc32_fold = &crc32_fold_vpclmulqdq;
+        ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
+        ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
+        ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
+    }
+#endif
+
+
+    // ARM - SIMD
+#ifdef ARM_SIMD
+#  ifndef ARM_NOCHECK_SIMD
+    if (cf.arm.has_simd)
+#  endif
+    {
+        ft.slide_hash = &slide_hash_armv6;
+    }
+#endif
+    // ARM - NEON
+#ifdef ARM_NEON
+#  ifndef ARM_NOCHECK_NEON
+    if (cf.arm.has_neon)
+#  endif
+    {
+        ft.adler32 = &adler32_neon;
+        ft.chunkmemset_safe = &chunkmemset_safe_neon;
+        ft.chunksize = &chunksize_neon;
+        ft.inflate_fast = &inflate_fast_neon;
+        ft.slide_hash = &slide_hash_neon;
+#  ifdef HAVE_BUILTIN_CTZLL
+        ft.compare256 = &compare256_neon;
+        ft.longest_match = &longest_match_neon;
+        ft.longest_match_slow = &longest_match_slow_neon;
+#  endif
+    }
+#endif
+    // ARM - ACLE
+#ifdef ARM_ACLE
+    if (cf.arm.has_crc32) {
+        ft.crc32 = &crc32_acle;
+        ft.insert_string = &insert_string_acle;
+        ft.quick_insert_string = &quick_insert_string_acle;
+        ft.update_hash = &update_hash_acle;
+    }
+#endif
+
+
+    // Power - VMX
+#ifdef PPC_VMX
+    if (cf.power.has_altivec) {
+        ft.adler32 = &adler32_vmx;
+        ft.slide_hash = &slide_hash_vmx;
+    }
+#endif
+    // Power8 - VSX
+#ifdef POWER8_VSX
+    if (cf.power.has_arch_2_07) {
+        ft.adler32 = &adler32_power8;
+        ft.chunkmemset_safe = &chunkmemset_safe_power8;
+        ft.chunksize = &chunksize_power8;
+        ft.inflate_fast = &inflate_fast_power8;
+        ft.slide_hash = &slide_hash_power8;
+    }
+#endif
+#ifdef POWER8_VSX_CRC32
+    if (cf.power.has_arch_2_07)
+        ft.crc32 = &crc32_power8;
+#endif
+    // Power9
+#ifdef POWER9
+    if (cf.power.has_arch_3_00) {
+        ft.compare256 = &compare256_power9;
+        ft.longest_match = &longest_match_power9;
+        ft.longest_match_slow = &longest_match_slow_power9;
+    }
+#endif
+
+
+    // RISCV - RVV
+#ifdef RISCV_RVV
+    if (cf.riscv.has_rvv) {
+        ft.adler32 = &adler32_rvv;
+        ft.adler32_fold_copy = &adler32_fold_copy_rvv;
+        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
+        ft.chunksize = &chunksize_rvv;
+        ft.compare256 = &compare256_rvv;
+        ft.inflate_fast = &inflate_fast_rvv;
+        ft.longest_match = &longest_match_rvv;
+        ft.longest_match_slow = &longest_match_slow_rvv;
+        ft.slide_hash = &slide_hash_rvv;
+    }
+#endif
+
+
+    // S390
+#ifdef S390_CRC32_VX
+    if (cf.s390.has_vx)
+        ft.crc32 = crc32_s390_vx;
+#endif
+
+    // Assign function pointers individually for atomic operation
+    FUNCTABLE_ASSIGN(ft, force_init);
+    FUNCTABLE_ASSIGN(ft, adler32);
+    FUNCTABLE_ASSIGN(ft, adler32_fold_copy);
+    FUNCTABLE_ASSIGN(ft, chunkmemset_safe);
+    FUNCTABLE_ASSIGN(ft, chunksize);
+    FUNCTABLE_ASSIGN(ft, compare256);
+    FUNCTABLE_ASSIGN(ft, crc32);
+    FUNCTABLE_ASSIGN(ft, crc32_fold);
+    FUNCTABLE_ASSIGN(ft, crc32_fold_copy);
+    FUNCTABLE_ASSIGN(ft, crc32_fold_final);
+    FUNCTABLE_ASSIGN(ft, crc32_fold_reset);
+    FUNCTABLE_ASSIGN(ft, inflate_fast);
+    FUNCTABLE_ASSIGN(ft, insert_string);
+    FUNCTABLE_ASSIGN(ft, longest_match);
+    FUNCTABLE_ASSIGN(ft, longest_match_slow);
+    FUNCTABLE_ASSIGN(ft, quick_insert_string);
+    FUNCTABLE_ASSIGN(ft, slide_hash);
+    FUNCTABLE_ASSIGN(ft, update_hash);
+
+    // Memory barrier for weak memory order CPUs
+    FUNCTABLE_BARRIER();
+}
+
+/* stub functions */
+static void force_init_stub(void) {
+    init_functable();
+}
+
+static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
+    init_functable();
+    return functable.adler32(adler, buf, len);
+}
+
+static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
+    init_functable();
+    return functable.adler32_fold_copy(adler, dst, src, len);
+}
+
+static uint8_t* chunkmemset_safe_stub(uint8_t* out, unsigned dist, unsigned len, unsigned left) {
+    init_functable();
+    return functable.chunkmemset_safe(out, dist, len, left);
+}
+
+static uint32_t chunksize_stub(void) {
+    init_functable();
+    return functable.chunksize();
+}
+
+static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
+    init_functable();
+    return functable.compare256(src0, src1);
+}
+
+static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
+    init_functable();
+    return functable.crc32(crc, buf, len);
+}
+
+static void crc32_fold_stub(crc32_fold* crc, const uint8_t* src, size_t len, uint32_t init_crc) {
+    init_functable();
+    functable.crc32_fold(crc, src, len, init_crc);
+}
+
+static void crc32_fold_copy_stub(crc32_fold* crc, uint8_t* dst, const uint8_t* src, size_t len) {
+    init_functable();
+    functable.crc32_fold_copy(crc, dst, src, len);
+}
+
+static uint32_t crc32_fold_final_stub(crc32_fold* crc) {
+    init_functable();
+    return functable.crc32_fold_final(crc);
+}
+
+static uint32_t crc32_fold_reset_stub(crc32_fold* crc) {
+    init_functable();
+    return functable.crc32_fold_reset(crc);
+}
+
+static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
+    init_functable();
+    functable.inflate_fast(strm, start);
+}
+
+static void insert_string_stub(deflate_state* const s, uint32_t str, uint32_t count) {
+    init_functable();
+    functable.insert_string(s, str, count);
+}
+
+static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) {
+    init_functable();
+    return functable.longest_match(s, cur_match);
+}
+
+static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) {
+    init_functable();
+    return functable.longest_match_slow(s, cur_match);
+}
+
+static Pos quick_insert_string_stub(deflate_state* const s, const uint32_t str) {
+    init_functable();
+    return functable.quick_insert_string(s, str);
+}
+
+static void slide_hash_stub(deflate_state* s) {
+    init_functable();
+    functable.slide_hash(s);
+}
+
+static uint32_t update_hash_stub(deflate_state* const s, uint32_t h, uint32_t val) {
+    init_functable();
+    return functable.update_hash(s, h, val);
+}
+
+/* functable init */
+Z_INTERNAL struct functable_s functable = {
+    force_init_stub,
+    adler32_stub,
+    adler32_fold_copy_stub,
+    chunkmemset_safe_stub,
+    chunksize_stub,
+    compare256_stub,
+    crc32_stub,
+    crc32_fold_stub,
+    crc32_fold_copy_stub,
+    crc32_fold_final_stub,
+    crc32_fold_reset_stub,
+    inflate_fast_stub,
+    insert_string_stub,
+    longest_match_stub,
+    longest_match_slow_stub,
+    quick_insert_string_stub,
+    slide_hash_stub,
+    update_hash_stub
+};
diff --git a/3rdparty/zlib-ng/functable.h b/3rdparty/zlib-ng/functable.h
new file mode 100644
index 000000000000..9f78188e1054
--- /dev/null
+++ b/3rdparty/zlib-ng/functable.h
@@ -0,0 +1,42 @@
+/* functable.h -- Struct containing function pointers to optimized functions
+ * Copyright (C) 2017 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef FUNCTABLE_H_
+#define FUNCTABLE_H_
+
+#include "deflate.h"
+#include "crc32_fold.h"
+#include "adler32_fold.h"
+
+#ifdef ZLIB_COMPAT
+typedef struct z_stream_s z_stream;
+#else
+typedef struct zng_stream_s zng_stream;
+#endif
+
+struct functable_s {
+    void     (* force_init)         (void);
+    uint32_t (* adler32)            (uint32_t adler, const uint8_t *buf, size_t len);
+    uint32_t (* adler32_fold_copy)  (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+    uint8_t* (* chunkmemset_safe)   (uint8_t *out, unsigned dist, unsigned len, unsigned left);
+    uint32_t (* chunksize)          (void);
+    uint32_t (* compare256)         (const uint8_t *src0, const uint8_t *src1);
+    uint32_t (* crc32)              (uint32_t crc, const uint8_t *buf, size_t len);
+    void     (* crc32_fold)         (struct crc32_fold_s *crc, const uint8_t *src, size_t len, uint32_t init_crc);
+    void     (* crc32_fold_copy)    (struct crc32_fold_s *crc, uint8_t *dst, const uint8_t *src, size_t len);
+    uint32_t (* crc32_fold_final)   (struct crc32_fold_s *crc);
+    uint32_t (* crc32_fold_reset)   (struct crc32_fold_s *crc);
+    void     (* inflate_fast)       (PREFIX3(stream) *strm, uint32_t start);
+    void     (* insert_string)      (deflate_state *const s, uint32_t str, uint32_t count);
+    uint32_t (* longest_match)      (deflate_state *const s, Pos cur_match);
+    uint32_t (* longest_match_slow) (deflate_state *const s, Pos cur_match);
+    Pos      (* quick_insert_string)(deflate_state *const s, uint32_t str);
+    void     (* slide_hash)         (deflate_state *s);
+    uint32_t (* update_hash)        (deflate_state *const s, uint32_t h, uint32_t val);
+};
+
+Z_INTERNAL extern struct functable_s functable;
+
+#endif
diff --git a/3rdparty/zlib-ng/gzguts.h b/3rdparty/zlib-ng/gzguts.h
new file mode 100644
index 000000000000..a663844b693e
--- /dev/null
+++ b/3rdparty/zlib-ng/gzguts.h
@@ -0,0 +1,144 @@
+#ifndef GZGUTS_H_
+#define GZGUTS_H_
+/* gzguts.h -- zlib internal header definitions for gz* operations
+ * Copyright (C) 2004-2019 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef _LARGEFILE64_SOURCE
+#  ifndef _LARGEFILE_SOURCE
+#    define _LARGEFILE_SOURCE 1
+#  endif
+#  undef _FILE_OFFSET_BITS
+#  undef _TIME_BITS
+#endif
+
+#if defined(HAVE_VISIBILITY_INTERNAL)
+#  define Z_INTERNAL __attribute__((visibility ("internal")))
+#elif defined(HAVE_VISIBILITY_HIDDEN)
+#  define Z_INTERNAL __attribute__((visibility ("hidden")))
+#else
+#  define Z_INTERNAL
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <fcntl.h>
+
+#if defined(ZLIB_COMPAT)
+#  include "zlib.h"
+#else
+#  include "zlib-ng.h"
+#endif
+
+#ifdef _WIN32
+#  include <stddef.h>
+#endif
+
+#if defined(_WIN32)
+#  include <io.h>
+#  define WIDECHAR
+#endif
+
+#ifdef WINAPI_FAMILY
+#  define open _open
+#  define read _read
+#  define write _write
+#  define close _close
+#endif
+
+/* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
+#if !defined(STDC99) && !defined(__CYGWIN__) && !defined(__MINGW__) && defined(_WIN32)
+#  if !defined(vsnprintf)
+#    if !defined(_MSC_VER) || ( defined(_MSC_VER) && _MSC_VER < 1500 )
+#       define vsnprintf _vsnprintf
+#    endif
+#  endif
+#endif
+
+/* unlike snprintf (which is required in C99), _snprintf does not guarantee
+   null termination of the result -- however this is only used in gzlib.c
+   where the result is assured to fit in the space provided */
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#  define snprintf _snprintf
+#endif
+
+/* get errno and strerror definition */
+#ifndef NO_STRERROR
+#  include <errno.h>
+#  define zstrerror() strerror(errno)
+#else
+#  define zstrerror() "stdio error (consult errno)"
+#endif
+
+/* default memLevel */
+#if MAX_MEM_LEVEL >= 8
+#  define DEF_MEM_LEVEL 8
+#else
+#  define DEF_MEM_LEVEL  MAX_MEM_LEVEL
+#endif
+
+/* default i/o buffer size -- double this for output when reading (this and
+   twice this must be able to fit in an unsigned type) */
+#ifndef GZBUFSIZE
+#  define GZBUFSIZE 131072
+#endif
+
+/* gzip modes, also provide a little integrity check on the passed structure */
+#define GZ_NONE 0
+#define GZ_READ 7247
+#define GZ_WRITE 31153
+#define GZ_APPEND 1     /* mode set to GZ_WRITE after the file is opened */
+
+/* values for gz_state how */
+#define LOOK 0      /* look for a gzip header */
+#define COPY 1      /* copy input directly */
+#define GZIP 2      /* decompress a gzip stream */
+
+/* internal gzip file state data structure */
+typedef struct {
+        /* exposed contents for gzgetc() macro */
+    struct gzFile_s x;      /* "x" for exposed */
+                            /* x.have: number of bytes available at x.next */
+                            /* x.next: next output data to deliver or write */
+                            /* x.pos: current position in uncompressed data */
+        /* used for both reading and writing */
+    int mode;               /* see gzip modes above */
+    int fd;                 /* file descriptor */
+    char *path;             /* path or fd for error messages */
+    unsigned size;          /* buffer size, zero if not allocated yet */
+    unsigned want;          /* requested buffer size, default is GZBUFSIZE */
+    unsigned char *in;      /* input buffer (double-sized when writing) */
+    unsigned char *out;     /* output buffer (double-sized when reading) */
+    int direct;             /* 0 if processing gzip, 1 if transparent */
+        /* just for reading */
+    int how;                /* 0: get header, 1: copy, 2: decompress */
+    z_off64_t start;        /* where the gzip data started, for rewinding */
+    int eof;                /* true if end of input file reached */
+    int past;               /* true if read requested past end */
+        /* just for writing */
+    int level;              /* compression level */
+    int strategy;           /* compression strategy */
+    int reset;              /* true if a reset is pending after a Z_FINISH */
+        /* seek request */
+    z_off64_t skip;         /* amount to skip (already rewound if backwards) */
+    int seek;               /* true if seek request pending */
+        /* error information */
+    int err;                /* error code */
+    char *msg;              /* error message */
+        /* zlib inflate or deflate stream */
+    PREFIX3(stream) strm;  /* stream structure in-place (not a pointer) */
+} gz_state;
+typedef gz_state *gz_statep;
+
+/* shared functions */
+void Z_INTERNAL gz_error(gz_state *, int, const char *);
+
+/* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t
+   value -- needed when comparing unsigned to z_off64_t, which is signed
+   (possible z_off64_t types off_t, off64_t, and long are all signed) */
+#define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX)
+
+#endif /* GZGUTS_H_ */
diff --git a/3rdparty/zlib-ng/gzlib.c b/3rdparty/zlib-ng/gzlib.c
new file mode 100644
index 000000000000..e613837efb52
--- /dev/null
+++ b/3rdparty/zlib-ng/gzlib.c
@@ -0,0 +1,525 @@
+/* gzlib.c -- zlib functions common to reading and writing gzip files
+ * Copyright (C) 2004-2019 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "gzguts.h"
+
+#if defined(_WIN32)
+#  define LSEEK _lseeki64
+#else
+#if defined(_LARGEFILE64_SOURCE) && _LFS64_LARGEFILE-0
+#  define LSEEK lseek64
+#else
+#  define LSEEK lseek
+#endif
+#endif
+
+/* Local functions */
+static void gz_reset(gz_state *);
+static gzFile gz_open(const void *, int, const char *);
+
+/* Reset gzip file state */
+static void gz_reset(gz_state *state) {
+    state->x.have = 0;              /* no output data available */
+    if (state->mode == GZ_READ) {   /* for reading ... */
+        state->eof = 0;             /* not at end of file */
+        state->past = 0;            /* have not read past end yet */
+        state->how = LOOK;          /* look for gzip header */
+    }
+    else                            /* for writing ... */
+        state->reset = 0;           /* no deflateReset pending */
+    state->seek = 0;                /* no seek request pending */
+    gz_error(state, Z_OK, NULL);    /* clear error */
+    state->x.pos = 0;               /* no uncompressed data yet */
+    state->strm.avail_in = 0;       /* no input data yet */
+}
+
+/* Open a gzip file either by name or file descriptor. */
+static gzFile gz_open(const void *path, int fd, const char *mode) {
+    gz_state *state;
+    size_t len;
+    int oflag;
+#ifdef O_CLOEXEC
+    int cloexec = 0;
+#endif
+#ifdef O_EXCL
+    int exclusive = 0;
+#endif
+
+    /* check input */
+    if (path == NULL)
+        return NULL;
+
+    /* allocate gzFile structure to return */
+    state = (gz_state *)zng_alloc(sizeof(gz_state));
+    if (state == NULL)
+        return NULL;
+    state->size = 0;            /* no buffers allocated yet */
+    state->want = GZBUFSIZE;    /* requested buffer size */
+    state->msg = NULL;          /* no error message yet */
+
+    /* interpret mode */
+    state->mode = GZ_NONE;
+    state->level = Z_DEFAULT_COMPRESSION;
+    state->strategy = Z_DEFAULT_STRATEGY;
+    state->direct = 0;
+    while (*mode) {
+        if (*mode >= '0' && *mode <= '9') {
+            state->level = *mode - '0';
+        } else {
+            switch (*mode) {
+            case 'r':
+                state->mode = GZ_READ;
+                break;
+#ifndef NO_GZCOMPRESS
+            case 'w':
+                state->mode = GZ_WRITE;
+                break;
+            case 'a':
+                state->mode = GZ_APPEND;
+                break;
+#endif
+            case '+':       /* can't read and write at the same time */
+                zng_free(state);
+                return NULL;
+            case 'b':       /* ignore -- will request binary anyway */
+                break;
+#ifdef O_CLOEXEC
+            case 'e':
+                cloexec = 1;
+                break;
+#endif
+#ifdef O_EXCL
+            case 'x':
+                exclusive = 1;
+                break;
+#endif
+            case 'f':
+                state->strategy = Z_FILTERED;
+                break;
+            case 'h':
+                state->strategy = Z_HUFFMAN_ONLY;
+                break;
+            case 'R':
+                state->strategy = Z_RLE;
+                break;
+            case 'F':
+                state->strategy = Z_FIXED;
+                break;
+            case 'T':
+                state->direct = 1;
+                break;
+            default:        /* could consider as an error, but just ignore */
+                {}
+            }
+        }
+        mode++;
+    }
+
+    /* must provide an "r", "w", or "a" */
+    if (state->mode == GZ_NONE) {
+        zng_free(state);
+        return NULL;
+    }
+
+    /* can't force transparent read */
+    if (state->mode == GZ_READ) {
+        if (state->direct) {
+            zng_free(state);
+            return NULL;
+        }
+        state->direct = 1;      /* for empty file */
+    }
+
+    /* save the path name for error messages */
+#ifdef WIDECHAR
+    if (fd == -2) {
+        len = wcstombs(NULL, (const wchar_t *)path, 0);
+        if (len == (size_t)-1)
+            len = 0;
+    } else
+#endif
+        len = strlen((const char *)path);
+    state->path = (char *)malloc(len + 1);
+    if (state->path == NULL) {
+        zng_free(state);
+        return NULL;
+    }
+#ifdef WIDECHAR
+    if (fd == -2)
+        if (len) {
+            wcstombs(state->path, (const wchar_t *)path, len + 1);
+        } else {
+            *(state->path) = 0;
+        }
+    else
+#endif
+        (void)snprintf(state->path, len + 1, "%s", (const char *)path);
+
+    /* compute the flags for open() */
+    oflag =
+#ifdef O_LARGEFILE
+        O_LARGEFILE |
+#endif
+#ifdef O_BINARY
+        O_BINARY |
+#endif
+#ifdef O_CLOEXEC
+        (cloexec ? O_CLOEXEC : 0) |
+#endif
+        (state->mode == GZ_READ ?
+         O_RDONLY :
+         (O_WRONLY | O_CREAT |
+#ifdef O_EXCL
+          (exclusive ? O_EXCL : 0) |
+#endif
+          (state->mode == GZ_WRITE ?
+           O_TRUNC :
+           O_APPEND)));
+
+    /* open the file with the appropriate flags (or just use fd) */
+    state->fd = fd > -1 ? fd : (
+#if defined(_WIN32)
+        fd == -2 ? _wopen((const wchar_t *)path, oflag, 0666) :
+#elif __CYGWIN__
+        fd == -2 ? open(state->path, oflag, 0666) :
+#endif
+        open((const char *)path, oflag, 0666));
+    if (state->fd == -1) {
+        free(state->path);
+        zng_free(state);
+        return NULL;
+    }
+    if (state->mode == GZ_APPEND) {
+        LSEEK(state->fd, 0, SEEK_END);  /* so gzoffset() is correct */
+        state->mode = GZ_WRITE;         /* simplify later checks */
+    }
+
+    /* save the current position for rewinding (only if reading) */
+    if (state->mode == GZ_READ) {
+        state->start = LSEEK(state->fd, 0, SEEK_CUR);
+        if (state->start == -1) state->start = 0;
+    }
+
+    /* initialize stream */
+    gz_reset(state);
+
+    /* return stream */
+    return (gzFile)state;
+}
+
+/* -- see zlib.h -- */
+gzFile Z_EXPORT PREFIX(gzopen)(const char *path, const char *mode) {
+    return gz_open(path, -1, mode);
+}
+
+#ifdef ZLIB_COMPAT
+gzFile Z_EXPORT PREFIX4(gzopen)(const char *path, const char *mode) {
+    return gz_open(path, -1, mode);
+}
+#endif
+
+/* -- see zlib.h -- */
+gzFile Z_EXPORT PREFIX(gzdopen)(int fd, const char *mode) {
+    char *path;         /* identifier for error messages */
+    gzFile gz;
+
+    if (fd == -1 || (path = (char *)malloc(7 + 3 * sizeof(int))) == NULL)
+        return NULL;
+    (void)snprintf(path, 7 + 3 * sizeof(int), "<fd:%d>", fd); /* for debugging */
+    gz = gz_open(path, fd, mode);
+    free(path);
+    return gz;
+}
+
+/* -- see zlib.h -- */
+#ifdef WIDECHAR
+gzFile Z_EXPORT PREFIX(gzopen_w)(const wchar_t *path, const char *mode) {
+    return gz_open(path, -2, mode);
+}
+#endif
+
+int Z_EXPORT PREFIX(gzclose)(gzFile file) {
+#ifndef NO_GZCOMPRESS
+    gz_state *state;
+
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_state *)file;
+
+    return state->mode == GZ_READ ? PREFIX(gzclose_r)(file) : PREFIX(gzclose_w)(file);
+#else
+    return PREFIX(gzclose_r)(file);
+#endif
+}
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzbuffer)(gzFile file, unsigned size) {
+    gz_state *state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return -1;
+    state = (gz_state *)file;
+    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
+        return -1;
+
+    /* make sure we haven't already allocated memory */
+    if (state->size != 0)
+        return -1;
+
+    /* check and set requested size */
+    if ((size << 1) < size)
+        return -1;              /* need to be able to double it */
+    if (size < 8)
+        size = 8;               /* needed to behave well with flushing */
+    state->want = size;
+    return 0;
+}
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzrewind)(gzFile file) {
+    gz_state *state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_state *)file;
+
+    /* check that we're reading and that there's no error */
+    if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
+        return -1;
+
+    /* back up and start over */
+    if (LSEEK(state->fd, state->start, SEEK_SET) == -1)
+        return -1;
+    gz_reset(state);
+    return 0;
+}
+
+/* -- see zlib.h -- */
+z_off64_t Z_EXPORT PREFIX4(gzseek)(gzFile file, z_off64_t offset, int whence) {
+    unsigned n;
+    z_off64_t ret;
+    gz_state *state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return -1;
+    state = (gz_state *)file;
+    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
+        return -1;
+
+    /* check that there's no error */
+    if (state->err != Z_OK && state->err != Z_BUF_ERROR)
+        return -1;
+
+    /* can only seek from start or relative to current position */
+    if (whence != SEEK_SET && whence != SEEK_CUR)
+        return -1;
+
+    /* normalize offset to a SEEK_CUR specification */
+    if (whence == SEEK_SET)
+        offset -= state->x.pos;
+    else if (state->seek)
+        offset += state->skip;
+    state->seek = 0;
+
+    /* if within raw area while reading, just go there */
+    if (state->mode == GZ_READ && state->how == COPY && state->x.pos + offset >= 0) {
+        ret = LSEEK(state->fd, offset - (z_off64_t)state->x.have, SEEK_CUR);
+        if (ret == -1)
+            return -1;
+        state->x.have = 0;
+        state->eof = 0;
+        state->past = 0;
+        state->seek = 0;
+        gz_error(state, Z_OK, NULL);
+        state->strm.avail_in = 0;
+        state->x.pos += offset;
+        return state->x.pos;
+    }
+
+    /* calculate skip amount, rewinding if needed for back seek when reading */
+    if (offset < 0) {
+        if (state->mode != GZ_READ)         /* writing -- can't go backwards */
+            return -1;
+        offset += state->x.pos;
+        if (offset < 0)                     /* before start of file! */
+            return -1;
+        if (PREFIX(gzrewind)(file) == -1)   /* rewind, then skip to offset */
+            return -1;
+    }
+
+    /* if reading, skip what's in output buffer (one less gzgetc() check) */
+    if (state->mode == GZ_READ) {
+        n = GT_OFF(state->x.have) || (z_off64_t)state->x.have > offset ? (unsigned)offset : state->x.have;
+        state->x.have -= n;
+        state->x.next += n;
+        state->x.pos += n;
+        offset -= n;
+    }
+
+    /* request skip (if not zero) */
+    if (offset) {
+        state->seek = 1;
+        state->skip = offset;
+    }
+    return state->x.pos + offset;
+}
+
+/* -- see zlib.h -- */
+#ifdef ZLIB_COMPAT
+z_off_t Z_EXPORT PREFIX(gzseek)(gzFile file, z_off_t offset, int whence) {
+    z_off64_t ret;
+
+    ret = PREFIX4(gzseek)(file, (z_off64_t)offset, whence);
+    return ret == (z_off_t)ret ? (z_off_t)ret : -1;
+}
+#endif
+
+/* -- see zlib.h -- */
+z_off64_t Z_EXPORT PREFIX4(gztell)(gzFile file) {
+    gz_state *state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return -1;
+    state = (gz_state *)file;
+    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
+        return -1;
+
+    /* return position */
+    return state->x.pos + (state->seek ? state->skip : 0);
+}
+
+/* -- see zlib.h -- */
+#ifdef ZLIB_COMPAT
+z_off_t Z_EXPORT PREFIX(gztell)(gzFile file) {
+
+    z_off64_t ret;
+
+    ret = PREFIX4(gztell)(file);
+    return ret == (z_off_t)ret ? (z_off_t)ret : -1;
+}
+#endif
+
+/* -- see zlib.h -- */
+z_off64_t Z_EXPORT PREFIX4(gzoffset)(gzFile file) {
+    z_off64_t offset;
+    gz_state *state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return -1;
+    state = (gz_state *)file;
+    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
+        return -1;
+
+    /* compute and return effective offset in file */
+    offset = LSEEK(state->fd, 0, SEEK_CUR);
+    if (offset == -1)
+        return -1;
+    if (state->mode == GZ_READ)             /* reading */
+        offset -= state->strm.avail_in;     /* don't count buffered input */
+    return offset;
+}
+
+/* -- see zlib.h -- */
+#ifdef ZLIB_COMPAT
+z_off_t Z_EXPORT PREFIX(gzoffset)(gzFile file) {
+    z_off64_t ret;
+
+    ret = PREFIX4(gzoffset)(file);
+    return ret == (z_off_t)ret ? (z_off_t)ret : -1;
+}
+#endif
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzeof)(gzFile file) {
+    gz_state *state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return 0;
+    state = (gz_state *)file;
+    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
+        return 0;
+
+    /* return end-of-file state */
+    return state->mode == GZ_READ ? state->past : 0;
+}
+
+/* -- see zlib.h -- */
+const char * Z_EXPORT PREFIX(gzerror)(gzFile file, int *errnum) {
+    gz_state *state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return NULL;
+    state = (gz_state *)file;
+    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
+        return NULL;
+
+    /* return error information */
+    if (errnum != NULL)
+        *errnum = state->err;
+    return state->err == Z_MEM_ERROR ? "out of memory" : (state->msg == NULL ? "" : state->msg);
+}
+
+/* -- see zlib.h -- */
+void Z_EXPORT PREFIX(gzclearerr)(gzFile file) {
+    gz_state *state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return;
+    state = (gz_state *)file;
+    if (state->mode != GZ_READ && state->mode != GZ_WRITE)
+        return;
+
+    /* clear error and end-of-file */
+    if (state->mode == GZ_READ) {
+        state->eof = 0;
+        state->past = 0;
+    }
+    gz_error(state, Z_OK, NULL);
+}
+
+/* Create an error message in allocated memory and set state->err and
+   state->msg accordingly.  Free any previous error message already there.  Do
+   not try to free or allocate space if the error is Z_MEM_ERROR (out of
+   memory).  Simply save the error message as a static string.  If there is an
+   allocation failure constructing the error message, then convert the error to
+   out of memory. */
+void Z_INTERNAL gz_error(gz_state *state, int err, const char *msg) {
+    /* free previously allocated message and clear */
+    if (state->msg != NULL) {
+        if (state->err != Z_MEM_ERROR)
+            free(state->msg);
+        state->msg = NULL;
+    }
+
+    /* if fatal, set state->x.have to 0 so that the gzgetc() macro fails */
+    if (err != Z_OK && err != Z_BUF_ERROR)
+        state->x.have = 0;
+
+    /* set error code, and if no message, then done */
+    state->err = err;
+    if (msg == NULL)
+        return;
+
+    /* for an out of memory error, return literal string when requested */
+    if (err == Z_MEM_ERROR)
+        return;
+
+    /* construct error message with path */
+    if ((state->msg = (char *)malloc(strlen(state->path) + strlen(msg) + 3)) == NULL) {
+        state->err = Z_MEM_ERROR;
+        return;
+    }
+    (void)snprintf(state->msg, strlen(state->path) + strlen(msg) + 3, "%s%s%s", state->path, ": ", msg);
+}
diff --git a/3rdparty/zlib-ng/gzread.c.in b/3rdparty/zlib-ng/gzread.c.in
new file mode 100644
index 000000000000..1fc7b370fde3
--- /dev/null
+++ b/3rdparty/zlib-ng/gzread.c.in
@@ -0,0 +1,606 @@
+/* gzread.c -- zlib functions for reading gzip files
+ * Copyright (C) 2004-2017 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "gzguts.h"
+
+/* Local functions */
+static int gz_load(gz_state *, unsigned char *, unsigned, unsigned *);
+static int gz_avail(gz_state *);
+static int gz_look(gz_state *);
+static int gz_decomp(gz_state *);
+static int gz_fetch(gz_state *);
+static int gz_skip(gz_state *, z_off64_t);
+static size_t gz_read(gz_state *, void *, size_t);
+
+/* Use read() to load a buffer -- return -1 on error, otherwise 0.  Read from
+   state->fd, and update state->eof, state->err, and state->msg as appropriate.
+   This function needs to loop on read(), since read() is not guaranteed to
+   read the number of bytes requested, depending on the type of descriptor. */
+static int gz_load(gz_state *state, unsigned char *buf, unsigned len, unsigned *have) {
+    ssize_t ret;
+
+    *have = 0;
+    do {
+        ret = read(state->fd, buf + *have, len - *have);
+        if (ret <= 0)
+            break;
+        *have += (unsigned)ret;
+    } while (*have < len);
+    if (ret < 0) {
+        gz_error(state, Z_ERRNO, zstrerror());
+        return -1;
+    }
+    if (ret == 0)
+        state->eof = 1;
+    return 0;
+}
+
+/* Load up input buffer and set eof flag if last data loaded -- return -1 on
+   error, 0 otherwise.  Note that the eof flag is set when the end of the input
+   file is reached, even though there may be unused data in the buffer.  Once
+   that data has been used, no more attempts will be made to read the file.
+   If strm->avail_in != 0, then the current data is moved to the beginning of
+   the input buffer, and then the remainder of the buffer is loaded with the
+   available data from the input file. */
+static int gz_avail(gz_state *state) {
+    unsigned got;
+    PREFIX3(stream) *strm = &(state->strm);
+
+    if (state->err != Z_OK && state->err != Z_BUF_ERROR)
+        return -1;
+    if (state->eof == 0) {
+        if (strm->avail_in) {       /* copy what's there to the start */
+            unsigned char *p = state->in;
+            unsigned const char *q = strm->next_in;
+            unsigned n = strm->avail_in;
+            do {
+                *p++ = *q++;
+            } while (--n);
+        }
+        if (gz_load(state, state->in + strm->avail_in, state->size - strm->avail_in, &got) == -1)
+            return -1;
+        strm->avail_in += got;
+        strm->next_in = state->in;
+    }
+    return 0;
+}
+
+/* Look for gzip header, set up for inflate or copy.  state->x.have must be 0.
+   If this is the first time in, allocate required memory.  state->how will be
+   left unchanged if there is no more input data available, will be set to COPY
+   if there is no gzip header and direct copying will be performed, or it will
+   be set to GZIP for decompression.  If direct copying, then leftover input
+   data from the input buffer will be copied to the output buffer.  In that
+   case, all further file reads will be directly to either the output buffer or
+   a user buffer.  If decompressing, the inflate state will be initialized.
+   gz_look() will return 0 on success or -1 on failure. */
+static int gz_look(gz_state *state) {
+    PREFIX3(stream) *strm = &(state->strm);
+
+    /* allocate read buffers and inflate memory */
+    if (state->size == 0) {
+        /* allocate buffers */
+        state->in = (unsigned char *)zng_alloc(state->want);
+        state->out = (unsigned char *)zng_alloc(state->want << 1);
+        if (state->in == NULL || state->out == NULL) {
+            zng_free(state->out);
+            zng_free(state->in);
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        }
+        state->size = state->want;
+
+        /* allocate inflate memory */
+        state->strm.zalloc = NULL;
+        state->strm.zfree = NULL;
+        state->strm.opaque = NULL;
+        state->strm.avail_in = 0;
+        state->strm.next_in = NULL;
+        if (PREFIX(inflateInit2)(&(state->strm), MAX_WBITS + 16) != Z_OK) {    /* gunzip */
+            zng_free(state->out);
+            zng_free(state->in);
+            state->size = 0;
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        }
+    }
+
+    /* get at least the magic bytes in the input buffer */
+    if (strm->avail_in < 2) {
+        if (gz_avail(state) == -1)
+            return -1;
+        if (strm->avail_in == 0)
+            return 0;
+    }
+
+    /* look for gzip magic bytes -- if there, do gzip decoding (note: there is
+       a logical dilemma here when considering the case of a partially written
+       gzip file, to wit, if a single 31 byte is written, then we cannot tell
+       whether this is a single-byte file, or just a partially written gzip
+       file -- for here we assume that if a gzip file is being written, then
+       the header will be written in a single operation, so that reading a
+       single byte is sufficient indication that it is not a gzip file) */
+    if (strm->avail_in > 1 &&
+            strm->next_in[0] == 31 && strm->next_in[1] == 139) {
+        PREFIX(inflateReset)(strm);
+        state->how = GZIP;
+        state->direct = 0;
+        return 0;
+    }
+
+    /* no gzip header -- if we were decoding gzip before, then this is trailing
+       garbage.  Ignore the trailing garbage and finish. */
+    if (state->direct == 0) {
+        strm->avail_in = 0;
+        state->eof = 1;
+        state->x.have = 0;
+        return 0;
+    }
+
+    /* doing raw i/o, copy any leftover input to output -- this assumes that
+       the output buffer is larger than the input buffer, which also assures
+       space for gzungetc() */
+    state->x.next = state->out;
+    memcpy(state->x.next, strm->next_in, strm->avail_in);
+    state->x.have = strm->avail_in;
+    strm->avail_in = 0;
+    state->how = COPY;
+    state->direct = 1;
+    return 0;
+}
+
+/* Decompress from input to the provided next_out and avail_out in the state.
+   On return, state->x.have and state->x.next point to the just decompressed
+   data.  If the gzip stream completes, state->how is reset to LOOK to look for
+   the next gzip stream or raw data, once state->x.have is depleted.  Returns 0
+   on success, -1 on failure. */
+static int gz_decomp(gz_state *state) {
+    int ret = Z_OK;
+    unsigned had;
+    PREFIX3(stream) *strm = &(state->strm);
+
+    /* fill output buffer up to end of deflate stream */
+    had = strm->avail_out;
+    do {
+        /* get more input for inflate() */
+        if (strm->avail_in == 0 && gz_avail(state) == -1)
+            return -1;
+        if (strm->avail_in == 0) {
+            gz_error(state, Z_BUF_ERROR, "unexpected end of file");
+            break;
+        }
+
+        /* decompress and handle errors */
+        ret = PREFIX(inflate)(strm, Z_NO_FLUSH);
+        if (ret == Z_STREAM_ERROR || ret == Z_NEED_DICT) {
+            gz_error(state, Z_STREAM_ERROR, "internal error: inflate stream corrupt");
+            return -1;
+        }
+        if (ret == Z_MEM_ERROR) {
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        }
+        if (ret == Z_DATA_ERROR) {              /* deflate stream invalid */
+            gz_error(state, Z_DATA_ERROR, strm->msg == NULL ? "compressed data error" : strm->msg);
+            return -1;
+        }
+    } while (strm->avail_out && ret != Z_STREAM_END);
+
+    /* update available output */
+    state->x.have = had - strm->avail_out;
+    state->x.next = strm->next_out - state->x.have;
+
+    /* if the gzip stream completed successfully, look for another */
+    if (ret == Z_STREAM_END)
+        state->how = LOOK;
+
+    /* good decompression */
+    return 0;
+}
+
+/* Fetch data and put it in the output buffer.  Assumes state->x.have is 0.
+   Data is either copied from the input file or decompressed from the input
+   file depending on state->how.  If state->how is LOOK, then a gzip header is
+   looked for to determine whether to copy or decompress.  Returns -1 on error,
+   otherwise 0.  gz_fetch() will leave state->how as COPY or GZIP unless the
+   end of the input file has been reached and all data has been processed.  */
+static int gz_fetch(gz_state *state) {
+    PREFIX3(stream) *strm = &(state->strm);
+
+    do {
+        switch (state->how) {
+        case LOOK:      /* -> LOOK, COPY (only if never GZIP), or GZIP */
+            if (gz_look(state) == -1)
+                return -1;
+            if (state->how == LOOK)
+                return 0;
+            break;
+        case COPY:      /* -> COPY */
+            if (gz_load(state, state->out, state->size << 1, &(state->x.have))
+                    == -1)
+                return -1;
+            state->x.next = state->out;
+            return 0;
+        case GZIP:      /* -> GZIP or LOOK (if end of gzip stream) */
+            strm->avail_out = state->size << 1;
+            strm->next_out = state->out;
+            if (gz_decomp(state) == -1)
+                return -1;
+        }
+    } while (state->x.have == 0 && (!state->eof || strm->avail_in));
+    return 0;
+}
+
+/* Skip len uncompressed bytes of output.  Return -1 on error, 0 on success. */
+static int gz_skip(gz_state *state, z_off64_t len) {
+    unsigned n;
+
+    /* skip over len bytes or reach end-of-file, whichever comes first */
+    while (len)
+        /* skip over whatever is in output buffer */
+        if (state->x.have) {
+            n = GT_OFF(state->x.have) || (z_off64_t)state->x.have > len ?
+                (unsigned)len : state->x.have;
+            state->x.have -= n;
+            state->x.next += n;
+            state->x.pos += n;
+            len -= n;
+        } else if (state->eof && state->strm.avail_in == 0) {
+            /* output buffer empty -- return if we're at the end of the input */
+            break;
+        } else {
+            /* need more data to skip -- load up output buffer */
+            /* get more output, looking for header if required */
+            if (gz_fetch(state) == -1)
+                return -1;
+        }
+    return 0;
+}
+
+/* Read len bytes into buf from file, or less than len up to the end of the
+   input.  Return the number of bytes read.  If zero is returned, either the
+   end of file was reached, or there was an error.  state->err must be
+   consulted in that case to determine which. */
+static size_t gz_read(gz_state *state, void *buf, size_t len) {
+    size_t got;
+    unsigned n;
+
+    /* if len is zero, avoid unnecessary operations */
+    if (len == 0)
+        return 0;
+
+    /* process a skip request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_skip(state, state->skip) == -1)
+            return 0;
+    }
+
+    /* get len bytes to buf, or less than len if at the end */
+    got = 0;
+    do {
+        /* set n to the maximum amount of len that fits in an unsigned int */
+        n = (unsigned)-1;
+        if (n > len)
+            n = (unsigned)len;
+
+        /* first just try copying data from the output buffer */
+        if (state->x.have) {
+            if (state->x.have < n)
+                n = state->x.have;
+            memcpy(buf, state->x.next, n);
+            state->x.next += n;
+            state->x.have -= n;
+        }
+
+        /* output buffer empty -- return if we're at the end of the input */
+        else if (state->eof && state->strm.avail_in == 0) {
+            state->past = 1;        /* tried to read past end */
+            break;
+        }
+
+        /* need output data -- for small len or new stream load up our output
+           buffer */
+        else if (state->how == LOOK || n < (state->size << 1)) {
+            /* get more output, looking for header if required */
+            if (gz_fetch(state) == -1)
+                return 0;
+            continue;       /* no progress yet -- go back to copy above */
+            /* the copy above assures that we will leave with space in the
+               output buffer, allowing at least one gzungetc() to succeed */
+        }
+
+        /* large len -- read directly into user buffer */
+        else if (state->how == COPY) {      /* read directly */
+            if (gz_load(state, (unsigned char *)buf, n, &n) == -1)
+                return 0;
+        }
+
+        /* large len -- decompress directly into user buffer */
+        else {  /* state->how == GZIP */
+            state->strm.avail_out = n;
+            state->strm.next_out = (unsigned char *)buf;
+            if (gz_decomp(state) == -1)
+                return 0;
+            n = state->x.have;
+            state->x.have = 0;
+        }
+
+        /* update progress */
+        len -= n;
+        buf = (char *)buf + n;
+        got += n;
+        state->x.pos += n;
+    } while (len);
+
+    /* return number of bytes read into user buffer */
+    return got;
+}
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzread)(gzFile file, void *buf, unsigned len) {
+    gz_state *state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_state *)file;
+
+    /* check that we're reading and that there's no (serious) error */
+    if (state->mode != GZ_READ ||
+            (state->err != Z_OK && state->err != Z_BUF_ERROR))
+        return -1;
+
+    /* since an int is returned, make sure len fits in one, otherwise return
+       with an error (this avoids a flaw in the interface) */
+    if ((int)len < 0) {
+        gz_error(state, Z_STREAM_ERROR, "request does not fit in an int");
+        return -1;
+    }
+
+    /* read len or fewer bytes to buf */
+    len = (unsigned)gz_read(state, buf, len);
+
+    /* check for an error */
+    if (len == 0 && state->err != Z_OK && state->err != Z_BUF_ERROR)
+        return -1;
+
+    /* return the number of bytes read (this is assured to fit in an int) */
+    return (int)len;
+}
+
+/* -- see zlib.h -- */
+size_t Z_EXPORT PREFIX(gzfread)(void *buf, size_t size, size_t nitems, gzFile file) {
+    size_t len;
+    gz_state *state;
+
+    /* Exit early if size is zero, also prevents potential division by zero */
+    if (size == 0)
+        return 0;
+
+    /* get internal structure */
+    if (file == NULL)
+        return 0;
+    state = (gz_state *)file;
+
+    /* check that we're reading and that there's no (serious) error */
+    if (state->mode != GZ_READ ||
+            (state->err != Z_OK && state->err != Z_BUF_ERROR))
+        return 0;
+
+    /* compute bytes to read -- error on overflow */
+    if (size && SIZE_MAX / size < nitems) {
+        gz_error(state, Z_STREAM_ERROR, "request does not fit in a size_t");
+        return 0;
+    }
+    len = nitems * size;
+
+    /* read len or fewer bytes to buf, return the number of full items read */
+    return len ? gz_read(state, buf, len) / size : 0;
+}
+
+/* -- see zlib.h -- */
+#undef @ZLIB_SYMBOL_PREFIX@gzgetc
+#undef @ZLIB_SYMBOL_PREFIX@zng_gzgetc
+int Z_EXPORT PREFIX(gzgetc)(gzFile file) {
+    unsigned char buf[1];
+    gz_state *state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_state *)file;
+
+    /* check that we're reading and that there's no (serious) error */
+    if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
+        return -1;
+
+    /* try output buffer (no need to check for skip request) */
+    if (state->x.have) {
+        state->x.have--;
+        state->x.pos++;
+        return *(state->x.next)++;
+    }
+
+    /* nothing there -- try gz_read() */
+    return gz_read(state, buf, 1) < 1 ? -1 : buf[0];
+}
+
+#ifdef ZLIB_COMPAT
+int Z_EXPORT PREFIX(gzgetc_)(gzFile file) {
+    return PREFIX(gzgetc)(file);
+}
+#endif
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzungetc)(int c, gzFile file) {
+    gz_state *state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_state *)file;
+
+    /* in case this was just opened, set up the input buffer */
+    if (state->mode == GZ_READ && state->how == LOOK && state->x.have == 0)
+        (void)gz_look(state);
+
+    /* check that we're reading and that there's no (serious) error */
+    if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
+        return -1;
+
+    /* process a skip request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_skip(state, state->skip) == -1)
+            return -1;
+    }
+
+    /* can't push EOF */
+    if (c < 0)
+        return -1;
+
+    /* if output buffer empty, put byte at end (allows more pushing) */
+    if (state->x.have == 0) {
+        state->x.have = 1;
+        state->x.next = state->out + (state->size << 1) - 1;
+        state->x.next[0] = (unsigned char)c;
+        state->x.pos--;
+        state->past = 0;
+        return c;
+    }
+
+    /* if no room, give up (must have already done a gzungetc()) */
+    if (state->x.have == (state->size << 1)) {
+        gz_error(state, Z_DATA_ERROR, "out of room to push characters");
+        return -1;
+    }
+
+    /* slide output data if needed and insert byte before existing data */
+    if (state->x.next == state->out) {
+        unsigned char *src = state->out + state->x.have;
+        unsigned char *dest = state->out + (state->size << 1);
+        while (src > state->out)
+            *--dest = *--src;
+        state->x.next = dest;
+    }
+    state->x.have++;
+    state->x.next--;
+    state->x.next[0] = (unsigned char)c;
+    state->x.pos--;
+    state->past = 0;
+    return c;
+}
+
+/* -- see zlib.h -- */
+char * Z_EXPORT PREFIX(gzgets)(gzFile file, char *buf, int len) {
+    unsigned left, n;
+    char *str;
+    unsigned char *eol;
+    gz_state *state;
+
+    /* check parameters and get internal structure */
+    if (file == NULL || buf == NULL || len < 1)
+        return NULL;
+    state = (gz_state *)file;
+
+    /* check that we're reading and that there's no (serious) error */
+    if (state->mode != GZ_READ || (state->err != Z_OK && state->err != Z_BUF_ERROR))
+        return NULL;
+
+    /* process a skip request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_skip(state, state->skip) == -1)
+            return NULL;
+    }
+
+    /* copy output bytes up to new line or len - 1, whichever comes first --
+       append a terminating zero to the string (we don't check for a zero in
+       the contents, let the user worry about that) */
+    str = buf;
+    left = (unsigned)len - 1;
+    if (left) {
+        do {
+            /* assure that something is in the output buffer */
+            if (state->x.have == 0 && gz_fetch(state) == -1)
+                return NULL;                /* error */
+            if (state->x.have == 0) {       /* end of file */
+                state->past = 1;            /* read past end */
+                break;                      /* return what we have */
+            }
+
+            /* look for end-of-line in current output buffer */
+            n = state->x.have > left ? left : state->x.have;
+            eol = (unsigned char *)memchr(state->x.next, '\n', n);
+            if (eol != NULL)
+                n = (unsigned)(eol - state->x.next) + 1;
+
+            /* copy through end-of-line, or remainder if not found */
+            memcpy(buf, state->x.next, n);
+            state->x.have -= n;
+            state->x.next += n;
+            state->x.pos += n;
+            left -= n;
+            buf += n;
+        } while (left && eol == NULL);
+    }
+
+    /* return terminated string, or if nothing, end of file */
+    if (buf == str)
+        return NULL;
+    buf[0] = 0;
+    return str;
+}
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzdirect)(gzFile file) {
+    gz_state *state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return 0;
+
+    state = (gz_state *)file;
+
+    /* if the state is not known, but we can find out, then do so (this is
+       mainly for right after a gzopen() or gzdopen()) */
+    if (state->mode == GZ_READ && state->how == LOOK && state->x.have == 0)
+        (void)gz_look(state);
+
+    /* return 1 if transparent, 0 if processing a gzip stream */
+    return state->direct;
+}
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzclose_r)(gzFile file) {
+    int ret, err;
+    gz_state *state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+
+    state = (gz_state *)file;
+
+    /* check that we're reading */
+    if (state->mode != GZ_READ)
+        return Z_STREAM_ERROR;
+
+    /* free memory and close file */
+    if (state->size) {
+        PREFIX(inflateEnd)(&(state->strm));
+        zng_free(state->out);
+        zng_free(state->in);
+    }
+    err = state->err == Z_BUF_ERROR ? Z_BUF_ERROR : Z_OK;
+    gz_error(state, Z_OK, NULL);
+    free(state->path);
+    ret = close(state->fd);
+    zng_free(state);
+    return ret ? Z_ERRNO : err;
+}
diff --git a/3rdparty/zlib-ng/gzwrite.c b/3rdparty/zlib-ng/gzwrite.c
new file mode 100644
index 000000000000..08e0ce9aab2a
--- /dev/null
+++ b/3rdparty/zlib-ng/gzwrite.c
@@ -0,0 +1,526 @@
+/* gzwrite.c -- zlib functions for writing gzip files
+ * Copyright (C) 2004-2019 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil_p.h"
+#include <stdarg.h>
+#include "gzguts.h"
+
+/* Local functions */
+static int gz_init(gz_state *);
+static int gz_comp(gz_state *, int);
+static int gz_zero(gz_state *, z_off64_t);
+static size_t gz_write(gz_state *, void const *, size_t);
+
+/* Initialize state for writing a gzip file.  Mark initialization by setting
+   state->size to non-zero.  Return -1 on a memory allocation failure, or 0 on
+   success. */
+static int gz_init(gz_state *state) {
+    int ret;
+    PREFIX3(stream) *strm = &(state->strm);
+
+    /* allocate input buffer (double size for gzprintf) */
+    state->in = (unsigned char *)zng_alloc(state->want << 1);
+    if (state->in == NULL) {
+        gz_error(state, Z_MEM_ERROR, "out of memory");
+        return -1;
+    }
+    memset(state->in, 0, state->want << 1);
+
+    /* only need output buffer and deflate state if compressing */
+    if (!state->direct) {
+        /* allocate output buffer */
+        state->out = (unsigned char *)zng_alloc(state->want);
+        if (state->out == NULL) {
+            zng_free(state->in);
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        }
+
+        /* allocate deflate memory, set up for gzip compression */
+        strm->zalloc = NULL;
+        strm->zfree = NULL;
+        strm->opaque = NULL;
+        ret = PREFIX(deflateInit2)(strm, state->level, Z_DEFLATED, MAX_WBITS + 16, DEF_MEM_LEVEL, state->strategy);
+        if (ret != Z_OK) {
+            zng_free(state->out);
+            zng_free(state->in);
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        }
+        strm->next_in = NULL;
+    }
+
+    /* mark state as initialized */
+    state->size = state->want;
+
+    /* initialize write buffer if compressing */
+    if (!state->direct) {
+        strm->avail_out = state->size;
+        strm->next_out = state->out;
+        state->x.next = strm->next_out;
+    }
+    return 0;
+}
+
+/* Compress whatever is at avail_in and next_in and write to the output file.
+   Return -1 if there is an error writing to the output file or if gz_init()
+   fails to allocate memory, otherwise 0.  flush is assumed to be a valid
+   deflate() flush value.  If flush is Z_FINISH, then the deflate() state is
+   reset to start a new gzip stream.  If gz->direct is true, then simply write
+   to the output file without compressing, and ignore flush. */
+static int gz_comp(gz_state *state, int flush) {
+    int ret;
+    ssize_t got;
+    unsigned have;
+    PREFIX3(stream) *strm = &(state->strm);
+
+    /* allocate memory if this is the first time through */
+    if (state->size == 0 && gz_init(state) == -1)
+        return -1;
+
+    /* write directly if requested */
+    if (state->direct) {
+        got = write(state->fd, strm->next_in, strm->avail_in);
+        if (got < 0 || (unsigned)got != strm->avail_in) {
+            gz_error(state, Z_ERRNO, zstrerror());
+            return -1;
+        }
+        strm->avail_in = 0;
+        return 0;
+    }
+
+    /* check for a pending reset */
+    if (state->reset) {
+        /* don't start a new gzip member unless there is data to write */
+        if (strm->avail_in == 0)
+            return 0;
+        PREFIX(deflateReset)(strm);
+        state->reset = 0;
+    }
+
+    /* run deflate() on provided input until it produces no more output */
+    ret = Z_OK;
+    do {
+        /* write out current buffer contents if full, or if flushing, but if
+           doing Z_FINISH then don't write until we get to Z_STREAM_END */
+        if (strm->avail_out == 0 || (flush != Z_NO_FLUSH && (flush != Z_FINISH || ret == Z_STREAM_END))) {
+            have = (unsigned)(strm->next_out - state->x.next);
+            if (have && ((got = write(state->fd, state->x.next, (unsigned long)have)) < 0 || (unsigned)got != have)) {
+                gz_error(state, Z_ERRNO, zstrerror());
+                return -1;
+            }
+            if (strm->avail_out == 0) {
+                strm->avail_out = state->size;
+                strm->next_out = state->out;
+                state->x.next = state->out;
+            }
+            state->x.next = strm->next_out;
+        }
+
+        /* compress */
+        have = strm->avail_out;
+        ret = PREFIX(deflate)(strm, flush);
+        if (ret == Z_STREAM_ERROR) {
+            gz_error(state, Z_STREAM_ERROR, "internal error: deflate stream corrupt");
+            return -1;
+        }
+        have -= strm->avail_out;
+    } while (have);
+
+    /* if that completed a deflate stream, allow another to start */
+    if (flush == Z_FINISH)
+        state->reset = 1;
+    /* all done, no errors */
+    return 0;
+}
+
+/* Compress len zeros to output.  Return -1 on a write error or memory
+   allocation failure by gz_comp(), or 0 on success. */
+static int gz_zero(gz_state *state, z_off64_t len) {
+    int first;
+    unsigned n;
+    PREFIX3(stream) *strm = &(state->strm);
+
+    /* consume whatever's left in the input buffer */
+    if (strm->avail_in && gz_comp(state, Z_NO_FLUSH) == -1)
+        return -1;
+
+    /* compress len zeros (len guaranteed > 0) */
+    first = 1;
+    while (len) {
+        n = GT_OFF(state->size) || (z_off64_t)state->size > len ? (unsigned)len : state->size;
+        if (first) {
+            memset(state->in, 0, n);
+            first = 0;
+        }
+        strm->avail_in = n;
+        strm->next_in = state->in;
+        state->x.pos += n;
+        if (gz_comp(state, Z_NO_FLUSH) == -1)
+            return -1;
+        len -= n;
+    }
+    return 0;
+}
+
+/* Write len bytes from buf to file.  Return the number of bytes written.  If
+   the returned value is less than len, then there was an error. */
+static size_t gz_write(gz_state *state, void const *buf, size_t len) {
+    size_t put = len;
+
+    /* if len is zero, avoid unnecessary operations */
+    if (len == 0)
+        return 0;
+
+    /* allocate memory if this is the first time through */
+    if (state->size == 0 && gz_init(state) == -1)
+        return 0;
+
+    /* check for seek request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_zero(state, state->skip) == -1)
+            return 0;
+    }
+
+    /* for small len, copy to input buffer, otherwise compress directly */
+    if (len < state->size) {
+        /* copy to input buffer, compress when full */
+        do {
+            unsigned have, copy;
+
+            if (state->strm.avail_in == 0)
+                state->strm.next_in = state->in;
+            have = (unsigned)((state->strm.next_in + state->strm.avail_in) -
+                              state->in);
+            copy = state->size - have;
+            if (copy > len)
+                copy = (unsigned)len;
+            memcpy(state->in + have, buf, copy);
+            state->strm.avail_in += copy;
+            state->x.pos += copy;
+            buf = (const char *)buf + copy;
+            len -= copy;
+            if (len && gz_comp(state, Z_NO_FLUSH) == -1)
+                return 0;
+        } while (len);
+    } else {
+        /* consume whatever's left in the input buffer */
+        if (state->strm.avail_in && gz_comp(state, Z_NO_FLUSH) == -1)
+            return 0;
+
+        /* directly compress user buffer to file */
+        state->strm.next_in = (z_const unsigned char *) buf;
+        do {
+            unsigned n = (unsigned)-1;
+            if (n > len)
+                n = (unsigned)len;
+            state->strm.avail_in = n;
+            state->x.pos += n;
+            if (gz_comp(state, Z_NO_FLUSH) == -1)
+                return 0;
+            len -= n;
+        } while (len);
+    }
+
+    /* input was all buffered or compressed */
+    return put;
+}
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzwrite)(gzFile file, void const *buf, unsigned len) {
+    gz_state *state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return 0;
+    state = (gz_state *)file;
+
+    /* check that we're writing and that there's no error */
+    if (state->mode != GZ_WRITE || state->err != Z_OK)
+        return 0;
+
+    /* since an int is returned, make sure len fits in one, otherwise return
+       with an error (this avoids a flaw in the interface) */
+    if ((int)len < 0) {
+        gz_error(state, Z_DATA_ERROR, "requested length does not fit in int");
+        return 0;
+    }
+
+    /* write len bytes from buf (the return value will fit in an int) */
+    return (int)gz_write(state, buf, len);
+}
+
+/* -- see zlib.h -- */
+size_t Z_EXPORT PREFIX(gzfwrite)(void const *buf, size_t size, size_t nitems, gzFile file) {
+    size_t len;
+    gz_state *state;
+
+    /* Exit early if size is zero, also prevents potential division by zero */
+    if (size == 0)
+        return 0;
+
+    /* get internal structure */
+    if (file == NULL)
+        return 0;
+    state = (gz_state *)file;
+
+    /* check that we're writing and that there's no error */
+    if (state->mode != GZ_WRITE || state->err != Z_OK)
+        return 0;
+
+    /* compute bytes to read -- error on overflow */
+    len = nitems * size;
+    if (len / size != nitems) {
+        gz_error(state, Z_STREAM_ERROR, "request does not fit in a size_t");
+        return 0;
+    }
+
+    /* write len bytes to buf, return the number of full items written */
+    return len ? gz_write(state, buf, len) / size : 0;
+}
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzputc)(gzFile file, int c) {
+    unsigned have;
+    unsigned char buf[1];
+    gz_state *state;
+    PREFIX3(stream) *strm;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_state *)file;
+    strm = &(state->strm);
+
+    /* check that we're writing and that there's no error */
+    if (state->mode != GZ_WRITE || state->err != Z_OK)
+        return -1;
+
+    /* check for seek request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_zero(state, state->skip) == -1)
+            return -1;
+    }
+
+    /* try writing to input buffer for speed (state->size == 0 if buffer not
+       initialized) */
+    if (state->size) {
+        if (strm->avail_in == 0)
+            strm->next_in = state->in;
+        have = (unsigned)((strm->next_in + strm->avail_in) - state->in);
+        if (have < state->size) {
+            state->in[have] = (unsigned char)c;
+            strm->avail_in++;
+            state->x.pos++;
+            return c & 0xff;
+        }
+    }
+
+    /* no room in buffer or not initialized, use gz_write() */
+    buf[0] = (unsigned char)c;
+    if (gz_write(state, buf, 1) != 1)
+        return -1;
+    return c & 0xff;
+}
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzputs)(gzFile file, const char *s) {
+    size_t len, put;
+    gz_state *state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_state *)file;
+
+    /* check that we're writing and that there's no error */
+    if (state->mode != GZ_WRITE || state->err != Z_OK)
+        return -1;
+
+    /* write string */
+    len = strlen(s);
+    if ((int)len < 0 || (unsigned)len != len) {
+        gz_error(state, Z_STREAM_ERROR, "string length does not fit in int");
+        return -1;
+    }
+    put = gz_write(state, s, len);
+    return put < len ? -1 : (int)len;
+}
+
+/* -- see zlib.h -- */
+int Z_EXPORTVA PREFIX(gzvprintf)(gzFile file, const char *format, va_list va) {
+    int len;
+    unsigned left;
+    char *next;
+    gz_state *state;
+    PREFIX3(stream) *strm;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_state *)file;
+    strm = &(state->strm);
+
+    /* check that we're writing and that there's no error */
+    if (state->mode != GZ_WRITE || state->err != Z_OK)
+        return Z_STREAM_ERROR;
+
+    /* make sure we have some buffer space */
+    if (state->size == 0 && gz_init(state) == -1)
+        return state->err;
+
+    /* check for seek request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_zero(state, state->skip) == -1)
+            return state->err;
+    }
+
+    /* do the printf() into the input buffer, put length in len -- the input
+       buffer is double-sized just for this function, so there is guaranteed to
+       be state->size bytes available after the current contents */
+    if (strm->avail_in == 0)
+        strm->next_in = state->in;
+    next = (char *)(state->in + (strm->next_in - state->in) + strm->avail_in);
+    next[state->size - 1] = 0;
+    len = vsnprintf(next, state->size, format, va);
+
+    /* check that printf() results fit in buffer */
+    if (len == 0 || (unsigned)len >= state->size || next[state->size - 1] != 0)
+        return 0;
+
+    /* update buffer and position, compress first half if past that */
+    strm->avail_in += (unsigned)len;
+    state->x.pos += len;
+    if (strm->avail_in >= state->size) {
+        left = strm->avail_in - state->size;
+        strm->avail_in = state->size;
+        if (gz_comp(state, Z_NO_FLUSH) == -1)
+            return state->err;
+        memmove(state->in, state->in + state->size, left);
+        strm->next_in = state->in;
+        strm->avail_in = left;
+    }
+    return len;
+}
+
+int Z_EXPORTVA PREFIX(gzprintf)(gzFile file, const char *format, ...) {
+    va_list va;
+    int ret;
+
+    va_start(va, format);
+    ret = PREFIX(gzvprintf)(file, format, va);
+    va_end(va);
+    return ret;
+}
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzflush)(gzFile file, int flush) {
+    gz_state *state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_state *)file;
+
+    /* check that we're writing and that there's no error */
+    if (state->mode != GZ_WRITE || state->err != Z_OK)
+        return Z_STREAM_ERROR;
+
+    /* check flush parameter */
+    if (flush < 0 || flush > Z_FINISH)
+        return Z_STREAM_ERROR;
+
+    /* check for seek request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_zero(state, state->skip) == -1)
+            return state->err;
+    }
+
+    /* compress remaining data with requested flush */
+    (void)gz_comp(state, flush);
+    return state->err;
+}
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzsetparams)(gzFile file, int level, int strategy) {
+    gz_state *state;
+    PREFIX3(stream) *strm;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_state *)file;
+    strm = &(state->strm);
+
+    /* check that we're writing and that there's no error */
+    if (state->mode != GZ_WRITE || state->err != Z_OK || state->direct)
+        return Z_STREAM_ERROR;
+
+    /* if no change is requested, then do nothing */
+    if (level == state->level && strategy == state->strategy)
+        return Z_OK;
+
+    /* check for seek request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_zero(state, state->skip) == -1)
+            return state->err;
+    }
+
+    /* change compression parameters for subsequent input */
+    if (state->size) {
+        /* flush previous input with previous parameters before changing */
+        if (strm->avail_in && gz_comp(state, Z_BLOCK) == -1)
+            return state->err;
+        PREFIX(deflateParams)(strm, level, strategy);
+    }
+    state->level = level;
+    state->strategy = strategy;
+    return Z_OK;
+}
+
+/* -- see zlib.h -- */
+int Z_EXPORT PREFIX(gzclose_w)(gzFile file) {
+    int ret = Z_OK;
+    gz_state *state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_state *)file;
+
+    /* check that we're writing */
+    if (state->mode != GZ_WRITE)
+        return Z_STREAM_ERROR;
+
+    /* check for seek request */
+    if (state->seek) {
+        state->seek = 0;
+        if (gz_zero(state, state->skip) == -1)
+            ret = state->err;
+    }
+
+    /* flush, free memory, and close file */
+    if (gz_comp(state, Z_FINISH) == -1)
+        ret = state->err;
+    if (state->size) {
+        if (!state->direct) {
+            (void)PREFIX(deflateEnd)(&(state->strm));
+            zng_free(state->out);
+        }
+        zng_free(state->in);
+    }
+    gz_error(state, Z_OK, NULL);
+    free(state->path);
+    if (close(state->fd) == -1)
+        ret = Z_ERRNO;
+    zng_free(state);
+    return ret;
+}
diff --git a/3rdparty/zlib-ng/infback.c b/3rdparty/zlib-ng/infback.c
new file mode 100644
index 000000000000..9f5042b4d3dc
--- /dev/null
+++ b/3rdparty/zlib-ng/infback.c
@@ -0,0 +1,511 @@
+/* infback.c -- inflate using a call-back interface
+ * Copyright (C) 1995-2022 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/*
+   This code is largely copied from inflate.c.  Normally either infback.o or
+   inflate.o would be linked into an application--not both.  The interface
+   with inffast.c is retained so that optimized assembler-coded versions of
+   inflate_fast() can be used with either inflate.c or infback.c.
+ */
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "inflate_p.h"
+#include "functable.h"
+
+/* Avoid conflicts with zlib.h macros */
+#ifdef ZLIB_COMPAT
+# undef inflateBackInit
+#endif
+
+/*
+   strm provides memory allocation functions in zalloc and zfree, or
+   NULL to use the library memory allocation functions.
+
+   windowBits is in the range 8..15, and window is a user-supplied
+   window and output buffer that is 2**windowBits bytes.
+
+   This function is hidden in ZLIB_COMPAT builds.
+ */
+int32_t ZNG_CONDEXPORT PREFIX(inflateBackInit)(PREFIX3(stream) *strm, int32_t windowBits, uint8_t *window) {
+    struct inflate_state *state;
+
+    if (strm == NULL || window == NULL || windowBits < MIN_WBITS || windowBits > MAX_WBITS)
+        return Z_STREAM_ERROR;
+    strm->msg = NULL;                   /* in case we return an error */
+    if (strm->zalloc == NULL) {
+        strm->zalloc = PREFIX(zcalloc);
+        strm->opaque = NULL;
+    }
+    if (strm->zfree == NULL)
+        strm->zfree = PREFIX(zcfree);
+    state = ZALLOC_INFLATE_STATE(strm);
+    if (state == NULL)
+        return Z_MEM_ERROR;
+    Tracev((stderr, "inflate: allocated\n"));
+    strm->state = (struct internal_state *)state;
+    state->dmax = 32768U;
+    state->wbits = (unsigned int)windowBits;
+    state->wsize = 1U << windowBits;
+    state->window = window;
+    state->wnext = 0;
+    state->whave = 0;
+    state->sane = 1;
+    state->chunksize = functable.chunksize();
+    return Z_OK;
+}
+
+/* Function used by zlib.h and zlib-ng version 2.0 macros */
+int32_t Z_EXPORT PREFIX(inflateBackInit_)(PREFIX3(stream) *strm, int32_t windowBits, uint8_t *window,
+                              const char *version, int32_t stream_size) {
+    if (CHECK_VER_STSIZE(version, stream_size))
+        return Z_VERSION_ERROR;
+    return PREFIX(inflateBackInit)(strm, windowBits, window);
+}
+
+/*
+   Private macros for inflateBack()
+   Look in inflate_p.h for macros shared with inflate()
+*/
+
+/* Assure that some input is available.  If input is requested, but denied,
+   then return a Z_BUF_ERROR from inflateBack(). */
+#define PULL() \
+    do { \
+        if (have == 0) { \
+            have = in(in_desc, &next); \
+            if (have == 0) { \
+                next = NULL; \
+                ret = Z_BUF_ERROR; \
+                goto inf_leave; \
+            } \
+        } \
+    } while (0)
+
+/* Get a byte of input into the bit accumulator, or return from inflateBack()
+   with an error if there is no input available. */
+#define PULLBYTE() \
+    do { \
+        PULL(); \
+        have--; \
+        hold += ((unsigned)(*next++) << bits); \
+        bits += 8; \
+    } while (0)
+
+/* Assure that some output space is available, by writing out the window
+   if it's full.  If the write fails, return from inflateBack() with a
+   Z_BUF_ERROR. */
+#define ROOM() \
+    do { \
+        if (left == 0) { \
+            put = state->window; \
+            left = state->wsize; \
+            state->whave = left; \
+            if (out(out_desc, put, left)) { \
+                ret = Z_BUF_ERROR; \
+                goto inf_leave; \
+            } \
+        } \
+    } while (0)
+
+/*
+   strm provides the memory allocation functions and window buffer on input,
+   and provides information on the unused input on return.  For Z_DATA_ERROR
+   returns, strm will also provide an error message.
+
+   in() and out() are the call-back input and output functions.  When
+   inflateBack() needs more input, it calls in().  When inflateBack() has
+   filled the window with output, or when it completes with data in the
+   window, it calls out() to write out the data.  The application must not
+   change the provided input until in() is called again or inflateBack()
+   returns.  The application must not change the window/output buffer until
+   inflateBack() returns.
+
+   in() and out() are called with a descriptor parameter provided in the
+   inflateBack() call.  This parameter can be a structure that provides the
+   information required to do the read or write, as well as accumulated
+   information on the input and output such as totals and check values.
+
+   in() should return zero on failure.  out() should return non-zero on
+   failure.  If either in() or out() fails, than inflateBack() returns a
+   Z_BUF_ERROR.  strm->next_in can be checked for NULL to see whether it
+   was in() or out() that caused in the error.  Otherwise, inflateBack()
+   returns Z_STREAM_END on success, Z_DATA_ERROR for an deflate format
+   error, or Z_MEM_ERROR if it could not allocate memory for the state.
+   inflateBack() can also return Z_STREAM_ERROR if the input parameters
+   are not correct, i.e. strm is NULL or the state was not initialized.
+ */
+int32_t Z_EXPORT PREFIX(inflateBack)(PREFIX3(stream) *strm, in_func in, void *in_desc, out_func out, void *out_desc) {
+    struct inflate_state *state;
+    z_const unsigned char *next; /* next input */
+    unsigned char *put;          /* next output */
+    unsigned have, left;         /* available input and output */
+    uint32_t hold;               /* bit buffer */
+    unsigned bits;               /* bits in bit buffer */
+    unsigned copy;               /* number of stored or match bytes to copy */
+    unsigned char *from;         /* where to copy match bytes from */
+    code here;                   /* current decoding table entry */
+    code last;                   /* parent table entry */
+    unsigned len;                /* length to copy for repeats, bits to drop */
+    int32_t ret;                 /* return code */
+    static const uint16_t order[19] = /* permutation of code lengths */
+        {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+    /* Check that the strm exists and that the state was initialized */
+    if (strm == NULL || strm->state == NULL)
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+
+    /* Reset the state */
+    strm->msg = NULL;
+    state->mode = TYPE;
+    state->last = 0;
+    state->whave = 0;
+    next = strm->next_in;
+    have = next != NULL ? strm->avail_in : 0;
+    hold = 0;
+    bits = 0;
+    put = state->window;
+    left = state->wsize;
+
+    /* Inflate until end of block marked as last */
+    for (;;)
+        switch (state->mode) {
+        case TYPE:
+            /* determine and dispatch block type */
+            if (state->last) {
+                BYTEBITS();
+                state->mode = DONE;
+                break;
+            }
+            NEEDBITS(3);
+            state->last = BITS(1);
+            DROPBITS(1);
+            switch (BITS(2)) {
+            case 0:                             /* stored block */
+                Tracev((stderr, "inflate:     stored block%s\n", state->last ? " (last)" : ""));
+                state->mode = STORED;
+                break;
+            case 1:                             /* fixed block */
+                PREFIX(fixedtables)(state);
+                Tracev((stderr, "inflate:     fixed codes block%s\n", state->last ? " (last)" : ""));
+                state->mode = LEN;              /* decode codes */
+                break;
+            case 2:                             /* dynamic block */
+                Tracev((stderr, "inflate:     dynamic codes block%s\n", state->last ? " (last)" : ""));
+                state->mode = TABLE;
+                break;
+            case 3:
+                SET_BAD("invalid block type");
+            }
+            DROPBITS(2);
+            break;
+
+        case STORED:
+            /* get and verify stored block length */
+            BYTEBITS();                         /* go to byte boundary */
+            NEEDBITS(32);
+            if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) {
+                SET_BAD("invalid stored block lengths");
+                break;
+            }
+            state->length = (uint16_t)hold;
+            Tracev((stderr, "inflate:       stored length %u\n", state->length));
+            INITBITS();
+
+            /* copy stored block from input to output */
+            while (state->length != 0) {
+                copy = state->length;
+                PULL();
+                ROOM();
+                copy = MIN(copy, have);
+                copy = MIN(copy, left);
+                memcpy(put, next, copy);
+                have -= copy;
+                next += copy;
+                left -= copy;
+                put += copy;
+                state->length -= copy;
+            }
+            Tracev((stderr, "inflate:       stored end\n"));
+            state->mode = TYPE;
+            break;
+
+        case TABLE:
+            /* get dynamic table entries descriptor */
+            NEEDBITS(14);
+            state->nlen = BITS(5) + 257;
+            DROPBITS(5);
+            state->ndist = BITS(5) + 1;
+            DROPBITS(5);
+            state->ncode = BITS(4) + 4;
+            DROPBITS(4);
+#ifndef PKZIP_BUG_WORKAROUND
+            if (state->nlen > 286 || state->ndist > 30) {
+                SET_BAD("too many length or distance symbols");
+                break;
+            }
+#endif
+            Tracev((stderr, "inflate:       table sizes ok\n"));
+            state->have = 0;
+
+            /* get code length code lengths (not a typo) */
+            while (state->have < state->ncode) {
+                NEEDBITS(3);
+                state->lens[order[state->have++]] = (uint16_t)BITS(3);
+                DROPBITS(3);
+            }
+            while (state->have < 19)
+                state->lens[order[state->have++]] = 0;
+            state->next = state->codes;
+            state->lencode = (const code *)(state->next);
+            state->lenbits = 7;
+            ret = zng_inflate_table(CODES, state->lens, 19, &(state->next), &(state->lenbits), state->work);
+            if (ret) {
+                SET_BAD("invalid code lengths set");
+                break;
+            }
+            Tracev((stderr, "inflate:       code lengths ok\n"));
+            state->have = 0;
+
+            /* get length and distance code code lengths */
+            while (state->have < state->nlen + state->ndist) {
+                for (;;) {
+                    here = state->lencode[BITS(state->lenbits)];
+                    if (here.bits <= bits) break;
+                    PULLBYTE();
+                }
+                if (here.val < 16) {
+                    DROPBITS(here.bits);
+                    state->lens[state->have++] = here.val;
+                } else {
+                    if (here.val == 16) {
+                        NEEDBITS(here.bits + 2);
+                        DROPBITS(here.bits);
+                        if (state->have == 0) {
+                            SET_BAD("invalid bit length repeat");
+                            break;
+                        }
+                        len = state->lens[state->have - 1];
+                        copy = 3 + BITS(2);
+                        DROPBITS(2);
+                    } else if (here.val == 17) {
+                        NEEDBITS(here.bits + 3);
+                        DROPBITS(here.bits);
+                        len = 0;
+                        copy = 3 + BITS(3);
+                        DROPBITS(3);
+                    } else {
+                        NEEDBITS(here.bits + 7);
+                        DROPBITS(here.bits);
+                        len = 0;
+                        copy = 11 + BITS(7);
+                        DROPBITS(7);
+                    }
+                    if (state->have + copy > state->nlen + state->ndist) {
+                        SET_BAD("invalid bit length repeat");
+                        break;
+                    }
+                    while (copy) {
+                        --copy;
+                        state->lens[state->have++] = (uint16_t)len;
+                    }
+                }
+            }
+
+            /* handle error breaks in while */
+            if (state->mode == BAD)
+                break;
+
+            /* check for end-of-block code (better have one) */
+            if (state->lens[256] == 0) {
+                SET_BAD("invalid code -- missing end-of-block");
+                break;
+            }
+
+            /* build code tables -- note: do not change the lenbits or distbits
+               values here (10 and 9) without reading the comments in inftrees.h
+               concerning the ENOUGH constants, which depend on those values */
+            state->next = state->codes;
+            state->lencode = (const code *)(state->next);
+            state->lenbits = 10;
+            ret = zng_inflate_table(LENS, state->lens, state->nlen, &(state->next), &(state->lenbits), state->work);
+            if (ret) {
+                SET_BAD("invalid literal/lengths set");
+                break;
+            }
+            state->distcode = (const code *)(state->next);
+            state->distbits = 9;
+            ret = zng_inflate_table(DISTS, state->lens + state->nlen, state->ndist,
+                                &(state->next), &(state->distbits), state->work);
+            if (ret) {
+                SET_BAD("invalid distances set");
+                break;
+            }
+            Tracev((stderr, "inflate:       codes ok\n"));
+            state->mode = LEN;
+            Z_FALLTHROUGH;
+
+        case LEN:
+            /* use inflate_fast() if we have enough input and output */
+            if (have >= INFLATE_FAST_MIN_HAVE &&
+                left >= INFLATE_FAST_MIN_LEFT) {
+                RESTORE();
+                if (state->whave < state->wsize)
+                    state->whave = state->wsize - left;
+                functable.inflate_fast(strm, state->wsize);
+                LOAD();
+                break;
+            }
+
+            /* get a literal, length, or end-of-block code */
+            for (;;) {
+                here = state->lencode[BITS(state->lenbits)];
+                if (here.bits <= bits)
+                    break;
+                PULLBYTE();
+            }
+            if (here.op && (here.op & 0xf0) == 0) {
+                last = here;
+                for (;;) {
+                    here = state->lencode[last.val + (BITS(last.bits + last.op) >> last.bits)];
+                    if ((unsigned)last.bits + (unsigned)here.bits <= bits)
+                        break;
+                    PULLBYTE();
+                }
+                DROPBITS(last.bits);
+            }
+            DROPBITS(here.bits);
+            state->length = here.val;
+
+            /* process literal */
+            if ((int)(here.op) == 0) {
+                Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?
+                        "inflate:         literal '%c'\n" :
+                        "inflate:         literal 0x%02x\n", here.val));
+                ROOM();
+                *put++ = (unsigned char)(state->length);
+                left--;
+                state->mode = LEN;
+                break;
+            }
+
+            /* process end of block */
+            if (here.op & 32) {
+                Tracevv((stderr, "inflate:         end of block\n"));
+                state->mode = TYPE;
+                break;
+            }
+
+            /* invalid code */
+            if (here.op & 64) {
+                SET_BAD("invalid literal/length code");
+                break;
+            }
+
+            /* length code -- get extra bits, if any */
+            state->extra = (here.op & MAX_BITS);
+            if (state->extra) {
+                NEEDBITS(state->extra);
+                state->length += BITS(state->extra);
+                DROPBITS(state->extra);
+            }
+            Tracevv((stderr, "inflate:         length %u\n", state->length));
+
+            /* get distance code */
+            for (;;) {
+                here = state->distcode[BITS(state->distbits)];
+                if (here.bits <= bits)
+                    break;
+                PULLBYTE();
+            }
+            if ((here.op & 0xf0) == 0) {
+                last = here;
+                for (;;) {
+                    here = state->distcode[last.val + (BITS(last.bits + last.op) >> last.bits)];
+                    if ((unsigned)last.bits + (unsigned)here.bits <= bits)
+                        break;
+                    PULLBYTE();
+                }
+                DROPBITS(last.bits);
+            }
+            DROPBITS(here.bits);
+            if (here.op & 64) {
+                SET_BAD("invalid distance code");
+                break;
+            }
+            state->offset = here.val;
+            state->extra = (here.op & MAX_BITS);
+
+            /* get distance extra bits, if any */
+            if (state->extra) {
+                NEEDBITS(state->extra);
+                state->offset += BITS(state->extra);
+                DROPBITS(state->extra);
+            }
+#ifdef INFLATE_STRICT
+            if (state->offset > state->wsize - (state->whave < state->wsize ? left : 0)) {
+                SET_BAD("invalid distance too far back");
+                break;
+            }
+#endif
+            Tracevv((stderr, "inflate:         distance %u\n", state->offset));
+
+            /* copy match from window to output */
+            do {
+                ROOM();
+                copy = state->wsize - state->offset;
+                if (copy < left) {
+                    from = put + copy;
+                    copy = left - copy;
+                } else {
+                    from = put - state->offset;
+                    copy = left;
+                }
+                copy = MIN(copy, state->length);
+                state->length -= copy;
+                left -= copy;
+                do {
+                    *put++ = *from++;
+                } while (--copy);
+            } while (state->length != 0);
+            break;
+
+        case DONE:
+            /* inflate stream terminated properly */
+            ret = Z_STREAM_END;
+            goto inf_leave;
+
+        case BAD:
+            ret = Z_DATA_ERROR;
+            goto inf_leave;
+
+        default:                /* can't happen, but makes compilers happy */
+            ret = Z_STREAM_ERROR;
+            goto inf_leave;
+        }
+
+    /* Write leftover output and return unused input */
+  inf_leave:
+    if (left < state->wsize) {
+        if (out(out_desc, state->window, state->wsize - left) && (ret == Z_STREAM_END)) {
+            ret = Z_BUF_ERROR;
+        }
+    }
+    strm->next_in = next;
+    strm->avail_in = have;
+    return ret;
+}
+
+int32_t Z_EXPORT PREFIX(inflateBackEnd)(PREFIX3(stream) *strm) {
+    if (strm == NULL || strm->state == NULL || strm->zfree == NULL)
+        return Z_STREAM_ERROR;
+    ZFREE_STATE(strm, strm->state);
+    strm->state = NULL;
+    Tracev((stderr, "inflate: end\n"));
+    return Z_OK;
+}
diff --git a/3rdparty/zlib-ng/inffast_tpl.h b/3rdparty/zlib-ng/inffast_tpl.h
new file mode 100644
index 000000000000..9ddd187d8473
--- /dev/null
+++ b/3rdparty/zlib-ng/inffast_tpl.h
@@ -0,0 +1,326 @@
+/* inffast.c -- fast decoding
+ * Copyright (C) 1995-2017 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zendian.h"
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "inflate_p.h"
+#include "functable.h"
+
+/*
+   Decode literal, length, and distance codes and write out the resulting
+   literal and match bytes until either not enough input or output is
+   available, an end-of-block is encountered, or a data error is encountered.
+   When large enough input and output buffers are supplied to inflate(), for
+   example, a 16K input buffer and a 64K output buffer, more than 95% of the
+   inflate execution time is spent in this routine.
+
+   Entry assumptions:
+
+        state->mode == LEN
+        strm->avail_in >= INFLATE_FAST_MIN_HAVE
+        strm->avail_out >= INFLATE_FAST_MIN_LEFT
+        start >= strm->avail_out
+        state->bits < 8
+
+   On return, state->mode is one of:
+
+        LEN -- ran out of enough output space or enough available input
+        TYPE -- reached end of block code, inflate() to interpret next block
+        BAD -- error in block data
+
+   Notes:
+
+    - The maximum input bits used by a length/distance pair is 15 bits for the
+      length code, 5 bits for the length extra, 15 bits for the distance code,
+      and 13 bits for the distance extra.  This totals 48 bits, or six bytes.
+      Therefore if strm->avail_in >= 6, then there is enough input to avoid
+      checking for available input while decoding.
+
+    - On some architectures, it can be significantly faster (e.g. up to 1.2x
+      faster on x86_64) to load from strm->next_in 64 bits, or 8 bytes, at a
+      time, so INFLATE_FAST_MIN_HAVE == 8.
+
+    - The maximum bytes that a single length/distance pair can output is 258
+      bytes, which is the maximum length that can be coded.  inflate_fast()
+      requires strm->avail_out >= 258 for each loop to avoid checking for
+      output space.
+ */
+void Z_INTERNAL INFLATE_FAST(PREFIX3(stream) *strm, uint32_t start) {
+    /* start: inflate()'s starting value for strm->avail_out */
+    struct inflate_state *state;
+    z_const unsigned char *in;  /* local strm->next_in */
+    const unsigned char *last;  /* have enough input while in < last */
+    unsigned char *out;         /* local strm->next_out */
+    unsigned char *beg;         /* inflate()'s initial strm->next_out */
+    unsigned char *end;         /* while out < end, enough space available */
+    unsigned char *safe;        /* can use chunkcopy provided out < safe */
+#ifdef INFLATE_STRICT
+    unsigned dmax;              /* maximum distance from zlib header */
+#endif
+    unsigned wsize;             /* window size or zero if not using window */
+    unsigned whave;             /* valid bytes in the window */
+    unsigned wnext;             /* window write index */
+    unsigned char *window;      /* allocated sliding window, if wsize != 0 */
+
+    /* hold is a local copy of strm->hold. By default, hold satisfies the same
+       invariants that strm->hold does, namely that (hold >> bits) == 0. This
+       invariant is kept by loading bits into hold one byte at a time, like:
+
+       hold |= next_byte_of_input << bits; in++; bits += 8;
+
+       If we need to ensure that bits >= 15 then this code snippet is simply
+       repeated. Over one iteration of the outermost do/while loop, this
+       happens up to six times (48 bits of input), as described in the NOTES
+       above.
+
+       However, on some little endian architectures, it can be significantly
+       faster to load 64 bits once instead of 8 bits six times:
+
+       if (bits <= 16) {
+         hold |= next_8_bytes_of_input << bits; in += 6; bits += 48;
+       }
+
+       Unlike the simpler one byte load, shifting the next_8_bytes_of_input
+       by bits will overflow and lose those high bits, up to 2 bytes' worth.
+       The conservative estimate is therefore that we have read only 6 bytes
+       (48 bits). Again, as per the NOTES above, 48 bits is sufficient for the
+       rest of the iteration, and we will not need to load another 8 bytes.
+
+       Inside this function, we no longer satisfy (hold >> bits) == 0, but
+       this is not problematic, even if that overflow does not land on an 8 bit
+       byte boundary. Those excess bits will eventually shift down lower as the
+       Huffman decoder consumes input, and when new input bits need to be loaded
+       into the bits variable, the same input bits will be or'ed over those
+       existing bits. A bitwise or is idempotent: (a | b | b) equals (a | b).
+       Note that we therefore write that load operation as "hold |= etc" and not
+       "hold += etc".
+
+       Outside that loop, at the end of the function, hold is bitwise and'ed
+       with (1<<bits)-1 to drop those excess bits so that, on function exit, we
+       keep the invariant that (state->hold >> state->bits) == 0.
+    */
+    uint64_t hold;              /* local strm->hold */
+    unsigned bits;              /* local strm->bits */
+    code const *lcode;          /* local strm->lencode */
+    code const *dcode;          /* local strm->distcode */
+    unsigned lmask;             /* mask for first level of length codes */
+    unsigned dmask;             /* mask for first level of distance codes */
+    const code *here;           /* retrieved table entry */
+    unsigned op;                /* code bits, operation, extra bits, or */
+                                /*  window position, window bytes to copy */
+    unsigned len;               /* match length, unused bytes */
+    unsigned dist;              /* match distance */
+    unsigned char *from;        /* where to copy match from */
+    unsigned extra_safe;        /* copy chunks safely in all cases */
+
+    /* copy state to local variables */
+    state = (struct inflate_state *)strm->state;
+    in = strm->next_in;
+    last = in + (strm->avail_in - (INFLATE_FAST_MIN_HAVE - 1));
+    out = strm->next_out;
+    beg = out - (start - strm->avail_out);
+    end = out + (strm->avail_out - (INFLATE_FAST_MIN_LEFT - 1));
+    safe = out + strm->avail_out;
+#ifdef INFLATE_STRICT
+    dmax = state->dmax;
+#endif
+    wsize = state->wsize;
+    whave = state->whave;
+    wnext = state->wnext;
+    window = state->window;
+    hold = state->hold;
+    bits = state->bits;
+    lcode = state->lencode;
+    dcode = state->distcode;
+    lmask = (1U << state->lenbits) - 1;
+    dmask = (1U << state->distbits) - 1;
+
+    /* Detect if out and window point to the same memory allocation. In this instance it is
+       necessary to use safe chunk copy functions to prevent overwriting the window. If the
+       window is overwritten then future matches with far distances will fail to copy correctly. */
+    extra_safe = (wsize != 0 && out >= window && out + INFLATE_FAST_MIN_LEFT <= window + wsize);
+
+#define REFILL() do { \
+        hold |= load_64_bits(in, bits); \
+        in += 7; \
+        in -= ((bits >> 3) & 7); \
+        bits |= 56; \
+    } while (0)
+
+    /* decode literals and length/distances until end-of-block or not enough
+       input data or output space */
+    do {
+        REFILL();
+        here = lcode + (hold & lmask);
+        if (here->op == 0) {
+            *out++ = (unsigned char)(here->val);
+            DROPBITS(here->bits);
+            here = lcode + (hold & lmask);
+            if (here->op == 0) {
+                *out++ = (unsigned char)(here->val);
+                DROPBITS(here->bits);
+                here = lcode + (hold & lmask);
+            }
+        }
+      dolen:
+        DROPBITS(here->bits);
+        op = here->op;
+        if (op == 0) {                          /* literal */
+            Tracevv((stderr, here->val >= 0x20 && here->val < 0x7f ?
+                    "inflate:         literal '%c'\n" :
+                    "inflate:         literal 0x%02x\n", here->val));
+            *out++ = (unsigned char)(here->val);
+        } else if (op & 16) {                     /* length base */
+            len = here->val;
+            op &= MAX_BITS;                       /* number of extra bits */
+            len += BITS(op);
+            DROPBITS(op);
+            Tracevv((stderr, "inflate:         length %u\n", len));
+            here = dcode + (hold & dmask);
+            if (bits < MAX_BITS + MAX_DIST_EXTRA_BITS) {
+                REFILL();
+            }
+          dodist:
+            DROPBITS(here->bits);
+            op = here->op;
+            if (op & 16) {                      /* distance base */
+                dist = here->val;
+                op &= MAX_BITS;                 /* number of extra bits */
+                dist += BITS(op);
+#ifdef INFLATE_STRICT
+                if (dist > dmax) {
+                    SET_BAD("invalid distance too far back");
+                    break;
+                }
+#endif
+                DROPBITS(op);
+                Tracevv((stderr, "inflate:         distance %u\n", dist));
+                op = (unsigned)(out - beg);     /* max distance in output */
+                if (dist > op) {                /* see if copy from window */
+                    op = dist - op;             /* distance back in window */
+                    if (op > whave) {
+                        if (state->sane) {
+                            SET_BAD("invalid distance too far back");
+                            break;
+                        }
+#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
+                        if (len <= op - whave) {
+                            do {
+                                *out++ = 0;
+                            } while (--len);
+                            continue;
+                        }
+                        len -= op - whave;
+                        do {
+                            *out++ = 0;
+                        } while (--op > whave);
+                        if (op == 0) {
+                            from = out - dist;
+                            do {
+                                *out++ = *from++;
+                            } while (--len);
+                            continue;
+                        }
+#endif
+                    }
+                    from = window;
+                    if (wnext == 0) {           /* very common case */
+                        from += wsize - op;
+                    } else if (wnext >= op) {   /* contiguous in window */
+                        from += wnext - op;
+                    } else {                    /* wrap around window */
+                        op -= wnext;
+                        from += wsize - op;
+                        if (op < len) {         /* some from end of window */
+                            len -= op;
+                            out = chunkcopy_safe(out, from, op, safe);
+                            from = window;      /* more from start of window */
+                            op = wnext;
+                            /* This (rare) case can create a situation where
+                               the first chunkcopy below must be checked.
+                             */
+                        }
+                    }
+                    if (op < len) {             /* still need some from output */
+                        len -= op;
+                        out = chunkcopy_safe(out, from, op, safe);
+                        out = CHUNKUNROLL(out, &dist, &len);
+                        out = chunkcopy_safe(out, out - dist, len, safe);
+                    } else {
+                        out = chunkcopy_safe(out, from, len, safe);
+                    }
+                } else if (extra_safe) {
+                    /* Whole reference is in range of current output. */
+                    if (dist >= len || dist >= state->chunksize)
+                        out = chunkcopy_safe(out, out - dist, len, safe);
+                    else
+                        out = CHUNKMEMSET_SAFE(out, dist, len, (unsigned)((safe - out) + 1));
+                } else {
+                    /* Whole reference is in range of current output.  No range checks are
+                       necessary because we start with room for at least 258 bytes of output,
+                       so unroll and roundoff operations can write beyond `out+len` so long
+                       as they stay within 258 bytes of `out`.
+                    */
+                    if (dist >= len || dist >= state->chunksize)
+                        out = CHUNKCOPY(out, out - dist, len);
+                    else
+                        out = CHUNKMEMSET(out, dist, len);
+                }
+            } else if ((op & 64) == 0) {          /* 2nd level distance code */
+                here = dcode + here->val + BITS(op);
+                goto dodist;
+            } else {
+                SET_BAD("invalid distance code");
+                break;
+            }
+        } else if ((op & 64) == 0) {              /* 2nd level length code */
+            here = lcode + here->val + BITS(op);
+            goto dolen;
+        } else if (op & 32) {                     /* end-of-block */
+            Tracevv((stderr, "inflate:         end of block\n"));
+            state->mode = TYPE;
+            break;
+        } else {
+            SET_BAD("invalid literal/length code");
+            break;
+        }
+    } while (in < last && out < end);
+
+    /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
+    len = bits >> 3;
+    in -= len;
+    bits -= len << 3;
+    hold &= (UINT64_C(1) << bits) - 1;
+
+    /* update state and return */
+    strm->next_in = in;
+    strm->next_out = out;
+    strm->avail_in = (unsigned)(in < last ? (INFLATE_FAST_MIN_HAVE - 1) + (last - in)
+                                          : (INFLATE_FAST_MIN_HAVE - 1) - (in - last));
+    strm->avail_out = (unsigned)(out < end ? (INFLATE_FAST_MIN_LEFT - 1) + (end - out)
+                                           : (INFLATE_FAST_MIN_LEFT - 1) - (out - end));
+
+    Assert(bits <= 32, "Remaining bits greater than 32");
+    state->hold = (uint32_t)hold;
+    state->bits = bits;
+    return;
+}
+
+/*
+   inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
+   - Using bit fields for code structure
+   - Different op definition to avoid & for extra bits (do & for table bits)
+   - Three separate decoding do-loops for direct, window, and wnext == 0
+   - Special case for distance > 1 copies to do overlapped load and store copy
+   - Explicit branch predictions (based on measured branch probabilities)
+   - Deferring match copy and interspersed it with decoding subsequent codes
+   - Swapping literal/length else
+   - Swapping window/direct else
+   - Larger unrolled copy loops (three is about right)
+   - Moving len -= 3 statement into middle of loop
+ */
diff --git a/3rdparty/zlib-ng/inffixed_tbl.h b/3rdparty/zlib-ng/inffixed_tbl.h
new file mode 100644
index 000000000000..7292fa06eccd
--- /dev/null
+++ b/3rdparty/zlib-ng/inffixed_tbl.h
@@ -0,0 +1,94 @@
+/* inffixed_tbl.h -- table for decoding fixed codes
+ * Generated automatically by makefixed().
+ */
+
+/* WARNING: this file should *not* be used by applications.
+ * It is part of the implementation of this library and is
+ * subject to change. Applications should only use zlib.h.
+ */
+
+static const code lenfix[512] = {
+    {96,7,0},{0,8,80},{0,8,16},{20,8,115},{18,7,31},{0,8,112},{0,8,48},
+    {0,9,192},{16,7,10},{0,8,96},{0,8,32},{0,9,160},{0,8,0},{0,8,128},
+    {0,8,64},{0,9,224},{16,7,6},{0,8,88},{0,8,24},{0,9,144},{19,7,59},
+    {0,8,120},{0,8,56},{0,9,208},{17,7,17},{0,8,104},{0,8,40},{0,9,176},
+    {0,8,8},{0,8,136},{0,8,72},{0,9,240},{16,7,4},{0,8,84},{0,8,20},
+    {21,8,227},{19,7,43},{0,8,116},{0,8,52},{0,9,200},{17,7,13},{0,8,100},
+    {0,8,36},{0,9,168},{0,8,4},{0,8,132},{0,8,68},{0,9,232},{16,7,8},
+    {0,8,92},{0,8,28},{0,9,152},{20,7,83},{0,8,124},{0,8,60},{0,9,216},
+    {18,7,23},{0,8,108},{0,8,44},{0,9,184},{0,8,12},{0,8,140},{0,8,76},
+    {0,9,248},{16,7,3},{0,8,82},{0,8,18},{21,8,163},{19,7,35},{0,8,114},
+    {0,8,50},{0,9,196},{17,7,11},{0,8,98},{0,8,34},{0,9,164},{0,8,2},
+    {0,8,130},{0,8,66},{0,9,228},{16,7,7},{0,8,90},{0,8,26},{0,9,148},
+    {20,7,67},{0,8,122},{0,8,58},{0,9,212},{18,7,19},{0,8,106},{0,8,42},
+    {0,9,180},{0,8,10},{0,8,138},{0,8,74},{0,9,244},{16,7,5},{0,8,86},
+    {0,8,22},{64,8,0},{19,7,51},{0,8,118},{0,8,54},{0,9,204},{17,7,15},
+    {0,8,102},{0,8,38},{0,9,172},{0,8,6},{0,8,134},{0,8,70},{0,9,236},
+    {16,7,9},{0,8,94},{0,8,30},{0,9,156},{20,7,99},{0,8,126},{0,8,62},
+    {0,9,220},{18,7,27},{0,8,110},{0,8,46},{0,9,188},{0,8,14},{0,8,142},
+    {0,8,78},{0,9,252},{96,7,0},{0,8,81},{0,8,17},{21,8,131},{18,7,31},
+    {0,8,113},{0,8,49},{0,9,194},{16,7,10},{0,8,97},{0,8,33},{0,9,162},
+    {0,8,1},{0,8,129},{0,8,65},{0,9,226},{16,7,6},{0,8,89},{0,8,25},
+    {0,9,146},{19,7,59},{0,8,121},{0,8,57},{0,9,210},{17,7,17},{0,8,105},
+    {0,8,41},{0,9,178},{0,8,9},{0,8,137},{0,8,73},{0,9,242},{16,7,4},
+    {0,8,85},{0,8,21},{16,8,258},{19,7,43},{0,8,117},{0,8,53},{0,9,202},
+    {17,7,13},{0,8,101},{0,8,37},{0,9,170},{0,8,5},{0,8,133},{0,8,69},
+    {0,9,234},{16,7,8},{0,8,93},{0,8,29},{0,9,154},{20,7,83},{0,8,125},
+    {0,8,61},{0,9,218},{18,7,23},{0,8,109},{0,8,45},{0,9,186},{0,8,13},
+    {0,8,141},{0,8,77},{0,9,250},{16,7,3},{0,8,83},{0,8,19},{21,8,195},
+    {19,7,35},{0,8,115},{0,8,51},{0,9,198},{17,7,11},{0,8,99},{0,8,35},
+    {0,9,166},{0,8,3},{0,8,131},{0,8,67},{0,9,230},{16,7,7},{0,8,91},
+    {0,8,27},{0,9,150},{20,7,67},{0,8,123},{0,8,59},{0,9,214},{18,7,19},
+    {0,8,107},{0,8,43},{0,9,182},{0,8,11},{0,8,139},{0,8,75},{0,9,246},
+    {16,7,5},{0,8,87},{0,8,23},{64,8,0},{19,7,51},{0,8,119},{0,8,55},
+    {0,9,206},{17,7,15},{0,8,103},{0,8,39},{0,9,174},{0,8,7},{0,8,135},
+    {0,8,71},{0,9,238},{16,7,9},{0,8,95},{0,8,31},{0,9,158},{20,7,99},
+    {0,8,127},{0,8,63},{0,9,222},{18,7,27},{0,8,111},{0,8,47},{0,9,190},
+    {0,8,15},{0,8,143},{0,8,79},{0,9,254},{96,7,0},{0,8,80},{0,8,16},
+    {20,8,115},{18,7,31},{0,8,112},{0,8,48},{0,9,193},{16,7,10},{0,8,96},
+    {0,8,32},{0,9,161},{0,8,0},{0,8,128},{0,8,64},{0,9,225},{16,7,6},
+    {0,8,88},{0,8,24},{0,9,145},{19,7,59},{0,8,120},{0,8,56},{0,9,209},
+    {17,7,17},{0,8,104},{0,8,40},{0,9,177},{0,8,8},{0,8,136},{0,8,72},
+    {0,9,241},{16,7,4},{0,8,84},{0,8,20},{21,8,227},{19,7,43},{0,8,116},
+    {0,8,52},{0,9,201},{17,7,13},{0,8,100},{0,8,36},{0,9,169},{0,8,4},
+    {0,8,132},{0,8,68},{0,9,233},{16,7,8},{0,8,92},{0,8,28},{0,9,153},
+    {20,7,83},{0,8,124},{0,8,60},{0,9,217},{18,7,23},{0,8,108},{0,8,44},
+    {0,9,185},{0,8,12},{0,8,140},{0,8,76},{0,9,249},{16,7,3},{0,8,82},
+    {0,8,18},{21,8,163},{19,7,35},{0,8,114},{0,8,50},{0,9,197},{17,7,11},
+    {0,8,98},{0,8,34},{0,9,165},{0,8,2},{0,8,130},{0,8,66},{0,9,229},
+    {16,7,7},{0,8,90},{0,8,26},{0,9,149},{20,7,67},{0,8,122},{0,8,58},
+    {0,9,213},{18,7,19},{0,8,106},{0,8,42},{0,9,181},{0,8,10},{0,8,138},
+    {0,8,74},{0,9,245},{16,7,5},{0,8,86},{0,8,22},{64,8,0},{19,7,51},
+    {0,8,118},{0,8,54},{0,9,205},{17,7,15},{0,8,102},{0,8,38},{0,9,173},
+    {0,8,6},{0,8,134},{0,8,70},{0,9,237},{16,7,9},{0,8,94},{0,8,30},
+    {0,9,157},{20,7,99},{0,8,126},{0,8,62},{0,9,221},{18,7,27},{0,8,110},
+    {0,8,46},{0,9,189},{0,8,14},{0,8,142},{0,8,78},{0,9,253},{96,7,0},
+    {0,8,81},{0,8,17},{21,8,131},{18,7,31},{0,8,113},{0,8,49},{0,9,195},
+    {16,7,10},{0,8,97},{0,8,33},{0,9,163},{0,8,1},{0,8,129},{0,8,65},
+    {0,9,227},{16,7,6},{0,8,89},{0,8,25},{0,9,147},{19,7,59},{0,8,121},
+    {0,8,57},{0,9,211},{17,7,17},{0,8,105},{0,8,41},{0,9,179},{0,8,9},
+    {0,8,137},{0,8,73},{0,9,243},{16,7,4},{0,8,85},{0,8,21},{16,8,258},
+    {19,7,43},{0,8,117},{0,8,53},{0,9,203},{17,7,13},{0,8,101},{0,8,37},
+    {0,9,171},{0,8,5},{0,8,133},{0,8,69},{0,9,235},{16,7,8},{0,8,93},
+    {0,8,29},{0,9,155},{20,7,83},{0,8,125},{0,8,61},{0,9,219},{18,7,23},
+    {0,8,109},{0,8,45},{0,9,187},{0,8,13},{0,8,141},{0,8,77},{0,9,251},
+    {16,7,3},{0,8,83},{0,8,19},{21,8,195},{19,7,35},{0,8,115},{0,8,51},
+    {0,9,199},{17,7,11},{0,8,99},{0,8,35},{0,9,167},{0,8,3},{0,8,131},
+    {0,8,67},{0,9,231},{16,7,7},{0,8,91},{0,8,27},{0,9,151},{20,7,67},
+    {0,8,123},{0,8,59},{0,9,215},{18,7,19},{0,8,107},{0,8,43},{0,9,183},
+    {0,8,11},{0,8,139},{0,8,75},{0,9,247},{16,7,5},{0,8,87},{0,8,23},
+    {64,8,0},{19,7,51},{0,8,119},{0,8,55},{0,9,207},{17,7,15},{0,8,103},
+    {0,8,39},{0,9,175},{0,8,7},{0,8,135},{0,8,71},{0,9,239},{16,7,9},
+    {0,8,95},{0,8,31},{0,9,159},{20,7,99},{0,8,127},{0,8,63},{0,9,223},
+    {18,7,27},{0,8,111},{0,8,47},{0,9,191},{0,8,15},{0,8,143},{0,8,79},
+    {0,9,255}
+};
+
+static const code distfix[32] = {
+    {16,5,1},{23,5,257},{19,5,17},{27,5,4097},{17,5,5},{25,5,1025},
+    {21,5,65},{29,5,16385},{16,5,3},{24,5,513},{20,5,33},{28,5,8193},
+    {18,5,9},{26,5,2049},{22,5,129},{64,5,0},{16,5,2},{23,5,385},
+    {19,5,25},{27,5,6145},{17,5,7},{25,5,1537},{21,5,97},{29,5,24577},
+    {16,5,4},{24,5,769},{20,5,49},{28,5,12289},{18,5,13},{26,5,3073},
+    {22,5,193},{64,5,0}
+};
diff --git a/3rdparty/zlib-ng/inflate.c b/3rdparty/zlib-ng/inflate.c
new file mode 100644
index 000000000000..fe55c498e312
--- /dev/null
+++ b/3rdparty/zlib-ng/inflate.c
@@ -0,0 +1,1413 @@
+/* inflate.c -- zlib decompression
+ * Copyright (C) 1995-2022 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "inftrees.h"
+#include "inflate.h"
+#include "inflate_p.h"
+#include "inffixed_tbl.h"
+#include "functable.h"
+
+/* Avoid conflicts with zlib.h macros */
+#ifdef ZLIB_COMPAT
+# undef inflateInit
+# undef inflateInit2
+#endif
+
+/* function prototypes */
+static int inflateStateCheck(PREFIX3(stream) *strm);
+static int updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t len, int32_t cksum);
+static uint32_t syncsearch(uint32_t *have, const unsigned char *buf, uint32_t len);
+
+static inline void inf_chksum_cpy(PREFIX3(stream) *strm, uint8_t *dst,
+                           const uint8_t *src, uint32_t copy) {
+    if (!copy) return;
+    struct inflate_state *state = (struct inflate_state*)strm->state;
+#ifdef GUNZIP
+    if (state->flags) {
+        functable.crc32_fold_copy(&state->crc_fold, dst, src, copy);
+    } else
+#endif
+    {
+        strm->adler = state->check = functable.adler32_fold_copy(state->check, dst, src, copy);
+    }
+}
+
+static inline void inf_chksum(PREFIX3(stream) *strm, const uint8_t *src, uint32_t len) {
+    struct inflate_state *state = (struct inflate_state*)strm->state;
+#ifdef GUNZIP
+    if (state->flags) {
+        functable.crc32_fold(&state->crc_fold, src, len, 0);
+    } else
+#endif
+    {
+        strm->adler = state->check = functable.adler32(state->check, src, len);
+    }
+}
+
+static int inflateStateCheck(PREFIX3(stream) *strm) {
+    struct inflate_state *state;
+    if (strm == NULL || strm->zalloc == NULL || strm->zfree == NULL)
+        return 1;
+    state = (struct inflate_state *)strm->state;
+    if (state == NULL || state->strm != strm || state->mode < HEAD || state->mode > SYNC)
+        return 1;
+    return 0;
+}
+
+int32_t Z_EXPORT PREFIX(inflateResetKeep)(PREFIX3(stream) *strm) {
+    struct inflate_state *state;
+
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+    strm->total_in = strm->total_out = state->total = 0;
+    strm->msg = NULL;
+    if (state->wrap)        /* to support ill-conceived Java test suite */
+        strm->adler = state->wrap & 1;
+    state->mode = HEAD;
+    state->check = ADLER32_INITIAL_VALUE;
+    state->last = 0;
+    state->havedict = 0;
+    state->flags = -1;
+    state->dmax = 32768U;
+    state->head = NULL;
+    state->hold = 0;
+    state->bits = 0;
+    state->lencode = state->distcode = state->next = state->codes;
+    state->sane = 1;
+    state->back = -1;
+    INFLATE_RESET_KEEP_HOOK(strm);  /* hook for IBM Z DFLTCC */
+    Tracev((stderr, "inflate: reset\n"));
+    return Z_OK;
+}
+
+int32_t Z_EXPORT PREFIX(inflateReset)(PREFIX3(stream) *strm) {
+    struct inflate_state *state;
+
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+    state->wsize = 0;
+    state->whave = 0;
+    state->wnext = 0;
+    return PREFIX(inflateResetKeep)(strm);
+}
+
+int32_t Z_EXPORT PREFIX(inflateReset2)(PREFIX3(stream) *strm, int32_t windowBits) {
+    int wrap;
+    struct inflate_state *state;
+
+    /* get the state */
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+
+    /* extract wrap request from windowBits parameter */
+    if (windowBits < 0) {
+        wrap = 0;
+        if (windowBits < -MAX_WBITS)
+            return Z_STREAM_ERROR;
+        windowBits = -windowBits;
+    } else {
+        wrap = (windowBits >> 4) + 5;
+#ifdef GUNZIP
+        if (windowBits < 48)
+            windowBits &= MAX_WBITS;
+#endif
+    }
+
+    /* set number of window bits, free window if different */
+    if (windowBits && (windowBits < MIN_WBITS || windowBits > MAX_WBITS))
+        return Z_STREAM_ERROR;
+    if (state->window != NULL && state->wbits != (unsigned)windowBits) {
+        ZFREE_WINDOW(strm, state->window);
+        state->window = NULL;
+    }
+
+    /* update state and reset the rest of it */
+    state->wrap = wrap;
+    state->wbits = (unsigned)windowBits;
+    return PREFIX(inflateReset)(strm);
+}
+
+/* This function is hidden in ZLIB_COMPAT builds. */
+int32_t ZNG_CONDEXPORT PREFIX(inflateInit2)(PREFIX3(stream) *strm, int32_t windowBits) {
+    int32_t ret;
+    struct inflate_state *state;
+
+    /* Initialize functable earlier. */
+    functable.force_init();
+
+    if (strm == NULL)
+        return Z_STREAM_ERROR;
+    strm->msg = NULL;                   /* in case we return an error */
+    if (strm->zalloc == NULL) {
+        strm->zalloc = PREFIX(zcalloc);
+        strm->opaque = NULL;
+    }
+    if (strm->zfree == NULL)
+        strm->zfree = PREFIX(zcfree);
+    state = ZALLOC_INFLATE_STATE(strm);
+    if (state == NULL)
+        return Z_MEM_ERROR;
+    Tracev((stderr, "inflate: allocated\n"));
+    strm->state = (struct internal_state *)state;
+    state->strm = strm;
+    state->window = NULL;
+    state->mode = HEAD;     /* to pass state test in inflateReset2() */
+    state->chunksize = functable.chunksize();
+    ret = PREFIX(inflateReset2)(strm, windowBits);
+    if (ret != Z_OK) {
+        ZFREE_STATE(strm, state);
+        strm->state = NULL;
+    }
+    return ret;
+}
+
+#ifndef ZLIB_COMPAT
+int32_t Z_EXPORT PREFIX(inflateInit)(PREFIX3(stream) *strm) {
+    return PREFIX(inflateInit2)(strm, DEF_WBITS);
+}
+#endif
+
+/* Function used by zlib.h and zlib-ng version 2.0 macros */
+int32_t Z_EXPORT PREFIX(inflateInit_)(PREFIX3(stream) *strm, const char *version, int32_t stream_size) {
+    if (CHECK_VER_STSIZE(version, stream_size))
+        return Z_VERSION_ERROR;
+    return PREFIX(inflateInit2)(strm, DEF_WBITS);
+}
+
+/* Function used by zlib.h and zlib-ng version 2.0 macros */
+int32_t Z_EXPORT PREFIX(inflateInit2_)(PREFIX3(stream) *strm, int32_t windowBits, const char *version, int32_t stream_size) {
+    if (CHECK_VER_STSIZE(version, stream_size))
+        return Z_VERSION_ERROR;
+    return PREFIX(inflateInit2)(strm, windowBits);
+}
+
+int32_t Z_EXPORT PREFIX(inflatePrime)(PREFIX3(stream) *strm, int32_t bits, int32_t value) {
+    struct inflate_state *state;
+
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    if (bits == 0)
+        return Z_OK;
+    INFLATE_PRIME_HOOK(strm, bits, value);  /* hook for IBM Z DFLTCC */
+    state = (struct inflate_state *)strm->state;
+    if (bits < 0) {
+        state->hold = 0;
+        state->bits = 0;
+        return Z_OK;
+    }
+    if (bits > 16 || state->bits + (unsigned int)bits > 32)
+        return Z_STREAM_ERROR;
+    value &= (1L << bits) - 1;
+    state->hold += (unsigned)value << state->bits;
+    state->bits += (unsigned int)bits;
+    return Z_OK;
+}
+
+/*
+   Return state with length and distance decoding tables and index sizes set to
+   fixed code decoding.  This returns fixed tables from inffixed_tbl.h.
+ */
+
+void Z_INTERNAL PREFIX(fixedtables)(struct inflate_state *state) {
+    state->lencode = lenfix;
+    state->lenbits = 9;
+    state->distcode = distfix;
+    state->distbits = 5;
+}
+
+int Z_INTERNAL PREFIX(inflate_ensure_window)(struct inflate_state *state) {
+    /* if it hasn't been done already, allocate space for the window */
+    if (state->window == NULL) {
+        unsigned wsize = 1U << state->wbits;
+        state->window = (unsigned char *)ZALLOC_WINDOW(state->strm, wsize + state->chunksize, sizeof(unsigned char));
+        if (state->window == NULL)
+            return Z_MEM_ERROR;
+#ifdef Z_MEMORY_SANITIZER
+        /* This is _not_ to subvert the memory sanitizer but to instead unposion some
+           data we willingly and purposefully load uninitialized into vector registers
+           in order to safely read the last < chunksize bytes of the window. */
+        __msan_unpoison(state->window + wsize, state->chunksize);
+#endif
+    }
+
+    /* if window not in use yet, initialize */
+    if (state->wsize == 0) {
+        state->wsize = 1U << state->wbits;
+        state->wnext = 0;
+        state->whave = 0;
+    }
+
+    return Z_OK;
+}
+
+/*
+   Update the window with the last wsize (normally 32K) bytes written before
+   returning.  If window does not exist yet, create it.  This is only called
+   when a window is already in use, or when output has been written during this
+   inflate call, but the end of the deflate stream has not been reached yet.
+   It is also called to create a window for dictionary data when a dictionary
+   is loaded.
+
+   Providing output buffers larger than 32K to inflate() should provide a speed
+   advantage, since only the last 32K of output is copied to the sliding window
+   upon return from inflate(), and since all distances after the first 32K of
+   output will fall in the output data, making match copies simpler and faster.
+   The advantage may be dependent on the size of the processor's data caches.
+ */
+static int32_t updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t len, int32_t cksum) {
+    struct inflate_state *state;
+    uint32_t dist;
+
+    state = (struct inflate_state *)strm->state;
+
+    if (PREFIX(inflate_ensure_window)(state)) return 1;
+
+    /* len state->wsize or less output bytes into the circular window */
+    if (len >= state->wsize) {
+        /* Only do this if the caller specifies to checksum bytes AND the platform requires
+         * it (s/390 being the primary exception to this. Also, for now, do the adler checksums
+         * if not a gzip based header. The inline adler checksums will come in the near future,
+         * possibly the next commit */
+        if (INFLATE_NEED_CHECKSUM(strm) && cksum) {
+            /* We have to split the checksum over non-copied and copied bytes */
+            if (len > state->wsize)
+                inf_chksum(strm, end - len, len - state->wsize);
+            inf_chksum_cpy(strm, state->window, end - state->wsize, state->wsize);
+        } else {
+            memcpy(state->window, end - state->wsize, state->wsize);
+        }
+
+        state->wnext = 0;
+        state->whave = state->wsize;
+    } else {
+        dist = state->wsize - state->wnext;
+        /* Only do this if the caller specifies to checksum bytes AND the platform requires
+         * We need to maintain the correct order here for the checksum */
+        dist = MIN(dist, len);
+        if (INFLATE_NEED_CHECKSUM(strm) && cksum) {
+            inf_chksum_cpy(strm, state->window + state->wnext, end - len, dist);
+        } else {
+            memcpy(state->window + state->wnext, end - len, dist);
+        }
+        len -= dist;
+        if (len) {
+            if (INFLATE_NEED_CHECKSUM(strm) && cksum) {
+                inf_chksum_cpy(strm, state->window, end - len, len);
+            } else {
+                memcpy(state->window, end - len, len);
+            }
+
+            state->wnext = len;
+            state->whave = state->wsize;
+        } else {
+            state->wnext += dist;
+            if (state->wnext == state->wsize)
+                state->wnext = 0;
+            if (state->whave < state->wsize)
+                state->whave += dist;
+        }
+    }
+    return 0;
+}
+
+/*
+   Private macros for inflate()
+   Look in inflate_p.h for macros shared with inflateBack()
+*/
+
+/* Get a byte of input into the bit accumulator, or return from inflate() if there is no input available. */
+#define PULLBYTE() \
+    do { \
+        if (have == 0) goto inf_leave; \
+        have--; \
+        hold += ((unsigned)(*next++) << bits); \
+        bits += 8; \
+    } while (0)
+
+/*
+   inflate() uses a state machine to process as much input data and generate as
+   much output data as possible before returning.  The state machine is
+   structured roughly as follows:
+
+    for (;;) switch (state) {
+    ...
+    case STATEn:
+        if (not enough input data or output space to make progress)
+            return;
+        ... make progress ...
+        state = STATEm;
+        break;
+    ...
+    }
+
+   so when inflate() is called again, the same case is attempted again, and
+   if the appropriate resources are provided, the machine proceeds to the
+   next state.  The NEEDBITS() macro is usually the way the state evaluates
+   whether it can proceed or should return.  NEEDBITS() does the return if
+   the requested bits are not available.  The typical use of the BITS macros
+   is:
+
+        NEEDBITS(n);
+        ... do something with BITS(n) ...
+        DROPBITS(n);
+
+   where NEEDBITS(n) either returns from inflate() if there isn't enough
+   input left to load n bits into the accumulator, or it continues.  BITS(n)
+   gives the low n bits in the accumulator.  When done, DROPBITS(n) drops
+   the low n bits off the accumulator.  INITBITS() clears the accumulator
+   and sets the number of available bits to zero.  BYTEBITS() discards just
+   enough bits to put the accumulator on a byte boundary.  After BYTEBITS()
+   and a NEEDBITS(8), then BITS(8) would return the next byte in the stream.
+
+   NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return
+   if there is no input available.  The decoding of variable length codes uses
+   PULLBYTE() directly in order to pull just enough bytes to decode the next
+   code, and no more.
+
+   Some states loop until they get enough input, making sure that enough
+   state information is maintained to continue the loop where it left off
+   if NEEDBITS() returns in the loop.  For example, want, need, and keep
+   would all have to actually be part of the saved state in case NEEDBITS()
+   returns:
+
+    case STATEw:
+        while (want < need) {
+            NEEDBITS(n);
+            keep[want++] = BITS(n);
+            DROPBITS(n);
+        }
+        state = STATEx;
+    case STATEx:
+
+   As shown above, if the next state is also the next case, then the break
+   is omitted.
+
+   A state may also return if there is not enough output space available to
+   complete that state.  Those states are copying stored data, writing a
+   literal byte, and copying a matching string.
+
+   When returning, a "goto inf_leave" is used to update the total counters,
+   update the check value, and determine whether any progress has been made
+   during that inflate() call in order to return the proper return code.
+   Progress is defined as a change in either strm->avail_in or strm->avail_out.
+   When there is a window, goto inf_leave will update the window with the last
+   output written.  If a goto inf_leave occurs in the middle of decompression
+   and there is no window currently, goto inf_leave will create one and copy
+   output to the window for the next call of inflate().
+
+   In this implementation, the flush parameter of inflate() only affects the
+   return code (per zlib.h).  inflate() always writes as much as possible to
+   strm->next_out, given the space available and the provided input--the effect
+   documented in zlib.h of Z_SYNC_FLUSH.  Furthermore, inflate() always defers
+   the allocation of and copying into a sliding window until necessary, which
+   provides the effect documented in zlib.h for Z_FINISH when the entire input
+   stream available.  So the only thing the flush parameter actually does is:
+   when flush is set to Z_FINISH, inflate() cannot return Z_OK.  Instead it
+   will return Z_BUF_ERROR if it has not reached the end of the stream.
+ */
+
+int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) {
+    struct inflate_state *state;
+    const unsigned char *next;  /* next input */
+    unsigned char *put;         /* next output */
+    unsigned have, left;        /* available input and output */
+    uint32_t hold;              /* bit buffer */
+    unsigned bits;              /* bits in bit buffer */
+    uint32_t in, out;           /* save starting available input and output */
+    unsigned copy;              /* number of stored or match bytes to copy */
+    unsigned char *from;        /* where to copy match bytes from */
+    code here;                  /* current decoding table entry */
+    code last;                  /* parent table entry */
+    unsigned len;               /* length to copy for repeats, bits to drop */
+    int32_t ret;                /* return code */
+#ifdef GUNZIP
+    unsigned char hbuf[4];      /* buffer for gzip header crc calculation */
+#endif
+    static const uint16_t order[19] = /* permutation of code lengths */
+        {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+    if (inflateStateCheck(strm) || strm->next_out == NULL ||
+        (strm->next_in == NULL && strm->avail_in != 0))
+        return Z_STREAM_ERROR;
+
+    state = (struct inflate_state *)strm->state;
+    if (state->mode == TYPE)      /* skip check */
+        state->mode = TYPEDO;
+    LOAD();
+    in = have;
+    out = left;
+    ret = Z_OK;
+    for (;;)
+        switch (state->mode) {
+        case HEAD:
+            if (state->wrap == 0) {
+                state->mode = TYPEDO;
+                break;
+            }
+            NEEDBITS(16);
+#ifdef GUNZIP
+            if ((state->wrap & 2) && hold == 0x8b1f) {  /* gzip header */
+                if (state->wbits == 0)
+                    state->wbits = MAX_WBITS;
+                state->check = CRC32_INITIAL_VALUE;
+                CRC2(state->check, hold);
+                INITBITS();
+                state->mode = FLAGS;
+                break;
+            }
+            if (state->head != NULL)
+                state->head->done = -1;
+            if (!(state->wrap & 1) ||   /* check if zlib header allowed */
+#else
+            if (
+#endif
+                ((BITS(8) << 8) + (hold >> 8)) % 31) {
+                SET_BAD("incorrect header check");
+                break;
+            }
+            if (BITS(4) != Z_DEFLATED) {
+                SET_BAD("unknown compression method");
+                break;
+            }
+            DROPBITS(4);
+            len = BITS(4) + 8;
+            if (state->wbits == 0)
+                state->wbits = len;
+            if (len > MAX_WBITS || len > state->wbits) {
+                SET_BAD("invalid window size");
+                break;
+            }
+            state->dmax = 1U << len;
+            state->flags = 0;               /* indicate zlib header */
+            Tracev((stderr, "inflate:   zlib header ok\n"));
+            strm->adler = state->check = ADLER32_INITIAL_VALUE;
+            state->mode = hold & 0x200 ? DICTID : TYPE;
+            INITBITS();
+            break;
+#ifdef GUNZIP
+
+        case FLAGS:
+            NEEDBITS(16);
+            state->flags = (int)(hold);
+            if ((state->flags & 0xff) != Z_DEFLATED) {
+                SET_BAD("unknown compression method");
+                break;
+            }
+            if (state->flags & 0xe000) {
+                SET_BAD("unknown header flags set");
+                break;
+            }
+            if (state->head != NULL)
+                state->head->text = (int)((hold >> 8) & 1);
+            if ((state->flags & 0x0200) && (state->wrap & 4))
+                CRC2(state->check, hold);
+            INITBITS();
+            state->mode = TIME;
+            Z_FALLTHROUGH;
+
+        case TIME:
+            NEEDBITS(32);
+            if (state->head != NULL)
+                state->head->time = hold;
+            if ((state->flags & 0x0200) && (state->wrap & 4))
+                CRC4(state->check, hold);
+            INITBITS();
+            state->mode = OS;
+            Z_FALLTHROUGH;
+
+        case OS:
+            NEEDBITS(16);
+            if (state->head != NULL) {
+                state->head->xflags = (int)(hold & 0xff);
+                state->head->os = (int)(hold >> 8);
+            }
+            if ((state->flags & 0x0200) && (state->wrap & 4))
+                CRC2(state->check, hold);
+            INITBITS();
+            state->mode = EXLEN;
+            Z_FALLTHROUGH;
+
+        case EXLEN:
+            if (state->flags & 0x0400) {
+                NEEDBITS(16);
+                state->length = (uint16_t)hold;
+                if (state->head != NULL)
+                    state->head->extra_len = (uint16_t)hold;
+                if ((state->flags & 0x0200) && (state->wrap & 4))
+                    CRC2(state->check, hold);
+                INITBITS();
+            } else if (state->head != NULL) {
+                state->head->extra = NULL;
+            }
+            state->mode = EXTRA;
+            Z_FALLTHROUGH;
+
+        case EXTRA:
+            if (state->flags & 0x0400) {
+                copy = state->length;
+                if (copy > have)
+                    copy = have;
+                if (copy) {
+                    if (state->head != NULL && state->head->extra != NULL) {
+                        len = state->head->extra_len - state->length;
+                        if (len < state->head->extra_max) {
+                            memcpy(state->head->extra + len, next,
+                                    len + copy > state->head->extra_max ?
+                                    state->head->extra_max - len : copy);
+                        }
+                    }
+                    if ((state->flags & 0x0200) && (state->wrap & 4)) {
+                        state->check = PREFIX(crc32)(state->check, next, copy);
+                    }
+                    have -= copy;
+                    next += copy;
+                    state->length -= copy;
+                }
+                if (state->length)
+                    goto inf_leave;
+            }
+            state->length = 0;
+            state->mode = NAME;
+            Z_FALLTHROUGH;
+
+        case NAME:
+            if (state->flags & 0x0800) {
+                if (have == 0) goto inf_leave;
+                copy = 0;
+                do {
+                    len = (unsigned)(next[copy++]);
+                    if (state->head != NULL && state->head->name != NULL && state->length < state->head->name_max)
+                        state->head->name[state->length++] = (unsigned char)len;
+                } while (len && copy < have);
+                if ((state->flags & 0x0200) && (state->wrap & 4))
+                    state->check = PREFIX(crc32)(state->check, next, copy);
+                have -= copy;
+                next += copy;
+                if (len)
+                    goto inf_leave;
+            } else if (state->head != NULL) {
+                state->head->name = NULL;
+            }
+            state->length = 0;
+            state->mode = COMMENT;
+            Z_FALLTHROUGH;
+
+        case COMMENT:
+            if (state->flags & 0x1000) {
+                if (have == 0) goto inf_leave;
+                copy = 0;
+                do {
+                    len = (unsigned)(next[copy++]);
+                    if (state->head != NULL && state->head->comment != NULL
+                        && state->length < state->head->comm_max)
+                        state->head->comment[state->length++] = (unsigned char)len;
+                } while (len && copy < have);
+                if ((state->flags & 0x0200) && (state->wrap & 4))
+                    state->check = PREFIX(crc32)(state->check, next, copy);
+                have -= copy;
+                next += copy;
+                if (len)
+                    goto inf_leave;
+            } else if (state->head != NULL) {
+                state->head->comment = NULL;
+            }
+            state->mode = HCRC;
+            Z_FALLTHROUGH;
+
+        case HCRC:
+            if (state->flags & 0x0200) {
+                NEEDBITS(16);
+                if ((state->wrap & 4) && hold != (state->check & 0xffff)) {
+                    SET_BAD("header crc mismatch");
+                    break;
+                }
+                INITBITS();
+            }
+            if (state->head != NULL) {
+                state->head->hcrc = (int)((state->flags >> 9) & 1);
+                state->head->done = 1;
+            }
+            /* compute crc32 checksum if not in raw mode */
+            if ((state->wrap & 4) && state->flags)
+                strm->adler = state->check = functable.crc32_fold_reset(&state->crc_fold);
+            state->mode = TYPE;
+            break;
+#endif
+        case DICTID:
+            NEEDBITS(32);
+            strm->adler = state->check = ZSWAP32(hold);
+            INITBITS();
+            state->mode = DICT;
+            Z_FALLTHROUGH;
+
+        case DICT:
+            if (state->havedict == 0) {
+                RESTORE();
+                return Z_NEED_DICT;
+            }
+            strm->adler = state->check = ADLER32_INITIAL_VALUE;
+            state->mode = TYPE;
+            Z_FALLTHROUGH;
+
+        case TYPE:
+            if (flush == Z_BLOCK || flush == Z_TREES)
+                goto inf_leave;
+            Z_FALLTHROUGH;
+
+        case TYPEDO:
+            /* determine and dispatch block type */
+            INFLATE_TYPEDO_HOOK(strm, flush);  /* hook for IBM Z DFLTCC */
+            if (state->last) {
+                BYTEBITS();
+                state->mode = CHECK;
+                break;
+            }
+            NEEDBITS(3);
+            state->last = BITS(1);
+            DROPBITS(1);
+            switch (BITS(2)) {
+            case 0:                             /* stored block */
+                Tracev((stderr, "inflate:     stored block%s\n", state->last ? " (last)" : ""));
+                state->mode = STORED;
+                break;
+            case 1:                             /* fixed block */
+                PREFIX(fixedtables)(state);
+                Tracev((stderr, "inflate:     fixed codes block%s\n", state->last ? " (last)" : ""));
+                state->mode = LEN_;             /* decode codes */
+                if (flush == Z_TREES) {
+                    DROPBITS(2);
+                    goto inf_leave;
+                }
+                break;
+            case 2:                             /* dynamic block */
+                Tracev((stderr, "inflate:     dynamic codes block%s\n", state->last ? " (last)" : ""));
+                state->mode = TABLE;
+                break;
+            case 3:
+                SET_BAD("invalid block type");
+            }
+            DROPBITS(2);
+            break;
+
+        case STORED:
+            /* get and verify stored block length */
+            BYTEBITS();                         /* go to byte boundary */
+            NEEDBITS(32);
+            if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) {
+                SET_BAD("invalid stored block lengths");
+                break;
+            }
+            state->length = (uint16_t)hold;
+            Tracev((stderr, "inflate:       stored length %u\n", state->length));
+            INITBITS();
+            state->mode = COPY_;
+            if (flush == Z_TREES)
+                goto inf_leave;
+            Z_FALLTHROUGH;
+
+        case COPY_:
+            state->mode = COPY;
+            Z_FALLTHROUGH;
+
+        case COPY:
+            /* copy stored block from input to output */
+            copy = state->length;
+            if (copy) {
+                copy = MIN(copy, have);
+                copy = MIN(copy, left);
+                if (copy == 0)
+                    goto inf_leave;
+                memcpy(put, next, copy);
+                have -= copy;
+                next += copy;
+                left -= copy;
+                put += copy;
+                state->length -= copy;
+                break;
+            }
+            Tracev((stderr, "inflate:       stored end\n"));
+            state->mode = TYPE;
+            break;
+
+        case TABLE:
+            /* get dynamic table entries descriptor */
+            NEEDBITS(14);
+            state->nlen = BITS(5) + 257;
+            DROPBITS(5);
+            state->ndist = BITS(5) + 1;
+            DROPBITS(5);
+            state->ncode = BITS(4) + 4;
+            DROPBITS(4);
+#ifndef PKZIP_BUG_WORKAROUND
+            if (state->nlen > 286 || state->ndist > 30) {
+                SET_BAD("too many length or distance symbols");
+                break;
+            }
+#endif
+            Tracev((stderr, "inflate:       table sizes ok\n"));
+            state->have = 0;
+            state->mode = LENLENS;
+            Z_FALLTHROUGH;
+
+        case LENLENS:
+            /* get code length code lengths (not a typo) */
+            while (state->have < state->ncode) {
+                NEEDBITS(3);
+                state->lens[order[state->have++]] = (uint16_t)BITS(3);
+                DROPBITS(3);
+            }
+            while (state->have < 19)
+                state->lens[order[state->have++]] = 0;
+            state->next = state->codes;
+            state->lencode = (const code *)(state->next);
+            state->lenbits = 7;
+            ret = zng_inflate_table(CODES, state->lens, 19, &(state->next), &(state->lenbits), state->work);
+            if (ret) {
+                SET_BAD("invalid code lengths set");
+                break;
+            }
+            Tracev((stderr, "inflate:       code lengths ok\n"));
+            state->have = 0;
+            state->mode = CODELENS;
+            Z_FALLTHROUGH;
+
+        case CODELENS:
+            /* get length and distance code code lengths */
+            while (state->have < state->nlen + state->ndist) {
+                for (;;) {
+                    here = state->lencode[BITS(state->lenbits)];
+                    if (here.bits <= bits) break;
+                    PULLBYTE();
+                }
+                if (here.val < 16) {
+                    DROPBITS(here.bits);
+                    state->lens[state->have++] = here.val;
+                } else {
+                    if (here.val == 16) {
+                        NEEDBITS(here.bits + 2);
+                        DROPBITS(here.bits);
+                        if (state->have == 0) {
+                            SET_BAD("invalid bit length repeat");
+                            break;
+                        }
+                        len = state->lens[state->have - 1];
+                        copy = 3 + BITS(2);
+                        DROPBITS(2);
+                    } else if (here.val == 17) {
+                        NEEDBITS(here.bits + 3);
+                        DROPBITS(here.bits);
+                        len = 0;
+                        copy = 3 + BITS(3);
+                        DROPBITS(3);
+                    } else {
+                        NEEDBITS(here.bits + 7);
+                        DROPBITS(here.bits);
+                        len = 0;
+                        copy = 11 + BITS(7);
+                        DROPBITS(7);
+                    }
+                    if (state->have + copy > state->nlen + state->ndist) {
+                        SET_BAD("invalid bit length repeat");
+                        break;
+                    }
+                    while (copy) {
+                        --copy;
+                        state->lens[state->have++] = (uint16_t)len;
+                    }
+                }
+            }
+
+            /* handle error breaks in while */
+            if (state->mode == BAD)
+                break;
+
+            /* check for end-of-block code (better have one) */
+            if (state->lens[256] == 0) {
+                SET_BAD("invalid code -- missing end-of-block");
+                break;
+            }
+
+            /* build code tables -- note: do not change the lenbits or distbits
+               values here (10 and 9) without reading the comments in inftrees.h
+               concerning the ENOUGH constants, which depend on those values */
+            state->next = state->codes;
+            state->lencode = (const code *)(state->next);
+            state->lenbits = 10;
+            ret = zng_inflate_table(LENS, state->lens, state->nlen, &(state->next), &(state->lenbits), state->work);
+            if (ret) {
+                SET_BAD("invalid literal/lengths set");
+                break;
+            }
+            state->distcode = (const code *)(state->next);
+            state->distbits = 9;
+            ret = zng_inflate_table(DISTS, state->lens + state->nlen, state->ndist,
+                            &(state->next), &(state->distbits), state->work);
+            if (ret) {
+                SET_BAD("invalid distances set");
+                break;
+            }
+            Tracev((stderr, "inflate:       codes ok\n"));
+            state->mode = LEN_;
+            if (flush == Z_TREES)
+                goto inf_leave;
+            Z_FALLTHROUGH;
+
+        case LEN_:
+            state->mode = LEN;
+            Z_FALLTHROUGH;
+
+        case LEN:
+            /* use inflate_fast() if we have enough input and output */
+            if (have >= INFLATE_FAST_MIN_HAVE && left >= INFLATE_FAST_MIN_LEFT) {
+                RESTORE();
+                functable.inflate_fast(strm, out);
+                LOAD();
+                if (state->mode == TYPE)
+                    state->back = -1;
+                break;
+            }
+            state->back = 0;
+
+            /* get a literal, length, or end-of-block code */
+            for (;;) {
+                here = state->lencode[BITS(state->lenbits)];
+                if (here.bits <= bits)
+                    break;
+                PULLBYTE();
+            }
+            if (here.op && (here.op & 0xf0) == 0) {
+                last = here;
+                for (;;) {
+                    here = state->lencode[last.val + (BITS(last.bits + last.op) >> last.bits)];
+                    if ((unsigned)last.bits + (unsigned)here.bits <= bits)
+                        break;
+                    PULLBYTE();
+                }
+                DROPBITS(last.bits);
+                state->back += last.bits;
+            }
+            DROPBITS(here.bits);
+            state->back += here.bits;
+            state->length = here.val;
+
+            /* process literal */
+            if ((int)(here.op) == 0) {
+                Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?
+                        "inflate:         literal '%c'\n" :
+                        "inflate:         literal 0x%02x\n", here.val));
+                state->mode = LIT;
+                break;
+            }
+
+            /* process end of block */
+            if (here.op & 32) {
+                Tracevv((stderr, "inflate:         end of block\n"));
+                state->back = -1;
+                state->mode = TYPE;
+                break;
+            }
+
+            /* invalid code */
+            if (here.op & 64) {
+                SET_BAD("invalid literal/length code");
+                break;
+            }
+
+            /* length code */
+            state->extra = (here.op & MAX_BITS);
+            state->mode = LENEXT;
+            Z_FALLTHROUGH;
+
+        case LENEXT:
+            /* get extra bits, if any */
+            if (state->extra) {
+                NEEDBITS(state->extra);
+                state->length += BITS(state->extra);
+                DROPBITS(state->extra);
+                state->back += state->extra;
+            }
+            Tracevv((stderr, "inflate:         length %u\n", state->length));
+            state->was = state->length;
+            state->mode = DIST;
+            Z_FALLTHROUGH;
+
+        case DIST:
+            /* get distance code */
+            for (;;) {
+                here = state->distcode[BITS(state->distbits)];
+                if (here.bits <= bits)
+                    break;
+                PULLBYTE();
+            }
+            if ((here.op & 0xf0) == 0) {
+                last = here;
+                for (;;) {
+                    here = state->distcode[last.val + (BITS(last.bits + last.op) >> last.bits)];
+                    if ((unsigned)last.bits + (unsigned)here.bits <= bits)
+                        break;
+                    PULLBYTE();
+                }
+                DROPBITS(last.bits);
+                state->back += last.bits;
+            }
+            DROPBITS(here.bits);
+            state->back += here.bits;
+            if (here.op & 64) {
+                SET_BAD("invalid distance code");
+                break;
+            }
+            state->offset = here.val;
+            state->extra = (here.op & MAX_BITS);
+            state->mode = DISTEXT;
+            Z_FALLTHROUGH;
+
+        case DISTEXT:
+            /* get distance extra bits, if any */
+            if (state->extra) {
+                NEEDBITS(state->extra);
+                state->offset += BITS(state->extra);
+                DROPBITS(state->extra);
+                state->back += state->extra;
+            }
+#ifdef INFLATE_STRICT
+            if (state->offset > state->dmax) {
+                SET_BAD("invalid distance too far back");
+                break;
+            }
+#endif
+            Tracevv((stderr, "inflate:         distance %u\n", state->offset));
+            state->mode = MATCH;
+            Z_FALLTHROUGH;
+
+        case MATCH:
+            /* copy match from window to output */
+            if (left == 0)
+                goto inf_leave;
+            copy = out - left;
+            if (state->offset > copy) {         /* copy from window */
+                copy = state->offset - copy;
+                if (copy > state->whave) {
+                    if (state->sane) {
+                        SET_BAD("invalid distance too far back");
+                        break;
+                    }
+#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
+                    Trace((stderr, "inflate.c too far\n"));
+                    copy -= state->whave;
+                    copy = MIN(copy, state->length);
+                    copy = MIN(copy, left);
+                    left -= copy;
+                    state->length -= copy;
+                    do {
+                        *put++ = 0;
+                    } while (--copy);
+                    if (state->length == 0)
+                        state->mode = LEN;
+                    break;
+#endif
+                }
+                if (copy > state->wnext) {
+                    copy -= state->wnext;
+                    from = state->window + (state->wsize - copy);
+                } else {
+                    from = state->window + (state->wnext - copy);
+                }
+                copy = MIN(copy, state->length);
+                copy = MIN(copy, left);
+
+                put = chunkcopy_safe(put, from, copy, put + left);
+            } else {
+                copy = MIN(state->length, left);
+
+                put = functable.chunkmemset_safe(put, state->offset, copy, left);
+            }
+            left -= copy;
+            state->length -= copy;
+            if (state->length == 0)
+                state->mode = LEN;
+            break;
+
+        case LIT:
+            if (left == 0)
+                goto inf_leave;
+            *put++ = (unsigned char)(state->length);
+            left--;
+            state->mode = LEN;
+            break;
+
+        case CHECK:
+            if (state->wrap) {
+                NEEDBITS(32);
+                out -= left;
+                strm->total_out += out;
+                state->total += out;
+
+                /* compute crc32 checksum if not in raw mode */
+                if (INFLATE_NEED_CHECKSUM(strm) && state->wrap & 4) {
+                    if (out) {
+                        inf_chksum(strm, put - out, out);
+                    }
+#ifdef GUNZIP
+                    if (state->flags)
+                        strm->adler = state->check = functable.crc32_fold_final(&state->crc_fold);
+#endif
+                }
+                out = left;
+                if ((state->wrap & 4) && (
+#ifdef GUNZIP
+                     state->flags ? hold :
+#endif
+                     ZSWAP32(hold)) != state->check) {
+                    SET_BAD("incorrect data check");
+                    break;
+                }
+                INITBITS();
+                Tracev((stderr, "inflate:   check matches trailer\n"));
+            }
+#ifdef GUNZIP
+            state->mode = LENGTH;
+            Z_FALLTHROUGH;
+
+        case LENGTH:
+            if (state->wrap && state->flags) {
+                NEEDBITS(32);
+                if ((state->wrap & 4) && hold != (state->total & 0xffffffff)) {
+                    SET_BAD("incorrect length check");
+                    break;
+                }
+                INITBITS();
+                Tracev((stderr, "inflate:   length matches trailer\n"));
+            }
+#endif
+            state->mode = DONE;
+            Z_FALLTHROUGH;
+
+        case DONE:
+            /* inflate stream terminated properly */
+            ret = Z_STREAM_END;
+            goto inf_leave;
+
+        case BAD:
+            ret = Z_DATA_ERROR;
+            goto inf_leave;
+
+        case MEM:
+            return Z_MEM_ERROR;
+
+        case SYNC:
+
+        default:                 /* can't happen, but makes compilers happy */
+            return Z_STREAM_ERROR;
+        }
+
+    /*
+       Return from inflate(), updating the total counts and the check value.
+       If there was no progress during the inflate() call, return a buffer
+       error.  Call updatewindow() to create and/or update the window state.
+       Note: a memory error from inflate() is non-recoverable.
+     */
+  inf_leave:
+    RESTORE();
+    uint32_t check_bytes = out - strm->avail_out;
+    if (INFLATE_NEED_UPDATEWINDOW(strm) &&
+            (state->wsize || (out != strm->avail_out && state->mode < BAD &&
+                 (state->mode < CHECK || flush != Z_FINISH)))) {
+        /* update sliding window with respective checksum if not in "raw" mode */
+        if (updatewindow(strm, strm->next_out, check_bytes, state->wrap & 4)) {
+            state->mode = MEM;
+            return Z_MEM_ERROR;
+        }
+    }
+    in -= strm->avail_in;
+    out -= strm->avail_out;
+    strm->total_in += in;
+    strm->total_out += out;
+    state->total += out;
+
+    strm->data_type = (int)state->bits + (state->last ? 64 : 0) +
+                      (state->mode == TYPE ? 128 : 0) + (state->mode == LEN_ || state->mode == COPY_ ? 256 : 0);
+    if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) {
+        /* when no sliding window is used, hash the output bytes if no CHECK state */
+        if (INFLATE_NEED_CHECKSUM(strm) && !state->wsize && flush == Z_FINISH) {
+            inf_chksum(strm, put - check_bytes, check_bytes);
+        }
+        ret = Z_BUF_ERROR;
+    }
+    return ret;
+}
+
+int32_t Z_EXPORT PREFIX(inflateEnd)(PREFIX3(stream) *strm) {
+    struct inflate_state *state;
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+    if (state->window != NULL)
+        ZFREE_WINDOW(strm, state->window);
+    ZFREE_STATE(strm, strm->state);
+    strm->state = NULL;
+    Tracev((stderr, "inflate: end\n"));
+    return Z_OK;
+}
+
+int32_t Z_EXPORT PREFIX(inflateGetDictionary)(PREFIX3(stream) *strm, uint8_t *dictionary, uint32_t *dictLength) {
+    struct inflate_state *state;
+
+    /* check state */
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+
+    INFLATE_GET_DICTIONARY_HOOK(strm, dictionary, dictLength);  /* hook for IBM Z DFLTCC */
+
+    /* copy dictionary */
+    if (state->whave && dictionary != NULL) {
+        memcpy(dictionary, state->window + state->wnext, state->whave - state->wnext);
+        memcpy(dictionary + state->whave - state->wnext, state->window, state->wnext);
+    }
+    if (dictLength != NULL)
+        *dictLength = state->whave;
+    return Z_OK;
+}
+
+int32_t Z_EXPORT PREFIX(inflateSetDictionary)(PREFIX3(stream) *strm, const uint8_t *dictionary, uint32_t dictLength) {
+    struct inflate_state *state;
+    unsigned long dictid;
+    int32_t ret;
+
+    /* check state */
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+    if (state->wrap != 0 && state->mode != DICT)
+        return Z_STREAM_ERROR;
+
+    /* check for correct dictionary identifier */
+    if (state->mode == DICT) {
+        dictid = functable.adler32(ADLER32_INITIAL_VALUE, dictionary, dictLength);
+        if (dictid != state->check)
+            return Z_DATA_ERROR;
+    }
+
+    INFLATE_SET_DICTIONARY_HOOK(strm, dictionary, dictLength);  /* hook for IBM Z DFLTCC */
+
+    /* copy dictionary to window using updatewindow(), which will amend the
+       existing dictionary if appropriate */
+    ret = updatewindow(strm, dictionary + dictLength, dictLength, 0);
+    if (ret) {
+        state->mode = MEM;
+        return Z_MEM_ERROR;
+    }
+    state->havedict = 1;
+    Tracev((stderr, "inflate:   dictionary set\n"));
+    return Z_OK;
+}
+
+int32_t Z_EXPORT PREFIX(inflateGetHeader)(PREFIX3(stream) *strm, PREFIX(gz_headerp) head) {
+    struct inflate_state *state;
+
+    /* check state */
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+    if ((state->wrap & 2) == 0)
+        return Z_STREAM_ERROR;
+
+    /* save header structure */
+    state->head = head;
+    head->done = 0;
+    return Z_OK;
+}
+
+/*
+   Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff.  Return when found
+   or when out of input.  When called, *have is the number of pattern bytes
+   found in order so far, in 0..3.  On return *have is updated to the new
+   state.  If on return *have equals four, then the pattern was found and the
+   return value is how many bytes were read including the last byte of the
+   pattern.  If *have is less than four, then the pattern has not been found
+   yet and the return value is len.  In the latter case, syncsearch() can be
+   called again with more data and the *have state.  *have is initialized to
+   zero for the first call.
+ */
+static uint32_t syncsearch(uint32_t *have, const uint8_t *buf, uint32_t len) {
+    uint32_t got, next;
+
+    got = *have;
+    next = 0;
+    while (next < len && got < 4) {
+        if ((int)(buf[next]) == (got < 2 ? 0 : 0xff))
+            got++;
+        else if (buf[next])
+            got = 0;
+        else
+            got = 4 - got;
+        next++;
+    }
+    *have = got;
+    return next;
+}
+
+int32_t Z_EXPORT PREFIX(inflateSync)(PREFIX3(stream) *strm) {
+    unsigned len;               /* number of bytes to look at or looked at */
+    int flags;                  /* temporary to save header status */
+    size_t in, out;             /* temporary to save total_in and total_out */
+    unsigned char buf[4];       /* to restore bit buffer to byte string */
+    struct inflate_state *state;
+
+    /* check parameters */
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+    if (strm->avail_in == 0 && state->bits < 8)
+        return Z_BUF_ERROR;
+
+    /* if first time, start search in bit buffer */
+    if (state->mode != SYNC) {
+        state->mode = SYNC;
+        state->hold <<= state->bits & 7;
+        state->bits -= state->bits & 7;
+        len = 0;
+        while (state->bits >= 8) {
+            buf[len++] = (unsigned char)(state->hold);
+            state->hold >>= 8;
+            state->bits -= 8;
+        }
+        state->have = 0;
+        syncsearch(&(state->have), buf, len);
+    }
+
+    /* search available input */
+    len = syncsearch(&(state->have), strm->next_in, strm->avail_in);
+    strm->avail_in -= len;
+    strm->next_in += len;
+    strm->total_in += len;
+
+    /* return no joy or set up to restart inflate() on a new block */
+    if (state->have != 4)
+        return Z_DATA_ERROR;
+    if (state->flags == -1)
+        state->wrap = 0;    /* if no header yet, treat as raw */
+    else
+        state->wrap &= ~4;  /* no point in computing a check value now */
+    flags = state->flags;
+    in = strm->total_in;
+    out = strm->total_out;
+    PREFIX(inflateReset)(strm);
+    strm->total_in = (z_uintmax_t)in; /* Can't use z_size_t here as it will overflow on 64-bit Windows */
+    strm->total_out = (z_uintmax_t)out;
+    state->flags = flags;
+    state->mode = TYPE;
+    return Z_OK;
+}
+
+/*
+   Returns true if inflate is currently at the end of a block generated by
+   Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP
+   implementation to provide an additional safety check. PPP uses
+   Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored
+   block. When decompressing, PPP checks that at the end of input packet,
+   inflate is waiting for these length bytes.
+ */
+int32_t Z_EXPORT PREFIX(inflateSyncPoint)(PREFIX3(stream) *strm) {
+    struct inflate_state *state;
+
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    INFLATE_SYNC_POINT_HOOK(strm);
+    state = (struct inflate_state *)strm->state;
+    return state->mode == STORED && state->bits == 0;
+}
+
+int32_t Z_EXPORT PREFIX(inflateCopy)(PREFIX3(stream) *dest, PREFIX3(stream) *source) {
+    struct inflate_state *state;
+    struct inflate_state *copy;
+
+    /* check input */
+    if (inflateStateCheck(source) || dest == NULL)
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state *)source->state;
+
+    /* allocate space */
+    copy = ZALLOC_INFLATE_STATE(source);
+    if (copy == NULL)
+        return Z_MEM_ERROR;
+
+    /* copy state */
+    memcpy((void *)dest, (void *)source, sizeof(PREFIX3(stream)));
+    ZCOPY_INFLATE_STATE(copy, state);
+    copy->strm = dest;
+    if (state->lencode >= state->codes && state->lencode <= state->codes + ENOUGH - 1) {
+        copy->lencode = copy->codes + (state->lencode - state->codes);
+        copy->distcode = copy->codes + (state->distcode - state->codes);
+    }
+    copy->next = copy->codes + (state->next - state->codes);
+
+    /* window */
+    copy->window = NULL;
+    if (state->window != NULL) {
+        if (PREFIX(inflate_ensure_window)(copy)) {
+            ZFREE_STATE(source, copy);
+            return Z_MEM_ERROR;
+        }
+        ZCOPY_WINDOW(copy->window, state->window, (size_t)state->wsize);
+    }
+
+    dest->state = (struct internal_state *)copy;
+    return Z_OK;
+}
+
+int32_t Z_EXPORT PREFIX(inflateUndermine)(PREFIX3(stream) *strm, int32_t subvert) {
+    struct inflate_state *state;
+
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
+    state->sane = !subvert;
+    return Z_OK;
+#else
+    Z_UNUSED(subvert);
+    state->sane = 1;
+    return Z_DATA_ERROR;
+#endif
+}
+
+int32_t Z_EXPORT PREFIX(inflateValidate)(PREFIX3(stream) *strm, int32_t check) {
+    struct inflate_state *state;
+
+    if (inflateStateCheck(strm))
+        return Z_STREAM_ERROR;
+    state = (struct inflate_state *)strm->state;
+    if (check && state->wrap)
+        state->wrap |= 4;
+    else
+        state->wrap &= ~4;
+    return Z_OK;
+}
+
+long Z_EXPORT PREFIX(inflateMark)(PREFIX3(stream) *strm) {
+    struct inflate_state *state;
+
+    if (inflateStateCheck(strm))
+        return -65536;
+    INFLATE_MARK_HOOK(strm);  /* hook for IBM Z DFLTCC */
+    state = (struct inflate_state *)strm->state;
+    return (long)(((unsigned long)((long)state->back)) << 16) +
+        (state->mode == COPY ? state->length :
+            (state->mode == MATCH ? state->was - state->length : 0));
+}
+
+unsigned long Z_EXPORT PREFIX(inflateCodesUsed)(PREFIX3(stream) *strm) {
+    struct inflate_state *state;
+    if (strm == NULL || strm->state == NULL)
+        return (unsigned long)-1;
+    state = (struct inflate_state *)strm->state;
+    return (unsigned long)(state->next - state->codes);
+}
diff --git a/3rdparty/zlib-ng/inflate.h b/3rdparty/zlib-ng/inflate.h
new file mode 100644
index 000000000000..39cdf5d683c3
--- /dev/null
+++ b/3rdparty/zlib-ng/inflate.h
@@ -0,0 +1,140 @@
+/* inflate.h -- internal inflate state definition
+ * Copyright (C) 1995-2019 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+#ifndef INFLATE_H_
+#define INFLATE_H_
+
+#include "adler32_fold.h"
+#include "crc32_fold.h"
+
+/* define NO_GZIP when compiling if you want to disable gzip header and trailer decoding by inflate().
+   NO_GZIP would be used to avoid linking in the crc code when it is not needed.
+   For shared libraries, gzip decoding should be left enabled. */
+#ifndef NO_GZIP
+#  define GUNZIP
+#endif
+
+/* Possible inflate modes between inflate() calls */
+typedef enum {
+    HEAD = 16180,   /* i: waiting for magic header */
+    FLAGS,      /* i: waiting for method and flags (gzip) */
+    TIME,       /* i: waiting for modification time (gzip) */
+    OS,         /* i: waiting for extra flags and operating system (gzip) */
+    EXLEN,      /* i: waiting for extra length (gzip) */
+    EXTRA,      /* i: waiting for extra bytes (gzip) */
+    NAME,       /* i: waiting for end of file name (gzip) */
+    COMMENT,    /* i: waiting for end of comment (gzip) */
+    HCRC,       /* i: waiting for header crc (gzip) */
+    DICTID,     /* i: waiting for dictionary check value */
+    DICT,       /* waiting for inflateSetDictionary() call */
+        TYPE,       /* i: waiting for type bits, including last-flag bit */
+        TYPEDO,     /* i: same, but skip check to exit inflate on new block */
+        STORED,     /* i: waiting for stored size (length and complement) */
+        COPY_,      /* i/o: same as COPY below, but only first time in */
+        COPY,       /* i/o: waiting for input or output to copy stored block */
+        TABLE,      /* i: waiting for dynamic block table lengths */
+        LENLENS,    /* i: waiting for code length code lengths */
+        CODELENS,   /* i: waiting for length/lit and distance code lengths */
+            LEN_,       /* i: same as LEN below, but only first time in */
+            LEN,        /* i: waiting for length/lit/eob code */
+            LENEXT,     /* i: waiting for length extra bits */
+            DIST,       /* i: waiting for distance code */
+            DISTEXT,    /* i: waiting for distance extra bits */
+            MATCH,      /* o: waiting for output space to copy string */
+            LIT,        /* o: waiting for output space to write literal */
+    CHECK,      /* i: waiting for 32-bit check value */
+    LENGTH,     /* i: waiting for 32-bit length (gzip) */
+    DONE,       /* finished check, done -- remain here until reset */
+    BAD,        /* got a data error -- remain here until reset */
+    MEM,        /* got an inflate() memory error -- remain here until reset */
+    SYNC        /* looking for synchronization bytes to restart inflate() */
+} inflate_mode;
+
+/*
+    State transitions between above modes -
+
+    (most modes can go to BAD or MEM on error -- not shown for clarity)
+
+    Process header:
+        HEAD -> (gzip) or (zlib) or (raw)
+        (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME -> COMMENT ->
+                  HCRC -> TYPE
+        (zlib) -> DICTID or TYPE
+        DICTID -> DICT -> TYPE
+        (raw) -> TYPEDO
+    Read deflate blocks:
+            TYPE -> TYPEDO -> STORED or TABLE or LEN_ or CHECK
+            STORED -> COPY_ -> COPY -> TYPE
+            TABLE -> LENLENS -> CODELENS -> LEN_
+            LEN_ -> LEN
+    Read deflate codes in fixed or dynamic block:
+                LEN -> LENEXT or LIT or TYPE
+                LENEXT -> DIST -> DISTEXT -> MATCH -> LEN
+                LIT -> LEN
+    Process trailer:
+        CHECK -> LENGTH -> DONE
+ */
+
+/* State maintained between inflate() calls -- approximately 7K bytes, not
+   including the allocated sliding window, which is up to 32K bytes. */
+struct inflate_state {
+    PREFIX3(stream) *strm;             /* pointer back to this zlib stream */
+    inflate_mode mode;          /* current inflate mode */
+    int last;                   /* true if processing last block */
+    int wrap;                   /* bit 0 true for zlib, bit 1 true for gzip,
+                                   bit 2 true to validate check value */
+    int havedict;               /* true if dictionary provided */
+    int flags;                  /* gzip header method and flags, 0 if zlib, or
+                                   -1 if raw or no header yet */
+    unsigned dmax;              /* zlib header max distance (INFLATE_STRICT) */
+    unsigned long check;        /* protected copy of check value */
+    unsigned long total;        /* protected copy of output count */
+    PREFIX(gz_headerp) head;    /* where to save gzip header information */
+        /* sliding window */
+    unsigned wbits;             /* log base 2 of requested window size */
+    uint32_t wsize;             /* window size or zero if not using window */
+    uint32_t whave;             /* valid bytes in the window */
+    uint32_t wnext;             /* window write index */
+    unsigned char *window;      /* allocated sliding window, if needed */
+
+    struct crc32_fold_s ALIGNED_(16) crc_fold;
+
+        /* bit accumulator */
+    uint32_t hold;              /* input bit accumulator */
+    unsigned bits;              /* number of bits in "in" */
+        /* for string and stored block copying */
+    uint32_t length;            /* literal or length of data to copy */
+    unsigned offset;            /* distance back to copy string from */
+        /* for table and code decoding */
+    unsigned extra;             /* extra bits needed */
+        /* fixed and dynamic code tables */
+    code const *lencode;        /* starting table for length/literal codes */
+    code const *distcode;       /* starting table for distance codes */
+    unsigned lenbits;           /* index bits for lencode */
+    unsigned distbits;          /* index bits for distcode */
+        /* dynamic table building */
+    unsigned ncode;             /* number of code length code lengths */
+    unsigned nlen;              /* number of length code lengths */
+    unsigned ndist;             /* number of distance code lengths */
+    uint32_t have;              /* number of code lengths in lens[] */
+    code *next;                 /* next available space in codes[] */
+    uint16_t lens[320];         /* temporary storage for code lengths */
+    uint16_t work[288];         /* work area for code table building */
+    code codes[ENOUGH];         /* space for code tables */
+    int sane;                   /* if false, allow invalid distance too far */
+    int back;                   /* bits back of last unprocessed length/lit */
+    unsigned was;               /* initial length of match */
+    uint32_t chunksize;         /* size of memory copying chunk */
+};
+
+int Z_INTERNAL PREFIX(inflate_ensure_window)(struct inflate_state *state);
+void Z_INTERNAL PREFIX(fixedtables)(struct inflate_state *state);
+
+#endif /* INFLATE_H_ */
diff --git a/3rdparty/zlib-ng/inflate_p.h b/3rdparty/zlib-ng/inflate_p.h
new file mode 100644
index 000000000000..eff73876daf2
--- /dev/null
+++ b/3rdparty/zlib-ng/inflate_p.h
@@ -0,0 +1,230 @@
+/* inflate_p.h -- Private inline functions and macros shared with more than one deflate method
+ *
+ */
+
+#ifndef INFLATE_P_H
+#define INFLATE_P_H
+
+#include <stdlib.h>
+
+/* Architecture-specific hooks. */
+#ifdef S390_DFLTCC_INFLATE
+#  include "arch/s390/dfltcc_inflate.h"
+#else
+/* Memory management for the inflate state. Useful for allocating arch-specific extension blocks. */
+#  define ZALLOC_INFLATE_STATE(strm) ((struct inflate_state *)ZALLOC(strm, 1, sizeof(struct inflate_state)))
+#  define ZFREE_STATE(strm, addr) ZFREE(strm, addr)
+#  define ZCOPY_INFLATE_STATE(dst, src) memcpy(dst, src, sizeof(struct inflate_state))
+/* Memory management for the window. Useful for allocation the aligned window. */
+#  define ZALLOC_WINDOW(strm, items, size) ZALLOC(strm, items, size)
+#  define ZCOPY_WINDOW(dest, src, n) memcpy(dest, src, n)
+#  define ZFREE_WINDOW(strm, addr) ZFREE(strm, addr)
+/* Invoked at the end of inflateResetKeep(). Useful for initializing arch-specific extension blocks. */
+#  define INFLATE_RESET_KEEP_HOOK(strm) do {} while (0)
+/* Invoked at the beginning of inflatePrime(). Useful for updating arch-specific buffers. */
+#  define INFLATE_PRIME_HOOK(strm, bits, value) do {} while (0)
+/* Invoked at the beginning of each block. Useful for plugging arch-specific inflation code. */
+#  define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0)
+/* Returns whether zlib-ng should compute a checksum. Set to 0 if arch-specific inflation code already does that. */
+#  define INFLATE_NEED_CHECKSUM(strm) 1
+/* Returns whether zlib-ng should update a window. Set to 0 if arch-specific inflation code already does that. */
+#  define INFLATE_NEED_UPDATEWINDOW(strm) 1
+/* Invoked at the beginning of inflateMark(). Useful for updating arch-specific pointers and offsets. */
+#  define INFLATE_MARK_HOOK(strm) do {} while (0)
+/* Invoked at the beginning of inflateSyncPoint(). Useful for performing arch-specific state checks. */
+#  define INFLATE_SYNC_POINT_HOOK(strm) do {} while (0)
+/* Invoked at the beginning of inflateSetDictionary(). Useful for checking arch-specific window data. */
+#  define INFLATE_SET_DICTIONARY_HOOK(strm, dict, dict_len) do {} while (0)
+/* Invoked at the beginning of inflateGetDictionary(). Useful for adjusting arch-specific window data. */
+#  define INFLATE_GET_DICTIONARY_HOOK(strm, dict, dict_len) do {} while (0)
+#endif
+
+/*
+ *   Macros shared by inflate() and inflateBack()
+ */
+
+/* check function to use adler32() for zlib or crc32() for gzip */
+#ifdef GUNZIP
+#  define UPDATE(check, buf, len) \
+    (state->flags ? PREFIX(crc32)(check, buf, len) : functable.adler32(check, buf, len))
+#else
+#  define UPDATE(check, buf, len) functable.adler32(check, buf, len)
+#endif
+
+/* check macros for header crc */
+#ifdef GUNZIP
+#  define CRC2(check, word) \
+    do { \
+        hbuf[0] = (unsigned char)(word); \
+        hbuf[1] = (unsigned char)((word) >> 8); \
+        check = PREFIX(crc32)(check, hbuf, 2); \
+    } while (0)
+
+#  define CRC4(check, word) \
+    do { \
+        hbuf[0] = (unsigned char)(word); \
+        hbuf[1] = (unsigned char)((word) >> 8); \
+        hbuf[2] = (unsigned char)((word) >> 16); \
+        hbuf[3] = (unsigned char)((word) >> 24); \
+        check = PREFIX(crc32)(check, hbuf, 4); \
+    } while (0)
+#endif
+
+/* Load registers with state in inflate() for speed */
+#define LOAD() \
+    do { \
+        put = strm->next_out; \
+        left = strm->avail_out; \
+        next = strm->next_in; \
+        have = strm->avail_in; \
+        hold = state->hold; \
+        bits = state->bits; \
+    } while (0)
+
+/* Restore state from registers in inflate() */
+#define RESTORE() \
+    do { \
+        strm->next_out = put; \
+        strm->avail_out = left; \
+        strm->next_in = (z_const unsigned char *)next; \
+        strm->avail_in = have; \
+        state->hold = hold; \
+        state->bits = bits; \
+    } while (0)
+
+/* Clear the input bit accumulator */
+#define INITBITS() \
+    do { \
+        hold = 0; \
+        bits = 0; \
+    } while (0)
+
+/* Ensure that there is at least n bits in the bit accumulator.  If there is
+   not enough available input to do that, then return from inflate()/inflateBack(). */
+#define NEEDBITS(n) \
+    do { \
+        while (bits < (unsigned)(n)) \
+            PULLBYTE(); \
+    } while (0)
+
+/* Return the low n bits of the bit accumulator (n < 16) */
+#define BITS(n) \
+    (hold & ((1U << (unsigned)(n)) - 1))
+
+/* Remove n bits from the bit accumulator */
+#define DROPBITS(n) \
+    do { \
+        hold >>= (n); \
+        bits -= (unsigned)(n); \
+    } while (0)
+
+/* Remove zero to seven bits as needed to go to a byte boundary */
+#define BYTEBITS() \
+    do { \
+        hold >>= bits & 7; \
+        bits -= bits & 7; \
+    } while (0)
+
+/* Set mode=BAD and prepare error message */
+#define SET_BAD(errmsg) \
+    do { \
+        state->mode = BAD; \
+        strm->msg = (char *)errmsg; \
+    } while (0)
+
+#define INFLATE_FAST_MIN_HAVE 15
+#define INFLATE_FAST_MIN_LEFT 260
+
+/* Load 64 bits from IN and place the bytes at offset BITS in the result. */
+static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
+    uint64_t chunk;
+    memcpy(&chunk, in, sizeof(chunk));
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+    return chunk << bits;
+#else
+    return ZSWAP64(chunk) << bits;
+#endif
+}
+
+/* Behave like chunkcopy, but avoid writing beyond of legal output. */
+static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, uint64_t len, uint8_t *safe) {
+    uint64_t safelen = (safe - out) + 1;
+    len = MIN(len, safelen);
+    int32_t olap_src = from >= out && from < out + len;
+    int32_t olap_dst = out >= from && out < from + len;
+    uint64_t tocopy;
+
+    /* For all cases without overlap, memcpy is ideal */
+    if (!(olap_src || olap_dst)) {
+        memcpy(out, from, (size_t)len);
+        return out + len;
+    }
+
+    /* Complete overlap: Source == destination */
+    if (out == from) {
+        return out + len;
+    }
+
+    /* We are emulating a self-modifying copy loop here. To do this in a way that doesn't produce undefined behavior,
+     * we have to get a bit clever. First if the overlap is such that src falls between dst and dst+len, we can do the
+     * initial bulk memcpy of the nonoverlapping region. Then, we can leverage the size of this to determine the safest
+     * atomic memcpy size we can pick such that we have non-overlapping regions. This effectively becomes a safe look
+     * behind or lookahead distance. */
+    uint64_t non_olap_size = llabs(from - out); // llabs vs labs for compatibility with windows
+
+    memcpy(out, from, (size_t)non_olap_size);
+    out += non_olap_size;
+    from += non_olap_size;
+    len -= non_olap_size;
+
+    /* So this doesn't give use a worst case scenario of function calls in a loop,
+     * we want to instead break this down into copy blocks of fixed lengths */
+    while (len) {
+        tocopy = MIN(non_olap_size, len);
+        len -= tocopy;
+
+        while (tocopy >= 32) {
+            memcpy(out, from, 32);
+            out += 32;
+            from += 32;
+            tocopy -= 32;
+        }
+
+        if (tocopy >= 16) {
+            memcpy(out, from, 16);
+            out += 16;
+            from += 16;
+            tocopy -= 16;
+        }
+
+        if (tocopy >= 8) {
+            memcpy(out, from, 8);
+            out += 8;
+            from += 8;
+            tocopy -= 8;
+        }
+
+        if (tocopy >= 4) {
+            memcpy(out, from, 4);
+            out += 4;
+            from += 4;
+            tocopy -= 4;
+        }
+
+        if (tocopy >= 2) {
+            memcpy(out, from, 2);
+            out += 2;
+            from += 2;
+            tocopy -= 2;
+        }
+
+        if (tocopy) {
+            *out++ = *from++;
+        }
+    }
+
+    return out;
+}
+
+#endif
diff --git a/3rdparty/zlib-ng/inftrees.c b/3rdparty/zlib-ng/inftrees.c
new file mode 100644
index 000000000000..423f7b461d7c
--- /dev/null
+++ b/3rdparty/zlib-ng/inftrees.c
@@ -0,0 +1,295 @@
+/* inftrees.c -- generate Huffman trees for efficient decoding
+ * Copyright (C) 1995-2023 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil.h"
+#include "inftrees.h"
+
+const char PREFIX(inflate_copyright)[] = " inflate 1.3.0 Copyright 1995-2023 Mark Adler ";
+/*
+  If you use the zlib library in a product, an acknowledgment is welcome
+  in the documentation of your product. If for some reason you cannot
+  include such an acknowledgment, I would appreciate that you keep this
+  copyright string in the executable of your product.
+ */
+
+/*
+   Build a set of tables to decode the provided canonical Huffman code.
+   The code lengths are lens[0..codes-1].  The result starts at *table,
+   whose indices are 0..2^bits-1.  work is a writable array of at least
+   lens shorts, which is used as a work area.  type is the type of code
+   to be generated, CODES, LENS, or DISTS.  On return, zero is success,
+   -1 is an invalid code, and +1 means that ENOUGH isn't enough.  table
+   on return points to the next available entry's address.  bits is the
+   requested root table index bits, and on return it is the actual root
+   table index bits.  It will differ if the request is greater than the
+   longest code or if it is less than the shortest code.
+ */
+int Z_INTERNAL zng_inflate_table(codetype type, uint16_t *lens, unsigned codes,
+                                code * *table, unsigned *bits, uint16_t *work) {
+    unsigned len;               /* a code's length in bits */
+    unsigned sym;               /* index of code symbols */
+    unsigned min, max;          /* minimum and maximum code lengths */
+    unsigned root;              /* number of index bits for root table */
+    unsigned curr;              /* number of index bits for current table */
+    unsigned drop;              /* code bits to drop for sub-table */
+    int left;                   /* number of prefix codes available */
+    unsigned used;              /* code entries in table used */
+    unsigned huff;              /* Huffman code */
+    unsigned incr;              /* for incrementing code, index */
+    unsigned fill;              /* index for replicating entries */
+    unsigned low;               /* low bits for current root entry */
+    unsigned mask;              /* mask for low root bits */
+    code here;                  /* table entry for duplication */
+    code *next;                 /* next available space in table */
+    const uint16_t *base;       /* base value table to use */
+    const uint16_t *extra;      /* extra bits table to use */
+    unsigned match;             /* use base and extra for symbol >= match */
+    uint16_t count[MAX_BITS+1];  /* number of codes of each length */
+    uint16_t offs[MAX_BITS+1];   /* offsets in table for each length */
+    static const uint16_t lbase[31] = { /* Length codes 257..285 base */
+        3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+        35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
+    static const uint16_t lext[31] = { /* Length codes 257..285 extra */
+        16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
+        19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 77, 202};
+    static const uint16_t dbase[32] = { /* Distance codes 0..29 base */
+        1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
+        257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
+        8193, 12289, 16385, 24577, 0, 0};
+    static const uint16_t dext[32] = { /* Distance codes 0..29 extra */
+        16, 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
+        23, 23, 24, 24, 25, 25, 26, 26, 27, 27,
+        28, 28, 29, 29, 64, 64};
+
+    /*
+       Process a set of code lengths to create a canonical Huffman code.  The
+       code lengths are lens[0..codes-1].  Each length corresponds to the
+       symbols 0..codes-1.  The Huffman code is generated by first sorting the
+       symbols by length from short to long, and retaining the symbol order
+       for codes with equal lengths.  Then the code starts with all zero bits
+       for the first code of the shortest length, and the codes are integer
+       increments for the same length, and zeros are appended as the length
+       increases.  For the deflate format, these bits are stored backwards
+       from their more natural integer increment ordering, and so when the
+       decoding tables are built in the large loop below, the integer codes
+       are incremented backwards.
+
+       This routine assumes, but does not check, that all of the entries in
+       lens[] are in the range 0..MAXBITS.  The caller must assure this.
+       1..MAXBITS is interpreted as that code length.  zero means that that
+       symbol does not occur in this code.
+
+       The codes are sorted by computing a count of codes for each length,
+       creating from that a table of starting indices for each length in the
+       sorted table, and then entering the symbols in order in the sorted
+       table.  The sorted table is work[], with that space being provided by
+       the caller.
+
+       The length counts are used for other purposes as well, i.e. finding
+       the minimum and maximum length codes, determining if there are any
+       codes at all, checking for a valid set of lengths, and looking ahead
+       at length counts to determine sub-table sizes when building the
+       decoding tables.
+     */
+
+    /* accumulate lengths for codes (assumes lens[] all in 0..MAXBITS) */
+    for (len = 0; len <= MAX_BITS; len++)
+        count[len] = 0;
+    for (sym = 0; sym < codes; sym++)
+        count[lens[sym]]++;
+
+    /* bound code lengths, force root to be within code lengths */
+    root = *bits;
+    for (max = MAX_BITS; max >= 1; max--)
+        if (count[max] != 0) break;
+    root = MIN(root, max);
+    if (UNLIKELY(max == 0)) {           /* no symbols to code at all */
+        here.op = (unsigned char)64;    /* invalid code marker */
+        here.bits = (unsigned char)1;
+        here.val = (uint16_t)0;
+        *(*table)++ = here;             /* make a table to force an error */
+        *(*table)++ = here;
+        *bits = 1;
+        return 0;     /* no symbols, but wait for decoding to report error */
+    }
+    for (min = 1; min < max; min++)
+        if (count[min] != 0) break;
+    root = MAX(root, min);
+
+    /* check for an over-subscribed or incomplete set of lengths */
+    left = 1;
+    for (len = 1; len <= MAX_BITS; len++) {
+        left <<= 1;
+        left -= count[len];
+        if (left < 0) return -1;        /* over-subscribed */
+    }
+    if (left > 0 && (type == CODES || max != 1))
+        return -1;                      /* incomplete set */
+
+    /* generate offsets into symbol table for each length for sorting */
+    offs[1] = 0;
+    for (len = 1; len < MAX_BITS; len++)
+        offs[len + 1] = offs[len] + count[len];
+
+    /* sort symbols by length, by symbol order within each length */
+    for (sym = 0; sym < codes; sym++)
+        if (lens[sym] != 0) work[offs[lens[sym]]++] = (uint16_t)sym;
+
+    /*
+       Create and fill in decoding tables.  In this loop, the table being
+       filled is at next and has curr index bits.  The code being used is huff
+       with length len.  That code is converted to an index by dropping drop
+       bits off of the bottom.  For codes where len is less than drop + curr,
+       those top drop + curr - len bits are incremented through all values to
+       fill the table with replicated entries.
+
+       root is the number of index bits for the root table.  When len exceeds
+       root, sub-tables are created pointed to by the root entry with an index
+       of the low root bits of huff.  This is saved in low to check for when a
+       new sub-table should be started.  drop is zero when the root table is
+       being filled, and drop is root when sub-tables are being filled.
+
+       When a new sub-table is needed, it is necessary to look ahead in the
+       code lengths to determine what size sub-table is needed.  The length
+       counts are used for this, and so count[] is decremented as codes are
+       entered in the tables.
+
+       used keeps track of how many table entries have been allocated from the
+       provided *table space.  It is checked for LENS and DIST tables against
+       the constants ENOUGH_LENS and ENOUGH_DISTS to guard against changes in
+       the initial root table size constants.  See the comments in inftrees.h
+       for more information.
+
+       sym increments through all symbols, and the loop terminates when
+       all codes of length max, i.e. all codes, have been processed.  This
+       routine permits incomplete codes, so another loop after this one fills
+       in the rest of the decoding tables with invalid code markers.
+     */
+
+    /* set up for code type */
+    switch (type) {
+    case CODES:
+        base = extra = work;    /* dummy value--not used */
+        match = 20;
+        break;
+    case LENS:
+        base = lbase;
+        extra = lext;
+        match = 257;
+        break;
+    default:    /* DISTS */
+        base = dbase;
+        extra = dext;
+        match = 0;
+    }
+
+    /* initialize state for loop */
+    huff = 0;                   /* starting code */
+    sym = 0;                    /* starting code symbol */
+    len = min;                  /* starting code length */
+    next = *table;              /* current table to fill in */
+    curr = root;                /* current table index bits */
+    drop = 0;                   /* current bits to drop from code for index */
+    low = (unsigned)(-1);       /* trigger new sub-table when len > root */
+    used = 1U << root;          /* use root table entries */
+    mask = used - 1;            /* mask for comparing low */
+
+    /* check available table space */
+    if ((type == LENS && used > ENOUGH_LENS) ||
+        (type == DISTS && used > ENOUGH_DISTS))
+        return 1;
+
+    /* process all codes and make table entries */
+    for (;;) {
+        /* create table entry */
+        here.bits = (unsigned char)(len - drop);
+        if (LIKELY(work[sym] >= match)) {
+            here.op = (unsigned char)(extra[work[sym] - match]);
+            here.val = base[work[sym] - match];
+        } else if (work[sym] + 1U < match) {
+            here.op = (unsigned char)0;
+            here.val = work[sym];
+        } else {
+            here.op = (unsigned char)(32 + 64);         /* end of block */
+            here.val = 0;
+        }
+
+        /* replicate for those indices with low len bits equal to huff */
+        incr = 1U << (len - drop);
+        fill = 1U << curr;
+        min = fill;                 /* save offset to next table */
+        do {
+            fill -= incr;
+            next[(huff >> drop) + fill] = here;
+        } while (fill != 0);
+
+        /* backwards increment the len-bit code huff */
+        incr = 1U << (len - 1);
+        while (huff & incr)
+            incr >>= 1;
+        if (incr != 0) {
+            huff &= incr - 1;
+            huff += incr;
+        } else {
+            huff = 0;
+        }
+
+        /* go to next symbol, update count, len */
+        sym++;
+        if (--(count[len]) == 0) {
+            if (len == max)
+                break;
+            len = lens[work[sym]];
+        }
+
+        /* create new sub-table if needed */
+        if (len > root && (huff & mask) != low) {
+            /* if first time, transition to sub-tables */
+            if (drop == 0)
+                drop = root;
+
+            /* increment past last table */
+            next += min;            /* here min is 1 << curr */
+
+            /* determine length of next table */
+            curr = len - drop;
+            left = (int)(1 << curr);
+            while (curr + drop < max) {
+                left -= count[curr + drop];
+                if (left <= 0)
+                    break;
+                curr++;
+                left <<= 1;
+            }
+
+            /* check for enough space */
+            used += 1U << curr;
+            if ((type == LENS && used > ENOUGH_LENS) || (type == DISTS && used > ENOUGH_DISTS))
+                return 1;
+
+            /* point entry in root table to sub-table */
+            low = huff & mask;
+            (*table)[low].op = (unsigned char)curr;
+            (*table)[low].bits = (unsigned char)root;
+            (*table)[low].val = (uint16_t)(next - *table);
+        }
+    }
+
+    /* fill in remaining table entry if code is incomplete (guaranteed to have
+       at most one remaining entry, since if the code is incomplete, the
+       maximum code length that was allowed to get this far is one bit) */
+    if (UNLIKELY(huff != 0)) {
+        here.op = (unsigned char)64;            /* invalid code marker */
+        here.bits = (unsigned char)(len - drop);
+        here.val = (uint16_t)0;
+        next[huff] = here;
+    }
+
+    /* set return parameters */
+    *table += used;
+    *bits = root;
+    return 0;
+}
diff --git a/3rdparty/zlib-ng/inftrees.h b/3rdparty/zlib-ng/inftrees.h
new file mode 100644
index 000000000000..ad2be151f2c0
--- /dev/null
+++ b/3rdparty/zlib-ng/inftrees.h
@@ -0,0 +1,66 @@
+#ifndef INFTREES_H_
+#define INFTREES_H_
+
+/* inftrees.h -- header to use inftrees.c
+ * Copyright (C) 1995-2022 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+/* Structure for decoding tables.  Each entry provides either the
+   information needed to do the operation requested by the code that
+   indexed that table entry, or it provides a pointer to another
+   table that indexes more bits of the code.  op indicates whether
+   the entry is a pointer to another table, a literal, a length or
+   distance, an end-of-block, or an invalid code.  For a table
+   pointer, the low four bits of op is the number of index bits of
+   that table.  For a length or distance, the low four bits of op
+   is the number of extra bits to get after the code.  bits is
+   the number of bits in this code or part of the code to drop off
+   of the bit buffer.  val is the actual byte to output in the case
+   of a literal, the base length or distance, or the offset from
+   the current table to the next table.  Each entry is four bytes. */
+typedef struct {
+    unsigned char op;         /* operation, extra bits, table bits */
+    unsigned char bits;       /* bits in this part of the code */
+    uint16_t val;             /* offset in table or code value */
+} code;
+
+/* op values as set by inflate_table():
+    00000000 - literal
+    0000tttt - table link, tttt != 0 is the number of table index bits
+    0001eeee - length or distance, eeee is the number of extra bits
+    01100000 - end of block
+    01000000 - invalid code
+ */
+
+/* Maximum size of the dynamic table.  The maximum number of code structures is
+   1924, which is the sum of 1332 for literal/length codes and 592 for distance
+   codes.  These values were found by exhaustive searches using the program
+   examples/enough.c found in the zlib distributions.  The arguments to that
+   program are the number of symbols, the initial root table size, and the
+   maximum bit length of a code.  "enough 286 10 15" for literal/length codes
+   returns 1332, and "enough 30 9 15" for distance codes returns 592.
+   The initial root table size (10 or 9) is found in the fifth argument of the
+   inflate_table() calls in inflate.c and infback.c.  If the root table size is
+   changed, then these maximum sizes would be need to be recalculated and
+   updated. */
+#define ENOUGH_LENS 1332
+#define ENOUGH_DISTS 592
+#define ENOUGH (ENOUGH_LENS+ENOUGH_DISTS)
+
+/* Type of code to build for inflate_table() */
+typedef enum {
+    CODES,
+    LENS,
+    DISTS
+} codetype;
+
+int Z_INTERNAL zng_inflate_table (codetype type, uint16_t *lens, unsigned codes,
+                                  code * *table, unsigned *bits, uint16_t *work);
+
+#endif /* INFTREES_H_ */
diff --git a/3rdparty/zlib-ng/insert_string.c b/3rdparty/zlib-ng/insert_string.c
new file mode 100644
index 000000000000..cfe39837f86a
--- /dev/null
+++ b/3rdparty/zlib-ng/insert_string.c
@@ -0,0 +1,21 @@
+/* insert_string.c -- insert_string integer hash variant
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ */
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#define HASH_SLIDE           16
+
+#define HASH_CALC(s, h, val) h = ((val * 2654435761U) >> HASH_SLIDE);
+#define HASH_CALC_VAR        h
+#define HASH_CALC_VAR_INIT   uint32_t h = 0
+
+#define UPDATE_HASH          update_hash_c
+#define INSERT_STRING        insert_string_c
+#define QUICK_INSERT_STRING  quick_insert_string_c
+
+#include "insert_string_tpl.h"
diff --git a/3rdparty/zlib-ng/insert_string_roll.c b/3rdparty/zlib-ng/insert_string_roll.c
new file mode 100644
index 000000000000..dfea347bccb7
--- /dev/null
+++ b/3rdparty/zlib-ng/insert_string_roll.c
@@ -0,0 +1,24 @@
+/* insert_string_roll.c -- insert_string rolling hash variant
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ */
+
+#include "zbuild.h"
+#include "deflate.h"
+
+#define HASH_SLIDE           5
+
+#define HASH_CALC(s, h, val) h = ((h << HASH_SLIDE) ^ ((uint8_t)val))
+#define HASH_CALC_VAR        s->ins_h
+#define HASH_CALC_VAR_INIT
+#define HASH_CALC_READ       val = strstart[0]
+#define HASH_CALC_MASK       (32768u - 1u)
+#define HASH_CALC_OFFSET     (STD_MIN_MATCH-1)
+
+#define UPDATE_HASH          update_hash_roll
+#define INSERT_STRING        insert_string_roll
+#define QUICK_INSERT_STRING  quick_insert_string_roll
+
+#include "insert_string_tpl.h"
diff --git a/3rdparty/zlib-ng/insert_string_tpl.h b/3rdparty/zlib-ng/insert_string_tpl.h
new file mode 100644
index 000000000000..c84617730ac3
--- /dev/null
+++ b/3rdparty/zlib-ng/insert_string_tpl.h
@@ -0,0 +1,108 @@
+#ifndef INSERT_STRING_H_
+#define INSERT_STRING_H_
+
+/* insert_string.h -- Private insert_string functions shared with more than
+ *                    one insert string implementation
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Authors:
+ *  Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *  Jim Guilford    <james.guilford@intel.com>
+ *  Vinodh Gopal    <vinodh.gopal@intel.com>
+ *  Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *  Jim Kukunas     <james.t.kukunas@linux.intel.com>
+ *
+ * Portions are Copyright (C) 2016 12Sided Technology, LLC.
+ * Author:
+ *  Phil Vachon     <pvachon@12sidedtech.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ */
+
+#ifndef HASH_CALC_OFFSET
+#  define HASH_CALC_OFFSET 0
+#endif
+#ifndef HASH_CALC_MASK
+#  define HASH_CALC_MASK HASH_MASK
+#endif
+#ifndef HASH_CALC_READ
+#  if BYTE_ORDER == LITTLE_ENDIAN
+#    define HASH_CALC_READ \
+        memcpy(&val, strstart, sizeof(val));
+#  else
+#    define HASH_CALC_READ \
+        val  = ((uint32_t)(strstart[0])); \
+        val |= ((uint32_t)(strstart[1]) << 8); \
+        val |= ((uint32_t)(strstart[2]) << 16); \
+        val |= ((uint32_t)(strstart[3]) << 24);
+#  endif
+#endif
+
+/* ===========================================================================
+ * Update a hash value with the given input byte
+ * IN  assertion: all calls to UPDATE_HASH are made with consecutive
+ *    input characters, so that a running hash key can be computed from the
+ *    previous key instead of complete recalculation each time.
+ */
+Z_INTERNAL uint32_t UPDATE_HASH(deflate_state *const s, uint32_t h, uint32_t val) {
+    (void)s;
+    HASH_CALC(s, h, val);
+    return h & HASH_CALC_MASK;
+}
+
+/* ===========================================================================
+ * Quick insert string str in the dictionary and set match_head to the previous head
+ * of the hash chain (the most recent string with same hash key). Return
+ * the previous length of the hash chain.
+ */
+Z_INTERNAL Pos QUICK_INSERT_STRING(deflate_state *const s, uint32_t str) {
+    Pos head;
+    uint8_t *strstart = s->window + str + HASH_CALC_OFFSET;
+    uint32_t val, hm;
+
+    HASH_CALC_VAR_INIT;
+    HASH_CALC_READ;
+    HASH_CALC(s, HASH_CALC_VAR, val);
+    HASH_CALC_VAR &= HASH_CALC_MASK;
+    hm = HASH_CALC_VAR;
+
+    head = s->head[hm];
+    if (LIKELY(head != str)) {
+        s->prev[str & s->w_mask] = head;
+        s->head[hm] = (Pos)str;
+    }
+    return head;
+}
+
+/* ===========================================================================
+ * Insert string str in the dictionary and set match_head to the previous head
+ * of the hash chain (the most recent string with same hash key). Return
+ * the previous length of the hash chain.
+ * IN  assertion: all calls to INSERT_STRING are made with consecutive
+ *    input characters and the first STD_MIN_MATCH bytes of str are valid
+ *    (except for the last STD_MIN_MATCH-1 bytes of the input file).
+ */
+Z_INTERNAL void INSERT_STRING(deflate_state *const s, uint32_t str, uint32_t count) {
+    uint8_t *strstart = s->window + str + HASH_CALC_OFFSET;
+    uint8_t *strend = strstart + count;
+
+    for (Pos idx = (Pos)str; strstart < strend; idx++, strstart++) {
+        uint32_t val, hm;
+
+        HASH_CALC_VAR_INIT;
+        HASH_CALC_READ;
+        HASH_CALC(s, HASH_CALC_VAR, val);
+        HASH_CALC_VAR &= HASH_CALC_MASK;
+        hm = HASH_CALC_VAR;
+
+        Pos head = s->head[hm];
+        if (LIKELY(head != idx)) {
+            s->prev[idx & s->w_mask] = head;
+            s->head[hm] = idx;
+        }
+    }
+}
+#endif
diff --git a/3rdparty/zlib-ng/match_tpl.h b/3rdparty/zlib-ng/match_tpl.h
new file mode 100644
index 000000000000..d076798520ee
--- /dev/null
+++ b/3rdparty/zlib-ng/match_tpl.h
@@ -0,0 +1,289 @@
+/* match_tpl.h -- find longest match template for compare256 variants
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Portions copyright (C) 2014-2021 Konstantin Nosov
+ *  Fast-zlib optimized longest_match
+ *  https://github.com/gildor2/fast_zlib
+ */
+
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "deflate.h"
+#include "functable.h"
+
+#ifndef MATCH_TPL_H
+#define MATCH_TPL_H
+
+#define EARLY_EXIT_TRIGGER_LEVEL 5
+
+#endif
+
+/* Set match_start to the longest match starting at the given string and
+ * return its length. Matches shorter or equal to prev_length are discarded,
+ * in which case the result is equal to prev_length and match_start is garbage.
+ *
+ * IN assertions: cur_match is the head of the hash chain for the current
+ * string (strstart) and its distance is <= MAX_DIST, and prev_length >=1
+ * OUT assertion: the match length is not greater than s->lookahead
+ */
+Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
+    unsigned int strstart = s->strstart;
+    const unsigned wmask = s->w_mask;
+    unsigned char *window = s->window;
+    unsigned char *scan = window + strstart;
+    Z_REGISTER unsigned char *mbase_start = window;
+    Z_REGISTER unsigned char *mbase_end;
+    const Pos *prev = s->prev;
+    Pos limit;
+#ifdef LONGEST_MATCH_SLOW
+    Pos limit_base;
+#else
+    int32_t early_exit;
+#endif
+    uint32_t chain_length, nice_match, best_len, offset;
+    uint32_t lookahead = s->lookahead;
+    Pos match_offset = 0;
+#ifdef UNALIGNED_OK
+    uint8_t scan_start[8];
+#endif
+    uint8_t scan_end[8];
+
+#define GOTO_NEXT_CHAIN \
+    if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \
+        continue; \
+    return best_len;
+
+    /* The code is optimized for STD_MAX_MATCH-2 multiple of 16. */
+    Assert(STD_MAX_MATCH == 258, "Code too clever");
+
+    best_len = s->prev_length ? s->prev_length : STD_MIN_MATCH-1;
+
+    /* Calculate read offset which should only extend an extra byte
+     * to find the next best match length.
+     */
+    offset = best_len-1;
+#ifdef UNALIGNED_OK
+    if (best_len >= sizeof(uint32_t)) {
+        offset -= 2;
+#ifdef UNALIGNED64_OK
+        if (best_len >= sizeof(uint64_t))
+            offset -= 4;
+#endif
+    }
+#endif
+
+#ifdef UNALIGNED64_OK
+    memcpy(scan_start, scan, sizeof(uint64_t));
+    memcpy(scan_end, scan+offset, sizeof(uint64_t));
+#elif defined(UNALIGNED_OK)
+    memcpy(scan_start, scan, sizeof(uint32_t));
+    memcpy(scan_end, scan+offset, sizeof(uint32_t));
+#else
+    scan_end[0] = *(scan+offset);
+    scan_end[1] = *(scan+offset+1);
+#endif
+    mbase_end  = (mbase_start+offset);
+
+    /* Do not waste too much time if we already have a good match */
+    chain_length = s->max_chain_length;
+    if (best_len >= s->good_match)
+        chain_length >>= 2;
+    nice_match = (uint32_t)s->nice_match;
+
+    /* Stop when cur_match becomes <= limit. To simplify the code,
+     * we prevent matches with the string of window index 0
+     */
+    limit = strstart > MAX_DIST(s) ? (Pos)(strstart - MAX_DIST(s)) : 0;
+#ifdef LONGEST_MATCH_SLOW
+    limit_base = limit;
+    if (best_len >= STD_MIN_MATCH) {
+        /* We're continuing search (lazy evaluation). */
+        uint32_t i, hash;
+        Pos pos;
+
+        /* Find a most distant chain starting from scan with index=1 (index=0 corresponds
+         * to cur_match). We cannot use s->prev[strstart+1,...] immediately, because
+         * these strings are not yet inserted into the hash table.
+         */
+        hash = s->update_hash(s, 0, scan[1]);
+        hash = s->update_hash(s, hash, scan[2]);
+
+        for (i = 3; i <= best_len; i++) {
+            hash = s->update_hash(s, hash, scan[i]);
+
+            /* If we're starting with best_len >= 3, we can use offset search. */
+            pos = s->head[hash];
+            if (pos < cur_match) {
+                match_offset = (Pos)(i - 2);
+                cur_match = pos;
+            }
+        }
+
+        /* Update offset-dependent variables */
+        limit = limit_base+match_offset;
+        if (cur_match <= limit)
+            goto break_matching;
+        mbase_start -= match_offset;
+        mbase_end -= match_offset;
+    }
+#else
+    early_exit = s->level < EARLY_EXIT_TRIGGER_LEVEL;
+#endif
+    Assert((unsigned long)strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead");
+    for (;;) {
+        if (cur_match >= strstart)
+            break;
+
+        /* Skip to next match if the match length cannot increase or if the match length is
+         * less than 2. Note that the checks below for insufficient lookahead only occur
+         * occasionally for performance reasons.
+         * Therefore uninitialized memory will be accessed and conditional jumps will be made
+         * that depend on those values. However the length of the match is limited to the
+         * lookahead, so the output of deflate is not affected by the uninitialized values.
+         */
+#ifdef UNALIGNED_OK
+        if (best_len < sizeof(uint32_t)) {
+            for (;;) {
+                if (zng_memcmp_2(mbase_end+cur_match, scan_end) == 0 &&
+                    zng_memcmp_2(mbase_start+cur_match, scan_start) == 0)
+                    break;
+                GOTO_NEXT_CHAIN;
+            }
+#  ifdef UNALIGNED64_OK
+        } else if (best_len >= sizeof(uint64_t)) {
+            for (;;) {
+                if (zng_memcmp_8(mbase_end+cur_match, scan_end) == 0 &&
+                    zng_memcmp_8(mbase_start+cur_match, scan_start) == 0)
+                    break;
+                GOTO_NEXT_CHAIN;
+            }
+#  endif
+        } else {
+            for (;;) {
+                if (zng_memcmp_4(mbase_end+cur_match, scan_end) == 0 &&
+                    zng_memcmp_4(mbase_start+cur_match, scan_start) == 0)
+                    break;
+                GOTO_NEXT_CHAIN;
+            }
+        }
+#else
+        for (;;) {
+            if (mbase_end[cur_match] == scan_end[0] && mbase_end[cur_match+1] == scan_end[1] &&
+                mbase_start[cur_match] == scan[0] && mbase_start[cur_match+1] == scan[1])
+                break;
+            GOTO_NEXT_CHAIN;
+        }
+#endif
+        uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2;
+        Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan");
+
+        if (len > best_len) {
+            uint32_t match_start = cur_match - match_offset;
+            s->match_start = match_start;
+
+            /* Do not look for matches beyond the end of the input. */
+            if (len > lookahead)
+                return lookahead;
+            best_len = len;
+            if (best_len >= nice_match)
+                return best_len;
+
+            offset = best_len-1;
+#ifdef UNALIGNED_OK
+            if (best_len >= sizeof(uint32_t)) {
+                offset -= 2;
+#ifdef UNALIGNED64_OK
+                if (best_len >= sizeof(uint64_t))
+                    offset -= 4;
+#endif
+            }
+#endif
+
+#ifdef UNALIGNED64_OK
+            memcpy(scan_end, scan+offset, sizeof(uint64_t));
+#elif defined(UNALIGNED_OK)
+            memcpy(scan_end, scan+offset, sizeof(uint32_t));
+#else
+            scan_end[0] = *(scan+offset);
+            scan_end[1] = *(scan+offset+1);
+#endif
+
+#ifdef LONGEST_MATCH_SLOW
+            /* Look for a better string offset */
+            if (UNLIKELY(len > STD_MIN_MATCH && match_start + len < strstart)) {
+                Pos pos, next_pos;
+                uint32_t i, hash;
+                unsigned char *scan_endstr;
+
+                /* Go back to offset 0 */
+                cur_match -= match_offset;
+                match_offset = 0;
+                next_pos = cur_match;
+                for (i = 0; i <= len - STD_MIN_MATCH; i++) {
+                    pos = prev[(cur_match + i) & wmask];
+                    if (pos < next_pos) {
+                        /* Hash chain is more distant, use it */
+                        if (pos <= limit_base + i)
+                            goto break_matching;
+                        next_pos = pos;
+                        match_offset = (Pos)i;
+                    }
+                }
+                /* Switch cur_match to next_pos chain */
+                cur_match = next_pos;
+
+                /* Try hash head at len-(STD_MIN_MATCH-1) position to see if we could get
+                 * a better cur_match at the end of string. Using (STD_MIN_MATCH-1) lets
+                 * us include one more byte into hash - the byte which will be checked
+                 * in main loop now, and which allows to grow match by 1.
+                 */
+                scan_endstr = scan + len - (STD_MIN_MATCH+1);
+
+                hash = s->update_hash(s, 0, scan_endstr[0]);
+                hash = s->update_hash(s, hash, scan_endstr[1]);
+                hash = s->update_hash(s, hash, scan_endstr[2]);
+
+                pos = s->head[hash];
+                if (pos < cur_match) {
+                    match_offset = (Pos)(len - (STD_MIN_MATCH+1));
+                    if (pos <= limit_base + match_offset)
+                        goto break_matching;
+                    cur_match = pos;
+                }
+
+                /* Update offset-dependent variables */
+                limit = limit_base+match_offset;
+                mbase_start = window-match_offset;
+                mbase_end = (mbase_start+offset);
+                continue;
+            }
+#endif
+            mbase_end = (mbase_start+offset);
+        }
+#ifndef LONGEST_MATCH_SLOW
+        else if (UNLIKELY(early_exit)) {
+            /* The probability of finding a match later if we here is pretty low, so for
+             * performance it's best to outright stop here for the lower compression levels
+             */
+            break;
+        }
+#endif
+        GOTO_NEXT_CHAIN;
+    }
+    return best_len;
+
+#ifdef LONGEST_MATCH_SLOW
+break_matching:
+
+    if (best_len < s->lookahead)
+        return best_len;
+
+    return s->lookahead;
+#endif
+}
+
+#undef LONGEST_MATCH_SLOW
+#undef LONGEST_MATCH
+#undef COMPARE256
diff --git a/3rdparty/zlib-ng/slide_hash.c b/3rdparty/zlib-ng/slide_hash.c
new file mode 100644
index 000000000000..b9fbbdb69f86
--- /dev/null
+++ b/3rdparty/zlib-ng/slide_hash.c
@@ -0,0 +1,52 @@
+/* slide_hash.c -- slide hash table C implementation
+ *
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "deflate.h"
+
+/* ===========================================================================
+ * Slide the hash table when sliding the window down (could be avoided with 32
+ * bit values at the expense of memory usage). We slide even when level == 0 to
+ * keep the hash table consistent if we switch back to level > 0 later.
+ */
+static inline void slide_hash_c_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+#ifdef NOT_TWEAK_COMPILER
+    table += entries;
+    do {
+        unsigned m;
+        m = *--table;
+        *table = (Pos)(m >= wsize ? m-wsize : 0);
+        /* If entries is not on any hash chain, prev[entries] is garbage but
+         * its value will never be used.
+         */
+    } while (--entries);
+#else
+    {
+    /* As of I make this change, gcc (4.8.*) isn't able to vectorize
+     * this hot loop using saturated-subtraction on x86-64 architecture.
+     * To avoid this defect, we can change the loop such that
+     *    o. the pointer advance forward, and
+     *    o. demote the variable 'm' to be local to the loop, and
+     *       choose type "Pos" (instead of 'unsigned int') for the
+     *       variable to avoid unnecessary zero-extension.
+     */
+        unsigned int i;
+        Pos *q = table;
+        for (i = 0; i < entries; i++) {
+            Pos m = *q;
+            Pos t = (Pos)wsize;
+            *q++ = (Pos)(m >= t ? m-t: 0);
+        }
+    }
+#endif /* NOT_TWEAK_COMPILER */
+}
+
+Z_INTERNAL void slide_hash_c(deflate_state *s) {
+    uint16_t wsize = (uint16_t)s->w_size;
+
+    slide_hash_c_chain(s->head, HASH_SIZE, wsize);
+    slide_hash_c_chain(s->prev, wsize, wsize);
+}
diff --git a/3rdparty/zlib-ng/trees.c b/3rdparty/zlib-ng/trees.c
new file mode 100644
index 000000000000..5bb88389baa3
--- /dev/null
+++ b/3rdparty/zlib-ng/trees.c
@@ -0,0 +1,818 @@
+/* trees.c -- output deflated data using Huffman coding
+ * Copyright (C) 1995-2021 Jean-loup Gailly
+ * detect_data_type() function provided freely by Cosmin Truta, 2006
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/*
+ *  ALGORITHM
+ *
+ *      The "deflation" process uses several Huffman trees. The more
+ *      common source values are represented by shorter bit sequences.
+ *
+ *      Each code tree is stored in a compressed form which is itself
+ * a Huffman encoding of the lengths of all the code strings (in
+ * ascending order by source values).  The actual code strings are
+ * reconstructed from the lengths in the inflate process, as described
+ * in the deflate specification.
+ *
+ *  REFERENCES
+ *
+ *      Deutsch, L.P.,"'Deflate' Compressed Data Format Specification".
+ *      Available in ftp.uu.net:/pub/archiving/zip/doc/deflate-1.1.doc
+ *
+ *      Storer, James A.
+ *          Data Compression:  Methods and Theory, pp. 49-50.
+ *          Computer Science Press, 1988.  ISBN 0-7167-8156-5.
+ *
+ *      Sedgewick, R.
+ *          Algorithms, p290.
+ *          Addison-Wesley, 1983. ISBN 0-201-06672-6.
+ */
+
+#include "zbuild.h"
+#include "deflate.h"
+#include "trees.h"
+#include "trees_emit.h"
+#include "trees_tbl.h"
+
+/* The lengths of the bit length codes are sent in order of decreasing
+ * probability, to avoid transmitting the lengths for unused bit length codes.
+ */
+
+/* ===========================================================================
+ * Local data. These are initialized only once.
+ */
+
+struct static_tree_desc_s {
+    const ct_data *static_tree; /* static tree or NULL */
+    const int     *extra_bits;  /* extra bits for each code or NULL */
+    int            extra_base;  /* base index for extra_bits */
+    int            elems;       /* max number of elements in the tree */
+    unsigned int   max_length;  /* max bit length for the codes */
+};
+
+static const static_tree_desc  static_l_desc =
+{static_ltree, extra_lbits, LITERALS+1, L_CODES, MAX_BITS};
+
+static const static_tree_desc  static_d_desc =
+{static_dtree, extra_dbits, 0,          D_CODES, MAX_BITS};
+
+static const static_tree_desc  static_bl_desc =
+{(const ct_data *)0, extra_blbits, 0,   BL_CODES, MAX_BL_BITS};
+
+/* ===========================================================================
+ * Local (static) routines in this file.
+ */
+
+static void init_block       (deflate_state *s);
+static void pqdownheap       (deflate_state *s, ct_data *tree, int k);
+static void gen_bitlen       (deflate_state *s, tree_desc *desc);
+static void build_tree       (deflate_state *s, tree_desc *desc);
+static void scan_tree        (deflate_state *s, ct_data *tree, int max_code);
+static void send_tree        (deflate_state *s, ct_data *tree, int max_code);
+static int  build_bl_tree    (deflate_state *s);
+static void send_all_trees   (deflate_state *s, int lcodes, int dcodes, int blcodes);
+static void compress_block   (deflate_state *s, const ct_data *ltree, const ct_data *dtree);
+static int  detect_data_type (deflate_state *s);
+static void bi_flush         (deflate_state *s);
+
+/* ===========================================================================
+ * Initialize the tree data structures for a new zlib stream.
+ */
+void Z_INTERNAL zng_tr_init(deflate_state *s) {
+    s->l_desc.dyn_tree = s->dyn_ltree;
+    s->l_desc.stat_desc = &static_l_desc;
+
+    s->d_desc.dyn_tree = s->dyn_dtree;
+    s->d_desc.stat_desc = &static_d_desc;
+
+    s->bl_desc.dyn_tree = s->bl_tree;
+    s->bl_desc.stat_desc = &static_bl_desc;
+
+    s->bi_buf = 0;
+    s->bi_valid = 0;
+#ifdef ZLIB_DEBUG
+    s->compressed_len = 0L;
+    s->bits_sent = 0L;
+#endif
+
+    /* Initialize the first block of the first file: */
+    init_block(s);
+}
+
+/* ===========================================================================
+ * Initialize a new block.
+ */
+static void init_block(deflate_state *s) {
+    int n; /* iterates over tree elements */
+
+    /* Initialize the trees. */
+    for (n = 0; n < L_CODES;  n++)
+        s->dyn_ltree[n].Freq = 0;
+    for (n = 0; n < D_CODES;  n++)
+        s->dyn_dtree[n].Freq = 0;
+    for (n = 0; n < BL_CODES; n++)
+        s->bl_tree[n].Freq = 0;
+
+    s->dyn_ltree[END_BLOCK].Freq = 1;
+    s->opt_len = s->static_len = 0L;
+    s->sym_next = s->matches = 0;
+}
+
+#define SMALLEST 1
+/* Index within the heap array of least frequent node in the Huffman tree */
+
+
+/* ===========================================================================
+ * Remove the smallest element from the heap and recreate the heap with
+ * one less element. Updates heap and heap_len.
+ */
+#define pqremove(s, tree, top) \
+{\
+    top = s->heap[SMALLEST]; \
+    s->heap[SMALLEST] = s->heap[s->heap_len--]; \
+    pqdownheap(s, tree, SMALLEST); \
+}
+
+/* ===========================================================================
+ * Compares to subtrees, using the tree depth as tie breaker when
+ * the subtrees have equal frequency. This minimizes the worst case length.
+ */
+#define smaller(tree, n, m, depth) \
+    (tree[n].Freq < tree[m].Freq || \
+    (tree[n].Freq == tree[m].Freq && depth[n] <= depth[m]))
+
+/* ===========================================================================
+ * Restore the heap property by moving down the tree starting at node k,
+ * exchanging a node with the smallest of its two sons if necessary, stopping
+ * when the heap property is re-established (each father smaller than its
+ * two sons).
+ */
+static void pqdownheap(deflate_state *s, ct_data *tree, int k) {
+    /* tree: the tree to restore */
+    /* k: node to move down */
+    int v = s->heap[k];
+    int j = k << 1;  /* left son of k */
+    while (j <= s->heap_len) {
+        /* Set j to the smallest of the two sons: */
+        if (j < s->heap_len && smaller(tree, s->heap[j+1], s->heap[j], s->depth)) {
+            j++;
+        }
+        /* Exit if v is smaller than both sons */
+        if (smaller(tree, v, s->heap[j], s->depth))
+            break;
+
+        /* Exchange v with the smallest son */
+        s->heap[k] = s->heap[j];
+        k = j;
+
+        /* And continue down the tree, setting j to the left son of k */
+        j <<= 1;
+    }
+    s->heap[k] = v;
+}
+
+/* ===========================================================================
+ * Compute the optimal bit lengths for a tree and update the total bit length
+ * for the current block.
+ * IN assertion: the fields freq and dad are set, heap[heap_max] and
+ *    above are the tree nodes sorted by increasing frequency.
+ * OUT assertions: the field len is set to the optimal bit length, the
+ *     array bl_count contains the frequencies for each bit length.
+ *     The length opt_len is updated; static_len is also updated if stree is
+ *     not null.
+ */
+static void gen_bitlen(deflate_state *s, tree_desc *desc) {
+    /* desc: the tree descriptor */
+    ct_data *tree           = desc->dyn_tree;
+    int max_code            = desc->max_code;
+    const ct_data *stree    = desc->stat_desc->static_tree;
+    const int *extra        = desc->stat_desc->extra_bits;
+    int base                = desc->stat_desc->extra_base;
+    unsigned int max_length = desc->stat_desc->max_length;
+    int h;              /* heap index */
+    int n, m;           /* iterate over the tree elements */
+    unsigned int bits;  /* bit length */
+    int xbits;          /* extra bits */
+    uint16_t f;         /* frequency */
+    int overflow = 0;   /* number of elements with bit length too large */
+
+    for (bits = 0; bits <= MAX_BITS; bits++)
+        s->bl_count[bits] = 0;
+
+    /* In a first pass, compute the optimal bit lengths (which may
+     * overflow in the case of the bit length tree).
+     */
+    tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */
+
+    for (h = s->heap_max + 1; h < HEAP_SIZE; h++) {
+        n = s->heap[h];
+        bits = tree[tree[n].Dad].Len + 1u;
+        if (bits > max_length){
+            bits = max_length;
+            overflow++;
+        }
+        tree[n].Len = (uint16_t)bits;
+        /* We overwrite tree[n].Dad which is no longer needed */
+
+        if (n > max_code) /* not a leaf node */
+            continue;
+
+        s->bl_count[bits]++;
+        xbits = 0;
+        if (n >= base)
+            xbits = extra[n-base];
+        f = tree[n].Freq;
+        s->opt_len += (unsigned long)f * (unsigned int)(bits + xbits);
+        if (stree)
+            s->static_len += (unsigned long)f * (unsigned int)(stree[n].Len + xbits);
+    }
+    if (overflow == 0)
+        return;
+
+    Tracev((stderr, "\nbit length overflow\n"));
+    /* This happens for example on obj2 and pic of the Calgary corpus */
+
+    /* Find the first bit length which could increase: */
+    do {
+        bits = max_length - 1;
+        while (s->bl_count[bits] == 0)
+            bits--;
+        s->bl_count[bits]--;       /* move one leaf down the tree */
+        s->bl_count[bits+1] += 2u; /* move one overflow item as its brother */
+        s->bl_count[max_length]--;
+        /* The brother of the overflow item also moves one step up,
+         * but this does not affect bl_count[max_length]
+         */
+        overflow -= 2;
+    } while (overflow > 0);
+
+    /* Now recompute all bit lengths, scanning in increasing frequency.
+     * h is still equal to HEAP_SIZE. (It is simpler to reconstruct all
+     * lengths instead of fixing only the wrong ones. This idea is taken
+     * from 'ar' written by Haruhiko Okumura.)
+     */
+    for (bits = max_length; bits != 0; bits--) {
+        n = s->bl_count[bits];
+        while (n != 0) {
+            m = s->heap[--h];
+            if (m > max_code)
+                continue;
+            if (tree[m].Len != bits) {
+                Tracev((stderr, "code %d bits %d->%u\n", m, tree[m].Len, bits));
+                s->opt_len += (unsigned long)(bits * tree[m].Freq);
+                s->opt_len -= (unsigned long)(tree[m].Len * tree[m].Freq);
+                tree[m].Len = (uint16_t)bits;
+            }
+            n--;
+        }
+    }
+}
+
+/* ===========================================================================
+ * Generate the codes for a given tree and bit counts (which need not be
+ * optimal).
+ * IN assertion: the array bl_count contains the bit length statistics for
+ * the given tree and the field len is set for all tree elements.
+ * OUT assertion: the field code is set for all tree elements of non
+ *     zero code length.
+ */
+Z_INTERNAL void gen_codes(ct_data *tree, int max_code, uint16_t *bl_count) {
+    /* tree: the tree to decorate */
+    /* max_code: largest code with non zero frequency */
+    /* bl_count: number of codes at each bit length */
+    uint16_t next_code[MAX_BITS+1];  /* next code value for each bit length */
+    unsigned int code = 0;           /* running code value */
+    int bits;                        /* bit index */
+    int n;                           /* code index */
+
+    /* The distribution counts are first used to generate the code values
+     * without bit reversal.
+     */
+    for (bits = 1; bits <= MAX_BITS; bits++) {
+        code = (code + bl_count[bits-1]) << 1;
+        next_code[bits] = (uint16_t)code;
+    }
+    /* Check that the bit counts in bl_count are consistent. The last code
+     * must be all ones.
+     */
+    Assert(code + bl_count[MAX_BITS]-1 == (1 << MAX_BITS)-1, "inconsistent bit counts");
+    Tracev((stderr, "\ngen_codes: max_code %d ", max_code));
+
+    for (n = 0;  n <= max_code; n++) {
+        int len = tree[n].Len;
+        if (len == 0)
+            continue;
+        /* Now reverse the bits */
+        tree[n].Code = PREFIX(bi_reverse)(next_code[len]++, len);
+
+        Tracecv(tree != static_ltree, (stderr, "\nn %3d %c l %2d c %4x (%x) ",
+             n, (isgraph(n & 0xff) ? n : ' '), len, tree[n].Code, next_code[len]-1));
+    }
+}
+
+/* ===========================================================================
+ * Construct one Huffman tree and assigns the code bit strings and lengths.
+ * Update the total bit length for the current block.
+ * IN assertion: the field freq is set for all tree elements.
+ * OUT assertions: the fields len and code are set to the optimal bit length
+ *     and corresponding code. The length opt_len is updated; static_len is
+ *     also updated if stree is not null. The field max_code is set.
+ */
+static void build_tree(deflate_state *s, tree_desc *desc) {
+    /* desc: the tree descriptor */
+    ct_data *tree         = desc->dyn_tree;
+    const ct_data *stree  = desc->stat_desc->static_tree;
+    int elems             = desc->stat_desc->elems;
+    int n, m;          /* iterate over heap elements */
+    int max_code = -1; /* largest code with non zero frequency */
+    int node;          /* new node being created */
+
+    /* Construct the initial heap, with least frequent element in
+     * heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1].
+     * heap[0] is not used.
+     */
+    s->heap_len = 0;
+    s->heap_max = HEAP_SIZE;
+
+    for (n = 0; n < elems; n++) {
+        if (tree[n].Freq != 0) {
+            s->heap[++(s->heap_len)] = max_code = n;
+            s->depth[n] = 0;
+        } else {
+            tree[n].Len = 0;
+        }
+    }
+
+    /* The pkzip format requires that at least one distance code exists,
+     * and that at least one bit should be sent even if there is only one
+     * possible code. So to avoid special checks later on we force at least
+     * two codes of non zero frequency.
+     */
+    while (s->heap_len < 2) {
+        node = s->heap[++(s->heap_len)] = (max_code < 2 ? ++max_code : 0);
+        tree[node].Freq = 1;
+        s->depth[node] = 0;
+        s->opt_len--;
+        if (stree)
+            s->static_len -= stree[node].Len;
+        /* node is 0 or 1 so it does not have extra bits */
+    }
+    desc->max_code = max_code;
+
+    /* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree,
+     * establish sub-heaps of increasing lengths:
+     */
+    for (n = s->heap_len/2; n >= 1; n--)
+        pqdownheap(s, tree, n);
+
+    /* Construct the Huffman tree by repeatedly combining the least two
+     * frequent nodes.
+     */
+    node = elems;              /* next internal node of the tree */
+    do {
+        pqremove(s, tree, n);  /* n = node of least frequency */
+        m = s->heap[SMALLEST]; /* m = node of next least frequency */
+
+        s->heap[--(s->heap_max)] = n; /* keep the nodes sorted by frequency */
+        s->heap[--(s->heap_max)] = m;
+
+        /* Create a new node father of n and m */
+        tree[node].Freq = tree[n].Freq + tree[m].Freq;
+        s->depth[node] = (unsigned char)((s->depth[n] >= s->depth[m] ?
+                                          s->depth[n] : s->depth[m]) + 1);
+        tree[n].Dad = tree[m].Dad = (uint16_t)node;
+#ifdef DUMP_BL_TREE
+        if (tree == s->bl_tree) {
+            fprintf(stderr, "\nnode %d(%d), sons %d(%d) %d(%d)",
+                    node, tree[node].Freq, n, tree[n].Freq, m, tree[m].Freq);
+        }
+#endif
+        /* and insert the new node in the heap */
+        s->heap[SMALLEST] = node++;
+        pqdownheap(s, tree, SMALLEST);
+    } while (s->heap_len >= 2);
+
+    s->heap[--(s->heap_max)] = s->heap[SMALLEST];
+
+    /* At this point, the fields freq and dad are set. We can now
+     * generate the bit lengths.
+     */
+    gen_bitlen(s, (tree_desc *)desc);
+
+    /* The field len is now set, we can generate the bit codes */
+    gen_codes((ct_data *)tree, max_code, s->bl_count);
+}
+
+/* ===========================================================================
+ * Scan a literal or distance tree to determine the frequencies of the codes
+ * in the bit length tree.
+ */
+static void scan_tree(deflate_state *s, ct_data *tree, int max_code) {
+    /* tree: the tree to be scanned */
+    /* max_code: and its largest code of non zero frequency */
+    int n;                     /* iterates over all tree elements */
+    int prevlen = -1;          /* last emitted length */
+    int curlen;                /* length of current code */
+    int nextlen = tree[0].Len; /* length of next code */
+    uint16_t count = 0;        /* repeat count of the current code */
+    uint16_t max_count = 7;    /* max repeat count */
+    uint16_t min_count = 4;    /* min repeat count */
+
+    if (nextlen == 0)
+        max_count = 138, min_count = 3;
+
+    tree[max_code+1].Len = (uint16_t)0xffff; /* guard */
+
+    for (n = 0; n <= max_code; n++) {
+        curlen = nextlen;
+        nextlen = tree[n+1].Len;
+        if (++count < max_count && curlen == nextlen) {
+            continue;
+        } else if (count < min_count) {
+            s->bl_tree[curlen].Freq += count;
+        } else if (curlen != 0) {
+            if (curlen != prevlen)
+                s->bl_tree[curlen].Freq++;
+            s->bl_tree[REP_3_6].Freq++;
+        } else if (count <= 10) {
+            s->bl_tree[REPZ_3_10].Freq++;
+        } else {
+            s->bl_tree[REPZ_11_138].Freq++;
+        }
+        count = 0;
+        prevlen = curlen;
+        if (nextlen == 0) {
+            max_count = 138, min_count = 3;
+        } else if (curlen == nextlen) {
+            max_count = 6, min_count = 3;
+        } else {
+            max_count = 7, min_count = 4;
+        }
+    }
+}
+
+/* ===========================================================================
+ * Send a literal or distance tree in compressed form, using the codes in
+ * bl_tree.
+ */
+static void send_tree(deflate_state *s, ct_data *tree, int max_code) {
+    /* tree: the tree to be scanned */
+    /* max_code and its largest code of non zero frequency */
+    int n;                     /* iterates over all tree elements */
+    int prevlen = -1;          /* last emitted length */
+    int curlen;                /* length of current code */
+    int nextlen = tree[0].Len; /* length of next code */
+    int count = 0;             /* repeat count of the current code */
+    int max_count = 7;         /* max repeat count */
+    int min_count = 4;         /* min repeat count */
+
+    /* tree[max_code+1].Len = -1; */  /* guard already set */
+    if (nextlen == 0)
+        max_count = 138, min_count = 3;
+
+    // Temp local variables
+    uint32_t bi_valid = s->bi_valid;
+    uint64_t bi_buf = s->bi_buf;
+
+    for (n = 0; n <= max_code; n++) {
+        curlen = nextlen;
+        nextlen = tree[n+1].Len;
+        if (++count < max_count && curlen == nextlen) {
+            continue;
+        } else if (count < min_count) {
+            do {
+                send_code(s, curlen, s->bl_tree, bi_buf, bi_valid);
+            } while (--count != 0);
+
+        } else if (curlen != 0) {
+            if (curlen != prevlen) {
+                send_code(s, curlen, s->bl_tree, bi_buf, bi_valid);
+                count--;
+            }
+            Assert(count >= 3 && count <= 6, " 3_6?");
+            send_code(s, REP_3_6, s->bl_tree, bi_buf, bi_valid);
+            send_bits(s, count-3, 2, bi_buf, bi_valid);
+
+        } else if (count <= 10) {
+            send_code(s, REPZ_3_10, s->bl_tree, bi_buf, bi_valid);
+            send_bits(s, count-3, 3, bi_buf, bi_valid);
+
+        } else {
+            send_code(s, REPZ_11_138, s->bl_tree, bi_buf, bi_valid);
+            send_bits(s, count-11, 7, bi_buf, bi_valid);
+        }
+        count = 0;
+        prevlen = curlen;
+        if (nextlen == 0) {
+            max_count = 138, min_count = 3;
+        } else if (curlen == nextlen) {
+            max_count = 6, min_count = 3;
+        } else {
+            max_count = 7, min_count = 4;
+        }
+    }
+
+    // Store back temp variables
+    s->bi_buf = bi_buf;
+    s->bi_valid = bi_valid;
+}
+
+/* ===========================================================================
+ * Construct the Huffman tree for the bit lengths and return the index in
+ * bl_order of the last bit length code to send.
+ */
+static int build_bl_tree(deflate_state *s) {
+    int max_blindex;  /* index of last bit length code of non zero freq */
+
+    /* Determine the bit length frequencies for literal and distance trees */
+    scan_tree(s, (ct_data *)s->dyn_ltree, s->l_desc.max_code);
+    scan_tree(s, (ct_data *)s->dyn_dtree, s->d_desc.max_code);
+
+    /* Build the bit length tree: */
+    build_tree(s, (tree_desc *)(&(s->bl_desc)));
+    /* opt_len now includes the length of the tree representations, except
+     * the lengths of the bit lengths codes and the 5+5+4 bits for the counts.
+     */
+
+    /* Determine the number of bit length codes to send. The pkzip format
+     * requires that at least 4 bit length codes be sent. (appnote.txt says
+     * 3 but the actual value used is 4.)
+     */
+    for (max_blindex = BL_CODES-1; max_blindex >= 3; max_blindex--) {
+        if (s->bl_tree[bl_order[max_blindex]].Len != 0)
+            break;
+    }
+    /* Update opt_len to include the bit length tree and counts */
+    s->opt_len += 3*((unsigned long)max_blindex+1) + 5+5+4;
+    Tracev((stderr, "\ndyn trees: dyn %lu, stat %lu", s->opt_len, s->static_len));
+
+    return max_blindex;
+}
+
+/* ===========================================================================
+ * Send the header for a block using dynamic Huffman trees: the counts, the
+ * lengths of the bit length codes, the literal tree and the distance tree.
+ * IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4.
+ */
+static void send_all_trees(deflate_state *s, int lcodes, int dcodes, int blcodes) {
+    int rank;                    /* index in bl_order */
+
+    Assert(lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes");
+    Assert(lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES, "too many codes");
+
+    // Temp local variables
+    uint32_t bi_valid = s->bi_valid;
+    uint64_t bi_buf = s->bi_buf;
+
+    Tracev((stderr, "\nbl counts: "));
+    send_bits(s, lcodes-257, 5, bi_buf, bi_valid); /* not +255 as stated in appnote.txt */
+    send_bits(s, dcodes-1,   5, bi_buf, bi_valid);
+    send_bits(s, blcodes-4,  4, bi_buf, bi_valid); /* not -3 as stated in appnote.txt */
+    for (rank = 0; rank < blcodes; rank++) {
+        Tracev((stderr, "\nbl code %2u ", bl_order[rank]));
+        send_bits(s, s->bl_tree[bl_order[rank]].Len, 3, bi_buf, bi_valid);
+    }
+    Tracev((stderr, "\nbl tree: sent %lu", s->bits_sent));
+
+    // Store back temp variables
+    s->bi_buf = bi_buf;
+    s->bi_valid = bi_valid;
+
+    send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */
+    Tracev((stderr, "\nlit tree: sent %lu", s->bits_sent));
+
+    send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */
+    Tracev((stderr, "\ndist tree: sent %lu", s->bits_sent));
+}
+
+/* ===========================================================================
+ * Send a stored block
+ */
+void Z_INTERNAL zng_tr_stored_block(deflate_state *s, char *buf, uint32_t stored_len, int last) {
+    /* buf: input block */
+    /* stored_len: length of input block */
+    /* last: one if this is the last block for a file */
+    zng_tr_emit_tree(s, STORED_BLOCK, last); /* send block type */
+    zng_tr_emit_align(s);                    /* align on byte boundary */
+    cmpr_bits_align(s);
+    put_short(s, (uint16_t)stored_len);
+    put_short(s, (uint16_t)~stored_len);
+    cmpr_bits_add(s, 32);
+    sent_bits_add(s, 32);
+    if (stored_len) {
+        memcpy(s->pending_buf + s->pending, (unsigned char *)buf, stored_len);
+        s->pending += stored_len;
+        cmpr_bits_add(s, stored_len << 3);
+        sent_bits_add(s, stored_len << 3);
+    }
+}
+
+/* ===========================================================================
+ * Flush the bits in the bit buffer to pending output (leaves at most 7 bits)
+ */
+void Z_INTERNAL zng_tr_flush_bits(deflate_state *s) {
+    bi_flush(s);
+}
+
+/* ===========================================================================
+ * Send one empty static block to give enough lookahead for inflate.
+ * This takes 10 bits, of which 7 may remain in the bit buffer.
+ */
+void Z_INTERNAL zng_tr_align(deflate_state *s) {
+    zng_tr_emit_tree(s, STATIC_TREES, 0);
+    zng_tr_emit_end_block(s, static_ltree, 0);
+    bi_flush(s);
+}
+
+/* ===========================================================================
+ * Determine the best encoding for the current block: dynamic trees, static
+ * trees or store, and write out the encoded block.
+ */
+void Z_INTERNAL zng_tr_flush_block(deflate_state *s, char *buf, uint32_t stored_len, int last) {
+    /* buf: input block, or NULL if too old */
+    /* stored_len: length of input block */
+    /* last: one if this is the last block for a file */
+    unsigned long opt_lenb, static_lenb; /* opt_len and static_len in bytes */
+    int max_blindex = 0;  /* index of last bit length code of non zero freq */
+
+    /* Build the Huffman trees unless a stored block is forced */
+    if (UNLIKELY(s->sym_next == 0)) {
+        /* Emit an empty static tree block with no codes */
+        opt_lenb = static_lenb = 0;
+        s->static_len = 7;
+    } else if (s->level > 0) {
+        /* Check if the file is binary or text */
+        if (s->strm->data_type == Z_UNKNOWN)
+            s->strm->data_type = detect_data_type(s);
+
+        /* Construct the literal and distance trees */
+        build_tree(s, (tree_desc *)(&(s->l_desc)));
+        Tracev((stderr, "\nlit data: dyn %lu, stat %lu", s->opt_len, s->static_len));
+
+        build_tree(s, (tree_desc *)(&(s->d_desc)));
+        Tracev((stderr, "\ndist data: dyn %lu, stat %lu", s->opt_len, s->static_len));
+        /* At this point, opt_len and static_len are the total bit lengths of
+         * the compressed block data, excluding the tree representations.
+         */
+
+        /* Build the bit length tree for the above two trees, and get the index
+         * in bl_order of the last bit length code to send.
+         */
+        max_blindex = build_bl_tree(s);
+
+        /* Determine the best encoding. Compute the block lengths in bytes. */
+        opt_lenb = (s->opt_len+3+7) >> 3;
+        static_lenb = (s->static_len+3+7) >> 3;
+
+        Tracev((stderr, "\nopt %lu(%lu) stat %lu(%lu) stored %u lit %u ",
+                opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len,
+                s->sym_next / 3));
+
+        if (static_lenb <= opt_lenb || s->strategy == Z_FIXED)
+            opt_lenb = static_lenb;
+
+    } else {
+        Assert(buf != NULL, "lost buf");
+        opt_lenb = static_lenb = stored_len + 5; /* force a stored block */
+    }
+
+    if (stored_len+4 <= opt_lenb && buf != NULL) {
+        /* 4: two words for the lengths
+         * The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE.
+         * Otherwise we can't have processed more than WSIZE input bytes since
+         * the last block flush, because compression would have been
+         * successful. If LIT_BUFSIZE <= WSIZE, it is never too late to
+         * transform a block into a stored block.
+         */
+        zng_tr_stored_block(s, buf, stored_len, last);
+
+    } else if (static_lenb == opt_lenb) {
+        zng_tr_emit_tree(s, STATIC_TREES, last);
+        compress_block(s, (const ct_data *)static_ltree, (const ct_data *)static_dtree);
+        cmpr_bits_add(s, s->static_len);
+    } else {
+        zng_tr_emit_tree(s, DYN_TREES, last);
+        send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1, max_blindex+1);
+        compress_block(s, (const ct_data *)s->dyn_ltree, (const ct_data *)s->dyn_dtree);
+        cmpr_bits_add(s, s->opt_len);
+    }
+    Assert(s->compressed_len == s->bits_sent, "bad compressed size");
+    /* The above check is made mod 2^32, for files larger than 512 MB
+     * and unsigned long implemented on 32 bits.
+     */
+    init_block(s);
+
+    if (last) {
+        zng_tr_emit_align(s);
+    }
+    Tracev((stderr, "\ncomprlen %lu(%lu) ", s->compressed_len>>3, s->compressed_len-7*last));
+}
+
+/* ===========================================================================
+ * Send the block data compressed using the given Huffman trees
+ */
+static void compress_block(deflate_state *s, const ct_data *ltree, const ct_data *dtree) {
+    /* ltree: literal tree */
+    /* dtree: distance tree */
+    unsigned dist;      /* distance of matched string */
+    int lc;             /* match length or unmatched char (if dist == 0) */
+    unsigned sx = 0;    /* running index in sym_buf */
+
+    if (s->sym_next != 0) {
+        do {
+            dist = s->sym_buf[sx++] & 0xff;
+            dist += (unsigned)(s->sym_buf[sx++] & 0xff) << 8;
+            lc = s->sym_buf[sx++];
+            if (dist == 0) {
+                zng_emit_lit(s, ltree, lc);
+            } else {
+                zng_emit_dist(s, ltree, dtree, lc, dist);
+            } /* literal or match pair ? */
+
+            /* Check that the overlay between pending_buf and sym_buf is ok: */
+            Assert(s->pending < s->lit_bufsize + sx, "pending_buf overflow");
+        } while (sx < s->sym_next);
+    }
+
+    zng_emit_end_block(s, ltree, 0);
+}
+
+/* ===========================================================================
+ * Check if the data type is TEXT or BINARY, using the following algorithm:
+ * - TEXT if the two conditions below are satisfied:
+ *    a) There are no non-portable control characters belonging to the
+ *       "black list" (0..6, 14..25, 28..31).
+ *    b) There is at least one printable character belonging to the
+ *       "white list" (9 {TAB}, 10 {LF}, 13 {CR}, 32..255).
+ * - BINARY otherwise.
+ * - The following partially-portable control characters form a
+ *   "gray list" that is ignored in this detection algorithm:
+ *   (7 {BEL}, 8 {BS}, 11 {VT}, 12 {FF}, 26 {SUB}, 27 {ESC}).
+ * IN assertion: the fields Freq of dyn_ltree are set.
+ */
+static int detect_data_type(deflate_state *s) {
+    /* black_mask is the bit mask of black-listed bytes
+     * set bits 0..6, 14..25, and 28..31
+     * 0xf3ffc07f = binary 11110011111111111100000001111111
+     */
+    unsigned long black_mask = 0xf3ffc07fUL;
+    int n;
+
+    /* Check for non-textual ("black-listed") bytes. */
+    for (n = 0; n <= 31; n++, black_mask >>= 1)
+        if ((black_mask & 1) && (s->dyn_ltree[n].Freq != 0))
+            return Z_BINARY;
+
+    /* Check for textual ("white-listed") bytes. */
+    if (s->dyn_ltree[9].Freq != 0 || s->dyn_ltree[10].Freq != 0 || s->dyn_ltree[13].Freq != 0)
+        return Z_TEXT;
+    for (n = 32; n < LITERALS; n++)
+        if (s->dyn_ltree[n].Freq != 0)
+            return Z_TEXT;
+
+    /* There are no "black-listed" or "white-listed" bytes:
+     * this stream either is empty or has tolerated ("gray-listed") bytes only.
+     */
+    return Z_BINARY;
+}
+
+/* ===========================================================================
+ * Flush the bit buffer, keeping at most 7 bits in it.
+ */
+static void bi_flush(deflate_state *s) {
+    if (s->bi_valid == 64) {
+        put_uint64(s, s->bi_buf);
+        s->bi_buf = 0;
+        s->bi_valid = 0;
+    } else {
+        if (s->bi_valid >= 32) {
+            put_uint32(s, (uint32_t)s->bi_buf);
+            s->bi_buf >>= 32;
+            s->bi_valid -= 32;
+        }
+        if (s->bi_valid >= 16) {
+            put_short(s, (uint16_t)s->bi_buf);
+            s->bi_buf >>= 16;
+            s->bi_valid -= 16;
+        }
+        if (s->bi_valid >= 8) {
+            put_byte(s, s->bi_buf);
+            s->bi_buf >>= 8;
+            s->bi_valid -= 8;
+        }
+    }
+}
+
+/* ===========================================================================
+ * Reverse the first len bits of a code using bit manipulation
+ */
+Z_INTERNAL uint16_t PREFIX(bi_reverse)(unsigned code, int len) {
+    /* code: the value to invert */
+    /* len: its bit length */
+    Assert(len >= 1 && len <= 15, "code length must be 1-15");
+#define bitrev8(b) \
+    (uint8_t)((((uint8_t)(b) * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32)
+    return (bitrev8(code >> 8) | (uint16_t)bitrev8(code) << 8) >> (16 - len);
+}
diff --git a/3rdparty/zlib-ng/trees.h b/3rdparty/zlib-ng/trees.h
new file mode 100644
index 000000000000..e57f926489f4
--- /dev/null
+++ b/3rdparty/zlib-ng/trees.h
@@ -0,0 +1,40 @@
+#ifndef TREES_H_
+#define TREES_H_
+
+/* Constants */
+
+#define DIST_CODE_LEN  512
+/* see definition of array dist_code in trees.c */
+
+#define MAX_BL_BITS 7
+/* Bit length codes must not exceed MAX_BL_BITS bits */
+
+#define REP_3_6      16
+/* repeat previous bit length 3-6 times (2 bits of repeat count) */
+
+#define REPZ_3_10    17
+/* repeat a zero length 3-10 times  (3 bits of repeat count) */
+
+#define REPZ_11_138  18
+/* repeat a zero length 11-138 times  (7 bits of repeat count) */
+
+static const int extra_lbits[LENGTH_CODES] /* extra bits for each length code */
+    = {0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0};
+
+static const int extra_dbits[D_CODES] /* extra bits for each distance code */
+    = {0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
+
+static const int extra_blbits[BL_CODES] /* extra bits for each bit length code */
+    = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,7};
+
+static const unsigned char bl_order[BL_CODES]
+    = {16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15};
+ /* The lengths of the bit length codes are sent in order of decreasing
+  * probability, to avoid transmitting the lengths for unused bit length codes.
+  */
+
+
+/* Function definitions */
+void gen_codes        (ct_data *tree, int max_code, uint16_t *bl_count);
+
+#endif
diff --git a/3rdparty/zlib-ng/trees_emit.h b/3rdparty/zlib-ng/trees_emit.h
new file mode 100644
index 000000000000..922daae509f5
--- /dev/null
+++ b/3rdparty/zlib-ng/trees_emit.h
@@ -0,0 +1,227 @@
+#ifndef TREES_EMIT_H_
+#define TREES_EMIT_H_
+
+#include "zbuild.h"
+#include "trees.h"
+
+#ifdef ZLIB_DEBUG
+#  include <ctype.h>
+#  include <inttypes.h>
+#endif
+
+
+/* trees.h */
+extern Z_INTERNAL const ct_data static_ltree[L_CODES+2];
+extern Z_INTERNAL const ct_data static_dtree[D_CODES];
+
+extern const unsigned char Z_INTERNAL zng_dist_code[DIST_CODE_LEN];
+extern const unsigned char Z_INTERNAL zng_length_code[STD_MAX_MATCH-STD_MIN_MATCH+1];
+
+extern Z_INTERNAL const int base_length[LENGTH_CODES];
+extern Z_INTERNAL const int base_dist[D_CODES];
+
+/* Bit buffer and deflate code stderr tracing */
+#ifdef ZLIB_DEBUG
+#  define send_bits_trace(s, value, length) { \
+        Tracevv((stderr, " l %2d v %4llx ", (int)(length), (long long)(value))); \
+        Assert(length > 0 && length <= BIT_BUF_SIZE, "invalid length"); \
+    }
+#  define send_code_trace(s, c) \
+    if (z_verbose > 2) { \
+        fprintf(stderr, "\ncd %3d ", (c)); \
+    }
+#else
+#  define send_bits_trace(s, value, length)
+#  define send_code_trace(s, c)
+#endif
+
+/* If not enough room in bi_buf, use (valid) bits from bi_buf and
+ * (64 - bi_valid) bits from value, leaving (width - (64-bi_valid))
+ * unused bits in value.
+ */
+#define send_bits(s, t_val, t_len, bi_buf, bi_valid) {\
+    uint64_t val = (uint64_t)t_val;\
+    uint32_t len = (uint32_t)t_len;\
+    uint32_t total_bits = bi_valid + len;\
+    send_bits_trace(s, val, len);\
+    sent_bits_add(s, len);\
+    if (total_bits < BIT_BUF_SIZE) {\
+        bi_buf |= val << bi_valid;\
+        bi_valid = total_bits;\
+    } else if (bi_valid == BIT_BUF_SIZE) {\
+        put_uint64(s, bi_buf);\
+        bi_buf = val;\
+        bi_valid = len;\
+    } else {\
+        bi_buf |= val << bi_valid;\
+        put_uint64(s, bi_buf);\
+        bi_buf = val >> (BIT_BUF_SIZE - bi_valid);\
+        bi_valid = total_bits - BIT_BUF_SIZE;\
+    }\
+}
+
+/* Send a code of the given tree. c and tree must not have side effects */
+#ifdef ZLIB_DEBUG
+#  define send_code(s, c, tree, bi_buf, bi_valid) { \
+    send_code_trace(s, c); \
+    send_bits(s, tree[c].Code, tree[c].Len, bi_buf, bi_valid); \
+}
+#else
+#  define send_code(s, c, tree, bi_buf, bi_valid) \
+    send_bits(s, tree[c].Code, tree[c].Len, bi_buf, bi_valid)
+#endif
+
+/* ===========================================================================
+ * Flush the bit buffer and align the output on a byte boundary
+ */
+static void bi_windup(deflate_state *s) {
+    if (s->bi_valid > 56) {
+        put_uint64(s, s->bi_buf);
+    } else {
+        if (s->bi_valid > 24) {
+            put_uint32(s, (uint32_t)s->bi_buf);
+            s->bi_buf >>= 32;
+            s->bi_valid -= 32;
+        }
+        if (s->bi_valid > 8) {
+            put_short(s, (uint16_t)s->bi_buf);
+            s->bi_buf >>= 16;
+            s->bi_valid -= 16;
+        }
+        if (s->bi_valid > 0) {
+            put_byte(s, s->bi_buf);
+        }
+    }
+    s->bi_buf = 0;
+    s->bi_valid = 0;
+}
+
+/* ===========================================================================
+ * Emit literal code
+ */
+static inline uint32_t zng_emit_lit(deflate_state *s, const ct_data *ltree, unsigned c) {
+    uint32_t bi_valid = s->bi_valid;
+    uint64_t bi_buf = s->bi_buf;
+
+    send_code(s, c, ltree, bi_buf, bi_valid);
+
+    s->bi_valid = bi_valid;
+    s->bi_buf = bi_buf;
+
+    Tracecv(isgraph(c & 0xff), (stderr, " '%c' ", c));
+
+    return ltree[c].Len;
+}
+
+/* ===========================================================================
+ * Emit match distance/length code
+ */
+static inline uint32_t zng_emit_dist(deflate_state *s, const ct_data *ltree, const ct_data *dtree,
+    uint32_t lc, uint32_t dist) {
+    uint32_t c, extra;
+    uint8_t code;
+    uint64_t match_bits;
+    uint32_t match_bits_len;
+    uint32_t bi_valid = s->bi_valid;
+    uint64_t bi_buf = s->bi_buf;
+
+    /* Send the length code, len is the match length - STD_MIN_MATCH */
+    code = zng_length_code[lc];
+    c = code+LITERALS+1;
+    Assert(c < L_CODES, "bad l_code");
+    send_code_trace(s, c);
+
+    match_bits = ltree[c].Code;
+    match_bits_len = ltree[c].Len;
+    extra = extra_lbits[code];
+    if (extra != 0) {
+        lc -= base_length[code];
+        match_bits |= ((uint64_t)lc << match_bits_len);
+        match_bits_len += extra;
+    }
+
+    dist--; /* dist is now the match distance - 1 */
+    code = d_code(dist);
+    Assert(code < D_CODES, "bad d_code");
+    send_code_trace(s, code);
+
+    /* Send the distance code */
+    match_bits |= ((uint64_t)dtree[code].Code << match_bits_len);
+    match_bits_len += dtree[code].Len;
+    extra = extra_dbits[code];
+    if (extra != 0) {
+        dist -= base_dist[code];
+        match_bits |= ((uint64_t)dist << match_bits_len);
+        match_bits_len += extra;
+    }
+
+    send_bits(s, match_bits, match_bits_len, bi_buf, bi_valid);
+
+    s->bi_valid = bi_valid;
+    s->bi_buf = bi_buf;
+
+    return match_bits_len;
+}
+
+/* ===========================================================================
+ * Emit end block
+ */
+static inline void zng_emit_end_block(deflate_state *s, const ct_data *ltree, const int last) {
+    uint32_t bi_valid = s->bi_valid;
+    uint64_t bi_buf = s->bi_buf;
+    send_code(s, END_BLOCK, ltree, bi_buf, bi_valid);
+    s->bi_valid = bi_valid;
+    s->bi_buf = bi_buf;
+    Tracev((stderr, "\n+++ Emit End Block: Last: %u Pending: %u Total Out: %" PRIu64 "\n",
+        last, s->pending, (uint64_t)s->strm->total_out));
+    Z_UNUSED(last);
+}
+
+/* ===========================================================================
+ * Emit literal and count bits
+ */
+static inline void zng_tr_emit_lit(deflate_state *s, const ct_data *ltree, unsigned c) {
+    cmpr_bits_add(s, zng_emit_lit(s, ltree, c));
+}
+
+/* ===========================================================================
+ * Emit match and count bits
+ */
+static inline void zng_tr_emit_dist(deflate_state *s, const ct_data *ltree, const ct_data *dtree,
+    uint32_t lc, uint32_t dist) {
+    cmpr_bits_add(s, zng_emit_dist(s, ltree, dtree, lc, dist));
+}
+
+/* ===========================================================================
+ * Emit start of block
+ */
+static inline void zng_tr_emit_tree(deflate_state *s, int type, const int last) {
+    uint32_t bi_valid = s->bi_valid;
+    uint64_t bi_buf = s->bi_buf;
+    uint32_t header_bits = (type << 1) + last;
+    send_bits(s, header_bits, 3, bi_buf, bi_valid);
+    cmpr_bits_add(s, 3);
+    s->bi_valid = bi_valid;
+    s->bi_buf = bi_buf;
+    Tracev((stderr, "\n--- Emit Tree: Last: %u\n", last));
+}
+
+/* ===========================================================================
+ * Align bit buffer on a byte boundary and count bits
+ */
+static inline void zng_tr_emit_align(deflate_state *s) {
+    bi_windup(s); /* align on byte boundary */
+    sent_bits_align(s);
+}
+
+/* ===========================================================================
+ * Emit an end block and align bit buffer if last block
+ */
+static inline void zng_tr_emit_end_block(deflate_state *s, const ct_data *ltree, const int last) {
+    zng_emit_end_block(s, ltree, last);
+    cmpr_bits_add(s, 7);
+    if (last)
+        zng_tr_emit_align(s);
+}
+
+#endif
diff --git a/3rdparty/zlib-ng/trees_tbl.h b/3rdparty/zlib-ng/trees_tbl.h
new file mode 100644
index 000000000000..a3912b7fd767
--- /dev/null
+++ b/3rdparty/zlib-ng/trees_tbl.h
@@ -0,0 +1,132 @@
+#ifndef TREES_TBL_H_
+#define TREES_TBL_H_
+
+/* header created automatically with maketrees.c */
+
+Z_INTERNAL const ct_data static_ltree[L_CODES+2] = {
+{{ 12},{8}}, {{140},{8}}, {{ 76},{8}}, {{204},{8}}, {{ 44},{8}},
+{{172},{8}}, {{108},{8}}, {{236},{8}}, {{ 28},{8}}, {{156},{8}},
+{{ 92},{8}}, {{220},{8}}, {{ 60},{8}}, {{188},{8}}, {{124},{8}},
+{{252},{8}}, {{  2},{8}}, {{130},{8}}, {{ 66},{8}}, {{194},{8}},
+{{ 34},{8}}, {{162},{8}}, {{ 98},{8}}, {{226},{8}}, {{ 18},{8}},
+{{146},{8}}, {{ 82},{8}}, {{210},{8}}, {{ 50},{8}}, {{178},{8}},
+{{114},{8}}, {{242},{8}}, {{ 10},{8}}, {{138},{8}}, {{ 74},{8}},
+{{202},{8}}, {{ 42},{8}}, {{170},{8}}, {{106},{8}}, {{234},{8}},
+{{ 26},{8}}, {{154},{8}}, {{ 90},{8}}, {{218},{8}}, {{ 58},{8}},
+{{186},{8}}, {{122},{8}}, {{250},{8}}, {{  6},{8}}, {{134},{8}},
+{{ 70},{8}}, {{198},{8}}, {{ 38},{8}}, {{166},{8}}, {{102},{8}},
+{{230},{8}}, {{ 22},{8}}, {{150},{8}}, {{ 86},{8}}, {{214},{8}},
+{{ 54},{8}}, {{182},{8}}, {{118},{8}}, {{246},{8}}, {{ 14},{8}},
+{{142},{8}}, {{ 78},{8}}, {{206},{8}}, {{ 46},{8}}, {{174},{8}},
+{{110},{8}}, {{238},{8}}, {{ 30},{8}}, {{158},{8}}, {{ 94},{8}},
+{{222},{8}}, {{ 62},{8}}, {{190},{8}}, {{126},{8}}, {{254},{8}},
+{{  1},{8}}, {{129},{8}}, {{ 65},{8}}, {{193},{8}}, {{ 33},{8}},
+{{161},{8}}, {{ 97},{8}}, {{225},{8}}, {{ 17},{8}}, {{145},{8}},
+{{ 81},{8}}, {{209},{8}}, {{ 49},{8}}, {{177},{8}}, {{113},{8}},
+{{241},{8}}, {{  9},{8}}, {{137},{8}}, {{ 73},{8}}, {{201},{8}},
+{{ 41},{8}}, {{169},{8}}, {{105},{8}}, {{233},{8}}, {{ 25},{8}},
+{{153},{8}}, {{ 89},{8}}, {{217},{8}}, {{ 57},{8}}, {{185},{8}},
+{{121},{8}}, {{249},{8}}, {{  5},{8}}, {{133},{8}}, {{ 69},{8}},
+{{197},{8}}, {{ 37},{8}}, {{165},{8}}, {{101},{8}}, {{229},{8}},
+{{ 21},{8}}, {{149},{8}}, {{ 85},{8}}, {{213},{8}}, {{ 53},{8}},
+{{181},{8}}, {{117},{8}}, {{245},{8}}, {{ 13},{8}}, {{141},{8}},
+{{ 77},{8}}, {{205},{8}}, {{ 45},{8}}, {{173},{8}}, {{109},{8}},
+{{237},{8}}, {{ 29},{8}}, {{157},{8}}, {{ 93},{8}}, {{221},{8}},
+{{ 61},{8}}, {{189},{8}}, {{125},{8}}, {{253},{8}}, {{ 19},{9}},
+{{275},{9}}, {{147},{9}}, {{403},{9}}, {{ 83},{9}}, {{339},{9}},
+{{211},{9}}, {{467},{9}}, {{ 51},{9}}, {{307},{9}}, {{179},{9}},
+{{435},{9}}, {{115},{9}}, {{371},{9}}, {{243},{9}}, {{499},{9}},
+{{ 11},{9}}, {{267},{9}}, {{139},{9}}, {{395},{9}}, {{ 75},{9}},
+{{331},{9}}, {{203},{9}}, {{459},{9}}, {{ 43},{9}}, {{299},{9}},
+{{171},{9}}, {{427},{9}}, {{107},{9}}, {{363},{9}}, {{235},{9}},
+{{491},{9}}, {{ 27},{9}}, {{283},{9}}, {{155},{9}}, {{411},{9}},
+{{ 91},{9}}, {{347},{9}}, {{219},{9}}, {{475},{9}}, {{ 59},{9}},
+{{315},{9}}, {{187},{9}}, {{443},{9}}, {{123},{9}}, {{379},{9}},
+{{251},{9}}, {{507},{9}}, {{  7},{9}}, {{263},{9}}, {{135},{9}},
+{{391},{9}}, {{ 71},{9}}, {{327},{9}}, {{199},{9}}, {{455},{9}},
+{{ 39},{9}}, {{295},{9}}, {{167},{9}}, {{423},{9}}, {{103},{9}},
+{{359},{9}}, {{231},{9}}, {{487},{9}}, {{ 23},{9}}, {{279},{9}},
+{{151},{9}}, {{407},{9}}, {{ 87},{9}}, {{343},{9}}, {{215},{9}},
+{{471},{9}}, {{ 55},{9}}, {{311},{9}}, {{183},{9}}, {{439},{9}},
+{{119},{9}}, {{375},{9}}, {{247},{9}}, {{503},{9}}, {{ 15},{9}},
+{{271},{9}}, {{143},{9}}, {{399},{9}}, {{ 79},{9}}, {{335},{9}},
+{{207},{9}}, {{463},{9}}, {{ 47},{9}}, {{303},{9}}, {{175},{9}},
+{{431},{9}}, {{111},{9}}, {{367},{9}}, {{239},{9}}, {{495},{9}},
+{{ 31},{9}}, {{287},{9}}, {{159},{9}}, {{415},{9}}, {{ 95},{9}},
+{{351},{9}}, {{223},{9}}, {{479},{9}}, {{ 63},{9}}, {{319},{9}},
+{{191},{9}}, {{447},{9}}, {{127},{9}}, {{383},{9}}, {{255},{9}},
+{{511},{9}}, {{  0},{7}}, {{ 64},{7}}, {{ 32},{7}}, {{ 96},{7}},
+{{ 16},{7}}, {{ 80},{7}}, {{ 48},{7}}, {{112},{7}}, {{  8},{7}},
+{{ 72},{7}}, {{ 40},{7}}, {{104},{7}}, {{ 24},{7}}, {{ 88},{7}},
+{{ 56},{7}}, {{120},{7}}, {{  4},{7}}, {{ 68},{7}}, {{ 36},{7}},
+{{100},{7}}, {{ 20},{7}}, {{ 84},{7}}, {{ 52},{7}}, {{116},{7}},
+{{  3},{8}}, {{131},{8}}, {{ 67},{8}}, {{195},{8}}, {{ 35},{8}},
+{{163},{8}}, {{ 99},{8}}, {{227},{8}}
+};
+
+Z_INTERNAL const ct_data static_dtree[D_CODES] = {
+{{ 0},{5}}, {{16},{5}}, {{ 8},{5}}, {{24},{5}}, {{ 4},{5}},
+{{20},{5}}, {{12},{5}}, {{28},{5}}, {{ 2},{5}}, {{18},{5}},
+{{10},{5}}, {{26},{5}}, {{ 6},{5}}, {{22},{5}}, {{14},{5}},
+{{30},{5}}, {{ 1},{5}}, {{17},{5}}, {{ 9},{5}}, {{25},{5}},
+{{ 5},{5}}, {{21},{5}}, {{13},{5}}, {{29},{5}}, {{ 3},{5}},
+{{19},{5}}, {{11},{5}}, {{27},{5}}, {{ 7},{5}}, {{23},{5}}
+};
+
+const unsigned char Z_INTERNAL zng_dist_code[DIST_CODE_LEN] = {
+ 0,  1,  2,  3,  4,  4,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8,
+ 8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10,
+10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13,
+13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15,
+15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  0,  0, 16, 17,
+18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29
+};
+
+const unsigned char Z_INTERNAL zng_length_code[STD_MAX_MATCH-STD_MIN_MATCH+1] = {
+ 0,  1,  2,  3,  4,  5,  6,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 12, 12,
+13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
+19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
+22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
+26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28
+};
+
+Z_INTERNAL const int base_length[LENGTH_CODES] = {
+0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
+64, 80, 96, 112, 128, 160, 192, 224, 0
+};
+
+Z_INTERNAL const int base_dist[D_CODES] = {
+    0,     1,     2,     3,     4,     6,     8,    12,    16,    24,
+   32,    48,    64,    96,   128,   192,   256,   384,   512,   768,
+ 1024,  1536,  2048,  3072,  4096,  6144,  8192, 12288, 16384, 24576
+};
+
+#endif /* TREES_TBL_H_ */
diff --git a/3rdparty/zlib-ng/uncompr.c b/3rdparty/zlib-ng/uncompr.c
new file mode 100644
index 000000000000..311eca2b06bf
--- /dev/null
+++ b/3rdparty/zlib-ng/uncompr.c
@@ -0,0 +1,80 @@
+/* uncompr.c -- decompress a memory buffer
+ * Copyright (C) 1995-2003, 2010, 2014, 2016 Jean-loup Gailly, Mark Adler.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil.h"
+
+/* ===========================================================================
+     Decompresses the source buffer into the destination buffer.  *sourceLen is
+   the byte length of the source buffer. Upon entry, *destLen is the total size
+   of the destination buffer, which must be large enough to hold the entire
+   uncompressed data. (The size of the uncompressed data must have been saved
+   previously by the compressor and transmitted to the decompressor by some
+   mechanism outside the scope of this compression library.) Upon exit,
+   *destLen is the size of the decompressed data and *sourceLen is the number
+   of source bytes consumed. Upon return, source + *sourceLen points to the
+   first unused input byte.
+
+     uncompress returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_BUF_ERROR if there was not enough room in the output buffer, or
+   Z_DATA_ERROR if the input data was corrupted, including if the input data is
+   an incomplete zlib stream.
+*/
+int Z_EXPORT PREFIX(uncompress2)(unsigned char *dest, z_uintmax_t *destLen, const unsigned char *source, z_uintmax_t *sourceLen) {
+    PREFIX3(stream) stream;
+    int err;
+    const unsigned int max = (unsigned int)-1;
+    z_uintmax_t len, left;
+    unsigned char buf[1];    /* for detection of incomplete stream when *destLen == 0 */
+
+    len = *sourceLen;
+    if (*destLen) {
+        left = *destLen;
+        *destLen = 0;
+    } else {
+        left = 1;
+        dest = buf;
+    }
+
+    stream.next_in = (z_const unsigned char *)source;
+    stream.avail_in = 0;
+    stream.zalloc = NULL;
+    stream.zfree = NULL;
+    stream.opaque = NULL;
+
+    err = PREFIX(inflateInit)(&stream);
+    if (err != Z_OK) return err;
+
+    stream.next_out = dest;
+    stream.avail_out = 0;
+
+    do {
+        if (stream.avail_out == 0) {
+            stream.avail_out = left > (unsigned long)max ? max : (unsigned int)left;
+            left -= stream.avail_out;
+        }
+        if (stream.avail_in == 0) {
+            stream.avail_in = len > (unsigned long)max ? max : (unsigned int)len;
+            len -= stream.avail_in;
+        }
+        err = PREFIX(inflate)(&stream, Z_NO_FLUSH);
+    } while (err == Z_OK);
+
+    *sourceLen -= len + stream.avail_in;
+    if (dest != buf)
+        *destLen = (z_size_t)stream.total_out;
+    else if (stream.total_out && err == Z_BUF_ERROR)
+        left = 1;
+
+    PREFIX(inflateEnd)(&stream);
+    return err == Z_STREAM_END ? Z_OK :
+           err == Z_NEED_DICT ? Z_DATA_ERROR  :
+           err == Z_BUF_ERROR && left + stream.avail_out ? Z_DATA_ERROR :
+           err;
+}
+
+int Z_EXPORT PREFIX(uncompress)(unsigned char *dest, z_uintmax_t *destLen, const unsigned char *source, z_uintmax_t sourceLen) {
+    return PREFIX(uncompress2)(dest, destLen, source, &sourceLen);
+}
diff --git a/3rdparty/zlib-ng/zbuild.h b/3rdparty/zlib-ng/zbuild.h
new file mode 100644
index 000000000000..d550b4c582c3
--- /dev/null
+++ b/3rdparty/zlib-ng/zbuild.h
@@ -0,0 +1,260 @@
+#ifndef _ZBUILD_H
+#define _ZBUILD_H
+
+#define _POSIX_SOURCE 1  /* fileno */
+#ifndef _POSIX_C_SOURCE
+#  define _POSIX_C_SOURCE 200809L /* snprintf, posix_memalign, strdup */
+#endif
+#ifndef _ISOC11_SOURCE
+#  define _ISOC11_SOURCE 1 /* aligned_alloc */
+#endif
+#ifdef __OpenBSD__
+#  define _BSD_SOURCE 1
+#endif
+
+#include <stddef.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+/* Determine compiler version of C Standard */
+#ifdef __STDC_VERSION__
+#  if __STDC_VERSION__ >= 199901L
+#    ifndef STDC99
+#      define STDC99
+#    endif
+#  endif
+#  if __STDC_VERSION__ >= 201112L
+#    ifndef STDC11
+#      define STDC11
+#    endif
+#  endif
+#endif
+
+#ifndef Z_HAS_ATTRIBUTE
+#  if defined(__has_attribute)
+#    define Z_HAS_ATTRIBUTE(a) __has_attribute(a)
+#  else
+#    define Z_HAS_ATTRIBUTE(a) 0
+#  endif
+#endif
+
+#ifndef Z_FALLTHROUGH
+#  if Z_HAS_ATTRIBUTE(__fallthrough__) || (defined(__GNUC__) && (__GNUC__ >= 7))
+#    define Z_FALLTHROUGH __attribute__((__fallthrough__))
+#  else
+#    define Z_FALLTHROUGH do {} while(0) /* fallthrough */
+#  endif
+#endif
+
+#ifndef Z_TARGET
+#  if Z_HAS_ATTRIBUTE(__target__)
+#    define Z_TARGET(x) __attribute__((__target__(x)))
+#  else
+#    define Z_TARGET(x)
+#  endif
+#endif
+
+/* This has to be first include that defines any types */
+#if defined(_MSC_VER)
+#  if defined(_WIN64)
+    typedef __int64 ssize_t;
+#  else
+    typedef long ssize_t;
+#  endif
+
+#  if defined(_WIN64)
+    #define SSIZE_MAX _I64_MAX
+#  else
+    #define SSIZE_MAX LONG_MAX
+#  endif
+#endif
+
+/* MS Visual Studio does not allow inline in C, only C++.
+   But it provides __inline instead, so use that. */
+#if defined(_MSC_VER) && !defined(inline) && !defined(__cplusplus)
+#  define inline __inline
+#endif
+
+#if defined(ZLIB_COMPAT)
+#  define PREFIX(x) x
+#  define PREFIX2(x) ZLIB_ ## x
+#  define PREFIX3(x) z_ ## x
+#  define PREFIX4(x) x ## 64
+#  define zVersion zlibVersion
+#else
+#  define PREFIX(x) zng_ ## x
+#  define PREFIX2(x) ZLIBNG_ ## x
+#  define PREFIX3(x) zng_ ## x
+#  define PREFIX4(x) zng_ ## x
+#  define zVersion zlibng_version
+#  define z_size_t size_t
+#endif
+
+/* In zlib-compat some functions and types use unsigned long, but zlib-ng use size_t */
+#if defined(ZLIB_COMPAT)
+#  define z_uintmax_t unsigned long
+#else
+#  define z_uintmax_t size_t
+#endif
+
+/* Minimum of a and b. */
+#define MIN(a, b) ((a) > (b) ? (b) : (a))
+/* Maximum of a and b. */
+#define MAX(a, b) ((a) < (b) ? (b) : (a))
+/* Ignore unused variable warning */
+#define Z_UNUSED(var) (void)(var)
+
+#if defined(HAVE_VISIBILITY_INTERNAL)
+#  define Z_INTERNAL __attribute__((visibility ("internal")))
+#elif defined(HAVE_VISIBILITY_HIDDEN)
+#  define Z_INTERNAL __attribute__((visibility ("hidden")))
+#else
+#  define Z_INTERNAL
+#endif
+
+/* Symbol versioning helpers, allowing multiple versions of a function to exist.
+ * Functions using this must also be added to zlib-ng.map for each version.
+ * Double @@ means this is the default for newly compiled applications to link against.
+ * Single @ means this is kept for backwards compatibility.
+ * This is only used for Zlib-ng native API, and only on platforms supporting this.
+ */
+#if defined(HAVE_SYMVER)
+#  define ZSYMVER(func,alias,ver) __asm__(".symver " func ", " alias "@ZLIB_NG_" ver);
+#  define ZSYMVER_DEF(func,alias,ver) __asm__(".symver " func ", " alias "@@ZLIB_NG_" ver);
+#else
+#  define ZSYMVER(func,alias,ver)
+#  define ZSYMVER_DEF(func,alias,ver)
+#endif
+
+#ifndef __cplusplus
+#  define Z_REGISTER register
+#else
+#  define Z_REGISTER
+#endif
+
+/* Reverse the bytes in a value. Use compiler intrinsics when
+   possible to take advantage of hardware implementations. */
+#if defined(_MSC_VER) && (_MSC_VER >= 1300)
+#  include <stdlib.h>
+#  pragma intrinsic(_byteswap_ulong)
+#  define ZSWAP16(q) _byteswap_ushort(q)
+#  define ZSWAP32(q) _byteswap_ulong(q)
+#  define ZSWAP64(q) _byteswap_uint64(q)
+
+#elif defined(__clang__) || (defined(__GNUC__) && \
+        (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
+#  define ZSWAP16(q) __builtin_bswap16(q)
+#  define ZSWAP32(q) __builtin_bswap32(q)
+#  define ZSWAP64(q) __builtin_bswap64(q)
+
+#elif defined(__GNUC__) && (__GNUC__ >= 2) && defined(__linux__)
+#  include <byteswap.h>
+#  define ZSWAP16(q) bswap_16(q)
+#  define ZSWAP32(q) bswap_32(q)
+#  define ZSWAP64(q) bswap_64(q)
+
+#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__DragonFly__)
+#  include <sys/endian.h>
+#  define ZSWAP16(q) bswap16(q)
+#  define ZSWAP32(q) bswap32(q)
+#  define ZSWAP64(q) bswap64(q)
+#elif defined(__OpenBSD__)
+#  include <sys/endian.h>
+#  define ZSWAP16(q) swap16(q)
+#  define ZSWAP32(q) swap32(q)
+#  define ZSWAP64(q) swap64(q)
+#elif defined(__INTEL_COMPILER)
+/* ICC does not provide a two byte swap. */
+#  define ZSWAP16(q) ((((q) & 0xff) << 8) | (((q) & 0xff00) >> 8))
+#  define ZSWAP32(q) _bswap(q)
+#  define ZSWAP64(q) _bswap64(q)
+
+#else
+#  define ZSWAP16(q) ((((q) & 0xff) << 8) | (((q) & 0xff00) >> 8))
+#  define ZSWAP32(q) ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
+                     (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
+#  define ZSWAP64(q)                           \
+         (((q & 0xFF00000000000000u) >> 56u) | \
+          ((q & 0x00FF000000000000u) >> 40u) | \
+          ((q & 0x0000FF0000000000u) >> 24u) | \
+          ((q & 0x000000FF00000000u) >> 8u)  | \
+          ((q & 0x00000000FF000000u) << 8u)  | \
+          ((q & 0x0000000000FF0000u) << 24u) | \
+          ((q & 0x000000000000FF00u) << 40u) | \
+          ((q & 0x00000000000000FFu) << 56u))
+#endif
+
+/* Only enable likely/unlikely if the compiler is known to support it */
+#if (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__INTEL_COMPILER) || defined(__clang__)
+#  define LIKELY_NULL(x)        __builtin_expect((x) != 0, 0)
+#  define LIKELY(x)             __builtin_expect(!!(x), 1)
+#  define UNLIKELY(x)           __builtin_expect(!!(x), 0)
+#else
+#  define LIKELY_NULL(x)        x
+#  define LIKELY(x)             x
+#  define UNLIKELY(x)           x
+#endif /* (un)likely */
+
+#if defined(HAVE_ATTRIBUTE_ALIGNED)
+#  define ALIGNED_(x) __attribute__ ((aligned(x)))
+#elif defined(_MSC_VER)
+#  define ALIGNED_(x) __declspec(align(x))
+#endif
+
+/* Diagnostic functions */
+#ifdef ZLIB_DEBUG
+#  include <stdio.h>
+   extern int Z_INTERNAL z_verbose;
+   extern void Z_INTERNAL z_error(const char *m);
+#  define Assert(cond, msg) {if (!(cond)) z_error(msg);}
+#  define Trace(x) {if (z_verbose >= 0) fprintf x;}
+#  define Tracev(x) {if (z_verbose > 0) fprintf x;}
+#  define Tracevv(x) {if (z_verbose > 1) fprintf x;}
+#  define Tracec(c, x) {if (z_verbose > 0 && (c)) fprintf x;}
+#  define Tracecv(c, x) {if (z_verbose > 1 && (c)) fprintf x;}
+#else
+#  define Assert(cond, msg)
+#  define Trace(x)
+#  define Tracev(x)
+#  define Tracevv(x)
+#  define Tracec(c, x)
+#  define Tracecv(c, x)
+#endif
+
+#ifndef NO_UNALIGNED
+#  if defined(__x86_64__) || defined(_M_X64) || defined(__amd64__) || defined(_M_AMD64)
+#    define UNALIGNED_OK
+#    define UNALIGNED64_OK
+#  elif defined(__i386__) || defined(__i486__) || defined(__i586__) || \
+        defined(__i686__) || defined(_X86_) || defined(_M_IX86)
+#    define UNALIGNED_OK
+#  elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#    if (defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED)) || !defined(__GNUC__)
+#      define UNALIGNED_OK
+#      define UNALIGNED64_OK
+#    endif
+#  elif defined(__arm__) || (_M_ARM >= 7)
+#    if (defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED)) || !defined(__GNUC__)
+#      define UNALIGNED_OK
+#    endif
+#  elif defined(__powerpc64__) || defined(__ppc64__)
+#    if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#      define UNALIGNED_OK
+#      define UNALIGNED64_OK
+#    endif
+#  endif
+#endif
+
+#if defined(__has_feature)
+#  if __has_feature(memory_sanitizer)
+#    define Z_MEMORY_SANITIZER 1
+#    include <sanitizer/msan_interface.h>
+#  endif
+#endif
+
+#ifndef Z_MEMORY_SANITIZER
+#  define __msan_unpoison(a, size) do { Z_UNUSED(a); Z_UNUSED(size); } while (0)
+#endif
+
+#endif
diff --git a/3rdparty/zlib-ng/zconf.h.in b/3rdparty/zlib-ng/zconf.h.in
new file mode 100644
index 000000000000..7a6e281e849d
--- /dev/null
+++ b/3rdparty/zlib-ng/zconf.h.in
@@ -0,0 +1,206 @@
+/* zconf.h -- configuration of the zlib compression library
+ * Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ZCONF_H
+#define ZCONF_H
+
+#include "zlib_name_mangling.h"
+
+#if !defined(_WIN32) && defined(__WIN32__)
+#  define _WIN32
+#endif
+
+/* Clang macro for detecting declspec support
+ * https://clang.llvm.org/docs/LanguageExtensions.html#has-declspec-attribute
+ */
+#ifndef __has_declspec_attribute
+#  define __has_declspec_attribute(x) 0
+#endif
+
+#if defined(ZLIB_CONST) && !defined(z_const)
+#  define z_const const
+#else
+#  define z_const
+#endif
+
+/* Maximum value for memLevel in deflateInit2 */
+#ifndef MAX_MEM_LEVEL
+#  define MAX_MEM_LEVEL 9
+#endif
+
+/* Maximum value for windowBits in deflateInit2 and inflateInit2.
+ * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files
+ * created by gzip. (Files created by minigzip can still be extracted by
+ * gzip.)
+ */
+#ifndef MIN_WBITS
+#  define MIN_WBITS   8  /* 256 LZ77 window */
+#endif
+#ifndef MAX_WBITS
+#  define MAX_WBITS   15 /* 32K LZ77 window */
+#endif
+
+/* The memory requirements for deflate are (in bytes):
+            (1 << (windowBits+2)) +  (1 << (memLevel+9))
+ that is: 128K for windowBits=15  +  128K for memLevel = 8  (default values)
+ plus a few kilobytes for small objects. For example, if you want to reduce
+ the default memory requirements from 256K to 128K, compile with
+     make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7"
+ Of course this will generally degrade compression (there's no free lunch).
+
+   The memory requirements for inflate are (in bytes) 1 << windowBits
+ that is, 32K for windowBits=15 (default value) plus about 7 kilobytes
+ for small objects.
+*/
+
+/* Type declarations */
+
+
+#ifndef OF /* function prototypes */
+#  define OF(args)  args
+#endif
+
+#ifdef ZLIB_INTERNAL
+#  define Z_INTERNAL ZLIB_INTERNAL
+#endif
+
+/* If building or using zlib as a DLL, define ZLIB_DLL.
+ * This is not mandatory, but it offers a little performance increase.
+ */
+#if defined(ZLIB_DLL) && (defined(_WIN32) || (__has_declspec_attribute(dllexport) && __has_declspec_attribute(dllimport)))
+#  ifdef Z_INTERNAL
+#    define Z_EXTERN extern __declspec(dllexport)
+#  else
+#    define Z_EXTERN extern __declspec(dllimport)
+#  endif
+#endif
+
+/* If building or using zlib with the WINAPI/WINAPIV calling convention,
+ * define ZLIB_WINAPI.
+ * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI.
+ */
+#if defined(ZLIB_WINAPI) && defined(_WIN32)
+#  ifndef WIN32_LEAN_AND_MEAN
+#    define WIN32_LEAN_AND_MEAN
+#  endif
+#  include <windows.h>
+   /* No need for _export, use ZLIB.DEF instead. */
+   /* For complete Windows compatibility, use WINAPI, not __stdcall. */
+#  define Z_EXPORT WINAPI
+#  define Z_EXPORTVA WINAPIV
+#endif
+
+#ifndef Z_EXTERN
+#  define Z_EXTERN extern
+#endif
+#ifndef Z_EXPORT
+#  define Z_EXPORT
+#endif
+#ifndef Z_EXPORTVA
+#  define Z_EXPORTVA
+#endif
+
+/* Conditional exports */
+#define ZNG_CONDEXPORT Z_INTERNAL
+
+/* For backwards compatibility */
+
+#ifndef ZEXTERN
+#  define ZEXTERN Z_EXTERN
+#endif
+#ifndef ZEXPORT
+#  define ZEXPORT Z_EXPORT
+#endif
+#ifndef ZEXPORTVA
+#  define ZEXPORTVA Z_EXPORTVA
+#endif
+#ifndef FAR
+#  define FAR
+#endif
+
+/* Legacy zlib typedefs for backwards compatibility. Don't assume stdint.h is defined. */
+typedef unsigned char Byte;
+typedef Byte Bytef;
+
+typedef unsigned int   uInt;  /* 16 bits or more */
+typedef unsigned long  uLong; /* 32 bits or more */
+
+typedef char  charf;
+typedef int   intf;
+typedef uInt  uIntf;
+typedef uLong uLongf;
+
+typedef void const *voidpc;
+typedef void       *voidpf;
+typedef void       *voidp;
+
+typedef unsigned int z_crc_t;
+
+#ifdef HAVE_UNISTD_H    /* may be set to #if 1 by configure/cmake/etc */
+#  define Z_HAVE_UNISTD_H
+#endif
+
+#ifdef NEED_PTRDIFF_T    /* may be set to #if 1 by configure/cmake/etc */
+typedef PTRDIFF_TYPE ptrdiff_t;
+#endif
+
+#include <sys/types.h>      /* for off_t */
+
+#include <stddef.h>         /* for wchar_t and NULL */
+
+/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and
+ * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even
+ * though the former does not conform to the LFS document), but considering
+ * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as
+ * equivalently requesting no 64-bit operations
+ */
+#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1
+#  undef _LARGEFILE64_SOURCE
+#endif
+
+#if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE)
+#  include <unistd.h>         /* for SEEK_*, off_t, and _LFS64_LARGEFILE */
+#  ifndef z_off_t
+#    define z_off_t off_t
+#  endif
+#endif
+
+#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0
+#  define Z_LFS64
+#endif
+
+#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64)
+#  define Z_LARGE64
+#endif
+
+#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64)
+#  define Z_WANT64
+#endif
+
+#if !defined(SEEK_SET)
+#  define SEEK_SET        0       /* Seek from beginning of file.  */
+#  define SEEK_CUR        1       /* Seek from current position.  */
+#  define SEEK_END        2       /* Set file pointer to EOF plus "offset" */
+#endif
+
+#ifndef z_off_t
+#  define z_off_t long
+#endif
+
+#if !defined(_WIN32) && defined(Z_LARGE64)
+#  define z_off64_t off64_t
+#else
+#  if defined(__MSYS__)
+#    define z_off64_t _off64_t
+#  elif defined(_WIN32) && !defined(__GNUC__)
+#    define z_off64_t __int64
+#  else
+#    define z_off64_t z_off_t
+#  endif
+#endif
+
+typedef size_t z_size_t;
+
+#endif /* ZCONF_H */
diff --git a/3rdparty/zlib-ng/zendian.h b/3rdparty/zlib-ng/zendian.h
new file mode 100644
index 000000000000..28177a609ff1
--- /dev/null
+++ b/3rdparty/zlib-ng/zendian.h
@@ -0,0 +1,60 @@
+/* zendian.h -- define BYTE_ORDER for endian tests
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ENDIAN_H_
+#define ENDIAN_H_
+
+/* First check whether the compiler knows the target __BYTE_ORDER__. */
+#if defined(__BYTE_ORDER__)
+#  if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#    if !defined(LITTLE_ENDIAN)
+#      define LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__
+#    endif
+#    if !defined(BYTE_ORDER)
+#      define BYTE_ORDER LITTLE_ENDIAN
+#    endif
+#  elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#    if !defined(BIG_ENDIAN)
+#      define BIG_ENDIAN __ORDER_BIG_ENDIAN__
+#    endif
+#    if !defined(BYTE_ORDER)
+#      define BYTE_ORDER BIG_ENDIAN
+#    endif
+#  endif
+#elif defined(__MINGW32__)
+#  include <sys/param.h>
+#elif defined(_WIN32)
+#  define LITTLE_ENDIAN 1234
+#  define BIG_ENDIAN 4321
+#  if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined (_M_ARM) || defined (_M_ARM64) || defined (_M_ARM64EC)
+#    define BYTE_ORDER LITTLE_ENDIAN
+#  else
+#    error Unknown endianness!
+#  endif
+#elif defined(__linux__)
+#  include <endian.h>
+#elif defined(__APPLE__)
+#  include <machine/endian.h>
+#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) || defined(__DragonFly__)
+#  include <sys/endian.h>
+#elif defined(__sun) || defined(sun)
+#  include <sys/byteorder.h>
+#  if !defined(LITTLE_ENDIAN)
+#    define LITTLE_ENDIAN 4321
+#   endif
+#  if !defined(BIG_ENDIAN)
+#    define BIG_ENDIAN 1234
+#  endif
+#  if !defined(BYTE_ORDER)
+#    if defined(_BIG_ENDIAN)
+#      define BYTE_ORDER BIG_ENDIAN
+#    else
+#      define BYTE_ORDER LITTLE_ENDIAN
+#    endif
+#  endif
+#else
+#  include <endian.h>
+#endif
+
+#endif
diff --git a/3rdparty/zlib-ng/zlib.h.in b/3rdparty/zlib-ng/zlib.h.in
new file mode 100644
index 000000000000..eabb94afe09c
--- /dev/null
+++ b/3rdparty/zlib-ng/zlib.h.in
@@ -0,0 +1,1859 @@
+#ifndef ZLIB_H_
+#define ZLIB_H_
+/* zlib.h -- interface of the 'zlib-ng' compression library
+   Forked from and compatible with zlib 1.2.13
+
+  Copyright (C) 1995-2022 Jean-loup Gailly and Mark Adler
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Jean-loup Gailly        Mark Adler
+  jloup@gzip.org          madler@alumni.caltech.edu
+
+
+  The data format used by the zlib library is described by RFCs (Request for
+  Comments) 1950 to 1952 in the files https://tools.ietf.org/html/rfc1950
+  (zlib format), rfc1951 (deflate format) and rfc1952 (gzip format).
+*/
+
+#ifdef ZNGLIB_H_
+#  error Include zlib-ng.h for zlib-ng API or zlib.h for zlib-compat API but not both
+#endif
+
+#ifndef RC_INVOKED
+#include <stdint.h>
+#include <stdarg.h>
+
+#include "zconf.h"
+
+#ifndef ZCONF_H
+#  error Missing zconf.h add binary output directory to include directories
+#endif
+#endif  /* RC_INVOKED */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZLIBNG_VERSION "2.1.6"
+#define ZLIBNG_VERNUM 0x020106F0L   /* MMNNRRSM: major minor revision status modified */
+#define ZLIBNG_VER_MAJOR 2
+#define ZLIBNG_VER_MINOR 1
+#define ZLIBNG_VER_REVISION 6
+#define ZLIBNG_VER_STATUS F         /* 0=devel, 1-E=beta, F=Release (DEPRECATED) */
+#define ZLIBNG_VER_STATUSH 0xF      /* Hex values: 0=devel, 1-E=beta, F=Release */
+#define ZLIBNG_VER_MODIFIED 0       /* non-zero if modified externally from zlib-ng */
+
+#define ZLIB_VERSION "1.3.0.zlib-ng"
+#define ZLIB_VERNUM 0x130f
+#define ZLIB_VER_MAJOR 1
+#define ZLIB_VER_MINOR 3
+#define ZLIB_VER_REVISION 0
+#define ZLIB_VER_SUBREVISION 15    /* 15=fork (0xf) */
+
+/*
+    The 'zlib' compression library provides in-memory compression and
+  decompression functions, including integrity checks of the uncompressed data.
+  This version of the library supports only one compression method (deflation)
+  but other algorithms will be added later and will have the same stream
+  interface.
+
+    Compression can be done in a single step if the buffers are large enough,
+  or can be done by repeated calls of the compression function.  In the latter
+  case, the application must provide more input and/or consume the output
+  (providing more output space) before each call.
+
+    The compressed data format used by default by the in-memory functions is
+  the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
+  around a deflate stream, which is itself documented in RFC 1951.
+
+    The library also supports reading and writing files in gzip (.gz) format
+  with an interface similar to that of stdio using the functions that start
+  with "gz".  The gzip format is different from the zlib format.  gzip is a
+  gzip wrapper, documented in RFC 1952, wrapped around a deflate stream.
+
+    This library can optionally read and write gzip and raw deflate streams in
+  memory as well.
+
+    The zlib format was designed to be compact and fast for use in memory
+  and on communications channels.  The gzip format was designed for single-
+  file compression on file systems, has a larger header than zlib to maintain
+  directory information, and uses a different, slower check method than zlib.
+
+    The library does not install any signal handler.  The decoder checks
+  the consistency of the compressed data, so the library should never crash
+  even in the case of corrupted input.
+*/
+
+typedef void *(*alloc_func) (void *opaque, unsigned int items, unsigned int size);
+typedef void  (*free_func)  (void *opaque, void *address);
+
+struct internal_state;
+
+typedef struct z_stream_s {
+    z_const unsigned char *next_in;   /* next input byte */
+    uint32_t              avail_in;   /* number of bytes available at next_in */
+    unsigned long         total_in;   /* total number of input bytes read so far */
+
+    unsigned char         *next_out;  /* next output byte will go here */
+    uint32_t              avail_out;  /* remaining free space at next_out */
+    unsigned long         total_out;  /* total number of bytes output so far */
+
+    z_const char          *msg;       /* last error message, NULL if no error */
+    struct internal_state *state;     /* not visible by applications */
+
+    alloc_func            zalloc;     /* used to allocate the internal state */
+    free_func             zfree;      /* used to free the internal state */
+    void                  *opaque;    /* private data object passed to zalloc and zfree */
+
+    int                   data_type;  /* best guess about the data type: binary or text
+                                         for deflate, or the decoding state for inflate */
+    unsigned long         adler;      /* Adler-32 or CRC-32 value of the uncompressed data */
+    unsigned long         reserved;   /* reserved for future use */
+} z_stream;
+
+typedef z_stream *z_streamp;  /* Obsolete type, retained for compatibility only */
+
+/*
+    gzip header information passed to and from zlib routines.  See RFC 1952
+  for more details on the meanings of these fields.
+*/
+typedef struct gz_header_s {
+    int             text;       /* true if compressed data believed to be text */
+    unsigned long   time;       /* modification time */
+    int             xflags;     /* extra flags (not used when writing a gzip file) */
+    int             os;         /* operating system */
+    unsigned char   *extra;     /* pointer to extra field or NULL if none */
+    unsigned int    extra_len;  /* extra field length (valid if extra != NULL) */
+    unsigned int    extra_max;  /* space at extra (only when reading header) */
+    unsigned char   *name;      /* pointer to zero-terminated file name or NULL */
+    unsigned int    name_max;   /* space at name (only when reading header) */
+    unsigned char   *comment;   /* pointer to zero-terminated comment or NULL */
+    unsigned int    comm_max;   /* space at comment (only when reading header) */
+    int             hcrc;       /* true if there was or will be a header crc */
+    int             done;       /* true when done reading gzip header (not used when writing a gzip file) */
+} gz_header;
+
+typedef gz_header *gz_headerp;
+
+/*
+     The application must update next_in and avail_in when avail_in has dropped
+   to zero.  It must update next_out and avail_out when avail_out has dropped
+   to zero.  The application must initialize zalloc, zfree and opaque before
+   calling the init function.  All other fields are set by the compression
+   library and must not be updated by the application.
+
+     The opaque value provided by the application will be passed as the first
+   parameter for calls of zalloc and zfree.  This can be useful for custom
+   memory management.  The compression library attaches no meaning to the
+   opaque value.
+
+     zalloc must return NULL if there is not enough memory for the object.
+   If zlib is used in a multi-threaded application, zalloc and zfree must be
+   thread safe.  In that case, zlib is thread-safe.  When zalloc and zfree are
+   Z_NULL on entry to the initialization function, they are set to internal
+   routines that use the standard library functions malloc() and free().
+
+     The fields total_in and total_out can be used for statistics or progress
+   reports.  After compression, total_in holds the total size of the
+   uncompressed data and may be saved for use by the decompressor (particularly
+   if the decompressor wants to decompress everything in a single step).
+*/
+
+                        /* constants */
+
+#define Z_NO_FLUSH      0
+#define Z_PARTIAL_FLUSH 1
+#define Z_SYNC_FLUSH    2
+#define Z_FULL_FLUSH    3
+#define Z_FINISH        4
+#define Z_BLOCK         5
+#define Z_TREES         6
+/* Allowed flush values; see deflate() and inflate() below for details */
+
+#define Z_OK            0
+#define Z_STREAM_END    1
+#define Z_NEED_DICT     2
+#define Z_ERRNO        (-1)
+#define Z_STREAM_ERROR (-2)
+#define Z_DATA_ERROR   (-3)
+#define Z_MEM_ERROR    (-4)
+#define Z_BUF_ERROR    (-5)
+#define Z_VERSION_ERROR (-6)
+/* Return codes for the compression/decompression functions. Negative values
+ * are errors, positive values are used for special but normal events.
+ */
+
+#define Z_NO_COMPRESSION         0
+#define Z_BEST_SPEED             1
+#define Z_BEST_COMPRESSION       9
+#define Z_DEFAULT_COMPRESSION  (-1)
+/* compression levels */
+
+#define Z_FILTERED            1
+#define Z_HUFFMAN_ONLY        2
+#define Z_RLE                 3
+#define Z_FIXED               4
+#define Z_DEFAULT_STRATEGY    0
+/* compression strategy; see deflateInit2() below for details */
+
+#define Z_BINARY   0
+#define Z_TEXT     1
+#define Z_ASCII    Z_TEXT   /* for compatibility with 1.2.2 and earlier */
+#define Z_UNKNOWN  2
+/* Possible values of the data_type field for deflate() */
+
+#define Z_DEFLATED   8
+/* The deflate compression method (the only one supported in this version) */
+
+#define Z_NULL  NULL  /* for compatibility with zlib, was for initializing zalloc, zfree, opaque */
+
+#define zlib_version zlibVersion()
+/* for compatibility with versions < 1.0.2 */
+
+
+                        /* basic functions */
+
+Z_EXTERN const char * Z_EXPORT zlibVersion(void);
+/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
+   If the first character differs, the library code actually used is not
+   compatible with the zlib.h header file used by the application.  This check
+   is automatically made by deflateInit and inflateInit.
+ */
+
+/*
+Z_EXTERN int Z_EXPORT deflateInit (z_stream *strm, int level);
+
+     Initializes the internal stream state for compression.  The fields
+   zalloc, zfree and opaque must be initialized before by the caller.  If
+   zalloc and zfree are set to Z_NULL, deflateInit updates them to use default
+   allocation functions.  total_in, total_out, adler, and msg are initialized.
+
+     The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
+   1 gives best speed, 9 gives best compression, 0 gives no compression at all
+   (the input data is simply copied a block at a time).  Z_DEFAULT_COMPRESSION
+   requests a default compromise between speed and compression (currently
+   equivalent to level 6).
+
+     deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_STREAM_ERROR if level is not a valid compression level, or
+   Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
+   with the version assumed by the caller (ZLIB_VERSION).  msg is set to null
+   if there is no error message.  deflateInit does not perform any compression:
+   this will be done by deflate().
+*/
+
+
+Z_EXTERN int Z_EXPORT deflate(z_stream *strm, int flush);
+/*
+    deflate compresses as much data as possible, and stops when the input
+  buffer becomes empty or the output buffer becomes full.  It may introduce
+  some output latency (reading input without producing any output) except when
+  forced to flush.
+
+    The detailed semantics are as follows.  deflate performs one or both of the
+  following actions:
+
+  - Compress more input starting at next_in and update next_in and avail_in
+    accordingly.  If not all input can be processed (because there is not
+    enough room in the output buffer), next_in and avail_in are updated and
+    processing will resume at this point for the next call of deflate().
+
+  - Generate more output starting at next_out and update next_out and avail_out
+    accordingly.  This action is forced if the parameter flush is non zero.
+    Forcing flush frequently degrades the compression ratio, so this parameter
+    should be set only when necessary.  Some output may be provided even if
+    flush is zero.
+
+    Before the call of deflate(), the application should ensure that at least
+  one of the actions is possible, by providing more input and/or consuming more
+  output, and updating avail_in or avail_out accordingly; avail_out should
+  never be zero before the call.  The application can consume the compressed
+  output when it wants, for example when the output buffer is full (avail_out
+  == 0), or after each call of deflate().  If deflate returns Z_OK and with
+  zero avail_out, it must be called again after making room in the output
+  buffer because there might be more output pending. See deflatePending(),
+  which can be used if desired to determine whether or not there is more output
+  in that case.
+
+    Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to
+  decide how much data to accumulate before producing output, in order to
+  maximize compression.
+
+    If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
+  flushed to the output buffer and the output is aligned on a byte boundary, so
+  that the decompressor can get all input data available so far.  (In
+  particular avail_in is zero after the call if enough output space has been
+  provided before the call.) Flushing may degrade compression for some
+  compression algorithms and so it should be used only when necessary.  This
+  completes the current deflate block and follows it with an empty stored block
+  that is three bits plus filler bits to the next byte, followed by four bytes
+  (00 00 ff ff).
+
+    If flush is set to Z_PARTIAL_FLUSH, all pending output is flushed to the
+  output buffer, but the output is not aligned to a byte boundary.  All of the
+  input data so far will be available to the decompressor, as for Z_SYNC_FLUSH.
+  This completes the current deflate block and follows it with an empty fixed
+  codes block that is 10 bits long.  This assures that enough bytes are output
+  in order for the decompressor to finish the block before the empty fixed
+  codes block.
+
+    If flush is set to Z_BLOCK, a deflate block is completed and emitted, as
+  for Z_SYNC_FLUSH, but the output is not aligned on a byte boundary, and up to
+  seven bits of the current block are held to be written as the next byte after
+  the next deflate block is completed.  In this case, the decompressor may not
+  be provided enough bits at this point in order to complete decompression of
+  the data provided so far to the compressor.  It may need to wait for the next
+  block to be emitted.  This is for advanced applications that need to control
+  the emission of deflate blocks.
+
+    If flush is set to Z_FULL_FLUSH, all output is flushed as with
+  Z_SYNC_FLUSH, and the compression state is reset so that decompression can
+  restart from this point if previous compressed data has been damaged or if
+  random access is desired.  Using Z_FULL_FLUSH too often can seriously degrade
+  compression.
+
+    If deflate returns with avail_out == 0, this function must be called again
+  with the same value of the flush parameter and more output space (updated
+  avail_out), until the flush is complete (deflate returns with non-zero
+  avail_out).  In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
+  avail_out is greater than six when the flush marker begins, in order to avoid
+  repeated flush markers upon calling deflate() again when avail_out == 0.
+
+    If the parameter flush is set to Z_FINISH, pending input is processed,
+  pending output is flushed and deflate returns with Z_STREAM_END if there was
+  enough output space.  If deflate returns with Z_OK or Z_BUF_ERROR, this
+  function must be called again with Z_FINISH and more output space (updated
+  avail_out) but no more input data, until it returns with Z_STREAM_END or an
+  error.  After deflate has returned Z_STREAM_END, the only possible operations
+  on the stream are deflateReset or deflateEnd.
+
+    Z_FINISH can be used in the first deflate call after deflateInit if all the
+  compression is to be done in a single step.  In order to complete in one
+  call, avail_out must be at least the value returned by deflateBound (see
+  below).  Then deflate is guaranteed to return Z_STREAM_END.  If not enough
+  output space is provided, deflate will not return Z_STREAM_END, and it must
+  be called again as described above.
+
+    deflate() sets strm->adler to the Adler-32 checksum of all input read
+  so far (that is, total_in bytes).  If a gzip stream is being generated, then
+  strm->adler will be the CRC-32 checksum of the input read so far.  (See
+  deflateInit2 below.)
+
+    deflate() may update strm->data_type if it can make a good guess about
+  the input data type (Z_BINARY or Z_TEXT).  If in doubt, the data is
+  considered binary.  This field is only for information purposes and does not
+  affect the compression algorithm in any manner.
+
+    deflate() returns Z_OK if some progress has been made (more input
+  processed or more output produced), Z_STREAM_END if all input has been
+  consumed and all output has been produced (only when flush is set to
+  Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
+  if next_in or next_out was NULL) or the state was inadvertently written over
+  by the application), or Z_BUF_ERROR if no progress is possible (for example
+  avail_in or avail_out was zero).  Note that Z_BUF_ERROR is not fatal, and
+  deflate() can be called again with more input and more output space to
+  continue compressing.
+*/
+
+
+Z_EXTERN int Z_EXPORT deflateEnd(z_stream *strm);
+/*
+     All dynamically allocated data structures for this stream are freed.
+   This function discards any unprocessed input and does not flush any pending
+   output.
+
+     deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
+   stream state was inconsistent, Z_DATA_ERROR if the stream was freed
+   prematurely (some input or output was discarded).  In the error case, msg
+   may be set but then points to a static string (which must not be
+   deallocated).
+*/
+
+
+/*
+Z_EXTERN int Z_EXPORT inflateInit (z_stream *strm);
+
+     Initializes the internal stream state for decompression.  The fields
+   next_in, avail_in, zalloc, zfree and opaque must be initialized before by
+   the caller.  In the current version of inflate, the provided input is not
+   read or consumed.  The allocation of a sliding window will be deferred to
+   the first call of inflate (if the decompression does not complete on the
+   first call).  If zalloc and zfree are set to Z_NULL, inflateInit updates
+   them to use default allocation functions.  total_in, total_out, adler, and
+   msg are initialized.
+
+     inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+   version assumed by the caller, or Z_STREAM_ERROR if the parameters are
+   invalid, such as a null pointer to the structure.  msg is set to null if
+   there is no error message.  inflateInit does not perform any decompression.
+   Actual decompression will be done by inflate().  So next_in, and avail_in,
+   next_out, and avail_out are unused and unchanged.  The current
+   implementation of inflateInit() does not process any header information --
+   that is deferred until inflate() is called.
+*/
+
+
+Z_EXTERN int Z_EXPORT inflate(z_stream *strm, int flush);
+/*
+    inflate decompresses as much data as possible, and stops when the input
+  buffer becomes empty or the output buffer becomes full.  It may introduce
+  some output latency (reading input without producing any output) except when
+  forced to flush.
+
+  The detailed semantics are as follows.  inflate performs one or both of the
+  following actions:
+
+  - Decompress more input starting at next_in and update next_in and avail_in
+    accordingly.  If not all input can be processed (because there is not
+    enough room in the output buffer), then next_in and avail_in are updated
+    accordingly, and processing will resume at this point for the next call of
+    inflate().
+
+  - Generate more output starting at next_out and update next_out and avail_out
+    accordingly.  inflate() provides as much output as possible, until there is
+    no more input data or no more space in the output buffer (see below about
+    the flush parameter).
+
+    Before the call of inflate(), the application should ensure that at least
+  one of the actions is possible, by providing more input and/or consuming more
+  output, and updating the next_* and avail_* values accordingly.  If the
+  caller of inflate() does not provide both available input and available
+  output space, it is possible that there will be no progress made.  The
+  application can consume the uncompressed output when it wants, for example
+  when the output buffer is full (avail_out == 0), or after each call of
+  inflate().  If inflate returns Z_OK and with zero avail_out, it must be
+  called again after making room in the output buffer because there might be
+  more output pending.
+
+    The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, Z_FINISH,
+  Z_BLOCK, or Z_TREES.  Z_SYNC_FLUSH requests that inflate() flush as much
+  output as possible to the output buffer.  Z_BLOCK requests that inflate()
+  stop if and when it gets to the next deflate block boundary.  When decoding
+  the zlib or gzip format, this will cause inflate() to return immediately
+  after the header and before the first block.  When doing a raw inflate,
+  inflate() will go ahead and process the first block, and will return when it
+  gets to the end of that block, or when it runs out of data.
+
+    The Z_BLOCK option assists in appending to or combining deflate streams.
+  To assist in this, on return inflate() always sets strm->data_type to the
+  number of unused bits in the last byte taken from strm->next_in, plus 64 if
+  inflate() is currently decoding the last block in the deflate stream, plus
+  128 if inflate() returned immediately after decoding an end-of-block code or
+  decoding the complete header up to just before the first byte of the deflate
+  stream.  The end-of-block will not be indicated until all of the uncompressed
+  data from that block has been written to strm->next_out.  The number of
+  unused bits may in general be greater than seven, except when bit 7 of
+  data_type is set, in which case the number of unused bits will be less than
+  eight.  data_type is set as noted here every time inflate() returns for all
+  flush options, and so can be used to determine the amount of currently
+  consumed input in bits.
+
+    The Z_TREES option behaves as Z_BLOCK does, but it also returns when the
+  end of each deflate block header is reached, before any actual data in that
+  block is decoded.  This allows the caller to determine the length of the
+  deflate block header for later use in random access within a deflate block.
+  256 is added to the value of strm->data_type when inflate() returns
+  immediately after reaching the end of the deflate block header.
+
+    inflate() should normally be called until it returns Z_STREAM_END or an
+  error.  However if all decompression is to be performed in a single step (a
+  single call of inflate), the parameter flush should be set to Z_FINISH.  In
+  this case all pending input is processed and all pending output is flushed;
+  avail_out must be large enough to hold all of the uncompressed data for the
+  operation to complete.  (The size of the uncompressed data may have been
+  saved by the compressor for this purpose.)  The use of Z_FINISH is not
+  required to perform an inflation in one step.  However it may be used to
+  inform inflate that a faster approach can be used for the single inflate()
+  call.  Z_FINISH also informs inflate to not maintain a sliding window if the
+  stream completes, which reduces inflate's memory footprint.  If the stream
+  does not complete, either because not all of the stream is provided or not
+  enough output space is provided, then a sliding window will be allocated and
+  inflate() can be called again to continue the operation as if Z_NO_FLUSH had
+  been used.
+
+     In this implementation, inflate() always flushes as much output as
+  possible to the output buffer, and always uses the faster approach on the
+  first call.  So the effects of the flush parameter in this implementation are
+  on the return value of inflate() as noted below, when inflate() returns early
+  when Z_BLOCK or Z_TREES is used, and when inflate() avoids the allocation of
+  memory for a sliding window when Z_FINISH is used.
+
+     If a preset dictionary is needed after this call (see inflateSetDictionary
+  below), inflate sets strm->adler to the Adler-32 checksum of the dictionary
+  chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
+  strm->adler to the Adler-32 checksum of all output produced so far (that is,
+  total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
+  below.  At the end of the stream, inflate() checks that its computed Adler-32
+  checksum is equal to that saved by the compressor and returns Z_STREAM_END
+  only if the checksum is correct.
+
+    inflate() can decompress and check either zlib-wrapped or gzip-wrapped
+  deflate data.  The header type is detected automatically, if requested when
+  initializing with inflateInit2().  Any information contained in the gzip
+  header is not retained unless inflateGetHeader() is used.  When processing
+  gzip-wrapped deflate data, strm->adler32 is set to the CRC-32 of the output
+  produced so far.  The CRC-32 is checked against the gzip trailer, as is the
+  uncompressed length, modulo 2^32.
+
+    inflate() returns Z_OK if some progress has been made (more input processed
+  or more output produced), Z_STREAM_END if the end of the compressed data has
+  been reached and all uncompressed output has been produced, Z_NEED_DICT if a
+  preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
+  corrupted (input stream not conforming to the zlib format or incorrect check
+  value, in which case strm->msg points to a string with a more specific
+  error), Z_STREAM_ERROR if the stream structure was inconsistent (for example
+  next_in or next_out was NULL, or the state was inadvertently written over
+  by the application), Z_MEM_ERROR if there was not enough memory, Z_BUF_ERROR
+  if no progress is possible or if there was not enough room in the output
+  buffer when Z_FINISH is used.  Note that Z_BUF_ERROR is not fatal, and
+  inflate() can be called again with more input and more output space to
+  continue decompressing.  If Z_DATA_ERROR is returned, the application may
+  then call inflateSync() to look for a good compression block if a partial
+  recovery of the data is to be attempted.
+*/
+
+
+Z_EXTERN int Z_EXPORT inflateEnd(z_stream *strm);
+/*
+     All dynamically allocated data structures for this stream are freed.
+   This function discards any unprocessed input and does not flush any pending
+   output.
+
+     inflateEnd returns Z_OK if success, or Z_STREAM_ERROR if the stream state
+   was inconsistent.
+*/
+
+
+                        /* Advanced functions */
+
+/*
+    The following functions are needed only in some special applications.
+*/
+
+/*
+Z_EXTERN int Z_EXPORT deflateInit2 (z_stream *strm,
+                                     int  level,
+                                     int  method,
+                                     int  windowBits,
+                                     int  memLevel,
+                                     int  strategy);
+
+     This is another version of deflateInit with more compression options.  The
+   fields zalloc, zfree and opaque must be initialized before by the caller.
+
+     The method parameter is the compression method.  It must be Z_DEFLATED in
+   this version of the library.
+
+     The windowBits parameter is the base two logarithm of the window size
+   (the size of the history buffer).  It should be in the range 8..15 for this
+   version of the library.  Larger values of this parameter result in better
+   compression at the expense of memory usage.  The default value is 15 if
+   deflateInit is used instead.
+
+     For the current implementation of deflate(), a windowBits value of 8 (a
+   window size of 256 bytes) is not supported.  As a result, a request for 8
+   will result in 9 (a 512-byte window).  In that case, providing 8 to
+   inflateInit2() will result in an error when the zlib header with 9 is
+   checked against the initialization of inflate().  The remedy is to not use 8
+   with deflateInit2() with this initialization, or at least in that case use 9
+   with inflateInit2().
+
+     windowBits can also be -8..-15 for raw deflate.  In this case, -windowBits
+   determines the window size.  deflate() will then generate raw deflate data
+   with no zlib header or trailer, and will not compute a check value.
+
+     windowBits can also be greater than 15 for optional gzip encoding.  Add
+   16 to windowBits to write a simple gzip header and trailer around the
+   compressed data instead of a zlib wrapper.  The gzip header will have no
+   file name, no extra data, no comment, no modification time (set to zero), no
+   header crc, and the operating system will be set to the appropriate value,
+   if the operating system was determined at compile time.  If a gzip stream is
+   being written, strm->adler is a CRC-32 instead of an Adler-32.
+
+     For raw deflate or gzip encoding, a request for a 256-byte window is
+   rejected as invalid, since only the zlib header provides a means of
+   transmitting the window size to the decompressor.
+
+     The memLevel parameter specifies how much memory should be allocated
+   for the internal compression state.  memLevel=1 uses minimum memory but is
+   slow and reduces compression ratio; memLevel=9 uses maximum memory for
+   optimal speed.  The default value is 8.  See zconf.h for total memory usage
+   as a function of windowBits and memLevel.
+
+     The strategy parameter is used to tune the compression algorithm.  Use the
+   value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
+   filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no
+   string match), or Z_RLE to limit match distances to one (run-length
+   encoding).  Filtered data consists mostly of small values with a somewhat
+   random distribution.  In this case, the compression algorithm is tuned to
+   compress them better.  The effect of Z_FILTERED is to force more Huffman
+   coding and less string matching; it is somewhat intermediate between
+   Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY.  Z_RLE is designed to be almost as
+   fast as Z_HUFFMAN_ONLY, but give better compression for PNG image data.  The
+   strategy parameter only affects the compression ratio but not the
+   correctness of the compressed output even if it is not set appropriately.
+   Z_FIXED prevents the use of dynamic Huffman codes, allowing for a simpler
+   decoder for special applications.
+
+     deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_STREAM_ERROR if any parameter is invalid (such as an invalid
+   method), or Z_VERSION_ERROR if the zlib library version (zlib_version) is
+   incompatible with the version assumed by the caller (ZLIB_VERSION).  msg is
+   set to null if there is no error message.  deflateInit2 does not perform any
+   compression: this will be done by deflate().
+*/
+
+Z_EXTERN int Z_EXPORT deflateSetDictionary(z_stream *strm,
+                                             const unsigned char *dictionary,
+                                             unsigned int dictLength);
+/*
+     Initializes the compression dictionary from the given byte sequence
+   without producing any compressed output.  When using the zlib format, this
+   function must be called immediately after deflateInit, deflateInit2 or
+   deflateReset, and before any call of deflate.  When doing raw deflate, this
+   function must be called either before any call of deflate, or immediately
+   after the completion of a deflate block, i.e. after all input has been
+   consumed and all output has been delivered when using any of the flush
+   options Z_BLOCK, Z_PARTIAL_FLUSH, Z_SYNC_FLUSH, or Z_FULL_FLUSH.  The
+   compressor and decompressor must use exactly the same dictionary (see
+   inflateSetDictionary).
+
+     The dictionary should consist of strings (byte sequences) that are likely
+   to be encountered later in the data to be compressed, with the most commonly
+   used strings preferably put towards the end of the dictionary.  Using a
+   dictionary is most useful when the data to be compressed is short and can be
+   predicted with good accuracy; the data can then be compressed better than
+   with the default empty dictionary.
+
+     Depending on the size of the compression data structures selected by
+   deflateInit or deflateInit2, a part of the dictionary may in effect be
+   discarded, for example if the dictionary is larger than the window size
+   provided in deflateInit or deflateInit2.  Thus the strings most likely to be
+   useful should be put at the end of the dictionary, not at the front.  In
+   addition, the current implementation of deflate will use at most the window
+   size minus 262 bytes of the provided dictionary.
+
+     Upon return of this function, strm->adler is set to the Adler-32 value
+   of the dictionary; the decompressor may later use this value to determine
+   which dictionary has been used by the compressor.  (The Adler-32 value
+   applies to the whole dictionary even if only a subset of the dictionary is
+   actually used by the compressor.) If a raw deflate was requested, then the
+   Adler-32 value is not computed and strm->adler is not set.
+
+     deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
+   parameter is invalid (e.g.  dictionary being NULL) or the stream state is
+   inconsistent (for example if deflate has already been called for this stream
+   or if not at a block boundary for raw deflate).  deflateSetDictionary does
+   not perform any compression: this will be done by deflate().
+*/
+
+Z_EXTERN int Z_EXPORT deflateGetDictionary (z_stream *strm, unsigned char *dictionary, unsigned int *dictLength);
+/*
+     Returns the sliding dictionary being maintained by deflate.  dictLength is
+   set to the number of bytes in the dictionary, and that many bytes are copied
+   to dictionary.  dictionary must have enough space, where 32768 bytes is
+   always enough.  If deflateGetDictionary() is called with dictionary equal to
+   Z_NULL, then only the dictionary length is returned, and nothing is copied.
+   Similarly, if dictLength is Z_NULL, then it is not set.
+
+     deflateGetDictionary() may return a length less than the window size, even
+   when more than the window size in input has been provided. It may return up
+   to 258 bytes less in that case, due to how zlib's implementation of deflate
+   manages the sliding window and lookahead for matches, where matches can be
+   up to 258 bytes long. If the application needs the last window-size bytes of
+   input, then that would need to be saved by the application outside of zlib.
+
+     deflateGetDictionary returns Z_OK on success, or Z_STREAM_ERROR if the
+   stream state is inconsistent.
+*/
+
+Z_EXTERN int Z_EXPORT deflateCopy(z_stream *dest, z_stream *source);
+/*
+     Sets the destination stream as a complete copy of the source stream.
+
+     This function can be useful when several compression strategies will be
+   tried, for example when there are several ways of pre-processing the input
+   data with a filter.  The streams that will be discarded should then be freed
+   by calling deflateEnd.  Note that deflateCopy duplicates the internal
+   compression state which can be quite large, so this strategy is slow and can
+   consume lots of memory.
+
+     deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+   (such as zalloc being NULL).  msg is left unchanged in both source and
+   destination.
+*/
+
+Z_EXTERN int Z_EXPORT deflateReset(z_stream *strm);
+/*
+     This function is equivalent to deflateEnd followed by deflateInit, but
+   does not free and reallocate the internal compression state.  The stream
+   will leave the compression level and any other attributes that may have been
+   set unchanged.  total_in, total_out, adler, and msg are initialized.
+
+     deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being NULL).
+*/
+
+Z_EXTERN int Z_EXPORT deflateParams(z_stream *strm, int level, int strategy);
+/*
+     Dynamically update the compression level and compression strategy.  The
+   interpretation of level and strategy is as in deflateInit2().  This can be
+   used to switch between compression and straight copy of the input data, or
+   to switch to a different kind of input data requiring a different strategy.
+   If the compression approach (which is a function of the level) or the
+   strategy is changed, and if there have been any deflate() calls since the
+   state was initialized or reset, then the input available so far is
+   compressed with the old level and strategy using deflate(strm, Z_BLOCK).
+   There are three approaches for the compression levels 0, 1..3, and 4..9
+   respectively.  The new level and strategy will take effect at the next call
+   of deflate().
+
+     If a deflate(strm, Z_BLOCK) is performed by deflateParams(), and it does
+   not have enough output space to complete, then the parameter change will not
+   take effect.  In this case, deflateParams() can be called again with the
+   same parameters and more output space to try again.
+
+     In order to assure a change in the parameters on the first try, the
+   deflate stream should be flushed using deflate() with Z_BLOCK or other flush
+   request until strm.avail_out is not zero, before calling deflateParams().
+   Then no more input data should be provided before the deflateParams() call.
+   If this is done, the old level and strategy will be applied to the data
+   compressed before deflateParams(), and the new level and strategy will be
+   applied to the data compressed after deflateParams().
+
+     deflateParams returns Z_OK on success, Z_STREAM_ERROR if the source stream
+   state was inconsistent or if a parameter was invalid, or Z_BUF_ERROR if
+   there was not enough output space to complete the compression of the
+   available input data before a change in the strategy or approach.  Note that
+   in the case of a Z_BUF_ERROR, the parameters are not changed.  A return
+   value of Z_BUF_ERROR is not fatal, in which case deflateParams() can be
+   retried with more output space.
+*/
+
+Z_EXTERN int Z_EXPORT deflateTune(z_stream *strm, int good_length, int max_lazy, int nice_length, int max_chain);
+/*
+     Fine tune deflate's internal compression parameters.  This should only be
+   used by someone who understands the algorithm used by zlib's deflate for
+   searching for the best matching string, and even then only by the most
+   fanatic optimizer trying to squeeze out the last compressed bit for their
+   specific input data.  Read the deflate.c source code for the meaning of the
+   max_lazy, good_length, nice_length, and max_chain parameters.
+
+     deflateTune() can be called after deflateInit() or deflateInit2(), and
+   returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream.
+ */
+
+Z_EXTERN unsigned long Z_EXPORT deflateBound(z_stream *strm, unsigned long sourceLen);
+/*
+     deflateBound() returns an upper bound on the compressed size after
+   deflation of sourceLen bytes.  It must be called after deflateInit() or
+   deflateInit2(), and after deflateSetHeader(), if used.  This would be used
+   to allocate an output buffer for deflation in a single pass, and so would be
+   called before deflate().  If that first deflate() call is provided the
+   sourceLen input bytes, an output buffer allocated to the size returned by
+   deflateBound(), and the flush value Z_FINISH, then deflate() is guaranteed
+   to return Z_STREAM_END.  Note that it is possible for the compressed size to
+   be larger than the value returned by deflateBound() if flush options other
+   than Z_FINISH or Z_NO_FLUSH are used.
+*/
+
+Z_EXTERN int Z_EXPORT deflatePending(z_stream *strm, uint32_t *pending, int *bits);
+/*
+     deflatePending() returns the number of bytes and bits of output that have
+   been generated, but not yet provided in the available output.  The bytes not
+   provided would be due to the available output space having being consumed.
+   The number of bits of output not provided are between 0 and 7, where they
+   await more bits to join them in order to fill out a full byte.  If pending
+   or bits are NULL, then those values are not set.
+
+     deflatePending returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+ */
+
+Z_EXTERN int Z_EXPORT deflatePrime(z_stream *strm, int bits, int value);
+/*
+     deflatePrime() inserts bits in the deflate output stream.  The intent
+   is that this function is used to start off the deflate output with the bits
+   leftover from a previous deflate stream when appending to it.  As such, this
+   function can only be used for raw deflate, and must be used before the first
+   deflate() call after a deflateInit2() or deflateReset().  bits must be less
+   than or equal to 16, and that many of the least significant bits of value
+   will be inserted in the output.
+
+     deflatePrime returns Z_OK if success, Z_BUF_ERROR if there was not enough
+   room in the internal buffer to insert the bits, or Z_STREAM_ERROR if the
+   source stream state was inconsistent.
+*/
+
+Z_EXTERN int Z_EXPORT deflateSetHeader(z_stream *strm, gz_headerp head);
+/*
+     deflateSetHeader() provides gzip header information for when a gzip
+   stream is requested by deflateInit2().  deflateSetHeader() may be called
+   after deflateInit2() or deflateReset() and before the first call of
+   deflate().  The text, time, os, extra field, name, and comment information
+   in the provided gz_header structure are written to the gzip header (xflag is
+   ignored -- the extra flags are set according to the compression level).  The
+   caller must assure that, if not NULL, name and comment are terminated with
+   a zero byte, and that if extra is not NULL, that extra_len bytes are
+   available there.  If hcrc is true, a gzip header crc is included.  Note that
+   the current versions of the command-line version of gzip (up through version
+   1.3.x) do not support header crc's, and will report that it is a "multi-part
+   gzip file" and give up.
+
+     If deflateSetHeader is not used, the default gzip header has text false,
+   the time set to zero, and os set to the current operating system, with no
+   extra, name, or comment fields.  The gzip header is returned to the default
+   state by deflateReset().
+
+     deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+/*
+Z_EXTERN int Z_EXPORT inflateInit2(z_stream *strm, int  windowBits);
+
+     This is another version of inflateInit with an extra parameter.  The
+   fields next_in, avail_in, zalloc, zfree and opaque must be initialized
+   before by the caller.
+
+     The windowBits parameter is the base two logarithm of the maximum window
+   size (the size of the history buffer).  It should be in the range 8..15 for
+   this version of the library.  The default value is 15 if inflateInit is used
+   instead.  windowBits must be greater than or equal to the windowBits value
+   provided to deflateInit2() while compressing, or it must be equal to 15 if
+   deflateInit2() was not used.  If a compressed stream with a larger window
+   size is given as input, inflate() will return with the error code
+   Z_DATA_ERROR instead of trying to allocate a larger window.
+
+     windowBits can also be zero to request that inflate use the window size in
+   the zlib header of the compressed stream.
+
+     windowBits can also be -8..-15 for raw inflate.  In this case, -windowBits
+   determines the window size.  inflate() will then process raw deflate data,
+   not looking for a zlib or gzip header, not generating a check value, and not
+   looking for any check values for comparison at the end of the stream.  This
+   is for use with other formats that use the deflate compressed data format
+   such as zip.  Those formats provide their own check values.  If a custom
+   format is developed using the raw deflate format for compressed data, it is
+   recommended that a check value such as an Adler-32 or a CRC-32 be applied to
+   the uncompressed data as is done in the zlib, gzip, and zip formats.  For
+   most applications, the zlib format should be used as is.  Note that comments
+   above on the use in deflateInit2() applies to the magnitude of windowBits.
+
+     windowBits can also be greater than 15 for optional gzip decoding.  Add
+   32 to windowBits to enable zlib and gzip decoding with automatic header
+   detection, or add 16 to decode only the gzip format (the zlib format will
+   return a Z_DATA_ERROR).  If a gzip stream is being decoded, strm->adler is a
+   CRC-32 instead of an Adler-32.  Unlike the gunzip utility and gzread() (see
+   below), inflate() will *not* automatically decode concatenated gzip members.
+   inflate() will return Z_STREAM_END at the end of the gzip member.  The state
+   would need to be reset to continue decoding a subsequent gzip member.  This
+   *must* be done if there is more data after a gzip member, in order for the
+   decompression to be compliant with the gzip standard (RFC 1952).
+
+     inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+   version assumed by the caller, or Z_STREAM_ERROR if the parameters are
+   invalid, such as a null pointer to the structure.  msg is set to null if
+   there is no error message.  inflateInit2 does not perform any decompression
+   apart from possibly reading the zlib header if present: actual decompression
+   will be done by inflate().  (So next_in and avail_in may be modified, but
+   next_out and avail_out are unused and unchanged.) The current implementation
+   of inflateInit2() does not process any header information -- that is
+   deferred until inflate() is called.
+*/
+
+Z_EXTERN int Z_EXPORT inflateSetDictionary(z_stream *strm, const unsigned char *dictionary, unsigned int dictLength);
+/*
+     Initializes the decompression dictionary from the given uncompressed byte
+   sequence.  This function must be called immediately after a call of inflate,
+   if that call returned Z_NEED_DICT.  The dictionary chosen by the compressor
+   can be determined from the Adler-32 value returned by that call of inflate.
+   The compressor and decompressor must use exactly the same dictionary (see
+   deflateSetDictionary).  For raw inflate, this function can be called at any
+   time to set the dictionary.  If the provided dictionary is smaller than the
+   window and there is already data in the window, then the provided dictionary
+   will amend what's there.  The application must insure that the dictionary
+   that was used for compression is provided.
+
+     inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
+   parameter is invalid (e.g.  dictionary being NULL) or the stream state is
+   inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
+   expected one (incorrect Adler-32 value).  inflateSetDictionary does not
+   perform any decompression: this will be done by subsequent calls of
+   inflate().
+*/
+
+Z_EXTERN int Z_EXPORT inflateGetDictionary(z_stream *strm, unsigned char *dictionary, unsigned int *dictLength);
+/*
+     Returns the sliding dictionary being maintained by inflate.  dictLength is
+   set to the number of bytes in the dictionary, and that many bytes are copied
+   to dictionary.  dictionary must have enough space, where 32768 bytes is
+   always enough.  If inflateGetDictionary() is called with dictionary equal to
+   NULL, then only the dictionary length is returned, and nothing is copied.
+   Similarly, if dictLength is NULL, then it is not set.
+
+     inflateGetDictionary returns Z_OK on success, or Z_STREAM_ERROR if the
+   stream state is inconsistent.
+*/
+
+Z_EXTERN int Z_EXPORT inflateSync(z_stream *strm);
+/*
+     Skips invalid compressed data until a possible full flush point (see above
+   for the description of deflate with Z_FULL_FLUSH) can be found, or until all
+   available input is skipped.  No output is provided.
+
+     inflateSync searches for a 00 00 FF FF pattern in the compressed data.
+   All full flush points have this pattern, but not all occurrences of this
+   pattern are full flush points.
+
+     inflateSync returns Z_OK if a possible full flush point has been found,
+   Z_BUF_ERROR if no more input was provided, Z_DATA_ERROR if no flush point
+   has been found, or Z_STREAM_ERROR if the stream structure was inconsistent.
+   In the success case, the application may save the current value of
+   total_in which indicates where valid compressed data was found.  In the
+   error case, the application may repeatedly call inflateSync, providing more
+   input each time, until success or end of the input data.
+*/
+
+Z_EXTERN int Z_EXPORT inflateCopy(z_stream *dest, z_stream *source);
+/*
+     Sets the destination stream as a complete copy of the source stream.
+
+     This function can be useful when randomly accessing a large stream.  The
+   first pass through the stream can periodically record the inflate state,
+   allowing restarting inflate at those points when randomly accessing the
+   stream.
+
+     inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+   (such as zalloc being NULL).  msg is left unchanged in both source and
+   destination.
+*/
+
+Z_EXTERN int Z_EXPORT inflateReset(z_stream *strm);
+/*
+     This function is equivalent to inflateEnd followed by inflateInit,
+   but does not free and reallocate the internal decompression state.  The
+   stream will keep attributes that may have been set by inflateInit2.
+   total_in, total_out, adler, and msg are initialized.
+
+     inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being NULL).
+*/
+
+Z_EXTERN int Z_EXPORT inflateReset2(z_stream *strm, int windowBits);
+/*
+     This function is the same as inflateReset, but it also permits changing
+   the wrap and window size requests.  The windowBits parameter is interpreted
+   the same as it is for inflateInit2.  If the window size is changed, then the
+   memory allocated for the window is freed, and the window will be reallocated
+   by inflate() if needed.
+
+     inflateReset2 returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being NULL), or if
+   the windowBits parameter is invalid.
+*/
+
+Z_EXTERN int Z_EXPORT inflatePrime(z_stream *strm, int bits, int value);
+/*
+     This function inserts bits in the inflate input stream.  The intent is
+   that this function is used to start inflating at a bit position in the
+   middle of a byte.  The provided bits will be used before any bytes are used
+   from next_in.  This function should only be used with raw inflate, and
+   should be used before the first inflate() call after inflateInit2() or
+   inflateReset().  bits must be less than or equal to 16, and that many of the
+   least significant bits of value will be inserted in the input.
+
+     If bits is negative, then the input stream bit buffer is emptied.  Then
+   inflatePrime() can be called again to put bits in the buffer.  This is used
+   to clear out bits leftover after feeding inflate a block description prior
+   to feeding inflate codes.
+
+     inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+Z_EXTERN long Z_EXPORT inflateMark(z_stream *strm);
+/*
+     This function returns two values, one in the lower 16 bits of the return
+   value, and the other in the remaining upper bits, obtained by shifting the
+   return value down 16 bits.  If the upper value is -1 and the lower value is
+   zero, then inflate() is currently decoding information outside of a block.
+   If the upper value is -1 and the lower value is non-zero, then inflate is in
+   the middle of a stored block, with the lower value equaling the number of
+   bytes from the input remaining to copy.  If the upper value is not -1, then
+   it is the number of bits back from the current bit position in the input of
+   the code (literal or length/distance pair) currently being processed.  In
+   that case the lower value is the number of bytes already emitted for that
+   code.
+
+     A code is being processed if inflate is waiting for more input to complete
+   decoding of the code, or if it has completed decoding but is waiting for
+   more output space to write the literal or match data.
+
+     inflateMark() is used to mark locations in the input data for random
+   access, which may be at bit positions, and to note those cases where the
+   output of a code may span boundaries of random access blocks.  The current
+   location in the input stream can be determined from avail_in and data_type
+   as noted in the description for the Z_BLOCK flush parameter for inflate.
+
+     inflateMark returns the value noted above, or -65536 if the provided
+   source stream state was inconsistent.
+*/
+
+Z_EXTERN int Z_EXPORT inflateGetHeader(z_stream *strm, gz_headerp head);
+/*
+     inflateGetHeader() requests that gzip header information be stored in the
+   provided gz_header structure.  inflateGetHeader() may be called after
+   inflateInit2() or inflateReset(), and before the first call of inflate().
+   As inflate() processes the gzip stream, head->done is zero until the header
+   is completed, at which time head->done is set to one.  If a zlib stream is
+   being decoded, then head->done is set to -1 to indicate that there will be
+   no gzip header information forthcoming.  Note that Z_BLOCK or Z_TREES can be
+   used to force inflate() to return immediately after header processing is
+   complete and before any actual data is decompressed.
+
+     The text, time, xflags, and os fields are filled in with the gzip header
+   contents.  hcrc is set to true if there is a header CRC.  (The header CRC
+   was valid if done is set to one.) If extra is not NULL, then extra_max
+   contains the maximum number of bytes to write to extra.  Once done is true,
+   extra_len contains the actual extra field length, and extra contains the
+   extra field, or that field truncated if extra_max is less than extra_len.
+   If name is not NULL, then up to name_max characters are written there,
+   terminated with a zero unless the length is greater than name_max.  If
+   comment is not NULL, then up to comm_max characters are written there,
+   terminated with a zero unless the length is greater than comm_max.  When any
+   of extra, name, or comment are not NULL and the respective field is not
+   present in the header, then that field is set to NULL to signal its
+   absence.  This allows the use of deflateSetHeader() with the returned
+   structure to duplicate the header.  However if those fields are set to
+   allocated memory, then the application will need to save those pointers
+   elsewhere so that they can be eventually freed.
+
+     If inflateGetHeader is not used, then the header information is simply
+   discarded.  The header is always checked for validity, including the header
+   CRC if present.  inflateReset() will reset the process to discard the header
+   information.  The application would need to call inflateGetHeader() again to
+   retrieve the header from the next gzip stream.
+
+     inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+/*
+Z_EXTERN int Z_EXPORT inflateBackInit (z_stream *strm, int windowBits, unsigned char *window);
+
+     Initialize the internal stream state for decompression using inflateBack()
+   calls.  The fields zalloc, zfree and opaque in strm must be initialized
+   before the call.  If zalloc and zfree are NULL, then the default library-
+   derived memory allocation routines are used.  windowBits is the base two
+   logarithm of the window size, in the range 8..15.  window is a caller
+   supplied buffer of that size.  Except for special applications where it is
+   assured that deflate was used with small window sizes, windowBits must be 15
+   and a 32K byte window must be supplied to be able to decompress general
+   deflate streams.
+
+     See inflateBack() for the usage of these routines.
+
+     inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of
+   the parameters are invalid, Z_MEM_ERROR if the internal state could not be
+   allocated, or Z_VERSION_ERROR if the version of the library does not match
+   the version of the header file.
+*/
+
+typedef uint32_t (*in_func) (void *, z_const unsigned char * *);
+typedef int (*out_func) (void *, unsigned char *, uint32_t);
+
+Z_EXTERN int Z_EXPORT inflateBack(z_stream *strm, in_func in, void *in_desc, out_func out, void *out_desc);
+/*
+     inflateBack() does a raw inflate with a single call using a call-back
+   interface for input and output.  This is potentially more efficient than
+   inflate() for file i/o applications, in that it avoids copying between the
+   output and the sliding window by simply making the window itself the output
+   buffer.  inflate() can be faster on modern CPUs when used with large
+   buffers.  inflateBack() trusts the application to not change the output
+   buffer passed by the output function, at least until inflateBack() returns.
+
+     inflateBackInit() must be called first to allocate the internal state
+   and to initialize the state with the user-provided window buffer.
+   inflateBack() may then be used multiple times to inflate a complete, raw
+   deflate stream with each call.  inflateBackEnd() is then called to free the
+   allocated state.
+
+     A raw deflate stream is one with no zlib or gzip header or trailer.
+   This routine would normally be used in a utility that reads zip or gzip
+   files and writes out uncompressed files.  The utility would decode the
+   header and process the trailer on its own, hence this routine expects only
+   the raw deflate stream to decompress.  This is different from the default
+   behavior of inflate(), which expects a zlib header and trailer around the
+   deflate stream.
+
+     inflateBack() uses two subroutines supplied by the caller that are then
+   called by inflateBack() for input and output.  inflateBack() calls those
+   routines until it reads a complete deflate stream and writes out all of the
+   uncompressed data, or until it encounters an error.  The function's
+   parameters and return types are defined above in the in_func and out_func
+   typedefs.  inflateBack() will call in(in_desc, &buf) which should return the
+   number of bytes of provided input, and a pointer to that input in buf.  If
+   there is no input available, in() must return zero -- buf is ignored in that
+   case -- and inflateBack() will return a buffer error.  inflateBack() will
+   call out(out_desc, buf, len) to write the uncompressed data buf[0..len-1].
+   out() should return zero on success, or non-zero on failure.  If out()
+   returns non-zero, inflateBack() will return with an error.  Neither in() nor
+   out() are permitted to change the contents of the window provided to
+   inflateBackInit(), which is also the buffer that out() uses to write from.
+   The length written by out() will be at most the window size.  Any non-zero
+   amount of input may be provided by in().
+
+     For convenience, inflateBack() can be provided input on the first call by
+   setting strm->next_in and strm->avail_in.  If that input is exhausted, then
+   in() will be called.  Therefore strm->next_in must be initialized before
+   calling inflateBack().  If strm->next_in is NULL, then in() will be called
+   immediately for input.  If strm->next_in is not NULL, then strm->avail_in
+   must also be initialized, and then if strm->avail_in is not zero, input will
+   initially be taken from strm->next_in[0 ..  strm->avail_in - 1].
+
+     The in_desc and out_desc parameters of inflateBack() is passed as the
+   first parameter of in() and out() respectively when they are called.  These
+   descriptors can be optionally used to pass any information that the caller-
+   supplied in() and out() functions need to do their job.
+
+     On return, inflateBack() will set strm->next_in and strm->avail_in to
+   pass back any unused input that was provided by the last in() call.  The
+   return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR
+   if in() or out() returned an error, Z_DATA_ERROR if there was a format error
+   in the deflate stream (in which case strm->msg is set to indicate the nature
+   of the error), or Z_STREAM_ERROR if the stream was not properly initialized.
+   In the case of Z_BUF_ERROR, an input or output error can be distinguished
+   using strm->next_in which will be NULL only if in() returned an error.  If
+   strm->next_in is not NULL, then the Z_BUF_ERROR was due to out() returning
+   non-zero.  (in() will always be called before out(), so strm->next_in is
+   assured to be defined if out() returns non-zero.)  Note that inflateBack()
+   cannot return Z_OK.
+*/
+
+Z_EXTERN int Z_EXPORT inflateBackEnd(z_stream *strm);
+/*
+     All memory allocated by inflateBackInit() is freed.
+
+     inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream
+   state was inconsistent.
+*/
+
+Z_EXTERN unsigned long Z_EXPORT zlibCompileFlags(void);
+/* Return flags indicating compile-time options.
+
+    Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other:
+     1.0: size of unsigned int
+     3.2: size of unsigned long
+     5.4: size of void * (pointer)
+     7.6: size of z_off_t
+
+    Compiler, assembler, and debug options:
+     8: ZLIB_DEBUG
+     9: ASMV or ASMINF -- use ASM code
+     10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention
+     11: 0 (reserved)
+
+    One-time table building (smaller code, but not thread-safe if true):
+     12: BUILDFIXED -- build static block decoding tables when needed (not supported by zlib-ng)
+     13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed
+     14,15: 0 (reserved)
+
+    Library content (indicates missing functionality):
+     16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking
+                          deflate code when not needed)
+     17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect
+                    and decode gzip streams (to avoid linking crc code)
+     18-19: 0 (reserved)
+
+    Operation variations (changes in library functionality):
+     20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate
+     21: FASTEST -- deflate algorithm with only one, lowest compression level
+     22,23: 0 (reserved)
+
+    The sprintf variant used by gzprintf (zero is best):
+     24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format
+     25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure!
+     26: 0 = returns value, 1 = void -- 1 means inferred string length returned
+
+    Remainder:
+     27-31: 0 (reserved)
+ */
+
+
+#ifndef Z_SOLO
+
+                        /* utility functions */
+
+/*
+     The following utility functions are implemented on top of the basic
+   stream-oriented functions.  To simplify the interface, some default options
+   are assumed (compression level and memory usage, standard memory allocation
+   functions).  The source code of these utility functions can be modified if
+   you need special options.
+*/
+
+Z_EXTERN int Z_EXPORT compress(unsigned char *dest, unsigned long *destLen, const unsigned char *source, unsigned long sourceLen);
+/*
+     Compresses the source buffer into the destination buffer.  sourceLen is
+   the byte length of the source buffer.  Upon entry, destLen is the total size
+   of the destination buffer, which must be at least the value returned by
+   compressBound(sourceLen).  Upon exit, destLen is the actual size of the
+   compressed data.  compress() is equivalent to compress2() with a level
+   parameter of Z_DEFAULT_COMPRESSION.
+
+     compress returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_BUF_ERROR if there was not enough room in the output
+   buffer.
+*/
+
+Z_EXTERN int Z_EXPORT compress2(unsigned char *dest, unsigned long *destLen, const unsigned char *source,
+                              unsigned long sourceLen, int level);
+/*
+     Compresses the source buffer into the destination buffer.  The level
+   parameter has the same meaning as in deflateInit.  sourceLen is the byte
+   length of the source buffer.  Upon entry, destLen is the total size of the
+   destination buffer, which must be at least the value returned by
+   compressBound(sourceLen).  Upon exit, destLen is the actual size of the
+   compressed data.
+
+     compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+   Z_STREAM_ERROR if the level parameter is invalid.
+*/
+
+Z_EXTERN unsigned long Z_EXPORT compressBound(unsigned long sourceLen);
+/*
+     compressBound() returns an upper bound on the compressed size after
+   compress() or compress2() on sourceLen bytes.  It would be used before a
+   compress() or compress2() call to allocate the destination buffer.
+*/
+
+Z_EXTERN int Z_EXPORT uncompress(unsigned char *dest, unsigned long *destLen, const unsigned char *source, unsigned long sourceLen);
+/*
+     Decompresses the source buffer into the destination buffer.  sourceLen is
+   the byte length of the source buffer.  Upon entry, destLen is the total size
+   of the destination buffer, which must be large enough to hold the entire
+   uncompressed data.  (The size of the uncompressed data must have been saved
+   previously by the compressor and transmitted to the decompressor by some
+   mechanism outside the scope of this compression library.) Upon exit, destLen
+   is the actual size of the uncompressed data.
+
+     uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_BUF_ERROR if there was not enough room in the output
+   buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete.  In
+   the case where there is not enough room, uncompress() will fill the output
+   buffer with the uncompressed data up to that point.
+*/
+
+
+Z_EXTERN int Z_EXPORT uncompress2 (unsigned char *dest,         unsigned long *destLen,
+                                 const unsigned char *source, unsigned long *sourceLen);
+/*
+     Same as uncompress, except that sourceLen is a pointer, where the
+   length of the source is *sourceLen.  On return, *sourceLen is the number of
+   source bytes consumed.
+*/
+
+
+                        /* gzip file access functions */
+
+/*
+     This library supports reading and writing files in gzip (.gz) format with
+   an interface similar to that of stdio, using the functions that start with
+   "gz".  The gzip format is different from the zlib format.  gzip is a gzip
+   wrapper, documented in RFC 1952, wrapped around a deflate stream.
+*/
+
+typedef struct gzFile_s *gzFile;    /* semi-opaque gzip file descriptor */
+
+/*
+Z_EXTERN gzFile Z_EXPORT gzopen(const char *path, const char *mode);
+
+     Open the gzip (.gz) file at path for reading and decompressing, or
+   compressing and writing.  The mode parameter is as in fopen ("rb" or "wb")
+   but can also include a compression level ("wb9") or a strategy: 'f' for
+   filtered data as in "wb6f", 'h' for Huffman-only compression as in "wb1h",
+   'R' for run-length encoding as in "wb1R", or 'F' for fixed code compression
+   as in "wb9F".  (See the description of deflateInit2 for more information
+   about the strategy parameter.)  'T' will request transparent writing or
+   appending with no compression and not using the gzip format.
+
+     "a" can be used instead of "w" to request that the gzip stream that will
+   be written be appended to the file.  "+" will result in an error, since
+   reading and writing to the same gzip file is not supported.  The addition of
+   "x" when writing will create the file exclusively, which fails if the file
+   already exists.  On systems that support it, the addition of "e" when
+   reading or writing will set the flag to close the file on an execve() call.
+
+     These functions, as well as gzip, will read and decode a sequence of gzip
+   streams in a file.  The append function of gzopen() can be used to create
+   such a file.  (Also see gzflush() for another way to do this.)  When
+   appending, gzopen does not test whether the file begins with a gzip stream,
+   nor does it look for the end of the gzip streams to begin appending.  gzopen
+   will simply append a gzip stream to the existing file.
+
+     gzopen can be used to read a file which is not in gzip format; in this
+   case gzread will directly read from the file without decompression.  When
+   reading, this will be detected automatically by looking for the magic two-
+   byte gzip header.
+
+     gzopen returns NULL if the file could not be opened, if there was
+   insufficient memory to allocate the gzFile state, or if an invalid mode was
+   specified (an 'r', 'w', or 'a' was not provided, or '+' was provided).
+   errno can be checked to determine if the reason gzopen failed was that the
+   file could not be opened.
+*/
+
+Z_EXTERN gzFile Z_EXPORT gzdopen(int fd, const char *mode);
+/*
+     Associate a gzFile with the file descriptor fd.  File descriptors are
+   obtained from calls like open, dup, creat, pipe or fileno (if the file has
+   been previously opened with fopen).  The mode parameter is as in gzopen.
+
+     The next call of gzclose on the returned gzFile will also close the file
+   descriptor fd, just like fclose(fdopen(fd, mode)) closes the file descriptor
+   fd.  If you want to keep fd open, use fd = dup(fd_keep); gz = gzdopen(fd,
+   mode);.  The duplicated descriptor should be saved to avoid a leak, since
+   gzdopen does not close fd if it fails.  If you are using fileno() to get the
+   file descriptor from a FILE *, then you will have to use dup() to avoid
+   double-close()ing the file descriptor.  Both gzclose() and fclose() will
+   close the associated file descriptor, so they need to have different file
+   descriptors.
+
+     gzdopen returns NULL if there was insufficient memory to allocate the
+   gzFile state, if an invalid mode was specified (an 'r', 'w', or 'a' was not
+   provided, or '+' was provided), or if fd is -1.  The file descriptor is not
+   used until the next gz* read, write, seek, or close operation, so gzdopen
+   will not detect if fd is invalid (unless fd is -1).
+*/
+
+Z_EXTERN int Z_EXPORT gzbuffer(gzFile file, unsigned size);
+/*
+     Set the internal buffer size used by this library's functions for file to
+   size.  The default buffer size is 8192 bytes.  This function must be called
+   after gzopen() or gzdopen(), and before any other calls that read or write
+   the file.  The buffer memory allocation is always deferred to the first read
+   or write.  Three times that size in buffer space is allocated.  A larger
+   buffer size of, for example, 64K or 128K bytes will noticeably increase the
+   speed of decompression (reading).
+
+     The new buffer size also affects the maximum length for gzprintf().
+
+     gzbuffer() returns 0 on success, or -1 on failure, such as being called
+   too late.
+*/
+
+Z_EXTERN int Z_EXPORT gzsetparams(gzFile file, int level, int strategy);
+/*
+     Dynamically update the compression level and strategy for file.  See the
+   description of deflateInit2 for the meaning of these parameters. Previously
+   provided data is flushed before applying the parameter changes.
+
+     gzsetparams returns Z_OK if success, Z_STREAM_ERROR if the file was not
+   opened for writing, Z_ERRNO if there is an error writing the flushed data,
+   or Z_MEM_ERROR if there is a memory allocation error.
+*/
+
+Z_EXTERN int Z_EXPORT gzread(gzFile file, void *buf, unsigned len);
+/*
+     Read and decompress up to len uncompressed bytes from file into buf.  If
+   the input file is not in gzip format, gzread copies the given number of
+   bytes into the buffer directly from the file.
+
+     After reaching the end of a gzip stream in the input, gzread will continue
+   to read, looking for another gzip stream.  Any number of gzip streams may be
+   concatenated in the input file, and will all be decompressed by gzread().
+   If something other than a gzip stream is encountered after a gzip stream,
+   that remaining trailing garbage is ignored (and no error is returned).
+
+     gzread can be used to read a gzip file that is being concurrently written.
+   Upon reaching the end of the input, gzread will return with the available
+   data.  If the error code returned by gzerror is Z_OK or Z_BUF_ERROR, then
+   gzclearerr can be used to clear the end of file indicator in order to permit
+   gzread to be tried again.  Z_OK indicates that a gzip stream was completed
+   on the last gzread.  Z_BUF_ERROR indicates that the input file ended in the
+   middle of a gzip stream.  Note that gzread does not return -1 in the event
+   of an incomplete gzip stream.  This error is deferred until gzclose(), which
+   will return Z_BUF_ERROR if the last gzread ended in the middle of a gzip
+   stream.  Alternatively, gzerror can be used before gzclose to detect this
+   case.
+
+     gzread returns the number of uncompressed bytes actually read, less than
+   len for end of file, or -1 for error.  If len is too large to fit in an int,
+   then nothing is read, -1 is returned, and the error state is set to
+   Z_STREAM_ERROR.
+*/
+
+Z_EXTERN size_t Z_EXPORT gzfread (void *buf, size_t size, size_t nitems, gzFile file);
+/*
+     Read and decompress up to nitems items of size size from file into buf,
+   otherwise operating as gzread() does.  This duplicates the interface of
+   stdio's fread(), with size_t request and return types.  If the library
+   defines size_t, then z_size_t is identical to size_t.  If not, then z_size_t
+   is an unsigned integer type that can contain a pointer.
+
+     gzfread() returns the number of full items read of size size, or zero if
+   the end of the file was reached and a full item could not be read, or if
+   there was an error.  gzerror() must be consulted if zero is returned in
+   order to determine if there was an error.  If the multiplication of size and
+   nitems overflows, i.e. the product does not fit in a size_t, then nothing
+   is read, zero is returned, and the error state is set to Z_STREAM_ERROR.
+
+     In the event that the end of file is reached and only a partial item is
+   available at the end, i.e. the remaining uncompressed data length is not a
+   multiple of size, then the final partial item is nevertheless read into buf
+   and the end-of-file flag is set.  The length of the partial item read is not
+   provided, but could be inferred from the result of gztell().  This behavior
+   is the same as the behavior of fread() implementations in common libraries,
+   but it prevents the direct use of gzfread() to read a concurrently written
+   file, resetting and retrying on end-of-file, when size is not 1.
+*/
+
+Z_EXTERN int Z_EXPORT gzwrite(gzFile file, void const *buf, unsigned len);
+/*
+     Compress and write the len uncompressed bytes at buf to file. gzwrite
+   returns the number of uncompressed bytes written or 0 in case of error.
+*/
+
+Z_EXTERN size_t Z_EXPORT gzfwrite(void const *buf, size_t size, size_t nitems, gzFile file);
+/*
+     Compress and write nitems items of size size from buf to file, duplicating
+   the interface of stdio's fwrite(), with size_t request and return types.
+
+     gzfwrite() returns the number of full items written of size size, or zero
+   if there was an error.  If the multiplication of size and nitems overflows,
+   i.e. the product does not fit in a size_t, then nothing is written, zero
+   is returned, and the error state is set to Z_STREAM_ERROR.
+*/
+
+Z_EXTERN int Z_EXPORTVA gzprintf(gzFile file, const char *format, ...);
+/*
+     Convert, format, compress, and write the arguments (...) to file under
+   control of the string format, as in fprintf.  gzprintf returns the number of
+   uncompressed bytes actually written, or a negative zlib error code in case
+   of error.  The number of uncompressed bytes written is limited to 8191, or
+   one less than the buffer size given to gzbuffer().  The caller should assure
+   that this limit is not exceeded.  If it is exceeded, then gzprintf() will
+   return an error (0) with nothing written.  In this case, there may also be a
+   buffer overflow with unpredictable consequences, which is possible only if
+   zlib was compiled with the insecure functions sprintf() or vsprintf(),
+   because the secure snprintf() or vsnprintf() functions were not available.
+   This can be determined using zlibCompileFlags().
+*/
+
+Z_EXTERN int Z_EXPORT gzputs(gzFile file, const char *s);
+/*
+     Compress and write the given null-terminated string s to file, excluding
+   the terminating null character.
+
+     gzputs returns the number of characters written, or -1 in case of error.
+*/
+
+Z_EXTERN char * Z_EXPORT gzgets(gzFile file, char *buf, int len);
+/*
+     Read and decompress bytes from file into buf, until len-1 characters are
+   read, or until a newline character is read and transferred to buf, or an
+   end-of-file condition is encountered.  If any characters are read or if len
+   is one, the string is terminated with a null character.  If no characters
+   are read due to an end-of-file or len is less than one, then the buffer is
+   left untouched.
+
+     gzgets returns buf which is a null-terminated string, or it returns NULL
+   for end-of-file or in case of error.  If there was an error, the contents at
+   buf are indeterminate.
+*/
+
+Z_EXTERN int Z_EXPORT gzputc(gzFile file, int c);
+/*
+     Compress and write c, converted to an unsigned char, into file.  gzputc
+   returns the value that was written, or -1 in case of error.
+*/
+
+Z_EXTERN int Z_EXPORT gzgetc(gzFile file);
+/*
+     Read and decompress one byte from file.  gzgetc returns this byte or -1
+   in case of end of file or error.  This is implemented as a macro for speed.
+   As such, it does not do all of the checking the other functions do.  I.e.
+   it does not check to see if file is NULL, nor whether the structure file
+   points to has been clobbered or not.
+*/
+
+Z_EXTERN int Z_EXPORT gzungetc(int c, gzFile file);
+/*
+     Push c back onto the stream for file to be read as the first character on
+   the next read.  At least one character of push-back is always allowed.
+   gzungetc() returns the character pushed, or -1 on failure.  gzungetc() will
+   fail if c is -1, and may fail if a character has been pushed but not read
+   yet.  If gzungetc is used immediately after gzopen or gzdopen, at least the
+   output buffer size of pushed characters is allowed.  (See gzbuffer above.)
+   The pushed character will be discarded if the stream is repositioned with
+   gzseek() or gzrewind().
+*/
+
+Z_EXTERN int Z_EXPORT gzflush(gzFile file, int flush);
+/*
+     Flush all pending output to file.  The parameter flush is as in the
+   deflate() function.  The return value is the zlib error number (see function
+   gzerror below).  gzflush is only permitted when writing.
+
+     If the flush parameter is Z_FINISH, the remaining data is written and the
+   gzip stream is completed in the output.  If gzwrite() is called again, a new
+   gzip stream will be started in the output.  gzread() is able to read such
+   concatenated gzip streams.
+
+     gzflush should be called only when strictly necessary because it will
+   degrade compression if called too often.
+*/
+
+/*
+Z_EXTERN z_off_t Z_EXPORT gzseek (gzFile file, z_off_t offset, int whence);
+
+     Set the starting position to offset relative to whence for the next gzread
+   or gzwrite on file.  The offset represents a number of bytes in the
+   uncompressed data stream.  The whence parameter is defined as in lseek(2);
+   the value SEEK_END is not supported.
+
+     If the file is opened for reading, this function is emulated but can be
+   extremely slow.  If the file is opened for writing, only forward seeks are
+   supported; gzseek then compresses a sequence of zeroes up to the new
+   starting position.
+
+     gzseek returns the resulting offset location as measured in bytes from
+   the beginning of the uncompressed stream, or -1 in case of error, in
+   particular if the file is opened for writing and the new starting position
+   would be before the current position.
+*/
+
+Z_EXTERN int Z_EXPORT gzrewind(gzFile file);
+/*
+     Rewind file. This function is supported only for reading.
+
+     gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET).
+*/
+
+/*
+Z_EXTERN z_off_t Z_EXPORT gztell(gzFile file);
+
+     Return the starting position for the next gzread or gzwrite on file.
+   This position represents a number of bytes in the uncompressed data stream,
+   and is zero when starting, even if appending or reading a gzip stream from
+   the middle of a file using gzdopen().
+
+     gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR)
+*/
+
+/*
+Z_EXTERN z_off_t Z_EXPORT gzoffset(gzFile file);
+
+     Return the current compressed (actual) read or write offset of file.  This
+   offset includes the count of bytes that precede the gzip stream, for example
+   when appending or when using gzdopen() for reading.  When reading, the
+   offset does not include as yet unused buffered input.  This information can
+   be used for a progress indicator.  On error, gzoffset() returns -1.
+*/
+
+Z_EXTERN int Z_EXPORT gzeof(gzFile file);
+/*
+     Return true (1) if the end-of-file indicator for file has been set while
+   reading, false (0) otherwise.  Note that the end-of-file indicator is set
+   only if the read tried to go past the end of the input, but came up short.
+   Therefore, just like feof(), gzeof() may return false even if there is no
+   more data to read, in the event that the last read request was for the exact
+   number of bytes remaining in the input file.  This will happen if the input
+   file size is an exact multiple of the buffer size.
+
+     If gzeof() returns true, then the read functions will return no more data,
+   unless the end-of-file indicator is reset by gzclearerr() and the input file
+   has grown since the previous end of file was detected.
+*/
+
+Z_EXTERN int Z_EXPORT gzdirect(gzFile file);
+/*
+     Return true (1) if file is being copied directly while reading, or false
+   (0) if file is a gzip stream being decompressed.
+
+     If the input file is empty, gzdirect() will return true, since the input
+   does not contain a gzip stream.
+
+     If gzdirect() is used immediately after gzopen() or gzdopen() it will
+   cause buffers to be allocated to allow reading the file to determine if it
+   is a gzip file.  Therefore if gzbuffer() is used, it should be called before
+   gzdirect().
+
+     When writing, gzdirect() returns true (1) if transparent writing was
+   requested ("wT" for the gzopen() mode), or false (0) otherwise.  (Note:
+   gzdirect() is not needed when writing.  Transparent writing must be
+   explicitly requested, so the application already knows the answer.  When
+   linking statically, using gzdirect() will include all of the zlib code for
+   gzip file reading and decompression, which may not be desired.)
+*/
+
+Z_EXTERN int Z_EXPORT gzclose(gzFile file);
+/*
+     Flush all pending output for file, if necessary, close file and
+   deallocate the (de)compression state.  Note that once file is closed, you
+   cannot call gzerror with file, since its structures have been deallocated.
+   gzclose must not be called more than once on the same file, just as free
+   must not be called more than once on the same allocation.
+
+     gzclose will return Z_STREAM_ERROR if file is not valid, Z_ERRNO on a
+   file operation error, Z_MEM_ERROR if out of memory, Z_BUF_ERROR if the
+   last read ended in the middle of a gzip stream, or Z_OK on success.
+*/
+
+Z_EXTERN int Z_EXPORT gzclose_r(gzFile file);
+Z_EXTERN int Z_EXPORT gzclose_w(gzFile file);
+/*
+     Same as gzclose(), but gzclose_r() is only for use when reading, and
+   gzclose_w() is only for use when writing or appending.  The advantage to
+   using these instead of gzclose() is that they avoid linking in zlib
+   compression or decompression code that is not used when only reading or only
+   writing respectively.  If gzclose() is used, then both compression and
+   decompression code will be included the application when linking to a static
+   zlib library.
+*/
+
+Z_EXTERN const char * Z_EXPORT gzerror(gzFile file, int *errnum);
+/*
+     Return the error message for the last error which occurred on file.
+   errnum is set to zlib error number.  If an error occurred in the file system
+   and not in the compression library, errnum is set to Z_ERRNO and the
+   application may consult errno to get the exact error code.
+
+     The application must not modify the returned string.  Future calls to
+   this function may invalidate the previously returned string.  If file is
+   closed, then the string previously returned by gzerror will no longer be
+   available.
+
+     gzerror() should be used to distinguish errors from end-of-file for those
+   functions above that do not distinguish those cases in their return values.
+*/
+
+Z_EXTERN void Z_EXPORT gzclearerr(gzFile file);
+/*
+     Clear the error and end-of-file flags for file.  This is analogous to the
+   clearerr() function in stdio.  This is useful for continuing to read a gzip
+   file that is being written concurrently.
+*/
+
+#endif
+
+                        /* checksum functions */
+
+/*
+     These functions are not related to compression but are exported
+   anyway because they might be useful in applications using the compression
+   library.
+*/
+
+Z_EXTERN unsigned long Z_EXPORT adler32(unsigned long adler, const unsigned char *buf, unsigned int len);
+/*
+     Update a running Adler-32 checksum with the bytes buf[0..len-1] and
+   return the updated checksum. An Adler-32 value is in the range of a 32-bit
+   unsigned integer. If buf is Z_NULL, this function returns the required
+   initial value for the checksum.
+
+     An Adler-32 checksum is almost as reliable as a CRC-32 but can be computed
+   much faster.
+
+   Usage example:
+
+     uint32_t adler = adler32(0L, NULL, 0);
+
+     while (read_buffer(buffer, length) != EOF) {
+       adler = adler32(adler, buffer, length);
+     }
+     if (adler != original_adler) error();
+*/
+
+Z_EXTERN unsigned long Z_EXPORT adler32_z(unsigned long adler, const unsigned char *buf, size_t len);
+/*
+     Same as adler32(), but with a size_t length.
+*/
+
+/*
+Z_EXTERN unsigned long Z_EXPORT adler32_combine(unsigned long adler1, unsigned long adler2, z_off_t len2);
+
+     Combine two Adler-32 checksums into one.  For two sequences of bytes, seq1
+   and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
+   each, adler1 and adler2.  adler32_combine() returns the Adler-32 checksum of
+   seq1 and seq2 concatenated, requiring only adler1, adler2, and len2.  Note
+   that the z_off_t type (like off_t) is a signed integer.  If len2 is
+   negative, the result has no meaning or utility.
+*/
+
+Z_EXTERN unsigned long Z_EXPORT crc32(unsigned long crc, const unsigned char *buf, unsigned int len);
+/*
+     Update a running CRC-32 with the bytes buf[0..len-1] and return the
+   updated CRC-32. A CRC-32 value is in the range of a 32-bit unsigned integer.
+   If buf is Z_NULL, this function returns the required initial value for the
+   crc. Pre- and post-conditioning (one's complement) is performed within this
+   function so it shouldn't be done by the application.
+
+   Usage example:
+
+     uint32_t crc = crc32(0L, NULL, 0);
+
+     while (read_buffer(buffer, length) != EOF) {
+       crc = crc32(crc, buffer, length);
+     }
+     if (crc != original_crc) error();
+*/
+
+Z_EXTERN unsigned long Z_EXPORT crc32_z(unsigned long crc, const unsigned char *buf, size_t len);
+/*
+     Same as crc32(), but with a size_t length.
+*/
+
+/*
+Z_EXTERN unsigned long Z_EXPORT crc32_combine(unsigned long crc1, unsigned long crc2, z_off64_t len2);
+
+     Combine two CRC-32 check values into one.  For two sequences of bytes,
+   seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
+   calculated for each, crc1 and crc2.  crc32_combine() returns the CRC-32
+   check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and
+   len2.
+*/
+
+/*
+Z_EXTERN unsigned long Z_EXPORT crc32_combine_gen(z_off_t len2);
+
+     Return the operator corresponding to length len2, to be used with
+   crc32_combine_op().
+*/
+
+Z_EXTERN unsigned long Z_EXPORT crc32_combine_op(unsigned long crc1, unsigned long crc2,
+                                                 const unsigned long op);
+/*
+     Give the same result as crc32_combine(), using op in place of len2. op is
+   is generated from len2 by crc32_combine_gen(). This will be faster than
+   crc32_combine() if the generated op is used more than once.
+*/
+
+
+                        /* various hacks, don't look :) */
+
+/* deflateInit and inflateInit are macros to allow checking the zlib version
+ * and the compiler's view of z_stream:
+ */
+Z_EXTERN int Z_EXPORT deflateInit_(z_stream *strm, int level, const char *version, int stream_size);
+Z_EXTERN int Z_EXPORT inflateInit_(z_stream *strm, const char *version, int stream_size);
+Z_EXTERN int Z_EXPORT deflateInit2_(z_stream *strm, int  level, int  method, int windowBits, int memLevel,
+                                   int strategy, const char *version, int stream_size);
+Z_EXTERN int Z_EXPORT inflateInit2_(z_stream *strm, int  windowBits, const char *version, int stream_size);
+Z_EXTERN int Z_EXPORT inflateBackInit_(z_stream *strm, int windowBits, unsigned char *window,
+                                      const char *version, int stream_size);
+#define @ZLIB_SYMBOL_PREFIX@deflateInit(strm, level) deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream))
+#define @ZLIB_SYMBOL_PREFIX@inflateInit(strm) inflateInit_((strm), ZLIB_VERSION, (int)sizeof(z_stream))
+#define @ZLIB_SYMBOL_PREFIX@deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
+        deflateInit2_((strm), (level), (method), (windowBits), (memLevel), \
+                     (strategy), ZLIB_VERSION, (int)sizeof(z_stream))
+#define @ZLIB_SYMBOL_PREFIX@inflateInit2(strm, windowBits) inflateInit2_((strm), (windowBits), ZLIB_VERSION, (int)sizeof(z_stream))
+#define @ZLIB_SYMBOL_PREFIX@inflateBackInit(strm, windowBits, window) \
+                        inflateBackInit_((strm), (windowBits), (window), ZLIB_VERSION, (int)sizeof(z_stream))
+
+
+#ifndef Z_SOLO
+/* gzgetc() macro and its supporting function and exposed data structure.  Note
+ * that the real internal state is much larger than the exposed structure.
+ * This abbreviated structure exposes just enough for the gzgetc() macro.  The
+ * user should not mess with these exposed elements, since their names or
+ * behavior could change in the future, perhaps even capriciously.  They can
+ * only be used by the gzgetc() macro.  You have been warned.
+ */
+struct gzFile_s {
+    unsigned have;
+    unsigned char *next;
+    z_off64_t pos;
+};
+Z_EXTERN int Z_EXPORT gzgetc_(gzFile file);  /* backward compatibility */
+#  define @ZLIB_SYMBOL_PREFIX@gzgetc(g) ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : (@ZLIB_SYMBOL_PREFIX@gzgetc)(g))
+
+/* provide 64-bit offset functions if _LARGEFILE64_SOURCE defined, and/or
+ * change the regular functions to 64 bits if _FILE_OFFSET_BITS is 64 (if
+ * both are true, the application gets the *64 functions, and the regular
+ * functions are changed to 64 bits) -- in case these are set on systems
+ * without large file support, _LFS64_LARGEFILE must also be true
+ */
+#ifdef Z_LARGE64
+   Z_EXTERN gzFile Z_EXPORT gzopen64(const char *, const char *);
+   Z_EXTERN z_off64_t Z_EXPORT gzseek64(gzFile, z_off64_t, int);
+   Z_EXTERN z_off64_t Z_EXPORT gztell64(gzFile);
+   Z_EXTERN z_off64_t Z_EXPORT gzoffset64(gzFile);
+   Z_EXTERN unsigned long Z_EXPORT adler32_combine64(unsigned long, unsigned long, z_off64_t);
+   Z_EXTERN unsigned long Z_EXPORT crc32_combine64(unsigned long, unsigned long, z_off64_t);
+   Z_EXTERN unsigned long Z_EXPORT crc32_combine_gen64(z_off64_t);
+#endif
+#endif
+
+#if !defined(Z_SOLO) && !defined(Z_INTERNAL) && defined(Z_WANT64)
+#    define @ZLIB_SYMBOL_PREFIX@gzopen @ZLIB_SYMBOL_PREFIX@gzopen64
+#    define @ZLIB_SYMBOL_PREFIX@gzseek @ZLIB_SYMBOL_PREFIX@gzseek64
+#    define @ZLIB_SYMBOL_PREFIX@gztell @ZLIB_SYMBOL_PREFIX@gztell64
+#    define @ZLIB_SYMBOL_PREFIX@gzoffset @ZLIB_SYMBOL_PREFIX@gzoffset64
+#    define @ZLIB_SYMBOL_PREFIX@adler32_combine @ZLIB_SYMBOL_PREFIX@adler32_combine64
+#    define @ZLIB_SYMBOL_PREFIX@crc32_combine @ZLIB_SYMBOL_PREFIX@crc32_combine64
+#    define @ZLIB_SYMBOL_PREFIX@crc32_combine_gen @ZLIB_SYMBOL_PREFIX@crc32_combine_gen64
+#  ifndef Z_LARGE64
+     Z_EXTERN gzFile Z_EXPORT @ZLIB_SYMBOL_PREFIX@gzopen64(const char *, const char *);
+     Z_EXTERN z_off_t Z_EXPORT @ZLIB_SYMBOL_PREFIX@gzseek64(gzFile, z_off_t, int);
+     Z_EXTERN z_off_t Z_EXPORT @ZLIB_SYMBOL_PREFIX@gztell64(gzFile);
+     Z_EXTERN z_off_t Z_EXPORT @ZLIB_SYMBOL_PREFIX@gzoffset64(gzFile);
+     Z_EXTERN unsigned long Z_EXPORT @ZLIB_SYMBOL_PREFIX@adler32_combine64(unsigned long, unsigned long, z_off_t);
+     Z_EXTERN unsigned long Z_EXPORT @ZLIB_SYMBOL_PREFIX@crc32_combine64(unsigned long, unsigned long, z_off_t);
+     Z_EXTERN unsigned long Z_EXPORT @ZLIB_SYMBOL_PREFIX@crc32_combine_gen64(z_off64_t);
+#  endif
+#else
+#  ifndef Z_SOLO
+   Z_EXTERN gzFile Z_EXPORT @ZLIB_SYMBOL_PREFIX@gzopen(const char *, const char *);
+   Z_EXTERN z_off_t Z_EXPORT @ZLIB_SYMBOL_PREFIX@gzseek(gzFile, z_off_t, int);
+   Z_EXTERN z_off_t Z_EXPORT @ZLIB_SYMBOL_PREFIX@gztell(gzFile);
+   Z_EXTERN z_off_t Z_EXPORT @ZLIB_SYMBOL_PREFIX@gzoffset(gzFile);
+#  endif
+   Z_EXTERN unsigned long Z_EXPORT @ZLIB_SYMBOL_PREFIX@adler32_combine(unsigned long, unsigned long, z_off_t);
+   Z_EXTERN unsigned long Z_EXPORT @ZLIB_SYMBOL_PREFIX@crc32_combine(unsigned long, unsigned long, z_off_t);
+   Z_EXTERN unsigned long Z_EXPORT @ZLIB_SYMBOL_PREFIX@crc32_combine_gen(z_off_t);
+#endif
+
+/* undocumented functions */
+Z_EXTERN const char     * Z_EXPORT zError           (int);
+Z_EXTERN int              Z_EXPORT inflateSyncPoint (z_stream *);
+Z_EXTERN const uint32_t * Z_EXPORT get_crc_table    (void);
+Z_EXTERN int              Z_EXPORT inflateUndermine (z_stream *, int);
+Z_EXTERN int              Z_EXPORT inflateValidate  (z_stream *, int);
+Z_EXTERN unsigned long    Z_EXPORT inflateCodesUsed (z_stream *);
+Z_EXTERN int              Z_EXPORT inflateResetKeep (z_stream *);
+Z_EXTERN int              Z_EXPORT deflateResetKeep (z_stream *);
+
+#ifndef Z_SOLO
+#if defined(_WIN32)
+    Z_EXTERN gzFile Z_EXPORT gzopen_w(const wchar_t *path, const char *mode);
+#endif
+Z_EXTERN int Z_EXPORTVA gzvprintf(gzFile file, const char *format, va_list va);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZLIB_H_ */
diff --git a/3rdparty/zlib-ng/zlib_name_mangling.h.empty b/3rdparty/zlib-ng/zlib_name_mangling.h.empty
new file mode 100644
index 000000000000..b24cb834a6a6
--- /dev/null
+++ b/3rdparty/zlib-ng/zlib_name_mangling.h.empty
@@ -0,0 +1,8 @@
+/* zlib_name_mangling.h has been automatically generated from
+ * zlib_name_mangling.h.empty because ZLIB_SYMBOL_PREFIX was NOT set.
+ */
+
+#ifndef ZLIB_NAME_MANGLING_H
+#define ZLIB_NAME_MANGLING_H
+
+#endif /* ZLIB_NAME_MANGLING_H */
diff --git a/3rdparty/zlib-ng/zutil.c b/3rdparty/zlib-ng/zutil.c
new file mode 100644
index 000000000000..270a28c74201
--- /dev/null
+++ b/3rdparty/zlib-ng/zutil.c
@@ -0,0 +1,159 @@
+/* zutil.c -- target dependent utility functions for the compression library
+ * Copyright (C) 1995-2017 Jean-loup Gailly
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil_p.h"
+#include "zutil.h"
+
+z_const char * const PREFIX(z_errmsg)[10] = {
+    (z_const char *)"need dictionary",     /* Z_NEED_DICT       2  */
+    (z_const char *)"stream end",          /* Z_STREAM_END      1  */
+    (z_const char *)"",                    /* Z_OK              0  */
+    (z_const char *)"file error",          /* Z_ERRNO         (-1) */
+    (z_const char *)"stream error",        /* Z_STREAM_ERROR  (-2) */
+    (z_const char *)"data error",          /* Z_DATA_ERROR    (-3) */
+    (z_const char *)"insufficient memory", /* Z_MEM_ERROR     (-4) */
+    (z_const char *)"buffer error",        /* Z_BUF_ERROR     (-5) */
+    (z_const char *)"incompatible version",/* Z_VERSION_ERROR (-6) */
+    (z_const char *)""
+};
+
+const char PREFIX3(vstring)[] =
+    " zlib-ng 2.1.6";
+
+#ifdef ZLIB_COMPAT
+const char * Z_EXPORT zlibVersion(void) {
+    return ZLIB_VERSION;
+}
+#else
+const char * Z_EXPORT zlibng_version(void) {
+    return ZLIBNG_VERSION;
+}
+#endif
+
+unsigned long Z_EXPORT PREFIX(zlibCompileFlags)(void) {
+    unsigned long flags;
+
+    flags = 0;
+    switch ((int)(sizeof(unsigned int))) {
+    case 2:     break;
+    case 4:     flags += 1;     break;
+    case 8:     flags += 2;     break;
+    default:    flags += 3;
+    }
+    switch ((int)(sizeof(unsigned long))) {
+    case 2:     break;
+    case 4:     flags += 1 << 2;        break;
+    case 8:     flags += 2 << 2;        break;
+    default:    flags += 3 << 2;
+    }
+    switch ((int)(sizeof(void *))) {
+    case 2:     break;
+    case 4:     flags += 1 << 4;        break;
+    case 8:     flags += 2 << 4;        break;
+    default:    flags += 3 << 4;
+    }
+    switch ((int)(sizeof(z_off_t))) {
+    case 2:     break;
+    case 4:     flags += 1 << 6;        break;
+    case 8:     flags += 2 << 6;        break;
+    default:    flags += 3 << 6;
+    }
+#ifdef ZLIB_DEBUG
+    flags += 1 << 8;
+#endif
+#ifdef ZLIB_WINAPI
+    flags += 1 << 10;
+#endif
+    /* Bit 13 reserved for DYNAMIC_CRC_TABLE */
+#ifdef NO_GZCOMPRESS
+    flags += 1L << 16;
+#endif
+#ifdef NO_GZIP
+    flags += 1L << 17;
+#endif
+#ifdef PKZIP_BUG_WORKAROUND
+    flags += 1L << 20;
+#endif
+    return flags;
+}
+
+#ifdef ZLIB_DEBUG
+#  include <stdlib.h>
+#  ifndef verbose
+#    define verbose 0
+#  endif
+int Z_INTERNAL z_verbose = verbose;
+
+void Z_INTERNAL z_error(const char *m) {
+    fprintf(stderr, "%s\n", m);
+    exit(1);
+}
+#endif
+
+/* exported to allow conversion of error code to string for compress() and
+ * uncompress()
+ */
+const char * Z_EXPORT PREFIX(zError)(int err) {
+    return ERR_MSG(err);
+}
+
+void Z_INTERNAL *PREFIX(zcalloc)(void *opaque, unsigned items, unsigned size) {
+    Z_UNUSED(opaque);
+    return zng_alloc((size_t)items * (size_t)size);
+}
+
+void Z_INTERNAL PREFIX(zcfree)(void *opaque, void *ptr) {
+    Z_UNUSED(opaque);
+    zng_free(ptr);
+}
+
+/* Since we support custom memory allocators, some which might not align memory as we expect,
+ * we have to ask for extra memory and return an aligned pointer. */
+void Z_INTERNAL *PREFIX3(alloc_aligned)(zng_calloc_func zalloc, void *opaque, unsigned items, unsigned size, unsigned align) {
+    uintptr_t return_ptr, original_ptr;
+    uint32_t alloc_size, align_diff;
+    void *ptr;
+
+    /* If no custom calloc function used then call zlib-ng's aligned calloc */
+    if (zalloc == PREFIX(zcalloc))
+        return PREFIX(zcalloc)(opaque, items, size);
+
+    /* Allocate enough memory for proper alignment and to store the original memory pointer */
+    alloc_size = sizeof(void *) + (items * size) + align;
+    ptr = zalloc(opaque, 1, alloc_size);
+    if (!ptr)
+        return NULL;
+
+    /* Calculate return pointer address with space enough to store original pointer */
+    align_diff = align - ((uintptr_t)ptr % align);
+    return_ptr = (uintptr_t)ptr + align_diff;
+    if (align_diff < sizeof(void *))
+        return_ptr += align;
+
+    /* Store the original pointer for free() */
+    original_ptr = return_ptr - sizeof(void *);
+    memcpy((void *)original_ptr, &ptr, sizeof(void *));
+
+    /* Return properly aligned pointer in allocation */
+    return (void *)return_ptr;
+}
+
+void Z_INTERNAL PREFIX3(free_aligned)(zng_cfree_func zfree, void *opaque, void *ptr) {
+    /* If no custom cfree function used then call zlib-ng's aligned cfree */
+    if (zfree == PREFIX(zcfree)) {
+        PREFIX(zcfree)(opaque, ptr);
+        return;
+    }
+    if (!ptr)
+        return;
+
+    /* Calculate offset to original memory allocation pointer */
+    void *original_ptr = (void *)((uintptr_t)ptr - sizeof(void *));
+    void *free_ptr = *(void **)original_ptr;
+
+    /* Free original memory allocation */
+    zfree(opaque, free_ptr);
+}
diff --git a/3rdparty/zlib-ng/zutil.h b/3rdparty/zlib-ng/zutil.h
new file mode 100644
index 000000000000..663616b44d89
--- /dev/null
+++ b/3rdparty/zlib-ng/zutil.h
@@ -0,0 +1,148 @@
+#ifndef ZUTIL_H_
+#define ZUTIL_H_
+/* zutil.h -- internal interface and configuration of the compression library
+ * Copyright (C) 1995-2022 Jean-loup Gailly, Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+#include "zbuild.h"
+#ifdef ZLIB_COMPAT
+#  include "zlib.h"
+#else
+#  include "zlib-ng.h"
+#endif
+
+typedef unsigned char uch; /* Included for compatibility with external code only */
+typedef uint16_t ush;      /* Included for compatibility with external code only */
+typedef unsigned long ulg;
+
+extern z_const char * const PREFIX(z_errmsg)[10]; /* indexed by 2-zlib_error */
+/* (size given to avoid silly warnings with Visual C++) */
+
+#define ERR_MSG(err) PREFIX(z_errmsg)[Z_NEED_DICT-(err)]
+
+#define ERR_RETURN(strm, err) return (strm->msg = ERR_MSG(err), (err))
+/* To be used only when the state is known to be valid */
+
+        /* common constants */
+
+#ifndef DEF_WBITS
+#  define DEF_WBITS MAX_WBITS
+#endif
+/* default windowBits for decompression. MAX_WBITS is for compression only */
+
+#define MAX_BITS 15
+/* all codes must not exceed MAX_BITS bits */
+#define MAX_DIST_EXTRA_BITS 13
+/* maximum number of extra distance bits */
+
+#if MAX_MEM_LEVEL >= 8
+#  define DEF_MEM_LEVEL 8
+#else
+#  define DEF_MEM_LEVEL  MAX_MEM_LEVEL
+#endif
+/* default memLevel */
+
+#define STORED_BLOCK 0
+#define STATIC_TREES 1
+#define DYN_TREES    2
+/* The three kinds of block type */
+
+#define STD_MIN_MATCH  3
+#define STD_MAX_MATCH  258
+/* The minimum and maximum match lengths mandated by the deflate standard */
+
+#define WANT_MIN_MATCH  4
+/* The minimum wanted match length, affects deflate_quick, deflate_fast, deflate_medium and deflate_slow  */
+
+#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */
+
+#define ADLER32_INITIAL_VALUE 1 /* initial adler-32 hash value */
+#define CRC32_INITIAL_VALUE   0 /* initial crc-32 hash value */
+
+#define ZLIB_WRAPLEN 6      /* zlib format overhead */
+#define GZIP_WRAPLEN 18     /* gzip format overhead */
+
+#define DEFLATE_HEADER_BITS 3
+#define DEFLATE_EOBS_BITS   15
+#define DEFLATE_PAD_BITS    6
+#define DEFLATE_BLOCK_OVERHEAD ((DEFLATE_HEADER_BITS + DEFLATE_EOBS_BITS + DEFLATE_PAD_BITS) >> 3)
+/* deflate block overhead: 3 bits for block start + 15 bits for block end + padding to nearest byte */
+
+#define DEFLATE_QUICK_LIT_MAX_BITS 9
+#define DEFLATE_QUICK_OVERHEAD(x) ((x * (DEFLATE_QUICK_LIT_MAX_BITS - 8) + 7) >> 3)
+/* deflate_quick worst-case overhead: 9 bits per literal, round up to next byte (+7) */
+
+
+        /* target dependencies */
+
+#ifdef AMIGA
+#  define OS_CODE  1
+#endif
+
+#ifdef __370__
+#  if __TARGET_LIB__ < 0x20000000
+#    define OS_CODE 4
+#  elif __TARGET_LIB__ < 0x40000000
+#    define OS_CODE 11
+#  else
+#    define OS_CODE 8
+#  endif
+#endif
+
+#if defined(ATARI) || defined(atarist)
+#  define OS_CODE  5
+#endif
+
+#ifdef OS2
+#  define OS_CODE  6
+#endif
+
+#if defined(MACOS) || defined(TARGET_OS_MAC)
+#  define OS_CODE  7
+#endif
+
+#ifdef __acorn
+#  define OS_CODE 13
+#endif
+
+#if defined(_WIN32) && !defined(__CYGWIN__)
+#  define OS_CODE  10
+#endif
+
+#ifdef __APPLE__
+#  define OS_CODE 19
+#endif
+
+        /* common defaults */
+
+#ifndef OS_CODE
+#  define OS_CODE  3  /* assume Unix */
+#endif
+
+         /* macros */
+
+#define CHECK_VER_STSIZE(_ver,_stsize) ((_ver) == NULL || (_ver)[0] != PREFIX2(VERSION)[0] || (_stsize) != (int32_t)sizeof(PREFIX3(stream)))
+
+         /* memory allocation functions */
+
+void Z_INTERNAL *PREFIX(zcalloc)(void *opaque, unsigned items, unsigned size);
+void Z_INTERNAL  PREFIX(zcfree)(void *opaque, void *ptr);
+
+typedef void *zng_calloc_func(void *opaque, unsigned items, unsigned size);
+typedef void  zng_cfree_func(void *opaque, void *ptr);
+
+void Z_INTERNAL *PREFIX3(alloc_aligned)(zng_calloc_func zalloc, void *opaque, unsigned items, unsigned size, unsigned align);
+void Z_INTERNAL  PREFIX3(free_aligned)(zng_cfree_func zfree, void *opaque, void *ptr);
+
+#define ZALLOC(strm, items, size) PREFIX3(alloc_aligned)((strm)->zalloc, (strm)->opaque, (items), (size), 64)
+#define ZFREE(strm, addr)         PREFIX3(free_aligned)((strm)->zfree, (strm)->opaque, (void *)(addr))
+
+#define TRY_FREE(s, p)            {if (p) ZFREE(s, p);}
+
+#endif /* ZUTIL_H_ */
diff --git a/3rdparty/zlib-ng/zutil_p.h b/3rdparty/zlib-ng/zutil_p.h
new file mode 100644
index 000000000000..caec91d50d36
--- /dev/null
+++ b/3rdparty/zlib-ng/zutil_p.h
@@ -0,0 +1,71 @@
+/* zutil_p.h -- Private inline functions used internally in zlib-ng
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ZUTIL_P_H
+#define ZUTIL_P_H
+
+#if defined(__APPLE__) || defined(HAVE_POSIX_MEMALIGN) || defined(HAVE_ALIGNED_ALLOC)
+#  include <stdlib.h>
+#elif defined(__FreeBSD__)
+#  include <stdlib.h>
+#  include <malloc_np.h>
+#else
+#  include <malloc.h>
+#endif
+
+/* Function to allocate 16 or 64-byte aligned memory */
+static inline void *zng_alloc(size_t size) {
+#ifdef HAVE_POSIX_MEMALIGN
+    void *ptr;
+    return posix_memalign(&ptr, 64, size) ? NULL : ptr;
+#elif defined(_WIN32)
+    return (void *)_aligned_malloc(size, 64);
+#elif defined(__APPLE__)
+    return (void *)malloc(size);     /* MacOS always aligns to 16 bytes */
+#elif defined(HAVE_ALIGNED_ALLOC)
+    return (void *)aligned_alloc(64, size);
+#else
+    return (void *)memalign(64, size);
+#endif
+}
+
+/* Function that can free aligned memory */
+static inline void zng_free(void *ptr) {
+#if defined(_WIN32)
+    _aligned_free(ptr);
+#else
+    free(ptr);
+#endif
+}
+
+/* Use memcpy instead of memcmp to avoid older compilers not converting memcmp calls to
+   unaligned comparisons when unaligned access is supported. */
+static inline int32_t zng_memcmp_2(const void *src0, const void *src1) {
+    uint16_t src0_cmp, src1_cmp;
+
+    memcpy(&src0_cmp, src0, sizeof(src0_cmp));
+    memcpy(&src1_cmp, src1, sizeof(src1_cmp));
+
+    return src0_cmp != src1_cmp;
+}
+
+static inline int32_t zng_memcmp_4(const void *src0, const void *src1) {
+    uint32_t src0_cmp, src1_cmp;
+
+    memcpy(&src0_cmp, src0, sizeof(src0_cmp));
+    memcpy(&src1_cmp, src1, sizeof(src1_cmp));
+
+    return src0_cmp != src1_cmp;
+}
+
+static inline int32_t zng_memcmp_8(const void *src0, const void *src1) {
+    uint64_t src0_cmp, src1_cmp;
+
+    memcpy(&src0_cmp, src0, sizeof(src0_cmp));
+    memcpy(&src1_cmp, src1, sizeof(src1_cmp));
+
+    return src0_cmp != src1_cmp;
+}
+
+#endif
diff --git a/3rdparty/zlib/CMakeLists.txt b/3rdparty/zlib/CMakeLists.txt
index addd3e5a140c..d88ad8f39fa4 100644
--- a/3rdparty/zlib/CMakeLists.txt
+++ b/3rdparty/zlib/CMakeLists.txt
@@ -23,9 +23,6 @@ endif()
 #
 if(NOT MSVC)
   check_include_file(unistd.h Z_HAVE_UNISTD_H)
-  if(Z_HAVE_UNISTD_H)
-    add_definitions(-DZ_HAVE_UNISTD_H)
-  endif()
 endif()
 
 if(MSVC)
@@ -41,10 +38,13 @@ if(HAVE_OFF64_T)
   add_definitions(-D_LARGEFILE64_SOURCE=1)
 endif()
 
-ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
+configure_file(	${CMAKE_CURRENT_SOURCE_DIR}/zconf.h.cmakein
+                ${CMAKE_CURRENT_BINARY_DIR}/zconf.h @ONLY)
+
+ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}")
 
 set(ZLIB_PUBLIC_HDRS
-    zconf.h
+    ${CMAKE_CURRENT_BINARY_DIR}/zconf.h
     zlib.h
 )
 set(ZLIB_PRIVATE_HDRS
diff --git a/3rdparty/zlib/ChangeLog b/3rdparty/zlib/ChangeLog
index 457526bc6a51..b801a1031ec0 100644
--- a/3rdparty/zlib/ChangeLog
+++ b/3rdparty/zlib/ChangeLog
@@ -1,6 +1,34 @@
 
                 ChangeLog file for zlib
 
+Changes in 1.3.1 (22 Jan 2024)
+- Reject overflows of zip header fields in minizip
+- Fix bug in inflateSync() for data held in bit buffer
+- Add LIT_MEM define to use more memory for a small deflate speedup
+- Fix decision on the emission of Zip64 end records in minizip
+- Add bounds checking to ERR_MSG() macro, used by zError()
+- Neutralize zip file traversal attacks in miniunz
+- Fix a bug in ZLIB_DEBUG compiles in check_match()
+- Various portability and appearance improvements
+
+Changes in 1.3 (18 Aug 2023)
+- Remove K&R function definitions and zlib2ansi
+- Fix bug in deflateBound() for level 0 and memLevel 9
+- Fix bug when gzungetc() is used immediately after gzopen()
+- Fix bug when using gzflush() with a very small buffer
+- Fix crash when gzsetparams() attempted for transparent write
+- Fix test/example.c to work with FORCE_STORED
+- Rewrite of zran in examples (see zran.c version history)
+- Fix minizip to allow it to open an empty zip file
+- Fix reading disk number start on zip64 files in minizip
+- Fix logic error in minizip argument processing
+- Add minizip testing to Makefile
+- Read multiple bytes instead of byte-by-byte in minizip unzip.c
+- Add memory sanitizer to configure (--memory)
+- Various portability improvements
+- Various documentation improvements
+- Various spelling and typo corrections
+
 Changes in 1.2.13 (13 Oct 2022)
 - Fix configure issue that discarded provided CC definition
 - Correct incorrect inputs provided to the CRC functions
@@ -1445,7 +1473,7 @@ Changes in 0.99 (27 Jan 96)
 - fix typo in Make_vms.com (f$trnlnm -> f$getsyi)
 - in fcalloc, normalize pointer if size > 65520 bytes
 - don't use special fcalloc for 32 bit Borland C++
-- use STDC instead of __GO32__ to avoid redeclaring exit, calloc, etc...
+- use STDC instead of __GO32__ to avoid redeclaring exit, calloc, etc.
 - use Z_BINARY instead of BINARY
 - document that gzclose after gzdopen will close the file
 - allow "a" as mode in gzopen
diff --git a/3rdparty/zlib/README b/3rdparty/zlib/README
index ba34d1894a9b..c5f917540b6f 100644
--- a/3rdparty/zlib/README
+++ b/3rdparty/zlib/README
@@ -1,6 +1,6 @@
 ZLIB DATA COMPRESSION LIBRARY
 
-zlib 1.2.13 is a general purpose data compression library.  All the code is
+zlib 1.3.1 is a general purpose data compression library.  All the code is
 thread safe.  The data format used by the zlib library is described by RFCs
 (Request for Comments) 1950 to 1952 in the files
 http://tools.ietf.org/html/rfc1950 (zlib format), rfc1951 (deflate format) and
@@ -29,18 +29,17 @@ PLEASE read the zlib FAQ http://zlib.net/zlib_faq.html before asking for help.
 
 Mark Nelson <markn@ieee.org> wrote an article about zlib for the Jan.  1997
 issue of Dr.  Dobb's Journal; a copy of the article is available at
-http://marknelson.us/1997/01/01/zlib-engine/ .
+https://marknelson.us/posts/1997/01/01/zlib-engine.html .
 
-The changes made in version 1.2.13 are documented in the file ChangeLog.
+The changes made in version 1.3.1 are documented in the file ChangeLog.
 
 Unsupported third party contributions are provided in directory contrib/ .
 
-zlib is available in Java using the java.util.zip package, documented at
-http://java.sun.com/developer/technicalArticles/Programming/compression/ .
+zlib is available in Java using the java.util.zip package. Follow the API
+Documentation link at: https://docs.oracle.com/search/?q=java.util.zip .
 
-A Perl interface to zlib written by Paul Marquess <pmqs@cpan.org> is available
-at CPAN (Comprehensive Perl Archive Network) sites, including
-http://search.cpan.org/~pmqs/IO-Compress-Zlib/ .
+A Perl interface to zlib and bzip2 written by Paul Marquess <pmqs@cpan.org>
+can be found at https://github.com/pmqs/IO-Compress .
 
 A Python interface to zlib written by A.M. Kuchling <amk@amk.ca> is
 available in Python 1.5 and later versions, see
@@ -64,7 +63,7 @@ Notes for some targets:
 - zlib doesn't work with gcc 2.6.3 on a DEC 3000/300LX under OSF/1 2.1 it works
   when compiled with cc.
 
-- On Digital Unix 4.0D (formely OSF/1) on AlphaServer, the cc option -std1 is
+- On Digital Unix 4.0D (formerly OSF/1) on AlphaServer, the cc option -std1 is
   necessary to get gzprintf working correctly. This is done by configure.
 
 - zlib doesn't work on HP-UX 9.05 with some versions of /bin/cc. It works with
@@ -84,7 +83,7 @@ Acknowledgments:
 
 Copyright notice:
 
- (C) 1995-2022 Jean-loup Gailly and Mark Adler
+ (C) 1995-2024 Jean-loup Gailly and Mark Adler
 
   This software is provided 'as-is', without any express or implied
   warranty.  In no event will the authors be held liable for any damages
diff --git a/3rdparty/zlib/adler32.c b/3rdparty/zlib/adler32.c
index d0be4380a39c..04b81d29bad1 100644
--- a/3rdparty/zlib/adler32.c
+++ b/3rdparty/zlib/adler32.c
@@ -7,8 +7,6 @@
 
 #include "zutil.h"
 
-local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
-
 #define BASE 65521U     /* largest prime smaller than 65536 */
 #define NMAX 5552
 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
@@ -60,11 +58,7 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
 #endif
 
 /* ========================================================================= */
-uLong ZEXPORT adler32_z(adler, buf, len)
-    uLong adler;
-    const Bytef *buf;
-    z_size_t len;
-{
+uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) {
     unsigned long sum2;
     unsigned n;
 
@@ -131,20 +125,12 @@ uLong ZEXPORT adler32_z(adler, buf, len)
 }
 
 /* ========================================================================= */
-uLong ZEXPORT adler32(adler, buf, len)
-    uLong adler;
-    const Bytef *buf;
-    uInt len;
-{
+uLong ZEXPORT adler32(uLong adler, const Bytef *buf, uInt len) {
     return adler32_z(adler, buf, len);
 }
 
 /* ========================================================================= */
-local uLong adler32_combine_(adler1, adler2, len2)
-    uLong adler1;
-    uLong adler2;
-    z_off64_t len2;
-{
+local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2) {
     unsigned long sum1;
     unsigned long sum2;
     unsigned rem;
@@ -169,18 +155,10 @@ local uLong adler32_combine_(adler1, adler2, len2)
 }
 
 /* ========================================================================= */
-uLong ZEXPORT adler32_combine(adler1, adler2, len2)
-    uLong adler1;
-    uLong adler2;
-    z_off_t len2;
-{
+uLong ZEXPORT adler32_combine(uLong adler1, uLong adler2, z_off_t len2) {
     return adler32_combine_(adler1, adler2, len2);
 }
 
-uLong ZEXPORT adler32_combine64(adler1, adler2, len2)
-    uLong adler1;
-    uLong adler2;
-    z_off64_t len2;
-{
+uLong ZEXPORT adler32_combine64(uLong adler1, uLong adler2, z_off64_t len2) {
     return adler32_combine_(adler1, adler2, len2);
 }
diff --git a/3rdparty/zlib/compress.c b/3rdparty/zlib/compress.c
index 2ad5326c14ec..f43bacf7ab97 100644
--- a/3rdparty/zlib/compress.c
+++ b/3rdparty/zlib/compress.c
@@ -19,13 +19,8 @@
    memory, Z_BUF_ERROR if there was not enough room in the output buffer,
    Z_STREAM_ERROR if the level parameter is invalid.
 */
-int ZEXPORT compress2(dest, destLen, source, sourceLen, level)
-    Bytef *dest;
-    uLongf *destLen;
-    const Bytef *source;
-    uLong sourceLen;
-    int level;
-{
+int ZEXPORT compress2(Bytef *dest, uLongf *destLen, const Bytef *source,
+                      uLong sourceLen, int level) {
     z_stream stream;
     int err;
     const uInt max = (uInt)-1;
@@ -65,12 +60,8 @@ int ZEXPORT compress2(dest, destLen, source, sourceLen, level)
 
 /* ===========================================================================
  */
-int ZEXPORT compress(dest, destLen, source, sourceLen)
-    Bytef *dest;
-    uLongf *destLen;
-    const Bytef *source;
-    uLong sourceLen;
-{
+int ZEXPORT compress(Bytef *dest, uLongf *destLen, const Bytef *source,
+                     uLong sourceLen) {
     return compress2(dest, destLen, source, sourceLen, Z_DEFAULT_COMPRESSION);
 }
 
@@ -78,9 +69,7 @@ int ZEXPORT compress(dest, destLen, source, sourceLen)
      If the default memLevel or windowBits for deflateInit() is changed, then
    this function needs to be updated.
  */
-uLong ZEXPORT compressBound(sourceLen)
-    uLong sourceLen;
-{
+uLong ZEXPORT compressBound(uLong sourceLen) {
     return sourceLen + (sourceLen >> 12) + (sourceLen >> 14) +
            (sourceLen >> 25) + 13;
 }
diff --git a/3rdparty/zlib/crc32.c b/3rdparty/zlib/crc32.c
index f8357b083f76..6c38f5c04c6a 100644
--- a/3rdparty/zlib/crc32.c
+++ b/3rdparty/zlib/crc32.c
@@ -103,19 +103,6 @@
 #  define ARMCRC32
 #endif
 
-/* Local functions. */
-local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
-local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
-
-#if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
-    local z_word_t byte_swap OF((z_word_t word));
-#endif
-
-#if defined(W) && !defined(ARMCRC32)
-    local z_crc_t crc_word OF((z_word_t data));
-    local z_word_t crc_word_big OF((z_word_t data));
-#endif
-
 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
 /*
   Swap the bytes in a z_word_t to convert between little and big endian. Any
@@ -123,9 +110,7 @@ local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
   instruction, if one is available. This assumes that word_t is either 32 bits
   or 64 bits.
  */
-local z_word_t byte_swap(word)
-    z_word_t word;
-{
+local z_word_t byte_swap(z_word_t word) {
 #  if W == 8
     return
         (word & 0xff00000000000000) >> 56 |
@@ -146,24 +131,77 @@ local z_word_t byte_swap(word)
 }
 #endif
 
+#ifdef DYNAMIC_CRC_TABLE
+/* =========================================================================
+ * Table of powers of x for combining CRC-32s, filled in by make_crc_table()
+ * below.
+ */
+   local z_crc_t FAR x2n_table[32];
+#else
+/* =========================================================================
+ * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
+ * of x for combining CRC-32s, all made by make_crc_table().
+ */
+#  include "crc32.h"
+#endif
+
 /* CRC polynomial. */
 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
 
-#ifdef DYNAMIC_CRC_TABLE
+/*
+  Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
+  reflected. For speed, this requires that a not be zero.
+ */
+local z_crc_t multmodp(z_crc_t a, z_crc_t b) {
+    z_crc_t m, p;
+
+    m = (z_crc_t)1 << 31;
+    p = 0;
+    for (;;) {
+        if (a & m) {
+            p ^= b;
+            if ((a & (m - 1)) == 0)
+                break;
+        }
+        m >>= 1;
+        b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
+    }
+    return p;
+}
 
+/*
+  Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
+  initialized.
+ */
+local z_crc_t x2nmodp(z_off64_t n, unsigned k) {
+    z_crc_t p;
+
+    p = (z_crc_t)1 << 31;           /* x^0 == 1 */
+    while (n) {
+        if (n & 1)
+            p = multmodp(x2n_table[k & 31], p);
+        n >>= 1;
+        k++;
+    }
+    return p;
+}
+
+#ifdef DYNAMIC_CRC_TABLE
+/* =========================================================================
+ * Build the tables for byte-wise and braided CRC-32 calculations, and a table
+ * of powers of x for combining CRC-32s.
+ */
 local z_crc_t FAR crc_table[256];
-local z_crc_t FAR x2n_table[32];
-local void make_crc_table OF((void));
 #ifdef W
    local z_word_t FAR crc_big_table[256];
    local z_crc_t FAR crc_braid_table[W][256];
    local z_word_t FAR crc_braid_big_table[W][256];
-   local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
+   local void braid(z_crc_t [][256], z_word_t [][256], int, int);
 #endif
 #ifdef MAKECRCH
-   local void write_table OF((FILE *, const z_crc_t FAR *, int));
-   local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
-   local void write_table64 OF((FILE *, const z_word_t FAR *, int));
+   local void write_table(FILE *, const z_crc_t FAR *, int);
+   local void write_table32hi(FILE *, const z_word_t FAR *, int);
+   local void write_table64(FILE *, const z_word_t FAR *, int);
 #endif /* MAKECRCH */
 
 /*
@@ -176,7 +214,6 @@ local void make_crc_table OF((void));
 
 /* Definition of once functionality. */
 typedef struct once_s once_t;
-local void once OF((once_t *, void (*)(void)));
 
 /* Check for the availability of atomics. */
 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
@@ -196,10 +233,7 @@ struct once_s {
   invoke once() at the same time. The state must be a once_t initialized with
   ONCE_INIT.
  */
-local void once(state, init)
-    once_t *state;
-    void (*init)(void);
-{
+local void once(once_t *state, void (*init)(void)) {
     if (!atomic_load(&state->done)) {
         if (atomic_flag_test_and_set(&state->begun))
             while (!atomic_load(&state->done))
@@ -222,10 +256,7 @@ struct once_s {
 
 /* Test and set. Alas, not atomic, but tries to minimize the period of
    vulnerability. */
-local int test_and_set OF((int volatile *));
-local int test_and_set(flag)
-    int volatile *flag;
-{
+local int test_and_set(int volatile *flag) {
     int was;
 
     was = *flag;
@@ -234,10 +265,7 @@ local int test_and_set(flag)
 }
 
 /* Run the provided init() function once. This is not thread-safe. */
-local void once(state, init)
-    once_t *state;
-    void (*init)(void);
-{
+local void once(once_t *state, void (*init)(void)) {
     if (!state->done) {
         if (test_and_set(&state->begun))
             while (!state->done)
@@ -279,8 +307,7 @@ local once_t made = ONCE_INIT;
   combinations of CRC register values and incoming bytes.
  */
 
-local void make_crc_table()
-{
+local void make_crc_table(void) {
     unsigned i, j, n;
     z_crc_t p;
 
@@ -447,11 +474,7 @@ local void make_crc_table()
    Write the 32-bit values in table[0..k-1] to out, five per line in
    hexadecimal separated by commas.
  */
-local void write_table(out, table, k)
-    FILE *out;
-    const z_crc_t FAR *table;
-    int k;
-{
+local void write_table(FILE *out, const z_crc_t FAR *table, int k) {
     int n;
 
     for (n = 0; n < k; n++)
@@ -464,11 +487,7 @@ local void write_table(out, table, k)
    Write the high 32-bits of each value in table[0..k-1] to out, five per line
    in hexadecimal separated by commas.
  */
-local void write_table32hi(out, table, k)
-FILE *out;
-const z_word_t FAR *table;
-int k;
-{
+local void write_table32hi(FILE *out, const z_word_t FAR *table, int k) {
     int n;
 
     for (n = 0; n < k; n++)
@@ -484,11 +503,7 @@ int k;
   bits. If not, then the type cast and format string can be adjusted
   accordingly.
  */
-local void write_table64(out, table, k)
-    FILE *out;
-    const z_word_t FAR *table;
-    int k;
-{
+local void write_table64(FILE *out, const z_word_t FAR *table, int k) {
     int n;
 
     for (n = 0; n < k; n++)
@@ -498,8 +513,7 @@ local void write_table64(out, table, k)
 }
 
 /* Actually do the deed. */
-int main()
-{
+int main(void) {
     make_crc_table();
     return 0;
 }
@@ -511,12 +525,7 @@ int main()
   Generate the little and big-endian braid tables for the given n and z_word_t
   size w. Each array must have room for w blocks of 256 elements.
  */
-local void braid(ltl, big, n, w)
-    z_crc_t ltl[][256];
-    z_word_t big[][256];
-    int n;
-    int w;
-{
+local void braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w) {
     int k;
     z_crc_t i, p, q;
     for (k = 0; k < w; k++) {
@@ -531,69 +540,13 @@ local void braid(ltl, big, n, w)
 }
 #endif
 
-#else /* !DYNAMIC_CRC_TABLE */
-/* ========================================================================
- * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
- * of x for combining CRC-32s, all made by make_crc_table().
- */
-#include "crc32.h"
 #endif /* DYNAMIC_CRC_TABLE */
 
-/* ========================================================================
- * Routines used for CRC calculation. Some are also required for the table
- * generation above.
- */
-
-/*
-  Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
-  reflected. For speed, this requires that a not be zero.
- */
-local z_crc_t multmodp(a, b)
-    z_crc_t a;
-    z_crc_t b;
-{
-    z_crc_t m, p;
-
-    m = (z_crc_t)1 << 31;
-    p = 0;
-    for (;;) {
-        if (a & m) {
-            p ^= b;
-            if ((a & (m - 1)) == 0)
-                break;
-        }
-        m >>= 1;
-        b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
-    }
-    return p;
-}
-
-/*
-  Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
-  initialized.
- */
-local z_crc_t x2nmodp(n, k)
-    z_off64_t n;
-    unsigned k;
-{
-    z_crc_t p;
-
-    p = (z_crc_t)1 << 31;           /* x^0 == 1 */
-    while (n) {
-        if (n & 1)
-            p = multmodp(x2n_table[k & 31], p);
-        n >>= 1;
-        k++;
-    }
-    return p;
-}
-
 /* =========================================================================
  * This function can be used by asm versions of crc32(), and to force the
  * generation of the CRC tables in a threaded application.
  */
-const z_crc_t FAR * ZEXPORT get_crc_table()
-{
+const z_crc_t FAR * ZEXPORT get_crc_table(void) {
 #ifdef DYNAMIC_CRC_TABLE
     once(&made, make_crc_table);
 #endif /* DYNAMIC_CRC_TABLE */
@@ -619,11 +572,8 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
 
-unsigned long ZEXPORT crc32_z(crc, buf, len)
-    unsigned long crc;
-    const unsigned char FAR *buf;
-    z_size_t len;
-{
+unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
+                              z_size_t len) {
     z_crc_t val;
     z_word_t crc1, crc2;
     const z_word_t *word;
@@ -723,18 +673,14 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
   least-significant byte of the word as the first byte of data, without any pre
   or post conditioning. This is used to combine the CRCs of each braid.
  */
-local z_crc_t crc_word(data)
-    z_word_t data;
-{
+local z_crc_t crc_word(z_word_t data) {
     int k;
     for (k = 0; k < W; k++)
         data = (data >> 8) ^ crc_table[data & 0xff];
     return (z_crc_t)data;
 }
 
-local z_word_t crc_word_big(data)
-    z_word_t data;
-{
+local z_word_t crc_word_big(z_word_t data) {
     int k;
     for (k = 0; k < W; k++)
         data = (data << 8) ^
@@ -745,11 +691,8 @@ local z_word_t crc_word_big(data)
 #endif
 
 /* ========================================================================= */
-unsigned long ZEXPORT crc32_z(crc, buf, len)
-    unsigned long crc;
-    const unsigned char FAR *buf;
-    z_size_t len;
-{
+unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
+                              z_size_t len) {
     /* Return initial CRC, if requested. */
     if (buf == Z_NULL) return 0;
 
@@ -781,8 +724,8 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
         words = (z_word_t const *)buf;
 
         /* Do endian check at execution time instead of compile time, since ARM
-           processors can change the endianess at execution time. If the
-           compiler knows what the endianess will be, it can optimize out the
+           processors can change the endianness at execution time. If the
+           compiler knows what the endianness will be, it can optimize out the
            check and the unused branch. */
         endian = 1;
         if (*(unsigned char *)&endian) {
@@ -1069,20 +1012,13 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
 #endif
 
 /* ========================================================================= */
-unsigned long ZEXPORT crc32(crc, buf, len)
-    unsigned long crc;
-    const unsigned char FAR *buf;
-    uInt len;
-{
+unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
+                            uInt len) {
     return crc32_z(crc, buf, len);
 }
 
 /* ========================================================================= */
-uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
-    uLong crc1;
-    uLong crc2;
-    z_off64_t len2;
-{
+uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2) {
 #ifdef DYNAMIC_CRC_TABLE
     once(&made, make_crc_table);
 #endif /* DYNAMIC_CRC_TABLE */
@@ -1090,18 +1026,12 @@ uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
 }
 
 /* ========================================================================= */
-uLong ZEXPORT crc32_combine(crc1, crc2, len2)
-    uLong crc1;
-    uLong crc2;
-    z_off_t len2;
-{
+uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2) {
     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
 }
 
 /* ========================================================================= */
-uLong ZEXPORT crc32_combine_gen64(len2)
-    z_off64_t len2;
-{
+uLong ZEXPORT crc32_combine_gen64(z_off64_t len2) {
 #ifdef DYNAMIC_CRC_TABLE
     once(&made, make_crc_table);
 #endif /* DYNAMIC_CRC_TABLE */
@@ -1109,17 +1039,11 @@ uLong ZEXPORT crc32_combine_gen64(len2)
 }
 
 /* ========================================================================= */
-uLong ZEXPORT crc32_combine_gen(len2)
-    z_off_t len2;
-{
+uLong ZEXPORT crc32_combine_gen(z_off_t len2) {
     return crc32_combine_gen64((z_off64_t)len2);
 }
 
 /* ========================================================================= */
-uLong ZEXPORT crc32_combine_op(crc1, crc2, op)
-    uLong crc1;
-    uLong crc2;
-    uLong op;
-{
+uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op) {
     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
 }
diff --git a/3rdparty/zlib/deflate.c b/3rdparty/zlib/deflate.c
index 4a689db35989..012ea8148e8d 100644
--- a/3rdparty/zlib/deflate.c
+++ b/3rdparty/zlib/deflate.c
@@ -1,5 +1,5 @@
 /* deflate.c -- compress data using the deflation algorithm
- * Copyright (C) 1995-2022 Jean-loup Gailly and Mark Adler
+ * Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
@@ -52,7 +52,7 @@
 #include "deflate.h"
 
 const char deflate_copyright[] =
-   " deflate 1.2.13 Copyright 1995-2022 Jean-loup Gailly and Mark Adler ";
+   " deflate 1.3.1 Copyright 1995-2024 Jean-loup Gailly and Mark Adler ";
 /*
   If you use the zlib library in a product, an acknowledgment is welcome
   in the documentation of your product. If for some reason you cannot
@@ -60,9 +60,6 @@ const char deflate_copyright[] =
   copyright string in the executable of your product.
  */
 
-/* ===========================================================================
- *  Function prototypes.
- */
 typedef enum {
     need_more,      /* block not completed, need more input or more output */
     block_done,     /* block flush performed */
@@ -70,29 +67,16 @@ typedef enum {
     finish_done     /* finish done, accept no more input or output */
 } block_state;
 
-typedef block_state (*compress_func) OF((deflate_state *s, int flush));
+typedef block_state (*compress_func)(deflate_state *s, int flush);
 /* Compression function. Returns the block state after the call. */
 
-local int deflateStateCheck      OF((z_streamp strm));
-local void slide_hash     OF((deflate_state *s));
-local void fill_window    OF((deflate_state *s));
-local block_state deflate_stored OF((deflate_state *s, int flush));
-local block_state deflate_fast   OF((deflate_state *s, int flush));
+local block_state deflate_stored(deflate_state *s, int flush);
+local block_state deflate_fast(deflate_state *s, int flush);
 #ifndef FASTEST
-local block_state deflate_slow   OF((deflate_state *s, int flush));
-#endif
-local block_state deflate_rle    OF((deflate_state *s, int flush));
-local block_state deflate_huff   OF((deflate_state *s, int flush));
-local void lm_init        OF((deflate_state *s));
-local void putShortMSB    OF((deflate_state *s, uInt b));
-local void flush_pending  OF((z_streamp strm));
-local unsigned read_buf   OF((z_streamp strm, Bytef *buf, unsigned size));
-local uInt longest_match  OF((deflate_state *s, IPos cur_match));
-
-#ifdef ZLIB_DEBUG
-local  void check_match OF((deflate_state *s, IPos start, IPos match,
-                            int length));
+local block_state deflate_slow(deflate_state *s, int flush);
 #endif
+local block_state deflate_rle(deflate_state *s, int flush);
+local block_state deflate_huff(deflate_state *s, int flush);
 
 /* ===========================================================================
  * Local data
@@ -195,9 +179,12 @@ local const config configuration_table[10] = {
  * bit values at the expense of memory usage). We slide even when level == 0 to
  * keep the hash table consistent if we switch back to level > 0 later.
  */
-local void slide_hash(s)
-    deflate_state *s;
-{
+#if defined(__has_feature)
+#  if __has_feature(memory_sanitizer)
+     __attribute__((no_sanitize("memory")))
+#  endif
+#endif
+local void slide_hash(deflate_state *s) {
     unsigned n, m;
     Posf *p;
     uInt wsize = s->w_size;
@@ -221,30 +208,177 @@ local void slide_hash(s)
 #endif
 }
 
+/* ===========================================================================
+ * Read a new buffer from the current input stream, update the adler32
+ * and total number of bytes read.  All deflate() input goes through
+ * this function so some applications may wish to modify it to avoid
+ * allocating a large strm->next_in buffer and copying from it.
+ * (See also flush_pending()).
+ */
+local unsigned read_buf(z_streamp strm, Bytef *buf, unsigned size) {
+    unsigned len = strm->avail_in;
+
+    if (len > size) len = size;
+    if (len == 0) return 0;
+
+    strm->avail_in  -= len;
+
+    zmemcpy(buf, strm->next_in, len);
+    if (strm->state->wrap == 1) {
+        strm->adler = adler32(strm->adler, buf, len);
+    }
+#ifdef GZIP
+    else if (strm->state->wrap == 2) {
+        strm->adler = crc32(strm->adler, buf, len);
+    }
+#endif
+    strm->next_in  += len;
+    strm->total_in += len;
+
+    return len;
+}
+
+/* ===========================================================================
+ * Fill the window when the lookahead becomes insufficient.
+ * Updates strstart and lookahead.
+ *
+ * IN assertion: lookahead < MIN_LOOKAHEAD
+ * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
+ *    At least one byte has been read, or avail_in == 0; reads are
+ *    performed for at least two bytes (required for the zip translate_eol
+ *    option -- not supported here).
+ */
+local void fill_window(deflate_state *s) {
+    unsigned n;
+    unsigned more;    /* Amount of free space at the end of the window. */
+    uInt wsize = s->w_size;
+
+    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
+
+    do {
+        more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
+
+        /* Deal with !@#$% 64K limit: */
+        if (sizeof(int) <= 2) {
+            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
+                more = wsize;
+
+            } else if (more == (unsigned)(-1)) {
+                /* Very unlikely, but possible on 16 bit machine if
+                 * strstart == 0 && lookahead == 1 (input done a byte at time)
+                 */
+                more--;
+            }
+        }
+
+        /* If the window is almost full and there is insufficient lookahead,
+         * move the upper half to the lower one to make room in the upper half.
+         */
+        if (s->strstart >= wsize + MAX_DIST(s)) {
+
+            zmemcpy(s->window, s->window + wsize, (unsigned)wsize - more);
+            s->match_start -= wsize;
+            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
+            s->block_start -= (long) wsize;
+            if (s->insert > s->strstart)
+                s->insert = s->strstart;
+            slide_hash(s);
+            more += wsize;
+        }
+        if (s->strm->avail_in == 0) break;
+
+        /* If there was no sliding:
+         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
+         *    more == window_size - lookahead - strstart
+         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
+         * => more >= window_size - 2*WSIZE + 2
+         * In the BIG_MEM or MMAP case (not yet supported),
+         *   window_size == input_size + MIN_LOOKAHEAD  &&
+         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
+         * Otherwise, window_size == 2*WSIZE so more >= 2.
+         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
+         */
+        Assert(more >= 2, "more < 2");
+
+        n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
+        s->lookahead += n;
+
+        /* Initialize the hash value now that we have some input: */
+        if (s->lookahead + s->insert >= MIN_MATCH) {
+            uInt str = s->strstart - s->insert;
+            s->ins_h = s->window[str];
+            UPDATE_HASH(s, s->ins_h, s->window[str + 1]);
+#if MIN_MATCH != 3
+            Call UPDATE_HASH() MIN_MATCH-3 more times
+#endif
+            while (s->insert) {
+                UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
+#ifndef FASTEST
+                s->prev[str & s->w_mask] = s->head[s->ins_h];
+#endif
+                s->head[s->ins_h] = (Pos)str;
+                str++;
+                s->insert--;
+                if (s->lookahead + s->insert < MIN_MATCH)
+                    break;
+            }
+        }
+        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
+         * but this is not important since only literal bytes will be emitted.
+         */
+
+    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
+
+    /* If the WIN_INIT bytes after the end of the current data have never been
+     * written, then zero those bytes in order to avoid memory check reports of
+     * the use of uninitialized (or uninitialised as Julian writes) bytes by
+     * the longest match routines.  Update the high water mark for the next
+     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
+     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
+     */
+    if (s->high_water < s->window_size) {
+        ulg curr = s->strstart + (ulg)(s->lookahead);
+        ulg init;
+
+        if (s->high_water < curr) {
+            /* Previous high water mark below current data -- zero WIN_INIT
+             * bytes or up to end of window, whichever is less.
+             */
+            init = s->window_size - curr;
+            if (init > WIN_INIT)
+                init = WIN_INIT;
+            zmemzero(s->window + curr, (unsigned)init);
+            s->high_water = curr + init;
+        }
+        else if (s->high_water < (ulg)curr + WIN_INIT) {
+            /* High water mark at or above current data, but below current data
+             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
+             * to end of window, whichever is less.
+             */
+            init = (ulg)curr + WIN_INIT - s->high_water;
+            if (init > s->window_size - s->high_water)
+                init = s->window_size - s->high_water;
+            zmemzero(s->window + s->high_water, (unsigned)init);
+            s->high_water += init;
+        }
+    }
+
+    Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+           "not enough room for search");
+}
+
 /* ========================================================================= */
-int ZEXPORT deflateInit_(strm, level, version, stream_size)
-    z_streamp strm;
-    int level;
-    const char *version;
-    int stream_size;
-{
+int ZEXPORT deflateInit_(z_streamp strm, int level, const char *version,
+                         int stream_size) {
     return deflateInit2_(strm, level, Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL,
                          Z_DEFAULT_STRATEGY, version, stream_size);
     /* To do: ignore strm->next_in if we use it as window */
 }
 
 /* ========================================================================= */
-int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
-                  version, stream_size)
-    z_streamp strm;
-    int  level;
-    int  method;
-    int  windowBits;
-    int  memLevel;
-    int  strategy;
-    const char *version;
-    int stream_size;
-{
+int ZEXPORT deflateInit2_(z_streamp strm, int level, int method,
+                          int windowBits, int memLevel, int strategy,
+                          const char *version, int stream_size) {
     deflate_state *s;
     int wrap = 1;
     static const char my_version[] = ZLIB_VERSION;
@@ -359,7 +493,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
      * symbols from which it is being constructed.
      */
 
-    s->pending_buf = (uchf *) ZALLOC(strm, s->lit_bufsize, 4);
+    s->pending_buf = (uchf *) ZALLOC(strm, s->lit_bufsize, LIT_BUFS);
     s->pending_buf_size = (ulg)s->lit_bufsize * 4;
 
     if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL ||
@@ -369,8 +503,14 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
         deflateEnd (strm);
         return Z_MEM_ERROR;
     }
+#ifdef LIT_MEM
+    s->d_buf = (ushf *)(s->pending_buf + (s->lit_bufsize << 1));
+    s->l_buf = s->pending_buf + (s->lit_bufsize << 2);
+    s->sym_end = s->lit_bufsize - 1;
+#else
     s->sym_buf = s->pending_buf + s->lit_bufsize;
     s->sym_end = (s->lit_bufsize - 1) * 3;
+#endif
     /* We avoid equality with lit_bufsize*3 because of wraparound at 64K
      * on 16 bit machines and because stored blocks are restricted to
      * 64K-1 bytes.
@@ -386,9 +526,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
 /* =========================================================================
  * Check for a valid deflate stream state. Return 0 if ok, 1 if not.
  */
-local int deflateStateCheck(strm)
-    z_streamp strm;
-{
+local int deflateStateCheck(z_streamp strm) {
     deflate_state *s;
     if (strm == Z_NULL ||
         strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0)
@@ -409,11 +547,8 @@ local int deflateStateCheck(strm)
 }
 
 /* ========================================================================= */
-int ZEXPORT deflateSetDictionary(strm, dictionary, dictLength)
-    z_streamp strm;
-    const Bytef *dictionary;
-    uInt  dictLength;
-{
+int ZEXPORT deflateSetDictionary(z_streamp strm, const Bytef *dictionary,
+                                 uInt  dictLength) {
     deflate_state *s;
     uInt str, n;
     int wrap;
@@ -478,11 +613,8 @@ int ZEXPORT deflateSetDictionary(strm, dictionary, dictLength)
 }
 
 /* ========================================================================= */
-int ZEXPORT deflateGetDictionary(strm, dictionary, dictLength)
-    z_streamp strm;
-    Bytef *dictionary;
-    uInt  *dictLength;
-{
+int ZEXPORT deflateGetDictionary(z_streamp strm, Bytef *dictionary,
+                                 uInt *dictLength) {
     deflate_state *s;
     uInt len;
 
@@ -500,9 +632,7 @@ int ZEXPORT deflateGetDictionary(strm, dictionary, dictLength)
 }
 
 /* ========================================================================= */
-int ZEXPORT deflateResetKeep(strm)
-    z_streamp strm;
-{
+int ZEXPORT deflateResetKeep(z_streamp strm) {
     deflate_state *s;
 
     if (deflateStateCheck(strm)) {
@@ -537,10 +667,32 @@ int ZEXPORT deflateResetKeep(strm)
     return Z_OK;
 }
 
+/* ===========================================================================
+ * Initialize the "longest match" routines for a new zlib stream
+ */
+local void lm_init(deflate_state *s) {
+    s->window_size = (ulg)2L*s->w_size;
+
+    CLEAR_HASH(s);
+
+    /* Set the default configuration parameters:
+     */
+    s->max_lazy_match   = configuration_table[s->level].max_lazy;
+    s->good_match       = configuration_table[s->level].good_length;
+    s->nice_match       = configuration_table[s->level].nice_length;
+    s->max_chain_length = configuration_table[s->level].max_chain;
+
+    s->strstart = 0;
+    s->block_start = 0L;
+    s->lookahead = 0;
+    s->insert = 0;
+    s->match_length = s->prev_length = MIN_MATCH-1;
+    s->match_available = 0;
+    s->ins_h = 0;
+}
+
 /* ========================================================================= */
-int ZEXPORT deflateReset(strm)
-    z_streamp strm;
-{
+int ZEXPORT deflateReset(z_streamp strm) {
     int ret;
 
     ret = deflateResetKeep(strm);
@@ -550,10 +702,7 @@ int ZEXPORT deflateReset(strm)
 }
 
 /* ========================================================================= */
-int ZEXPORT deflateSetHeader(strm, head)
-    z_streamp strm;
-    gz_headerp head;
-{
+int ZEXPORT deflateSetHeader(z_streamp strm, gz_headerp head) {
     if (deflateStateCheck(strm) || strm->state->wrap != 2)
         return Z_STREAM_ERROR;
     strm->state->gzhead = head;
@@ -561,11 +710,7 @@ int ZEXPORT deflateSetHeader(strm, head)
 }
 
 /* ========================================================================= */
-int ZEXPORT deflatePending(strm, pending, bits)
-    unsigned *pending;
-    int *bits;
-    z_streamp strm;
-{
+int ZEXPORT deflatePending(z_streamp strm, unsigned *pending, int *bits) {
     if (deflateStateCheck(strm)) return Z_STREAM_ERROR;
     if (pending != Z_NULL)
         *pending = strm->state->pending;
@@ -575,19 +720,21 @@ int ZEXPORT deflatePending(strm, pending, bits)
 }
 
 /* ========================================================================= */
-int ZEXPORT deflatePrime(strm, bits, value)
-    z_streamp strm;
-    int bits;
-    int value;
-{
+int ZEXPORT deflatePrime(z_streamp strm, int bits, int value) {
     deflate_state *s;
     int put;
 
     if (deflateStateCheck(strm)) return Z_STREAM_ERROR;
     s = strm->state;
+#ifdef LIT_MEM
+    if (bits < 0 || bits > 16 ||
+        (uchf *)s->d_buf < s->pending_out + ((Buf_size + 7) >> 3))
+        return Z_BUF_ERROR;
+#else
     if (bits < 0 || bits > 16 ||
         s->sym_buf < s->pending_out + ((Buf_size + 7) >> 3))
         return Z_BUF_ERROR;
+#endif
     do {
         put = Buf_size - s->bi_valid;
         if (put > bits)
@@ -602,11 +749,7 @@ int ZEXPORT deflatePrime(strm, bits, value)
 }
 
 /* ========================================================================= */
-int ZEXPORT deflateParams(strm, level, strategy)
-    z_streamp strm;
-    int level;
-    int strategy;
-{
+int ZEXPORT deflateParams(z_streamp strm, int level, int strategy) {
     deflate_state *s;
     compress_func func;
 
@@ -651,13 +794,8 @@ int ZEXPORT deflateParams(strm, level, strategy)
 }
 
 /* ========================================================================= */
-int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain)
-    z_streamp strm;
-    int good_length;
-    int max_lazy;
-    int nice_length;
-    int max_chain;
-{
+int ZEXPORT deflateTune(z_streamp strm, int good_length, int max_lazy,
+                        int nice_length, int max_chain) {
     deflate_state *s;
 
     if (deflateStateCheck(strm)) return Z_STREAM_ERROR;
@@ -693,10 +831,7 @@ int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain)
  *
  * Shifts are used to approximate divisions, for speed.
  */
-uLong ZEXPORT deflateBound(strm, sourceLen)
-    z_streamp strm;
-    uLong sourceLen;
-{
+uLong ZEXPORT deflateBound(z_streamp strm, uLong sourceLen) {
     deflate_state *s;
     uLong fixedlen, storelen, wraplen;
 
@@ -752,7 +887,8 @@ uLong ZEXPORT deflateBound(strm, sourceLen)
 
     /* if not default parameters, return one of the conservative bounds */
     if (s->w_bits != 15 || s->hash_bits != 8 + 7)
-        return (s->w_bits <= s->hash_bits ? fixedlen : storelen) + wraplen;
+        return (s->w_bits <= s->hash_bits && s->level ? fixedlen : storelen) +
+               wraplen;
 
     /* default settings: return tight bound for that case -- ~0.03% overhead
        plus a small constant */
@@ -765,10 +901,7 @@ uLong ZEXPORT deflateBound(strm, sourceLen)
  * IN assertion: the stream state is correct and there is enough room in
  * pending_buf.
  */
-local void putShortMSB(s, b)
-    deflate_state *s;
-    uInt b;
-{
+local void putShortMSB(deflate_state *s, uInt b) {
     put_byte(s, (Byte)(b >> 8));
     put_byte(s, (Byte)(b & 0xff));
 }
@@ -779,9 +912,7 @@ local void putShortMSB(s, b)
  * applications may wish to modify it to avoid allocating a large
  * strm->next_out buffer and copying into it. (See also read_buf()).
  */
-local void flush_pending(strm)
-    z_streamp strm;
-{
+local void flush_pending(z_streamp strm) {
     unsigned len;
     deflate_state *s = strm->state;
 
@@ -812,10 +943,7 @@ local void flush_pending(strm)
     } while (0)
 
 /* ========================================================================= */
-int ZEXPORT deflate(strm, flush)
-    z_streamp strm;
-    int flush;
-{
+int ZEXPORT deflate(z_streamp strm, int flush) {
     int old_flush; /* value of flush param for previous deflate call */
     deflate_state *s;
 
@@ -1127,9 +1255,7 @@ int ZEXPORT deflate(strm, flush)
 }
 
 /* ========================================================================= */
-int ZEXPORT deflateEnd(strm)
-    z_streamp strm;
-{
+int ZEXPORT deflateEnd(z_streamp strm) {
     int status;
 
     if (deflateStateCheck(strm)) return Z_STREAM_ERROR;
@@ -1153,11 +1279,10 @@ int ZEXPORT deflateEnd(strm)
  * To simplify the source, this is not supported for 16-bit MSDOS (which
  * doesn't have enough memory anyway to duplicate compression states).
  */
-int ZEXPORT deflateCopy(dest, source)
-    z_streamp dest;
-    z_streamp source;
-{
+int ZEXPORT deflateCopy(z_streamp dest, z_streamp source) {
 #ifdef MAXSEG_64K
+    (void)dest;
+    (void)source;
     return Z_STREAM_ERROR;
 #else
     deflate_state *ds;
@@ -1181,7 +1306,7 @@ int ZEXPORT deflateCopy(dest, source)
     ds->window = (Bytef *) ZALLOC(dest, ds->w_size, 2*sizeof(Byte));
     ds->prev   = (Posf *)  ZALLOC(dest, ds->w_size, sizeof(Pos));
     ds->head   = (Posf *)  ZALLOC(dest, ds->hash_size, sizeof(Pos));
-    ds->pending_buf = (uchf *) ZALLOC(dest, ds->lit_bufsize, 4);
+    ds->pending_buf = (uchf *) ZALLOC(dest, ds->lit_bufsize, LIT_BUFS);
 
     if (ds->window == Z_NULL || ds->prev == Z_NULL || ds->head == Z_NULL ||
         ds->pending_buf == Z_NULL) {
@@ -1192,10 +1317,15 @@ int ZEXPORT deflateCopy(dest, source)
     zmemcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte));
     zmemcpy((voidpf)ds->prev, (voidpf)ss->prev, ds->w_size * sizeof(Pos));
     zmemcpy((voidpf)ds->head, (voidpf)ss->head, ds->hash_size * sizeof(Pos));
-    zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size);
+    zmemcpy(ds->pending_buf, ss->pending_buf, ds->lit_bufsize * LIT_BUFS);
 
     ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf);
+#ifdef LIT_MEM
+    ds->d_buf = (ushf *)(ds->pending_buf + (ds->lit_bufsize << 1));
+    ds->l_buf = ds->pending_buf + (ds->lit_bufsize << 2);
+#else
     ds->sym_buf = ds->pending_buf + ds->lit_bufsize;
+#endif
 
     ds->l_desc.dyn_tree = ds->dyn_ltree;
     ds->d_desc.dyn_tree = ds->dyn_dtree;
@@ -1205,66 +1335,6 @@ int ZEXPORT deflateCopy(dest, source)
 #endif /* MAXSEG_64K */
 }
 
-/* ===========================================================================
- * Read a new buffer from the current input stream, update the adler32
- * and total number of bytes read.  All deflate() input goes through
- * this function so some applications may wish to modify it to avoid
- * allocating a large strm->next_in buffer and copying from it.
- * (See also flush_pending()).
- */
-local unsigned read_buf(strm, buf, size)
-    z_streamp strm;
-    Bytef *buf;
-    unsigned size;
-{
-    unsigned len = strm->avail_in;
-
-    if (len > size) len = size;
-    if (len == 0) return 0;
-
-    strm->avail_in  -= len;
-
-    zmemcpy(buf, strm->next_in, len);
-    if (strm->state->wrap == 1) {
-        strm->adler = adler32(strm->adler, buf, len);
-    }
-#ifdef GZIP
-    else if (strm->state->wrap == 2) {
-        strm->adler = crc32(strm->adler, buf, len);
-    }
-#endif
-    strm->next_in  += len;
-    strm->total_in += len;
-
-    return len;
-}
-
-/* ===========================================================================
- * Initialize the "longest match" routines for a new zlib stream
- */
-local void lm_init(s)
-    deflate_state *s;
-{
-    s->window_size = (ulg)2L*s->w_size;
-
-    CLEAR_HASH(s);
-
-    /* Set the default configuration parameters:
-     */
-    s->max_lazy_match   = configuration_table[s->level].max_lazy;
-    s->good_match       = configuration_table[s->level].good_length;
-    s->nice_match       = configuration_table[s->level].nice_length;
-    s->max_chain_length = configuration_table[s->level].max_chain;
-
-    s->strstart = 0;
-    s->block_start = 0L;
-    s->lookahead = 0;
-    s->insert = 0;
-    s->match_length = s->prev_length = MIN_MATCH-1;
-    s->match_available = 0;
-    s->ins_h = 0;
-}
-
 #ifndef FASTEST
 /* ===========================================================================
  * Set match_start to the longest match starting at the given string and
@@ -1275,10 +1345,7 @@ local void lm_init(s)
  *   string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
  * OUT assertion: the match length is not greater than s->lookahead.
  */
-local uInt longest_match(s, cur_match)
-    deflate_state *s;
-    IPos cur_match;                             /* current match */
-{
+local uInt longest_match(deflate_state *s, IPos cur_match) {
     unsigned chain_length = s->max_chain_length;/* max hash chain length */
     register Bytef *scan = s->window + s->strstart; /* current string */
     register Bytef *match;                      /* matched string */
@@ -1426,10 +1493,7 @@ local uInt longest_match(s, cur_match)
 /* ---------------------------------------------------------------------------
  * Optimized version for FASTEST only
  */
-local uInt longest_match(s, cur_match)
-    deflate_state *s;
-    IPos cur_match;                             /* current match */
-{
+local uInt longest_match(deflate_state *s, IPos cur_match) {
     register Bytef *scan = s->window + s->strstart; /* current string */
     register Bytef *match;                       /* matched string */
     register int len;                           /* length of current match */
@@ -1490,19 +1554,23 @@ local uInt longest_match(s, cur_match)
 /* ===========================================================================
  * Check that the match at match_start is indeed a match.
  */
-local void check_match(s, start, match, length)
-    deflate_state *s;
-    IPos start, match;
-    int length;
-{
+local void check_match(deflate_state *s, IPos start, IPos match, int length) {
     /* check that the match is indeed a match */
-    if (zmemcmp(s->window + match,
-                s->window + start, length) != EQUAL) {
-        fprintf(stderr, " start %u, match %u, length %d\n",
-                start, match, length);
+    Bytef *back = s->window + (int)match, *here = s->window + start;
+    IPos len = length;
+    if (match == (IPos)-1) {
+        /* match starts one byte before the current window -- just compare the
+           subsequent length-1 bytes */
+        back++;
+        here++;
+        len--;
+    }
+    if (zmemcmp(back, here, len) != EQUAL) {
+        fprintf(stderr, " start %u, match %d, length %d\n",
+                start, (int)match, length);
         do {
-            fprintf(stderr, "%c%c", s->window[match++], s->window[start++]);
-        } while (--length != 0);
+            fprintf(stderr, "(%02x %02x)", *back++, *here++);
+        } while (--len != 0);
         z_error("invalid match");
     }
     if (z_verbose > 1) {
@@ -1514,137 +1582,6 @@ local void check_match(s, start, match, length)
 #  define check_match(s, start, match, length)
 #endif /* ZLIB_DEBUG */
 
-/* ===========================================================================
- * Fill the window when the lookahead becomes insufficient.
- * Updates strstart and lookahead.
- *
- * IN assertion: lookahead < MIN_LOOKAHEAD
- * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
- *    At least one byte has been read, or avail_in == 0; reads are
- *    performed for at least two bytes (required for the zip translate_eol
- *    option -- not supported here).
- */
-local void fill_window(s)
-    deflate_state *s;
-{
-    unsigned n;
-    unsigned more;    /* Amount of free space at the end of the window. */
-    uInt wsize = s->w_size;
-
-    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
-
-    do {
-        more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
-
-        /* Deal with !@#$% 64K limit: */
-        if (sizeof(int) <= 2) {
-            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
-                more = wsize;
-
-            } else if (more == (unsigned)(-1)) {
-                /* Very unlikely, but possible on 16 bit machine if
-                 * strstart == 0 && lookahead == 1 (input done a byte at time)
-                 */
-                more--;
-            }
-        }
-
-        /* If the window is almost full and there is insufficient lookahead,
-         * move the upper half to the lower one to make room in the upper half.
-         */
-        if (s->strstart >= wsize + MAX_DIST(s)) {
-
-            zmemcpy(s->window, s->window + wsize, (unsigned)wsize - more);
-            s->match_start -= wsize;
-            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
-            s->block_start -= (long) wsize;
-            if (s->insert > s->strstart)
-                s->insert = s->strstart;
-            slide_hash(s);
-            more += wsize;
-        }
-        if (s->strm->avail_in == 0) break;
-
-        /* If there was no sliding:
-         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
-         *    more == window_size - lookahead - strstart
-         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
-         * => more >= window_size - 2*WSIZE + 2
-         * In the BIG_MEM or MMAP case (not yet supported),
-         *   window_size == input_size + MIN_LOOKAHEAD  &&
-         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
-         * Otherwise, window_size == 2*WSIZE so more >= 2.
-         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
-         */
-        Assert(more >= 2, "more < 2");
-
-        n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
-        s->lookahead += n;
-
-        /* Initialize the hash value now that we have some input: */
-        if (s->lookahead + s->insert >= MIN_MATCH) {
-            uInt str = s->strstart - s->insert;
-            s->ins_h = s->window[str];
-            UPDATE_HASH(s, s->ins_h, s->window[str + 1]);
-#if MIN_MATCH != 3
-            Call UPDATE_HASH() MIN_MATCH-3 more times
-#endif
-            while (s->insert) {
-                UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
-#ifndef FASTEST
-                s->prev[str & s->w_mask] = s->head[s->ins_h];
-#endif
-                s->head[s->ins_h] = (Pos)str;
-                str++;
-                s->insert--;
-                if (s->lookahead + s->insert < MIN_MATCH)
-                    break;
-            }
-        }
-        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
-         * but this is not important since only literal bytes will be emitted.
-         */
-
-    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
-
-    /* If the WIN_INIT bytes after the end of the current data have never been
-     * written, then zero those bytes in order to avoid memory check reports of
-     * the use of uninitialized (or uninitialised as Julian writes) bytes by
-     * the longest match routines.  Update the high water mark for the next
-     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
-     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
-     */
-    if (s->high_water < s->window_size) {
-        ulg curr = s->strstart + (ulg)(s->lookahead);
-        ulg init;
-
-        if (s->high_water < curr) {
-            /* Previous high water mark below current data -- zero WIN_INIT
-             * bytes or up to end of window, whichever is less.
-             */
-            init = s->window_size - curr;
-            if (init > WIN_INIT)
-                init = WIN_INIT;
-            zmemzero(s->window + curr, (unsigned)init);
-            s->high_water = curr + init;
-        }
-        else if (s->high_water < (ulg)curr + WIN_INIT) {
-            /* High water mark at or above current data, but below current data
-             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
-             * to end of window, whichever is less.
-             */
-            init = (ulg)curr + WIN_INIT - s->high_water;
-            if (init > s->window_size - s->high_water)
-                init = s->window_size - s->high_water;
-            zmemzero(s->window + s->high_water, (unsigned)init);
-            s->high_water += init;
-        }
-    }
-
-    Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
-           "not enough room for search");
-}
-
 /* ===========================================================================
  * Flush the current block, with given end-of-file flag.
  * IN assertion: strstart is set to the end of the current match.
@@ -1687,10 +1624,7 @@ local void fill_window(s)
  * copied. It is most efficient with large input and output buffers, which
  * maximizes the opportunities to have a single copy from next_in to next_out.
  */
-local block_state deflate_stored(s, flush)
-    deflate_state *s;
-    int flush;
-{
+local block_state deflate_stored(deflate_state *s, int flush) {
     /* Smallest worthy block size when not flushing or finishing. By default
      * this is 32K. This can be as small as 507 bytes for memLevel == 1. For
      * large input and output buffers, the stored block size will be larger.
@@ -1874,10 +1808,7 @@ local block_state deflate_stored(s, flush)
  * new strings in the dictionary only for unmatched strings or for short
  * matches. It is used only for the fast compression options.
  */
-local block_state deflate_fast(s, flush)
-    deflate_state *s;
-    int flush;
-{
+local block_state deflate_fast(deflate_state *s, int flush) {
     IPos hash_head;       /* head of the hash chain */
     int bflush;           /* set if current block must be flushed */
 
@@ -1976,10 +1907,7 @@ local block_state deflate_fast(s, flush)
  * evaluation for matches: a match is finally adopted only if there is
  * no better match at the next window position.
  */
-local block_state deflate_slow(s, flush)
-    deflate_state *s;
-    int flush;
-{
+local block_state deflate_slow(deflate_state *s, int flush) {
     IPos hash_head;          /* head of hash chain */
     int bflush;              /* set if current block must be flushed */
 
@@ -2107,10 +2035,7 @@ local block_state deflate_slow(s, flush)
  * one.  Do not maintain a hash table.  (It will be regenerated if this run of
  * deflate switches away from Z_RLE.)
  */
-local block_state deflate_rle(s, flush)
-    deflate_state *s;
-    int flush;
-{
+local block_state deflate_rle(deflate_state *s, int flush) {
     int bflush;             /* set if current block must be flushed */
     uInt prev;              /* byte at distance one to match */
     Bytef *scan, *strend;   /* scan goes up to strend for length of run */
@@ -2181,10 +2106,7 @@ local block_state deflate_rle(s, flush)
  * For Z_HUFFMAN_ONLY, do not look for matches.  Do not maintain a hash table.
  * (It will be regenerated if this run of deflate switches away from Huffman.)
  */
-local block_state deflate_huff(s, flush)
-    deflate_state *s;
-    int flush;
-{
+local block_state deflate_huff(deflate_state *s, int flush) {
     int bflush;             /* set if current block must be flushed */
 
     for (;;) {
diff --git a/3rdparty/zlib/deflate.h b/3rdparty/zlib/deflate.h
index 1a06cd5f25d1..300c6ada62b8 100644
--- a/3rdparty/zlib/deflate.h
+++ b/3rdparty/zlib/deflate.h
@@ -1,5 +1,5 @@
 /* deflate.h -- internal compression state
- * Copyright (C) 1995-2018 Jean-loup Gailly
+ * Copyright (C) 1995-2024 Jean-loup Gailly
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
@@ -23,6 +23,10 @@
 #  define GZIP
 #endif
 
+/* define LIT_MEM to slightly increase the speed of deflate (order 1% to 2%) at
+   the cost of a larger memory footprint */
+/* #define LIT_MEM */
+
 /* ===========================================================================
  * Internal compression state.
  */
@@ -217,7 +221,14 @@ typedef struct internal_state {
     /* Depth of each subtree used as tie breaker for trees of equal frequency
      */
 
+#ifdef LIT_MEM
+#   define LIT_BUFS 5
+    ushf *d_buf;          /* buffer for distances */
+    uchf *l_buf;          /* buffer for literals/lengths */
+#else
+#   define LIT_BUFS 4
     uchf *sym_buf;        /* buffer for distances and literals/lengths */
+#endif
 
     uInt  lit_bufsize;
     /* Size of match buffer for literals/lengths.  There are 4 reasons for
@@ -239,7 +250,7 @@ typedef struct internal_state {
      *   - I can't count above 4
      */
 
-    uInt sym_next;      /* running index in sym_buf */
+    uInt sym_next;      /* running index in symbol buffer */
     uInt sym_end;       /* symbol table full when sym_next reaches this */
 
     ulg opt_len;        /* bit length of current block with optimal trees */
@@ -291,14 +302,14 @@ typedef struct internal_state {
    memory checker errors from longest match routines */
 
         /* in trees.c */
-void ZLIB_INTERNAL _tr_init OF((deflate_state *s));
-int ZLIB_INTERNAL _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc));
-void ZLIB_INTERNAL _tr_flush_block OF((deflate_state *s, charf *buf,
-                        ulg stored_len, int last));
-void ZLIB_INTERNAL _tr_flush_bits OF((deflate_state *s));
-void ZLIB_INTERNAL _tr_align OF((deflate_state *s));
-void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
-                        ulg stored_len, int last));
+void ZLIB_INTERNAL _tr_init(deflate_state *s);
+int ZLIB_INTERNAL _tr_tally(deflate_state *s, unsigned dist, unsigned lc);
+void ZLIB_INTERNAL _tr_flush_block(deflate_state *s, charf *buf,
+                                   ulg stored_len, int last);
+void ZLIB_INTERNAL _tr_flush_bits(deflate_state *s);
+void ZLIB_INTERNAL _tr_align(deflate_state *s);
+void ZLIB_INTERNAL _tr_stored_block(deflate_state *s, charf *buf,
+                                    ulg stored_len, int last);
 
 #define d_code(dist) \
    ((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)])
@@ -318,6 +329,25 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
   extern const uch ZLIB_INTERNAL _dist_code[];
 #endif
 
+#ifdef LIT_MEM
+# define _tr_tally_lit(s, c, flush) \
+  { uch cc = (c); \
+    s->d_buf[s->sym_next] = 0; \
+    s->l_buf[s->sym_next++] = cc; \
+    s->dyn_ltree[cc].Freq++; \
+    flush = (s->sym_next == s->sym_end); \
+   }
+# define _tr_tally_dist(s, distance, length, flush) \
+  { uch len = (uch)(length); \
+    ush dist = (ush)(distance); \
+    s->d_buf[s->sym_next] = dist; \
+    s->l_buf[s->sym_next++] = len; \
+    dist--; \
+    s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \
+    s->dyn_dtree[d_code(dist)].Freq++; \
+    flush = (s->sym_next == s->sym_end); \
+  }
+#else
 # define _tr_tally_lit(s, c, flush) \
   { uch cc = (c); \
     s->sym_buf[s->sym_next++] = 0; \
@@ -337,6 +367,7 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
     s->dyn_dtree[d_code(dist)].Freq++; \
     flush = (s->sym_next == s->sym_end); \
   }
+#endif
 #else
 # define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c)
 # define _tr_tally_dist(s, distance, length, flush) \
diff --git a/3rdparty/zlib/gzclose.c b/3rdparty/zlib/gzclose.c
index caeb99a3177f..48d6a86f04b6 100644
--- a/3rdparty/zlib/gzclose.c
+++ b/3rdparty/zlib/gzclose.c
@@ -8,9 +8,7 @@
 /* gzclose() is in a separate file so that it is linked in only if it is used.
    That way the other gzclose functions can be used instead to avoid linking in
    unneeded compression or decompression routines. */
-int ZEXPORT gzclose(file)
-    gzFile file;
-{
+int ZEXPORT gzclose(gzFile file) {
 #ifndef NO_GZCOMPRESS
     gz_statep state;
 
diff --git a/3rdparty/zlib/gzguts.h b/3rdparty/zlib/gzguts.h
index 57faf37165a3..eba72085bb75 100644
--- a/3rdparty/zlib/gzguts.h
+++ b/3rdparty/zlib/gzguts.h
@@ -1,5 +1,5 @@
 /* gzguts.h -- zlib internal header definitions for gz* operations
- * Copyright (C) 2004-2019 Mark Adler
+ * Copyright (C) 2004-2024 Mark Adler
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
@@ -7,9 +7,8 @@
 #  ifndef _LARGEFILE_SOURCE
 #    define _LARGEFILE_SOURCE 1
 #  endif
-#  ifdef _FILE_OFFSET_BITS
-#    undef _FILE_OFFSET_BITS
-#  endif
+#  undef _FILE_OFFSET_BITS
+#  undef _TIME_BITS
 #endif
 
 #ifdef HAVE_HIDDEN
@@ -119,8 +118,8 @@
 
 /* gz* functions always use library allocation functions */
 #ifndef STDC
-  extern voidp  malloc OF((uInt size));
-  extern void   free   OF((voidpf ptr));
+  extern voidp  malloc(uInt size);
+  extern void   free(voidpf ptr);
 #endif
 
 /* get errno and strerror definition */
@@ -138,10 +137,10 @@
 
 /* provide prototypes for these when building zlib without LFS */
 #if !defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0
-    ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
-    ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int));
-    ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile));
-    ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile));
+    ZEXTERN gzFile ZEXPORT gzopen64(const char *, const char *);
+    ZEXTERN z_off64_t ZEXPORT gzseek64(gzFile, z_off64_t, int);
+    ZEXTERN z_off64_t ZEXPORT gztell64(gzFile);
+    ZEXTERN z_off64_t ZEXPORT gzoffset64(gzFile);
 #endif
 
 /* default memLevel */
@@ -203,17 +202,13 @@ typedef struct {
 typedef gz_state FAR *gz_statep;
 
 /* shared functions */
-void ZLIB_INTERNAL gz_error OF((gz_statep, int, const char *));
+void ZLIB_INTERNAL gz_error(gz_statep, int, const char *);
 #if defined UNDER_CE
-char ZLIB_INTERNAL *gz_strwinerror OF((DWORD error));
+char ZLIB_INTERNAL *gz_strwinerror(DWORD error);
 #endif
 
 /* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t
    value -- needed when comparing unsigned to z_off64_t, which is signed
    (possible z_off64_t types off_t, off64_t, and long are all signed) */
-#ifdef INT_MAX
-#  define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX)
-#else
-unsigned ZLIB_INTERNAL gz_intmax OF((void));
-#  define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > gz_intmax())
-#endif
+unsigned ZLIB_INTERNAL gz_intmax(void);
+#define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > gz_intmax())
diff --git a/3rdparty/zlib/gzlib.c b/3rdparty/zlib/gzlib.c
index 55da46a453fd..983153cc8e49 100644
--- a/3rdparty/zlib/gzlib.c
+++ b/3rdparty/zlib/gzlib.c
@@ -1,5 +1,5 @@
 /* gzlib.c -- zlib functions common to reading and writing gzip files
- * Copyright (C) 2004-2019 Mark Adler
+ * Copyright (C) 2004-2024 Mark Adler
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
@@ -15,10 +15,6 @@
 #endif
 #endif
 
-/* Local functions */
-local void gz_reset OF((gz_statep));
-local gzFile gz_open OF((const void *, int, const char *));
-
 #if defined UNDER_CE
 
 /* Map the Windows error number in ERROR to a locale-dependent error message
@@ -30,9 +26,7 @@ local gzFile gz_open OF((const void *, int, const char *));
 
    The gz_strwinerror function does not change the current setting of
    GetLastError. */
-char ZLIB_INTERNAL *gz_strwinerror(error)
-     DWORD error;
-{
+char ZLIB_INTERNAL *gz_strwinerror(DWORD error) {
     static char buf[1024];
 
     wchar_t *msgbuf;
@@ -72,9 +66,7 @@ char ZLIB_INTERNAL *gz_strwinerror(error)
 #endif /* UNDER_CE */
 
 /* Reset gzip file state */
-local void gz_reset(state)
-    gz_statep state;
-{
+local void gz_reset(gz_statep state) {
     state->x.have = 0;              /* no output data available */
     if (state->mode == GZ_READ) {   /* for reading ... */
         state->eof = 0;             /* not at end of file */
@@ -90,11 +82,7 @@ local void gz_reset(state)
 }
 
 /* Open a gzip file either by name or file descriptor. */
-local gzFile gz_open(path, fd, mode)
-    const void *path;
-    int fd;
-    const char *mode;
-{
+local gzFile gz_open(const void *path, int fd, const char *mode) {
     gz_statep state;
     z_size_t len;
     int oflag;
@@ -269,26 +257,17 @@ local gzFile gz_open(path, fd, mode)
 }
 
 /* -- see zlib.h -- */
-gzFile ZEXPORT gzopen(path, mode)
-    const char *path;
-    const char *mode;
-{
+gzFile ZEXPORT gzopen(const char *path, const char *mode) {
     return gz_open(path, -1, mode);
 }
 
 /* -- see zlib.h -- */
-gzFile ZEXPORT gzopen64(path, mode)
-    const char *path;
-    const char *mode;
-{
+gzFile ZEXPORT gzopen64(const char *path, const char *mode) {
     return gz_open(path, -1, mode);
 }
 
 /* -- see zlib.h -- */
-gzFile ZEXPORT gzdopen(fd, mode)
-    int fd;
-    const char *mode;
-{
+gzFile ZEXPORT gzdopen(int fd, const char *mode) {
     char *path;         /* identifier for error messages */
     gzFile gz;
 
@@ -306,19 +285,13 @@ gzFile ZEXPORT gzdopen(fd, mode)
 
 /* -- see zlib.h -- */
 #ifdef WIDECHAR
-gzFile ZEXPORT gzopen_w(path, mode)
-    const wchar_t *path;
-    const char *mode;
-{
+gzFile ZEXPORT gzopen_w(const wchar_t *path, const char *mode) {
     return gz_open(path, -2, mode);
 }
 #endif
 
 /* -- see zlib.h -- */
-int ZEXPORT gzbuffer(file, size)
-    gzFile file;
-    unsigned size;
-{
+int ZEXPORT gzbuffer(gzFile file, unsigned size) {
     gz_statep state;
 
     /* get internal structure and check integrity */
@@ -335,16 +308,14 @@ int ZEXPORT gzbuffer(file, size)
     /* check and set requested size */
     if ((size << 1) < size)
         return -1;              /* need to be able to double it */
-    if (size < 2)
-        size = 2;               /* need two bytes to check magic header */
+    if (size < 8)
+        size = 8;               /* needed to behave well with flushing */
     state->want = size;
     return 0;
 }
 
 /* -- see zlib.h -- */
-int ZEXPORT gzrewind(file)
-    gzFile file;
-{
+int ZEXPORT gzrewind(gzFile file) {
     gz_statep state;
 
     /* get internal structure */
@@ -365,11 +336,7 @@ int ZEXPORT gzrewind(file)
 }
 
 /* -- see zlib.h -- */
-z_off64_t ZEXPORT gzseek64(file, offset, whence)
-    gzFile file;
-    z_off64_t offset;
-    int whence;
-{
+z_off64_t ZEXPORT gzseek64(gzFile file, z_off64_t offset, int whence) {
     unsigned n;
     z_off64_t ret;
     gz_statep state;
@@ -442,11 +409,7 @@ z_off64_t ZEXPORT gzseek64(file, offset, whence)
 }
 
 /* -- see zlib.h -- */
-z_off_t ZEXPORT gzseek(file, offset, whence)
-    gzFile file;
-    z_off_t offset;
-    int whence;
-{
+z_off_t ZEXPORT gzseek(gzFile file, z_off_t offset, int whence) {
     z_off64_t ret;
 
     ret = gzseek64(file, (z_off64_t)offset, whence);
@@ -454,9 +417,7 @@ z_off_t ZEXPORT gzseek(file, offset, whence)
 }
 
 /* -- see zlib.h -- */
-z_off64_t ZEXPORT gztell64(file)
-    gzFile file;
-{
+z_off64_t ZEXPORT gztell64(gzFile file) {
     gz_statep state;
 
     /* get internal structure and check integrity */
@@ -471,9 +432,7 @@ z_off64_t ZEXPORT gztell64(file)
 }
 
 /* -- see zlib.h -- */
-z_off_t ZEXPORT gztell(file)
-    gzFile file;
-{
+z_off_t ZEXPORT gztell(gzFile file) {
     z_off64_t ret;
 
     ret = gztell64(file);
@@ -481,9 +440,7 @@ z_off_t ZEXPORT gztell(file)
 }
 
 /* -- see zlib.h -- */
-z_off64_t ZEXPORT gzoffset64(file)
-    gzFile file;
-{
+z_off64_t ZEXPORT gzoffset64(gzFile file) {
     z_off64_t offset;
     gz_statep state;
 
@@ -504,9 +461,7 @@ z_off64_t ZEXPORT gzoffset64(file)
 }
 
 /* -- see zlib.h -- */
-z_off_t ZEXPORT gzoffset(file)
-    gzFile file;
-{
+z_off_t ZEXPORT gzoffset(gzFile file) {
     z_off64_t ret;
 
     ret = gzoffset64(file);
@@ -514,9 +469,7 @@ z_off_t ZEXPORT gzoffset(file)
 }
 
 /* -- see zlib.h -- */
-int ZEXPORT gzeof(file)
-    gzFile file;
-{
+int ZEXPORT gzeof(gzFile file) {
     gz_statep state;
 
     /* get internal structure and check integrity */
@@ -531,10 +484,7 @@ int ZEXPORT gzeof(file)
 }
 
 /* -- see zlib.h -- */
-const char * ZEXPORT gzerror(file, errnum)
-    gzFile file;
-    int *errnum;
-{
+const char * ZEXPORT gzerror(gzFile file, int *errnum) {
     gz_statep state;
 
     /* get internal structure and check integrity */
@@ -552,9 +502,7 @@ const char * ZEXPORT gzerror(file, errnum)
 }
 
 /* -- see zlib.h -- */
-void ZEXPORT gzclearerr(file)
-    gzFile file;
-{
+void ZEXPORT gzclearerr(gzFile file) {
     gz_statep state;
 
     /* get internal structure and check integrity */
@@ -578,11 +526,7 @@ void ZEXPORT gzclearerr(file)
    memory).  Simply save the error message as a static string.  If there is an
    allocation failure constructing the error message, then convert the error to
    out of memory. */
-void ZLIB_INTERNAL gz_error(state, err, msg)
-    gz_statep state;
-    int err;
-    const char *msg;
-{
+void ZLIB_INTERNAL gz_error(gz_statep state, int err, const char *msg) {
     /* free previously allocated message and clear */
     if (state->msg != NULL) {
         if (state->err != Z_MEM_ERROR)
@@ -619,21 +563,20 @@ void ZLIB_INTERNAL gz_error(state, err, msg)
 #endif
 }
 
-#ifndef INT_MAX
 /* portably return maximum value for an int (when limits.h presumed not
    available) -- we need to do this to cover cases where 2's complement not
    used, since C standard permits 1's complement and sign-bit representations,
    otherwise we could just use ((unsigned)-1) >> 1 */
-unsigned ZLIB_INTERNAL gz_intmax()
-{
-    unsigned p, q;
-
-    p = 1;
+unsigned ZLIB_INTERNAL gz_intmax(void) {
+#ifdef INT_MAX
+    return INT_MAX;
+#else
+    unsigned p = 1, q;
     do {
         q = p;
         p <<= 1;
         p++;
     } while (p > q);
     return q >> 1;
-}
 #endif
+}
diff --git a/3rdparty/zlib/gzread.c b/3rdparty/zlib/gzread.c
index dd77381596cb..4168cbc88752 100644
--- a/3rdparty/zlib/gzread.c
+++ b/3rdparty/zlib/gzread.c
@@ -5,25 +5,12 @@
 
 #include "gzguts.h"
 
-/* Local functions */
-local int gz_load OF((gz_statep, unsigned char *, unsigned, unsigned *));
-local int gz_avail OF((gz_statep));
-local int gz_look OF((gz_statep));
-local int gz_decomp OF((gz_statep));
-local int gz_fetch OF((gz_statep));
-local int gz_skip OF((gz_statep, z_off64_t));
-local z_size_t gz_read OF((gz_statep, voidp, z_size_t));
-
 /* Use read() to load a buffer -- return -1 on error, otherwise 0.  Read from
    state->fd, and update state->eof, state->err, and state->msg as appropriate.
    This function needs to loop on read(), since read() is not guaranteed to
    read the number of bytes requested, depending on the type of descriptor. */
-local int gz_load(state, buf, len, have)
-    gz_statep state;
-    unsigned char *buf;
-    unsigned len;
-    unsigned *have;
-{
+local int gz_load(gz_statep state, unsigned char *buf, unsigned len,
+                  unsigned *have) {
     int ret;
     unsigned get, max = ((unsigned)-1 >> 2) + 1;
 
@@ -53,9 +40,7 @@ local int gz_load(state, buf, len, have)
    If strm->avail_in != 0, then the current data is moved to the beginning of
    the input buffer, and then the remainder of the buffer is loaded with the
    available data from the input file. */
-local int gz_avail(state)
-    gz_statep state;
-{
+local int gz_avail(gz_statep state) {
     unsigned got;
     z_streamp strm = &(state->strm);
 
@@ -88,9 +73,7 @@ local int gz_avail(state)
    case, all further file reads will be directly to either the output buffer or
    a user buffer.  If decompressing, the inflate state will be initialized.
    gz_look() will return 0 on success or -1 on failure. */
-local int gz_look(state)
-    gz_statep state;
-{
+local int gz_look(gz_statep state) {
     z_streamp strm = &(state->strm);
 
     /* allocate read buffers and inflate memory */
@@ -170,9 +153,7 @@ local int gz_look(state)
    data.  If the gzip stream completes, state->how is reset to LOOK to look for
    the next gzip stream or raw data, once state->x.have is depleted.  Returns 0
    on success, -1 on failure. */
-local int gz_decomp(state)
-    gz_statep state;
-{
+local int gz_decomp(gz_statep state) {
     int ret = Z_OK;
     unsigned had;
     z_streamp strm = &(state->strm);
@@ -224,9 +205,7 @@ local int gz_decomp(state)
    looked for to determine whether to copy or decompress.  Returns -1 on error,
    otherwise 0.  gz_fetch() will leave state->how as COPY or GZIP unless the
    end of the input file has been reached and all data has been processed.  */
-local int gz_fetch(state)
-    gz_statep state;
-{
+local int gz_fetch(gz_statep state) {
     z_streamp strm = &(state->strm);
 
     do {
@@ -254,10 +233,7 @@ local int gz_fetch(state)
 }
 
 /* Skip len uncompressed bytes of output.  Return -1 on error, 0 on success. */
-local int gz_skip(state, len)
-    gz_statep state;
-    z_off64_t len;
-{
+local int gz_skip(gz_statep state, z_off64_t len) {
     unsigned n;
 
     /* skip over len bytes or reach end-of-file, whichever comes first */
@@ -289,11 +265,7 @@ local int gz_skip(state, len)
    input.  Return the number of bytes read.  If zero is returned, either the
    end of file was reached, or there was an error.  state->err must be
    consulted in that case to determine which. */
-local z_size_t gz_read(state, buf, len)
-    gz_statep state;
-    voidp buf;
-    z_size_t len;
-{
+local z_size_t gz_read(gz_statep state, voidp buf, z_size_t len) {
     z_size_t got;
     unsigned n;
 
@@ -370,11 +342,7 @@ local z_size_t gz_read(state, buf, len)
 }
 
 /* -- see zlib.h -- */
-int ZEXPORT gzread(file, buf, len)
-    gzFile file;
-    voidp buf;
-    unsigned len;
-{
+int ZEXPORT gzread(gzFile file, voidp buf, unsigned len) {
     gz_statep state;
 
     /* get internal structure */
@@ -406,12 +374,7 @@ int ZEXPORT gzread(file, buf, len)
 }
 
 /* -- see zlib.h -- */
-z_size_t ZEXPORT gzfread(buf, size, nitems, file)
-    voidp buf;
-    z_size_t size;
-    z_size_t nitems;
-    gzFile file;
-{
+z_size_t ZEXPORT gzfread(voidp buf, z_size_t size, z_size_t nitems, gzFile file) {
     z_size_t len;
     gz_statep state;
 
@@ -442,9 +405,7 @@ z_size_t ZEXPORT gzfread(buf, size, nitems, file)
 #else
 #  undef gzgetc
 #endif
-int ZEXPORT gzgetc(file)
-    gzFile file;
-{
+int ZEXPORT gzgetc(gzFile file) {
     unsigned char buf[1];
     gz_statep state;
 
@@ -469,17 +430,12 @@ int ZEXPORT gzgetc(file)
     return gz_read(state, buf, 1) < 1 ? -1 : buf[0];
 }
 
-int ZEXPORT gzgetc_(file)
-gzFile file;
-{
+int ZEXPORT gzgetc_(gzFile file) {
     return gzgetc(file);
 }
 
 /* -- see zlib.h -- */
-int ZEXPORT gzungetc(c, file)
-    int c;
-    gzFile file;
-{
+int ZEXPORT gzungetc(int c, gzFile file) {
     gz_statep state;
 
     /* get internal structure */
@@ -487,6 +443,10 @@ int ZEXPORT gzungetc(c, file)
         return -1;
     state = (gz_statep)file;
 
+    /* in case this was just opened, set up the input buffer */
+    if (state->mode == GZ_READ && state->how == LOOK && state->x.have == 0)
+        (void)gz_look(state);
+
     /* check that we're reading and that there's no (serious) error */
     if (state->mode != GZ_READ ||
         (state->err != Z_OK && state->err != Z_BUF_ERROR))
@@ -536,11 +496,7 @@ int ZEXPORT gzungetc(c, file)
 }
 
 /* -- see zlib.h -- */
-char * ZEXPORT gzgets(file, buf, len)
-    gzFile file;
-    char *buf;
-    int len;
-{
+char * ZEXPORT gzgets(gzFile file, char *buf, int len) {
     unsigned left, n;
     char *str;
     unsigned char *eol;
@@ -600,9 +556,7 @@ char * ZEXPORT gzgets(file, buf, len)
 }
 
 /* -- see zlib.h -- */
-int ZEXPORT gzdirect(file)
-    gzFile file;
-{
+int ZEXPORT gzdirect(gzFile file) {
     gz_statep state;
 
     /* get internal structure */
@@ -620,9 +574,7 @@ int ZEXPORT gzdirect(file)
 }
 
 /* -- see zlib.h -- */
-int ZEXPORT gzclose_r(file)
-    gzFile file;
-{
+int ZEXPORT gzclose_r(gzFile file) {
     int ret, err;
     gz_statep state;
 
diff --git a/3rdparty/zlib/gzwrite.c b/3rdparty/zlib/gzwrite.c
index eb8a0e5893ff..435b4621b534 100644
--- a/3rdparty/zlib/gzwrite.c
+++ b/3rdparty/zlib/gzwrite.c
@@ -5,18 +5,10 @@
 
 #include "gzguts.h"
 
-/* Local functions */
-local int gz_init OF((gz_statep));
-local int gz_comp OF((gz_statep, int));
-local int gz_zero OF((gz_statep, z_off64_t));
-local z_size_t gz_write OF((gz_statep, voidpc, z_size_t));
-
 /* Initialize state for writing a gzip file.  Mark initialization by setting
    state->size to non-zero.  Return -1 on a memory allocation failure, or 0 on
    success. */
-local int gz_init(state)
-    gz_statep state;
-{
+local int gz_init(gz_statep state) {
     int ret;
     z_streamp strm = &(state->strm);
 
@@ -70,10 +62,7 @@ local int gz_init(state)
    deflate() flush value.  If flush is Z_FINISH, then the deflate() state is
    reset to start a new gzip stream.  If gz->direct is true, then simply write
    to the output file without compressing, and ignore flush. */
-local int gz_comp(state, flush)
-    gz_statep state;
-    int flush;
-{
+local int gz_comp(gz_statep state, int flush) {
     int ret, writ;
     unsigned have, put, max = ((unsigned)-1 >> 2) + 1;
     z_streamp strm = &(state->strm);
@@ -151,10 +140,7 @@ local int gz_comp(state, flush)
 
 /* Compress len zeros to output.  Return -1 on a write error or memory
    allocation failure by gz_comp(), or 0 on success. */
-local int gz_zero(state, len)
-    gz_statep state;
-    z_off64_t len;
-{
+local int gz_zero(gz_statep state, z_off64_t len) {
     int first;
     unsigned n;
     z_streamp strm = &(state->strm);
@@ -184,11 +170,7 @@ local int gz_zero(state, len)
 
 /* Write len bytes from buf to file.  Return the number of bytes written.  If
    the returned value is less than len, then there was an error. */
-local z_size_t gz_write(state, buf, len)
-    gz_statep state;
-    voidpc buf;
-    z_size_t len;
-{
+local z_size_t gz_write(gz_statep state, voidpc buf, z_size_t len) {
     z_size_t put = len;
 
     /* if len is zero, avoid unnecessary operations */
@@ -252,11 +234,7 @@ local z_size_t gz_write(state, buf, len)
 }
 
 /* -- see zlib.h -- */
-int ZEXPORT gzwrite(file, buf, len)
-    gzFile file;
-    voidpc buf;
-    unsigned len;
-{
+int ZEXPORT gzwrite(gzFile file, voidpc buf, unsigned len) {
     gz_statep state;
 
     /* get internal structure */
@@ -280,12 +258,8 @@ int ZEXPORT gzwrite(file, buf, len)
 }
 
 /* -- see zlib.h -- */
-z_size_t ZEXPORT gzfwrite(buf, size, nitems, file)
-    voidpc buf;
-    z_size_t size;
-    z_size_t nitems;
-    gzFile file;
-{
+z_size_t ZEXPORT gzfwrite(voidpc buf, z_size_t size, z_size_t nitems,
+                          gzFile file) {
     z_size_t len;
     gz_statep state;
 
@@ -310,10 +284,7 @@ z_size_t ZEXPORT gzfwrite(buf, size, nitems, file)
 }
 
 /* -- see zlib.h -- */
-int ZEXPORT gzputc(file, c)
-    gzFile file;
-    int c;
-{
+int ZEXPORT gzputc(gzFile file, int c) {
     unsigned have;
     unsigned char buf[1];
     gz_statep state;
@@ -358,10 +329,7 @@ int ZEXPORT gzputc(file, c)
 }
 
 /* -- see zlib.h -- */
-int ZEXPORT gzputs(file, s)
-    gzFile file;
-    const char *s;
-{
+int ZEXPORT gzputs(gzFile file, const char *s) {
     z_size_t len, put;
     gz_statep state;
 
@@ -388,8 +356,7 @@ int ZEXPORT gzputs(file, s)
 #include <stdarg.h>
 
 /* -- see zlib.h -- */
-int ZEXPORTVA gzvprintf(gzFile file, const char *format, va_list va)
-{
+int ZEXPORTVA gzvprintf(gzFile file, const char *format, va_list va) {
     int len;
     unsigned left;
     char *next;
@@ -460,8 +427,7 @@ int ZEXPORTVA gzvprintf(gzFile file, const char *format, va_list va)
     return len;
 }
 
-int ZEXPORTVA gzprintf(gzFile file, const char *format, ...)
-{
+int ZEXPORTVA gzprintf(gzFile file, const char *format, ...) {
     va_list va;
     int ret;
 
@@ -474,13 +440,10 @@ int ZEXPORTVA gzprintf(gzFile file, const char *format, ...)
 #else /* !STDC && !Z_HAVE_STDARG_H */
 
 /* -- see zlib.h -- */
-int ZEXPORTVA gzprintf(file, format, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10,
-                       a11, a12, a13, a14, a15, a16, a17, a18, a19, a20)
-    gzFile file;
-    const char *format;
-    int a1, a2, a3, a4, a5, a6, a7, a8, a9, a10,
-        a11, a12, a13, a14, a15, a16, a17, a18, a19, a20;
-{
+int ZEXPORTVA gzprintf(gzFile file, const char *format, int a1, int a2, int a3,
+                       int a4, int a5, int a6, int a7, int a8, int a9, int a10,
+                       int a11, int a12, int a13, int a14, int a15, int a16,
+                       int a17, int a18, int a19, int a20) {
     unsigned len, left;
     char *next;
     gz_statep state;
@@ -562,10 +525,7 @@ int ZEXPORTVA gzprintf(file, format, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10,
 #endif
 
 /* -- see zlib.h -- */
-int ZEXPORT gzflush(file, flush)
-    gzFile file;
-    int flush;
-{
+int ZEXPORT gzflush(gzFile file, int flush) {
     gz_statep state;
 
     /* get internal structure */
@@ -594,11 +554,7 @@ int ZEXPORT gzflush(file, flush)
 }
 
 /* -- see zlib.h -- */
-int ZEXPORT gzsetparams(file, level, strategy)
-    gzFile file;
-    int level;
-    int strategy;
-{
+int ZEXPORT gzsetparams(gzFile file, int level, int strategy) {
     gz_statep state;
     z_streamp strm;
 
@@ -609,7 +565,7 @@ int ZEXPORT gzsetparams(file, level, strategy)
     strm = &(state->strm);
 
     /* check that we're writing and that there's no error */
-    if (state->mode != GZ_WRITE || state->err != Z_OK)
+    if (state->mode != GZ_WRITE || state->err != Z_OK || state->direct)
         return Z_STREAM_ERROR;
 
     /* if no change is requested, then do nothing */
@@ -636,9 +592,7 @@ int ZEXPORT gzsetparams(file, level, strategy)
 }
 
 /* -- see zlib.h -- */
-int ZEXPORT gzclose_w(file)
-    gzFile file;
-{
+int ZEXPORT gzclose_w(gzFile file) {
     int ret = Z_OK;
     gz_statep state;
 
diff --git a/3rdparty/zlib/infback.c b/3rdparty/zlib/infback.c
index babeaf1806f9..e7b25b307a30 100644
--- a/3rdparty/zlib/infback.c
+++ b/3rdparty/zlib/infback.c
@@ -15,9 +15,6 @@
 #include "inflate.h"
 #include "inffast.h"
 
-/* function prototypes */
-local void fixedtables OF((struct inflate_state FAR *state));
-
 /*
    strm provides memory allocation functions in zalloc and zfree, or
    Z_NULL to use the library memory allocation functions.
@@ -25,13 +22,9 @@ local void fixedtables OF((struct inflate_state FAR *state));
    windowBits is in the range 8..15, and window is a user-supplied
    window and output buffer that is 2**windowBits bytes.
  */
-int ZEXPORT inflateBackInit_(strm, windowBits, window, version, stream_size)
-z_streamp strm;
-int windowBits;
-unsigned char FAR *window;
-const char *version;
-int stream_size;
-{
+int ZEXPORT inflateBackInit_(z_streamp strm, int windowBits,
+                             unsigned char FAR *window, const char *version,
+                             int stream_size) {
     struct inflate_state FAR *state;
 
     if (version == Z_NULL || version[0] != ZLIB_VERSION[0] ||
@@ -80,9 +73,7 @@ int stream_size;
    used for threaded applications, since the rewriting of the tables and virgin
    may not be thread-safe.
  */
-local void fixedtables(state)
-struct inflate_state FAR *state;
-{
+local void fixedtables(struct inflate_state FAR *state) {
 #ifdef BUILDFIXED
     static int virgin = 1;
     static code *lenfix, *distfix;
@@ -248,13 +239,8 @@ struct inflate_state FAR *state;
    inflateBack() can also return Z_STREAM_ERROR if the input parameters
    are not correct, i.e. strm is Z_NULL or the state was not initialized.
  */
-int ZEXPORT inflateBack(strm, in, in_desc, out, out_desc)
-z_streamp strm;
-in_func in;
-void FAR *in_desc;
-out_func out;
-void FAR *out_desc;
-{
+int ZEXPORT inflateBack(z_streamp strm, in_func in, void FAR *in_desc,
+                        out_func out, void FAR *out_desc) {
     struct inflate_state FAR *state;
     z_const unsigned char FAR *next;    /* next input */
     unsigned char FAR *put;     /* next output */
@@ -632,9 +618,7 @@ void FAR *out_desc;
     return ret;
 }
 
-int ZEXPORT inflateBackEnd(strm)
-z_streamp strm;
-{
+int ZEXPORT inflateBackEnd(z_streamp strm) {
     if (strm == Z_NULL || strm->state == Z_NULL || strm->zfree == (free_func)0)
         return Z_STREAM_ERROR;
     ZFREE(strm, strm->state);
diff --git a/3rdparty/zlib/inffast.c b/3rdparty/zlib/inffast.c
index 1fec7f363fa6..9354676e786e 100644
--- a/3rdparty/zlib/inffast.c
+++ b/3rdparty/zlib/inffast.c
@@ -47,10 +47,7 @@
       requires strm->avail_out >= 258 for each loop to avoid checking for
       output space.
  */
-void ZLIB_INTERNAL inflate_fast(strm, start)
-z_streamp strm;
-unsigned start;         /* inflate()'s starting value for strm->avail_out */
-{
+void ZLIB_INTERNAL inflate_fast(z_streamp strm, unsigned start) {
     struct inflate_state FAR *state;
     z_const unsigned char FAR *in;      /* local strm->next_in */
     z_const unsigned char FAR *last;    /* have enough input while in < last */
diff --git a/3rdparty/zlib/inffast.h b/3rdparty/zlib/inffast.h
index e5c1aa4ca8cd..49c6d156c5c6 100644
--- a/3rdparty/zlib/inffast.h
+++ b/3rdparty/zlib/inffast.h
@@ -8,4 +8,4 @@
    subject to change. Applications should only use zlib.h.
  */
 
-void ZLIB_INTERNAL inflate_fast OF((z_streamp strm, unsigned start));
+void ZLIB_INTERNAL inflate_fast(z_streamp strm, unsigned start);
diff --git a/3rdparty/zlib/inflate.c b/3rdparty/zlib/inflate.c
index c84f52507c87..81545bbbdf8d 100644
--- a/3rdparty/zlib/inflate.c
+++ b/3rdparty/zlib/inflate.c
@@ -91,20 +91,7 @@
 #  endif
 #endif
 
-/* function prototypes */
-local int inflateStateCheck OF((z_streamp strm));
-local void fixedtables OF((struct inflate_state FAR *state));
-local int updatewindow OF((z_streamp strm, const unsigned char FAR *end,
-                           unsigned copy));
-#ifdef BUILDFIXED
-   void makefixed OF((void));
-#endif
-local unsigned syncsearch OF((unsigned FAR *have, const unsigned char FAR *buf,
-                              unsigned len));
-
-local int inflateStateCheck(strm)
-z_streamp strm;
-{
+local int inflateStateCheck(z_streamp strm) {
     struct inflate_state FAR *state;
     if (strm == Z_NULL ||
         strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0)
@@ -116,9 +103,7 @@ z_streamp strm;
     return 0;
 }
 
-int ZEXPORT inflateResetKeep(strm)
-z_streamp strm;
-{
+int ZEXPORT inflateResetKeep(z_streamp strm) {
     struct inflate_state FAR *state;
 
     if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
@@ -142,9 +127,7 @@ z_streamp strm;
     return Z_OK;
 }
 
-int ZEXPORT inflateReset(strm)
-z_streamp strm;
-{
+int ZEXPORT inflateReset(z_streamp strm) {
     struct inflate_state FAR *state;
 
     if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
@@ -155,10 +138,7 @@ z_streamp strm;
     return inflateResetKeep(strm);
 }
 
-int ZEXPORT inflateReset2(strm, windowBits)
-z_streamp strm;
-int windowBits;
-{
+int ZEXPORT inflateReset2(z_streamp strm, int windowBits) {
     int wrap;
     struct inflate_state FAR *state;
 
@@ -195,12 +175,8 @@ int windowBits;
     return inflateReset(strm);
 }
 
-int ZEXPORT inflateInit2_(strm, windowBits, version, stream_size)
-z_streamp strm;
-int windowBits;
-const char *version;
-int stream_size;
-{
+int ZEXPORT inflateInit2_(z_streamp strm, int windowBits,
+                          const char *version, int stream_size) {
     int ret;
     struct inflate_state FAR *state;
 
@@ -240,22 +216,17 @@ int stream_size;
     return ret;
 }
 
-int ZEXPORT inflateInit_(strm, version, stream_size)
-z_streamp strm;
-const char *version;
-int stream_size;
-{
+int ZEXPORT inflateInit_(z_streamp strm, const char *version,
+                         int stream_size) {
     return inflateInit2_(strm, DEF_WBITS, version, stream_size);
 }
 
-int ZEXPORT inflatePrime(strm, bits, value)
-z_streamp strm;
-int bits;
-int value;
-{
+int ZEXPORT inflatePrime(z_streamp strm, int bits, int value) {
     struct inflate_state FAR *state;
 
     if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
+    if (bits == 0)
+        return Z_OK;
     state = (struct inflate_state FAR *)strm->state;
     if (bits < 0) {
         state->hold = 0;
@@ -279,9 +250,7 @@ int value;
    used for threaded applications, since the rewriting of the tables and virgin
    may not be thread-safe.
  */
-local void fixedtables(state)
-struct inflate_state FAR *state;
-{
+local void fixedtables(struct inflate_state FAR *state) {
 #ifdef BUILDFIXED
     static int virgin = 1;
     static code *lenfix, *distfix;
@@ -343,7 +312,7 @@ struct inflate_state FAR *state;
 
     a.out > inffixed.h
  */
-void makefixed()
+void makefixed(void)
 {
     unsigned low, size;
     struct inflate_state state;
@@ -397,11 +366,7 @@ void makefixed()
    output will fall in the output data, making match copies simpler and faster.
    The advantage may be dependent on the size of the processor's data caches.
  */
-local int updatewindow(strm, end, copy)
-z_streamp strm;
-const Bytef *end;
-unsigned copy;
-{
+local int updatewindow(z_streamp strm, const Bytef *end, unsigned copy) {
     struct inflate_state FAR *state;
     unsigned dist;
 
@@ -623,10 +588,7 @@ unsigned copy;
    will return Z_BUF_ERROR if it has not reached the end of the stream.
  */
 
-int ZEXPORT inflate(strm, flush)
-z_streamp strm;
-int flush;
-{
+int ZEXPORT inflate(z_streamp strm, int flush) {
     struct inflate_state FAR *state;
     z_const unsigned char FAR *next;    /* next input */
     unsigned char FAR *put;     /* next output */
@@ -1302,9 +1264,7 @@ int flush;
     return ret;
 }
 
-int ZEXPORT inflateEnd(strm)
-z_streamp strm;
-{
+int ZEXPORT inflateEnd(z_streamp strm) {
     struct inflate_state FAR *state;
     if (inflateStateCheck(strm))
         return Z_STREAM_ERROR;
@@ -1316,11 +1276,8 @@ z_streamp strm;
     return Z_OK;
 }
 
-int ZEXPORT inflateGetDictionary(strm, dictionary, dictLength)
-z_streamp strm;
-Bytef *dictionary;
-uInt *dictLength;
-{
+int ZEXPORT inflateGetDictionary(z_streamp strm, Bytef *dictionary,
+                                 uInt *dictLength) {
     struct inflate_state FAR *state;
 
     /* check state */
@@ -1339,11 +1296,8 @@ uInt *dictLength;
     return Z_OK;
 }
 
-int ZEXPORT inflateSetDictionary(strm, dictionary, dictLength)
-z_streamp strm;
-const Bytef *dictionary;
-uInt dictLength;
-{
+int ZEXPORT inflateSetDictionary(z_streamp strm, const Bytef *dictionary,
+                                 uInt dictLength) {
     struct inflate_state FAR *state;
     unsigned long dictid;
     int ret;
@@ -1374,10 +1328,7 @@ uInt dictLength;
     return Z_OK;
 }
 
-int ZEXPORT inflateGetHeader(strm, head)
-z_streamp strm;
-gz_headerp head;
-{
+int ZEXPORT inflateGetHeader(z_streamp strm, gz_headerp head) {
     struct inflate_state FAR *state;
 
     /* check state */
@@ -1402,11 +1353,8 @@ gz_headerp head;
    called again with more data and the *have state.  *have is initialized to
    zero for the first call.
  */
-local unsigned syncsearch(have, buf, len)
-unsigned FAR *have;
-const unsigned char FAR *buf;
-unsigned len;
-{
+local unsigned syncsearch(unsigned FAR *have, const unsigned char FAR *buf,
+                          unsigned len) {
     unsigned got;
     unsigned next;
 
@@ -1425,9 +1373,7 @@ unsigned len;
     return next;
 }
 
-int ZEXPORT inflateSync(strm)
-z_streamp strm;
-{
+int ZEXPORT inflateSync(z_streamp strm) {
     unsigned len;               /* number of bytes to look at or looked at */
     int flags;                  /* temporary to save header status */
     unsigned long in, out;      /* temporary to save total_in and total_out */
@@ -1442,7 +1388,7 @@ z_streamp strm;
     /* if first time, start search in bit buffer */
     if (state->mode != SYNC) {
         state->mode = SYNC;
-        state->hold <<= state->bits & 7;
+        state->hold >>= state->bits & 7;
         state->bits -= state->bits & 7;
         len = 0;
         while (state->bits >= 8) {
@@ -1483,9 +1429,7 @@ z_streamp strm;
    block. When decompressing, PPP checks that at the end of input packet,
    inflate is waiting for these length bytes.
  */
-int ZEXPORT inflateSyncPoint(strm)
-z_streamp strm;
-{
+int ZEXPORT inflateSyncPoint(z_streamp strm) {
     struct inflate_state FAR *state;
 
     if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
@@ -1493,10 +1437,7 @@ z_streamp strm;
     return state->mode == STORED && state->bits == 0;
 }
 
-int ZEXPORT inflateCopy(dest, source)
-z_streamp dest;
-z_streamp source;
-{
+int ZEXPORT inflateCopy(z_streamp dest, z_streamp source) {
     struct inflate_state FAR *state;
     struct inflate_state FAR *copy;
     unsigned char FAR *window;
@@ -1540,10 +1481,7 @@ z_streamp source;
     return Z_OK;
 }
 
-int ZEXPORT inflateUndermine(strm, subvert)
-z_streamp strm;
-int subvert;
-{
+int ZEXPORT inflateUndermine(z_streamp strm, int subvert) {
     struct inflate_state FAR *state;
 
     if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
@@ -1558,10 +1496,7 @@ int subvert;
 #endif
 }
 
-int ZEXPORT inflateValidate(strm, check)
-z_streamp strm;
-int check;
-{
+int ZEXPORT inflateValidate(z_streamp strm, int check) {
     struct inflate_state FAR *state;
 
     if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
@@ -1573,9 +1508,7 @@ int check;
     return Z_OK;
 }
 
-long ZEXPORT inflateMark(strm)
-z_streamp strm;
-{
+long ZEXPORT inflateMark(z_streamp strm) {
     struct inflate_state FAR *state;
 
     if (inflateStateCheck(strm))
@@ -1586,9 +1519,7 @@ z_streamp strm;
             (state->mode == MATCH ? state->was - state->length : 0));
 }
 
-unsigned long ZEXPORT inflateCodesUsed(strm)
-z_streamp strm;
-{
+unsigned long ZEXPORT inflateCodesUsed(z_streamp strm) {
     struct inflate_state FAR *state;
     if (inflateStateCheck(strm)) return (unsigned long)-1;
     state = (struct inflate_state FAR *)strm->state;
diff --git a/3rdparty/zlib/inftrees.c b/3rdparty/zlib/inftrees.c
index 57d2793bec93..98cfe164458c 100644
--- a/3rdparty/zlib/inftrees.c
+++ b/3rdparty/zlib/inftrees.c
@@ -1,5 +1,5 @@
 /* inftrees.c -- generate Huffman trees for efficient decoding
- * Copyright (C) 1995-2022 Mark Adler
+ * Copyright (C) 1995-2024 Mark Adler
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
@@ -9,7 +9,7 @@
 #define MAXBITS 15
 
 const char inflate_copyright[] =
-   " inflate 1.2.13 Copyright 1995-2022 Mark Adler ";
+   " inflate 1.3.1 Copyright 1995-2024 Mark Adler ";
 /*
   If you use the zlib library in a product, an acknowledgment is welcome
   in the documentation of your product. If for some reason you cannot
@@ -29,14 +29,9 @@ const char inflate_copyright[] =
    table index bits.  It will differ if the request is greater than the
    longest code or if it is less than the shortest code.
  */
-int ZLIB_INTERNAL inflate_table(type, lens, codes, table, bits, work)
-codetype type;
-unsigned short FAR *lens;
-unsigned codes;
-code FAR * FAR *table;
-unsigned FAR *bits;
-unsigned short FAR *work;
-{
+int ZLIB_INTERNAL inflate_table(codetype type, unsigned short FAR *lens,
+                                unsigned codes, code FAR * FAR *table,
+                                unsigned FAR *bits, unsigned short FAR *work) {
     unsigned len;               /* a code's length in bits */
     unsigned sym;               /* index of code symbols */
     unsigned min, max;          /* minimum and maximum code lengths */
@@ -62,7 +57,7 @@ unsigned short FAR *work;
         35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
     static const unsigned short lext[31] = { /* Length codes 257..285 extra */
         16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
-        19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 194, 65};
+        19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 203, 77};
     static const unsigned short dbase[32] = { /* Distance codes 0..29 base */
         1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
         257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
diff --git a/3rdparty/zlib/inftrees.h b/3rdparty/zlib/inftrees.h
index f53665311c16..396f74b5da79 100644
--- a/3rdparty/zlib/inftrees.h
+++ b/3rdparty/zlib/inftrees.h
@@ -41,8 +41,8 @@ typedef struct {
    examples/enough.c found in the zlib distribution.  The arguments to that
    program are the number of symbols, the initial root table size, and the
    maximum bit length of a code.  "enough 286 9 15" for literal/length codes
-   returns returns 852, and "enough 30 6 15" for distance codes returns 592.
-   The initial root table size (9 or 6) is found in the fifth argument of the
+   returns 852, and "enough 30 6 15" for distance codes returns 592. The
+   initial root table size (9 or 6) is found in the fifth argument of the
    inflate_table() calls in inflate.c and infback.c.  If the root table size is
    changed, then these maximum sizes would be need to be recalculated and
    updated. */
@@ -57,6 +57,6 @@ typedef enum {
     DISTS
 } codetype;
 
-int ZLIB_INTERNAL inflate_table OF((codetype type, unsigned short FAR *lens,
-                             unsigned codes, code FAR * FAR *table,
-                             unsigned FAR *bits, unsigned short FAR *work));
+int ZLIB_INTERNAL inflate_table(codetype type, unsigned short FAR *lens,
+                                unsigned codes, code FAR * FAR *table,
+                                unsigned FAR *bits, unsigned short FAR *work);
diff --git a/3rdparty/zlib/trees.c b/3rdparty/zlib/trees.c
index 5f305c47221e..6a523ef34e3c 100644
--- a/3rdparty/zlib/trees.c
+++ b/3rdparty/zlib/trees.c
@@ -1,5 +1,5 @@
 /* trees.c -- output deflated data using Huffman coding
- * Copyright (C) 1995-2021 Jean-loup Gailly
+ * Copyright (C) 1995-2024 Jean-loup Gailly
  * detect_data_type() function provided freely by Cosmin Truta, 2006
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
@@ -122,39 +122,116 @@ struct static_tree_desc_s {
     int     max_length;          /* max bit length for the codes */
 };
 
-local const static_tree_desc  static_l_desc =
+#ifdef NO_INIT_GLOBAL_POINTERS
+#  define TCONST
+#else
+#  define TCONST const
+#endif
+
+local TCONST static_tree_desc static_l_desc =
 {static_ltree, extra_lbits, LITERALS+1, L_CODES, MAX_BITS};
 
-local const static_tree_desc  static_d_desc =
+local TCONST static_tree_desc static_d_desc =
 {static_dtree, extra_dbits, 0,          D_CODES, MAX_BITS};
 
-local const static_tree_desc  static_bl_desc =
+local TCONST static_tree_desc static_bl_desc =
 {(const ct_data *)0, extra_blbits, 0,   BL_CODES, MAX_BL_BITS};
 
 /* ===========================================================================
- * Local (static) routines in this file.
+ * Output a short LSB first on the stream.
+ * IN assertion: there is enough room in pendingBuf.
+ */
+#define put_short(s, w) { \
+    put_byte(s, (uch)((w) & 0xff)); \
+    put_byte(s, (uch)((ush)(w) >> 8)); \
+}
+
+/* ===========================================================================
+ * Reverse the first len bits of a code, using straightforward code (a faster
+ * method would use a table)
+ * IN assertion: 1 <= len <= 15
  */
+local unsigned bi_reverse(unsigned code, int len) {
+    register unsigned res = 0;
+    do {
+        res |= code & 1;
+        code >>= 1, res <<= 1;
+    } while (--len > 0);
+    return res >> 1;
+}
 
-local void tr_static_init OF((void));
-local void init_block     OF((deflate_state *s));
-local void pqdownheap     OF((deflate_state *s, ct_data *tree, int k));
-local void gen_bitlen     OF((deflate_state *s, tree_desc *desc));
-local void gen_codes      OF((ct_data *tree, int max_code, ushf *bl_count));
-local void build_tree     OF((deflate_state *s, tree_desc *desc));
-local void scan_tree      OF((deflate_state *s, ct_data *tree, int max_code));
-local void send_tree      OF((deflate_state *s, ct_data *tree, int max_code));
-local int  build_bl_tree  OF((deflate_state *s));
-local void send_all_trees OF((deflate_state *s, int lcodes, int dcodes,
-                              int blcodes));
-local void compress_block OF((deflate_state *s, const ct_data *ltree,
-                              const ct_data *dtree));
-local int  detect_data_type OF((deflate_state *s));
-local unsigned bi_reverse OF((unsigned code, int len));
-local void bi_windup      OF((deflate_state *s));
-local void bi_flush       OF((deflate_state *s));
+/* ===========================================================================
+ * Flush the bit buffer, keeping at most 7 bits in it.
+ */
+local void bi_flush(deflate_state *s) {
+    if (s->bi_valid == 16) {
+        put_short(s, s->bi_buf);
+        s->bi_buf = 0;
+        s->bi_valid = 0;
+    } else if (s->bi_valid >= 8) {
+        put_byte(s, (Byte)s->bi_buf);
+        s->bi_buf >>= 8;
+        s->bi_valid -= 8;
+    }
+}
+
+/* ===========================================================================
+ * Flush the bit buffer and align the output on a byte boundary
+ */
+local void bi_windup(deflate_state *s) {
+    if (s->bi_valid > 8) {
+        put_short(s, s->bi_buf);
+    } else if (s->bi_valid > 0) {
+        put_byte(s, (Byte)s->bi_buf);
+    }
+    s->bi_buf = 0;
+    s->bi_valid = 0;
+#ifdef ZLIB_DEBUG
+    s->bits_sent = (s->bits_sent + 7) & ~7;
+#endif
+}
+
+/* ===========================================================================
+ * Generate the codes for a given tree and bit counts (which need not be
+ * optimal).
+ * IN assertion: the array bl_count contains the bit length statistics for
+ * the given tree and the field len is set for all tree elements.
+ * OUT assertion: the field code is set for all tree elements of non
+ *     zero code length.
+ */
+local void gen_codes(ct_data *tree, int max_code, ushf *bl_count) {
+    ush next_code[MAX_BITS+1]; /* next code value for each bit length */
+    unsigned code = 0;         /* running code value */
+    int bits;                  /* bit index */
+    int n;                     /* code index */
+
+    /* The distribution counts are first used to generate the code values
+     * without bit reversal.
+     */
+    for (bits = 1; bits <= MAX_BITS; bits++) {
+        code = (code + bl_count[bits - 1]) << 1;
+        next_code[bits] = (ush)code;
+    }
+    /* Check that the bit counts in bl_count are consistent. The last code
+     * must be all ones.
+     */
+    Assert (code + bl_count[MAX_BITS] - 1 == (1 << MAX_BITS) - 1,
+            "inconsistent bit counts");
+    Tracev((stderr,"\ngen_codes: max_code %d ", max_code));
+
+    for (n = 0;  n <= max_code; n++) {
+        int len = tree[n].Len;
+        if (len == 0) continue;
+        /* Now reverse the bits */
+        tree[n].Code = (ush)bi_reverse(next_code[len]++, len);
+
+        Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
+            n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len] - 1));
+    }
+}
 
 #ifdef GEN_TREES_H
-local void gen_trees_header OF((void));
+local void gen_trees_header(void);
 #endif
 
 #ifndef ZLIB_DEBUG
@@ -167,27 +244,12 @@ local void gen_trees_header OF((void));
        send_bits(s, tree[c].Code, tree[c].Len); }
 #endif
 
-/* ===========================================================================
- * Output a short LSB first on the stream.
- * IN assertion: there is enough room in pendingBuf.
- */
-#define put_short(s, w) { \
-    put_byte(s, (uch)((w) & 0xff)); \
-    put_byte(s, (uch)((ush)(w) >> 8)); \
-}
-
 /* ===========================================================================
  * Send a value on a given number of bits.
  * IN assertion: length <= 16 and value fits in length bits.
  */
 #ifdef ZLIB_DEBUG
-local void send_bits      OF((deflate_state *s, int value, int length));
-
-local void send_bits(s, value, length)
-    deflate_state *s;
-    int value;  /* value to send */
-    int length; /* number of bits */
-{
+local void send_bits(deflate_state *s, int value, int length) {
     Tracevv((stderr," l %2d v %4x ", length, value));
     Assert(length > 0 && length <= 15, "invalid length");
     s->bits_sent += (ulg)length;
@@ -229,8 +291,7 @@ local void send_bits(s, value, length)
 /* ===========================================================================
  * Initialize the various 'constant' tables.
  */
-local void tr_static_init()
-{
+local void tr_static_init(void) {
 #if defined(GEN_TREES_H) || !defined(STDC)
     static int static_init_done = 0;
     int n;        /* iterates over tree elements */
@@ -323,8 +384,7 @@ local void tr_static_init()
       ((i) == (last)? "\n};\n\n" :    \
        ((i) % (width) == (width) - 1 ? ",\n" : ", "))
 
-void gen_trees_header()
-{
+void gen_trees_header(void) {
     FILE *header = fopen("trees.h", "w");
     int i;
 
@@ -373,12 +433,26 @@ void gen_trees_header()
 }
 #endif /* GEN_TREES_H */
 
+/* ===========================================================================
+ * Initialize a new block.
+ */
+local void init_block(deflate_state *s) {
+    int n; /* iterates over tree elements */
+
+    /* Initialize the trees. */
+    for (n = 0; n < L_CODES;  n++) s->dyn_ltree[n].Freq = 0;
+    for (n = 0; n < D_CODES;  n++) s->dyn_dtree[n].Freq = 0;
+    for (n = 0; n < BL_CODES; n++) s->bl_tree[n].Freq = 0;
+
+    s->dyn_ltree[END_BLOCK].Freq = 1;
+    s->opt_len = s->static_len = 0L;
+    s->sym_next = s->matches = 0;
+}
+
 /* ===========================================================================
  * Initialize the tree data structures for a new zlib stream.
  */
-void ZLIB_INTERNAL _tr_init(s)
-    deflate_state *s;
-{
+void ZLIB_INTERNAL _tr_init(deflate_state *s) {
     tr_static_init();
 
     s->l_desc.dyn_tree = s->dyn_ltree;
@@ -401,24 +475,6 @@ void ZLIB_INTERNAL _tr_init(s)
     init_block(s);
 }
 
-/* ===========================================================================
- * Initialize a new block.
- */
-local void init_block(s)
-    deflate_state *s;
-{
-    int n; /* iterates over tree elements */
-
-    /* Initialize the trees. */
-    for (n = 0; n < L_CODES;  n++) s->dyn_ltree[n].Freq = 0;
-    for (n = 0; n < D_CODES;  n++) s->dyn_dtree[n].Freq = 0;
-    for (n = 0; n < BL_CODES; n++) s->bl_tree[n].Freq = 0;
-
-    s->dyn_ltree[END_BLOCK].Freq = 1;
-    s->opt_len = s->static_len = 0L;
-    s->sym_next = s->matches = 0;
-}
-
 #define SMALLEST 1
 /* Index within the heap array of least frequent node in the Huffman tree */
 
@@ -448,11 +504,7 @@ local void init_block(s)
  * when the heap property is re-established (each father smaller than its
  * two sons).
  */
-local void pqdownheap(s, tree, k)
-    deflate_state *s;
-    ct_data *tree;  /* the tree to restore */
-    int k;               /* node to move down */
-{
+local void pqdownheap(deflate_state *s, ct_data *tree, int k) {
     int v = s->heap[k];
     int j = k << 1;  /* left son of k */
     while (j <= s->heap_len) {
@@ -483,10 +535,7 @@ local void pqdownheap(s, tree, k)
  *     The length opt_len is updated; static_len is also updated if stree is
  *     not null.
  */
-local void gen_bitlen(s, desc)
-    deflate_state *s;
-    tree_desc *desc;    /* the tree descriptor */
-{
+local void gen_bitlen(deflate_state *s, tree_desc *desc) {
     ct_data *tree        = desc->dyn_tree;
     int max_code         = desc->max_code;
     const ct_data *stree = desc->stat_desc->static_tree;
@@ -561,48 +610,9 @@ local void gen_bitlen(s, desc)
     }
 }
 
-/* ===========================================================================
- * Generate the codes for a given tree and bit counts (which need not be
- * optimal).
- * IN assertion: the array bl_count contains the bit length statistics for
- * the given tree and the field len is set for all tree elements.
- * OUT assertion: the field code is set for all tree elements of non
- *     zero code length.
- */
-local void gen_codes(tree, max_code, bl_count)
-    ct_data *tree;             /* the tree to decorate */
-    int max_code;              /* largest code with non zero frequency */
-    ushf *bl_count;            /* number of codes at each bit length */
-{
-    ush next_code[MAX_BITS+1]; /* next code value for each bit length */
-    unsigned code = 0;         /* running code value */
-    int bits;                  /* bit index */
-    int n;                     /* code index */
-
-    /* The distribution counts are first used to generate the code values
-     * without bit reversal.
-     */
-    for (bits = 1; bits <= MAX_BITS; bits++) {
-        code = (code + bl_count[bits - 1]) << 1;
-        next_code[bits] = (ush)code;
-    }
-    /* Check that the bit counts in bl_count are consistent. The last code
-     * must be all ones.
-     */
-    Assert (code + bl_count[MAX_BITS] - 1 == (1 << MAX_BITS) - 1,
-            "inconsistent bit counts");
-    Tracev((stderr,"\ngen_codes: max_code %d ", max_code));
-
-    for (n = 0;  n <= max_code; n++) {
-        int len = tree[n].Len;
-        if (len == 0) continue;
-        /* Now reverse the bits */
-        tree[n].Code = (ush)bi_reverse(next_code[len]++, len);
-
-        Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
-            n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len] - 1));
-    }
-}
+#ifdef DUMP_BL_TREE
+#  include <stdio.h>
+#endif
 
 /* ===========================================================================
  * Construct one Huffman tree and assigns the code bit strings and lengths.
@@ -612,10 +622,7 @@ local void gen_codes(tree, max_code, bl_count)
  *     and corresponding code. The length opt_len is updated; static_len is
  *     also updated if stree is not null. The field max_code is set.
  */
-local void build_tree(s, desc)
-    deflate_state *s;
-    tree_desc *desc; /* the tree descriptor */
-{
+local void build_tree(deflate_state *s, tree_desc *desc) {
     ct_data *tree         = desc->dyn_tree;
     const ct_data *stree  = desc->stat_desc->static_tree;
     int elems             = desc->stat_desc->elems;
@@ -700,11 +707,7 @@ local void build_tree(s, desc)
  * Scan a literal or distance tree to determine the frequencies of the codes
  * in the bit length tree.
  */
-local void scan_tree(s, tree, max_code)
-    deflate_state *s;
-    ct_data *tree;   /* the tree to be scanned */
-    int max_code;    /* and its largest code of non zero frequency */
-{
+local void scan_tree(deflate_state *s, ct_data *tree, int max_code) {
     int n;                     /* iterates over all tree elements */
     int prevlen = -1;          /* last emitted length */
     int curlen;                /* length of current code */
@@ -745,11 +748,7 @@ local void scan_tree(s, tree, max_code)
  * Send a literal or distance tree in compressed form, using the codes in
  * bl_tree.
  */
-local void send_tree(s, tree, max_code)
-    deflate_state *s;
-    ct_data *tree; /* the tree to be scanned */
-    int max_code;       /* and its largest code of non zero frequency */
-{
+local void send_tree(deflate_state *s, ct_data *tree, int max_code) {
     int n;                     /* iterates over all tree elements */
     int prevlen = -1;          /* last emitted length */
     int curlen;                /* length of current code */
@@ -796,9 +795,7 @@ local void send_tree(s, tree, max_code)
  * Construct the Huffman tree for the bit lengths and return the index in
  * bl_order of the last bit length code to send.
  */
-local int build_bl_tree(s)
-    deflate_state *s;
-{
+local int build_bl_tree(deflate_state *s) {
     int max_blindex;  /* index of last bit length code of non zero freq */
 
     /* Determine the bit length frequencies for literal and distance trees */
@@ -831,10 +828,8 @@ local int build_bl_tree(s)
  * lengths of the bit length codes, the literal tree and the distance tree.
  * IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4.
  */
-local void send_all_trees(s, lcodes, dcodes, blcodes)
-    deflate_state *s;
-    int lcodes, dcodes, blcodes; /* number of codes for each tree */
-{
+local void send_all_trees(deflate_state *s, int lcodes, int dcodes,
+                          int blcodes) {
     int rank;                    /* index in bl_order */
 
     Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes");
@@ -860,12 +855,8 @@ local void send_all_trees(s, lcodes, dcodes, blcodes)
 /* ===========================================================================
  * Send a stored block
  */
-void ZLIB_INTERNAL _tr_stored_block(s, buf, stored_len, last)
-    deflate_state *s;
-    charf *buf;       /* input block */
-    ulg stored_len;   /* length of input block */
-    int last;         /* one if this is the last block for a file */
-{
+void ZLIB_INTERNAL _tr_stored_block(deflate_state *s, charf *buf,
+                                    ulg stored_len, int last) {
     send_bits(s, (STORED_BLOCK<<1) + last, 3);  /* send block type */
     bi_windup(s);        /* align on byte boundary */
     put_short(s, (ush)stored_len);
@@ -884,9 +875,7 @@ void ZLIB_INTERNAL _tr_stored_block(s, buf, stored_len, last)
 /* ===========================================================================
  * Flush the bits in the bit buffer to pending output (leaves at most 7 bits)
  */
-void ZLIB_INTERNAL _tr_flush_bits(s)
-    deflate_state *s;
-{
+void ZLIB_INTERNAL _tr_flush_bits(deflate_state *s) {
     bi_flush(s);
 }
 
@@ -894,9 +883,7 @@ void ZLIB_INTERNAL _tr_flush_bits(s)
  * Send one empty static block to give enough lookahead for inflate.
  * This takes 10 bits, of which 7 may remain in the bit buffer.
  */
-void ZLIB_INTERNAL _tr_align(s)
-    deflate_state *s;
-{
+void ZLIB_INTERNAL _tr_align(deflate_state *s) {
     send_bits(s, STATIC_TREES<<1, 3);
     send_code(s, END_BLOCK, static_ltree);
 #ifdef ZLIB_DEBUG
@@ -905,16 +892,108 @@ void ZLIB_INTERNAL _tr_align(s)
     bi_flush(s);
 }
 
+/* ===========================================================================
+ * Send the block data compressed using the given Huffman trees
+ */
+local void compress_block(deflate_state *s, const ct_data *ltree,
+                          const ct_data *dtree) {
+    unsigned dist;      /* distance of matched string */
+    int lc;             /* match length or unmatched char (if dist == 0) */
+    unsigned sx = 0;    /* running index in symbol buffers */
+    unsigned code;      /* the code to send */
+    int extra;          /* number of extra bits to send */
+
+    if (s->sym_next != 0) do {
+#ifdef LIT_MEM
+        dist = s->d_buf[sx];
+        lc = s->l_buf[sx++];
+#else
+        dist = s->sym_buf[sx++] & 0xff;
+        dist += (unsigned)(s->sym_buf[sx++] & 0xff) << 8;
+        lc = s->sym_buf[sx++];
+#endif
+        if (dist == 0) {
+            send_code(s, lc, ltree); /* send a literal byte */
+            Tracecv(isgraph(lc), (stderr," '%c' ", lc));
+        } else {
+            /* Here, lc is the match length - MIN_MATCH */
+            code = _length_code[lc];
+            send_code(s, code + LITERALS + 1, ltree);   /* send length code */
+            extra = extra_lbits[code];
+            if (extra != 0) {
+                lc -= base_length[code];
+                send_bits(s, lc, extra);       /* send the extra length bits */
+            }
+            dist--; /* dist is now the match distance - 1 */
+            code = d_code(dist);
+            Assert (code < D_CODES, "bad d_code");
+
+            send_code(s, code, dtree);       /* send the distance code */
+            extra = extra_dbits[code];
+            if (extra != 0) {
+                dist -= (unsigned)base_dist[code];
+                send_bits(s, dist, extra);   /* send the extra distance bits */
+            }
+        } /* literal or match pair ? */
+
+        /* Check for no overlay of pending_buf on needed symbols */
+#ifdef LIT_MEM
+        Assert(s->pending < 2 * (s->lit_bufsize + sx), "pendingBuf overflow");
+#else
+        Assert(s->pending < s->lit_bufsize + sx, "pendingBuf overflow");
+#endif
+
+    } while (sx < s->sym_next);
+
+    send_code(s, END_BLOCK, ltree);
+}
+
+/* ===========================================================================
+ * Check if the data type is TEXT or BINARY, using the following algorithm:
+ * - TEXT if the two conditions below are satisfied:
+ *    a) There are no non-portable control characters belonging to the
+ *       "block list" (0..6, 14..25, 28..31).
+ *    b) There is at least one printable character belonging to the
+ *       "allow list" (9 {TAB}, 10 {LF}, 13 {CR}, 32..255).
+ * - BINARY otherwise.
+ * - The following partially-portable control characters form a
+ *   "gray list" that is ignored in this detection algorithm:
+ *   (7 {BEL}, 8 {BS}, 11 {VT}, 12 {FF}, 26 {SUB}, 27 {ESC}).
+ * IN assertion: the fields Freq of dyn_ltree are set.
+ */
+local int detect_data_type(deflate_state *s) {
+    /* block_mask is the bit mask of block-listed bytes
+     * set bits 0..6, 14..25, and 28..31
+     * 0xf3ffc07f = binary 11110011111111111100000001111111
+     */
+    unsigned long block_mask = 0xf3ffc07fUL;
+    int n;
+
+    /* Check for non-textual ("block-listed") bytes. */
+    for (n = 0; n <= 31; n++, block_mask >>= 1)
+        if ((block_mask & 1) && (s->dyn_ltree[n].Freq != 0))
+            return Z_BINARY;
+
+    /* Check for textual ("allow-listed") bytes. */
+    if (s->dyn_ltree[9].Freq != 0 || s->dyn_ltree[10].Freq != 0
+            || s->dyn_ltree[13].Freq != 0)
+        return Z_TEXT;
+    for (n = 32; n < LITERALS; n++)
+        if (s->dyn_ltree[n].Freq != 0)
+            return Z_TEXT;
+
+    /* There are no "block-listed" or "allow-listed" bytes:
+     * this stream either is empty or has tolerated ("gray-listed") bytes only.
+     */
+    return Z_BINARY;
+}
+
 /* ===========================================================================
  * Determine the best encoding for the current block: dynamic trees, static
  * trees or store, and write out the encoded block.
  */
-void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
-    deflate_state *s;
-    charf *buf;       /* input block, or NULL if too old */
-    ulg stored_len;   /* length of input block */
-    int last;         /* one if this is the last block for a file */
-{
+void ZLIB_INTERNAL _tr_flush_block(deflate_state *s, charf *buf,
+                                   ulg stored_len, int last) {
     ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */
     int max_blindex = 0;  /* index of last bit length code of non zero freq */
 
@@ -1011,14 +1090,15 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
  * Save the match info and tally the frequency counts. Return true if
  * the current block must be flushed.
  */
-int ZLIB_INTERNAL _tr_tally(s, dist, lc)
-    deflate_state *s;
-    unsigned dist;  /* distance of matched string */
-    unsigned lc;    /* match length - MIN_MATCH or unmatched char (dist==0) */
-{
+int ZLIB_INTERNAL _tr_tally(deflate_state *s, unsigned dist, unsigned lc) {
+#ifdef LIT_MEM
+    s->d_buf[s->sym_next] = (ush)dist;
+    s->l_buf[s->sym_next++] = (uch)lc;
+#else
     s->sym_buf[s->sym_next++] = (uch)dist;
     s->sym_buf[s->sym_next++] = (uch)(dist >> 8);
     s->sym_buf[s->sym_next++] = (uch)lc;
+#endif
     if (dist == 0) {
         /* lc is the unmatched char */
         s->dyn_ltree[lc].Freq++;
@@ -1035,147 +1115,3 @@ int ZLIB_INTERNAL _tr_tally(s, dist, lc)
     }
     return (s->sym_next == s->sym_end);
 }
-
-/* ===========================================================================
- * Send the block data compressed using the given Huffman trees
- */
-local void compress_block(s, ltree, dtree)
-    deflate_state *s;
-    const ct_data *ltree; /* literal tree */
-    const ct_data *dtree; /* distance tree */
-{
-    unsigned dist;      /* distance of matched string */
-    int lc;             /* match length or unmatched char (if dist == 0) */
-    unsigned sx = 0;    /* running index in sym_buf */
-    unsigned code;      /* the code to send */
-    int extra;          /* number of extra bits to send */
-
-    if (s->sym_next != 0) do {
-        dist = s->sym_buf[sx++] & 0xff;
-        dist += (unsigned)(s->sym_buf[sx++] & 0xff) << 8;
-        lc = s->sym_buf[sx++];
-        if (dist == 0) {
-            send_code(s, lc, ltree); /* send a literal byte */
-            Tracecv(isgraph(lc), (stderr," '%c' ", lc));
-        } else {
-            /* Here, lc is the match length - MIN_MATCH */
-            code = _length_code[lc];
-            send_code(s, code + LITERALS + 1, ltree);   /* send length code */
-            extra = extra_lbits[code];
-            if (extra != 0) {
-                lc -= base_length[code];
-                send_bits(s, lc, extra);       /* send the extra length bits */
-            }
-            dist--; /* dist is now the match distance - 1 */
-            code = d_code(dist);
-            Assert (code < D_CODES, "bad d_code");
-
-            send_code(s, code, dtree);       /* send the distance code */
-            extra = extra_dbits[code];
-            if (extra != 0) {
-                dist -= (unsigned)base_dist[code];
-                send_bits(s, dist, extra);   /* send the extra distance bits */
-            }
-        } /* literal or match pair ? */
-
-        /* Check that the overlay between pending_buf and sym_buf is ok: */
-        Assert(s->pending < s->lit_bufsize + sx, "pendingBuf overflow");
-
-    } while (sx < s->sym_next);
-
-    send_code(s, END_BLOCK, ltree);
-}
-
-/* ===========================================================================
- * Check if the data type is TEXT or BINARY, using the following algorithm:
- * - TEXT if the two conditions below are satisfied:
- *    a) There are no non-portable control characters belonging to the
- *       "block list" (0..6, 14..25, 28..31).
- *    b) There is at least one printable character belonging to the
- *       "allow list" (9 {TAB}, 10 {LF}, 13 {CR}, 32..255).
- * - BINARY otherwise.
- * - The following partially-portable control characters form a
- *   "gray list" that is ignored in this detection algorithm:
- *   (7 {BEL}, 8 {BS}, 11 {VT}, 12 {FF}, 26 {SUB}, 27 {ESC}).
- * IN assertion: the fields Freq of dyn_ltree are set.
- */
-local int detect_data_type(s)
-    deflate_state *s;
-{
-    /* block_mask is the bit mask of block-listed bytes
-     * set bits 0..6, 14..25, and 28..31
-     * 0xf3ffc07f = binary 11110011111111111100000001111111
-     */
-    unsigned long block_mask = 0xf3ffc07fUL;
-    int n;
-
-    /* Check for non-textual ("block-listed") bytes. */
-    for (n = 0; n <= 31; n++, block_mask >>= 1)
-        if ((block_mask & 1) && (s->dyn_ltree[n].Freq != 0))
-            return Z_BINARY;
-
-    /* Check for textual ("allow-listed") bytes. */
-    if (s->dyn_ltree[9].Freq != 0 || s->dyn_ltree[10].Freq != 0
-            || s->dyn_ltree[13].Freq != 0)
-        return Z_TEXT;
-    for (n = 32; n < LITERALS; n++)
-        if (s->dyn_ltree[n].Freq != 0)
-            return Z_TEXT;
-
-    /* There are no "block-listed" or "allow-listed" bytes:
-     * this stream either is empty or has tolerated ("gray-listed") bytes only.
-     */
-    return Z_BINARY;
-}
-
-/* ===========================================================================
- * Reverse the first len bits of a code, using straightforward code (a faster
- * method would use a table)
- * IN assertion: 1 <= len <= 15
- */
-local unsigned bi_reverse(code, len)
-    unsigned code; /* the value to invert */
-    int len;       /* its bit length */
-{
-    register unsigned res = 0;
-    do {
-        res |= code & 1;
-        code >>= 1, res <<= 1;
-    } while (--len > 0);
-    return res >> 1;
-}
-
-/* ===========================================================================
- * Flush the bit buffer, keeping at most 7 bits in it.
- */
-local void bi_flush(s)
-    deflate_state *s;
-{
-    if (s->bi_valid == 16) {
-        put_short(s, s->bi_buf);
-        s->bi_buf = 0;
-        s->bi_valid = 0;
-    } else if (s->bi_valid >= 8) {
-        put_byte(s, (Byte)s->bi_buf);
-        s->bi_buf >>= 8;
-        s->bi_valid -= 8;
-    }
-}
-
-/* ===========================================================================
- * Flush the bit buffer and align the output on a byte boundary
- */
-local void bi_windup(s)
-    deflate_state *s;
-{
-    if (s->bi_valid > 8) {
-        put_short(s, s->bi_buf);
-    } else if (s->bi_valid > 0) {
-        put_byte(s, (Byte)s->bi_buf);
-    }
-    s->bi_buf = 0;
-    s->bi_valid = 0;
-#ifdef ZLIB_DEBUG
-    s->bits_sent = (s->bits_sent + 7) & ~7;
-#endif
-}
diff --git a/3rdparty/zlib/uncompr.c b/3rdparty/zlib/uncompr.c
index f9532f46c1a6..5e256663b451 100644
--- a/3rdparty/zlib/uncompr.c
+++ b/3rdparty/zlib/uncompr.c
@@ -24,12 +24,8 @@
    Z_DATA_ERROR if the input data was corrupted, including if the input data is
    an incomplete zlib stream.
 */
-int ZEXPORT uncompress2(dest, destLen, source, sourceLen)
-    Bytef *dest;
-    uLongf *destLen;
-    const Bytef *source;
-    uLong *sourceLen;
-{
+int ZEXPORT uncompress2(Bytef *dest, uLongf *destLen, const Bytef *source,
+                        uLong *sourceLen) {
     z_stream stream;
     int err;
     const uInt max = (uInt)-1;
@@ -83,11 +79,7 @@ int ZEXPORT uncompress2(dest, destLen, source, sourceLen)
            err;
 }
 
-int ZEXPORT uncompress(dest, destLen, source, sourceLen)
-    Bytef *dest;
-    uLongf *destLen;
-    const Bytef *source;
-    uLong sourceLen;
-{
+int ZEXPORT uncompress(Bytef *dest, uLongf *destLen, const Bytef *source,
+                       uLong sourceLen) {
     return uncompress2(dest, destLen, source, &sourceLen);
 }
diff --git a/3rdparty/zlib/zconf.h b/3rdparty/zlib/zconf.h.cmakein
similarity index 96%
rename from 3rdparty/zlib/zconf.h
rename to 3rdparty/zlib/zconf.h.cmakein
index bf977d3e70ad..96e9296a9a9a 100644
--- a/3rdparty/zlib/zconf.h
+++ b/3rdparty/zlib/zconf.h.cmakein
@@ -1,5 +1,5 @@
 /* zconf.h -- configuration of the zlib compression library
- * Copyright (C) 1995-2016 Jean-loup Gailly, Mark Adler
+ * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
@@ -7,6 +7,8 @@
 
 #ifndef ZCONF_H
 #define ZCONF_H
+#cmakedefine Z_PREFIX
+#cmakedefine Z_HAVE_UNISTD_H
 
 /*
  * If you *really* need a unique prefix for all types and library functions,
@@ -241,7 +243,11 @@
 #endif
 
 #ifdef Z_SOLO
-   typedef unsigned long z_size_t;
+#  ifdef _WIN64
+     typedef unsigned long long z_size_t;
+#  else
+     typedef unsigned long z_size_t;
+#  endif
 #else
 #  define z_longlong long long
 #  if defined(NO_SIZE_T)
@@ -296,14 +302,6 @@
 #  endif
 #endif
 
-#ifndef Z_ARG /* function prototypes for stdarg */
-#  if defined(STDC) || defined(Z_HAVE_STDARG_H)
-#    define Z_ARG(args)  args
-#  else
-#    define Z_ARG(args)  ()
-#  endif
-#endif
-
 /* The following definitions for FAR are needed only for MSDOS mixed
  * model programming (small or medium model with some far allocations).
  * This was tested only with MSC; for other MSDOS compilers you may have
@@ -474,12 +472,8 @@ typedef uLong FAR uLongf;
 #endif
 
 #ifndef Z_HAVE_UNISTD_H
-#  ifdef __WATCOMC__
-#    define Z_HAVE_UNISTD_H
-#  endif
-#endif
-#ifndef Z_HAVE_UNISTD_H
-#  if defined(_LARGEFILE64_SOURCE) && !defined(_WIN32)
+#  if defined(__WATCOMC__) || defined(__GO32__) || \
+      (defined(_LARGEFILE64_SOURCE) && !defined(_WIN32))
 #    define Z_HAVE_UNISTD_H
 #  endif
 #endif
@@ -519,12 +513,12 @@ typedef uLong FAR uLongf;
 
 #if !defined(_WIN32) && defined(Z_LARGE64)
 #  define z_off64_t off64_t
+#elif defined(_WIN32) && !defined(__GNUC__)
+#  define z_off64_t __int64
+#elif defined(__GO32__)
+#  define z_off64_t offset_t
 #else
-#  if defined(_WIN32) && !defined(__GNUC__) && !defined(Z_SOLO)
-#    define z_off64_t __int64
-#  else
-#    define z_off64_t z_off_t
-#  endif
+#  define z_off64_t z_off_t
 #endif
 
 /* MVS linker does not support external names larger than 8 bytes */
diff --git a/3rdparty/zlib/zlib.h b/3rdparty/zlib/zlib.h
index 953cb5012dc2..8d4b932eaf6a 100644
--- a/3rdparty/zlib/zlib.h
+++ b/3rdparty/zlib/zlib.h
@@ -1,7 +1,7 @@
 /* zlib.h -- interface of the 'zlib' general purpose compression library
-  version 1.2.13, October 13th, 2022
+  version 1.3.1, January 22nd, 2024
 
-  Copyright (C) 1995-2022 Jean-loup Gailly and Mark Adler
+  Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
 
   This software is provided 'as-is', without any express or implied
   warranty.  In no event will the authors be held liable for any damages
@@ -37,11 +37,11 @@
 extern "C" {
 #endif
 
-#define ZLIB_VERSION "1.2.13"
-#define ZLIB_VERNUM 0x12d0
+#define ZLIB_VERSION "1.3.1"
+#define ZLIB_VERNUM 0x1310
 #define ZLIB_VER_MAJOR 1
-#define ZLIB_VER_MINOR 2
-#define ZLIB_VER_REVISION 13
+#define ZLIB_VER_MINOR 3
+#define ZLIB_VER_REVISION 1
 #define ZLIB_VER_SUBREVISION 0
 
 /*
@@ -78,8 +78,8 @@ extern "C" {
   even in the case of corrupted input.
 */
 
-typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size));
-typedef void   (*free_func)  OF((voidpf opaque, voidpf address));
+typedef voidpf (*alloc_func)(voidpf opaque, uInt items, uInt size);
+typedef void   (*free_func)(voidpf opaque, voidpf address);
 
 struct internal_state;
 
@@ -217,7 +217,7 @@ typedef gz_header FAR *gz_headerp;
 
                         /* basic functions */
 
-ZEXTERN const char * ZEXPORT zlibVersion OF((void));
+ZEXTERN const char * ZEXPORT zlibVersion(void);
 /* The application can compare zlibVersion and ZLIB_VERSION for consistency.
    If the first character differs, the library code actually used is not
    compatible with the zlib.h header file used by the application.  This check
@@ -225,12 +225,12 @@ ZEXTERN const char * ZEXPORT zlibVersion OF((void));
  */
 
 /*
-ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level));
+ZEXTERN int ZEXPORT deflateInit(z_streamp strm, int level);
 
      Initializes the internal stream state for compression.  The fields
    zalloc, zfree and opaque must be initialized before by the caller.  If
    zalloc and zfree are set to Z_NULL, deflateInit updates them to use default
-   allocation functions.
+   allocation functions.  total_in, total_out, adler, and msg are initialized.
 
      The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
    1 gives best speed, 9 gives best compression, 0 gives no compression at all
@@ -247,7 +247,7 @@ ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level));
 */
 
 
-ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
+ZEXTERN int ZEXPORT deflate(z_streamp strm, int flush);
 /*
     deflate compresses as much data as possible, and stops when the input
   buffer becomes empty or the output buffer becomes full.  It may introduce
@@ -320,8 +320,8 @@ ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
   with the same value of the flush parameter and more output space (updated
   avail_out), until the flush is complete (deflate returns with non-zero
   avail_out).  In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
-  avail_out is greater than six to avoid repeated flush markers due to
-  avail_out == 0 on return.
+  avail_out is greater than six when the flush marker begins, in order to avoid
+  repeated flush markers upon calling deflate() again when avail_out == 0.
 
     If the parameter flush is set to Z_FINISH, pending input is processed,
   pending output is flushed and deflate returns with Z_STREAM_END if there was
@@ -360,7 +360,7 @@ ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
 */
 
 
-ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
+ZEXTERN int ZEXPORT deflateEnd(z_streamp strm);
 /*
      All dynamically allocated data structures for this stream are freed.
    This function discards any unprocessed input and does not flush any pending
@@ -375,7 +375,7 @@ ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
 
 
 /*
-ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
+ZEXTERN int ZEXPORT inflateInit(z_streamp strm);
 
      Initializes the internal stream state for decompression.  The fields
    next_in, avail_in, zalloc, zfree and opaque must be initialized before by
@@ -383,7 +383,8 @@ ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
    read or consumed.  The allocation of a sliding window will be deferred to
    the first call of inflate (if the decompression does not complete on the
    first call).  If zalloc and zfree are set to Z_NULL, inflateInit updates
-   them to use default allocation functions.
+   them to use default allocation functions.  total_in, total_out, adler, and
+   msg are initialized.
 
      inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
    memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
@@ -397,7 +398,7 @@ ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
 */
 
 
-ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));
+ZEXTERN int ZEXPORT inflate(z_streamp strm, int flush);
 /*
     inflate decompresses as much data as possible, and stops when the input
   buffer becomes empty or the output buffer becomes full.  It may introduce
@@ -517,7 +518,7 @@ ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));
 */
 
 
-ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm));
+ZEXTERN int ZEXPORT inflateEnd(z_streamp strm);
 /*
      All dynamically allocated data structures for this stream are freed.
    This function discards any unprocessed input and does not flush any pending
@@ -535,12 +536,12 @@ ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm));
 */
 
 /*
-ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm,
-                                     int  level,
-                                     int  method,
-                                     int  windowBits,
-                                     int  memLevel,
-                                     int  strategy));
+ZEXTERN int ZEXPORT deflateInit2(z_streamp strm,
+                                 int level,
+                                 int method,
+                                 int windowBits,
+                                 int memLevel,
+                                 int strategy);
 
      This is another version of deflateInit with more compression options.  The
    fields zalloc, zfree and opaque must be initialized before by the caller.
@@ -607,9 +608,9 @@ ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm,
    compression: this will be done by deflate().
 */
 
-ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
-                                             const Bytef *dictionary,
-                                             uInt  dictLength));
+ZEXTERN int ZEXPORT deflateSetDictionary(z_streamp strm,
+                                         const Bytef *dictionary,
+                                         uInt  dictLength);
 /*
      Initializes the compression dictionary from the given byte sequence
    without producing any compressed output.  When using the zlib format, this
@@ -651,9 +652,9 @@ ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
    not perform any compression: this will be done by deflate().
 */
 
-ZEXTERN int ZEXPORT deflateGetDictionary OF((z_streamp strm,
-                                             Bytef *dictionary,
-                                             uInt  *dictLength));
+ZEXTERN int ZEXPORT deflateGetDictionary(z_streamp strm,
+                                         Bytef *dictionary,
+                                         uInt  *dictLength);
 /*
      Returns the sliding dictionary being maintained by deflate.  dictLength is
    set to the number of bytes in the dictionary, and that many bytes are copied
@@ -673,8 +674,8 @@ ZEXTERN int ZEXPORT deflateGetDictionary OF((z_streamp strm,
    stream state is inconsistent.
 */
 
-ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
-                                    z_streamp source));
+ZEXTERN int ZEXPORT deflateCopy(z_streamp dest,
+                                z_streamp source);
 /*
      Sets the destination stream as a complete copy of the source stream.
 
@@ -691,20 +692,20 @@ ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
    destination.
 */
 
-ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm));
+ZEXTERN int ZEXPORT deflateReset(z_streamp strm);
 /*
      This function is equivalent to deflateEnd followed by deflateInit, but
    does not free and reallocate the internal compression state.  The stream
    will leave the compression level and any other attributes that may have been
-   set unchanged.
+   set unchanged.  total_in, total_out, adler, and msg are initialized.
 
      deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
    stream state was inconsistent (such as zalloc or state being Z_NULL).
 */
 
-ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
-                                      int level,
-                                      int strategy));
+ZEXTERN int ZEXPORT deflateParams(z_streamp strm,
+                                  int level,
+                                  int strategy);
 /*
      Dynamically update the compression level and compression strategy.  The
    interpretation of level and strategy is as in deflateInit2().  This can be
@@ -729,7 +730,7 @@ ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
    Then no more input data should be provided before the deflateParams() call.
    If this is done, the old level and strategy will be applied to the data
    compressed before deflateParams(), and the new level and strategy will be
-   applied to the the data compressed after deflateParams().
+   applied to the data compressed after deflateParams().
 
      deflateParams returns Z_OK on success, Z_STREAM_ERROR if the source stream
    state was inconsistent or if a parameter was invalid, or Z_BUF_ERROR if
@@ -740,11 +741,11 @@ ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
    retried with more output space.
 */
 
-ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm,
-                                    int good_length,
-                                    int max_lazy,
-                                    int nice_length,
-                                    int max_chain));
+ZEXTERN int ZEXPORT deflateTune(z_streamp strm,
+                                int good_length,
+                                int max_lazy,
+                                int nice_length,
+                                int max_chain);
 /*
      Fine tune deflate's internal compression parameters.  This should only be
    used by someone who understands the algorithm used by zlib's deflate for
@@ -757,8 +758,8 @@ ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm,
    returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream.
  */
 
-ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm,
-                                       uLong sourceLen));
+ZEXTERN uLong ZEXPORT deflateBound(z_streamp strm,
+                                   uLong sourceLen);
 /*
      deflateBound() returns an upper bound on the compressed size after
    deflation of sourceLen bytes.  It must be called after deflateInit() or
@@ -772,9 +773,9 @@ ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm,
    than Z_FINISH or Z_NO_FLUSH are used.
 */
 
-ZEXTERN int ZEXPORT deflatePending OF((z_streamp strm,
-                                       unsigned *pending,
-                                       int *bits));
+ZEXTERN int ZEXPORT deflatePending(z_streamp strm,
+                                   unsigned *pending,
+                                   int *bits);
 /*
      deflatePending() returns the number of bytes and bits of output that have
    been generated, but not yet provided in the available output.  The bytes not
@@ -787,9 +788,9 @@ ZEXTERN int ZEXPORT deflatePending OF((z_streamp strm,
    stream state was inconsistent.
  */
 
-ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm,
-                                     int bits,
-                                     int value));
+ZEXTERN int ZEXPORT deflatePrime(z_streamp strm,
+                                 int bits,
+                                 int value);
 /*
      deflatePrime() inserts bits in the deflate output stream.  The intent
    is that this function is used to start off the deflate output with the bits
@@ -804,8 +805,8 @@ ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm,
    source stream state was inconsistent.
 */
 
-ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
-                                         gz_headerp head));
+ZEXTERN int ZEXPORT deflateSetHeader(z_streamp strm,
+                                     gz_headerp head);
 /*
      deflateSetHeader() provides gzip header information for when a gzip
    stream is requested by deflateInit2().  deflateSetHeader() may be called
@@ -821,16 +822,17 @@ ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
    gzip file" and give up.
 
      If deflateSetHeader is not used, the default gzip header has text false,
-   the time set to zero, and os set to 255, with no extra, name, or comment
-   fields.  The gzip header is returned to the default state by deflateReset().
+   the time set to zero, and os set to the current operating system, with no
+   extra, name, or comment fields.  The gzip header is returned to the default
+   state by deflateReset().
 
      deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
    stream state was inconsistent.
 */
 
 /*
-ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm,
-                                     int  windowBits));
+ZEXTERN int ZEXPORT inflateInit2(z_streamp strm,
+                                 int windowBits);
 
      This is another version of inflateInit with an extra parameter.  The
    fields next_in, avail_in, zalloc, zfree and opaque must be initialized
@@ -883,9 +885,9 @@ ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm,
    deferred until inflate() is called.
 */
 
-ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
-                                             const Bytef *dictionary,
-                                             uInt  dictLength));
+ZEXTERN int ZEXPORT inflateSetDictionary(z_streamp strm,
+                                         const Bytef *dictionary,
+                                         uInt  dictLength);
 /*
      Initializes the decompression dictionary from the given uncompressed byte
    sequence.  This function must be called immediately after a call of inflate,
@@ -906,9 +908,9 @@ ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
    inflate().
 */
 
-ZEXTERN int ZEXPORT inflateGetDictionary OF((z_streamp strm,
-                                             Bytef *dictionary,
-                                             uInt  *dictLength));
+ZEXTERN int ZEXPORT inflateGetDictionary(z_streamp strm,
+                                         Bytef *dictionary,
+                                         uInt  *dictLength);
 /*
      Returns the sliding dictionary being maintained by inflate.  dictLength is
    set to the number of bytes in the dictionary, and that many bytes are copied
@@ -921,7 +923,7 @@ ZEXTERN int ZEXPORT inflateGetDictionary OF((z_streamp strm,
    stream state is inconsistent.
 */
 
-ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
+ZEXTERN int ZEXPORT inflateSync(z_streamp strm);
 /*
      Skips invalid compressed data until a possible full flush point (see above
    for the description of deflate with Z_FULL_FLUSH) can be found, or until all
@@ -934,14 +936,14 @@ ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
      inflateSync returns Z_OK if a possible full flush point has been found,
    Z_BUF_ERROR if no more input was provided, Z_DATA_ERROR if no flush point
    has been found, or Z_STREAM_ERROR if the stream structure was inconsistent.
-   In the success case, the application may save the current current value of
-   total_in which indicates where valid compressed data was found.  In the
-   error case, the application may repeatedly call inflateSync, providing more
-   input each time, until success or end of the input data.
+   In the success case, the application may save the current value of total_in
+   which indicates where valid compressed data was found.  In the error case,
+   the application may repeatedly call inflateSync, providing more input each
+   time, until success or end of the input data.
 */
 
-ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
-                                    z_streamp source));
+ZEXTERN int ZEXPORT inflateCopy(z_streamp dest,
+                                z_streamp source);
 /*
      Sets the destination stream as a complete copy of the source stream.
 
@@ -956,18 +958,19 @@ ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
    destination.
 */
 
-ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm));
+ZEXTERN int ZEXPORT inflateReset(z_streamp strm);
 /*
      This function is equivalent to inflateEnd followed by inflateInit,
    but does not free and reallocate the internal decompression state.  The
    stream will keep attributes that may have been set by inflateInit2.
+   total_in, total_out, adler, and msg are initialized.
 
      inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
    stream state was inconsistent (such as zalloc or state being Z_NULL).
 */
 
-ZEXTERN int ZEXPORT inflateReset2 OF((z_streamp strm,
-                                      int windowBits));
+ZEXTERN int ZEXPORT inflateReset2(z_streamp strm,
+                                  int windowBits);
 /*
      This function is the same as inflateReset, but it also permits changing
    the wrap and window size requests.  The windowBits parameter is interpreted
@@ -980,9 +983,9 @@ ZEXTERN int ZEXPORT inflateReset2 OF((z_streamp strm,
    the windowBits parameter is invalid.
 */
 
-ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm,
-                                     int bits,
-                                     int value));
+ZEXTERN int ZEXPORT inflatePrime(z_streamp strm,
+                                 int bits,
+                                 int value);
 /*
      This function inserts bits in the inflate input stream.  The intent is
    that this function is used to start inflating at a bit position in the
@@ -1001,7 +1004,7 @@ ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm,
    stream state was inconsistent.
 */
 
-ZEXTERN long ZEXPORT inflateMark OF((z_streamp strm));
+ZEXTERN long ZEXPORT inflateMark(z_streamp strm);
 /*
      This function returns two values, one in the lower 16 bits of the return
    value, and the other in the remaining upper bits, obtained by shifting the
@@ -1029,8 +1032,8 @@ ZEXTERN long ZEXPORT inflateMark OF((z_streamp strm));
    source stream state was inconsistent.
 */
 
-ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm,
-                                         gz_headerp head));
+ZEXTERN int ZEXPORT inflateGetHeader(z_streamp strm,
+                                     gz_headerp head);
 /*
      inflateGetHeader() requests that gzip header information be stored in the
    provided gz_header structure.  inflateGetHeader() may be called after
@@ -1070,8 +1073,8 @@ ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm,
 */
 
 /*
-ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits,
-                                        unsigned char FAR *window));
+ZEXTERN int ZEXPORT inflateBackInit(z_streamp strm, int windowBits,
+                                    unsigned char FAR *window);
 
      Initialize the internal stream state for decompression using inflateBack()
    calls.  The fields zalloc, zfree and opaque in strm must be initialized
@@ -1091,13 +1094,13 @@ ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits,
    the version of the header file.
 */
 
-typedef unsigned (*in_func) OF((void FAR *,
-                                z_const unsigned char FAR * FAR *));
-typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned));
+typedef unsigned (*in_func)(void FAR *,
+                            z_const unsigned char FAR * FAR *);
+typedef int (*out_func)(void FAR *, unsigned char FAR *, unsigned);
 
-ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
-                                    in_func in, void FAR *in_desc,
-                                    out_func out, void FAR *out_desc));
+ZEXTERN int ZEXPORT inflateBack(z_streamp strm,
+                                in_func in, void FAR *in_desc,
+                                out_func out, void FAR *out_desc);
 /*
      inflateBack() does a raw inflate with a single call using a call-back
    interface for input and output.  This is potentially more efficient than
@@ -1165,7 +1168,7 @@ ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
    cannot return Z_OK.
 */
 
-ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm));
+ZEXTERN int ZEXPORT inflateBackEnd(z_streamp strm);
 /*
      All memory allocated by inflateBackInit() is freed.
 
@@ -1173,7 +1176,7 @@ ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm));
    state was inconsistent.
 */
 
-ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void));
+ZEXTERN uLong ZEXPORT zlibCompileFlags(void);
 /* Return flags indicating compile-time options.
 
     Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other:
@@ -1226,8 +1229,8 @@ ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void));
    you need special options.
 */
 
-ZEXTERN int ZEXPORT compress OF((Bytef *dest,   uLongf *destLen,
-                                 const Bytef *source, uLong sourceLen));
+ZEXTERN int ZEXPORT compress(Bytef *dest,   uLongf *destLen,
+                             const Bytef *source, uLong sourceLen);
 /*
      Compresses the source buffer into the destination buffer.  sourceLen is
    the byte length of the source buffer.  Upon entry, destLen is the total size
@@ -1241,9 +1244,9 @@ ZEXTERN int ZEXPORT compress OF((Bytef *dest,   uLongf *destLen,
    buffer.
 */
 
-ZEXTERN int ZEXPORT compress2 OF((Bytef *dest,   uLongf *destLen,
-                                  const Bytef *source, uLong sourceLen,
-                                  int level));
+ZEXTERN int ZEXPORT compress2(Bytef *dest,   uLongf *destLen,
+                              const Bytef *source, uLong sourceLen,
+                              int level);
 /*
      Compresses the source buffer into the destination buffer.  The level
    parameter has the same meaning as in deflateInit.  sourceLen is the byte
@@ -1257,15 +1260,15 @@ ZEXTERN int ZEXPORT compress2 OF((Bytef *dest,   uLongf *destLen,
    Z_STREAM_ERROR if the level parameter is invalid.
 */
 
-ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen));
+ZEXTERN uLong ZEXPORT compressBound(uLong sourceLen);
 /*
      compressBound() returns an upper bound on the compressed size after
    compress() or compress2() on sourceLen bytes.  It would be used before a
    compress() or compress2() call to allocate the destination buffer.
 */
 
-ZEXTERN int ZEXPORT uncompress OF((Bytef *dest,   uLongf *destLen,
-                                   const Bytef *source, uLong sourceLen));
+ZEXTERN int ZEXPORT uncompress(Bytef *dest,   uLongf *destLen,
+                               const Bytef *source, uLong sourceLen);
 /*
      Decompresses the source buffer into the destination buffer.  sourceLen is
    the byte length of the source buffer.  Upon entry, destLen is the total size
@@ -1282,8 +1285,8 @@ ZEXTERN int ZEXPORT uncompress OF((Bytef *dest,   uLongf *destLen,
    buffer with the uncompressed data up to that point.
 */
 
-ZEXTERN int ZEXPORT uncompress2 OF((Bytef *dest,   uLongf *destLen,
-                                    const Bytef *source, uLong *sourceLen));
+ZEXTERN int ZEXPORT uncompress2(Bytef *dest,   uLongf *destLen,
+                                const Bytef *source, uLong *sourceLen);
 /*
      Same as uncompress, except that sourceLen is a pointer, where the
    length of the source is *sourceLen.  On return, *sourceLen is the number of
@@ -1302,7 +1305,7 @@ ZEXTERN int ZEXPORT uncompress2 OF((Bytef *dest,   uLongf *destLen,
 typedef struct gzFile_s *gzFile;    /* semi-opaque gzip file descriptor */
 
 /*
-ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode));
+ZEXTERN gzFile ZEXPORT gzopen(const char *path, const char *mode);
 
      Open the gzip (.gz) file at path for reading and decompressing, or
    compressing and writing.  The mode parameter is as in fopen ("rb" or "wb")
@@ -1339,7 +1342,7 @@ ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode));
    file could not be opened.
 */
 
-ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode));
+ZEXTERN gzFile ZEXPORT gzdopen(int fd, const char *mode);
 /*
      Associate a gzFile with the file descriptor fd.  File descriptors are
    obtained from calls like open, dup, creat, pipe or fileno (if the file has
@@ -1362,7 +1365,7 @@ ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode));
    will not detect if fd is invalid (unless fd is -1).
 */
 
-ZEXTERN int ZEXPORT gzbuffer OF((gzFile file, unsigned size));
+ZEXTERN int ZEXPORT gzbuffer(gzFile file, unsigned size);
 /*
      Set the internal buffer size used by this library's functions for file to
    size.  The default buffer size is 8192 bytes.  This function must be called
@@ -1378,7 +1381,7 @@ ZEXTERN int ZEXPORT gzbuffer OF((gzFile file, unsigned size));
    too late.
 */
 
-ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy));
+ZEXTERN int ZEXPORT gzsetparams(gzFile file, int level, int strategy);
 /*
      Dynamically update the compression level and strategy for file.  See the
    description of deflateInit2 for the meaning of these parameters. Previously
@@ -1389,7 +1392,7 @@ ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy));
    or Z_MEM_ERROR if there is a memory allocation error.
 */
 
-ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len));
+ZEXTERN int ZEXPORT gzread(gzFile file, voidp buf, unsigned len);
 /*
      Read and decompress up to len uncompressed bytes from file into buf.  If
    the input file is not in gzip format, gzread copies the given number of
@@ -1419,8 +1422,8 @@ ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len));
    Z_STREAM_ERROR.
 */
 
-ZEXTERN z_size_t ZEXPORT gzfread OF((voidp buf, z_size_t size, z_size_t nitems,
-                                     gzFile file));
+ZEXTERN z_size_t ZEXPORT gzfread(voidp buf, z_size_t size, z_size_t nitems,
+                                 gzFile file);
 /*
      Read and decompress up to nitems items of size size from file into buf,
    otherwise operating as gzread() does.  This duplicates the interface of
@@ -1445,14 +1448,14 @@ ZEXTERN z_size_t ZEXPORT gzfread OF((voidp buf, z_size_t size, z_size_t nitems,
    file, resetting and retrying on end-of-file, when size is not 1.
 */
 
-ZEXTERN int ZEXPORT gzwrite OF((gzFile file, voidpc buf, unsigned len));
+ZEXTERN int ZEXPORT gzwrite(gzFile file, voidpc buf, unsigned len);
 /*
      Compress and write the len uncompressed bytes at buf to file. gzwrite
    returns the number of uncompressed bytes written or 0 in case of error.
 */
 
-ZEXTERN z_size_t ZEXPORT gzfwrite OF((voidpc buf, z_size_t size,
-                                      z_size_t nitems, gzFile file));
+ZEXTERN z_size_t ZEXPORT gzfwrite(voidpc buf, z_size_t size,
+                                  z_size_t nitems, gzFile file);
 /*
      Compress and write nitems items of size size from buf to file, duplicating
    the interface of stdio's fwrite(), with size_t request and return types.  If
@@ -1465,7 +1468,7 @@ ZEXTERN z_size_t ZEXPORT gzfwrite OF((voidpc buf, z_size_t size,
    is returned, and the error state is set to Z_STREAM_ERROR.
 */
 
-ZEXTERN int ZEXPORTVA gzprintf Z_ARG((gzFile file, const char *format, ...));
+ZEXTERN int ZEXPORTVA gzprintf(gzFile file, const char *format, ...);
 /*
      Convert, format, compress, and write the arguments (...) to file under
    control of the string format, as in fprintf.  gzprintf returns the number of
@@ -1480,7 +1483,7 @@ ZEXTERN int ZEXPORTVA gzprintf Z_ARG((gzFile file, const char *format, ...));
    This can be determined using zlibCompileFlags().
 */
 
-ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s));
+ZEXTERN int ZEXPORT gzputs(gzFile file, const char *s);
 /*
      Compress and write the given null-terminated string s to file, excluding
    the terminating null character.
@@ -1488,7 +1491,7 @@ ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s));
      gzputs returns the number of characters written, or -1 in case of error.
 */
 
-ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len));
+ZEXTERN char * ZEXPORT gzgets(gzFile file, char *buf, int len);
 /*
      Read and decompress bytes from file into buf, until len-1 characters are
    read, or until a newline character is read and transferred to buf, or an
@@ -1502,13 +1505,13 @@ ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len));
    buf are indeterminate.
 */
 
-ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c));
+ZEXTERN int ZEXPORT gzputc(gzFile file, int c);
 /*
      Compress and write c, converted to an unsigned char, into file.  gzputc
    returns the value that was written, or -1 in case of error.
 */
 
-ZEXTERN int ZEXPORT gzgetc OF((gzFile file));
+ZEXTERN int ZEXPORT gzgetc(gzFile file);
 /*
      Read and decompress one byte from file.  gzgetc returns this byte or -1
    in case of end of file or error.  This is implemented as a macro for speed.
@@ -1517,7 +1520,7 @@ ZEXTERN int ZEXPORT gzgetc OF((gzFile file));
    points to has been clobbered or not.
 */
 
-ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file));
+ZEXTERN int ZEXPORT gzungetc(int c, gzFile file);
 /*
      Push c back onto the stream for file to be read as the first character on
    the next read.  At least one character of push-back is always allowed.
@@ -1529,7 +1532,7 @@ ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file));
    gzseek() or gzrewind().
 */
 
-ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush));
+ZEXTERN int ZEXPORT gzflush(gzFile file, int flush);
 /*
      Flush all pending output to file.  The parameter flush is as in the
    deflate() function.  The return value is the zlib error number (see function
@@ -1545,8 +1548,8 @@ ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush));
 */
 
 /*
-ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file,
-                                   z_off_t offset, int whence));
+ZEXTERN z_off_t ZEXPORT gzseek(gzFile file,
+                               z_off_t offset, int whence);
 
      Set the starting position to offset relative to whence for the next gzread
    or gzwrite on file.  The offset represents a number of bytes in the
@@ -1564,7 +1567,7 @@ ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file,
    would be before the current position.
 */
 
-ZEXTERN int ZEXPORT    gzrewind OF((gzFile file));
+ZEXTERN int ZEXPORT    gzrewind(gzFile file);
 /*
      Rewind file. This function is supported only for reading.
 
@@ -1572,7 +1575,7 @@ ZEXTERN int ZEXPORT    gzrewind OF((gzFile file));
 */
 
 /*
-ZEXTERN z_off_t ZEXPORT    gztell OF((gzFile file));
+ZEXTERN z_off_t ZEXPORT    gztell(gzFile file);
 
      Return the starting position for the next gzread or gzwrite on file.
    This position represents a number of bytes in the uncompressed data stream,
@@ -1583,7 +1586,7 @@ ZEXTERN z_off_t ZEXPORT    gztell OF((gzFile file));
 */
 
 /*
-ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile file));
+ZEXTERN z_off_t ZEXPORT gzoffset(gzFile file);
 
      Return the current compressed (actual) read or write offset of file.  This
    offset includes the count of bytes that precede the gzip stream, for example
@@ -1592,7 +1595,7 @@ ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile file));
    be used for a progress indicator.  On error, gzoffset() returns -1.
 */
 
-ZEXTERN int ZEXPORT gzeof OF((gzFile file));
+ZEXTERN int ZEXPORT gzeof(gzFile file);
 /*
      Return true (1) if the end-of-file indicator for file has been set while
    reading, false (0) otherwise.  Note that the end-of-file indicator is set
@@ -1607,7 +1610,7 @@ ZEXTERN int ZEXPORT gzeof OF((gzFile file));
    has grown since the previous end of file was detected.
 */
 
-ZEXTERN int ZEXPORT gzdirect OF((gzFile file));
+ZEXTERN int ZEXPORT gzdirect(gzFile file);
 /*
      Return true (1) if file is being copied directly while reading, or false
    (0) if file is a gzip stream being decompressed.
@@ -1628,7 +1631,7 @@ ZEXTERN int ZEXPORT gzdirect OF((gzFile file));
    gzip file reading and decompression, which may not be desired.)
 */
 
-ZEXTERN int ZEXPORT    gzclose OF((gzFile file));
+ZEXTERN int ZEXPORT    gzclose(gzFile file);
 /*
      Flush all pending output for file, if necessary, close file and
    deallocate the (de)compression state.  Note that once file is closed, you
@@ -1641,8 +1644,8 @@ ZEXTERN int ZEXPORT    gzclose OF((gzFile file));
    last read ended in the middle of a gzip stream, or Z_OK on success.
 */
 
-ZEXTERN int ZEXPORT gzclose_r OF((gzFile file));
-ZEXTERN int ZEXPORT gzclose_w OF((gzFile file));
+ZEXTERN int ZEXPORT gzclose_r(gzFile file);
+ZEXTERN int ZEXPORT gzclose_w(gzFile file);
 /*
      Same as gzclose(), but gzclose_r() is only for use when reading, and
    gzclose_w() is only for use when writing or appending.  The advantage to
@@ -1653,7 +1656,7 @@ ZEXTERN int ZEXPORT gzclose_w OF((gzFile file));
    zlib library.
 */
 
-ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum));
+ZEXTERN const char * ZEXPORT gzerror(gzFile file, int *errnum);
 /*
      Return the error message for the last error which occurred on file.
    errnum is set to zlib error number.  If an error occurred in the file system
@@ -1669,7 +1672,7 @@ ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum));
    functions above that do not distinguish those cases in their return values.
 */
 
-ZEXTERN void ZEXPORT gzclearerr OF((gzFile file));
+ZEXTERN void ZEXPORT gzclearerr(gzFile file);
 /*
      Clear the error and end-of-file flags for file.  This is analogous to the
    clearerr() function in stdio.  This is useful for continuing to read a gzip
@@ -1686,7 +1689,7 @@ ZEXTERN void ZEXPORT gzclearerr OF((gzFile file));
    library.
 */
 
-ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
+ZEXTERN uLong ZEXPORT adler32(uLong adler, const Bytef *buf, uInt len);
 /*
      Update a running Adler-32 checksum with the bytes buf[0..len-1] and
    return the updated checksum. An Adler-32 value is in the range of a 32-bit
@@ -1706,15 +1709,15 @@ ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
      if (adler != original_adler) error();
 */
 
-ZEXTERN uLong ZEXPORT adler32_z OF((uLong adler, const Bytef *buf,
-                                    z_size_t len));
+ZEXTERN uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf,
+                                z_size_t len);
 /*
      Same as adler32(), but with a size_t length.
 */
 
 /*
-ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
-                                          z_off_t len2));
+ZEXTERN uLong ZEXPORT adler32_combine(uLong adler1, uLong adler2,
+                                      z_off_t len2);
 
      Combine two Adler-32 checksums into one.  For two sequences of bytes, seq1
    and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
@@ -1724,7 +1727,7 @@ ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
    negative, the result has no meaning or utility.
 */
 
-ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
+ZEXTERN uLong ZEXPORT crc32(uLong crc, const Bytef *buf, uInt len);
 /*
      Update a running CRC-32 with the bytes buf[0..len-1] and return the
    updated CRC-32. A CRC-32 value is in the range of a 32-bit unsigned integer.
@@ -1742,30 +1745,30 @@ ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
      if (crc != original_crc) error();
 */
 
-ZEXTERN uLong ZEXPORT crc32_z OF((uLong crc, const Bytef *buf,
-                                  z_size_t len));
+ZEXTERN uLong ZEXPORT crc32_z(uLong crc, const Bytef *buf,
+                              z_size_t len);
 /*
      Same as crc32(), but with a size_t length.
 */
 
 /*
-ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2));
+ZEXTERN uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2);
 
      Combine two CRC-32 check values into one.  For two sequences of bytes,
    seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
    calculated for each, crc1 and crc2.  crc32_combine() returns the CRC-32
    check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and
-   len2.
+   len2. len2 must be non-negative.
 */
 
 /*
-ZEXTERN uLong ZEXPORT crc32_combine_gen OF((z_off_t len2));
+ZEXTERN uLong ZEXPORT crc32_combine_gen(z_off_t len2);
 
      Return the operator corresponding to length len2, to be used with
-   crc32_combine_op().
+   crc32_combine_op(). len2 must be non-negative.
 */
 
-ZEXTERN uLong ZEXPORT crc32_combine_op OF((uLong crc1, uLong crc2, uLong op));
+ZEXTERN uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op);
 /*
      Give the same result as crc32_combine(), using op in place of len2. op is
    is generated from len2 by crc32_combine_gen(). This will be faster than
@@ -1778,20 +1781,20 @@ ZEXTERN uLong ZEXPORT crc32_combine_op OF((uLong crc1, uLong crc2, uLong op));
 /* deflateInit and inflateInit are macros to allow checking the zlib version
  * and the compiler's view of z_stream:
  */
-ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level,
-                                     const char *version, int stream_size));
-ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm,
-                                     const char *version, int stream_size));
-ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int  level, int  method,
-                                      int windowBits, int memLevel,
-                                      int strategy, const char *version,
-                                      int stream_size));
-ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int  windowBits,
-                                      const char *version, int stream_size));
-ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits,
-                                         unsigned char FAR *window,
-                                         const char *version,
-                                         int stream_size));
+ZEXTERN int ZEXPORT deflateInit_(z_streamp strm, int level,
+                                 const char *version, int stream_size);
+ZEXTERN int ZEXPORT inflateInit_(z_streamp strm,
+                                 const char *version, int stream_size);
+ZEXTERN int ZEXPORT deflateInit2_(z_streamp strm, int  level, int  method,
+                                  int windowBits, int memLevel,
+                                  int strategy, const char *version,
+                                  int stream_size);
+ZEXTERN int ZEXPORT inflateInit2_(z_streamp strm, int  windowBits,
+                                  const char *version, int stream_size);
+ZEXTERN int ZEXPORT inflateBackInit_(z_streamp strm, int windowBits,
+                                     unsigned char FAR *window,
+                                     const char *version,
+                                     int stream_size);
 #ifdef Z_PREFIX_SET
 #  define z_deflateInit(strm, level) \
           deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream))
@@ -1836,7 +1839,7 @@ struct gzFile_s {
     unsigned char *next;
     z_off64_t pos;
 };
-ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file));  /* backward compatibility */
+ZEXTERN int ZEXPORT gzgetc_(gzFile file);       /* backward compatibility */
 #ifdef Z_PREFIX_SET
 #  undef z_gzgetc
 #  define z_gzgetc(g) \
@@ -1853,13 +1856,13 @@ ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file));  /* backward compatibility */
  * without large file support, _LFS64_LARGEFILE must also be true
  */
 #ifdef Z_LARGE64
-   ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
-   ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int));
-   ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile));
-   ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile));
-   ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off64_t));
-   ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off64_t));
-   ZEXTERN uLong ZEXPORT crc32_combine_gen64 OF((z_off64_t));
+   ZEXTERN gzFile ZEXPORT gzopen64(const char *, const char *);
+   ZEXTERN z_off64_t ZEXPORT gzseek64(gzFile, z_off64_t, int);
+   ZEXTERN z_off64_t ZEXPORT gztell64(gzFile);
+   ZEXTERN z_off64_t ZEXPORT gzoffset64(gzFile);
+   ZEXTERN uLong ZEXPORT adler32_combine64(uLong, uLong, z_off64_t);
+   ZEXTERN uLong ZEXPORT crc32_combine64(uLong, uLong, z_off64_t);
+   ZEXTERN uLong ZEXPORT crc32_combine_gen64(z_off64_t);
 #endif
 
 #if !defined(ZLIB_INTERNAL) && defined(Z_WANT64)
@@ -1881,50 +1884,50 @@ ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file));  /* backward compatibility */
 #    define crc32_combine_gen crc32_combine_gen64
 #  endif
 #  ifndef Z_LARGE64
-     ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
-     ZEXTERN z_off_t ZEXPORT gzseek64 OF((gzFile, z_off_t, int));
-     ZEXTERN z_off_t ZEXPORT gztell64 OF((gzFile));
-     ZEXTERN z_off_t ZEXPORT gzoffset64 OF((gzFile));
-     ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t));
-     ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t));
-     ZEXTERN uLong ZEXPORT crc32_combine_gen64 OF((z_off_t));
+     ZEXTERN gzFile ZEXPORT gzopen64(const char *, const char *);
+     ZEXTERN z_off_t ZEXPORT gzseek64(gzFile, z_off_t, int);
+     ZEXTERN z_off_t ZEXPORT gztell64(gzFile);
+     ZEXTERN z_off_t ZEXPORT gzoffset64(gzFile);
+     ZEXTERN uLong ZEXPORT adler32_combine64(uLong, uLong, z_off_t);
+     ZEXTERN uLong ZEXPORT crc32_combine64(uLong, uLong, z_off_t);
+     ZEXTERN uLong ZEXPORT crc32_combine_gen64(z_off_t);
 #  endif
 #else
-   ZEXTERN gzFile ZEXPORT gzopen OF((const char *, const char *));
-   ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile, z_off_t, int));
-   ZEXTERN z_off_t ZEXPORT gztell OF((gzFile));
-   ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile));
-   ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t));
-   ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t));
-   ZEXTERN uLong ZEXPORT crc32_combine_gen OF((z_off_t));
+   ZEXTERN gzFile ZEXPORT gzopen(const char *, const char *);
+   ZEXTERN z_off_t ZEXPORT gzseek(gzFile, z_off_t, int);
+   ZEXTERN z_off_t ZEXPORT gztell(gzFile);
+   ZEXTERN z_off_t ZEXPORT gzoffset(gzFile);
+   ZEXTERN uLong ZEXPORT adler32_combine(uLong, uLong, z_off_t);
+   ZEXTERN uLong ZEXPORT crc32_combine(uLong, uLong, z_off_t);
+   ZEXTERN uLong ZEXPORT crc32_combine_gen(z_off_t);
 #endif
 
 #else /* Z_SOLO */
 
-   ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t));
-   ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t));
-   ZEXTERN uLong ZEXPORT crc32_combine_gen OF((z_off_t));
+   ZEXTERN uLong ZEXPORT adler32_combine(uLong, uLong, z_off_t);
+   ZEXTERN uLong ZEXPORT crc32_combine(uLong, uLong, z_off_t);
+   ZEXTERN uLong ZEXPORT crc32_combine_gen(z_off_t);
 
 #endif /* !Z_SOLO */
 
 /* undocumented functions */
-ZEXTERN const char   * ZEXPORT zError           OF((int));
-ZEXTERN int            ZEXPORT inflateSyncPoint OF((z_streamp));
-ZEXTERN const z_crc_t FAR * ZEXPORT get_crc_table    OF((void));
-ZEXTERN int            ZEXPORT inflateUndermine OF((z_streamp, int));
-ZEXTERN int            ZEXPORT inflateValidate OF((z_streamp, int));
-ZEXTERN unsigned long  ZEXPORT inflateCodesUsed OF((z_streamp));
-ZEXTERN int            ZEXPORT inflateResetKeep OF((z_streamp));
-ZEXTERN int            ZEXPORT deflateResetKeep OF((z_streamp));
+ZEXTERN const char   * ZEXPORT zError(int);
+ZEXTERN int            ZEXPORT inflateSyncPoint(z_streamp);
+ZEXTERN const z_crc_t FAR * ZEXPORT get_crc_table(void);
+ZEXTERN int            ZEXPORT inflateUndermine(z_streamp, int);
+ZEXTERN int            ZEXPORT inflateValidate(z_streamp, int);
+ZEXTERN unsigned long  ZEXPORT inflateCodesUsed(z_streamp);
+ZEXTERN int            ZEXPORT inflateResetKeep(z_streamp);
+ZEXTERN int            ZEXPORT deflateResetKeep(z_streamp);
 #if defined(_WIN32) && !defined(Z_SOLO)
-ZEXTERN gzFile         ZEXPORT gzopen_w OF((const wchar_t *path,
-                                            const char *mode));
+ZEXTERN gzFile         ZEXPORT gzopen_w(const wchar_t *path,
+                                        const char *mode);
 #endif
 #if defined(STDC) || defined(Z_HAVE_STDARG_H)
 #  ifndef Z_SOLO
-ZEXTERN int            ZEXPORTVA gzvprintf Z_ARG((gzFile file,
-                                                  const char *format,
-                                                  va_list va));
+ZEXTERN int            ZEXPORTVA gzvprintf(gzFile file,
+                                           const char *format,
+                                           va_list va);
 #  endif
 #endif
 
diff --git a/3rdparty/zlib/zutil.c b/3rdparty/zlib/zutil.c
index 9543ae825e32..b1c5d2d3c6da 100644
--- a/3rdparty/zlib/zutil.c
+++ b/3rdparty/zlib/zutil.c
@@ -24,13 +24,11 @@ z_const char * const z_errmsg[10] = {
 };
 
 
-const char * ZEXPORT zlibVersion()
-{
+const char * ZEXPORT zlibVersion(void) {
     return ZLIB_VERSION;
 }
 
-uLong ZEXPORT zlibCompileFlags()
-{
+uLong ZEXPORT zlibCompileFlags(void) {
     uLong flags;
 
     flags = 0;
@@ -121,9 +119,7 @@ uLong ZEXPORT zlibCompileFlags()
 #  endif
 int ZLIB_INTERNAL z_verbose = verbose;
 
-void ZLIB_INTERNAL z_error(m)
-    char *m;
-{
+void ZLIB_INTERNAL z_error(char *m) {
     fprintf(stderr, "%s\n", m);
     exit(1);
 }
@@ -132,9 +128,7 @@ void ZLIB_INTERNAL z_error(m)
 /* exported to allow conversion of error code to string for compress() and
  * uncompress()
  */
-const char * ZEXPORT zError(err)
-    int err;
-{
+const char * ZEXPORT zError(int err) {
     return ERR_MSG(err);
 }
 
@@ -148,22 +142,14 @@ const char * ZEXPORT zError(err)
 
 #ifndef HAVE_MEMCPY
 
-void ZLIB_INTERNAL zmemcpy(dest, source, len)
-    Bytef* dest;
-    const Bytef* source;
-    uInt  len;
-{
+void ZLIB_INTERNAL zmemcpy(Bytef* dest, const Bytef* source, uInt len) {
     if (len == 0) return;
     do {
         *dest++ = *source++; /* ??? to be unrolled */
     } while (--len != 0);
 }
 
-int ZLIB_INTERNAL zmemcmp(s1, s2, len)
-    const Bytef* s1;
-    const Bytef* s2;
-    uInt  len;
-{
+int ZLIB_INTERNAL zmemcmp(const Bytef* s1, const Bytef* s2, uInt len) {
     uInt j;
 
     for (j = 0; j < len; j++) {
@@ -172,10 +158,7 @@ int ZLIB_INTERNAL zmemcmp(s1, s2, len)
     return 0;
 }
 
-void ZLIB_INTERNAL zmemzero(dest, len)
-    Bytef* dest;
-    uInt  len;
-{
+void ZLIB_INTERNAL zmemzero(Bytef* dest, uInt len) {
     if (len == 0) return;
     do {
         *dest++ = 0;  /* ??? to be unrolled */
@@ -216,8 +199,7 @@ local ptr_table table[MAX_PTR];
  * a protected system like OS/2. Use Microsoft C instead.
  */
 
-voidpf ZLIB_INTERNAL zcalloc(voidpf opaque, unsigned items, unsigned size)
-{
+voidpf ZLIB_INTERNAL zcalloc(voidpf opaque, unsigned items, unsigned size) {
     voidpf buf;
     ulg bsize = (ulg)items*size;
 
@@ -242,8 +224,7 @@ voidpf ZLIB_INTERNAL zcalloc(voidpf opaque, unsigned items, unsigned size)
     return buf;
 }
 
-void ZLIB_INTERNAL zcfree(voidpf opaque, voidpf ptr)
-{
+void ZLIB_INTERNAL zcfree(voidpf opaque, voidpf ptr) {
     int n;
 
     (void)opaque;
@@ -279,14 +260,12 @@ void ZLIB_INTERNAL zcfree(voidpf opaque, voidpf ptr)
 #  define _hfree   hfree
 #endif
 
-voidpf ZLIB_INTERNAL zcalloc(voidpf opaque, uInt items, uInt size)
-{
+voidpf ZLIB_INTERNAL zcalloc(voidpf opaque, uInt items, uInt size) {
     (void)opaque;
     return _halloc((long)items, size);
 }
 
-void ZLIB_INTERNAL zcfree(voidpf opaque, voidpf ptr)
-{
+void ZLIB_INTERNAL zcfree(voidpf opaque, voidpf ptr) {
     (void)opaque;
     _hfree(ptr);
 }
@@ -299,25 +278,18 @@ void ZLIB_INTERNAL zcfree(voidpf opaque, voidpf ptr)
 #ifndef MY_ZCALLOC /* Any system without a special alloc function */
 
 #ifndef STDC
-extern voidp  malloc OF((uInt size));
-extern voidp  calloc OF((uInt items, uInt size));
-extern void   free   OF((voidpf ptr));
+extern voidp malloc(uInt size);
+extern voidp calloc(uInt items, uInt size);
+extern void free(voidpf ptr);
 #endif
 
-voidpf ZLIB_INTERNAL zcalloc(opaque, items, size)
-    voidpf opaque;
-    unsigned items;
-    unsigned size;
-{
+voidpf ZLIB_INTERNAL zcalloc(voidpf opaque, unsigned items, unsigned size) {
     (void)opaque;
     return sizeof(uInt) > 2 ? (voidpf)malloc(items * size) :
                               (voidpf)calloc(items, size);
 }
 
-void ZLIB_INTERNAL zcfree(opaque, ptr)
-    voidpf opaque;
-    voidpf ptr;
-{
+void ZLIB_INTERNAL zcfree(voidpf opaque, voidpf ptr) {
     (void)opaque;
     free(ptr);
 }
diff --git a/3rdparty/zlib/zutil.h b/3rdparty/zlib/zutil.h
index 0bc7f4ecd1c0..48dd7febae65 100644
--- a/3rdparty/zlib/zutil.h
+++ b/3rdparty/zlib/zutil.h
@@ -1,5 +1,5 @@
 /* zutil.h -- internal interface and configuration of the compression library
- * Copyright (C) 1995-2022 Jean-loup Gailly, Mark Adler
+ * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
@@ -56,7 +56,7 @@ typedef unsigned long  ulg;
 extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
 /* (size given to avoid silly warnings with Visual C++) */
 
-#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)]
+#define ERR_MSG(err) z_errmsg[(err) < -6 || (err) > 2 ? 9 : 2 - (err)]
 
 #define ERR_RETURN(strm,err) \
   return (strm->msg = ERR_MSG(err), (err))
@@ -137,17 +137,8 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
 #  endif
 #endif
 
-#if defined(MACOS) || defined(TARGET_OS_MAC)
+#if defined(MACOS)
 #  define OS_CODE  7
-#  ifndef Z_SOLO
-#    if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os
-#      include <unix.h> /* for fdopen */
-#    else
-#      ifndef fdopen
-#        define fdopen(fd,mode) NULL /* No fdopen() */
-#      endif
-#    endif
-#  endif
 #endif
 
 #ifdef __acorn
@@ -170,18 +161,6 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
 #  define OS_CODE 19
 #endif
 
-#if defined(_BEOS_) || defined(RISCOS)
-#  define fdopen(fd,mode) NULL /* No fdopen() */
-#endif
-
-#if (defined(_MSC_VER) && (_MSC_VER > 600)) && !defined __INTERIX
-#  if defined(_WIN32_WCE)
-#    define fdopen(fd,mode) NULL /* No fdopen() */
-#  else
-#    define fdopen(fd,type)  _fdopen(fd,type)
-#  endif
-#endif
-
 #if defined(__BORLANDC__) && !defined(MSDOS)
   #pragma warn -8004
   #pragma warn -8008
@@ -191,9 +170,9 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
 /* provide prototypes for these when building zlib without LFS */
 #if !defined(_WIN32) && \
     (!defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0)
-    ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t));
-    ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t));
-    ZEXTERN uLong ZEXPORT crc32_combine_gen64 OF((z_off_t));
+    ZEXTERN uLong ZEXPORT adler32_combine64(uLong, uLong, z_off_t);
+    ZEXTERN uLong ZEXPORT crc32_combine64(uLong, uLong, z_off_t);
+    ZEXTERN uLong ZEXPORT crc32_combine_gen64(z_off_t);
 #endif
 
         /* common defaults */
@@ -232,16 +211,16 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
 #    define zmemzero(dest, len) memset(dest, 0, len)
 #  endif
 #else
-   void ZLIB_INTERNAL zmemcpy OF((Bytef* dest, const Bytef* source, uInt len));
-   int ZLIB_INTERNAL zmemcmp OF((const Bytef* s1, const Bytef* s2, uInt len));
-   void ZLIB_INTERNAL zmemzero OF((Bytef* dest, uInt len));
+   void ZLIB_INTERNAL zmemcpy(Bytef* dest, const Bytef* source, uInt len);
+   int ZLIB_INTERNAL zmemcmp(const Bytef* s1, const Bytef* s2, uInt len);
+   void ZLIB_INTERNAL zmemzero(Bytef* dest, uInt len);
 #endif
 
 /* Diagnostic functions */
 #ifdef ZLIB_DEBUG
 #  include <stdio.h>
    extern int ZLIB_INTERNAL z_verbose;
-   extern void ZLIB_INTERNAL z_error OF((char *m));
+   extern void ZLIB_INTERNAL z_error(char *m);
 #  define Assert(cond,msg) {if(!(cond)) z_error(msg);}
 #  define Trace(x) {if (z_verbose>=0) fprintf x ;}
 #  define Tracev(x) {if (z_verbose>0) fprintf x ;}
@@ -258,9 +237,9 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
 #endif
 
 #ifndef Z_SOLO
-   voidpf ZLIB_INTERNAL zcalloc OF((voidpf opaque, unsigned items,
-                                    unsigned size));
-   void ZLIB_INTERNAL zcfree  OF((voidpf opaque, voidpf ptr));
+   voidpf ZLIB_INTERNAL zcalloc(voidpf opaque, unsigned items,
+                                unsigned size);
+   void ZLIB_INTERNAL zcfree(voidpf opaque, voidpf ptr);
 #endif
 
 #define ZALLOC(strm, items, size) \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ceeb8b8d7d8a..002b352d94a7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,6 @@
 #      $ cmake <PATH_TO_OPENCV_ROOT>
 #
 # ----------------------------------------------------------------------------
-
 # Disable in-source builds to prevent source tree corruption.
 if(" ${CMAKE_SOURCE_DIR}" STREQUAL " ${CMAKE_BINARY_DIR}")
   message(FATAL_ERROR "
@@ -14,7 +13,6 @@ FATAL: In-source builds are not allowed.
 ")
 endif()
 
-
 include(cmake/OpenCVMinDepVersions.cmake)
 
 if(CMAKE_SYSTEM_NAME MATCHES WindowsPhone OR CMAKE_SYSTEM_NAME MATCHES WindowsStore)
@@ -66,6 +64,10 @@ if(POLICY CMP0068)
   cmake_policy(SET CMP0068 NEW)  # CMake 3.9+: `RPATH` settings on macOS do not affect `install_name`.
 endif()
 
+if(POLICY CMP0071)
+  cmake_policy(SET CMP0071 NEW)  # CMake 3.10+: Let `AUTOMOC` and `AUTOUIC` process `GENERATED` files.
+endif()
+
 if(POLICY CMP0075)
   cmake_policy(SET CMP0075 NEW)  # CMake 3.12+: Include file check macros honor `CMAKE_REQUIRED_LIBRARIES`
 endif()
@@ -74,6 +76,18 @@ if(POLICY CMP0077)
   cmake_policy(SET CMP0077 NEW)  # CMake 3.13+: option() honors normal variables.
 endif()
 
+if(POLICY CMP0091)
+  cmake_policy(SET CMP0091 NEW) # CMake 3.15+: leave MSVC runtime selection out of default CMAKE_<LANG>_FLAGS_<CONFIG> flags
+endif()
+
+if(POLICY CMP0146)
+  cmake_policy(SET CMP0146 OLD)  # CMake 3.27+: use CMake FindCUDA if available.
+endif()
+
+if(POLICY CMP0148)
+  cmake_policy(SET CMP0148 OLD)  # CMake 3.27+: use CMake FindPythonInterp and FindPythonLib if available.
+endif()
+
 #
 # Configure OpenCV CMake hooks
 #
@@ -235,9 +249,9 @@ OCV_OPTION(BUILD_ITT                "Build Intel ITT from source"
 # Optional 3rd party components
 # ===================================================
 OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON
-  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_DC1394_2)
-OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O (iOS/Mac)" ON
+OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O (iOS/visionOS/Mac)" ON
   VISIBLE_IF APPLE
   VERIFY HAVE_AVFOUNDATION)
 OCV_OPTION(WITH_AVIF "Enable AVIF support" OFF
@@ -246,15 +260,19 @@ OCV_OPTION(WITH_CAP_IOS "Enable iOS video capture" ON
   VISIBLE_IF IOS
   VERIFY HAVE_CAP_IOS)
 OCV_OPTION(WITH_CAROTENE "Use NVidia carotene acceleration library for ARM platform" (NOT CV_DISABLE_OPTIMIZATION)
-  VISIBLE_IF (ARM OR AARCH64) AND NOT IOS)
+  VISIBLE_IF (ARM OR AARCH64) AND NOT IOS AND NOT XROS)
+OCV_OPTION(WITH_KLEIDICV "Use KleidiCV library for ARM platforms" OFF
+  VISIBLE_IF (AARCH64 AND (ANDROID OR UNIX AND NOT IOS AND NOT XROS)))
+OCV_OPTION(WITH_NDSRVP "Use Andes RVP extension" (NOT CV_DISABLE_OPTIMIZATION)
+  VISIBLE_IF RISCV)
 OCV_OPTION(WITH_CPUFEATURES "Use cpufeatures Android library" ON
   VISIBLE_IF ANDROID
   VERIFY HAVE_CPUFEATURES)
 OCV_OPTION(WITH_VTK "Include VTK library support (and build opencv_viz module eiher)" ON
-  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT WINRT AND NOT CMAKE_CROSSCOMPILING
+  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT XROS AND NOT WINRT AND NOT CMAKE_CROSSCOMPILING
   VERIFY HAVE_VTK)
 OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" OFF
-  VISIBLE_IF NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_CUDA)
 OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" WITH_CUDA
   VISIBLE_IF WITH_CUDA
@@ -275,10 +293,10 @@ OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" (NOT CV_DISABLE_OPTIMIZATI
   VISIBLE_IF NOT WINRT
   VERIFY HAVE_EIGEN)
 OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" (NOT ANDROID)
-  VISIBLE_IF NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_FFMPEG)
 OCV_OPTION(WITH_GSTREAMER "Include Gstreamer support" ON
-  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_GSTREAMER AND GSTREAMER_VERSION VERSION_GREATER "0.99")
 OCV_OPTION(WITH_GTK "Include GTK support" ON
   VISIBLE_IF UNIX AND NOT APPLE AND NOT ANDROID
@@ -290,7 +308,7 @@ OCV_OPTION(WITH_WAYLAND "Include Wayland support" OFF
         VISIBLE_IF UNIX AND NOT APPLE AND NOT ANDROID
         VERIFY HAVE_WAYLAND)
 OCV_OPTION(WITH_IPP "Include Intel IPP support" (NOT MINGW AND NOT CV_DISABLE_OPTIMIZATION)
-  VISIBLE_IF (X86_64 OR X86) AND NOT WINRT AND NOT IOS
+  VISIBLE_IF (X86_64 OR X86) AND NOT WINRT AND NOT IOS AND NOT XROS
   VERIFY HAVE_IPP)
 OCV_OPTION(WITH_HALIDE "Include Halide support" OFF
   VISIBLE_IF TRUE
@@ -306,10 +324,10 @@ OCV_OPTION(WITH_WEBNN "Include WebNN support" OFF
   VISIBLE_IF TRUE
   VERIFY HAVE_WEBNN)
 OCV_OPTION(WITH_JASPER "Include JPEG2K support (Jasper)" ON
-  VISIBLE_IF NOT IOS
+  VISIBLE_IF NOT IOS AND NOT XROS
   VERIFY HAVE_JASPER)
 OCV_OPTION(WITH_OPENJPEG "Include JPEG2K support (OpenJPEG)" ON
-  VISIBLE_IF NOT IOS
+  VISIBLE_IF NOT IOS AND NOT XROS
   VERIFY HAVE_OPENJPEG)
 OCV_OPTION(WITH_JPEG "Include JPEG support" ON
   VISIBLE_IF TRUE
@@ -327,10 +345,10 @@ OCV_OPTION(WITH_OPENVX "Include OpenVX support" OFF
   VISIBLE_IF TRUE
   VERIFY HAVE_OPENVX)
 OCV_OPTION(WITH_OPENNI "Include OpenNI support" OFF
-  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_OPENNI)
 OCV_OPTION(WITH_OPENNI2 "Include OpenNI2 support" OFF
-  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_OPENNI2)
 OCV_OPTION(WITH_PNG "Include PNG support" ON
   VISIBLE_IF TRUE
@@ -342,19 +360,19 @@ OCV_OPTION(WITH_GDCM "Include DICOM support" OFF
   VISIBLE_IF TRUE
   VERIFY HAVE_GDCM)
 OCV_OPTION(WITH_PVAPI "Include Prosilica GigE support" OFF
-  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_PVAPI)
 OCV_OPTION(WITH_ARAVIS "Include Aravis GigE support" OFF
-  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT WINRT AND NOT WIN32
+  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT XROS AND NOT WINRT AND NOT WIN32
   VERIFY HAVE_ARAVIS_API)
 OCV_OPTION(WITH_QT "Build with Qt Backend support" OFF
-  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_QT)
 OCV_OPTION(WITH_WIN32UI "Build with Win32 UI Backend support" ON
   VISIBLE_IF WIN32 AND NOT WINRT
   VERIFY HAVE_WIN32UI)
 OCV_OPTION(WITH_TBB "Include Intel TBB support" OFF
-  VISIBLE_IF NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_TBB)
 OCV_OPTION(WITH_HPX "Include Ste||ar Group HPX support" OFF
   VISIBLE_IF TRUE
@@ -366,7 +384,7 @@ OCV_OPTION(WITH_PTHREADS_PF "Use pthreads-based parallel_for" ON
   VISIBLE_IF NOT WIN32 OR MINGW
   VERIFY HAVE_PTHREADS_PF)
 OCV_OPTION(WITH_TIFF "Include TIFF support" ON
-  VISIBLE_IF NOT IOS
+  VISIBLE_IF NOT IOS AND NOT XROS
   VERIFY HAVE_TIFF)
 OCV_OPTION(WITH_V4L "Include Video 4 Linux support" ON
   VISIBLE_IF UNIX AND NOT ANDROID AND NOT APPLE
@@ -393,20 +411,23 @@ OCV_OPTION(WITH_CLP "Include Clp support (EPL)" OFF
   VISIBLE_IF TRUE
   VERIFY HAVE_CLP)
 OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" (NOT ANDROID AND NOT CV_DISABLE_OPTIMIZATION)
-  VISIBLE_IF NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_OPENCL)
 OCV_OPTION(WITH_OPENCL_SVM "Include OpenCL Shared Virtual Memory support" OFF
   VISIBLE_IF TRUE
   VERIFY HAVE_OPENCL_SVM) # experimental
 OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" ON
-  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_CLAMDFFT)
 OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" ON
-  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_CLAMDBLAS)
 OCV_OPTION(WITH_DIRECTX "Include DirectX support" ON
   VISIBLE_IF WIN32 AND NOT WINRT
   VERIFY HAVE_DIRECTX)
+OCV_OPTION(WITH_DIRECTML "Include DirectML support" ON
+  VISIBLE_IF WIN32 AND NOT WINRT
+  VERIFY HAVE_DIRECTML)
 OCV_OPTION(WITH_OPENCL_D3D11_NV "Include NVIDIA OpenCL D3D11 support" WITH_DIRECTX
   VISIBLE_IF WIN32 AND NOT WINRT
   VERIFY HAVE_OPENCL_D3D11_NV)
@@ -423,13 +444,13 @@ OCV_OPTION(WITH_MFX "Include Intel Media SDK support" OFF
   VISIBLE_IF (UNIX AND NOT ANDROID) OR (WIN32 AND NOT WINRT AND NOT MINGW)
   VERIFY HAVE_MFX)
 OCV_OPTION(WITH_GDAL "Include GDAL Support" OFF
-  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT WINRT
+  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT XROS AND NOT WINRT
   VERIFY HAVE_GDAL)
 OCV_OPTION(WITH_GPHOTO2 "Include gPhoto2 library support" OFF
-  VISIBLE_IF UNIX AND NOT ANDROID AND NOT IOS
+  VISIBLE_IF UNIX AND NOT ANDROID AND NOT IOS AND NOT XROS
   VERIFY HAVE_GPHOTO2)
 OCV_OPTION(WITH_LAPACK "Include Lapack library support" (NOT CV_DISABLE_OPTIMIZATION)
-  VISIBLE_IF NOT ANDROID AND NOT IOS
+  VISIBLE_IF NOT ANDROID AND NOT IOS AND NOT XROS
   VERIFY HAVE_LAPACK)
 OCV_OPTION(WITH_ITT "Include Intel ITT support" ON
   VISIBLE_IF NOT APPLE_FRAMEWORK
@@ -449,7 +470,7 @@ OCV_OPTION(WITH_IMGCODEC_PXM "Include PNM (PBM,PGM,PPM) and PAM formats support"
 OCV_OPTION(WITH_IMGCODEC_PFM "Include PFM formats support" ON
   VISIBLE_IF TRUE
   VERIFY HAVE_IMGCODEC_PFM)
-OCV_OPTION(WITH_QUIRC "Include library QR-code decoding" ON
+OCV_OPTION(WITH_QUIRC "Include library QR-code decoding" OFF
   VISIBLE_IF TRUE
   VERIFY HAVE_QUIRC)
 OCV_OPTION(WITH_ANDROID_MEDIANDK "Use Android Media NDK for Video I/O (Android)" (ANDROID_NATIVE_API_LEVEL GREATER 20)
@@ -458,17 +479,17 @@ OCV_OPTION(WITH_ANDROID_MEDIANDK "Use Android Media NDK for Video I/O (Android)"
 OCV_OPTION(WITH_ANDROID_NATIVE_CAMERA "Use Android NDK for Camera I/O (Android)" (ANDROID_NATIVE_API_LEVEL GREATER 23)
   VISIBLE_IF ANDROID
   VERIFY HAVE_ANDROID_NATIVE_CAMERA)
-OCV_OPTION(WITH_TENGINE "Include Arm Inference Tengine support" OFF
-  VISIBLE_IF (ARM OR AARCH64) AND (UNIX OR ANDROID) AND NOT IOS
-  VERIFY HAVE_TENGINE)
 OCV_OPTION(WITH_ONNX "Include Microsoft ONNX Runtime support" OFF
   VISIBLE_IF TRUE
   VERIFY HAVE_ONNX)
 OCV_OPTION(WITH_TIMVX "Include Tim-VX support" OFF
   VISIBLE_IF TRUE
   VERIFY HAVE_TIMVX)
-OCV_OPTION(WITH_OBSENSOR "Include obsensor support (Orbbec RGB-D modules: Astra+/Femto)" ON
-  VISIBLE_IF (WIN32 AND NOT ARM AND NOT WINRT) OR ( UNIX AND NOT APPLE AND NOT ANDROID)
+# Attention when OBSENSOR_USE_ORBBEC_SDK set to off:
+#   Astra2 cameras currently only support Windows and Linux kernel versions no higher than 4.15, and higher versions of Linux kernel may have exceptions.
+OCV_OPTION(OBSENSOR_USE_ORBBEC_SDK "Use Orbbec SDK as backend to support more camera models and platforms (force to ON on MacOS)" OFF)
+OCV_OPTION(WITH_OBSENSOR "Include obsensor support (Orbbec 3D Cameras)" ON
+  VISIBLE_IF (WIN32 AND NOT ARM AND NOT WINRT AND NOT MINGW) OR ( UNIX AND NOT APPLE AND NOT ANDROID) OR (APPLE AND AARCH64 AND NOT IOS)
   VERIFY HAVE_OBSENSOR)
 OCV_OPTION(WITH_CANN "Include CANN support" OFF
   VISIBLE_IF TRUE
@@ -476,6 +497,9 @@ OCV_OPTION(WITH_CANN "Include CANN support" OFF
 OCV_OPTION(WITH_FLATBUFFERS "Include Flatbuffers support (required by DNN/TFLite importer)" ON
   VISIBLE_IF TRUE
   VERIFY HAVE_FLATBUFFERS)
+OCV_OPTION(WITH_ZLIB_NG "Use zlib-ng instead of zlib" OFF
+  VISIBLE_IF TRUE
+  VERIFY HAVE_ZLIB_NG)
 
 # OpenCV build components
 # ===================================================
@@ -512,7 +536,7 @@ OCV_OPTION(INSTALL_TESTS            "Install accuracy and performance test binar
 # OpenCV build options
 # ===================================================
 OCV_OPTION(ENABLE_CCACHE              "Use ccache"                                               (UNIX AND (CMAKE_GENERATOR MATCHES "Makefile" OR CMAKE_GENERATOR MATCHES "Ninja" OR CMAKE_GENERATOR MATCHES "Xcode")) )
-OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers"                                  MSVC IF (MSVC OR (NOT IOS AND NOT CMAKE_CROSSCOMPILING) ) )
+OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers"                                  MSVC IF (MSVC OR (NOT IOS AND NOT XROS AND NOT CMAKE_CROSSCOMPILING) ) )
 OCV_OPTION(ENABLE_DELAYLOAD           "Enable delayed loading of OpenCV DLLs"                    OFF VISIBLE_IF MSVC AND BUILD_SHARED_LIBS)
 OCV_OPTION(ENABLE_SOLUTION_FOLDERS    "Solution folder in Visual Studio or in other IDEs"        (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) )
 OCV_OPTION(ENABLE_PROFILING           "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF  IF CV_GCC )
@@ -522,8 +546,8 @@ OCV_OPTION(ENABLE_OMIT_FRAME_POINTER  "Enable -fomit-frame-pointer for GCC"
 OCV_OPTION(ENABLE_POWERPC             "Enable PowerPC for GCC"                                   ON   IF (CV_GCC AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
 OCV_OPTION(ENABLE_FAST_MATH           "Enable compiler options for fast math optimizations on FP computations (not recommended)" OFF)
 if(NOT IOS AND (NOT ANDROID OR OPENCV_ANDROID_USE_LEGACY_FLAGS) AND CMAKE_CROSSCOMPILING)  # Use CPU_BASELINE instead
-OCV_OPTION(ENABLE_NEON                "Enable NEON instructions"                                 (NEON OR ANDROID_ARM_NEON OR AARCH64) IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) )
-OCV_OPTION(ENABLE_VFPV3               "Enable VFPv3-D32 instructions"                            OFF  IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) )
+OCV_OPTION(ENABLE_NEON                "Enable NEON instructions"                                 (NEON OR ANDROID_ARM_NEON OR AARCH64) IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS OR XROS) )
+OCV_OPTION(ENABLE_VFPV3               "Enable VFPv3-D32 instructions"                            OFF  IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS OR XROS) )
 endif()
 OCV_OPTION(ENABLE_NOISY_WARNINGS      "Show all warnings even if they are too noisy"             OFF )
 OCV_OPTION(OPENCV_WARNINGS_ARE_ERRORS "Treat warnings as errors"                                 OFF )
@@ -545,6 +569,9 @@ OCV_OPTION(OPENCV_ENABLE_MEMALIGN     "Enable posix_memalign or memalign usage"
 OCV_OPTION(OPENCV_DISABLE_FILESYSTEM_SUPPORT "Disable filesystem support" OFF)
 OCV_OPTION(OPENCV_DISABLE_THREAD_SUPPORT "Build the library without multi-threaded code." OFF)
 OCV_OPTION(OPENCV_SEMIHOSTING         "Build the library for semihosting target (Arm). See https://developer.arm.com/documentation/100863/latest." OFF)
+OCV_OPTION(ENABLE_CUDA_FIRST_CLASS_LANGUAGE "Enable CUDA as a first class language, if enabled dependant projects will need to use CMake >= 3.18" OFF
+  VISIBLE_IF (WITH_CUDA AND NOT CMAKE_VERSION VERSION_LESS 3.18)
+  VERIFY HAVE_CUDA)
 
 OCV_OPTION(ENABLE_PYLINT              "Add target with Pylint checks"                            (BUILD_DOCS OR BUILD_EXAMPLES) IF (NOT CMAKE_CROSSCOMPILING AND NOT APPLE_FRAMEWORK) )
 OCV_OPTION(ENABLE_FLAKE8              "Add target with Python flake8 checker"                    (BUILD_DOCS OR BUILD_EXAMPLES) IF (NOT CMAKE_CROSSCOMPILING AND NOT APPLE_FRAMEWORK) )
@@ -557,7 +584,9 @@ if(OPENCV_DISABLE_FILESYSTEM_SUPPORT)
   add_definitions(-DOPENCV_HAVE_FILESYSTEM_SUPPORT=0)
 endif()
 
-set(OPENCV_MATHJAX_RELPATH "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0" CACHE STRING "URI to a MathJax installation")
+# MathJax is used for math rendering by both Doxygen HTML and JavaDoc, so
+# this var have to be defined before "modules" AND "doc" are processed
+set(OPENCV_MATHJAX_RELPATH "https://cdn.jsdelivr.net/npm/mathjax@3.0.1" CACHE STRING "URI to a MathJax installation")
 
 # ----------------------------------------------------------------------------
 #  Get actual OpenCV version number from sources
@@ -601,7 +630,7 @@ if(WIN32)
 else()
   # Postfix of so's:
   ocv_update(OPENCV_DLLVERSION "")
-  ocv_update(OPENCV_DEBUG_POSTFIX "")
+  ocv_update(OPENCV_DEBUG_POSTFIX "d")
 endif()
 
 if(DEFINED CMAKE_DEBUG_POSTFIX)
@@ -645,7 +674,7 @@ endif()
 ocv_cmake_hook(POST_CMAKE_BUILD_OPTIONS)
 
 # --- Python Support ---
-if(NOT IOS)
+if(NOT IOS AND NOT XROS)
   include(cmake/OpenCVDetectPython.cmake)
 endif()
 
@@ -653,6 +682,51 @@ include(cmake/OpenCVCompilerOptions.cmake)
 
 ocv_cmake_hook(POST_COMPILER_OPTIONS)
 
+# --- CUDA Support ---
+if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
+  if(CMAKE_VERSION VERSION_LESS 3.18)
+    message(WARNING "CUDA: First class language only supported for CMake versions >= 3.18, falling back to FindCUDA!")
+    set(ENABLE_CUDA_FIRST_CLASS_LANGUAGE OFF CACHE BOOL "Enable CUDA as a first class language, if enabled dependant projects will need to use CMake >= 3.18" FORCE)
+  else()
+
+    # Check CUDA_PATH if supplied
+    if(UNIX AND CUDA_PATH AND NOT ENV{CUDA_PATH})
+      set(ENV{CUDA_PATH} ${CUDA_PATH})
+    elseif(WIN32 AND CUDA_PATH)
+      set(ENV{PATH} "${CUDA_PATH}\\bin\;$ENV{PATH}")
+    endif()
+    include(CheckLanguage)
+    check_language(CUDA)
+
+    # Fallback to checking default locations
+    if(NOT CMAKE_CUDA_COMPILER)
+      # Checking windows default search location isn't possible because the CUDA Toolkit is installed to C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/vXX.X
+      if(WIN32)
+        if(CMAKE_GENERATOR MATCHES "Visual Studio")
+          message(STATUS "CUDA: Not detected, when using stand alone installations with the Visual Studio generator the path to the CUDA toolkit should be manually specified with -Tcuda=. e.g. -Tcuda=\"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/vXX.X\"")
+        else()
+          message(STATUS "CUDA: Not detected, for stand alone installations the path to the CUDA toolkit should be manually specified with -DCUDA_PATH=. e.g. -DCUDA_PATH=\"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/vXX.X\"")
+        endif()
+      elseif(UNIX)
+        message(STATUS "CUDA: Not detected, make sure you have performed the mandatory Post-installation actions described in the CUDA installation guide.\n   For stand alone installations you can set the CUDA_PATH environmental or CMake variable. e.g. export CUDA_PATH=/usr/local/cuda-XX.X or -DCUDA_PATH=/usr/local/cuda-XX.X.")
+        message(STATUS "CUDA: Falling back to searching for the CUDA compiler in its default location (/usr/local/cuda)")
+        set(CUDA_PATH "/usr/local/cuda" CACHE INTERNAL "")
+        set(ENV{CUDA_PATH} ${CUDA_PATH})
+        unset(CMAKE_CUDA_COMPILER CACHE)
+        unset(CMAKE_CUDA_COMPILER)
+        check_language(CUDA)
+      endif()
+    endif()
+
+    cmake_policy(SET CMP0092 NEW) # CMake 3.15+: leave warning flags out of default CMAKE_<LANG>_FLAGS flags.
+    if(CMAKE_CUDA_COMPILER)
+      enable_language(CUDA)
+    elseif(UNIX)
+      message(WARNING "CUDA: Not detected!  If you are not using the default host compiler (g++) then you need to specify both CMAKE_CUDA_HOST_COMPILER and CMAKE_CUDA_COMPILER. e.g. -DCMAKE_CUDA_HOST_COMPILER=/usr/bin/clang++ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc.")
+    endif()
+  endif()
+endif()
+
 # ----------------------------------------------------------------------------
 #       CHECK FOR SYSTEM LIBRARIES, OPTIONS, ETC..
 # ----------------------------------------------------------------------------
@@ -730,7 +804,7 @@ include(cmake/OpenCVModule.cmake)
 #  Detect endianness of build platform
 # ----------------------------------------------------------------------------
 
-if(IOS)
+if(IOS OR XROS)
   # test_big_endian needs try_compile, which doesn't work for iOS
   # http://public.kitware.com/Bug/view.php?id=12288
   set(WORDS_BIGENDIAN 0)
@@ -757,9 +831,6 @@ include(cmake/OpenCVFindLibsPerf.cmake)
 include(cmake/OpenCVFindLAPACK.cmake)
 include(cmake/OpenCVFindProtobuf.cmake)
 include(cmake/OpenCVDetectFlatbuffers.cmake)
-if(WITH_TENGINE)
-  include(cmake/OpenCVFindTengine.cmake)
-endif()
 if(WITH_TIMVX)
   include(cmake/OpenCVFindTIMVX.cmake)
 endif()
@@ -850,6 +921,10 @@ endif()
 if(WITH_DIRECTX)
   include(cmake/OpenCVDetectDirectX.cmake)
 endif()
+# --- DirectML ---
+if(WITH_DIRECTML)
+  include(cmake/OpenCVDetectDirectML.cmake)
+endif()
 
 if(WITH_VTK)
   include(cmake/OpenCVDetectVTK.cmake)
@@ -898,6 +973,13 @@ if(HAVE_OPENVX)
   endif()
 endif()
 
+if(WITH_KLEIDICV)
+  ocv_debug_message(STATUS "Enable KleidiCV acceleration")
+  if(NOT ";${OpenCV_HAL};" MATCHES ";kleidicv;")
+    set(OpenCV_HAL "kleidicv;${OpenCV_HAL}")
+  endif()
+endif()
+
 if(WITH_CAROTENE)
   ocv_debug_message(STATUS "Enable carotene acceleration")
   if(NOT ";${OpenCV_HAL};" MATCHES ";carotene;")
@@ -905,15 +987,42 @@ if(WITH_CAROTENE)
   endif()
 endif()
 
+if(WITH_NDSRVP)
+  ocv_debug_message(STATUS "Andes RVP 3rdparty NDSRVP enabled")
+  if(NOT ";${OpenCV_HAL};" MATCHES ";ndsrvp;")
+    set(OpenCV_HAL "ndsrvp;${OpenCV_HAL}")
+  endif()
+endif()
+
 foreach(hal ${OpenCV_HAL})
   if(hal STREQUAL "carotene")
     if(";${CPU_BASELINE_FINAL};" MATCHES ";NEON;")
       add_subdirectory(3rdparty/carotene/hal)
       ocv_hal_register(CAROTENE_HAL_LIBRARIES CAROTENE_HAL_HEADERS CAROTENE_HAL_INCLUDE_DIRS)
-      list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION})")
+
+      if( NOT DEFINED CAROTENE_NEON_ARCH)
+          set(CAROTENE_NEON_MSG "Auto detected")
+      elseif( CAROTENE_NEON_ARCH GREATER 7)
+          set(CAROTENE_NEON_MSG "Force ARMv8+")
+      else()
+          set(CAROTENE_NEON_MSG "Force ARMv7")
+      endif()
+      list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION}, ${CAROTENE_NEON_MSG})")
     else()
       message(STATUS "Carotene: NEON is not available, disabling carotene...")
     endif()
+  elseif(hal STREQUAL "kleidicv")
+    add_subdirectory(3rdparty/kleidicv)
+    ocv_hal_register(KLEIDICV_HAL_LIBRARIES KLEIDICV_HAL_HEADERS KLEIDICV_HAL_INCLUDE_DIRS)
+    list(APPEND OpenCV_USED_HAL "KleidiCV (ver ${KLEIDICV_HAL_VERSION})")
+  elseif(hal STREQUAL "ndsrvp")
+    if(CMAKE_C_FLAGS MATCHES "-mext-dsp" AND CMAKE_CXX_FLAGS MATCHES "-mext-dsp" AND NOT ";${CPU_BASELINE_FINAL};" MATCHES ";RVV;")
+      add_subdirectory(3rdparty/ndsrvp)
+      ocv_hal_register(NDSRVP_HAL_LIBRARIES NDSRVP_HAL_HEADERS NDSRVP_HAL_INCLUDE_DIRS)
+      list(APPEND OpenCV_USED_HAL "ndsrvp (ver ${NDSRVP_HAL_VERSION})")
+    else()
+      message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not open, disabling ndsrvp...")
+    endif()
   elseif(hal STREQUAL "openvx")
     add_subdirectory(3rdparty/openvx)
     ocv_hal_register(OPENVX_HAL_LIBRARIES OPENVX_HAL_HEADERS OPENVX_HAL_INCLUDE_DIRS)
@@ -982,10 +1091,6 @@ if(BUILD_EXAMPLES OR BUILD_ANDROID_EXAMPLES OR INSTALL_ANDROID_EXAMPLES OR INSTA
   add_subdirectory(samples)
 endif()
 
-if(ANDROID)
-  add_subdirectory(platforms/android/service)
-endif()
-
 # ----------------------------------------------------------------------------
 # Finalization: generate configuration-based files
 # ----------------------------------------------------------------------------
@@ -1006,7 +1111,7 @@ include(cmake/OpenCVGenAndroidMK.cmake)
 # Generate OpenCVConfig.cmake and OpenCVConfig-version.cmake for cmake projects
 include(cmake/OpenCVGenConfig.cmake)
 
-# Generate Info.plist for the IOS framework
+# Generate Info.plist for the iOS/visionOS framework
 if(APPLE_FRAMEWORK)
   include(cmake/OpenCVGenInfoPlist.cmake)
 endif()
@@ -1297,21 +1402,12 @@ status("")
 status("  GUI: " "${OPENCV_HIGHGUI_BUILTIN_BACKEND}")
 
 if(WITH_WAYLAND OR HAVE_WAYLAND)
-  if(HAVE_WAYLAND_CLIENT)
-    status("    Wayland Client:" "YES (ver ${WAYLAND_CLIENT_VERSION})")
-  endif()
-  if(HAVE_WAYLAND_CURSOR)
-    status("    Wayland Cursor:" "YES (ver ${WAYLAND_CURSOR_VERSION})")
-  endif()
-  if(HAVE_WAYLAND_PROTOCOL)
-    status("    Wayland Protocol:" "YES (ver ${WAYLAND_PROTOCOL_VERSION})")
-  endif()
-  if(HAVE_WAYLAND_EGL)
-    status("    Wayland EGL:" "YES (ver ${WAYLAND_EGL_VERSION})")
-  endif()
-  if(HAVE_XKBCOMMON)
-    status("    Xkbcommon:" "YES (ver ${XKBCOMMON_VERSION})")
-  endif()
+  status("    Wayland:" HAVE_WAYLAND THEN "(Experimental) YES" ELSE "NO")
+  status("      Wayland Client:" HAVE_WAYLAND_CLIENT THEN "YES (ver ${WAYLAND_CLIENT_VERSION})" ELSE "NO")
+  status("      Wayland Cursor:" HAVE_WAYLAND_CURSOR THEN "YES (ver ${WAYLAND_CURSOR_VERSION})" ELSE "NO")
+  status("      Wayland Protocols:" HAVE_WAYLAND_PROTOCOLS THEN "YES (ver ${WAYLAND_PROTOCOLS_VERSION})" ELSE "NO")
+  status("      Xkbcommon:" HAVE_XKBCOMMON THEN "YES (ver ${XKBCOMMON_VERSION})" ELSE "NO")
+  status("      Wayland EGL(Option):" HAVE_WAYLAND_EGL THEN "YES (ver ${WAYLAND_EGL_VERSION})" ELSE "NO")
 endif()
 
 if(WITH_QT OR HAVE_QT)
@@ -1364,12 +1460,16 @@ endif()
 # ========================== MEDIA IO ==========================
 status("")
 status("  Media I/O: ")
-status("    ZLib:"   ZLIB_FOUND THEN "${ZLIB_LIBRARIES} (ver ${ZLIB_VERSION_STRING})" ELSE "build (ver ${ZLIB_VERSION_STRING})")
+if(WITH_ZLIB_NG OR HAVE_ZLIB_NG)
+  status("    ZLib-Ng:" "build (zlib ver ${ZLIB_VERSION_STRING}, zlib-ng ver ${ZLIBNG_VERSION_STRING})")
+else()
+  status("    ZLib:"   ZLIB_FOUND THEN "${ZLIB_LIBRARIES} (ver ${ZLIB_VERSION_STRING})" ELSE "build (ver ${ZLIB_VERSION_STRING})")
+endif()
 
 if(WITH_JPEG OR HAVE_JPEG)
   if(NOT HAVE_JPEG)
     status("    JPEG:" NO)
-  elseif(BUILD_JPEG)
+  elseif(BUILD_JPEG OR NOT JPEG_FOUND)
     status("    JPEG:" "build-${JPEG_LIBRARY} (ver ${JPEG_LIB_VERSION})")
     if(ENABLE_LIBJPEG_TURBO_SIMD)
       status("      SIMD Support Request:" "YES")
@@ -1391,18 +1491,44 @@ if(WITH_WEBP OR HAVE_WEBP)
 endif()
 
 if(WITH_AVIF OR HAVE_AVIF)
-  if(AVIF_VERSION)
-    status("    AVIF:" AVIF_FOUND THEN "${AVIF_LIBRARY} (ver ${AVIF_VERSION})" ELSE "NO")
+  if(libavif_VERSION)
+    status("    AVIF:" AVIF_FOUND THEN "${AVIF_LIBRARY} (ver ${libavif_VERSION})" ELSE "NO")
   else()
     status("    AVIF:" AVIF_FOUND THEN "${AVIF_LIBRARY}" ELSE "NO")
   endif()
 endif()
 
-if(WITH_PNG OR HAVE_PNG OR WITH_SPNG)
-  if(WITH_SPNG)
+if(WITH_SPNG)
+  if(BUILD_SPNG)
     status("    PNG:" "build-${SPNG_LIBRARY} (ver ${SPNG_VERSION})")
+  elseif(HAVE_SPNG)
+    status("    PNG:" "${SPNG_LIBRARY} (ver ${SPNG_VERSION})")
+  endif()
+elseif(WITH_PNG OR HAVE_PNG)
+  status("    PNG:"  PNG_FOUND  THEN "${PNG_LIBRARY} (ver ${PNG_VERSION_STRING})" ELSE "build (ver ${PNG_VERSION_STRING})")
+  if(BUILD_PNG AND PNG_HARDWARE_OPTIMIZATIONS)
+    status("      SIMD Support Request:" "YES")
+    if(PNG_INTEL_SSE)
+    status("      SIMD Support:" "YES (Intel SSE)")
+  elseif(PNG_POWERPC_VSX)
+    status("      SIMD Support:" "YES (PowerPC VSX)")
+  elseif(PNG_ARM_NEON)
+    status("      SIMD Support:" "YES (Arm NEON)")
+  elseif(PNG_MIPS_MSA OR PNG_MIPS_MMI)
+    if(PNG_MIPS_MSA AND PNG_MIPS_MMI)
+      status("      SIMD Support:" "YES (Mips MSA & MMI)")
+    elseif(PNG_MIPS_MSA AND NOT PNG_MIPS_MMI)
+      status("      SIMD Support:" "YES (Mips MSA)")
+    else()
+      status("      SIMD Support:" "YES (Mips MMI)")
+    endif()
+  elseif(PNG_LOONGARCH_LSX)
+    status("      SIMD Support:" "YES (LoongArch LSX)")
   else()
-    status("    PNG:"  PNG_FOUND  THEN "${PNG_LIBRARY} (ver ${PNG_VERSION})" ELSE "build (ver ${PNG_VERSION})")
+    status("      SIMD Support:" "NO")
+  endif()
+  elseif(BUILD_PNG)
+    status("      SIMD Support Request:" "NO")
   endif()
 endif()
 
@@ -1412,7 +1538,7 @@ endif()
 
 if(HAVE_OPENJPEG)
   status("    JPEG 2000:" OpenJPEG_FOUND
-      THEN "OpenJPEG (ver ${OPENJPEG_MAJOR_VERSION}.${OPENJPEG_MINOR_VERSION}.${OPENJPEG_BUILD_VERSION})"
+      THEN "OpenJPEG (ver ${OPENJPEG_VERSION})"
       ELSE "build (ver ${OPENJPEG_VERSION})"
   )
 elseif(HAVE_JASPER)
@@ -1553,6 +1679,11 @@ if(WITH_GPHOTO2 OR HAVE_GPHOTO2)
   status("    gPhoto2:"        HAVE_GPHOTO2        THEN "YES"                                 ELSE NO)
 endif()
 
+if(ANDROID)
+  status("   MEDIANDK:"         HAVE_ANDROID_MEDIANDK THEN "YES"                              ELSE NO)
+  status("   NDK Camera:"       HAVE_ANDROID_NATIVE_CAMERA THEN "YES"                         ELSE NO)
+endif()
+
 # Order is similar to CV_PARALLEL_FRAMEWORK in core/src/parallel.cpp
 ocv_build_features_string(parallel_status EXCLUSIVE
   IF HAVE_TBB THEN "TBB (ver ${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR} interface ${TBB_INTERFACE_VERSION})"
@@ -1608,10 +1739,6 @@ if(WITH_VA OR HAVE_VA)
   status("    VA:"            HAVE_VA          THEN "YES" ELSE NO)
 endif()
 
-if(WITH_TENGINE OR HAVE_TENGINE)
-  status("    Tengine:"      HAVE_TENGINE     THEN "YES (${TENGINE_LIBRARIES})" ELSE NO)
-endif()
-
 if(WITH_LAPACK OR HAVE_LAPACK)
   status("    Lapack:"      HAVE_LAPACK     THEN "YES (${LAPACK_LIBRARIES})" ELSE NO)
 endif()
@@ -1678,6 +1805,10 @@ else()
   endif()
 endif()
 
+if(BUILD_opencv_dnn AND OPENCV_DNN_BACKEND_DEFAULT)
+    status("    Default DNN backend:" ${OPENCV_DNN_BACKEND_DEFAULT})
+endif()
+
 if(WITH_EIGEN OR HAVE_EIGEN)
   status("    Eigen:"      HAVE_EIGEN       THEN "YES (ver ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})" ELSE NO)
 endif()
@@ -1798,6 +1929,7 @@ if(BUILD_opencv_python3)
   else()
     status("    Libraries:"   HAVE_opencv_python3  THEN  "${PYTHON3_LIBRARIES}"                                      ELSE NO)
   endif()
+  status("    Limited API:" PYTHON3_LIMITED_API THEN "YES (ver ${PYTHON3_LIMITED_API_VERSION})"                    ELSE NO)
   status("    numpy:"         PYTHON3_NUMPY_INCLUDE_DIRS THEN "${PYTHON3_NUMPY_INCLUDE_DIRS} (ver ${PYTHON3_NUMPY_VERSION})" ELSE "NO (Python3 wrappers can not be generated)")
   status("    install path:"  HAVE_opencv_python3  THEN "${__INSTALL_PATH_PYTHON3}"                            ELSE "-")
 endif()
@@ -1815,7 +1947,7 @@ if(BUILD_JAVA)
     status("    JNI:"         JNI_INCLUDE_DIRS    THEN "${JNI_INCLUDE_DIRS}"                                       ELSE NO)
   endif()
   status("    Java wrappers:" HAVE_opencv_java                                                            THEN "YES (${OPENCV_JAVA_SDK_BUILD_TYPE})" ELSE NO)
-  status("    Java tests:"    BUILD_TESTS AND opencv_test_java_BINARY_DIR                                 THEN YES ELSE NO)
+  status("    Java tests:"    BUILD_TESTS AND (opencv_test_java_BINARY_DIR OR opencv_test_android_BINARY_DIR) THEN YES ELSE NO)
 endif()
 
 # ========================== Objective-C =======================
diff --git a/README.md b/README.md
index c6263728aaf8..8217dde1588e 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,6 @@
 ## OpenCV: Open Source Computer Vision Library
 
+
 ### Resources
 
 * Homepage: <https://opencv.org>
@@ -8,7 +9,8 @@
 * Q&A forum: <https://forum.opencv.org>
   * previous forum (read only): <http://answers.opencv.org>
 * Issue tracking: <https://github.com/opencv/opencv/issues>
-* Additional OpenCV functionality: <https://github.com/opencv/opencv_contrib> 
+* Additional OpenCV functionality: <https://github.com/opencv/opencv_contrib>
+* Donate to OpenCV: <https://opencv.org/support/>
 
 
 ### Contributing
@@ -22,3 +24,13 @@ Please read the [contribution guidelines](https://github.com/opencv/opencv/wiki/
 * Include tests and documentation;
 * Clean up "oops" commits before submitting;
 * Follow the [coding style guide](https://github.com/opencv/opencv/wiki/Coding_Style_Guide).
+
+### Additional Resources
+
+* [Submit your OpenCV-based project](https://form.jotform.com/233105358823151) for inclusion in Community Friday on opencv.org
+* [Subscribe to the OpenCV YouTube Channel](http://youtube.com/@opencvofficial) featuring OpenCV Live, an hour-long streaming show
+* [Follow OpenCV on LinkedIn](http://linkedin.com/company/opencv/) for daily posts showing the state-of-the-art in computer vision & AI
+* [Apply to be an OpenCV Volunteer](https://form.jotform.com/232745316792159) to help organize events and online campaigns as well as amplify them
+* [Follow OpenCV on Mastodon](http://mastodon.social/@opencv) in the Fediverse
+* [Follow OpenCV on Twitter](https://twitter.com/opencvlive)
+* [OpenCV.ai](https://opencv.ai): Computer Vision and AI development services from the OpenCV team.
diff --git a/apps/createsamples/utility.cpp b/apps/createsamples/utility.cpp
index 198f4c2eb763..b57d3fcb81a1 100644
--- a/apps/createsamples/utility.cpp
+++ b/apps/createsamples/utility.cpp
@@ -1168,7 +1168,7 @@ void cvCreateTestSamples( const char* infoname,
             }
             else
             {
-                filename++; // character after last / or \
+                filename++; // get basename after last path delimiter
             }
 
             count = MIN( count, cvbgdata->count );
diff --git a/apps/model-diagnostics/model_diagnostics.cpp b/apps/model-diagnostics/model_diagnostics.cpp
index 6970c8507108..365833c9e535 100644
--- a/apps/model-diagnostics/model_diagnostics.cpp
+++ b/apps/model-diagnostics/model_diagnostics.cpp
@@ -32,12 +32,36 @@ static std::string checkFileExists(const std::string& fileName)
          "Please, specify a full path to the file.");
 }
 
+static std::vector<int> parseShape(const std::string &shape_str) {
+    std::stringstream ss(shape_str);
+    std::string item;
+    std::vector<std::string> items;
+
+    while (std::getline(ss, item, ',')) {
+        items.push_back(item);
+    }
+
+    std::vector<int> shape;
+    for (size_t i = 0; i < items.size(); i++) {
+        shape.push_back(std::stoi(items[i]));
+    }
+    return shape;
+}
+
 std::string diagnosticKeys =
         "{ model m     | | Path to the model file. }"
         "{ config c    | | Path to the model configuration file. }"
-        "{ framework f | | [Optional] Name of the model framework. }";
-
-
+        "{ framework f | | [Optional] Name of the model framework. }"
+        "{ input0_name | | [Optional] Name of input0. Use with input0_shape}"
+        "{ input0_shape | | [Optional] Shape of input0. Use with input0_name}"
+        "{ input1_name | | [Optional] Name of input1. Use with input1_shape}"
+        "{ input1_shape | | [Optional] Shape of input1. Use with input1_name}"
+        "{ input2_name | | [Optional] Name of input2. Use with input2_shape}"
+        "{ input2_shape | | [Optional] Shape of input2. Use with input2_name}"
+        "{ input3_name | | [Optional] Name of input3. Use with input3_shape}"
+        "{ input3_shape | | [Optional] Shape of input3. Use with input3_name}"
+        "{ input4_name | | [Optional] Name of input4. Use with input4_shape}"
+        "{ input4_shape | | [Optional] Shape of input4. Use with input4_name}";
 
 int main( int argc, const char** argv )
 {
@@ -55,6 +79,17 @@ int main( int argc, const char** argv )
     std::string config = checkFileExists(argParser.get<std::string>("config"));
     std::string frameworkId = argParser.get<std::string>("framework");
 
+    std::string input0_name = argParser.get<std::string>("input0_name");
+    std::string input0_shape = argParser.get<std::string>("input0_shape");
+    std::string input1_name = argParser.get<std::string>("input1_name");
+    std::string input1_shape = argParser.get<std::string>("input1_shape");
+    std::string input2_name = argParser.get<std::string>("input2_name");
+    std::string input2_shape = argParser.get<std::string>("input2_shape");
+    std::string input3_name = argParser.get<std::string>("input3_name");
+    std::string input3_shape = argParser.get<std::string>("input3_shape");
+    std::string input4_name = argParser.get<std::string>("input4_name");
+    std::string input4_shape = argParser.get<std::string>("input4_shape");
+
     CV_Assert(!model.empty());
 
     enableModelDiagnostics(true);
@@ -63,5 +98,50 @@ int main( int argc, const char** argv )
 
     Net ocvNet = readNet(model, config, frameworkId);
 
+    std::vector<std::string> input_names;
+    std::vector<std::vector<int>> input_shapes;
+    if (!input0_name.empty() || !input0_shape.empty()) {
+        CV_CheckFalse(input0_name.empty(), "input0_name cannot be empty");
+        CV_CheckFalse(input0_shape.empty(), "input0_shape cannot be empty");
+        input_names.push_back(input0_name);
+        input_shapes.push_back(parseShape(input0_shape));
+    }
+    if (!input1_name.empty() || !input1_shape.empty()) {
+        CV_CheckFalse(input1_name.empty(), "input1_name cannot be empty");
+        CV_CheckFalse(input1_shape.empty(), "input1_shape cannot be empty");
+        input_names.push_back(input1_name);
+        input_shapes.push_back(parseShape(input1_shape));
+    }
+    if (!input2_name.empty() || !input2_shape.empty()) {
+        CV_CheckFalse(input2_name.empty(), "input2_name cannot be empty");
+        CV_CheckFalse(input2_shape.empty(), "input2_shape cannot be empty");
+        input_names.push_back(input2_name);
+        input_shapes.push_back(parseShape(input2_shape));
+    }
+    if (!input3_name.empty() || !input3_shape.empty()) {
+        CV_CheckFalse(input3_name.empty(), "input3_name cannot be empty");
+        CV_CheckFalse(input3_shape.empty(), "input3_shape cannot be empty");
+        input_names.push_back(input3_name);
+        input_shapes.push_back(parseShape(input3_shape));
+    }
+    if (!input4_name.empty() || !input4_shape.empty()) {
+        CV_CheckFalse(input4_name.empty(), "input4_name cannot be empty");
+        CV_CheckFalse(input4_shape.empty(), "input4_shape cannot be empty");
+        input_names.push_back(input4_name);
+        input_shapes.push_back(parseShape(input4_shape));
+    }
+
+    if (!input_names.empty() && !input_shapes.empty() && input_names.size() == input_shapes.size()) {
+        ocvNet.setInputsNames(input_names);
+        for (size_t i = 0; i < input_names.size(); i++) {
+            Mat input(input_shapes[i], CV_32F);
+            ocvNet.setInput(input, input_names[i]);
+        }
+
+        size_t dot_index = model.rfind('.');
+        std::string graph_filename = model.substr(0, dot_index) + ".pbtxt";
+        ocvNet.dumpToPbtxt(graph_filename);
+    }
+
     return 0;
 }
diff --git a/apps/traincascade/boost.cpp b/apps/traincascade/boost.cpp
index d409216b8a95..c1a451672a35 100644
--- a/apps/traincascade/boost.cpp
+++ b/apps/traincascade/boost.cpp
@@ -86,10 +86,10 @@ static CvMat* cvPreprocessIndexArray( const CvMat* idx_arr, int data_arr_size, b
     int* dsti;
 
     if( !CV_IS_MAT(idx_arr) )
-        CV_ERROR( CV_StsBadArg, "Invalid index array" );
+        CV_ERROR( cv::Error::StsBadArg, "Invalid index array" );
 
     if( idx_arr->rows != 1 && idx_arr->cols != 1 )
-        CV_ERROR( CV_StsBadSize, "the index array must be 1-dimensional" );
+        CV_ERROR( cv::Error::StsBadSize, "the index array must be 1-dimensional" );
 
     idx_total = idx_arr->rows + idx_arr->cols - 1;
     srcb = idx_arr->data.ptr;
@@ -105,20 +105,20 @@ static CvMat* cvPreprocessIndexArray( const CvMat* idx_arr, int data_arr_size, b
         // idx_arr is array of 1's and 0's -
         // i.e. it is a mask of the selected components
         if( idx_total != data_arr_size )
-            CV_ERROR( CV_StsUnmatchedSizes,
+            CV_ERROR( cv::Error::StsUnmatchedSizes,
             "Component mask should contain as many elements as the total number of input variables" );
 
         for( i = 0; i < idx_total; i++ )
             idx_selected += srcb[i*step] != 0;
 
         if( idx_selected == 0 )
-            CV_ERROR( CV_StsOutOfRange, "No components/input_variables is selected!" );
+            CV_ERROR( cv::Error::StsOutOfRange, "No components/input_variables is selected!" );
 
         break;
     case CV_32SC1:
         // idx_arr is array of integer indices of selected components
         if( idx_total > data_arr_size )
-            CV_ERROR( CV_StsOutOfRange,
+            CV_ERROR( cv::Error::StsOutOfRange,
             "index array may not contain more elements than the total number of input variables" );
         idx_selected = idx_total;
         // check if sorted already
@@ -134,7 +134,7 @@ static CvMat* cvPreprocessIndexArray( const CvMat* idx_arr, int data_arr_size, b
         }
         break;
     default:
-        CV_ERROR( CV_StsUnsupportedFormat, "Unsupported index array data type "
+        CV_ERROR( cv::Error::StsUnsupportedFormat, "Unsupported index array data type "
                                            "(it should be 8uC1, 8sC1 or 32sC1)" );
     }
 
@@ -156,13 +156,13 @@ static CvMat* cvPreprocessIndexArray( const CvMat* idx_arr, int data_arr_size, b
             qsort( dsti, idx_total, sizeof(dsti[0]), icvCmpIntegers );
 
         if( dsti[0] < 0 || dsti[idx_total-1] >= data_arr_size )
-            CV_ERROR( CV_StsOutOfRange, "the index array elements are out of range" );
+            CV_ERROR( cv::Error::StsOutOfRange, "the index array elements are out of range" );
 
         if( check_for_duplicates )
         {
             for( i = 1; i < idx_total; i++ )
                 if( dsti[i] <= dsti[i-1] )
-                    CV_ERROR( CV_StsBadArg, "There are duplicated index array elements" );
+                    CV_ERROR( cv::Error::StsBadArg, "There are duplicated index array elements" );
         }
     }
 
@@ -218,7 +218,7 @@ bool CvCascadeBoostParams::read( const FileNode &node )
                  !boostTypeStr.compare( CC_LOGIT_BOOST ) ? CvBoost::LOGIT :
                  !boostTypeStr.compare( CC_GENTLE_BOOST ) ? CvBoost::GENTLE : -1;
     if (boost_type == -1)
-        CV_Error( CV_StsBadArg, "unsupported Boost type" );
+        CV_Error( cv::Error::StsBadArg, "unsupported Boost type" );
     node[CC_MINHITRATE] >> minHitRate;
     node[CC_MAXFALSEALARM] >> maxFalseAlarm;
     node[CC_TRIM_RATE] >> weight_trim_rate ;
@@ -228,7 +228,7 @@ bool CvCascadeBoostParams::read( const FileNode &node )
          maxFalseAlarm <= 0 || maxFalseAlarm > 1 ||
          weight_trim_rate <= 0 || weight_trim_rate > 1 ||
          max_depth <= 0 || weak_count <= 0 )
-        CV_Error( CV_StsBadArg, "bad parameters range");
+        CV_Error( cv::Error::StsBadArg, "bad parameters range");
     return true;
 }
 
@@ -309,7 +309,7 @@ CvDTreeNode* CvCascadeBoostTrainData::subsample_data( const CvMat* _subsample_id
     bool isMakeRootCopy = true;
 
     if( !data_root )
-        CV_Error( CV_StsError, "No training data has been set" );
+        CV_Error( cv::Error::StsError, "No training data has been set" );
 
     if( _subsample_idx )
     {
@@ -547,7 +547,7 @@ void CvCascadeBoostTrainData::setData( const CvFeatureEvaluator* _featureEvaluat
     // TODO: check responses: elements must be 0 or 1
 
     if( _precalcValBufSize < 0 || _precalcIdxBufSize < 0)
-        CV_Error( CV_StsOutOfRange, "_numPrecalcVal and _numPrecalcIdx must be positive or 0" );
+        CV_Error( cv::Error::StsOutOfRange, "_numPrecalcVal and _numPrecalcIdx must be positive or 0" );
 
     var_count = var_all = featureEvaluator->getNumFeatures() * featureEvaluator->getFeatureSize();
     sample_count = _numSamples;
@@ -602,7 +602,7 @@ void CvCascadeBoostTrainData::setData( const CvFeatureEvaluator* _featureEvaluat
 
     if ((uint64)effective_buf_width * (uint64)effective_buf_height != effective_buf_size)
     {
-        CV_Error(CV_StsBadArg, "The memory buffer cannot be allocated since its size exceeds integer fields limit");
+        CV_Error(cv::Error::StsBadArg, "The memory buffer cannot be allocated since its size exceeds integer fields limit");
     }
 
     if ( is_buf_16u )
@@ -914,7 +914,7 @@ CvDTreeNode* CvCascadeBoostTree::predict( int sampleIdx ) const
 {
     CvDTreeNode* node = root;
     if( !node )
-        CV_Error( CV_StsError, "The tree has not been trained yet" );
+        CV_Error( cv::Error::StsError, "The tree has not been trained yet" );
 
     if ( ((CvCascadeBoostTrainData*)data)->featureEvaluator->getMaxCatCount() == 0 ) // ordered
     {
diff --git a/apps/traincascade/cascadeclassifier.cpp b/apps/traincascade/cascadeclassifier.cpp
index e364f2e3edfe..6540c30e3a4f 100644
--- a/apps/traincascade/cascadeclassifier.cpp
+++ b/apps/traincascade/cascadeclassifier.cpp
@@ -142,7 +142,7 @@ bool CvCascadeClassifier::train( const string _cascadeDirName,
     double time = (double)getTickCount();
 
     if( _cascadeDirName.empty() || _posFilename.empty() || _negFilename.empty() )
-        CV_Error( CV_StsBadArg, "_cascadeDirName or _bgfileName or _vecFileName is NULL" );
+        CV_Error( cv::Error::StsBadArg, "_cascadeDirName or _bgfileName or _vecFileName is NULL" );
 
     string dirName;
     if (_cascadeDirName.find_last_of("/\\") == (_cascadeDirName.length() - 1) )
@@ -452,7 +452,7 @@ void CvCascadeClassifier::save( const string filename, bool baseFormat )
         //char buf[256];
         CvSeq* weak;
         if ( cascadeParams.featureType != CvFeatureParams::HAAR )
-            CV_Error( CV_StsBadFunc, "old file format is used for Haar-like features only");
+            CV_Error( cv::Error::StsBadFunc, "old file format is used for Haar-like features only");
         fs << "{:" ICV_HAAR_TYPE_ID;
         fs << ICV_HAAR_SIZE_NAME << "[:" << cascadeParams.winSize.width <<
             cascadeParams.winSize.height << "]";
diff --git a/apps/traincascade/imagestorage.cpp b/apps/traincascade/imagestorage.cpp
index f220e5c2b388..a32824c31788 100644
--- a/apps/traincascade/imagestorage.cpp
+++ b/apps/traincascade/imagestorage.cpp
@@ -138,7 +138,7 @@ bool CvCascadeImageReader::PosReader::create( const string _filename )
         fread( &vecSize, sizeof( vecSize ), 1, file ) != 1 ||
         fread( &tmp, sizeof( tmp ), 1, file ) != 1 ||
         fread( &tmp, sizeof( tmp ), 1, file ) != 1 )
-        CV_Error_( CV_StsParseError, ("wrong file format for %s\n", _filename.c_str()) );
+        CV_Error_( cv::Error::StsParseError, ("wrong file format for %s\n", _filename.c_str()) );
     base = sizeof( count ) + sizeof( vecSize ) + 2*sizeof( tmp );
     if( feof( file ) )
         return false;
@@ -154,14 +154,14 @@ bool CvCascadeImageReader::PosReader::get( Mat &_img )
     uchar tmp = 0;
     size_t elements_read = fread( &tmp, sizeof( tmp ), 1, file );
     if( elements_read != 1 )
-        CV_Error( CV_StsBadArg, "Can not get new positive sample. The most possible reason is "
+        CV_Error( cv::Error::StsBadArg, "Can not get new positive sample. The most possible reason is "
                                 "insufficient count of samples in given vec-file.\n");
     elements_read = fread( vec, sizeof( vec[0] ), vecSize, file );
     if( elements_read != (size_t)(vecSize) )
-        CV_Error( CV_StsBadArg, "Can not get new positive sample. Seems that vec-file has incorrect structure.\n");
+        CV_Error( cv::Error::StsBadArg, "Can not get new positive sample. Seems that vec-file has incorrect structure.\n");
 
     if( feof( file ) || last++ >= count )
-        CV_Error( CV_StsBadArg, "Can not get new positive sample. vec-file is over.\n");
+        CV_Error( cv::Error::StsBadArg, "Can not get new positive sample. vec-file is over.\n");
 
     for( int r = 0; r < _img.rows; r++ )
     {
diff --git a/apps/traincascade/old_ml_boost.cpp b/apps/traincascade/old_ml_boost.cpp
index 2f53bd903110..3dbd0204b144 100644
--- a/apps/traincascade/old_ml_boost.cpp
+++ b/apps/traincascade/old_ml_boost.cpp
@@ -991,7 +991,7 @@ CvBoost::set_params( const CvBoostParams& _params )
     params = _params;
     if( params.boost_type != DISCRETE && params.boost_type != REAL &&
         params.boost_type != LOGIT && params.boost_type != GENTLE )
-        CV_ERROR( CV_StsBadArg, "Unknown/unsupported boosting type" );
+        CV_ERROR( cv::Error::StsBadArg, "Unknown/unsupported boosting type" );
 
     params.weak_count = MAX( params.weak_count, 1 );
     params.weight_trim_rate = MAX( params.weight_trim_rate, 0. );
@@ -1045,7 +1045,7 @@ CvBoost::train( const CvMat* _train_data, int _tflag,
             _sample_idx, _var_type, _missing_mask, _params, true, true );
 
         if( data->get_num_classes() != 2 )
-            CV_ERROR( CV_StsNotImplemented,
+            CV_ERROR( cv::Error::StsNotImplemented,
             "Boosted trees can only be used for 2-class classification." );
         CV_CALL( storage = cvCreateMemStorage() );
         weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );
@@ -1482,7 +1482,7 @@ CvBoost::get_active_vars( bool absolute_idx )
     __BEGIN__;
 
     if( !weak )
-        CV_ERROR( CV_StsError, "The boosted tree ensemble has not been trained yet" );
+        CV_ERROR( cv::Error::StsError, "The boosted tree ensemble has not been trained yet" );
 
     if( !active_vars || !active_vars_abs )
     {
@@ -1612,13 +1612,13 @@ CvBoost::predict( const CvMat* _sample, const CvMat* _missing,
     const float* sample_data;
 
     if( !weak )
-        CV_Error( CV_StsError, "The boosted tree ensemble has not been trained yet" );
+        CV_Error( cv::Error::StsError, "The boosted tree ensemble has not been trained yet" );
 
     if( !CV_IS_MAT(_sample) || CV_MAT_TYPE(_sample->type) != CV_32FC1 ||
         (_sample->cols != 1 && _sample->rows != 1) ||
         (_sample->cols + _sample->rows - 1 != data->var_all && !raw_mode) ||
         (active_vars && _sample->cols + _sample->rows - 1 != active_vars->cols && raw_mode) )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
         "the input sample must be 1d floating-point vector with the same "
         "number of elements as the total number of variables or "
         "as the number of variables used for training" );
@@ -1627,7 +1627,7 @@ CvBoost::predict( const CvMat* _sample, const CvMat* _missing,
     {
         if( !CV_IS_MAT(_missing) || !CV_IS_MASK_ARR(_missing) ||
             !CV_ARE_SIZES_EQ(_missing, _sample) )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
             "the missing data mask must be 8-bit vector of the same size as input sample" );
     }
 
@@ -1644,7 +1644,7 @@ CvBoost::predict( const CvMat* _sample, const CvMat* _missing,
             CV_MAT_TYPE(weak_responses->type) != CV_32FC1 ||
             (weak_responses->cols != 1 && weak_responses->rows != 1) ||
             weak_responses->cols + weak_responses->rows - 1 != weak_count )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
             "The output matrix of weak classifier responses must be valid "
             "floating-point vector of the same number of components as the length of input slice" );
         wstep = CV_IS_MAT_CONT(weak_responses->type) ? 1 : weak_responses->step/sizeof(float);
@@ -1700,7 +1700,7 @@ CvBoost::predict( const CvMat* _sample, const CvMat* _missing,
                     c = a;
                 int ival = cvRound(val);
                 if ( (ival != val) && (!m) )
-                    CV_Error( CV_StsBadArg,
+                    CV_Error( cv::Error::StsBadArg,
                         "one of input categorical variable is not an integer" );
 
                 while( a < b )
@@ -1735,7 +1735,7 @@ CvBoost::predict( const CvMat* _sample, const CvMat* _missing,
     else
     {
         if( !CV_IS_MAT_CONT(_sample->type & (_missing ? _missing->type : -1)) )
-            CV_Error( CV_StsBadArg, "In raw mode the input vectors must be continuous" );
+            CV_Error( cv::Error::StsBadArg, "In raw mode the input vectors must be continuous" );
     }
 
     cvStartReadSeq( weak, &reader );
@@ -1951,7 +1951,7 @@ void CvBoost::read_params( cv::FileNode& fnode )
         params.boost_type = temp.empty() ? -1 : (int)temp;
 
     if( params.boost_type < DISCRETE || params.boost_type > GENTLE )
-        CV_ERROR( CV_StsBadArg, "Unknown boosting type" );
+        CV_ERROR( cv::Error::StsBadArg, "Unknown boosting type" );
 
     temp = fnode[ "splitting_criteria" ];
     if( !temp.empty() && temp.isString() )
@@ -1966,7 +1966,7 @@ void CvBoost::read_params( cv::FileNode& fnode )
         params.split_criteria = temp.empty() ? -1 : (int) temp;
 
     if( params.split_criteria < DEFAULT || params.boost_type > SQERR )
-        CV_ERROR( CV_StsBadArg, "Unknown boosting type" );
+        CV_ERROR( cv::Error::StsBadArg, "Unknown boosting type" );
 
     params.weak_count = (int) fnode[ "ntrees" ];
     params.weight_trim_rate = (double)fnode["weight_trimming_rate"];
@@ -1996,13 +1996,13 @@ CvBoost::read( cv::FileNode& node )
 
     trees_fnode =  node[ "trees" ];
     if( trees_fnode.empty() || !trees_fnode.isSeq() )
-        CV_ERROR( CV_StsParseError, "<trees> tag is missing" );
+        CV_ERROR( cv::Error::StsParseError, "<trees> tag is missing" );
 
     reader = trees_fnode.begin();
     ntrees = (int) trees_fnode.size();
 
     if( ntrees != params.weak_count )
-        CV_ERROR( CV_StsUnmatchedSizes,
+        CV_ERROR( cv::Error::StsUnmatchedSizes,
         "The number of trees stored does not match <ntrees> tag value" );
 
     CV_CALL( storage = cvCreateMemStorage() );
@@ -2034,7 +2034,7 @@ CvBoost::write( cv::FileStorage& fs, const char* name ) const
     fs.startWriteStruct( name, cv::FileNode::MAP, CV_TYPE_NAME_ML_BOOSTING );
 
     if( !weak )
-        CV_ERROR( CV_StsBadArg, "The classifier has not been trained yet" );
+        CV_ERROR( cv::Error::StsBadArg, "The classifier has not been trained yet" );
 
     write_params( fs );
     fs.startWriteStruct( "trees", cv::FileNode::SEQ );
diff --git a/apps/traincascade/old_ml_data.cpp b/apps/traincascade/old_ml_data.cpp
index d221dcbf0f7b..3b0712def135 100644
--- a/apps/traincascade/old_ml_data.cpp
+++ b/apps/traincascade/old_ml_data.cpp
@@ -284,7 +284,7 @@ const CvMat* CvMLData::get_missing() const
     __BEGIN__;
 
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
 
     __END__;
 
@@ -331,7 +331,7 @@ void CvMLData::set_delimiter(char ch)
     __BEGIN__;
 
     if (ch == miss_ch /*|| ch == flt_separator*/)
-        CV_ERROR(CV_StsBadArg, "delimited, miss_character and flt_separator must be different");
+        CV_ERROR(cv::Error::StsBadArg, "delimited, miss_character and flt_separator must be different");
 
     delimiter = ch;
 
@@ -349,7 +349,7 @@ void CvMLData::set_miss_ch(char ch)
     __BEGIN__;
 
     if (ch == delimiter/* || ch == flt_separator*/)
-        CV_ERROR(CV_StsBadArg, "delimited, miss_character and flt_separator must be different");
+        CV_ERROR(cv::Error::StsBadArg, "delimited, miss_character and flt_separator must be different");
 
     miss_ch = ch;
 
@@ -367,10 +367,10 @@ void CvMLData::set_response_idx( int idx )
     __BEGIN__;
 
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
 
     if ( idx >= values->cols)
-        CV_ERROR( CV_StsBadArg, "idx value is not correct" );
+        CV_ERROR( cv::Error::StsBadArg, "idx value is not correct" );
 
     if ( response_idx >= 0 )
         chahge_var_idx( response_idx, true );
@@ -387,7 +387,7 @@ int CvMLData::get_response_idx() const
     __BEGIN__;
 
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
      __END__;
     return response_idx;
 }
@@ -400,19 +400,19 @@ void CvMLData::change_var_type( int var_idx, int type )
     int var_count = 0;
 
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
 
      var_count = values->cols;
 
     if ( var_idx < 0 || var_idx >= var_count)
-        CV_ERROR( CV_StsBadArg, "var_idx is not correct" );
+        CV_ERROR( cv::Error::StsBadArg, "var_idx is not correct" );
 
     if ( type != CV_VAR_ORDERED && type != CV_VAR_CATEGORICAL)
-         CV_ERROR( CV_StsBadArg, "type is not correct" );
+         CV_ERROR( cv::Error::StsBadArg, "type is not correct" );
 
     assert( var_types );
     if ( var_types->data.ptr[var_idx] == CV_VAR_CATEGORICAL && type == CV_VAR_ORDERED)
-        CV_ERROR( CV_StsBadArg, "it`s impossible to assign CV_VAR_ORDERED type to categorical variable" );
+        CV_ERROR( cv::Error::StsBadArg, "it`s impossible to assign CV_VAR_ORDERED type to categorical variable" );
     var_types->data.ptr[var_idx] = (uchar)type;
 
     __END__;
@@ -428,7 +428,7 @@ void CvMLData::set_var_types( const char* str )
     const char* ord = 0, *cat = 0;
     int var_count = 0, set_var_type_count = 0;
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
 
     var_count = values->cols;
 
@@ -437,7 +437,7 @@ void CvMLData::set_var_types( const char* str )
     ord = strstr( str, "ord" );
     cat = strstr( str, "cat" );
     if ( !ord && !cat )
-        CV_ERROR( CV_StsBadArg, "types string is not correct" );
+        CV_ERROR( cv::Error::StsBadArg, "types string is not correct" );
 
     if ( !ord && strlen(cat) == 3 ) // str == "cat"
     {
@@ -455,19 +455,19 @@ void CvMLData::set_var_types( const char* str )
     {
         char* stopstring = NULL;
         if ( ord[3] != '[')
-            CV_ERROR( CV_StsBadArg, "types string is not correct" );
+            CV_ERROR( cv::Error::StsBadArg, "types string is not correct" );
 
         ord += 4; // pass "ord["
         do
         {
             int b1 = (int)strtod( ord, &stopstring );
             if ( *stopstring == 0 || (*stopstring != ',' && *stopstring != ']' && *stopstring != '-') )
-                CV_ERROR( CV_StsBadArg, "types string is not correct" );
+                CV_ERROR( cv::Error::StsBadArg, "types string is not correct" );
             ord = stopstring + 1;
             if ( (stopstring[0] == ',') || (stopstring[0] == ']'))
             {
                 if ( var_types->data.ptr[b1] == CV_VAR_CATEGORICAL)
-                    CV_ERROR( CV_StsBadArg, "it`s impossible to assign CV_VAR_ORDERED type to categorical variable" );
+                    CV_ERROR( cv::Error::StsBadArg, "it`s impossible to assign CV_VAR_ORDERED type to categorical variable" );
                 var_types->data.ptr[b1] = CV_VAR_ORDERED;
                 set_var_type_count++;
             }
@@ -477,39 +477,39 @@ void CvMLData::set_var_types( const char* str )
                 {
                     int b2 = (int)strtod( ord, &stopstring);
                     if ( (*stopstring == 0) || (*stopstring != ',' && *stopstring != ']') )
-                        CV_ERROR( CV_StsBadArg, "types string is not correct" );
+                        CV_ERROR( cv::Error::StsBadArg, "types string is not correct" );
                     ord = stopstring + 1;
                     for (int i = b1; i <= b2; i++)
                     {
                         if ( var_types->data.ptr[i] == CV_VAR_CATEGORICAL)
-                            CV_ERROR( CV_StsBadArg, "it`s impossible to assign CV_VAR_ORDERED type to categorical variable" );
+                            CV_ERROR( cv::Error::StsBadArg, "it`s impossible to assign CV_VAR_ORDERED type to categorical variable" );
                         var_types->data.ptr[i] = CV_VAR_ORDERED;
                     }
                     set_var_type_count += b2 - b1 + 1;
                 }
                 else
-                    CV_ERROR( CV_StsBadArg, "types string is not correct" );
+                    CV_ERROR( cv::Error::StsBadArg, "types string is not correct" );
 
             }
         }
         while (*stopstring != ']');
 
         if ( stopstring[1] != '\0' && stopstring[1] != ',')
-            CV_ERROR( CV_StsBadArg, "types string is not correct" );
+            CV_ERROR( cv::Error::StsBadArg, "types string is not correct" );
     }
 
     if ( cat ) // parse cat str
     {
         char* stopstring = NULL;
         if ( cat[3] != '[')
-            CV_ERROR( CV_StsBadArg, "types string is not correct" );
+            CV_ERROR( cv::Error::StsBadArg, "types string is not correct" );
 
         cat += 4; // pass "cat["
         do
         {
             int b1 = (int)strtod( cat, &stopstring );
             if ( *stopstring == 0 || (*stopstring != ',' && *stopstring != ']' && *stopstring != '-') )
-                CV_ERROR( CV_StsBadArg, "types string is not correct" );
+                CV_ERROR( cv::Error::StsBadArg, "types string is not correct" );
             cat = stopstring + 1;
             if ( (stopstring[0] == ',') || (stopstring[0] == ']'))
             {
@@ -522,25 +522,25 @@ void CvMLData::set_var_types( const char* str )
                 {
                     int b2 = (int)strtod( cat, &stopstring);
                     if ( (*stopstring == 0) || (*stopstring != ',' && *stopstring != ']') )
-                        CV_ERROR( CV_StsBadArg, "types string is not correct" );
+                        CV_ERROR( cv::Error::StsBadArg, "types string is not correct" );
                     cat = stopstring + 1;
                     for (int i = b1; i <= b2; i++)
                         var_types->data.ptr[i] = CV_VAR_CATEGORICAL;
                     set_var_type_count += b2 - b1 + 1;
                 }
                 else
-                    CV_ERROR( CV_StsBadArg, "types string is not correct" );
+                    CV_ERROR( cv::Error::StsBadArg, "types string is not correct" );
 
             }
         }
         while (*stopstring != ']');
 
         if ( stopstring[1] != '\0' && stopstring[1] != ',')
-            CV_ERROR( CV_StsBadArg, "types string is not correct" );
+            CV_ERROR( cv::Error::StsBadArg, "types string is not correct" );
     }
 
     if (set_var_type_count != var_count)
-        CV_ERROR( CV_StsBadArg, "types string is not correct" );
+        CV_ERROR( cv::Error::StsBadArg, "types string is not correct" );
 
      __END__;
 }
@@ -553,7 +553,7 @@ const CvMat* CvMLData::get_var_types()
     uchar *var_types_out_ptr = 0;
     int avcount, vt_size;
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
 
     assert( var_idx_mask );
 
@@ -597,7 +597,7 @@ const CvMat* CvMLData::get_responses()
     int var_count = 0;
 
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
     var_count = values->cols;
 
     if ( response_idx < 0 || response_idx >= var_count )
@@ -621,7 +621,7 @@ void CvMLData::set_train_test_split( const CvTrainTestSplit * spl)
     int sample_count = 0;
 
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
 
     sample_count = values->rows;
 
@@ -631,14 +631,14 @@ void CvMLData::set_train_test_split( const CvTrainTestSplit * spl)
     {
         train_sample_count = spl->train_sample_part.count;
         if (train_sample_count > sample_count)
-            CV_ERROR( CV_StsBadArg, "train samples count is not correct" );
+            CV_ERROR( cv::Error::StsBadArg, "train samples count is not correct" );
         train_sample_count = train_sample_count<=0 ? sample_count : train_sample_count;
     }
     else // dtype.train_sample_part_mode == CV_PORTION
     {
         train_sample_portion = spl->train_sample_part.portion;
         if ( train_sample_portion > 1)
-            CV_ERROR( CV_StsBadArg, "train samples count is not correct" );
+            CV_ERROR( cv::Error::StsBadArg, "train samples count is not correct" );
         train_sample_portion = train_sample_portion <= FLT_EPSILON ||
             1 - train_sample_portion <= FLT_EPSILON ? 1 : train_sample_portion;
         train_sample_count = std::max(1, cvFloor( train_sample_portion * sample_count ));
@@ -680,7 +680,7 @@ const CvMat* CvMLData::get_train_sample_idx() const
     __BEGIN__;
 
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
     __END__;
 
     return train_sample_idx;
@@ -692,7 +692,7 @@ const CvMat* CvMLData::get_test_sample_idx() const
     __BEGIN__;
 
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
     __END__;
 
     return test_sample_idx;
@@ -704,7 +704,7 @@ void CvMLData::mix_train_and_test_idx()
     __BEGIN__;
 
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
     __END__;
 
     if ( !sample_idx)
@@ -731,7 +731,7 @@ const CvMat* CvMLData::get_var_idx()
     int avcount = 0;
 
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
 
     assert( var_idx_mask );
 
@@ -776,12 +776,12 @@ void CvMLData::change_var_idx( int vi, bool state )
     int var_count = 0;
 
     if ( !values )
-        CV_ERROR( CV_StsInternal, "data is empty" );
+        CV_ERROR( cv::Error::StsInternal, "data is empty" );
 
     var_count = values->cols;
 
     if ( vi < 0 || vi >= var_count)
-        CV_ERROR( CV_StsBadArg, "variable index is not correct" );
+        CV_ERROR( cv::Error::StsBadArg, "variable index is not correct" );
 
     assert( var_idx_mask );
     var_idx_mask->data.ptr[vi] = state;
diff --git a/apps/traincascade/old_ml_inner_functions.cpp b/apps/traincascade/old_ml_inner_functions.cpp
index ef81da35486a..719674f4beb9 100644
--- a/apps/traincascade/old_ml_inner_functions.cpp
+++ b/apps/traincascade/old_ml_inner_functions.cpp
@@ -67,7 +67,7 @@ void CvStatModel::save( const char* filename, const char* name ) const
     __BEGIN__;
 
     if( !fs.open( filename, cv::FileStorage::WRITE ))
-        CV_ERROR( CV_StsError, "Could not open the file storage. Check the path and permissions" );
+        CV_ERROR( cv::Error::StsError, "Could not open the file storage. Check the path and permissions" );
 
     write( fs, name ? name : default_model_name );
 
@@ -87,7 +87,7 @@ void CvStatModel::load( const char* filename, const char* name )
     cv::FileNode model_node;
 
     if( !fs.open(filename, cv::FileStorage::READ) )
-        CV_ERROR( CV_StsError, "Could not open the file storage. Check the path and permissions" );
+        CV_ERROR( cv::Error::StsError, "Could not open the file storage. Check the path and permissions" );
 
     if( name )
         model_node = fs[ name ];
@@ -107,12 +107,12 @@ void CvStatModel::load( const char* filename, const char* name )
 
 void CvStatModel::write( cv::FileStorage&, const char* ) const
 {
-    OPENCV_ERROR( CV_StsNotImplemented, "CvStatModel::write", "" );
+    OPENCV_ERROR( cv::Error::StsNotImplemented, "CvStatModel::write", "" );
 }
 
 void CvStatModel::read( const cv::FileNode& )
 {
-    OPENCV_ERROR( CV_StsNotImplemented, "CvStatModel::read", "" );
+    OPENCV_ERROR( cv::Error::StsNotImplemented, "CvStatModel::read", "" );
 }
 
 CvMat* icvGenerateRandomClusterCenters ( int seed, const CvMat* data,
@@ -134,7 +134,7 @@ CvMat* icvGenerateRandomClusterCenters ( int seed, const CvMat* data,
     {
         if( _centers && !ICV_IS_MAT_OF_TYPE (_centers, CV_32FC1) )
         {
-            CV_ERROR(CV_StsBadArg,"");
+            CV_ERROR(cv::Error::StsBadArg,"");
         }
         else if( !_centers )
             CV_CALL(centers = cvCreateMat (num_of_clusters, dim, CV_32FC1));
@@ -143,16 +143,16 @@ CvMat* icvGenerateRandomClusterCenters ( int seed, const CvMat* data,
     {
         if( _centers && !ICV_IS_MAT_OF_TYPE (_centers, CV_64FC1) )
         {
-            CV_ERROR(CV_StsBadArg,"");
+            CV_ERROR(cv::Error::StsBadArg,"");
         }
         else if( !_centers )
             CV_CALL(centers = cvCreateMat (num_of_clusters, dim, CV_64FC1));
     }
     else
-        CV_ERROR (CV_StsBadArg,"");
+        CV_ERROR (cv::Error::StsBadArg,"");
 
     if( num_of_clusters < 1 )
-        CV_ERROR (CV_StsBadArg,"");
+        CV_ERROR (cv::Error::StsBadArg,"");
 
     rng = cvRNG(seed);
     for (i = 0; i < dim; i++)
@@ -208,10 +208,10 @@ cvPreprocessIndexArray( const CvMat* idx_arr, int data_arr_size, bool check_for_
     int* dsti;
 
     if( !CV_IS_MAT(idx_arr) )
-        CV_ERROR( CV_StsBadArg, "Invalid index array" );
+        CV_ERROR( cv::Error::StsBadArg, "Invalid index array" );
 
     if( idx_arr->rows != 1 && idx_arr->cols != 1 )
-        CV_ERROR( CV_StsBadSize, "the index array must be 1-dimensional" );
+        CV_ERROR( cv::Error::StsBadSize, "the index array must be 1-dimensional" );
 
     idx_total = idx_arr->rows + idx_arr->cols - 1;
     srcb = idx_arr->data.ptr;
@@ -227,20 +227,20 @@ cvPreprocessIndexArray( const CvMat* idx_arr, int data_arr_size, bool check_for_
         // idx_arr is array of 1's and 0's -
         // i.e. it is a mask of the selected components
         if( idx_total != data_arr_size )
-            CV_ERROR( CV_StsUnmatchedSizes,
+            CV_ERROR( cv::Error::StsUnmatchedSizes,
             "Component mask should contain as many elements as the total number of input variables" );
 
         for( i = 0; i < idx_total; i++ )
             idx_selected += srcb[i*step] != 0;
 
         if( idx_selected == 0 )
-            CV_ERROR( CV_StsOutOfRange, "No components/input_variables is selected!" );
+            CV_ERROR( cv::Error::StsOutOfRange, "No components/input_variables is selected!" );
 
         break;
     case CV_32SC1:
         // idx_arr is array of integer indices of selected components
         if( idx_total > data_arr_size )
-            CV_ERROR( CV_StsOutOfRange,
+            CV_ERROR( cv::Error::StsOutOfRange,
             "index array may not contain more elements than the total number of input variables" );
         idx_selected = idx_total;
         // check if sorted already
@@ -256,7 +256,7 @@ cvPreprocessIndexArray( const CvMat* idx_arr, int data_arr_size, bool check_for_
         }
         break;
     default:
-        CV_ERROR( CV_StsUnsupportedFormat, "Unsupported index array data type "
+        CV_ERROR( cv::Error::StsUnsupportedFormat, "Unsupported index array data type "
                                            "(it should be 8uC1, 8sC1 or 32sC1)" );
     }
 
@@ -278,13 +278,13 @@ cvPreprocessIndexArray( const CvMat* idx_arr, int data_arr_size, bool check_for_
             qsort( dsti, idx_total, sizeof(dsti[0]), icvCmpIntegers );
 
         if( dsti[0] < 0 || dsti[idx_total-1] >= data_arr_size )
-            CV_ERROR( CV_StsOutOfRange, "the index array elements are out of range" );
+            CV_ERROR( cv::Error::StsOutOfRange, "the index array elements are out of range" );
 
         if( check_for_duplicates )
         {
             for( i = 1; i < idx_total; i++ )
                 if( dsti[i] <= dsti[i-1] )
-                    CV_ERROR( CV_StsBadArg, "There are duplicated index array elements" );
+                    CV_ERROR( cv::Error::StsBadArg, "There are duplicated index array elements" );
         }
     }
 
@@ -315,19 +315,19 @@ cvPreprocessVarType( const CvMat* var_type, const CvMat* var_idx,
     uchar* dst;
 
     if( !CV_IS_MAT(var_type) )
-        CV_ERROR( var_type ? CV_StsBadArg : CV_StsNullPtr, "Invalid or absent var_type array" );
+        CV_ERROR( var_type ? cv::Error::StsBadArg : cv::Error::StsNullPtr, "Invalid or absent var_type array" );
 
     if( var_type->rows != 1 && var_type->cols != 1 )
-        CV_ERROR( CV_StsBadSize, "var_type array must be 1-dimensional" );
+        CV_ERROR( cv::Error::StsBadSize, "var_type array must be 1-dimensional" );
 
     if( !CV_IS_MASK_ARR(var_type))
-        CV_ERROR( CV_StsUnsupportedFormat, "type mask must be 8uC1 or 8sC1 array" );
+        CV_ERROR( cv::Error::StsUnsupportedFormat, "type mask must be 8uC1 or 8sC1 array" );
 
     tm_size = var_type->rows + var_type->cols - 1;
     tm_step = var_type->rows == 1 ? 1 : var_type->step/CV_ELEM_SIZE(var_type->type);
 
     if( /*tm_size != var_count &&*/ tm_size != var_count + 1 )
-        CV_ERROR( CV_StsBadArg,
+        CV_ERROR( cv::Error::StsBadArg,
         "type mask must be of <input var count> + 1 size" );
 
     if( response_type && tm_size > var_count )
@@ -337,9 +337,9 @@ cvPreprocessVarType( const CvMat* var_type, const CvMat* var_idx,
     {
         if( !CV_IS_MAT(var_idx) || CV_MAT_TYPE(var_idx->type) != CV_32SC1 ||
             (var_idx->rows != 1 && var_idx->cols != 1) || !CV_IS_MAT_CONT(var_idx->type) )
-            CV_ERROR( CV_StsBadArg, "var index array should be continuous 1-dimensional integer vector" );
+            CV_ERROR( cv::Error::StsBadArg, "var index array should be continuous 1-dimensional integer vector" );
         if( var_idx->rows + var_idx->cols - 1 > var_count )
-            CV_ERROR( CV_StsBadSize, "var index array is too large" );
+            CV_ERROR( cv::Error::StsBadSize, "var index array is too large" );
         //map = var_idx->data.i;
         var_count = var_idx->rows + var_idx->cols - 1;
     }
@@ -376,18 +376,18 @@ cvPreprocessOrderedResponses( const CvMat* responses, const CvMat* sample_idx, i
     int sample_count = sample_all;
 
     if( !CV_IS_MAT(responses) )
-        CV_ERROR( CV_StsBadArg, "Invalid response array" );
+        CV_ERROR( cv::Error::StsBadArg, "Invalid response array" );
 
     if( responses->rows != 1 && responses->cols != 1 )
-        CV_ERROR( CV_StsBadSize, "Response array must be 1-dimensional" );
+        CV_ERROR( cv::Error::StsBadSize, "Response array must be 1-dimensional" );
 
     if( responses->rows + responses->cols - 1 != sample_count )
-        CV_ERROR( CV_StsUnmatchedSizes,
+        CV_ERROR( cv::Error::StsUnmatchedSizes,
         "Response array must contain as many elements as the total number of samples" );
 
     r_type = CV_MAT_TYPE(responses->type);
     if( r_type != CV_32FC1 && r_type != CV_32SC1 )
-        CV_ERROR( CV_StsUnsupportedFormat, "Unsupported response type" );
+        CV_ERROR( cv::Error::StsUnsupportedFormat, "Unsupported response type" );
 
     r_step = responses->step ? responses->step / CV_ELEM_SIZE(responses->type) : 1;
 
@@ -401,9 +401,9 @@ cvPreprocessOrderedResponses( const CvMat* responses, const CvMat* sample_idx, i
     {
         if( !CV_IS_MAT(sample_idx) || CV_MAT_TYPE(sample_idx->type) != CV_32SC1 ||
             (sample_idx->rows != 1 && sample_idx->cols != 1) || !CV_IS_MAT_CONT(sample_idx->type) )
-            CV_ERROR( CV_StsBadArg, "sample index array should be continuous 1-dimensional integer vector" );
+            CV_ERROR( cv::Error::StsBadArg, "sample index array should be continuous 1-dimensional integer vector" );
         if( sample_idx->rows + sample_idx->cols - 1 > sample_count )
-            CV_ERROR( CV_StsBadSize, "sample index array is too large" );
+            CV_ERROR( cv::Error::StsBadSize, "sample index array is too large" );
         map = sample_idx->data.i;
         sample_count = sample_idx->rows + sample_idx->cols - 1;
     }
@@ -466,18 +466,18 @@ cvPreprocessCategoricalResponses( const CvMat* responses,
     int sample_count = sample_all;
 
     if( !CV_IS_MAT(responses) )
-        CV_ERROR( CV_StsBadArg, "Invalid response array" );
+        CV_ERROR( cv::Error::StsBadArg, "Invalid response array" );
 
     if( responses->rows != 1 && responses->cols != 1 )
-        CV_ERROR( CV_StsBadSize, "Response array must be 1-dimensional" );
+        CV_ERROR( cv::Error::StsBadSize, "Response array must be 1-dimensional" );
 
     if( responses->rows + responses->cols - 1 != sample_count )
-        CV_ERROR( CV_StsUnmatchedSizes,
+        CV_ERROR( cv::Error::StsUnmatchedSizes,
         "Response array must contain as many elements as the total number of samples" );
 
     r_type = CV_MAT_TYPE(responses->type);
     if( r_type != CV_32FC1 && r_type != CV_32SC1 )
-        CV_ERROR( CV_StsUnsupportedFormat, "Unsupported response type" );
+        CV_ERROR( cv::Error::StsUnsupportedFormat, "Unsupported response type" );
 
     r_step = responses->rows == 1 ? 1 : responses->step / CV_ELEM_SIZE(responses->type);
 
@@ -485,9 +485,9 @@ cvPreprocessCategoricalResponses( const CvMat* responses,
     {
         if( !CV_IS_MAT(sample_idx) || CV_MAT_TYPE(sample_idx->type) != CV_32SC1 ||
             (sample_idx->rows != 1 && sample_idx->cols != 1) || !CV_IS_MAT_CONT(sample_idx->type) )
-            CV_ERROR( CV_StsBadArg, "sample index array should be continuous 1-dimensional integer vector" );
+            CV_ERROR( cv::Error::StsBadArg, "sample index array should be continuous 1-dimensional integer vector" );
         if( sample_idx->rows + sample_idx->cols - 1 > sample_count )
-            CV_ERROR( CV_StsBadSize, "sample index array is too large" );
+            CV_ERROR( cv::Error::StsBadSize, "sample index array is too large" );
         map = sample_idx->data.i;
         sample_count = sample_idx->rows + sample_idx->cols - 1;
     }
@@ -495,7 +495,7 @@ cvPreprocessCategoricalResponses( const CvMat* responses,
     CV_CALL( out_responses = cvCreateMat( 1, sample_count, CV_32SC1 ));
 
     if( !out_response_map )
-        CV_ERROR( CV_StsNullPtr, "out_response_map pointer is NULL" );
+        CV_ERROR( cv::Error::StsNullPtr, "out_response_map pointer is NULL" );
 
     CV_CALL( response_ptr = (int**)cvAlloc( sample_count*sizeof(response_ptr[0])));
 
@@ -517,7 +517,7 @@ cvPreprocessCategoricalResponses( const CvMat* responses,
             {
                 char buf[100];
                 snprintf( buf, sizeof(buf), "response #%d is not integral", idx );
-                CV_ERROR( CV_StsBadArg, buf );
+                CV_ERROR( cv::Error::StsBadArg, buf );
             }
             dst[i] = ri;
         }
@@ -531,7 +531,7 @@ cvPreprocessCategoricalResponses( const CvMat* responses,
         cls_count += *response_ptr[i] != *response_ptr[i-1];
 
     if( cls_count < 2 )
-        CV_ERROR( CV_StsBadArg, "There is only a single class" );
+        CV_ERROR( cv::Error::StsBadArg, "There is only a single class" );
 
     CV_CALL( *out_response_map = cvCreateMat( 1, cls_count, CV_32SC1 ));
 
@@ -588,7 +588,7 @@ cvGetTrainSamples( const CvMat* train_data, int tflag,
     const int *s_idx, *v_idx;
 
     if( !CV_IS_MAT(train_data) )
-        CV_ERROR( CV_StsBadArg, "Invalid or NULL training data matrix" );
+        CV_ERROR( cv::Error::StsBadArg, "Invalid or NULL training data matrix" );
 
     var_count = var_idx ? var_idx->cols + var_idx->rows - 1 :
                 tflag == CV_ROW_SAMPLE ? train_data->cols : train_data->rows;
@@ -659,18 +659,18 @@ cvCheckTrainData( const CvMat* train_data, int tflag,
 
     // check parameter types and sizes
     if( !CV_IS_MAT(train_data) || CV_MAT_TYPE(train_data->type) != CV_32FC1 )
-        CV_ERROR( CV_StsBadArg, "train data must be floating-point matrix" );
+        CV_ERROR( cv::Error::StsBadArg, "train data must be floating-point matrix" );
 
     if( missing_mask )
     {
         if( !CV_IS_MAT(missing_mask) || !CV_IS_MASK_ARR(missing_mask) ||
             !CV_ARE_SIZES_EQ(train_data, missing_mask) )
-            CV_ERROR( CV_StsBadArg,
+            CV_ERROR( cv::Error::StsBadArg,
             "missing value mask must be 8-bit matrix of the same size as training data" );
     }
 
     if( tflag != CV_ROW_SAMPLE && tflag != CV_COL_SAMPLE )
-        CV_ERROR( CV_StsBadArg,
+        CV_ERROR( cv::Error::StsBadArg,
         "Unknown training data layout (must be CV_ROW_SAMPLE or CV_COL_SAMPLE)" );
 
     if( var_all )
@@ -736,7 +736,7 @@ cvPrepareTrainData( const char* /*funcname*/,
     __BEGIN__;
 
     if( !out_train_samples )
-        CV_ERROR( CV_StsBadArg, "output pointer to train samples is NULL" );
+        CV_ERROR( cv::Error::StsBadArg, "output pointer to train samples is NULL" );
 
     CV_CALL( cvCheckTrainData( train_data, tflag, 0, &var_all, &sample_all ));
 
@@ -748,7 +748,7 @@ cvPrepareTrainData( const char* /*funcname*/,
     if( responses )
     {
         if( !out_responses )
-            CV_ERROR( CV_StsNullPtr, "output response pointer is NULL" );
+            CV_ERROR( cv::Error::StsNullPtr, "output response pointer is NULL" );
 
         if( response_type == CV_VAR_NUMERICAL )
         {
@@ -841,10 +841,10 @@ cvSortSamplesByClasses( const float** samples, const CvMat* classes,
     int i, k = 0, sample_count;
 
     if( !samples || !classes || !class_ranges )
-        CV_ERROR( CV_StsNullPtr, "INTERNAL ERROR: some of the args are NULL pointers" );
+        CV_ERROR( cv::Error::StsNullPtr, "INTERNAL ERROR: some of the args are NULL pointers" );
 
     if( classes->rows != 1 || CV_MAT_TYPE(classes->type) != CV_32SC1 )
-        CV_ERROR( CV_StsBadArg, "classes array must be a single row of integers" );
+        CV_ERROR( cv::Error::StsBadArg, "classes array must be a single row of integers" );
 
     sample_count = classes->cols;
     CV_CALL( pairs = (CvSampleResponsePair*)cvAlloc( (sample_count+1)*sizeof(pairs[0])));
@@ -901,45 +901,45 @@ cvPreparePredictData( const CvArr* _sample, int dims_all,
     int vec_size;
 
     if( !is_sparse && !CV_IS_MAT(sample) )
-        CV_ERROR( !sample ? CV_StsNullPtr : CV_StsBadArg, "The sample is not a valid vector" );
+        CV_ERROR( !sample ? cv::Error::StsNullPtr : cv::Error::StsBadArg, "The sample is not a valid vector" );
 
     if( cvGetElemType( sample ) != CV_32FC1 )
-        CV_ERROR( CV_StsUnsupportedFormat, "Input sample must have 32fC1 type" );
+        CV_ERROR( cv::Error::StsUnsupportedFormat, "Input sample must have 32fC1 type" );
 
     CV_CALL( d = cvGetDims( sample, sizes ));
 
     if( !((is_sparse && d == 1) || (!is_sparse && d == 2 && (sample->rows == 1 || sample->cols == 1))) )
-        CV_ERROR( CV_StsBadSize, "Input sample must be 1-dimensional vector" );
+        CV_ERROR( cv::Error::StsBadSize, "Input sample must be 1-dimensional vector" );
 
     if( d == 1 )
         sizes[1] = 1;
 
     if( sizes[0] + sizes[1] - 1 != dims_all )
-        CV_ERROR( CV_StsUnmatchedSizes,
+        CV_ERROR( cv::Error::StsUnmatchedSizes,
         "The sample size is different from what has been used for training" );
 
     if( !_row_sample )
-        CV_ERROR( CV_StsNullPtr, "INTERNAL ERROR: The row_sample pointer is NULL" );
+        CV_ERROR( cv::Error::StsNullPtr, "INTERNAL ERROR: The row_sample pointer is NULL" );
 
     if( comp_idx && (!CV_IS_MAT(comp_idx) || comp_idx->rows != 1 ||
         CV_MAT_TYPE(comp_idx->type) != CV_32SC1) )
-        CV_ERROR( CV_StsBadArg, "INTERNAL ERROR: invalid comp_idx" );
+        CV_ERROR( cv::Error::StsBadArg, "INTERNAL ERROR: invalid comp_idx" );
 
     dims_selected = comp_idx ? comp_idx->cols : dims_all;
 
     if( prob )
     {
         if( !CV_IS_MAT(prob) )
-            CV_ERROR( CV_StsBadArg, "The output matrix of probabilities is invalid" );
+            CV_ERROR( cv::Error::StsBadArg, "The output matrix of probabilities is invalid" );
 
         if( (prob->rows != 1 && prob->cols != 1) ||
             (CV_MAT_TYPE(prob->type) != CV_32FC1 &&
             CV_MAT_TYPE(prob->type) != CV_64FC1) )
-            CV_ERROR( CV_StsBadSize,
+            CV_ERROR( cv::Error::StsBadSize,
             "The matrix of probabilities must be 1-dimensional vector of 32fC1 type" );
 
         if( prob->rows + prob->cols - 1 != class_count )
-            CV_ERROR( CV_StsUnmatchedSizes,
+            CV_ERROR( cv::Error::StsUnmatchedSizes,
             "The vector of probabilities must contain as many elements as "
             "the number of classes in the training set" );
     }
@@ -1071,7 +1071,7 @@ icvConvertDataToSparse( const uchar* src, int src_step, int src_type,
     dst_type = CV_MAT_TYPE(dst_type);
 
     if( CV_MAT_CN(src_type) != 1 || CV_MAT_CN(dst_type) != 1 )
-        CV_ERROR( CV_StsUnsupportedFormat, "The function supports only single-channel arrays" );
+        CV_ERROR( cv::Error::StsUnsupportedFormat, "The function supports only single-channel arrays" );
 
     if( src_step == 0 )
         src_step = CV_ELEM_SIZE(src_type);
@@ -1134,7 +1134,7 @@ icvConvertDataToSparse( const uchar* src, int src_step, int src_type,
                     ((float*)_dst)[j] = (float)((double*)src)[j];
         }
     else
-        CV_ERROR( CV_StsUnsupportedFormat, "Unsupported combination of input and output vectors" );
+        CV_ERROR( cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output vectors" );
 
     __END__;
 }
@@ -1154,15 +1154,15 @@ cvWritebackLabels( const CvMat* labels, CvMat* dst_labels,
     int samples_selected = samples_all, dims_selected = dims_all;
 
     if( dst_labels && !CV_IS_MAT(dst_labels) )
-        CV_ERROR( CV_StsBadArg, "Array of output labels is not a valid matrix" );
+        CV_ERROR( cv::Error::StsBadArg, "Array of output labels is not a valid matrix" );
 
     if( dst_centers )
         if( !ICV_IS_MAT_OF_TYPE(dst_centers, CV_32FC1) &&
             !ICV_IS_MAT_OF_TYPE(dst_centers, CV_64FC1) )
-            CV_ERROR( CV_StsBadArg, "Array of cluster centers is not a valid matrix" );
+            CV_ERROR( cv::Error::StsBadArg, "Array of cluster centers is not a valid matrix" );
 
     if( dst_probs && !CV_IS_MAT(dst_probs) )
-        CV_ERROR( CV_StsBadArg, "Probability matrix is not valid" );
+        CV_ERROR( cv::Error::StsBadArg, "Probability matrix is not valid" );
 
     if( sample_idx )
     {
@@ -1179,15 +1179,15 @@ cvWritebackLabels( const CvMat* labels, CvMat* dst_labels,
     if( dst_labels && (!labels || labels->data.ptr != dst_labels->data.ptr) )
     {
         if( !labels )
-            CV_ERROR( CV_StsNullPtr, "NULL labels" );
+            CV_ERROR( cv::Error::StsNullPtr, "NULL labels" );
 
         CV_ASSERT( labels->rows == 1 );
 
         if( dst_labels->rows != 1 && dst_labels->cols != 1 )
-            CV_ERROR( CV_StsBadSize, "Array of output labels should be 1d vector" );
+            CV_ERROR( cv::Error::StsBadSize, "Array of output labels should be 1d vector" );
 
         if( dst_labels->rows + dst_labels->cols - 1 != samples_all )
-            CV_ERROR( CV_StsUnmatchedSizes,
+            CV_ERROR( cv::Error::StsUnmatchedSizes,
             "Size of vector of output labels is not equal to the total number of input samples" );
 
         CV_ASSERT( labels->cols == samples_selected );
@@ -1202,13 +1202,13 @@ cvWritebackLabels( const CvMat* labels, CvMat* dst_labels,
         int i;
 
         if( !centers )
-            CV_ERROR( CV_StsNullPtr, "NULL centers" );
+            CV_ERROR( cv::Error::StsNullPtr, "NULL centers" );
 
         if( centers->rows != dst_centers->rows )
-            CV_ERROR( CV_StsUnmatchedSizes, "Invalid number of rows in matrix of output centers" );
+            CV_ERROR( cv::Error::StsUnmatchedSizes, "Invalid number of rows in matrix of output centers" );
 
         if( dst_centers->cols != dims_all )
-            CV_ERROR( CV_StsUnmatchedSizes,
+            CV_ERROR( cv::Error::StsUnmatchedSizes,
             "Number of columns in matrix of output centers is "
             "not equal to the total number of components in the input samples" );
 
@@ -1223,13 +1223,13 @@ cvWritebackLabels( const CvMat* labels, CvMat* dst_labels,
     if( dst_probs && (!probs || probs->data.ptr != dst_probs->data.ptr) )
     {
         if( !probs )
-            CV_ERROR( CV_StsNullPtr, "NULL probs" );
+            CV_ERROR( cv::Error::StsNullPtr, "NULL probs" );
 
         if( probs->cols != dst_probs->cols )
-            CV_ERROR( CV_StsUnmatchedSizes, "Invalid number of columns in output probability matrix" );
+            CV_ERROR( cv::Error::StsUnmatchedSizes, "Invalid number of columns in output probability matrix" );
 
         if( dst_probs->rows != samples_all )
-            CV_ERROR( CV_StsUnmatchedSizes,
+            CV_ERROR( cv::Error::StsUnmatchedSizes,
             "Number of rows in output probability matrix is "
             "not equal to the total number of input samples" );
 
@@ -1273,29 +1273,29 @@ cvStatModelMultiPredict( const CvStatModel* stat_model,
     CvMat* probs1 = probs ? &probs_part : 0;
 
     if( !CV_IS_STAT_MODEL(stat_model) )
-        CV_ERROR( !stat_model ? CV_StsNullPtr : CV_StsBadArg, "Invalid statistical model" );
+        CV_ERROR( !stat_model ? cv::Error::StsNullPtr : cv::Error::StsBadArg, "Invalid statistical model" );
 
     if( !stat_model->predict )
-        CV_ERROR( CV_StsNotImplemented, "There is no \"predict\" method" );
+        CV_ERROR( cv::Error::StsNotImplemented, "There is no \"predict\" method" );
 
     if( !predict_input || !predict_output )
-        CV_ERROR( CV_StsNullPtr, "NULL input or output matrices" );
+        CV_ERROR( cv::Error::StsNullPtr, "NULL input or output matrices" );
 
     if( !is_sparse && !CV_IS_MAT(predict_input) )
-        CV_ERROR( CV_StsBadArg, "predict_input should be a matrix or a sparse matrix" );
+        CV_ERROR( cv::Error::StsBadArg, "predict_input should be a matrix or a sparse matrix" );
 
     if( !CV_IS_MAT(predict_output) )
-        CV_ERROR( CV_StsBadArg, "predict_output should be a matrix" );
+        CV_ERROR( cv::Error::StsBadArg, "predict_output should be a matrix" );
 
     type = cvGetElemType( predict_input );
     if( type != CV_32FC1 ||
         (CV_MAT_TYPE(predict_output->type) != CV_32FC1 &&
          CV_MAT_TYPE(predict_output->type) != CV_32SC1 ))
-         CV_ERROR( CV_StsUnsupportedFormat, "The input or output matrix has unsupported format" );
+         CV_ERROR( cv::Error::StsUnsupportedFormat, "The input or output matrix has unsupported format" );
 
     CV_CALL( d = cvGetDims( predict_input, sizes ));
     if( d > 2 )
-        CV_ERROR( CV_StsBadSize, "The input matrix should be 1- or 2-dimensional" );
+        CV_ERROR( cv::Error::StsBadSize, "The input matrix should be 1- or 2-dimensional" );
 
     if( !tflag )
     {
@@ -1311,30 +1311,30 @@ cvStatModelMultiPredict( const CvStatModel* stat_model,
     if( sample_idx )
     {
         if( !CV_IS_MAT(sample_idx) )
-            CV_ERROR( CV_StsBadArg, "Invalid sample_idx matrix" );
+            CV_ERROR( cv::Error::StsBadArg, "Invalid sample_idx matrix" );
 
         if( sample_idx->cols != 1 && sample_idx->rows != 1 )
-            CV_ERROR( CV_StsBadSize, "sample_idx must be 1-dimensional matrix" );
+            CV_ERROR( cv::Error::StsBadSize, "sample_idx must be 1-dimensional matrix" );
 
         samples_selected = sample_idx->rows + sample_idx->cols - 1;
 
         if( CV_MAT_TYPE(sample_idx->type) == CV_32SC1 )
         {
             if( samples_selected > samples_all )
-                CV_ERROR( CV_StsBadSize, "sample_idx is too large vector" );
+                CV_ERROR( cv::Error::StsBadSize, "sample_idx is too large vector" );
         }
         else if( samples_selected != samples_all )
-            CV_ERROR( CV_StsUnmatchedSizes, "sample_idx has incorrect size" );
+            CV_ERROR( cv::Error::StsUnmatchedSizes, "sample_idx has incorrect size" );
 
         sample_idx_step = sample_idx->step ?
             sample_idx->step / CV_ELEM_SIZE(sample_idx->type) : 1;
     }
 
     if( predict_output->rows != 1 && predict_output->cols != 1 )
-        CV_ERROR( CV_StsBadSize, "predict_output should be a 1-dimensional matrix" );
+        CV_ERROR( cv::Error::StsBadSize, "predict_output should be a 1-dimensional matrix" );
 
     if( predict_output->rows + predict_output->cols - 1 != samples_all )
-        CV_ERROR( CV_StsUnmatchedSizes, "predict_output and predict_input have uncoordinated sizes" );
+        CV_ERROR( cv::Error::StsUnmatchedSizes, "predict_output and predict_input have uncoordinated sizes" );
 
     predict_output_step = predict_output->step ?
         predict_output->step / CV_ELEM_SIZE(predict_output->type) : 1;
@@ -1342,14 +1342,14 @@ cvStatModelMultiPredict( const CvStatModel* stat_model,
     if( probs )
     {
         if( !CV_IS_MAT(probs) )
-            CV_ERROR( CV_StsBadArg, "Invalid matrix of probabilities" );
+            CV_ERROR( cv::Error::StsBadArg, "Invalid matrix of probabilities" );
 
         if( probs->rows != samples_all )
-            CV_ERROR( CV_StsUnmatchedSizes,
+            CV_ERROR( cv::Error::StsUnmatchedSizes,
             "matrix of probabilities must have as many rows as the total number of samples" );
 
         if( CV_MAT_TYPE(probs->type) != CV_32FC1 )
-            CV_ERROR( CV_StsUnsupportedFormat, "matrix of probabilities must have 32fC1 type" );
+            CV_ERROR( cv::Error::StsUnsupportedFormat, "matrix of probabilities must have 32fC1 type" );
     }
 
     if( is_sparse )
@@ -1414,7 +1414,7 @@ cvStatModelMultiPredict( const CvStatModel* stat_model,
             {
                 idx = sample_idx->data.i[i*sample_idx_step];
                 if( (unsigned)idx >= (unsigned)samples_all )
-                    CV_ERROR( CV_StsOutOfRange, "Some of sample_idx elements are out of range" );
+                    CV_ERROR( cv::Error::StsOutOfRange, "Some of sample_idx elements are out of range" );
             }
             else if( CV_MAT_TYPE(sample_idx->type) == CV_8UC1 &&
                      sample_idx->data.ptr[i*sample_idx_step] == 0 )
@@ -1494,7 +1494,7 @@ void cvCombineResponseMaps (CvMat*  _responses,
         (!ICV_IS_MAT_OF_TYPE (old_response_map, CV_32SC1)) ||
         (!ICV_IS_MAT_OF_TYPE (new_response_map, CV_32SC1)))
     {
-        CV_ERROR (CV_StsBadArg, "Some of input arguments is not the CvMat")
+        CV_ERROR (cv::Error::StsBadArg, "Some of input arguments is not the CvMat")
     }
 
 // Prepare sorted responses.
diff --git a/apps/traincascade/old_ml_precomp.hpp b/apps/traincascade/old_ml_precomp.hpp
index 6702e5b59f5b..592739c65d60 100644
--- a/apps/traincascade/old_ml_precomp.hpp
+++ b/apps/traincascade/old_ml_precomp.hpp
@@ -70,7 +70,7 @@
 /* Convert matrix to vector */
 #define ICV_MAT2VEC( mat, vdata, vstep, num )      \
     if( MIN( (mat).rows, (mat).cols ) != 1 )       \
-        CV_ERROR( CV_StsBadArg, "" );              \
+        CV_ERROR( cv::Error::StsBadArg, "" );              \
     (vdata) = ((mat).data.ptr);                    \
     if( (mat).rows == 1 )                          \
     {                                              \
@@ -142,7 +142,7 @@
 #define ICV_TRAIN_DATA_REQUIRED( param, flags )                                     \
     if( !ICV_IS_MAT_OF_TYPE( (param), CV_32FC1 ) )                                  \
     {                                                                               \
-        CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );                   \
+        CV_ERROR( cv::Error::StsBadArg, "Invalid " #param " parameter" );                   \
     }                                                                               \
     else                                                                            \
     {                                                                               \
@@ -154,21 +154,21 @@
 #define ICV_TRAIN_CLASSES_REQUIRED( param )                                         \
     if( !ICV_IS_MAT_OF_TYPE( (param), CV_32FC1 ) )                                  \
     {                                                                               \
-        CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );                   \
+        CV_ERROR( cv::Error::StsBadArg, "Invalid " #param " parameter" );                   \
     }                                                                               \
     else                                                                            \
     {                                                                               \
         ICV_MAT2VEC( *(param), classes, clstep, ncl );                              \
         if( m != ncl )                                                              \
         {                                                                           \
-            CV_ERROR( CV_StsBadArg, "Unmatched sizes" );                            \
+            CV_ERROR( cv::Error::StsBadArg, "Unmatched sizes" );                            \
         }                                                                           \
     }
 
 #define ICV_ARG_NULL( param )                                                       \
     if( (param) != NULL )                                                           \
     {                                                                               \
-        CV_ERROR( CV_StsBadArg, #param " parameter must be NULL" );                 \
+        CV_ERROR( cv::Error::StsBadArg, #param " parameter must be NULL" );                 \
     }
 
 #define ICV_MISSED_MEASUREMENTS_OPTIONAL( param, flags )                            \
@@ -176,14 +176,14 @@
     {                                                                               \
         if( !ICV_IS_MAT_OF_TYPE( param, CV_8UC1 ) )                                 \
         {                                                                           \
-            CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );               \
+            CV_ERROR( cv::Error::StsBadArg, "Invalid " #param " parameter" );               \
         }                                                                           \
         else                                                                        \
         {                                                                           \
             ICV_RAWDATA( *(param), (flags), missed, msstep, mcstep, mm, mn );       \
             if( mm != m || mn != n )                                                \
             {                                                                       \
-                CV_ERROR( CV_StsBadArg, "Unmatched sizes" );                        \
+                CV_ERROR( cv::Error::StsBadArg, "Unmatched sizes" );                        \
             }                                                                       \
         }                                                                           \
     }
@@ -193,13 +193,13 @@
     {                                                                               \
         if( !ICV_IS_MAT_OF_TYPE( param, CV_32SC1 ) )                                \
         {                                                                           \
-            CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );               \
+            CV_ERROR( cv::Error::StsBadArg, "Invalid " #param " parameter" );               \
         }                                                                           \
         else                                                                        \
         {                                                                           \
             ICV_MAT2VEC( *(param), cidx, cistep, k );                               \
             if( k > n )                                                             \
-                CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );           \
+                CV_ERROR( cv::Error::StsBadArg, "Invalid " #param " parameter" );           \
         }                                                                           \
     }
 
@@ -208,13 +208,13 @@
     {                                                                               \
         if( !ICV_IS_MAT_OF_TYPE( param, CV_32SC1 ) )                                \
         {                                                                           \
-            CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );               \
+            CV_ERROR( cv::Error::StsBadArg, "Invalid " #param " parameter" );               \
         }                                                                           \
         else                                                                        \
         {                                                                           \
             ICV_MAT2VEC( *sampleIdx, sidx, sistep, l );                             \
             if( l > m )                                                             \
-                CV_ERROR( CV_StsBadArg, "Invalid " #param " parameter" );           \
+                CV_ERROR( cv::Error::StsBadArg, "Invalid " #param " parameter" );           \
         }                                                                           \
     }
 
diff --git a/apps/traincascade/old_ml_tree.cpp b/apps/traincascade/old_ml_tree.cpp
index 55052ed532ad..7085debd89f9 100644
--- a/apps/traincascade/old_ml_tree.cpp
+++ b/apps/traincascade/old_ml_tree.cpp
@@ -93,17 +93,17 @@ bool CvDTreeTrainData::set_params( const CvDTreeParams& _params )
     params = _params;
 
     if( params.max_categories < 2 )
-        CV_ERROR( CV_StsOutOfRange, "params.max_categories should be >= 2" );
+        CV_ERROR( cv::Error::StsOutOfRange, "params.max_categories should be >= 2" );
     params.max_categories = MIN( params.max_categories, 15 );
 
     if( params.max_depth < 0 )
-        CV_ERROR( CV_StsOutOfRange, "params.max_depth should be >= 0" );
+        CV_ERROR( cv::Error::StsOutOfRange, "params.max_depth should be >= 0" );
     params.max_depth = MIN( params.max_depth, 25 );
 
     params.min_sample_count = MAX(params.min_sample_count,1);
 
     if( params.cv_folds < 0 )
-        CV_ERROR( CV_StsOutOfRange,
+        CV_ERROR( cv::Error::StsOutOfRange,
         "params.cv_folds should be =0 (the tree is not pruned) "
         "or n>0 (tree is pruned using n-fold cross-validation)" );
 
@@ -111,7 +111,7 @@ bool CvDTreeTrainData::set_params( const CvDTreeParams& _params )
         params.cv_folds = 0;
 
     if( params.regression_accuracy < 0 )
-        CV_ERROR( CV_StsOutOfRange, "params.regression_accuracy should be >= 0" );
+        CV_ERROR( cv::Error::StsOutOfRange, "params.regression_accuracy should be >= 0" );
 
     ok = true;
 
@@ -183,7 +183,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
             cvNorm( data->var_type, var_type, CV_C ) < FLT_EPSILON &&
             cvNorm( data->cat_count, cat_count, CV_C ) < FLT_EPSILON &&
             cvNorm( data->cat_map, cat_map, CV_C ) < FLT_EPSILON) )
-            CV_ERROR( CV_StsBadArg,
+            CV_ERROR( cv::Error::StsBadArg,
             "The new training data must have the same types and the input and output variables "
             "and the same categories for categorical variables" );
 
@@ -264,7 +264,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
          CV_MAT_TYPE(_responses->type) != CV_32FC1) ||
         (_responses->rows != 1 && _responses->cols != 1) ||
         _responses->rows + _responses->cols - 1 != sample_all )
-        CV_ERROR( CV_StsBadArg, "The array of _responses must be an integer or "
+        CV_ERROR( cv::Error::StsBadArg, "The array of _responses must be an integer or "
                   "floating-point vector containing as many elements as "
                   "the total number of samples in the training data matrix" );
 
@@ -317,7 +317,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
 
     if ((uint64)effective_buf_width * (uint64)effective_buf_height != effective_buf_size)
     {
-        CV_Error(CV_StsBadArg, "The memory buffer cannot be allocated since its size exceeds integer fields limit");
+        CV_Error(cv::Error::StsBadArg, "The memory buffer cannot be allocated since its size exceeds integer fields limit");
     }
 
 
@@ -360,7 +360,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
     if( cv_n )
     {
         if( sample_count < cv_n*MAX(params.min_sample_count,10) )
-            CV_ERROR( CV_StsOutOfRange,
+            CV_ERROR( cv::Error::StsOutOfRange,
                 "The many folds in cross-validation for such a small dataset" );
 
         cv_size = cvAlign( cv_n*(sizeof(int) + sizeof(double)*2), sizeof(double) );
@@ -444,7 +444,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
                         {
                             snprintf( err, sizeof(err), "%d-th value of %d-th (categorical) "
                                 "variable is not an integer", i, vi );
-                            CV_ERROR( CV_StsBadArg, err );
+                            CV_ERROR( cv::Error::StsBadArg, err );
                         }
                     }
 
@@ -452,7 +452,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
                     {
                         snprintf( err, sizeof(err), "%d-th value of %d-th (categorical) "
                             "variable is too large", i, vi );
-                        CV_ERROR( CV_StsBadArg, err );
+                        CV_ERROR( cv::Error::StsBadArg, err );
                     }
                     num_valid++;
                 }
@@ -559,7 +559,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
                     {
                         snprintf( err, sizeof(err), "%d-th value of %d-th (ordered) "
                             "variable (=%g) is too large", i, vi, val );
-                        CV_ERROR( CV_StsBadArg, err );
+                        CV_ERROR( cv::Error::StsBadArg, err );
                     }
                     num_valid++;
                 }
@@ -652,7 +652,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
         {
             double val = have_priors ? params.priors[i] : 1.;
             if( val <= 0 )
-                CV_ERROR( CV_StsOutOfRange, "Every class weight should be positive" );
+                CV_ERROR( cv::Error::StsOutOfRange, "Every class weight should be positive" );
             priors->data.db[i] = val;
             sum += val;
         }
@@ -705,7 +705,7 @@ CvDTreeNode* CvDTreeTrainData::subsample_data( const CvMat* _subsample_idx )
     __BEGIN__;
 
     if( !data_root )
-        CV_ERROR( CV_StsError, "No training data has been set" );
+        CV_ERROR( cv::Error::StsError, "No training data has been set" );
 
     if( _subsample_idx )
     {
@@ -1396,7 +1396,7 @@ void CvDTreeTrainData::read_params( const cv::FileNode& node )
             auto tmat = cvMat( tparams_node[ "priors" ].mat() );
             priors = cvCloneMat( &tmat );
             if( !CV_IS_MAT(priors) )
-                CV_ERROR( CV_StsParseError, "priors must stored as a matrix" );
+                CV_ERROR( cv::Error::StsParseError, "priors must stored as a matrix" );
             priors_mult = cvCloneMat( priors );
         }
     }
@@ -1413,12 +1413,12 @@ void CvDTreeTrainData::read_params( const cv::FileNode& node )
             (var_idx->cols != 1 && var_idx->rows != 1) ||
             var_idx->cols + var_idx->rows - 1 != var_count ||
             CV_MAT_TYPE(var_idx->type) != CV_32SC1 )
-            CV_ERROR( CV_StsParseError,
+            CV_ERROR( cv::Error::StsParseError,
                 "var_idx (if exist) must be valid 1d integer vector containing <var_count> elements" );
 
         for( vi = 0; vi < var_count; vi++ )
             if( (unsigned)var_idx->data.i[vi] >= (unsigned)var_all )
-                CV_ERROR( CV_StsOutOfRange, "some of var_idx elements are out of range" );
+                CV_ERROR( cv::Error::StsOutOfRange, "some of var_idx elements are out of range" );
     }
 
     ////// read var type
@@ -1434,7 +1434,7 @@ void CvDTreeTrainData::read_params( const cv::FileNode& node )
     {
         if( vartype_node.empty() || !vartype_node.isSeq() ||
             vartype_node.size() != (size_t) var_count )
-            CV_ERROR( CV_StsParseError, "var_type must exist and be a sequence of 0's and 1's" );
+            CV_ERROR( cv::Error::StsParseError, "var_type must exist and be a sequence of 0's and 1's" );
 
         reader = vartype_node.begin();
 
@@ -1442,7 +1442,7 @@ void CvDTreeTrainData::read_params( const cv::FileNode& node )
         {
           cv::FileNode n = *reader;
             if( !n.isInt() || ((int) n & ~1) )
-                CV_ERROR( CV_StsParseError, "var_type must exist and be a sequence of 0's and 1's" );
+                CV_ERROR( cv::Error::StsParseError, "var_type must exist and be a sequence of 0's and 1's" );
             var_type->data.i[vi] = (int) n ? cat_var_count++ : ord_var_count--;
             reader++;
         }
@@ -1468,7 +1468,7 @@ void CvDTreeTrainData::read_params( const cv::FileNode& node )
             cat_count->cols + cat_count->rows - 1 != cat_var_count + is_classifier ||
             (cat_map->cols != 1 && cat_map->rows != 1) ||
             CV_MAT_TYPE(cat_map->type) != CV_32SC1 )
-            CV_ERROR( CV_StsParseError,
+            CV_ERROR( cv::Error::StsParseError,
             "Both cat_count and cat_map must exist and be valid 1d integer vectors of an appropriate size" );
 
         ccount = cat_var_count + is_classifier;
@@ -1481,13 +1481,13 @@ void CvDTreeTrainData::read_params( const cv::FileNode& node )
         {
             int val = cat_count->data.i[vi];
             if( val <= 0 )
-                CV_ERROR( CV_StsOutOfRange, "some of cat_count elements are out of range" );
+                CV_ERROR( cv::Error::StsOutOfRange, "some of cat_count elements are out of range" );
             max_c_count = MAX( max_c_count, val );
             cat_ofs->data.i[vi+1] = total_c_count += val;
         }
 
         if( cat_map->cols + cat_map->rows - 1 != total_c_count )
-            CV_ERROR( CV_StsBadSize,
+            CV_ERROR( cv::Error::StsBadSize,
             "cat_map vector length is not equal to the total number of categories in all categorical vars" );
     }
 
@@ -3631,13 +3631,13 @@ CvDTreeNode* CvDTree::predict( const CvMat* _sample,
     CvDTreeNode* node = root;
 
     if( !node )
-        CV_Error( CV_StsError, "The tree has not been trained yet" );
+        CV_Error( cv::Error::StsError, "The tree has not been trained yet" );
 
     if( !CV_IS_MAT(_sample) || CV_MAT_TYPE(_sample->type) != CV_32FC1 ||
         (_sample->cols != 1 && _sample->rows != 1) ||
         (_sample->cols + _sample->rows - 1 != data->var_all && !preprocessed_input) ||
         (_sample->cols + _sample->rows - 1 != data->var_count && preprocessed_input) )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
         "the input sample must be 1d floating-point vector with the same "
         "number of elements as the total number of variables used for training" );
 
@@ -3656,7 +3656,7 @@ CvDTreeNode* CvDTree::predict( const CvMat* _sample,
     {
         if( !CV_IS_MAT(_missing) || !CV_IS_MASK_ARR(_missing) ||
             !CV_ARE_SIZES_EQ(_missing, _sample) )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
         "the missing data mask must be 8-bit vector of the same size as input sample" );
         m = _missing->data.ptr;
         mstep = CV_IS_MAT_CONT(_missing->type) ? 1 : _missing->step/sizeof(m[0]);
@@ -3696,7 +3696,7 @@ CvDTreeNode* CvDTree::predict( const CvMat* _sample,
 
                         int ival = cvRound(val);
                         if( ival != val )
-                            CV_Error( CV_StsBadArg,
+                            CV_Error( cv::Error::StsBadArg,
                             "one of input categorical variable is not an integer" );
 
                         while( a < b )
@@ -3936,11 +3936,11 @@ CvDTreeSplit* CvDTree::read_split( const cv::FileNode& fnode )
     int vi, ci;
 
     if( fnode.empty() || !fnode.isMap() )
-        CV_ERROR( CV_StsParseError, "some of the splits are not stored properly" );
+        CV_ERROR( cv::Error::StsParseError, "some of the splits are not stored properly" );
 
     vi = fnode[ "var" ].empty() ? -1 : (int) fnode[ "var" ];
     if( (unsigned)vi >= (unsigned)data->var_count )
-        CV_ERROR( CV_StsOutOfRange, "Split variable index is out of range" );
+        CV_ERROR( cv::Error::StsOutOfRange, "Split variable index is out of range" );
 
     ci = data->get_var_type(vi);
     if( ci >= 0 ) // split on categorical var
@@ -3957,14 +3957,14 @@ CvDTreeSplit* CvDTree::read_split( const cv::FileNode& fnode )
         }
         if( inseq.empty() ||
             (!inseq.isSeq() && !inseq.isInt()))
-            CV_ERROR( CV_StsParseError,
+            CV_ERROR( cv::Error::StsParseError,
             "Either 'in' or 'not_in' tags should be inside a categorical split data" );
 
         if( inseq.isInt() )
         {
             val = (int) inseq;
             if( (unsigned)val >= (unsigned)n )
-                CV_ERROR( CV_StsOutOfRange, "some of in/not_in elements are out of range" );
+                CV_ERROR( cv::Error::StsOutOfRange, "some of in/not_in elements are out of range" );
 
             split->subset[val >> 5] |= 1 << (val & 31);
         }
@@ -3977,7 +3977,7 @@ CvDTreeSplit* CvDTree::read_split( const cv::FileNode& fnode )
                 cv::FileNode inode = *reader;
                 val = (int) inode;
                 if( !inode.isInt() || (unsigned)val >= (unsigned)n )
-                    CV_ERROR( CV_StsOutOfRange, "some of in/not_in elements are out of range" );
+                    CV_ERROR( cv::Error::StsOutOfRange, "some of in/not_in elements are out of range" );
 
                 split->subset[val >> 5] |= 1 << (val & 31);
                 reader++;
@@ -4025,12 +4025,12 @@ CvDTreeNode* CvDTree::read_node( const cv::FileNode& fnode, CvDTreeNode* parent
     int i, depth;
 
     if( fnode.empty() || !fnode.isMap() )
-        CV_ERROR( CV_StsParseError, "some of the tree elements are not stored properly" );
+        CV_ERROR( cv::Error::StsParseError, "some of the tree elements are not stored properly" );
 
     CV_CALL( node = data->new_node( parent, 0, 0, 0 ));
     depth = fnode[ "depth" ].empty() ? -1 : (int) fnode[ "depth" ];
     if( depth != node->depth )
-        CV_ERROR( CV_StsParseError, "incorrect node depth" );
+        CV_ERROR( cv::Error::StsParseError, "incorrect node depth" );
 
     node->sample_count = (int) fnode[ "sample_count" ];
     node->value = (double) fnode[ "value" ];
@@ -4051,7 +4051,7 @@ CvDTreeNode* CvDTree::read_node( const cv::FileNode& fnode, CvDTreeNode* parent
         CvDTreeSplit* last_split = 0;
 
         if( !splits.isSeq() )
-            CV_ERROR( CV_StsParseError, "splits tag must stored as a sequence" );
+            CV_ERROR( cv::Error::StsParseError, "splits tag must stored as a sequence" );
 
         reader = splits.begin();
         for( i = 0; i < (int) (*reader).size(); i++ )
@@ -4137,7 +4137,7 @@ void CvDTree::read( const cv::FileNode& node, CvDTreeTrainData* _data )
 
     tree_nodes = node[ "nodes" ];
     if( tree_nodes.empty() || !tree_nodes.isSeq() )
-        CV_ERROR( CV_StsParseError, "nodes tag is missing" );
+        CV_ERROR( cv::Error::StsParseError, "nodes tag is missing" );
 
     pruned_tree_idx = node[ "best_tree_idx" ].empty() ? -1 : node[ "best_tree_idx" ];
     read_tree_nodes( tree_nodes );
diff --git a/apps/visualisation/opencv_visualisation.cpp b/apps/visualisation/opencv_visualisation.cpp
index 85e9697aad61..9b7fcd9f4804 100644
--- a/apps/visualisation/opencv_visualisation.cpp
+++ b/apps/visualisation/opencv_visualisation.cpp
@@ -60,6 +60,7 @@ Created by: Puttemans Steven - April 2016
 
 #include <fstream>
 #include <iostream>
+#include <sstream>
 
 using namespace std;
 using namespace cv;
diff --git a/cmake/FindONNX.cmake b/cmake/FindONNX.cmake
index 56dd6d5098b0..0b611858240e 100644
--- a/cmake/FindONNX.cmake
+++ b/cmake/FindONNX.cmake
@@ -11,12 +11,37 @@ if(ONNXRT_ROOT_DIR)
   find_library(ORT_LIB onnxruntime
     ${ONNXRT_ROOT_DIR}/lib
     CMAKE_FIND_ROOT_PATH_BOTH)
+  # The location of headers varies across different versions of ONNX Runtime
   find_path(ORT_INCLUDE onnxruntime_cxx_api.h
+    ${ONNXRT_ROOT_DIR}/include/onnxruntime/
     ${ONNXRT_ROOT_DIR}/include/onnxruntime/core/session
     CMAKE_FIND_ROOT_PATH_BOTH)
 endif()
 
+macro(detect_onxxrt_ep filename dir have_ep_var)
+    find_path(ORT_EP_INCLUDE ${filename} ${dir} CMAKE_FIND_ROOT_PATH_BOTH)
+    if(ORT_EP_INCLUDE)
+       set(${have_ep_var} TRUE)
+    endif()
+endmacro()
+
 if(ORT_LIB AND ORT_INCLUDE)
+  # Check DirectML Execution Provider availability
+  get_filename_component(dml_dir ${ONNXRT_ROOT_DIR}/include/onnxruntime/core/providers/dml ABSOLUTE)
+  detect_onxxrt_ep(
+      dml_provider_factory.h
+      ${dml_dir}
+      HAVE_ONNX_DML
+  )
+
+  # Check CoreML Execution Provider availability
+  get_filename_component(coreml_dir ${ONNXRT_ROOT_DIR}/include/onnxruntime/core/providers/coreml ABSOLUTE)
+  detect_onxxrt_ep(
+      coreml_provider_factory.h
+      ${coreml_dir}
+      HAVE_ONNX_COREML
+  )
+
   set(HAVE_ONNX TRUE)
   # For CMake output only
   set(ONNX_LIBRARIES "${ORT_LIB}" CACHE STRING "ONNX Runtime libraries")
diff --git a/cmake/OpenCVCRTLinkage.cmake b/cmake/OpenCVCRTLinkage.cmake
index 0e0a54ecf911..c29daad0d1fc 100644
--- a/cmake/OpenCVCRTLinkage.cmake
+++ b/cmake/OpenCVCRTLinkage.cmake
@@ -33,34 +33,44 @@ endif()
 # Ignore warning: This object file does not define any previously undefined public symbols, ...
 set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} /IGNORE:4221")
 
+if(POLICY CMP0091)
+  cmake_policy(GET CMP0091 MSVC_RUNTIME_SET_BY_ABSTRACTION)
+endif()
+
 if(NOT BUILD_SHARED_LIBS AND BUILD_WITH_STATIC_CRT)
-  foreach(flag_var
-          CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-          CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
-          CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-          CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-    if(${flag_var} MATCHES "/MD")
-      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-    endif()
-    if(${flag_var} MATCHES "/MDd")
-      string(REGEX REPLACE "/MDd" "/MTd" ${flag_var} "${${flag_var}}")
-    endif()
-  endforeach(flag_var)
+  if(MSVC_RUNTIME_SET_BY_ABSTRACTION STREQUAL "NEW")
+    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
+  else()
+    foreach(flag_var
+            CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+            CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
+            CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+            CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "/MD")
+        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+      endif()
+      if(${flag_var} MATCHES "/MDd")
+        string(REGEX REPLACE "/MDd" "/MTd" ${flag_var} "${${flag_var}}")
+      endif()
+    endforeach(flag_var)
+  endif()
 
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:atlthunk.lib")
   set(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} /NODEFAULTLIB:libcmt.lib /NODEFAULTLIB:libcpmt.lib /NODEFAULTLIB:msvcrt.lib")
   set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /NODEFAULTLIB:libcmtd.lib /NODEFAULTLIB:libcpmtd.lib /NODEFAULTLIB:msvcrtd.lib")
 else()
-  foreach(flag_var
-          CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-          CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
-          CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-          CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-    if(${flag_var} MATCHES "/MT")
-      string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
-    endif()
-    if(${flag_var} MATCHES "/MTd")
-      string(REGEX REPLACE "/MTd" "/MDd" ${flag_var} "${${flag_var}}")
-    endif()
-  endforeach(flag_var)
+  if(NOT MSVC_RUNTIME_SET_BY_ABSTRACTION STREQUAL "NEW")
+    foreach(flag_var
+            CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+            CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
+            CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+            CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "/MT")
+        string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
+      endif()
+      if(${flag_var} MATCHES "/MTd")
+        string(REGEX REPLACE "/MTd" "/MDd" ${flag_var} "${${flag_var}}")
+      endif()
+    endforeach(flag_var)
+  endif()
 endif()
diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake
index 28929c389010..ff0e40c66694 100644
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@@ -49,10 +49,11 @@
 
 set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F")
 list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
-list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
+list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD NEON_FP16 NEON_BF16)
 list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
 list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
 list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
+list(APPEND CPU_ALL_OPTIMIZATIONS LSX)
 list(APPEND CPU_ALL_OPTIMIZATIONS LASX)
 list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
 
@@ -224,7 +225,7 @@ if(X86 OR X86_64)
     ocv_update(CPU_SSE2_IMPLIES "SSE")
   endif()
 
-  if(CV_ICC)
+  if(CV_ICC OR CV_ICX)
     macro(ocv_intel_compiler_optimization_option name unix_flags msvc_flags)
       ocv_update(CPU_${name}_FLAGS_NAME "${name}")
       if(MSVC)
@@ -260,7 +261,7 @@ if(X86 OR X86_64)
     ocv_intel_compiler_optimization_option(AVX512_CNL "-xCANNONLAKE" "/Qx:CANNONLAKE")
     ocv_intel_compiler_optimization_option(AVX512_CLX "-xCASCADELAKE" "/Qx:CASCADELAKE")
     ocv_intel_compiler_optimization_option(AVX512_ICL "-xICELAKE-CLIENT" "/Qx:ICELAKE-CLIENT")
-  elseif(CV_GCC OR CV_CLANG)
+  elseif(CV_GCC OR CV_CLANG OR CV_ICX)
     ocv_update(CPU_AVX2_FLAGS_ON "-mavx2")
     ocv_update(CPU_FP16_FLAGS_ON "-mf16c")
     ocv_update(CPU_AVX_FLAGS_ON "-mavx")
@@ -331,7 +332,9 @@ if(X86 OR X86_64)
 elseif(ARM OR AARCH64)
   ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp")
   ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
-  ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_dotprod.cpp")
+  ocv_update(CPU_NEON_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_fp16.cpp")
+  ocv_update(CPU_NEON_BF16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_bf16.cpp")
+  ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon_dotprod.cpp")
   if(NOT AARCH64)
     ocv_update(CPU_KNOWN_OPTIMIZATIONS "VFPV3;NEON;FP16")
     if(NOT MSVC)
@@ -343,12 +346,23 @@ elseif(ARM OR AARCH64)
     endif()
     ocv_update(CPU_FP16_IMPLIES "NEON")
   else()
-    ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD")
+    ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD;NEON_FP16;NEON_BF16")
     ocv_update(CPU_NEON_FLAGS_ON "")
     ocv_update(CPU_FP16_IMPLIES "NEON")
-    ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod")
     ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON")
+    ocv_update(CPU_NEON_FP16_IMPLIES "NEON")
+    ocv_update(CPU_NEON_BF16_IMPLIES "NEON")
+    if(MSVC)
+      ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "")
+      ocv_update(CPU_NEON_FP16_FLAGS_ON "")
+      ocv_update(CPU_NEON_BF16_FLAGS_ON "")
+    else()
+      ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod")
+      ocv_update(CPU_NEON_FP16_FLAGS_ON "-march=armv8.2-a+fp16")
+      ocv_update(CPU_NEON_BF16_FLAGS_ON "-march=armv8.2-a+bf16")
+    endif()
     set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
+    set(CPU_DISPATCH "NEON_FP16;NEON_BF16;NEON_DOTPROD" CACHE STRING "${HELP_CPU_DISPATCH}")
   endif()
 elseif(MIPS)
   ocv_update(CPU_MSA_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_msa.cpp")
@@ -390,10 +404,13 @@ elseif(RISCV)
   set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}")
 
 elseif(LOONGARCH64)
+  ocv_update(CPU_LSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lsx.cpp")
   ocv_update(CPU_LASX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_lasx.cpp")
-  ocv_update(CPU_KNOWN_OPTIMIZATIONS "LASX")
+  ocv_update(CPU_KNOWN_OPTIMIZATIONS "LSX;LASX")
+  ocv_update(CPU_LSX_FLAGS_ON "-mlsx")
   ocv_update(CPU_LASX_FLAGS_ON "-mlasx")
-  set(CPU_BASELINE "LASX" CACHE STRING "${HELP_CPU_BASELINE}")
+  set(CPU_BASELINE "LSX" CACHE STRING "${HELP_CPU_BASELINE}")
+  set(CPU_DISPATCH "LASX" CACHE STRING "${HELP_CPU_DISPATCH}")
 
 endif()
 
@@ -466,6 +483,23 @@ macro(ocv_check_compiler_optimization OPT)
   endif()
 endmacro()
 
+macro(ocv_cpu_aarch64_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRING COMMON_OPTION)
+  unset(_POSTFIX)
+  # Check each feature option
+  foreach(OPT IN LISTS ${FEATURE_NAME_LIST})
+    string(FIND "${${FLAG_STRING}}" "${CPU_${OPT}_FLAGS_ON}" OPT_FOUND)
+    if(NOT ${OPT_FOUND} EQUAL -1)
+      string(REPLACE "${COMMON_OPTION}" "" TRAILING_PART "${CPU_${OPT}_FLAGS_ON}")
+      string(APPEND _POSTFIX "${TRAILING_PART}")
+      string(REPLACE " ${CPU_${OPT}_FLAGS_ON}" "" ${FLAG_STRING} ${${FLAG_STRING}})
+    endif()
+  endforeach()
+  # If more than one option found, merge them
+  if(NOT "x${_POSTFIX}" STREQUAL "x")
+    set(${FLAG_STRING} "${${FLAG_STRING}} ${COMMON_OPTION}${_POSTFIX}")
+  endif()
+endmacro()
+
 foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
   set(CPU_${OPT}_USAGE_COUNT 0 CACHE INTERNAL "")
   if("${CPU_${OPT}_FLAGS_ON}" STREQUAL "disabled")
@@ -559,6 +593,15 @@ foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
   endif()
 endforeach()
 
+if(AARCH64)
+  if(NOT MSVC)
+    # Define the list of NEON options to check
+    set(NEON_OPTIONS_LIST NEON_DOTPROD NEON_FP16 NEON_BF16)
+    set(BASE_ARCHITECTURE "-march=armv8.2-a")
+    ocv_cpu_aarch64_baseline_merge_feature_options(NEON_OPTIONS_LIST CPU_BASELINE_FLAGS ${BASE_ARCHITECTURE})
+  endif()
+endif()
+
 foreach(OPT ${CPU_BASELINE_REQUIRE})
   if(NOT ";${CPU_BASELINE_FINAL};" MATCHES ";${OPT};")
     message(SEND_ERROR "Required baseline optimization is not supported: ${OPT} (CPU_BASELINE_REQUIRE=${CPU_BASELINE_REQUIRE})")
@@ -657,7 +700,7 @@ macro(ocv_compiler_optimization_options)
 endmacro()
 
 macro(ocv_compiler_optimization_options_finalize)
-  if((CV_GCC OR CV_CLANG) AND (X86 OR X86_64))
+  if((CV_GCC OR CV_CLANG OR CV_ICX) AND (X86 OR X86_64))
     if(NOT APPLE AND CMAKE_SIZEOF_VOID_P EQUAL 4)
       if(OPENCV_EXTRA_CXX_FLAGS MATCHES "-m(sse2|avx)")
         add_extra_compiler_option(-mfpmath=sse) # !! important - be on the same wave with x64 compilers
@@ -944,7 +987,7 @@ macro(ocv_add_dispatched_file_force_all)
 endmacro()
 
 
-if(CV_DISABLE_OPTIMIZATION OR CV_ICC)
+if(CV_DISABLE_OPTIMIZATION OR CV_ICC OR CX_ICX)
   ocv_update(CV_ENABLE_UNROLLED 0)
 else()
   ocv_update(CV_ENABLE_UNROLLED 1)
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index 3f3358aae571..f23bb13dc5fc 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -1,13 +1,6 @@
 if("${CMAKE_CXX_COMPILER};${CMAKE_C_COMPILER};${CMAKE_CXX_COMPILER_LAUNCHER}" MATCHES "ccache")
-  set(CMAKE_COMPILER_IS_CCACHE 1)  # TODO: FIXIT Avoid setting of CMAKE_ variables
   set(OPENCV_COMPILER_IS_CCACHE 1)
 endif()
-function(access_CMAKE_COMPILER_IS_CCACHE)
-  if(NOT OPENCV_SUPPRESS_DEPRECATIONS)
-    message(WARNING "DEPRECATED: CMAKE_COMPILER_IS_CCACHE is replaced to OPENCV_COMPILER_IS_CCACHE.")
-  endif()
-endfunction()
-variable_watch(CMAKE_COMPILER_IS_CCACHE access_CMAKE_COMPILER_IS_CCACHE)
 if(ENABLE_CCACHE AND NOT OPENCV_COMPILER_IS_CCACHE)
   # This works fine with Unix Makefiles and Ninja generators
   find_host_program(CCACHE_PROGRAM ccache)
@@ -105,13 +98,27 @@ elseif(CV_ICC)
       add_extra_compiler_option("-fp-model precise")
     endif()
   endif()
+elseif(CV_ICX)
+  # ICX uses -ffast-math by default.
+  # use own flags, if no one of the flags provided by user: -fp-model, -ffast-math -fno-fast-math
+  if(NOT " ${CMAKE_CXX_FLAGS} ${OPENCV_EXTRA_FLAGS} ${OPENCV_EXTRA_CXX_FLAGS}" MATCHES " /fp:"
+      AND NOT " ${CMAKE_CXX_FLAGS} ${OPENCV_EXTRA_FLAGS} ${OPENCV_EXTRA_CXX_FLAGS}" MATCHES " -fp-model"
+      AND NOT " ${CMAKE_CXX_FLAGS} ${OPENCV_EXTRA_FLAGS} ${OPENCV_EXTRA_CXX_FLAGS}" MATCHES " -ffast-math"
+      AND NOT " ${CMAKE_CXX_FLAGS} ${OPENCV_EXTRA_FLAGS} ${OPENCV_EXTRA_CXX_FLAGS}" MATCHES " -fno-fast-math"
+  )
+    if(NOT ENABLE_FAST_MATH)
+      add_extra_compiler_option(-fno-fast-math)
+      add_extra_compiler_option(-fp-model=precise)
+    endif()
+  endif()
 elseif(CV_GCC OR CV_CLANG)
   if(ENABLE_FAST_MATH)
     add_extra_compiler_option(-ffast-math)
+    add_extra_compiler_option(-fno-finite-math-only)
   endif()
 endif()
 
-if(CV_GCC OR CV_CLANG)
+if(CV_GCC OR CV_CLANG OR CV_ICX)
   # High level of warnings.
   add_extra_compiler_option(-W)
   if (NOT MSVC)
@@ -260,7 +267,11 @@ if(CV_GCC OR CV_CLANG)
   endif()
 
   if(ENABLE_LTO)
-    add_extra_compiler_option(-flto)
+    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12)
+      add_extra_compiler_option(-flto=auto)
+    else()
+      add_extra_compiler_option(-flto)
+    endif()
   endif()
   if(ENABLE_THIN_LTO)
     add_extra_compiler_option(-flto=thin)
@@ -331,7 +342,7 @@ if(COMMAND ocv_compiler_optimization_options_finalize)
 endif()
 
 # set default visibility to hidden
-if((CV_GCC OR CV_CLANG)
+if((CV_GCC OR CV_CLANG OR CV_ICX)
     AND NOT MSVC
     AND NOT OPENCV_SKIP_VISIBILITY_HIDDEN
     AND NOT " ${CMAKE_CXX_FLAGS} ${OPENCV_EXTRA_FLAGS} ${OPENCV_EXTRA_CXX_FLAGS}" MATCHES " -fvisibility")
@@ -373,7 +384,7 @@ endif()
 
 # Apply "-Wl,--no-undefined" linker flags: https://github.com/opencv/opencv/pull/21347
 if(NOT OPENCV_SKIP_LINK_NO_UNDEFINED)
-  if(UNIX AND (NOT APPLE OR NOT CMAKE_VERSION VERSION_LESS "3.2"))
+  if(UNIX AND ((NOT APPLE OR NOT CMAKE_VERSION VERSION_LESS "3.2") AND NOT CMAKE_SYSTEM_NAME MATCHES "OpenBSD"))
     set(_option "-Wl,--no-undefined")
     set(_saved_CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${_option}")  # requires CMake 3.2+ and CMP0056
diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake
index 4a562bdaf94c..06998400d713 100644
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -1,10 +1,10 @@
 if((WIN32 AND NOT MSVC) OR OPENCV_CMAKE_FORCE_CUDA)
-  message(STATUS "CUDA compilation is disabled (due to only Visual Studio compiler supported on your platform).")
+  message(STATUS "CUDA: Compilation is disabled (due to only Visual Studio compiler supported on your platform).")
   return()
 endif()
 
 if((NOT UNIX AND CV_CLANG) OR OPENCV_CMAKE_FORCE_CUDA)
-  message(STATUS "CUDA compilation is disabled (due to Clang unsupported on your platform).")
+  message(STATUS "CUDA: Compilation is disabled (due to Clang unsupported on your platform).")
   return()
 endif()
 
@@ -31,434 +31,143 @@ else()
   list(REMOVE_AT CMAKE_MODULE_PATH 0)
 endif()
 
-if(CUDA_FOUND)
-  unset(CUDA_nvcuvenc_LIBRARY CACHE)
-  set(HAVE_CUDA 1)
-  if(NOT CUDA_VERSION VERSION_LESS 11.0)
-    # CUDA 11.0 removes nppicom
-    ocv_list_filterout(CUDA_nppi_LIBRARY "nppicom")
-    ocv_list_filterout(CUDA_npp_LIBRARY "nppicom")
-  endif()
-
-  if(WITH_CUFFT)
-    set(HAVE_CUFFT 1)
-  endif()
-
-  if(WITH_CUBLAS)
-    set(HAVE_CUBLAS 1)
-  endif()
-
-  if(WITH_CUDNN)
-      set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
-      find_host_package(CUDNN "${MIN_VER_CUDNN}")
-      list(REMOVE_AT CMAKE_MODULE_PATH 0)
-
-      if(CUDNN_FOUND)
-        set(HAVE_CUDNN 1)
-      endif()
-  endif()
+if(NOT CUDA_FOUND)
+  unset(CUDA_ARCH_BIN CACHE)
+  unset(CUDA_ARCH_PTX CACHE)
+  return()
+endif()
 
-  if(WITH_NVCUVID OR WITH_NVCUVENC)
-    macro(ocv_cuda_SEARCH_NVCUVID_HEADER _filename _result)
-      # place header file under CUDA_TOOLKIT_TARGET_DIR or CUDA_TOOLKIT_ROOT_DIR
-      find_path(_header_result
-        ${_filename}
-        PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}"
-        ENV CUDA_PATH
-        ENV CUDA_INC_PATH
-        PATH_SUFFIXES include
-        NO_DEFAULT_PATH
-        )
-      if("x${_header_result}" STREQUAL "x_header_result-NOTFOUND")
-        set(${_result} 0)
-      else()
-        set(${_result} 1)
-      endif()
-      unset(_header_result CACHE)
-    endmacro()
-    if(WITH_NVCUVID)
-      ocv_cuda_SEARCH_NVCUVID_HEADER("nvcuvid.h" HAVE_NVCUVID_HEADER)
-      ocv_cuda_SEARCH_NVCUVID_HEADER("dynlink_nvcuvid.h" HAVE_DYNLINK_NVCUVID_HEADER)
-      find_cuda_helper_libs(nvcuvid)
-      if(CUDA_nvcuvid_LIBRARY AND (${HAVE_NVCUVID_HEADER} OR ${HAVE_DYNLINK_NVCUVID_HEADER}))
-        # make sure to have both header and library before enabling
-        set(HAVE_NVCUVID 1)
-      endif()
-    endif()
-    if(WITH_NVCUVENC)
-      ocv_cuda_SEARCH_NVCUVID_HEADER("nvEncodeAPI.h" HAVE_NVCUVENC_HEADER)
-      if(WIN32)
-        find_cuda_helper_libs(nvencodeapi)
-      else()
-        find_cuda_helper_libs(nvidia-encode)
-      endif()
-      if((CUDA_nvencodeapi_LIBRARY OR CUDA_nvidia-encode_LIBRARY) AND ${HAVE_NVCUVENC_HEADER})
-        set(HAVE_NVCUVENC 1)
-      endif()
-    endif()
-  endif()
+unset(CUDA_nvcuvenc_LIBRARY CACHE)
+set(HAVE_CUDA 1)
+if(NOT CUDA_VERSION VERSION_LESS 11.0)
+  # CUDA 11.0 removes nppicom
+  ocv_list_filterout(CUDA_nppi_LIBRARY "nppicom")
+  ocv_list_filterout(CUDA_npp_LIBRARY "nppicom")
+endif()
 
-  message(STATUS "CUDA detected: " ${CUDA_VERSION})
+if(WITH_CUFFT)
+  set(HAVE_CUFFT 1)
+endif()
 
-  OCV_OPTION(CUDA_ENABLE_DEPRECATED_GENERATION "Enable deprecated generations in the list" OFF)
-  set(_generations "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "Lovelace" "Hopper")
-  if(CUDA_ENABLE_DEPRECATED_GENERATION)
-    set(_generations "Fermi" "${_generations}")
-    set(_generations "Kepler" "${_generations}")
-  endif()
-  set(_arch_fermi    "2.0")
-  set(_arch_kepler   "3.0;3.5;3.7")
-  set(_arch_maxwell  "5.0;5.2")
-  set(_arch_pascal   "6.0;6.1")
-  set(_arch_volta    "7.0")
-  set(_arch_turing   "7.5")
-  set(_arch_ampere   "8.0;8.6")
-  set(_arch_lovelace "8.9")
-  set(_arch_hopper   "9.0")
-  if(NOT CMAKE_CROSSCOMPILING)
-    list(APPEND _generations "Auto")
-  endif()
-  set(CUDA_GENERATION "" CACHE STRING "Build CUDA device code only for specific GPU architecture. Leave empty to build for all architectures.")
-  if( CMAKE_VERSION VERSION_GREATER "2.8" )
-    set_property( CACHE CUDA_GENERATION PROPERTY STRINGS "" ${_generations} )
-  endif()
+if(WITH_CUBLAS)
+  set(HAVE_CUBLAS 1)
+endif()
 
-  if(CUDA_GENERATION)
-    if(NOT ";${_generations};" MATCHES ";${CUDA_GENERATION};")
-      string(REPLACE ";" ", " _generations "${_generations}")
-      message(FATAL_ERROR "ERROR: ${_generations} Generations are supported.")
-    endif()
-    unset(CUDA_ARCH_BIN CACHE)
-    unset(CUDA_ARCH_PTX CACHE)
-  endif()
+if(WITH_CUDNN)
+    set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
+    find_host_package(CUDNN "${MIN_VER_CUDNN}")
+    list(REMOVE_AT CMAKE_MODULE_PATH 0)
 
-  if(OPENCV_CUDA_DETECTION_NVCC_FLAGS MATCHES "-ccbin")
-    # already specified by user
-  elseif(CUDA_HOST_COMPILER AND EXISTS "${CUDA_HOST_COMPILER}")
-    get_filename_component(c_compiler_realpath "${CMAKE_C_COMPILER}" REALPATH)
-    # C compiler doesn't work with --run option, forcing C++ compiler instead
-    if(CUDA_HOST_COMPILER STREQUAL c_compiler_realpath OR CUDA_HOST_COMPILER STREQUAL CMAKE_C_COMPILER)
-      if(DEFINED CMAKE_CXX_COMPILER)
-        get_filename_component(cxx_compiler_realpath "${CMAKE_CXX_COMPILER}" REALPATH)
-        LIST(APPEND OPENCV_CUDA_DETECTION_NVCC_FLAGS -ccbin "${cxx_compiler_realpath}")
-      else()
-        message(STATUS "CUDA: CMAKE_CXX_COMPILER is not available. You may need to specify CUDA_HOST_COMPILER.")
-      endif()
-    else()
-      LIST(APPEND OPENCV_CUDA_DETECTION_NVCC_FLAGS -ccbin "${CUDA_HOST_COMPILER}")
-    endif()
-  elseif(WIN32 AND CMAKE_LINKER) # Workaround for VS cl.exe not being in the env. path
-    get_filename_component(host_compiler_bindir ${CMAKE_LINKER} DIRECTORY)
-    LIST(APPEND OPENCV_CUDA_DETECTION_NVCC_FLAGS -ccbin "${host_compiler_bindir}")
-  else()
-    if(CUDA_HOST_COMPILER)
-      message(STATUS "CUDA: CUDA_HOST_COMPILER='${CUDA_HOST_COMPILER}' is not valid, autodetection may not work. Specify OPENCV_CUDA_DETECTION_NVCC_FLAGS with -ccbin option for fix that")
+    if(CUDNN_FOUND)
+      set(HAVE_CUDNN 1)
     endif()
-  endif()
+endif()
 
-  macro(ocv_filter_available_architecture result_list)
-    set(__cache_key_check "${ARGN} : ${CUDA_NVCC_EXECUTABLE} ${OPENCV_CUDA_DETECTION_NVCC_FLAGS}")
-    if(DEFINED OPENCV_CACHE_CUDA_SUPPORTED_CC AND OPENCV_CACHE_CUDA_SUPPORTED_CC_check STREQUAL __cache_key_check)
-      set(${result_list} "${OPENCV_CACHE_CUDA_SUPPORTED_CC}")
-    else()
-      set(CC_LIST ${ARGN})
-      foreach(target_arch ${CC_LIST})
-        string(REPLACE "." "" target_arch_short "${target_arch}")
-        set(NVCC_OPTION "-gencode;arch=compute_${target_arch_short},code=sm_${target_arch_short}")
-        set(_cmd "${CUDA_NVCC_EXECUTABLE}" ${OPENCV_CUDA_DETECTION_NVCC_FLAGS} ${NVCC_OPTION} "${OpenCV_SOURCE_DIR}/cmake/checks/OpenCVDetectCudaArch.cu" --compile)
-        execute_process(
-            COMMAND ${_cmd}
-            WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/"
-            RESULT_VARIABLE _nvcc_res
-            OUTPUT_VARIABLE _nvcc_out
-            ERROR_VARIABLE _nvcc_err
-            #ERROR_QUIET
-            OUTPUT_STRIP_TRAILING_WHITESPACE
-        )
-        if(OPENCV_CMAKE_CUDA_DEBUG)
-          message(WARNING "COMMAND: ${_cmd}")
-          message(STATUS "Result: ${_nvcc_res}")
-          message(STATUS "Out: ${_nvcc_out}")
-          message(STATUS "Err: ${_nvcc_err}")
-        endif()
-        if(_nvcc_res EQUAL 0)
-          LIST(APPEND ${result_list} "${target_arch}")
-        endif()
-      endforeach()
-      string(STRIP "${${result_list}}" ${result_list})
-      if(" ${${result_list}}" STREQUAL " ")
-        message(WARNING "CUDA: Autodetection arch list is empty. Please enable OPENCV_CMAKE_CUDA_DEBUG=1 and check/specify OPENCV_CUDA_DETECTION_NVCC_FLAGS variable")
-      endif()
+include(cmake/OpenCVDetectCUDAUtils.cmake)
 
-      # cache detected values
-      set(OPENCV_CACHE_CUDA_SUPPORTED_CC ${${result_list}} CACHE INTERNAL "")
-      set(OPENCV_CACHE_CUDA_SUPPORTED_CC_check "${__cache_key_check}" CACHE INTERNAL "")
-    endif()
-  endmacro()
-
-  macro(ocv_detect_native_cuda_arch status output)
-    set(OPENCV_CUDA_DETECT_ARCHS_COMMAND "${CUDA_NVCC_EXECUTABLE}" ${OPENCV_CUDA_DETECTION_NVCC_FLAGS} "${OpenCV_SOURCE_DIR}/cmake/checks/OpenCVDetectCudaArch.cu" "--run")
-    set(__cache_key_check "${OPENCV_CUDA_DETECT_ARCHS_COMMAND}")
-    if(DEFINED OPENCV_CACHE_CUDA_ACTIVE_CC AND OPENCV_CACHE_CUDA_ACTIVE_CC_check STREQUAL __cache_key_check)
-      set(${output} "${OPENCV_CACHE_CUDA_ACTIVE_CC}")
-      set(${status} 0)
-    else()
-      execute_process(
-          COMMAND ${OPENCV_CUDA_DETECT_ARCHS_COMMAND}
-          WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/"
-          RESULT_VARIABLE ${status}
-          OUTPUT_VARIABLE _nvcc_out
-          ERROR_VARIABLE _nvcc_err
-          ERROR_QUIET
-          OUTPUT_STRIP_TRAILING_WHITESPACE
-      )
-      if(OPENCV_CMAKE_CUDA_DEBUG)
-        message(WARNING "COMMAND: ${OPENCV_CUDA_DETECT_ARCHS_COMMAND}")
-        message(STATUS "Result: ${${status}}")
-        message(STATUS "Out: ${_nvcc_out}")
-        message(STATUS "Err: ${_nvcc_err}")
-      endif()
-      string(REGEX REPLACE ".*\n" "" ${output} "${_nvcc_out}") #Strip leading warning messages, if any
-
-      if(${status} EQUAL 0)
-        # cache detected values
-        set(OPENCV_CACHE_CUDA_ACTIVE_CC ${${output}} CACHE INTERNAL "")
-        set(OPENCV_CACHE_CUDA_ACTIVE_CC_check "${__cache_key_check}" CACHE INTERNAL "")
-      endif()
-    endif()
-  endmacro()
-
-  set(__cuda_arch_ptx "")
-  if(CUDA_GENERATION STREQUAL "Fermi")
-    set(__cuda_arch_bin ${_arch_fermi})
-  elseif(CUDA_GENERATION STREQUAL "Kepler")
-    set(__cuda_arch_bin ${_arch_kepler})
-  elseif(CUDA_GENERATION STREQUAL "Maxwell")
-    set(__cuda_arch_bin ${_arch_maxwell})
-  elseif(CUDA_GENERATION STREQUAL "Pascal")
-    set(__cuda_arch_bin ${_arch_pascal})
-  elseif(CUDA_GENERATION STREQUAL "Volta")
-    set(__cuda_arch_bin ${_arch_volta})
-  elseif(CUDA_GENERATION STREQUAL "Turing")
-    set(__cuda_arch_bin ${_arch_turing})
-  elseif(CUDA_GENERATION STREQUAL "Ampere")
-    set(__cuda_arch_bin ${_arch_ampere})
-  elseif(CUDA_GENERATION STREQUAL "Lovelace")
-    set(__cuda_arch_bin ${_arch_lovelace})
-  elseif(CUDA_GENERATION STREQUAL "Hopper")
-    set(__cuda_arch_bin ${_arch_hopper})
-  elseif(CUDA_GENERATION STREQUAL "Auto")
-    ocv_detect_native_cuda_arch(_nvcc_res _nvcc_out)
-    if(NOT _nvcc_res EQUAL 0)
-      message(STATUS "Automatic detection of CUDA generation failed. Going to build for all known architectures.")
-    else()
-      string(REGEX MATCHALL "[0-9]+\\.[0-9]" __cuda_arch_bin "${_nvcc_out}")
-    endif()
-  elseif(CUDA_ARCH_BIN)
-    message(STATUS "CUDA: Using CUDA_ARCH_BIN=${CUDA_ARCH_BIN}")
-    set(__cuda_arch_bin ${CUDA_ARCH_BIN})
-  endif()
+if(WITH_NVCUVID OR WITH_NVCUVENC)
+  set(cuda_toolkit_dirs "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}")
+  ocv_check_for_nvidia_video_codec_sdk("${cuda_toolkit_dirs}")
+endif()
 
-  if(NOT DEFINED __cuda_arch_bin)
-    if(ARM)
-      set(__cuda_arch_bin "3.2")
-      set(__cuda_arch_ptx "")
-    elseif(AARCH64)
-      if(NOT CMAKE_CROSSCOMPILING)
-        ocv_detect_native_cuda_arch(_nvcc_res _nvcc_out)
-      else()
-        set(_nvcc_res -1)  # emulate error, see below
-      endif()
-      if(NOT _nvcc_res EQUAL 0)
-        message(STATUS "Automatic detection of CUDA generation failed. Going to build for all known architectures.")
-        # TX1 (5.3) TX2 (6.2) Xavier (7.2) V100 (7.0) Orin (8.7)
-        ocv_filter_available_architecture(__cuda_arch_bin
-            5.3
-            6.2
-            7.2
-            7.0
-            8.7
-        )
-      else()
-        set(__cuda_arch_bin "${_nvcc_out}")
-      endif()
-      set(__cuda_arch_ptx "")
-    else()
-      ocv_filter_available_architecture(__cuda_arch_bin
-          ${_arch_fermi}
-          ${_arch_kepler}
-          ${_arch_maxwell}
-          ${_arch_pascal}
-          ${_arch_volta}
-          ${_arch_turing}
-          ${_arch_ampere}
-          ${_arch_lovelace}
-          ${_arch_hopper}
-      )
-    endif()
-  endif()
+message(STATUS "CUDA detected: " ${CUDA_VERSION})
 
-  set(CUDA_ARCH_BIN ${__cuda_arch_bin} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
-  set(CUDA_ARCH_PTX ${__cuda_arch_ptx} CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
+ocv_set_cuda_detection_nvcc_flags(CUDA_HOST_COMPILER)
+ocv_set_cuda_arch_bin_and_ptx(${CUDA_NVCC_EXECUTABLE})
 
-  string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
-  string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}")
+# NVCC flags to be set
+set(NVCC_FLAGS_EXTRA "")
 
-  # Check if user specified 1.0/2.1 compute capability: we don't support it
-  macro(ocv_wipeout_deprecated_cc target_cc)
-    if(" ${CUDA_ARCH_BIN} ${CUDA_ARCH_PTX}" MATCHES " ${target_cc}")
-      message(SEND_ERROR "CUDA: ${target_cc} compute capability is not supported - exclude it from ARCH/PTX list and re-run CMake")
-    endif()
-  endmacro()
-  ocv_wipeout_deprecated_cc("1.0")
-  ocv_wipeout_deprecated_cc("2.1")
-
-  # NVCC flags to be set
-  set(NVCC_FLAGS_EXTRA "")
-
-  # These vars will be passed into the templates
-  set(OPENCV_CUDA_ARCH_BIN "")
-  set(OPENCV_CUDA_ARCH_PTX "")
-  set(OPENCV_CUDA_ARCH_FEATURES "")
-
-  # Tell NVCC to add binaries for the specified GPUs
-  string(REGEX MATCHALL "[0-9()]+" ARCH_LIST "${ARCH_BIN_NO_POINTS}")
-  foreach(ARCH IN LISTS ARCH_LIST)
-    if(ARCH MATCHES "([0-9]+)\\(([0-9]+)\\)")
-      # User explicitly specified PTX for the concrete BIN
-      set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
-      set(OPENCV_CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN} ${CMAKE_MATCH_1}")
-      set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${CMAKE_MATCH_2}")
-    else()
-      # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
-      set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=sm_${ARCH})
-      set(OPENCV_CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN} ${ARCH}")
-      set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
-    endif()
-  endforeach()
-  set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -D_FORCE_INLINES)
+# These vars will be passed into the templates
+set(OPENCV_CUDA_ARCH_BIN "")
+set(OPENCV_CUDA_ARCH_PTX "")
+set(OPENCV_CUDA_ARCH_FEATURES "")
 
-  # Tell NVCC to add PTX intermediate code for the specified architectures
-  string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_NO_POINTS}")
-  foreach(ARCH IN LISTS ARCH_LIST)
-    set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=compute_${ARCH})
-    set(OPENCV_CUDA_ARCH_PTX "${OPENCV_CUDA_ARCH_PTX} ${ARCH}")
+# Tell NVCC to add binaries for the specified GPUs
+string(REGEX MATCHALL "[0-9()]+" ARCH_LIST "${ARCH_BIN_NO_POINTS}")
+foreach(ARCH IN LISTS ARCH_LIST)
+  if(ARCH MATCHES "([0-9]+)\\(([0-9]+)\\)")
+    # User explicitly specified PTX for the concrete BIN
+    set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
+    set(OPENCV_CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN} ${CMAKE_MATCH_1}")
+    set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${CMAKE_MATCH_2}")
+  else()
+    # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
+    set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=sm_${ARCH})
+    set(OPENCV_CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN} ${ARCH}")
     set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
-  endforeach()
-
-  # These vars will be processed in other scripts
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA})
-  set(OpenCV_CUDA_CC "${NVCC_FLAGS_EXTRA}")
-
-  if(ANDROID)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xptxas;-dlcm=ca")
   endif()
+endforeach()
+set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -D_FORCE_INLINES)
 
-  message(STATUS "CUDA NVCC target flags: ${CUDA_NVCC_FLAGS}")
-
-  OCV_OPTION(CUDA_FAST_MATH "Enable --use_fast_math for CUDA compiler " OFF)
-
-  if(CUDA_FAST_MATH)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math)
-  endif()
+# Tell NVCC to add PTX intermediate code for the specified architectures
+string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_NO_POINTS}")
+foreach(ARCH IN LISTS ARCH_LIST)
+  set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -gencode arch=compute_${ARCH},code=compute_${ARCH})
+  set(OPENCV_CUDA_ARCH_PTX "${OPENCV_CUDA_ARCH_PTX} ${ARCH}")
+  set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
+endforeach()
 
-  OCV_OPTION(CUDA_ENABLE_DELAYLOAD "Enable delayed loading of CUDA DLLs" OFF VISIBLE_IF MSVC)
+# These vars will be processed in other scripts
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA})
+set(OpenCV_CUDA_CC "${NVCC_FLAGS_EXTRA}")
 
-  mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR)
-
-  macro(ocv_cuda_filter_options)
-    foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
-      set(${var}_backup_in_cuda_compile_ "${${var}}")
-
-      if (CV_CLANG)
-        # we remove -Winconsistent-missing-override and -Qunused-arguments
-        # just in case we are compiling CUDA with gcc but OpenCV with clang
-        string(REPLACE "-Winconsistent-missing-override" "" ${var} "${${var}}")
-        string(REPLACE "-Qunused-arguments" "" ${var} "${${var}}")
-      endif()
-
-      # we remove /EHa as it generates warnings under windows
-      string(REPLACE "/EHa" "" ${var} "${${var}}")
-
-      # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
-      string(REPLACE "-ggdb3" "" ${var} "${${var}}")
-
-      # we remove -Wsign-promo as it generates warnings under linux
-      string(REPLACE "-Wsign-promo" "" ${var} "${${var}}")
-
-      # we remove -Wno-sign-promo as it generates warnings under linux
-      string(REPLACE "-Wno-sign-promo" "" ${var} "${${var}}")
-
-      # we remove -Wno-delete-non-virtual-dtor because it's used for C++ compiler
-      # but NVCC uses C compiler by default
-      string(REPLACE "-Wno-delete-non-virtual-dtor" "" ${var} "${${var}}")
-
-      # we remove -frtti because it's used for C++ compiler
-      # but NVCC uses C compiler by default
-      string(REPLACE "-frtti" "" ${var} "${${var}}")
+if(ANDROID)
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xptxas;-dlcm=ca")
+endif()
 
-      string(REPLACE "-fvisibility-inlines-hidden" "" ${var} "${${var}}")
+ocv_set_nvcc_threads_for_vs()
 
-      # cc1: warning: command line option '-Wsuggest-override' is valid for C++/ObjC++ but not for C
-      string(REPLACE "-Wsuggest-override" "" ${var} "${${var}}")
+message(STATUS "CUDA: NVCC target flags ${CUDA_NVCC_FLAGS}")
 
-      # issue: #11552 (from OpenCVCompilerOptions.cmake)
-      string(REGEX REPLACE "-Wimplicit-fallthrough(=[0-9]+)? " "" ${var} "${${var}}")
+OCV_OPTION(CUDA_FAST_MATH "Enable --use_fast_math for CUDA compiler " OFF)
 
-      # removal of custom specified options
-      if(OPENCV_CUDA_NVCC_FILTEROUT_OPTIONS)
-        foreach(__flag ${OPENCV_CUDA_NVCC_FILTEROUT_OPTIONS})
-          string(REPLACE "${__flag}" "" ${var} "${${var}}")
-        endforeach()
-      endif()
-    endforeach()
-  endmacro()
+if(CUDA_FAST_MATH)
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math)
+endif()
 
-  macro(ocv_cuda_compile VAR)
-    ocv_cuda_filter_options()
+OCV_OPTION(CUDA_ENABLE_DELAYLOAD "Enable delayed loading of CUDA DLLs" OFF VISIBLE_IF MSVC)
 
-    if(BUILD_SHARED_LIBS)
-      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -DCVAPI_EXPORTS)
-    endif()
+mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR)
 
-    if(UNIX OR APPLE)
-      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC)
-      if(NOT " ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS_DEBUG} ${CUDA_NVCC_FLAGS}" MATCHES "-std=")
-        if(CUDA_VERSION VERSION_LESS "11.0")
-          list(APPEND CUDA_NVCC_FLAGS "--std=c++11")
-        else()
-          list(APPEND CUDA_NVCC_FLAGS "--std=c++14")
-        endif()
+macro(ocv_check_windows_crt_linkage)
+  # The new MSVC runtime abstraction is only useable if CUDA is a first class language
+  if(WIN32 AND POLICY CMP0091)
+    cmake_policy(GET CMP0091 MSVC_RUNTIME_SET_BY_ABSTRACTION)
+    if(MSVC_RUNTIME_SET_BY_ABSTRACTION STREQUAL "NEW")
+      if(NOT BUILD_SHARED_LIBS AND BUILD_WITH_STATIC_CRT)
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
+        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
+      else()
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MD")
+        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MDd")
       endif()
     endif()
-    if(APPLE)
-      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
-    endif()
-
-    if(WIN32 AND NOT (CUDA_VERSION VERSION_LESS "11.2"))
-      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcudafe --display_error_number --diag-suppress 1394,1388)
-    endif()
+  endif()
+endmacro()
 
-    if(CMAKE_CROSSCOMPILING AND (ARM OR AARCH64))
-      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xlinker --unresolved-symbols=ignore-in-shared-libs)
-    endif()
+macro(ocv_cuda_compile VAR)
+  ocv_cuda_filter_options()
+  ocv_check_windows_crt_linkage()
+  ocv_nvcc_flags()
 
-    # disabled because of multiple warnings during building nvcc auto generated files
-    if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.6.0")
-      ocv_warnings_disable(CMAKE_CXX_FLAGS -Wunused-but-set-variable)
+  if(UNIX OR APPLE)
+    if(NOT " ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS_DEBUG} ${CUDA_NVCC_FLAGS}" MATCHES "-std=")
+      if(CUDA_VERSION VERSION_LESS "11.0")
+        list(APPEND CUDA_NVCC_FLAGS "--std=c++11")
+      else()
+        list(APPEND CUDA_NVCC_FLAGS "--std=c++14")
+      endif()
     endif()
+  endif()
 
-    CUDA_COMPILE(${VAR} ${ARGN})
+  CUDA_COMPILE(${VAR} ${ARGN})
 
-    foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
-      set(${var} "${${var}_backup_in_cuda_compile_}")
-      unset(${var}_backup_in_cuda_compile_)
-    endforeach()
-  endmacro()
-else()
-  unset(CUDA_ARCH_BIN CACHE)
-  unset(CUDA_ARCH_PTX CACHE)
-endif()
+  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
+    set(${var} "${${var}_backup_in_cuda_compile_}")
+    unset(${var}_backup_in_cuda_compile_)
+  endforeach()
+endmacro()
 
 if(HAVE_CUDA)
   set(CUDA_LIBS_PATH "")
@@ -516,36 +225,13 @@ if(HAVE_CUDA)
   endif()
 endif()
 
-
-# ----------------------------------------------------------------------------
-# Add CUDA libraries (needed for apps/tools, samples)
-# ----------------------------------------------------------------------------
 if(HAVE_CUDA)
-  # details: https://github.com/NVIDIA/nvidia-docker/issues/775
-  if(" ${CUDA_CUDA_LIBRARY}" MATCHES "/stubs/libcuda.so" AND NOT OPENCV_SKIP_CUDA_STUB_WORKAROUND)
-    set(CUDA_STUB_ENABLED_LINK_WORKAROUND 1)
-    if(EXISTS "${CUDA_CUDA_LIBRARY}" AND NOT OPENCV_SKIP_CUDA_STUB_WORKAROUND_RPATH_LINK)
-      set(CUDA_STUB_TARGET_PATH "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/")
-      execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${CUDA_CUDA_LIBRARY}" "${CUDA_STUB_TARGET_PATH}/libcuda.so.1"
-          RESULT_VARIABLE CUDA_STUB_SYMLINK_RESULT)
-      if(NOT CUDA_STUB_SYMLINK_RESULT EQUAL 0)
-        execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${CUDA_CUDA_LIBRARY}" "${CUDA_STUB_TARGET_PATH}/libcuda.so.1"
-          RESULT_VARIABLE CUDA_STUB_COPY_RESULT)
-        if(NOT CUDA_STUB_COPY_RESULT EQUAL 0)
-          set(CUDA_STUB_ENABLED_LINK_WORKAROUND 0)
-        endif()
-      endif()
-      if(CUDA_STUB_ENABLED_LINK_WORKAROUND)
-        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath-link,\"${CUDA_STUB_TARGET_PATH}\"")
-      endif()
-    else()
-      set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--allow-shlib-undefined")
-    endif()
-    if(NOT CUDA_STUB_ENABLED_LINK_WORKAROUND)
-      message(WARNING "CUDA: workaround for stubs/libcuda.so.1 is not applied")
-    endif()
-  endif()
+  ocv_apply_cuda_stub_workaround("${CUDA_CUDA_LIBRARY}")
+  ocv_check_cuda_delayed_load("${CUDA_TOOLKIT_ROOT_DIR}")
 
+  # ----------------------------------------------------------------------------
+  # Add CUDA libraries (needed for apps/tools, samples)
+  # ----------------------------------------------------------------------------
   set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
   if(HAVE_CUBLAS)
     set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cublas_LIBRARY})
@@ -563,19 +249,4 @@ if(HAVE_CUDA)
       set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CMAKE_LIBRARY_PATH_FLAG}${p})
     endif()
   endforeach()
-
-  if(MSVC AND CUDA_ENABLE_DELAYLOAD)
-    set(DELAYFLAGS "delayimp.lib")
-    file(GLOB CUDA_DLLS "${CUDA_TOOLKIT_ROOT_DIR}/bin/*.dll")
-    foreach(d ${CUDA_DLLS})
-      cmake_path(GET "d" FILENAME DLL_NAME)
-      if(NOT ${DLL_NAME} MATCHES "cudart")
-        set(DELAYFLAGS "${DELAYFLAGS} /DELAYLOAD:${DLL_NAME}")
-      endif()
-    endforeach()
-    set(DELAYFLAGS "${DELAYFLAGS} /DELAYLOAD:nvcuda.dll /DELAYLOAD:nvml.dll /IGNORE:4199")
-    set(CMAKE_EXE_LINKER_FLAGS       "${CMAKE_EXE_LINKER_FLAGS} ${DELAYFLAGS}")
-    set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${DELAYFLAGS}")
-    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${DELAYFLAGS}")
-  endif()
 endif()
diff --git a/cmake/OpenCVDetectCUDALanguage.cmake b/cmake/OpenCVDetectCUDALanguage.cmake
new file mode 100644
index 000000000000..0eeea77f2cf6
--- /dev/null
+++ b/cmake/OpenCVDetectCUDALanguage.cmake
@@ -0,0 +1,154 @@
+#######################
+# Previously in FindCUDA and still required for FindCUDNN
+macro(FIND_CUDA_HELPER_LIBS _name)
+  if(CMAKE_CROSSCOMPILING AND (ARM OR AARCH64))
+    set(_cuda_cross_arm_lib_dir "lib/stubs")
+  endif()
+  find_library(CUDA_${_name}_LIBRARY ${_name}
+    NAMES ${_name}
+    PATHS "${CUDAToolkit_LIBRARY_ROOT}"
+    PATH_SUFFIXES "lib/x64" "lib64" ${_cuda_cross_arm_lib_dir} "lib/Win32" "lib"
+    DOC "\"${_name}\" library"
+    )
+  mark_as_advanced(CUDA_${_name}_LIBRARY)
+endmacro()
+#######################
+include(cmake/OpenCVDetectCUDAUtils.cmake)
+
+if((WIN32 AND NOT MSVC) OR OPENCV_CMAKE_FORCE_CUDA)
+  message(STATUS "CUDA: Compilation is disabled (due to only Visual Studio compiler supported on your platform).")
+  return()
+endif()
+
+if((NOT UNIX AND CV_CLANG) OR OPENCV_CMAKE_FORCE_CUDA)
+  message(STATUS "CUDA: Compilation is disabled (due to Clang unsupported on your platform).")
+  return()
+endif()
+
+#set(OPENCV_CMAKE_CUDA_DEBUG 1)
+
+find_package(CUDAToolkit)
+if(CMAKE_CUDA_COMPILER AND CUDAToolkit_FOUND)
+  set(CUDA_FOUND TRUE)
+  set(CUDA_TOOLKIT_INCLUDE ${CUDAToolkit_INCLUDE_DIRS})
+  set(CUDA_VERSION_STRING ${CUDAToolkit_VERSION})
+  set(CUDA_VERSION ${CUDAToolkit_VERSION})
+  if(NOT CUDA_VERSION VERSION_LESS 11.0)
+      set(CMAKE_CUDA_STANDARD 14)
+  else()
+      set(CMAKE_CUDA_STANDARD 11)
+  endif()
+  if(UNIX AND NOT BUILD_SHARED_LIBS)
+      set(CUDA_LIB_EXT "_static")
+  endif()
+endif()
+
+if(NOT CUDA_FOUND)
+  unset(CUDA_ARCH_BIN CACHE)
+  unset(CUDA_ARCH_PTX CACHE)
+  return()
+endif()
+
+set(HAVE_CUDA 1)
+
+if(WITH_CUFFT)
+  set(HAVE_CUFFT 1)
+endif()
+
+if(WITH_CUBLAS)
+  set(HAVE_CUBLAS 1)
+endif()
+
+if(WITH_CUDNN)
+    set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
+    find_host_package(CUDNN "${MIN_VER_CUDNN}")
+    list(REMOVE_AT CMAKE_MODULE_PATH 0)
+
+    if(CUDNN_FOUND)
+      set(HAVE_CUDNN 1)
+    endif()
+endif()
+
+if(WITH_NVCUVID OR WITH_NVCUVENC)
+  ocv_check_for_nvidia_video_codec_sdk("${CUDAToolkit_LIBRARY_ROOT}")
+endif()
+
+ocv_check_for_cmake_cuda_architectures()
+ocv_set_cuda_detection_nvcc_flags(CMAKE_CUDA_HOST_COMPILER)
+ocv_set_cuda_arch_bin_and_ptx(${CUDAToolkit_NVCC_EXECUTABLE})
+
+# NVCC flags to be set
+set(NVCC_FLAGS_EXTRA "")
+
+# These vars will be passed into the templates
+set(OPENCV_CUDA_ARCH_BIN "")
+set(OPENCV_CUDA_ARCH_PTX "")
+set(OPENCV_CUDA_ARCH_FEATURES "")
+
+# Tell NVCC to add binaries for the specified GPUs
+string(REGEX MATCHALL "[0-9()]+" ARCH_LIST "${ARCH_BIN_NO_POINTS}")
+foreach(ARCH IN LISTS ARCH_LIST)
+  if(ARCH MATCHES "([0-9]+)\\(([0-9]+)\\)")
+    # User explicitly specified PTX for the concrete BIN
+    set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} ${CMAKE_MATCH_2}-virtual;${CMAKE_MATCH_1}-real;)
+    set(OPENCV_CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN} ${CMAKE_MATCH_1}")
+    set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${CMAKE_MATCH_2}")
+  else()
+    # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
+    set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} ${ARCH}-real;)
+    set(OPENCV_CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN} ${ARCH}")
+    set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
+  endif()
+endforeach()
+set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -D_FORCE_INLINES)
+
+# Tell NVCC to add PTX intermediate code for the specified architectures
+string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_NO_POINTS}")
+foreach(ARCH IN LISTS ARCH_LIST)
+  set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} ${ARCH}-virtual;)
+  set(OPENCV_CUDA_ARCH_PTX "${OPENCV_CUDA_ARCH_PTX} ${ARCH}")
+  set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
+endforeach()
+
+ocv_set_nvcc_threads_for_vs()
+
+# These vars will be processed in other scripts
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA})
+set(OpenCV_CUDA_CC "${CMAKE_CUDA_ARCHITECTURES}")
+
+if(ANDROID)
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xptxas;-dlcm=ca")
+endif()
+
+message(STATUS "CUDA: NVCC target flags ${CUDA_NVCC_FLAGS}")
+
+OCV_OPTION(CUDA_FAST_MATH "Enable --use_fast_math for CUDA compiler " OFF)
+
+if(CUDA_FAST_MATH)
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math)
+endif()
+
+OCV_OPTION(CUDA_ENABLE_DELAYLOAD "Enable delayed loading of CUDA DLLs" OFF VISIBLE_IF MSVC)
+
+mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR)
+
+macro(ocv_cuda_unfilter_options)
+  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
+    set(${var} "${${var}_backup_in_cuda_compile_}")
+    unset(${var}_backup_in_cuda_compile_)
+  endforeach()
+endmacro()
+
+macro(ocv_cuda_compile_flags)
+  ocv_cuda_filter_options()
+  ocv_nvcc_flags()
+  set(CMAKE_CXX_FLAGS_CUDA ${CMAKE_CXX_FLAGS})
+  set(CMAKE_CXX_FLAGS_RELEASE_CUDA ${CMAKE_CXX_FLAGS_RELEASE})
+  set(CMAKE_CXX_FLAGS_DEBUG_CUDA ${CMAKE_CXX_FLAGS_DEBUG})
+  ocv_cuda_unfilter_options()
+endmacro()
+
+if(HAVE_CUDA)
+  ocv_apply_cuda_stub_workaround("${CUDA_cuda_driver_LIBRARY}")
+  ocv_check_cuda_delayed_load("${cuda_toolkit_root_dir}")
+endif()
\ No newline at end of file
diff --git a/cmake/OpenCVDetectCUDAUtils.cmake b/cmake/OpenCVDetectCUDAUtils.cmake
new file mode 100644
index 000000000000..97676628abe9
--- /dev/null
+++ b/cmake/OpenCVDetectCUDAUtils.cmake
@@ -0,0 +1,442 @@
+macro(ocv_check_for_nvidia_video_codec_sdk cuda_toolkit_dirs)
+  macro(ocv_cuda_SEARCH_NVCUVID_HEADER _filename _result)
+    # place header file under CUDAToolkit_LIBRARY_ROOT
+    find_path(_header_result
+      ${_filename}
+      PATHS ${cuda_toolkit_dirs}
+      PATH_SUFFIXES include
+      NO_DEFAULT_PATH
+      )
+    if("x${_header_result}" STREQUAL "x_header_result-NOTFOUND")
+      set(${_result} 0)
+    else()
+      set(${_result} 1)
+    endif()
+    unset(_header_result CACHE)
+  endmacro()
+  if(WITH_NVCUVID)
+    ocv_cuda_SEARCH_NVCUVID_HEADER("nvcuvid.h" HAVE_NVCUVID_HEADER)
+    # make sure to have both header and library before enabling
+    if(${HAVE_NVCUVID_HEADER})
+      find_cuda_helper_libs(nvcuvid)
+      if(CUDA_nvcuvid_LIBRARY)
+        set(HAVE_NVCUVID 1)
+        message(STATUS "Found NVCUVID: ${CUDA_nvcuvid_LIBRARY}")
+      else()
+        if(WIN32)
+          message(STATUS "NVCUVID: Library not found, WITH_NVCUVID requires Nvidia decoding library nvcuvid.lib to either be inside ${cuda_toolkit_dirs}/lib or its location manually set with CUDA_nvcuvid_LIBRARY, i.e. CUDA_nvcuvid_LIBRARY=${cuda_toolkit_dirs}/lib/nvcuvid.lib")
+        else()
+          message(STATUS "NVCUVID: Library not found, WITH_NVCUVID requires the Nvidia decoding shared library nvcuvid.so from the driver installation or the location of the stub library to be manually set with CUDA_nvcuvid_LIBRARY i.e. CUDA_nvcuvid_LIBRARY=/home/user/Video_Codec_SDK_X.X.X/Lib/linux/stubs/x86_64/nvcuvid.so")
+        endif()
+      endif()
+    else()
+      message(STATUS "NVCUVID: Header not found, WITH_NVCUVID requires Nvidia decoding library header ${cuda_toolkit_dirs}/include/nvcuvid.h")
+    endif()
+  endif()
+
+  if(WITH_NVCUVENC)
+    ocv_cuda_SEARCH_NVCUVID_HEADER("nvEncodeAPI.h" HAVE_NVCUVENC_HEADER)
+    if(${HAVE_NVCUVENC_HEADER})
+      if(WIN32)
+        find_cuda_helper_libs(nvencodeapi)
+      else()
+        find_cuda_helper_libs(nvidia-encode)
+      endif()
+      if(CUDA_nvencodeapi_LIBRARY OR CUDA_nvidia-encode_LIBRARY)
+        set(HAVE_NVCUVENC 1)
+        message(STATUS "Found NVCUVENC: ${CUDA_nvencodeapi_LIBRARY} ${CUDA_nvidia-encode_LIBRARY}")
+      else()
+        if(WIN32)
+          message(STATUS "NVCUVENC: Library not found, WITH_NVCUVENC requires Nvidia encoding library nvencodeapi.lib to either be inside ${cuda_toolkit_dirs}/lib or its location manually set with CUDA_nvencodeapi_LIBRARY, i.e. CUDA_nvencodeapi_LIBRARY=${cuda_toolkit_dirs}/lib/nvencodeapi.lib")
+        else()
+          message(STATUS "NVCUVENC: Library not found, WITH_NVCUVENC requires the Nvidia encoding shared library libnvidia-encode.so from the driver installation or the location of the stub library to be manually set with CUDA_nvidia-encode_LIBRARY i.e. CUDA_nvidia-encode_LIBRARY=/home/user/Video_Codec_SDK_X.X.X/Lib/linux/stubs/x86_64/libnvidia-encode.so")
+        endif()
+      endif()
+    else()
+      message(STATUS "NVCUVENC: Header not found, WITH_NVCUVENC requires Nvidia encoding library header ${cuda_toolkit_dirs}/include/nvEncodeAPI.h")
+    endif()
+  endif()
+endmacro()
+
+# Use CMAKE_CUDA_ARCHITECTURES if provided: order of preference CMAKE_CUDA_ARCHITECTURES > CUDA_GENERATION > CUDA_ARCH_BIN and/or CUDA_ARCH_PTX
+function(ocv_check_for_cmake_cuda_architectures)
+  if(NOT CMAKE_CUDA_ARCHITECTURES)
+    return()
+  endif()
+  if(CMAKE_CUDA_ARCHITECTURES STREQUAL "all" OR CMAKE_CUDA_ARCHITECTURES STREQUAL "all-major" OR CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
+    message(WARNING "CUDA: CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}, special values all, all-major and native are not supported by OpenCV, specify only CUDA real and/or virtual architectures or use combinations of CUDA_ARCH_BIN and CUDA_ARCH_PTX or specify the CUDA_GENERATION where -DCUDA_GENERATION=Auto is equivalent to native!")
+    return()
+  endif()
+  set(internal_ptx "")
+  set(internal_bin "")
+  foreach(ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
+    if(ARCH MATCHES "([0-9]+)\-real")
+      set(internal_bin ${internal_bin} ${CMAKE_MATCH_1};)
+    elseif(ARCH MATCHES "([0-9]+)\-virtual")
+      set(internal_ptx ${internal_ptx} ${CMAKE_MATCH_1};)
+    elseif(ARCH MATCHES "([0-9]+)")
+      set(internal_bin ${internal_bin} ${CMAKE_MATCH_1};)
+      set(internal_ptx ${internal_ptx} ${CMAKE_MATCH_1};)
+    endif()
+  endforeach()
+  if(internal_bin OR internal_ptx)
+    unset(CUDA_ARCH_BIN CACHE)
+    unset(CUDA_ARCH_PTX CACHE)
+  endif()
+  if(internal_ptx)
+    set(CUDA_ARCH_PTX ${internal_ptx} CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for (see https://docs.opencv.org/4.x/d2/dbc/cuda_intro.html)")
+  endif()
+  if(internal_bin)
+    set(CUDA_ARCH_BIN ${internal_bin} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported (see https://docs.opencv.org/4.x/d2/dbc/cuda_intro.html)")
+  endif()
+  set(CMAKE_CUDA_ARCHITECTURES "" PARENT)
+  unset(CUDA_GENERATION CACHE)
+endfunction()
+
+macro(ocv_initialize_nvidia_device_generations)
+  OCV_OPTION(CUDA_ENABLE_DEPRECATED_GENERATION "Enable deprecated generations in the list" OFF)
+  set(_generations "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "Lovelace" "Hopper")
+  if(CUDA_ENABLE_DEPRECATED_GENERATION)
+    set(_generations "Fermi" "${_generations}")
+    set(_generations "Kepler" "${_generations}")
+  endif()
+  set(_arch_fermi    "2.0")
+  set(_arch_kepler   "3.0;3.5;3.7")
+  set(_arch_maxwell  "5.0;5.2")
+  set(_arch_pascal   "6.0;6.1")
+  set(_arch_volta    "7.0")
+  set(_arch_turing   "7.5")
+  set(_arch_ampere   "8.0;8.6")
+  set(_arch_lovelace "8.9")
+  set(_arch_hopper   "9.0")
+  if(NOT CMAKE_CROSSCOMPILING)
+    list(APPEND _generations "Auto")
+  endif()
+  set(CUDA_GENERATION "" CACHE STRING "Build CUDA device code only for specific GPU architecture. Leave empty to build for all architectures (see https://docs.opencv.org/4.x/d2/dbc/cuda_intro.html).")
+  if( CMAKE_VERSION VERSION_GREATER "2.8" )
+    set_property( CACHE CUDA_GENERATION PROPERTY STRINGS "" ${_generations} )
+  endif()
+
+  if(CUDA_GENERATION)
+    if(NOT ";${_generations};" MATCHES ";${CUDA_GENERATION};")
+      string(REPLACE ";" ", " _generations "${_generations}")
+      message(FATAL_ERROR "ERROR: ${_generations} Generations are supported.")
+    endif()
+    unset(CUDA_ARCH_BIN CACHE)
+    unset(CUDA_ARCH_PTX CACHE)
+  endif()
+endmacro()
+
+macro(ocv_set_cuda_detection_nvcc_flags cuda_host_compiler_var)
+  if(OPENCV_CUDA_DETECTION_NVCC_FLAGS MATCHES "-ccbin")
+  # already specified by user
+  elseif(${cuda_host_compiler_var} AND EXISTS "${${cuda_host_compiler_var}}")
+    get_filename_component(c_compiler_realpath "${CMAKE_C_COMPILER}" REALPATH)
+    # C compiler doesn't work with --run option, forcing C++ compiler instead
+    if(${cuda_host_compiler_var} STREQUAL c_compiler_realpath OR ${cuda_host_compiler_var} STREQUAL CMAKE_C_COMPILER)
+      if(DEFINED CMAKE_CXX_COMPILER)
+        get_filename_component(cxx_compiler_realpath "${CMAKE_CXX_COMPILER}" REALPATH)
+        LIST(APPEND OPENCV_CUDA_DETECTION_NVCC_FLAGS -ccbin "${cxx_compiler_realpath}")
+      else()
+        message(STATUS "CUDA: CMAKE_CXX_COMPILER is not available. You may need to specify ${cuda_host_compiler_var}.")
+      endif()
+    else()
+      LIST(APPEND OPENCV_CUDA_DETECTION_NVCC_FLAGS -ccbin "${${cuda_host_compiler_var}}")
+    endif()
+  elseif(WIN32 AND CMAKE_LINKER) # Workaround for VS cl.exe not being in the env. path
+    get_filename_component(host_compiler_bindir ${CMAKE_LINKER} DIRECTORY)
+    LIST(APPEND OPENCV_CUDA_DETECTION_NVCC_FLAGS -ccbin "${host_compiler_bindir}")
+  else()
+    if(${cuda_host_compiler_var})
+      message(STATUS "CUDA: ${cuda_host_compiler_var}='${cuda_host_compiler}' is not valid, autodetection may not work. Specify OPENCV_CUDA_DETECTION_NVCC_FLAGS with -ccbin option for fix that")
+    endif()
+  endif()
+endmacro()
+
+macro(ocv_filter_available_architecture nvcc_executable result_list)
+  set(__cache_key_check "${ARGN} : ${nvcc_executable} ${OPENCV_CUDA_DETECTION_NVCC_FLAGS}")
+  if(DEFINED OPENCV_CACHE_CUDA_SUPPORTED_CC AND OPENCV_CACHE_CUDA_SUPPORTED_CC_check STREQUAL __cache_key_check)
+    set(${result_list} "${OPENCV_CACHE_CUDA_SUPPORTED_CC}")
+  else()
+    set(CC_LIST ${ARGN})
+    foreach(target_arch ${CC_LIST})
+      string(REPLACE "." "" target_arch_short "${target_arch}")
+      set(NVCC_OPTION "-gencode;arch=compute_${target_arch_short},code=sm_${target_arch_short}")
+      set(_cmd "${nvcc_executable}" ${OPENCV_CUDA_DETECTION_NVCC_FLAGS} ${NVCC_OPTION} "${OpenCV_SOURCE_DIR}/cmake/checks/OpenCVDetectCudaArch.cu" --compile)
+      execute_process(
+          COMMAND ${_cmd}
+          WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/"
+          RESULT_VARIABLE _nvcc_res
+          OUTPUT_VARIABLE _nvcc_out
+          ERROR_VARIABLE _nvcc_err
+          #ERROR_QUIET
+          OUTPUT_STRIP_TRAILING_WHITESPACE
+      )
+      if(OPENCV_CMAKE_CUDA_DEBUG)
+        message(WARNING "COMMAND: ${_cmd}")
+        message(STATUS "Result: ${_nvcc_res}")
+        message(STATUS "Out: ${_nvcc_out}")
+        message(STATUS "Err: ${_nvcc_err}")
+      endif()
+      if(_nvcc_res EQUAL 0)
+        LIST(APPEND ${result_list} "${target_arch}")
+      endif()
+    endforeach()
+    string(STRIP "${${result_list}}" ${result_list})
+    if(" ${${result_list}}" STREQUAL " ")
+      message(WARNING "CUDA: Autodetection arch list is empty. Please enable OPENCV_CMAKE_CUDA_DEBUG=1 and check/specify OPENCV_CUDA_DETECTION_NVCC_FLAGS variable")
+    endif()
+
+    # cache detected values
+    set(OPENCV_CACHE_CUDA_SUPPORTED_CC ${${result_list}} CACHE INTERNAL "")
+    set(OPENCV_CACHE_CUDA_SUPPORTED_CC_check "${__cache_key_check}" CACHE INTERNAL "")
+  endif()
+endmacro()
+
+macro(ocv_detect_native_cuda_arch nvcc_executable status output)
+  set(OPENCV_CUDA_DETECT_ARCHS_COMMAND "${nvcc_executable}" ${OPENCV_CUDA_DETECTION_NVCC_FLAGS} "${OpenCV_SOURCE_DIR}/cmake/checks/OpenCVDetectCudaArch.cu" "--run")
+  set(__cache_key_check "${OPENCV_CUDA_DETECT_ARCHS_COMMAND}")
+  if(DEFINED OPENCV_CACHE_CUDA_ACTIVE_CC AND OPENCV_CACHE_CUDA_ACTIVE_CC_check STREQUAL __cache_key_check)
+    set(${output} "${OPENCV_CACHE_CUDA_ACTIVE_CC}")
+    set(${status} 0)
+  else()
+    execute_process(
+        COMMAND ${OPENCV_CUDA_DETECT_ARCHS_COMMAND}
+        WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/"
+        RESULT_VARIABLE ${status}
+        OUTPUT_VARIABLE _nvcc_out
+        ERROR_VARIABLE _nvcc_err
+        ERROR_QUIET
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if(OPENCV_CMAKE_CUDA_DEBUG)
+      message(WARNING "COMMAND: ${OPENCV_CUDA_DETECT_ARCHS_COMMAND}")
+      message(STATUS "Result: ${${status}}")
+      message(STATUS "Out: ${_nvcc_out}")
+      message(STATUS "Err: ${_nvcc_err}")
+    endif()
+    string(REGEX REPLACE ".*\n" "" ${output} "${_nvcc_out}") #Strip leading warning messages, if any
+
+    if(${status} EQUAL 0)
+      # cache detected values
+      set(OPENCV_CACHE_CUDA_ACTIVE_CC ${${output}} CACHE INTERNAL "")
+      set(OPENCV_CACHE_CUDA_ACTIVE_CC_check "${__cache_key_check}" CACHE INTERNAL "")
+    endif()
+  endif()
+endmacro()
+
+macro(ocv_set_cuda_arch_bin_and_ptx nvcc_executable)
+  ocv_initialize_nvidia_device_generations()
+  set(__cuda_arch_ptx ${CUDA_ARCH_PTX})
+  if(CUDA_GENERATION STREQUAL "Fermi")
+    set(__cuda_arch_bin ${_arch_fermi})
+  elseif(CUDA_GENERATION STREQUAL "Kepler")
+    set(__cuda_arch_bin ${_arch_kepler})
+  elseif(CUDA_GENERATION STREQUAL "Maxwell")
+    set(__cuda_arch_bin ${_arch_maxwell})
+  elseif(CUDA_GENERATION STREQUAL "Pascal")
+    set(__cuda_arch_bin ${_arch_pascal})
+  elseif(CUDA_GENERATION STREQUAL "Volta")
+    set(__cuda_arch_bin ${_arch_volta})
+  elseif(CUDA_GENERATION STREQUAL "Turing")
+    set(__cuda_arch_bin ${_arch_turing})
+  elseif(CUDA_GENERATION STREQUAL "Ampere")
+    set(__cuda_arch_bin ${_arch_ampere})
+  elseif(CUDA_GENERATION STREQUAL "Lovelace")
+    set(__cuda_arch_bin ${_arch_lovelace})
+  elseif(CUDA_GENERATION STREQUAL "Hopper")
+    set(__cuda_arch_bin ${_arch_hopper})
+  elseif(CUDA_GENERATION STREQUAL "Auto")
+    ocv_detect_native_cuda_arch(${nvcc_executable} _nvcc_res _nvcc_out)
+    if(NOT _nvcc_res EQUAL 0)
+      message(STATUS "CUDA: Automatic detection of CUDA generation failed. Going to build for all known architectures")
+    else()
+      string(REGEX MATCHALL "[0-9]+\\.[0-9]" __cuda_arch_bin "${_nvcc_out}")
+    endif()
+  elseif(CUDA_ARCH_BIN)
+    message(STATUS "CUDA: Using CUDA_ARCH_BIN=${CUDA_ARCH_BIN}")
+    set(__cuda_arch_bin ${CUDA_ARCH_BIN})
+  endif()
+
+  if(NOT DEFINED __cuda_arch_bin AND NOT DEFINED __cuda_arch_ptx)
+    if(ARM)
+      set(__cuda_arch_bin "3.2")
+      set(__cuda_arch_ptx "")
+    elseif(AARCH64)
+      if(NOT CMAKE_CROSSCOMPILING)
+        ocv_detect_native_cuda_arch(${nvcc_executable} _nvcc_res _nvcc_out)
+      else()
+        set(_nvcc_res -1)  # emulate error, see below
+      endif()
+      if(NOT _nvcc_res EQUAL 0)
+        message(STATUS "CUDA: Automatic detection of CUDA generation failed. Going to build for all known architectures")
+        # TX1 (5.3) TX2 (6.2) Xavier (7.2) V100 (7.0) Orin (8.7)
+        ocv_filter_available_architecture(${nvcc_executable} __cuda_arch_bin
+            5.3
+            6.2
+            7.2
+            7.0
+            8.7
+        )
+      else()
+        set(__cuda_arch_bin "${_nvcc_out}")
+      endif()
+      set(__cuda_arch_ptx "")
+    else()
+      ocv_filter_available_architecture(${nvcc_executable} __cuda_arch_bin
+          ${_arch_fermi}
+          ${_arch_kepler}
+          ${_arch_maxwell}
+          ${_arch_pascal}
+          ${_arch_volta}
+          ${_arch_turing}
+          ${_arch_ampere}
+          ${_arch_lovelace}
+          ${_arch_hopper}
+      )
+      list(GET __cuda_arch_bin -1 __cuda_arch_ptx)
+    endif()
+  endif()
+
+  set(CUDA_ARCH_BIN ${__cuda_arch_bin} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported (see https://docs.opencv.org/4.x/d2/dbc/cuda_intro.html)")
+  set(CUDA_ARCH_PTX ${__cuda_arch_ptx} CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for (see https://docs.opencv.org/4.x/d2/dbc/cuda_intro.html)")
+  string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
+  string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}")
+
+  # Check if user specified 1.0/2.1 compute capability: we don't support it
+  macro(ocv_wipeout_deprecated_cc target_cc)
+    if(" ${CUDA_ARCH_BIN} ${CUDA_ARCH_PTX}" MATCHES " ${target_cc}")
+      message(SEND_ERROR "CUDA: ${target_cc} compute capability is not supported - exclude it from ARCH/PTX list and re-run CMake")
+    endif()
+  endmacro()
+  ocv_wipeout_deprecated_cc("1.0")
+  ocv_wipeout_deprecated_cc("2.1")
+endmacro()
+
+macro(ocv_set_nvcc_threads_for_vs)
+  # Tell NVCC the maximum number of threads to be used to execute the compilation steps in parallel
+  # (option --threads was introduced in version 11.2)
+  if(NOT CUDA_VERSION VERSION_LESS "11.2")
+    if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT $ENV{CMAKE_BUILD_PARALLEL_LEVEL} STREQUAL "")
+      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "--threads=$ENV{CMAKE_BUILD_PARALLEL_LEVEL}")
+    endif()
+  endif()
+endmacro()
+
+macro(ocv_cuda_filter_options)
+  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
+    set(${var}_backup_in_cuda_compile_ "${${var}}")
+
+    if (CV_CLANG)
+      # we remove -Winconsistent-missing-override and -Qunused-arguments
+      # just in case we are compiling CUDA with gcc but OpenCV with clang
+      string(REPLACE "-Winconsistent-missing-override" "" ${var} "${${var}}")
+      string(REPLACE "-Qunused-arguments" "" ${var} "${${var}}")
+    endif()
+
+    # we remove /EHa as it generates warnings under windows
+    string(REPLACE "/EHa" "" ${var} "${${var}}")
+
+    # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
+    string(REPLACE "-ggdb3" "" ${var} "${${var}}")
+
+    # we remove -Wsign-promo as it generates warnings under linux
+    string(REPLACE "-Wsign-promo" "" ${var} "${${var}}")
+
+    # we remove -Wno-sign-promo as it generates warnings under linux
+    string(REPLACE "-Wno-sign-promo" "" ${var} "${${var}}")
+
+    # we remove -Wno-delete-non-virtual-dtor because it's used for C++ compiler
+    # but NVCC uses C compiler by default
+    string(REPLACE "-Wno-delete-non-virtual-dtor" "" ${var} "${${var}}")
+
+    # we remove -frtti because it's used for C++ compiler
+    # but NVCC uses C compiler by default
+    string(REPLACE "-frtti" "" ${var} "${${var}}")
+
+    string(REPLACE "-fvisibility-inlines-hidden" "" ${var} "${${var}}")
+
+    # cc1: warning: command line option '-Wsuggest-override' is valid for C++/ObjC++ but not for C
+    string(REPLACE "-Wsuggest-override" "" ${var} "${${var}}")
+
+    # issue: #11552 (from OpenCVCompilerOptions.cmake)
+    string(REGEX REPLACE "-Wimplicit-fallthrough(=[0-9]+)? " "" ${var} "${${var}}")
+
+    # removal of custom specified options
+    if(OPENCV_CUDA_NVCC_FILTEROUT_OPTIONS)
+      foreach(__flag ${OPENCV_CUDA_NVCC_FILTEROUT_OPTIONS})
+        string(REPLACE "${__flag}" "" ${var} "${${var}}")
+      endforeach()
+    endif()
+  endforeach()
+endmacro()
+
+macro(ocv_nvcc_flags)
+  if(BUILD_SHARED_LIBS)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler=-DCVAPI_EXPORTS)
+  endif()
+
+  if(UNIX OR APPLE)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler=-fPIC)
+  endif()
+  if(APPLE)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler=-fno-finite-math-only)
+  endif()
+
+  if(WIN32 AND NOT (CUDA_VERSION VERSION_LESS "11.2"))
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcudafe --display_error_number --diag-suppress 1394,1388)
+  endif()
+
+  if(CMAKE_CROSSCOMPILING AND (ARM OR AARCH64))
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xlinker --unresolved-symbols=ignore-in-shared-libs)
+  endif()
+
+  # disabled because of multiple warnings during building nvcc auto generated files
+  if(CV_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.6.0")
+    ocv_warnings_disable(CMAKE_CXX_FLAGS -Wunused-but-set-variable)
+  endif()
+endmacro()
+
+macro(ocv_apply_cuda_stub_workaround cuda_driver_library_path)
+  # details: https://github.com/NVIDIA/nvidia-docker/issues/775
+  if(" ${cuda_driver_library_path}" MATCHES "/stubs/libcuda.so" AND NOT OPENCV_SKIP_CUDA_STUB_WORKAROUND)
+    set(CUDA_STUB_ENABLED_LINK_WORKAROUND 1)
+    if(EXISTS "${cuda_driver_library_path}" AND NOT OPENCV_SKIP_CUDA_STUB_WORKAROUND_RPATH_LINK)
+      set(CUDA_STUB_TARGET_PATH "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/")
+      execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${cuda_driver_library_path}" "${CUDA_STUB_TARGET_PATH}/libcuda.so.1"
+          RESULT_VARIABLE CUDA_STUB_SYMLINK_RESULT)
+      if(NOT CUDA_STUB_SYMLINK_RESULT EQUAL 0)
+        execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${cuda_driver_library_path}" "${CUDA_STUB_TARGET_PATH}/libcuda.so.1"
+          RESULT_VARIABLE CUDA_STUB_COPY_RESULT)
+        if(NOT CUDA_STUB_COPY_RESULT EQUAL 0)
+          set(CUDA_STUB_ENABLED_LINK_WORKAROUND 0)
+        endif()
+      endif()
+      if(CUDA_STUB_ENABLED_LINK_WORKAROUND)
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath-link,\"${CUDA_STUB_TARGET_PATH}\"")
+      endif()
+    else()
+      set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--allow-shlib-undefined")
+    endif()
+    if(NOT CUDA_STUB_ENABLED_LINK_WORKAROUND)
+      message(WARNING "CUDA: Workaround for stubs/libcuda.so.1 is not applied")
+    endif()
+  endif()
+endmacro()
+
+macro(ocv_check_cuda_delayed_load cuda_toolkit_root_dir)
+  if(MSVC AND CUDA_ENABLE_DELAYLOAD)
+    set(DELAYFLAGS "delayimp.lib")
+    file(GLOB CUDA_DLLS "${cuda_toolkit_root_dir}/bin/*.dll")
+    foreach(d ${CUDA_DLLS})
+      cmake_path(GET "d" FILENAME DLL_NAME)
+      if(NOT ${DLL_NAME} MATCHES "cudart")
+        set(DELAYFLAGS "${DELAYFLAGS} /DELAYLOAD:${DLL_NAME}")
+      endif()
+    endforeach()
+    set(DELAYFLAGS "${DELAYFLAGS} /DELAYLOAD:nvcuda.dll /DELAYLOAD:nvml.dll /IGNORE:4199")
+    set(CMAKE_EXE_LINKER_FLAGS       "${CMAKE_EXE_LINKER_FLAGS} ${DELAYFLAGS}")
+    set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${DELAYFLAGS}")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${DELAYFLAGS}")
+  endif()
+endmacro()
diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake
index 8fe89b3fe0c3..448afd46eafb 100644
--- a/cmake/OpenCVDetectCXXCompiler.cmake
+++ b/cmake/OpenCVDetectCXXCompiler.cmake
@@ -28,22 +28,8 @@ if(NOT DEFINED CV_GCC AND CMAKE_CXX_COMPILER_ID MATCHES "GNU")
 endif()
 if(NOT DEFINED CV_CLANG AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")  # Clang or AppleClang (see CMP0025)
   set(CV_CLANG 1)
-  set(CMAKE_COMPILER_IS_CLANGCXX 1)  # TODO next release: remove this
-  set(CMAKE_COMPILER_IS_CLANGCC 1)   # TODO next release: remove this
 endif()
 
-function(access_CMAKE_COMPILER_IS_CLANGCXX)
-  if(NOT OPENCV_SUPPRESS_DEPRECATIONS)
-    message(WARNING "DEPRECATED: CMAKE_COMPILER_IS_CLANGCXX support is deprecated in OpenCV.
-    Consider using:
-    - CV_GCC    # GCC
-    - CV_CLANG  # Clang or AppleClang (see CMP0025)
-")
-  endif()
-endfunction()
-variable_watch(CMAKE_COMPILER_IS_CLANGCXX access_CMAKE_COMPILER_IS_CLANGCXX)
-variable_watch(CMAKE_COMPILER_IS_CLANGCC access_CMAKE_COMPILER_IS_CLANGCXX)
-
 
 # ----------------------------------------------------------------------------
 # Detect Intel ICC compiler
@@ -68,6 +54,23 @@ if(MSVC AND CMAKE_C_COMPILER MATCHES "icc|icl")
   set(CV_ICC   __INTEL_COMPILER_FOR_WINDOWS)
 endif()
 
+# ----------------------------------------------------------------------------
+# Detect Intel ICXC compiler
+# ----------------------------------------------------------------------------
+if(UNIX)
+  if(__INTEL_COMPILER)
+    set(CV_ICX   __INTEL_LLVM_COMPILER)
+  elseif(CMAKE_C_COMPILER MATCHES "icx")
+    set(CV_ICX   icx_matches_c_compiler)
+  elseif(CMAKE_CXX_COMPILER MATCHES "icpx")
+    set(CV_ICX   icpx_matches_cxx_compiler)
+  endif()
+endif()
+
+if(MSVC AND CMAKE_CXX_COMPILER MATCHES ".*(dpcpp-cl|dpcpp|icx-cl|icpx|icx)(.exe)?$")
+  set(CV_ICX   __INTEL_LLVM_COMPILER_WINDOWS)
+endif()
+
 if(NOT DEFINED CMAKE_CXX_COMPILER_VERSION
     AND NOT OPENCV_SUPPRESS_MESSAGE_MISSING_COMPILER_VERSION)
   message(WARNING "OpenCV: Compiler version is not available: CMAKE_CXX_COMPILER_VERSION is not set")
@@ -173,7 +176,7 @@ elseif(MSVC)
     set(OpenCV_RUNTIME vc15)
   elseif(MSVC_VERSION MATCHES "^192[0-9]$")
     set(OpenCV_RUNTIME vc16)
-  elseif(MSVC_VERSION MATCHES "^193[0-9]$")
+  elseif(MSVC_VERSION MATCHES "^19[34][0-9]$")
     set(OpenCV_RUNTIME vc17)
   else()
     message(WARNING "OpenCV does not recognize MSVC_VERSION \"${MSVC_VERSION}\". Cannot set OpenCV_RUNTIME")
diff --git a/cmake/OpenCVDetectDirectML.cmake b/cmake/OpenCVDetectDirectML.cmake
new file mode 100644
index 000000000000..0fc71eca03ad
--- /dev/null
+++ b/cmake/OpenCVDetectDirectML.cmake
@@ -0,0 +1,13 @@
+if(WIN32)
+  try_compile(__VALID_DIRECTML
+    "${OpenCV_BINARY_DIR}"
+    "${OpenCV_SOURCE_DIR}/cmake/checks/directml.cpp"
+    LINK_LIBRARIES d3d12 dxcore directml
+    OUTPUT_VARIABLE TRY_OUT
+  )
+  if(NOT __VALID_DIRECTML)
+    message(STATUS "No support for DirectML (d3d12, dxcore, directml libs are required)")
+    return()
+  endif()
+  set(HAVE_DIRECTML ON)
+endif()
diff --git a/cmake/OpenCVDetectInferenceEngine.cmake b/cmake/OpenCVDetectInferenceEngine.cmake
index 319fd5bf0ad2..9a2eb38b0394 100644
--- a/cmake/OpenCVDetectInferenceEngine.cmake
+++ b/cmake/OpenCVDetectInferenceEngine.cmake
@@ -13,67 +13,3 @@ if(WITH_OPENVINO)
     return()
   endif()
 endif()
-
-# ======================
-
-if(WITH_OPENVINO)
-  find_package(OpenVINO QUIET)
-  if(OpenVINO_FOUND)
-    message(STATUS "OpenVINO FOUND: ${OpenVINO_VERSION}")
-    math(EXPR ver "${OpenVINO_VERSION_MAJOR} * 1000000 + ${OpenVINO_VERSION_MINOR} * 10000 + ${OpenVINO_VERSION_PATCH} * 100")
-    ocv_add_external_target(openvino "" "openvino::runtime" "INF_ENGINE_RELEASE=${ver};HAVE_NGRAPH;HAVE_DNN_NGRAPH;HAVE_INF_ENGINE")
-    set(HAVE_OPENVINO 1)
-    return()
-  endif()
-endif()
-
-# ======================
-
-find_package(InferenceEngine QUIET)
-if(InferenceEngine_FOUND)
-  set(INF_ENGINE_TARGET ${InferenceEngine_LIBRARIES})
-  set(INF_ENGINE_VERSION "${InferenceEngine_VERSION}")
-  message(STATUS "Detected InferenceEngine: cmake package (${InferenceEngine_VERSION})")
-endif()
-
-if(DEFINED InferenceEngine_VERSION)
-  message(STATUS "InferenceEngine: ${InferenceEngine_VERSION}")
-  if(NOT INF_ENGINE_RELEASE AND NOT (InferenceEngine_VERSION VERSION_LESS "2021.4"))
-    math(EXPR INF_ENGINE_RELEASE_INIT "${InferenceEngine_VERSION_MAJOR} * 1000000 + ${InferenceEngine_VERSION_MINOR} * 10000 + ${InferenceEngine_VERSION_PATCH} * 100")
-  endif()
-endif()
-if(NOT INF_ENGINE_RELEASE AND NOT INF_ENGINE_RELEASE_INIT)
-  message(STATUS "WARNING: InferenceEngine version has not been set, 2021.4.2 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
-  set(INF_ENGINE_RELEASE_INIT "2021040200")
-elseif(DEFINED INF_ENGINE_RELEASE)
-  set(INF_ENGINE_RELEASE_INIT "${INF_ENGINE_RELEASE}")
-endif()
-set(INF_ENGINE_RELEASE "${INF_ENGINE_RELEASE_INIT}" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2020.1.0.2 -> 2020010002)")
-
-set(tgts)
-set(defs)
-
-# Add more features to the target
-if(INF_ENGINE_TARGET)
-  set_target_properties(${INF_ENGINE_TARGET} PROPERTIES
-      INTERFACE_COMPILE_DEFINITIONS "HAVE_INF_ENGINE=1;INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}"
-  )
-  list(APPEND tgts ${INF_ENGINE_TARGET})
-  list(APPEND defs "INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}" "HAVE_INF_ENGINE")
-endif()
-
-if(WITH_NGRAPH OR NOT DEFINED WITH_NGRAPH)
-  find_package(ngraph QUIET)
-  if(ngraph_FOUND)
-    ocv_assert(TARGET ngraph::ngraph)
-    if(INF_ENGINE_RELEASE VERSION_LESS "2019039999")
-      message(WARNING "nGraph is not tested with current InferenceEngine version: INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}")
-    endif()
-    message(STATUS "Detected ngraph: cmake package (${ngraph_VERSION})")
-    set(HAVE_NGRAPH ON)
-    list(APPEND tgts ngraph::ngraph)
-    list(APPEND defs "HAVE_NGRAPH" "HAVE_DNN_NGRAPH")
-  endif()
-endif()
-
-ocv_add_external_target(openvino "" "${tgts}" "${defs}")
diff --git a/cmake/OpenCVDetectPython.cmake b/cmake/OpenCVDetectPython.cmake
index c93eb9f9a7e0..a23fba6e5a4f 100644
--- a/cmake/OpenCVDetectPython.cmake
+++ b/cmake/OpenCVDetectPython.cmake
@@ -175,7 +175,7 @@ if(NOT ${found})
       endif()
     endif()
 
-    if(NOT ANDROID AND NOT IOS)
+    if(NOT ANDROID AND NOT IOS AND NOT XROS)
       if(CMAKE_HOST_UNIX)
         execute_process(COMMAND ${_executable} -c "from sysconfig import *; print(get_path('purelib'))"
                         RESULT_VARIABLE _cvpy_process
@@ -216,7 +216,7 @@ if(NOT ${found})
           message(STATUS "  PYTHON3_NUMPY_INCLUDE_DIRS")
         else()
           # Attempt to discover the NumPy include directory. If this succeeds, then build python API with NumPy
-          execute_process(COMMAND "${_executable}" -c "import os; os.environ['DISTUTILS_USE_SDK']='1'; import numpy.distutils; print(os.pathsep.join(numpy.distutils.misc_util.get_numpy_include_dirs()))"
+          execute_process(COMMAND "${_executable}" -c "import numpy; print(numpy.get_include())"
                           RESULT_VARIABLE _numpy_process
                           OUTPUT_VARIABLE _numpy_include_dirs
                           OUTPUT_STRIP_TRAILING_WHITESPACE)
@@ -240,7 +240,7 @@ if(NOT ${found})
                           OUTPUT_STRIP_TRAILING_WHITESPACE)
         endif()
       endif()
-    endif(NOT ANDROID AND NOT IOS)
+    endif(NOT ANDROID AND NOT IOS AND NOT XROS)
   endif()
 
   # Export return values
@@ -258,7 +258,7 @@ if(NOT ${found})
   set(${include_path} "${_include_path}" CACHE INTERNAL "")
   set(${include_dir} "${_include_dir}" CACHE PATH "Python include dir")
   set(${include_dir2} "${_include_dir2}" CACHE PATH "Python include dir 2")
-  set(${packages_path} "${_packages_path}" CACHE PATH "Where to install the python packages.")
+  set(${packages_path} "${_packages_path}" CACHE STRING "Where to install the python packages.")
   set(${numpy_include_dirs} ${_numpy_include_dirs} CACHE PATH "Path to numpy headers")
   set(${numpy_version} "${_numpy_version}" CACHE INTERNAL "")
 endif()
@@ -268,13 +268,19 @@ if(OPENCV_PYTHON_SKIP_DETECTION)
   return()
 endif()
 
-find_python("" "${MIN_VER_PYTHON2}" PYTHON2_LIBRARY PYTHON2_INCLUDE_DIR
+ocv_check_environment_variables(OPENCV_ENABLE_PYTHON2)
+ocv_check_environment_variables(PYTHON2_EXECUTABLE)
+if((OPENCV_ENABLE_PYTHON2 OR PYTHON2_EXECUTABLE OR BUILD_opencv_python2)
+    AND NOT OPENCV_PYTHON2_SKIP_DETECTION
+)
+  find_python("" "${MIN_VER_PYTHON2}" PYTHON2_LIBRARY PYTHON2_INCLUDE_DIR
     PYTHON2INTERP_FOUND PYTHON2_EXECUTABLE PYTHON2_VERSION_STRING
     PYTHON2_VERSION_MAJOR PYTHON2_VERSION_MINOR PYTHON2LIBS_FOUND
     PYTHON2LIBS_VERSION_STRING PYTHON2_LIBRARIES PYTHON2_LIBRARY
     PYTHON2_DEBUG_LIBRARIES PYTHON2_LIBRARY_DEBUG PYTHON2_INCLUDE_PATH
     PYTHON2_INCLUDE_DIR PYTHON2_INCLUDE_DIR2 PYTHON2_PACKAGES_PATH
     PYTHON2_NUMPY_INCLUDE_DIRS PYTHON2_NUMPY_VERSION)
+endif()
 
 option(OPENCV_PYTHON3_VERSION "Python3 version" "")
 find_python("${OPENCV_PYTHON3_VERSION}" "${MIN_VER_PYTHON3}" PYTHON3_LIBRARY PYTHON3_INCLUDE_DIR
@@ -285,6 +291,17 @@ find_python("${OPENCV_PYTHON3_VERSION}" "${MIN_VER_PYTHON3}" PYTHON3_LIBRARY PYT
     PYTHON3_INCLUDE_DIR PYTHON3_INCLUDE_DIR2 PYTHON3_PACKAGES_PATH
     PYTHON3_NUMPY_INCLUDE_DIRS PYTHON3_NUMPY_VERSION)
 
+# Problem in numpy >=1.15 <1.17
+OCV_OPTION(PYTHON3_LIMITED_API "Build with Python Limited API (not available with numpy >=1.15 <1.17)" NO
+           VISIBLE_IF PYTHON3_NUMPY_VERSION VERSION_LESS "1.15" OR NOT PYTHON3_NUMPY_VERSION VERSION_LESS "1.17")
+if(PYTHON3_LIMITED_API)
+  set(_default_ver "0x03060000")
+  if(PYTHON3_VERSION_STRING VERSION_LESS "3.6")
+    # fix for older pythons
+    set(_default_ver "0x030${PYTHON3_VERSION_MINOR}0000")
+  endif()
+  set(PYTHON3_LIMITED_API_VERSION ${_default_ver} CACHE STRING "Minimal Python version for Limited API")
+endif()
 
 if(PYTHON_DEFAULT_EXECUTABLE)
     set(PYTHON_DEFAULT_AVAILABLE "TRUE")
diff --git a/cmake/OpenCVFindCANN.cmake b/cmake/OpenCVFindCANN.cmake
index b0b8e35c6bb8..36d160d0f49f 100644
--- a/cmake/OpenCVFindCANN.cmake
+++ b/cmake/OpenCVFindCANN.cmake
@@ -5,13 +5,24 @@ if("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME
     message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
 endif()
 
+if(EXISTS "${CANN_INSTALL_DIR}/opp/op_proto/built-in/inc")
+    set(CANN_VERSION_BELOW_6_3_ALPHA002 "YES" )
+    add_definitions(-DCANN_VERSION_BELOW_6_3_ALPHA002="YES")
+endif()
+
 if(CANN_INSTALL_DIR)
+    # Supported system: UNIX
+    if(NOT UNIX)
+        set(HAVE_CANN OFF)
+        message(WARNING "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off HAVE_CANN")
+        return()
+    endif()
     # Supported platforms: x86-64, arm64
     if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
     elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
     else()
         set(HAVE_CANN OFF)
-        message(STATUS "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off HAVE_CANN")
+        message(WARNING "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off HAVE_CANN")
         return()
     endif()
 
@@ -31,7 +42,30 @@ if(CANN_INSTALL_DIR)
         set(lib_ascendcl ${found_lib_ascendcl})
         message(STATUS "CANN: libascendcl.so is found at ${lib_ascendcl}")
     else()
-        message(STATUS "CANN: Missing libascendcl.so. Turning off HAVE_CANN")
+        message(WARNING "CANN: Missing libascendcl.so. Turning off HAVE_CANN")
+        set(HAVE_CANN OFF)
+        return()
+    endif()
+    #  * libacl_op_compiler.so
+    set(lib_acl_op_compiler "${CANN_INSTALL_DIR}/lib64")
+    find_library(found_lib_acl_op_compiler NAMES acl_op_compiler PATHS ${lib_acl_op_compiler} NO_DEFAULT_PATH)
+    if(found_lib_acl_op_compiler)
+        set(lib_acl_op_compiler ${found_lib_acl_op_compiler})
+        message(STATUS "CANN: libacl_op_compiler.so is found at ${lib_acl_op_compiler}")
+    else()
+        message(STATUS "CANN: Missing libacl_op_compiler.so. Turning off HAVE_CANN")
+        set(HAVE_CANN OFF)
+        return()
+    endif()
+
+    #  * libacl_dvpp_mpi.so
+    set(libacl_dvpp_mpi "${CANN_INSTALL_DIR}/lib64")
+    find_library(found_libacldvppmpi NAMES acl_dvpp_mpi PATHS ${libacl_dvpp_mpi} NO_DEFAULT_PATH)
+    if(found_libacldvppmpi)
+        set(libacl_dvpp_mpi ${found_libacldvppmpi})
+        message(STATUS "CANN: libacl_dvpp_mpi.so is found at ${libacl_dvpp_mpi}")
+    else()
+        message(STATUS "CANN: Missing libacl_dvpp_mpi.so. Turning off HAVE_CANN")
         set(HAVE_CANN OFF)
         return()
     endif()
@@ -42,7 +76,7 @@ if(CANN_INSTALL_DIR)
         set(lib_graph ${found_lib_graph})
         message(STATUS "CANN: libgraph.so is found at ${lib_graph}")
     else()
-        message(STATUS "CANN: Missing libgraph.so. Turning off HAVE_CANN")
+        message(WARNING "CANN: Missing libgraph.so. Turning off HAVE_CANN")
         set(HAVE_CANN OFF)
         return()
     endif()
@@ -53,28 +87,50 @@ if(CANN_INSTALL_DIR)
         set(lib_ge_compiler ${found_lib_ge_compiler})
         message(STATUS "CANN: libge_compiler.so is found at ${lib_ge_compiler}")
     else()
-        message(STATUS "CANN: Missing libge_compiler.so. Turning off HAVE_CANN")
+        message(WARNING "CANN: Missing libge_compiler.so. Turning off HAVE_CANN")
         set(HAVE_CANN OFF)
         return()
     endif()
     #  * libopsproto.so
-    set(lib_opsproto "${CANN_INSTALL_DIR}/opp/op_proto/built-in")
+    if (CANN_VERSION_BELOW_6_3_ALPHA002)
+        set(lib_opsproto "${CANN_INSTALL_DIR}/opp/op_proto/built-in/")
+    else()
+        if(EXISTS "${CANN_INSTALL_DIR}/opp/built-in/op_proto/lib/linux")
+            set(lib_opsproto "${CANN_INSTALL_DIR}/opp/built-in/op_proto/lib/linux/${CMAKE_HOST_SYSTEM_PROCESSOR}")
+        else()
+            set(lib_opsproto "${CANN_INSTALL_DIR}/opp/built-in/op_proto")
+        endif()
+    endif()
     find_library(found_lib_opsproto NAMES opsproto PATHS ${lib_opsproto} NO_DEFAULT_PATH)
     if(found_lib_opsproto)
         set(lib_opsproto ${found_lib_opsproto})
         message(STATUS "CANN: libopsproto.so is found at ${lib_opsproto}")
     else()
-        message(STATUS "CANN: Missing libopsproto.so. Turning off HAVE_CANN")
+        message(WARNING "CANN: Missing libopsproto.so can't found at ${lib_opsproto}. Turning off HAVE_CANN")
         set(HAVE_CANN OFF)
         return()
     endif()
 
-
     set(libs_cann "")
     list(APPEND libs_cann ${lib_ascendcl})
+    list(APPEND libs_cann ${lib_acl_op_compiler})
     list(APPEND libs_cann ${lib_opsproto})
     list(APPEND libs_cann ${lib_graph})
     list(APPEND libs_cann ${lib_ge_compiler})
+    list(APPEND libs_cann ${libacl_dvpp_mpi})
+
+    #  * lib_graph_base.so
+    if(NOT CANN_VERSION_BELOW_6_3_ALPHA002)
+        set(lib_graph_base "${CANN_INSTALL_DIR}/compiler/lib64")
+        find_library(found_libgraph_base NAMES graph_base PATHS ${lib_graph_base} NO_DEFAULT_PATH)
+        if(found_libgraph_base)
+            set(lib_graph_base ${found_libgraph_base})
+            message(STATUS "CANN: lib_graph_base.so is found at ${lib_graph_base}")
+            list(APPEND libs_cann ${lib_graph_base})
+        else()
+            message(STATUS "CANN: Missing lib_graph_base.so. It is only required after cann version 6.3.RC1.alpha002")
+        endif()
+    endif()
 
     try_compile(VALID_ASCENDCL
         "${OpenCV_BINARY_DIR}"
diff --git a/cmake/OpenCVFindIPP.cmake b/cmake/OpenCVFindIPP.cmake
index 6bcd81d8b4c6..2328ef8b435e 100644
--- a/cmake/OpenCVFindIPP.cmake
+++ b/cmake/OpenCVFindIPP.cmake
@@ -84,7 +84,7 @@ endmacro()
 # This macro uses IPP_ROOT_DIR variable
 # TODO Cleanup code after ICV package stabilization
 macro(ipp_detect_version)
-  set(IPP_INCLUDE_DIRS ${IPP_ROOT_DIR}/include)
+  get_filename_component(IPP_INCLUDE_DIRS ${IPP_VERSION_FILE} PATH)
 
   set(__msg)
   if(EXISTS ${IPP_ROOT_DIR}/include/ippicv_redefs.h)
@@ -271,7 +271,9 @@ if(NOT DEFINED IPPROOT)
 endif()
 
 file(TO_CMAKE_PATH "${IPPROOT}" __IPPROOT)
-if(EXISTS "${__IPPROOT}/include/ippversion.h")
+file(GLOB_RECURSE IPP_VERSION_FILE "${__IPPROOT}/include/*ippversion.h")
+
+if(EXISTS ${IPP_VERSION_FILE})
   set(IPP_ROOT_DIR ${__IPPROOT})
   ipp_detect_version()
 endif()
diff --git a/cmake/OpenCVFindLAPACK.cmake b/cmake/OpenCVFindLAPACK.cmake
index 9b1b60f19ec3..ba682c1c04aa 100644
--- a/cmake/OpenCVFindLAPACK.cmake
+++ b/cmake/OpenCVFindLAPACK.cmake
@@ -1,3 +1,26 @@
+if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+  set(_apple_device_min_target_os_version "13.3")
+elseif(CMAKE_SYSTEM_NAME STREQUAL "iOS")
+  set(_apple_device_min_target_os_version "16.4")
+elseif(CMAKE_SYSTEM_NAME STREQUAL "watchOS")
+  set(_apple_device_min_target_os_version "9.4")
+elseif(CMAKE_SYSTEM_NAME STREQUAL "tvOS")
+  set(_apple_device_min_target_os_version "16.4")
+elseif(CMAKE_SYSTEM_NAME STREQUAL "visionOS")
+  set(_apple_device_min_target_os_version "1.0")
+endif()
+
+if(DEFINED _apple_device_min_target_os_version AND
+   ("${CMAKE_OSX_DEPLOYMENT_TARGET}" VERSION_GREATER "${_apple_device_min_target_os_version}" OR
+    "${CMAKE_OSX_DEPLOYMENT_TARGET}" VERSION_EQUAL "${_apple_device_min_target_os_version}"))
+  set(_apple_device_has_required_min_os_version ON)
+else()
+  set(_apple_device_has_required_min_os_version OFF)
+endif()
+
+OCV_OPTION(OPENCV_OSX_USE_ACCELERATE_NEW_LAPACK "Use new BLAS/LAPACK interfaces from Accelerate framework on Apple platform" _apple_device_has_required_min_os_version
+  VISIBLE_IF APPLE)
+
 macro(_find_header_file_in_dirs VAR NAME)
   unset(${VAR})
   unset(${VAR} CACHE)
@@ -106,11 +129,20 @@ macro(ocv_lapack_check)
       list(APPEND __link_directories ${LAPACK_LINK_LIBRARIES})
     endif()
 
+    set(LAPACK_TRY_COMPILE_DEF "")
+    if(LAPACK_IMPL STREQUAL "LAPACK/Apple" AND OPENCV_OSX_USE_ACCELERATE_NEW_LAPACK)
+      message(STATUS "LAPACK(${LAPACK_IMPL}): Accelerate New LAPACK is enabled.")
+      set(LAPACK_TRY_COMPILE_DEF "-DACCELERATE_NEW_LAPACK")
+      add_compile_definitions(ACCELERATE_NEW_LAPACK)
+      add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+    endif()
+
     try_compile(__VALID_LAPACK
         "${OpenCV_BINARY_DIR}"
         "${OpenCV_SOURCE_DIR}/cmake/checks/lapack_check.cpp"
         CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${LAPACK_INCLUDE_DIR}\;${CMAKE_BINARY_DIR}"
                     "-DLINK_DIRECTORIES:STRING=${__link_directories}"
+        COMPILE_DEFINITIONS ${LAPACK_TRY_COMPILE_DEF}
         LINK_LIBRARIES ${LAPACK_LIBRARIES}
         OUTPUT_VARIABLE TRY_OUT
     )
diff --git a/cmake/OpenCVFindLibsGUI.cmake b/cmake/OpenCVFindLibsGUI.cmake
index 79758fa813cc..31c19b1da7a5 100644
--- a/cmake/OpenCVFindLibsGUI.cmake
+++ b/cmake/OpenCVFindLibsGUI.cmake
@@ -77,7 +77,7 @@ endif(WITH_OPENGL)
 
 # --- Cocoa ---
 if(APPLE)
-  if(NOT IOS AND CV_CLANG)
+  if(NOT IOS AND NOT XROS AND CV_CLANG)
     set(HAVE_COCOA YES)
   endif()
 endif()
diff --git a/cmake/OpenCVFindLibsGrfmt.cmake b/cmake/OpenCVFindLibsGrfmt.cmake
index e544f78eaa14..04755c4c2c65 100644
--- a/cmake/OpenCVFindLibsGrfmt.cmake
+++ b/cmake/OpenCVFindLibsGrfmt.cmake
@@ -3,10 +3,23 @@
 # ----------------------------------------------------------------------------
 
 # --- zlib (required) ---
-if(BUILD_ZLIB)
-  ocv_clear_vars(ZLIB_FOUND)
+if(WITH_ZLIB_NG)
+  ocv_clear_vars(ZLIB_LIBRARY ZLIB_LIBRARIES ZLIB_INCLUDE_DIR)
+  set(ZLIB_LIBRARY zlib CACHE INTERNAL "")
+  add_subdirectory("${OpenCV_SOURCE_DIR}/3rdparty/zlib-ng")
+  set(ZLIB_INCLUDE_DIR "${${ZLIB_LIBRARY}_BINARY_DIR}" CACHE INTERNAL "")
+  set(ZLIB_INCLUDE_DIRS ${ZLIB_INCLUDE_DIR})
+  set(ZLIB_LIBRARIES ${ZLIB_LIBRARY})
+
+  ocv_parse_header_version(ZLIB "${${ZLIB_LIBRARY}_SOURCE_DIR}/zlib.h.in" ZLIB_VERSION)
+  ocv_parse_header_version(ZLIBNG "${${ZLIB_LIBRARY}_SOURCE_DIR}/zlib.h.in" ZLIBNG_VERSION)
+
+  set(HAVE_ZLIB_NG YES)
 else()
-  ocv_clear_internal_cache_vars(ZLIB_LIBRARY ZLIB_INCLUDE_DIR)
+  if(BUILD_ZLIB)
+    ocv_clear_vars(ZLIB_FOUND)
+  else()
+    ocv_clear_internal_cache_vars(ZLIB_LIBRARY ZLIB_INCLUDE_DIR)
   if(ANDROID)
     set(_zlib_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
     set(CMAKE_FIND_LIBRARY_SUFFIXES .so)
@@ -23,18 +36,19 @@ else()
       set(ZLIB_LIBRARY_RELEASE z)
     endif()
   endif()
-endif()
+  endif()
 
-if(NOT ZLIB_FOUND)
-  ocv_clear_vars(ZLIB_LIBRARY ZLIB_LIBRARIES ZLIB_INCLUDE_DIR)
+  if(NOT ZLIB_FOUND)
+    ocv_clear_vars(ZLIB_LIBRARY ZLIB_LIBRARIES ZLIB_INCLUDE_DIR)
 
-  set(ZLIB_LIBRARY zlib CACHE INTERNAL "")
-  add_subdirectory("${OpenCV_SOURCE_DIR}/3rdparty/zlib")
-  set(ZLIB_INCLUDE_DIR "${${ZLIB_LIBRARY}_SOURCE_DIR}" "${${ZLIB_LIBRARY}_BINARY_DIR}" CACHE INTERNAL "")
-  set(ZLIB_INCLUDE_DIRS ${ZLIB_INCLUDE_DIR})
-  set(ZLIB_LIBRARIES ${ZLIB_LIBRARY})
+    set(ZLIB_LIBRARY zlib CACHE INTERNAL "")
+    add_subdirectory("${OpenCV_SOURCE_DIR}/3rdparty/zlib")
+    set(ZLIB_INCLUDE_DIR "${${ZLIB_LIBRARY}_SOURCE_DIR}" "${${ZLIB_LIBRARY}_BINARY_DIR}" CACHE INTERNAL "")
+    set(ZLIB_INCLUDE_DIRS ${ZLIB_INCLUDE_DIR})
+    set(ZLIB_LIBRARIES ${ZLIB_LIBRARY})
 
-  ocv_parse_header2(ZLIB "${${ZLIB_LIBRARY}_SOURCE_DIR}/zlib.h" ZLIB_VERSION)
+    ocv_parse_header_version(ZLIB "${${ZLIB_LIBRARY}_SOURCE_DIR}/zlib.h" ZLIB_VERSION)
+  endif()
 endif()
 
 # --- libavif (optional) ---
@@ -124,13 +138,13 @@ if(WITH_TIFF)
   endif()
 
   if(NOT TIFF_VERSION_STRING AND TIFF_INCLUDE_DIR)
-    list(GET TIFF_INCLUDE_DIR 0 _TIFF_INCLUDE_DIR)
-    if(EXISTS "${_TIFF_INCLUDE_DIR}/tiffvers.h")
-      file(STRINGS "${_TIFF_INCLUDE_DIR}/tiffvers.h" tiff_version_str REGEX "^#define[\t ]+TIFFLIB_VERSION_STR[\t ]+\"LIBTIFF, Version .*")
-      string(REGEX REPLACE "^#define[\t ]+TIFFLIB_VERSION_STR[\t ]+\"LIBTIFF, Version +([^ \\n]*).*" "\\1" TIFF_VERSION_STRING "${tiff_version_str}")
-      unset(tiff_version_str)
-    endif()
-    unset(_TIFF_INCLUDE_DIR)
+    foreach(_TIFF_INCLUDE_DIR IN LISTS TIFF_INCLUDE_DIR)
+      if(EXISTS "${_TIFF_INCLUDE_DIR}/tiffvers.h")
+        file(STRINGS "${_TIFF_INCLUDE_DIR}/tiffvers.h" tiff_version_str REGEX "^#define[\t ]+TIFFLIB_VERSION_STR[\t ]+\"LIBTIFF, Version .*")
+        string(REGEX REPLACE "^#define[\t ]+TIFFLIB_VERSION_STR[\t ]+\"LIBTIFF, Version +([^ \\n]*).*" "\\1" TIFF_VERSION_STRING "${tiff_version_str}")
+        unset(tiff_version_str)
+      endif()
+    endforeach()
   endif()
 
   set(HAVE_TIFF YES)
@@ -202,6 +216,7 @@ if(WITH_OPENJPEG)
     endif()
   else()
     set(HAVE_OPENJPEG YES)
+    set(OPENJPEG_VERSION "${OPENJPEG_MAJOR_VERSION}.${OPENJPEG_MINOR_VERSION}.${OPENJPEG_BUILD_VERSION}")
     message(STATUS "Found system OpenJPEG: ${OPENJPEG_LIBRARIES} "
             "(found version \"${OPENJPEG_VERSION}\")")
   endif()
@@ -232,16 +247,38 @@ if(WITH_JASPER AND NOT HAVE_OPENJPEG)
 endif()
 
 if(WITH_SPNG)
-  set(SPNG_LIBRARY libspng CACHE INTERNAL "")
-  set(SPNG_LIBRARIES ${SPNG_LIBRARY})
-  add_subdirectory("${OpenCV_SOURCE_DIR}/3rdparty/libspng")
-  set(SPNG_INCLUDE_DIR "${${SPNG_LIBRARY}_SOURCE_DIR}" CACHE INTERNAL "")
-  set(SPNG_DEFINITIONS "")
-  ocv_parse_header("${SPNG_INCLUDE_DIR}/spng.h" SPNG_VERSION_LINES SPNG_VERSION_MAJOR SPNG_VERSION_MINOR SPNG_VERSION_PATCH)
-
-  set(HAVE_SPNG YES)
-  set(SPNG_VERSION "${SPNG_VERSION_MAJOR}.${SPNG_VERSION_MINOR}.${SPNG_VERSION_PATCH}")
-  message(STATUS "imgcodecs: PNG codec will use SPNG, version: ${SPNG_VERSION} ")
+  if(BUILD_SPNG)
+    ocv_clear_vars(PNG_FOUND)
+  else()
+    # CMakeConfig bug in SPNG, include is missing there in version 0.7.4 and older
+    # See https://github.com/randy408/libspng/pull/264
+    include(CMakeFindDependencyMacro)
+    find_package(SPNG QUIET)
+    if(SPNG_FOUND)
+      set(SPNG_LIBRARY "spng::spng" CACHE INTERNAL "")
+      set(SPNG_LIBRARIES ${SPNG_LIBRARY})
+    else()
+      if(PkgConfig_FOUND)
+        pkg_check_modules(SPNG QUIET spng)
+      endif()
+    endif()
+    if(SPNG_FOUND)
+      set(HAVE_SPNG YES)
+      message(STATUS "imgcodecs: PNG codec will use SPNG, version: ${SPNG_VERSION}")
+    endif()
+  endif()
+  if(NOT SPNG_FOUND)
+    set(SPNG_LIBRARY libspng CACHE INTERNAL "")
+    set(SPNG_LIBRARIES ${SPNG_LIBRARY})
+    add_subdirectory("${OpenCV_SOURCE_DIR}/3rdparty/libspng")
+    set(SPNG_INCLUDE_DIR "${${SPNG_LIBRARY}_SOURCE_DIR}" CACHE INTERNAL "")
+    set(SPNG_DEFINITIONS "")
+    ocv_parse_header("${SPNG_INCLUDE_DIR}/spng.h" SPNG_VERSION_LINES SPNG_VERSION_MAJOR SPNG_VERSION_MINOR SPNG_VERSION_PATCH)
+
+    set(HAVE_SPNG YES)
+    set(SPNG_VERSION "${SPNG_VERSION_MAJOR}.${SPNG_VERSION_MINOR}.${SPNG_VERSION_PATCH}")
+    message(STATUS "imgcodecs: PNG codec will use SPNG, version: ${SPNG_VERSION} ")
+  endif()
 endif()
 
 # --- libpng (optional, should be searched after zlib) ---
@@ -250,31 +287,21 @@ if(NOT HAVE_SPNG AND WITH_PNG)
     ocv_clear_vars(PNG_FOUND)
   else()
     ocv_clear_internal_cache_vars(PNG_LIBRARY PNG_INCLUDE_DIR)
-    include(FindPNG)
-    if(PNG_FOUND)
-      include(CheckIncludeFile)
-      check_include_file("${PNG_PNG_INCLUDE_DIR}/libpng/png.h" HAVE_LIBPNG_PNG_H)
-      if(HAVE_LIBPNG_PNG_H)
-        ocv_parse_header("${PNG_PNG_INCLUDE_DIR}/libpng/png.h" PNG_VERSION_LINES PNG_LIBPNG_VER_MAJOR PNG_LIBPNG_VER_MINOR PNG_LIBPNG_VER_RELEASE)
-      else()
-        ocv_parse_header("${PNG_PNG_INCLUDE_DIR}/png.h" PNG_VERSION_LINES PNG_LIBPNG_VER_MAJOR PNG_LIBPNG_VER_MINOR PNG_LIBPNG_VER_RELEASE)
-      endif()
-    endif()
+    find_package(PNG QUIET)
   endif()
 
   if(NOT PNG_FOUND)
-    ocv_clear_vars(PNG_LIBRARY PNG_LIBRARIES PNG_INCLUDE_DIR PNG_PNG_INCLUDE_DIR HAVE_LIBPNG_PNG_H PNG_DEFINITIONS)
+    ocv_clear_vars(PNG_LIBRARY PNG_LIBRARIES PNG_INCLUDE_DIR PNG_DEFINITIONS)
 
     set(PNG_LIBRARY libpng CACHE INTERNAL "")
     set(PNG_LIBRARIES ${PNG_LIBRARY})
     add_subdirectory("${OpenCV_SOURCE_DIR}/3rdparty/libpng")
     set(PNG_INCLUDE_DIR "${${PNG_LIBRARY}_SOURCE_DIR}" CACHE INTERNAL "")
     set(PNG_DEFINITIONS "")
-    ocv_parse_header("${PNG_INCLUDE_DIR}/png.h" PNG_VERSION_LINES PNG_LIBPNG_VER_MAJOR PNG_LIBPNG_VER_MINOR PNG_LIBPNG_VER_RELEASE)
+    ocv_parse_header_version(PNG "${PNG_INCLUDE_DIR}/png.h" PNG_LIBPNG_VER_STRING)
   endif()
 
   set(HAVE_PNG YES)
-  set(PNG_VERSION "${PNG_LIBPNG_VER_MAJOR}.${PNG_LIBPNG_VER_MINOR}.${PNG_LIBPNG_VER_RELEASE}")
 endif()
 
 
diff --git a/cmake/OpenCVFindLibsPerf.cmake b/cmake/OpenCVFindLibsPerf.cmake
index a191afde58b7..5f72a67d897f 100644
--- a/cmake/OpenCVFindLibsPerf.cmake
+++ b/cmake/OpenCVFindLibsPerf.cmake
@@ -40,7 +40,11 @@ endif()
 
 # --- CUDA ---
 if(WITH_CUDA)
-  include("${OpenCV_SOURCE_DIR}/cmake/OpenCVDetectCUDA.cmake")
+  if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
+    include("${OpenCV_SOURCE_DIR}/cmake/OpenCVDetectCUDALanguage.cmake")
+  else()
+    include("${OpenCV_SOURCE_DIR}/cmake/OpenCVDetectCUDA.cmake")
+  endif()
   if(NOT HAVE_CUDA)
     message(WARNING "OpenCV is not able to find/configure CUDA SDK (required by WITH_CUDA).
 CUDA support will be disabled in OpenCV build.
diff --git a/cmake/OpenCVFindOpenBLAS.cmake b/cmake/OpenCVFindOpenBLAS.cmake
index d1db034908f4..4e3f0cc21063 100644
--- a/cmake/OpenCVFindOpenBLAS.cmake
+++ b/cmake/OpenCVFindOpenBLAS.cmake
@@ -73,7 +73,7 @@ SET(Open_BLAS_LIB_SEARCH_PATHS
  )
 
 FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH)
-FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS}  NO_DEFAULT_PATH)
+FIND_LIBRARY(OpenBLAS_LIB NAMES openblas libopenblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS}  NO_DEFAULT_PATH)
 
 SET(OpenBLAS_FOUND ON)
 
diff --git a/cmake/OpenCVFindProtobuf.cmake b/cmake/OpenCVFindProtobuf.cmake
index 8835347d1d26..5b1e17529ff2 100644
--- a/cmake/OpenCVFindProtobuf.cmake
+++ b/cmake/OpenCVFindProtobuf.cmake
@@ -30,8 +30,14 @@ if(BUILD_PROTOBUF)
   set(Protobuf_LIBRARIES "libprotobuf")
   set(HAVE_PROTOBUF TRUE)
 else()
+  # we still need this for command PROTOBUF_GENERATE_CPP.
+  set(protobuf_MODULE_COMPATIBLE ON)
+
   unset(Protobuf_VERSION CACHE)
-  find_package(Protobuf QUIET)
+  find_package(Protobuf QUIET CONFIG)
+  if(NOT Protobuf_FOUND)
+    find_package(Protobuf QUIET)
+  endif()
 
   # Backwards compatibility
   # Define camel case versions of input variables
@@ -67,6 +73,20 @@ else()
   endif()
 endif()
 
+# See https://github.com/opencv/opencv/issues/24369
+# In Protocol Buffers v22.0 and later drops C++11 support and depends abseil-cpp.
+#   Details: https://protobuf.dev/news/2022-08-03/
+# And if std::text_view is in abseil-cpp requests C++17 and later.
+
+if(HAVE_PROTOBUF)
+  if(NOT (Protobuf_VERSION VERSION_LESS 22))
+    if((CMAKE_CXX_STANDARD EQUAL 98) OR (CMAKE_CXX_STANDARD LESS 17))
+      message(STATUS "CMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} is too old to support protobuf(${Protobuf_VERSION}) and/or abseil-cpp. Use C++17 or later. Turning HAVE_PROTOBUF off")
+      set(HAVE_PROTOBUF FALSE)
+    endif()
+  endif()
+endif()
+
 if(HAVE_PROTOBUF AND PROTOBUF_UPDATE_FILES AND NOT COMMAND PROTOBUF_GENERATE_CPP)
   message(FATAL_ERROR "Can't configure protobuf dependency (BUILD_PROTOBUF=${BUILD_PROTOBUF} PROTOBUF_UPDATE_FILES=${PROTOBUF_UPDATE_FILES})")
 endif()
@@ -74,15 +94,20 @@ endif()
 if(HAVE_PROTOBUF)
   list(APPEND CUSTOM_STATUS protobuf)
   if(NOT BUILD_PROTOBUF)
+    unset( __location)
     if(TARGET "${Protobuf_LIBRARIES}")
       get_target_property(__location "${Protobuf_LIBRARIES}" IMPORTED_LOCATION_RELEASE)
       if(NOT __location)
         get_target_property(__location "${Protobuf_LIBRARIES}" IMPORTED_LOCATION)
       endif()
-    elseif(Protobuf_LIBRARY)
-      set(__location "${Protobuf_LIBRARY}")
-    else()
-      set(__location "${Protobuf_LIBRARIES}")
+    endif()
+
+    if(NOT __location)
+      if(Protobuf_LIBRARY)
+        set(__location "${Protobuf_LIBRARY}")
+      else()
+        set(__location "${Protobuf_LIBRARIES}")
+      endif()
     endif()
   endif()
   list(APPEND CUSTOM_STATUS_protobuf "    Protobuf:"
diff --git a/cmake/OpenCVFindTengine.cmake b/cmake/OpenCVFindTengine.cmake
deleted file mode 100644
index 2d33f5c99387..000000000000
--- a/cmake/OpenCVFindTengine.cmake
+++ /dev/null
@@ -1,78 +0,0 @@
-# COPYRIGHT
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# License); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# Copyright (c) 2020, OPEN AI LAB
-# Author: qtang@openailab.com or https://github.com/BUG1989
-#
-
-# ----------------------------------------------------------------------------
-#  Path for Tengine binaries
-# ----------------------------------------------------------------------------
-set(OPENCV_LIBTENGINE_ROOT_DIR "" CACHE PATH "Path to TENGINE binaries installation")
-
-IF(OPENCV_LIBTENGINE_ROOT_DIR AND NOT BUILD_TENGINE)
-
-	MESSAGE(STATUS "TENGINE:--  Use binaries at ${OPENCV_LIBTENGINE_ROOT_DIR}")
-
-	SET(Tengine_FOUND ON)
-	set(BUILD_TENGINE OFF)
-
-	SET(Tengine_INCLUDE_DIR "${OPENCV_LIBTENGINE_ROOT_DIR}/include" CACHE PATH "TENGINE include dir")
-	SET(Tengine_LIB "${OPENCV_LIBTENGINE_ROOT_DIR}/lib/libtengine.a" CACHE PATH "TENGINE library dir")
-
-ELSE()
-	IF(ANDROID)
-		IF(OPENCV_TENGINE_FORCE_ANDROID)
-			# nothing, use Android
-		ELSEIF(OPENCV_TENGINE_SKIP_ANDROID)
-			set(Tengine_FOUND OFF)
-			set(HAVE_TENGINE FALSE)
-			return()
-		ELSEIF(NOT DEFINED ANDROID_NDK_REVISION)
-			MESSAGE(STATUS "Android NDK version Tengine not support: ANDROID_NDK_REVISION is not defined")
-			set(Tengine_FOUND OFF)
-			set(HAVE_TENGINE FALSE)
-			return()
-		ELSEIF(ANDROID_NDK_REVISION VERSION_LESS 14)
-			MESSAGE(STATUS "Android NDK version Tengine not support: ANDROID_NDK_REVISION=${ANDROID_NDK_REVISION}")
-			set(Tengine_FOUND OFF)
-			set(HAVE_TENGINE FALSE)
-			return()
-		ENDIF()
-	ENDIF()
-	MESSAGE(STATUS "TENGINE:--  Build Tengine from source code. ")
-	include("${OpenCV_SOURCE_DIR}/3rdparty/libtengine/tengine.cmake")
-ENDIF()
-
-IF(NOT Tengine_LIB)
-	SET(Tengine_FOUND OFF)
-	MESSAGE(STATUS "#### Could not find Tengine lib. Turning Tengine_FOUND off")
-ENDIF()
-
-IF (Tengine_FOUND)
-	MESSAGE(STATUS "Found Tengine include: ${Tengine_INCLUDE_DIR}")
-	MESSAGE(STATUS "Found Tengine libraries: ${Tengine_LIB}")
-	set(HAVE_TENGINE 1)
-	set(TENGINE_LIBRARIES    ${Tengine_LIB})
-	set(TENGINE_INCLUDE_DIRS    ${Tengine_INCLUDE_DIR})
-ENDIF (Tengine_FOUND)
-
-MARK_AS_ADVANCED(
-	Tengine_INCLUDE_DIR
-	Tengine_LIB
-)
diff --git a/cmake/OpenCVGenConfig.cmake b/cmake/OpenCVGenConfig.cmake
index 838852c4e717..df48ae084815 100644
--- a/cmake/OpenCVGenConfig.cmake
+++ b/cmake/OpenCVGenConfig.cmake
@@ -12,7 +12,11 @@ else()
 endif()
 
 if(HAVE_CUDA)
-  ocv_cmake_configure("${CMAKE_CURRENT_LIST_DIR}/templates/OpenCVConfig-CUDA.cmake.in" CUDA_CONFIGCMAKE @ONLY)
+  if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
+    ocv_cmake_configure("${CMAKE_CURRENT_LIST_DIR}/templates/OpenCVConfig-CUDALanguage.cmake.in" CUDA_CONFIGCMAKE @ONLY)
+  else()
+    ocv_cmake_configure("${CMAKE_CURRENT_LIST_DIR}/templates/OpenCVConfig-CUDA.cmake.in" CUDA_CONFIGCMAKE @ONLY)
+  endif()
 endif()
 
 if(ANDROID)
@@ -120,7 +124,6 @@ endif()
 
 if(ANDROID)
   ocv_gen_config("${CMAKE_BINARY_DIR}/unix-install" "abi-${ANDROID_NDK_ABI_NAME}" "OpenCVConfig.root-ANDROID.cmake.in")
-  install(FILES "${OpenCV_SOURCE_DIR}/platforms/android/android.toolchain.cmake" DESTINATION "${OPENCV_CONFIG_INSTALL_PATH}" COMPONENT dev)
 endif()
 
 # --------------------------------------------------------------------------------------------
diff --git a/cmake/OpenCVGenInfoPlist.cmake b/cmake/OpenCVGenInfoPlist.cmake
index 105087907ffb..f1a6926d0713 100644
--- a/cmake/OpenCVGenInfoPlist.cmake
+++ b/cmake/OpenCVGenInfoPlist.cmake
@@ -13,6 +13,14 @@ if(IOS)
     configure_file("${OpenCV_SOURCE_DIR}/platforms/ios/Info.plist.in"
                    "${CMAKE_BINARY_DIR}/ios/Info.plist")
   endif()
+elseif(XROS)
+  if(APPLE_FRAMEWORK AND DYNAMIC_PLIST)
+    configure_file("${OpenCV_SOURCE_DIR}/platforms/ios/Info.Dynamic.plist.in"
+                   "${CMAKE_BINARY_DIR}/visionos/Info.plist")
+  else()
+    configure_file("${OpenCV_SOURCE_DIR}/platforms/ios/Info.plist.in"
+                   "${CMAKE_BINARY_DIR}/visionos/Info.plist")
+  endif()
 elseif(APPLE)
   configure_file("${OpenCV_SOURCE_DIR}/platforms/osx/Info.plist.in"
                  "${CMAKE_BINARY_DIR}/osx/Info.plist")
diff --git a/cmake/OpenCVGenPkgconfig.cmake b/cmake/OpenCVGenPkgconfig.cmake
index 8d36b74f09ed..4fc80f5e4b27 100644
--- a/cmake/OpenCVGenPkgconfig.cmake
+++ b/cmake/OpenCVGenPkgconfig.cmake
@@ -1,4 +1,4 @@
-if(MSVC OR IOS)
+if(MSVC OR IOS OR XROS)
   return()
 endif()
 
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index b6cee904a983..5411a28c6135 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -1003,7 +1003,7 @@ macro(_ocv_create_module)
                                           INTERFACE ${OPENCV_MODULE_${the_module}_DEPS_EXT}
   )
   ocv_target_link_libraries(${the_module} PRIVATE ${OPENCV_LINKER_LIBS} ${OPENCV_HAL_LINKER_LIBS} ${IPP_LIBS} ${ARGN})
-  if (HAVE_CUDA)
+  if (NOT ENABLE_CUDA_FIRST_CLASS_LANGUAGE AND HAVE_CUDA)
     ocv_target_link_libraries(${the_module} PRIVATE ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
   endif()
 
diff --git a/cmake/OpenCVPackaging.cmake b/cmake/OpenCVPackaging.cmake
index 7ce7efa6619e..2480a1cae887 100644
--- a/cmake/OpenCVPackaging.cmake
+++ b/cmake/OpenCVPackaging.cmake
@@ -52,8 +52,8 @@ else()
   set(OPENCV_PACKAGE_ARCH_SUFFIX ${CMAKE_SYSTEM_PROCESSOR})
 endif()
 
-set(CPACK_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${OPENCV_VCSVERSION}-${OPENCV_PACKAGE_ARCH_SUFFIX}")
-set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${OPENCV_VCSVERSION}-${OPENCV_PACKAGE_ARCH_SUFFIX}")
+set(CPACK_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION}-${OPENCV_PACKAGE_ARCH_SUFFIX}")
+set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION}-${OPENCV_PACKAGE_ARCH_SUFFIX}")
 
 #rpm options
 set(CPACK_RPM_COMPONENT_INSTALL TRUE)
diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake
index 437042958ef4..16babb4937ee 100644
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@@ -1430,6 +1430,18 @@ macro(ocv_parse_header2 LIBNAME HDR_PATH VARNAME)
   endif()
 endmacro()
 
+# set ${LIBNAME}_VERSION_STRING to ${LIBVER} without quotes
+macro(ocv_parse_header_version LIBNAME HDR_PATH LIBVER)
+  ocv_clear_vars(${LIBNAME}_VERSION_STRING)
+  set(${LIBNAME}_H "")
+  if(EXISTS "${HDR_PATH}")
+    file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${LIBVER}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1)
+  endif()
+  if(${LIBNAME}_H)
+    string(REGEX REPLACE "^.*[ \t]${LIBVER}[ \t]+\"(.+)\"$" "\\1" ${LIBNAME}_VERSION_STRING "${${LIBNAME}_H}")
+  endif()
+endmacro()
+
 ################################################################################################
 # short command to setup source group
 function(ocv_source_group group)
@@ -1545,13 +1557,23 @@ function(_ocv_append_target_includes target)
   endif()
 endfunction()
 
+macro(ocv_add_cuda_compile_flags)
+  ocv_cuda_compile_flags()
+  target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: ${CUDA_NVCC_FLAGS}
+  "-Xcompiler=${CMAKE_CXX_FLAGS_CUDA} $<$<CONFIG:Debug>:${CMAKE_CXX_FLAGS_DEBUG_CUDA}> \
+  $<$<CONFIG:Release>:${CMAKE_CXX_FLAGS_RELEASE_CUDA}>" >)
+endmacro()
+
 function(ocv_add_executable target)
   add_executable(${target} ${ARGN})
+  if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE AND HAVE_CUDA)
+    ocv_add_cuda_compile_flags()
+  endif()
   _ocv_append_target_includes(${target})
 endfunction()
 
 function(ocv_add_library target)
-  if(HAVE_CUDA AND ARGN MATCHES "\\.cu")
+  if(NOT ENABLE_CUDA_FIRST_CLASS_LANGUAGE AND HAVE_CUDA AND ARGN MATCHES "\\.cu")
     ocv_include_directories(${CUDA_INCLUDE_DIRS})
     ocv_cuda_compile(cuda_objs ${ARGN})
     set(OPENCV_MODULE_${target}_CUDA_OBJECTS ${cuda_objs} CACHE INTERNAL "Compiled CUDA object files")
@@ -1559,12 +1581,16 @@ function(ocv_add_library target)
 
   add_library(${target} ${ARGN} ${cuda_objs})
 
+  if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE AND HAVE_CUDA)
+    ocv_add_cuda_compile_flags()
+  endif()
+
   if(APPLE_FRAMEWORK AND BUILD_SHARED_LIBS)
     message(STATUS "Setting Apple target properties for ${target}")
 
     set(CMAKE_SHARED_LIBRARY_RUNTIME_C_FLAG 1)
 
-    if(IOS AND NOT MAC_CATALYST)
+    if((IOS OR XROS) AND NOT MAC_CATALYST)
       set(OPENCV_APPLE_INFO_PLIST "${CMAKE_BINARY_DIR}/ios/Info.plist")
     else()
       set(OPENCV_APPLE_INFO_PLIST "${CMAKE_BINARY_DIR}/osx/Info.plist")
@@ -1632,13 +1658,19 @@ function(ocv_add_external_target name inc link def)
   endif()
 endfunction()
 
+set(__OPENCV_EXPORTED_EXTERNAL_TARGETS "" CACHE INTERNAL "")
 function(ocv_install_used_external_targets)
   if(NOT BUILD_SHARED_LIBS
       AND NOT (CMAKE_VERSION VERSION_LESS "3.13.0")  # upgrade CMake: https://gitlab.kitware.com/cmake/cmake/-/merge_requests/2152
   )
     foreach(tgt in ${ARGN})
       if(tgt MATCHES "^ocv\.3rdparty\.")
-        install(TARGETS ${tgt} EXPORT OpenCVModules)
+        list(FIND __OPENCV_EXPORTED_EXTERNAL_TARGETS "${tgt}" _found)
+        if(_found EQUAL -1)  # don't export target twice
+          install(TARGETS ${tgt} EXPORT OpenCVModules)
+          list(APPEND __OPENCV_EXPORTED_EXTERNAL_TARGETS "${tgt}")
+          set(__OPENCV_EXPORTED_EXTERNAL_TARGETS "${__OPENCV_EXPORTED_EXTERNAL_TARGETS}" CACHE INTERNAL "")
+        endif()
       endif()
     endforeach()
   endif()
diff --git a/cmake/android/android_gradle_projects.cmake b/cmake/android/android_gradle_projects.cmake
index 0a8b9f4b606c..4278b10f8d41 100644
--- a/cmake/android/android_gradle_projects.cmake
+++ b/cmake/android/android_gradle_projects.cmake
@@ -1,8 +1,8 @@
 # https://developer.android.com/studio/releases/gradle-plugin
-set(ANDROID_GRADLE_PLUGIN_VERSION "3.2.1" CACHE STRING "Android Gradle Plugin version")
+set(ANDROID_GRADLE_PLUGIN_VERSION "7.3.1" CACHE STRING "Android Gradle Plugin version")
 message(STATUS "Android Gradle Plugin version: ${ANDROID_GRADLE_PLUGIN_VERSION}")
 
-set(KOTLIN_PLUGIN_VERSION "1.4.10" CACHE STRING "Kotlin Plugin version")
+set(KOTLIN_PLUGIN_VERSION "1.8.20" CACHE STRING "Kotlin Plugin version")
 message(STATUS "Kotlin Plugin version: ${KOTLIN_PLUGIN_VERSION}")
 
 if(BUILD_KOTLIN_EXTENSIONS)
@@ -13,16 +13,16 @@ else()
   set(KOTLIN_STD_LIB "" CACHE STRING "Kotlin Standard Library dependency")
 endif()
 
-set(GRADLE_VERSION "5.6.4" CACHE STRING "Gradle version")
+set(GRADLE_VERSION "7.6.3" CACHE STRING "Gradle version")
 message(STATUS "Gradle version: ${GRADLE_VERSION}")
 
-set(ANDROID_COMPILE_SDK_VERSION "26" CACHE STRING "Android compileSdkVersion")
+set(ANDROID_COMPILE_SDK_VERSION "31" CACHE STRING "Android compileSdkVersion")
 if(ANDROID_NATIVE_API_LEVEL GREATER 21)
   set(ANDROID_MIN_SDK_VERSION "${ANDROID_NATIVE_API_LEVEL}" CACHE STRING "Android minSdkVersion")
 else()
   set(ANDROID_MIN_SDK_VERSION "21" CACHE STRING "Android minSdkVersion")
 endif()
-set(ANDROID_TARGET_SDK_VERSION "26" CACHE STRING "Android minSdkVersion")
+set(ANDROID_TARGET_SDK_VERSION "31" CACHE STRING "Android minSdkVersion")
 
 set(ANDROID_BUILD_BASE_DIR "${OpenCV_BINARY_DIR}/opencv_android" CACHE INTERNAL "")
 set(ANDROID_TMP_INSTALL_BASE_DIR "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/install/opencv_android")
@@ -89,15 +89,11 @@ else()
   ocv_update(OPENCV_ANDROID_NAMESPACE_DECLARATION "")
 endif()
 
-# set android gradle java version in build.gradle and set aidl config
 if(NOT (ANDROID_GRADLE_PLUGIN_VERSION VERSION_LESS "8.0.0"))
   # AGP-8.0 requires a minimum JDK version of JDK17
   ocv_update(ANDROID_GRADLE_JAVA_VERSION_INIT "17")
-  # Enable aidl configuration for OpenCV compile with AGP-8.0
-  ocv_update(ANDROID_GRADLE_BUILD_FEATURE_AIDL "buildFeatures { aidl true }")
 else()
   ocv_update(ANDROID_GRADLE_JAVA_VERSION_INIT "1_8")
-  ocv_update(ANDROID_GRADLE_BUILD_FEATURE_AIDL "")
 endif()
 
 set(ANDROID_GRADLE_JAVA_VERSION "${ANDROID_GRADLE_JAVA_VERSION_INIT}" CACHE STRING "Android Gradle Java version")
@@ -109,24 +105,44 @@ if(NOT OPENCV_SKIP_ANDROID_FORCE_CMAKE)
     get_filename_component(_CMAKE_INSTALL_DIR "${CMAKE_ROOT}" PATH)
     get_filename_component(_CMAKE_INSTALL_DIR "${_CMAKE_INSTALL_DIR}" PATH)
   endif()
-  ocv_update_file("${ANDROID_BUILD_BASE_DIR}/local.properties" "cmake.dir=${_CMAKE_INSTALL_DIR}")
+  ocv_update_file("${ANDROID_BUILD_BASE_DIR}/local.properties" "cmake.dir=${_CMAKE_INSTALL_DIR}\nndk.dir=${ANDROID_NDK}")
 endif()
 
 file(WRITE "${ANDROID_BUILD_BASE_DIR}/settings.gradle" "
+gradle.ext {
+    // possible options: 'maven_central', 'maven_local', 'sdk_path'
+    opencv_source = 'sdk_path'
+}
+
 include ':opencv'
 ")
 
 file(WRITE "${ANDROID_TMP_INSTALL_BASE_DIR}/settings.gradle" "
 rootProject.name = 'opencv_samples'
 
-def opencvsdk='../'
-//def opencvsdk='/<path to OpenCV-android-sdk>'
-//println opencvsdk
-include ':opencv'
-project(':opencv').projectDir = new File(opencvsdk + '/sdk')
+gradle.ext {
+    // possible options: 'maven_central', 'maven_local', 'sdk_path'
+    opencv_source = 'sdk_path'
+}
+
+if (gradle.opencv_source == 'maven_local') {
+    gradle.ext {
+        opencv_maven_path = '<path_to_maven_repo>'
+    }
+}
+
+if (gradle.opencv_source == 'sdk_path') {
+    def opencvsdk = '../'
+    //def opencvsdk='/<path to OpenCV-android-sdk>'
+    //println opencvsdk
+    include ':opencv'
+    project(':opencv').projectDir = new File(opencvsdk + '/sdk')
+}
 ")
 
 ocv_check_environment_variables(OPENCV_GRADLE_VERBOSE_OPTIONS)
+ocv_update(OPENCV_GRADLE_VERBOSE_OPTIONS "-i")
+separate_arguments(OPENCV_GRADLE_VERBOSE_OPTIONS UNIX_COMMAND "${OPENCV_GRADLE_VERBOSE_OPTIONS}")
 
 macro(add_android_project target path)
   get_filename_component(__dir "${path}" NAME)
@@ -161,7 +177,6 @@ include ':${__dir}'
   if (BUILD_ANDROID_EXAMPLES)
     # build apk
     set(APK_FILE "${ANDROID_BUILD_BASE_DIR}/${__dir}/build/outputs/apk/release/${__dir}-${ANDROID_ABI}-release-unsigned.apk")
-    ocv_update(OPENCV_GRADLE_VERBOSE_OPTIONS "-i")
     add_custom_command(
         OUTPUT "${APK_FILE}" "${OPENCV_DEPHELPER}/android_sample_${__dir}"
         COMMAND ./gradlew ${OPENCV_GRADLE_VERBOSE_OPTIONS} "${__dir}:assemble"
diff --git a/cmake/checks/cpu_fp16.cpp b/cmake/checks/cpu_fp16.cpp
index f12cb10f4d23..c57b5d47b63e 100644
--- a/cmake/checks/cpu_fp16.cpp
+++ b/cmake/checks/cpu_fp16.cpp
@@ -11,7 +11,8 @@ int test()
     _mm_storel_epi64((__m128i*)dst, v_dst);
     return (int)dst[0];
 }
-#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
+#elif (defined __GNUC__ && (defined __arm__ || defined __aarch64__)) /*|| (defined _MSC_VER && defined _M_ARM64)*/
+// Windows + ARM64 case disabled: https://github.com/opencv/opencv/issues/25052
 #include "arm_neon.h"
 int test()
 {
diff --git a/cmake/checks/cpu_lsx.cpp b/cmake/checks/cpu_lsx.cpp
new file mode 100644
index 000000000000..86c2e5c7a609
--- /dev/null
+++ b/cmake/checks/cpu_lsx.cpp
@@ -0,0 +1,15 @@
+#include <stdio.h>
+#include <lsxintrin.h>
+
+int test()
+{
+    const float src[] = { 0.0f, 1.0f, 2.0f, 3.0f};
+    v4f32 val = (v4f32)__lsx_vld((const float*)(src), 0);
+    return __lsx_vpickve2gr_w(__lsx_vftint_w_s(val), 3);
+}
+
+int main()
+{
+  printf("%d\n", test());
+  return 0;
+}
diff --git a/cmake/checks/cpu_neon.cpp b/cmake/checks/cpu_neon.cpp
index bb103ec3661d..7af16f5ffcb2 100644
--- a/cmake/checks/cpu_neon.cpp
+++ b/cmake/checks/cpu_neon.cpp
@@ -5,7 +5,7 @@
 # include <Intrin.h>
 # include <arm_neon.h>
 # define CV_NEON 1
-#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
+#elif defined(__ARM_NEON)
 #  include <arm_neon.h>
 #  define CV_NEON 1
 #endif
diff --git a/cmake/checks/cpu_neon_bf16.cpp b/cmake/checks/cpu_neon_bf16.cpp
new file mode 100644
index 000000000000..a9045d711754
--- /dev/null
+++ b/cmake/checks/cpu_neon_bf16.cpp
@@ -0,0 +1,46 @@
+#if (defined __GNUC__ && (defined __arm__ || defined __aarch64__)) || (defined _MSC_VER && defined _M_ARM64)
+#include <stdio.h>
+#include "arm_neon.h"
+
+/*#if defined __clang__
+#pragma clang attribute push (__attribute__((target("bf16"))), apply_to=function)
+#elif defined GCC
+#pragma GCC push_options
+#pragma GCC target("armv8.2-a", "bf16")
+#endif*/
+bfloat16x8_t vld1q_as_bf16(const float* src)
+{
+    float32x4_t s0 = vld1q_f32(src), s1 = vld1q_f32(src + 4);
+    return vcombine_bf16(vcvt_bf16_f32(s0), vcvt_bf16_f32(s1));
+}
+
+void vprintreg(const char* name, const float32x4_t& r)
+{
+    float data[4];
+    vst1q_f32(data, r);
+    printf("%s: (%.2f, %.2f, %.2f, %.2f)\n",
+        name, data[0], data[1], data[2], data[3]);
+}
+
+void test()
+{
+    const float src1[] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
+    const float src2[] = { 1.f, 3.f, 6.f, 10.f, 15.f, 21.f, 28.f, 36.f };
+    bfloat16x8_t s1 = vld1q_as_bf16(src1), s2 = vld1q_as_bf16(src2);
+    float32x4_t d = vbfdotq_f32(vdupq_n_f32(0.f), s1, s2);
+    vprintreg("(s1[0]*s2[0] + s1[1]*s2[1], ... s1[6]*s2[6] + s1[7]*s2[7])", d);
+}
+/*#if defined __clang__
+#pragma clang attribute pop
+#elif defined GCC
+#pragma GCC pop_options
+#endif*/
+#else
+#error "BF16 is not supported"
+#endif
+
+int main()
+{
+    test();
+    return 0;
+}
diff --git a/cmake/checks/cpu_dotprod.cpp b/cmake/checks/cpu_neon_dotprod.cpp
similarity index 80%
rename from cmake/checks/cpu_dotprod.cpp
rename to cmake/checks/cpu_neon_dotprod.cpp
index 4f39c5065990..74f44a183259 100644
--- a/cmake/checks/cpu_dotprod.cpp
+++ b/cmake/checks/cpu_neon_dotprod.cpp
@@ -1,6 +1,6 @@
 #include <stdio.h>
 
-#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
+#if (defined __GNUC__ && (defined __arm__ || defined __aarch64__)) || (defined _MSC_VER && defined _M_ARM64)
 #include "arm_neon.h"
 int test()
 {
diff --git a/cmake/checks/cpu_neon_fp16.cpp b/cmake/checks/cpu_neon_fp16.cpp
new file mode 100644
index 000000000000..bba5b9702633
--- /dev/null
+++ b/cmake/checks/cpu_neon_fp16.cpp
@@ -0,0 +1,46 @@
+#include <stdio.h>
+
+#if (defined __GNUC__ && (defined __arm__ || defined __aarch64__)) || (defined _MSC_VER && defined _M_ARM64)
+#include "arm_neon.h"
+
+float16x8_t vld1q_as_f16(const float* src)
+{
+    float32x4_t s0 = vld1q_f32(src), s1 = vld1q_f32(src + 4);
+    return vcombine_f16(vcvt_f16_f32(s0), vcvt_f16_f32(s1));
+}
+
+void vprintreg(const char* name, const float16x8_t& r)
+{
+    float data[8];
+    vst1q_f32(data, vcvt_f32_f16(vget_low_f16(r)));
+    vst1q_f32(data + 4, vcvt_f32_f16(vget_high_f16(r)));
+    printf("%s: (%.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f)\n",
+        name, data[0], data[1], data[2], data[3],
+        data[4], data[5], data[6], data[7]);
+}
+
+void test()
+{
+    const float src1[] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
+    const float src2[] = { 1.f, 3.f, 6.f, 10.f, 15.f, 21.f, 28.f, 36.f };
+    float16x8_t s1 = vld1q_as_f16(src1), s2 = vld1q_as_f16(src2);
+    float16x8_t d = vsubq_f16(s1, s1);
+    d = vfmaq_laneq_f16(d, s1, s2, 0);
+    d = vfmaq_laneq_f16(d, s1, s2, 1);
+    d = vfmaq_laneq_f16(d, s1, s2, 2);
+    d = vfmaq_laneq_f16(d, s1, s2, 3);
+    d = vfmaq_laneq_f16(d, s1, s2, 4);
+    d = vfmaq_laneq_f16(d, s1, s2, 5);
+    d = vfmaq_laneq_f16(d, s1, s2, 6);
+    d = vfmaq_laneq_f16(d, s1, s2, 7);
+    vprintreg("s1*s2[0]+s1*s2[1] + ... + s1*s2[7]", d);
+}
+#else
+#error "FP16 is not supported"
+#endif
+
+int main()
+{
+    test();
+    return 0;
+}
diff --git a/cmake/checks/directml.cpp b/cmake/checks/directml.cpp
new file mode 100644
index 000000000000..1cf62b8fad87
--- /dev/null
+++ b/cmake/checks/directml.cpp
@@ -0,0 +1,38 @@
+#include <initguid.h>
+
+#include <d3d11.h>
+#include <dxgi1_2.h>
+#include <dxgi1_4.h>
+#include <dxgi.h>
+#include <dxcore.h>
+#include <dxcore_interface.h>
+#include <d3d12.h>
+#include <directml.h>
+
+int main(int /*argc*/, char** /*argv*/)
+{
+    IDXCoreAdapterFactory* factory;
+    DXCoreCreateAdapterFactory(__uuidof(IDXCoreAdapterFactory), (void**)&factory);
+
+    IDXCoreAdapterList* adapterList;
+    const GUID dxGUIDs[] = { DXCORE_ADAPTER_ATTRIBUTE_D3D12_CORE_COMPUTE };
+    factory->CreateAdapterList(ARRAYSIZE(dxGUIDs), dxGUIDs, __uuidof(IDXCoreAdapterList), (void**)&adapterList);
+
+    IDXCoreAdapter* adapter;
+    adapterList->GetAdapter(0u, __uuidof(IDXCoreAdapter), (void**)&adapter);
+
+    D3D_FEATURE_LEVEL d3dFeatureLevel = D3D_FEATURE_LEVEL_1_0_CORE;
+    ID3D12Device* d3d12Device = NULL;
+    D3D12CreateDevice((IUnknown*)adapter, d3dFeatureLevel, __uuidof(ID3D11Device), (void**)&d3d12Device);
+
+    D3D12_COMMAND_LIST_TYPE commandQueueType = D3D12_COMMAND_LIST_TYPE_COMPUTE;
+    ID3D12CommandQueue* cmdQueue;
+    D3D12_COMMAND_QUEUE_DESC commandQueueDesc = {};
+    commandQueueDesc.Type = commandQueueType;
+
+    d3d12Device->CreateCommandQueue(&commandQueueDesc, __uuidof(ID3D12CommandQueue), (void**)&cmdQueue);
+    IDMLDevice* dmlDevice;
+    DMLCreateDevice(d3d12Device, DML_CREATE_DEVICE_FLAG_NONE, IID_PPV_ARGS(&dmlDevice));
+
+    return 0;
+}
\ No newline at end of file
diff --git a/cmake/copy_files.cmake b/cmake/copy_files.cmake
index 423f7fff9c87..f7e13a45d49f 100644
--- a/cmake/copy_files.cmake
+++ b/cmake/copy_files.cmake
@@ -21,7 +21,7 @@ macro(copy_file_ src dst prefix)
   endif()
   if(use_symlink)
     if(local_update OR NOT IS_SYMLINK "${dst}")
-      message("${prefix}Symlink: '${dst_name}' ...")
+      #message("${prefix}Symlink: '${dst_name}' ...")
     endif()
     get_filename_component(target_path "${dst}" PATH)
     file(MAKE_DIRECTORY "${target_path}")
@@ -38,7 +38,7 @@ macro(copy_file_ src dst prefix)
       set(local_update 1)
     endif()
     if(local_update)
-      message("${prefix}Copying: '${dst_name}' ...")
+      #message("${prefix}Copying: '${dst_name}' ...")
       configure_file(${src} ${dst} COPYONLY)
     else()
       #message("${prefix}Up-to-date: '${dst_name}'")
@@ -55,7 +55,7 @@ if(NOT DEFINED COPYLIST_VAR)
   set(COPYLIST_VAR "COPYLIST")
 endif()
 list(LENGTH ${COPYLIST_VAR} __length)
-message("${prefix}... ${__length} entries (${COPYLIST_VAR})")
+#message("${prefix}... ${__length} entries (${COPYLIST_VAR})")
 foreach(id ${${COPYLIST_VAR}})
   set(src "${${COPYLIST_VAR}_SRC_${id}}")
   set(dst "${${COPYLIST_VAR}_DST_${id}}")
@@ -80,7 +80,7 @@ foreach(id ${${COPYLIST_VAR}})
     endif()
     file(GLOB_RECURSE _files RELATIVE "${src}" ${src_glob})
     list(LENGTH _files __length)
-    message("${prefix}    ... directory '.../${src_name2}/${src_name}' with ${__length} files")
+    #message("${prefix}    ... directory '.../${src_name2}/${src_name}' with ${__length} files")
     foreach(f ${_files})
       if(NOT EXISTS "${src}/${f}")
         message(FATAL_ERROR "COPY ERROR: Source file is missing: ${src}/${f}")
@@ -98,12 +98,12 @@ else()
 endif()
 if(NOT "${__state}" STREQUAL "${__prev_state}")
   file(WRITE "${STATE_FILE}" "${__state}")
-  message("${prefix}Updated!")
+  #message("${prefix}Updated!")
   set(update_dephelper 1)
 endif()
 
 if(NOT update_dephelper)
-  message("${prefix}All files are up-to-date.")
+  #message("${prefix}All files are up-to-date.")
 elseif(DEFINED DEPHELPER)
   file(WRITE "${DEPHELPER}" "")
 endif()
diff --git a/cmake/mirrors/custom.cmake b/cmake/mirrors/custom.cmake
index 3cdf700e1926..8c421471f3eb 100644
--- a/cmake/mirrors/custom.cmake
+++ b/cmake/mirrors/custom.cmake
@@ -1,15 +1,12 @@
 # Gitlab-style mirror
 # CMake scripts look for opencv/opencv_3rdparty,
-#  OAID/Tengine, 01org/tbb(oneAPI/oneTBB), opencv/ade
+#  01org/tbb(oneAPI/oneTBB), opencv/ade
 #  from OPENCV_DOWNLOAD_MIRROR
 ocv_update(OPENCV_DOWNLOAD_MIRROR_URL "")
 
 ######
 # Download via commit id
 ######
-# Tengine
-ocv_update(TENGINE_PKG_MD5_CUSTOM "")
-ocv_update(TENGINE_PKG_MD5_ORIGINAL 23f61ebb1dd419f1207d8876496289c5) # same as tengine_md5sum for TENGINE commit of e89cf8870de2ff0a80cfe626c0b52b2a16fb302e
 # NVIDIA_OPTICAL_FLOW
 ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_GITCODE "")
 ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_ORIGINAL a73cd48b18dcc0cc8933b30796074191)
@@ -77,7 +74,7 @@ else()
     ocv_download_url_custom_usercontent(opencv)
   elseif(DL_ID STREQUAL "wechat_qrcode")
     ocv_download_url_gitcode_usercontent(WeChatCV)
-  elseif((DL_ID STREQUAL "TENGINE") OR (DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
+  elseif((DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
     ocv_download_url_custom_archive_commit_id()
   elseif(DL_ID STREQUAL "TBB")
     ocv_download_url_custom_archive_release()
diff --git a/cmake/mirrors/gitcode.cmake b/cmake/mirrors/gitcode.cmake
index c9d41e74581f..e208a8724567 100644
--- a/cmake/mirrors/gitcode.cmake
+++ b/cmake/mirrors/gitcode.cmake
@@ -1,9 +1,6 @@
 ######
 # Download via commit id
 ######
-# Tengine
-ocv_update(TENGINE_PKG_MD5_GITCODE 1b5908632b557275cd6e85b0c03f9690)
-ocv_update(TENGINE_PKG_MD5_ORIGINAL 23f61ebb1dd419f1207d8876496289c5) # same as tengine_md5sum for TENGINE commit of e89cf8870de2ff0a80cfe626c0b52b2a16fb302e
 # NVIDIA_OPTICAL_FLOW
 ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_GITCODE 8d5b7eeb24d6ca9c6bcfdff4196d5b47)
 ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_ORIGINAL a73cd48b18dcc0cc8933b30796074191)
@@ -74,7 +71,7 @@ if((DL_ID STREQUAL "FFMPEG") OR (DL_ID STREQUAL "IPPICV") OR (DL_ID STREQUAL "da
   ocv_download_url_gitcode_usercontent(opencv)
 elseif(DL_ID STREQUAL "wechat_qrcode")
   ocv_download_url_gitcode_usercontent(mirrors/WeChatCV)
-elseif((DL_ID STREQUAL "TENGINE") OR (DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
+elseif((DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
   ocv_download_url_gitcode_archive_commit_id()
 elseif(DL_ID STREQUAL "TBB")
   ocv_download_url_gitcode_archive_release(OPENCV_TBB_SUBDIR)
diff --git a/cmake/templates/OpenCVConfig-CUDALanguage.cmake.in b/cmake/templates/OpenCVConfig-CUDALanguage.cmake.in
new file mode 100644
index 000000000000..259141006ae2
--- /dev/null
+++ b/cmake/templates/OpenCVConfig-CUDALanguage.cmake.in
@@ -0,0 +1,31 @@
+# Version Compute Capability from which OpenCV has been compiled is remembered
+set(OpenCV_COMPUTE_CAPABILITIES "@OpenCV_CUDA_CC@")
+
+set(OpenCV_CUDA_VERSION "@CUDA_VERSION_STRING@")
+set(OpenCV_USE_CUBLAS   "@HAVE_CUBLAS@")
+set(OpenCV_USE_CUFFT    "@HAVE_CUFFT@")
+set(OpenCV_USE_NVCUVID  "@HAVE_NVCUVID@")
+set(OpenCV_USE_NVCUVENC "@HAVE_NVCUVENC@")
+set(OpenCV_CUDNN_VERSION    "@CUDNN_VERSION@")
+set(OpenCV_USE_CUDNN        "@HAVE_CUDNN@")
+set(ENABLE_CUDA_FIRST_CLASS_LANGUAGE  ON)
+
+if(NOT CUDAToolkit_FOUND)
+  if(NOT CMAKE_VERSION VERSION_LESS 3.18)
+    if(UNIX AND NOT CMAKE_CUDA_COMPILER AND NOT CUDAToolkit_ROOT)
+      message(STATUS "Checking for CUDAToolkit in default location (/usr/local/cuda)")
+      set(CUDA_PATH "/usr/local/cuda" CACHE INTERNAL "")
+      set(ENV{CUDA_PATH} ${CUDA_PATH})
+    endif()
+    find_package(CUDAToolkit ${OpenCV_CUDA_VERSION} EXACT REQUIRED)
+  else()
+    message(FATAL_ERROR "Using OpenCV compiled with CUDA as first class language requires CMake \>= 3.18.")
+  endif()
+else()
+  if(CUDAToolkit_FOUND)
+    set(CUDA_VERSION_STRING ${CUDAToolkit_VERSION})
+  endif()
+  if(NOT CUDA_VERSION_STRING VERSION_EQUAL OpenCV_CUDA_VERSION)
+      message(FATAL_ERROR "OpenCV library was compiled with CUDA ${OpenCV_CUDA_VERSION} support. Please, use the same version or rebuild OpenCV with CUDA ${CUDA_VERSION_STRING}")
+  endif()
+endif()
diff --git a/cmake/templates/OpenCVConfig.root-WIN32.cmake.in b/cmake/templates/OpenCVConfig.root-WIN32.cmake.in
index b0f254ebe80f..62e36272f350 100644
--- a/cmake/templates/OpenCVConfig.root-WIN32.cmake.in
+++ b/cmake/templates/OpenCVConfig.root-WIN32.cmake.in
@@ -137,7 +137,7 @@ elseif(MSVC)
         set(OpenCV_RUNTIME vc14) # selecting previous compatible runtime version
       endif()
     endif()
-  elseif(MSVC_VERSION MATCHES "^193[0-9]$")
+  elseif(MSVC_VERSION MATCHES "^19[34][0-9]$")
     set(OpenCV_RUNTIME vc17)
     check_one_config(has_VS2022)
     if(NOT has_VS2022)
diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in
index d6c7875411f6..ed53f3bf44d8 100644
--- a/cmake/templates/cvconfig.h.in
+++ b/cmake/templates/cvconfig.h.in
@@ -78,9 +78,6 @@
 /* IJG JPEG codec */
 #cmakedefine HAVE_JPEG
 
-/* libpng/png.h needs to be included */
-#cmakedefine HAVE_LIBPNG_PNG_H
-
 /* GDCM DICOM codec */
 #cmakedefine HAVE_GDCM
 
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 64a065217c0e..411b77808ace 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -16,6 +16,10 @@ endif()
 
 find_package(Doxygen)
 if(DOXYGEN_FOUND)
+  if (DOXYGEN_VERSION VERSION_LESS 1.9.8)
+    message(WARNING "Found doxygen ${DOXYGEN_VERSION}, version 1.9.8 is used for testing, there is "
+                    "a chance your documentation will look different or have some limitations.")
+  endif()
   add_custom_target(doxygen)
 
   # not documented modules list
@@ -191,10 +195,17 @@ if(DOXYGEN_FOUND)
   list(APPEND CMAKE_DOXYGEN_HTML_FILES "${CMAKE_CURRENT_SOURCE_DIR}/tutorial-utils.js")
   string(REPLACE ";" " \\\n" CMAKE_DOXYGEN_HTML_FILES "${CMAKE_DOXYGEN_HTML_FILES}")
 
-  set(OPENCV_DOCS_DOT_PATH "" CACHE PATH "Doxygen/DOT_PATH value")
+  if (DOXYGEN_DOT_EXECUTABLE)
+    message(STATUS "Found DOT executable: ${DOXYGEN_DOT_EXECUTABLE}")
+    set(init_dot_path "${DOXYGEN_DOT_EXECUTABLE}")
+    set(init_dot_mode "YES")
+  else()
+    set(init_dot_path "")
+    set(init_dot_mode "NO")
+  endif()
+  set(OPENCV_DOCS_DOT_PATH "${init_dot_path}" CACHE PATH "Doxygen/DOT_PATH value")
+  set(OPENCV_DOCS_HAVE_DOT "${init_dot_mode}" CACHE BOOL "Doxygen: build extra diagrams")
   set(CMAKECONFIG_DOT_PATH "${OPENCV_DOCS_DOT_PATH}")
-
-  set(OPENCV_DOCS_HAVE_DOT "NO" CACHE BOOL "Doxygen: build extra diagrams")
   set(CMAKECONFIG_HAVE_DOT "${OPENCV_DOCS_HAVE_DOT}")
 
   # 'png' is good enough for compatibility (but requires +50% storage space)
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index ce207d3e318f..d757c6f92e5a 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -3,8 +3,11 @@ PROJECT_NAME           = OpenCV
 PROJECT_NUMBER         = @OPENCV_VERSION@
 PROJECT_BRIEF          = "Open Source Computer Vision"
 PROJECT_LOGO           = @CMAKE_CURRENT_SOURCE_DIR@/opencv-logo-small.png
+#PROJECT_ICON           =
 OUTPUT_DIRECTORY       = @CMAKE_DOXYGEN_OUTPUT_PATH@
 CREATE_SUBDIRS         = YES
+CREATE_SUBDIRS_LEVEL   = 8
+ALLOW_UNICODE_NAMES    = NO
 OUTPUT_LANGUAGE        = English
 BRIEF_MEMBER_DESC      = YES
 REPEAT_BRIEF           = YES
@@ -26,8 +29,10 @@ STRIP_FROM_PATH        = @CMAKE_SOURCE_DIR@/modules @CMAKE_DOXYGEN_INCLUDE_ROOTS
 STRIP_FROM_INC_PATH    = @CMAKE_DOXYGEN_INCLUDE_ROOTS@
 SHORT_NAMES            = NO
 JAVADOC_AUTOBRIEF      = NO
+JAVADOC_BANNER         = NO
 QT_AUTOBRIEF           = NO
 MULTILINE_CPP_IS_BRIEF = NO
+PYTHON_DOCSTRING       = YES
 INHERIT_DOCS           = YES
 SEPARATE_MEMBER_PAGES  = NO
 TAB_SIZE               = 4
@@ -43,26 +48,34 @@ OPTIMIZE_OUTPUT_FOR_C  = NO
 OPTIMIZE_OUTPUT_JAVA   = NO
 OPTIMIZE_FOR_FORTRAN   = NO
 OPTIMIZE_OUTPUT_VHDL   = NO
+OPTIMIZE_OUTPUT_SLICE  = NO
 EXTENSION_MAPPING      =
 MARKDOWN_SUPPORT       = YES
+TOC_INCLUDE_HEADINGS   = 5
+MARKDOWN_ID_STYLE      = DOXYGEN
 AUTOLINK_SUPPORT       = YES
 BUILTIN_STL_SUPPORT    = YES
 CPP_CLI_SUPPORT        = NO
 SIP_SUPPORT            = NO
 IDL_PROPERTY_SUPPORT   = YES
 DISTRIBUTE_GROUP_DOC   = NO
+GROUP_NESTED_COMPOUNDS = NO
 SUBGROUPING            = YES
 INLINE_GROUPED_CLASSES = NO
 INLINE_SIMPLE_STRUCTS  = NO
 TYPEDEF_HIDES_STRUCT   = YES
 LOOKUP_CACHE_SIZE      = 0
+NUM_PROC_THREADS       = 1
+TIMESTAMP              = YES
 EXTRACT_ALL            = YES
 EXTRACT_PRIVATE        = NO
+EXTRACT_PRIV_VIRTUAL   = NO
 EXTRACT_PACKAGE        = NO
 EXTRACT_STATIC         = YES
 EXTRACT_LOCAL_CLASSES  = NO
 EXTRACT_LOCAL_METHODS  = NO
 EXTRACT_ANON_NSPACES   = NO
+RESOLVE_UNNAMED_PARAMS = YES
 HIDE_UNDOC_MEMBERS     = NO
 HIDE_UNDOC_CLASSES     = NO
 HIDE_FRIEND_COMPOUNDS  = NO
@@ -70,6 +83,8 @@ HIDE_IN_BODY_DOCS      = NO
 INTERNAL_DOCS          = NO
 CASE_SENSE_NAMES       = YES
 HIDE_SCOPE_NAMES       = NO
+HIDE_COMPOUND_REFERENCE= NO
+SHOW_HEADERFILE        = YES
 SHOW_INCLUDE_FILES     = YES
 SHOW_GROUPED_MEMB_INC  = YES
 FORCE_LOCAL_INCLUDES   = NO
@@ -96,11 +111,16 @@ QUIET                  = YES
 WARNINGS               = YES
 WARN_IF_UNDOCUMENTED   = YES
 WARN_IF_DOC_ERROR      = YES
+WARN_IF_INCOMPLETE_DOC = YES
 WARN_NO_PARAMDOC       = NO
+WARN_IF_UNDOC_ENUM_VAL = NO
+WARN_AS_ERROR          = NO
 WARN_FORMAT            = "$file:$line: $text"
+WARN_LINE_FORMAT       = "at line $line of file $file"
 WARN_LOGFILE           =
 INPUT                  = @CMAKE_DOXYGEN_INPUT_LIST@
 INPUT_ENCODING         = UTF-8
+INPUT_FILE_ENCODING    =
 FILE_PATTERNS          =
 RECURSIVE              = YES
 EXCLUDE                = @CMAKE_DOXYGEN_EXCLUDE_LIST@
@@ -125,8 +145,11 @@ REFERENCES_LINK_SOURCE = YES
 SOURCE_TOOLTIPS        = YES
 USE_HTAGS              = NO
 VERBATIM_HEADERS       = NO
+CLANG_ASSISTED_PARSING = NO
+CLANG_ADD_INC_PATHS    = YES
+CLANG_OPTIONS          =
+CLANG_DATABASE_PATH    =
 ALPHABETICAL_INDEX     = YES
-COLS_IN_ALPHA_INDEX    = 5
 IGNORE_PREFIX          =
 GENERATE_HTML          = YES
 HTML_OUTPUT            = html
@@ -136,14 +159,19 @@ HTML_FOOTER            = @CMAKE_CURRENT_SOURCE_DIR@/footer.html
 HTML_STYLESHEET        =
 HTML_EXTRA_STYLESHEET  = @CMAKE_CURRENT_SOURCE_DIR@/stylesheet.css
 HTML_EXTRA_FILES       = @CMAKE_DOXYGEN_HTML_FILES@
+HTML_COLORSTYLE        = LIGHT
 HTML_COLORSTYLE_HUE    = 220
 HTML_COLORSTYLE_SAT    = 100
 HTML_COLORSTYLE_GAMMA  = 80
-HTML_TIMESTAMP         = YES
+HTML_DYNAMIC_MENUS     = YES
 HTML_DYNAMIC_SECTIONS  = NO
+HTML_CODE_FOLDING      = YES
+#HTML_COPY_CLIPBOARD    = YES
+#HTML_PROJECT_COOKIE    =
 HTML_INDEX_NUM_ENTRIES = 100
 GENERATE_DOCSET        = NO
 DOCSET_FEEDNAME        = "Doxygen generated docs"
+DOCSET_FEEDURL         =
 DOCSET_BUNDLE_ID       = org.doxygen.Project
 DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
 DOCSET_PUBLISHER_NAME  = Publisher
@@ -154,6 +182,7 @@ GENERATE_CHI           = NO
 CHM_INDEX_ENCODING     =
 BINARY_TOC             = NO
 TOC_EXPAND             = NO
+SITEMAP_URL            =
 GENERATE_QHP           = @CMAKE_DOXYGEN_GENERATE_QHP@
 QCH_FILE               = ../opencv-@OPENCV_VERSION@.qch
 QHP_NAMESPACE          = org.opencv.@OPENCV_VERSION@
@@ -166,15 +195,19 @@ GENERATE_ECLIPSEHELP   = NO
 ECLIPSE_DOC_ID         = org.doxygen.Project
 DISABLE_INDEX          = NO
 GENERATE_TREEVIEW      = NO
+FULL_SIDEBAR           = NO
 ENUM_VALUES_PER_LINE   = 1
 TREEVIEW_WIDTH         = 250
 EXT_LINKS_IN_WINDOW    = YES
+OBFUSCATE_EMAILS       = YES
+HTML_FORMULA_FORMAT    = svg
 FORMULA_FONTSIZE       = 14
-FORMULA_TRANSPARENT    = YES
+FORMULA_MACROFILE      =
 USE_MATHJAX            = YES
-MATHJAX_FORMAT         = HTML-CSS
+MATHJAX_VERSION        = MathJax_3
+MATHJAX_FORMAT         = chtml
 MATHJAX_RELPATH        = @OPENCV_MATHJAX_RELPATH@
-MATHJAX_EXTENSIONS     = TeX/AMSmath TeX/AMSsymbols
+MATHJAX_EXTENSIONS     = ams
 MATHJAX_CODEFILE       = @CMAKE_CURRENT_SOURCE_DIR@/mymath.js
 SEARCHENGINE           = YES
 SERVER_BASED_SEARCH    = NO
@@ -187,18 +220,20 @@ GENERATE_LATEX         = NO
 LATEX_OUTPUT           = latex
 LATEX_CMD_NAME         = latex
 MAKEINDEX_CMD_NAME     = makeindex
+LATEX_MAKEINDEX_CMD    = makeindex
 COMPACT_LATEX          = NO
 PAPER_TYPE             = a4
 EXTRA_PACKAGES         = mymath
 LATEX_HEADER           =
 LATEX_FOOTER           =
+LATEX_EXTRA_STYLESHEET =
 LATEX_EXTRA_FILES      =
 PDF_HYPERLINKS         = YES
 USE_PDFLATEX           = YES
 LATEX_BATCHMODE        = NO
 LATEX_HIDE_INDICES     = NO
-LATEX_SOURCE_CODE      = NO
 LATEX_BIB_STYLE        = plain
+LATEX_EMOJI_DIRECTORY  =
 GENERATE_RTF           = NO
 RTF_OUTPUT             = rtf
 COMPACT_RTF            = NO
@@ -208,13 +243,18 @@ RTF_EXTENSIONS_FILE    =
 GENERATE_MAN           = NO
 MAN_OUTPUT             = man
 MAN_EXTENSION          = .3
+MAN_SUBDIR             =
 MAN_LINKS              = NO
 GENERATE_XML           = NO
 XML_OUTPUT             = xml
 XML_PROGRAMLISTING     = YES
+XML_NS_MEMB_FILE_SCOPE = NO
 GENERATE_DOCBOOK       = NO
 DOCBOOK_OUTPUT         = docbook
 GENERATE_AUTOGEN_DEF   = NO
+GENERATE_SQLITE3       = NO
+SQLITE3_OUTPUT         = sqlite3
+SQLITE3_RECREATE_DB    = YES
 GENERATE_PERLMOD       = NO
 PERLMOD_LATEX          = NO
 PERLMOD_PRETTY         = YES
@@ -272,19 +312,20 @@ GENERATE_TAGFILE       = @CMAKE_DOXYGEN_OUTPUT_PATH@/html/opencv.tag
 ALLEXTERNALS           = NO
 EXTERNAL_GROUPS        = YES
 EXTERNAL_PAGES         = YES
-CLASS_DIAGRAMS         = YES
-DIA_PATH               =
 HIDE_UNDOC_RELATIONS   = NO
 HAVE_DOT               = @CMAKECONFIG_HAVE_DOT@
 DOT_NUM_THREADS        = 0
-DOT_FONTNAME           = Helvetica
-DOT_FONTSIZE           = 10
 DOT_FONTPATH           =
-CLASS_GRAPH            = YES
+DOT_COMMON_ATTR        = "fontname=Helvetica,fontsize=10"
+DOT_EDGE_ATTR          = "labelfontname=Helvetica,labelfontsize=10"
+DOT_NODE_ATTR          = "shape=box,height=0.2,width=0.4"
+CLASS_GRAPH            = NO
 COLLABORATION_GRAPH    = YES
-GROUP_GRAPHS           = YES
+GROUP_GRAPHS           = NO
 UML_LOOK               = YES
 UML_LIMIT_NUM_FIELDS   = 10
+DOT_UML_DETAILS        = NO
+DOT_WRAP_THRESHOLD     = 17
 TEMPLATE_RELATIONS     = YES
 INCLUDE_GRAPH          = YES
 INCLUDED_BY_GRAPH      = YES
@@ -292,15 +333,19 @@ CALL_GRAPH             = YES
 CALLER_GRAPH           = NO
 GRAPHICAL_HIERARCHY    = YES
 DIRECTORY_GRAPH        = YES
+DIR_GRAPH_MAX_DEPTH    = 1
 DOT_IMAGE_FORMAT       = @CMAKECONFIG_DOT_IMAGE_FORMAT@
 INTERACTIVE_SVG        = @CMAKECONFIG_INTERACTIVE_SVG@
 DOT_PATH               = @CMAKECONFIG_DOT_PATH@
 DOTFILE_DIRS           =
-MSCFILE_DIRS           =
 DIAFILE_DIRS           =
-DOT_GRAPH_MAX_NODES    = 50
+PLANTUML_JAR_PATH      =
+PLANTUML_CFG_FILE      =
+PLANTUML_INCLUDE_PATH  =
+DOT_GRAPH_MAX_NODES    = 250
 MAX_DOT_GRAPH_DEPTH    = 0
-DOT_TRANSPARENT        = NO
 DOT_MULTI_TARGETS      = NO
 GENERATE_LEGEND        = YES
 DOT_CLEANUP            = YES
+MSCGEN_TOOL            =
+MSCFILE_DIRS           =
diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml
index 1385327f622c..c8f98cda32db 100644
--- a/doc/DoxygenLayout.xml
+++ b/doc/DoxygenLayout.xml
@@ -139,6 +139,9 @@
     <groupgraph visible="$GROUP_GRAPHS"/>
     <memberdecl>
       <nestedgroups visible="yes" title=""/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdecl>
       <dirs visible="yes" title=""/>
       <files visible="yes" title=""/>
       <namespaces visible="yes" title=""/>
@@ -158,7 +161,6 @@
       <friends title=""/>
       <membergroups visible="yes"/>
     </memberdecl>
-    <detaileddescription title=""/>
     <memberdef>
       <pagedocs/>
       <inlineclasses title=""/>
diff --git a/doc/LICENSE_CHANGE_NOTICE.txt b/doc/LICENSE_CHANGE_NOTICE.txt
index 0df3eca67141..c2711d4671d7 100644
--- a/doc/LICENSE_CHANGE_NOTICE.txt
+++ b/doc/LICENSE_CHANGE_NOTICE.txt
@@ -1,4 +1,4 @@
-Starting from OpenCV 4.5-pre (2020 August) OpenCV has changed the license from BSD to Apache 2. See https://opencv.org/opencv-is-to-change-the-license-to-apache-2/ and https://github.com/opencv/opencv/wiki/OE-32.--Change-OpenCV-License-to-Apache-2 for details.
+Starting from OpenCV 4.5-pre (2020 August) OpenCV has changed the license from BSD to Apache 2. See https://opencv.org/blog/opencv-is-to-change-the-license-to-apache-2/ and https://github.com/opencv/opencv/wiki/OE-32.--Change-OpenCV-License-to-Apache-2 for details.
 
 Here is the original OpenCV license:
 ------------------------------------------------------------------------------------
diff --git a/doc/js_tutorials/js_gui/js_image_display/js_image_display.markdown b/doc/js_tutorials/js_gui/js_image_display/js_image_display.markdown
index 9ad4ce2e5348..fb0ae42eb3b8 100644
--- a/doc/js_tutorials/js_gui/js_image_display/js_image_display.markdown
+++ b/doc/js_tutorials/js_gui/js_image_display/js_image_display.markdown
@@ -45,7 +45,7 @@ cv.cvtColor(dst, dst, cv.COLOR_***2RGBA);
 
 Then, new an ImageData obj from dst:
 @code{.js}
-let imgData = new ImageData(new Uint8ClampedArray(dst.data, dst.cols, dst.rows);
+let imgData = new ImageData(new Uint8ClampedArray(dst.data), dst.cols, dst.rows);
 @endcode
 
 Finally, display it:
diff --git a/doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown b/doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown
index a9d8ff5cadcb..b0d1a1d169c2 100644
--- a/doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown
+++ b/doc/js_tutorials/js_setup/js_nodejs/js_nodejs.markdown
@@ -336,7 +336,7 @@ function installDOM(){
 -   Make sure the files `aarcascade_frontalface_default.xml` and `haarcascade_eye.xml` are present in project's directory. They can be obtained from [OpenCV sources](https://github.com/opencv/opencv/tree/4.x/data/haarcascades).
 -   Make sure a sample image file `lena.jpg` exists in project's directory. It should display people's faces for this example to make sense. The following image is known to work:
 
-![image](lena.jpg)
+![image](js_assets/lena.jpg)
 
 The following command should generate the file `output3.jpg`:
 
diff --git a/doc/js_tutorials/js_setup/js_setup/js_setup.markdown b/doc/js_tutorials/js_setup/js_setup/js_setup.markdown
index 5b0e65b25023..87a32a78cba2 100644
--- a/doc/js_tutorials/js_setup/js_setup/js_setup.markdown
+++ b/doc/js_tutorials/js_setup/js_setup/js_setup.markdown
@@ -54,7 +54,7 @@ repository](https://github.com/opencv/opencv.git).
 
 ### Obtaining the Latest Stable OpenCV Version
 
--   Go to our [releases page](http://opencv.org/releases.html).
+-   Go to our [releases page](https://opencv.org/releases).
 -   Download the source archive and unpack it.
 
 ### Obtaining the Cutting-edge OpenCV from the Git Repository
diff --git a/doc/mymath.js b/doc/mymath.js
index ffa2b11d3dbc..028fa41cd278 100644
--- a/doc/mymath.js
+++ b/doc/mymath.js
@@ -1,22 +1,23 @@
 //<![CDATA[
-MathJax.Hub.Config(
-{
-  TeX: {
-      Macros: {
-          matTT: [ "\\[ \\left|\\begin{array}{ccc} #1 & #2 & #3\\\\ #4 & #5 & #6\\\\ #7 & #8 & #9 \\end{array}\\right| \\]", 9],
-          fork: ["\\left\\{ \\begin{array}{l l} #1 & \\mbox{#2}\\\\ #3 & \\mbox{#4}\\\\ \\end{array} \\right.", 4],
-          forkthree: ["\\left\\{ \\begin{array}{l l} #1 & \\mbox{#2}\\\\ #3 & \\mbox{#4}\\\\ #5 & \\mbox{#6}\\\\ \\end{array} \\right.", 6],
-          forkfour: ["\\left\\{ \\begin{array}{l l} #1 & \\mbox{#2}\\\\ #3 & \\mbox{#4}\\\\ #5 & \\mbox{#6}\\\\ #7 & \\mbox{#8}\\\\ \\end{array} \\right.", 8],
-          vecthree: ["\\begin{bmatrix} #1\\\\ #2\\\\ #3 \\end{bmatrix}", 3],
-          vecthreethree: ["\\begin{bmatrix} #1 & #2 & #3\\\\ #4 & #5 & #6\\\\ #7 & #8 & #9 \\end{bmatrix}", 9],
-          cameramatrix: ["#1 = \\begin{bmatrix} f_x & 0 & c_x\\\\ 0 & f_y & c_y\\\\ 0 & 0 & 1 \\end{bmatrix}", 1],
-          distcoeffs: ["(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \\tau_x, \\tau_y]]]]) \\text{ of 4, 5, 8, 12 or 14 elements}"],
-          distcoeffsfisheye: ["(k_1, k_2, k_3, k_4)"],
-          hdotsfor: ["\\dots", 1],
-          mathbbm: ["\\mathbb{#1}", 1],
-          bordermatrix: ["\\matrix{#1}", 1]
-      }
-  }
-}
-);
+window.MathJax = {
+    loader: {load: ['[tex]/ams']},
+    tex: {
+        packages: {'[+]': ['ams']},
+        macros: {
+            matTT: [ "\\[ \\left|\\begin{array}{ccc} #1 & #2 & #3\\\\ #4 & #5 & #6\\\\ #7 & #8 & #9 \\end{array}\\right| \\]", 9],
+            fork: ["\\left\\{ \\begin{array}{l l} #1 & \\mbox{#2}\\\\ #3 & \\mbox{#4}\\\\ \\end{array} \\right.", 4],
+            forkthree: ["\\left\\{ \\begin{array}{l l} #1 & \\mbox{#2}\\\\ #3 & \\mbox{#4}\\\\ #5 & \\mbox{#6}\\\\ \\end{array} \\right.", 6],
+            forkfour: ["\\left\\{ \\begin{array}{l l} #1 & \\mbox{#2}\\\\ #3 & \\mbox{#4}\\\\ #5 & \\mbox{#6}\\\\ #7 & \\mbox{#8}\\\\ \\end{array} \\right.", 8],
+            vecthree: ["\\begin{bmatrix} #1\\\\ #2\\\\ #3 \\end{bmatrix}", 3],
+            vecthreethree: ["\\begin{bmatrix} #1 & #2 & #3\\\\ #4 & #5 & #6\\\\ #7 & #8 & #9 \\end{bmatrix}", 9],
+            cameramatrix: ["#1 = \\begin{bmatrix} f_x & 0 & c_x\\\\ 0 & f_y & c_y\\\\ 0 & 0 & 1 \\end{bmatrix}", 1],
+            distcoeffs: ["(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6 [, s_1, s_2, s_3, s_4[, \\tau_x, \\tau_y]]]]) \\text{ of 4, 5, 8, 12 or 14 elements}"],
+            distcoeffsfisheye: ["(k_1, k_2, k_3, k_4)"],
+            hdotsfor: ["\\dots", 1],
+            mathbbm: ["\\mathbb{#1}", 1],
+            bordermatrix: ["\\matrix{#1}", 1]
+        },
+        processEscapes: false
+    }
+};
 //]]>
diff --git a/doc/opencv.bib b/doc/opencv.bib
index 64aa363202e2..6632271e4a91 100644
--- a/doc/opencv.bib
+++ b/doc/opencv.bib
@@ -1377,3 +1377,93 @@ @article{Buades2005DenoisingIS
   year={2005},
   pages={70-74}
 }
+@inproceedings{wang2016iros,
+    AUTHOR     = {John Wang and Edwin Olson},
+    TITLE      = {{AprilTag} 2: Efficient and robust fiducial detection},
+    BOOKTITLE  = {Proceedings of the {IEEE/RSJ} International Conference on Intelligent
+                 Robots and Systems {(IROS)}},
+    YEAR       = {2016},
+    MONTH      = {October},
+}
+@inproceedings{BarathGCRANSAC,
+  author = {Barath, Daniel and Matas, Jiri},
+  title = {Graph-Cut RANSAC},
+  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  month = {June},
+  year = {2018}
+}
+@misc{barath2019progressive,
+  title={Progressive NAPSAC: sampling from gradually growing neighborhoods},
+  author={Barath, Daniel and Ivashechkin, Maksym and Matas, Jiri},
+  year={2019},
+  eprint={1906.02295},
+  archivePrefix={arXiv},
+  primaryClass={cs.CV}
+}
+@inproceedings{BarathMAGSAC,
+  author = {Barath, Daniel and Noskova, Jana and Ivashechkin, Maksym and Matas, Jiri},
+  title = {MAGSAC++, a Fast, Reliable and Accurate Robust Estimator},
+  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+  month = {June},
+  year = {2020}
+}
+@inproceedings{ChumPROSAC,
+  title = {Matching with {PROSAC} - Progressive Sampling Consensus},
+  author = {Chum, Ondrej and  Matas, Jiri},
+  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year = {2005}
+}
+@inproceedings{ChumLORANSAC,
+  title = {Locally Optimized {RANSAC}},
+  author = {Chum, Ondrej and Matas, Jiri and Kittler, Josef},
+  booktitle = {DAGM},
+  year = {2003}
+}
+@inproceedings{ChumEpipolar,
+  author={Chum, Ondrej and Werner, Tomas and Matas, Jiri},
+  booktitle={Proceedings of the 17th International Conference on Pattern Recognition. ICPR 2004},
+  title={Epipolar geometry estimation via RANSAC benefits from the oriented epipolar constraint},
+  year={2004},
+  volume={1},
+  pages={112-115 Vol.1}
+}
+@inproceedings{ChumDominant,
+  title = {Epipolar Geometry Estimation Unaffected by the Dominant Plane},
+  author = {Chum, Ondrej and Werner, Tomas and  Matas, Jiri.},
+  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year = {2005}
+}
+@article{FischlerRANSAC,
+  author = {Fischler, Martin A. and Bolles, Robert C.},
+  title = {Random Sample Consensus: A Paradigm for Model Fitting with Applications to Image Analysis and Automated Cartography},
+  year = {1981},
+  publisher = {Association for Computing Machinery},
+  volume = {24},
+  number = {6},
+  month = {jun},
+  pages = {381–395},
+  numpages = {15}
+}
+@article{Matas2005RandomizedRW,
+  title={Randomized RANSAC with sequential probability ratio test},
+  author={Matas, Jiri and Chum, Ondrej},
+  journal={Tenth IEEE International Conference on Computer Vision (ICCV) Volume 1},
+  year={2005},
+  volume={2},
+  pages={1727-1732 Vol. 2}
+}
+@inproceedings{MyattNAPSAC,
+  author = {Myatt, D. and Torr, Philip and Nasuto, Slawomir and Bishop, John and Craddock, R.},
+  year = {2002},
+  booktitle = {Proceedings of the British Machine Vision Conference (BMVC)},
+  title = {NAPSAC: High Noise, High Dimensional Robust Estimation - it's in the Bag}
+}
+@article{SteweniusRecent,
+  author = {Stewenius, Henrik and Engels, Christopher and Nister, David},
+  year = {2006},
+  month = {06},
+  pages = {284-294},
+  title = {Recent developments on direct relative orientation},
+  volume = {60},
+  journal = {ISPRS Journal of Photogrammetry and Remote Sensing}
+}
diff --git a/doc/pattern_tools/gen_pattern.py b/doc/pattern_tools/gen_pattern.py
index bec535baf6bd..f426bb11c5b7 100755
--- a/doc/pattern_tools/gen_pattern.py
+++ b/doc/pattern_tools/gen_pattern.py
@@ -186,6 +186,8 @@ def make_charuco_board(self):
         yspacing = (self.height - self.rows * self.square_size) / 2.0
 
         ch_ar_border = (self.square_size - self.aruco_marker_size)/2
+        if ch_ar_border < side*0.7:
+            print("Marker border {} is less than 70% of ArUco pin size {}. Please increase --square_size or decrease --marker_size for stable board detection".format(ch_ar_border, int(side)))
         marker_id = 0
         for y in range(0, self.rows):
             for x in range(0, self.cols):
@@ -283,6 +285,9 @@ def main():
             else:
                 raise ValueError("The marker {},{} is outside the checkerboard".format(x, y))
 
+    if p_type == "charuco_board" and aruco_marker_size >= square_size:
+        raise ValueError("ArUco markers size must be smaller than square size")
+
     pm = PatternMaker(columns, rows, output, units, square_size, radius_rate, page_width, page_height, markers, aruco_marker_size, dict_file)
     # dict for easy lookup of pattern type
     mp = {"circles": pm.make_circles_pattern, "acircles": pm.make_acircles_pattern,
diff --git a/doc/pattern_tools/test_charuco_board.py b/doc/pattern_tools/test_charuco_board.py
index 83d355b7a6fc..0258e4034fec 100644
--- a/doc/pattern_tools/test_charuco_board.py
+++ b/doc/pattern_tools/test_charuco_board.py
@@ -34,7 +34,7 @@ def test_aruco_dicts(self):
                 aruco_dict = cv.aruco.getPredefinedDictionary(aruco_type[aruco_type_i])
                 board = cv.aruco.CharucoBoard((cols, rows), square_size, marker_size, aruco_dict)
                 charuco_detector = cv.aruco.CharucoDetector(board)
-                from_cv_img = board.generateImage((cols*square_size*10, rows*square_size*10))
+                from_cv_img = board.generateImage((cols*square_size, rows*square_size))
 
                 #draw desk using svg
                 fd1, filesvg = tempfile.mkstemp(prefix="out", suffix=".svg")
@@ -50,15 +50,20 @@ def test_aruco_dicts(self):
                     pm.make_charuco_board()
                     pm.save()
                     drawing = svg2rlg(filesvg)
-                    renderPM.drawToFile(drawing, filepng, fmt='PNG', dpi=720)
+                    renderPM.drawToFile(drawing, filepng, fmt='PNG', dpi=72)
                     from_svg_img = cv.imread(filepng)
+                    _charucoCorners, _charuco_ids_svg, marker_corners_svg, marker_ids_svg = charuco_detector.detectBoard(from_svg_img)
+                    _charucoCorners, _charuco_ids_cv, marker_corners_cv, marker_ids_cv = charuco_detector.detectBoard(from_cv_img)
+                    marker_corners_svg_map, marker_corners_cv_map = {}, {}
+                    for i in range(len(marker_ids_svg)):
+                        marker_corners_svg_map[int(marker_ids_svg[i][0])] = marker_corners_svg[i]
+                    for i in range(len(marker_ids_cv)):
+                        marker_corners_cv_map[int(marker_ids_cv[i][0])] = marker_corners_cv[i]
 
-                    #test
-                    _charucoCorners, _charucoIds, markerCorners_svg, markerIds_svg = charuco_detector.detectBoard(from_svg_img)
-                    _charucoCorners, _charucoIds, markerCorners_cv, markerIds_cv = charuco_detector.detectBoard(from_cv_img)
-
-                    np.testing.assert_allclose(markerCorners_svg, markerCorners_cv, 0.1, 0.1)
-                    np.testing.assert_allclose(markerIds_svg, markerIds_cv, 0.1, 0.1)
+                    for key_svg in marker_corners_svg_map.keys():
+                        marker_svg = marker_corners_svg_map[key_svg]
+                        marker_cv = marker_corners_cv_map[key_svg]
+                        np.testing.assert_allclose(marker_svg, marker_cv, 0.1, 0.1)
                 finally:
                     if os.path.exists(filesvg):
                         os.remove(filesvg)
@@ -87,7 +92,7 @@ def test_aruco_marker_sizes(self):
                 aruco_dict = cv.aruco.getPredefinedDictionary(aruco_type)
                 board = cv.aruco.CharucoBoard((cols, rows), square_size, marker_size, aruco_dict)
                 charuco_detector = cv.aruco.CharucoDetector(board)
-                from_cv_img = board.generateImage((cols*square_size*10, rows*square_size*10))
+                from_cv_img = board.generateImage((cols*square_size, rows*square_size))
 
                 #draw desk using svg
                 fd1, filesvg = tempfile.mkstemp(prefix="out", suffix=".svg")
@@ -102,17 +107,24 @@ def test_aruco_marker_sizes(self):
                     pm.make_charuco_board()
                     pm.save()
                     drawing = svg2rlg(filesvg)
-                    renderPM.drawToFile(drawing, filepng, fmt='PNG', dpi=720)
+                    renderPM.drawToFile(drawing, filepng, fmt='PNG', dpi=72)
                     from_svg_img = cv.imread(filepng)
 
                     #test
-                    _charucoCorners, _charucoIds, markerCorners_svg, markerIds_svg = charuco_detector.detectBoard(from_svg_img)
-                    _charucoCorners, _charucoIds, markerCorners_cv, markerIds_cv = charuco_detector.detectBoard(from_cv_img)
+                    _charucoCorners, _charuco_ids_svg, marker_corners_svg, marker_ids_svg = charuco_detector.detectBoard(from_svg_img)
+                    _charucoCorners, _charuco_ids_cv, marker_corners_cv, marker_ids_cv = charuco_detector.detectBoard(from_cv_img)
+                    marker_corners_svg_map, marker_corners_cv_map = {}, {}
+                    for i in range(len(marker_ids_svg)):
+                        marker_corners_svg_map[int(marker_ids_svg[i][0])] = marker_corners_svg[i]
+                    for i in range(len(marker_ids_cv)):
+                        marker_corners_cv_map[int(marker_ids_cv[i][0])] = marker_corners_cv[i]
 
-                    np.testing.assert_allclose(markerCorners_svg, markerCorners_cv, 0.1, 0.1)
-                    np.testing.assert_allclose(markerIds_svg, markerIds_cv, 0.1, 0.1)
+                    for key_svg in marker_corners_svg_map.keys():
+                        marker_svg = marker_corners_svg_map[key_svg]
+                        marker_cv = marker_corners_cv_map[key_svg]
+                        np.testing.assert_allclose(marker_svg, marker_cv, 0.1, 0.1)
                 finally:
                     if os.path.exists(filesvg):
                         os.remove(filesvg)
                     if os.path.exists(filepng):
-                        os.remove(filepng)
+                        os.remove(filepng)
\ No newline at end of file
diff --git a/doc/py_tutorials/py_bindings/py_bindings_basics/py_bindings_basics.markdown b/doc/py_tutorials/py_bindings/py_bindings_basics/py_bindings_basics.markdown
index 001952deca39..f7a29b48052d 100644
--- a/doc/py_tutorials/py_bindings/py_bindings_basics/py_bindings_basics.markdown
+++ b/doc/py_tutorials/py_bindings/py_bindings_basics/py_bindings_basics.markdown
@@ -79,9 +79,12 @@ Functions are extended using `CV_EXPORTS_W` macro. An example is shown below.
 @code{.cpp}
 CV_EXPORTS_W void equalizeHist( InputArray src, OutputArray dst );
 @endcode
-Header parser can understand the input and output arguments from keywords like
-InputArray, OutputArray etc. But sometimes, we may need to hardcode inputs and outputs. For that,
-macros like `CV_OUT`, `CV_IN_OUT` etc. are used.
+Header parser can understand the input and output arguments from keywords like InputArray,
+OutputArray etc. The arguments semantics are kept in Python: anything that is modified in C++
+will be modified in Python. And vice-versa read-only Python objects cannot be modified by OpenCV,
+if they are used as output. Such situation will cause Python exception. Sometimes, the parameters
+that are passed by reference in C++ may be used as input, output or both.
+Macros `CV_OUT`, `CV_IN_OUT` allow to solve ambiguity and generate correct bindings.
 @code{.cpp}
 CV_EXPORTS_W void minEnclosingCircle( InputArray points,
                                      CV_OUT Point2f& center, CV_OUT float& radius );
diff --git a/doc/py_tutorials/py_gui/py_video_display/py_video_display.markdown b/doc/py_tutorials/py_gui/py_video_display/py_video_display.markdown
index d60b84624549..5819653fa0f7 100644
--- a/doc/py_tutorials/py_gui/py_video_display/py_video_display.markdown
+++ b/doc/py_tutorials/py_gui/py_video_display/py_video_display.markdown
@@ -111,7 +111,7 @@ frames per second (fps) and frame size should be passed. And the last one is the
 `True`, the encoder expect color frame, otherwise it works with grayscale frame.
 
 [FourCC](http://en.wikipedia.org/wiki/FourCC) is a 4-byte code used to specify the video codec. The
-list of available codes can be found in [fourcc.org](http://www.fourcc.org/codecs.php). It is
+list of available codes can be found in [fourcc.org](https://fourcc.org/codecs.php). It is
 platform dependent. The following codecs work fine for me.
 
 -   In Fedora: DIVX, XVID, MJPG, X264, WMV1, WMV2. (XVID is more preferable. MJPG results in high
diff --git a/doc/py_tutorials/py_imgproc/py_transforms/py_fourier_transform/py_fourier_transform.markdown b/doc/py_tutorials/py_imgproc/py_transforms/py_fourier_transform/py_fourier_transform.markdown
index 59337b1355e8..df12efd45c86 100644
--- a/doc/py_tutorials/py_imgproc/py_transforms/py_fourier_transform/py_fourier_transform.markdown
+++ b/doc/py_tutorials/py_imgproc/py_transforms/py_fourier_transform/py_fourier_transform.markdown
@@ -80,7 +80,7 @@ using **np.ifft2()** function. The result, again, will be a complex number. You
 absolute value.
 @code{.py}
 rows, cols = img.shape
-crow,ccol = rows//2 , cols//2
+crow, ccol = rows//2, cols//2
 fshift[crow-30:crow+31, ccol-30:ccol+31] = 0
 f_ishift = np.fft.ifftshift(fshift)
 img_back = np.fft.ifft2(f_ishift)
@@ -146,7 +146,7 @@ content, and 0 at HF region.
 
 @code{.py}
 rows, cols = img.shape
-crow,ccol = rows/2 , cols/2
+crow, ccol = rows//2, cols//2
 
 # create a mask first, center square is 1, remaining all zeros
 mask = np.zeros((rows,cols,2),np.uint8)
diff --git a/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/py_knn_understanding.markdown b/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/py_knn_understanding.markdown
index 9f76e0f808de..5985cdd55969 100644
--- a/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/py_knn_understanding.markdown
+++ b/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/py_knn_understanding.markdown
@@ -141,7 +141,7 @@ Additional Resources
 --------------------
 
 1.  [NPTEL notes on Pattern Recognition, Chapter
-    11](https://nptel.ac.in/courses/106/108/106108057/)
+    11](https://nptel.ac.in/courses/106108057)
 2.  [Wikipedia article on Nearest neighbor search](https://en.wikipedia.org/wiki/Nearest_neighbor_search)
 3.  [Wikipedia article on k-d tree](https://en.wikipedia.org/wiki/K-d_tree)
 
diff --git a/doc/py_tutorials/py_ml/py_svm/py_svm_basics/py_svm_basics.markdown b/doc/py_tutorials/py_ml/py_svm/py_svm_basics/py_svm_basics.markdown
index c8dbe3992066..55f74237e9e3 100644
--- a/doc/py_tutorials/py_ml/py_svm/py_svm_basics/py_svm_basics.markdown
+++ b/doc/py_tutorials/py_ml/py_svm/py_svm_basics/py_svm_basics.markdown
@@ -129,7 +129,6 @@ Additional Resources
 --------------------
 
 -#  [NPTEL notes on Statistical Pattern Recognition, Chapters
-    25-29](http://www.nptel.ac.in/courses/106108057/26).
-
+    25-29](https://nptel.ac.in/courses/117108048)
 Exercises
 ---------
diff --git a/doc/py_tutorials/py_setup/py_setup_in_windows/py_setup_in_windows.markdown b/doc/py_tutorials/py_setup/py_setup_in_windows/py_setup_in_windows.markdown
index 1cefb01d5ce2..c30f80dd1884 100644
--- a/doc/py_tutorials/py_setup/py_setup_in_windows/py_setup_in_windows.markdown
+++ b/doc/py_tutorials/py_setup/py_setup_in_windows/py_setup_in_windows.markdown
@@ -79,62 +79,38 @@ Building OpenCV from source
 -#  Extract it to a folder, opencv and create a new folder build in it.
 -#  Open CMake-gui (*Start \> All Programs \> CMake-gui*)
 -#  Fill the fields as follows (see the image below):
-
     -#  Click on **Browse Source...** and locate the opencv folder.
-
     -#  Click on **Browse Build...** and locate the build folder we created.
-
     -#  Click on **Configure**.
-
         ![image](images/Capture1.jpg)
-
     -#  It will open a new window to select the compiler. Choose appropriate compiler (here,
         Visual Studio 11) and click **Finish**.
-
         ![image](images/Capture2.png)
-
     -#  Wait until analysis is finished.
-
 -#  You will see all the fields are marked in red. Click on the **WITH** field to expand it. It
     decides what extra features you need. So mark appropriate fields. See the below image:
-
     ![image](images/Capture3.png)
-
 -#  Now click on **BUILD** field to expand it. First few fields configure the build method. See the
     below image:
-
     ![image](images/Capture5.png)
-
 -#  Remaining fields specify what modules are to be built. Since GPU modules are not yet supported
     by OpenCV-Python, you can completely avoid it to save time (But if you work with them, keep it
     there). See the image below:
-
     ![image](images/Capture6.png)
-
 -#  Now click on **ENABLE** field to expand it. Make sure **ENABLE_SOLUTION_FOLDERS** is unchecked
     (Solution folders are not supported by Visual Studio Express edition). See the image below:
-
     ![image](images/Capture7.png)
-
 -#  Also make sure that in the **PYTHON** field, everything is filled. (Ignore
     PYTHON_DEBUG_LIBRARY). See image below:
-
     ![image](images/Capture80.png)
-
 -#  Finally click the **Generate** button.
-
 -#  Now go to our **opencv/build** folder. There you will find **OpenCV.sln** file. Open it with
     Visual Studio.
-
 -#  Check build mode as **Release** instead of **Debug**.
-
 -#  In the solution explorer, right-click on the **Solution** (or **ALL_BUILD**) and build it. It
     will take some time to finish.
-
 -#  Again, right-click on **INSTALL** and build it. Now OpenCV-Python will be installed.
-
     ![image](images/Capture8.png)
-
 -#  Open Python IDLE and enter 'import cv2 as cv'. If no error, it is installed correctly.
 
 @note We have installed with no other support like TBB, Eigen, Qt, Documentation etc. It would be
diff --git a/doc/tutorial-utils.js b/doc/tutorial-utils.js
index 646287258089..3f08f1d2c1ff 100644
--- a/doc/tutorial-utils.js
+++ b/doc/tutorial-utils.js
@@ -50,7 +50,7 @@ function addButton(label, buttonName) {
 
 function buttonsToAdd($elements, $heading, $type) {
     if ($elements.length === 0) {
-        $elements = $("" + $type + ":contains(" + $heading.html() + ")").parent().prev("div.newInnerHTML");
+        return;
     }
     var arr = jQuery.makeArray($elements);
     var seen = {};
@@ -72,18 +72,12 @@ function buttonsToAdd($elements, $heading, $type) {
 }
 
 function addTutorialsButtons() {
-    $("h2").each(function() {
-        $heading = $(this);
-        $smallerHeadings = $(this).nextUntil("h2").filter("h3").add($(this).nextUntil("h2").find("h3"));
-        if ($smallerHeadings.length) {
-            $smallerHeadings.each(function() {
-                var $elements = $(this).nextUntil("h2,h3").filter("div.newInnerHTML");
-                buttonsToAdd($elements, $(this), "h3");
-            });
-        } else {
-            var $elements = $(this).nextUntil("h2").filter("div.newInnerHTML");
-            buttonsToAdd($elements, $heading, "h2");
-        }
+    $("h1").each(function() {
+        var $elements = $(this).nextUntil("h1")
+        var $lower = $elements.find("div.newInnerHTML")
+        $elements = $elements.add($lower)
+        $elements = $elements.filter("div.newInnerHTML")
+        buttonsToAdd($elements, $(this), "h1")
     });
     $(".toggleable_button").first().click();
     var $clickDefault = $('.toggleable_button.label_python').first();
diff --git a/doc/tutorials/app/highgui_wayland_ubuntu.markdown b/doc/tutorials/app/highgui_wayland_ubuntu.markdown
new file mode 100644
index 000000000000..2b8020ad19aa
--- /dev/null
+++ b/doc/tutorials/app/highgui_wayland_ubuntu.markdown
@@ -0,0 +1,106 @@
+Using Wayland highgui-backend in Ubuntu {#tutorial_wayland_ubuntu}
+=======================================
+
+@tableofcontents
+
+@prev_tutorial{tutorial_intelperc}
+
+|    |    |
+| -: | :- |
+| Original author | Kumataro |
+| Compatibility | OpenCV >= 4.10 |
+| ^ | Ubuntu 24.04 |
+
+Goal
+-----
+This tutorial is to use Wayland highgui-backend in Ubuntu 24.04.
+
+Wayland highgui-backend is experimental implementation.
+
+Setup
+-----
+- Setup Ubuntu 24.04.
+- `sudo apt install build-essential git cmake` to build OpenCV.
+- `sudo apt install libwayland-dev wayland-protocols libxkbcommon-dev` to enable Wayland highgui-backend.
+- (Option) `sudo apt install ninja-build` (or remove `-GNinja` option for cmake command).
+- (Option) `sudo apt install libwayland-egl1` to enable Wayland EGL library.
+
+Get OpenCV from GitHub
+----------------------
+
+```bash
+mkdir work
+cd work
+git clone --depth=1 https://github.com/opencv/opencv.git
+```
+
+@note
+`--depth=1` option is to limit downloading commits. If you want to see more commit history, please remove this option.
+
+Build/Install OpenCV with Wayland highgui-backend
+-------------------------------------------------
+
+Run `cmake` with `-DWITH_WAYLAND=ON` option to configure OpenCV.
+
+```bash
+cmake -S opencv -B build4-main -DWITH_WAYLAND=ON -GNinja
+```
+
+If succeeded, Wayland Client/Cursor/Protocols and Xkbcommon versions are shown. Wayland EGL is option.
+
+```plaintext
+--
+--   GUI:                           Wayland
+--     Wayland:                     (Experimental) YES
+--       Wayland Client:            YES (ver 1.22.0)
+--       Wayland Cursor:            YES (ver 1.22.0)
+--       Wayland Protocols:         YES (ver 1.34)
+--       Xkbcommon:                 YES (ver 1.6.0)
+--       Wayland EGL(Option):       YES (ver 18.1.0)
+--     GTK+:                        NO
+--     VTK support:                 NO
+```
+
+Run `cmake --build` to build, and `sudo cmake --install` to install into your system.
+
+```bash
+cmake --build build4-main
+sudo cmake --install build4-main
+sudo ldconfig
+```
+
+Simple Application to try Wayland highgui-backend
+-------------------------------------------------
+Try this code, so you can see name of currentUIFrramework() and OpenCV logo window with Wayland highgui-backend.
+
+
+```bash
+// g++ main.cpp -o a.out -I /usr/local/include/opencv4 -lopencv_core -lopencv_highgui -lopencv_imgcodecs
+#include <opencv2/core.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <iostream>
+#include <string>
+
+int main(void)
+{
+  std::cout << "cv::currentUIFramework() returns " << cv::currentUIFramework() << std::endl;
+
+  cv::Mat src;
+  src = cv::imread("opencv-logo.png");
+
+  cv::namedWindow("src");
+
+  int key = 0;
+  do
+  {
+      cv::imshow("src", src );
+      key = cv::waitKey(50);
+  } while( key != 'q' );
+  return 0;
+}
+```
+
+Limitation/Known problem
+------------------------
+- cv::moveWindow() is not implementated. ( See. https://github.com/opencv/opencv/issues/25478 )
diff --git a/doc/tutorials/app/intelperc.markdown b/doc/tutorials/app/intelperc.markdown
index 132074faa318..574bfc9e6a58 100644
--- a/doc/tutorials/app/intelperc.markdown
+++ b/doc/tutorials/app/intelperc.markdown
@@ -4,6 +4,7 @@ Using Creative Senz3D and other Intel RealSense SDK compatible depth sensors {#t
 @tableofcontents
 
 @prev_tutorial{tutorial_orbbec_astra}
+@next_tutorial{tutorial_wayland_ubuntu}
 
 ![hardwares](images/realsense.jpg)
 
diff --git a/doc/tutorials/app/table_of_content_app.markdown b/doc/tutorials/app/table_of_content_app.markdown
index 8e05dfaf0774..6671f6b541fb 100644
--- a/doc/tutorials/app/table_of_content_app.markdown
+++ b/doc/tutorials/app/table_of_content_app.markdown
@@ -8,3 +8,4 @@ Application utils (highgui, imgcodecs, videoio modules) {#tutorial_table_of_cont
 -   @subpage tutorial_kinect_openni
 -   @subpage tutorial_orbbec_astra
 -   @subpage tutorial_intelperc
+-   @subpage tutorial_wayland_ubuntu
diff --git a/doc/tutorials/calib3d/interactive_calibration/interactive_calibration.markdown b/doc/tutorials/calib3d/interactive_calibration/interactive_calibration.markdown
index a50058ba847c..b02e6ecfd21d 100644
--- a/doc/tutorials/calib3d/interactive_calibration/interactive_calibration.markdown
+++ b/doc/tutorials/calib3d/interactive_calibration/interactive_calibration.markdown
@@ -4,6 +4,7 @@ Interactive camera calibration application {#tutorial_interactive_calibration}
 @tableofcontents
 
 @prev_tutorial{tutorial_real_time_pose}
+@next_tutorial{tutorial_usac}
 
 |    |    |
 | -: | :- |
diff --git a/doc/tutorials/calib3d/table_of_content_calib3d.markdown b/doc/tutorials/calib3d/table_of_content_calib3d.markdown
index 5fc6e591e926..4be1058dd37c 100644
--- a/doc/tutorials/calib3d/table_of_content_calib3d.markdown
+++ b/doc/tutorials/calib3d/table_of_content_calib3d.markdown
@@ -6,3 +6,4 @@ Camera calibration and 3D reconstruction (calib3d module) {#tutorial_table_of_co
 -   @subpage tutorial_camera_calibration
 -   @subpage tutorial_real_time_pose
 -   @subpage tutorial_interactive_calibration
+-   @subpage tutorial_usac
diff --git a/doc/tutorials/calib3d/usac.markdown b/doc/tutorials/calib3d/usac.markdown
index df9e25f907b9..7fb18aa87ccd 100644
--- a/doc/tutorials/calib3d/usac.markdown
+++ b/doc/tutorials/calib3d/usac.markdown
@@ -1,14 +1,19 @@
----
-author:
-- Maksym Ivashechkin
-bibliography: 'bibs.bib'
-csl: 'acm-sigchi-proceedings.csl'
-date: August 2020
-title: 'Google Summer of Code: Improvement of Random Sample Consensus in OpenCV'
-...
+USAC: Improvement of Random Sample Consensus in OpenCV {#tutorial_usac}
+==============================
+
+@tableofcontents
+
+@prev_tutorial{tutorial_interactive_calibration}
+
+|    |    |
+| -: | :- |
+| Original author | Maksym Ivashechkin |
+| Compatibility | OpenCV >= 4.0 |
+
+This work was integrated as part of the Google Summer of Code (August 2020).
 
 Contribution
-============
+------
 
 The integrated part to OpenCV `calib3d` module is RANSAC-based universal
 framework USAC (`namespace usac`) written in C++. The framework includes
@@ -20,25 +25,25 @@ components:
 
 1.  Sampling method:
 
-    1.  Uniform – standard RANSAC sampling proposed in \[8\] which draw
+    1.  Uniform – standard RANSAC sampling proposed in @cite FischlerRANSAC which draw
         minimal subset independently uniformly at random. *The default
         option in proposed framework*.
 
-    2.  PROSAC – method \[4\] that assumes input data points sorted by
+    2.  PROSAC – method @cite ChumPROSAC that assumes input data points sorted by
         quality so sampling can start from the most promising points.
         Correspondences for this method can be sorted e.g., by ratio of
         descriptor distances of the best to second match obtained from
         SIFT detector. *This is method is recommended to use because it
         can find good model and terminate much earlier*.
 
-    3.  NAPSAC – sampling method \[10\] which takes initial point
+    3.  NAPSAC – sampling method @cite MyattNAPSAC which takes initial point
         uniformly at random and the rest of points for minimal sample in
         the neighborhood of initial point. This is method can be
         potentially useful when models are localized. For example, for
         plane fitting. However, in practise struggles from degenerate
         issues and defining optimal neighborhood size.
 
-    4.  Progressive-NAPSAC – sampler \[2\] which is similar to NAPSAC,
+    4.  Progressive-NAPSAC – sampler @cite barath2019progressive which is similar to NAPSAC,
         although it starts from local and gradually converges to
         global sampling. This method can be quite useful if local models
         are expected but distribution of data can be arbitrary. The
@@ -56,7 +61,7 @@ components:
         default option in framework*. The model might not have as many
         inliers as using RANSAC score, however will be more accurate.
 
-    3.  MAGSAC – threshold-free method \[3\] to compute score. Using,
+    3.  MAGSAC – threshold-free method @cite BarathMAGSAC to compute score. Using,
         although, maximum sigma (standard deviation of noise) level to
         marginalize residual of point over sigma. Score of the point
         represents likelihood of point being inlier. *Recommended option
@@ -86,7 +91,7 @@ components:
 
 4.  Degeneracy:
 
-    1.  DEGENSAC – method \[7\] which for Fundamental matrix estimation
+    1.  DEGENSAC – method @cite ChumDominant which for Fundamental matrix estimation
         efficiently verifies and recovers model which has at least 5
         points in minimal sample lying on the dominant plane.
 
@@ -96,11 +101,11 @@ components:
         in minimal sample lie on the same side w.r.t. to any line
         crossing any two points in sample (does not assume reflection).
 
-    3.  Oriented epipolar constraint – method \[6\] for epipolar
+    3.  Oriented epipolar constraint – method @cite ChumEpipolar for epipolar
         geometry which verifies model (fundamental and essential matrix)
         to have points visible in the front of the camera.
 
-5.  SPRT verification – method \[9\] which verifies model by its
+5.  SPRT verification – method @cite Matas2005RandomizedRW which verifies model by its
     evaluation on randomly shuffled points using statistical properties
     given by probability of inlier, relative time for estimation,
     average number of output models etc. Significantly speeding up
@@ -109,17 +114,17 @@ components:
 
 6.  Local Optimization:
 
-    1.  Locally Optimized RANSAC – method \[5\] that iteratively
+    1.  Locally Optimized RANSAC – method @cite ChumLORANSAC that iteratively
         improves so-far-the-best model by non-minimal estimation. *The
         default option in framework. This procedure is the fastest and
         not worse than others local optimization methods.*
 
-    2.  Graph-Cut RANSAC – method \[1\] that refine so-far-the-best
+    2.  Graph-Cut RANSAC – method @cite BarathGCRANSAC that refine so-far-the-best
         model, however, it exploits spatial coherence of the
         data points. *This procedure is quite precise however
         computationally slower.*
 
-    3.  Sigma Consensus – method \[3\] which improves model by applying
+    3.  Sigma Consensus – method @cite BarathMAGSAC which improves model by applying
         non-minimal weighted estimation, where weights are computed with
         the same logic as in MAGSAC score. This method is better to use
         together with MAGSAC score.
@@ -152,7 +157,7 @@ components:
 
     4.  Essential matrix – 4 null vectors are found using
         Gaussian elimination. Then the solver based on Gröbner basis
-        described in \[11\] is used. Essential matrix can be computed
+        described in @cite SteweniusRecent is used. Essential matrix can be computed
         only if <span style="font-variant:small-caps;">LAPACK</span> or
         <span style="font-variant:small-caps;">Eigen</span> are
         installed as it requires eigen decomposition with complex
@@ -180,12 +185,12 @@ sequentially. However, using default options of framework parallel
 RANSAC is not deterministic since it depends on how often each thread is
 running. The easiest way to make it deterministic is using PROSAC
 sampler without SPRT and Local Optimization and not for Fundamental
-matrix, because they internally use random generators.\
-\
+matrix, because they internally use random generators.
+
 For NAPSAC, Progressive NAPSAC or Graph-Cut methods is required to build
 a neighborhood graph. In framework there are 3 options to do it:
 
-1.  `NEIGH_FLANN_KNN` – estimate neighborhood graph using OpenCV FLANN
+1.  NEIGH_FLANN_KNN – estimate neighborhood graph using OpenCV FLANN
     K nearest-neighbors. The default value for KNN is 7. KNN method may
     work good for sampling but not good for GC-RANSAC.
 
@@ -193,14 +198,14 @@ a neighborhood graph. In framework there are 3 options to do it:
     points which distance is less than 20 pixels.
 
 3.  `NEIGH_GRID` – for finding points’ neighborhood tiles points in
-    cells using hash-table. The method is described in \[2\]. Less
+    cells using hash-table. The method is described in @cite barath2019progressive. Less
     accurate than `NEIGH_FLANN_RADIUS`, although significantly faster.
 
 Note, `NEIGH_FLANN_RADIUS` and `NEIGH_FLANN_RADIUS` are not able to PnP
-solver, since there are 3D object points.\
-\
-New flags:
+solver, since there are 3D object points.
 
+New flags:
+------
 1.  `USAC_DEFAULT` – has standard LO-RANSAC.
 
 2.  `USAC_PARALLEL` – has LO-RANSAC and RANSACs run in parallel.
@@ -220,9 +225,10 @@ New flags:
 
 Every flag uses SPRT verification. And in the end the final
 so-far-the-best model is polished by non minimal estimation of all found
-inliers.\
-\
+inliers.
+
 A few other important parameters:
+------
 
 1.  `randomGeneratorState` – since every USAC solver is deterministic in
     OpenCV (i.e., for the same points and parameters returns the
@@ -240,6 +246,7 @@ A few other important parameters:
     estimation on low number of points is faster and more robust.
 
 Samples:
+------
 
 There are three new sample files in opencv/samples directory.
 
@@ -260,48 +267,3 @@ There are three new sample files in opencv/samples directory.
 3.  `essential_mat_reconstr.py` – the same functionality as in .cpp
     file, however instead of clustering points to plane the 3D map of
     object points is plot.
-
-References:
-
-1\. Daniel Barath and Jiří Matas. 2018. Graph-Cut RANSAC. In *Proceedings
-of the iEEE conference on computer vision and pattern recognition*,
-6733–6741.
-
-2\. Daniel Barath, Maksym Ivashechkin, and Jiri Matas. 2019. Progressive
-NAPSAC: Sampling from gradually growing neighborhoods. *arXiv preprint
-arXiv:1906.02295*.
-
-3\. Daniel Barath, Jana Noskova, Maksym Ivashechkin, and Jiri Matas.
-2020. MAGSAC++, a fast, reliable and accurate robust estimator. In
-*Proceedings of the iEEE/CVF conference on computer vision and pattern
-recognition (cVPR)*.
-
-4\. O. Chum and J. Matas. 2005. Matching with PROSAC-progressive sample
-consensus. In *Computer vision and pattern recognition*.
-
-5\. O. Chum, J. Matas, and J. Kittler. 2003. Locally optimized RANSAC. In
-*Joint pattern recognition symposium*.
-
-6\. O. Chum, T. Werner, and J. Matas. 2004. Epipolar geometry estimation
-via RANSAC benefits from the oriented epipolar constraint. In
-*International conference on pattern recognition*.
-
-7\. Ondrej Chum, Tomas Werner, and Jiri Matas. 2005. Two-view geometry
-estimation unaffected by a dominant plane. In *2005 iEEE computer
-society conference on computer vision and pattern recognition
-(cVPR’05)*, 772–779.
-
-8\. M. A. Fischler and R. C. Bolles. 1981. Random sample consensus: A
-paradigm for model fitting with applications to image analysis and
-automated cartography. *Communications of the ACM*.
-
-9\. Jiri Matas and Ondrej Chum. 2005. Randomized RANSAC with sequential
-probability ratio test. In *Tenth iEEE international conference on
-computer vision (iCCV’05) volume 1*, 1727–1732.
-
-10\. D. R. Myatt, P. H. S. Torr, S. J. Nasuto, J. M. Bishop, and R.
-Craddock. 2002. NAPSAC: High noise, high dimensional robust estimation.
-In *In bMVC02*, 458–467.
-
-11\. Henrik Stewénius, Christopher Engels, and David Nistér. 2006. Recent
-developments on direct relative orientation.
diff --git a/doc/tutorials/core/discrete_fourier_transform/discrete_fourier_transform.markdown b/doc/tutorials/core/discrete_fourier_transform/discrete_fourier_transform.markdown
index 9c11ec3b2efb..adb1c6fb0e95 100644
--- a/doc/tutorials/core/discrete_fourier_transform/discrete_fourier_transform.markdown
+++ b/doc/tutorials/core/discrete_fourier_transform/discrete_fourier_transform.markdown
@@ -87,7 +87,7 @@ Fourier Transform too needs to be of a discrete type resulting in a Discrete Fou
 (*DFT*). You'll want to use this whenever you need to determine the structure of an image from a
 geometrical point of view. Here are the steps to follow (in case of a gray scale input image *I*):
 
-#### Expand the image to an optimal size
+### Expand the image to an optimal size
 
 The performance of a DFT is dependent of the image
 size. It tends to be the fastest for image sizes that are multiple of the numbers two, three and
@@ -108,7 +108,7 @@ image (the appended pixels are initialized with zero):
 @snippet python/tutorial_code/core/discrete_fourier_transform/discrete_fourier_transform.py expand
 @end_toggle
 
-#### Make place for both the complex and the real values
+### Make place for both the complex and the real values
 
 The result of a Fourier Transform is
 complex. This implies that for each image value the result is two image values (one per
@@ -128,7 +128,7 @@ input image to this type and expand it with another channel to hold the complex
 @snippet python/tutorial_code/core/discrete_fourier_transform/discrete_fourier_transform.py complex_and_real
 @end_toggle
 
-#### Make the Discrete Fourier Transform
+### Make the Discrete Fourier Transform
 It's possible an in-place calculation (same input as
 output):
 
@@ -144,7 +144,7 @@ output):
 @snippet python/tutorial_code/core/discrete_fourier_transform/discrete_fourier_transform.py dft
 @end_toggle
 
-#### Transform the real and complex values to magnitude
+### Transform the real and complex values to magnitude
 A complex number has a real (*Re*) and a
 complex (imaginary - *Im*) part. The results of a DFT are complex numbers. The magnitude of a
 DFT is:
@@ -165,7 +165,7 @@ Translated to OpenCV code:
 @snippet python/tutorial_code/core/discrete_fourier_transform/discrete_fourier_transform.py magnitude
 @end_toggle
 
-#### Switch to a logarithmic scale
+### Switch to a logarithmic scale
 It turns out that the dynamic range of the Fourier
 coefficients is too large to be displayed on the screen. We have some small and some high
 changing values that we can't observe like this. Therefore the high values will all turn out as
@@ -188,7 +188,7 @@ Translated to OpenCV code:
 @snippet python/tutorial_code/core/discrete_fourier_transform/discrete_fourier_transform.py log
 @end_toggle
 
-#### Crop and rearrange
+### Crop and rearrange
 Remember, that at the first step, we expanded the image? Well, it's time
 to throw away the newly introduced values. For visualization purposes we may also rearrange the
 quadrants of the result, so that the origin (zero, zero) corresponds with the image center.
@@ -205,7 +205,7 @@ quadrants of the result, so that the origin (zero, zero) corresponds with the im
 @snippet python/tutorial_code/core/discrete_fourier_transform/discrete_fourier_transform.py crop_rearrange
 @end_toggle
 
-#### Normalize
+### Normalize
 This is done again for visualization purposes. We now have the magnitudes,
 however this are still out of our image display range of zero to one. We normalize our values to
 this range using the @ref cv::normalize() function.
diff --git a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown
index 4c68efecd0b4..ab24d27ab19f 100644
--- a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown
+++ b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown
@@ -9,6 +9,9 @@ How to use the OpenCV parallel_for_ to parallelize your code {#tutorial_how_to_u
 | -: | :- |
 | Compatibility | OpenCV >= 3.0 |
 
+
+@note See also C++ lambda usage with parallel for in [tuturial](@ref tutorial_how_to_use_OpenCV_parallel_for_new).
+
 Goal
 ----
 
@@ -20,7 +23,7 @@ If you want more information about multithreading, you will have to refer to a r
 to remain simple.
 
 Precondition
-----
+------------
 
 The first precondition is to have OpenCV built with a parallel framework.
 In OpenCV 3.2, the following parallel frameworks are available in that order:
@@ -50,7 +53,7 @@ We will use the example of drawing a Mandelbrot set to show how from a regular s
 the code to parallelize the computation.
 
 Theory
------------
+------
 
 The Mandelbrot set definition has been named in tribute to the mathematician Benoit Mandelbrot by the mathematician
 Adrien Douady. It has been famous outside of the mathematics field as the image representation is an example of a
@@ -69,7 +72,7 @@ Here, we will just introduce the formula to draw the Mandelbrot set (from the me
 > \f[\limsup_{n\to\infty}|z_{n+1}|\leqslant2\f]
 
 Pseudocode
------------
+----------
 
 A simple algorithm to generate a representation of the Mandelbrot set is called the
 ["escape time algorithm"](https://en.wikipedia.org/wiki/Mandelbrot_set#Escape_time_algorithm).
@@ -110,10 +113,10 @@ On this figure, we recall that the real part of a complex number is on the x-axi
 You can see that the whole shape can be repeatedly visible if we zoom at particular locations.
 
 Implementation
------------
+--------------
 
 Escape time algorithm implementation
---------------------------
+------------------------------------
 
 @snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-escape-time-algorithm
 
@@ -121,7 +124,7 @@ Here, we used the [`std::complex`](http://en.cppreference.com/w/cpp/numeric/comp
 complex number. This function performs the test to check if the pixel is in set or not and returns the "escaped" iteration.
 
 Sequential Mandelbrot implementation
---------------------------
+------------------------------------
 
 @snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-sequential
 
@@ -149,7 +152,7 @@ The green curve corresponds to a simple linear scale transformation, the blue on
 and you can observe how the lowest values will be boosted when looking at the slope at these positions.
 
 Parallel Mandelbrot implementation
---------------------------
+----------------------------------
 
 When looking at the sequential implementation, we can notice that each pixel is computed independently. To optimize the
 computation, we can perform multiple pixel calculations in parallel, by exploiting the multi-core architecture of modern
@@ -181,7 +184,7 @@ C++ 11 standard allows to simplify the parallel implementation by get rid of the
 @snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-parallel-call-cxx11
 
 Results
------------
+-------
 
 You can find the full tutorial code [here](https://github.com/opencv/opencv/blob/4.x/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp).
 The performance of the parallel implementation depends of the type of CPU you have. For instance, on 4 cores / 8 threads
diff --git a/doc/tutorials/core/mat-mask-operations/mat_mask_operations.markdown b/doc/tutorials/core/mat-mask-operations/mat_mask_operations.markdown
index 895815596120..e401e61012e6 100644
--- a/doc/tutorials/core/mat-mask-operations/mat_mask_operations.markdown
+++ b/doc/tutorials/core/mat-mask-operations/mat_mask_operations.markdown
@@ -73,7 +73,7 @@ Here's a function that will do this:
 @snippet samples/cpp/tutorial_code/core/mat_mask_operations/mat_mask_operations.cpp basic_method
 
 At first we make sure that the input images data is in unsigned char format. For this we use the
-@ref cv::CV_Assert function that throws an error when the expression inside it is false.
+@ref CV_Assert function (macro) that throws an error when the expression inside it is false.
 @snippet samples/cpp/tutorial_code/core/mat_mask_operations/mat_mask_operations.cpp 8_bit
 @end_toggle
 
diff --git a/doc/tutorials/core/univ_intrin/univ_intrin.markdown b/doc/tutorials/core/univ_intrin/univ_intrin.markdown
index 894a7440a06f..a80b6d4bd3de 100644
--- a/doc/tutorials/core/univ_intrin/univ_intrin.markdown
+++ b/doc/tutorials/core/univ_intrin/univ_intrin.markdown
@@ -245,7 +245,7 @@ In the following section, we will vectorize a simple convolution function for si
 
 You may learn more about convolution from the previous tutorial. We use the same naive implementation from the previous tutorial and compare it to the vectorized version.
 
-The full tutorial code is [here](https://github.com/opencv/opencv/tree/4.x/samples/cpp/tutorial_code/univ_intrin/univ_intrin.cpp).
+The full tutorial code is [here](https://github.com/opencv/opencv/tree/4.x/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp).
 
 ### Vectorizing Convolution
 
diff --git a/doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown b/doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown
index f7bdd9512d38..ae4deb01de80 100644
--- a/doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown
+++ b/doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown
@@ -22,7 +22,7 @@ In this tutorial, we first introduce how to obtain the custom OCR model, then ho
 
 After completing the model training, please use [transform_to_onnx.py](https://github.com/zihaomu/deep-text-recognition-benchmark/blob/master/transform_to_onnx.py) to convert the model into onnx format.
 
-#### Execute in webcam
+### Execute in webcam
 The Python version example code can be found at [here](https://github.com/opencv/opencv/blob/4.x/samples/dnn/text_detection.py).
 
 Example:
@@ -44,9 +44,10 @@ Their performance at different text recognition datasets is shown in the table b
 | CRNN_VGG-BiLSTM-CTC | 82.63    | 82.07 | 92.96     | 88.867     | 66.28     | 71.01  | 62.37    | 78.03       | 8.45              |
 | ResNet-CTC           | 84.00        | 84.08  | 92.39     | 88.96     | 67.74     | 74.73  | 67.60    | 79.93    | 44.28             |
 
-The performance of the text recognition model were tesred on OpenCV DNN, and does not include the text detection model.
+The performance of the text recognition model were tested on OpenCV DNN, and does not include the text detection model.
+
+### Model selection suggestion
 
-#### Model selection suggestion:
 The input of text recognition model is the output of the text detection model, which causes the performance of text detection to greatly affect the performance of text recognition.
 
 DenseNet_CTC has the smallest parameters and best FPS, and it is suitable for edge devices, which are very sensitive to the cost of calculation. If you have limited computing resources and want to achieve better accuracy, VGG_CTC is a good choice.
diff --git a/doc/tutorials/dnn/dnn_android/10_opencv_dependency.png b/doc/tutorials/dnn/dnn_android/10_opencv_dependency.png
deleted file mode 100644
index 03b0b597ec87..000000000000
Binary files a/doc/tutorials/dnn/dnn_android/10_opencv_dependency.png and /dev/null differ
diff --git a/doc/tutorials/dnn/dnn_android/1_start_new_project.png b/doc/tutorials/dnn/dnn_android/1_start_new_project.png
deleted file mode 100644
index 37f1dd71827e..000000000000
Binary files a/doc/tutorials/dnn/dnn_android/1_start_new_project.png and /dev/null differ
diff --git a/doc/tutorials/dnn/dnn_android/2_start_new_project.png b/doc/tutorials/dnn/dnn_android/2_start_new_project.png
deleted file mode 100644
index 4eeeb8ae1133..000000000000
Binary files a/doc/tutorials/dnn/dnn_android/2_start_new_project.png and /dev/null differ
diff --git a/doc/tutorials/dnn/dnn_android/3_start_new_project.png b/doc/tutorials/dnn/dnn_android/3_start_new_project.png
deleted file mode 100644
index 8a2cd4b45498..000000000000
Binary files a/doc/tutorials/dnn/dnn_android/3_start_new_project.png and /dev/null differ
diff --git a/doc/tutorials/dnn/dnn_android/4_start_new_project.png b/doc/tutorials/dnn/dnn_android/4_start_new_project.png
deleted file mode 100644
index d1b63f94c22d..000000000000
Binary files a/doc/tutorials/dnn/dnn_android/4_start_new_project.png and /dev/null differ
diff --git a/doc/tutorials/dnn/dnn_android/5_setup.png b/doc/tutorials/dnn/dnn_android/5_setup.png
deleted file mode 100644
index 81b88a9f4912..000000000000
Binary files a/doc/tutorials/dnn/dnn_android/5_setup.png and /dev/null differ
diff --git a/doc/tutorials/dnn/dnn_android/6_run_empty_project.png b/doc/tutorials/dnn/dnn_android/6_run_empty_project.png
deleted file mode 100644
index 88dc30ce99fc..000000000000
Binary files a/doc/tutorials/dnn/dnn_android/6_run_empty_project.png and /dev/null differ
diff --git a/doc/tutorials/dnn/dnn_android/7_import_module.png b/doc/tutorials/dnn/dnn_android/7_import_module.png
deleted file mode 100644
index c258f5ad2593..000000000000
Binary files a/doc/tutorials/dnn/dnn_android/7_import_module.png and /dev/null differ
diff --git a/doc/tutorials/dnn/dnn_android/8_import_module.png b/doc/tutorials/dnn/dnn_android/8_import_module.png
deleted file mode 100644
index e2e0d3e35847..000000000000
Binary files a/doc/tutorials/dnn/dnn_android/8_import_module.png and /dev/null differ
diff --git a/doc/tutorials/dnn/dnn_android/9_opencv_dependency.png b/doc/tutorials/dnn/dnn_android/9_opencv_dependency.png
deleted file mode 100644
index 107f4d210b0d..000000000000
Binary files a/doc/tutorials/dnn/dnn_android/9_opencv_dependency.png and /dev/null differ
diff --git a/doc/tutorials/dnn/dnn_android/dnn_android.markdown b/doc/tutorials/dnn/dnn_android/dnn_android.markdown
index 2c81b7ed1d8a..a153bd4255b8 100644
--- a/doc/tutorials/dnn/dnn_android/dnn_android.markdown
+++ b/doc/tutorials/dnn/dnn_android/dnn_android.markdown
@@ -1,107 +1 @@
-# How to run deep networks on Android device {#tutorial_dnn_android}
-
-@tableofcontents
-
-@prev_tutorial{tutorial_dnn_openvino}
-@next_tutorial{tutorial_dnn_yolo}
-
-|    |    |
-| -: | :- |
-| Original author | Dmitry Kurtaev |
-| Compatibility | OpenCV >= 3.3 |
-
-## Introduction
-In this tutorial you'll know how to run deep learning networks on Android device
-using OpenCV deep learning module.
-
-Tutorial was written for the following versions of corresponding software:
-- Android Studio 2.3.3
-- OpenCV 3.3.0+
-
-## Requirements
-
-- Download and install Android Studio from https://developer.android.com/studio.
-
-- Get the latest pre-built OpenCV for Android release from https://github.com/opencv/opencv/releases and unpack it (for example, `opencv-4.X.Y-android-sdk.zip`).
-
-- Download MobileNet object detection model from https://github.com/chuanqi305/MobileNet-SSD. We need a configuration file `MobileNetSSD_deploy.prototxt` and weights `MobileNetSSD_deploy.caffemodel`.
-
-## Create an empty Android Studio project
-- Open Android Studio. Start a new project. Let's call it `opencv_mobilenet`.
-![](1_start_new_project.png)
-
-- Keep default target settings.
-![](2_start_new_project.png)
-
-- Use "Empty Activity" template. Name activity as `MainActivity` with a
-corresponding layout `activity_main`.
-![](3_start_new_project.png)
-
-  ![](4_start_new_project.png)
-
-- Wait until a project was created. Go to `Run->Edit Configurations`.
-Choose `USB Device` as target device for runs.
-![](5_setup.png)
-Plug in your device and run the project. It should be installed and launched
-successfully before we'll go next.
-@note Read @ref tutorial_android_dev_intro in case of problems.
-
-![](6_run_empty_project.png)
-
-## Add OpenCV dependency
-
-- Go to `File->New->Import module` and provide a path to `unpacked_OpenCV_package/sdk/java`. The name of module detects automatically.
-Disable all features that Android Studio will suggest you on the next window.
-![](7_import_module.png)
-
-  ![](8_import_module.png)
-
-- Open two files:
-
-  1. `AndroidStudioProjects/opencv_mobilenet/app/build.gradle`
-
-  2. `AndroidStudioProjects/opencv_mobilenet/openCVLibrary330/build.gradle`
-
-  Copy both `compileSdkVersion` and `buildToolsVersion` from the first file to
-  the second one.
-
-  `compileSdkVersion 14` -> `compileSdkVersion 26`
-
-  `buildToolsVersion "25.0.0"` -> `buildToolsVersion "26.0.1"`
-
-- Make the project. There is no errors should be at this point.
-
-- Go to `File->Project Structure`. Add OpenCV module dependency.
-![](9_opencv_dependency.png)
-
-  ![](10_opencv_dependency.png)
-
-- Install once an appropriate OpenCV manager from `unpacked_OpenCV_package/apk`
-to target device.
-@code
-adb install OpenCV_3.3.0_Manager_3.30_armeabi-v7a.apk
-@endcode
-
-- Congratulations! We're ready now to make a sample using OpenCV.
-
-## Make a sample
-Our sample will takes pictures from a camera, forwards it into a deep network and
-receives a set of rectangles, class identifiers and confidence values in `[0, 1]`
-range.
-
-- First of all, we need to add a necessary widget which displays processed
-frames. Modify `app/src/main/res/layout/activity_main.xml`:
-@include android/mobilenet-objdetect/res/layout/activity_main.xml
-
-- Put downloaded `MobileNetSSD_deploy.prototxt` and `MobileNetSSD_deploy.caffemodel`
-into `app/build/intermediates/assets/debug` folder.
-
-- Modify `/app/src/main/AndroidManifest.xml` to enable full-screen mode, set up
-a correct screen orientation and allow to use a camera.
-@include android/mobilenet-objdetect/gradle/AndroidManifest.xml
-
-- Replace content of `app/src/main/java/org/opencv/samples/opencv_mobilenet/MainActivity.java`:
-@include android/mobilenet-objdetect/src/org/opencv/samples/opencv_mobilenet/MainActivity.java
-
-- Launch an application and make a fun!
-![](11_demo.jpg)
+The page was moved to @ref tutorial_android_dnn_intro
\ No newline at end of file
diff --git a/doc/tutorials/dnn/dnn_googlenet/dnn_googlenet.markdown b/doc/tutorials/dnn/dnn_googlenet/dnn_googlenet.markdown
index 972842b4f5d3..635961b7e4e3 100644
--- a/doc/tutorials/dnn/dnn_googlenet/dnn_googlenet.markdown
+++ b/doc/tutorials/dnn/dnn_googlenet/dnn_googlenet.markdown
@@ -17,7 +17,7 @@ In this tutorial you will learn how to use opencv_dnn module for image classific
 GoogLeNet trained network from [Caffe model zoo](http://caffe.berkeleyvision.org/model_zoo.html).
 
 We will demonstrate results of this example on the following picture.
-![Buran space shuttle](images/space_shuttle.jpg)
+![Buran space shuttle](dnn/images/space_shuttle.jpg)
 
 Source Code
 -----------
diff --git a/doc/tutorials/dnn/dnn_openvino/dnn_openvino.markdown b/doc/tutorials/dnn/dnn_openvino/dnn_openvino.markdown
index 57c984038680..8eb965ca55a1 100644
--- a/doc/tutorials/dnn/dnn_openvino/dnn_openvino.markdown
+++ b/doc/tutorials/dnn/dnn_openvino/dnn_openvino.markdown
@@ -2,7 +2,7 @@ OpenCV usage with OpenVINO {#tutorial_dnn_openvino}
 =====================
 
 @prev_tutorial{tutorial_dnn_halide_scheduling}
-@next_tutorial{tutorial_dnn_android}
+@next_tutorial{tutorial_dnn_yolo}
 
 |    |    |
 | -: | :- |
diff --git a/doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown b/doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
index b6f4e120fb7a..b675c1fd298f 100644
--- a/doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
+++ b/doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
@@ -14,7 +14,7 @@
 In this tutorial, we will introduce the APIs for TextRecognitionModel and TextDetectionModel in detail.
 
 ---
-#### TextRecognitionModel:
+### TextRecognitionModel
 
 In the current version, @ref cv::dnn::TextRecognitionModel only supports CNN+RNN+CTC based algorithms,
 and the greedy decoding method for CTC is provided.
@@ -38,7 +38,7 @@ Before recognition, you should `setVocabulary` and `setDecodeType`.
 
 ---
 
-#### TextDetectionModel:
+### TextDetectionModel
 
 @ref cv::dnn::TextDetectionModel API provides these methods for text detection:
 - cv::dnn::TextDetectionModel::detect() returns the results in std::vector<std::vector<Point>> (4-points quadrangles)
@@ -60,7 +60,7 @@ We encourage you to add new algorithms to these APIs.
 
 ## Pretrained Models
 
-#### TextRecognitionModel:
+### TextRecognitionModel
 
 ```
 crnn.onnx:
@@ -92,7 +92,7 @@ More models can be found in [here](https://drive.google.com/drive/folders/1cTbQ3
 which are taken from [clovaai](https://github.com/clovaai/deep-text-recognition-benchmark).
 You can train more models by [CRNN](https://github.com/meijieru/crnn.pytorch), and convert models by `torch.onnx.export`.
 
-#### TextDetectionModel:
+### TextDetectionModel
 
 ```
 - DB_IC15_resnet50.onnx:
@@ -297,7 +297,7 @@ For more information, please refer to:
 - [samples/dnn/text_detection.cpp](https://github.com/opencv/opencv/blob/4.x/samples/dnn/text_detection.cpp)
 - [samples/dnn/scene_text_spotting.cpp](https://github.com/opencv/opencv/blob/4.x/samples/dnn/scene_text_spotting.cpp)
 
-#### Test with an image
+### Test with an image
 Examples:
 ```bash
 example_dnn_scene_text_recognition -mp=path/to/crnn_cs.onnx -i=path/to/an/image -rgb=1 -vp=/path/to/alphabet_94.txt
@@ -306,7 +306,7 @@ example_dnn_scene_text_spotting -dmp=path/to/DB_IC15_resnet50.onnx -rmp=path/to/
 example_dnn_text_detection -dmp=path/to/EAST.pb -rmp=path/to/crnn_cs.onnx -i=path/to/an/image -rgb=1 -vp=path/to/alphabet_94.txt
 ```
 
-#### Test on public datasets
+### Test on public datasets
 Text Recognition:
 
 The download link for testing images can be found in the **Images for Testing**
diff --git a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
index 364091044c41..a2d4b2a3060e 100644
--- a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
+++ b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
@@ -3,52 +3,230 @@ YOLO DNNs  {#tutorial_dnn_yolo}
 
 @tableofcontents
 
-@prev_tutorial{tutorial_dnn_android}
+@prev_tutorial{tutorial_dnn_openvino}
 @next_tutorial{tutorial_dnn_javascript}
 
 |    |    |
 | -: | :- |
 | Original author | Alessandro de Oliveira Faria |
-| Compatibility | OpenCV >= 3.3.1 |
+| Extended by     | Abduragim Shtanchaev |
+| Compatibility   | OpenCV >= 4.9.0 |
+
+
+Running pre-trained YOLO model in OpenCV
+----------------------------------------
+
+Deploying pre-trained models is a common task in machine learning, particularly when working with
+hardware that does not support certain frameworks like PyTorch. This guide provides a comprehensive
+overview of exporting pre-trained YOLO family models from PyTorch and deploying them using OpenCV's
+DNN framework. For demonstration purposes, we will focus on the [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX/blob/main)
+model, but the methodology applies to other supported models.
+
+@note Currently, OpenCV supports the following YOLO models:
+- [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX/blob/main),
+- [YoloNas](https://github.com/Deci-AI/super-gradients/tree/master),
+- [YOLOv8](https://github.com/ultralytics/ultralytics/tree/main),
+- [YOLOv7](https://github.com/WongKinYiu/yolov7/tree/main),
+- [YOLOv6](https://github.com/meituan/YOLOv6/blob/main),
+- [YOLOv5](https://github.com/ultralytics/yolov5),
+- [YOLOv4](https://github.com/Tianxiaomo/pytorch-YOLOv4).
+
+This support includes pre and post-processing routines specific to these models. While other older
+version of YOLO are also supported by OpenCV in Darknet format, they are out of the scope of this tutorial.
+
+
+Assuming that we have successfully trained YOLOX model, the subsequent step involves exporting and
+running this model with OpenCV. There are several critical considerations to address before
+proceeding with this process. Let's delve into these aspects.
+
+### YOLO's Pre-proccessing & Output
+
+Understanding the nature of inputs and outputs associated with YOLO family detectors is pivotal.
+These detectors, akin to most Deep Neural Networks (DNN), typically exhibit variation in input
+sizes contingent upon the model's scale.
+
+| Model Scale  | Input Size   |
+|--------------|--------------|
+| Small Models <sup>[1](https://github.com/Megvii-BaseDetection/YOLOX/tree/main#standard-models)</sup>| 416x416      |
+| Midsize Models <sup>[2](https://github.com/Megvii-BaseDetection/YOLOX/tree/main#standard-models)</sup>| 640x640    |
+| Large Models <sup>[3](https://github.com/meituan/YOLOv6/tree/main#benchmark)</sup>| 1280x1280    |
+
+This table provides a quick reference to understand the different input dimensions commonly used in
+various YOLO models inputs. These are standard input shapes. Make sure you use input size that you
+trained model with, if it is differed from from the size mentioned in the table.
+
+The next critical element in the process involves understanding the specifics of image pre-processing
+for YOLO detectors. While the fundamental pre-processing approach remains consistent across the YOLO
+family, there are subtle yet crucial differences that must be accounted for to avoid any degradation
+in performance. Key among these are the `resize type` and the `padding value` applied post-resize.
+For instance, the [YOLOX model](https://github.com/Megvii-BaseDetection/YOLOX/blob/ac58e0a5e68e57454b7b9ac822aced493b553c53/yolox/data/data_augment.py#L142)
+utilizes a `LetterBox` resize method and a padding value of `114.0`. It is imperative to ensure that
+these parameters, along with the normalization constants, are appropriately matched to the model being
+exported.
+
+Regarding the model's output, it typically takes the form of a tensor with dimensions [BxNxC+5] or
+[BxNxC+4], where 'B' represents the batch size, 'N' denotes the number of anchors, and 'C' signifies
+the number of classes (for instance, 80 classes if the model is trained on the COCO dataset).
+The additional 5 in the former tensor structure corresponds to the objectness score (obj), confidence
+score (conf), and the bounding box coordinates (cx, cy, w, h). Notably, the YOLOv8 model's output
+is shaped as [BxNxC+4], where there is no explicit objectness score, and the object score is directly
+inferred from the class score. For the YOLOX model, specifically, it is also necessary to incorporate
+anchor points to rescale predictions back to the image domain. This step will be integrated into
+the ONNX graph, a process that we will detail further in the subsequent sections.
+
+
+### PyTorch Model Export
+
+Now that we know know the parameters of the pre-precessing we can go on and export the model from
+Pytorch to ONNX graph. Since in this tutorial we are using YOLOX as our sample model, lets use its
+export for demonstration purposes (the process is  identical for the rest of the YOLO detectors).
+To exporting YOLOX we can just use [export script](https://github.com/Megvii-BaseDetection/YOLOX/blob/ac58e0a5e68e57454b7b9ac822aced493b553c53/tools/export_onnx.py). Particularly we need following commands:
 
-Introduction
-------------
-
-In this text you will learn how to use opencv_dnn module using yolo_object_detection (Sample of using OpenCV dnn module in real time with device capture, video and image).
+@code{.bash}
+git clone https://github.com/Megvii-BaseDetection/YOLOX.git
+cd YOLOX
+wget https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s.pth # download pre-trained weights
+python3 -m tools.export_onnx --output-name yolox_s.onnx -n yolox-s -c yolox_s.pth --decode_in_inference
+@endcode
 
-We will demonstrate results of this example on the following picture.
-![Picture example](images/yolo.jpg)
+**NOTE:** Here `--decode_in_inference` is to include anchor box creation in the ONNX graph itself.
+It sets [this value](https://github.com/Megvii-BaseDetection/YOLOX/blob/ac58e0a5e68e57454b7b9ac822aced493b553c53/yolox/models/yolo_head.py#L210C16-L210C39)
+to `True`, which subsequently includes anchor generation function.
+
+Below we demonstrated the minimal version of the export script (which could be used for models other
+than YOLOX) in case it is needed. However, usually each YOLO repository has predefined export script.
+
+@code{.py}
+    import onnx
+    import torch
+    from onnxsim import simplify
+
+    # load the model state dict
+    ckpt = torch.load(ckpt_file, map_location="cpu")
+    model.load_state_dict(ckpt)
+
+    # prepare dummy input
+    dummy_input = torch.randn(args.batch_size, 3, exp.test_size[0], exp.test_size[1])
+
+    #export the model
+    torch.onnx._export(
+        model,
+        dummy_input,
+        "yolox.onnx",
+        input_names=["input"],
+        output_names=["output"],
+        dynamic_axes={"input": {0: 'batch'},
+                      "output": {0: 'batch'}})
+
+    # use onnx-simplifier to reduce reduent model.
+    onnx_model = onnx.load(args.output_name)
+    model_simp, check = simplify(onnx_model)
+    assert check, "Simplified ONNX model could not be validated"
+    onnx.save(model_simp, args.output_name)
+@endcode
 
-Examples
---------
+### Running Yolo ONNX detector with OpenCV Sample
+
+Once we have our ONNX graph of the model, we just simply can run with OpenCV's sample. To that we need to make sure:
+
+1. OpenCV is build with -DBUILD_EXAMLES=ON flag.
+2. Navigate to the OpenCV's `build` directory
+3. Run the following command:
+
+@code{.cpp}
+./bin/example_dnn_yolo_detector --input=<path_to_your_input_file> \
+                                --classes=<path_to_class_names_file> \
+                                --thr=<confidence_threshold> \
+                                --nms=<non_maximum_suppression_threshold> \
+                                --mean=<mean_normalization_value> \
+                                --scale=<scale_factor> \
+                                --yolo=<yolo_model_version> \
+                                --padvalue=<padding_value> \
+                                --paddingmode=<padding_mode> \
+                                --backend=<computation_backend> \
+                                --target=<target_computation_device>
+@endcode
 
 VIDEO DEMO:
 @youtube{NHtRlndE2cg}
 
-Source Code
------------
+- --input: File path to your input image or video. If omitted, it will capture frames from a camera.
+- --classes: File path to a text file containing class names for object detection.
+- --thr: Confidence threshold for detection (e.g., 0.5).
+- --nms: Non-maximum suppression threshold (e.g., 0.4).
+- --mean: Mean normalization value (e.g., 0.0 for no mean normalization).
+- --scale: Scale factor for input normalization (e.g., 1.0).
+- --yolo: YOLO model version (e.g., YOLOv3, YOLOv4, etc.).
+- --padvalue: Padding value used in pre-processing (e.g., 114.0).
+- --paddingmode: Method for handling image resizing and padding. Options: 0 (resize without extra processing), 1 (crop after resize), 2 (resize with aspect ratio preservation).
+- --backend: Selection of computation backend (0 for automatic, 1 for Halide, 2 for OpenVINO, etc.).
+- --target: Selection of target computation device (0 for CPU, 1 for OpenCL, etc.).
+- --device: Camera device number (0 for default camera). If `--input` is not provided camera with index 0 will used by default.
+
+Here `mean`, `scale`, `padvalue`, `paddingmode` should exactly match those that we discussed
+in pre-processing section in order for the model to match result in PyTorch
+
+To demonstrate how to run OpenCV YOLO samples without your own pretrained model, follow these instructions:
+
+1. Ensure Python is installed on your platform.
+2. Confirm that OpenCV is built with the `-DBUILD_EXAMPLES=ON` flag.
+
+Run the YOLOX detector(with default values):
+
+@code{.sh}
+git clone https://github.com/opencv/opencv_extra.git
+cd opencv_extra/testdata/dnn
+python download_models.py yolox_s_inf_decoder
+cd ..
+export OPENCV_TEST_DATA_PATH=$(pwd)
+cd <build directory of OpenCV>
+./bin/example_dnn_yolo_detector
+@endcode
 
-Use a universal sample for object detection models written
-[in C++](https://github.com/opencv/opencv/blob/4.x/samples/dnn/object_detection.cpp) and
-[in Python](https://github.com/opencv/opencv/blob/4.x/samples/dnn/object_detection.py) languages
+This will execute the YOLOX detector with your camera. For YOLOv8 (for instance), follow these additional steps:
 
-Usage examples
---------------
+@code{.sh}
+cd opencv_extra/testdata/dnn
+python download_models.py yolov8
+cd ..
+export OPENCV_TEST_DATA_PATH=$(pwd)
+cd <build directory of OpenCV>
 
-Execute in webcam:
+./bin/example_dnn_yolo_detector --model=onnx/models/yolov8n.onnx --yolo=yolov8 --mean=0.0 --scale=0.003921568627 --paddingmode=2 --padvalue=144.0 --thr=0.5 --nms=0.4 --rgb=0
+@endcode
 
-@code{.bash}
 
-$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --rgb
+### Building a Custom Pipeline
 
-@endcode
+Sometimes there is a need to make some custom adjustments in the inference pipeline. With OpenCV DNN
+module this is also quite easy to achieve. Below we will outline the sample implementation details:
 
-Execute with image or video file:
+- Import required libraries
 
-@code{.bash}
+@snippet samples/dnn/yolo_detector.cpp includes
 
-$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --input=[PATH-TO-IMAGE-OR-VIDEO-FILE] --rgb
+- Read ONNX graph and create neural network model:
 
-@endcode
+@snippet samples/dnn/yolo_detector.cpp read_net
+
+- Read image and pre-process it:
+
+@snippet samples/dnn/yolo_detector.cpp preprocess_params
+@snippet samples/dnn/yolo_detector.cpp preprocess_call
+@snippet samples/dnn/yolo_detector.cpp preprocess_call_func
+
+- Inference:
+
+@snippet samples/dnn/yolo_detector.cpp forward_buffers
+@snippet samples/dnn/yolo_detector.cpp forward
+
+- Post-Processing
+
+All post-processing steps are implemented in function `yoloPostProcess`. Please pay attention,
+that NMS step is not included into onnx graph. Sample uses OpenCV function for it.
+
+@snippet samples/dnn/yolo_detector.cpp postprocess
+
+- Draw predicted boxes
 
-Questions and suggestions email to: Alessandro de Oliveira Faria cabelo@opensuse.org or OpenCV Team.
+@snippet samples/dnn/yolo_detector.cpp draw_boxes
diff --git a/doc/tutorials/dnn/table_of_content_dnn.markdown b/doc/tutorials/dnn/table_of_content_dnn.markdown
index e878eb23579c..974ca4fc5e9e 100644
--- a/doc/tutorials/dnn/table_of_content_dnn.markdown
+++ b/doc/tutorials/dnn/table_of_content_dnn.markdown
@@ -5,7 +5,6 @@ Deep Neural Networks (dnn module) {#tutorial_table_of_content_dnn}
 -   @subpage tutorial_dnn_halide
 -   @subpage tutorial_dnn_halide_scheduling
 -   @subpage tutorial_dnn_openvino
--   @subpage tutorial_dnn_android
 -   @subpage tutorial_dnn_yolo
 -   @subpage tutorial_dnn_javascript
 -   @subpage tutorial_dnn_custom_layers
diff --git a/doc/tutorials/imgproc/erosion_dilatation/erosion_dilatation.markdown b/doc/tutorials/imgproc/erosion_dilatation/erosion_dilatation.markdown
index a9826c54d8d2..36829683d5fe 100644
--- a/doc/tutorials/imgproc/erosion_dilatation/erosion_dilatation.markdown
+++ b/doc/tutorials/imgproc/erosion_dilatation/erosion_dilatation.markdown
@@ -111,7 +111,7 @@ called and it will update the output image based on the current trackbar values.
 
 Let's analyze these two functions:
 
-#### The erosion function
+### The erosion function (CPP)
 
 @snippet cpp/tutorial_code/ImgProc/Morphology_1.cpp erosion
 
@@ -135,7 +135,7 @@ receives three arguments:
 
 That is all. We are ready to perform the erosion of our image.
 
-#### The dilation function
+### The dilation function (CPP)
 
 The code is below. As you can see, it is completely similar to the snippet of code for **erosion**.
 Here we also have the option of defining our kernel, its anchor point and the size of the operator
@@ -175,7 +175,7 @@ In short we
 The action and state changed listeners added call at the end the `update` method which updates
 the image based on the current slider values. So every time we move any slider, the `update` method is triggered.
 
-#### Updating the image
+### Updating the image (Java)
 
 To update the image we used the following implementation:
 
@@ -190,7 +190,7 @@ In other words we
 
 Let's analyze the `erode` and `dilate` methods:
 
-#### The erosion method
+### The erosion method (Java)
 
 @snippet java/tutorial_code/ImgProc/erosion_dilatation/MorphologyDemo1.java erosion
 
@@ -213,7 +213,7 @@ receives three arguments:
 
 That is all. We are ready to perform the erosion of our image.
 
-#### The dilation function
+### The dilation function (Java)
 
 The code is below. As you can see, it is completely similar to the snippet of code for **erosion**.
 Here we also have the option of defining our kernel, its anchor point and the size of the operator
@@ -240,7 +240,7 @@ called and it will update the output image based on the current trackbar values.
 
 Let's analyze these two functions:
 
-#### The erosion function
+### The erosion function (Python)
 
 @snippet python/tutorial_code/imgProc/erosion_dilatation/morphology_1.py erosion
 
@@ -262,7 +262,7 @@ specified, it is assumed to be in the center.
 
 That is all. We are ready to perform the erosion of our image.
 
-#### The dilation function
+### The dilation function (Python)
 
 The code is below. As you can see, it is completely similar to the snippet of code for **erosion**.
 Here we also have the option of defining our kernel, its anchor point and the size of the operator
diff --git a/doc/tutorials/imgproc/gausian_median_blur_bilateral_filter/gausian_median_blur_bilateral_filter.markdown b/doc/tutorials/imgproc/gausian_median_blur_bilateral_filter/gausian_median_blur_bilateral_filter.markdown
index 7cdbcaf823a1..2023de809bce 100644
--- a/doc/tutorials/imgproc/gausian_median_blur_bilateral_filter/gausian_median_blur_bilateral_filter.markdown
+++ b/doc/tutorials/imgproc/gausian_median_blur_bilateral_filter/gausian_median_blur_bilateral_filter.markdown
@@ -133,7 +133,7 @@ Explanation
 Let's check the OpenCV functions that involve only the smoothing procedure, since the rest is
 already known by now.
 
-#### Normalized Block Filter:
+### Normalized Block Filter:
 
 -   OpenCV offers the function **blur()** to perform smoothing with this filter.
     We specify 4 arguments (more details, check the Reference):
@@ -157,7 +157,7 @@ already known by now.
 @snippet samples/python/tutorial_code/imgProc/Smoothing/smoothing.py blur
 @end_toggle
 
-#### Gaussian Filter:
+### Gaussian Filter:
 
 -   It is performed by the function **GaussianBlur()** :
     Here we use 4 arguments (more details, check the OpenCV reference):
@@ -183,7 +183,7 @@ already known by now.
 @snippet samples/python/tutorial_code/imgProc/Smoothing/smoothing.py gaussianblur
 @end_toggle
 
-#### Median Filter:
+### Median Filter:
 
 -   This filter is provided by the **medianBlur()** function:
     We use three arguments:
@@ -203,7 +203,7 @@ already known by now.
 @snippet samples/python/tutorial_code/imgProc/Smoothing/smoothing.py medianblur
 @end_toggle
 
-#### Bilateral Filter
+### Bilateral Filter
 
 -   Provided by OpenCV function **bilateralFilter()**
     We use 5 arguments:
diff --git a/doc/tutorials/imgproc/imgtrans/copyMakeBorder/copyMakeBorder.markdown b/doc/tutorials/imgproc/imgtrans/copyMakeBorder/copyMakeBorder.markdown
index 7a1efcdac274..1741b1e19ac0 100644
--- a/doc/tutorials/imgproc/imgtrans/copyMakeBorder/copyMakeBorder.markdown
+++ b/doc/tutorials/imgproc/imgtrans/copyMakeBorder/copyMakeBorder.markdown
@@ -78,7 +78,7 @@ You can also download it from
 Explanation
 -----------
 
-#### Declare the variables
+### Declare the variables
 
 First we declare the variables we are going to use:
 
@@ -97,7 +97,7 @@ First we declare the variables we are going to use:
 Especial attention deserves the variable *rng* which is a random number generator. We use it to
 generate the random border color, as we will see soon.
 
-#### Load an image
+### Load an image
 
 As usual we load our source image *src*:
 
@@ -113,7 +113,7 @@ As usual we load our source image *src*:
 @snippet python/tutorial_code/ImgTrans/MakeBorder/copy_make_border.py load
 @end_toggle
 
-#### Create a window
+### Create a window
 
 After giving a short intro of how to use the program, we create a window:
 
@@ -129,7 +129,7 @@ After giving a short intro of how to use the program, we create a window:
 @snippet python/tutorial_code/ImgTrans/MakeBorder/copy_make_border.py create_window
 @end_toggle
 
-#### Initialize arguments
+### Initialize arguments
 
 Now we initialize the argument that defines the size of the borders (*top*, *bottom*, *left* and
 *right*). We give them a value of 5% the size of *src*.
@@ -146,7 +146,7 @@ Now we initialize the argument that defines the size of the borders (*top*, *bot
 @snippet python/tutorial_code/ImgTrans/MakeBorder/copy_make_border.py init_arguments
 @end_toggle
 
-#### Loop
+### Loop
 
 The program runs in an infinite loop while the key **ESC** isn't pressed.
 If the user presses '**c**' or '**r**', the *borderType* variable
@@ -164,7 +164,7 @@ takes the value of *BORDER_CONSTANT* or *BORDER_REPLICATE* respectively:
 @snippet python/tutorial_code/ImgTrans/MakeBorder/copy_make_border.py check_keypress
 @end_toggle
 
-#### Random color
+### Random color
 
 In each iteration (after 0.5 seconds), the random border color (*value*) is updated...
 
@@ -182,7 +182,7 @@ In each iteration (after 0.5 seconds), the random border color (*value*) is upda
 
 This value is a set of three numbers picked randomly in the range \f$[0,255]\f$.
 
-#### Form a border around the image
+### Form a border around the image
 
 Finally, we call the function **copyMakeBorder()** to apply the respective padding:
 
@@ -209,7 +209,7 @@ Finally, we call the function **copyMakeBorder()** to apply the respective paddi
     -#  *value*: If *borderType* is *BORDER_CONSTANT*, this is the value used to fill the border
         pixels.
 
-#### Display the results
+### Display the results
 
 We display our output image in the image created previously
 
diff --git a/doc/tutorials/imgproc/imgtrans/filter_2d/filter_2d.markdown b/doc/tutorials/imgproc/imgtrans/filter_2d/filter_2d.markdown
index e93a26d8d07f..f4dc885b41cb 100644
--- a/doc/tutorials/imgproc/imgtrans/filter_2d/filter_2d.markdown
+++ b/doc/tutorials/imgproc/imgtrans/filter_2d/filter_2d.markdown
@@ -94,7 +94,7 @@ You can also download it from
 Explanation
 -----------
 
-####  Load an image
+###  Load an image
 
 @add_toggle_cpp
 @snippet cpp/tutorial_code/ImgTrans/filter2D_demo.cpp load
@@ -108,7 +108,7 @@ Explanation
 @snippet python/tutorial_code/ImgTrans/Filter2D/filter2D.py load
 @end_toggle
 
-####  Initialize the arguments
+###  Initialize the arguments
 
 @add_toggle_cpp
 @snippet cpp/tutorial_code/ImgTrans/filter2D_demo.cpp init_arguments
@@ -122,7 +122,7 @@ Explanation
 @snippet python/tutorial_code/ImgTrans/Filter2D/filter2D.py init_arguments
 @end_toggle
 
-##### Loop
+### Loop
 
 Perform an infinite loop updating the kernel size and applying our linear filter to the input
 image. Let's analyze that more in detail:
diff --git a/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.markdown b/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.markdown
index 15b01a4d77c6..cbdf30d28559 100644
--- a/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.markdown
+++ b/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.markdown
@@ -74,7 +74,7 @@ Explanation
 
 The image we used can be found [here](https://raw.githubusercontent.com/opencv/opencv/4.x/samples/data/smarties.png)
 
-####  Load an image:
+###  Load an image:
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgTrans/houghcircles.cpp load
@@ -88,7 +88,7 @@ The image we used can be found [here](https://raw.githubusercontent.com/opencv/o
 @snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py load
 @end_toggle
 
-####  Convert it to grayscale:
+###  Convert it to grayscale:
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgTrans/houghcircles.cpp convert_to_gray
@@ -102,7 +102,7 @@ The image we used can be found [here](https://raw.githubusercontent.com/opencv/o
 @snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py convert_to_gray
 @end_toggle
 
-#### Apply a Median blur to reduce noise and avoid false circle detection:
+### Apply a Median blur to reduce noise and avoid false circle detection:
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgTrans/houghcircles.cpp reduce_noise
@@ -116,7 +116,7 @@ The image we used can be found [here](https://raw.githubusercontent.com/opencv/o
 @snippet samples/python/tutorial_code/ImgTrans/HoughCircle/hough_circle.py reduce_noise
 @end_toggle
 
-#### Proceed to apply Hough Circle Transform:
+### Proceed to apply Hough Circle Transform:
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgTrans/houghcircles.cpp houghcircles
@@ -144,7 +144,7 @@ The image we used can be found [here](https://raw.githubusercontent.com/opencv/o
     -   *min_radius = 0*: Minimum radius to be detected. If unknown, put zero as default.
     -   *max_radius = 0*: Maximum radius to be detected. If unknown, put zero as default.
 
-####  Draw the detected circles:
+###  Draw the detected circles:
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgTrans/houghcircles.cpp draw
@@ -160,7 +160,7 @@ The image we used can be found [here](https://raw.githubusercontent.com/opencv/o
 
 You can see that we will draw the circle(s) on red and the center(s) with a small green dot
 
-####  Display the detected circle(s) and wait for the user to exit the program:
+###  Display the detected circle(s) and wait for the user to exit the program:
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgTrans/houghcircles.cpp display
diff --git a/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown b/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown
index 22295c182983..ca4d9b9a17bf 100644
--- a/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown
+++ b/doc/tutorials/imgproc/imgtrans/hough_lines/hough_lines.markdown
@@ -129,7 +129,7 @@ The sample code that we will explain can be downloaded from
 Explanation
 -----------
 
-#### Load an image:
+### Load an image:
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgTrans/houghlines.cpp load
@@ -143,7 +143,7 @@ Explanation
 @snippet samples/python/tutorial_code/ImgTrans/HoughLine/hough_lines.py load
 @end_toggle
 
-#### Detect the edges of the image by using a Canny detector:
+### Detect the edges of the image by using a Canny detector:
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgTrans/houghlines.cpp edge_detection
@@ -160,7 +160,7 @@ Explanation
 Now we will apply the Hough Line Transform. We will explain how to use both OpenCV functions
 available for this purpose.
 
-#### Standard Hough Line Transform:
+### Standard Hough Line Transform:
 First, you apply the Transform:
 
 @add_toggle_cpp
@@ -199,7 +199,7 @@ And then you display the result by drawing the lines.
 @snippet samples/python/tutorial_code/ImgTrans/HoughLine/hough_lines.py draw_lines
 @end_toggle
 
-#### Probabilistic Hough Line Transform
+### Probabilistic Hough Line Transform
 First you apply the transform:
 
 @add_toggle_cpp
@@ -242,7 +242,7 @@ And then you display the result by drawing the lines.
 @snippet samples/python/tutorial_code/ImgTrans/HoughLine/hough_lines.py draw_lines_p
 @end_toggle
 
-#### Display the original image and the detected lines:
+### Display the original image and the detected lines:
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgTrans/houghlines.cpp imshow
@@ -256,7 +256,7 @@ And then you display the result by drawing the lines.
 @snippet samples/python/tutorial_code/ImgTrans/HoughLine/hough_lines.py imshow
 @end_toggle
 
-#### Wait until the user exits the program
+### Wait until the user exits the program
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgTrans/houghlines.cpp exit
diff --git a/doc/tutorials/imgproc/imgtrans/laplace_operator/laplace_operator.markdown b/doc/tutorials/imgproc/imgtrans/laplace_operator/laplace_operator.markdown
index 272456c776e7..33e375a82181 100644
--- a/doc/tutorials/imgproc/imgtrans/laplace_operator/laplace_operator.markdown
+++ b/doc/tutorials/imgproc/imgtrans/laplace_operator/laplace_operator.markdown
@@ -81,7 +81,7 @@ Code
 Explanation
 -----------
 
-#### Declare variables
+### Declare variables
 
 @add_toggle_cpp
 @snippet cpp/tutorial_code/ImgTrans/Laplace_Demo.cpp variables
@@ -95,7 +95,7 @@ Explanation
 @snippet samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py variables
 @end_toggle
 
-#### Load source image
+### Load source image
 
 @add_toggle_cpp
 @snippet cpp/tutorial_code/ImgTrans/Laplace_Demo.cpp load
@@ -109,7 +109,7 @@ Explanation
 @snippet samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py load
 @end_toggle
 
-#### Reduce noise
+### Reduce noise
 
 @add_toggle_cpp
 @snippet cpp/tutorial_code/ImgTrans/Laplace_Demo.cpp reduce_noise
@@ -123,7 +123,7 @@ Explanation
 @snippet samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py reduce_noise
 @end_toggle
 
-#### Grayscale
+### Grayscale
 
 @add_toggle_cpp
 @snippet cpp/tutorial_code/ImgTrans/Laplace_Demo.cpp convert_to_gray
@@ -137,7 +137,7 @@ Explanation
 @snippet samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py convert_to_gray
 @end_toggle
 
-#### Laplacian operator
+### Laplacian operator
 
 @add_toggle_cpp
 @snippet cpp/tutorial_code/ImgTrans/Laplace_Demo.cpp laplacian
@@ -160,7 +160,7 @@ Explanation
         this example.
     -   *scale*, *delta* and *BORDER_DEFAULT*: We leave them as default values.
 
-#### Convert output to a *CV_8U* image
+### Convert output to a *CV_8U* image
 
 @add_toggle_cpp
 @snippet cpp/tutorial_code/ImgTrans/Laplace_Demo.cpp convert
@@ -174,7 +174,7 @@ Explanation
 @snippet samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py convert
 @end_toggle
 
-#### Display the result
+### Display the result
 
 @add_toggle_cpp
 @snippet cpp/tutorial_code/ImgTrans/Laplace_Demo.cpp display
diff --git a/doc/tutorials/imgproc/imgtrans/sobel_derivatives/sobel_derivatives.markdown b/doc/tutorials/imgproc/imgtrans/sobel_derivatives/sobel_derivatives.markdown
index 1e5a12356f3d..70b7ef89c964 100644
--- a/doc/tutorials/imgproc/imgtrans/sobel_derivatives/sobel_derivatives.markdown
+++ b/doc/tutorials/imgproc/imgtrans/sobel_derivatives/sobel_derivatives.markdown
@@ -58,7 +58,7 @@ Theory
     gradient of an image intensity function.
 -#  The Sobel Operator combines Gaussian smoothing and differentiation.
 
-#### Formulation
+### Formulation
 
 Assuming that the image to be operated is \f$I\f$:
 
@@ -140,23 +140,23 @@ You can also download it from
 Explanation
 -----------
 
-#### Declare variables
+### Declare variables
 
 @snippet cpp/tutorial_code/ImgTrans/Sobel_Demo.cpp variables
 
-#### Load source image
+### Load source image
 
 @snippet cpp/tutorial_code/ImgTrans/Sobel_Demo.cpp load
 
-#### Reduce noise
+### Reduce noise
 
 @snippet cpp/tutorial_code/ImgTrans/Sobel_Demo.cpp reduce_noise
 
-#### Grayscale
+### Grayscale
 
 @snippet cpp/tutorial_code/ImgTrans/Sobel_Demo.cpp convert_to_gray
 
-#### Sobel Operator
+### Sobel Operator
 
 @snippet cpp/tutorial_code/ImgTrans/Sobel_Demo.cpp sobel
 
@@ -174,18 +174,18 @@ Explanation
     Notice that to calculate the gradient in *x* direction we use: \f$x_{order}= 1\f$ and
     \f$y_{order} = 0\f$. We do analogously for the *y* direction.
 
-#### Convert output to a CV_8U image
+### Convert output to a CV_8U image
 
 @snippet cpp/tutorial_code/ImgTrans/Sobel_Demo.cpp convert
 
-#### Gradient
+### Gradient
 
 @snippet cpp/tutorial_code/ImgTrans/Sobel_Demo.cpp blend
 
 We try to approximate the *gradient* by adding both directional gradients (note that
 this is not an exact calculation at all! but it is good for our purposes).
 
-#### Show results
+### Show results
 
 @snippet cpp/tutorial_code/ImgTrans/Sobel_Demo.cpp display
 
diff --git a/doc/tutorials/imgproc/morph_lines_detection/morph_lines_detection.md b/doc/tutorials/imgproc/morph_lines_detection/morph_lines_detection.md
index 2d0f5de98d1c..2636d85fea51 100644
--- a/doc/tutorials/imgproc/morph_lines_detection/morph_lines_detection.md
+++ b/doc/tutorials/imgproc/morph_lines_detection/morph_lines_detection.md
@@ -80,7 +80,7 @@ Explanation / Result
 
 Get image from [here](https://raw.githubusercontent.com/opencv/opencv/4.x/doc/tutorials/imgproc/morph_lines_detection/images/src.png) .
 
-#### Load Image
+### Load Image
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgProc/morph_lines_detection/Morphology_3.cpp load_image
@@ -96,7 +96,7 @@ Get image from [here](https://raw.githubusercontent.com/opencv/opencv/4.x/doc/tu
 
 ![](images/src.png)
 
-#### Grayscale
+### Grayscale
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgProc/morph_lines_detection/Morphology_3.cpp gray
@@ -112,7 +112,7 @@ Get image from [here](https://raw.githubusercontent.com/opencv/opencv/4.x/doc/tu
 
 ![](images/gray.png)
 
-#### Grayscale to Binary image
+### Grayscale to Binary image
 
 @add_toggle_cpp
 @snippet samples/cpp/tutorial_code/ImgProc/morph_lines_detection/Morphology_3.cpp bin
@@ -128,7 +128,7 @@ Get image from [here](https://raw.githubusercontent.com/opencv/opencv/4.x/doc/tu
 
 ![](images/binary.png)
 
-#### Output images
+### Output images
 
 Now we are ready to apply morphological operations in order to extract the horizontal and vertical lines and as a consequence to separate the music notes from the music sheet, but first let's initialize the output images that we will use for that reason:
 
@@ -144,7 +144,7 @@ Now we are ready to apply morphological operations in order to extract the horiz
 @snippet samples/python/tutorial_code/imgProc/morph_lines_detection/morph_lines_detection.py init
 @end_toggle
 
-#### Structure elements
+### Structure elements
 
 As we specified in the theory in order to extract the object that we desire, we need to create the corresponding structure element. Since  we want to extract the horizontal lines, a corresponding structure element for that purpose will have the following shape:
 ![](images/linear_horiz.png)
@@ -182,7 +182,7 @@ and again this is represented as follows:
 
 ![](images/vert.png)
 
-#### Refine edges / Result
+### Refine edges / Result
 
 As you can see we are almost there. However, at that point you will notice that the edges of the notes are a bit rough. For that reason we need to refine the edges in order to obtain a smoother result:
 
diff --git a/doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown b/doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown
index 13db710b32dd..d2dc68bc90a8 100644
--- a/doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown
+++ b/doc/tutorials/imgproc/out_of_focus_deblur_filter/out_of_focus_deblur_filter.markdown
@@ -117,6 +117,6 @@ References
 - [SmartDeblur] - SmartDeblur site
 
 <!-- invisible references list -->
-[Digital Image Processing]: http://web.ipac.caltech.edu/staff/fmasci/home/astro_refs/Digital_Image_Processing_2ndEd.pdf
+[Digital Image Processing]: http://web.ipac.caltech.edu/staff/fmasci/home/RefMaterial/ImageProc/Book_DigitalImageProcessing.pdf
 [Image Deblurring in Matlab]: https://www.mathworks.com/help/images/image-deblurring.html
 [SmartDeblur]: http://yuzhikov.com/articles/BlurredImagesRestoration1.htm
diff --git a/doc/tutorials/imgproc/pyramids/pyramids.markdown b/doc/tutorials/imgproc/pyramids/pyramids.markdown
index 8a6b9ce267b5..84887fd13bcb 100644
--- a/doc/tutorials/imgproc/pyramids/pyramids.markdown
+++ b/doc/tutorials/imgproc/pyramids/pyramids.markdown
@@ -43,7 +43,7 @@ Theory
         pyramid (with less resolution)
 -   In this tutorial we'll use the *Gaussian pyramid*.
 
-#### Gaussian Pyramid
+### Gaussian Pyramid
 
 -   Imagine the pyramid as a set of layers in which the higher the layer, the smaller the size.
 
@@ -100,7 +100,7 @@ Explanation
 
 Let's check the general structure of the program:
 
-#### Load an image
+### Load an image
 
 @add_toggle_cpp
 @snippet cpp/tutorial_code/ImgProc/Pyramids/Pyramids.cpp load
@@ -114,7 +114,7 @@ Let's check the general structure of the program:
 @snippet python/tutorial_code/imgProc/Pyramids/pyramids.py load
 @end_toggle
 
-#### Create window
+### Create window
 
 @add_toggle_cpp
 @snippet cpp/tutorial_code/ImgProc/Pyramids/Pyramids.cpp show_image
@@ -128,7 +128,7 @@ Let's check the general structure of the program:
 @snippet python/tutorial_code/imgProc/Pyramids/pyramids.py show_image
 @end_toggle
 
-#### Loop
+### Loop
 
 @add_toggle_cpp
 @snippet cpp/tutorial_code/ImgProc/Pyramids/Pyramids.cpp loop
diff --git a/doc/tutorials/introduction/android_binary_package/O4A_SDK.markdown b/doc/tutorials/introduction/android_binary_package/O4A_SDK.markdown
deleted file mode 100644
index 092eacff215b..000000000000
--- a/doc/tutorials/introduction/android_binary_package/O4A_SDK.markdown
+++ /dev/null
@@ -1,255 +0,0 @@
-OpenCV4Android SDK {#tutorial_O4A_SDK}
-==================
-
-@prev_tutorial{tutorial_android_dev_intro}
-@next_tutorial{tutorial_dev_with_OCV_on_Android}
-
-|    |    |
-| -: | :- |
-| Original author | Vsevolod Glumov |
-| Compatibility | OpenCV >= 3.0 |
-
-@warning
-This tutorial is deprecated.
-
-This tutorial was designed to help you with installation and configuration of OpenCV4Android SDK.
-
-This guide was written with MS Windows 7 in mind, though it should work with GNU Linux and Apple Mac
-OS as well.
-
-This tutorial assumes you have the following software installed and configured:
-
--   JDK
--   Android SDK and NDK
--   Eclipse IDE
--   ADT and CDT plugins for Eclipse
-
-If you need help with anything of the above, you may refer to our @ref tutorial_android_dev_intro guide.
-
-If you encounter any error after thoroughly following these steps, feel free to contact us via
-[OpenCV4Android](https://groups.google.com/group/android-opencv/) discussion group or OpenCV [Q&A
-forum](https://forum.opencv.org). We'll do our best to help you out.
-
-General info
-------------
-
-OpenCV4Android SDK package enables development of Android applications with use of OpenCV library.
-
-The structure of package contents looks as follows:
-
-    OpenCV-2.4.9-android-sdk
-    |_ apk
-    |   |_ OpenCV_2.4.9_binary_pack_armv7a.apk
-    |   |_ OpenCV_2.4.9_Manager_2.18_XXX.apk
-    |
-    |_ doc
-    |_ samples
-    |_ sdk
-    |    |_ etc
-    |    |_ java
-    |    |_ native
-    |          |_ 3rdparty
-    |          |_ jni
-    |          |_ libs
-    |               |_ armeabi
-    |               |_ armeabi-v7a
-    |               |_ x86
-    |
-    |_ LICENSE
-    |_ README.android
-
--   `sdk` folder contains OpenCV API and libraries for Android:
--   `sdk/java` folder contains an Android library Eclipse project providing OpenCV Java API that can
-    be imported into developer's workspace;
--   `sdk/native` folder contains OpenCV C++ headers (for JNI code) and native Android libraries
-    (\*.so and \*.a) for ARM-v5, ARM-v7a and x86 architectures;
--   `sdk/etc` folder contains Haar and LBP cascades distributed with OpenCV.
--   `apk` folder contains Android packages that should be installed on the target Android device to
-    enable OpenCV library access via OpenCV Manager API (see details below).
-
-    On production devices that have access to Google Play Market (and Internet) these packages will
-    be installed from Market on the first start of an application using OpenCV Manager API. But
-    devkits without Market or Internet connection require this packages to be installed manually.
-    Install the Manager.apk and optional binary_pack.apk if it needed. See `Manager Selection`
-    for details.
-
-    @note Installation from Internet is the preferable way since OpenCV team may publish updated
-    versions of this packages on the Market.
-
--   `samples` folder contains sample applications projects
-    and their prebuilt packages (APK). Import them into Eclipse workspace (like described below) and
-    browse the code to learn possible ways of OpenCV use on Android.
-
--   `doc` folder contains various OpenCV documentation in PDF format. It's also available online at
-    <http://docs.opencv.org>.
-    @note The most recent docs (nightly build) are at <http://docs.opencv.org/4.x>. Generally, it's more
-    up-to-date, but can refer to not-yet-released functionality.
-    @todo I'm not sure that this is the best place to talk about OpenCV Manager
-
-Starting from version 2.4.3 OpenCV4Android SDK uses OpenCV Manager API for library initialization.
-OpenCV Manager is an Android service based solution providing the following benefits for OpenCV
-applications developers:
-
--   Compact apk-size, since all applications use the same binaries from Manager and do not store
-    native libs within themselves;
--   Hardware specific optimizations are automatically enabled on all supported platforms;
--   Automatic updates and bug fixes;
--   Trusted OpenCV library source. All packages with OpenCV are published on Google Play;
-
-
-Manual OpenCV4Android SDK setup
--------------------------------
-
-### Get the OpenCV4Android SDK
-
--#  Go to the [OpenCV download page on
-    SourceForge](http://sourceforge.net/projects/opencvlibrary/files/) and download
-    the latest available version. This tutorial is based on this package: [OpenCV-2.4.9-android-sdk.zip](http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.9/OpenCV-2.4.9-android-sdk.zip/download).
--#  Create a new folder for Android with OpenCV development. For this tutorial we have unpacked
-    OpenCV SDK to the `C:\Work\OpenCV4Android\` directory.
-
-    @note Better to use a path without spaces in it. Otherwise you may have problems with ndk-build.
-
--#  Unpack the SDK archive into the chosen directory.
-
-    You can unpack it using any popular archiver (e.g with 7-Zip):
-
-    ![](images/android_package_7zip.png)
-
-    On Unix you can use the following command:
-    @code{.bash}
-    unzip ~/Downloads/OpenCV-2.4.9-android-sdk.zip
-    @endcode
-
-### Import OpenCV library and samples to the Eclipse
-
--#  Start Eclipse and choose your workspace location.
-
-    We recommend to start working with OpenCV for Android from a new clean workspace. A new Eclipse
-    workspace can for example be created in the folder where you have unpacked OpenCV4Android SDK
-    package:
-
-    ![](images/eclipse_1_choose_workspace.png)
-
--#  Import OpenCV library and samples into workspace.
-
-    OpenCV library is packed as a ready-for-use [Android Library
-    Project](http://developer.android.com/guide/developing/projects/index.html#LibraryProjects). You
-    can simply reference it in your projects.
-
-    Each sample included into the `OpenCV-2.4.9-android-sdk.zip` is a regular Android project that
-    already references OpenCV library. Follow the steps below to import OpenCV and samples into the
-    workspace:
-
-    -   Right click on the Package Explorer window and choose Import... option from the context
-        menu:
-
-        ![](images/eclipse_5_import_command.png)
-
-    -   In the main panel select General --\> Existing Projects into Workspace and press Next
-        button:
-
-        ![](images/eclipse_6_import_existing_projects.png)
-
-    -   In the Select root directory field locate your OpenCV package folder. Eclipse should
-        automatically locate OpenCV library and samples:
-
-        ![](images/eclipse_7_select_projects.png)
-
-    -   Click Finish button to complete the import operation.
-
-    @note OpenCV samples are indeed **dependent** on OpenCV library project so don't forget to import it to your workspace as well.
-
-    After clicking Finish button Eclipse will load all selected projects into workspace, and you
-    have to wait some time while it is building OpenCV samples. Just give a minute to Eclipse to
-    complete initialization.
-
-    ![](images/eclipse_cdt_cfg4.png)
-
-    Once Eclipse completes build you will have the clean workspace without any build errors:
-
-    ![](images/eclipse_10_crystal_clean.png)
-
-@anchor tutorial_O4A_SDK_samples
-### Running OpenCV Samples
-
-At this point you should be able to build and run the samples. Keep in mind, that face-detection and
-Tutorial 2 - Mixed Processing include some native code and require Android NDK and NDK/CDT plugin
-for Eclipse to build working applications. If you haven't installed these tools, see the
-corresponding section of @ref tutorial_android_dev_intro.
-
-**warning**
-
-Please consider that some samples use Android Java Camera API, which is accessible with an AVD.
-
-@note Recent *Android SDK tools, revision 19+* can run ARM v7a OS images but they available not for
-all Android versions.
-
-Well, running samples from Eclipse is very simple:
-
--   Connect your device with adb tool from Android SDK or create an emulator with camera support.
-    -   See [Managing Virtual Devices](http://developer.android.com/guide/developing/devices/index.html) document for help
-        with Android Emulator.
-    -   See [Using Hardware Devices](http://developer.android.com/guide/developing/device.html) for
-        help with real devices (not emulators).
-
--   Select project you want to start in Package Explorer and just press Ctrl + F11 or select option
-    Run --\> Run from the main menu, or click Run button on the toolbar.
-
-    @note Android Emulator can take several minutes to start. So, please, be patient. \* On the first
-    run Eclipse will ask you about the running mode for your application:
-
-    ![](images/eclipse_11_run_as.png)
-
--   Select the Android Application option and click OK button. Eclipse will install and run the
-    sample.
-
-    Chances are that on the first launch you will not have the [OpenCV
-    Manager](https://docs.google.com/a/itseez.com/presentation/d/1EO_1kijgBg_BsjNp2ymk-aarg-0K279_1VZRcPplSuk/present#slide=id.p)
-    package installed. In this case you will see the following message:
-
-    ![](images/android_emulator_opencv_manager_fail.png)
-
-    To get rid of the message you will need to install OpenCV Manager and the appropriate
-    OpenCV binary pack. Simply tap Yes if you have *Google Play Market* installed on your
-    device/emulator. It will redirect you to the corresponding page on *Google Play Market*.
-
-    If you have no access to the *Market*, which is often the case with emulators - you will need to
-    install the packages from OpenCV4Android SDK folder manually. See `Manager Selection` for
-    details.
-    @code{.sh}
-    <Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.9_Manager_2.18_armv7a-neon.apk
-    @endcode
-
-    @note armeabi, armv7a-neon, arm7a-neon-android8, mips and x86 stand for platform targets:
-        -   armeabi is for ARM v5 and ARM v6 architectures with Android API 8+,
-        -   armv7a-neon is for NEON-optimized ARM v7 with Android API 9+,
-        -   arm7a-neon-android8 is for NEON-optimized ARM v7 with Android API 8,
-        -   mips is for MIPS architecture with Android API 9+,
-        -   x86 is for Intel x86 CPUs with Android API 9+.
-
-    @note
-    If using hardware device for testing/debugging, run the following command to learn its CPU
-    architecture:
-    @code{.sh}
-    adb shell getprop ro.product.cpu.abi
-    @endcode
-    If you're using an AVD emulator, go Window \> AVD Manager to see the list of available devices.
-    Click Edit in the context menu of the selected device. In the window, which then pop-ups, find
-    the CPU field.
-
-    @note
-    You may also see section `Manager Selection` for details.
-
-    When done, you will be able to run OpenCV samples on your device/emulator seamlessly.
-
--   Here is Sample - image-manipulations sample, running on top of stock camera-preview of the
-    emulator.
-
-    ![](images/emulator_canny.png)
-
-What's next
------------
-
-Now, when you have your instance of OpenCV4Adroid SDK set up and configured, you may want to proceed
-to using OpenCV in your own application. You can learn how to do that in a separate @ref tutorial_dev_with_OCV_on_Android tutorial.
diff --git a/doc/tutorials/introduction/android_binary_package/android_dev_intro.markdown b/doc/tutorials/introduction/android_binary_package/android_dev_intro.markdown
index 584f9a2f6cf2..106278bb95a7 100644
--- a/doc/tutorials/introduction/android_binary_package/android_dev_intro.markdown
+++ b/doc/tutorials/introduction/android_binary_package/android_dev_intro.markdown
@@ -2,24 +2,18 @@ Introduction into Android Development {#tutorial_android_dev_intro}
 =====================================
 
 @prev_tutorial{tutorial_clojure_dev_intro}
-@next_tutorial{tutorial_O4A_SDK}
+@next_tutorial{tutorial_dev_with_OCV_on_Android}
 
 |    |    |
 | -: | :- |
-| Original author | Vsevolod Glumov |
-| Compatibility | OpenCV >= 3.0 |
-
-@warning
-This tutorial is deprecated.
-
+| Original author | Rostislav Vasilikhin |
+| Compatibility | OpenCV >= 4.0 |
 
 This guide was designed to help you in learning Android development basics and setting up your
-working environment quickly. It was written with Windows 7 in mind, though it would work with Linux
-(Ubuntu), Mac OS X and any other OS supported by Android SDK.
+working environment quickly. It was tested with Ubuntu 22.04 and Windows 10.
 
 If you encounter any error after thoroughly following these steps, feel free to contact us via
-[OpenCV4Android](https://groups.google.com/group/android-opencv/) discussion group or OpenCV [Q&A
-forum](https://forum.opencv.org). We'll do our best to help you out.
+OpenCV [Forum](https://forum.opencv.org). We'll do our best to help you out.
 
 Preface
 -------
@@ -29,7 +23,7 @@ by Google. See the [Android home site](http://www.android.com/about/) for genera
 
 Development for Android significantly differs from development for other platforms. So before
 starting programming for Android we recommend you make sure that you are familiar with the following
-key topis:
+key topics:
 
 -#  [Java](http://en.wikipedia.org/wiki/Java_(programming_language)) programming language that is
     the primary development technology for Android OS. Also, you can find [Oracle docs on
@@ -39,368 +33,62 @@ key topis:
     JNI](http://docs.oracle.com/javase/7/docs/technotes/guides/jni/) useful.
 -#  [Android
     Activity](http://developer.android.com/training/basics/activity-lifecycle/starting.html) and its
-    lifecycle, that is an essential Android API class.
+    life-cycle, that is an essential Android API class.
 -#  OpenCV development will certainly require some knowledge of the [Android
     Camera](http://developer.android.com/guide/topics/media/camera.html) specifics.
 
 Manual environment setup for Android development
 ------------------------------------------------
 
-### Development in Java
-
-You need the following software to be installed in order to develop for Android in Java:
-
--#  **Sun JDK 6** (Sun JDK 7 is also possible)
-
-    Visit [Java SE Downloads page](http://www.oracle.com/technetwork/java/javase/downloads/) and
-    download an installer for your OS.
-
-    Here is a detailed JDK (Java Development Kit) [installation
-    guide](http://source.android.com/source/initializing.html#installing-the-jdk) for Ubuntu and Mac
-    OS (only JDK sections are applicable for OpenCV)
-
-    @note OpenJDK is not suitable for Android development, since Android SDK supports only Sun JDK. If you use Ubuntu, after installation of Sun JDK you should run the following command to set Sun java environment:
-        @code{.bash}
-        sudo update-java-alternatives --set java-6-sun
-        @endcode
-
--#  **Android SDK**
-
-    Get the latest Android SDK from <http://developer.android.com/sdk/index.html>
-
-    Here is Google's [install guide](http://developer.android.com/sdk/installing.html) for the SDK.
-
-    @note You can choose downloading **ADT Bundle package** that in addition to Android SDK Tools
-    includes Eclipse + ADT + NDK/CDT plugins, Android Platform-tools, the latest Android platform and
-    the latest Android system image for the emulator - this is the best choice for those who is setting
-    up Android development environment the first time!
-
-    @note If you are running x64 version of Ubuntu Linux, then you need ia32 shared libraries for use on amd64 and ia64 systems to be installed. You can install them with the following command:
-       @code{.bash}
-        sudo apt-get install ia32-libs
-        @endcode
-        For Red Hat based systems the following command might be helpful:
-        @code{.bash}
-        sudo yum install libXtst.i386
-        @endcode
-
--#  **Android SDK components**
-
-    You need the following SDK components to be installed:
-
-    -   *Android SDK Tools, revision 20* or newer.
-
-        Older revisions should also work, but they are not recommended.
-
-    -   *SDK Platform Android 3.0* (API 11).
-
-        The minimal platform supported by OpenCV Java API is **Android 2.2** (API 8). This is also
-        the minimum API Level required for the provided samples to run. See the
-        \<uses-sdk android:minSdkVersion="8"/\> tag in their **AndroidManifest.xml** files. But for
-        successful compilation the **target** platform should be set to Android 3.0 (API 11) or
-        higher. It will not prevent them from running on Android 2.2.
-
-        ![](images/android_sdk_and_avd_manager.png)
-
-        See [Adding Platforms and
-        Packages](http://developer.android.com/sdk/installing/adding-packages.html) for help with
-        installing/updating SDK components.
-
--#  **Eclipse IDE**
-
-    Check the [Android SDK System Requirements](http://developer.android.com/sdk/requirements.html)
-    document for a list of Eclipse versions that are compatible with the Android SDK. For OpenCV
-    2.4.x we recommend **Eclipse 3.7 (Indigo)** or **Eclipse 4.2 (Juno)**. They work well for OpenCV
-    under both Windows and Linux.
-
-    If you have no Eclipse installed, you can get it from the [official
-    site](http://www.eclipse.org/downloads/).
-
--#  **ADT plugin for Eclipse**
+In this tutorial we're gonna use an official Android Studio IDE and a set of other freely available tools.
 
-    These instructions are copied from [Android Developers
-    site](http://developer.android.com/sdk/installing/installing-adt.html), check it out in case of
-    any ADT-related problem.
+### Get tools and dependencies
 
-    Assuming that you have Eclipse IDE installed, as described above, follow these steps to download
-    and install the ADT plugin:
+Here's how to get a ready to work environment:
 
-    -#  Start Eclipse, then select Help --\> Install New Software...
-    -#  Click Add (in the top-right corner).
-    -#  In the Add Repository dialog that appears, enter "ADT Plugin" for the Name and the following
-        URL for the Location: <https://dl-ssl.google.com/android/eclipse/>
+1. Download and install Android Studio:
+    * Ubuntu:
+        1. Download Android Studio: https://developer.android.com/studio
+        2. Extract the tar.gz archive
+        3. Follow the instructions in `Install-Linux-tar.txt`: open `android-studio/bin` folder in terminal and run `./studio.sh`
+        4. Perform standard installation through GUI
+        5. Optionally you can add a shortcut on a desktop for a quick access by clicking menu ***Tools -> Create desktop entry***. The menu appears after any project is created or opened.
+    * Windows:
+        Just download Android Studio from the official site and run installer.
 
-    -#  Click OK
+2. Install fresh Android SDK and NDK:
+    1. Open SDK manager in Android Studio (***Customize -> All Settings -> Languages & Frameworks -> Android SDK***)
+    2. Enable "Show Package Details" checkbox
+    ![](images/sdk_ndk_manager.png)
+    3. Check SDK and NDK of the latest versions and press OK
+    4. Make sure that your device support the chosen SDK versions
 
-        @note If you have trouble acquiring the plugin, try using "http" in the Location URL, instead of "https" (https is preferred for security reasons).
+3. Install all the necessary packages for the build:
+    - `sudo apt install git cmake ninja-build openjdk-17-jdk openjdk-17-jre`
+    - the rest required packages are dependencies and should be installed automatically
 
-    -#  In the Available Software dialog, select the checkbox next to Developer Tools and click Next.
+### Check OpenCV examples
 
-    -#  In the next window, you'll see a list of the tools to be downloaded. Click Next.
+1. Download OpenCV from Android SDK from official [release page on Github](https://github.com/opencv/opencv/releases)
+or [SourceForge](https://sourceforge.net/projects/opencvlibrary/).
+2. Extract zip archive with your OS tools.
+3. Open the project `<YOUR_OPENCV_BUILD_FOLDER>/OpenCV-android-sdk/samples` in Android Studio.
+4. Connect your device
+    * Debugging should be enabled on a device, you can find an instruction about it across the web
+    * Alternatively you can use a virtual device that comes with the Android studio
+    ![](images/choose_device.png)
+5. Choose a sample from the drop-down menu (for example, `15-puzzle`) and run it.
 
-        @note If you also plan to develop native C++ code with Android NDK don't forget to enable NDK Plugins installations as well.
+Setup Device for Testing and Debugging
+--------------------------------------
 
-        ![](images/eclipse_inst_adt.png)
+Usually the recipe above works as expected, but in some cases there are additional actions that must
+be performed. In this section we'll cover some cases.
 
-    -#  Read and accept the license agreements, then click Finish.
+### Windows host computer
 
-        @note If you get a security warning saying that the authenticity or validity of the software can't be established, click OK.
-
-    -#  When the installation completes, restart Eclipse.
-
-### Native development in C++
-
-You need the following software to be installed in order to develop for Android in C++:
-
--#  **Android NDK**
-
-    To compile C++ code for Android platform you need Android Native Development Kit (*NDK*).
-
-    You can get the latest version of NDK from the [download
-    page](http://developer.android.com/tools/sdk/ndk/index.html). To install Android NDK just
-    extract the archive to some folder on your computer. Here are [installation
-    instructions](http://developer.android.com/tools/sdk/ndk/index.html#Installing).
-
-    @note Before start you can read official Android NDK documentation which is in the Android NDK
-    archive, in the folder `docs/`. The main article about using Android NDK build system is in the
-    `ANDROID-MK.html` file. Some additional information you can find in the `APPLICATION-MK.html`,
-    `NDK-BUILD.html` files, and `CPU-ARM-NEON.html`, `CPLUSPLUS-SUPPORT.html`, `PREBUILTS.html`.
-
--#  **CDT plugin for Eclipse**
-
-    If you selected for installation the NDK plugins component of Eclipse ADT plugin (see the picture
-    above) your Eclipse IDE should already have CDT plugin (that means C/C++ Development Tooling).
-    There are several possible ways to integrate compilation of C++ code by Android NDK into Eclipse
-    compilation process. We recommend the approach based on Eclipse CDT(C/C++ Development Tooling)
-    Builder.
-
-Android application structure
------------------------------
-
-Usually source code of an Android application has the following structure:
-
--   `root folder of the project/`
-    -   `jni/`
-    -   `libs/`
-    -   `res/`
-    -   `src/`
-    -   `AndroidManifest.xml`
-    -   `project.properties`
-    -   `... other files ...`
-
-Where:
-
--   the `src` folder contains Java code of the application,
--   the `res` folder contains resources of the application (images, xml files describing UI layout,
-    etc),
--   the `libs` folder will contain native libraries after a successful build,
--   and the `jni` folder contains C/C++ application source code and NDK's build scripts `Android.mk`
-    and `Application.mk` producing the native libraries,
--   `AndroidManifest.xml` file presents essential information about application to the Android
-    system (name of the Application, name of main application's package, components of the
-    application, required permissions, etc).
-
-    It can be created using Eclipse wizard or android tool from Android SDK.
-
--   `project.properties` is a text file containing information about target Android platform and
-    other build details. This file is generated by Eclipse or can be created with android tool
-    included in Android SDK.
-
-@note Both `AndroidManifest.xml` and `project.properties` files are required to compile the C++ part
-of the application, since Android NDK build system relies on them. If any of these files does not
-exist, compile the Java part of the project before the C++ part.
-
-`Android.mk` and `Application.mk` scripts
------------------------------------------
-
-The script `Android.mk` usually has the following structure:
-@code{.make}
-LOCAL_PATH := \f$(call my-dir)
-
-include \f$(CLEAR_VARS)
-LOCAL_MODULE    := <module_name>
-LOCAL_SRC_FILES := <list of .c and .cpp project files>
-<some variable name> := <some variable value>
-...
-<some variable name> := <some variable value>
-
-include \f$(BUILD_SHARED_LIBRARY)
-@endcode
-This is the minimal file `Android.mk`, which builds C++ source code of an Android application. Note
-that the first two lines and the last line are mandatory for any `Android.mk`.
-
-Usually the file `Application.mk` is optional, but in case of project using OpenCV, when STL and
-exceptions are used in C++, it also should be created. Example of the file `Application.mk`:
-@code{.make}
-APP_STL := gnustl_static
-APP_CPPFLAGS := -frtti -fexceptions
-APP_ABI := all
-@endcode
-
-@note We recommend setting APP_ABI := all for all targets. If you want to specify the target
-explicitly, use armeabi for ARMv5/ARMv6, armeabi-v7a for ARMv7, x86 for Intel Atom or mips for MIPS.
-
-@anchor tutorial_android_dev_intro_ndk
-Building application native part from command line
---------------------------------------------------
-
-Here is the standard way to compile C++ part of an Android application:
-
-**warning**
-
-We strongly recommend using cmd.exe (standard Windows console) instead of Cygwin on
-   **Windows**. Use the latter if only you're absolutely sure about, what you're doing. Cygwin is
-    not really supported and we are unlikely to help you in case you encounter some problems with
-    it. So, use it only if you're capable of handling the consequences yourself.
-
--#  Open console and go to the root folder of an Android application
-    @code{.bash}
-    cd <root folder of the project>/
-    @endcode
--#  Run the following command
-    @code{.bash}
-    <path_where_NDK_is_placed>/ndk-build
-    @endcode
-    @note On Windows we recommend to use ndk-build.cmd in standard Windows console (cmd.exe) rather than the similar bash script in Cygwin shell.
-       ![](images/ndk_build.png)
-
--#  After executing this command the C++ part of the source code is compiled.
-
-After that the Java part of the application can be (re)compiled (using either *Eclipse* or *Ant*
-build tool).
-
-@note Some parameters can be set for the ndk-build:
-   **Example 1**: Verbose compilation
-    @code{.bash}
-    <path_where_NDK_is_placed>/ndk-build V=1
-    @endcode
-    **Example 2**: Rebuild all
-    @code{.bash}
-    <path_where_NDK_is_placed>/ndk-build -B
-    @endcode
-
-@anchor tutorial_android_dev_intro_eclipse
-Building application native part from *Eclipse* (CDT Builder)
--------------------------------------------------------------
-
-There are several possible ways to integrate compilation of native C++ code by Android NDK into
-Eclipse build process. We recommend the approach based on Eclipse CDT(C/C++ Development Tooling)
-Builder.
-
-**important**
-
-OpenCV for Android package since version 2.4.2 contains sample projects
-   pre-configured CDT Builders. For your own projects follow the steps below.
-
--#  Define the NDKROOT environment variable containing the path to Android NDK in your system (e.g.
-    "X:\\Apps\\android-ndk-r8" or "/opt/android-ndk-r8").
-
-    **On Windows** an environment variable can be set via
-    My Computer -\> Properties -\> Advanced -\> Environment variables. On Windows 7 it's also
-    possible to use [setx](http://ss64.com/nt/setx.html) command in a console session.
-
-    **On Linux** and **MacOS** an environment variable can be set via appending a
-    "export VAR_NAME=VAR_VALUE" line to the `"~/.bashrc"` file and logging off and then on.
-
-    @note It's also possible to define the NDKROOT environment variable within Eclipse IDE, but it
-    should be done for every new workspace you create. If you prefer this option better than setting
-    system environment variable, open Eclipse menu
-    Window -\> Preferences -\> C/C++ -\> Build -\> Environment, press the Add... button and set variable
-    name to NDKROOT and value to local Android NDK path. \#. After that you need to **restart Eclipse**
-    to apply the changes.
-
--#  Open Eclipse and load the Android app project to configure.
-
--#  Add C/C++ Nature to the project via Eclipse menu
-    New -\> Other -\> C/C++ -\> Convert to a C/C++ Project.
-    ![](images/eclipse_cdt_cfg1.png)
-    And:
-    ![](images/eclipse_cdt_cfg2.png)
-
--#  Select the project(s) to convert. Specify "Project type" = Makefile project, "Toolchains" =
-    Other Toolchain.
-    ![](images/eclipse_cdt_cfg3.png)
-
--#  Open Project Properties -\> C/C++ Build, uncheck Use default build command, replace "Build
-    command" text from "make" to
-
-    "${NDKROOT}/ndk-build.cmd" on Windows,
-
-    "${NDKROOT}/ndk-build" on Linux and MacOS.
-
-    ![](images/eclipse_cdt_cfg4.png)
-
--#  Go to Behaviour tab and change "Workbench build type" section like shown below:
-
-    ![](images/eclipse_cdt_cfg5.png)
-
--#  Press OK and make sure the ndk-build is successfully invoked when building the project.
-
-    ![](images/eclipse_cdt_cfg6.png)
-
--#  If you open your C++ source file in Eclipse editor, you'll see syntax error notifications. They
-    are not real errors, but additional CDT configuring is required.
-
-    ![](images/eclipse_cdt_cfg7.png)
-
--#  Open Project Properties -\> C/C++ General -\> Paths and Symbols and add the following
-    **Include** paths for **C++**:
-    @code
-        # for NDK r8 and prior:
-        ${NDKROOT}/platforms/android-9/arch-arm/usr/include
-        ${NDKROOT}/sources/cxx-stl/gnu-libstdc++/include
-        ${NDKROOT}/sources/cxx-stl/gnu-libstdc++/libs/armeabi-v7a/include
-        ${ProjDirPath}/../../sdk/native/jni/include
-
-        # for NDK r8b and later:
-        ${NDKROOT}/platforms/android-9/arch-arm/usr/include
-        ${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/include
-        ${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include
-        ${ProjDirPath}/../../sdk/native/jni/include
-    @endcode
-    The last path should be changed to the correct absolute or relative path to OpenCV4Android SDK
-    location.
-
-    This should clear the syntax error notifications in Eclipse C++ editor.
-
-    ![](images/eclipse_cdt_cfg8.png)
-
-Debugging and Testing
----------------------
-
-In this section we will give you some easy-to-follow instructions on how to set up an emulator or
-hardware device for testing and debugging an Android project.
-
-### AVD
-
-AVD (*Android Virtual Device*) is not probably the most convenient way to test an OpenCV-dependent
-application, but sure the most uncomplicated one to configure.
-
--#  Assuming you already have *Android SDK* and *Eclipse IDE* installed, in Eclipse go
-    Window -\> AVD Manager.
--#  Press the New button in AVD Manager window.
--#  Create new Android Virtual Device window will let you select some properties for your new
-    device, like target API level, size of SD-card and other.
-
-    ![](images/AVD_create.png)
-
--#  When you click the Create AVD button, your new AVD will be available in AVD Manager.
--#  Press Start to launch the device. Be aware that any AVD (a.k.a. Emulator) is usually much slower
-    than a hardware Android device, so it may take up to several minutes to start.
--#  Go Run -\> Run/Debug in Eclipse IDE to run your application in regular or debugging mode.
-    Device Chooser will let you choose among the running devices or to start a new one.
-
-### Hardware Device
-
-If you have an Android device, you can use it to test and debug your applications. This way is more
-authentic, though a little bit harder to set up. You need to make some actions for Windows and Linux
-operating systems to be able to work with Android devices. No extra actions are required for Mac OS.
-See detailed information on configuring hardware devices in subsections below.
-
-You may also consult the official [Android Developers site
-instructions](http://developer.android.com/tools/device.html) for more information.
-
-#### Windows host computer
+If you have Windows 10 or higher then you don't have to do additional actions to connect
+a phone and run samples on it. However, earlier Windows versions require a longer procedure:
 
 -#  Enable USB debugging on the Android device (via Settings menu).
 -#  Attach the Android device to your PC with a USB cable.
@@ -475,10 +163,10 @@ instructions](http://developer.android.com/tools/device.html) for more informati
 -#  Now, in Eclipse go Run -\> Run/Debug to run your application in regular or debugging mode.
     Device Chooser will let you choose among the devices.
 
-#### Linux host computer
+### Linux host computer
 
-By default Linux doesn't recognize Android devices, but it's easy to fix this issue. On Ubuntu Linux
-you have to create a new **/etc/udev/rules.d/51-android.rules** configuration file that contains
+While the latest Ubuntu versions work well with connected Android devices, there can be issues on older versions.
+However, most of them can be fixed easily. You have to create a new **/etc/udev/rules.d/51-android.rules** configuration file that contains
 information about your Android device. You may find some Vendor ID's
 [here](http://developer.android.com/tools/device.html#VendorIds) or execute lsusb command to view
 VendorID of plugged Android device. Here is an example of such file for LG device:
@@ -488,14 +176,20 @@ SUBSYSTEM=="usb", ATTR{idVendor}=="1004",  MODE="0666", GROUP="plugdev"
 Then restart your adb server (even better to restart the system), plug in your Android device and
 execute adb devices command. You will see the list of attached devices:
 
-![](images/usb_device_connect_ubuntu.png)
+```
+savuor@rostislav-laptop:~/Android/Sdk/platform-tools$ ./adb devices
+List of devices attached
+R58MB40Q3VP     device
+
+savuor@rostislav-laptop:~/Android/Sdk/platform-tools$
+```
 
-#### Mac OS host computer
+### Mac OS host computer
 
 No actions are required, just connect your device via USB and run adb devices to check connection.
 
 What's next
 -----------
 
-Now, when you have your development environment set up and configured, you may want to proceed to
-installing OpenCV4Android SDK. You can learn how to do that in a separate @ref tutorial_O4A_SDK tutorial.
+Now, when you have your instance of OpenCV4Adroid SDK set up and configured, you may want to proceed
+to using OpenCV in your own application. You can learn how to do that in a separate @ref tutorial_dev_with_OCV_on_Android tutorial.
diff --git a/doc/tutorials/introduction/android_binary_package/android_dnn_intro.markdown b/doc/tutorials/introduction/android_binary_package/android_dnn_intro.markdown
new file mode 100644
index 000000000000..871ac5e588b1
--- /dev/null
+++ b/doc/tutorials/introduction/android_binary_package/android_dnn_intro.markdown
@@ -0,0 +1,85 @@
+# How to run deep networks on Android device {#tutorial_android_dnn_intro}
+
+@tableofcontents
+
+@prev_tutorial{tutorial_dev_with_OCV_on_Android}
+@next_tutorial{tutorial_android_ocl_intro}
+
+@see @ref tutorial_table_of_content_dnn
+
+|    |    |
+| -: | :- |
+| Original author | Dmitry Kurtaev |
+| Compatibility | OpenCV >= 4.9 |
+
+## Introduction
+In this tutorial you'll know how to run deep learning networks on Android device
+using OpenCV deep learning module.
+Tutorial was written for Android Studio 2022.2.1.
+
+## Requirements
+
+- Download and install Android Studio from https://developer.android.com/studio.
+
+- Get the latest pre-built OpenCV for Android release from https://github.com/opencv/opencv/releases
+and unpack it (for example, `opencv-4.X.Y-android-sdk.zip`, minimum version 4.9 is required).
+
+- Download MobileNet object detection model from https://github.com/chuanqi305/MobileNet-SSD.
+Configuration file `MobileNetSSD_deploy.prototxt` and model weights `MobileNetSSD_deploy.caffemodel`
+are required.
+
+## Create an empty Android Studio project and add OpenCV dependency
+
+Use @ref tutorial_dev_with_OCV_on_Android tutorial to initialize your project and add OpenCV.
+
+## Make an app
+
+Our sample will takes pictures from a camera, forwards it into a deep network and
+receives a set of rectangles, class identifiers and confidence values in range [0, 1].
+
+- First of all, we need to add a necessary widget which displays processed
+frames. Modify `app/src/main/res/layout/activity_main.xml`:
+@include android/mobilenet-objdetect/res/layout/activity_main.xml
+
+- Modify `/app/src/main/AndroidManifest.xml` to enable full-screen mode, set up
+a correct screen orientation and allow to use a camera.
+@code{.xml}
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android">
+
+    <application
+        android:label="@string/app_name">
+@endcode
+@snippet android/mobilenet-objdetect/gradle/AndroidManifest.xml mobilenet_tutorial
+
+- Replace content of `app/src/main/java/com/example/myapplication/MainActivity.java` and set a custom package name if necessary:
+
+@snippet android/mobilenet-objdetect/src/org/opencv/samples/opencv_mobilenet/MainActivity.java mobilenet_tutorial_package
+@snippet android/mobilenet-objdetect/src/org/opencv/samples/opencv_mobilenet/MainActivity.java mobilenet_tutorial
+
+- Put downloaded `deploy.prototxt` and `mobilenet_iter_73000.caffemodel`
+into `app/src/main/res/raw` folder. OpenCV DNN model is mainly designed to load ML and DNN models
+from file. Modern Android does not allow it without extra permissions, but provides Java API to load
+bytes from resources. The sample uses alternative DNN API that initializes a model from in-memory
+buffer rather than a file. The following function reads model file from resources and converts it to
+`MatOfBytes` (analog of `std::vector<char>` in C++ world) object suitable for OpenCV Java API:
+
+@snippet android/mobilenet-objdetect/src/org/opencv/samples/opencv_mobilenet/MainActivity.java mobilenet_tutorial_resource
+
+And then the network initialization is done with the following lines:
+
+@snippet android/mobilenet-objdetect/src/org/opencv/samples/opencv_mobilenet/MainActivity.java init_model_from_memory
+
+See also [Android documentation on resources](https://developer.android.com/guide/topics/resources/providing-resources.html)
+
+- Take a look how DNN model input is prepared and inference result is interpreted:
+
+@snippet android/mobilenet-objdetect/src/org/opencv/samples/opencv_mobilenet/MainActivity.java mobilenet_handle_frame
+
+`Dnn.blobFromImage` converts camera frame to neural network input tensor. Resize and statistical
+normalization are applied. Each line of network output tensor contains information on one detected
+object in the following order: confidence in range [0, 1], class id, left, top, right, bottom box
+coordinates. All coordinates are in range [0, 1] and should be scaled to image size before rendering.
+
+- Launch an application and make a fun!
+![](images/11_demo.jpg)
diff --git a/doc/tutorials/introduction/android_binary_package/android_ocl_intro.markdown b/doc/tutorials/introduction/android_binary_package/android_ocl_intro.markdown
index b06914a9086b..2cab424623a0 100644
--- a/doc/tutorials/introduction/android_binary_package/android_ocl_intro.markdown
+++ b/doc/tutorials/introduction/android_binary_package/android_ocl_intro.markdown
@@ -1,26 +1,24 @@
 Use OpenCL in Android camera preview based CV application {#tutorial_android_ocl_intro}
 =====================================
 
-@prev_tutorial{tutorial_dev_with_OCV_on_Android}
+@prev_tutorial{tutorial_android_dnn_intro}
 @next_tutorial{tutorial_macos_install}
 
 |    |    |
 | -: | :- |
-| Original author | Andrey Pavlenko |
-| Compatibility | OpenCV >= 3.0 |
-
-@warning
-This tutorial is deprecated.
+| Original author | Andrey Pavlenko, Alexander Panov |
+| Compatibility   | OpenCV >= 4.9 |
 
 This guide was designed to help you in use of [OpenCL &trade;](https://www.khronos.org/opencl/) in Android camera preview based CV application.
-It was written for [Eclipse-based ADT tools](http://developer.android.com/tools/help/adt.html)
-(deprecated by Google now), but it easily can be reproduced with [Android Studio](http://developer.android.com/tools/studio/index.html).
+Tutorial was written for [Android Studio](http://developer.android.com/tools/studio/index.html) 2022.2.1. It was tested with Ubuntu 22.04.
 
 This tutorial assumes you have the following installed and configured:
 
--   JDK
--   Android SDK and NDK
--   Eclipse IDE with ADT and CDT plugins
+-   Android Studio (2022.2.1.+)
+-   JDK 17
+-   Android SDK
+-   Android NDK (25.2.9519653+)
+-   download OpenCV source code from [github](git@github.com:opencv/opencv.git) or from [releases](https://opencv.org/releases/) and build by [instruction on wiki](https://github.com/opencv/opencv/wiki/Custom-OpenCV-Android-SDK-and-AAR-package-build).
 
 It also assumes that you are familiar with Android Java and JNI programming basics.
 If you need help with anything of the above, you may refer to our @ref tutorial_android_dev_intro guide.
@@ -30,6 +28,56 @@ This tutorial also assumes you have an Android operated device with OpenCL enabl
 The related source code is located within OpenCV samples at
 [opencv/samples/android/tutorial-4-opencl](https://github.com/opencv/opencv/tree/4.x/samples/android/tutorial-4-opencl/) directory.
 
+How to build custom OpenCV Android SDK with OpenCL
+--------------------------------------------------
+
+1. __Assemble and configure Android OpenCL SDK.__
+The JNI part of the sample depends on standard Khornos OpenCL headers, and C++ wrapper for OpenCL and libOpenCL.so.
+The standard OpenCL headers may be copied from 3rdparty directory in OpenCV repository or you Linux distribution package.
+C++ wrapper is available in [official Khronos reposiotry on Github](https://github.com/KhronosGroup/OpenCL-CLHPP).
+Copy the header files to didicated directory in the following way:
+@code{.bash}
+cd your_path/ && mkdir ANDROID_OPENCL_SDK && mkdir ANDROID_OPENCL_SDK/include && cd ANDROID_OPENCL_SDK/include
+cp -r path_to_opencv/opencv/3rdparty/include/opencl/1.2/CL . && cd CL
+wget https://github.com/KhronosGroup/OpenCL-CLHPP/raw/main/include/CL/opencl.hpp
+wget https://github.com/KhronosGroup/OpenCL-CLHPP/raw/main/include/CL/cl2.hpp
+@endcode
+libOpenCL.so may be provided with BSP or just downloaded from any OpenCL-cabaple Android device with relevant arhitecture.
+@code{.bash}
+cd your_path/ANDROID_OPENCL_SDK && mkdir lib && cd lib
+adb pull /system/vendor/lib64/libOpenCL.so
+@endcode
+System verison of libOpenCL.so may have a lot of platform specific dependencies. `-Wl,--allow-shlib-undefined` flag allows
+to ignore 3rdparty symbols if they are not used during the build.
+The following CMake line allows to link the JNI part against standard OpenCL, but not include the loadLibrary into
+application package. System OpenCL API is used in run-time.
+@code
+target_link_libraries(${target} -lOpenCL)
+@endcode
+
+
+2. __Build custom OpenCV Android SDK with OpenCL.__
+OpenCL support (T-API) is disabled in OpenCV builds for Android OS by default.
+but it's possible to rebuild locally OpenCV for Android with OpenCL/T-API enabled: use `-DWITH_OPENCL=ON` option for CMake.
+You also need to specify the path to the Android OpenCL SDK: use `-DANDROID_OPENCL_SDK=path_to_your_Android_OpenCL_SDK` option for CMake.
+If you are building OpenCV using `build_sdk.py` please follow [instruction on wiki](https://github.com/opencv/opencv/wiki/Custom-OpenCV-Android-SDK-and-AAR-package-build).
+Set these CMake parameters in your `.config.py`, e.g. `ndk-18-api-level-21.config.py`:
+@code{.py}
+ABI("3", "arm64-v8a", None, 21, cmake_vars=dict('WITH_OPENCL': 'ON', 'ANDROID_OPENCL_SDK': 'path_to_your_Android_OpenCL_SDK'))
+@endcode
+If you are building OpenCV using cmake/ninja, use this bash script (set your NDK_VERSION and your paths instead of examples of paths):
+@code{.bash}
+cd path_to_opencv && mkdir build && cd build
+export NDK_VERSION=25.2.9519653
+export ANDROID_SDK=/home/user/Android/Sdk/
+export ANDROID_OPENCL_SDK=/path_to_ANDROID_OPENCL_SDK/
+export ANDROID_HOME=$ANDROID_SDK
+export ANDROID_NDK_HOME=$ANDROID_SDK/ndk/$NDK_VERSION/
+cmake -GNinja -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_HOME/build/cmake/android.toolchain.cmake -DANDROID_STL=c++_shared -DANDROID_NATIVE_API_LEVEL=24
+-DANDROID_SDK=$ANDROID_SDK -DANDROID_NDK=$ANDROID_NDK_HOME -DBUILD_JAVA=ON -DANDROID_HOME=$ANDROID_SDK -DBUILD_ANDROID_EXAMPLES=ON
+-DINSTALL_ANDROID_EXAMPLES=ON -DANDROID_ABI=arm64-v8a -DWITH_OPENCL=ON -DANDROID_OPENCL_SDK=$ANDROID_OPENCL_SDK ..
+@endcode
+
 Preface
 -------
 
@@ -97,74 +145,16 @@ public class Tutorial4Activity extends Activity {
 
 And a minimal `View` class respectively:
 
-@code{.java}
-public class MyGLSurfaceView extends GLSurfaceView {
-
-    MyGLRendererBase mRenderer;
-
-    public MyGLSurfaceView(Context context) {
-        super(context);
-
-        if(android.os.Build.VERSION.SDK_INT >= 21)
-            mRenderer = new Camera2Renderer(this);
-        else
-            mRenderer = new CameraRenderer(this);
-
-        setEGLContextClientVersion(2);
-        setRenderer(mRenderer);
-        setRenderMode(GLSurfaceView.RENDERMODE_WHEN_DIRTY);
-    }
-
-    @Override
-    public void surfaceCreated(SurfaceHolder holder) {
-        super.surfaceCreated(holder);
-    }
-
-    @Override
-    public void surfaceDestroyed(SurfaceHolder holder) {
-        super.surfaceDestroyed(holder);
-    }
-
-    @Override
-    public void surfaceChanged(SurfaceHolder holder, int format, int w, int h) {
-        super.surfaceChanged(holder, format, w, h);
-    }
-
-    @Override
-    public void onResume() {
-        super.onResume();
-        mRenderer.onResume();
-    }
-
-    @Override
-    public void onPause() {
-        mRenderer.onPause();
-        super.onPause();
-    }
-}
-@endcode
+@snippet samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/MyGLSurfaceView.java minimal_surface_view
 
-__Note__: we use two renderer classes: one for legacy [Camera](http://developer.android.com/reference/android/hardware/Camera.html) API
+@note we use two renderer classes: one for legacy [Camera](http://developer.android.com/reference/android/hardware/Camera.html) API
 and another for modern [Camera2](http://developer.android.com/reference/android/hardware/camera2/package-summary.html).
 
 A minimal `Renderer` class can be implemented in Java (OpenGL ES 2.0 [available](http://developer.android.com/reference/android/opengl/GLES20.html) in Java),
 but since we are going to modify the preview texture with OpenCL let's move OpenGL stuff to JNI.
 Here is a simple Java wrapper for our JNI stuff:
 
-@code{.java}
-public class NativeGLRenderer {
-    static
-    {
-        System.loadLibrary("opencv_java4"); // comment this when using OpenCV Manager
-        System.loadLibrary("JNIrender");
-    }
-
-    public static native int initGL();
-    public static native void closeGL();
-    public static native void drawFrame();
-    public static native void changeSize(int width, int height);
-}
-@endcode
+@snippet samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/NativePart.java native_part
 
 Since `Camera` and `Camera2` APIs differ significantly in camera setup and control, let's create a base class for the two corresponding renderers:
 
@@ -275,126 +265,21 @@ After that we can read (_copy_) pixel data from C/C++ via `glReadPixels()` and w
 
 Also that `GL_TEXTURE_2D` texture can be shared with OpenCL without copying, but we have to create OpenCL context with special way for that:
 
-@code{.cpp}
-void initCL()
-{
-    EGLDisplay mEglDisplay = eglGetCurrentDisplay();
-    if (mEglDisplay == EGL_NO_DISPLAY)
-        LOGE("initCL: eglGetCurrentDisplay() returned 'EGL_NO_DISPLAY', error = %x", eglGetError());
-
-    EGLContext mEglContext = eglGetCurrentContext();
-    if (mEglContext == EGL_NO_CONTEXT)
-        LOGE("initCL: eglGetCurrentContext() returned 'EGL_NO_CONTEXT', error = %x", eglGetError());
-
-    cl_context_properties props[] =
-    {   CL_GL_CONTEXT_KHR,   (cl_context_properties) mEglContext,
-        CL_EGL_DISPLAY_KHR,  (cl_context_properties) mEglDisplay,
-        CL_CONTEXT_PLATFORM, 0,
-        0 };
-
-    try
-    {
-        cl::Platform p = cl::Platform::getDefault();
-        std::string ext = p.getInfo<CL_PLATFORM_EXTENSIONS>();
-        if(ext.find("cl_khr_gl_sharing") == std::string::npos)
-            LOGE("Warning: CL-GL sharing isn't supported by PLATFORM");
-        props[5] = (cl_context_properties) p();
-
-        theContext = cl::Context(CL_DEVICE_TYPE_GPU, props);
-        std::vector<cl::Device> devs = theContext.getInfo<CL_CONTEXT_DEVICES>();
-        LOGD("Context returned %d devices, taking the 1st one", devs.size());
-        ext = devs[0].getInfo<CL_DEVICE_EXTENSIONS>();
-        if(ext.find("cl_khr_gl_sharing") == std::string::npos)
-            LOGE("Warning: CL-GL sharing isn't supported by DEVICE");
-
-        theQueue = cl::CommandQueue(theContext, devs[0]);
-
-        // ...
-    }
-    catch(cl::Error& e)
-    {
-        LOGE("cl::Error: %s (%d)", e.what(), e.err());
-    }
-    catch(std::exception& e)
-    {
-        LOGE("std::exception: %s", e.what());
-    }
-    catch(...)
-    {
-        LOGE( "OpenCL info: unknown error while initializing OpenCL stuff" );
-    }
-    LOGD("initCL completed");
-}
-@endcode
-
-@note To build this JNI code you need __OpenCL 1.2__ headers from [Khronos web site](https://www.khronos.org/registry/cl/api/1.2/) and
-the __libOpenCL.so__ downloaded from the device you'll run the application.
+@snippet samples/android/tutorial-4-opencl/jni/CLprocessor.cpp init_opencl
 
 Then the texture can be wrapped by a `cl::ImageGL` object and processed via OpenCL calls:
-@code{.cpp}
-    cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY,  GL_TEXTURE_2D, 0, texIn);
-    cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut);
-
-    std::vector < cl::Memory > images;
-    images.push_back(imgIn);
-    images.push_back(imgOut);
-    theQueue.enqueueAcquireGLObjects(&images);
-    theQueue.finish();
-
-    cl::Kernel Laplacian = ...
-    Laplacian.setArg(0, imgIn);
-    Laplacian.setArg(1, imgOut);
-    theQueue.finish();
-
-    theQueue.enqueueNDRangeKernel(Laplacian, cl::NullRange, cl::NDRange(w, h), cl::NullRange);
-    theQueue.finish();
-
-    theQueue.enqueueReleaseGLObjects(&images);
-    theQueue.finish();
-@endcode
+
+@snippet samples/android/tutorial-4-opencl/jni/CLprocessor.cpp process_pure_opencl
 
 ### OpenCV T-API
 
 But instead of writing OpenCL code by yourselves you may want to use __OpenCV T-API__ that calls OpenCL implicitly.
 All that you need is to pass the created OpenCL context to OpenCV (via `cv::ocl::attachContext()`) and somehow wrap OpenGL texture with `cv::UMat`.
 Unfortunately `UMat` keeps OpenCL _buffer_ internally, that can't be wrapped over either OpenGL _texture_ or OpenCL _image_ - so we have to copy image data here:
-@code{.cpp}
-    cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY,  GL_TEXTURE_2D, 0, tex);
-    std::vector < cl::Memory > images(1, imgIn);
-    theQueue.enqueueAcquireGLObjects(&images);
-    theQueue.finish();
-
-    cv::UMat uIn, uOut, uTmp;
-    cv::ocl::convertFromImage(imgIn(), uIn);
-    theQueue.enqueueReleaseGLObjects(&images);
-
-    cv::Laplacian(uIn, uTmp, CV_8U);
-    cv:multiply(uTmp, 10, uOut);
-    cv::ocl::finish();
-
-    cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, tex);
-    images.clear();
-    images.push_back(imgOut);
-    theQueue.enqueueAcquireGLObjects(&images);
-    cl_mem clBuffer = (cl_mem)uOut.handle(cv::ACCESS_READ);
-    cl_command_queue q = (cl_command_queue)cv::ocl::Queue::getDefault().ptr();
-    size_t offset = 0;
-    size_t origin[3] = { 0, 0, 0 };
-    size_t region[3] = { w, h, 1 };
-    CV_Assert(clEnqueueCopyBufferToImage (q, clBuffer, imgOut(), offset, origin, region, 0, NULL, NULL) == CL_SUCCESS);
-    theQueue.enqueueReleaseGLObjects(&images);
-    cv::ocl::finish();
-@endcode
 
-- @note We have to make one more image data copy when placing back the modified image to the original OpenGL texture via OpenCL image wrapper.
-- @note By default the OpenCL support (T-API) is disabled in OpenCV builds for Android OS (so it's absent in official packages as of version 3.0),
-  but it's possible to rebuild locally OpenCV for Android with OpenCL/T-API enabled: use `-DWITH_OPENCL=YES` option for CMake.
-  @code{.cmd}
-  cd opencv-build-android
-  path/to/cmake.exe -GNinja -DCMAKE_MAKE_PROGRAM="path/to/ninja.exe" -DCMAKE_TOOLCHAIN_FILE=path/to/opencv/platforms/android/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a with NEON" -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON path/to/opencv
-  path/to/ninja.exe install/strip
-  @endcode
-  To use your own modified `libopencv_java4.so` you have to keep inside your APK, not to use OpenCV Manager and load it manually via `System.loadLibrary("opencv_java4")`.
+@snippet samples/android/tutorial-4-opencl/jni/CLprocessor.cpp process_tapi
+
+@note We have to make one more image data copy when placing back the modified image to the original OpenGL texture via OpenCL image wrapper.
 
 Performance notes
 -----------------
diff --git a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown
index d37721a188c2..331c6bfb5115 100644
--- a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown
+++ b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown
@@ -1,384 +1,234 @@
 Android Development with OpenCV {#tutorial_dev_with_OCV_on_Android}
 ===============================
 
-@prev_tutorial{tutorial_O4A_SDK}
-@next_tutorial{tutorial_android_ocl_intro}
+@prev_tutorial{tutorial_android_dev_intro}
+@next_tutorial{tutorial_android_dnn_intro}
 
 |    |    |
 | -: | :- |
-| Original author | Vsevolod Glumov |
-| Compatibility | OpenCV >= 3.0 |
-
-@warning
-This tutorial is deprecated.
+| Original authors | Alexander Panov, Rostislav Vasilikhin |
+| Compatibility | OpenCV >= 4.9.0 |
 
 This tutorial has been created to help you use OpenCV library within your Android project.
 
-This guide was written with Windows 7 in mind, though it should work with any other OS supported by
-OpenCV4Android SDK.
+This guide was checked on Ubuntu but contains no platform-dependent parts, therefore should be compatible with any OS supported by Android Studio and OpenCV4Android SDK.
 
 This tutorial assumes you have the following installed and configured:
 
+-   Android Studio
 -   JDK
 -   Android SDK and NDK
--   Eclipse IDE
--   ADT and CDT plugins for Eclipse
+-   Optional: OpenCV for Android SDK from official [release page on Github](https://github.com/opencv/opencv/releases)
+    or [SourceForge](https://sourceforge.net/projects/opencvlibrary/). Advanced: as alternative the SDK may be
+    built from source code by [instruction on wiki](https://github.com/opencv/opencv/wiki/Custom-OpenCV-Android-SDK-and-AAR-package-build).
 
 If you need help with anything of the above, you may refer to our @ref tutorial_android_dev_intro guide.
 
-This tutorial also assumes you have OpenCV4Android SDK already installed on your development machine
-and OpenCV Manager on your testing device correspondingly. If you need help with any of these, you
-may consult our @ref tutorial_O4A_SDK tutorial.
-
-If you encounter any error after thoroughly following these steps, feel free to contact us via
-[OpenCV4Android](https://groups.google.com/group/android-opencv/) discussion group or OpenCV [Q&A
-forum](https://forum.opencv.org) . We'll do our best to help you out.
+If you encounter any error after thoroughly following these steps, feel free to contact us via OpenCV [forum](https://forum.opencv.org). We'll do our best to help you out.
 
-Using OpenCV Library Within Your Android Project
-------------------------------------------------
 
-In this section we will explain how to make some existing project to use OpenCV. Starting with 2.4.2
-release for Android, *OpenCV Manager* is used to provide apps with the best available version of
-OpenCV. You can get more information here: `Android OpenCV Manager` and in these
-[slides](https://docs.google.com/a/itseez.com/presentation/d/1EO_1kijgBg_BsjNp2ymk-aarg-0K279_1VZRcPplSuk/present#slide=id.p).
+Hello OpenCV sample with SDK
+----------------------------
 
-### Java
+In this section we're gonna create a simple app that does nothing but OpenCV loading. In next section we'll extend it to support camera.
 
-#### Application Development with Async Initialization
+In addition to this instruction you can use some video guide, for example [this one](https://www.youtube.com/watch?v=bR7lL886-uc&ab_channel=ProgrammingHut)
 
-Using async initialization is a **recommended** way for application development. It uses the OpenCV
-Manager to access OpenCV libraries externally installed in the target system.
+1. Open Android Studio and create empty project by choosing ***Empty Views Activity***
 
--#  Add OpenCV library project to your workspace. Use menu
-    File -\> Import -\> Existing project in your workspace.
+    ![](images/create_empty_project.png)
 
-    Press Browse button and locate OpenCV4Android SDK (`OpenCV-2.4.9-android-sdk/sdk`).
+2. Setup the project:
+    - Choose ***Java*** language
+    - Choose ***Groovy DSL*** build configuration language
+    - Choose ***Minumum SDK*** with the version number not less than was used during OpenCV 4 Android build
+        - If you don't know it, you can find it in file `OpenCV-android-sdk/sdk/build.gradle` at `android -> defaultConfig -> minSdkVersion`
 
-    ![](images/eclipse_opencv_dependency0.png)
+    ![](images/setup_project.png)
 
--#  In application project add a reference to the OpenCV Java SDK in
-    Project -\> Properties -\> Android -\> Library -\> Add select OpenCV Library - 2.4.9.
 
-    ![](images/eclipse_opencv_dependency1.png)
+3. Click ***File -> New -> Import module...*** and select OpenCV SDK path
 
-In most cases OpenCV Manager may be installed automatically from Google Play. For the case, when
-Google Play is not available, i.e. emulator, developer board, etc, you can install it manually using
-adb tool. See `Manager Selection` for details.
+    ![](images/sdk_path.png)
 
-There is a very base code snippet implementing the async initialization. It shows basic principles.
-See the "15-puzzle" OpenCV sample for details.
-@code{.java}
-public class Sample1Java extends Activity implements CvCameraViewListener {
+4. Set module name as `OpenCV` and press `Finish`
 
-    private BaseLoaderCallback mLoaderCallback = new BaseLoaderCallback(this) {
-        @Override
-        public void onManagerConnected(int status) {
-            switch (status) {
-                case LoaderCallbackInterface.SUCCESS:
-                {
-                    Log.i(TAG, "OpenCV loaded successfully");
-                    mOpenCvCameraView.enableView();
-                } break;
-                default:
-                {
-                    super.onManagerConnected(status);
-                } break;
-            }
-        }
-    };
+    ![](images/module_name.png)
 
-    @Override
-    public void onResume()
-    {
-        super.onResume();
-        OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_2_4_6, this, mLoaderCallback);
+5. OpenCV also provides experiemental Kotlin support. Please add Android Kotlin plugin to `MyApplication/OpenCV/build.gradle` file:
+    @code{.gradle}
+    plugins {
+        id 'org.jetbrains.kotlin.android' version '1.7.10' #version may differ for your setup
     }
-
-    ...
-}
-@endcode
-It this case application works with OpenCV Manager in asynchronous fashion. OnManagerConnected
-callback will be called in UI thread, when initialization finishes. Please note, that it is not
-allowed to use OpenCV calls or load OpenCV-dependent native libs before invoking this callback. Load
-your own native libraries that depend on OpenCV after the successful OpenCV initialization. Default
-BaseLoaderCallback implementation treat application context as Activity and calls Activity.finish()
-method to exit in case of initialization failure. To override this behavior you need to override
-finish() method of BaseLoaderCallback class and implement your own finalization method.
-
-#### Application Development with Static Initialization
-
-According to this approach all OpenCV binaries are included into your application package. It is
-designed mostly for development purposes. This approach is deprecated for the production code,
-release package is recommended to communicate with OpenCV Manager via the async initialization
-described above.
-
--#  Add the OpenCV library project to your workspace the same way as for the async initialization
-    above. Use menu File -\> Import -\> Existing project in your workspace, press Browse button and
-    select OpenCV SDK path (`OpenCV-2.4.9-android-sdk/sdk`).
-
-    ![](images/eclipse_opencv_dependency0.png)
-
--#  In the application project add a reference to the OpenCV4Android SDK in
-    Project -\> Properties -\> Android -\> Library -\> Add select OpenCV Library - 2.4.9;
-
-    ![](images/eclipse_opencv_dependency1.png)
-
--#  If your application project **doesn't have a JNI part**, just copy the corresponding OpenCV
-    native libs from `<OpenCV-2.4.9-android-sdk>/sdk/native/libs/<target_arch>` to your project
-    directory to folder `libs/<target_arch>`.
-
-    In case of the application project **with a JNI part**, instead of manual libraries copying you
-    need to modify your Android.mk file: add the following two code lines after the
-    "include $(CLEAR_VARS)" and before
-    "include path_to_OpenCV-2.4.9-android-sdk/sdk/native/jni/OpenCV.mk"
-    @code{.make}
-    OPENCV_CAMERA_MODULES:=on
-    OPENCV_INSTALL_MODULES:=on
-    @endcode
-    The result should look like the following:
-    @code{.make}
-    include $(CLEAR_VARS)
-
-    # OpenCV
-    OPENCV_CAMERA_MODULES:=on
-    OPENCV_INSTALL_MODULES:=on
-    include ../../sdk/native/jni/OpenCV.mk
     @endcode
-    After that the OpenCV libraries will be copied to your application `libs` folder during the JNI
-    build.v
-
-    Eclipse will automatically include all the libraries from the `libs` folder to the application
-    package (APK).
-
--#  The last step of enabling OpenCV in your application is Java initialization code before calling
-    OpenCV API. It can be done, for example, in the static section of the Activity class:
-    @code{.java}
-    static {
-        if (!OpenCVLoader.initDebug()) {
-            // Handle initialization error
-        }
+    Like this:
+    ![](images/gradle_ocv_fix.png)
+    If you don't do this, you may get an error:
+    @code
+    Task failed with an exception.
+    -----------
+    * Where:
+    Build file '/home/alexander/AndroidStudioProjects/MyApplication/opencv/build.gradle' line: 4
+
+    * What went wrong:
+    A problem occurred evaluating project ':opencv'.
+    > Plugin with id 'kotlin-android' not found.
+    @endcode
+    The fix was found [here](https://stackoverflow.com/questions/73225714/import-opencv-sdk-to-android-studio-chipmunk)
+
+6. OpenCV project uses `buildConfig` feature. Please enable it in
+   `MyApplication/OpenCV/build.gradle` file to `android` block:
+    @code{.gradle}
+    buildFeatures{
+        buildConfig true
     }
+
     @endcode
-    If you application includes other OpenCV-dependent native libraries you should load them
-    **after** OpenCV initialization:
-    @code{.java}
-    static {
-        if (!OpenCVLoader.initDebug()) {
-            // Handle initialization error
-        } else {
-            System.loadLibrary("my_jni_lib1");
-            System.loadLibrary("my_jni_lib2");
-        }
-    }
+    Like this:
+    ![](images/module_gradle_fix.png)
+    If you don't do this, you may get an error:
+    @code
+    JavaCameraView.java:15: error: cannot find symbol import org.opencv.BuildConfig; ^ symbol: class BuildConfig location: package org.opencv
     @endcode
+    The fix was found [here](https://stackoverflow.com/questions/76374886/error-cannot-find-symbol-import-org-opencv-buildconfig-android-studio) and [here](https://forum.opencv.org/t/task-compiledebugjavawithjavac-failed/13667/4)
 
-### Native/C++
+7. Add the module to the project:
+    - Click ***File -> Project structure... -> Dependencies -> All modules -> + (Add Dependency button) -> Module dependency***
 
-To build your own Android application, using OpenCV as native part, the following steps should be
-taken:
+    ![](images/add_module_1.png)
 
--#  You can use an environment variable to specify the location of OpenCV package or just hardcode
-    absolute or relative path in the `jni/Android.mk` of your projects.
--#  The file `jni/Android.mk` should be written for the current application using the common rules
-    for this file.
+    - Choose `app`
 
-    For detailed information see the Android NDK documentation from the Android NDK archive, in the
-    file `<path_where_NDK_is_placed>/docs/ANDROID-MK.html`.
+    ![](images/add_module_2.png)
 
--#  The following line:
-    @code{.make}
-    include C:\Work\OpenCV4Android\OpenCV-2.4.9-android-sdk\sdk\native\jni\OpenCV.mk
-    @endcode
-    Should be inserted into the `jni/Android.mk` file **after** this line:
-    @code{.make}
-    include $(CLEAR_VARS)
-    @endcode
--#  Several variables can be used to customize OpenCV stuff, but you **don't need** to use them when
-    your application uses the async initialization via the OpenCV Manager API.
+    - Select `OpenCV`
 
-    @note These variables should be set **before** the "include .../OpenCV.mk" line:
-    @code{.make}
-    OPENCV_INSTALL_MODULES:=on
-    @endcode
+    ![](images/add_module_3.png)
 
-    Copies necessary OpenCV dynamic libs to the project libs folder in order to include them
-    into the APK.
-    @code{.make}
-    OPENCV_CAMERA_MODULES:=off
-    @endcode
-    Skip native OpenCV camera related libs copying to the project libs folder.
-    @code{.make}
-    OPENCV_LIB_TYPE:=STATIC
-    @endcode
-    Perform static linking with OpenCV. By default dynamic link is used and the project JNI lib
-    depends on libopencv_java.so.
+8. Before using any OpenCV function you have to load the library first. If you application includes other OpenCV-dependent native libraries you should load them ***after*** OpenCV initialization.
+    Add the folowing code to load the library at app start:
+    @snippet samples/android/tutorial-1-camerapreview/src/org/opencv/samples/tutorial1/Tutorial1Activity.java ocv_loader_init
+    Like this:
+    ![](images/sample_code.png)
 
--#  The file `Application.mk` should exist and should contain lines:
-    @code{.make}
-    APP_STL := gnustl_static
-    APP_CPPFLAGS := -frtti -fexceptions
-    @endcode
-    Also, the line like this one:
-    @code{.make}
-    APP_ABI := armeabi-v7a
-    @endcode
-    Should specify the application target platforms.
-
-    In some cases a linkage error (like
-    `"In function 'cv::toUtf16(std::basic_string<...>... undefined reference to 'mbstowcs'"`)
-    happens when building an application JNI library, depending on OpenCV. The following line in the
-    `Application.mk` usually fixes it:
-    @code{.make}
-    APP_PLATFORM := android-9
-    @endcode
+9. Choose a device to check the sample on and run the code by pressing `run` button
 
--#  Either use @ref tutorial_android_dev_intro_ndk "manual"  ndk-build invocation or
-    @ref tutorial_android_dev_intro_eclipse "setup Eclipse CDT Builder" to build native JNI lib
-    before (re)building the Java part and creating
-    an APK.
+    ![](images/run_app.png)
 
-Hello OpenCV Sample
--------------------
+Hello OpenCV sample with Maven Central
+--------------------------------------
 
-Here are basic steps to guide you through the process of creating a simple OpenCV-centric
-application. It will be capable of accessing camera output, processing it and displaying the result.
+Since OpenCV 4.9.0 OpenCV for Android package is available with Maven Central and may be installed
+automatically as Gradle dependency. In this section we're gonna create a simple app that does nothing
+but OpenCV loading with Maven Central.
 
--#  Open Eclipse IDE, create a new clean workspace, create a new Android project
-    File --\> New --\> Android Project
--#  Set name, target, package and minSDKVersion accordingly. The minimal SDK version for build with
-    OpenCV4Android SDK is 11. Minimal device API Level (for application manifest) is 8.
--#  Allow Eclipse to create default activity. Lets name the activity HelloOpenCvActivity.
--#  Choose Blank Activity with full screen layout. Lets name the layout HelloOpenCvLayout.
--#  Import OpenCV library project to your workspace.
--#  Reference OpenCV library within your project properties.
+1. Open Android Studio and create empty project by choosing ***Empty Views Activity***
 
-    ![](images/dev_OCV_reference.png)
+    ![](images/create_empty_project.png)
 
--#  Edit your layout file as xml file and pass the following layout there:
-    @code{.xml}
-    <LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
-        xmlns:tools="http://schemas.android.com/tools"
-        xmlns:opencv="http://schemas.android.com/apk/res-auto"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent" >
-
-        <org.opencv.android.JavaCameraView
-            android:layout_width="fill_parent"
-            android:layout_height="fill_parent"
-            android:visibility="gone"
-            android:id="@+id/HelloOpenCvView"
-            opencv:show_fps="true"
-            opencv:camera_id="any" />
-
-    </LinearLayout>
-    @endcode
--#  Add the following permissions to the `AndroidManifest.xml` file:
-    @code{.xml}
-    </application>
+2. Setup the project:
+    - Choose ***Java*** language
+    - Choose ***Groovy DSL*** build configuration language
+    - Choose ***Minumum SDK*** with the version number not less than OpenCV supports. For 4.9.0 minimal SDK version is 21.
 
-    <uses-permission android:name="android.permission.CAMERA"/>
+    ![](images/setup_project.png)
 
-    <uses-feature android:name="android.hardware.camera" android:required="false"/>
-    <uses-feature android:name="android.hardware.camera.autofocus" android:required="false"/>
-    <uses-feature android:name="android.hardware.camera.front" android:required="false"/>
-    <uses-feature android:name="android.hardware.camera.front.autofocus" android:required="false"/>
-    @endcode
--#  Set application theme in AndroidManifest.xml to hide title and system buttons.
-    @code{.xml}
-    <application
-        android:icon="@drawable/icon"
-        android:label="@string/app_name"
-        android:theme="@android:style/Theme.NoTitleBar.Fullscreen" >
-    @endcode
--#  Add OpenCV library initialization to your activity. Fix errors by adding required imports.
-    @code{.java}
-    private BaseLoaderCallback mLoaderCallback = new BaseLoaderCallback(this) {
-        @Override
-        public void onManagerConnected(int status) {
-            switch (status) {
-                case LoaderCallbackInterface.SUCCESS:
-                {
-                    Log.i(TAG, "OpenCV loaded successfully");
-                    mOpenCvCameraView.enableView();
-                } break;
-                default:
-                {
-                    super.onManagerConnected(status);
-                } break;
-            }
-        }
-    };
-
-    @Override
-    public void onResume()
-    {
-        super.onResume();
-        OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_2_4_6, this, mLoaderCallback);
+3. Edit `build.gradle` and add OpenCV library to Dependencies list like this:
+    @code{.gradle}
+    dependencies {
+        implementation 'org.opencv:opencv:4.9.0'
     }
     @endcode
--#  Defines that your activity implements CvCameraViewListener2 interface and fix activity related
-    errors by defining missed methods. For this activity define onCreate, onDestroy and onPause and
-    implement them according to the code snippet below. Fix errors by adding required imports.
-    @code{.java}
-    private CameraBridgeViewBase mOpenCvCameraView;
-
-    @Override
-    public void onCreate(Bundle savedInstanceState) {
-        Log.i(TAG, "called onCreate");
-        super.onCreate(savedInstanceState);
-        getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
-        setContentView(R.layout.HelloOpenCvLayout);
-        mOpenCvCameraView = (CameraBridgeViewBase) findViewById(R.id.HelloOpenCvView);
-        mOpenCvCameraView.setVisibility(SurfaceView.VISIBLE);
-        mOpenCvCameraView.setCvCameraViewListener(this);
-    }
+   `4.9.0` may be replaced by any version available as [official release](https://central.sonatype.com/artifact/org.opencv/opencv).
 
-    @Override
-    public void onPause()
-    {
-        super.onPause();
-        if (mOpenCvCameraView != null)
-            mOpenCvCameraView.disableView();
-    }
+4. Before using any OpenCV function you have to load the library first. If you application includes other
+   OpenCV-dependent native libraries you should load them ***after*** OpenCV initialization. Add the folowing
+   code to load the library at app start:
+    @snippet samples/android/tutorial-1-camerapreview/src/org/opencv/samples/tutorial1/Tutorial1Activity.java ocv_loader_init
+    Like this:
+    ![](images/sample_code.png)
 
-    public void onDestroy() {
-        super.onDestroy();
-        if (mOpenCvCameraView != null)
-            mOpenCvCameraView.disableView();
-    }
+5. Choose a device to check the sample on and run the code by pressing `run` button
 
-    public void onCameraViewStarted(int width, int height) {
-    }
+    ![](images/run_app.png)
 
-    public void onCameraViewStopped() {
-    }
+Camera view sample
+------------------
 
-    public Mat onCameraFrame(CvCameraViewFrame inputFrame) {
-        return inputFrame.rgba();
-    }
+In this section we'll extend our empty OpenCV app created in the previous section to support camera. We'll take camera frames and display them on the screen.
+
+1. Tell a system that we need camera permissions.
+    Add the following code to the file `MyApplication/app/src/main/AndroidManifest.xml`:
+    @snippet samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml camera_permissions
+    Like this:
+    ![](images/camera_permissions.png)
+
+2. Go to `activity_main.xml` layout and delete TextView with text "Hello World!"
+
+    ![](images/delete_text.png)
+
+    This can also be done in Code or Split mode by removing the `TextView` block from XML file.
+
+3. Add camera view to the layout:
+    1. Add a scheme into layout description:
+    @code{.xml}
+    xmlns:opencv="http://schemas.android.com/apk/res-auto"
     @endcode
--#  Run your application on device or emulator.
-
-Lets discuss some most important steps. Every Android application with UI must implement Activity
-and View. By the first steps we create blank activity and default view layout. The simplest
-OpenCV-centric application must implement OpenCV initialization, create its own view to show preview
-from camera and implements CvCameraViewListener2 interface to get frames from camera and process it.
-
-First of all we create our application view using xml layout. Our layout consists of the only one
-full screen component of class org.opencv.android.JavaCameraView. This class is implemented inside
-OpenCV library. It is inherited from CameraBridgeViewBase, that extends SurfaceView and uses
-standard Android camera API.
-
-After creating layout we need to implement Activity class. OpenCV initialization process has been
-already discussed above. In this sample we use asynchronous initialization. Implementation of
-CvCameraViewListener interface allows you to add processing steps after frame grabbing from camera
-and before its rendering on screen. The most important function is onCameraFrame. It is callback
-function and it is called on retrieving frame from camera. The callback input is object of
-CvCameraViewFrame class that represents frame from camera.
-
-@note Do not save or use CvCameraViewFrame object out of onCameraFrame callback. This object does
-not have its own state and its behavior out of callback is unpredictable!
-
-It has rgba() and gray()
-methods that allows to get frame as RGBA and one channel gray scale Mat respectively. It expects
-that onCameraFrame function returns RGBA frame that will be drawn on the screen.
+
+    2. Replace `TextView` with `org.opencv.android.JavaCameraView` widget:
+    @snippet /samples/android/tutorial-1-camerapreview/res/layout/tutorial1_surface_view.xml camera_view
+
+    3. If you get a layout warning replace `fill_parent` values by `match_parent` for `android:layout_width` and `android:layout_height` properties
+
+    You'll get a code like this:
+
+    @include /samples/android/tutorial-1-camerapreview/res/layout/tutorial1_surface_view.xml
+
+4. Inherit the main class from `org.opencv.android.CameraActivity`. CameraActivity implements
+   camera perimission requiest and some other utilities needed for CV application. Methods we're
+   interested in to override are `onCreate`, `onDestroy`, `onPause`, `onResume` and `getCameraViewList`
+
+5. Implement the interface `org.opencv.android.CameraBridgeViewBase.CvCameraViewListener2`
+   `onCameraFrame` method should return the `Mat` object with content for render.
+    The sample just returns camera frame for preview: `return inputFrame.rgba();`
+
+6. Allocate `org.opencv.android.CameraBridgeViewBase` object:
+    - It should be created at app start (`onCreate` method) and this class should be set as a listener
+    - At pause/resume (`onPause`, `onResume` methods) it should be disabled/enabled
+    - Should be disabled at app finish (`onDestroy` method)
+    - Should be returned in `getCameraViewList`
+
+7. Optionally you can forbid the phone to dim screen or lock:
+
+    @snippet samples/android/tutorial-1-camerapreview/src/org/opencv/samples/tutorial1/Tutorial1Activity.java keep_screen
+
+Finally you'll get source code similar to this:
+
+@include samples/android/tutorial-1-camerapreview/src/org/opencv/samples/tutorial1/Tutorial1Activity.java
+
+This is it! Now you can run the code on your device to check it.
+
+
+Let's discuss some most important steps
+---------------------------------------
+
+Every Android application with UI must implement Activity and View. By the first steps we create blank
+activity and default view layout. The simplest OpenCV-centric application must perform OpenCV
+initialization, create a view to show preview from camera and implement `CvCameraViewListener2` interface
+to get frames from camera and process them.
+
+First of all we create our application view using XML layout. Our layout consists of the only one
+full screen component of class `org.opencv.android.JavaCameraView`. This OpenCV class is inherited from
+ `CameraBridgeViewBase` that extends `SurfaceView` and under the hood uses standard Android camera API.
+
+The `CvCameraViewListener2` interface lets you add some processing steps after the frame is grabbed from
+the camera and before it's rendered on the screen. The most important method is `onCameraFrame`. This is
+a callback function and it's called on retrieving frame from camera. It expects that `onCameraFrame`
+function returns RGBA frame that will be drawn on the screen.
+
+The callback passes a frame from camera to our class as an object of `CvCameraViewFrame` class.
+This object has `rgba()` and `gray()` methods that let a user get colored or one-channel grayscale
+frame as a `Mat` class object.
+
+@note Do not save or use `CvCameraViewFrame` object out of `onCameraFrame` callback. This object does
+not have its own state and its behavior outside the callback is unpredictable!
diff --git a/doc/tutorials/dnn/dnn_android/11_demo.jpg b/doc/tutorials/introduction/android_binary_package/images/11_demo.jpg
similarity index 100%
rename from doc/tutorials/dnn/dnn_android/11_demo.jpg
rename to doc/tutorials/introduction/android_binary_package/images/11_demo.jpg
diff --git a/doc/tutorials/introduction/android_binary_package/images/AVD_create.png b/doc/tutorials/introduction/android_binary_package/images/AVD_create.png
deleted file mode 100644
index f55ea51d7376..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/AVD_create.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/AVD_empty.png b/doc/tutorials/introduction/android_binary_package/images/AVD_empty.png
deleted file mode 100644
index 6989f7e16768..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/AVD_empty.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/add_module_1.png b/doc/tutorials/introduction/android_binary_package/images/add_module_1.png
new file mode 100644
index 000000000000..2653c433bba2
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/add_module_1.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/add_module_2.png b/doc/tutorials/introduction/android_binary_package/images/add_module_2.png
new file mode 100644
index 000000000000..55ac7bcdfcef
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/add_module_2.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/add_module_3.png b/doc/tutorials/introduction/android_binary_package/images/add_module_3.png
new file mode 100644
index 000000000000..b9be8e8bb6e2
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/add_module_3.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/android_emulator_opencv_manager_fail.png b/doc/tutorials/introduction/android_binary_package/images/android_emulator_opencv_manager_fail.png
deleted file mode 100644
index 04839636ec87..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/android_emulator_opencv_manager_fail.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/android_package_7zip.png b/doc/tutorials/introduction/android_binary_package/images/android_package_7zip.png
deleted file mode 100644
index 3bd6aa3457f1..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/android_package_7zip.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/android_sdk_and_avd_manager.png b/doc/tutorials/introduction/android_binary_package/images/android_sdk_and_avd_manager.png
deleted file mode 100644
index cf5eb7d1abf8..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/android_sdk_and_avd_manager.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/camera_permissions.png b/doc/tutorials/introduction/android_binary_package/images/camera_permissions.png
new file mode 100644
index 000000000000..f7622dee8828
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/camera_permissions.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/choose_device.png b/doc/tutorials/introduction/android_binary_package/images/choose_device.png
new file mode 100644
index 000000000000..76f3c3060fb2
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/choose_device.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/cmd_adb_devices.png b/doc/tutorials/introduction/android_binary_package/images/cmd_adb_devices.png
deleted file mode 100644
index e0e4853deca5..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/cmd_adb_devices.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/create_empty_project.png b/doc/tutorials/introduction/android_binary_package/images/create_empty_project.png
new file mode 100644
index 000000000000..609f258d4574
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/create_empty_project.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/delete_text.png b/doc/tutorials/introduction/android_binary_package/images/delete_text.png
new file mode 100644
index 000000000000..d865ff92cc2a
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/delete_text.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/dev_OCV_new_class.png b/doc/tutorials/introduction/android_binary_package/images/dev_OCV_new_class.png
deleted file mode 100644
index 3a75b1108136..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/dev_OCV_new_class.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/dev_OCV_reference.png b/doc/tutorials/introduction/android_binary_package/images/dev_OCV_reference.png
deleted file mode 100644
index 5179b2343097..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/dev_OCV_reference.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/device_details.png b/doc/tutorials/introduction/android_binary_package/images/device_details.png
deleted file mode 100644
index 9c0a94000bad..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/device_details.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_10_crystal_clean.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_10_crystal_clean.png
deleted file mode 100644
index 499247a48261..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_10_crystal_clean.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_11_run_as.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_11_run_as.png
deleted file mode 100644
index 46c584d43b62..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_11_run_as.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_1_choose_workspace.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_1_choose_workspace.png
deleted file mode 100644
index af06bd9b0d3d..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_1_choose_workspace.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_1a_locate_sdk.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_1a_locate_sdk.png
deleted file mode 100644
index 95bb41e4bd88..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_1a_locate_sdk.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_2_window_preferences.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_2_window_preferences.png
deleted file mode 100644
index 414673547ac2..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_2_window_preferences.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_3_preferences_android.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_3_preferences_android.png
deleted file mode 100644
index 76a0589d71e5..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_3_preferences_android.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_4_locate_sdk.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_4_locate_sdk.png
deleted file mode 100644
index 9657cc5c4b47..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_4_locate_sdk.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_5_import_command.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_5_import_command.png
deleted file mode 100644
index f1117950142a..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_5_import_command.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_6_import_existing_projects.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_6_import_existing_projects.png
deleted file mode 100644
index c397a834c656..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_6_import_existing_projects.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_7_select_projects.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_7_select_projects.png
deleted file mode 100644
index e152bc4ed2cd..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_7_select_projects.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_8_false_alarm.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_8_false_alarm.png
deleted file mode 100644
index 43295daf8be9..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_8_false_alarm.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_8a_target.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_8a_target.png
deleted file mode 100644
index cee04cdfde37..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_8a_target.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_8b_fix_props.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_8b_fix_props.png
deleted file mode 100644
index 061be3dc6120..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_8b_fix_props.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_9_errors_dissapearing.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_9_errors_dissapearing.png
deleted file mode 100644
index 5a9157237be7..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_9_errors_dissapearing.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_NDK_build_success.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_NDK_build_success.png
deleted file mode 100644
index 0bf2a85287ee..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_NDK_build_success.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_builder_types.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_builder_types.png
deleted file mode 100644
index 61673a369940..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_builder_types.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_builders.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_builders.png
deleted file mode 100644
index 69d90d839dfa..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_builders.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg1.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg1.png
deleted file mode 100644
index 41e489f97408..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg1.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg2.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg2.png
deleted file mode 100644
index e216b58fe17c..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg2.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg3.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg3.png
deleted file mode 100644
index fd73103b286d..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg3.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg4.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg4.png
deleted file mode 100644
index f8126b5b5695..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg4.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg5.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg5.png
deleted file mode 100644
index e4e6f4fc5650..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg5.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg6.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg6.png
deleted file mode 100644
index 8212ae700ebd..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg6.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg7.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg7.png
deleted file mode 100644
index 205a06593c9b..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg7.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg8.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg8.png
deleted file mode 100644
index d5253bd9b5a8..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg8.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_build_options.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_build_options.png
deleted file mode 100644
index 9a4683d5bf26..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_build_options.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_build_resources.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_build_resources.png
deleted file mode 100644
index 9c3480ea7657..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_build_resources.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_main.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_main.png
deleted file mode 100644
index 4c3bea0c8627..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_main.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_refresh.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_refresh.png
deleted file mode 100644
index 4e36593b985d..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_refresh.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_specify_resources.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_specify_resources.png
deleted file mode 100644
index bce08a883194..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_edit_configuration_specify_resources.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_inst_adt.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_inst_adt.png
deleted file mode 100644
index 94491e0a619a..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_inst_adt.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_inst_cdt.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_inst_cdt.png
deleted file mode 100644
index ab34a6783fce..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_inst_cdt.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_inst_cdt_2.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_inst_cdt_2.png
deleted file mode 100644
index 3cb773e79b0d..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_inst_cdt_2.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_ndk_build.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_ndk_build.png
deleted file mode 100644
index 0fdc59a9fce0..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_ndk_build.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_opencv_dependency0.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_opencv_dependency0.png
deleted file mode 100644
index 5c5673b48174..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_opencv_dependency0.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_opencv_dependency1.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_opencv_dependency1.png
deleted file mode 100644
index 5fc63da9baa9..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_opencv_dependency1.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/eclipse_windows_environment.png b/doc/tutorials/introduction/android_binary_package/images/eclipse_windows_environment.png
deleted file mode 100644
index 5deced55c998..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/eclipse_windows_environment.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/emulator_canny.png b/doc/tutorials/introduction/android_binary_package/images/emulator_canny.png
deleted file mode 100644
index d08340be9cee..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/emulator_canny.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/gradle_ocv_fix.png b/doc/tutorials/introduction/android_binary_package/images/gradle_ocv_fix.png
new file mode 100644
index 000000000000..abaddce00f2c
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/gradle_ocv_fix.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/module_gradle_fix.png b/doc/tutorials/introduction/android_binary_package/images/module_gradle_fix.png
new file mode 100644
index 000000000000..235f012e3c4b
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/module_gradle_fix.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/module_name.png b/doc/tutorials/introduction/android_binary_package/images/module_name.png
new file mode 100644
index 000000000000..e0c85b0ec7ed
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/module_name.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/ndk_build.png b/doc/tutorials/introduction/android_binary_package/images/ndk_build.png
deleted file mode 100644
index 633d0684e97d..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/ndk_build.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/run_app.png b/doc/tutorials/introduction/android_binary_package/images/run_app.png
new file mode 100644
index 000000000000..97529779b76d
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/run_app.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/sample_code.png b/doc/tutorials/introduction/android_binary_package/images/sample_code.png
new file mode 100644
index 000000000000..e800184ae0a1
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/sample_code.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/sdk_ndk_manager.png b/doc/tutorials/introduction/android_binary_package/images/sdk_ndk_manager.png
new file mode 100644
index 000000000000..fcd1ad006072
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/sdk_ndk_manager.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/sdk_path.png b/doc/tutorials/introduction/android_binary_package/images/sdk_path.png
new file mode 100644
index 000000000000..0f8fbd770350
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/sdk_path.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/setup_project.png b/doc/tutorials/introduction/android_binary_package/images/setup_project.png
new file mode 100644
index 000000000000..5586b1786fac
Binary files /dev/null and b/doc/tutorials/introduction/android_binary_package/images/setup_project.png differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/usb_device_connect_13.png b/doc/tutorials/introduction/android_binary_package/images/usb_device_connect_13.png
deleted file mode 100644
index 4d7c84fbc703..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/usb_device_connect_13.png and /dev/null differ
diff --git a/doc/tutorials/introduction/android_binary_package/images/usb_device_connect_ubuntu.png b/doc/tutorials/introduction/android_binary_package/images/usb_device_connect_ubuntu.png
deleted file mode 100644
index fb9f72a45f11..000000000000
Binary files a/doc/tutorials/introduction/android_binary_package/images/usb_device_connect_ubuntu.png and /dev/null differ
diff --git a/doc/tutorials/introduction/building_tegra_cuda/building_tegra_cuda.markdown b/doc/tutorials/introduction/building_tegra_cuda/building_tegra_cuda.markdown
index a56c0fa17f8a..e1c129f0e189 100644
--- a/doc/tutorials/introduction/building_tegra_cuda/building_tegra_cuda.markdown
+++ b/doc/tutorials/introduction/building_tegra_cuda/building_tegra_cuda.markdown
@@ -15,7 +15,7 @@ This tutorial is deprecated.
 @tableofcontents
 
 OpenCV with CUDA for Tegra
-==========================
+--------------------------
 
 This document is a basic guide to building the OpenCV libraries with CUDA support for use in the Tegra environment. It covers the basic elements of building the version 3.1.0 libraries from source code for three (3) different types of platforms:
 
@@ -36,17 +36,16 @@ The OpenCV build system supports native compilation for all the supported platfo
 At the present time, this document focuses only on native compilation.
 
 Getting the Source Code {#tutorial_building_tegra_cuda_getting_the_code}
-=======================
+-----------------------
 
 There are two (2) ways to get the OpenCV source code:
 
-* Direct download from the [OpenCV downloads](http://opencv.org/releases.html) page
+* Direct download from the [OpenCV downloads](https://opencv.org/releases) page
 * Cloning the git repositories hosted on [GitHub](https://github.com/opencv)
 
 For this guide, the focus is on using the git repositories. This is because the 3.1.0 version of OpenCV will not build with CUDA 8.0 without applying a few small upstream changes from the git repository.
 
-OpenCV
-------
+### OpenCV
 
 Start with the `opencv` repository:
 
@@ -93,8 +92,7 @@ You should see output similar to:
 
 At this point, the `opencv` repository is ready for building.
 
-OpenCV Extra
-------------
+### OpenCV Extra
 
 The `opencv_extra` repository contains extra data for the OpenCV library, including the data files used by the tests and demos. It must be cloned separately:
 
@@ -111,12 +109,11 @@ You may opt to not fetch this repository if you do not plan on running the tests
 __Note:__ If you plan to run the tests, some tests expect the data to be present and will fail without it.
 
 Preparation and Prerequisites {#tutorial_building_tegra_cuda_preparation}
-=============================
+-----------------------------
 
 To build OpenCV, you need a directory to create the configuration and build the libraries. You also need a number of 3rd-party libraries upon which OpenCV depends.
 
-Prerequisites for Ubuntu Linux
-------------------------------
+### Prerequisites for Ubuntu Linux
 
 These are the basic requirements for building OpenCV for Tegra on Linux:
 
@@ -186,8 +183,7 @@ The commands that will do this:
 
 Once all the necessary packages are installed, you can configure the build.
 
-Preparing the Build Area
-------------------------
+### Preparing the Build Area
 
 Software projects that use the CMake system for configuring their builds expect the actual builds to be done outside of the source tree itself. For configuring and building OpenCV, create a directory called "build" in the same base directory into which you cloned the git repositories:
 
@@ -197,7 +193,7 @@ Software projects that use the CMake system for configuring their builds expect
 You are now ready to configure and build OpenCV.
 
 Configuring OpenCV for Building {#tutorial_building_tegra_cuda_configuring}
-===============================
+-------------------------------
 
 The CMake configuration options given below for the different platforms are targeted towards the functionality needed for Tegra. They are based on the original configuration options used for building OpenCV 2.4.13.
 
@@ -209,8 +205,7 @@ For the Linux-based platforms, the shown value for the `CMAKE_INSTALL_PREFIX` pa
 
 In each of the `cmake` invocations below, the last parameter, `OPENCV_TEST_DATA_PATH`, tells the build system where to find the test-data that is provided by the `opencv_extra` repository. When this is included, a `make install` installs this test-data alongside the libraries and example code, and a `make test` automatically provides this path to the tests that have to load data from it. If you did not clone the `opencv_extra` repository, do not include this parameter.
 
-Vibrante V4L Configuration
---------------------------
+### Vibrante V4L Configuration
 
 Supported platform: Drive PX 2
 
@@ -251,8 +246,7 @@ The configuration provided above builds the Python bindings for Python 2 (but no
 
     -DBUILD_opencv_python2=OFF
 
-Jetson L4T Configuration
-------------------------
+### Jetson L4T Configuration
 
 Supported platforms:
 
@@ -261,7 +255,7 @@ Supported platforms:
 
 Configuration is slightly different for the Jetson TK1 and the Jetson TX1 systems.
 
-### Jetson TK1
+#### Jetson TK1
 
     $ cmake \
         -DCMAKE_BUILD_TYPE=Release \
@@ -299,7 +293,7 @@ Configuration is slightly different for the Jetson TK1 and the Jetson TX1 system
 
 __Note:__ This uses CUDA 6.5, not 8.0.
 
-### Jetson TX1
+#### Jetson TX1
 
     $ cmake \
         -DCMAKE_BUILD_TYPE=Release \
@@ -336,8 +330,7 @@ __Note:__ This uses CUDA 6.5, not 8.0.
 
 __Note:__ This configuration does not set the `ENABLE_NEON` parameter.
 
-Ubuntu Desktop Linux Configuration
-----------------------------------
+### Ubuntu Desktop Linux Configuration
 
 Supported platforms:
 
@@ -383,12 +376,11 @@ This configuration is nearly identical to that for V4L and L4T, except that the
 As with previous examples, the configuration given above builds the Python bindings for Python 2 (but not Python 3) as part of the build process.
 
 Building OpenCV {#tutorial_building_tegra_cuda_building}
-===============
+---------------
 
 Once `cmake` finishes configuring OpenCV, building is done using the standard `make` utility.
 
-Building with `make`
---------------------
+### Building with `make`
 
 The only parameter that is needed for the invocation of `make` is the `-j` parameter for specifying how many parallel threads to use. This varies depending on the system and how much memory is available, other running processes, etc. The following table offers suggested values for this parameter:
 
@@ -408,12 +400,11 @@ By default, CMake hides the details of the build steps. If you need to see more
     $ make -j6 VERBOSE=1
 
 Testing OpenCV {#tutorial_building_tegra_cuda_testing}
-==============
+--------------
 
 Once the build completes successfully, you have the option of running the extensive set of tests that OpenCV provides. If you did not clone the `opencv_extra` repository and specify the path to `testdata` in the `cmake` invocation, then testing is not recommended.
 
-Testing under Linux
--------------------
+### Testing under Linux
 
 To run the basic tests under Linux, execute:
 
@@ -425,7 +416,7 @@ This executes `ctest` to carry out the tests, as specified in CTest syntax withi
 
 In this example, there are two (2) arguments passed to `ctest`: `--verbose` and `--parallel 3`. The first argument causes the output from `ctest` to be more detailed, and the second causes `ctest` to run as many as three (3) tests in parallel. As with choosing a thread count for building, base any choice for testing on the available number of processor cores, physical memory, etc. Some of the tests do attempt to allocate significant amounts of memory.
 
-### Known Issues with Tests
+#### Known Issues with Tests
 
 At present, not all of the tests in the OpenCV test suite pass. There are tests that fail whether or not CUDA is compiled, and there are tests that are only specific to CUDA that also do not currently pass.
 
@@ -434,7 +425,7 @@ __Note:__ There are no tests that pass without CUDA but fail only when CUDA is i
 As the full lists of failing tests vary based on platform, it is impractical to list them here.
 
 Installing OpenCV {#tutorial_building_tegra_cuda_installing}
-=================
+-----------------
 
 Installing OpenCV is very straightforward. For the Linux-based platforms, the command is:
 
@@ -443,14 +434,13 @@ Installing OpenCV is very straightforward. For the Linux-based platforms, the co
 Depending on the chosen installation location, you may need root privilege to install.
 
 Building OpenCV 2.4.X {#tutorial_building_tegra_cuda_opencv_24X}
-=====================
+---------------------
 
 If you wish to build your own version of the 2.4 version of OpenCV, there are only a few adjustments that must be made. At the time of this writing, the latest version on the 2.4 tree is 2.4.13. These instructions may work for later versions of 2.4, though they have not been tested for any earlier versions.
 
 __Note:__ The 2.4.X OpenCV source does not have the extra modules and code for Tegra that was upstreamed into the 3.X versions of OpenCV. This part of the guide is only for cases where you want to build a vanilla version of OpenCV 2.4.
 
-Selecting the 2.4 Source
-------------------------
+### Selecting the 2.4 Source
 
 First you must select the correct source branch or tag. If you want a specific version such as 2.4.13, you want to make a local branch based on the tag, as was done with the 3.1.0 tag above:
 
@@ -466,14 +456,13 @@ If you simply want the newest code from the 2.4 line of OpenCV, there is a `2.4`
 
 There is no need for the `git cherry-pick` commands used with 3.1.0 when building the 2.4.13 source.
 
-Configuring
------------
+### Configuring
 
 Configuring is done with CMake as before. The primary difference is that OpenCV 2.4 only provides Python bindings for Python 2, and thus does not distinguish between Python 2 and Python 3 in the CMake parameters. There is only one parameter, `BUILD_opencv_python`. In addition, there is a build-related parameter that controls features in 2.4 that are not in 3.1.0. This parameter is `BUILD_opencv_nonfree`.
 
 Configuration still takes place in a separate directory that must be a sibling to the `opencv` and `opencv_extra` directories.
 
-### Configuring Vibrante V4L
+#### Configuring Vibrante V4L
 
 For DRIVE PX 2:
 
@@ -510,7 +499,7 @@ For DRIVE PX 2:
         -DOPENCV_TEST_DATA_PATH=../opencv_extra/testdata \
         ../opencv
 
-### Configuring Jetson L4T
+#### Configuring Jetson L4T
 
 For Jetson TK1:
 
@@ -582,7 +571,7 @@ For Jetson TX1:
         -DOPENCV_TEST_DATA_PATH=../opencv_extra/testdata \
         ../opencv
 
-### Configuring Desktop Ubuntu Linux
+#### Configuring Desktop Ubuntu Linux
 
 For both 14.04 LTS and 16.04 LTS:
 
@@ -618,13 +607,12 @@ For both 14.04 LTS and 16.04 LTS:
         -DOPENCV_TEST_DATA_PATH=../opencv_extra/testdata \
         ../opencv
 
-Building, Testing and Installing
---------------------------------
+### Building, Testing and Installing
 
 Once configured, the steps of building, testing, and installing are the same as above for the 3.1.0 source.
 
 CMake Parameter Reference {#tutorial_building_tegra_cuda_parameter_reference}
-=========================
+-------------------------
 
 The following is a table of all the parameters passed to CMake in the recommended invocations above. Some of these are parameters from CMake itself, while most are specific to OpenCV.
 
diff --git a/doc/tutorials/introduction/clojure_dev_intro/clojure_dev_intro.markdown b/doc/tutorials/introduction/clojure_dev_intro/clojure_dev_intro.markdown
index 98ba16b7feec..cbf52bbdc3b1 100644
--- a/doc/tutorials/introduction/clojure_dev_intro/clojure_dev_intro.markdown
+++ b/doc/tutorials/introduction/clojure_dev_intro/clojure_dev_intro.markdown
@@ -35,11 +35,11 @@ issue the following command to run the sample from the command line.
 cd path/to/samples/java/clojure/simple-sample
 lein run
 @endcode
+
 Preamble
 --------
 
-For detailed instruction on installing OpenCV with desktop Java support refer to the @ref tutorial_java_dev_intro "corresponding
-tutorial".
+For detailed instruction on installing OpenCV with desktop Java support refer to the @ref tutorial_java_dev_intro "corresponding tutorial".
 
 If you are in hurry, here is a minimum quick start guide to install OpenCV on Mac OS X:
 
@@ -63,6 +63,7 @@ make -j8
 # optional
 # make install
 @endcode
+
 Install Leiningen
 -----------------
 
@@ -170,6 +171,7 @@ i386     -> x86
 arm      -> arm
 sparc    -> sparc
 @endcode
+
 ### Package the native lib as a jar
 
 Next you need to package the native lib in a jar file by using the jar command to create a new jar
@@ -193,6 +195,7 @@ tree
 
 3 directories, 3 files
 @endcode
+
 ### Locally install the jars
 
 We are now ready to add the two jars as artifacts to the local maven repository with the help of the
@@ -402,6 +405,7 @@ Let's now try to port to Clojure the @ref tutorial_java_dev_intro "OpenCV Java t
 Instead of writing it in a source file we're going to evaluate it at the REPL.
 
 Following is the original Java source code of the cited sample.
+
 @code{.java}
 import org.opencv.core.Mat;
 import org.opencv.core.CvType;
@@ -430,20 +434,25 @@ Before start coding, we'd like to eliminate the boring need of interactively loa
 opencv lib any time we start a new REPL to interact with it.
 
 First, stop the REPL by evaluating the (exit) expression at the REPL prompt.
+
 @code{.clojure}
 user=> (exit)
 Bye for now!
 @endcode
+
 Then open your project.clj file and edit it as follows:
+
 @code{.clojure}
 (defproject simple-sample "0.1.0-SNAPSHOT"
   ...
 injections [(clojure.lang.RT/loadLibrary org.opencv.core.Core/NATIVE_LIBRARY_NAME)])
 @endcode
+
 Here we're saying to load the opencv native lib anytime we run the REPL in such a way that we have
 not anymore to remember to manually do it.
 
 Rerun the lein repl task
+
 @code{.bash}
 lein repl
 nREPL server started on port 51645 on host 127.0.0.1
@@ -458,11 +467,14 @@ Clojure 1.5.1
 
 user=>
 @endcode
+
 Import the interested OpenCV java interfaces.
+
 @code{.clojure}
 user=> (import '[org.opencv.core Mat CvType Scalar])
 org.opencv.core.Scalar
 @endcode
+
 We're going to mimic almost verbatim the original OpenCV java tutorial to:
 
 -   create a 5x10 matrix with all its elements initialized to 0
diff --git a/doc/tutorials/introduction/config_reference/config_reference.markdown b/doc/tutorials/introduction/config_reference/config_reference.markdown
index 81607b008682..7ced9a2536b1 100644
--- a/doc/tutorials/introduction/config_reference/config_reference.markdown
+++ b/doc/tutorials/introduction/config_reference/config_reference.markdown
@@ -2,7 +2,7 @@ OpenCV configuration options reference {#tutorial_config_reference}
 ======================================
 
 @prev_tutorial{tutorial_general_install}
-@next_tutorial{tutorial_linux_install}
+@next_tutorial{tutorial_env_reference}
 
 @tableofcontents
 
@@ -224,6 +224,16 @@ Following options can be used to produce special builds with instrumentation or
 @see [Link time optimization](https://gcc.gnu.org/wiki/LinkTimeOptimization)
 @see [ThinLTO](https://clang.llvm.org/docs/ThinLTO.html)
 
+## Enable IPP optimization
+
+Following options can be used to enables IPP optimizations for each functions but increases the size of the opencv library. All options are disabled by default.
+
+| Option | Functions | + roughly size |
+| -------| --------- | -------------- |
+| `OPENCV_IPP_GAUSSIAN_BLUR` | GaussianBlur() | +8Mb |
+| `OPENCV_IPP_MEAN` | mean() / meanStdDev() | +0.2Mb |
+| `OPENCV_IPP_MINMAX` | minMaxLoc() / minMaxIdx() | +0.2Mb |
+| `OPENCV_IPP_SUM` | sum() | +0.1Mb |
 
 # Functional features and dependencies {#tutorial_config_reference_func}
 
@@ -484,7 +494,6 @@ OpenCV have own DNN inference module which have own build-in engine, but can als
 | `OPENCV_DNN_CUDA` | _OFF_ | Enable CUDA backend. [CUDA](https://en.wikipedia.org/wiki/CUDA), CUBLAS and [CUDNN](https://developer.nvidia.com/cudnn) must be installed. |
 | `WITH_HALIDE` | _OFF_ | Use experimental [Halide](https://en.wikipedia.org/wiki/Halide_(programming_language)) backend which can generate optimized code for dnn-layers at runtime. Halide must be installed. |
 | `WITH_VULKAN` | _OFF_ | Enable experimental [Vulkan](https://en.wikipedia.org/wiki/Vulkan_(API)) backend. Does not require additional dependencies, but can use external Vulkan headers (`VULKAN_INCLUDE_DIRS`). |
-| `WITH_TENGINE` | _OFF_ | Enable experimental [Tengine](https://github.com/OAID/Tengine) backend for ARM CPUs. Tengine library must be installed. |
 
 
 # Installation layout {#tutorial_config_reference_install}
@@ -566,6 +575,7 @@ Following options can be used to change installation layout for common scenarios
 | ------ | ------- | ----------- |
 | `OPENCV_ENABLE_NONFREE` | _OFF_ | Some algorithms included in the library are known to be protected by patents and are disabled by default. |
 | `OPENCV_FORCE_3RDPARTY_BUILD`| _OFF_ | Enable all `BUILD_` options at once. |
+| `OPENCV_IPP_ENABLE_ALL`| _OFF_ | Enable all `OPENCV_IPP_` options at once. |
 | `ENABLE_CCACHE` | _ON_ (on Unix-like platforms) | Enable [ccache](https://en.wikipedia.org/wiki/Ccache) auto-detection. This tool wraps compiler calls and caches results, can significantly improve re-compilation time. |
 | `ENABLE_PRECOMPILED_HEADERS` | _ON_ (for MSVC) | Enable precompiled headers support. Improves build time. |
 | `BUILD_DOCS` | _OFF_ | Enable documentation build (_doxygen_, _doxygen_cpp_, _doxygen_python_, _doxygen_javadoc_ targets). [Doxygen](http://www.doxygen.org/index.html) must be installed for C++ documentation build. Python and [BeautifulSoup4](https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)) must be installed for Python documentation build. Javadoc and Ant must be installed for Java documentation build (part of Java SDK). |
@@ -575,6 +585,7 @@ Following options can be used to change installation layout for common scenarios
 | `BUILD_FAT_JAVA_LIB` | _ON_ (for static Android builds) | Build single _opencv_java_ dynamic library containing all library functionality bundled with Java bindings. |
 | `BUILD_opencv_python2` | _ON_ | Build python2 bindings (deprecated). Python with development files and numpy must be installed. |
 | `BUILD_opencv_python3` | _ON_ | Build python3 bindings. Python with development files and numpy must be installed. |
+| `CAROTENE_NEON_ARCH` | '(auto)' | Switch NEON Arch for Carotene. If it sets nothing, it will be auto-detected. If it sets 8, ARMv8(and later) is used. Otherwise, ARMv7 is used. |
 
 TODO: need separate tutorials covering bindings builds
 
@@ -612,6 +623,7 @@ Following build options are utilized in `opencv_contrib` modules, as stated [pre
 `CMAKE_TOOLCHAIN_FILE`
 
 `WITH_CAROTENE`
+`WITH_KLEIDICV`
 `WITH_CPUFEATURES`
 `WITH_EIGEN`
 `WITH_OPENVX`
diff --git a/doc/tutorials/introduction/cross_referencing/tutorial_cross_referencing.markdown b/doc/tutorials/introduction/cross_referencing/tutorial_cross_referencing.markdown
index ec605c8e45d3..6d43761e1456 100644
--- a/doc/tutorials/introduction/cross_referencing/tutorial_cross_referencing.markdown
+++ b/doc/tutorials/introduction/cross_referencing/tutorial_cross_referencing.markdown
@@ -46,14 +46,14 @@ Open your Doxyfile using your favorite text editor and search for the key
 `TAGFILES`. Change it as follows:
 
 @code
-TAGFILES = ./docs/doxygen-tags/opencv.tag=http://docs.opencv.org/4.8.0
+TAGFILES = ./docs/doxygen-tags/opencv.tag=http://docs.opencv.org/4.10.0
 @endcode
 
 If you had other definitions already, you can append the line using a `\`:
 
 @code
 TAGFILES = ./docs/doxygen-tags/libstdc++.tag=https://gcc.gnu.org/onlinedocs/libstdc++/latest-doxygen \
-           ./docs/doxygen-tags/opencv.tag=http://docs.opencv.org/4.8.0
+           ./docs/doxygen-tags/opencv.tag=http://docs.opencv.org/4.10.0
 @endcode
 
 Doxygen can now use the information from the tag file to link to the OpenCV
diff --git a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.markdown b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.markdown
index 058b5c92f274..33704d5bcaa2 100644
--- a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.markdown
+++ b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.markdown
@@ -2,7 +2,7 @@ Cross compilation for ARM based Linux systems {#tutorial_arm_crosscompile_with_c
 =============================================
 
 @prev_tutorial{tutorial_macos_install}
-@next_tutorial{tutorial_building_tegra_cuda}
+@next_tutorial{tutorial_crosscompile_with_multiarch}
 
 |    |    |
 | -: | :- |
diff --git a/doc/tutorials/introduction/crosscompilation/crosscompile_with_multiarch.markdown b/doc/tutorials/introduction/crosscompilation/crosscompile_with_multiarch.markdown
new file mode 100644
index 000000000000..2ae3e24e3821
--- /dev/null
+++ b/doc/tutorials/introduction/crosscompilation/crosscompile_with_multiarch.markdown
@@ -0,0 +1,595 @@
+# MultiArch cross-compilation with Ubuntu/Debian{#tutorial_crosscompile_with_multiarch}
+
+@prev_tutorial{tutorial_arm_crosscompile_with_cmake}
+@next_tutorial{tutorial_building_tegra_cuda}
+
+[TOC]
+
+|    |    |
+| -: | :- |
+| Original author | Kumataro |
+| Compatibility   | Ubuntu >=23.04 |
+|^                | OpenCV >=4.8.0 |
+
+@warning
+This tutorial may contain obsolete information.
+
+## What is "MultiArch"
+
+OpenCV may use a lot of 3rdparty libraries for video and image decoding, rendering, acceleration
+and complex math algorithms. The 3rd party components are found by CMake on the build host
+cross-compilation allows to build OpenCV for foreign architecture or OS, but we loose that large
+world of components and have to cross-compile each dependency separately and point to it during
+OpenCV build.
+
+Debian/Ubuntu MultiArch helps to fix this. It allows to install several foreign architecture
+libraries on host system and use them during OpenCV dependencies resolution.
+
+@warning
+- Following these steps will make your Linux environment a little dirty.
+  If possible, it is better to use VMs or Container(e.g. Docker).
+- This tutorial expects host and target uses same Ubuntu version.
+   Do not use/mix different versions for external library dependency.
+  - Good: Host and Target are 23.04.
+  - Good: Host and Target are 23.10.
+  - Not Good: Host is 23.04, and Target is 23.10.
+  - Not Good: Host is 23.10, and Target is 23.04.
+- This tutorial may be used for Debian and its derivatives like Raspberry Pi OS. Please make any
+necessary changes.
+
+## Download tools
+
+Install necessary tools and toolchains for cross-compilation.
+
+- git, cmake, pkgconf and build-essential are required basically.
+- ninja-build is to reduce compilation time(option).
+- crossbuild-essential-armhf is toolchain package for armv7 target.
+- crossbuild-essential-arm64 is toolchain package for aarch64 target.
+
+@code{.bash}
+sudo apt update -y
+sudo apt install -y \
+    git \
+    cmake \
+    pkgconf \
+    build-essential \
+    ninja-build \
+    crossbuild-essential-armhf \
+    crossbuild-essential-arm64
+@endcode
+
+If you want to enable Python 3 wrapper, install these packages too.
+
+@code{.bash}
+sudo apt install -y \
+    python3-minimal \
+    python3-numpy
+@endcode
+
+## Working folder structure
+
+In this tutorial, following working folder structure are used.
+
+@code{.unparsed}
+/home
+  + kmtr                    - please replace your account name.
+    + work
+      + opencv              - source, cloned from github
+      + opencv_contrib      - source, cloned from github
+      + build4-full_arm64   - artifact(for aarch64 target), created by cmake
+      + build4-full_armhf   - artifact(for armhf target), created by cmake
+@endcode
+
+1. Create working folder under your home directory.
+2. Clone OpenCV and OpenCV Contrib from repository to work directory.
+
+@code{.bash}
+cd ~
+mkdir work
+cd work
+git clone --depth=1 https://github.com/opencv/opencv.git
+git clone --depth=1 https://github.com/opencv/opencv_contrib.git
+@endcode
+
+## Update apt and dpkg settings
+
+These steps are on host.
+
+`apt` and `dpkg` are package management systems used in Ubuntu and Debian.
+
+Following are setup steps to use MultiArch.
+
+### Step 1. Add apt source for arm64 and armhf
+
+Execute `sudo apt edit-sources` to add foreign arch libraries at end of file.
+
+Example 1: arm64 and armv7 for Ubuntu 23.04
+
+@code{.unparsed}
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports lunar main restricted
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports lunar-updates main restricted
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports lunar universe
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports lunar-updates universe
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports lunar multiverse
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports lunar-updates multiverse
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports lunar-backports main restricted universe multiverse
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports lunar-security main restricted
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports lunar-security universe
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports lunar-security multiverse
+@endcode
+
+Example 2: arm64 and armv7 for Ubuntu 23.10
+
+@code{.unparsed}
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports mantic main restricted
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports mantic-updates main restricted
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports mantic universe
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports mantic-updates universe
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports mantic multiverse
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports mantic-updates multiverse
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports mantic-backports main restricted universe multiverse
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports mantic-security main restricted
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports mantic-security universe
+deb [arch=arm64,armhf] http://ports.ubuntu.com/ubuntu-ports mantic-security multiverse
+@endcode
+
+### Step 2. Update apt database
+
+Update apt database to apply new apt sources.
+
+Execute `sudo apt update`.
+
+@code{.bash}
+sudo apt update
+@endcode
+
+### Step 3. Update dpkg settings
+
+Update dpkg settings to support foreign architectures.
+
+Execute `sudo dpkg --add-architecture arm64` and/or `sudo dpkg --add-architecture armhf`.
+
+@code{.bash}
+sudo dpkg --add-architecture arm64
+sudo dpkg --add-architecture armhf
+@endcode
+
+`sudo dpkg --print-architecture` shows what is host architecture.
+
+@code{.bash}
+sudo dpkg --print-architecture
+amd64
+@endcode
+
+And `sudo dpkg --print-foreign-architectures` shows what foreign architectures are supported.
+
+@code{.bash}
+sudo dpkg --print-foreign-architectures
+arm64
+armhf
+@endcode
+
+### Confirm working pkg-config
+
+With MultiArch, several shared libraries and pkg-config information for each architectures are stored into /usr/lib.
+
+@code{.unparsed}
+/usr
+  + lib
+    + aarch64-linux-gnu   - shared libraries for arm64
+      + pkgconfig         - pkg-config files for arm64 libraries
+    + arm-linux-gnueabihf - shared libraries for armhf
+      + pkgconfig         - pkg-config files for armhf libraries
+  + share
+    + pkgconfig         - pkg-config files(for header files)
+@endcode
+
+Confirm to work `pkg-config` using `PKG_CONFIG_PATH`, `PKG_CONFIG_LIBDIR` and `PKG_CONFIG_SYSROOT_DIR` options.
+
+for aarch64:
+
+@code{.bash}
+PKG_CONFIG_PATH=/usr/lib/aarch64-linux-gnu/pkgconfig:/usr/share/pkgconfig \
+    PKG_CONFIG_LIBDIR=/usr/lib/aarch64-linux-gnu \
+    PKG_CONFIG_SYSROOT_DIR=/ \
+      pkg-config --list-all
+@endcode
+
+for armv7:
+
+@code{.bash}
+PKG_CONFIG_PATH=/usr/lib/arm-linux-gnueabihf/pkgconfig:/usr/share/pkgconfig \
+  PKG_CONFIG_LIBDIR=/usr/lib/arm-linux-gnueabihf \
+  PKG_CONFIG_SYSROOT_DIR=/ \
+      pkg-config --list-all
+@endcode
+
+## Cross-compile for aarch64
+
+Following is to compile for target (aarch64) at host (x86-64).
+
+### Step 1. Install external libraries for target into host
+
+This step is on host.
+
+Install libfreetype-dev, libharfbuzz-dev and FFmpeg packages for target (arm64) into host (x86-64).
+
+@code{.bash}
+sudo apt install -y \
+    libavcodec-dev:arm64 \
+    libavformat-dev:arm64 \
+    libavutil-dev:arm64 \
+    libswscale-dev:arm64 \
+    libfreetype-dev:arm64 \
+    libharfbuzz-dev:arm64
+@endcode
+
+If you want to enable Python 3 wrapper, install these packages too.
+
+@code{.bash}
+sudo apt install -y \
+    libpython3-dev:arm64
+@endcode
+
+If succeed, pkg-config can show information about these packages.
+
+For Freetype2 and Harfbuzz:
+
+@code{.bash}
+PKG_CONFIG_PATH=/usr/lib/aarch64-linux-gnu/pkgconfig:/usr/share/pkgconfig \
+    PKG_CONFIG_LIBDIR=/usr/lib/aarch64-linux-gnu \
+    PKG_CONFIG_SYSROOT_DIR=/ \
+       pkg-config freetype2 harfbuzz --cflags --libs
+-I/usr/include/freetype2 -I/usr/include/libpng16 -I/usr/include/harfbuzz -I/usr/include/glib-2.0 -I/usr/lib/aarch64-linux-gnu/glib-2.0/include -L/usr/lib/aarch64-linux-gnu -lfreetype -lharfbuzz
+@endcode
+
+For FFmpeg:
+
+@code{.bash}
+PKG_CONFIG_PATH=/usr/lib/aarch64-linux-gnu/pkgconfig:/usr/share/pkgconfig \
+    PKG_CONFIG_LIBDIR=/usr/lib/aarch64-linux-gnu \
+    PKG_CONFIG_SYSROOT_DIR=/ \
+       pkg-config libavcodec libavformat libavutil libswscale --cflags --libs
+-I/usr/include/aarch64-linux-gnu -L/usr/lib/aarch64-linux-gnu -lavcodec -lavformat -lavutil -lswscale
+@endcode
+
+### Step 2. Configure OpenCV Settings
+This step is on host.
+
+Execute `cmake` to make cross-compile configuration for aarch64.
+
+@note `-DCMAKE_TOOLCHAIN_FILE` should be absolute/real file path, not relative path.
+
+@code{.bash}
+PKG_CONFIG_PATH=/usr/lib/aarch64-linux-gnu/pkgconfig:/usr/share/pkgconfig \
+    PKG_CONFIG_LIBDIR=/usr/lib/aarch64-linux-gnu \
+    PKG_CONFIG_SYSROOT_DIR=/ \
+        cmake -S opencv \
+              -B build4-full_arm64 \
+              -DCMAKE_TOOLCHAIN_FILE=/home/kmtr/work/opencv/platforms/linux/aarch64-gnu.toolchain.cmake \
+              -DOPENCV_EXTRA_MODULES_PATH=opencv_contrib/modules \
+              -GNinja
+@endcode
+
+If you want to enable Python 3 wrapper, extra options are needed.
+
+@code{.bash}
+PYTHON3_REALPATH=`realpath /usr/bin/python3`
+PYTHON3_BASENAME=`basename ${PYTHON3_REALPATH}`
+PKG_CONFIG_PATH=/usr/lib/aarch64-linux-gnu/pkgconfig:/usr/share/pkgconfig \
+    PKG_CONFIG_LIBDIR=/usr/lib/aarch64-linux-gnu \
+    PKG_CONFIG_SYSROOT_DIR=/ \
+        cmake -S opencv \
+              -B build4-full_arm64 \
+              -DCMAKE_TOOLCHAIN_FILE=/home/kmtr/work/opencv/platforms/linux/aarch64-gnu.toolchain.cmake \
+              -DOPENCV_EXTRA_MODULES_PATH=opencv_contrib/modules \
+              -DPYTHON3_NUMPY_INCLUDE_DIRS="/usr/local/lib/${PYTHON3_BASENAME}/dist-packages/numpy/core/include/" \
+              -DPYTHON3_INCLUDE_PATH="/usr/include/${PYTHON3_BASENAME};/usr/include/" \
+              -DPYTHON3_LIBRARIES=`find /usr/lib/aarch64-linux-gnu/ -name libpython*.so` \
+              -DPYTHON3_EXECUTABLE="/usr/bin/${PYTHON3_BASENAME}" \
+              -DPYTHON3_CVPY_SUFFIX=".so" \
+              -GNinja
+@endcode
+
+@note
+@parblock
+Lastly, "python3.XX" string is needed. So this script generate it.
+- Get real path from "/usr/bin/python3" to "/usr/bin/python3.xx".
+- Get base name from "/usr/bin/python3.xx" to "pyhton3.xx".
+@endparblock
+
+Following is cmake outputs.
+- `Host` is `Linux x86_64`.
+- `Target` is `Linux aarch64`.
+- FFmpeg is available.
+
+@code{.unparsed}
+-- General configuration for OpenCV 4.8.0-dev =====================================
+--   Version control:               408730b
+--
+--   Extra modules:
+--     Location (extra):            /home/kmtr/work/opencv_contrib/modules
+--     Version control (extra):     faa5468
+--
+--   Platform:
+--     Timestamp:                   2023-12-01T22:02:14Z
+--     Host:                        Linux 6.5.0-13-generic x86_64
+--     Target:                      Linux 1 aarch64
+--     CMake:                       3.27.4
+--     CMake generator:             Ninja
+--     CMake build tool:            /usr/bin/ninja
+--     Configuration:               Release
+--
+--   CPU/HW features:
+--     Baseline:                    NEON FP16
+--       required:                  NEON
+--       disabled:                  VFPV3
+--     Dispatched code generation:  NEON_DOTPROD NEON_FP16 NEON_BF16
+--       requested:                 NEON_FP16 NEON_BF16 NEON_DOTPROD
+--       NEON_DOTPROD (1 files):    + NEON_DOTPROD
+--       NEON_FP16 (2 files):       + NEON_FP16
+--       NEON_BF16 (0 files):       + NEON_BF16
+--
+--   C/C++:
+--     Built as dynamic libs?:      YES
+--     C++ standard:                11
+--     C++ Compiler:                /usr/bin/aarch64-linux-gnu-g++  (ver 13.2.0)
+
+:
+:
+
+--
+--   Video I/O:
+--     DC1394:                      NO
+--     FFMPEG:                      YES
+--       avcodec:                   YES (60.3.100)
+--       avformat:                  YES (60.3.100)
+--       avutil:                    YES (58.2.100)
+--       swscale:                   YES (7.1.100)
+--       avresample:                NO
+--     GStreamer:                   NO
+--     v4l/v4l2:                    YES (linux/videodev2.h)
+--
+@endcode
+
+If enabling Python 3 wrapper is succeeded, `Python 3:` section shows more.
+
+@code{.unparsed}
+--
+--   Python 3:
+--     Interpreter:                 /usr/bin/python3.11 (ver 3.11.6)
+--     Libraries:                   /usr/lib/aarch64-linux-gnu/libpython3.11.so
+--     numpy:                       /usr/local/lib/python3.11/dist-packages/numpy/core/include/ (ver undefined - cannot be probed because of the cross-compilation)
+--     install path:                lib/python3.11/dist-packages/cv2/python-3.11
+--
+--   Python (for build):            /usr/bin/python3.11
+--
+@endcode
+
+### Step 3. Build and archive OpenCV libraries and headers
+
+This step in in host.
+
+Build and install.
+(This `install` means only that copying artifacts to `install` folder.)
+
+@code{.bash}
+     cmake --build   build4-full_arm64
+sudo cmake --install build4-full_arm64
+@endcode
+
+Archive artifacts(built libraries and headers) to `opencv_arm64.tgz` with tar command.
+
+@code{.bash}
+tar czvf opencv_arm64.tgz -C build4-full_arm64/install .
+@endcode
+
+And send `opencv_arm64.tgz` to target.
+
+### Step 4. Install dependency libraries at target
+
+This step is executed on the target system.
+
+Install dependency run-time libraries for OpenCV/OpenCV contrib libraries at target.
+
+@code{.bash}
+sudo apt install -y \
+    libavcodec60 \
+    libavformat60 \
+    libavutil58 \
+    libswscale7 \
+    libfreetype6 \
+    libharfbuzz0b
+
+sudo ldconfig
+@endcode
+
+If you want to enable Python 3 wrapper, install these packages too.
+
+@code{.bash}
+sudo apt install -y \
+    python3-minimal \
+    python3-numpy
+@endcode
+
+@warning
+@parblock
+If version of runtime libraries and/or programs are incremented, apt package names may be changed
+(e.g. `libswscale6` is used for Ubuntu 23.04, but `libswscale7` is used for Ubuntu 23.10).
+Looking for it with `apt search` command or https://packages.ubuntu.com/ .
+@endparblock
+
+@warning
+@parblock
+External library version between host and target should be same.
+Please update to the latest version libraries at the same time as possible.
+
+Even if the OS versions are the same between the Host and Target,
+the versions may differ due to additional updates to the libraries.
+This will cause unexpected problems.
+
+For example)
+- On Host, OpenCV has been build with external libA (v1.0) for target.
+- libA (v1.1) may be updated.
+- On Target, libA (v1.1) is installed to use OpenCV.
+- In this case, versions of libA is difference between compiling and running.
+@endparblock
+
+@warning
+@parblock
+If you forget/mismatch to install some necessary libraries, OpenCV will not works well.
+
+`ldd` command can detect dependency. If there are any "not found", please install necessary libraries.
+
+@code{.bash}
+ldd /usr/local/lib/libopencv_freetype.so
+@endcode
+
+(Not Good) `freetype module` requires `libharfbuzz.so.0`, but it has not been installed.
+@code{.unparsed}
+        linux-vdso.so.1 (0xABCDEFG01234567)
+        libopencv_imgproc.so.408 => /usr/local/lib/libopencv_imgproc.so.408 (0xABCDEF001234567)
+        libfreetype.so.6 => /lib/aarch64-linux-gnu/libfreetype.so.6 (0xABCDEF001234567)
+        libharfbuzz.so.0 => not found
+        libopencv_core.so.408 => /usr/local/lib/libopencv_core.so.408 (0xABCDEF001234567)
+        :
+@endcode
+
+(Good) All libraries which are required from `freetype modules` are installed.
+@code{.unparsed}
+        linux-vdso.so.1 (0xABCDEFG01234567)
+        libopencv_imgproc.so.408 => /usr/local/lib/libopencv_imgproc.so.408 (0xABCDEF001234567)
+        libfreetype.so.6 => /lib/aarch64-linux-gnu/libfreetype.so.6 (0xABCDEF001234567)
+        libharfbuzz.so.0 => /lib/aarch64-linux-gnu/libharfbuzz.so.0 (0xABCDEF001234567)
+        libopencv_core.so.408 => /usr/local/lib/libopencv_core.so.408 (0xABCDEF001234567)
+        :
+@endcode
+@endparblock
+
+### Step 5. Install OpenCV libraries to target
+
+This step is on target.
+
+Receive `opencv_arm64.tgz` from host (generated at Step3), and extract to `/usr/local`.
+
+@code{.bash}
+sudo tar zxvf opencv_arm64.tgz -C /usr/local
+sudo ldconfig
+@endcode
+
+You can use OpenCV libraries same as self-compiling. Following is OpenCV sample code. Compile and
+run it on target.
+
+Makefile
+@code{.make}
+a.out : main.cpp
+    g++ main.cpp -o a.out \
+        -I/usr/local/include/opencv4 \
+        -lopencv_core
+@endcode
+
+main.cpp
+@code{.cpp}
+#include <iostream>
+#include <opencv2/core.hpp>
+int main(void)
+{
+  std::cout << cv::getBuildInformation() << std::endl;
+  return 0;
+}
+@endcode
+
+Execute `make` and run it.
+@code{.bash}
+make a.out
+./a.out
+@endcode
+
+If you want to enable Python 3 wrapper, execute following command to confirm.
+
+@code{.bash}
+python3 -c "import cv2; print(cv2.getBuildInformation())"
+@endcode
+
+## Cross-compile for armv7
+
+Following is to compile for target (armhf) at host (x86-64).
+
+- To resolve dependencies, `linux-libc-dev:armhf` is required.
+- To optimize with neon, `-DENABLE_NEON=ON` is needed.
+
+@code{.bash}
+sudo apt install -y \
+    linux-libc-dev:armhf \
+    libavcodec-dev:armhf \
+    libavformat-dev:armhf \
+    libavutil-dev:armhf \
+    libswscale-dev:armhf \
+    libfreetype-dev:armhf \
+    libharfbuzz-dev:armhf
+
+PKG_CONFIG_PATH=/usr/lib/arm-linux-gnueabihf/pkgconfig:/usr/share/pkgconfig \
+    PKG_CONFIG_LIBDIR=/usr/lib/arm-linux-gnueabihf \
+    PKG_CONFIG_SYSROOT_DIR=/ \
+        cmake -S opencv \
+              -B build4-full_armhf \
+              -DENABLE_NEON=ON \
+              -DCMAKE_TOOLCHAIN_FILE=/home/kmtr/work/opencv/platforms/linux/arm-gnueabi.toolchain.cmake \
+              -DOPENCV_EXTRA_MODULES_PATH=opencv_contrib/modules \
+              -GNinja
+
+cmake      --build   build4-full_armhf
+sudo cmake --install build4-full_armhf
+tar czvf opencv_armhf.tgz -C build4-full_armhf/install .
+@endcode
+
+Following is cmake outputs.
+- `Host` is `Linux x86_64`.
+- `Target` is `Linux arm`.
+- FFmpeg is available.
+
+@code{.unparsed}
+-- General configuration for OpenCV 4.8.0-dev =====================================
+--   Version control:               408730b
+--
+--   Extra modules:
+--     Location (extra):            /home/kmtr/work/opencv_contrib/modules
+--     Version control (extra):     faa5468
+--
+--   Platform:
+--     Timestamp:                   2023-12-02T03:39:58Z
+--     Host:                        Linux 6.5.0-13-generic x86_64
+--     Target:                      Linux 1 arm
+--     CMake:                       3.27.4
+--     CMake generator:             Ninja
+--     CMake build tool:            /usr/bin/ninja
+--     Configuration:               Release
+--
+--   CPU/HW features:
+--     Baseline:                    NEON
+--       requested:                 DETECT
+--       required:                  NEON
+--       disabled:                  VFPV3
+--
+--   C/C++:
+--     Built as dynamic libs?:      YES
+--     C++ standard:                11
+--     C++ Compiler:                /usr/bin/arm-linux-gnueabihf-g++  (ver 13.2.0)
+
+:
+:
+
+--
+--   Video I/O:
+--     DC1394:                      NO
+--     FFMPEG:                      YES
+--       avcodec:                   YES (60.3.100)
+--       avformat:                  YES (60.3.100)
+--       avutil:                    YES (58.2.100)
+--       swscale:                   YES (7.1.100)
+--       avresample:                NO
+--     GStreamer:                   NO
+--     v4l/v4l2:                    YES (linux/videodev2.h)
+--
+
+@endcode
diff --git a/doc/tutorials/introduction/env_reference/env_reference.markdown b/doc/tutorials/introduction/env_reference/env_reference.markdown
new file mode 100644
index 000000000000..c25ea9e533d9
--- /dev/null
+++ b/doc/tutorials/introduction/env_reference/env_reference.markdown
@@ -0,0 +1,351 @@
+OpenCV environment variables reference {#tutorial_env_reference}
+======================================
+
+@prev_tutorial{tutorial_config_reference}
+@next_tutorial{tutorial_linux_install}
+
+@tableofcontents
+
+## Introduction
+
+OpenCV can change its behavior depending on the runtime environment:
+- enable extra debugging output or performance tracing
+- modify default locations and search paths
+- tune some algorithms or general behavior
+- enable or disable workarounds, safety features and optimizations
+
+**Notes:**
+- ⭐ marks most popular variables
+- variables with names like this `VAR_${NAME}` describes family of variables, where `${NAME}` should be changed to one of predefined values, e.g. `VAR_TBB`, `VAR_OPENMP`, ...
+
+### Setting environment variable in Windows
+In terminal or cmd-file (bat-file):
+```.bat
+set MY_ENV_VARIABLE=true
+C:\my_app.exe
+```
+In GUI:
+- Go to "Settings -> System -> About"
+- Click on "Advanced system settings" in the right part
+- In new window click on the "Environment variables" button
+- Add an entry to the "User variables" list
+
+### Setting environment variable in Linux
+
+In terminal or shell script:
+```.sh
+export MY_ENV_VARIABLE=true
+./my_app
+```
+or as a single command:
+```.sh
+MY_ENV_VARIABLE=true ./my_app
+```
+
+### Setting environment variable in Python
+
+```.py
+import os
+os.environ["MY_ENV_VARIABLE"] = "True" # value must be a string
+import cv2 # variables set after this may not have effect
+```
+
+@note This method may not work on all operating systems and/or Python distributions. For example, it works on Ubuntu Linux with system Python interpreter, but doesn't work on Windows 10 with the official Python package. It depends on the ability of a process to change its own environment (OpenCV uses `getenv` from C++ runtime to read variables).
+
+@note See also:
+- https://docs.python.org/3.12/library/os.html#os.environ
+- https://stackoverflow.com/questions/69199708/setenvironmentvariable-does-not-seem-to-set-values-that-can-be-retrieved-by-ge
+
+
+## Types
+
+- _non-null_ - set to anything to enable feature, in some cases can be interpreted as other types (e.g. path)
+- _bool_ - `1`, `True`, `true`, `TRUE` / `0`, `False`, `false`, `FALSE`
+- _number_/_size_ - unsigned number, suffixes `MB`, `Mb`, `mb`, `KB`, `Kb`, `kb`
+- _string_ - plain string or can have a structure
+- _path_ - to file, to directory
+- _paths_ - `;`-separated on Windows, `:`-separated on others
+
+
+## General, core
+| name | type | default | description |
+|------|------|---------|-------------|
+| OPENCV_SKIP_CPU_BASELINE_CHECK | non-null | | do not check that current CPU supports all features used by the build (baseline) |
+| OPENCV_CPU_DISABLE | `,` or `;`-separated | | disable code branches which use CPU features (dispatched code) |
+| OPENCV_SETUP_TERMINATE_HANDLER | bool | true (Windows) | use std::set_terminate to install own termination handler |
+| OPENCV_LIBVA_RUNTIME | file path | | libva for VA interoperability utils |
+| OPENCV_ENABLE_MEMALIGN | bool | true (except static analysis, memory sanitizer, fuzzying, _WIN32?) | enable aligned memory allocations |
+| OPENCV_BUFFER_AREA_ALWAYS_SAFE | bool | false | enable safe mode for multi-buffer allocations (each buffer separately) |
+| OPENCV_KMEANS_PARALLEL_GRANULARITY | num | 1000 | tune algorithm parallel work distribution parameter `parallel_for_(..., ..., ..., granularity)` |
+| OPENCV_DUMP_ERRORS | bool | true (Debug or Android), false (others) | print extra information on exception (log to Android) |
+| OPENCV_DUMP_CONFIG | non-null | | print build configuration to stderr (`getBuildInformation`) |
+| OPENCV_PYTHON_DEBUG | bool | false | enable extra warnings in Python bindings |
+| OPENCV_TEMP_PATH | non-null / path | `/tmp/` (Linux), `/data/local/tmp/` (Android), `GetTempPathA` (Windows) | directory for temporary files |
+| OPENCV_DATA_PATH_HINT | paths | | paths for findDataFile |
+| OPENCV_DATA_PATH | paths | | paths for findDataFile |
+| OPENCV_SAMPLES_DATA_PATH_HINT | paths | | paths for findDataFile |
+| OPENCV_SAMPLES_DATA_PATH | paths | | paths for findDataFile |
+
+Links:
+- https://github.com/opencv/opencv/wiki/CPU-optimizations-build-options
+
+
+## Logging
+| name | type | default | description |
+|------|------|---------|-------------|
+| ⭐ OPENCV_LOG_LEVEL | string | | logging level (see accepted values below) |
+| OPENCV_LOG_TIMESTAMP | bool | true | logging with timestamps |
+| OPENCV_LOG_TIMESTAMP_NS | bool | false | add nsec to logging timestamps |
+
+### Levels
+- `0`, `O`, `OFF`, `S`, `SILENT`, `DISABLE`, `DISABLED`
+- `F`, `FATAL`
+- `E`, `ERROR`
+- `W`, `WARNING`, `WARN`, `WARNINGS`
+- `I`, `INFO`
+- `D`, `DEBUG`
+- `V`, `VERBOSE`
+
+
+## core/parallel_for
+| name | type | default | description |
+|------|------|---------|-------------|
+| ⭐ OPENCV_FOR_THREADS_NUM | num | 0 | set number of threads |
+| OPENCV_THREAD_POOL_ACTIVE_WAIT_PAUSE_LIMIT | num | 16 | tune pthreads parallel_for backend |
+| OPENCV_THREAD_POOL_ACTIVE_WAIT_WORKER | num | 2000 | tune pthreads parallel_for backend |
+| OPENCV_THREAD_POOL_ACTIVE_WAIT_MAIN | num | 10000 | tune pthreads parallel_for backend |
+| OPENCV_THREAD_POOL_ACTIVE_WAIT_THREADS_LIMIT | num | 0 | tune pthreads parallel_for backend |
+| OPENCV_FOR_OPENMP_DYNAMIC_DISABLE | bool | false | use single OpenMP thread |
+
+
+## backends
+OPENCV_LEGACY_WAITKEY
+Some modules have multiple available backends, following variables allow choosing specific backend or changing default priorities in which backends will be probed (e.g. when opening a video file).
+
+| name | type | default | description |
+|------|------|---------|-------------|
+| OPENCV_PARALLEL_BACKEND | string | | choose specific paralel_for backend (one of `TBB`, `ONETBB`, `OPENMP`) |
+| OPENCV_PARALLEL_PRIORITY_${NAME} | num | | set backend priority, default is 1000 |
+| OPENCV_PARALLEL_PRIORITY_LIST | string, `,`-separated | | list of backends in priority order |
+| OPENCV_UI_BACKEND | string | | choose highgui backend for window rendering (one of `GTK`, `GTK3`, `GTK2`, `QT`, `WIN32`) |
+| OPENCV_UI_PRIORITY_${NAME} | num | | set highgui backend priority, default is 1000 |
+| OPENCV_UI_PRIORITY_LIST | string, `,`-separated | | list of highgui backends in priority order |
+| OPENCV_VIDEOIO_PRIORITY_${NAME} | num | | set videoio backend priority, default is 1000 |
+| OPENCV_VIDEOIO_PRIORITY_LIST | string, `,`-separated | | list of videoio backends in priority order |
+
+
+## plugins
+Some external dependencies can be detached into a dynamic library, which will be loaded at runtime (plugin). Following variables allow changing default search locations and naming pattern for these plugins.
+| name | type | default | description |
+|------|------|---------|-------------|
+| OPENCV_CORE_PLUGIN_PATH | paths | | directories to search for _core_ plugins |
+| OPENCV_CORE_PARALLEL_PLUGIN_${NAME} | string, glob | | parallel_for plugin library name (glob), e.g. default for TBB is "opencv_core_parallel_tbb*.so" |
+| OPENCV_DNN_PLUGIN_PATH | paths | | directories to search for _dnn_ plugins |
+| OPENCV_DNN_PLUGIN_${NAME} | string, glob | | parallel_for plugin library name (glob), e.g. default for TBB is "opencv_core_parallel_tbb*.so" |
+| OPENCV_CORE_PLUGIN_PATH | paths | | directories to search for _highgui_ plugins (YES it is CORE) |
+| OPENCV_UI_PLUGIN_${NAME} | string, glob | | _highgui_ plugin library name (glob) |
+| OPENCV_VIDEOIO_PLUGIN_PATH | paths | | directories to search for _videoio_ plugins |
+| OPENCV_VIDEOIO_PLUGIN_${NAME} | string, glob | | _videoio_ plugin library name (glob) |
+
+## OpenCL
+
+**Note:** OpenCL device specification format is `<Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>`, e.g. `AMD:GPU:`
+
+| name | type | default | description |
+|------|------|---------|-------------|
+| OPENCV_OPENCL_RUNTIME | filepath or `disabled` | | path to OpenCL runtime library (e.g. `OpenCL.dll`, `libOpenCL.so`) |
+| ⭐ OPENCV_OPENCL_DEVICE | string or `disabled` | | choose specific OpenCL device. See specification format in the note above. See more details in the Links section. |
+| OPENCV_OPENCL_RAISE_ERROR | bool | false | raise exception if something fails during OpenCL kernel preparation and execution (Release builds only) |
+| OPENCV_OPENCL_ABORT_ON_BUILD_ERROR | bool | false | abort if OpenCL kernel compilation failed |
+| OPENCV_OPENCL_CACHE_ENABLE | bool | true | enable OpenCL kernel cache |
+| OPENCV_OPENCL_CACHE_WRITE | bool | true | allow writing to the cache, otherwise cache will be read-only |
+| OPENCV_OPENCL_CACHE_LOCK_ENABLE | bool | true | use .lock files to synchronize between multiple applications using the same OpenCL cache (may not work on network drives) |
+| OPENCV_OPENCL_CACHE_CLEANUP | bool | true | automatically remove old entries from cache (leftovers from older OpenCL runtimes) |
+| OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS | bool | false | validate loaded binary OpenCL kernels |
+| OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS | bool | true (Apple), false (others) | enable workaround for non-continuos data downloads |
+| OPENCV_OPENCL_BUILD_EXTRA_OPTIONS | string | | pass extra options to OpenCL kernel compilation |
+| OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR | bool | true | workaround/optimization for buffer allocation |
+| OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR | num | 4 | parameter for OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR |
+| OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE | num | 0 | allow to decrease maxWorkGroupSize |
+| OPENCV_OPENCL_PROGRAM_CACHE | num | 0 | limit number of programs in OpenCL kernel cache |
+| OPENCV_OPENCL_RAISE_ERROR_REUSE_ASYNC_KERNEL | bool | false | raise exception if async kernel failed |
+| OPENCV_OPENCL_BUFFERPOOL_LIMIT | num | 1 << 27 (Intel device), 0 (others) | limit memory used by buffer bool |
+| OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT | num | | same as OPENCV_OPENCL_BUFFERPOOL_LIMIT, but for HOST_PTR buffers |
+| OPENCV_OPENCL_BUFFER_FORCE_MAPPING | bool | false | force clEnqueueMapBuffer |
+| OPENCV_OPENCL_BUFFER_FORCE_COPYING | bool | false | force clEnqueueReadBuffer/clEnqueueWriteBuffer |
+| OPENCV_OPENCL_FORCE | bool | false | force running OpenCL kernel even if usual conditions are not met (e.g. dst.isUMat) |
+| OPENCV_OPENCL_PERF_CHECK_BYPASS | bool | false | force running OpenCL kernel even if usual performance-related conditions are not met (e.g. image is very small) |
+
+### SVM (Shared Virtual Memory) - disabled by default
+| name | type | default | description |
+|------|------|---------|-------------|
+| OPENCV_OPENCL_SVM_DISABLE | bool | false | disable SVM |
+| OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE | bool | false | |
+| OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE | bool | false | |
+| OPENCV_OPENCL_SVM_CAPABILITIES_MASK | num | | |
+| OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT | num | | same as OPENCV_OPENCL_BUFFERPOOL_LIMIT, but for SVM buffers |
+
+### Links:
+- https://github.com/opencv/opencv/wiki/OpenCL-optimizations
+
+
+## Tracing/Profiling
+| name | type | default | description |
+|------|------|---------|-------------|
+| ⭐ OPENCV_TRACE | bool | false | enable trace |
+| OPENCV_TRACE_LOCATION | string | `OpenCVTrace` | trace file name ("${name}-$03d.txt") |
+| OPENCV_TRACE_DEPTH_OPENCV | num | 1 | |
+| OPENCV_TRACE_MAX_CHILDREN_OPENCV | num | 1000 | |
+| OPENCV_TRACE_MAX_CHILDREN | num | 1000 | |
+| OPENCV_TRACE_SYNC_OPENCL | bool | false | wait for OpenCL kernels to finish |
+| OPENCV_TRACE_ITT_ENABLE | bool | true | |
+| OPENCV_TRACE_ITT_PARENT | bool | false | set parentID for ITT task |
+| OPENCV_TRACE_ITT_SET_THREAD_NAME | bool | false | set name for OpenCV's threads "OpenCVThread-%03d" |
+
+### Links:
+- https://github.com/opencv/opencv/wiki/Profiling-OpenCV-Applications
+
+
+## Cache
+**Note:** Default tmp location is `%TMPDIR%` (Windows); `$XDG_CACHE_HOME`, `$HOME/.cache`, `/var/tmp`, `/tmp` (others)
+| name | type | default | description |
+|------|------|---------|-------------|
+| OPENCV_CACHE_SHOW_CLEANUP_MESSAGE | bool | true | show cache cleanup message |
+| OPENCV_DOWNLOAD_CACHE_DIR | path | default tmp location | cache directory for downloaded files (subdirectory `downloads`) |
+| OPENCV_DNN_IE_GPU_CACHE_DIR | path | default tmp location | cache directory for OpenVINO OpenCL kernels (subdirectory `dnn_ie_cache_${device}`) |
+| OPENCV_OPENCL_CACHE_DIR | path | default tmp location | cache directory for OpenCL kernels cache (subdirectory `opencl_cache`) |
+
+
+## dnn
+**Note:** In the table below `dump_base_name` equals to `ocv_dnn_net_%05d_%02d` where first argument is internal network ID and the second - dump level.
+| name | type | default | description |
+|------|------|---------|-------------|
+| OPENCV_DNN_BACKEND_DEFAULT | num | 3 (OpenCV) | set default DNN backend, see dnn.hpp for backends enumeration |
+| OPENCV_DNN_NETWORK_DUMP | num | 0 | level of information dumps, 0 - no dumps (default file name `${dump_base_name}.dot`) |
+| OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS | bool | false |  |
+| OPENCV_DNN_CHECK_NAN_INF | bool | false | check for NaNs in layer outputs |
+| OPENCV_DNN_CHECK_NAN_INF_DUMP | bool | false | print layer data when NaN check has failed |
+| OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR | bool | false | also raise exception when NaN check has failed |
+| OPENCV_DNN_ONNX_USE_LEGACY_NAMES | bool | false | use ONNX node names as-is instead of "onnx_node!${node_name}" |
+| OPENCV_DNN_CUSTOM_ONNX_TYPE_INCLUDE_DOMAIN_NAME | bool | true | prepend layer domain to layer types ("domain.type") |
+| OPENCV_VULKAN_RUNTIME | file path | | set location of Vulkan runtime library for DNN Vulkan backend |
+| OPENCV_DNN_IE_SERIALIZE | bool | false | dump intermediate OpenVINO graph (default file names `${dump_base_name}_ngraph.xml`, `${dump_base_name}_ngraph.bin`) |
+| OPENCV_DNN_IE_EXTRA_PLUGIN_PATH | path | | path to extra OpenVINO plugins |
+| OPENCV_DNN_IE_VPU_TYPE | string | | Force using specific OpenVINO VPU device type ("Myriad2" or "MyriadX") |
+| OPENCV_TEST_DNN_IE_VPU_TYPE | string | | same as OPENCV_DNN_IE_VPU_TYPE, but for tests |
+| OPENCV_DNN_INFERENCE_ENGINE_HOLD_PLUGINS | bool | true | always hold one existing OpenVINO instance to avoid crashes on unloading |
+| OPENCV_DNN_INFERENCE_ENGINE_CORE_LIFETIME_WORKAROUND | bool | true (Windows), false (other) | another OpenVINO lifetime workaround |
+| OPENCV_DNN_OPENCL_ALLOW_ALL_DEVICES | bool | false | allow running on CPU devices, allow FP16 on non-Intel device |
+| OPENCV_OCL4DNN_CONVOLUTION_IGNORE_INPUT_DIMS_4_CHECK | bool | false | workaround for OpenCL backend, see https://github.com/opencv/opencv/issues/20833 |
+| OPENCV_OCL4DNN_WORKAROUND_IDLF | bool | true | another workaround for OpenCL backend |
+| OPENCV_OCL4DNN_CONFIG_PATH | path | | path to kernel configuration cache for auto-tuning (must be existing directory), set this variable to enable auto-tuning |
+| OPENCV_OCL4DNN_DISABLE_AUTO_TUNING | bool | false | disable auto-tuning |
+| OPENCV_OCL4DNN_FORCE_AUTO_TUNING | bool | false | force auto-tuning |
+| OPENCV_OCL4DNN_TEST_ALL_KERNELS | num | 0 | test convolution kernels, number of iterations (auto-tuning) |
+| OPENCV_OCL4DNN_DUMP_FAILED_RESULT | bool | false | dump extra information on errors (auto-tuning) |
+| OPENCV_OCL4DNN_TUNING_RAISE_CHECK_ERROR | bool | false | raise exception on errors (auto-tuning) |
+
+
+## Tests
+| name | type | default | description |
+|------|------|---------|-------------|
+| ⭐ OPENCV_TEST_DATA_PATH | dir path | | set test data search location (e.g. `/home/user/opencv_extra/testdata`) |
+| ⭐ OPENCV_DNN_TEST_DATA_PATH | dir path | `$OPENCV_TEST_DATA_PATH/dnn` | set DNN model search location for tests (used by _dnn_, _gapi_, _objdetect_, _video_ modules)  |
+| OPENCV_OPEN_MODEL_ZOO_DATA_PATH | dir path | `$OPENCV_DNN_TEST_DATA_PATH/omz_intel_models` | set OpenVINO models search location for tests (used by _dnn_, _gapi_ modules) |
+| INTEL_CVSDK_DIR | | | some _dnn_ tests can search OpenVINO models here too |
+| OPENCV_TEST_DEBUG | num | 0 | debug level for tests, same as `--test_debug` (0 - no debug (default), 1 - basic test debug information, >1 - extra debug information) |
+| OPENCV_TEST_REQUIRE_DATA | bool | false | same as `--test_require_data` option (fail on missing non-required test data instead of skip) |
+| OPENCV_TEST_CHECK_OPTIONAL_DATA | bool | false | assert when optional data is not found |
+| OPENCV_IPP_CHECK | bool | false | default value for `--test_ipp_check` and `--perf_ipp_check` |
+| OPENCV_PERF_VALIDATION_DIR | dir path | | location of files read/written by `--perf_read_validation_results`/`--perf_write_validation_results` |
+| ⭐ OPENCV_PYTEST_FILTER | string (glob) | | test filter for Python tests |
+
+### Links:
+* https://github.com/opencv/opencv/wiki/QA_in_OpenCV
+
+
+## videoio
+**Note:** extra FFmpeg options should be pased in form `key;value|key;value|key;value`, for example `hwaccel;cuvid|video_codec;h264_cuvid|vsync;0` or `vcodec;x264|vprofile;high|vlevel;4.0`
+
+| name | type | default | description |
+|------|------|---------|-------------|
+| ⭐ OPENCV_FFMPEG_CAPTURE_OPTIONS | string (see note) | | extra options for VideoCapture FFmpeg backend |
+| ⭐ OPENCV_FFMPEG_WRITER_OPTIONS | string (see note) | | extra options for VideoWriter FFmpeg backend |
+| OPENCV_FFMPEG_THREADS | num | | set FFmpeg thread count |
+| OPENCV_FFMPEG_DEBUG | non-null | | enable logging messages from FFmpeg |
+| OPENCV_FFMPEG_LOGLEVEL | num | | set FFmpeg logging level |
+| OPENCV_FFMPEG_DLL_DIR | dir path | | directory with FFmpeg plugin (legacy) |
+| OPENCV_FFMPEG_IS_THREAD_SAFE | bool | false | enabling this option will turn off thread safety locks in the FFmpeg backend (use only if you are sure FFmpeg is built with threading support, tested on Linux) |
+| OPENCV_FFMPEG_READ_ATTEMPTS | num | 4096 | number of failed `av_read_frame` attempts before failing read procedure |
+| OPENCV_FFMPEG_DECODE_ATTEMPTS | num | 64 | number of failed `avcodec_receive_frame` attempts before failing decoding procedure |
+| OPENCV_VIDEOIO_GSTREAMER_CALL_DEINIT | bool | false | close GStreamer instance on end |
+| OPENCV_VIDEOIO_GSTREAMER_START_MAINLOOP | bool | false | start GStreamer loop in separate thread |
+| OPENCV_VIDEOIO_MFX_IMPL | num | | set specific MFX implementation (see MFX docs for enumeration) |
+| OPENCV_VIDEOIO_MFX_EXTRA_SURFACE_NUM | num | 1 | add extra surfaces to the surface pool |
+| OPENCV_VIDEOIO_MFX_POOL_TIMEOUT | num | 1 | timeout for waiting for free surface from the pool (in seconds) |
+| OPENCV_VIDEOIO_MFX_BITRATE_DIVISOR | num | 300 | this option allows to tune encoding bitrate (video quality/size) |
+| OPENCV_VIDEOIO_MFX_WRITER_TIMEOUT | num | 1 | timeout for encoding operation (in seconds) |
+| OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS | bool | true | allow HW-accelerated transformations (DXVA) in MediaFoundation processing graph (may slow down camera probing process) |
+| OPENCV_DSHOW_DEBUG | non-null | | enable verbose logging in the DShow backend |
+| OPENCV_DSHOW_SAVEGRAPH_FILENAME | file path | | enable processing graph tump in the DShow backend |
+| OPENCV_VIDEOIO_V4L_RANGE_NORMALIZED | bool | false | use (0, 1) range for properties (V4L) |
+| OPENCV_VIDEOIO_V4L_SELECT_TIMEOUT | num | 10 | timeout for select call (in seconds) (V4L) |
+| OPENCV_VIDEOCAPTURE_DEBUG | bool | false | enable debug messages for VideoCapture |
+| OPENCV_VIDEOWRITER_DEBUG | bool | false | enable debug messages for VideoWriter |
+| ⭐ OPENCV_VIDEOIO_DEBUG | bool | false | debug messages for both VideoCapture and VideoWriter |
+
+### videoio tests
+| name | type | default | description |
+|------|------|---------|-------------|
+| OPENCV_TEST_VIDEOIO_BACKEND_REQUIRE_FFMPEG | | | test app will exit if no FFmpeg backend is available |
+| OPENCV_TEST_V4L2_VIVID_DEVICE | file path | | path to VIVID virtual camera device for V4L2 test (e.g. `/dev/video5`) |
+| OPENCV_TEST_PERF_CAMERA_LIST | paths | | cameras to use in performance test (waitAny_V4L test) |
+| OPENCV_TEST_CAMERA_%d_FPS | num | | fps to set for N-th camera (0-based index) (waitAny_V4L test) |
+
+
+## gapi
+| name | type | default | description |
+|------|------|---------|-------------|
+| ⭐ GRAPH_DUMP_PATH | file path | | dump graph (dot format) |
+| PIPELINE_MODELS_PATH | dir path | | pipeline_modeling_tool sample application uses this var |
+| OPENCV_GAPI_INFERENCE_ENGINE_CORE_LIFETIME_WORKAROUND | bool | true (Windows, Apple), false (others) | similar to OPENCV_DNN_INFERENCE_ENGINE_CORE_LIFETIME_WORKAROUND |
+
+### gapi tests/samples
+| name | type | default | description |
+|------|------|---------|-------------|
+| PLAIDML_DEVICE | string | | specific to PlaidML backend test |
+| PLAIDML_TARGET | string | | specific to PlaidML backend test |
+| OPENCV_GAPI_ONNX_MODEL_PATH | dir path | | search location for ONNX models test |
+| OPENCV_TEST_FREETYPE_FONT_PATH | file path | | location of TrueType font for one of tests |
+
+### Links:
+* https://github.com/opencv/opencv/wiki/Using-G-API-with-OpenVINO-Toolkit
+* https://github.com/opencv/opencv/wiki/Using-G-API-with-MS-ONNX-Runtime
+
+
+## highgui
+
+| name | type | default | description |
+|------|------|---------|-------------|
+| OPENCV_LEGACY_WAITKEY | non-null | | switch `waitKey` return result (default behavior: `return code & 0xff` (or -1), legacy behavior: `return code`) |
+| $XDG_RUNTIME_DIR | | | Wayland backend specific - create shared memory-mapped file for interprocess communication (named `opencv-shared-??????`) |
+
+
+## imgproc
+| name | type | default | description |
+|------|------|---------|-------------|
+| OPENCV_OPENCL_IMGPROC_MORPH_SPECIAL_KERNEL | bool | true (Apple), false (others) | use special OpenCL kernel for small morph kernel (Intel devices) |
+| OPENCV_GAUSSIANBLUR_CHECK_BITEXACT_KERNELS | bool | false | validate Gaussian kernels before running (src is CV_16U, bit-exact version) |
+
+
+## imgcodecs
+| name | type | default | description |
+|------|------|---------|-------------|
+| OPENCV_IMGCODECS_AVIF_MAX_FILE_SIZE | num | 64MB | limit input AVIF size |
+| OPENCV_IMGCODECS_WEBP_MAX_FILE_SIZE | num | 64MB | limit input WEBM size |
+| OPENCV_IO_MAX_IMAGE_PARAMS | num | 50 | limit maximum allowed number of parameters in imwrite and imencode |
+| OPENCV_IO_MAX_IMAGE_WIDTH | num | 1 << 20, limit input image size to avoid large memory allocations | |
+| OPENCV_IO_MAX_IMAGE_HEIGHT | num | 1 << 20 | |
+| OPENCV_IO_MAX_IMAGE_PIXELS | num | 1 << 30 | |
+| OPENCV_IO_ENABLE_OPENEXR | bool | true (set build option OPENCV_IO_FORCE_OPENEXR or use external OpenEXR), false (otherwise) | enable OpenEXR backend |
+| OPENCV_IO_ENABLE_JASPER | bool | true (set build option OPENCV_IO_FORCE_JASPER), false (otherwise) | enable Jasper backend |
diff --git a/doc/tutorials/introduction/java_eclipse/java_eclipse.markdown b/doc/tutorials/introduction/java_eclipse/java_eclipse.markdown
index 9d9434adf791..46e2aac5d168 100644
--- a/doc/tutorials/introduction/java_eclipse/java_eclipse.markdown
+++ b/doc/tutorials/introduction/java_eclipse/java_eclipse.markdown
@@ -21,7 +21,7 @@ less mistakes. Here we go.
 Configuring Eclipse
 -------------------
 
-First, obtain a fresh release of OpenCV [from download page](http://opencv.org/releases.html) and
+First, obtain a fresh release of OpenCV [from download page](https://opencv.org/releases) and
 extract it under a simple location like `C:\OpenCV-2.4.6\`. I am using version 2.4.6, but the steps
 are more or less the same for other versions.
 
diff --git a/doc/tutorials/introduction/linux_install/linux_install.markdown b/doc/tutorials/introduction/linux_install/linux_install.markdown
index e69f6ea70749..77855e069eb9 100644
--- a/doc/tutorials/introduction/linux_install/linux_install.markdown
+++ b/doc/tutorials/introduction/linux_install/linux_install.markdown
@@ -65,7 +65,7 @@ There are two methods of getting OpenCV sources:
 
 
 @note
-Snapshots of other branches, releases or commits can be found on the [GitHub](https://github.com/opencv/opencv) and the [official download page](https://opencv.org/releases.html).
+Snapshots of other branches, releases or commits can be found on the [GitHub](https://github.com/opencv/opencv) and the [official download page](https://opencv.org/releases).
 
 
 ## Configure and build {#tutorial_linux_install_detailed_basic_build}
diff --git a/doc/tutorials/introduction/macos_install/macos_install.markdown b/doc/tutorials/introduction/macos_install/macos_install.markdown
index dadce9304c81..f979f50b94f5 100644
--- a/doc/tutorials/introduction/macos_install/macos_install.markdown
+++ b/doc/tutorials/introduction/macos_install/macos_install.markdown
@@ -53,7 +53,7 @@ You can use the latest stable OpenCV version or you can grab the latest snapshot
 
 ### Getting the Latest Stable OpenCV Version
 
--   Go to our [downloads page](http://opencv.org/releases.html).
+-   Go to our [downloads page](https://opencv.org/releases).
 -   Download the source archive and unpack it.
 
 ### Getting the Cutting-edge OpenCV from the Git Repository
diff --git a/doc/tutorials/introduction/table_of_content_introduction.markdown b/doc/tutorials/introduction/table_of_content_introduction.markdown
index 8fa89d7d7f9b..22d9688291f9 100644
--- a/doc/tutorials/introduction/table_of_content_introduction.markdown
+++ b/doc/tutorials/introduction/table_of_content_introduction.markdown
@@ -3,6 +3,7 @@ Introduction to OpenCV {#tutorial_table_of_content_introduction}
 
 - @subpage tutorial_general_install
 - @subpage tutorial_config_reference
+- @subpage tutorial_env_reference
 
 ##### Linux
 -   @subpage tutorial_linux_install
@@ -20,13 +21,14 @@ Introduction to OpenCV {#tutorial_table_of_content_introduction}
 -   @subpage tutorial_java_eclipse
 -   @subpage tutorial_clojure_dev_intro
 -   @subpage tutorial_android_dev_intro
--   @subpage tutorial_O4A_SDK
 -   @subpage tutorial_dev_with_OCV_on_Android
+-   @subpage tutorial_android_dnn_intro
 -   @subpage tutorial_android_ocl_intro
 
 ##### Other platforms
 -   @subpage tutorial_macos_install
 -   @subpage tutorial_arm_crosscompile_with_cmake
+-   @subpage tutorial_crosscompile_with_multiarch
 -   @subpage tutorial_building_tegra_cuda
 -   @ref tutorial_ios_install
 
diff --git a/doc/tutorials/introduction/transition_guide/transition_guide.markdown b/doc/tutorials/introduction/transition_guide/transition_guide.markdown
index 454d3ca05167..43bc5dd9580c 100644
--- a/doc/tutorials/introduction/transition_guide/transition_guide.markdown
+++ b/doc/tutorials/introduction/transition_guide/transition_guide.markdown
@@ -201,7 +201,7 @@ All specialized `ocl` implementations has been hidden behind general C++ algorit
 
 New class cv::UMat is intended to hide data exchange with OpenCL device in a convenient way.
 
-Following example illustrate API modifications (from [OpenCV site](http://opencv.org/platforms/opencl.html)):
+Following example illustrate API modifications (from [OpenCV site](https://opencv.org/opencl)):
 
 -   OpenCL-aware code OpenCV-2.x
 @code{.cpp}
diff --git a/doc/tutorials/introduction/windows_install/windows_install.markdown b/doc/tutorials/introduction/windows_install/windows_install.markdown
index eabf31482fd3..36d4e0ccba93 100644
--- a/doc/tutorials/introduction/windows_install/windows_install.markdown
+++ b/doc/tutorials/introduction/windows_install/windows_install.markdown
@@ -22,8 +22,7 @@ best to help you out.
 with the latest Microsoft Visual Studio IDE and do not take advantage of the most advanced
 technologies we integrate into our library. .. _Windows_Install_Prebuild:
 
-Installation by Using the Pre-built Libraries {#tutorial_windows_install_prebuilt}
-=============================================
+## Installation by Using the Pre-built Libraries {#tutorial_windows_install_prebuilt}
 
 -#  Launch a web browser of choice and go to our [page on
     Sourceforge](http://sourceforge.net/projects/opencvlibrary/files/).
@@ -35,8 +34,7 @@ Installation by Using the Pre-built Libraries {#tutorial_windows_install_prebuil
 
 -#  To finalize the installation go to the @ref tutorial_windows_install_path section.
 
-Installation by Using git-bash (version>=2.14.1) and cmake (version >=3.9.1){#tutorial_windows_gitbash_build}
-===============================================================
+## Installation by Using git-bash (version>=2.14.1) and cmake (version >=3.9.1){#tutorial_windows_gitbash_build}
 
 -#  You must download [cmake (version >=3.9.1)](https://cmake.org) and install it. You must add cmake to PATH variable during installation
 
@@ -108,8 +106,7 @@ CMAKE_OPTIONS=(-DBUILD_PERF_TESTS:BOOL=OFF -DBUILD_TESTS:BOOL=OFF -DBUILD_DOCS:B
 -# Next time you run this script, opencv and opencv_contrib will be updated and rebuild
 
 
-Installation by Making Your Own Libraries from the Source Files {#tutorial_windows_install_build}
-===============================================================
+## Installation by Making Your Own Libraries from the Source Files {#tutorial_windows_install_build}
 
 You may find the content of this tutorial also inside the following videos:
 [Part 1](https://www.youtube.com/watch?v=NnovZ1cTlMs) and [Part 2](https://www.youtube.com/watch?v=qGNWMcfWwPU), hosted on YouTube.
@@ -364,8 +361,7 @@ libraries). If you do not need the support for some of these, you can just freel
     caused mostly by old video card drivers. For testing the GPU (if built) run the
     *performance_gpu.exe* sample application.
 
-Set the OpenCV environment variable and add it to the systems path {#tutorial_windows_install_path}
-=================================================================
+## Set the OpenCV environment variable and add it to the systems path {#tutorial_windows_install_path}
 
 First, we set an environment variable to make our work easier. This will hold the build directory of
 our OpenCV library that we use in our projects. Start up a command window and enter:
@@ -378,6 +374,9 @@ our OpenCV library that we use in our projects. Start up a command window and en
 
     setx OpenCV_DIR D:\OpenCV\build\x64\vc16     (suggested for Visual Studio 2019 - 64 bit Windows)
     setx OpenCV_DIR D:\OpenCV\build\x86\vc16     (suggested for Visual Studio 2019 - 32 bit Windows)
+
+    setx OpenCV_DIR D:\OpenCV\build\x64\vc17     (suggested for Visual Studio 2022 - 64 bit Windows)
+    setx OpenCV_DIR D:\OpenCV\build\x86\vc17     (suggested for Visual Studio 2022 - 32 bit Windows)
 @endcode
 Here the directory is where you have your OpenCV binaries (*extracted* or *built*). You can have
 different platform (e.g. x64 instead of x86) or compiler type, so substitute appropriate value.
diff --git a/doc/tutorials/objdetect/aruco_board_detection/aruco_board_detection.markdown b/doc/tutorials/objdetect/aruco_board_detection/aruco_board_detection.markdown
new file mode 100644
index 000000000000..56eb1a75d1e2
--- /dev/null
+++ b/doc/tutorials/objdetect/aruco_board_detection/aruco_board_detection.markdown
@@ -0,0 +1,202 @@
+Detection of ArUco boards {#tutorial_aruco_board_detection}
+=========================
+
+@prev_tutorial{tutorial_aruco_detection}
+@next_tutorial{tutorial_charuco_detection}
+
+|    |    |
+| -: | :- |
+| Original authors | Sergio Garrido, Alexander Panov |
+| Compatibility    | OpenCV >= 4.7.0 |
+
+An ArUco board is a set of markers that acts like a single marker in the sense that it provides a
+single pose for the camera.
+
+The most popular board is the one with all the markers in the same plane, since it can be easily printed:
+
+![](images/gboriginal.jpg)
+
+However, boards are not limited to this arrangement and can represent any 2d or 3d layout.
+
+The difference between a board and a set of independent markers is that the relative position between
+the markers in the board is known a priori. This allows that the corners of all the markers can be used for
+estimating the pose of the camera respect to the whole board.
+
+When you use a set of independent markers, you can estimate the pose for each marker individually,
+since you dont know the relative position of the markers in the environment.
+
+The main benefits of using boards are:
+
+- The pose estimation is much more versatile. Only some markers are necessary to perform pose estimation.
+Thus, the pose can be calculated even in the presence of occlusions or partial views.
+- The obtained pose is usually more accurate since a higher amount of point correspondences (marker
+corners) are employed.
+
+Board Detection
+---------------
+
+A board detection is similar to the standard marker detection. The only difference is in the pose estimation step.
+In fact, to use marker boards, a standard marker detection should be done before estimating the board pose.
+
+To perform pose estimation for boards, you should use `solvePnP()` function, as shown below
+in the `samples/cpp/tutorial_code/objectDetection/detect_board.cpp`.
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_board.cpp aruco_detect_board_full_sample
+
+
+The parameters are:
+
+- `objPoints`, `imgPoints` object and image points, matched with `cv::aruco::GridBoard::matchImagePoints()`
+   which, in turn, takes as input `markerCorners` and `markerIds` structures of detected markers from
+   `cv::aruco::ArucoDetector::detectMarkers()` function.
+- `board` the `cv::aruco::Board` object that defines the board layout and its ids
+- `cameraMatrix` and `distCoeffs`: camera calibration parameters necessary for pose estimation.
+- `rvec` and `tvec`: estimated pose of the board. If not empty then treated as initial guess.
+- The function returns the total number of markers employed for estimating the board pose.
+
+The drawFrameAxes() function can be used to check the obtained pose. For instance:
+
+![Board with axis](images/gbmarkersaxis.jpg)
+
+And this is another example with the board partially occluded:
+
+![Board with occlusions](images/gbocclusion.jpg)
+
+As it can be observed, although some markers have not been detected, the board pose can still be
+estimated from the rest of markers.
+
+Sample video:
+
+@youtube{Q1HlJEjW_j0}
+
+A full working example is included in the `detect_board.cpp` inside the `samples/cpp/tutorial_code/objectDetection/`.
+
+The samples now take input via command line via the `cv::CommandLineParser`. For this file the example
+parameters will look like:
+@code{.cpp}
+    -w=5 -h=7 -l=100 -s=10
+    -v=/path_to_opencv/opencv/doc/tutorials/objdetect/aruco_board_detection/gboriginal.jpg
+    -c=/path_to_opencv/opencv/samples/cpp/tutorial_code/objectDetection/tutorial_camera_params.yml
+    -cd=/path_to_opencv/opencv/samples/cpp/tutorial_code/objectDetection/tutorial_dict.yml
+@endcode
+Parameters for `detect_board.cpp`:
+@snippet samples/cpp/tutorial_code/objectDetection/detect_board.cpp aruco_detect_board_keys
+
+Grid Board
+----------
+
+Creating the `cv::aruco::Board` object requires specifying the corner positions for each marker in the environment.
+However, in many cases, the board will be just a set of markers in the same plane and in a grid layout,
+so it can be easily printed and used.
+
+Fortunately, the aruco module provides the basic functionality to create and print these types of markers
+easily.
+
+The `cv::aruco::GridBoard` class is a specialized class that inherits from the `cv::aruco::Board`
+class and which represents a Board with all the markers in the same plane and in a grid layout,
+as in the following image:
+
+![Image with aruco board](images/gboriginal.jpg)
+
+Concretely, the coordinate system in a grid board is positioned in the board plane, centered in the bottom left
+corner of the board and with the Z pointing out, like in the following image (X:red, Y:green, Z:blue):
+
+![Board with axis](images/gbaxis.jpg)
+
+A `cv::aruco::GridBoard` object can be defined using the following parameters:
+
+- Number of markers in the X direction.
+- Number of markers in the Y direction.
+- Length of the marker side.
+- Length of the marker separation.
+- The dictionary of the markers.
+- Ids of all the markers (X*Y markers).
+
+This object can be easily created from these parameters using the `cv::aruco::GridBoard` constructor:
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_board.cpp aruco_create_board
+
+- The first and second parameters are the number of markers in the X and Y direction respectively.
+- The third and fourth parameters are the marker length and the marker separation respectively.
+  They can be provided in any unit, having in mind that the estimated pose for this board will be
+  measured in the same units (in general, meters are used).
+- Finally, the dictionary of the markers is provided.
+
+So, this board will be composed by 5x7=35 markers. The ids of each of the markers are assigned, by default,
+in ascending order starting on 0, so they will be 0, 1, 2, ..., 34.
+
+After creating a grid board, we probably want to print it and use it.
+There are two ways to do this:
+1. By using the script `doc/patter_tools/gen_pattern.py `, see @subpage tutorial_camera_calibration_pattern.
+2. By using the function `cv::aruco::GridBoard::generateImage()`.
+
+The function `cv::aruco::GridBoard::generateImage()` is provided in cv::aruco::GridBoard class and
+can be called by using the following code:
+
+@snippet samples/cpp/tutorial_code/objectDetection/create_board.cpp aruco_generate_board_image
+
+- The first parameter is the size of the output image in pixels. In this case 600x500 pixels. If this is not proportional
+to the board dimensions, it will be centered on the image.
+- `boardImage`: the output image with the board.
+- The third parameter is the (optional) margin in pixels, so none of the markers are touching the image border.
+In this case the margin is 10.
+- Finally, the size of the marker border, similarly to `generateImageMarker()` function. The default value is 1.
+
+A full working example of board creation is included in the `samples/cpp/tutorial_code/objectDetection/create_board.cpp`
+
+The output image will be something like this:
+
+![](images/board.png)
+
+The samples now take input via commandline via the `cv::CommandLineParser`. For this file the example
+parameters will look like:
+@code{.cpp}
+    "_output_path_/aboard.png" -w=5 -h=7 -l=100 -s=10 -d=10
+@endcode
+
+Refine marker detection
+-----------------------
+
+ArUco boards can also be used to improve the detection of markers. If we have detected a subset of the markers
+that belongs to the board, we can use these markers and the board layout information to try to find the
+markers that have not been previously detected.
+
+This can be done using the `cv::aruco::refineDetectedMarkers()` function, which should be called
+after calling `cv::aruco::ArucoDetector::detectMarkers()`.
+
+The main parameters of this function are the original image where markers were detected, the board object,
+the detected marker corners, the detected marker ids and the rejected marker corners.
+
+The rejected corners can be obtained from the `cv::aruco::ArucoDetector::detectMarkers()` function and
+are also known as marker candidates. This candidates are square shapes that have been found in the
+original image but have failed to pass the identification step (i.e. their inner codification presents
+too many errors) and thus they have not been recognized as markers.
+
+However, these candidates are sometimes actual markers that have not been correctly identified due to high
+noise in the image, very low resolution or other related problems that affect to the binary code extraction.
+The `cv::aruco::ArucoDetector::refineDetectedMarkers()` function finds correspondences between these
+candidates and the missing markers of the board. This search is based on two parameters:
+
+- Distance between the candidate and the projection of the missing marker. To obtain these projections,
+it is necessary to have detected at least one marker of the board. The projections are obtained using the
+camera parameters (camera matrix and distortion coefficients) if they are provided. If not, the projections
+are obtained from local homography and only planar board are allowed (i.e. the Z coordinate of all the
+marker corners should be the same). The `minRepDistance` parameter in `refineDetectedMarkers()`
+determines the minimum euclidean distance between the candidate corners and the projected marker corners
+(default value 10).
+
+- Binary codification. If a candidate surpasses the minimum distance condition, its internal bits
+are analyzed again to determine if it is actually the projected marker or not. However, in this case,
+the condition is not so strong and the number of allowed erroneous bits can be higher. This is indicated
+in the `errorCorrectionRate` parameter (default value 3.0). If a negative value is provided, the
+internal bits are not analyzed at all and only the corner distances are evaluated.
+
+This is an example of using the `cv::aruco::ArucoDetector::refineDetectedMarkers()` function:
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_board.cpp aruco_detect_and_refine
+
+It must also be noted that, in some cases, if the number of detected markers in the first place is
+too low (for instance only 1 or 2 markers), the projections of the missing markers can be of bad
+quality, producing erroneous correspondences.
+
+See module samples for a more detailed implementation.
diff --git a/doc/tutorials/objdetect/aruco_board_detection/images/board.png b/doc/tutorials/objdetect/aruco_board_detection/images/board.png
new file mode 100644
index 000000000000..0f2c25b55954
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_board_detection/images/board.png differ
diff --git a/doc/tutorials/objdetect/aruco_board_detection/images/gbaxis.jpg b/doc/tutorials/objdetect/aruco_board_detection/images/gbaxis.jpg
new file mode 100644
index 000000000000..ce1434378403
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_board_detection/images/gbaxis.jpg differ
diff --git a/doc/tutorials/objdetect/aruco_board_detection/images/gbmarkersaxis.jpg b/doc/tutorials/objdetect/aruco_board_detection/images/gbmarkersaxis.jpg
new file mode 100644
index 000000000000..d1d5fcba49a1
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_board_detection/images/gbmarkersaxis.jpg differ
diff --git a/doc/tutorials/objdetect/aruco_board_detection/images/gbocclusion.jpg b/doc/tutorials/objdetect/aruco_board_detection/images/gbocclusion.jpg
new file mode 100644
index 000000000000..9222b7d2b957
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_board_detection/images/gbocclusion.jpg differ
diff --git a/doc/tutorials/objdetect/aruco_board_detection/images/gboriginal.jpg b/doc/tutorials/objdetect/aruco_board_detection/images/gboriginal.jpg
new file mode 100644
index 000000000000..8343139aedee
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_board_detection/images/gboriginal.jpg differ
diff --git a/modules/objdetect/tutorials/images/singlemarkersaxes.jpg b/doc/tutorials/objdetect/aruco_board_detection/images/singlemarkersaxes.jpg
similarity index 100%
rename from modules/objdetect/tutorials/images/singlemarkersaxes.jpg
rename to doc/tutorials/objdetect/aruco_board_detection/images/singlemarkersaxes.jpg
diff --git a/doc/tutorials/objdetect/aruco_calibration/aruco_calibration.markdown b/doc/tutorials/objdetect/aruco_calibration/aruco_calibration.markdown
new file mode 100644
index 000000000000..e36e335d4f32
--- /dev/null
+++ b/doc/tutorials/objdetect/aruco_calibration/aruco_calibration.markdown
@@ -0,0 +1,88 @@
+Calibration with ArUco and ChArUco {#tutorial_aruco_calibration}
+==================================
+
+@prev_tutorial{tutorial_charuco_diamond_detection}
+@next_tutorial{tutorial_aruco_faq}
+
+The ArUco module can also be used to calibrate a camera. Camera calibration consists in obtaining the
+camera intrinsic parameters and distortion coefficients. This parameters remain fixed unless the camera
+optic is modified, thus camera calibration only need to be done once.
+
+Camera calibration is usually performed using the OpenCV `cv::calibrateCamera()` function. This function
+requires some correspondences between environment points and their projection in the camera image from
+different viewpoints. In general, these correspondences are obtained from the corners of chessboard
+patterns. See `cv::calibrateCamera()` function documentation or the OpenCV calibration tutorial for
+more detailed information.
+
+Using the ArUco module, calibration can be performed based on ArUco markers corners or ChArUco corners.
+Calibrating using ArUco is much more versatile than using traditional chessboard patterns, since it
+allows occlusions or partial views.
+
+As it can be stated, calibration can be done using both, marker corners or ChArUco corners. However,
+it is highly recommended using the ChArUco corners approach since the provided corners are much
+more accurate in comparison to the marker corners. Calibration using a standard Board should only be
+employed in those scenarios where the ChArUco boards cannot be employed because of any kind of restriction.
+
+Calibration with ChArUco Boards
+-------------------------------
+
+To calibrate using a ChArUco board, it is necessary to detect the board from different viewpoints, in the
+same way that the standard calibration does with the traditional chessboard pattern. However, due to the
+benefits of using ChArUco, occlusions and partial views are allowed, and not all the corners need to be
+visible in all the viewpoints.
+
+![ChArUco calibration viewpoints](images/charucocalibration.jpg)
+
+The example of using `cv::calibrateCamera()` for cv::aruco::CharucoBoard:
+
+@snippet samples/cpp/tutorial_code/objectDetection/calibrate_camera_charuco.cpp CalibrationWithCharucoBoard1
+@snippet samples/cpp/tutorial_code/objectDetection/calibrate_camera_charuco.cpp CalibrationWithCharucoBoard2
+@snippet samples/cpp/tutorial_code/objectDetection/calibrate_camera_charuco.cpp CalibrationWithCharucoBoard3
+
+The ChArUco corners and ChArUco identifiers captured on each viewpoint are stored in the vectors
+`allCharucoCorners` and `allCharucoIds`, one element per viewpoint.
+
+The `calibrateCamera()` function will fill the `cameraMatrix` and `distCoeffs` arrays with the
+camera calibration parameters. It will return the reprojection error obtained from the calibration.
+The elements in `rvecs` and `tvecs` will be filled with the estimated pose of the camera
+(respect to the ChArUco board) in each of the viewpoints.
+
+Finally, the `calibrationFlags` parameter determines some of the options for the calibration.
+
+A full working example is included in the `calibrate_camera_charuco.cpp` inside the
+`samples/cpp/tutorial_code/objectDetection` folder.
+
+The samples now take input via commandline via the `cv::CommandLineParser`. For this file the example
+parameters will look like:
+@code{.cpp}
+    "camera_calib.txt" -w=5 -h=7 -sl=0.04 -ml=0.02 -d=10
+    -v=path/img_%02d.jpg
+@endcode
+
+The camera calibration parameters from `opencv/samples/cpp/tutorial_code/objectDetection/tutorial_camera_charuco.yml`
+were obtained by the `img_00.jpg-img_03.jpg` placed from this
+[folder](https://github.com/opencv/opencv_contrib/tree/4.6.0/modules/aruco/tutorials/aruco_calibration/images).
+
+Calibration with ArUco Boards
+-----------------------------
+
+As it has been stated, it is recommended the use of ChAruco boards instead of ArUco boards for camera
+calibration, since ChArUco corners are more accurate than marker corners. However, in some special cases
+it must be required to use calibration based on ArUco boards. As in the previous case, it requires
+the detections of an ArUco board from different viewpoints.
+
+![ArUco calibration viewpoints](images/arucocalibration.jpg)
+
+The example of using `cv::calibrateCamera()` for cv::aruco::GridBoard:
+
+@snippet samples/cpp/tutorial_code/objectDetection/calibrate_camera.cpp CalibrationWithArucoBoard1
+@snippet samples/cpp/tutorial_code/objectDetection/calibrate_camera.cpp CalibrationWithArucoBoard2
+@snippet samples/cpp/tutorial_code/objectDetection/calibrate_camera.cpp CalibrationWithArucoBoard3
+
+A full working example is included in the `calibrate_camera.cpp` inside the `samples/cpp/tutorial_code/objectDetection` folder.
+
+The samples now take input via commandline via the `cv::CommandLineParser`. For this file the example
+parameters will look like:
+@code{.cpp}
+    "camera_calib.txt" -w=5 -h=7 -l=100 -s=10 -d=10 -v=path/aruco_videos_or_images
+@endcode
diff --git a/doc/tutorials/objdetect/aruco_calibration/images/arucocalibration.jpg b/doc/tutorials/objdetect/aruco_calibration/images/arucocalibration.jpg
new file mode 100644
index 000000000000..9a86015049dd
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_calibration/images/arucocalibration.jpg differ
diff --git a/doc/tutorials/objdetect/aruco_calibration/images/charucocalibration.jpg b/doc/tutorials/objdetect/aruco_calibration/images/charucocalibration.jpg
new file mode 100644
index 000000000000..ed45382cd0fb
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_calibration/images/charucocalibration.jpg differ
diff --git a/doc/tutorials/objdetect/aruco_detection/aruco_detection.markdown b/doc/tutorials/objdetect/aruco_detection/aruco_detection.markdown
new file mode 100644
index 000000000000..d3ac7451607f
--- /dev/null
+++ b/doc/tutorials/objdetect/aruco_detection/aruco_detection.markdown
@@ -0,0 +1,702 @@
+Detection of ArUco Markers {#tutorial_aruco_detection}
+==========================
+
+@next_tutorial{tutorial_aruco_board_detection}
+
+|    |    |
+| -: | :- |
+| Original authors | Sergio Garrido, Alexander Panov |
+| Compatibility    | OpenCV >= 4.7.0 |
+
+Pose estimation is of great importance in many computer vision applications: robot navigation,
+augmented reality, and many more. This process is based on finding correspondences between points in
+the real environment and their 2d image projection. This is usually a difficult step, and thus it is
+common to use synthetic or fiducial markers to make it easier.
+
+One of the most popular approaches is the use of binary square fiducial markers. The main benefit
+of these markers is that a single marker provides enough correspondences (its four corners)
+to obtain the camera pose. Also, the inner binary codification makes them specially robust, allowing
+the possibility of applying error detection and correction techniques.
+
+The aruco module is based on the [ArUco library](http://www.uco.es/investiga/grupos/ava/node/26),
+a popular library for detection of square fiducial markers developed by Rafael Muñoz and Sergio Garrido @cite Aruco2014.
+
+The aruco functionalities are included in:
+@code{.cpp}
+#include <opencv2/objdetect/aruco_detector.hpp>
+@endcode
+
+
+Markers and Dictionaries
+------------------------
+
+An ArUco marker is a synthetic square marker composed by a wide black border and an inner binary
+matrix which determines its identifier (id). The black border facilitates its fast detection in the
+image and the binary codification allows its identification and the application of error detection
+and correction techniques. The marker size determines the size of the internal matrix. For instance
+a marker size of 4x4 is composed by 16 bits.
+
+Some examples of ArUco markers:
+
+![Example of markers images](images/markers.jpg)
+
+It must be noted that a marker can be found rotated in the environment, however, the detection
+process needs to be able to determine its original rotation, so that each corner is identified
+unequivocally. This is also done based on the binary codification.
+
+A dictionary of markers is the set of markers that are considered in a specific application. It is
+simply the list of binary codifications of each of its markers.
+
+The main properties of a dictionary are the dictionary size and the marker size.
+
+- The dictionary size is the number of markers that compose the dictionary.
+- The marker size is the size of those markers (the number of bits/modules).
+
+The aruco module includes some predefined dictionaries covering a range of different dictionary
+sizes and marker sizes.
+
+One may think that the marker id is the number obtained from converting the binary codification to
+a decimal base number. However, this is not possible since for high marker sizes the number of bits
+is too high and managing such huge numbers is not practical. Instead, a marker id is simply
+the marker index within the dictionary it belongs to. For instance, the first 5 markers in a
+dictionary have the ids: 0, 1, 2, 3 and 4.
+
+More information about dictionaries is provided in the "Selecting a dictionary" section.
+
+
+Marker Creation
+---------------
+
+Before their detection, markers need to be printed in order to be placed in the environment.
+Marker images can be generated using the `generateImageMarker()` function.
+
+For example, lets analyze the following call:
+
+@code{.cpp}
+cv::Mat markerImage;
+cv::aruco::Dictionary dictionary = cv::aruco::getPredefinedDictionary(cv::aruco::DICT_6X6_250);
+cv::aruco::generateImageMarker(dictionary, 23, 200, markerImage, 1);
+cv::imwrite("marker23.png", markerImage);
+@endcode
+
+First, the `cv::aruco::Dictionary` object is created by choosing one of the predefined dictionaries in the aruco module.
+Concretely, this dictionary is composed of 250 markers and a marker size of 6x6 bits (`cv::aruco::DICT_6X6_250`).
+
+The parameters of `cv::aruco::generateImageMarker()` are:
+
+- The first parameter is the `cv::aruco::Dictionary` object previously created.
+- The second parameter is the marker id, in this case the marker 23 of the dictionary `cv::aruco::DICT_6X6_250`.
+Note that each dictionary is composed of a different number of markers. In this case, the valid ids
+go from 0 to 249. Any specific id out of the valid range will produce an exception.
+- The third parameter, 200, is the size of the output marker image. In this case, the output image
+will have a size of 200x200 pixels. Note that this parameter should be large enough to store the
+number of bits for the specific dictionary. So, for instance, you cannot generate an image of
+5x5 pixels for a marker size of 6x6 bits (and that is without considering the marker border).
+Furthermore, to avoid deformations, this parameter should be proportional to the number of bits +
+border size, or at least much higher than the marker size (like 200 in the example), so that
+deformations are insignificant.
+- The fourth parameter is the output image.
+- Finally, the last parameter is an optional parameter to specify the width of the marker black
+border. The size is specified proportional to the number of bits. For instance a value of 2 means
+that the border will have a width equivalent to the size of two internal bits. The default value
+is 1.
+
+The generated image is:
+
+![Generated marker](images/marker23.png)
+
+A full working example is included in the `create_marker.cpp` inside the `samples/cpp/tutorial_code/objectDetection/`.
+
+The samples now take input from the command line using cv::CommandLineParser. For this file the example
+parameters will look like:
+@code{.cpp}
+"marker23.png" -d=10 -id=23
+@endcode
+Parameters for `create_marker.cpp`:
+@snippet samples/cpp/tutorial_code/objectDetection/create_marker.cpp aruco_create_markers_keys
+
+Marker Detection
+----------------
+
+Given an image containing ArUco markers, the detection process has to return a list of
+detected markers. Each detected marker includes:
+
+- The position of its four corners in the image (in their original order).
+- The id of the marker.
+
+The marker detection process is comprised of two main steps:
+
+1. Detection of marker candidates. In this step the image is analyzed in order to find square shapes
+that are candidates to be markers. It begins with an adaptive thresholding to segment the markers,
+then contours are extracted from the thresholded image and those that are not convex or do not
+approximate to a square shape are discarded. Some extra filtering is also applied (removing contours
+that are too small or too big, removing contours too close to each other, etc).
+
+2. After the candidate detection, it is necessary to determine if they are actually markers by
+analyzing their inner codification. This step starts by extracting the marker bits of each marker.
+To do so, a perspective transformation is first applied to obtain the marker in its canonical form.
+Then, the canonical image is thresholded using Otsu to separate white and black bits. The image
+is divided into different cells according to the marker size and the border size. Then the number
+of black or white pixels in each cell is counted to determine if it is a white or a black bit.
+Finally, the bits are analyzed to determine if the marker belongs to the specific dictionary.
+Error correction techniques are employed when necessary.
+
+
+Consider the following image:
+
+![Image with an assortment of markers](images/singlemarkerssource.jpg)
+
+And a printout of this image in a photo:
+
+![Original image with markers](images/singlemarkersoriginal.jpg)
+
+These are the detected markers (in green). Note that some markers are rotated. The small red square
+indicates the marker’s top left corner:
+
+![Image with detected markers](images/singlemarkersdetection.jpg)
+
+And these are the marker candidates that have been rejected during the identification step (in pink):
+
+![Image with rejected candidates](images/singlemarkersrejected.jpg)
+
+In the aruco module, the detection is performed in the `cv::aruco::ArucoDetector::detectMarkers()`
+function. This function is the most important in the module, since all the rest of the functionality
+is based on the detected markers returned by `cv::aruco::ArucoDetector::detectMarkers()`.
+
+An example of marker detection:
+
+@code{.cpp}
+cv::Mat inputImage;
+// ... read inputImage ...
+std::vector<int> markerIds;
+std::vector<std::vector<cv::Point2f>> markerCorners, rejectedCandidates;
+cv::aruco::DetectorParameters detectorParams = cv::aruco::DetectorParameters();
+cv::aruco::Dictionary dictionary = cv::aruco::getPredefinedDictionary(cv::aruco::DICT_6X6_250);
+cv::aruco::ArucoDetector detector(dictionary, detectorParams);
+detector.detectMarkers(inputImage, markerCorners, markerIds, rejectedCandidates);
+@endcode
+
+When you create an `cv::aruco::ArucoDetector` object, you need to pass the following parameters to the constructor:
+
+- A dictionary object, in this case one of the predefined dictionaries (`cv::aruco::DICT_6X6_250`).
+- Object of type `cv::aruco::DetectorParameters`. This object includes all parameters that can be customized during the detection process.
+These parameters will be explained in the next section.
+
+The parameters of `cv::aruco::ArucoDetector::detectMarkers()` are:
+
+- The first parameter is the image containing the markers to be detected.
+- The detected markers are stored in the `markerCorners` and `markerIds` structures:
+    - `markerCorners` is the list of corners of the detected markers. For each marker, its four
+    corners are returned in their original order (which is clockwise starting with top left).
+    So, the first corner is the top left corner, followed by the top right, bottom right and bottom left.
+    - `markerIds` is the list of ids of each of the detected markers in `markerCorners`.
+    Note that the returned `markerCorners` and `markerIds` vectors have the same size.
+- The final optional parameter, `rejectedCandidates`, is a returned list of marker candidates, i.e.
+shapes that were found and considered but did not contain a valid marker. Each candidate is also
+defined by its four corners, and its format is the same as the `markerCorners` parameter. This
+parameter can be omitted and is only useful for debugging purposes and for ‘refind’ strategies
+(see `cv::aruco::ArucoDetector::refineDetectedMarkers()`).
+
+
+The next thing you probably want to do after `cv::aruco::ArucoDetector::detectMarkers()` is check that
+your markers have been correctly detected. Fortunately, the aruco module provides a function to draw
+the detected markers in the input image, this function is `drawDetectedMarkers()`. For example:
+
+@code{.cpp}
+cv::Mat outputImage = inputImage.clone();
+cv::aruco::drawDetectedMarkers(outputImage, markerCorners, markerIds);
+@endcode
+
+- `outputImage ` is the input/output image where the markers will be drawn (it will normally be
+  the same as the image where the markers were detected).
+- `markerCorners` and `markerIds` are the structures of the detected markers returned by the
+  `cv::aruco::ArucoDetector::detectMarkers()` function.
+
+![Image with detected markers](images/singlemarkersdetection.jpg)
+
+Note that this function is only provided for visualization and its use can be omitted.
+
+With these two functions we can create a basic marker detection loop to detect markers from our
+camera:
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_markers.cpp aruco_detect_markers
+
+Note that some of the optional parameters have been omitted, like the detection parameter object and the
+output vector of rejected candidates.
+
+A full working example is included in the `detect_markers.cpp` inside the `samples/cpp/tutorial_code/objectDetection/`.
+
+The samples now take input from the command line using cv::CommandLineParser. For this file
+the example parameters will look like:
+@code{.cpp}
+-v=/path_to_opencv/opencv/doc/tutorials/objdetect/aruco_detection/images/singlemarkersoriginal.jpg -d=10
+@endcode
+Parameters for `detect_markers.cpp`:
+@snippet samples/cpp/tutorial_code/objectDetection/detect_markers.cpp aruco_detect_markers_keys
+
+
+Pose Estimation
+---------------
+
+The next thing you'll probably want to do after detecting the markers is to use them to get the camera pose.
+
+To perform camera pose estimation, you need to know your camera's calibration parameters. These are
+the camera matrix and distortion coefficients. If you do not know how to calibrate your camera,
+you can take a look at the `calibrateCamera()` function and the Calibration tutorial of OpenCV.
+You can also calibrate your camera using the aruco module as explained in the **Calibration with ArUco and ChArUco**
+tutorial. Note that this only needs to be done once unless the camera optics are modified
+(for instance changing its focus).
+
+As a result of the calibration, you get a camera matrix: a matrix of 3x3 elements with the
+focal distances and the camera center coordinates (a.k.a intrinsic parameters), and the distortion
+coefficients: a vector of 5 or more elements that models the distortion produced by your camera.
+
+When you estimate the pose with ArUco markers, you can estimate the pose of each marker individually.
+If you want to estimate one pose from a set of markers, use ArUco Boards (see the **Detection of ArUco
+Boards** tutorial). Using ArUco boards instead of single markers allows some markers to be occluded.
+
+The camera pose relative to the marker is a 3d transformation from the marker coordinate system to the
+camera coordinate system. It is specified by rotation and translation vectors. OpenCV provides
+`cv::solvePnP()` function to do that.
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_markers.cpp aruco_pose_estimation1
+@snippet samples/cpp/tutorial_code/objectDetection/detect_markers.cpp aruco_pose_estimation2
+@snippet samples/cpp/tutorial_code/objectDetection/detect_markers.cpp aruco_pose_estimation3
+
+- The `corners` parameter is the vector of marker corners returned by the `cv::aruco::ArucoDetector::detectMarkers()` function.
+- The second parameter is the size of the marker side in meters or in any other unit. Note that the
+  translation vectors of the estimated poses will be in the same units.
+- `camMatrix` and `distCoeffs` are the camera calibration parameters that were created during
+  the camera calibration process.
+- The output parameters `rvecs` and `tvecs` are the rotation and translation vectors respectively,
+  for each of the detected markers in `corners`.
+
+The marker coordinate system that is assumed by this function is placed in the center (by default) or
+in the top left corner of the marker with the Z axis pointing out, as in the following image.
+Axis-color correspondences are X: red, Y: green, Z: blue. Note the axis directions of the rotated
+markers in this image.
+
+![Image with axes drawn](images/singlemarkersaxes.jpg)
+
+OpenCV provides a function to draw the axis as in the image above, so pose estimation can be
+checked:
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_markers.cpp aruco_draw_pose_estimation
+
+- `imageCopy` is the input/output image where the detected markers will be shown.
+- `camMatrix` and `distCoeffs` are the camera calibration parameters.
+- `rvecs[i]` and `tvecs[i]` are the rotation and translation vectors respectively, for each of the detected markers.
+- The last parameter is the length of the axis, in the same unit as tvec (usually meters).
+
+Sample video:
+
+@youtube{IsXWrcB_Hvs}
+
+A full working example is included in the `detect_markers.cpp` inside the `samples/cpp/tutorial_code/objectDetection/`.
+
+The samples now take input from the command line using cv::CommandLineParser. For this file
+the example parameters will look like:
+@code{.cpp}
+-v=/path_to_opencv/opencv/doc/tutorials/objdetect/aruco_detection/images/singlemarkersoriginal.jpg -d=10
+-c=/path_to_opencv/opencv/samples/cpp/tutorial_code/objectDetection/tutorial_camera_params.yml
+@endcode
+Parameters for `detect_markers.cpp`:
+@snippet samples/cpp/tutorial_code/objectDetection/detect_markers.cpp aruco_detect_markers_keys
+
+Selecting a dictionary
+----------------------
+
+The aruco module provides the `Dictionary` class to represent a dictionary of markers.
+
+In addition to the marker size and the number of markers in the dictionary, there is another important
+parameter of the dictionary - the inter-marker distance. The inter-marker distance is the minimum
+Hamming distance between dictionary markers that determines the dictionary's ability to detect and
+correct errors.
+
+In general, smaller dictionary sizes and larger marker sizes increase the inter-marker distance and
+vice versa. However, the detection of markers with larger sizes is more difficult due to the higher
+number of bits that need to be extracted from the image.
+
+For instance, if you need only 10 markers in your application, it is better to use a dictionary composed
+only of those 10 markers than using a dictionary composed of 1000 markers. The reason is that
+the dictionary composed of 10 markers will have a higher inter-marker distance and, thus, it will be
+more robust to errors.
+
+As a consequence, the aruco module includes several ways to select your dictionary of markers, so that
+you can increase your system robustness:
+
+### Predefined dictionaries
+
+This is the easiest way to select a dictionary. The aruco module includes a set of predefined
+dictionaries in a variety of marker sizes and number of markers. For instance:
+
+@code{.cpp}
+cv::aruco::Dictionary dictionary = cv::aruco::getPredefinedDictionary(cv::aruco::DICT_6X6_250);
+@endcode
+
+`cv::aruco::DICT_6X6_250` is an example of predefined dictionary of markers with 6x6 bits and a total of 250
+markers.
+
+From all the provided dictionaries, it is recommended to choose the smallest one that fits your application.
+For instance, if you need 200 markers of 6x6 bits, it is better to use `cv::aruco::DICT_6X6_250` than `cv::aruco::DICT_6X6_1000`.
+The smaller the dictionary, the higher the inter-marker distance.
+
+The list of available predefined dictionaries can be found in the documentation for the `PredefinedDictionaryType` enum.
+
+### Automatic dictionary generation
+
+A dictionary can be generated automatically to adjust the desired number of markers and bits
+to optimize the inter-marker distance:
+
+@code{.cpp}
+cv::aruco::Dictionary dictionary = cv::aruco::extendDictionary(36, 5);
+@endcode
+
+This will generate a customized dictionary composed of 36 markers of 5x5 bits. The process can take several
+seconds, depending on the parameters (it is slower for larger dictionaries and higher numbers of bits).
+
+Also you could use `aruco_dict_utils.cpp` sample inside the `opencv/samples/cpp`. This sample calculates
+the minimum Hamming distance for the generated dictionary and also allows you to create markers that are
+resistant to reflection.
+
+### Manual dictionary definition
+
+Finally, the dictionary can be configured manually, so that any encoding can be used. To do that,
+the `cv::aruco::Dictionary` object parameters need to be assigned manually. It must be noted that,
+unless you have a special reason to do this manually, it is preferable to use one of the previous alternatives.
+
+The `cv::aruco::Dictionary` parameters are:
+
+@code{.cpp}
+    class Dictionary {
+    public:
+
+        cv::Mat bytesList;      // marker code information
+        int markerSize;         // number of bits per dimension
+        int maxCorrectionBits;  // maximum number of bits that can be corrected
+
+        ...
+
+    }
+@endcode
+
+`bytesList` is the array that contains all the information about the marker codes. `markerSize` is the size
+ of each marker dimension (for instance, 5 for markers with 5x5 bits). Finally, `maxCorrectionBits` is
+the maximum number of erroneous bits that can be corrected during the marker detection. If this value is too
+high, it can lead to a high number of false positives.
+
+Each row in `bytesList` represents one of the dictionary markers. However, the markers are not stored in their
+binary form, instead they are stored in a special format to simplify their detection.
+
+Fortunately, a marker can be easily transformed to this form using the static method `Dictionary::getByteListFromBits()`.
+
+For example:
+
+@code{.cpp}
+    cv::aruco::Dictionary dictionary;
+
+    // Markers of 6x6 bits
+    dictionary.markerSize = 6;
+
+    // Maximum number of bit corrections
+    dictionary.maxCorrectionBits = 3;
+
+    // Let's create a dictionary of 100 markers
+    for(int i = 0; i < 100; i++)
+    {
+        // Assume generateMarkerBits() generates a new marker in binary format, so that
+        // markerBits is a 6x6 matrix of CV_8UC1 type, only containing 0s and 1s
+        cv::Mat markerBits = generateMarkerBits();
+        cv::Mat markerCompressed = cv::aruco::Dictionary::getByteListFromBits(markerBits);
+
+        // Add the marker as a new row
+        dictionary.bytesList.push_back(markerCompressed);
+    }
+@endcode
+
+Detector Parameters
+-------------------
+
+One of the parameters of `cv::aruco::ArucoDetector` is a `cv::aruco::DetectorParameters` object. This object
+includes all the options that can be customized during the marker detection process.
+
+This section describes each detector parameter. The parameters can be classified depending on
+the process in which they’re involved:
+
+### Thresholding
+
+One of the first steps in the marker detection process is adaptive thresholding of the input image.
+
+For instance, the thresholded image for the sample image used above is:
+
+![Thresholded image](images/singlemarkersthresh.png)
+
+This thresholding can be customized with the following parameters:
+
+#### adaptiveThreshWinSizeMin, adaptiveThreshWinSizeMax, and adaptiveThreshWinSizeStep
+
+The `adaptiveThreshWinSizeMin` and `adaptiveThreshWinSizeMax` parameters represent the interval where the
+thresholding window sizes (in pixels) are selected for the adaptive thresholding (see OpenCV
+`threshold()` and `adaptiveThreshold()` functions for more details).
+
+The parameter `adaptiveThreshWinSizeStep` indicates the increments of the window size from
+`adaptiveThreshWinSizeMin` to `adaptiveThreshWinSizeMax`.
+
+For instance, for the values `adaptiveThreshWinSizeMin` = 5 and `adaptiveThreshWinSizeMax` = 21 and
+`adaptiveThreshWinSizeStep` = 4, there will be 5 thresholding steps with window sizes 5, 9, 13, 17 and 21.
+On each thresholding image, marker candidates will be extracted.
+
+Low values of window size can "break" the marker border if the marker size is too large, causing it to not be detected, as in the following image:
+
+![Broken marker image](images/singlemarkersbrokenthresh.png)
+
+On the other hand, too large values can produce the same effect if the markers are too small, and can also
+reduce the performance. Moreover the process will tend to global thresholding, resulting in a loss of adaptive benefits.
+
+The simplest case is using the same value for `adaptiveThreshWinSizeMin` and
+ `adaptiveThreshWinSizeMax`, which produces a single thresholding step. However, it is usually better to use a
+ range of values for the window size, although many thresholding steps can also reduce the performance considerably.
+
+@see cv::aruco::DetectorParameters::adaptiveThreshWinSizeMin, cv::aruco::DetectorParameters::adaptiveThreshWinSizeMax,
+cv::aruco::DetectorParameters::adaptiveThreshWinSizeStep
+
+#### adaptiveThreshConstant
+
+The `adaptiveThreshConstant` parameter represents the constant value added in the thresholding operation (see OpenCV
+`threshold()` and `adaptiveThreshold()` functions for more details). Its default value is a good option in most cases.
+
+@see cv::aruco::DetectorParameters::adaptiveThreshConstant
+
+
+### Contour filtering
+
+After thresholding, contours are detected. However, not all contours
+are considered as marker candidates. They are filtered out in different steps so that contours that are
+very unlikely to be markers are discarded. The parameters in this section customize
+this filtering process.
+
+It must be noted that in most cases it is a question of balance between detection capacity
+and performance. All the considered contours will be processed in the following stages, which usually have
+a higher computational cost. So, it is preferred to discard invalid candidates in this stage than in the later stages.
+
+On the other hand, if the filtering conditions are too strict, the real marker contours could be discarded and,
+hence, not detected.
+
+#### minMarkerPerimeterRate and maxMarkerPerimeterRate
+
+These parameters determine the minimum and maximum size of a marker, specifically the minimum and
+maximum marker perimeter. They are not specified in absolute pixel values, instead they are specified
+relative to the maximum dimension of the input image.
+
+For instance, a image with size 640x480 and a minimum relative marker perimeter of 0.05 will lead
+to a minimum marker perimeter of 640x0.05 = 32 pixels, since 640 is the maximum dimension of the
+image. The same applies for the `maxMarkerPerimeterRate` parameter.
+
+If the `minMarkerPerimeterRate` is too low, detection performance can be significantly reduced,
+as many more contours will be considered for future stages.
+This penalization is not so noticeable for the `maxMarkerPerimeterRate` parameter, since there are
+usually many more small contours than big contours.
+A `minMarkerPerimeterRate` value of 0 and a `maxMarkerPerimeterRate` value of 4 (or more) will be
+equivalent to consider all the contours in the image, however this is not recommended for
+performance reasons.
+
+@see cv::aruco::DetectorParameters::minMarkerPerimeterRate, cv::aruco::DetectorParameters::maxMarkerPerimeterRate
+
+#### polygonalApproxAccuracyRate
+
+A polygonal approximation is applied to each candidate and only those that approximate to a square
+shape are accepted. This value determines the maximum error that the polygonal approximation can
+produce (see `approxPolyDP()` function for more information).
+
+This parameter is relative to the candidate length (in pixels). So if the candidate has
+a perimeter of 100 pixels and the value of `polygonalApproxAccuracyRate` is 0.04, the maximum error
+would be 100x0.04=5.4 pixels.
+
+In most cases, the default value works fine, but higher error values could be necessary for highly
+distorted images.
+
+@see cv::aruco::DetectorParameters::polygonalApproxAccuracyRate
+
+#### minCornerDistanceRate
+
+Minimum distance between any pair of corners in the same marker. It is expressed relative to the marker
+perimeter. Minimum distance in pixels is Perimeter * minCornerDistanceRate.
+
+@see cv::aruco::DetectorParameters::minCornerDistanceRate
+
+#### minMarkerDistanceRate
+
+Minimum distance between any pair of corners from two different markers. It is expressed relative to
+the minimum marker perimeter of the two markers. If two candidates are too close, the smaller one is ignored.
+
+@see cv::aruco::DetectorParameters::minMarkerDistanceRate
+
+#### minDistanceToBorder
+
+Minimum distance to any of the marker corners to the image border (in pixels). Markers partially occluded
+by the image border can be correctly detected if the occlusion is small. However, if one of the corners
+is occluded, the returned corner is usually placed in a wrong position near the image border.
+
+If the position of marker corners is important, for instance if you want to do pose estimation, it is
+better to discard any markers whose corners are too close to the image border. Elsewhere, it is not necessary.
+
+@see cv::aruco::DetectorParameters::minDistanceToBorder
+
+### Bits Extraction
+
+After candidate detection, the bits of each candidate are analyzed in order to determine if they
+are markers or not.
+
+Before analyzing the binary code itself, the bits need to be extracted. To do this, perspective
+distortion is corrected and the resulting image is thresholded using Otsu threshold to separate
+black and white pixels.
+
+This is an example of the image obtained after removing the perspective distortion of a marker:
+
+![Perspective removing](images/removeperspective.jpg)
+
+Then, the image is divided into a grid with the same number of cells as the number of bits in the marker.
+In each cell, the number of black and white pixels are counted to determine the bit value assigned
+to the cell (from the majority value):
+
+![Marker cells](images/bitsextraction1.png)
+
+There are several parameters that can customize this process:
+
+#### markerBorderBits
+
+This parameter indicates the width of the marker border. It is relative to the size of each bit. So, a
+value of 2 indicates the border has the width of two internal bits.
+
+This parameter needs to coincide with the border size of the markers you are using. The border size
+can be configured in the marker drawing functions such as `generateImageMarker()`.
+
+@see cv::aruco::DetectorParameters::markerBorderBits
+
+#### minOtsuStdDev
+
+This value determines the minimum standard deviation of the pixel values to perform Otsu
+thresholding. If the deviation is low, it probably means that all the square is black (or white)
+and applying Otsu does not make sense. If this is the case, all the bits are set to 0 (or 1)
+depending on whether the mean value is higher or lower than 128.
+
+@see cv::aruco::DetectorParameters::minOtsuStdDev
+
+#### perspectiveRemovePixelPerCell
+
+This parameter determines the number of pixels (per cell) in the obtained image after correcting perspective
+distortion (including the border). This is the size of the red squares in the image above.
+
+For instance, let’s assume we are dealing with markers of 5x5 bits and border size of 1 bit
+(see `markerBorderBits`). Then, the total number of cells/bits per dimension is 5 + 2*1 = 7 (the border
+has to be counted twice). The total number of cells is 7x7.
+
+If the value of `perspectiveRemovePixelPerCell` is 10, then the size of the obtained image will be
+10*7 = 70 -> 70x70 pixels.
+
+A higher value of this parameter can improve the bits extraction process (up to some degree),
+however it can penalize the performance.
+
+@see cv::aruco::DetectorParameters::perspectiveRemovePixelPerCell
+
+#### perspectiveRemoveIgnoredMarginPerCell
+
+When extracting the bits of each cell, the numbers of black and white pixels are counted. In general, it is
+not recommended to consider all the cell pixels. Instead it is better to ignore some pixels in the
+margins of the cells.
+
+The reason for this is that, after removing the perspective distortion, the cells’ colors are, in general, not
+perfectly separated and white cells can invade some pixels of black cells (and vice versa). Thus, it is
+better to ignore some pixels just to avoid counting erroneous pixels.
+
+For instance, in the following image:
+
+![Marker cell margins](images/bitsextraction2.png)
+
+only the pixels inside the green squares are considered. It can be seen in the right image that
+the resulting pixels contain a lower amount of noise from neighbor cells.
+The `perspectiveRemoveIgnoredMarginPerCell` parameter indicates the difference between the red and
+the green squares.
+
+This parameter is relative to the total size of the cell. For instance if the cell size is 40 pixels and the
+value of this parameter is 0.1, a margin of 40*0.1=4 pixels is ignored in the cells. This means that the total
+number of pixels that would be analyzed in each cell would actually be 32x32, instead of 40x40.
+
+@see cv::aruco::DetectorParameters::perspectiveRemoveIgnoredMarginPerCell
+
+
+### Marker identification
+
+After the bits have been extracted, the next step is checking whether the extracted code belongs to the marker
+dictionary and, if necessary, error correction can be performed.
+
+#### maxErroneousBitsInBorderRate
+
+The bits of the marker border should be black. This parameter specifies the allowed number of erroneous
+bits in the border, i.e. the maximum number of white bits in the border. It is represented
+relative to the total number of bits in the marker.
+
+@see cv::aruco::DetectorParameters::maxErroneousBitsInBorderRate
+
+#### errorCorrectionRate
+
+Each marker dictionary has a theoretical maximum number of bits that can be corrected (`Dictionary.maxCorrectionBits`).
+However, this value can be modified by the `errorCorrectionRate` parameter.
+
+For instance, if the allowed number of bits that can be corrected (for the used dictionary) is 6 and the value of `errorCorrectionRate` is
+0.5, the real maximum number of bits that can be corrected is 6*0.5=3 bits.
+
+This value is useful to reduce the error correction capabilities in order to avoid false positives.
+
+@see cv::aruco::DetectorParameters::errorCorrectionRate
+
+
+### Corner Refinement
+
+After markers have been detected and identified, the last step is performing subpixel refinement
+of the corner positions (see OpenCV `cornerSubPix()` and `cv::aruco::CornerRefineMethod`).
+
+Note that this step is optional and it only makes sense if the positions of the marker corners have to
+be accurate, for instance for pose estimation. It is usually a time-consuming step and therefore is disabled by default.
+
+#### cornerRefinementMethod
+
+This parameter determines whether the corner subpixel process is performed or not and which method to use
+if it is being performed. It can be disabled if accurate corners are not necessary. Possible values are
+`CORNER_REFINE_NONE`, `CORNER_REFINE_SUBPIX`, `CORNER_REFINE_CONTOUR`, and `CORNER_REFINE_APRILTAG`.
+
+@see cv::aruco::DetectorParameters::cornerRefinementMethod
+
+#### cornerRefinementWinSize
+
+This parameter determines the maximum window size for the corner refinement process.
+
+High values can cause close corners of the image to be included in the window area, so that the corner
+of the marker moves to a different and incorrect location during the process. Also, it may affect performance.
+The window size may decrease if the ArUco marker is too small, check cv::aruco::DetectorParameters::relativeCornerRefinmentWinSize.
+The final window size is calculated as: min(cornerRefinementWinSize, averageArucoModuleSize*relativeCornerRefinmentWinSize),
+where averageArucoModuleSize is average module size of ArUco marker in pixels.
+
+@see cv::aruco::DetectorParameters::cornerRefinementWinSize
+
+#### relativeCornerRefinmentWinSize
+
+Dynamic window size for corner refinement relative to Aruco module size (default 0.3).
+
+The final window size is calculated as: min(cornerRefinementWinSize, averageArucoModuleSize*relativeCornerRefinmentWinSize),
+where averageArucoModuleSize is average module size of ArUco marker in pixels.
+In the case of markers located far from each other, it may be useful to increase the value of the parameter to 0.4-0.5.
+In the case of markers located close to each other, it may be useful to decrease the parameter value to 0.1-0.2.
+
+@see cv::aruco::DetectorParameters::relativeCornerRefinmentWinSize
+
+#### cornerRefinementMaxIterations and cornerRefinementMinAccuracy
+
+These two parameters determine the stop criteria of the subpixel refinement process. The
+`cornerRefinementMaxIterations` indicates the maximum number of iterations and
+`cornerRefinementMinAccuracy` the minimum error value before stopping the process.
+
+If the number of iterations is too high, it may affect the performance. On the other hand, if it is
+too low, it can result in poor subpixel refinement.
+
+@see cv::aruco::DetectorParameters::cornerRefinementMaxIterations, cv::aruco::DetectorParameters::cornerRefinementMinAccuracy
diff --git a/doc/tutorials/objdetect/aruco_detection/images/bitsextraction1.png b/doc/tutorials/objdetect/aruco_detection/images/bitsextraction1.png
new file mode 100644
index 000000000000..53c2d38c6510
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_detection/images/bitsextraction1.png differ
diff --git a/doc/tutorials/objdetect/aruco_detection/images/bitsextraction2.png b/doc/tutorials/objdetect/aruco_detection/images/bitsextraction2.png
new file mode 100644
index 000000000000..d3e8fb0507cb
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_detection/images/bitsextraction2.png differ
diff --git a/doc/tutorials/objdetect/aruco_detection/images/marker23.png b/doc/tutorials/objdetect/aruco_detection/images/marker23.png
new file mode 100644
index 000000000000..f82555576e4d
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_detection/images/marker23.png differ
diff --git a/doc/tutorials/objdetect/aruco_detection/images/markers.jpg b/doc/tutorials/objdetect/aruco_detection/images/markers.jpg
new file mode 100644
index 000000000000..aa213f536ee1
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_detection/images/markers.jpg differ
diff --git a/doc/tutorials/objdetect/aruco_detection/images/removeperspective.jpg b/doc/tutorials/objdetect/aruco_detection/images/removeperspective.jpg
new file mode 100644
index 000000000000..8eeeb7557864
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_detection/images/removeperspective.jpg differ
diff --git a/doc/tutorials/objdetect/aruco_detection/images/singlemarkersbrokenthresh.png b/doc/tutorials/objdetect/aruco_detection/images/singlemarkersbrokenthresh.png
new file mode 100644
index 000000000000..e60f98b3425d
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_detection/images/singlemarkersbrokenthresh.png differ
diff --git a/doc/tutorials/objdetect/aruco_detection/images/singlemarkersdetection.jpg b/doc/tutorials/objdetect/aruco_detection/images/singlemarkersdetection.jpg
new file mode 100644
index 000000000000..48995077d617
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_detection/images/singlemarkersdetection.jpg differ
diff --git a/doc/tutorials/objdetect/aruco_detection/images/singlemarkersoriginal.jpg b/doc/tutorials/objdetect/aruco_detection/images/singlemarkersoriginal.jpg
new file mode 100644
index 000000000000..a0c9c43be49c
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_detection/images/singlemarkersoriginal.jpg differ
diff --git a/doc/tutorials/objdetect/aruco_detection/images/singlemarkersrejected.jpg b/doc/tutorials/objdetect/aruco_detection/images/singlemarkersrejected.jpg
new file mode 100644
index 000000000000..2b51e3913c10
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_detection/images/singlemarkersrejected.jpg differ
diff --git a/doc/tutorials/objdetect/aruco_detection/images/singlemarkerssource.jpg b/doc/tutorials/objdetect/aruco_detection/images/singlemarkerssource.jpg
new file mode 100644
index 000000000000..a95b9e7dd995
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_detection/images/singlemarkerssource.jpg differ
diff --git a/doc/tutorials/objdetect/aruco_detection/images/singlemarkersthresh.png b/doc/tutorials/objdetect/aruco_detection/images/singlemarkersthresh.png
new file mode 100644
index 000000000000..94738c952eeb
Binary files /dev/null and b/doc/tutorials/objdetect/aruco_detection/images/singlemarkersthresh.png differ
diff --git a/doc/tutorials/objdetect/aruco_faq/aruco_faq.markdown b/doc/tutorials/objdetect/aruco_faq/aruco_faq.markdown
new file mode 100644
index 000000000000..0db99be83516
--- /dev/null
+++ b/doc/tutorials/objdetect/aruco_faq/aruco_faq.markdown
@@ -0,0 +1,190 @@
+Aruco module FAQ {#tutorial_aruco_faq}
+================
+
+@prev_tutorial{tutorial_aruco_calibration}
+
+This is a compilation of questions that can be useful for those that want to use the aruco module.
+
+- I only want to label some objects, what should I use?
+
+In this case, you only need single ArUco markers. You can place one or several markers with different
+ids in each of the object you want to identify.
+
+
+- Which algorithm is used for marker detection?
+
+The aruco module is based on the original ArUco library. A full description of the detection process
+can be found in:
+
+> S. Garrido-Jurado, R. Muñoz-Salinas, F. J. Madrid-Cuevas, and M. J. Marín-Jiménez. 2014.
+> "Automatic generation and detection of highly reliable fiducial markers under occlusion".
+> Pattern Recogn. 47, 6 (June 2014), 2280-2292. DOI=10.1016/j.patcog.2014.01.005
+
+
+- My markers are not being detected correctly, what can I do?
+
+There can be many factors that avoid the correct detection of markers. You probably need to adjust
+some of the parameters in the `cv::aruco::DetectorParameters` object. The first thing you can do is
+checking if your markers are returned as rejected candidates by the `cv::aruco::ArucoDetector::detectMarkers()`
+function. Depending on this, you should try to modify different parameters.
+
+If you are using a ArUco board, you can also try the `cv::aruco::ArucoDetector::refineDetectedMarkers()` function.
+If you are [using big markers](https://github.com/opencv/opencv_contrib/issues/2811) (400x400 pixels and more), try
+increasing `cv::aruco::DetectorParameters::adaptiveThreshWinSizeMax` value.
+Also avoid [narrow borders around the ArUco marker](https://github.com/opencv/opencv_contrib/issues/2492)
+(5% or less of the marker perimeter, adjusted by `cv::aruco::DetectorParameters::minMarkerDistanceRate`)
+around markers.
+
+
+- What are the benefits of ArUco boards? What are the drawbacks?
+
+Using a board of markers you can obtain the camera pose from a set of markers, instead of a single one.
+This way, the detection is able to handle occlusion of partial views of the Board, since only one
+marker is necessary to obtain the pose.
+
+Furthermore, as in most cases you are using more corners for pose estimation, it will be more
+accurate than using a single marker.
+
+The main drawback is that a Board is not as versatile as a single marker.
+
+
+
+- What are the benefits of ChArUco boards over ArUco boards? And the drawbacks?
+
+ChArUco boards combines chessboards with ArUco boards. Thanks to this, the corners provided by
+ChArUco boards are more accurate than those provided by ArUco Boards (or single markers).
+
+The main drawback is that ChArUco boards are not as versatile as ArUco board. For instance,
+a ChArUco board is a planar board with a specific marker layout while the ArUco boards can have
+any layout, even in 3d. Furthermore, the markers in the ChArUco board are usually smaller and
+more difficult to detect.
+
+
+- I do not need pose estimation, should I use ChArUco boards?
+
+No. The main goal of ChArUco boards is provide high accurate corners for pose estimation or camera
+calibration.
+
+
+- Should all the markers in an ArUco board be placed in the same plane?
+
+No, the marker corners in a ArUco board can be placed anywhere in its 3d coordinate system.
+
+
+- Should all the markers in an ChArUco board be placed in the same plane?
+
+Yes, all the markers in a ChArUco board need to be in the same plane and their layout is fixed by
+the chessboard shape.
+
+
+- What is the difference between a `cv::aruco::Board` object and a `cv::aruco::GridBoard` object?
+
+The `cv::aruco::GridBoard` class is a specific type of board that inherits from `cv::aruco::Board` class.
+A `cv::aruco::GridBoard` object is a board whose markers are placed in the same plane and in a grid layout.
+
+
+- What are Diamond markers?
+
+Diamond markers are very similar to a ChArUco board of 3x3 squares. However, contrary to ChArUco boards,
+the detection of diamonds is based on the relative position of the markers.
+They are useful when you want to provide a conceptual meaning to any (or all) of the markers in
+the diamond. An example is using one of the marker to provide the diamond scale.
+
+
+- Do I need to detect marker before board detection, ChArUco board detection or Diamond detection?
+
+Yes, the detection of single markers is a basic tool in the aruco module. It is done using the
+`cv::aruco::DetectorParameters::detectMarkers()` function. The rest of functionalities receives
+a list of detected markers from this function.
+
+
+- I want to calibrate my camera, can I use this module?
+
+Yes, the aruco module provides functionalities to calibrate the camera using both, ArUco boards and
+ChArUco boards.
+
+
+- Should I calibrate using a ChArUco board or an ArUco board?
+
+It is highly recommended the calibration using ChArUco board due to the high accuracy.
+
+
+- Should I use a predefined dictionary or generate my own dictionary?
+
+In general, it is easier to use one of the predefined dictionaries. However, if you need a bigger
+dictionary (in terms of number of markers or number of bits) you should generate your own dictionary.
+Dictionary generation is also useful if you want to maximize the inter-marker distance to achieve
+a better error correction during the identification step.
+
+- I am generating my own dictionary but it takes too long
+
+Dictionary generation should only be done once at the beginning of your application and it should take
+some seconds. If you are generating the dictionary on each iteration of your detection loop, you are
+doing it wrong.
+
+Furthermore, it is recommendable to save the dictionary to a file with `cv::aruco::Dictionary::writeDictionary()`
+and read it with `cv::aruco::Dictionary::readDictionary()` on every execution, so you don't need
+to generate it.
+
+
+- I would like to use some markers of the original ArUco library that I have already printed, can I use them?
+
+Yes, one of the predefined dictionary is `cv::aruco::DICT_ARUCO_ORIGINAL`, which detects the marker
+of the original ArUco library with the same identifiers.
+
+
+- Can I use the Board configuration file of the original ArUco library in this module?
+
+Not directly, you will need to adapt the information of the ArUco file to the aruco module Board format.
+
+
+- Can I use this module to detect the markers of other libraries based on binary fiducial markers?
+
+Probably yes, however you will need to port the dictionary of the original library to the aruco module format.
+
+
+- Do I need to store the Dictionary information in a file so I can use it in different executions?
+
+If you are using one of the predefined dictionaries, it is not necessary. Otherwise, it is recommendable
+that you save it to file.
+
+
+- Do I need to store the Board information in a file so I can use it in different executions?
+
+If you are using a `cv::aruco::GridBoard` or a `cv::aruco::CharucoBoard` you only need to store
+the board measurements that are provided to the `cv::aruco::GridBoard::GridBoard()` constructor or
+in or `cv::aruco::CharucoBoard` constructor. If you manually modify the marker ids of the boards,
+or if you use a different type of board, you should save your board object to file.
+
+- Does the aruco module provide functions to save the Dictionary or Board to file?
+
+You can use `cv::aruco::Dictionary::writeDictionary()` and `cv::aruco::Dictionary::readDictionary()`
+for `cv::aruco::Dictionary`. The data member of board classes are public and can be easily stored.
+
+
+- Alright, but how can I render a 3d model to create an augmented reality application?
+
+To do so, you will need to use an external rendering engine library, such as OpenGL. The aruco module
+only provides the functionality to obtain the camera pose, i.e. the rotation and traslation vectors,
+which is necessary to create the augmented reality effect. However, you will need to adapt the rotation
+and traslation vectors from the OpenCV format to the format accepted by your 3d rendering library.
+The original ArUco library contains examples of how to do it for OpenGL and Ogre3D.
+
+
+- I have use this module in my research work, how can I cite it?
+
+You can cite the original ArUco library:
+
+> S. Garrido-Jurado, R. Muñoz-Salinas, F. J. Madrid-Cuevas, and M. J. Marín-Jiménez. 2014.
+> "Automatic generation and detection of highly reliable fiducial markers under occlusion".
+> Pattern Recogn. 47, 6 (June 2014), 2280-2292. DOI=10.1016/j.patcog.2014.01.005
+
+- Pose estimation markers are not being detected correctly, what can I do?
+
+It is important to remark that the estimation of the pose using only 4 coplanar points is subject to ambiguity.
+In general, the ambiguity can be solved, if the camera is near to the marker.
+However, as the marker becomes small, the errors in the corner estimation grows and ambiguity comes
+as a problem. Try increasing the size of the marker you're using, and you can also try non-symmetrical
+(aruco_dict_utils.cpp) markers to avoid collisions. Use multiple markers (ArUco/ChArUco/Diamonds boards)
+and pose estimation with solvePnP() with the `cv::SOLVEPNP_IPPE_SQUARE` option.
+More in [this issue](https://github.com/opencv/opencv/issues/8813).
diff --git a/doc/tutorials/objdetect/charuco_detection/charuco_detection.markdown b/doc/tutorials/objdetect/charuco_detection/charuco_detection.markdown
new file mode 100644
index 000000000000..c1039376eac8
--- /dev/null
+++ b/doc/tutorials/objdetect/charuco_detection/charuco_detection.markdown
@@ -0,0 +1,265 @@
+Detection of ChArUco Boards {#tutorial_charuco_detection}
+===========================
+
+@prev_tutorial{tutorial_aruco_board_detection}
+@next_tutorial{tutorial_charuco_diamond_detection}
+
+ArUco markers and boards are very useful due to their fast detection and their versatility.
+However, one of the problems of ArUco markers is that the accuracy of their corner positions is not
+too high, even after applying subpixel refinement.
+
+On the contrary, the corners of chessboard patterns can be refined more accurately since each corner
+is surrounded by two black squares. However, finding a chessboard pattern is not as versatile as
+finding an ArUco board: it has to be completely visible and occlusions are not permitted.
+
+A ChArUco board tries to combine the benefits of these two approaches:
+
+![Charuco definition](images/charucodefinition.png)
+
+The ArUco part is used to interpolate the position of the chessboard corners, so that it has the
+versatility of marker boards, since it allows occlusions or partial views. Moreover, since the
+interpolated corners belong to a chessboard, they are very accurate in terms of subpixel accuracy.
+
+When high precision is necessary, such as in camera calibration, Charuco boards are a better option
+than standard ArUco boards.
+
+Goal
+----
+
+In this tutorial you will learn:
+
+- How to create a charuco board ?
+- How to detect the charuco corners without performing camera calibration ?
+- How to detect the charuco corners with camera calibration and pose estimation ?
+
+Source code
+-----------
+
+You can find this code in `samples/cpp/tutorial_code/objectDetection/detect_board_charuco.cpp`
+
+Here's a sample code of how to achieve all the stuff enumerated at the goal list.
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_board_charuco.cpp charuco_detect_board_full_sample
+
+ChArUco Board Creation
+----------------------
+
+The aruco module provides the `cv::aruco::CharucoBoard` class that represents a Charuco Board and
+which inherits from the `cv::aruco::Board` class.
+
+This class, as the rest of ChArUco functionalities, are defined in:
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_board_charuco.cpp charucohdr
+
+To define a `cv::aruco::CharucoBoard`, it is necessary:
+
+- Number of chessboard squares in X and Y directions.
+- Length of square side.
+- Length of marker side.
+- The dictionary of the markers.
+- Ids of all the markers.
+
+As for the `cv::aruco::GridBoard` objects, the aruco module provides to create `cv::aruco::CharucoBoard`
+easily. This object can be easily created from these parameters using the `cv::aruco::CharucoBoard`
+constructor:
+
+@snippet samples/cpp/tutorial_code/objectDetection/create_board_charuco.cpp create_charucoBoard
+
+- The first parameter is the number of squares in X and Y direction respectively.
+- The second and third parameters are the length of the squares and the markers respectively. They can
+  be provided in any unit, having in mind that the estimated pose for this board would be measured
+  in the same units (usually meters are used).
+- Finally, the dictionary of the markers is provided.
+
+The ids of each of the markers are assigned by default in ascending order and starting on 0, like in
+`cv::aruco::GridBoard` constructor. This can be easily customized by accessing to the ids vector
+through `board.ids`, like in the `cv::aruco::Board` parent class.
+
+Once we have our `cv::aruco::CharucoBoard` object, we can create an image to print it. There are
+two ways to do this:
+1. By using the script `doc/patter_tools/gen_pattern.py `, see @subpage tutorial_camera_calibration_pattern.
+2. By using the function `cv::aruco::CharucoBoard::generateImage()`.
+
+The function `cv::aruco::CharucoBoard::generateImage()` is provided in cv::aruco::CharucoBoard class
+and can be called by using the following code:
+@snippet samples/cpp/tutorial_code/objectDetection/create_board_charuco.cpp generate_charucoBoard
+
+- The first parameter is the size of the output image in pixels. If this is not proportional
+to the board dimensions, it will be centered on the image.
+- The second parameter is the output image with the charuco board.
+- The third parameter is the (optional) margin in pixels, so none of the markers are touching the
+  image border.
+- Finally, the size of the marker border, similarly to `cv::aruco::generateImageMarker()` function.
+  The default value is 1.
+
+The output image will be something like this:
+
+![](images/charucoboard.png)
+
+A full working example is included in the `create_board_charuco.cpp` inside the `samples/cpp/tutorial_code/objectDetection/`.
+
+The samples `create_board_charuco.cpp` now take input via commandline via the `cv::CommandLineParser`.
+For this file the example
+parameters will look like:
+@code{.cpp}
+    "_output_path_/chboard.png" -w=5 -h=7 -sl=100 -ml=60 -d=10
+@endcode
+
+
+ChArUco Board Detection
+-----------------------
+
+When you detect a ChArUco board, what you are actually detecting is each of the chessboard corners
+of the board.
+
+Each corner on a ChArUco board has a unique identifier (id) assigned. These ids go from 0 to the total
+number of corners in the board.
+The steps of charuco board detection can be broken down to the following steps:
+
+- **Taking input Image**
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_board_charuco.cpp inputImg
+
+The original image where the markers are to be detected. The image is necessary to perform subpixel
+refinement in the ChArUco corners.
+
+- **Reading the camera calibration Parameters(only for detection with camera calibration)**
+
+@snippet samples/cpp/tutorial_code/objectDetection/aruco_samples_utility.hpp camDistCoeffs
+
+The parameters of `readCameraParameters` are:
+- The first parameter is the path to the camera intrinsic matrix and distortion coefficients.
+- The second and third parameters are cameraMatrix and distCoeffs.
+
+This function takes these parameters as input and returns a boolean value of whether the camera
+calibration parameters are valid or not. For detection of charuco corners without calibration,
+this step is not required.
+
+- **Detecting the markers and interpolation of charuco corners from markers**
+
+The detection of the ChArUco corners is based on the previous detected markers.
+So that, first markers are detected, and then ChArUco corners are interpolated from markers.
+The method that detect the ChArUco corners is `cv::aruco::CharucoDetector::detectBoard()`.
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_board_charuco.cpp interpolateCornersCharuco
+
+The parameters of detectBoard are:
+- `image` - Input image.
+- `charucoCorners` - output list of image positions of the detected corners.
+- `charucoIds` - output ids for each of the detected corners in `charucoCorners`.
+- `markerCorners` - input/output vector of detected marker corners.
+- `markerIds` - input/output vector of identifiers of the detected markers
+
+If markerCorners and markerIds are empty, the function will detect aruco markers and ids.
+
+If calibration parameters are provided, the ChArUco corners are interpolated by, first, estimating
+a rough pose from the ArUco markers and, then, reprojecting the ChArUco corners back to the image.
+
+On the other hand, if calibration parameters are not provided, the ChArUco corners are interpolated
+by calculating the corresponding homography between the ChArUco plane and the ChArUco image projection.
+
+The main problem of using homography is that the interpolation is more sensible to image distortion.
+Actually, the homography is only performed using the closest markers of each ChArUco corner to reduce
+the effect of distortion.
+
+When detecting markers for ChArUco boards, and specially when using homography, it is recommended to
+disable the corner refinement of markers. The reason of this is that, due to the proximity of the
+chessboard squares, the subpixel process can produce important deviations in the corner positions and
+these deviations are propagated to the ChArUco corner interpolation, producing poor results.
+
+@note To avoid deviations, the margin between chessboard square and aruco marker should be greater
+than 70% of one marker module.
+
+Furthermore, only those corners whose two surrounding markers have be found are returned. If any of
+the two surrounding markers has not been detected, this usually means that there is some occlusion
+or the image quality is not good in that zone. In any case, it is preferable not to consider that
+corner, since what we want is to be sure that the interpolated ChArUco corners are very accurate.
+
+After the ChArUco corners have been interpolated, a subpixel refinement is performed.
+
+Once we have interpolated the ChArUco corners, we would probably want to draw them to see if their
+detections are correct. This can be easily done using the `cv::aruco::drawDetectedCornersCharuco()`
+function:
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_board_charuco.cpp drawDetectedCornersCharuco
+
+- `imageCopy` is the image where the corners will be drawn (it will normally be the same image where
+   the corners were detected).
+- The `outputImage` will be a clone of `inputImage` with the corners drawn.
+- `charucoCorners` and `charucoIds` are the detected Charuco corners from the `cv::aruco::CharucoDetector::detectBoard()`
+  function.
+- Finally, the last parameter is the (optional) color we want to draw the corners with, of type `cv::Scalar`.
+
+For this image:
+
+![Image with Charuco board](images/choriginal.jpg)
+
+The result will be:
+
+![Charuco board detected](images/chcorners.jpg)
+
+In the presence of occlusion. like in the following image, although some corners are clearly visible,
+not all their surrounding markers have been detected due occlusion and, thus, they are not interpolated:
+
+![Charuco detection with occlusion](images/chocclusion.jpg)
+
+Sample video:
+
+@youtube{Nj44m_N_9FY}
+
+A full working example is included in the `detect_board_charuco.cpp` inside the
+`samples/cpp/tutorial_code/objectDetection/`.
+
+The samples `detect_board_charuco.cpp` now take input via commandline via the `cv::CommandLineParser`.
+For this file the example parameters will look like:
+@code{.cpp}
+    -w=5 -h=7 -sl=0.04 -ml=0.02 -d=10 -v=/path_to_opencv/opencv/doc/tutorials/objdetect/charuco_detection/images/choriginal.jpg
+@endcode
+
+ChArUco Pose Estimation
+-----------------------
+
+The final goal of the ChArUco boards is finding corners very accurately for a high precision calibration
+or pose estimation.
+
+The aruco module provides a function to perform ChArUco pose estimation easily. As in the
+`cv::aruco::GridBoard`, the coordinate system of the `cv::aruco::CharucoBoard` is placed in
+the board plane with the Z axis pointing in, and centered in the bottom left corner of the board.
+
+@note After OpenCV 4.6.0, there was an incompatible change in the coordinate systems of the boards,
+now the coordinate systems are placed in the boards plane with the Z axis pointing in the plane
+(previously the axis pointed out the plane).
+`objPoints` in CW order correspond to the Z-axis pointing in the plane.
+`objPoints` in CCW order correspond to the Z-axis pointing out the plane.
+See PR https://github.com/opencv/opencv_contrib/pull/3174
+
+
+To perform pose estimation for charuco boards, you should use `cv::aruco::CharucoBoard::matchImagePoints()`
+and `cv::solvePnP()`:
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_board_charuco.cpp poseCharuco
+
+- The `charucoCorners` and `charucoIds` parameters are the detected charuco corners from the
+  `cv::aruco::CharucoDetector::detectBoard()` function.
+- The `cameraMatrix` and `distCoeffs` are the camera calibration parameters which are necessary
+  for pose estimation.
+- Finally, the `rvec` and `tvec` parameters are the output pose of the Charuco Board.
+- `cv::solvePnP()` returns true if the pose was correctly estimated and false otherwise.
+  The main reason of failing is that there are not enough corners for pose estimation or
+  they are in the same line.
+
+The axis can be drawn using `cv::drawFrameAxes()` to check the pose is correctly estimated.
+The result would be: (X:red, Y:green, Z:blue)
+
+![Charuco Board Axis](images/chaxis.jpg)
+
+A full working example is included in the `detect_board_charuco.cpp` inside the
+`samples/cpp/tutorial_code/objectDetection/`.
+
+The samples `detect_board_charuco.cpp` now take input via commandline via the `cv::CommandLineParser`.
+For this file the example parameters will look like:
+@code{.cpp}
+    -w=5 -h=7 -sl=0.04 -ml=0.02 -d=10
+    -v=/path_to_opencv/opencv/doc/tutorials/objdetect/charuco_detection/images/choriginal.jpg
+    -c=/path_to_opencv/opencv/samples/cpp/tutorial_code/objectDetection/tutorial_camera_charuco.yml
+@endcode
diff --git a/doc/tutorials/objdetect/charuco_detection/images/charucoboard.png b/doc/tutorials/objdetect/charuco_detection/images/charucoboard.png
new file mode 100644
index 000000000000..ee9fe3e0db07
Binary files /dev/null and b/doc/tutorials/objdetect/charuco_detection/images/charucoboard.png differ
diff --git a/doc/tutorials/objdetect/charuco_detection/images/charucodefinition.png b/doc/tutorials/objdetect/charuco_detection/images/charucodefinition.png
new file mode 100644
index 000000000000..44684f310c57
Binary files /dev/null and b/doc/tutorials/objdetect/charuco_detection/images/charucodefinition.png differ
diff --git a/doc/tutorials/objdetect/charuco_detection/images/chaxis.jpg b/doc/tutorials/objdetect/charuco_detection/images/chaxis.jpg
new file mode 100644
index 000000000000..a00ba6134c8a
Binary files /dev/null and b/doc/tutorials/objdetect/charuco_detection/images/chaxis.jpg differ
diff --git a/doc/tutorials/objdetect/charuco_detection/images/chcorners.jpg b/doc/tutorials/objdetect/charuco_detection/images/chcorners.jpg
new file mode 100644
index 000000000000..1eca446421e1
Binary files /dev/null and b/doc/tutorials/objdetect/charuco_detection/images/chcorners.jpg differ
diff --git a/doc/tutorials/objdetect/charuco_detection/images/chocclusion.jpg b/doc/tutorials/objdetect/charuco_detection/images/chocclusion.jpg
new file mode 100644
index 000000000000..e4860fc8324e
Binary files /dev/null and b/doc/tutorials/objdetect/charuco_detection/images/chocclusion.jpg differ
diff --git a/doc/tutorials/objdetect/charuco_detection/images/chocclusion_original.jpg b/doc/tutorials/objdetect/charuco_detection/images/chocclusion_original.jpg
new file mode 100644
index 000000000000..037a8eb12844
Binary files /dev/null and b/doc/tutorials/objdetect/charuco_detection/images/chocclusion_original.jpg differ
diff --git a/doc/tutorials/objdetect/charuco_detection/images/choriginal.jpg b/doc/tutorials/objdetect/charuco_detection/images/choriginal.jpg
new file mode 100644
index 000000000000..3ca7c3149ffe
Binary files /dev/null and b/doc/tutorials/objdetect/charuco_detection/images/choriginal.jpg differ
diff --git a/doc/tutorials/objdetect/charuco_diamond_detection/charuco_diamond_detection.markdown b/doc/tutorials/objdetect/charuco_diamond_detection/charuco_diamond_detection.markdown
new file mode 100644
index 000000000000..04ae79ded0ad
--- /dev/null
+++ b/doc/tutorials/objdetect/charuco_diamond_detection/charuco_diamond_detection.markdown
@@ -0,0 +1,143 @@
+Detection of Diamond Markers {#tutorial_charuco_diamond_detection}
+==============================
+
+@prev_tutorial{tutorial_charuco_detection}
+@next_tutorial{tutorial_aruco_calibration}
+
+A ChArUco diamond marker (or simply diamond marker) is a chessboard composed by 3x3 squares and 4 ArUco markers inside the white squares.
+It is similar to a ChArUco board in appearance, however they are conceptually different.
+
+![Diamond marker examples](images/diamondmarkers.jpg)
+
+In both, ChArUco board and Diamond markers, their detection is based on the previous detected ArUco
+markers. In the ChArUco case, the used markers are selected by directly looking their identifiers. This means
+that if a marker (included in the board) is found on a image, it will be automatically assumed to belong to the board. Furthermore,
+if a marker board is found more than once in the image, it will produce an ambiguity since the system wont
+be able to know which one should be used for the Board.
+
+On the other hand, the detection of Diamond marker is not based on the identifiers. Instead, their detection
+is based on the relative position of the markers. As a consequence, marker identifiers can be repeated in the
+same diamond or among different diamonds, and they can be detected simultaneously without ambiguity. However,
+due to the complexity of finding marker based on their relative position, the diamond markers are limited to
+a size of 3x3 squares and 4 markers.
+
+As in a single ArUco marker, each Diamond marker is composed by 4 corners and a identifier. The four corners
+correspond to the 4 chessboard corners in the marker and the identifier is actually an array of 4 numbers, which are
+the identifiers of the four ArUco markers inside the diamond.
+
+Diamond markers are useful in those scenarios where repeated markers should be allowed. For instance:
+
+- To increase the number of identifiers of single markers by using diamond marker for labeling. They would allow
+up to N^4 different ids, being N the number of markers in the used dictionary.
+
+- Give to each of the four markers a conceptual meaning. For instance, one of the four marker ids could be
+used to indicate the scale of the marker (i.e. the size of the square), so that the same diamond can be found
+in the environment with different sizes just by changing one of the four markers and the user does not need
+to manually indicate the scale of each of them. This case is included in the `detect_diamonds.cpp` file inside
+the samples folder of the module.
+
+Furthermore, as its corners are chessboard corners, they can be used for accurate pose estimation.
+
+The diamond functionalities are included in `<opencv2/objdetect/charuco_detector.hpp>`
+
+
+ChArUco Diamond Creation
+------
+
+The image of a diamond marker can be easily created using the `cv::aruco::CharucoBoard::generateImage()` function.
+For instance:
+
+@snippet samples/cpp/tutorial_code/objectDetection/create_diamond.cpp generate_diamond
+
+This will create a diamond marker image with a square size of 200 pixels and a marker size of 120 pixels.
+The marker ids are given in the second parameter as a `cv::Vec4i` object. The order of the marker ids
+in the diamond layout are the same as in a standard ChArUco board, i.e. top, left, right and bottom.
+
+The image produced will be:
+
+![Diamond marker](images/diamondmarker.png)
+
+A full working example is included in the `create_diamond.cpp` inside the `samples/cpp/tutorial_code/objectDetection/`.
+
+The samples `create_diamond.cpp` now take input via commandline via the `cv::CommandLineParser`. For this file the example
+parameters will look like:
+@code{.cpp}
+    "_path_/mydiamond.png" -sl=200 -ml=120 -d=10 -ids=0,1,2,3
+@endcode
+
+ChArUco Diamond Detection
+------
+
+As in most cases, the detection of diamond markers requires a previous detection of ArUco markers.
+After detecting markers, diamond are detected using the `cv::aruco::CharucoDetector::detectDiamonds()` function:
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_diamonds.cpp detect_diamonds
+
+The `cv::aruco::CharucoDetector::detectDiamonds()` function receives the original image and the previous detected marker corners and ids.
+If markerCorners and markerIds are empty, the function will detect aruco markers and ids.
+The input image is necessary to perform subpixel refinement in the ChArUco corners.
+It also receives the rate between the square size and the marker sizes which is required for both, detecting the diamond
+from the relative positions of the markers and interpolating the ChArUco corners.
+
+The function returns the detected diamonds in two parameters. The first parameter, `diamondCorners`, is an array containing
+all the four corners of each detected diamond. Its format is similar to the detected corners by the `cv::aruco::ArucoDetector::detectMarkers()`
+function and, for each diamond, the corners are represented in the same order than in the ArUco markers, i.e. clockwise order
+starting with the top-left corner. The second returned parameter, `diamondIds`, contains all the ids of the returned
+diamond corners in `diamondCorners`. Each id is actually an array of 4 integers that can be represented with `cv::Vec4i`.
+
+The detected diamond can be visualized using the function `cv::aruco::drawDetectedDiamonds()` which simply receives the image and the diamond
+corners and ids:
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_diamonds.cpp draw_diamonds
+
+The result is the same that the one produced by `cv::aruco::drawDetectedMarkers()`, but printing the four ids of the diamond:
+
+![Detected diamond markers](images/detecteddiamonds.jpg)
+
+A full working example is included in the `detect_diamonds.cpp` inside the `samples/cpp/tutorial_code/objectDetection/`.
+
+The samples `detect_diamonds.cpp` now take input via commandline via the `cv::CommandLineParser`. For this file the example
+parameters will look like:
+@code{.cpp}
+    -dp=path_to_opencv/opencv/samples/cpp/tutorial_code/objectDetection/detector_params.yml -sl=0.4 -ml=0.25 -refine=3
+    -v=path_to_opencv/opencv/doc/tutorials/objdetect/charuco_diamond_detection/images/diamondmarkers.jpg
+    -cd=path_to_opencv/opencv/samples/cpp/tutorial_code/objectDetection/tutorial_dict.yml
+@endcode
+
+ChArUco Diamond Pose Estimation
+------
+
+Since a ChArUco diamond is represented by its four corners, its pose can be estimated in the same way than in a single ArUco marker,
+i.e. using the `cv::solvePnP()` function. For instance:
+
+@snippet samples/cpp/tutorial_code/objectDetection/detect_diamonds.cpp diamond_pose_estimation
+@snippet samples/cpp/tutorial_code/objectDetection/detect_diamonds.cpp draw_diamond_pose_estimation
+
+The function will obtain the rotation and translation vector for each of the diamond marker and store them
+in `rvecs` and `tvecs`. Note that the diamond corners are a chessboard square corners and thus, the square length
+has to be provided for pose estimation, and not the marker length. Camera calibration parameters are also required.
+
+Finally, an axis can be drawn to check the estimated pose is correct using `drawFrameAxes()`:
+
+![Detected diamond axis](images/diamondsaxis.jpg)
+
+The coordinate system of the diamond pose will be in the center of the marker with the Z axis pointing out,
+as in a simple ArUco marker pose estimation.
+
+Sample video:
+
+@youtube{OqKpBnglH7k}
+
+Also ChArUco diamond pose can be estimated as ChArUco board:
+@snippet samples/cpp/tutorial_code/objectDetection/detect_diamonds.cpp diamond_pose_estimation_as_charuco
+
+A full working example is included in the `detect_diamonds.cpp` inside the `samples/cpp/tutorial_code/objectDetection/`.
+
+The samples `detect_diamonds.cpp` now take input via commandline via the `cv::CommandLineParser`. For this file the example
+parameters will look like:
+@code{.cpp}
+    -dp=path_to_opencv/opencv/samples/cpp/tutorial_code/objectDetection/detector_params.yml -sl=0.4 -ml=0.25 -refine=3
+    -v=path_to_opencv/opencv/doc/tutorials/objdetect/charuco_diamond_detection/images/diamondmarkers.jpg
+    -cd=path_to_opencv/opencv/samples/cpp/tutorial_code/objectDetection/tutorial_dict.yml
+    -c=path_to_opencv/opencv/samples/cpp/tutorial_code/objectDetection/tutorial_camera_params.yml
+@endcode
diff --git a/doc/tutorials/objdetect/charuco_diamond_detection/images/detecteddiamonds.jpg b/doc/tutorials/objdetect/charuco_diamond_detection/images/detecteddiamonds.jpg
new file mode 100644
index 000000000000..32b6eb0b9207
Binary files /dev/null and b/doc/tutorials/objdetect/charuco_diamond_detection/images/detecteddiamonds.jpg differ
diff --git a/doc/tutorials/objdetect/charuco_diamond_detection/images/diamondmarker.png b/doc/tutorials/objdetect/charuco_diamond_detection/images/diamondmarker.png
new file mode 100644
index 000000000000..6a490806fce5
Binary files /dev/null and b/doc/tutorials/objdetect/charuco_diamond_detection/images/diamondmarker.png differ
diff --git a/doc/tutorials/objdetect/charuco_diamond_detection/images/diamondmarkers.jpg b/doc/tutorials/objdetect/charuco_diamond_detection/images/diamondmarkers.jpg
new file mode 100644
index 000000000000..7d270ad6480d
Binary files /dev/null and b/doc/tutorials/objdetect/charuco_diamond_detection/images/diamondmarkers.jpg differ
diff --git a/doc/tutorials/objdetect/charuco_diamond_detection/images/diamondsaxis.jpg b/doc/tutorials/objdetect/charuco_diamond_detection/images/diamondsaxis.jpg
new file mode 100644
index 000000000000..20e144036dfe
Binary files /dev/null and b/doc/tutorials/objdetect/charuco_diamond_detection/images/diamondsaxis.jpg differ
diff --git a/doc/tutorials/objdetect/table_of_content_objdetect.markdown b/doc/tutorials/objdetect/table_of_content_objdetect.markdown
new file mode 100644
index 000000000000..3eade74105cc
--- /dev/null
+++ b/doc/tutorials/objdetect/table_of_content_objdetect.markdown
@@ -0,0 +1,9 @@
+Object Detection (objdetect module) {#tutorial_table_of_content_objdetect}
+==========================================================
+
+-   @subpage tutorial_aruco_detection
+-   @subpage tutorial_aruco_board_detection
+-   @subpage tutorial_charuco_detection
+-   @subpage tutorial_charuco_diamond_detection
+-   @subpage tutorial_aruco_calibration
+-   @subpage tutorial_aruco_faq
diff --git a/doc/tutorials/others/_old/table_of_content_objdetect.markdown b/doc/tutorials/others/_old/table_of_content_objdetect.markdown
deleted file mode 100644
index 0aa69fcd8d73..000000000000
--- a/doc/tutorials/others/_old/table_of_content_objdetect.markdown
+++ /dev/null
@@ -1,4 +0,0 @@
-Object Detection (objdetect module) {#tutorial_table_of_content_objdetect}
-===================================
-
-Content has been moved to this page: @ref tutorial_table_of_content_other
diff --git a/doc/tutorials/tutorials.markdown b/doc/tutorials/tutorials.markdown
index 59aefc2b1f5c..cd83c241af2c 100644
--- a/doc/tutorials/tutorials.markdown
+++ b/doc/tutorials/tutorials.markdown
@@ -6,6 +6,7 @@ OpenCV Tutorials {#tutorial_root}
 - @subpage tutorial_table_of_content_imgproc - image processing functions
 - @subpage tutorial_table_of_content_app - application utils (GUI, image/video input/output)
 - @subpage tutorial_table_of_content_calib3d - extract 3D world information from 2D images
+- @subpage tutorial_table_of_content_objdetect - INSERT OBJDETECT MODULE INFO
 - @subpage tutorial_table_of_content_features2d - feature detectors, descriptors and matching framework
 - @subpage tutorial_table_of_content_dnn - infer neural networks using built-in _dnn_ module
 - @subpage tutorial_table_of_content_gapi - graph-based approach to computer vision algorithms building
diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index ec4c275597ad..0280e05e2184 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -48,6 +48,7 @@
 #include "opencv2/core/types.hpp"
 #include "opencv2/features2d.hpp"
 #include "opencv2/core/affine.hpp"
+#include "opencv2/core/utils/logger.hpp"
 
 /**
   @defgroup calib3d Camera Calibration and 3D Reconstruction
@@ -433,8 +434,6 @@ R & t \\
     Summary:
     Generic camera model @cite Kannala2006 with perspective projection and without distortion correction
 
-    @defgroup calib3d_c C API
-
   @}
  */
 
@@ -490,7 +489,8 @@ enum { CALIB_CB_ADAPTIVE_THRESH = 1,
        CALIB_CB_EXHAUSTIVE      = 16,
        CALIB_CB_ACCURACY        = 32,
        CALIB_CB_LARGER          = 64,
-       CALIB_CB_MARKER          = 128
+       CALIB_CB_MARKER          = 128,
+       CALIB_CB_PLAIN           = 256
      };
 
 enum { CALIB_CB_SYMMETRIC_GRID  = 1,
@@ -727,8 +727,8 @@ correctly only when there are more than 50% of inliers. Finally, if there are no
 noise is rather small, use the default method (method=0).
 
 The function is used to find initial intrinsic and extrinsic matrices. Homography matrix is
-determined up to a scale. Thus, it is normalized so that \f$h_{33}=1\f$. Note that whenever an \f$H\f$ matrix
-cannot be estimated, an empty one will be returned.
+determined up to a scale. If \f$h_{33}\f$ is non-zero, the matrix is normalized so that \f$h_{33}=1\f$.
+@note Whenever an \f$H\f$ matrix cannot be estimated, an empty one will be returned.
 
 @sa
 getAffineTransform, estimateAffine2D, estimateAffinePartial2D, getPerspectiveTransform, warpPerspective,
@@ -763,7 +763,7 @@ and a rotation matrix.
 It optionally returns three rotation matrices, one for each axis, and the three Euler angles in
 degrees (as the return value) that could be used in OpenGL. Note, there is always more than one
 sequence of rotations about the three principal axes that results in the same orientation of an
-object, e.g. see @cite Slabaugh . Returned tree rotation matrices and corresponding three Euler angles
+object, e.g. see @cite Slabaugh . Returned three rotation matrices and corresponding three Euler angles
 are only one of the possible solutions.
  */
 CV_EXPORTS_W Vec3d RQDecomp3x3( InputArray src, OutputArray mtxR, OutputArray mtxQ,
@@ -789,7 +789,7 @@ matrix and the position of a camera.
 It optionally returns three rotation matrices, one for each axis, and three Euler angles that could
 be used in OpenGL. Note, there is always more than one sequence of rotations about the three
 principal axes that results in the same orientation of an object, e.g. see @cite Slabaugh . Returned
-tree rotation matrices and corresponding three Euler angles are only one of the possible solutions.
+three rotation matrices and corresponding three Euler angles are only one of the possible solutions.
 
 The function is based on #RQDecomp3x3 .
  */
@@ -1235,6 +1235,10 @@ square-like shape) to filter out false quads extracted at the contour retrieval
 -   @ref CALIB_CB_FAST_CHECK Run a fast check on the image that looks for chessboard corners,
 and shortcut the call if none is found. This can drastically speed up the call in the
 degenerate condition when no chessboard is observed.
+-   @ref CALIB_CB_PLAIN All other flags are ignored. The input image is taken as is.
+No image processing is done to improve to find the checkerboard. This has the effect of speeding up the
+execution of the function but could lead to not recognizing the checkerboard if the image
+is not previously binarized in the appropriate manner.
 
 The function attempts to determine whether the input image is a view of the chessboard pattern and
 locate the internal chessboard corners. The function returns a non-zero value if all of the corners
@@ -1595,6 +1599,10 @@ The algorithm performs the following steps:
     \f$f_y\f$ (ratios of 10:1 or more)), then you are probably using patternSize=cvSize(rows,cols)
     instead of using patternSize=cvSize(cols,rows) in @ref findChessboardCorners.
 
+@note
+    The function may throw exceptions, if unsupported combination of parameters is provided or
+    the system is underconstrained.
+
 @sa
    calibrateCameraRO, findChessboardCorners, solvePnP, initCameraMatrix2D, stereoCalibrate,
    undistort
@@ -2485,13 +2493,13 @@ CV_EXPORTS_W Mat findFundamentalMat( InputArray points1, InputArray points2,
 
 @param points1 Array of N (N \>= 5) 2D points from the first image. The point coordinates should
 be floating-point (single or double precision).
-@param points2 Array of the second image points of the same size and format as points1 .
+@param points2 Array of the second image points of the same size and format as points1.
 @param cameraMatrix Camera intrinsic matrix \f$\cameramatrix{A}\f$ .
 Note that this function assumes that points1 and points2 are feature points from cameras with the
-same camera intrinsic matrix. If this assumption does not hold for your use case, use
-#undistortPoints with `P = cv::NoArray()` for both cameras to transform image points
-to normalized image coordinates, which are valid for the identity camera intrinsic matrix. When
-passing these coordinates, pass the identity matrix for this parameter.
+same camera intrinsic matrix. If this assumption does not hold for your use case, use another
+function overload or #undistortPoints with `P = cv::NoArray()` for both cameras to transform image
+points to normalized image coordinates, which are valid for the identity camera intrinsic matrix.
+When passing these coordinates, pass the identity matrix for this parameter.
 @param method Method for computing an essential matrix.
 -   @ref RANSAC for the RANSAC algorithm.
 -   @ref LMEDS for the LMedS algorithm.
@@ -2583,23 +2591,13 @@ Mat findEssentialMat(
 
 @param points1 Array of N (N \>= 5) 2D points from the first image. The point coordinates should
 be floating-point (single or double precision).
-@param points2 Array of the second image points of the same size and format as points1 .
-@param cameraMatrix1 Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
-Note that this function assumes that points1 and points2 are feature points from cameras with the
-same camera matrix. If this assumption does not hold for your use case, use
-#undistortPoints with `P = cv::NoArray()` for both cameras to transform image points
-to normalized image coordinates, which are valid for the identity camera matrix. When
-passing these coordinates, pass the identity matrix for this parameter.
-@param cameraMatrix2 Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
-Note that this function assumes that points1 and points2 are feature points from cameras with the
-same camera matrix. If this assumption does not hold for your use case, use
-#undistortPoints with `P = cv::NoArray()` for both cameras to transform image points
-to normalized image coordinates, which are valid for the identity camera matrix. When
-passing these coordinates, pass the identity matrix for this parameter.
-@param distCoeffs1 Input vector of distortion coefficients
+@param points2 Array of the second image points of the same size and format as points1.
+@param cameraMatrix1 Camera matrix for the first camera \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
+@param cameraMatrix2 Camera matrix for the second camera \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ .
+@param distCoeffs1 Input vector of distortion coefficients for the first camera
 \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$
 of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are assumed.
-@param distCoeffs2 Input vector of distortion coefficients
+@param distCoeffs2 Input vector of distortion coefficients for the second camera
 \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$
 of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are assumed.
 @param method Method for computing an essential matrix.
@@ -2974,7 +2972,7 @@ W
 x \\
 y \\
 \texttt{disparity} (x,y) \\
-z
+1
 \end{bmatrix}.\f]
 
 @sa
@@ -4046,6 +4044,45 @@ optimization. It is the \f$max(width,height)/\pi\f$ or the provided \f$f_x\f$, \
                                   OutputArray R, OutputArray T, int flags = fisheye::CALIB_FIX_INTRINSIC,
                                   TermCriteria criteria = TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, 100, DBL_EPSILON));
 
+    /**
+    @brief Finds an object pose from 3D-2D point correspondences for fisheye camera moodel.
+
+    @param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or
+    1xN/Nx1 3-channel, where N is the number of points. vector\<Point3d\> can be also passed here.
+    @param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel,
+    where N is the number of points. vector\<Point2d\> can be also passed here.
+    @param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ .
+    @param distCoeffs Input vector of distortion coefficients (4x1/1x4).
+    @param rvec Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from
+    the model coordinate system to the camera coordinate system.
+    @param tvec Output translation vector.
+    @param useExtrinsicGuess Parameter used for #SOLVEPNP_ITERATIVE. If true (1), the function uses
+    the provided rvec and tvec values as initial approximations of the rotation and translation
+    vectors, respectively, and further optimizes them.
+    @param flags Method for solving a PnP problem: see @ref calib3d_solvePnP_flags
+    This function returns the rotation and the translation vectors that transform a 3D point expressed in the object
+    coordinate frame to the camera coordinate frame, using different methods:
+    - P3P methods (@ref SOLVEPNP_P3P, @ref SOLVEPNP_AP3P): need 4 input points to return a unique solution.
+    - @ref SOLVEPNP_IPPE Input points must be >= 4 and object points must be coplanar.
+    - @ref SOLVEPNP_IPPE_SQUARE Special case suitable for marker pose estimation.
+    Number of input points must be 4. Object points must be defined in the following order:
+    - point 0: [-squareLength / 2,  squareLength / 2, 0]
+    - point 1: [ squareLength / 2,  squareLength / 2, 0]
+    - point 2: [ squareLength / 2, -squareLength / 2, 0]
+    - point 3: [-squareLength / 2, -squareLength / 2, 0]
+    - for all the other flags, number of input points must be >= 4 and object points can be in any configuration.
+    @param criteria Termination criteria for internal undistortPoints call.
+    The function interally undistorts points with @ref undistortPoints and call @ref cv::solvePnP,
+    thus the input are very similar. Check there and Perspective-n-Points is described in @ref calib3d_solvePnP
+    for more information.
+    */
+    CV_EXPORTS_W bool solvePnP( InputArray objectPoints, InputArray imagePoints,
+                                InputArray cameraMatrix, InputArray distCoeffs,
+                                OutputArray rvec, OutputArray tvec,
+                                bool useExtrinsicGuess = false, int flags = SOLVEPNP_ITERATIVE,
+                                TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 10, 1e-8)
+                              );
+
 //! @} calib3d_fisheye
 } // end namespace fisheye
 
diff --git a/modules/calib3d/misc/objc/gen_dict.json b/modules/calib3d/misc/objc/gen_dict.json
index bace946846ac..655ca2426d15 100644
--- a/modules/calib3d/misc/objc/gen_dict.json
+++ b/modules/calib3d/misc/objc/gen_dict.json
@@ -1,4 +1,7 @@
 {
+    "namespaces_dict": {
+        "cv.fisheye": "fisheye"
+    },
     "func_arg_fix" : {
         "Calib3d" : {
             "findCirclesGrid" : { "blobDetector" : {"defval" : "cv::SimpleBlobDetector::create()"} }
diff --git a/modules/calib3d/perf/perf_stereosgbm.cpp b/modules/calib3d/perf/perf_stereosgbm.cpp
index 8ae477748ab5..0ae37355652f 100644
--- a/modules/calib3d/perf/perf_stereosgbm.cpp
+++ b/modules/calib3d/perf/perf_stereosgbm.cpp
@@ -43,7 +43,7 @@ using namespace testing;
 
 static void MakeArtificialExample(Mat& dst_left_view, Mat& dst_view);
 
-CV_ENUM(SGBMModes, StereoSGBM::MODE_SGBM, StereoSGBM::MODE_SGBM_3WAY, StereoSGBM::MODE_HH4);
+CV_ENUM(SGBMModes, StereoSGBM::MODE_SGBM, StereoSGBM::MODE_SGBM_3WAY, StereoSGBM::MODE_HH4)
 typedef tuple<Size, int, SGBMModes> SGBMParams;
 typedef TestBaseWithParam<SGBMParams> TestStereoCorrespSGBM;
 
diff --git a/modules/calib3d/src/ap3p.cpp b/modules/calib3d/src/ap3p.cpp
index 582b201b36a1..79da0f13a75d 100644
--- a/modules/calib3d/src/ap3p.cpp
+++ b/modules/calib3d/src/ap3p.cpp
@@ -1,5 +1,6 @@
 #include "precomp.hpp"
 #include "ap3p.h"
+#include "polynom_solver.h"
 
 #include <cmath>
 #include <complex>
@@ -8,63 +9,10 @@ static inline double cbrt(double x) { return (double)cv::cubeRoot((float)x); };
 #endif
 
 namespace {
-void solveQuartic(const double *factors, double *realRoots) {
-    const double &a4 = factors[0];
-    const double &a3 = factors[1];
-    const double &a2 = factors[2];
-    const double &a1 = factors[3];
-    const double &a0 = factors[4];
-
-    double a4_2 = a4 * a4;
-    double a3_2 = a3 * a3;
-    double a4_3 = a4_2 * a4;
-    double a2a4 = a2 * a4;
-
-    double p4 = (8 * a2a4 - 3 * a3_2) / (8 * a4_2);
-    double q4 = (a3_2 * a3 - 4 * a2a4 * a3 + 8 * a1 * a4_2) / (8 * a4_3);
-    double r4 = (256 * a0 * a4_3 - 3 * (a3_2 * a3_2) - 64 * a1 * a3 * a4_2 + 16 * a2a4 * a3_2) / (256 * (a4_3 * a4));
-
-    double p3 = ((p4 * p4) / 12 + r4) / 3; // /=-3
-    double q3 = (72 * r4 * p4 - 2 * p4 * p4 * p4 - 27 * q4 * q4) / 432; // /=2
-
-    double t; // *=2
-    std::complex<double> w;
-    if (q3 >= 0)
-        w = -std::sqrt(static_cast<std::complex<double> >(q3 * q3 - p3 * p3 * p3)) - q3;
-    else
-        w = std::sqrt(static_cast<std::complex<double> >(q3 * q3 - p3 * p3 * p3)) - q3;
-    if (w.imag() == 0.0) {
-        w.real(std::cbrt(w.real()));
-        t = 2.0 * (w.real() + p3 / w.real());
-    } else {
-        w = pow(w, 1.0 / 3);
-        t = 4.0 * w.real();
-    }
-
-    std::complex<double> sqrt_2m = sqrt(static_cast<std::complex<double> >(-2 * p4 / 3 + t));
-    double B_4A = -a3 / (4 * a4);
-    double complex1 = 4 * p4 / 3 + t;
-#if defined(__clang__) && defined(__arm__) && (__clang_major__ == 3 || __clang_major__ == 4) && !defined(__ANDROID__)
-    // details: https://github.com/opencv/opencv/issues/11135
-    // details: https://github.com/opencv/opencv/issues/11056
-    std::complex<double> complex2 = 2 * q4;
-    complex2 = std::complex<double>(complex2.real() / sqrt_2m.real(), 0);
-#else
-    std::complex<double> complex2 = 2 * q4 / sqrt_2m;
-#endif
-    double sqrt_2m_rh = sqrt_2m.real() / 2;
-    double sqrt1 = sqrt(-(complex1 + complex2)).real() / 2;
-    realRoots[0] = B_4A + sqrt_2m_rh + sqrt1;
-    realRoots[1] = B_4A + sqrt_2m_rh - sqrt1;
-    double sqrt2 = sqrt(-(complex1 - complex2)).real() / 2;
-    realRoots[2] = B_4A - sqrt_2m_rh + sqrt2;
-    realRoots[3] = B_4A - sqrt_2m_rh - sqrt2;
-}
-
-void polishQuarticRoots(const double *coeffs, double *roots) {
+void polishQuarticRoots(const double *coeffs, double *roots, int nb_roots) {
     const int iterations = 2;
     for (int i = 0; i < iterations; ++i) {
-        for (int j = 0; j < 4; ++j) {
+        for (int j = 0; j < nb_roots; ++j) {
             double error =
                     (((coeffs[0] * roots[j] + coeffs[1]) * roots[j] + coeffs[2]) * roots[j] + coeffs[3]) * roots[j] +
                     coeffs[4];
@@ -227,8 +175,9 @@ int ap3p::computePoses(const double featureVectors[3][4],
                         2 * (g6 * g7 - g1 * g2 - g3 * g4),
                         g7 * g7 - g2 * g2 - g4 * g4};
     double s[4];
-    solveQuartic(coeffs, s);
-    polishQuarticRoots(coeffs, s);
+    int nb_roots = solve_deg4(coeffs[0], coeffs[1], coeffs[2], coeffs[3], coeffs[4],
+                              s[0], s[1], s[2], s[3]);
+    polishQuarticRoots(coeffs, s, nb_roots);
 
     double temp[3];
     vect_cross(k1, nl, temp);
@@ -254,7 +203,7 @@ int ap3p::computePoses(const double featureVectors[3][4],
     double reproj_errors[4];
 
     int nb_solutions = 0;
-    for (int i = 0; i < 4; ++i) {
+    for (int i = 0; i < nb_roots; ++i) {
         double ctheta1p = s[i];
         if (abs(ctheta1p) > 1)
             continue;
diff --git a/modules/calib3d/src/calib3d_c_api.h b/modules/calib3d/src/calib3d_c_api.h
index c9ac9b49f5b8..cdbf021667c2 100644
--- a/modules/calib3d/src/calib3d_c_api.h
+++ b/modules/calib3d/src/calib3d_c_api.h
@@ -51,10 +51,6 @@
 extern "C" {
 #endif
 
-/** @addtogroup calib3d_c
-  @{
-  */
-
 /****************************************************************************************\
 *                      Camera Calibration, Pose Estimation and Stereo                    *
 \****************************************************************************************/
@@ -416,8 +412,6 @@ void cvUndistortPoints( const CvMat* src, CvMat* dst,
                         const CvMat* R CV_DEFAULT(0),
                         const CvMat* P CV_DEFAULT(0));
 
-/** @} calib3d_c */
-
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/modules/calib3d/src/calibinit.cpp b/modules/calib3d/src/calibinit.cpp
index 135519e77608..3e6c0bfdba7c 100644
--- a/modules/calib3d/src/calibinit.cpp
+++ b/modules/calib3d/src/calibinit.cpp
@@ -46,7 +46,7 @@
     Here is the copyright notice from the original Vladimir's code:
     ===============================================================
 
-    The algorithms developed and implemented by Vezhnevets Vldimir
+    The algorithms developed and implemented by Vezhnevets Vladimir
     aka Dead Moroz (vvp@graphics.cs.msu.ru)
     See http://graphics.cs.msu.su/en/research/calibration/opencv.html
     for detailed information.
@@ -54,7 +54,7 @@
     Reliability additions and modifications made by Philip Gruebele.
     <a href="mailto:pgruebele@cox.net">pgruebele@cox.net</a>
 
-    Some further improvements for detection of partially ocluded boards at non-ideal
+    Some further improvements for detection of partially occluded boards at non-ideal
     lighting conditions have been made by Alex Bovyrin and Kurt Kolonige
 
 \************************************************************************************/
@@ -71,6 +71,7 @@
 
 #include "precomp.hpp"
 #include "circlesgrid.hpp"
+#include "opencv2/flann.hpp"
 
 #include <stack>
 
@@ -155,7 +156,8 @@ struct ChessBoardQuad
     float edge_len; // quad edge len, in pix^2
     // neighbors and corners are synced, i.e., neighbor 0 shares corner 0
     ChessBoardCorner *corners[4]; // Coordinates of quad corners
-    struct ChessBoardQuad *neighbors[4]; // Pointers of quad neighbors
+    struct ChessBoardQuad *neighbors[4]; // Pointers of quad neighbors. M.b. sparse.
+    // Each neighbors element corresponds to quad corner, but not just sequential index.
 
     ChessBoardQuad(int group_idx_ = -1) :
         count(0),
@@ -232,7 +234,7 @@ class ChessBoardDetector
         all_quads_count = 0;
     }
 
-    void generateQuads(const cv::Mat& image_, int flags);
+    void generateQuads(const cv::Mat& image_, int flags, int dilations);
 
     bool processQuads(std::vector<cv::Point2f>& out_corners, int &prev_sqr_size);
 
@@ -240,7 +242,7 @@ class ChessBoardDetector
 
     void findConnectedQuads(std::vector<ChessBoardQuad*>& out_group, int group_idx);
 
-    int checkQuadGroup(std::vector<ChessBoardQuad*>& quad_group, std::vector<ChessBoardCorner*>& out_corners);
+    int checkQuadGroup(const std::vector<ChessBoardQuad*>& quad_group, std::vector<ChessBoardCorner*>& out_corners);
 
     int cleanFoundConnectedQuads(std::vector<ChessBoardQuad*>& quad_group);
 
@@ -297,7 +299,7 @@ static void icvSmoothHistogram256(const ArrayContainer& piHist, ArrayContainer&
             CV_DbgAssert(iIdx >= 0 && iIdx < 256);
             iSmooth += piHist[iIdx];
         }
-        piHistSmooth[i] = iSmooth/(2*iWidth+1);
+        piHistSmooth[i] = iSmooth/(iIdx_max-iIdx_min+1);
     }
 }
 /***************************************************************************************************/
@@ -325,7 +327,7 @@ static void icvGradientOfHistogram256(const ArrayContainer& piHist, ArrayContain
     piHistGrad[255] = 0;
 }
 /***************************************************************************************************/
-//PERFORM SMART IMAGE THRESHOLDING BASED ON ANALYSIS OF INTENSTY HISTOGRAM
+//PERFORM SMART IMAGE THRESHOLDING BASED ON ANALYSIS OF INTENSITY HISTOGRAM
 static void icvBinarizationHistogramBased(Mat & img)
 {
     CV_Assert(img.channels() == 1 && img.depth() == CV_8U);
@@ -479,8 +481,7 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
 
     bool found = false;
 
-    const int min_dilations = 0;
-    const int max_dilations = 7;
+    const bool is_plain = (flags & CALIB_CB_PLAIN) != 0;
 
     int type = image_.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     Mat img = image_.getMat();
@@ -496,6 +497,9 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
 
     std::vector<cv::Point2f> out_corners;
 
+    if (is_plain)
+      CV_CheckType(type, depth == CV_8U && cn == 1, "Only 8-bit grayscale images are supported whith CALIB_CB_PLAIN flag enable");
+
     if (img.channels() != 1)
     {
         cvtColor(img, img, COLOR_BGR2GRAY);
@@ -504,10 +508,11 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
     int prev_sqr_size = 0;
 
     Mat thresh_img_new = img.clone();
-    icvBinarizationHistogramBased(thresh_img_new); // process image in-place
+    if(!is_plain)
+        icvBinarizationHistogramBased(thresh_img_new); // process image in-place
     SHOW("New binarization", thresh_img_new);
 
-    if (flags & CALIB_CB_FAST_CHECK)
+    if (flags & CALIB_CB_FAST_CHECK && !is_plain)
     {
         //perform new method for checking chessboard using a binary image.
         //image is binarised using a threshold dependent on the image histogram
@@ -523,14 +528,18 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
 
     ChessBoardDetector detector(pattern_size);
 
-    // Try our standard "1" dilation, but if the pattern is not found, iterate the whole procedure with higher dilations.
-    // This is necessary because some squares simply do not separate properly with a single dilation.  However,
+    const int min_dilations = 0;
+    const int max_dilations = is_plain ? 0 : 7;
+
+    // Try our standard "0" and "1" dilations, but if the pattern is not found, iterate the whole procedure with higher dilations.
+    // This is necessary because some squares simply do not separate properly without and with a single dilations. However,
     // we want to use the minimum number of dilations possible since dilations cause the squares to become smaller,
     // making it difficult to detect smaller squares.
     for (int dilations = min_dilations; dilations <= max_dilations; dilations++)
     {
         //USE BINARY IMAGE COMPUTED USING icvBinarizationHistogramBased METHOD
-        dilate( thresh_img_new, thresh_img_new, Mat(), Point(-1, -1), 1 );
+        if(!is_plain && dilations > 0)
+            dilate( thresh_img_new, thresh_img_new, Mat(), Point(-1, -1), 1 );
 
         // So we can find rectangles that go to the edge, we draw a white line around the image edge.
         // Otherwise FindContours will miss those clipped rectangle contours.
@@ -538,7 +547,7 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
         rectangle( thresh_img_new, Point(0,0), Point(thresh_img_new.cols-1, thresh_img_new.rows-1), Scalar(255,255,255), 3, LINE_8);
 
         detector.reset();
-        detector.generateQuads(thresh_img_new, flags);
+        detector.generateQuads(thresh_img_new, flags, dilations);
         DPRINTF("Quad count: %d/%d", detector.all_quads_count, (pattern_size.width/2+1)*(pattern_size.height/2+1));
         SHOW_QUADS("New quads", thresh_img_new, &detector.all_quads[0], detector.all_quads_count);
         if (detector.processQuads(out_corners, prev_sqr_size))
@@ -551,7 +560,7 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
     DPRINTF("Chessboard detection result 0: %d", (int)found);
 
     // revert to old, slower, method if detection failed
-    if (!found)
+    if (!found && !is_plain)
     {
         if (flags & CALIB_CB_NORMALIZE_IMAGE)
         {
@@ -574,8 +583,10 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
         }
         //if flag CALIB_CB_ADAPTIVE_THRESH is not set it doesn't make sense to iterate over k
         int max_k = useAdaptive ? 6 : 1;
+        Mat prev_thresh_img;
         for (int k = 0; k < max_k && !found; k++)
         {
+            int prev_block_size = -1;
             for (int dilations = min_dilations; dilations <= max_dilations; dilations++)
             {
                 // convert the input grayscale image to binary (black-n-white)
@@ -586,14 +597,23 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
                                              : prev_sqr_size * 2);
                     block_size = block_size | 1;
                     // convert to binary
-                    adaptiveThreshold( img, thresh_img, 255, ADAPTIVE_THRESH_MEAN_C, THRESH_BINARY, block_size, (k/2)*5 );
-                    if (dilations > 0)
-                        dilate( thresh_img, thresh_img, Mat(), Point(-1, -1), dilations-1 );
-
+                    if (block_size != prev_block_size)
+                    {
+                        adaptiveThreshold( img, thresh_img, 255, ADAPTIVE_THRESH_MEAN_C, THRESH_BINARY, block_size, (k/2)*5 );
+                        dilate( thresh_img, thresh_img, Mat(), Point(-1, -1), dilations );
+                        thresh_img.copyTo(prev_thresh_img);
+                    }
+                    else if (dilations > 0)
+                    {
+                        dilate( prev_thresh_img, prev_thresh_img, Mat(), Point(-1, -1), 1 );
+                        prev_thresh_img.copyTo(thresh_img);
+                    }
+                    prev_block_size = block_size;
                 }
                 else
                 {
-                    dilate( thresh_img, thresh_img, Mat(), Point(-1, -1), 1 );
+                    if (dilations > 0)
+                        dilate( thresh_img, thresh_img, Mat(), Point(-1, -1), 1 );
                 }
                 SHOW("Old binarization", thresh_img);
 
@@ -603,7 +623,7 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
                 rectangle( thresh_img, Point(0,0), Point(thresh_img.cols-1, thresh_img.rows-1), Scalar(255,255,255), 3, LINE_8);
 
                 detector.reset();
-                detector.generateQuads(thresh_img, flags);
+                detector.generateQuads(thresh_img, flags, dilations);
                 DPRINTF("Quad count: %d/%d", detector.all_quads_count, (pattern_size.width/2+1)*(pattern_size.height/2+1));
                 SHOW_QUADS("Old quads", thresh_img, &detector.all_quads[0], detector.all_quads_count);
                 if (detector.processQuads(out_corners, prev_sqr_size))
@@ -662,7 +682,6 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
     return found;
 }
 
-
 //
 // Checks that each board row and column is pretty much monotonous curve:
 // It analyzes each row and each column of the chessboard as following:
@@ -671,7 +690,7 @@ bool findChessboardCorners(InputArray image_, Size pattern_size,
 //    of the neighbor corners in the same row/column.
 //
 // This function has been created as temporary workaround for the bug in current implementation
-// of cvFindChessboardCornes that produces absolutely unordered sets of corners.
+// of cvFindChessboardCorners that produces absolutely unordered sets of corners.
 //
 bool ChessBoardDetector::checkBoardMonotony(const std::vector<cv::Point2f>& corners)
 {
@@ -1213,9 +1232,9 @@ int ChessBoardDetector::cleanFoundConnectedQuads(std::vector<ChessBoardQuad*>& q
     // We iteratively remove the point which reduces the size of
     // the bounding box of the blobs the most
     // (since we want the rectangle to be as small as possible)
-    // remove the quadrange that causes the biggest reduction
+    // remove the quadrangle that causes the biggest reduction
     // in pattern size until we have the correct number
-    for (; quad_count > count; quad_count--)
+    while (quad_count > count)
     {
         double min_box_area = DBL_MAX;
         int min_box_area_index = -1;
@@ -1321,7 +1340,7 @@ void ChessBoardDetector::findConnectedQuads(std::vector<ChessBoardQuad*>& out_gr
 }
 
 
-int ChessBoardDetector::checkQuadGroup(std::vector<ChessBoardQuad*>& quad_group, std::vector<ChessBoardCorner*>& out_corners)
+int ChessBoardDetector::checkQuadGroup(const std::vector<ChessBoardQuad*>& quad_group, std::vector<ChessBoardCorner*>& out_corners)
 {
     const int ROW1 = 1000000;
     const int ROW2 = 2000000;
@@ -1419,6 +1438,7 @@ int ChessBoardDetector::checkQuadGroup(std::vector<ChessBoardQuad*>& quad_group,
     ChessBoardCorner* cur = first;
     ChessBoardCorner* right = NULL;
     ChessBoardCorner* below = NULL;
+    out_corners.clear();
     out_corners.push_back(cur);
 
     for (int k = 0; k < 4; ++k)
@@ -1588,7 +1608,24 @@ int ChessBoardDetector::checkQuadGroup(std::vector<ChessBoardQuad*>& quad_group,
 void ChessBoardDetector::findQuadNeighbors()
 {
     const float thresh_scale = 1.f;
+
+    const int all_corners_count = all_quads_count * 4;
+
+    std::vector<Point2f> all_quads_pts;
+    all_quads_pts.reserve(all_corners_count);
+    for (int idx = 0; idx < all_quads_count; idx++)
+    {
+        const ChessBoardQuad& cur_quad = (const ChessBoardQuad&)all_quads[idx];
+        for (int i = 0; i < 4; i++)
+            all_quads_pts.push_back(cur_quad.corners[i]->pt);
+    }
+
+    const cvflann::KDTreeSingleIndexParams index_params;
+    flann::GenericIndex<flann::L2_Simple<float>> all_quads_pts_index(Mat(all_quads_pts).reshape(1, all_corners_count), index_params);
+
     // find quad neighbors
+    std::vector<int> neighbors_indices(all_corners_count);
+    std::vector<float> neighbors_dists(all_corners_count);
     for (int idx = 0; idx < all_quads_count; idx++)
     {
         ChessBoardQuad& cur_quad = (ChessBoardQuad&)all_quads[idx];
@@ -1605,47 +1642,54 @@ void ChessBoardDetector::findQuadNeighbors()
                 continue;
 
             float min_dist = FLT_MAX;
+            int closest_neighbor_idx = -1;
             int closest_corner_idx = -1;
             ChessBoardQuad *closest_quad = 0;
 
-            cv::Point2f pt = cur_quad.corners[i]->pt;
+            cv::Point2f pt = all_quads_pts[(idx << 2) + i];
 
             // find the closest corner in all other quadrangles
-            for (int k = 0; k < all_quads_count; k++)
+            std::vector<float> query = Mat(pt);
+            float radius = cur_quad.edge_len * thresh_scale + 1;
+            const cvflann::SearchParams search_params(-1);
+            int neighbors_count = all_quads_pts_index.radiusSearch(query, neighbors_indices, neighbors_dists, radius, search_params);
+
+            for (int neighbor_idx_idx = 0; neighbor_idx_idx < neighbors_count; neighbor_idx_idx++)
             {
+                const int neighbor_idx = neighbors_indices[neighbor_idx_idx];
+                const int k = neighbor_idx >> 2;
                 if (k == idx)
                     continue;
 
                 ChessBoardQuad& q_k = all_quads[k];
+                const int j = neighbor_idx & 3;
+                if (q_k.neighbors[j])
+                    continue;
 
-                for (int j = 0; j < 4; j++)
+                const float dist = normL2Sqr<float>(pt - all_quads_pts[neighbor_idx]);
+                if (dist <= cur_quad.edge_len * thresh_scale &&
+                    dist <= q_k.edge_len * thresh_scale)
                 {
-                    if (q_k.neighbors[j])
-                        continue;
-
-                    float dist = normL2Sqr<float>(pt - q_k.corners[j]->pt);
-                    if (dist < min_dist &&
-                        dist <= cur_quad.edge_len*thresh_scale &&
-                        dist <= q_k.edge_len*thresh_scale )
+                    // check edge lengths, make sure they're compatible
+                    // edges that are different by more than 1:4 are rejected.
+                    // edge_len is squared edge length, so we compare them
+                    // with squared constant 16 = 4^2
+                    if (q_k.edge_len > 16 * cur_quad.edge_len ||
+                        cur_quad.edge_len > 16 * q_k.edge_len)
                     {
-                        // check edge lengths, make sure they're compatible
-                        // edges that are different by more than 1:4 are rejected
-                        float ediff = cur_quad.edge_len - q_k.edge_len;
-                        if (ediff > 32*cur_quad.edge_len ||
-                            ediff > 32*q_k.edge_len)
-                        {
-                            DPRINTF("Incompatible edge lengths");
-                            continue;
-                        }
-                        closest_corner_idx = j;
-                        closest_quad = &q_k;
-                        min_dist = dist;
+                        DPRINTF("Incompatible edge lengths");
+                        continue;
                     }
+                    closest_neighbor_idx = neighbor_idx;
+                    closest_corner_idx = j;
+                    closest_quad = &q_k;
+                    min_dist = dist;
+                    break;
                 }
             }
 
             // we found a matching corner point?
-            if (closest_corner_idx >= 0 && min_dist < FLT_MAX)
+            if (closest_neighbor_idx >= 0 && closest_corner_idx >= 0 && min_dist < FLT_MAX)
             {
                 CV_Assert(closest_quad);
 
@@ -1657,6 +1701,7 @@ void ChessBoardDetector::findQuadNeighbors()
                 // This is necessary to support small squares where otherwise the wrong
                 // corner will get matched to closest_quad;
                 ChessBoardCorner& closest_corner = *closest_quad->corners[closest_corner_idx];
+                cv::Point2f closest_corner_pt = all_quads_pts[closest_neighbor_idx];
 
                 int j = 0;
                 for (; j < 4; j++)
@@ -1664,46 +1709,48 @@ void ChessBoardDetector::findQuadNeighbors()
                     if (cur_quad.neighbors[j] == closest_quad)
                         break;
 
-                    if (normL2Sqr<float>(closest_corner.pt - cur_quad.corners[j]->pt) < min_dist)
+                    if (normL2Sqr<float>(closest_corner_pt - all_quads_pts[(idx << 2) + j]) < min_dist)
                         break;
                 }
                 if (j < 4)
                     continue;
 
                 // Check that each corner is a neighbor of different quads
-                for(j = 0; j < closest_quad->count; j++ )
+                for(j = 0; j < 4; j++ )
                 {
                     if (closest_quad->neighbors[j] == &cur_quad)
                         break;
                 }
-                if (j < closest_quad->count)
+                if (j < 4)
                     continue;
 
-                // check whether the closest corner to closest_corner
-                // is different from cur_quad->corners[i]->pt
-                for (j = 0; j < all_quads_count; j++ )
+                // check whether the closest corner to closest_corner is different from pt
+                query = Mat(closest_corner_pt);
+                radius = min_dist + 1;
+                neighbors_count = all_quads_pts_index.radiusSearch(query, neighbors_indices, neighbors_dists, radius, search_params);
+
+                int neighbor_idx_idx = 0;
+                for (; neighbor_idx_idx < neighbors_count; neighbor_idx_idx++)
                 {
+                    const int neighbor_idx = neighbors_indices[neighbor_idx_idx];
+                    j = neighbor_idx >> 2;
+
                     ChessBoardQuad* q = &const_cast<ChessBoardQuad&>(all_quads[j]);
                     if (j == idx || q == closest_quad)
                         continue;
 
-                    int k = 0;
-                    for (; k < 4; k++ )
+                    const int k = neighbor_idx & 3;
+                    CV_DbgAssert(q);
+                    if (!q->neighbors[k])
                     {
-                        CV_DbgAssert(q);
-                        if (!q->neighbors[k])
-                        {
-                            if (normL2Sqr<float>(closest_corner.pt - q->corners[k]->pt) < min_dist)
-                                break;
-                        }
+                        if (normL2Sqr<float>(closest_corner_pt - all_quads_pts[neighbor_idx]) < min_dist)
+                            break;
                     }
-                    if (k < 4)
-                        break;
                 }
-                if (j < all_quads_count)
+                if (neighbor_idx_idx < neighbors_count)
                     continue;
 
-                closest_corner.pt = (pt + closest_corner.pt) * 0.5f;
+                closest_corner.pt = (pt + closest_corner_pt) * 0.5f;
 
                 // We've found one more corner - remember it
                 cur_quad.count++;
@@ -1721,7 +1768,7 @@ void ChessBoardDetector::findQuadNeighbors()
 // returns corners in clockwise order
 // corners don't necessarily start at same position on quad (e.g.,
 //   top left corner)
-void ChessBoardDetector::generateQuads(const cv::Mat& image_, int flags)
+void ChessBoardDetector::generateQuads(const cv::Mat& image_, int flags, int dilations)
 {
     binarized_image = image_;  // save for debug purposes
 
@@ -1730,8 +1777,8 @@ void ChessBoardDetector::generateQuads(const cv::Mat& image_, int flags)
     all_quads.deallocate();
     all_corners.deallocate();
 
-    // empiric bound for minimal allowed perimeter for squares
-    int min_size = 25; //cvRound( image->cols * image->rows * .03 * 0.01 * 0.92 );
+    // empiric bound for minimal allowed area for squares
+    const int min_area = 25; //cvRound( image->cols * image->rows * .03 * 0.01 * 0.92 );
 
     bool filterQuads = (flags & CALIB_CB_FILTER_QUADS) != 0;
 
@@ -1759,25 +1806,15 @@ void ChessBoardDetector::generateQuads(const cv::Mat& image_, int flags)
         const std::vector<Point>& contour = contours[idx];
 
         Rect contour_rect = boundingRect(contour);
-        if (contour_rect.area() < min_size)
+        if (contour_rect.area() < min_area)
             continue;
 
-        std::vector<Point> approx_contour;
+        std::vector<Point> approx_contour = contour;
 
         const int min_approx_level = 1, max_approx_level = MAX_CONTOUR_APPROX;
-        for (int approx_level = min_approx_level; approx_level <= max_approx_level; approx_level++ )
+        for (int approx_level = min_approx_level; approx_contour.size() > 4 && approx_level <= max_approx_level; approx_level++ )
         {
-            approxPolyDP(contour, approx_contour, (float)approx_level, true);
-            if (approx_contour.size() == 4)
-                break;
-
-            // we call this again on its own output, because sometimes
-            // approxPoly() does not simplify as much as it should.
-            std::vector<Point> approx_contour_tmp;
-            std::swap(approx_contour, approx_contour_tmp);
-            approxPolyDP(approx_contour_tmp, approx_contour, (float)approx_level, true);
-            if (approx_contour.size() == 4)
-                break;
+            approxPolyDP(approx_contour, approx_contour, (float)approx_level, true);
         }
 
         // reject non-quadrangles
@@ -1803,7 +1840,7 @@ void ChessBoardDetector::generateQuads(const cv::Mat& image_, int flags)
             // than rectangular and which are big enough
             double d3 = sqrt(normL2Sqr<double>(pt[0] - pt[1]));
             double d4 = sqrt(normL2Sqr<double>(pt[1] - pt[2]));
-            if (!(d3*4 > d4 && d4*4 > d3 && d3*d4 < area*1.5 && area > min_size &&
+            if (!(d3*4 > d4 && d4*4 > d3 && d3*d4 < area*1.5 && area > min_area &&
                 d1 >= 0.15 * p && d2 >= 0.15 * p))
                 continue;
         }
@@ -1846,6 +1883,9 @@ void ChessBoardDetector::generateQuads(const cv::Mat& image_, int flags)
             float d = normL2Sqr<float>(q.corners[i]->pt - q.corners[(i+1)&3]->pt);
             q.edge_len = std::min(q.edge_len, d);
         }
+
+        const int edge_len_compensation = 2 * dilations;
+        q.edge_len += 2 * sqrt(q.edge_len) * edge_len_compensation + edge_len_compensation * edge_len_compensation;
     }
 
     all_quads_count = quad_count;
@@ -1912,6 +1952,7 @@ bool ChessBoardDetector::processQuads(std::vector<cv::Point2f>& out_corners, int
         if (count > 0 || (-count > (int)out_corners.size()))
         {
             // copy corners to output array
+            out_corners.clear();
             out_corners.reserve(n);
             for (int i = 0; i < n; ++i)
                 out_corners.push_back(corner_group[i]->pt);
diff --git a/modules/calib3d/src/calibration.cpp b/modules/calib3d/src/calibration.cpp
index 9e8c252ceef1..c428da6bd5c4 100644
--- a/modules/calib3d/src/calibration.cpp
+++ b/modules/calib3d/src/calibration.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "precomp.hpp"
+#include "hal_replacement.hpp"
 #include "opencv2/imgproc/imgproc_c.h"
 #include "distortion_model.hpp"
 #include "calib3d_c_api.h"
@@ -254,32 +255,32 @@ CV_IMPL int cvRodrigues2( const CvMat* src, CvMat* dst, CvMat* jacobian )
     CvMat matJ = cvMat( 3, 9, CV_64F, J );
 
     if( !CV_IS_MAT(src) )
-        CV_Error( !src ? CV_StsNullPtr : CV_StsBadArg, "Input argument is not a valid matrix" );
+        CV_Error( !src ? cv::Error::StsNullPtr : cv::Error::StsBadArg, "Input argument is not a valid matrix" );
 
     if( !CV_IS_MAT(dst) )
-        CV_Error( !dst ? CV_StsNullPtr : CV_StsBadArg,
+        CV_Error( !dst ? cv::Error::StsNullPtr : cv::Error::StsBadArg,
         "The first output argument is not a valid matrix" );
 
     int depth = CV_MAT_DEPTH(src->type);
     int elem_size = CV_ELEM_SIZE(depth);
 
     if( depth != CV_32F && depth != CV_64F )
-        CV_Error( CV_StsUnsupportedFormat, "The matrices must have 32f or 64f data type" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "The matrices must have 32f or 64f data type" );
 
     if( !CV_ARE_DEPTHS_EQ(src, dst) )
-        CV_Error( CV_StsUnmatchedFormats, "All the matrices must have the same data type" );
+        CV_Error( cv::Error::StsUnmatchedFormats, "All the matrices must have the same data type" );
 
     if( jacobian )
     {
         if( !CV_IS_MAT(jacobian) )
-            CV_Error( CV_StsBadArg, "Jacobian is not a valid matrix" );
+            CV_Error( cv::Error::StsBadArg, "Jacobian is not a valid matrix" );
 
         if( !CV_ARE_DEPTHS_EQ(src, jacobian) || CV_MAT_CN(jacobian->type) != 1 )
-            CV_Error( CV_StsUnmatchedFormats, "Jacobian must have 32fC1 or 64fC1 datatype" );
+            CV_Error( cv::Error::StsUnmatchedFormats, "Jacobian must have 32fC1 or 64fC1 datatype" );
 
         if( (jacobian->rows != 9 || jacobian->cols != 3) &&
             (jacobian->rows != 3 || jacobian->cols != 9))
-            CV_Error( CV_StsBadSize, "Jacobian must be 3x9 or 9x3" );
+            CV_Error( cv::Error::StsBadSize, "Jacobian must be 3x9 or 9x3" );
     }
 
     if( src->cols == 1 || src->rows == 1 )
@@ -287,10 +288,10 @@ CV_IMPL int cvRodrigues2( const CvMat* src, CvMat* dst, CvMat* jacobian )
         int step = src->rows > 1 ? src->step / elem_size : 1;
 
         if( src->rows + src->cols*CV_MAT_CN(src->type) - 1 != 3 )
-            CV_Error( CV_StsBadSize, "Input matrix must be 1x3, 3x1 or 3x3" );
+            CV_Error( cv::Error::StsBadSize, "Input matrix must be 1x3, 3x1 or 3x3" );
 
         if( dst->rows != 3 || dst->cols != 3 || CV_MAT_CN(dst->type) != 1 )
-            CV_Error( CV_StsBadSize, "Output matrix must be 3x3, single-channel floating point matrix" );
+            CV_Error( cv::Error::StsBadSize, "Output matrix must be 3x3, single-channel floating point matrix" );
 
         Point3d r;
         if( depth == CV_32F )
@@ -368,7 +369,7 @@ CV_IMPL int cvRodrigues2( const CvMat* src, CvMat* dst, CvMat* jacobian )
 
         if( (dst->rows != 1 || dst->cols*CV_MAT_CN(dst->type) != 3) &&
             (dst->rows != 3 || dst->cols != 1 || CV_MAT_CN(dst->type) != 1))
-            CV_Error( CV_StsBadSize, "Output matrix must be 1x3 or 3x1" );
+            CV_Error( cv::Error::StsBadSize, "Output matrix must be 1x3 or 3x1" );
 
         Matx33d R = cvarrToMat(src);
 
@@ -490,7 +491,7 @@ CV_IMPL int cvRodrigues2( const CvMat* src, CvMat* dst, CvMat* jacobian )
     }
     else
     {
-        CV_Error(CV_StsBadSize, "Input matrix must be 1x3 or 3x1 for a rotation vector, or 3x3 for a rotation matrix");
+        CV_Error(cv::Error::StsBadSize, "Input matrix must be 1x3 or 3x1 for a rotation vector, or 3x3 for a rotation matrix");
     }
 
     if( jacobian )
@@ -516,7 +517,6 @@ CV_IMPL int cvRodrigues2( const CvMat* src, CvMat* dst, CvMat* jacobian )
     return 1;
 }
 
-
 static const char* cvDistCoeffErr = "Distortion coefficients must be 1x4, 4x1, 1x5, 5x1, 1x8, 8x1, 1x12, 12x1, 1x14 or 14x1 floating-point vector";
 
 static void cvProjectPoints2Internal( const CvMat* objectPoints,
@@ -553,55 +553,41 @@ static void cvProjectPoints2Internal( const CvMat* objectPoints,
     if( !CV_IS_MAT(objectPoints) || !CV_IS_MAT(r_vec) ||
         !CV_IS_MAT(t_vec) || !CV_IS_MAT(A) ||
         /*!CV_IS_MAT(distCoeffs) ||*/ !CV_IS_MAT(imagePoints) )
-        CV_Error( CV_StsBadArg, "One of required arguments is not a valid matrix" );
+        CV_Error( cv::Error::StsBadArg, "One of required arguments is not a valid matrix" );
 
-    int total = objectPoints->rows * objectPoints->cols * CV_MAT_CN(objectPoints->type);
+    int odepth = CV_MAT_DEPTH(objectPoints->type);
+    int ochans = CV_MAT_CN(objectPoints->type);
+    int orows = objectPoints->rows, ocols = objectPoints->cols;
+    int total = orows * ocols * ochans;
     if(total % 3 != 0)
     {
         //we have stopped support of homogeneous coordinates because it cause ambiguity in interpretation of the input data
-        CV_Error( CV_StsBadArg, "Homogeneous coordinates are not supported" );
+        CV_Error( cv::Error::StsBadArg, "Homogeneous coordinates are not supported" );
     }
     count = total / 3;
 
-    if( CV_IS_CONT_MAT(objectPoints->type) &&
-        (CV_MAT_DEPTH(objectPoints->type) == CV_32F || CV_MAT_DEPTH(objectPoints->type) == CV_64F)&&
-        ((objectPoints->rows == 1 && CV_MAT_CN(objectPoints->type) == 3) ||
-        (objectPoints->rows == count && CV_MAT_CN(objectPoints->type)*objectPoints->cols == 3) ||
-        (objectPoints->rows == 3 && CV_MAT_CN(objectPoints->type) == 1 && objectPoints->cols == count)))
-    {
-        matM.reset(cvCreateMat( objectPoints->rows, objectPoints->cols, CV_MAKETYPE(CV_64F,CV_MAT_CN(objectPoints->type)) ));
-        cvConvert(objectPoints, matM);
-    }
-    else
-    {
-//        matM = cvCreateMat( 1, count, CV_64FC3 );
-//        cvConvertPointsHomogeneous( objectPoints, matM );
-        CV_Error( CV_StsBadArg, "Homogeneous coordinates are not supported" );
-    }
-
-    if( CV_IS_CONT_MAT(imagePoints->type) &&
-        (CV_MAT_DEPTH(imagePoints->type) == CV_32F || CV_MAT_DEPTH(imagePoints->type) == CV_64F) &&
-        ((imagePoints->rows == 1 && CV_MAT_CN(imagePoints->type) == 2) ||
-        (imagePoints->rows == count && CV_MAT_CN(imagePoints->type)*imagePoints->cols == 2) ||
-        (imagePoints->rows == 2 && CV_MAT_CN(imagePoints->type) == 1 && imagePoints->cols == count)))
-    {
-        _m.reset(cvCreateMat( imagePoints->rows, imagePoints->cols, CV_MAKETYPE(CV_64F,CV_MAT_CN(imagePoints->type)) ));
-        cvConvert(imagePoints, _m);
-    }
-    else
-    {
-//        _m = cvCreateMat( 1, count, CV_64FC2 );
-        CV_Error( CV_StsBadArg, "Homogeneous coordinates are not supported" );
-    }
-
-    M = (CvPoint3D64f*)matM->data.db;
-    m = (CvPoint2D64f*)_m->data.db;
+    CV_Assert(CV_IS_CONT_MAT(objectPoints->type));
+    CV_Assert(odepth == CV_32F || odepth == CV_64F);
+    // Homogeneous coordinates are not supported
+    CV_Assert((orows == 1 && ochans == 3) ||
+              (orows == count && ochans*ocols == 3) ||
+              (orows == 3 && ochans == 1 && ocols == count));
+
+    int idepth = CV_MAT_DEPTH(imagePoints->type);
+    int ichans = CV_MAT_CN(imagePoints->type);
+    int irows = imagePoints->rows, icols = imagePoints->cols;
+    CV_Assert(CV_IS_CONT_MAT(imagePoints->type));
+    CV_Assert(idepth == CV_32F || idepth == CV_64F);
+    // Homogeneous coordinates are not supported
+    CV_Assert((irows == 1 && ichans == 2) ||
+              (irows == count && ichans*icols == 2) ||
+              (irows == 2 && ichans == 1 && icols == count));
 
     if( (CV_MAT_DEPTH(r_vec->type) != CV_64F && CV_MAT_DEPTH(r_vec->type) != CV_32F) ||
         (((r_vec->rows != 1 && r_vec->cols != 1) ||
         r_vec->rows*r_vec->cols*CV_MAT_CN(r_vec->type) != 3) &&
         ((r_vec->rows != 3 && r_vec->cols != 3) || CV_MAT_CN(r_vec->type) != 1)))
-        CV_Error( CV_StsBadArg, "Rotation must be represented by 1x3 or 3x1 "
+        CV_Error( cv::Error::StsBadArg, "Rotation must be represented by 1x3 or 3x1 "
                   "floating-point rotation vector, or 3x3 rotation matrix" );
 
     if( r_vec->rows == 3 && r_vec->cols == 3 )
@@ -621,7 +607,7 @@ static void cvProjectPoints2Internal( const CvMat* objectPoints,
     if( (CV_MAT_DEPTH(t_vec->type) != CV_64F && CV_MAT_DEPTH(t_vec->type) != CV_32F) ||
         (t_vec->rows != 1 && t_vec->cols != 1) ||
         t_vec->rows*t_vec->cols*CV_MAT_CN(t_vec->type) != 3 )
-        CV_Error( CV_StsBadArg,
+        CV_Error( cv::Error::StsBadArg,
             "Translation vector must be 1x3 or 3x1 floating-point vector" );
 
     _t = cvMat( t_vec->rows, t_vec->cols, CV_MAKETYPE(CV_64F,CV_MAT_CN(t_vec->type)), t );
@@ -629,7 +615,7 @@ static void cvProjectPoints2Internal( const CvMat* objectPoints,
 
     if( (CV_MAT_TYPE(A->type) != CV_64FC1 && CV_MAT_TYPE(A->type) != CV_32FC1) ||
         A->rows != 3 || A->cols != 3 )
-        CV_Error( CV_StsBadArg, "Intrinsic parameters must be 3x3 floating-point matrix" );
+        CV_Error( cv::Error::StsBadArg, "Intrinsic parameters must be 3x3 floating-point matrix" );
 
     cvConvert( A, &_a );
     fx = a[0]; fy = a[4];
@@ -638,27 +624,148 @@ static void cvProjectPoints2Internal( const CvMat* objectPoints,
     if( fixedAspectRatio )
         fx = fy*aspectRatio;
 
+    int delems = 0;
     if( distCoeffs )
     {
-        if( !CV_IS_MAT(distCoeffs) ||
-            (CV_MAT_DEPTH(distCoeffs->type) != CV_64F &&
-            CV_MAT_DEPTH(distCoeffs->type) != CV_32F) ||
-            (distCoeffs->rows != 1 && distCoeffs->cols != 1) ||
-            (distCoeffs->rows*distCoeffs->cols*CV_MAT_CN(distCoeffs->type) != 4 &&
-            distCoeffs->rows*distCoeffs->cols*CV_MAT_CN(distCoeffs->type) != 5 &&
-            distCoeffs->rows*distCoeffs->cols*CV_MAT_CN(distCoeffs->type) != 8 &&
-            distCoeffs->rows*distCoeffs->cols*CV_MAT_CN(distCoeffs->type) != 12 &&
-            distCoeffs->rows*distCoeffs->cols*CV_MAT_CN(distCoeffs->type) != 14) )
-            CV_Error( CV_StsBadArg, cvDistCoeffErr );
-
-        _k = cvMat( distCoeffs->rows, distCoeffs->cols,
-                    CV_MAKETYPE(CV_64F,CV_MAT_CN(distCoeffs->type)), k );
+        CV_Assert(CV_IS_MAT(distCoeffs));
+
+        int ddepth = CV_MAT_DEPTH(distCoeffs->type);
+        int dchans = CV_MAT_CN(distCoeffs->type);
+        int drows = distCoeffs->rows, dcols = distCoeffs->cols;
+        delems = drows * dcols * dchans;
+        CV_Assert((ddepth == CV_32F || ddepth == CV_64F) &&
+                  (drows == 1 || dcols == 1) &&
+                  (delems == 4 || delems == 5 || delems == 8 || delems == 12 || delems == 14));
+
+        _k = cvMat( drows, dcols, CV_MAKETYPE(CV_64F, dchans), k );
         cvConvert( distCoeffs, &_k );
         if(k[12] != 0 || k[13] != 0)
         {
-          detail::computeTiltProjectionMatrix(k[12], k[13],
-            &matTilt, &dMatTiltdTauX, &dMatTiltdTauY);
+            detail::computeTiltProjectionMatrix(k[12], k[13], &matTilt, &dMatTiltdTauX, &dMatTiltdTauY);
+        }
+    }
+
+    if (idepth == CV_32F && odepth == CV_32F)
+    {
+        float rtMatrix[12] = { (float)R[0], (float)R[1], (float)R[2], (float)t[0],
+                               (float)R[3], (float)R[4], (float)R[5], (float)t[1],
+                               (float)R[6], (float)R[7], (float)R[8], (float)t[2] };
+
+        cv_camera_intrinsics_pinhole_32f intr;
+        intr.fx = (float)fx; intr.fy = (float)fy;
+        intr.cx = (float)cx; intr.cy = (float)cy;
+        intr.amt_k = 0; intr.amt_p = 0; intr.amt_s = 0; intr.use_tau = false;
+
+        switch (delems)
+        {
+        case  0: break;
+        case  4: // [k_1, k_2, p_1, p_2]
+            intr.amt_k = 2; intr.amt_p = 2;
+            break;
+        case  5: // [k_1, k_2, p_1, p_2, k_3]
+            intr.amt_k = 3; intr.amt_p = 2;
+            break;
+        case  8: // [k_1, k_2, p_1, p_2, k_3, k_4, k_5, k_6]
+            intr.amt_k = 6; intr.amt_p = 2;
+            break;
+        case 12: // [k_1, k_2, p_1, p_2, k_3, k_4, k_5, k_6, s_1, s_2, s_3, s_4]
+            intr.amt_k = 6; intr.amt_p = 2; intr.amt_s = 4;
+            break;
+        case 14: // [k_1, k_2, p_1, p_2, k_3, k_4, k_5, k_6, s_1, s_2, s_3, s_4, tau_x, tau_y]
+            intr.amt_k = 6; intr.amt_p = 2; intr.amt_s = 4; intr.use_tau = true;
+            break;
+        default:
+            CV_Error(cv::Error::StsInternal, "Wrong number of distortion coefficients");
+        }
+
+        intr.k[0] = (float)k[0];
+        intr.k[1] = (float)k[1];
+        intr.k[2] = (float)k[4];
+        intr.k[3] = (float)k[5];
+        intr.k[4] = (float)k[6];
+        intr.k[5] = (float)k[7];
+
+        intr.p[0] = (float)k[2];
+        intr.p[1] = (float)k[3];
+
+        for (int ctr = 0; ctr < 4; ctr++)
+        {
+            intr.s[ctr] = (float)k[8+ctr];
+        }
+
+        intr.tau_x = (float)k[12];
+        intr.tau_y = (float)k[13];
+
+        CALL_HAL(projectPoints, cv_hal_project_points_pinhole32f,
+                 objectPoints->data.fl, objectPoints->step, count,
+                 imagePoints->data.fl, imagePoints->step,
+                 rtMatrix, &intr);
+    }
+
+    _m.reset(cvCreateMat( imagePoints->rows, imagePoints->cols, CV_MAKETYPE(CV_64F,CV_MAT_CN(imagePoints->type)) ));
+    cvConvert(imagePoints, _m);
+
+    matM.reset(cvCreateMat( objectPoints->rows, objectPoints->cols, CV_MAKETYPE(CV_64F,CV_MAT_CN(objectPoints->type)) ));
+    cvConvert(objectPoints, matM);
+
+    M = (CvPoint3D64f*)matM->data.db;
+    m = (CvPoint2D64f*)_m->data.db;
+
+    if (idepth == CV_64F && odepth == CV_64F)
+    {
+        double rtMatrix[12] = { R[0], R[1], R[2], t[0],
+                                R[3], R[4], R[5], t[1],
+                                R[6], R[7], R[8], t[2] };
+
+        cv_camera_intrinsics_pinhole_64f intr;
+        intr.fx = fx; intr.fy = fy;
+        intr.cx = cx; intr.cy = cy;
+        intr.amt_k = 0; intr.amt_p = 0; intr.amt_s = 0; intr.use_tau = false;
+
+        switch (delems)
+        {
+        case  0: break;
+        case  4: // [k_1, k_2, p_1, p_2]
+            intr.amt_k = 2; intr.amt_p = 2;
+            break;
+        case  5: // [k_1, k_2, p_1, p_2, k_3]
+            intr.amt_k = 3; intr.amt_p = 2;
+            break;
+        case  8: // [k_1, k_2, p_1, p_2, k_3, k_4, k_5, k_6]
+            intr.amt_k = 6; intr.amt_p = 2;
+            break;
+        case 12: // [k_1, k_2, p_1, p_2, k_3, k_4, k_5, k_6, s_1, s_2, s_3, s_4]
+            intr.amt_k = 6; intr.amt_p = 2; intr.amt_s = 4;
+            break;
+        case 14: // [k_1, k_2, p_1, p_2, k_3, k_4, k_5, k_6, s_1, s_2, s_3, s_4, tau_x, tau_y]
+            intr.amt_k = 6; intr.amt_p = 2; intr.amt_s = 4; intr.use_tau = true;
+            break;
+        default:
+            CV_Error(cv::Error::StsInternal, "Wrong number of distortion coefficients");
         }
+
+        intr.k[0] = k[0];
+        intr.k[1] = k[1];
+        intr.k[2] = k[4];
+        intr.k[3] = k[5];
+        intr.k[4] = k[6];
+        intr.k[5] = k[7];
+
+        intr.p[0] = k[2];
+        intr.p[1] = k[3];
+
+        for (int ctr = 0; ctr < 4; ctr++)
+        {
+            intr.s[ctr] = k[8+ctr];
+        }
+
+        intr.tau_x = k[12];
+        intr.tau_y = k[13];
+
+        CALL_HAL(projectPoints, cv_hal_project_points_pinhole64f,
+                 objectPoints->data.db, objectPoints->step, count,
+                 imagePoints->data.db, imagePoints->step,
+                 rtMatrix, &intr);
     }
 
     if( dpdr )
@@ -667,7 +774,7 @@ static void cvProjectPoints2Internal( const CvMat* objectPoints,
             (CV_MAT_TYPE(dpdr->type) != CV_32FC1 &&
             CV_MAT_TYPE(dpdr->type) != CV_64FC1) ||
             dpdr->rows != count*2 || dpdr->cols != 3 )
-            CV_Error( CV_StsBadArg, "dp/drot must be 2Nx3 floating-point matrix" );
+            CV_Error( cv::Error::StsBadArg, "dp/drot must be 2Nx3 floating-point matrix" );
 
         if( CV_MAT_TYPE(dpdr->type) == CV_64FC1 )
         {
@@ -685,7 +792,7 @@ static void cvProjectPoints2Internal( const CvMat* objectPoints,
             (CV_MAT_TYPE(dpdt->type) != CV_32FC1 &&
             CV_MAT_TYPE(dpdt->type) != CV_64FC1) ||
             dpdt->rows != count*2 || dpdt->cols != 3 )
-            CV_Error( CV_StsBadArg, "dp/dT must be 2Nx3 floating-point matrix" );
+            CV_Error( cv::Error::StsBadArg, "dp/dT must be 2Nx3 floating-point matrix" );
 
         if( CV_MAT_TYPE(dpdt->type) == CV_64FC1 )
         {
@@ -702,7 +809,7 @@ static void cvProjectPoints2Internal( const CvMat* objectPoints,
         if( !CV_IS_MAT(dpdf) ||
             (CV_MAT_TYPE(dpdf->type) != CV_32FC1 && CV_MAT_TYPE(dpdf->type) != CV_64FC1) ||
             dpdf->rows != count*2 || dpdf->cols != 2 )
-            CV_Error( CV_StsBadArg, "dp/df must be 2Nx2 floating-point matrix" );
+            CV_Error( cv::Error::StsBadArg, "dp/df must be 2Nx2 floating-point matrix" );
 
         if( CV_MAT_TYPE(dpdf->type) == CV_64FC1 )
         {
@@ -719,7 +826,7 @@ static void cvProjectPoints2Internal( const CvMat* objectPoints,
         if( !CV_IS_MAT(dpdc) ||
             (CV_MAT_TYPE(dpdc->type) != CV_32FC1 && CV_MAT_TYPE(dpdc->type) != CV_64FC1) ||
             dpdc->rows != count*2 || dpdc->cols != 2 )
-            CV_Error( CV_StsBadArg, "dp/dc must be 2Nx2 floating-point matrix" );
+            CV_Error( cv::Error::StsBadArg, "dp/dc must be 2Nx2 floating-point matrix" );
 
         if( CV_MAT_TYPE(dpdc->type) == CV_64FC1 )
         {
@@ -736,10 +843,10 @@ static void cvProjectPoints2Internal( const CvMat* objectPoints,
         if( !CV_IS_MAT(dpdk) ||
             (CV_MAT_TYPE(dpdk->type) != CV_32FC1 && CV_MAT_TYPE(dpdk->type) != CV_64FC1) ||
             dpdk->rows != count*2 || (dpdk->cols != 14 && dpdk->cols != 12 && dpdk->cols != 8 && dpdk->cols != 5 && dpdk->cols != 4 && dpdk->cols != 2) )
-            CV_Error( CV_StsBadArg, "dp/df must be 2Nx14, 2Nx12, 2Nx8, 2Nx5, 2Nx4 or 2Nx2 floating-point matrix" );
+            CV_Error( cv::Error::StsBadArg, "dp/df must be 2Nx14, 2Nx12, 2Nx8, 2Nx5, 2Nx4 or 2Nx2 floating-point matrix" );
 
         if( !distCoeffs )
-            CV_Error( CV_StsNullPtr, "distCoeffs is NULL while dpdk is not" );
+            CV_Error( cv::Error::StsNullPtr, "distCoeffs is NULL while dpdk is not" );
 
         if( CV_MAT_TYPE(dpdk->type) == CV_64FC1 )
         {
@@ -756,7 +863,7 @@ static void cvProjectPoints2Internal( const CvMat* objectPoints,
         if( !CV_IS_MAT( dpdo ) || ( CV_MAT_TYPE( dpdo->type ) != CV_32FC1
                                     && CV_MAT_TYPE( dpdo->type ) != CV_64FC1 )
             || dpdo->rows != count * 2 || dpdo->cols != count * 3 )
-            CV_Error( CV_StsBadArg, "dp/do must be 2Nx3N floating-point matrix" );
+            CV_Error( cv::Error::StsBadArg, "dp/do must be 2Nx3N floating-point matrix" );
 
         if( CV_MAT_TYPE( dpdo->type ) == CV_64FC1 )
         {
@@ -1283,10 +1390,10 @@ CV_IMPL void cvInitIntrinsicParams2D( const CvMat* objectPoints,
         CV_MAT_TYPE(objectPoints->type) != CV_64FC3) ||
         (CV_MAT_TYPE(imagePoints->type) != CV_32FC2 &&
         CV_MAT_TYPE(imagePoints->type) != CV_64FC2) )
-        CV_Error( CV_StsUnsupportedFormat, "Both object points and image points must be 2D" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "Both object points and image points must be 2D" );
 
     if( objectPoints->rows != 1 || imagePoints->rows != 1 )
-        CV_Error( CV_StsBadSize, "object points and image points must be a single-row matrices" );
+        CV_Error( cv::Error::StsBadSize, "object points and image points must be a single-row matrices" );
 
     matA.reset(cvCreateMat( 2*nimages, 2, CV_64F ));
     _b.reset(cvCreateMat( 2*nimages, 1, CV_64F ));
@@ -1395,27 +1502,27 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
     // 0. check the parameters & allocate buffers
     if( !CV_IS_MAT(objectPoints) || !CV_IS_MAT(imagePoints) ||
         !CV_IS_MAT(npoints) || !CV_IS_MAT(cameraMatrix) || !CV_IS_MAT(distCoeffs) )
-        CV_Error( CV_StsBadArg, "One of required vector arguments is not a valid matrix" );
+        CV_Error( cv::Error::StsBadArg, "One of required vector arguments is not a valid matrix" );
 
     if( imageSize.width <= 0 || imageSize.height <= 0 )
-        CV_Error( CV_StsOutOfRange, "image width and height must be positive" );
+        CV_Error( cv::Error::StsOutOfRange, "image width and height must be positive" );
 
     if( CV_MAT_TYPE(npoints->type) != CV_32SC1 ||
         (npoints->rows != 1 && npoints->cols != 1) )
-        CV_Error( CV_StsUnsupportedFormat,
+        CV_Error( cv::Error::StsUnsupportedFormat,
             "the array of point counters must be 1-dimensional integer vector" );
     if(flags & CALIB_TILTED_MODEL)
     {
         //when the tilted sensor model is used the distortion coefficients matrix must have 14 parameters
         if (distCoeffs->cols*distCoeffs->rows != 14)
-            CV_Error( CV_StsBadArg, "The tilted sensor model must have 14 parameters in the distortion matrix" );
+            CV_Error( cv::Error::StsBadArg, "The tilted sensor model must have 14 parameters in the distortion matrix" );
     }
     else
     {
         //when the thin prism model is used the distortion coefficients matrix must have 12 parameters
         if(flags & CALIB_THIN_PRISM_MODEL)
             if (distCoeffs->cols*distCoeffs->rows != 12)
-                CV_Error( CV_StsBadArg, "Thin prism model must have 12 parameters in the distortion matrix" );
+                CV_Error( cv::Error::StsBadArg, "Thin prism model must have 12 parameters in the distortion matrix" );
     }
 
     nimages = npoints->rows*npoints->cols;
@@ -1428,7 +1535,7 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
             (CV_MAT_DEPTH(rvecs->type) != CV_32F && CV_MAT_DEPTH(rvecs->type) != CV_64F) ||
             ((rvecs->rows != nimages || (rvecs->cols*cn != 3 && rvecs->cols*cn != 9)) &&
             (rvecs->rows != 1 || rvecs->cols != nimages || cn != 3)) )
-            CV_Error( CV_StsBadArg, "the output array of rotation vectors must be 3-channel "
+            CV_Error( cv::Error::StsBadArg, "the output array of rotation vectors must be 3-channel "
                 "1xn or nx1 array or 1-channel nx3 or nx9 array, where n is the number of views" );
     }
 
@@ -1439,7 +1546,7 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
             (CV_MAT_DEPTH(tvecs->type) != CV_32F && CV_MAT_DEPTH(tvecs->type) != CV_64F) ||
             ((tvecs->rows != nimages || tvecs->cols*cn != 3) &&
             (tvecs->rows != 1 || tvecs->cols != nimages || cn != 3)) )
-            CV_Error( CV_StsBadArg, "the output array of translation vectors must be 3-channel "
+            CV_Error( cv::Error::StsBadArg, "the output array of translation vectors must be 3-channel "
                 "1xn or nx1 array or 1-channel nx3 array, where n is the number of views" );
     }
 
@@ -1454,7 +1561,7 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
             (stdDevs->rows != 1 || stdDevs->cols != (nimages*6 + NINTRINSIC) || cn != 1)) )
 #define STR__(x) #x
 #define STR_(x) STR__(x)
-            CV_Error( CV_StsBadArg, "the output array of standard deviations vectors must be 1-channel "
+            CV_Error( cv::Error::StsBadArg, "the output array of standard deviations vectors must be 1-channel "
                 "1x(n*6 + NINTRINSIC) or (n*6 + NINTRINSIC)x1 array, where n is the number of views,"
                 " NINTRINSIC = " STR_(CV_CALIB_NINTRINSIC));
     }
@@ -1462,7 +1569,7 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
     if( (CV_MAT_TYPE(cameraMatrix->type) != CV_32FC1 &&
         CV_MAT_TYPE(cameraMatrix->type) != CV_64FC1) ||
         cameraMatrix->rows != 3 || cameraMatrix->cols != 3 )
-        CV_Error( CV_StsBadArg,
+        CV_Error( cv::Error::StsBadArg,
             "Intrinsic parameters must be 3x3 floating-point matrix" );
 
     if( (CV_MAT_TYPE(distCoeffs->type) != CV_32FC1 &&
@@ -1473,14 +1580,14 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
         distCoeffs->cols*distCoeffs->rows != 8 &&
         distCoeffs->cols*distCoeffs->rows != 12 &&
         distCoeffs->cols*distCoeffs->rows != 14) )
-        CV_Error( CV_StsBadArg, cvDistCoeffErr );
+        CV_Error( cv::Error::StsBadArg, cvDistCoeffErr );
 
     for( i = 0; i < nimages; i++ )
     {
         ni = npoints->data.i[i*npstep];
         if( ni < 4 )
         {
-            CV_Error_( CV_StsOutOfRange, ("The number of points in the view #%d is < 4", i));
+            CV_Error_( cv::Error::StsOutOfRange, ("The number of points in the view #%d is < 4", i));
         }
         maxPoints = MAX( maxPoints, ni );
         total += ni;
@@ -1493,7 +1600,7 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
             (CV_MAT_DEPTH(newObjPoints->type) != CV_32F && CV_MAT_DEPTH(newObjPoints->type) != CV_64F) ||
             ((newObjPoints->rows != maxPoints || newObjPoints->cols*cn != 3) &&
             (newObjPoints->rows != 1 || newObjPoints->cols != maxPoints || cn != 3)) )
-            CV_Error( CV_StsBadArg, "the output array of refined object points must be 3-channel "
+            CV_Error( cv::Error::StsBadArg, "the output array of refined object points must be 3-channel "
                 "1xn or nx1 array or 1-channel nx3 array, where n is the number of object points per view" );
     }
 
@@ -1504,7 +1611,7 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
             (CV_MAT_DEPTH(stdDevs->type) != CV_32F && CV_MAT_DEPTH(stdDevs->type) != CV_64F) ||
             ((stdDevs->rows != (nimages*6 + NINTRINSIC + maxPoints*3) || stdDevs->cols*cn != 1) &&
             (stdDevs->rows != 1 || stdDevs->cols != (nimages*6 + NINTRINSIC + maxPoints*3) || cn != 1)) )
-            CV_Error( CV_StsBadArg, "the output array of standard deviations vectors must be 1-channel "
+            CV_Error( cv::Error::StsBadArg, "the output array of standard deviations vectors must be 1-channel "
                 "1x(n*6 + NINTRINSIC + m*3) or (n*6 + NINTRINSIC + m*3)x1 array, where n is the number of views,"
                 " NINTRINSIC = " STR_(CV_CALIB_NINTRINSIC) ", m is the number of object points per view");
     }
@@ -1544,15 +1651,15 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
     {
         cvConvert( cameraMatrix, &matA );
         if( A(0, 0) <= 0 || A(1, 1) <= 0 )
-            CV_Error( CV_StsOutOfRange, "Focal length (fx and fy) must be positive" );
+            CV_Error( cv::Error::StsOutOfRange, "Focal length (fx and fy) must be positive" );
         if( A(0, 2) < 0 || A(0, 2) >= imageSize.width ||
             A(1, 2) < 0 || A(1, 2) >= imageSize.height )
-            CV_Error( CV_StsOutOfRange, "Principal point must be within the image" );
+            CV_Error( cv::Error::StsOutOfRange, "Principal point must be within the image" );
         if( fabs(A(0, 1)) > 1e-5 )
-            CV_Error( CV_StsOutOfRange, "Non-zero skew is not supported by the function" );
+            CV_Error( cv::Error::StsOutOfRange, "Non-zero skew is not supported by the function" );
         if( fabs(A(1, 0)) > 1e-5 || fabs(A(2, 0)) > 1e-5 ||
             fabs(A(2, 1)) > 1e-5 || fabs(A(2,2)-1) > 1e-5 )
-            CV_Error( CV_StsOutOfRange,
+            CV_Error( cv::Error::StsOutOfRange,
                 "The intrinsic matrix must have [fx 0 cx; 0 fy cy; 0 0 1] shape" );
         A(0, 1) = A(1, 0) = A(2, 0) = A(2, 1) = 0.;
         A(2, 2) = 1.;
@@ -1562,7 +1669,7 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
             aspectRatio = A(0, 0)/A(1, 1);
 
             if( aspectRatio < minValidAspectRatio || aspectRatio > maxValidAspectRatio )
-                CV_Error( CV_StsOutOfRange,
+                CV_Error( cv::Error::StsOutOfRange,
                     "The specified aspect ratio (= cameraMatrix[0][0] / cameraMatrix[1][1]) is incorrect" );
         }
         cvConvert( distCoeffs, &_k );
@@ -1572,7 +1679,7 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
         Scalar mean, sdv;
         meanStdDev(matM, mean, sdv);
         if( fabs(mean[2]) > 1e-5 || fabs(sdv[2]) > 1e-5 )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
             "For non-planar calibration rigs the initial intrinsic matrix must be specified" );
         for( i = 0; i < total; i++ )
             matM.at<Point3d>(i).z = 0.;
@@ -1582,7 +1689,7 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
             aspectRatio = cvmGet(cameraMatrix,0,0);
             aspectRatio /= cvmGet(cameraMatrix,1,1);
             if( aspectRatio < minValidAspectRatio || aspectRatio > maxValidAspectRatio )
-                CV_Error( CV_StsOutOfRange,
+                CV_Error( cv::Error::StsOutOfRange,
                     "The specified aspect ratio (= cameraMatrix[0][0] / cameraMatrix[1][1]) is incorrect" );
         }
         CvMat _matM = cvMat(matM), m = cvMat(_m);
@@ -1673,7 +1780,7 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
     Mat mask = cvarrToMat(solver.mask);
     int nparams_nz = countNonZero(mask);
     if (nparams_nz >= 2 * total)
-        CV_Error_(CV_StsBadArg,
+        CV_Error_(cv::Error::StsBadArg,
                   ("There should be less vars to optimize (having %d) than the number of residuals (%d = 2 per point)", nparams_nz, 2 * total));
 
     // 2. initialize extrinsic parameters
@@ -1889,10 +1996,10 @@ CV_IMPL double cvCalibrateCamera4( const CvMat* objectPoints,
                     CvMat* rvecs, CvMat* tvecs, CvMat* newObjPoints, int flags, CvTermCriteria termCrit )
 {
     if( !CV_IS_MAT(npoints) )
-        CV_Error( CV_StsBadArg, "npoints is not a valid matrix" );
+        CV_Error( cv::Error::StsBadArg, "npoints is not a valid matrix" );
     if( CV_MAT_TYPE(npoints->type) != CV_32SC1 ||
         (npoints->rows != 1 && npoints->cols != 1) )
-        CV_Error( CV_StsUnsupportedFormat,
+        CV_Error( cv::Error::StsUnsupportedFormat,
             "the array of point counters must be 1-dimensional integer vector" );
 
     bool releaseObject = iFixedPoint > 0 && iFixedPoint < npoints->data.i[0] - 1;
@@ -1903,7 +2010,7 @@ CV_IMPL double cvCalibrateCamera4( const CvMat* objectPoints,
     if( releaseObject )
     {
         if( !CV_IS_MAT(objectPoints) )
-            CV_Error( CV_StsBadArg, "objectPoints is not a valid matrix" );
+            CV_Error( cv::Error::StsBadArg, "objectPoints is not a valid matrix" );
         Mat matM;
         if(CV_MAT_CN(objectPoints->type) == 3) {
             matM = cvarrToMat(objectPoints);
@@ -1917,14 +2024,14 @@ CV_IMPL double cvCalibrateCamera4( const CvMat* objectPoints,
         {
             if( npoints->data.i[i * npstep] != ni )
             {
-                CV_Error( CV_StsBadArg, "All objectPoints[i].size() should be equal when "
+                CV_Error( cv::Error::StsBadArg, "All objectPoints[i].size() should be equal when "
                                         "object-releasing method is requested." );
             }
             Mat ocmp = matM.colRange(ni * i, ni * i + ni) != matM.colRange(0, ni);
             ocmp = ocmp.reshape(1);
             if( countNonZero(ocmp) )
             {
-                CV_Error( CV_StsBadArg, "All objectPoints[i] should be identical when object-releasing"
+                CV_Error( cv::Error::StsBadArg, "All objectPoints[i] should be identical when object-releasing"
                                         " method is requested." );
             }
         }
@@ -1941,10 +2048,10 @@ void cvCalibrationMatrixValues( const CvMat *calibMatr, CvSize imgSize,
 {
     /* Validate parameters. */
     if(calibMatr == 0)
-        CV_Error(CV_StsNullPtr, "Some of parameters is a NULL pointer!");
+        CV_Error(cv::Error::StsNullPtr, "Some of parameters is a NULL pointer!");
 
     if(!CV_IS_MAT(calibMatr))
-        CV_Error(CV_StsUnsupportedFormat, "Input parameters must be matrices!");
+        CV_Error(cv::Error::StsUnsupportedFormat, "Input parameters must be matrices!");
 
     double dummy = .0;
     Point2d pp;
@@ -2023,7 +2130,7 @@ static double cvStereoCalibrateImpl( const CvMat* _objectPoints, const CvMat* _i
             (CV_MAT_DEPTH(rvecs->type) != CV_32F && CV_MAT_DEPTH(rvecs->type) != CV_64F) ||
             ((rvecs->rows != nimages || (rvecs->cols*cn != 3 && rvecs->cols*cn != 9)) &&
             (rvecs->rows != 1 || rvecs->cols != nimages || cn != 3)) )
-            CV_Error( CV_StsBadArg, "the output array of rotation vectors must be 3-channel "
+            CV_Error( cv::Error::StsBadArg, "the output array of rotation vectors must be 3-channel "
                 "1xn or nx1 array or 1-channel nx3 or nx9 array, where n is the number of views" );
     }
 
@@ -2034,7 +2141,7 @@ static double cvStereoCalibrateImpl( const CvMat* _objectPoints, const CvMat* _i
             (CV_MAT_DEPTH(tvecs->type) != CV_32F && CV_MAT_DEPTH(tvecs->type) != CV_64F) ||
             ((tvecs->rows != nimages || tvecs->cols*cn != 3) &&
             (tvecs->rows != 1 || tvecs->cols != nimages || cn != 3)) )
-            CV_Error( CV_StsBadArg, "the output array of translation vectors must be 3-channel "
+            CV_Error( cv::Error::StsBadArg, "the output array of translation vectors must be 3-channel "
                 "1xn or nx1 array or 1-channel nx3 array, where n is the number of views" );
     }
 
@@ -2699,19 +2806,19 @@ void cvStereoRectify( const CvMat* _cameraMatrix1, const CvMat* _cameraMatrix2,
     if( alpha >= 0 )
     {
         double s0 = std::max(std::max(std::max((double)cx1/(cx1_0 - inner1.x), (double)cy1/(cy1_0 - inner1.y)),
-                            (double)(newImgSize.width - cx1)/(inner1.x + inner1.width - cx1_0)),
-                        (double)(newImgSize.height - cy1)/(inner1.y + inner1.height - cy1_0));
+                            (double)(newImgSize.width - 1 - cx1)/(inner1.x + inner1.width - cx1_0)),
+                        (double)(newImgSize.height - 1 - cy1)/(inner1.y + inner1.height - cy1_0));
         s0 = std::max(std::max(std::max(std::max((double)cx2/(cx2_0 - inner2.x), (double)cy2/(cy2_0 - inner2.y)),
-                         (double)(newImgSize.width - cx2)/(inner2.x + inner2.width - cx2_0)),
-                     (double)(newImgSize.height - cy2)/(inner2.y + inner2.height - cy2_0)),
+                         (double)(newImgSize.width - 1 - cx2)/(inner2.x + inner2.width - cx2_0)),
+                     (double)(newImgSize.height - 1 - cy2)/(inner2.y + inner2.height - cy2_0)),
                  s0);
 
         double s1 = std::min(std::min(std::min((double)cx1/(cx1_0 - outer1.x), (double)cy1/(cy1_0 - outer1.y)),
-                            (double)(newImgSize.width - cx1)/(outer1.x + outer1.width - cx1_0)),
-                        (double)(newImgSize.height - cy1)/(outer1.y + outer1.height - cy1_0));
+                            (double)(newImgSize.width - 1 - cx1)/(outer1.x + outer1.width - cx1_0)),
+                        (double)(newImgSize.height - 1 - cy1)/(outer1.y + outer1.height - cy1_0));
         s1 = std::min(std::min(std::min(std::min((double)cx2/(cx2_0 - outer2.x), (double)cy2/(cy2_0 - outer2.y)),
-                         (double)(newImgSize.width - cx2)/(outer2.x + outer2.width - cx2_0)),
-                     (double)(newImgSize.height - cy2)/(outer2.y + outer2.height - cy2_0)),
+                         (double)(newImgSize.width - 1 - cx2)/(outer2.x + outer2.width - cx2_0)),
+                     (double)(newImgSize.height - 1 - cy2)/(outer2.y + outer2.height - cy2_0)),
                  s1);
 
         s = s0*(1 - alpha) + s1*alpha;
@@ -3188,9 +3295,9 @@ cvRQDecomp3x3( const CvMat *matrixM, CvMat *matrixR, CvMat *matrixQ,
     Qx = ( 0  c  s ), c = m33/sqrt(m32^2 + m33^2), s = m32/sqrt(m32^2 + m33^2)
          ( 0 -s  c )
     */
-    s = matM[2][1];
-    c = matM[2][2];
-    z = 1./std::sqrt(c * c + s * s + DBL_EPSILON);
+    s = std::abs(matM[2][1]) > DBL_EPSILON ? matM[2][1] : 0;
+    c = std::abs(matM[2][1]) > DBL_EPSILON ? matM[2][2] : 1;
+    z = 1./std::sqrt(c * c + s * s);
     c *= z;
     s *= z;
 
@@ -3207,9 +3314,9 @@ cvRQDecomp3x3( const CvMat *matrixM, CvMat *matrixR, CvMat *matrixQ,
     Qy = ( 0  1  0 ), c = m33/sqrt(m31^2 + m33^2), s = -m31/sqrt(m31^2 + m33^2)
          ( s  0  c )
     */
-    s = -matR[2][0];
-    c = matR[2][2];
-    z = 1./std::sqrt(c * c + s * s + DBL_EPSILON);
+    s = std::abs(matR[2][0]) > DBL_EPSILON ? -matR[2][0] : 0;
+    c = std::abs(matR[2][0]) > DBL_EPSILON ? matR[2][2] : 1;
+    z = 1./std::sqrt(c * c + s * s);
     c *= z;
     s *= z;
 
@@ -3227,9 +3334,9 @@ cvRQDecomp3x3( const CvMat *matrixM, CvMat *matrixR, CvMat *matrixQ,
          ( 0  0  1 )
     */
 
-    s = matM[1][0];
-    c = matM[1][1];
-    z = 1./std::sqrt(c * c + s * s + DBL_EPSILON);
+    s = std::abs(matM[1][0]) > DBL_EPSILON ? matM[1][0] : 0;
+    c = std::abs(matM[1][0]) > DBL_EPSILON ? matM[1][1] : 1;
+    z = 1./std::sqrt(c * c + s * s);
     c *= z;
     s *= z;
 
@@ -3344,19 +3451,19 @@ cvDecomposeProjectionMatrix( const CvMat *projMatr, CvMat *calibMatr,
 
     /* Validate parameters. */
     if(projMatr == 0 || calibMatr == 0 || rotMatr == 0 || posVect == 0)
-        CV_Error(CV_StsNullPtr, "Some of parameters is a NULL pointer!");
+        CV_Error(cv::Error::StsNullPtr, "Some of parameters is a NULL pointer!");
 
     if(!CV_IS_MAT(projMatr) || !CV_IS_MAT(calibMatr) || !CV_IS_MAT(rotMatr) || !CV_IS_MAT(posVect))
-        CV_Error(CV_StsUnsupportedFormat, "Input parameters must be matrices!");
+        CV_Error(cv::Error::StsUnsupportedFormat, "Input parameters must be matrices!");
 
     if(projMatr->cols != 4 || projMatr->rows != 3)
-        CV_Error(CV_StsUnmatchedSizes, "Size of projection matrix must be 3x4!");
+        CV_Error(cv::Error::StsUnmatchedSizes, "Size of projection matrix must be 3x4!");
 
     if(calibMatr->cols != 3 || calibMatr->rows != 3 || rotMatr->cols != 3 || rotMatr->rows != 3)
-        CV_Error(CV_StsUnmatchedSizes, "Size of calibration and rotation matrices must be 3x3!");
+        CV_Error(cv::Error::StsUnmatchedSizes, "Size of calibration and rotation matrices must be 3x3!");
 
     if(posVect->cols != 1 || posVect->rows != 4)
-        CV_Error(CV_StsUnmatchedSizes, "Size of position vector must be 4x1!");
+        CV_Error(cv::Error::StsUnmatchedSizes, "Size of position vector must be 4x1!");
 
     /* Compute position vector. */
     cvSetZero(&tmpProjMatr); // Add zero row to make matrix square.
@@ -3402,17 +3509,17 @@ static void collectCalibrationData( InputArrayOfArrays objectPoints,
     {
         Mat objectPoint = objectPoints.getMat(i);
         if (objectPoint.empty())
-            CV_Error(CV_StsBadSize, "objectPoints should not contain empty vector of vectors of points");
+            CV_Error(cv::Error::StsBadSize, "objectPoints should not contain empty vector of vectors of points");
         int numberOfObjectPoints = objectPoint.checkVector(3, CV_32F);
         if (numberOfObjectPoints <= 0)
-            CV_Error(CV_StsUnsupportedFormat, "objectPoints should contain vector of vectors of points of type Point3f");
+            CV_Error(cv::Error::StsUnsupportedFormat, "objectPoints should contain vector of vectors of points of type Point3f");
 
         Mat imagePoint1 = imagePoints1.getMat(i);
         if (imagePoint1.empty())
-            CV_Error(CV_StsBadSize, "imagePoints1 should not contain empty vector of vectors of points");
+            CV_Error(cv::Error::StsBadSize, "imagePoints1 should not contain empty vector of vectors of points");
         int numberOfImagePoints = imagePoint1.checkVector(2, CV_32F);
         if (numberOfImagePoints <= 0)
-            CV_Error(CV_StsUnsupportedFormat, "imagePoints1 should contain vector of vectors of points of type Point2f");
+            CV_Error(cv::Error::StsUnsupportedFormat, "imagePoints1 should contain vector of vectors of points of type Point2f");
         CV_CheckEQ(numberOfObjectPoints, numberOfImagePoints, "Number of object and image points must be equal");
 
         total += numberOfObjectPoints;
@@ -3467,14 +3574,14 @@ static void collectCalibrationData( InputArrayOfArrays objectPoints,
         {
             if( npoints.at<int>(i) != ni )
             {
-                CV_Error( CV_StsBadArg, "All objectPoints[i].size() should be equal when "
+                CV_Error( cv::Error::StsBadArg, "All objectPoints[i].size() should be equal when "
                                         "object-releasing method is requested." );
             }
             Mat ocmp = objPtMat.colRange(ni * i, ni * i + ni) != objPtMat.colRange(0, ni);
             ocmp = ocmp.reshape(1);
             if( countNonZero(ocmp) )
             {
-                CV_Error( CV_StsBadArg, "All objectPoints[i] should be identical when object-releasing"
+                CV_Error( cv::Error::StsBadArg, "All objectPoints[i] should be identical when object-releasing"
                                         " method is requested." );
             }
         }
@@ -3884,7 +3991,7 @@ void cv::calibrationMatrixValues( InputArray _cameraMatrix, Size imageSize,
     CV_INSTRUMENT_REGION();
 
     if(_cameraMatrix.size() != Size(3, 3))
-        CV_Error(CV_StsUnmatchedSizes, "Size of cameraMatrix must be 3x3!");
+        CV_Error(cv::Error::StsUnmatchedSizes, "Size of cameraMatrix must be 3x3!");
 
     Matx33d K = _cameraMatrix.getMat();
 
diff --git a/modules/calib3d/src/calibration_handeye.cpp b/modules/calib3d/src/calibration_handeye.cpp
index 25fa5af053fa..cd4999a61831 100644
--- a/modules/calib3d/src/calibration_handeye.cpp
+++ b/modules/calib3d/src/calibration_handeye.cpp
@@ -289,13 +289,12 @@ static void calibrateHandEyeTsai(const std::vector<Mat>& Hg, const std::vector<M
     int idx = 0;
     for (size_t i = 0; i < Hg.size(); i++)
     {
-        for (size_t j = i+1; j < Hg.size(); j++, idx++)
+        for (size_t j = i+1; j < Hg.size(); j++)
         {
             //Defines coordinate transformation from Gi to Gj
             //Hgi is from Gi (gripper) to RW (robot base)
             //Hgj is from Gj (gripper) to RW (robot base)
             Mat Hgij = homogeneousInverse(Hg[j]) * Hg[i]; //eq 6
-            vec_Hgij.push_back(Hgij);
             //Rotation axis for Rgij which is the 3D rotation from gripper coordinate frame Gi to Gj
             Mat Pgij = 2*rot2quatMinimal(Hgij);
 
@@ -303,18 +302,42 @@ static void calibrateHandEyeTsai(const std::vector<Mat>& Hg, const std::vector<M
             //Hci is from CW (calibration target) to Ci (camera)
             //Hcj is from CW (calibration target) to Cj (camera)
             Mat Hcij = Hc[j] * homogeneousInverse(Hc[i]); //eq 7
-            vec_Hcij.push_back(Hcij);
             //Rotation axis for Rcij
             Mat Pcij = 2*rot2quatMinimal(Hcij);
 
+            // Discard motions with rotation too small or too close to pi radians
+            //   The limits 1.7 and 0.3 correspond to angles less than 17 degrees or greater than 120 degrees. They are
+            //   based on verifying equation 12 from the source paper using data generated with a known hand-eye
+            //   calibration. The data contained 25 poses, so 300 motions were considered. Of these, 188 satisfied
+            //   equation 12, and the remaining 112 all had Pcij or Pgij with norms greater than 1.7. Although errors
+            //   from small rotations were not observed, it is known that these motions are less informative (see
+            //   section II.B.3, and figure 6).
+            double Pgij_norm = cv::norm(Pgij);
+            double Pcij_norm = cv::norm(Pcij);
+            if (Pgij_norm < 0.3 || Pcij_norm < 0.3 || Pgij_norm > 1.7 || Pcij_norm > 1.7) {
+                continue;
+            }
+
+            vec_Hgij.push_back(Hgij);
+            vec_Hcij.push_back(Hcij);
+
             //Left-hand side: skew(Pgij+Pcij)
             skew(Pgij+Pcij).copyTo(A(Rect(0, idx*3, 3, 3)));
             //Right-hand side: Pcij - Pgij
             Mat diff = Pcij - Pgij;
             diff.copyTo(B(Rect(0, idx*3, 1, 3)));
+            idx++;
         }
     }
 
+    // insufficient data
+    if (idx < 2) {
+        CV_LOG_ERROR(NULL, "Hand-eye calibration failed! Not enough informative motions--include larger rotations.");
+        return;
+    }
+    A.resize(3*idx);
+    B.resize(3*idx);
+
     Mat Pcg_;
     //Rotation from camera to gripper is obtained from the set of equations:
     //    skew(Pgij+Pcij) * Pcg_ = Pcij - Pgij    (eq 12)
@@ -327,28 +350,24 @@ static void calibrateHandEyeTsai(const std::vector<Mat>& Hg, const std::vector<M
 
     Mat Rcg = quatMinimal2rot(Pcg/2.0);
 
-    idx = 0;
-    for (size_t i = 0; i < Hg.size(); i++)
+    for (size_t i = 0; i < vec_Hgij.size(); i++)
     {
-        for (size_t j = i+1; j < Hg.size(); j++, idx++)
-        {
-            //Defines coordinate transformation from Gi to Gj
-            //Hgi is from Gi (gripper) to RW (robot base)
-            //Hgj is from Gj (gripper) to RW (robot base)
-            Mat Hgij = vec_Hgij[static_cast<size_t>(idx)];
-            //Defines coordinate transformation from Ci to Cj
-            //Hci is from CW (calibration target) to Ci (camera)
-            //Hcj is from CW (calibration target) to Cj (camera)
-            Mat Hcij = vec_Hcij[static_cast<size_t>(idx)];
-
-            //Left-hand side: (Rgij - I)
-            Mat diff = Hgij(Rect(0,0,3,3)) - Mat::eye(3,3,CV_64FC1);
-            diff.copyTo(A(Rect(0, idx*3, 3, 3)));
-
-            //Right-hand side: Rcg*Tcij - Tgij
-            diff = Rcg*Hcij(Rect(3, 0, 1, 3)) - Hgij(Rect(3, 0, 1, 3));
-            diff.copyTo(B(Rect(0, idx*3, 1, 3)));
-        }
+        //Defines coordinate transformation from Gi to Gj
+        //Hgi is from Gi (gripper) to RW (robot base)
+        //Hgj is from Gj (gripper) to RW (robot base)
+        Mat Hgij = vec_Hgij[i];
+        //Defines coordinate transformation from Ci to Cj
+        //Hci is from CW (calibration target) to Ci (camera)
+        //Hcj is from CW (calibration target) to Cj (camera)
+        Mat Hcij = vec_Hcij[i];
+
+        //Left-hand side: (Rgij - I)
+        Mat diff = Hgij(Rect(0,0,3,3)) - Mat::eye(3,3,CV_64FC1);
+        diff.copyTo(A(Rect(0, static_cast<int>(i)*3, 3, 3)));
+
+        //Right-hand side: Rcg*Tcij - Tgij
+        diff = Rcg*Hcij(Rect(3, 0, 1, 3)) - Hgij(Rect(3, 0, 1, 3));
+        diff.copyTo(B(Rect(0, static_cast<int>(i)*3, 1, 3)));
     }
 
     Mat Tcg;
@@ -449,6 +468,9 @@ static void calibrateHandEyeHoraud(const std::vector<Mat>& Hg, const std::vector
             Mat Rcij = Hcij(Rect(0, 0, 3, 3));
 
             Mat qgij = rot2quat(Rgij);
+            if (qgij.at<double>(0, 0) < 0) {
+                qgij *= -1;
+            }
             double r0 = qgij.at<double>(0,0);
             double rx = qgij.at<double>(1,0);
             double ry = qgij.at<double>(2,0);
@@ -461,6 +483,9 @@ static void calibrateHandEyeHoraud(const std::vector<Mat>& Hg, const std::vector
                         rz, -ry,  rx,  r0);
 
             Mat qcij = rot2quat(Rcij);
+            if (qcij.at<double>(0, 0) < 0) {
+                qcij *= -1;
+            }
             r0 = qcij.at<double>(0,0);
             rx = qcij.at<double>(1,0);
             ry = qcij.at<double>(2,0);
@@ -618,7 +643,13 @@ static void calibrateHandEyeDaniilidis(const std::vector<Mat>& Hg, const std::ve
             Mat Hcij = Hc[j] * homogeneousInverse(Hc[i]);
 
             Mat dualqa = homogeneous2dualQuaternion(Hgij);
+            if (dualqa.at<double>(0, 0) < 0) {
+                dualqa *= -1;
+            }
             Mat dualqb = homogeneous2dualQuaternion(Hcij);
+            if (dualqb.at<double>(0, 0) < 0) {
+                dualqb *= -1;
+            }
 
             Mat a = dualqa(Rect(0, 1, 1, 3));
             Mat b = dualqb(Rect(0, 1, 1, 3));
@@ -722,7 +753,11 @@ void calibrateHandEye(InputArrayOfArrays R_gripper2base, InputArrayOfArrays t_gr
         if(R_gripper2base_[i].size() == Size(3, 3))
             R_gripper2base_[i].convertTo(R, CV_64F);
         else
-            Rodrigues(R_gripper2base_[i], R);
+        {
+            cv::Mat R_temp;
+            Rodrigues(R_gripper2base_[i], R_temp);
+            R_temp.convertTo(R, CV_64F);
+        }
 
         Mat t = m(Rect(3, 0, 1, 3));
         t_gripper2base_[i].convertTo(t, CV_64F);
@@ -740,7 +775,11 @@ void calibrateHandEye(InputArrayOfArrays R_gripper2base, InputArrayOfArrays t_gr
         if(R_target2cam_[i].size() == Size(3, 3))
             R_target2cam_[i].convertTo(R, CV_64F);
         else
-            Rodrigues(R_target2cam_[i], R);
+        {
+            cv::Mat R_temp;
+            Rodrigues(R_target2cam_[i], R_temp);
+            R_temp.convertTo(R, CV_64F);
+        }
 
         Mat t = m(Rect(3, 0, 1, 3));
         t_target2cam_[i].convertTo(t, CV_64F);
@@ -920,7 +959,9 @@ void calibrateRobotWorldHandEye(InputArrayOfArrays R_world2cam, InputArrayOfArra
             }
             else
             {
-                Rodrigues(rot, R);
+                cv::Mat R_temp;
+                Rodrigues(rot, R_temp);
+                R_temp.convertTo(R, CV_64F);
                 R_base2gripper_.push_back(R);
             }
             Mat tvec = t_base2gripper_tmp[i];
@@ -938,7 +979,9 @@ void calibrateRobotWorldHandEye(InputArrayOfArrays R_world2cam, InputArrayOfArra
             }
             else
             {
-                Rodrigues(rot, R);
+                cv::Mat R_temp;
+                Rodrigues(rot, R_temp);
+                R_temp.convertTo(R, CV_64F);
                 R_world2cam_.push_back(R);
             }
             Mat tvec = t_world2cam_tmp[i];
diff --git a/modules/calib3d/src/checkchessboard.cpp b/modules/calib3d/src/checkchessboard.cpp
index 97876d218cf6..350614e78fce 100644
--- a/modules/calib3d/src/checkchessboard.cpp
+++ b/modules/calib3d/src/checkchessboard.cpp
@@ -55,15 +55,12 @@ static void icvGetQuadrangleHypotheses(const std::vector<std::vector< cv::Point
     const float max_aspect_ratio = 3.0f;
     const float min_box_size = 10.0f;
 
-    typedef std::vector< std::vector< cv::Point > >::const_iterator iter_t;
-    iter_t i;
-    for (i = contours.begin(); i != contours.end(); ++i)
+    for (size_t i = 0; i < contours.size(); ++i)
     {
-        const iter_t::difference_type idx = i - contours.begin();
-        if (hierarchy.at(idx)[3] != -1)
+        if (hierarchy.at(i)[3] != -1)
             continue; // skip holes
 
-        const std::vector< cv::Point > & c = *i;
+        const std::vector< cv::Point > & c = contours[i];
         cv::RotatedRect box = cv::minAreaRect(c);
 
         float box_size = MAX(box.size.width, box.size.height);
diff --git a/modules/calib3d/src/chessboard.hpp b/modules/calib3d/src/chessboard.hpp
index f49b83572f76..80519d15a5cd 100644
--- a/modules/calib3d/src/chessboard.hpp
+++ b/modules/calib3d/src/chessboard.hpp
@@ -203,12 +203,12 @@ class Chessboard: public cv::Feature2D
                  * d12/d34 = d13/d24
                  *
                  * point order on the line:
-                 * pt1 --> pt2 --> pt3 --> pt4
+                 * p0 --> p1 --> p2 --> p3
                  *
-                 * \param[in] pt1 First point coordinate
-                 * \param[in] pt2 Second point coordinate
-                 * \param[in] pt3 Third point coordinate
-                 * \param[out] pt4 Forth point coordinate
+                 * \param[in] p0 First point coordinate
+                 * \param[in] p1 Second point coordinate
+                 * \param[in] p2 Third point coordinate
+                 * \param[out] p3 Forth point coordinate
                  *
                  */
                 static bool estimatePoint(const cv::Point2f &p0,const cv::Point2f &p1,const cv::Point2f &p2,cv::Point2f &p3);
@@ -309,7 +309,7 @@ class Chessboard: public cv::Feature2D
                  * \brief Draws the corners into the given image
                  *
                  * \param[in] m The image
-                 * \param[out] m The resulting image
+                 * \param[out] out The resulting image
                  * \param[in] H optional homography to calculate search area
                  *
                  */
@@ -668,7 +668,7 @@ class Chessboard: public cv::Feature2D
                   * \brief Calculates the average edge sharpness for the chessboard
                   *
                   * \param[in] image The image where the chessboard was detected
-                  * \param[in] rise_distante Rise distance 0.8 means 10% ... 90%
+                  * \param[in] rise_distance Rise distance 0.8 means 10% ... 90%
                   * \param[in] vertical by default only edge response for horiontal lines are calculated
                   *
                   * \returns Scalar(sharpness, average min_val, average max_val)
diff --git a/modules/calib3d/src/circlesgrid.cpp b/modules/calib3d/src/circlesgrid.cpp
index df9534f75504..d32913f9ef9f 100644
--- a/modules/calib3d/src/circlesgrid.cpp
+++ b/modules/calib3d/src/circlesgrid.cpp
@@ -701,22 +701,26 @@ bool CirclesGridFinder::isDetectionCorrect()
   {
     case CirclesGridFinderParameters::SYMMETRIC_GRID:
     {
-      if (holes.size() != patternSize.height)
+      rotatedGrid = holes.size() != patternSize.height && holes.size() == patternSize.width;
+      if (holes.size() != patternSize.height && holes.size() != patternSize.width)
         return false;
 
-      std::set<size_t> vertices;
+      size_t num_vertices = 0ull;
       for (size_t i = 0; i < holes.size(); i++)
       {
-        if (holes[i].size() != patternSize.width)
+        if (holes[i].size() != patternSize.width && rotatedGrid == false)
+        {
           return false;
-
-        for (size_t j = 0; j < holes[i].size(); j++)
+        }
+        else if (holes[i].size() != patternSize.height && rotatedGrid == true)
         {
-          vertices.insert(holes[i][j]);
+          rotatedGrid = false;
+          return false;
         }
-      }
 
-      return vertices.size() == patternSize.area();
+        num_vertices += holes[i].size();
+      }
+      return num_vertices == patternSize.area();
     }
 
     case CirclesGridFinderParameters::ASYMMETRIC_GRID:
@@ -1431,12 +1435,32 @@ Size CirclesGridFinder::getDetectedGridSize() const
 void CirclesGridFinder::getHoles(std::vector<Point2f> &outHoles) const
 {
   outHoles.clear();
-
-  for (size_t i = 0; i < holes.size(); i++)
+  if (rotatedGrid == false)
   {
-    for (size_t j = 0; j < holes[i].size(); j++)
+    for (size_t i = 0ull; i < holes.size(); i++)
     {
-      outHoles.push_back(keypoints[holes[i][j]]);
+      for (size_t j = 0ull; j < holes[i].size(); j++)
+      {
+        outHoles.push_back(keypoints[holes[i][j]]);
+      }
+    }
+  }
+  else
+  {
+    bool visit_all = false;
+    size_t j = 0ull;
+    while (visit_all != true)
+    {
+      visit_all = true;
+      for (size_t i = 0ull; i < holes.size(); i++)
+      {
+        if (j < holes[i].size())
+        {
+          outHoles.push_back(keypoints[holes[i][j]]);
+          visit_all = false;
+        }
+      }
+      j++;
     }
   }
 }
diff --git a/modules/calib3d/src/circlesgrid.hpp b/modules/calib3d/src/circlesgrid.hpp
index f058af7a430a..8fba89fd1101 100644
--- a/modules/calib3d/src/circlesgrid.hpp
+++ b/modules/calib3d/src/circlesgrid.hpp
@@ -186,6 +186,7 @@ class CirclesGridFinder
 
   const cv::Size_<size_t> patternSize;
   cv::CirclesGridFinderParameters parameters;
+  bool rotatedGrid = false;
 
   CirclesGridFinder& operator=(const CirclesGridFinder&);
   CirclesGridFinder(const CirclesGridFinder&);
diff --git a/modules/calib3d/src/fisheye.cpp b/modules/calib3d/src/fisheye.cpp
index 1be7cb44c257..751a1aa6da24 100644
--- a/modules/calib3d/src/fisheye.cpp
+++ b/modules/calib3d/src/fisheye.cpp
@@ -799,7 +799,7 @@ double cv::fisheye::calibrate(InputArrayOfArrays objectPoints, InputArrayOfArray
     }
     else
     {
-        finalParam.Init(Vec2d(max(image_size.width, image_size.height) / CV_PI, max(image_size.width, image_size.height) / CV_PI),
+        finalParam.Init(Vec2d(max(image_size.width, image_size.height) / 2., max(image_size.width, image_size.height) / 2.),
                         Vec2d(image_size.width  / 2.0 - 0.5, image_size.height / 2.0 - 0.5));
     }
 
@@ -1148,6 +1148,20 @@ double cv::fisheye::stereoCalibrate(InputArrayOfArrays objectPoints, InputArrayO
     return rms;
 }
 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// cv::fisheye::solvePnP
+
+bool cv::fisheye::solvePnP( InputArray opoints, InputArray ipoints,
+               InputArray cameraMatrix, InputArray distCoeffs,
+               OutputArray rvec, OutputArray tvec, bool useExtrinsicGuess,
+               int flags, TermCriteria criteria)
+{
+
+    Mat imagePointsNormalized;
+    cv::fisheye::undistortPoints(ipoints, imagePointsNormalized, cameraMatrix, distCoeffs, noArray(), cameraMatrix, criteria);
+    return cv::solvePnP(opoints, imagePointsNormalized, cameraMatrix, noArray(), rvec, tvec, useExtrinsicGuess, flags);
+}
+
 namespace cv{ namespace {
 void subMatrix(const Mat& src, Mat& dst, const std::vector<uchar>& cols, const std::vector<uchar>& rows)
 {
diff --git a/modules/calib3d/src/five-point.cpp b/modules/calib3d/src/five-point.cpp
index b8d0b43edcdc..9fda9f73f163 100644
--- a/modules/calib3d/src/five-point.cpp
+++ b/modules/calib3d/src/five-point.cpp
@@ -36,6 +36,13 @@
 namespace cv
 {
 
+// for some compilers it takes very long time to compile
+// automatically generated code in EMEstimatorCallback::runKernel(),
+// so we temporarily disable optimizations here
+#if defined __hexagon__ && defined __clang__
+#pragma clang optimize off
+#endif
+
 class EMEstimatorCallback CV_FINAL : public PointSetRegistrator::Callback
 {
 public:
@@ -401,6 +408,11 @@ class EMEstimatorCallback CV_FINAL : public PointSetRegistrator::Callback
     }
 };
 
+// restore optimizations (if any)
+#if defined __hexagon__ && defined __clang__
+#pragma clang optimize on
+#endif
+
 // Find essential matrix given undistorted points and two cameras.
 static Mat findEssentialMat_( InputArray _points1, InputArray _points2,
                              InputArray cameraMatrix1, InputArray cameraMatrix2,
diff --git a/modules/calib3d/src/fundam.cpp b/modules/calib3d/src/fundam.cpp
index 8e07efa26d6c..75d345659516 100644
--- a/modules/calib3d/src/fundam.cpp
+++ b/modules/calib3d/src/fundam.cpp
@@ -49,6 +49,13 @@
 namespace cv
 {
 
+static inline double scaleFor(double x){
+    return (std::fabs(x) > std::numeric_limits<float>::epsilon()) ? 1./x : 1.;
+}
+static inline float scaleFor(float x){
+    return (std::fabs(x) > std::numeric_limits<float>::epsilon()) ? 1.f/x : 1.f;
+}
+
 /**
  * This class estimates a homography \f$H\in \mathbb{R}^{3\times 3}\f$
  * between \f$\mathbf{x} \in \mathbb{R}^3\f$ and
@@ -113,7 +120,7 @@ class HomographyEstimatorCallback CV_FINAL : public PointSetRegistrator::Callbac
      *            2 columns 1 channel
      * @param _m2 destination points containing (x,y), depth is CV_32F with 1 column 2 channels or
      *            2 columns 1 channel
-     * @param _model, CV_64FC1, 3x3, normalized, i.e., the last element is 1
+     * @param _model CV_64FC1, 3x3, normalized, i.e., the last element is 1
      */
     int runKernel( InputArray _m1, InputArray _m2, OutputArray _model ) const CV_OVERRIDE
     {
@@ -177,8 +184,7 @@ class HomographyEstimatorCallback CV_FINAL : public PointSetRegistrator::Callbac
         eigen( _LtL, matW, matV );
         _Htemp = _invHnorm*_H0;
         _H0 = _Htemp*_Hnorm2;
-        _H0.convertTo(_model, _H0.type(), 1./_H0.at<double>(2,2) );
-
+        _H0.convertTo(_model, _H0.type(), scaleFor(_H0.at<double>(2,2)));
         return 1;
     }
 
@@ -188,7 +194,7 @@ class HomographyEstimatorCallback CV_FINAL : public PointSetRegistrator::Callbac
      * @param _m1 depth CV_32F, 1-channel with 2 columns or 2-channel with 1 column
      * @param _m2 depth CV_32F, 1-channel with 2 columns or 2-channel with 1 column
      * @param _model CV_64FC1, 3x3
-     * @param _err, output, CV_32FC1, square of the L2 norm
+     * @param _err output, CV_32FC1, square of the L2 norm
      */
     void computeError( InputArray _m1, InputArray _m2, InputArray _model, OutputArray _err ) const CV_OVERRIDE
     {
@@ -197,14 +203,14 @@ class HomographyEstimatorCallback CV_FINAL : public PointSetRegistrator::Callbac
         const Point2f* M = m1.ptr<Point2f>();
         const Point2f* m = m2.ptr<Point2f>();
         const double* H = model.ptr<double>();
-        float Hf[] = { (float)H[0], (float)H[1], (float)H[2], (float)H[3], (float)H[4], (float)H[5], (float)H[6], (float)H[7] };
+        float Hf[] = { (float)H[0], (float)H[1], (float)H[2], (float)H[3], (float)H[4], (float)H[5], (float)H[6], (float)H[7], (float)H[8] };
 
         _err.create(count, 1, CV_32F);
         float* err = _err.getMat().ptr<float>();
 
         for( i = 0; i < count; i++ )
         {
-            float ww = 1.f/(Hf[6]*M[i].x + Hf[7]*M[i].y + 1.f);
+            float ww = 1.f/(Hf[6]*M[i].x + Hf[7]*M[i].y + Hf[8]);
             float dx = (Hf[0]*M[i].x + Hf[1]*M[i].y + Hf[2])*ww - m[i].x;
             float dy = (Hf[3]*M[i].x + Hf[4]*M[i].y + Hf[5])*ww - m[i].y;
             err[i] = dx*dx + dy*dy;
@@ -231,8 +237,9 @@ class HomographyRefineCallback CV_FINAL : public LMSolver::Callback
         if( _Jac.needed())
         {
             _Jac.create(count*2, param.rows, CV_64F);
+            _Jac.setTo(0.);
             J = _Jac.getMat();
-            CV_Assert( J.isContinuous() && J.cols == 8 );
+            CV_Assert( J.isContinuous() && J.cols == 9 );
         }
 
         const Point2f* M = src.ptr<Point2f>();
@@ -244,7 +251,7 @@ class HomographyRefineCallback CV_FINAL : public LMSolver::Callback
         for( i = 0; i < count; i++ )
         {
             double Mx = M[i].x, My = M[i].y;
-            double ww = h[6]*Mx + h[7]*My + 1.;
+            double ww = h[6]*Mx + h[7]*My + h[8];
             ww = fabs(ww) > DBL_EPSILON ? 1./ww : 0;
             double xi = (h[0]*Mx + h[1]*My + h[2])*ww;
             double yi = (h[3]*Mx + h[4]*My + h[5])*ww;
@@ -254,13 +261,11 @@ class HomographyRefineCallback CV_FINAL : public LMSolver::Callback
             if( Jptr )
             {
                 Jptr[0] = Mx*ww; Jptr[1] = My*ww; Jptr[2] = ww;
-                Jptr[3] = Jptr[4] = Jptr[5] = 0.;
-                Jptr[6] = -Mx*ww*xi; Jptr[7] = -My*ww*xi;
-                Jptr[8] = Jptr[9] = Jptr[10] = 0.;
-                Jptr[11] = Mx*ww; Jptr[12] = My*ww; Jptr[13] = ww;
-                Jptr[14] = -Mx*ww*yi; Jptr[15] = -My*ww*yi;
+                Jptr[6] = -Mx*ww*xi; Jptr[7] = -My*ww*xi; Jptr[8] = -ww*xi;
+                Jptr[12] = Mx*ww; Jptr[13] = My*ww; Jptr[14] = ww;
+                Jptr[15] = -Mx*ww*yi; Jptr[16] = -My*ww*yi; Jptr[17] = -ww*yi;
 
-                Jptr += 16;
+                Jptr += 18;
             }
         }
 
@@ -269,7 +274,7 @@ class HomographyRefineCallback CV_FINAL : public LMSolver::Callback
 
     Mat src, dst;
 };
-} // end namesapce cv
+} // end namespace cv
 
 namespace cv{
 static bool createAndRunRHORegistrator(double confidence,
@@ -419,8 +424,9 @@ cv::Mat cv::findHomography( InputArray _points1, InputArray _points2,
             dst = dst1;
             if( method == RANSAC || method == LMEDS )
                 cb->runKernel( src, dst, H );
-            Mat H8(8, 1, CV_64F, H.ptr<double>());
+            Mat H8(9, 1, CV_64F, H.ptr<double>());
             LMSolver::create(makePtr<HomographyRefineCallback>(src, dst), 10)->run(H8);
+            H.convertTo(H, H.type(), scaleFor(H.at<double>(2,2)));
         }
     }
 
@@ -1002,14 +1008,6 @@ void cv::computeCorrespondEpilines( InputArray _points, int whichImage,
     }
 }
 
-static inline double scaleFor(double x){
-    return (std::fabs(x) > std::numeric_limits<float>::epsilon()) ? 1./x : 1.;
-}
-static inline float scaleFor(float x){
-    return (std::fabs(x) > std::numeric_limits<float>::epsilon()) ? 1.f/x : 1.f;
-}
-
-
 void cv::convertPointsFromHomogeneous( InputArray _src, OutputArray _dst )
 {
     CV_INSTRUMENT_REGION();
diff --git a/modules/calib3d/src/hal_replacement.hpp b/modules/calib3d/src/hal_replacement.hpp
new file mode 100644
index 000000000000..8874c9127be7
--- /dev/null
+++ b/modules/calib3d/src/hal_replacement.hpp
@@ -0,0 +1,188 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CALIB3D_HAL_REPLACEMENT_HPP
+#define OPENCV_CALIB3D_HAL_REPLACEMENT_HPP
+
+#include "opencv2/core/hal/interface.h"
+
+#if defined(__clang__)  // clang or MSVC clang
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#elif defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4100)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
+
+//! @addtogroup calib3d_hal_interface
+//! @note Define your functions to override default implementations:
+//! @code
+//! #undef hal_add8u
+//! #define hal_add8u my_add8u
+//! @endcode
+//! @{
+
+/**
+ * @brief Camera intrinsics structure, see projectPoints() documentation for details
+ */
+struct cv_camera_intrinsics_pinhole_32f
+{
+    // focal length, principal point
+    float fx, fy, cx, cy;
+    // radial distortion coefficients
+    float k[6];
+    // amount of radial distortion coefficients passed
+    int amt_k;
+    // tangential distortion coefficients
+    float p[2];
+    // amount of tangential distortion coefficients passed
+    int amt_p;
+    // prism distortion coefficients
+    float s[4];
+    // amount of prism distortion coefficients passed
+    int amt_s;
+    // tilt distortion coefficients
+    float tau_x, tau_y;
+    // to use tilt distortion coefficients or not
+    bool use_tau;
+};
+
+/**
+   @brief Project points from 3D world space to 2D screen space using rotation and translation matrix and camera intrinsic parameters
+   @param src_data Pointer to 3D points array with coordinates interleaved as X, Y, Z, X, Y, Z,..
+   @param src_step Step between consecutive 3D points
+   @param src_size Amount of points
+   @param dst_data Pointer to resulting projected 2D points with coordinates interleaved as u, v, u, v,..
+   @param dst_step Step between consecutive projected 2D points
+   @param rt_data Pointer to 3x4 array containing rotation-then-translation matrix
+   @param intr_data Pointer to camera intrinsics structure
+*/
+inline int hal_ni_project_points_pinhole32f(const float* src_data, size_t src_step, size_t src_size,
+                                            float* dst_data, size_t dst_step, const float* rt_data,
+                                            const cv_camera_intrinsics_pinhole_32f* intr_data)
+{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_project_points_pinhole32f hal_ni_project_points_pinhole32f
+//! @endcond
+
+/**
+ * @brief Camera intrinsics structure, see projectPoints() documentation for details
+ */
+struct cv_camera_intrinsics_pinhole_64f
+{
+    // focal length, principal point
+    double fx, fy, cx, cy;
+    // radial distortion coefficients
+    double k[6];
+    // amount of radial distortion coefficients passed
+    int amt_k;
+    // tangential distortion coefficients
+    double p[2];
+    // amount of tangential distortion coefficients passed
+    int amt_p;
+    // prism distortion coefficients
+    double s[4];
+    // amount of prism distortion coefficients passed
+    int amt_s;
+    // tilt distortion coefficients
+    double tau_x, tau_y;
+    // to use tilt distortion coefficients or not
+    bool use_tau;
+};
+
+/**
+   @brief Project points from 3D world space to 2D screen space using rotation and translation matrix and camera intrinsic parameters
+   @param src_data Pointer to 3D points array with coordinates interleaved as X, Y, Z, X, Y, Z,..
+   @param src_step Step between consecutive 3D points
+   @param src_size Amount of points
+   @param dst_data Pointer to resulting projected 2D points with coordinates interleaved as u, v, u, v,..
+   @param dst_step Step between consecutive projected 2D points
+   @param rt_data Pointer to 3x4 array containing rotation-then-translation matrix
+   @param intr_data Pointer to camera intrinsics structure
+*/
+inline int hal_ni_project_points_pinhole64f(const double* src_data, size_t src_step, size_t src_size,
+                                            double* dst_data, size_t dst_step, const double* rt_data,
+                                            const cv_camera_intrinsics_pinhole_64f* intr_data)
+{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_project_points_pinhole64f hal_ni_project_points_pinhole64f
+//! @endcond
+
+//! @}
+
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#elif defined(_MSC_VER)
+#pragma warning(pop)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+
+#include "custom_hal.hpp"
+
+//! @cond IGNORED
+#define CALL_HAL_RET(name, fun, retval, ...) \
+    int res = __CV_EXPAND(fun(__VA_ARGS__, &retval)); \
+    if (res == CV_HAL_ERROR_OK) \
+        return retval; \
+    else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \
+        CV_Error_(cv::Error::StsInternal, \
+            ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res));
+
+
+#define CALL_HAL(name, fun, ...) \
+    int res = __CV_EXPAND(fun(__VA_ARGS__)); \
+    if (res == CV_HAL_ERROR_OK) \
+        return; \
+    else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \
+        CV_Error_(cv::Error::StsInternal, \
+            ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res));
+//! @endcond
+
+#endif
diff --git a/modules/calib3d/src/ippe.hpp b/modules/calib3d/src/ippe.hpp
index 986210b187ad..c1d4dd2a210b 100644
--- a/modules/calib3d/src/ippe.hpp
+++ b/modules/calib3d/src/ippe.hpp
@@ -111,7 +111,7 @@ class PoseSolver {
     /**
      * @brief                           Computes the translation solution for a given rotation solution
      * @param objectPoints              Array of corresponding object points, 1xN/Nx1 3-channel where N is the number of points
-     * @param normalizedImagePoints     Array of corresponding image points (undistorted), 1xN/Nx1 2-channel where N is the number of points
+     * @param normalizedImgPoints       Array of corresponding image points (undistorted), 1xN/Nx1 2-channel where N is the number of points
      * @param R                         Rotation solution (3x1 rotation vector)
      * @param t                         Translation solution (3x1 rotation vector)
      */
@@ -220,10 +220,10 @@ class PoseSolver {
 
     /**
      * @brief                   Computes the average depth of an object given its pose in camera coordinates
-     * @param objectPoints:     Object points defined in 3D object space
-     * @param rvec:             Rotation component of pose
-     * @param tvec:             Translation component of pose
-     * @return:                 average depth of the object
+     * @param objectPoints      Object points defined in 3D object space
+     * @param rvec              Rotation component of pose
+     * @param tvec              Translation component of pose
+     * @return                  average depth of the object
      */
     double meanSceneDepth(InputArray objectPoints, InputArray rvec, InputArray tvec);
 
diff --git a/modules/calib3d/src/p3p.cpp b/modules/calib3d/src/p3p.cpp
index 01b1734db818..9a7e96cca2e4 100644
--- a/modules/calib3d/src/p3p.cpp
+++ b/modules/calib3d/src/p3p.cpp
@@ -214,8 +214,8 @@ int p3p::solve(double R[4][3][3], double t[4][3],
 /// Only the solution to the main branch.
 /// Reference : X.S. Gao, X.-R. Hou, J. Tang, H.-F. Chang; "Complete Solution Classification for the Perspective-Three-Point Problem"
 /// IEEE Trans. on PAMI, vol. 25, No. 8, August 2003
-/// \param lengths3D Lengths of line segments up to four solutions.
-/// \param dist3D Distance between 3D points in pairs |BC|, |AC|, |AB|.
+/// \param lengths Lengths of line segments up to four solutions.
+/// \param distances Distance between 3D points in pairs |BC|, |AC|, |AB|.
 /// \param cosines Cosine of the angles /_BPC, /_APC, /_APB.
 /// \returns Number of solutions.
 /// WARNING: NOT ALL THE DEGENERATE CASES ARE IMPLEMENTED
diff --git a/modules/calib3d/src/precomp.hpp b/modules/calib3d/src/precomp.hpp
index 610deebfa9ff..8f598d6709e1 100644
--- a/modules/calib3d/src/precomp.hpp
+++ b/modules/calib3d/src/precomp.hpp
@@ -69,7 +69,7 @@ namespace cv
  * @param ep outlier ratio
  * @param modelPoints number of model points required for estimation
  * @param maxIters maximum number of iterations
- * @return
+ * @return The number of iterations according to the formula
  * \f[
  * \frac{\ln(1-p)}{\ln\left(1-(1-ep)^\mathrm{modelPoints}\right)}
  * \f]
diff --git a/modules/calib3d/src/rho.cpp b/modules/calib3d/src/rho.cpp
index 341b6b906380..6edb3b8272c9 100644
--- a/modules/calib3d/src/rho.cpp
+++ b/modules/calib3d/src/rho.cpp
@@ -490,7 +490,7 @@ void rhoSeed(Ptr<RHO_HEST> p, uint64_t seed){
  * Estimates the homography using the given context, matches and parameters to
  * PROSAC.
  *
- * @param [in/out] p       The context to use for homography estimation. Must
+ * @param [in,out] p       The context to use for homography estimation. Must
  *                             be already initialized. Cannot be NULL.
  * @param [in]     src     The pointer to the source points of the matches.
  *                             Must be aligned to 4 bytes. Cannot be NULL.
diff --git a/modules/calib3d/src/rho.h b/modules/calib3d/src/rho.h
index a8211161af29..0410cc09250c 100644
--- a/modules/calib3d/src/rho.h
+++ b/modules/calib3d/src/rho.h
@@ -215,7 +215,7 @@ void rhoSeed(Ptr<RHO_HEST> p, uint64_t seed);
  * homography with at least the minimum required support, and 0 if it was not.
  *
  *
- * @param [in/out] p       The context to use for homography estimation. Must
+ * @param [in,out] p       The context to use for homography estimation. Must
  *                             be already initialized. Cannot be NULL.
  * @param [in]     src     The pointer to the source points of the matches.
  *                             Must be aligned to 4 bytes. Cannot be NULL.
diff --git a/modules/calib3d/src/solvepnp.cpp b/modules/calib3d/src/solvepnp.cpp
index 54e23ca9f8eb..771abff483b5 100644
--- a/modules/calib3d/src/solvepnp.cpp
+++ b/modules/calib3d/src/solvepnp.cpp
@@ -1041,7 +1041,7 @@ int solvePnPGeneric( InputArray _opoints, InputArray _ipoints,
         vec_tvecs.push_back(tvec);
     }*/
     else
-        CV_Error(CV_StsBadArg, "The flags argument must be one of SOLVEPNP_ITERATIVE, SOLVEPNP_P3P, "
+        CV_Error(cv::Error::StsBadArg, "The flags argument must be one of SOLVEPNP_ITERATIVE, SOLVEPNP_P3P, "
             "SOLVEPNP_EPNP, SOLVEPNP_DLS, SOLVEPNP_UPNP, SOLVEPNP_AP3P, SOLVEPNP_IPPE, SOLVEPNP_IPPE_SQUARE or SOLVEPNP_SQPNP");
 
     CV_Assert(vec_rvecs.size() == vec_tvecs.size());
diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp
index f58aa5e40059..625196ea6339 100644
--- a/modules/calib3d/src/stereobm.cpp
+++ b/modules/calib3d/src/stereobm.cpp
@@ -231,13 +231,13 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
         dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
         x = 1;
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         {
             v_int16 ftz = vx_setall_s16((short) ftzero);
             v_int16 ftz2 = vx_setall_s16((short)(ftzero*2));
             v_int16 z = vx_setzero_s16();
 
-            for(; x <= (size.width - 1) - v_int16::nlanes; x += v_int16::nlanes)
+            for(; x <= (size.width - 1) - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
             {
                 v_int16 s00 = v_reinterpret_as_s16(vx_load_expand(srow0 + x + 1));
                 v_int16 s01 = v_reinterpret_as_s16(vx_load_expand(srow0 + x - 1));
@@ -248,13 +248,13 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
                 v_int16 s30 = v_reinterpret_as_s16(vx_load_expand(srow3 + x + 1));
                 v_int16 s31 = v_reinterpret_as_s16(vx_load_expand(srow3 + x - 1));
 
-                v_int16 d0 = s00 - s01;
-                v_int16 d1 = s10 - s11;
-                v_int16 d2 = s20 - s21;
-                v_int16 d3 = s30 - s31;
+                v_int16 d0 = v_sub(s00, s01);
+                v_int16 d1 = v_sub(s10, s11);
+                v_int16 d2 = v_sub(s20, s21);
+                v_int16 d3 = v_sub(s30, s31);
 
-                v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
-                v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
+                v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(v_add(v_add(v_add(v_add(d0, d1), d1), d2), ftz), ftz2), z));
+                v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(v_add(v_add(v_add(v_add(d1, d2), d2), d3), ftz), ftz2), z));
 
                 v_pack_store(dptr0 + x, v0);
                 v_pack_store(dptr1 + x, v1);
@@ -277,10 +277,10 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
     {
         uchar* dptr = dst.ptr<uchar>(y);
         x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         {
             v_uint8 val0_16 = vx_setall_u8(val0);
-            for(; x <= size.width-v_uint8::nlanes; x+=v_uint8::nlanes)
+            for(; x <= size.width-VTraits<v_uint8>::vlanes(); x+=VTraits<v_uint8>::vlanes())
                 v_store(dptr + x, val0_16);
         }
 #endif
@@ -356,7 +356,7 @@ class BufferBM
         for (size_t i = 0; i < nstripes; ++i)
         {
             // 1D: [1][  ndisp  ][1]
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             if (params.useShorts())
                 area.allocate(sad_short[i], ndisp + 2);
             else
@@ -364,7 +364,7 @@ class BufferBM
                 area.allocate(sad[i], ndisp + 2);
 
             // 2D: [ wsz/2 + 1 ][   height   ][ wsz/2 + 1 ] * [ ndisp ]
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             if (params.useShorts())
                 area.allocate(hsad_short[i], (height + wsz + 2) * ndisp);
             else
@@ -390,7 +390,7 @@ class BufferBM
     }
 };
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 template <typename dType>
 static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
                                             Mat& disp, Mat& cost, const StereoBMParams& state,
@@ -422,8 +422,8 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
     short costbuf = 0;
     int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
     const uchar * tab = bufX.tab;
-    short v_seq[v_int16::nlanes];
-    for (short i = 0; i < v_int16::nlanes; ++i)
+    short v_seq[VTraits<v_int16>::max_nlanes];
+    for (short i = 0; i < VTraits<v_int16>::vlanes(); ++i)
         v_seq[i] = i;
 
     ushort *sad = bufX.sad_short[bufNum] + 1;
@@ -446,19 +446,19 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             v_uint8 lv = vx_setall_u8((uchar)lval);
-            for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
+            for( d = 0; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
             {
                 v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
                 v_store(cbuf + d, diff);
-                v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
-                v_store(hsad + d + v_uint16::nlanes, vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff));
+                v_store(hsad + d, v_add(vx_load(hsad + d), v_expand_low(diff)));
+                v_store(hsad + d + VTraits<v_uint16>::vlanes(), v_add(vx_load(hsad + d + VTraits<v_uint16>::vlanes()), v_expand_high(diff)));
             }
-            if( d <= ndisp - v_uint16::nlanes )
+            if( d <= ndisp - VTraits<v_uint16>::vlanes() )
             {
                 v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
                 v_store_low(cbuf + d, diff);
-                v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
-                d += v_uint16::nlanes;
+                v_store(hsad + d, v_add(vx_load(hsad + d), v_expand_low(diff)));
+                d += VTraits<v_uint16>::vlanes();
             }
             for( ; d < ndisp; d++ )
             {
@@ -496,20 +496,20 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             v_uint8 lv = vx_setall_u8((uchar)lval);
-            for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
+            for( d = 0; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
             {
                 v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
                 v_int8 cbs = v_reinterpret_as_s8(vx_load(cbuf_sub + d));
                 v_store(cbuf + d, diff);
-                v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - v_expand_low(cbs)));
-                v_store(hsad + d + v_uint16::nlanes, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)) - v_expand_high(cbs)));
+                v_store(hsad + d, v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d), v_expand_low(diff))), v_expand_low(cbs))));
+                v_store(hsad + d + VTraits<v_uint16>::vlanes(), v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d + VTraits<v_uint16>::vlanes()), v_expand_high(diff))), v_expand_high(cbs))));
             }
-            if( d <= ndisp - v_uint16::nlanes)
+            if( d <= ndisp - VTraits<v_uint16>::vlanes())
             {
                 v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
                 v_store_low(cbuf + d, diff);
-                v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - vx_load_expand((schar*)cbuf_sub + d)));
-                d += v_uint16::nlanes;
+                v_store(hsad + d, v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d), v_expand_low(diff))), vx_load_expand((schar *)cbuf_sub + d))));
+                d += VTraits<v_uint16>::vlanes();
             }
             for( ; d < ndisp; d++ )
             {
@@ -533,20 +533,20 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
         hsad = hsad0 + (1 - dy0)*ndisp;
         for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
         {
-            for( d = 0; d <= ndisp-2*v_uint16::nlanes; d += 2*v_uint16::nlanes )
+            for( d = 0; d <= ndisp-2*VTraits<v_uint16>::vlanes(); d += 2*VTraits<v_uint16>::vlanes() )
             {
-                v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
-                v_store(sad + d + v_uint16::nlanes, vx_load(sad + d + v_uint16::nlanes) + vx_load(hsad + d + v_uint16::nlanes));
+                v_store(sad + d, v_add(vx_load(sad + d), vx_load(hsad + d)));
+                v_store(sad + d + VTraits<v_uint16>::vlanes(), v_add(vx_load(sad + d + VTraits<v_uint16>::vlanes()), vx_load(hsad + d + VTraits<v_uint16>::vlanes())));
             }
-            if( d <= ndisp-v_uint16::nlanes )
+            if( d <= ndisp-VTraits<v_uint16>::vlanes() )
             {
-                v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
-                d += v_uint16::nlanes;
+                v_store(sad + d, v_add(vx_load(sad + d), vx_load(hsad + d)));
+                d += VTraits<v_uint16>::vlanes();
             }
-            if( d <= ndisp-v_uint16::nlanes/2 )
+            if( d <= ndisp-VTraits<v_uint16>::vlanes()/2 )
             {
-                v_store_low(sad + d, vx_load_low(sad + d) + vx_load_low(hsad + d));
-                d += v_uint16::nlanes/2;
+                v_store_low(sad + d, v_add(vx_load_low(sad + d), vx_load_low(hsad + d)));
+                d += VTraits<v_uint16>::vlanes()/2;
             }
             for( ; d < ndisp; d++ )
                 sad[d] = sad[d] + hsad[d];
@@ -564,29 +564,29 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
             v_int16 minsad8 = vx_setall_s16(SHRT_MAX);
             v_int16 mind8 = vx_setall_s16(0);
 
-            for( d = 0; d <= ndisp - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
+            for( d = 0; d <= ndisp - 2*VTraits<v_int16>::vlanes(); d += 2*VTraits<v_int16>::vlanes() )
             {
-                v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
+                v_int16 sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d)), v_reinterpret_as_s16(vx_load(hsad_sub + d))), v_reinterpret_as_s16(vx_load(sad + d)));
                 v_store(sad + d, v_reinterpret_as_u16(sad8));
-                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
+                mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)d)));
                 minsad8 = v_min(minsad8, sad8);
 
-                sad8 = v_reinterpret_as_s16(vx_load(hsad + d + v_int16::nlanes)) - v_reinterpret_as_s16(vx_load(hsad_sub + d + v_int16::nlanes)) + v_reinterpret_as_s16(vx_load(sad + d + v_int16::nlanes));
-                v_store(sad + d + v_int16::nlanes, v_reinterpret_as_u16(sad8));
-                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d+v_int16::nlanes));
+                sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d + VTraits<v_int16>::vlanes())), v_reinterpret_as_s16(vx_load(hsad_sub + d + VTraits<v_int16>::vlanes()))), v_reinterpret_as_s16(vx_load(sad + d + VTraits<v_int16>::vlanes())));
+                v_store(sad + d + VTraits<v_int16>::vlanes(), v_reinterpret_as_u16(sad8));
+                mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)(d + VTraits<v_int16>::vlanes()))));
                 minsad8 = v_min(minsad8, sad8);
             }
-            if( d <= ndisp - v_int16::nlanes )
+            if( d <= ndisp - VTraits<v_int16>::vlanes() )
             {
-                v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
+                v_int16 sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d)), v_reinterpret_as_s16(vx_load(hsad_sub + d))), v_reinterpret_as_s16(vx_load(sad + d)));
                 v_store(sad + d, v_reinterpret_as_u16(sad8));
-                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
+                mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)d)));
                 minsad8 = v_min(minsad8, sad8);
-                d += v_int16::nlanes;
+                d += VTraits<v_int16>::vlanes();
             }
             minsad = v_reduce_min(minsad8);
-            v_int16 v_mask = (vx_setall_s16((short)minsad) == minsad8);
-            mind = v_reduce_min(((mind8+vx_load(v_seq)) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
+            v_int16 v_mask = (v_eq(vx_setall_s16((short)minsad), minsad8));
+            mind = v_reduce_min(v_or(v_and(v_add(mind8, vx_load(v_seq)), v_mask), v_and(vx_setall_s16(32767), v_not(v_mask))));
             for( ; d < ndisp; d++ )
             {
                 int sad8 = (int)(hsad[d]) - hsad_sub[d] + sad[d];
@@ -610,34 +610,34 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
                 int thresh = minsad + (minsad * uniquenessRatio/100);
                 v_int32 thresh4 = vx_setall_s32(thresh + 1);
                 v_int32 d1 = vx_setall_s32(mind-1), d2 = vx_setall_s32(mind+1);
-                v_int32 dd_4 = vx_setall_s32(v_int32::nlanes);
+                v_int32 dd_4 = vx_setall_s32(VTraits<v_int32>::vlanes());
                 v_int32 d4 = vx_load_expand(v_seq);
 
-                for( d = 0; d <= ndisp - v_int16::nlanes; d += v_int16::nlanes )
+                for( d = 0; d <= ndisp - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                 {
                     v_int32 sad4_l, sad4_h;
                     v_expand(v_reinterpret_as_s16(vx_load(sad + d)), sad4_l, sad4_h);
-                    if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) )
+                    if( v_check_any(v_and(v_gt(thresh4, sad4_l), v_or(v_gt(d1, d4), v_gt(d4, d2)))) )
                         break;
-                    d4 += dd_4;
-                    if( v_check_any((thresh4 > sad4_h) & ((d1 > d4) | (d4 > d2))) )
+                    d4 = v_add(d4, dd_4);
+                    if( v_check_any(v_and(v_gt(thresh4, sad4_h), v_or(v_gt(d1, d4), v_gt(d4, d2)))) )
                         break;
-                    d4 += dd_4;
+                    d4 = v_add(d4, dd_4);
                 }
-                if( d <= ndisp - v_int16::nlanes )
+                if( d <= ndisp - VTraits<v_int16>::vlanes() )
                 {
                     dptr[y*dstep] = FILTERED;
                     continue;
                 }
-                if( d <= ndisp - v_int32::nlanes )
+                if( d <= ndisp - VTraits<v_int32>::vlanes() )
                 {
                     v_int32 sad4_l = vx_load_expand((short*)sad + d);
-                    if (v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))))
+                    if (v_check_any(v_and(v_gt(thresh4, sad4_l), v_or(v_gt(d1, d4), v_gt(d4, d2)))))
                     {
                         dptr[y*dstep] = FILTERED;
                         continue;
                     }
-                    d += v_int16::nlanes;
+                    d += VTraits<v_int16>::vlanes();
                 }
                 for( ; d < ndisp; d++ )
                 {
@@ -699,11 +699,11 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
     int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
     const uchar * tab = bufX.tab;
 
-#if CV_SIMD
-    int v_seq[v_int32::nlanes];
-    for (int i = 0; i < v_int32::nlanes; ++i)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int v_seq[VTraits<v_int32>::max_nlanes];
+    for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
         v_seq[i] = i;
-    v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes);
+    v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(VTraits<v_int32>::vlanes());
 #endif
 
     int *sad = bufX.sad[bufNum] + 1;
@@ -725,17 +725,17 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
                 v_uint8 lv = vx_setall_u8((uchar)lval);
 
-                for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
+                for( ; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
                 {
                     v_uint8 rv = vx_load(rptr + d);
                     v_int32 hsad_0 = vx_load(hsad + d);
-                    v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
-                    v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
-                    v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
+                    v_int32 hsad_1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
+                    v_int32 hsad_2 = vx_load(hsad + d + 2*VTraits<v_int32>::vlanes());
+                    v_int32 hsad_3 = vx_load(hsad + d + 3*VTraits<v_int32>::vlanes());
                     v_uint8 diff = v_absdiff(lv, rv);
                     v_store(cbuf + d, diff);
 
@@ -745,15 +745,15 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
                     v_expand(diff0, diff00, diff01);
                     v_expand(diff1, diff10, diff11);
 
-                    hsad_0 += v_reinterpret_as_s32(diff00);
-                    hsad_1 += v_reinterpret_as_s32(diff01);
-                    hsad_2 += v_reinterpret_as_s32(diff10);
-                    hsad_3 += v_reinterpret_as_s32(diff11);
+                    hsad_0 = v_add(hsad_0, v_reinterpret_as_s32(diff00));
+                    hsad_1 = v_add(hsad_1, v_reinterpret_as_s32(diff01));
+                    hsad_2 = v_add(hsad_2, v_reinterpret_as_s32(diff10));
+                    hsad_3 = v_add(hsad_3, v_reinterpret_as_s32(diff11));
 
                     v_store(hsad + d, hsad_0);
-                    v_store(hsad + d + v_int32::nlanes, hsad_1);
-                    v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
-                    v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
+                    v_store(hsad + d + VTraits<v_int32>::vlanes(), hsad_1);
+                    v_store(hsad + d + 2*VTraits<v_int32>::vlanes(), hsad_2);
+                    v_store(hsad + d + 3*VTraits<v_int32>::vlanes(), hsad_3);
                 }
             }
 #endif
@@ -793,16 +793,16 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
                 v_uint8 lv = vx_setall_u8((uchar)lval);
-                for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
+                for( ; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
                 {
                     v_uint8 rv = vx_load(rptr + d);
                     v_int32 hsad_0 = vx_load(hsad + d);
-                    v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
-                    v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
-                    v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
+                    v_int32 hsad_1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
+                    v_int32 hsad_2 = vx_load(hsad + d + 2*VTraits<v_int32>::vlanes());
+                    v_int32 hsad_3 = vx_load(hsad + d + 3*VTraits<v_int32>::vlanes());
                     v_uint8 cbs = vx_load(cbuf_sub + d);
                     v_uint8 diff = v_absdiff(lv, rv);
                     v_store(cbuf + d, diff);
@@ -816,19 +816,19 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
                     v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01);
                     v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11);
 
-                    v_int32 diff_0 = diff00 - cbs00;
-                    v_int32 diff_1 = diff01 - cbs01;
-                    v_int32 diff_2 = diff10 - cbs10;
-                    v_int32 diff_3 = diff11 - cbs11;
-                    hsad_0 += diff_0;
-                    hsad_1 += diff_1;
-                    hsad_2 += diff_2;
-                    hsad_3 += diff_3;
+                    v_int32 diff_0 = v_sub(diff00, cbs00);
+                    v_int32 diff_1 = v_sub(diff01, cbs01);
+                    v_int32 diff_2 = v_sub(diff10, cbs10);
+                    v_int32 diff_3 = v_sub(diff11, cbs11);
+                    hsad_0 = v_add(hsad_0, diff_0);
+                    hsad_1 = v_add(hsad_1, diff_1);
+                    hsad_2 = v_add(hsad_2, diff_2);
+                    hsad_3 = v_add(hsad_3, diff_3);
 
                     v_store(hsad + d, hsad_0);
-                    v_store(hsad + d + v_int32::nlanes, hsad_1);
-                    v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
-                    v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
+                    v_store(hsad + d + VTraits<v_int32>::vlanes(), hsad_1);
+                    v_store(hsad + d + 2*VTraits<v_int32>::vlanes(), hsad_2);
+                    v_store(hsad + d + 3*VTraits<v_int32>::vlanes(), hsad_3);
                 }
             }
 #endif
@@ -855,18 +855,18 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
         for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
         {
             d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
-                for( d = 0; d <= ndisp-2*v_int32::nlanes; d += 2*v_int32::nlanes )
+                for( d = 0; d <= ndisp-2*VTraits<v_int32>::vlanes(); d += 2*VTraits<v_int32>::vlanes() )
                 {
                     v_int32 s0 = vx_load(sad + d);
-                    v_int32 s1 = vx_load(sad + d + v_int32::nlanes);
+                    v_int32 s1 = vx_load(sad + d + VTraits<v_int32>::vlanes());
                     v_int32 t0 = vx_load(hsad + d);
-                    v_int32 t1 = vx_load(hsad + d + v_int32::nlanes);
-                    s0 += t0;
-                    s1 += t1;
+                    v_int32 t1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
+                    s0 = v_add(s0, t0);
+                    s1 = v_add(s1, t1);
                     v_store(sad + d, s0);
-                    v_store(sad + d + v_int32::nlanes, s1);
+                    v_store(sad + d + VTraits<v_int32>::vlanes(), s1);
                 }
             }
 #endif
@@ -884,30 +884,30 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
             hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
             hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
             d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
                 v_int32 minsad4 = vx_setall_s32(INT_MAX);
                 v_int32 mind4 = vx_setall_s32(0), d4 = d0_4;
 
-                for( ; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes )
+                for( ; d <= ndisp - 2*VTraits<v_int32>::vlanes(); d += 2*VTraits<v_int32>::vlanes() )
                 {
-                    v_int32 sad4 = vx_load(sad + d) + vx_load(hsad + d) - vx_load(hsad_sub + d);
+                    v_int32 sad4 = v_sub(v_add(vx_load(sad + d), vx_load(hsad + d)), vx_load(hsad_sub + d));
                     v_store(sad + d, sad4);
-                    mind4 = v_select(minsad4 > sad4, d4, mind4);
+                    mind4 = v_select(v_gt(minsad4, sad4), d4, mind4);
                     minsad4 = v_min(minsad4, sad4);
-                    d4 += dd_4;
+                    d4 = v_add(d4, dd_4);
 
-                    sad4 = vx_load(sad + d + v_int32::nlanes) + vx_load(hsad + d + v_int32::nlanes) - vx_load(hsad_sub + d + v_int32::nlanes);
-                    v_store(sad + d + v_int32::nlanes, sad4);
-                    mind4 = v_select(minsad4 > sad4, d4, mind4);
+                    sad4 = v_sub(v_add(vx_load(sad + d + VTraits<v_int32>::vlanes()), vx_load(hsad + d + VTraits<v_int32>::vlanes())), vx_load(hsad_sub + d + VTraits<v_int32>::vlanes()));
+                    v_store(sad + d + VTraits<v_int32>::vlanes(), sad4);
+                    mind4 = v_select(v_gt(minsad4, sad4), d4, mind4);
                     minsad4 = v_min(minsad4, sad4);
-                    d4 += dd_4;
+                    d4 = v_add(d4, dd_4);
                 }
 
-                int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[v_int32::nlanes], mind_buf[v_int32::nlanes];
+                int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[VTraits<v_int32>::max_nlanes], mind_buf[VTraits<v_int32>::max_nlanes];
                 v_store(minsad_buf, minsad4);
                 v_store(mind_buf, mind4);
-                for (int i = 0; i < v_int32::nlanes; ++i)
+                for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
                     if(minsad_buf[i] < minsad || (minsad == minsad_buf[i] && mind_buf[i] < mind)) { minsad = minsad_buf[i]; mind = mind_buf[i]; }
             }
 #endif
@@ -1102,7 +1102,7 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody
         Mat disp_i = disp->rowRange(row0, row1);
         Mat cost_i = state.disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         if (state.useShorts())
         {
             if( disp_i.type() == CV_16S)
diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp
index e30973ec9416..75f6f32564c1 100644
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -123,7 +123,7 @@ struct StereoSGBMParams
     int mode;
 };
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 #if CV_SIMD_WIDTH == 16
 static inline v_int16 vx_setseq_s16()
 { return v_int16(0, 1, 2, 3, 4, 5, 6, 7); }
@@ -136,10 +136,10 @@ static inline v_int16 vx_setseq_s16()
 #else
 struct vseq_s16
 {
-    short data[v_int16::nlanes];
+    short data[VTraits<v_int16>::max_nlanes];
     vseq_s16()
     {
-        for (int i = 0; i < v_int16::nlanes; i++)
+        for (int i = 0; i < VTraits<v_int16>::vlanes(); i++)
             data[i] = i;
     }
 };
@@ -153,8 +153,8 @@ static inline v_int16 vx_setseq_s16()
 static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_val, short &min_pos)
 {
     min_val = v_reduce_min(val);
-    v_int16 v_mask = (vx_setall_s16(min_val) == val);
-    min_pos = v_reduce_min(((pos+vx_setseq_s16()) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
+    v_int16 v_mask = (v_eq(vx_setall_s16(min_val), val));
+    min_pos = v_reduce_min(v_or(v_and(v_add(pos, vx_setseq_s16()), v_mask), v_and(vx_setall_s16(SHRT_MAX), v_not(v_mask))));
 }
 #endif
 
@@ -270,26 +270,26 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
             int u1 = std::max(ul, ur); u1 = std::max(u1, u);
 
             int d = minD;
-        #if CV_SIMD
+        #if (CV_SIMD || CV_SIMD_SCALABLE)
             v_uint8 _u  = vx_setall_u8((uchar)u), _u0 = vx_setall_u8((uchar)u0);
             v_uint8 _u1 = vx_setall_u8((uchar)u1);
 
-            for( ; d <= maxD - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
+            for( ; d <= maxD - 2*VTraits<v_int16>::vlanes(); d += 2*VTraits<v_int16>::vlanes() )
             {
                 v_uint8 _v  = vx_load(prow2  + width-x-1 + d);
                 v_uint8 _v0 = vx_load(buffer + width-x-1 + d);
                 v_uint8 _v1 = vx_load(buffer + width-x-1 + d + width2);
-                v_uint8 c0 = v_max(_u - _v1, _v0 - _u);
-                v_uint8 c1 = v_max(_v - _u1, _u0 - _v);
+                v_uint8 c0 = v_max(v_sub(_u, _v1), v_sub(_v0, _u));
+                v_uint8 c1 = v_max(v_sub(_v, _u1), v_sub(_u0, _v));
                 v_uint8 diff = v_min(c0, c1);
 
                 v_int16 _c0 = vx_load_aligned(cost + x*D + d);
-                v_int16 _c1 = vx_load_aligned(cost + x*D + d + v_int16::nlanes);
+                v_int16 _c1 = vx_load_aligned(cost + x*D + d + VTraits<v_int16>::vlanes());
 
                 v_uint16 diff1,diff2;
                 v_expand(diff,diff1,diff2);
-                v_store_aligned(cost + x*D + d,                   _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
-                v_store_aligned(cost + x*D + d + v_int16::nlanes, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
+                v_store_aligned(cost + x*D + d,                   v_add(_c0, v_reinterpret_as_s16(v_shr(diff1, diff_scale))));
+                v_store_aligned(cost + x*D + d + VTraits<v_int16>::vlanes(), v_add(_c1, v_reinterpret_as_s16(v_shr(diff2, diff_scale))));
             }
         #endif
             for( ; d < maxD; d++ )
@@ -555,13 +555,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                         calcPixelCostBT( img1, img2, k, minD, maxD, mem.pixDiff, mem.tempBuf, mem.getClipTab() );
 
                         memset(hsumAdd, 0, Da*sizeof(CostType));
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                         v_int16 h_scale = vx_setall_s16((short)SW2 + 1);
-                        for( d = 0; d < Da; d += v_int16::nlanes )
+                        for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
                         {
-                            v_int16 v_hsumAdd = vx_load_aligned(mem.pixDiff + d) * h_scale;
+                            v_int16 v_hsumAdd = v_mul(vx_load_aligned(mem.pixDiff + d), h_scale);
                             for( x = Da; x <= SW2*Da; x += Da )
-                                v_hsumAdd += vx_load_aligned(mem.pixDiff + x + d);
+                                v_hsumAdd = v_add(v_hsumAdd, vx_load_aligned(mem.pixDiff + x + d));
                             v_store_aligned(hsumAdd + d, v_hsumAdd);
                         }
 #else
@@ -578,9 +578,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                             const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                             const CostType* Cprev =  mem.getCBuf(y - 1);
 
-#if CV_SIMD
-                            for (d = 0; d < Da; d += v_int16::nlanes)
-                                v_store_aligned(C + d, vx_load_aligned(Cprev + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                            for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                                v_store_aligned(C + d, v_sub(v_add(vx_load_aligned(Cprev + d), vx_load_aligned(hsumAdd + d)), vx_load_aligned(hsumSub + d)));
 #else
                             for (d = 0; d < D; d++)
                                 C[d] = (CostType)(Cprev[d] + hsumAdd[d] - hsumSub[d]);
@@ -590,12 +590,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                             {
                                 const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                                 const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
-#if CV_SIMD
-                                for( d = 0; d < Da; d += v_int16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                                for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
                                 {
-                                    v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
+                                    v_int16 hv = v_add(v_sub(vx_load_aligned(hsumAdd + x - Da + d), vx_load_aligned(pixSub + d)), vx_load_aligned(pixAdd + d));
                                     v_store_aligned(hsumAdd + x + d, hv);
-                                    v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
+                                    v_store_aligned(C + x + d, v_add(v_sub(vx_load_aligned(Cprev + x + d), vx_load_aligned(hsumSub + x + d)), hv));
                                 }
 #else
                                 for( d = 0; d < D; d++ )
@@ -608,10 +608,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                         }
                         else
                         {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                             v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
-                            for (d = 0; d < Da; d += v_int16::nlanes)
-                                v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
+                            for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                                v_store_aligned(C + d, v_add(vx_load_aligned(C + d), v_mul(vx_load_aligned(hsumAdd + d), v_scale)));
 #else
                             int scale = k == 0 ? SH2 + 1 : 1;
                             for (d = 0; d < D; d++)
@@ -622,12 +622,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                                 const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                                 const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
 
-#if CV_SIMD
-                                for (d = 0; d < Da; d += v_int16::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                                for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
                                 {
-                                    v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
+                                    v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
                                     v_store_aligned(hsumAdd + x + d, hv);
-                                    v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
+                                    v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
                                 }
 #else
                                 for( d = 0; d < D; d++ )
@@ -646,9 +646,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                         {
                             const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                             const CostType* Cprev = mem.getCBuf(y - 1);
-#if CV_SIMD
-                            for (x = 0; x < width1*Da; x += v_int16::nlanes)
-                                v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                            for (x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
+                                v_store_aligned(C + x, v_add(v_sub(vx_load_aligned(Cprev + x), vx_load_aligned(hsumSub + x)), vx_load_aligned(hsumAdd + x)));
 #else
                             for (x = 0; x < width1*Da; x++)
                                 C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
@@ -656,9 +656,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                         }
                         else
                         {
-#if CV_SIMD
-                            for (x = 0; x < width1*Da; x += v_int16::nlanes)
-                                v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                            for (x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
+                                v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
 #else
                             for (x = 0; x < width1*Da; x++)
                                 C[x] = (CostType)(C[x] + hsumAdd[x]);
@@ -714,7 +714,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 
                 CostType* minL = mem.getMinLr(lrID, x);
                 d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 _P1 = vx_setall_s16((short)P1);
 
                 v_int16 _delta0 = vx_setall_s16((short)delta0);
@@ -726,31 +726,31 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 v_int16 _minL2 = vx_setall_s16((short)MAX_COST);
                 v_int16 _minL3 = vx_setall_s16((short)MAX_COST);
 
-                for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
+                for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
                     v_int16 Spd = vx_load_aligned(Sp + d);
                     v_int16 L;
 
-                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
+                    L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), v_add(vx_load(Lr_p0 + d - 1), _P1)), v_add(vx_load(Lr_p0 + d + 1), _P1)), _delta0), _delta0), Cpd);
                     v_store_aligned(Lr_p + d, L);
                     _minL0 = v_min(_minL0, L);
-                    Spd += L;
+                    Spd = v_add(Spd, L);
 
-                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), vx_load(Lr_p1 + d - 1) + _P1), vx_load(Lr_p1 + d + 1) + _P1), _delta1) - _delta1 + Cpd;
+                    L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), v_add(vx_load(Lr_p1 + d - 1), _P1)), v_add(vx_load(Lr_p1 + d + 1), _P1)), _delta1), _delta1), Cpd);
                     v_store_aligned(Lr_p + d + Dlra, L);
                     _minL1 = v_min(_minL1, L);
-                    Spd += L;
+                    Spd = v_add(Spd, L);
 
-                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), vx_load(Lr_p2 + d - 1) + _P1), vx_load(Lr_p2 + d + 1) + _P1), _delta2) - _delta2 + Cpd;
+                    L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), v_add(vx_load(Lr_p2 + d - 1), _P1)), v_add(vx_load(Lr_p2 + d + 1), _P1)), _delta2), _delta2), Cpd);
                     v_store_aligned(Lr_p + d + Dlra*2, L);
                     _minL2 = v_min(_minL2, L);
-                    Spd += L;
+                    Spd = v_add(Spd, L);
 
-                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), vx_load(Lr_p3 + d - 1) + _P1), vx_load(Lr_p3 + d + 1) + _P1), _delta3) - _delta3 + Cpd;
+                    L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), v_add(vx_load(Lr_p3 + d - 1), _P1)), v_add(vx_load(Lr_p3 + d + 1), _P1)), _delta3), _delta3), Cpd);
                     v_store_aligned(Lr_p + d + Dlra*3, L);
                     _minL3 = v_min(_minL3, L);
-                    Spd += L;
+                    Spd = v_add(Spd, L);
 
                     v_store_aligned(Sp + d, Spd);
                 }
@@ -769,7 +769,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 t0 = v_min(t0, t1);
                 t0 = v_min(t0, v_rotate_right<4>(t0));
 #if CV_SIMD_WIDTH == 32
-                CostType buf[v_int16::nlanes];
+                CostType buf[VTraits<v_int16>::max_nlanes];
                 v_store_low(buf, v_min(t0, v_rotate_right<8>(t0)));
                 minL[0] = buf[0];
                 minL[1] = buf[1];
@@ -817,10 +817,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
             if( pass == npasses )
             {
                 x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
                 v_int16 v_max_cost = vx_setall_s16(MAX_COST);
-                for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
+                for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes() )
                 {
                     v_store(disp1ptr + x, v_inv_dist);
                     v_store(mem.disp2ptr + x, v_inv_dist);
@@ -850,23 +850,23 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                         d = 0;
                         int delta0 = P2 + *mem.getMinLr(lrID, x + 1);
                         int minL0 = MAX_COST;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                         v_int16 _P1 = vx_setall_s16((short)P1);
                         v_int16 _delta0 = vx_setall_s16((short)delta0);
 
                         v_int16 _minL0 = vx_setall_s16((short)MAX_COST);
                         v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
-                        for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
+                        for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                         {
                             v_int16 Cpd = vx_load_aligned(Cp + d);
-                            v_int16 L0 = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
+                            v_int16 L0 = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), v_add(vx_load(Lr_p0 + d - 1), _P1)), v_add(vx_load(Lr_p0 + d + 1), _P1)), _delta0), _delta0), Cpd);
 
                             v_store_aligned(Lr_p + d, L0);
                             _minL0 = v_min(_minL0, L0);
-                            L0 += vx_load_aligned(Sp + d);
+                            L0 = v_add(L0, vx_load_aligned(Sp + d));
                             v_store_aligned(Sp + d, L0);
 
-                            _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
+                            _bestDisp = v_select(v_gt(_minS, L0), vx_setall_s16((short)d), _bestDisp);
                             _minS = v_min(_minS, L0);
                         }
                         minL0 = (CostType)v_reduce_min(_minL0);
@@ -891,12 +891,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     else
                     {
                         d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                         v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
-                        for( ; d <= D - v_int16::nlanes; d+= v_int16::nlanes )
+                        for( ; d <= D - VTraits<v_int16>::vlanes(); d+= VTraits<v_int16>::vlanes() )
                         {
                             v_int16 L0 = vx_load_aligned(Sp + d);
-                            _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
+                            _bestDisp = v_select(v_gt(_minS, L0), vx_setall_s16((short)d), _bestDisp);
                             _minS = v_min( L0, _minS );
                         }
                         min_pos(_minS, _bestDisp, minS, bestDisp);
@@ -1039,9 +1039,9 @@ struct CalcVerticalSums: public ParallelLoopBody
                             for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da )
                             {
                                 int xbord = x <= 0 ? 0 : (x > (width1 - 1)*Da ? (width1 - 1)*Da : x);
-#if CV_SIMD
-                                for( d = 0; d < Da; d += v_int16::nlanes )
-                                    v_store_aligned(hsumAdd + x1*Da + d, vx_load_aligned(hsumAdd + x1*Da + d) + vx_load_aligned(pixDiff + xbord + d));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                                for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
+                                    v_store_aligned(hsumAdd + x1*Da + d, v_add(vx_load_aligned(hsumAdd + x1 * this->Da + d), vx_load_aligned(pixDiff + xbord + d)));
 #else
                                 for( d = 0; d < D; d++ )
                                     hsumAdd[x1*Da + d] = (CostType)(hsumAdd[x1*Da + d] + pixDiff[xbord + d]);
@@ -1052,9 +1052,9 @@ struct CalcVerticalSums: public ParallelLoopBody
                             {
                                 const CostType* hsumSub =  mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                                 const CostType* Cprev = mem.getCBuf(y - 1);
-#if CV_SIMD
-                                for( d = 0; d < Da; d += v_int16::nlanes )
-                                    v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                                for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
+                                    v_store_aligned(C + x1*Da + d, v_sub(v_add(vx_load_aligned(Cprev + x1 * this->Da + d), vx_load_aligned(hsumAdd + x1 * this->Da + d)), vx_load_aligned(hsumSub + x1 * this->Da + d)));
 #else
                                 for( d = 0; d < D; d++ )
                                     C[x1*Da + d] = (CostType)(Cprev[x1*Da + d] + hsumAdd[x1*Da + d] - hsumSub[x1*Da + d]);
@@ -1064,12 +1064,12 @@ struct CalcVerticalSums: public ParallelLoopBody
                                     const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
 
-#if CV_SIMD
-                                    for( d = 0; d < Da; d += v_int16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                                    for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
                                     {
-                                        v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
+                                        v_int16 hv = v_add(v_sub(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixSub + d)), vx_load_aligned(pixAdd + d));
                                         v_store_aligned(hsumAdd + x + d, hv);
-                                        v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
+                                        v_store_aligned(C + x + d, v_add(v_sub(vx_load_aligned(Cprev + x + d), vx_load_aligned(hsumSub + x + d)), hv));
                                     }
 #else
                                     for( d = 0; d < D; d++ )
@@ -1082,10 +1082,10 @@ struct CalcVerticalSums: public ParallelLoopBody
                             }
                             else
                             {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                                 v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
-                                for (d = 0; d < Da; d += v_int16::nlanes)
-                                    v_store_aligned(C + x1*Da + d, vx_load_aligned(C + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) * v_scale);
+                                for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                                    v_store_aligned(C + x1*Da + d, v_add(vx_load_aligned(C + x1 * this->Da + d), v_mul(vx_load_aligned(hsumAdd + x1 * this->Da + d), v_scale)));
 #else
                                 int scale = k == 0 ? SH2 + 1 : 1;
                                 for (d = 0; d < D; d++)
@@ -1095,12 +1095,12 @@ struct CalcVerticalSums: public ParallelLoopBody
                                 {
                                     const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
-#if CV_SIMD
-                                    for (d = 0; d < Da; d += v_int16::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                                    for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
                                     {
-                                        v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
+                                        v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
                                         v_store_aligned(hsumAdd + x + d, hv);
-                                        v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
+                                        v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
                                     }
 #else
                                     for( d = 0; d < D; d++ )
@@ -1120,9 +1120,9 @@ struct CalcVerticalSums: public ParallelLoopBody
                                 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                                 const CostType* Cprev = mem.getCBuf(y - 1);
 
-#if CV_SIMD
-                                for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
-                                    v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                                for( x = x1*Da; x < x2*Da; x += VTraits<v_int16>::vlanes() )
+                                    v_store_aligned(C + x, v_add(v_sub(vx_load_aligned(Cprev + x), vx_load_aligned(hsumSub + x)), vx_load_aligned(hsumAdd + x)));
 #else
                                 for( x = x1*Da; x < x2*Da; x++ )
                                     C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
@@ -1131,9 +1131,9 @@ struct CalcVerticalSums: public ParallelLoopBody
                             else*/
                             if(y == 0)
                             {
-#if CV_SIMD
-                                for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
-                                    v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                                for( x = x1*Da; x < x2*Da; x += VTraits<v_int16>::vlanes() )
+                                    v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
 #else
                                 for( x = x1*Da; x < x2*Da; x++ )
                                     C[x] = (CostType)(C[x] + hsumAdd[x]);
@@ -1167,19 +1167,19 @@ struct CalcVerticalSums: public ParallelLoopBody
 
                     CostType& minL = *(mem.getMinLr(lrID, x));
                     d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_int16 _P1 = vx_setall_s16((short)P1);
 
                     v_int16 _delta = vx_setall_s16((short)delta);
                     v_int16 _minL = vx_setall_s16((short)MAX_COST);
 
-                    for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
+                    for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                     {
                         v_int16 Cpd = vx_load_aligned(Cp + d);
-                        v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
+                        v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);
                         v_store_aligned(Lr_p + d, L);
                         _minL = v_min(_minL, L);
-                        v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
+                        v_store_aligned(Sp + d, v_add(vx_load_aligned(Sp + d), L));
                     }
                     minL = v_reduce_min(_minL);
 #else
@@ -1264,10 +1264,10 @@ struct CalcHorizontalSums: public ParallelLoopBody
             CostType* S = mem.getSBuf(y);
 
             x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
             v_int16 v_max_cost = vx_setall_s16(MAX_COST);
-            for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
+            for (; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
             {
                 v_store(disp1ptr + x, v_inv_dist);
                 v_store(disp2ptr + x, v_inv_dist);
@@ -1304,19 +1304,19 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 CostType* Sp = S + x*Da;
 
                 d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 _P1 = vx_setall_s16((short)P1);
 
                 v_int16 _delta = vx_setall_s16((short)delta);
                 v_int16 _minL = vx_setall_s16((short)MAX_COST);
 
-                for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes)
+                for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes())
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
-                    v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
+                    v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);
                     v_store(Lr_p + d, L);
                     _minL = v_min(_minL, L);
-                    v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
+                    v_store_aligned(Sp + d, v_add(vx_load_aligned(Sp + d), L));
                 }
                 minLr = v_reduce_min(_minL);
 #else
@@ -1349,22 +1349,22 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 minLr = MAX_COST;
 
                 d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 _P1 = vx_setall_s16((short)P1);
                 v_int16 _delta = vx_setall_s16((short)delta);
 
                 v_int16 _minL = vx_setall_s16((short)MAX_COST);
                 v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
-                for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
+                for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
-                    v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
+                    v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);
                     v_store(Lr_p + d, L);
                     _minL = v_min(_minL, L);
-                    L += vx_load_aligned(Sp + d);
+                    L = v_add(L, vx_load_aligned(Sp + d));
                     v_store_aligned(Sp + d, L);
 
-                    _bestDisp = v_select(_minS > L, vx_setall_s16((short)d), _bestDisp);
+                    _bestDisp = v_select(v_gt(_minS, L), vx_setall_s16((short)d), _bestDisp);
                     _minS = v_min( L, _minS );
                 }
                 minLr = v_reduce_min(_minL);
@@ -1581,8 +1581,8 @@ struct SGBM3WayMainLoop : public ParallelLoopBody
 
     utils::BufferArea aux_area;
     PixType* clipTab;
-#if CV_SIMD
-    short idx_row[v_int16::nlanes];
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    short idx_row[VTraits<v_int16>::max_nlanes];
 #endif
     SGBM3WayMainLoop(const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, int stripe_size, int _stripe_overlap);
     void operator () (const Range& range) const CV_OVERRIDE;
@@ -1637,8 +1637,8 @@ SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
     uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
     disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
 
-#if CV_SIMD
-    for(short i = 0; i < v_int16::nlanes; ++i)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for(short i = 0; i < VTraits<v_int16>::vlanes(); ++i)
         idx_row[i] = i;
 #endif
 }
@@ -1659,13 +1659,13 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
         {
             calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab + TAB_OFS );
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_int16 sw2_1 = vx_setall_s16((short)SW2 + 1);
-            for (d = 0; d < Da; d += v_int16::nlanes)
+            for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
             {
-                v_int16 hsA = vx_load_aligned(pixDiff + d) * sw2_1;
+                v_int16 hsA = v_mul(vx_load_aligned(pixDiff + d), sw2_1);
                 for (x = Da; x <= SW2 * Da; x += Da)
-                    hsA += vx_load_aligned(pixDiff + x + d);
+                    hsA = v_add(hsA, vx_load_aligned(pixDiff + x + d));
                 v_store_aligned(hsumAdd + d, hsA);
             }
 #else
@@ -1681,9 +1681,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
             {
                 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
 
-#if CV_SIMD
-                for (d = 0; d < Da; d += v_int16::nlanes)
-                    v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                    v_store_aligned(C + d, v_sub(v_add(vx_load_aligned(C + d), vx_load_aligned(hsumAdd + d)), vx_load_aligned(hsumSub + d)));
 #else
                 for (d = 0; d < D; d++)
                     C[d] = (CostType)(C[d] + hsumAdd[d] - hsumSub[d]);
@@ -1693,13 +1693,13 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
                 {
                     const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_int16 hv_reg;
-                    for( d = 0; d < Da; d+=v_int16::nlanes )
+                    for( d = 0; d < Da; d+=VTraits<v_int16>::vlanes() )
                     {
-                        hv_reg = vx_load_aligned(hsumAdd+x-Da+d) + vx_load_aligned(pixAdd+d) - vx_load_aligned(pixSub+d);
+                        hv_reg = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
                         v_store_aligned(hsumAdd+x+d,hv_reg);
-                        v_store_aligned(C+x+d,vx_load_aligned(C+x+d)+hv_reg-vx_load_aligned(hsumSub+x+d));
+                        v_store_aligned(C+x+d,v_sub(v_add(vx_load_aligned(C + x + d), hv_reg), vx_load_aligned(hsumSub + x + d)));
                     }
 #else
                     for( d = 0; d < D; d++ )
@@ -1712,10 +1712,10 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
             }
             else
             {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 v_scale = vx_setall_s16(k == src_start_idx ? (short)SH2 + 1 : 1);
-                for (d = 0; d < Da; d += v_int16::nlanes)
-                    v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
+                for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                    v_store_aligned(C + d, v_add(vx_load_aligned(C + d), v_mul(vx_load_aligned(hsumAdd + d), v_scale)));
 #else
                 int scale = k == src_start_idx ? SH2 + 1 : 1;
                 for (d = 0; d < D; d++)
@@ -1725,12 +1725,12 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
                 {
                     const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
-#if CV_SIMD
-                    for (d = 0; d < Da; d += v_int16::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                    for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
                     {
-                        v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
+                        v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
                         v_store_aligned(hsumAdd + x + d, hv);
-                        v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
+                        v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
                     }
 #else
                     for (d = 0; d < D; d++)
@@ -1748,9 +1748,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
             if( y > src_start_idx )
             {
                 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
-#if CV_SIMD
-                for( x = 0; x < width1*Da; x += v_int16::nlanes)
-                    v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
+                    v_store_aligned(C + x, v_sub(v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)), vx_load_aligned(hsumSub + x)));
 #else
                 for( x = 0; x < width1*Da; x++ )
                     C[x] = (CostType)(C[x] + hsumAdd[x] - hsumSub[x]);
@@ -1758,9 +1758,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
             }
             else
             {
-#if CV_SIMD
-                for( x = 0; x < width1*Da; x += v_int16::nlanes)
-                    v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
+                    v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
 #else
                 for( x = 0; x < width1*Da; x++ )
                     C[x] = (CostType)(C[x] + hsumAdd[x]);
@@ -1781,7 +1781,7 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
     CostType *costs = mem.curCostVolumeLine - Da + x;
     CostType& topMinCost = mem.vertPassMin[x/Da];
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
 
     v_int16 leftMinCostP2_reg   = vx_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
@@ -1798,18 +1798,18 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
     v_int16 src_shifted_left,src_shifted_right;
     v_int16 res;
 
-    for(;i<Da-v_int16::nlanes;i+= v_int16::nlanes)
+    for(;i<Da-VTraits<v_int16>::vlanes();i+= VTraits<v_int16>::vlanes())
     {
         //process leftBuf:
         //lookahead load:
-        src2 = vx_load_aligned(leftBuf_prev+i+v_int16::nlanes);
+        src2 = vx_load_aligned(leftBuf_prev+i+VTraits<v_int16>::vlanes());
 
         //get shifted versions of the current block and add P1:
         src_shifted_left  = v_rotate_left<1>  (src1_leftBuf,src0_leftBuf);
         src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2        );
 
         // process and save current block:
-        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
+        res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_leftBuf, leftMinCostP2_reg)), leftMinCostP2_reg));
         leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
         v_store_aligned(leftBuf+i, res);
 
@@ -1819,14 +1819,14 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
 
         //process topBuf:
         //lookahead load:
-        src2 = vx_load_aligned(topBuf+i+v_int16::nlanes);
+        src2 = vx_load_aligned(topBuf+i+VTraits<v_int16>::vlanes());
 
         //get shifted versions of the current block and add P1:
         src_shifted_left  = v_rotate_left<1>  (src1_topBuf,src0_topBuf);
         src_shifted_right = v_rotate_right<1> (src1_topBuf,src2       );
 
         // process and save current block:
-        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
+        res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_topBuf, topMinCostP2_reg)), topMinCostP2_reg));
         topMinCost_new_reg = v_min(topMinCost_new_reg,res);
         v_store_aligned(topBuf+i, res);
 
@@ -1843,17 +1843,17 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
         src_shifted_left  = v_rotate_left<1>  (src1_leftBuf,src0_leftBuf);
         src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2        );
 
-        res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
+        res = v_add(vx_load_aligned(costs + this->Da - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_leftBuf, leftMinCostP2_reg)), leftMinCostP2_reg));
         leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res));
-        v_store_aligned(leftBuf+Da-v_int16::nlanes, res);
+        v_store_aligned(leftBuf+Da-VTraits<v_int16>::vlanes(), res);
 
         //process topBuf:
         src_shifted_left  = v_rotate_left<1>  (src1_topBuf,src0_topBuf);
         src_shifted_right = v_rotate_right<1> (src1_topBuf,src2       );
 
-        res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
+        res = v_add(vx_load_aligned(costs + this->Da - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_topBuf, topMinCostP2_reg)), topMinCostP2_reg));
         topMinCost = v_reduce_min(v_min(topMinCost_new_reg,res));
-        v_store_aligned(topBuf+Da-v_int16::nlanes, res);
+        v_store_aligned(topBuf+Da-VTraits<v_int16>::vlanes(), res);
     }
     else
     {
@@ -1904,7 +1904,7 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
     CostType* leftBuf = mem.horPassCostVolume + x;
 
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
 
     v_int16 rightMinCostP2_reg   = vx_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
@@ -1919,27 +1919,27 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
     v_int16 min_sum_cost_reg = vx_setall_s16(SHRT_MAX);
     v_int16 min_sum_pos_reg  = vx_setall_s16(0);
 
-    for(;i<Da-v_int16::nlanes;i+=v_int16::nlanes)
+    for(;i<Da-VTraits<v_int16>::vlanes();i+=VTraits<v_int16>::vlanes())
     {
         //lookahead load:
-        src2 = vx_load_aligned(rightBuf+i+v_int16::nlanes);
+        src2 = vx_load_aligned(rightBuf+i+VTraits<v_int16>::vlanes());
 
         //get shifted versions of the current block and add P1:
         src_shifted_left  = v_rotate_left<1>  (src1_rightBuf,src0_rightBuf);
         src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2         );
 
         // process and save current block:
-        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
+        res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_rightBuf, rightMinCostP2_reg)), rightMinCostP2_reg));
         rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
         v_store_aligned(rightBuf+i, res);
 
         // compute and save total cost:
-        res = res + vx_load_aligned(leftBuf+i) + vx_load_aligned(topBuf+i);
+        res = v_add(v_add(res, vx_load_aligned(leftBuf + i)), vx_load_aligned(topBuf + i));
         v_store_aligned(leftBuf+i, res);
 
         // track disparity value with the minimum cost:
         min_sum_cost_reg = v_min(min_sum_cost_reg,res);
-        min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)i) - min_sum_pos_reg));
+        min_sum_pos_reg = v_add(min_sum_pos_reg, v_and(v_eq(min_sum_cost_reg, res), v_sub(vx_setall_s16((short)i), min_sum_pos_reg)));
 
         //update src:
         src0_rightBuf    = src1_rightBuf;
@@ -1953,15 +1953,15 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
         src_shifted_left  = v_rotate_left<1>  (src1_rightBuf,src0_rightBuf);
         src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2         );
 
-        res = vx_load_aligned(costs+D-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
+        res = v_add(vx_load_aligned(costs + this->D - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_rightBuf, rightMinCostP2_reg)), rightMinCostP2_reg));
         rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg,res));
-        v_store_aligned(rightBuf+D-v_int16::nlanes, res);
+        v_store_aligned(rightBuf+D-VTraits<v_int16>::vlanes(), res);
 
-        res = res + vx_load_aligned(leftBuf+D-v_int16::nlanes) + vx_load_aligned(topBuf+D-v_int16::nlanes);
-        v_store_aligned(leftBuf+D-v_int16::nlanes, res);
+        res = v_add(v_add(res, vx_load_aligned(leftBuf + this->D - VTraits<v_int16>::vlanes())), vx_load_aligned(topBuf + this->D - VTraits<v_int16>::vlanes()));
+        v_store_aligned(leftBuf+D-VTraits<v_int16>::vlanes(), res);
 
         min_sum_cost_reg = v_min(min_sum_cost_reg,res);
-        min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)D-v_int16::nlanes) - min_sum_pos_reg));
+        min_sum_pos_reg = v_add(min_sum_pos_reg, v_and(v_eq(min_sum_cost_reg, res), v_sub(vx_setall_s16((short)(this->D - VTraits<v_int16>::vlanes())), min_sum_pos_reg)));
         min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp);
     }
     else
@@ -2070,40 +2070,40 @@ void SGBM3WayMainLoop::impl(const Range& range) const
             if(uniquenessRatio>0)
             {
                 d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 horPassCostVolume+=x;
                 int thresh = (100*min_cost)/(100-uniquenessRatio);
                 v_int16 thresh_reg = vx_setall_s16((short)(thresh+1));
                 v_int16 d1 = vx_setall_s16((short)(best_d-1));
                 v_int16 d2 = vx_setall_s16((short)(best_d+1));
-                v_int16 eight_reg = vx_setall_s16(v_int16::nlanes);
+                v_int16 eight_reg = vx_setall_s16((short)VTraits<v_int16>::vlanes());
                 v_int16 cur_d = vx_load(idx_row);
                 v_int16 mask;
 
-                for( ; d <= D - 2*v_int16::nlanes; d+=2*v_int16::nlanes )
+                for( ; d <= D - 2*VTraits<v_int16>::vlanes(); d+=2*VTraits<v_int16>::vlanes() )
                 {
-                    mask = (vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
-                    cur_d = cur_d+eight_reg;
+                    mask = v_and(v_lt(vx_load_aligned(horPassCostVolume + d), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)));
+                    cur_d = v_add(cur_d, eight_reg);
                     if( v_check_any(mask) )
                         break;
-                    mask = (vx_load_aligned(horPassCostVolume + d + v_int16::nlanes) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
-                    cur_d = cur_d+eight_reg;
+                    mask = v_and(v_lt(vx_load_aligned(horPassCostVolume + d + VTraits<v_int16>::vlanes()), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)));
+                    cur_d = v_add(cur_d, eight_reg);
                     if( v_check_any(mask) )
                         break;
                 }
-                if( d <= D - 2*v_int16::nlanes )
+                if( d <= D - 2*VTraits<v_int16>::vlanes() )
                 {
                     horPassCostVolume-=x;
                     continue;
                 }
-                if( d <= D - v_int16::nlanes )
+                if( d <= D - VTraits<v_int16>::vlanes() )
                 {
-                    if( v_check_any((vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ((cur_d < d1) | (cur_d > d2))) )
+                    if( v_check_any(v_and(v_lt(vx_load_aligned(horPassCostVolume + d), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)))) )
                     {
                         horPassCostVolume-=x;
                         continue;
                     }
-                    d+=v_int16::nlanes;
+                    d+=VTraits<v_int16>::vlanes();
                 }
                 horPassCostVolume-=x;
 #endif
diff --git a/modules/calib3d/src/triangulate.cpp b/modules/calib3d/src/triangulate.cpp
index cded42232aad..671659c84b7a 100644
--- a/modules/calib3d/src/triangulate.cpp
+++ b/modules/calib3d/src/triangulate.cpp
@@ -56,30 +56,30 @@ icvTriangulatePoints(CvMat* projMatr1, CvMat* projMatr2, CvMat* projPoints1, CvM
     if( projMatr1 == 0 || projMatr2 == 0 ||
       projPoints1 == 0 || projPoints2 == 0 ||
       points4D == 0)
-      CV_Error( CV_StsNullPtr, "Some of parameters is a NULL pointer" );
+      CV_Error( cv::Error::StsNullPtr, "Some of parameters is a NULL pointer" );
 
     if( !CV_IS_MAT(projMatr1) || !CV_IS_MAT(projMatr2) ||
       !CV_IS_MAT(projPoints1) || !CV_IS_MAT(projPoints2) ||
       !CV_IS_MAT(points4D) )
-      CV_Error( CV_StsUnsupportedFormat, "Input parameters must be matrices" );
+      CV_Error( cv::Error::StsUnsupportedFormat, "Input parameters must be matrices" );
 
     int numPoints = projPoints1->cols;
 
     if( numPoints < 1 )
-        CV_Error( CV_StsOutOfRange, "Number of points must be more than zero" );
+        CV_Error( cv::Error::StsOutOfRange, "Number of points must be more than zero" );
 
     if( projPoints2->cols != numPoints || points4D->cols != numPoints )
-        CV_Error( CV_StsUnmatchedSizes, "Number of points must be the same" );
+        CV_Error( cv::Error::StsUnmatchedSizes, "Number of points must be the same" );
 
     if( projPoints1->rows != 2 || projPoints2->rows != 2)
-        CV_Error( CV_StsUnmatchedSizes, "Number of proj points coordinates must be == 2" );
+        CV_Error( cv::Error::StsUnmatchedSizes, "Number of proj points coordinates must be == 2" );
 
     if( points4D->rows != 4 )
-        CV_Error( CV_StsUnmatchedSizes, "Number of world points coordinates must be == 4" );
+        CV_Error( cv::Error::StsUnmatchedSizes, "Number of world points coordinates must be == 4" );
 
     if( projMatr1->cols != 4 || projMatr1->rows != 3 ||
        projMatr2->cols != 4 || projMatr2->rows != 3)
-        CV_Error( CV_StsUnmatchedSizes, "Size of projection matrices must be 3x4" );
+        CV_Error( cv::Error::StsUnmatchedSizes, "Size of projection matrices must be 3x4" );
 
     // preallocate SVD matrices on stack
     cv::Matx<double, 4, 4> matrA;
@@ -147,30 +147,30 @@ icvCorrectMatches(CvMat *F_, CvMat *points1_, CvMat *points2_, CvMat *new_points
     cv::Ptr<CvMat> F;
 
     if (!CV_IS_MAT(F_) || !CV_IS_MAT(points1_) || !CV_IS_MAT(points2_) )
-        CV_Error( CV_StsUnsupportedFormat, "Input parameters must be matrices" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "Input parameters must be matrices" );
     if (!( F_->cols == 3 && F_->rows == 3))
-        CV_Error( CV_StsUnmatchedSizes, "The fundamental matrix must be a 3x3 matrix");
+        CV_Error( cv::Error::StsUnmatchedSizes, "The fundamental matrix must be a 3x3 matrix");
     if (!(((F_->type & CV_MAT_TYPE_MASK) >> 3) == 0 ))
-        CV_Error( CV_StsUnsupportedFormat, "The fundamental matrix must be a single-channel matrix" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "The fundamental matrix must be a single-channel matrix" );
     if (!(points1_->rows == 1 && points2_->rows == 1 && points1_->cols == points2_->cols))
-        CV_Error( CV_StsUnmatchedSizes, "The point-matrices must have one row, and an equal number of columns" );
+        CV_Error( cv::Error::StsUnmatchedSizes, "The point-matrices must have one row, and an equal number of columns" );
     if (((points1_->type & CV_MAT_TYPE_MASK) >> 3) != 1 )
-        CV_Error( CV_StsUnmatchedSizes, "The first set of points must contain two channels; one for x and one for y" );
+        CV_Error( cv::Error::StsUnmatchedSizes, "The first set of points must contain two channels; one for x and one for y" );
     if (((points2_->type & CV_MAT_TYPE_MASK) >> 3) != 1 )
-        CV_Error( CV_StsUnmatchedSizes, "The second set of points must contain two channels; one for x and one for y" );
+        CV_Error( cv::Error::StsUnmatchedSizes, "The second set of points must contain two channels; one for x and one for y" );
     if (new_points1 != NULL) {
         CV_Assert(CV_IS_MAT(new_points1));
         if (new_points1->cols != points1_->cols || new_points1->rows != 1)
-            CV_Error( CV_StsUnmatchedSizes, "The first output matrix must have the same dimensions as the input matrices" );
+            CV_Error( cv::Error::StsUnmatchedSizes, "The first output matrix must have the same dimensions as the input matrices" );
         if (CV_MAT_CN(new_points1->type) != 2)
-            CV_Error( CV_StsUnsupportedFormat, "The first output matrix must have two channels; one for x and one for y" );
+            CV_Error( cv::Error::StsUnsupportedFormat, "The first output matrix must have two channels; one for x and one for y" );
     }
     if (new_points2 != NULL) {
         CV_Assert(CV_IS_MAT(new_points2));
         if (new_points2->cols != points2_->cols || new_points2->rows != 1)
-            CV_Error( CV_StsUnmatchedSizes, "The second output matrix must have the same dimensions as the input matrices" );
+            CV_Error( cv::Error::StsUnmatchedSizes, "The second output matrix must have the same dimensions as the input matrices" );
         if (CV_MAT_CN(new_points2->type) != 2)
-            CV_Error( CV_StsUnsupportedFormat, "The second output matrix must have two channels; one for x and one for y" );
+            CV_Error( cv::Error::StsUnsupportedFormat, "The second output matrix must have two channels; one for x and one for y" );
     }
 
     // Make sure F uses double precision
diff --git a/modules/calib3d/src/undistort.simd.hpp b/modules/calib3d/src/undistort.simd.hpp
index 7998a3b086ea..70bac4470245 100644
--- a/modules/calib3d/src/undistort.simd.hpp
+++ b/modules/calib3d/src/undistort.simd.hpp
@@ -89,8 +89,8 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
         s2(_s2),
         s3(_s3),
         s4(_s4) {
-#if CV_SIMD_64F
-        for (int i = 0; i < 2 * v_float64::nlanes; ++i)
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+        for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
         {
             s_x[i] = ir[0] * i;
             s_y[i] = ir[3] * i;
@@ -123,26 +123,26 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
             else
                 CV_Assert(m1 != NULL);
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
             const v_float64 v_one = vx_setall_f64(1.0);
-            for (; j <= size.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes, _x += 2*v_float64::nlanes * ir[0], _y += 2*v_float64::nlanes * ir[3], _w += 2*v_float64::nlanes * ir[6])
+            for (; j <= size.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes(), _x += 2*VTraits<v_float64>::vlanes() * ir[0], _y += 2*VTraits<v_float64>::vlanes() * ir[3], _w += 2*VTraits<v_float64>::vlanes() * ir[6])
             {
                 v_float64 m_0, m_1, m_2, m_3;
-                m_2 = v_one / (vx_setall_f64(_w) + vx_load(s_w));
-                m_3 = v_one / (vx_setall_f64(_w) + vx_load(s_w + v_float64::nlanes));
+                m_2 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w)));
+                m_3 = v_div(v_one, v_add(vx_setall_f64(_w), vx_load(this->s_w + VTraits<v_float64>::vlanes())));
                 m_0 = vx_setall_f64(_x); m_1 = vx_setall_f64(_y);
-                v_float64 x_0 = (m_0 + vx_load(s_x)) * m_2;
-                v_float64 x_1 = (m_0 + vx_load(s_x + v_float64::nlanes)) * m_3;
-                v_float64 y_0 = (m_1 + vx_load(s_y)) * m_2;
-                v_float64 y_1 = (m_1 + vx_load(s_y + v_float64::nlanes)) * m_3;
+                v_float64 x_0 = v_mul(v_add(m_0, vx_load(this->s_x)), m_2);
+                v_float64 x_1 = v_mul(v_add(m_0, vx_load(this->s_x + VTraits<v_float64>::vlanes())), m_3);
+                v_float64 y_0 = v_mul(v_add(m_1, vx_load(this->s_y)), m_2);
+                v_float64 y_1 = v_mul(v_add(m_1, vx_load(this->s_y + VTraits<v_float64>::vlanes())), m_3);
 
-                v_float64 xd_0 = x_0 * x_0;
-                v_float64 yd_0 = y_0 * y_0;
-                v_float64 xd_1 = x_1 * x_1;
-                v_float64 yd_1 = y_1 * y_1;
+                v_float64 xd_0 = v_mul(x_0, x_0);
+                v_float64 yd_0 = v_mul(y_0, y_0);
+                v_float64 xd_1 = v_mul(x_1, x_1);
+                v_float64 yd_1 = v_mul(y_1, y_1);
 
-                v_float64 r2_0 = xd_0 + yd_0;
-                v_float64 r2_1 = xd_1 + yd_1;
+                v_float64 r2_0 = v_add(xd_0, yd_0);
+                v_float64 r2_1 = v_add(xd_1, yd_1);
 
                 m_1 = vx_setall_f64(k3);
                 m_2 = vx_setall_f64(k2);
@@ -151,18 +151,18 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
                 m_1 = v_muladd(v_muladd(v_muladd(m_1, r2_1, m_2), r2_1, m_3), r2_1, v_one);
                 m_3 = vx_setall_f64(k6);
                 m_2 = vx_setall_f64(k5);
-                m_0 /= v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(k4)), r2_0, v_one);
-                m_1 /= v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(k4)), r2_1, v_one);
+                m_0 = v_div(m_0, v_muladd(v_muladd(v_muladd(m_3, r2_0, m_2), r2_0, vx_setall_f64(this->k4)), r2_0, v_one));
+                m_1 = v_div(m_1, v_muladd(v_muladd(v_muladd(m_3, r2_1, m_2), r2_1, vx_setall_f64(this->k4)), r2_1, v_one));
 
                 m_3 = vx_setall_f64(2.0);
                 xd_0 = v_muladd(m_3, xd_0, r2_0);
                 yd_0 = v_muladd(m_3, yd_0, r2_0);
                 xd_1 = v_muladd(m_3, xd_1, r2_1);
                 yd_1 = v_muladd(m_3, yd_1, r2_1);
-                m_2 = x_0 * y_0 * m_3;
-                m_3 = x_1 * y_1 * m_3;
+                m_2 = v_mul(v_mul(x_0, y_0), m_3);
+                m_3 = v_mul(v_mul(x_1, y_1), m_3);
 
-                x_0 *= m_0; y_0 *= m_0; x_1 *= m_1; y_1 *= m_1;
+                x_0 = v_mul(x_0, m_0); y_0 = v_mul(y_0, m_0); x_1 = v_mul(x_1, m_1); y_1 = v_mul(y_1, m_1);
 
                 m_0 = vx_setall_f64(p1);
                 m_1 = vx_setall_f64(p2);
@@ -176,8 +176,8 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
                 xd_1 = v_muladd(m_0, m_3, xd_1);
                 yd_1 = v_muladd(m_1, m_3, yd_1);
 
-                m_0 = r2_0 * r2_0;
-                m_1 = r2_1 * r2_1;
+                m_0 = v_mul(r2_0, r2_0);
+                m_1 = v_mul(r2_1, r2_1);
                 m_2 = vx_setall_f64(s2);
                 m_3 = vx_setall_f64(s1);
                 xd_0 = v_muladd(m_3, r2_0, v_muladd(m_2, m_0, xd_0));
@@ -203,17 +203,17 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
                 r2_0 = v_muladd(m_0, xd_0, v_muladd(m_1, yd_0, m_2));
                 r2_1 = v_muladd(m_0, xd_1, v_muladd(m_1, yd_1, m_2));
                 m_0 = vx_setzero_f64();
-                r2_0 = v_select(r2_0 == m_0, v_one, v_one / r2_0);
-                r2_1 = v_select(r2_1 == m_0, v_one, v_one / r2_1);
+                r2_0 = v_select(v_eq(r2_0, m_0), v_one, v_div(v_one, r2_0));
+                r2_1 = v_select(v_eq(r2_1, m_0), v_one, v_div(v_one, r2_1));
 
                 m_0 = vx_setall_f64(fx);
                 m_1 = vx_setall_f64(u0);
                 m_2 = vx_setall_f64(fy);
                 m_3 = vx_setall_f64(v0);
-                x_0 = v_muladd(m_0 * r2_0, x_0, m_1);
-                y_0 = v_muladd(m_2 * r2_0, y_0, m_3);
-                x_1 = v_muladd(m_0 * r2_1, x_1, m_1);
-                y_1 = v_muladd(m_2 * r2_1, y_1, m_3);
+                x_0 = v_muladd(v_mul(m_0, r2_0), x_0, m_1);
+                y_0 = v_muladd(v_mul(m_2, r2_0), y_0, m_3);
+                x_1 = v_muladd(v_mul(m_0, r2_1), x_1, m_1);
+                y_1 = v_muladd(v_mul(m_2, r2_1), y_1, m_3);
 
                 if (m1type == CV_32FC1)
                 {
@@ -225,20 +225,20 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
                     v_float32 mf0, mf1;
                     v_zip(v_cvt_f32(x_0, x_1), v_cvt_f32(y_0, y_1), mf0, mf1);
                     v_store(&m1f[j * 2], mf0);
-                    v_store(&m1f[j * 2 + v_float32::nlanes], mf1);
+                    v_store(&m1f[j * 2 + VTraits<v_float32>::vlanes()], mf1);
                 }
                 else // m1type == CV_16SC2
                 {
                     m_0 = vx_setall_f64(INTER_TAB_SIZE);
-                    x_0 *= m_0; x_1 *= m_0; y_0 *= m_0; y_1 *= m_0;
+                    x_0 = v_mul(x_0, m_0); x_1 = v_mul(x_1, m_0); y_0 = v_mul(y_0, m_0); y_1 = v_mul(y_1, m_0);
 
                     v_int32 mask = vx_setall_s32(INTER_TAB_SIZE - 1);
                     v_int32 iu = v_round(x_0, x_1);
                     v_int32 iv = v_round(y_0, y_1);
 
-                    v_pack_u_store(&m2[j], (iu & mask) + (iv & mask) * vx_setall_s32(INTER_TAB_SIZE));
+                    v_pack_u_store(&m2[j], v_add(v_and(iu, mask), v_mul(v_and(iv, mask), vx_setall_s32(INTER_TAB_SIZE))));
                     v_int32 out0, out1;
-                    v_zip(iu >> INTER_BITS, iv >> INTER_BITS, out0, out1);
+                    v_zip(v_shr<INTER_BITS>(iu), v_shr<INTER_BITS>(iv), out0, out1);
                     v_store(&m1[j * 2], v_pack(out0, out1));
                 }
             }
@@ -302,10 +302,10 @@ class initUndistortRectifyMapComputer : public ParallelLoopBody
     double s2;
     double s3;
     double s4;
-#if CV_SIMD_64F
-    double s_x[2*v_float64::nlanes];
-    double s_y[2*v_float64::nlanes];
-    double s_w[2*v_float64::nlanes];
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    double s_x[2*VTraits<v_float64>::max_nlanes];
+    double s_y[2*VTraits<v_float64>::max_nlanes];
+    double s_w[2*VTraits<v_float64>::max_nlanes];
 #endif
 };
 }
diff --git a/modules/calib3d/src/usac.hpp b/modules/calib3d/src/usac.hpp
index 85b2730e4a10..9b66a4576f0a 100644
--- a/modules/calib3d/src/usac.hpp
+++ b/modules/calib3d/src/usac.hpp
@@ -176,7 +176,7 @@ class EpipolarNonMinimalSolver : public NonMinimalSolver {
 //-------------------------- ESSENTIAL MATRIX -----------------------
 class EssentialNonMinimalSolverViaF : public NonMinimalSolver {
 public:
-static Ptr<EssentialNonMinimalSolverViaF> create(const Mat &points_, const cv::Mat &K1, const Mat &K2);
+    static Ptr<EssentialNonMinimalSolverViaF> create(const Mat &points_, const cv::Mat &K1, const Mat &K2);
 };
 
 class EssentialNonMinimalSolverViaT : public NonMinimalSolver {
@@ -210,12 +210,12 @@ class LarssonOptimizer : public NonMinimalSolver {
 class Score {
 public:
     int inlier_number;
-    double score;
+    float score;
     Score () { // set worst case
         inlier_number = 0;
-        score = std::numeric_limits<double>::max();
+        score = std::numeric_limits<float>::max();
     }
-    Score (int inlier_number_, double score_) { // copy constructor
+    Score (int inlier_number_, float score_) { // copy constructor
         inlier_number = inlier_number_;
         score = score_;
     }
@@ -254,7 +254,7 @@ class Quality : public Algorithm {
     // get @inliers of the @model for given threshold
     virtual int getInliers (const Mat &model, std::vector<int> &inliers, double thr) const = 0;
     // Set the best score, so evaluation of the model can terminate earlier
-    virtual void setBestScore (double best_score_) = 0;
+    virtual void setBestScore (float best_score_) = 0;
     // set @inliers_mask: true if point i is inlier, false - otherwise.
     virtual int getInliers (const Mat &model, std::vector<bool> &inliers_mask) const = 0;
     virtual int getPointsSize() const = 0;
@@ -432,7 +432,7 @@ class FundamentalEstimator : public Estimator {
 };
 
 class EssentialEstimator : public Estimator {
-public :
+public:
     static Ptr<EssentialEstimator> create (const Ptr<MinimalSolver> &min_solver_,
             const Ptr<NonMinimalSolver> &non_min_solver_, const Ptr<Degeneracy> &degeneracy_);
 };
@@ -542,11 +542,6 @@ class GridNeighborhoodGraph : public NeighborhoodGraph {
             int cell_size_x_img1_, int cell_size_y_img1_,
             int cell_size_x_img2_, int cell_size_y_img2_, int max_neighbors);
 };
-class GridNeighborhoodGraph2Images : public NeighborhoodGraph {
-public:
-    static Ptr<GridNeighborhoodGraph2Images> create(const Mat &points, int points_size,
-        float cell_size_x_img1_, float cell_size_y_img1_, float cell_size_x_img2_, float cell_size_y_img2_);
-};
 
 ////////////////////////////////////// UNIFORM SAMPLER ////////////////////////////////////////////
 class UniformSampler : public Sampler {
@@ -554,11 +549,6 @@ class UniformSampler : public Sampler {
     static Ptr<UniformSampler> create(int state, int sample_size_, int points_size_);
 };
 
-class QuasiUniformSampler : public Sampler {
-public:
-    static Ptr<QuasiUniformSampler> create(int state, int sample_size_, int points_size_);
-};
-
 /////////////////////////////////// PROSAC (SIMPLE) SAMPLER ///////////////////////////////////////
 class ProsacSimpleSampler : public Sampler {
 public:
diff --git a/modules/calib3d/src/usac/degeneracy.cpp b/modules/calib3d/src/usac/degeneracy.cpp
index 6ccf2bb497ee..bf5ec2b1101c 100644
--- a/modules/calib3d/src/usac/degeneracy.cpp
+++ b/modules/calib3d/src/usac/degeneracy.cpp
@@ -112,7 +112,7 @@ class HomographyDegeneracyImpl : public HomographyDegeneracy {
             return false;
 
         // Checks if points are not collinear
-        // If area of triangle constructed with 3 points is less then threshold then points are collinear:
+        // If area of triangle constructed with 3 points is less than threshold then points are collinear:
         //           |x1 y1 1|             |x1      y1      1|
         // (1/2) det |x2 y2 1| = (1/2) det |x2-x1   y2-y1   0| = det |x2-x1   y2-y1| < 2 * threshold
         //           |x3 y3 1|             |x3-x1   y3-y1   0|       |x3-x1   y3-y1|
diff --git a/modules/calib3d/src/usac/dls_solver.cpp b/modules/calib3d/src/usac/dls_solver.cpp
index ca6a66450422..2e866b938920 100644
--- a/modules/calib3d/src/usac/dls_solver.cpp
+++ b/modules/calib3d/src/usac/dls_solver.cpp
@@ -155,7 +155,12 @@ class DLSPnPImpl : public DLSPnP {
         const auto &eigen_vectors = eigen_solver.eigenvectors();
         const auto &eigen_values = eigen_solver.eigenvalues();
 #else
+
+#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64)
+        long mat_order = 27, info, lda = 27, ldvl = 1, ldvr = 27, lwork = 500;
+#else
         int mat_order = 27, info, lda = 27, ldvl = 1, ldvr = 27, lwork = 500;
+#endif
         double wr[27], wi[27] = {0}; // 27 = mat_order
         std::vector<double> work(lwork), eig_vecs(729);
         char jobvl = 'N', jobvr = 'V'; // only left eigen vectors are computed
diff --git a/modules/calib3d/src/usac/essential_solver.cpp b/modules/calib3d/src/usac/essential_solver.cpp
index 6dad2a4b8838..8f9b5b9d330f 100644
--- a/modules/calib3d/src/usac/essential_solver.cpp
+++ b/modules/calib3d/src/usac/essential_solver.cpp
@@ -142,7 +142,7 @@ class EssentialMinimalSolver5ptsImpl : public EssentialMinimalSolver5pts {
             }
 
             std::vector<double> c(11), rs;
-            // filling coefficients of 10-degree polynomial satysfying zero-determinant constraint of essential matrix, ie., det(E) = 0
+            // filling coefficients of 10-degree polynomial satisfying zero-determinant constraint of essential matrix, ie., det(E) = 0
             // based on "An Efficient Solution to the Five-Point Relative Pose Problem" (David Nister)
             // same as in five-point.cpp
             c[10] = (b[0]*b[17]*b[34]+b[26]*b[4]*b[21]-b[26]*b[17]*b[8]-b[13]*b[4]*b[34]-b[0]*b[21]*b[30]+b[13]*b[30]*b[8]);
@@ -175,6 +175,8 @@ class EssentialMinimalSolver5ptsImpl : public EssentialMinimalSolver5pts {
                 Matx33d Bz(bz);
                 // Bz is rank 2, matrix, so epipole is its null-vector
                 Vec3d xy1 = Utils::getRightEpipole(Mat(Bz * (1/sqrt(norm_bz))));
+                const double one_over_xy1_norm = 1 / sqrt(xy1[0] * xy1[0] + xy1[1] * xy1[1] + xy1[2] * xy1[2]);
+                xy1 *= one_over_xy1_norm;
 
                 if (fabs(xy1(2)) < 1e-10) continue;
                 Mat_<double> E(3,3);
@@ -239,7 +241,8 @@ class EssentialMinimalSolver5ptsImpl : public EssentialMinimalSolver5pts {
             // (5) Compute the left eigenvectors of the action matrix
             Eigen::EigenSolver<Eigen::Matrix<double, 10, 10>> eigensolver(action_mat_eig);
             const Eigen::VectorXcd &eigenvalues = eigensolver.eigenvalues();
-            const auto * const eig_vecs_ = (double *) eigensolver.eigenvectors().real().data();
+            const Eigen::MatrixXcd eigenvectors = eigensolver.eigenvectors();
+            const auto * const eig_vecs_ = (double *) eigenvectors.data();
 #else
             Matx<double, 10, 10> A = constraint_mat.colRange(0, 10),
                              B = constraint_mat.colRange(10, 20), eliminated_mat;
@@ -256,7 +259,11 @@ class EssentialMinimalSolver5ptsImpl : public EssentialMinimalSolver5pts {
             action_mat_data[83] = -1.0; // 8 row, 3 col
             action_mat_data[96] = -1.0; // 9 row, 6 col
 
+#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64)
+            long mat_order = 10, info, lda = 10, ldvl = 10, ldvr = 1, lwork = 100;
+#else
             int mat_order = 10, info, lda = 10, ldvl = 10, ldvr = 1, lwork = 100;
+#endif
             double wr[10], wi[10] = {0}, eig_vecs[100], work[100]; // 10 = mat_order, 100 = lwork
             char jobvl = 'V', jobvr = 'N'; // only left eigen vectors are computed
             OCV_LAPACK_FUNC(dgeev)(&jobvl, &jobvr, &mat_order, action_mat_data, &lda, wr, wi, eig_vecs, &ldvl,
diff --git a/modules/calib3d/src/usac/fundamental_solver.cpp b/modules/calib3d/src/usac/fundamental_solver.cpp
index 4083b7610283..a5e3b30fba96 100644
--- a/modules/calib3d/src/usac/fundamental_solver.cpp
+++ b/modules/calib3d/src/usac/fundamental_solver.cpp
@@ -438,7 +438,7 @@ class CovarianceEpipolarSolverImpl : public CovarianceEpipolarSolver {
     explicit CovarianceEpipolarSolverImpl (const Mat &points_, bool is_fundamental_) {
         points_size = points_.rows;
         is_fundamental = is_fundamental_;
-        if (is_fundamental) { // normalize image points only for fundmantal matrix
+        if (is_fundamental) { // normalize image points only for fundamental matrix
             std::vector<int> sample(points_size);
             for (int i = 0; i < points_size; i++) sample[i] = i;
             const Ptr<NormTransform> normTr = NormTransform::create(points_);
@@ -558,7 +558,7 @@ class LarssonOptimizerImpl : public LarssonOptimizer {
         const auto * const pts_ = (float *) calib_points.data;
         // a few point are enough to test
         // actually due to Sampson error minimization, the input R,t do not really matter
-        // for a correct pair there is a sligthly faster convergence
+        // for a correct pair there is a slightly faster convergence
         for (int i = 0; i < 3; i++) { // could be 1 point
             const int rand_inl = 4 * sample[rng.uniform(0, sample_size)];
             Vec3d p1 (pts_[rand_inl], pts_[rand_inl+1], 1), p2(pts_[rand_inl+2], pts_[rand_inl+3], 1);
diff --git a/modules/calib3d/src/usac/gamma_values.cpp b/modules/calib3d/src/usac/gamma_values.cpp
index fa1c03e8ed97..191ae38a0f4b 100644
--- a/modules/calib3d/src/usac/gamma_values.cpp
+++ b/modules/calib3d/src/usac/gamma_values.cpp
@@ -13,7 +13,6 @@ class GammaValuesImpl : public GammaValues {
     int max_size_table, DoF;
 public:
     GammaValuesImpl (int DoF_, int max_size_table_) {
-        max_size_table = max_size_table_;
         max_size_table = max_size_table_;
         DoF = DoF_;
         /*
diff --git a/modules/calib3d/src/usac/pnp_solver.cpp b/modules/calib3d/src/usac/pnp_solver.cpp
index b7b136d1e2fe..db0477090863 100644
--- a/modules/calib3d/src/usac/pnp_solver.cpp
+++ b/modules/calib3d/src/usac/pnp_solver.cpp
@@ -196,7 +196,7 @@ class PnPNonMinimalSolverImpl : public PnPNonMinimalSolver {
                 a2[10] = v * Z;
                 a2[11] = v;
 
-                // fill covarinace matrix
+                // fill covariance matrix
                 for (int j = 0; j < 12; j++)
                     for (int z = j; z < 12; z++)
                         AtA[j * 12 + z] += a1[j] * a1[z] + a2[j] * a2[z];
@@ -227,7 +227,7 @@ class PnPNonMinimalSolverImpl : public PnPNonMinimalSolver {
                 a2[10] = v * weight_Z;
                 a2[11] = v * weight;
 
-                // fill covarinace matrix
+                // fill covariance matrix
                 for (int j = 0; j < 12; j++)
                     for (int z = j; z < 12; z++)
                         AtA[j * 12 + z] += a1[j] * a1[z] + a2[j] * a2[z];
diff --git a/modules/calib3d/src/usac/quality.cpp b/modules/calib3d/src/usac/quality.cpp
index 89c5760c1da6..9a72f754f58d 100644
--- a/modules/calib3d/src/usac/quality.cpp
+++ b/modules/calib3d/src/usac/quality.cpp
@@ -69,7 +69,7 @@ class RansacQualityImpl : public RansacQuality {
             else if (inlier_number - point < preemptive_thr)
                     break;
         // score is negative inlier number! If less then better
-        return {inlier_number, -static_cast<double>(inlier_number)};
+        return {inlier_number, -static_cast<float>(inlier_number)};
     }
 
     Score getScore (const std::vector<float> &errors) const override {
@@ -78,10 +78,10 @@ class RansacQualityImpl : public RansacQuality {
             if (errors[point] < threshold)
                 inlier_number++;
         // score is negative inlier number! If less then better
-        return {inlier_number, -static_cast<double>(inlier_number)};
+        return {inlier_number, -static_cast<float>(inlier_number)};
     }
 
-    void setBestScore(double best_score_) override {
+    void setBestScore(float best_score_) override {
         if (best_score > best_score_) best_score = best_score_;
     }
 
@@ -106,18 +106,17 @@ class MsacQualityImpl : public MsacQuality {
     const Ptr<Error> error;
     const int points_size;
     const double threshold, k_msac;
-    double best_score, norm_thr, one_over_thr;
+    const float norm_thr, one_over_thr;
+    float best_score;
 public:
     MsacQualityImpl (int points_size_, double threshold_, const Ptr<Error> &error_, double k_msac_)
-            : error (error_), points_size (points_size_), threshold (threshold_), k_msac(k_msac_) {
-        best_score = std::numeric_limits<double>::max();
-        norm_thr = threshold*k_msac;
-        one_over_thr = 1 / norm_thr;
-    }
+            : error (error_), points_size (points_size_), threshold (threshold_), k_msac(k_msac_),
+              norm_thr(static_cast<float>(threshold*k_msac)), one_over_thr(1.f/norm_thr),
+              best_score(std::numeric_limits<float>::max()) {}
 
     inline Score getScore (const Mat &model) const override {
         error->setModelParameters(model);
-        double err, sum_errors = 0;
+        float err, sum_errors = 0;
         int inlier_number = 0;
         const auto preemptive_thr = points_size + best_score;
         for (int point = 0; point < points_size; point++) {
@@ -133,7 +132,7 @@ class MsacQualityImpl : public MsacQuality {
     }
 
     Score getScore (const std::vector<float> &errors) const override {
-        double sum_errors = 0;
+        float sum_errors = 0;
         int inlier_number = 0;
         for (int point = 0; point < points_size; point++) {
             const auto err = errors[point];
@@ -146,7 +145,7 @@ class MsacQualityImpl : public MsacQuality {
         return {inlier_number, sum_errors};
     }
 
-    void setBestScore(double best_score_) override {
+    void setBestScore(float best_score_) override {
         if (best_score > best_score_) best_score = best_score_;
     }
 
@@ -244,7 +243,7 @@ class MagsacQualityImpl : public MagsacQuality {
             } else if (total_loss + point_idx > preemptive_thr)
                 break;
         }
-        return {num_tentative_inliers, total_loss};
+        return {num_tentative_inliers, (float)total_loss};
     }
 
     Score getScore (const std::vector<float> &errors) const override {
@@ -263,10 +262,10 @@ class MagsacQualityImpl : public MagsacQuality {
                         (stored_complete_gamma_values[x] - gamma_value_of_k)) * norm_loss);
             }
         }
-        return {num_tentative_inliers, total_loss};
+        return {num_tentative_inliers, (float)total_loss};
     }
 
-    void setBestScore (double best_loss) override {
+    void setBestScore (float best_loss) override {
         if (previous_best_loss > best_loss) previous_best_loss = best_loss;
     }
 
@@ -317,7 +316,7 @@ class LMedsQualityImpl : public LMedsQuality {
         return {inlier_number, Utils::findMedian (errors)};
     }
 
-    void setBestScore (double /*best_score*/) override {}
+    void setBestScore (float /*best_score*/) override {}
 
     int getPointsSize () const override { return points_size; }
     int getInliers (const Mat &model, std::vector<int> &inliers) const override
@@ -487,9 +486,9 @@ class AdaptiveSPRTImpl : public AdaptiveSPRT {
         if (last_model_is_good && do_sprt) {
             out_score.inlier_number = tested_inliers;
             if (score_type == ScoreMethod::SCORE_METHOD_MSAC)
-                out_score.score = sum_errors;
+                out_score.score = static_cast<float>(sum_errors);
             else if (score_type == ScoreMethod::SCORE_METHOD_RANSAC)
-                out_score.score = -static_cast<double>(tested_inliers);
+                out_score.score = -static_cast<float>(tested_inliers);
             else out_score = quality->getScore(errors);
         }
         return last_model_is_good;
diff --git a/modules/calib3d/src/usac/ransac_solvers.cpp b/modules/calib3d/src/usac/ransac_solvers.cpp
index 1a7635333152..494bbc1517cb 100644
--- a/modules/calib3d/src/usac/ransac_solvers.cpp
+++ b/modules/calib3d/src/usac/ransac_solvers.cpp
@@ -294,7 +294,7 @@ class Ransac {
                     params->getUpperIncompleteOfSigmaQuantile()); break;
             case ScoreMethod::SCORE_METHOD_LMEDS :
                 quality = LMedsQuality::create(points_size, threshold, error); break;
-            default: CV_Error(cv::Error::StsNotImplemented, "Score is not imeplemeted!");
+            default: CV_Error(cv::Error::StsNotImplemented, "Score is not implemented!");
         }
 
         const auto is_ge_solver = params->getRansacSolver() == GEM_SOLVER;
@@ -733,7 +733,7 @@ class Ransac {
             const bool is_prosac = params->getSampler() == SamplingMethod::SAMPLING_PROSAC;
             std::atomic_bool success(false);
             std::atomic_int num_hypothesis_tested(0), thread_cnt(0), max_number_inliers(0), subset_size, termination_length;
-            std::atomic<double> best_score_all(std::numeric_limits<double>::max());
+            std::atomic<float> best_score_all(std::numeric_limits<float>::max());
             std::vector<Score> best_scores(MAX_THREADS), best_scores_not_LO;
             std::vector<Mat> best_models(MAX_THREADS), best_models_not_LO, K1_apx, K2_apx;
             std::vector<int> num_tested_models_threads(MAX_THREADS), growth_function, non_random_inliers;
@@ -782,7 +782,7 @@ class Ransac {
                         model_verifier, local_optimization, termination, sampler, lo_sampler, weight_fnc, true);
                 bool is_last_from_LO_thread = false;
                 Mat best_model_thread, non_degenerate_model, lo_model, best_not_LO_thread;
-                Score best_score_thread, current_score, non_denegenerate_model_score, lo_score,best_score_all_threads, best_not_LO_score_thread;
+                Score best_score_thread, current_score, non_denegenerate_model_score, lo_score, best_score_all_threads, best_not_LO_score_thread;
                 std::vector<int> sample(estimator->getMinimalSampleSize()), best_sample_thread, supports;
                 supports.reserve(3*MAX_MODELS_ADAPT); // store model supports
                 std::vector<bool> best_inliers_mask_local(points_size, false), model_inliers_mask(points_size, false);
@@ -790,7 +790,8 @@ class Ransac {
                 auto update_best = [&] (const Score &new_score, const Mat &new_model, bool from_LO=false) {
                     // update best score of all threads
                     if (max_number_inliers < new_score.inlier_number) max_number_inliers = new_score.inlier_number;
-                    if (best_score_all > new_score.score) best_score_all = new_score.score;
+                    if (best_score_all > new_score.score)
+                        best_score_all = new_score.score;
                     best_score_all_threads = Score(max_number_inliers, best_score_all);
                     //
                     quality->getInliers(new_model, model_inliers_mask);
@@ -839,7 +840,7 @@ class Ransac {
                     success = num_hypothesis_tested++ > max_iters;
                     if (iters % 10 && !adapt) {
                         // Synchronize threads. just to speed verification of model.
-                        quality->setBestScore(std::min(best_score_thread.score, (double)best_score_all));
+                        quality->setBestScore(std::min(best_score_thread.score, (float)best_score_all));
                         model_verifier->update(best_score_thread.inlier_number > max_number_inliers ? best_score_thread : best_score_all_threads, iters);
                     }
 
diff --git a/modules/calib3d/src/usac/sampler.cpp b/modules/calib3d/src/usac/sampler.cpp
index 2095ee8b4da6..1938bde918bc 100644
--- a/modules/calib3d/src/usac/sampler.cpp
+++ b/modules/calib3d/src/usac/sampler.cpp
@@ -62,8 +62,8 @@ Ptr<UniformSampler> UniformSampler::create(int state, int sample_size_, int poin
 /////////////////////////////////// PROSAC (SIMPLE) SAMPLER ///////////////////////////////////////
 /*
 * PROSAC (simple) sampler does not use array of precalculated T_n (n is subset size) samples, but computes T_n for
-* specific n directy in generateSample() function.
-* Also, the stopping length (or maximum subset size n*) by default is set to points_size (N) and does not updating
+* specific n directly in generateSample() function.
+* Also, the stopping length (or maximum subset size n*) by default is set to points_size (N) and does not update
 * during computation.
 */
 class ProsacSimpleSamplerImpl : public ProsacSimpleSampler {
@@ -176,7 +176,7 @@ class ProsacSamplerImpl : public ProsacSampler {
     // In our experiments, the parameter was set to T_N = 200000
     int growth_max_samples;
 
-    // how many time PROSAC generateSample() was called
+    // how many times PROSAC generateSample() was called
     int kth_sample_number;
     Ptr<UniformRandomGenerator> random_gen;
 public:
@@ -488,7 +488,7 @@ class NapsacSamplerImpl : public NapsacSampler {
 
         points_large_neighborhood_size = 0;
 
-        // find indicies of points that have sufficient neighborhood (at least sample_size-1)
+        // find indices of points that have sufficient neighborhood (at least sample_size-1)
         for (int pt_idx = 0; pt_idx < points_size; pt_idx++)
             if ((int)neighborhood_graph->getNeighbors(pt_idx).size() >= sample_size-1)
                 points_large_neighborhood[points_large_neighborhood_size++] = pt_idx;
diff --git a/modules/calib3d/src/usac/termination.cpp b/modules/calib3d/src/usac/termination.cpp
index 803b060e4159..26a9e331ed96 100644
--- a/modules/calib3d/src/usac/termination.cpp
+++ b/modules/calib3d/src/usac/termination.cpp
@@ -19,7 +19,7 @@ class StandardTerminationCriteriaImpl : public StandardTerminationCriteria {
 
     /*
      * Get upper bound iterations for any sample number
-     * n is points size, w is inlier ratio, p is desired probability, k is expceted number of iterations.
+     * n is points size, w is inlier ratio, p is desired probability, k is expected number of iterations.
      * 1 - p = (1 - w^n)^k,
      * k = log_(1-w^n) (1-p)
      * k = ln (1-p) / ln (1-w^n)
diff --git a/modules/calib3d/src/usac/utils.cpp b/modules/calib3d/src/usac/utils.cpp
index 5e2206702fe8..8d95fb9c3360 100644
--- a/modules/calib3d/src/usac/utils.cpp
+++ b/modules/calib3d/src/usac/utils.cpp
@@ -169,6 +169,9 @@ class SolvePoly : public SolverPoly {
     int getRealRoots (const std::vector<double> &coeffs, std::vector<double> &real_roots) override {
         if (coeffs.empty())
             return 0;
+        for (auto c : coeffs)
+            if (cvIsNaN(c) || cvIsInf(c))
+                return 0;
         Poly input(coeffs);
         if (input.degree() < 1)
             return 0;
@@ -344,10 +347,10 @@ void Utils::decomposeProjection (const Mat &P, Matx33d &K, Matx33d &R, Vec3d &t,
 }
 
 double Utils::getPoissonCDF (double lambda, int inliers) {
-    double exp_lamda = exp(-lambda), cdf = exp_lamda, lambda_i_div_fact_i = 1;
+    double exp_lambda = exp(-lambda), cdf = exp_lambda, lambda_i_div_fact_i = 1;
     for (int i = 1; i <= inliers; i++) {
         lambda_i_div_fact_i *= (lambda / i);
-        cdf += exp_lamda * lambda_i_div_fact_i;
+        cdf += exp_lambda * lambda_i_div_fact_i;
         if (fabs(cdf - 1) < DBL_EPSILON) // cdf is almost 1
             break;
     }
@@ -826,94 +829,4 @@ Ptr<GridNeighborhoodGraph> GridNeighborhoodGraph::create(const Mat &points,
     return makePtr<GridNeighborhoodGraphImpl>(points, points_size,
       cell_size_x_img1_, cell_size_y_img1_, cell_size_x_img2_, cell_size_y_img2_, max_neighbors);
 }
-
-class GridNeighborhoodGraph2ImagesImpl : public GridNeighborhoodGraph2Images {
-private:
-    // This struct is used for the nearest neighbors search by griding two images.
-    struct CellCoord {
-        int c1x, c1y;
-        CellCoord (int c1x_, int c1y_) {
-            c1x = c1x_; c1y = c1y_;
-        }
-        bool operator==(const CellCoord &o) const {
-            return c1x == o.c1x && c1y == o.c1y;
-        }
-        bool operator<(const CellCoord &o) const {
-            if (c1x < o.c1x) return true;
-            return c1x == o.c1x && c1y < o.c1y;
-        }
-    };
-
-    std::vector<std::vector<int>> graph;
-public:
-    GridNeighborhoodGraph2ImagesImpl (const Mat &container_, int points_size,
-            float cell_size_x_img1, float cell_size_y_img1, float cell_size_x_img2, float cell_size_y_img2) {
-
-        std::map<CellCoord, std::vector<int >> neighbors_map1, neighbors_map2;
-        const auto * const container = (float *) container_.data;
-        // Key is cell position. The value is indexes of neighbors.
-
-        const auto cell_sz_x1 = 1.f / cell_size_x_img1,
-                   cell_sz_y1 = 1.f / cell_size_y_img1,
-                   cell_sz_x2 = 1.f / cell_size_x_img2,
-                   cell_sz_y2 = 1.f / cell_size_y_img2;
-        const int dimension = container_.cols;
-        for (int i = 0; i < points_size; i++) {
-            const int idx = dimension * i;
-            neighbors_map1[CellCoord((int)(container[idx  ] * cell_sz_x1),
-                                    (int)(container[idx+1] * cell_sz_y1))].emplace_back(i);
-            neighbors_map2[CellCoord((int)(container[idx+2] * cell_sz_x2),
-                                    (int)(container[idx+3] * cell_sz_y2))].emplace_back(i);
-        }
-
-        //--------- create a graph ----------
-        graph = std::vector<std::vector<int>>(points_size);
-
-        // store neighbors cells into graph (2D vector)
-        for (const auto &cell : neighbors_map1) {
-            const int neighbors_in_cell = static_cast<int>(cell.second.size());
-            // only one point in cell -> no neighbors
-            if (neighbors_in_cell < 2) continue;
-
-            const std::vector<int> &neighbors = cell.second;
-            // ---------- fill graph -----
-            // for speed-up we make no symmetric graph, eg, x has a neighbor y, but y does not have x
-            const int v_in_cell = neighbors[0];
-            // there is always at least one neighbor
-            auto &graph_row = graph[v_in_cell];
-            graph_row.reserve(neighbors_in_cell);
-            for (int n : neighbors)
-                if (n != v_in_cell)
-                    graph_row.emplace_back(n);
-        }
-
-        // fill neighbors of a second image
-        for (const auto &cell : neighbors_map2) {
-            if (cell.second.size() < 2) continue;
-            const std::vector<int> &neighbors = cell.second;
-            const int v_in_cell = neighbors[0];
-            auto &graph_row = graph[v_in_cell];
-            for (const int &n : neighbors)
-                if (n != v_in_cell) {
-                    bool has = false;
-                    for (const int &nn : graph_row)
-                        if (n == nn) {
-                            has = true; break;
-                        }
-                    if (!has) graph_row.emplace_back(n);
-                }
-        }
-    }
-    const std::vector<std::vector<int>> &getGraph () const override { return graph; }
-    inline const std::vector<int> &getNeighbors(int point_idx) const override {
-        // Note, neighbors vector also includes point_idx!
-        return graph[point_idx];
-    }
-};
-
-Ptr<GridNeighborhoodGraph2Images> GridNeighborhoodGraph2Images::create(const Mat &points,
-        int points_size, float cell_size_x_img1_, float cell_size_y_img1_, float cell_size_x_img2_, float cell_size_y_img2_) {
-    return makePtr<GridNeighborhoodGraph2ImagesImpl>(points, points_size,
-            cell_size_x_img1_, cell_size_y_img1_, cell_size_x_img2_, cell_size_y_img2_);
-}
 }}
\ No newline at end of file
diff --git a/modules/calib3d/test/test_affine2d_estimator.cpp b/modules/calib3d/test/test_affine2d_estimator.cpp
index 95f12351057e..2282dc32408c 100644
--- a/modules/calib3d/test/test_affine2d_estimator.cpp
+++ b/modules/calib3d/test/test_affine2d_estimator.cpp
@@ -115,8 +115,8 @@ TEST_P(EstimateAffine2D, testNPoints)
 
         EXPECT_NEAR(0., cvtest::norm(aff_est, aff, NORM_INF), 1e-4);
 
-        bool inliers_good = count(inliers.begin(), inliers.end(), 1) == m &&
-            m == accumulate(inliers.begin(), inliers.begin() + m, 0);
+        bool inliers_good = std::count(inliers.begin(), inliers.end(), 1) == m &&
+            m == std::accumulate(inliers.begin(), inliers.begin() + m, 0);
 
         EXPECT_TRUE(inliers_good);
     }
diff --git a/modules/calib3d/test/test_affine3d_estimator.cpp b/modules/calib3d/test/test_affine3d_estimator.cpp
index 3f1b50e5f262..bb639a4018df 100644
--- a/modules/calib3d/test/test_affine3d_estimator.cpp
+++ b/modules/calib3d/test/test_affine3d_estimator.cpp
@@ -160,8 +160,8 @@ bool CV_Affine3D_EstTest::testNPoints()
         return false;
     }
 
-    bool outl_good = count(outl.begin(), outl.end(), 1) == m &&
-        m == accumulate(outl.begin(), outl.begin() + m, 0);
+    bool outl_good = std::count(outl.begin(), outl.end(), 1) == m &&
+        m == std::accumulate(outl.begin(), outl.begin() + m, 0);
 
     if (!outl_good)
     {
diff --git a/modules/calib3d/test/test_affine_partial2d_estimator.cpp b/modules/calib3d/test/test_affine_partial2d_estimator.cpp
index 0be25ee7eb84..dbbb4da0d954 100644
--- a/modules/calib3d/test/test_affine_partial2d_estimator.cpp
+++ b/modules/calib3d/test/test_affine_partial2d_estimator.cpp
@@ -125,8 +125,8 @@ TEST_P(EstimateAffinePartial2D, testNPoints)
 
         EXPECT_NEAR(0., cvtest::norm(aff_est, aff, NORM_INF), 1e-4);
 
-        bool inliers_good = count(inliers.begin(), inliers.end(), 1) == m &&
-            m == accumulate(inliers.begin(), inliers.begin() + m, 0);
+        bool inliers_good = std::count(inliers.begin(), inliers.end(), 1) == m &&
+            m == std::accumulate(inliers.begin(), inliers.begin() + m, 0);
 
         EXPECT_TRUE(inliers_good);
     }
diff --git a/modules/calib3d/test/test_calibration_hand_eye.cpp b/modules/calib3d/test/test_calibration_hand_eye.cpp
index aa8b34d6d962..edfc5fec0ef6 100644
--- a/modules/calib3d/test/test_calibration_hand_eye.cpp
+++ b/modules/calib3d/test/test_calibration_hand_eye.cpp
@@ -756,4 +756,110 @@ TEST(Calib3d_CalibrateRobotWorldHandEye, regression)
     }
 }
 
+TEST(Calib3d_CalibrateHandEye, regression_24871)
+{
+    std::vector<Mat> R_target2cam, t_target2cam;
+    std::vector<Mat> R_gripper2base, t_gripper2base;
+    Mat T_true_cam2gripper;
+
+    T_true_cam2gripper = (cv::Mat_<double>(4, 4) <<  0,  0, -1, 0.1,
+                                                     1,  0,  0, 0.2,
+                                                     0, -1,  0, 0.3,
+                                                     0,  0,  0, 1);
+
+    R_target2cam.push_back((cv::Mat_<double>(3, 3) <<
+            0.04964505493834381, 0.5136826827431226, 0.8565427426404346,
+            -0.3923117691818854, 0.7987004864191318, -0.4562554205214679,
+            -0.9184916136152514, -0.3133809733274676, 0.2411752915926112));
+    t_target2cam.push_back((cv::Mat_<double>(3, 1) <<
+            -1.588728904724121,
+            0.07843752950429916,
+            -1.002813339233398));
+
+    R_gripper2base.push_back((cv::Mat_<double>(3, 3) <<
+            -0.4143743581399177, -0.6105088815982459, -0.6749613298595637,
+            -0.1598851232573451, -0.6812625208693498, 0.71436554019614,
+            -0.895952364066927, 0.4039310376145889, 0.1846864320259794));
+    t_gripper2base.push_back((cv::Mat_<double>(3, 1) <<
+            -1.249274406461827,
+            -1.916570771580279,
+            2.005069553422765));
+
+    R_target2cam.push_back((cv::Mat_<double>(3, 3) <<
+            -0.3048000068139332, 0.6971848192711539, 0.6488684640388026,
+            -0.9377589344241749, -0.3387497187353627, -0.07652979135179161,
+            0.1664486009369332, -0.6318084803439735, 0.7570422097951847));
+    t_target2cam.push_back((cv::Mat_<double>(3, 1) <<
+            -1.906493663787842,
+            -0.07281044125556946,
+            0.6088893413543701));
+
+    R_gripper2base.push_back((cv::Mat_<double>(3, 3) <<
+            0.7262439860936567, -0.201662933718935, -0.6571923111439066,
+            -0.4640017362244384, -0.8491808316335328, -0.2521791108852766,
+            -0.5072199339965884, 0.4880819361030014, -0.7102844234575628));
+    t_gripper2base.push_back((cv::Mat_<double>(3, 1) <<
+            -0.7375172846804027,
+            -2.579760910816792,
+            1.336561572270101));
+
+    R_target2cam.push_back((cv::Mat_<double>(3, 3) <<
+            -0.590234879685801, -0.7051138289845309, -0.3929850823848928,
+            0.6017371069678565, -0.7088332765096816, 0.3680595606834615,
+            -0.5380847896941907, -0.01923211603859842, 0.8426712792141644));
+    t_target2cam.push_back((cv::Mat_<double>(3, 1) <<
+            -0.9809040427207947,
+            -0.2707894444465637,
+            -0.2577074766159058));
+
+    R_gripper2base.push_back((cv::Mat_<double>(3, 3) <<
+            0.2541996332132083, 0.6186461729765909, 0.7434106934499181,
+            0.2194912986375709, 0.711701808961156, -0.6673111005698995,
+            -0.9419161938817396, 0.3328024155303503, 0.04512688689130734));
+    t_gripper2base.push_back((cv::Mat_<double>(3, 1) <<
+            -1.040123533893404,
+            -0.1303773962721222,
+            1.068029475621886));
+
+    R_target2cam.push_back((cv::Mat_<double>(3, 3) <<
+            0.7643667483125168, -0.08523002870239212, 0.63912386614923,
+            -0.2583463792779588, 0.8676987164647345, 0.424683512464778,
+            -0.5907627462764713, -0.489729292214425, 0.6412211770980741));
+    t_target2cam.push_back((cv::Mat_<double>(3, 1) <<
+            -1.58987033367157,
+            -1.924914002418518,
+            -0.3109001517295837));
+
+    R_gripper2base.push_back((cv::Mat_<double>(3, 3) <<
+            0.116348305340805, -0.9917998080681939, 0.0528792261688552,
+            -0.2760629007224059, 0.01884966191381591, 0.9609547154213178,
+            -0.9540714578526358, -0.1264034452126562, -0.2716060057313114));
+    t_gripper2base.push_back((cv::Mat_<double>(3, 1) <<
+            -2.551899142554571,
+            -2.986937398237611,
+            1.317613923218308));
+
+    Mat R_true_cam2gripper;
+    Mat t_true_cam2gripper;
+    R_true_cam2gripper = T_true_cam2gripper(Rect(0, 0, 3, 3));
+    t_true_cam2gripper = T_true_cam2gripper(Rect(3, 0, 1, 3));
+
+    std::vector<HandEyeCalibrationMethod> methods = {CALIB_HAND_EYE_TSAI,
+                                                     CALIB_HAND_EYE_PARK,
+                                                     CALIB_HAND_EYE_HORAUD,
+                                                     CALIB_HAND_EYE_ANDREFF,
+                                                     CALIB_HAND_EYE_DANIILIDIS};
+
+    for (auto method : methods) {
+        SCOPED_TRACE(cv::format("method=%s", getMethodName(method).c_str()));
+
+        Matx33d R_cam2gripper_est;
+        Matx31d t_cam2gripper_est;
+        calibrateHandEye(R_gripper2base, t_gripper2base, R_target2cam, t_target2cam, R_cam2gripper_est, t_cam2gripper_est, method);
+
+        EXPECT_TRUE(cv::norm(R_cam2gripper_est - R_true_cam2gripper) < 1e-9);
+        EXPECT_TRUE(cv::norm(t_cam2gripper_est - t_true_cam2gripper) < 1e-9);
+    }
+}
+
 }} // namespace
diff --git a/modules/calib3d/test/test_cameracalibration.cpp b/modules/calib3d/test/test_cameracalibration.cpp
index d2ab906fee97..7668d74c48d2 100644
--- a/modules/calib3d/test/test_cameracalibration.cpp
+++ b/modules/calib3d/test/test_cameracalibration.cpp
@@ -1388,8 +1388,8 @@ void CV_StereoCalibrationTest::run( int )
 
         for( int i = 0; i < nframes; i++ )
         {
-            Mat left = imread(imglist[i*2]);
-            Mat right = imread(imglist[i*2+1]);
+            Mat left = imread(imglist[i*2], IMREAD_GRAYSCALE);
+            Mat right = imread(imglist[i*2+1], IMREAD_GRAYSCALE);
             if(left.empty() || right.empty())
             {
                 ts->printf( cvtest::TS::LOG, "Can not load images %s and %s, testcase %d\n",
@@ -1400,6 +1400,8 @@ void CV_StereoCalibrationTest::run( int )
             imgsize = left.size();
             bool found1 = findChessboardCorners(left, patternSize, imgpt1[i]);
             bool found2 = findChessboardCorners(right, patternSize, imgpt2[i]);
+            cornerSubPix(left, imgpt1[i], Size(5, 5), Size(-1, -1), TermCriteria(TermCriteria::EPS | TermCriteria::MAX_ITER, 30, 0.1));
+            cornerSubPix(right, imgpt2[i], Size(5, 5), Size(-1, -1), TermCriteria(TermCriteria::EPS | TermCriteria::MAX_ITER, 30, 0.1));
             if(!found1 || !found2)
             {
                 ts->printf( cvtest::TS::LOG, "The function could not detect boards (%d x %d) on the images %s and %s, testcase %d\n",
@@ -2150,6 +2152,54 @@ TEST(Calib3d_StereoCalibrate, regression_11131)
     EXPECT_GE(roi2.area(), 400*300) << roi2;
 }
 
+TEST(Calib3d_StereoCalibrate, regression_23305)
+{
+    const Matx33d M1(
+        850, 0, 640,
+        0, 850, 640,
+        0, 0, 1
+    );
+
+    const Matx34d P1_gold(
+        850, 0, 640, 0,
+        0, 850, 640, 0,
+        0, 0, 1, 0
+    );
+
+    const Matx33d M2(
+        850, 0, 640,
+        0, 850, 640,
+        0, 0, 1
+    );
+
+    const Matx34d P2_gold(
+        850, 0, 640, -2*850, // correcponds to T(-2., 0., 0.)
+        0, 850, 640, 0,
+        0, 0, 1, 0
+    );
+
+    const Matx<double, 5, 1> D1(0, 0, 0, 0, 0);
+    const Matx<double, 5, 1> D2(0, 0, 0, 0, 0);
+
+    const Matx33d R(
+        1., 0., 0.,
+        0., 1., 0.,
+        0., 0., 1.
+    );
+    const Matx31d T(-2., 0., 0.);
+
+    const Size imageSize(1280, 1280);
+
+    Mat R1, R2, P1, P2, Q;
+    Rect roi1, roi2;
+    stereoRectify(M1, D1, M2, D2, imageSize, R, T,
+                  R1, R2, P1, P2, Q,
+                  CALIB_ZERO_DISPARITY, 0, imageSize, &roi1, &roi2);
+
+    EXPECT_EQ(cv::norm(P1, P1_gold), 0.);
+    EXPECT_EQ(cv::norm(P2, P2_gold), 0.);
+}
+
 TEST(Calib3d_Triangulate, accuracy)
 {
     // the testcase from http://code.opencv.org/issues/4334
diff --git a/modules/calib3d/test/test_cameracalibration_badarg.cpp b/modules/calib3d/test/test_cameracalibration_badarg.cpp
index 240bdbb1b375..046fc1d530a0 100644
--- a/modules/calib3d/test/test_cameracalibration_badarg.cpp
+++ b/modules/calib3d/test/test_cameracalibration_badarg.cpp
@@ -149,49 +149,49 @@ void CV_CameraCalibrationBadArgTest::run( int /* start_from */ )
 
     caller.initArgs();
     caller.objPts_arg = noArray();
-    errors += run_test_case( CV_StsBadArg, "None passed in objPts", caller);
+    errors += run_test_case( cv::Error::StsBadArg, "None passed in objPts", caller);
 
     caller.initArgs();
     caller.imgPts_arg = noArray();
-    errors += run_test_case( CV_StsBadArg, "None passed in imgPts", caller );
+    errors += run_test_case( cv::Error::StsBadArg, "None passed in imgPts", caller );
 
     caller.initArgs();
     caller.cameraMatrix_arg = noArray();
-    errors += run_test_case( CV_StsBadArg, "Zero passed in cameraMatrix", caller );
+    errors += run_test_case( cv::Error::StsBadArg, "Zero passed in cameraMatrix", caller );
 
     caller.initArgs();
     caller.distCoeffs_arg = noArray();
-    errors += run_test_case( CV_StsBadArg, "Zero passed in distCoeffs", caller );
+    errors += run_test_case( cv::Error::StsBadArg, "Zero passed in distCoeffs", caller );
 
     caller.initArgs();
     caller.imageSize.width = -1;
-    errors += run_test_case( CV_StsOutOfRange, "Bad image width", caller );
+    errors += run_test_case( cv::Error::StsOutOfRange, "Bad image width", caller );
 
     caller.initArgs();
     caller.imageSize.height = -1;
-    errors += run_test_case( CV_StsOutOfRange, "Bad image height", caller );
+    errors += run_test_case( cv::Error::StsOutOfRange, "Bad image height", caller );
 
     caller.initArgs();
     caller.imgPts[0].clear();
-    errors += run_test_case( CV_StsBadSize, "Bad imgpts[0]", caller );
+    errors += run_test_case( cv::Error::StsBadSize, "Bad imgpts[0]", caller );
     caller.imgPts[0] = caller.imgPts[1];
 
     caller.initArgs();
     caller.objPts[1].clear();
-    errors += run_test_case( CV_StsBadSize, "Bad objpts[1]", caller );
+    errors += run_test_case( cv::Error::StsBadSize, "Bad objpts[1]", caller );
     caller.objPts[1] = caller.objPts[0];
 
     caller.initArgs();
     Mat badCM = Mat::zeros(4, 4, CV_64F);
     caller.cameraMatrix_arg = badCM;
     caller.flags = CALIB_USE_INTRINSIC_GUESS;
-    errors += run_test_case( CV_StsBadArg, "Bad camearaMatrix header", caller );
+    errors += run_test_case( cv::Error::StsBadArg, "Bad camearaMatrix header", caller );
 
     caller.initArgs();
     Mat badDC = Mat::zeros(10, 10, CV_64F);
     caller.distCoeffs_arg = badDC;
     caller.flags = CALIB_USE_INTRINSIC_GUESS;
-    errors += run_test_case( CV_StsBadArg, "Bad camearaMatrix header", caller );
+    errors += run_test_case( cv::Error::StsBadArg, "Bad camearaMatrix header", caller );
 
     if (errors)
         ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
@@ -244,15 +244,15 @@ class CV_Rodrigues2BadArgTest : public cvtest::BadArgTest
 
         caller.initArgs();
         caller.src_arg = noArray();
-        errors += run_test_case( CV_StsBadArg, "Src is empty matrix", caller );
+        errors += run_test_case( cv::Error::StsBadArg, "Src is empty matrix", caller );
 
         caller.initArgs();
         caller.src = Mat::zeros(3, 1, CV_8U);
-        errors += run_test_case( CV_StsUnsupportedFormat, "Bad src formart", caller );
+        errors += run_test_case( cv::Error::StsUnsupportedFormat, "Bad src formart", caller );
 
         caller.initArgs();
         caller.src = Mat::zeros(1, 1, CV_32F);
-        errors += run_test_case( CV_StsBadSize, "Bad src size", caller );
+        errors += run_test_case( cv::Error::StsBadSize, "Bad src size", caller );
 
         if (errors)
             ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
@@ -331,57 +331,57 @@ class CV_ProjectPoints2BadArgTest : public cvtest::BadArgTest
 
         caller.initArgs();
         caller.objectPoints_arg = noArray();
-        errors += run_test_case( CV_StsBadArg, "Zero objectPoints", caller );
+        errors += run_test_case( cv::Error::StsBadArg, "Zero objectPoints", caller );
 
         caller.initArgs();
         caller.rvec_arg = noArray();
-        errors += run_test_case( CV_StsBadArg, "Zero r_vec", caller );
+        errors += run_test_case( cv::Error::StsBadArg, "Zero r_vec", caller );
 
         caller.initArgs();
         caller.tvec_arg = noArray();
-        errors += run_test_case( CV_StsBadArg, "Zero t_vec", caller );
+        errors += run_test_case( cv::Error::StsBadArg, "Zero t_vec", caller );
 
         caller.initArgs();
         caller.A_arg = noArray();
-        errors += run_test_case( CV_StsBadArg, "Zero camMat", caller );
+        errors += run_test_case( cv::Error::StsBadArg, "Zero camMat", caller );
 
         caller.initArgs();
         caller.imagePoints_arg = noArray();
-        errors += run_test_case( CV_StsBadArg, "Zero imagePoints", caller );
+        errors += run_test_case( cv::Error::StsBadArg, "Zero imagePoints", caller );
 
         Mat save_rvec = caller.r_vec;
         caller.initArgs();
         caller.r_vec.create(2, 2, CV_32F);
-        errors += run_test_case( CV_StsBadArg, "Bad rvec format", caller );
+        errors += run_test_case( cv::Error::StsBadArg, "Bad rvec format", caller );
 
         caller.initArgs();
         caller.r_vec.create(1, 3, CV_8U);
-        errors += run_test_case( CV_StsBadArg, "Bad rvec format", caller );
+        errors += run_test_case( cv::Error::StsBadArg, "Bad rvec format", caller );
         caller.r_vec = save_rvec;
 
         /****************************/
         Mat save_tvec = caller.t_vec;
         caller.initArgs();
         caller.t_vec.create(3, 3, CV_32F);
-        errors += run_test_case( CV_StsBadArg, "Bad tvec format", caller );
+        errors += run_test_case( cv::Error::StsBadArg, "Bad tvec format", caller );
 
         caller.initArgs();
         caller.t_vec.create(1, 3, CV_8U);
-        errors += run_test_case( CV_StsBadArg, "Bad tvec format", caller );
+        errors += run_test_case( cv::Error::StsBadArg, "Bad tvec format", caller );
         caller.t_vec = save_tvec;
 
         /****************************/
         Mat save_A = caller.A;
         caller.initArgs();
         caller.A.create(2, 2, CV_32F);
-        errors += run_test_case( CV_StsBadArg, "Bad A format", caller );
+        errors += run_test_case( cv::Error::StsBadArg, "Bad A format", caller );
         caller.A = save_A;
 
         /****************************/
         Mat save_DC = caller.distCoeffs;
         caller.initArgs();
         caller.distCoeffs.create(3, 3, CV_32F);
-        errors += run_test_case( CV_StsBadArg, "Bad distCoeffs format", caller );
+        errors += run_test_case( cv::Error::StsBadArg, "Bad distCoeffs format", caller );
         caller.distCoeffs = save_DC;
 
         if (errors)
diff --git a/modules/calib3d/test/test_chesscorners.cpp b/modules/calib3d/test/test_chesscorners.cpp
index 7226da999abf..a63d5b3e83db 100644
--- a/modules/calib3d/test/test_chesscorners.cpp
+++ b/modules/calib3d/test/test_chesscorners.cpp
@@ -73,7 +73,7 @@ void show_points( const Mat& gray, const Mat& expected, const vector<Point2f>& a
 #define show_points(...)
 #endif
 
-enum Pattern { CHESSBOARD,CHESSBOARD_SB,CIRCLES_GRID, ASYMMETRIC_CIRCLES_GRID};
+enum Pattern { CHESSBOARD, CHESSBOARD_SB, CHESSBOARD_PLAIN, CIRCLES_GRID, ASYMMETRIC_CIRCLES_GRID};
 
 class CV_ChessboardDetectorTest : public cvtest::BaseTest
 {
@@ -149,6 +149,25 @@ void CV_ChessboardDetectorTest::run( int /*start_from */)
         case CHESSBOARD_SB:
             checkByGeneratorHighAccuracy();      // not supported by CHESSBOARD
             /* fallthrough */
+        case CHESSBOARD_PLAIN:
+            checkByGenerator();
+            if (ts->get_err_code() != cvtest::TS::OK)
+            {
+                break;
+            }
+
+            run_batch("negative_list.dat");
+            if (ts->get_err_code() != cvtest::TS::OK)
+            {
+                break;
+            }
+
+            run_batch("chessboard_list.dat");
+            if (ts->get_err_code() != cvtest::TS::OK)
+            {
+                break;
+            }
+            break;
         case CHESSBOARD:
             checkByGenerator();
             if (ts->get_err_code() != cvtest::TS::OK)
@@ -191,6 +210,7 @@ void CV_ChessboardDetectorTest::run_batch( const string& filename )
     {
         case CHESSBOARD:
         case CHESSBOARD_SB:
+        case CHESSBOARD_PLAIN:
             folder = string(ts->get_data_path()) + "cv/cameracalibration/";
             break;
         case CIRCLES_GRID:
@@ -215,6 +235,9 @@ void CV_ChessboardDetectorTest::run_batch( const string& filename )
 
     int progress = 0;
     int max_idx = (int)board_list.size()/2;
+    if(filename.compare("chessboard_list.dat") == 0 && pattern == CHESSBOARD_PLAIN)
+         max_idx = 7;
+
     double sum_error = 0.0;
     int count = 0;
 
@@ -247,6 +270,7 @@ void CV_ChessboardDetectorTest::run_batch( const string& filename )
         size_t count_exp = static_cast<size_t>(expected.cols * expected.rows);
         Size pattern_size = expected.size();
 
+        Mat ori;
         vector<Point2f> v;
         int flags = 0;
         switch( pattern )
@@ -254,14 +278,30 @@ void CV_ChessboardDetectorTest::run_batch( const string& filename )
             case CHESSBOARD:
                 flags = CALIB_CB_ADAPTIVE_THRESH | CALIB_CB_NORMALIZE_IMAGE;
                 break;
+            case CHESSBOARD_PLAIN: {
+                flags = CALIB_CB_PLAIN;
+                ori = gray.clone();
+                int min_size = cvRound((gray.cols * gray.rows * 0.05) / ((pattern_size.width+1) * (pattern_size.height+1)));
+                if(min_size%2==0) min_size += 1;
+                adaptiveThreshold(gray, gray, 255, ADAPTIVE_THRESH_MEAN_C, THRESH_BINARY, min_size, 0);
+                dilate(gray, gray, Mat(), Point(-1, -1), 1);
+                break;
+            }
             case CIRCLES_GRID:
             case CHESSBOARD_SB:
             case ASYMMETRIC_CIRCLES_GRID:
             default:
                 flags = 0;
         }
+
         bool result = findChessboardCornersWrapper(gray, pattern_size,v,flags);
-        if(result && sharpness && (pattern == CHESSBOARD_SB || pattern == CHESSBOARD))
+
+        if(result && pattern == CHESSBOARD_PLAIN) {
+            gray = ori;
+            cornerSubPix(gray, v, Size(6,6), Size(-1,-1), TermCriteria(TermCriteria::EPS + TermCriteria::COUNT, 30, 0.1));
+        }
+
+        if(result && sharpness && (pattern == CHESSBOARD_SB || pattern == CHESSBOARD || pattern == CHESSBOARD_PLAIN))
         {
             Scalar s= estimateChessboardSharpness(gray,pattern_size,v);
             if(fabs(s[0] - sharpness) > 0.1)
@@ -287,7 +327,7 @@ void CV_ChessboardDetectorTest::run_batch( const string& filename )
             double err = calcError(v, expected);
             max_rough_error = MAX( max_rough_error, err );
 #endif
-            if( pattern == CHESSBOARD )
+            if( pattern == CHESSBOARD || pattern == CHESSBOARD_PLAIN )
                 cornerSubPix( gray, v, Size(5, 5), Size(-1,-1), TermCriteria(TermCriteria::EPS|TermCriteria::MAX_ITER, 30, 0.1));
             //find4QuadCornerSubpix(gray, v, Size(5, 5));
             show_points( gray, expected, v, result  );
@@ -381,6 +421,7 @@ bool CV_ChessboardDetectorTest::findChessboardCornersWrapper(InputArray image, S
     switch(pattern)
     {
     case CHESSBOARD:
+    case CHESSBOARD_PLAIN:
         return findChessboardCorners(image,patternSize,corners,flags);
     case CHESSBOARD_SB:
         // check default settings until flags have been specified
@@ -631,6 +672,7 @@ bool CV_ChessboardDetectorTest::checkByGeneratorHighAccuracy()
 
 TEST(Calib3d_ChessboardDetector, accuracy) {  CV_ChessboardDetectorTest test( CHESSBOARD ); test.safe_run(); }
 TEST(Calib3d_ChessboardDetector2, accuracy) {  CV_ChessboardDetectorTest test( CHESSBOARD_SB ); test.safe_run(); }
+TEST(Calib3d_ChessboardDetector3, accuracy) {  CV_ChessboardDetectorTest test( CHESSBOARD_PLAIN ); test.safe_run(); }
 TEST(Calib3d_CirclesPatternDetector, accuracy) { CV_ChessboardDetectorTest test( CIRCLES_GRID ); test.safe_run(); }
 TEST(Calib3d_AsymmetricCirclesPatternDetector, accuracy) { CV_ChessboardDetectorTest test( ASYMMETRIC_CIRCLES_GRID ); test.safe_run(); }
 #ifdef HAVE_OPENCV_FLANN
@@ -750,5 +792,42 @@ TEST(Calib3d_AsymmetricCirclesPatternDetector, regression_19498)
     EXPECT_FALSE(res);
 }
 
+TEST(Calib3d_RotatedCirclesPatternDetector, issue_24964)
+{
+    string path = cvtest::findDataFile("cv/cameracalibration/circles/circles_24964.png");
+    Mat image = cv::imread(path);
+    ASSERT_FALSE(image.empty()) << "Can't read image: " << path;
+
+    vector<Point2f> centers;
+    Size parrernSize(7, 6);
+    Mat goldCenters(parrernSize.height, parrernSize.width, CV_32FC2);
+    Point2f firstGoldCenter(380.f, 430.f);
+    for (int i = 0; i < parrernSize.height; i++)
+    {
+        for (int j = 0; j < parrernSize.width; j++)
+        {
+            goldCenters.at<Point2f>(i, j) = Point2f(firstGoldCenter.x + j * 100.f, firstGoldCenter.y + i * 100.f);
+        }
+    }
+
+    bool found = false;
+    found = findCirclesGrid(image, parrernSize, centers, CALIB_CB_SYMMETRIC_GRID);
+
+    EXPECT_TRUE(found);
+    ASSERT_EQ(centers.size(), (size_t)parrernSize.area());
+    double error = calcError(centers, goldCenters);
+    EXPECT_LE(error, precise_success_error_level);
+
+    // "rotate" the circle grid by 90 degrees
+    swap(parrernSize.height, parrernSize.width);
+
+    found = findCirclesGrid(image, parrernSize, centers, CALIB_CB_SYMMETRIC_GRID);
+    error = calcError(centers, goldCenters.t());
+
+    EXPECT_TRUE(found);
+    ASSERT_EQ(centers.size(), (size_t)parrernSize.area());
+    EXPECT_LE(error, precise_success_error_level);
+}
+
 }} // namespace
 /* End of file. */
diff --git a/modules/calib3d/test/test_chesscorners_timing.cpp b/modules/calib3d/test/test_chesscorners_timing.cpp
index 9bd0f87078d2..e24172362fdf 100644
--- a/modules/calib3d/test/test_chesscorners_timing.cpp
+++ b/modules/calib3d/test/test_chesscorners_timing.cpp
@@ -139,13 +139,13 @@ void CV_ChessboardDetectorTimingTest::run( int start_from )
         }
 
         int num_pixels = gray.cols*gray.rows;
-        float check_chessboard_time = float(_time01 - _time0)/(float)cv::getTickFrequency(); // in us
+        float check_chessboard_time = float(_time01 - _time0)/(float)cv::getTickFrequency(); // in s
         ts->printf(cvtest::TS::LOG, "    cvCheckChessboard time s: %f, us per pixel: %f\n",
-                   check_chessboard_time*1e-6, check_chessboard_time/num_pixels);
+                   check_chessboard_time, check_chessboard_time*1e6/num_pixels);
 
         float find_chessboard_time = float(_time1 - _time01)/(float)cv::getTickFrequency();
         ts->printf(cvtest::TS::LOG, "    cvFindChessboard time s: %f, us per pixel: %f\n",
-                   find_chessboard_time*1e-6, find_chessboard_time/num_pixels);
+                   find_chessboard_time, find_chessboard_time*1e6/num_pixels);
         progress = update_progress( progress, idx-1, max_idx, 0 );
     }
 
diff --git a/modules/calib3d/test/test_decompose_projection.cpp b/modules/calib3d/test/test_decompose_projection.cpp
index e43aa65b69ba..a25081b45240 100644
--- a/modules/calib3d/test/test_decompose_projection.cpp
+++ b/modules/calib3d/test/test_decompose_projection.cpp
@@ -141,4 +141,43 @@ TEST(Calib3d_DecomposeProjectionMatrix, accuracy)
     test.safe_run();
 }
 
+TEST(Calib3d_DecomposeProjectionMatrix, degenerate_cases)
+{
+    for (int i = 0; i < 3; i++)
+    {
+        for (int j = 0; j < 2; j++)
+        {
+            cv::Matx34d P;
+            P(0, i) = 1;
+            P(1, (i + j + 1) % 3) = 1;
+            P(2, (i + 2 * j + 2) % 3) = 1;
+
+            cv::Matx33d K, R;
+            cv::Vec4d t;
+            decomposeProjectionMatrix(P, K, R, t);
+            EXPECT_LT(cv::norm(K * R, P.get_minor<3, 3>(0, 0), cv::NORM_INF), 1e-6);
+        }
+    }
+}
+
+TEST(Calib3d_DecomposeProjectionMatrix, bug_23733)
+{
+    cv::Matx34d P(52, -7, 4, 12,
+                  -6, 49, 12, 8,
+                  4, 17, 1, 0);
+    P *= 1e-6;
+
+    cv::Matx33d K, R;
+    cv::Vec4d t;
+    decomposeProjectionMatrix(P, K, R, t);
+
+    EXPECT_LT(cv::norm(R.t() * R - cv::Matx33d::eye(), cv::NORM_INF), 1e-10);
+
+    cv::Matx34d M;
+    cv::hconcat(R, -R * cv::Vec3d(t[0] / t[3], t[1] / t[3], t[2] / t[3]), M);
+
+    cv::Matx34d P_recompose = K * M;
+    EXPECT_LT(cv::norm(P_recompose - P, cv::NORM_INF), 1e-16);
+}
+
 }} // namespace
diff --git a/modules/calib3d/test/test_fisheye.cpp b/modules/calib3d/test/test_fisheye.cpp
index b914b72957d6..36b7d0d653dc 100644
--- a/modules/calib3d/test/test_fisheye.cpp
+++ b/modules/calib3d/test/test_fisheye.cpp
@@ -150,6 +150,28 @@ TEST_F(fisheyeTest, distortUndistortPoints)
     }
 }
 
+TEST_F(fisheyeTest, solvePnP)
+{
+    const int n = 16;
+
+    cv::Mat obj_points(1, n, CV_64FC3);
+    theRNG().fill(obj_points, cv::RNG::NORMAL, 2, 1);
+    obj_points = cv::abs(obj_points) * 10;
+
+    cv::Mat rvec;
+    cv::Rodrigues(this->R, rvec);
+    cv::Mat img_points;
+    cv::fisheye::projectPoints(obj_points, img_points, rvec, this->T, this->K, this->D);
+
+    cv::Mat rvec_pred;
+    cv::Mat tvec_pred;
+    bool converged = cv::fisheye::solvePnP(obj_points, img_points, this->K, this->D, rvec_pred, tvec_pred);
+    EXPECT_MAT_NEAR(rvec, rvec_pred, 1e-6);
+    EXPECT_MAT_NEAR(this->T, tvec_pred, 1e-6);
+
+    ASSERT_TRUE(converged);
+}
+
 TEST_F(fisheyeTest, undistortImage)
 {
     // we use it to reduce patch size for images in testdata
@@ -163,7 +185,7 @@ TEST_F(fisheyeTest, undistortImage)
 
     cv::Matx33d theK = this->K;
     cv::Mat theD = cv::Mat(this->D);
-    std::string file = combine(datasets_repository_path, "/calib-3_stereo_from_JY/left/stereo_pair_014.jpg");
+    std::string file = combine(datasets_repository_path, "stereo_pair_014.png");
     cv::Matx33d newK = theK;
     cv::Mat distorted = cv::imread(file), undistorted;
     {
@@ -410,6 +432,11 @@ TEST_F(fisheyeTest, Calibration)
 {
     const int n_images = 34;
 
+    const cv::Matx33d goldK(558.4780870585967, 0, 620.4585053962692,
+                            0, 560.5067667343917, 381.9394122875291,
+                            0, 0, 1);
+    const cv::Vec4d goldD(-0.00146136, -0.00329847, 0.00605742, -0.00374201);
+
     std::vector<std::vector<cv::Point2d> > imagePoints(n_images);
     std::vector<std::vector<cv::Point3d> > objectPoints(n_images);
 
@@ -437,8 +464,8 @@ TEST_F(fisheyeTest, Calibration)
     cv::fisheye::calibrate(objectPoints, imagePoints, imageSize, theK, theD,
                            cv::noArray(), cv::noArray(), flag, cv::TermCriteria(3, 20, 1e-6));
 
-    EXPECT_MAT_NEAR(theK, this->K, 1e-10);
-    EXPECT_MAT_NEAR(theD, this->D, 1e-10);
+    EXPECT_MAT_NEAR(theK, goldK, 1e-8);
+    EXPECT_MAT_NEAR(theD, goldD, 1e-8);
 }
 
 TEST_F(fisheyeTest, CalibrationWithFixedFocalLength)
@@ -597,10 +624,10 @@ TEST_F(fisheyeTest, EstimateUncertainties)
     cv::internal::EstimateUncertainties(objectPoints, imagePoints, param,  rvec, tvec,
                                         errors, err_std, thresh_cond, check_cond, rms);
 
-    EXPECT_MAT_NEAR(errors.f, cv::Vec2d(1.34250246865020720, 1.36037536429654530), 1e-10);
-    EXPECT_MAT_NEAR(errors.c, cv::Vec2d(0.92070526160049848, 0.84383585812851514), 1e-10);
-    EXPECT_MAT_NEAR(errors.k, cv::Vec4d(0.0053379581373996041, 0.017389792901700545, 0.022036256089491224, 0.0094714594258908952), 1e-10);
-    EXPECT_MAT_NEAR(err_std, cv::Vec2d(0.187475975266883, 0.185678953263995), 1e-10);
+    EXPECT_MAT_NEAR(errors.f, cv::Vec2d(1.34250246865020720, 1.36037536429654530), 1e-6);
+    EXPECT_MAT_NEAR(errors.c, cv::Vec2d(0.92070526160049848, 0.84383585812851514), 1e-6);
+    EXPECT_MAT_NEAR(errors.k, cv::Vec4d(0.0053379581373996041, 0.017389792901700545, 0.022036256089491224, 0.0094714594258908952), 1e-7);
+    EXPECT_MAT_NEAR(err_std, cv::Vec2d(0.187475975266883, 0.185678953263995), 1e-7);
     CV_Assert(fabs(rms - 0.263782587133546) < 1e-10);
     CV_Assert(errors.alpha == 0);
 }
diff --git a/modules/calib3d/test/test_homography.cpp b/modules/calib3d/test/test_homography.cpp
index 41188a066db6..d8ab243a840c 100644
--- a/modules/calib3d/test/test_homography.cpp
+++ b/modules/calib3d/test/test_homography.cpp
@@ -704,4 +704,63 @@ TEST(Calib3d_Homography, minPoints)
     EXPECT_THROW(findHomography(p1, p2, RANSAC, 0.01, mask), cv::Exception);
 }
 
+TEST(Calib3d_Homography, not_normalized)
+{
+    Mat_<double> p1({5, 2}, {-1, -1, -2, -2, -1, 1, -2, 2, -1, 0});
+    Mat_<double> p2({5, 2}, {0, -1, -1, -1, 0, 0, -1, 0, 0, -0.5});
+    Mat_<double> ref({3, 3}, {
+        0.74276086, 0., 0.74276086,
+        0.18569022, 0.18569022, 0.,
+        -0.37138043, 0., 0.
+    });
+
+    for (int method : std::vector<int>({0, RANSAC, LMEDS}))
+    {
+        Mat h = findHomography(p1, p2, method);
+        for (auto it = h.begin<double>(); it != h.end<double>(); ++it) {
+            ASSERT_FALSE(cvIsNaN(*it)) << cv::format("method %d\nResult:\n", method) << h;
+        }
+        if (h.at<double>(0, 0) * ref.at<double>(0, 0) < 0) {
+            h *= -1;
+        }
+        ASSERT_LE(cv::norm(h, ref, NORM_INF), 1e-8) << cv::format("method %d\nResult:\n", method) << h;
+    }
+}
+
+TEST(Calib3d_Homography, Refine)
+{
+    Mat_<double> p1({10, 2}, {41, -86, -87, 99, 66, -96, -86, -8, -67, 24,
+                              -87, -76, -19, 89, 37, -4, -86, -86, -66, -53});
+    Mat_<double> p2({10, 2}, {
+        0.007723226608700208, -1.177541410622515,
+        -0.1909072353027552, -0.4247610181930323,
+        -0.134992319993638, -0.6469949816560389,
+        -0.3570627451405215, 0.1811469436293486,
+        -0.3005671881038939, -0.02325733734262935,
+        -0.4404509481789249, 0.4851526464158342,
+        0.6343346428859541, -3.396187657072353,
+        -0.3539383967092603, 0.1469447227353143,
+        -0.4526924606856586, 0.5296757109061794,
+        -0.4309974583614644, 0.4522732662733471
+    });
+    hconcat(p1, Mat::ones(p1.rows, 1, CV_64F), p1);
+    hconcat(p2, Mat::ones(p2.rows, 1, CV_64F), p2);
+
+    for(int method : std::vector<int>({0, RANSAC, LMEDS}))
+    {
+        Mat h = findHomography(p1, p2, method);
+        EXPECT_NEAR(h.at<double>(2, 2), 1.0, 1e-7);
+
+        Mat proj = p1 * h.t();
+        proj.col(0) /= proj.col(2);
+        proj.col(1) /= proj.col(2);
+
+        Mat error;
+        cv::pow(p2.colRange(0, 2) - proj.colRange(0, 2), 2, error);
+        cv::reduce(error, error, 1, REDUCE_SUM);
+        cv::reduce(error, error, 0, REDUCE_AVG);
+        EXPECT_LE(sqrt(error.at<double>(0, 0)), method == LMEDS ? 7e-4 : 7e-5);
+    }
+}
+
 }} // namespace
diff --git a/modules/calib3d/test/test_solvepnp_ransac.cpp b/modules/calib3d/test/test_solvepnp_ransac.cpp
index 43b90dff9272..a16928c73885 100644
--- a/modules/calib3d/test/test_solvepnp_ransac.cpp
+++ b/modules/calib3d/test/test_solvepnp_ransac.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "test_precomp.hpp"
+#include "opencv2/core/utils/logger.hpp"
 
 namespace opencv_test { namespace {
 
@@ -1531,8 +1532,8 @@ TEST(Calib3d_SolvePnP, generic)
                 }
                 else
                 {
-                    p3f = p3f_;
-                    p2f = p2f_;
+                    p3f = vector<Point3f>(p3f_.begin(), p3f_.end());
+                    p2f = vector<Point2f>(p2f_.begin(), p2f_.end());
                 }
 
                 vector<double> reprojectionErrors;
@@ -2258,4 +2259,65 @@ TEST(Calib3d_SolvePnP, inputShape)
     }
 }
 
+bool hasNan(const cv::Mat& mat)
+{
+    bool has = false;
+    if (mat.type() == CV_32F)
+    {
+        for(int i = 0; i < static_cast<int>(mat.total()); i++)
+            has |= cvIsNaN(mat.at<float>(i)) != 0;
+    }
+    else if (mat.type() == CV_64F)
+    {
+        for(int i = 0; i < static_cast<int>(mat.total()); i++)
+            has |= cvIsNaN(mat.at<double>(i)) != 0;
+    }
+    else
+    {
+        has = true;
+        CV_LOG_ERROR(NULL, "check hasNan called with unsupported type!");
+    }
+
+    return has;
+}
+
+TEST(AP3P, ctheta1p_nan_23607)
+{
+    // the task is not well defined and may not converge (empty R, t) or should
+    // converge to some non-NaN solution
+    const std::array<cv::Point2d, 3> cameraPts = {
+        cv::Point2d{0.042784865945577621, 0.59844839572906494},
+        cv::Point2d{-0.028428621590137482, 0.60354739427566528},
+        cv::Point2d{0.0046037044376134872, 0.70674681663513184}
+    };
+    const std::array<cv::Point3d, 3> modelPts = {
+        cv::Point3d{-0.043258000165224075, 0.020459245890378952, -0.0069921980611979961},
+        cv::Point3d{-0.045648999512195587, 0.0029820732306689024, 0.0079000638797879219},
+        cv::Point3d{-0.043276999145746231, -0.013622495345771313, 0.0080113131552934647}
+    };
+
+    std::vector<Mat> R, t;
+    solveP3P(modelPts, cameraPts, Mat::eye(3, 3, CV_64F), Mat(), R, t, SOLVEPNP_AP3P);
+
+    EXPECT_EQ(R.size(), 2ul);
+    EXPECT_EQ(t.size(), 2ul);
+
+    // Try apply rvec and tvec to get model points from camera points.
+    Mat pts = Mat(modelPts).reshape(1, 3);
+    Mat expected = Mat(cameraPts).reshape(1, 3);
+    for (size_t i = 0; i < R.size(); ++i) {
+        EXPECT_TRUE(!hasNan(R[i]));
+        EXPECT_TRUE(!hasNan(t[i]));
+
+        Mat transform;
+        cv::Rodrigues(R[i], transform);
+        Mat res = pts * transform.t();
+        for (int j = 0; j < 3; ++j) {
+            res.row(j) += t[i].reshape(1, 1);
+            res.row(j) /= res.row(j).at<double>(2);
+        }
+        EXPECT_LE(cvtest::norm(res.colRange(0, 2), expected, NORM_INF), 3.34e-16);
+    }
+}
+
 }} // namespace
diff --git a/modules/calib3d/test/test_stereomatching.cpp b/modules/calib3d/test/test_stereomatching.cpp
index 02d1823d2d5c..c17d92292a41 100644
--- a/modules/calib3d/test/test_stereomatching.cpp
+++ b/modules/calib3d/test/test_stereomatching.cpp
@@ -740,8 +740,8 @@ class CV_StereoBMTest : public CV_StereoMatchingTest
     CV_StereoBMTest()
     {
         name = "stereobm";
-        fill(rmsEps.begin(), rmsEps.end(), 0.4f);
-        fill(fracEps.begin(), fracEps.end(), 0.022f);
+        std::fill(rmsEps.begin(), rmsEps.end(), 0.4f);
+        std::fill(fracEps.begin(), fracEps.end(), 0.022f);
     }
 
 protected:
@@ -866,8 +866,8 @@ class CV_StereoSGBMTest : public CV_StereoMatchingTest
     CV_StereoSGBMTest()
     {
         name = "stereosgbm";
-        fill(rmsEps.begin(), rmsEps.end(), 0.25f);
-        fill(fracEps.begin(), fracEps.end(), 0.01f);
+        std::fill(rmsEps.begin(), rmsEps.end(), 0.25f);
+        std::fill(fracEps.begin(), fracEps.end(), 0.01f);
     }
 
 protected:
diff --git a/modules/calib3d/test/test_translation3d_estimator.cpp b/modules/calib3d/test/test_translation3d_estimator.cpp
index 88ad40e0f8d5..97c20e503377 100644
--- a/modules/calib3d/test/test_translation3d_estimator.cpp
+++ b/modules/calib3d/test/test_translation3d_estimator.cpp
@@ -91,8 +91,8 @@ TEST(Calib3d_EstimateTranslation3D, testNPoints)
         << "aff est: " << trans_est << endl
         << "aff ref: " << trans;
 
-    bool outl_good = count(outl.begin(), outl.end(), 1) == m &&
-        m == accumulate(outl.begin(), outl.begin() + m, 0);
+    bool outl_good = std::count(outl.begin(), outl.end(), 1) == m &&
+        m == std::accumulate(outl.begin(), outl.begin() + m, 0);
 
     EXPECT_TRUE(outl_good);
 }
diff --git a/modules/calib3d/test/test_undistort_badarg.cpp b/modules/calib3d/test/test_undistort_badarg.cpp
index 0ba13f12125f..71bb21a0bd04 100644
--- a/modules/calib3d/test/test_undistort_badarg.cpp
+++ b/modules/calib3d/test/test_undistort_badarg.cpp
@@ -106,15 +106,15 @@ void CV_UndistortPointsBadArgTest::run(int)
     src_points = cv::cvarrToMat(&_src_points_orig);
 
     src_points.create(2, 2, CV_32FC2);
-    errcount += run_test_case( CV_StsAssert, "Invalid input data matrix size" );
+    errcount += run_test_case( cv::Error::StsAssert, "Invalid input data matrix size" );
     src_points = cv::cvarrToMat(&_src_points_orig);
 
     src_points.create(1, 4, CV_64FC2);
-    errcount += run_test_case( CV_StsAssert, "Invalid input data matrix type" );
+    errcount += run_test_case( cv::Error::StsAssert, "Invalid input data matrix type" );
     src_points = cv::cvarrToMat(&_src_points_orig);
 
     src_points = cv::Mat();
-    errcount += run_test_case( CV_StsBadArg, "Input data matrix is not continuous" );
+    errcount += run_test_case( cv::Error::StsBadArg, "Input data matrix is not continuous" );
     src_points = cv::cvarrToMat(&_src_points_orig);
 
 //------------
@@ -181,19 +181,19 @@ void CV_InitUndistortRectifyMapBadArgTest::run(int)
     mapy = cv::cvarrToMat(&_mapy_orig);
 
     mat_type = CV_64F;
-    errcount += run_test_case( CV_StsAssert, "Invalid map matrix type" );
+    errcount += run_test_case( cv::Error::StsAssert, "Invalid map matrix type" );
     mat_type = mat_type_orig;
 
     camera_mat.create(3, 2, CV_32F);
-    errcount += run_test_case( CV_StsAssert, "Invalid camera data matrix size" );
+    errcount += run_test_case( cv::Error::StsAssert, "Invalid camera data matrix size" );
     camera_mat = cv::cvarrToMat(&_camera_mat_orig);
 
     R.create(4, 3, CV_32F);
-    errcount += run_test_case( CV_StsAssert, "Invalid R data matrix size" );
+    errcount += run_test_case( cv::Error::StsAssert, "Invalid R data matrix size" );
     R = cv::cvarrToMat(&_R_orig);
 
     distortion_coeffs.create(6, 1, CV_32F);
-    errcount += run_test_case( CV_StsAssert, "Invalid distortion coefficients data matrix size" );
+    errcount += run_test_case( cv::Error::StsAssert, "Invalid distortion coefficients data matrix size" );
     distortion_coeffs = cv::cvarrToMat(&_distortion_coeffs_orig);
 
 //------------
@@ -256,7 +256,7 @@ void CV_UndistortBadArgTest::run(int)
     dst = cv::cvarrToMat(&_dst_orig);
 
     camera_mat.create(5, 5, CV_64F);
-    errcount += run_test_case( CV_StsAssert, "Invalid camera data matrix size" );
+    errcount += run_test_case( cv::Error::StsAssert, "Invalid camera data matrix size" );
 
 //------------
     ts->set_failed_test_info(errcount > 0 ? cvtest::TS::FAIL_BAD_ARG_CHECK : cvtest::TS::OK);
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 1b3f574275e8..16f32c994a65 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,26 +1,26 @@
 set(the_description "The Core Functionality")
 
-ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
-ocv_add_dispatched_file(stat SSE4_2 AVX2)
-ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
-ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
-ocv_add_dispatched_file(convert_scale SSE2 AVX2)
-ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
-ocv_add_dispatched_file(has_non_zero SSE2 AVX2)
-ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD)
-ocv_add_dispatched_file(mean SSE2 AVX2)
-ocv_add_dispatched_file(merge SSE2 AVX2)
-ocv_add_dispatched_file(split SSE2 AVX2)
-ocv_add_dispatched_file(sum SSE2 AVX2)
+ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2 LASX)
+ocv_add_dispatched_file(stat SSE4_2 AVX2 LASX)
+ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3 LASX)
+ocv_add_dispatched_file(convert SSE2 AVX2 VSX3 LASX)
+ocv_add_dispatched_file(convert_scale SSE2 AVX2 LASX)
+ocv_add_dispatched_file(count_non_zero SSE2 AVX2 LASX)
+ocv_add_dispatched_file(has_non_zero SSE2 AVX2 LASX )
+ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD LASX)
+ocv_add_dispatched_file(mean SSE2 AVX2 LASX)
+ocv_add_dispatched_file(merge SSE2 AVX2 LASX)
+ocv_add_dispatched_file(split SSE2 AVX2 LASX)
+ocv_add_dispatched_file(sum SSE2 AVX2 LASX)
 
 # dispatching for accuracy tests
 ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2 AVX512_SKX)
-ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX)
+ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2 AVX512_SKX LASX)
 ocv_add_dispatched_file_force_all(test_intrin512 TEST AVX512_SKX)
 
 
 set(PARALLEL_ENABLE_PLUGINS_DEFAULT ON)
-if(EMSCRIPTEN OR IOS OR WINRT)
+if(EMSCRIPTEN OR IOS OR XROS OR WINRT)
   set(PARALLEL_ENABLE_PLUGINS_DEFAULT OFF)
 endif()
 # parallel backends configuration
@@ -49,17 +49,30 @@ if(DEFINED WINRT AND NOT DEFINED ENABLE_WINRT_MODE_NATIVE)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW")
 endif()
 
-if(HAVE_CUDA)
-  if(NOT HAVE_opencv_cudev)
-    message(FATAL_ERROR "CUDA: OpenCV requires enabled 'cudev' module from 'opencv_contrib' repository: https://github.com/opencv/opencv_contrib")
-  endif()
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wenum-compare -Wunused-function -Wshadow)
-endif()
-
 if(CV_TRACE AND HAVE_ITT)
   add_definitions(-DOPENCV_WITH_ITT=1)
 endif()
 
+# https://github.com/opencv/opencv/issues/24145
+if(HAVE_IPP)
+  OCV_OPTION(OPENCV_IPP_ENABLE_ALL "Enable all OPENCV_IPP_ options at once" OFF)
+  OCV_OPTION(OPENCV_IPP_MEAN   "Enable IPP optimizations for mean (+200Kb in binary size)"                OPENCV_IPP_ENABLE_ALL)
+  OCV_OPTION(OPENCV_IPP_MINMAX "Enable IPP optimizations for minMaxLoc/minMaxIdx (+200Kb in binary size)" OPENCV_IPP_ENABLE_ALL)
+  OCV_OPTION(OPENCV_IPP_SUM    "Enable IPP optimizations for sum (+100Kb in binary size)"                 OPENCV_IPP_ENABLE_ALL)
+
+  if(OPENCV_IPP_MEAN)
+    ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/mean.dispatch.cpp "OPENCV_IPP_MEAN=1")
+  endif()
+
+  if(OPENCV_IPP_MINMAX)
+    ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/minmax.cpp "OPENCV_IPP_MINMAX=1")
+  endif()
+
+  if(OPENCV_IPP_SUM)
+    ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/sum.dispatch.cpp "OPENCV_IPP_SUM=1")
+  endif()
+endif()
+
 file(GLOB lib_cuda_hdrs
     "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/cuda/*.hpp"
     "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/cuda/*.h")
@@ -133,7 +146,6 @@ elseif(HAVE_CXX11 OR DEFINED OPENCV_ALLOCATOR_STATS_COUNTER_TYPE)
   endif()
 endif()
 
-
 if(PARALLEL_ENABLE_PLUGINS)
   ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/parallel/parallel.cpp "PARALLEL_ENABLE_PLUGINS=1")
   if(OPENCV_DEBUG_POSTFIX)
@@ -141,6 +153,15 @@ if(PARALLEL_ENABLE_PLUGINS)
   endif()
 endif()
 
+if(HAVE_CUDA)
+  if(NOT HAVE_opencv_cudev)
+    message(FATAL_ERROR "CUDA: OpenCV requires enabled 'cudev' module from 'opencv_contrib' repository: https://github.com/opencv/opencv_contrib")
+  endif()
+  if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
+   ocv_module_include_directories(${CUDAToolkit_INCLUDE_DIRS})
+  endif()
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wenum-compare -Wunused-function -Wshadow)
+endif()
 
 ocv_create_module(${extra_libs})
 
diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp
index d9a21701f2d4..b58a3a6ccbbe 100644
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -62,10 +62,6 @@
 @defgroup core Core functionality
 @{
     @defgroup core_basic Basic structures
-    @defgroup core_c C structures and operations
-    @{
-        @defgroup core_c_glue Connections with C++
-    @}
     @defgroup core_array Operations on arrays
     @defgroup core_async Asynchronous API
     @defgroup core_xml XML/YAML Persistence
@@ -252,7 +248,7 @@ CV_EXPORTS void swap( UMat& a, UMat& b );
 The function computes and returns the coordinate of a donor pixel corresponding to the specified
 extrapolated pixel when using the specified extrapolation border mode. For example, if you use
 cv::BORDER_WRAP mode in the horizontal direction, cv::BORDER_REFLECT_101 in the vertical direction and
-want to compute value of the "virtual" pixel Point(-5, 100) in a floating-point image img , it
+want to compute value of the "virtual" pixel Point(-5, 100) in a floating-point image img, it
 looks like:
 @code{.cpp}
     float val = img.at<float>(borderInterpolate(100, img.rows, cv::BORDER_REFLECT_101),
@@ -263,7 +259,7 @@ copyMakeBorder.
 @param p 0-based coordinate of the extrapolated pixel along one of the axes, likely \<0 or \>= len
 @param len Length of the array along the corresponding axis.
 @param borderType Border type, one of the #BorderTypes, except for #BORDER_TRANSPARENT and
-#BORDER_ISOLATED . When borderType==#BORDER_CONSTANT , the function always returns -1, regardless
+#BORDER_ISOLATED. When borderType==#BORDER_CONSTANT, the function always returns -1, regardless
 of p and len.
 
 @sa copyMakeBorder
@@ -349,6 +345,9 @@ be set to the default -1. In this case, the output array will have the same dept
 array, be it src1, src2 or both.
 @note Saturation is not applied when the output array has the depth CV_32S. You may even get
 result of an incorrect sign in the case of overflow.
+@note (Python) Be careful to difference behaviour between src1/src2 are single number and they are tuple/array.
+`add(src,X)` means `add(src,(X,X,X,X))`.
+`add(src,(X,))` means `add(src,(X,0,0,0))`.
 @param src1 first input array or a scalar.
 @param src2 second input array or a scalar.
 @param dst output array that has the same size and number of channels as the input array(s); the
@@ -390,6 +389,9 @@ in the first case, when src1.depth() == src2.depth(), dtype can be set to the de
 case the output array will have the same depth as the input array, be it src1, src2 or both.
 @note Saturation is not applied when the output array has the depth CV_32S. You may even get
 result of an incorrect sign in the case of overflow.
+@note (Python) Be careful to difference behaviour between src1/src2 are single number and they are tuple/array.
+`subtract(src,X)` means `subtract(src,(X,X,X,X))`.
+`subtract(src,(X,))` means `subtract(src,(X,0,0,0))`.
 @param src1 first input array or a scalar.
 @param src2 second input array or a scalar.
 @param dst output array of the same size and the same number of channels as the input array.
@@ -415,6 +417,9 @@ For a not-per-element matrix product, see gemm .
 @note Saturation is not applied when the output array has the depth
 CV_32S. You may even get result of an incorrect sign in the case of
 overflow.
+@note (Python) Be careful to difference behaviour between src1/src2 are single number and they are tuple/array.
+`multiply(src,X)` means `multiply(src,(X,X,X,X))`.
+`multiply(src,(X,))` means `multiply(src,(X,0,0,0))`.
 @param src1 first input array.
 @param src2 second input array of the same size and the same type as src1.
 @param dst output array of the same size and type as src1.
@@ -443,6 +448,9 @@ Expect correct IEEE-754 behaviour for floating-point data (with NaN, Inf result
 
 @note Saturation is not applied when the output array has the depth CV_32S. You may even get
 result of an incorrect sign in the case of overflow.
+@note (Python) Be careful to difference behaviour between src1/src2 are single number and they are tuple/array.
+`divide(src,X)` means `divide(src,(X,X,X,X))`.
+`divide(src,(X,))` means `divide(src,(X,0,0,0))`.
 @param src1 first input array.
 @param src2 second input array of the same size and type as src1.
 @param scale scalar factor.
@@ -544,6 +552,8 @@ The format of half precision floating point is defined in IEEE 754-2008.
 
 @param src input array.
 @param dst output array.
+
+@deprecated Use Mat::convertTo with CV_16F instead.
 */
 CV_EXPORTS_W void convertFp16(InputArray src, OutputArray dst);
 
@@ -575,8 +585,18 @@ CV_EXPORTS_AS(sumElems) Scalar sum(InputArray src);
 /** @brief Checks for the presence of at least one non-zero array element.
 
 The function returns whether there are non-zero elements in src
+
+The function do not work with multi-channel arrays. If you need to check non-zero array
+elements across all the channels, use Mat::reshape first to reinterpret the array as
+single-channel. Or you may extract the particular channel using either extractImageCOI, or
+mixChannels, or split.
+
+@note
+- If the location of non-zero array elements is important, @ref findNonZero is helpful.
+- If the count of non-zero array elements is important, @ref countNonZero is helpful.
 @param src single-channel array.
 @sa  mean, meanStdDev, norm, minMaxLoc, calcCovarMatrix
+@sa  findNonZero, countNonZero
 */
 CV_EXPORTS_W bool hasNonZero( InputArray src );
 
@@ -584,8 +604,18 @@ CV_EXPORTS_W bool hasNonZero( InputArray src );
 
 The function returns the number of non-zero elements in src :
 \f[\sum _{I: \; \texttt{src} (I) \ne0 } 1\f]
+
+The function do not work with multi-channel arrays. If you need to count non-zero array
+elements across all the channels, use Mat::reshape first to reinterpret the array as
+single-channel. Or you may extract the particular channel using either extractImageCOI, or
+mixChannels, or split.
+
+@note
+- If only whether there are non-zero elements is important, @ref hasNonZero is helpful.
+- If the location of non-zero array elements is important, @ref findNonZero is helpful.
 @param src single-channel array.
 @sa  mean, meanStdDev, norm, minMaxLoc, calcCovarMatrix
+@sa  findNonZero, hasNonZero
 */
 CV_EXPORTS_W int countNonZero( InputArray src );
 
@@ -612,8 +642,18 @@ or
     // access pixel coordinates
     Point pnt = locations[i];
 @endcode
+
+The function do not work with multi-channel arrays. If you need to find non-zero
+elements across all the channels, use Mat::reshape first to reinterpret the array as
+single-channel. Or you may extract the particular channel using either extractImageCOI, or
+mixChannels, or split.
+
+@note
+- If only count of non-zero array elements is important, @ref countNonZero is helpful.
+- If only whether there are non-zero elements is important, @ref hasNonZero is helpful.
 @param src single-channel array
 @param idx the output array, type of cv::Mat or std::vector<Point>, corresponding to non-zero indices in the input
+@sa  countNonZero, hasNonZero
 */
 CV_EXPORTS_W void findNonZero( InputArray src, OutputArray idx );
 
@@ -820,8 +860,8 @@ array region.
 
 The function do not work with multi-channel arrays. If you need to find minimum or maximum
 elements across all the channels, use Mat::reshape first to reinterpret the array as
-single-channel. Or you may extract the particular channel using either extractImageCOI , or
-mixChannels , or split .
+single-channel. Or you may extract the particular channel using either extractImageCOI, or
+mixChannels, or split.
 @param src input single-channel array.
 @param minVal pointer to the returned minimum value; NULL is used if not required.
 @param maxVal pointer to the returned maximum value; NULL is used if not required.
@@ -872,11 +912,8 @@ CV_EXPORTS_W void reduceArgMax(InputArray src, OutputArray dst, int axis, bool l
 
 The function cv::minMaxIdx finds the minimum and maximum element values and their positions. The
 extremums are searched across the whole array or, if mask is not an empty array, in the specified
-array region. The function does not work with multi-channel arrays. If you need to find minimum or
-maximum elements across all the channels, use Mat::reshape first to reinterpret the array as
-single-channel. Or you may extract the particular channel using either extractImageCOI , or
-mixChannels , or split . In case of a sparse matrix, the minimum is found among non-zero elements
-only.
+array region. In case of a sparse matrix, the minimum is found among non-zero elements
+only. Multi-channel input is supported without mask and extremums indexes (should be nullptr).
 @note When minIdx is not NULL, it must have at least 2 elements (as well as maxIdx), even if src is
 a single-row or single-column matrix. In OpenCV (following MATLAB) each array has at least 2
 dimensions, i.e. single-column matrix is Mx1 matrix (and therefore minIdx/maxIdx will be
@@ -911,8 +948,8 @@ CV_EXPORTS void minMaxLoc(const SparseMat& a, double* minVal,
 The function #reduce reduces the matrix to a vector by treating the matrix rows/columns as a set of
 1D vectors and performing the specified operation on the vectors until a single row/column is
 obtained. For example, the function can be used to compute horizontal and vertical projections of a
-raster image. In case of #REDUCE_MAX and #REDUCE_MIN , the output image should have the same type as the source one.
-In case of #REDUCE_SUM, #REDUCE_SUM2 and #REDUCE_AVG , the output may have a larger element bit-depth to preserve accuracy.
+raster image. In case of #REDUCE_MAX and #REDUCE_MIN, the output image should have the same type as the source one.
+In case of #REDUCE_SUM, #REDUCE_SUM2 and #REDUCE_AVG, the output may have a larger element bit-depth to preserve accuracy.
 And multi-channel arrays are also supported in these two reduction modes.
 
 The following code demonstrates its usage for a single channel matrix.
@@ -966,7 +1003,7 @@ CV_EXPORTS_W void merge(InputArrayOfArrays mv, OutputArray dst);
 The function cv::split splits a multi-channel array into separate single-channel arrays:
 \f[\texttt{mv} [c](I) =  \texttt{src} (I)_c\f]
 If you need to extract a single channel or do some other sophisticated channel permutation, use
-mixChannels .
+mixChannels.
 
 The following example demonstrates how to split a 3-channel matrix into 3 single channel matrices.
 @snippet snippets/core_split.cpp example
@@ -1107,7 +1144,7 @@ The example scenarios of using the function are the following:
 flipping around the x-axis and positive value (for example, 1) means
 flipping around y-axis. Negative value (for example, -1) means flipping
 around both axes.
-@sa transpose , repeat , completeSymm
+@sa transpose, repeat, completeSymm
 */
 CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode);
 
@@ -1118,6 +1155,13 @@ CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode);
  */
 CV_EXPORTS_W void flipND(InputArray src, OutputArray dst, int axis);
 
+/** @brief Broadcast the given Mat to the given shape.
+ * @param src input array
+ * @param shape target shape. Should be a list of CV_32S numbers. Note that negative values are not supported.
+ * @param dst output array that has the given shape
+ */
+CV_EXPORTS_W void broadcast(InputArray src, InputArray shape, OutputArray dst);
+
 enum RotateFlags {
     ROTATE_90_CLOCKWISE = 0, //!<Rotate 90 degrees clockwise
     ROTATE_180 = 1, //!<Rotate 180 degrees clockwise
@@ -1132,7 +1176,7 @@ The function cv::rotate rotates the array in one of three different ways:
 @param dst output array of the same type as src.  The size is the same with ROTATE_180,
 and the rows and cols are switched for ROTATE_90_CLOCKWISE and ROTATE_90_COUNTERCLOCKWISE.
 @param rotateCode an enum to specify how to rotate the array; see the enum #RotateFlags
-@sa transpose , repeat , completeSymm, flip, RotateFlags
+@sa transpose, repeat, completeSymm, flip, RotateFlags
 */
 CV_EXPORTS_W void rotate(InputArray src, OutputArray dst, int rotateCode);
 
@@ -1405,6 +1449,9 @@ The function cv::absdiff calculates:
     multi-channel arrays, each channel is processed independently.
 @note Saturation is not applied when the arrays have the depth CV_32S.
 You may even get a negative value in the case of overflow.
+@note (Python) Be careful to difference behaviour between src1/src2 are single number and they are tuple/array.
+`absdiff(src,X)` means `absdiff(src,(X,X,X,X))`.
+`absdiff(src,(X,))` means `absdiff(src,(X,0,0,0))`.
 @param src1 first input array or a scalar.
 @param src2 second input array or a scalar.
 @param dst output array that has the same size and type as input arrays.
@@ -1563,7 +1610,7 @@ converts denormalized values to zeros on output. Special values (NaN,
 Inf) are not handled.
 @param src input array.
 @param dst output array of the same size and type as src.
-@sa log , cartToPolar , polarToCart , phase , pow , sqrt , magnitude
+@sa log, cartToPolar, polarToCart, phase, pow, sqrt, magnitude
 */
 CV_EXPORTS_W void exp(InputArray src, OutputArray dst);
 
@@ -1675,7 +1722,7 @@ elements.
 CV_EXPORTS_W bool checkRange(InputArray a, bool quiet = true, CV_OUT Point* pos = 0,
                             double minVal = -DBL_MAX, double maxVal = DBL_MAX);
 
-/** @brief converts NaNs to the given number
+/** @brief Replaces NaNs by given number
 @param a input/output matrix (CV_32F type).
 @param val value to convert the NaNs
 */
@@ -1707,7 +1754,7 @@ should have the same type as src1 and src2.
 @param dst output matrix; it has the proper size and the same type as
 input matrices.
 @param flags operation flags (cv::GemmFlags)
-@sa mulTransposed , transform
+@sa mulTransposed, transform
 */
 CV_EXPORTS_W void gemm(InputArray src1, InputArray src2, double alpha,
                        InputArray src3, double beta, OutputArray dst, int flags = 0);
@@ -1717,7 +1764,7 @@ CV_EXPORTS_W void gemm(InputArray src1, InputArray src2, double alpha,
 The function cv::mulTransposed calculates the product of src and its
 transposition:
 \f[\texttt{dst} = \texttt{scale} ( \texttt{src} - \texttt{delta} )^T ( \texttt{src} - \texttt{delta} )\f]
-if aTa=true , and
+if aTa=true, and
 \f[\texttt{dst} = \texttt{scale} ( \texttt{src} - \texttt{delta} ) ( \texttt{src} - \texttt{delta} )^T\f]
 otherwise. The function is used to calculate the covariance matrix. With
 zero delta, it can be used as a faster substitute for general matrix
@@ -1730,7 +1777,7 @@ description below.
 @param delta Optional delta matrix subtracted from src before the
 multiplication. When the matrix is empty ( delta=noArray() ), it is
 assumed to be zero, that is, nothing is subtracted. If it has the same
-size as src , it is simply subtracted. Otherwise, it is "repeated" (see
+size as src, it is simply subtracted. Otherwise, it is "repeated" (see
 repeat ) to cover the full src and then subtracted. Type of the delta
 matrix, when it is not empty, must be the same as the type of created
 output matrix. See the dtype parameter description below.
@@ -1760,7 +1807,7 @@ CV_EXPORTS_W void transpose(InputArray src, OutputArray dst);
  * @note Input should be continuous single-channel matrix.
  * @param src input array.
  * @param order a permutation of [0,1,..,N-1] where N is the number of axes of src.
- * The i’th axis of dst will correspond to the axis numbered order[i] of the input.
+ * The i'th axis of dst will correspond to the axis numbered order[i] of the input.
  * @param dst output array of the same type as src.
  */
 CV_EXPORTS_W void transposeND(InputArray src, const std::vector<int>& order, OutputArray dst);
@@ -2004,7 +2051,7 @@ in the descending order.
 @param eigenvectors output matrix of eigenvectors; it has the same size and type as src; the
 eigenvectors are stored as subsequent matrix rows, in the same order as the corresponding
 eigenvalues.
-@sa eigenNonSymmetric, completeSymm , PCA
+@sa eigenNonSymmetric, completeSymm, PCA
 */
 CV_EXPORTS_W bool eigen(InputArray src, OutputArray eigenvalues,
                         OutputArray eigenvectors = noArray());
@@ -2144,7 +2191,7 @@ So, the function chooses an operation mode depending on the flags and size of th
 
 If #DFT_SCALE is set, the scaling is done after the transformation.
 
-Unlike dct , the function supports arrays of arbitrary size. But only those arrays are processed
+Unlike dct, the function supports arrays of arbitrary size. But only those arrays are processed
 efficiently, whose sizes can be factorized in a product of small prime numbers (2, 3, and 5 in the
 current implementation). Such an efficient DFT size can be calculated using the getOptimalDFTSize
 method.
@@ -2227,8 +2274,8 @@ nonzeroRows rows of the input array (#DFT_INVERSE is not set) or only the first
 output array (#DFT_INVERSE is set) contain non-zeros, thus, the function can handle the rest of the
 rows more efficiently and save some time; this technique is very useful for calculating array
 cross-correlation or convolution using DFT.
-@sa dct , getOptimalDFTSize , mulSpectrums, filter2D , matchTemplate , flip , cartToPolar ,
-magnitude , phase
+@sa dct, getOptimalDFTSize, mulSpectrums, filter2D, matchTemplate, flip, cartToPolar,
+magnitude, phase
 */
 CV_EXPORTS_W void dft(InputArray src, OutputArray dst, int flags = 0, int nonzeroRows = 0);
 
@@ -2265,9 +2312,9 @@ floating-point array:
     \f[X =  \left (C^{(N)} \right )^T  \cdot X  \cdot C^{(N)}\f]
 
 The function chooses the mode of operation by looking at the flags and size of the input array:
--   If (flags & #DCT_INVERSE) == 0 , the function does a forward 1D or 2D transform. Otherwise, it
+-   If (flags & #DCT_INVERSE) == 0, the function does a forward 1D or 2D transform. Otherwise, it
     is an inverse 1D or 2D transform.
--   If (flags & #DCT_ROWS) != 0 , the function performs a 1D transform of each row.
+-   If (flags & #DCT_ROWS) != 0, the function performs a 1D transform of each row.
 -   If the array is a single column or a single row, the function performs a 1D transform.
 -   If none of the above is true, the function performs a 2D transform.
 
@@ -2283,7 +2330,7 @@ of a vector of size N/2 . Thus, the optimal DCT size N1 \>= N can be calculated
 @param src input floating-point array.
 @param dst output array of the same size and type as src .
 @param flags transformation flags as a combination of cv::DftFlags (DCT_*)
-@sa dft , getOptimalDFTSize , idct
+@sa dft, getOptimalDFTSize, idct
 */
 CV_EXPORTS_W void dct(InputArray src, OutputArray dst, int flags = 0);
 
@@ -2302,7 +2349,7 @@ CV_EXPORTS_W void idct(InputArray src, OutputArray dst, int flags = 0);
 The function cv::mulSpectrums performs the per-element multiplication of the two CCS-packed or complex
 matrices that are results of a real or complex Fourier transform.
 
-The function, together with dft and idft , may be used to calculate convolution (pass conjB=false )
+The function, together with dft and idft, may be used to calculate convolution (pass conjB=false )
 or correlation (pass conjB=true ) of two arrays rapidly. When the arrays are complex, they are
 simply multiplied (per element) with an optional conjugation of the second-array elements. When the
 arrays are real, they are assumed to be CCS-packed (see dft for details).
@@ -2336,7 +2383,7 @@ While the function cannot be used directly to estimate the optimal vector size f
 (since the current DCT implementation supports only even-size vectors), it can be easily processed
 as getOptimalDFTSize((vecsize+1)/2)\*2.
 @param vecsize vector size.
-@sa dft , dct , idft , idct , mulSpectrums
+@sa dft, dct, idft, idct, mulSpectrums
 */
 CV_EXPORTS_W int getOptimalDFTSize(int vecsize);
 
@@ -2888,7 +2935,7 @@ class CV_EXPORTS RNG
 
     The methods transform the state using the MWC algorithm and return the
     next random number. The first form is equivalent to RNG::next . The
-    second form returns the random number modulo N , which means that the
+    second form returns the random number modulo N, which means that the
     result is in the range [0, N) .
     */
     unsigned operator ()();
diff --git a/modules/core/include/opencv2/core/async.hpp b/modules/core/include/opencv2/core/async.hpp
index 54560c7d001a..98868a130b23 100644
--- a/modules/core/include/opencv2/core/async.hpp
+++ b/modules/core/include/opencv2/core/async.hpp
@@ -7,10 +7,8 @@
 
 #include <opencv2/core/mat.hpp>
 
-#ifdef CV_CXX11
 //#include <future>
 #include <chrono>
-#endif
 
 namespace cv {
 
@@ -69,7 +67,6 @@ class CV_EXPORTS_W AsyncArray
 
     CV_WRAP bool valid() const CV_NOEXCEPT;
 
-#ifdef CV_CXX11
     inline AsyncArray(AsyncArray&& o) { p = o.p; o.p = NULL; }
     inline AsyncArray& operator=(AsyncArray&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; }
 
@@ -89,7 +86,6 @@ class CV_EXPORTS_W AsyncArray
     std::future<Mat> getFutureMat() const;
     std::future<UMat> getFutureUMat() const;
 #endif
-#endif
 
 
     // PImpl
diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index 21a61a4e53c8..cc4cc0ddd25c 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -271,11 +271,11 @@ enum BorderTypes {
     BORDER_REFLECT     = 2, //!< `fedcba|abcdefgh|hgfedcb`
     BORDER_WRAP        = 3, //!< `cdefgh|abcdefgh|abcdefg`
     BORDER_REFLECT_101 = 4, //!< `gfedcb|abcdefgh|gfedcba`
-    BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno`
+    BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno` - Treats outliers as transparent.
 
     BORDER_REFLECT101  = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
     BORDER_DEFAULT     = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
-    BORDER_ISOLATED    = 16 //!< do not look outside of ROI
+    BORDER_ISOLATED    = 16 //!< Interpolation restricted within the ROI boundaries.
 };
 
 //! @} core_array
diff --git a/modules/core/include/opencv2/core/bindings_utils.hpp b/modules/core/include/opencv2/core/bindings_utils.hpp
index 64f346570a29..9c8f9e0f2bce 100644
--- a/modules/core/include/opencv2/core/bindings_utils.hpp
+++ b/modules/core/include/opencv2/core/bindings_utils.hpp
@@ -75,20 +75,6 @@ String dumpString(const String& argument)
     return cv::format("String: %s", argument.c_str());
 }
 
-CV_WRAP static inline
-String testOverloadResolution(int value, const Point& point = Point(42, 24))
-{
-    return format("overload (int=%d, point=(x=%d, y=%d))", value, point.x,
-                  point.y);
-}
-
-CV_WRAP static inline
-String testOverloadResolution(const Rect& rect)
-{
-    return format("overload (rect=(x=%d, y=%d, w=%d, h=%d))", rect.x, rect.y,
-                  rect.width, rect.height);
-}
-
 CV_WRAP static inline
 String dumpRect(const Rect& argument)
 {
@@ -111,6 +97,42 @@ String dumpRotatedRect(const RotatedRect& argument)
                   argument.size.height, argument.angle);
 }
 
+CV_WRAP static inline
+String dumpRange(const Range& argument)
+{
+    if (argument == Range::all())
+    {
+        return "range: all";
+    }
+    else
+    {
+        return format("range: (s=%d, e=%d)", argument.start, argument.end);
+    }
+}
+
+CV_EXPORTS_W String dumpVectorOfInt(const std::vector<int>& vec);
+
+CV_EXPORTS_W String dumpVectorOfDouble(const std::vector<double>& vec);
+
+CV_EXPORTS_W String dumpVectorOfRect(const std::vector<Rect>& vec);
+
+
+//! @cond IGNORED
+
+CV_WRAP static inline
+String testOverloadResolution(int value, const Point& point = Point(42, 24))
+{
+    return format("overload (int=%d, point=(x=%d, y=%d))", value, point.x,
+                  point.y);
+}
+
+CV_WRAP static inline
+String testOverloadResolution(const Rect& rect)
+{
+    return format("overload (rect=(x=%d, y=%d, w=%d, h=%d))", rect.x, rect.y,
+                  rect.width, rect.height);
+}
+
 CV_WRAP static inline
 RotatedRect testRotatedRect(float x, float y, float w, float h, float angle)
 {
@@ -126,19 +148,6 @@ std::vector<RotatedRect> testRotatedRectVector(float x, float y, float w, float
     return result;
 }
 
-CV_WRAP static inline
-String dumpRange(const Range& argument)
-{
-    if (argument == Range::all())
-    {
-        return "range: all";
-    }
-    else
-    {
-        return format("range: (s=%d, e=%d)", argument.start, argument.end);
-    }
-}
-
 CV_WRAP static inline
 int testOverwriteNativeMethod(int argument)
 {
@@ -151,12 +160,6 @@ String testReservedKeywordConversion(int positional_argument, int lambda = 2, in
     return format("arg=%d, lambda=%d, from=%d", positional_argument, lambda, from);
 }
 
-CV_EXPORTS_W String dumpVectorOfInt(const std::vector<int>& vec);
-
-CV_EXPORTS_W String dumpVectorOfDouble(const std::vector<double>& vec);
-
-CV_EXPORTS_W String dumpVectorOfRect(const std::vector<Rect>& vec);
-
 CV_WRAP static inline
 void generateVectorOfRect(size_t len, CV_OUT std::vector<Rect>& vec)
 {
@@ -323,6 +326,8 @@ class CV_EXPORTS_W CV_WRAP_AS(ExportClassName) OriginalClassName
 typedef OriginalClassName::Params OriginalClassName_Params;
 } // namespace nested
 
+//! @endcond IGNORED
+
 namespace fs {
     CV_EXPORTS_W cv::String getCacheDirectoryForDownloads();
 } // namespace fs
diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp
index 9c948ce00ac5..9d210ed7b55b 100644
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -198,16 +198,32 @@ class CV_EXPORTS_W GpuMat
     CV_WRAP GpuMat clone() const;
 
     //! copies the GpuMat content to device memory (Blocking call)
-    CV_WRAP void copyTo(OutputArray dst) const;
+    void copyTo(OutputArray dst) const;
+    //! bindings overload which copies the GpuMat content to device memory (Blocking call)
+    CV_WRAP void copyTo(CV_OUT GpuMat& dst) const {
+        copyTo(static_cast<OutputArray>(dst));
+    }
 
     //! copies the GpuMat content to device memory (Non-Blocking call)
-    CV_WRAP void copyTo(OutputArray dst, Stream& stream) const;
+    void copyTo(OutputArray dst, Stream& stream) const;
+    //! bindings overload which copies the GpuMat content to device memory (Non-Blocking call)
+    CV_WRAP void copyTo(CV_OUT GpuMat& dst, Stream& stream) const {
+        copyTo(static_cast<OutputArray>(dst), stream);
+    }
 
     //! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call)
-    CV_WRAP void copyTo(OutputArray dst, InputArray mask) const;
+    void copyTo(OutputArray dst, InputArray mask) const;
+    //! bindings overload which copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call)
+    CV_WRAP void copyTo(CV_OUT GpuMat& dst, GpuMat& mask) const {
+        copyTo(static_cast<OutputArray>(dst), static_cast<InputArray>(mask));
+    }
 
     //! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call)
-    CV_WRAP void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
+    void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
+    //! bindings overload which copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call)
+    CV_WRAP void copyTo(CV_OUT GpuMat& dst, GpuMat& mask, Stream& stream) const {
+        copyTo(static_cast<OutputArray>(dst), static_cast<InputArray>(mask), stream);
+    }
 
     //! sets some of the GpuMat elements to s (Blocking call)
     CV_WRAP GpuMat& setTo(Scalar s);
@@ -222,19 +238,31 @@ class CV_EXPORTS_W GpuMat
     CV_WRAP GpuMat& setTo(Scalar s, InputArray mask, Stream& stream);
 
     //! converts GpuMat to another datatype (Blocking call)
-    CV_WRAP void convertTo(OutputArray dst, int rtype) const;
+    void convertTo(OutputArray dst, int rtype) const;
 
     //! converts GpuMat to another datatype (Non-Blocking call)
-    CV_WRAP void convertTo(OutputArray dst, int rtype, Stream& stream) const;
+    void convertTo(OutputArray dst, int rtype, Stream& stream) const;
+    //! bindings overload which converts GpuMat to another datatype (Non-Blocking call)
+    CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, Stream& stream) const {
+        convertTo(static_cast<OutputArray>(dst), rtype, stream);
+    }
 
     //! converts GpuMat to another datatype with scaling (Blocking call)
-    CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
+    void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
+    //! bindings overload which converts GpuMat to another datatype with scaling(Blocking call)
+    CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, double alpha = 1.0, double beta = 0.0) const {
+        convertTo(static_cast<OutputArray>(dst), rtype, alpha, beta);
+    }
 
     //! converts GpuMat to another datatype with scaling (Non-Blocking call)
-    CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
+    void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
 
     //! converts GpuMat to another datatype with scaling (Non-Blocking call)
-    CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
+    void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
+    //! bindings overload which converts GpuMat to another datatype with scaling (Non-Blocking call)
+    CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, double alpha, double beta, Stream& stream) const {
+        convertTo(static_cast<OutputArray>(dst), rtype, alpha, beta, stream);
+    }
 
     CV_WRAP void assignTo(GpuMat& m, int type = -1) const;
 
@@ -577,7 +605,7 @@ CV_EXPORTS_W void ensureSizeIsEnough(int rows, int cols, int type, OutputArray a
  */
 CV_EXPORTS_W GpuMat inline createGpuMatFromCudaMemory(int rows, int cols, int type, size_t cudaMemoryAddress, size_t step = Mat::AUTO_STEP) {
     return GpuMat(rows, cols, type, reinterpret_cast<void*>(cudaMemoryAddress), step);
-};
+}
 
  /** @overload
 @param size 2D array size: Size(cols, rows). In the Size() constructor, the number of rows and the number of columns go in the reverse order.
@@ -588,7 +616,7 @@ CV_EXPORTS_W GpuMat inline createGpuMatFromCudaMemory(int rows, int cols, int ty
  */
 CV_EXPORTS_W inline GpuMat createGpuMatFromCudaMemory(Size size, int type, size_t cudaMemoryAddress, size_t step = Mat::AUTO_STEP) {
     return GpuMat(size, type, reinterpret_cast<void*>(cudaMemoryAddress), step);
-};
+}
 
 /** @brief BufferPool for use with CUDA streams
 
diff --git a/modules/core/include/opencv2/core/cuda/common.hpp b/modules/core/include/opencv2/core/cuda/common.hpp
index 134809678d5c..1e1d5de1b08e 100644
--- a/modules/core/include/opencv2/core/cuda/common.hpp
+++ b/modules/core/include/opencv2/core/cuda/common.hpp
@@ -99,7 +99,7 @@ namespace cv { namespace cuda
         }
 
 #if (CUDART_VERSION >= 12000)
-        template<class T> inline void createTextureObjectPitch2D(cudaTextureObject_t* tex, PtrStepSz<T>& img, const cudaTextureDesc& texDesc) {
+        template<class T> inline void createTextureObjectPitch2D(cudaTextureObject_t*, PtrStepSz<T>&, const cudaTextureDesc&) {
             CV_Error(cv::Error::GpuNotSupported, "Function removed in CUDA SDK 12"); }
 #else
         //TODO: remove from OpenCV 5.x
diff --git a/modules/core/include/opencv2/core/cuda/detail/reduce.hpp b/modules/core/include/opencv2/core/cuda/detail/reduce.hpp
index 8af20b0dc86d..05a672c3dc36 100644
--- a/modules/core/include/opencv2/core/cuda/detail/reduce.hpp
+++ b/modules/core/include/opencv2/core/cuda/detail/reduce.hpp
@@ -134,6 +134,22 @@ namespace cv { namespace cuda { namespace device
         {
             val = smem[tid];
         }
+
+        template <typename T, class Op>
+        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
+        {
+            T reg = smem[tid + delta];
+            smem[tid] = val = op(val, reg);
+        }
+
+        template <typename T, class Op>
+        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
+        {
+            T reg = shfl_down(val, delta, width);
+            val = op(val, reg);
+        }
+
+#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690
         template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                   typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
         __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
@@ -142,6 +158,7 @@ namespace cv { namespace cuda { namespace device
         {
             For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
         }
+
         template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                   typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
         __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
@@ -151,18 +168,6 @@ namespace cv { namespace cuda { namespace device
             For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
         }
 
-        template <typename T, class Op>
-        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
-        {
-            T reg = smem[tid + delta];
-            smem[tid] = val = op(val, reg);
-        }
-        template <typename T, class Op>
-        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
-        {
-            T reg = shfl_down(val, delta, width);
-            val = op(val, reg);
-        }
         template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                   typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                   class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
@@ -183,7 +188,31 @@ namespace cv { namespace cuda { namespace device
         {
             For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
         }
+#else
+        template <typename... P, typename... R>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P...>& smem, const thrust::tuple<R...>& val, unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P...> >::value>::loadToSmem(smem, val, tid);
+        }
 
+        template <typename... P, typename... R>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P...>& smem, const thrust::tuple<R...>& val, unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P...> >::value>::loadFromSmem(smem, val, tid);
+        }
+
+        template <typename... P, typename... R, class... Op>
+        __device__ __forceinline__ void merge(const thrust::tuple<P...>& smem, const thrust::tuple<R...>& val, unsigned int tid, unsigned int delta, const thrust::tuple<Op...>& op)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P...> >::value>::merge(smem, val, tid, delta, op);
+        }
+
+        template <typename... R, class... Op>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R...>& val, unsigned int delta, unsigned int width, const thrust::tuple<Op...>& op)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<R...> >::value>::mergeShfl(val, delta, width, op);
+        }
+#endif
         template <unsigned int N> struct Generic
         {
             template <typename Pointer, typename Reference, class Op>
diff --git a/modules/core/include/opencv2/core/cuda/detail/reduce_key_val.hpp b/modules/core/include/opencv2/core/cuda/detail/reduce_key_val.hpp
index df37c173be64..4a248c83657e 100644
--- a/modules/core/include/opencv2/core/cuda/detail/reduce_key_val.hpp
+++ b/modules/core/include/opencv2/core/cuda/detail/reduce_key_val.hpp
@@ -177,6 +177,8 @@ namespace cv { namespace cuda { namespace device
         {
             data = smem[tid];
         }
+
+#if (CUDART_VERSION < 12040)
         template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                   typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
         __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
@@ -193,9 +195,18 @@ namespace cv { namespace cuda { namespace device
         {
             For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
         }
-
-        //////////////////////////////////////////////////////
-        // copyVals
+#else
+        template <typename... VP, typename... VR>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP...>& smem, const thrust::tuple<VR...>& data, unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::loadToSmem(smem, data, tid);
+        }
+        template <typename... VP, typename... VR>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP...>& smem, const thrust::tuple<VR...>& data, unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::loadFromSmem(smem, data, tid);
+        }
+#endif
 
         template <typename V>
         __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
@@ -207,24 +218,6 @@ namespace cv { namespace cuda { namespace device
         {
             svals[tid] = val = svals[tid + delta];
         }
-        template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
-        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
-                                                     unsigned int delta,
-                                                     int width)
-        {
-            For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
-        }
-        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
-                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
-        __device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
-                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
-                                                 unsigned int tid, unsigned int delta)
-        {
-            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
-        }
-
-        //////////////////////////////////////////////////////
-        // merge
 
         template <typename K, typename V, class Cmp>
         __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
@@ -248,6 +241,24 @@ namespace cv { namespace cuda { namespace device
                 copyVals(svals, val, tid, delta);
             }
         }
+
+#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690
+        template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                     unsigned int delta,
+                                                     int width)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
+        }
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                 unsigned int tid, unsigned int delta)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
+        }
+
         template <typename K,
                   typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                   class Cmp>
@@ -305,7 +316,61 @@ namespace cv { namespace cuda { namespace device
         {
             For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
         }
+#else
+        template <typename... VR>
+        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR...>& val, unsigned int delta, int width)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VR...> >::value>::copyShfl(val, delta, width);
+        }
+        template <typename... VP, typename... VR>
+        __device__ __forceinline__ void copyVals(const thrust::tuple<VP...>& svals, const thrust::tuple<VR...>& val, unsigned int tid, unsigned int delta)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::copy(svals, val, tid, delta);
+        }
+
+        template <typename K, typename... VR, class Cmp>
+        __device__ __forceinline__ void mergeShfl(K& key, const thrust::tuple<VR...>& val, const Cmp& cmp, unsigned int delta, int width)
+        {
+            K reg = shfl_down(key, delta, width);
+
+            if (cmp(reg, key))
+            {
+                key = reg;
+                copyValsShfl(val, delta, width);
+            }
+        }
+        template <typename K, typename... VP, typename... VR, class Cmp>
+        __device__ __forceinline__ void merge(volatile K* skeys, K& key, const thrust::tuple<VP...>& svals,
+                                              const thrust::tuple<VR...>& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
+        {
+            K reg = skeys[tid + delta];
+
+            if (cmp(reg, key))
+            {
+                skeys[tid] = key = reg;
+                copyVals(svals, val, tid, delta);
+            }
+        }
+        template <typename... KR, typename... VR, class... Cmp>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR...>& key,
+                                                  const thrust::tuple<VR...>& val,
+                                                  const thrust::tuple<Cmp...>& cmp,
+                                                  unsigned int delta, int width)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<KR...> >::value>::mergeShfl(key, val, cmp, delta, width);
+        }
+        template <typename... KP, typename... KR, typename... VP, typename... VR, class... Cmp>
+        __device__ __forceinline__ void merge(const thrust::tuple<KP...>& skeys,
+                                              const thrust::tuple<KR...>& key,
+                                              const thrust::tuple<VP...>& svals,
+                                              const thrust::tuple<VR...>& val,
+                                              const thrust::tuple<Cmp...>& cmp,
+                                              unsigned int tid, unsigned int delta)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP...> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
+        }
 
+#endif
         //////////////////////////////////////////////////////
         // Generic
 
diff --git a/modules/core/include/opencv2/core/cuda/reduce.hpp b/modules/core/include/opencv2/core/cuda/reduce.hpp
index 5de365081789..fb74de95a8ab 100644
--- a/modules/core/include/opencv2/core/cuda/reduce.hpp
+++ b/modules/core/include/opencv2/core/cuda/reduce.hpp
@@ -64,6 +64,12 @@ namespace cv { namespace cuda { namespace device
     {
         reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
     }
+    template <unsigned int N, typename K, typename V, class Cmp>
+    __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp)
+    {
+        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
+    }
+#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690
     template <int N,
               typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
               typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
@@ -79,11 +85,6 @@ namespace cv { namespace cuda { namespace device
                 const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>&>(smem, val, tid, op);
     }
 
-    template <unsigned int N, typename K, typename V, class Cmp>
-    __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp)
-    {
-        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
-    }
     template <unsigned int N,
               typename K,
               typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
@@ -99,6 +100,7 @@ namespace cv { namespace cuda { namespace device
                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
                 const Cmp&>(skeys, key, svals, val, tid, cmp);
     }
+
     template <unsigned int N,
               typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
               typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
@@ -120,6 +122,25 @@ namespace cv { namespace cuda { namespace device
                 const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>&
                 >(skeys, key, svals, val, tid, cmp);
     }
+#else
+    template <int N, typename... P, typename... R, class... Op>
+    __device__ __forceinline__ void reduce(const thrust::tuple<P...>& smem, const thrust::tuple<R...>& val, unsigned int tid, const thrust::tuple<Op...>& op)
+    {
+        reduce_detail::Dispatcher<N>::reductor::template reduce<const thrust::tuple<P...>&, const thrust::tuple<R...>&, const thrust::tuple<Op...>&>(smem, val, tid, op);
+    }
+
+    template <unsigned int N, typename K, typename... VP, typename... VR, class Cmp>
+    __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, const thrust::tuple<VP...>& svals, const thrust::tuple<VR...>& val, unsigned int tid, const Cmp& cmp)
+    {
+        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, const thrust::tuple<VP...>&, const thrust::tuple<VR...>&, const Cmp&>(skeys, key, svals, val, tid, cmp);
+    }
+
+    template <unsigned int N, typename... KP, typename... KR, typename... VP, typename... VR, class... Cmp>
+    __device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP...>& skeys, const thrust::tuple<KR...>& key, const thrust::tuple<VP...>& svals, const thrust::tuple<VR...>& val, unsigned int tid, const thrust::tuple<Cmp...>& cmp)
+    {
+        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<const thrust::tuple<KP...>&, const thrust::tuple<KR...>&, const thrust::tuple<VP...>&, const thrust::tuple<VR...>&, const thrust::tuple<Cmp...>&>(skeys, key, svals, val, tid, cmp);
+    }
+#endif
 
     // smem_tuple
 
diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
index 3235b6317e00..0817e7ec7066 100644
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -141,20 +141,16 @@
 # include <Intrin.h>
 # include <arm_neon.h>
 # define CV_NEON 1
-#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
+#elif defined(__ARM_NEON)
 #  include <arm_neon.h>
 #  define CV_NEON 1
 #endif
 
 #if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_vector_071)
-# include<riscv-vector.h>
+# include<riscv_vector.h>
 # define CV_RVV071 1
 #endif
 
-#if defined(__ARM_NEON__) || defined(__aarch64__)
-#  include <arm_neon.h>
-#endif
-
 #ifdef CV_CPU_COMPILE_VSX
 #  include <altivec.h>
 #  undef vector
@@ -172,6 +168,11 @@
 #  define CV_MSA 1
 #endif
 
+#ifdef CV_CPU_COMPILE_LSX
+#  include <lsxintrin.h>
+#  define CV_LSX 1
+#endif
+
 #ifdef CV_CPU_COMPILE_LASX
 #  include <lasxintrin.h>
 #  define CV_LASX 1
@@ -224,7 +225,7 @@ struct VZeroUpperGuard {
 # include <Intrin.h>
 # include <arm_neon.h>
 # define CV_NEON 1
-#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
+#elif defined(__ARM_NEON)
 #  include <arm_neon.h>
 #  define CV_NEON 1
 #elif defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
@@ -376,6 +377,10 @@ struct VZeroUpperGuard {
 #  define CV_RVV 0
 #endif
 
+#ifndef CV_LSX
+#  define CV_LSX 0
+#endif
+
 #ifndef CV_LASX
 #  define CV_LASX 0
 #endif
diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h
index 41fc9d50fa1e..04b00d202443 100644
--- a/modules/core/include/opencv2/core/cv_cpu_helper.h
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@@ -441,6 +441,48 @@
 #endif
 #define __CV_CPU_DISPATCH_CHAIN_NEON_DOTPROD(fn, args, mode, ...)  CV_CPU_CALL_NEON_DOTPROD(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
 
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_FP16
+#  define CV_TRY_NEON_FP16 1
+#  define CV_CPU_FORCE_NEON_FP16 1
+#  define CV_CPU_HAS_SUPPORT_NEON_FP16 1
+#  define CV_CPU_CALL_NEON_FP16(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_NEON_FP16_(fn, args) return (opt_NEON_FP16::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_FP16
+#  define CV_TRY_NEON_FP16 1
+#  define CV_CPU_FORCE_NEON_FP16 0
+#  define CV_CPU_HAS_SUPPORT_NEON_FP16 (cv::checkHardwareSupport(CV_CPU_NEON_FP16))
+#  define CV_CPU_CALL_NEON_FP16(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_FP16) return (opt_NEON_FP16::fn args)
+#  define CV_CPU_CALL_NEON_FP16_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_FP16) return (opt_NEON_FP16::fn args)
+#else
+#  define CV_TRY_NEON_FP16 0
+#  define CV_CPU_FORCE_NEON_FP16 0
+#  define CV_CPU_HAS_SUPPORT_NEON_FP16 0
+#  define CV_CPU_CALL_NEON_FP16(fn, args)
+#  define CV_CPU_CALL_NEON_FP16_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_NEON_FP16(fn, args, mode, ...)  CV_CPU_CALL_NEON_FP16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_BF16
+#  define CV_TRY_NEON_BF16 1
+#  define CV_CPU_FORCE_NEON_BF16 1
+#  define CV_CPU_HAS_SUPPORT_NEON_BF16 1
+#  define CV_CPU_CALL_NEON_BF16(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_NEON_BF16_(fn, args) return (opt_NEON_BF16::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_BF16
+#  define CV_TRY_NEON_BF16 1
+#  define CV_CPU_FORCE_NEON_BF16 0
+#  define CV_CPU_HAS_SUPPORT_NEON_BF16 (cv::checkHardwareSupport(CV_CPU_NEON_BF16))
+#  define CV_CPU_CALL_NEON_BF16(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_BF16) return (opt_NEON_BF16::fn args)
+#  define CV_CPU_CALL_NEON_BF16_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_BF16) return (opt_NEON_BF16::fn args)
+#else
+#  define CV_TRY_NEON_BF16 0
+#  define CV_CPU_FORCE_NEON_BF16 0
+#  define CV_CPU_HAS_SUPPORT_NEON_BF16 0
+#  define CV_CPU_CALL_NEON_BF16(fn, args)
+#  define CV_CPU_CALL_NEON_BF16_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_NEON_BF16(fn, args, mode, ...)  CV_CPU_CALL_NEON_BF16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA
 #  define CV_TRY_MSA 1
 #  define CV_CPU_FORCE_MSA 1
@@ -525,6 +567,27 @@
 #endif
 #define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...)  CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
 
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LSX
+#  define CV_TRY_LSX 1
+#  define CV_CPU_FORCE_LSX 1
+#  define CV_CPU_HAS_SUPPORT_LSX 1
+#  define CV_CPU_CALL_LSX(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_LSX_(fn, args) return (opt_LSX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LSX
+#  define CV_TRY_LSX 1
+#  define CV_CPU_FORCE_LSX 0
+#  define CV_CPU_HAS_SUPPORT_LSX (cv::checkHardwareSupport(CV_CPU_LSX))
+#  define CV_CPU_CALL_LSX(fn, args) if (CV_CPU_HAS_SUPPORT_LSX) return (opt_LSX::fn args)
+#  define CV_CPU_CALL_LSX_(fn, args) if (CV_CPU_HAS_SUPPORT_LSX) return (opt_LSX::fn args)
+#else
+#  define CV_TRY_LSX 0
+#  define CV_CPU_FORCE_LSX 0
+#  define CV_CPU_HAS_SUPPORT_LSX 0
+#  define CV_CPU_CALL_LSX(fn, args)
+#  define CV_CPU_CALL_LSX_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_LSX(fn, args, mode, ...)  CV_CPU_CALL_LSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LASX
 #  define CV_TRY_LASX 1
 #  define CV_CPU_FORCE_LASX 1
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 8307ca7d1ce3..748ecb9eceeb 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -201,6 +201,14 @@ namespace cv {
 #  define CV_ICC   __INTEL_COMPILER
 #endif
 
+#if defined _WIN32
+#  define CV_CDECL __cdecl
+#  define CV_STDCALL __stdcall
+#else
+#  define CV_CDECL
+#  define CV_STDCALL
+#endif
+
 #ifndef CV_INLINE
 #  if defined __cplusplus
 #    define CV_INLINE static inline
@@ -269,6 +277,8 @@ namespace cv {
 
 #define CV_CPU_NEON             100
 #define CV_CPU_NEON_DOTPROD     101
+#define CV_CPU_NEON_FP16        102
+#define CV_CPU_NEON_BF16        103
 
 #define CV_CPU_MSA              150
 
@@ -279,7 +289,8 @@ namespace cv {
 
 #define CV_CPU_RVV              210
 
-#define CV_CPU_LASX             230
+#define CV_CPU_LSX              230
+#define CV_CPU_LASX             231
 
 // CPU features groups
 #define CV_CPU_AVX512_SKX       256
@@ -328,6 +339,8 @@ enum CpuFeatures {
 
     CPU_NEON            = 100,
     CPU_NEON_DOTPROD    = 101,
+    CPU_NEON_FP16       = 102,
+    CPU_NEON_BF16       = 103,
 
     CPU_MSA             = 150,
 
@@ -338,7 +351,8 @@ enum CpuFeatures {
 
     CPU_RVV             = 210,
 
-    CPU_LASX             = 230,
+    CPU_LSX             = 230,
+    CPU_LASX            = 231,
 
     CPU_AVX512_SKX      = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
     CPU_AVX512_COMMON   = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512
@@ -469,11 +483,14 @@ Cv64suf;
 #define CV_WRAP_MAPPABLE(mappable)
 #define CV_WRAP_PHANTOM(phantom_header)
 #define CV_WRAP_DEFAULT(val)
+/* Indicates that the function parameter has filesystem path semantic */
+#define CV_WRAP_FILE_PATH
 
 /****************************************************************************************\
 *                                  Matrix type (Mat)                                     *
 \****************************************************************************************/
 
+#define CV_MAX_DIM              32
 #define CV_MAT_CN_MASK          ((CV_CN_MAX - 1) << CV_CN_SHIFT)
 #define CV_MAT_CN(flags)        ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
 #define CV_MAT_TYPE_MASK        (CV_DEPTH_MAX*CV_CN_MAX - 1)
@@ -500,6 +517,13 @@ Cv64suf;
 #  define MAX(a,b)  ((a) < (b) ? (b) : (a))
 #endif
 
+/** min & max without jumps */
+#define CV_IMIN(a, b)  ((a) ^ (((a)^(b)) & (((a) < (b)) - 1)))
+#define CV_IMAX(a, b)  ((a) ^ (((a)^(b)) & (((a) > (b)) - 1)))
+#define CV_SWAP(a,b,t) ((t) = (a), (a) = (b), (b) = (t))
+#define CV_CMP(a,b)    (((a) > (b)) - ((a) < (b)))
+#define CV_SIGN(a)     CV_CMP((a),0)
+
 ///////////////////////////////////////// Enum operators ///////////////////////////////////////
 
 /**
@@ -745,88 +769,43 @@ __CV_ENUM_FLAGS_BITWISE_XOR_EQ   (EnumType, EnumType)
 
 
 /****************************************************************************************\
-*                      CV_NODISCARD attribute (deprecated, GCC only)                     *
-* DONT USE: use instead the standard CV_NODISCARD_STD macro above                        *
-*           this legacy method silently fails to issue warning until some version        *
-*           after gcc 6.3.0. Yet with gcc 7+ you can use the above standard method       *
-*           which makes this method useless. Don't use it.                               *
-* @deprecated use instead CV_NODISCARD_STD                                               *
+*                                    C++ 11                                              *
 \****************************************************************************************/
-#ifndef CV_NODISCARD
-#  if defined(__GNUC__)
-#    define CV_NODISCARD __attribute__((__warn_unused_result__))
-#  elif defined(__clang__) && defined(__has_attribute)
-#    if __has_attribute(__warn_unused_result__)
-#      define CV_NODISCARD __attribute__((__warn_unused_result__))
+#ifdef __cplusplus
+// MSVC was stuck at __cplusplus == 199711L for a long time, even where it supports C++11,
+// so check _MSC_VER instead. See:
+// <https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus>
+#  if defined(_MSC_VER)
+#    if _MSC_VER < 1800
+#      error "OpenCV 4.x+ requires enabled C++11 support"
 #    endif
+#  elif __cplusplus < 201103L
+#    error "OpenCV 4.x+ requires enabled C++11 support"
 #  endif
 #endif
-#ifndef CV_NODISCARD
-#  define CV_NODISCARD /* nothing by default */
-#endif
-
 
-/****************************************************************************************\
-*                                    C++ 11                                              *
-\****************************************************************************************/
-#ifndef CV_CXX11
-#  if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1800)
-#    define CV_CXX11 1
-#  endif
-#else
-#  if CV_CXX11 == 0
-#    undef CV_CXX11
-#  endif
-#endif
 #ifndef CV_CXX11
-#  error "OpenCV 4.x+ requires enabled C++11 support"
+#  define CV_CXX11 1
 #endif
 
-#define CV_CXX_MOVE_SEMANTICS 1
-#define CV_CXX_MOVE(x) std::move(x)
-#define CV_CXX_STD_ARRAY 1
-#include <array>
 #ifndef CV_OVERRIDE
 #  define CV_OVERRIDE override
 #endif
+
 #ifndef CV_FINAL
 #  define CV_FINAL final
 #endif
 
 #ifndef CV_NOEXCEPT
-#  if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/)
-#    define CV_NOEXCEPT noexcept
-#  endif
-#endif
-#ifndef CV_NOEXCEPT
-#  define CV_NOEXCEPT
+#  define CV_NOEXCEPT noexcept
 #endif
 
 #ifndef CV_CONSTEXPR
-#  if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/)
-#    define CV_CONSTEXPR constexpr
-#  endif
-#endif
-#ifndef CV_CONSTEXPR
-#  define CV_CONSTEXPR
+#  define CV_CONSTEXPR constexpr
 #endif
 
 // Integer types portability
-#ifdef OPENCV_STDINT_HEADER
-#include OPENCV_STDINT_HEADER
-#elif defined(__cplusplus)
-#if defined(_MSC_VER) && _MSC_VER < 1600 /* MSVS 2010 */
-namespace cv {
-typedef signed char int8_t;
-typedef unsigned char uint8_t;
-typedef signed short int16_t;
-typedef unsigned short uint16_t;
-typedef signed int int32_t;
-typedef unsigned int uint32_t;
-typedef signed __int64 int64_t;
-typedef unsigned __int64 uint64_t;
-}
-#elif defined(_MSC_VER) || __cplusplus >= 201103L
+#ifdef __cplusplus
 #include <cstdint>
 namespace cv {
 using std::int8_t;
@@ -838,19 +817,6 @@ using std::uint32_t;
 using std::int64_t;
 using std::uint64_t;
 }
-#else
-#include <stdint.h>
-namespace cv {
-typedef ::int8_t int8_t;
-typedef ::uint8_t uint8_t;
-typedef ::int16_t int16_t;
-typedef ::uint16_t uint16_t;
-typedef ::int32_t int32_t;
-typedef ::uint32_t uint32_t;
-typedef ::int64_t int64_t;
-typedef ::uint64_t uint64_t;
-}
-#endif
 #else // pure C
 #include <stdint.h>
 #endif
@@ -859,42 +825,22 @@ typedef ::uint64_t uint64_t;
 namespace cv
 {
 
-class float16_t
+class hfloat
 {
 public:
 #if CV_FP16_TYPE
 
-    float16_t() : h(0) {}
-    explicit float16_t(float x) { h = (__fp16)x; }
+    hfloat() : h(0) {}
+    explicit hfloat(float x) { h = (__fp16)x; }
     operator float() const { return (float)h; }
-    static float16_t fromBits(ushort w)
-    {
-        Cv16suf u;
-        u.u = w;
-        float16_t result;
-        result.h = u.h;
-        return result;
-    }
-    static float16_t zero()
-    {
-        float16_t result;
-        result.h = (__fp16)0;
-        return result;
-    }
-    ushort bits() const
-    {
-        Cv16suf u;
-        u.h = h;
-        return u.u;
-    }
 protected:
     __fp16 h;
 
 #else
-    float16_t() : w(0) {}
-    explicit float16_t(float x)
+    hfloat() : w(0) {}
+    explicit hfloat(float x)
     {
-    #if CV_FP16
+    #if CV_FP16 && CV_AVX2
         __m128 v = _mm_load_ss(&x);
         w = (ushort)_mm_cvtsi128_si32(_mm_cvtps_ph(v, 0));
     #else
@@ -925,7 +871,7 @@ class float16_t
 
     operator float() const
     {
-    #if CV_FP16
+    #if CV_FP16 && CV_AVX2
         float f;
         _mm_store_ss(&f, _mm_cvtph_ps(_mm_cvtsi32_si128(w)));
         return f;
@@ -943,25 +889,37 @@ class float16_t
     #endif
     }
 
-    static float16_t fromBits(ushort b)
-    {
-        float16_t result;
-        result.w = b;
-        return result;
-    }
-    static float16_t zero()
-    {
-        float16_t result;
-        result.w = (ushort)0;
-        return result;
-    }
-    ushort bits() const { return w; }
 protected:
     ushort w;
 
 #endif
 };
 
+inline hfloat hfloatFromBits(ushort w) {
+#if CV_FP16_TYPE
+    Cv16suf u;
+    u.u = w;
+    hfloat res(float(u.h));
+    return res;
+#else
+    Cv32suf out;
+
+    unsigned t = ((w & 0x7fff) << 13) + 0x38000000;
+    unsigned sign = (w & 0x8000) << 16;
+    unsigned e = w & 0x7c00;
+
+    out.u = t + (1 << 23);
+    out.u = (e >= 0x7c00 ? t + 0x38000000 :
+            e == 0 ? (static_cast<void>(out.f -= 6.103515625e-05f), out.u) : t) | sign;
+    hfloat res(out.f);
+    return res;
+#endif
+}
+
+#if !defined(__OPENCV_BUILD) && !(defined __STDCPP_FLOAT16_T__) && !(defined __ARM_NEON)
+typedef hfloat float16_t;
+#endif
+
 }
 #endif
 
diff --git a/modules/core/include/opencv2/core/cvstd.hpp b/modules/core/include/opencv2/core/cvstd.hpp
index 6ce9e4b06024..d216d267ef06 100644
--- a/modules/core/include/opencv2/core/cvstd.hpp
+++ b/modules/core/include/opencv2/core/cvstd.hpp
@@ -140,7 +140,6 @@ template<typename _Tp> class Allocator
 
 //! @} core_utils
 
-//! @endcond
 
 //! @addtogroup core_basic
 //! @{
diff --git a/modules/core/include/opencv2/core/detail/async_promise.hpp b/modules/core/include/opencv2/core/detail/async_promise.hpp
index 6eb3fb52c1a1..c039ec046a08 100644
--- a/modules/core/include/opencv2/core/detail/async_promise.hpp
+++ b/modules/core/include/opencv2/core/detail/async_promise.hpp
@@ -52,10 +52,8 @@ class CV_EXPORTS AsyncPromise
     */
     void setException(const cv::Exception& exception);
 
-#ifdef CV_CXX11
     explicit AsyncPromise(AsyncPromise&& o) { p = o.p; o.p = NULL; }
     AsyncPromise& operator=(AsyncPromise&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; }
-#endif
 
 
     // PImpl
diff --git a/modules/core/include/opencv2/core/detail/exception_ptr.hpp b/modules/core/include/opencv2/core/detail/exception_ptr.hpp
index d98ffc40c63c..a1a591e45582 100644
--- a/modules/core/include/opencv2/core/detail/exception_ptr.hpp
+++ b/modules/core/include/opencv2/core/detail/exception_ptr.hpp
@@ -8,14 +8,8 @@
 #ifndef CV__EXCEPTION_PTR
 #  if defined(__ANDROID__) && defined(ATOMIC_INT_LOCK_FREE) && ATOMIC_INT_LOCK_FREE < 2
 #    define CV__EXCEPTION_PTR 0  // Not supported, details: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58938
-#  elif defined(CV_CXX11)
+#  else
 #    define CV__EXCEPTION_PTR 1
-#  elif defined(_MSC_VER)
-#    define CV__EXCEPTION_PTR (_MSC_VER >= 1600)
-#  elif defined(__clang__)
-#    define CV__EXCEPTION_PTR 0  // C++11 only (see above)
-#  elif defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__)
-#    define CV__EXCEPTION_PTR (__GXX_EXPERIMENTAL_CXX0X__ > 0)
 #  endif
 #endif
 #ifndef CV__EXCEPTION_PTR
diff --git a/modules/core/include/opencv2/core/dualquaternion.inl.hpp b/modules/core/include/opencv2/core/dualquaternion.inl.hpp
index 6abb15924b83..1a68f12d305b 100644
--- a/modules/core/include/opencv2/core/dualquaternion.inl.hpp
+++ b/modules/core/include/opencv2/core/dualquaternion.inl.hpp
@@ -36,15 +36,15 @@
 namespace cv {
 
 template <typename T>
-DualQuat<T>::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){};
+DualQuat<T>::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){}
 
 template <typename T>
 DualQuat<T>::DualQuat(const T vw, const T vx, const T vy, const T vz, const T _w, const T _x, const T _y, const T _z):
-                      w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){};
+                      w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){}
 
 template <typename T>
 DualQuat<T>::DualQuat(const Vec<T, 8> &q):w(q[0]), x(q[1]), y(q[2]), z(q[3]),
-                                          w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){};
+                                          w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){}
 
 template <typename T>
 DualQuat<T> DualQuat<T>::createFromQuat(const Quat<T> &realPart, const Quat<T> &dualPart)
diff --git a/modules/core/include/opencv2/core/eigen.hpp b/modules/core/include/opencv2/core/eigen.hpp
index f176409cc3b5..231c6805c0ca 100644
--- a/modules/core/include/opencv2/core/eigen.hpp
+++ b/modules/core/include/opencv2/core/eigen.hpp
@@ -61,8 +61,7 @@
 #endif
 
 #if !defined(OPENCV_DISABLE_EIGEN_TENSOR_SUPPORT)
-#if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 \
-    && defined(CV_CXX11) && defined(CV_CXX_STD_ARRAY)
+#if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3
 #include <unsupported/Eigen/CXX11/Tensor>
 #define OPENCV_EIGEN_TENSOR_SUPPORT 1
 #endif  // EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3
diff --git a/modules/core/include/opencv2/core/fast_math.hpp b/modules/core/include/opencv2/core/fast_math.hpp
index 47a2948222eb..a28c3fbedfe6 100644
--- a/modules/core/include/opencv2/core/fast_math.hpp
+++ b/modules/core/include/opencv2/core/fast_math.hpp
@@ -68,7 +68,7 @@
   // nothing, intrinsics/asm code is not supported
 #else
   #if ((defined _MSC_VER && defined _M_X64) \
-      || (defined __GNUC__ && defined __x86_64__ && defined __SSE2__)) \
+      || (defined __GNUC__ && defined __SSE2__)) \
       && !defined(OPENCV_SKIP_INCLUDE_EMMINTRIN_H)
     #include <emmintrin.h>
   #endif
@@ -84,7 +84,7 @@
   #if defined(CV_INLINE_ROUND_FLT)
     // user-specified version
     // CV_INLINE_ROUND_DBL should be defined too
-  #elif defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
+  #elif defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON) && !defined __SOFTFP__
     // 1. general scheme
     #define ARM_ROUND(_value, _asm_string) \
         int res; \
@@ -201,7 +201,7 @@ cvRound( double value )
 {
 #if defined CV_INLINE_ROUND_DBL
     CV_INLINE_ROUND_DBL(value);
-#elif (defined _MSC_VER && defined _M_X64) && !defined(__CUDACC__)
+#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __SSE2__)) && !defined(__CUDACC__)
     __m128d t = _mm_set_sd( value );
     return _mm_cvtsd_si32(t);
 #elif defined _MSC_VER && defined _M_IX86
@@ -323,7 +323,7 @@ CV_INLINE int cvRound(float value)
 {
 #if defined CV_INLINE_ROUND_FLT
     CV_INLINE_ROUND_FLT(value);
-#elif (defined _MSC_VER && defined _M_X64) && !defined(__CUDACC__)
+#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __SSE2__)) && !defined(__CUDACC__)
     __m128 t = _mm_set_ss( value );
     return _mm_cvtss_si32(t);
 #elif defined _MSC_VER && defined _M_IX86
@@ -354,7 +354,7 @@ CV_INLINE int cvFloor( float value )
 #if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
     defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
     return (int)__builtin_floorf(value);
-#elif defined __loongarch
+#elif defined __loongarch__
     int i;
     float tmp;
     __asm__ ("ftintrm.w.s     %[tmp],    %[in]       \n\t"
@@ -381,7 +381,7 @@ CV_INLINE int cvCeil( float value )
 #if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \
     defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
     return (int)__builtin_ceilf(value);
-#elif defined __loongarch
+#elif defined __loongarch__
     int i;
     float tmp;
     __asm__ ("ftintrp.w.s     %[tmp],    %[in]       \n\t"
diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp
index 0d68078d98d6..deca4e9539ae 100644
--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@@ -91,10 +91,14 @@ CV_EXPORTS void exp64f(const double* src, double* dst, int n);
 CV_EXPORTS void log32f(const float* src, float* dst, int n);
 CV_EXPORTS void log64f(const double* src, double* dst, int n);
 
+CV_EXPORTS void cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int n, bool angleInDegrees);
+CV_EXPORTS void cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int n, bool angleInDegrees);
 CV_EXPORTS void fastAtan32f(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
 CV_EXPORTS void fastAtan64f(const double* y, const double* x, double* dst, int n, bool angleInDegrees);
 CV_EXPORTS void magnitude32f(const float* x, const float* y, float* dst, int n);
 CV_EXPORTS void magnitude64f(const double* x, const double* y, double* dst, int n);
+CV_EXPORTS void polarToCart32f(const float* mag, const float* angle, float* x, float* y, int n, bool angleInDegrees);
+CV_EXPORTS void polarToCart64f(const double* mag, const double* angle, double* x, double* y, int n, bool angleInDegrees);
 CV_EXPORTS void sqrt32f(const float* src, float* dst, int len);
 CV_EXPORTS void sqrt64f(const double* src, double* dst, int len);
 CV_EXPORTS void invSqrt32f(const float* src, float* dst, int len);
@@ -195,8 +199,8 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2,
 CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
 CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
 
-CV_EXPORTS void cvt16f32f( const float16_t* src, float* dst, int len );
-CV_EXPORTS void cvt32f16f( const float* src, float16_t* dst, int len );
+CV_EXPORTS void cvt16f32f( const hfloat* src, float* dst, int len );
+CV_EXPORTS void cvt32f16f( const float* src, hfloat* dst, int len );
 
 CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
 CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index ee8310b5c5f2..27beccd9ab9f 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -206,7 +206,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 #   undef CV_RVV
 #endif
 
-#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071) && !defined(CV_FORCE_SIMD128_CPP)
+#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_LSX) && !defined(CV_FORCE_SIMD128_CPP)
 #define CV__SIMD_FORWARD 128
 #include "opencv2/core/hal/intrin_forward.hpp"
 #endif
@@ -242,11 +242,9 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 #include "opencv2/core/hal/intrin_rvv.hpp"
 #endif
 
-#elif CV_LASX
-    #if !defined(CV_FORCE_SIMD128_CPP)
-    #define CV_FORCE_SIMD128_CPP 1
-    #endif
-#include "opencv2/core/hal/intrin_cpp.hpp"
+#elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP)
+
+#include "opencv2/core/hal/intrin_lsx.hpp"
 
 #else
 
@@ -710,7 +708,7 @@ namespace CV__SIMD_NAMESPACE {
     inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); }
     inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); }
     inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); }
-    inline v_float32 vx_load_expand(const float16_t * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_float32 vx_load_expand(const hfloat * ptr) { return VXPREFIX(_load_expand)(ptr); }
     //! @}
 
     //! @name Wide load with quad expansion
@@ -723,7 +721,7 @@ namespace CV__SIMD_NAMESPACE {
     /** @brief SIMD processing state cleanup call */
     inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
 
-#if !CV_SIMD_SCALABLE
+#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
     // Compatibility layer
 
     template<typename T> struct VTraits {
@@ -745,7 +743,22 @@ namespace CV__SIMD_NAMESPACE {
     inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
         return v_add(f1 + f2, vf...); \
     }
+    #define OPENCV_HAL_WRAP_SHIFT_OP(_Tpvec) \
+    inline _Tpvec v_shr(const _Tpvec& a, int n) \
+    { \
+        return a >> n; \
+    } \
+    inline _Tpvec v_shl(const _Tpvec& a, int n) \
+    { \
+        return a << n; \
+    }
 
+    OPENCV_HAL_WRAP_SHIFT_OP(v_uint16)
+    OPENCV_HAL_WRAP_SHIFT_OP(v_uint32)
+    OPENCV_HAL_WRAP_SHIFT_OP(v_uint64)
+    OPENCV_HAL_WRAP_SHIFT_OP(v_int16)
+    OPENCV_HAL_WRAP_SHIFT_OP(v_int32)
+    OPENCV_HAL_WRAP_SHIFT_OP(v_int64)
     OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
     OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
     OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
@@ -769,6 +782,12 @@ namespace CV__SIMD_NAMESPACE {
         OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
         OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
         OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x8)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x4)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x2)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_int16x8)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_int32x4)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_int64x2)
         #if CV_SIMD_64F
         OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
         #endif
@@ -784,6 +803,12 @@ namespace CV__SIMD_NAMESPACE {
         OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
         OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
         OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_uint16x16)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_uint32x8)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_uint64x4)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_int16x16)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_int32x8)
+        OPENCV_HAL_WRAP_SHIFT_OP(v_int64x4)
         #if CV_SIMD_64F
         OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
         #endif
@@ -801,7 +826,9 @@ namespace CV__SIMD_NAMESPACE {
     inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \
     { \
         return a ^ b; \
-    } \
+    }
+
+    #define OPENCV_HAL_WRAP_NOT_OP(_Tpvec) \
     inline _Tpvec v_not(const _Tpvec& a) \
     { \
         return ~a; \
@@ -815,6 +842,18 @@ namespace CV__SIMD_NAMESPACE {
     OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
     OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
     OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32)
+    OPENCV_HAL_WRAP_NOT_OP(v_uint8)
+    OPENCV_HAL_WRAP_NOT_OP(v_uint16)
+    OPENCV_HAL_WRAP_NOT_OP(v_uint32)
+    OPENCV_HAL_WRAP_NOT_OP(v_uint64)
+    OPENCV_HAL_WRAP_NOT_OP(v_int8)
+    OPENCV_HAL_WRAP_NOT_OP(v_int16)
+    OPENCV_HAL_WRAP_NOT_OP(v_int32)
+    OPENCV_HAL_WRAP_NOT_OP(v_int64)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64)
+    #endif
     #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
         OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
         OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
@@ -824,6 +863,18 @@ namespace CV__SIMD_NAMESPACE {
         OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
         OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
         OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x4)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint8x16)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint16x8)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint32x4)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint64x2)
+        OPENCV_HAL_WRAP_NOT_OP(v_int8x16)
+        OPENCV_HAL_WRAP_NOT_OP(v_int16x8)
+        OPENCV_HAL_WRAP_NOT_OP(v_int32x4)
+        OPENCV_HAL_WRAP_NOT_OP(v_int64x2)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x2)
+        #endif
     #endif
     #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
         OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
@@ -834,6 +885,18 @@ namespace CV__SIMD_NAMESPACE {
         OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
         OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
         OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float32x8)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint8x32)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint16x16)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint32x8)
+        OPENCV_HAL_WRAP_NOT_OP(v_uint64x4)
+        OPENCV_HAL_WRAP_NOT_OP(v_int8x32)
+        OPENCV_HAL_WRAP_NOT_OP(v_int16x16)
+        OPENCV_HAL_WRAP_NOT_OP(v_int32x8)
+        OPENCV_HAL_WRAP_NOT_OP(v_int64x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_float64x4)
+        #endif
     #endif
 
     #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
@@ -907,6 +970,15 @@ namespace CV__SIMD_NAMESPACE {
     { \
         return a op b; \
     }
+    #define OPENCV_HAL_WRAP_EQ_OP(_Tpvec) \
+    inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a == b; \
+    } \
+    inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return a != b; \
+    }
 
     #define OPENCV_HAL_WRAP_CMP(_Tpvec) \
     OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \
@@ -919,11 +991,11 @@ namespace CV__SIMD_NAMESPACE {
     OPENCV_HAL_WRAP_CMP(v_uint8)
     OPENCV_HAL_WRAP_CMP(v_uint16)
     OPENCV_HAL_WRAP_CMP(v_uint32)
-    // OPENCV_HAL_WRAP_CMP(v_uint64)
+    OPENCV_HAL_WRAP_EQ_OP(v_uint64)
     OPENCV_HAL_WRAP_CMP(v_int8)
     OPENCV_HAL_WRAP_CMP(v_int16)
     OPENCV_HAL_WRAP_CMP(v_int32)
-    // OPENCV_HAL_WRAP_CMP(v_int64)
+    OPENCV_HAL_WRAP_EQ_OP(v_int64)
     OPENCV_HAL_WRAP_CMP(v_float32)
     #if CV_SIMD_64F
     OPENCV_HAL_WRAP_CMP(v_float64)
@@ -932,9 +1004,11 @@ namespace CV__SIMD_NAMESPACE {
         OPENCV_HAL_WRAP_CMP(v_uint8x16)
         OPENCV_HAL_WRAP_CMP(v_uint16x8)
         OPENCV_HAL_WRAP_CMP(v_uint32x4)
+        OPENCV_HAL_WRAP_EQ_OP(v_uint64x2)
         OPENCV_HAL_WRAP_CMP(v_int8x16)
         OPENCV_HAL_WRAP_CMP(v_int16x8)
         OPENCV_HAL_WRAP_CMP(v_int32x4)
+        OPENCV_HAL_WRAP_EQ_OP(v_int64x2)
         OPENCV_HAL_WRAP_CMP(v_float32x4)
         #if CV_SIMD_64F
         OPENCV_HAL_WRAP_CMP(v_float64x2)
@@ -944,9 +1018,11 @@ namespace CV__SIMD_NAMESPACE {
         OPENCV_HAL_WRAP_CMP(v_uint8x32)
         OPENCV_HAL_WRAP_CMP(v_uint16x16)
         OPENCV_HAL_WRAP_CMP(v_uint32x8)
+        OPENCV_HAL_WRAP_EQ_OP(v_uint64x4)
         OPENCV_HAL_WRAP_CMP(v_int8x32)
         OPENCV_HAL_WRAP_CMP(v_int16x16)
         OPENCV_HAL_WRAP_CMP(v_int32x8)
+        OPENCV_HAL_WRAP_EQ_OP(v_int64x4)
         OPENCV_HAL_WRAP_CMP(v_float32x8)
         #if CV_SIMD_64F
         OPENCV_HAL_WRAP_CMP(v_float64x4)
@@ -1070,6 +1146,74 @@ namespace CV__SIMD_NAMESPACE {
 
 #endif //!CV_SIMD_SCALABLE
 
+#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP)
+// Compatibility layer for the backend that cleaned up.
+    #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
+    template<typename... Args> \
+    inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
+        return v_add(v_add(f1, f2), vf...); \
+    }
+
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
+    #endif
+
+    #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
+    template<typename... Args> \
+    inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
+        return v_mul(v_mul(f1, f2), vf...); \
+    }
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
+    #endif
+
+    #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
+    inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
+    { \
+        return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
+    }
+
+    OPENCV_HAL_WRAP_EXTRACT(v_uint8)
+    OPENCV_HAL_WRAP_EXTRACT(v_int8)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint16)
+    OPENCV_HAL_WRAP_EXTRACT(v_int16)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint32)
+    OPENCV_HAL_WRAP_EXTRACT(v_int32)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint64)
+    OPENCV_HAL_WRAP_EXTRACT(v_int64)
+    OPENCV_HAL_WRAP_EXTRACT(v_float32)
+    #if CV_SIMD_64F
+    OPENCV_HAL_WRAP_EXTRACT(v_float64)
+    #endif
+
+    #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
+    inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
+    { \
+        return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v); \
+    }
+
+    OPENCV_HAL_WRAP_BROADCAST(v_uint32)
+    OPENCV_HAL_WRAP_BROADCAST(v_int32)
+    OPENCV_HAL_WRAP_BROADCAST(v_float32)
+
+#endif //CV_NEON
+
 //! @cond IGNORED
 
     // backward compatibility
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index 979b6163d8f8..eed609f80ef6 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -3137,7 +3137,7 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, u
 // FP16
 //
 
-inline v_float32x8 v256_load_expand(const float16_t* ptr)
+inline v_float32x8 v256_load_expand(const hfloat* ptr)
 {
 #if CV_FP16
     return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
@@ -3149,7 +3149,7 @@ inline v_float32x8 v256_load_expand(const float16_t* ptr)
 #endif
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
+inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
 {
 #if CV_FP16
     __m128i ah = _mm256_cvtps_ph(a.val, 0);
@@ -3158,7 +3158,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
     float CV_DECL_ALIGNED(32) buf[8];
     v_store_aligned(buf, a);
     for (int i = 0; i < 8; i++)
-        ptr[i] = float16_t(buf[i]);
+        ptr[i] = hfloat(buf[i]);
 #endif
 }
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
index d20d6dd1ffd3..e59b8d92ebe9 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp
@@ -506,12 +506,12 @@ inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a)
 { return v_float64x8(_mm512_castps_pd(a.val)); }
 
 // FP16
-inline v_float32x16 v512_load_expand(const float16_t* ptr)
+inline v_float32x16 v512_load_expand(const hfloat* ptr)
 {
     return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr)));
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x16& a)
+inline void v_pack_store(hfloat* ptr, const v_float32x16& a)
 {
     __m256i ah = _mm512_cvtps_ph(a.val, 0);
     _mm256_storeu_si256((__m256i*)ptr, ah);
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index e9a09d12ae2e..8619fec60c53 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -3251,7 +3251,7 @@ template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int,
 ////// FP16 support ///////
 
 inline v_reg<float, simd128_width / sizeof(float)>
-v_load_expand(const float16_t* ptr)
+v_load_expand(const hfloat* ptr)
 {
     v_reg<float, simd128_width / sizeof(float)> v;
     for( int i = 0; i < v.nlanes; i++ )
@@ -3262,7 +3262,7 @@ v_load_expand(const float16_t* ptr)
 }
 #if CV_SIMD256
 inline v_reg<float, simd256_width / sizeof(float)>
-v256_load_expand(const float16_t* ptr)
+v256_load_expand(const hfloat* ptr)
 {
     v_reg<float, simd256_width / sizeof(float)> v;
     for (int i = 0; i < v.nlanes; i++)
@@ -3274,7 +3274,7 @@ v256_load_expand(const float16_t* ptr)
 #endif
 #if CV_SIMD512
 inline v_reg<float, simd512_width / sizeof(float)>
-v512_load_expand(const float16_t* ptr)
+v512_load_expand(const hfloat* ptr)
 {
     v_reg<float, simd512_width / sizeof(float)> v;
     for (int i = 0; i < v.nlanes; i++)
@@ -3286,11 +3286,11 @@ v512_load_expand(const float16_t* ptr)
 #endif
 
 template<int n> inline void
-v_pack_store(float16_t* ptr, const v_reg<float, n>& v)
+v_pack_store(hfloat* ptr, const v_reg<float, n>& v)
 {
     for( int i = 0; i < v.nlanes; i++ )
     {
-        ptr[i] = float16_t(v.s[i]);
+        ptr[i] = hfloat(v.s[i]);
     }
 }
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_forward.hpp b/modules/core/include/opencv2/core/hal/intrin_forward.hpp
index 979f15a277d1..28f67cc9ef97 100644
--- a/modules/core/include/opencv2/core/hal/intrin_forward.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_forward.hpp
@@ -188,4 +188,4 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 //! @endcond
 
-} // cv::
\ No newline at end of file
+} // cv::
diff --git a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
index 37f2e3f81dd3..4a98dbf96ebe 100644
--- a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp
@@ -96,54 +96,22 @@ inline __m256d _v256_setall_pd(double f64)
 
 inline __m256i _lasx_packus_h(const __m256i& a, const __m256i& b)
 {
-    __m256i u8min = __lasx_xvreplgr2vr_h(0);
-    __m256i u8max = __lasx_xvreplgr2vr_h(255);
-    __m256i sat_a = __lasx_xvmax_h(a, u8min);
-            sat_a = __lasx_xvmin_h(sat_a, u8max);
-    __m256i sat_b = __lasx_xvmax_h(b, u8min);
-            sat_b = __lasx_xvmin_h(sat_b, u8max);
-    __m256i byteIndex = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
-                                     0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
-    return __lasx_xvshuf_b(sat_b, sat_a, byteIndex);
+    return __lasx_xvssrarni_bu_h(b, a, 0);
 }
 
 inline __m256i _lasx_packs_h(const __m256i& a, const __m256i& b)
 {
-    __m256i s8min = __lasx_xvreplgr2vr_h(-128);
-    __m256i s8max = __lasx_xvreplgr2vr_h(127);
-    __m256i sat_a = __lasx_xvmax_h(a, s8min);
-            sat_a = __lasx_xvmin_h(sat_a, s8max);
-    __m256i sat_b = __lasx_xvmax_h(b, s8min);
-            sat_b = __lasx_xvmin_h(sat_b, s8max);
-    __m256i byteIndex = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
-                                     0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
-    return __lasx_xvshuf_b(sat_b, sat_a, byteIndex);
+    return __lasx_xvssrarni_b_h(b, a, 0);
 }
 
 inline __m256i _lasx_packus_w(const __m256i& a, const __m256i& b)
 {
-    __m256i u16min = __lasx_xvreplgr2vr_w(0);
-    __m256i u16max = __lasx_xvreplgr2vr_w(0xffff);
-    __m256i sat_a = __lasx_xvmax_w(a, u16min);
-            sat_a = __lasx_xvmin_w(sat_a, u16max);
-    __m256i sat_b = __lasx_xvmax_w(b, u16min);
-            sat_b = __lasx_xvmin_w(sat_b, u16max);
-    __m256i hwordIndex = _v256_setr_h(0, 2, 4, 6, 8, 10, 12, 14,
-                                      0, 2, 4, 6, 8, 10, 12, 14);
-    return __lasx_xvshuf_h(hwordIndex, sat_b, sat_a);
+    return __lasx_xvssrarni_hu_w(b, a, 0);
 }
 
 inline __m256i _lasx_packs_w(const __m256i& a, const __m256i& b)
 {
-    __m256i s16min = __lasx_xvreplgr2vr_w(-0x8000);
-    __m256i s16max = __lasx_xvreplgr2vr_w(0x7fff);
-    __m256i sat_a = __lasx_xvmax_w(a, s16min);
-            sat_a = __lasx_xvmin_w(sat_a, s16max);
-    __m256i sat_b = __lasx_xvmax_w(b, s16min);
-            sat_b = __lasx_xvmin_w(sat_b, s16max);
-    __m256i hwordIndex = _v256_setr_h(0, 2, 4, 6, 8, 10, 12, 14,
-                                      0, 2, 4, 6, 8, 10, 12, 14);
-    return __lasx_xvshuf_h(hwordIndex, sat_b, sat_a);
+    return __lasx_xvssrarni_h_w(b, a, 0);
 }
 
 inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
@@ -191,7 +159,7 @@ inline _Tpvec v256_permute4x64(const _Tpvec& a)
 { return _Tpvec(_v256_permute4x64<imm>(a.val)); }
 
 inline __m128i _v256_extract_high(const __m256i& v)
-{ __m256i temp256i = __lasx_xvpermi_q(v, v, 0x31);
+{ __m256i temp256i = __lasx_xvpermi_d(v, 0x4E);
   return *((__m128i*)&temp256i); }
 
 inline __m128  _v256_extract_high(const __m256& v)
@@ -211,10 +179,7 @@ inline __m128d _v256_extract_low(const __m256d& v)
 
 inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
 {
-    const __m256i maxv = __lasx_xvreplgr2vr_w(65535);
-    __m256i am = __lasx_xvmin_wu(a, maxv);
-    __m256i bm = __lasx_xvmin_wu(b, maxv);
-    return _lasx_packus_w(am, bm);
+    return __lasx_xvssrlrni_hu_w(b, a, 0);
 }
 
 template<int i>
@@ -869,14 +834,11 @@ OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_int16x16,  __lasx_xvmul_h)
 
 inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
 {
-    __m256i ad = __lasx_xvsrai_h(a.val, 8);
-    __m256i bd = __lasx_xvsrai_h(b.val, 8);
-    __m256i p0 = __lasx_xvmul_h(a.val, b.val);
-    __m256i p1 = __lasx_xvslli_h(__lasx_xvmul_h(ad, bd), 8);
-
-    const __m256i b01 = __lasx_xvreplgr2vr_w(0xFF00FF00);
-    return v_uint8x32(__lasx_xvbitsel_v(p0, p1, b01));
+    __m256i p0 = __lasx_xvmulwev_h_bu(a.val, b.val);
+    __m256i p1 = __lasx_xvmulwod_h_bu(a.val, b.val);
+    return v_uint8x32(__lasx_xvpackev_b(p1, p0));
 }
+
 inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
 {
     return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
@@ -963,14 +925,7 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return
 
 OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint16x16, v_int16x16, h, __lasx_xvsra_h)
 OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint32x8,  v_int32x8,  w, __lasx_xvsra_w)
-
-inline __m256i _v256_srai_dx(const __m256i a, const __m256i shift)
-{
-    __m256i d = __lasx_xvreplgr2vr_d((int64)1 << 63);
-    __m256i r = __lasx_xvsrl_d(__lasx_xvadd_d(a, d), shift);
-    return __lasx_xvsub_d(r, __lasx_xvsrl_d(d, shift));
-}
-OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4,  v_int64x4,  d, _v256_srai_dx)
+OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4,  v_int64x4,  d, __lasx_xvsra_d)
 
 
 /** Bitwise logic **/
@@ -979,7 +934,7 @@ OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4,  v_int64x4,  d, _v256_srai_dx)
     OPENCV_HAL_IMPL_LASX_BIN_OP(|, _Tpvec, __lasx_xvor_##suffix)    \
     OPENCV_HAL_IMPL_LASX_BIN_OP(^, _Tpvec, __lasx_xvxor_##suffix)   \
     inline _Tpvec operator ~ (const _Tpvec& a)                      \
-    { return _Tpvec(__lasx_xvxor_##suffix(a.val, not_const)); }
+    { return _Tpvec(__lasx_xvnori_b(a.val, 0)); }
 
 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32,   v, __lasx_xvreplgr2vr_w(-1))
 OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int8x32,    v, __lasx_xvreplgr2vr_w(-1))
@@ -1224,11 +1179,9 @@ inline v_int8x32 v_reverse(const v_int8x32 &a)
 
 inline v_uint16x16 v_reverse(const v_uint16x16 &a)
 {
-    static const __m256i perm = _v256_setr_b(
-            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
-            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
-    __m256i vec = __lasx_xvshuf_b(a.val, a.val, perm);
-    return v_uint16x16(__lasx_xvpermi_q(vec, vec, 1));
+    __m256i vec = __lasx_xvshuf4i_h(a.val, 0x1B);
+    vec = __lasx_xvshuf4i_w(vec, 0x4E);
+    return v_uint16x16(__lasx_xvpermi_d(vec, 0x4E));
 }
 
 inline v_int16x16 v_reverse(const v_int16x16 &a)
@@ -1236,8 +1189,8 @@ inline v_int16x16 v_reverse(const v_int16x16 &a)
 
 inline v_uint32x8 v_reverse(const v_uint32x8 &a)
 {
-    static const __m256i perm = _v256_setr_w(7, 6, 5, 4, 3, 2, 1, 0);
-    return v_uint32x8(__lasx_xvperm_w(a.val, perm));
+    __m256i vec = __lasx_xvshuf4i_w(a.val, 0x1B);
+    return v_uint32x8(__lasx_xvpermi_d(vec, 0x4E));
 }
 
 inline v_int32x8 v_reverse(const v_int32x8 &a)
@@ -1266,17 +1219,19 @@ inline unsigned v_reduce_sum(const v_uint8x32& a)
     __m256i t1 = __lasx_xvhaddw_hu_bu(a.val, a.val);
     __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
     __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
-    return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]);
+    __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
+    return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
 }
+
 inline int v_reduce_sum(const v_int8x32& a)
 {
     __m256i t1 = __lasx_xvhaddw_h_b(a.val, a.val);
     __m256i t2 = __lasx_xvhaddw_w_h(t1, t1);
     __m256i t3 = __lasx_xvhaddw_d_w(t2, t2);
-    return (int)(((v4i64)t3)[0]+((v4i64)t3)[1]+((v4i64)t3)[2]+((v4i64)t3)[3]);
+    __m256i t4 = __lasx_xvhaddw_q_d(t3, t3);
+    return (int)(((v8i32)t4)[0]+((v8i32)t4)[4]);
 }
 
-
 #define OPENCV_HAL_IMPL_LASX_REDUCE_32(_Tpvec, sctype, func, intrin) \
     inline sctype v_reduce_##func(const _Tpvec& a) \
     { \
@@ -1344,7 +1299,8 @@ OPENCV_HAL_IMPL_LASX_REDUCE_FLT(max, __lsx_vfmax_s)
 inline int v_reduce_sum(const v_int32x8& a)
 {
     __m256i t1 = __lasx_xvhaddw_d_w(a.val, a.val);
-    return (int)(((v4i64)t1)[0]+((v4i64)t1)[1]+((v4i64)t1)[2]+((v4i64)t1)[3]);
+    __m256i t2 = __lasx_xvhaddw_q_d(t1, t1);
+    return (int)(((v8i32)t2)[0]+((v8i32)t2)[4]);
 }
 
 inline unsigned v_reduce_sum(const v_uint32x8& a)
@@ -1367,13 +1323,13 @@ inline float v_reduce_sum(const v_float32x8& a)
 
 inline uint64 v_reduce_sum(const v_uint64x4& a)
 {
-    uint64 *pa = (uint64*)&a;
-    return pa[0] + pa[1] + pa[2] + pa[3];
+    __m256i t0 = __lasx_xvhaddw_qu_du(a.val, a.val);
+    return (uint64)(((v4u64)t0)[0] + ((v4u64)t0)[2]);
 }
 inline int64 v_reduce_sum(const v_int64x4& a)
 {
-    int64 *pa = (int64*)&a;
-    return pa[0] + pa[1] + pa[2] + pa[3];
+    __m256i t0 = __lasx_xvhaddw_q_d(a.val, a.val);
+    return (int64)(((v4i64)t0)[0] + ((v4i64)t0)[2]);
 }
 inline double v_reduce_sum(const v_float64x4& a)
 {
@@ -1406,7 +1362,8 @@ inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
     __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
     __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
     __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
-    return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]);
+    __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
+    return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
 }
 inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
 {
@@ -1414,7 +1371,8 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
     __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0);
     __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1);
     __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2);
-    return (unsigned)(((v4u64)t3)[0]+((v4u64)t3)[1]+((v4u64)t3)[2]+((v4u64)t3)[3]);
+    __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3);
+    return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]);
 }
 inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
 {
@@ -1445,36 +1403,13 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
 
 /** Popcount **/
 inline v_uint8x32 v_popcount(const v_uint8x32& a)
-{
-    __m256i _popcnt_table = _v256_setr_b(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-                                         0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
-    __m256i _popcnt_mask = __lasx_xvreplgr2vr_b(0x0F);
-    return v_uint8x32(__lasx_xvadd_b(__lasx_xvshuf_b(_popcnt_table, _popcnt_table, __lasx_xvand_v(a.val, _popcnt_mask)),
-                                     __lasx_xvshuf_b(_popcnt_table, _popcnt_table, __lasx_xvand_v(__lasx_xvsrli_h(a.val, 4), _popcnt_mask))));
-}
+{ return v_uint8x32(__lasx_xvpcnt_b(a.val)); }
 inline v_uint16x16 v_popcount(const v_uint16x16& a)
-{
-    v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
-    p += v_rotate_right<1>(p);
-    return v_reinterpret_as_u16(p) & v_uint16x16(__lasx_xvreplgr2vr_h(0x00ff));
-}
+{ return v_uint16x16(__lasx_xvpcnt_h(a.val)); }
 inline v_uint32x8 v_popcount(const v_uint32x8& a)
-{
-    v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
-    p += v_rotate_right<1>(p);
-    p += v_rotate_right<2>(p);
-    return v_reinterpret_as_u32(p) & v_uint32x8(__lasx_xvreplgr2vr_w(0x000000ff));
-}
+{ return v_uint32x8(__lasx_xvpcnt_w(a.val)); }
 inline v_uint64x4 v_popcount(const v_uint64x4& a)
-{
-    v_uint8x32 atemp = v_popcount(v_reinterpret_as_u8(a));
-    uint8_t *pa = (uint8_t*)&atemp;
-    uint64 v[4];
-    for (int i = 0; i < 4; ++i) {
-        v[i] = pa[i*8] + pa[i*8+1] + pa[i*8+2] + pa[i*8+3] + pa[i*8+4] + pa[i*8+5] + pa[i*8+6] + pa[i*8+7];
-    }
-    return v_uint64x4(v[0], v[1], v[2], v[3]);
-}
+{ return v_uint64x4(__lasx_xvpcnt_d(a.val)); }
 inline v_uint8x32 v_popcount(const v_int8x32& a)
 { return v_popcount(v_reinterpret_as_u8(a)); }
 inline v_uint16x16 v_popcount(const v_int16x16& a)
@@ -1484,26 +1419,11 @@ inline v_uint32x8 v_popcount(const v_int32x8& a)
 inline v_uint64x4 v_popcount(const v_int64x4& a)
 { return v_popcount(v_reinterpret_as_u64(a)); }
 
-/** Mask **/
-#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
-inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
-OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
-OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
-OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
-OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
-OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
-OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
-OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
-OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
-OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
-OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
-
 inline int v_signmask(const v_int8x32& a)
 {
-    int mask = 0;
-    int8_t *pa = (int8_t*)&a;
-    for( int i = 0; i < 32; i++ )
-        mask |= (reinterpret_int(pa[i]) < 0) << i;
+    __m256i result = __lasx_xvmskltz_b(a.val);
+    int mask = __lasx_xvpickve2gr_w(result, 0);
+    mask |= (__lasx_xvpickve2gr_w(result, 4) << 16);
     return mask;
 }
 inline int v_signmask(const v_uint8x32& a)
@@ -1516,10 +1436,9 @@ inline int v_signmask(const v_uint16x16& a)
 
 inline int v_signmask(const v_int32x8& a)
 {
-    int mask = 0;
-    int *pa = (int*)&a;
-    for( int i = 0; i < 8; i++ )
-        mask |= (pa[i] < 0) << i;
+    __m256i result = __lasx_xvmskltz_w(a.val);
+    int mask = __lasx_xvpickve2gr_w(result, 0);
+    mask |= (__lasx_xvpickve2gr_w(result, 4) << 4);
     return mask;
 }
 inline int v_signmask(const v_uint32x8& a)
@@ -1527,10 +1446,9 @@ inline int v_signmask(const v_uint32x8& a)
 
 inline int v_signmask(const v_int64x4& a)
 {
-    int mask = 0;
-    int64 *pa = (int64*)&a;
-    for( int i = 0; i < 4; i++ )
-        mask |= (pa[i] < 0) << i;
+    __m256i result = __lasx_xvmskltz_d(a.val);
+    int mask = __lasx_xvpickve2gr_d(result, 0);
+    mask |= (__lasx_xvpickve2gr_w(result, 4) << 2);
     return mask;
 }
 inline int v_signmask(const v_uint64x4& a)
@@ -1592,7 +1510,7 @@ OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d)
 
 inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
 {
-    return a * b + c;
+    return v_int32x8(__lasx_xvmadd_w(c.val, a.val, b.val));
 }
 
 inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
@@ -1601,17 +1519,10 @@ inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x
 }
 
 inline v_float32x8 v_invsqrt(const v_float32x8& x)
-{
-    v_float32x8 half = x * v_float32x8(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5);
-    v_float32x8 t  = v_float32x8(__lasx_xvfrsqrt_s(x.val));
-    t *= v_float32x8(1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5) - ((t * t) * half);
-    return t;
-}
+{ return v_float32x8(__lasx_xvfrsqrt_s(x.val)); }
 
 inline v_float64x4 v_invsqrt(const v_float64x4& x)
-{
-    return v_float64x4(1., 1., 1., 1.) / v_sqrt(x);
-}
+{ return v_float64x4(__lasx_xvfrsqrt_d(x.val)); }
 
 /** Absolute values **/
 #define OPENCV_HAL_IMPL_LASX_ABS(_Tpvec, suffix)         \
@@ -1629,28 +1540,18 @@ inline v_float64x4 v_abs(const v_float64x4& x)
 
 /** Absolute difference **/
 inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
-{ return v_add_wrap(a - b,  b - a); }
+{ return (v_uint8x32)__lasx_xvabsd_bu(a.val, b.val); }
 inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
-{ return v_add_wrap(a - b,  b - a); }
+{ return (v_uint16x16)__lasx_xvabsd_hu(a.val, b.val); }
 inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
-{ return v_max(a, b) - v_min(a, b); }
+{ return (v_uint32x8)__lasx_xvabsd_wu(a.val, b.val); }
 
 inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
-{
-    v_int8x32 d = v_sub_wrap(a, b);
-    v_int8x32 m = a < b;
-    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
-}
-
+{ return (v_uint8x32)__lasx_xvabsd_b(a.val, b.val); }
 inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
-{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
-
+{ return (v_uint16x16)__lasx_xvabsd_h(a.val, b.val); }
 inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
-{
-    v_int32x8 d = a - b;
-    v_int32x8 m = a < b;
-    return v_reinterpret_as_u32((d ^ m) - m);
-}
+{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); }
 
 inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
 { return v_abs(a - b); }
@@ -1740,28 +1641,8 @@ inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
     return v_float64x4(__lasx_xvfcvtl_d_s((__m256)ahigh));
 }
 
-// from (Mysticial and wim) https://stackoverflow.com/q/41144668
 inline v_float64x4 v_cvt_f64(const v_int64x4& v)
-{
-    // constants encoded as floating-point
-    __m256i magic_i_lo   = __lasx_xvreplgr2vr_d(0x4330000000000000);
-    __m256i magic_i_hi32 = __lasx_xvreplgr2vr_d(0x4530000080000000);
-    __m256i magic_i_all  = __lasx_xvreplgr2vr_d(0x4530000080100000);
-    __m256d magic_d_all  = _lasx_256_castsi256_pd(magic_i_all);
-
-    // Blend the 32 lowest significant bits of v with magic_int_lo
-    __m256i mask = _v256_set_w(0, -1, 0, -1, 0, -1, 0, -1);
-    __m256i v_lo         = __lasx_xvbitsel_v(magic_i_lo, v.val, mask);
-    // Extract the 32 most significant bits of v
-    __m256i v_hi         = __lasx_xvsrli_d(v.val, 32);
-    // Flip the msb of v_hi and blend with 0x45300000
-              v_hi         = __lasx_xvxor_v(v_hi, magic_i_hi32);
-    // Compute in double precision
-    __m256d v_hi_dbl     = __lasx_xvfsub_d(_lasx_256_castsi256_pd(v_hi), magic_d_all);
-    // (v_hi - magic_d_all) + v_lo  Do not assume associativity of floating point addition
-    __m256d result       = __lasx_xvfadd_d(v_hi_dbl, _lasx_256_castsi256_pd(v_lo));
-    return v_float64x4(result);
-}
+{ return v_float64x4(__lasx_xvffint_d_l(v.val)); }
 
 ////////////// Lookup table access ////////////////////
 
@@ -1967,11 +1848,9 @@ inline v_float32x8 v_interleave_pairs(const v_float32x8& vec)
 inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
 {
     __m256i vzero = __lasx_xvreplgr2vr_w(0);
-    __m256i t1 = __lasx_xvshuf_b(vec.val, vec.val,
-                   _v256_set_d(0xffffff0f0e0d0c0a, 0x0908060504020100, 0xffffff0f0e0d0c0a, 0x0908060504020100));
-    __m256i t2 = __lasx_xvshuf_b(vzero, t1,
-                   _v256_set_d(0x1211100c0b0a0908, 0x0706050403020100, 0x1211100c0b0a0908, 0x0706050403020100));
-    return v_int8x32(__lasx_xvperm_w(t2,
+    __m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
+                   _v256_set_d(0x1211100f0e0d0c0a, 0x0908060504020100, 0x1211100f0e0d0c0a, 0x0908060504020100));
+    return v_int8x32(__lasx_xvperm_w(t1,
                        _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
 }
 inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec)
@@ -1980,11 +1859,9 @@ inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec)
 inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
 {
     __m256i vzero = __lasx_xvreplgr2vr_w(0);
-    __m256i t1 = __lasx_xvshuf_b(vec.val, vec.val,
-                   _v256_set_d(0xffff0f0e0d0c0b0a, 0x0908050403020100, 0xffff0f0e0d0c0b0a, 0x0908050403020100));
-    __m256i t2 = __lasx_xvshuf_b(vzero, t1,
-                   _v256_set_d(0x11100d0c0b0a0908, 0x0706050403020100, 0x11100d0c0b0a0908, 0x0706050403020100));
-    return v_int16x16(__lasx_xvperm_w(t2,
+    __m256i t1 = __lasx_xvshuf_b(vzero, vec.val,
+                   _v256_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100, 0x11100f0e0d0c0b0a, 0x0908050403020100));
+    return v_int16x16(__lasx_xvperm_w(t1,
                         _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
 }
 inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec)
@@ -2018,24 +1895,21 @@ inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int
 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
 {
     __m256i even = __lasx_xvmulwev_d_w(a.val, b.val);
-    __m256i odd = __lasx_xvmulwod_d_w(a.val, b.val);
-    return v_int64x4(__lasx_xvadd_d(even, odd));
+    return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
 }
 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
-{ return v_dotprod(a, b) + c; }
+{
+    __m256i even = __lasx_xvmaddwev_d_w(c.val, a.val, b.val);
+    return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val));
+}
 
 // 8 >> 32
 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
 {
-    __m256i even_m = __lasx_xvreplgr2vr_w(0xFF00FF00);
-    __m256i even_a = __lasx_xvbitsel_v(a.val, __lasx_xvreplgr2vr_d(0), even_m);
-    __m256i odd_a  = __lasx_xvsrli_h(a.val, 8);
-
-    __m256i even_b = __lasx_xvbitsel_v(b.val, __lasx_xvreplgr2vr_d(0), even_m);
-    __m256i odd_b  = __lasx_xvsrli_h(b.val, 8);
-
-    __m256i prod0  = __lasx_xvadd_w(__lasx_xvmulwev_w_h(even_a, even_b), __lasx_xvmulwod_w_h(even_a, even_b));
-    __m256i prod1  = __lasx_xvadd_w(__lasx_xvmulwev_w_h(odd_a, odd_b),__lasx_xvmulwod_w_h(odd_a, odd_b));
+    __m256i even  = __lasx_xvmulwev_h_bu(a.val, b.val);
+    __m256i odd   = __lasx_xvmulwod_h_bu(a.val, b.val);
+    __m256i prod0 = __lasx_xvhaddw_wu_hu(even, even);
+    __m256i prod1 = __lasx_xvhaddw_wu_hu(odd, odd);
     return v_uint32x8(__lasx_xvadd_w(prod0, prod1));
 }
 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
@@ -2043,14 +1917,10 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, con
 
 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
 {
-    __m256i even_a = __lasx_xvsrai_h(__lasx_xvbsll_v(a.val, 1), 8);
-    __m256i odd_a  = __lasx_xvsrai_h(a.val, 8);
-
-    __m256i even_b = __lasx_xvsrai_h(__lasx_xvbsll_v(b.val, 1), 8);
-    __m256i odd_b  = __lasx_xvsrai_h(b.val, 8);
-
-    __m256i prod0  = __lasx_xvadd_w(__lasx_xvmulwev_w_h(even_a, even_b), __lasx_xvmulwod_w_h(even_a, even_b));
-    __m256i prod1  = __lasx_xvadd_w(__lasx_xvmulwev_w_h(odd_a, odd_b),__lasx_xvmulwod_w_h(odd_a, odd_b));
+    __m256i even  = __lasx_xvmulwev_h_b(a.val, b.val);
+    __m256i odd   = __lasx_xvmulwod_h_b(a.val, b.val);
+    __m256i prod0 = __lasx_xvhaddw_w_h(even, even);
+    __m256i prod1 = __lasx_xvhaddw_w_h(odd, odd);
     return v_int32x8(__lasx_xvadd_w(prod0, prod1));
 }
 inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
@@ -2059,36 +1929,24 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const
 // 16 >> 64
 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
 {
-    __m256i mullo = __lasx_xvmul_h(a.val, b.val);
-    __m256i mulhi = __lasx_xvmuh_hu(a.val, b.val);
-    __m256i mul0  = __lasx_xvilvl_h(mulhi, mullo);
-    __m256i mul1  = __lasx_xvilvh_h(mulhi, mullo);
-
-    __m256i p02   = __lasx_xvbitsel_v(mul0, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0));
-    __m256i p13   = __lasx_xvsrli_d(mul0, 32);
-    __m256i p46   = __lasx_xvbitsel_v(mul1, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0));
-    __m256i p57   = __lasx_xvsrli_d(mul1, 32);
-
-    __m256i p15_  = __lasx_xvadd_d(p02, p13);
-    __m256i p9d_  = __lasx_xvadd_d(p46, p57);
-
-    return v_uint64x4(__lasx_xvadd_d(
-        __lasx_xvilvl_d(p9d_, p15_),
-        __lasx_xvilvh_d(p9d_, p15_)));
+    __m256i even  = __lasx_xvmulwev_w_hu(a.val, b.val);
+    __m256i odd   = __lasx_xvmulwod_w_hu(a.val, b.val);
+    __m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
+    __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
+    return v_uint64x4(__lasx_xvadd_d(prod0, prod1));
 }
 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
 { return v_dotprod_expand(a, b) + c; }
 
 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
 {
-    __m256i prod  = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val));
-    __m256i sign = __lasx_xvsrai_w(prod, 31);
-
-    __m256i lo = __lasx_xvilvl_w(sign, prod);
-    __m256i hi = __lasx_xvilvh_w(sign, prod);
-
-    return v_int64x4(__lasx_xvadd_d(__lasx_xvilvl_d(hi, lo), __lasx_xvilvh_d(hi, lo)));
+    __m256i even  = __lasx_xvmulwev_w_h(a.val, b.val);
+    __m256i odd   = __lasx_xvmulwod_w_h(a.val, b.val);
+    __m256i prod0 = __lasx_xvhaddw_d_w(even, even);
+    __m256i prod1 = __lasx_xvhaddw_d_w(odd, odd);
+    return v_int64x4(__lasx_xvadd_d(prod0, prod1));
 }
+
 inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
 { return v_dotprod_expand(a, b) + c; }
 
@@ -2126,20 +1984,11 @@ inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, c
 // 16 >> 64
 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
 {
-    __m256i mullo = __lasx_xvmul_h(a.val, b.val);
-    __m256i mulhi = __lasx_xvmuh_hu(a.val, b.val);
-    __m256i mul0 = __lasx_xvilvl_h(mulhi, mullo);
-    __m256i mul1  = __lasx_xvilvh_h(mulhi, mullo);
-
-    __m256i p02   = __lasx_xvbitsel_v(mul0, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0));
-    __m256i p13 = __lasx_xvsrli_d(mul0, 32);
-    __m256i p46   = __lasx_xvbitsel_v(mul1, __lasx_xvreplgr2vr_d(0), _v256_set_w(-1, 0, -1, 0, -1, 0, -1, 0));
-    __m256i p57   = __lasx_xvsrli_d(mul1, 32);
-
-    __m256i p15_  = __lasx_xvadd_d(p02, p13);
-    __m256i p9d_  = __lasx_xvadd_d(p46, p57);
-
-    return v_uint64x4(__lasx_xvadd_d(p15_, p9d_));
+    __m256i even  = __lasx_xvmulwev_w_hu(a.val, b.val);
+    __m256i odd   = __lasx_xvmulwod_w_hu(a.val, b.val);
+    __m256i prod0 = __lasx_xvhaddw_du_wu(even, even);
+    __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd);
+    return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0)));
 }
 inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
 { return v_dotprod_expand_fast(a, b) + c; }
@@ -2261,12 +2110,7 @@ inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
 { return v_int8x32(_v256_shuffle_odd_64(_lasx_packs_h(a.val, b.val))); }
 
 inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
-{
-    __m256i t = __lasx_xvreplgr2vr_h(255);
-    __m256i a1 = __lasx_xvmin_hu(a.val, t);
-    __m256i b1 = __lasx_xvmin_hu(b.val, t);
-    return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a1, b1)));
-}
+{ return v_uint8x32(_v256_shuffle_odd_64(__lasx_xvssrlrni_bu_h(b.val, a.val, 0))); }
 
 inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
 {
@@ -2276,13 +2120,8 @@ inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
 inline void v_pack_store(schar* ptr, const v_int16x16& a)
 { v_store_low(ptr, v_pack(a, a)); }
 
-inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
-{
-    const __m256i m = __lasx_xvreplgr2vr_h(255);
-    __m256i am = __lasx_xvmin_hu(a.val, m);
-            am = _v256_shuffle_odd_64(_lasx_packus_h(am, am));
-    v_store_low(ptr, v_uint8x32(am));
-}
+inline void v_pack_store(uchar *ptr, const v_uint16x16& a)
+{ v_store_low(ptr, v_pack(a, a)); }
 
 inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
 { v_store_low(ptr, v_pack_u(a, a)); }
@@ -2290,45 +2129,46 @@ inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
 template<int n> inline
 v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
 {
-    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
-    v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
-    return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
-                    v_reinterpret_as_s16((b + delta) >> n));
+    __m256i res = __lasx_xvssrlrni_bu_h(b.val, a.val, n);
+    return v_uint8x32(_v256_shuffle_odd_64(res));
 }
 
 template<int n> inline
 void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
 {
-    v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
-    v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
+    __m256i res = __lasx_xvssrlrni_bu_h(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }
 
 template<int n> inline
 v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
 {
-    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
-    return v_pack_u((a + delta) >> n, (b + delta) >> n);
+    __m256i res = __lasx_xvssrarni_bu_h(b.val, a.val, n);
+    return v_uint8x32(_v256_shuffle_odd_64(res));
 }
 
 template<int n> inline
 void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
 {
-    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
-    v_pack_u_store(ptr, (a + delta) >> n);
+    __m256i res = __lasx_xvssrarni_bu_h(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }
 
 template<int n> inline
 v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
 {
-    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
-    return v_pack((a + delta) >> n, (b + delta) >> n);
+    __m256i res = __lasx_xvssrarni_b_h(b.val, a.val, n);
+    return v_int8x32(_v256_shuffle_odd_64(res));
 }
 
 template<int n> inline
 void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
 {
-    v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
-    v_pack_store(ptr, (a + delta) >> n);
+    __m256i res = __lasx_xvssrarni_b_h(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }
 
 // 32
@@ -2346,67 +2186,55 @@ inline void v_pack_store(short* ptr, const v_int32x8& a)
 
 inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
 {
-    const __m256i m = __lasx_xvreplgr2vr_w(65535);
-    __m256i am = __lasx_xvmin_wu(a.val, m);
-            am = _v256_shuffle_odd_64(_lasx_packus_w(am, am));
-    v_store_low(ptr, v_uint16x16(am));
+    __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }
 
 inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
 { v_store_low(ptr, v_pack_u(a, a)); }
 
-
 template<int n> inline
 v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
-{
-    // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
-    v_uint32x8 delta = v256_setall_u32(1 << (n-1));
-    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
-                    v_reinterpret_as_s32((b + delta) >> n));
-}
+{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrlrni_hu_w(b.val, a.val, n))); }
 
 template<int n> inline
 void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
 {
-    v_uint32x8 delta = v256_setall_u32(1 << (n-1));
-    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
+    __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }
 
 template<int n> inline
 v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
-{
-    v_int32x8 delta = v256_setall_s32(1 << (n-1));
-    return v_pack_u((a + delta) >> n, (b + delta) >> n);
-}
+{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_hu_w(b.val, a.val, n))); }
 
 template<int n> inline
 void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
 {
-    v_int32x8 delta = v256_setall_s32(1 << (n-1));
-    v_pack_u_store(ptr, (a + delta) >> n);
+    __m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }
 
 template<int n> inline
 v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
-{
-    v_int32x8 delta = v256_setall_s32(1 << (n-1));
-    return v_pack((a + delta) >> n, (b + delta) >> n);
-}
+{ return v_int16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_h_w(b.val, a.val, n))); }
 
 template<int n> inline
 void v_rshr_pack_store(short* ptr, const v_int32x8& a)
 {
-    v_int32x8 delta = v256_setall_s32(1 << (n-1));
-    v_pack_store(ptr, (a + delta) >> n);
+    __m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }
 
 // 64
 // Non-saturating pack
 inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
 {
-    __m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08);
-    __m256i b0 = __lasx_xvshuf4i_w(b.val, 0x08);
-    __m256i ab = __lasx_xvilvl_d(b0, a0);
+    __m256i ab = __lasx_xvpickev_w(b.val, a.val);
     return v_uint32x8(_v256_shuffle_odd_64(ab));
 }
 
@@ -2424,30 +2252,26 @@ inline void v_pack_store(int* ptr, const v_int64x4& b)
 
 template<int n> inline
 v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
-{
-    v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
-    return v_pack((a + delta) >> n, (b + delta) >> n);
-}
+{ return v_uint32x8(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(b.val, a.val, n))); }
 
 template<int n> inline
 void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
 {
-    v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
-    v_pack_store(ptr, (a + delta) >> n);
+    __m256i res = __lasx_xvsrlrni_w_d(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }
 
 template<int n> inline
 v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
-{
-    v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
-    return v_pack((a + delta) >> n, (b + delta) >> n);
-}
+{ return v_int32x8(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(b.val, a.val, n))); }
 
 template<int n> inline
 void v_rshr_pack_store(int* ptr, const v_int64x4& a)
 {
-    v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
-    v_pack_store(ptr, (a + delta) >> n);
+    __m256i res = __lasx_xvsrarni_w_d(a.val, a.val, n);
+    __lasx_xvstelm_d(res, ptr, 0, 0);
+    __lasx_xvstelm_d(res, ptr, 8, 2);
 }
 
 // pack boolean
@@ -2583,63 +2407,48 @@ template<int i>
 inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
 { return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
 
-
 ///////////////////// load deinterleave /////////////////////////////
 
-inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b)
 {
-    __m256i ab0 = __lasx_xvld(ptr, 0);
-    __m256i ab1 = __lasx_xvld(ptr + 32, 0);
-
-    const __m256i sh = _v256_setr_b(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-                                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-    __m256i p0 = __lasx_xvshuf_b(ab0, ab0, sh);
-    __m256i p1 = __lasx_xvshuf_b(ab1, ab1, sh);
-    __m256i pl = __lasx_xvpermi_q(p0, p1, 0x02);
-    __m256i ph = __lasx_xvpermi_q(p0, p1, 0x13);
-    __m256i a0 = __lasx_xvilvl_d(ph, pl);
-    __m256i b0 = __lasx_xvilvh_d(ph, pl);
-    a = v_uint8x32(a0);
-    b = v_uint8x32(b0);
+    __m256i t0 = __lasx_xvld(ptr, 0);
+    __m256i t1 = __lasx_xvld(ptr, 32);
+
+    __m256i p0 = __lasx_xvpickev_b(t1, t0);
+    __m256i p1 = __lasx_xvpickod_b(t1, t0);
+
+    a.val = __lasx_xvpermi_d(p0, 0xd8);
+    b.val = __lasx_xvpermi_d(p1, 0xd8);
 }
 
 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
 {
-    __m256i ab0 = __lasx_xvld(ptr, 0);
-    __m256i ab1 = __lasx_xvld(ptr + 16, 0);
-
-    const __m256i sh = _v256_setr_b(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
-                                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
-    __m256i p0 = __lasx_xvshuf_b(ab0, ab0, sh);
-    __m256i p1 = __lasx_xvshuf_b(ab1, ab1, sh);
-    __m256i pl = __lasx_xvpermi_q(p0, p1, 0x02);
-    __m256i ph = __lasx_xvpermi_q(p0, p1, 0x13);
-    __m256i a0 = __lasx_xvilvl_d(ph, pl);
-    __m256i b0 = __lasx_xvilvh_d(ph, pl);
-    a = v_uint16x16(a0);
-    b = v_uint16x16(b0);
+    __m256i t0 = __lasx_xvld(ptr, 0);
+    __m256i t1 = __lasx_xvld(ptr, 32);
+
+    __m256i p0 = __lasx_xvpickev_h(t1, t0);
+    __m256i p1 = __lasx_xvpickod_h(t1, t0);
+
+    a.val = __lasx_xvpermi_d(p0, 0xd8);
+    b.val = __lasx_xvpermi_d(p1, 0xd8);
 }
 
 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
 {
-    __m256i ab0 = __lasx_xvld(ptr, 0);
-    __m256i ab1 = __lasx_xvld(ptr + 8, 0);
+    __m256i t0 = __lasx_xvld(ptr, 0);
+    __m256i t1 = __lasx_xvld(ptr, 32);
 
-    //const int sh = 0+2*4+1*16+3*64;
-    __m256i p0 = __lasx_xvshuf4i_w(ab0, 0xD8);
-    __m256i p1 = __lasx_xvshuf4i_w(ab1, 0xD8);
-    __m256i pl = __lasx_xvpermi_q(p0, p1, 0x02);
-    __m256i ph = __lasx_xvpermi_q(p0, p1, 0x13);
-    __m256i a0 = __lasx_xvilvl_d(ph, pl);
-    __m256i b0 = __lasx_xvilvh_d(ph, pl);
-    a = v_uint32x8(a0);
-    b = v_uint32x8(b0);
+    __m256i p0 = __lasx_xvpickev_w(t1, t0);
+    __m256i p1 = __lasx_xvpickod_w(t1, t0);
+
+    a.val = __lasx_xvpermi_d(p0, 0xd8);
+    b.val = __lasx_xvpermi_d(p1, 0xd8);
 }
 
 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
 {
     __m256i ab0 = __lasx_xvld(ptr, 0);
-    __m256i ab1 = __lasx_xvld(ptr + 4, 0);
+    __m256i ab1 = __lasx_xvld(ptr, 32);
 
     __m256i pl = __lasx_xvpermi_q(ab0, ab1, 0x02);
     __m256i ph = __lasx_xvpermi_q(ab0, ab1, 0x13);
@@ -2652,8 +2461,8 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b
 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
 {
     __m256i bgr0 = __lasx_xvld(ptr, 0);
-    __m256i bgr1 = __lasx_xvld(ptr + 32, 0);
-    __m256i bgr2 = __lasx_xvld(ptr + 64, 0);
+    __m256i bgr1 = __lasx_xvld(ptr, 32);
+    __m256i bgr2 = __lasx_xvld(ptr, 64);
 
     __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
     __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
@@ -2686,8 +2495,8 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b,
 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
 {
     __m256i bgr0 = __lasx_xvld(ptr, 0);
-    __m256i bgr1 = __lasx_xvld(ptr + 16, 0);
-    __m256i bgr2 = __lasx_xvld(ptr + 32, 0);
+    __m256i bgr1 = __lasx_xvld(ptr, 32);
+    __m256i bgr2 = __lasx_xvld(ptr, 64);
 
     __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
     __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
@@ -2717,8 +2526,8 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16&
 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
 {
     __m256i bgr0 = __lasx_xvld(ptr, 0);
-    __m256i bgr1 = __lasx_xvld(ptr + 8, 0);
-    __m256i bgr2 = __lasx_xvld(ptr + 16, 0);
+    __m256i bgr1 = __lasx_xvld(ptr, 32);
+    __m256i bgr2 = __lasx_xvld(ptr, 64);
 
     __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02);
     __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13);
@@ -2741,8 +2550,8 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8&
 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
 {
     __m256i bgr0 = __lasx_xvld(ptr, 0);
-    __m256i bgr1 = __lasx_xvld(ptr + 4, 0);
-    __m256i bgr2 = __lasx_xvld(ptr + 8, 0);
+    __m256i bgr1 = __lasx_xvld(ptr, 32);
+    __m256i bgr2 = __lasx_xvld(ptr, 64);
 
     __m256i s01 = __lasx_xvpermi_q(bgr0, bgr1, 0x12); // get bgr0 low 128 and bgr1 high 128
     __m256i s12 = __lasx_xvpermi_q(bgr1, bgr2, 0x12);
@@ -2756,81 +2565,60 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b
     c = v_uint64x4(r0);
 }
 
-inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d )
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d)
 {
-    __m256i bgr0 = __lasx_xvld(ptr, 0);
-    __m256i bgr1 = __lasx_xvld(ptr + 32, 0);
-    __m256i bgr2 = __lasx_xvld(ptr + 64, 0);
-    __m256i bgr3 = __lasx_xvld(ptr + 96, 0);
-    const __m256i sh = _v256_setr_b(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
-                                    0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+    __m256i t0 = __lasx_xvld(ptr, 0);
+    __m256i t1 = __lasx_xvld(ptr, 32);
+    __m256i t2 = __lasx_xvld(ptr, 64);
+    __m256i t3 = __lasx_xvld(ptr, 96);
 
-    __m256i p0 = __lasx_xvshuf_b(bgr0, bgr0, sh);
-    __m256i p1 = __lasx_xvshuf_b(bgr1, bgr1, sh);
-    __m256i p2 = __lasx_xvshuf_b(bgr2, bgr2, sh);
-    __m256i p3 = __lasx_xvshuf_b(bgr3, bgr3, sh);
+    const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
+    __m256i ac_lo = __lasx_xvpickev_b(t1, t0);
+    __m256i bd_lo = __lasx_xvpickod_b(t1, t0);
+    __m256i ac_hi = __lasx_xvpickev_b(t3, t2);
+    __m256i bd_hi = __lasx_xvpickod_b(t3, t2);
 
-    __m256i p01l = __lasx_xvilvl_w(p1, p0);
-    __m256i p01h = __lasx_xvilvh_w(p1, p0);
-    __m256i p23l = __lasx_xvilvl_w(p3, p2);
-    __m256i p23h = __lasx_xvilvh_w(p3, p2);
+    __m256i a_pre = __lasx_xvpickev_b(ac_hi, ac_lo);
+    __m256i c_pre = __lasx_xvpickod_b(ac_hi, ac_lo);
+    __m256i b_pre = __lasx_xvpickev_b(bd_hi, bd_lo);
+    __m256i d_pre = __lasx_xvpickod_b(bd_hi, bd_lo);
 
-    __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02);
-    __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13);
-    __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02);
-    __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13);
-
-    __m256i b0 = __lasx_xvilvl_w(plh, pll);
-    __m256i g0 = __lasx_xvilvh_w(plh, pll);
-    __m256i r0 = __lasx_xvilvl_w(phh, phl);
-    __m256i a0 = __lasx_xvilvh_w(phh, phl);
-
-    a = v_uint8x32(b0);
-    b = v_uint8x32(g0);
-    c = v_uint8x32(r0);
-    d = v_uint8x32(a0);
+    a.val = __lasx_xvperm_w(a_pre, sh);
+    b.val = __lasx_xvperm_w(b_pre, sh);
+    c.val = __lasx_xvperm_w(c_pre, sh);
+    d.val = __lasx_xvperm_w(d_pre, sh);
 }
 
-inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d )
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d)
 {
-    __m256i bgr0 = __lasx_xvld(ptr, 0);
-    __m256i bgr1 = __lasx_xvld(ptr + 16, 0);
-    __m256i bgr2 = __lasx_xvld(ptr + 32, 0);
-    __m256i bgr3 = __lasx_xvld(ptr + 48, 0);
-    const __m256i sh = _v256_setr_b(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-                                    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
-    __m256i p0 = __lasx_xvshuf_b(bgr0, bgr0, sh);
-    __m256i p1 = __lasx_xvshuf_b(bgr1, bgr1, sh);
-    __m256i p2 = __lasx_xvshuf_b(bgr2, bgr2, sh);
-    __m256i p3 = __lasx_xvshuf_b(bgr3, bgr3, sh);
-
-    __m256i p01l = __lasx_xvilvl_w(p1, p0);
-    __m256i p01h = __lasx_xvilvh_w(p1, p0);
-    __m256i p23l = __lasx_xvilvl_w(p3, p2);
-    __m256i p23h = __lasx_xvilvh_w(p3, p2);
+    __m256i t0 = __lasx_xvld(ptr, 0);
+    __m256i t1 = __lasx_xvld(ptr, 32);
+    __m256i t2 = __lasx_xvld(ptr, 64);
+    __m256i t3 = __lasx_xvld(ptr, 96);
 
-    __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02);
-    __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13);
-    __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02);
-    __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13);
+    const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7);
+    __m256i ac_lo = __lasx_xvpickev_h(t1, t0);
+    __m256i bd_lo = __lasx_xvpickod_h(t1, t0);
+    __m256i ac_hi = __lasx_xvpickev_h(t3, t2);
+    __m256i bd_hi = __lasx_xvpickod_h(t3, t2);
 
-    __m256i b0 = __lasx_xvilvl_w(plh, pll);
-    __m256i g0 = __lasx_xvilvh_w(plh, pll);
-    __m256i r0 = __lasx_xvilvl_w(phh, phl);
-    __m256i a0 = __lasx_xvilvh_w(phh, phl);
+    __m256i a_pre = __lasx_xvpickev_h(ac_hi, ac_lo);
+    __m256i c_pre = __lasx_xvpickod_h(ac_hi, ac_lo);
+    __m256i b_pre = __lasx_xvpickev_h(bd_hi, bd_lo);
+    __m256i d_pre = __lasx_xvpickod_h(bd_hi, bd_lo);
 
-    a = v_uint16x16(b0);
-    b = v_uint16x16(g0);
-    c = v_uint16x16(r0);
-    d = v_uint16x16(a0);
+    a.val = __lasx_xvperm_w(a_pre, sh);
+    b.val = __lasx_xvperm_w(b_pre, sh);
+    c.val = __lasx_xvperm_w(c_pre, sh);
+    d.val = __lasx_xvperm_w(d_pre, sh);
 }
 
 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
 {
     __m256i p0 = __lasx_xvld(ptr, 0);
-    __m256i p1 = __lasx_xvld(ptr + 8, 0);
-    __m256i p2 = __lasx_xvld(ptr + 16, 0);
-    __m256i p3 = __lasx_xvld(ptr + 24, 0);
+    __m256i p1 = __lasx_xvld(ptr, 32);
+    __m256i p2 = __lasx_xvld(ptr, 64);
+    __m256i p3 = __lasx_xvld(ptr, 96);
 
     __m256i p01l = __lasx_xvilvl_w(p1, p0);
     __m256i p01h = __lasx_xvilvh_w(p1, p0);
@@ -2856,9 +2644,9 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8&
 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
 {
     __m256i bgra0 = __lasx_xvld(ptr, 0);
-    __m256i bgra1 = __lasx_xvld(ptr + 4, 0);
-    __m256i bgra2 = __lasx_xvld(ptr + 8, 0);
-    __m256i bgra3 = __lasx_xvld(ptr + 12, 0);
+    __m256i bgra1 = __lasx_xvld(ptr, 32);
+    __m256i bgra2 = __lasx_xvld(ptr, 64);
+    __m256i bgra3 = __lasx_xvld(ptr, 96);
 
     __m256i l02 = __lasx_xvpermi_q(bgra0, bgra2, 0x02);
     __m256i h02 = __lasx_xvpermi_q(bgra0, bgra2, 0x13);
@@ -3195,7 +2983,7 @@ OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4,
 // FP16
 //
 
-inline v_float32x8 v256_load_expand(const float16_t* ptr)
+inline v_float32x8 v256_load_expand(const hfloat* ptr)
 {
 #if CV_FP16
     //1-load128, 2-permi, 3-cvt
@@ -3208,7 +2996,7 @@ inline v_float32x8 v256_load_expand(const float16_t* ptr)
 #endif
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
+inline void v_pack_store(hfloat* ptr, const v_float32x8& a)
 {
 #if CV_FP16
     __m256i ah = __lasx_xvfcvt_h_s(a.val, a.val);
@@ -3217,7 +3005,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
     float CV_DECL_ALIGNED(32) buf[8];
     v_store_aligned(buf, a);
     for (int i = 0; i < 8; i++)
-        ptr[i] = float16_t(buf[i]);
+        ptr[i] = hfloat(buf[i]);
 #endif
 }
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
new file mode 100644
index 000000000000..6e3290426f77
--- /dev/null
+++ b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp
@@ -0,0 +1,2538 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_INTRIN_LSX_HPP
+#define OPENCV_HAL_INTRIN_LSX_HPP
+
+#include <lsxintrin.h>
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+#define CV_SIMD128_FP16 0
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+/////////// Utils ////////
+
+inline __m128i _v128_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6,
+        char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15)
+{
+    return (__m128i)v16i8{ v0, v1, v2, v3, v4, v5, v6, v7,
+                           v8, v9, v10, v11, v12, v13, v14, v15 };
+}
+
+inline __m128i _v128_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6,
+        char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15)
+{
+    return (__m128i)v16i8{ v15, v14, v13, v12, v11, v10, v9, v8,
+                           v7, v6, v5, v4, v3, v2, v1, v0 };
+}
+
+inline __m128i _v128_setr_h(short v0, short v1, short v2, short v3, short v4, short v5,
+       short v6, short v7)
+{
+    return (__m128i)v8i16{ v0, v1, v2, v3, v4, v5, v6, v7 };
+}
+
+inline __m128i _v128_setr_w(int v0, int v1, int v2, int v3)
+{
+    return (__m128i)v4i32{ v0, v1, v2, v3 };
+}
+
+inline __m128i _v128_set_w(int v0, int v1, int v2, int v3)
+{
+    return (__m128i)v4i32{ v3, v2, v1, v0 };
+}
+
+inline __m128i _v128_setall_w(int v0)
+{
+    return __lsx_vreplgr2vr_w(v0);
+}
+
+inline __m128i _v128_setr_d(int64 v0, int64 v1)
+{
+    return (__m128i)v2i64{ v0, v1 };
+}
+
+inline __m128i _v128_set_d(int64 v0, int64 v1)
+{
+    return (__m128i)v2i64{ v1, v0 };
+}
+
+inline __m128 _v128_setr_ps(float v0, float v1, float v2, float v3)
+{
+    return (__m128)v4f32{ v0, v1, v2, v3 };
+}
+
+inline __m128 _v128_setall_ps(float v0)
+{
+    return (__m128)v4f32{ v0, v0, v0, v0 };
+}
+
+inline __m128d _v128_setr_pd(double v0, double v1)
+{
+    return (__m128d)v2f64{ v0, v1 };
+}
+
+inline __m128d _v128_setall_pd(double v0)
+{
+    return (__m128d)v2f64{ v0, v0 };
+}
+
+inline __m128i _lsx_packus_h(const __m128i& a, const __m128i& b)
+{
+    return __lsx_vssrarni_bu_h(b, a, 0);
+}
+
+inline __m128i _lsx_packs_h(const __m128i& a, const __m128i& b)
+{
+    return __lsx_vssrarni_b_h(b, a, 0);
+}
+
+inline __m128i _lsx_packus_w(const __m128i& a, const __m128i& b)
+{
+    return __lsx_vssrarni_hu_w(b, a, 0);
+}
+
+/////// Types ///////
+
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16};
+
+    v_uint8x16() {}
+    explicit v_uint8x16(__m128i v): val(v) {}
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+             uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+    }
+
+    uchar get0() const
+    {
+        return (uchar)__lsx_vpickve2gr_bu(val, 0);
+    }
+
+    __m128i val;
+};
+
+struct v_int8x16
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+
+    v_int8x16() {}
+    explicit v_int8x16(__m128i v) : val(v) {}
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+            schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+    }
+
+    schar get0() const
+    {
+        return (schar)__lsx_vpickve2gr_b(val, 0);
+    }
+
+    __m128i val;
+};
+
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+
+    v_uint16x8() {}
+    explicit v_uint16x8(__m128i v) : val(v) {}
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7);
+    }
+
+    ushort get0() const
+    {
+        return (ushort)__lsx_vpickve2gr_hu(val, 0);
+    }
+
+    __m128i val;
+};
+
+struct v_int16x8
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+
+    v_int16x8() {}
+    explicit v_int16x8(__m128i v) : val(v) {}
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7);
+    }
+
+    short get0() const
+    {
+        return (short)__lsx_vpickve2gr_h(val, 0);
+    }
+
+    __m128i val;
+};
+
+struct v_uint32x4
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 4 };
+
+    v_uint32x4() {}
+    explicit v_uint32x4(__m128i v) : val(v) {}
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        val = _v128_setr_w(v0, v1, v2, v3);
+    }
+
+    unsigned get0() const
+    {
+        return (unsigned)__lsx_vpickve2gr_wu(val, 0);
+    }
+
+    __m128i val;
+};
+
+struct v_int32x4
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+
+    v_int32x4() {}
+    explicit v_int32x4(__m128i v) : val(v) {}
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        val = _v128_setr_w(v0, v1, v2, v3);
+    }
+
+    int get0() const
+    {
+        return (int)__lsx_vpickve2gr_w(val, 0);
+    }
+
+    __m128i val;
+};
+
+struct v_float32x4
+{
+    typedef float lane_type;
+    enum { nlanes = 4};
+
+    v_float32x4() {}
+    explicit v_float32x4(__m128 v) : val(v) {}
+    explicit v_float32x4(__m128i v) { val = *((__m128*)&v); }
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        val = _v128_setr_ps(v0, v1, v2, v3);
+    }
+
+    float get0() const
+    {
+        union { int iv; float fv; } d;
+        d.iv = __lsx_vpickve2gr_w(val, 0);
+        return d.fv;
+    }
+
+    int get0toint() const
+    {
+        __m128i result = __lsx_vftintrz_w_s(val);
+        return (int)__lsx_vpickve2gr_w(result, 0);
+    }
+
+    __m128 val;
+};
+
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 2};
+
+    v_uint64x2() {}
+    explicit v_uint64x2(__m128i v) : val(v) {}
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        val = _v128_setr_d(v0, v1);
+    }
+
+    uint64 get0() const
+    {
+        return __lsx_vpickve2gr_du(val, 0);
+    }
+
+    __m128i val;
+};
+
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2};
+
+    v_int64x2() {}
+    explicit v_int64x2(__m128i v) : val(v) {}
+    v_int64x2(int64 v0, int64 v1)
+    {
+        val = _v128_setr_d(v0, v1);
+    }
+
+    uint64 get0() const
+    {
+        return __lsx_vpickve2gr_d(val, 0);
+    }
+
+    __m128i val;
+};
+
+struct v_float64x2
+{
+    typedef double lane_type;
+    enum { nlanes = 2};
+
+    v_float64x2() {}
+    explicit v_float64x2(__m128d v) : val(v) {}
+    explicit v_float64x2(__m128i v) { val = *((__m128d*)&v); }
+    v_float64x2(double v0, double v1)
+    {
+        val = _v128_setr_pd(v0, v1);
+    }
+
+    double get0() const
+    {
+        union { int64 iv; double fv; } d;
+        d.iv = __lsx_vpickve2gr_d(val, 0);
+        return d.fv;
+    }
+
+    int64 get0toint64() const
+    {
+        __m128i result = __lsx_vftintrz_l_d(val);
+        return (int64)__lsx_vpickve2gr_d(result, 0);
+    }
+
+    __m128d val;
+};
+
+////////////// Load and store operations /////////
+
+#define OPENCV_HAL_IMPL_LSX_LOADSTORE(_Tpvec, _Tp)                     \
+    inline _Tpvec v_load(const _Tp* ptr)                               \
+    { return _Tpvec(__lsx_vld(ptr, 0)); }                              \
+    inline _Tpvec v_load_aligned(const _Tp* ptr)                       \
+    { return _Tpvec(__lsx_vld(ptr, 0)); }                              \
+    inline _Tpvec v_load_low(const _Tp* ptr)                           \
+    { return _Tpvec(__lsx_vldrepl_d(ptr, 0)); }                        \
+    inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
+    {                                                                  \
+        __m128i vl = __lsx_vldrepl_d(ptr0, 0);                         \
+        __m128i vh = __lsx_vldrepl_d(ptr1, 0);                         \
+        return _Tpvec(__lsx_vilvl_d(vh, vl));                          \
+    }                                                                  \
+    inline void v_store(_Tp* ptr, const _Tpvec& a)                     \
+    { __lsx_vst(a.val, ptr, 0); }                                      \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)             \
+    { __lsx_vst(a.val, ptr, 0); }                                      \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)     \
+    { __lsx_vst(a.val, ptr, 0); }                                      \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)\
+    {                                                                  \
+        if ( mode == hal::STORE_UNALIGNED)                             \
+            __lsx_vst(a.val, ptr, 0);                                  \
+        else if ( mode == hal::STORE_ALIGNED_NOCACHE)                  \
+            __lsx_vst(a.val, ptr, 0);                                  \
+        else                                                           \
+            __lsx_vst(a.val, ptr, 0);                                  \
+    }                                                                  \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                 \
+    {  __lsx_vstelm_d(a.val, ptr, 0, 0); }                             \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                \
+    {  __lsx_vstelm_d(a.val, ptr, 0, 1); }                             \
+
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint8x16,  uchar)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int8x16,   schar)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int16x8,  short)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint32x4,  unsigned)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int32x4,   int)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint64x2,  uint64)
+OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int64x2,   int64)
+
+#define OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg)        \
+    inline _Tpvec v_load(const _Tp* ptr)                               \
+    { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); }                     \
+    inline _Tpvec v_load_aligned(const _Tp* ptr)                       \
+    { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); }                     \
+    inline _Tpvec v_load_low(const _Tp* ptr)                           \
+    { return _Tpvec((halfreg)__lsx_vldrepl_d(ptr, 0)); }               \
+    inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
+    {                                                                  \
+        __m128i vl = __lsx_vldrepl_d(ptr0, 0);                         \
+        __m128i vh = __lsx_vldrepl_d(ptr1, 0);                         \
+        return _Tpvec((halfreg)__lsx_vilvl_d(vh, vl));                 \
+    }                                                                  \
+    inline void v_store(_Tp* ptr, const _Tpvec& a)                     \
+    {  __lsx_vst((__m128i)a.val, ptr, 0); }                            \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)             \
+    {  __lsx_vst((__m128i)a.val, ptr, 0); }                            \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)     \
+    {  __lsx_vst((__m128i)a.val, ptr, 0); }                            \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)\
+    {                                                                  \
+        if( mode == hal::STORE_UNALIGNED)                              \
+            __lsx_vst((__m128i)a.val, ptr, 0);                         \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE)                   \
+            __lsx_vst((__m128i)a.val, ptr, 0);                         \
+        else                                                           \
+            __lsx_vst((__m128i)a.val, ptr, 0);                         \
+    }                                                                  \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                 \
+    {  __lsx_vstelm_d((__m128i)a.val, ptr, 0, 0); }                    \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                \
+    {  __lsx_vstelm_d((__m128i)a.val, ptr, 0, 1); }                    \
+
+OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float32x4, float, __m128)
+OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float64x2, double, __m128d)
+
+inline __m128i _lsx_128_castps_si128(const __m128& v)
+{ return __m128i(v); }
+
+inline __m128i _lsx_128_castpd_si128(const __m128d& v)
+{ return __m128i(v); }
+
+#define OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, _Tpvecf, suffix, cast)  \
+    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)    \
+    { return _Tpvec(cast(a.val)); }
+
+#define OPENCV_HAL_IMPL_LSX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s)           \
+    inline _Tpvec v_setzero_##suffix()                                            \
+    { return _Tpvec(__lsx_vldi(0)); }                                             \
+    inline _Tpvec v_setall_##suffix(_Tp v)                                        \
+    { return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); }                    \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16,  suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16,   suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8,  suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8,   suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4,  suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4,   suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2,  suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2,   suffix, OPENCV_HAL_NOP)         \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float32x4, suffix, _lsx_128_castps_si128)  \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float64x2, suffix, _lsx_128_castpd_si128)  \
+
+OPENCV_HAL_IMPL_LSX_INIT(v_uint8x16,  uchar,    u8,   b,  int)
+OPENCV_HAL_IMPL_LSX_INIT(v_int8x16,   schar,    s8,   b,  int)
+OPENCV_HAL_IMPL_LSX_INIT(v_uint16x8,  ushort,   u16,  h,  int)
+OPENCV_HAL_IMPL_LSX_INIT(v_int16x8,   short,    s16,  h,  int)
+OPENCV_HAL_IMPL_LSX_INIT(v_uint32x4,  unsigned, u32,  w,  int)
+OPENCV_HAL_IMPL_LSX_INIT(v_int32x4,   int,      s32,  w,  int)
+OPENCV_HAL_IMPL_LSX_INIT(v_uint64x2,  uint64,   u64,  d,  long int)
+OPENCV_HAL_IMPL_LSX_INIT(v_int64x2,   int64,    s64,  d,  long int)
+
+inline __m128 _lsx_128_castsi128_ps(const __m128i &v)
+{ return __m128(v); }
+
+inline __m128d _lsx_128_castsi128_pd(const __m128i &v)
+{ return __m128d(v); }
+
+#define OPENCV_HAL_IMPL_LSX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast)    \
+    inline _Tpvec v_setzero_##suffix()                                      \
+    { return _Tpvec(__lsx_vldi(0)); }                                       \
+    inline _Tpvec v_setall_##suffix(_Tp v)                                  \
+    { return _Tpvec(_v128_setall_##zsuffix(v)); }                           \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16,     suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16,      suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8,     suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8,      suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4,     suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4,      suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2,     suffix,   cast)        \
+    OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2,      suffix,   cast)        \
+
+OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float32x4, float,  f32, ps, _lsx_128_castsi128_ps)
+OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float64x2, double, f64, pd, _lsx_128_castsi128_pd)
+
+inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a)
+{ return a; }
+inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a)
+{ return v_float32x4(_lsx_128_castps_si128(__m128(a.val))); }
+
+inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a)
+{ return a; }
+inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a)
+{ return v_float64x2(_lsx_128_castpd_si128(__m128d(a.val))); }
+
+//////////////// Variant Value reordering ///////////////
+
+// unpacks
+#define OPENCV_HAL_IMPL_LSX_UNPACK(_Tpvec, suffix)                            \
+    inline _Tpvec v128_unpacklo(const _Tpvec& a, const _Tpvec& b)             \
+    { return _Tpvec(__lsx_vilvl_##suffix(__m128i(b.val), __m128i(a.val))); }  \
+    inline _Tpvec v128_unpackhi(const _Tpvec& a, const _Tpvec& b)             \
+    { return _Tpvec(__lsx_vilvh_##suffix(__m128i(b.val), __m128i(a.val))); }  \
+
+OPENCV_HAL_IMPL_LSX_UNPACK(v_uint8x16,  b)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_int8x16,   b)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_uint16x8,  h)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_int16x8,   h)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_uint32x4,  w)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_int32x4,   w)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_uint64x2,  d)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_int64x2,   d)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_float32x4, w)
+OPENCV_HAL_IMPL_LSX_UNPACK(v_float64x2, d)
+
+//ZIP
+#define OPENCV_HAL_IMPL_LSX_ZIP(_Tpvec)                               \
+    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)     \
+    { return (_Tpvec)__lsx_vilvl_d((__m128i)b.val, (__m128i)a.val); } \
+    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)    \
+    { return (_Tpvec)__lsx_vilvh_d((__m128i)b.val, (__m128i)a.val); } \
+    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,         \
+                            _Tpvec& c, _Tpvec& d)                     \
+    {                                                                 \
+        __m128i a1 = (__m128i)a.val,  b1 = (__m128i)b.val;            \
+        c = _Tpvec(__lsx_vilvl_d(b1, a1));                            \
+        d = _Tpvec(__lsx_vilvh_d(b1, a1));                            \
+    }                                                                 \
+    inline void v_zip(const _Tpvec& a, const _Tpvec& b,               \
+                      _Tpvec& ab0, _Tpvec& ab1)                       \
+    {                                                                 \
+        ab0 = v128_unpacklo(a, b);                                    \
+        ab1 = v128_unpackhi(a, b);                                    \
+    }
+
+OPENCV_HAL_IMPL_LSX_ZIP(v_uint8x16)
+OPENCV_HAL_IMPL_LSX_ZIP(v_int8x16)
+OPENCV_HAL_IMPL_LSX_ZIP(v_uint16x8)
+OPENCV_HAL_IMPL_LSX_ZIP(v_int16x8)
+OPENCV_HAL_IMPL_LSX_ZIP(v_uint32x4)
+OPENCV_HAL_IMPL_LSX_ZIP(v_int32x4)
+OPENCV_HAL_IMPL_LSX_ZIP(v_uint64x2)
+OPENCV_HAL_IMPL_LSX_ZIP(v_int64x2)
+OPENCV_HAL_IMPL_LSX_ZIP(v_float32x4)
+OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2)
+
+////////// Arithmetic, bitwise and comparison operations /////////
+
+/** Arithmetics **/
+#define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin)           \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(intrin(a.val, b.val)); }                          \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)    \
+    { a.val = intrin(a.val, b.val); return a; }
+
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint8x16,  __lsx_vsadd_bu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint8x16,  __lsx_vssub_bu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int8x16,   __lsx_vsadd_b)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int8x16,   __lsx_vssub_b)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint16x8,  __lsx_vsadd_hu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint16x8,  __lsx_vssub_hu)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int16x8,   __lsx_vsadd_h)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int16x8,   __lsx_vssub_h)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint32x4,  __lsx_vadd_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint32x4,  __lsx_vsub_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_uint32x4,  __lsx_vmul_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int32x4,   __lsx_vadd_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int32x4,   __lsx_vsub_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_int32x4,   __lsx_vmul_w)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_uint64x2,  __lsx_vadd_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_uint64x2,  __lsx_vsub_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_int64x2,   __lsx_vadd_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_int64x2,   __lsx_vsub_d)
+
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float32x4, __lsx_vfadd_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float32x4, __lsx_vfsub_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float32x4, __lsx_vfmul_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float32x4, __lsx_vfdiv_s)
+OPENCV_HAL_IMPL_LSX_BIN_OP(+, v_float64x2, __lsx_vfadd_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(-, v_float64x2, __lsx_vfsub_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(*, v_float64x2, __lsx_vfmul_d)
+OPENCV_HAL_IMPL_LSX_BIN_OP(/, v_float64x2, __lsx_vfdiv_d)
+
+// saturating multiply 8-bit, 16-bit
+inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b)
+{
+    v_uint16x8 c, d;
+    v_mul_expand(a, b, c, d);
+    return v_pack(c, d);
+}
+inline v_int8x16 operator * (const v_int8x16& a, const v_int8x16& b)
+{
+    v_int16x8 c, d;
+    v_mul_expand(a, b, c, d);
+    return v_pack(c, d);
+}
+inline v_uint16x8 operator * (const v_uint16x8& a, const v_uint16x8& b)
+{
+    __m128i a0 = a.val, b0 = b.val;
+    __m128i pev = __lsx_vmulwev_w_hu(a0, b0);
+    __m128i pod = __lsx_vmulwod_w_hu(a0, b0);
+    __m128i pl  = __lsx_vilvl_w(pod, pev);
+    __m128i ph  = __lsx_vilvh_w(pod, pev);
+    return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0);
+}
+inline v_int16x8 operator * (const v_int16x8& a, const v_int16x8& b)
+{
+    __m128i a0 = a.val, b0 = b.val;
+    __m128i pev = __lsx_vmulwev_w_h(a0, b0);
+    __m128i pod = __lsx_vmulwod_w_h(a0, b0);
+    __m128i pl  = __lsx_vilvl_w(pod, pev);
+    __m128i ph  = __lsx_vilvh_w(pod, pev);
+    return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0);
+}
+inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b)
+{ a = a * b; return a; }
+inline v_int8x16& operator *= (v_int8x16& a, const v_int8x16& b)
+{ a = a * b; return a; }
+inline v_uint16x8& operator *= (v_uint16x8& a, const v_uint16x8& b)
+{ a = a * b; return a; }
+inline v_int16x8& operator *= (v_int16x8& a, const v_int16x8& b)
+{ a = a * b; return a; }
+
+/** Non-saturating arithmetics **/
+
+#define OPENCV_HAL_IMPL_LSX_BIN_FUNC(func, _Tpvec, intrin)         \
+    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)           \
+    { return _Tpvec(intrin(a.val, b.val)); }                       \
+
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint8x16,  __lsx_vadd_b)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int8x16,   __lsx_vadd_b)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint16x8,  __lsx_vadd_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int16x8,   __lsx_vadd_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint8x16,  __lsx_vsub_b)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int8x16,   __lsx_vsub_b)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint16x8,  __lsx_vsub_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int16x8,   __lsx_vsub_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_uint16x8,  __lsx_vmul_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_int16x8,   __lsx_vmul_h)
+
+inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
+{
+    __m128i a0 = a.val, b0 = b.val;
+    __m128i p0 = __lsx_vmulwev_h_bu(a0, b0);
+    __m128i p1 = __lsx_vmulwod_h_bu(a0, b0);
+    return v_uint8x16(__lsx_vpackev_b(p1, p0));
+}
+
+inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
+{
+    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
+}
+
+// Multiply and expand
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{
+    __m128i a0 = a.val, b0 = b.val;
+    __m128i p0 = __lsx_vmulwev_h_bu(a0, b0);
+    __m128i p1 = __lsx_vmulwod_h_bu(a0, b0);
+    c.val = __lsx_vilvl_h(p1, p0);
+    d.val = __lsx_vilvh_h(p1, p0);
+}
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{
+    __m128i a0 = a.val, b0 = b.val;
+    __m128i p0 = __lsx_vmulwev_h_b(a0, b0);
+    __m128i p1 = __lsx_vmulwod_h_b(a0, b0);
+    c.val = __lsx_vilvl_h(p1, p0);
+    d.val = __lsx_vilvh_h(p1, p0);
+}
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    __m128i a0 = a.val, b0 = b.val;
+    __m128i p0 = __lsx_vmulwev_w_h(a0, b0);
+    __m128i p1 = __lsx_vmulwod_w_h(a0, b0);
+    c.val = __lsx_vilvl_w(p1, p0);
+    d.val = __lsx_vilvh_w(p1, p0);
+}
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    __m128i a0 = a.val, b0 = b.val;
+    __m128i p0 = __lsx_vmulwev_w_hu(a0, b0);
+    __m128i p1 = __lsx_vmulwod_w_hu(a0, b0);
+    c.val = __lsx_vilvl_w(p1, p0);
+    d.val = __lsx_vilvh_w(p1, p0);
+}
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    __m128i a0 = a.val, b0 = b.val;
+    __m128i p0 = __lsx_vmulwev_d_wu(a0, b0);
+    __m128i p1 = __lsx_vmulwod_d_wu(a0, b0);
+    c.val = __lsx_vilvl_d(p1, p0);
+    d.val = __lsx_vilvh_d(p1, p0);
+}
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{ return v_int16x8(__lsx_vmuh_h(a.val, b.val)); }
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_uint16x8(__lsx_vmuh_hu(a.val, b.val)); }
+
+/** Bitwise shifts **/
+#define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai)                 \
+    inline _Tpuvec operator << (const _Tpuvec& a, int imm)                           \
+    { return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); }  \
+    inline _Tpsvec operator << (const _Tpsvec& a, int imm)                           \
+    { return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); }  \
+    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)                           \
+    { return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); }  \
+    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)                           \
+    { return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); }                 \
+    template<int imm>                                                                \
+    inline _Tpuvec v_shl(const _Tpuvec& a)                                           \
+    { return _Tpuvec(__lsx_vslli_##suffix(a.val, imm)); }                            \
+    template<int imm>                                                                \
+    inline _Tpsvec v_shl(const _Tpsvec& a)                                           \
+    { return _Tpsvec(__lsx_vslli_##suffix(a.val, imm)); }                            \
+    template<int imm>                                                                \
+    inline _Tpuvec v_shr(const _Tpuvec& a)                                           \
+    { return _Tpuvec(__lsx_vsrli_##suffix(a.val, imm)); }                            \
+    template<int imm>                                                                \
+    inline _Tpsvec v_shr(const _Tpsvec& a)                                           \
+    { return _Tpsvec(__lsx_vsrai_##suffix(a.val, imm)); }                            \
+
+OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint16x8, v_int16x8, h, __lsx_vsra_h)
+OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint32x4, v_int32x4, w, __lsx_vsra_w)
+OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d)
+
+/** Bitwise logic **/
+#define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix)                                 \
+    OPENCV_HAL_IMPL_LSX_BIN_OP(&, _Tpvec, __lsx_vand_##suffix)                       \
+    OPENCV_HAL_IMPL_LSX_BIN_OP(|, _Tpvec, __lsx_vor_##suffix)                        \
+    OPENCV_HAL_IMPL_LSX_BIN_OP(^, _Tpvec, __lsx_vxor_##suffix)                       \
+    inline _Tpvec operator ~(const _Tpvec& a)                                        \
+    { return _Tpvec(__lsx_vnori_b(a.val, 0)); }                                      \
+
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16,   v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int8x16,    v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint16x8,   v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int16x8,    v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint32x4,   v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int32x4,    v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2,   v)
+OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2,    v)
+
+#define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast)               \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)                 \
+    { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); }                   \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)                   \
+    { __m128i c = intrin((__m128i)(a.val), (__m128i)b.val);                          \
+      a.val = cast(c);                                                               \
+      return a;}
+
+#define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast)                             \
+    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(&, _Tpvec, __lsx_vand_v, cast)                  \
+    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(|, _Tpvec, __lsx_vor_v, cast)                   \
+    OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(^, _Tpvec, __lsx_vxor_v, cast)                  \
+    inline _Tpvec operator ~ (const _Tpvec& a)                                       \
+    { return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); }                           \
+
+OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps)
+OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float64x2, _lsx_128_castsi128_pd)
+
+/** Select **/
+#define OPENCV_HAL_IMPL_LSX_SELECT(_Tpvec)                                           \
+    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b)     \
+    { return _Tpvec(__lsx_vbitsel_v(b.val, a.val, mask.val)); }                      \
+
+OPENCV_HAL_IMPL_LSX_SELECT(v_uint8x16)
+OPENCV_HAL_IMPL_LSX_SELECT(v_int8x16)
+OPENCV_HAL_IMPL_LSX_SELECT(v_uint16x8)
+OPENCV_HAL_IMPL_LSX_SELECT(v_int16x8)
+OPENCV_HAL_IMPL_LSX_SELECT(v_uint32x4)
+OPENCV_HAL_IMPL_LSX_SELECT(v_int32x4)
+
+inline v_float32x4 v_select(const v_float32x4 &mask, const v_float32x4 &a, const v_float32x4 &b)
+{ return v_float32x4(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); }
+inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const v_float64x2 &b)
+{ return v_float64x2(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); }
+
+/** Comparison **/
+#define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec)                            \
+    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)         \
+    { return ~( a == b ); }                                              \
+    inline _Tpvec operator <  (const _Tpvec& a, const _Tpvec& b)         \
+    { return b > a ; }                                                   \
+    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)         \
+    { return ~(a < b); }                                                 \
+    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)         \
+    { return b >= a; }                                                   \
+
+#define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix)    \
+    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b)          \
+    { return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); }                   \
+    inline _Tpuvec operator >  (const _Tpuvec& a, const _Tpuvec& b)          \
+    { return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); }                  \
+    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b)          \
+    { return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); }                   \
+    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b)           \
+    { return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); }                   \
+    OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec)                                   \
+    OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec)
+
+OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint8x16,  v_int8x16,  b, bu)
+OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8,  v_int16x8,  h, hu)
+OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4,  v_int32x4,  w, wu)
+
+#define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix)          \
+    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); }         \
+    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)  \
+    { return ~(a == b); }
+
+OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d)
+OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d)
+
+#define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix)       \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)       \
+    { return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); }           \
+
+#define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix)                    \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(==, vfcmp_ceq, _Tpvec, ssuffix)            \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(!=, vfcmp_cne, _Tpvec, ssuffix)            \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(<,  vfcmp_clt, _Tpvec, ssuffix)            \
+    OPENCV_HAL_IMPL_LSX_CMP_FLT(<=, vfcmp_cle, _Tpvec, ssuffix)            \
+
+OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s)
+OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d)
+
+inline v_float32x4 operator > (const v_float32x4 &a, const v_float32x4 &b)
+{ return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); }
+
+inline v_float32x4 operator >= (const v_float32x4 &a, const v_float32x4 &b)
+{ return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); }
+
+inline v_float64x2 operator > (const v_float64x2 &a, const v_float64x2 &b)
+{ return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); }
+
+inline v_float64x2 operator >= (const v_float64x2 &a, const v_float64x2 &b)
+{ return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); }
+
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(__lsx_vfcmp_cor_s(a.val, a.val)); }
+
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(__lsx_vfcmp_cor_d(a.val, a.val)); }
+
+/** min/max **/
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint8x16,  __lsx_vmin_bu)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint8x16,  __lsx_vmax_bu)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int8x16,   __lsx_vmin_b)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int8x16,   __lsx_vmax_b)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint16x8,  __lsx_vmin_hu)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint16x8,  __lsx_vmax_hu)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int16x8,   __lsx_vmin_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int16x8,   __lsx_vmax_h)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint32x4,  __lsx_vmin_wu)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint32x4,  __lsx_vmax_wu)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int32x4,   __lsx_vmin_w)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int32x4,   __lsx_vmax_w)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float32x4, __lsx_vfmin_s)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float32x4, __lsx_vfmax_s)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float64x2, __lsx_vfmin_d)
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float64x2, __lsx_vfmax_d)
+
+template <int imm,
+    bool is_invalid = ((imm < 0) || (imm > 16)),
+    bool is_first = (imm == 0),
+    bool is_half = (imm == 8),
+    bool is_second = (imm == 16),
+    bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
+class v_lsx_palignr_u8_class;
+
+template <int imm>
+class v_lsx_palignr_u8_class<imm, true, false, false, false, false>;
+
+template <int imm>
+class v_lsx_palignr_u8_class<imm, false, true, false, false, false>
+{
+public:
+    inline __m128i operator()(const __m128i& a, const __m128i& b) const
+    {
+        CV_UNUSED(b);
+        return a;
+    }
+};
+
+template <int imm>
+class v_lsx_palignr_u8_class<imm, false, false, true, false, false>
+{
+public:
+    inline __m128i operator()(const __m128i& a, const __m128i& b) const
+    {
+        return __lsx_vshuf4i_d(a, b, 0x9);
+    }
+};
+
+template <int imm>
+class v_lsx_palignr_u8_class<imm, false, false, false, true, false>
+{
+public:
+    inline __m128i operator()(const __m128i& a, const __m128i& b) const
+    {
+        CV_UNUSED(a);
+        return b;
+    }
+};
+
+template <int imm>
+class v_lsx_palignr_u8_class<imm, false, false, false, false, true>
+{
+public:
+    inline __m128i operator()(const __m128i& a, const __m128i& b) const
+    {
+        enum { imm2 = (sizeof(__m128i) - imm) };
+        return __lsx_vor_v(__lsx_vbsrl_v(a, imm), __lsx_vbsll_v(b, imm2));
+    }
+};
+
+template <int imm>
+inline __m128i v_lsx_palignr_u8(const __m128i& a, const __m128i& b)
+{
+    CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_lsx_palignr_u8");
+    return v_lsx_palignr_u8_class<imm>()(a, b);
+}
+/** Rotate **/
+#define OPENCV_HAL_IMPL_LSX_ROTATE_CAST(_Tpvec, cast)                                   \
+    template<int imm>                                                                   \
+    inline _Tpvec v_rotate_right(const _Tpvec &a)                                       \
+    {                                                                                   \
+        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))};                      \
+        __m128i ret = __lsx_vbsrl_v((__m128i)a.val, imm2);                              \
+        return _Tpvec(cast(ret));                                                       \
+    }                                                                                   \
+    template<int imm>                                                                   \
+    inline _Tpvec v_rotate_left(const _Tpvec &a)                                        \
+    {                                                                                   \
+        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))};                      \
+        __m128i ret = __lsx_vbsll_v((__m128i)a.val, imm2);                              \
+        return _Tpvec(cast(ret));                                                       \
+    }                                                                                   \
+    template<int imm>                                                                   \
+    inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)                      \
+    {                                                                                   \
+        enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))};                      \
+        return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)a.val, (__m128i)b.val)));    \
+    }                                                                                   \
+    template<int imm>                                                                   \
+    inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)                       \
+    {                                                                                   \
+        enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type))};   \
+        return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)b.val, (__m128i)a.val)));    \
+    }
+
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint8x16, OPENCV_HAL_NOP)                             \
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int8x16,  OPENCV_HAL_NOP)                             \
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint16x8, OPENCV_HAL_NOP)                             \
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int16x8,  OPENCV_HAL_NOP)                             \
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint32x4, OPENCV_HAL_NOP)                             \
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int32x4,  OPENCV_HAL_NOP)                             \
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint64x2, OPENCV_HAL_NOP)                             \
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int64x2,  OPENCV_HAL_NOP)                             \
+
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float32x4, _lsx_128_castsi128_ps)
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float64x2, _lsx_128_castsi128_pd)
+
+/** Rverse **/
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{
+    __m128i vec = __lsx_vshuf4i_b(a.val, 0x1B);
+    return v_uint8x16(__lsx_vshuf4i_w(vec, 0x1B));
+}
+
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{
+    __m128i vec = __lsx_vshuf4i_h(a.val, 0x1B);
+    return v_uint16x8(__lsx_vshuf4i_w(vec, 0x4E));
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{ return v_uint32x4(__lsx_vshuf4i_w(a.val, 0x1B)); }
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{ return v_int32x4(__lsx_vshuf4i_w(a.val, 0x1B)); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{ return v_uint64x2(__lsx_vshuf4i_w(a.val, 0x4E)); }
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{ return v_int64x2(__lsx_vshuf4i_w(a.val, 0x4E)); }
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+
+////////////// Reduce and mask ////////////
+
+/** Reduce **/
+// this function is return a[0]+a[1]+...+a[31]
+inline unsigned v_reduce_sum(const v_uint8x16& a)
+{
+    __m128i t1 = __lsx_vhaddw_hu_bu(a.val, a.val);
+    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
+    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
+    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
+    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
+}
+
+inline int v_reduce_sum(const v_int8x16 &a)
+{
+    __m128i t1 = __lsx_vhaddw_h_b(a.val, a.val);
+    __m128i t2 = __lsx_vhaddw_w_h(t1, t1);
+    __m128i t3 = __lsx_vhaddw_d_w(t2, t2);
+    __m128i t4 = __lsx_vhaddw_q_d(t3, t3);
+    return (int)__lsx_vpickve2gr_w(t4, 0);
+}
+
+#define OPENCV_HAL_IMPL_LSX_REDUCE_16(_Tpvec, sctype, func, intrin)            \
+    inline sctype v_reduce_##func(const _Tpvec& a)                             \
+    {                                                                          \
+        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8));                  \
+        val = intrin(val, __lsx_vbsrl_v(val, 4));                              \
+        val = intrin(val, __lsx_vbsrl_v(val, 2));                              \
+        val = intrin(val, __lsx_vbsrl_v(val, 1));                              \
+        return (sctype)__lsx_vpickve2gr_b(val, 0);                             \
+    }
+
+OPENCV_HAL_IMPL_LSX_REDUCE_16(v_uint8x16, uchar, min, __lsx_vmin_bu)
+OPENCV_HAL_IMPL_LSX_REDUCE_16(v_uint8x16, uchar, max, __lsx_vmax_bu)
+OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16,  schar, min, __lsx_vmin_b)
+OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16,  schar, max, __lsx_vmax_b)
+
+#define OPENCV_HAL_IMPL_LSX_REDUCE_8(_Tpvec, sctype, func, intrin)             \
+    inline sctype v_reduce_##func(const _Tpvec &a)                             \
+    {                                                                          \
+        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8));                  \
+        val = intrin(val, __lsx_vbsrl_v(val, 4));                              \
+        val = intrin(val, __lsx_vbsrl_v(val, 2));                              \
+        return (sctype)__lsx_vpickve2gr_h(val, 0);                             \
+    }
+
+OPENCV_HAL_IMPL_LSX_REDUCE_8(v_uint16x8, ushort, min, __lsx_vmin_hu)
+OPENCV_HAL_IMPL_LSX_REDUCE_8(v_uint16x8, ushort, max, __lsx_vmax_hu)
+OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8,  short,  min, __lsx_vmin_h)
+OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8,  short,  max, __lsx_vmax_h)
+
+#define OPENCV_HAL_IMPL_LSX_REDUCE_4(_Tpvec, sctype, func, intrin)             \
+    inline sctype v_reduce_##func(const _Tpvec &a)                             \
+    {                                                                          \
+        __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8));                  \
+        val = intrin(val, __lsx_vbsrl_v(val, 4));                              \
+        return (sctype)__lsx_vpickve2gr_w(val, 0);                             \
+    }
+
+OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, min, __lsx_vmin_wu)
+OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, max, __lsx_vmax_wu)
+OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4,  int,      min, __lsx_vmin_w)
+OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4,  int,      max, __lsx_vmax_w)
+
+#define OPENCV_HAL_IMPL_LSX_REDUCE_FLT(func, intrin)                           \
+    inline float v_reduce_##func(const v_float32x4 &a)                         \
+    {                                                                          \
+        __m128 val   = a.val;                                                  \
+        val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 8));             \
+        val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 4));             \
+        float *fval = (float*)&val;                                            \
+        return fval[0];                                                        \
+    }
+
+OPENCV_HAL_IMPL_LSX_REDUCE_FLT(min, __lsx_vfmin_s)
+OPENCV_HAL_IMPL_LSX_REDUCE_FLT(max, __lsx_vfmax_s)
+
+inline int v_reduce_sum(const v_int32x4 &a)
+{
+    __m128i t1 = __lsx_vhaddw_d_w(a.val, a.val);
+    __m128i t2 = __lsx_vhaddw_q_d(t1, t1);
+    return (int)__lsx_vpickve2gr_w(t2, 0);
+}
+
+inline unsigned v_reduce_sum(const v_uint32x4 &a)
+{
+    __m128i t1 = __lsx_vhaddw_du_wu(a.val, a.val);
+    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
+    return (int)__lsx_vpickve2gr_w(t2, 0);
+}
+
+inline int v_reduce_sum(const v_int16x8 &a)
+{
+    __m128i t1 = __lsx_vhaddw_w_h(a.val, a.val);
+    __m128i t2 = __lsx_vhaddw_d_w(t1, t1);
+    __m128i t3 = __lsx_vhaddw_q_d(t2, t2);
+    return (int)__lsx_vpickve2gr_w(t3, 0);
+}
+
+inline unsigned v_reduce_sum(const v_uint16x8 &a)
+{
+    __m128i t1 = __lsx_vhaddw_wu_hu(a.val, a.val);
+    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
+    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
+    return (int)__lsx_vpickve2gr_w(t3, 0);
+}
+
+inline float v_reduce_sum(const v_float32x4 &a)
+{
+    __m128i val = (__m128i)a.val;
+    val = __lsx_vbsrl_v(val, 8);
+    __m128 result = __lsx_vfadd_s(a.val, (__m128)val);
+    float *pa = (float*)&result;
+    return (float)(pa[0] + pa[1]);
+}
+
+inline uint64 v_reduce_sum(const v_uint64x2 &a)
+{
+    __m128i t0 = __lsx_vhaddw_qu_du(a.val, a.val);
+    return (uint64)__lsx_vpickve2gr_du(t0, 0);
+}
+
+inline int64 v_reduce_sum(const v_int64x2 &a)
+{
+    __m128i t0 = __lsx_vhaddw_q_d(a.val, a.val);
+    return (int64)__lsx_vpickve2gr_d(t0, 0);
+}
+
+inline double v_reduce_sum(const v_float64x2 &a)
+{
+    double *pa = (double*)&a;
+    return pa[0] + pa[1];
+}
+
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    __m128i a0 = (__m128i)a.val;
+    __m128i b0 = (__m128i)b.val;
+    __m128i c0 = (__m128i)c.val;
+    __m128i d0 = (__m128i)d.val;
+    __m128i ac_l = __lsx_vilvl_w(c0, a0);
+    __m128i ac_h = __lsx_vilvh_w(c0, a0);
+    __m128i bd_l = __lsx_vilvl_w(d0, b0);
+    __m128i bd_h = __lsx_vilvh_w(d0, b0);
+    __m128  ac   = __lsx_vfadd_s((__m128)ac_l, (__m128)ac_h);
+    __m128  bd   = __lsx_vfadd_s((__m128)bd_l, (__m128)bd_h);
+    return v_float32x4(__lsx_vfadd_s((__m128)__lsx_vilvl_w((__m128i)bd, (__m128i)ac),
+                       (__m128)__lsx_vilvh_w((__m128i)bd, (__m128i)ac)));
+}
+
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    __m128i t0 = __lsx_vabsd_b(a.val, b.val);
+    __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0);
+    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
+    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
+    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
+    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
+}
+
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    __m128i t0 = __lsx_vabsd_bu(a.val, b.val);
+    __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0);
+    __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1);
+    __m128i t3 = __lsx_vhaddw_du_wu(t2, t2);
+    __m128i t4 = __lsx_vhaddw_qu_du(t3, t3);
+    return (unsigned)__lsx_vpickve2gr_w(t4, 0);
+}
+
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    __m128i t0 = __lsx_vabsd_hu(a.val, b.val);
+    __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0);
+    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
+    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
+    return (unsigned)__lsx_vpickve2gr_w(t3, 0);
+}
+
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    __m128i t0 = __lsx_vabsd_h(a.val, b.val);
+    __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0);
+    __m128i t2 = __lsx_vhaddw_du_wu(t1, t1);
+    __m128i t3 = __lsx_vhaddw_qu_du(t2, t2);
+    return (unsigned)__lsx_vpickve2gr_w(t3, 0);
+}
+
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+    __m128i t0 = __lsx_vabsd_wu(a.val, b.val);
+    __m128i t1 = __lsx_vhaddw_du_wu(t0, t0);
+    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
+    return (unsigned)__lsx_vpickve2gr_w(t2, 0);
+}
+
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+    __m128i t0 = __lsx_vabsd_w(a.val, b.val);
+    __m128i t1 = __lsx_vhaddw_du_wu(t0, t0);
+    __m128i t2 = __lsx_vhaddw_qu_du(t1, t1);
+    return (unsigned)__lsx_vpickve2gr_w(t2, 0);
+}
+
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    v_float32x4 a_b = a - b;
+    return v_reduce_sum(v_float32x4((__m128i)a_b.val & __lsx_vreplgr2vr_w(0x7fffffff)));
+}
+
+/** Popcount **/
+#define OPENCV_HAL_IMPL_LSX_POPCOUNT(_Tpvec, _Tp, suffix)                  \
+inline _Tpvec v_popcount(const _Tp& a)                                     \
+{ return _Tpvec(__lsx_vpcnt_##suffix(a.val)); }
+
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint8x16,  v_uint8x16,  b);
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint8x16,  v_int8x16,   b);
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint16x8,  v_uint16x8,  h);
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint16x8,  v_int16x8,   h);
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint32x4,  v_uint32x4,  w);
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint32x4,  v_int32x4,   w);
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint64x2,  v_uint64x2,  d);
+OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint64x2,  v_int64x2,   d);
+
+/** Mask **/
+#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt)              \
+inline tt reinterpret_int(ft x) { union {ft l; tt i;} v; v.l = x; return v.i; }
+OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
+OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
+OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
+OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
+OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
+OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
+OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
+
+inline int v_signmask(const v_int8x16& a)
+{
+    __m128i result = __lsx_vmskltz_b(a.val);
+    return __lsx_vpickve2gr_w(result, 0);
+}
+inline int v_signmask(const v_uint8x16& a)
+{ return v_signmask(v_reinterpret_as_s8(a)) ;}
+
+inline int v_signmask(const v_int16x8 &a)
+{
+    __m128i result = __lsx_vmskltz_h(a.val);
+    return __lsx_vpickve2gr_w(result, 0);
+}
+inline int v_signmask(const v_uint16x8 &a)
+{ return v_signmask(v_reinterpret_as_s16(a)); }
+
+inline int v_signmask(const v_uint32x4& a)
+{
+    __m128i result = __lsx_vmskltz_w(a.val);
+    return __lsx_vpickve2gr_w(result, 0);
+}
+inline int v_signmask(const v_int32x4& a)
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+
+inline int v_signmask(const v_uint64x2& a)
+{
+    __m128i result = __lsx_vmskltz_d(a.val);
+    return __lsx_vpickve2gr_w(result, 0);
+}
+inline int v_signmask(const v_int64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+
+inline int v_signmask(const v_float32x4& a)
+{ return v_signmask(*(v_int32x4*)(&a)); }
+
+inline int v_signmask(const v_float64x2& a)
+{ return v_signmask(*(v_int64x2*)(&a)); }
+
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+
+/** Checks **/
+#define OPENCV_HAL_IMPL_LSX_CHECK(_Tpvec, allmask) \
+    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
+    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
+OPENCV_HAL_IMPL_LSX_CHECK(v_uint8x16, 65535)
+OPENCV_HAL_IMPL_LSX_CHECK(v_int8x16, 65535)
+OPENCV_HAL_IMPL_LSX_CHECK(v_uint16x8, 255);
+OPENCV_HAL_IMPL_LSX_CHECK(v_int16x8, 255);
+OPENCV_HAL_IMPL_LSX_CHECK(v_uint32x4, 15)
+OPENCV_HAL_IMPL_LSX_CHECK(v_int32x4, 15)
+OPENCV_HAL_IMPL_LSX_CHECK(v_uint64x2, 3)
+OPENCV_HAL_IMPL_LSX_CHECK(v_int64x2, 3)
+OPENCV_HAL_IMPL_LSX_CHECK(v_float32x4, 15)
+OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3)
+
+///////////// Other math /////////////
+
+/** Some frequent operations **/
+#define OPENCV_HAL_IMPL_LSX_MULADD(_Tpvec, suffix)                              \
+    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)      \
+    { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); }              \
+    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec &b, const _Tpvec& c)   \
+    { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); }              \
+    inline _Tpvec v_sqrt(const _Tpvec& x)                                       \
+    { return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); }                            \
+    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)             \
+    { return v_fma(a, a, b * b); }                                              \
+    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)                 \
+    { return v_sqrt(v_fma(a, a, b * b)); }
+
+OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s)
+OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d)
+
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{ return v_int32x4(__lsx_vmadd_w(c.val, a.val, b.val)); }
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{ return v_fma(a, b, c); }
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    return v_float32x4(__lsx_vfrsqrt_s(x.val));
+}
+
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{
+    return v_float64x2(__lsx_vfrsqrt_d(x.val));
+}
+
+/** Absolute values **/
+#define OPENCV_HAL_IMPL_LSX_ABS(_Tpvec, suffix)                          \
+    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x)                        \
+    { return v_u##_Tpvec(__lsx_vabsd_##suffix(x.val, __lsx_vldi(0))); }
+
+OPENCV_HAL_IMPL_LSX_ABS(int8x16, b)
+OPENCV_HAL_IMPL_LSX_ABS(int16x8, h)
+OPENCV_HAL_IMPL_LSX_ABS(int32x4, w)
+
+inline v_float32x4 v_abs(const v_float32x4& x)
+{ return v_float32x4(*((__m128i*)&x) & __lsx_vreplgr2vr_w(0x7fffffff)); }
+inline v_float64x2 v_abs(const v_float64x2& x)
+{ return v_float64x2(*((__m128i*)&x) & __lsx_vreplgr2vr_d(0x7fffffffffffffff)); }
+
+/** Absolute difference **/
+
+inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
+{ return (v_uint8x16)__lsx_vabsd_bu(a.val, b.val); }
+inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
+{ return (v_uint16x8)__lsx_vabsd_hu(a.val, b.val); }
+inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
+{ return (v_uint32x4)__lsx_vabsd_wu(a.val, b.val); }
+
+inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
+{ return (v_uint8x16)__lsx_vabsd_b(a.val, b.val); }
+inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
+{ return (v_uint16x8)__lsx_vabsd_h(a.val, b.val); }
+inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
+{ return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); }
+
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+{ return v_abs(a - b); }
+
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{ return v_abs(a - b); }
+
+/** Saturating absolute difference **/
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
+{
+    v_int8x16 d = a - b;
+    v_int8x16 m = a < b;
+    return (d ^ m) - m;
+}
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{ return v_max(a, b) - v_min(a, b); }
+
+///////// Conversions /////////
+
+/** Rounding **/
+inline v_int32x4 v_round(const v_float32x4& a)
+{ return v_int32x4(__lsx_vftint_w_s(a.val)); }
+
+inline v_int32x4 v_round(const v_float64x2& a)
+{ return v_int32x4(__lsx_vftint_w_d(a.val, a.val)); }
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{ return v_int32x4(__lsx_vftint_w_d(b.val, a.val)); }
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{ return v_int32x4(__lsx_vftintrz_w_s(a.val)); }
+
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{ return v_int32x4(__lsx_vftintrz_w_d(a.val, a.val)); }
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrm_s(a.val)))); }
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{ return v_trunc(v_float64x2(__lsx_vfrintrm_d(a.val))); }
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrp_s(a.val)))); }
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{ return v_trunc(v_float64x2(__lsx_vfrintrp_d(a.val))); }
+
+/** To float **/
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{ return v_float32x4(__lsx_vffint_s_w(a.val)); }
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{ return v_float32x4(__lsx_vfcvt_s_d(a.val, a.val)); }
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{ return v_float32x4(__lsx_vfcvt_s_d(b.val, a.val)); }
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{ return v_float64x2(__lsx_vffintl_d_w(a.val)); }
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{ return v_float64x2(__lsx_vffinth_d_w(a.val)); }
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{ return v_float64x2(__lsx_vfcvtl_d_s(a.val)); }
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{ return v_float64x2(__lsx_vfcvth_d_s(a.val)); }
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& v)
+{ return v_float64x2(__lsx_vffint_d_l(v.val)); }
+
+
+//////////////// Lookup table access ////////////////
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{
+    return v_int8x16(_v128_setr_b(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
+                     tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]], tab[idx[8]],
+                     tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]],
+                     tab[idx[14]], tab[idx[15]]));
+}
+
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{
+    return v_int8x16(_v128_setr_h(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]),
+           *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]), *(const short*)(tab + idx[4]),
+           *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
+}
+
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{
+    return v_int8x16(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
+                *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
+}
+
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx)
+{ return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx)
+{ return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx)
+{ return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
+
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{
+    return v_int16x8(_v128_setr_h(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
+                     tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{
+    return v_int16x8(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
+                *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{
+    return v_int16x8(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1])));
+}
+
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx)
+{ return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx)
+{ return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx)
+{ return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
+
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{
+    return v_int32x4(_v128_setr_w(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+}
+inline v_int32x4 v_lut_pairs(const int *tab, const int* idx)
+{
+    return v_int32x4(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1])));
+}
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
+{
+    return v_int32x4(__lsx_vld(tab + idx[0], 0));
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
+
+inline v_int64x2 v_lut(const int64_t* tab, const int *idx)
+{
+    return v_int64x2(_v128_setr_d(tab[idx[0]], tab[idx[1]]));
+}
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
+{
+    return v_int64x2(__lsx_vld(tab + idx[0], 0));
+}
+
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
+
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{
+    return v_float32x4(_v128_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
+{
+    return v_float32x4((__m128)_v128_setr_pd(*(const double*)(tab + idx[0]), *(const double*)(tab + idx[1])));
+}
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
+{
+    return v_float32x4((__m128)__lsx_vld(tab + idx[0], 0));
+}
+
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{
+    return v_float64x2(_v128_setr_pd(tab[idx[0]], tab[idx[1]]));
+}
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
+{
+    return v_float64x2((__m128d)__lsx_vld(tab + idx[0], 0));
+}
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    int *idx = (int*)&idxvec.val;
+    return v_lut(tab, idx);
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    const int *idx = (const int*)&idxvec.val;
+    return v_lut(tab, idx);
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    const int *idx = (const int*)&idxvec.val;
+    return v_lut(tab, idx);
+}
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    const int *idx = (const int*)&idxvec.val;
+    __m128i xy0  = __lsx_vld(tab + idx[0], 0);
+    __m128i xy1  = __lsx_vld(tab + idx[1], 0);
+    __m128i xy2  = __lsx_vld(tab + idx[2], 0);
+    __m128i xy3  = __lsx_vld(tab + idx[3], 0);
+    __m128i xy01 = __lsx_vilvl_d(xy1, xy0);
+    __m128i xy23 = __lsx_vilvl_d(xy3, xy2);
+    __m128i xxyy02 = __lsx_vilvl_w(xy23, xy01);
+    __m128i xxyy13 = __lsx_vilvh_w(xy23, xy01);
+    x = v_float32x4((__m128)__lsx_vilvl_w(xxyy13, xxyy02));
+    y = v_float32x4((__m128)__lsx_vilvh_w(xxyy13, xxyy02));
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    const int* idx = (const int*)&idxvec.val;
+    __m128i xy0 = __lsx_vld(tab + idx[0], 0);
+    __m128i xy1 = __lsx_vld(tab + idx[1], 0);
+    x = v_float64x2((__m128d)__lsx_vilvl_d(xy1, xy0));
+    y = v_float64x2((__m128d)__lsx_vilvh_d(xy1, xy0));
+}
+
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{
+    return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
+                _v128_setr_d(0x0705060403010200, 0x0f0d0e0c0b090a08)));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
+{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{
+    return v_int8x16(__lsx_vshuf_b(vec.val, vec.val,
+                _v128_setr_d(0x0703060205010400, 0x0f0b0e0a0d090c08)));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
+{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{
+    return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
+                _v128_setr_d(0x0706030205040100, 0x0f0e0b0a0d0c0908)));
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec)
+{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{
+    return v_int16x8(__lsx_vshuf_b(vec.val, vec.val,
+                _v128_setr_d(0x0b0a030209080100, 0x0f0e07060d0c0504)));
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec)
+{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{
+    return v_int32x4(__lsx_vshuf4i_w(vec.val, 0xd8));
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec)
+{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
+{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+    __m128i zero = __lsx_vldi(0);
+    return v_int8x16(__lsx_vshuf_b(zero, vec.val,
+           _v128_set_d(0x1211100f0e0d0c0a, 0x0908060504020100)));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
+{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+    __m128i zero = __lsx_vldi(0);
+    return v_int16x8(__lsx_vshuf_b(zero, vec.val,
+           _v128_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100)));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
+{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
+//////////// Matrix operations /////////
+
+/////////// Dot Product /////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    __m128i x = a.val, y = b.val;
+    return v_int32x4(__lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y));
+}
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    __m128i x = a.val, y = b.val, z = c.val;
+    __m128i t = __lsx_vmaddwev_w_h(z, x, y);
+    return v_int32x4(__lsx_vmaddwod_w_h(t, x, y));
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    __m128i x = a.val, y = b.val;
+    return v_int64x2(__lsx_vmaddwod_d_w(__lsx_vmulwev_d_w(x, y), x, y));
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    __m128i x = a.val, y = b.val, z = c.val;
+    __m128i t = __lsx_vmaddwev_d_w(z, x, y);
+    return v_int64x2(__lsx_vmaddwod_d_w(t, x, y));
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+    __m128i x = a.val, y = b.val;
+    __m128i even  = __lsx_vmulwev_h_bu(x, y);
+    __m128i odd   = __lsx_vmulwod_h_bu(x, y);
+    __m128i prod0 = __lsx_vhaddw_wu_hu(even, even);
+    __m128i prod1 = __lsx_vhaddw_wu_hu(odd, odd);
+    return v_uint32x4(__lsx_vadd_w(prod0, prod1));
+}
+
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ return v_dotprod_expand(a, b) + c ;}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    __m128i x = a.val, y = b.val;
+    __m128i even  = __lsx_vmulwev_h_b(x, y);
+    __m128i odd   = __lsx_vmulwod_h_b(x, y);
+    __m128i prod0 = __lsx_vhaddw_w_h(even, even);
+    __m128i prod1 = __lsx_vhaddw_w_h(odd, odd);
+    return v_int32x4(__lsx_vadd_w(prod0, prod1));
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    __m128i x = a.val, y = b.val;
+    __m128i even  = __lsx_vmulwev_w_hu(x, y);
+    __m128i odd   = __lsx_vmulwod_w_hu(x, y);
+    __m128i prod0 = __lsx_vhaddw_du_wu(even, even);
+    __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd);
+    return v_uint64x2(__lsx_vadd_d(prod0, prod1));
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    __m128i x = a.val, y = b.val;
+    __m128i even  = __lsx_vmulwev_w_h(x, y);
+    __m128i odd   = __lsx_vmulwod_w_h(x, y);
+    __m128i prod0 = __lsx_vhaddw_d_w(even, even);
+    __m128i prod1 = __lsx_vhaddw_d_w(odd, odd);
+    return v_int64x2(__lsx_vadd_d(prod0, prod1));
+}
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+//32 >> 64f
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod(a, b)); }
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+
+///////// Fast Dot Product //////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod(a, b); }
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_dotprod(a, b, c); }
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod(a, b); }
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ return v_dotprod(a, b, c); }
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ return v_dotprod_expand(a, b, c); }
+
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{
+    __m128i x = a.val, y = b.val;
+    __m128i even  = __lsx_vmulwev_w_hu(x, y);
+    __m128i odd   = __lsx_vmulwod_w_hu(x, y);
+    __m128i prod0 = __lsx_vhaddw_du_wu(even, even);
+    __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd);
+    return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1)));
+}
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ return v_dotprod_expand_fast(a, b) + c; }
+
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    __m128i x = a.val, y = b.val;
+    __m128i prod = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y);
+    __m128i sign = __lsx_vsrai_w(prod, 31);
+    __m128i lo   = __lsx_vilvl_w(sign, prod);
+    __m128i hi   = __lsx_vilvh_w(sign, prod);
+    return v_int64x2(__lsx_vadd_d(lo, hi));
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ return v_dotprod_expand_fast(a, b) + c; }
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod_expand(a, b); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3)
+{
+    __m128i x = (__m128i)v.val;
+    __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val);
+    __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val);
+    __m128 v2 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val);
+    __m128 v3 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xFF), m3.val);
+
+    return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), __lsx_vfadd_s(v2, v3)));
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const  v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a)
+{
+    __m128i x = (__m128i)v.val;
+    __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val);
+    __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val);
+    __m128 v2 = __lsx_vfmadd_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val, a.val);
+
+    return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), v2));
+}
+
+#define OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(_Tpvec, cast_from, cast_to)                          \
+    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,                            \
+                               const _Tpvec& a2, const _Tpvec& a3,                            \
+                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)                \
+   {                                                                                          \
+       __m128i t0 = cast_from(__lsx_vilvl_w(a1.val, a0.val));                                 \
+       __m128i t1 = cast_from(__lsx_vilvl_w(a3.val, a2.val));                                 \
+       __m128i t2 = cast_from(__lsx_vilvh_w(a1.val, a0.val));                                 \
+       __m128i t3 = cast_from(__lsx_vilvh_w(a3.val, a2.val));                                 \
+       b0.val = cast_to(__lsx_vilvl_d(t1, t0));                                               \
+       b1.val = cast_to(__lsx_vilvh_d(t1, t0));                                               \
+       b2.val = cast_to(__lsx_vilvl_d(t3, t2));                                               \
+       b3.val = cast_to(__lsx_vilvh_d(t3, t2));                                               \
+   }
+
+OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_uint32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_int32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+
+inline void v_transpose4x4(const v_float32x4& a0, const v_float32x4& a1,
+                           const v_float32x4& a2, const v_float32x4& a3,
+                           v_float32x4& b0, v_float32x4& b1, v_float32x4& b2, v_float32x4& b3)
+{
+    __m128i vec0 = (__m128i)a0.val, vec1 = (__m128i)a1.val;
+    __m128i vec2 = (__m128i)a2.val, vec3 = (__m128i)a3.val;
+    __m128i t0 = __lsx_vilvl_w(vec1, vec0);
+    __m128i t1 = __lsx_vilvl_w(vec3, vec2);
+    __m128i t2 = __lsx_vilvh_w(vec1, vec0);
+    __m128i t3 = __lsx_vilvh_w(vec3, vec2);
+    b0.val = __m128(__lsx_vilvl_d(t1, t0));
+    b1.val = __m128(__lsx_vilvh_d(t1, t0));
+    b2.val = __m128(__lsx_vilvl_d(t3, t2));
+    b3.val = __m128(__lsx_vilvh_d(t3, t2));
+}
+
+////////////////// Value reordering ////////////////
+
+/* Expand */
+#define OPENCV_HAL_IMPL_LSX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin_lo, intrin_hi)     \
+    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)                \
+    {                                                                              \
+        b0.val = intrin_lo(a.val, 0);                                              \
+        b1.val = intrin_hi(a.val);                                                 \
+    }                                                                              \
+    inline _Tpwvec v_expand_low(const _Tpvec& a)                                   \
+    { return _Tpwvec(intrin_lo(a.val, 0)); }                                       \
+    inline _Tpwvec v_expand_high(const _Tpvec& a)                                  \
+    { return _Tpwvec(intrin_hi(a.val)); }                                          \
+    inline _Tpwvec v_load_expand(const _Tp* ptr)                                   \
+    {                                                                              \
+        __m128i a = __lsx_vld(ptr, 0);                                             \
+        return _Tpwvec(intrin_lo(a, 0));                                           \
+    }
+
+OPENCV_HAL_IMPL_LSX_EXPAND(v_uint8x16, v_uint16x8, uchar,     __lsx_vsllwil_hu_bu, __lsx_vexth_hu_bu)
+OPENCV_HAL_IMPL_LSX_EXPAND(v_int8x16,  v_int16x8,  schar,     __lsx_vsllwil_h_b,   __lsx_vexth_h_b)
+OPENCV_HAL_IMPL_LSX_EXPAND(v_uint16x8, v_uint32x4, ushort,    __lsx_vsllwil_wu_hu, __lsx_vexth_wu_hu)
+OPENCV_HAL_IMPL_LSX_EXPAND(v_int16x8,  v_int32x4,  short,     __lsx_vsllwil_w_h,   __lsx_vexth_w_h)
+OPENCV_HAL_IMPL_LSX_EXPAND(v_uint32x4, v_uint64x2, unsigned,  __lsx_vsllwil_du_wu, __lsx_vexth_du_wu)
+OPENCV_HAL_IMPL_LSX_EXPAND(v_int32x4,  v_int64x2,  int,       __lsx_vsllwil_d_w,   __lsx_vexth_d_w)
+
+#define OPENCV_HAL_IMPL_LSX_EXPAND_Q(_Tpvec, _Tp, intrin_lo, intrin_hi)          \
+    inline _Tpvec v_load_expand_q(const _Tp* ptr)                                \
+    {                                                                            \
+        __m128i a = __lsx_vld(ptr, 0);                                           \
+        __m128i b = intrin_lo(a, 0);                                             \
+        return _Tpvec(intrin_hi(b, 0));                                          \
+    }
+
+OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_uint32x4, uchar, __lsx_vsllwil_hu_bu, __lsx_vsllwil_wu_hu)
+OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_int32x4,  schar, __lsx_vsllwil_h_b,   __lsx_vsllwil_w_h)
+
+/* pack */
+// 16
+inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
+{ return v_int8x16(_lsx_packs_h(a.val, b.val)); }
+
+inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, 0)); }
+
+inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
+{ return v_uint8x16(_lsx_packus_h(a.val, b.val)); }
+
+inline void v_pack_store(schar* ptr, const v_int16x8& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+template<int n> inline
+v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
+{ __lsx_vstelm_d(__lsx_vssrlrni_bu_h(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
+{ return v_uint8x16(__lsx_vssrarni_bu_h(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
+{ __lsx_vstelm_d(__lsx_vssrarni_bu_h(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
+{ return v_int8x16(__lsx_vssrarni_b_h(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
+{ __lsx_vstelm_d(__lsx_vssrarni_b_h(a.val, a.val, n), ptr, 0, 0); }
+
+//32
+inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
+{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, 0)); }
+
+inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
+{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, 0)); }
+
+inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
+{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, 0)); }
+
+inline void v_pack_store(short* ptr, const v_int32x4& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(ushort *ptr, const v_uint32x4& a)
+{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, 0), ptr,  0, 0); }
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
+{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, 0), ptr, 0, 0); }
+
+template<int n> inline
+v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
+{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
+{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
+{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
+{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
+{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x4& a)
+{ __lsx_vstelm_d(__lsx_vssrarni_h_w(a.val, a.val, n), ptr, 0, 0); }
+
+// 64
+// Non-saturaing pack
+inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
+{ return v_uint32x4(__lsx_vpickev_w(b.val, a.val)); }
+
+inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
+{ __lsx_vstelm_d(__lsx_vshuf4i_w(a.val, 0x08), ptr, 0, 0); }
+
+inline void v_pack_store(int *ptr, const v_int64x2& a)
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(a)); }
+
+template<int n> inline
+v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
+{ return v_uint32x4(__lsx_vsrlrni_w_d(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
+{ __lsx_vstelm_d(__lsx_vsrlrni_w_d(a.val, a.val, n), ptr, 0, 0); }
+
+template<int n> inline
+v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
+{ return v_int32x4(__lsx_vsrarni_w_d(b.val, a.val, n)); }
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x2& a)
+{ __lsx_vstelm_d(__lsx_vsrarni_w_d(a.val, a.val, n), ptr, 0, 0); }
+
+// pack boolean
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_uint8x16(__lsx_vssrarni_b_h(b.val, a.val, 0)); }
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    __m128i ab = __lsx_vssrarni_h_w(b.val, a.val, 0);
+    __m128i cd = __lsx_vssrarni_h_w(d.val, c.val, 0);
+    return v_uint8x16(__lsx_vssrarni_b_h(cd, ab, 0));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    __m128i ab = __lsx_vssrarni_w_d(b.val, a.val, 0);
+    __m128i cd = __lsx_vssrarni_w_d(d.val, c.val, 0);
+    __m128i ef = __lsx_vssrarni_w_d(f.val, e.val, 0);
+    __m128i gh = __lsx_vssrarni_w_d(h.val, g.val, 0);
+
+    __m128i abcd = __lsx_vssrarni_h_w(cd, ab, 0);
+    __m128i efgh = __lsx_vssrarni_h_w(gh, ef, 0);
+    return v_uint8x16(__lsx_vssrarni_b_h(efgh, abcd, 0));
+}
+
+/* Recombine */
+// its up there with load and store operations
+
+/* Extract */
+#define OPENCV_HAL_IMPL_LSX_EXTRACT(_Tpvec)                    \
+    template<int s>                                            \
+    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)  \
+    { return v_rotate_right<s>(a, b); }
+
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint8x16)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_int8x16)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint16x8)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_int16x8)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint32x4)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_int32x4)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint64x2)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_int64x2)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_float32x4)
+OPENCV_HAL_IMPL_LSX_EXTRACT(v_float64x2)
+
+#define OPENCV_HAL_IMPL_LSX_EXTRACT_N(_Tpvec, _Twvec, intrin)             \
+template<int i>                                                           \
+inline _Twvec v_extract_n(const _Tpvec& a)                                \
+{ return (_Twvec)intrin(a.val, i); }
+
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint8x16, uchar,   __lsx_vpickve2gr_b)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int8x16,  schar,   __lsx_vpickve2gr_b)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint16x8, ushort,  __lsx_vpickve2gr_h)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int16x8,  short,   __lsx_vpickve2gr_h)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint32x4, uint,    __lsx_vpickve2gr_w)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int32x4,  int,     __lsx_vpickve2gr_w)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint64x2, uint64,  __lsx_vpickve2gr_d)
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int64x2,  int64,   __lsx_vpickve2gr_d)
+
+template<int i>
+inline float v_extract_n(const v_float32x4& v)
+{
+    union { uint iv; float fv; } d;
+    d.iv = __lsx_vpickve2gr_w(v.val, i);
+    return d.fv;
+}
+
+template<int i>
+inline double v_extract_n(const v_float64x2& v)
+{
+    union { uint64 iv; double dv; } d;
+    d.iv = __lsx_vpickve2gr_d(v.val, i);
+    return d.dv;
+}
+
+template<int i>
+inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
+{ return v_uint32x4(__lsx_vreplvei_w(a.val, i)); }
+
+template<int i>
+inline v_int32x4 v_broadcast_element(const v_int32x4& a)
+{ return v_int32x4(__lsx_vreplvei_w(a.val, i)); }
+
+template<int i>
+inline v_float32x4 v_broadcast_element(const v_float32x4& a)
+{ return v_float32x4((__m128)__lsx_vreplvei_w((__m128i)a.val, i)); }
+
+/////////////////// load deinterleave //////////////////////////////
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+
+    a.val = __lsx_vpickev_b(t1, t0);
+    b.val = __lsx_vpickod_b(t1, t0);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    a.val = __lsx_vpickev_h(t1, t0);
+    b.val = __lsx_vpickod_h(t1, t0);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    a.val = __lsx_vpickev_w(t1, t0);
+    b.val = __lsx_vpickod_w(t1, t0);
+}
+
+inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    a.val = __lsx_vilvl_d(t1, t0);
+    b.val = __lsx_vilvh_d(t1, t0);
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+    const __m128i shuff0 = _v128_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+    const __m128i shuff1 = _v128_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff0);
+    __m128i b0 = __lsx_vbitsel_v(t1, t0, shuff1);
+    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);
+    const __m128i shuff_a = _v128_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29);
+    const __m128i shuff_b = _v128_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30);
+    const __m128i shuff_c = _v128_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31);
+
+    a.val = __lsx_vshuf_b(t2, a0, shuff_a);
+    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
+    c.val = __lsx_vshuf_b(t2, c0, shuff_c);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+    const __m128i shuff0 = _v128_setr_h(0, 0, -1, 0, 0, -1, 0, 0);
+    const __m128i shuff1 = _v128_setr_h(0, -1, 0, 0, -1, 0, 0, -1);
+
+    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff1);
+    __m128i b0 = __lsx_vbitsel_v(t0, t1, shuff0);
+    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);
+
+    const __m128i shuff_a = _v128_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 20, 21, 26, 27);
+    const __m128i shuff_b = _v128_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 16, 17, 22, 23, 28, 29);
+    const __m128i shuff_c = _v128_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31);
+
+    a.val = __lsx_vshuf_b(t2, a0, shuff_a);
+    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
+    c.val = __lsx_vshuf_b(t2, c0, shuff_c);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+
+    __m128i a0 = __lsx_vpermi_w(t1, t0, 0xAC);
+    __m128i b0 = __lsx_vpermi_w(t1, t0, 0xC5);
+    __m128i c0 = __lsx_vpermi_w(t1, t0, 0x5A);
+
+    a.val = __lsx_vextrins_w(a0, t2, 0x31);
+    b0    = __lsx_vshuf4i_w(b0, 0x38);
+    c0    = __lsx_vshuf4i_w(c0, 0x8);
+    b.val = __lsx_vextrins_w(b0, t2, 0x32);
+    c.val = __lsx_vpermi_w(t2, c0, 0xC4);
+}
+
+inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+
+    a.val = __lsx_vshuf4i_d(t0, t1, 0xC);
+    b.val = __lsx_vshuf4i_d(t0, t2, 0x9);
+    c.val = __lsx_vshuf4i_d(t1, t2, 0xC);
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+    __m128i t3 = __lsx_vld(ptr, 48);
+
+    __m128i ac_lo = __lsx_vpickev_b(t1, t0);
+    __m128i bd_lo = __lsx_vpickod_b(t1, t0);
+    __m128i ac_hi = __lsx_vpickev_b(t3, t2);
+    __m128i bd_hi = __lsx_vpickod_b(t3, t2);
+
+    a.val = __lsx_vpickev_b(ac_hi, ac_lo);
+    c.val = __lsx_vpickod_b(ac_hi, ac_lo);
+    b.val = __lsx_vpickev_b(bd_hi, bd_lo);
+    d.val = __lsx_vpickod_b(bd_hi, bd_lo);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+    __m128i t3 = __lsx_vld(ptr, 48);
+
+    __m128i ac_lo = __lsx_vpickev_h(t1, t0);
+    __m128i bd_lo = __lsx_vpickod_h(t1, t0);
+    __m128i ac_hi = __lsx_vpickev_h(t3, t2);
+    __m128i bd_hi = __lsx_vpickod_h(t3, t2);
+
+    a.val = __lsx_vpickev_h(ac_hi, ac_lo);
+    c.val = __lsx_vpickod_h(ac_hi, ac_lo);
+    b.val = __lsx_vpickev_h(bd_hi, bd_lo);
+    d.val = __lsx_vpickod_h(bd_hi, bd_lo);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
+{
+    __m128i p0 = __lsx_vld(ptr, 0);
+    __m128i p1 = __lsx_vld(ptr, 16);
+    __m128i p2 = __lsx_vld(ptr, 32);
+    __m128i p3 = __lsx_vld(ptr, 48);
+
+    __m128i t0 = __lsx_vilvl_w(p1, p0);
+    __m128i t1 = __lsx_vilvl_w(p3, p2);
+    __m128i t2 = __lsx_vilvh_w(p1, p0);
+    __m128i t3 = __lsx_vilvh_w(p3, p2);
+    a.val = __lsx_vilvl_d(t1, t0);
+    b.val = __lsx_vilvh_d(t1, t0);
+    c.val = __lsx_vilvl_d(t3, t2);
+    d.val = __lsx_vilvh_d(t3, t2);
+}
+
+inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+    __m128i t3 = __lsx_vld(ptr, 48);
+
+    a.val = __lsx_vilvl_d(t2, t0);
+    b.val = __lsx_vilvh_d(t2, t0);
+    c.val = __lsx_vilvl_d(t3, t1);
+    d.val = __lsx_vilvh_d(t3, t1);
+}
+
+////////////////////////// store interleave ////////////////////////////////
+
+inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = __lsx_vilvl_b(b.val, a.val);
+    __m128i v1 = __lsx_vilvh_b(b.val, a.val);
+
+    __lsx_vst(v0, ptr, 0);
+    __lsx_vst(v1, ptr, 16);
+}
+
+inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = __lsx_vilvl_h(b.val, a.val);
+    __m128i v1 = __lsx_vilvh_h(b.val, a.val);
+
+    __lsx_vst(v0, ptr, 0);
+    __lsx_vst(v1, ptr, 16);
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = __lsx_vilvl_w(b.val, a.val);
+    __m128i v1 = __lsx_vilvh_w(b.val, a.val);
+
+    __lsx_vst(v0, ptr, 0);
+    __lsx_vst(v1, ptr, 16);
+}
+
+inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = __lsx_vilvl_d(b.val, a.val);
+    __m128i v1 = __lsx_vilvh_d(b.val, a.val);
+
+    __lsx_vst(v0, ptr, 0);
+    __lsx_vst(v1, ptr, 16);
+}
+
+inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, const v_uint8x16& c,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
+    __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
+    __m128i v_c = c.val;
+    const __m128i shuff0 = _v128_setr_b(0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10);
+    const __m128i shuff1 = _v128_setr_b(11, 21, 12, 13, 22, 14, 15, 23, 0, 0, 0, 0, 0, 0, 0, 0);
+    const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 24, 18, 19, 25, 20, 21);
+    const __m128i shuff3 = _v128_setr_b(26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31);
+    __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);
+
+    __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
+    __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
+    __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
+    dst1 = __lsx_vshuf_b(abc, dst1, shuff2);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+}
+
+inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, const v_uint16x8& c,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
+    __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
+    __m128i v_c = c.val;
+    const __m128i shuff0 = _v128_setr_b(0, 1, 2, 3, 16, 17, 4, 5, 6, 7, 18, 19, 8, 9, 10, 11);
+    const __m128i shuff1 = _v128_setr_b(20, 21, 12, 13, 14, 15, 22, 23, 0, 0, 0, 0, 0, 0, 0, 0);
+    const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 24, 25, 20, 21);
+    const __m128i shuff3 = _v128_setr_b(6, 7, 26, 27, 8, 9, 10, 11, 28, 29, 12, 13, 14, 15, 30, 31);
+    __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4);
+
+    __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0);
+    __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1);
+    __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3);
+    dst1 = __lsx_vshuf_b(abc, dst1, shuff2);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, const v_uint32x4& c,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i v_c = c.val;
+    __m128i ab_lo = __lsx_vilvl_w(b.val, a.val);  //a0 b0 a1 b1
+    __m128i ab_hi = __lsx_vilvh_w(b.val, a.val);  //a2 b2 a3 b3
+    __m128i bc_od = __lsx_vpackod_w(v_c, b.val); // b1 c1 b3 c3
+
+    __m128i dst0 = __lsx_vshuf4i_w(ab_lo, 0xB4);  //a0 b0 b1 a1
+    __m128i dst1 = __lsx_vilvl_d(ab_hi, bc_od); //b1 c1 a2 b2
+    __m128i dst2 = __lsx_vpermi_w(bc_od, ab_hi, 0xE8); //a2, a3, b3, c3
+
+    dst0 = __lsx_vextrins_w(dst0, v_c, 0x20);
+    dst2 = __lsx_vextrins_w(dst2, v_c, 0x2);
+    __lsx_vst(dst0, ptr, 0);  //a0 b0 c0 a1
+    __lsx_vst(dst1, ptr, 16); //b1 c1 a2 b2
+    __lsx_vst(dst2, ptr, 32); //c2 a3 b3 c3
+}
+
+inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
+    __m128i dst1 = __lsx_vpermi_w(a.val, c.val, 0xE4);
+    __m128i dst2 = __lsx_vilvh_d(c.val, b.val);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+}
+
+inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                               const v_uint8x16& c, const v_uint8x16& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i ab_lo = __lsx_vilvl_b(b.val, a.val);
+    __m128i ab_hi = __lsx_vilvh_b(b.val, a.val);
+    __m128i cd_lo = __lsx_vilvl_b(d.val, c.val);
+    __m128i cd_hi = __lsx_vilvh_b(d.val, c.val);
+
+    __m128i dst0 = __lsx_vilvl_h(cd_lo, ab_lo);
+    __m128i dst1 = __lsx_vilvh_h(cd_lo, ab_lo);
+    __m128i dst2 = __lsx_vilvl_h(cd_hi, ab_hi);
+    __m128i dst3 = __lsx_vilvh_h(cd_hi, ab_hi);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+    __lsx_vst(dst3, ptr, 48);
+}
+
+inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                               const v_uint16x8& c, const v_uint16x8& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i ab_lo = __lsx_vilvl_h(b.val, a.val);
+    __m128i ab_hi = __lsx_vilvh_h(b.val, a.val);
+    __m128i cd_lo = __lsx_vilvl_h(d.val, c.val);
+    __m128i cd_hi = __lsx_vilvh_h(d.val, c.val);
+
+    __m128i dst0 = __lsx_vilvl_w(cd_lo, ab_lo);
+    __m128i dst1 = __lsx_vilvh_w(cd_lo, ab_lo);
+    __m128i dst2 = __lsx_vilvl_w(cd_hi, ab_hi);
+    __m128i dst3 = __lsx_vilvh_w(cd_hi, ab_hi);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+    __lsx_vst(dst3, ptr, 48);
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                               const v_uint32x4& c, const v_uint32x4& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i ab_lo = __lsx_vilvl_w(b.val, a.val);
+    __m128i ab_hi = __lsx_vilvh_w(b.val, a.val);
+    __m128i cd_lo = __lsx_vilvl_w(d.val, c.val);
+    __m128i cd_hi = __lsx_vilvh_w(d.val, c.val);
+
+    __m128i dst0 = __lsx_vilvl_d(cd_lo, ab_lo);
+    __m128i dst1 = __lsx_vilvh_d(cd_lo, ab_lo);
+    __m128i dst2 = __lsx_vilvl_d(cd_hi, ab_hi);
+    __m128i dst3 = __lsx_vilvh_d(cd_hi, ab_hi);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+    __lsx_vst(dst3, ptr, 48);
+}
+
+inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, const v_uint64x2& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    __m128i dst0 = __lsx_vilvl_d(b.val, a.val);
+    __m128i dst2 = __lsx_vilvh_d(b.val, a.val);
+    __m128i dst1 = __lsx_vilvl_d(d.val, c.val);
+    __m128i dst3 = __lsx_vilvh_d(d.val, c.val);
+
+    __lsx_vst(dst0, ptr, 0);
+    __lsx_vst(dst1, ptr, 16);
+    __lsx_vst(dst2, ptr, 32);
+    __lsx_vst(dst3, ptr, 48);
+}
+
+#define OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1)  \
+inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0)                        \
+{                                                                                                 \
+    _Tpvec1 a1, b1;                                                                               \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1);                                                \
+    a0 = v_reinterpret_as_##suffix0(a1);                                                          \
+    b0 = v_reinterpret_as_##suffix0(b1);                                                          \
+}                                                                                                 \
+inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0)           \
+{                                                                                                 \
+    _Tpvec1 a1, b1, c1;                                                                           \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1);                                            \
+    a0 = v_reinterpret_as_##suffix0(a1);                                                          \
+    b0 = v_reinterpret_as_##suffix0(b1);                                                          \
+    c0 = v_reinterpret_as_##suffix0(c1);                                                          \
+}                                                                                                 \
+inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0,                        \
+                                _Tpvec0& c0, _Tpvec0& d0)                                         \
+{                                                                                                 \
+    _Tpvec1 a1, b1, c1, d1;                                                                       \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1);                                        \
+    a0 = v_reinterpret_as_##suffix0(a1);                                                          \
+    b0 = v_reinterpret_as_##suffix0(b1);                                                          \
+    c0 = v_reinterpret_as_##suffix0(c1);                                                          \
+    d0 = v_reinterpret_as_##suffix0(d1);                                                          \
+}                                                                                                 \
+inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0,                   \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)                      \
+{                                                                                                 \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                                  \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                                  \
+    v_store_interleave((_Tp1*)ptr, a1, b1);                                                     \
+}                                                                                                 \
+inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0,\
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)                      \
+{                                                                                                 \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                                  \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                                  \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0);                                                  \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1);                                                 \
+}                                                                                                 \
+inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0,                   \
+                               const _Tpvec0& c0, const _Tpvec0& d0,                              \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)                      \
+{                                                                                                 \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0);                                                  \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0);                                                  \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0);                                                  \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0);                                                  \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1);                                             \
+}
+
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
+
+//
+// FP16
+//
+
+inline v_float32x4 v_load_expand(const hfloat* ptr)
+{
+#if CV_FP16
+    return v_float32x4(__lsx_vfcvtl_s_h((__m128)__lsx_vld(ptr, 0)));
+#else
+    float CV_DECL_ALIGNED(32) buf[4];
+    for (int i = 0; i < 4; i++)
+        buf[i] = (float)ptr[i];
+    return v_float32x4((__m128)__lsx_vld(buf, 0));
+#endif
+}
+
+inline void v_pack_store(hfloat* ptr, const v_float32x4& a)
+{
+#if CV_FP16
+    __m128i res = (__m218i)__lsx_vfcvt_h_s(a.val, a.val);
+    __lsx_vstelm_d(res, ptr, 0, 0);
+#else
+    float CV_DECL_ALIGNED(32) buf[4];
+    v_store_aligned(buf, a);
+    for (int i = 0; i < 4; i++)
+        ptr[i] = hfloat(buf[i]);
+#endif
+}
+
+//
+// end of FP16
+//
+
+inline void v_cleanup() {}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
+
+#endif // OPENCV_HAL_INTRIN_LSX_HPP
diff --git a/modules/core/include/opencv2/core/hal/intrin_msa.hpp b/modules/core/include/opencv2/core/hal/intrin_msa.hpp
index c035fdad602a..23d6ebd3d1d0 100644
--- a/modules/core/include/opencv2/core/hal/intrin_msa.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp
@@ -1838,7 +1838,7 @@ inline v_float32x4 v_broadcast_element(const v_float32x4& a)
 
 ////// FP16 support ///////
 #if CV_FP16
-inline v_float32x4 v_load_expand(const float16_t* ptr)
+inline v_float32x4 v_load_expand(const hfloat* ptr)
 {
 #ifndef msa_ld1_f16
     v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr);
@@ -1848,7 +1848,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
     return v_float32x4(msa_cvt_f32_f16(v));
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 {
     v4f16 hv = msa_cvt_f16_f32(v.val);
 
@@ -1859,7 +1859,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 #endif
 }
 #else
-inline v_float32x4 v_load_expand(const float16_t* ptr)
+inline v_float32x4 v_load_expand(const hfloat* ptr)
 {
     float buf[4];
     for( int i = 0; i < 4; i++ )
@@ -1867,12 +1867,12 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
     return v_load(buf);
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 {
     float buf[4];
     v_store(buf, v);
     for( int i = 0; i < 4; i++ )
-        ptr[i] = (float16_t)buf[i];
+        ptr[i] = (hfloat)buf[i];
 }
 #endif
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 6f8973231b83..6e843d68ea67 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -131,13 +131,22 @@ OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2,  int64x1,  s64)
 OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64)
 #endif
 
+//////////// Compatibility layer ////////////
+template<typename T> struct VTraits {
+        static inline int vlanes() { return T::nlanes; }
+        enum { max_nlanes = T::nlanes, nlanes = T::nlanes };
+        using lane_type = typename T::lane_type;
+};
+
+template<typename T>
+inline typename VTraits<T>::lane_type v_get0(const T& v) \
+{ \
+    return v.get0(); \
+}
 //////////// Types ////////////
 
 struct v_uint8x16
 {
-    typedef uchar lane_type;
-    enum { nlanes = 16 };
-
     v_uint8x16() {}
     explicit v_uint8x16(uint8x16_t v) : val(v) {}
     v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
@@ -146,19 +155,22 @@ struct v_uint8x16
         uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
         val = vld1q_u8(v);
     }
+    uint8x16_t val;
+
+private:
+    friend struct VTraits<v_uint8x16>;
+    enum { nlanes = 16 };
+    typedef uchar lane_type;
+
+    friend typename VTraits<v_uint8x16>::lane_type v_get0<v_uint8x16>(const v_uint8x16& v);
     uchar get0() const
     {
         return vgetq_lane_u8(val, 0);
     }
-
-    uint8x16_t val;
 };
 
 struct v_int8x16
 {
-    typedef schar lane_type;
-    enum { nlanes = 16 };
-
     v_int8x16() {}
     explicit v_int8x16(int8x16_t v) : val(v) {}
     v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
@@ -167,19 +179,22 @@ struct v_int8x16
         schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
         val = vld1q_s8(v);
     }
+    int8x16_t val;
+
+private:
+    friend struct VTraits<v_int8x16>;
+    enum { nlanes = 16 };
+    typedef schar lane_type;
+
+    friend typename VTraits<v_int8x16>::lane_type v_get0<v_int8x16>(const v_int8x16& v);
     schar get0() const
     {
         return vgetq_lane_s8(val, 0);
     }
-
-    int8x16_t val;
 };
 
 struct v_uint16x8
 {
-    typedef ushort lane_type;
-    enum { nlanes = 8 };
-
     v_uint16x8() {}
     explicit v_uint16x8(uint16x8_t v) : val(v) {}
     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
@@ -187,19 +202,22 @@ struct v_uint16x8
         ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
         val = vld1q_u16(v);
     }
+    uint16x8_t val;
+
+private:
+    friend struct VTraits<v_uint16x8>;
+    enum { nlanes = 8 };
+    typedef ushort lane_type;
+
+    friend typename VTraits<v_uint16x8>::lane_type v_get0<v_uint16x8>(const v_uint16x8& v);
     ushort get0() const
     {
         return vgetq_lane_u16(val, 0);
     }
-
-    uint16x8_t val;
 };
 
 struct v_int16x8
 {
-    typedef short lane_type;
-    enum { nlanes = 8 };
-
     v_int16x8() {}
     explicit v_int16x8(int16x8_t v) : val(v) {}
     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
@@ -207,19 +225,22 @@ struct v_int16x8
         short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
         val = vld1q_s16(v);
     }
+    int16x8_t val;
+
+private:
+    friend struct VTraits<v_int16x8>;
+    enum { nlanes = 8 };
+    typedef short lane_type;
+
+    friend typename VTraits<v_int16x8>::lane_type v_get0<v_int16x8>(const v_int16x8& v);
     short get0() const
     {
         return vgetq_lane_s16(val, 0);
     }
-
-    int16x8_t val;
 };
 
 struct v_uint32x4
 {
-    typedef unsigned lane_type;
-    enum { nlanes = 4 };
-
     v_uint32x4() {}
     explicit v_uint32x4(uint32x4_t v) : val(v) {}
     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
@@ -227,19 +248,22 @@ struct v_uint32x4
         unsigned v[] = {v0, v1, v2, v3};
         val = vld1q_u32(v);
     }
+    uint32x4_t val;
+
+private:
+    friend struct VTraits<v_uint32x4>;
+    enum { nlanes = 4 };
+    typedef unsigned lane_type;
+
+    friend typename VTraits<v_uint32x4>::lane_type v_get0<v_uint32x4>(const v_uint32x4& v);
     unsigned get0() const
     {
         return vgetq_lane_u32(val, 0);
     }
-
-    uint32x4_t val;
 };
 
 struct v_int32x4
 {
-    typedef int lane_type;
-    enum { nlanes = 4 };
-
     v_int32x4() {}
     explicit v_int32x4(int32x4_t v) : val(v) {}
     v_int32x4(int v0, int v1, int v2, int v3)
@@ -247,18 +271,22 @@ struct v_int32x4
         int v[] = {v0, v1, v2, v3};
         val = vld1q_s32(v);
     }
+    int32x4_t val;
+
+private:
+    friend struct VTraits<v_int32x4>;
+    enum { nlanes = 4 };
+    typedef int lane_type;
+
+    friend typename VTraits<v_int32x4>::lane_type v_get0<v_int32x4>(const v_int32x4& v);
     int get0() const
     {
         return vgetq_lane_s32(val, 0);
     }
-    int32x4_t val;
 };
 
 struct v_float32x4
 {
-    typedef float lane_type;
-    enum { nlanes = 4 };
-
     v_float32x4() {}
     explicit v_float32x4(float32x4_t v) : val(v) {}
     v_float32x4(float v0, float v1, float v2, float v3)
@@ -266,18 +294,22 @@ struct v_float32x4
         float v[] = {v0, v1, v2, v3};
         val = vld1q_f32(v);
     }
+    float32x4_t val;
+
+private:
+    friend struct VTraits<v_float32x4>;
+    enum { nlanes = 4 };
+    typedef float lane_type;
+
+    friend typename VTraits<v_float32x4>::lane_type v_get0<v_float32x4>(const v_float32x4& v);
     float get0() const
     {
         return vgetq_lane_f32(val, 0);
     }
-    float32x4_t val;
 };
 
 struct v_uint64x2
 {
-    typedef uint64 lane_type;
-    enum { nlanes = 2 };
-
     v_uint64x2() {}
     explicit v_uint64x2(uint64x2_t v) : val(v) {}
     v_uint64x2(uint64 v0, uint64 v1)
@@ -285,18 +317,21 @@ struct v_uint64x2
         uint64 v[] = {v0, v1};
         val = vld1q_u64(v);
     }
+    uint64x2_t val;
+private:
+    friend struct VTraits<v_uint64x2>;
+    enum { nlanes = 2 };
+    typedef uint64 lane_type;
+
+    friend typename VTraits<v_uint64x2>::lane_type v_get0<v_uint64x2>(const v_uint64x2& v);
     uint64 get0() const
     {
         return vgetq_lane_u64(val, 0);
     }
-    uint64x2_t val;
 };
 
 struct v_int64x2
 {
-    typedef int64 lane_type;
-    enum { nlanes = 2 };
-
     v_int64x2() {}
     explicit v_int64x2(int64x2_t v) : val(v) {}
     v_int64x2(int64 v0, int64 v1)
@@ -304,19 +339,23 @@ struct v_int64x2
         int64 v[] = {v0, v1};
         val = vld1q_s64(v);
     }
+    int64x2_t val;
+
+private:
+    friend struct VTraits<v_int64x2>;
+    enum { nlanes = 2 };
+    typedef int64 lane_type;
+
+    friend typename VTraits<v_int64x2>::lane_type v_get0<v_int64x2>(const v_int64x2& v);
     int64 get0() const
     {
         return vgetq_lane_s64(val, 0);
     }
-    int64x2_t val;
 };
 
 #if CV_SIMD128_64F
 struct v_float64x2
 {
-    typedef double lane_type;
-    enum { nlanes = 2 };
-
     v_float64x2() {}
     explicit v_float64x2(float64x2_t v) : val(v) {}
     v_float64x2(double v0, double v1)
@@ -324,11 +363,18 @@ struct v_float64x2
         double v[] = {v0, v1};
         val = vld1q_f64(v);
     }
+
+    float64x2_t val;
+private:
+    friend struct VTraits<v_float64x2>;
+    enum { nlanes = 2 };
+    typedef double lane_type;
+
+    friend typename VTraits<v_float64x2>::lane_type v_get0<v_float64x2>(const v_float64x2& v);
     double get0() const
     {
         return vgetq_lane_f64(val, 0);
     }
-    float64x2_t val;
 };
 #endif
 
@@ -460,71 +506,56 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
 }
 
 #define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
-inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec bin_op (const _Tpvec& a, const _Tpvec& b) \
 { \
     return _Tpvec(intrin(a.val, b.val)); \
-} \
-inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
-{ \
-    a.val = intrin(a.val, b.val); \
-    return a; \
-}
-
-OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
-OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
-OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
-OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
-OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
-OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
-OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
-OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
-OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
-OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
-OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
-OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
-OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
-OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
-OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
-OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
-OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
-OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
-OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
-OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
-OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
+}
+
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint8x16, vqaddq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint8x16, vqsubq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int8x16, vqaddq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int8x16, vqsubq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint16x8, vqaddq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint16x8, vqsubq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int16x8, vqaddq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int16x8, vqsubq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int32x4, vaddq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int32x4, vsubq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int64x2, vaddq_s64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int64x2, vsubq_s64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint64x2, vaddq_u64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint64x2, vsubq_u64)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
-OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
-OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
-OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
-OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float32x4, vdivq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float64x2, vaddq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float64x2, vsubq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float64x2, vmulq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float64x2, vdivq_f64)
 #else
-inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
+inline v_float32x4 v_div (const v_float32x4& a, const v_float32x4& b)
 {
     float32x4_t reciprocal = vrecpeq_f32(b.val);
     reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
     reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
     return v_float32x4(vmulq_f32(a.val, reciprocal));
 }
-inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
-{
-    float32x4_t reciprocal = vrecpeq_f32(b.val);
-    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
-    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
-    a.val = vmulq_f32(a.val, reciprocal);
-    return a;
-}
 #endif
 
 // saturating multiply 8-bit, 16-bit
 #define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec)            \
-    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
+    inline _Tpvec v_mul (const _Tpvec& a, const _Tpvec& b)  \
     {                                                            \
         _Tpwvec c, d;                                            \
         v_mul_expand(a, b, c, d);                                \
         return v_pack(c, d);                                     \
-    }                                                            \
-    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
-    { a = a * b; return a; }
+    }
 
 OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16,  v_int16x8)
 OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
@@ -698,7 +729,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
                                    const v_uint32x4& c)
 {
-    return v_dotprod_expand(a, b) + c;
+    return v_add(v_dotprod_expand(a, b), c);
 }
 
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
@@ -715,7 +746,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                   const v_int32x4& c)
 {
-    return v_dotprod_expand(a, b) + c;
+    return v_add(v_dotprod_expand(a, b), c);
 }
 #endif
 // 16 >> 64
@@ -735,7 +766,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
     return v_uint64x2(vaddq_u64(s0, s1));
 }
 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
 {
@@ -752,7 +783,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
 }
 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
                                   const v_int64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 
 // 32 >> 64f
 #if CV_SIMD128_64F
@@ -760,7 +791,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
 { return v_cvt_f64(v_dotprod(a, b)); }
 inline v_float64x2 v_dotprod_expand(const v_int32x4& a,   const v_int32x4& b,
                                     const v_float64x2& c)
-{ return v_dotprod_expand(a, b) + c; }
+{ return v_add(v_dotprod_expand(a, b), c); }
 #endif
 
 //////// Fast Dot Product ////////
@@ -850,7 +881,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
 }
 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
 {
-    return v_dotprod_expand_fast(a, b) + c;
+    return v_add(v_dotprod_expand_fast(a, b), c);
 }
 
 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
@@ -861,7 +892,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
 }
 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
 {
-    return v_dotprod_expand_fast(a, b) + c;
+    return v_add(v_dotprod_expand_fast(a, b), c);
 }
 #endif
 
@@ -875,7 +906,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
     return v_uint64x2(vaddq_u64(s0, s1));
 }
 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
-{ return v_dotprod_expand_fast(a, b) + c; }
+{ return v_add(v_dotprod_expand_fast(a, b), c); }
 
 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
 {
@@ -884,22 +915,22 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
     return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
 }
 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
-{ return v_dotprod_expand_fast(a, b) + c; }
+{ return v_add(v_dotprod_expand_fast(a, b), c); }
 
 // 32 >> 64f
 #if CV_SIMD128_64F
 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
 { return v_cvt_f64(v_dotprod_fast(a, b)); }
 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
-{ return v_dotprod_expand_fast(a, b) + c; }
+{ return v_add(v_dotprod_expand_fast(a, b), c); }
 #endif
 
 
 #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
-    OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
-    OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
-    OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
-    inline _Tpvec operator ~ (const _Tpvec& a) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(v_and, _Tpvec, vandq_##suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(v_or, _Tpvec, vorrq_##suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(v_xor, _Tpvec, veorq_##suffix) \
+    inline _Tpvec v_not (const _Tpvec& a) \
     { \
         return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
     }
@@ -914,21 +945,16 @@ OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
 
 #define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
-inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+inline v_float32x4 bin_op (const v_float32x4& a, const v_float32x4& b) \
 { \
     return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
-} \
-inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
-{ \
-    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
-OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
-OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_and, vandq_s32)
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_or, vorrq_s32)
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_xor, veorq_s32)
 
-inline v_float32x4 operator ~ (const v_float32x4& a)
+inline v_float32x4 v_not (const v_float32x4& a)
 {
     return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
 }
@@ -942,7 +968,7 @@ inline v_float32x4 v_sqrt(const v_float32x4& x)
 inline v_float32x4 v_invsqrt(const v_float32x4& x)
 {
     v_float32x4 one = v_setall_f32(1.0f);
-    return one / v_sqrt(x);
+    return v_div(one, v_sqrt(x));
 }
 #else
 inline v_float32x4 v_sqrt(const v_float32x4& x)
@@ -975,21 +1001,16 @@ inline v_float32x4 v_abs(v_float32x4 x)
 
 #if CV_SIMD128_64F
 #define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
-inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+inline v_float64x2 bin_op (const v_float64x2& a, const v_float64x2& b) \
 { \
     return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
-} \
-inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
-{ \
-    a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
-    return a; \
 }
 
-OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
-OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
-OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)
+OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_and, vandq_s64)
+OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_or, vorrq_s64)
+OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_xor, veorq_s64)
 
-inline v_float64x2 operator ~ (const v_float64x2& a)
+inline v_float64x2 v_not (const v_float64x2& a)
 {
     return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val))));
 }
@@ -1002,7 +1023,7 @@ inline v_float64x2 v_sqrt(const v_float64x2& x)
 inline v_float64x2 v_invsqrt(const v_float64x2& x)
 {
     v_float64x2 one = v_setall_f64(1.0f);
-    return one / v_sqrt(x);
+    return v_div(one, v_sqrt(x));
 }
 
 inline v_float64x2 v_abs(v_float64x2 x)
@@ -1037,17 +1058,17 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
 #endif
 
 #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
-inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
-inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
-inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_lt (const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
-inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_gt (const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
-inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_le (const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
-inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+inline _Tpvec v_ge (const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
 
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
@@ -1065,22 +1086,22 @@ static inline uint64x2_t vmvnq_u64(uint64x2_t a)
 }
 //OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
 //OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
-static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b)
+static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b)
 { return v_uint64x2(vceqq_u64(a.val, b.val)); }
-static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
+static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b)
 { return v_uint64x2(vmvnq_u64(vceqq_u64(a.val, b.val))); }
-static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b)
+static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b)
 { return v_int64x2(vreinterpretq_s64_u64(vceqq_s64(a.val, b.val))); }
-static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b)
+static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b)
 { return v_int64x2(vreinterpretq_s64_u64(vmvnq_u64(vceqq_s64(a.val, b.val)))); }
 #else
-static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b)
+static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b)
 {
     uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
     uint32x4_t swapped = vrev64q_u32(cmp);
     return v_uint64x2(vreinterpretq_u64_u32(vandq_u32(cmp, swapped)));
 }
-static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
+static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b)
 {
     uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val));
     uint32x4_t swapped = vrev64q_u32(cmp);
@@ -1088,13 +1109,13 @@ static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b)
     uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF));
     return v_uint64x2(veorq_u64(v_eq, vx));
 }
-static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b)
+static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b)
 {
-    return v_reinterpret_as_s64(v_reinterpret_as_u64(a) == v_reinterpret_as_u64(b));
+    return v_reinterpret_as_s64(v_eq(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
 }
-static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b)
+static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b)
 {
-    return v_reinterpret_as_s64(v_reinterpret_as_u64(a) != v_reinterpret_as_u64(b));
+    return v_reinterpret_as_s64(v_ne(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b)));
 }
 #endif
 #if CV_SIMD128_64F
@@ -1207,9 +1228,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
 
 // trade efficiency for convenience
 #define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
-inline _Tpvec operator << (const _Tpvec& a, int n) \
+inline _Tpvec v_shl (const _Tpvec& a, int n) \
 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
-inline _Tpvec operator >> (const _Tpvec& a, int n) \
+inline _Tpvec v_shr (const _Tpvec& a, int n) \
 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
 { return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
@@ -1231,13 +1252,13 @@ OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
 { return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
-{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
+{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, VTraits<_Tpvec>::nlanes - n)); } \
 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
 { return a; } \
 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
-{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
+{ return _Tpvec(vextq_##suffix(b.val, a.val, VTraits<_Tpvec>::nlanes - n)); } \
 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
 { CV_UNUSED(b); return a; }
 
@@ -1621,7 +1642,7 @@ inline int v_signmask(const v_uint64x2& a)
 #if CV_NEON_AARCH64
     const int64x2_t signPosition = {0,1};
     uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
-    uint64_t t0 = vaddvq_u64(v0);
+    int t0 = (int)vaddvq_u64(v0);
     return t0;
 #else // #if CV_NEON_AARCH64
     int64x1_t m0 = vdup_n_s64(0);
@@ -1969,11 +1990,9 @@ inline v_int32x4 v_round(const v_float32x4& a)
 #else
 inline v_int32x4 v_round(const v_float32x4& a)
 {
-    static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
-        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
-
-    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
-    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
+    // See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
+    float32x4_t delta = vdupq_n_f32(12582912.0f);
+    return v_int32x4(vcvtq_s32_f32(vsubq_f32(vaddq_f32(a.val, delta), delta)));
 }
 #endif
 inline v_int32x4 v_floor(const v_float32x4& a)
@@ -1997,12 +2016,12 @@ inline v_int32x4 v_trunc(const v_float32x4& a)
 inline v_int32x4 v_round(const v_float64x2& a)
 {
     static const int32x2_t zero = vdup_n_s32(0);
-    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), zero));
 }
 
 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
 {
-    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), vmovn_s64(vcvtnq_s64_f64(b.val))));
 }
 
 inline v_int32x4 v_floor(const v_float64x2& a)
@@ -2586,7 +2605,7 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
 
 ////// FP16 support ///////
 #if CV_FP16
-inline v_float32x4 v_load_expand(const float16_t* ptr)
+inline v_float32x4 v_load_expand(const hfloat* ptr)
 {
     float16x4_t v =
     #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
@@ -2597,7 +2616,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
     return v_float32x4(vcvt_f32_f16(v));
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 {
     float16x4_t hv = vcvt_f16_f32(v.val);
 
@@ -2608,7 +2627,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
     #endif
 }
 #else
-inline v_float32x4 v_load_expand(const float16_t* ptr)
+inline v_float32x4 v_load_expand(const hfloat* ptr)
 {
     const int N = 4;
     float buf[N];
@@ -2616,12 +2635,12 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
     return v_load(buf);
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 {
     const int N = 4;
     float buf[N];
     v_store(buf, v);
-    for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
+    for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
 }
 #endif
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv.hpp
index 04e3c0e140b2..d446a05db5c2 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv.hpp
@@ -32,6 +32,8 @@
 namespace cv
 {
 
+//! @cond IGNORED
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 
 #define CV_SIMD128 1
@@ -2871,17 +2873,17 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
 ////// FP16 support ///////
 
 #if CV_FP16
-inline v_float32x4 v_load_expand(const float16_t* ptr)
+inline v_float32x4 v_load_expand(const hfloat* ptr)
 {
     return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr, 4), 4));
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 {
     vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v, 4), 4);
 }
 #else
-inline v_float32x4 v_load_expand(const float16_t* ptr)
+inline v_float32x4 v_load_expand(const hfloat* ptr)
 {
     const int N = 4;
     float buf[N];
@@ -2889,12 +2891,12 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
     return v_load(buf);
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 {
     const int N = 4;
     float buf[N];
     v_store(buf, v);
-    for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
+    for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]);
 }
 #endif
 
@@ -3336,7 +3338,8 @@ inline void v_cleanup() {}
 
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
+//! @endcond
 
-}
+} // namespace cv
 
 #endif
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
index 9faefd97b7e4..5681ae211de6 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
@@ -19,7 +19,7 @@ namespace cv
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 
 #define CV_SIMD128 1
-#define CV_SIMD128_64F 0
+#define CV_SIMD128_64F 1
 //////////// Types ////////////
 struct v_uint8x16
 {
@@ -32,11 +32,11 @@ struct v_uint8x16
                uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
     {
         uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
-        val = (vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16);
+        val = (vuint8m1_t)vle8_v_u8m1((unsigned char*)v, 16);
     }
     uchar get0() const
     {
-        return vmv_x_s_u8m1_u8(val, 16);
+        return vmv_x_s_u8m1_u8(val);
     }
 
     vuint8m1_t val;
@@ -53,11 +53,11 @@ struct v_int8x16
                schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
     {
         schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
-        val = (vint8m1_t)vle_v_i8m1((schar*)v, 16);
+        val = (vint8m1_t)vle8_v_i8m1((schar*)v, 16);
     }
     schar get0() const
     {
-        return vmv_x_s_i8m1_i8(val, 16);
+        return vmv_x_s_i8m1_i8(val);
     }
 
     vint8m1_t val;
@@ -73,11 +73,11 @@ struct v_uint16x8
     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
     {
         ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
-        val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8);
+        val = (vuint16m1_t)vle16_v_u16m1((unsigned short*)v, 8);
     }
     ushort get0() const
     {
-        return vmv_x_s_u16m1_u16(val, 8);
+        return vmv_x_s_u16m1_u16(val);
     }
 
     vuint16m1_t val;
@@ -93,11 +93,11 @@ struct v_int16x8
     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
     {
         short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
-        val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8);
+        val = (vint16m1_t)vle16_v_i16m1((signed short*)v, 8);
     }
     short get0() const
     {
-        return vmv_x_s_i16m1_i16(val, 8);
+        return vmv_x_s_i16m1_i16(val);
     }
 
     vint16m1_t val;
@@ -113,11 +113,11 @@ struct v_uint32x4
     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
     {
         unsigned v[] = {v0, v1, v2, v3};
-        val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4);
+        val = (vuint32m1_t)vle32_v_u32m1((unsigned int*)v, 4);
     }
     unsigned get0() const
     {
-        return vmv_x_s_u32m1_u32(val, 4);
+        return vmv_x_s_u32m1_u32(val);
     }
 
     vuint32m1_t val;
@@ -133,11 +133,11 @@ struct v_int32x4
     v_int32x4(int v0, int v1, int v2, int v3)
     {
         int v[] = {v0, v1, v2, v3};
-        val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4);
+        val = (vint32m1_t)vle32_v_i32m1((signed int*)v, 4);
     }
     int get0() const
     {
-        return vmv_x_s_i32m1_i32(val, 4);
+        return vmv_x_s_i32m1_i32(val);
     }
     vint32m1_t val;
 };
@@ -152,11 +152,11 @@ struct v_float32x4
     v_float32x4(float v0, float v1, float v2, float v3)
     {
         float v[] = {v0, v1, v2, v3};
-        val = (vfloat32m1_t)vle_v_f32m1((float*)v, 4);
+        val = (vfloat32m1_t)vle32_v_f32m1((float*)v, 4);
     }
     float get0() const
     {
-        return vfmv_f_s_f32m1_f32(val, 4);
+        return vfmv_f_s_f32m1_f32(val);
     }
     vfloat32m1_t val;
 };
@@ -171,11 +171,11 @@ struct v_uint64x2
     v_uint64x2(uint64 v0, uint64 v1)
     {
         uint64 v[] = {v0, v1};
-        val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2);
+        val = (vuint64m1_t)vle64_v_u64m1((unsigned long*)v, 2);
     }
     uint64 get0() const
     {
-        return vmv_x_s_u64m1_u64(val, 2);
+        return vmv_x_s_u64m1_u64(val);
     }
     vuint64m1_t val;
 };
@@ -190,11 +190,11 @@ struct v_int64x2
     v_int64x2(int64 v0, int64 v1)
     {
         int64 v[] = {v0, v1};
-        val = (vint64m1_t)vle_v_i64m1((long*)v, 2);
+        val = (vint64m1_t)vle64_v_i64m1((long*)v, 2);
     }
     int64 get0() const
     {
-        return vmv_x_s_i64m1_i64(val, 2);
+        return vmv_x_s_i64m1_i64(val);
     }
     vint64m1_t val;
 };
@@ -209,21 +209,21 @@ struct v_float64x2
     v_float64x2(double v0, double v1)
     {
         double v[] = {v0, v1};
-        val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2);
+        val = (vfloat64m1_t)vle64_v_f64m1((double*)v, 2);
     }
     double get0() const
     {
-        return vfmv_f_s_f64m1_f64(val, 2);
+        return vfmv_f_s_f64m1_f64(val);
     }
     vfloat64m1_t val;
 };
-
+/*
 #define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \
-inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \
+inline _Tp##m1_t vreinterpret_v_##suffix##m1_##suffix##m1(_Tp##m1_t v) { return v; } \
 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \
 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \
 inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \
-inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \
+inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); } \
 inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \
 inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \
 inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \
@@ -233,17 +233,128 @@ inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(
 
 
 OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8)
-OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8)
+OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, i8)
 OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16)
-OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16)
+OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, i16)
 OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32)
-OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32)
+OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, i32)
 OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64)
-OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64)
+OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, i64)
 OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64)
 OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32)
+*/
+inline v_uint8x16 v_reinterpret_as_u8(const v_uint8x16& v) { return v_uint8x16(v.val); }
+inline v_int8x16 v_reinterpret_as_s8(const v_uint8x16& v) { return v_int8x16(vreinterpret_v_u8m1_i8m1(v.val)); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_uint8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(v.val)); }
+inline v_int16x8 v_reinterpret_as_s16(const v_uint8x16& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(vreinterpret_v_u8m1_u16m1(v.val))); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_uint8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(v.val)); }
+inline v_int32x4 v_reinterpret_as_s32(const v_uint8x16& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u8m1_u32m1(v.val))); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_uint8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(v.val)); }
+inline v_int64x2 v_reinterpret_as_s64(const v_uint8x16& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u8m1_u64m1(v.val))); }
+inline v_float32x4 v_reinterpret_as_f32(const v_uint8x16& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u8m1_u32m1(v.val))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_uint8x16& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u8m1_u64m1(v.val))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_int8x16& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(v.val)); }
+inline v_int8x16 v_reinterpret_as_s8(const v_int8x16& v) { return v_int8x16(v.val); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_int8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(vreinterpret_v_i8m1_u8m1(v.val))); }
+inline v_int16x8 v_reinterpret_as_s16(const v_int8x16& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_int8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(vreinterpret_v_i8m1_u8m1(v.val))); }
+inline v_int32x4 v_reinterpret_as_s32(const v_int8x16& v) { return v_int32x4(vreinterpret_v_i8m1_i32m1(v.val)); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_int8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(vreinterpret_v_i8m1_u8m1(v.val))); }
+inline v_int64x2 v_reinterpret_as_s64(const v_int8x16& v) { return v_int64x2(vreinterpret_v_i8m1_i64m1(v.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_int8x16& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i8m1_i32m1(v.val))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_int8x16& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i8m1_i64m1(v.val))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_uint16x8& v) { return v_uint8x16(vreinterpret_v_u16m1_u8m1(v.val)); }
+inline v_int8x16 v_reinterpret_as_s8(const v_uint16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(vreinterpret_v_u16m1_i16m1(v.val))); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_uint16x8& v) { return v_uint16x8(v.val); }
+inline v_int16x8 v_reinterpret_as_s16(const v_uint16x8& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(v.val)); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_uint16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(v.val)); }
+inline v_int32x4 v_reinterpret_as_s32(const v_uint16x8& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u16m1_u32m1(v.val))); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_uint16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(v.val)); }
+inline v_int64x2 v_reinterpret_as_s64(const v_uint16x8& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u16m1_u64m1(v.val))); }
+inline v_float32x4 v_reinterpret_as_f32(const v_uint16x8& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u16m1_u32m1(v.val))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_uint16x8& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u16m1_u64m1(v.val))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_int16x8& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(v.val))); }
+inline v_int8x16 v_reinterpret_as_s8(const v_int16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(v.val)); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_int16x8& v) { return v_uint16x8(vreinterpret_v_i16m1_u16m1(v.val)); }
+inline v_int16x8 v_reinterpret_as_s16(const v_int16x8& v) { return v_int16x8(v.val); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_int16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(vreinterpret_v_i16m1_u16m1(v.val))); }
+inline v_int32x4 v_reinterpret_as_s32(const v_int16x8& v) { return v_int32x4(vreinterpret_v_i16m1_i32m1(v.val)); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_int16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(vreinterpret_v_i16m1_u16m1(v.val))); }
+inline v_int64x2 v_reinterpret_as_s64(const v_int16x8& v) { return v_int64x2(vreinterpret_v_i16m1_i64m1(v.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_int16x8& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i16m1_i32m1(v.val))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_int16x8& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i16m1_i64m1(v.val))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_uint32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(v.val)); }
+inline v_int8x16 v_reinterpret_as_s8(const v_uint32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_u32m1_i32m1(v.val))); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_uint32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(v.val)); }
+inline v_int16x8 v_reinterpret_as_s16(const v_uint32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_u32m1_i32m1(v.val))); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_uint32x4& v) { return v_uint32x4(v.val); }
+inline v_int32x4 v_reinterpret_as_s32(const v_uint32x4& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(v.val)); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_uint32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(v.val)); }
+inline v_int64x2 v_reinterpret_as_s64(const v_uint32x4& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u32m1_u64m1(v.val))); }
+inline v_float32x4 v_reinterpret_as_f32(const v_uint32x4& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(v.val)); }
+inline v_float64x2 v_reinterpret_as_f64(const v_uint32x4& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(v.val))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_int32x4& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(v.val))); }
+inline v_int8x16 v_reinterpret_as_s8(const v_int32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(v.val)); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_int32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_i32m1_u32m1(v.val))); }
+inline v_int16x8 v_reinterpret_as_s16(const v_int32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(v.val)); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_int32x4& v) { return v_uint32x4(vreinterpret_v_i32m1_u32m1(v.val)); }
+inline v_int32x4 v_reinterpret_as_s32(const v_int32x4& v) { return v_int32x4(v.val); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_int32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_i32m1_u32m1(v.val))); }
+inline v_int64x2 v_reinterpret_as_s64(const v_int32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(v.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_int32x4& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(v.val)); }
+inline v_float64x2 v_reinterpret_as_f64(const v_int32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(v.val))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_uint64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(v.val)); }
+inline v_int8x16 v_reinterpret_as_s8(const v_uint64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_u64m1_i64m1(v.val))); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_uint64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(v.val)); }
+inline v_int16x8 v_reinterpret_as_s16(const v_uint64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_u64m1_i64m1(v.val))); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_uint64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(v.val)); }
+inline v_int32x4 v_reinterpret_as_s32(const v_uint64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_u64m1_i64m1(v.val))); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_uint64x2& v) { return v_uint64x2(v.val); }
+inline v_int64x2 v_reinterpret_as_s64(const v_uint64x2& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(v.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(v.val))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(v.val)); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_int64x2& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(v.val))); }
+inline v_int8x16 v_reinterpret_as_s8(const v_int64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(v.val)); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_int64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_i64m1_u64m1(v.val))); }
+inline v_int16x8 v_reinterpret_as_s16(const v_int64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(v.val)); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_int64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_i64m1_u64m1(v.val))); }
+inline v_int32x4 v_reinterpret_as_s32(const v_int64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(v.val)); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_int64x2& v) { return v_uint64x2(vreinterpret_v_i64m1_u64m1(v.val)); }
+inline v_int64x2 v_reinterpret_as_s64(const v_int64x2& v) { return v_int64x2(v.val); }
+inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(v.val))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(v.val)); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_float32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(vreinterpret_v_f32m1_u32m1(v.val))); }
+inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_f32m1_i32m1(v.val))); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_float32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_f32m1_u32m1(v.val))); }
+inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_f32m1_i32m1(v.val))); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_float32x4& v) { return v_uint32x4(vreinterpret_v_f32m1_u32m1(v.val)); }
+inline v_int32x4 v_reinterpret_as_s32(const v_float32x4& v) { return v_int32x4(vreinterpret_v_f32m1_i32m1(v.val)); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_float32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v.val))); }
+inline v_int64x2 v_reinterpret_as_s64(const v_float32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val))); }
+inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& v) { return v_float32x4(v.val); }
+inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val)))); }
+
+inline v_uint8x16 v_reinterpret_as_u8(const v_float64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(vreinterpret_v_f64m1_u64m1(v.val))); }
+inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_f64m1_i64m1(v.val))); }
+inline v_uint16x8 v_reinterpret_as_u16(const v_float64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_f64m1_u64m1(v.val))); }
+inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_f64m1_i64m1(v.val))); }
+inline v_uint32x4 v_reinterpret_as_u32(const v_float64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v.val))); }
+inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val))); }
+inline v_uint64x2 v_reinterpret_as_u64(const v_float64x2& v) { return v_uint64x2(vreinterpret_v_f64m1_u64m1(v.val)); }
+inline v_int64x2 v_reinterpret_as_s64(const v_float64x2& v) { return v_int64x2(vreinterpret_v_f64m1_i64m1(v.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val)))); }
+inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64x2(v.val); }
+
 #define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
-inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num((v##_Tp##m1_t){0}); }     \
+inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); }     \
 inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
 
 OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
@@ -254,7 +365,7 @@ OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
 OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4)
 OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2)
 OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2)
-inline v_float32x4 v_setzero_f32() { return v_float32x4((vfloat32m1_t){0}); }
+inline v_float32x4 v_setzero_f32() { return v_float32x4(vfmv_v_f_f32m1(0, 4)); }
 inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); }
 
 inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); }
@@ -379,12 +490,12 @@ inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
 
 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
 {
-    return v_float32x4(vfmacc_vv_f32m1(c.val, a.val, b.val, 4));
+    return v_float32x4(vfmadd_vv_f32m1(a.val, b.val, c.val, 4));
 }
 
 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
 {
-    return v_int32x4(vmacc_vv_i32m1(c.val, a.val, b.val, 4));
+    return v_int32x4(vmadd_vv_i32m1(a.val, b.val, c.val, 4));
 }
 
 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
@@ -401,10 +512,10 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                             const v_float32x4& m1, const v_float32x4& m2,
                             const v_float32x4& m3)
 {
-    vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0);
-    res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
-    res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
-    res = vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
+    vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0);
+    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
+    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
+    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 3, 4), m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
     return v_float32x4(res);
 }
 
@@ -412,9 +523,9 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                                const v_float32x4& m1, const v_float32x4& m2,
                                const v_float32x4& a)
 {
-    vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0);
-    res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
-    res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
+    vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0);
+    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
+    res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
     res = vfadd_vv_f32m1(res, a.val, 4);//vmulai_f32(res, m1.val, v.val, 1);
     return v_float32x4(res);
 }
@@ -442,7 +553,7 @@ inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
 
 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
 {
-    return v_float64x2(vfmacc_vv_f64m1(c.val, a.val, b.val, 2));
+    return v_float64x2(vfmadd_vv_f64m1(a.val, b.val, c.val, 2));
 }
 
 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
@@ -471,11 +582,11 @@ OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2,  i64m1, 2)
 #define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
 { \
-    return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \
 } \
 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
 { \
-    a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \
+    a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \
     return a; \
 }
 
@@ -485,17 +596,17 @@ OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
 
 inline v_float32x4 operator ~ (const v_float32x4& a)
 {
-    return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4)));
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4)));
 }
 
 #define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
 { \
-    return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \
 } \
 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
 { \
-    a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \
+    a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \
     return a; \
 }
 
@@ -505,7 +616,7 @@ OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
 
 inline v_float64x2 operator ~ (const v_float64x2& a)
 {
-    return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2)));
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2)));
 }
 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
 {
@@ -527,19 +638,19 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
 inline v_uint32x4 v_abs(v_int32x4 x)
 {
     vbool32_t mask=vmslt_vx_i32m1_b32(x.val, 0, 4);
-    return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4));
+    return v_uint32x4(vreinterpret_v_i32m1_u32m1(vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4)));
 }
 
 inline v_uint16x8 v_abs(v_int16x8 x)
 {
     vbool16_t mask=vmslt_vx_i16m1_b16(x.val, 0, 8);
-    return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8));
+    return v_uint16x8(vreinterpret_v_i16m1_u16m1(vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8)));
 }
 
 inline v_uint8x16 v_abs(v_int8x16 x)
 {
     vbool8_t mask=vmslt_vx_i8m1_b8(x.val, 0, 16);
-    return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16));
+    return v_uint8x16(vreinterpret_v_i8m1_u8m1(vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16)));
 }
 
 inline v_float32x4 v_abs(v_float32x4 x)
@@ -591,7 +702,7 @@ inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){
 inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){    \
      vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\
      vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\
-    return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(max, min, num));    \
+    return v_uint##_Tpvec(vreinterpret_v_i##_Tpv##_u##_Tpv(vsub_vv_i##_Tpv(max, min, num)));    \
 }
 
 OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
@@ -604,8 +715,8 @@ inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
 {
     vint16m2_t res = vundefined_i16m2();
     res = vwmul_vv_i16m2(a.val, b.val, 16);
-    c.val = vget_i16m2_i16m1(res, 0);
-    d.val = vget_i16m2_i16m1(res, 1);
+    c.val = vget_v_i16m2_i16m1(res, 0);
+    d.val = vget_v_i16m2_i16m1(res, 1);
 }
 
 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
@@ -613,8 +724,8 @@ inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
 {
     vuint16m2_t res = vundefined_u16m2();
     res = vwmulu_vv_u16m2(a.val, b.val, 16);
-    c.val = vget_u16m2_u16m1(res, 0);
-    d.val = vget_u16m2_u16m1(res, 1);
+    c.val = vget_v_u16m2_u16m1(res, 0);
+    d.val = vget_v_u16m2_u16m1(res, 1);
 }
 
 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
@@ -622,8 +733,8 @@ inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
 {
     vint32m2_t res = vundefined_i32m2();
     res = vwmul_vv_i32m2(a.val, b.val, 8);
-    c.val = vget_i32m2_i32m1(res, 0);
-    d.val = vget_i32m2_i32m1(res, 1);
+    c.val = vget_v_i32m2_i32m1(res, 0);
+    d.val = vget_v_i32m2_i32m1(res, 1);
 }
 
 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
@@ -631,8 +742,8 @@ inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
 {
     vuint32m2_t res = vundefined_u32m2();
     res = vwmulu_vv_u32m2(a.val, b.val, 8);
-    c.val = vget_u32m2_u32m1(res, 0);
-    d.val = vget_u32m2_u32m1(res, 1);
+    c.val = vget_v_u32m2_u32m1(res, 0);
+    d.val = vget_v_u32m2_u32m1(res, 1);
 }
 
 inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b,
@@ -640,8 +751,8 @@ inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b,
 {
     vint64m2_t res = vundefined_i64m2();
     res = vwmul_vv_i64m2(a.val, b.val, 4);
-    c.val = vget_i64m2_i64m1(res, 0);
-    d.val = vget_i64m2_i64m1(res, 1);
+    c.val = vget_v_i64m2_i64m1(res, 0);
+    d.val = vget_v_i64m2_i64m1(res, 1);
 }
 
 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
@@ -649,8 +760,8 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
 {
     vuint64m2_t res = vundefined_u64m2();
     res = vwmulu_vv_u64m2(a.val, b.val, 4);
-    c.val = vget_u64m2_u64m1(res, 0);
-    d.val = vget_u64m2_u64m1(res, 1);
+    c.val = vget_v_u64m2_u64m1(res, 0);
+    d.val = vget_v_u64m2_u64m1(res, 1);
 }
 
 OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
@@ -669,118 +780,202 @@ OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
 // 16 >> 32
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 {
+    vuint32m2_t vindex = vundefined_u32m2();
+    vuint32m1_t vindex0 = vid_v_u32m1(4);
+    vindex0 = vsll_vx_u32m1(vindex0, 1, 4);
+    vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0);
+    vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4));
     vint32m2_t res = vundefined_i32m2();
     res = vwmul_vv_i32m2(a.val, b.val, 8);
-    res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
-    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4));
+    res = vrgather_vv_i32m2(res, vindex, 8);
+    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0), vget_v_i32m2_i32m1(res, 1), 4));
 }
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
 {
+    vuint32m2_t vindex = vundefined_u32m2();
+    vuint32m1_t vindex0 = vid_v_u32m1(4);
+    vindex0 = vsll_vx_u32m1(vindex0, 1, 4);
+    vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0);
+    vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4));
     vint32m2_t res = vundefined_i32m2();
     res = vwmul_vv_i32m2(a.val, b.val, 8);
-    res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
-    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0),vget_i32m2_i32m1(res, 1), 4), c.val, 4));
+    res = vrgather_vv_i32m2(res, vindex, 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0),vget_v_i32m2_i32m1(res, 1), 4), c.val, 4));
 }
 
 // 32 >> 64
 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
 {
+    vuint64m2_t vindex = vundefined_u64m2();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 1, 2);
+    vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0);
+    vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2));
     vint64m2_t res = vundefined_i64m2();
     res = vwmul_vv_i64m2(a.val, b.val, 4);
-    res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4);
-    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2));
+    res = vrgather_vv_i64m2(res, vindex, 4);
+    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2));
 }
 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
 {
+    vuint64m2_t vindex = vundefined_u64m2();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 1, 2);
+    vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0);
+    vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2));
     vint64m2_t res = vundefined_i64m2();
     res = vwmul_vv_i64m2(a.val, b.val, 4);
-    res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4);
-    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2), c.val, 2));
+    res = vrgather_vv_i64m2(res, vindex, 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2), c.val, 2));
 }
 
 // 8 >> 32
 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
 {
+    vuint32m4_t vindex32 = vundefined_u32m4();
+    vuint32m1_t vindex0 = vid_v_u32m1(4);
+    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
+    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
     vuint16m2_t v1 = vundefined_u16m2();
     vuint32m2_t v2 = vundefined_u32m2();
     v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
-    v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
-    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
-    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
+    v1 = vrgather_vv_u16m2(v1, vindex, 16);
+    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));
 }
 
 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
                                    const v_uint32x4& c)
 {
+    vuint32m4_t vindex32 = vundefined_u32m4();
+    vuint32m1_t vindex0 = vid_v_u32m1(4);
+    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
+    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
     vuint16m2_t v1 = vundefined_u16m2();
     vuint32m2_t v2 = vundefined_u32m2();
     v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
-    v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
-    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
-    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
+    v1 = vrgather_vv_u16m2(v1, vindex, 16);
+    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));
 }
 
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
 {
+    vuint32m4_t vindex32 = vundefined_u32m4();
+    vuint32m1_t vindex0 = vid_v_u32m1(4);
+    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
+    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
     vint16m2_t v1 = vundefined_i16m2();
     vint32m2_t v2 = vundefined_i32m2();
     v1 = vwmul_vv_i16m2(a.val, b.val, 16);
-    v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
-    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
-    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
+    v1 = vrgather_vv_i16m2(v1, vindex, 16);
+    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));
 }
 
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                    const v_int32x4& c)
 {
+    vuint32m4_t vindex32 = vundefined_u32m4();
+    vuint32m1_t vindex0 = vid_v_u32m1(4);
+    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
+    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
     vint16m2_t v1 = vundefined_i16m2();
     vint32m2_t v2 = vundefined_i32m2();
     v1 = vwmul_vv_i16m2(a.val, b.val, 16);
-    v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
-    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
-    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
+    v1 = vrgather_vv_i16m2(v1, vindex, 16);
+    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));
 }
 
 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
 {
+    vuint64m4_t vindex64 = vundefined_u64m4();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
+    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
     vuint32m2_t v1 = vundefined_u32m2();
     vuint64m2_t v2 = vundefined_u64m2();
     v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
-    v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
-    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
-    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
+    v1 = vrgather_vv_u32m2(v1, vindex, 8);
+    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));
 }
 
 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
                                    const v_uint64x2& c)
 {
+    vuint64m4_t vindex64 = vundefined_u64m4();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
+    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
     vuint32m2_t v1 = vundefined_u32m2();
     vuint64m2_t v2 = vundefined_u64m2();
     v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
-    v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
-    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
-    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
+    v1 = vrgather_vv_u32m2(v1, vindex, 8);
+    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));
 }
 
 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
 {
+    vuint64m4_t vindex64 = vundefined_u64m4();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
+    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
     vint32m2_t v1 = vundefined_i32m2();
     vint64m2_t v2 = vundefined_i64m2();
     v1 = vwmul_vv_i32m2(a.val, b.val, 8);
-    v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
-    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
-    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
+    v1 = vrgather_vv_i32m2(v1, vindex, 8);
+    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));
 }
 
 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
                                    const v_int64x2& c)
 {
+    vuint64m4_t vindex64 = vundefined_u64m4();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
+    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
     vint32m2_t v1 = vundefined_i32m2();
     vint64m2_t v2 = vundefined_i64m2();
     v1 = vwmul_vv_i32m2(a.val, b.val, 8);
-    v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
-    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
-    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
+    v1 = vrgather_vv_i32m2(v1, vindex, 8);
+    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
 }
 
 //////// Fast Dot Product ////////
@@ -789,14 +984,14 @@ inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
 {
     vint32m2_t v1 = vundefined_i32m2();
     v1 = vwmul_vv_i32m2(a.val, b.val, 8);
-    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4));
+    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4));
 }
 
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
 {
     vint32m2_t v1 = vundefined_i32m2();
     v1 = vwmul_vv_i32m2(a.val, b.val, 8);
-    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4), c.val, 4));
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4), c.val, 4));
 }
 
 // 32 >> 64
@@ -804,13 +999,13 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
 {
     vint64m2_t v1 = vundefined_i64m2();
     v1 = vwmul_vv_i64m2(a.val, b.val, 4);
-    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2));
+    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2));
 }
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
 {
     vint64m2_t v1 = vundefined_i64m2();
     v1 = vwmul_vv_i64m2(a.val, b.val, 8);
-    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 4), c.val, 4));
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 4), c.val, 4));
 }
 
 // 8 >> 32
@@ -819,8 +1014,8 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
     vuint16m2_t v1 = vundefined_u16m2();
     vuint32m2_t v2 = vundefined_u32m2();
     v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
-    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
-    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
+    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));
 }
 
 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
@@ -828,8 +1023,8 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
     vuint16m2_t v1 = vundefined_u16m2();
     vuint32m2_t v2 = vundefined_u32m2();
     v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
-    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
-    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
+    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));
 }
 
 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
@@ -837,16 +1032,16 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
     vint16m2_t v1 = vundefined_i16m2();
     vint32m2_t v2 = vundefined_i32m2();
     v1 = vwmul_vv_i16m2(a.val, b.val, 16);
-    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
-    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
+    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));
 }
 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
 {
     vint16m2_t v1 = vundefined_i16m2();
     vint32m2_t v2 = vundefined_i32m2();
     v1 = vwmul_vv_i16m2(a.val, b.val, 16);
-    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
-    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
+    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));
 }
 
 // 16 >> 64
@@ -855,16 +1050,16 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
     vuint32m2_t v1 = vundefined_u32m2();
     vuint64m2_t v2 = vundefined_u64m2();
     v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
-    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
-    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
+    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));
 }
 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
 {
     vuint32m2_t v1 = vundefined_u32m2();
     vuint64m2_t v2 = vundefined_u64m2();
     v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
-    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
-    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
+    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));
 }
 
 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
@@ -872,16 +1067,16 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
     vint32m2_t v1 = vundefined_i32m2();
     vint64m2_t v2 = vundefined_i64m2();
     v1 = vwmul_vv_i32m2(a.val, b.val, 8);
-    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
-    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
+    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));
 }
 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
 {
     vint32m2_t v1 = vundefined_i32m2();
     vint64m2_t v2 = vundefined_i64m2();
     v1 = vwmul_vv_i32m2(a.val, b.val, 8);
-    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
-    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
+    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
 }
 
 
@@ -890,16 +1085,16 @@ inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
 {\
     v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
     val = intrin(val, a.val, val, num);    \
-    return vmv_x_s_##len##m1_##len(val, num);    \
+    return vmv_x_s_##len##m1_##len(val);    \
 }
 
 
-#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) \
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num, scalerfunc) \
 inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
 {\
-    v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \
+    v##_Tpvec##m1_t val = vundefined_##_Tpvec2##m1(); \
     val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num);    \
-    return val[0];    \
+    return scalerfunc(val);    \
 }
 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
@@ -910,30 +1105,30 @@ OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu
 inline float v_reduce_sum(const v_float32x4& a) \
 {\
     vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4); \
-    val = vfredsum_vs_f32m1_f32m1(val, a.val, val, 4);    \
-    return vfmv_f_s_f32m1_f32(val, 4);    \
+    val = vfredosum_vs_f32m1_f32m1(val, a.val, val, 4);    \
+    return vfmv_f_s_f32m1_f32(val);    \
 }
 inline double v_reduce_sum(const v_float64x2& a) \
 {\
     vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2); \
-    val = vfredsum_vs_f64m1_f64m1(val, a.val, val, 2);    \
-    return vfmv_f_s_f64m1_f64(val, 2);    \
+    val = vfredosum_vs_f64m1_f64m1(val, a.val, val, 2);    \
+    return vfmv_f_s_f64m1_f64(val);    \
 }
 inline uint64 v_reduce_sum(const v_uint64x2& a)
-{ return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2)+vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2); }
+{ vuint64m1_t res = vundefined_u64m1(); return vmv_x_s_u64m1_u64(vredsum_vs_u64m1_u64m1(res, a.val, vmv_v_x_u64m1(0, 2), 2)); }
 
 inline int64 v_reduce_sum(const v_int64x2& a)
-{ return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2)+vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2); }
+{ vint64m1_t res = vundefined_i64m1(); return vmv_x_s_i64m1_i64(vredsum_vs_i64m1_i64m1(res, a.val, vmv_v_x_i64m1(0, 2), 2)); }
 
 #define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func)    \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8,  i8, int, func, red##func, 16)    \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8)    \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4)    \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2)    \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8,  u8, unsigned, func, red##func##u, 16)    \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8)    \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4)    \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8,  i8, int, func, red##func, 16, vmv_x_s_i8m1_i8)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8, vmv_x_s_i16m1_i16)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4, vmv_x_s_i32m1_i32)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2, vmv_x_s_i64m1_i64)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8,  u8, unsigned, func, red##func##u, 16, vmv_x_s_u8m1_u8)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8, vmv_x_s_u16m1_u16)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4, vmv_x_s_u32m1_u32)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4, vfmv_f_s_f32m1_f32)
 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)
 
@@ -944,11 +1139,15 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
     vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4);
     vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4);
     vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4);
-    a0 = vfredsum_vs_f32m1_f32m1(a0, a.val, a0, 4);
-    b0 = vfredsum_vs_f32m1_f32m1(b0, b.val, b0, 4);
-    c0 = vfredsum_vs_f32m1_f32m1(c0, c.val, c0, 4);
-    d0 = vfredsum_vs_f32m1_f32m1(d0, d.val, d0, 4);
-    return v_float32x4(a0[0], b0[0], c0[0], d0[0]);
+    a0 = vfredosum_vs_f32m1_f32m1(a0, a.val, a0, 4);
+    b0 = vfredosum_vs_f32m1_f32m1(b0, b.val, b0, 4);
+    c0 = vfredosum_vs_f32m1_f32m1(c0, c.val, c0, 4);
+    d0 = vfredosum_vs_f32m1_f32m1(d0, d.val, d0, 4);
+    vfloat32m1_t res;
+    res = vslideup_vx_f32m1(a0, b0, 1, 4);
+    res = vslideup_vx_f32m1(res, c0, 2, 4);
+    res = vslideup_vx_f32m1(res, d0, 3, 4);
+    return v_float32x4(res);
 }
 
 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
@@ -957,8 +1156,8 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
     vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4);
     vbool32_t mask=vmflt_vf_f32m1_b32(x, 0, 4);
     vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4);
-    a0 = vfredsum_vs_f32m1_f32m1(a0, val, a0, 4);
-    return a0[0];
+    a0 = vfredosum_vs_f32m1_f32m1(a0, val, a0, 4);
+    return vfmv_f_s_f32m1_f32(a0);
 }
 
 #define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
@@ -1020,43 +1219,43 @@ inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
 {
     vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
 }
 inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
 {
     vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
 }
 inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
 {
     vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
 }
 inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
 {
     vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
 }
 inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
 {
     vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
 }
 inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
 {
     vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
-}
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
+}/**/
 inline v_float32x4 v_not_nan(const v_float32x4& a)
 {
-    vbool32_t mask = vmford_vv_f32m1_b32(a.val, a.val, 4);
+    vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, a.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
 }
 
 //TODO: ==
@@ -1064,43 +1263,43 @@ inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
 {
     vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
 }
 inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
 {
     vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
 }
 inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
 {
     vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
 }
 inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
 {
     vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
 }
 inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
 {
     vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
 }
 inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
 {
     vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
-}
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
+}/**/
 inline v_float64x2 v_not_nan(const v_float64x2& a)
 {
-    vbool64_t mask = vmford_vv_f64m1_b64(a.val, a.val, 2);
+    vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, a.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
 }
 #define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
 inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
@@ -1108,16 +1307,23 @@ inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
                          v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
                          v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
 { \
+    vuint32m4_t vindex = vundefined_u32m4(); \
+    vuint32m1_t vindex0 = vid_v_u32m1(4); \
+    vindex0 = vsll_vx_u32m1(vindex0, 2, 4); \
+    vindex = vset_v_u32m1_u32m4(vindex, 0, vindex0); \
+    vindex = vset_v_u32m1_u32m4(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); \
+    vindex = vset_v_u32m1_u32m4(vindex, 2, vadd_vx_u32m1(vindex0, 2, 4)); \
+    vindex = vset_v_u32m1_u32m4(vindex, 3, vadd_vx_u32m1(vindex0, 3, 4)); \
     v##_Tp##32m4_t val = vundefined_##_T##m4();    \
-    val = vset_##_T##m4(val, 0, a0.val);    \
-    val = vset_##_T##m4(val, 1, a1.val);    \
-    val = vset_##_T##m4(val, 2, a2.val);    \
-    val = vset_##_T##m4(val, 3, a3.val);   \
-    val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);    \
-    b0.val = vget_##_T##m4_##_T##m1(val, 0);   \
-    b1.val = vget_##_T##m4_##_T##m1(val, 1);   \
-    b2.val = vget_##_T##m4_##_T##m1(val, 2);   \
-    b3.val = vget_##_T##m4_##_T##m1(val, 3);   \
+    val = vset_v_##_T##m1_##_T##m4(val, 0, a0.val);    \
+    val = vset_v_##_T##m1_##_T##m4(val, 1, a1.val);    \
+    val = vset_v_##_T##m1_##_T##m4(val, 2, a2.val);    \
+    val = vset_v_##_T##m1_##_T##m4(val, 3, a3.val);   \
+    val = vrgather_vv_##_T##m4(val, vindex, 16);    \
+    b0.val = vget_v_##_T##m4_##_T##m1(val, 0);   \
+    b1.val = vget_v_##_T##m4_##_T##m1(val, 1);   \
+    b2.val = vget_v_##_T##m4_##_T##m1(val, 2);   \
+    b3.val = vget_v_##_T##m4_##_T##m1(val, 3);   \
 }
 OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
 OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
@@ -1167,25 +1373,28 @@ template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
 } \
 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
 {     \
-        return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num));\
+        suffix##m1_t res = vundefined_##_T##m1(); \
+        return _Tpvec(vslidedown_vx_##_T##m1(res, a.val, n, num));\
 } \
 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
 { return a; } \
 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
 { \
     suffix##m2_t tmp = vundefined_##_T##m2();    \
-    tmp = vset_##_T##m2(tmp, 0, a.val);          \
-    tmp = vset_##_T##m2(tmp, 1, b.val);          \
-        tmp = vslidedown_vx_##_T##m2(tmp, n, num2);\
-        return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 0));\
+    suffix##m2_t res = vundefined_##_T##m2();    \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a.val);          \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, b.val);          \
+        res = vslidedown_vx_##_T##m2(res, tmp, n, num2);\
+        return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 0));\
 } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
 { \
     suffix##m2_t tmp = vundefined_##_T##m2();    \
-    tmp = vset_##_T##m2(tmp, 0, b.val);    \
-    tmp = vset_##_T##m2(tmp, 1, a.val);    \
-        tmp = vslideup_vx_##_T##m2(tmp, n, num2);\
-        return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 1));\
+    suffix##m2_t res = vundefined_##_T##m2();    \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, b.val);    \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a.val);    \
+        res = vslideup_vx_##_T##m2(res, tmp, n, num2);\
+        return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 1));\
 } \
 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
 { \
@@ -1203,50 +1412,132 @@ OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)
 
-#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) \
+#if 1
+#define vreinterpret_v_i8m1_i8m1
+#define vreinterpret_v_u8m1_u8m1
+#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize, ldst_len, ldst_type) \
 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
 { \
-  typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
-  vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\
-    return _Tpvec(_Tp2##_t(tmp)); } \
+  _Tp2##_t res = vundefined_##len(); \
+  _Tp2##_t res1 = vundefined_##len(); \
+  res = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr0, 8)); \
+  res1 = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr1, 8)); \
+  res = vslideup_vx_##len(res, res1, hnum, num); \
+  return _Tpvec(res); } \
 inline _Tpvec v_load_low(const _Tp* ptr) \
-{ return _Tpvec(vle_v_##len(ptr, hnum)); }\
+{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 8))); }\
 inline _Tpvec v_load_aligned(const _Tp* ptr) \
-{ return _Tpvec(vle_v_##len(ptr, num)); } \
+{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
 inline _Tpvec v_load(const _Tp* ptr) \
-{ return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
-{ vse_v_##len(ptr, a.val, hnum);}\
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\
 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
 { \
-  _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num);    \
-  vse_v_##len(ptr, a0, hnum);}\
+  _Tp2##_t a0 = vundefined_##len(); \
+  a0 = vslidedown_vx_##len(a0, a.val, hnum, num);    \
+  vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\
 inline void v_store(_Tp* ptr, const _Tpvec& a) \
-{ vse_v_##len(ptr, a.val, num); } \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
-{ vse_v_##len(ptr, a.val, num); } \
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
-{ vse_v_##len(ptr, a.val, num); } \
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
-{ vse_v_##len(ptr, a.val, num); }
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); }
+
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8, u8m1, uchar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16,  schar, vint8m1, i8m1, 8, 16, 8, i8m1, schar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16, u8m1, uchar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8,  short,  vint16m1, i16m1, 4, 8, 16, i8m1, schar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32, u8m1, uchar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4,  int,     vint32m1, i32m1, 2, 4, 32, i8m1, schar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64, u8m1, uchar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2,  long,     vint64m1, i64m1, 1, 2, 64, i8m1, schar)
+
+#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+  _Tp2##_t res = vundefined_##len(); \
+  _Tp2##_t res1 = vundefined_##len(); \
+  res = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr0, 8))); \
+  res1 = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr1, 8))); \
+  res = vslideup_vx_##len(res, res1, hnum, num); \
+  return _Tpvec(res); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 8)))); }\
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+  _Tp2##_t a0 = vundefined_##len(); \
+  a0 = vslidedown_vx_##len(a0, a.val, hnum, num);    \
+  vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); }
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
 
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16,  schar, vint8m1, i8m1, 8, 16)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8,  short,  vint16m1, i16m1, 4, 8)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4,  int,     vint32m1, i32m1, 2, 4)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2,  long,     vint64m1, i64m1, 1, 2)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2)
+#else
+
+#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+  _Tp2##_t res, res1; \
+  res = vle##elemsize##_v_##len(ptr0, hnum); \
+  res1 = vle##elemsize##_v_##len(ptr1, hnum); \
+  res = vslideup_vx_##len(res, res1, hnum, num); \
+  return _Tpvec(res); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, hnum)); }\
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec((_Tp2##_t)vle##elemsize##_v_##len((const _Tp *)ptr, num)); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, hnum);}\
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+  _Tp2##_t a0; \
+  a0 = vslidedown_vx_##len(a0, a.val, hnum, num);    \
+  vse##elemsize##_v_##len(ptr, a0, hnum);}\
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); }
+
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16,  schar, vint8m1, i8m1, 8, 16, 8)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8,  short,  vint16m1, i16m1, 4, 8, 16)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4,  int,     vint32m1, i32m1, 2, 4, 32)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2,  long,     vint64m1, i64m1, 1, 2, 64)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
 
+#endif
 
 ////////////// Lookup table access ////////////////////
 
 inline v_int8x16 v_lut(const schar* tab, const int* idx)
 {
-#if 1
+#if 0
     schar CV_DECL_ALIGNED(32) elems[16] =
     {
         tab[idx[ 0]],
@@ -1266,16 +1557,18 @@ inline v_int8x16 v_lut(const schar* tab, const int* idx)
         tab[idx[14]],
         tab[idx[15]]
     };
-    return v_int8x16(vle_v_i8m1(elems, 16));
+    return v_int8x16(vle8_v_i8m1(elems, 16));
 #else
-    int32xm4_t index32 = vlev_int32xm4(idx, 16);
-    vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16);
-    vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16);
-    return v_int8x16(vlxbv_i8m1(tab, index, 16));
+#if __riscv_v == 7000
+    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, vle32_v_u32m4((unsigned int *)idx, 16), 16), 0, 16), 0, 16));
+#else
+    return v_int8x16(vloxei32_v_i8m1(tab, vle32_v_u32m4((unsigned int *)idx, 16), 16));
+#endif
 #endif
 }
 
 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){
+#if 0
     schar CV_DECL_ALIGNED(32) elems[16] =
     {
         tab[idx[0]],
@@ -1295,10 +1588,24 @@ inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){
         tab[idx[7]],
         tab[idx[7] + 1]
     };
-    return v_int8x16(vle_v_i8m1(elems, 16));
+    return v_int8x16(vle8_v_i8m1(elems, 16));
+#else
+    vuint32m4_t seq, index;
+    vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 8);
+    seq = vid_v_u32m4(16);
+    index = vsrl_vx_u32m4(seq, 1, 16);
+    vidx = vrgather_vv_u32m4(vidx, index, 16);
+    index = vadd_vv_u32m4(vand_vx_u32m4(seq, 1, 16), vidx, 16);
+#if __riscv_v == 7000
+    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));
+#else
+    return v_int8x16(vloxei32_v_i8m1(tab, index, 16));
+#endif
+#endif
 }
 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
 {
+#if 0
     schar CV_DECL_ALIGNED(32) elems[16] =
     {
         tab[idx[0]],
@@ -1318,7 +1625,23 @@ inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
         tab[idx[3] + 2],
         tab[idx[3] + 3]
     };
-    return v_int8x16(vle_v_i8m1(elems, 16));
+    return v_int8x16(vle8_v_i8m1(elems, 16));
+#else
+    vuint32m4_t seq, index;
+    vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 4);
+    seq = vid_v_u32m4(16);
+    index = vsrl_vx_u32m4(seq, 2, 16);
+    vidx = vrgather_vv_u32m4(vidx, index, 16);
+    seq = vset_v_u32m1_u32m4(seq, 1, vget_v_u32m4_u32m1(seq, 0));
+    seq = vset_v_u32m1_u32m4(seq, 2, vget_v_u32m4_u32m1(seq, 0));
+    seq = vset_v_u32m1_u32m4(seq, 3, vget_v_u32m4_u32m1(seq, 0));
+    index = vadd_vv_u32m4(seq, vidx, 16);
+#if __riscv_v == 7000
+    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));
+#else
+    return v_int8x16(vloxei32_v_i8m1(tab, index, 16));
+#endif
+#endif
 }
 
 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
@@ -1327,6 +1650,7 @@ inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reint
 
 inline v_int16x8 v_lut(const short* tab, const int* idx)
 {
+#if 0
     short CV_DECL_ALIGNED(32) elems[8] =
     {
         tab[idx[0]],
@@ -1338,10 +1662,18 @@ inline v_int16x8 v_lut(const short* tab, const int* idx)
         tab[idx[6]],
         tab[idx[7]]
     };
-    return v_int16x8(vle_v_i16m1(elems, 8));
+    return v_int16x8(vle16_v_i16m1(elems, 8));
+#else
+#if __riscv_v == 7000
+    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8), 0, 8));
+#else
+    return v_int16x8(vloxei32_v_i16m1(tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8));
+#endif
+#endif
 }
 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
 {
+#if 0
     short CV_DECL_ALIGNED(32) elems[8] =
     {
         tab[idx[0]],
@@ -1353,10 +1685,24 @@ inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
         tab[idx[3]],
         tab[idx[3] + 1]
     };
-    return v_int16x8(vle_v_i16m1(elems, 8));
+    return v_int16x8(vle16_v_i16m1(elems, 8));
+#else
+    vuint32m2_t seq, index;
+    vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 4);
+    seq = vid_v_u32m2(8);
+    index = vsrl_vx_u32m2(seq, 1, 8);
+    vidx = vrgather_vv_u32m2(vidx, index, 8);
+    index = vsll_vx_u32m2(vadd_vv_u32m2(vand_vx_u32m2(seq, 1, 8), vidx, 8), 1, 8);
+#if __riscv_v == 7000
+    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));
+#else
+    return v_int16x8(vloxei32_v_i16m1(tab, index, 8));
+#endif
+#endif
 }
 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
 {
+#if 0
     short CV_DECL_ALIGNED(32) elems[8] =
     {
         tab[idx[0]],
@@ -1368,7 +1714,21 @@ inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
         tab[idx[1] + 2],
         tab[idx[1] + 3]
     };
-    return v_int16x8(vle_v_i16m1(elems, 8));
+    return v_int16x8(vle16_v_i16m1(elems, 8));
+#else
+    vuint32m2_t seq, index;
+    vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 2);
+    seq = vid_v_u32m2(8);
+    index = vsrl_vx_u32m2(seq, 2, 8);
+    vidx = vrgather_vv_u32m2(vidx, index, 8);
+    seq = vset_v_u32m1_u32m2(seq, 1, vget_v_u32m2_u32m1(seq, 0));
+    index = vsll_vx_u32m2(vadd_vv_u32m2(seq, vidx, 8), 1, 8);
+#if __riscv_v == 7000
+    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));
+#else
+    return v_int16x8(vloxei32_v_i16m1(tab, index, 8));
+#endif
+#endif
 }
 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
@@ -1376,6 +1736,7 @@ inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_rein
 
 inline v_int32x4 v_lut(const int* tab, const int* idx)
 {
+#if 0
     int CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idx[0]],
@@ -1383,10 +1744,14 @@ inline v_int32x4 v_lut(const int* tab, const int* idx)
         tab[idx[2]],
         tab[idx[3]]
     };
-    return v_int32x4(vle_v_i32m1(elems, 4));
+    return v_int32x4(vle32_v_i32m1(elems, 4));
+#else
+    return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));
+#endif
 }
 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
 {
+#if 0
     int CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idx[0]],
@@ -1394,11 +1759,20 @@ inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
         tab[idx[1]],
         tab[idx[1] + 1]
     };
-    return v_int32x4(vle_v_i32m1(elems, 4));
+    return v_int32x4(vle32_v_i32m1(elems, 4));
+#else
+    vuint32m1_t seq, index;
+    vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2);
+    seq = vid_v_u32m1(4);
+    index = vsrl_vx_u32m1(seq, 1, 4);
+    vidx = vrgather_vv_u32m1(vidx, index, 4);
+    index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4);
+    return v_int32x4(vloxei32_v_i32m1(tab, index, 4));
+#endif
 }
 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
 {
-    return v_int32x4(vle_v_i32m1(tab+idx[0], 4));
+    return v_int32x4(vle32_v_i32m1(tab+idx[0], 4));
 }
 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
@@ -1406,26 +1780,27 @@ inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_re
 
 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
 {
-    vint64m1_t res = {tab[idx[0]], tab[idx[1]]};
-    return v_int64x2(res);
+    //vint64m1_t res = {tab[idx[0]], tab[idx[1]]};
+    return v_int64x2(vloxei64_v_i64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
 }
 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
 {
-    return v_int64x2(vle_v_i64m1(tab+idx[0], 2));
+    return v_int64x2(vle64_v_i64m1(tab+idx[0], 2));
 }
 
 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
 {
-    vuint64m1_t res = {tab[idx[0]], tab[idx[1]]};
-    return v_uint64x2(res);
+    //vuint64m1_t res = {tab[idx[0]], tab[idx[1]]};
+    return v_uint64x2(vloxei64_v_u64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
 }
 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx)
 {
-    return v_uint64x2(vle_v_u64m1(tab+idx[0], 2));
+    return v_uint64x2(vle64_v_u64m1(tab+idx[0], 2));
 }
 
 inline v_float32x4 v_lut(const float* tab, const int* idx)
 {
+#if 0
     float CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idx[0]],
@@ -1433,10 +1808,14 @@ inline v_float32x4 v_lut(const float* tab, const int* idx)
         tab[idx[2]],
         tab[idx[3]]
     };
-    return v_float32x4(vle_v_f32m1(elems, 4));
+    return v_float32x4(vle32_v_f32m1(elems, 4));
+#else
+    return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));
+#endif
 }
 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
 {
+#if 0
     float CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idx[0]],
@@ -1444,69 +1823,79 @@ inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
         tab[idx[1]],
         tab[idx[1]+1]
     };
-    return v_float32x4(vle_v_f32m1(elems, 4));
+    return v_float32x4(vle32_v_f32m1(elems, 4));
+#else
+    vuint32m1_t seq, index;
+    vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2);
+    seq = vid_v_u32m1(4);
+    index = vsrl_vx_u32m1(seq, 1, 4);
+    vidx = vrgather_vv_u32m1(vidx, index, 4);
+    index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4);
+    return v_float32x4(vloxei32_v_f32m1(tab, index, 4));
+#endif
 }
 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
 {
-    return v_float32x4(vle_v_f32m1(tab + idx[0], 4));
+    return v_float32x4(vle32_v_f32m1(tab + idx[0], 4));
 }
 inline v_float64x2 v_lut(const double* tab, const int* idx)
 {
-    vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};
-    return v_float64x2(res);
+    //vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};
+    return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
 }
 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
 {
-    return v_float64x2(vle_v_f64m1(tab+idx[0], 2));
+    return v_float64x2(vle64_v_f64m1(tab+idx[0], 2));
 }
 
 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
 {
-    int CV_DECL_ALIGNED(32) elems[4] =
+    /*int CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idxvec.val[0]],
         tab[idxvec.val[1]],
         tab[idxvec.val[2]],
         tab[idxvec.val[3]]
-    };
-    return v_int32x4(vle_v_i32m1(elems, 4));
+    };*/
+    return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
 }
 
 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
 {
-    unsigned CV_DECL_ALIGNED(32) elems[4] =
+    /*unsigned CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idxvec.val[0]],
         tab[idxvec.val[1]],
         tab[idxvec.val[2]],
         tab[idxvec.val[3]]
-    };
-    return v_uint32x4(vle_v_u32m1(elems, 4));
+    };*/
+    return v_uint32x4(vloxei32_v_u32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
 }
 
 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
 {
-    float CV_DECL_ALIGNED(32) elems[4] =
+    /*float CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idxvec.val[0]],
         tab[idxvec.val[1]],
         tab[idxvec.val[2]],
         tab[idxvec.val[3]]
-    };
-    return v_float32x4(vle_v_f32m1(elems, 4));
+    };*/
+    return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
 }
 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
 {
-    vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
-    return v_float64x2(res);
+    //vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
+    return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(idxvec.val, 0, 2), 0)), 3, 2), 2));
 }
 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
 {
-    vint32m1_t index_x = vmul_vx_i32m1(idxvec.val, 4, 4);
-    vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);
+    vint32m1_t index = vmul_vx_i32m1(idxvec.val, 4, 4);
+    //vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);
 
-    x.val = vlxe_v_f32m1(tab, index_x, 4);
-    y.val = vlxe_v_f32m1(tab, index_y, 4);
+    //x.val = vlxe_v_f32m1(tab, index_x, 4);
+    //y.val = vlxe_v_f32m1(tab, index_y, 4);
+    vloxseg2ei32_v_f32m1(&x.val, &y.val, tab, vreinterpret_v_i32m1_u32m1(index), 4);
 }
 
 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
@@ -1518,52 +1907,52 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
     y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
 }
 
-#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) \
+#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type, elemsize) \
 inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
 { \
     v##_Tp2##m2_t  tmp = vundefined_##_T2##m2();    \
-    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
-    tmp = vset_##_T2##m2(tmp, 1, b.val);    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val);    \
     return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
 }\
 template<int n> inline \
 v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
 { \
     v##_Tp2##m2_t  tmp = vundefined_##_T2##m2();    \
-    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
-    tmp = vset_##_T2##m2(tmp, 1, b.val);    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val);    \
     return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
 }\
 inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
 { \
     v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
-    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
-    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2));    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2));    \
     asm("" ::: "memory");                                       \
-    vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
+    vse##elemsize##_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
 }\
 template<int n> inline \
 void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
 { \
     v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
-    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
-    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2));    \
-    vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2));    \
+    vse##elemsize##_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
 }
-OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_vx, vnclip_vx, signed char)
-OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short)
-OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int)
-OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char)
-OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short)
-OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int)
+OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_wx, vnclip_wx, signed char, 8)
+OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_wx, vnclip_wx, signed short, 16)
+OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_wx, vnsra_wx, int, 32)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_wx, vnclipu_wx, unsigned char, 8)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_wx, vnclipu_wx, unsigned short, 16)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_wx, vnsrl_wx, unsigned int, 32)
 
 // pack boolean
 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
 {
     vuint16m2_t tmp = vundefined_u16m2();    \
-    tmp = vset_u16m2(tmp, 0, a.val);    \
-    tmp = vset_u16m2(tmp, 1, b.val);    \
-    return v_uint8x16(vnsrl_vx_u8m1(tmp, 0, 16));
+    tmp = vset_v_u16m1_u16m2(tmp, 0, a.val);    \
+    tmp = vset_v_u16m1_u16m2(tmp, 1, b.val);    \
+    return v_uint8x16(vnsrl_wx_u8m1(tmp, 0, 16));
 }
 
 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
@@ -1571,12 +1960,12 @@ inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
 {
     vuint32m4_t vabcd = vundefined_u32m4();    \
     vuint16m2_t v16 = vundefined_u16m2();    \
-    vabcd = vset_u32m4(vabcd, 0, a.val);    \
-    vabcd = vset_u32m4(vabcd, 1, b.val);    \
-    vabcd = vset_u32m4(vabcd, 2, c.val);    \
-    vabcd = vset_u32m4(vabcd, 3, d.val);    \
-    v16 = vnsrl_vx_u16m2(vabcd, 0, 16);
-    return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
+    vabcd = vset_v_u32m1_u32m4(vabcd, 0, a.val);    \
+    vabcd = vset_v_u32m1_u32m4(vabcd, 1, b.val);    \
+    vabcd = vset_v_u32m1_u32m4(vabcd, 2, c.val);    \
+    vabcd = vset_v_u32m1_u32m4(vabcd, 3, d.val);    \
+    v16 = vnsrl_wx_u16m2(vabcd, 0, 16);
+    return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));
 }
 
 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
@@ -1586,17 +1975,17 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin
     vuint64m8_t v64 = vundefined_u64m8();    \
     vuint32m4_t v32 = vundefined_u32m4();    \
     vuint16m2_t v16 = vundefined_u16m2();    \
-    v64 = vset_u64m8(v64, 0, a.val);    \
-    v64 = vset_u64m8(v64, 1, b.val);    \
-    v64 = vset_u64m8(v64, 2, c.val);    \
-    v64 = vset_u64m8(v64, 3, d.val);    \
-    v64 = vset_u64m8(v64, 4, e.val);    \
-    v64 = vset_u64m8(v64, 5, f.val);    \
-    v64 = vset_u64m8(v64, 6, g.val);    \
-    v64 = vset_u64m8(v64, 7, h.val);    \
-    v32 = vnsrl_vx_u32m4(v64, 0, 16);
-    v16 = vnsrl_vx_u16m2(v32, 0, 16);
-    return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
+    v64 = vset_v_u64m1_u64m8(v64, 0, a.val);    \
+    v64 = vset_v_u64m1_u64m8(v64, 1, b.val);    \
+    v64 = vset_v_u64m1_u64m8(v64, 2, c.val);    \
+    v64 = vset_v_u64m1_u64m8(v64, 3, d.val);    \
+    v64 = vset_v_u64m1_u64m8(v64, 4, e.val);    \
+    v64 = vset_v_u64m1_u64m8(v64, 5, f.val);    \
+    v64 = vset_v_u64m1_u64m8(v64, 6, g.val);    \
+    v64 = vset_v_u64m1_u64m8(v64, 7, h.val);    \
+    v32 = vnsrl_wx_u32m4(v64, 0, 16);
+    v16 = vnsrl_wx_u16m2(v32, 0, 16);
+    return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));
 }
 
 //inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) \
@@ -1612,63 +2001,56 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin
 inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
 { \
     vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
-    tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
-    tmp = vset_##i##tp2##m2(tmp, 1, b.val);    \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val);    \
     vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
-    return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1));    \
+    return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1));    \
 } \
 inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
 { \
     vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
-    tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val);    \
     vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
-    return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2);    \
+    return vse##tp1##_v_u##tp1##m1(ptr, vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1), num2);    \
 } \
 template<int n> inline \
 v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
 { \
     vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
-    tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
-    tmp = vset_##i##tp2##m2(tmp, 1, b.val);    \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val);    \
     vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
-    return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1));    \
+    return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), n, num1));    \
 } \
 template<int n> inline \
 void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
 { \
     vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
-    tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val);    \
     vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\
-    vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1);    \
-    return vse_v_u##tp1##m1(ptr, val, num2);\
+    vuint##tp1##m1_t val = vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val_), n, num1);    \
+    return vse##tp1##_v_u##tp1##m1(ptr, val, num2);\
 }
 OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char )
 OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
 
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
 
 // saturating multiply 8-bit, 16-bit
-#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, _Tpwvec)            \
-    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
-    {                                                            \
-        _Tpwvec c, d;                                            \
-        v_mul_expand(a, b, c, d);                                \
-        return v_pack(c, d);                                     \
-    }                                                            \
-    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
+#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt)   \
+    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+    {                                                           \
+        auto res = mul(a.val, b.val, num);                      \
+        return _Tpvec(cvt(res, 0, num));                        \
+    }                                                           \
+    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)     \
     { a = a * b; return a; }
 
-OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16,  v_int16x8)
-OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, v_uint16x8)
-OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8,  v_int32x4)
-OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, v_uint32x4)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16,  16, vwmul_vv_i16m2, vnclip_wx_i8m1)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8,  32, vwmul_vv_i32m2, vnclip_wx_i16m1)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, 32, vwmulu_vv_u32m2, vnclipu_wx_u16m1)
+
 
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
 static const signed char popCountTable[256] =
 {
     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
@@ -1690,8 +2072,12 @@ static const signed char popCountTable[256] =
 };
 
 inline vuint8m1_t vcnt_u8(vuint8m1_t val){
-    vuint8m1_t v0 = val & 1;
-    return vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16)+v0;
+#if __riscv_v == 7000
+    vuint8m1_t v0 = vand_vx_u8m1(val, 1, 16);
+    return vadd_vv_u8m1(vloxei8_v_u8m1((unsigned char*)popCountTable, vsrl_vx_u8m1(val, 1, 16), 16), v0, 16);
+#else
+    return vloxei8_v_u8m1((unsigned char*)popCountTable, val, 16);
+#endif
 }
 
 inline v_uint8x16
@@ -1703,156 +2089,138 @@ v_popcount(const v_uint8x16& a)
 inline v_uint8x16
 v_popcount(const v_int8x16& a)
 {
-    return v_uint8x16(vcnt_u8((vuint8m1_t)a.val));
+    return v_uint8x16(vcnt_u8(vreinterpret_v_i8m1_u8m1(a.val)));
 }
 
 inline v_uint16x8
 v_popcount(const v_uint16x8& a)
 {
-    vuint8m2_t tmp = vundefined_u8m2();
-    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
-    vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
-    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);    \
-    vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
-    return v_uint16x8(vget_u16m2_u16m1(res, 0));
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u16m1_u8m1(a.val));
+    vuint8m1_t seq = vid_v_u8m1(8);
+    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
+    return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0));
 }
 
 inline v_uint16x8
 v_popcount(const v_int16x8& a)
 {
-    vuint8m2_t tmp = vundefined_u8m2();
-    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
-    vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
-    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);    \
-    vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
-    return v_uint16x8(vget_u16m2_u16m1(res, 0));
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(a.val)));
+    vuint8m1_t seq = vid_v_u8m1(8);
+    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
+    return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0));
 }
 
 inline v_uint32x4
 v_popcount(const v_uint32x4& a)
 {
-    vuint8m2_t tmp = vundefined_u8m2();
-    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
-    vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
-                     0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
-    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);    \
-    vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);
-    vuint32m2_t res  = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);
-    return v_uint32x4(vget_u32m2_u32m1(res, 0));
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u32m1_u8m1(a.val));
+    vuint8m1_t seq = vid_v_u8m1(8);
+    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
+    vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8);
+    return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0));
 }
 
 inline v_uint32x4
 v_popcount(const v_int32x4& a)
 {
-    vuint8m2_t tmp = vundefined_u8m2();
-    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
-    vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
-                     0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
-    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);    \
-    vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);
-    vuint32m2_t res  = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);
-    return v_uint32x4(vget_u32m2_u32m1(res, 0));
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(a.val)));
+    vuint8m1_t seq = vid_v_u8m1(8);
+    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
+    vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8);
+    return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0));
 }
 
 inline v_uint64x2
 v_popcount(const v_uint64x2& a)
 {
-    vuint8m2_t tmp = vundefined_u8m2();
-    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
-    vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
-                     0x0F0E0D0C0B0A0908, 0x0000000000000000};
-    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);    \
-    vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
-    vuint8m1_t res1 = zero;
-    vuint8m1_t res2 = zero;
-    res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);
-    res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);
-
-    return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u64m1_u8m1(a.val));
+    vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16);
+    vuint16m1_t res1 = vundefined_u16m1();
+    vuint16m1_t res2 = vundefined_u16m1();
+    res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8);
+    res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8);
+    return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2));
 }
 
 inline v_uint64x2
 v_popcount(const v_int64x2& a)
 {
-    vuint8m2_t tmp = vundefined_u8m2();
-    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
-    vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
-                     0x0F0E0D0C0B0A0908, 0x0000000000000000};
-    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);    \
-    vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
-    vuint8m1_t res1 = zero;
-    vuint8m1_t res2 = zero;
-    res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);
-    res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);
-
-    return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(a.val)));
+    vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16);
+    vuint16m1_t res1 = vundefined_u16m1(), res2 = vundefined_u16m1();
+    res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8);
+    res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8);
+    return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2));
 }
 
 #define SMASK 1, 2, 4, 8, 16, 32, 64, 128
 inline int v_signmask(const v_uint8x16& a)
 {
+    vuint16m1_t res = vundefined_u16m1();
+    vuint8m1_t id = vid_v_u8m1(16);
+    vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16);
     vuint8m1_t t0  = vsrl_vx_u8m1(a.val, 7, 16);
-    vuint8m1_t m1  = (vuint8m1_t){SMASK, SMASK};
-    vuint16m2_t t1 = vwmulu_vv_u16m2(t0, m1, 16);
-    vuint32m1_t res = vmv_v_x_u32m1(0, 4);
-    vuint32m2_t t2 = vwmulu_vx_u32m2(vget_u16m2_u16m1(t1, 1), 256, 8);
-    res = vredsum_vs_u32m2_u32m1(res, t2, res, 8);
-    res = vwredsumu_vs_u16m1_u32m1(res, vget_u16m2_u16m1(t1, 0), res, 8);
-    return vmv_x_s_u32m1_u32(res, 8);
+    vbool8_t mask = vmseq_vx_u8m1_b8(t0, 1, 16);
+    res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
+    return vmv_x_s_u16m1_u16(res);
 }
 inline int v_signmask(const v_int8x16& a)
 {
-    vuint8m1_t t0 = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16);
-    vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK};
-    vint16m2_t t1 = (vint16m2_t)vwmulu_vv_u16m2(t0, m1, 16);
-    vint32m1_t res = vmv_v_x_i32m1(0, 4);
-    vint32m2_t t2 = vwmul_vx_i32m2(vget_i16m2_i16m1(t1, 1), 256, 8);
-    res = vredsum_vs_i32m2_i32m1(res, t2, res, 8);
-    res = vwredsum_vs_i16m1_i32m1(res, vget_i16m2_i16m1(t1, 0), res, 8);
-    return vmv_x_s_i32m1_i32(res, 8);
+    vuint16m1_t res = vundefined_u16m1();
+    vuint8m1_t id = vid_v_u8m1(16);
+    vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16);
+    vbool8_t mask = vmslt_vx_i8m1_b8(a.val, 0, 16);
+    res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
+    return vmv_x_s_u16m1_u16(res);
 }
 
 inline int v_signmask(const v_int16x8& a)
 {
-    vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
-    vint16m1_t m1 = (vint16m1_t){SMASK};
-    vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
-    vint16m1_t res = vmv_v_x_i16m1(0, 8);
-    res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
-    return vmv_x_s_i16m1_i16(res, 8);
+    vuint16m1_t res = vundefined_u16m1();
+    vuint16m1_t id = vid_v_u16m1(8);
+    vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8);
+    vbool16_t mask = vmslt_vx_i16m1_b16(a.val, 0, 8);
+    res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16);
+    return vmv_x_s_u16m1_u16(res);
 }
 inline int v_signmask(const v_uint16x8& a)
 {
-    vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
-    vint16m1_t m1 = (vint16m1_t){SMASK};
-    vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
-    vint16m1_t res = vmv_v_x_i16m1(0, 8);
-    res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
-    return vmv_x_s_i16m1_i16(res, 8);
+    vuint16m1_t res = vundefined_u16m1();
+    vuint16m1_t id = vid_v_u16m1(8);
+    vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8);
+    vuint16m1_t t0  = vsrl_vx_u16m1(a.val, 15, 8);
+    vbool16_t mask = vmseq_vx_u16m1_b16(t0, 1, 8);
+    res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8);
+    return vmv_x_s_u16m1_u16(res);
 }
 inline int v_signmask(const v_int32x4& a)
 {
-    vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
-    vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
-    vint32m1_t res = vmv_v_x_i32m1(0, 4);
-    vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
-    res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
-    return vmv_x_s_i32m1_i32(res, 4);
+    vuint32m1_t res = vundefined_u32m1();
+    vuint32m1_t id = vid_v_u32m1(4);
+    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
+    vbool32_t mask = vmslt_vx_i32m1_b32(a.val, 0, 4);
+    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
+    return vmv_x_s_u32m1_u32(res);
 }
 inline int v_signmask(const v_uint32x4& a)
 {
-    vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4);
-    vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
-    vint32m1_t res = vmv_v_x_i32m1(0, 4);
-    vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
-    res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
-    return vmv_x_s_i32m1_i32(res, 4);
+    vuint32m1_t res = vundefined_u32m1();
+    vuint32m1_t id = vid_v_u32m1(4);
+    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
+    vuint32m1_t t0  = vsrl_vx_u32m1(a.val, 31, 4);
+    vbool32_t mask = vmseq_vx_u32m1_b32(t0, 1, 4);
+    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
+    return vmv_x_s_u32m1_u32(res);
 }
 inline int v_signmask(const v_uint64x2& a)
 {
-    vuint64m1_t v0 = vsrl_vx_u64m1(a.val, 63, 2);
-    int res = (int)vext_x_v_u64m1_u64(v0, 0, 2) + ((int)vext_x_v_u64m1_u64(v0, 1, 2) << 1);
-    return res;
+    vuint64m1_t res = vundefined_u64m1();
+    vuint64m1_t id = vid_v_u64m1(2);
+    vuint64m1_t num = vsll_vv_u64m1(vmv_v_x_u64m1(1, 2), id, 2);
+    vuint64m1_t t0  = vsrl_vx_u64m1(a.val, 63, 2);
+    vbool64_t mask = vmseq_vx_u64m1_b64(t0, 1, 2);
+    res = vredsum_vs_u64m1_u64m1_m(mask, res, num, vmv_v_x_u64m1(0, 2), 2);
+    return vmv_x_s_u64m1_u64(res);
 }
 inline int v_signmask(const v_int64x2& a)
 { return v_signmask(v_reinterpret_as_u64(a)); }
@@ -1860,12 +2228,14 @@ inline int v_signmask(const v_float64x2& a)
 { return v_signmask(v_reinterpret_as_u64(a)); }
 inline int v_signmask(const v_float32x4& a)
 {
-    vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
-    vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
-    vint32m1_t res = vmv_v_x_i32m1(0, 4);
-    vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
-    res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
-    return vmv_x_s_i32m1_i32(res, 4);
+    return v_signmask(v_reinterpret_as_u32(a));
+    /*
+    vuint32m1_t res;
+    vuint32m1_t id = vid_v_u32m1(4);
+    vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4);
+    vbool32_t mask = vmflt_vf_f32m1_b32(a.val, 0, 4);
+    res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4);
+    return vmv_x_s_u32m1_u32(res);*/
 }
 
 inline int v_scan_forward(const v_int8x16& a) {
@@ -1905,24 +2275,22 @@ int val = v_signmask(a);
 if(val==0) return 0;
 else return trailingZeros32(val); }
 
-#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \
+#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num, mask_b) \
 inline bool v_check_all(const v_##_Tpvec& a) \
 { \
     suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \
-    vuint32m1_t v1 = vuint32m1_t(v0); \
-    return (v1[0] | v1[1] | v1[2] | v1[3]) == 0; \
+    return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) == 0; \
 } \
 inline bool v_check_any(const v_##_Tpvec& a) \
 { \
     suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \
-    vuint32m1_t v1 = vuint32m1_t(v0); \
-    return (v1[0] | v1[1] | v1[2] | v1[3]) != 0; \
+    return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) != 0; \
 }
 
-OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8,  u8m1, 7, 16)
-OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8)
-OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4)
-OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8,  u8m1, 7, 16, b8)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8, b16)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4, b32)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2, b64)
 
 inline bool v_check_all(const v_int8x16& a)
 { return v_check_all(v_reinterpret_as_u8(a)); }
@@ -1950,92 +2318,93 @@ inline bool v_check_any(const v_int64x2& a)
 inline bool v_check_any(const v_float64x2& a)
 { return v_check_any(v_reinterpret_as_u64(a)); }
 
-#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) \
+#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num, mask_func) \
 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
 { \
-    return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); \
+    return _Tpvec(vmerge_vvm_##suffix(mask_func(mask.val, 0, num), b.val, a.val, num)); \
 }
 
-OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16,  i8m1, vbool8_t, 16)
-OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8,  i16m1, vbool16_t, 8)
-OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4,  i32m1, vbool32_t, 4)
-OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16)
-OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8)
-OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16,  i8m1, vbool8_t, 16, vmsne_vx_i8m1_b8)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8,  i16m1, vbool16_t, 8, vmsne_vx_i16m1_b16)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4,  i32m1, vbool32_t, 4, vmsne_vx_i32m1_b32)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16, vmsne_vx_u8m1_b8)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8, vmsne_vx_u16m1_b16)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4, vmsne_vx_u32m1_b32)
 inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b)
 {
-    return v_float32x4((vfloat32m1_t)vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4));
+    return v_float32x4(vmerge_vvm_f32m1(vmfne_vf_f32m1_b32(mask.val, 0, 4), b.val, a.val, 4));
 }
 inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b)
 {
-    return v_float64x2((vfloat64m1_t)vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2));
+    return v_float64x2(vmerge_vvm_f64m1(vmfne_vf_f64m1_b64(mask.val, 0, 2), b.val, a.val, 2));
 }
 
-#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) \
+#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2, num3) \
 inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \
 { \
-    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1);    \
-    b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0);  \
-    b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1);  \
+    _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1);    \
+    b0.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 0);  \
+    b1.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 1);  \
 } \
 inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \
 { \
-    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2);    \
-    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
+    _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num2);    \
+    return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \
 } \
 inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \
 { \
-    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1);    \
-    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \
+    _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1);    \
+    return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 1)); \
 } \
 inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \
 { \
-    _T2##_t val = vle##_v_##_Tp1(ptr, num2);    \
-    _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2);    \
-    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
+    _T2##_t val = vle##num3##_v_##_Tp1(ptr, num2);    \
+    _T1##_t b = vw##add##_vx_##_Tp2##m2(val, 0, num2);    \
+    return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \
 }
 
-OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1)
-OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort,  u16m1, 8, u32, 4, vuint32m2, vuint16m1)
-OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint,  u32m1, 4, u64, 2, vuint64m2, vuint32m1)
-OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar,  i8m1, 16, i16, 8, vint16m2, vint8m1)
-OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short,  i16m1, 8, i32, 4, vint32m2, vint16m1)
-OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int,  i32m1, 4, i64, 2, vint64m2, vint32m1)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1, 8)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort,  u16m1, 8, u32, 4, vuint32m2, vuint16m1, 16)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint,  u32m1, 4, u64, 2, vuint64m2, vuint32m1, 32)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar,  i8m1, 16, i16, 8, vint16m2, vint8m1, 8)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short,  i16m1, 8, i32, 4, vint32m2, vint16m1, 16)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int,  i32m1, 4, i64, 2, vint64m2, vint32m1, 32)
 
 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
 {
     vuint16m2_t b = vundefined_u16m2();
     vuint32m2_t c = vundefined_u32m2();
-    vuint8m1_t val = vle_v_u8m1(ptr, 4);    \
+    vuint8m1_t val = vle8_v_u8m1(ptr, 4);    \
     b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4);    \
-    c = vwaddu_vv_u32m2(vget_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4);    \
-    return v_uint32x4(vget_u32m2_u32m1(c, 0));
+    c = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4);    \
+    return v_uint32x4(vget_v_u32m2_u32m1(c, 0));
 }
 
 inline v_int32x4 v_load_expand_q(const schar* ptr)
 {
     vint16m2_t b = vundefined_i16m2();
     vint32m2_t c = vundefined_i32m2();
-    vint8m1_t val = vle_v_i8m1(ptr, 4);    \
+    vint8m1_t val = vle8_v_i8m1(ptr, 4);    \
     b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4);    \
-    c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4);    \
-    return v_int32x4(vget_i32m2_i32m1(c, 0));
+    c = vwadd_vv_i32m2(vget_v_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4);    \
+    return v_int32x4(vget_v_i32m2_i32m1(c, 0));
 }
-#define VITL_16 (vuint32m2_t){0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E}
-#define VITL_8 (vuint32m2_t){0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007}
-#define VITL_4 (vuint32m2_t){0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007}
-#define VITL_2 (vuint32m2_t){0, 0, 2, 0, 1, 0, 3, 0}
+#define VITL_16 {0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E}
+#define VITL_8 {0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007}
+#define VITL_4 {0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007}
+#define VITL_2 {0, 0, 2, 0, 1, 0, 3, 0}
 
-#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \
+#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh, refunc) \
 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
 { \
     v##_Tp##m2_t tmp = vundefined_##_T##m2();\
-    tmp = vset_##_T##m2(tmp, 0, a0.val); \
-    tmp = vset_##_T##m2(tmp, 1, a1.val); \
-    vuint32m2_t mask = VITL_##num;    \
-    tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2);    \
-    b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \
-    b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a0.val); \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a1.val); \
+    unsigned mdata[] = VITL_##num; \
+    vuint32m2_t mask = vle32_v_u32m2(mdata, 8);    \
+    tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, refunc(mask), num2);    \
+    b0.val = vget_v_##_T##m2_##_T##m1(tmp, 0); \
+    b1.val = vget_v_##_T##m2_##_T##m1(tmp, 1); \
 } \
 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
 { \
@@ -2044,58 +2413,59 @@ inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
 } \
 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
 { \
-    v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num);    \
-    v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num);    \
-    v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num);    \
+    v##_Tp##m1_t b0 = vundefined_##_T##m1(); \
+    v##_Tp##m1_t a0 = vundefined_##_T##m1(); \
+    v##_Tp##m1_t b1 = vundefined_##_T##m1(); \
+    b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num);    \
+    a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num);    \
+    b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num);    \
     return v_##_Tpvec(b1);\
 } \
 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
 { \
+    v##_Tp##m1_t b0 = vundefined_##_T##m1(); \
+    v##_Tp##m1_t a0 = vundefined_##_T##m1(); \
     c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num);    \
-    v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num);    \
-    v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num);    \
+    b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num);    \
+    a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num);    \
     d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num);    \
 }
 
-OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8)
-OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8)
-OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4)
-OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4)
-OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2)
-OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2)
-OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2)
-OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2,)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2,)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2,)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1, vreinterpret_v_u32m2_u64m2)
 
 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
 {
-    vuint64m1_t mask = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
-    return v_uint8x16(vrgather_vv_u8m1(a.val, (vuint8m1_t)mask, 16));
+    return v_uint8x16(vrgather_vv_u8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16));
 }
 inline v_int8x16 v_reverse(const v_int8x16 &a)
 {
-    vint64m1_t mask = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
-    return v_int8x16(vrgather_vv_i8m1(a.val, (vuint8m1_t)mask, 16));
+    return v_int8x16(vrgather_vv_i8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16));
 }
 
 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
 {
-    vuint64m1_t mask = (vuint64m1_t){0x0004000500060007, 0x000000100020003};
-    return v_uint16x8(vrgather_vv_u16m1(a.val, (vuint16m1_t)mask, 8));
+    return v_uint16x8(vrgather_vv_u16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8));
 }
 
 inline v_int16x8 v_reverse(const v_int16x8 &a)
 {
-    vint64m1_t mask = (vint64m1_t){0x0004000500060007, 0x000000100020003};
-    return v_int16x8(vrgather_vv_i16m1(a.val, (vuint16m1_t)mask, 8));
+    return v_int16x8(vrgather_vv_i16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8));
 }
 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
 {
-    return v_uint32x4(vrgather_vv_u32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
+    return v_uint32x4(vrgather_vv_u32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4));
 }
 
 inline v_int32x4 v_reverse(const v_int32x4 &a)
 {
-    return v_int32x4(vrgather_vv_i32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
+    return v_int32x4(vrgather_vv_i32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4));
 }
 
 inline v_float32x4 v_reverse(const v_float32x4 &a)
@@ -2103,17 +2473,17 @@ inline v_float32x4 v_reverse(const v_float32x4 &a)
 
 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
 {
-    return v_uint64x2(a.val[1], a.val[0]);
+    return v_uint64x2(vrgather_vv_u64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));
 }
 
 inline v_int64x2 v_reverse(const v_int64x2 &a)
 {
-    return v_int64x2(a.val[1], a.val[0]);
+    return v_int64x2(vrgather_vv_i64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));
 }
 
 inline v_float64x2 v_reverse(const v_float64x2 &a)
 {
-    return v_float64x2(a.val[1], a.val[0]);
+    return v_float64x2(vrgather_vv_f64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2));
 }
 
 #define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \
@@ -2132,19 +2502,19 @@ OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
 
 
-#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) \
-template<int i> inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; }
+#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix, vtype, _vtype, num, mvfunc) \
+template<int i> inline _Tp v_extract_n(_Tpvec v) { vtype tmp = vundefined_##_vtype(); return mvfunc(vslidedown_vx_##_vtype(tmp, v.val, i, num)); }
 
-OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8)
-OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8)
-OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16)
-OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32)
-OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32)
-OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64)
-OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64)
-OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32)
-OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8, vuint8m1_t, u8m1, 16, vmv_x_s_u8m1_u8)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8, vint8m1_t, i8m1, 16, vmv_x_s_i8m1_i8)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16, vuint16m1_t, u16m1, 8, vmv_x_s_u16m1_u16)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16, vint16m1_t, i16m1, 8, vmv_x_s_i16m1_i16)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32, vuint32m1_t, u32m1, 4, vmv_x_s_u32m1_u32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32, vint32m1_t, i32m1, 4, vmv_x_s_i32m1_i32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64, vuint64m1_t, u64m1, 2, vmv_x_s_u64m1_u64)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64, vint64m1_t, i64m1, 2, vmv_x_s_i64m1_i64)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32, vfloat32m1_t, f32m1, 4, vfmv_f_s_f32m1_f32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64, vfloat64m1_t, f64m1, 2, vfmv_f_s_f64m1_f64)
 
 #define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \
 template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }
@@ -2158,10 +2528,24 @@ OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4)
+
+inline void __builtin_riscv_fsrm(int val)
+{
+    asm("csrw frm, %0\n\t"
+        :
+        :"r"(val));
+    return;
+}
+
+inline void barrier1(void *arg) {
+  __asm__ __volatile__("" : : "r" (arg) : "memory");
+}
+
 inline v_int32x4 v_round(const v_float32x4& a)
 {
     __builtin_riscv_fsrm(0);
-    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
     __builtin_riscv_fsrm(0);
@@ -2170,7 +2554,8 @@ inline v_int32x4 v_round(const v_float32x4& a)
 inline v_int32x4 v_floor(const v_float32x4& a)
 {
     __builtin_riscv_fsrm(2);
-    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
     __builtin_riscv_fsrm(0);
@@ -2180,7 +2565,8 @@ inline v_int32x4 v_floor(const v_float32x4& a)
 inline v_int32x4 v_ceil(const v_float32x4& a)
 {
     __builtin_riscv_fsrm(3);
-    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
     __builtin_riscv_fsrm(0);
@@ -2190,7 +2576,8 @@ inline v_int32x4 v_ceil(const v_float32x4& a)
 inline v_int32x4 v_trunc(const v_float32x4& a)
 {
     __builtin_riscv_fsrm(1);
-    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
     __builtin_riscv_fsrm(0);
@@ -2201,10 +2588,11 @@ inline v_int32x4 v_round(const v_float64x2& a)
 {
     __builtin_riscv_fsrm(0);
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
     //_val = vset_f64m2(_val, 1, a.val);
-    _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
-    vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
+    _val = vset_v_f64m1_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
+    barrier1(&_val);
+    vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4);
     __builtin_riscv_fsrm(0);
     return v_int32x4(val);
 }
@@ -2212,9 +2600,10 @@ inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
 {
     __builtin_riscv_fsrm(0);
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
-    _val = vset_f64m2(_val, 1, b.val);
-    vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    _val = vset_v_f64m1_f64m2(_val, 1, b.val);
+    barrier1(&_val);
+    vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4);
     __builtin_riscv_fsrm(0);
     return v_int32x4(val);
 }
@@ -2222,10 +2611,10 @@ inline v_int32x4 v_floor(const v_float64x2& a)
 {
     __builtin_riscv_fsrm(2);
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
-    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
-
-    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
     __builtin_riscv_fsrm(0);
@@ -2236,10 +2625,10 @@ inline v_int32x4 v_ceil(const v_float64x2& a)
 {
     __builtin_riscv_fsrm(3);
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
-    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
-
-    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
     __builtin_riscv_fsrm(0);
@@ -2250,139 +2639,86 @@ inline v_int32x4 v_trunc(const v_float64x2& a)
 {
     __builtin_riscv_fsrm(1);
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
-    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
-
-    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
+    vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4);
+    barrier1(&nan);
     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
     __builtin_riscv_fsrm(0);
     return v_int32x4(val);
 }
 
-#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T)    \
+#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize)    \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
 { \
-    v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num);\
-    a.val = vget_##_T##m1x2_##_T##m1(ret, 0);  \
-    b.val = vget_##_T##m1x2_##_T##m1(ret, 1);  \
+    intrin##2e##elemsize##_v_##_T##m1(&a.val, &b.val, ptr, num); \
 } \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
 { \
-    v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num);\
-    a.val = vget_##_T##m1x3_##_T##m1(ret, 0);  \
-    b.val = vget_##_T##m1x3_##_T##m1(ret, 1);  \
-    c.val = vget_##_T##m1x3_##_T##m1(ret, 2);  \
+    intrin##3e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num); \
 }\
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                 v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
 { \
-    v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num);\
-    a.val = vget_##_T##m1x4_##_T##m1(ret, 0);  \
-    b.val = vget_##_T##m1x4_##_T##m1(ret, 1);  \
-    c.val = vget_##_T##m1x4_##_T##m1(ret, 2);  \
-    d.val = vget_##_T##m1x4_##_T##m1(ret, 3);  \
+    intrin##4e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num); \
 } \
 
-#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T)    \
+#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize)    \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { \
-    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2();      \
-    ret = vset_##_T##m1x2(ret, 0, a.val);  \
-    ret = vset_##_T##m1x2(ret, 1, b.val);  \
-    intrin##2e_v_##_T##m1x2(ptr, ret, num); \
+    intrin##2e##elemsize##_v_##_T##m1(ptr, a.val, b.val, num); \
 } \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                 const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { \
-    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3();       \
-    ret = vset_##_T##m1x3(ret, 0, a.val);  \
-    ret = vset_##_T##m1x3(ret, 1, b.val);  \
-    ret = vset_##_T##m1x3(ret, 2, c.val);  \
-    intrin##3e_v_##_T##m1x3(ptr, ret, num); \
+    intrin##3e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, num); \
 } \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                 const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
 { \
-    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4();             \
-    ret = vset_##_T##m1x4(ret, 0, a.val);  \
-    ret = vset_##_T##m1x4(ret, 1, b.val);  \
-    ret = vset_##_T##m1x4(ret, 2, c.val);  \
-    ret = vset_##_T##m1x4(ret, 3, d.val);  \
-    intrin##4e_v_##_T##m1x4(ptr, ret, num); \
+    intrin##4e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num); \
 }
 
-#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) \
-OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T)    \
-OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T)
+#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T, elemsize) \
+OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T, elemsize)    \
+OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T, elemsize)
 
 //OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, )
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8)
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16)
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8, 8)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16, 16)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32, 32)
 
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8)
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16)
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8, 8)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16, 16)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32, 32)
 
-#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) \
+#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T, _esize) \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
-{ \
-    v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \
-    a.val = vget_##_T##m1x2_##_T##m1(ret, 0);  \
-    b.val = vget_##_T##m1x2_##_T##m1(ret, 1);  \
-} \
+{ vlseg2e##_esize##_v_##_T##m1(&a.val, &b.val, ptr, num);} \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
-{ \
-    v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num);    \
-    a.val = vget_##_T##m1x3_##_T##m1(ret, 0);  \
-    b.val = vget_##_T##m1x3_##_T##m1(ret, 1);  \
-    c.val = vget_##_T##m1x3_##_T##m1(ret, 2);  \
-}\
+{ vlseg3e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num);}\
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                 v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
-{ \
-    v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num);    \
-    a.val = vget_##_T##m1x4_##_T##m1(ret, 0);  \
-    b.val = vget_##_T##m1x4_##_T##m1(ret, 1);  \
-    c.val = vget_##_T##m1x4_##_T##m1(ret, 2);  \
-    d.val = vget_##_T##m1x4_##_T##m1(ret, 3);  \
-} \
+{ vlseg4e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num);} \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
-{ \
-    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2();    \
-    ret = vset_##_T##m1x2(ret, 0, a.val);  \
-    ret = vset_##_T##m1x2(ret, 1, b.val);  \
-    vsseg2e_v_##_T##m1x2(ptr, ret, num);    \
-} \
+{ vsseg2e##_esize##_v_##_T##m1(ptr, a.val, b.val, num);} \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                 const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
-{ \
-    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3();    \
-    ret = vset_##_T##m1x3(ret, 0, a.val);  \
-    ret = vset_##_T##m1x3(ret, 1, b.val);  \
-    ret = vset_##_T##m1x3(ret, 2, c.val);  \
-    vsseg3e_v_##_T##m1x3(ptr, ret, num);    \
-} \
+{ vsseg3e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, num);} \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                 const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
-{ \
-    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4();    \
-    ret = vset_##_T##m1x4(ret, 0, a.val);  \
-    ret = vset_##_T##m1x4(ret, 1, b.val);  \
-    ret = vset_##_T##m1x4(ret, 2, c.val);  \
-    ret = vset_##_T##m1x4(ret, 3, d.val);  \
-    vsseg4e_v_##_T##m1x4(ptr, ret, num);    \
-}
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32)
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64)
+{ vsseg4e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num);}
 
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64)
-OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32, 32)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64, 64)
+
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64, 64)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64, 64)
 
 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
 {
@@ -2393,17 +2729,17 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a)
 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
 {
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
-    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2);
     return v_float32x4(aval);
 }
 
 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
 {
     vfloat64m2_t _val = vundefined_f64m2();
-    _val = vset_f64m2(_val, 0, a.val);
-    _val = vset_f64m2(_val, 1, b.val);
-    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4);
+    _val = vset_v_f64m1_f64m2(_val, 0, a.val);
+    _val = vset_v_f64m1_f64m2(_val, 1, b.val);
+    vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 4);
     return v_float32x4(aval);
 }
 
@@ -2411,26 +2747,26 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a)
 {
     vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
     vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
-    return v_float64x2(vget_f64m2_f64m1(_val, 0));
+    return v_float64x2(vget_v_f64m2_f64m1(_val, 0));
 }
 
 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
 {
     vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
     vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
-    return v_float64x2(vget_f64m2_f64m1(_val, 1));
+    return v_float64x2(vget_v_f64m2_f64m1(_val, 1));
 }
 
 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
 {
     vfloat64m2_t _val  = vfwcvt_f_f_v_f64m2(a.val, 4);
-    return v_float64x2(vget_f64m2_f64m1(_val, 0));
+    return v_float64x2(vget_v_f64m2_f64m1(_val, 0));
 }
 
 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 {
     vfloat64m2_t _val  = vfwcvt_f_f_v_f64m2(a.val, 4);
-    return v_float64x2(vget_f64m2_f64m1(_val, 1));
+    return v_float64x2(vget_v_f64m2_f64m1(_val, 1));
 }
 
 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
@@ -2441,8 +2777,9 @@ inline v_float64x2 v_cvt_f64(const v_int64x2& a)
 #endif
 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
 {
-    vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08};
-    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0705060403010200, 0x0F0D0E0C0B090A08};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));
 }
 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
 {
@@ -2451,8 +2788,9 @@ inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
 
 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
 {
-    vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08};
-    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0703060205010400, 0x0F0B0E0A0D090C08};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16));
 }
 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
 {
@@ -2461,35 +2799,40 @@ inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
 
 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
 {
-    vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
-    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
 }
 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
 {
-    vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504};
-    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0B0A030209080100, 0x0F0E07060D0C0504};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
 }
 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
 
 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
 {
-    vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504};
-    return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0B0A090803020100, 0x0F0E0D0C07060504};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int32x4(vreinterpret_v_i8m1_i32m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
 }
 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
 {
-    vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
-    return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int8x16(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vec.val), vreinterpret_v_u64m1_u8m1(m0), 16)));
 }
 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
 
 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
 {
-    vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
-    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+    uint64 mdata[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
+    vuint64m1_t m0 = vle64_v_u64m1(mdata, 2);
+    return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16))));
 }
 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
 
@@ -2506,7 +2849,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a,   const v_int32x4& b,
 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
 {
     vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
-    vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), 2);
+    vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), 2);
     return v_float64x2(res);
 }
 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
@@ -2514,21 +2857,37 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b,
   return res + c; }
 #endif
 ////// FP16 support ///////
-inline v_float32x4 v_load_expand(const float16_t* ptr)
+#if __riscv_v == 7000
+inline v_float32x4 v_load_expand(const hfloat* ptr)
 {
-    vfloat16m1_t v = vle_v_f16m1((__fp16*)ptr, 4);
+    vfloat16m1_t v = vle16_v_f16m1((__fp16*)ptr, 4);
     vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4);
-    return v_float32x4(vget_f32m2_f32m1(v32, 0));
+    return v_float32x4(vget_v_f32m2_f32m1(v32, 0));
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 {
     vfloat32m2_t v32 = vundefined_f32m2();
-    v32 = vset_f32m2(v32, 0, v.val);
-    vfloat16m1_t hv = vfncvt_f_f_v_f16m1(v32, 4);
-    vse_v_f16m1((__fp16*)ptr, hv, 4);
+    v32 = vset_v_f32m1_f32m2(v32, 0, v.val);
+    vfloat16m1_t hv = vfncvt_f_f_w_f16m1(v32, 4);
+    vse16_v_f16m1((__fp16*)ptr, hv, 4);
+}
+#else
+inline v_float32x4 v_load_expand(const hfloat* ptr)
+{
+    vfloat16mf2_t v = vle16_v_f16mf2((__fp16*)ptr, 4);
+    vfloat32m1_t v32 = vfwcvt_f_f_v_f32m1(v, 4);
+    return v_float32x4(v32);
 }
 
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
+{
+    //vfloat32m2_t v32 = vundefined_f32m2();
+    //v32 = vset_f32m2(v32, 0, v.val);
+    vfloat16mf2_t hv = vfncvt_f_f_w_f16mf2(v.val, 4);
+    vse16_v_f16mf2((__fp16*)ptr, hv, 4);
+}
+#endif
 
 inline void v_cleanup() {}
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_011_compat.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_011_compat.hpp
new file mode 100644
index 000000000000..da5e0fdd5754
--- /dev/null
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_011_compat.hpp
@@ -0,0 +1,33 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// 0.11 -> 0.12 compatibility
+
+#ifndef _RVV_IMPLICIT_VXRM
+#define _RVV_IMPLICIT_VXRM __RISCV_VXRM_RNU
+#endif
+
+// NOTE: masked should go first to avoid extra substitution (3 arg -> 4 arg -> 5 arg)
+
+// masked
+#define __riscv_vaadd(_1, _2, _3, _4) __riscv_vaadd(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vasub(_1, _2, _3, _4) __riscv_vasub(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vaaddu(_1, _2, _3, _4) __riscv_vaaddu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vasubu(_1, _2, _3, _4) __riscv_vasubu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vsmul(_1, _2, _3, _4) __riscv_vsmul(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vssra(_1, _2, _3, _4) __riscv_vssra(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vssrl(_1, _2, _3, _4) __riscv_vssrl(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vnclip(_1, _2, _3, _4) __riscv_vnclip(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+#define __riscv_vnclipu(_1, _2, _3, _4) __riscv_vnclipu(_1, _2, _3, _RVV_IMPLICIT_VXRM, _4)
+
+// unmasked
+#define __riscv_vaadd(_1, _2, _3) __riscv_vaadd(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vasub(_1, _2, _3) __riscv_vasub(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vaaddu(_1, _2, _3) __riscv_vaaddu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vasubu(_1, _2, _3) __riscv_vasubu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vsmul(_1, _2, _3) __riscv_vsmul(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vssra(_1, _2, _3) __riscv_vssra(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vssrl(_1, _2, _3) __riscv_vssrl(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vnclip(_1, _2, _3) __riscv_vnclip(_1, _2, _RVV_IMPLICIT_VXRM, _3)
+#define __riscv_vnclipu(_1, _2, _3) __riscv_vnclipu(_1, _2, _RVV_IMPLICIT_VXRM, _3)
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp
index 7dd735f99aac..2a323069fd9a 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_compat_overloaded.hpp
@@ -45,6 +45,7 @@ OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m2_t, u8m2, vuint8m2_t, i8)
 OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m4_t, u8m4, vuint8m4_t, i8)
 OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint8m8_t, u8m8, vuint8m8_t, i8)
 OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat32m1_t, f32m1, vuint32m1_t, i32)
+OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vuint32m1_t, u32m1, vuint32m1_t, i32)
 #if CV_SIMD_SCALABLE_64F
 OPENCV_HAL_IMPL_RVV_FUN_LOXEI(vfloat64m1_t, f64m1, vuint32mf2_t, i32)
 #endif
@@ -199,9 +200,14 @@ inline static vuint32mf2_t vmul(const vuint32mf2_t & op1, uint32_t op2, size_t v
     return vmul_vx_u32mf2(op1, op2, vl);
 }
 
-inline static vuint32mf2_t vreinterpret_u32mf2(vint32mf2_t val)
+inline static vuint32mf2_t vreinterpret_u32mf2(const vint32mf2_t& val)
 {
     return vreinterpret_v_i32mf2_u32mf2(val);
 }
 
+inline static vuint32mf2_t vreinterpret_u32mf2(const vuint16mf2_t& val)
+{
+    return vreinterpret_v_u16mf2_u32mf2(val);
+}
+
 #endif //OPENCV_HAL_INTRIN_RVV_COMPAT_OVERLOAD_HPP
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
index 60066ba04186..0159e4325a3a 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@@ -8,9 +8,6 @@
 #ifndef OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
 #define OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
 
-#include <initializer_list>
-#include <assert.h>
-#include <vector>
 #include <opencv2/core/check.hpp>
 
 // RVV intrinsics have been renamed in version 0.11, so we need to include
@@ -21,6 +18,10 @@
 #include "intrin_rvv_010_compat_overloaded-non-policy.hpp"
 #endif
 
+#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>11999
+#include "intrin_rvv_011_compat.hpp"
+#endif
+
 #if defined(__GNUC__) && !defined(__clang__)
 // FIXIT: eliminate massive warnigs from templates
 // GCC from 'rvv-next': riscv64-unknown-linux-gnu-g++ (g42df3464463) 12.0.1 20220505 (prerelease)
@@ -34,6 +35,9 @@
 
 namespace cv
 {
+
+//! @cond IGNORED
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 
 #define CV_SIMD_SCALABLE 1
@@ -411,11 +415,6 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
 { \
     vse##width(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
 } \
-inline _Tpvec v_load(std::initializer_list<_Tp> nScalars) \
-{ \
-    assert(nScalars.size() == vl); \
-    return vle##width##_v_##suffix##m1(nScalars.begin(), nScalars.size()); \
-} \
 template<typename... Targs> \
 _Tpvec v_load_##suffix(Targs... nScalars) \
 { \
@@ -441,29 +440,7 @@ OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits<v_floa
 #define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \
 inline _Tpvec v_lut(const _Tp* tab, const int* idx) \
 { \
-    vuint32##suffix##_t vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
-    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
-} \
-inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
-{ \
-    std::vector<uint> idx_; \
-    for (int i = 0; i < VTraits<v_int16>::vlanes(); ++i) { \
-        idx_.push_back(idx[i]); \
-        idx_.push_back(idx[i]+1); \
-    } \
-    vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
-    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
-} \
-inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
-{ \
-    std::vector<uint> idx_; \
-    for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i) { \
-        idx_.push_back(idx[i]); \
-        idx_.push_back(idx[i]+1); \
-        idx_.push_back(idx[i]+2); \
-        idx_.push_back(idx[i]+3); \
-    } \
-    vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
+    auto vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
     return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
 }
 OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4)
@@ -475,6 +452,74 @@ OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
 OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2)
 #endif
 
+#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, suffix1, suffix2, v_trunc) \
+inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \
+{ \
+    auto v0 = vle32_v_u32##suffix1((unsigned*)idx, VTraits<_Tpvec>::vlanes()/2); \
+    auto v1 = vadd(v0, 1, VTraits<_Tpvec>::vlanes()/2); \
+    auto w0 = vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/2); \
+    auto w1 = vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/2); \
+    auto sh1 = vslide1up(v_trunc(vreinterpret_u32##suffix2(w1)),0, VTraits<_Tpvec>::vlanes()); \
+    auto vid = vor(sh1, v_trunc(vreinterpret_u32##suffix2(w0)), VTraits<_Tpvec>::vlanes()); \
+    auto vidx = vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
+    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, m2, m4, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, m1, m2, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, mf2, m1, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, mf2, m1, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, mf2, m1, vlmul_trunc_u32mf2)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, mf2, m1, vlmul_trunc_u32mf2)
+#endif
+
+
+#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, suffix0, suffix1, suffix2, v_trunc) \
+inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \
+{ \
+    auto v0 = vle32_v_u32##suffix0((unsigned*)idx, VTraits<_Tpvec>::vlanes()/4); \
+    auto v1 = vadd(v0, 1, VTraits<_Tpvec>::vlanes()/4); \
+    auto v2 = vadd(v0, 2, VTraits<_Tpvec>::vlanes()/4); \
+    auto v3 = vadd(v0, 3, VTraits<_Tpvec>::vlanes()/4); \
+    auto w0 = vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/4); \
+    auto w1 = vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/4); \
+    auto w2 = vwcvtu_x(v2, VTraits<_Tpvec>::vlanes()/4); \
+    auto w3 = vwcvtu_x(v3, VTraits<_Tpvec>::vlanes()/4); \
+    auto sh2 = vslide1up(vreinterpret_u32##suffix1(w2),0, VTraits<_Tpvec>::vlanes()/2); \
+    auto sh3 = vslide1up(vreinterpret_u32##suffix1(w3),0, VTraits<_Tpvec>::vlanes()/2); \
+    auto vid0 = vor(sh2, vreinterpret_u32##suffix1(w0), VTraits<_Tpvec>::vlanes()/2); \
+    auto vid1 = vor(sh3, vreinterpret_u32##suffix1(w1), VTraits<_Tpvec>::vlanes()/2); \
+    auto wid0 = vwcvtu_x(v_trunc(vid0), VTraits<_Tpvec>::vlanes()/2); \
+    auto wid1 = vwcvtu_x(v_trunc(vid1), VTraits<_Tpvec>::vlanes()/2); \
+    auto shwid1 = vslide1up(vreinterpret_u32##suffix2(wid1),0, VTraits<_Tpvec>::vlanes()); \
+    auto vid = vor(shwid1, vreinterpret_u32##suffix2(wid0), VTraits<_Tpvec>::vlanes()); \
+    auto vidx = vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \
+    return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, m1, m2, m4, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, mf2 , m1, m2, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, mf2, m1, m1, vlmul_trunc_u32mf2)
+OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, mf2, m1, m1, vlmul_trunc_u32mf2)
+
+#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \
+inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \
+{ \
+    v_uint32 vidx_ = vmul(vreinterpret_u32m1(vidx), sizeof(_Tp), VTraits<v_int32>::vlanes()); \
+    return vloxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_LUT_VEC(v_float32, float)
+OPENCV_HAL_IMPL_RVV_LUT_VEC(v_int32, int)
+OPENCV_HAL_IMPL_RVV_LUT_VEC(v_uint32, unsigned)
+
+#if CV_SIMD_SCALABLE_64F
+inline v_float64 v_lut(const double* tab, const v_int32& vidx) \
+{ \
+    vuint32mf2_t vidx_ = vmul(vlmul_trunc_u32mf2(vreinterpret_u32m1(vidx)), sizeof(double), VTraits<v_float64>::vlanes()); \
+    return vloxei32(tab, vidx_, VTraits<v_float64>::vlanes()); \
+}
+#endif
+
+
 inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
 inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
 inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
@@ -486,7 +531,6 @@ inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_rein
 inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
 inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
 inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
-inline v_uint64 v_lut_quads(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_quads((const int64_t*)tab, idx)); }
 
 ////////////// Pack boolean ////////////////////
 inline v_uint8 v_pack_b(const v_uint16& a, const v_uint16& b)
@@ -690,23 +734,27 @@ inline v_float64 v_not (const v_float64& a) \
 
 
 ////////////// Bitwise shifts //////////////
+/*  Usage
+1. v_shl<N>(vec);
+2. v_shl(vec, N); // instead of vec << N, when N is non-constant.
+*/
 
 #define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, vl) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
 { \
     return _Tpvec(vsll(a, uint8_t(n), vl)); \
 } \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
 { \
     return _Tpvec(vsrl(a, uint8_t(n), vl)); \
 }
 
 #define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, vl) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shl(const _Tpvec& a, int n = s) \
 { \
     return _Tpvec(vsll(a, uint8_t(n), vl)); \
 } \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+template<int s = 0> inline _Tpvec v_shr(const _Tpvec& a, int n = s) \
 { \
     return _Tpvec(vsra(a, uint8_t(n), vl)); \
 }
@@ -719,56 +767,62 @@ OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32, VTraits<v_int32>::vlanes())
 OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64, VTraits<v_int64>::vlanes())
 
 ////////////// Comparison //////////////
-#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
+#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix) \
 inline _Tpvec v_##op(const _Tpvec& a, const _Tpvec& b) \
 { \
+    size_t VLEN = VTraits<_Tpvec>::vlanes(); \
     uint64_t ones = -1; \
-    return vmerge(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), ones, vl); \
+    return vmerge(intrin(a, b, VLEN), vmv_v_x_##suffix##m1(0, VLEN), ones, VLEN); \
 }
 
-#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
+#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix) \
 inline _Tpvec v_##op (const _Tpvec& a, const _Tpvec& b) \
 { \
-    union { uint64 u; double d; } ones; ones.u = -1; \
-    return _Tpvec(vfmerge(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), ones.d, vl)); \
+    size_t VLEN = VTraits<_Tpvec>::vlanes(); \
+    union { uint64_t u; VTraits<_Tpvec>::lane_type d; } ones; \
+    ones.u = -1; \
+    auto diff = intrin(a, b, VLEN); \
+    auto z = vfmv_v_f_##suffix##m1(0, VLEN); \
+    auto res = vfmerge(diff, z, ones.d, VLEN); \
+    return _Tpvec(res); \
 } //TODO
 
-#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, vmseq, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, vmsne, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, vmsltu, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, vmsgtu, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, vmsleu, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, vmsgeu, suffix, vl)
-
-#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, vmseq, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, vmsne, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, vmslt, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, vmsgt, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, vmsle, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, vmsge, suffix, vl)
-
-#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, eq, vmfeq, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ne, vmfne, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, lt, vmflt, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, gt, vmfgt, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, le, vmfle, suffix, vl) \
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ge, vmfge, suffix, vl)
-
-
-OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8, u8, VTraits<v_uint8>::vlanes())
-OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16, u16, VTraits<v_uint16>::vlanes())
-OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32, u32, VTraits<v_uint32>::vlanes())
-OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64, u64, VTraits<v_uint64>::vlanes())
-OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8, i8, VTraits<v_int8>::vlanes())
-OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16, i16, VTraits<v_int16>::vlanes())
-OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32, i32, VTraits<v_int32>::vlanes())
-OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64, i64, VTraits<v_int64>::vlanes())
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32, f32, VTraits<v_float32>::vlanes())
+#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, vmseq, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, vmsne, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, vmsltu, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, vmsgtu, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, vmsleu, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, vmsgeu, suffix)
+
+#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, vmseq, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, vmsne, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, vmslt, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, vmsgt, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, vmsle, suffix) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, vmsge, suffix)
+
+#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, eq, vmfeq, suffix) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ne, vmfne, suffix) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, lt, vmflt, suffix) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, gt, vmfgt, suffix) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, le, vmfle, suffix) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ge, vmfge, suffix)
+
+
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8, u8)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16, u16)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32, u32)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64, u64)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8, i8)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16, i16)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32, i32)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64, i64)
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32, f32)
 #if CV_SIMD_SCALABLE_64F
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64, f64, VTraits<v_float64>::vlanes())
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64, f64)
 #endif
 
 inline v_float32 v_not_nan(const v_float32& a)
@@ -924,6 +978,9 @@ inline scalartype v_reduce_sum(const _Tpvec& a)  \
     return (scalartype)v_get0(res); \
 }
 OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, float, f64, VTraits<v_float64>::vlanes())
+#endif
 
 #define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
 inline scalartype v_reduce_##func(const _Tpvec& a)  \
@@ -1360,23 +1417,23 @@ OPENCV_HAL_IMPL_RVV_REVERSE(v_float64, 64)
 #define OPENCV_HAL_IMPL_RVV_EXPAND(_Tp, _Tpwvec, _Tpwvec_m2, _Tpvec, width, suffix, suffix2, cvt) \
 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
 { \
-    _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
+    _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \
     b0 = vget_##suffix##m1(temp, 0); \
     b1 = vget_##suffix##m1(temp, 1); \
 } \
 inline _Tpwvec v_expand_low(const _Tpvec& a) \
 { \
-    _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
+    _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \
     return vget_##suffix##m1(temp, 0); \
 } \
 inline _Tpwvec v_expand_high(const _Tpvec& a) \
 { \
-    _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \
+    _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \
     return vget_##suffix##m1(temp, 1); \
 } \
 inline _Tpwvec v_load_expand(const _Tp* ptr) \
 { \
-    return cvt(vle##width##_v_##suffix2##mf2(ptr, vsetvlmax_e##width##m1()), vsetvlmax_e##width##m1()); \
+    return cvt(vle##width##_v_##suffix2##mf2(ptr, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \
 }
 
 OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m2_t, v_uint8, 8, u16, u8, vwcvtu_x)
@@ -1471,6 +1528,26 @@ OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPEN
 OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
 OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
 
+#if CV_SIMD_SCALABLE_64F
+inline void v_zip(const v_float64& a0, const v_float64& a1, v_float64& b0, v_float64& b1) { \
+    vuint16mf4_t idx0 = vid_v_u16mf4(VTraits<v_float64>::vlanes());
+    vuint16mf4_t idx1 = vadd(idx0, VTraits<v_float64>::vlanes(), VTraits<v_float64>::vlanes());
+    vuint16mf2_t idx = vreinterpret_u16mf2(( \
+        vor(vzext_vf2(idx0, VTraits<v_float64>::vlanes()), \
+            vreinterpret_u32mf2(vslide1up(vreinterpret_u16mf2(vzext_vf2(idx1, VTraits<v_float64>::vlanes())), 0, VTraits<v_uint32>::vlanes())), \
+            VTraits<v_uint32>::vlanes())));
+#if 0
+    vfloat64m2_t temp = __riscv_vcreate_v_f64m1_f64m2(a0, a1);
+#else // TODO: clean up when RVV Intrinsic is frozen.
+    vfloat64m2_t temp = vlmul_ext_f64m2(a0);
+    temp = vset(temp, 1, a1);
+#endif
+    temp = vrgatherei16(temp, idx, VTraits<v_float64>::vlanes()*2);
+    b0 = vget_f64m1(temp, 0); \
+    b1 = vget_f64m1(temp, 1); \
+}
+#endif
+
 #define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, width) \
 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
 { \
@@ -1651,6 +1728,10 @@ inline v_uint32 v_popcount(const v_uint32& a)
 {
     return v_hadd(v_hadd(v_popcount(vreinterpret_u8m1(a))));
 }
+inline v_uint64 v_popcount(const v_uint64& a)
+{
+    return v_hadd(v_hadd(v_hadd(v_popcount(vreinterpret_u8m1(a)))));
+}
 
 inline v_uint8 v_popcount(const v_int8& a)
 {
@@ -1664,6 +1745,11 @@ inline v_uint32 v_popcount(const v_int32& a)
 {
     return v_popcount(v_abs(a));\
 }
+inline v_uint64 v_popcount(const v_int64& a)
+{
+    // max(0 - a) is used, since v_abs does not support 64-bit integers.
+    return v_popcount(v_reinterpret_as_u64(vmax(a, v_sub(v_setzero_s64(), a), VTraits<v_int64>::vlanes())));
+}
 
 
 //////////// SignMask ////////////
@@ -1720,8 +1806,8 @@ inline int v_scan_forward(const v_float64& a)
 // mask: {0,0,0,1, ...} -> {T,T,T,F, ...}
 #define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, v_trunc) \
 inline _Tpvec v_pack_triplets(const _Tpvec& vec) { \
-    size_t vl = vsetvlmax_e8m1(); \
-    vuint32m1_t one = vmv_v_x_u32m1(1, vl/4); \
+    size_t vl = __cv_rvv_e8m1_nlanes; \
+    vuint32m1_t one = vmv_v_x_u32m1(1, __cv_rvv_e32m1_nlanes); \
     vuint8m1_t zero = vmv_v_x_u8m1(0, vl); \
     vuint8m1_t mask = vreinterpret_u8m1(one); \
     return vcompress(vmseq(v_trunc(vslideup(zero, mask, 3, vl)), 0, vl), vec, vec, VTraits<_Tpvec>::vlanes()); \
@@ -1744,28 +1830,28 @@ OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float64, vlmul_trunc_u8mf8)
 ////// FP16 support ///////
 
 #if defined(__riscv_zfh) && __riscv_zfh
-inline v_float32 v_load_expand(const float16_t* ptr)
+inline v_float32 v_load_expand(const hfloat* ptr)
 {
     return vfwcvt_f(vle16_v_f16mf2((_Float16*)ptr, VTraits<v_float32>::vlanes()) ,VTraits<v_float32>::vlanes());;
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32& v)
+inline void v_pack_store(hfloat* ptr, const v_float32& v)
 {
     vse16_v_f16mf2((_Float16*)ptr, vfncvt_f_f_w_f16mf2(v, VTraits<v_float32>::vlanes()), VTraits<v_float32>::vlanes());
 }
 #else
-inline v_float32 v_load_expand(const float16_t* ptr)
+inline v_float32 v_load_expand(const hfloat* ptr)
 {
     float buf[32];
     for( int i = 0; i < VTraits<v_float32>::vlanes(); i++ ) buf[i] = (float)ptr[i];
     return v_load(buf);
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32& v)
+inline void v_pack_store(hfloat* ptr, const v_float32& v)
 {
     float buf[32];
     v_store(buf, v);
-    for( int i = 0; i < VTraits<v_float32>::vlanes(); i++ ) ptr[i] = float16_t(buf[i]);
+    for( int i = 0; i < VTraits<v_float32>::vlanes(); i++ ) ptr[i] = hfloat(buf[i]);
 }
 #endif
 ////////////// Rounding //////////////
@@ -1793,12 +1879,14 @@ inline v_int32 v_trunc(const v_float32& a)
 #if CV_SIMD_SCALABLE_64F
 inline v_int32 v_round(const v_float64& a)
 {
-    return vfncvt_x(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), VTraits<v_float32>::vlanes());
+    return vfncvt_x(vlmul_ext_f64m2(a), VTraits<v_float32>::vlanes());
 }
 
 inline v_int32 v_round(const v_float64& a, const v_float64& b)
 {
-    return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), 1, b), VTraits<v_float32>::vlanes());
+    // return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits<v_float64>::vlanes())), 1, b), VTraits<v_float32>::vlanes());
+    // Fix https://github.com/opencv/opencv/issues/24746
+    return vfncvt_x(vset(vlmul_ext_f64m2(a), 1, b), VTraits<v_float32>::vlanes());
 }
 
 inline v_int32 v_floor(const v_float64& a)
@@ -2087,6 +2175,8 @@ inline void v_cleanup() {}
 
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
+//! @endcond
+
 } //namespace cv
 
 #endif //OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 9d17f7166665..68b5a67bbc00 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -3407,7 +3407,7 @@ inline v_float32x4 v_broadcast_element(const v_float32x4& v)
 
 ////////////// FP16 support ///////////////////////////
 
-inline v_float32x4 v_load_expand(const float16_t* ptr)
+inline v_float32x4 v_load_expand(const hfloat* ptr)
 {
 #if CV_FP16
     return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
@@ -3427,7 +3427,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
 #endif
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 {
 #if CV_FP16
     __m128i fp16_value = _mm_cvtps_ph(v.val, 0);
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index b198643cc665..e66563bede26 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -1361,7 +1361,7 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec)
 
 /////// FP16 support ////////
 
-inline v_float32x4 v_load_expand(const float16_t* ptr)
+inline v_float32x4 v_load_expand(const hfloat* ptr)
 {
     vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr);
 #if CV_VSX3 && defined(vec_extract_fp_from_shorth)
@@ -1388,7 +1388,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
 #endif
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 {
 // fixme: Is there any builtin op or intrinsic that cover "xvcvsphp"?
 #if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
index b4178af8b789..5d470d94192c 100644
--- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
@@ -8,6 +8,7 @@
 #include <limits>
 #include <cstring>
 #include <algorithm>
+#include <emscripten/version.h>
 #include "opencv2/core/saturate.hpp"
 
 #define CV_SIMD128 1
@@ -2753,7 +2754,7 @@ inline v_float32x4 v_broadcast_element(const v_float32x4& a)
 
 ////////////// FP16 support ///////////////////////////
 
-inline v_float32x4 v_load_expand(const float16_t* ptr)
+inline v_float32x4 v_load_expand(const hfloat* ptr)
 {
     float a[4];
     for (int i = 0; i < 4; i++)
@@ -2761,14 +2762,14 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
     return v_float32x4(wasm_v128_load(a));
 }
 
-inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+inline void v_pack_store(hfloat* ptr, const v_float32x4& v)
 {
     double v_[4];
     wasm_v128_store(v_, v.val);
-    ptr[0] = float16_t(v_[0]);
-    ptr[1] = float16_t(v_[1]);
-    ptr[2] = float16_t(v_[2]);
-    ptr[3] = float16_t(v_[3]);
+    ptr[0] = hfloat(v_[0]);
+    ptr[1] = hfloat(v_[1]);
+    ptr[2] = hfloat(v_[2]);
+    ptr[3] = hfloat(v_[3]);
 }
 
 inline void v_cleanup() {}
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index c4c6ff6d784c..2bfb0966c2a2 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -53,6 +53,7 @@
 
 #include "opencv2/core/bufferpool.hpp"
 
+#include <array>
 #include <type_traits>
 
 namespace cv
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index 886b82c6a045..f0eed783a595 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -51,7 +51,7 @@
 
 #ifdef _MSC_VER
 #pragma warning( push )
-#pragma warning( disable: 4127 )
+#pragma warning( disable: 4127 5054 )
 #endif
 
 #if defined(CV_SKIP_DISABLE_CLANG_ENUM_WARNINGS)
diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp
index 686ff5d99b90..ad13797da3a8 100644
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -61,8 +61,6 @@ namespace cv
 //! @addtogroup core_basic
 //! @{
 
-////////////////////////////// Small Matrix ///////////////////////////
-
 //! @cond IGNORED
 // FIXIT Remove this (especially CV_EXPORTS modifier)
 struct CV_EXPORTS Matx_AddOp { Matx_AddOp() {} Matx_AddOp(const Matx_AddOp&) {} };
@@ -74,6 +72,8 @@ struct CV_EXPORTS Matx_MatMulOp { Matx_MatMulOp() {} Matx_MatMulOp(const Matx_Ma
 struct CV_EXPORTS Matx_TOp { Matx_TOp() {} Matx_TOp(const Matx_TOp&) {} };
 //! @endcond
 
+////////////////////////////// Small Matrix ///////////////////////////
+
 /** @brief Template class for small matrices whose type and size are known at compilation time
 
 If you need a more flexible type, use Mat . The elements of the matrix M are accessible using the
@@ -215,7 +215,7 @@ template<typename _Tp, int m, int n> class Matx
     template<int l> Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp);
     Matx(const Matx<_Tp, n, m>& a, Matx_TOp);
 
-    _Tp val[m*n]; //< matrix elements
+    _Tp val[m*n]; ///< matrix elements
 };
 
 typedef Matx<float, 1, 2> Matx12f;
@@ -256,56 +256,83 @@ typedef Matx<double, 4, 4> Matx44d;
 typedef Matx<float, 6, 6> Matx66f;
 typedef Matx<double, 6, 6> Matx66d;
 
-/*!
-  traits
-*/
-template<typename _Tp, int m, int n> class DataType< Matx<_Tp, m, n> >
-{
-public:
-    typedef Matx<_Tp, m, n>                               value_type;
-    typedef Matx<typename DataType<_Tp>::work_type, m, n> work_type;
-    typedef _Tp                                           channel_type;
-    typedef value_type                                    vec_type;
-
-    enum { generic_type = 0,
-           channels     = m * n,
-           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
-#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
-           ,depth        = DataType<channel_type>::depth
-           ,type         = CV_MAKETYPE(depth, channels)
-#endif
-         };
-};
+template<typename _Tp, int m> static inline
+double determinant(const Matx<_Tp, m, m>& a);
 
-namespace traits {
-template<typename _Tp, int m, int n>
-struct Depth< Matx<_Tp, m, n> > { enum { value = Depth<_Tp>::value }; };
-template<typename _Tp, int m, int n>
-struct Type< Matx<_Tp, m, n> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, n*m) }; };
-} // namespace
+template<typename _Tp, int m, int n> static inline
+double trace(const Matx<_Tp, m, n>& a);
 
+template<typename _Tp, int m, int n> static inline
+double norm(const Matx<_Tp, m, n>& M);
 
-/** @brief  Comma-separated Matrix Initializer
-*/
-template<typename _Tp, int m, int n> class MatxCommaInitializer
-{
-public:
-    MatxCommaInitializer(Matx<_Tp, m, n>* _mtx);
-    template<typename T2> MatxCommaInitializer<_Tp, m, n>& operator , (T2 val);
-    Matx<_Tp, m, n> operator *() const;
+template<typename _Tp, int m, int n> static inline
+double norm(const Matx<_Tp, m, n>& M, int normType);
 
-    Matx<_Tp, m, n>* dst;
-    int idx;
-};
+template<typename _Tp1, typename _Tp2, int m, int n> static inline
+Matx<_Tp1, m, n>& operator += (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b);
 
-/*
- Utility methods
-*/
-template<typename _Tp, int m> static double determinant(const Matx<_Tp, m, m>& a);
-template<typename _Tp, int m, int n> static double trace(const Matx<_Tp, m, n>& a);
-template<typename _Tp, int m, int n> static double norm(const Matx<_Tp, m, n>& M);
-template<typename _Tp, int m, int n> static double norm(const Matx<_Tp, m, n>& M, int normType);
+template<typename _Tp1, typename _Tp2, int m, int n> static inline
+Matx<_Tp1, m, n>& operator -= (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator + (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, int alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, float alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, double alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, int alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, float alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, double alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (int alpha, const Matx<_Tp, m, n>& a);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (float alpha, const Matx<_Tp, m, n>& a);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha);
 
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, double alpha);
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a);
+
+template<typename _Tp, int m, int n, int l> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b);
+
+template<typename _Tp, int m, int n> static inline
+Vec<_Tp, m> operator * (const Matx<_Tp, m, n>& a, const Vec<_Tp, n>& b);
+
+template<typename _Tp, int m, int n> static inline
+bool operator == (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b);
+
+template<typename _Tp, int m, int n> static inline
+bool operator != (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b);
 
 
 /////////////////////// Vec (used as element of multi-channel images /////////////////////
@@ -376,10 +403,8 @@ template<typename _Tp, int cn> class Vec : public Matx<_Tp, cn, 1>
     static Vec randn(_Tp a, _Tp b);
     static Vec randu(_Tp a, _Tp b);
     static Vec zeros();
-#ifdef CV_CXX11
     static Vec diag(_Tp alpha) = delete;
     static Vec eye() = delete;
-#endif
 
     //! per-element multiplication
     Vec mul(const Vec<_Tp, cn>& v) const;
@@ -402,9 +427,7 @@ template<typename _Tp, int cn> class Vec : public Matx<_Tp, cn, 1>
     const _Tp& operator ()(int i) const;
     _Tp& operator ()(int i);
 
-#ifdef CV_CXX11
     Vec<_Tp, cn>& operator=(const Vec<_Tp, cn>& rhs) = default;
-#endif
 
     Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp);
     Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp);
@@ -443,1094 +466,79 @@ typedef Vec<double, 4> Vec4d;
 typedef Vec<double, 6> Vec6d;
 /** @} */
 
-/*!
-  traits
-*/
-template<typename _Tp, int cn> class DataType< Vec<_Tp, cn> >
-{
-public:
-    typedef Vec<_Tp, cn>                               value_type;
-    typedef Vec<typename DataType<_Tp>::work_type, cn> work_type;
-    typedef _Tp                                        channel_type;
-    typedef value_type                                 vec_type;
-
-    enum { generic_type = 0,
-           channels     = cn,
-           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8),
-#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
-           depth        = DataType<channel_type>::depth,
-           type         = CV_MAKETYPE(depth, channels),
-#endif
-           _dummy_enum_finalizer = 0
-         };
-};
-
-namespace traits {
-template<typename _Tp, int cn>
-struct Depth< Vec<_Tp, cn> > { enum { value = Depth<_Tp>::value }; };
-template<typename _Tp, int cn>
-struct Type< Vec<_Tp, cn> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, cn) }; };
-} // namespace
-
-
-/** @brief  Comma-separated Vec Initializer
-*/
-template<typename _Tp, int m> class VecCommaInitializer : public MatxCommaInitializer<_Tp, m, 1>
-{
-public:
-    VecCommaInitializer(Vec<_Tp, m>* _vec);
-    template<typename T2> VecCommaInitializer<_Tp, m>& operator , (T2 val);
-    Vec<_Tp, m> operator *() const;
-};
-
-template<typename _Tp, int cn> static Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v);
-
-//! @} core_basic
-
-//! @cond IGNORED
-
-///////////////////////////////////// helper classes /////////////////////////////////////
-namespace internal
-{
-
-template<typename _Tp, int m> struct Matx_DetOp
-{
-    double operator ()(const Matx<_Tp, m, m>& a) const
-    {
-        Matx<_Tp, m, m> temp = a;
-        double p = LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
-        if( p == 0 )
-            return p;
-        for( int i = 0; i < m; i++ )
-            p *= temp(i, i);
-        return p;
-    }
-};
-
-template<typename _Tp> struct Matx_DetOp<_Tp, 1>
-{
-    double operator ()(const Matx<_Tp, 1, 1>& a) const
-    {
-        return a(0,0);
-    }
-};
-
-template<typename _Tp> struct Matx_DetOp<_Tp, 2>
-{
-    double operator ()(const Matx<_Tp, 2, 2>& a) const
-    {
-        return a(0,0)*a(1,1) - a(0,1)*a(1,0);
-    }
-};
-
-template<typename _Tp> struct Matx_DetOp<_Tp, 3>
-{
-    double operator ()(const Matx<_Tp, 3, 3>& a) const
-    {
-        return a(0,0)*(a(1,1)*a(2,2) - a(2,1)*a(1,2)) -
-            a(0,1)*(a(1,0)*a(2,2) - a(2,0)*a(1,2)) +
-            a(0,2)*(a(1,0)*a(2,1) - a(2,0)*a(1,1));
-    }
-};
-
-template<typename _Tp> Vec<_Tp, 2> inline conjugate(const Vec<_Tp, 2>& v)
-{
-    return Vec<_Tp, 2>(v[0], -v[1]);
-}
-
-template<typename _Tp> Vec<_Tp, 4> inline conjugate(const Vec<_Tp, 4>& v)
-{
-    return Vec<_Tp, 4>(v[0], -v[1], -v[2], -v[3]);
-}
-
-} // internal
-
-
-
-////////////////////////////////// Matx Implementation ///////////////////////////////////
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx()
-{
-    for(int i = 0; i < channels; i++) val[i] = _Tp(0);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx(_Tp v0)
-{
-    val[0] = v0;
-    for(int i = 1; i < channels; i++) val[i] = _Tp(0);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1)
-{
-    CV_StaticAssert(channels >= 2, "Matx should have at least 2 elements.");
-    val[0] = v0; val[1] = v1;
-    for(int i = 2; i < channels; i++) val[i] = _Tp(0);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2)
-{
-    CV_StaticAssert(channels >= 3, "Matx should have at least 3 elements.");
-    val[0] = v0; val[1] = v1; val[2] = v2;
-    for(int i = 3; i < channels; i++) val[i] = _Tp(0);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3)
-{
-    CV_StaticAssert(channels >= 4, "Matx should have at least 4 elements.");
-    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
-    for(int i = 4; i < channels; i++) val[i] = _Tp(0);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4)
-{
-    CV_StaticAssert(channels >= 5, "Matx should have at least 5 elements.");
-    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; val[4] = v4;
-    for(int i = 5; i < channels; i++) val[i] = _Tp(0);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5)
-{
-    CV_StaticAssert(channels >= 6, "Matx should have at least 6 elements.");
-    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
-    val[4] = v4; val[5] = v5;
-    for(int i = 6; i < channels; i++) val[i] = _Tp(0);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6)
-{
-    CV_StaticAssert(channels >= 7, "Matx should have at least 7 elements.");
-    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
-    val[4] = v4; val[5] = v5; val[6] = v6;
-    for(int i = 7; i < channels; i++) val[i] = _Tp(0);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7)
-{
-    CV_StaticAssert(channels >= 8, "Matx should have at least 8 elements.");
-    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
-    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
-    for(int i = 8; i < channels; i++) val[i] = _Tp(0);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8)
-{
-    CV_StaticAssert(channels >= 9, "Matx should have at least 9 elements.");
-    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
-    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
-    val[8] = v8;
-    for(int i = 9; i < channels; i++) val[i] = _Tp(0);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9)
-{
-    CV_StaticAssert(channels >= 10, "Matx should have at least 10 elements.");
-    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
-    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
-    val[8] = v8; val[9] = v9;
-    for(int i = 10; i < channels; i++) val[i] = _Tp(0);
-}
-
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11)
-{
-    CV_StaticAssert(channels >= 12, "Matx should have at least 12 elements.");
-    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
-    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
-    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
-    for(int i = 12; i < channels; i++) val[i] = _Tp(0);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13)
-{
-    CV_StaticAssert(channels >= 14, "Matx should have at least 14 elements.");
-    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
-    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
-    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
-    val[12] = v12; val[13] = v13;
-    for (int i = 14; i < channels; i++) val[i] = _Tp(0);
-}
-
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13, _Tp v14, _Tp v15)
-{
-    CV_StaticAssert(channels >= 16, "Matx should have at least 16 elements.");
-    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
-    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
-    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
-    val[12] = v12; val[13] = v13; val[14] = v14; val[15] = v15;
-    for(int i = 16; i < channels; i++) val[i] = _Tp(0);
-}
-
-// WARNING: unreachable code using Ninja
-#if defined _MSC_VER && _MSC_VER >= 1920
-#pragma warning(push)
-#pragma warning(disable: 4702)
-#endif
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx(const _Tp* values)
-{
-    for( int i = 0; i < channels; i++ ) val[i] = values[i];
-}
-#if defined _MSC_VER && _MSC_VER >= 1920
-#pragma warning(pop)
-#endif
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n>::Matx(std::initializer_list<_Tp> list)
-{
-    CV_DbgAssert(list.size() == channels);
-    int i = 0;
-    for(const auto& elem : list)
-    {
-        val[i++] = elem;
-    }
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n> Matx<_Tp, m, n>::all(_Tp alpha)
-{
-    Matx<_Tp, m, n> M;
-    for( int i = 0; i < m*n; i++ ) M.val[i] = alpha;
-    return M;
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp,m,n> Matx<_Tp,m,n>::zeros()
-{
-    return all(0);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp,m,n> Matx<_Tp,m,n>::ones()
-{
-    return all(1);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp,m,n> Matx<_Tp,m,n>::eye()
-{
-    Matx<_Tp,m,n> M;
-    for(int i = 0; i < shortdim; i++)
-        M(i,i) = 1;
-    return M;
-}
-
-template<typename _Tp, int m, int n> inline
-_Tp Matx<_Tp, m, n>::dot(const Matx<_Tp, m, n>& M) const
-{
-    _Tp s = 0;
-    for( int i = 0; i < channels; i++ ) s += val[i]*M.val[i];
-    return s;
-}
-
-template<typename _Tp, int m, int n> inline
-double Matx<_Tp, m, n>::ddot(const Matx<_Tp, m, n>& M) const
-{
-    double s = 0;
-    for( int i = 0; i < channels; i++ ) s += (double)val[i]*M.val[i];
-    return s;
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp,m,n> Matx<_Tp,m,n>::diag(const typename Matx<_Tp,m,n>::diag_type& d)
-{
-    Matx<_Tp,m,n> M;
-    for(int i = 0; i < shortdim; i++)
-        M(i,i) = d(i, 0);
-    return M;
-}
-
-template<typename _Tp, int m, int n> template<typename T2>
-inline Matx<_Tp, m, n>::operator Matx<T2, m, n>() const
-{
-    Matx<T2, m, n> M;
-    for( int i = 0; i < m*n; i++ ) M.val[i] = saturate_cast<T2>(val[i]);
-    return M;
-}
-
-template<typename _Tp, int m, int n> template<int m1, int n1> inline
-Matx<_Tp, m1, n1> Matx<_Tp, m, n>::reshape() const
-{
-    CV_StaticAssert(m1*n1 == m*n, "Input and destnarion matrices must have the same number of elements");
-    return (const Matx<_Tp, m1, n1>&)*this;
-}
-
-template<typename _Tp, int m, int n>
-template<int m1, int n1> inline
-Matx<_Tp, m1, n1> Matx<_Tp, m, n>::get_minor(int base_row, int base_col) const
-{
-    CV_DbgAssert(0 <= base_row && base_row+m1 <= m && 0 <= base_col && base_col+n1 <= n);
-    Matx<_Tp, m1, n1> s;
-    for( int di = 0; di < m1; di++ )
-        for( int dj = 0; dj < n1; dj++ )
-            s(di, dj) = (*this)(base_row+di, base_col+dj);
-    return s;
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, 1, n> Matx<_Tp, m, n>::row(int i) const
-{
-    CV_DbgAssert((unsigned)i < (unsigned)m);
-    return Matx<_Tp, 1, n>(&val[i*n]);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, 1> Matx<_Tp, m, n>::col(int j) const
-{
-    CV_DbgAssert((unsigned)j < (unsigned)n);
-    Matx<_Tp, m, 1> v;
-    for( int i = 0; i < m; i++ )
-        v.val[i] = val[i*n + j];
-    return v;
-}
-
-template<typename _Tp, int m, int n> inline
-typename Matx<_Tp, m, n>::diag_type Matx<_Tp, m, n>::diag() const
-{
-    diag_type d;
-    for( int i = 0; i < shortdim; i++ )
-        d.val[i] = val[i*n + i];
-    return d;
-}
-
-template<typename _Tp, int m, int n> inline
-const _Tp& Matx<_Tp, m, n>::operator()(int row_idx, int col_idx) const
-{
-    CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n );
-    return this->val[row_idx*n + col_idx];
-}
-
-template<typename _Tp, int m, int n> inline
-_Tp& Matx<_Tp, m, n>::operator ()(int row_idx, int col_idx)
-{
-    CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n );
-    return val[row_idx*n + col_idx];
-}
-
-template<typename _Tp, int m, int n> inline
-const _Tp& Matx<_Tp, m, n>::operator ()(int i) const
-{
-    CV_StaticAssert(m == 1 || n == 1, "Single index indexation requires matrix to be a column or a row");
-    CV_DbgAssert( (unsigned)i < (unsigned)(m+n-1) );
-    return val[i];
-}
-
-template<typename _Tp, int m, int n> inline
-_Tp& Matx<_Tp, m, n>::operator ()(int i)
-{
-    CV_StaticAssert(m == 1 || n == 1, "Single index indexation requires matrix to be a column or a row");
-    CV_DbgAssert( (unsigned)i < (unsigned)(m+n-1) );
-    return val[i];
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_AddOp)
-{
-    for( int i = 0; i < channels; i++ )
-        val[i] = saturate_cast<_Tp>(a.val[i] + b.val[i]);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_SubOp)
-{
-    for( int i = 0; i < channels; i++ )
-        val[i] = saturate_cast<_Tp>(a.val[i] - b.val[i]);
-}
-
-template<typename _Tp, int m, int n> template<typename _T2> inline
-Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, _T2 alpha, Matx_ScaleOp)
-{
-    for( int i = 0; i < channels; i++ )
-        val[i] = saturate_cast<_Tp>(a.val[i] * alpha);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_MulOp)
-{
-    for( int i = 0; i < channels; i++ )
-        val[i] = saturate_cast<_Tp>(a.val[i] * b.val[i]);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_DivOp)
-{
-    for( int i = 0; i < channels; i++ )
-        val[i] = saturate_cast<_Tp>(a.val[i] / b.val[i]);
-}
-
-template<typename _Tp, int m, int n> template<int l> inline
-Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp)
-{
-    for( int i = 0; i < m; i++ )
-        for( int j = 0; j < n; j++ )
-        {
-            _Tp s = 0;
-            for( int k = 0; k < l; k++ )
-                s += a(i, k) * b(k, j);
-            val[i*n + j] = s;
-        }
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp,m,n>::Matx(const Matx<_Tp, n, m>& a, Matx_TOp)
-{
-    for( int i = 0; i < m; i++ )
-        for( int j = 0; j < n; j++ )
-            val[i*n + j] = a(j, i);
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n> Matx<_Tp, m, n>::mul(const Matx<_Tp, m, n>& a) const
-{
-    return Matx<_Tp, m, n>(*this, a, Matx_MulOp());
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n> Matx<_Tp, m, n>::div(const Matx<_Tp, m, n>& a) const
-{
-    return Matx<_Tp, m, n>(*this, a, Matx_DivOp());
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, n, m> Matx<_Tp, m, n>::t() const
-{
-    return Matx<_Tp, n, m>(*this, Matx_TOp());
-}
-
-template<typename _Tp, int m, int n> inline
-Vec<_Tp, n> Matx<_Tp, m, n>::solve(const Vec<_Tp, m>& rhs, int method) const
-{
-    Matx<_Tp, n, 1> x = solve((const Matx<_Tp, m, 1>&)(rhs), method);
-    return (Vec<_Tp, n>&)(x);
-}
-
-template<typename _Tp, int m> static inline
-double determinant(const Matx<_Tp, m, m>& a)
-{
-    return cv::internal::Matx_DetOp<_Tp, m>()(a);
-}
-
-template<typename _Tp, int m, int n> static inline
-double trace(const Matx<_Tp, m, n>& a)
-{
-    _Tp s = 0;
-    for( int i = 0; i < std::min(m, n); i++ )
-        s += a(i,i);
-    return s;
-}
-
-template<typename _Tp, int m, int n> static inline
-double norm(const Matx<_Tp, m, n>& M)
-{
-    return std::sqrt(normL2Sqr<_Tp, double>(M.val, m*n));
-}
-
-template<typename _Tp, int m, int n> static inline
-double norm(const Matx<_Tp, m, n>& M, int normType)
-{
-    switch(normType) {
-    case NORM_INF:
-        return (double)normInf<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n);
-    case NORM_L1:
-        return (double)normL1<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n);
-    case NORM_L2SQR:
-        return (double)normL2Sqr<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n);
-    default:
-    case NORM_L2:
-        return std::sqrt((double)normL2Sqr<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n));
-    }
-}
-
-
-
-//////////////////////////////// matx comma initializer //////////////////////////////////
-
-template<typename _Tp, typename _T2, int m, int n> static inline
-MatxCommaInitializer<_Tp, m, n> operator << (const Matx<_Tp, m, n>& mtx, _T2 val)
-{
-    MatxCommaInitializer<_Tp, m, n> commaInitializer((Matx<_Tp, m, n>*)&mtx);
-    return (commaInitializer, val);
-}
-
-template<typename _Tp, int m, int n> inline
-MatxCommaInitializer<_Tp, m, n>::MatxCommaInitializer(Matx<_Tp, m, n>* _mtx)
-    : dst(_mtx), idx(0)
-{}
-
-template<typename _Tp, int m, int n> template<typename _T2> inline
-MatxCommaInitializer<_Tp, m, n>& MatxCommaInitializer<_Tp, m, n>::operator , (_T2 value)
-{
-    CV_DbgAssert( idx < m*n );
-    dst->val[idx++] = saturate_cast<_Tp>(value);
-    return *this;
-}
-
-template<typename _Tp, int m, int n> inline
-Matx<_Tp, m, n> MatxCommaInitializer<_Tp, m, n>::operator *() const
-{
-    CV_DbgAssert( idx == n*m );
-    return *dst;
-}
-
-
-
-/////////////////////////////////// Vec Implementation ///////////////////////////////////
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec() {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(_Tp v0)
-    : Matx<_Tp, cn, 1>(v0) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1)
-    : Matx<_Tp, cn, 1>(v0, v1) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2)
-    : Matx<_Tp, cn, 1>(v0, v1, v2) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3)
-    : Matx<_Tp, cn, 1>(v0, v1, v2, v3) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4)
-    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5)
-    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6)
-    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7)
-    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8)
-    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9)
-    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13)
-    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(const _Tp* values)
-    : Matx<_Tp, cn, 1>(values) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(std::initializer_list<_Tp> list)
-    : Matx<_Tp, cn, 1>(list) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(const Vec<_Tp, cn>& m)
-    : Matx<_Tp, cn, 1>(m.val) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp op)
-    : Matx<_Tp, cn, 1>(a, b, op) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp op)
-    : Matx<_Tp, cn, 1>(a, b, op) {}
-
-template<typename _Tp, int cn> template<typename _T2> inline
-Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, _T2 alpha, Matx_ScaleOp op)
-    : Matx<_Tp, cn, 1>(a, alpha, op) {}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn> Vec<_Tp, cn>::all(_Tp alpha)
-{
-    Vec v;
-    for( int i = 0; i < cn; i++ ) v.val[i] = alpha;
-    return v;
-}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn> Vec<_Tp, cn>::ones()
-{
-    return Vec::all(1);
-}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn> Vec<_Tp, cn>::zeros()
-{
-    return Vec::all(0);
-}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn> Vec<_Tp, cn>::mul(const Vec<_Tp, cn>& v) const
-{
-    Vec<_Tp, cn> w;
-    for( int i = 0; i < cn; i++ ) w.val[i] = saturate_cast<_Tp>(this->val[i]*v.val[i]);
-    return w;
-}
-
-template<> inline
-Vec<float, 2> Vec<float, 2>::conj() const
-{
-    return cv::internal::conjugate(*this);
-}
-
-template<> inline
-Vec<double, 2> Vec<double, 2>::conj() const
-{
-    return cv::internal::conjugate(*this);
-}
-
-template<> inline
-Vec<float, 4> Vec<float, 4>::conj() const
-{
-    return cv::internal::conjugate(*this);
-}
-
-template<> inline
-Vec<double, 4> Vec<double, 4>::conj() const
-{
-    return cv::internal::conjugate(*this);
-}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn> Vec<_Tp, cn>::cross(const Vec<_Tp, cn>&) const
-{
-    CV_StaticAssert(cn == 3, "for arbitrary-size vector there is no cross-product defined");
-    return Vec<_Tp, cn>();
-}
-
-template<> inline
-Vec<float, 3> Vec<float, 3>::cross(const Vec<float, 3>& v) const
-{
-    return Vec<float,3>(this->val[1]*v.val[2] - this->val[2]*v.val[1],
-                     this->val[2]*v.val[0] - this->val[0]*v.val[2],
-                     this->val[0]*v.val[1] - this->val[1]*v.val[0]);
-}
-
-template<> inline
-Vec<double, 3> Vec<double, 3>::cross(const Vec<double, 3>& v) const
-{
-    return Vec<double,3>(this->val[1]*v.val[2] - this->val[2]*v.val[1],
-                     this->val[2]*v.val[0] - this->val[0]*v.val[2],
-                     this->val[0]*v.val[1] - this->val[1]*v.val[0]);
-}
-
-template<typename _Tp, int cn> template<typename T2> inline
-Vec<_Tp, cn>::operator Vec<T2, cn>() const
-{
-    Vec<T2, cn> v;
-    for( int i = 0; i < cn; i++ ) v.val[i] = saturate_cast<T2>(this->val[i]);
-    return v;
-}
-
-template<typename _Tp, int cn> inline
-const _Tp& Vec<_Tp, cn>::operator [](int i) const
-{
-    CV_DbgAssert( (unsigned)i < (unsigned)cn );
-    return this->val[i];
-}
-
 template<typename _Tp, int cn> inline
-_Tp& Vec<_Tp, cn>::operator [](int i)
-{
-    CV_DbgAssert( (unsigned)i < (unsigned)cn );
-    return this->val[i];
-}
-
-template<typename _Tp, int cn> inline
-const _Tp& Vec<_Tp, cn>::operator ()(int i) const
-{
-    CV_DbgAssert( (unsigned)i < (unsigned)cn );
-    return this->val[i];
-}
-
-template<typename _Tp, int cn> inline
-_Tp& Vec<_Tp, cn>::operator ()(int i)
-{
-    CV_DbgAssert( (unsigned)i < (unsigned)cn );
-    return this->val[i];
-}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v)
-{
-    double nv = norm(v);
-    return v * (nv ? 1./nv : 0.);
-}
-
-
-
-//////////////////////////////// vec comma initializer //////////////////////////////////
-
-
-template<typename _Tp, typename _T2, int cn> static inline
-VecCommaInitializer<_Tp, cn> operator << (const Vec<_Tp, cn>& vec, _T2 val)
-{
-    VecCommaInitializer<_Tp, cn> commaInitializer((Vec<_Tp, cn>*)&vec);
-    return (commaInitializer, val);
-}
-
-template<typename _Tp, int cn> inline
-VecCommaInitializer<_Tp, cn>::VecCommaInitializer(Vec<_Tp, cn>* _vec)
-    : MatxCommaInitializer<_Tp, cn, 1>(_vec)
-{}
-
-template<typename _Tp, int cn> template<typename _T2> inline
-VecCommaInitializer<_Tp, cn>& VecCommaInitializer<_Tp, cn>::operator , (_T2 value)
-{
-    CV_DbgAssert( this->idx < cn );
-    this->dst->val[this->idx++] = saturate_cast<_Tp>(value);
-    return *this;
-}
-
-template<typename _Tp, int cn> inline
-Vec<_Tp, cn> VecCommaInitializer<_Tp, cn>::operator *() const
-{
-    CV_DbgAssert( this->idx == cn );
-    return *this->dst;
-}
-
-//! @endcond
-
-///////////////////////////// Matx out-of-class operators ////////////////////////////////
-
-//! @relates cv::Matx
-//! @{
-
-template<typename _Tp1, typename _Tp2, int m, int n> static inline
-Matx<_Tp1, m, n>& operator += (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b)
-{
-    for( int i = 0; i < m*n; i++ )
-        a.val[i] = saturate_cast<_Tp1>(a.val[i] + b.val[i]);
-    return a;
-}
-
-template<typename _Tp1, typename _Tp2, int m, int n> static inline
-Matx<_Tp1, m, n>& operator -= (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b)
-{
-    for( int i = 0; i < m*n; i++ )
-        a.val[i] = saturate_cast<_Tp1>(a.val[i] - b.val[i]);
-    return a;
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n> operator + (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
-{
-    return Matx<_Tp, m, n>(a, b, Matx_AddOp());
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
-{
-    return Matx<_Tp, m, n>(a, b, Matx_SubOp());
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, int alpha)
-{
-    for( int i = 0; i < m*n; i++ )
-        a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha);
-    return a;
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, float alpha)
-{
-    for( int i = 0; i < m*n; i++ )
-        a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha);
-    return a;
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, double alpha)
-{
-    for( int i = 0; i < m*n; i++ )
-        a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha);
-    return a;
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, int alpha)
-{
-    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, float alpha)
-{
-    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, double alpha)
-{
-    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n> operator * (int alpha, const Matx<_Tp, m, n>& a)
-{
-    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n> operator * (float alpha, const Matx<_Tp, m, n>& a)
-{
-    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a)
-{
-    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha)
-{
-    for( int i = 0; i < m*n; i++ )
-        a.val[i] = a.val[i] / alpha;
-    return a;
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha)
-{
-    for( int i = 0; i < m*n; i++ )
-        a.val[i] = a.val[i] / alpha;
-    return a;
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha)
-{
-    return Matx<_Tp, m, n>(a, 1.f/alpha, Matx_ScaleOp());
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, double alpha)
-{
-    return Matx<_Tp, m, n>(a, 1./alpha, Matx_ScaleOp());
-}
-
-template<typename _Tp, int m, int n> static inline
-Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a)
-{
-    return Matx<_Tp, m, n>(a, -1, Matx_ScaleOp());
-}
-
-template<typename _Tp, int m, int n, int l> static inline
-Matx<_Tp, m, n> operator * (const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b)
-{
-    return Matx<_Tp, m, n>(a, b, Matx_MatMulOp());
-}
-
-template<typename _Tp, int m, int n> static inline
-Vec<_Tp, m> operator * (const Matx<_Tp, m, n>& a, const Vec<_Tp, n>& b)
-{
-    Matx<_Tp, m, 1> c(a, b, Matx_MatMulOp());
-    return (const Vec<_Tp, m>&)(c);
-}
-
-template<typename _Tp, int m, int n> static inline
-bool operator == (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
-{
-    for( int i = 0; i < m*n; i++ )
-        if( a.val[i] != b.val[i] ) return false;
-    return true;
-}
-
-template<typename _Tp, int m, int n> static inline
-bool operator != (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
-{
-    return !(a == b);
-}
-
-//! @}
-
-////////////////////////////// Vec out-of-class operators ////////////////////////////////
-
-//! @relates cv::Vec
-//! @{
+Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v);
 
 template<typename _Tp1, typename _Tp2, int cn> static inline
-Vec<_Tp1, cn>& operator += (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b)
-{
-    for( int i = 0; i < cn; i++ )
-        a.val[i] = saturate_cast<_Tp1>(a.val[i] + b.val[i]);
-    return a;
-}
+Vec<_Tp1, cn>& operator += (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b);
 
 template<typename _Tp1, typename _Tp2, int cn> static inline
-Vec<_Tp1, cn>& operator -= (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b)
-{
-    for( int i = 0; i < cn; i++ )
-        a.val[i] = saturate_cast<_Tp1>(a.val[i] - b.val[i]);
-    return a;
-}
+Vec<_Tp1, cn>& operator -= (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn> operator + (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b)
-{
-    return Vec<_Tp, cn>(a, b, Matx_AddOp());
-}
+Vec<_Tp, cn> operator + (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b)
-{
-    return Vec<_Tp, cn>(a, b, Matx_SubOp());
-}
+Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, int alpha)
-{
-    for( int i = 0; i < cn; i++ )
-        a[i] = saturate_cast<_Tp>(a[i]*alpha);
-    return a;
-}
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, int alpha);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, float alpha)
-{
-    for( int i = 0; i < cn; i++ )
-        a[i] = saturate_cast<_Tp>(a[i]*alpha);
-    return a;
-}
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, float alpha);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, double alpha)
-{
-    for( int i = 0; i < cn; i++ )
-        a[i] = saturate_cast<_Tp>(a[i]*alpha);
-    return a;
-}
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, double alpha);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, int alpha)
-{
-    double ialpha = 1./alpha;
-    for( int i = 0; i < cn; i++ )
-        a[i] = saturate_cast<_Tp>(a[i]*ialpha);
-    return a;
-}
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, int alpha);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, float alpha)
-{
-    float ialpha = 1.f/alpha;
-    for( int i = 0; i < cn; i++ )
-        a[i] = saturate_cast<_Tp>(a[i]*ialpha);
-    return a;
-}
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, float alpha);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, double alpha)
-{
-    double ialpha = 1./alpha;
-    for( int i = 0; i < cn; i++ )
-        a[i] = saturate_cast<_Tp>(a[i]*ialpha);
-    return a;
-}
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, double alpha);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, int alpha)
-{
-    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
-}
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, int alpha);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn> operator * (int alpha, const Vec<_Tp, cn>& a)
-{
-    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
-}
+Vec<_Tp, cn> operator * (int alpha, const Vec<_Tp, cn>& a);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, float alpha)
-{
-    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
-}
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, float alpha);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn> operator * (float alpha, const Vec<_Tp, cn>& a)
-{
-    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
-}
+Vec<_Tp, cn> operator * (float alpha, const Vec<_Tp, cn>& a);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, double alpha)
-{
-    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
-}
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, double alpha);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn> operator * (double alpha, const Vec<_Tp, cn>& a)
-{
-    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
-}
+Vec<_Tp, cn> operator * (double alpha, const Vec<_Tp, cn>& a);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, int alpha)
-{
-    return Vec<_Tp, cn>(a, 1./alpha, Matx_ScaleOp());
-}
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, int alpha);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, float alpha)
-{
-    return Vec<_Tp, cn>(a, 1.f/alpha, Matx_ScaleOp());
-}
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, float alpha);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, double alpha)
-{
-    return Vec<_Tp, cn>(a, 1./alpha, Matx_ScaleOp());
-}
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, double alpha);
 
 template<typename _Tp, int cn> static inline
-Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a)
-{
-    Vec<_Tp,cn> t;
-    for( int i = 0; i < cn; i++ ) t.val[i] = saturate_cast<_Tp>(-a.val[i]);
-    return t;
-}
+Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a);
 
-template<typename _Tp> inline Vec<_Tp, 4> operator * (const Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2)
-{
-    return Vec<_Tp, 4>(saturate_cast<_Tp>(v1[0]*v2[0] - v1[1]*v2[1] - v1[2]*v2[2] - v1[3]*v2[3]),
-                       saturate_cast<_Tp>(v1[0]*v2[1] + v1[1]*v2[0] + v1[2]*v2[3] - v1[3]*v2[2]),
-                       saturate_cast<_Tp>(v1[0]*v2[2] - v1[1]*v2[3] + v1[2]*v2[0] + v1[3]*v2[1]),
-                       saturate_cast<_Tp>(v1[0]*v2[3] + v1[1]*v2[2] - v1[2]*v2[1] + v1[3]*v2[0]));
-}
+template<typename _Tp> inline
+Vec<_Tp, 4> operator * (const Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2);
 
-template<typename _Tp> inline Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2)
-{
-    v1 = v1 * v2;
-    return v1;
-}
+template<typename _Tp> inline
+Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2);
 
-//! @}
+//! @} core_basic
 
 } // cv
 
+#include "opencv2/core/matx.inl.hpp"
+
 #endif // OPENCV_CORE_MATX_HPP
diff --git a/modules/core/include/opencv2/core/matx.inl.hpp b/modules/core/include/opencv2/core/matx.inl.hpp
new file mode 100644
index 000000000000..faa3e749d62f
--- /dev/null
+++ b/modules/core/include/opencv2/core/matx.inl.hpp
@@ -0,0 +1,1115 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_MATX_INL_HPP
+#define OPENCV_CORE_MATX_INL_HPP
+
+#ifndef __cplusplus
+#  error matx.inl.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/matx.hpp"
+
+namespace cv
+{
+
+//==============================================================================
+// Helpers
+
+namespace internal
+{
+
+template<typename _Tp, int m> struct Matx_DetOp
+{
+    double operator ()(const Matx<_Tp, m, m>& a) const
+    {
+        Matx<_Tp, m, m> temp = a;
+        double p = LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
+        if( p == 0 )
+            return p;
+        for( int i = 0; i < m; i++ )
+            p *= temp(i, i);
+        return p;
+    }
+};
+
+template<typename _Tp> struct Matx_DetOp<_Tp, 1>
+{
+    double operator ()(const Matx<_Tp, 1, 1>& a) const
+    {
+        return a(0,0);
+    }
+};
+
+template<typename _Tp> struct Matx_DetOp<_Tp, 2>
+{
+    double operator ()(const Matx<_Tp, 2, 2>& a) const
+    {
+        return a(0,0)*a(1,1) - a(0,1)*a(1,0);
+    }
+};
+
+template<typename _Tp> struct Matx_DetOp<_Tp, 3>
+{
+    double operator ()(const Matx<_Tp, 3, 3>& a) const
+    {
+        return a(0,0)*(a(1,1)*a(2,2) - a(2,1)*a(1,2)) -
+            a(0,1)*(a(1,0)*a(2,2) - a(2,0)*a(1,2)) +
+            a(0,2)*(a(1,0)*a(2,1) - a(2,0)*a(1,1));
+    }
+};
+
+template<typename _Tp> Vec<_Tp, 2> inline conjugate(const Vec<_Tp, 2>& v)
+{
+    return Vec<_Tp, 2>(v[0], -v[1]);
+}
+
+template<typename _Tp> Vec<_Tp, 4> inline conjugate(const Vec<_Tp, 4>& v)
+{
+    return Vec<_Tp, 4>(v[0], -v[1], -v[2], -v[3]);
+}
+
+} // internal::
+
+
+//==============================================================================
+// Matx
+
+template<typename _Tp, int m, int n> class DataType< Matx<_Tp, m, n> >
+{
+public:
+    typedef Matx<_Tp, m, n>                               value_type;
+    typedef Matx<typename DataType<_Tp>::work_type, m, n> work_type;
+    typedef _Tp                                           channel_type;
+    typedef value_type                                    vec_type;
+
+    enum { generic_type = 0,
+           channels     = m * n,
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+};
+
+
+namespace traits {
+template<typename _Tp, int m, int n>
+struct Depth< Matx<_Tp, m, n> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp, int m, int n>
+struct Type< Matx<_Tp, m, n> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, n*m) }; };
+} // namespace
+
+
+//! @brief  Comma-separated Matrix Initializer
+template<typename _Tp, int m, int n> class MatxCommaInitializer
+{
+public:
+    MatxCommaInitializer(Matx<_Tp, m, n>* _mtx);
+    template<typename T2> MatxCommaInitializer<_Tp, m, n>& operator , (T2 val);
+    Matx<_Tp, m, n> operator *() const;
+
+    Matx<_Tp, m, n>* dst;
+    int idx;
+};
+
+template<typename _Tp, typename _T2, int m, int n> static inline
+MatxCommaInitializer<_Tp, m, n> operator << (const Matx<_Tp, m, n>& mtx, _T2 val)
+{
+    MatxCommaInitializer<_Tp, m, n> commaInitializer((Matx<_Tp, m, n>*)&mtx);
+    return (commaInitializer, val);
+}
+
+template<typename _Tp, int m, int n> inline
+MatxCommaInitializer<_Tp, m, n>::MatxCommaInitializer(Matx<_Tp, m, n>* _mtx)
+    : dst(_mtx), idx(0)
+{}
+
+template<typename _Tp, int m, int n> template<typename _T2> inline
+MatxCommaInitializer<_Tp, m, n>& MatxCommaInitializer<_Tp, m, n>::operator , (_T2 value)
+{
+    CV_DbgAssert( idx < m*n );
+    dst->val[idx++] = saturate_cast<_Tp>(value);
+    return *this;
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n> MatxCommaInitializer<_Tp, m, n>::operator *() const
+{
+    CV_DbgAssert( idx == n*m );
+    return *dst;
+}
+
+////////////////////////////////// Matx Implementation ///////////////////////////////////
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx()
+{
+    for(int i = 0; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0)
+{
+    val[0] = v0;
+    for(int i = 1; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1)
+{
+    CV_StaticAssert(channels >= 2, "Matx should have at least 2 elements.");
+    val[0] = v0; val[1] = v1;
+    for(int i = 2; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2)
+{
+    CV_StaticAssert(channels >= 3, "Matx should have at least 3 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2;
+    for(int i = 3; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3)
+{
+    CV_StaticAssert(channels >= 4, "Matx should have at least 4 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    for(int i = 4; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4)
+{
+    CV_StaticAssert(channels >= 5, "Matx should have at least 5 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; val[4] = v4;
+    for(int i = 5; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5)
+{
+    CV_StaticAssert(channels >= 6, "Matx should have at least 6 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5;
+    for(int i = 6; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6)
+{
+    CV_StaticAssert(channels >= 7, "Matx should have at least 7 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6;
+    for(int i = 7; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7)
+{
+    CV_StaticAssert(channels >= 8, "Matx should have at least 8 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    for(int i = 8; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8)
+{
+    CV_StaticAssert(channels >= 9, "Matx should have at least 9 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8;
+    for(int i = 9; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9)
+{
+    CV_StaticAssert(channels >= 10, "Matx should have at least 10 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8; val[9] = v9;
+    for(int i = 10; i < channels; i++) val[i] = _Tp(0);
+}
+
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11)
+{
+    CV_StaticAssert(channels >= 12, "Matx should have at least 12 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
+    for(int i = 12; i < channels; i++) val[i] = _Tp(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13)
+{
+    CV_StaticAssert(channels >= 14, "Matx should have at least 14 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
+    val[12] = v12; val[13] = v13;
+    for (int i = 14; i < channels; i++) val[i] = _Tp(0);
+}
+
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13, _Tp v14, _Tp v15)
+{
+    CV_StaticAssert(channels >= 16, "Matx should have at least 16 elements.");
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
+    val[12] = v12; val[13] = v13; val[14] = v14; val[15] = v15;
+    for(int i = 16; i < channels; i++) val[i] = _Tp(0);
+}
+
+// WARNING: unreachable code using Ninja
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(push)
+#pragma warning(disable: 4702)
+#endif
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(const _Tp* values)
+{
+    for( int i = 0; i < channels; i++ ) val[i] = values[i];
+}
+#if defined _MSC_VER && _MSC_VER >= 1920
+#pragma warning(pop)
+#endif
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n>::Matx(std::initializer_list<_Tp> list)
+{
+    CV_DbgAssert(list.size() == channels);
+    int i = 0;
+    for(const auto& elem : list)
+    {
+        val[i++] = elem;
+    }
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n> Matx<_Tp, m, n>::all(_Tp alpha)
+{
+    Matx<_Tp, m, n> M;
+    for( int i = 0; i < m*n; i++ ) M.val[i] = alpha;
+    return M;
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n> Matx<_Tp,m,n>::zeros()
+{
+    return all(0);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n> Matx<_Tp,m,n>::ones()
+{
+    return all(1);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n> Matx<_Tp,m,n>::eye()
+{
+    Matx<_Tp,m,n> M;
+    for(int i = 0; i < shortdim; i++)
+        M(i,i) = 1;
+    return M;
+}
+
+template<typename _Tp, int m, int n> inline
+_Tp Matx<_Tp, m, n>::dot(const Matx<_Tp, m, n>& M) const
+{
+    _Tp s = 0;
+    for( int i = 0; i < channels; i++ ) s += val[i]*M.val[i];
+    return s;
+}
+
+template<typename _Tp, int m, int n> inline
+double Matx<_Tp, m, n>::ddot(const Matx<_Tp, m, n>& M) const
+{
+    double s = 0;
+    for( int i = 0; i < channels; i++ ) s += (double)val[i]*M.val[i];
+    return s;
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n> Matx<_Tp,m,n>::diag(const typename Matx<_Tp,m,n>::diag_type& d)
+{
+    Matx<_Tp,m,n> M;
+    for(int i = 0; i < shortdim; i++)
+        M(i,i) = d(i, 0);
+    return M;
+}
+
+template<typename _Tp, int m, int n> template<typename T2>
+inline Matx<_Tp, m, n>::operator Matx<T2, m, n>() const
+{
+    Matx<T2, m, n> M;
+    for( int i = 0; i < m*n; i++ ) M.val[i] = saturate_cast<T2>(val[i]);
+    return M;
+}
+
+template<typename _Tp, int m, int n> template<int m1, int n1> inline
+Matx<_Tp, m1, n1> Matx<_Tp, m, n>::reshape() const
+{
+    CV_StaticAssert(m1*n1 == m*n, "Input and destination matrices must have the same number of elements");
+    return (const Matx<_Tp, m1, n1>&)*this;
+}
+
+template<typename _Tp, int m, int n>
+template<int m1, int n1> inline
+Matx<_Tp, m1, n1> Matx<_Tp, m, n>::get_minor(int base_row, int base_col) const
+{
+    CV_DbgAssert(0 <= base_row && base_row+m1 <= m && 0 <= base_col && base_col+n1 <= n);
+    Matx<_Tp, m1, n1> s;
+    for( int di = 0; di < m1; di++ )
+        for( int dj = 0; dj < n1; dj++ )
+            s(di, dj) = (*this)(base_row+di, base_col+dj);
+    return s;
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, 1, n> Matx<_Tp, m, n>::row(int i) const
+{
+    CV_DbgAssert((unsigned)i < (unsigned)m);
+    return Matx<_Tp, 1, n>(&val[i*n]);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, 1> Matx<_Tp, m, n>::col(int j) const
+{
+    CV_DbgAssert((unsigned)j < (unsigned)n);
+    Matx<_Tp, m, 1> v;
+    for( int i = 0; i < m; i++ )
+        v.val[i] = val[i*n + j];
+    return v;
+}
+
+template<typename _Tp, int m, int n> inline
+typename Matx<_Tp, m, n>::diag_type Matx<_Tp, m, n>::diag() const
+{
+    diag_type d;
+    for( int i = 0; i < shortdim; i++ )
+        d.val[i] = val[i*n + i];
+    return d;
+}
+
+template<typename _Tp, int m, int n> inline
+const _Tp& Matx<_Tp, m, n>::operator()(int row_idx, int col_idx) const
+{
+    CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n );
+    return this->val[row_idx*n + col_idx];
+}
+
+template<typename _Tp, int m, int n> inline
+_Tp& Matx<_Tp, m, n>::operator ()(int row_idx, int col_idx)
+{
+    CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n );
+    return val[row_idx*n + col_idx];
+}
+
+template<typename _Tp, int m, int n> inline
+const _Tp& Matx<_Tp, m, n>::operator ()(int i) const
+{
+    CV_StaticAssert(m == 1 || n == 1, "Single index indexation requires matrix to be a column or a row");
+    CV_DbgAssert( (unsigned)i < (unsigned)(m+n-1) );
+    return val[i];
+}
+
+template<typename _Tp, int m, int n> inline
+_Tp& Matx<_Tp, m, n>::operator ()(int i)
+{
+    CV_StaticAssert(m == 1 || n == 1, "Single index indexation requires matrix to be a column or a row");
+    CV_DbgAssert( (unsigned)i < (unsigned)(m+n-1) );
+    return val[i];
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_AddOp)
+{
+    for( int i = 0; i < channels; i++ )
+        val[i] = saturate_cast<_Tp>(a.val[i] + b.val[i]);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_SubOp)
+{
+    for( int i = 0; i < channels; i++ )
+        val[i] = saturate_cast<_Tp>(a.val[i] - b.val[i]);
+}
+
+template<typename _Tp, int m, int n> template<typename _T2> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, _T2 alpha, Matx_ScaleOp)
+{
+    for( int i = 0; i < channels; i++ )
+        val[i] = saturate_cast<_Tp>(a.val[i] * alpha);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_MulOp)
+{
+    for( int i = 0; i < channels; i++ )
+        val[i] = saturate_cast<_Tp>(a.val[i] * b.val[i]);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_DivOp)
+{
+    for( int i = 0; i < channels; i++ )
+        val[i] = saturate_cast<_Tp>(a.val[i] / b.val[i]);
+}
+
+template<typename _Tp, int m, int n> template<int l> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp)
+{
+    for( int i = 0; i < m; i++ )
+        for( int j = 0; j < n; j++ )
+        {
+            _Tp s = 0;
+            for( int k = 0; k < l; k++ )
+                s += a(i, k) * b(k, j);
+            val[i*n + j] = s;
+        }
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, n, m>& a, Matx_TOp)
+{
+    for( int i = 0; i < m; i++ )
+        for( int j = 0; j < n; j++ )
+            val[i*n + j] = a(j, i);
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n> Matx<_Tp, m, n>::mul(const Matx<_Tp, m, n>& a) const
+{
+    return Matx<_Tp, m, n>(*this, a, Matx_MulOp());
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, m, n> Matx<_Tp, m, n>::div(const Matx<_Tp, m, n>& a) const
+{
+    return Matx<_Tp, m, n>(*this, a, Matx_DivOp());
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, n, m> Matx<_Tp, m, n>::t() const
+{
+    return Matx<_Tp, n, m>(*this, Matx_TOp());
+}
+
+template<typename _Tp, int m, int n> inline
+Vec<_Tp, n> Matx<_Tp, m, n>::solve(const Vec<_Tp, m>& rhs, int method) const
+{
+    Matx<_Tp, n, 1> x = solve((const Matx<_Tp, m, 1>&)(rhs), method);
+    return (Vec<_Tp, n>&)(x);
+}
+
+template<typename _Tp, int m> static inline
+double determinant(const Matx<_Tp, m, m>& a)
+{
+    return cv::internal::Matx_DetOp<_Tp, m>()(a);
+}
+
+template<typename _Tp, int m, int n> static inline
+double trace(const Matx<_Tp, m, n>& a)
+{
+    _Tp s = 0;
+    for( int i = 0; i < std::min(m, n); i++ )
+        s += a(i,i);
+    return s;
+}
+
+template<typename _Tp, int m, int n> static inline
+double norm(const Matx<_Tp, m, n>& M)
+{
+    return std::sqrt(normL2Sqr<_Tp, double>(M.val, m*n));
+}
+
+template<typename _Tp, int m, int n> static inline
+double norm(const Matx<_Tp, m, n>& M, int normType)
+{
+    switch(normType) {
+    case NORM_INF:
+        return (double)normInf<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n);
+    case NORM_L1:
+        return (double)normL1<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n);
+    case NORM_L2SQR:
+        return (double)normL2Sqr<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n);
+    default:
+    case NORM_L2:
+        return std::sqrt((double)normL2Sqr<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n));
+    }
+}
+
+template<typename _Tp1, typename _Tp2, int m, int n> static inline
+Matx<_Tp1, m, n>& operator += (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp1>(a.val[i] + b.val[i]);
+    return a;
+}
+
+template<typename _Tp1, typename _Tp2, int m, int n> static inline
+Matx<_Tp1, m, n>& operator -= (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp1>(a.val[i] - b.val[i]);
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator + (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    return Matx<_Tp, m, n>(a, b, Matx_AddOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    return Matx<_Tp, m, n>(a, b, Matx_SubOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, int alpha)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha);
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, float alpha)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha);
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, double alpha)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha);
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, int alpha)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, float alpha)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, double alpha)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (int alpha, const Matx<_Tp, m, n>& a)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (float alpha, const Matx<_Tp, m, n>& a)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = a.val[i] / alpha;
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha)
+{
+    for( int i = 0; i < m*n; i++ )
+        a.val[i] = a.val[i] / alpha;
+    return a;
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha)
+{
+    return Matx<_Tp, m, n>(a, 1.f/alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, double alpha)
+{
+    return Matx<_Tp, m, n>(a, 1./alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a)
+{
+    return Matx<_Tp, m, n>(a, -1, Matx_ScaleOp());
+}
+
+template<typename _Tp, int m, int n, int l> static inline
+Matx<_Tp, m, n> operator * (const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b)
+{
+    return Matx<_Tp, m, n>(a, b, Matx_MatMulOp());
+}
+
+template<typename _Tp, int m, int n> static inline
+Vec<_Tp, m> operator * (const Matx<_Tp, m, n>& a, const Vec<_Tp, n>& b)
+{
+    Matx<_Tp, m, 1> c(a, b, Matx_MatMulOp());
+    return (const Vec<_Tp, m>&)(c);
+}
+
+template<typename _Tp, int m, int n> static inline
+bool operator == (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    for( int i = 0; i < m*n; i++ )
+        if( a.val[i] != b.val[i] ) return false;
+    return true;
+}
+
+template<typename _Tp, int m, int n> static inline
+bool operator != (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    return !(a == b);
+}
+
+//==============================================================================
+// Vec
+
+template<typename _Tp, int cn> class DataType< Vec<_Tp, cn> >
+{
+public:
+    typedef Vec<_Tp, cn>                               value_type;
+    typedef Vec<typename DataType<_Tp>::work_type, cn> work_type;
+    typedef _Tp                                        channel_type;
+    typedef value_type                                 vec_type;
+
+    enum { generic_type = 0,
+           channels     = cn,
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8),
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           depth        = DataType<channel_type>::depth,
+           type         = CV_MAKETYPE(depth, channels),
+#endif
+           _dummy_enum_finalizer = 0
+         };
+};
+
+namespace traits {
+template<typename _Tp, int cn>
+struct Depth< Vec<_Tp, cn> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp, int cn>
+struct Type< Vec<_Tp, cn> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, cn) }; };
+} // namespace
+
+/** @brief  Comma-separated Vec Initializer
+*/
+template<typename _Tp, int m> class VecCommaInitializer : public MatxCommaInitializer<_Tp, m, 1>
+{
+public:
+    VecCommaInitializer(Vec<_Tp, m>* _vec);
+    template<typename T2> VecCommaInitializer<_Tp, m>& operator , (T2 val);
+    Vec<_Tp, m> operator *() const;
+};
+
+template<typename _Tp, typename _T2, int cn> static inline
+VecCommaInitializer<_Tp, cn> operator << (const Vec<_Tp, cn>& vec, _T2 val)
+{
+    VecCommaInitializer<_Tp, cn> commaInitializer((Vec<_Tp, cn>*)&vec);
+    return (commaInitializer, val);
+}
+
+template<typename _Tp, int cn> inline
+VecCommaInitializer<_Tp, cn>::VecCommaInitializer(Vec<_Tp, cn>* _vec)
+    : MatxCommaInitializer<_Tp, cn, 1>(_vec)
+{}
+
+template<typename _Tp, int cn> template<typename _T2> inline
+VecCommaInitializer<_Tp, cn>& VecCommaInitializer<_Tp, cn>::operator , (_T2 value)
+{
+    CV_DbgAssert( this->idx < cn );
+    this->dst->val[this->idx++] = saturate_cast<_Tp>(value);
+    return *this;
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> VecCommaInitializer<_Tp, cn>::operator *() const
+{
+    CV_DbgAssert( this->idx == cn );
+    return *this->dst;
+}
+
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec() {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0)
+    : Matx<_Tp, cn, 1>(v0) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1)
+    : Matx<_Tp, cn, 1>(v0, v1) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2)
+    : Matx<_Tp, cn, 1>(v0, v1, v2) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(const _Tp* values)
+    : Matx<_Tp, cn, 1>(values) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(std::initializer_list<_Tp> list)
+    : Matx<_Tp, cn, 1>(list) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(const Vec<_Tp, cn>& m)
+    : Matx<_Tp, cn, 1>(m.val) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp op)
+    : Matx<_Tp, cn, 1>(a, b, op) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp op)
+    : Matx<_Tp, cn, 1>(a, b, op) {}
+
+template<typename _Tp, int cn> template<typename _T2> inline
+Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, _T2 alpha, Matx_ScaleOp op)
+    : Matx<_Tp, cn, 1>(a, alpha, op) {}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> Vec<_Tp, cn>::all(_Tp alpha)
+{
+    Vec v;
+    for( int i = 0; i < cn; i++ ) v.val[i] = alpha;
+    return v;
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> Vec<_Tp, cn>::ones()
+{
+    return Vec::all(1);
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> Vec<_Tp, cn>::zeros()
+{
+    return Vec::all(0);
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> Vec<_Tp, cn>::mul(const Vec<_Tp, cn>& v) const
+{
+    Vec<_Tp, cn> w;
+    for( int i = 0; i < cn; i++ ) w.val[i] = saturate_cast<_Tp>(this->val[i]*v.val[i]);
+    return w;
+}
+
+template<> inline
+Vec<float, 2> Vec<float, 2>::conj() const
+{
+    return cv::internal::conjugate(*this);
+}
+
+template<> inline
+Vec<double, 2> Vec<double, 2>::conj() const
+{
+    return cv::internal::conjugate(*this);
+}
+
+template<> inline
+Vec<float, 4> Vec<float, 4>::conj() const
+{
+    return cv::internal::conjugate(*this);
+}
+
+template<> inline
+Vec<double, 4> Vec<double, 4>::conj() const
+{
+    return cv::internal::conjugate(*this);
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> Vec<_Tp, cn>::cross(const Vec<_Tp, cn>&) const
+{
+    CV_StaticAssert(cn == 3, "for arbitrary-size vector there is no cross-product defined");
+    return Vec<_Tp, cn>();
+}
+
+template<> inline
+Vec<float, 3> Vec<float, 3>::cross(const Vec<float, 3>& v) const
+{
+    return Vec<float,3>(this->val[1]*v.val[2] - this->val[2]*v.val[1],
+                     this->val[2]*v.val[0] - this->val[0]*v.val[2],
+                     this->val[0]*v.val[1] - this->val[1]*v.val[0]);
+}
+
+template<> inline
+Vec<double, 3> Vec<double, 3>::cross(const Vec<double, 3>& v) const
+{
+    return Vec<double,3>(this->val[1]*v.val[2] - this->val[2]*v.val[1],
+                     this->val[2]*v.val[0] - this->val[0]*v.val[2],
+                     this->val[0]*v.val[1] - this->val[1]*v.val[0]);
+}
+
+template<typename _Tp, int cn> template<typename T2> inline
+Vec<_Tp, cn>::operator Vec<T2, cn>() const
+{
+    Vec<T2, cn> v;
+    for( int i = 0; i < cn; i++ ) v.val[i] = saturate_cast<T2>(this->val[i]);
+    return v;
+}
+
+template<typename _Tp, int cn> inline
+const _Tp& Vec<_Tp, cn>::operator [](int i) const
+{
+    CV_DbgAssert( (unsigned)i < (unsigned)cn );
+    return this->val[i];
+}
+
+template<typename _Tp, int cn> inline
+_Tp& Vec<_Tp, cn>::operator [](int i)
+{
+    CV_DbgAssert( (unsigned)i < (unsigned)cn );
+    return this->val[i];
+}
+
+template<typename _Tp, int cn> inline
+const _Tp& Vec<_Tp, cn>::operator ()(int i) const
+{
+    CV_DbgAssert( (unsigned)i < (unsigned)cn );
+    return this->val[i];
+}
+
+template<typename _Tp, int cn> inline
+_Tp& Vec<_Tp, cn>::operator ()(int i)
+{
+    CV_DbgAssert( (unsigned)i < (unsigned)cn );
+    return this->val[i];
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v)
+{
+    double nv = norm(v);
+    return v * (nv ? 1./nv : 0.);
+}
+
+template<typename _Tp1, typename _Tp2, int cn> static inline
+Vec<_Tp1, cn>& operator += (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b)
+{
+    for( int i = 0; i < cn; i++ )
+        a.val[i] = saturate_cast<_Tp1>(a.val[i] + b.val[i]);
+    return a;
+}
+
+template<typename _Tp1, typename _Tp2, int cn> static inline
+Vec<_Tp1, cn>& operator -= (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b)
+{
+    for( int i = 0; i < cn; i++ )
+        a.val[i] = saturate_cast<_Tp1>(a.val[i] - b.val[i]);
+    return a;
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator + (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b)
+{
+    return Vec<_Tp, cn>(a, b, Matx_AddOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b)
+{
+    return Vec<_Tp, cn>(a, b, Matx_SubOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, int alpha)
+{
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*alpha);
+    return a;
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, float alpha)
+{
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*alpha);
+    return a;
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, double alpha)
+{
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*alpha);
+    return a;
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, int alpha)
+{
+    double ialpha = 1./alpha;
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*ialpha);
+    return a;
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, float alpha)
+{
+    float ialpha = 1.f/alpha;
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*ialpha);
+    return a;
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, double alpha)
+{
+    double ialpha = 1./alpha;
+    for( int i = 0; i < cn; i++ )
+        a[i] = saturate_cast<_Tp>(a[i]*ialpha);
+    return a;
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, int alpha)
+{
+    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (int alpha, const Vec<_Tp, cn>& a)
+{
+    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, float alpha)
+{
+    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (float alpha, const Vec<_Tp, cn>& a)
+{
+    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, double alpha)
+{
+    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (double alpha, const Vec<_Tp, cn>& a)
+{
+    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, int alpha)
+{
+    return Vec<_Tp, cn>(a, 1./alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, float alpha)
+{
+    return Vec<_Tp, cn>(a, 1.f/alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, double alpha)
+{
+    return Vec<_Tp, cn>(a, 1./alpha, Matx_ScaleOp());
+}
+
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a)
+{
+    Vec<_Tp,cn> t;
+    for( int i = 0; i < cn; i++ ) t.val[i] = saturate_cast<_Tp>(-a.val[i]);
+    return t;
+}
+
+template<typename _Tp> inline Vec<_Tp, 4> operator * (const Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2)
+{
+    return Vec<_Tp, 4>(saturate_cast<_Tp>(v1[0]*v2[0] - v1[1]*v2[1] - v1[2]*v2[2] - v1[3]*v2[3]),
+                       saturate_cast<_Tp>(v1[0]*v2[1] + v1[1]*v2[0] + v1[2]*v2[3] - v1[3]*v2[2]),
+                       saturate_cast<_Tp>(v1[0]*v2[2] - v1[1]*v2[3] + v1[2]*v2[0] + v1[3]*v2[1]),
+                       saturate_cast<_Tp>(v1[0]*v2[3] + v1[1]*v2[2] - v1[2]*v2[1] + v1[3]*v2[0]));
+}
+
+template<typename _Tp> inline Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2)
+{
+    v1 = v1 * v2;
+    return v1;
+}
+
+} // cv::
+
+#endif // OPENCV_CORE_MATX_INL_HPP
diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
index ade972973bf8..891fd678b74f 100644
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -127,6 +127,11 @@ class CV_EXPORTS_W_SIMPLE Device
     CV_WRAP int singleFPConfig() const;
     CV_WRAP int halfFPConfig() const;
 
+    /// true if 'cl_khr_fp64' extension is available
+    CV_WRAP bool hasFP64() const;
+    /// true if 'cl_khr_fp16' extension is available
+    CV_WRAP bool hasFP16() const;
+
     CV_WRAP bool endianLittle() const;
     CV_WRAP bool errorCorrectionSupport() const;
 
@@ -779,7 +784,7 @@ class CV_EXPORTS Timer
     void start();
     void stop();
 
-    uint64 durationNS() const; //< duration in nanoseconds
+    uint64 durationNS() const; ///< duration in nanoseconds
 
 protected:
     struct Impl;
diff --git a/modules/core/include/opencv2/core/opencl/opencl_info.hpp b/modules/core/include/opencv2/core/opencl/opencl_info.hpp
index 3ead76e5c46e..845efba9fc1d 100644
--- a/modules/core/include/opencv2/core/opencl/opencl_info.hpp
+++ b/modules/core/include/opencv2/core/opencl/opencl_info.hpp
@@ -3,6 +3,7 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include <iostream>
+#include <sstream>
 
 #include <opencv2/core.hpp>
 #include <opencv2/core/ocl.hpp>
@@ -140,13 +141,13 @@ static void dumpOpenCLInformation()
         DUMP_MESSAGE_STDOUT("    Max memory allocation size = " << maxMemAllocSizeStr);
         DUMP_CONFIG_PROPERTY("cv_ocl_current_maxMemAllocSize", device.maxMemAllocSize());
 
-        const char* doubleSupportStr = device.doubleFPConfig() > 0 ? "Yes" : "No";
+        const char* doubleSupportStr = device.hasFP64() ? "Yes" : "No";
         DUMP_MESSAGE_STDOUT("    Double support = " << doubleSupportStr);
-        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.doubleFPConfig() > 0);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.hasFP64());
 
-        const char* halfSupportStr = device.halfFPConfig() > 0 ? "Yes" : "No";
+        const char* halfSupportStr = device.hasFP16() ? "Yes" : "No";
         DUMP_MESSAGE_STDOUT("    Half support = " << halfSupportStr);
-        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.halfFPConfig() > 0);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.hasFP16());
 
         const char* isUnifiedMemoryStr = device.hostUnifiedMemory() ? "Yes" : "No";
         DUMP_MESSAGE_STDOUT("    Host unified memory = " << isUnifiedMemoryStr);
diff --git a/modules/core/include/opencv2/core/persistence.hpp b/modules/core/include/opencv2/core/persistence.hpp
index 8e135d1a1109..9c4f33fb1457 100644
--- a/modules/core/include/opencv2/core/persistence.hpp
+++ b/modules/core/include/opencv2/core/persistence.hpp
@@ -696,9 +696,6 @@ class CV_EXPORTS FileNodeIterator
 
 /////////////////// XML & YAML I/O implementation //////////////////
 
-//! @relates cv::FileStorage
-//! @{
-
 CV_EXPORTS void write( FileStorage& fs, const String& name, int value );
 CV_EXPORTS void write( FileStorage& fs, const String& name, float value );
 CV_EXPORTS void write( FileStorage& fs, const String& name, double value );
@@ -715,11 +712,6 @@ CV_EXPORTS void writeScalar( FileStorage& fs, float value );
 CV_EXPORTS void writeScalar( FileStorage& fs, double value );
 CV_EXPORTS void writeScalar( FileStorage& fs, const String& value );
 
-//! @}
-
-//! @relates cv::FileNode
-//! @{
-
 CV_EXPORTS void read(const FileNode& node, int& value, int default_value);
 CV_EXPORTS void read(const FileNode& node, float& value, float default_value);
 CV_EXPORTS void read(const FileNode& node, double& value, double default_value);
@@ -796,10 +788,7 @@ static inline void read(const FileNode& node, Range& value, const Range& default
     value.start = temp.x; value.end = temp.y;
 }
 
-//! @}
-
 /** @brief Writes string to a file storage.
-@relates cv::FileStorage
  */
 CV_EXPORTS FileStorage& operator << (FileStorage& fs, const String& str);
 
@@ -884,9 +873,6 @@ namespace internal
 
 //! @endcond
 
-//! @relates cv::FileStorage
-//! @{
-
 template<typename _Tp> static inline
 void write(FileStorage& fs, const _Tp& value)
 {
@@ -1118,10 +1104,6 @@ static inline void write(FileStorage& fs, const std::vector<DMatch>& vec)
 }
 #endif
 
-//! @} FileStorage
-
-//! @relates cv::FileNode
-//! @{
 
 static inline
 void read(const FileNode& node, bool& value, bool default_value)
@@ -1208,11 +1190,6 @@ void read( const FileNode& node, std::vector<DMatch>& vec, const std::vector<DMa
         read(node, vec);
 }
 
-//! @} FileNode
-
-//! @relates cv::FileStorage
-//! @{
-
 /** @brief Writes data to a file storage.
  */
 template<typename _Tp> static inline
@@ -1244,11 +1221,6 @@ FileStorage& operator << (FileStorage& fs, char* value)
     return (fs << String(value));
 }
 
-//! @} FileStorage
-
-//! @relates cv::FileNodeIterator
-//! @{
-
 /** @brief Reads data from a file storage.
  */
 template<typename _Tp> static inline
@@ -1268,11 +1240,6 @@ FileNodeIterator& operator >> (FileNodeIterator& it, std::vector<_Tp>& vec)
     return it;
 }
 
-//! @} FileNodeIterator
-
-//! @relates cv::FileNode
-//! @{
-
 /** @brief Reads data from a file storage.
  */
 template<typename _Tp> static inline
@@ -1323,11 +1290,6 @@ void operator >> (const FileNode& n, DMatch& m)
     it >> m.queryIdx >> m.trainIdx >> m.imgIdx >> m.distance;
 }
 
-//! @} FileNode
-
-//! @relates cv::FileNodeIterator
-//! @{
-
 CV_EXPORTS bool operator == (const FileNodeIterator& it1, const FileNodeIterator& it2);
 CV_EXPORTS bool operator != (const FileNodeIterator& it1, const FileNodeIterator& it2);
 
@@ -1343,8 +1305,6 @@ bool operator < (const FileNodeIterator& it1, const FileNodeIterator& it2)
     return it1.remaining() > it2.remaining();
 }
 
-//! @} FileNodeIterator
-
 } // cv
 
 #endif // OPENCV_CORE_PERSISTENCE_HPP
diff --git a/modules/core/include/opencv2/core/quaternion.inl.hpp b/modules/core/include/opencv2/core/quaternion.inl.hpp
index b901ecbc68fa..4204806a823e 100644
--- a/modules/core/include/opencv2/core/quaternion.inl.hpp
+++ b/modules/core/include/opencv2/core/quaternion.inl.hpp
@@ -28,7 +28,7 @@
 #define OPENCV_CORE_QUATERNION_INL_HPP
 
 #ifndef OPENCV_CORE_QUATERNION_HPP
-#erorr This is not a standalone header. Include quaternion.hpp instead.
+#error This is not a standalone header. Include quaternion.hpp instead.
 #endif
 
 //@cond IGNORE
diff --git a/modules/core/include/opencv2/core/saturate.hpp b/modules/core/include/opencv2/core/saturate.hpp
index e0cc965ab6f6..18ffd1c7af2c 100644
--- a/modules/core/include/opencv2/core/saturate.hpp
+++ b/modules/core/include/opencv2/core/saturate.hpp
@@ -158,20 +158,20 @@ template<> inline uint64 saturate_cast<uint64>(int64 v)      { return (uint64)st
 template<> inline int64 saturate_cast<int64>(uint64 v)       { return (int64)std::min(v, (uint64)LLONG_MAX); }
 
 /** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(float16_t v) { return saturate_cast<_Tp>((float)v); }
+template<typename _Tp> static inline _Tp saturate_cast(hfloat v) { return saturate_cast<_Tp>((float)v); }
 
 // in theory, we could use a LUT for 8u/8s->16f conversion,
 // but with hardware support for FP32->FP16 conversion the current approach is preferable
-template<> inline float16_t saturate_cast<float16_t>(uchar v)   { return float16_t((float)v); }
-template<> inline float16_t saturate_cast<float16_t>(schar v)   { return float16_t((float)v); }
-template<> inline float16_t saturate_cast<float16_t>(ushort v)  { return float16_t((float)v); }
-template<> inline float16_t saturate_cast<float16_t>(short v)   { return float16_t((float)v); }
-template<> inline float16_t saturate_cast<float16_t>(unsigned v){ return float16_t((float)v); }
-template<> inline float16_t saturate_cast<float16_t>(int v)     { return float16_t((float)v); }
-template<> inline float16_t saturate_cast<float16_t>(uint64 v)  { return float16_t((float)v); }
-template<> inline float16_t saturate_cast<float16_t>(int64 v)   { return float16_t((float)v); }
-template<> inline float16_t saturate_cast<float16_t>(float v)   { return float16_t(v); }
-template<> inline float16_t saturate_cast<float16_t>(double v)  { return float16_t((float)v); }
+template<> inline hfloat saturate_cast<hfloat>(uchar v)   { return hfloat((float)v); }
+template<> inline hfloat saturate_cast<hfloat>(schar v)   { return hfloat((float)v); }
+template<> inline hfloat saturate_cast<hfloat>(ushort v)  { return hfloat((float)v); }
+template<> inline hfloat saturate_cast<hfloat>(short v)   { return hfloat((float)v); }
+template<> inline hfloat saturate_cast<hfloat>(unsigned v){ return hfloat((float)v); }
+template<> inline hfloat saturate_cast<hfloat>(int v)     { return hfloat((float)v); }
+template<> inline hfloat saturate_cast<hfloat>(uint64 v)  { return hfloat((float)v); }
+template<> inline hfloat saturate_cast<hfloat>(int64 v)   { return hfloat((float)v); }
+template<> inline hfloat saturate_cast<hfloat>(float v)   { return hfloat(v); }
+template<> inline hfloat saturate_cast<hfloat>(double v)  { return hfloat((float)v); }
 
 //! @}
 
diff --git a/modules/core/include/opencv2/core/traits.hpp b/modules/core/include/opencv2/core/traits.hpp
index 52ab083ca43d..522519389bfb 100644
--- a/modules/core/include/opencv2/core/traits.hpp
+++ b/modules/core/include/opencv2/core/traits.hpp
@@ -261,10 +261,10 @@ template<> class DataType<double>
          };
 };
 
-template<> class DataType<float16_t>
+template<> class DataType<hfloat>
 {
 public:
-    typedef float16_t   value_type;
+    typedef hfloat   value_type;
     typedef float       work_type;
     typedef value_type  channel_type;
     typedef value_type  vec_type;
@@ -347,7 +347,7 @@ template<> class TypeDepth<CV_64F>
 template<> class TypeDepth<CV_16F>
 {
     enum { depth = CV_16F };
-    typedef float16_t value_type;
+    typedef hfloat value_type;
 };
 
 #endif
diff --git a/modules/core/include/opencv2/core/types.hpp b/modules/core/include/opencv2/core/types.hpp
index 844d8f895063..8e56d5dd93a7 100644
--- a/modules/core/include/opencv2/core/types.hpp
+++ b/modules/core/include/opencv2/core/types.hpp
@@ -89,7 +89,7 @@ template<typename _Tp> class Complex
     //! conjugation
     Complex conj() const;
 
-    _Tp re, im; //< the real and the imaginary parts
+    _Tp re, im; ///< the real and the imaginary parts
 };
 
 typedef Complex<float> Complexf;
@@ -558,7 +558,7 @@ class CV_EXPORTS_W_SIMPLE RotatedRect
     //! returns the minimal up-right integer rectangle containing the rotated rectangle
     CV_WRAP Rect boundingRect() const;
     //! returns the minimal (exact) floating point rectangle containing the rotated rectangle, not intended for use with images
-    Rect_<float> boundingRect2f() const;
+    CV_WRAP Rect2f boundingRect2f() const;
     //! returns the rectangle mass center
     CV_PROP_RW Point2f center;
     //! returns width and height of the rectangle
@@ -2031,8 +2031,8 @@ double jaccardDistance(const Rect_<_Tp>& a, const Rect_<_Tp>& b) {
 /** @brief Finds out if there is any intersection between two rectangles
  *
  * mainly useful for language bindings
- * @param rect1 First rectangle
- * @param rect2 Second rectangle
+ * @param a First rectangle
+ * @param b Second rectangle
  * @return the area of the intersection
  */
 CV_EXPORTS_W inline double rectangleIntersectionArea(const Rect2d& a, const Rect2d& b) { return (a & b).area(); }
diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h
index 32f3c8c99998..02d4a4f68058 100644
--- a/modules/core/include/opencv2/core/types_c.h
+++ b/modules/core/include/opencv2/core/types_c.h
@@ -90,13 +90,7 @@
 #include <float.h>
 #endif // SKIP_INCLUDES
 
-#if defined _WIN32
-#  define CV_CDECL __cdecl
-#  define CV_STDCALL __stdcall
-#else
-#  define CV_CDECL
-#  define CV_STDCALL
-#endif
+
 
 #ifndef CV_DEFAULT
 #  ifdef __cplusplus
@@ -203,21 +197,13 @@ enum {
 *                             Common macros and inline functions                         *
 \****************************************************************************************/
 
-#define CV_SWAP(a,b,t) ((t) = (a), (a) = (b), (b) = (t))
-
-/** min & max without jumps */
-#define  CV_IMIN(a, b)  ((a) ^ (((a)^(b)) & (((a) < (b)) - 1)))
-
-#define  CV_IMAX(a, b)  ((a) ^ (((a)^(b)) & (((a) > (b)) - 1)))
-
 /** absolute value without jumps */
 #ifndef __cplusplus
 #  define  CV_IABS(a)     (((a) ^ ((a) < 0 ? -1 : 0)) - ((a) < 0 ? -1 : 0))
 #else
 #  define  CV_IABS(a)     abs(a)
 #endif
-#define  CV_CMP(a,b)    (((a) > (b)) - ((a) < (b)))
-#define  CV_SIGN(a)     CV_CMP((a),0)
+
 
 #define cvInvSqrt(value) ((float)(1./sqrt(value)))
 #define cvSqrt(value)  ((float)sqrt(value))
@@ -675,8 +661,6 @@ CV_INLINE int cvIplDepth( int type )
 #define CV_MATND_MAGIC_VAL    0x42430000
 #define CV_TYPE_NAME_MATND    "opencv-nd-matrix"
 
-#define CV_MAX_DIM            32
-
 #ifdef __cplusplus
 typedef struct CvMatND CvMatND;
 CV_EXPORTS CvMatND cvMatND(const cv::Mat& m);
diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp
index db8c42976fa6..e491352bcf6c 100644
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -773,7 +773,7 @@ The sample below demonstrates how to use CommandLineParser:
 The keys parameter is a string containing several blocks, each one is enclosed in curly braces and
 describes one argument. Each argument contains three parts separated by the `|` symbol:
 
--# argument names is a space-separated list of option synonyms (to mark argument as positional, prefix it with the `@` symbol)
+-# argument names is a list of option synonyms separated by standard space characters ' ' (to mark argument as positional, prefix it with the `@` symbol)
 -# default value will be used if the argument was not provided (can be empty)
 -# help message (can be empty)
 
@@ -796,6 +796,8 @@ For example:
 Note that there are no default values for `help` and `timestamp` so we can check their presence using the `has()` method.
 Arguments with default values are considered to be always present. Use the `get()` method in these cases to check their
 actual value instead.
+Note that whitespace characters other than standard spaces are considered part of the string.
+Additionally, leading and trailing standard spaces around the help messages are ignored.
 
 String keys like `get<String>("@image1")` return the empty string `""` by default - even with an empty default value.
 Use the special `<none>` default value to enforce that the returned string must not be empty. (like in `get<String>("@image2")`)
diff --git a/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp b/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp
index eb5ecde16b0d..bbc6cf89799a 100644
--- a/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp
+++ b/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp
@@ -9,8 +9,6 @@
 
 //#define OPENCV_DISABLE_ALLOCATOR_STATS
 
-#ifdef CV_CXX11
-
 #include <atomic>
 
 #ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
@@ -26,14 +24,6 @@
 #define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE long long
 #endif
 
-#else  // CV_CXX11
-
-#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
-#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int  // CV_XADD supports int only
-#endif
-
-#endif  // CV_CXX11
-
 namespace cv { namespace utils {
 
 #ifdef CV__ALLOCATOR_STATS_LOG
@@ -59,7 +49,7 @@ class AllocatorStatistics : public AllocatorStatisticsInterface
     void onAllocate(size_t /*sz*/) {}
     void onFree(size_t /*sz*/) {}
 
-#elif defined(CV_CXX11)
+#else
 
 protected:
     typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t;
@@ -104,49 +94,7 @@ class AllocatorStatistics : public AllocatorStatisticsInterface
 #endif
         curr -= (counter_t)sz;
     }
-
-#else  // non C++11
-
-protected:
-    typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t;
-    volatile counter_t curr, total, total_allocs, peak;  // overflow is possible, CV_XADD operates with 'int' only
-public:
-    AllocatorStatistics()
-        : curr(0), total(0), total_allocs(0), peak(0)
-    {}
-    ~AllocatorStatistics() CV_OVERRIDE {}
-
-    uint64_t getCurrentUsage() const CV_OVERRIDE { return (uint64_t)curr; }
-    uint64_t getTotalUsage() const CV_OVERRIDE { return (uint64_t)total; }
-    uint64_t getNumberOfAllocations() const CV_OVERRIDE { return (uint64_t)total_allocs; }
-    uint64_t getPeakUsage() const CV_OVERRIDE { return (uint64_t)peak; }
-
-    void resetPeakUsage() CV_OVERRIDE { peak = curr; }
-
-    // Controller interface
-    void onAllocate(size_t sz)
-    {
-#ifdef CV__ALLOCATOR_STATS_LOG
-        CV__ALLOCATOR_STATS_LOG(cv::format("allocate: %lld (curr=%lld)", (long long int)sz, (long long int)curr));
-#endif
-
-        counter_t new_curr = (counter_t)CV_XADD(&curr, (counter_t)sz) + (counter_t)sz;
-
-        peak = std::max((counter_t)peak, new_curr);  // non-thread safe
-
-        //CV_XADD(&total, (uint64_t)sz);  // overflow with int, non-reliable...
-        total += sz;
-
-        CV_XADD(&total_allocs, (counter_t)1);
-    }
-    void onFree(size_t sz)
-    {
-#ifdef CV__ALLOCATOR_STATS_LOG
-        CV__ALLOCATOR_STATS_LOG(cv::format("free: %lld (curr=%lld)", (long long int)sz, (long long int)curr));
-#endif
-        CV_XADD(&curr, (counter_t)-sz);
-    }
-#endif
+#endif // OPENCV_DISABLE_ALLOCATOR_STATS
 };
 
 #ifdef CV__ALLOCATOR_STATS_LOG
diff --git a/modules/core/include/opencv2/core/utils/filesystem.private.hpp b/modules/core/include/opencv2/core/utils/filesystem.private.hpp
index c32be15c617c..c6bd5b316acf 100644
--- a/modules/core/include/opencv2/core/utils/filesystem.private.hpp
+++ b/modules/core/include/opencv2/core/utils/filesystem.private.hpp
@@ -12,7 +12,8 @@
 #  elif defined WINRT || defined _WIN32_WCE
      /* not supported */
 #  elif defined __ANDROID__ || defined __linux__ || defined _WIN32 || \
-        defined __FreeBSD__ || defined __bsdi__ || defined __HAIKU__
+        defined __FreeBSD__ || defined __bsdi__ || defined __HAIKU__ || \
+        defined __GNU__
 #      define OPENCV_HAVE_FILESYSTEM_SUPPORT 1
 #  elif defined(__APPLE__)
 #    include <TargetConditionals.h>
@@ -46,11 +47,11 @@ class CV_EXPORTS FileLock {
     explicit FileLock(const char* fname);
     ~FileLock();
 
-    void lock(); //< acquire exclusive (writer) lock
-    void unlock(); //< release exclusive (writer) lock
+    void lock(); ///< acquire exclusive (writer) lock
+    void unlock(); ///< release exclusive (writer) lock
 
-    void lock_shared(); //< acquire shareable (reader) lock
-    void unlock_shared(); //< release shareable (reader) lock
+    void lock_shared(); ///< acquire shareable (reader) lock
+    void unlock_shared(); ///< release shareable (reader) lock
 
     struct Impl;
 protected:
diff --git a/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp b/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp
index 53b8c48c38d1..23e48ee0eb6a 100644
--- a/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp
+++ b/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp
@@ -12,7 +12,7 @@
 
 #if defined(_WIN32)
 #include <windows.h>
-#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
+#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) || defined(__EMSCRIPTEN__)
 #include <dlfcn.h>
 #endif
 
@@ -65,7 +65,7 @@ void* getSymbol_(LibHandle_t h, const char* symbolName)
 {
 #if defined(_WIN32)
     return (void*)GetProcAddress(h, symbolName);
-#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
+#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) || defined(__EMSCRIPTEN__)
     return dlsym(h, symbolName);
 #endif
 }
@@ -79,7 +79,7 @@ LibHandle_t libraryLoad_(const FileSystemPath_t& filename)
 # else
     return LoadLibraryW(filename.c_str());
 #endif
-#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
+#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) || defined(__EMSCRIPTEN__)
     void* handle = dlopen(filename.c_str(), RTLD_NOW);
     CV_LOG_IF_DEBUG(NULL, !handle, "dlopen() error: " << dlerror());
     return handle;
@@ -91,7 +91,7 @@ void libraryRelease_(LibHandle_t h)
 {
 #if defined(_WIN32)
     FreeLibrary(h);
-#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
+#elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__) || defined(__EMSCRIPTEN__)
     dlclose(h);
 #endif
 }
diff --git a/modules/core/include/opencv2/core/utils/trace.hpp b/modules/core/include/opencv2/core/utils/trace.hpp
index ef5d35b4f2bb..ea43bbeea105 100644
--- a/modules/core/include/opencv2/core/utils/trace.hpp
+++ b/modules/core/include/opencv2/core/utils/trace.hpp
@@ -70,11 +70,11 @@ class CV_EXPORTS Region
     struct LocationExtraData;
     struct LocationStaticStorage
     {
-        LocationExtraData** ppExtra;   //< implementation specific data
-        const char* name;              //< region name (function name or other custom name)
-        const char* filename;          //< source code filename
-        int line;                      //< source code line
-        int flags;                     //< flags (implementation code path: Plain, IPP, OpenCL)
+        LocationExtraData** ppExtra;   ///< implementation specific data
+        const char* name;              ///< region name (function name or other custom name)
+        const char* filename;          ///< source code filename
+        int line;                      ///< source code line
+        int flags;                     ///< flags (implementation code path: Plain, IPP, OpenCL)
     };
 
     Region(const LocationStaticStorage& location);
@@ -100,18 +100,18 @@ class CV_EXPORTS Region
 
 //! Specify region flags
 enum RegionLocationFlag {
-    REGION_FLAG_FUNCTION = (1 << 0),             //< region is function (=1) / nested named region (=0)
-    REGION_FLAG_APP_CODE = (1 << 1),             //< region is Application code (=1) / OpenCV library code (=0)
-    REGION_FLAG_SKIP_NESTED = (1 << 2),          //< avoid processing of nested regions
+    REGION_FLAG_FUNCTION = (1 << 0),             ///< region is function (=1) / nested named region (=0)
+    REGION_FLAG_APP_CODE = (1 << 1),             ///< region is Application code (=1) / OpenCV library code (=0)
+    REGION_FLAG_SKIP_NESTED = (1 << 2),          ///< avoid processing of nested regions
 
-    REGION_FLAG_IMPL_IPP = (1 << 16),            //< region is part of IPP code path
-    REGION_FLAG_IMPL_OPENCL = (2 << 16),         //< region is part of OpenCL code path
-    REGION_FLAG_IMPL_OPENVX = (3 << 16),         //< region is part of OpenVX code path
+    REGION_FLAG_IMPL_IPP = (1 << 16),            ///< region is part of IPP code path
+    REGION_FLAG_IMPL_OPENCL = (2 << 16),         ///< region is part of OpenCL code path
+    REGION_FLAG_IMPL_OPENVX = (3 << 16),         ///< region is part of OpenVX code path
 
     REGION_FLAG_IMPL_MASK = (15 << 16),
 
     REGION_FLAG_REGION_FORCE = (1 << 30),
-    REGION_FLAG_REGION_NEXT = (1 << 31),         //< close previous region (see #CV_TRACE_REGION_NEXT macro)
+    REGION_FLAG_REGION_NEXT = (1 << 31),         ///< close previous region (see #CV_TRACE_REGION_NEXT macro)
 
     ENUM_REGION_FLAG_FORCE_INT = INT_MAX
 };
diff --git a/modules/core/include/opencv2/core/version.hpp b/modules/core/include/opencv2/core/version.hpp
index d5494f53a8da..80e3dc32bcde 100644
--- a/modules/core/include/opencv2/core/version.hpp
+++ b/modules/core/include/opencv2/core/version.hpp
@@ -6,7 +6,7 @@
 #define OPENCV_VERSION_HPP
 
 #define CV_VERSION_MAJOR    4
-#define CV_VERSION_MINOR    8
+#define CV_VERSION_MINOR    10
 #define CV_VERSION_REVISION 0
 #define CV_VERSION_STATUS   ""
 
diff --git a/modules/core/misc/java/test/CoreTest.java b/modules/core/misc/java/test/CoreTest.java
index c63cb23fab91..62d72b706fd3 100644
--- a/modules/core/misc/java/test/CoreTest.java
+++ b/modules/core/misc/java/test/CoreTest.java
@@ -947,11 +947,11 @@ public void testMagnitude() {
     }
 
     public void testMahalanobis() {
-        Mat src = new Mat(matSize, matSize, CvType.CV_32F);
+        Mat src = new Mat(matSize + 1, matSize, CvType.CV_32F);
         Core.randu(src, -128, 128);
 
         Mat covar = new Mat(matSize, matSize, CvType.CV_32F);
-        Mat mean = new Mat(1, matSize, CvType.CV_32F);
+        Mat mean = new Mat(1, matSize + 1, CvType.CV_32F);
         Core.calcCovarMatrix(src, covar, mean, Core.COVAR_ROWS | Core.COVAR_NORMAL, CvType.CV_32F);
         covar = covar.inv();
 
@@ -963,7 +963,6 @@ public void testMahalanobis() {
         assertEquals(0.0, d);
 
         d = Core.Mahalanobis(line1, line2, covar);
-
         assertTrue(d > 0.0);
     }
 
diff --git a/modules/core/misc/objc/common/Mat.h b/modules/core/misc/objc/common/Mat.h
index 42d4689e8f75..04fc8da82c30 100644
--- a/modules/core/misc/objc/common/Mat.h
+++ b/modules/core/misc/objc/common/Mat.h
@@ -15,7 +15,7 @@
 #import <Foundation/Foundation.h>
 
 #ifdef AVAILABLE_IMGCODECS
-#if TARGET_OS_IPHONE
+#if TARGET_OS_IPHONE || TARGET_OS_VISION
 #import <UIKit/UIKit.h>
 #elif TARGET_OS_MAC
 #import <AppKit/AppKit.h>
@@ -197,7 +197,7 @@ CV_EXPORTS @interface Mat : NSObject
 - (instancetype)initWithCGImage:(CGImageRef)image;
 - (instancetype)initWithCGImage:(CGImageRef)image alphaExist:(BOOL)alphaExist;
 
-#if TARGET_OS_IPHONE
+#if TARGET_OS_IPHONE || TARGET_OS_VISION
 
 - (UIImage*)toUIImage;
 - (instancetype)initWithUIImage:(UIImage*)image;
diff --git a/modules/core/misc/objc/common/Mat.mm b/modules/core/misc/objc/common/Mat.mm
index 80ada0b930b7..54387a477293 100644
--- a/modules/core/misc/objc/common/Mat.mm
+++ b/modules/core/misc/objc/common/Mat.mm
@@ -951,7 +951,7 @@ -(instancetype)initWithCGImage:(CGImageRef)image alphaExist:(BOOL)alphaExist {
     return [MatConverters convertCGImageRefToMat:image alphaExist:alphaExist];
 }
 
-#if TARGET_OS_IPHONE
+#if TARGET_OS_IPHONE || TARGET_OS_VISION
 
 -(UIImage*)toUIImage {
     return [MatConverters converMatToUIImage:self];
diff --git a/modules/core/misc/objc/common/Range.h b/modules/core/misc/objc/common/Range.h
index df0c01398f30..12bd48f54758 100644
--- a/modules/core/misc/objc/common/Range.h
+++ b/modules/core/misc/objc/common/Range.h
@@ -25,6 +25,9 @@ CV_EXPORTS @interface Range : NSObject
 
 @property int start;
 @property int end;
+#ifdef __cplusplus
+@property(readonly) cv::Range& nativeRef;
+#endif
 
 #pragma mark - Constructors
 
@@ -32,6 +35,10 @@ CV_EXPORTS @interface Range : NSObject
 - (instancetype)initWithStart:(int)start end:(int)end;
 - (instancetype)initWithVals:(NSArray<NSNumber*>*)vals;
 
+#ifdef __cplusplus
++ (instancetype)fromNative:(cv::Range&)range;
+#endif
+
 #pragma mark - Methods
 
 /**
diff --git a/modules/core/misc/objc/common/Range.m b/modules/core/misc/objc/common/Range.mm
similarity index 81%
rename from modules/core/misc/objc/common/Range.m
rename to modules/core/misc/objc/common/Range.mm
index a4e155214e94..55ba49e3ad0a 100644
--- a/modules/core/misc/objc/common/Range.m
+++ b/modules/core/misc/objc/common/Range.mm
@@ -1,12 +1,34 @@
 //
-//  Range.m
+//  Range.mm
 //
 //  Created by Giles Payne on 2019/10/08.
 //
 
 #import "Range.h"
 
-@implementation Range
+@implementation Range {
+    cv::Range native;
+}
+
+- (int)start {
+    return native.start;
+}
+
+- (void)setStart:(int)val {
+    native.start = val;
+}
+
+- (int)end {
+    return native.end;
+}
+
+- (void)setEnd:(int)val {
+    native.end = val;
+}
+
+- (cv::Range&)nativeRef {
+    return native;
+}
 
 - (instancetype)init {
     return [self initWithStart:0 end: 0];
@@ -29,6 +51,10 @@ - (instancetype)initWithVals:(NSArray<NSNumber*>*)vals {
     return self;
 }
 
++ (instancetype)fromNative:(cv::Range&)range {
+    return [[Range alloc] initWithStart:range.start end:range.end];
+}
+
 - (void)set:(NSArray<NSNumber*>*)vals {
     self.start = (vals != nil && vals.count > 0) ? vals[0].intValue : 0;
     self.end = (vals != nil && vals.count > 1 ) ? vals[1].intValue : 0;
diff --git a/modules/core/misc/objc/gen_dict.json b/modules/core/misc/objc/gen_dict.json
index 58300255dcb3..e01b32d6dc33 100644
--- a/modules/core/misc/objc/gen_dict.json
+++ b/modules/core/misc/objc/gen_dict.json
@@ -167,7 +167,9 @@
             "from_cpp": "[Point3i fromNative:%(n)s]"
         },
         "Range": {
-            "objc_type": "Range*"
+            "objc_type": "Range*",
+            "to_cpp": "%(n)s.nativeRef",
+            "from_cpp": "[Range fromNative:%(n)s]"
         },
         "Rect": {
             "objc_type": "Rect2i*",
diff --git a/modules/core/misc/python/package/mat_wrapper/__init__.py b/modules/core/misc/python/package/mat_wrapper/__init__.py
index 7309c32b01c8..8a1e4580c983 100644
--- a/modules/core/misc/python/package/mat_wrapper/__init__.py
+++ b/modules/core/misc/python/package/mat_wrapper/__init__.py
@@ -1,12 +1,19 @@
 __all__ = []
 
-import sys
 import numpy as np
 import cv2 as cv
+from typing import TYPE_CHECKING, Any
+
+# Same as cv2.typing.NumPyArrayNumeric, but avoids circular dependencies
+if TYPE_CHECKING:
+    _NumPyArrayNumeric = np.ndarray[Any, np.dtype[np.integer[Any] | np.floating[Any]]]
+else:
+    _NumPyArrayNumeric = np.ndarray
 
 # NumPy documentation: https://numpy.org/doc/stable/user/basics.subclassing.html
 
-class Mat(np.ndarray):
+
+class Mat(_NumPyArrayNumeric):
     '''
     cv.Mat wrapper for numpy array.
 
diff --git a/modules/core/misc/python/pyopencv_async.hpp b/modules/core/misc/python/pyopencv_async.hpp
index 6a8e73526e99..625365ac50b0 100644
--- a/modules/core/misc/python/pyopencv_async.hpp
+++ b/modules/core/misc/python/pyopencv_async.hpp
@@ -2,7 +2,7 @@
 
 #include "opencv2/core/async.hpp"
 
-CV_PY_TO_CLASS(AsyncArray);
-CV_PY_FROM_CLASS(AsyncArray);
+CV_PY_TO_CLASS(AsyncArray)
+CV_PY_FROM_CLASS(AsyncArray)
 
 #endif
diff --git a/modules/core/misc/python/pyopencv_cuda.hpp b/modules/core/misc/python/pyopencv_cuda.hpp
index 5be4977ca0f5..a424498f272c 100644
--- a/modules/core/misc/python/pyopencv_cuda.hpp
+++ b/modules/core/misc/python/pyopencv_cuda.hpp
@@ -20,18 +20,18 @@ template<> struct pyopencvVecConverter<cuda::GpuMat>
     }
 };
 
-CV_PY_TO_CLASS(cuda::GpuMat);
-CV_PY_TO_CLASS(cuda::Stream);
-CV_PY_TO_CLASS(cuda::Event);
-CV_PY_TO_CLASS(cuda::HostMem);
+CV_PY_TO_CLASS(cuda::GpuMat)
+CV_PY_TO_CLASS(cuda::Stream)
+CV_PY_TO_CLASS(cuda::Event)
+CV_PY_TO_CLASS(cuda::HostMem)
 
-CV_PY_TO_CLASS_PTR(cuda::GpuMat);
-CV_PY_TO_CLASS_PTR(cuda::GpuMat::Allocator);
+CV_PY_TO_CLASS_PTR(cuda::GpuMat)
+CV_PY_TO_CLASS_PTR(cuda::GpuMat::Allocator)
 
-CV_PY_FROM_CLASS(cuda::GpuMat);
-CV_PY_FROM_CLASS(cuda::Stream);
-CV_PY_FROM_CLASS(cuda::HostMem);
+CV_PY_FROM_CLASS(cuda::GpuMat)
+CV_PY_FROM_CLASS(cuda::Stream)
+CV_PY_FROM_CLASS(cuda::HostMem)
 
-CV_PY_FROM_CLASS_PTR(cuda::GpuMat::Allocator);
+CV_PY_FROM_CLASS_PTR(cuda::GpuMat::Allocator)
 
 #endif
diff --git a/modules/core/misc/python/pyopencv_umat.hpp b/modules/core/misc/python/pyopencv_umat.hpp
index 697adaf2027e..2e91cd5c6593 100644
--- a/modules/core/misc/python/pyopencv_umat.hpp
+++ b/modules/core/misc/python/pyopencv_umat.hpp
@@ -4,8 +4,8 @@
 
 typedef std::vector<Range> vector_Range;
 
-CV_PY_TO_CLASS(UMat);
-CV_PY_FROM_CLASS(UMat);
+CV_PY_TO_CLASS(UMat)
+CV_PY_FROM_CLASS(UMat)
 
 static bool cv_mappable_to(const Ptr<Mat>& src, Ptr<UMat>& dst)
 {
@@ -28,7 +28,7 @@ static void* cv_UMat_context()
 static Mat cv_UMat_get(const UMat* _self)
 {
     Mat m;
-    m.allocator = &g_numpyAllocator;
+    m.allocator = &GetNumpyAllocator();
     _self->copyTo(m);
     return m;
 }
diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp
index 8d1e7a6288bb..04d343a13635 100644
--- a/modules/core/perf/opencl/perf_arithm.cpp
+++ b/modules/core/perf/opencl/perf_arithm.cpp
@@ -374,6 +374,36 @@ OCL_PERF_TEST_P(FlipFixture, Flip,
     SANITY_CHECK(dst);
 }
 
+///////////// Rotate ////////////////////////
+
+enum
+{
+    ROTATE_90_CLOCKWISE = 0, ROTATE_180, ROTATE_90_COUNTERCLOCKWISE
+};
+
+CV_ENUM(RotateType, ROTATE_90_CLOCKWISE, ROTATE_180, ROTATE_90_COUNTERCLOCKWISE)
+
+typedef tuple<Size, MatType, RotateType> RotateParams;
+typedef TestBaseWithParam<RotateParams> RotateFixture;
+
+OCL_PERF_TEST_P(RotateFixture, rotate,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, RotateType::all()))
+{
+    const RotateParams params = GetParam();
+    const Size srcSize   = get<0>(params);
+    const int type       = get<1>(params);
+    const int rotateCode = get<2>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::rotate(src, dst, rotateCode);
+
+    SANITY_CHECK_NOTHING();
+}
+
 ///////////// minMaxLoc ////////////////////////
 
 typedef Size_MatType MinMaxLocFixture;
diff --git a/modules/core/perf/opencl/perf_matop.cpp b/modules/core/perf/opencl/perf_matop.cpp
index b763a98e2acd..2fd5b177a621 100644
--- a/modules/core/perf/opencl/perf_matop.cpp
+++ b/modules/core/perf/opencl/perf_matop.cpp
@@ -80,6 +80,187 @@ OCL_PERF_TEST_P(ConvertToFixture, ConvertTo,
     SANITY_CHECK(dst);
 }
 
+
+//#define RUN_CONVERTFP16
+static Size convertFP16_srcSize(4000, 4000);
+
+OCL_PERF_TEST(Core, ConvertFP32FP16MatMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_32F;
+    const int dtype = CV_16F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    Mat src(srcSize, type);
+    Mat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP32FP16MatUMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_32F;
+    const int dtype = CV_16F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    Mat src(srcSize, type);
+    UMat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP32FP16UMatMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_32F;
+    const int dtype = CV_16F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    UMat src(srcSize, type);
+    Mat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP32FP16UMatUMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_32F;
+    const int dtype = CV_16F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    UMat src(srcSize, type);
+    UMat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP16FP32MatMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_16F;
+    const int dtype = CV_32F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    Mat src(srcSize, type);
+    Mat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP16FP32MatUMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_16F;
+    const int dtype = CV_32F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    Mat src(srcSize, type);
+    UMat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP16FP32UMatMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_16F;
+    const int dtype = CV_32F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    UMat src(srcSize, type);
+    Mat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST(Core, ConvertFP16FP32UMatUMat)
+{
+    const Size srcSize = convertFP16_srcSize;
+    const int type = CV_16F;
+    const int dtype = CV_32F;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    UMat src(srcSize, type);
+    UMat dst(srcSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+#ifdef RUN_CONVERTFP16
+    OCL_TEST_CYCLE() convertFp16(src, dst);
+#else
+    OCL_TEST_CYCLE() src.convertTo(dst, dtype);
+#endif
+
+    SANITY_CHECK_NOTHING();
+}
+
+
 ///////////// CopyTo ////////////////////////
 
 typedef Size_MatType CopyToFixture;
diff --git a/modules/core/perf/perf_allocation.cpp b/modules/core/perf/perf_allocation.cpp
index 2f3bf3eaa76f..237a8dc1a34b 100755
--- a/modules/core/perf/perf_allocation.cpp
+++ b/modules/core/perf/perf_allocation.cpp
@@ -45,4 +45,4 @@ PERF_TEST_P(MatDepth_tb, DISABLED_Allocation_Aligned,
     SANITY_CHECK_NOTHING();
 }
 
-};
+}
diff --git a/modules/core/perf/perf_arithm.cpp b/modules/core/perf/perf_arithm.cpp
index 3ac9a2463979..36f400e34f53 100644
--- a/modules/core/perf/perf_arithm.cpp
+++ b/modules/core/perf/perf_arithm.cpp
@@ -1,12 +1,40 @@
 #include "perf_precomp.hpp"
 #include <numeric>
+#include "opencv2/core/softfloat.hpp"
 
 namespace opencv_test
 {
 using namespace perf;
 
+using BroadcastTest = perf::TestBaseWithParam<std::tuple<std::vector<int>, perf::MatType, std::vector<int>>>;
 typedef Size_MatType BinaryOpTest;
 
+PERF_TEST_P_(BroadcastTest, basic)
+{
+    std::vector<int> shape_src = get<0>(GetParam());
+    int dt_type = get<1>(GetParam());
+    std::vector<int> shape_dst = get<2>(GetParam());
+
+    cv::Mat src(static_cast<int>(shape_src.size()), shape_src.data(), dt_type);
+    cv::Mat dst(static_cast<int>(shape_dst.size()), shape_dst.data(), dt_type);
+
+    cv::randu(src, -1.f, 1.f);
+
+    TEST_CYCLE() cv::broadcast(src, shape_dst, dst);
+
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/*nothing*/ , BroadcastTest,
+    testing::Combine(
+        testing::Values(std::vector<int>{1, 100, 800},
+                        std::vector<int>{10, 1, 800},
+                        std::vector<int>{10, 100, 1}),
+        testing::Values(CV_32FC1),
+        testing::Values(std::vector<int>{10, 100, 800})
+    )
+);
+
 PERF_TEST_P_(BinaryOpTest, min)
 {
     Size sz = get<0>(GetParam());
@@ -424,4 +452,350 @@ INSTANTIATE_TEST_CASE_P(/*nothing*/ , BinaryOpTest,
     )
 );
 
+///////////// Mixed type arithmetics ////////
+
+typedef perf::TestBaseWithParam<std::tuple<cv::Size, std::tuple<perf::MatType, perf::MatType>>> ArithmMixedTest;
+
+PERF_TEST_P_(ArithmMixedTest, add)
+{
+    auto p = GetParam();
+    Size sz = get<0>(p);
+    int srcType = get<0>(get<1>(p));
+    int dstType = get<1>(get<1>(p));
+
+    cv::Mat a = Mat(sz, srcType);
+    cv::Mat b = Mat(sz, srcType);
+    cv::Mat c = Mat(sz, dstType);
+
+    declare.in(a, b, WARMUP_RNG).out(c);
+    declare.time(50);
+
+    if (CV_MAT_DEPTH(dstType) == CV_32S)
+    {
+        //see ticket 1529: add can be without saturation on 32S
+        a /= 2;
+        b /= 2;
+    }
+
+    TEST_CYCLE() cv::add(a, b, c, /* mask */ noArray(), dstType);
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P_(ArithmMixedTest, addScalarDouble)
+{
+    auto p = GetParam();
+    Size sz = get<0>(p);
+    int srcType = get<0>(get<1>(p));
+    int dstType = get<1>(get<1>(p));
+
+    cv::Mat a = Mat(sz, srcType);
+    cv::Scalar b;
+    cv::Mat c = Mat(sz, dstType);
+
+    declare.in(a, b, WARMUP_RNG).out(c);
+
+    if (CV_MAT_DEPTH(dstType) == CV_32S)
+    {
+        //see ticket 1529: add can be without saturation on 32S
+        a /= 2;
+        b /= 2;
+    }
+
+    TEST_CYCLE() cv::add(a, b, c, /* mask */ noArray(), dstType);
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P_(ArithmMixedTest, addScalarSameType)
+{
+    auto p = GetParam();
+    Size sz = get<0>(p);
+    int srcType = get<0>(get<1>(p));
+    int dstType = get<1>(get<1>(p));
+
+    cv::Mat a = Mat(sz, srcType);
+    cv::Scalar b;
+    cv::Mat c = Mat(sz, dstType);
+
+    declare.in(a, b, WARMUP_RNG).out(c);
+
+    if (CV_MAT_DEPTH(dstType) < CV_32S)
+    {
+        b = Scalar(1, 0, 3, 4); // don't pass non-integer values for 8U/8S/16U/16S processing
+    }
+    else if (CV_MAT_DEPTH(dstType) == CV_32S)
+    {
+        //see ticket 1529: add can be without saturation on 32S
+        a /= 2;
+        b = Scalar(1, 0, -3, 4); // don't pass non-integer values for 32S processing
+    }
+
+    TEST_CYCLE() cv::add(a, b, c, /* mask */ noArray(), dstType);
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P_(ArithmMixedTest, subtract)
+{
+    auto p = GetParam();
+    Size sz = get<0>(p);
+    int srcType = get<0>(get<1>(p));
+    int dstType = get<1>(get<1>(p));
+
+    cv::Mat a = Mat(sz, srcType);
+    cv::Mat b = Mat(sz, srcType);
+    cv::Mat c = Mat(sz, dstType);
+
+    declare.in(a, b, WARMUP_RNG).out(c);
+
+    if (CV_MAT_DEPTH(dstType) == CV_32S)
+    {
+        //see ticket 1529: subtract can be without saturation on 32S
+        a /= 2;
+        b /= 2;
+    }
+
+    TEST_CYCLE() cv::subtract(a, b, c, /* mask */ noArray(), dstType);
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P_(ArithmMixedTest, subtractScalarDouble)
+{
+    auto p = GetParam();
+    Size sz = get<0>(p);
+    int srcType = get<0>(get<1>(p));
+    int dstType = get<1>(get<1>(p));
+
+    cv::Mat a = Mat(sz, srcType);
+    cv::Scalar b;
+    cv::Mat c = Mat(sz, dstType);
+
+    declare.in(a, b, WARMUP_RNG).out(c);
+
+    if (CV_MAT_DEPTH(dstType) == CV_32S)
+    {
+        //see ticket 1529: subtract can be without saturation on 32S
+        a /= 2;
+        b /= 2;
+    }
+
+    TEST_CYCLE() cv::subtract(a, b, c, /* mask */ noArray(), dstType);
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P_(ArithmMixedTest, subtractScalarSameType)
+{
+    auto p = GetParam();
+    Size sz = get<0>(p);
+    int srcType = get<0>(get<1>(p));
+    int dstType = get<1>(get<1>(p));
+
+    cv::Mat a = Mat(sz, srcType);
+    cv::Scalar b;
+    cv::Mat c = Mat(sz, dstType);
+
+    declare.in(a, b, WARMUP_RNG).out(c);
+
+    if (CV_MAT_DEPTH(dstType) < CV_32S)
+    {
+        b = Scalar(1, 0, 3, 4); // don't pass non-integer values for 8U/8S/16U/16S processing
+    }
+    else if (CV_MAT_DEPTH(dstType) == CV_32S)
+    {
+        //see ticket 1529: subtract can be without saturation on 32S
+        a /= 2;
+        b = Scalar(1, 0, -3, 4); // don't pass non-integer values for 32S processing
+    }
+
+    TEST_CYCLE() cv::subtract(a, b, c, /* mask */ noArray(), dstType);
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P_(ArithmMixedTest, multiply)
+{
+    auto p = GetParam();
+    Size sz = get<0>(p);
+    int srcType = get<0>(get<1>(p));
+    int dstType = get<1>(get<1>(p));
+
+    cv::Mat a(sz, srcType), b(sz, srcType), c(sz, dstType);
+
+    declare.in(a, b, WARMUP_RNG).out(c);
+    if (CV_MAT_DEPTH(dstType) == CV_32S)
+    {
+        //According to docs, saturation is not applied when result is 32bit integer
+        a /= (2 << 16);
+        b /= (2 << 16);
+    }
+
+    TEST_CYCLE() cv::multiply(a, b, c, /* scale */ 1.0, dstType);
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P_(ArithmMixedTest, multiplyScale)
+{
+    auto p = GetParam();
+    Size sz = get<0>(p);
+    int srcType = get<0>(get<1>(p));
+    int dstType = get<1>(get<1>(p));
+
+    cv::Mat a(sz, srcType), b(sz, srcType), c(sz, dstType);
+    double scale = 0.5;
+
+    declare.in(a, b, WARMUP_RNG).out(c);
+
+    if (CV_MAT_DEPTH(dstType) == CV_32S)
+    {
+        //According to docs, saturation is not applied when result is 32bit integer
+        a /= (2 << 16);
+        b /= (2 << 16);
+    }
+
+    TEST_CYCLE() cv::multiply(a, b, c, scale, dstType);
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P_(ArithmMixedTest, divide)
+{
+    auto p = GetParam();
+    Size sz = get<0>(p);
+    int srcType = get<0>(get<1>(p));
+    int dstType = get<1>(get<1>(p));
+
+    cv::Mat a(sz, srcType), b(sz, srcType), c(sz, dstType);
+    double scale = 0.5;
+
+    declare.in(a, b, WARMUP_RNG).out(c);
+
+    TEST_CYCLE() cv::divide(a, b, c, scale, dstType);
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P_(ArithmMixedTest, reciprocal)
+{
+    auto p = GetParam();
+    Size sz = get<0>(p);
+    int srcType = get<0>(get<1>(p));
+    int dstType = get<1>(get<1>(p));
+
+    cv::Mat b(sz, srcType), c(sz, dstType);
+    double scale = 0.5;
+
+    declare.in(b, WARMUP_RNG).out(c);
+
+    TEST_CYCLE() cv::divide(scale, b, c, dstType);
+
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/*nothing*/ , ArithmMixedTest,
+    testing::Combine(
+        testing::Values(szVGA, sz720p, sz1080p),
+        testing::Values(std::tuple<perf::MatType, perf::MatType>{CV_8U, CV_16U},
+                        std::tuple<perf::MatType, perf::MatType>{CV_8S, CV_16S},
+                        std::tuple<perf::MatType, perf::MatType>{CV_8U, CV_32F},
+                        std::tuple<perf::MatType, perf::MatType>{CV_8S, CV_32F}
+            )
+    )
+);
+
+///////////// Rotate ////////////////////////
+
+typedef perf::TestBaseWithParam<std::tuple<cv::Size, int, perf::MatType>> RotateTest;
+
+PERF_TEST_P_(RotateTest, rotate)
+{
+    Size sz        = get<0>(GetParam());
+    int rotatecode = get<1>(GetParam());
+    int type       = get<2>(GetParam());
+    cv::Mat a(sz, type), b(sz, type);
+
+    declare.in(a, WARMUP_RNG).out(b);
+
+    TEST_CYCLE() cv::rotate(a, b, rotatecode);
+
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/*nothing*/ , RotateTest,
+    testing::Combine(
+        testing::Values(szVGA, sz720p, sz1080p),
+        testing::Values(ROTATE_180, ROTATE_90_CLOCKWISE, ROTATE_90_COUNTERCLOCKWISE),
+        testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_8SC1, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4, CV_32SC1, CV_32FC1)
+    )
+);
+
+
+///////////// PatchNaNs ////////////////////////
+
+template<typename _Tp>
+_Tp randomNan(RNG& rng);
+
+template<>
+float randomNan(RNG& rng)
+{
+    uint32_t r = rng.next();
+    Cv32suf v;
+    v.u = r;
+    // exp & set a bit to avoid zero mantissa
+    v.u = v.u | 0x7f800001;
+    return v.f;
+}
+
+template<>
+double randomNan(RNG& rng)
+{
+    uint32_t r0 = rng.next();
+    uint32_t r1 = rng.next();
+    Cv64suf v;
+    v.u = (uint64_t(r0) << 32) | uint64_t(r1);
+    // exp &set a bit to avoid zero mantissa
+    v.u = v.u | 0x7ff0000000000001;
+    return v.f;
+}
+
+typedef Size_MatType PatchNaNsFixture;
+
+PERF_TEST_P_(PatchNaNsFixture, PatchNaNs)
+{
+    const Size_MatType_t params = GetParam();
+    Size srcSize = get<0>(params);
+    const int type = get<1>(params), cn = CV_MAT_CN(type);
+
+    Mat src(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(src);
+
+    // generating NaNs
+    {
+        srcSize.width *= cn;
+        RNG& rng = theRNG();
+        for (int y = 0; y < srcSize.height; ++y)
+        {
+            float  *const ptrf = src.ptr<float>(y);
+            for (int x = 0; x < srcSize.width; ++x)
+            {
+                ptrf[x] = (x + y) % 2 == 0 ? randomNan<float >(rng) : ptrf[x];
+            }
+        }
+    }
+
+    TEST_CYCLE() cv::patchNaNs(src, 17.7);
+
+    SANITY_CHECK(src);
+}
+
+INSTANTIATE_TEST_CASE_P(/*nothing*/ , PatchNaNsFixture,
+    testing::Combine(
+        testing::Values(szVGA, sz720p, sz1080p, sz2160p),
+        testing::Values(CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4)
+    )
+);
+
 } // namespace
diff --git a/modules/core/perf/perf_math.cpp b/modules/core/perf/perf_math.cpp
index 16d262e4c927..fe947aec1ab2 100644
--- a/modules/core/perf/perf_math.cpp
+++ b/modules/core/perf/perf_math.cpp
@@ -36,6 +36,234 @@ PERF_TEST_P(VectorLength, phase64f, testing::Values(128, 1000, 128*1024, 512*102
     SANITY_CHECK(angle, 5e-5);
 }
 
+// generates random vectors, performs Gram-Schmidt orthogonalization on them
+Mat randomOrtho(int rows, int ftype, RNG& rng)
+{
+    Mat result(rows, rows, ftype);
+    rng.fill(result, RNG::UNIFORM, cv::Scalar(-1), cv::Scalar(1));
+
+    for (int i = 0; i < rows; i++)
+    {
+        Mat v = result.row(i);
+
+        for (int j = 0; j < i; j++)
+        {
+            Mat p = result.row(j);
+            v -= p.dot(v) * p;
+        }
+
+        v = v * (1. / cv::norm(v));
+    }
+
+    return result;
+}
+
+template<typename FType>
+Mat buildRandomMat(int rows, int cols, RNG& rng, int rank, bool symmetrical)
+{
+    int mtype = cv::traits::Depth<FType>::value;
+    Mat u = randomOrtho(rows, mtype, rng);
+    Mat v = randomOrtho(cols, mtype, rng);
+    Mat s(rows, cols, mtype, Scalar(0));
+
+    std::vector<FType> singVals(rank);
+    rng.fill(singVals, RNG::UNIFORM, Scalar(0), Scalar(10));
+    std::sort(singVals.begin(), singVals.end());
+    auto singIter = singVals.rbegin();
+    for (int i = 0; i < rank; i++)
+    {
+        s.at<FType>(i, i) = *singIter++;
+    }
+
+    if (symmetrical)
+        return u * s * u.t();
+    else
+        return u * s * v.t();
+}
+
+Mat buildRandomMat(int rows, int cols, int mtype, RNG& rng, int rank, bool symmetrical)
+{
+    if (mtype == CV_32F)
+    {
+        return buildRandomMat<float>(rows, cols, rng, rank, symmetrical);
+    }
+    else if (mtype == CV_64F)
+    {
+        return buildRandomMat<double>(rows, cols, rng, rank, symmetrical);
+    }
+    else
+    {
+        CV_Error(cv::Error::StsBadArg, "This type is not supported");
+    }
+}
+
+CV_ENUM(SolveDecompEnum, DECOMP_LU, DECOMP_SVD, DECOMP_EIG, DECOMP_CHOLESKY, DECOMP_QR)
+
+enum RankMatrixOptions
+{
+    RANK_HALF, RANK_MINUS_1, RANK_FULL
+};
+
+CV_ENUM(RankEnum, RANK_HALF, RANK_MINUS_1, RANK_FULL)
+
+enum SolutionsOptions
+{
+    NO_SOLUTIONS, ONE_SOLUTION, MANY_SOLUTIONS
+};
+
+CV_ENUM(SolutionsEnum, NO_SOLUTIONS, ONE_SOLUTION, MANY_SOLUTIONS)
+
+typedef perf::TestBaseWithParam<std::tuple<int, RankEnum, MatDepth, SolveDecompEnum, bool, SolutionsEnum>> SolveTest;
+
+PERF_TEST_P(SolveTest, randomMat, ::testing::Combine(
+    ::testing::Values(31, 64, 100),
+    ::testing::Values(RANK_HALF, RANK_MINUS_1, RANK_FULL),
+    ::testing::Values(CV_32F, CV_64F),
+    ::testing::Values(DECOMP_LU, DECOMP_SVD, DECOMP_EIG, DECOMP_CHOLESKY, DECOMP_QR),
+    ::testing::Bool(), // normal
+    ::testing::Values(NO_SOLUTIONS, ONE_SOLUTION, MANY_SOLUTIONS)
+    ))
+{
+    auto t = GetParam();
+    int size       = std::get<0>(t);
+    auto rankEnum  = std::get<1>(t);
+    int mtype      = std::get<2>(t);
+    int method     = std::get<3>(t);
+    bool normal    = std::get<4>(t);
+    auto solutions = std::get<5>(t);
+
+    bool symmetrical = (method == DECOMP_CHOLESKY || method == DECOMP_LU);
+
+    if (normal)
+    {
+        method |= DECOMP_NORMAL;
+    }
+
+    int rank = size;
+    switch (rankEnum)
+    {
+        case RANK_HALF:    rank /= 2; break;
+        case RANK_MINUS_1: rank -= 1; break;
+        default: break;
+    }
+
+    RNG& rng = theRNG();
+    Mat A = buildRandomMat(size, size, mtype, rng, rank, symmetrical);
+    Mat x(size, 1, mtype);
+    Mat b(size, 1, mtype);
+
+    switch (solutions)
+    {
+        // no solutions, let's make b random
+        case NO_SOLUTIONS:
+        {
+            rng.fill(b, RNG::UNIFORM, Scalar(-1), Scalar(1));
+        }
+        break;
+        // exactly 1 solution, let's combine b from A and x
+        case ONE_SOLUTION:
+        {
+            rng.fill(x, RNG::UNIFORM, Scalar(-10), Scalar(10));
+            b = A * x;
+        }
+        break;
+        // infinitely many solutions, let's make b zero
+        default:
+        {
+            b = 0;
+        }
+        break;
+    }
+
+    TEST_CYCLE() cv::solve(A, b, x, method);
+
+    SANITY_CHECK_NOTHING();
+}
+
+typedef perf::TestBaseWithParam<std::tuple<std::tuple<int, int>, RankEnum, MatDepth, bool, bool>> SvdTest;
+
+PERF_TEST_P(SvdTest, decompose, ::testing::Combine(
+    ::testing::Values(std::make_tuple(5, 15), std::make_tuple(32, 32), std::make_tuple(100, 100)),
+    ::testing::Values(RANK_HALF, RANK_MINUS_1, RANK_FULL),
+    ::testing::Values(CV_32F, CV_64F),
+    ::testing::Bool(), // symmetrical
+    ::testing::Bool() // needUV
+    ))
+{
+    auto t = GetParam();
+    auto rc          = std::get<0>(t);
+    auto rankEnum    = std::get<1>(t);
+    int mtype        = std::get<2>(t);
+    bool symmetrical = std::get<3>(t);
+    bool needUV      = std::get<4>(t);
+
+    int rows = std::get<0>(rc);
+    int cols = std::get<1>(rc);
+
+    if (symmetrical)
+    {
+        rows = max(rows, cols);
+        cols = rows;
+    }
+
+    int rank = std::min(rows, cols);
+    switch (rankEnum)
+    {
+    case RANK_HALF:    rank /= 2; break;
+    case RANK_MINUS_1: rank -= 1; break;
+    default: break;
+    }
+
+    int flags = needUV ? 0 : SVD::NO_UV;
+
+    RNG& rng = theRNG();
+    Mat A = buildRandomMat(rows, cols, mtype, rng, rank, symmetrical);
+    TEST_CYCLE() cv::SVD svd(A, flags);
+
+    SANITY_CHECK_NOTHING();
+}
+
+
+PERF_TEST_P(SvdTest, backSubst, ::testing::Combine(
+    ::testing::Values(std::make_tuple(5, 15), std::make_tuple(32, 32), std::make_tuple(100, 100)),
+    ::testing::Values(RANK_HALF, RANK_MINUS_1, RANK_FULL),
+    ::testing::Values(CV_32F, CV_64F),
+    // back substitution works the same regardless of source matrix properties
+    ::testing::Values(true),
+    // back substitution has no sense without u and v
+    ::testing::Values(true) // needUV
+    ))
+{
+    auto t = GetParam();
+    auto rc       = std::get<0>(t);
+    auto rankEnum = std::get<1>(t);
+    int mtype     = std::get<2>(t);
+
+    int rows = std::get<0>(rc);
+    int cols = std::get<1>(rc);
+
+    int rank = std::min(rows, cols);
+    switch (rankEnum)
+    {
+    case RANK_HALF:    rank /= 2; break;
+    case RANK_MINUS_1: rank -= 1; break;
+    default: break;
+    }
+
+    RNG& rng = theRNG();
+    Mat A = buildRandomMat(rows, cols, mtype, rng, rank, /* symmetrical */ false);
+    cv::SVD svd(A);
+    // preallocate to not spend time on it during backSubst()
+    Mat dst(cols, 1, mtype);
+    Mat rhs(rows, 1, mtype);
+    rng.fill(rhs, RNG::UNIFORM, Scalar(-10), Scalar(10));
+
+    TEST_CYCLE() svd.backSubst(rhs, dst);
+
+    SANITY_CHECK_NOTHING();
+}
+
+
 typedef perf::TestBaseWithParam< testing::tuple<int, int, int> > KMeans;
 
 PERF_TEST_P_(KMeans, single_iter)
diff --git a/modules/core/src/alloc.cpp b/modules/core/src/alloc.cpp
index a0def9db2e2f..f6abeeb09837 100644
--- a/modules/core/src/alloc.cpp
+++ b/modules/core/src/alloc.cpp
@@ -53,7 +53,6 @@
 #undef CV__ALLOCATOR_STATS_LOG
 
 //#define OPENCV_ALLOC_ENABLE_STATISTICS
-#define OPENCV_ALLOC_STATISTICS_LIMIT 4096  // don't track buffers less than N bytes
 
 
 #ifdef HAVE_POSIX_MEMALIGN
@@ -63,6 +62,7 @@
 #endif
 
 #ifdef OPENCV_ALLOC_ENABLE_STATISTICS
+#define OPENCV_ALLOC_STATISTICS_LIMIT 4096  // don't track buffers less than N bytes
 #include <map>
 #endif
 
@@ -70,7 +70,7 @@ namespace cv {
 
 static void* OutOfMemoryError(size_t size)
 {
-    CV_Error_(CV_StsNoMem, ("Failed to allocate %llu bytes", (unsigned long long)size));
+    CV_Error_(cv::Error::StsNoMem, ("Failed to allocate %llu bytes", (unsigned long long)size));
 }
 
 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getAllocatorStatistics();
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 5709ec12e410..08e1f613ef7a 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -209,7 +209,7 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
             swap(sz1, sz2);
         }
         else if( !checkScalar(*psrc2, type1, kind2, kind1) )
-            CV_Error( CV_StsUnmatchedSizes,
+            CV_Error( cv::Error::StsUnmatchedSizes,
                       "The operation is neither 'array op array' (where arrays have the same size and type), "
                       "nor 'array op scalar', nor 'scalar op array'" );
         haveScalar = true;
@@ -329,7 +329,7 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
 
 static BinaryFuncC* getMaxTab()
 {
-    static BinaryFuncC maxTab[] =
+    static BinaryFuncC maxTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::max8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::max8s),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::max16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::max16s),
@@ -343,7 +343,7 @@ static BinaryFuncC* getMaxTab()
 
 static BinaryFuncC* getMinTab()
 {
-    static BinaryFuncC minTab[] =
+    static BinaryFuncC minTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::min8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::min8s),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::min16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::min16s),
@@ -585,9 +585,14 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
 #endif
 
+typedef int (*ExtendedTypeFunc)(const uchar* src1, size_t step1,
+                                const uchar* src2, size_t step2,
+                                uchar* dst, size_t step, int width, int height,
+                                void*);
+
 static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                       InputArray _mask, int dtype, BinaryFuncC* tab, bool muldiv=false,
-                      void* usrdata=0, int oclop=-1 )
+                      void* usrdata=0, int oclop=-1, ExtendedTypeFunc extendedFunc = nullptr )
 {
     const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
     _InputArray::KindFlag kind1 = psrc1->kind(), kind2 = psrc2->kind();
@@ -617,7 +622,13 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
         Size sz = getContinuousSize2D(src1, src2, dst, src1.channels());
-        tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata);
+        if (!extendedFunc || extendedFunc(src1.ptr(), src1.step, src2.ptr(), src2.step,
+                                          dst.ptr(), dst.step, sz.width, sz.height, usrdata) != 0)
+        {
+            BinaryFuncC func = tab[depth1];
+            CV_Assert(func);
+            func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata);
+        }
         return;
     }
 
@@ -644,7 +655,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                 oclop = OCL_OP_RDIV_SCALE;
         }
         else if( !checkScalar(*psrc2, type1, kind2, kind1) )
-            CV_Error( CV_StsUnmatchedSizes,
+            CV_Error( cv::Error::StsUnmatchedSizes,
                      "The operation is neither 'array op array' "
                      "(where arrays have the same size and the same number of channels), "
                      "nor 'array op scalar', nor 'scalar op array'" );
@@ -669,7 +680,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         else
         {
             if( !haveScalar && type1 != type2 )
-                CV_Error(CV_StsBadArg,
+                CV_Error(cv::Error::StsBadArg,
                      "When the input arrays in add/subtract/multiply/divide functions have different types, "
                      "the output array type must be explicitly specified");
             dtype = type1;
@@ -748,14 +759,22 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         _buf.allocate(bufesz*blocksize + 64);
         buf = _buf.data();
         if( cvtsrc1 )
+        {
             buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
+        }
         if( cvtsrc2 )
+        {
             buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
+        }
         wbuf = maskbuf = buf;
         if( cvtdst )
+        {
             buf = alignPtr(buf + blocksize*wsz, 16);
+        }
         if( haveMask )
+        {
             maskbuf = buf;
+        }
 
         for( size_t i = 0; i < it.nplanes; i++, ++it )
         {
@@ -765,38 +784,44 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                 Size bszn(bsz*cn, 1);
                 const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1];
                 uchar* dptr = ptrs[2];
-                if( cvtsrc1 )
+                // try to perform operation with conversion in one call
+                // if fail, use converter functions
+                uchar* opconverted = haveMask ? maskbuf : dptr;
+                if (!extendedFunc || extendedFunc(sptr1, 1, sptr2, 1, opconverted, (!haveMask),
+                                                  bszn.width, bszn.height, usrdata) != 0)
                 {
-                    cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
-                    sptr1 = buf1;
-                }
-                if( ptrs[0] == ptrs[1] )
-                    sptr2 = sptr1;
-                else if( cvtsrc2 )
-                {
-                    cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
-                    sptr2 = buf2;
-                }
-
-                if( !haveMask && !cvtdst )
-                    func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata );
-                else
-                {
-                    func( sptr1, 1, sptr2, 1, wbuf, 0, bszn.width, bszn.height, usrdata );
-                    if( !haveMask )
-                        cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
-                    else if( !cvtdst )
+                    if( cvtsrc1 )
                     {
-                        copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[3] += bsz;
+                        cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
+                        sptr1 = buf1;
                     }
-                    else
+                    if( ptrs[0] == ptrs[1] )
+                    {
+                        sptr2 = sptr1;
+                    }
+                    else if( cvtsrc2 )
+                    {
+                        cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
+                        sptr2 = buf2;
+                    }
+
+                    uchar* fdst = (haveMask || cvtdst) ? wbuf : dptr;
+                    func(sptr1, 1, sptr2, 1, fdst, (!haveMask && !cvtdst), bszn.width, bszn.height, usrdata);
+
+                    if (cvtdst)
                     {
-                        cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
-                        copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[3] += bsz;
+                        uchar* cdst = haveMask ? maskbuf : dptr;
+                        cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0);
                     }
+                    opconverted = cvtdst ? maskbuf : wbuf;
                 }
+
+                if (haveMask)
+                {
+                    copymask(opconverted, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz);
+                    ptrs[3] += bsz;
+                }
+
                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz;
             }
         }
@@ -812,13 +837,19 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         _buf.allocate(bufesz*blocksize + 64);
         buf = _buf.data();
         if( cvtsrc1 )
-            buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
+        {
+            buf1 = buf, buf = alignPtr(buf + blocksize * wsz, 16);
+        }
         buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16);
         wbuf = maskbuf = buf;
         if( cvtdst )
-            buf = alignPtr(buf + blocksize*wsz, 16);
+        {
+            buf = alignPtr(buf + blocksize * wsz, 16);
+        }
         if( haveMask )
+        {
             maskbuf = buf;
+        }
 
         convertAndUnrollScalar( src2, wtype, buf2, blocksize);
 
@@ -832,34 +863,43 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                 const uchar* sptr2 = buf2;
                 uchar* dptr = ptrs[1];
 
-                if( cvtsrc1 )
-                {
-                    cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
-                    sptr1 = buf1;
-                }
-
+                const uchar* extSptr1 = sptr1;
+                const uchar* extSptr2 = sptr2;
                 if( swapped12 )
-                    std::swap(sptr1, sptr2);
+                    std::swap(extSptr1, extSptr1);
 
-                if( !haveMask && !cvtdst )
-                    func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata );
-                else
+                // try to perform operation with conversion in one call
+                // if fail, use converter functions
+                uchar* opconverted = haveMask ? maskbuf : dptr;
+                if (!extendedFunc || extendedFunc(extSptr1, 1, extSptr2, 1, opconverted, 1,
+                                                  bszn.width, bszn.height, usrdata) != 0)
                 {
-                    func( sptr1, 1, sptr2, 1, wbuf, 1, bszn.width, bszn.height, usrdata );
-                    if( !haveMask )
-                        cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
-                    else if( !cvtdst )
+                    if( cvtsrc1 )
                     {
-                        copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[2] += bsz;
+                        cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
+                        sptr1 = buf1;
                     }
-                    else
+
+                    if( swapped12 )
+                        std::swap(sptr1, sptr2);
+
+                    uchar* fdst = ( haveMask || cvtdst ) ? wbuf : dptr;
+                    func( sptr1, 1, sptr2, 1, fdst, 1, bszn.width, bszn.height, usrdata );
+
+                    if (cvtdst)
                     {
-                        cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
-                        copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[2] += bsz;
+                        uchar* cdst = haveMask ? maskbuf : dptr;
+                        cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0);
                     }
+                    opconverted = cvtdst ? maskbuf : wbuf;
+                }
+
+                if (haveMask)
+                {
+                    copymask(opconverted, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz);
+                    ptrs[2] += bsz;
                 }
+
                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz;
             }
         }
@@ -868,7 +908,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
 static BinaryFuncC* getAddTab()
 {
-    static BinaryFuncC addTab[] =
+    static BinaryFuncC addTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::add8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::add8s),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::add16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::add16s),
@@ -880,9 +920,35 @@ static BinaryFuncC* getAddTab()
     return addTab;
 }
 
+static int sub8u32fWrapper(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
+                           uchar* dst, size_t step, int width, int height, void* )
+{
+    int res = cv_hal_sub8u32f(src1, step1, src2, step2, (float *)dst, step, width, height);
+    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+        return res;
+    else
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation sub8u32f ==> " CVAUX_STR(cv_hal_sub8u32f)
+                                           " returned %d (0x%08x)", res, res));
+    }
+}
+
+static int sub8s32fWrapper(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
+                           uchar* dst, size_t step, int width, int height, void* )
+{
+    int res = cv_hal_sub8s32f((schar*)src1, step1, (schar*)src2, step2, (float *)dst, step, width, height);
+    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+        return res;
+    else
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation sub8s32f ==> " CVAUX_STR(cv_hal_sub8s32f)
+                                           " returned %d (0x%08x)", res, res));
+    }
+}
+
 static BinaryFuncC* getSubTab()
 {
-    static BinaryFuncC subTab[] =
+    static BinaryFuncC subTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub8s),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub16s),
@@ -894,9 +960,25 @@ static BinaryFuncC* getSubTab()
     return subTab;
 }
 
+static ExtendedTypeFunc getSubExtFunc(int src1Type, int src2Type, int dstType)
+{
+    if (src1Type == CV_8U && src2Type == CV_8U && dstType == CV_32F)
+    {
+        return sub8u32fWrapper;
+    }
+    else if (src1Type == CV_8S && src2Type == CV_8S && dstType == CV_32F)
+    {
+        return sub8s32fWrapper;
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
 static BinaryFuncC* getAbsDiffTab()
 {
-    static BinaryFuncC absDiffTab[] =
+    static BinaryFuncC absDiffTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff8s),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff16s),
@@ -919,11 +1001,13 @@ void cv::add( InputArray src1, InputArray src2, OutputArray dst,
 }
 
 void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst,
-               InputArray mask, int dtype )
+                   InputArray mask, int dtype )
 {
     CV_INSTRUMENT_REGION();
 
-    arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB );
+    ExtendedTypeFunc subExtFunc = getSubExtFunc(_src1.depth(), _src2.depth(), dtype < 0 ? _dst.depth() : dtype);
+    arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB,
+              /* extendedFunc */ subExtFunc);
 }
 
 void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
@@ -947,9 +1031,41 @@ void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask)
 namespace cv
 {
 
+static int mul8u16uWrapper(const uchar* src1, size_t step1,
+                           const uchar* src2, size_t step2,
+                           uchar* dst, size_t step, int width, int height,
+                           void* usrdata)
+{
+    double scale = *((double*)usrdata);
+    int res = cv_hal_mul8u16u(src1, step1, src2, step2, (ushort *)dst, step, width, height, scale);
+    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+        return res;
+    else
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8u16u ==> " CVAUX_STR(cv_hal_mul8u16u)
+                                           " returned %d (0x%08x)", res, res));
+    }
+}
+
+static int mul8s16sWrapper(const uchar* src1, size_t step1,
+                           const uchar* src2, size_t step2,
+                           uchar* dst, size_t step, int width, int height,
+                           void* usrdata)
+{
+    double scale = *((double*)usrdata);
+    int res = cv_hal_mul8s16s((schar *)src1, step1, (schar *)src2, step2, (short *)dst, step, width, height, scale);
+    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+        return res;
+    else
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8s16s ==> " CVAUX_STR(cv_hal_mul8s16s)
+                                           " returned %d (0x%08x)", res, res));
+    }
+}
+
 static BinaryFuncC* getMulTab()
 {
-    static BinaryFuncC mulTab[] =
+    static BinaryFuncC mulTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)cv::hal::mul8u, (BinaryFuncC)cv::hal::mul8s, (BinaryFuncC)cv::hal::mul16u,
         (BinaryFuncC)cv::hal::mul16s, (BinaryFuncC)cv::hal::mul32s, (BinaryFuncC)cv::hal::mul32f,
@@ -959,9 +1075,25 @@ static BinaryFuncC* getMulTab()
     return mulTab;
 }
 
+static ExtendedTypeFunc getMulExtFunc(int src1Type, int src2Type, int dstType)
+{
+    if (src1Type == CV_8U && src2Type == CV_8U && dstType == CV_16U)
+    {
+        return mul8u16uWrapper;
+    }
+    else if (src1Type == CV_8S && src2Type == CV_8S && dstType == CV_16S)
+    {
+        return mul8s16sWrapper;
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
 static BinaryFuncC* getDivTab()
 {
-    static BinaryFuncC divTab[] =
+    static BinaryFuncC divTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)cv::hal::div8u, (BinaryFuncC)cv::hal::div8s, (BinaryFuncC)cv::hal::div16u,
         (BinaryFuncC)cv::hal::div16s, (BinaryFuncC)cv::hal::div32s, (BinaryFuncC)cv::hal::div32f,
@@ -973,7 +1105,7 @@ static BinaryFuncC* getDivTab()
 
 static BinaryFuncC* getRecipTab()
 {
-    static BinaryFuncC recipTab[] =
+    static BinaryFuncC recipTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)cv::hal::recip8u, (BinaryFuncC)cv::hal::recip8s, (BinaryFuncC)cv::hal::recip16u,
         (BinaryFuncC)cv::hal::recip16s, (BinaryFuncC)cv::hal::recip32s, (BinaryFuncC)cv::hal::recip32f,
@@ -984,12 +1116,14 @@ static BinaryFuncC* getRecipTab()
 }
 
 void multiply(InputArray src1, InputArray src2,
-                  OutputArray dst, double scale, int dtype)
+              OutputArray dst, double scale, int dtype)
 {
     CV_INSTRUMENT_REGION();
 
+    ExtendedTypeFunc mulExtFunc = getMulExtFunc(src1.depth(), src2.depth(), dtype < 0 ? dst.depth() : dtype);
     arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
-              true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
+              /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE,
+              /* extendedFunc */ mulExtFunc );
 }
 
 void divide(InputArray src1, InputArray src2,
@@ -1021,7 +1155,7 @@ UMat UMat::mul(InputArray m, double scale) const
 
 static BinaryFuncC* getAddWeightedTab()
 {
-    static BinaryFuncC addWeightedTab[] =
+    static BinaryFuncC addWeightedTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted8s), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted16u),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted16s), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted32s), (BinaryFuncC)cv::hal::addWeighted32f,
@@ -1052,7 +1186,7 @@ namespace cv
 
 static BinaryFuncC getCmpFunc(int depth)
 {
-    static BinaryFuncC cmpTab[] =
+    static BinaryFuncC cmpTab[CV_DEPTH_MAX] =
     {
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp8s),
         (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp16s),
@@ -1206,7 +1340,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
             return;
         }
         else if(is_src1_scalar == is_src2_scalar)
-            CV_Error( CV_StsUnmatchedSizes,
+            CV_Error( cv::Error::StsUnmatchedSizes,
                      "The operation is neither 'array op array' (where arrays have the same size and the same type), "
                      "nor 'array op scalar', nor 'scalar op array'" );
         haveScalar = true;
@@ -1332,7 +1466,7 @@ struct InRange_SIMD
     }
 };
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 template <>
 struct InRange_SIMD<uchar>
@@ -1341,7 +1475,7 @@ struct InRange_SIMD<uchar>
         uchar * dst, int len) const
     {
         int x = 0;
-        const int width = v_uint8::nlanes;
+        const int width = VTraits<v_uint8>::vlanes();
 
         for (; x <= len - width; x += width)
         {
@@ -1349,7 +1483,7 @@ struct InRange_SIMD<uchar>
             v_uint8 low = vx_load(src2 + x);
             v_uint8 high = vx_load(src3 + x);
 
-            v_store(dst + x, (values >= low) & (high >= values));
+            v_store(dst + x, v_and(v_ge(values, low), v_ge(high, values)));
         }
         vx_cleanup();
         return x;
@@ -1363,7 +1497,7 @@ struct InRange_SIMD<schar>
         uchar * dst, int len) const
     {
         int x = 0;
-        const int width = v_int8::nlanes;
+        const int width = VTraits<v_int8>::vlanes();
 
         for (; x <= len - width; x += width)
         {
@@ -1371,7 +1505,7 @@ struct InRange_SIMD<schar>
             v_int8 low = vx_load(src2 + x);
             v_int8 high = vx_load(src3 + x);
 
-            v_store((schar*)(dst + x), (values >= low) & (high >= values));
+            v_store((schar*)(dst + x), v_and(v_ge(values, low), v_ge(high, values)));
         }
         vx_cleanup();
         return x;
@@ -1385,7 +1519,7 @@ struct InRange_SIMD<ushort>
         uchar * dst, int len) const
     {
         int x = 0;
-        const int width = v_uint16::nlanes * 2;
+        const int width = VTraits<v_uint16>::vlanes() * 2;
 
         for (; x <= len - width; x += width)
         {
@@ -1393,11 +1527,11 @@ struct InRange_SIMD<ushort>
             v_uint16 low1 = vx_load(src2 + x);
             v_uint16 high1 = vx_load(src3 + x);
 
-            v_uint16 values2 = vx_load(src1 + x + v_uint16::nlanes);
-            v_uint16 low2 = vx_load(src2 + x + v_uint16::nlanes);
-            v_uint16 high2 = vx_load(src3 + x + v_uint16::nlanes);
+            v_uint16 values2 = vx_load(src1 + x + VTraits<v_uint16>::vlanes());
+            v_uint16 low2 = vx_load(src2 + x + VTraits<v_uint16>::vlanes());
+            v_uint16 high2 = vx_load(src3 + x + VTraits<v_uint16>::vlanes());
 
-            v_store(dst + x, v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
+            v_store(dst + x, v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2))));
         }
         vx_cleanup();
         return x;
@@ -1411,7 +1545,7 @@ struct InRange_SIMD<short>
         uchar * dst, int len) const
     {
         int x = 0;
-        const int width = (int)v_int16::nlanes * 2;
+        const int width = (int)VTraits<v_int16>::vlanes() * 2;
 
         for (; x <= len - width; x += width)
         {
@@ -1419,11 +1553,11 @@ struct InRange_SIMD<short>
             v_int16 low1 = vx_load(src2 + x);
             v_int16 high1 = vx_load(src3 + x);
 
-            v_int16 values2 = vx_load(src1 + x + v_int16::nlanes);
-            v_int16 low2 = vx_load(src2 + x + v_int16::nlanes);
-            v_int16 high2 = vx_load(src3 + x + v_int16::nlanes);
+            v_int16 values2 = vx_load(src1 + x + VTraits<v_int16>::vlanes());
+            v_int16 low2 = vx_load(src2 + x + VTraits<v_int16>::vlanes());
+            v_int16 high2 = vx_load(src3 + x + VTraits<v_int16>::vlanes());
 
-            v_store((schar*)(dst + x), v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
+            v_store((schar*)(dst + x), v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2))));
         }
         vx_cleanup();
         return x;
@@ -1437,7 +1571,7 @@ struct InRange_SIMD<int>
         uchar * dst, int len) const
     {
         int x = 0;
-        const int width = (int)v_int32::nlanes * 2;
+        const int width = (int)VTraits<v_int32>::vlanes() * 2;
 
         for (; x <= len - width; x += width)
         {
@@ -1445,11 +1579,11 @@ struct InRange_SIMD<int>
             v_int32 low1 = vx_load(src2 + x);
             v_int32 high1 = vx_load(src3 + x);
 
-            v_int32 values2 = vx_load(src1 + x + v_int32::nlanes);
-            v_int32 low2 = vx_load(src2 + x + v_int32::nlanes);
-            v_int32 high2 = vx_load(src3 + x + v_int32::nlanes);
+            v_int32 values2 = vx_load(src1 + x + VTraits<v_int32>::vlanes());
+            v_int32 low2 = vx_load(src2 + x + VTraits<v_int32>::vlanes());
+            v_int32 high2 = vx_load(src3 + x + VTraits<v_int32>::vlanes());
 
-            v_pack_store(dst + x, v_reinterpret_as_u16(v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2))));
+            v_pack_store(dst + x, v_reinterpret_as_u16(v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2)))));
         }
         vx_cleanup();
         return x;
@@ -1463,7 +1597,7 @@ struct InRange_SIMD<float>
         uchar * dst, int len) const
     {
         int x = 0;
-        const int width = (int)v_float32::nlanes * 2;
+        const int width = (int)VTraits<v_float32>::vlanes() * 2;
 
         for (; x <= len - width; x += width)
         {
@@ -1471,12 +1605,12 @@ struct InRange_SIMD<float>
             v_float32 low1 = vx_load(src2 + x);
             v_float32 high1 = vx_load(src3 + x);
 
-            v_float32 values2 = vx_load(src1 + x + v_float32::nlanes);
-            v_float32 low2 = vx_load(src2 + x + v_float32::nlanes);
-            v_float32 high2 = vx_load(src3 + x + v_float32::nlanes);
+            v_float32 values2 = vx_load(src1 + x + VTraits<v_float32>::vlanes());
+            v_float32 low2 = vx_load(src2 + x + VTraits<v_float32>::vlanes());
+            v_float32 high2 = vx_load(src3 + x + VTraits<v_float32>::vlanes());
 
-            v_pack_store(dst + x, v_pack(v_reinterpret_as_u32(values1 >= low1) & v_reinterpret_as_u32(high1 >= values1),
-                                         v_reinterpret_as_u32(values2 >= low2) & v_reinterpret_as_u32(high2 >= values2)));
+            v_pack_store(dst + x, v_pack(v_and(v_reinterpret_as_u32(v_ge(values1, low1)), v_reinterpret_as_u32(v_ge(high1, values1))),
+                                         v_and(v_reinterpret_as_u32(v_ge(values2, low2)), v_reinterpret_as_u32(v_ge(high2, values2)))));
         }
         vx_cleanup();
         return x;
@@ -1588,7 +1722,7 @@ typedef void (*InRangeFunc)( const uchar* src1, size_t step1, const uchar* src2,
 
 static InRangeFunc getInRangeFunc(int depth)
 {
-    static InRangeFunc inRangeTab[] =
+    static InRangeFunc inRangeTab[CV_DEPTH_MAX] =
     {
         (InRangeFunc)GET_OPTIMIZED(inRange8u), (InRangeFunc)GET_OPTIMIZED(inRange8s), (InRangeFunc)GET_OPTIMIZED(inRange16u),
         (InRangeFunc)GET_OPTIMIZED(inRange16s), (InRangeFunc)GET_OPTIMIZED(inRange32s), (InRangeFunc)GET_OPTIMIZED(inRange32f),
@@ -1615,7 +1749,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
         ssize != lsize || stype != ltype )
     {
         if( !checkScalar(_lowerb, stype, lkind, skind) )
-            CV_Error( CV_StsUnmatchedSizes,
+            CV_Error( cv::Error::StsUnmatchedSizes,
                      "The lower boundary is neither an array of the same size and same type as src, nor a scalar");
         lbScalar = true;
     }
@@ -1624,7 +1758,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
         ssize != usize || stype != utype )
     {
         if( !checkScalar(_upperb, stype, ukind, skind) )
-            CV_Error( CV_StsUnmatchedSizes,
+            CV_Error( cv::Error::StsUnmatchedSizes,
                      "The upper boundary is neither an array of the same size and same type as src, nor a scalar");
         ubScalar = true;
     }
@@ -1643,7 +1777,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
     if (kercn % cn != 0)
         kercn = cn;
     int colsPerWI = kercn / cn;
-    String opts = format("%s-D cn=%d -D srcT=%s -D srcT1=%s -D dstT=%s -D kercn=%d -D depth=%d%s -D colsPerWI=%d",
+    String opts = format("%s-D CN=%d -D SRC_T=%s -D SRC_T1=%s -D DST_T=%s -D KERCN=%d -D DEPTH=%d%s -D COLS_PER_WI=%d",
                            haveScalar ? "-D HAVE_SCALAR " : "", cn, ocl::typeToStr(CV_MAKE_TYPE(sdepth, kercn)),
                            ocl::typeToStr(sdepth), ocl::typeToStr(CV_8UC(colsPerWI)), kercn, sdepth,
                            doubleSupport ? " -D DOUBLE_SUPPORT" : "", colsPerWI);
@@ -1738,7 +1872,7 @@ void cv::inRange(InputArray _src, InputArray _lowerb,
         src.size != lb.size || src.type() != lb.type() )
     {
         if( !checkScalar(lb, src.type(), lkind, skind) )
-            CV_Error( CV_StsUnmatchedSizes,
+            CV_Error( cv::Error::StsUnmatchedSizes,
                      "The lower boundary is neither an array of the same size and same type as src, nor a scalar");
         lbScalar = true;
     }
@@ -1747,7 +1881,7 @@ void cv::inRange(InputArray _src, InputArray _lowerb,
         src.size != ub.size || src.type() != ub.type() )
     {
         if( !checkScalar(ub, src.type(), ukind, skind) )
-            CV_Error( CV_StsUnmatchedSizes,
+            CV_Error( cv::Error::StsUnmatchedSizes,
                      "The upper boundary is neither an array of the same size and same type as src, nor a scalar");
         ubScalar = true;
     }
diff --git a/modules/core/src/arithm.dispatch.cpp b/modules/core/src/arithm.dispatch.cpp
index 1cbceaee2996..b6a854379d19 100644
--- a/modules/core/src/arithm.dispatch.cpp
+++ b/modules/core/src/arithm.dispatch.cpp
@@ -8,4 +8,4 @@
 #include "arithm.simd_declarations.hpp"
 
 #define ARITHM_DISPATCHING_ONLY
-#include "arithm.simd.hpp"
\ No newline at end of file
+#include "arithm.simd.hpp"
diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp
index 06ebfb767842..3b8ab1c48b2c 100644
--- a/modules/core/src/arithm.simd.hpp
+++ b/modules/core/src/arithm.simd.hpp
@@ -69,7 +69,7 @@
 #define DEFINE_SIMD_F32(fun, ...) \
     DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__)
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     #define DEFINE_SIMD_F64(fun, ...) \
         DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__)
 #else
@@ -104,7 +104,7 @@ namespace cv { namespace hal {
 
 #ifdef ARITHM_DEFINITIONS_ONLY
 
-#if !CV_SIMD_64F
+#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 typedef int v_float64; // dummy
 #endif
 
@@ -219,7 +219,7 @@ template<typename T1, typename Tvec>
 struct op_add
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a + b; }
+    { return v_add(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return c_add(a, b); }
 };
@@ -229,7 +229,7 @@ template<typename T1, typename Tvec>
 struct op_sub
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a - b; }
+    { return v_sub(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return c_sub(a, b); }
 };
@@ -266,7 +266,7 @@ struct op_absdiff
 template<>
 struct op_absdiff<schar, v_int8>
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     static inline v_int8 r(const v_int8& a, const v_int8& b)
     { return v_absdiffs(a, b); }
 #endif
@@ -276,7 +276,7 @@ struct op_absdiff<schar, v_int8>
 template<>
 struct op_absdiff<short, v_int16>
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     static inline v_int16 r(const v_int16& a, const v_int16& b)
     { return v_absdiffs(a, b); }
 #endif
@@ -286,7 +286,7 @@ struct op_absdiff<short, v_int16>
 template<>
 struct op_absdiff<int, v_int32>
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     static inline v_int32 r(const v_int32& a, const v_int32& b)
     { return v_reinterpret_as_s32(v_absdiff(a, b)); }
 #endif
@@ -299,7 +299,7 @@ template<typename T1, typename Tvec>
 struct op_or
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a | b; }
+    { return v_or(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return a | b; }
 };
@@ -307,7 +307,7 @@ template<typename T1, typename Tvec>
 struct op_xor
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a ^ b; }
+    { return v_xor(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return a ^ b; }
 };
@@ -315,7 +315,7 @@ template<typename T1, typename Tvec>
 struct op_and
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a & b; }
+    { return v_and(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return a & b; }
 };
@@ -324,14 +324,14 @@ struct op_not
 {
     // ignored b from loader level
     static inline Tvec r(const Tvec& a)
-    { return ~a; }
+    { return v_not(a); }
     static inline T1 r(T1 a, T1)
     { return ~a; }
 };
 
 //////////////////////////// Loaders /////////////////////////////////
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 template< template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
 struct bin_loader
@@ -396,13 +396,13 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
 static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
 {
     typedef OP<T1, Tvec> op;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     typedef bin_loader<OP, T1, Tvec> ldr;
-    enum {wide_step = Tvec::nlanes};
+    const int wide_step = VTraits<Tvec>::vlanes();
     #if !CV_NEON && CV_SIMD_WIDTH == 16
-        enum {wide_step_l = wide_step * 2};
+        const int wide_step_l = wide_step * 2;
     #else
-        enum {wide_step_l = wide_step};
+        const int wide_step_l = wide_step;
     #endif
 #endif // CV_SIMD
 
@@ -414,7 +414,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
     {
         int x = 0;
 
-    #if CV_SIMD
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
         #if !CV_NEON && !CV_MSA
         if (is_aligned(src1, src2, dst))
         {
@@ -464,7 +464,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
     vx_cleanup();
 }
 
-#if !CV_SIMD_64F
+#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template<template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
 static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
 {
@@ -496,7 +496,7 @@ static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t
 #define BIN_LOOP64F bin_loop_nosimd
 #else
 #define BIN_LOOP64F bin_loop
-#endif //!CV_SIMD_64F
+#endif //!(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 
 #endif // ARITHM_DEFINITIONS_ONLY
 
@@ -587,7 +587,7 @@ template<typename T1, typename Tvec>
 struct op_cmplt
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a < b; }
+    { return v_lt(a, b); }
     static inline uchar r(T1 a, T1 b)
     { return (uchar)-(int)(a < b); }
 };
@@ -596,7 +596,7 @@ template<typename T1, typename Tvec>
 struct op_cmple
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a <= b; }
+    { return v_le(a, b); }
     static inline uchar r(T1 a, T1 b)
     { return (uchar)-(int)(a <= b); }
 };
@@ -605,7 +605,7 @@ template<typename T1, typename Tvec>
 struct op_cmpeq
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a == b; }
+    { return v_eq(a, b); }
     static inline uchar r(T1 a, T1 b)
     { return (uchar)-(int)(a == b); }
 };
@@ -614,14 +614,14 @@ template<typename T1, typename Tvec>
 struct op_cmpne
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a != b; }
+    { return v_ne(a, b); }
     static inline uchar r(T1 a, T1 b)
     { return (uchar)-(int)(a != b); }
 };
 
 //////////////////////////// Loaders /////////////////////////////////
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 // todo: add support for RW alignment & stream
 template<int nload, template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
 struct cmp_loader_n
@@ -646,10 +646,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
 struct cmp_loader_n<sizeof(ushort), OP, T1, Tvec>
 {
     typedef OP<T1, Tvec> op;
-    enum {step = Tvec::nlanes};
 
     static inline void l(const T1* src1, const T1* src2, uchar* dst)
     {
+        const int step = VTraits<Tvec>::vlanes();
         Tvec c0 = op::r(vx_load(src1), vx_load(src2));
         Tvec c1 = op::r(vx_load(src1 + step), vx_load(src2 + step));
         v_store(dst, v_pack_b(v_reinterpret_as_u16(c0), v_reinterpret_as_u16(c1)));
@@ -660,10 +660,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
 struct cmp_loader_n<sizeof(unsigned), OP, T1, Tvec>
 {
     typedef OP<T1, Tvec> op;
-    enum {step = Tvec::nlanes};
 
     static inline void l(const T1* src1, const T1* src2, uchar* dst)
     {
+        const int step = VTraits<Tvec>::vlanes();
         v_uint32 c0 = v_reinterpret_as_u32(op::r(vx_load(src1), vx_load(src2)));
         v_uint32 c1 = v_reinterpret_as_u32(op::r(vx_load(src1 + step), vx_load(src2 + step)));
         v_uint32 c2 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
@@ -676,10 +676,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
 struct cmp_loader_n<sizeof(double), OP, T1, Tvec>
 {
     typedef OP<T1, Tvec> op;
-    enum {step = Tvec::nlanes};
 
     static inline void l(const T1* src1, const T1* src2, uchar* dst)
     {
+        const int step = VTraits<Tvec>::vlanes();
         v_uint64 c0 = v_reinterpret_as_u64(op::r(vx_load(src1), vx_load(src2)));
         v_uint64 c1 = v_reinterpret_as_u64(op::r(vx_load(src1 + step), vx_load(src2 + step)));
         v_uint64 c2 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
@@ -701,9 +701,9 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
 static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
 {
     typedef OP<T1, Tvec> op;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     typedef cmp_loader_n<sizeof(T1), OP, T1, Tvec> ldr;
-    enum {wide_step = Tvec::nlanes * sizeof(T1)};
+    const int wide_step = VTraits<Tvec>::vlanes() * sizeof(T1);
 #endif // CV_SIMD
 
     step1 /= sizeof(T1);
@@ -713,7 +713,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
     {
         int x = 0;
 
-    #if CV_SIMD
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
         for (; x <= width - wide_step; x += wide_step)
         {
             ldr::l(src1 + x, src2 + x, dst + x);
@@ -768,7 +768,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
     }
 }
 
-#if !CV_SIMD_64F
+#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template< template<typename T1, typename Tvec> class OP, typename T1>
 static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
 {
@@ -822,7 +822,7 @@ static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2
         break;
     }
 }
-#endif // !CV_SIMD_64F
+#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 
 #endif // ARITHM_DEFINITIONS_ONLY
 
@@ -865,7 +865,7 @@ static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2
     }
 
 // todo: try to avoid define dispatcher functions using macros with these such cases
-DEFINE_SIMD_ALL(cmp)
+DEFINE_SIMD_ALL(cmp, void)
 
 //=========================================================================
 // scaling helpers for single and dual source
@@ -880,7 +880,7 @@ DEFINE_SIMD_ALL(cmp)
 
 //////////////////////////// Loaders ///////////////////////////////
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 // todo: add support for RW alignment & stream
 template<int nload, template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
 struct scalar_loader_n
@@ -1013,10 +1013,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T2
 struct scalar_loader_n<sizeof(int), OP, int, T2, v_int32>
 {
     typedef OP<int, T2, v_int32> op;
-    enum {step = v_int32::nlanes};
 
     static inline void l(const int* src1, const int* src2, const T2* scalar, int* dst)
     {
+        const int step = VTraits<v_int32>::vlanes();
         v_int32 v_src1 = vx_load(src1);
         v_int32 v_src2 = vx_load(src2);
         v_int32 v_src1s = vx_load(src1 + step);
@@ -1043,6 +1043,7 @@ struct scalar_loader_n<sizeof(int), OP, int, T2, v_int32>
 
     static inline void l(const int* src1, const T2* scalar, int* dst)
     {
+        const int step = VTraits<v_int32>::vlanes();
         v_int32 v_src1 = vx_load(src1);
         v_int32 v_src1s = vx_load(src1 + step);
 
@@ -1068,10 +1069,9 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T2
 struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
 {
     typedef OP<float, T2, v_float32> op;
-    enum {step = v_float32::nlanes};
-
     static inline void l(const float* src1, const float* src2, const T2* scalar, float* dst)
     {
+        const int step = VTraits<v_float32>::vlanes();
         v_float32 v_src1 = vx_load(src1);
         v_float32 v_src2 = vx_load(src2);
         v_float32 v_src1s = vx_load(src1 + step);
@@ -1086,6 +1086,7 @@ struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
 
     static inline void l(const float* src1, const T2* scalar, float* dst)
     {
+        const int step = VTraits<v_float32>::vlanes();
         v_float32 v_src1 = vx_load(src1);
         v_float32 v_src1s = vx_load(src1 + step);
 
@@ -1098,16 +1099,16 @@ struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
 };
 #endif // CV_SIMD
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template<template<typename T1, typename T2, typename Tvec> class OP>
 struct scalar_loader_n<sizeof(int), OP, int, double, v_int32>
 {
     typedef OP<int, float, v_int32> op;
     typedef OP<double, double, v_float64> op64;
-    enum {step = v_int32::nlanes};
 
     static inline void l(const int* src1, const int* src2, const double* scalar, int* dst)
     {
+        const int step = VTraits<v_int32>::vlanes();
         v_int32 v_src1 = vx_load(src1);
         v_int32 v_src2 = vx_load(src2);
         v_int32 v_src1s = vx_load(src1 + step);
@@ -1124,6 +1125,7 @@ struct scalar_loader_n<sizeof(int), OP, int, double, v_int32>
     }
     static inline void l(const int* src1, const double* scalar, int* dst)
     {
+        const int step = VTraits<v_int32>::vlanes();
         v_int32 v_src1 = vx_load(src1);
         v_int32 v_src1s = vx_load(src1 + step);
 
@@ -1168,10 +1170,10 @@ struct scalar_loader_n<sizeof(float), OP, float, double, v_float32>
 {
     typedef OP<float, float, v_float32> op;
     typedef OP<double, double, v_float64> op64;
-    enum {step = v_float32::nlanes};
 
     static inline void l(const float* src1, const float* src2, const double* scalar, float* dst)
     {
+        const int step = VTraits<v_float32>::vlanes();
         v_float32 v_src1 = vx_load(src1);
         v_float32 v_src2 = vx_load(src2);
         v_float32 v_src1s = vx_load(src1 + step);
@@ -1185,6 +1187,7 @@ struct scalar_loader_n<sizeof(float), OP, float, double, v_float32>
     }
     static inline void l(const float* src1, const double* scalar, float* dst)
     {
+        const int step = VTraits<v_float32>::vlanes();
         v_float32 v_src1 = vx_load(src1);
         v_float32 v_src1s = vx_load(src1 + step);
 
@@ -1225,10 +1228,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP>
 struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
 {
     typedef OP<double, double, v_float64> op;
-    enum {step = v_float64::nlanes};
 
     static inline void l(const double* src1, const double* src2, const double* scalar, double* dst)
     {
+        const int step = VTraits<v_float64>::vlanes();
         v_float64 v_src1 = vx_load(src1);
         v_float64 v_src2 = vx_load(src2);
         v_float64 v_src1s = vx_load(src1 + step);
@@ -1242,6 +1245,7 @@ struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
     }
     static inline void l(const double* src1, const double* scalar, double* dst)
     {
+        const int step = VTraits<v_float64>::vlanes();
         v_float64 v_src1 = vx_load(src1);
         v_float64 v_src1s = vx_load(src1 + step);
 
@@ -1252,7 +1256,7 @@ struct scalar_loader_n<sizeof(double), OP, double, double, v_float64>
         v_store(dst + step, r1);
     }
 };
-#endif // CV_SIMD_64F
+#endif // (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 
 //////////////////////////// Loops /////////////////////////////////
 
@@ -1262,10 +1266,10 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
                  T1* dst, size_t step, int width, int height, const T2* scalar)
 {
     typedef OP<T1, T2, Tvec> op;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
-    const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 :
-                          sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes;
+    const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
+                          sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
 #endif // CV_SIMD
 
     step1 /= sizeof(T1);
@@ -1276,7 +1280,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
     {
         int x = 0;
 
-    #if CV_SIMD
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
         for (; x <= width - wide_step; x += wide_step)
         {
             ldr::l(src1 + x, src2 + x, scalar, dst + x);
@@ -1308,10 +1312,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1
 static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
 {
     typedef OP<T1, T2, Tvec> op;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
-    const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 :
-                          sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes;
+    const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
+                          sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
 #endif // CV_SIMD
 
     step1 /= sizeof(T1);
@@ -1321,7 +1325,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
     {
         int x = 0;
 
-    #if CV_SIMD
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
         for (; x <= width - wide_step; x += wide_step)
         {
             ldr::l(src1 + x, scalar, dst + x);
@@ -1348,7 +1352,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
     vx_cleanup();
 }
 
-#if !CV_SIMD_64F
+#if !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 // dual source
 template<template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
 static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2,
@@ -1412,7 +1416,7 @@ static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t ste
 #define SCALAR_LOOP64F scalar_loop_nosimd
 #else
 #define SCALAR_LOOP64F scalar_loop
-#endif // !CV_SIMD_64F
+#endif // !(CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 
 #endif // ARITHM_DEFINITIONS_ONLY
 
@@ -1428,7 +1432,7 @@ template<typename T1, typename Tvec>
 struct op_mul
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a * b; }
+    { return v_mul(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return saturate_cast<T1>(a * b); }
 };
@@ -1436,11 +1440,11 @@ struct op_mul
 template<typename T1, typename T2, typename Tvec>
 struct op_mul_scale
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
-        return v_scalar * a * b;
+        return v_mul(v_scalar , a , b);
     }
 #endif
     static inline T1 r(T1 a, T1 b, const T2* scalar)
@@ -1452,11 +1456,11 @@ struct op_mul_scale
 template<>
 struct op_mul_scale<double, double, v_float64>
 {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
     {
         const v_float64 v_scalar = vx_setall_f64(*scalar);
-        return v_scalar * a * b;
+        return v_mul(v_mul(v_scalar, a), b);
     }
 #endif
     static inline double r(double a, double b, const double* scalar)
@@ -1569,7 +1573,7 @@ template<typename T1, typename Tvec>
 struct op_div_f
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a / b; }
+    { return v_div(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return a / b; }
 };
@@ -1577,16 +1581,16 @@ struct op_div_f
 template<typename T1, typename T2, typename Tvec>
 struct op_div_scale
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
-        return a * v_scalar / b;
+        return v_div(v_mul(a, v_scalar), b);
     }
     static inline Tvec pre(const Tvec& denom, const Tvec& res)
     {
-        const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
-        return v_select(denom == v_zero, v_zero, res);
+        const Tvec v_zero = vx_setall<typename VTraits<Tvec>::lane_type>(0);
+        return v_select(v_eq(denom, v_zero), v_zero, res);
     }
 #endif
     static inline T1 r(T1 a, T1 denom, const T2* scalar)
@@ -1599,11 +1603,11 @@ struct op_div_scale
 template<>
 struct op_div_scale<float, float, v_float32>
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
-        return a * v_scalar / b;
+        return v_div(v_mul(a, v_scalar), b);
     }
 #endif
     static inline float r(float a, float denom, const float* scalar)
@@ -1613,11 +1617,11 @@ struct op_div_scale<float, float, v_float32>
 template<>
 struct op_div_scale<double, double, v_float64>
 {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
     {
         const v_float64 v_scalar = vx_setall_f64(*scalar);
-        return a * v_scalar / b;
+        return v_div(v_mul(a, v_scalar), b);
     }
 #endif
     static inline double r(double a, double denom, const double* scalar)
@@ -1685,7 +1689,7 @@ DEFINE_SIMD_ALL(div, div_loop)
 template<typename T1, typename T2, typename Tvec>
 struct op_add_scale
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
     {
         const v_float32 v_alpha = vx_setall_f32(*scalar);
@@ -1701,7 +1705,7 @@ struct op_add_scale
 template<>
 struct op_add_scale<double, double, v_float64>
 {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
     {
         const v_float64 v_alpha = vx_setall_f64(*scalar);
@@ -1718,7 +1722,7 @@ struct op_add_scale<double, double, v_float64>
 template<typename T1, typename T2, typename Tvec>
 struct op_add_weighted
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
     {
         const v_float32 v_alpha = vx_setall_f32(scalars[0]);
@@ -1736,7 +1740,7 @@ struct op_add_weighted
 template<>
 struct op_add_weighted<double, double, v_float64>
 {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars)
     {
         const v_float64 v_alpha = vx_setall_f64(scalars[0]);
@@ -1835,16 +1839,16 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
 template<typename T1, typename T2, typename Tvec>
 struct op_recip
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     static inline v_float32 r(const v_float32& a, const T2* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
-        return v_scalar / a;
+        return v_div(v_scalar, a);
     }
     static inline Tvec pre(const Tvec& denom, const Tvec& res)
     {
-        const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
-        return v_select(denom == v_zero, v_zero, res);
+        const Tvec v_zero = vx_setall<typename VTraits<Tvec>::lane_type>(0);
+        return v_select(v_eq(denom, v_zero), v_zero, res);
     }
 #endif
     static inline T1 r(T1 denom, const T2* scalar)
@@ -1857,11 +1861,11 @@ struct op_recip
 template<>
 struct op_recip<float, float, v_float32>
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     static inline v_float32 r(const v_float32& a, const float* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
-        return v_scalar / a;
+        return v_div(v_scalar, a);
     }
 #endif
     static inline float r(float denom, const float* scalar)
@@ -1871,11 +1875,11 @@ struct op_recip<float, float, v_float32>
 template<>
 struct op_recip<double, double, v_float64>
 {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     static inline v_float64 r(const v_float64& a, const double* scalar)
     {
         const v_float64 v_scalar = vx_setall_f64(*scalar);
-        return v_scalar / a;
+        return v_div(v_scalar, a);
     }
 #endif
     static inline double r(double denom, const double* scalar)
diff --git a/modules/core/src/arithm_ipp.hpp b/modules/core/src/arithm_ipp.hpp
index 4aa7d006e44a..ed722113a728 100644
--- a/modules/core/src/arithm_ipp.hpp
+++ b/modules/core/src/arithm_ipp.hpp
@@ -414,4 +414,4 @@ inline int arithm_ipp_mul32f(const float *src1, size_t step1, const float *src2,
 
 #if !ARITHM_USE_IPP
 #define ARITHM_CALL_IPP(...)
-#endif
\ No newline at end of file
+#endif
diff --git a/modules/core/src/array.cpp b/modules/core/src/array.cpp
index 1eef447d0c91..252ac13ca039 100644
--- a/modules/core/src/array.cpp
+++ b/modules/core/src/array.cpp
@@ -79,7 +79,7 @@ cvSetIPLAllocators( Cv_iplCreateImageHeader createHeader,
         (createROI != 0) + (cloneImage != 0);
 
     if( count != 0 && count != 5 )
-        CV_Error( CV_StsBadArg, "Either all the pointers should be null or "
+        CV_Error( cv::Error::StsBadArg, "Either all the pointers should be null or "
                                  "they all should be non-null" );
 
     CvIPL.createHeader = createHeader;
@@ -118,11 +118,11 @@ cvCreateMatHeader( int rows, int cols, int type )
     type = CV_MAT_TYPE(type);
 
     if( rows < 0 || cols < 0 )
-        CV_Error( CV_StsBadSize, "Non-positive width or height" );
+        CV_Error( cv::Error::StsBadSize, "Non-positive width or height" );
 
     int min_step = CV_ELEM_SIZE(type);
     if( min_step <= 0 )
-        CV_Error( CV_StsUnsupportedFormat, "Invalid matrix type" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "Invalid matrix type" );
     min_step *= cols;
 
     CvMat* arr = (CvMat*)cvAlloc( sizeof(*arr));
@@ -146,13 +146,13 @@ cvInitMatHeader( CvMat* arr, int rows, int cols,
                  int type, void* data, int step )
 {
     if( !arr )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( (unsigned)CV_MAT_DEPTH(type) > CV_DEPTH_MAX )
-        CV_Error( CV_BadNumChannels, "" );
+        CV_Error( cv::Error::BadNumChannels, "" );
 
     if( rows < 0 || cols < 0 )
-        CV_Error( CV_StsBadSize, "Non-positive cols or rows" );
+        CV_Error( cv::Error::StsBadSize, "Non-positive cols or rows" );
 
     type = CV_MAT_TYPE( type );
     arr->type = type | CV_MAT_MAGIC_VAL;
@@ -168,7 +168,7 @@ cvInitMatHeader( CvMat* arr, int rows, int cols,
     if( step != CV_AUTOSTEP && step != 0 )
     {
         if( step < min_step )
-            CV_Error( CV_BadStep, "" );
+            CV_Error( cv::Error::BadStep, "" );
         arr->step = step;
     }
     else
@@ -196,7 +196,7 @@ cvReleaseMat( CvMat** array )
         CvMat* arr = *array;
 
         if( !CV_IS_MAT_HDR_Z(arr) && !CV_IS_MATND_HDR(arr) )
-            CV_Error( CV_StsBadFlag, "" );
+            CV_Error( cv::Error::StsBadFlag, "" );
 
         *array = 0;
 
@@ -211,7 +211,7 @@ CV_IMPL CvMat*
 cvCloneMat( const CvMat* src )
 {
     if( !CV_IS_MAT_HDR( src ))
-        CV_Error( CV_StsBadArg, "Bad CvMat header" );
+        CV_Error( cv::Error::StsBadArg, "Bad CvMat header" );
 
     CvMat* dst = cvCreateMatHeader( src->rows, src->cols, src->type );
 
@@ -237,25 +237,25 @@ cvInitMatNDHeader( CvMatND* mat, int dims, const int* sizes,
     int64 step = CV_ELEM_SIZE(type);
 
     if( !mat )
-        CV_Error( CV_StsNullPtr, "NULL matrix header pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL matrix header pointer" );
 
     if( step == 0 )
-        CV_Error( CV_StsUnsupportedFormat, "invalid array data type" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "invalid array data type" );
 
     if( !sizes )
-        CV_Error( CV_StsNullPtr, "NULL <sizes> pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL <sizes> pointer" );
 
     if( dims <= 0 || dims > CV_MAX_DIM )
-        CV_Error( CV_StsOutOfRange,
+        CV_Error( cv::Error::StsOutOfRange,
         "non-positive or too large number of dimensions" );
 
     for( int i = dims - 1; i >= 0; i-- )
     {
         if( sizes[i] < 0 )
-            CV_Error( CV_StsBadSize, "one of dimension sizes is non-positive" );
+            CV_Error( cv::Error::StsBadSize, "one of dimension sizes is non-positive" );
         mat->dim[i].size = sizes[i];
         if( step > INT_MAX )
-            CV_Error( CV_StsOutOfRange, "The array is too big" );
+            CV_Error( cv::Error::StsOutOfRange, "The array is too big" );
         mat->dim[i].step = (int)step;
         step *= sizes[i];
     }
@@ -285,7 +285,7 @@ CV_IMPL CvMatND*
 cvCreateMatNDHeader( int dims, const int* sizes, int type )
 {
     if( dims <= 0 || dims > CV_MAX_DIM )
-        CV_Error( CV_StsOutOfRange,
+        CV_Error( cv::Error::StsOutOfRange,
         "non-positive or too large number of dimensions" );
 
     CvMatND* arr = (CvMatND*)cvAlloc( sizeof(*arr) );
@@ -301,7 +301,7 @@ CV_IMPL CvMatND*
 cvCloneMatND( const CvMatND* src )
 {
     if( !CV_IS_MATND_HDR( src ))
-        CV_Error( CV_StsBadArg, "Bad CvMatND header" );
+        CV_Error( cv::Error::StsBadArg, "Bad CvMatND header" );
 
     CV_Assert( src->dims <= CV_MAX_DIM );
     int sizes[CV_MAX_DIM];
@@ -335,12 +335,12 @@ cvGetMatND( const CvArr* arr, CvMatND* matnd, int* coi )
         *coi = 0;
 
     if( !matnd || !arr )
-        CV_Error( CV_StsNullPtr, "NULL array pointer is passed" );
+        CV_Error( cv::Error::StsNullPtr, "NULL array pointer is passed" );
 
     if( CV_IS_MATND_HDR(arr))
     {
         if( !((CvMatND*)arr)->data.ptr )
-            CV_Error( CV_StsNullPtr, "The matrix has NULL data pointer" );
+            CV_Error( cv::Error::StsNullPtr, "The matrix has NULL data pointer" );
 
         result = (CvMatND*)arr;
     }
@@ -352,10 +352,10 @@ cvGetMatND( const CvArr* arr, CvMatND* matnd, int* coi )
             mat = cvGetMat( mat, &stub, coi );
 
         if( !CV_IS_MAT_HDR( mat ))
-            CV_Error( CV_StsBadArg, "Unrecognized or unsupported array type" );
+            CV_Error( cv::Error::StsBadArg, "Unrecognized or unsupported array type" );
 
         if( !mat->data.ptr )
-            CV_Error( CV_StsNullPtr, "Input array has NULL data pointer" );
+            CV_Error( cv::Error::StsNullPtr, "Input array has NULL data pointer" );
 
         matnd->data.ptr = mat->data.ptr;
         matnd->refcount = 0;
@@ -393,16 +393,16 @@ cvInitNArrayIterator( int count, CvArr** arrs,
     CvMatND* hdr0 = 0;
 
     if( count < 1 || count > CV_MAX_ARR )
-        CV_Error( CV_StsOutOfRange, "Incorrect number of arrays" );
+        CV_Error( cv::Error::StsOutOfRange, "Incorrect number of arrays" );
 
     if( !arrs || !stubs )
-        CV_Error( CV_StsNullPtr, "Some of required array pointers is NULL" );
+        CV_Error( cv::Error::StsNullPtr, "Some of required array pointers is NULL" );
 
     if( !iterator )
-        CV_Error( CV_StsNullPtr, "Iterator pointer is NULL" );
+        CV_Error( cv::Error::StsNullPtr, "Iterator pointer is NULL" );
 
     if (mask)
-        CV_Error( CV_StsBadArg, "Iterator with mask is not supported" );
+        CV_Error( cv::Error::StsBadArg, "Iterator with mask is not supported" );
 
     for( i = 0; i < count; i++ )
     {
@@ -410,7 +410,7 @@ cvInitNArrayIterator( int count, CvArr** arrs,
         CvMatND* hdr;
 
         if( !arr )
-            CV_Error( CV_StsNullPtr, "Some of required array pointers is NULL" );
+            CV_Error( cv::Error::StsNullPtr, "Some of required array pointers is NULL" );
 
         if( CV_IS_MATND( arr ))
             hdr = (CvMatND*)arr;
@@ -419,7 +419,7 @@ cvInitNArrayIterator( int count, CvArr** arrs,
             int coi = 0;
             hdr = cvGetMatND( arr, stubs + i, &coi );
             if( coi != 0 )
-                CV_Error( CV_BadCOI, "COI set is not allowed here" );
+                CV_Error( cv::Error::BadCOI, "COI set is not allowed here" );
         }
 
         iterator->hdr[i] = hdr;
@@ -427,24 +427,24 @@ cvInitNArrayIterator( int count, CvArr** arrs,
         if( i > 0 )
         {
             if( hdr->dims != hdr0->dims )
-                CV_Error( CV_StsUnmatchedSizes,
+                CV_Error( cv::Error::StsUnmatchedSizes,
                           "Number of dimensions is the same for all arrays" );
 
             switch( flags & (CV_NO_DEPTH_CHECK|CV_NO_CN_CHECK))
             {
             case 0:
                 if( !CV_ARE_TYPES_EQ( hdr, hdr0 ))
-                    CV_Error( CV_StsUnmatchedFormats,
+                    CV_Error( cv::Error::StsUnmatchedFormats,
                               "Data type is not the same for all arrays" );
                 break;
             case CV_NO_DEPTH_CHECK:
                 if( !CV_ARE_CNS_EQ( hdr, hdr0 ))
-                    CV_Error( CV_StsUnmatchedFormats,
+                    CV_Error( cv::Error::StsUnmatchedFormats,
                               "Number of channels is not the same for all arrays" );
                 break;
             case CV_NO_CN_CHECK:
                 if( !CV_ARE_CNS_EQ( hdr, hdr0 ))
-                    CV_Error( CV_StsUnmatchedFormats,
+                    CV_Error( cv::Error::StsUnmatchedFormats,
                               "Depth is not the same for all arrays" );
                 break;
             }
@@ -453,7 +453,7 @@ cvInitNArrayIterator( int count, CvArr** arrs,
             {
                 for( j = 0; j < hdr->dims; j++ )
                     if( hdr->dim[j].size != hdr0->dim[j].size )
-                        CV_Error( CV_StsUnmatchedSizes,
+                        CV_Error( cv::Error::StsUnmatchedSizes,
                                   "Dimension sizes are the same for all arrays" );
             }
         }
@@ -536,18 +536,18 @@ cvCreateSparseMat( int dims, const int* sizes, int type )
     CvMemStorage* storage;
 
     if( pix_size == 0 )
-        CV_Error( CV_StsUnsupportedFormat, "invalid array data type" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "invalid array data type" );
 
     if( dims <= 0 || dims > CV_MAX_DIM )
-        CV_Error( CV_StsOutOfRange, "bad number of dimensions" );
+        CV_Error( cv::Error::StsOutOfRange, "bad number of dimensions" );
 
     if( !sizes )
-        CV_Error( CV_StsNullPtr, "NULL <sizes> pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL <sizes> pointer" );
 
     for( i = 0; i < dims; i++ )
     {
         if( sizes[i] <= 0 )
-            CV_Error( CV_StsBadSize, "one of dimension sizes is non-positive" );
+            CV_Error( cv::Error::StsBadSize, "one of dimension sizes is non-positive" );
     }
 
     CvSparseMat* arr = (CvSparseMat*)cvAlloc(sizeof(*arr)+MAX(0,dims-CV_MAX_DIM)*sizeof(arr->size[0]));
@@ -587,7 +587,7 @@ cvReleaseSparseMat( CvSparseMat** array )
         CvSparseMat* arr = *array;
 
         if( !CV_IS_SPARSE_MAT_HDR(arr) )
-            CV_Error( CV_StsBadFlag, "" );
+            CV_Error( cv::Error::StsBadFlag, "" );
 
         *array = 0;
 
@@ -604,7 +604,7 @@ CV_IMPL CvSparseMat*
 cvCloneSparseMat( const CvSparseMat* src )
 {
     if( !CV_IS_SPARSE_MAT_HDR(src) )
-        CV_Error( CV_StsBadArg, "Invalid sparse array header" );
+        CV_Error( cv::Error::StsBadArg, "Invalid sparse array header" );
 
     CvSparseMat* dst = cvCreateSparseMat( src->dims, src->size, src->type );
     cvCopy( src, dst );
@@ -619,10 +619,10 @@ cvInitSparseMatIterator( const CvSparseMat* mat, CvSparseMatIterator* iterator )
     int idx;
 
     if( !CV_IS_SPARSE_MAT( mat ))
-        CV_Error( CV_StsBadArg, "Invalid sparse matrix header" );
+        CV_Error( cv::Error::StsBadArg, "Invalid sparse matrix header" );
 
     if( !iterator )
-        CV_Error( CV_StsNullPtr, "NULL iterator pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL iterator pointer" );
 
     iterator->mat = (CvSparseMat*)mat;
     iterator->node = 0;
@@ -656,7 +656,7 @@ icvGetNodePtr( CvSparseMat* mat, const int* idx, int* _type,
         {
             int t = idx[i];
             if( (unsigned)t >= (unsigned)mat->size[i] )
-                CV_Error( CV_StsOutOfRange, "One of indices is out of range" );
+                CV_Error( cv::Error::StsOutOfRange, "One of indices is out of range" );
             hashval = hashval*ICV_SPARSE_MAT_HASH_MULTIPLIER + t;
         }
     }
@@ -750,7 +750,7 @@ icvDeleteNode( CvSparseMat* mat, const int* idx, unsigned* precalc_hashval )
         {
             int t = idx[i];
             if( (unsigned)t >= (unsigned)mat->size[i] )
-                CV_Error( CV_StsOutOfRange, "One of indices is out of range" );
+                CV_Error( cv::Error::StsOutOfRange, "One of indices is out of range" );
             hashval = hashval*ICV_SPARSE_MAT_HASH_MULTIPLIER + t;
         }
     }
@@ -805,7 +805,7 @@ cvCreateData( CvArr* arr )
             return;
 
         if( mat->data.ptr != 0 )
-            CV_Error( CV_StsError, "Data is already allocated" );
+            CV_Error( cv::Error::StsError, "Data is already allocated" );
 
         if( step == 0 )
             step = CV_ELEM_SIZE(mat->type)*mat->cols;
@@ -813,7 +813,7 @@ cvCreateData( CvArr* arr )
         int64 _total_size = (int64)step*mat->rows + sizeof(int) + CV_MALLOC_ALIGN;
         total_size = (size_t)_total_size;
         if(_total_size != (int64)total_size)
-            CV_Error(CV_StsNoMem, "Too big buffer is allocated" );
+            CV_Error(cv::Error::StsNoMem, "Too big buffer is allocated" );
         mat->refcount = (int*)cvAlloc( (size_t)total_size );
         mat->data.ptr = (uchar*)cvAlignPtr( mat->refcount + 1, CV_MALLOC_ALIGN );
         *mat->refcount = 1;
@@ -823,13 +823,13 @@ cvCreateData( CvArr* arr )
         IplImage* img = (IplImage*)arr;
 
         if( img->imageData != 0 )
-            CV_Error( CV_StsError, "Data is already allocated" );
+            CV_Error( cv::Error::StsError, "Data is already allocated" );
 
         if( !CvIPL.allocateData )
         {
             const int64 imageSize_tmp = (int64)img->widthStep*(int64)img->height;
             if( (int64)img->imageSize != imageSize_tmp )
-                CV_Error( CV_StsNoMem, "Overflow for imageSize" );
+                CV_Error( cv::Error::StsNoMem, "Overflow for imageSize" );
             img->imageData = img->imageDataOrigin =
                         (char*)cvAlloc( (size_t)img->imageSize );
         }
@@ -859,7 +859,7 @@ cvCreateData( CvArr* arr )
             return;
 
         if( mat->data.ptr != 0 )
-            CV_Error( CV_StsError, "Data is already allocated" );
+            CV_Error( cv::Error::StsError, "Data is already allocated" );
 
         if( CV_IS_MAT_CONT( mat->type ))
         {
@@ -884,7 +884,7 @@ cvCreateData( CvArr* arr )
         *mat->refcount = 1;
     }
     else
-        CV_Error( CV_StsBadArg, "unrecognized or unsupported array type" );
+        CV_Error( cv::Error::StsBadArg, "unrecognized or unsupported array type" );
 }
 
 
@@ -908,7 +908,7 @@ cvSetData( CvArr* arr, void* data, int step )
         if( step != CV_AUTOSTEP && step != 0 )
         {
             if( step < min_step && data != 0 )
-                CV_Error( CV_BadStep, "" );
+                CV_Error( cv::Error::BadStep, "" );
             mat->step = step;
         }
         else
@@ -929,7 +929,7 @@ cvSetData( CvArr* arr, void* data, int step )
         if( step != CV_AUTOSTEP && img->height > 1 )
         {
             if( step < min_step && data != 0 )
-                CV_Error( CV_BadStep, "" );
+                CV_Error( cv::Error::BadStep, "" );
             img->widthStep = step;
         }
         else
@@ -940,7 +940,7 @@ cvSetData( CvArr* arr, void* data, int step )
         const int64 imageSize_tmp = (int64)img->widthStep*(int64)img->height;
         img->imageSize = (int)imageSize_tmp;
         if( (int64)img->imageSize != imageSize_tmp )
-            CV_Error( CV_StsNoMem, "Overflow for imageSize" );
+            CV_Error( cv::Error::StsNoMem, "Overflow for imageSize" );
         img->imageData = img->imageDataOrigin = (char*)data;
 
         if( (((int)(size_t)data | step) & 7) == 0 &&
@@ -956,7 +956,7 @@ cvSetData( CvArr* arr, void* data, int step )
         int64 cur_step;
 
         if( step != CV_AUTOSTEP )
-            CV_Error( CV_BadStep,
+            CV_Error( cv::Error::BadStep,
             "For multidimensional array only CV_AUTOSTEP is allowed here" );
 
         mat->data.ptr = (uchar*)data;
@@ -965,13 +965,13 @@ cvSetData( CvArr* arr, void* data, int step )
         for( i = mat->dims - 1; i >= 0; i-- )
         {
             if( cur_step > INT_MAX )
-                CV_Error( CV_StsOutOfRange, "The array is too big" );
+                CV_Error( cv::Error::StsOutOfRange, "The array is too big" );
             mat->dim[i].step = (int)cur_step;
             cur_step *= mat->dim[i].size;
         }
     }
     else
-        CV_Error( CV_StsBadArg, "unrecognized or unsupported array type" );
+        CV_Error( cv::Error::StsBadArg, "unrecognized or unsupported array type" );
 }
 
 
@@ -1000,7 +1000,7 @@ cvReleaseData( CvArr* arr )
         }
     }
     else
-        CV_Error( CV_StsBadArg, "unrecognized or unsupported array type" );
+        CV_Error( cv::Error::StsBadArg, "unrecognized or unsupported array type" );
 }
 
 
@@ -1048,7 +1048,7 @@ cvGetRawData( const CvArr* arr, uchar** data, int* step, CvSize* roi_size )
         CvMatND* mat = (CvMatND*)arr;
 
         if( !CV_IS_MAT_CONT( mat->type ))
-            CV_Error( CV_StsBadArg, "Only continuous nD arrays are supported here" );
+            CV_Error( cv::Error::StsBadArg, "Only continuous nD arrays are supported here" );
 
         if( data )
             *data = mat->data.ptr;
@@ -1077,7 +1077,7 @@ cvGetRawData( const CvArr* arr, uchar** data, int* step, CvSize* roi_size )
         }
     }
     else
-        CV_Error( CV_StsBadArg, "unrecognized or unsupported array type" );
+        CV_Error( cv::Error::StsBadArg, "unrecognized or unsupported array type" );
 }
 
 
@@ -1093,7 +1093,7 @@ cvGetElemType( const CvArr* arr )
         type = CV_MAKETYPE( IPL2CV_DEPTH(img->depth), img->nChannels );
     }
     else
-        CV_Error( CV_StsBadArg, "unrecognized or unsupported array type" );
+        CV_Error( cv::Error::StsBadArg, "unrecognized or unsupported array type" );
 
     return type;
 }
@@ -1147,7 +1147,7 @@ cvGetDims( const CvArr* arr, int* sizes )
             memcpy( sizes, mat->size, dims*sizeof(sizes[0]));
     }
     else
-        CV_Error( CV_StsBadArg, "unrecognized or unsupported array type" );
+        CV_Error( cv::Error::StsBadArg, "unrecognized or unsupported array type" );
 
     return dims;
 }
@@ -1172,7 +1172,7 @@ cvGetDimSize( const CvArr* arr, int index )
             size = mat->cols;
             break;
         default:
-            CV_Error( CV_StsOutOfRange, "bad dimension index" );
+            CV_Error( cv::Error::StsOutOfRange, "bad dimension index" );
         }
     }
     else if( CV_IS_IMAGE( arr ))
@@ -1188,7 +1188,7 @@ cvGetDimSize( const CvArr* arr, int index )
             size = !img->roi ? img->width : img->roi->width;
             break;
         default:
-            CV_Error( CV_StsOutOfRange, "bad dimension index" );
+            CV_Error( cv::Error::StsOutOfRange, "bad dimension index" );
         }
     }
     else if( CV_IS_MATND_HDR( arr ))
@@ -1196,7 +1196,7 @@ cvGetDimSize( const CvArr* arr, int index )
         CvMatND* mat = (CvMatND*)arr;
 
         if( (unsigned)index >= (unsigned)mat->dims )
-            CV_Error( CV_StsOutOfRange, "bad dimension index" );
+            CV_Error( cv::Error::StsOutOfRange, "bad dimension index" );
 
         size = mat->dim[index].size;
     }
@@ -1205,12 +1205,12 @@ cvGetDimSize( const CvArr* arr, int index )
         CvSparseMat* mat = (CvSparseMat*)arr;
 
         if( (unsigned)index >= (unsigned)mat->dims )
-            CV_Error( CV_StsOutOfRange, "bad dimension index" );
+            CV_Error( cv::Error::StsOutOfRange, "bad dimension index" );
 
         size = mat->size[index];
     }
     else
-        CV_Error( CV_StsBadArg, "unrecognized or unsupported array type" );
+        CV_Error( cv::Error::StsBadArg, "unrecognized or unsupported array type" );
 
     return size;
 }
@@ -1245,7 +1245,7 @@ cvGetSize( const CvArr* arr )
         }
     }
     else
-        CV_Error( CV_StsBadArg, "Array should be CvMat or IplImage" );
+        CV_Error( cv::Error::StsBadArg, "Array should be CvMat or IplImage" );
 
     return size;
 }
@@ -1262,14 +1262,14 @@ cvGetSubRect( const CvArr* arr, CvMat* submat, CvRect rect )
         mat = cvGetMat( mat, &stub );
 
     if( !submat )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( (rect.x|rect.y|rect.width|rect.height) < 0 )
-        CV_Error( CV_StsBadSize, "" );
+        CV_Error( cv::Error::StsBadSize, "" );
 
     if( rect.x + rect.width > mat->cols ||
         rect.y + rect.height > mat->rows )
-        CV_Error( CV_StsBadSize, "" );
+        CV_Error( cv::Error::StsBadSize, "" );
 
     {
     /*
@@ -1307,11 +1307,11 @@ cvGetRows( const CvArr* arr, CvMat* submat,
         mat = cvGetMat( mat, &stub );
 
     if( !submat )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( (unsigned)start_row >= (unsigned)mat->rows ||
         (unsigned)end_row > (unsigned)mat->rows || delta_row <= 0 )
-        CV_Error( CV_StsOutOfRange, "" );
+        CV_Error( cv::Error::StsOutOfRange, "" );
 
     {
     /*
@@ -1359,12 +1359,12 @@ cvGetCols( const CvArr* arr, CvMat* submat, int start_col, int end_col )
         mat = cvGetMat( mat, &stub );
 
     if( !submat )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     cols = mat->cols;
     if( (unsigned)start_col >= (unsigned)cols ||
         (unsigned)end_col > (unsigned)cols )
-        CV_Error( CV_StsOutOfRange, "" );
+        CV_Error( cv::Error::StsOutOfRange, "" );
 
     {
     /*
@@ -1401,7 +1401,7 @@ cvGetDiag( const CvArr* arr, CvMat* submat, int diag )
         mat = cvGetMat( mat, &stub );
 
     if( !submat )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     pix_size = CV_ELEM_SIZE(mat->type);
 
@@ -1419,7 +1419,7 @@ cvGetDiag( const CvArr* arr, CvMat* submat, int diag )
         len = mat->cols - diag;
 
         if( len <= 0 )
-            CV_Error( CV_StsOutOfRange, "" );
+            CV_Error( cv::Error::StsOutOfRange, "" );
 
         len = CV_IMIN( len, mat->rows );
         submat->data.ptr = mat->data.ptr + diag*pix_size;
@@ -1429,7 +1429,7 @@ cvGetDiag( const CvArr* arr, CvMat* submat, int diag )
         len = mat->rows + diag;
 
         if( len <= 0 )
-            CV_Error( CV_StsOutOfRange, "" );
+            CV_Error( cv::Error::StsOutOfRange, "" );
 
         len = CV_IMIN( len, mat->cols );
         submat->data.ptr = mat->data.ptr - diag*mat->step;
@@ -1464,7 +1464,7 @@ cvScalarToRawData( const CvScalar* scalar, void* data, int type, int extend_to_1
 
     CV_Assert( scalar && data );
     if( (unsigned)(cn - 1) >= 4 )
-        CV_Error( CV_StsOutOfRange, "The number of channels must be 1, 2, 3 or 4" );
+        CV_Error( cv::Error::StsOutOfRange, "The number of channels must be 1, 2, 3 or 4" );
 
     switch( depth )
     {
@@ -1510,7 +1510,7 @@ cvScalarToRawData( const CvScalar* scalar, void* data, int type, int extend_to_1
         break;
     default:
         CV_Assert(0);
-        CV_Error( CV_BadDepth, "" );
+        CV_Error( cv::Error::BadDepth, "" );
     }
 
     if( extend_to_12 )
@@ -1537,7 +1537,7 @@ cvRawDataToScalar( const void* data, int flags, CvScalar* scalar )
     CV_Assert( scalar && data );
 
     if( (unsigned)(cn - 1) >= 4 )
-        CV_Error( CV_StsOutOfRange, "The number of channels must be 1, 2, 3 or 4" );
+        CV_Error( cv::Error::StsOutOfRange, "The number of channels must be 1, 2, 3 or 4" );
 
     memset( scalar->val, 0, sizeof(scalar->val));
 
@@ -1573,7 +1573,7 @@ cvRawDataToScalar( const void* data, int flags, CvScalar* scalar )
         break;
     default:
         CV_Assert(0);
-        CV_Error( CV_BadDepth, "" );
+        CV_Error( cv::Error::BadDepth, "" );
     }
 }
 
@@ -1660,7 +1660,7 @@ cvPtr1D( const CvArr* arr, int idx, int* _type )
         // that the index is within the matrix
         if( (unsigned)idx >= (unsigned)(mat->rows + mat->cols - 1) &&
             (unsigned)idx >= (unsigned)(mat->rows*mat->cols))
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         if( CV_IS_MAT_CONT(mat->type))
         {
@@ -1697,7 +1697,7 @@ cvPtr1D( const CvArr* arr, int idx, int* _type )
             size *= mat->dim[j].size;
 
         if((unsigned)idx >= (unsigned)size )
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         if( CV_IS_MAT_CONT(mat->type))
         {
@@ -1741,7 +1741,7 @@ cvPtr1D( const CvArr* arr, int idx, int* _type )
     }
     else
     {
-        CV_Error( CV_StsBadArg, "unrecognized or unsupported array type" );
+        CV_Error( cv::Error::StsBadArg, "unrecognized or unsupported array type" );
     }
 
     return ptr;
@@ -1760,7 +1760,7 @@ cvPtr2D( const CvArr* arr, int y, int x, int* _type )
 
         if( (unsigned)y >= (unsigned)(mat->rows) ||
             (unsigned)x >= (unsigned)(mat->cols) )
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         type = CV_MAT_TYPE(mat->type);
         if( _type )
@@ -1790,7 +1790,7 @@ cvPtr2D( const CvArr* arr, int y, int x, int* _type )
             {
                 int coi = img->roi->coi;
                 if( !coi )
-                    CV_Error( CV_BadCOI,
+                    CV_Error( cv::Error::BadCOI,
                         "COI must be non-null in case of planar images" );
                 ptr += (coi - 1)*img->imageSize;
             }
@@ -1803,7 +1803,7 @@ cvPtr2D( const CvArr* arr, int y, int x, int* _type )
 
         if( (unsigned)y >= (unsigned)height ||
             (unsigned)x >= (unsigned)width )
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         ptr += y*img->widthStep + x*pix_size;
 
@@ -1811,7 +1811,7 @@ cvPtr2D( const CvArr* arr, int y, int x, int* _type )
         {
             int type = IPL2CV_DEPTH(img->depth);
             if( type < 0 || (unsigned)(img->nChannels - 1) > 3 )
-                CV_Error( CV_StsUnsupportedFormat, "" );
+                CV_Error( cv::Error::StsUnsupportedFormat, "" );
 
             *_type = CV_MAKETYPE( type, img->nChannels );
         }
@@ -1823,7 +1823,7 @@ cvPtr2D( const CvArr* arr, int y, int x, int* _type )
         if( mat->dims != 2 ||
             (unsigned)y >= (unsigned)(mat->dim[0].size) ||
             (unsigned)x >= (unsigned)(mat->dim[1].size) )
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         ptr = mat->data.ptr + (size_t)y*mat->dim[0].step + x*mat->dim[1].step;
         if( _type )
@@ -1837,7 +1837,7 @@ cvPtr2D( const CvArr* arr, int y, int x, int* _type )
     }
     else
     {
-        CV_Error( CV_StsBadArg, "unrecognized or unsupported array type" );
+        CV_Error( cv::Error::StsBadArg, "unrecognized or unsupported array type" );
     }
 
     return ptr;
@@ -1857,7 +1857,7 @@ cvPtr3D( const CvArr* arr, int z, int y, int x, int* _type )
             (unsigned)z >= (unsigned)(mat->dim[0].size) ||
             (unsigned)y >= (unsigned)(mat->dim[1].size) ||
             (unsigned)x >= (unsigned)(mat->dim[2].size) )
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         ptr = mat->data.ptr + (size_t)z*mat->dim[0].step +
               (size_t)y*mat->dim[1].step + x*mat->dim[2].step;
@@ -1872,7 +1872,7 @@ cvPtr3D( const CvArr* arr, int z, int y, int x, int* _type )
     }
     else
     {
-        CV_Error( CV_StsBadArg, "unrecognized or unsupported array type" );
+        CV_Error( cv::Error::StsBadArg, "unrecognized or unsupported array type" );
     }
 
     return ptr;
@@ -1886,7 +1886,7 @@ cvPtrND( const CvArr* arr, const int* idx, int* _type,
 {
     uchar* ptr = 0;
     if( !idx )
-        CV_Error( CV_StsNullPtr, "NULL pointer to indices" );
+        CV_Error( cv::Error::StsNullPtr, "NULL pointer to indices" );
 
     if( CV_IS_SPARSE_MAT( arr ))
         ptr = icvGetNodePtr( (CvSparseMat*)arr, idx,
@@ -1900,7 +1900,7 @@ cvPtrND( const CvArr* arr, const int* idx, int* _type,
         for( i = 0; i < mat->dims; i++ )
         {
             if( (unsigned)idx[i] >= (unsigned)(mat->dim[i].size) )
-                CV_Error( CV_StsOutOfRange, "index is out of range" );
+                CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
             ptr += (size_t)idx[i]*mat->dim[i].step;
         }
 
@@ -1910,7 +1910,7 @@ cvPtrND( const CvArr* arr, const int* idx, int* _type,
     else if( CV_IS_MAT_HDR(arr) || CV_IS_IMAGE_HDR(arr) )
         ptr = cvPtr2D( arr, idx[0], idx[1], _type );
     else
-        CV_Error( CV_StsBadArg, "unrecognized or unsupported array type" );
+        CV_Error( cv::Error::StsBadArg, "unrecognized or unsupported array type" );
 
     return ptr;
 }
@@ -1935,7 +1935,7 @@ cvGet1D( const CvArr* arr, int idx )
         // that the index is within the matrix
         if( (unsigned)idx >= (unsigned)(mat->rows + mat->cols - 1) &&
             (unsigned)idx >= (unsigned)(mat->rows*mat->cols))
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         ptr = mat->data.ptr + (size_t)idx*pix_size;
     }
@@ -1965,7 +1965,7 @@ cvGet2D( const CvArr* arr, int y, int x )
 
         if( (unsigned)y >= (unsigned)(mat->rows) ||
             (unsigned)x >= (unsigned)(mat->cols) )
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         type = CV_MAT_TYPE(mat->type);
         ptr = mat->data.ptr + (size_t)y*mat->step + x*CV_ELEM_SIZE(type);
@@ -2046,7 +2046,7 @@ cvGetReal1D( const CvArr* arr, int idx )
         // that the index is within the matrix
         if( (unsigned)idx >= (unsigned)(mat->rows + mat->cols - 1) &&
             (unsigned)idx >= (unsigned)(mat->rows*mat->cols))
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         ptr = mat->data.ptr + (size_t)idx*pix_size;
     }
@@ -2058,7 +2058,7 @@ cvGetReal1D( const CvArr* arr, int idx )
     if( ptr )
     {
         if( CV_MAT_CN( type ) > 1 )
-            CV_Error( CV_BadNumChannels, "cvGetReal* support only single-channel arrays" );
+            CV_Error( cv::Error::BadNumChannels, "cvGetReal* support only single-channel arrays" );
 
         value = icvGetReal( ptr, type );
     }
@@ -2080,7 +2080,7 @@ cvGetReal2D( const CvArr* arr, int y, int x )
 
         if( (unsigned)y >= (unsigned)(mat->rows) ||
             (unsigned)x >= (unsigned)(mat->cols) )
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         type = CV_MAT_TYPE(mat->type);
         ptr = mat->data.ptr + (size_t)y*mat->step + x*CV_ELEM_SIZE(type);
@@ -2096,7 +2096,7 @@ cvGetReal2D( const CvArr* arr, int y, int x )
     if( ptr )
     {
         if( CV_MAT_CN( type ) > 1 )
-            CV_Error( CV_BadNumChannels, "cvGetReal* support only single-channel arrays" );
+            CV_Error( cv::Error::BadNumChannels, "cvGetReal* support only single-channel arrays" );
 
         value = icvGetReal( ptr, type );
     }
@@ -2124,7 +2124,7 @@ cvGetReal3D( const CvArr* arr, int z, int y, int x )
     if( ptr )
     {
         if( CV_MAT_CN( type ) > 1 )
-            CV_Error( CV_BadNumChannels, "cvGetReal* support only single-channel arrays" );
+            CV_Error( cv::Error::BadNumChannels, "cvGetReal* support only single-channel arrays" );
 
         value = icvGetReal( ptr, type );
     }
@@ -2149,7 +2149,7 @@ cvGetRealND( const CvArr* arr, const int* idx )
     if( ptr )
     {
         if( CV_MAT_CN( type ) > 1 )
-            CV_Error( CV_BadNumChannels, "cvGetReal* support only single-channel arrays" );
+            CV_Error( cv::Error::BadNumChannels, "cvGetReal* support only single-channel arrays" );
 
         value = icvGetReal( ptr, type );
     }
@@ -2176,7 +2176,7 @@ cvSet1D( CvArr* arr, int idx, CvScalar scalar )
         // that the index is within the matrix
         if( (unsigned)idx >= (unsigned)(mat->rows + mat->cols - 1) &&
             (unsigned)idx >= (unsigned)(mat->rows*mat->cols))
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         ptr = mat->data.ptr + (size_t)idx*pix_size;
     }
@@ -2202,7 +2202,7 @@ cvSet2D( CvArr* arr, int y, int x, CvScalar scalar )
 
         if( (unsigned)y >= (unsigned)(mat->rows) ||
             (unsigned)x >= (unsigned)(mat->cols) )
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         type = CV_MAT_TYPE(mat->type);
         ptr = mat->data.ptr + (size_t)y*mat->step + x*CV_ELEM_SIZE(type);
@@ -2268,7 +2268,7 @@ cvSetReal1D( CvArr* arr, int idx, double value )
         // that the index is within the matrix
         if( (unsigned)idx >= (unsigned)(mat->rows + mat->cols - 1) &&
             (unsigned)idx >= (unsigned)(mat->rows*mat->cols))
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         ptr = mat->data.ptr + (size_t)idx*pix_size;
     }
@@ -2278,7 +2278,7 @@ cvSetReal1D( CvArr* arr, int idx, double value )
         ptr = icvGetNodePtr( (CvSparseMat*)arr, &idx, &type, -1, 0 );
 
     if( CV_MAT_CN( type ) > 1 )
-        CV_Error( CV_BadNumChannels, "cvSetReal* support only single-channel arrays" );
+        CV_Error( cv::Error::BadNumChannels, "cvSetReal* support only single-channel arrays" );
 
     if( ptr )
         icvSetReal( value, ptr, type );
@@ -2297,7 +2297,7 @@ cvSetReal2D( CvArr* arr, int y, int x, double value )
 
         if( (unsigned)y >= (unsigned)(mat->rows) ||
             (unsigned)x >= (unsigned)(mat->cols) )
-            CV_Error( CV_StsOutOfRange, "index is out of range" );
+            CV_Error( cv::Error::StsOutOfRange, "index is out of range" );
 
         type = CV_MAT_TYPE(mat->type);
         ptr = mat->data.ptr + (size_t)y*mat->step + x*CV_ELEM_SIZE(type);
@@ -2312,7 +2312,7 @@ cvSetReal2D( CvArr* arr, int y, int x, double value )
         ptr = icvGetNodePtr( (CvSparseMat*)arr, idx, &type, -1, 0 );
     }
     if( CV_MAT_CN( type ) > 1 )
-        CV_Error( CV_BadNumChannels, "cvSetReal* support only single-channel arrays" );
+        CV_Error( cv::Error::BadNumChannels, "cvSetReal* support only single-channel arrays" );
 
     if( ptr )
         icvSetReal( value, ptr, type );
@@ -2333,7 +2333,7 @@ cvSetReal3D( CvArr* arr, int z, int y, int x, double value )
         ptr = icvGetNodePtr( (CvSparseMat*)arr, idx, &type, -1, 0 );
     }
     if( CV_MAT_CN( type ) > 1 )
-        CV_Error( CV_BadNumChannels, "cvSetReal* support only single-channel arrays" );
+        CV_Error( cv::Error::BadNumChannels, "cvSetReal* support only single-channel arrays" );
 
     if( ptr )
         icvSetReal( value, ptr, type );
@@ -2352,7 +2352,7 @@ cvSetRealND( CvArr* arr, const int* idx, double value )
         ptr = icvGetNodePtr( (CvSparseMat*)arr, idx, &type, -1, 0 );
 
     if( CV_MAT_CN( type ) > 1 )
-        CV_Error( CV_BadNumChannels, "cvSetReal* support only single-channel arrays" );
+        CV_Error( cv::Error::BadNumChannels, "cvSetReal* support only single-channel arrays" );
 
     if( ptr )
         icvSetReal( value, ptr, type );
@@ -2389,12 +2389,12 @@ cvGetMat( const CvArr* array, CvMat* mat,
     int coi = 0;
 
     if( !mat || !src )
-        CV_Error( CV_StsNullPtr, "NULL array pointer is passed" );
+        CV_Error( cv::Error::StsNullPtr, "NULL array pointer is passed" );
 
     if( CV_IS_MAT_HDR(src))
     {
         if( !src->data.ptr )
-            CV_Error( CV_StsNullPtr, "The matrix has NULL data pointer" );
+            CV_Error( cv::Error::StsNullPtr, "The matrix has NULL data pointer" );
 
         result = (CvMat*)src;
     }
@@ -2404,11 +2404,11 @@ cvGetMat( const CvArr* array, CvMat* mat,
         int depth, order;
 
         if( img->imageData == 0 )
-            CV_Error( CV_StsNullPtr, "The image has NULL data pointer" );
+            CV_Error( cv::Error::StsNullPtr, "The image has NULL data pointer" );
 
         depth = IPL2CV_DEPTH( img->depth );
         if( depth < 0 )
-            CV_Error( CV_BadDepth, "" );
+            CV_Error( cv::Error::BadDepth, "" );
 
         order = img->dataOrder & (img->nChannels > 1 ? -1 : 0);
 
@@ -2419,7 +2419,7 @@ cvGetMat( const CvArr* array, CvMat* mat,
                 int type = depth;
 
                 if( img->roi->coi == 0 )
-                    CV_Error( CV_StsBadFlag,
+                    CV_Error( cv::Error::StsBadFlag,
                     "Images with planar data layout should be used with COI selected" );
 
                 cvInitMatHeader( mat, img->roi->height,
@@ -2435,7 +2435,7 @@ cvGetMat( const CvArr* array, CvMat* mat,
                 coi = img->roi->coi;
 
                 if( img->nChannels > CV_CN_MAX )
-                    CV_Error( CV_BadNumChannels,
+                    CV_Error( cv::Error::BadNumChannels,
                         "The image is interleaved and has over CV_CN_MAX channels" );
 
                 cvInitMatHeader( mat, img->roi->height, img->roi->width,
@@ -2450,7 +2450,7 @@ cvGetMat( const CvArr* array, CvMat* mat,
             int type = CV_MAKETYPE( depth, img->nChannels );
 
             if( order != IPL_DATA_ORDER_PIXEL )
-                CV_Error( CV_StsBadFlag, "Pixel order should be used with coi == 0" );
+                CV_Error( cv::Error::StsBadFlag, "Pixel order should be used with coi == 0" );
 
             cvInitMatHeader( mat, img->height, img->width, type,
                              img->imageData, img->widthStep );
@@ -2464,10 +2464,10 @@ cvGetMat( const CvArr* array, CvMat* mat,
         int size1 = matnd->dim[0].size, size2 = 1;
 
         if( !src->data.ptr )
-            CV_Error( CV_StsNullPtr, "Input array has NULL data pointer" );
+            CV_Error( cv::Error::StsNullPtr, "Input array has NULL data pointer" );
 
         if( !CV_IS_MAT_CONT( matnd->type ))
-            CV_Error( CV_StsBadArg, "Only continuous nD arrays are supported here" );
+            CV_Error( cv::Error::StsBadArg, "Only continuous nD arrays are supported here" );
 
         if( matnd->dims > 2 )
         {
@@ -2491,7 +2491,7 @@ cvGetMat( const CvArr* array, CvMat* mat,
         result = mat;
     }
     else
-        CV_Error( CV_StsBadFlag, "Unrecognized or unsupported array type" );
+        CV_Error( cv::Error::StsBadFlag, "Unrecognized or unsupported array type" );
 
     if( pCOI )
         *pCOI = coi;
@@ -2509,10 +2509,10 @@ cvReshapeMatND( const CvArr* arr,
     int dims, coi = 0;
 
     if( !arr || !_header )
-        CV_Error( CV_StsNullPtr, "NULL pointer to array or destination header" );
+        CV_Error( cv::Error::StsNullPtr, "NULL pointer to array or destination header" );
 
     if( new_cn == 0 && new_dims == 0 )
-        CV_Error( CV_StsBadArg, "None of array parameters is changed: dummy call?" );
+        CV_Error( cv::Error::StsBadArg, "None of array parameters is changed: dummy call?" );
 
     dims = cvGetDims( arr );
 
@@ -2528,9 +2528,9 @@ cvReshapeMatND( const CvArr* arr,
     else
     {
         if( new_dims <= 0 || new_dims > CV_MAX_DIM )
-            CV_Error( CV_StsOutOfRange, "Non-positive or too large number of dimensions" );
+            CV_Error( cv::Error::StsOutOfRange, "Non-positive or too large number of dimensions" );
         if( !new_sizes )
-            CV_Error( CV_StsNullPtr, "New dimension sizes are not specified" );
+            CV_Error( cv::Error::StsNullPtr, "New dimension sizes are not specified" );
     }
 
     if( new_dims <= 2 )
@@ -2542,7 +2542,7 @@ cvReshapeMatND( const CvArr* arr,
         int  total_width, new_rows, cn;
 
         if( sizeof_header != sizeof(CvMat) && sizeof_header != sizeof(CvMatND) )
-            CV_Error( CV_StsBadArg, "The output header should be CvMat or CvMatND" );
+            CV_Error( cv::Error::StsBadArg, "The output header should be CvMat or CvMatND" );
 
         if( mat == (CvMat*)_header )
         {
@@ -2575,13 +2575,13 @@ cvReshapeMatND( const CvArr* arr,
             int total_size = total_width * mat->rows;
 
             if( !CV_IS_MAT_CONT( mat->type ))
-                CV_Error( CV_BadStep,
+                CV_Error( cv::Error::BadStep,
                 "The matrix is not continuous so the number of rows can not be changed" );
 
             total_width = total_size / new_rows;
 
             if( total_width * new_rows != total_size )
-                CV_Error( CV_StsBadArg, "The total number of matrix elements "
+                CV_Error( cv::Error::StsBadArg, "The total number of matrix elements "
                                         "is not divisible by the new number of rows" );
         }
 
@@ -2590,7 +2590,7 @@ cvReshapeMatND( const CvArr* arr,
 
         if( header.cols * new_cn != total_width ||
             (new_sizes && header.cols != new_sizes[1]) )
-            CV_Error( CV_StsBadArg, "The total matrix width is not "
+            CV_Error( cv::Error::StsBadArg, "The total matrix width is not "
                             "divisible by the new number of columns" );
 
         header.type = (mat->type & ~CV_MAT_TYPE_MASK) | CV_MAKETYPE(mat->type, new_cn);
@@ -2614,12 +2614,12 @@ cvReshapeMatND( const CvArr* arr,
         CvMatND* header = (CvMatND*)_header;
 
         if( sizeof_header != sizeof(CvMatND))
-            CV_Error( CV_StsBadSize, "The output header should be CvMatND" );
+            CV_Error( cv::Error::StsBadSize, "The output header should be CvMatND" );
 
         if( !new_sizes )
         {
             if( !CV_IS_MATND( arr ))
-                CV_Error( CV_StsBadArg, "The input array must be CvMatND" );
+                CV_Error( cv::Error::StsBadArg, "The input array must be CvMatND" );
 
             {
             CvMatND* mat = (CvMatND*)arr;
@@ -2628,7 +2628,7 @@ cvReshapeMatND( const CvArr* arr,
             int new_size = last_dim_size/new_cn;
 
             if( new_size*new_cn != last_dim_size )
-                CV_Error( CV_StsBadArg,
+                CV_Error( cv::Error::StsBadArg,
                 "The last dimension full size is not divisible by new number of channels");
 
             if( mat != header )
@@ -2650,7 +2650,7 @@ cvReshapeMatND( const CvArr* arr,
             int step;
 
             if( new_cn != 0 )
-                CV_Error( CV_StsBadArg,
+                CV_Error( cv::Error::StsBadArg,
                 "Simultaneous change of shape and number of channels is not supported. "
                 "Do it by 2 separate calls" );
 
@@ -2661,7 +2661,7 @@ cvReshapeMatND( const CvArr* arr,
             }
 
             if( CV_IS_MAT_CONT( mat->type ))
-                CV_Error( CV_StsBadArg, "Non-continuous nD arrays are not supported" );
+                CV_Error( cv::Error::StsBadArg, "Non-continuous nD arrays are not supported" );
 
             size1 = mat->dim[0].size;
             for( i = 1; i < dims; i++ )
@@ -2671,13 +2671,13 @@ cvReshapeMatND( const CvArr* arr,
             for( i = 0; i < new_dims; i++ )
             {
                 if( new_sizes[i] <= 0 )
-                    CV_Error( CV_StsBadSize,
+                    CV_Error( cv::Error::StsBadSize,
                     "One of new dimension sizes is non-positive" );
                 size2 *= new_sizes[i];
             }
 
             if( size1 != size2 )
-                CV_Error( CV_StsBadSize,
+                CV_Error( cv::Error::StsBadSize,
                 "Number of elements in the original and reshaped array is different" );
 
             if( header != mat )
@@ -2701,7 +2701,7 @@ cvReshapeMatND( const CvArr* arr,
     }
 
     if( coi )
-        CV_Error( CV_BadCOI, "COI is not supported by this operation" );
+        CV_Error( cv::Error::BadCOI, "COI is not supported by this operation" );
 
     result = _header;
     return result;
@@ -2717,20 +2717,20 @@ cvReshape( const CvArr* array, CvMat* header,
     int total_width, new_width;
 
     if( !header )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( !CV_IS_MAT( mat ))
     {
         int coi = 0;
         mat = cvGetMat( mat, header, &coi, 1 );
         if( coi )
-            CV_Error( CV_BadCOI, "COI is not supported" );
+            CV_Error( cv::Error::BadCOI, "COI is not supported" );
     }
 
     if( new_cn == 0 )
         new_cn = CV_MAT_CN(mat->type);
     else if( (unsigned)(new_cn - 1) > 3 )
-        CV_Error( CV_BadNumChannels, "" );
+        CV_Error( cv::Error::BadNumChannels, "" );
 
     if( mat != header )
     {
@@ -2754,16 +2754,16 @@ cvReshape( const CvArr* array, CvMat* header,
     {
         int total_size = total_width * mat->rows;
         if( !CV_IS_MAT_CONT( mat->type ))
-            CV_Error( CV_BadStep,
+            CV_Error( cv::Error::BadStep,
             "The matrix is not continuous, thus its number of rows can not be changed" );
 
         if( (unsigned)new_rows > (unsigned)total_size )
-            CV_Error( CV_StsOutOfRange, "Bad new number of rows" );
+            CV_Error( cv::Error::StsOutOfRange, "Bad new number of rows" );
 
         total_width = total_size / new_rows;
 
         if( total_width * new_rows != total_size )
-            CV_Error( CV_StsBadArg, "The total number of matrix elements "
+            CV_Error( cv::Error::StsBadArg, "The total number of matrix elements "
                                     "is not divisible by the new number of rows" );
 
         header->rows = new_rows;
@@ -2773,7 +2773,7 @@ cvReshape( const CvArr* array, CvMat* header,
     new_width = total_width / new_cn;
 
     if( new_width * new_cn != total_width )
-        CV_Error( CV_BadNumChannels,
+        CV_Error( cv::Error::BadNumChannels,
         "The total width is not divisible by the new number of channels" );
 
     header->cols = new_width;
@@ -2792,17 +2792,17 @@ cvGetImage( const CvArr* array, IplImage* img )
     const IplImage* src = (const IplImage*)array;
 
     if( !img )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( !CV_IS_IMAGE_HDR(src) )
     {
         const CvMat* mat = (const CvMat*)src;
 
         if( !CV_IS_MAT_HDR(mat))
-            CV_Error( CV_StsBadFlag, "" );
+            CV_Error( cv::Error::StsBadFlag, "" );
 
         if( mat->data.ptr == 0 )
-            CV_Error( CV_StsNullPtr, "" );
+            CV_Error( cv::Error::StsNullPtr, "" );
 
         int depth = cvIplDepth(mat->type);
 
@@ -2942,7 +2942,7 @@ cvInitImageHeader( IplImage * image, CvSize size, int depth,
          depth != (int)IPL_DEPTH_16S && depth != (int)IPL_DEPTH_32S &&
          depth != (int)IPL_DEPTH_32F && depth != (int)IPL_DEPTH_64F) ||
          channels < 0 )
-        CV_Error( CV_BadDepth, "Unsupported format" );
+        CV_Error( cv::Error::BadDepth, "Unsupported format" );
     if( origin != CV_ORIGIN_BL && origin != CV_ORIGIN_TL )
         CV_Error( CV_BadOrigin, "Bad input origin" );
 
@@ -2969,7 +2969,7 @@ cvInitImageHeader( IplImage * image, CvSize size, int depth,
     const int64 imageSize_tmp = (int64)image->widthStep*(int64)image->height;
     image->imageSize = (int)imageSize_tmp;
     if( (int64)image->imageSize != imageSize_tmp )
-        CV_Error( CV_StsNoMem, "Overflow for imageSize" );
+        CV_Error( cv::Error::StsNoMem, "Overflow for imageSize" );
 
     return image;
 }
@@ -2979,7 +2979,7 @@ CV_IMPL void
 cvReleaseImageHeader( IplImage** image )
 {
     if( !image )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( *image )
     {
@@ -3003,7 +3003,7 @@ CV_IMPL void
 cvReleaseImage( IplImage ** image )
 {
     if( !image )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( *image )
     {
@@ -3077,7 +3077,7 @@ cvGetImageROI( const IplImage* img )
 {
     CvRect rect = {0, 0, 0, 0};
     if( !img )
-        CV_Error( CV_StsNullPtr, "Null pointer to image" );
+        CV_Error( cv::Error::StsNullPtr, "Null pointer to image" );
 
     if( img->roi )
         rect = cvRect( img->roi->xOffset, img->roi->yOffset,
@@ -3096,7 +3096,7 @@ cvSetImageCOI( IplImage* image, int coi )
         CV_Error( CV_HeaderIsNull, "" );
 
     if( (unsigned)coi > (unsigned)(image->nChannels) )
-        CV_Error( CV_BadCOI, "" );
+        CV_Error( cv::Error::BadCOI, "" );
 
     if( image->roi || coi != 0 )
     {
@@ -3128,7 +3128,7 @@ cvCloneImage( const IplImage* src )
     IplImage* dst = 0;
 
     if( !CV_IS_IMAGE_HDR( src ))
-        CV_Error( CV_StsBadArg, "Bad image header" );
+        CV_Error( cv::Error::StsBadArg, "Bad image header" );
 
     if( !CvIPL.cloneImage )
     {
@@ -3174,13 +3174,13 @@ cvCheckTermCriteria( CvTermCriteria criteria, double default_eps,
     crit.epsilon = (float)default_eps;
 
     if( (criteria.type & ~(CV_TERMCRIT_EPS | CV_TERMCRIT_ITER)) != 0 )
-        CV_Error( CV_StsBadArg,
+        CV_Error( cv::Error::StsBadArg,
                   "Unknown type of term criteria" );
 
     if( (criteria.type & CV_TERMCRIT_ITER) != 0 )
     {
         if( criteria.max_iter <= 0 )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
                   "Iterations flag is set and maximum number of iterations is <= 0" );
         crit.max_iter = criteria.max_iter;
     }
@@ -3188,13 +3188,13 @@ cvCheckTermCriteria( CvTermCriteria criteria, double default_eps,
     if( (criteria.type & CV_TERMCRIT_EPS) != 0 )
     {
         if( criteria.epsilon < 0 )
-            CV_Error( CV_StsBadArg, "Accuracy flag is set and epsilon is < 0" );
+            CV_Error( cv::Error::StsBadArg, "Accuracy flag is set and epsilon is < 0" );
 
         crit.epsilon = criteria.epsilon;
     }
 
     if( (criteria.type & (CV_TERMCRIT_EPS | CV_TERMCRIT_ITER)) == 0 )
-        CV_Error( CV_StsBadArg,
+        CV_Error( cv::Error::StsBadArg,
                   "Neither accuracy nor maximum iterations "
                   "number flags are set in criteria type" );
 
@@ -3221,7 +3221,7 @@ CV_IMPL void
 cvRelease( void** struct_ptr )
 {
     if( !struct_ptr )
-        CV_Error( CV_StsNullPtr, "NULL double pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL double pointer" );
 
     if( *struct_ptr )
     {
@@ -3230,7 +3230,7 @@ cvRelease( void** struct_ptr )
         else if( CV_IS_IMAGE(*struct_ptr))
             cvReleaseImage((IplImage**)struct_ptr);
         else
-            CV_Error( CV_StsError, "Unknown object type" );
+            CV_Error( cv::Error::StsError, "Unknown object type" );
     }
 }
 
@@ -3238,14 +3238,14 @@ void* cvClone( const void* struct_ptr )
 {
     void* ptr = 0;
     if( !struct_ptr )
-        CV_Error( CV_StsNullPtr, "NULL structure pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL structure pointer" );
 
     if( CV_IS_MAT(struct_ptr) )
         ptr = cvCloneMat((const CvMat*)struct_ptr);
     else if( CV_IS_IMAGE(struct_ptr))
         ptr = cvCloneImage((const IplImage*)struct_ptr);
     else
-        CV_Error( CV_StsError, "Unknown object type" );
+        CV_Error( cv::Error::StsError, "Unknown object type" );
     return ptr;
 }
 
diff --git a/modules/core/src/async.cpp b/modules/core/src/async.cpp
index 78c0a1ee8116..3aeaaf73943c 100644
--- a/modules/core/src/async.cpp
+++ b/modules/core/src/async.cpp
@@ -3,7 +3,6 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include "precomp.hpp"
-//#undef CV_CXX11  // debug non C++11 mode
 #include "opencv2/core/async.hpp"
 #include "opencv2/core/detail/async_promise.hpp"
 
@@ -16,11 +15,9 @@
 
 #ifndef OPENCV_DISABLE_THREAD_SUPPORT
 
-#ifdef CV_CXX11
 #include <mutex>
 #include <condition_variable>
 #include <chrono>
-#endif
 
 namespace cv {
 
@@ -37,12 +34,8 @@ struct AsyncArray::Impl
     void releasePromise() CV_NOEXCEPT { CV_XADD(&refcount_promise, -1); if(1 == CV_XADD(&refcount, -1)) delete this; } \
     int refcount_promise;
 
-#ifdef CV_CXX11
     mutable std::mutex mtx;
     mutable std::condition_variable cond_var;
-#else
-    mutable cv::Mutex mtx;
-#endif
 
     mutable bool has_result; // Mat, UMat or exception
 
@@ -88,11 +81,7 @@ struct AsyncArray::Impl
             if (!wait_for(timeoutNs))
                 return false;
         }
-#ifdef CV_CXX11
         std::unique_lock<std::mutex> lock(mtx);
-#else
-        cv::AutoLock lock(mtx);
-#endif
         if (has_result)
         {
             if (!result_mat.empty())
@@ -145,7 +134,6 @@ struct AsyncArray::Impl
         if (timeoutNs == 0)
             return has_result;
         CV_LOG_INFO(NULL, "Waiting for async result ...");
-#ifdef CV_CXX11
         std::unique_lock<std::mutex> lock(mtx);
         const auto cond_pred = [&]{ return has_result == true; };
         if (timeoutNs > 0)
@@ -156,9 +144,6 @@ struct AsyncArray::Impl
             CV_Assert(has_result);
             return true;
         }
-#else
-        CV_Error(Error::StsNotImplemented, "OpenCV has been built without async waiting support (C++11 is required)");
-#endif
     }
 
     AsyncArray getArrayResult()
@@ -175,11 +160,7 @@ struct AsyncArray::Impl
     {
         if (future_is_returned && refcount_future == 0)
             CV_Error(Error::StsError, "Associated AsyncArray has been destroyed");
-#ifdef CV_CXX11
         std::unique_lock<std::mutex> lock(mtx);
-#else
-        cv::AutoLock lock(mtx);
-#endif
         CV_Assert(!has_result);
         int k = value.kind();
         if (k == _InputArray::UMAT)
@@ -193,9 +174,7 @@ struct AsyncArray::Impl
             value.copyTo(*result_mat.get());
         }
         has_result = true;
-#ifdef CV_CXX11
         cond_var.notify_all();
-#endif
     }
 
 #if CV__EXCEPTION_PTR
@@ -203,18 +182,12 @@ struct AsyncArray::Impl
     {
         if (future_is_returned && refcount_future == 0)
             CV_Error(Error::StsError, "Associated AsyncArray has been destroyed");
-#ifdef CV_CXX11
         std::unique_lock<std::mutex> lock(mtx);
-#else
-        cv::AutoLock lock(mtx);
-#endif
         CV_Assert(!has_result);
         has_exception = true;
         exception = e;
         has_result = true;
-#ifdef CV_CXX11
         cond_var.notify_all();
-#endif
     }
 #endif
 
@@ -222,18 +195,12 @@ struct AsyncArray::Impl
     {
         if (future_is_returned && refcount_future == 0)
             CV_Error(Error::StsError, "Associated AsyncArray has been destroyed");
-#ifdef CV_CXX11
         std::unique_lock<std::mutex> lock(mtx);
-#else
-        cv::AutoLock lock(mtx);
-#endif
         CV_Assert(!has_result);
         has_exception = true;
         cv_exception = e;
         has_result = true;
-#ifdef CV_CXX11
         cond_var.notify_all();
-#endif
     }
 };
 
diff --git a/modules/core/src/batch_distance.cpp b/modules/core/src/batch_distance.cpp
index 1ce2edb7690d..4210c672b9d4 100644
--- a/modules/core/src/batch_distance.cpp
+++ b/modules/core/src/batch_distance.cpp
@@ -377,7 +377,7 @@ void cv::batchDistance( InputArray _src1, InputArray _src2,
     }
 
     if( func == 0 )
-        CV_Error_(CV_StsUnsupportedFormat,
+        CV_Error_(cv::Error::StsUnsupportedFormat,
                   ("The combination of type=%d, dtype=%d and normType=%d is not supported",
                    type, dtype, normType));
 
diff --git a/modules/core/src/channels.cpp b/modules/core/src/channels.cpp
index 6ceed44a28e1..3ee7088a4f2c 100644
--- a/modules/core/src/channels.cpp
+++ b/modules/core/src/channels.cpp
@@ -46,44 +46,44 @@ mixChannels_( const T** src, const int* sdelta,
 }
 
 
-static void mixChannels8u( const uchar** src, const int* sdelta,
-                           uchar** dst, const int* ddelta,
+static void mixChannels8u( const void** src, const int* sdelta,
+                           void** dst, const int* ddelta,
                            int len, int npairs )
 {
-    mixChannels_(src, sdelta, dst, ddelta, len, npairs);
+    mixChannels_((const uchar**)src, sdelta, (uchar**)dst, ddelta, len, npairs);
 }
 
-static void mixChannels16u( const ushort** src, const int* sdelta,
-                            ushort** dst, const int* ddelta,
+static void mixChannels16u( const void** src, const int* sdelta,
+                            void** dst, const int* ddelta,
                             int len, int npairs )
 {
-    mixChannels_(src, sdelta, dst, ddelta, len, npairs);
+    mixChannels_((const ushort**)src, sdelta, (ushort**)dst, ddelta, len, npairs);
 }
 
-static void mixChannels32s( const int** src, const int* sdelta,
-                            int** dst, const int* ddelta,
+static void mixChannels32s( const void** src, const int* sdelta,
+                            void** dst, const int* ddelta,
                             int len, int npairs )
 {
-    mixChannels_(src, sdelta, dst, ddelta, len, npairs);
+    mixChannels_((const int**)src, sdelta, (int**)dst, ddelta, len, npairs);
 }
 
-static void mixChannels64s( const int64** src, const int* sdelta,
-                            int64** dst, const int* ddelta,
+static void mixChannels64s( const void** src, const int* sdelta,
+                            void** dst, const int* ddelta,
                             int len, int npairs )
 {
-    mixChannels_(src, sdelta, dst, ddelta, len, npairs);
+    mixChannels_((const int64**)src, sdelta, (int64**)dst, ddelta, len, npairs);
 }
 
-typedef void (*MixChannelsFunc)( const uchar** src, const int* sdelta,
-        uchar** dst, const int* ddelta, int len, int npairs );
+typedef void (*MixChannelsFunc)( const void** src, const int* sdelta,
+        void** dst, const int* ddelta, int len, int npairs );
 
 static MixChannelsFunc getMixchFunc(int depth)
 {
-    static MixChannelsFunc mixchTab[] =
+    static MixChannelsFunc mixchTab[CV_DEPTH_MAX] =
     {
-        (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels16u,
-        (MixChannelsFunc)mixChannels16u, (MixChannelsFunc)mixChannels32s, (MixChannelsFunc)mixChannels32s,
-        (MixChannelsFunc)mixChannels64s, 0
+        mixChannels8u, mixChannels8u, mixChannels16u,
+        mixChannels16u, mixChannels32s, mixChannels32s,
+        mixChannels64s, 0
     };
 
     return mixchTab[depth];
@@ -146,6 +146,7 @@ void cv::mixChannels( const Mat* src, size_t nsrcs, Mat* dst, size_t ndsts, cons
     NAryMatIterator it(arrays, ptrs, (int)(nsrcs + ndsts));
     int total = (int)it.size, blocksize = std::min(total, (int)((BLOCK_SIZE + esz1-1)/esz1));
     MixChannelsFunc func = getMixchFunc(depth);
+    CV_Assert(func);
 
     for( i = 0; i < it.nplanes; i++, ++it )
     {
@@ -158,7 +159,7 @@ void cv::mixChannels( const Mat* src, size_t nsrcs, Mat* dst, size_t ndsts, cons
         for( int t = 0; t < total; t += blocksize )
         {
             int bsz = std::min(total - t, blocksize);
-            func( srcs, sdelta, dsts, ddelta, bsz, (int)npairs );
+            func( (const void**)srcs, sdelta, (void **)dsts, ddelta, bsz, (int)npairs );
 
             if( t + blocksize < total )
                 for( k = 0; k < npairs; k++ )
diff --git a/modules/core/src/check.cpp b/modules/core/src/check.cpp
index ffd9b302bf6e..2891f3a2e37a 100644
--- a/modules/core/src/check.cpp
+++ b/modules/core/src/check.cpp
@@ -4,6 +4,8 @@
 
 #include "precomp.hpp"
 
+#include <sstream>
+
 #include "opencv2/core/check.hpp"
 
 namespace cv {
diff --git a/modules/core/src/command_line_parser.cpp b/modules/core/src/command_line_parser.cpp
index af97232db604..a83cb3166db8 100644
--- a/modules/core/src/command_line_parser.cpp
+++ b/modules/core/src/command_line_parser.cpp
@@ -464,7 +464,7 @@ std::vector<String> CommandLineParser::Impl::split_range_string(const String& _s
         {
             if (begin == true)
             {
-                throw cv::Exception(CV_StsParseError,
+                throw cv::Exception(cv::Error::StsParseError,
                          String("error in split_range_string(")
                          + str
                          + String(", ")
@@ -484,7 +484,7 @@ std::vector<String> CommandLineParser::Impl::split_range_string(const String& _s
         {
             if (begin == false)
             {
-                throw cv::Exception(CV_StsParseError,
+                throw cv::Exception(cv::Error::StsParseError,
                          String("error in split_range_string(")
                          + str
                          + String(", ")
@@ -508,7 +508,7 @@ std::vector<String> CommandLineParser::Impl::split_range_string(const String& _s
 
     if (begin == true)
     {
-        throw cv::Exception(CV_StsParseError,
+        throw cv::Exception(cv::Error::StsParseError,
                  String("error in split_range_string(")
                  + str
                  + String(", ")
diff --git a/modules/core/src/convert.dispatch.cpp b/modules/core/src/convert.dispatch.cpp
index 345b4624cb14..2b4035285fc8 100644
--- a/modules/core/src/convert.dispatch.cpp
+++ b/modules/core/src/convert.dispatch.cpp
@@ -11,13 +11,13 @@
 namespace cv {
 
 namespace hal {
-void cvt16f32f(const float16_t* src, float* dst, int len)
+void cvt16f32f(const hfloat* src, float* dst, int len)
 {
     CV_INSTRUMENT_REGION();
     CV_CPU_DISPATCH(cvt16f32f, (src, dst, len),
         CV_CPU_DISPATCH_MODES_ALL);
 }
-void cvt32f16f(const float* src, float16_t* dst, int len)
+void cvt32f16f(const float* src, hfloat* dst, int len)
 {
     CV_INSTRUMENT_REGION();
     CV_CPU_DISPATCH(cvt32f16f, (src, dst, len),
@@ -169,52 +169,130 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
     size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
     return k.run(2, globalsize, NULL, false);
 }
+
+static bool ocl_convertTo(InputArray src_, OutputArray dst_, int ddepth, bool noScale, double alpha, double beta)
+{
+    CV_INSTRUMENT_REGION();
+
+    CV_Assert(ddepth >= 0);
+
+    int stype = src_.type();
+    int sdepth = CV_MAT_DEPTH(stype);
+    int cn = CV_MAT_CN(stype);
+
+    int dtype = CV_MAKETYPE(ddepth, cn);
+
+    int wdepth = (sdepth == CV_64F) ? CV_64F : CV_32F;
+
+    bool needDouble = sdepth == CV_64F || ddepth == CV_64F;
+    bool doubleCheck = true;
+    if (needDouble)
+    {
+        doubleCheck = ocl::Device::getDefault().hasFP64();
+    }
+    bool halfCheck = true;
+    bool needHalf = sdepth == CV_16F || ddepth == CV_16F;
+    if (needHalf)
+    {
+        halfCheck = ocl::Device::getDefault().hasFP16();
+    }
+
+    if (!doubleCheck)
+        return false;
+    if (!halfCheck)
+        return false;
+
+    const int rowsPerWI = 4;
+
+    char cvt[2][50];
+    ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
+                  format("-D srcT=%s -D WT=%s -D dstT=%s -D convertToWT=%s -D convertToDT=%s -D rowsPerWI=%d%s%s%s",
+                         ocl::typeToStr(sdepth), ocl::typeToStr(wdepth), ocl::typeToStr(ddepth),
+                         ocl::convertTypeStr(sdepth, wdepth, 1, cvt[0], sizeof(cvt[0])),
+                         ocl::convertTypeStr(wdepth, ddepth, 1, cvt[1], sizeof(cvt[1])),
+                         rowsPerWI,
+                         needDouble ? " -D DOUBLE_SUPPORT" : "",
+                         needHalf ? " -D HALF_SUPPORT" : "",
+                         noScale ? " -D NO_SCALE" : ""
+                  )
+    );
+
+    if (k.empty())
+        return false;
+
+    UMat src = src_.getUMat();
+    dst_.createSameSize(src_, dtype);
+    UMat dst = dst_.getUMat();
+
+    float alphaf = (float)alpha, betaf = (float)beta;
+
+    if (noScale)
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn));
+    else if (wdepth == CV_32F)
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alphaf, betaf);
+    else
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alpha, beta);
+
+    size_t globalsize[2] = {
+        (size_t)dst.cols * cn,
+        divUp((size_t)dst.rows, rowsPerWI)
+    };
+    if (!k.run(2, globalsize, NULL, false))
+        return false;
+
+    CV_IMPL_ADD(CV_IMPL_OCL);
+    return true;
+}
 #endif
 
-void Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
+void Mat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
 {
     CV_INSTRUMENT_REGION();
 
-    if( empty() )
+    if (empty())
     {
-        _dst.release();
+        dst.release();
         return;
     }
 
-    bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON;
+    int stype = type();
+    int sdepth = CV_MAT_DEPTH(stype);
 
-    if( _type < 0 )
-        _type = _dst.fixedType() ? _dst.type() : type();
+    int ddepth = sdepth;
+    if (type_ >= 0)
+        ddepth = CV_MAT_DEPTH(type_);
     else
-        _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels());
+        ddepth = dst.fixedType() ? dst.depth() : sdepth;
 
-    int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type);
-    if( sdepth == ddepth && noScale )
+    bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
+    if (sdepth == ddepth && noScale)
     {
-        copyTo(_dst);
+        copyTo(dst);
         return;
     }
 
+    CV_OCL_RUN(dims <= 2 && dst.isUMat(),
+               ocl_convertTo(*this, dst, ddepth, noScale, alpha, beta))
+
+    int cn = channels();
+    int dtype = CV_MAKETYPE(ddepth, cn);
+
     Mat src = *this;
-    if( dims <= 2 )
-        _dst.create( size(), _type );
-    else
-        _dst.create( dims, size, _type );
-    Mat dst = _dst.getMat();
+    dst.create(dims, size, dtype);
+    Mat dstMat = dst.getMat();
 
     BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth);
     double scale[] = {alpha, beta};
-    int cn = channels();
     CV_Assert( func != 0 );
 
     if( dims <= 2 )
     {
-        Size sz = getContinuousSize2D(src, dst, cn);
-        func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale );
+        Size sz = getContinuousSize2D(src, dstMat, cn);
+        func(src.data, src.step, 0, 0, dstMat.data, dstMat.step, sz, scale);
     }
     else
     {
-        const Mat* arrays[] = {&src, &dst, 0};
+        const Mat* arrays[] = {&src, &dstMat, 0};
         uchar* ptrs[2] = {};
         NAryMatIterator it(arrays, ptrs);
         Size sz((int)(it.size*cn), 1);
@@ -224,6 +302,44 @@ void Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) cons
     }
 }
 
+void UMat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
+{
+    CV_INSTRUMENT_REGION();
+
+    if (empty())
+    {
+        dst.release();
+        return;
+    }
+
+#ifdef HAVE_OPENCL
+    int stype = type();
+    int sdepth = CV_MAT_DEPTH(stype);
+
+    int ddepth = sdepth;
+    if (type_ >= 0)
+        ddepth = CV_MAT_DEPTH(type_);
+    else
+        ddepth = dst.fixedType() ? dst.depth() : sdepth;
+
+    bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
+    if (sdepth == ddepth && noScale)
+    {
+        copyTo(dst);
+        return;
+    }
+
+    CV_OCL_RUN(dims <= 2,
+               ocl_convertTo(*this, dst, ddepth, noScale, alpha, beta))
+#endif // HAVE_OPENCL
+
+    UMat src = *this;  // Fake reference to itself.
+                       // Resolves issue 8693 in case of src == dst.
+    Mat m = getMat(ACCESS_READ);
+    m.convertTo(dst, type_, alpha, beta);
+    (void)src;
+}
+
 //==================================================================================================
 
 void convertFp16(InputArray _src, OutputArray _dst)
diff --git a/modules/core/src/convert.hpp b/modules/core/src/convert.hpp
index 4b9ddbb413c9..177f236ee7f2 100644
--- a/modules/core/src/convert.hpp
+++ b/modules/core/src/convert.hpp
@@ -11,7 +11,7 @@
 namespace cv
 {
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 static inline void vx_load_as(const uchar* ptr, v_float32& a)
 { a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(ptr))); }
@@ -31,7 +31,7 @@ static inline void vx_load_as(const int* ptr, v_float32& a)
 static inline void vx_load_as(const float* ptr, v_float32& a)
 { a = vx_load(ptr); }
 
-static inline void vx_load_as(const float16_t* ptr, v_float32& a)
+static inline void vx_load_as(const hfloat* ptr, v_float32& a)
 { a = vx_load_expand(ptr); }
 
 static inline void v_store_as(ushort* ptr, const v_float32& a)
@@ -46,7 +46,7 @@ static inline void v_store_as(int* ptr, const v_float32& a)
 static inline void v_store_as(float* ptr, const v_float32& a)
 { v_store(ptr, a); }
 
-static inline void v_store_as(float16_t* ptr, const v_float32& a)
+static inline void v_store_as(hfloat* ptr, const v_float32& a)
 { v_pack_store(ptr, a); }
 
 static inline void vx_load_pair_as(const uchar* ptr, v_uint16& a, v_uint16& b)
@@ -62,7 +62,7 @@ static inline void vx_load_pair_as(const schar* ptr, v_uint16& a, v_uint16& b)
 }
 
 static inline void vx_load_pair_as(const ushort* ptr, v_uint16& a, v_uint16& b)
-{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); }
+{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_uint16>::vlanes()); }
 
 static inline void vx_load_pair_as(const uchar* ptr, v_int16& a, v_int16& b)
 {
@@ -76,7 +76,7 @@ static inline void vx_load_pair_as(const schar* ptr, v_int16& a, v_int16& b)
 { v_expand(vx_load(ptr), a, b); }
 
 static inline void vx_load_pair_as(const short* ptr, v_int16& a, v_int16& b)
-{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); }
+{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_uint16>::vlanes()); }
 
 static inline void vx_load_pair_as(const uchar* ptr, v_int32& a, v_int32& b)
 {
@@ -105,7 +105,7 @@ static inline void vx_load_pair_as(const short* ptr, v_int32& a, v_int32& b)
 static inline void vx_load_pair_as(const int* ptr, v_int32& a, v_int32& b)
 {
     a = vx_load(ptr);
-    b = vx_load(ptr + v_int32::nlanes);
+    b = vx_load(ptr + VTraits<v_int32>::vlanes());
 }
 
 static inline void vx_load_pair_as(const uchar* ptr, v_float32& a, v_float32& b)
@@ -142,18 +142,18 @@ static inline void vx_load_pair_as(const short* ptr, v_float32& a, v_float32& b)
 
 static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b)
 {
-    v_int32 ia = vx_load(ptr), ib = vx_load(ptr + v_int32::nlanes);
+    v_int32 ia = vx_load(ptr), ib = vx_load(ptr + VTraits<v_int32>::vlanes());
     a = v_cvt_f32(ia);
     b = v_cvt_f32(ib);
 }
 
 static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b)
-{ a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); }
+{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_float32>::vlanes()); }
 
-static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
+static inline void vx_load_pair_as(const hfloat* ptr, v_float32& a, v_float32& b)
 {
     a = vx_load_expand(ptr);
-    b = vx_load_expand(ptr + v_float32::nlanes);
+    b = vx_load_expand(ptr + VTraits<v_float32>::vlanes());
 }
 
 static inline void v_store_pair_as(uchar* ptr, const v_uint16& a, const v_uint16& b)
@@ -169,7 +169,7 @@ static inline void v_store_pair_as(schar* ptr, const v_uint16& a, const v_uint16
 }
 
 static inline void v_store_pair_as(ushort* ptr, const v_uint16& a, const v_uint16& b)
-{ v_store(ptr, a); v_store(ptr + v_uint16::nlanes, b); }
+{ v_store(ptr, a); v_store(ptr + VTraits<v_uint16>::vlanes(), b); }
 
 static inline void v_store_pair_as(uchar* ptr, const v_int16& a, const v_int16& b)
 { v_store(ptr, v_pack_u(a, b)); }
@@ -178,7 +178,7 @@ static inline void v_store_pair_as(schar* ptr, const v_int16& a, const v_int16&
 { v_store(ptr, v_pack(a, b)); }
 
 static inline void v_store_pair_as(short* ptr, const v_int16& a, const v_int16& b)
-{ v_store(ptr, a); v_store(ptr + v_int16::nlanes, b); }
+{ v_store(ptr, a); v_store(ptr + VTraits<v_int16>::vlanes(), b); }
 
 static inline void v_store_pair_as(uchar* ptr, const v_int32& a, const v_int32& b)
 { v_pack_u_store(ptr, v_pack(a, b)); }
@@ -195,7 +195,7 @@ static inline void v_store_pair_as(short* ptr, const v_int32& a, const v_int32&
 static inline void v_store_pair_as(int* ptr, const v_int32& a, const v_int32& b)
 {
     v_store(ptr, a);
-    v_store(ptr + v_int32::nlanes, b);
+    v_store(ptr + VTraits<v_int32>::vlanes(), b);
 }
 
 static inline void v_store_pair_as(uchar* ptr, const v_float32& a, const v_float32& b)
@@ -214,24 +214,24 @@ static inline void v_store_pair_as(int* ptr, const v_float32& a, const v_float32
 {
     v_int32 ia = v_round(a), ib = v_round(b);
     v_store(ptr, ia);
-    v_store(ptr + v_int32::nlanes, ib);
+    v_store(ptr + VTraits<v_int32>::vlanes(), ib);
 }
 
 static inline void v_store_pair_as(float* ptr, const v_float32& a, const v_float32& b)
-{ v_store(ptr, a); v_store(ptr + v_float32::nlanes, b); }
+{ v_store(ptr, a); v_store(ptr + VTraits<v_float32>::vlanes(), b); }
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 
 static inline void vx_load_as(const double* ptr, v_float32& a)
 {
-    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
+    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
     a = v_cvt_f32(v0, v1);
 }
 
 static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
 {
-    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
-    v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3);
+    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
+    v_float64 v2 = vx_load(ptr + VTraits<v_float64>::vlanes()*2), v3 = vx_load(ptr + VTraits<v_float64>::vlanes()*3);
     v_int32 iv0 = v_round(v0), iv1 = v_round(v1);
     v_int32 iv2 = v_round(v2), iv3 = v_round(v3);
     a = v_combine_low(iv0, iv1);
@@ -240,8 +240,8 @@ static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
 
 static inline void vx_load_pair_as(const double* ptr, v_float32& a, v_float32& b)
 {
-    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
-    v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3);
+    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
+    v_float64 v2 = vx_load(ptr + VTraits<v_float64>::vlanes()*2), v3 = vx_load(ptr + VTraits<v_float64>::vlanes()*3);
     a = v_cvt_f32(v0, v1);
     b = v_cvt_f32(v2, v3);
 }
@@ -291,10 +291,10 @@ static inline void vx_load_pair_as(const float* ptr, v_float64& a, v_float64& b)
 static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b)
 {
     a = vx_load(ptr);
-    b = vx_load(ptr + v_float64::nlanes);
+    b = vx_load(ptr + VTraits<v_float64>::vlanes());
 }
 
-static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
+static inline void vx_load_pair_as(const hfloat* ptr, v_float64& a, v_float64& b)
 {
     v_float32 v0 = vx_load_expand(ptr);
     a = v_cvt_f64(v0);
@@ -305,7 +305,7 @@ static inline void v_store_as(double* ptr, const v_float32& a)
 {
     v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a);
     v_store(ptr, fa0);
-    v_store(ptr + v_float64::nlanes, fa1);
+    v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
 }
 
 static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32& b)
@@ -314,9 +314,9 @@ static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32&
     v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b);
 
     v_store(ptr, fa0);
-    v_store(ptr + v_float64::nlanes, fa1);
-    v_store(ptr + v_float64::nlanes*2, fb0);
-    v_store(ptr + v_float64::nlanes*3, fb1);
+    v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
+    v_store(ptr + VTraits<v_float64>::vlanes()*2, fb0);
+    v_store(ptr + VTraits<v_float64>::vlanes()*3, fb1);
 }
 
 static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_float32& b)
@@ -325,15 +325,15 @@ static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_floa
     v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b);
 
     v_store(ptr, fa0);
-    v_store(ptr + v_float64::nlanes, fa1);
-    v_store(ptr + v_float64::nlanes*2, fb0);
-    v_store(ptr + v_float64::nlanes*3, fb1);
+    v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
+    v_store(ptr + VTraits<v_float64>::vlanes()*2, fb0);
+    v_store(ptr + VTraits<v_float64>::vlanes()*3, fb1);
 }
 
 static inline void v_store_pair_as(double* ptr, const v_float64& a, const v_float64& b)
 {
     v_store(ptr, a);
-    v_store(ptr + v_float64::nlanes, b);
+    v_store(ptr + VTraits<v_float64>::vlanes(), b);
 }
 
 static inline void v_store_pair_as(int* ptr, const v_float64& a, const v_float64& b)
@@ -348,7 +348,7 @@ static inline void v_store_pair_as(float* ptr, const v_float64& a, const v_float
     v_store(ptr, v);
 }
 
-static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b)
+static inline void v_store_pair_as(hfloat* ptr, const v_float64& a, const v_float64& b)
 {
     v_float32 v = v_cvt_f32(a, b);
     v_pack_store(ptr, v);
@@ -358,8 +358,8 @@ static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_f
 
 static inline void vx_load_as(const double* ptr, v_float32& a)
 {
-    const int VECSZ = v_float32::nlanes;
-    float buf[VECSZ*2];
+    const int VECSZ = VTraits<v_float32>::vlanes();
+    float buf[VTraits<v_float32>::max_nlanes*2];
 
     for( int i = 0; i < VECSZ; i++ )
         buf[i] = saturate_cast<float>(ptr[i]);
@@ -369,19 +369,19 @@ static inline void vx_load_as(const double* ptr, v_float32& a)
 template<typename _Tdvec>
 static inline void vx_load_pair_as(const double* ptr, _Tdvec& a, _Tdvec& b)
 {
-    const int VECSZ = _Tdvec::nlanes;
-    typename _Tdvec::lane_type buf[VECSZ*2];
+    const int VECSZ = VTraits<_Tdvec>::vlanes();
+    typename VTraits<_Tdvec>::lane_type buf[VTraits<_Tdvec>::max_nlanes*2];
 
     for( int i = 0; i < VECSZ*2; i++ )
-        buf[i] = saturate_cast<typename _Tdvec::lane_type>(ptr[i]);
+        buf[i] = saturate_cast<typename VTraits<_Tdvec>::lane_type>(ptr[i]);
     a = vx_load(buf);
     b = vx_load(buf + VECSZ);
 }
 
 static inline void v_store_as(double* ptr, const v_float32& a)
 {
-    const int VECSZ = v_float32::nlanes;
-    float buf[VECSZ];
+    const int VECSZ = VTraits<v_float32>::vlanes();
+    float buf[VTraits<v_float32>::max_nlanes];
 
     v_store(buf, a);
     for( int i = 0; i < VECSZ; i++ )
@@ -391,8 +391,8 @@ static inline void v_store_as(double* ptr, const v_float32& a)
 template<typename _Tsvec>
 static inline void v_store_pair_as(double* ptr, const _Tsvec& a, const _Tsvec& b)
 {
-    const int VECSZ = _Tsvec::nlanes;
-    typename _Tsvec::lane_type buf[VECSZ*2];
+    const int VECSZ = VTraits<_Tsvec>::vlanes();
+    typename VTraits<_Tsvec>::lane_type buf[VTraits<_Tsvec>::max_nlanes*2];
 
     v_store(buf, a); v_store(buf + VECSZ, b);
     for( int i = 0; i < VECSZ*2; i++ )
diff --git a/modules/core/src/convert.simd.hpp b/modules/core/src/convert.simd.hpp
index 5154041b6d45..d6c18a70b5be 100644
--- a/modules/core/src/convert.simd.hpp
+++ b/modules/core/src/convert.simd.hpp
@@ -14,8 +14,8 @@ namespace cv {
 namespace hal {
 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 
-void cvt16f32f(const float16_t* src, float* dst, int len);
-void cvt32f16f(const float* src, float16_t* dst, int len);
+void cvt16f32f(const hfloat* src, float* dst, int len);
+void cvt32f16f(const float* src, hfloat* dst, int len);
 void addRNGBias32f(float* arr, const float* scaleBiasPairs, int len);
 void addRNGBias64f(double* arr, const double* scaleBiasPairs, int len);
 
@@ -35,12 +35,12 @@ CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 
 BinaryFunc getConvertFunc(int sdepth, int ddepth);
 
-void cvt16f32f( const float16_t* src, float* dst, int len )
+void cvt16f32f( const hfloat* src, float* dst, int len )
 {
     CV_INSTRUMENT_REGION();
     int j = 0;
-#if CV_SIMD
-    const int VECSZ = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_float32>::vlanes();
     for( ; j < len; j += VECSZ )
     {
         if( j > len - VECSZ )
@@ -56,12 +56,12 @@ void cvt16f32f( const float16_t* src, float* dst, int len )
         dst[j] = (float)src[j];
 }
 
-void cvt32f16f( const float* src, float16_t* dst, int len )
+void cvt32f16f( const float* src, hfloat* dst, int len )
 {
     CV_INSTRUMENT_REGION();
     int j = 0;
-#if CV_SIMD
-    const int VECSZ = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_float32>::vlanes();
     for( ; j < len; j += VECSZ )
     {
         if( j > len - VECSZ )
@@ -74,7 +74,7 @@ void cvt32f16f( const float* src, float16_t* dst, int len )
     }
 #endif
     for( ; j < len; j++ )
-        dst[j] = float16_t(src[j]);
+        dst[j] = hfloat(src[j]);
 }
 
 void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len )
@@ -108,8 +108,8 @@ cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD
-        const int VECSZ = _Twvec::nlanes*2;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int VECSZ = VTraits<_Twvec>::vlanes()*2;
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
@@ -139,8 +139,8 @@ cvt1_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD
-        const int VECSZ = _Twvec::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int VECSZ = VTraits<_Twvec>::vlanes();
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
@@ -188,7 +188,7 @@ DEF_CVT_FUNC(8u16s, cvt_,  uchar, short,    v_int16)
 DEF_CVT_FUNC(8u32s, cvt_,  uchar, int,      v_int32)
 DEF_CVT_FUNC(8u32f, cvt_,  uchar, float,    v_float32)
 DEF_CVT_FUNC(8u64f, cvt_,  uchar, double,   v_int32)
-DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32)
+DEF_CVT_FUNC(8u16f, cvt1_, uchar, hfloat, v_float32)
 
 ////////////////////// 8s -> ... ////////////////////////
 
@@ -198,7 +198,7 @@ DEF_CVT_FUNC(8s16s, cvt_,  schar, short,    v_int16)
 DEF_CVT_FUNC(8s32s, cvt_,  schar, int,      v_int32)
 DEF_CVT_FUNC(8s32f, cvt_,  schar, float,    v_float32)
 DEF_CVT_FUNC(8s64f, cvt_,  schar, double,   v_int32)
-DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32)
+DEF_CVT_FUNC(8s16f, cvt1_, schar, hfloat, v_float32)
 
 ////////////////////// 16u -> ... ////////////////////////
 
@@ -208,7 +208,7 @@ DEF_CVT_FUNC(16u16s, cvt_, ushort, short,  v_int32)
 DEF_CVT_FUNC(16u32s, cvt_, ushort, int,    v_int32)
 DEF_CVT_FUNC(16u32f, cvt_, ushort, float,  v_float32)
 DEF_CVT_FUNC(16u64f, cvt_, ushort, double, v_int32)
-DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32)
+DEF_CVT_FUNC(16u16f, cvt1_,ushort, hfloat, v_float32)
 
 ////////////////////// 16s -> ... ////////////////////////
 
@@ -218,7 +218,7 @@ DEF_CVT_FUNC(16s16u, cvt_, short, ushort, v_int32)
 DEF_CVT_FUNC(16s32s, cvt_, short, int,    v_int32)
 DEF_CVT_FUNC(16s32f, cvt_, short, float,  v_float32)
 DEF_CVT_FUNC(16s64f, cvt_, short, double, v_int32)
-DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32)
+DEF_CVT_FUNC(16s16f, cvt1_,short, hfloat, v_float32)
 
 ////////////////////// 32s -> ... ////////////////////////
 
@@ -228,7 +228,7 @@ DEF_CVT_FUNC(32s16u, cvt_, int, ushort, v_int32)
 DEF_CVT_FUNC(32s16s, cvt_, int, short,  v_int32)
 DEF_CVT_FUNC(32s32f, cvt_, int, float,  v_float32)
 DEF_CVT_FUNC(32s64f, cvt_, int, double, v_int32)
-DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32)
+DEF_CVT_FUNC(32s16f, cvt1_,int, hfloat, v_float32)
 
 ////////////////////// 32f -> ... ////////////////////////
 
@@ -238,7 +238,7 @@ DEF_CVT_FUNC(32f16u, cvt_, float, ushort, v_float32)
 DEF_CVT_FUNC(32f16s, cvt_, float, short,  v_float32)
 DEF_CVT_FUNC(32f32s, cvt_, float, int,    v_float32)
 DEF_CVT_FUNC(32f64f, cvt_, float, double, v_float32)
-DEF_CVT_FUNC(32f16f, cvt1_,float, float16_t, v_float32)
+DEF_CVT_FUNC(32f16f, cvt1_,float, hfloat, v_float32)
 
 ////////////////////// 64f -> ... ////////////////////////
 
@@ -248,17 +248,17 @@ DEF_CVT_FUNC(64f16u, cvt_, double, ushort, v_int32)
 DEF_CVT_FUNC(64f16s, cvt_, double, short,  v_int32)
 DEF_CVT_FUNC(64f32s, cvt_, double, int,    v_int32)
 DEF_CVT_FUNC(64f32f, cvt_, double, float,  v_float32)
-DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32)
+DEF_CVT_FUNC(64f16f, cvt1_,double, hfloat, v_float32)
 
 ////////////////////// 16f -> ... ////////////////////////
 
-DEF_CVT_FUNC(16f8u,  cvt_,  float16_t, uchar,  v_float32)
-DEF_CVT_FUNC(16f8s,  cvt_,  float16_t, schar,  v_float32)
-DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32)
-DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short,  v_float32)
-DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int,    v_float32)
-DEF_CVT_FUNC(16f32f, cvt1_, float16_t, float,  v_float32)
-DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32)
+DEF_CVT_FUNC(16f8u,  cvt_,  hfloat, uchar,  v_float32)
+DEF_CVT_FUNC(16f8s,  cvt_,  hfloat, schar,  v_float32)
+DEF_CVT_FUNC(16f16u, cvt1_, hfloat, ushort, v_float32)
+DEF_CVT_FUNC(16f16s, cvt1_, hfloat, short,  v_float32)
+DEF_CVT_FUNC(16f32s, cvt1_, hfloat, int,    v_float32)
+DEF_CVT_FUNC(16f32f, cvt1_, hfloat, float,  v_float32)
+DEF_CVT_FUNC(16f64f, cvt1_, hfloat, double, v_float32)
 
 ///////////// "conversion" w/o conversion ///////////////
 
@@ -372,7 +372,7 @@ DEF_CPY_FUNC(64s,    int64)
 
 BinaryFunc getConvertFunc(int sdepth, int ddepth)
 {
-    static BinaryFunc cvtTab[][8] =
+    static BinaryFunc cvtTab[CV_DEPTH_MAX][CV_DEPTH_MAX] =
     {
         {
             (cvt8u), (cvt8s8u), (cvt16u8u),
diff --git a/modules/core/src/convert_scale.simd.hpp b/modules/core/src/convert_scale.simd.hpp
index 2c6d55462be1..b5322ed88834 100644
--- a/modules/core/src/convert_scale.simd.hpp
+++ b/modules/core/src/convert_scale.simd.hpp
@@ -22,9 +22,9 @@ template<typename _Ts, typename _Td> inline void
 cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
             Size size, float a, float b )
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
-    const int VECSZ = v_float32::nlanes*2;
+    const int VECSZ = VTraits<v_float32>::vlanes()*2;
 #endif
     sstep /= sizeof(src[0]);
     dstep /= sizeof(dst[0]);
@@ -32,7 +32,7 @@ cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
@@ -58,9 +58,9 @@ template<typename _Ts, typename _Td> inline void
 cvtabs1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
              Size size, float a, float b )
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
-    const int VECSZ = v_float32::nlanes*2;
+    const int VECSZ = VTraits<v_float32>::vlanes()*2;
 #endif
     sstep /= sizeof(src[0]);
     dstep /= sizeof(dst[0]);
@@ -68,7 +68,7 @@ cvtabs1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
@@ -92,9 +92,9 @@ template<typename _Ts, typename _Td> inline void
 cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
          Size size, float a, float b )
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
-    const int VECSZ = v_float32::nlanes*2;
+    const int VECSZ = VTraits<v_float32>::vlanes()*2;
 #endif
     sstep /= sizeof(src[0]);
     dstep /= sizeof(dst[0]);
@@ -102,7 +102,7 @@ cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
@@ -128,9 +128,9 @@ template<typename _Ts, typename _Td> inline void
 cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
           Size size, float a, float b )
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
-    const int VECSZ = v_float32::nlanes;
+    const int VECSZ = VTraits<v_float32>::vlanes();
 #endif
     sstep /= sizeof(src[0]);
     dstep /= sizeof(dst[0]);
@@ -138,7 +138,7 @@ cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
@@ -163,9 +163,9 @@ template<typename _Ts, typename _Td> inline void
 cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
          Size size, double a, double b )
 {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     v_float64 va = vx_setall_f64(a), vb = vx_setall_f64(b);
-    const int VECSZ = v_float64::nlanes*2;
+    const int VECSZ = VTraits<v_float64>::vlanes()*2;
 #endif
     sstep /= sizeof(src[0]);
     dstep /= sizeof(dst[0]);
@@ -173,7 +173,7 @@ cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
@@ -232,7 +232,7 @@ DEF_CVT_SCALE_FUNC(16s8u,  cvt_32f, short,  uchar, float)
 DEF_CVT_SCALE_FUNC(32s8u,  cvt_32f, int,    uchar, float)
 DEF_CVT_SCALE_FUNC(32f8u,  cvt_32f, float,  uchar, float)
 DEF_CVT_SCALE_FUNC(64f8u,  cvt_32f, double, uchar, float)
-DEF_CVT_SCALE_FUNC(16f8u,  cvt_32f, float16_t, uchar, float)
+DEF_CVT_SCALE_FUNC(16f8u,  cvt_32f, hfloat, uchar, float)
 
 DEF_CVT_SCALE_FUNC(8u8s,   cvt_32f, uchar,  schar, float)
 DEF_CVT_SCALE_FUNC(8s,     cvt_32f, schar,  schar, float)
@@ -241,7 +241,7 @@ DEF_CVT_SCALE_FUNC(16s8s,  cvt_32f, short,  schar, float)
 DEF_CVT_SCALE_FUNC(32s8s,  cvt_32f, int,    schar, float)
 DEF_CVT_SCALE_FUNC(32f8s,  cvt_32f, float,  schar, float)
 DEF_CVT_SCALE_FUNC(64f8s,  cvt_32f, double, schar, float)
-DEF_CVT_SCALE_FUNC(16f8s,  cvt_32f, float16_t, schar, float)
+DEF_CVT_SCALE_FUNC(16f8s,  cvt_32f, hfloat, schar, float)
 
 DEF_CVT_SCALE_FUNC(8u16u,  cvt_32f, uchar,  ushort, float)
 DEF_CVT_SCALE_FUNC(8s16u,  cvt_32f, schar,  ushort, float)
@@ -250,7 +250,7 @@ DEF_CVT_SCALE_FUNC(16s16u, cvt_32f, short,  ushort, float)
 DEF_CVT_SCALE_FUNC(32s16u, cvt_32f, int,    ushort, float)
 DEF_CVT_SCALE_FUNC(32f16u, cvt_32f, float,  ushort, float)
 DEF_CVT_SCALE_FUNC(64f16u, cvt_32f, double, ushort, float)
-DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float)
+DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, hfloat, ushort, float)
 
 DEF_CVT_SCALE_FUNC(8u16s,  cvt_32f, uchar,  short, float)
 DEF_CVT_SCALE_FUNC(8s16s,  cvt_32f, schar,  short, float)
@@ -259,7 +259,7 @@ DEF_CVT_SCALE_FUNC(16s,    cvt_32f, short,  short, float)
 DEF_CVT_SCALE_FUNC(32s16s, cvt_32f, int,    short, float)
 DEF_CVT_SCALE_FUNC(32f16s, cvt_32f, float,  short, float)
 DEF_CVT_SCALE_FUNC(64f16s, cvt_32f, double, short, float)
-DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float)
+DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, hfloat, short, float)
 
 DEF_CVT_SCALE_FUNC(8u32s,  cvt_32f, uchar,  int, float)
 DEF_CVT_SCALE_FUNC(8s32s,  cvt_32f, schar,  int, float)
@@ -268,7 +268,7 @@ DEF_CVT_SCALE_FUNC(16s32s, cvt_32f, short,  int, float)
 DEF_CVT_SCALE_FUNC(32s,    cvt_64f, int,    int, double)
 DEF_CVT_SCALE_FUNC(32f32s, cvt_32f, float,  int, float)
 DEF_CVT_SCALE_FUNC(64f32s, cvt_64f, double, int, double)
-DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float)
+DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, hfloat, int, float)
 
 DEF_CVT_SCALE_FUNC(8u32f,  cvt_32f, uchar,  float, float)
 DEF_CVT_SCALE_FUNC(8s32f,  cvt_32f, schar,  float, float)
@@ -277,7 +277,7 @@ DEF_CVT_SCALE_FUNC(16s32f, cvt_32f, short,  float, float)
 DEF_CVT_SCALE_FUNC(32s32f, cvt_32f, int,    float, float)
 DEF_CVT_SCALE_FUNC(32f,    cvt_32f, float,  float, float)
 DEF_CVT_SCALE_FUNC(64f32f, cvt_64f, double, float, double)
-DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float)
+DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, hfloat, float, float)
 
 DEF_CVT_SCALE_FUNC(8u64f,  cvt_64f, uchar,  double, double)
 DEF_CVT_SCALE_FUNC(8s64f,  cvt_64f, schar,  double, double)
@@ -286,20 +286,20 @@ DEF_CVT_SCALE_FUNC(16s64f, cvt_64f, short,  double, double)
 DEF_CVT_SCALE_FUNC(32s64f, cvt_64f, int,    double, double)
 DEF_CVT_SCALE_FUNC(32f64f, cvt_64f, float,  double, double)
 DEF_CVT_SCALE_FUNC(64f,    cvt_64f, double, double, double)
-DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double)
+DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, hfloat, double, double)
 
-DEF_CVT_SCALE_FUNC(8u16f,  cvt1_32f, uchar,  float16_t, float)
-DEF_CVT_SCALE_FUNC(8s16f,  cvt1_32f, schar,  float16_t, float)
-DEF_CVT_SCALE_FUNC(16u16f, cvt1_32f, ushort, float16_t, float)
-DEF_CVT_SCALE_FUNC(16s16f, cvt1_32f, short,  float16_t, float)
-DEF_CVT_SCALE_FUNC(32s16f, cvt1_32f, int,    float16_t, float)
-DEF_CVT_SCALE_FUNC(32f16f, cvt1_32f, float,  float16_t, float)
-DEF_CVT_SCALE_FUNC(64f16f, cvt_64f,  double, float16_t, double)
-DEF_CVT_SCALE_FUNC(16f,    cvt1_32f, float16_t, float16_t, float)
+DEF_CVT_SCALE_FUNC(8u16f,  cvt1_32f, uchar,  hfloat, float)
+DEF_CVT_SCALE_FUNC(8s16f,  cvt1_32f, schar,  hfloat, float)
+DEF_CVT_SCALE_FUNC(16u16f, cvt1_32f, ushort, hfloat, float)
+DEF_CVT_SCALE_FUNC(16s16f, cvt1_32f, short,  hfloat, float)
+DEF_CVT_SCALE_FUNC(32s16f, cvt1_32f, int,    hfloat, float)
+DEF_CVT_SCALE_FUNC(32f16f, cvt1_32f, float,  hfloat, float)
+DEF_CVT_SCALE_FUNC(64f16f, cvt_64f,  double, hfloat, double)
+DEF_CVT_SCALE_FUNC(16f,    cvt1_32f, hfloat, hfloat, float)
 
 BinaryFunc getCvtScaleAbsFunc(int depth)
 {
-    static BinaryFunc cvtScaleAbsTab[] =
+    static BinaryFunc cvtScaleAbsTab[CV_DEPTH_MAX] =
     {
         (BinaryFunc)cvtScaleAbs8u, (BinaryFunc)cvtScaleAbs8s8u, (BinaryFunc)cvtScaleAbs16u8u,
         (BinaryFunc)cvtScaleAbs16s8u, (BinaryFunc)cvtScaleAbs32s8u, (BinaryFunc)cvtScaleAbs32f8u,
@@ -311,7 +311,7 @@ BinaryFunc getCvtScaleAbsFunc(int depth)
 
 BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
 {
-    static BinaryFunc cvtScaleTab[][8] =
+    static BinaryFunc cvtScaleTab[CV_DEPTH_MAX][CV_DEPTH_MAX] =
     {
         {
             (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u),
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 89948fb878dc..5c8af185b533 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -93,10 +93,10 @@ void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
         scalarToRawData_<double>(s, (double*)_buf, cn, unroll_to);
         break;
     case CV_16F:
-        scalarToRawData_<float16_t>(s, (float16_t*)_buf, cn, unroll_to);
+        scalarToRawData_<hfloat>(s, (hfloat*)_buf, cn, unroll_to);
         break;
     default:
-        CV_Error(CV_StsUnsupportedFormat,"");
+        CV_Error(cv::Error::StsUnsupportedFormat,"");
     }
 }
 
@@ -156,15 +156,15 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
         const uchar* src = (const uchar*)_src;
         uchar* dst = (uchar*)_dst;
         int x = 0;
-        #if CV_SIMD
+        #if (CV_SIMD || CV_SIMD_SCALABLE)
         {
             v_uint8 v_zero = vx_setzero_u8();
 
-            for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
+            for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
             {
                 v_uint8 v_src   = vx_load(src  + x),
                         v_dst   = vx_load(dst  + x),
-                        v_nmask = vx_load(mask + x) == v_zero;
+                        v_nmask = v_eq(vx_load(mask + x), v_zero);
 
                 v_dst = v_select(v_nmask, v_dst, v_src);
                 v_store(dst + x, v_dst);
@@ -188,23 +188,23 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
         const ushort* src = (const ushort*)_src;
         ushort* dst = (ushort*)_dst;
         int x = 0;
-        #if CV_SIMD
+        #if (CV_SIMD || CV_SIMD_SCALABLE)
         {
             v_uint8 v_zero = vx_setzero_u8();
 
-            for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
+            for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
             {
-                v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + v_uint16::nlanes),
-                         v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + v_uint16::nlanes);
+                v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + VTraits<v_uint16>::vlanes()),
+                         v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + VTraits<v_uint16>::vlanes());
 
                 v_uint8 v_nmask1, v_nmask2;
-                v_uint8 v_nmask = vx_load(mask + x) == v_zero;
+                v_uint8 v_nmask = v_eq(vx_load(mask + x), v_zero);
                 v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
 
                 v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
                 v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
                 v_store(dst + x, v_dst1);
-                v_store(dst + x + v_uint16::nlanes, v_dst2);
+                v_store(dst + x + VTraits<v_uint16>::vlanes(), v_dst2);
             }
         }
         vx_cleanup();
@@ -788,7 +788,7 @@ int cv::borderInterpolate( int p, int len, int borderType )
     else if( borderType == BORDER_CONSTANT )
         p = -1;
     else
-        CV_Error( CV_StsBadArg, "Unknown/unsupported border type" );
+        CV_Error( cv::Error::StsBadArg, "Unknown/unsupported border type" );
     return p;
 }
 
@@ -860,14 +860,14 @@ void copyMakeBorder_8u( const uchar* src, size_t srcstep, cv::Size srcroi,
     }
 
     dstroi.width *= elemSize;
-    dst += dststep*top;
 
     for( i = 0; i < top; i++ )
     {
         j = cv::borderInterpolate(i - top, srcroi.height, borderType);
-        memcpy(dst + (i - top)*dststep, dst + j*dststep, dstroi.width);
+        memcpy(dst + i*dststep, dst + (top+j)*dststep, dstroi.width);
     }
 
+    dst += dststep*top;
     for( i = 0; i < bottom; i++ )
     {
         j = cv::borderInterpolate(i + srcroi.height, srcroi.height, borderType);
diff --git a/modules/core/src/count_non_zero.simd.hpp b/modules/core/src/count_non_zero.simd.hpp
index 699456412747..9de616fe8ae5 100644
--- a/modules/core/src/count_non_zero.simd.hpp
+++ b/modules/core/src/count_non_zero.simd.hpp
@@ -32,8 +32,8 @@ static int countNonZero_(const T* src, int len )
 static int countNonZero8u( const uchar* src, int len )
 {
     int i=0, nz = 0;
-#if CV_SIMD
-    int len0 = len & -v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_uint8>::vlanes();
     v_uint8 v_zero = vx_setzero_u8();
     v_uint8 v_one = vx_setall_u8(1);
 
@@ -42,20 +42,20 @@ static int countNonZero8u( const uchar* src, int len )
     {
         v_uint16 v_sum16 = vx_setzero_u16();
         int j = i;
-        while (j < std::min(len0, i + 65280 * v_uint16::nlanes))
+        while (j < std::min(len0, i + 65280 * VTraits<v_uint16>::vlanes()))
         {
             v_uint8 v_sum8 = vx_setzero_u8();
             int k = j;
-            for (; k < std::min(len0, j + 255 * v_uint8::nlanes); k += v_uint8::nlanes)
-                v_sum8 += v_one & (vx_load(src + k) == v_zero);
+            for (; k < std::min(len0, j + 255 * VTraits<v_uint8>::vlanes()); k += VTraits<v_uint8>::vlanes())
+                v_sum8 = v_add(v_sum8, v_and(v_one, v_eq(vx_load(src + k), v_zero)));
             v_uint16 part1, part2;
             v_expand(v_sum8, part1, part2);
-            v_sum16 += part1 + part2;
+            v_sum16 = v_add(v_sum16, v_add(part1, part2));
             j = k;
         }
         v_uint32 part1, part2;
         v_expand(v_sum16, part1, part2);
-        v_sum32 += part1 + part2;
+        v_sum32 = v_add(v_sum32, v_add(part1, part2));
         i = j;
     }
     nz = i - v_reduce_sum(v_sum32);
@@ -69,8 +69,8 @@ static int countNonZero8u( const uchar* src, int len )
 static int countNonZero16u( const ushort* src, int len )
 {
     int i = 0, nz = 0;
-#if CV_SIMD
-    int len0 = len & -v_int8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_int8>::vlanes();
     v_uint16 v_zero = vx_setzero_u16();
     v_int8 v_one = vx_setall_s8(1);
 
@@ -79,20 +79,20 @@ static int countNonZero16u( const ushort* src, int len )
     {
         v_int16 v_sum16 = vx_setzero_s16();
         int j = i;
-        while (j < std::min(len0, i + 32766 * v_int16::nlanes))
+        while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
         {
             v_int8 v_sum8 = vx_setzero_s8();
             int k = j;
-            for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
-                v_sum8 += v_one & v_pack(v_reinterpret_as_s16(vx_load(src + k) == v_zero), v_reinterpret_as_s16(vx_load(src + k + v_uint16::nlanes) == v_zero));
+            for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
+                v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_reinterpret_as_s16(v_eq(vx_load(src + k), v_zero)), v_reinterpret_as_s16(v_eq(vx_load(src + k + VTraits<v_uint16>::vlanes()), v_zero)))));
             v_int16 part1, part2;
             v_expand(v_sum8, part1, part2);
-            v_sum16 += part1 + part2;
+            v_sum16 = v_add(v_sum16, v_add(part1, part2));
             j = k;
         }
         v_int32 part1, part2;
         v_expand(v_sum16, part1, part2);
-        v_sum32 += part1 + part2;
+        v_sum32 = v_add(v_sum32, v_add(part1, part2));
         i = j;
     }
     nz = i - v_reduce_sum(v_sum32);
@@ -104,8 +104,8 @@ static int countNonZero16u( const ushort* src, int len )
 static int countNonZero32s( const int* src, int len )
 {
     int i = 0, nz = 0;
-#if CV_SIMD
-    int len0 = len & -v_int8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_int8>::vlanes();
     v_int32 v_zero = vx_setzero_s32();
     v_int8 v_one = vx_setall_s8(1);
 
@@ -114,23 +114,20 @@ static int countNonZero32s( const int* src, int len )
     {
         v_int16 v_sum16 = vx_setzero_s16();
         int j = i;
-        while (j < std::min(len0, i + 32766 * v_int16::nlanes))
+        while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
         {
             v_int8 v_sum8 = vx_setzero_s8();
             int k = j;
-            for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
-                v_sum8 += v_one & v_pack(
-                    v_pack(vx_load(src + k                    ) == v_zero, vx_load(src + k +   v_int32::nlanes) == v_zero),
-                    v_pack(vx_load(src + k + 2*v_int32::nlanes) == v_zero, vx_load(src + k + 3*v_int32::nlanes) == v_zero)
-                );
+            for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
+                v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_pack(v_eq(vx_load(src + k), v_zero), v_eq(vx_load(src + k + VTraits<v_int32>::vlanes()), v_zero)), v_pack(v_eq(vx_load(src + k + 2 * VTraits<v_int32>::vlanes()), v_zero), v_eq(vx_load(src + k + 3 * VTraits<v_int32>::vlanes()), v_zero)))));
             v_int16 part1, part2;
             v_expand(v_sum8, part1, part2);
-            v_sum16 += part1 + part2;
+            v_sum16 = v_add(v_sum16, v_add(part1, part2));
             j = k;
         }
         v_int32 part1, part2;
         v_expand(v_sum16, part1, part2);
-        v_sum32 += part1 + part2;
+        v_sum32 = v_add(v_sum32, v_add(part1, part2));
         i = j;
     }
     nz = i - v_reduce_sum(v_sum32);
@@ -142,8 +139,8 @@ static int countNonZero32s( const int* src, int len )
 static int countNonZero32f( const float* src, int len )
 {
     int i = 0, nz = 0;
-#if CV_SIMD
-    int len0 = len & -v_int8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_int8>::vlanes();
     v_float32 v_zero = vx_setzero_f32();
     v_int8 v_one = vx_setall_s8(1);
 
@@ -152,23 +149,20 @@ static int countNonZero32f( const float* src, int len )
     {
         v_int16 v_sum16 = vx_setzero_s16();
         int j = i;
-        while (j < std::min(len0, i + 32766 * v_int16::nlanes))
+        while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
         {
             v_int8 v_sum8 = vx_setzero_s8();
             int k = j;
-            for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
-                v_sum8 += v_one & v_pack(
-                    v_pack(v_reinterpret_as_s32(vx_load(src + k                      ) == v_zero), v_reinterpret_as_s32(vx_load(src + k +   v_float32::nlanes) == v_zero)),
-                    v_pack(v_reinterpret_as_s32(vx_load(src + k + 2*v_float32::nlanes) == v_zero), v_reinterpret_as_s32(vx_load(src + k + 3*v_float32::nlanes) == v_zero))
-                );
+            for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
+                v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_pack(v_reinterpret_as_s32(v_eq(vx_load(src + k), v_zero)), v_reinterpret_as_s32(v_eq(vx_load(src + k + VTraits<v_float32>::vlanes()), v_zero))), v_pack(v_reinterpret_as_s32(v_eq(vx_load(src + k + 2 * VTraits<v_float32>::vlanes()), v_zero)), v_reinterpret_as_s32(v_eq(vx_load(src + k + 3 * VTraits<v_float32>::vlanes()), v_zero))))));
             v_int16 part1, part2;
             v_expand(v_sum8, part1, part2);
-            v_sum16 += part1 + part2;
+            v_sum16 = v_add(v_sum16, v_add(part1, part2));
             j = k;
         }
         v_int32 part1, part2;
         v_expand(v_sum16, part1, part2);
-        v_sum32 += part1 + part2;
+        v_sum32 = v_add(v_sum32, v_add(part1, part2));
         i = j;
     }
     nz = i - v_reduce_sum(v_sum32);
@@ -180,21 +174,21 @@ static int countNonZero32f( const float* src, int len )
 static int countNonZero64f( const double* src, int len )
 {
     int nz = 0, i = 0;
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     v_int64 sum1 = vx_setzero_s64();
     v_int64 sum2 = vx_setzero_s64();
     v_float64 zero = vx_setzero_f64();
-    int step = v_float64::nlanes * 2;
+    int step = VTraits<v_float64>::vlanes() * 2;
     int len0 = len & -step;
 
     for(i = 0; i < len0; i += step )
         {
-        sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero);
-        sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero);
+        sum1 = v_add(sum1, v_reinterpret_as_s64(v_eq(vx_load(&src[i]), zero)));
+        sum2 = v_add(sum2, v_reinterpret_as_s64(v_eq(vx_load(&src[i + step / 2]), zero)));
         }
 
     // N.B the value is incremented by -1 (0xF...F) for each value
-    nz = i + (int)v_reduce_sum(sum1 + sum2);
+    nz = i + (int)v_reduce_sum(v_add(sum1, sum2));
     v_cleanup();
 #endif
     return nz + countNonZero_(src + i, len - i);
@@ -202,7 +196,7 @@ static int countNonZero64f( const double* src, int len )
 
 CountNonZeroFunc getCountNonZeroTab(int depth)
 {
-    static CountNonZeroFunc countNonZeroTab[] =
+    static CountNonZeroFunc countNonZeroTab[CV_DEPTH_MAX] =
     {
         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u),
         (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u),
diff --git a/modules/core/src/datastructs.cpp b/modules/core/src/datastructs.cpp
index 80b02283dc90..17da7119a3e4 100644
--- a/modules/core/src/datastructs.cpp
+++ b/modules/core/src/datastructs.cpp
@@ -91,7 +91,7 @@ static void
 icvInitMemStorage( CvMemStorage* storage, int block_size )
 {
     if( !storage )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( block_size <= 0 )
         block_size = CV_STORAGE_BLOCK_SIZE;
@@ -120,7 +120,7 @@ CV_IMPL CvMemStorage *
 cvCreateChildMemStorage( CvMemStorage * parent )
 {
     if( !parent )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     CvMemStorage* storage = cvCreateMemStorage(parent->block_size);
     storage->parent = parent;
@@ -137,7 +137,7 @@ icvDestroyMemStorage( CvMemStorage* storage )
     CvMemBlock *dst_top = 0;
 
     if( !storage )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( storage->parent )
         dst_top = storage->parent->top;
@@ -180,7 +180,7 @@ CV_IMPL void
 cvReleaseMemStorage( CvMemStorage** storage )
 {
     if( !storage )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     CvMemStorage* st = *storage;
     *storage = 0;
@@ -197,7 +197,7 @@ CV_IMPL void
 cvClearMemStorage( CvMemStorage * storage )
 {
     if( !storage )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( storage->parent )
         icvDestroyMemStorage( storage );
@@ -215,7 +215,7 @@ static void
 icvGoNextMemBlock( CvMemStorage * storage )
 {
     if( !storage )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( !storage->top || !storage->top->next )
     {
@@ -273,7 +273,7 @@ CV_IMPL void
 cvSaveMemStoragePos( const CvMemStorage * storage, CvMemStoragePos * pos )
 {
     if( !storage || !pos )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     pos->top = storage->top;
     pos->free_space = storage->free_space;
@@ -285,9 +285,9 @@ CV_IMPL void
 cvRestoreMemStoragePos( CvMemStorage * storage, CvMemStoragePos * pos )
 {
     if( !storage || !pos )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
     if( pos->free_space > storage->block_size )
-        CV_Error( CV_StsBadSize, "" );
+        CV_Error( cv::Error::StsBadSize, "" );
 
     /*
     // this breaks icvGoNextMemBlock, so comment it off for now
@@ -324,10 +324,10 @@ cvMemStorageAlloc( CvMemStorage* storage, size_t size )
 {
     schar *ptr = 0;
     if( !storage )
-        CV_Error( CV_StsNullPtr, "NULL storage pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL storage pointer" );
 
     if( size > INT_MAX )
-        CV_Error( CV_StsOutOfRange, "Too large memory block is requested" );
+        CV_Error( cv::Error::StsOutOfRange, "Too large memory block is requested" );
 
     CV_Assert( storage->free_space % CV_STRUCT_ALIGN == 0 );
 
@@ -335,7 +335,7 @@ cvMemStorageAlloc( CvMemStorage* storage, size_t size )
     {
         size_t max_free_space = cvAlignLeft(storage->block_size - sizeof(CvMemBlock), CV_STRUCT_ALIGN);
         if( max_free_space < size )
-            CV_Error( CV_StsOutOfRange, "requested size is negative or too big" );
+            CV_Error( cv::Error::StsOutOfRange, "requested size is negative or too big" );
 
         icvGoNextMemBlock( storage );
     }
@@ -374,9 +374,9 @@ cvCreateSeq( int seq_flags, size_t header_size, size_t elem_size, CvMemStorage*
     CvSeq *seq = 0;
 
     if( !storage )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
     if( header_size < sizeof( CvSeq ) || elem_size <= 0 )
-        CV_Error( CV_StsBadSize, "" );
+        CV_Error( cv::Error::StsBadSize, "" );
 
     /* allocate sequence header */
     seq = (CvSeq*)cvMemStorageAlloc( storage, header_size );
@@ -390,7 +390,7 @@ cvCreateSeq( int seq_flags, size_t header_size, size_t elem_size, CvMemStorage*
 
         if( elemtype != CV_SEQ_ELTYPE_GENERIC && elemtype != CV_SEQ_ELTYPE_PTR &&
             typesize != 0 && typesize != (int)elem_size )
-            CV_Error( CV_StsBadSize,
+            CV_Error( cv::Error::StsBadSize,
             "Specified element size doesn't match to the size of the specified element type "
             "(try to use 0 for element type)" );
     }
@@ -412,9 +412,9 @@ cvSetSeqBlockSize( CvSeq *seq, int delta_elements )
     int useful_block_size;
 
     if( !seq || !seq->storage )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
     if( delta_elements < 0 )
-        CV_Error( CV_StsOutOfRange, "" );
+        CV_Error( cv::Error::StsOutOfRange, "" );
 
     useful_block_size = cvAlignLeft(seq->storage->block_size - sizeof(CvMemBlock) -
                                     sizeof(CvSeqBlock), CV_STRUCT_ALIGN);
@@ -429,7 +429,7 @@ cvSetSeqBlockSize( CvSeq *seq, int delta_elements )
     {
         delta_elements = useful_block_size / elem_size;
         if( delta_elements == 0 )
-            CV_Error( CV_StsOutOfRange, "Storage block size is too small "
+            CV_Error( cv::Error::StsOutOfRange, "Storage block size is too small "
                                         "to fit the sequence elements" );
     }
 
@@ -487,7 +487,7 @@ cvSeqElemIdx( const CvSeq* seq, const void* _element, CvSeqBlock** _block )
     CvSeqBlock *block;
 
     if( !seq || !element )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     block = first_block = seq->first;
     elem_size = seq->elem_size;
@@ -548,7 +548,7 @@ cvCvtSeqToArray( const CvSeq *seq, void *array, CvSlice slice )
     char *dst = (char*)array;
 
     if( !seq || !array )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     elem_size = seq->elem_size;
     total = cvSliceLength( slice, seq )*elem_size;
@@ -587,10 +587,10 @@ cvMakeSeqHeaderForArray( int seq_flags, int header_size, int elem_size,
     CvSeq* result = 0;
 
     if( elem_size <= 0 || header_size < (int)sizeof( CvSeq ) || total < 0 )
-        CV_Error( CV_StsBadSize, "" );
+        CV_Error( cv::Error::StsBadSize, "" );
 
     if( !seq || ((!array || !block) && total > 0) )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     memset( seq, 0, header_size );
 
@@ -602,7 +602,7 @@ cvMakeSeqHeaderForArray( int seq_flags, int header_size, int elem_size,
 
         if( elemtype != CV_SEQ_ELTYPE_GENERIC &&
             typesize != 0 && typesize != elem_size )
-            CV_Error( CV_StsBadSize,
+            CV_Error( cv::Error::StsBadSize,
             "Element size doesn't match to the size of predefined element type "
             "(try to use 0 for sequence element type)" );
     }
@@ -634,7 +634,7 @@ icvGrowSeq( CvSeq *seq, int in_front_of )
     CvSeqBlock *block;
 
     if( !seq )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
     block = seq->free_blocks;
 
     if( !block )
@@ -647,7 +647,7 @@ icvGrowSeq( CvSeq *seq, int in_front_of )
             cvSetSeqBlockSize( seq, delta_elems*2 );
 
         if( !storage )
-            CV_Error( CV_StsNullPtr, "The sequence has NULL storage pointer" );
+            CV_Error( cv::Error::StsNullPtr, "The sequence has NULL storage pointer" );
 
         /* If there is a free space just after last allocated block
            and it is big enough then enlarge the last block.
@@ -817,7 +817,7 @@ CV_IMPL void
 cvStartAppendToSeq( CvSeq *seq, CvSeqWriter * writer )
 {
     if( !seq || !writer )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     memset( writer, 0, sizeof( *writer ));
     writer->header_size = sizeof( CvSeqWriter );
@@ -835,7 +835,7 @@ cvStartWriteSeq( int seq_flags, int header_size,
                  int elem_size, CvMemStorage * storage, CvSeqWriter * writer )
 {
     if( !storage || !writer )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     CvSeq* seq = cvCreateSeq( seq_flags, header_size, elem_size, storage );
     cvStartAppendToSeq( seq, writer );
@@ -847,7 +847,7 @@ CV_IMPL void
 cvFlushSeqWriter( CvSeqWriter * writer )
 {
     if( !writer )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     CvSeq* seq = writer->seq;
     seq->ptr = writer->ptr;
@@ -878,7 +878,7 @@ CV_IMPL CvSeq *
 cvEndWriteSeq( CvSeqWriter * writer )
 {
     if( !writer )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     cvFlushSeqWriter( writer );
     CvSeq* seq = writer->seq;
@@ -909,7 +909,7 @@ CV_IMPL void
 cvCreateSeqBlock( CvSeqWriter * writer )
 {
     if( !writer || !writer->seq )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     CvSeq* seq = writer->seq;
 
@@ -942,7 +942,7 @@ cvStartReadSeq( const CvSeq *seq, CvSeqReader * reader, int reverse )
     }
 
     if( !seq || !reader )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     reader->header_size = sizeof( CvSeqReader );
     reader->seq = (CvSeq*)seq;
@@ -992,7 +992,7 @@ cvChangeSeqBlock( void* _reader, int direction )
     CvSeqReader* reader = (CvSeqReader*)_reader;
 
     if( !reader )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( direction > 0 )
     {
@@ -1017,7 +1017,7 @@ cvGetSeqReaderPos( CvSeqReader* reader )
     int index = -1;
 
     if( !reader || !reader->ptr )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     elem_size = reader->seq->elem_size;
     if( elem_size <= ICV_SHIFT_TAB_MAX && (index = icvPower2ShiftTab[elem_size - 1]) >= 0 )
@@ -1042,7 +1042,7 @@ cvSetSeqReaderPos( CvSeqReader* reader, int index, int is_relative )
     int elem_size, count, total;
 
     if( !reader || !reader->seq )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     total = reader->seq->total;
     elem_size = reader->seq->elem_size;
@@ -1052,14 +1052,14 @@ cvSetSeqReaderPos( CvSeqReader* reader, int index, int is_relative )
         if( index < 0 )
         {
             if( index < -total )
-                CV_Error( CV_StsOutOfRange, "" );
+                CV_Error( cv::Error::StsOutOfRange, "" );
             index += total;
         }
         else if( index >= total )
         {
             index -= total;
             if( index >= total )
-                CV_Error( CV_StsOutOfRange, "" );
+                CV_Error( cv::Error::StsOutOfRange, "" );
         }
 
         block = reader->seq->first;
@@ -1135,7 +1135,7 @@ cvSeqPush( CvSeq *seq, const void *element )
     size_t elem_size;
 
     if( !seq )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     elem_size = seq->elem_size;
     ptr = seq->ptr;
@@ -1166,9 +1166,9 @@ cvSeqPop( CvSeq *seq, void *element )
     int elem_size;
 
     if( !seq )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
     if( seq->total <= 0 )
-        CV_Error( CV_StsBadSize, "" );
+        CV_Error( cv::Error::StsBadSize, "" );
 
     elem_size = seq->elem_size;
     seq->ptr = ptr = seq->ptr - elem_size;
@@ -1195,7 +1195,7 @@ cvSeqPushFront( CvSeq *seq, const void *element )
     CvSeqBlock *block;
 
     if( !seq )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     elem_size = seq->elem_size;
     block = seq->first;
@@ -1228,9 +1228,9 @@ cvSeqPopFront( CvSeq *seq, void *element )
     CvSeqBlock *block;
 
     if( !seq )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
     if( seq->total <= 0 )
-        CV_Error( CV_StsBadSize, "" );
+        CV_Error( cv::Error::StsBadSize, "" );
 
     elem_size = seq->elem_size;
     block = seq->first;
@@ -1257,14 +1257,14 @@ cvSeqInsert( CvSeq *seq, int before_index, const void *element )
     schar* ret_ptr = 0;
 
     if( !seq )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     total = seq->total;
     before_index += before_index < 0 ? total : 0;
     before_index -= before_index > total ? total : 0;
 
     if( (unsigned)before_index > (unsigned)total )
-        CV_Error( CV_StsOutOfRange, "" );
+        CV_Error( cv::Error::StsOutOfRange, "" );
 
     if( before_index == total )
     {
@@ -1375,7 +1375,7 @@ cvSeqRemove( CvSeq *seq, int index )
     int total, front = 0;
 
     if( !seq )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     total = seq->total;
 
@@ -1383,7 +1383,7 @@ cvSeqRemove( CvSeq *seq, int index )
     index -= index >= total ? total : 0;
 
     if( (unsigned) index >= (unsigned) total )
-        CV_Error( CV_StsOutOfRange, "Invalid index" );
+        CV_Error( cv::Error::StsOutOfRange, "Invalid index" );
 
     if( index == total - 1 )
     {
@@ -1456,9 +1456,9 @@ cvSeqPushMulti( CvSeq *seq, const void *_elements, int count, int front )
     char *elements = (char *) _elements;
 
     if( !seq )
-        CV_Error( CV_StsNullPtr, "NULL sequence pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL sequence pointer" );
     if( count < 0 )
-        CV_Error( CV_StsBadSize, "number of removed elements is negative" );
+        CV_Error( cv::Error::StsBadSize, "number of removed elements is negative" );
 
     int elem_size = seq->elem_size;
 
@@ -1525,9 +1525,9 @@ cvSeqPopMulti( CvSeq *seq, void *_elements, int count, int front )
     char *elements = (char *) _elements;
 
     if( !seq )
-        CV_Error( CV_StsNullPtr, "NULL sequence pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL sequence pointer" );
     if( count < 0 )
-        CV_Error( CV_StsBadSize, "number of removed elements is negative" );
+        CV_Error( cv::Error::StsBadSize, "number of removed elements is negative" );
 
     count = MIN( count, seq->total );
 
@@ -1593,7 +1593,7 @@ CV_IMPL void
 cvClearSeq( CvSeq *seq )
 {
     if( !seq )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
     cvSeqPopMulti( seq, 0, seq->total );
 }
 
@@ -1607,13 +1607,13 @@ cvSeqSlice( const CvSeq* seq, CvSlice slice, CvMemStorage* storage, int copy_dat
     CvSeqBlock *block, *first_block = 0, *last_block = 0;
 
     if( !CV_IS_SEQ(seq) )
-        CV_Error( CV_StsBadArg, "Invalid sequence header" );
+        CV_Error( cv::Error::StsBadArg, "Invalid sequence header" );
 
     if( !storage )
     {
         storage = seq->storage;
         if( !storage )
-            CV_Error( CV_StsNullPtr, "NULL storage pointer" );
+            CV_Error( cv::Error::StsNullPtr, "NULL storage pointer" );
     }
 
     elem_size = seq->elem_size;
@@ -1624,7 +1624,7 @@ cvSeqSlice( const CvSeq* seq, CvSlice slice, CvMemStorage* storage, int copy_dat
         slice.start_index -= seq->total;
     if( (unsigned)length > (unsigned)seq->total ||
         ((unsigned)slice.start_index >= (unsigned)seq->total && length != 0) )
-        CV_Error( CV_StsOutOfRange, "Bad sequence slice" );
+        CV_Error( cv::Error::StsOutOfRange, "Bad sequence slice" );
 
     subseq = cvCreateSeq( seq->flags, seq->header_size, elem_size, storage );
 
@@ -1680,7 +1680,7 @@ cvSeqRemoveSlice( CvSeq* seq, CvSlice slice )
     int total, length;
 
     if( !CV_IS_SEQ(seq) )
-        CV_Error( CV_StsBadArg, "Invalid sequence header" );
+        CV_Error( cv::Error::StsBadArg, "Invalid sequence header" );
 
     length = cvSliceLength( slice, seq );
     total = seq->total;
@@ -1691,7 +1691,7 @@ cvSeqRemoveSlice( CvSeq* seq, CvSlice slice )
         slice.start_index -= total;
 
     if( (unsigned)slice.start_index >= (unsigned)total )
-        CV_Error( CV_StsOutOfRange, "start slice index is out of range" );
+        CV_Error( cv::Error::StsOutOfRange, "start slice index is out of range" );
 
     slice.end_index = slice.start_index + length;
 
@@ -1757,16 +1757,16 @@ cvSeqInsertSlice( CvSeq* seq, int index, const CvArr* from_arr )
     CvSeqBlock block;
 
     if( !CV_IS_SEQ(seq) )
-        CV_Error( CV_StsBadArg, "Invalid destination sequence header" );
+        CV_Error( cv::Error::StsBadArg, "Invalid destination sequence header" );
 
     if( !CV_IS_SEQ(from))
     {
         CvMat* mat = (CvMat*)from;
         if( !CV_IS_MAT(mat))
-            CV_Error( CV_StsBadArg, "Source is not a sequence nor matrix" );
+            CV_Error( cv::Error::StsBadArg, "Source is not a sequence nor matrix" );
 
         if( !CV_IS_MAT_CONT(mat->type) || (mat->rows != 1 && mat->cols != 1) )
-            CV_Error( CV_StsBadArg, "The source array must be 1d continuous vector" );
+            CV_Error( cv::Error::StsBadArg, "The source array must be 1d continuous vector" );
 
         from = cvMakeSeqHeaderForArray( CV_SEQ_KIND_GENERIC, sizeof(from_header),
                                                  CV_ELEM_SIZE(mat->type),
@@ -1775,7 +1775,7 @@ cvSeqInsertSlice( CvSeq* seq, int index, const CvArr* from_arr )
     }
 
     if( seq->elem_size != from->elem_size )
-        CV_Error( CV_StsUnmatchedSizes,
+        CV_Error( cv::Error::StsUnmatchedSizes,
         "Source and destination sequence element sizes are different." );
 
     from_total = from->total;
@@ -1788,7 +1788,7 @@ cvSeqInsertSlice( CvSeq* seq, int index, const CvArr* from_arr )
     index -= index > total ? total : 0;
 
     if( (unsigned)index > (unsigned)total )
-        CV_Error( CV_StsOutOfRange, "" );
+        CV_Error( cv::Error::StsOutOfRange, "" );
 
     elem_size = seq->elem_size;
 
@@ -1918,10 +1918,10 @@ cvSeqSort( CvSeq* seq, CvCmpFunc cmp_func, void* aux )
     stack[48];
 
     if( !CV_IS_SEQ(seq) )
-        CV_Error( !seq ? CV_StsNullPtr : CV_StsBadArg, "Bad input sequence" );
+        CV_Error( !seq ? cv::Error::StsNullPtr : cv::Error::StsBadArg, "Bad input sequence" );
 
     if( !cmp_func )
-        CV_Error( CV_StsNullPtr, "Null compare function" );
+        CV_Error( cv::Error::StsNullPtr, "Null compare function" );
 
     if( seq->total <= 1 )
         return;
@@ -2195,10 +2195,10 @@ cvSeqSearch( CvSeq* seq, const void* _elem, CvCmpFunc cmp_func,
         *_idx = idx;
 
     if( !CV_IS_SEQ(seq) )
-        CV_Error( !seq ? CV_StsNullPtr : CV_StsBadArg, "Bad input sequence" );
+        CV_Error( !seq ? cv::Error::StsNullPtr : cv::Error::StsBadArg, "Bad input sequence" );
 
     if( !elem )
-        CV_Error( CV_StsNullPtr, "Null element pointer" );
+        CV_Error( cv::Error::StsNullPtr, "Null element pointer" );
 
     int elem_size = seq->elem_size;
     int total = seq->total;
@@ -2256,7 +2256,7 @@ cvSeqSearch( CvSeq* seq, const void* _elem, CvCmpFunc cmp_func,
     else
     {
         if( !cmp_func )
-            CV_Error( CV_StsNullPtr, "Null compare function" );
+            CV_Error( cv::Error::StsNullPtr, "Null compare function" );
 
         i = 0, j = total;
 
@@ -2340,16 +2340,16 @@ cvSeqPartition( const CvSeq* seq, CvMemStorage* storage, CvSeq** labels,
     int is_set;
 
     if( !labels )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( !seq || !is_equal )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( !storage )
         storage = seq->storage;
 
     if( !storage )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     is_set = CV_IS_SET(seq);
 
@@ -2483,11 +2483,11 @@ CV_IMPL CvSet*
 cvCreateSet( int set_flags, int header_size, int elem_size, CvMemStorage * storage )
 {
     if( !storage )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
     if( header_size < (int)sizeof( CvSet ) ||
         elem_size < (int)sizeof(void*)*2 ||
         (elem_size & (sizeof(void*)-1)) != 0 )
-        CV_Error( CV_StsBadSize, "" );
+        CV_Error( cv::Error::StsBadSize, "" );
 
     CvSet* set = (CvSet*) cvCreateSeq( set_flags, header_size, elem_size, storage );
     set->flags = (set->flags & ~CV_MAGIC_MASK) | CV_SET_MAGIC_VAL;
@@ -2504,7 +2504,7 @@ cvSetAdd( CvSet* set, CvSetElem* element, CvSetElem** inserted_element )
     CvSetElem *free_elem;
 
     if( !set )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( !(set->free_elems) )
     {
@@ -2552,7 +2552,7 @@ cvSetRemove( CvSet* set, int index )
     if( elem )
         cvSetRemoveByPtr( set, elem );
     else if( !set )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 }
 
 
@@ -2583,7 +2583,7 @@ cvCreateGraph( int graph_type, int header_size,
     ||  edge_size   < (int) sizeof( CvGraphEdge )
     ||  vtx_size    < (int) sizeof( CvGraphVtx  )
     ){
-        CV_Error( CV_StsBadSize, "" );
+        CV_Error( cv::Error::StsBadSize, "" );
     }
 
     vertices = cvCreateSet( graph_type, header_size, vtx_size, storage );
@@ -2602,7 +2602,7 @@ CV_IMPL void
 cvClearGraph( CvGraph * graph )
 {
     if( !graph )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     cvClearSet( graph->edges );
     cvClearSet( (CvSet*)graph );
@@ -2617,7 +2617,7 @@ cvGraphAddVtx( CvGraph* graph, const CvGraphVtx* _vertex, CvGraphVtx** _inserted
     int index = -1;
 
     if( !graph )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     vertex = (CvGraphVtx*)cvSetNew((CvSet*)graph);
     if( vertex )
@@ -2642,10 +2642,10 @@ cvGraphRemoveVtxByPtr( CvGraph* graph, CvGraphVtx* vtx )
     int count = -1;
 
     if( !graph || !vtx )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( !CV_IS_SET_ELEM(vtx))
-        CV_Error( CV_StsBadArg, "The vertex does not belong to the graph" );
+        CV_Error( cv::Error::StsBadArg, "The vertex does not belong to the graph" );
 
     count = graph->edges->active_count;
     for( ;; )
@@ -2670,11 +2670,11 @@ cvGraphRemoveVtx( CvGraph* graph, int index )
     CvGraphVtx *vtx = 0;
 
     if( !graph )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     vtx = cvGetGraphVtx( graph, index );
     if( !vtx )
-        CV_Error( CV_StsBadArg, "The vertex is not found" );
+        CV_Error( cv::Error::StsBadArg, "The vertex is not found" );
 
     count = graph->edges->active_count;
     for( ;; )
@@ -2702,7 +2702,7 @@ cvFindGraphEdgeByPtr( const CvGraph* graph,
     int ofs = 0;
 
     if( !graph || !start_vtx || !end_vtx )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( start_vtx == end_vtx )
         return 0;
@@ -2735,7 +2735,7 @@ cvFindGraphEdge( const CvGraph* graph, int start_idx, int end_idx )
     CvGraphVtx *end_vtx;
 
     if( !graph )
-        CV_Error( CV_StsNullPtr, "graph pointer is NULL" );
+        CV_Error( cv::Error::StsNullPtr, "graph pointer is NULL" );
 
     start_vtx = cvGetGraphVtx( graph, start_idx );
     end_vtx = cvGetGraphVtx( graph, end_idx );
@@ -2759,7 +2759,7 @@ cvGraphAddEdgeByPtr( CvGraph* graph,
     int delta;
 
     if( !graph )
-        CV_Error( CV_StsNullPtr, "graph pointer is NULL" );
+        CV_Error( cv::Error::StsNullPtr, "graph pointer is NULL" );
 
     if( !CV_IS_GRAPH_ORIENTED( graph ) &&
         (start_vtx->flags & CV_SET_ELEM_IDX_MASK) > (end_vtx->flags & CV_SET_ELEM_IDX_MASK) )
@@ -2778,7 +2778,7 @@ cvGraphAddEdgeByPtr( CvGraph* graph,
     }
 
     if( start_vtx == end_vtx )
-        CV_Error( start_vtx ? CV_StsBadArg : CV_StsNullPtr,
+        CV_Error( start_vtx ? cv::Error::StsBadArg : cv::Error::StsNullPtr,
         "vertex pointers coincide (or set to NULL)" );
 
     edge = (CvGraphEdge*)cvSetNew( (CvSet*)(graph->edges) );
@@ -2826,7 +2826,7 @@ cvGraphAddEdge( CvGraph* graph,
     CvGraphVtx *end_vtx;
 
     if( !graph )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     start_vtx = cvGetGraphVtx( graph, start_idx );
     end_vtx = cvGetGraphVtx( graph, end_idx );
@@ -2843,7 +2843,7 @@ cvGraphRemoveEdgeByPtr( CvGraph* graph, CvGraphVtx* start_vtx, CvGraphVtx* end_v
     CvGraphEdge *edge, *next_edge, *prev_edge;
 
     if( !graph || !start_vtx || !end_vtx )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( start_vtx == end_vtx )
         return;
@@ -2902,7 +2902,7 @@ cvGraphRemoveEdge( CvGraph* graph, int start_idx, int end_idx )
     CvGraphVtx *end_vtx;
 
     if( !graph )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     start_vtx = cvGetGraphVtx( graph, start_idx );
     end_vtx = cvGetGraphVtx( graph, end_idx );
@@ -2919,7 +2919,7 @@ cvGraphVtxDegreeByPtr( const CvGraph* graph, const CvGraphVtx* vertex )
     int count;
 
     if( !graph || !vertex )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     for( edge = vertex->first, count = 0; edge; )
     {
@@ -2940,11 +2940,11 @@ cvGraphVtxDegree( const CvGraph* graph, int vtx_idx )
     int count;
 
     if( !graph )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     vertex = cvGetGraphVtx( graph, vtx_idx );
     if( !vertex )
-        CV_Error( CV_StsObjectNotFound, "" );
+        CV_Error( cv::Error::StsObjectNotFound, "" );
 
     for( edge = vertex->first, count = 0; edge; )
     {
@@ -2971,13 +2971,13 @@ icvSeqElemsClearFlags( CvSeq* seq, int offset, int clear_mask )
     int i, total, elem_size;
 
     if( !seq )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     elem_size = seq->elem_size;
     total = seq->total;
 
     if( (unsigned)offset > (unsigned)elem_size )
-        CV_Error( CV_StsBadArg, "" );
+        CV_Error( cv::Error::StsBadArg, "" );
 
     cvStartReadSeq( seq, &reader );
 
@@ -3001,14 +3001,14 @@ icvSeqFindNextElem( CvSeq* seq, int offset, int mask,
     int total, elem_size, index;
 
     if( !seq || !start_index )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     elem_size = seq->elem_size;
     total = seq->total;
     index = *start_index;
 
     if( (unsigned)offset > (unsigned)elem_size )
-        CV_Error( CV_StsBadArg, "" );
+        CV_Error( cv::Error::StsBadArg, "" );
 
     if( total == 0 )
         return 0;
@@ -3048,7 +3048,7 @@ CV_IMPL CvGraphScanner*
 cvCreateGraphScanner( CvGraph* graph, CvGraphVtx* vtx, int mask )
 {
     if( !graph )
-        CV_Error( CV_StsNullPtr, "Null graph pointer" );
+        CV_Error( cv::Error::StsNullPtr, "Null graph pointer" );
 
     CV_Assert( graph->storage != 0 );
 
@@ -3082,7 +3082,7 @@ CV_IMPL void
 cvReleaseGraphScanner( CvGraphScanner** scanner )
 {
     if( !scanner )
-        CV_Error( CV_StsNullPtr, "Null double pointer to graph scanner" );
+        CV_Error( cv::Error::StsNullPtr, "Null double pointer to graph scanner" );
 
     if( *scanner )
     {
@@ -3103,7 +3103,7 @@ cvNextGraphItem( CvGraphScanner* scanner )
     CvGraphItem item;
 
     if( !scanner || !(scanner->stack))
-        CV_Error( CV_StsNullPtr, "Null graph scanner" );
+        CV_Error( cv::Error::StsNullPtr, "Null graph scanner" );
 
     dst = scanner->dst;
     vtx = scanner->vtx;
@@ -3259,13 +3259,13 @@ cvCloneGraph( const CvGraph* graph, CvMemStorage* storage )
     CvSeqReader reader;
 
     if( !CV_IS_GRAPH(graph))
-        CV_Error( CV_StsBadArg, "Invalid graph pointer" );
+        CV_Error( cv::Error::StsBadArg, "Invalid graph pointer" );
 
     if( !storage )
         storage = graph->storage;
 
     if( !storage )
-        CV_Error( CV_StsNullPtr, "NULL storage pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL storage pointer" );
 
     vtx_size = graph->elem_size;
     edge_size = graph->edges->elem_size;
@@ -3343,7 +3343,7 @@ cvTreeToNodeSeq( const void* first, int header_size, CvMemStorage* storage )
     CvTreeNodeIterator iterator;
 
     if( !storage )
-        CV_Error( CV_StsNullPtr, "NULL storage pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL storage pointer" );
 
     allseq = cvCreateSeq( 0, header_size, sizeof(first), storage );
 
@@ -3389,7 +3389,7 @@ cvInsertNodeIntoTree( void* _node, void* _parent, void* _frame )
     CvTreeNode* parent = (CvTreeNode*)_parent;
 
     if( !node || !parent )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     node->v_prev = _parent != _frame ? parent : 0;
     node->h_next = parent->v_next;
@@ -3410,10 +3410,10 @@ cvRemoveNodeFromTree( void* _node, void* _frame )
     CvTreeNode* frame = (CvTreeNode*)_frame;
 
     if( !node )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( node == frame )
-        CV_Error( CV_StsBadArg, "frame node could not be deleted" );
+        CV_Error( cv::Error::StsBadArg, "frame node could not be deleted" );
 
     if( node->h_next )
         node->h_next->h_prev = node->h_prev;
@@ -3440,10 +3440,10 @@ cvInitTreeNodeIterator( CvTreeNodeIterator* treeIterator,
                         const void* first, int max_level )
 {
     if( !treeIterator || !first )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( max_level < 0 )
-        CV_Error( CV_StsOutOfRange, "" );
+        CV_Error( cv::Error::StsOutOfRange, "" );
 
     treeIterator->node = (void*)first;
     treeIterator->level = 0;
@@ -3459,7 +3459,7 @@ cvNextTreeNode( CvTreeNodeIterator* treeIterator )
     int level;
 
     if( !treeIterator )
-        CV_Error( CV_StsNullPtr, "NULL iterator pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL iterator pointer" );
 
     prevNode = node = (CvTreeNode*)treeIterator->node;
     level = treeIterator->level;
@@ -3500,7 +3500,7 @@ cvPrevTreeNode( CvTreeNodeIterator* treeIterator )
     int level;
 
     if( !treeIterator )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     prevNode = node = (CvTreeNode*)treeIterator->node;
     level = treeIterator->level;
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index dbf5a52c174c..85966a5a52ba 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -64,8 +64,6 @@ namespace cv
                                Discrete Fourier Transform
 \****************************************************************************************/
 
-#define CV_MAX_LOCAL_DFT_SIZE  (1 << 15)
-
 static unsigned char bitrevTab[] =
 {
   0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0,
@@ -3471,7 +3469,7 @@ Ptr<DFT2D> DFT2D::create(int width, int height, int depth,
     {
         if(width == 1 && nonzero_rows > 0 )
         {
-            CV_Error( CV_StsNotImplemented,
+            CV_Error( cv::Error::StsNotImplemented,
             "This mode (using nonzero_rows with a single-column matrix) breaks the function's logic, so it is prohibited.\n"
             "For fast convolution/correlation use 2-column matrix or single-row matrix instead" );
         }
@@ -4319,7 +4317,7 @@ class OcvDctImpl CV_FINAL : public hal::DCT2D
             if( len != prev_len )
             {
                 if( len > 1 && (len & 1) )
-                    CV_Error( CV_StsNotImplemented, "Odd-size DCT\'s are not implemented" );
+                    CV_Error( cv::Error::StsNotImplemented, "Odd-size DCT\'s are not implemented" );
 
                 opt.nf = DFTFactorize( len, opt.factors );
                 bool inplace_transform = opt.factors[0] == opt.factors[opt.nf-1];
@@ -4365,7 +4363,7 @@ struct ReplacementDCT2D : public hal::DCT2D
     ReplacementDCT2D() : context(0), isInitialized(false) {}
     bool init(int width, int height, int depth, int flags)
     {
-        int res = hal_ni_dctInit2D(&context, width, height, depth, flags);
+        int res = cv_hal_dctInit2D(&context, width, height, depth, flags);
         isInitialized = (res == CV_HAL_ERROR_OK);
         return isInitialized;
     }
diff --git a/modules/core/src/glob.cpp b/modules/core/src/glob.cpp
index b7cf1bf236c9..03638d49b187 100644
--- a/modules/core/src/glob.cpp
+++ b/modules/core/src/glob.cpp
@@ -276,7 +276,7 @@ static void glob_rec(const cv::String& directory, const cv::String& wildchart, s
     }
     else
     {
-        CV_Error_(CV_StsObjectNotFound, ("could not open directory: %s", directory.c_str()));
+        CV_Error_(cv::Error::StsObjectNotFound, ("could not open directory: %s", directory.c_str()));
     }
 }
 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
diff --git a/modules/core/src/hal_internal.cpp b/modules/core/src/hal_internal.cpp
index f581d05cf4f7..377c68801589 100644
--- a/modules/core/src/hal_internal.cpp
+++ b/modules/core/src/hal_internal.cpp
@@ -66,6 +66,7 @@
 
 #if defined(__clang__) && defined(__has_feature)
 #if __has_feature(memory_sanitizer)
+#include <sanitizer/msan_interface.h>
 #define CV_ANNOTATE_MEMORY_IS_INITIALIZED(address, size) \
 __msan_unpoison(address, size)
 #endif
@@ -110,8 +111,18 @@ set_value(fptype *dst, size_t dst_ld, fptype value, size_t m, size_t n)
 template <typename fptype> static inline int
 lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int* info)
 {
-    int lda = (int)(a_step / sizeof(fptype)), sign = 0;
-    int* piv = new int[m];
+#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64)
+    cv::AutoBuffer<long> piv_buff(m);
+    long lda = (long)(a_step / sizeof(fptype));
+    long _m = static_cast<long>(m), _n = static_cast<long>(n);
+    long _info[1];
+#else
+    cv::AutoBuffer<int> piv_buff(m);
+    int lda = (int)(a_step / sizeof(fptype));
+    int _m = m, _n = n;
+    int* _info = info;
+#endif
+    auto piv = piv_buff.data();
 
     transpose_square_inplace(a, lda, m);
 
@@ -120,9 +131,9 @@ lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int*
         if(n == 1 && b_step == sizeof(fptype))
         {
             if(typeid(fptype) == typeid(float))
-                sgesv_(&m, &n, (float*)a, &lda, piv, (float*)b, &m, info);
+                sgesv_(&_m, &_n, (float*)a, &lda, piv, (float*)b, &_m, _info);
             else if(typeid(fptype) == typeid(double))
-                dgesv_(&m, &n, (double*)a, &lda, piv, (double*)b, &m, info);
+                dgesv_(&_m, &_n, (double*)a, &lda, piv, (double*)b, &_m, _info);
         }
         else
         {
@@ -132,9 +143,9 @@ lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int*
             transpose(b, ldb, tmpB, m, m, n);
 
             if(typeid(fptype) == typeid(float))
-                sgesv_(&m, &n, (float*)a, &lda, piv, (float*)tmpB, &m, info);
+                sgesv_(&_m, &_n, (float*)a, &lda, piv, (float*)tmpB, &_m, _info);
             else if(typeid(fptype) == typeid(double))
-                dgesv_(&m, &n, (double*)a, &lda, piv, (double*)tmpB, &m, info);
+                dgesv_(&_m, &_n, (double*)a, &lda, piv, (double*)tmpB, &_m, _info);
 
             transpose(tmpB, m, b, ldb, n, m);
             delete[] tmpB;
@@ -143,11 +154,16 @@ lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int*
     else
     {
         if(typeid(fptype) == typeid(float))
-            sgetrf_(&m, &m, (float*)a, &lda, piv, info);
+            sgetrf_(&_m, &_m, (float*)a, &lda, piv, _info);
         else if(typeid(fptype) == typeid(double))
-            dgetrf_(&m, &m, (double*)a, &lda, piv, info);
+            dgetrf_(&_m, &_m, (double*)a, &lda, piv, _info);
     }
 
+#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64)
+    *info = static_cast<int>(_info[0]);
+#endif
+
+    int sign = 0;
     if(*info == 0)
     {
         for(int i = 0; i < m; i++)
@@ -157,15 +173,21 @@ lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int*
     else
         *info = 0; //in opencv LU function zero means error
 
-    delete[] piv;
     return CV_HAL_ERROR_OK;
 }
 
 template <typename fptype> static inline int
 lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, bool* info)
 {
+#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64)
+    long _m = static_cast<long>(m), _n = static_cast<long>(n);
+    long lapackStatus = 0;
+    long lda = (long)(a_step / sizeof(fptype));
+#else
+    int _m = m, _n = n;
     int lapackStatus = 0;
     int lda = (int)(a_step / sizeof(fptype));
+#endif
     char L[] = {'L', '\0'};
 
     if(b)
@@ -173,9 +195,9 @@ lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n
         if(n == 1 && b_step == sizeof(fptype))
         {
             if(typeid(fptype) == typeid(float))
-                OCV_LAPACK_FUNC(sposv)(L, &m, &n, (float*)a, &lda, (float*)b, &m, &lapackStatus);
+                OCV_LAPACK_FUNC(sposv)(L, &_m, &_n, (float*)a, &lda, (float*)b, &_m, &lapackStatus);
             else if(typeid(fptype) == typeid(double))
-                OCV_LAPACK_FUNC(dposv)(L, &m, &n, (double*)a, &lda, (double*)b, &m, &lapackStatus);
+                OCV_LAPACK_FUNC(dposv)(L, &_m, &_n, (double*)a, &lda, (double*)b, &_m, &lapackStatus);
         }
         else
         {
@@ -184,9 +206,9 @@ lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n
             transpose(b, ldb, tmpB, m, m, n);
 
             if(typeid(fptype) == typeid(float))
-                OCV_LAPACK_FUNC(sposv)(L, &m, &n, (float*)a, &lda, (float*)tmpB, &m, &lapackStatus);
+                OCV_LAPACK_FUNC(sposv)(L, &_m, &_n, (float*)a, &lda, (float*)tmpB, &_m, &lapackStatus);
             else if(typeid(fptype) == typeid(double))
-                OCV_LAPACK_FUNC(dposv)(L, &m, &n, (double*)a, &lda, (double*)tmpB, &m, &lapackStatus);
+                OCV_LAPACK_FUNC(dposv)(L, &_m, &_n, (double*)a, &lda, (double*)tmpB, &_m, &lapackStatus);
 
             transpose(tmpB, m, b, ldb, n, m);
             delete[] tmpB;
@@ -195,9 +217,9 @@ lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n
     else
     {
         if(typeid(fptype) == typeid(float))
-            OCV_LAPACK_FUNC(spotrf)(L, &m, (float*)a, &lda, &lapackStatus);
+            OCV_LAPACK_FUNC(spotrf)(L, &_m, (float*)a, &lda, &lapackStatus);
         else if(typeid(fptype) == typeid(double))
-            OCV_LAPACK_FUNC(dpotrf)(L, &m, (double*)a, &lda, &lapackStatus);
+            OCV_LAPACK_FUNC(dpotrf)(L, &_m, (double*)a, &lda, &lapackStatus);
     }
 
     if(lapackStatus == 0) *info = true;
@@ -209,11 +231,24 @@ lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n
 template <typename fptype> static inline int
 lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype* vt, size_t v_step, int m, int n, int flags, int* info)
 {
+#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64)
+    long _m = static_cast<long>(m), _n = static_cast<long>(n);
+    long _info[1];
+    long lda = (long)(a_step / sizeof(fptype));
+    long ldv = (long)(v_step / sizeof(fptype));
+    long ldu = (long)(u_step / sizeof(fptype));
+    long lwork = -1;
+    cv::AutoBuffer<long> iworkBuf_(8 * std::min(m, n));
+#else
+    int _m = m, _n = n;
+    int* _info = info;
     int lda = (int)(a_step / sizeof(fptype));
     int ldv = (int)(v_step / sizeof(fptype));
     int ldu = (int)(u_step / sizeof(fptype));
     int lwork = -1;
-    int* iworkBuf = new int[8*std::min(m, n)];
+    cv::AutoBuffer<int> iworkBuf_(8 * std::min(m, n));
+#endif
+    auto iworkBuf = iworkBuf_.data();
     fptype work1 = 0;
 
     //A already transposed and m>=n
@@ -237,9 +272,9 @@ lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype
     }
 
     if(typeid(fptype) == typeid(float))
-        OCV_LAPACK_FUNC(sgesdd)(mode, &m, &n, (float*)a, &lda, (float*)w, (float*)u, &ldu, (float*)vt, &ldv, (float*)&work1, &lwork, iworkBuf, info);
+        OCV_LAPACK_FUNC(sgesdd)(mode, &_m, &_n, (float*)a, &lda, (float*)w, (float*)u, &ldu, (float*)vt, &ldv, (float*)&work1, &lwork, iworkBuf, _info);
     else if(typeid(fptype) == typeid(double))
-        OCV_LAPACK_FUNC(dgesdd)(mode, &m, &n, (double*)a, &lda, (double*)w, (double*)u, &ldu, (double*)vt, &ldv, (double*)&work1, &lwork, iworkBuf, info);
+        OCV_LAPACK_FUNC(dgesdd)(mode, &_m, &_n, (double*)a, &lda, (double*)w, (double*)u, &ldu, (double*)vt, &ldv, (double*)&work1, &lwork, iworkBuf, _info);
 
     lwork = (int)round(work1); //optimal buffer size
     fptype* buffer = new fptype[lwork + 1];
@@ -250,9 +285,13 @@ lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype
     CV_ANNOTATE_MEMORY_IS_INITIALIZED(buffer, sizeof(fptype) * (lwork + 1));
 
     if(typeid(fptype) == typeid(float))
-        OCV_LAPACK_FUNC(sgesdd)(mode, &m, &n, (float*)a, &lda, (float*)w, (float*)u, &ldu, (float*)vt, &ldv, (float*)buffer, &lwork, iworkBuf, info);
+        OCV_LAPACK_FUNC(sgesdd)(mode, &_m, &_n, (float*)a, &lda, (float*)w, (float*)u, &ldu, (float*)vt, &ldv, (float*)buffer, &lwork, iworkBuf, _info);
     else if(typeid(fptype) == typeid(double))
-        OCV_LAPACK_FUNC(dgesdd)(mode, &m, &n, (double*)a, &lda, (double*)w, (double*)u, &ldu, (double*)vt, &ldv, (double*)buffer, &lwork, iworkBuf, info);
+        OCV_LAPACK_FUNC(dgesdd)(mode, &_m, &_n, (double*)a, &lda, (double*)w, (double*)u, &ldu, (double*)vt, &ldv, (double*)buffer, &lwork, iworkBuf, _info);
+
+#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64)
+    *info = static_cast<int>(_info[0]);
+#endif
 
     // Make sure MSAN sees the memory as having been written.
     // MSAN does not think it has been written because a different language was called.
@@ -275,7 +314,6 @@ lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype
         delete[] u;
     }
 
-    delete[] iworkBuf;
     delete[] buffer;
     return CV_HAL_ERROR_OK;
 }
@@ -283,14 +321,27 @@ lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype
 template <typename fptype> static inline int
 lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_step, fptype* dst, int* info)
 {
+#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64)
+    long _m = static_cast<long>(m), _n = static_cast<long>(n), _k = static_cast<long>(k);
+    long _info[1];
+    long lda = (long)(a_step / sizeof(fptype));
+    long lwork = -1;
+    long ldtmpA;
+#else
+    int _m = m, _n = n, _k = k;
+    int* _info = info;
     int lda = (int)(a_step / sizeof(fptype));
+    int lwork = -1;
+    int ldtmpA;
+#endif
+
     char mode[] = { 'N', '\0' };
     if(m < n)
         return CV_HAL_ERROR_NOT_IMPLEMENTED;
 
     std::vector<fptype> tmpAMemHolder;
     fptype* tmpA;
-    int ldtmpA;
+
     if (m == n)
     {
         transpose_square_inplace(a, lda, m);
@@ -305,7 +356,6 @@ lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_ste
         transpose(a, lda, tmpA, m, m, n);
     }
 
-    int lwork = -1;
     fptype work1 = 0.;
 
     if (b)
@@ -313,18 +363,18 @@ lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_ste
         if (k == 1 && b_step == sizeof(fptype))
         {
             if (typeid(fptype) == typeid(float))
-                OCV_LAPACK_FUNC(sgels)(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)b, &m, (float*)&work1, &lwork, info);
+                OCV_LAPACK_FUNC(sgels)(mode, &_m, &_n, &_k, (float*)tmpA, &ldtmpA, (float*)b, &_m, (float*)&work1, &lwork, _info);
             else if (typeid(fptype) == typeid(double))
-                OCV_LAPACK_FUNC(dgels)(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)b, &m, (double*)&work1, &lwork, info);
+                OCV_LAPACK_FUNC(dgels)(mode, &_m, &_n, &_k, (double*)tmpA, &ldtmpA, (double*)b, &_m, (double*)&work1, &lwork, _info);
 
             lwork = cvRound(work1); //optimal buffer size
             std::vector<fptype> workBufMemHolder(lwork + 1);
             fptype* buffer = &workBufMemHolder.front();
 
             if (typeid(fptype) == typeid(float))
-                OCV_LAPACK_FUNC(sgels)(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)b, &m, (float*)buffer, &lwork, info);
+                OCV_LAPACK_FUNC(sgels)(mode, &_m, &_n, &_k, (float*)tmpA, &ldtmpA, (float*)b, &_m, (float*)buffer, &lwork, _info);
             else if (typeid(fptype) == typeid(double))
-                OCV_LAPACK_FUNC(dgels)(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)b, &m, (double*)buffer, &lwork, info);
+                OCV_LAPACK_FUNC(dgels)(mode, &_m, &_n, &_k, (double*)tmpA, &ldtmpA, (double*)b, &_m, (double*)buffer, &lwork, _info);
         }
         else
         {
@@ -334,18 +384,18 @@ lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_ste
             transpose(b, ldb, tmpB, m, m, k);
 
             if (typeid(fptype) == typeid(float))
-                OCV_LAPACK_FUNC(sgels)(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)tmpB, &m, (float*)&work1, &lwork, info);
+                OCV_LAPACK_FUNC(sgels)(mode, &_m, &_n, &_k, (float*)tmpA, &ldtmpA, (float*)tmpB, &_m, (float*)&work1, &lwork, _info);
             else if (typeid(fptype) == typeid(double))
-                OCV_LAPACK_FUNC(dgels)(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)tmpB, &m, (double*)&work1, &lwork, info);
+                OCV_LAPACK_FUNC(dgels)(mode, &_m, &_n, &_k, (double*)tmpA, &ldtmpA, (double*)tmpB, &_m, (double*)&work1, &lwork, _info);
 
             lwork = cvRound(work1); //optimal buffer size
             std::vector<fptype> workBufMemHolder(lwork + 1);
             fptype* buffer = &workBufMemHolder.front();
 
             if (typeid(fptype) == typeid(float))
-                OCV_LAPACK_FUNC(sgels)(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)tmpB, &m, (float*)buffer, &lwork, info);
+                OCV_LAPACK_FUNC(sgels)(mode, &_m, &_n, &_k, (float*)tmpA, &ldtmpA, (float*)tmpB, &_m, (float*)buffer, &lwork, _info);
             else if (typeid(fptype) == typeid(double))
-                OCV_LAPACK_FUNC(dgels)(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)tmpB, &m, (double*)buffer, &lwork, info);
+                OCV_LAPACK_FUNC(dgels)(mode, &_m, &_n, &_k, (double*)tmpA, &ldtmpA, (double*)tmpB, &_m, (double*)buffer, &lwork, _info);
 
             transpose(tmpB, m, b, ldb, k, m);
         }
@@ -353,18 +403,18 @@ lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_ste
     else
     {
         if (typeid(fptype) == typeid(float))
-            sgeqrf_(&m, &n, (float*)tmpA, &ldtmpA, (float*)dst, (float*)&work1, &lwork, info);
+            sgeqrf_(&_m, &_n, (float*)tmpA, &ldtmpA, (float*)dst, (float*)&work1, &lwork, _info);
         else if (typeid(fptype) == typeid(double))
-            dgeqrf_(&m, &n, (double*)tmpA, &ldtmpA, (double*)dst, (double*)&work1, &lwork, info);
+            dgeqrf_(&_m, &_n, (double*)tmpA, &ldtmpA, (double*)dst, (double*)&work1, &lwork, _info);
 
         lwork = cvRound(work1); //optimal buffer size
         std::vector<fptype> workBufMemHolder(lwork + 1);
         fptype* buffer = &workBufMemHolder.front();
 
         if (typeid(fptype) == typeid(float))
-            sgeqrf_(&m, &n, (float*)tmpA, &ldtmpA, (float*)dst, (float*)buffer, &lwork, info);
+            sgeqrf_(&_m, &_n, (float*)tmpA, &ldtmpA, (float*)dst, (float*)buffer, &lwork, _info);
         else if (typeid(fptype) == typeid(double))
-            dgeqrf_(&m, &n, (double*)tmpA, &ldtmpA, (double*)dst, (double*)buffer, &lwork, info);
+            dgeqrf_(&_m, &_n, (double*)tmpA, &ldtmpA, (double*)dst, (double*)buffer, &lwork, _info);
     }
 
     CV_ANNOTATE_MEMORY_IS_INITIALIZED(info, sizeof(int));
@@ -373,6 +423,10 @@ lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_ste
     else
         transpose(tmpA, m, a, lda, n, m);
 
+#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64)
+    *info = static_cast<int>(_info[0]);
+#endif
+
     if (*info != 0)
         *info = 0;
     else
@@ -457,7 +511,6 @@ lapack_gemm(const fptype *src1, size_t src1_step, const fptype *src2, size_t src
     return CV_HAL_ERROR_OK;
 }
 
-
 template <typename fptype> static inline int
 lapack_gemm_c(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha,
             const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags)
@@ -528,10 +581,29 @@ lapack_gemm_c(const fptype *src1, size_t src1_step, const fptype *src2, size_t s
     else if(src3_step == 0 && beta != 0.0)
         set_value((std::complex<fptype>*)dst, lddst, std::complex<fptype>(0.0, 0.0), d_m, d_n);
 
+    // FIXME: this is a workaround. Support ILP64 in HAL API.
+#if defined (ACCELERATE_NEW_LAPACK) && defined (ACCELERATE_LAPACK_ILP64)
+    int M = a_m, N = d_n, K = a_n;
+    if(typeid(fptype) == typeid(float)) {
+        auto src1_cast = (std::complex<float>*)(src1);
+        auto src2_cast = (std::complex<float>*)(src2);
+        auto dst_cast = (std::complex<float>*)(dst);
+        long lda = ldsrc1, ldb = ldsrc2, ldc = lddst;
+        cblas_cgemm(CblasRowMajor, transA, transB, M, N, K, (std::complex<float>*)&cAlpha, src1_cast, lda, src2_cast, ldb, (std::complex<float>*)&cBeta, dst_cast, ldc);
+    }
+    else if(typeid(fptype) == typeid(double)) {
+        auto src1_cast = (std::complex<double>*)(src1);
+        auto src2_cast = (std::complex<double>*)(src2);
+        auto dst_cast = (std::complex<double>*)(dst);
+        long lda = ldsrc1, ldb = ldsrc2, ldc = lddst;
+        cblas_zgemm(CblasRowMajor, transA, transB, M, N, K, (std::complex<double>*)&cAlpha, src1_cast, lda, src2_cast, ldb, (std::complex<double>*)&cBeta, dst_cast, ldc);
+    }
+#else
     if(typeid(fptype) == typeid(float))
         cblas_cgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, (float*)reinterpret_cast<fptype(&)[2]>(cAlpha), (float*)src1, ldsrc1, (float*)src2, ldsrc2, (float*)reinterpret_cast<fptype(&)[2]>(cBeta), (float*)dst, lddst);
     else if(typeid(fptype) == typeid(double))
         cblas_zgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, (double*)reinterpret_cast<fptype(&)[2]>(cAlpha), (double*)src1, ldsrc1, (double*)src2, ldsrc2, (double*)reinterpret_cast<fptype(&)[2]>(cBeta), (double*)dst, lddst);
+#endif
 
     return CV_HAL_ERROR_OK;
 }
diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp
index 25acff662ca1..f78608dbade7 100644
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -69,10 +69,14 @@
 /**
 Add: _dst[i] = src1[i] + src2[i]_ @n
 Sub: _dst[i] = src1[i] - src2[i]_
-@param src1_data,src1_step first source image data and step
-@param src2_data,src2_step second source image data and step
-@param dst_data,dst_step destination image data and step
-@param width,height dimensions of the images
+@param src1_data first source image data
+@param src1_step first source image step
+@param src2_data second source image data
+@param src2_step second source image step
+@param dst_data destination image data
+@param dst_step destination image step
+@param width width of the images
+@param height height of the images
 */
 //! @addtogroup core_hal_interface_addsub Element-wise add and subtract
 //! @{
@@ -91,15 +95,22 @@ inline int hal_ni_sub16s(const short *src1_data, size_t src1_step, const short *
 inline int hal_ni_sub32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_sub32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_sub64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+inline int hal_ni_sub8u32f(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub8s32f(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 //! @}
 
 /**
 Minimum: _dst[i] = min(src1[i], src2[i])_ @n
 Maximum: _dst[i] = max(src1[i], src2[i])_
-@param src1_data,src1_step first source image data and step
-@param src2_data,src2_step second source image data and step
-@param dst_data,dst_step destination image data and step
-@param width,height dimensions of the images
+@param src1_data first source image data
+@param src1_step first source image step
+@param src2_data second source image data
+@param src2_step second source image step
+@param dst_data destination image data
+@param dst_step destination image step
+@param width width of the images
+@param height height of the images
 */
 //! @addtogroup core_hal_interface_minmax Element-wise minimum or maximum
 //! @{
@@ -122,11 +133,14 @@ inline int hal_ni_min64f(const double *src1_data, size_t src1_step, const double
 
 /**
 Absolute difference: _dst[i] = | src1[i] - src2[i] |_
-@param src1_data,src1_step first source image data and step
-@param src2_data,src2_step second source image data and step
-@param dst_data,dst_step destination image data and step
-@param width,height dimensions of the images
-@param scale additional multiplier
+@param src1_data first source image data
+@param src1_step first source image step
+@param src2_data second source image data
+@param src2_step second source image step
+@param dst_data destination image data
+@param dst_step destination image step
+@param width width of the images
+@param height height of the images
 */
 //! @addtogroup core_hal_interface_absdiff Element-wise absolute difference
 //! @{
@@ -144,10 +158,14 @@ Bitwise AND: _dst[i] = src1[i] & src2[i]_ @n
 Bitwise OR: _dst[i] = src1[i] | src2[i]_ @n
 Bitwise XOR: _dst[i] = src1[i] ^ src2[i]_ @n
 Bitwise NOT: _dst[i] = !src[i]_
-@param src1_data,src1_step first source image data and step
-@param src2_data,src2_step second source image data and step
-@param dst_data,dst_step destination image data and step
-@param width,height dimensions of the images
+@param src1_data first source image data
+@param src1_step first source image step
+@param src2_data second source image data
+@param src2_step second source image step
+@param dst_data destination image data
+@param dst_step destination image step
+@param width width of the images
+@param height height of the images
  */
 //! @addtogroup core_hal_interface_logical Bitwise logical operations
 //! @{
@@ -172,6 +190,8 @@ inline int hal_ni_not8u(const uchar *src_data, size_t src_step, uchar *dst_data,
 #define cv_hal_sub32s hal_ni_sub32s
 #define cv_hal_sub32f hal_ni_sub32f
 #define cv_hal_sub64f hal_ni_sub64f
+#define cv_hal_sub8u32f hal_ni_sub8u32f
+#define cv_hal_sub8s32f hal_ni_sub8s32f
 #define cv_hal_max8u hal_ni_max8u
 #define cv_hal_max8s hal_ni_max8s
 #define cv_hal_max16u hal_ni_max16u
@@ -199,12 +219,72 @@ inline int hal_ni_not8u(const uchar *src_data, size_t src_step, uchar *dst_data,
 #define cv_hal_not8u hal_ni_not8u
 //! @endcond
 
+/**
+Lookup table replacement
+Table consists of 256 elements of a size from 1 to 8 bytes having 1 channel or src_channels
+For 8s input type 128 is added to LUT index
+Destination should have the same element type and number of channels as lookup table elements
+@param src_data Source image data
+@param src_step Source image step
+@param src_type Sorce image type
+@param lut_data Pointer to lookup table
+@param lut_channel_size Size of each channel in bytes
+@param lut_channels Number of channels in lookup table
+@param dst_data Destination data
+@param dst_step Destination step
+@param width Width of images
+@param height Height of images
+@sa LUT
+*/
+//! @addtogroup core_hal_interface_lut Lookup table
+//! @{
+inline int hal_ni_lut(const uchar *src_data, size_t src_step, size_t src_type, const uchar* lut_data, size_t lut_channel_size, size_t lut_channels, uchar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+//! @cond IGNORED
+#define cv_hal_lut hal_ni_lut
+//! @endcond
+
+/**
+Hamming norm of a vector
+@param a pointer to vector data
+@param n length of a vector
+@param cellSize how many bits of the vector will be added and treated as a single bit, can be 1 (standard Hamming distance), 2 or 4
+@param result pointer to result output
+*/
+//! @addtogroup core_hal_interface_hamming Hamming distance
+//! @{
+inline int hal_ni_normHamming8u(const uchar* a, int n, int cellSize, int* result) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+/**
+Hamming distance between two vectors
+@param a pointer to first vector data
+@param b pointer to second vector data
+@param n length of vectors
+@param cellSize how many bits of the vectors will be added and treated as a single bit, can be 1 (standard Hamming distance), 2 or 4
+@param result pointer to result output
+*/
+//! @addtogroup core_hal_interface_hamming Hamming distance
+//! @{
+inline int hal_ni_normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+//! @cond IGNORED
+#define cv_hal_normHamming8u hal_ni_normHamming8u
+#define cv_hal_normHammingDiff8u hal_ni_normHammingDiff8u
+//! @endcond
+
 /**
 Compare: _dst[i] = src1[i] op src2[i]_
-@param src1_data,src1_step first source image data and step
-@param src2_data,src2_step second source image data and step
-@param dst_data,dst_step destination image data and step
-@param width,height dimensions of the images
+@param src1_data first source image data
+@param src1_step first source image step
+@param src2_data second source image data
+@param src2_step second source image step
+@param dst_data destination image data
+@param dst_step destination image step
+@param width width of the images
+@param height height of the images
 @param operation one of (CV_HAL_CMP_EQ, CV_HAL_CMP_GT, ...)
 */
 //! @addtogroup core_hal_interface_compare Element-wise compare
@@ -230,10 +310,14 @@ inline int hal_ni_cmp64f(const double *src1_data, size_t src1_step, const double
 
 /**
 Multiply: _dst[i] = scale * src1[i] * src2[i]_
-@param src1_data,src1_step first source image data and step
-@param src2_data,src2_step second source image data and step
-@param dst_data,dst_step destination image data and step
-@param width,height dimensions of the images
+@param src1_data first source image data
+@param src1_step first source image step
+@param src2_data second source image data
+@param src2_step second source image step
+@param dst_data destination image data
+@param dst_step destination image step
+@param width width of the images
+@param height height of the images
 @param scale additional multiplier
 */
 //! @addtogroup core_hal_interface_multiply Element-wise multiply
@@ -245,14 +329,20 @@ inline int hal_ni_mul16s(const short *src1_data, size_t src1_step, const short *
 inline int hal_ni_mul32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_mul32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul8u16u(const uchar* src1_data, size_t src1_step, const uchar* src2_data, size_t src2_step, ushort* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul8s16s(const schar* src1_data, size_t src1_step, const schar* src2_data, size_t src2_step, short* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 //! @}
 
 /**
 Divide: _dst[i] = scale * src1[i] / src2[i]_
-@param src1_data,src1_step first source image data and step
-@param src2_data,src2_step second source image data and step
-@param dst_data,dst_step destination image data and step
-@param width,height dimensions of the images
+@param src1_data first source image data and step
+@param src1_step first source image data and step
+@param src2_data second source image data and step
+@param src2_step second source image data and step
+@param dst_data destination image data and step
+@param dst_step destination image data and step
+@param width dimensions of the images
+@param height dimensions of the images
 @param scale additional multiplier
 */
 //! @addtogroup core_hal_interface_divide Element-wise divide
@@ -268,9 +358,12 @@ inline int hal_ni_div64f(const double *src1_data, size_t src1_step, const double
 
 /**
 Computes reciprocial: _dst[i] = scale / src[i]_
-@param src_data,src_step source image data and step
-@param dst_data,dst_step destination image data and step
-@param width,height dimensions of the images
+@param src_data source image data
+@param src_step source image step
+@param dst_data destination image data
+@param dst_step destination image step
+@param width width of the images
+@param height height of the images
 @param scale additional multiplier
  */
 //! @addtogroup core_hal_interface_reciprocial Element-wise reciprocial
@@ -292,6 +385,8 @@ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_
 #define cv_hal_mul32s hal_ni_mul32s
 #define cv_hal_mul32f hal_ni_mul32f
 #define cv_hal_mul64f hal_ni_mul64f
+#define cv_hal_mul8u16u hal_ni_mul8u16u
+#define cv_hal_mul8s16s hal_ni_mul8s16s
 #define cv_hal_div8u hal_ni_div8u
 #define cv_hal_div8s hal_ni_div8s
 #define cv_hal_div16u hal_ni_div16u
@@ -310,10 +405,14 @@ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_
 
 /**
 Computes weighted sum of two arrays using formula: _dst[i] = a * src1[i] + b * src2[i] + c_
-@param src1_data,src1_step first source image data and step
-@param src2_data,src2_step second source image data and step
-@param dst_data,dst_step destination image data and step
-@param width,height dimensions of the images
+@param src1_data first source image data
+@param src1_step first source image step
+@param src2_data second source image data
+@param src2_step second source image step
+@param dst_data destination image data
+@param dst_step destination image step
+@param width width of the images
+@param height height of the images
 @param scalars numbers _a_, _b_, and _c_
  */
 //! @addtogroup core_hal_interface_addWeighted Element-wise weighted sum
@@ -379,9 +478,28 @@ inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int
 #define cv_hal_merge64s hal_ni_merge64s
 //! @endcond
 
+/**
+@param x source X arrays
+@param y source Y arrays
+@param mag destination magnitude array
+@param angle destination angle array
+@param len length of arrays
+@param angleInDegrees if set to true return angles in degrees, otherwise in radians
+*/
+//! @addtogroup core_hal_interface_fastAtan Atan calculation
+//! @{
+inline int hal_ni_cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int len, bool angleInDegrees) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int len, bool angleInDegrees) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+//! @cond IGNORED
+#define cv_hal_cartToPolar32f hal_ni_cartToPolar32f
+#define cv_hal_cartToPolar64f hal_ni_cartToPolar64f
+//! @endcond
 
 /**
-@param y,x source Y and X arrays
+@param y source Y arrays
+@param x source X arrays
 @param dst destination array
 @param len length of arrays
 @param angleInDegrees if set to true return angles in degrees, otherwise in radians
@@ -399,7 +517,8 @@ inline int hal_ni_fastAtan64f(const double* y, const double* x, double* dst, int
 
 
 /**
-@param x,y source X and Y arrays
+@param x source X array
+@param y source Y array
 @param dst destination array
 @param len length of arrays
  */
@@ -414,6 +533,24 @@ inline int hal_ni_magnitude64f(const double *x, const double  *y, double *dst, i
 #define cv_hal_magnitude64f hal_ni_magnitude64f
 //! @endcond
 
+/**
+@param mag source magnitude arrays
+@param mag source angle arrays
+@param x destination X array
+@param y destination Y array
+@param len length of arrays
+@param angleInDegrees if set to true interpret angles from degrees, otherwise from radians
+*/
+//! @addtogroup core_hal_interface_fastAtan Atan calculation
+//! @{
+inline int hal_ni_polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_polarToCart64f(const double* mag, const double* angle, double* x, double* y, int len, bool angleInDegrees) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+//! @cond IGNORED
+#define cv_hal_polarToCart32f hal_ni_polarToCart32f
+#define cv_hal_polarToCart64f hal_ni_polarToCart64f
+//! @endcond
 
 /**
 @param src source array
@@ -530,7 +667,8 @@ inline int hal_ni_dftFree1D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME
 
 /**
 @param context double pointer to context storing all necessary data
-@param width,height image dimensions
+@param width image width
+@param height image height
 @param depth image type (CV_32F or CV_64F)
 @param src_channels number of channels in input image
 @param dst_channels number of channels in output image
@@ -540,8 +678,10 @@ inline int hal_ni_dftFree1D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME
 inline int hal_ni_dftInit2D(cvhalDFT **context, int width, int height, int depth, int src_channels, int dst_channels, int flags, int nonzero_rows) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 /**
 @param context pointer to context storing all necessary data
-@param src_data,src_step source image data and step
-@param dst_data,dst_step destination image data and step
+@param src_data source image data
+@param src_step source image step
+@param dst_data destination image data
+@param dst_step destination image step
  */
 inline int hal_ni_dft2D(cvhalDFT *context, const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 /**
@@ -557,15 +697,18 @@ inline int hal_ni_dftFree2D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME
 
 /**
 @param context double pointer to context storing all necessary data
-@param width,height image dimensions
+@param width image width
+@param height image height
 @param depth image type (CV_32F or CV_64F)
 @param flags algorithm options (combination of CV_HAL_DFT_INVERSE, ...)
  */
 inline int hal_ni_dctInit2D(cvhalDFT **context, int width, int height, int depth, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 /**
 @param context pointer to context storing all necessary data
-@param src_data,src_step source image data and step
-@param dst_data,dst_step destination image data and step
+@param src_data source image data
+@param src_step source image step
+@param dst_data destination image data
+@param dst_step destination image step
  */
 inline int hal_ni_dct2D(cvhalDFT *context, const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 /**
@@ -717,11 +860,15 @@ inline int hal_ni_gemm64fc(const double* src1, size_t src1_step, const double* s
 
 /**
    @brief Finds the global minimum and maximum in an array.
-   @param src_data,src_step Source image
-   @param width,height Source image dimensions
+   @param src_data Source image
+   @param src_step Source image
+   @param width Source image dimensions
+   @param height Source image dimensions
    @param depth Depth of source image
-   @param minVal,maxVal Pointer to the returned global minimum and maximum in an array.
-   @param minIdx,maxIdx Pointer to the returned minimum and maximum location.
+   @param minVal Pointer to the returned global minimum and maximum in an array.
+   @param maxVal Pointer to the returned global minimum and maximum in an array.
+   @param minIdx Pointer to the returned minimum and maximum location.
+   @param maxIdx Pointer to the returned minimum and maximum location.
    @param mask Specified array region.
 */
 inline int hal_ni_minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, double* minVal, double* maxVal,
@@ -731,6 +878,81 @@ inline int hal_ni_minMaxIdx(const uchar* src_data, size_t src_step, int width, i
 #define cv_hal_minMaxIdx hal_ni_minMaxIdx
 //! @endcond
 
+/**
+   @brief calculates the mean and the standard deviation of array elements independently for each channel
+   @param src_data Source image
+   @param src_step Source image
+   @param width Source image dimensions
+   @param height Source image dimensions
+   @param src_type Type of source image
+   @param mean_val Array of per-channel mean values. May be nullptr, if mean value is not required.
+   @param stddev_val Array of per-channel standard deviation values. May be nullptr, if stddev value is not required.
+   @param mask Specified array region.
+   @param mask_step Mask array step.
+   @sa meanStdDev
+*/
+inline int hal_ni_meanStdDev(const uchar* src_data, size_t src_step, int width, int height,
+                             int src_type, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step)
+{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_meanStdDev hal_ni_meanStdDev
+//! @endcond
+
+/**
+   @brief hal_flip
+   @param src_type source and destination image type
+   @param src_data source image data
+   @param src_step source image step
+   @param src_width source and destination image width
+   @param src_height source and destination image height
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param flip_mode 0 flips around x-axis, positive around y-axis, negative both
+ */
+inline int hal_ni_flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
+                       uchar* dst_data, size_t dst_step, int flip_mode) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_flip hal_ni_flip
+//! @endcond
+
+/**
+   @brief rotate90
+   @param src_type source and destination image type
+   @param src_data source image data
+   @param src_step source image step
+   @param src_width source image width
+   If angle has value [180] it is also destination image width
+   If angle has values [90, 270] it is also destination image height
+   @param src_height source and destination image height (destination image width for angles [90, 270])
+   If angle has value [180] it is also destination image height
+   If angle has values [90, 270] it is also destination image width
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param angle clockwise angle for rotation in degrees from set [90, 180, 270]
+ */
+inline int hal_ni_rotate90(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
+                           uchar* dst_data, size_t dst_step, int angle) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_rotate90 hal_ni_rotate90
+//! @endcond
+
+/**
+   @brief Transpose2d
+   @param src_data,src_step Source image
+   @param dst_data,dst_step Destination image
+   @param src_width,src_height Source image dimensions
+   @param element_size Size of an element in bytes
+*/
+inline int hal_ni_transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int src_width,
+                              int src_height, int element_size) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_transpose2d hal_ni_transpose2d
+//! @endcond
+
 //! @}
 
 
diff --git a/modules/core/src/has_non_zero.simd.hpp b/modules/core/src/has_non_zero.simd.hpp
index 6ea8bcd7d2d1..29a1de0113cd 100644
--- a/modules/core/src/has_non_zero.simd.hpp
+++ b/modules/core/src/has_non_zero.simd.hpp
@@ -87,11 +87,11 @@ static bool hasNonZero8u( const uchar* src, size_t len )
 {
     bool res = false;
     const uchar* srcEnd = src+len;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     typedef v_uint8 v_type;
     const v_type v_zero = vx_setzero_u8();
     constexpr const int unrollCount = 2;
-    int step = v_type::nlanes * unrollCount;
+    int step = VTraits<v_type>::vlanes() * unrollCount;
     int len0 = len & -step;
     const uchar* srcSimdEnd = src+len0;
 
@@ -99,10 +99,10 @@ static bool hasNonZero8u( const uchar* src, size_t len )
     while(!res && countSIMD--)
     {
         v_type v0 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v1 = vx_load(src);
-        src += v_type::nlanes;
-        res = v_check_any(((v0 | v1) != v_zero));
+        src += VTraits<v_type>::vlanes();
+        res = v_check_any((v_ne(v_or(v0, v1), v_zero)));
     }
 
     v_cleanup();
@@ -114,11 +114,11 @@ static bool hasNonZero16u( const ushort* src, size_t len )
 {
     bool res = false;
     const ushort* srcEnd = src+len;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     typedef v_uint16 v_type;
     const v_type v_zero = vx_setzero_u16();
     constexpr const int unrollCount = 4;
-    int step = v_type::nlanes * unrollCount;
+    int step = VTraits<v_type>::vlanes() * unrollCount;
     int len0 = len & -step;
     const ushort* srcSimdEnd = src+len0;
 
@@ -126,16 +126,16 @@ static bool hasNonZero16u( const ushort* src, size_t len )
     while(!res && countSIMD--)
     {
         v_type v0 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v1 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v2 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v3 = vx_load(src);
-        src += v_type::nlanes;
-        v0 |= v1;
-        v2 |= v3;
-        res = v_check_any(((v0 | v2) != v_zero));
+        src += VTraits<v_type>::vlanes();
+        v0 = v_or(v0, v1);
+        v2 = v_or(v2, v3);
+        res = v_check_any((v_ne(v_or(v0, v2), v_zero)));
     }
 
     v_cleanup();
@@ -147,11 +147,11 @@ static bool hasNonZero32s( const int* src, size_t len )
 {
     bool res = false;
     const int* srcEnd = src+len;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     typedef v_int32 v_type;
     const v_type v_zero = vx_setzero_s32();
     constexpr const int unrollCount = 8;
-    int step = v_type::nlanes * unrollCount;
+    int step = VTraits<v_type>::vlanes() * unrollCount;
     int len0 = len & -step;
     const int* srcSimdEnd = src+len0;
 
@@ -159,29 +159,29 @@ static bool hasNonZero32s( const int* src, size_t len )
     while(!res && countSIMD--)
     {
         v_type v0 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v1 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v2 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v3 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v4 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v5 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v6 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v7 = vx_load(src);
-        src += v_type::nlanes;
-        v0 |= v1;
-        v2 |= v3;
-        v4 |= v5;
-        v6 |= v7;
-
-        v0 |= v2;
-        v4 |= v6;
-        res = v_check_any(((v0 | v4) != v_zero));
+        src += VTraits<v_type>::vlanes();
+        v0 = v_or(v0, v1);
+        v2 = v_or(v2, v3);
+        v4 = v_or(v4, v5);
+        v6 = v_or(v6, v7);
+
+        v0 = v_or(v0, v2);
+        v4 = v_or(v4, v6);
+        res = v_check_any((v_ne(v_or(v0, v4), v_zero)));
     }
 
     v_cleanup();
@@ -193,11 +193,11 @@ static bool hasNonZero32f( const float* src, size_t len )
 {
     bool res = false;
     const float* srcEnd = src+len;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     typedef v_float32 v_type;
     const v_type v_zero = vx_setzero_f32();
     constexpr const int unrollCount = 8;
-    int step = v_type::nlanes * unrollCount;
+    int step = VTraits<v_type>::vlanes() * unrollCount;
     int len0 = len & -step;
     const float* srcSimdEnd = src+len0;
 
@@ -205,30 +205,30 @@ static bool hasNonZero32f( const float* src, size_t len )
     while(!res && countSIMD--)
     {
         v_type v0 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v1 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v2 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v3 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v4 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v5 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v6 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v7 = vx_load(src);
-        src += v_type::nlanes;
-        v0 |= v1;
-        v2 |= v3;
-        v4 |= v5;
-        v6 |= v7;
-
-        v0 |= v2;
-        v4 |= v6;
+        src += VTraits<v_type>::vlanes();
+        v0 = v_or(v0, v1);
+        v2 = v_or(v2, v3);
+        v4 = v_or(v4, v5);
+        v6 = v_or(v6, v7);
+
+        v0 = v_or(v0, v2);
+        v4 = v_or(v4, v6);
         //res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
-        res = !v_check_all(((v0 | v4) == v_zero));
+        res = !v_check_all((v_eq(v_or(v0, v4), v_zero)));
     }
 
     v_cleanup();
@@ -240,11 +240,11 @@ static bool hasNonZero64f( const double* src, size_t len )
 {
     bool res = false;
     const double* srcEnd = src+len;
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     typedef v_float64 v_type;
     const v_type v_zero = vx_setzero_f64();
     constexpr const int unrollCount = 16;
-    int step = v_type::nlanes * unrollCount;
+    int step = VTraits<v_type>::vlanes() * unrollCount;
     int len0 = len & -step;
     const double* srcSimdEnd = src+len0;
 
@@ -252,55 +252,55 @@ static bool hasNonZero64f( const double* src, size_t len )
     while(!res && countSIMD--)
     {
         v_type v0 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v1 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v2 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v3 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v4 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v5 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v6 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v7 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v8 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v9 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v10 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v11 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v12 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v13 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v14 = vx_load(src);
-        src += v_type::nlanes;
+        src += VTraits<v_type>::vlanes();
         v_type v15 = vx_load(src);
-        src += v_type::nlanes;
-        v0  |= v1;
-        v2  |= v3;
-        v4  |= v5;
-        v6  |= v7;
-        v8  |= v9;
-        v10 |= v11;
-        v12 |= v13;
-        v14 |= v15;
-
-        v0  |= v2;
-        v4  |= v6;
-        v8  |= v10;
-        v12 |= v14;
-
-        v0  |= v4;
-        v8  |= v12;
+        src += VTraits<v_type>::vlanes();
+        v0 = v_or(v0, v1);
+        v2 = v_or(v2, v3);
+        v4 = v_or(v4, v5);
+        v6 = v_or(v6, v7);
+        v8 = v_or(v8, v9);
+        v10 = v_or(v10, v11);
+        v12 = v_or(v12, v13);
+        v14 = v_or(v14, v15);
+
+        v0 = v_or(v0, v2);
+        v4 = v_or(v4, v6);
+        v8 = v_or(v8, v10);
+        v12 = v_or(v12, v14);
+
+        v0 = v_or(v0, v4);
+        v8 = v_or(v8, v12);
         //res = v_check_any(((v0 | v8) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
-        res = !v_check_all(((v0 | v8) == v_zero));
+        res = !v_check_all((v_eq(v_or(v0, v8), v_zero)));
     }
 
     v_cleanup();
@@ -310,7 +310,7 @@ static bool hasNonZero64f( const double* src, size_t len )
 
 HasNonZeroFunc getHasNonZeroTab(int depth)
 {
-    static HasNonZeroFunc hasNonZeroTab[] =
+    static HasNonZeroFunc hasNonZeroTab[CV_DEPTH_MAX] =
     {
         (HasNonZeroFunc)GET_OPTIMIZED(hasNonZero8u), (HasNonZeroFunc)GET_OPTIMIZED(hasNonZero8u),
         (HasNonZeroFunc)GET_OPTIMIZED(hasNonZero16u), (HasNonZeroFunc)GET_OPTIMIZED(hasNonZero16u),
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index a644fe15a7ae..83adeeabc39a 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -274,22 +274,21 @@ template<typename T> struct VBLAS
 {
     int dot(const T*, const T*, int, T*) const { return 0; }
     int givens(T*, T*, int, T, T) const { return 0; }
-    int givensx(T*, T*, int, T, T, T*, T*) const { return 0; }
 };
 
-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
 template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
 {
-    if( n < 2*v_float32::nlanes )
+    if( n < 2*VTraits<v_float32>::vlanes() )
         return 0;
     int k = 0;
     v_float32 s0 = vx_setzero_f32();
-    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
+    for( ; k <= n - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
     {
         v_float32 a0 = vx_load(a + k);
         v_float32 b0 = vx_load(b + k);
 
-        s0 += a0 * b0;
+        s0 = v_add(s0, v_mul(a0, b0));
     }
     *result = v_reduce_sum(s0);
     vx_cleanup();
@@ -299,16 +298,16 @@ template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, f
 
 template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, float s) const
 {
-    if( n < v_float32::nlanes)
+    if( n < VTraits<v_float32>::vlanes())
         return 0;
     int k = 0;
     v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
-    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
+    for( ; k <= n - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
     {
         v_float32 a0 = vx_load(a + k);
         v_float32 b0 = vx_load(b + k);
-        v_float32 t0 = (a0 * c4) + (b0 * s4);
-        v_float32 t1 = (b0 * c4) - (a0 * s4);
+        v_float32 t0 = v_add(v_mul(a0, c4), v_mul(b0, s4));
+        v_float32 t1 = v_sub(v_mul(b0, c4), v_mul(a0, s4));
         v_store(a + k, t0);
         v_store(b + k, t1);
     }
@@ -317,44 +316,19 @@ template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, f
 }
 
 
-template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c, float s,
-                                             float* anorm, float* bnorm) const
-{
-    if( n < v_float32::nlanes)
-        return 0;
-    int k = 0;
-    v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
-    v_float32 sa = vx_setzero_f32(), sb = vx_setzero_f32();
-    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
-    {
-        v_float32 a0 = vx_load(a + k);
-        v_float32 b0 = vx_load(b + k);
-        v_float32 t0 = (a0 * c4) + (b0 * s4);
-        v_float32 t1 = (b0 * c4) - (a0 * s4);
-        v_store(a + k, t0);
-        v_store(b + k, t1);
-        sa += t0 + t0;
-        sb += t1 + t1;
-    }
-    *anorm = v_reduce_sum(sa);
-    *bnorm = v_reduce_sum(sb);
-    vx_cleanup();
-    return k;
-}
-
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n, double* result) const
 {
-    if( n < 2*v_float64::nlanes )
+    if( n < 2*VTraits<v_float64>::vlanes() )
         return 0;
     int k = 0;
     v_float64 s0 = vx_setzero_f64();
-    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
+    for( ; k <= n - VTraits<v_float64>::vlanes(); k += VTraits<v_float64>::vlanes() )
     {
         v_float64 a0 = vx_load(a + k);
         v_float64 b0 = vx_load(b + k);
 
-        s0 += a0 * b0;
+        s0 = v_add(s0, v_mul(a0, b0));
     }
     double sbuf[2];
     v_store(sbuf, s0);
@@ -368,12 +342,12 @@ template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double
 {
     int k = 0;
     v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
-    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
+    for( ; k <= n - VTraits<v_float64>::vlanes(); k += VTraits<v_float64>::vlanes() )
     {
         v_float64 a0 = vx_load(a + k);
         v_float64 b0 = vx_load(b + k);
-        v_float64 t0 = (a0 * c2) + (b0 * s2);
-        v_float64 t1 = (b0 * c2) - (a0 * s2);
+        v_float64 t0 = v_add(v_mul(a0, c2), v_mul(b0, s2));
+        v_float64 t1 = v_sub(v_mul(b0, c2), v_mul(a0, s2));
         v_store(a + k, t0);
         v_store(b + k, t1);
     }
@@ -382,30 +356,6 @@ template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double
 }
 
 
-template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double c, double s,
-                                              double* anorm, double* bnorm) const
-{
-    int k = 0;
-    v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
-    v_float64 sa = vx_setzero_f64(), sb = vx_setzero_f64();
-    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
-    {
-        v_float64 a0 = vx_load(a + k);
-        v_float64 b0 = vx_load(b + k);
-        v_float64 t0 = (a0 * c2) + (b0 * s2);
-        v_float64 t1 = (b0 * c2) - (a0 * s2);
-        v_store(a + k, t0);
-        v_store(b + k, t1);
-        sa += t0 * t0;
-        sb += t1 * t1;
-    }
-    double abuf[2], bbuf[2];
-    v_store(abuf, sa);
-    v_store(bbuf, sb);
-    *anorm = abuf[0] + abuf[1];
-    *bnorm = bbuf[0] + bbuf[1];
-    return k;
-}
 #endif //CV_SIMD_64F
 #endif //CV_SIMD
 
@@ -916,7 +866,7 @@ double invert( InputArray _src, OutputArray _dst, int method )
                 #if CV_SIMD128
                     const float d_32f = (float)d;
                     const v_float32x4 d_vec(d_32f, -d_32f, -d_32f, d_32f);
-                    v_float32x4 s0 = v_load_halves((const float*)srcdata, (const float*)(srcdata + srcstep)) * d_vec;//0123//3120
+                    v_float32x4 s0 = v_mul(v_load_halves((const float *)srcdata, (const float *)(srcdata + srcstep)), d_vec);//0123//3120
                     s0 = v_extract<3>(s0, v_combine_low(v_rotate_right<1>(s0), s0));
                     v_store_low((float*)dstdata, s0);
                     v_store_high((float*)(dstdata + dststep), s0);
@@ -942,10 +892,10 @@ double invert( InputArray _src, OutputArray _dst, int method )
                     d = 1./d;
                 #if CV_SIMD128_64F
                     v_float64x2 det = v_setall_f64(d);
-                    v_float64x2 s0 = v_load((const double*)srcdata) * det;
-                    v_float64x2 s1 = v_load((const double*)(srcdata+srcstep)) * det;
+                    v_float64x2 s0 = v_mul(v_load((const double *)srcdata), det);
+                    v_float64x2 s1 = v_mul(v_load((const double *)(srcdata + srcstep)), det);
                     v_float64x2 sm = v_extract<1>(s1, s0);//30
-                    v_float64x2 ss = v_setall<double>(0) - v_extract<1>(s0, s1);//12
+                    v_float64x2 ss = v_sub(v_setall<double>(0), v_extract<1>(s0, s1));//12
                     v_store((double*)dstdata, v_combine_low(sm, ss));//31
                     v_store((double*)(dstdata + dststep), v_combine_high(ss, sm));//20
                 #else
@@ -1241,7 +1191,7 @@ bool solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int method )
     Mat dst = _dst.getMat();
 
     if( m < n )
-        CV_Error(CV_StsBadArg, "The function can not solve under-determined linear systems" );
+        CV_Error(cv::Error::StsBadArg, "The function can not solve under-determined linear systems" );
 
     if( m == n )
         is_normal = false;
@@ -1565,7 +1515,7 @@ void SVD::backSubst( InputArray _w, InputArray _u, InputArray _vt,
                vt.ptr<double>(), vt.step, true, rhs.ptr<double>(), rhs.step, nb,
                dst.ptr<double>(), dst.step, buffer.data());
     else
-        CV_Error( CV_StsUnsupportedFormat, "" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "" );
 }
 
 
diff --git a/modules/core/src/lut.cpp b/modules/core/src/lut.cpp
index f5dc205082a0..031f113bec23 100644
--- a/modules/core/src/lut.cpp
+++ b/modules/core/src/lut.cpp
@@ -68,7 +68,7 @@ static void LUT8u_64f( const uchar* src, const double* lut, double* dst, int len
 
 typedef void (*LUTFunc)( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn );
 
-static LUTFunc lutTab[] =
+static LUTFunc lutTab[CV_DEPTH_MAX] =
 {
     (LUTFunc)LUT8u_8u, (LUTFunc)LUT8u_8s, (LUTFunc)LUT8u_16u, (LUTFunc)LUT8u_16s,
     (LUTFunc)LUT8u_32s, (LUTFunc)LUT8u_32f, (LUTFunc)LUT8u_64f, 0
@@ -330,7 +330,7 @@ class LUTParallelBody : public ParallelLoopBody
 
     void operator()( const cv::Range& range ) const CV_OVERRIDE
     {
-        CV_DbgAssert(*ok);
+        CV_Assert(*ok);
 
         const int row0 = range.start;
         const int row1 = range.end;
@@ -377,6 +377,9 @@ void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
     CV_OVX_RUN(!ovx::skipSmallImages<VX_KERNEL_TABLE_LOOKUP>(src.cols, src.rows),
                openvx_LUT(src, dst, lut))
 
+    CALL_HAL(LUT, cv_hal_lut, src.data, src.step, src.type(), lut.data,
+             lut.elemSize1(), lutcn, dst.data, dst.step, src.cols, src.rows);
+
 #if !IPP_DISABLE_PERF_LUT
     CV_IPP_RUN(_src.dims() <= 2, ipp_lut(src, lut, dst));
 #endif
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 056be63a716a..764d2d9b03d3 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -233,16 +233,26 @@ static bool ocl_cartToPolar( InputArray _src1, InputArray _src2,
             rowsPerWI = d.isIntel() ? 4 : 1;
     bool doubleSupport = d.doubleFPConfig() > 0;
 
+    const bool _src1IsDstMag = (_src1.getObj() == _dst1.getObj());
+    const bool _src1IsDstAngle = (_src1.getObj() == _dst2.getObj());
+    const bool _src2IsDstMag = (_src2.getObj() == _dst1.getObj());
+    const bool _src2IsDstAngle = (_src2.getObj() == _dst2.getObj());
+
     if ( !(_src1.dims() <= 2 && _src2.dims() <= 2 &&
            (depth == CV_32F || depth == CV_64F) && type == _src2.type()) ||
          (depth == CV_64F && !doubleSupport) )
         return false;
 
     ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D BINARY_OP -D dstT=%s -D DEPTH_dst=%d -D rowsPerWI=%d -D OP_CTP_%s%s",
+                  format("-D BINARY_OP -D dstT=%s -D DEPTH_dst=%d -D rowsPerWI=%d -D OP_CTP_%s%s%s%s%s%s",
                          ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), depth,
                          rowsPerWI, angleInDegrees ? "AD" : "AR",
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                         _src1IsDstMag   ? " -D SRC1_IS_DST_MAG" : "",
+                         _src1IsDstAngle ? " -D SRC1_IS_DST_ANGLE" : "",
+                         _src2IsDstMag   ? " -D SRC2_IS_DST_MAG" : "",
+                         _src2IsDstAngle ? " -D SRC2_IS_DST_ANGLE" : ""
+                         ));
     if (k.empty())
         return false;
 
@@ -254,8 +264,8 @@ static bool ocl_cartToPolar( InputArray _src1, InputArray _src2,
     _dst2.create(size, type);
     UMat dst1 = _dst1.getUMat(), dst2 = _dst2.getUMat();
 
-    k.args(ocl::KernelArg::ReadOnlyNoSize(src1),
-           ocl::KernelArg::ReadOnlyNoSize(src2),
+    k.args(_src1IsDstMag || _src1IsDstAngle ? ocl::KernelArg::ReadWriteNoSize(src1) : ocl::KernelArg::ReadOnlyNoSize(src1),
+           _src2IsDstMag || _src2IsDstAngle ? ocl::KernelArg::ReadWriteNoSize(src2) : ocl::KernelArg::ReadOnlyNoSize(src2),
            ocl::KernelArg::WriteOnly(dst1, cn),
            ocl::KernelArg::WriteOnlyNoSize(dst2));
 
@@ -270,8 +280,7 @@ void cartToPolar( InputArray src1, InputArray src2,
 {
     CV_INSTRUMENT_REGION();
 
-    CV_Assert(src1.getObj() != dst1.getObj() && src1.getObj() != dst2.getObj() &&
-              src2.getObj() != dst1.getObj() && src2.getObj() != dst2.getObj());
+    CV_Assert(dst1.getObj() != dst2.getObj());
 
     CV_OCL_RUN(dst1.isUMat() && dst2.isUMat(),
             ocl_cartToPolar(src1, src2, dst1, dst2, angleInDegrees))
@@ -298,15 +307,13 @@ void cartToPolar( InputArray src1, InputArray src2,
             {
                 const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
                 float *mag = (float*)ptrs[2], *angle = (float*)ptrs[3];
-                hal::magnitude32f( x, y, mag, len );
-                hal::fastAtan32f( y, x, angle, len, angleInDegrees );
+                hal::cartToPolar32f( x, y, mag, angle, len, angleInDegrees );
             }
             else
             {
                 const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
-                double *angle = (double*)ptrs[3];
-                hal::magnitude64f(x, y, (double*)ptrs[2], len);
-                hal::fastAtan64f(y, x, angle, len, angleInDegrees);
+                double *mag = (double*)ptrs[2], *angle = (double*)ptrs[3];
+                hal::cartToPolar64f(x, y, mag, angle, len, angleInDegrees);
             }
             ptrs[0] += len*esz1;
             ptrs[1] += len*esz1;
@@ -474,15 +481,24 @@ static bool ocl_polarToCart( InputArray _mag, InputArray _angle,
             rowsPerWI = d.isIntel() ? 4 : 1;
     bool doubleSupport = d.doubleFPConfig() > 0;
 
+    const bool _src1IsDstX = (_mag.getObj() == _dst1.getObj());
+    const bool _src1IsDstY = (_mag.getObj() == _dst2.getObj());
+    const bool _src2IsDstX = (_angle.getObj() == _dst1.getObj());
+    const bool _src2IsDstY = (_angle.getObj() == _dst2.getObj());
+
     if ( !doubleSupport && depth == CV_64F )
         return false;
 
     ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D dstT=%s -D DEPTH_dst=%d -D rowsPerWI=%d -D BINARY_OP -D OP_PTC_%s%s",
+                  format("-D dstT=%s -D DEPTH_dst=%d -D rowsPerWI=%d -D BINARY_OP -D OP_PTC_%s%s%s%s%s%s",
                          ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), depth,
                          rowsPerWI,
                          angleInDegrees ? "AD" : "AR",
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                         _src1IsDstX   ? " -D SRC1_IS_DST_X" : "",
+                         _src1IsDstY ? " -D SRC1_IS_DST_Y" : "",
+                         _src2IsDstX   ? " -D SRC2_IS_DST_X" : "",
+                         _src2IsDstY ? " -D SRC2_IS_DST_Y" : ""));
     if (k.empty())
         return false;
 
@@ -494,8 +510,10 @@ static bool ocl_polarToCart( InputArray _mag, InputArray _angle,
     _dst2.create(size, type);
     UMat dst1 = _dst1.getUMat(), dst2 = _dst2.getUMat();
 
-    k.args(ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::ReadOnlyNoSize(angle),
-           ocl::KernelArg::WriteOnly(dst1, cn), ocl::KernelArg::WriteOnlyNoSize(dst2));
+    k.args(_src1IsDstX || _src1IsDstY ? ocl::KernelArg::ReadWriteNoSize(mag) : ocl::KernelArg::ReadOnlyNoSize(mag),
+           _src2IsDstX || _src2IsDstY  ? ocl::KernelArg::ReadWriteNoSize(angle) : ocl::KernelArg::ReadOnlyNoSize(angle),
+           ocl::KernelArg::WriteOnly(dst1, cn),
+           ocl::KernelArg::WriteOnlyNoSize(dst2));
 
     size_t globalsize[2] = { (size_t)dst1.cols * cn, ((size_t)dst1.rows + rowsPerWI - 1) / rowsPerWI };
     return k.run(2, globalsize, NULL, false);
@@ -567,8 +585,13 @@ void polarToCart( InputArray src1, InputArray src2,
 {
     CV_INSTRUMENT_REGION();
 
-    CV_Assert(src1.getObj() != dst1.getObj() && src1.getObj() != dst2.getObj() &&
-              src2.getObj() != dst1.getObj() && src2.getObj() != dst2.getObj());
+    CV_Assert(dst1.getObj() != dst2.getObj());
+
+    const bool isInPlace =
+        (src1.getObj() == dst1.getObj()) ||
+        (src1.getObj() == dst2.getObj()) ||
+        (src2.getObj() == dst1.getObj()) ||
+        (src2.getObj() == dst2.getObj());
 
     int type = src2.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     CV_Assert((depth == CV_32F || depth == CV_64F) && (src1.empty() || src1.type() == type));
@@ -582,7 +605,7 @@ void polarToCart( InputArray src1, InputArray src2,
     dst2.create( Angle.dims, Angle.size, type );
     Mat X = dst1.getMat(), Y = dst2.getMat();
 
-    CV_IPP_RUN(!angleInDegrees, ipp_polarToCart(Mag, Angle, X, Y));
+    CV_IPP_RUN(!angleInDegrees && !isInPlace, ipp_polarToCart(Mag, Angle, X, Y));
 
     const Mat* arrays[] = {&Mag, &Angle, &X, &Y, 0};
     uchar* ptrs[4] = {};
@@ -592,7 +615,7 @@ void polarToCart( InputArray src1, InputArray src2,
     int j, k, total = (int)(it.size*cn), blockSize = std::min(total, ((BLOCK_SIZE+cn-1)/cn)*cn);
     size_t esz1 = Angle.elemSize1();
 
-    if( depth == CV_64F )
+    if (( depth == CV_64F ) || isInPlace)
     {
         _buf.allocate(blockSize*2);
         buf[0] = _buf.data();
@@ -604,7 +627,7 @@ void polarToCart( InputArray src1, InputArray src2,
         for( j = 0; j < total; j += blockSize )
         {
             int len = std::min(total - j, blockSize);
-            if( depth == CV_32F )
+            if (( depth == CV_32F ) && !isInPlace)
             {
                 const float *mag = (const float*)ptrs[0], *angle = (const float*)ptrs[1];
                 float *x = (float*)ptrs[2], *y = (float*)ptrs[3];
@@ -614,13 +637,13 @@ void polarToCart( InputArray src1, InputArray src2,
                 {
                     k = 0;
 
-#if CV_SIMD
-                    int cWidth = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                    int cWidth = VTraits<v_float32>::vlanes();
                     for( ; k <= len - cWidth; k += cWidth )
                     {
                         v_float32 v_m = vx_load(mag + k);
-                        v_store(x + k, vx_load(x + k) * v_m);
-                        v_store(y + k, vx_load(y + k) * v_m);
+                        v_store(x + k, v_mul(vx_load(x + k), v_m));
+                        v_store(y + k, v_mul(vx_load(y + k), v_m));
                     }
                     vx_cleanup();
 #endif
@@ -632,6 +655,27 @@ void polarToCart( InputArray src1, InputArray src2,
                     }
                 }
             }
+            else if (( depth == CV_32F ) && isInPlace)
+            {
+                const float *mag = (const float*)ptrs[0], *angle = (const float*)ptrs[1];
+                float *x = (float*)ptrs[2], *y = (float*)ptrs[3];
+
+                for( k = 0; k < len; k++ )
+                    buf[0][k] = (float)angle[k];
+
+                SinCos_32f( buf[0], buf[1], buf[0], len, angleInDegrees );
+                if( mag )
+                    for( k = 0; k < len; k++ )
+                    {
+                        float m = mag[k];
+                        x[k] = buf[0][k]*m; y[k] = buf[1][k]*m;
+                    }
+                else
+                {
+                    std::memcpy(x, buf[0], sizeof(float) * len);
+                    std::memcpy(y, buf[1], sizeof(float) * len);
+                }
+            }
             else
             {
                 const double *mag = (const double*)ptrs[0], *angle = (const double*)ptrs[1];
@@ -649,8 +693,11 @@ void polarToCart( InputArray src1, InputArray src2,
                     }
                 else
                 {
-                    std::memcpy(x, buf[0], sizeof(float) * len);
-                    std::memcpy(y, buf[1], sizeof(float) * len);
+                    for( k = 0; k < len; k++ )
+                    {
+                        x[k] = buf[0][k];
+                        y[k] = buf[1][k];
+                    }
                 }
             }
 
@@ -741,7 +788,7 @@ struct iPow_SIMD
     }
 };
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 template <>
 struct iPow_SIMD<uchar, int>
@@ -751,7 +798,7 @@ struct iPow_SIMD<uchar, int>
         int i = 0;
         v_uint32 v_1 = vx_setall_u32(1u);
 
-        for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
+        for ( ; i <= len - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes())
         {
             v_uint32 v_a1 = v_1, v_a2 = v_1;
             v_uint16 v = vx_load_expand(src + i);
@@ -763,16 +810,16 @@ struct iPow_SIMD<uchar, int>
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v = v_pack(v_a1, v_a2);
             v_pack_store(dst + i, v);
@@ -791,7 +838,7 @@ struct iPow_SIMD<schar, int>
         int i = 0;
         v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
+        for ( ; i <= len - VTraits<v_int16>::vlanes(); i += VTraits<v_int16>::vlanes())
         {
             v_int32 v_a1 = v_1, v_a2 = v_1;
             v_int16 v = vx_load_expand(src + i);
@@ -803,16 +850,16 @@ struct iPow_SIMD<schar, int>
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v = v_pack(v_a1, v_a2);
             v_pack_store(dst + i, v);
@@ -831,7 +878,7 @@ struct iPow_SIMD<ushort, int>
         int i = 0;
         v_uint32 v_1 = vx_setall_u32(1u);
 
-        for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
+        for ( ; i <= len - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes())
         {
             v_uint32 v_a1 = v_1, v_a2 = v_1;
             v_uint16 v = vx_load(src + i);
@@ -843,16 +890,16 @@ struct iPow_SIMD<ushort, int>
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v = v_pack(v_a1, v_a2);
             v_store(dst + i, v);
@@ -871,7 +918,7 @@ struct iPow_SIMD<short, int>
         int i = 0;
         v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
+        for ( ; i <= len - VTraits<v_int16>::vlanes(); i += VTraits<v_int16>::vlanes())
         {
             v_int32 v_a1 = v_1, v_a2 = v_1;
             v_int16 v = vx_load(src + i);
@@ -883,16 +930,16 @@ struct iPow_SIMD<short, int>
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v = v_pack(v_a1, v_a2);
             v_store(dst + i, v);
@@ -911,29 +958,29 @@ struct iPow_SIMD<int, int>
         int i = 0;
         v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - v_int32::nlanes*2; i += v_int32::nlanes*2)
+        for ( ; i <= len - VTraits<v_int32>::vlanes()*2; i += VTraits<v_int32>::vlanes()*2)
         {
             v_int32 v_a1 = v_1, v_a2 = v_1;
-            v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_int32::nlanes);
+            v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_int32>::vlanes());
             int p = power;
 
             while( p > 1 )
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v_store(dst + i, v_a1);
-            v_store(dst + i + v_int32::nlanes, v_a2);
+            v_store(dst + i + VTraits<v_int32>::vlanes(), v_a2);
         }
         vx_cleanup();
 
@@ -949,34 +996,34 @@ struct iPow_SIMD<float, float>
         int i = 0;
         v_float32 v_1 = vx_setall_f32(1.f);
 
-        for ( ; i <= len - v_float32::nlanes*2; i += v_float32::nlanes*2)
+        for ( ; i <= len - VTraits<v_float32>::vlanes()*2; i += VTraits<v_float32>::vlanes()*2)
         {
             v_float32 v_a1 = v_1, v_a2 = v_1;
-            v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float32::nlanes);
+            v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_float32>::vlanes());
             int p = std::abs(power);
             if( power < 0 )
             {
-                v_b1 = v_1 / v_b1;
-                v_b2 = v_1 / v_b2;
+                v_b1 = v_div(v_1, v_b1);
+                v_b2 = v_div(v_1, v_b2);
             }
 
             while( p > 1 )
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v_store(dst + i, v_a1);
-            v_store(dst + i + v_float32::nlanes, v_a2);
+            v_store(dst + i + VTraits<v_float32>::vlanes(), v_a2);
         }
         vx_cleanup();
 
@@ -984,7 +1031,7 @@ struct iPow_SIMD<float, float>
     }
 };
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template <>
 struct iPow_SIMD<double, double>
 {
@@ -993,34 +1040,34 @@ struct iPow_SIMD<double, double>
         int i = 0;
         v_float64 v_1 = vx_setall_f64(1.);
 
-        for ( ; i <= len - v_float64::nlanes*2; i += v_float64::nlanes*2)
+        for ( ; i <= len - VTraits<v_float64>::vlanes()*2; i += VTraits<v_float64>::vlanes()*2)
         {
             v_float64 v_a1 = v_1, v_a2 = v_1;
-            v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float64::nlanes);
+            v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_float64>::vlanes());
             int p = std::abs(power);
             if( power < 0 )
             {
-                v_b1 = v_1 / v_b1;
-                v_b2 = v_1 / v_b2;
+                v_b1 = v_div(v_1, v_b1);
+                v_b2 = v_div(v_1, v_b2);
             }
 
             while( p > 1 )
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v_store(dst + i, v_a1);
-            v_store(dst + i + v_float64::nlanes, v_a2);
+            v_store(dst + i + VTraits<v_float64>::vlanes(), v_a2);
         }
         vx_cleanup();
 
@@ -1137,7 +1184,7 @@ static void iPow64f(const double* src, double* dst, int len, int power)
 
 typedef void (*IPowFunc)( const uchar* src, uchar* dst, int len, int power );
 
-static IPowFunc ipowTab[] =
+static IPowFunc ipowTab[CV_DEPTH_MAX] =
 {
     (IPowFunc)iPow8u, (IPowFunc)iPow8s, (IPowFunc)iPow16u, (IPowFunc)iPow16s,
     (IPowFunc)iPow32s, (IPowFunc)iPow32f, (IPowFunc)iPow64f, 0
@@ -1566,7 +1613,7 @@ bool checkRange(InputArray _src, bool quiet, Point* pt, double minVal, double ma
         {
             cv::String value_str;
             value_str << src(cv::Range(badPt.y, badPt.y + 1), cv::Range(badPt.x, badPt.x + 1));
-            CV_Error_( CV_StsOutOfRange,
+            CV_Error_( cv::Error::StsOutOfRange,
             ("the value at (%d, %d)=%s is out of range [%f, %f)", badPt.x, badPt.y, value_str.c_str(), minVal, maxVal));
         }
         return false;
@@ -1610,30 +1657,37 @@ void patchNaNs( InputOutputArray _a, double _val )
     const Mat* arrays[] = {&a, 0};
     int* ptrs[1] = {};
     NAryMatIterator it(arrays, (uchar**)ptrs);
-    size_t len = it.size*a.channels();
+    int len = (int)(it.size*a.channels());
     Cv32suf val;
     val.f = (float)_val;
 
-#if CV_SIMD
-    v_int32 v_mask1 = vx_setall_s32(0x7fffffff), v_mask2 = vx_setall_s32(0x7f800000);
-    v_int32 v_val = vx_setall_s32(val.i);
-#endif
-
     for( size_t i = 0; i < it.nplanes; i++, ++it )
     {
         int* tptr = ptrs[0];
-        size_t j = 0;
+        int j = 0;
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        v_int32 v_pos_mask = vx_setall_s32(0x7fffffff), v_exp_mask = vx_setall_s32(0x7f800000);
+        v_int32 v_val = vx_setall_s32(val.i);
 
-#if CV_SIMD
-        size_t cWidth = (size_t)v_int32::nlanes;
-        for ( ; j + cWidth <= len; j += cWidth)
+        int cWidth = VTraits<v_int32>::vlanes();
+        for (; j < len - cWidth * 2 + 1; j += cWidth * 2)
         {
-            v_int32 v_src = vx_load(tptr + j);
-            v_int32 v_cmp_mask = v_mask2 < (v_src & v_mask1);
-            v_int32 v_dst = v_select(v_cmp_mask, v_val, v_src);
-            v_store(tptr + j, v_dst);
+            v_int32 v_src0 = vx_load(tptr + j);
+            v_int32 v_src1 = vx_load(tptr + j + cWidth);
+
+            v_int32 v_cmp_mask0 = v_lt(v_exp_mask, v_and(v_src0, v_pos_mask));
+            v_int32 v_cmp_mask1 = v_lt(v_exp_mask, v_and(v_src1, v_pos_mask));
+
+            if (v_check_any(v_or(v_cmp_mask0, v_cmp_mask1)))
+            {
+                v_int32 v_dst0 = v_select(v_cmp_mask0, v_val, v_src0);
+                v_int32 v_dst1 = v_select(v_cmp_mask1, v_val, v_src1);
+
+                v_store(tptr + j, v_dst0);
+                v_store(tptr + j + cWidth, v_dst1);
+            }
         }
-        vx_cleanup();
 #endif
 
         for( ; j < len; j++ )
diff --git a/modules/core/src/mathfuncs_core.dispatch.cpp b/modules/core/src/mathfuncs_core.dispatch.cpp
index e48f84ebbef2..485eac27b448 100644
--- a/modules/core/src/mathfuncs_core.dispatch.cpp
+++ b/modules/core/src/mathfuncs_core.dispatch.cpp
@@ -9,7 +9,25 @@
 
 namespace cv { namespace hal {
 
-///////////////////////////////////// ATAN2 ////////////////////////////////////
+void cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int len, bool angleInDegrees)
+{
+    CV_INSTRUMENT_REGION();
+
+    CALL_HAL(cartToPolar32f, cv_hal_cartToPolar32f, x, y, mag, angle, len, angleInDegrees);
+
+    CV_CPU_DISPATCH(cartToPolar32f, (x, y, mag, angle, len, angleInDegrees),
+        CV_CPU_DISPATCH_MODES_ALL);
+}
+
+void cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int len, bool angleInDegrees)
+{
+    CV_INSTRUMENT_REGION();
+
+    CALL_HAL(cartToPolar64f, cv_hal_cartToPolar64f, x, y, mag, angle, len, angleInDegrees);
+
+    CV_CPU_DISPATCH(cartToPolar64f, (x, y, mag, angle, len, angleInDegrees),
+        CV_CPU_DISPATCH_MODES_ALL);
+}
 
 void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
 {
diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp
index 1bf36bb17407..cb21064041dd 100644
--- a/modules/core/src/mathfuncs_core.simd.hpp
+++ b/modules/core/src/mathfuncs_core.simd.hpp
@@ -9,6 +9,8 @@ namespace cv { namespace hal {
 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 
 // forward declarations
+void cartToPolar32f(const float *X, const float *Y, float* mag, float *angle, int len, bool angleInDegrees);
+void cartToPolar64f(const double *X, const double *Y, double* mag, double *angle, int len, bool angleInDegrees);
 void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);
 void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees);
 void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);
@@ -93,13 +95,13 @@ struct v_atan_f32
     {
         v_float32 ax = v_abs(x);
         v_float32 ay = v_abs(y);
-        v_float32 c = v_min(ax, ay) / (v_max(ax, ay) + eps);
-        v_float32 cc = c * c;
-        v_float32 a = v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1)*c;
-        a = v_select(ax >= ay, a, val90 - a);
-        a = v_select(x < z, val180 - a, a);
-        a = v_select(y < z, val360 - a, a);
-        return a * s;
+        v_float32 c = v_div(v_min(ax, ay), v_add(v_max(ax, ay), this->eps));
+        v_float32 cc = v_mul(c, c);
+        v_float32 a = v_mul(v_fma(v_fma(v_fma(cc, this->p7, this->p5), cc, this->p3), cc, this->p1), c);
+        a = v_select(v_ge(ax, ay), a, v_sub(this->val90, a));
+        a = v_select(v_lt(x, this->z), v_sub(this->val180, a), a);
+        a = v_select(v_lt(y, this->z), v_sub(this->val360, a), a);
+        return v_mul(a, this->s);
     }
 
     v_float32 eps;
@@ -118,14 +120,88 @@ struct v_atan_f32
 
 } // anonymous::
 
-///////////////////////////////////// ATAN2 ////////////////////////////////////
+static void cartToPolar32f_(const float *X, const float *Y, float *mag, float *angle, int len, bool angleInDegrees )
+{
+    float scale = angleInDegrees ? 1.f : (float)(CV_PI/180);
+    int i = 0;
+#if CV_SIMD
+    const int VECSZ = VTraits<v_float32>::vlanes();
+    v_atan_f32 v(scale);
+
+    for( ; i < len; i += VECSZ*2 )
+    {
+        if( i + VECSZ*2 > len )
+        {
+            // if it's inplace operation, we cannot repeatedly process
+            // the tail for the second time, so we have to use the
+            // scalar code
+            if( i == 0 || angle == X || angle == Y )
+                break;
+            i = len - VECSZ*2;
+        }
+
+        v_float32 x0 = vx_load(X + i);
+        v_float32 y0 = vx_load(Y + i);
+        v_float32 x1 = vx_load(X + i + VECSZ);
+        v_float32 y1 = vx_load(Y + i + VECSZ);
+
+        v_float32 m0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0)));
+        v_float32 m1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1)));
+
+        v_float32 r0 = v.compute(y0, x0);
+        v_float32 r1 = v.compute(y1, x1);
+
+        v_store(mag + i, m0);
+        v_store(mag + i + VECSZ, m1);
+
+        v_store(angle + i, r0);
+        v_store(angle + i + VECSZ, r1);
+    }
+    vx_cleanup();
+#endif
+
+    for( ; i < len; i++ )
+    {
+        float x0 = X[i], y0 = Y[i];
+        mag[i] = std::sqrt(x0*x0 + y0*y0);
+        angle[i] = atan_f32(y0, x0)*scale;
+    }
+}
+
+void cartToPolar32f(const float *X, const float *Y, float *mag, float *angle, int len, bool angleInDegrees )
+{
+    CV_INSTRUMENT_REGION();
+    cartToPolar32f_(X, Y, mag, angle, len, angleInDegrees );
+}
+
+void cartToPolar64f(const double *X, const double *Y, double *mag, double *angle, int len, bool angleInDegrees)
+{
+    CV_INSTRUMENT_REGION();
+
+    const int BLKSZ = 128;
+    float ybuf[BLKSZ], xbuf[BLKSZ], mbuf[BLKSZ], abuf[BLKSZ];
+    for( int i = 0; i < len; i += BLKSZ )
+    {
+        int j, blksz = std::min(BLKSZ, len - i);
+        for( j = 0; j < blksz; j++ )
+        {
+            xbuf[j] = (float)X[i + j];
+            ybuf[j] = (float)Y[i + j];
+        }
+        cartToPolar32f_(xbuf, ybuf, mbuf, abuf, blksz, angleInDegrees);
+        for( j = 0; j < blksz; j++ )
+            mag[i + j] = mbuf[j];
+        for( j = 0; j < blksz; j++ )
+            angle[i + j] = abuf[j];
+    }
+}
 
 static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
 {
     float scale = angleInDegrees ? 1.f : (float)(CV_PI/180);
     int i = 0;
 #if CV_SIMD
-    const int VECSZ = v_float32::nlanes;
+    const int VECSZ = VTraits<v_float32>::vlanes();
     v_atan_f32 v(scale);
 
     for( ; i < len; i += VECSZ*2 )
@@ -198,7 +274,7 @@ void magnitude32f(const float* x, const float* y, float* mag, int len)
     int i = 0;
 
 #if CV_SIMD
-    const int VECSZ = v_float32::nlanes;
+    const int VECSZ = VTraits<v_float32>::vlanes();
     for( ; i < len; i += VECSZ*2 )
     {
         if( i + VECSZ*2 > len )
@@ -209,8 +285,8 @@ void magnitude32f(const float* x, const float* y, float* mag, int len)
         }
         v_float32 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);
         v_float32 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);
-        x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
-        x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
+        x0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0)));
+        x1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1)));
         v_store(mag + i, x0);
         v_store(mag + i + VECSZ, x1);
     }
@@ -231,7 +307,7 @@ void magnitude64f(const double* x, const double* y, double* mag, int len)
     int i = 0;
 
 #if CV_SIMD_64F
-    const int VECSZ = v_float64::nlanes;
+    const int VECSZ = VTraits<v_float64>::vlanes();
     for( ; i < len; i += VECSZ*2 )
     {
         if( i + VECSZ*2 > len )
@@ -242,8 +318,8 @@ void magnitude64f(const double* x, const double* y, double* mag, int len)
         }
         v_float64 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ);
         v_float64 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ);
-        x0 = v_sqrt(v_muladd(x0, x0, y0*y0));
-        x1 = v_sqrt(v_muladd(x1, x1, y1*y1));
+        x0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0)));
+        x1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1)));
         v_store(mag + i, x0);
         v_store(mag + i + VECSZ, x1);
     }
@@ -265,7 +341,7 @@ void invSqrt32f(const float* src, float* dst, int len)
     int i = 0;
 
 #if CV_SIMD
-    const int VECSZ = v_float32::nlanes;
+    const int VECSZ = VTraits<v_float32>::vlanes();
     for( ; i < len; i += VECSZ*2 )
     {
         if( i + VECSZ*2 > len )
@@ -293,7 +369,7 @@ void invSqrt64f(const double* src, double* dst, int len)
     int i = 0;
 
 #if CV_SIMD_64F
-    const int VECSZ = v_float64::nlanes;
+    const int VECSZ = VTraits<v_float64>::vlanes();
     for ( ; i < len; i += VECSZ*2)
     {
         if( i + VECSZ*2 > len )
@@ -321,7 +397,7 @@ void sqrt32f(const float* src, float* dst, int len)
     int i = 0;
 
 #if CV_SIMD
-    const int VECSZ = v_float32::nlanes;
+    const int VECSZ = VTraits<v_float32>::vlanes();
     for( ; i < len; i += VECSZ*2 )
     {
         if( i + VECSZ*2 > len )
@@ -350,7 +426,7 @@ void sqrt64f(const double* src, double* dst, int len)
     int i = 0;
 
 #if CV_SIMD_64F
-    const int VECSZ = v_float64::nlanes;
+    const int VECSZ = VTraits<v_float64>::vlanes();
     for( ; i < len; i += VECSZ*2 )
     {
         if( i + VECSZ*2 > len )
@@ -452,7 +528,7 @@ void exp32f( const float *_x, float *y, int n )
     float postscale = (float)exp_postscale;
 
 #if CV_SIMD
-    const int VECSZ = v_float32::nlanes;
+    const int VECSZ = VTraits<v_float32>::vlanes();
     const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
     const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
     const v_float32 vminval = vx_setall_f32(minval);
@@ -481,26 +557,26 @@ void exp32f( const float *_x, float *y, int n )
         xf0 = v_min(v_max(xf0, vminval), vmaxval);
         xf1 = v_min(v_max(xf1, vminval), vmaxval);
 
-        xf0 *= vprescale;
-        xf1 *= vprescale;
+        xf0 = v_mul(xf0, vprescale);
+        xf1 = v_mul(xf1, vprescale);
 
         v_int32 xi0 = v_round(xf0);
         v_int32 xi1 = v_round(xf1);
-        xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale;
-        xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale;
+        xf0 = v_mul(v_sub(xf0, v_cvt_f32(xi0)), vpostscale);
+        xf1 = v_mul(v_sub(xf1, v_cvt_f32(xi1)), vpostscale);
 
-        v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask);
-        v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask);
+        v_float32 yf0 = v_lut(expTab_f, v_and(xi0, vidxmask));
+        v_float32 yf1 = v_lut(expTab_f, v_and(xi1, vidxmask));
 
         v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255);
-        xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v127, v0), v255);
-        xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v127, v0), v255);
+        xi0 = v_min(v_max(v_add(v_shr<6>(xi0), v127), v0), v255);
+        xi1 = v_min(v_max(v_add(v_shr<6>(xi1), v127), v0), v255);
 
-        yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0));
-        yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1));
+        yf0 = v_mul(yf0, v_reinterpret_as_f32(v_shl<23>(xi0)));
+        yf1 = v_mul(yf1, v_reinterpret_as_f32(v_shl<23>(xi1)));
 
-        v_float32 zf0 = xf0 + vA1;
-        v_float32 zf1 = xf1 + vA1;
+        v_float32 zf0 = v_add(xf0, vA1);
+        v_float32 zf1 = v_add(xf1, vA1);
 
         zf0 = v_fma(zf0, xf0, vA2);
         zf1 = v_fma(zf1, xf1, vA2);
@@ -511,8 +587,8 @@ void exp32f( const float *_x, float *y, int n )
         zf0 = v_fma(zf0, xf0, vA4);
         zf1 = v_fma(zf1, xf1, vA4);
 
-        zf0 *= yf0;
-        zf1 *= yf1;
+        zf0 = v_mul(zf0, yf0);
+        zf1 = v_mul(zf1, yf1);
 
         if( y_aligned )
         {
@@ -566,7 +642,7 @@ void exp64f( const double *_x, double *y, int n )
     double maxval = (exp_max_val/exp_prescale);
 
 #if CV_SIMD_64F
-    const int VECSZ = v_float64::nlanes;
+    const int VECSZ = VTraits<v_float64>::vlanes();
     const v_float64 vprescale = vx_setall_f64(exp_prescale);
     const v_float64 vpostscale = vx_setall_f64(exp_postscale);
     const v_float64 vminval = vx_setall_f64(minval);
@@ -596,30 +672,30 @@ void exp64f( const double *_x, double *y, int n )
         xf0 = v_min(v_max(xf0, vminval), vmaxval);
         xf1 = v_min(v_max(xf1, vminval), vmaxval);
 
-        xf0 *= vprescale;
-        xf1 *= vprescale;
+        xf0 = v_mul(xf0, vprescale);
+        xf1 = v_mul(xf1, vprescale);
 
         v_int32 xi0 = v_round(xf0);
         v_int32 xi1 = v_round(xf1);
-        xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale;
-        xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale;
+        xf0 = v_mul(v_sub(xf0, v_cvt_f64(xi0)), vpostscale);
+        xf1 = v_mul(v_sub(xf1, v_cvt_f64(xi1)), vpostscale);
 
-        v_float64 yf0 = v_lut(expTab, xi0 & vidxmask);
-        v_float64 yf1 = v_lut(expTab, xi1 & vidxmask);
+        v_float64 yf0 = v_lut(expTab, v_and(xi0, vidxmask));
+        v_float64 yf1 = v_lut(expTab, v_and(xi1, vidxmask));
 
         v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047);
-        xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v1023, v0), v2047);
-        xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v1023, v0), v2047);
+        xi0 = v_min(v_max(v_add(v_shr<6>(xi0), v1023), v0), v2047);
+        xi1 = v_min(v_max(v_add(v_shr<6>(xi1), v1023), v0), v2047);
 
         v_int64 xq0, xq1, dummy;
         v_expand(xi0, xq0, dummy);
         v_expand(xi1, xq1, dummy);
 
-        yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0));
-        yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1));
+        yf0 = v_mul(yf0, v_reinterpret_as_f64(v_shl<52>(xq0)));
+        yf1 = v_mul(yf1, v_reinterpret_as_f64(v_shl<52>(xq1)));
 
-        v_float64 zf0 = xf0 + vA1;
-        v_float64 zf1 = xf1 + vA1;
+        v_float64 zf0 = v_add(xf0, vA1);
+        v_float64 zf1 = v_add(xf1, vA1);
 
         zf0 = v_fma(zf0, xf0, vA2);
         zf1 = v_fma(zf1, xf1, vA2);
@@ -633,8 +709,8 @@ void exp64f( const double *_x, double *y, int n )
         zf0 = v_fma(zf0, xf0, vA5);
         zf1 = v_fma(zf1, xf1, vA5);
 
-        zf0 *= yf0;
-        zf1 *= yf1;
+        zf0 = v_mul(zf0, yf0);
+        zf1 = v_mul(zf1, yf1);
 
         if( y_aligned )
         {
@@ -696,7 +772,7 @@ void log32f( const float *_x, float *y, int n )
     const int* x = (const int*)_x;
 
 #if CV_SIMD
-    const int VECSZ = v_float32::nlanes;
+    const int VECSZ = VTraits<v_float32>::vlanes();
     const v_float32 vln2 = vx_setall_f32((float)ln_2);
     const v_float32 v1 = vx_setall_f32(1.f);
     const v_float32 vshift = vx_setall_f32(-1.f/512);
@@ -715,18 +791,18 @@ void log32f( const float *_x, float *y, int n )
         }
 
         v_int32 h0 = vx_load(x + i);
-        v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127);
-        v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23);
+        v_int32 yi0 = v_sub(v_and(v_shr<23>(h0), vx_setall_s32(255)), vx_setall_s32(127));
+        v_int32 xi0 = v_or(v_and(h0, vx_setall_s32(LOGTAB_MASK2_32F)), vx_setall_s32(127 << 23));
 
-        h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2);
+        h0 = v_and(v_shr<23 - 8 - 1>(h0), vx_setall_s32(((1 << 8) - 1) * 2));
         v_float32 yf0, xf0;
 
         v_lut_deinterleave(logTab_f, h0, yf0, xf0);
 
         yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0);
 
-        v_float32 delta = v_select(v_reinterpret_as_f32(h0 == vx_setall_s32(510)), vshift, vx_setall<float>(0));
-        xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta);
+        v_float32 delta = v_select(v_reinterpret_as_f32(v_eq(h0, vx_setall_s32(510))), vshift, vx_setall<float>(0));
+        xf0 = v_fma((v_sub(v_reinterpret_as_f32(xi0), v1)), xf0, delta);
 
         v_float32 zf0 = v_fma(xf0, vA0, vA1);
         zf0 = v_fma(zf0, xf0, vA2);
@@ -771,7 +847,7 @@ void log64f( const double *x, double *y, int n )
     int i = 0;
 
 #if CV_SIMD_64F
-    const int VECSZ = v_float64::nlanes;
+    const int VECSZ = VTraits<v_float64>::vlanes();
     const v_float64 vln2 = vx_setall_f64(ln_2);
 
     const v_float64
@@ -791,20 +867,20 @@ void log64f( const double *x, double *y, int n )
 
         v_int64 h0 = vx_load((const int64*)x + i);
         v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64());
-        yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023);
+        yi0 = v_sub(v_and(yi0, vx_setall_s32(2047)), vx_setall_s32(1023));
 
-        v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52);
+        v_int64 xi0 = v_or(v_and(h0, vx_setall_s64(LOGTAB_MASK2_64F)), vx_setall_s64((int64)1023 << 52));
         h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0);
-        v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2);
+        v_int32 idx = v_and(v_pack(h0, h0), vx_setall_s32(((1 << 8) - 1) * 2));
 
         v_float64 xf0, yf0;
         v_lut_deinterleave(logTab, idx, yf0, xf0);
 
         yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0);
-        v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512);
-        xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta);
+        v_float64 delta = v_mul(v_cvt_f64(v_eq(idx, vx_setall_s32(510))), vx_setall_f64(1. / 512));
+        xf0 = v_fma(v_sub(v_reinterpret_as_f64(xi0), vx_setall_f64(1.)), xf0, delta);
 
-        v_float64 xq = xf0*xf0;
+        v_float64 xq = v_mul(xf0, xf0);
         v_float64 zf0 = v_fma(xq, vA0, vA2);
         v_float64 zf1 = v_fma(xq, vA1, vA3);
         zf0 = v_fma(zf0, xq, vA4);
diff --git a/modules/core/src/matmul.dispatch.cpp b/modules/core/src/matmul.dispatch.cpp
index a213ca06c736..81953265d7f3 100644
--- a/modules/core/src/matmul.dispatch.cpp
+++ b/modules/core/src/matmul.dispatch.cpp
@@ -921,7 +921,7 @@ void mulTransposed(InputArray _src, OutputArray _dst, bool ata,
     {
         MulTransposedFunc func = getMulTransposedFunc(stype, dtype, ata);
         if( !func )
-            CV_Error( CV_StsUnsupportedFormat, "" );
+            CV_Error( cv::Error::StsUnsupportedFormat, "" );
 
         func( src, dst, delta, scale );
         completeSymm( dst, false );
@@ -979,7 +979,7 @@ typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len);
 
 static DotProdFunc getDotProdFunc(int depth)
 {
-    static DotProdFunc dotProdTab[] =
+    static DotProdFunc dotProdTab[CV_DEPTH_MAX] =
     {
         (DotProdFunc)GET_OPTIMIZED(dotProd_8u), (DotProdFunc)GET_OPTIMIZED(dotProd_8s),
         (DotProdFunc)dotProd_16u, (DotProdFunc)dotProd_16s,
diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp
index 5a7f36d12bc0..ce3a48799e35 100644
--- a/modules/core/src/matmul.simd.hpp
+++ b/modules/core/src/matmul.simd.hpp
@@ -394,19 +394,6 @@ GEMMSingleMul( const T* a_data, size_t a_step,
             {
                 WT al(a_data[k]);
                 j=0;
-                 #if CV_ENABLE_UNROLLED
-                for(; j <= m - 4; j += 4 )
-                {
-                    WT t0 = d_buf[j] + WT(b_data[j])*al;
-                    WT t1 = d_buf[j+1] + WT(b_data[j+1])*al;
-                    d_buf[j] = t0;
-                    d_buf[j+1] = t1;
-                    t0 = d_buf[j+2] + WT(b_data[j+2])*al;
-                    t1 = d_buf[j+3] + WT(b_data[j+3])*al;
-                    d_buf[j+2] = t0;
-                    d_buf[j+3] = t1;
-                }
-                #endif
                 for( ; j < m; j++ )
                     d_buf[j] += WT(b_data[j])*al;
             }
@@ -1454,7 +1441,7 @@ transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
 static void
 transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     const int BITS = 10, SCALE = 1 << BITS;
     const float MAX_M = (float)(1 << (15 - BITS));
 
@@ -1485,7 +1472,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
         v_int32 m10 = vx_setall_s32(m32[4]);
         v_int32 m11 = vx_setall_s32(m32[5]);
         int x = 0;
-        for (; x <= (len - v_uint8::nlanes) * nChannels; x += v_uint8::nlanes * nChannels)
+        for (; x <= (len - VTraits<v_uint8>::vlanes()) * nChannels; x += VTraits<v_uint8>::vlanes() * nChannels)
         {
             v_uint8 b, g, r;
             v_load_deinterleave(src + x, b, g, r);
@@ -1499,20 +1486,20 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
             v_int32 p1, p3;
             v_expand(bgl, p0, p2);
             v_expand(v_reinterpret_as_s16(rl), p1, p3);
-            dbl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 *  m2 + m3,
-                                    v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 *  m2 + m3);
-            dgl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 *  m6 + m7,
-                                    v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 *  m6 + m7);
-            drl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
-                                    v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
+            dbl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m01), v_mul(p1, m2)), m3),
+                                    v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m01), v_mul(p3, m2)), m3));
+            dgl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m45), v_mul(p1, m6)), m7),
+                                    v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m45), v_mul(p3, m6)), m7));
+            drl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m89), v_mul(p1, m10)), m11),
+                                    v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m89), v_mul(p3, m10)), m11));
             v_expand(bgh, p0, p2);
             v_expand(v_reinterpret_as_s16(rh), p1, p3);
-            dbh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 *  m2 + m3,
-                                    v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 *  m2 + m3);
-            dgh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 *  m6 + m7,
-                                    v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 *  m6 + m7);
-            drh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
-                                    v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
+            dbh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m01), v_mul(p1, m2)), m3),
+                                    v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m01), v_mul(p3, m2)), m3));
+            dgh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m45), v_mul(p1, m6)), m7),
+                                    v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m45), v_mul(p3, m6)), m7));
+            drh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m89), v_mul(p1, m10)), m11),
+                                    v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m89), v_mul(p3, m10)), m11));
             v_store_interleave(dst + x, v_pack_u(dbl, dbh), v_pack_u(dgl, dgh), v_pack_u(drl, drh));
         }
         m32[1] = saturate_cast<int>((m[3] + 0.5f)*SCALE);
@@ -1537,7 +1524,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
 static void
 transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     if( scn == 3 && dcn == 3 )
     {
         int x = 0;
@@ -1555,7 +1542,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
         v_float32 m10 = vx_setall_f32(m[10]);
         v_float32 m11 = vx_setall_f32(m[11] - 32768.f);
         v_int16 delta = vx_setall_s16(-32768);
-        for (; x <= (len - v_uint16::nlanes)*3; x += v_uint16::nlanes*3)
+        for (; x <= (len - VTraits<v_uint16>::vlanes())*3; x +=  VTraits<v_uint16>::vlanes()*3)
         {
             v_uint16 b, g, r;
             v_load_deinterleave(src + x, b, g, r);
@@ -1574,6 +1561,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
             v_store_interleave(dst + x, v_reinterpret_as_u16(db), v_reinterpret_as_u16(dg), v_reinterpret_as_u16(dr));
         }
 #endif
+#if CV_SIMD128
         v_float32x4 _m0l(m[0], m[4], m[ 8], 0.f);
         v_float32x4 _m1l(m[1], m[5], m[ 9], 0.f);
         v_float32x4 _m2l(m[2], m[6], m[10], 0.f);
@@ -1583,10 +1571,11 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
         v_float32x4 _m2h = v_rotate_left<1>(_m2l);
         v_float32x4 _m3h = v_rotate_left<1>(_m3l);
         v_int16x8 _delta(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
-        for( ; x <= len*3 - v_uint16x8::nlanes; x += 3*v_uint16x8::nlanes/4 )
+        for( ; x <= len*3 - VTraits<v_uint16x8>::vlanes(); x += 3*VTraits<v_uint16x8>::vlanes()/4 )
             v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack(
                              v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x    ))), _m0h, _m1h, _m2h, _m3h)),
                              v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta))));
+#endif //CV_SIMD128
         for( ; x < len * 3; x += 3 )
         {
             float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
@@ -1606,25 +1595,25 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
 static void
 transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SIMD && !defined(__aarch64__) && !defined(_M_ARM64)
+#if (CV_SIMD || CV_SIMD_SCALABLE) && !defined(__aarch64__) && !defined(_M_ARM64)
     int x = 0;
     if( scn == 3 && dcn == 3 )
     {
-        int idx[v_float32::nlanes/2];
-        for( int i = 0; i < v_float32::nlanes/4; i++ )
+        int idx[VTraits<v_float32>::max_nlanes/2];
+        for( int i = 0; i < VTraits<v_float32>::vlanes()/4; i++ )
         {
             idx[i] = 3*i;
-            idx[i + v_float32::nlanes/4] = 0;
+            idx[i + VTraits<v_float32>::vlanes()/4] = 0;
         }
         float _m[] = { m[0], m[4], m[ 8], 0.f,
                        m[1], m[5], m[ 9], 0.f,
                        m[2], m[6], m[10], 0.f,
                        m[3], m[7], m[11], 0.f };
-        v_float32 m0 = vx_lut_quads(_m     , idx + v_float32::nlanes/4);
-        v_float32 m1 = vx_lut_quads(_m +  4, idx + v_float32::nlanes/4);
-        v_float32 m2 = vx_lut_quads(_m +  8, idx + v_float32::nlanes/4);
-        v_float32 m3 = vx_lut_quads(_m + 12, idx + v_float32::nlanes/4);
-        for( ; x <= len*3 - v_float32::nlanes; x += 3*v_float32::nlanes/4 )
+        v_float32 m0 = vx_lut_quads(_m     , idx + VTraits<v_float32>::vlanes()/4);
+        v_float32 m1 = vx_lut_quads(_m +  4, idx + VTraits<v_float32>::vlanes()/4);
+        v_float32 m2 = vx_lut_quads(_m +  8, idx + VTraits<v_float32>::vlanes()/4);
+        v_float32 m3 = vx_lut_quads(_m + 12, idx + VTraits<v_float32>::vlanes()/4);
+        for( ; x <= len*3 - VTraits<v_float32>::vlanes(); x += 3*VTraits<v_float32>::vlanes()/4 )
             v_store(dst + x, v_pack_triplets(v_matmuladd(vx_lut_quads(src + x, idx), m0, m1, m2, m3)));
         for( ; x < len*3; x += 3 )
         {
@@ -1641,8 +1630,8 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
     if( scn == 4 && dcn == 4 )
     {
 #if CV_SIMD_WIDTH > 16
-        int idx[v_float32::nlanes/4];
-        for( int i = 0; i < v_float32::nlanes/4; i++ )
+        int idx[VTraits<v_float32>::max_nlanes/4];
+        for( int i = 0; i < VTraits<v_float32>::vlanes()/4; i++ )
             idx[i] = 0;
         float _m[] = { m[4], m[9], m[14], m[19] };
         v_float32 m0 = vx_lut_quads(m   , idx);
@@ -1650,22 +1639,34 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
         v_float32 m2 = vx_lut_quads(m+10, idx);
         v_float32 m3 = vx_lut_quads(m+15, idx);
         v_float32 m4 = vx_lut_quads(_m, idx);
-        for( ; x <= len*4 - v_float32::nlanes; x += v_float32::nlanes )
+        for( ; x <= len*4 - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes() )
         {
             v_float32 v_src = vx_load(src + x);
-            v_store(dst + x, v_reduce_sum4(v_src * m0, v_src * m1, v_src * m2, v_src * m3) + m4);
+            v_store(dst + x, v_add(v_reduce_sum4(v_mul(v_src, m0), v_mul(v_src, m1), v_mul(v_src, m2), v_mul(v_src, m3)), m4));
         }
 #endif
+#if CV_SIMD128
         v_float32x4 _m0 = v_load(m     );
         v_float32x4 _m1 = v_load(m +  5);
         v_float32x4 _m2 = v_load(m + 10);
         v_float32x4 _m3 = v_load(m + 15);
         v_float32x4 _m4(m[4], m[9], m[14], m[19]);
-        for( ; x < len*4; x += v_float32x4::nlanes )
+        for( ; x < len*4; x += VTraits<v_float32x4>::vlanes() )
         {
             v_float32x4 v_src = v_load(src + x);
-            v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4);
+            v_store(dst + x, v_add(v_reduce_sum4(v_mul(v_src, _m0), v_mul(v_src, _m1), v_mul(v_src, _m2), v_mul(v_src, _m3)), _m4));
         }
+#else // CV_SIMD_WIDTH >= 16 && !CV_SIMD128
+        for( ; x < len*4; x += 4 )
+        {
+            float v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3];
+            float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[ 2]*v2 + m[ 3]*v3 + m[ 4]);
+            float t1 = saturate_cast<float>(m[5]*v0 + m[6]*v1 + m[ 7]*v2 + m[ 8]*v3 + m[ 9]);
+            float t2 = saturate_cast<float>(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]);
+            float t3 = saturate_cast<float>(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]);
+            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; dst[x+3] = t3;
+        }
+#endif
         vx_cleanup();
         return;
     }
@@ -1791,7 +1792,7 @@ diagtransform_64f(const double* src, double* dst, const double* m, int len, int
 
 TransformFunc getTransformFunc(int depth)
 {
-    static TransformFunc transformTab[] =
+    static TransformFunc transformTab[CV_DEPTH_MAX] =
     {
         (TransformFunc)transform_8u, (TransformFunc)transform_8s, (TransformFunc)transform_16u,
         (TransformFunc)transform_16s, (TransformFunc)transform_32s, (TransformFunc)transform_32f,
@@ -1803,7 +1804,7 @@ TransformFunc getTransformFunc(int depth)
 
 TransformFunc getDiagTransformFunc(int depth)
 {
-    static TransformFunc diagTransformTab[] =
+    static TransformFunc diagTransformTab[CV_DEPTH_MAX] =
     {
         (TransformFunc)diagtransform_8u, (TransformFunc)diagtransform_8s, (TransformFunc)diagtransform_16u,
         (TransformFunc)diagtransform_16s, (TransformFunc)diagtransform_32s, (TransformFunc)diagtransform_32f,
@@ -1936,9 +1937,9 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
 {
     float alpha = *_alpha;
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 v_alpha = vx_setall_f32(alpha);
-    const int cWidth = v_float32::nlanes;
+    const int cWidth = VTraits<v_float32>::vlanes();
     for (; i <= len - cWidth; i += cWidth)
         v_store(dst + i, v_muladd(vx_load(src1 + i), v_alpha, vx_load(src2 + i)));
     vx_cleanup();
@@ -1953,9 +1954,9 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
 {
     double alpha = *_alpha;
     int i = 0;
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     v_float64 a2 = vx_setall_f64(alpha);
-    const int cWidth = v_float64::nlanes;
+    const int cWidth = VTraits<v_float64>::vlanes();
     for (; i <= len - cWidth; i += cWidth)
         v_store(dst + i, v_muladd(vx_load(src1 + i), a2, vx_load(src2 + i)));
     vx_cleanup();
@@ -2078,7 +2079,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
         deltastep = deltastep ? 4 : 0;
     }
 
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
     v_float64x2 v_scale = v_setall_f64(scale);
 #endif
 
@@ -2090,7 +2091,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
 
             for( j = i; j <= size.width - 4; j += 4 )
             {
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
                 if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
                 {
                     v_float64x2 s0 = v_setzero_f64(), s1 = v_setzero_f64();
@@ -2099,12 +2100,12 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
                     for( k = 0; k < size.height; k++, tsrc += srcstep )
                     {
                         v_float64x2 a = v_setall_f64((double)col_buf[k]);
-                        s0 += a * v_load(tsrc+0);
-                        s1 += a * v_load(tsrc+2);
+                        s0 = v_add(s0, v_mul(a, v_load(tsrc + 0)));
+                        s1 = v_add(s1, v_mul(a, v_load(tsrc + 2)));
                     }
 
-                    v_store((double*)(tdst+j), s0*v_scale);
-                    v_store((double*)(tdst+j+2), s1*v_scale);
+                    v_store((double*)(tdst+j), v_mul(s0, v_scale));
+                    v_store((double*)(tdst+j+2), v_mul(s1, v_scale));
                 } else
 #endif
                 {
@@ -2150,7 +2151,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
 
             for( j = i; j <= size.width - 4; j += 4 )
             {
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
                 if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
                 {
                     v_float64x2 s0 = v_setzero_f64(), s1 = v_setzero_f64();
@@ -2160,12 +2161,12 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
                     for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep )
                     {
                         v_float64x2 a = v_setall_f64((double)col_buf[k]);
-                        s0 += a * (v_load(tsrc+0) - v_load(d+0));
-                        s1 += a * (v_load(tsrc+2) - v_load(d+2));
+                        s0 = v_add(s0, v_mul(a, v_sub(v_load(tsrc + 0), v_load(d + 0))));
+                        s1 = v_add(s1, v_mul(a, v_sub(v_load(tsrc + 2), v_load(d + 2))));
                     }
 
-                    v_store((double*)(tdst+j), s0*v_scale);
-                    v_store((double*)(tdst+j+2), s1*v_scale);
+                    v_store((double*)(tdst+j), v_mul(s0, v_scale));
+                    v_store((double*)(tdst+j+2), v_mul(s1, v_scale));
                 }
                 else
 #endif
@@ -2227,7 +2228,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
                 double s = 0;
                 const sT *tsrc1 = src + i*srcstep;
                 const sT *tsrc2 = src + j*srcstep;
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
                 if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
                 {
                     const double *v_tsrc1 = (double *)(tsrc1);
@@ -2235,8 +2236,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
                     v_float64x2 v_s = v_setzero_f64();
 
                     for( k = 0; k <= size.width - 4; k += 4 )
-                        v_s += (v_load(v_tsrc1+k) * v_load(v_tsrc2+k)) +
-                               (v_load(v_tsrc1+k+2) * v_load(v_tsrc2+k+2));
+                        v_s = v_add(v_s, v_add(v_mul(v_load(v_tsrc1 + k), v_load(v_tsrc2 + k)), v_mul(v_load(v_tsrc1 + k + 2), v_load(v_tsrc2 + k + 2))));
                     s += v_reduce_sum(v_s);
                 }
                 else
@@ -2280,7 +2280,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
                         delta_buf[2] = delta_buf[3] = tdelta2[0];
                     tdelta2 = delta_buf;
                 }
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
                 if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
                 {
                     const double *v_tsrc2 = (double *)(tsrc2);
@@ -2289,8 +2289,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
                     v_float64x2 v_s = v_setzero_f64();
 
                     for( k = 0; k <= size.width - 4; k += 4, v_tdelta2 += delta_shift )
-                        v_s += ((v_load(v_tsrc2+k) - v_load(v_tdelta2)) * v_load(v_row_buf+k)) +
-                               ((v_load(v_tsrc2+k+2) - v_load(v_tdelta2+2)) * v_load(v_row_buf+k+2));
+                        v_s = v_add(v_s, v_add(v_mul(v_sub(v_load(v_tsrc2 + k), v_load(v_tdelta2)), v_load(v_row_buf + k)), v_mul(v_sub(v_load(v_tsrc2 + k + 2), v_load(v_tdelta2 + 2)), v_load(v_row_buf + k + 2))));
                     s += v_reduce_sum(v_s);
 
                     tdelta2 = (const dT *)(v_tdelta2);
@@ -2393,14 +2392,14 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
     double r = 0;
     int i = 0;
 
-#if CV_SIMD
-    int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 15), blockSize;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_uint16>::vlanes(), blockSize0 = (1 << 15), blockSize;
 
     while (i < len0)
     {
         blockSize = std::min(len0 - i, blockSize0);
         v_uint32 v_sum = vx_setzero_u32();
-        const int cWidth = v_uint16::nlanes;
+        const int cWidth = VTraits<v_uint16>::vlanes();
 
         int j = 0;
         for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
@@ -2414,7 +2413,7 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
         {
             v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
             v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));
-            v_sum += v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20));
+            v_sum = v_add(v_sum, v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20)));
         }
         r += (double)v_reduce_sum(v_sum);
 
@@ -2433,14 +2432,14 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
     double r = 0.0;
     int i = 0;
 
-#if CV_SIMD
-    int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 14), blockSize;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_int16>::vlanes(), blockSize0 = (1 << 14), blockSize;
 
     while (i < len0)
     {
         blockSize = std::min(len0 - i, blockSize0);
         v_int32 v_sum = vx_setzero_s32();
-        const int cWidth = v_int16::nlanes;
+        const int cWidth = VTraits<v_int16>::vlanes();
 
         int j = 0;
         for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
@@ -2473,14 +2472,14 @@ double dotProd_16u(const ushort* src1, const ushort* src2, int len)
     double r = 0.0;
     int i = 0;
 
-#if CV_SIMD
-    int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 24), blockSize;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_uint16>::vlanes(), blockSize0 = (1 << 24), blockSize;
 
     while (i < len0)
     {
         blockSize = std::min(len0 - i, blockSize0);
         v_uint64 v_sum = vx_setzero_u64();
-        const int cWidth = v_uint16::nlanes;
+        const int cWidth = VTraits<v_uint16>::vlanes();
 
         int j = 0;
         for (; j <= blockSize - cWidth; j += cWidth)
@@ -2505,14 +2504,14 @@ double dotProd_16s(const short* src1, const short* src2, int len)
     double r = 0.0;
     int i = 0;
 
-#if CV_SIMD
-    int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 24), blockSize;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_int16>::vlanes(), blockSize0 = (1 << 24), blockSize;
 
     while (i < len0)
     {
         blockSize = std::min(len0 - i, blockSize0);
         v_int64 v_sum = vx_setzero_s64();
-        const int cWidth = v_int16::nlanes;
+        const int cWidth = VTraits<v_int16>::vlanes();
 
         int j = 0;
         for (; j <= blockSize - cWidth; j += cWidth)
@@ -2534,10 +2533,11 @@ double dotProd_16s(const short* src1, const short* src2, int len)
 
 double dotProd_32s(const int* src1, const int* src2, int len)
 {
-#if CV_SIMD_64F
+#if CV_SIMD_64F // TODO: enable for CV_SIMD_SCALABLE_64F
+// Test failed on RVV(QEMU): Too big difference (=1.20209e-08 > 1.11022e-12)
     double r = .0;
     int i = 0;
-    const int step  = v_int32::nlanes;
+    const int step  = VTraits<v_int32>::vlanes();
     v_float64 v_sum0 = vx_setzero_f64();
 #if CV_SIMD_WIDTH == 16
     const int wstep = step * 2;
@@ -2551,7 +2551,7 @@ double dotProd_32s(const int* src1, const int* src2, int len)
         v_sum0 = v_dotprod_expand_fast(v_src10, v_src20, v_sum0);
         v_sum1 = v_dotprod_expand_fast(v_src11, v_src21, v_sum1);
     }
-    v_sum0 += v_sum1;
+    v_sum0 = v_add(v_sum0, v_sum1);
 #endif
     for (; i < len - step; i += step, src1 += step, src2 += step)
     {
@@ -2572,8 +2572,8 @@ double dotProd_32f(const float* src1, const float* src2, int len)
     double r = 0.0;
     int i = 0;
 
-#if CV_SIMD
-    int len0 = len & -v_float32::nlanes, blockSize0 = (1 << 13), blockSize;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_float32>::vlanes(), blockSize0 = (1 << 13), blockSize;
 
     while (i < len0)
     {
@@ -2581,7 +2581,7 @@ double dotProd_32f(const float* src1, const float* src2, int len)
         v_float32 v_sum = vx_setzero_f32();
 
         int j = 0;
-        int cWidth = v_float32::nlanes;
+        int cWidth = VTraits<v_float32>::vlanes();
 
 #if CV_ENABLE_UNROLLED
         v_float32 v_sum1 = vx_setzero_f32();
@@ -2600,7 +2600,7 @@ double dotProd_32f(const float* src1, const float* src2, int len)
                               vx_load(src2 + j + (cWidth * 3)), v_sum3);
         }
 
-        v_sum += v_sum1 + v_sum2 + v_sum3;
+        v_sum = v_add(v_sum, v_add(v_add(v_sum1, v_sum2), v_sum3));
 #endif
 
         for (; j <= blockSize - cWidth; j += cWidth)
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index 8111dc223036..0701542dfd7f 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -267,7 +267,7 @@ void setSize( Mat& m, int _dims, const int* _sz, const size_t* _steps, bool auto
             m.step.p[i] = total;
             uint64 total1 = (uint64)total*s;
             if( (uint64)total1 != (size_t)total1 )
-                CV_Error( CV_StsOutOfRange, "The total matrix size does not fit to \"size_t\" type" );
+                CV_Error( cv::Error::StsOutOfRange, "The total matrix size does not fit to \"size_t\" type" );
             total = (size_t)total1;
         }
     }
@@ -1072,9 +1072,9 @@ void Mat::push_back(const Mat& elems)
     bool eq = size == elems.size;
     size.p[0] = int(r);
     if( !eq )
-        CV_Error(CV_StsUnmatchedSizes, "Pushed vector length is not equal to matrix row length");
+        CV_Error(cv::Error::StsUnmatchedSizes, "Pushed vector length is not equal to matrix row length");
     if( type() != elems.type() )
-        CV_Error(CV_StsUnmatchedFormats, "Pushed vector type is not the same as matrix type");
+        CV_Error(cv::Error::StsUnmatchedFormats, "Pushed vector type is not the same as matrix type");
 
     if( isSubmatrix() || dataend + step.p[0]*delta > datalimit )
         reserve( std::max(r + delta, (r*3+1)/2) );
@@ -1128,7 +1128,7 @@ Mat& Mat::adjustROI( int dtop, int dbottom, int dleft, int dright )
     if(col1 > col2)
         std::swap(col1, col2);
 
-    data += (row1 - ofs.y)*step + (col1 - ofs.x)*esz;
+    data += (row1 - ofs.y)*(std::ptrdiff_t)step + (col1 - ofs.x)*(std::ptrdiff_t)esz;
     rows = row2 - row1; cols = col2 - col1;
     size.p[0] = rows; size.p[1] = cols;
     updateContinuityFlag();
@@ -1170,16 +1170,16 @@ Mat Mat::reshape(int new_cn, int new_rows) const
     {
         int total_size = total_width * rows;
         if( !isContinuous() )
-            CV_Error( CV_BadStep,
+            CV_Error( cv::Error::BadStep,
             "The matrix is not continuous, thus its number of rows can not be changed" );
 
         if( (unsigned)new_rows > (unsigned)total_size )
-            CV_Error( CV_StsOutOfRange, "Bad new number of rows" );
+            CV_Error( cv::Error::StsOutOfRange, "Bad new number of rows" );
 
         total_width = total_size / new_rows;
 
         if( total_width * new_rows != total_size )
-            CV_Error( CV_StsBadArg, "The total number of matrix elements "
+            CV_Error( cv::Error::StsBadArg, "The total number of matrix elements "
                                     "is not divisible by the new number of rows" );
 
         hdr.rows = new_rows;
@@ -1189,7 +1189,7 @@ Mat Mat::reshape(int new_cn, int new_rows) const
     int new_width = total_width / new_cn;
 
     if( new_width * new_cn != total_width )
-        CV_Error( CV_BadNumChannels,
+        CV_Error( cv::Error::BadNumChannels,
         "The total width is not divisible by the new number of channels" );
 
     hdr.cols = new_width;
@@ -1231,13 +1231,13 @@ Mat Mat::reshape(int _cn, int _newndims, const int* _newsz) const
             else if (i < dims)
                 newsz_buf[i] = this->size[i];
             else
-                CV_Error(CV_StsOutOfRange, "Copy dimension (which has zero size) is not present in source matrix");
+                CV_Error(cv::Error::StsOutOfRange, "Copy dimension (which has zero size) is not present in source matrix");
 
             total_elem1 *= (size_t)newsz_buf[i];
         }
 
         if (total_elem1 != total_elem1_ref)
-            CV_Error(CV_StsUnmatchedSizes, "Requested and source matrices have different count of elements");
+            CV_Error(cv::Error::StsUnmatchedSizes, "Requested and source matrices have different count of elements");
 
         Mat hdr = *this;
         hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((_cn-1) << CV_CN_SHIFT);
@@ -1246,7 +1246,7 @@ Mat Mat::reshape(int _cn, int _newndims, const int* _newsz) const
         return hdr;
     }
 
-    CV_Error(CV_StsNotImplemented, "Reshaping of n-dimensional non-continuous matrices is not supported yet");
+    CV_Error(cv::Error::StsNotImplemented, "Reshaping of n-dimensional non-continuous matrices is not supported yet");
     // TBD
 }
 
diff --git a/modules/core/src/matrix_c.cpp b/modules/core/src/matrix_c.cpp
index baa61bb66fb9..13c9c11dd4ca 100644
--- a/modules/core/src/matrix_c.cpp
+++ b/modules/core/src/matrix_c.cpp
@@ -163,7 +163,7 @@ Mat cvarrToMat(const CvArr* arr, bool copyData,
     {
         const IplImage* iplimg = (const IplImage*)arr;
         if( coiMode == 0 && iplimg->roi && iplimg->roi->coi > 0 )
-            CV_Error(CV_BadCOI, "COI is not supported by the function");
+            CV_Error(cv::Error::BadCOI, "COI is not supported by the function");
         return iplImageToMat(iplimg, copyData);
     }
     if( CV_IS_SEQ(arr) )
@@ -187,7 +187,7 @@ Mat cvarrToMat(const CvArr* arr, bool copyData,
         cvCvtSeqToArray(seq, buf.ptr(), CV_WHOLE_SEQ);
         return buf;
     }
-    CV_Error(CV_StsBadArg, "Unknown array type");
+    CV_Error(cv::Error::StsBadArg, "Unknown array type");
 }
 
 void extractImageCOI(const CvArr* arr, OutputArray _ch, int coi)
@@ -269,14 +269,14 @@ cvReduce( const CvArr* srcarr, CvArr* dstarr, int dim, int op )
         dim = src.rows > dst.rows ? 0 : src.cols > dst.cols ? 1 : dst.cols == 1;
 
     if( dim > 1 )
-        CV_Error( CV_StsOutOfRange, "The reduced dimensionality index is out of range" );
+        CV_Error( cv::Error::StsOutOfRange, "The reduced dimensionality index is out of range" );
 
     if( (dim == 0 && (dst.cols != src.cols || dst.rows != 1)) ||
         (dim == 1 && (dst.rows != src.rows || dst.cols != 1)) )
-        CV_Error( CV_StsBadSize, "The output array size is incorrect" );
+        CV_Error( cv::Error::StsBadSize, "The output array size is incorrect" );
 
     if( src.channels() != dst.channels() )
-        CV_Error( CV_StsUnmatchedFormats, "Input and output arrays must have the same number of channels" );
+        CV_Error( cv::Error::StsUnmatchedFormats, "Input and output arrays must have the same number of channels" );
 
     cv::reduce(src, dst, dim, op, dst.type());
 }
@@ -333,7 +333,7 @@ cvRange( CvArr* arr, double start, double end )
                 fdata[j] = (float)val;
     }
     else
-        CV_Error( CV_StsUnsupportedFormat, "The function only supports 32sC1 and 32fC1 datatypes" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "The function only supports 32sC1 and 32fC1 datatypes" );
 
     return arr;
 }
diff --git a/modules/core/src/matrix_expressions.cpp b/modules/core/src/matrix_expressions.cpp
index 44ac8f171331..dd39d50a016d 100644
--- a/modules/core/src/matrix_expressions.cpp
+++ b/modules/core/src/matrix_expressions.cpp
@@ -21,7 +21,7 @@ static void checkOperandsExist(const Mat& a)
 {
     if (a.empty())
     {
-        CV_Error(CV_StsBadArg, "Matrix operand is an empty matrix.");
+        CV_Error(cv::Error::StsBadArg, "Matrix operand is an empty matrix.");
     }
 }
 
@@ -29,7 +29,7 @@ static void checkOperandsExist(const Mat& a, const Mat& b)
 {
     if (a.empty() || b.empty())
     {
-        CV_Error(CV_StsBadArg, "One or more matrix operands are empty.");
+        CV_Error(cv::Error::StsBadArg, "One or more matrix operands are empty.");
     }
 }
 
@@ -1456,7 +1456,7 @@ void MatOp_Bin::assign(const MatExpr& e, Mat& m, int _type) const
     else if( e.flags == 'a' && !e.b.data )
         cv::absdiff(e.a, e.s, dst);
     else
-        CV_Error(CV_StsError, "Unknown operation");
+        CV_Error(cv::Error::StsError, "Unknown operation");
 
     if( dst.data != m.data )
         dst.convertTo(m, _type);
@@ -1691,7 +1691,7 @@ void MatOp_Initializer::assign(const MatExpr& e, Mat& m, int _type) const
     else if( e.flags == '1' )
         m = Scalar(e.alpha);
     else
-        CV_Error(CV_StsError, "Invalid matrix initializer type");
+        CV_Error(cv::Error::StsError, "Invalid matrix initializer type");
 }
 
 void MatOp_Initializer::multiply(const MatExpr& e, double s, MatExpr& res) const
diff --git a/modules/core/src/matrix_operations.cpp b/modules/core/src/matrix_operations.cpp
index 94e0c2b50b89..2992f00e739e 100644
--- a/modules/core/src/matrix_operations.cpp
+++ b/modules/core/src/matrix_operations.cpp
@@ -954,7 +954,7 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
     }
 
     if( !func )
-        CV_Error( CV_StsUnsupportedFormat,
+        CV_Error( cv::Error::StsUnsupportedFormat,
                   "Unsupported combination of input and output array formats" );
 
     func( src, temp );
@@ -1259,7 +1259,7 @@ void cv::sort( InputArray _src, OutputArray _dst, int flags )
     Mat dst = _dst.getMat();
     CV_IPP_RUN_FAST(ipp_sort(src, dst, flags));
 
-    static SortFunc tab[] =
+    static SortFunc tab[CV_DEPTH_MAX] =
     {
         sort_<uchar>, sort_<schar>, sort_<ushort>, sort_<short>,
         sort_<int>, sort_<float>, sort_<double>, 0
@@ -1284,7 +1284,7 @@ void cv::sortIdx( InputArray _src, OutputArray _dst, int flags )
 
     CV_IPP_RUN_FAST(ipp_sortIdx(src, dst, flags));
 
-    static SortFunc tab[] =
+    static SortFunc tab[CV_DEPTH_MAX] =
     {
         sortIdx_<uchar>, sortIdx_<schar>, sortIdx_<ushort>, sortIdx_<short>,
         sortIdx_<int>, sortIdx_<float>, sortIdx_<double>, 0
diff --git a/modules/core/src/matrix_sparse.cpp b/modules/core/src/matrix_sparse.cpp
index 173f9ea8f64a..cfc769e79163 100644
--- a/modules/core/src/matrix_sparse.cpp
+++ b/modules/core/src/matrix_sparse.cpp
@@ -37,7 +37,7 @@ typedef void (*ConvertScaleData)(const void* from, void* to, int cn, double alph
 
 static ConvertData getConvertElem(int fromType, int toType)
 {
-    static ConvertData tab[][8] =
+    static ConvertData tab[CV_DEPTH_MAX][CV_DEPTH_MAX] =
     {{ convertData_<uchar, uchar>, convertData_<uchar, schar>,
       convertData_<uchar, ushort>, convertData_<uchar, short>,
       convertData_<uchar, int>, convertData_<uchar, float>,
@@ -82,7 +82,7 @@ static ConvertData getConvertElem(int fromType, int toType)
 
 static ConvertScaleData getConvertScaleElem(int fromType, int toType)
 {
-    static ConvertScaleData tab[][8] =
+    static ConvertScaleData tab[CV_DEPTH_MAX][CV_DEPTH_MAX] =
     {{ convertScaleData_<uchar, uchar>, convertScaleData_<uchar, schar>,
       convertScaleData_<uchar, ushort>, convertScaleData_<uchar, short>,
       convertScaleData_<uchar, int>, convertScaleData_<uchar, float>,
@@ -389,6 +389,7 @@ void SparseMat::convertTo( SparseMat& m, int rtype, double alpha ) const
     if( alpha == 1 )
     {
         ConvertData cvtfunc = getConvertElem(type(), rtype);
+        CV_Assert(cvtfunc);
         for( size_t i = 0; i < N; i++, ++from )
         {
             const Node* n = from.node();
@@ -399,6 +400,7 @@ void SparseMat::convertTo( SparseMat& m, int rtype, double alpha ) const
     else
     {
         ConvertScaleData cvtfunc = getConvertScaleElem(type(), rtype);
+        CV_Assert(cvtfunc);
         for( size_t i = 0; i < N; i++, ++from )
         {
             const Node* n = from.node();
@@ -758,7 +760,7 @@ double norm( const SparseMat& src, int normType )
             }
     }
     else
-        CV_Error( CV_StsUnsupportedFormat, "Only 32f and 64f are supported" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "Only 32f and 64f are supported" );
 
     if( normType == NORM_L2 )
         result = std::sqrt(result);
@@ -821,7 +823,7 @@ void minMaxLoc( const SparseMat& src, double* _minval, double* _maxval, int* _mi
             *_maxval = maxval;
     }
     else
-        CV_Error( CV_StsUnsupportedFormat, "Only 32f and 64f are supported" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "Only 32f and 64f are supported" );
 
     if( _minidx && minidx )
         for( i = 0; i < d; i++ )
@@ -843,7 +845,7 @@ void normalize( const SparseMat& src, SparseMat& dst, double a, int norm_type )
         scale = scale > DBL_EPSILON ? a/scale : 0.;
     }
     else
-        CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
+        CV_Error( cv::Error::StsBadArg, "Unknown/unsupported norm type" );
 
     src.convertTo( dst, -1, scale );
 }
diff --git a/modules/core/src/matrix_transform.cpp b/modules/core/src/matrix_transform.cpp
index 57fd0c65091f..bad17e7b6b61 100644
--- a/modules/core/src/matrix_transform.cpp
+++ b/modules/core/src/matrix_transform.cpp
@@ -4,9 +4,11 @@
 
 #include "precomp.hpp"
 #include "opencl_kernels_core.hpp"
+#include "hal_replacement.hpp"
 #include "opencv2/core/detail/dispatch_helper.impl.hpp"
 
 #include <algorithm> // std::swap_ranges
+#include <numeric> // std::accumulate
 
 namespace cv {
 
@@ -267,6 +269,8 @@ void transpose( InputArray _src, OutputArray _dst )
         return;
     }
 
+    CALL_HAL(transpose2d, cv_hal_transpose2d, src.data, src.step, dst.data, dst.step, src.cols, src.rows, esz);
+
     CV_IPP_RUN_FAST(ipp_transpose(src, dst))
 
     if( dst.data == src.data )
@@ -354,10 +358,10 @@ void transposeND(InputArray src_, const std::vector<int>& order, OutputArray dst
 #if CV_SIMD128
 template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
 {
-    typedef typename V::lane_type T;
+    typedef typename VTraits<V>::lane_type T;
     int end = (int)(size.width*esz);
     int width = (end + 1)/2;
-    int width_1 = width & -v_uint8x16::nlanes;
+    int width_1 = width & -VTraits<v_uint8x16>::vlanes();
     int i, j;
 
 #if CV_STRONG_ALIGNMENT
@@ -366,15 +370,15 @@ template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, s
 
     for( ; size.height--; src += sstep, dst += dstep )
     {
-        for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
+        for( i = 0, j = end; i < width_1; i += VTraits<v_uint8x16>::vlanes(), j -= VTraits<v_uint8x16>::vlanes() )
         {
             V t0, t1;
 
             t0 = v_load((T*)((uchar*)src + i));
-            t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes));
+            t1 = v_load((T*)((uchar*)src + j - VTraits<v_uint8x16>::vlanes()));
             t0 = v_reverse(t0);
             t1 = v_reverse(t1);
-            v_store((T*)(dst + j - v_uint8x16::nlanes), t0);
+            v_store((T*)(dst + j - VTraits<v_uint8x16>::vlanes()), t0);
             v_store((T*)(dst + i), t1);
         }
         if (isAligned<sizeof(T)>(src, dst))
@@ -440,18 +444,18 @@ template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const
 static void
 flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
 {
-#if CV_SIMD
+#if CV_SIMD128
 #if CV_STRONG_ALIGNMENT
     size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
 #endif
-    if (esz == 2 * v_uint8x16::nlanes)
+    if (esz == 2 * (size_t)VTraits<v_uint8x16>::vlanes())
     {
         int end = (int)(size.width*esz);
         int width = end/2;
 
         for( ; size.height--; src += sstep, dst += dstep )
         {
-            for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes )
+            for( int i = 0, j = end - 2 * VTraits<v_uint8x16>::vlanes(); i < width; i += 2 * VTraits<v_uint8x16>::vlanes(), j -= 2 * VTraits<v_uint8x16>::vlanes() )
             {
 #if CV_SIMD256
                 v_uint8x32 t0, t1;
@@ -464,25 +468,25 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
                 v_uint8x16 t0, t1, t2, t3;
 
                 t0 = v_load((uchar*)src + i);
-                t1 = v_load((uchar*)src + i + v_uint8x16::nlanes);
+                t1 = v_load((uchar*)src + i + VTraits<v_uint8x16>::vlanes());
                 t2 = v_load((uchar*)src + j);
-                t3 = v_load((uchar*)src + j + v_uint8x16::nlanes);
+                t3 = v_load((uchar*)src + j + VTraits<v_uint8x16>::vlanes());
                 v_store(dst + j, t0);
-                v_store(dst + j + v_uint8x16::nlanes, t1);
+                v_store(dst + j + VTraits<v_uint8x16>::vlanes(), t1);
                 v_store(dst + i, t2);
-                v_store(dst + i + v_uint8x16::nlanes, t3);
+                v_store(dst + i + VTraits<v_uint8x16>::vlanes(), t3);
 #endif
             }
         }
     }
-    else if (esz == v_uint8x16::nlanes)
+    else if (esz == (size_t)VTraits<v_uint8x16>::vlanes())
     {
         int end = (int)(size.width*esz);
         int width = end/2;
 
         for( ; size.height--; src += sstep, dst += dstep )
         {
-            for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
+            for( int i = 0, j = end - VTraits<v_uint8x16>::vlanes(); i < width; i += VTraits<v_uint8x16>::vlanes(), j -= VTraits<v_uint8x16>::vlanes() )
             {
                 v_uint8x16 t0, t1;
 
@@ -532,19 +536,19 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
 
         for( ; size.height--; src += sstep, dst += dstep )
         {
-            for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) )
+            for ( int i = 0, j = end; i < width; i += VTraits<v_uint8x16>::vlanes() + sizeof(uint64_t), j -= VTraits<v_uint8x16>::vlanes() + sizeof(uint64_t) )
             {
                 v_uint8x16 t0, t1;
                 uint64_t t2, t3;
 
                 t0 = v_load((uchar*)src + i);
-                t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes));
-                t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t));
+                t2 = *((uint64_t*)((uchar*)src + i + VTraits<v_uint8x16>::vlanes()));
+                t1 = v_load((uchar*)src + j - VTraits<v_uint8x16>::vlanes() - sizeof(uint64_t));
                 t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t)));
-                v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0);
+                v_store(dst + j - VTraits<v_uint8x16>::vlanes() - sizeof(uint64_t), t0);
                 *((uint64_t*)(dst + j - sizeof(uint64_t))) = t2;
                 v_store(dst + i, t1);
-                *((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3;
+                *((uint64_t*)(dst + i + VTraits<v_uint8x16>::vlanes())) = t3;
             }
         }
     }
@@ -563,7 +567,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
     }
 #endif
     else
-#endif // CV_SIMD
+#endif // CV_SIMD128
     {
         int i, j, limit = (int)(((size.width + 1)/2)*esz);
         AutoBuffer<int> _tab(size.width*esz);
@@ -596,23 +600,23 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
                                                   dst0 += dstep, dst1 -= dstep )
     {
         int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 #if CV_STRONG_ALIGNMENT
         if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
 #endif
         {
-            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
+            for (; i <= size.width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
             {
-                v_int32 t0 = vx_load((int*)(src0 + i));
-                v_int32 t1 = vx_load((int*)(src1 + i));
-                v_store((int*)(dst0 + i), t1);
-                v_store((int*)(dst1 + i), t0);
+                v_int32 t0 = v_reinterpret_as_s32(vx_load(src0 + i));
+                v_int32 t1 = v_reinterpret_as_s32(vx_load(src1 + i));
+                v_store(dst0 + i, v_reinterpret_as_u8(t1));
+                v_store(dst1 + i, v_reinterpret_as_u8(t0));
             }
         }
 #if CV_STRONG_ALIGNMENT
         else
         {
-            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
+            for (; i <= size.width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
             {
                 v_uint8 t0 = vx_load(src0 + i);
                 v_uint8 t1 = vx_load(src1 + i);
@@ -801,6 +805,9 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
     _dst.create( size, type );
     Mat dst = _dst.getMat();
 
+    CALL_HAL(flip, cv_hal_flip, type, src.ptr(), src.step, src.cols, src.rows,
+             dst.ptr(), dst.step, flip_mode);
+
     CV_IPP_RUN_FAST(ipp_flip(src, dst, flip_mode));
 
     size_t esz = CV_ELEM_SIZE(type);
@@ -857,10 +864,225 @@ void flipND(InputArray _src, OutputArray _dst, int _axis)
     flipNDImpl(dst.ptr(), dst.size.p, dst.step.p, axis);
 }
 
-void rotate(InputArray _src, OutputArray _dst, int rotateMode)
-{
-    CV_Assert(_src.dims() <= 2);
+/*
+    This function first prepends 1 to each tensor shape to have a common max_ndims dimension, then flatten non-broadcast dimensions.
+*/
+static bool _flatten_for_broadcast(int narrays, int max_ndims, const int* ndims, const int** orig_shape,
+                                   int** flatten_shape, size_t** flatten_step) {
+    int i, j, k;
+
+    // step 1.
+    // * make all inputs and the output max_ndims-dimensional.
+    // * compute proper step's
+    for (i = max_ndims - 1; i >= 0; i-- ) {
+        for (k = 0; k < narrays; k++) {
+            j = ndims[k] - (max_ndims - i);
+            int sz_i = j >= 0 ? orig_shape[k][j] : 1;
+            size_t st_i = i == max_ndims - 1 ? 1 : flatten_step[k][i+1] * flatten_shape[k][i+1];
+            flatten_shape[k][i] = sz_i;
+            flatten_step[k][i] = st_i;
+            if (flatten_shape[k][i] == 0)
+                return false;
+        }
+    }
+
+    // step 2. Let's do the flattening first,
+    // since we'd need proper values of steps to check continuity.
+    // this loop is probably the most tricky part
+    // in the whole implementation of broadcasting.
+    j = max_ndims-1;
+    for (i = j - 1; i >= 0; i--) {
+        bool all_contiguous = true, all_scalars = true, all_consistent = true;
+        for(k = 0; k < narrays; k++) {
+            size_t st = flatten_step[k][j] * flatten_shape[k][j];
+            bool prev_scalar = flatten_shape[k][j] == 1;
+            bool scalar = flatten_shape[k][i] == 1;
+            all_contiguous = all_contiguous && (st == flatten_step[k][i]);
+            all_scalars = all_scalars && scalar;
+            all_consistent = all_consistent && (scalar == prev_scalar);
+        }
+        if (all_contiguous && (all_consistent || all_scalars)) {
+            for(k = 0; k < narrays; k++)
+                flatten_shape[k][j] *= flatten_shape[k][i];
+        } else {
+            j--;
+            if (i < j) {
+                for(k = 0; k < narrays; k++) {
+                    flatten_shape[k][j] = flatten_shape[k][i];
+                    flatten_step[k][j] = flatten_step[k][i];
+                }
+            }
+        }
+    }
+
+    // step 3. Set some step's to 0's.
+    for (i = max_ndims-1; i >= j; i--) {
+        for (k = 0; k < narrays; k++)
+            flatten_step[k][i] = flatten_shape[k][i] == 1 ? 0 : flatten_step[k][i];
+    }
+    for (; i >= 0; i--) {
+        for (k = 0; k < narrays; k++) {
+            flatten_step[k][i] = 0;
+            flatten_shape[k][i] = 1;
+        }
+    }
+    return true;
+}
+
+void broadcast(InputArray _src, InputArray _shape, OutputArray _dst) {
+    CV_INSTRUMENT_REGION();
+
+    Mat src = _src.getMat();
+    CV_CheckTrue(src.isContinuous(), "broadcast: input array must be contiguous");
+    CV_CheckChannelsEQ(src.channels(), 1, "broadcast: input array must be single channel");
+
+    Mat shape = _shape.getMat();
+    CV_CheckTypeEQ(shape.type(), CV_32S, "broadcast: target shape must be of type int32");
+    const auto dims_shape = static_cast<int>(shape.total());
+    const auto *ptr_shape = shape.ptr<int>();
+
+    // check valid shape, 1D/0D Mat would fail in the following checks
+    const auto dims_src = src.dims;
+    CV_CheckLE(dims_src, dims_shape,
+               "broadcast: dimension of input array must be less than or equal to dimension of target shape");
+    std::vector<int> shape_src{src.size.p, src.size.p + dims_src};
+    if (shape_src.size() < static_cast<size_t>(dims_shape)) {
+        shape_src.insert(shape_src.begin(), dims_shape - shape_src.size(), 1);
+    }
+    for (int i = 0; i < static_cast<int>(shape_src.size()); ++i) {
+        const auto *shape_target = ptr_shape;
+        if (shape_src[i] != 1) {
+            CV_CheckEQ(shape_src[i], shape_target[i], "target shape must be equal to input shape or 1");
+        }
+    }
 
+    // impl
+    _dst.create(dims_shape, shape.ptr<int>(), src.type());
+    Mat dst = _dst.getMat();
+    std::vector<int> is_same_shape(dims_shape, 0);
+    for (int i = 0; i < static_cast<int>(shape_src.size()); ++i) {
+        if (shape_src[i] == ptr_shape[i]) {
+            is_same_shape[i] = 1;
+        }
+    }
+    // copy if same shape
+    if (std::accumulate(is_same_shape.begin(), is_same_shape.end(), 1, std::multiplies<int>()) != 0) {
+        const auto *p_src = src.ptr<const char>();
+        auto *p_dst = dst.ptr<char>();
+        std::memcpy(p_dst, p_src, dst.total() * dst.elemSize());
+        return;
+    }
+    // other cases
+    int max_ndims = std::max(dims_src, dims_shape);
+    const int all_ndims[2] = {src.dims, dst.dims};
+    const int* orig_shapes[2] = {src.size.p, dst.size.p};
+    cv::AutoBuffer<size_t> buff(max_ndims * 4);
+    int* flatten_shapes[2] = {(int*)buff.data(), (int*)(buff.data() + max_ndims)};
+    size_t* flatten_steps[2] = {(size_t*)(buff.data() + 2 * max_ndims), (size_t*)(buff.data() + 3 * max_ndims)};
+    if (_flatten_for_broadcast(2, max_ndims, all_ndims, orig_shapes, flatten_shapes, flatten_steps)) {
+        size_t src_dp = flatten_steps[0][max_ndims - 1];
+        size_t dst_dp = flatten_steps[1][max_ndims - 1];
+        CV_Assert(dst_dp == 1);
+        CV_Assert(max_ndims >= 2); // >= 3?
+        size_t rowstep_src = flatten_steps[0][max_ndims - 2];
+        size_t rowstep_dst = flatten_steps[1][max_ndims - 2];
+        const char* ptr_src = src.ptr<const char>();
+        char* ptr_dst = dst.ptr<char>();
+        size_t esz = src.elemSize();
+        int nrows = flatten_shapes[1][max_ndims - 2];
+        int ncols = flatten_shapes[1][max_ndims - 1];
+        int nplanes = 1;
+        CV_Check(esz, esz == 1 || esz == 2 || esz == 4 || esz == 8, "broadcast: not supported data type");
+
+        for (int k = 0; k < max_ndims - 2; k++) {
+            nplanes *= flatten_shapes[1][k];
+        }
+        for (int plane_idx = 0; plane_idx < nplanes; plane_idx++) {
+            size_t offset_src = 0, offset_dst = 0;
+            size_t idx = (size_t)plane_idx;
+            for (int k = max_ndims - 3; k >= 0; k--) {
+                size_t prev_idx = idx / flatten_shapes[1][k];
+                size_t i_k = (int)(idx - prev_idx * flatten_shapes[1][k]);
+                offset_src += i_k * flatten_steps[0][k];
+                offset_dst += i_k * flatten_steps[1][k];
+                idx = prev_idx;
+            }
+
+            #define OPENCV_CORE_BROADCAST_LOOP(_Tp) \
+                for (int i = 0; i < nrows; i++) {   \
+                    const _Tp *ptr_src_ = (const _Tp*)ptr_src + offset_src + rowstep_src * i; \
+                    _Tp *ptr_dst_ = (_Tp*)ptr_dst + offset_dst + rowstep_dst * i; \
+                    if (src_dp == 1) { \
+                        for (int j = 0; j < ncols; j++) { \
+                            ptr_dst_[j] = ptr_src_[j]; \
+                        } \
+                    } else { \
+                        _Tp x = *ptr_src_; \
+                        for (int j = 0; j < ncols; j++) { \
+                            ptr_dst_[j] = x; \
+                        } \
+                    } \
+                }
+
+            if (esz == 1) {
+                OPENCV_CORE_BROADCAST_LOOP(int8_t);
+            } else if (esz == 2) {
+                OPENCV_CORE_BROADCAST_LOOP(int16_t);
+            } else if (esz == 4) {
+                OPENCV_CORE_BROADCAST_LOOP(int32_t);
+            } else if (esz == 8) {
+                OPENCV_CORE_BROADCAST_LOOP(int64_t);
+            } else {
+                CV_Error(cv::Error::StsNotImplemented, "");
+            }
+            #undef OPENCV_CORE_BROADCAST_LOOP
+        }
+    } else {
+        // initial copy (src to dst)
+        std::vector<size_t> step_src{src.step.p, src.step.p + dims_src};
+        if (step_src.size() < static_cast<size_t>(dims_shape)) {
+            step_src.insert(step_src.begin(), dims_shape - step_src.size(), step_src[0]);
+        }
+        for (size_t i = 0; i < src.total(); ++i) {
+            size_t t = i;
+            size_t src_offset = 0, dst_offset = 0;
+            for (int j = static_cast<int>(shape_src.size() - 1); j >= 0; --j) {
+                size_t idx = t / shape_src[j];
+                size_t offset = static_cast<size_t>(t - idx * shape_src[j]);
+                src_offset += offset * step_src[j];
+                dst_offset += offset * dst.step[j];
+                t = idx;
+            }
+            const auto *p_src = src.ptr<const char>();
+            auto *p_dst = dst.ptr<char>();
+            std::memcpy(p_dst + dst_offset, p_src + src_offset, dst.elemSize());
+        }
+        // broadcast copy (dst inplace)
+        std::vector<int> cumulative_shape(dims_shape, 1);
+        int total = static_cast<int>(dst.total());
+        for (int i = dims_shape - 1; i >= 0; --i) {
+            cumulative_shape[i] = static_cast<int>(total / ptr_shape[i]);
+            total = cumulative_shape[i];
+        }
+        for (int i = dims_shape - 1; i >= 0; --i) {
+            if (is_same_shape[i] == 1) {
+                continue;
+            }
+            auto step = dst.step[i];
+            auto *p_dst = dst.ptr<char>();
+            for (int j = 0; j < cumulative_shape[i]; j++) {
+                for (int k = 0; k < ptr_shape[i] - 1; k++) {
+                    std::memcpy(p_dst + step, p_dst, step);
+                    p_dst += step;
+                }
+                p_dst += step;
+            }
+        }
+    }
+}
+
+static void rotateImpl(InputArray _src, OutputArray _dst, int rotateMode)
+{
     switch (rotateMode)
     {
     case ROTATE_90_CLOCKWISE:
@@ -879,4 +1101,51 @@ void rotate(InputArray _src, OutputArray _dst, int rotateMode)
     }
 }
 
+void rotate(InputArray _src, OutputArray _dst, int rotateMode)
+{
+    CV_Assert(_src.dims() <= 2);
+    int angle;
+
+    if (_dst.isUMat())
+    {
+        rotateImpl(_src, _dst, rotateMode);
+        return;
+    }
+
+    Mat src = _src.getMat();
+    int type = src.type();
+    if( src.empty() )
+    {
+        _dst.release();
+        return;
+    }
+
+    switch (rotateMode)
+    {
+    case ROTATE_90_CLOCKWISE:
+        _dst.create(src.cols, src.rows, type);
+        angle = 90;
+        break;
+    case ROTATE_180:
+        _dst.create(src.rows, src.cols, type);
+        angle = 180;
+        break;
+    case ROTATE_90_COUNTERCLOCKWISE:
+        _dst.create(src.cols, src.rows, type);
+        angle = 270;
+        break;
+    default:
+        _dst.create(src.rows, src.cols, type);
+        angle = 0;
+        break;
+    }
+
+    Mat dst = _dst.getMat();
+    CALL_HAL(rotate90, cv_hal_rotate90, type, src.ptr(), src.step, src.cols, src.rows,
+             dst.ptr(), dst.step, angle);
+
+    // use src (Mat) since _src (InputArray) is updated by _dst.create() when in-place
+    rotateImpl(src, _dst, rotateMode);
+}
+
 }  // namespace
diff --git a/modules/core/src/matrix_wrap.cpp b/modules/core/src/matrix_wrap.cpp
index bb61ce2de1b9..238fc642fd6b 100644
--- a/modules/core/src/matrix_wrap.cpp
+++ b/modules/core/src/matrix_wrap.cpp
@@ -948,7 +948,7 @@ bool _InputArray::isContinuous(int i) const
     if( k == CUDA_GPU_MAT )
       return i < 0 ? ((const cuda::GpuMat*)obj)->isContinuous() : true;
 
-    CV_Error(CV_StsNotImplemented, "Unknown/unsupported array type");
+    CV_Error(cv::Error::StsNotImplemented, "Unknown/unsupported array type");
 }
 
 bool _InputArray::isSubmatrix(int i) const
@@ -986,7 +986,7 @@ bool _InputArray::isSubmatrix(int i) const
         return vv[i].isSubmatrix();
     }
 
-    CV_Error(CV_StsNotImplemented, "");
+    CV_Error(cv::Error::StsNotImplemented, "");
 }
 
 size_t _InputArray::offset(int i) const
@@ -1466,14 +1466,14 @@ void _OutputArray::create(int d, const int* sizes, int mtype, int i,
             ((std::vector<Vec<int, 128> >*)v)->resize(len);
             break;
         default:
-            CV_Error_(CV_StsBadArg, ("Vectors with element size %d are not supported. Please, modify OutputArray::create()\n", esz));
+            CV_Error_(cv::Error::StsBadArg, ("Vectors with element size %d are not supported. Please, modify OutputArray::create()\n", esz));
         }
         return;
     }
 
     if( k == NONE )
     {
-        CV_Error(CV_StsNullPtr, "create() called for the missing output array" );
+        CV_Error(cv::Error::StsNullPtr, "create() called for the missing output array" );
     }
 
     if( k == STD_VECTOR_MAT )
@@ -1919,12 +1919,7 @@ void _OutputArray::move(UMat& u) const
     int k = kind();
     if (k == UMAT)
     {
-#ifdef CV_CXX11
         *(UMat*)obj = std::move(u);
-#else
-        *(UMat*)obj = u;
-        u.release();
-#endif
     }
     else if (k == MAT)
     {
@@ -1959,12 +1954,7 @@ void _OutputArray::move(Mat& m) const
     }
     else if (k == MAT)
     {
-#ifdef CV_CXX11
         *(Mat*)obj = std::move(m);
-#else
-        *(Mat*)obj = m;
-        m.release();
-#endif
     }
     else if (k == MATX)
     {
diff --git a/modules/core/src/mean.dispatch.cpp b/modules/core/src/mean.dispatch.cpp
index 6a5275ab43ff..b6ea92625546 100644
--- a/modules/core/src/mean.dispatch.cpp
+++ b/modules/core/src/mean.dispatch.cpp
@@ -8,20 +8,24 @@
 #include "opencv2/core/openvx/ovx_defs.hpp"
 #include "stat.hpp"
 
+#ifndef OPENCV_IPP_MEAN
 #undef HAVE_IPP
 #undef CV_IPP_RUN_FAST
 #define CV_IPP_RUN_FAST(f, ...)
 #undef CV_IPP_RUN
 #define CV_IPP_RUN(c, f, ...)
+#endif // OPENCV_IPP_MEAN
 
 #include "mean.simd.hpp"
 #include "mean.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
 
+#ifndef OPENCV_IPP_MEAN
 #undef HAVE_IPP
 #undef CV_IPP_RUN_FAST
 #define CV_IPP_RUN_FAST(f, ...)
 #undef CV_IPP_RUN
 #define CV_IPP_RUN(c, f, ...)
+#endif // OPENCV_IPP_MEAN
 
 namespace cv {
 
@@ -521,12 +525,55 @@ void meanStdDev(InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray
 
     Mat src = _src.getMat(), mask = _mask.getMat();
 
+    CV_Assert(mask.empty() || src.size == mask.size);
+
     CV_OVX_RUN(!ovx::skipSmallImages<VX_KERNEL_MEAN_STDDEV>(src.cols, src.rows),
                openvx_meanStdDev(src, _mean, _sdv, mask))
 
     CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_meanStdDev(src, _mean, _sdv, mask));
 
     int k, cn = src.channels(), depth = src.depth();
+    Mat mean_mat, stddev_mat;
+
+    if(_mean.needed())
+    {
+        if( !_mean.fixedSize() )
+            _mean.create(cn, 1, CV_64F, -1, true);
+
+        mean_mat = _mean.getMat();
+        int dcn = (int)mean_mat.total();
+        CV_Assert( mean_mat.type() == CV_64F && mean_mat.isContinuous() &&
+                   (mean_mat.cols == 1 || mean_mat.rows == 1) && dcn >= cn );
+    }
+
+    if (_sdv.needed())
+    {
+        if( !_sdv.fixedSize() )
+            _sdv.create(cn, 1, CV_64F, -1, true);
+
+        stddev_mat = _sdv.getMat();
+        int dcn = (int)stddev_mat.total();
+        CV_Assert( stddev_mat.type() == CV_64F && stddev_mat.isContinuous() &&
+                   (stddev_mat.cols == 1 || stddev_mat.rows == 1) && dcn >= cn );
+    }
+
+    if (src.isContinuous() && mask.isContinuous())
+    {
+        CALL_HAL(meanStdDev, cv_hal_meanStdDev, src.data, 0, (int)src.total(), 1, src.type(),
+                 _mean.needed() ? mean_mat.ptr<double>() : nullptr,
+                 _sdv.needed() ? stddev_mat.ptr<double>() : nullptr,
+                 mask.data, 0);
+    }
+    else
+    {
+        if (src.dims <= 2)
+        {
+            CALL_HAL(meanStdDev, cv_hal_meanStdDev, src.data, src.step, src.cols, src.rows, src.type(),
+                     _mean.needed() ? mean_mat.ptr<double>() : nullptr,
+                     _sdv.needed() ? stddev_mat.ptr<double>() : nullptr,
+                     mask.data, mask.step);
+        }
+    }
 
     SumSqrFunc func = getSumSqrFunc(depth);
 
@@ -596,20 +643,22 @@ void meanStdDev(InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray
         sq[k] = std::sqrt(std::max(sq[k]*scale - s[k]*s[k], 0.));
     }
 
-    for( j = 0; j < 2; j++ )
+    if (_mean.needed())
     {
-        const double* sptr = j == 0 ? s : sq;
-        _OutputArray _dst = j == 0 ? _mean : _sdv;
-        if( !_dst.needed() )
-            continue;
+        const double* sptr = s;
+        int dcn = (int)mean_mat.total();
+        double* dptr = mean_mat.ptr<double>();
+        for( k = 0; k < cn; k++ )
+            dptr[k] = sptr[k];
+        for( ; k < dcn; k++ )
+            dptr[k] = 0;
+    }
 
-        if( !_dst.fixedSize() )
-            _dst.create(cn, 1, CV_64F, -1, true);
-        Mat dst = _dst.getMat();
-        int dcn = (int)dst.total();
-        CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
-                   (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
-        double* dptr = dst.ptr<double>();
+    if (_sdv.needed())
+    {
+        const double* sptr = sq;
+        int dcn = (int)stddev_mat.total();
+        double* dptr = stddev_mat.ptr<double>();
         for( k = 0; k < cn; k++ )
             dptr[k] = sptr[k];
         for( ; k < dcn; k++ )
diff --git a/modules/core/src/mean.simd.hpp b/modules/core/src/mean.simd.hpp
index d94c8872233b..c6bbc20b892a 100644
--- a/modules/core/src/mean.simd.hpp
+++ b/modules/core/src/mean.simd.hpp
@@ -24,7 +24,7 @@ struct SumSqr_SIMD
     }
 };
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 template <>
 struct SumSqr_SIMD<uchar, int, int>
@@ -39,37 +39,37 @@ struct SumSqr_SIMD<uchar, int, int>
         v_int32 v_sum = vx_setzero_s32();
         v_int32 v_sqsum = vx_setzero_s32();
 
-        const int len0 = len & -v_uint8::nlanes;
+        const int len0 = len & -VTraits<v_uint8>::vlanes();
         while(x < len0)
         {
-            const int len_tmp = min(x + 256*v_uint16::nlanes, len0);
+            const int len_tmp = min(x + 256*VTraits<v_uint16>::vlanes(), len0);
             v_uint16 v_sum16 = vx_setzero_u16();
-            for ( ; x < len_tmp; x += v_uint8::nlanes)
+            for ( ; x < len_tmp; x += VTraits<v_uint8>::vlanes())
             {
                 v_uint16 v_src0 = vx_load_expand(src0 + x);
-                v_uint16 v_src1 = vx_load_expand(src0 + x + v_uint16::nlanes);
-                v_sum16 += v_src0 + v_src1;
+                v_uint16 v_src1 = vx_load_expand(src0 + x + VTraits<v_uint16>::vlanes());
+                v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
                 v_int16 v_tmp0, v_tmp1;
                 v_zip(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_src1), v_tmp0, v_tmp1);
-                v_sqsum += v_dotprod(v_tmp0, v_tmp0) + v_dotprod(v_tmp1, v_tmp1);
+                v_sqsum = v_add(v_sqsum, v_add(v_dotprod(v_tmp0, v_tmp0), v_dotprod(v_tmp1, v_tmp1)));
             }
             v_uint32 v_half0, v_half1;
             v_expand(v_sum16, v_half0, v_half1);
-            v_sum += v_reinterpret_as_s32(v_half0 + v_half1);
+            v_sum = v_add(v_sum, v_reinterpret_as_s32(v_add(v_half0, v_half1)));
         }
-        if (x <= len - v_uint16::nlanes)
+        if (x <= len - VTraits<v_uint16>::vlanes())
         {
             v_uint16 v_src = vx_load_expand(src0 + x);
             v_uint16 v_half = v_combine_high(v_src, v_src);
 
             v_uint32 v_tmp0, v_tmp1;
-            v_expand(v_src + v_half, v_tmp0, v_tmp1);
-            v_sum += v_reinterpret_as_s32(v_tmp0);
+            v_expand(v_add(v_src, v_half), v_tmp0, v_tmp1);
+            v_sum = v_add(v_sum, v_reinterpret_as_s32(v_tmp0));
 
             v_int16 v_tmp2, v_tmp3;
             v_zip(v_reinterpret_as_s16(v_src), v_reinterpret_as_s16(v_half), v_tmp2, v_tmp3);
-            v_sqsum += v_dotprod(v_tmp2, v_tmp2);
-            x += v_uint16::nlanes;
+            v_sqsum = v_add(v_sqsum, v_dotprod(v_tmp2, v_tmp2));
+            x += VTraits<v_uint16>::vlanes();
         }
 
         if (cn == 1)
@@ -79,13 +79,13 @@ struct SumSqr_SIMD<uchar, int, int>
         }
         else
         {
-            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_int32::nlanes];
+            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_int32>::max_nlanes];
             v_store(ar, v_sum);
-            v_store(ar + v_int32::nlanes, v_sqsum);
-            for (int i = 0; i < v_int32::nlanes; ++i)
+            v_store(ar + VTraits<v_int32>::vlanes(), v_sqsum);
+            for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
             {
                 sum[i % cn] += ar[i];
-                sqsum[i % cn] += ar[v_int32::nlanes + i];
+                sqsum[i % cn] += ar[VTraits<v_int32>::vlanes() + i];
             }
         }
         v_cleanup();
@@ -106,37 +106,37 @@ struct SumSqr_SIMD<schar, int, int>
         v_int32 v_sum = vx_setzero_s32();
         v_int32 v_sqsum = vx_setzero_s32();
 
-        const int len0 = len & -v_int8::nlanes;
+        const int len0 = len & -VTraits<v_int8>::vlanes();
         while (x < len0)
         {
-            const int len_tmp = min(x + 256 * v_int16::nlanes, len0);
+            const int len_tmp = min(x + 256 * VTraits<v_int16>::vlanes(), len0);
             v_int16 v_sum16 = vx_setzero_s16();
-            for (; x < len_tmp; x += v_int8::nlanes)
+            for (; x < len_tmp; x += VTraits<v_int8>::vlanes())
             {
                 v_int16 v_src0 = vx_load_expand(src0 + x);
-                v_int16 v_src1 = vx_load_expand(src0 + x + v_int16::nlanes);
-                v_sum16 += v_src0 + v_src1;
+                v_int16 v_src1 = vx_load_expand(src0 + x + VTraits<v_int16>::vlanes());
+                v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
                 v_int16 v_tmp0, v_tmp1;
                 v_zip(v_src0, v_src1, v_tmp0, v_tmp1);
-                v_sqsum += v_dotprod(v_tmp0, v_tmp0) + v_dotprod(v_tmp1, v_tmp1);
+                v_sqsum = v_add(v_sqsum, v_add(v_dotprod(v_tmp0, v_tmp0), v_dotprod(v_tmp1, v_tmp1)));
             }
             v_int32 v_half0, v_half1;
             v_expand(v_sum16, v_half0, v_half1);
-            v_sum += v_half0 + v_half1;
+            v_sum = v_add(v_sum, v_add(v_half0, v_half1));
         }
-        if (x <= len - v_int16::nlanes)
+        if (x <= len - VTraits<v_int16>::vlanes())
         {
             v_int16 v_src = vx_load_expand(src0 + x);
             v_int16 v_half = v_combine_high(v_src, v_src);
 
             v_int32 v_tmp0, v_tmp1;
-            v_expand(v_src + v_half, v_tmp0, v_tmp1);
-            v_sum += v_tmp0;
+            v_expand(v_add(v_src, v_half), v_tmp0, v_tmp1);
+            v_sum = v_add(v_sum, v_tmp0);
 
             v_int16 v_tmp2, v_tmp3;
             v_zip(v_src, v_half, v_tmp2, v_tmp3);
-            v_sqsum += v_dotprod(v_tmp2, v_tmp2);
-            x += v_int16::nlanes;
+            v_sqsum = v_add(v_sqsum, v_dotprod(v_tmp2, v_tmp2));
+            x += VTraits<v_int16>::vlanes();
         }
 
         if (cn == 1)
@@ -146,13 +146,13 @@ struct SumSqr_SIMD<schar, int, int>
         }
         else
         {
-            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_int32::nlanes];
+            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_int32>::max_nlanes];
             v_store(ar, v_sum);
-            v_store(ar + v_int32::nlanes, v_sqsum);
-            for (int i = 0; i < v_int32::nlanes; ++i)
+            v_store(ar + VTraits<v_int32>::vlanes(), v_sqsum);
+            for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
             {
                 sum[i % cn] += ar[i];
-                sqsum[i % cn] += ar[v_int32::nlanes + i];
+                sqsum[i % cn] += ar[VTraits<v_int32>::vlanes() + i];
             }
         }
         v_cleanup();
@@ -311,7 +311,7 @@ static int sqsum64f( const double* src, const uchar* mask, double* sum, double*
 SumSqrFunc getSumSqrFunc(int depth)
 {
     CV_INSTRUMENT_REGION();
-    static SumSqrFunc sumSqrTab[] =
+    static SumSqrFunc sumSqrTab[CV_DEPTH_MAX] =
     {
         (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s,
         (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0
diff --git a/modules/core/src/merge.dispatch.cpp b/modules/core/src/merge.dispatch.cpp
index b95dc7345da3..19a62d22b040 100644
--- a/modules/core/src/merge.dispatch.cpp
+++ b/modules/core/src/merge.dispatch.cpp
@@ -50,7 +50,7 @@ typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn);
 
 static MergeFunc getMergeFunc(int depth)
 {
-    static MergeFunc mergeTab[] =
+    static MergeFunc mergeTab[CV_DEPTH_MAX] =
     {
         (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u),
         (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
@@ -118,6 +118,7 @@ void merge(const Mat* mv, size_t n, OutputArray _dst)
     CV_INSTRUMENT_REGION();
 
     CV_Assert( mv && n > 0 );
+    CV_Assert(!mv[0].empty());
 
     int depth = mv[0].depth();
     bool allch1 = true;
diff --git a/modules/core/src/merge.simd.hpp b/modules/core/src/merge.simd.hpp
index ad08dd8879c9..d67a117c7baa 100644
--- a/modules/core/src/merge.simd.hpp
+++ b/modules/core/src/merge.simd.hpp
@@ -15,7 +15,7 @@ void merge64s(const int64** src, int64* dst, int len, int cn);
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 /*
   The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
   on IA there are instructions movntps and such to which
@@ -38,7 +38,7 @@ void merge64s(const int64** src, int64* dst, int len, int cn);
 template<typename T, typename VecT> static void
 vecmerge_( const T** src, T* dst, int len, int cn )
 {
-    const int VECSZ = VecT::nlanes;
+    const int VECSZ = VTraits<VecT>::vlanes();
     int i, i0 = 0;
     const T* src0 = src[0];
     const T* src1 = src[1];
@@ -173,8 +173,8 @@ merge_( const T** src, T* dst, int len, int cn )
 void merge8u(const uchar** src, uchar* dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_uint8>::vlanes() && 2 <= cn && cn <= 4 )
         vecmerge_<uchar, v_uint8>(src, dst, len, cn);
     else
 #endif
@@ -184,8 +184,8 @@ void merge8u(const uchar** src, uchar* dst, int len, int cn )
 void merge16u(const ushort** src, ushort* dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_uint16>::vlanes() && 2 <= cn && cn <= 4 )
         vecmerge_<ushort, v_uint16>(src, dst, len, cn);
     else
 #endif
@@ -195,8 +195,8 @@ void merge16u(const ushort** src, ushort* dst, int len, int cn )
 void merge32s(const int** src, int* dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_int32>::vlanes() && 2 <= cn && cn <= 4 )
         vecmerge_<int, v_int32>(src, dst, len, cn);
     else
 #endif
@@ -206,8 +206,8 @@ void merge32s(const int** src, int* dst, int len, int cn )
 void merge64s(const int64** src, int64* dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_int64>::vlanes() && 2 <= cn && cn <= 4 )
         vecmerge_<int64, v_int64>(src, dst, len, cn);
     else
 #endif
diff --git a/modules/core/src/minmax.cpp b/modules/core/src/minmax.cpp
index 092c5e9234d0..8c6d8ad9a9a6 100644
--- a/modules/core/src/minmax.cpp
+++ b/modules/core/src/minmax.cpp
@@ -11,11 +11,13 @@
 
 #include <algorithm>
 
+#ifndef OPENCV_IPP_MINMAX
 #undef HAVE_IPP
 #undef CV_IPP_RUN_FAST
 #define CV_IPP_RUN_FAST(f, ...)
 #undef CV_IPP_RUN
 #define CV_IPP_RUN(c, f, ...)
+#endif // OPENCV_IPP_MINMAX
 
 #define IPP_DISABLE_MINMAXIDX_MANY_ROWS 1  // see Core_MinMaxIdx.rows_overflow test
 
@@ -139,7 +141,7 @@ CV_ALWAYS_INLINE uint64_t v_reduce_min(const v_uint64x2& a)
 
 CV_ALWAYS_INLINE v_uint64x2 v_select(const v_uint64x2& mask, const v_uint64x2& a, const v_uint64x2& b)
 {
-    return b ^ ((a ^ b) & mask);
+    return v_xor(b, v_and(v_xor(a, b), mask));
 }
 #endif
 
@@ -149,16 +151,16 @@ minMaxIdx_reduce_##suffix( VT &valMin, VT &valMax, IT &idxMin, IT &idxMax, IT &n
                   T &minVal, T &maxVal, size_t &minIdx, size_t &maxIdx, \
                   size_t delta ) \
 { \
-    if ( v_check_any(idxMin != none) ) \
+    if ( v_check_any(v_ne(idxMin, none)) ) \
     { \
         minVal = v_reduce_min(valMin); \
-        minIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_setall_##suffix((IR)minVal) == valMin), \
+        minIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_eq(v_setall_##suffix((IR)minVal), valMin)), \
                      idxMin, v_setall_##suffix2(maxLimit))) + delta; \
     } \
-    if ( v_check_any(idxMax != none) ) \
+    if ( v_check_any(v_ne(idxMax, none)) ) \
     { \
         maxVal = v_reduce_max(valMax); \
-        maxIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_setall_##suffix((IR)maxVal) == valMax), \
+        maxIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_eq(v_setall_##suffix((IR)maxVal), valMax)), \
                      idxMax, v_setall_##suffix2(maxLimit))) + delta; \
     } \
 }
@@ -208,18 +210,18 @@ static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int*
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
 {
 #if CV_SIMD128
-    if ( len >= v_uint8x16::nlanes )
+    if ( len >= VTraits<v_uint8x16>::vlanes() )
     {
         int j, len0;
         int minVal, maxVal;
         size_t minIdx, maxIdx;
 
         minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
-                        (int)0, (int)UCHAR_MAX, v_uint8x16::nlanes, len, startidx, j, len0 );
+                        (int)0, (int)UCHAR_MAX, VTraits<v_uint8x16>::vlanes(), len, startidx, j, len0 );
 
-        if ( j <= len0 - v_uint8x16::nlanes )
+        if ( j <= len0 - VTraits<v_uint8x16>::vlanes() )
         {
-            v_uint8x16 inc = v_setall_u8(v_uint8x16::nlanes);
+            v_uint8x16 inc = v_setall_u8((uchar)VTraits<v_uint8x16>::vlanes());
             v_uint8x16 none = v_reinterpret_as_u8(v_setall_s8(-1));
             v_uint8x16 idxStart(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 
@@ -233,31 +235,31 @@ static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int*
 
                 if ( !mask )
                 {
-                    for( ; k < std::min(len0, j + 15 * v_uint8x16::nlanes); k += v_uint8x16::nlanes )
+                    for( ; k < std::min(len0, j + 15 * VTraits<v_uint8x16>::vlanes()); k += VTraits<v_uint8x16>::vlanes() )
                     {
                         v_uint8x16 data = v_load(src + k);
-                        v_uint8x16 cmpMin = (data < valMin);
-                        v_uint8x16 cmpMax = (data > valMax);
+                        v_uint8x16 cmpMin = (v_lt(data, valMin));
+                        v_uint8x16 cmpMax = (v_gt(data, valMax));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_min(data, valMin);
                         valMax = v_max(data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
                 else
                 {
-                    for( ; k < std::min(len0, j + 15 * v_uint8x16::nlanes); k += v_uint8x16::nlanes )
+                    for( ; k < std::min(len0, j + 15 * VTraits<v_uint8x16>::vlanes()); k += VTraits<v_uint8x16>::vlanes() )
                     {
                         v_uint8x16 data = v_load(src + k);
-                        v_uint8x16 maskVal = v_load(mask + k) != v_setzero_u8();
-                        v_uint8x16 cmpMin = (data < valMin) & maskVal;
-                        v_uint8x16 cmpMax = (data > valMax) & maskVal;
+                        v_uint8x16 maskVal = v_ne(v_load(mask + k), v_setzero_u8());
+                        v_uint8x16 cmpMin = v_and(v_lt(data, valMin), maskVal);
+                        v_uint8x16 cmpMax = v_and(v_gt(data, valMax), maskVal);
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_select(cmpMin, data, valMin);
                         valMax = v_select(cmpMax, data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
 
@@ -285,18 +287,18 @@ static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int*
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
 {
 #if CV_SIMD128
-    if ( len >= v_int8x16::nlanes )
+    if ( len >= VTraits<v_int8x16>::vlanes() )
     {
         int j, len0;
         int minVal, maxVal;
         size_t minIdx, maxIdx;
 
         minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
-                        (int)SCHAR_MIN, (int)SCHAR_MAX, v_int8x16::nlanes, len, startidx, j, len0 );
+                        (int)SCHAR_MIN, (int)SCHAR_MAX, VTraits<v_int8x16>::vlanes(), len, startidx, j, len0 );
 
-        if ( j <= len0 - v_int8x16::nlanes )
+        if ( j <= len0 - VTraits<v_int8x16>::vlanes() )
         {
-            v_uint8x16 inc = v_setall_u8(v_int8x16::nlanes);
+            v_uint8x16 inc = v_setall_u8((uchar)VTraits<v_int8x16>::vlanes());
             v_uint8x16 none = v_reinterpret_as_u8(v_setall_s8(-1));
             v_uint8x16 idxStart(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 
@@ -310,31 +312,31 @@ static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int*
 
                 if ( !mask )
                 {
-                    for( ; k < std::min(len0, j + 15 * v_int8x16::nlanes); k += v_int8x16::nlanes )
+                    for( ; k < std::min(len0, j + 15 * VTraits<v_int8x16>::vlanes()); k += VTraits<v_int8x16>::vlanes() )
                     {
                         v_int8x16 data = v_load(src + k);
-                        v_uint8x16 cmpMin = v_reinterpret_as_u8(data < valMin);
-                        v_uint8x16 cmpMax = v_reinterpret_as_u8(data > valMax);
+                        v_uint8x16 cmpMin = v_reinterpret_as_u8(v_lt(data, valMin));
+                        v_uint8x16 cmpMax = v_reinterpret_as_u8(v_gt(data, valMax));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_min(data, valMin);
                         valMax = v_max(data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
                 else
                 {
-                    for( ; k < std::min(len0, j + 15 * v_int8x16::nlanes); k += v_int8x16::nlanes )
+                    for( ; k < std::min(len0, j + 15 * VTraits<v_int8x16>::vlanes()); k += VTraits<v_int8x16>::vlanes() )
                     {
                         v_int8x16 data = v_load(src + k);
-                        v_uint8x16 maskVal = v_load(mask + k) != v_setzero_u8();
-                        v_uint8x16 cmpMin = v_reinterpret_as_u8(data < valMin) & maskVal;
-                        v_uint8x16 cmpMax = v_reinterpret_as_u8(data > valMax) & maskVal;
+                        v_uint8x16 maskVal = v_ne(v_load(mask + k), v_setzero_u8());
+                        v_uint8x16 cmpMin = v_and(v_reinterpret_as_u8(v_lt(data, valMin)), maskVal);
+                        v_uint8x16 cmpMax = v_and(v_reinterpret_as_u8(v_gt(data, valMax)), maskVal);
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_select(v_reinterpret_as_s8(cmpMin), data, valMin);
                         valMax = v_select(v_reinterpret_as_s8(cmpMax), data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
 
@@ -362,18 +364,18 @@ static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int
                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
 {
 #if CV_SIMD128
-    if ( len >= v_uint16x8::nlanes )
+    if ( len >= VTraits<v_uint16x8>::vlanes() )
     {
         int j, len0;
         int minVal, maxVal;
         size_t minIdx, maxIdx;
 
         minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
-                        (int)0, (int)USHRT_MAX, v_uint16x8::nlanes, len, startidx, j, len0 );
+                        (int)0, (int)USHRT_MAX, VTraits<v_uint16x8>::vlanes(), len, startidx, j, len0 );
 
-        if ( j <= len0 - v_uint16x8::nlanes )
+        if ( j <= len0 - VTraits<v_uint16x8>::vlanes() )
         {
-            v_uint16x8 inc = v_setall_u16(v_uint16x8::nlanes);
+            v_uint16x8 inc = v_setall_u16((uchar)VTraits<v_uint16x8>::vlanes());
             v_uint16x8 none = v_reinterpret_as_u16(v_setall_s16(-1));
             v_uint16x8 idxStart(0, 1, 2, 3, 4, 5, 6, 7);
 
@@ -387,31 +389,31 @@ static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int
 
                 if ( !mask )
                 {
-                    for( ; k < std::min(len0, j + 8191 * v_uint16x8::nlanes); k += v_uint16x8::nlanes )
+                    for( ; k < std::min(len0, j + 8191 * VTraits<v_uint16x8>::vlanes()); k += VTraits<v_uint16x8>::vlanes() )
                     {
                         v_uint16x8 data = v_load(src + k);
-                        v_uint16x8 cmpMin = (data < valMin);
-                        v_uint16x8 cmpMax = (data > valMax);
+                        v_uint16x8 cmpMin = (v_lt(data, valMin));
+                        v_uint16x8 cmpMax = (v_gt(data, valMax));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_min(data, valMin);
                         valMax = v_max(data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
                 else
                 {
-                    for( ; k < std::min(len0, j + 8191 * v_uint16x8::nlanes); k += v_uint16x8::nlanes )
+                    for( ; k < std::min(len0, j + 8191 * VTraits<v_uint16x8>::vlanes()); k += VTraits<v_uint16x8>::vlanes() )
                     {
                         v_uint16x8 data = v_load(src + k);
-                        v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
-                        v_uint16x8 cmpMin = (data < valMin) & maskVal;
-                        v_uint16x8 cmpMax = (data > valMax) & maskVal;
+                        v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
+                        v_uint16x8 cmpMin = v_and(v_lt(data, valMin), maskVal);
+                        v_uint16x8 cmpMax = v_and(v_gt(data, valMax), maskVal);
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_select(cmpMin, data, valMin);
                         valMax = v_select(cmpMax, data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
 
@@ -439,18 +441,18 @@ static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int*
                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
 {
 #if CV_SIMD128
-    if ( len >= v_int16x8::nlanes )
+    if ( len >= VTraits<v_int16x8>::vlanes() )
     {
         int j, len0;
         int minVal, maxVal;
         size_t minIdx, maxIdx;
 
         minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
-                        (int)SHRT_MIN, (int)SHRT_MAX, v_int16x8::nlanes, len, startidx, j, len0 );
+                        (int)SHRT_MIN, (int)SHRT_MAX, VTraits<v_int16x8>::vlanes(), len, startidx, j, len0 );
 
-        if ( j <= len0 - v_int16x8::nlanes )
+        if ( j <= len0 - VTraits<v_int16x8>::vlanes() )
         {
-            v_uint16x8 inc = v_setall_u16(v_int16x8::nlanes);
+            v_uint16x8 inc = v_setall_u16((uchar)VTraits<v_int16x8>::vlanes());
             v_uint16x8 none = v_reinterpret_as_u16(v_setall_s16(-1));
             v_uint16x8 idxStart(0, 1, 2, 3, 4, 5, 6, 7);
 
@@ -464,31 +466,31 @@ static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int*
 
                 if ( !mask )
                 {
-                    for( ; k < std::min(len0, j + 8191 * v_int16x8::nlanes); k += v_int16x8::nlanes )
+                    for( ; k < std::min(len0, j + 8191 * VTraits<v_int16x8>::vlanes()); k += VTraits<v_int16x8>::vlanes() )
                     {
                         v_int16x8 data = v_load(src + k);
-                        v_uint16x8 cmpMin = v_reinterpret_as_u16(data < valMin);
-                        v_uint16x8 cmpMax = v_reinterpret_as_u16(data > valMax);
+                        v_uint16x8 cmpMin = v_reinterpret_as_u16(v_lt(data, valMin));
+                        v_uint16x8 cmpMax = v_reinterpret_as_u16(v_gt(data, valMax));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_min(data, valMin);
                         valMax = v_max(data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
                 else
                 {
-                    for( ; k < std::min(len0, j + 8191 * v_int16x8::nlanes); k += v_int16x8::nlanes )
+                    for( ; k < std::min(len0, j + 8191 * VTraits<v_int16x8>::vlanes()); k += VTraits<v_int16x8>::vlanes() )
                     {
                         v_int16x8 data = v_load(src + k);
-                        v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
-                        v_uint16x8 cmpMin = v_reinterpret_as_u16(data < valMin) & maskVal;
-                        v_uint16x8 cmpMax = v_reinterpret_as_u16(data > valMax) & maskVal;
+                        v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
+                        v_uint16x8 cmpMin = v_and(v_reinterpret_as_u16(v_lt(data, valMin)), maskVal);
+                        v_uint16x8 cmpMax = v_and(v_reinterpret_as_u16(v_gt(data, valMax)), maskVal);
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_select(v_reinterpret_as_s16(cmpMin), data, valMin);
                         valMax = v_select(v_reinterpret_as_s16(cmpMax), data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
 
@@ -516,14 +518,14 @@ static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* m
                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
 {
 #if CV_SIMD128
-    if ( len >= 2 * v_int32x4::nlanes )
+    if ( len >= 2 * VTraits<v_int32x4>::vlanes() )
     {
-        int j = 0, len0 = len & -(2 * v_int32x4::nlanes);
+        int j = 0, len0 = len & -(2 * VTraits<v_int32x4>::vlanes());
         int minVal = *minval, maxVal = *maxval;
         size_t minIdx = *minidx, maxIdx = *maxidx;
 
         {
-            v_uint32x4 inc = v_setall_u32(v_int32x4::nlanes);
+            v_uint32x4 inc = v_setall_u32(VTraits<v_int32x4>::vlanes());
             v_uint32x4 none = v_reinterpret_as_u32(v_setall_s32(-1));
             v_uint32x4 idxStart(0, 1, 2, 3);
 
@@ -537,49 +539,49 @@ static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* m
 
                 if ( !mask )
                 {
-                    for( ; k < std::min(len0, j + 32766 * 2 * v_int32x4::nlanes); k += 2 * v_int32x4::nlanes )
+                    for( ; k < std::min(len0, j + 32766 * 2 * VTraits<v_int32x4>::vlanes()); k += 2 * VTraits<v_int32x4>::vlanes() )
                     {
                         v_int32x4 data = v_load(src + k);
-                        v_uint32x4 cmpMin = v_reinterpret_as_u32(data < valMin);
-                        v_uint32x4 cmpMax = v_reinterpret_as_u32(data > valMax);
+                        v_uint32x4 cmpMin = v_reinterpret_as_u32(v_lt(data, valMin));
+                        v_uint32x4 cmpMax = v_reinterpret_as_u32(v_gt(data, valMax));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_min(data, valMin);
                         valMax = v_max(data, valMax);
-                        idx += inc;
-                        data = v_load(src + k + v_int32x4::nlanes);
-                        cmpMin = v_reinterpret_as_u32(data < valMin);
-                        cmpMax = v_reinterpret_as_u32(data > valMax);
+                        idx = v_add(idx, inc);
+                        data = v_load(src + k + VTraits<v_int32x4>::vlanes());
+                        cmpMin = v_reinterpret_as_u32(v_lt(data, valMin));
+                        cmpMax = v_reinterpret_as_u32(v_gt(data, valMax));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_min(data, valMin);
                         valMax = v_max(data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
                 else
                 {
-                    for( ; k < std::min(len0, j + 32766 * 2 * v_int32x4::nlanes); k += 2 * v_int32x4::nlanes )
+                    for( ; k < std::min(len0, j + 32766 * 2 * VTraits<v_int32x4>::vlanes()); k += 2 * VTraits<v_int32x4>::vlanes() )
                     {
                         v_int32x4 data = v_load(src + k);
-                        v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
+                        v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
                         v_int32x4 maskVal1, maskVal2;
                         v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2);
-                        v_uint32x4 cmpMin = v_reinterpret_as_u32((data < valMin) & maskVal1);
-                        v_uint32x4 cmpMax = v_reinterpret_as_u32((data > valMax) & maskVal1);
+                        v_uint32x4 cmpMin = v_reinterpret_as_u32(v_and(v_lt(data, valMin), maskVal1));
+                        v_uint32x4 cmpMax = v_reinterpret_as_u32(v_and(v_gt(data, valMax), maskVal1));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_select(v_reinterpret_as_s32(cmpMin), data, valMin);
                         valMax = v_select(v_reinterpret_as_s32(cmpMax), data, valMax);
-                        idx += inc;
-                        data = v_load(src + k + v_int32x4::nlanes);
-                        cmpMin = v_reinterpret_as_u32((data < valMin) & maskVal2);
-                        cmpMax = v_reinterpret_as_u32((data > valMax) & maskVal2);
+                        idx = v_add(idx, inc);
+                        data = v_load(src + k + VTraits<v_int32x4>::vlanes());
+                        cmpMin = v_reinterpret_as_u32(v_and(v_lt(data, valMin), maskVal2));
+                        cmpMax = v_reinterpret_as_u32(v_and(v_gt(data, valMax), maskVal2));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_select(v_reinterpret_as_s32(cmpMin), data, valMin);
                         valMax = v_select(v_reinterpret_as_s32(cmpMax), data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
 
@@ -607,18 +609,18 @@ static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, fl
                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
 {
 #if CV_SIMD128
-    if ( len >= 2 * v_float32x4::nlanes )
+    if ( len >= 2 * VTraits<v_float32x4>::vlanes() )
     {
         int j, len0;
         float minVal, maxVal;
         size_t minIdx, maxIdx;
 
         minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
-                        FLT_MIN, FLT_MAX, 2 * v_float32x4::nlanes, len, startidx, j, len0 );
+                        FLT_MIN, FLT_MAX, 2 * VTraits<v_float32x4>::vlanes(), len, startidx, j, len0 );
 
-        if ( j <= len0 - 2 * v_float32x4::nlanes )
+        if ( j <= len0 - 2 * VTraits<v_float32x4>::vlanes() )
         {
-            v_uint32x4 inc = v_setall_u32(v_float32x4::nlanes);
+            v_uint32x4 inc = v_setall_u32(VTraits<v_float32x4>::vlanes());
             v_uint32x4 none = v_reinterpret_as_u32(v_setall_s32(-1));
             v_uint32x4 idxStart(0, 1, 2, 3);
 
@@ -632,49 +634,49 @@ static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, fl
 
                 if ( !mask )
                 {
-                    for( ; k < std::min(len0, j + 32766 * 2 * v_float32x4::nlanes); k += 2 * v_float32x4::nlanes )
+                    for( ; k < std::min(len0, j + 32766 * 2 * VTraits<v_float32x4>::vlanes()); k += 2 * VTraits<v_float32x4>::vlanes() )
                     {
                         v_float32x4 data = v_load(src + k);
-                        v_uint32x4 cmpMin = v_reinterpret_as_u32(data < valMin);
-                        v_uint32x4 cmpMax = v_reinterpret_as_u32(data > valMax);
+                        v_uint32x4 cmpMin = v_reinterpret_as_u32(v_lt(data, valMin));
+                        v_uint32x4 cmpMax = v_reinterpret_as_u32(v_gt(data, valMax));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_min(data, valMin);
                         valMax = v_max(data, valMax);
-                        idx += inc;
-                        data = v_load(src + k + v_float32x4::nlanes);
-                        cmpMin = v_reinterpret_as_u32(data < valMin);
-                        cmpMax = v_reinterpret_as_u32(data > valMax);
+                        idx = v_add(idx, inc);
+                        data = v_load(src + k + VTraits<v_float32x4>::vlanes());
+                        cmpMin = v_reinterpret_as_u32(v_lt(data, valMin));
+                        cmpMax = v_reinterpret_as_u32(v_gt(data, valMax));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_min(data, valMin);
                         valMax = v_max(data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
                 else
                 {
-                    for( ; k < std::min(len0, j + 32766 * 2 * v_float32x4::nlanes); k += 2 * v_float32x4::nlanes )
+                    for( ; k < std::min(len0, j + 32766 * 2 * VTraits<v_float32x4>::vlanes()); k += 2 * VTraits<v_float32x4>::vlanes() )
                     {
                         v_float32x4 data = v_load(src + k);
-                        v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
+                        v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
                         v_int32x4 maskVal1, maskVal2;
                         v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2);
-                        v_uint32x4 cmpMin = v_reinterpret_as_u32(v_reinterpret_as_s32(data < valMin) & maskVal1);
-                        v_uint32x4 cmpMax = v_reinterpret_as_u32(v_reinterpret_as_s32(data > valMax) & maskVal1);
+                        v_uint32x4 cmpMin = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_lt(data, valMin)), maskVal1));
+                        v_uint32x4 cmpMax = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_gt(data, valMax)), maskVal1));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_select(v_reinterpret_as_f32(cmpMin), data, valMin);
                         valMax = v_select(v_reinterpret_as_f32(cmpMax), data, valMax);
-                        idx += inc;
-                        data = v_load(src + k + v_float32x4::nlanes);
-                        cmpMin = v_reinterpret_as_u32(v_reinterpret_as_s32(data < valMin) & maskVal2);
-                        cmpMax = v_reinterpret_as_u32(v_reinterpret_as_s32(data > valMax) & maskVal2);
+                        idx = v_add(idx, inc);
+                        data = v_load(src + k + VTraits<v_float32x4>::vlanes());
+                        cmpMin = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_lt(data, valMin)), maskVal2));
+                        cmpMax = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_gt(data, valMax)), maskVal2));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_select(v_reinterpret_as_f32(cmpMin), data, valMin);
                         valMax = v_select(v_reinterpret_as_f32(cmpMax), data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
 
@@ -702,18 +704,18 @@ static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval,
                           size_t* minidx, size_t* maxidx, int len, size_t startidx )
 {
 #if CV_SIMD128_64F
-    if ( len >= 4 * v_float64x2::nlanes )
+    if ( len >= 4 * VTraits<v_float64x2>::vlanes() )
     {
         int j, len0;
         double minVal, maxVal;
         size_t minIdx, maxIdx;
 
         minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx,
-                        DBL_MIN, DBL_MAX, 4 * v_float64x2::nlanes, len, startidx, j, len0 );
+                        DBL_MIN, DBL_MAX, 4 * VTraits<v_float64x2>::vlanes(), len, startidx, j, len0 );
 
-        if ( j <= len0 - 4 * v_float64x2::nlanes )
+        if ( j <= len0 - 4 * VTraits<v_float64x2>::vlanes() )
         {
-            v_uint64x2 inc = v_setall_u64(v_float64x2::nlanes);
+            v_uint64x2 inc = v_setall_u64(VTraits<v_float64x2>::vlanes());
             v_uint64x2 none = v_reinterpret_as_u64(v_setall_s64(-1));
             v_uint64x2 idxStart(0, 1);
 
@@ -727,84 +729,84 @@ static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval,
 
                 if ( !mask )
                 {
-                    for( ; k < std::min(len0, j + 32764 * 4 * v_float64x2::nlanes); k += 4 * v_float64x2::nlanes )
+                    for( ; k < std::min(len0, j + 32764 * 4 * VTraits<v_float64x2>::vlanes()); k += 4 * VTraits<v_float64x2>::vlanes() )
                     {
                         v_float64x2 data = v_load(src + k);
-                        v_uint64x2 cmpMin = v_reinterpret_as_u64(data < valMin);
-                        v_uint64x2 cmpMax = v_reinterpret_as_u64(data > valMax);
+                        v_uint64x2 cmpMin = v_reinterpret_as_u64(v_lt(data, valMin));
+                        v_uint64x2 cmpMax = v_reinterpret_as_u64(v_gt(data, valMax));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_min(data, valMin);
                         valMax = v_max(data, valMax);
-                        idx += inc;
-                        data = v_load(src + k + v_float64x2::nlanes);
-                        cmpMin = v_reinterpret_as_u64(data < valMin);
-                        cmpMax = v_reinterpret_as_u64(data > valMax);
+                        idx = v_add(idx, inc);
+                        data = v_load(src + k + VTraits<v_float64x2>::vlanes());
+                        cmpMin = v_reinterpret_as_u64(v_lt(data, valMin));
+                        cmpMax = v_reinterpret_as_u64(v_gt(data, valMax));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_min(data, valMin);
                         valMax = v_max(data, valMax);
-                        idx += inc;
-                        data = v_load(src + k + 2 * v_float64x2::nlanes);
-                        cmpMin = v_reinterpret_as_u64(data < valMin);
-                        cmpMax = v_reinterpret_as_u64(data > valMax);
+                        idx = v_add(idx, inc);
+                        data = v_load(src + k + 2 * VTraits<v_float64x2>::vlanes());
+                        cmpMin = v_reinterpret_as_u64(v_lt(data, valMin));
+                        cmpMax = v_reinterpret_as_u64(v_gt(data, valMax));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_min(data, valMin);
                         valMax = v_max(data, valMax);
-                        idx += inc;
-                        data = v_load(src + k + 3 * v_float64x2::nlanes);
-                        cmpMin = v_reinterpret_as_u64(data < valMin);
-                        cmpMax = v_reinterpret_as_u64(data > valMax);
+                        idx = v_add(idx, inc);
+                        data = v_load(src + k + 3 * VTraits<v_float64x2>::vlanes());
+                        cmpMin = v_reinterpret_as_u64(v_lt(data, valMin));
+                        cmpMax = v_reinterpret_as_u64(v_gt(data, valMax));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_min(data, valMin);
                         valMax = v_max(data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
                 else
                 {
-                    for( ; k < std::min(len0, j + 32764 * 4 * v_float64x2::nlanes); k += 4 * v_float64x2::nlanes )
+                    for( ; k < std::min(len0, j + 32764 * 4 * VTraits<v_float64x2>::vlanes()); k += 4 * VTraits<v_float64x2>::vlanes() )
                     {
                         v_float64x2 data = v_load(src + k);
-                        v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16();
+                        v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16());
                         v_int32x4 maskVal1, maskVal2;
                         v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2);
                         v_int64x2 maskVal3, maskVal4;
                         v_expand(maskVal1, maskVal3, maskVal4);
-                        v_uint64x2 cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal3);
-                        v_uint64x2 cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal3);
+                        v_uint64x2 cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal3));
+                        v_uint64x2 cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal3));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin);
                         valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax);
-                        idx += inc;
-                        data = v_load(src + k + v_float64x2::nlanes);
-                        cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal4);
-                        cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal4);
+                        idx = v_add(idx, inc);
+                        data = v_load(src + k + VTraits<v_float64x2>::vlanes());
+                        cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal4));
+                        cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal4));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin);
                         valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax);
-                        idx += inc;
-                        data = v_load(src + k + 2 * v_float64x2::nlanes);
+                        idx = v_add(idx, inc);
+                        data = v_load(src + k + 2 * VTraits<v_float64x2>::vlanes());
                         v_expand(maskVal2, maskVal3, maskVal4);
-                        cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal3);
-                        cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal3);
+                        cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal3));
+                        cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal3));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin);
                         valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax);
-                        idx += inc;
-                        data = v_load(src + k + 3 * v_float64x2::nlanes);
-                        cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal4);
-                        cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal4);
+                        idx = v_add(idx, inc);
+                        data = v_load(src + k + 3 * VTraits<v_float64x2>::vlanes());
+                        cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal4));
+                        cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal4));
                         idxMin = v_select(cmpMin, idx, idxMin);
                         idxMax = v_select(cmpMax, idx, idxMax);
                         valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin);
                         valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax);
-                        idx += inc;
+                        idx = v_add(idx, inc);
                     }
                 }
 
@@ -832,7 +834,7 @@ typedef void (*MinMaxIdxFunc)(const uchar*, const uchar*, int*, int*, size_t*, s
 
 static MinMaxIdxFunc getMinmaxTab(int depth)
 {
-    static MinMaxIdxFunc minmaxTab[] =
+    static MinMaxIdxFunc minmaxTab[CV_DEPTH_MAX] =
     {
         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8s),
         (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16s),
@@ -1509,8 +1511,30 @@ void cv::minMaxIdx(InputArray _src, double* minVal,
     Mat src = _src.getMat(), mask = _mask.getMat();
 
     if (src.dims <= 2)
-        CALL_HAL(minMaxIdx, cv_hal_minMaxIdx, src.data, src.step, src.cols, src.rows, src.depth(), minVal, maxVal,
-                 minIdx, maxIdx, mask.data);
+    {
+        CALL_HAL(minMaxIdx, cv_hal_minMaxIdx, src.data, src.step, src.cols*cn, src.rows,
+                 src.depth(), minVal, maxVal, minIdx, maxIdx, mask.data);
+    }
+    else if (src.isContinuous())
+    {
+        int res = cv_hal_minMaxIdx(src.data, 0, (int)src.total()*cn, 1, src.depth(),
+                                   minVal, maxVal, minIdx, maxIdx, mask.data);
+
+        if (res == CV_HAL_ERROR_OK)
+        {
+            // minIdx[0] and minIdx[0] are always 0 for "flatten" version
+            if (minIdx)
+                ofs2idx(src, minIdx[1], minIdx);
+            if (maxIdx)
+                ofs2idx(src, maxIdx[1], maxIdx);
+            return;
+        }
+        else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED)
+        {
+            CV_Error_(cv::Error::StsInternal,
+            ("HAL implementation minMaxIdx ==> " CVAUX_STR(cv_hal_minMaxIdx) " returned %d (0x%08x)", res, res));
+        }
+    }
 
     CV_OVX_RUN(!ovx::skipSmallImages<VX_KERNEL_MINMAXLOC>(src.cols, src.rows),
                openvx_minMaxIdx(src, minVal, maxVal, minIdx, maxIdx, mask))
@@ -1543,9 +1567,9 @@ void cv::minMaxIdx(InputArray _src, double* minVal,
     if (!src.empty() && mask.empty())
     {
         if( minidx == 0 )
-             minidx = 1;
-         if( maxidx == 0 )
-             maxidx = 1;
+            minidx = 1;
+        if( maxidx == 0 )
+            maxidx = 1;
     }
 
     if( minidx == 0 )
diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp
index 69da85f2913a..f2f84c35b84e 100644
--- a/modules/core/src/norm.cpp
+++ b/modules/core/src/norm.cpp
@@ -52,6 +52,9 @@ static const uchar popCountTable4[] =
 
 int normHamming(const uchar* a, int n, int cellSize)
 {
+    int output;
+    CALL_HAL_RET(normHamming8u, cv_hal_normHamming8u, output, a, n, cellSize);
+
     if( cellSize == 1 )
         return normHamming(a, n);
     const uchar* tab = 0;
@@ -63,25 +66,25 @@ int normHamming(const uchar* a, int n, int cellSize)
         return -1;
     int i = 0;
     int result = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_uint64 t = vx_setzero_u64();
     if ( cellSize == 2)
     {
         v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+        for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
         {
             v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
-            t += v_popcount(v_reinterpret_as_u64((a0 | (a0 >> 1)) & mask));
+            t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a0, v_shr<1>(a0)), mask))));
         }
     }
     else    // cellSize == 4
     {
         v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+        for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
         {
             v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
-            v_uint16 a1 = a0 | (a0 >> 2);
-            t += v_popcount(v_reinterpret_as_u64((a1 | (a1 >> 1)) & mask));
+            v_uint16 a1 = v_or(a0, v_shr<2>(a0));
+            t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a1, v_shr<1>(a1)), mask))));
 
         }
     }
@@ -98,6 +101,9 @@ int normHamming(const uchar* a, int n, int cellSize)
 
 int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
 {
+    int output;
+    CALL_HAL_RET(normHammingDiff8u, cv_hal_normHammingDiff8u, output, a, b, n, cellSize);
+
     if( cellSize == 1 )
         return normHamming(a, b, n);
     const uchar* tab = 0;
@@ -109,25 +115,25 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
         return -1;
     int i = 0;
     int result = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_uint64 t = vx_setzero_u64();
     if ( cellSize == 2)
     {
         v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+        for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
         {
-            v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
-            t += v_popcount(v_reinterpret_as_u64((ab0 | (ab0 >> 1)) & mask));
+            v_uint16 ab0 = v_reinterpret_as_u16(v_xor(vx_load(a + i), vx_load(b + i)));
+            t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(ab0, v_shr<1>(ab0)), mask))));
         }
     }
     else    // cellSize == 4
     {
         v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+        for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
         {
-            v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
-            v_uint16 ab1 = ab0 | (ab0 >> 2);
-            t += v_popcount(v_reinterpret_as_u64((ab1 | (ab1 >> 1)) & mask));
+            v_uint16 ab0 = v_reinterpret_as_u16(v_xor(vx_load(a + i), vx_load(b + i)));
+            v_uint16 ab1 = v_or(ab0, v_shr<2>(ab0));
+            t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(ab1, v_shr<1>(ab1)), mask))));
         }
     }
     result += (int)v_reduce_sum(t);
@@ -145,21 +151,21 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
 float normL2Sqr_(const float* a, const float* b, int n)
 {
     int j = 0; float d = 0.f;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
     v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
-    for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
+    for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes())
     {
-        v_float32 t0 = vx_load(a + j) - vx_load(b + j);
-        v_float32 t1 = vx_load(a + j + v_float32::nlanes) - vx_load(b + j + v_float32::nlanes);
+        v_float32 t0 = v_sub(vx_load(a + j), vx_load(b + j));
+        v_float32 t1 = v_sub(vx_load(a + j + VTraits<v_float32>::vlanes()), vx_load(b + j + VTraits<v_float32>::vlanes()));
         v_d0 = v_muladd(t0, t0, v_d0);
-        v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes);
+        v_float32 t2 = v_sub(vx_load(a + j + 2 * VTraits<v_float32>::vlanes()), vx_load(b + j + 2 * VTraits<v_float32>::vlanes()));
         v_d1 = v_muladd(t1, t1, v_d1);
-        v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes);
+        v_float32 t3 = v_sub(vx_load(a + j + 3 * VTraits<v_float32>::vlanes()), vx_load(b + j + 3 * VTraits<v_float32>::vlanes()));
         v_d2 = v_muladd(t2, t2, v_d2);
         v_d3 = v_muladd(t3, t3, v_d3);
     }
-    d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
+    d = v_reduce_sum(v_add(v_add(v_add(v_d0, v_d1), v_d2), v_d3));
 #endif
     for( ; j < n; j++ )
     {
@@ -173,17 +179,17 @@ float normL2Sqr_(const float* a, const float* b, int n)
 float normL1_(const float* a, const float* b, int n)
 {
     int j = 0; float d = 0.f;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
     v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
-    for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
+    for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes())
     {
-        v_d0 += v_absdiff(vx_load(a + j), vx_load(b + j));
-        v_d1 += v_absdiff(vx_load(a + j + v_float32::nlanes), vx_load(b + j + v_float32::nlanes));
-        v_d2 += v_absdiff(vx_load(a + j + 2 * v_float32::nlanes), vx_load(b + j + 2 * v_float32::nlanes));
-        v_d3 += v_absdiff(vx_load(a + j + 3 * v_float32::nlanes), vx_load(b + j + 3 * v_float32::nlanes));
+        v_d0 = v_add(v_d0, v_absdiff(vx_load(a + j), vx_load(b + j)));
+        v_d1 = v_add(v_d1, v_absdiff(vx_load(a + j + VTraits<v_float32>::vlanes()), vx_load(b + j + VTraits<v_float32>::vlanes())));
+        v_d2 = v_add(v_d2, v_absdiff(vx_load(a + j + 2 * VTraits<v_float32>::vlanes()), vx_load(b + j + 2 * VTraits<v_float32>::vlanes())));
+        v_d3 = v_add(v_d3, v_absdiff(vx_load(a + j + 3 * VTraits<v_float32>::vlanes()), vx_load(b + j + 3 * VTraits<v_float32>::vlanes())));
     }
-    d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
+    d = v_reduce_sum(v_add(v_add(v_add(v_d0, v_d1), v_d2), v_d3));
 #endif
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
@@ -193,12 +199,12 @@ float normL1_(const float* a, const float* b, int n)
 int normL1_(const uchar* a, const uchar* b, int n)
 {
     int j = 0, d = 0;
-#if CV_SIMD
-    for (; j <= n - 4 * v_uint8::nlanes; j += 4 * v_uint8::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for (; j <= n - 4 * VTraits<v_uint8>::vlanes(); j += 4 * VTraits<v_uint8>::vlanes())
         d += v_reduce_sad(vx_load(a + j), vx_load(b + j)) +
-             v_reduce_sad(vx_load(a + j + v_uint8::nlanes), vx_load(b + j + v_uint8::nlanes)) +
-             v_reduce_sad(vx_load(a + j + 2 * v_uint8::nlanes), vx_load(b + j + 2 * v_uint8::nlanes)) +
-             v_reduce_sad(vx_load(a + j + 3 * v_uint8::nlanes), vx_load(b + j + 3 * v_uint8::nlanes));
+             v_reduce_sad(vx_load(a + j + VTraits<v_uint8>::vlanes()), vx_load(b + j + VTraits<v_uint8>::vlanes())) +
+             v_reduce_sad(vx_load(a + j + 2 * VTraits<v_uint8>::vlanes()), vx_load(b + j + 2 * VTraits<v_uint8>::vlanes())) +
+             v_reduce_sad(vx_load(a + j + 3 * VTraits<v_uint8>::vlanes()), vx_load(b + j + 3 * VTraits<v_uint8>::vlanes()));
 #endif
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
@@ -752,7 +758,7 @@ double norm( InputArray _src, int normType, InputArray _mask )
             for (int j = 0; j < total; j += blockSize)
             {
                 int bsz = std::min(total - j, blockSize);
-                hal::cvt16f32f((const float16_t*)ptrs[0], data0, bsz * cn);
+                hal::cvt16f32f((const hfloat*)ptrs[0], data0, bsz * cn);
                 func((uchar*)data0, ptrs[1], (uchar*)&result.f, bsz, cn);
                 ptrs[0] += bsz*esz;
                 if (ptrs[1])
@@ -1222,8 +1228,8 @@ double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask
             for (int j = 0; j < total; j += blockSize)
             {
                 int bsz = std::min(total - j, blockSize);
-                hal::cvt16f32f((const float16_t*)ptrs[0], data0, bsz * cn);
-                hal::cvt16f32f((const float16_t*)ptrs[1], data1, bsz * cn);
+                hal::cvt16f32f((const hfloat*)ptrs[0], data0, bsz * cn);
+                hal::cvt16f32f((const hfloat*)ptrs[1], data1, bsz * cn);
                 func((uchar*)data0, (uchar*)data1, ptrs[2], (uchar*)&result.f, bsz, cn);
                 ptrs[0] += bsz*esz;
                 ptrs[1] += bsz*esz;
@@ -1392,7 +1398,7 @@ void normalize(InputArray _src, InputOutputArray _dst, double a, double b,
         shift = 0;
     }
     else
-        CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
+        CV_Error( cv::Error::StsBadArg, "Unknown/unsupported norm type" );
 
     CV_OCL_RUN(_dst.isUMat(),
                ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index bf562c309252..8d7d7faf4486 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -51,7 +51,6 @@
 #include <set>
 #include <string>
 #include <sstream>
-#include <iostream> // std::cerr
 #include <fstream>
 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
 #include <inttypes.h>
@@ -1605,6 +1604,9 @@ struct Device::Impl
             pos = pos2 + 1;
         }
 
+        khr_fp64_support_ = isExtensionSupported("cl_khr_fp64");
+        khr_fp16_support_ = isExtensionSupported("cl_khr_fp16");
+
         intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
 
         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
@@ -1693,7 +1695,9 @@ struct Device::Impl
     String version_;
     std::string extensions_;
     int doubleFPConfig_;
+    bool khr_fp64_support_;
     int halfFPConfig_;
+    bool khr_fp16_support_;
     bool hostUnifiedMemory_;
     int maxComputeUnits_;
     size_t maxWorkGroupSize_;
@@ -1845,6 +1849,11 @@ int Device::singleFPConfig() const
 int Device::halfFPConfig() const
 { return p ? p->halfFPConfig_ : 0; }
 
+bool Device::hasFP64() const
+{ return p ? p->khr_fp64_support_ : false; }
+bool Device::hasFP16() const
+{ return p ? p->khr_fp16_support_ : false; }
+
 bool Device::endianLittle() const
 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
 
@@ -7201,7 +7210,7 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name)
 
     typedef std::string (* func_t)(const Mat &);
     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
-                                    kerToStr<int>, kerToStr<float>, kerToStr<double>, kerToStr<float16_t> };
+                                    kerToStr<int>, kerToStr<float>, kerToStr<double>, kerToStr<hfloat> };
     const func_t func = funcs[ddepth];
     CV_Assert(func != 0);
 
diff --git a/modules/core/src/ocl_disabled.impl.hpp b/modules/core/src/ocl_disabled.impl.hpp
index a217979a1e65..fab53510254f 100644
--- a/modules/core/src/ocl_disabled.impl.hpp
+++ b/modules/core/src/ocl_disabled.impl.hpp
@@ -67,6 +67,9 @@ int Device::doubleFPConfig() const { OCL_NOT_AVAILABLE(); }
 int Device::singleFPConfig() const { OCL_NOT_AVAILABLE(); }
 int Device::halfFPConfig() const { OCL_NOT_AVAILABLE(); }
 
+bool Device::hasFP64() const { OCL_NOT_AVAILABLE(); }
+bool Device::hasFP16() const { OCL_NOT_AVAILABLE(); }
+
 bool Device::endianLittle() const { OCL_NOT_AVAILABLE(); }
 bool Device::errorCorrectionSupport() const { OCL_NOT_AVAILABLE(); }
 
diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl
index d4165faae384..301cea9f9875 100644
--- a/modules/core/src/opencl/arithm.cl
+++ b/modules/core/src/opencl/arithm.cl
@@ -381,6 +381,20 @@
 #elif defined OP_CTP_AR
 #define TO_DEGREE
 #endif
+#ifdef SRC1_IS_DST_MAG
+#define ADAPT_SRC1 dstptr = srcptr1;
+#elif SRC1_IS_DST_ANGLE
+#define ADAPT_SRC1 dstptr2 = srcptr1;
+#else
+#define ADAPT_SRC1
+#endif
+#ifdef SRC2_IS_DST_MAG
+#define ADAPT_SRC2 dstptr = srcptr2;
+#elif SRC2_IS_DST_ANGLE
+#define ADAPT_SRC2 dstptr2 = srcptr2;
+#else
+#define ADAPT_SRC2
+#endif
 #define PROCESS_ELEM \
     dstT x = srcelem1, y = srcelem2; \
     dstT x2 = x * x, y2 = y * y; \
@@ -390,6 +404,8 @@
     dstT tmp1 = y >= 0 ? CV_PI * 0.5f : CV_PI * 1.5f; \
     dstT cartToPolar = y2 <= x2 ? x * y / mad((dstT)(0.28f), y2, x2 + CV_EPSILON) + tmp : (tmp1 - x * y / mad((dstT)(0.28f), x2, y2 + CV_EPSILON)); \
     TO_DEGREE \
+    ADAPT_SRC1 \
+    ADAPT_SRC2 \
     storedst(magnitude); \
     storedst2(cartToPolar)
 
@@ -399,9 +415,25 @@
 #else
 #define FROM_DEGREE
 #endif
+#ifdef SRC1_IS_DST_X
+#define ADAPT_SRC1 dstptr = srcptr1;
+#elif SRC1_IS_DST_Y
+#define ADAPT_SRC1 dstptr2 = srcptr1;
+#else
+#define ADAPT_SRC1
+#endif
+#ifdef SRC2_IS_DST_X
+#define ADAPT_SRC2 dstptr = srcptr2;
+#elif SRC2_IS_DST_Y
+#define ADAPT_SRC2 dstptr2 = srcptr2;
+#else
+#define ADAPT_SRC2
+#endif
 #define PROCESS_ELEM \
     dstT x = srcelem1, y = srcelem2, cosval; \
     FROM_DEGREE; \
+    ADAPT_SRC1; \
+    ADAPT_SRC2; \
     storedst2(sincos(y, &cosval) * x); \
     storedst(cosval * x);
 
diff --git a/modules/core/src/opencl/convert.cl b/modules/core/src/opencl/convert.cl
index e869d6d74359..1f58f63f074f 100644
--- a/modules/core/src/opencl/convert.cl
+++ b/modules/core/src/opencl/convert.cl
@@ -49,14 +49,21 @@
 #endif
 #endif
 
+#ifdef HALF_SUPPORT
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16:enable
+#endif
+#endif
+
+
 #define noconvert
 
 __kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset,
-                        __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                        __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
 #ifndef NO_SCALE
-                        WT alpha, WT beta,
+                        , WT alpha, WT beta
 #endif
-                        int rowsPerWI)
+)
 {
     int x = get_global_id(0);
     int y0 = get_global_id(1) * rowsPerWI;
diff --git a/modules/core/src/opencl/inrange.cl b/modules/core/src/opencl/inrange.cl
index 538259539a14..a5efd38bcf52 100644
--- a/modules/core/src/opencl/inrange.cl
+++ b/modules/core/src/opencl/inrange.cl
@@ -52,7 +52,7 @@
 __kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset,
                       __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
 #ifdef HAVE_SCALAR
-                      __global const srcT1 * src2, __global const srcT1 * src3,
+                      __global const SRC_T1 * src2, __global const SRC_T1 * src3,
 #else
                       __global const uchar * src2ptr, int src2_step, int src2_offset,
                       __global const uchar * src3ptr, int src3_step, int src3_offset,
@@ -64,56 +64,56 @@ __kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_of
 
     if (x < dst_cols)
     {
-        int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(srcT1) * kercn, src1_offset));
-        int dst_index = mad24(y0, dst_step, mad24(x, colsPerWI, dst_offset));
+        int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(SRC_T1) * KERCN, src1_offset));
+        int dst_index = mad24(y0, dst_step, mad24(x, COLS_PER_WI, dst_offset));
 #ifndef HAVE_SCALAR
-        int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(srcT1) * kercn, src2_offset));
-        int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(srcT1) * kercn, src3_offset));
+        int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(SRC_T1) * KERCN, src2_offset));
+        int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(SRC_T1) * KERCN, src3_offset));
 #endif
 
         for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src1_index += src1_step, dst_index += dst_step)
         {
-#if kercn >= cn && kercn == 4 && depth <= 4 && !defined HAVE_SCALAR
-            srcT src1 = *(__global const srcT *)(src1ptr + src1_index);
-            srcT src2 = *(__global const srcT *)(src2ptr + src2_index);
-            srcT src3 = *(__global const srcT *)(src3ptr + src3_index);
-            __global dstT * dst = (__global dstT *)(dstptr + dst_index);
-#if cn == 1
-            dst[0] = src2 > src1 || src3 < src1 ? (dstT)(0) : (dstT)(255);
-#elif cn == 2
-            dst[0] = (dstT)(src2.xy > src1.xy || src3.xy < src1.xy ||
-                            src2.zw > src1.zw || src3.zw < src1.zw ? (dstT)(0) : (dstT)(255);
-#elif cn == 4
-            dst[0] = (dstT)(src2.x > src1.x || src3.x < src1.x ||
+#if KERCN >= CN && KERCN == 4 && DEPTH <= 4 && !defined HAVE_SCALAR
+            SRC_T src1 = *(__global const SRC_T *)(src1ptr + src1_index);
+            SRC_T src2 = *(__global const SRC_T *)(src2ptr + src2_index);
+            SRC_T src3 = *(__global const SRC_T *)(src3ptr + src3_index);
+            __global DST_T * dst = (__global DST_T *)(dstptr + dst_index);
+#if CN == 1
+            dst[0] = src2 > src1 || src3 < src1 ? (DST_T)(0) : (DST_T)(255);
+#elif CN == 2
+            dst[0] = (DST_T)(src2.xy > src1.xy || src3.xy < src1.xy ||
+                            src2.zw > src1.zw || src3.zw < src1.zw ? (DST_T)(0) : (DST_T)(255);
+#elif CN == 4
+            dst[0] = (DST_T)(src2.x > src1.x || src3.x < src1.x ||
                 src2.y > src1.y || src3.y < src1.y ||
                 src2.z > src1.z || src3.z < src1.z ||
                 src2.w > src1.w || src3.w < src1.w ? 0 : 255);
 #endif
 #else
-            __global const srcT1 * src1 = (__global const srcT1 *)(src1ptr + src1_index);
+            __global const SRC_T1 * src1 = (__global const SRC_T1 *)(src1ptr + src1_index);
             __global uchar * dst = dstptr + dst_index;
 #ifndef HAVE_SCALAR
-            __global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index);
-            __global const srcT1 * src3 = (__global const srcT1 *)(src3ptr + src3_index);
+            __global const SRC_T1 * src2 = (__global const SRC_T1 *)(src2ptr + src2_index);
+            __global const SRC_T1 * src3 = (__global const SRC_T1 *)(src3ptr + src3_index);
 #endif
 
             #pragma unroll
-            for (int px = 0; px < colsPerWI; ++px, src1 += cn
+            for (int px = 0; px < COLS_PER_WI; ++px, src1 += CN
 #ifndef HAVE_SCALAR
-                , src2 += cn, src3 += cn
+                , src2 += CN, src3 += CN
 #endif
                 )
             {
                 dst[px] = 255;
 
-                for (int c = 0; c < cn; ++c)
+                for (int c = 0; c < CN; ++c)
                     if (src2[c] > src1[c] || src3[c] < src1[c])
                     {
                         dst[px] = 0;
                         break;
                     }
             }
-#endif // kercn >= cn
+#endif // KERCN >= CN
 #ifndef HAVE_SCALAR
             src2_index += src2_step;
             src3_index += src3_step;
diff --git a/modules/core/src/out.cpp b/modules/core/src/out.cpp
index 8a7d7e1636ac..342cc8a2bb6d 100644
--- a/modules/core/src/out.cpp
+++ b/modules/core/src/out.cpp
@@ -77,7 +77,7 @@ namespace cv
         void valueToStr32s() { snprintf(buf, sizeof(buf), "%d", mtx.ptr<int>(row, col)[cn]); }
         void valueToStr32f() { snprintf(buf, sizeof(buf), floatFormat, mtx.ptr<float>(row, col)[cn]); }
         void valueToStr64f() { snprintf(buf, sizeof(buf), floatFormat, mtx.ptr<double>(row, col)[cn]); }
-        void valueToStr16f() { snprintf(buf, sizeof(buf), floatFormat, (float)mtx.ptr<float16_t>(row, col)[cn]); }
+        void valueToStr16f() { snprintf(buf, sizeof(buf), floatFormat, (float)mtx.ptr<hfloat>(row, col)[cn]); }
         void valueToStrOther() { buf[0] = 0; }
 
     public:
diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index 2b7a75cd4faa..d81577cfedd9 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -791,7 +791,7 @@ int getThreadNum()
         return 0;
     #endif
 #elif defined HAVE_HPX
-        return (int)(hpx::get_num_worker_threads());
+    return (int)(hpx::get_num_worker_threads());
 #elif defined HAVE_OPENMP
     return omp_get_thread_num();
 #elif defined HAVE_GCD
@@ -870,7 +870,22 @@ int getNumberOfCPUsImpl(const char *filename)
 
 #if defined CV_HAVE_CGROUPS
 static inline
-unsigned getNumberOfCPUsCFS()
+unsigned getNumberOfCPUsCFSv2()
+{
+    int cfs_quota = 0;
+    int cfs_period = 0;
+
+    std::ifstream ss_cpu_max("/sys/fs/cgroup/cpu.max", std::ios::in | std::ios::binary);
+    ss_cpu_max >> cfs_quota >> cfs_period;
+
+    if (ss_cpu_max.fail() || cfs_quota < 1 || cfs_period < 1) /* values must not be 0 or negative */
+        return 0;
+
+    return (unsigned)max(1, cfs_quota/cfs_period);
+}
+
+static inline
+unsigned getNumberOfCPUsCFSv1()
 {
     int cfs_quota = 0;
     {
@@ -912,8 +927,7 @@ int getNumberOfCPUs_()
      * the minimum most value as it has high probablity of being right and safe.
      * Return 1 if we get 0 or not found on all methods.
     */
-#if defined CV_CXX11 \
-    && !defined(__MINGW32__) /* not implemented (2020-03) */ \
+#if !defined(__MINGW32__) /* not implemented (2020-03) */
 
     /*
      * Check for this standard C++11 way, we do not return directly because
@@ -967,8 +981,11 @@ int getNumberOfCPUs_()
     static unsigned ncpus_impl_cpuset = (unsigned)getNumberOfCPUsImpl("/sys/fs/cgroup/cpuset/cpuset.cpus");
     ncpus = minNonZero(ncpus, ncpus_impl_cpuset);
 
-    static unsigned ncpus_impl_cfs = getNumberOfCPUsCFS();
-    ncpus = minNonZero(ncpus, ncpus_impl_cfs);
+    static unsigned ncpus_impl_cfs_v1 = getNumberOfCPUsCFSv1();
+    ncpus = minNonZero(ncpus, ncpus_impl_cfs_v1);
+
+    static unsigned ncpus_impl_cfs_v2 = getNumberOfCPUsCFSv2();
+    ncpus = minNonZero(ncpus, ncpus_impl_cfs_v2);
 #endif
 
     static unsigned ncpus_impl_devices = (unsigned)getNumberOfCPUsImpl("/sys/devices/system/cpu/online");
diff --git a/modules/core/src/parallel_impl.cpp b/modules/core/src/parallel_impl.cpp
index b18204ce8425..fc7c4c2b6ec7 100644
--- a/modules/core/src/parallel_impl.cpp
+++ b/modules/core/src/parallel_impl.cpp
@@ -580,8 +580,11 @@ void ThreadPool::run(const Range& range, const ParallelLoopBody& body, double ns
             pthread_mutex_unlock(&mutex);
 
             CV_LOG_VERBOSE(NULL, 5, "MainThread: wake worker threads...");
-            for (size_t i = 0; i < threads.size(); ++i)
+            size_t num_threads_to_wake = std::min(static_cast<size_t>(range.size()), threads.size());
+            for (size_t i = 0; i < num_threads_to_wake; ++i)
             {
+                if (job->current_task >= job->range.size())
+                    break;
                 WorkerThread& thread = *(threads[i].get());
                 if (
 #if defined(__clang__) && defined(__has_feature)
diff --git a/modules/core/src/persistence.cpp b/modules/core/src/persistence.cpp
index 0d64bab094b1..a9a2eb5db028 100644
--- a/modules/core/src/persistence.cpp
+++ b/modules/core/src/persistence.cpp
@@ -76,9 +76,11 @@ char* doubleToString( char* buf, size_t bufSize, double value, bool explicitZero
         }
         else
         {
-            static const char* fmt = "%.16e";
+            // binary64 has 52 bit fraction with hidden bit.
+            // 53 * log_10(2) is 15.955. So "%.16f" should be fine, but its test fails.
+            snprintf( buf, bufSize, "%.17g", value );
+
             char* ptr = buf;
-            snprintf( buf, bufSize, fmt, value );
             if( *ptr == '+' || *ptr == '-' )
                 ptr++;
             for( ; cv_isdigit(*ptr); ptr++ )
@@ -118,11 +120,21 @@ char* floatToString( char* buf, size_t bufSize, float value, bool halfprecision,
         }
         else
         {
-            char* ptr = buf;
             if (halfprecision)
-                snprintf(buf, bufSize, "%.4e", value);
+            {
+                // bfloat16 has 7 bit fraction with hidden bit.
+                // binary16 has 10 bit fraction with hidden bit.
+                // 11 * log_10(2) is 3.311. So "%.4f" should be fine, but its test fails.
+                snprintf(buf, bufSize, "%.5g", value);
+            }
             else
-                snprintf(buf, bufSize, "%.8e", value);
+            {
+                // binray32 has 23 bit fraction with hidden bit.
+                // 24 * log_10(2) is 7.225. So "%.8f" should be fine, but its test fails.
+                snprintf(buf, bufSize, "%.9g", value);
+            }
+
+            char* ptr = buf;
             if( *ptr == '+' || *ptr == '-' )
                 ptr++;
             for( ; cv_isdigit(*ptr); ptr++ )
@@ -270,7 +282,7 @@ int calcStructSize( const char* dt, int initial_size )
         case 'i': { elem_max_size = std::max( elem_max_size, sizeof(int   ) ); break; }
         case 'f': { elem_max_size = std::max( elem_max_size, sizeof(float ) ); break; }
         case 'd': { elem_max_size = std::max( elem_max_size, sizeof(double) ); break; }
-        case 'h': { elem_max_size = std::max(elem_max_size, sizeof(float16_t)); break; }
+        case 'h': { elem_max_size = std::max(elem_max_size, sizeof(hfloat)); break; }
         default:
             CV_Error_(Error::StsNotImplemented, ("Unknown type identifier: '%c' in '%s'", (char)(*type), dt));
         }
@@ -295,16 +307,20 @@ int decodeSimpleFormat( const char* dt )
 
 }
 
-#if defined __i386__ || defined(_M_IX86) || defined __x86_64__ || defined(_M_X64)
-#define CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS 1
+#if defined __i386__ || defined(_M_IX86) || defined __x86_64__ || defined(_M_X64) || \
+    (defined (__LITTLE_ENDIAN__) && __LITTLE_ENDIAN__)
+#define CV_LITTLE_ENDIAN_MEM_ACCESS 1
 #else
-#define CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS 0
+#define CV_LITTLE_ENDIAN_MEM_ACCESS 0
 #endif
 
 static inline int readInt(const uchar* p)
 {
-#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
-    return *(const int*)p;
+    // On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
+#if CV_LITTLE_ENDIAN_MEM_ACCESS
+    int val;
+    memcpy(&val, p, sizeof(val));
+    return val;
 #else
     int val = (int)(p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24));
     return val;
@@ -313,8 +329,11 @@ static inline int readInt(const uchar* p)
 
 static inline double readReal(const uchar* p)
 {
-#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
-    return *(const double*)p;
+    // On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
+#if CV_LITTLE_ENDIAN_MEM_ACCESS
+    double val;
+    memcpy(&val, p, sizeof(val));
+    return val;
 #else
     unsigned val0 = (unsigned)(p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24));
     unsigned val1 = (unsigned)(p[4] | (p[5] << 8) | (p[6] << 16) | (p[7] << 24));
@@ -326,9 +345,9 @@ static inline double readReal(const uchar* p)
 
 static inline void writeInt(uchar* p, int ival)
 {
-#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
-    int* ip = (int*)p;
-    *ip = ival;
+    // On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
+#if CV_LITTLE_ENDIAN_MEM_ACCESS
+    memcpy(p, &ival, sizeof(ival));
 #else
     p[0] = (uchar)ival;
     p[1] = (uchar)(ival >> 8);
@@ -339,9 +358,9 @@ static inline void writeInt(uchar* p, int ival)
 
 static inline void writeReal(uchar* p, double fval)
 {
-#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
-    double* fp = (double*)p;
-    *fp = fval;
+    // On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
+#if CV_LITTLE_ENDIAN_MEM_ACCESS
+    memcpy(p, &fval, sizeof(fval));
 #else
     Cv64suf v;
     v.f = fval;
@@ -1122,8 +1141,8 @@ void FileStorage::Impl::writeRawData(const std::string &dt, const void *_data, s
                         data += sizeof(double);
                         break;
                     case CV_16F: /* reference */
-                        ptr = fs::floatToString(buf, sizeof(buf), (float) *(float16_t *) data, true, explicitZero);
-                        data += sizeof(float16_t);
+                        ptr = fs::floatToString(buf, sizeof(buf), (float) *(hfloat *) data, true, explicitZero);
+                        data += sizeof(hfloat);
                         break;
                     default:
                         CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported type");
@@ -1802,7 +1821,7 @@ char *FileStorage::Impl::parseBase64(char *ptr, int indent, FileNode &collection
                         node_type = FileNode::REAL;
                         break;
                     case CV_16F:
-                        fval = (float) float16_t::fromBits(base64decoder.getUInt16());
+                        fval = float(hfloatFromBits(base64decoder.getUInt16()));
                         node_type = FileNode::REAL;
                         break;
                     default:
@@ -2593,8 +2612,8 @@ FileNodeIterator& FileNodeIterator::readRaw( const String& fmt, void* _data0, si
                             data += sizeof(double);
                             break;
                         case CV_16F:
-                            *(float16_t*)data = float16_t((float)ival);
-                            data += sizeof(float16_t);
+                            *(hfloat*)data = hfloat((float)ival);
+                            data += sizeof(hfloat);
                             break;
                         default:
                             CV_Error( Error::StsUnsupportedFormat, "Unsupported type" );
@@ -2635,8 +2654,8 @@ FileNodeIterator& FileNodeIterator::readRaw( const String& fmt, void* _data0, si
                             data += sizeof(double);
                             break;
                         case CV_16F:
-                            *(float16_t*)data = float16_t((float)fval);
-                            data += sizeof(float16_t);
+                            *(hfloat*)data = hfloat((float)fval);
+                            data += sizeof(hfloat);
                             break;
                         default:
                             CV_Error( Error::StsUnsupportedFormat, "Unsupported type" );
diff --git a/modules/core/src/persistence_base64_encoding.cpp b/modules/core/src/persistence_base64_encoding.cpp
index 7d90fd422b2d..3fce79c08048 100644
--- a/modules/core/src/persistence_base64_encoding.cpp
+++ b/modules/core/src/persistence_base64_encoding.cpp
@@ -367,4 +367,4 @@ size_t base64::RawDataToBinaryConvertor::make_to_binary_funcs(const std::string
     return offset_packed;
 }
 
-}
\ No newline at end of file
+}
diff --git a/modules/core/src/persistence_base64_encoding.hpp b/modules/core/src/persistence_base64_encoding.hpp
index 1ee5201e141f..8b66e94095c6 100644
--- a/modules/core/src/persistence_base64_encoding.hpp
+++ b/modules/core/src/persistence_base64_encoding.hpp
@@ -124,4 +124,4 @@ class RawDataToBinaryConvertor
 }
 
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/modules/core/src/persistence_xml.cpp b/modules/core/src/persistence_xml.cpp
index caba4f5bf001..6141fade2dec 100644
--- a/modules/core/src/persistence_xml.cpp
+++ b/modules/core/src/persistence_xml.cpp
@@ -308,8 +308,8 @@ class XMLEmitter : public FileStorageEmitter
 
         if( !multiline )
         {
-            ptr = fs->resizeWriteBuffer( ptr, len + 9 );
-            sprintf( ptr, "<!-- %s -->", comment );
+            ptr = fs->resizeWriteBuffer( ptr, len + 5+4+1 );
+            snprintf( ptr, len + 5+4+1, "<!-- %s -->", comment );
             len = (int)strlen(ptr);
         }
         else
@@ -344,7 +344,7 @@ class XMLEmitter : public FileStorageEmitter
                 fs->setBufferPtr(ptr);
                 ptr = fs->flush();
             }
-            sprintf( ptr, "-->" );
+            strcpy( ptr, "-->" );
             fs->setBufferPtr(ptr + 3);
             fs->flush();
         }
diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp
index 0647c954862d..ee4afd7ef5e1 100644
--- a/modules/core/src/rand.cpp
+++ b/modules/core/src/rand.cpp
@@ -195,7 +195,7 @@ randf_64f( double* arr, int len, uint64* state, const Vec2d* p, void*, bool )
     hal::addRNGBias64f(arr, &p[0][0], len);
 }
 
-static void randf_16f( float16_t* arr, int len, uint64* state, const Vec2f* p, float* fbuf, bool )
+static void randf_16f( hfloat* arr, int len, uint64* state, const Vec2f* p, float* fbuf, bool )
 {
     uint64 temp = *state;
     for( int i = 0; i < len; i++ )
@@ -215,7 +215,7 @@ static void randf_16f( float16_t* arr, int len, uint64* state, const Vec2f* p, f
 typedef void (*RandFunc)(uchar* arr, int len, uint64* state, const void* p, void* tempbuf, bool small_flag);
 
 
-static RandFunc randTab[][8] =
+static RandFunc randTab[CV_DEPTH_MAX][CV_DEPTH_MAX] =
 {
     {
         (RandFunc)randi_8u, (RandFunc)randi_8s, (RandFunc)randi_16u, (RandFunc)randi_16s,
@@ -574,7 +574,7 @@ void RNG::fill( InputOutputArray _mat, int disttype,
         CV_Assert( scaleFunc != 0 );
     }
     else
-        CV_Error( CV_StsBadArg, "Unknown distribution type" );
+        CV_Error( cv::Error::StsBadArg, "Unknown distribution type" );
 
     const Mat* arrays[] = {&mat, 0};
     uchar* ptr;
diff --git a/modules/core/src/softfloat.cpp b/modules/core/src/softfloat.cpp
index a876ee14e2b7..b5ac5d7dc5e3 100644
--- a/modules/core/src/softfloat.cpp
+++ b/modules/core/src/softfloat.cpp
@@ -306,9 +306,6 @@ softdouble cos(const softdouble& a) { return f64_cos(a); }
 | The values to return on conversions to 32-bit integer formats that raise an
 | invalid exception.
 *----------------------------------------------------------------------------*/
-#define ui32_fromPosOverflow 0xFFFFFFFF
-#define ui32_fromNegOverflow 0
-#define ui32_fromNaN         0xFFFFFFFF
 #define i32_fromPosOverflow  0x7FFFFFFF
 #define i32_fromNegOverflow  (-0x7FFFFFFF - 1)
 #define i32_fromNaN          0x7FFFFFFF
@@ -317,9 +314,6 @@ softdouble cos(const softdouble& a) { return f64_cos(a); }
 | The values to return on conversions to 64-bit integer formats that raise an
 | invalid exception.
 *----------------------------------------------------------------------------*/
-#define ui64_fromPosOverflow UINT64_C( 0xFFFFFFFFFFFFFFFF )
-#define ui64_fromNegOverflow 0
-#define ui64_fromNaN         UINT64_C( 0xFFFFFFFFFFFFFFFF )
 #define i64_fromPosOverflow  UINT64_C( 0x7FFFFFFFFFFFFFFF )
 //fixed unsigned unary minus: -x == ~x + 1
 //#define i64_fromNegOverflow (-UINT64_C( 0x7FFFFFFFFFFFFFFF ) - 1)
@@ -422,34 +416,6 @@ struct uint64_extra { uint64_t v, extra; };
 struct uint128_extra { struct uint128 v; uint64_t extra; };
 #endif
 
-/*----------------------------------------------------------------------------
-| These macros are used to isolate the differences in word order between big-
-| endian and little-endian platforms.
-*----------------------------------------------------------------------------*/
-#ifndef WORDS_BIGENDIAN
-#define wordIncr 1
-#define indexWord( total, n ) (n)
-#define indexWordHi( total ) ((total) - 1)
-#define indexWordLo( total ) 0
-#define indexMultiword( total, m, n ) (n)
-#define indexMultiwordHi( total, n ) ((total) - (n))
-#define indexMultiwordLo( total, n ) 0
-#define indexMultiwordHiBut( total, n ) (n)
-#define indexMultiwordLoBut( total, n ) 0
-#define INIT_UINTM4( v3, v2, v1, v0 ) { v0, v1, v2, v3 }
-#else
-#define wordIncr -1
-#define indexWord( total, n ) ((total) - 1 - (n))
-#define indexWordHi( total ) 0
-#define indexWordLo( total ) ((total) - 1)
-#define indexMultiword( total, m, n ) ((total) - 1 - (m))
-#define indexMultiwordHi( total, n ) 0
-#define indexMultiwordLo( total, n ) ((total) - (n))
-#define indexMultiwordHiBut( total, n ) 0
-#define indexMultiwordLoBut( total, n ) (n)
-#define INIT_UINTM4( v3, v2, v1, v0 ) { v3, v2, v1, v0 }
-#endif
-
 enum {
     softfloat_mulAdd_subC    = 1,
     softfloat_mulAdd_subProd = 2
diff --git a/modules/core/src/split.dispatch.cpp b/modules/core/src/split.dispatch.cpp
index fc5e073497af..e2a47b704dc5 100644
--- a/modules/core/src/split.dispatch.cpp
+++ b/modules/core/src/split.dispatch.cpp
@@ -53,7 +53,7 @@ typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn);
 
 static SplitFunc getSplitFunc(int depth)
 {
-    static SplitFunc splitTab[] =
+    static SplitFunc splitTab[CV_DEPTH_MAX] =
     {
         (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u),
         (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
diff --git a/modules/core/src/split.simd.hpp b/modules/core/src/split.simd.hpp
index 25e90c006300..109d759f2458 100644
--- a/modules/core/src/split.simd.hpp
+++ b/modules/core/src/split.simd.hpp
@@ -15,12 +15,12 @@ void split64s(const int64* src, int64** dst, int len, int cn);
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 // see the comments for vecmerge_ in merge.cpp
 template<typename T, typename VecT> static void
 vecsplit_( const T* src, T** dst, int len, int cn )
 {
-    const int VECSZ = VecT::nlanes;
+    const int VECSZ = VTraits<VecT>::vlanes();
     int i, i0 = 0;
     T* dst0 = dst[0];
     T* dst1 = dst[1];
@@ -177,8 +177,8 @@ split_( const T* src, T** dst, int len, int cn )
 void split8u(const uchar* src, uchar** dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_uint8>::vlanes() && 2 <= cn && cn <= 4 )
         vecsplit_<uchar, v_uint8>(src, dst, len, cn);
     else
 #endif
@@ -188,8 +188,8 @@ void split8u(const uchar* src, uchar** dst, int len, int cn )
 void split16u(const ushort* src, ushort** dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_uint16>::vlanes() && 2 <= cn && cn <= 4 )
         vecsplit_<ushort, v_uint16>(src, dst, len, cn);
     else
 #endif
@@ -199,8 +199,8 @@ void split16u(const ushort* src, ushort** dst, int len, int cn )
 void split32s(const int* src, int** dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_uint32>::vlanes() && 2 <= cn && cn <= 4 )
         vecsplit_<int, v_int32>(src, dst, len, cn);
     else
 #endif
@@ -210,8 +210,8 @@ void split32s(const int* src, int** dst, int len, int cn )
 void split64s(const int64* src, int64** dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_int64>::vlanes() && 2 <= cn && cn <= 4 )
         vecsplit_<int64, v_int64>(src, dst, len, cn);
     else
 #endif
@@ -220,4 +220,4 @@ void split64s(const int64* src, int64** dst, int len, int cn )
 
 #endif
 CV_CPU_OPTIMIZATION_NAMESPACE_END
-}} // namespace
\ No newline at end of file
+}} // namespace
diff --git a/modules/core/src/stat.simd.hpp b/modules/core/src/stat.simd.hpp
index 0592f84794f0..a5fb05476d07 100644
--- a/modules/core/src/stat.simd.hpp
+++ b/modules/core/src/stat.simd.hpp
@@ -33,11 +33,11 @@ int normHamming(const uchar* a, int n)
     int i = 0;
     int result = 0;
 
-#if CV_SIMD && CV_SIMD_WIDTH > 16
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     {
         v_uint64 t = vx_setzero_u64();
-        for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
+        for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
+            t = v_add(t, v_popcount(v_reinterpret_as_u64(vx_load(a + i))));
         result = (int)v_reduce_sum(t);
         vx_cleanup();
     }
@@ -56,13 +56,6 @@ int normHamming(const uchar* a, int n)
             result += CV_POPCNT_U32(*(uint*)(a + i));
         }
     }
-#elif CV_SIMD
-    {
-        v_uint64x2 t = v_setzero_u64();
-        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(v_load(a + i)));
-        result += (int)v_reduce_sum(t);
-    }
 #endif
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4)
@@ -85,11 +78,11 @@ int normHamming(const uchar* a, const uchar* b, int n)
     int i = 0;
     int result = 0;
 
-#if CV_SIMD && CV_SIMD_WIDTH > 16
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     {
         v_uint64 t = vx_setzero_u64();
-        for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
+        for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
+            t = v_add(t, v_popcount(v_reinterpret_as_u64(v_xor(vx_load(a + i), vx_load(b + i)))));
         result += (int)v_reduce_sum(t);
     }
 #endif
@@ -107,13 +100,6 @@ int normHamming(const uchar* a, const uchar* b, int n)
             result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
         }
     }
-#elif CV_SIMD
-    {
-        v_uint64x2 t = v_setzero_u64();
-        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(v_load(a + i) ^ v_load(b + i)));
-        result += (int)v_reduce_sum(t);
-    }
 #endif
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4)
diff --git a/modules/core/src/sum.dispatch.cpp b/modules/core/src/sum.dispatch.cpp
index a1f7d73868c3..fade94833619 100644
--- a/modules/core/src/sum.dispatch.cpp
+++ b/modules/core/src/sum.dispatch.cpp
@@ -10,11 +10,13 @@
 #include "sum.simd.hpp"
 #include "sum.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
 
+#ifndef OPENCV_IPP_SUM
 #undef HAVE_IPP
 #undef CV_IPP_RUN_FAST
 #define CV_IPP_RUN_FAST(f, ...)
 #undef CV_IPP_RUN
 #define CV_IPP_RUN(c, f, ...)
+#endif // OPENCV_IPP_SUM
 
 namespace cv
 {
diff --git a/modules/core/src/sum.simd.hpp b/modules/core/src/sum.simd.hpp
index 2232013b24e9..f790fc733a7a 100644
--- a/modules/core/src/sum.simd.hpp
+++ b/modules/core/src/sum.simd.hpp
@@ -22,7 +22,7 @@ struct Sum_SIMD
     }
 };
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 template <>
 struct Sum_SIMD<uchar, int>
@@ -36,41 +36,41 @@ struct Sum_SIMD<uchar, int>
         int x = 0;
         v_uint32 v_sum = vx_setzero_u32();
 
-        int len0 = len & -v_uint8::nlanes;
+        int len0 = len & -VTraits<v_uint8>::vlanes();
         while (x < len0)
         {
-            const int len_tmp = min(x + 256*v_uint16::nlanes, len0);
+            const int len_tmp = min(x + 256*VTraits<v_uint16>::vlanes(), len0);
             v_uint16 v_sum16 = vx_setzero_u16();
-            for (; x < len_tmp; x += v_uint8::nlanes)
+            for (; x < len_tmp; x += VTraits<v_uint8>::vlanes())
             {
                 v_uint16 v_src0, v_src1;
                 v_expand(vx_load(src0 + x), v_src0, v_src1);
-                v_sum16 += v_src0 + v_src1;
+                v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
             }
             v_uint32 v_half0, v_half1;
             v_expand(v_sum16, v_half0, v_half1);
-            v_sum += v_half0 + v_half1;
+            v_sum = v_add(v_sum, v_add(v_half0, v_half1));
         }
-        if (x <= len - v_uint16::nlanes)
+        if (x <= len - VTraits<v_uint16>::vlanes())
         {
             v_uint32 v_half0, v_half1;
             v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
-            v_sum += v_half0 + v_half1;
-            x += v_uint16::nlanes;
+            v_sum = v_add(v_sum, v_add(v_half0, v_half1));
+            x += VTraits<v_uint16>::vlanes();
         }
-        if (x <= len - v_uint32::nlanes)
+        if (x <= len - VTraits<v_uint32>::vlanes())
         {
-            v_sum += vx_load_expand_q(src0 + x);
-            x += v_uint32::nlanes;
+            v_sum = v_add(v_sum, vx_load_expand_q(src0 + x));
+            x += VTraits<v_uint32>::vlanes();
         }
 
         if (cn == 1)
             *dst += v_reduce_sum(v_sum);
         else
         {
-            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
+            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_uint32>::max_nlanes];
             v_store_aligned(ar, v_sum);
-            for (int i = 0; i < v_uint32::nlanes; ++i)
+            for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
                 dst[i % cn] += ar[i];
         }
         v_cleanup();
@@ -91,41 +91,41 @@ struct Sum_SIMD<schar, int>
         int x = 0;
         v_int32 v_sum = vx_setzero_s32();
 
-        int len0 = len & -v_int8::nlanes;
+        int len0 = len & -VTraits<v_int8>::vlanes();
         while (x < len0)
         {
-            const int len_tmp = min(x + 256*v_int16::nlanes, len0);
+            const int len_tmp = min(x + 256*VTraits<v_int16>::vlanes(), len0);
             v_int16 v_sum16 = vx_setzero_s16();
-            for (; x < len_tmp; x += v_int8::nlanes)
+            for (; x < len_tmp; x += VTraits<v_int8>::vlanes())
             {
                 v_int16 v_src0, v_src1;
                 v_expand(vx_load(src0 + x), v_src0, v_src1);
-                v_sum16 += v_src0 + v_src1;
+                v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
             }
             v_int32 v_half0, v_half1;
             v_expand(v_sum16, v_half0, v_half1);
-            v_sum += v_half0 + v_half1;
+            v_sum = v_add(v_sum, v_add(v_half0, v_half1));
         }
-        if (x <= len - v_int16::nlanes)
+        if (x <= len - VTraits<v_int16>::vlanes())
         {
             v_int32 v_half0, v_half1;
             v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
-            v_sum += v_half0 + v_half1;
-            x += v_int16::nlanes;
+            v_sum = v_add(v_sum, v_add(v_half0, v_half1));
+            x += VTraits<v_int16>::vlanes();
         }
-        if (x <= len - v_int32::nlanes)
+        if (x <= len - VTraits<v_int32>::vlanes())
         {
-            v_sum += vx_load_expand_q(src0 + x);
-            x += v_int32::nlanes;
+            v_sum = v_add(v_sum, vx_load_expand_q(src0 + x));
+            x += VTraits<v_int32>::vlanes();
         }
 
         if (cn == 1)
             *dst += v_reduce_sum(v_sum);
         else
         {
-            int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
+            int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_int32>::max_nlanes];
             v_store_aligned(ar, v_sum);
-            for (int i = 0; i < v_int32::nlanes; ++i)
+            for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
                 dst[i % cn] += ar[i];
         }
         v_cleanup();
@@ -146,25 +146,25 @@ struct Sum_SIMD<ushort, int>
         int x = 0;
         v_uint32 v_sum = vx_setzero_u32();
 
-        for (; x <= len - v_uint16::nlanes; x += v_uint16::nlanes)
+        for (; x <= len - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
         {
             v_uint32 v_src0, v_src1;
             v_expand(vx_load(src0 + x), v_src0, v_src1);
-            v_sum += v_src0 + v_src1;
+            v_sum = v_add(v_sum, v_add(v_src0, v_src1));
         }
-        if (x <= len - v_uint32::nlanes)
+        if (x <= len - VTraits<v_uint32>::vlanes())
         {
-            v_sum += vx_load_expand(src0 + x);
-            x += v_uint32::nlanes;
+            v_sum = v_add(v_sum, vx_load_expand(src0 + x));
+            x += VTraits<v_uint32>::vlanes();
         }
 
         if (cn == 1)
             *dst += v_reduce_sum(v_sum);
         else
         {
-            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
+            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_uint32>::max_nlanes];
             v_store_aligned(ar, v_sum);
-            for (int i = 0; i < v_uint32::nlanes; ++i)
+            for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
                 dst[i % cn] += ar[i];
         }
         v_cleanup();
@@ -185,25 +185,25 @@ struct Sum_SIMD<short, int>
         int x = 0;
         v_int32 v_sum = vx_setzero_s32();
 
-        for (; x <= len - v_int16::nlanes; x += v_int16::nlanes)
+        for (; x <= len - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
         {
             v_int32 v_src0, v_src1;
             v_expand(vx_load(src0 + x), v_src0, v_src1);
-            v_sum += v_src0 + v_src1;
+            v_sum = v_add(v_sum, v_add(v_src0, v_src1));
         }
-        if (x <= len - v_int32::nlanes)
+        if (x <= len - VTraits<v_int32>::vlanes())
         {
-            v_sum += vx_load_expand(src0 + x);
-            x += v_int32::nlanes;
+            v_sum = v_add(v_sum, vx_load_expand(src0 + x));
+            x += VTraits<v_int32>::vlanes();
         }
 
         if (cn == 1)
             *dst += v_reduce_sum(v_sum);
         else
         {
-            int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
+            int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_int32>::max_nlanes];
             v_store_aligned(ar, v_sum);
-            for (int i = 0; i < v_int32::nlanes; ++i)
+            for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
                 dst[i % cn] += ar[i];
         }
         v_cleanup();
@@ -212,7 +212,7 @@ struct Sum_SIMD<short, int>
     }
 };
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template <>
 struct Sum_SIMD<int, double>
 {
@@ -226,24 +226,24 @@ struct Sum_SIMD<int, double>
         v_float64 v_sum0 = vx_setzero_f64();
         v_float64 v_sum1 = vx_setzero_f64();
 
-        for (; x <= len - 2 * v_int32::nlanes; x += 2 * v_int32::nlanes)
+        for (; x <= len - 2 * VTraits<v_int32>::vlanes(); x += 2 * VTraits<v_int32>::vlanes())
         {
             v_int32 v_src0 = vx_load(src0 + x);
-            v_int32 v_src1 = vx_load(src0 + x + v_int32::nlanes);
-            v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
-            v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
+            v_int32 v_src1 = vx_load(src0 + x + VTraits<v_int32>::vlanes());
+            v_sum0 = v_add(v_sum0, v_add(v_cvt_f64(v_src0), v_cvt_f64(v_src1)));
+            v_sum1 = v_add(v_sum1, v_add(v_cvt_f64_high(v_src0), v_cvt_f64_high(v_src1)));
         }
 
 #if CV_SIMD256 || CV_SIMD512
-        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
-        v_store_aligned(ar, v_sum0 + v_sum1);
-        for (int i = 0; i < v_float64::nlanes; ++i)
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_float64>::max_nlanes];
+        v_store_aligned(ar, v_add(v_sum0, v_sum1));
+        for (int i = 0; i < VTraits<v_float64>::vlanes(); ++i)
             dst[i % cn] += ar[i];
 #else
-        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_float64>::max_nlanes];
         v_store_aligned(ar, v_sum0);
-        v_store_aligned(ar + v_float64::nlanes, v_sum1);
-        for (int i = 0; i < 2 * v_float64::nlanes; ++i)
+        v_store_aligned(ar + VTraits<v_float64>::vlanes(), v_sum1);
+        for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
             dst[i % cn] += ar[i];
 #endif
         v_cleanup();
@@ -265,24 +265,24 @@ struct Sum_SIMD<float, double>
         v_float64 v_sum0 = vx_setzero_f64();
         v_float64 v_sum1 = vx_setzero_f64();
 
-        for (; x <= len - 2 * v_float32::nlanes; x += 2 * v_float32::nlanes)
+        for (; x <= len - 2 * VTraits<v_float32>::vlanes(); x += 2 * VTraits<v_float32>::vlanes())
         {
             v_float32 v_src0 = vx_load(src0 + x);
-            v_float32 v_src1 = vx_load(src0 + x + v_float32::nlanes);
-            v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
-            v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
+            v_float32 v_src1 = vx_load(src0 + x + VTraits<v_float32>::vlanes());
+            v_sum0 = v_add(v_sum0, v_add(v_cvt_f64(v_src0), v_cvt_f64(v_src1)));
+            v_sum1 = v_add(v_sum1, v_add(v_cvt_f64_high(v_src0), v_cvt_f64_high(v_src1)));
         }
 
 #if CV_SIMD256 || CV_SIMD512
-        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
-        v_store_aligned(ar, v_sum0 + v_sum1);
-        for (int i = 0; i < v_float64::nlanes; ++i)
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_float64>::max_nlanes];
+        v_store_aligned(ar, v_add(v_sum0, v_sum1));
+        for (int i = 0; i < VTraits<v_float64>::vlanes(); ++i)
             dst[i % cn] += ar[i];
 #else
-        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_float64>::max_nlanes];
         v_store_aligned(ar, v_sum0);
-        v_store_aligned(ar + v_float64::nlanes, v_sum1);
-        for (int i = 0; i < 2 * v_float64::nlanes; ++i)
+        v_store_aligned(ar + VTraits<v_float64>::vlanes(), v_sum1);
+        for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
             dst[i % cn] += ar[i];
 #endif
         v_cleanup();
@@ -434,7 +434,7 @@ static int sum64f( const double* src, const uchar* mask, double* dst, int len, i
 
 SumFunc getSumFunc(int depth)
 {
-    static SumFunc sumTab[] =
+    static SumFunc sumTab[CV_DEPTH_MAX] =
     {
         (SumFunc)GET_OPTIMIZED(sum8u), (SumFunc)sum8s,
         (SumFunc)sum16u, (SumFunc)sum16s,
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 7811ab72f005..c02944079e4c 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -120,11 +120,15 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); }
 #include <cstdlib>        // std::abort
 #endif
 
-#if defined __ANDROID__ || defined __unix__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __HAIKU__ || defined __Fuchsia__
+#if defined __ANDROID__ || defined __unix__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __HAIKU__ || defined __Fuchsia__ || defined __QNX__
 #  include <unistd.h>
 #  include <fcntl.h>
 #if defined __QNX__
 #  include <sys/elf.h>
+#  include <sys/auxv.h>
+using Elf64_auxv_t = auxv64_t;
+#  include <elfdefinitions.h>
+const uint64_t AT_HWCAP = NT_GNU_HWCAP;
 #else
 #  include <elf.h>
 #endif
@@ -154,6 +158,12 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); }
 # endif
 #endif
 
+#if defined __loongarch64
+#include "sys/auxv.h"
+#define LA_HWCAP_LSX   (1<<4)
+#define LA_HWCAP_LASX  (1<<5)
+#endif
+
 #if defined _WIN32 || defined WINCE
 #ifndef _WIN32_WINNT           // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
   #define _WIN32_WINNT 0x0400  // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
@@ -245,7 +255,7 @@ std::wstring GetTempFileNameWinRT(std::wstring prefix)
 #include "omp.h"
 #endif
 
-#if defined __unix__ || defined __APPLE__ || defined __EMSCRIPTEN__ || defined __FreeBSD__ || defined __GLIBC__ || defined __HAIKU__
+#if defined __unix__ || defined __APPLE__ || defined __EMSCRIPTEN__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __GLIBC__ || defined __HAIKU__
 #include <unistd.h>
 #include <stdio.h>
 #include <sys/types.h>
@@ -295,9 +305,7 @@ DECLARE_CV_CPUID_X86
   #endif
 #endif
 
-#if defined CV_CXX11
-  #include <chrono>
-#endif
+#include <chrono>
 
 namespace cv
 {
@@ -408,6 +416,8 @@ struct HWFeatures
 
         g_hwFeatureNames[CPU_NEON] = "NEON";
         g_hwFeatureNames[CPU_NEON_DOTPROD] = "NEON_DOTPROD";
+        g_hwFeatureNames[CPU_NEON_FP16] = "NEON_FP16";
+        g_hwFeatureNames[CPU_NEON_BF16] = "NEON_BF16";
 
         g_hwFeatureNames[CPU_VSX] = "VSX";
         g_hwFeatureNames[CPU_VSX3] = "VSX3";
@@ -425,6 +435,7 @@ struct HWFeatures
 
         g_hwFeatureNames[CPU_RVV] = "RVV";
 
+        g_hwFeatureNames[CPU_LSX]  = "LSX";
         g_hwFeatureNames[CPU_LASX] = "LASX";
     }
 
@@ -553,7 +564,7 @@ struct HWFeatures
         }
     #endif // CV_CPUID_X86
 
-    #if defined __ANDROID__ || defined __linux__ || defined __FreeBSD__ || defined __QNX__
+    #if defined __ANDROID__ || defined __linux__ || defined __QNX__
     #ifdef __aarch64__
         have[CV_CPU_NEON] = true;
         have[CV_CPU_FP16] = true;
@@ -566,11 +577,18 @@ struct HWFeatures
 
             while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
             {
+                // see https://elixir.bootlin.com/linux/latest/source/arch/arm64/include/uapi/asm/hwcap.h
                 if (auxv.a_type == AT_HWCAP)
                 {
-                    have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0;
-                    break;
+                    have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; // HWCAP_ASIMDDP
+                    have[CV_CPU_NEON_FP16] = (auxv.a_un.a_val & (1 << 10)) != 0; // HWCAP_ASIMDHP
                 }
+#if defined(AT_HWCAP2)
+                else if (auxv.a_type == AT_HWCAP2)
+                {
+                    have[CV_CPU_NEON_BF16] = (auxv.a_un.a_val & (1 << 14)) != 0; // HWCAP2_BF16
+                }
+#endif
             }
 
             close(cpufile);
@@ -597,7 +615,7 @@ struct HWFeatures
         CV_LOG_INFO(NULL, "- FP16 instructions is NOT enabled via build flags");
         #endif
       #endif
-    #elif defined __arm__ && !defined __FreeBSD__
+    #elif defined __arm__
         int cpufile = open("/proc/self/auxv", O_RDONLY);
 
         if (cpufile >= 0)
@@ -619,22 +637,29 @@ struct HWFeatures
         }
     #endif
     #elif (defined __APPLE__)
-    #if (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
+    #if defined __ARM_NEON
         have[CV_CPU_NEON] = true;
     #endif
-    #if (defined __ARM_FP  && (((__ARM_FP & 0x2) != 0) && defined __ARM_NEON__))
-        have[CV_CPU_FP16] = true;
-    #endif
-    #if (defined __ARM_FEATURE_DOTPROD)
-        int has_feat_dotprod = 0;
-        size_t has_feat_dotprod_size = sizeof(has_feat_dotprod);
-        sysctlbyname("hw.optional.arm.FEAT_DotProd", &has_feat_dotprod, &has_feat_dotprod_size, NULL, 0);
-        if (has_feat_dotprod) {
-            have[CV_CPU_NEON_DOTPROD] = true;
-        }
+    #if (defined __ARM_FP  && (((__ARM_FP & 0x2) != 0) && defined __ARM_NEON))
+        have[CV_CPU_FP16] = have[CV_CPU_NEON_FP16] = true;
     #endif
+    // system.cpp may be compiled w/o special -march=armv8...+dotprod, -march=armv8...+bf16 etc.,
+    // so we check for the features in any case, no mater what are the compile flags.
+    // We check the real hardware capabilities here.
+    int has_feat_dotprod = 0;
+    size_t has_feat_dotprod_size = sizeof(has_feat_dotprod);
+    sysctlbyname("hw.optional.arm.FEAT_DotProd", &has_feat_dotprod, &has_feat_dotprod_size, NULL, 0);
+    if (has_feat_dotprod) {
+        have[CV_CPU_NEON_DOTPROD] = true;
+    }
+    int has_feat_bf16 = 0;
+    size_t has_feat_bf16_size = sizeof(has_feat_bf16);
+    sysctlbyname("hw.optional.arm.FEAT_BF16", &has_feat_bf16, &has_feat_bf16_size, NULL, 0);
+    if (has_feat_bf16) {
+        have[CV_CPU_NEON_BF16] = true;
+    }
     #elif (defined __clang__)
-    #if (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
+    #if defined __ARM_NEON
         have[CV_CPU_NEON] = true;
         #if (defined __ARM_FP  && ((__ARM_FP & 0x2) != 0))
         have[CV_CPU_FP16] = true;
@@ -689,8 +714,11 @@ struct HWFeatures
         have[CV_CPU_RVV] = true;
     #endif
 
-    #if defined __loongarch_asx
-        have[CV_CPU_LASX] = true;
+    #if defined __loongarch64 && defined __linux__
+        int flag = (int)getauxval(AT_HWCAP);
+
+        have[CV_CPU_LSX] = (flag & LA_HWCAP_LSX) != 0;
+        have[CV_CPU_LASX] = (flag & LA_HWCAP_LASX) != 0;
     #endif
 
         bool skip_baseline_check = false;
@@ -879,50 +907,15 @@ bool useOptimized(void)
 
 int64 getTickCount(void)
 {
-#if defined CV_CXX11
     std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
     return (int64)now.time_since_epoch().count();
-#elif defined _WIN32 || defined WINCE
-    LARGE_INTEGER counter;
-    QueryPerformanceCounter( &counter );
-    return (int64)counter.QuadPart;
-#elif defined __MACH__ && defined __APPLE__
-    return (int64)mach_absolute_time();
-#elif defined __unix__
-    struct timespec tp;
-    clock_gettime(CLOCK_MONOTONIC, &tp);
-    return (int64)tp.tv_sec*1000000000 + tp.tv_nsec;
-#else
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return (int64)tv.tv_sec*1000000 + tv.tv_usec;
-#endif
 }
 
 double getTickFrequency(void)
 {
-#if defined CV_CXX11
     using clock_period_t = std::chrono::steady_clock::duration::period;
     double clock_freq = clock_period_t::den / clock_period_t::num;
     return clock_freq;
-#elif defined _WIN32 || defined WINCE
-    LARGE_INTEGER freq;
-    QueryPerformanceFrequency(&freq);
-    return (double)freq.QuadPart;
-#elif defined __MACH__ && defined __APPLE__
-    static double freq = 0;
-    if( freq == 0 )
-    {
-        mach_timebase_info_data_t sTimebaseInfo;
-        mach_timebase_info(&sTimebaseInfo);
-        freq = sTimebaseInfo.denom*1e9/sTimebaseInfo.numer;
-    }
-    return freq;
-#elif defined __unix__
-    return 1e9;
-#else
-    return 1e6;
-#endif
 }
 
 #if defined __GNUC__ && (defined __i386__ || defined __x86_64__ || defined __ppc__)
@@ -1384,38 +1377,38 @@ CV_IMPL const char* cvErrorStr( int status )
 
     switch (status)
     {
-    case CV_StsOk :                  return "No Error";
-    case CV_StsBackTrace :           return "Backtrace";
-    case CV_StsError :               return "Unspecified error";
-    case CV_StsInternal :            return "Internal error";
-    case CV_StsNoMem :               return "Insufficient memory";
-    case CV_StsBadArg :              return "Bad argument";
-    case CV_StsNoConv :              return "Iterations do not converge";
-    case CV_StsAutoTrace :           return "Autotrace call";
-    case CV_StsBadSize :             return "Incorrect size of input array";
-    case CV_StsNullPtr :             return "Null pointer";
-    case CV_StsDivByZero :           return "Division by zero occurred";
-    case CV_BadStep :                return "Image step is wrong";
-    case CV_StsInplaceNotSupported : return "Inplace operation is not supported";
-    case CV_StsObjectNotFound :      return "Requested object was not found";
-    case CV_BadDepth :               return "Input image depth is not supported by function";
-    case CV_StsUnmatchedFormats :    return "Formats of input arguments do not match";
-    case CV_StsUnmatchedSizes :      return "Sizes of input arguments do not match";
-    case CV_StsOutOfRange :          return "One of the arguments\' values is out of range";
-    case CV_StsUnsupportedFormat :   return "Unsupported format or combination of formats";
-    case CV_BadCOI :                 return "Input COI is not supported";
-    case CV_BadNumChannels :         return "Bad number of channels";
-    case CV_StsBadFlag :             return "Bad flag (parameter or structure field)";
-    case CV_StsBadPoint :            return "Bad parameter of type CvPoint";
-    case CV_StsBadMask :             return "Bad type of mask argument";
-    case CV_StsParseError :          return "Parsing error";
-    case CV_StsNotImplemented :      return "The function/feature is not implemented";
-    case CV_StsBadMemBlock :         return "Memory block has been corrupted";
-    case CV_StsAssert :              return "Assertion failed";
-    case CV_GpuNotSupported :        return "No CUDA support";
-    case CV_GpuApiCallError :        return "Gpu API call";
-    case CV_OpenGlNotSupported :     return "No OpenGL support";
-    case CV_OpenGlApiCallError :     return "OpenGL API call";
+    case cv::Error::StsOk :                  return "No Error";
+    case cv::Error::StsBackTrace :           return "Backtrace";
+    case cv::Error::StsError :               return "Unspecified error";
+    case cv::Error::StsInternal :            return "Internal error";
+    case cv::Error::StsNoMem :               return "Insufficient memory";
+    case cv::Error::StsBadArg :              return "Bad argument";
+    case cv::Error::StsNoConv :              return "Iterations do not converge";
+    case cv::Error::StsAutoTrace :           return "Autotrace call";
+    case cv::Error::StsBadSize :             return "Incorrect size of input array";
+    case cv::Error::StsNullPtr :             return "Null pointer";
+    case cv::Error::StsDivByZero :           return "Division by zero occurred";
+    case cv::Error::BadStep :                return "Image step is wrong";
+    case cv::Error::StsInplaceNotSupported : return "Inplace operation is not supported";
+    case cv::Error::StsObjectNotFound :      return "Requested object was not found";
+    case cv::Error::BadDepth :               return "Input image depth is not supported by function";
+    case cv::Error::StsUnmatchedFormats :    return "Formats of input arguments do not match";
+    case cv::Error::StsUnmatchedSizes :      return "Sizes of input arguments do not match";
+    case cv::Error::StsOutOfRange :          return "One of the arguments\' values is out of range";
+    case cv::Error::StsUnsupportedFormat :   return "Unsupported format or combination of formats";
+    case cv::Error::BadCOI :                 return "Input COI is not supported";
+    case cv::Error::BadNumChannels :         return "Bad number of channels";
+    case cv::Error::StsBadFlag :             return "Bad flag (parameter or structure field)";
+    case cv::Error::StsBadPoint :            return "Bad parameter of type CvPoint";
+    case cv::Error::StsBadMask :             return "Bad type of mask argument";
+    case cv::Error::StsParseError :          return "Parsing error";
+    case cv::Error::StsNotImplemented :      return "The function/feature is not implemented";
+    case cv::Error::StsBadMemBlock :         return "Memory block has been corrupted";
+    case cv::Error::StsAssert :              return "Assertion failed";
+    case cv::Error::GpuNotSupported :        return "No CUDA support";
+    case cv::Error::GpuApiCallError :        return "Gpu API call";
+    case cv::Error::OpenGlNotSupported :     return "No OpenGL support";
+    case cv::Error::OpenGlApiCallError :     return "OpenGL API call";
     };
 
     snprintf(buf, sizeof(buf), "Unknown %s code %d", status >= 0 ? "status":"error", status);
@@ -1455,29 +1448,29 @@ cvErrorFromIppStatus( int status )
 {
     switch (status)
     {
-    case CV_BADSIZE_ERR:               return CV_StsBadSize;
-    case CV_BADMEMBLOCK_ERR:           return CV_StsBadMemBlock;
-    case CV_NULLPTR_ERR:               return CV_StsNullPtr;
-    case CV_DIV_BY_ZERO_ERR:           return CV_StsDivByZero;
-    case CV_BADSTEP_ERR:               return CV_BadStep;
-    case CV_OUTOFMEM_ERR:              return CV_StsNoMem;
-    case CV_BADARG_ERR:                return CV_StsBadArg;
-    case CV_NOTDEFINED_ERR:            return CV_StsError;
-    case CV_INPLACE_NOT_SUPPORTED_ERR: return CV_StsInplaceNotSupported;
-    case CV_NOTFOUND_ERR:              return CV_StsObjectNotFound;
-    case CV_BADCONVERGENCE_ERR:        return CV_StsNoConv;
-    case CV_BADDEPTH_ERR:              return CV_BadDepth;
-    case CV_UNMATCHED_FORMATS_ERR:     return CV_StsUnmatchedFormats;
-    case CV_UNSUPPORTED_COI_ERR:       return CV_BadCOI;
-    case CV_UNSUPPORTED_CHANNELS_ERR:  return CV_BadNumChannels;
-    case CV_BADFLAG_ERR:               return CV_StsBadFlag;
-    case CV_BADRANGE_ERR:              return CV_StsBadArg;
-    case CV_BADCOEF_ERR:               return CV_StsBadArg;
-    case CV_BADFACTOR_ERR:             return CV_StsBadArg;
-    case CV_BADPOINT_ERR:              return CV_StsBadPoint;
+    case CV_BADSIZE_ERR:               return cv::Error::StsBadSize;
+    case CV_BADMEMBLOCK_ERR:           return cv::Error::StsBadMemBlock;
+    case CV_NULLPTR_ERR:               return cv::Error::StsNullPtr;
+    case CV_DIV_BY_ZERO_ERR:           return cv::Error::StsDivByZero;
+    case CV_BADSTEP_ERR:               return cv::Error::BadStep;
+    case CV_OUTOFMEM_ERR:              return cv::Error::StsNoMem;
+    case CV_BADARG_ERR:                return cv::Error::StsBadArg;
+    case CV_NOTDEFINED_ERR:            return cv::Error::StsError;
+    case CV_INPLACE_NOT_SUPPORTED_ERR: return cv::Error::StsInplaceNotSupported;
+    case CV_NOTFOUND_ERR:              return cv::Error::StsObjectNotFound;
+    case CV_BADCONVERGENCE_ERR:        return cv::Error::StsNoConv;
+    case CV_BADDEPTH_ERR:              return cv::Error::BadDepth;
+    case CV_UNMATCHED_FORMATS_ERR:     return cv::Error::StsUnmatchedFormats;
+    case CV_UNSUPPORTED_COI_ERR:       return cv::Error::BadCOI;
+    case CV_UNSUPPORTED_CHANNELS_ERR:  return cv::Error::BadNumChannels;
+    case CV_BADFLAG_ERR:               return cv::Error::StsBadFlag;
+    case CV_BADRANGE_ERR:              return cv::Error::StsBadArg;
+    case CV_BADCOEF_ERR:               return cv::Error::StsBadArg;
+    case CV_BADFACTOR_ERR:             return cv::Error::StsBadArg;
+    case CV_BADPOINT_ERR:              return cv::Error::StsBadPoint;
 
     default:
-      return CV_StsError;
+      return cv::Error::StsError;
     }
 }
 
@@ -2574,7 +2567,7 @@ struct IPPInitSingleton
         ippStatus = ippGetCpuFeatures(&cpuFeatures, NULL);
         if(ippStatus < 0)
         {
-            std::cerr << "ERROR: IPP cannot detect CPU features, IPP was disabled " << std::endl;
+            CV_LOG_ERROR(NULL, "ERROR: IPP cannot detect CPU features, IPP was disabled");
             useIPP = false;
             return;
         }
@@ -2612,7 +2605,7 @@ struct IPPInitSingleton
 
             if(env == "disabled")
             {
-                std::cerr << "WARNING: IPP was disabled by OPENCV_IPP environment variable" << std::endl;
+                CV_LOG_WARNING(NULL, "WARNING: IPP was disabled by OPENCV_IPP environment variable");
                 useIPP = false;
             }
             else if(env == "sse42")
@@ -2626,7 +2619,7 @@ struct IPPInitSingleton
 #endif
 #endif
             else
-                std::cerr << "ERROR: Improper value of OPENCV_IPP: " << env.c_str() << ". Correct values are: disabled, sse42, avx2, avx512 (Intel64 only)" << std::endl;
+                CV_LOG_ERROR(NULL, "ERROR: Improper value of OPENCV_IPP: " << env.c_str() << ". Correct values are: disabled, sse42, avx2, avx512 (Intel64 only)");
 
             // Trim unsupported features
             ippFeatures &= cpuFeatures;
diff --git a/modules/core/src/types.cpp b/modules/core/src/types.cpp
index 43e25ef04534..ccf3109989c6 100644
--- a/modules/core/src/types.cpp
+++ b/modules/core/src/types.cpp
@@ -83,7 +83,7 @@ void KeyPoint::convert(const std::vector<KeyPoint>& keypoints, std::vector<Point
                 points2f[i] = keypoints[idx].pt;
             else
             {
-                CV_Error( CV_StsBadArg, "keypointIndexes has element < 0. TODO: process this case" );
+                CV_Error( cv::Error::StsBadArg, "keypointIndexes has element < 0. TODO: process this case" );
                 //points2f[i] = Point2f(-1, -1);
             }
         }
diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp
index 02b37026f488..65d50458dc1c 100644
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -539,7 +539,7 @@ void setSize( UMat& m, int _dims, const int* _sz,
             m.step.p[i] = total;
             int64 total1 = (int64)total*s;
             if( (uint64)total1 != (size_t)total1 )
-                CV_Error( CV_StsOutOfRange, "The total matrix size does not fit to \"size_t\" type" );
+                CV_Error( cv::Error::StsOutOfRange, "The total matrix size does not fit to \"size_t\" type" );
             total = (size_t)total1;
         }
     }
@@ -965,16 +965,16 @@ UMat UMat::reshape(int new_cn, int new_rows) const
     {
         int total_size = total_width * rows;
         if( !isContinuous() )
-            CV_Error( CV_BadStep,
+            CV_Error( cv::Error::BadStep,
             "The matrix is not continuous, thus its number of rows can not be changed" );
 
         if( (unsigned)new_rows > (unsigned)total_size )
-            CV_Error( CV_StsOutOfRange, "Bad new number of rows" );
+            CV_Error( cv::Error::StsOutOfRange, "Bad new number of rows" );
 
         total_width = total_size / new_rows;
 
         if( total_width * new_rows != total_size )
-            CV_Error( CV_StsBadArg, "The total number of matrix elements "
+            CV_Error( cv::Error::StsBadArg, "The total number of matrix elements "
                                     "is not divisible by the new number of rows" );
 
         hdr.rows = new_rows;
@@ -984,7 +984,7 @@ UMat UMat::reshape(int new_cn, int new_rows) const
     int new_width = total_width / new_cn;
 
     if( new_width * new_cn != total_width )
-        CV_Error( CV_BadNumChannels,
+        CV_Error( cv::Error::BadNumChannels,
         "The total width is not divisible by the new number of channels" );
 
     hdr.cols = new_width;
@@ -1050,13 +1050,13 @@ UMat UMat::reshape(int _cn, int _newndims, const int* _newsz) const
             else if (i < dims)
                 newsz_buf[i] = this->size[i];
             else
-                CV_Error(CV_StsOutOfRange, "Copy dimension (which has zero size) is not present in source matrix");
+                CV_Error(cv::Error::StsOutOfRange, "Copy dimension (which has zero size) is not present in source matrix");
 
             total_elem1 *= (size_t)newsz_buf[i];
         }
 
         if (total_elem1 != total_elem1_ref)
-            CV_Error(CV_StsUnmatchedSizes, "Requested and source matrices have different count of elements");
+            CV_Error(cv::Error::StsUnmatchedSizes, "Requested and source matrices have different count of elements");
 
         UMat hdr = *this;
         hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((_cn-1) << CV_CN_SHIFT);
@@ -1065,7 +1065,7 @@ UMat UMat::reshape(int _cn, int _newndims, const int* _newsz) const
         return hdr;
     }
 
-    CV_Error(CV_StsNotImplemented, "Reshaping of n-dimensional non-continuous matrices is not supported yet");
+    CV_Error(cv::Error::StsNotImplemented, "Reshaping of n-dimensional non-continuous matrices is not supported yet");
 }
 
 Mat UMat::getMat(AccessFlag accessFlags) const
@@ -1233,70 +1233,10 @@ void UMat::copyTo(OutputArray _dst, InputArray _mask) const
     src.copyTo(_dst, _mask);
 }
 
-void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
-{
-    CV_INSTRUMENT_REGION();
 
-    bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
-    int stype = type(), cn = CV_MAT_CN(stype);
-
-    if( _type < 0 )
-        _type = _dst.fixedType() ? _dst.type() : stype;
-    else
-        _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), cn);
-
-    int sdepth = CV_MAT_DEPTH(stype), ddepth = CV_MAT_DEPTH(_type);
-    if( sdepth == ddepth && noScale )
-    {
-        copyTo(_dst);
-        return;
-    }
-#ifdef HAVE_OPENCL
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
-    bool needDouble = sdepth == CV_64F || ddepth == CV_64F;
-    if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() &&
-            ((needDouble && doubleSupport) || !needDouble) )
-    {
-        int wdepth = std::max(CV_32F, sdepth), rowsPerWI = 4;
-
-        char cvt[2][50];
-        ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
-                      format("-D srcT=%s -D WT=%s -D dstT=%s -D convertToWT=%s -D convertToDT=%s%s%s",
-                             ocl::typeToStr(sdepth), ocl::typeToStr(wdepth), ocl::typeToStr(ddepth),
-                             ocl::convertTypeStr(sdepth, wdepth, 1, cvt[0], sizeof(cvt[0])),
-                             ocl::convertTypeStr(wdepth, ddepth, 1, cvt[1], sizeof(cvt[1])),
-                             doubleSupport ? " -D DOUBLE_SUPPORT" : "", noScale ? " -D NO_SCALE" : ""));
-        if (!k.empty())
-        {
-            UMat src = *this;
-            _dst.create( size(), _type );
-            UMat dst = _dst.getUMat();
-
-            float alphaf = (float)alpha, betaf = (float)beta;
-            ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
-                    dstarg = ocl::KernelArg::WriteOnly(dst, cn);
-
-            if (noScale)
-                k.args(srcarg, dstarg, rowsPerWI);
-            else if (wdepth == CV_32F)
-                k.args(srcarg, dstarg, alphaf, betaf, rowsPerWI);
-            else
-                k.args(srcarg, dstarg, alpha, beta, rowsPerWI);
-
-            size_t globalsize[2] = { (size_t)dst.cols * cn, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
-            if (k.run(2, globalsize, NULL, false))
-            {
-                CV_IMPL_ADD(CV_IMPL_OCL);
-                return;
-            }
-        }
-    }
-#endif
-    UMat src = *this;  // Fake reference to itself.
-                       // Resolves issue 8693 in case of src == dst.
-    Mat m = getMat(ACCESS_READ);
-    m.convertTo(_dst, _type, alpha, beta);
-}
+//
+// void UMat::convertTo moved to convert.dispatch.cpp
+//
 
 UMat& UMat::setTo(InputArray _value, InputArray _mask)
 {
diff --git a/modules/core/src/utils/filesystem.cpp b/modules/core/src/utils/filesystem.cpp
index 415323490dad..f59855861601 100644
--- a/modules/core/src/utils/filesystem.cpp
+++ b/modules/core/src/utils/filesystem.cpp
@@ -34,7 +34,7 @@
 #include <errno.h>
 #include <io.h>
 #include <stdio.h>
-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__ || defined __EMSCRIPTEN__
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
@@ -194,7 +194,7 @@ cv::String getcwd()
     sz = GetCurrentDirectoryA((DWORD)buf.size(), buf.data());
     return cv::String(buf.data(), (size_t)sz);
 #endif
-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __EMSCRIPTEN__
     for(;;)
     {
         char* p = ::getcwd(buf.data(), buf.size());
@@ -228,7 +228,7 @@ bool createDirectory(const cv::String& path)
 #else
     int result = _mkdir(path.c_str());
 #endif
-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __EMSCRIPTEN__
     int result = mkdir(path.c_str(), 0777);
 #else
     int result = -1;
@@ -343,7 +343,7 @@ struct FileLock::Impl
     Impl& operator=(const Impl&); // disabled
 };
 
-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__ || defined __EMSCRIPTEN__
 
 struct FileLock::Impl
 {
@@ -457,7 +457,7 @@ cv::String getCacheDirectory(const char* sub_directory_name, const char* configu
             default_cache_path = "/tmp/";
             CV_LOG_WARNING(NULL, "Using world accessible cache directory. This may be not secure: " << default_cache_path);
         }
-#elif defined __linux__ || defined __HAIKU__ || defined __FreeBSD__
+#elif defined __linux__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
         // https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html
         if (default_cache_path.empty())
         {
diff --git a/modules/core/test/ocl/test_image2d.cpp b/modules/core/test/ocl/test_image2d.cpp
index 86561bddcf94..07652e5c9d9e 100644
--- a/modules/core/test/ocl/test_image2d.cpp
+++ b/modules/core/test/ocl/test_image2d.cpp
@@ -83,7 +83,7 @@ TEST(Image2D, turnOffOpenCL)
         }
         else
             std::cout << "CV_8UC1 is not supported for OpenCL images. Test skipped." << std::endl;
-    
+
         // reset state to the previous one
         cv::ocl::setUseOpenCL(useOCL);
     }
diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp
index ea9cda56be78..6e4151c0cc81 100644
--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@@ -15,7 +15,12 @@ const int ARITHM_MAX_SIZE_LOG = 10;
 
 struct BaseElemWiseOp
 {
-    enum { FIX_ALPHA=1, FIX_BETA=2, FIX_GAMMA=4, REAL_GAMMA=8, SUPPORT_MASK=16, SCALAR_OUTPUT=32, SUPPORT_MULTICHANNELMASK=64 };
+    enum
+    {
+        FIX_ALPHA=1, FIX_BETA=2, FIX_GAMMA=4, REAL_GAMMA=8,
+        SUPPORT_MASK=16, SCALAR_OUTPUT=32, SUPPORT_MULTICHANNELMASK=64,
+        MIXED_TYPE=128
+   };
     BaseElemWiseOp(int _ninputs, int _flags, double _alpha, double _beta,
                    Scalar _gamma=Scalar::all(0), int _context=1)
     : ninputs(_ninputs), flags(_flags), alpha(_alpha), beta(_beta), gamma(_gamma), context(_context) {}
@@ -101,14 +106,15 @@ struct BaseAddOp : public BaseElemWiseOp
 
     void refop(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
-        Mat temp;
+        int dstType = (flags & MIXED_TYPE) ? dst.type() : src[0].type();
         if( !mask.empty() )
         {
-            cvtest::add(src[0], alpha, src.size() > 1 ? src[1] : Mat(), beta, gamma, temp, src[0].type());
+            Mat temp;
+            cvtest::add(src[0], alpha, src.size() > 1 ? src[1] : Mat(), beta, gamma, temp, dstType);
             cvtest::copy(temp, dst, mask);
         }
         else
-            cvtest::add(src[0], alpha, src.size() > 1 ? src[1] : Mat(), beta, gamma, dst, src[0].type());
+            cvtest::add(src[0], alpha, src.size() > 1 ? src[1] : Mat(), beta, gamma, dst, dstType);
     }
 };
 
@@ -118,10 +124,8 @@ struct AddOp : public BaseAddOp
     AddOp() : BaseAddOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
-        if( mask.empty() )
-            cv::add(src[0], src[1], dst);
-        else
-            cv::add(src[0], src[1], dst, mask);
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cv::add(src[0], src[1], dst, mask, dtype);
     }
 };
 
@@ -131,10 +135,8 @@ struct SubOp : public BaseAddOp
     SubOp() : BaseAddOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK, 1, -1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
-        if( mask.empty() )
-            cv::subtract(src[0], src[1], dst);
-        else
-            cv::subtract(src[0], src[1], dst, mask);
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cv::subtract(src[0], src[1], dst, mask, dtype);
     }
 };
 
@@ -144,10 +146,8 @@ struct AddSOp : public BaseAddOp
     AddSOp() : BaseAddOp(1, FIX_ALPHA+FIX_BETA+SUPPORT_MASK, 1, 0, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
-        if( mask.empty() )
-            cv::add(src[0], gamma, dst);
-        else
-            cv::add(src[0], gamma, dst, mask);
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cv::add(src[0], gamma, dst, mask, dtype);
     }
 };
 
@@ -157,10 +157,8 @@ struct SubRSOp : public BaseAddOp
     SubRSOp() : BaseAddOp(1, FIX_ALPHA+FIX_BETA+SUPPORT_MASK, -1, 0, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
-        if( mask.empty() )
-            cv::subtract(gamma, src[0], dst);
-        else
-            cv::subtract(gamma, src[0], dst, mask);
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cv::subtract(gamma, src[0], dst, mask, dtype);
     }
 };
 
@@ -174,7 +172,7 @@ struct ScaleAddOp : public BaseAddOp
     }
     double getMaxErr(int depth)
     {
-        return depth <= CV_32S ? 2 : depth < CV_64F ? 1e-4 : 1e-12;
+        return depth < CV_32F ? 1 : depth == CV_32F ? 3e-5 : 1e-12;
     }
 };
 
@@ -184,11 +182,8 @@ struct AddWeightedOp : public BaseAddOp
     AddWeightedOp() : BaseAddOp(2, REAL_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
-        cv::addWeighted(src[0], alpha, src[1], beta, gamma[0], dst);
-    }
-    double getMaxErr(int depth)
-    {
-        return depth <= CV_32S ? 2 : depth < CV_64F ? 1e-5 : 1e-10;
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cv::addWeighted(src[0], alpha, src[1], beta, gamma[0], dst, dtype);
     }
 };
 
@@ -204,15 +199,35 @@ struct MulOp : public BaseElemWiseOp
     }
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
-        cv::multiply(src[0], src[1], dst, alpha);
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cv::multiply(src[0], src[1], dst, alpha, dtype);
     }
     void refop(const vector<Mat>& src, Mat& dst, const Mat&)
     {
-        cvtest::multiply(src[0], src[1], dst, alpha);
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cvtest::multiply(src[0], src[1], dst, alpha, dtype);
     }
-    double getMaxErr(int depth)
+};
+
+struct MulSOp : public BaseElemWiseOp
+{
+    MulSOp() : BaseElemWiseOp(1, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
+    void getValueRange(int depth, double& minval, double& maxval)
+    {
+        minval = depth < CV_32S ? cvtest::getMinVal(depth) : depth == CV_32S ? -1000000 : -1000.;
+        maxval = depth < CV_32S ? cvtest::getMaxVal(depth) : depth == CV_32S ? 1000000 : 1000.;
+        minval = std::max(minval, -30000.);
+        maxval = std::min(maxval, 30000.);
+    }
+    void op(const vector<Mat>& src, Mat& dst, const Mat&)
+    {
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cv::multiply(src[0], alpha, dst, /* scale */ 1.0, dtype);
+    }
+    void refop(const vector<Mat>& src, Mat& dst, const Mat&)
     {
-        return depth <= CV_32S ? 2 : depth < CV_64F ? 1e-5 : 1e-12;
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cvtest::multiply(Mat(), src[0], dst, alpha, dtype);
     }
 };
 
@@ -221,15 +236,20 @@ struct DivOp : public BaseElemWiseOp
     DivOp() : BaseElemWiseOp(2, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
-        cv::divide(src[0], src[1], dst, alpha);
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cv::divide(src[0], src[1], dst, alpha, dtype);
+        if (flags & MIXED_TYPE)
+        {
+            // div by zero result is implementation-defined
+            // since it may involve conversions to/from intermediate format
+            Mat zeroMask = src[1] == 0;
+            dst.setTo(0, zeroMask);
+        }
     }
     void refop(const vector<Mat>& src, Mat& dst, const Mat&)
     {
-        cvtest::divide(src[0], src[1], dst, alpha);
-    }
-    double getMaxErr(int depth)
-    {
-        return depth <= CV_32S ? 2 : depth < CV_64F ? 1e-5 : 1e-12;
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cvtest::divide(src[0], src[1], dst, alpha, dtype);
     }
 };
 
@@ -238,15 +258,20 @@ struct RecipOp : public BaseElemWiseOp
     RecipOp() : BaseElemWiseOp(1, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
-        cv::divide(alpha, src[0], dst);
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cv::divide(alpha, src[0], dst, dtype);
+        if (flags & MIXED_TYPE)
+        {
+            // div by zero result is implementation-defined
+            // since it may involve conversions to/from intermediate format
+            Mat zeroMask = src[0] == 0;
+            dst.setTo(0, zeroMask);
+        }
     }
     void refop(const vector<Mat>& src, Mat& dst, const Mat&)
     {
-        cvtest::divide(Mat(), src[0], dst, alpha);
-    }
-    double getMaxErr(int depth)
-    {
-        return depth <= CV_32S ? 2 : depth < CV_64F ? 1e-5 : 1e-12;
+        int dtype = (flags & MIXED_TYPE) ? dst.type() : -1;
+        cvtest::divide(Mat(), src[0], dst, alpha, dtype);
     }
 };
 
@@ -593,7 +618,7 @@ static void inRange(const Mat& src, const Mat& lb, const Mat& rb, Mat& dst)
             inRange_((const double*)sptr, (const double*)aptr, (const double*)bptr, dptr, total, cn);
             break;
         default:
-            CV_Error(CV_StsUnsupportedFormat, "");
+            CV_Error(cv::Error::StsUnsupportedFormat, "");
         }
     }
 }
@@ -642,13 +667,13 @@ static void inRangeS(const Mat& src, const Scalar& lb, const Scalar& rb, Mat& ds
             inRangeS_((const double*)sptr, lbuf.d, rbuf.d, dptr, total, cn);
             break;
         default:
-            CV_Error(CV_StsUnsupportedFormat, "");
+            CV_Error(cv::Error::StsUnsupportedFormat, "");
         }
     }
 }
 
 } // namespace
-CVTEST_GUARD_SYMBOL(inRange);
+CVTEST_GUARD_SYMBOL(inRange)
 
 struct InRangeSOp : public BaseElemWiseOp
 {
@@ -831,6 +856,7 @@ struct ConvertScaleAbsOp : public BaseElemWiseOp
 
 namespace reference {
 
+// does not support inplace operation
 static void flip(const Mat& src, Mat& dst, int flipcode)
 {
     CV_Assert(src.dims == 2);
@@ -852,6 +878,26 @@ static void flip(const Mat& src, Mat& dst, int flipcode)
     }
 }
 
+static void rotate(const Mat& src, Mat& dst, int rotateMode)
+{
+    Mat tmp;
+    switch (rotateMode)
+    {
+    case ROTATE_90_CLOCKWISE:
+        cvtest::transpose(src, tmp);
+        reference::flip(tmp, dst, 1);
+        break;
+    case ROTATE_180:
+        reference::flip(src, dst, -1);
+        break;
+    case ROTATE_90_COUNTERCLOCKWISE:
+        cvtest::transpose(src, tmp);
+        reference::flip(tmp, dst, 0);
+        break;
+    default:
+        break;
+    }
+}
 
 static void setIdentity(Mat& dst, const Scalar& s)
 {
@@ -898,6 +944,32 @@ struct FlipOp : public BaseElemWiseOp
     int flipcode;
 };
 
+struct RotateOp : public BaseElemWiseOp
+{
+    RotateOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) { rotatecode = 0; }
+    void getRandomSize(RNG& rng, vector<int>& size)
+    {
+        cvtest::randomSize(rng, 2, 2, ARITHM_MAX_SIZE_LOG, size);
+    }
+    void op(const vector<Mat>& src, Mat& dst, const Mat&)
+    {
+        cv::rotate(src[0], dst, rotatecode);
+    }
+    void refop(const vector<Mat>& src, Mat& dst, const Mat&)
+    {
+        reference::rotate(src[0], dst, rotatecode);
+    }
+    void generateScalars(int, RNG& rng)
+    {
+        rotatecode = rng.uniform(0, 3);
+    }
+    double getMaxErr(int)
+    {
+        return 0;
+    }
+    int rotatecode;
+};
+
 struct TransposeOp : public BaseElemWiseOp
 {
     TransposeOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
@@ -1178,7 +1250,7 @@ struct MeanOp : public BaseElemWiseOp
     MeanOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
     {
         context = 3;
-    };
+    }
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
         dst.create(1, 1, CV_64FC4);
@@ -1201,7 +1273,7 @@ struct SumOp : public BaseElemWiseOp
     SumOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
     {
         context = 3;
-    };
+    }
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         dst.create(1, 1, CV_64FC4);
@@ -1261,7 +1333,7 @@ struct MeanStdDevOp : public BaseElemWiseOp
     {
         cn = 0;
         context = 7;
-    };
+    }
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
         dst.create(1, 2, CV_64FC4);
@@ -1302,7 +1374,7 @@ struct NormOp : public BaseElemWiseOp
     {
         context = 1;
         normType = 0;
-    };
+    }
     int getRandomType(RNG& rng)
     {
         int type = cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_BUT_8S, 1, 4);
@@ -1348,7 +1420,7 @@ struct MinMaxLocOp : public BaseElemWiseOp
     MinMaxLocOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK+SCALAR_OUTPUT, 1, 1, Scalar::all(0))
     {
         context = ARITHM_MAX_NDIMS*2 + 2;
-    };
+    }
     int getRandomType(RNG& rng)
     {
         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_BUT_8S, 1, 1);
@@ -1395,7 +1467,7 @@ struct reduceArgMinMaxOp : public BaseElemWiseOp
                           isLast(false), isMax(false), axis(0)
     {
         context = ARITHM_MAX_NDIMS*2 + 2;
-    };
+    }
     int getRandomType(RNG& rng) override
     {
         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_BUT_8S, 1, 1);
@@ -1550,6 +1622,7 @@ INSTANTIATE_TEST_CASE_P(Core_InRangeS, ElemWiseTest, ::testing::Values(ElemWiseO
 INSTANTIATE_TEST_CASE_P(Core_InRange, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new InRangeOp)));
 
 INSTANTIATE_TEST_CASE_P(Core_Flip, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new FlipOp)));
+INSTANTIATE_TEST_CASE_P(Core_Rotate, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new RotateOp)));
 INSTANTIATE_TEST_CASE_P(Core_Transpose, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new TransposeOp)));
 INSTANTIATE_TEST_CASE_P(Core_SetIdentity, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new SetIdentityOp)));
 
@@ -1565,6 +1638,107 @@ INSTANTIATE_TEST_CASE_P(Core_MinMaxLoc, ElemWiseTest, ::testing::Values(ElemWise
 INSTANTIATE_TEST_CASE_P(Core_reduceArgMinMax, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new reduceArgMinMaxOp)));
 INSTANTIATE_TEST_CASE_P(Core_CartToPolarToCart, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new CartToPolarToCartOp)));
 
+// Mixed Type Arithmetic Operations
+
+typedef std::tuple<ElemWiseOpPtr, std::tuple<cvtest::MatDepth, cvtest::MatDepth>> SomeType;
+class ArithmMixedTest : public ::testing::TestWithParam<SomeType> {};
+
+TEST_P(ArithmMixedTest, accuracy)
+{
+    auto p = GetParam();
+    ElemWiseOpPtr op = std::get<0>(p);
+    int srcDepth = std::get<0>(std::get<1>(p));
+    int dstDepth = std::get<1>(std::get<1>(p));
+
+    op->flags |= BaseElemWiseOp::MIXED_TYPE;
+    int testIdx = 0;
+    RNG rng((uint64)ARITHM_RNG_SEED);
+    for( testIdx = 0; testIdx < ARITHM_NTESTS; testIdx++ )
+    {
+        vector<int> size;
+        op->getRandomSize(rng, size);
+        bool haveMask = ((op->flags & BaseElemWiseOp::SUPPORT_MASK) != 0) && rng.uniform(0, 4) == 0;
+
+        double minval=0, maxval=0;
+        op->getValueRange(srcDepth, minval, maxval);
+        int ninputs = op->ninputs;
+        vector<Mat> src(ninputs);
+        for(int i = 0; i < ninputs; i++ )
+            src[i] = cvtest::randomMat(rng, size, srcDepth, minval, maxval, true);
+        Mat dst0, dst, mask;
+        if( haveMask )
+        {
+            mask = cvtest::randomMat(rng, size, CV_8UC1, 0, 2, true);
+        }
+
+        dst0 = cvtest::randomMat(rng, size, dstDepth, minval, maxval, false);
+        dst = cvtest::randomMat(rng, size, dstDepth, minval, maxval, true);
+        cvtest::copy(dst, dst0);
+
+        op->generateScalars(dstDepth, rng);
+
+        op->refop(src, dst0, mask);
+        op->op(src, dst, mask);
+
+        double maxErr = op->getMaxErr(dstDepth);
+        ASSERT_PRED_FORMAT2(cvtest::MatComparator(maxErr, op->context), dst0, dst) << "\nsrc[0] ~ " <<
+            cvtest::MatInfo(!src.empty() ? src[0] : Mat()) << "\ntestCase #" << testIdx << "\n";
+    }
+}
+
+
+INSTANTIATE_TEST_CASE_P(Core_AddMixed, ArithmMixedTest,
+                        ::testing::Combine(::testing::Values(ElemWiseOpPtr(new AddOp)),
+                                           ::testing::Values(std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_16U},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_16S},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_32F},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_32F})));
+INSTANTIATE_TEST_CASE_P(Core_AddScalarMixed, ArithmMixedTest,
+                        ::testing::Combine(::testing::Values(ElemWiseOpPtr(new AddSOp)),
+                                           ::testing::Values(std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_16U},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_16S},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_32F},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_32F})));
+INSTANTIATE_TEST_CASE_P(Core_AddWeightedMixed, ArithmMixedTest,
+                        ::testing::Combine(::testing::Values(ElemWiseOpPtr(new AddWeightedOp)),
+                                           ::testing::Values(std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_16U},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_16S},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_32F},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_32F})));
+INSTANTIATE_TEST_CASE_P(Core_SubMixed, ArithmMixedTest,
+                        ::testing::Combine(::testing::Values(ElemWiseOpPtr(new SubOp)),
+                                           ::testing::Values(std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_16U},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_16S},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_32F},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_32F})));
+INSTANTIATE_TEST_CASE_P(Core_SubScalarMinusArgMixed, ArithmMixedTest,
+                        ::testing::Combine(::testing::Values(ElemWiseOpPtr(new SubRSOp)),
+                                           ::testing::Values(std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_16U},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_16S},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_32F},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_32F})));
+INSTANTIATE_TEST_CASE_P(Core_MulMixed, ArithmMixedTest,
+                        ::testing::Combine(::testing::Values(ElemWiseOpPtr(new MulOp)),
+                                           ::testing::Values(std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_16U},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_16S},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_32F},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_32F})));
+INSTANTIATE_TEST_CASE_P(Core_MulScalarMixed, ArithmMixedTest,
+                        ::testing::Combine(::testing::Values(ElemWiseOpPtr(new MulSOp)),
+                                           ::testing::Values(std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_16U},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_16S},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_32F},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_32F})));
+INSTANTIATE_TEST_CASE_P(Core_DivMixed, ArithmMixedTest,
+                        ::testing::Combine(::testing::Values(ElemWiseOpPtr(new DivOp)),
+                                           ::testing::Values(std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_16U},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_16S},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_32F},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_32F})));
+INSTANTIATE_TEST_CASE_P(Core_RecipMixed, ArithmMixedTest,
+                        ::testing::Combine(::testing::Values(ElemWiseOpPtr(new RecipOp)),
+                                           ::testing::Values(std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8U, CV_32F},
+                                                             std::tuple<cvtest::MatDepth, cvtest::MatDepth>{CV_8S, CV_32F})));
 
 TEST(Core_ArithmMask, uninitialized)
 {
@@ -2268,6 +2442,139 @@ INSTANTIATE_TEST_CASE_P(Arithm, FlipND, testing::Combine(
     testing::Values(perf::MatType(CV_8UC1), CV_32FC1)
 ));
 
+TEST(BroadcastTo, basic) {
+    std::vector<int> shape_src{2, 1};
+    std::vector<int> data_src{1, 2};
+    Mat src(static_cast<int>(shape_src.size()), shape_src.data(), CV_32SC1, data_src.data());
+
+    auto get_index = [](const std::vector<int>& shape, size_t cnt) {
+        std::vector<int> index(shape.size());
+        size_t t = cnt;
+        for (int i = static_cast<int>(shape.size() - 1); i >= 0; --i) {
+            size_t idx = t / shape[i];
+            index[i] = static_cast<int>(t - idx * shape[i]);
+            t = idx;
+        }
+        return index;
+    };
+
+    auto fn_verify = [&get_index](const Mat& ref, const Mat& res) {
+        // check type
+        EXPECT_EQ(ref.type(), res.type());
+        // check shape
+        EXPECT_EQ(ref.dims, res.dims);
+        for (int i = 0; i < ref.dims; ++i) {
+            EXPECT_EQ(ref.size[i], res.size[i]);
+        }
+        // check value
+        std::vector<int> shape{ref.size.p, ref.size.p + ref.dims};
+        for (size_t i = 0; i < ref.total(); ++i) {
+            auto index = get_index(shape, i);
+            switch (ref.type()) {
+                case CV_32SC1: {
+                    ASSERT_EQ(ref.at<int>(index.data()), res.at<int>(index.data()));
+                } break;
+                case CV_8UC1: {
+                    ASSERT_EQ(ref.at<uint8_t>(index.data()), res.at<uint8_t>(index.data()));
+                } break;
+                case CV_32FC1: {
+                    ASSERT_EQ(ref.at<float>(index.data()), res.at<float>(index.data()));
+                } break;
+                default: FAIL() << "Unsupported type: " << ref.type();
+            }
+        }
+    };
+
+    {
+        std::vector<int> shape{4, 2, 3};
+        std::vector<int> data_ref{
+            1, 1, 1, // [0, 0, :]
+            2, 2, 2, // [0, 1, :]
+            1, 1, 1, // [1, 0, :]
+            2, 2, 2, // [1, 1, :]
+            1, 1, 1, // [2, 0, :]
+            2, 2, 2, // [2, 1, :]
+            1, 1, 1, // [3, 0, :]
+            2, 2, 2  // [3, 1, :]
+        };
+        Mat ref(static_cast<int>(shape.size()), shape.data(), src.type(), data_ref.data());
+        Mat dst;
+        broadcast(src, shape, dst);
+        fn_verify(ref, dst);
+    }
+
+    {
+        Mat _src;
+        src.convertTo(_src, CV_8U);
+        std::vector<int> shape{4, 2, 3};
+        std::vector<uint8_t> data_ref{
+            1, 1, 1, // [0, 0, :]
+            2, 2, 2, // [0, 1, :]
+            1, 1, 1, // [1, 0, :]
+            2, 2, 2, // [1, 1, :]
+            1, 1, 1, // [2, 0, :]
+            2, 2, 2, // [2, 1, :]
+            1, 1, 1, // [3, 0, :]
+            2, 2, 2  // [3, 1, :]
+        };
+        Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
+        Mat dst;
+        broadcast(_src, shape, dst);
+        fn_verify(ref, dst);
+    }
+
+    {
+        Mat _src;
+        src.convertTo(_src, CV_32F);
+        std::vector<int> shape{1, 1, 2, 1}; // {2, 1}
+        std::vector<float> data_ref{
+            1.f, // [0, 0, 0, 0]
+            2.f, // [0, 0, 1, 0]
+        };
+        Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
+        Mat dst;
+        broadcast(_src, shape, dst);
+        fn_verify(ref, dst);
+    }
+
+    {
+        std::vector<int> _shape_src{2, 3, 4};
+        std::vector<float> _data_src{
+            1.f, 2.f, 3.f, 4.f, // [0, 0, :]
+            2.f, 3.f, 4.f, 5.f, // [0, 1, :]
+            3.f, 4.f, 5.f, 6.f, // [0, 2, :]
+
+            4.f, 5.f, 6.f, 7.f, // [1, 0, :]
+            5.f, 6.f, 7.f, 8.f, // [1, 1, :]
+            6.f, 7.f, 8.f, 9.f, // [1, 2, :]
+        };
+        Mat _src(static_cast<int>(_shape_src.size()), _shape_src.data(), CV_32FC1, _data_src.data());
+
+        std::vector<int> shape{2, 1, 2, 3, 4};
+        std::vector<float> data_ref{
+            1.f, 2.f, 3.f, 4.f, // [0, 0, 0, 0, :]
+            2.f, 3.f, 4.f, 5.f, // [0, 0, 0, 1, :]
+            3.f, 4.f, 5.f, 6.f, // [0, 0, 0, 2, :]
+
+            4.f, 5.f, 6.f, 7.f, // [0, 0, 1, 0, :]
+            5.f, 6.f, 7.f, 8.f, // [0, 0, 1, 1, :]
+            6.f, 7.f, 8.f, 9.f, // [0, 0, 1, 2, :]
+
+            1.f, 2.f, 3.f, 4.f, // [1, 0, 0, 0, :]
+            2.f, 3.f, 4.f, 5.f, // [1, 0, 0, 1, :]
+            3.f, 4.f, 5.f, 6.f, // [1, 0, 0, 2, :]
+
+            4.f, 5.f, 6.f, 7.f, // [1, 0, 1, 0, :]
+            5.f, 6.f, 7.f, 8.f, // [1, 0, 1, 1, :]
+            6.f, 7.f, 8.f, 9.f, // [1, 0, 1, 2, :]
+        };
+        Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
+        Mat dst;
+        broadcast(_src, shape, dst);
+        fn_verify(ref, dst);
+    }
+}
+
 TEST(Core_minMaxIdx, regression_9207_2)
 {
     const int rows = 13;
@@ -2313,6 +2620,32 @@ TEST(Core_minMaxIdx, regression_9207_2)
     EXPECT_EQ(14, maxIdx[1]);
 }
 
+TEST(Core_MinMaxIdx, MatND)
+{
+    const int shape[3] = {5,5,3};
+    cv::Mat src = cv::Mat(3, shape, CV_8UC1);
+    src.setTo(1);
+    src.data[1] = 0;
+    src.data[5*5*3-2] = 2;
+
+    int minIdx[3];
+    int maxIdx[3];
+    double minVal, maxVal;
+
+    cv::minMaxIdx(src, &minVal, &maxVal, minIdx, maxIdx);
+
+    EXPECT_EQ(0, minVal);
+    EXPECT_EQ(2, maxVal);
+
+    EXPECT_EQ(0, minIdx[0]);
+    EXPECT_EQ(0, minIdx[1]);
+    EXPECT_EQ(1, minIdx[2]);
+
+    EXPECT_EQ(4, maxIdx[0]);
+    EXPECT_EQ(4, maxIdx[1]);
+    EXPECT_EQ(1, maxIdx[2]);
+}
+
 TEST(Core_Set, regression_11044)
 {
     Mat testFloat(Size(3, 3), CV_32FC1);
@@ -2674,7 +3007,6 @@ TEST(Core_MinMaxIdx, rows_overflow)
     }
 }
 
-
 TEST(Core_Magnitude, regression_19506)
 {
     for (int N = 1; N <= 64; ++N)
@@ -2686,12 +3018,26 @@ TEST(Core_Magnitude, regression_19506)
     }
 }
 
-TEST(Core_CartPolar, inplace)
+PARAM_TEST_CASE(Core_CartPolar_reverse, int, bool)
 {
-    RNG& rng = TS::ptr()->get_rng();
-    cv::Mat1d A[2] = {cv::Mat1d(10, 10), cv::Mat1d(10, 10)};
-    cv::Mat1d B[2], C[2];
+    int  depth;
+    bool angleInDegrees;
+
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        angleInDegrees = GET_PARAM(1);
+    }
+};
+
+TEST_P(Core_CartPolar_reverse, reverse)
+{
+    const int type = CV_MAKETYPE(depth, 1);
+    cv::Mat A[2] = {cv::Mat(10, 10, type), cv::Mat(10, 10, type)};
+    cv::Mat B[2], C[2];
     cv::UMat uA[2];
+    cv::UMat uB[2];
+    cv::UMat uC[2];
 
     for(int i = 0; i < 2; ++i)
     {
@@ -2700,22 +3046,155 @@ TEST(Core_CartPolar, inplace)
     }
 
     // Reverse
-    cv::cartToPolar(A[0], A[1], B[0], B[1], false);
-    cv::polarToCart(B[0], B[1], C[0], C[1], false);
+    cv::cartToPolar(A[0], A[1], B[0], B[1], angleInDegrees);
+    cv::polarToCart(B[0], B[1], C[0], C[1], angleInDegrees);
     EXPECT_MAT_NEAR(A[0], C[0], 2);
     EXPECT_MAT_NEAR(A[1], C[1], 2);
+}
 
-    // Inplace
-    EXPECT_THROW(cv::polarToCart(B[0], B[1], B[0], B[1], false), cv::Exception);
-    EXPECT_THROW(cv::polarToCart(B[0], B[1], B[1], B[0], false), cv::Exception);
-    EXPECT_THROW(cv::cartToPolar(A[0], A[1], A[0], A[1], false), cv::Exception);
-    EXPECT_THROW(cv::cartToPolar(A[0], A[1], A[1], A[0], false), cv::Exception);
-    // Inplace OCL
-    EXPECT_THROW(cv::polarToCart(uA[0], uA[1], uA[0], uA[1]), cv::Exception);
-    EXPECT_THROW(cv::polarToCart(uA[0], uA[1], uA[1], uA[0]), cv::Exception);
-    EXPECT_THROW(cv::cartToPolar(uA[0], uA[1], uA[0], uA[1]), cv::Exception);
-    EXPECT_THROW(cv::cartToPolar(uA[0], uA[1], uA[0], uA[1]), cv::Exception);
+INSTANTIATE_TEST_CASE_P(Core_CartPolar, Core_CartPolar_reverse,
+    testing::Combine(
+        testing::Values(CV_32F, CV_64F),
+        testing::Values(false, true)
+    )
+);
 
+PARAM_TEST_CASE(Core_CartToPolar_inplace, int, bool)
+{
+    int  depth;
+    bool angleInDegrees;
+
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        angleInDegrees = GET_PARAM(1);
+    }
+};
+
+TEST_P(Core_CartToPolar_inplace, inplace)
+{
+    const int type = CV_MAKETYPE(depth, 1);
+    cv::Mat A[2] = {cv::Mat(10, 10, type), cv::Mat(10, 10, type)};
+    cv::Mat B[2], C[2];
+    cv::UMat uA[2];
+    cv::UMat uB[2];
+    cv::UMat uC[2];
+
+    for(int i = 0; i < 2; ++i)
+    {
+        cvtest::randUni(rng, A[i], Scalar::all(-1000), Scalar::all(1000));
+        A[i].copyTo(uA[i]);
+    }
+
+    // Inplace x<->mag y<->angle
+    for(int i = 0; i < 2; ++i)
+        A[i].copyTo(B[i]);
+    cv::cartToPolar(A[0], A[1], C[0], C[1], angleInDegrees);
+    cv::cartToPolar(B[0], B[1], B[0], B[1], angleInDegrees);
+    EXPECT_MAT_NEAR(C[0], B[0], 2);
+    EXPECT_MAT_NEAR(C[1], B[1], 2);
+
+    // Inplace x<->angle y<->mag
+    for(int i = 0; i < 2; ++i)
+        A[i].copyTo(B[i]);
+    cv::cartToPolar(A[0], A[1], C[0], C[1], angleInDegrees);
+    cv::cartToPolar(B[0], B[1], B[1], B[0], angleInDegrees);
+    EXPECT_MAT_NEAR(C[0], B[1], 2);
+    EXPECT_MAT_NEAR(C[1], B[0], 2);
+
+    // Inplace OCL x<->mag y<->angle
+    for(int i = 0; i < 2; ++i)
+        uA[i].copyTo(uB[i]);
+    cv::cartToPolar(uA[0], uA[1], uC[0], uC[1], angleInDegrees);
+    cv::cartToPolar(uB[0], uB[1], uB[0], uB[1], angleInDegrees);
+    EXPECT_MAT_NEAR(uC[0], uB[0], 2);
+    EXPECT_MAT_NEAR(uC[1], uB[1], 2);
+
+    // Inplace OCL x<->angle y<->mag
+    for(int i = 0; i < 2; ++i)
+        uA[i].copyTo(uB[i]);
+    cv::cartToPolar(uA[0], uA[1], uC[0], uC[1], angleInDegrees);
+    cv::cartToPolar(uB[0], uB[1], uB[1], uB[0], angleInDegrees);
+    EXPECT_MAT_NEAR(uC[0], uB[1], 2);
+    EXPECT_MAT_NEAR(uC[1], uB[0], 2);
+}
+
+INSTANTIATE_TEST_CASE_P(Core_CartPolar, Core_CartToPolar_inplace,
+    testing::Combine(
+        testing::Values(CV_32F, CV_64F),
+        testing::Values(false, true)
+    )
+);
+
+PARAM_TEST_CASE(Core_PolarToCart_inplace, int, bool, bool)
+{
+    int  depth;
+    bool angleInDegrees;
+    bool implicitMagnitude;
+
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        angleInDegrees = GET_PARAM(1);
+        implicitMagnitude = GET_PARAM(2);
+    }
+};
+
+TEST_P(Core_PolarToCart_inplace, inplace)
+{
+    const int type = CV_MAKETYPE(depth, 1);
+    cv::Mat A[2] = {cv::Mat(10, 10, type), cv::Mat(10, 10, type)};
+    cv::Mat B[2], C[2];
+    cv::UMat uA[2];
+    cv::UMat uB[2];
+    cv::UMat uC[2];
+
+    for(int i = 0; i < 2; ++i)
+    {
+        cvtest::randUni(rng, A[i], Scalar::all(-1000), Scalar::all(1000));
+        A[i].copyTo(uA[i]);
+    }
+
+    // Inplace OCL x<->mag y<->angle
+    for(int i = 0; i < 2; ++i)
+        A[i].copyTo(B[i]);
+    cv::polarToCart(implicitMagnitude ? cv::noArray() : A[0], A[1], C[0], C[1], angleInDegrees);
+    cv::polarToCart(implicitMagnitude ? cv::noArray() : B[0], B[1], B[0], B[1], angleInDegrees);
+    EXPECT_MAT_NEAR(C[0], B[0], 2);
+    EXPECT_MAT_NEAR(C[1], B[1], 2);
+
+    // Inplace OCL x<->angle y<->mag
+    for(int i = 0; i < 2; ++i)
+        A[i].copyTo(B[i]);
+    cv::polarToCart(implicitMagnitude ? cv::noArray() : A[0], A[1], C[0], C[1], angleInDegrees);
+    cv::polarToCart(implicitMagnitude ? cv::noArray() : B[0], B[1], B[1], B[0], angleInDegrees);
+    EXPECT_MAT_NEAR(C[0], B[1], 2);
+    EXPECT_MAT_NEAR(C[1], B[0], 2);
+
+    // Inplace OCL x<->mag y<->angle
+    for(int i = 0; i < 2; ++i)
+        uA[i].copyTo(uB[i]);
+    cv::polarToCart(implicitMagnitude ? cv::noArray() : uA[0], uA[1], uC[0], uC[1], angleInDegrees);
+    cv::polarToCart(implicitMagnitude ? cv::noArray() : uB[0], uB[1], uB[0], uB[1], angleInDegrees);
+    EXPECT_MAT_NEAR(uC[0], uB[0], 2);
+    EXPECT_MAT_NEAR(uC[1], uB[1], 2);
+
+    // Inplace OCL x<->angle y<->mag
+    for(int i = 0; i < 2; ++i)
+        uA[i].copyTo(uB[i]);
+    cv::polarToCart(implicitMagnitude ? cv::noArray() : uA[0], uA[1], uC[0], uC[1], angleInDegrees);
+    cv::polarToCart(implicitMagnitude ? cv::noArray() : uB[0], uB[1], uB[1], uB[0], angleInDegrees);
+    EXPECT_MAT_NEAR(uC[0], uB[1], 2);
+    EXPECT_MAT_NEAR(uC[1], uB[0], 2);
 }
 
+INSTANTIATE_TEST_CASE_P(Core_CartPolar, Core_PolarToCart_inplace,
+    testing::Combine(
+        testing::Values(CV_32F, CV_64F),
+        testing::Values(false, true),
+        testing::Values(true, false)
+    )
+);
+
+
 }} // namespace
diff --git a/modules/core/test/test_async.cpp b/modules/core/test/test_async.cpp
index 58bcfddcd769..2fcee300cf7e 100644
--- a/modules/core/test/test_async.cpp
+++ b/modules/core/test/test_async.cpp
@@ -7,7 +7,7 @@
 
 #include <opencv2/core/bindings_utils.hpp>
 
-#if defined(CV_CXX11) && !defined(OPENCV_DISABLE_THREAD_SUPPORT)
+#if !defined(OPENCV_DISABLE_THREAD_SUPPORT)
 #include <thread>
 #include <chrono>
 #endif
@@ -85,7 +85,7 @@ TEST(Core_Async, LikePythonTest)
 }
 
 
-#if defined(CV_CXX11) && !defined(OPENCV_DISABLE_THREAD_SUPPORT)
+#if !defined(OPENCV_DISABLE_THREAD_SUPPORT)
 
 TEST(Core_Async, AsyncThread_Simple)
 {
diff --git a/modules/core/test/test_countnonzero.cpp b/modules/core/test/test_countnonzero.cpp
index fe14affb9c92..41eaceb1898f 100644
--- a/modules/core/test/test_countnonzero.cpp
+++ b/modules/core/test/test_countnonzero.cpp
@@ -259,7 +259,7 @@ TEST_P (CountNonZeroND, ndim)
     const int ONE_SIZE = 5;
 
     vector<int> sizes(dims);
-    fill(sizes.begin(), sizes.end(), ONE_SIZE);
+    std::fill(sizes.begin(), sizes.end(), ONE_SIZE);
 
     Mat data(sizes, CV_MAKETYPE(type, 1));
     data = 0;
diff --git a/modules/core/test/test_dxt.cpp b/modules/core/test/test_dxt.cpp
index 05d1f3062cc2..b2305fef9a41 100644
--- a/modules/core/test/test_dxt.cpp
+++ b/modules/core/test/test_dxt.cpp
@@ -97,7 +97,7 @@ static void DFT_1D( const Mat& _src, Mat& _dst, int flags, const Mat& _wave=Mat(
         }
     }
     else
-        CV_Error(CV_StsUnsupportedFormat, "");
+        CV_Error(cv::Error::StsUnsupportedFormat, "");
 }
 
 
@@ -878,7 +878,7 @@ class Core_DFTComplexOutputTest : public cvtest::BaseTest
             {
                 cout << "actual:\n" << dst << endl << endl;
                 cout << "reference:\n" << dstz << endl << endl;
-                CV_Error(CV_StsError, "");
+                CV_Error(cv::Error::StsError, "");
             }
         }
     }
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 01ca50a26e61..08138d194dcf 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -22,130 +22,6 @@ void test_hal_intrin_float16();
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
-template <typename R> struct Data;
-template <int N> struct initializer;
-
-#if CV_SIMD_SCALABLE
-template <> struct initializer<128>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
-        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
-        d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
-        d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63],
-        d[64], d[65], d[66], d[67], d[68], d[69], d[70], d[71], d[72], d[73], d[74], d[75], d[76], d[77], d[78], d[79],
-        d[80], d[81], d[82], d[83], d[84], d[85], d[86], d[87], d[88], d[89], d[90], d[91], d[92], d[93], d[94], d[95],
-        d[96], d[97], d[98], d[99], d[100], d[101], d[102], d[103], d[104], d[105], d[106], d[107], d[108], d[109], d[110], d[111],
-        d[112], d[113], d[114], d[115], d[116], d[117], d[118], d[119], d[120], d[121], d[122], d[123], d[124], d[125], d[126], d[127]});
-    }
-};
-
-template <> struct initializer<64>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
-        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
-        d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
-        d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63]});
-    }
-};
-
-template <> struct initializer<32>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
-        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31]});
-    }
-};
-
-template <> struct initializer<16>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]});
-    }
-};
-
-template <> struct initializer<8>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]});
-    }
-};
-
-template <> struct initializer<4>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return v_load({d[0], d[1], d[2], d[3]});
-    }
-};
-
-template <> struct initializer<2>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return v_load({d[0], d[1]});
-    }
-};
-
-#else
-template <> struct initializer<64>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
-        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31],
-        d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47],
-        d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63]);
-    }
-};
-
-template <> struct initializer<32>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15],
-        d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31]);
-    }
-};
-
-template <> struct initializer<16>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]);
-    }
-};
-
-template <> struct initializer<8>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return R(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
-    }
-};
-
-template <> struct initializer<4>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return R(d[0], d[1], d[2], d[3]);
-    }
-};
-
-template <> struct initializer<2>
-{
-    template <typename R> static R init(const Data<R> & d)
-    {
-        return R(d[0], d[1]);
-    }
-};
-#endif
 //==================================================================================================
 
 template <typename R> struct Data
@@ -168,7 +44,8 @@ template <typename R> struct Data
     }
     operator R () const
     {
-        return initializer<VTraits<R>::max_nlanes>().init(*this);
+        CV_Assert(VTraits<R>::vlanes() <= VTraits<R>::max_nlanes);
+        return vx_load(d);
     }
     Data<R> & operator=(const R & r)
     {
@@ -404,7 +281,7 @@ template<typename R> struct TheTest
         v_uint64 vu64 = v_reinterpret_as_u64(r1); out.a.clear(); v_store((uint64*)out.a.d, vu64); EXPECT_EQ(data.a, out.a);
         v_int64 vs64 = v_reinterpret_as_s64(r1); out.a.clear(); v_store((int64*)out.a.d, vs64); EXPECT_EQ(data.a, out.a);
         v_float32 vf32 = v_reinterpret_as_f32(r1); out.a.clear(); v_store((float*)out.a.d, vf32); EXPECT_EQ(data.a, out.a);
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
         v_float64 vf64 = v_reinterpret_as_f64(r1); out.a.clear(); v_store((double*)out.a.d, vf64); EXPECT_EQ(data.a, out.a);
 #endif
 
@@ -870,7 +747,7 @@ template<typename R> struct TheTest
 
     TheTest & test_dotprod_expand_f64()
     {
-    #if CV_SIMD_64F
+    #if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
         Data<R> dataA, dataB;
         dataA += std::numeric_limits<LaneType>::max() - VTraits<R>::vlanes();
         dataB += std::numeric_limits<LaneType>::min();
@@ -1475,12 +1352,15 @@ template<typename R> struct TheTest
     TheTest & test_float_math()
     {
         typedef typename V_RegTraits<R>::round_reg Ri;
-        Data<R> data1, data2, data3;
+        Data<R> data1, data1_border, data2, data3;
+        // See https://github.com/opencv/opencv/issues/24213
+        data1_border *= 0.5;
         data1 *= 1.1;
         data2 += 10;
-        R a1 = data1, a2 = data2, a3 = data3;
+        R a1 = data1, a1_border = data1_border, a2 = data2, a3 = data3;
 
         Data<Ri> resB = v_round(a1),
+                 resB_border = v_round(a1_border),
                  resC = v_trunc(a1),
                  resD = v_floor(a1),
                  resE = v_ceil(a1);
@@ -1493,6 +1373,7 @@ template<typename R> struct TheTest
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(cvRound(data1[i]), resB[i]);
+            EXPECT_EQ(cvRound(data1_border[i]), resB_border[i]);
             EXPECT_EQ((typename VTraits<Ri>::lane_type)data1[i], resC[i]);
             EXPECT_EQ(cvFloor(data1[i]), resD[i]);
             EXPECT_EQ(cvCeil(data1[i]), resE[i]);
@@ -1504,6 +1385,33 @@ template<typename R> struct TheTest
 
         return *this;
     }
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    TheTest & test_round_pair_f64()
+    {
+        typedef typename V_RegTraits<R>::round_reg Ri;
+        Data<R> data1, data1_border, data2;
+        // See https://github.com/opencv/opencv/issues/24213
+        // https://github.com/opencv/opencv/issues/24163
+        // https://github.com/opencv/opencv/pull/24271
+        data1_border *= 0.5;
+        data1 *= 1.1;
+        data2 += 10;
+        R a1 = data1, a1_border = data1_border, a2 = data2;
+
+        Data<Ri> resA = v_round(a1, a1),
+                 resB = v_round(a1_border, a1_border),
+                 resC = v_round(a2, a2);
+
+        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
+        {
+            EXPECT_EQ(cvRound(data1[i]), resA[i]);
+            EXPECT_EQ(cvRound(data1_border[i]), resB[i]);
+            EXPECT_EQ(cvRound(data2[i]), resC[i]);
+        }
+
+        return *this;
+    }
+#endif
 
     TheTest & test_float_cvt32()
     {
@@ -1524,7 +1432,7 @@ template<typename R> struct TheTest
 
     TheTest & test_float_cvt64()
     {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
         typedef v_float64 Rt;
         Data<R> dataA;
         dataA *= 1.1;
@@ -1550,7 +1458,7 @@ template<typename R> struct TheTest
 
     TheTest & test_cvt64_double()
     {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
         Data<R> dataA(std::numeric_limits<LaneType>::max()),
                 dataB(std::numeric_limits<LaneType>::min());
         dataB += VTraits<R>::vlanes();
@@ -1676,14 +1584,14 @@ template<typename R> struct TheTest
         AlignedData<v_float32> data_f32; data_f32.a.clear();
         AlignedData<v_uint16> out;
 
-        R r1 = vx_load_expand((const cv::float16_t*)data.a.d);
+        R r1 = vx_load_expand((const cv::hfloat*)data.a.d);
         R r2(r1);
         EXPECT_EQ(1.0f, v_get0(r1));
         v_store(data_f32.a.d, r2);
         EXPECT_EQ(-2.0f, data_f32.a.d[VTraits<R>::vlanes() - 1]);
 
         out.a.clear();
-        v_pack_store((cv::float16_t*)out.a.d, r2);
+        v_pack_store((cv::hfloat*)out.a.d, r2);
         for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
             EXPECT_EQ(data.a[i], out.a[i]) << "i=" << i;
@@ -1707,7 +1615,7 @@ template<typename R> struct TheTest
 
         // check some initialization methods
         R r1 = data.u;
-        R r2 = vx_load_expand((const float16_t*)data.a.d);
+        R r2 = vx_load_expand((const hfloat*)data.a.d);
         R r3(r2);
         EXPECT_EQ(data.u[0], v_get0(r1));
         EXPECT_EQ(data.a[0], v_get0(r2));
@@ -1741,13 +1649,8 @@ template<typename R> struct TheTest
         R a = dataA;
         R b = dataB;
 
-#if CV_SIMD_SCALABLE
         Data<R> dataEQ = v_eq(a, b);
         Data<R> dataNE = v_ne(a, b);
-#else
-        Data<R> dataEQ = (a == b);
-        Data<R> dataNE = (a != b);
-#endif
 
         for (int i = 0; i < VTraits<R>::vlanes(); ++i)
         {
@@ -2048,6 +1951,7 @@ void test_hal_intrin_uint64()
         .test_rotate<0>().test_rotate<1>()
         .test_extract_n<0>().test_extract_n<1>()
         .test_extract_highest()
+        .test_popcount()
         //.test_broadcast_element<0>().test_broadcast_element<1>()
         ;
 }
@@ -2069,6 +1973,7 @@ void test_hal_intrin_int64()
         .test_extract_highest()
         //.test_broadcast_element<0>().test_broadcast_element<1>()
         .test_cvt64_double()
+        .test_popcount()
         ;
 }
 
@@ -2116,7 +2021,7 @@ void test_hal_intrin_float32()
 void test_hal_intrin_float64()
 {
     DUMP_ENTRY(v_float64);
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     TheTest<v_float64>()
         .test_loadstore()
         .test_addsub()
@@ -2130,6 +2035,7 @@ void test_hal_intrin_float64()
         .test_mask()
         .test_unpack()
         .test_float_math()
+        .test_round_pair_f64()
         .test_float_cvt32()
         .test_reverse()
         .test_extract<0>().test_extract<1>()
diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp
index 5e1f6d7a8ebd..16b66e75ee78 100644
--- a/modules/core/test/test_io.cpp
+++ b/modules/core/test/test_io.cpp
@@ -435,6 +435,8 @@ class CV_MiscIOTest : public cvtest::BaseTest
                 CV_Assert( ov1 == v1 );
                 CV_Assert( osc1 == sc1 );
                 CV_Assert( og1 == g1 );
+                fs.release();
+                remove(fname.c_str());
             }
             catch(...)
             {
@@ -489,6 +491,7 @@ TEST(Core_InputOutput, FileStorage)
     char arr[66];
     snprintf(arr, sizeof(arr), "snprintf is hell %d", 666);
     EXPECT_NO_THROW(f << arr);
+    remove(file.c_str());
 }
 
 TEST(Core_InputOutput, FileStorageKey)
@@ -534,6 +537,7 @@ TEST(Core_InputOutput, FileStorageSpaces)
         ASSERT_STREQ(values[i].c_str(), valuesReadAppend[i].c_str());
     }
     g3.release();
+    EXPECT_EQ(0, remove(fileName.c_str()));
 }
 
 struct data_t
@@ -585,12 +589,15 @@ struct data_t
 
 static void test_filestorage_basic(int write_flags, const char* suffix_name, bool testReadWrite, bool useMemory = false)
 {
+    const bool generateTestData = false; // enable to regenerate reference in opencv_extra
     const ::testing::TestInfo* const test_info = ::testing::UnitTest::GetInstance()->current_test_info();
     CV_Assert(test_info);
     std::string name = (std::string(test_info->test_case_name()) + "--" + test_info->name() + suffix_name);
     std::string name_34 = string(cvtest::TS::ptr()->get_data_path()) + "io/3_4/" + name;
-    if (!testReadWrite)
+    if (!testReadWrite || generateTestData)
         name = string(cvtest::TS::ptr()->get_data_path()) + "io/" + name;
+    else
+        name = cv::tempfile(name.c_str());
 
     {
         const size_t rawdata_N = 40;
@@ -636,10 +643,7 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, boo
                 rawdata.push_back(tmp);
             }
         }
-#ifdef GENERATE_TEST_DATA
-#else
-        if (testReadWrite || useMemory)
-#endif
+        if (testReadWrite || useMemory || generateTestData)
         {
             cv::FileStorage fs(name, write_flags + (useMemory ? cv::FileStorage::MEMORY : 0));
             fs << "normal_2d_mat" << _2d_out;
@@ -684,6 +688,7 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, boo
             }
             std::cout << "Storage size: " << sz << std::endl;
             EXPECT_LE(sz, (size_t)6000);
+
         }
         {   /* read */
             cv::FileStorage fs(name, cv::FileStorage::READ + (useMemory ? cv::FileStorage::MEMORY : 0));
@@ -761,6 +766,10 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, boo
         ASSERT_EQ(_rd_in.dims   , _rd_out.dims);
         ASSERT_EQ(_rd_in.depth(), _rd_out.depth());
         EXPECT_EQ(0, cv::norm(_rd_in, _rd_out, NORM_INF));
+        if (testReadWrite && !useMemory && !generateTestData)
+        {
+            EXPECT_EQ(0, remove(name.c_str()));
+        }
     }
 }
 
@@ -807,7 +816,7 @@ TEST(Core_InputOutput, filestorage_heap_overflow)
     const ::testing::TestInfo* const test_info = ::testing::UnitTest::GetInstance()->current_test_info();
     CV_Assert(test_info);
 
-    std::string name = std::string(test_info->test_case_name()) + "--" + test_info->name();
+    std::string name = cv::tempfile();
     const char data[] = {0x00, 0x2f, 0x4a, 0x4a, 0x50, 0x4a, 0x4a };
 
     std::ofstream file;
@@ -819,6 +828,7 @@ TEST(Core_InputOutput, filestorage_heap_overflow)
 
     // This just shouldn't segfault, otherwise it's fine
     EXPECT_ANY_THROW(FileStorage(name, FileStorage::READ));
+    EXPECT_EQ(0, remove(name.c_str()));
 }
 
 TEST(Core_InputOutput, filestorage_base64_valid_call)
@@ -829,18 +839,6 @@ TEST(Core_InputOutput, filestorage_base64_valid_call)
         : (std::string(test_info->test_case_name()) + "--" + test_info->name());
 
     char const * filenames[] = {
-        "core_io_base64_other_test.yml",
-        "core_io_base64_other_test.xml",
-        "core_io_base64_other_test.json",
-        "core_io_base64_other_test.yml?base64",
-        "core_io_base64_other_test.xml?base64",
-        "core_io_base64_other_test.json?base64",
-        0
-    };
-    char const * real_name[] = {
-        "core_io_base64_other_test.yml",
-        "core_io_base64_other_test.xml",
-        "core_io_base64_other_test.json",
         "core_io_base64_other_test.yml",
         "core_io_base64_other_test.xml",
         "core_io_base64_other_test.json",
@@ -852,14 +850,16 @@ TEST(Core_InputOutput, filestorage_base64_valid_call)
 
     for (int n = 0; n < 6; n++)
     {
-        char const* suffix_name = filenames[n];
-        SCOPED_TRACE(suffix_name);
-        std::string name = basename + '_' + suffix_name;
-        std::string file_name = basename + '_' + real_name[n];
+        const int idx = n / 2;
+        const std::string mode_suffix = (n % 2 == 0) ? "" : "?base64";
+        std::string suffix_name = basename + "_" + filenames[idx];
+        std::string file_name = cv::tempfile(suffix_name.c_str());
+        std::string mode_file_name = file_name + mode_suffix;
+        SCOPED_TRACE(mode_file_name);
 
         EXPECT_NO_THROW(
         {
-            cv::FileStorage fs(name, cv::FileStorage::WRITE_BASE64);
+            cv::FileStorage fs(mode_file_name, cv::FileStorage::WRITE_BASE64);
 
             fs << "manydata" << "[";
             fs << "[:";
@@ -887,7 +887,7 @@ TEST(Core_InputOutput, filestorage_base64_valid_call)
 
         EXPECT_NO_THROW(
         {
-            cv::FileStorage fs(name, cv::FileStorage::WRITE);
+            cv::FileStorage fs(mode_file_name, cv::FileStorage::WRITE);
 
             fs << "manydata" << "[";
             fs << str_out;
@@ -931,10 +931,10 @@ TEST(Core_InputOutput, filestorage_base64_invalid_call)
         0
     };
 
-    for (char const ** ptr = filenames; *ptr; ptr++)
+    for (int idx = 0; idx < 3; ++idx)
     {
-        char const * suffix_name = *ptr;
-        std::string name = basename + '_' + suffix_name;
+        const string base_suffix = basename + '_' + filenames[idx];
+        std::string name = cv::tempfile(base_suffix.c_str());
 
         EXPECT_NO_THROW({
             cv::FileStorage fs(name, cv::FileStorage::WRITE);
@@ -955,7 +955,7 @@ TEST(Core_InputOutput, filestorage_base64_invalid_call)
 
 TEST(Core_InputOutput, filestorage_yml_vec2i)
 {
-    const std::string file_name = "vec2i.yml";
+    const std::string file_name = cv::tempfile("vec2i.yml");
     cv::Vec2i vec(2, 1), ovec;
 
     /* write */
@@ -1037,7 +1037,7 @@ TEST(Core_InputOutput, filestorage_vec_vec_io)
         }
     }
 
-    String fileName = "vec_vec_io_test.";
+    String basename = "vec_vec_io_test.";
 
     std::vector<String> formats;
     formats.push_back("xml");
@@ -1046,11 +1046,13 @@ TEST(Core_InputOutput, filestorage_vec_vec_io)
 
     for(size_t i = 0; i < formats.size(); i++)
     {
-        FileStorage writer(fileName + formats[i], FileStorage::WRITE);
+        const String basename_plus(basename + formats[i]);
+        const String fileName = tempfile(basename_plus.c_str());
+        FileStorage writer(fileName, FileStorage::WRITE);
         writer << "vecVecMat" << outputMats;
         writer.release();
 
-        FileStorage reader(fileName + formats[i], FileStorage::READ);
+        FileStorage reader(fileName, FileStorage::READ);
         std::vector<std::vector<Mat> > testMats;
         reader["vecVecMat"] >> testMats;
 
@@ -1067,7 +1069,7 @@ TEST(Core_InputOutput, filestorage_vec_vec_io)
         }
 
         reader.release();
-        remove((fileName + formats[i]).c_str());
+        remove(fileName.c_str());
     }
 }
 
@@ -1187,11 +1189,7 @@ TEST(Core_InputOutput, FileStorage_DMatch)
 
     EXPECT_NO_THROW(fs << "d" << d);
     cv::String fs_result = fs.releaseAndGetString();
-#if defined _MSC_VER && _MSC_VER <= 1800 /* MSVC 2013 and older */
-    EXPECT_STREQ(fs_result.c_str(), "%YAML:1.0\n---\nd: [ 1, 2, 3, -1.5000000000000000e+000 ]\n");
-#else
-    EXPECT_STREQ(fs_result.c_str(), "%YAML:1.0\n---\nd: [ 1, 2, 3, -1.5000000000000000e+00 ]\n");
-#endif
+    EXPECT_STREQ(fs_result.c_str(), "%YAML:1.0\n---\nd: [ 1, 2, 3, -1.5 ]\n");
 
     cv::FileStorage fs_read(fs_result, cv::FileStorage::READ | cv::FileStorage::MEMORY);
 
@@ -1218,25 +1216,14 @@ TEST(Core_InputOutput, FileStorage_DMatch_vector)
 
     EXPECT_NO_THROW(fs << "dv" << dv);
     cv::String fs_result = fs.releaseAndGetString();
-#if defined _MSC_VER && _MSC_VER <= 1800 /* MSVC 2013 and older */
-    EXPECT_STREQ(fs_result.c_str(),
-"%YAML:1.0\n"
-"---\n"
-"dv:\n"
-"   - [ 1, 2, 3, -1.5000000000000000e+000 ]\n"
-"   - [ 2, 3, 4, 1.5000000000000000e+000 ]\n"
-"   - [ 3, 2, 1, 5.0000000000000000e-001 ]\n"
-);
-#else
     EXPECT_STREQ(fs_result.c_str(),
 "%YAML:1.0\n"
 "---\n"
 "dv:\n"
-"   - [ 1, 2, 3, -1.5000000000000000e+00 ]\n"
-"   - [ 2, 3, 4, 1.5000000000000000e+00 ]\n"
-"   - [ 3, 2, 1, 5.0000000000000000e-01 ]\n"
+"   - [ 1, 2, 3, -1.5 ]\n"
+"   - [ 2, 3, 4, 1.5 ]\n"
+"   - [ 3, 2, 1, 0.5 ]\n"
 );
-#endif
 
     cv::FileStorage fs_read(fs_result, cv::FileStorage::READ | cv::FileStorage::MEMORY);
 
@@ -1276,33 +1263,18 @@ TEST(Core_InputOutput, FileStorage_DMatch_vector_vector)
     EXPECT_NO_THROW(fs << "dvv" << dvv);
     cv::String fs_result = fs.releaseAndGetString();
 #ifndef OPENCV_TRAITS_ENABLE_DEPRECATED
-#if defined _MSC_VER && _MSC_VER <= 1800 /* MSVC 2013 and older */
-    EXPECT_STREQ(fs_result.c_str(),
-"%YAML:1.0\n"
-"---\n"
-"dvv:\n"
-"   -\n"
-"      - [ 1, 2, 3, -1.5000000000000000e+000 ]\n"
-"      - [ 2, 3, 4, 1.5000000000000000e+000 ]\n"
-"      - [ 3, 2, 1, 5.0000000000000000e-001 ]\n"
-"   -\n"
-"      - [ 3, 2, 1, 5.0000000000000000e-001 ]\n"
-"      - [ 1, 2, 3, -1.5000000000000000e+000 ]\n"
-);
-#else
     EXPECT_STREQ(fs_result.c_str(),
 "%YAML:1.0\n"
 "---\n"
 "dvv:\n"
 "   -\n"
-"      - [ 1, 2, 3, -1.5000000000000000e+00 ]\n"
-"      - [ 2, 3, 4, 1.5000000000000000e+00 ]\n"
-"      - [ 3, 2, 1, 5.0000000000000000e-01 ]\n"
+"      - [ 1, 2, 3, -1.5 ]\n"
+"      - [ 2, 3, 4, 1.5 ]\n"
+"      - [ 3, 2, 1, 0.5 ]\n"
 "   -\n"
-"      - [ 3, 2, 1, 5.0000000000000000e-01 ]\n"
-"      - [ 1, 2, 3, -1.5000000000000000e+00 ]\n"
+"      - [ 3, 2, 1, 0.5 ]\n"
+"      - [ 1, 2, 3, -1.5 ]\n"
 );
-#endif
 #endif // OPENCV_TRAITS_ENABLE_DEPRECATED
 
     cv::FileStorage fs_read(fs_result, cv::FileStorage::READ | cv::FileStorage::MEMORY);
@@ -1658,7 +1630,7 @@ TEST(Core_InputOutput, FileStorage_json_bool)
 
 TEST(Core_InputOutput, FileStorage_free_file_after_exception)
 {
-    const std::string fileName = "FileStorage_free_file_after_exception_test.yml";
+    const std::string fileName = cv::tempfile("FileStorage_free_file_after_exception_test.yml");
     const std::string content = "%YAML:1.0\n cameraMatrix;:: !<tag:yaml.org,2002:opencv-matrix>\n";
 
     std::fstream testFile;
@@ -1681,11 +1653,11 @@ TEST(Core_InputOutput, FileStorage_free_file_after_exception)
 TEST(Core_InputOutput, FileStorage_write_to_sequence)
 {
     const std::vector<std::string> formatExts = { ".yml", ".json", ".xml" };
-    const std::string fileName = "FileStorage_write_to_sequence";
-
     for (const auto& ext : formatExts)
     {
-        FileStorage fs(fileName + ext, FileStorage::WRITE);
+        const std::string name = tempfile(ext.c_str());
+
+        FileStorage fs(name, FileStorage::WRITE);
         std::vector<int> in = { 23, 42 };
         fs.startWriteStruct("some_sequence", cv::FileNode::SEQ);
         for (int i : in)
@@ -1693,7 +1665,7 @@ TEST(Core_InputOutput, FileStorage_write_to_sequence)
         fs.endWriteStruct();
         fs.release();
 
-        FileStorage fsIn(fileName + ext, FileStorage::READ);
+        FileStorage fsIn(name, FileStorage::READ);
         FileNode seq = fsIn["some_sequence"];
         FileNodeIterator it = seq.begin(), it_end = seq.end();
         std::vector<int> out;
@@ -1701,12 +1673,13 @@ TEST(Core_InputOutput, FileStorage_write_to_sequence)
             out.push_back((int)*it);
 
         EXPECT_EQ(in, out);
+        EXPECT_EQ(0, remove(name.c_str()));
     }
 }
 
 TEST(Core_InputOutput, FileStorage_YAML_parse_multiple_documents)
 {
-    const std::string filename = "FileStorage_YAML_parse_multiple_documents.yml";
+    const std::string filename = cv::tempfile("FileStorage_YAML_parse_multiple_documents.yml");
     FileStorage fs;
 
     fs.open(filename, FileStorage::WRITE);
@@ -1963,5 +1936,53 @@ TEST(Core_InputOutput, FileStorage_invalid_path_regression_21448_JSON)
     fs.release();
 }
 
+// see https://github.com/opencv/opencv/issues/25073
+typedef testing::TestWithParam< std::string > Core_InputOutput_regression_25073;
+
+TEST_P(Core_InputOutput_regression_25073, my_double)
+{
+    cv::String res = "";
+    double my_double = 0.5;
+
+    FileStorage fs( GetParam(), cv::FileStorage::WRITE | cv::FileStorage::MEMORY);
+    EXPECT_NO_THROW( fs << "my_double" << my_double );
+    EXPECT_NO_THROW( fs << "my_int" << 5 );
+    EXPECT_NO_THROW( res = fs.releaseAndGetString() );
+    EXPECT_NE( res.find("0.5"), String::npos ) << res; // Found "0.5"
+    EXPECT_EQ( res.find("5.0"), String::npos ) << res; // Not Found "5.000000000000000000e-01"
+    fs.release();
+}
+
+TEST_P(Core_InputOutput_regression_25073, my_float)
+{
+    cv::String res = "";
+    float my_float = 0.5;
+
+    FileStorage fs( GetParam(), cv::FileStorage::WRITE | cv::FileStorage::MEMORY);
+    EXPECT_NO_THROW( fs << "my_float" << my_float );
+    EXPECT_NO_THROW( fs << "my_int" << 5 );
+    EXPECT_NO_THROW( res = fs.releaseAndGetString() );
+    EXPECT_NE( res.find("0.5"), String::npos ) << res; // Found "0.5"
+    EXPECT_EQ( res.find("5.0"), String::npos ) << res; // Not Found "5.00000000e-01",
+    fs.release();
+}
+
+TEST_P(Core_InputOutput_regression_25073, my_hfloat)
+{
+    cv::String res = "";
+    cv::hfloat my_hfloat(0.5);
+
+    FileStorage fs( GetParam(), cv::FileStorage::WRITE | cv::FileStorage::MEMORY);
+    EXPECT_NO_THROW( fs << "my_hfloat" << my_hfloat );
+    EXPECT_NO_THROW( fs << "my_int" << 5 );
+    EXPECT_NO_THROW( res = fs.releaseAndGetString() );
+    EXPECT_NE( res.find("0.5"), String::npos ) << res; // Found "0.5".
+    EXPECT_EQ( res.find("5.0"), String::npos ) << res; // Not Found "5.0000e-01".
+    fs.release();
+}
+
+INSTANTIATE_TEST_CASE_P( /*nothing*/,
+    Core_InputOutput_regression_25073,
+    Values("test.json", "test.xml", "test.yml") );
 
 }} // namespace
diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp
index 2d6019eac4d3..d13fd96f5776 100644
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@@ -18,7 +18,7 @@ class Core_ReduceTest : public cvtest::BaseTest
 public:
     Core_ReduceTest() {}
 protected:
-    void run( int);
+    void run( int) CV_OVERRIDE;
     int checkOp( const Mat& src, int dstType, int opType, const Mat& opRes, int dim );
     int checkCase( int srcType, int dstType, int dim, Size sz );
     int checkDim( int dim, Size sz );
@@ -474,12 +474,13 @@ TEST(Core_PCA, accuracy)
     ASSERT_LE(err, diffBackPrjEps) << "bad accuracy of cvBackProjectPCA() (CV_PCA_DATA_AS_COL)";
 #endif
     // Test read and write
-    FileStorage fs( "PCA_store.yml", FileStorage::WRITE );
+    const std::string filename = cv::tempfile("PCA_store.yml");
+    FileStorage fs( filename, FileStorage::WRITE );
     rPCA.write( fs );
     fs.release();
 
     PCA lPCA;
-    fs.open( "PCA_store.yml", FileStorage::READ );
+    fs.open( filename, FileStorage::READ );
     lPCA.read( fs.root() );
     err = cvtest::norm(rPCA.eigenvectors, lPCA.eigenvectors, NORM_L2 | NORM_RELATIVE);
     EXPECT_LE(err, 0) << "bad accuracy of write/load functions (YML)";
@@ -487,6 +488,7 @@ TEST(Core_PCA, accuracy)
     EXPECT_LE(err, 0) << "bad accuracy of write/load functions (YML)";
     err = cvtest::norm(rPCA.mean, lPCA.mean, NORM_L2 | NORM_RELATIVE);
     EXPECT_LE(err, 0) << "bad accuracy of write/load functions (YML)";
+    EXPECT_EQ(0, remove(filename.c_str()));
 }
 
 class Core_ArrayOpTest : public cvtest::BaseTest
@@ -495,7 +497,7 @@ class Core_ArrayOpTest : public cvtest::BaseTest
     Core_ArrayOpTest();
     ~Core_ArrayOpTest();
 protected:
-    void run(int);
+    void run(int) CV_OVERRIDE;
 };
 
 
@@ -596,9 +598,14 @@ static void setValue(SparseMat& M, const int* idx, double value, RNG& rng)
     else if( M.type() == CV_64F )
         *(double*)ptr = value;
     else
-        CV_Error(CV_StsUnsupportedFormat, "");
+        CV_Error(cv::Error::StsUnsupportedFormat, "");
 }
 
+#if defined(__GNUC__) && (__GNUC__ >= 11)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+
 template<typename Pixel>
 struct InitializerFunctor{
     /// Initializer for cv::Mat::forEach test
@@ -621,6 +628,11 @@ struct InitializerFunctor5D{
     }
 };
 
+#if defined(__GNUC__) && (__GNUC__ == 11 || __GNUC__ == 12)
+#pragma GCC diagnostic pop
+#endif
+
+
 template<typename Pixel>
 struct EmptyFunctor
 {
@@ -1023,7 +1035,7 @@ class Core_MergeSplitBaseTest : public cvtest::BaseTest
 protected:
     virtual int run_case(int depth, size_t channels, const Size& size, RNG& rng) = 0;
 
-    virtual void run(int)
+    virtual void run(int) CV_OVERRIDE
     {
         // m is Mat
         // mv is vector<Mat>
@@ -1068,7 +1080,7 @@ class Core_MergeTest : public Core_MergeSplitBaseTest
     ~Core_MergeTest() {}
 
 protected:
-    virtual int run_case(int depth, size_t matCount, const Size& size, RNG& rng)
+    virtual int run_case(int depth, size_t matCount, const Size& size, RNG& rng) CV_OVERRIDE
     {
         const int maxMatChannels = 10;
 
@@ -1126,7 +1138,7 @@ class Core_SplitTest : public Core_MergeSplitBaseTest
     ~Core_SplitTest() {}
 
 protected:
-    virtual int run_case(int depth, size_t channels, const Size& size, RNG& rng)
+    virtual int run_case(int depth, size_t channels, const Size& size, RNG& rng) CV_OVERRIDE
     {
         Mat src(size, CV_MAKETYPE(depth, (int)channels));
         rng.fill(src, RNG::UNIFORM, 0, 100, true);
@@ -1358,6 +1370,18 @@ TEST(Core_Mat, copyNx1ToVector)
     ASSERT_PRED_FORMAT2(cvtest::MatComparator(0, 0), ref_dst16, cv::Mat_<ushort>(dst16));
 }
 
+TEST(Core_Mat, copyMakeBoderUndefinedBehavior)
+{
+    Mat1b src(4, 4), dst;
+    randu(src, Scalar(10), Scalar(100));
+    // This could trigger a (signed int)*size_t operation which is undefined behavior.
+    cv::copyMakeBorder(src, dst, 1, 1, 1, 1, cv::BORDER_REFLECT_101);
+    EXPECT_EQ(0, cv::norm(src.row(1), dst(Rect(1,0,4,1))));
+    EXPECT_EQ(0, cv::norm(src.row(2), dst(Rect(1,5,4,1))));
+    EXPECT_EQ(0, cv::norm(src.col(1), dst(Rect(0,1,1,4))));
+    EXPECT_EQ(0, cv::norm(src.col(2), dst(Rect(5,1,1,4))));
+}
+
 TEST(Core_Matx, fromMat_)
 {
     Mat_<double> a = (Mat_<double>(2,2) << 10, 11, 12, 13);
@@ -1990,7 +2014,6 @@ TEST(Core_InputArray, fetch_MatExpr)
 }
 
 
-#ifdef CV_CXX11
 class TestInputArrayRangeChecking {
     static const char *kind2str(cv::_InputArray ia)
     {
@@ -2137,8 +2160,6 @@ TEST(Core_InputArray, range_checking)
 {
     TestInputArrayRangeChecking::run();
 }
-#endif
-
 
 TEST(Core_Vectors, issue_13078)
 {
diff --git a/modules/core/test/test_misc.cpp b/modules/core/test/test_misc.cpp
index 8ed0afe77160..39d0788d645a 100644
--- a/modules/core/test/test_misc.cpp
+++ b/modules/core/test/test_misc.cpp
@@ -8,10 +8,8 @@
 
 #include <opencv2/core/utils/fp_control_utils.hpp>
 
-#ifdef CV_CXX11
 #include <chrono>
 #include <thread>
-#endif
 
 namespace opencv_test { namespace {
 
@@ -282,9 +280,7 @@ class FPDenormalsHintCheckerParallelLoopBody : public cv::ParallelLoopBody
             // FP state is not supported
             // no checks
         }
-#ifdef CV_CXX11
         std::this_thread::sleep_for(std::chrono::milliseconds(100));
-#endif
     }
 
     cv::details::FPDenormalsModeState base_state;
@@ -917,5 +913,28 @@ REGISTER_TYPED_TEST_CASE_P(Rect_Test, Overflows);
 typedef ::testing::Types<int, float, double> RectTypes;
 INSTANTIATE_TYPED_TEST_CASE_P(Negative_Test, Rect_Test, RectTypes);
 
+// Expected that SkipTestException thrown in the constructor should skip test but not fail
+struct TestFixtureSkip: public ::testing::Test {
+    TestFixtureSkip(bool throwEx = true) {
+        if (throwEx) {
+            throw SkipTestException("Skip test at constructor");
+        }
+    }
+};
+
+TEST_F(TestFixtureSkip, NoBodyRun) {
+    FAIL() << "Unreachable code called";
+}
+
+// Expected that SkipTestException thrown in SetUp method should skip test but not fail
+struct TestSetUpSkip: public ::testing::Test {
+    virtual void SetUp() CV_OVERRIDE {
+        throw SkipTestException("Skip test at SetUp");
+    }
+};
+
+TEST_F(TestSetUpSkip, NoBodyRun) {
+    FAIL() << "Unreachable code called";
+}
 
 }} // namespace
diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp
index 934028f3aec5..d5622dabb46f 100644
--- a/modules/core/test/test_operations.cpp
+++ b/modules/core/test/test_operations.cpp
@@ -1379,6 +1379,15 @@ TEST(MatTestRoi, adjustRoiOverflow)
     ASSERT_EQ(roi.rows, m.rows);
 }
 
+TEST(MatTestRoi, adjustRoiUndefinedBehavior)
+{
+    Mat m(6, 6, CV_8U);
+    Mat roi(m, cv::Range(2, 4), cv::Range(2, 4));
+    // This could trigger a (negative int)*size_t when updating data,
+    // which is undefined behavior.
+    roi.adjustROI(2, 2, 2, 2);
+    EXPECT_EQ(m.data, roi.data);
+}
 
 CV_ENUM(SortRowCol, SORT_EVERY_COLUMN, SORT_EVERY_ROW)
 CV_ENUM(SortOrder, SORT_ASCENDING, SORT_DESCENDING)
@@ -1561,4 +1570,50 @@ TEST(Core_Arithm, scalar_handling_19599)  // https://github.com/opencv/opencv/is
     EXPECT_EQ(1, c.rows);
 }
 
+// https://github.com/opencv/opencv/issues/24163
+typedef tuple<perf::MatDepth,int,int,int> Arith_Regression24163Param;
+typedef testing::TestWithParam<Arith_Regression24163Param> Core_Arith_Regression24163;
+
+TEST_P(Core_Arith_Regression24163, test_for_ties_to_even)
+{
+    const int matDepth = get<0>(GetParam());
+    const int matHeight= get<1>(GetParam());
+    const int matWidth = 3; // Fixed
+    const int alpha    = get<2>(GetParam());
+    const int beta     = get<3>(GetParam());
+
+    // If alpha and/or beta are negative, and matDepth is unsigned, test is passed.
+    if( ( (alpha < 0) || (beta < 0) )
+        &&
+        ( (matDepth != CV_8S) && (matDepth != CV_16S) && (matDepth != CV_32S) ) )
+    {
+        throw SkipTestException( cv::format("Test is skipped(matDepth is not signed, alpha = %d, beta = %d)", alpha, beta) );
+    }
+
+    const int matType = CV_MAKE_TYPE(matDepth, 1);
+    const Size matSize(matWidth, matHeight);
+    const Mat src1(matSize, matType, Scalar(alpha,alpha,alpha,alpha));
+    const Mat src2(matSize, matType, Scalar(beta, beta, beta, beta));
+    const Mat result = ( src1 + src2 ) / 2;
+
+    // Expected that default is FE_TONEAREST(Ties to Even).
+    const int mean = lrint( static_cast<double>(alpha + beta) / 2.0 );
+    const Mat expected(matSize, matType, Scalar(mean,mean,mean,mean));
+
+    // Compare result and extected.
+    ASSERT_EQ(expected.size(), result.size());
+    EXPECT_EQ(0, cvtest::norm(expected, result, NORM_INF)) <<
+        "result=" << std::endl << result << std::endl <<
+        "expected=" << std::endl << expected;
+}
+
+INSTANTIATE_TEST_CASE_P(/* */, Core_Arith_Regression24163,
+    testing::Combine(
+        testing::Values(perf::MatDepth(CV_8U), CV_8S, CV_16U, CV_16S, CV_32S), // MatType
+        testing::Values( 3, 4, 5, 6),    // MatHeight
+        testing::Values(-2,-1, 0, 1, 2), // src1
+        testing::Values(   -1, 0, 1   )  // src2
+    )
+);
+
 }} // namespace
diff --git a/modules/core/test/test_precomp.hpp b/modules/core/test/test_precomp.hpp
index 81ddf45de9eb..3d9e5a9f394f 100644
--- a/modules/core/test/test_precomp.hpp
+++ b/modules/core/test/test_precomp.hpp
@@ -4,6 +4,8 @@
 #ifndef __OPENCV_TEST_PRECOMP_HPP__
 #define __OPENCV_TEST_PRECOMP_HPP__
 
+#include <array>
+
 #include "opencv2/ts.hpp"
 #include "opencv2/ts/ocl_test.hpp"
 #include "opencv2/core/private.hpp"
diff --git a/modules/core/test/test_utils_tls.impl.hpp b/modules/core/test/test_utils_tls.impl.hpp
index 36b880542228..20facabadd6e 100644
--- a/modules/core/test/test_utils_tls.impl.hpp
+++ b/modules/core/test/test_utils_tls.impl.hpp
@@ -4,9 +4,7 @@
 
 // This is .hpp file included from test_utils.cpp
 
-#ifdef CV_CXX11
 #include <thread>  // std::thread
-#endif
 
 #include "opencv2/core/utils/tls.hpp"
 
@@ -34,8 +32,6 @@ class TLSReporter
 int TLSReporter::g_last_id = 0;
 int TLSReporter::g_allocated = 0;
 
-#ifdef CV_CXX11
-
 template<typename T>
 static void callNThreadsWithTLS(int N, TLSData<T>& tls)
 {
@@ -129,6 +125,4 @@ static void testTLSAccumulator(bool detachFirst)
 TEST(Core_TLS, AccumulatorHoldData_detachData) { testTLSAccumulator(true); }
 TEST(Core_TLS, AccumulatorHoldData_gather) { testTLSAccumulator(false); }
 
-#endif
-
 }}  // namespace
diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index 804b78ead207..3b66b460d9ec 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -5,10 +5,11 @@ endif()
 set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
 
 ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX)
-ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX)
-ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2)
+ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX RVV LASX)
+ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2 NEON NEON_FP16)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)
-ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2)
+ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON_FP16)
+ocv_add_dispatched_file_force_all("layers/cpu_kernels/fast_gemm_kernels" AVX AVX2 NEON LASX)
 
 ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)
 
@@ -58,11 +59,6 @@ endif()
 ocv_cmake_hook_append(INIT_MODULE_SOURCES_opencv_dnn "${CMAKE_CURRENT_LIST_DIR}/cmake/hooks/INIT_MODULE_SOURCES_opencv_dnn.cmake")
 
 
-if(HAVE_TENGINE)
-  ocv_target_compile_definitions(${the_module} PRIVATE "HAVE_TENGINE=1")
-endif()
-
-
 if(MSVC)
   add_definitions( -D_CRT_SECURE_NO_WARNINGS=1 )
   ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4244 /wd4267 /wd4018 /wd4355 /wd4800 /wd4251 /wd4996 /wd4146
@@ -168,15 +164,16 @@ if(OPENCV_DNN_CUDA AND HAVE_CUDA AND HAVE_CUBLAS AND HAVE_CUDNN)
     endif()
   endforeach()
   unset(CC_LIST)
+  if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
+    list(APPEND libs ${CUDNN_LIBRARIES} CUDA::cublas${CUDA_LIB_EXT})
+    if(NOT CUDA_VERSION VERSION_LESS 10.1)
+      list(APPEND libs CUDA::cublasLt${CUDA_LIB_EXT})
+    endif()
+  endif()
 else()
   set(sources_options ${sources_options} EXCLUDE_CUDA)
 endif()
 
-if(HAVE_TENGINE)
-	list(APPEND include_dirs ${TENGINE_INCLUDE_DIRS})
-	list(APPEND libs -Wl,--whole-archive ${TENGINE_LIBRARIES} -Wl,--no-whole-archive)
-endif()
-
 if(HAVE_TIMVX)
     list(APPEND include_dirs ${TIMVX_INCLUDE_DIR})
     list(APPEND libs -Wl,--whole-archive ${TIMVX_LIBRARY} -Wl,--no-whole-archive)
@@ -237,6 +234,10 @@ if(TARGET ocv.3rdparty.openvino AND OPENCV_DNN_OPENVINO)
   endif()
 endif()
 
+set(OPENCV_DNN_BACKEND_DEFAULT "" CACHE STRING "Default backend used by the DNN module (DNN_BACKEND_OPENCV if empty)")
+if(OPENCV_DNN_BACKEND_DEFAULT)
+  ocv_append_source_file_compile_definitions("${CMAKE_CURRENT_LIST_DIR}/src/dnn_params.cpp" "OPENCV_DNN_BACKEND_DEFAULT=${OPENCV_DNN_BACKEND_DEFAULT}")
+endif()
 
 ocv_install_used_external_targets(${libs} ${dnn_runtime_libs})
 
@@ -245,6 +246,12 @@ ocv_create_module(${libs} ${dnn_runtime_libs})
 ocv_add_samples()
 ocv_add_accuracy_tests(${dnn_runtime_libs})
 
+if(NOT BUILD_PROTOBUF)
+  if(TARGET opencv_test_dnn)
+    ocv_target_compile_definitions(opencv_test_dnn PRIVATE "OPENCV_DNN_EXTERNAL_PROTOBUF=1")
+  endif()
+endif()
+
 set(perf_path "${CMAKE_CURRENT_LIST_DIR}/perf")
 file(GLOB_RECURSE perf_srcs "${perf_path}/*.cpp")
 file(GLOB_RECURSE perf_hdrs "${perf_path}/*.hpp" "${perf_path}/*.h")
@@ -300,6 +307,13 @@ if(TARGET ocv.3rdparty.cann AND OPENCV_TEST_DNN_CANN)
   endif()
 endif()
 
+ocv_option(OPENCV_TEST_DNN_TIMVX "Build test with TIM-VX" (HAVE_TIMVX))
+if(OPENCV_TEST_DNN_TIMVX)
+  if(TARGET opencv_test_dnn)
+    ocv_target_compile_definitions(opencv_test_dnn PRIVATE "HAVE_TIMVX=1")
+  endif()
+endif()
+
 ocv_option(OPENCV_TEST_DNN_TFLITE "Build test with TFLite" (OPENCV_DNN_TFLITE))
 if(OPENCV_TEST_DNN_TFLITE)
   if(TARGET opencv_test_dnn)
diff --git a/modules/dnn/cmake/plugin.cmake b/modules/dnn/cmake/plugin.cmake
index 055d21efc3c9..df603b7c7a44 100644
--- a/modules/dnn/cmake/plugin.cmake
+++ b/modules/dnn/cmake/plugin.cmake
@@ -50,6 +50,7 @@ function(ocv_create_builtin_dnn_plugin name target)
   endforeach()
 
   if(WIN32)
+    add_definitions(-D_USE_MATH_DEFINES)
     set(OPENCV_PLUGIN_VERSION "${OPENCV_DLLVERSION}" CACHE STRING "")
     if(CMAKE_CXX_SIZEOF_DATA_PTR EQUAL 8)
       set(OPENCV_PLUGIN_ARCH "_64" CACHE STRING "")
diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index e133ffea65ad..3301f20fde70 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -241,6 +241,39 @@ CV__DNN_INLINE_NS_BEGIN
 
     };
 
+    /** @brief This function performs array summation based
+    * on the Einstein summation convention. The function
+    * allows for concise expressions of various mathematical
+    * operations using subscripts.
+    *
+    * By default, the labels are placed in alphabetical
+    * order at the end of the output.
+    * For example:
+    * if `c = einsum("i,j", a, b)`, then `c[i,j] == a[i]*b[j]`.
+    * However, if `c = einsum("j,i", a, b)`, then `c[i,j] = a[j]*b[i]`.
+    * Alternatively, you can control the output order or prevent
+    * an axis from being summed/force an axis to be summed
+    * by providing indices for the output.
+    * For example:
+    * `diag(a)`         -> `einsum("ii->i", a)`
+    * `sum(a, axis=0)`  -> `einsum("i...->", a)`
+    * Subscripts at the beginning and end may be specified
+    * by putting an ellipsis "..." in the middle.
+    * For instance, the function `einsum("i...i", a)` takes
+    * the diagonal of the first and last dimensions of
+    * the operand, and `einsum("ij...,jk...->ik...")` performs
+    * the matrix product using the first two indices
+    * of each operand instead of the last two.
+    * When there is only one operand, no axes being summed,
+    *  and no output parameter, this function returns
+    * a view into the operand instead of creating a copy.
+     */
+    class CV_EXPORTS EinsumLayer : public Layer
+    {
+    public:
+        static Ptr<EinsumLayer> create(const LayerParams& params);
+    };
+
     class CV_EXPORTS BaseConvolutionLayer : public Layer
     {
     public:
@@ -258,7 +291,7 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
         bool fusedActivation = false;
         bool fusedAdd = false;
-        bool useWinograd = false; // Flag whether to use Winograd to speed up 3x3 convolution.
+        bool useWinograd = true; // Flag whether to use Winograd to speed up 3x3 convolution.
     };
 
     class CV_EXPORTS ConvolutionLayerInt8 : public BaseConvolutionLayer
@@ -270,7 +303,7 @@ CV__DNN_INLINE_NS_BEGIN
         // quantization type flag. The perChannel default is true, that means it contains the parameters
         // of per-Channel quantization. Otherwise, that means this layer contains per-Tensor quantized parameters.
         bool per_channel;
-        bool useWinograd = true; // Flag whether to use Winograd to speed up 3x3 convolution.
+        bool useWinograd = false; // Flag whether to use Winograd to speed up 3x3 convolution.
         static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
     };
 
@@ -310,6 +343,22 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<GatherLayer> create(const LayerParams& params);
     };
 
+    /** @brief GatherElements layer
+    * GatherElements takes two inputs data and indices of the same rank r >= 1 and an optional attribute axis and works such that:
+    *   output[i][j][k] = data[index[i][j][k]][j][k] if axis = 0 and r = 3
+    *   output[i][j][k] = data[i][index[i][j][k]][k] if axis = 1 and r = 3
+    *   output[i][j][k] = data[i][j][index[i][j][k]] if axis = 2 and r = 3
+    *
+    * Gather, on the other hand, takes a data tensor of rank r >= 1, and indices tensor of rank q, and works such that:
+    *   it gathers the enteries along axis dimension of the input data indexed by indices and concatenates them in an output tensor of rank q + (r - 1)
+    *   e.g. If axis = 0, let k = indices[i_{0}, ..., i_{q-1}] then output[i_{0}, ..., i_{q-1}, j_{0}, ..., j_{r-2}] = input[k , j_{0}, ..., j_{r-2}]:
+     **/
+    class CV_EXPORTS GatherElementsLayer : public Layer
+    {
+    public:
+        static Ptr<GatherElementsLayer> create(const LayerParams& params);
+    };
+
     class CV_EXPORTS PoolingLayer : public Layer
     {
     public:
@@ -555,11 +604,11 @@ CV__DNN_INLINE_NS_BEGIN
     {
     public:
         virtual void forwardSlice(const float* src, float* dst, int len,
-                                  size_t outPlaneSize, int cn0, int cn1) const {};
+                                  size_t outPlaneSize, int cn0, int cn1) const {}
         virtual void forwardSlice(const int* src, const int* lut, int* dst, int len,
-                                  size_t outPlaneSize, int cn0, int cn1) const {};
+                                  size_t outPlaneSize, int cn0, int cn1) const {}
         virtual void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len,
-                                  size_t outPlaneSize, int cn0, int cn1) const {};
+                                  size_t outPlaneSize, int cn0, int cn1) const {}
     };
 
     class CV_EXPORTS ReLULayer : public ActivationLayer
@@ -1094,13 +1143,51 @@ CV__DNN_INLINE_NS_BEGIN
     class CV_EXPORTS LayerNormLayer : public Layer
     {
     public:
-        bool hasBias;
+        CV_DEPRECATED_EXTERNAL bool hasBias; // Deprecated, preserve for compatibility
         int axis;
         float epsilon;
 
         static Ptr<LayerNormLayer> create(const LayerParams& params);
     };
 
+    class CV_EXPORTS GemmLayer : public Layer {
+    public:
+        bool trans_a;
+        bool trans_b;
+        float alpha;
+        float beta;
+
+        static Ptr<GemmLayer> create(const LayerParams& params);
+    };
+
+    class CV_EXPORTS MatMulLayer : public Layer {
+     public:
+        static Ptr<MatMulLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS ExpandLayer : public Layer
+    {
+    public:
+        static Ptr<ExpandLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS InstanceNormLayer : public Layer {
+    public:
+        float epsilon;
+
+        static Ptr<InstanceNormLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS AttentionLayer : public Layer {
+     public:
+        static Ptr<AttentionLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS GroupNormLayer : public Layer {
+    public:
+        static Ptr<GroupNormLayer> create(const LayerParams &params);
+    };
+
 //! @}
 //! @}
 CV__DNN_INLINE_NS_END
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index d61f7191bc89..fd9129010475 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -69,9 +69,7 @@ CV__DNN_INLINE_NS_BEGIN
      */
     enum Backend
     {
-        //! DNN_BACKEND_DEFAULT equals to DNN_BACKEND_INFERENCE_ENGINE if
-        //! OpenCV is built with Intel OpenVINO or
-        //! DNN_BACKEND_OPENCV otherwise.
+        //! DNN_BACKEND_DEFAULT equals to OPENCV_DNN_BACKEND_DEFAULT, which can be defined using CMake or a configuration parameter
         DNN_BACKEND_DEFAULT = 0,
         DNN_BACKEND_HALIDE,
         DNN_BACKEND_INFERENCE_ENGINE,            //!< Intel OpenVINO computational backend
@@ -216,7 +214,7 @@ CV__DNN_INLINE_NS_BEGIN
 
     /** @brief This interface class allows to build new Layers - are building blocks of networks.
      *
-     * Each class, derived from Layer, must implement allocate() methods to declare own outputs and forward() to compute outputs.
+     * Each class, derived from Layer, must implement forward() method to compute outputs.
      * Also before using the new layer into networks you must register your layer by using one of @ref dnnLayerFactory "LayerFactory" macros.
      */
     class CV_EXPORTS_W Layer : public Algorithm
@@ -231,7 +229,7 @@ CV__DNN_INLINE_NS_BEGIN
          *  @param[in]  input  vector of already allocated input blobs
          *  @param[out] output vector of already allocated output blobs
          *
-         * If this method is called after network has allocated all memory for input and output blobs
+         * This method is called after network has allocated all memory for input and output blobs
          * and before inferencing.
          */
         CV_DEPRECATED_EXTERNAL
@@ -241,7 +239,7 @@ CV__DNN_INLINE_NS_BEGIN
          *  @param[in]  inputs  vector of already allocated input blobs
          *  @param[out] outputs vector of already allocated output blobs
          *
-         * If this method is called after network has allocated all memory for input and output blobs
+         * This method is called after network has allocated all memory for input and output blobs
          * and before inferencing.
          */
         CV_WRAP virtual void finalize(InputArrayOfArrays inputs, OutputArrayOfArrays outputs);
@@ -486,7 +484,7 @@ CV__DNN_INLINE_NS_BEGIN
          *  Networks imported from Intel's Model Optimizer are launched in Intel's Inference Engine
          *  backend.
          */
-        CV_WRAP static Net readFromModelOptimizer(const String& xml, const String& bin);
+        CV_WRAP static Net readFromModelOptimizer(CV_WRAP_FILE_PATH const String& xml, CV_WRAP_FILE_PATH const String& bin);
 
         /** @brief Create a network from Intel's Model Optimizer in-memory buffers with intermediate representation (IR).
          *  @param[in] bufferModelConfig buffer with model's configuration.
@@ -519,7 +517,15 @@ CV__DNN_INLINE_NS_BEGIN
          *  @param path   path to output file with .dot extension
          *  @see dump()
          */
-        CV_WRAP void dumpToFile(const String& path);
+        CV_WRAP void dumpToFile(CV_WRAP_FILE_PATH const String& path);
+        /** @brief Dump net structure, hyperparameters, backend, target and fusion to pbtxt file
+         *  @param path   path to output file with .pbtxt extension
+         *
+         *  Use Netron (https://netron.app) to open the target file to visualize the model.
+         *  Call method after setInput(). To see correct backend, target and fusion run after forward().
+        */
+        CV_WRAP void dumpToPbtxt(CV_WRAP_FILE_PATH const String& path);
+
         /** @brief Adds new layer to the net.
          *  @param name   unique name of the adding layer.
          *  @param type   typename of the adding layer (type must be registered in LayerRegister).
@@ -688,9 +694,6 @@ CV__DNN_INLINE_NS_BEGIN
          * @brief Ask network to use specific computation backend where it supported.
          * @param[in] backendId backend identifier.
          * @see Backend
-         *
-         * If OpenCV is compiled with Intel's Inference Engine library, DNN_BACKEND_DEFAULT
-         * means DNN_BACKEND_INFERENCE_ENGINE. Otherwise it equals to DNN_BACKEND_OPENCV.
          */
         CV_WRAP void setPreferableBackend(int backendId);
 
@@ -895,7 +898,7 @@ CV__DNN_INLINE_NS_BEGIN
     *  @param darknetModel path to the .weights file with learned network.
     *  @returns Network object that ready to do forward, throw an exception in failure cases.
     */
-    CV_EXPORTS_W Net readNetFromDarknet(const String &cfgFile, const String &darknetModel = String());
+    CV_EXPORTS_W Net readNetFromDarknet(CV_WRAP_FILE_PATH const String &cfgFile, CV_WRAP_FILE_PATH const String &darknetModel = String());
 
     /** @brief Reads a network model stored in <a href="https://pjreddie.com/darknet/">Darknet</a> model files.
      *  @param bufferCfg   A buffer contains a content of .cfg file with text description of the network architecture.
@@ -920,7 +923,7 @@ CV__DNN_INLINE_NS_BEGIN
       * @param caffeModel path to the .caffemodel file with learned network.
       * @returns Net object.
       */
-    CV_EXPORTS_W Net readNetFromCaffe(const String &prototxt, const String &caffeModel = String());
+    CV_EXPORTS_W Net readNetFromCaffe(CV_WRAP_FILE_PATH const String &prototxt, CV_WRAP_FILE_PATH const String &caffeModel = String());
 
     /** @brief Reads a network model stored in Caffe model in memory.
       * @param bufferProto buffer containing the content of the .prototxt file
@@ -949,7 +952,7 @@ CV__DNN_INLINE_NS_BEGIN
       *               let us make it more flexible.
       * @returns Net object.
       */
-    CV_EXPORTS_W Net readNetFromTensorflow(const String &model, const String &config = String());
+    CV_EXPORTS_W Net readNetFromTensorflow(CV_WRAP_FILE_PATH const String &model, CV_WRAP_FILE_PATH const String &config = String());
 
     /** @brief Reads a network model stored in <a href="https://www.tensorflow.org/">TensorFlow</a> framework's format.
       * @param bufferModel buffer containing the content of the pb file
@@ -974,7 +977,7 @@ CV__DNN_INLINE_NS_BEGIN
       * @param model  path to the .tflite file with binary flatbuffers description of the network architecture
       * @returns Net object.
       */
-    CV_EXPORTS_W Net readNetFromTFLite(const String &model);
+    CV_EXPORTS_W Net readNetFromTFLite(CV_WRAP_FILE_PATH const String &model);
 
     /** @brief Reads a network model stored in <a href="https://www.tensorflow.org/lite">TFLite</a> framework's format.
       * @param bufferModel buffer containing the content of the tflite file
@@ -1016,7 +1019,7 @@ CV__DNN_INLINE_NS_BEGIN
      *
      * Also some equivalents of these classes from cunn, cudnn, and fbcunn may be successfully imported.
      */
-     CV_EXPORTS_W Net readNetFromTorch(const String &model, bool isBinary = true, bool evaluate = true);
+     CV_EXPORTS_W Net readNetFromTorch(CV_WRAP_FILE_PATH const String &model, bool isBinary = true, bool evaluate = true);
 
      /**
       * @brief Read deep learning network represented in one of the supported formats.
@@ -1026,14 +1029,14 @@ CV__DNN_INLINE_NS_BEGIN
       *                  * `*.pb` (TensorFlow, https://www.tensorflow.org/)
       *                  * `*.t7` | `*.net` (Torch, http://torch.ch/)
       *                  * `*.weights` (Darknet, https://pjreddie.com/darknet/)
-      *                  * `*.bin` (DLDT, https://software.intel.com/openvino-toolkit)
+      *                  * `*.bin` | `*.onnx` (OpenVINO, https://software.intel.com/openvino-toolkit)
       *                  * `*.onnx` (ONNX, https://onnx.ai/)
       * @param[in] config Text file contains network configuration. It could be a
       *                   file with the following extensions:
       *                  * `*.prototxt` (Caffe, http://caffe.berkeleyvision.org/)
       *                  * `*.pbtxt` (TensorFlow, https://www.tensorflow.org/)
       *                  * `*.cfg` (Darknet, https://pjreddie.com/darknet/)
-      *                  * `*.xml` (DLDT, https://software.intel.com/openvino-toolkit)
+      *                  * `*.xml` (OpenVINO, https://software.intel.com/openvino-toolkit)
       * @param[in] framework Explicit framework name tag to determine a format.
       * @returns Net object.
       *
@@ -1042,7 +1045,7 @@ CV__DNN_INLINE_NS_BEGIN
       * @ref readNetFromTorch or @ref readNetFromDarknet. An order of @p model and @p config
       * arguments does not matter.
       */
-     CV_EXPORTS_W Net readNet(const String& model, const String& config = "", const String& framework = "");
+     CV_EXPORTS_W Net readNet(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "", const String& framework = "");
 
      /**
       * @brief Read deep learning network represented in one of the supported formats.
@@ -1069,7 +1072,7 @@ CV__DNN_INLINE_NS_BEGIN
      *  backend.
      */
     CV_EXPORTS_W
-    Net readNetFromModelOptimizer(const String &xml, const String &bin);
+    Net readNetFromModelOptimizer(CV_WRAP_FILE_PATH const String &xml, CV_WRAP_FILE_PATH const String &bin = "");
 
     /** @brief Load a network from Intel's Model Optimizer intermediate representation.
      *  @param[in] bufferModelConfig Buffer contains XML configuration with network's topology.
@@ -1098,7 +1101,7 @@ CV__DNN_INLINE_NS_BEGIN
      *  @param onnxFile path to the .onnx file with text description of the network architecture.
      *  @returns Network object that ready to do forward, throw an exception in failure cases.
      */
-    CV_EXPORTS_W Net readNetFromONNX(const String &onnxFile);
+    CV_EXPORTS_W Net readNetFromONNX(CV_WRAP_FILE_PATH const String &onnxFile);
 
     /** @brief Reads a network model from <a href="https://onnx.ai/">ONNX</a>
      *         in-memory buffer.
@@ -1121,7 +1124,7 @@ CV__DNN_INLINE_NS_BEGIN
      *  @param path to the .pb file with input tensor.
      *  @returns Mat.
      */
-    CV_EXPORTS_W Mat readTensorFromONNX(const String& path);
+    CV_EXPORTS_W Mat readTensorFromONNX(CV_WRAP_FILE_PATH const String& path);
 
     /** @brief Creates 4-dimensional blob from image. Optionally resizes and crops @p image from center,
      *  subtract @p mean values, scales values by @p scalefactor, swap Blue and Red channels.
@@ -1217,7 +1220,7 @@ CV__DNN_INLINE_NS_BEGIN
         CV_WRAP Image2BlobParams();
         CV_WRAP Image2BlobParams(const Scalar& scalefactor, const Size& size = Size(), const Scalar& mean = Scalar(),
                             bool swapRB = false, int ddepth = CV_32F, DataLayout datalayout = DNN_LAYOUT_NCHW,
-                            ImagePaddingMode mode = DNN_PMODE_NULL);
+                            ImagePaddingMode mode = DNN_PMODE_NULL, Scalar borderValue = 0.0);
 
         CV_PROP_RW Scalar scalefactor; //!< scalefactor multiplier for input image values.
         CV_PROP_RW Size size;    //!< Spatial size for output image.
@@ -1226,6 +1229,21 @@ CV__DNN_INLINE_NS_BEGIN
         CV_PROP_RW int ddepth;   //!< Depth of output blob. Choose CV_32F or CV_8U.
         CV_PROP_RW DataLayout datalayout; //!< Order of output dimensions. Choose DNN_LAYOUT_NCHW or DNN_LAYOUT_NHWC.
         CV_PROP_RW ImagePaddingMode paddingmode;   //!< Image padding mode. @see ImagePaddingMode.
+        CV_PROP_RW Scalar borderValue;   //!< Value used in padding mode for padding.
+
+        /** @brief Get rectangle coordinates in original image system from rectangle in blob coordinates.
+         *  @param rBlob rect in blob coordinates.
+         *  @param size original input image size.
+         *  @returns rectangle in original image coordinates.
+         */
+        CV_WRAP Rect blobRectToImageRect(const Rect &rBlob, const Size &size);
+
+        /** @brief Get rectangle coordinates in original image system from rectangle in blob coordinates.
+         *  @param rBlob rect in blob coordinates.
+         *  @param rImg result rect in image coordinates.
+         *  @param size original input image size.
+         */
+        CV_WRAP void blobRectsToImageRects(const std::vector<Rect> &rBlob, CV_OUT std::vector<Rect>& rImg, const Size& size);
     };
 
     /** @brief Creates 4-dimensional blob from image with given params.
@@ -1279,7 +1297,7 @@ CV__DNN_INLINE_NS_BEGIN
      *       is taken from NVidia's Caffe fork: https://github.com/NVIDIA/caffe.
      *       So the resulting model may be used there.
      */
-    CV_EXPORTS_W void shrinkCaffeModel(const String& src, const String& dst,
+    CV_EXPORTS_W void shrinkCaffeModel(CV_WRAP_FILE_PATH const String& src, CV_WRAP_FILE_PATH const String& dst,
                                        const std::vector<String>& layersTypes = std::vector<String>());
 
     /** @brief Create a text representation for a binary network stored in protocol buffer format.
@@ -1288,7 +1306,7 @@ CV__DNN_INLINE_NS_BEGIN
      *
      *  @note To reduce output file size, trained weights are not included.
      */
-    CV_EXPORTS_W void writeTextGraph(const String& model, const String& output);
+    CV_EXPORTS_W void writeTextGraph(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& output);
 
     /** @brief Performs non maximum suppression given boxes and corresponding scores.
 
@@ -1393,7 +1411,7 @@ CV__DNN_INLINE_NS_BEGIN
           * @param[in] model Binary file contains trained weights.
           * @param[in] config Text file contains network configuration.
           */
-         CV_WRAP Model(const String& model, const String& config = "");
+         CV_WRAP Model(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
 
          /**
           * @brief Create model from deep learning network.
@@ -1434,6 +1452,11 @@ CV__DNN_INLINE_NS_BEGIN
          */
          CV_WRAP Model& setInputSwapRB(bool swapRB);
 
+         /** @brief Set output names for frame.
+          *  @param[in] outNames Names for output layers.
+         */
+         CV_WRAP Model& setOutputNames(const std::vector<String>& outNames);
+
          /** @brief Set preprocessing parameters for frame.
          *  @param[in] size New input size.
          *  @param[in] mean Scalar with mean values which are subtracted from channels.
@@ -1463,6 +1486,9 @@ CV__DNN_INLINE_NS_BEGIN
          /// @sa Net::setPreferableTarget
          CV_WRAP Model& setPreferableTarget(dnn::Target targetId);
 
+         /// @sa Net::enableWinograd
+         CV_WRAP Model& enableWinograd(bool useWinograd);
+
          CV_DEPRECATED_EXTERNAL
          operator Net&() const { return getNetwork_(); }
 
@@ -1495,7 +1521,7 @@ CV__DNN_INLINE_NS_BEGIN
           * @param[in] model Binary file contains trained weights.
           * @param[in] config Text file contains network configuration.
           */
-          CV_WRAP ClassificationModel(const String& model, const String& config = "");
+          CV_WRAP ClassificationModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
 
          /**
           * @brief Create model from deep learning network.
@@ -1545,7 +1571,7 @@ CV__DNN_INLINE_NS_BEGIN
           * @param[in] model Binary file contains trained weights.
           * @param[in] config Text file contains network configuration.
           */
-          CV_WRAP KeypointsModel(const String& model, const String& config = "");
+          CV_WRAP KeypointsModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
 
          /**
           * @brief Create model from deep learning network.
@@ -1577,7 +1603,7 @@ CV__DNN_INLINE_NS_BEGIN
           * @param[in] model Binary file contains trained weights.
           * @param[in] config Text file contains network configuration.
           */
-          CV_WRAP SegmentationModel(const String& model, const String& config = "");
+          CV_WRAP SegmentationModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
 
          /**
           * @brief Create model from deep learning network.
@@ -1608,7 +1634,7 @@ CV__DNN_INLINE_NS_BEGIN
           * @param[in] model Binary file contains trained weights.
           * @param[in] config Text file contains network configuration.
           */
-         CV_WRAP DetectionModel(const String& model, const String& config = "");
+         CV_WRAP DetectionModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
 
          /**
           * @brief Create model from deep learning network.
@@ -1674,7 +1700,7 @@ class CV_EXPORTS_W_SIMPLE TextRecognitionModel : public Model
      * @param[in] config Text file contains network configuration
      */
     CV_WRAP inline
-    TextRecognitionModel(const std::string& model, const std::string& config = "")
+    TextRecognitionModel(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "")
         : TextRecognitionModel(readNet(model, config)) { /* nothing */ }
 
     /**
@@ -1829,7 +1855,7 @@ class CV_EXPORTS_W_SIMPLE TextDetectionModel_EAST : public TextDetectionModel
      * @param[in] config Text file contains network configuration.
      */
     CV_WRAP inline
-    TextDetectionModel_EAST(const std::string& model, const std::string& config = "")
+    TextDetectionModel_EAST(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "")
         : TextDetectionModel_EAST(readNet(model, config)) { /* nothing */ }
 
     /**
@@ -1890,7 +1916,7 @@ class CV_EXPORTS_W_SIMPLE TextDetectionModel_DB : public TextDetectionModel
      * @param[in] config Text file contains network configuration.
      */
     CV_WRAP inline
-    TextDetectionModel_DB(const std::string& model, const std::string& config = "")
+    TextDetectionModel_DB(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "")
         : TextDetectionModel_DB(readNet(model, config)) { /* nothing */ }
 
     CV_WRAP TextDetectionModel_DB& setBinaryThreshold(float binaryThreshold);
diff --git a/modules/dnn/include/opencv2/dnn/version.hpp b/modules/dnn/include/opencv2/dnn/version.hpp
index b93622def90d..f83d90dab410 100644
--- a/modules/dnn/include/opencv2/dnn/version.hpp
+++ b/modules/dnn/include/opencv2/dnn/version.hpp
@@ -6,7 +6,7 @@
 #define OPENCV_DNN_VERSION_HPP
 
 /// Use with major OpenCV version only.
-#define OPENCV_DNN_API_VERSION 20230620
+#define OPENCV_DNN_API_VERSION 20240521
 
 #if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_INLINE_NS
 #define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION)
diff --git a/modules/dnn/misc/python/test/test_dnn.py b/modules/dnn/misc/python/test/test_dnn.py
index 5c91aae56f91..da3409c3b3db 100644
--- a/modules/dnn/misc/python/test/test_dnn.py
+++ b/modules/dnn/misc/python/test/test_dnn.py
@@ -127,6 +127,34 @@ def test_getAvailableTargets(self):
         targets = cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_OPENCV)
         self.assertTrue(cv.dnn.DNN_TARGET_CPU in targets)
 
+    def test_blobRectsToImageRects(self):
+        paramNet = cv.dnn.Image2BlobParams()
+        paramNet.size = (226, 226)
+        paramNet.ddepth = cv.CV_32F
+        paramNet.mean = [0.485, 0.456, 0.406]
+        paramNet.scalefactor = [0.229, 0.224, 0.225]
+        paramNet.swapRB = False
+        paramNet.datalayout = cv.dnn.DNN_LAYOUT_NCHW
+        paramNet.paddingmode = cv.dnn.DNN_PMODE_LETTERBOX
+        rBlob = np.zeros(shape=(20, 4), dtype=np.int32)
+        rImg = paramNet.blobRectsToImageRects(rBlob, (356, 356))
+        self.assertTrue(type(rImg[0, 0])==np.int32)
+        self.assertTrue(rImg.shape==(20, 4))
+
+    def test_blobRectToImageRect(self):
+        paramNet = cv.dnn.Image2BlobParams()
+        paramNet.size = (226, 226)
+        paramNet.ddepth = cv.CV_32F
+        paramNet.mean = [0.485, 0.456, 0.406]
+        paramNet.scalefactor = [0.229, 0.224, 0.225]
+        paramNet.swapRB = False
+        paramNet.datalayout = cv.dnn.DNN_LAYOUT_NCHW
+        paramNet.paddingmode = cv.dnn.DNN_PMODE_LETTERBOX
+        rBlob = np.zeros(shape=(20, 4), dtype=np.int32)
+        rImg = paramNet.blobRectToImageRect((0, 0, 0, 0), (356, 356))
+        self.assertTrue(type(rImg[0])==int)
+
+
     def test_blobFromImage(self):
         np.random.seed(324)
 
@@ -191,10 +219,10 @@ def test_blobFromImageWithParams(self):
 
     def test_model(self):
         img_path = self.find_dnn_file("dnn/street.png")
-        weights = self.find_dnn_file("dnn/MobileNetSSD_deploy.caffemodel", required=False)
-        config = self.find_dnn_file("dnn/MobileNetSSD_deploy.prototxt", required=False)
+        weights = self.find_dnn_file("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", required=False)
+        config = self.find_dnn_file("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", required=False)
         if weights is None or config is None:
-            raise unittest.SkipTest("Missing DNN test files (dnn/MobileNetSSD_deploy.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
+            raise unittest.SkipTest("Missing DNN test files (dnn/MobileNetSSD_deploy_19e3ec3.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
 
         frame = cv.imread(img_path)
         model = cv.dnn_DetectionModel(weights, config)
diff --git a/modules/dnn/perf/perf_caffe.cpp b/modules/dnn/perf/perf_caffe.cpp
index 370f06dba24f..f1ba26afcc6e 100644
--- a/modules/dnn/perf/perf_caffe.cpp
+++ b/modules/dnn/perf/perf_caffe.cpp
@@ -101,8 +101,8 @@ PERF_TEST(SqueezeNet_v1_1_caffe, CaffePerfTest)
 
 PERF_TEST(MobileNet_SSD, CaffePerfTest)
 {
-    caffe::Net<float>* net = initNet("dnn/MobileNetSSD_deploy.prototxt",
-                                     "dnn/MobileNetSSD_deploy.caffemodel");
+    caffe::Net<float>* net = initNet("dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
+                                     "dnn/MobileNetSSD_deploy_19e3ec3.caffemodel");
     TEST_CYCLE() net->Forward();
     SANITY_CHECK_NOTHING();
 }
diff --git a/modules/dnn/perf/perf_convolution.cpp b/modules/dnn/perf/perf_convolution.cpp
index bb890c6a00ab..2c33969a7636 100644
--- a/modules/dnn/perf/perf_convolution.cpp
+++ b/modules/dnn/perf/perf_convolution.cpp
@@ -4,6 +4,7 @@
 
 #include "perf_precomp.hpp"
 #include <opencv2/dnn/shape_utils.hpp>
+#include <opencv2/core/utils/configuration.private.hpp>
 
 namespace opencv_test {
 
@@ -26,752 +27,765 @@ struct ConvParam_t {
     double declared_flops;
 };
 // Details: #12142
-// Last update: 2021-09
-static const ConvParam_t testConvolutionConfigs[] = {
+// Last update: 2023-11
+// Extended and classified: #24547
+static const ConvParam_t testConvolution_Configs[] = {
     /* GFLOPS 3.398 x 20 = 67.956 */ {{7, 7}, {{1, 128, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {3, 3}, {0, 0}, "", true, 3397788160.},
     /* GFLOPS 16.987 x 3 = 50.962 */ {{5, 5}, {{1, 1152, 16, 16}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 16987226112.},
     /* GFLOPS 23.122 x 2 = 46.244 */ {{5, 5}, {{1, 672, 32, 32}}, 672, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 23121788928.},
-    /* GFLOPS 9.987 x 3 = 29.960 */ {{3, 3}, {{1, 256, 92, 92}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9986707456.},
-    /* GFLOPS 1.595 x 16 = 25.524 */ {{3, 3}, {{1, 256, 26, 26}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595230208.},
     /* GFLOPS 4.566 x 5 = 22.828 */ {{7, 7}, {{1, 172, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {3, 3}, {0, 0}, "", true, 4565684736.},
-    /* GFLOPS 1.596 x 14 = 22.338 */ {{3, 3}, {{1, 128, 52, 52}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595576320.},
-    /* GFLOPS 1.595 x 12 = 19.141 */ {{3, 3}, {{1, 512, 13, 13}}, 1024, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595057152.},
-    /* GFLOPS 6.814 x 2 = 13.629 */ {{3, 3}, {{1, 512, 38, 38}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6814386176.},
-    /* GFLOPS 6.637 x 2 = 13.274 */ {{3, 3}, {{1, 256, 75, 75}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6636960000.},
     /* GFLOPS 11.797 x 1 = 11.797 */ {{5, 5}, {{1, 240, 64, 64}}, 240, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 11797463040.},
     /* GFLOPS 11.797 x 1 = 11.797 */ {{5, 5}, {{1, 480, 32, 32}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 11796971520.},
-    /* GFLOPS 10.701 x 1 = 10.701 */ {{3, 3}, {{1, 512, 38, 38}}, 804, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 10700715792.},
-    /* GFLOPS 10.087 x 1 = 10.087 */ {{3, 3}, {{1, 576, 38, 50}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10086963200.},
-    /* GFLOPS 9.993 x 1 = 9.993 */ {{3, 3}, {{1, 64, 368, 368}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9993207808.},
-    /* GFLOPS 9.989 x 1 = 9.989 */ {{3, 3}, {{1, 128, 184, 184}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9988874240.},
-    /* GFLOPS 9.986 x 1 = 9.986 */ {{3, 3}, {{1, 512, 46, 46}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9985624064.},
-    /* GFLOPS 1.704 x 5 = 8.518 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 512, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1703596544.},
-    /* GFLOPS 1.704 x 5 = 8.518 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1703596544.},
-    /* GFLOPS 4.247 x 2 = 8.494 */ {{3, 3}, {{1, 480, 32, 32}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 4247224320.},
-    /* GFLOPS 8.025 x 1 = 8.025 */ {{3, 3}, {{1, 1024, 19, 19}}, 1206, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 8025101478.},
-    /* GFLOPS 0.798 x 9 = 7.180 */ {{3, 3}, {{1, 128, 52, 52}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 797788160.},
-    /* GFLOPS 0.798 x 9 = 7.179 */ {{3, 3}, {{1, 256, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 797615104.},
-    /* GFLOPS 6.641 x 1 = 6.641 */ {{3, 3}, {{1, 64, 300, 300}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6641280000.},
-    /* GFLOPS 6.641 x 1 = 6.641 */ {{3, 3}, {{1, 64, 150, 200}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6641280000.},
-    /* GFLOPS 6.638 x 1 = 6.638 */ {{3, 3}, {{1, 128, 150, 150}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6638400000.},
-    /* GFLOPS 6.118 x 1 = 6.118 */ {{3, 3}, {{1, 144, 128, 128}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 6117654528.},
-    /* GFLOPS 6.116 x 1 = 6.116 */ {{3, 3}, {{1, 1152, 16, 16}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 6115590144.},
     /* GFLOPS 5.780 x 1 = 5.780 */ {{5, 5}, {{1, 672, 32, 32}}, 672, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 5780447232.},
-    /* GFLOPS 1.704 x 3 = 5.111 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1703596544.},
-    /* GFLOPS 4.997 x 1 = 4.997 */ {{3, 3}, {{1, 64, 184, 184}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4996603904.},
-    /* GFLOPS 4.994 x 1 = 4.994 */ {{3, 3}, {{1, 128, 92, 92}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4994437120.},
-    /* GFLOPS 4.993 x 1 = 4.993 */ {{3, 3}, {{1, 256, 46, 46}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4993353728.},
-    /* GFLOPS 4.993 x 1 = 4.993 */ {{3, 3}, {{1, 512, 46, 46}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4992812032.},
-    /* GFLOPS 1.659 x 3 = 4.977 */ {{3, 3}, {{1, 960, 10, 10}}, 960, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1658976000.},
-    /* GFLOPS 2.156 x 2 = 4.312 */ {{3, 3}, {{1, 576, 19, 19}}, 576, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2156088384.},
     /* GFLOPS 4.247 x 1 = 4.247 */ {{5, 5}, {{1, 144, 128, 128}}, 144, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 4247322624.},
-    /* GFLOPS 0.798 x 5 = 3.988 */ {{3, 3}, {{1, 512, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 797528576.},
-    /* GFLOPS 0.958 x 4 = 3.833 */ {{3, 3}, {{1, 384, 19, 19}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 958307712.},
-    /* GFLOPS 0.624 x 6 = 3.746 */ {{3, 3}, {{1, 128, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 624304640.},
-    /* GFLOPS 3.408 x 1 = 3.408 */ {{3, 3}, {{1, 256, 38, 38}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3407562752.},
     /* GFLOPS 3.407 x 1 = 3.407 */ {{3, 3}, {{1, 512, 19, 19}}, 1024, 1, {1, 1}, {6, 6}, {6, 6}, {0, 0}, "", true, 3407193088.},
-    /* GFLOPS 0.177 x 19 = 3.370 */ {{1, 1}, {{1, 512, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 177382400.},
-    /* GFLOPS 0.302 x 11 = 3.325 */ {{3, 3}, {{1, 64, 64, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 302252032.},
-    /* GFLOPS 3.321 x 1 = 3.321 */ {{3, 3}, {{1, 64, 150, 150}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3320640000.},
-    /* GFLOPS 0.830 x 4 = 3.321 */ {{3, 3}, {{1, 64, 75, 100}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 830160000.},
-    /* GFLOPS 3.319 x 1 = 3.319 */ {{3, 3}, {{1, 128, 75, 75}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3319200000.},
     /* GFLOPS 1.598 x 2 = 3.195 */ {{3, 3}, {{1, 32, 416, 416}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1597652992.},
-    /* GFLOPS 1.598 x 2 = 3.195 */ {{3, 3}, {{1, 32, 208, 208}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1597652992.},
     /* GFLOPS 1.596 x 2 = 3.193 */ {{3, 3}, {{1, 64, 208, 208}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1596268544.},
-    /* GFLOPS 1.596 x 2 = 3.193 */ {{3, 3}, {{1, 64, 104, 104}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1596268544.},
     /* GFLOPS 1.596 x 2 = 3.191 */ {{3, 3}, {{1, 128, 104, 104}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595576320.},
-    /* GFLOPS 1.595 x 2 = 3.190 */ {{3, 3}, {{1, 256, 52, 52}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595230208.},
     /* GFLOPS 1.595 x 2 = 3.190 */ {{3, 3}, {{1, 512, 26, 26}}, 1024, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595057152.},
-    /* GFLOPS 0.178 x 16 = 2.841 */ {{1, 1}, {{1, 256, 52, 52}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 177555456.},
     /* GFLOPS 2.719 x 1 = 2.719 */ {{3, 3}, {{1, 96, 256, 256}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2719481856.},
-    /* GFLOPS 0.177 x 15 = 2.659 */ {{1, 1}, {{1, 1024, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 177295872.},
-    /* GFLOPS 1.245 x 2 = 2.490 */ {{3, 3}, {{1, 96, 75, 100}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1244880000.},
-    /* GFLOPS 0.798 x 3 = 2.394 */ {{3, 3}, {{1, 64, 104, 104}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 798134272.},
-    /* GFLOPS 0.472 x 5 = 2.360 */ {{3, 3}, {{1, 256, 20, 20}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 471961600.},
-    /* GFLOPS 2.255 x 1 = 2.255 */ {{3, 3}, {{1, 128, 80, 100}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2255285760.},
-    /* GFLOPS 2.153 x 1 = 2.153 */ {{3, 3}, {{1, 128, 78, 98}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2152611840.},
-    /* GFLOPS 2.100 x 1 = 2.100 */ {{3, 3}, {{1, 144, 75, 75}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2100330000.},
-    /* GFLOPS 2.052 x 1 = 2.052 */ {{3, 3}, {{1, 128, 76, 96}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2052298240.},
-    /* GFLOPS 1.022 x 2 = 2.044 */ {{3, 3}, {{1, 576, 19, 19}}, 273, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1021896057.},
     /* GFLOPS 1.995 x 1 = 1.995 */ {{9, 9}, {{1, 3, 320, 400}}, 32, 1, {1, 1}, {1, 1}, {4, 4}, {0, 0}, "", true, 1994752000.},
-    /* GFLOPS 1.954 x 1 = 1.954 */ {{3, 3}, {{1, 128, 74, 94}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1954344960.},
-    /* GFLOPS 0.958 x 2 = 1.917 */ {{3, 3}, {{1, 192, 38, 38}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 958446336.},
-    /* GFLOPS 1.888 x 1 = 1.888 */ {{3, 3}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1887539200.},
-    /* GFLOPS 1.888 x 1 = 1.888 */ {{3, 3}, {{1, 1024, 10, 10}}, 1024, 1024, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1887539200.},
-    /* GFLOPS 1.859 x 1 = 1.859 */ {{3, 3}, {{1, 128, 72, 92}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1858752000.},
-    /* GFLOPS 1.766 x 1 = 1.766 */ {{3, 3}, {{1, 128, 70, 90}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1765519360.},
-    /* GFLOPS 1.704 x 1 = 1.704 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 256, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1703781376.},
-    /* GFLOPS 1.704 x 1 = 1.704 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1703781376.},
-    /* GFLOPS 1.675 x 1 = 1.675 */ {{3, 3}, {{1, 128, 68, 88}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1674647040.},
-    /* GFLOPS 1.660 x 1 = 1.660 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 128, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1659600000.},
-    /* GFLOPS 1.660 x 1 = 1.660 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1659600000.},
-    /* GFLOPS 1.586 x 1 = 1.586 */ {{3, 3}, {{1, 128, 66, 86}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1586135040.},
-    /* GFLOPS 1.500 x 1 = 1.500 */ {{3, 3}, {{1, 128, 64, 84}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1499983360.},
-    /* GFLOPS 1.416 x 1 = 1.416 */ {{3, 3}, {{1, 128, 62, 82}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1416192000.},
-    /* GFLOPS 0.472 x 3 = 1.416 */ {{3, 3}, {{1, 128, 40, 40}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 472064000.},
-    /* GFLOPS 0.472 x 3 = 1.416 */ {{3, 3}, {{1, 512, 10, 10}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 471910400.},
-    /* GFLOPS 0.280 x 5 = 1.402 */ {{1, 1}, {{1, 576, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 280409600.},
-    /* GFLOPS 0.701 x 2 = 1.401 */ {{3, 3}, {{1, 128, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 700720000.},
-    /* GFLOPS 0.231 x 6 = 1.388 */ {{3, 3}, {{1, 128, 56, 56}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231311360.},
-    /* GFLOPS 0.231 x 6 = 1.388 */ {{3, 3}, {{1, 256, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231261184.},
-    /* GFLOPS 0.210 x 6 = 1.262 */ {{1, 1}, {{1, 576, 38, 50}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 210307200.},
-    /* GFLOPS 0.420 x 3 = 1.261 */ {{3, 3}, {{1, 96, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 420492800.},
-    /* GFLOPS 1.261 x 1 = 1.261 */ {{3, 3}, {{1, 192, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1261113600.},
-    /* GFLOPS 1.258 x 1 = 1.258 */ {{3, 3}, {{1, 1280, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1258038600.},
-    /* GFLOPS 1.248 x 1 = 1.248 */ {{3, 3}, {{1, 256, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1248338432.},
-    /* GFLOPS 1.245 x 1 = 1.245 */ {{3, 3}, {{1, 64, 75, 75}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1245240000.},
-    /* GFLOPS 1.210 x 1 = 1.210 */ {{3, 3}, {{1, 32, 256, 256}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1210056704.},
-    /* GFLOPS 1.196 x 1 = 1.196 */ {{3, 3}, {{1, 384, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1196336128.},
+    /* GFLOPS 0.945 x 2 = 1.891 */ {{3, 3}, {{1, 32, 320, 320}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 945356800.},
+    /* GFLOPS 0.945 x 2 = 1.889 */ {{3, 3}, {{1, 64, 160, 160}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 944537600.},
+    /* GFLOPS 0.944 x 2 = 1.888 */ {{3, 3}, {{1, 128, 80, 80}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 944128000.},
+    /* GFLOPS 0.944 x 2 = 1.888 */ {{3, 3}, {{1, 256, 40, 40}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 943923200.},
     /* GFLOPS 1.195 x 1 = 1.195 */ {{9, 9}, {{1, 32, 240, 320}}, 3, 1, {1, 1}, {1, 1}, {4, 4}, {0, 0}, "", true, 1194624000.},
     /* GFLOPS 1.182 x 1 = 1.182 */ {{3, 3}, {{1, 32, 320, 400}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 1181696000.},
     /* GFLOPS 1.181 x 1 = 1.181 */ {{3, 3}, {{1, 64, 160, 200}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 1180672000.},
-    /* GFLOPS 0.561 x 2 = 1.121 */ {{3, 3}, {{1, 128, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 560576000.},
-    /* GFLOPS 1.112 x 1 = 1.112 */ {{3, 3}, {{1, 512, 10, 10}}, 1206, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1111570200.},
-    /* GFLOPS 0.357 x 3 = 1.072 */ {{1, 1}, {{1, 64, 208, 208}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 357187584.},
     /* GFLOPS 1.062 x 1 = 1.062 */ {{3, 3}, {{1, 240, 64, 64}}, 240, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1061928960.},
-    /* GFLOPS 0.076 x 14 = 1.058 */ {{3, 3}, {{1, 64, 32, 32}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 75563008.},
-    /* GFLOPS 1.051 x 1 = 1.051 */ {{3, 3}, {{1, 160, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1050988800.},
-    /* GFLOPS 0.210 x 5 = 1.051 */ {{1, 1}, {{1, 256, 20, 20}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 210124800.},
-    /* GFLOPS 0.210 x 5 = 1.049 */ {{1, 1}, {{1, 1024, 20, 20}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209817600.},
-    /* GFLOPS 1.006 x 1 = 1.006 */ {{3, 3}, {{1, 1024, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1006441800.},
-    /* GFLOPS 0.246 x 4 = 0.985 */ {{1, 1}, {{1, 256, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 246240000.},
-    /* GFLOPS 0.189 x 5 = 0.947 */ {{1, 1}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 189452800.},
-    /* GFLOPS 0.189 x 5 = 0.947 */ {{1, 1}, {{1, 512, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 189452800.},
-    /* GFLOPS 0.472 x 2 = 0.945 */ {{3, 3}, {{1, 64, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 472268800.},
-    /* GFLOPS 0.934 x 1 = 0.934 */ {{3, 3}, {{1, 96, 150, 150}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 933660000.},
-    /* GFLOPS 0.231 x 4 = 0.925 */ {{3, 3}, {{1, 128, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231311360.},
+    /* GFLOPS 0.237 x 4 = 0.947 */ {{3, 3}, {{1, 16, 320, 320}}, 32, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 236748800.},
+    /* GFLOPS 0.236 x 4 = 0.945 */ {{3, 3}, {{1, 32, 160, 160}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 236339200.},
+    /* GFLOPS 0.236 x 4 = 0.945 */ {{3, 3}, {{1, 64, 80, 80}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 236134400.},
     /* GFLOPS 0.896 x 1 = 0.896 */ {{5, 5}, {{1, 96, 27, 27}}, 256, 2, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 895981824.},
-    /* GFLOPS 0.089 x 10 = 0.890 */ {{1, 1}, {{1, 128, 52, 52}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 88950784.},
-    /* GFLOPS 0.089 x 10 = 0.888 */ {{1, 1}, {{1, 256, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 88777728.},
-    /* GFLOPS 0.876 x 1 = 0.876 */ {{3, 3}, {{1, 160, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 875824000.},
     /* GFLOPS 0.850 x 1 = 0.850 */ {{7, 7}, {{1, 3, 600, 800}}, 24, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 849600000.},
-    /* GFLOPS 0.841 x 1 = 0.841 */ {{3, 3}, {{1, 128, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 840864000.},
-    /* GFLOPS 0.415 x 2 = 0.831 */ {{3, 3}, {{1, 32, 150, 150}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 415440000.},
-    /* GFLOPS 0.757 x 1 = 0.757 */ {{1, 1}, {{1, 1024, 19, 19}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 757441536.},
+    /* GFLOPS 0.356 x 2 = 0.711 */ {{6, 6}, {{1, 3, 640, 640}}, 16, 1, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 355532800.},
+    /* GFLOPS 0.701 x 1 = 0.701 */ {{3, 3}, {{1, 128, 75, 100}}, 160, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 700720000.},
+    /* GFLOPS 0.483 x 1 = 0.483 */ {{7, 7}, {{1, 3, 320, 320}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", false, 483328000.},
+    /* GFLOPS 0.472 x 1 = 0.472 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 471910400.},
+    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 426037760.},
+    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 425945344.},
+    /* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 64, 150, 150}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 415080000.},
+    /* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 32, 208, 208}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 399413248.},
+    /* GFLOPS 0.090 x 4 = 0.360 */ {{3, 3}, {{1, 3, 640, 640}}, 16, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 90112000.},
+    /* GFLOPS 0.170 x 2 = 0.340 */ {{3, 3}, {{1, 64, 96, 96}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 170016768.},
+    /* GFLOPS 0.315 x 1 = 0.315 */ {{3, 3}, {{1, 96, 75, 100}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 315369600.},
+    /* GFLOPS 0.240 x 1 = 0.240 */ {{3, 3}, {{1, 192, 38, 38}}, 192, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 239611584.},
+    /* GFLOPS 0.237 x 1 = 0.237 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", false, 236830720.},
+    /* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 128, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 213018880.},
+    /* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 128, 19, 19}}, 256, 1, {1, 1}, {2, 2}, {2, 2}, {0, 0}, "", false, 213018880.},
+    /* GFLOPS 0.212 x 1 = 0.212 */ {{7, 7}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 212400000.},
+    /* GFLOPS 0.211 x 1 = 0.211 */ {{11, 11}, {{1, 3, 227, 227}}, 96, 1, {4, 4}, {1, 1}, {0, 0}, {0, 0}, "", true, 211120800.},
+    /* GFLOPS 0.159 x 1 = 0.159 */ {{7, 7}, {{1, 3, 300, 300}}, 24, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 159300000.},
+    /* GFLOPS 0.133 x 1 = 0.133 */ {{3, 3}, {{1, 128, 38, 38}}, 160, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 133136800.},
+    /* GFLOPS 0.120 x 1 = 0.120 */ {{5, 5}, {{1, 32, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 120497664.},
+    /* GFLOPS 0.060 x 2 = 0.119 */ {{3, 3}, {{1, 3, 736, 736}}, 8, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 59586560.},
+    /* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 64, 80, 80}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 118067200.},
+    /* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 128, 40, 40}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 118016000.},
+    /* GFLOPS 0.115 x 1 = 0.115 */ {{3, 3}, {{1, 3, 512, 512}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 115343360.},
+    /* GFLOPS 0.107 x 1 = 0.107 */ {{3, 3}, {{1, 32, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 106648064.},
+    /* GFLOPS 0.050 x 2 = 0.101 */ {{2, 2}, {{1, 512, 2, 25}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 50343936.},
+    /* GFLOPS 0.044 x 2 = 0.087 */ {{5, 5}, {{1, 3, 192, 192}}, 32, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 43608800.},
+    /* GFLOPS 0.042 x 2 = 0.085 */ {{3, 3}, {{1, 128, 48, 48}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 42485760.},
+    /* GFLOPS 0.021 x 4 = 0.084 */ {{5, 1}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {2, 0}, {0, 0}, "", false, 21037056.},
+    /* GFLOPS 0.021 x 4 = 0.084 */ {{1, 5}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {0, 2}, {0, 0}, "", true, 21037056.},
+    /* GFLOPS 0.076 x 1 = 0.076 */ {{3, 3}, {{1, 3, 416, 416}}, 32, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 76144640.},
+    /* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {8, 8}, {8, 8}, {0, 0}, "", true, 37814272.},
+    /* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {4, 4}, {4, 4}, {0, 0}, "", true, 37814272.},
+    /* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {2, 2}, {2, 2}, {0, 0}, "", true, 37814272.},
+    /* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {16, 16}, {16, 16}, {0, 0}, "", true, 37814272.},
+    /* GFLOPS 0.032 x 2 = 0.065 */ {{3, 3}, {{1, 3, 192, 192}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 32440320.},
+    /* GFLOPS 0.060 x 1 = 0.060 */ {{3, 3}, {{1, 96, 38, 38}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 59920224.},
+    /* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 58995200.},
+    /* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 3, 227, 227}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", true, 44946880.},
+    /* GFLOPS 0.044 x 1 = 0.044 */ {{3, 3}, {{1, 128, 19, 19}}, 192, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44256000.},
+    /* GFLOPS 0.043 x 1 = 0.043 */ {{7, 7}, {{1, 3, 96, 96}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 43499520.},
+    /* GFLOPS 0.022 x 2 = 0.043 */ {{3, 3}, {{1, 3, 224, 224}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", true, 21684960.},
+    /* GFLOPS 0.022 x 2 = 0.043 */ {{3, 3}, {{1, 3, 258, 258}}, 24, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", true, 21626880.},
+    /* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 39600000.},
+    /* GFLOPS 0.034 x 1 = 0.034 */ {{2, 2}, {{1, 64, 64, 128}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 33619968.},
+    /* GFLOPS 0.016 x 2 = 0.033 */ {{3, 3}, {{1, 3, 224, 224}}, 24, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", true, 16263720.},
+    /* GFLOPS 0.005 x 6 = 0.032 */ {{3, 3}, {{1, 16, 48, 48}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5326848.},
+    /* GFLOPS 0.005 x 6 = 0.032 */ {{3, 3}, {{1, 32, 24, 24}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5317632.},
+    /* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 24, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 15065344.},
+    /* GFLOPS 0.029 x 1 = 0.029 */ {{3, 3}, {{1, 256, 10, 10}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 29497600.},
+    /* GFLOPS 0.023 x 1 = 0.023 */ {{3, 3}, {{1, 3, 256, 512}}, 13, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 23429120.},
+    /* GFLOPS 0.017 x 1 = 0.017 */ {{2, 2}, {{1, 16, 128, 256}}, 16, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 16908288.},
+    /* GFLOPS 0.003 x 6 = 0.016 */ {{3, 3}, {{1, 16, 48, 48}}, 16, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2663424.},
+    /* GFLOPS 0.015 x 1 = 0.015 */ {{5, 5}, {{1, 48, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 15059072.},
+    /* GFLOPS 0.005 x 2 = 0.011 */ {{3, 3}, {{1, 3, 256, 256}}, 6, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 5406720.},
+    /* GFLOPS 0.005 x 2 = 0.011 */ {{3, 3}, {{1, 6, 128, 128}}, 12, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 5357568.},
+    /* GFLOPS 0.005 x 2 = 0.011 */ {{3, 3}, {{1, 12, 64, 64}}, 24, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 5332992.},
+    /* GFLOPS 0.005 x 2 = 0.011 */ {{3, 3}, {{1, 24, 32, 32}}, 48, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 5320704.},
+    /* GFLOPS 0.003 x 4 = 0.011 */ {{3, 3}, {{1, 16, 24, 24}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2663424.},
+    /* GFLOPS 0.010 x 1 = 0.010 */ {{5, 5}, {{1, 32, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 10041472.},
+    /* GFLOPS 0.008 x 1 = 0.008 */ {{5, 5}, {{1, 16, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 7535808.},
+    /* GFLOPS 0.007 x 1 = 0.007 */ {{3, 3}, {{1, 160, 6, 6}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 6637824.},
+    /* GFLOPS 0.003 x 2 = 0.005 */ {{3, 3}, {{1, 32, 24, 24}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2658816.},
+    /* GFLOPS 0.003 x 2 = 0.005 */ {{3, 3}, {{1, 32, 12, 12}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2658816.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 16, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 3691008.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 32, 6, 6}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 3688704.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 32, 12, 12}}, 64, 1, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 3688704.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 64, 6, 6}}, 128, 1, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 3687552.},
+    /* GFLOPS 0.001 x 2 = 0.003 */ {{3, 3}, {{1, 3, 128, 128}}, 6, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 1351680.},
+    /* GFLOPS 0.001 x 2 = 0.003 */ {{3, 3}, {{1, 6, 64, 64}}, 12, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 1339392.},
+    /* GFLOPS 0.001 x 2 = 0.003 */ {{3, 3}, {{1, 12, 32, 32}}, 24, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 1333248.},
+    /* GFLOPS 0.001 x 2 = 0.003 */ {{3, 3}, {{1, 16, 12, 12}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1331712.},
+    /* GFLOPS 0.001 x 2 = 0.003 */ {{3, 3}, {{1, 24, 16, 16}}, 48, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 1330176.},
+    /* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 2360320.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 3, 3}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1180160.},
+    /* GFLOPS 0.001 x 2 = 0.001 */ {{3, 3}, {{1, 16, 24, 24}}, 16, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 665856.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{2, 2}, {{1, 192, 2, 2}}, 195, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 299715.},
+    /* GFLOPS 0.000 x 2 = 0.000 */ {{2, 2}, {{1, 192, 2, 2}}, 117, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 179829.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 147584.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 73792.},
+    /* GFLOPS 0.000 x 2 = 0.000 */ {{2, 2}, {{1, 192, 2, 2}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1537.},
+};
+
+static const ConvParam_t testConvolution_1x1_Configs[] = {
+    /* GFLOPS 0.280 x 5 = 1.402 */ {{1, 1}, {{1, 576, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 280409600.},
+    /* GFLOPS 0.210 x 6 = 1.262 */ {{1, 1}, {{1, 576, 38, 50}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 210307200.},
+    /* GFLOPS 0.357 x 3 = 1.072 */ {{1, 1}, {{1, 64, 208, 208}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 357187584.},
+    /* GFLOPS 0.246 x 4 = 0.985 */ {{1, 1}, {{1, 256, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 246240000.},
+    /* GFLOPS 0.053 x 18 = 0.947 */ {{1, 1}, {{1, 128, 40, 40}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 52633600.},
     /* GFLOPS 0.712 x 1 = 0.712 */ {{1, 1}, {{1, 128, 208, 208}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 711606272.},
     /* GFLOPS 0.178 x 4 = 0.712 */ {{1, 1}, {{1, 128, 104, 104}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 177901568.},
     /* GFLOPS 0.354 x 2 = 0.707 */ {{1, 1}, {{1, 256, 52, 52}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 353723760.},
     /* GFLOPS 0.351 x 2 = 0.701 */ {{1, 1}, {{1, 576, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 350512000.},
-    /* GFLOPS 0.701 x 1 = 0.701 */ {{3, 3}, {{1, 128, 75, 100}}, 160, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 700720000.},
-    /* GFLOPS 0.694 x 1 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 694235136.},
-    /* GFLOPS 0.694 x 1 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 694235136.},
-    /* GFLOPS 0.231 x 3 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231411712.},
-    /* GFLOPS 0.058 x 12 = 0.694 */ {{3, 3}, {{1, 128, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 57827840.},
-    /* GFLOPS 0.231 x 3 = 0.694 */ {{3, 3}, {{1, 512, 7, 7}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231236096.},
-    /* GFLOPS 0.160 x 4 = 0.639 */ {{3, 3}, {{1, 64, 38, 38}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 159833472.},
     /* GFLOPS 0.211 x 3 = 0.634 */ {{1, 1}, {{1, 64, 80, 80}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 211353600.},
     /* GFLOPS 0.211 x 3 = 0.632 */ {{1, 1}, {{1, 128, 40, 40}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 210534400.},
+    /* GFLOPS 0.105 x 6 = 0.632 */ {{1, 1}, {{1, 128, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 105267200.},
     /* GFLOPS 0.210 x 3 = 0.630 */ {{1, 1}, {{1, 512, 40, 40}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209920000.},
-    /* GFLOPS 0.210 x 3 = 0.630 */ {{1, 1}, {{1, 512, 10, 10}}, 2048, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209920000.},
-    /* GFLOPS 0.103 x 6 = 0.618 */ {{1, 1}, {{1, 256, 14, 14}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102961152.},
     /* GFLOPS 0.615 x 1 = 0.615 */ {{1, 1}, {{1, 320, 75, 100}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 615360000.},
-    /* GFLOPS 0.305 x 2 = 0.609 */ {{3, 3}, {{1, 3, 416, 416}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 304578560.},
-    /* GFLOPS 0.597 x 1 = 0.597 */ {{3, 3}, {{1, 576, 19, 19}}, 576, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 597254400.},
-    /* GFLOPS 0.278 x 2 = 0.557 */ {{1, 1}, {{1, 128, 46, 46}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 278431744.},
+    /* GFLOPS 0.044 x 14 = 0.609 */ {{1, 1}, {{1, 1632, 7, 7}}, 272, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 43515920.},
     /* GFLOPS 0.185 x 3 = 0.554 */ {{1, 1}, {{1, 192, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 184800000.},
-    /* GFLOPS 0.553 x 1 = 0.553 */ {{3, 3}, {{1, 64, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 553440000.},
-    /* GFLOPS 0.539 x 1 = 0.539 */ {{3, 3}, {{1, 144, 75, 75}}, 144, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 539178048.},
-    /* GFLOPS 0.103 x 5 = 0.514 */ {{1, 1}, {{1, 1024, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102810624.},
+    /* GFLOPS 0.266 x 2 = 0.532 */ {{1, 1}, {{1, 240, 48, 48}}, 240, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 265973760.},
     /* GFLOPS 0.491 x 1 = 0.491 */ {{1, 1}, {{1, 576, 38, 50}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 490716800.},
-    /* GFLOPS 0.483 x 1 = 0.483 */ {{7, 7}, {{1, 3, 320, 320}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", false, 483328000.},
-    /* GFLOPS 0.240 x 2 = 0.479 */ {{3, 3}, {{1, 96, 38, 38}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 239680896.},
-    /* GFLOPS 0.477 x 1 = 0.477 */ {{3, 3}, {{1, 3, 368, 368}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 476692480.},
-    /* GFLOPS 0.237 x 2 = 0.474 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 236830720.},
-    /* GFLOPS 0.472 x 1 = 0.472 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 512, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 471910400.},
-    /* GFLOPS 0.472 x 1 = 0.472 */ {{3, 3}, {{1, 512, 19, 19}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 471910400.},
+    /* GFLOPS 0.079 x 6 = 0.473 */ {{1, 1}, {{1, 192, 40, 40}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 78848000.},
+    /* GFLOPS 0.079 x 6 = 0.472 */ {{1, 1}, {{1, 384, 20, 20}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 78745600.},
     /* GFLOPS 0.155 x 3 = 0.464 */ {{1, 1}, {{1, 112, 32, 32}}, 672, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 154828800.},
     /* GFLOPS 0.114 x 4 = 0.454 */ {{1, 1}, {{1, 192, 16, 16}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 113541120.},
-    /* GFLOPS 0.449 x 1 = 0.449 */ {{3, 3}, {{1, 384, 13, 13}}, 384, 2, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 448626048.},
     /* GFLOPS 0.089 x 5 = 0.443 */ {{1, 1}, {{1, 512, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 88691200.},
     /* GFLOPS 0.428 x 1 = 0.428 */ {{1, 1}, {{1, 64, 64, 64}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 427991040.},
-    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 128, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 426037760.},
-    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 426037760.},
-    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 38, 38}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 426037760.},
-    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 256, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 425945344.},
-    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 425945344.},
-    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 425945344.},
+    /* GFLOPS 0.053 x 8 = 0.426 */ {{1, 1}, {{1, 32, 160, 160}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 53248000.},
+    /* GFLOPS 0.211 x 2 = 0.423 */ {{1, 1}, {{1, 64, 160, 160}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 211353600.},
+    /* GFLOPS 0.106 x 4 = 0.423 */ {{1, 1}, {{1, 64, 160, 160}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 105676800.},
     /* GFLOPS 0.421 x 1 = 0.421 */ {{1, 1}, {{1, 576, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 420614400.},
+    /* GFLOPS 0.211 x 2 = 0.421 */ {{1, 1}, {{1, 64, 80, 80}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 210528000.},
     /* GFLOPS 0.420 x 1 = 0.420 */ {{1, 1}, {{1, 256, 40, 40}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 420249600.},
-    /* GFLOPS 0.210 x 2 = 0.420 */ {{1, 1}, {{1, 256, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 210124800.},
-    /* GFLOPS 0.420 x 1 = 0.420 */ {{1, 1}, {{1, 512, 20, 20}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 419840000.},
     /* GFLOPS 0.420 x 1 = 0.420 */ {{1, 1}, {{1, 1024, 10, 10}}, 2048, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 419635200.},
-    /* GFLOPS 0.210 x 2 = 0.420 */ {{1, 1}, {{1, 2048, 10, 10}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209766400.},
-    /* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 32, 150, 150}}, 32, 32, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 415440000.},
-    /* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 64, 150, 150}}, 64, 64, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 415080000.},
-    /* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 64, 150, 150}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 415080000.},
-    /* GFLOPS 0.104 x 4 = 0.414 */ {{1, 1}, {{1, 64, 56, 56}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 103563264.},
-    /* GFLOPS 0.103 x 4 = 0.413 */ {{1, 1}, {{1, 128, 28, 28}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 103161856.},
-    /* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 32, 208, 208}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 399413248.},
-    /* GFLOPS 0.200 x 2 = 0.399 */ {{3, 3}, {{1, 32, 104, 104}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 199706624.},
-    /* GFLOPS 0.200 x 2 = 0.399 */ {{3, 3}, {{1, 64, 52, 52}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 199533568.},
-    /* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 128, 52, 52}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 398894080.},
-    /* GFLOPS 0.199 x 2 = 0.399 */ {{3, 3}, {{1, 128, 26, 26}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 199447040.},
-    /* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 256, 26, 26}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 398807552.},
-    /* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 256, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 398807552.},
+    /* GFLOPS 0.210 x 2 = 0.420 */ {{1, 1}, {{1, 256, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 210124800.},
     /* GFLOPS 0.376 x 1 = 0.376 */ {{1, 1}, {{1, 24, 300, 400}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 376320000.},
     /* GFLOPS 0.179 x 2 = 0.357 */ {{1, 1}, {{1, 64, 208, 208}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 178593792.},
     /* GFLOPS 0.089 x 4 = 0.357 */ {{1, 1}, {{1, 64, 104, 104}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 89296896.},
     /* GFLOPS 0.356 x 1 = 0.356 */ {{1, 1}, {{1, 128, 104, 104}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 355803136.},
-    /* GFLOPS 0.355 x 1 = 0.355 */ {{1, 1}, {{1, 256, 52, 52}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 355110912.},
-    /* GFLOPS 0.355 x 1 = 0.355 */ {{1, 1}, {{1, 512, 26, 26}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 354764800.},
-    /* GFLOPS 0.355 x 1 = 0.355 */ {{1, 1}, {{1, 1024, 13, 13}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 354591744.},
-    /* GFLOPS 0.355 x 1 = 0.355 */ {{1, 1}, {{1, 2048, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 354505216.},
-    /* GFLOPS 0.177 x 2 = 0.353 */ {{1, 1}, {{1, 512, 26, 26}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 176689500.},
-    /* GFLOPS 0.070 x 5 = 0.348 */ {{1, 1}, {{1, 128, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 69607936.},
-    /* GFLOPS 0.347 x 1 = 0.347 */ {{3, 3}, {{1, 128, 28, 28}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 346967040.},
-    /* GFLOPS 0.347 x 1 = 0.347 */ {{3, 3}, {{1, 128, 28, 28}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 346967040.},
-    /* GFLOPS 0.014 x 24 = 0.347 */ {{3, 3}, {{1, 128, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 14456960.},
     /* GFLOPS 0.113 x 3 = 0.340 */ {{1, 1}, {{1, 1152, 16, 16}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 113295360.},
-    /* GFLOPS 0.053 x 6 = 0.320 */ {{1, 1}, {{1, 576, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 53277824.},
-    /* GFLOPS 0.319 x 1 = 0.319 */ {{3, 3}, {{1, 192, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 319482112.},
-    /* GFLOPS 0.317 x 1 = 0.317 */ {{3, 3}, {{1, 3, 300, 300}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 316800000.},
-    /* GFLOPS 0.315 x 1 = 0.315 */ {{3, 3}, {{1, 96, 75, 100}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 315369600.},
-    /* GFLOPS 0.103 x 3 = 0.309 */ {{1, 1}, {{1, 512, 7, 7}}, 2048, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102860800.},
-    /* GFLOPS 0.103 x 3 = 0.309 */ {{1, 1}, {{1, 512, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102860800.},
+    /* GFLOPS 0.080 x 4 = 0.321 */ {{1, 1}, {{1, 56, 46, 46}}, 336, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 80340288.},
+    /* GFLOPS 0.158 x 2 = 0.315 */ {{1, 1}, {{1, 192, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 157696000.},
+    /* GFLOPS 0.157 x 2 = 0.315 */ {{1, 1}, {{1, 384, 40, 40}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 157491200.},
     /* GFLOPS 0.154 x 2 = 0.309 */ {{1, 1}, {{1, 672, 32, 32}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 154255360.},
+    /* GFLOPS 0.103 x 3 = 0.309 */ {{1, 1}, {{1, 512, 7, 7}}, 2048, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102860800.},
     /* GFLOPS 0.308 x 1 = 0.308 */ {{1, 1}, {{1, 320, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 307680000.},
     /* GFLOPS 0.034 x 9 = 0.304 */ {{1, 1}, {{1, 64, 64, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 33816576.},
-    /* GFLOPS 0.299 x 1 = 0.299 */ {{3, 3}, {{1, 256, 13, 13}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 299105664.},
-    /* GFLOPS 0.299 x 1 = 0.299 */ {{3, 3}, {{1, 384, 13, 13}}, 256, 2, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 299084032.},
     /* GFLOPS 0.017 x 17 = 0.290 */ {{1, 1}, {{1, 32, 32, 64}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 17039360.},
     /* GFLOPS 0.017 x 16 = 0.269 */ {{1, 1}, {{1, 128, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16842752.},
-    /* GFLOPS 0.133 x 2 = 0.266 */ {{3, 3}, {{1, 128, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 133136800.},
-    /* GFLOPS 0.266 x 1 = 0.266 */ {{1, 1}, {{1, 384, 52, 52}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 266160128.},
     /* GFLOPS 0.266 x 1 = 0.266 */ {{1, 1}, {{1, 768, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 265987072.},
-    /* GFLOPS 0.038 x 7 = 0.265 */ {{3, 3}, {{1, 16, 64, 128}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 37879808.},
-    /* GFLOPS 0.019 x 14 = 0.264 */ {{3, 3}, {{1, 64, 16, 16}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 18890752.},
+    /* GFLOPS 0.132 x 2 = 0.263 */ {{1, 1}, {{1, 128, 80, 80}}, 80, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 131584000.},
+    /* GFLOPS 0.026 x 10 = 0.263 */ {{1, 1}, {{1, 128, 40, 40}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 26316800.},
     /* GFLOPS 0.262 x 1 = 0.262 */ {{1, 1}, {{1, 2560, 20, 20}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 262195200.},
-    /* GFLOPS 0.126 x 2 = 0.252 */ {{3, 3}, {{1, 512, 5, 5}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 125812050.},
     /* GFLOPS 0.248 x 1 = 0.248 */ {{1, 1}, {{1, 64, 150, 200}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 247680000.},
-    /* GFLOPS 0.040 x 6 = 0.240 */ {{1, 1}, {{1, 576, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39958368.},
-    /* GFLOPS 0.080 x 3 = 0.240 */ {{3, 3}, {{1, 96, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 79893632.},
-    /* GFLOPS 0.240 x 1 = 0.240 */ {{3, 3}, {{1, 192, 38, 38}}, 192, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 239611584.},
-    /* GFLOPS 0.240 x 1 = 0.240 */ {{3, 3}, {{1, 192, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 239611584.},
+    /* GFLOPS 0.041 x 6 = 0.245 */ {{1, 1}, {{1, 80, 23, 23}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 40881120.},
     /* GFLOPS 0.079 x 3 = 0.237 */ {{1, 1}, {{1, 80, 32, 32}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 79134720.},
-    /* GFLOPS 0.237 x 1 = 0.237 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", false, 236830720.},
-    /* GFLOPS 0.237 x 1 = 0.237 */ {{7, 7}, {{1, 3, 224, 224}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 236830720.},
-    /* GFLOPS 0.118 x 2 = 0.236 */ {{3, 3}, {{1, 32, 80, 80}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 118169600.},
-    /* GFLOPS 0.236 x 1 = 0.236 */ {{3, 3}, {{1, 256, 19, 19}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 235980800.},
     /* GFLOPS 0.116 x 2 = 0.231 */ {{1, 1}, {{1, 24, 128, 128}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 115605504.},
-    /* GFLOPS 0.111 x 2 = 0.221 */ {{3, 3}, {{1, 192, 10, 10}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110624000.},
-    /* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 128, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 213018880.},
-    /* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 128, 19, 19}}, 256, 1, {1, 1}, {2, 2}, {2, 2}, {0, 0}, "", false, 213018880.},
-    /* GFLOPS 0.107 x 2 = 0.213 */ {{3, 3}, {{1, 128, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 106509440.},
-    /* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 256, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 212972672.},
-    /* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 512, 38, 38}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 212949568.},
-    /* GFLOPS 0.212 x 1 = 0.212 */ {{7, 7}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 212400000.},
-    /* GFLOPS 0.211 x 1 = 0.211 */ {{11, 11}, {{1, 3, 227, 227}}, 96, 1, {4, 4}, {1, 1}, {0, 0}, {0, 0}, "", true, 211120800.},
-    /* GFLOPS 0.210 x 1 = 0.210 */ {{3, 3}, {{1, 64, 38, 50}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 210307200.},
-    /* GFLOPS 0.210 x 1 = 0.210 */ {{1, 1}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 209817600.},
-    /* GFLOPS 0.210 x 1 = 0.210 */ {{1, 1}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 209817600.},
-    /* GFLOPS 0.104 x 2 = 0.208 */ {{3, 3}, {{1, 32, 75, 75}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 103860000.},
+    /* GFLOPS 0.107 x 2 = 0.215 */ {{1, 1}, {{1, 16, 184, 184}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 107255808.},
+    /* GFLOPS 0.106 x 2 = 0.213 */ {{1, 1}, {{1, 32, 160, 160}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 106496000.},
+    /* GFLOPS 0.105 x 2 = 0.210 */ {{1, 1}, {{1, 128, 40, 40}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 104856000.},
     /* GFLOPS 0.208 x 1 = 0.208 */ {{1, 1}, {{1, 16, 256, 256}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 207618048.},
     /* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 256, 56, 56}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 205922304.},
-    /* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 256, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 205922304.},
-    /* GFLOPS 0.103 x 2 = 0.206 */ {{1, 1}, {{1, 256, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102961152.},
-    /* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 512, 28, 28}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 205721600.},
     /* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 512, 28, 28}}, 1024, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 205721600.},
-    /* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 1024, 14, 14}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 205621248.},
     /* GFLOPS 0.206 x 1 = 0.206 */ {{1, 1}, {{1, 1024, 14, 14}}, 2048, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 205621248.},
+    /* GFLOPS 0.103 x 2 = 0.206 */ {{1, 1}, {{1, 1024, 7, 7}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102810624.},
     /* GFLOPS 0.103 x 2 = 0.206 */ {{1, 1}, {{1, 2048, 7, 7}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 102785536.},
     /* GFLOPS 0.201 x 1 = 0.201 */ {{1, 1}, {{1, 512, 14, 14}}, 1000, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 200900000.},
-    /* GFLOPS 0.200 x 1 = 0.200 */ {{3, 3}, {{1, 160, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 199687872.},
     /* GFLOPS 0.190 x 1 = 0.190 */ {{1, 1}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 189637632.},
-    /* GFLOPS 0.190 x 1 = 0.190 */ {{1, 1}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 189637632.},
     /* GFLOPS 0.047 x 4 = 0.190 */ {{1, 1}, {{1, 256, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 47409408.},
-    /* GFLOPS 0.189 x 1 = 0.189 */ {{1, 1}, {{1, 1024, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 189360384.},
-    /* GFLOPS 0.038 x 5 = 0.189 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 37814272.},
     /* GFLOPS 0.189 x 1 = 0.189 */ {{1, 1}, {{1, 1152, 16, 16}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 188825600.},
     /* GFLOPS 0.185 x 1 = 0.185 */ {{1, 1}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 185040000.},
-    /* GFLOPS 0.185 x 1 = 0.185 */ {{1, 1}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 185040000.},
-    /* GFLOPS 0.181 x 1 = 0.181 */ {{3, 3}, {{1, 160, 14, 14}}, 320, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 180696320.},
-    /* GFLOPS 0.181 x 1 = 0.181 */ {{3, 3}, {{1, 160, 14, 14}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 180696320.},
-    /* GFLOPS 0.090 x 2 = 0.181 */ {{3, 3}, {{1, 224, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 90339200.},
     /* GFLOPS 0.180 x 1 = 0.180 */ {{1, 1}, {{1, 224, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 180232192.},
+    /* GFLOPS 0.045 x 4 = 0.179 */ {{1, 1}, {{1, 16, 184, 184}}, 40, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 44689920.},
+    /* GFLOPS 0.089 x 2 = 0.177 */ {{1, 1}, {{1, 24, 112, 112}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 88510464.},
     /* GFLOPS 0.088 x 2 = 0.177 */ {{1, 1}, {{1, 1024, 13, 13}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 88301655.},
-    /* GFLOPS 0.174 x 1 = 0.174 */ {{3, 3}, {{1, 96, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 173508608.},
-    /* GFLOPS 0.174 x 1 = 0.174 */ {{3, 3}, {{1, 96, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 173508608.},
-    /* GFLOPS 0.166 x 1 = 0.166 */ {{3, 3}, {{1, 160, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 166406560.},
-    /* GFLOPS 0.080 x 2 = 0.160 */ {{1, 1}, {{1, 576, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 79916736.},
-    /* GFLOPS 0.160 x 1 = 0.160 */ {{3, 3}, {{1, 128, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 159764160.},
-    /* GFLOPS 0.160 x 1 = 0.160 */ {{3, 3}, {{1, 1024, 19, 19}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 159703512.},
-    /* GFLOPS 0.159 x 1 = 0.159 */ {{7, 7}, {{1, 3, 300, 300}}, 24, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 159300000.},
+    /* GFLOPS 0.041 x 4 = 0.163 */ {{1, 1}, {{1, 480, 23, 23}}, 80, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 40669520.},
+    /* GFLOPS 0.080 x 2 = 0.159 */ {{1, 1}, {{1, 336, 46, 46}}, 56, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 79747808.},
     /* GFLOPS 0.080 x 2 = 0.159 */ {{1, 1}, {{1, 40, 64, 64}}, 240, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 79626240.},
+    /* GFLOPS 0.079 x 2 = 0.159 */ {{1, 1}, {{1, 48, 160, 160}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 79462400.},
+    /* GFLOPS 0.079 x 2 = 0.158 */ {{1, 1}, {{1, 96, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 79052800.},
     /* GFLOPS 0.079 x 2 = 0.157 */ {{1, 1}, {{1, 480, 32, 32}}, 80, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 78725120.},
-    /* GFLOPS 0.155 x 1 = 0.155 */ {{1, 1}, {{1, 192, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 154542080.},
-    /* GFLOPS 0.146 x 1 = 0.146 */ {{3, 3}, {{1, 144, 14, 14}}, 288, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 146369664.},
-    /* GFLOPS 0.146 x 1 = 0.146 */ {{3, 3}, {{1, 144, 14, 14}}, 288, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 146369664.},
+    /* GFLOPS 0.074 x 2 = 0.147 */ {{1, 1}, {{1, 8, 368, 368}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 73670656.},
     /* GFLOPS 0.072 x 2 = 0.144 */ {{1, 1}, {{1, 1024, 10, 10}}, 352, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 72124800.},
+    /* GFLOPS 0.072 x 2 = 0.143 */ {{1, 1}, {{1, 1632, 7, 7}}, 448, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 71673280.},
     /* GFLOPS 0.140 x 1 = 0.140 */ {{1, 1}, {{1, 576, 38, 50}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 140204800.},
-    /* GFLOPS 0.139 x 1 = 0.139 */ {{3, 3}, {{1, 256, 5, 5}}, 1206, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 138961350.},
     /* GFLOPS 0.017 x 8 = 0.138 */ {{1, 1}, {{1, 16, 64, 128}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 17301504.},
-    /* GFLOPS 0.067 x 2 = 0.133 */ {{1, 1}, {{1, 576, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 66597280.},
-    /* GFLOPS 0.133 x 1 = 0.133 */ {{3, 3}, {{1, 128, 38, 38}}, 160, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 133136800.},
     /* GFLOPS 0.044 x 3 = 0.133 */ {{1, 1}, {{1, 512, 13, 13}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 44345600.},
     /* GFLOPS 0.129 x 1 = 0.129 */ {{1, 1}, {{1, 160, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 128851968.},
-    /* GFLOPS 0.128 x 1 = 0.128 */ {{3, 3}, {{1, 64, 24, 24}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 127512576.},
-    /* GFLOPS 0.120 x 1 = 0.120 */ {{5, 5}, {{1, 32, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 120497664.},
-    /* GFLOPS 0.120 x 1 = 0.120 */ {{5, 5}, {{1, 32, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 120497664.},
-    /* GFLOPS 0.040 x 3 = 0.120 */ {{1, 1}, {{1, 96, 19, 19}}, 576, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 40131648.},
     /* GFLOPS 0.118 x 1 = 0.118 */ {{1, 1}, {{1, 320, 38, 38}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 118477312.},
-    /* GFLOPS 0.017 x 7 = 0.118 */ {{1, 1}, {{1, 64, 64, 128}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16908288.},
-    /* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 64, 80, 80}}, 64, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 118067200.},
-    /* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 64, 40, 40}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 118067200.},
     /* GFLOPS 0.039 x 3 = 0.118 */ {{1, 1}, {{1, 1024, 10, 10}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39340800.},
-    /* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 128, 40, 40}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 118016000.},
-    /* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 128, 20, 20}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 118016000.},
-    /* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 256, 20, 20}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 117990400.},
-    /* GFLOPS 0.118 x 1 = 0.118 */ {{3, 3}, {{1, 256, 19, 19}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 117990400.},
-    /* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 16, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 58003456.},
-    /* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 32, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 57903104.},
-    /* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 64, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 57852928.},
-    /* GFLOPS 0.116 x 1 = 0.116 */ {{3, 3}, {{1, 128, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 115655680.},
-    /* GFLOPS 0.116 x 1 = 0.116 */ {{3, 3}, {{1, 128, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 115655680.},
-    /* GFLOPS 0.115 x 1 = 0.115 */ {{3, 3}, {{1, 3, 512, 512}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 115343360.},
+    /* GFLOPS 0.017 x 7 = 0.118 */ {{1, 1}, {{1, 64, 64, 128}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16908288.},
+    /* GFLOPS 0.019 x 6 = 0.115 */ {{1, 1}, {{1, 32, 96, 96}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 19169280.},
     /* GFLOPS 0.114 x 1 = 0.114 */ {{1, 1}, {{1, 144, 128, 128}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 113639424.},
+    /* GFLOPS 0.057 x 2 = 0.114 */ {{1, 1}, {{1, 240, 46, 46}}, 56, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 56996576.},
+    /* GFLOPS 0.056 x 2 = 0.113 */ {{1, 1}, {{1, 448, 7, 7}}, 1280, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 56259840.},
     /* GFLOPS 0.112 x 1 = 0.112 */ {{1, 1}, {{1, 1024, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 111875400.},
     /* GFLOPS 0.110 x 1 = 0.110 */ {{1, 1}, {{1, 480, 32, 32}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 110215168.},
+    /* GFLOPS 0.054 x 2 = 0.108 */ {{1, 1}, {{1, 16, 320, 320}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 54067200.},
     /* GFLOPS 0.107 x 1 = 0.107 */ {{1, 1}, {{1, 64, 32, 32}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 106997760.},
     /* GFLOPS 0.036 x 3 = 0.107 */ {{1, 1}, {{1, 192, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 35580160.},
-    /* GFLOPS 0.107 x 1 = 0.107 */ {{3, 3}, {{1, 32, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 106648064.},
-    /* GFLOPS 0.107 x 1 = 0.107 */ {{3, 3}, {{1, 64, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 106555648.},
+    /* GFLOPS 0.027 x 4 = 0.106 */ {{1, 1}, {{1, 32, 160, 160}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 26624000.},
+    /* GFLOPS 0.027 x 4 = 0.106 */ {{1, 1}, {{1, 24, 92, 92}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 26543104.},
+    /* GFLOPS 0.026 x 4 = 0.106 */ {{1, 1}, {{1, 64, 80, 80}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 26419200.},
     /* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 256, 40, 40}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 105062400.},
-    /* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 512, 20, 20}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 104960000.},
-    /* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 512, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 104960000.},
-    /* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 512, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 104960000.},
     /* GFLOPS 0.105 x 1 = 0.105 */ {{1, 1}, {{1, 1024, 10, 10}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 104908800.},
-    /* GFLOPS 0.103 x 1 = 0.103 */ {{1, 1}, {{1, 128, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 103161856.},
-    /* GFLOPS 0.051 x 2 = 0.103 */ {{1, 1}, {{1, 256, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 51480576.},
-    /* GFLOPS 0.051 x 2 = 0.103 */ {{1, 1}, {{1, 256, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 51480576.},
-    /* GFLOPS 0.008 x 12 = 0.101 */ {{1, 1}, {{1, 64, 32, 32}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 8454144.},
+    /* GFLOPS 0.052 x 2 = 0.105 */ {{1, 1}, {{1, 256, 20, 20}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 52326000.},
+    /* GFLOPS 0.026 x 4 = 0.105 */ {{1, 1}, {{1, 64, 92, 92}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 26204544.},
+    /* GFLOPS 0.052 x 2 = 0.104 */ {{1, 1}, {{1, 32, 112, 112}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 52183040.},
+    /* GFLOPS 0.051 x 2 = 0.103 */ {{1, 1}, {{1, 512, 7, 7}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 51430400.},
     /* GFLOPS 0.101 x 1 = 0.101 */ {{1, 1}, {{1, 512, 19, 19}}, 273, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 101016825.},
-    /* GFLOPS 0.096 x 1 = 0.096 */ {{1, 1}, {{1, 480, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 96438272.},
+    /* GFLOPS 0.008 x 12 = 0.101 */ {{1, 1}, {{1, 64, 32, 32}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 8454144.},
+    /* GFLOPS 0.050 x 2 = 0.100 */ {{1, 1}, {{1, 24, 92, 92}}, 120, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 49768320.},
     /* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 128, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 95003648.},
-    /* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 128, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 95003648.},
-    /* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 256, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 94818816.},
-    /* GFLOPS 0.095 x 1 = 0.095 */ {{1, 1}, {{1, 256, 19, 19}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 94818816.},
     /* GFLOPS 0.094 x 1 = 0.094 */ {{1, 1}, {{1, 32, 150, 150}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 93600000.},
-    /* GFLOPS 0.094 x 1 = 0.094 */ {{1, 1}, {{1, 32, 150, 150}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 93600000.},
     /* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 512, 38, 50}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 93480000.},
     /* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 576, 19, 19}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 93236192.},
     /* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 64, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 92880000.},
-    /* GFLOPS 0.093 x 1 = 0.093 */ {{1, 1}, {{1, 64, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 92880000.},
-    /* GFLOPS 0.031 x 3 = 0.092 */ {{1, 1}, {{1, 160, 10, 10}}, 960, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 30816000.},
     /* GFLOPS 0.092 x 1 = 0.092 */ {{1, 1}, {{1, 192, 75, 100}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 92400000.},
-    /* GFLOPS 0.090 x 1 = 0.090 */ {{1, 1}, {{1, 448, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 90015744.},
-    /* GFLOPS 0.045 x 2 = 0.090 */ {{3, 3}, {{1, 576, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44918508.},
-    /* GFLOPS 0.044 x 2 = 0.089 */ {{1, 1}, {{1, 256, 26, 26}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 44388864.},
-    /* GFLOPS 0.089 x 1 = 0.089 */ {{3, 3}, {{1, 112, 14, 14}}, 224, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 88554368.},
-    /* GFLOPS 0.089 x 1 = 0.089 */ {{3, 3}, {{1, 112, 14, 14}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 88554368.},
-    /* GFLOPS 0.088 x 1 = 0.088 */ {{1, 1}, {{1, 256, 26, 26}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 88430940.},
-    /* GFLOPS 0.021 x 4 = 0.084 */ {{5, 1}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {2, 0}, {0, 0}, "", false, 21037056.},
-    /* GFLOPS 0.021 x 4 = 0.084 */ {{1, 5}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {0, 2}, {0, 0}, "", true, 21037056.},
-    /* GFLOPS 0.084 x 1 = 0.084 */ {{1, 1}, {{1, 416, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 83593216.},
+    /* GFLOPS 0.031 x 3 = 0.092 */ {{1, 1}, {{1, 160, 10, 10}}, 960, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 30816000.},
+    /* GFLOPS 0.044 x 2 = 0.088 */ {{1, 1}, {{1, 40, 184, 184}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 43877376.},
+    /* GFLOPS 0.044 x 2 = 0.087 */ {{1, 1}, {{1, 272, 7, 7}}, 1632, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 43582560.},
+    /* GFLOPS 0.042 x 2 = 0.084 */ {{1, 1}, {{1, 672, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 42179200.},
     /* GFLOPS 0.082 x 1 = 0.082 */ {{1, 1}, {{1, 320, 10, 10}}, 1280, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 82048000.},
-    /* GFLOPS 0.040 x 2 = 0.080 */ {{1, 1}, {{1, 576, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39958368.},
-    /* GFLOPS 0.040 x 2 = 0.079 */ {{1, 1}, {{1, 24, 75, 75}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39690000.},
-    /* GFLOPS 0.040 x 2 = 0.079 */ {{3, 3}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39600000.},
+    /* GFLOPS 0.041 x 2 = 0.082 */ {{1, 1}, {{1, 40, 46, 46}}, 240, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 41135040.},
+    /* GFLOPS 0.040 x 2 = 0.080 */ {{1, 1}, {{1, 24, 92, 92}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 39814656.},
+    /* GFLOPS 0.013 x 6 = 0.080 */ {{1, 1}, {{1, 32, 80, 80}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 13312000.},
     /* GFLOPS 0.079 x 1 = 0.079 */ {{1, 1}, {{1, 240, 64, 64}}, 40, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 78807040.},
     /* GFLOPS 0.079 x 1 = 0.079 */ {{1, 1}, {{1, 384, 40, 40}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 78745600.},
+    /* GFLOPS 0.040 x 2 = 0.079 */ {{1, 1}, {{1, 24, 75, 75}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39690000.},
     /* GFLOPS 0.077 x 1 = 0.077 */ {{1, 1}, {{1, 96, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 77471744.},
-    /* GFLOPS 0.077 x 1 = 0.077 */ {{3, 3}, {{1, 192, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 77436800.},
-    /* GFLOPS 0.077 x 1 = 0.077 */ {{1, 1}, {{1, 384, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 77170688.},
-    /* GFLOPS 0.076 x 1 = 0.076 */ {{3, 3}, {{1, 3, 416, 416}}, 32, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 76144640.},
     /* GFLOPS 0.076 x 1 = 0.076 */ {{1, 1}, {{1, 96, 128, 128}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 75890688.},
-    /* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {8, 8}, {8, 8}, {0, 0}, "", true, 37814272.},
-    /* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {4, 4}, {4, 4}, {0, 0}, "", true, 37814272.},
-    /* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {2, 2}, {2, 2}, {0, 0}, "", true, 37814272.},
-    /* GFLOPS 0.038 x 2 = 0.076 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {16, 16}, {16, 16}, {0, 0}, "", true, 37814272.},
-    /* GFLOPS 0.018 x 4 = 0.072 */ {{1, 1}, {{1, 64, 19, 19}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 17882496.},
+    /* GFLOPS 0.038 x 2 = 0.076 */ {{1, 1}, {{1, 64, 48, 48}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 38043648.},
+    /* GFLOPS 0.018 x 4 = 0.074 */ {{1, 1}, {{1, 8, 368, 368}}, 8, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 18417664.},
     /* GFLOPS 0.071 x 1 = 0.071 */ {{1, 1}, {{1, 16, 150, 150}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 71280000.},
-    /* GFLOPS 0.071 x 1 = 0.071 */ {{1, 1}, {{1, 352, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 70748160.},
     /* GFLOPS 0.071 x 1 = 0.071 */ {{1, 1}, {{1, 24, 150, 150}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 70560000.},
-    /* GFLOPS 0.070 x 1 = 0.070 */ {{3, 3}, {{1, 96, 14, 14}}, 208, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 70487872.},
-    /* GFLOPS 0.069 x 1 = 0.069 */ {{3, 3}, {{1, 96, 14, 14}}, 204, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 69132336.},
     /* GFLOPS 0.068 x 1 = 0.068 */ {{1, 1}, {{1, 32, 256, 256}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 68157440.},
-    /* GFLOPS 0.005 x 14 = 0.066 */ {{3, 3}, {{1, 64, 8, 8}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 4722688.},
     /* GFLOPS 0.066 x 1 = 0.066 */ {{1, 1}, {{1, 672, 16, 16}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 66109440.},
     /* GFLOPS 0.066 x 1 = 0.066 */ {{1, 1}, {{1, 1280, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 65561600.},
-    /* GFLOPS 0.033 x 2 = 0.065 */ {{3, 3}, {{1, 48, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 32551680.},
-    /* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 192, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 65046912.},
-    /* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 192, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 65046912.},
-    /* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 160, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 64534400.},
-    /* GFLOPS 0.064 x 1 = 0.064 */ {{1, 1}, {{1, 320, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 64325632.},
-    /* GFLOPS 0.032 x 2 = 0.064 */ {{3, 3}, {{1, 96, 12, 12}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 31868928.},
+    /* GFLOPS 0.033 x 2 = 0.066 */ {{1, 1}, {{1, 128, 40, 40}}, 80, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 32896000.},
+    /* GFLOPS 0.016 x 4 = 0.066 */ {{1, 1}, {{1, 40, 46, 46}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 16454016.},
+    /* GFLOPS 0.016 x 4 = 0.065 */ {{1, 1}, {{1, 96, 46, 46}}, 40, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 16335520.},
     /* GFLOPS 0.061 x 1 = 0.061 */ {{1, 1}, {{1, 960, 10, 10}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 61472000.},
-    /* GFLOPS 0.031 x 2 = 0.061 */ {{1, 1}, {{1, 960, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 30736000.},
     /* GFLOPS 0.061 x 1 = 0.061 */ {{1, 1}, {{1, 512, 46, 46}}, 28, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 60729200.},
-    /* GFLOPS 0.060 x 1 = 0.060 */ {{3, 3}, {{1, 96, 38, 38}}, 96, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 59920224.},
+    /* GFLOPS 0.031 x 2 = 0.061 */ {{1, 1}, {{1, 960, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 30736000.},
     /* GFLOPS 0.059 x 1 = 0.059 */ {{1, 1}, {{1, 320, 38, 38}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 59238656.},
-    /* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 128, 19, 19}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 59008000.},
-    /* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 58995200.},
-    /* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 58995200.},
-    /* GFLOPS 0.059 x 1 = 0.059 */ {{3, 3}, {{1, 256, 10, 10}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 58995200.},
-    /* GFLOPS 0.058 x 1 = 0.058 */ {{1, 1}, {{1, 288, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 57903104.},
-    /* GFLOPS 0.004 x 16 = 0.058 */ {{3, 3}, {{1, 128, 7, 7}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 3614240.},
-    /* GFLOPS 0.055 x 1 = 0.055 */ {{3, 3}, {{1, 1280, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 55298400.},
+    /* GFLOPS 0.007 x 8 = 0.059 */ {{1, 1}, {{1, 112, 7, 7}}, 672, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7408800.},
+    /* GFLOPS 0.010 x 6 = 0.058 */ {{1, 1}, {{1, 56, 16, 16}}, 336, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9719808.},
+    /* GFLOPS 0.010 x 6 = 0.058 */ {{1, 1}, {{1, 64, 14, 14}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9709056.},
+    /* GFLOPS 0.028 x 2 = 0.057 */ {{1, 1}, {{1, 336, 23, 23}}, 80, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 28481360.},
+    /* GFLOPS 0.007 x 8 = 0.057 */ {{1, 1}, {{1, 96, 8, 8}}, 576, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7114752.},
+    /* GFLOPS 0.027 x 2 = 0.054 */ {{1, 1}, {{1, 16, 160, 160}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 27033600.},
     /* GFLOPS 0.018 x 3 = 0.054 */ {{1, 1}, {{1, 32, 38, 38}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 18021120.},
-    /* GFLOPS 0.018 x 3 = 0.053 */ {{1, 1}, {{1, 384, 19, 19}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 17766976.},
-    /* GFLOPS 0.053 x 1 = 0.053 */ {{3, 3}, {{1, 128, 38, 38}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 53254720.},
+    /* GFLOPS 0.014 x 4 = 0.054 */ {{1, 1}, {{1, 16, 160, 160}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 13516800.},
     /* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 528, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 53036032.},
-    /* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 528, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 53036032.},
-    /* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 64, 80, 80}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52838400.},
     /* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 64, 40, 40}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52838400.},
     /* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 128, 80, 80}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52633600.},
     /* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 128, 20, 20}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52633600.},
     /* GFLOPS 0.053 x 1 = 0.053 */ {{1, 1}, {{1, 256, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52531200.},
-    /* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 52454400.},
-    /* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 52454400.},
-    /* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 52454400.},
+    /* GFLOPS 0.026 x 2 = 0.053 */ {{1, 1}, {{1, 16, 112, 112}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 26492928.},
+    /* GFLOPS 0.013 x 4 = 0.053 */ {{1, 1}, {{1, 128, 20, 20}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 13158400.},
     /* GFLOPS 0.026 x 2 = 0.052 */ {{1, 1}, {{1, 1024, 10, 10}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 26227200.},
-    /* GFLOPS 0.052 x 1 = 0.052 */ {{1, 1}, {{1, 64, 56, 56}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 51781632.},
+    /* GFLOPS 0.013 x 4 = 0.052 */ {{1, 1}, {{1, 16, 64, 64}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12976128.},
     /* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 256, 56, 56}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 51480576.},
-    /* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 256, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 51480576.},
     /* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 512, 28, 28}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 51430400.},
-    /* GFLOPS 0.026 x 2 = 0.051 */ {{1, 1}, {{1, 512, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25715200.},
-    /* GFLOPS 0.026 x 2 = 0.051 */ {{1, 1}, {{1, 512, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 25715200.},
-    /* GFLOPS 0.013 x 4 = 0.051 */ {{1, 1}, {{1, 512, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12857600.},
     /* GFLOPS 0.051 x 1 = 0.051 */ {{1, 1}, {{1, 1024, 14, 14}}, 512, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 51405312.},
-    /* GFLOPS 0.050 x 1 = 0.050 */ {{1, 1}, {{1, 992, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 49799680.},
-    /* GFLOPS 0.048 x 1 = 0.048 */ {{1, 1}, {{1, 960, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 48194048.},
-    /* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 256, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 47409408.},
+    /* GFLOPS 0.026 x 2 = 0.051 */ {{1, 1}, {{1, 960, 7, 7}}, 272, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25603088.},
     /* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 144, 64, 64}}, 40, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 47349760.},
     /* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 512, 38, 50}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 46740000.},
-    /* GFLOPS 0.047 x 1 = 0.047 */ {{1, 1}, {{1, 928, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 46588416.},
-    /* GFLOPS 0.046 x 1 = 0.046 */ {{1, 1}, {{1, 64, 75, 75}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 46440000.},
-    /* GFLOPS 0.023 x 2 = 0.045 */ {{3, 3}, {{1, 256, 3, 3}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 22648626.},
-    /* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 160, 7, 7}}, 320, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 45174080.},
-    /* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 160, 7, 7}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 45174080.},
+    /* GFLOPS 0.023 x 2 = 0.046 */ {{1, 1}, {{1, 56, 46, 46}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 22954368.},
     /* GFLOPS 0.045 x 1 = 0.045 */ {{1, 1}, {{1, 224, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 45058048.},
-    /* GFLOPS 0.023 x 2 = 0.045 */ {{1, 1}, {{1, 512, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 22500800.},
-    /* GFLOPS 0.045 x 1 = 0.045 */ {{1, 1}, {{1, 896, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 44982784.},
-    /* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 3, 227, 227}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", true, 44946880.},
-    /* GFLOPS 0.044 x 1 = 0.044 */ {{3, 3}, {{1, 128, 19, 19}}, 192, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44256000.},
-    /* GFLOPS 0.044 x 1 = 0.044 */ {{3, 3}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44239200.},
     /* GFLOPS 0.044 x 1 = 0.044 */ {{1, 1}, {{1, 512, 13, 13}}, 255, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 44172375.},
-    /* GFLOPS 0.043 x 1 = 0.043 */ {{7, 7}, {{1, 3, 96, 96}}, 64, 1, {2, 2}, {1, 1}, {3, 3}, {0, 0}, "", true, 43499520.},
-    /* GFLOPS 0.043 x 1 = 0.043 */ {{1, 1}, {{1, 864, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 43377152.},
-    /* GFLOPS 0.042 x 1 = 0.042 */ {{1, 1}, {{1, 832, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 41771520.},
-    /* GFLOPS 0.040 x 1 = 0.040 */ {{5, 5}, {{1, 32, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 40165888.},
-    /* GFLOPS 0.040 x 1 = 0.040 */ {{5, 5}, {{1, 32, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 40165888.},
-    /* GFLOPS 0.040 x 1 = 0.040 */ {{1, 1}, {{1, 800, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 40165888.},
-    /* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 64, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39958368.},
-    /* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 256, 19, 19}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 39932376.},
-    /* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 3, 300, 300}}, 32, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 39600000.},
+    /* GFLOPS 0.007 x 6 = 0.044 */ {{1, 1}, {{1, 672, 7, 7}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7381360.},
+    /* GFLOPS 0.007 x 6 = 0.043 */ {{1, 1}, {{1, 576, 8, 8}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7084032.},
+    /* GFLOPS 0.020 x 2 = 0.041 */ {{1, 1}, {{1, 120, 46, 46}}, 40, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 20398240.},
+    /* GFLOPS 0.010 x 4 = 0.040 */ {{1, 1}, {{1, 16, 56, 56}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9934848.},
     /* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 240, 32, 32}}, 80, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39403520.},
     /* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 144, 75, 75}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 39015000.},
     /* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 192, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 38635520.},
-    /* GFLOPS 0.039 x 1 = 0.039 */ {{1, 1}, {{1, 768, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 38560256.},
-    /* GFLOPS 0.037 x 1 = 0.037 */ {{1, 1}, {{1, 736, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 36954624.},
+    /* GFLOPS 0.020 x 2 = 0.039 */ {{1, 1}, {{1, 32, 112, 112}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 19568640.},
+    /* GFLOPS 0.010 x 4 = 0.039 */ {{1, 1}, {{1, 336, 16, 16}}, 56, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9648128.},
+    /* GFLOPS 0.019 x 2 = 0.038 */ {{1, 1}, {{1, 32, 48, 48}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 19169280.},
+    /* GFLOPS 0.005 x 8 = 0.038 */ {{1, 1}, {{1, 256, 6, 6}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4727808.},
     /* GFLOPS 0.036 x 1 = 0.036 */ {{1, 1}, {{1, 480, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 36164352.},
-    /* GFLOPS 0.036 x 1 = 0.036 */ {{1, 1}, {{1, 480, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 36164352.},
-    /* GFLOPS 0.018 x 2 = 0.036 */ {{1, 1}, {{1, 192, 38, 38}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 17790080.},
-    /* GFLOPS 0.035 x 1 = 0.035 */ {{1, 1}, {{1, 704, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 35348992.},
+    /* GFLOPS 0.018 x 2 = 0.036 */ {{1, 1}, {{1, 40, 46, 46}}, 104, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 17825184.},
+    /* GFLOPS 0.009 x 4 = 0.036 */ {{1, 1}, {{1, 8, 256, 256}}, 8, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 8912896.},
     /* GFLOPS 0.035 x 1 = 0.035 */ {{1, 1}, {{1, 512, 46, 46}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 34702400.},
-    /* GFLOPS 0.034 x 1 = 0.034 */ {{1, 1}, {{1, 672, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 33743360.},
+    /* GFLOPS 0.018 x 2 = 0.035 */ {{1, 1}, {{1, 104, 46, 46}}, 40, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 17689760.},
     /* GFLOPS 0.034 x 1 = 0.034 */ {{1, 1}, {{1, 128, 32, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 33685504.},
-    /* GFLOPS 0.034 x 1 = 0.034 */ {{2, 2}, {{1, 64, 64, 128}}, 32, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 33619968.},
-    /* GFLOPS 0.033 x 1 = 0.033 */ {{3, 3}, {{1, 256, 3, 3}}, 804, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 33350724.},
+    /* GFLOPS 0.017 x 2 = 0.034 */ {{1, 1}, {{1, 192, 28, 28}}, 56, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 16903040.},
     /* GFLOPS 0.033 x 1 = 0.033 */ {{1, 1}, {{1, 528, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 33147520.},
-    /* GFLOPS 0.033 x 1 = 0.033 */ {{1, 1}, {{1, 528, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 33147520.},
     /* GFLOPS 0.033 x 1 = 0.033 */ {{1, 1}, {{1, 1024, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 32784000.},
+    /* GFLOPS 0.016 x 2 = 0.033 */ {{1, 1}, {{1, 40, 92, 92}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 16454016.},
+    /* GFLOPS 0.005 x 6 = 0.033 */ {{1, 1}, {{1, 48, 14, 14}}, 288, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 5475456.},
     /* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 160, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 32212992.},
     /* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 512, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 32144000.},
-    /* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 640, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 32137728.},
     /* GFLOPS 0.032 x 1 = 0.032 */ {{1, 1}, {{1, 508, 14, 14}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 31893120.},
     /* GFLOPS 0.011 x 3 = 0.032 */ {{1, 1}, {{1, 320, 16, 16}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10502144.},
     /* GFLOPS 0.031 x 1 = 0.031 */ {{1, 1}, {{1, 832, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 31328640.},
-    /* GFLOPS 0.031 x 1 = 0.031 */ {{1, 1}, {{1, 832, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 31328640.},
-    /* GFLOPS 0.031 x 1 = 0.031 */ {{1, 1}, {{1, 608, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 30532096.},
     /* GFLOPS 0.015 x 2 = 0.030 */ {{1, 1}, {{1, 128, 46, 46}}, 28, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 15226736.},
-    /* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 24, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 15065344.},
-    /* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 24, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 15065344.},
-    /* GFLOPS 0.015 x 2 = 0.030 */ {{5, 5}, {{1, 48, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 15059072.},
-    /* GFLOPS 0.029 x 1 = 0.029 */ {{3, 3}, {{1, 256, 10, 10}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 29497600.},
-    /* GFLOPS 0.015 x 2 = 0.029 */ {{1, 1}, {{1, 112, 32, 32}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 14745600.},
-    /* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 192, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 28976640.},
-    /* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 192, 28, 28}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 28976640.},
+    /* GFLOPS 0.015 x 2 = 0.030 */ {{1, 1}, {{1, 336, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 14773696.},
+    /* GFLOPS 0.005 x 6 = 0.030 */ {{1, 1}, {{1, 40, 16, 16}}, 240, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4976640.},
     /* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 512, 14, 14}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 28929600.},
-    /* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 512, 14, 14}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 28929600.},
-    /* GFLOPS 0.029 x 1 = 0.029 */ {{1, 1}, {{1, 576, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 28926464.},
-    /* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 544, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 27320832.},
-    /* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 64, 16, 16}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 26749440.},
+    /* GFLOPS 0.015 x 2 = 0.029 */ {{1, 1}, {{1, 112, 32, 32}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 14745600.},
+    /* GFLOPS 0.007 x 4 = 0.029 */ {{1, 1}, {{1, 24, 32, 32}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7225344.},
+    /* GFLOPS 0.014 x 2 = 0.028 */ {{1, 1}, {{1, 576, 8, 8}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 14168064.},
     /* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 384, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 26650464.},
     /* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 576, 19, 19}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 26638912.},
-    /* GFLOPS 0.027 x 1 = 0.027 */ {{3, 3}, {{1, 128, 38, 38}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 26627360.},
-    /* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 528, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 26518016.},
-    /* GFLOPS 0.027 x 1 = 0.027 */ {{1, 1}, {{1, 528, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 26518016.},
-    /* GFLOPS 0.009 x 3 = 0.026 */ {{1, 1}, {{1, 128, 46, 46}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 8700992.},
     /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 96, 75, 75}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 26055000.},
-    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 25890816.},
-    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25890816.},
-    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 64, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 25890816.},
     /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 1024, 10, 10}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25817400.},
-    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 128, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 25790464.},
-    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 256, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 25740288.},
-    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 256, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 25740288.},
-    /* GFLOPS 0.013 x 2 = 0.026 */ {{1, 1}, {{1, 256, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12870144.},
-    /* GFLOPS 0.026 x 1 = 0.026 */ {{1, 1}, {{1, 512, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 25715200.},
     /* GFLOPS 0.013 x 2 = 0.026 */ {{1, 1}, {{1, 512, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12857600.},
-    /* GFLOPS 0.002 x 12 = 0.025 */ {{1, 1}, {{1, 64, 16, 16}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 2113536.},
+    /* GFLOPS 0.009 x 3 = 0.026 */ {{1, 1}, {{1, 128, 46, 46}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 8700992.},
+    /* GFLOPS 0.013 x 2 = 0.025 */ {{1, 1}, {{1, 96, 64, 64}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12648448.},
     /* GFLOPS 0.024 x 1 = 0.024 */ {{1, 1}, {{1, 480, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 24109568.},
     /* GFLOPS 0.024 x 1 = 0.024 */ {{1, 1}, {{1, 128, 38, 38}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 23750912.},
-    /* GFLOPS 0.024 x 1 = 0.024 */ {{1, 1}, {{1, 256, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 23704704.},
-    /* GFLOPS 0.023 x 1 = 0.023 */ {{3, 3}, {{1, 3, 256, 512}}, 13, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 23429120.},
     /* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 32, 150, 150}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 23400000.},
     /* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 512, 19, 19}}, 63, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 23311575.},
     /* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 448, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 22503936.},
     /* GFLOPS 0.023 x 1 = 0.023 */ {{1, 1}, {{1, 512, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 22500800.},
     /* GFLOPS 0.022 x 1 = 0.022 */ {{1, 1}, {{1, 508, 14, 14}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 22325184.},
-    /* GFLOPS 0.022 x 1 = 0.022 */ {{3, 3}, {{1, 512, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 22120800.},
-    /* GFLOPS 0.021 x 1 = 0.021 */ {{3, 3}, {{1, 128, 12, 12}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 21242880.},
+    /* GFLOPS 0.006 x 4 = 0.022 */ {{1, 1}, {{1, 24, 28, 28}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 5531904.},
+    /* GFLOPS 0.005 x 4 = 0.022 */ {{1, 1}, {{1, 288, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 5428416.},
     /* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 40, 64, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 21233664.},
     /* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 416, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 20898304.},
     /* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 20885760.},
-    /* GFLOPS 0.021 x 1 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 20885760.},
-    /* GFLOPS 0.010 x 2 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 10442880.},
-    /* GFLOPS 0.010 x 2 = 0.021 */ {{1, 1}, {{1, 832, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10442880.},
-    /* GFLOPS 0.010 x 2 = 0.020 */ {{3, 3}, {{1, 256, 2, 2}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10066056.},
-    /* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 16, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 20095488.},
-    /* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 16, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 20095488.},
-    /* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 32, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 20082944.},
-    /* GFLOPS 0.020 x 1 = 0.020 */ {{5, 5}, {{1, 32, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 20082944.},
-    /* GFLOPS 0.020 x 1 = 0.020 */ {{3, 3}, {{1, 256, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 19966188.},
-    /* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 192, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 19317760.},
-    /* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 192, 28, 28}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 19317760.},
+    /* GFLOPS 0.010 x 2 = 0.021 */ {{1, 1}, {{1, 32, 64, 64}}, 39, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 10383360.},
+    /* GFLOPS 0.010 x 2 = 0.020 */ {{1, 1}, {{1, 24, 112, 112}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9834496.},
+    /* GFLOPS 0.005 x 4 = 0.020 */ {{1, 1}, {{1, 240, 16, 16}}, 40, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4925440.},
     /* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 384, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 19292672.},
     /* GFLOPS 0.019 x 1 = 0.019 */ {{1, 1}, {{1, 64, 64, 64}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 19021824.},
+    /* GFLOPS 0.010 x 2 = 0.019 */ {{1, 1}, {{1, 96, 56, 56}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9683968.},
+    /* GFLOPS 0.010 x 2 = 0.019 */ {{1, 1}, {{1, 32, 48, 48}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9584640.},
+    /* GFLOPS 0.010 x 2 = 0.019 */ {{1, 1}, {{1, 64, 48, 48}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 9510912.},
     /* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 576, 10, 10}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 18448000.},
     /* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 480, 14, 14}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 18082176.},
-    /* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 480, 14, 14}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 18082176.},
     /* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 192, 38, 38}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 17790080.},
     /* GFLOPS 0.018 x 1 = 0.018 */ {{1, 1}, {{1, 352, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 17687040.},
-    /* GFLOPS 0.017 x 1 = 0.017 */ {{2, 2}, {{1, 16, 128, 256}}, 16, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 16908288.},
-    /* GFLOPS 0.016 x 1 = 0.016 */ {{1, 1}, {{1, 320, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 16081408.},
+    /* GFLOPS 0.009 x 2 = 0.018 */ {{1, 1}, {{1, 8, 128, 128}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 8912896.},
+    /* GFLOPS 0.008 x 2 = 0.017 */ {{1, 1}, {{1, 64, 80, 80}}, 10, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 8256000.},
     /* GFLOPS 0.016 x 1 = 0.016 */ {{1, 1}, {{1, 832, 7, 7}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 15664320.},
-    /* GFLOPS 0.016 x 1 = 0.016 */ {{1, 1}, {{1, 832, 7, 7}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 15664320.},
-    /* GFLOPS 0.015 x 1 = 0.015 */ {{5, 5}, {{1, 48, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 15059072.},
-    /* GFLOPS 0.015 x 1 = 0.015 */ {{5, 5}, {{1, 32, 12, 12}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 14754816.},
-    /* GFLOPS 0.015 x 1 = 0.015 */ {{3, 3}, {{1, 128, 10, 10}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 14752000.},
+    /* GFLOPS 0.008 x 2 = 0.016 */ {{1, 1}, {{1, 128, 20, 20}}, 80, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 8224000.},
+    /* GFLOPS 0.008 x 2 = 0.016 */ {{1, 1}, {{1, 256, 12, 12}}, 108, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7978176.},
     /* GFLOPS 0.014 x 1 = 0.014 */ {{1, 1}, {{1, 288, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 14475776.},
     /* GFLOPS 0.014 x 1 = 0.014 */ {{1, 1}, {{1, 512, 5, 5}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 13991250.},
+    /* GFLOPS 0.007 x 2 = 0.014 */ {{1, 1}, {{1, 288, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7237888.},
+    /* GFLOPS 0.007 x 2 = 0.014 */ {{1, 1}, {{1, 144, 32, 32}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7102464.},
+    /* GFLOPS 0.007 x 2 = 0.014 */ {{1, 1}, {{1, 240, 16, 16}}, 56, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6895616.},
     /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 144, 38, 38}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 13354112.},
-    /* GFLOPS 0.007 x 2 = 0.013 */ {{1, 1}, {{1, 16, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6623232.},
-    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 512, 10, 10}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 13120000.},
     /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 832, 7, 7}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 13053600.},
-    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 832, 7, 7}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 13053600.},
+    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 508, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12757248.},
+    /* GFLOPS 0.007 x 2 = 0.013 */ {{1, 1}, {{1, 16, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6623232.},
+    /* GFLOPS 0.007 x 2 = 0.013 */ {{1, 1}, {{1, 128, 80, 80}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6579200.},
     /* GFLOPS 0.007 x 2 = 0.013 */ {{1, 1}, {{1, 32, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6522880.},
-    /* GFLOPS 0.001 x 11 = 0.013 */ {{3, 3}, {{1, 64, 4, 4}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1180672.},
     /* GFLOPS 0.006 x 2 = 0.013 */ {{1, 1}, {{1, 64, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6472704.},
-    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 128, 56, 56}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12895232.},
-    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 256, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12870144.},
-    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 256, 14, 14}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 12870144.},
-    /* GFLOPS 0.013 x 1 = 0.013 */ {{1, 1}, {{1, 508, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12757248.},
+    /* GFLOPS 0.006 x 2 = 0.013 */ {{1, 1}, {{1, 24, 128, 128}}, 8, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6422528.},
+    /* GFLOPS 0.002 x 6 = 0.013 */ {{1, 1}, {{1, 8, 128, 128}}, 8, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2228224.},
     /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 992, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 12449920.},
     /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 480, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 12054784.},
-    /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 480, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 12054784.},
     /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 960, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 12048512.},
     /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 32, 75, 75}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", false, 12014080.},
-    /* GFLOPS 0.012 x 1 = 0.012 */ {{3, 3}, {{1, 96, 6, 6}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 11950848.},
-    /* GFLOPS 0.006 x 2 = 0.012 */ {{3, 3}, {{1, 96, 3, 3}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 5975424.},
     /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 320, 12, 12}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 11814912.},
     /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 640, 6, 6}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 11805696.},
     /* GFLOPS 0.012 x 1 = 0.012 */ {{1, 1}, {{1, 928, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 11647104.},
     /* GFLOPS 0.011 x 1 = 0.011 */ {{1, 1}, {{1, 896, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 11245696.},
-    /* GFLOPS 0.011 x 1 = 0.011 */ {{1, 1}, {{1, 256, 13, 13}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 11097216.},
-    /* GFLOPS 0.011 x 1 = 0.011 */ {{3, 3}, {{1, 256, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 11061600.},
-    /* GFLOPS 0.006 x 2 = 0.011 */ {{3, 3}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5530200.},
     /* GFLOPS 0.011 x 1 = 0.011 */ {{1, 1}, {{1, 864, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 10844288.},
+    /* GFLOPS 0.005 x 2 = 0.011 */ {{1, 1}, {{1, 144, 28, 28}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 5437824.},
+    /* GFLOPS 0.005 x 2 = 0.011 */ {{1, 1}, {{1, 128, 24, 24}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 5329152.},
     /* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 832, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 10442880.},
-    /* GFLOPS 0.010 x 1 = 0.010 */ {{5, 5}, {{1, 32, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 10041472.},
     /* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 800, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 10041472.},
-    /* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 192, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9658880.},
-    /* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 192, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 9658880.},
     /* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 384, 14, 14}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 9646336.},
-    /* GFLOPS 0.005 x 2 = 0.010 */ {{1, 1}, {{1, 512, 14, 14}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4821600.},
     /* GFLOPS 0.010 x 1 = 0.010 */ {{1, 1}, {{1, 768, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 9640064.},
-    /* GFLOPS 0.010 x 1 = 0.010 */ {{3, 3}, {{1, 4, 128, 256}}, 4, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9568256.},
-    /* GFLOPS 0.005 x 2 = 0.009 */ {{1, 1}, {{1, 4, 128, 256}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4718592.},
     /* GFLOPS 0.009 x 1 = 0.009 */ {{1, 1}, {{1, 736, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 9238656.},
-    /* GFLOPS 0.009 x 1 = 0.009 */ {{1, 1}, {{1, 192, 19, 19}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 8895040.},
     /* GFLOPS 0.009 x 1 = 0.009 */ {{1, 1}, {{1, 704, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8837248.},
+    /* GFLOPS 0.005 x 2 = 0.009 */ {{1, 1}, {{1, 96, 32, 32}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4743168.},
+    /* GFLOPS 0.005 x 2 = 0.009 */ {{1, 1}, {{1, 4, 128, 256}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4718592.},
+    /* GFLOPS 0.004 x 2 = 0.009 */ {{1, 1}, {{1, 16, 64, 64}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4325376.},
+    /* GFLOPS 0.004 x 2 = 0.009 */ {{1, 1}, {{1, 32, 64, 64}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4259840.},
     /* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 672, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8435840.},
     /* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 128, 32, 64}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8421376.},
-    /* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 640, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 8034432.},
-    /* GFLOPS 0.004 x 2 = 0.008 */ {{1, 1}, {{1, 832, 7, 7}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 3916080.},
     /* GFLOPS 0.008 x 1 = 0.008 */ {{1, 1}, {{1, 608, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 7633024.},
-    /* GFLOPS 0.008 x 1 = 0.008 */ {{5, 5}, {{1, 16, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 7535808.},
-    /* GFLOPS 0.008 x 1 = 0.008 */ {{5, 5}, {{1, 16, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 7535808.},
+    /* GFLOPS 0.004 x 2 = 0.008 */ {{1, 1}, {{1, 384, 7, 7}}, 112, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4220272.},
+    /* GFLOPS 0.004 x 2 = 0.008 */ {{1, 1}, {{1, 336, 8, 8}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4134912.},
     /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 640, 6, 6}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7378560.},
-    /* GFLOPS 0.004 x 2 = 0.007 */ {{1, 1}, {{1, 48, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3650304.},
     /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 384, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7234752.},
     /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 576, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 7231616.},
     /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 256, 12, 12}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 7091712.},
     /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 544, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 6830208.},
-    /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 64, 8, 8}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 6687360.},
-    /* GFLOPS 0.007 x 1 = 0.007 */ {{3, 3}, {{1, 160, 6, 6}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 6637824.},
     /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 528, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6629504.},
-    /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 528, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6629504.},
     /* GFLOPS 0.007 x 1 = 0.007 */ {{1, 1}, {{1, 256, 5, 5}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 6566400.},
-    /* GFLOPS 0.003 x 2 = 0.007 */ {{1, 1}, {{1, 512, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 3280000.},
+    /* GFLOPS 0.004 x 2 = 0.007 */ {{1, 1}, {{1, 48, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3650304.},
+    /* GFLOPS 0.003 x 2 = 0.007 */ {{1, 1}, {{1, 64, 80, 80}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3302400.},
     /* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 64, 56, 56}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6472704.},
-    /* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 128, 28, 28}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6447616.},
-    /* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 512, 7, 7}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 6428800.},
     /* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 512, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6428800.},
-    /* GFLOPS 0.006 x 1 = 0.006 */ {{1, 1}, {{1, 512, 14, 14}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6428800.},
-    /* GFLOPS 0.001 x 12 = 0.006 */ {{1, 1}, {{1, 64, 8, 8}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 528384.},
-    /* GFLOPS 0.006 x 1 = 0.006 */ {{3, 3}, {{1, 256, 10, 10}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 5530800.},
+    /* GFLOPS 0.003 x 2 = 0.006 */ {{1, 1}, {{1, 144, 16, 16}}, 40, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2959360.},
     /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 192, 12, 12}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 5322240.},
-    /* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 5310720.},
-    /* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5310720.},
-    /* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 5310720.},
-    /* GFLOPS 0.005 x 1 = 0.005 */ {{3, 3}, {{1, 128, 5, 5}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 5310720.},
     /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4917600.},
-    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4917600.},
-    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 192, 28, 28}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4829440.},
-    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 192, 28, 28}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4829440.},
     /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 256, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4826304.},
-    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 512, 14, 14}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4821600.},
     /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 508, 14, 14}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4783968.},
     /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 64, 32, 32}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 4755456.},
-    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 64, 24, 24}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4755456.},
-    /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 256, 12, 12}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4727808.},
     /* GFLOPS 0.005 x 1 = 0.005 */ {{1, 1}, {{1, 1024, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4720896.},
+    /* GFLOPS 0.003 x 2 = 0.005 */ {{1, 1}, {{1, 144, 14, 14}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2718912.},
+    /* GFLOPS 0.002 x 2 = 0.005 */ {{1, 1}, {{1, 576, 8, 8}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2361344.},
     /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 512, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4440300.},
-    /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 512, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4440300.},
     /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 640, 6, 6}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4427136.},
     /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 16, 128, 256}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4325376.},
     /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 64, 64, 128}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 4227072.},
     /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 832, 7, 7}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3916080.},
-    /* GFLOPS 0.004 x 1 = 0.004 */ {{3, 3}, {{1, 256, 1, 1}}, 804, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3705636.},
-    /* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 16, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 3691008.},
-    /* GFLOPS 0.004 x 1 = 0.004 */ {{3, 3}, {{1, 64, 10, 10}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 3689600.},
-    /* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 32, 6, 6}}, 64, 1, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 3688704.},
-    /* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 32, 12, 12}}, 64, 1, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 3688704.},
-    /* GFLOPS 0.004 x 1 = 0.004 */ {{5, 5}, {{1, 64, 6, 6}}, 128, 1, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 3687552.},
     /* GFLOPS 0.004 x 1 = 0.004 */ {{1, 1}, {{1, 192, 12, 12}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3548160.},
+    /* GFLOPS 0.002 x 2 = 0.004 */ {{1, 1}, {{1, 240, 48, 48}}, 2, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2216448.},
+    /* GFLOPS 0.002 x 2 = 0.004 */ {{1, 1}, {{1, 32, 64, 64}}, 8, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2129920.},
+    /* GFLOPS 0.002 x 2 = 0.004 */ {{1, 1}, {{1, 64, 40, 40}}, 10, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2064000.},
+    /* GFLOPS 0.001 x 6 = 0.004 */ {{1, 1}, {{1, 32, 24, 24}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 599040.},
     /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 736, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3393792.},
-    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 256, 10, 10}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3283200.},
     /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 512, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3280000.},
-    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 512, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 3280000.},
     /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 512, 5, 5}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3228750.},
     /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 480, 14, 14}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 3013696.},
-    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 480, 14, 14}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 3013696.},
     /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 320, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2953728.},
     /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 640, 6, 6}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2951424.},
-    /* GFLOPS 0.003 x 1 = 0.003 */ {{3, 3}, {{1, 256, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 2765400.},
-    /* GFLOPS 0.003 x 1 = 0.003 */ {{3, 3}, {{1, 128, 5, 5}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2655360.},
     /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 832, 7, 7}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2610720.},
-    /* GFLOPS 0.003 x 1 = 0.003 */ {{1, 1}, {{1, 256, 3, 3}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2520882.},
-    /* GFLOPS 0.001 x 2 = 0.003 */ {{3, 3}, {{1, 128, 1, 1}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1258530.},
+    /* GFLOPS 0.002 x 2 = 0.003 */ {{1, 1}, {{1, 128, 80, 80}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1644800.},
+    /* GFLOPS 0.002 x 2 = 0.003 */ {{1, 1}, {{1, 128, 40, 40}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1644800.},
+    /* GFLOPS 0.002 x 2 = 0.003 */ {{1, 1}, {{1, 24, 32, 32}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1605632.},
+    /* GFLOPS 0.001 x 4 = 0.003 */ {{1, 1}, {{1, 64, 80, 80}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 825600.},
     /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 256, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2363904.},
-    /* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 2360320.},
-    /* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2360320.},
-    /* GFLOPS 0.002 x 1 = 0.002 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 2360320.},
     /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 528, 4, 4}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2164736.},
     /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 508, 4, 4}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 2082816.},
     /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 1024, 1, 1}}, 1000, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2049000.},
-    /* GFLOPS 0.001 x 2 = 0.002 */ {{3, 3}, {{1, 256, 3, 3}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 995544.},
-    /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 1024, 3, 3}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1770336.},
     /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 64, 4, 4}}, 810, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 1671840.},
     /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 32, 80, 80}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1664000.},
-    /* GFLOPS 0.002 x 1 = 0.002 */ {{1, 1}, {{1, 256, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1641600.},
+    /* GFLOPS 0.001 x 2 = 0.002 */ {{1, 1}, {{1, 16, 4, 8400}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", false, 1108800.},
+    /* GFLOPS 0.001 x 2 = 0.002 */ {{1, 1}, {{1, 56, 16, 16}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 925696.},
+    /* GFLOPS 0.001 x 2 = 0.002 */ {{1, 1}, {{1, 64, 40, 40}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 825600.},
+    /* GFLOPS 0.001 x 4 = 0.002 */ {{1, 1}, {{1, 64, 12, 12}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 594432.},
+    /* GFLOPS 0.000 x 8 = 0.002 */ {{1, 1}, {{1, 192, 2, 2}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 295680.},
     /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 640, 6, 6}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1475712.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1383000.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 64, 5, 5}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1328256.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 736, 3, 3}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1272672.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 64, 16, 16}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 1188864.},
-    /* GFLOPS 0.000 x 9 = 0.001 */ {{1, 1}, {{1, 64, 4, 4}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 132096.},
-    /* GFLOPS 0.001 x 2 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 590976.},
-    /* GFLOPS 0.001 x 2 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 590976.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 3, 3}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1180160.},
     /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 2, 2}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1120392.},
     /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 192, 12, 12}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 887040.},
-    /* GFLOPS 0.000 x 2 = 0.001 */ {{3, 3}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 442464.},
-    /* GFLOPS 0.000 x 2 = 0.001 */ {{1, 1}, {{1, 32, 80, 80}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 416000.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 5, 5}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 691500.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 256, 3, 3}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 663696.},
     /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 640, 2, 2}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 655872.},
     /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 615000.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 615000.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 128, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 592128.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 590976.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 590080.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 581742.},
-    /* GFLOPS 0.001 x 1 = 0.001 */ {{1, 1}, {{1, 256, 4, 4}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 525312.},
-    /* GFLOPS 0.000 x 4 = 0.000 */ {{1, 1}, {{1, 48, 1, 1}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 111744.},
-    /* GFLOPS 0.000 x 4 = 0.000 */ {{1, 1}, {{1, 1152, 1, 1}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110640.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 5, 5}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 411200.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 3, 3}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 331920.},
+    /* GFLOPS 0.001 x 2 = 0.001 */ {{1, 1}, {{1, 256, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 590976.},
+    /* GFLOPS 0.001 x 2 = 0.001 */ {{1, 1}, {{1, 64, 20, 20}}, 10, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 516000.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{1, 1}, {{1, 256, 12, 12}}, 6, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 443232.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{1, 1}, {{1, 32, 80, 80}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 416000.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{1, 1}, {{1, 128, 40, 40}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 411200.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{1, 1}, {{1, 128, 20, 20}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 411200.},
+    /* GFLOPS 0.000 x 4 = 0.001 */ {{1, 1}, {{1, 64, 12, 12}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 297216.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{1, 1}, {{1, 128, 6, 6}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 296064.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{1, 1}, {{1, 128, 24, 24}}, 2, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 296064.},
+    /* GFLOPS 0.000 x 4 = 0.001 */ {{1, 1}, {{1, 64, 40, 40}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 206400.},
+    /* GFLOPS 0.000 x 9 = 0.001 */ {{1, 1}, {{1, 64, 4, 4}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 132096.},
     /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 192, 5, 5}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 308000.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 64, 8, 8}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 297216.},
     /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 2, 2}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 263168.},
-    /* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 131328.},
     /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 258552.},
+    /* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 64, 20, 20}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 206400.},
     /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 1024, 1, 1}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 196704.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 3, 3}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 165960.},
     /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 3, 3}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 148032.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 147584.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 147584.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 147584.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 128, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 147584.},
+    /* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 128, 6, 6}}, 16, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 148032.},
     /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 736, 1, 1}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 141408.},
     /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 140322.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 131328.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 131328.},
+    /* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 131328.},
+    /* GFLOPS 0.000 x 4 = 0.000 */ {{1, 1}, {{1, 48, 1, 1}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 111744.},
+    /* GFLOPS 0.000 x 4 = 0.000 */ {{1, 1}, {{1, 1152, 1, 1}}, 48, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110640.},
+    /* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 128, 20, 20}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 102800.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 64, 4, 4}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 74304.},
+    /* GFLOPS 0.000 x 4 = 0.000 */ {{1, 1}, {{1, 64, 20, 20}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 51600.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 49248.},
     /* GFLOPS 0.000 x 3 = 0.000 */ {{1, 1}, {{1, 28, 1, 1}}, 672, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 38304.},
     /* GFLOPS 0.000 x 3 = 0.000 */ {{1, 1}, {{1, 672, 1, 1}}, 28, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 37660.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 3, 3}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 110808.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 3, 3}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110808.},
-    /* GFLOPS 0.000 x 2 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 55320.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 64, 4, 4}}, 36, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "VALID", true, 74304.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 2, 2}}, 64, 1, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 73792.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 256, 1, 1}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 73744.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 32382.},
     /* GFLOPS 0.000 x 3 = 0.000 */ {{1, 1}, {{1, 20, 1, 1}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 19680.},
     /* GFLOPS 0.000 x 3 = 0.000 */ {{1, 1}, {{1, 480, 1, 1}}, 20, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 19220.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 49248.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 49248.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 36880.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 126, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 32382.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 18440.},
     /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 64, 1, 1}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 16512.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6168.},
     /* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 10, 1, 1}}, 240, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5040.},
     /* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 240, 1, 1}}, 10, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 4810.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 6168.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6168.},
+    /* GFLOPS 0.000 x 8 = 0.000 */ {{1, 1}, {{1, 24, 1, 1}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4704.},
+    /* GFLOPS 0.000 x 8 = 0.000 */ {{1, 1}, {{1, 96, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4632.},
+    /* GFLOPS 0.000 x 4 = 0.000 */ {{1, 1}, {{1, 4, 16, 16}}, 2, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 4608.},
+    /* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 4, 16, 16}}, 1, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2304.},
     /* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 6, 1, 1}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1872.},
     /* GFLOPS 0.000 x 2 = 0.000 */ {{1, 1}, {{1, 144, 1, 1}}, 6, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1734.},
     /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 4, 1, 1}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 864.},
     /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 96, 1, 1}}, 4, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 772.},
     /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 8, 1, 1}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 544.},
-    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 32, 1, 1}}, 8, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 520.}
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{1, 1}, {{1, 32, 1, 1}}, 8, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 520.},
+    /* GFLOPS 0.000 x 8 = 0.000 */ {{1, 1}, {{1, 6, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 312.},
+    /* GFLOPS 0.000 x 8 = 0.000 */ {{1, 1}, {{1, 24, 1, 1}}, 6, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 294.},
+};
+
+static const ConvParam_t testConvolution_3x3S1D1_Configs[] = {
+    /* GFLOPS 1.596 x 14 = 22.338 */ {{3, 3}, {{1, 128, 52, 52}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595576320.},
+    /* GFLOPS 1.595 x 12 = 19.141 */ {{3, 3}, {{1, 512, 13, 13}}, 1024, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1595057152.},
+    /* GFLOPS 6.814 x 2 = 13.629 */ {{3, 3}, {{1, 512, 38, 38}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6814386176.},
+    /* GFLOPS 6.637 x 2 = 13.274 */ {{3, 3}, {{1, 256, 75, 75}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6636960000.},
+    /* GFLOPS 10.701 x 1 = 10.701 */ {{3, 3}, {{1, 512, 38, 38}}, 804, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 10700715792.},
+    /* GFLOPS 10.087 x 1 = 10.087 */ {{3, 3}, {{1, 576, 38, 50}}, 512, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10086963200.},
+    /* GFLOPS 9.993 x 1 = 9.993 */ {{3, 3}, {{1, 64, 368, 368}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9993207808.},
+    /* GFLOPS 9.989 x 1 = 9.989 */ {{3, 3}, {{1, 128, 184, 184}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9988874240.},
+    /* GFLOPS 4.247 x 2 = 8.494 */ {{3, 3}, {{1, 480, 32, 32}}, 480, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 4247224320.},
+    /* GFLOPS 8.025 x 1 = 8.025 */ {{3, 3}, {{1, 1024, 19, 19}}, 1206, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 8025101478.},
+    /* GFLOPS 6.641 x 1 = 6.641 */ {{3, 3}, {{1, 64, 300, 300}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6641280000.},
+    /* GFLOPS 6.641 x 1 = 6.641 */ {{3, 3}, {{1, 64, 150, 200}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 6641280000.},
+    /* GFLOPS 6.638 x 1 = 6.638 */ {{3, 3}, {{1, 128, 150, 150}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 6638400000.},
+    /* GFLOPS 6.118 x 1 = 6.118 */ {{3, 3}, {{1, 144, 128, 128}}, 144, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 6117654528.},
+    /* GFLOPS 6.116 x 1 = 6.116 */ {{3, 3}, {{1, 1152, 16, 16}}, 1152, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 6115590144.},
+    /* GFLOPS 4.997 x 1 = 4.997 */ {{3, 3}, {{1, 64, 184, 184}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4996603904.},
+    /* GFLOPS 4.993 x 1 = 4.993 */ {{3, 3}, {{1, 512, 46, 46}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4992812032.},
+    /* GFLOPS 3.408 x 1 = 3.408 */ {{3, 3}, {{1, 256, 38, 38}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3407562752.},
+    /* GFLOPS 0.302 x 11 = 3.325 */ {{3, 3}, {{1, 64, 64, 64}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 302252032.},
+    /* GFLOPS 3.321 x 1 = 3.321 */ {{3, 3}, {{1, 64, 150, 150}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3320640000.},
+    /* GFLOPS 0.830 x 4 = 3.321 */ {{3, 3}, {{1, 64, 75, 100}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 830160000.},
+    /* GFLOPS 3.319 x 1 = 3.319 */ {{3, 3}, {{1, 128, 75, 75}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3319200000.},
+    /* GFLOPS 1.598 x 2 = 3.195 */ {{3, 3}, {{1, 32, 208, 208}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1597652992.},
+    /* GFLOPS 1.596 x 2 = 3.193 */ {{3, 3}, {{1, 64, 104, 104}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1596268544.},
+    /* GFLOPS 1.405 x 2 = 2.810 */ {{3, 3}, {{1, 96, 184, 184}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1404888576.},
+    /* GFLOPS 0.798 x 3 = 2.394 */ {{3, 3}, {{1, 64, 104, 104}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 798134272.},
+    /* GFLOPS 2.255 x 1 = 2.255 */ {{3, 3}, {{1, 128, 80, 100}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2255285760.},
+    /* GFLOPS 2.153 x 1 = 2.153 */ {{3, 3}, {{1, 128, 78, 98}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2152611840.},
+    /* GFLOPS 2.052 x 1 = 2.052 */ {{3, 3}, {{1, 128, 76, 96}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 2052298240.},
+    /* GFLOPS 1.022 x 2 = 2.044 */ {{3, 3}, {{1, 576, 19, 19}}, 273, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1021896057.},
+    /* GFLOPS 1.954 x 1 = 1.954 */ {{3, 3}, {{1, 128, 74, 94}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1954344960.},
+    /* GFLOPS 1.888 x 1 = 1.888 */ {{3, 3}, {{1, 1024, 10, 10}}, 1024, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1887539200.},
+    /* GFLOPS 1.859 x 1 = 1.859 */ {{3, 3}, {{1, 128, 72, 92}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1858752000.},
+    /* GFLOPS 1.766 x 1 = 1.766 */ {{3, 3}, {{1, 128, 70, 90}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1765519360.},
+    /* GFLOPS 1.704 x 1 = 1.704 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1703781376.},
+    /* GFLOPS 1.675 x 1 = 1.675 */ {{3, 3}, {{1, 128, 68, 88}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1674647040.},
+    /* GFLOPS 1.660 x 1 = 1.660 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1659600000.},
+    /* GFLOPS 1.586 x 1 = 1.586 */ {{3, 3}, {{1, 128, 66, 86}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1586135040.},
+    /* GFLOPS 1.500 x 1 = 1.500 */ {{3, 3}, {{1, 128, 64, 84}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1499983360.},
+    /* GFLOPS 0.711 x 2 = 1.422 */ {{3, 3}, {{1, 12, 320, 320}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 711065600.},
+    /* GFLOPS 1.416 x 1 = 1.416 */ {{3, 3}, {{1, 128, 62, 82}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 1416192000.},
+    /* GFLOPS 0.701 x 2 = 1.401 */ {{3, 3}, {{1, 128, 38, 50}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 700720000.},
+    /* GFLOPS 0.231 x 6 = 1.388 */ {{3, 3}, {{1, 128, 56, 56}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231311360.},
+    /* GFLOPS 0.231 x 6 = 1.388 */ {{3, 3}, {{1, 256, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231261184.},
+    /* GFLOPS 0.420 x 3 = 1.261 */ {{3, 3}, {{1, 96, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 420492800.},
+    /* GFLOPS 1.258 x 1 = 1.258 */ {{3, 3}, {{1, 1280, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1258038600.},
+    /* GFLOPS 1.248 x 1 = 1.248 */ {{3, 3}, {{1, 256, 46, 46}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1248338432.},
+    /* GFLOPS 1.245 x 1 = 1.245 */ {{3, 3}, {{1, 64, 75, 75}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1245240000.},
+    /* GFLOPS 1.210 x 1 = 1.210 */ {{3, 3}, {{1, 32, 256, 256}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1210056704.},
+    /* GFLOPS 1.196 x 1 = 1.196 */ {{3, 3}, {{1, 384, 26, 26}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 1196336128.},
+    /* GFLOPS 0.590 x 2 = 1.181 */ {{3, 3}, {{1, 64, 80, 80}}, 80, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 590336000.},
+    /* GFLOPS 0.561 x 2 = 1.121 */ {{3, 3}, {{1, 128, 38, 50}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 560576000.},
+    /* GFLOPS 1.112 x 1 = 1.112 */ {{3, 3}, {{1, 512, 10, 10}}, 1206, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1111570200.},
+    /* GFLOPS 0.076 x 14 = 1.058 */ {{3, 3}, {{1, 64, 32, 32}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 75563008.},
+    /* GFLOPS 1.051 x 1 = 1.051 */ {{3, 3}, {{1, 160, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1050988800.},
+    /* GFLOPS 1.006 x 1 = 1.006 */ {{3, 3}, {{1, 1024, 10, 10}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1006441800.},
+    /* GFLOPS 0.473 x 2 = 0.945 */ {{3, 3}, {{1, 32, 160, 160}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 472678400.},
+    /* GFLOPS 0.472 x 2 = 0.944 */ {{3, 3}, {{1, 512, 4, 25}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 471910400.},
+    /* GFLOPS 0.841 x 1 = 0.841 */ {{3, 3}, {{1, 128, 38, 50}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 840864000.},
+    /* GFLOPS 0.415 x 2 = 0.831 */ {{3, 3}, {{1, 32, 150, 150}}, 32, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 415440000.},
+    /* GFLOPS 0.118 x 6 = 0.710 */ {{3, 3}, {{1, 16, 160, 160}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 118374400.},
+    /* GFLOPS 0.351 x 2 = 0.702 */ {{3, 3}, {{1, 96, 92, 92}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 351222144.},
+    /* GFLOPS 0.694 x 1 = 0.694 */ {{3, 3}, {{1, 64, 56, 56}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 694235136.},
+    /* GFLOPS 0.231 x 3 = 0.694 */ {{3, 3}, {{1, 512, 7, 7}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 231236096.},
+    /* GFLOPS 0.160 x 4 = 0.639 */ {{3, 3}, {{1, 64, 38, 38}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 159833472.},
+    /* GFLOPS 0.305 x 2 = 0.609 */ {{3, 3}, {{1, 3, 416, 416}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 304578560.},
+    /* GFLOPS 0.295 x 2 = 0.590 */ {{3, 3}, {{1, 128, 40, 40}}, 80, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 295040000.},
+    /* GFLOPS 0.553 x 1 = 0.553 */ {{3, 3}, {{1, 64, 75, 100}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 553440000.},
+    /* GFLOPS 0.477 x 1 = 0.477 */ {{3, 3}, {{1, 3, 368, 368}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 476692480.},
+    /* GFLOPS 0.236 x 2 = 0.472 */ {{3, 3}, {{1, 128, 40, 40}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 236032000.},
+    /* GFLOPS 0.236 x 2 = 0.472 */ {{3, 3}, {{1, 256, 8, 25}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 235980800.},
+    /* GFLOPS 0.236 x 2 = 0.472 */ {{3, 3}, {{1, 256, 4, 25}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 235980800.},
+    /* GFLOPS 0.449 x 1 = 0.449 */ {{3, 3}, {{1, 384, 13, 13}}, 384, 2, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 448626048.},
+    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 38, 38}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 426037760.},
+    /* GFLOPS 0.399 x 1 = 0.399 */ {{3, 3}, {{1, 256, 13, 13}}, 512, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 398807552.},
+    /* GFLOPS 0.200 x 2 = 0.399 */ {{3, 3}, {{1, 32, 104, 104}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 199706624.},
+    /* GFLOPS 0.319 x 1 = 0.319 */ {{3, 3}, {{1, 192, 19, 19}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 319482112.},
+    /* GFLOPS 0.317 x 1 = 0.317 */ {{3, 3}, {{1, 3, 300, 300}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 316800000.},
+    /* GFLOPS 0.299 x 1 = 0.299 */ {{3, 3}, {{1, 256, 13, 13}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 299105664.},
+    /* GFLOPS 0.299 x 1 = 0.299 */ {{3, 3}, {{1, 384, 13, 13}}, 256, 2, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 299084032.},
+    /* GFLOPS 0.147 x 2 = 0.295 */ {{3, 3}, {{1, 256, 20, 20}}, 80, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 147488000.},
+    /* GFLOPS 0.133 x 2 = 0.266 */ {{3, 3}, {{1, 128, 19, 19}}, 160, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 133136800.},
+    /* GFLOPS 0.038 x 7 = 0.265 */ {{3, 3}, {{1, 16, 64, 128}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 37879808.},
+    /* GFLOPS 0.011 x 24 = 0.256 */ {{3, 3}, {{1, 16, 48, 48}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "SAME", true, 10653696.},
+    /* GFLOPS 0.011 x 24 = 0.255 */ {{3, 3}, {{1, 32, 24, 24}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "SAME", true, 10635264.},
+    /* GFLOPS 0.126 x 2 = 0.252 */ {{3, 3}, {{1, 512, 5, 5}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 125812050.},
+    /* GFLOPS 0.118 x 2 = 0.236 */ {{3, 3}, {{1, 64, 16, 50}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 118067200.},
+    /* GFLOPS 0.118 x 2 = 0.236 */ {{3, 3}, {{1, 128, 8, 25}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 118016000.},
+    /* GFLOPS 0.118 x 2 = 0.236 */ {{3, 3}, {{1, 256, 20, 20}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 117990400.},
+    /* GFLOPS 0.111 x 2 = 0.221 */ {{3, 3}, {{1, 192, 10, 10}}, 320, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 110624000.},
+    /* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 256, 19, 19}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 212972672.},
+    /* GFLOPS 0.213 x 1 = 0.213 */ {{3, 3}, {{1, 512, 38, 38}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 212949568.},
+    /* GFLOPS 0.210 x 1 = 0.210 */ {{3, 3}, {{1, 64, 38, 50}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 210307200.},
+    /* GFLOPS 0.104 x 2 = 0.208 */ {{3, 3}, {{1, 32, 75, 75}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 103860000.},
+    /* GFLOPS 0.200 x 1 = 0.200 */ {{3, 3}, {{1, 160, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 199687872.},
+    /* GFLOPS 0.038 x 5 = 0.189 */ {{3, 3}, {{1, 32, 32, 64}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 37814272.},
+    /* GFLOPS 0.090 x 2 = 0.181 */ {{3, 3}, {{1, 224, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 90339200.},
+    /* GFLOPS 0.088 x 2 = 0.176 */ {{3, 3}, {{1, 96, 46, 46}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 87805536.},
+    /* GFLOPS 0.160 x 1 = 0.160 */ {{3, 3}, {{1, 128, 19, 19}}, 192, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 159764160.},
+    /* GFLOPS 0.146 x 1 = 0.146 */ {{3, 3}, {{1, 144, 14, 14}}, 288, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 146369664.},
+    /* GFLOPS 0.139 x 1 = 0.139 */ {{3, 3}, {{1, 256, 5, 5}}, 1206, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 138961350.},
+    /* GFLOPS 0.128 x 1 = 0.128 */ {{3, 3}, {{1, 64, 24, 24}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 127512576.},
+    /* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 16, 56, 56}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 58003456.},
+    /* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 32, 28, 28}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 57903104.},
+    /* GFLOPS 0.058 x 2 = 0.116 */ {{3, 3}, {{1, 64, 14, 14}}, 256, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 57852928.},
+    /* GFLOPS 0.045 x 2 = 0.090 */ {{3, 3}, {{1, 576, 19, 19}}, 12, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44918508.},
+    /* GFLOPS 0.089 x 1 = 0.089 */ {{3, 3}, {{1, 112, 14, 14}}, 224, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 88554368.},
+    /* GFLOPS 0.043 x 2 = 0.085 */ {{3, 3}, {{1, 32, 48, 48}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "SAME", true, 42541056.},
+    /* GFLOPS 0.011 x 8 = 0.085 */ {{3, 3}, {{1, 128, 6, 6}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "SAME", true, 10621440.},
+    /* GFLOPS 0.077 x 1 = 0.077 */ {{3, 3}, {{1, 192, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 77436800.},
+    /* GFLOPS 0.070 x 1 = 0.070 */ {{3, 3}, {{1, 96, 14, 14}}, 208, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 70487872.},
+    /* GFLOPS 0.069 x 1 = 0.069 */ {{3, 3}, {{1, 96, 14, 14}}, 204, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 69132336.},
+    /* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 192, 7, 7}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 65046912.},
+    /* GFLOPS 0.065 x 1 = 0.065 */ {{3, 3}, {{1, 160, 10, 10}}, 224, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 64534400.},
+    /* GFLOPS 0.033 x 2 = 0.065 */ {{3, 3}, {{1, 48, 14, 14}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 32551680.},
+    /* GFLOPS 0.032 x 2 = 0.064 */ {{3, 3}, {{1, 96, 12, 12}}, 128, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 31868928.},
+    /* GFLOPS 0.004 x 16 = 0.058 */ {{3, 3}, {{1, 128, 7, 7}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 3614240.},
+    /* GFLOPS 0.055 x 1 = 0.055 */ {{3, 3}, {{1, 1280, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 55298400.},
+    /* GFLOPS 0.053 x 1 = 0.053 */ {{3, 3}, {{1, 128, 38, 38}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 53254720.},
+    /* GFLOPS 0.045 x 1 = 0.045 */ {{3, 3}, {{1, 160, 7, 7}}, 320, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 45174080.},
+    /* GFLOPS 0.044 x 1 = 0.044 */ {{3, 3}, {{1, 1024, 10, 10}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 44239200.},
+    /* GFLOPS 0.022 x 2 = 0.044 */ {{3, 3}, {{1, 3, 112, 112}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 22077440.},
+    /* GFLOPS 0.022 x 2 = 0.044 */ {{3, 3}, {{1, 96, 23, 23}}, 24, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 21951384.},
+    /* GFLOPS 0.007 x 6 = 0.043 */ {{3, 3}, {{1, 48, 16, 16}}, 32, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 7086080.},
+    /* GFLOPS 0.040 x 1 = 0.040 */ {{3, 3}, {{1, 64, 19, 19}}, 96, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 39958368.},
+    /* GFLOPS 0.027 x 1 = 0.027 */ {{3, 3}, {{1, 128, 38, 38}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 26627360.},
+    /* GFLOPS 0.010 x 2 = 0.020 */ {{3, 3}, {{1, 256, 2, 2}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 10066056.},
+    /* GFLOPS 0.010 x 2 = 0.019 */ {{3, 3}, {{1, 8, 256, 256}}, 1, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9502720.},
+    /* GFLOPS 0.002 x 6 = 0.014 */ {{3, 3}, {{1, 32, 16, 16}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 2363392.},
+    /* GFLOPS 0.001 x 11 = 0.013 */ {{3, 3}, {{1, 64, 4, 4}}, 64, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", false, 1180672.},
+    /* GFLOPS 0.012 x 1 = 0.012 */ {{3, 3}, {{1, 96, 6, 6}}, 192, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 11950848.},
+    /* GFLOPS 0.006 x 2 = 0.012 */ {{3, 3}, {{1, 96, 3, 3}}, 384, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 5975424.},
+    /* GFLOPS 0.006 x 2 = 0.011 */ {{3, 3}, {{1, 512, 5, 5}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 5530200.},
+    /* GFLOPS 0.010 x 1 = 0.010 */ {{3, 3}, {{1, 4, 128, 256}}, 4, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 9568256.},
+    /* GFLOPS 0.006 x 1 = 0.006 */ {{3, 3}, {{1, 256, 10, 10}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 5530800.},
+    /* GFLOPS 0.004 x 1 = 0.004 */ {{3, 3}, {{1, 256, 1, 1}}, 804, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3705636.},
+    /* GFLOPS 0.001 x 6 = 0.004 */ {{3, 3}, {{1, 16, 16, 16}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 591872.},
+    /* GFLOPS 0.001 x 2 = 0.003 */ {{3, 3}, {{1, 128, 1, 1}}, 546, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 1258530.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 5, 5}}, 12, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 691500.},
+    /* GFLOPS 0.001 x 1 = 0.001 */ {{3, 3}, {{1, 128, 3, 3}}, 256, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 590080.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{3, 3}, {{1, 256, 2, 2}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 442464.},
+    /* GFLOPS 0.000 x 6 = 0.001 */ {{3, 3}, {{1, 8, 16, 16}}, 4, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 148480.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 64, 3, 3}}, 128, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "", true, 147584.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 256, 1, 1}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 73744.},
+    /* GFLOPS 0.000 x 2 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 24, 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, "SAME", true, 55320.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 16, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 36880.},
+    /* GFLOPS 0.000 x 1 = 0.000 */ {{3, 3}, {{1, 128, 1, 1}}, 8, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 18440.},
+};
+
+static const ConvParam_t testConvolution_Depthwise_Configs[] = {
+    /* GFLOPS 6.525 x 14 = 91.357 */ {{5, 5}, {{1, 1632, 7, 7}}, 1632, 1632, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 6525468768.},
+    /* GFLOPS 6.094 x 4 = 24.377 */ {{5, 5}, {{1, 480, 23, 23}}, 480, 480, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 6094333920.},
+    /* GFLOPS 0.925 x 10 = 9.249 */ {{3, 3}, {{1, 512, 14, 14}}, 512, 512, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 924944384.},
+    /* GFLOPS 4.301 x 2 = 8.601 */ {{3, 3}, {{1, 336, 46, 46}}, 336, 336, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 4300693824.},
+    /* GFLOPS 1.734 x 4 = 6.936 */ {{5, 5}, {{1, 64, 92, 92}}, 64, 64, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 1733968896.},
+    /* GFLOPS 1.106 x 6 = 6.638 */ {{5, 5}, {{1, 672, 7, 7}}, 672, 672, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 1106413728.},
+    /* GFLOPS 1.062 x 6 = 6.370 */ {{5, 5}, {{1, 576, 8, 8}}, 576, 576, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 1061720064.},
+    /* GFLOPS 2.986 x 2 = 5.973 */ {{5, 5}, {{1, 336, 46, 46}}, 336, 336, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 2986276944.},
+    /* GFLOPS 1.445 x 4 = 5.781 */ {{5, 5}, {{1, 336, 16, 16}}, 336, 336, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 1445154816.},
+    /* GFLOPS 0.472 x 10 = 4.719 */ {{5, 5}, {{1, 128, 24, 24}}, 128, 128, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 471932928.},
+    /* GFLOPS 2.194 x 2 = 4.389 */ {{3, 3}, {{1, 240, 46, 46}}, 240, 240, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 2194376640.},
+    /* GFLOPS 1.889 x 2 = 3.778 */ {{3, 3}, {{1, 64, 160, 160}}, 64, 64, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1889075200.},
+    /* GFLOPS 1.659 x 2 = 3.318 */ {{5, 5}, {{1, 960, 14, 14}}, 960, 960, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 1658914560.},
+    /* GFLOPS 0.472 x 6 = 2.834 */ {{3, 3}, {{1, 64, 80, 80}}, 64, 64, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 472268800.},
+    /* GFLOPS 0.472 x 6 = 2.832 */ {{5, 5}, {{1, 64, 48, 48}}, 64, 64, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 472006656.},
+    /* GFLOPS 1.344 x 2 = 2.688 */ {{5, 5}, {{1, 192, 56, 56}}, 192, 192, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 1343832768.},
+    /* GFLOPS 0.382 x 6 = 2.293 */ {{3, 3}, {{1, 576, 8, 8}}, 576, 576, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 382242816.},
+    /* GFLOPS 1.130 x 2 = 2.259 */ {{3, 3}, {{1, 144, 112, 112}}, 144, 144, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", true, 1129510800.},
+    /* GFLOPS 1.062 x 2 = 2.124 */ {{5, 5}, {{1, 144, 32, 32}}, 144, 144, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 1061830656.},
+    /* GFLOPS 0.976 x 2 = 1.953 */ {{3, 3}, {{1, 40, 184, 184}}, 40, 40, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 976407040.},
+    /* GFLOPS 0.473 x 4 = 1.891 */ {{3, 3}, {{1, 32, 160, 160}}, 32, 32, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 472678400.},
+    /* GFLOPS 0.925 x 2 = 1.850 */ {{3, 3}, {{1, 128, 56, 56}}, 128, 128, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 925245440.},
+    /* GFLOPS 0.925 x 2 = 1.850 */ {{3, 3}, {{1, 256, 28, 28}}, 256, 256, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 925044736.},
+    /* GFLOPS 0.925 x 2 = 1.850 */ {{3, 3}, {{1, 1024, 7, 7}}, 1024, 1024, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", false, 924894208.},
+    /* GFLOPS 1.704 x 1 = 1.704 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 256, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1703781376.},
+    /* GFLOPS 1.660 x 1 = 1.660 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 128, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1659600000.},
+    /* GFLOPS 0.813 x 2 = 1.626 */ {{5, 5}, {{1, 144, 28, 28}}, 144, 144, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 812964096.},
+    /* GFLOPS 0.813 x 2 = 1.626 */ {{5, 5}, {{1, 288, 14, 14}}, 288, 288, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 812907648.},
+    /* GFLOPS 0.737 x 2 = 1.475 */ {{5, 5}, {{1, 240, 16, 16}}, 240, 240, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 737341440.},
+    /* GFLOPS 0.351 x 4 = 1.405 */ {{3, 3}, {{1, 96, 46, 46}}, 96, 96, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 351222144.},
+    /* GFLOPS 0.680 x 2 = 1.360 */ {{3, 3}, {{1, 96, 64, 64}}, 96, 96, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 679870464.},
+    /* GFLOPS 0.677 x 2 = 1.355 */ {{5, 5}, {{1, 40, 184, 184}}, 40, 40, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 677458560.},
+    /* GFLOPS 0.625 x 2 = 1.250 */ {{3, 3}, {{1, 32, 368, 368}}, 32, 32, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 625117184.},
+    /* GFLOPS 0.293 x 4 = 1.171 */ {{3, 3}, {{1, 288, 14, 14}}, 288, 288, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 292682880.},
+    /* GFLOPS 0.549 x 2 = 1.097 */ {{3, 3}, {{1, 120, 92, 92}}, 120, 120, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 548721120.},
+    /* GFLOPS 0.265 x 4 = 1.062 */ {{3, 3}, {{1, 240, 16, 16}}, 240, 240, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 265482240.},
+    /* GFLOPS 0.473 x 2 = 0.947 */ {{3, 3}, {{1, 16, 320, 320}}, 16, 16, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 473497600.},
+    /* GFLOPS 0.472 x 2 = 0.944 */ {{5, 5}, {{1, 96, 64, 64}}, 96, 96, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 471957504.},
+    /* GFLOPS 0.398 x 2 = 0.797 */ {{3, 3}, {{1, 672, 7, 7}}, 672, 672, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 398330016.},
+    /* GFLOPS 0.361 x 2 = 0.723 */ {{5, 5}, {{1, 336, 16, 16}}, 336, 336, {2, 2}, {1, 1}, {2, 2}, {0, 0}, "", true, 361288704.},
+    /* GFLOPS 0.118 x 6 = 0.708 */ {{3, 3}, {{1, 64, 40, 40}}, 64, 64, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 118067200.},
+    /* GFLOPS 0.118 x 6 = 0.708 */ {{5, 5}, {{1, 256, 6, 6}}, 256, 256, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 117974016.},
+    /* GFLOPS 0.336 x 2 = 0.672 */ {{5, 5}, {{1, 96, 56, 56}}, 96, 96, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 335993184.},
+    /* GFLOPS 0.265 x 2 = 0.531 */ {{5, 5}, {{1, 384, 14, 14}}, 384, 384, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 265434624.},
+    /* GFLOPS 0.472 x 1 = 0.472 */ {{5, 5}, {{1, 32, 96, 96}}, 32, 32, {1, 1}, {1, 1}, {2, 2}, {0, 0}, "", true, 472154112.},
+    /* GFLOPS 0.232 x 2 = 0.463 */ {{3, 3}, {{1, 32, 112, 112}}, 32, 32, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 231612416.},
+    /* GFLOPS 0.231 x 2 = 0.463 */ {{3, 3}, {{1, 64, 112, 112}}, 64, 64, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 231411712.},
+    /* GFLOPS 0.231 x 2 = 0.463 */ {{3, 3}, {{1, 128, 56, 56}}, 128, 128, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 231311360.},
+    /* GFLOPS 0.231 x 2 = 0.463 */ {{3, 3}, {{1, 256, 28, 28}}, 256, 256, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 231261184.},
+    /* GFLOPS 0.231 x 2 = 0.462 */ {{3, 3}, {{1, 512, 14, 14}}, 512, 512, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", false, 231236096.},
+    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 128, 75, 75}}, 128, 128, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 426037760.},
+    /* GFLOPS 0.426 x 1 = 0.426 */ {{3, 3}, {{1, 256, 38, 38}}, 256, 256, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 425945344.},
+    /* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 32, 150, 150}}, 32, 32, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 415440000.},
+    /* GFLOPS 0.415 x 1 = 0.415 */ {{3, 3}, {{1, 64, 150, 150}}, 64, 64, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 415080000.},
+    /* GFLOPS 0.170 x 2 = 0.341 */ {{3, 3}, {{1, 24, 128, 128}}, 24, 24, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 170262528.},
+    /* GFLOPS 0.157 x 2 = 0.314 */ {{3, 3}, {{1, 8, 368, 368}}, 8, 8, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 157091840.},
+    /* GFLOPS 0.076 x 4 = 0.304 */ {{3, 3}, {{1, 8, 256, 256}}, 8, 8, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 76021760.},
+    /* GFLOPS 0.130 x 2 = 0.261 */ {{3, 3}, {{1, 24, 112, 112}}, 24, 24, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 130357248.},
+    /* GFLOPS 0.118 x 2 = 0.237 */ {{3, 3}, {{1, 16, 160, 160}}, 16, 16, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 118374400.},
+    /* GFLOPS 0.113 x 2 = 0.226 */ {{5, 5}, {{1, 32, 96, 96}}, 32, 32, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 113171488.},
+    /* GFLOPS 0.108 x 2 = 0.217 */ {{5, 5}, {{1, 64, 48, 48}}, 64, 64, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 108373056.},
+    /* GFLOPS 0.099 x 2 = 0.198 */ {{5, 5}, {{1, 128, 24, 24}}, 128, 128, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 99138688.},
+    /* GFLOPS 0.096 x 2 = 0.191 */ {{3, 3}, {{1, 144, 32, 32}}, 144, 144, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 95588352.},
+    /* GFLOPS 0.030 x 6 = 0.177 */ {{3, 3}, {{1, 64, 20, 20}}, 64, 64, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 29516800.},
+    /* GFLOPS 0.082 x 2 = 0.164 */ {{5, 5}, {{1, 256, 12, 12}}, 256, 256, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 81926400.},
+    /* GFLOPS 0.076 x 2 = 0.151 */ {{3, 3}, {{1, 32, 64, 64}}, 32, 32, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 75628544.},
+    /* GFLOPS 0.076 x 2 = 0.151 */ {{3, 3}, {{1, 32, 128, 128}}, 32, 32, {2, 2}, {1, 1}, {1, 1}, {0, 0}, "", true, 75628544.},
+    /* GFLOPS 0.063 x 2 = 0.126 */ {{3, 3}, {{1, 144, 28, 28}}, 144, 144, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", true, 63103248.},
+    /* GFLOPS 0.019 x 6 = 0.114 */ {{3, 3}, {{1, 8, 128, 128}}, 8, 8, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 19005440.},
+    /* GFLOPS 0.019 x 2 = 0.038 */ {{3, 3}, {{1, 16, 64, 64}}, 16, 16, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 18939904.},
+    /* GFLOPS 0.014 x 2 = 0.029 */ {{3, 3}, {{1, 56, 16, 16}}, 56, 56, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 14465024.},
+    /* GFLOPS 0.012 x 2 = 0.023 */ {{3, 3}, {{1, 10, 80, 80}}, 10, 10, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 11584000.},
+    /* GFLOPS 0.011 x 2 = 0.021 */ {{3, 3}, {{1, 24, 32, 32}}, 24, 24, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 10641408.},
+    /* GFLOPS 0.003 x 6 = 0.016 */ {{3, 3}, {{1, 192, 2, 2}}, 192, 192, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 2654976.},
+    /* GFLOPS 0.004 x 2 = 0.008 */ {{3, 3}, {{1, 1, 32, 100}}, 64, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 3891200.},
+    /* GFLOPS 0.003 x 2 = 0.006 */ {{3, 3}, {{1, 10, 40, 40}}, 10, 10, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 2896000.},
+    /* GFLOPS 0.002 x 2 = 0.004 */ {{3, 3}, {{1, 4, 80, 80}}, 4, 4, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 1868800.},
+    /* GFLOPS 0.001 x 2 = 0.001 */ {{3, 3}, {{1, 10, 20, 20}}, 10, 10, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 724000.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{3, 3}, {{1, 192, 4, 4}}, 192, 192, {2, 2}, {1, 1}, {0, 0}, {0, 0}, "", true, 663744.},
+    /* GFLOPS 0.000 x 2 = 0.001 */ {{3, 3}, {{1, 4, 40, 40}}, 4, 4, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 467200.},
+    /* GFLOPS 0.000 x 4 = 0.000 */ {{3, 3}, {{1, 1, 80, 80}}, 1, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 121600.},
+    /* GFLOPS 0.000 x 2 = 0.000 */ {{3, 3}, {{1, 4, 20, 20}}, 4, 4, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 116800.},
+    /* GFLOPS 0.000 x 4 = 0.000 */ {{3, 3}, {{1, 1, 40, 40}}, 1, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 30400.},
+    /* GFLOPS 0.000 x 4 = 0.000 */ {{3, 3}, {{1, 1, 20, 20}}, 1, 1, {1, 1}, {1, 1}, {1, 1}, {0, 0}, "", true, 7600.},
 };
-struct ConvParamID
+
+struct ConvParamGenerator
 {
-    enum {
-        CONV_0 = 0,
-        CONV_100 = 100,
-        CONV_LAST = sizeof(testConvolutionConfigs) / sizeof(testConvolutionConfigs[0])
-    };
-    int val_;
-    ConvParamID(int val = 0) : val_(val) {}
-    operator int() const { return val_; }
-    static ::testing::internal::ParamGenerator<ConvParamID> all()
+    ConvParamGenerator(const ConvParam_t* testConfigs, const int size): testConfigs(testConfigs), size(size)
+    {}
+
+    const ConvParam_t* testConfigs;
+    const int size;
+
+    ::testing::internal::ParamGenerator<ConvParam_t> all() const
     {
-#if 0
-        enum { NUM = (int)CONV_LAST };
-#else
-        enum { NUM = (int)CONV_100 };
-#endif
-        ConvParamID v_[NUM]; for (int i = 0; i < NUM; ++i) { v_[i] = ConvParamID(i); } // reduce generated code size
-        return ::testing::ValuesIn(v_, v_ + NUM);
+        int NUM = size;
+        static size_t DNN_LIMIT_CONV = utils::getConfigurationParameterSizeT("OPENCV_TEST_DNN_LIMIT_CONV", 0);
+        if (DNN_LIMIT_CONV > 0)
+            NUM = std::min(NUM, (int)DNN_LIMIT_CONV);
+
+        std::vector<ConvParam_t> v_(NUM);
+        for (int i = 0; i < NUM; ++i) { v_[i] = testConfigs[i]; } // reduce generated code size
+        return ::testing::ValuesIn(v_);
     }
 };
-static inline void PrintTo(const ConvParamID& v, std::ostream* os)
+static inline void PrintTo(const ConvParam_t& p, std::ostream* os)
 {
-    CV_Assert((int)v >= 0); CV_Assert((int)v < ConvParamID::CONV_LAST);
-    const ConvParam_t& p = testConvolutionConfigs[(int)v];
-
     *os << "GFLOPS=" << cv::format("%.3f", p.declared_flops * 1e-9)
         << ", K=" << (Size)p.kernel
         << ", IN={" << p.shapeIn.dims[0] << ", " << p.shapeIn.dims[1] << ", " << p.shapeIn.dims[2] << ", " << p.shapeIn.dims[3] << "}"
@@ -792,17 +806,20 @@ static inline void PrintTo(const ConvParamID& v, std::ostream* os)
         *os << ", BIAS";
 }
 
-
-
-typedef tuple<ConvParamID, tuple<Backend, Target> > ConvTestParam_t;
-typedef TestBaseWithParam<ConvTestParam_t> Conv;
-
-PERF_TEST_P_(Conv, conv)
+static
+Net build_net(
+    const ConvParam_t& params, Backend backendId, Target targetId,
+    const std::function<void(Net&)>& configure_network_cb = std::function<void(Net&)>(),
+    double flops_limit_debug_long = 2e9, double flops_limit_debug_verylong = 6e9
+)
 {
-    int test_id = (int)get<0>(GetParam());
-    ASSERT_GE(test_id, 0); ASSERT_LT(test_id, ConvParamID::CONV_LAST);
-    const ConvParam_t& params = testConvolutionConfigs[test_id];
     double declared_flops = params.declared_flops;
+
+    if (flops_limit_debug_verylong > 0 && declared_flops >= flops_limit_debug_verylong)
+        applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+    if (flops_limit_debug_long > 0 && declared_flops >= flops_limit_debug_long)
+        applyTestTag(CV_TEST_TAG_DEBUG_LONG);
+
     Size kernel = params.kernel;
     MatShape inputShape = MatShape(params.shapeIn.dims, params.shapeIn.dims + 4);
     int outChannels = params.outCN;
@@ -813,8 +830,6 @@ PERF_TEST_P_(Conv, conv)
     Size padAdjust = params.padAdjust;
     std::string padMode(params.padMode);
     bool hasBias = params.hasBias;
-    Backend backendId = get<0>(get<1>(GetParam()));
-    Target targetId = get<1>(get<1>(GetParam()));
 
     int inChannels = inputShape[1];
     Size inSize(inputShape[3], inputShape[2]);
@@ -858,9 +873,14 @@ PERF_TEST_P_(Conv, conv)
     Net net;
     net.addLayerToPrev(lp.name, lp.type, lp);
 
-    net.setInput(input);
     net.setPreferableBackend(backendId);
     net.setPreferableTarget(targetId);
+    if (configure_network_cb)
+    {
+        configure_network_cb(net);
+    }
+
+    net.setInput(input);
 
     // warmup
     Mat output = net.forward();
@@ -877,17 +897,103 @@ PERF_TEST_P_(Conv, conv)
         << "    Weights(parameters): " << divUp(weightsMemory, 1u<<10) << " Kb"
         << "    MFLOPS=" << flops * 1e-6 << std::endl;
 
+    EXPECT_NEAR(flops, declared_flops, declared_flops * 1e-6);
+
+    return net;
+}
+
+typedef tuple<ConvParam_t, tuple<Backend, Target> > ConvTestParam_t;
+typedef tuple<ConvParam_t, tuple<Backend, Target>, bool> Conv3x3S1D1TestParam_t;
+typedef TestBaseWithParam<ConvTestParam_t> Conv;
+typedef TestBaseWithParam<ConvTestParam_t> Conv_1x1;
+typedef TestBaseWithParam<Conv3x3S1D1TestParam_t> Conv_3x3S1D1;
+typedef TestBaseWithParam<ConvTestParam_t> Conv_Depthwise;
+
+PERF_TEST_P_(Conv, conv)
+{
+    const ConvParam_t& params = get<0>(GetParam());
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));
+    Net net = build_net(params, backendId, targetId);
+
     TEST_CYCLE()
     {
         Mat res = net.forward();
     }
+    SANITY_CHECK_NOTHING();
+}
 
-    EXPECT_NEAR(flops, declared_flops, declared_flops * 1e-6);
+PERF_TEST_P_(Conv_1x1, conv)
+{
+    const ConvParam_t& params = get<0>(GetParam());
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));
+    Net net = build_net(params, backendId, targetId);
+
+    TEST_CYCLE()
+    {
+        Mat res = net.forward();
+    }
     SANITY_CHECK_NOTHING();
 }
 
+PERF_TEST_P_(Conv_3x3S1D1, conv)
+{
+    const ConvParam_t& params = get<0>(GetParam());
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));
+    bool winograd = get<2>(GetParam());
+    Net net = build_net(params, backendId, targetId,
+        [=](Net& net)
+        {
+            net.enableWinograd(winograd);
+        }
+    );
+
+    TEST_CYCLE()
+    {
+        Mat res = net.forward();
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P_(Conv_Depthwise, conv)
+{
+    const ConvParam_t& params = get<0>(GetParam());
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));
+    Net net = build_net(params, backendId, targetId, std::function<void(Net&)>(),
+        0/*flops_limit_debug_long*/, 0/*flops_limit_debug_verylong*/);
+
+    TEST_CYCLE()
+    {
+        Mat res = net.forward();
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+ConvParamGenerator conv_params(testConvolution_Configs, sizeof(testConvolution_Configs) / sizeof(testConvolution_Configs[0]));
 INSTANTIATE_TEST_CASE_P(/**/, Conv, Combine(
-    ConvParamID::all(),
+    conv_params.all(),
+    dnnBackendsAndTargets(false, false)  // defined in ../test/test_common.hpp
+));
+
+ConvParamGenerator conv_1x1_params(testConvolution_1x1_Configs, sizeof(testConvolution_1x1_Configs) / sizeof(testConvolution_1x1_Configs[0]));
+INSTANTIATE_TEST_CASE_P(/**/, Conv_1x1, Combine(
+    conv_1x1_params.all(),
+    dnnBackendsAndTargets(false, false)  // defined in ../test/test_common.hpp
+));
+
+ConvParamGenerator conv_3x3S1D1_params(testConvolution_3x3S1D1_Configs, sizeof(testConvolution_3x3S1D1_Configs) / sizeof(testConvolution_3x3S1D1_Configs[0]));
+INSTANTIATE_TEST_CASE_P(/**/, Conv_3x3S1D1, Combine(
+    conv_3x3S1D1_params.all(),
+    dnnBackendsAndTargets(false, false),  // defined in ../test/test_common.hpp
+    testing::Values(true, false)  // enable Winograd or not
+));
+
+ConvParamGenerator conv_depthwise_params(testConvolution_Depthwise_Configs, sizeof(testConvolution_Depthwise_Configs) / sizeof(testConvolution_Depthwise_Configs[0]));
+INSTANTIATE_TEST_CASE_P(/**/, Conv_Depthwise, Combine(
+    conv_depthwise_params.all(),
     dnnBackendsAndTargets(false, false)  // defined in ../test/test_common.hpp
 ));
 
diff --git a/modules/dnn/perf/perf_einsum.cpp b/modules/dnn/perf/perf_einsum.cpp
new file mode 100644
index 000000000000..bad9d956bed2
--- /dev/null
+++ b/modules/dnn/perf/perf_einsum.cpp
@@ -0,0 +1,109 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test {
+
+struct EinsumParams {
+    int inputSize;
+    int outputSize;
+    std::string equation;
+    std::vector<MatShape> einsumInpShapes;
+    EinsumParams(std::string equation_, std::vector<MatShape> einsumInpShapes_ = std::vector<MatShape>())
+    {
+        inputSize = einsumInpShapes_.size();
+        equation = equation_;
+        einsumInpShapes = einsumInpShapes_;
+    }
+};
+
+static inline void PrintTo(const EinsumParams& params, ::std::ostream* os) {
+     (*os) << "Equation=" << params.equation << " ";
+
+        (*os) << "InputShape={";
+        for(int i = 0; i < params.einsumInpShapes.size(); i++)
+        {
+            (*os) << "{";
+            for(int j = 0; j < params.einsumInpShapes[i].size(); j++)
+            {
+                (*os) << params.einsumInpShapes[i][j] << ((j < params.einsumInpShapes[i].size() - 1) ?  ", " : "");
+            }
+            (*os) << ((i < params.einsumInpShapes.size() - 1) ? "}, " : "}");
+        }
+        (*os) << "}";
+}
+
+// test cases
+static const EinsumParams testEinsumConfigs[] = {
+    // TODO: Add tests with one input after ellips merge
+    {"ij, jk -> ik", {{2, 3}, {3, 2}}},
+    {"ij, jk -> ik", {{20, 30}, {30, 20}}},
+    {"ij, jk -> ik", {{113, 127}, {127, 113}}},
+
+    {"imkj, injs -> imnks", {{1, 4, 7, 9}, {1, 5, 9, 8}}},
+    {"imkj, injs -> imnks", {{1, 4, 70, 90}, {1, 5, 90, 80}}},
+    {"imkj, injs -> imnks", {{1, 4, 73, 91}, {1, 5, 91, 57}}},
+
+    {"ij -> i",  {{30, 40}}},
+    {"ij -> i",  {{113, 374}}},
+
+    {"...ij -> ...i", {{30, 40}}},
+    {"...ij -> ...i", {{113, 374}}},
+
+    {"...ij, ...jk -> ...ik",  {{40, 50}, {50, 80}}},
+    {"...ij, ...jk -> ...ik",  {{47, 51}, {51, 83}}},
+};
+
+class Layer_Einsum: public TestBaseWithParam<EinsumParams> {};
+
+PERF_TEST_P_(Layer_Einsum, einsum) {
+    const EinsumParams& params = GetParam();
+    LayerParams lp;
+    lp.type = "Einsum";
+    lp.name = "testEinsum";
+    lp.set("equation", params.equation);
+    lp.set("inputSize", params.inputSize);
+    lp.set("outputSize", 1);
+
+    CV_CheckFalse(params.einsumInpShapes.empty(), "ERROR no inputs shapes provided");
+
+    for (int i = 0; i < params.einsumInpShapes.size(); i++) {
+        lp.set("inputShapes" + cv::format("%d", i), DictValue::arrayInt(params.einsumInpShapes[i].begin(), params.einsumInpShapes[i].size()));
+    }
+
+    Net net;
+    std::vector<Mat> inputs;
+    std::vector<std::string> input_names;
+    int id = net.addLayer(lp.name, lp.type, lp);
+
+    for (int i = 0; i < params.inputSize; ++i) {
+        // create inputs
+        inputs.emplace_back(Mat(params.einsumInpShapes[i].size(), params.einsumInpShapes[i].data(), CV_32FC1));
+
+        // connect each input to the layer
+        net.connect(0, i, id, i);
+
+        // create input names dynamically, assuming input naming follows a consistent pattern
+        input_names.emplace_back("input" + std::to_string(i + 1));
+    }
+
+    //warm up
+    std::vector<Mat> outputs;
+    net.setInputsNames(input_names);
+    for (int i = 0; i < input_names.size(); i++){
+        net.setInput(inputs[i], input_names[i]);
+    }
+    net.forward(outputs, "testEinsum");
+
+    TEST_CYCLE()
+    {
+        net.forward(outputs, "testEinsum");
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Layer_Einsum, testing::ValuesIn(testEinsumConfigs));
+
+}; //namespace
diff --git a/modules/dnn/perf/perf_gemm.cpp b/modules/dnn/perf/perf_gemm.cpp
new file mode 100644
index 000000000000..40fd66865bdc
--- /dev/null
+++ b/modules/dnn/perf/perf_gemm.cpp
@@ -0,0 +1,415 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "perf_precomp.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+
+#include <numeric>
+
+namespace opencv_test {
+
+struct GemmParam_t {
+    std::vector<int> a_shape;
+    std::vector<int> b_shape;
+    std::vector<int> c_shape;
+    bool trans_a;
+    bool trans_b;
+
+    GemmParam_t(std::vector<int> a_shape_, std::vector<int> b_shape_, std::vector<int> c_shape_ = {}, bool trans_a_ = false, bool trans_b_ = false)
+        : a_shape(a_shape_), b_shape(b_shape_), c_shape(c_shape_), trans_a(trans_a_), trans_b(trans_b_) {}
+};
+
+// TODO: Dsiable most of the test cases except vision transformers to save time
+static const GemmParam_t test_gemm_configs[] = {
+    // vision transformers cases
+    { {  768,  768 }, {  768,  768 }, {  768 } },
+    { { 1024, 1024 }, { 1024, 1024 }, { 1024 } },
+    { {   50,  768 }, {  768, 2304 } },
+    { {  197,  768 }, {  768, 2304 } },
+    { {   50, 1024 }, { 1024, 3072 } },
+    { {  197, 1024 }, { 1024, 3072 } },
+
+// these cases are commented to save testing time
+/*
+    // square mat
+    { {   64,   64 }, {   64,   64 } },
+    { {  128,  128 }, {  128,  128 } },
+    { {  256,  256 }, {  256,  256 } },
+    { {  512,  512 }, {  512,  512 } },
+    { { 1024, 1024 }, { 1024, 1024 } },
+    { { 4096, 4096 }, { 4096, 4096 } },
+
+    // retangular mat
+    { {  256,  256 }, {  256, 1024 } },
+    { {  256, 1024 }, { 1024,  256 } },
+    { {  256, 1024 }, { 1024, 1024 } },
+    { { 1024, 1024 }, { 1024,  256 } },
+    { { 1024,  256 }, {  256, 1024 } },
+    { { 1024,  256 }, {  256,  256 } },
+
+    // with C
+    { {  256,  256 }, {  256,  256 }, {  256 } },
+    { {  256,  256 }, {  256, 1024 }, { 1024 } },
+    { {  256, 1024 }, { 1024,  256 }, {  256 } },
+    { {  256, 1024 }, { 1024, 1024 }, { 1024 } },
+    { { 1024, 1024 }, { 1024,  256 }, {  256 } },
+    { { 1024,  256 }, {  256, 1024 }, { 1024 } },
+    { { 1024,  256 }, {  256,  256 }, {  256 } },
+
+    // with C and trans_b
+    { {  256,  256 }, {  256,  256 }, {  256 } , false, true},
+    { {  256, 1024 }, {  256, 1024 }, {  256 } , false, true},
+    { {  256, 1024 }, { 1024, 1024 }, { 1024 } , false, true},
+    { { 1024, 1024 }, { 1024, 1024 }, { 1024 } , false, true},
+    { { 1024,  256 }, { 1024,  256 }, { 1024 } , false, true},
+    { { 1024,  256 }, {  256,  256 }, {  256 } , false, true},
+
+    // with C and trans_b and trans_a
+    { {  256,  256 }, {  256,  256 }, {  256 } , true, true},
+    { { 1024,  256 }, {  256, 1024 }, {  256 } , true, true},
+    { {  256, 1024 }, { 1024,  256 }, { 1024 } , true, true},
+    { { 1024, 1024 }, { 1024, 1024 }, { 1024 } , true, true},
+*/
+};
+
+static const GemmParam_t test_matmul_configs[] = {
+    // vision transformer cases
+    { {12, 197, 197}, {12, 197, 64} },
+    { {12, 197, 64 }, {12, 64, 197} },
+    { {12, 50, 64}, {12, 64, 50} },
+    { {12, 50, 50}, {12, 50, 64} },
+    { {16, 197, 197}, {16, 197, 64} },
+    { {16, 197, 64 }, {16, 64, 197} },
+    { {16, 50, 64}, {16, 64, 50} },
+    { {16, 50, 50}, {16, 50, 64} },
+};
+
+struct GemmParamId
+{
+    enum {
+        GEMM_0 = 0,
+        GEMM_LAST = sizeof(test_gemm_configs) / sizeof(test_gemm_configs[0])
+    };
+    int val_;
+    GemmParamId(int val = 0) : val_(val) {}
+    operator int() const { return val_; }
+    static ::testing::internal::ParamGenerator<GemmParamId> all()
+    {
+        enum { NUM = (int)GEMM_LAST };
+        GemmParamId v_[NUM]; for (int i = 0; i < NUM; ++i) { v_[i] = GemmParamId(i); } // reduce generated code size
+        return ::testing::ValuesIn(v_, v_ + NUM);
+    }
+};
+
+struct MatMulParamId {
+    enum {
+        MATMUL_0 = 0,
+        MATMUL_LAST = sizeof(test_matmul_configs) / sizeof(test_matmul_configs[0])
+    };
+    int val_;
+    MatMulParamId(int val = 0) : val_(val) {}
+    operator int() const { return val_; }
+    static ::testing::internal::ParamGenerator<MatMulParamId> all() {
+        enum { NUM = (int)MATMUL_LAST };
+        MatMulParamId v_[NUM]; for (int i = 0; i < NUM; i++) { v_[i] = MatMulParamId(i); }
+        return ::testing::ValuesIn(v_, v_ + NUM);
+    }
+};
+
+static inline void PrintTo(const GemmParamId& v, std::ostream* os)
+{
+    CV_Assert((int)v >= 0); CV_Assert((int)v < GemmParamId::GEMM_LAST);
+    const GemmParam_t& p = test_gemm_configs[(int)v];
+
+    auto print_shape = [os](const std::vector<int>& shape, const std::string tag) {
+        if (shape.empty()) {
+            return ;
+        }
+
+        *os << tag << "=[";
+        for (size_t i = 0; i < shape.size(); ++i) {
+            if (i == shape.size() - 1) {
+                *os << shape[i] << "]";
+                break;
+            }
+            *os << shape[i] << ", ";
+        }
+    };
+
+    print_shape(p.a_shape, "A");
+    print_shape(p.b_shape, ", B");
+    print_shape(p.c_shape, ", C");
+    *os << ", trans_a=" << p.trans_a << ", trans_b=" << p.trans_b;
+}
+
+typedef tuple<GemmParamId, tuple<Backend, Target> > GemmTestParam_t;
+typedef TestBaseWithParam<GemmTestParam_t> Gemm;
+
+PERF_TEST_P_(Gemm, gemm)
+{
+    int test_id = (int)get<0>(GetParam());
+    ASSERT_GE(test_id, 0); ASSERT_LT(test_id, GemmParamId::GEMM_LAST);
+    const GemmParam_t& params = test_gemm_configs[test_id];
+    auto a_shape = params.a_shape;
+    auto b_shape = params.b_shape;
+    auto c_shape = params.c_shape;
+    auto trans_a = params.trans_a;
+    auto trans_b = params.trans_b;
+    float alpha = 1.f;
+    float beta = 1.f;
+
+    Backend backend_id = get<0>(get<1>(GetParam()));
+    Target target_id = get<1>(get<1>(GetParam()));
+
+    bool have_bias = c_shape.empty() ? false : true;
+
+    Mat A(static_cast<int>(a_shape.size()), a_shape.data(), CV_32F);
+    randu(A, -1.0f, 1.0f);
+    Mat B(static_cast<int>(b_shape.size()), b_shape.data(), CV_32F);
+    randu(B, -1.0f, 1.0f);
+
+    LayerParams lp;
+    lp.type = "Gemm";
+    lp.name = "testLayer";
+    lp.set("transA", trans_a);
+    lp.set("transB", trans_b);
+    lp.set("alpha", alpha);
+    lp.set("beta", beta);
+    lp.set("real_ndims_C", static_cast<int>(c_shape.size()));
+
+    lp.set("constB", true);
+    lp.blobs.push_back(B);
+    if (have_bias) {
+        Mat C(static_cast<int>(c_shape.size()), c_shape.data(), CV_32F);
+        randu(C, -1.0f, 1.0f);
+        lp.set("have_bias", true);
+        lp.set("constC", true);
+        lp.blobs.push_back(C);
+    }
+
+    Net net;
+    net.addLayerToPrev(lp.name, lp.type, lp);
+    net.setPreferableBackend(backend_id);
+    net.setPreferableTarget(target_id);
+
+    // warmup
+    {
+        net.setInput(A);
+        Mat out = net.forward();
+    }
+
+    TEST_CYCLE()
+    {
+        Mat res = net.forward();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P_(Gemm, innerproduct)
+{
+    int test_id = (int)get<0>(GetParam());
+    ASSERT_GE(test_id, 0); ASSERT_LT(test_id, GemmParamId::GEMM_LAST);
+    const GemmParam_t& params = test_gemm_configs[test_id];
+    auto a_shape = params.a_shape;
+    auto b_shape = params.b_shape;
+    auto c_shape = params.c_shape;
+    auto trans_a = params.trans_a;
+    auto trans_b = params.trans_b;
+
+    Backend backend_id = get<0>(get<1>(GetParam()));
+    Target target_id = get<1>(get<1>(GetParam()));
+
+    bool have_bias = c_shape.empty() ? false : true;
+
+    Mat A(static_cast<int>(a_shape.size()), a_shape.data(), CV_32F);
+    randu(A, -1.0f, 1.0f);
+    Mat B(static_cast<int>(b_shape.size()), b_shape.data(), CV_32F);
+    randu(B, -1.0f, 1.0f);
+
+    LayerParams lp;
+    lp.type = "InnerProduct";
+    lp.name = "testLayer";
+    if (trans_a) {
+        cv::transpose(A, A);
+    }
+    if (!trans_b) {
+        cv::transpose(B, B);
+    }
+    lp.blobs.push_back(B);
+    lp.set("num_output", B.size[0]);
+    if (have_bias) {
+        Mat C(static_cast<int>(c_shape.size()), c_shape.data(), CV_32F);
+        randu(C, -1.0f, 1.0f);
+        lp.blobs.push_back(C);
+        lp.set("bias_term", true);
+    } else {
+        lp.set("bias_term", false);
+    }
+
+    Net net;
+    net.addLayerToPrev(lp.name, lp.type, lp);
+    net.setPreferableBackend(backend_id);
+    net.setPreferableTarget(target_id);
+
+    // warmup
+    {
+        std::vector<std::string> input_names(1);
+        input_names[0] = "A";
+        net.setInputsNames(input_names);
+        net.setInput(A, input_names[0]);
+        Mat out = net.forward();
+    }
+
+    TEST_CYCLE()
+    {
+        Mat res = net.forward();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+static inline void PrintTo(const MatMulParamId& v, std::ostream* os)
+{
+    CV_Assert((int)v >= 0); CV_Assert((int)v < MatMulParamId::MATMUL_LAST);
+    const GemmParam_t& p = test_matmul_configs[(int)v];
+
+    auto print_shape = [os](const std::vector<int>& shape, const std::string tag) {
+        if (shape.empty()) {
+            return ;
+        }
+
+        *os << tag << "=[";
+        for (size_t i = 0; i < shape.size(); ++i) {
+            if (i == shape.size() - 1) {
+                *os << shape[i] << "]";
+                break;
+            }
+            *os << shape[i] << ", ";
+        }
+    };
+
+    print_shape(p.a_shape, "A");
+    print_shape(p.b_shape, ", B");
+    print_shape(p.c_shape, ", C");
+    *os << ", trans_a=" << p.trans_a << ", trans_b=" << p.trans_b;
+}
+
+using MatMulTestParam_t = tuple<MatMulParamId, tuple<Backend, Target>>;
+using MatMul = TestBaseWithParam<MatMulTestParam_t>;
+
+PERF_TEST_P_(MatMul, matmul)
+{
+    int test_id = (int)get<0>(GetParam());
+    ASSERT_GE(test_id, 0); ASSERT_LT(test_id, MatMulParamId::MATMUL_LAST);
+    const GemmParam_t& params = test_matmul_configs[test_id];
+    auto a_shape = params.a_shape;
+    auto b_shape = params.b_shape;
+    auto trans_a = params.trans_a;
+    auto trans_b = params.trans_b;
+    float alpha = 1.f;
+    float beta = 1.f;
+
+    Backend backend_id = get<0>(get<1>(GetParam()));
+    Target target_id = get<1>(get<1>(GetParam()));
+
+    Mat A(a_shape, CV_32F);
+    randu(A, -1.0f, 1.0f);
+    Mat B(b_shape, CV_32F);
+    randu(B, -1.0f, 1.0f);
+
+    LayerParams lp;
+    lp.type = "MatMul";
+    lp.name = "testLayer";
+    lp.set("transA", trans_a);
+    lp.set("transB", trans_b);
+    lp.set("alpha", alpha);
+    lp.set("beta", beta);
+    lp.blobs.push_back(B);
+
+    Net net;
+    net.addLayerToPrev(lp.name, lp.type, lp);
+    net.setPreferableBackend(backend_id);
+    net.setPreferableTarget(target_id);
+
+    // warmup
+    {
+        std::vector<std::string> input_names{"A"};
+        net.setInputsNames(input_names);
+        net.setInput(A, input_names[0]);
+        Mat out = net.forward();
+    }
+
+    TEST_CYCLE()
+    {
+        Mat res = net.forward();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P_(MatMul, innerproduct)
+{
+    int test_id = (int)get<0>(GetParam());
+    ASSERT_GE(test_id, 0); ASSERT_LT(test_id, MatMulParamId::MATMUL_LAST);
+    const GemmParam_t& params = test_matmul_configs[test_id];
+    auto a_shape = params.a_shape;
+    auto b_shape = params.b_shape;
+
+    Backend backend_id = get<0>(get<1>(GetParam()));
+    Target target_id = get<1>(get<1>(GetParam()));
+
+    Mat A(a_shape, CV_32F);
+    randu(A, -1.0f, 1.0f);
+    Mat B(b_shape, CV_32F);
+    randu(B, -1.0f, 1.0f);
+
+    LayerParams lp;
+    lp.type = "InnerProduct";
+    lp.name = "testLayer";
+    lp.set("axis", (int)(a_shape.size() - 1));
+    lp.set("bias_term", false);
+
+    // pre-transpose
+    std::vector<int> order(b_shape.size());
+    std::iota(order.begin(), order.end(), 0);
+    std::swap(order.back(), order[b_shape.size() - 2]);
+    Mat B_transposed;
+    transposeND(B, order, B_transposed);
+    lp.blobs.push_back(B_transposed);
+    lp.set("num_output", int(B_transposed.total(0, b_shape.size() - 1)));
+    lp.set("is_matmul", true);
+
+    Net net;
+    net.addLayerToPrev(lp.name, lp.type, lp);
+    net.setPreferableBackend(backend_id);
+    net.setPreferableTarget(target_id);
+
+    // warmup
+    {
+        std::vector<std::string> input_names{"A"};
+        net.setInputsNames(input_names);
+        net.setInput(A, input_names[0]);
+        Mat out = net.forward();
+    }
+
+    TEST_CYCLE()
+    {
+        Mat res = net.forward();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Gemm, Combine(
+    GemmParamId::all(),
+    dnnBackendsAndTargets(false, false)  // defined in ../test/test_common.hpp
+));
+
+INSTANTIATE_TEST_CASE_P(/**/, MatMul, Combine(
+    MatMulParamId::all(),
+    dnnBackendsAndTargets(false, false)  // defined in ../test/test_common.hpp
+));
+
+} // namespace
diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp
index d5a9bb34af14..acdc778b3c5c 100644
--- a/modules/dnn/perf/perf_layer.cpp
+++ b/modules/dnn/perf/perf_layer.cpp
@@ -258,174 +258,162 @@ PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16)
     test_slice<4>(inputShape, begin, end);
 }
 
-struct Layer_Scatter : public TestBaseWithParam<tuple<Backend, Target> >
-{
-    void test_layer(const std::vector<int>& shape, const String reduction = "none", int axis = 0)
+using Layer_Scatter = TestBaseWithParam<tuple<std::vector<int>, std::string, int, tuple<Backend, Target>>>;
+PERF_TEST_P_(Layer_Scatter, scatter) {
+    std::vector<int> shape = get<0>(GetParam());
+    std::string reduction = get<1>(GetParam());
+    int axis = get<2>(GetParam());
+    int backend_id = get<0>(get<3>(GetParam()));
+    int target_id = get<1>(get<3>(GetParam()));
+
+    Mat data(shape, CV_32FC1);
+    Mat indices(shape, CV_32FC1);
+    Mat updates(shape, CV_32FC1);
+
+    randn(data, 0.f, 1.f);
+    randu(indices, 0, shape[axis]);
+    randn(updates, 0.f, 1.f);
+
+    indices.convertTo(indices, CV_32SC1, 1, -1);
+
+    Net net;
+    LayerParams lp;
+    lp.type = "Scatter";
+    lp.name = "testLayer";
+    lp.set("reduction", reduction);
+    lp.set("axis", axis);
+
+    int id = net.addLayerToPrev(lp.name, lp.type, lp);
+    net.connect(0, 0, id, 0);
+    net.connect(0, 1, id, 1);
+    net.connect(0, 2, id, 2);
+
+    // warmup
     {
-        int backendId = get<0>(GetParam());
-        int targetId = get<1>(GetParam());
-
-        Mat data(shape, CV_32FC1);
-        Mat indices(shape, CV_32FC1);
-        Mat updates(shape, CV_32FC1);
-
-        Scalar mean = 0.f;
-        Scalar std = 1.f;
-        randn(data, mean, std);
-        randu(indices, 0, shape[axis]);
-        randn(updates, mean, std);
-
-        indices.convertTo(indices, CV_32SC1, 1, -1);
-
-        Net net;
-        LayerParams lp;
-        lp.type = "Scatter";
-        lp.name = "testLayer";
-        lp.set("reduction", reduction);
-        lp.set("axis", axis);
-
-        int id = net.addLayerToPrev(lp.name, lp.type, lp);
-        net.connect(0, 0, id, 0);
-        net.connect(0, 1, id, 1);
-        net.connect(0, 2, id, 2);
-
-        // warmup
-        {
-            std::vector<String> inpNames(3);
-            inpNames[0] = "data";
-            inpNames[1] = "indices";
-            inpNames[2] = "updates";
-            net.setInputsNames(inpNames);
-            net.setInput(data, inpNames[0]);
-            net.setInput(indices, inpNames[1]);
-            net.setInput(updates, inpNames[2]);
-
-            net.setPreferableBackend(backendId);
-            net.setPreferableTarget(targetId);
-            Mat out = net.forward();
-        }
-
-        TEST_CYCLE()
-        {
-            Mat res = net.forward();
-        }
-
-        SANITY_CHECK_NOTHING();
+        std::vector<String> input_names{"data", "indices", "updates"};
+        net.setInputsNames(input_names);
+        net.setInput(data, input_names[0]);
+        net.setInput(indices, input_names[1]);
+        net.setInput(updates, input_names[2]);
+
+        net.setPreferableBackend(backend_id);
+        net.setPreferableTarget(target_id);
+        Mat out = net.forward();
     }
 
-    int N = 8;
-    int C = 256;
-    int H = 128;
-    int W = 100;
-};
-
-PERF_TEST_P_(Layer_Scatter, DISABLED_Scatter)
-{
-    test_layer({N, C, H, W});
-}
+    // perf
+    TEST_CYCLE()
+    {
+        Mat res = net.forward();
+    }
 
-PERF_TEST_P_(Layer_Scatter, DISABLED_Scatter_add)
-{
-    test_layer({N, C, H, W}, "add");
+    SANITY_CHECK_NOTHING();
 }
 
-struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
-{
-    void test_layer(const std::vector<int>& shape, const String reduction = "none")
+INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, Combine(
+    Values(std::vector<int>{2, 128, 64, 50}),
+    Values(std::string("none"), std::string("add")),
+    Values(0), // use Values(0, 1, 2, 3) for more details
+    dnnBackendsAndTargets(/* withInferenceEngine= */ false,
+                          /* withHalide= */          false,
+                          /* withCpuOCV= */          true,
+                          /* withVkCom= */           false,
+                          /* withCUDA= */            false,
+                          /* withNgraph= */          false,
+                          /* withWebnn= */           false,
+                          /* withCann= */            false) // only test on CPU
+));
+
+using Layer_ScatterND = TestBaseWithParam<tuple<std::vector<int>, std::string, tuple<Backend, Target>>>;
+PERF_TEST_P_(Layer_ScatterND, scatterND) {
+    std::vector<int> shape = get<0>(GetParam());
+    std::string reduction = get<1>(GetParam());
+    int backend_id = get<0>(get<2>(GetParam()));
+    int target_id = get<1>(get<2>(GetParam()));
+
+    std::vector<int> indices_shape(shape);
+    indices_shape.push_back(int(shape.size()));
+    Mat data(shape, CV_32FC1);
+    Mat indices(indices_shape, CV_32FC1);
+    Mat updates(shape, CV_32FC1);
+
+    randn(data, 0.f, 1.f);
+    randn(updates, 0.f, 1.f);
+
+    // Create indices such that indices[n_i, c_j, h_k, w_l, :4] = [i, j, k, l]
+    std::vector<int> current_index_tuple(shape.size());
+    int total = data.total();
+    std::vector<int> indices_step;
+    for (int i = 0; i < indices.dims; i++)
     {
-        int backendId = get<0>(GetParam());
-        int targetId = get<1>(GetParam());
-
-        std::vector<int> indices_shape(shape);
-        indices_shape.push_back(int(shape.size()));
-        Mat data(shape, CV_32FC1);
-        Mat indices(indices_shape, CV_32FC1);
-        Mat updates(shape, CV_32FC1);
-
-        Scalar mean = 0.f;
-        Scalar std = 1.f;
-        randn(data, mean, std);
-        randn(updates, mean, std);
-
-        // initialize the indices with index tuples like [0...N, 0...C, 0...H, 0...W]
-        std::vector<int> current_index_tuple(shape.size());
-        int total = data.total();
-        std::vector<int> indices_step;
-        for (int i = 0; i < indices.dims; i++)
-        {
-            int step = indices.step.p[i] / sizeof(float);
-            indices_step.push_back(step);
-        }
-        int t, j, idx, offset_at_idx, offset;
-        for (int i = 0; i < total; i++)
+        int step = indices.step.p[i] / sizeof(float);
+        indices_step.push_back(step);
+    }
+    int t, j, idx, offset_at_idx, offset;
+    auto *indices_ptr = indices.ptr<float>();
+    for (int i = 0; i < total; i++)
+    {
+        t = i;
+        for (j = shape.size() - 1; j >= 0; j--)
         {
-            t = i;
-            for (j = shape.size() - 1; j >= 0; j--)
-            {
-                idx = t / shape[j];
-                offset_at_idx = (int)(t - idx * shape[j]);
-                current_index_tuple[j] = offset_at_idx;
-                t = idx;
-            }
-
-            offset = 0;
-            for (j = 0; j < shape.size(); j++)
-                offset += current_index_tuple[j] * indices_step[j];
-
-            for (j = 0; j < shape.size(); j++)
-                indices.at<float>(offset + j) = current_index_tuple[j];
+            idx = t / shape[j];
+            offset_at_idx = (int)(t - idx * shape[j]);
+            current_index_tuple[j] = offset_at_idx;
+            t = idx;
         }
 
-        Net net;
-        LayerParams lp;
-        lp.type = "ScatterND";
-        lp.name = "testLayer";
-        lp.set("reduction", reduction);
+        offset = 0;
+        for (j = 0; j < shape.size(); j++)
+            offset += current_index_tuple[j] * indices_step[j];
 
-        int id = net.addLayerToPrev(lp.name, lp.type, lp);
-        net.connect(0, 0, id, 0);
-        net.connect(0, 1, id, 1);
-        net.connect(0, 2, id, 2);
-
-        // warmup
-        {
-            std::vector<String> inpNames(3);
-            inpNames[0] = "data";
-            inpNames[1] = "indices";
-            inpNames[2] = "updates";
-            net.setInputsNames(inpNames);
-            net.setInput(data, inpNames[0]);
-            net.setInput(indices, inpNames[1]);
-            net.setInput(updates, inpNames[2]);
+        for (j = 0; j < shape.size(); j++)
+            indices_ptr[offset + j] = current_index_tuple[j];
+    }
 
-            net.setPreferableBackend(backendId);
-            net.setPreferableTarget(targetId);
-            Mat out = net.forward();
-        }
+    Net net;
+    LayerParams lp;
+    lp.type = "ScatterND";
+    lp.name = "testLayer";
+    lp.set("reduction", reduction);
 
-        TEST_CYCLE()
-        {
-            Mat res = net.forward();
-        }
+    int id = net.addLayerToPrev(lp.name, lp.type, lp);
+    net.connect(0, 0, id, 0);
+    net.connect(0, 1, id, 1);
+    net.connect(0, 2, id, 2);
 
-        SANITY_CHECK_NOTHING();
+    // warmup
+    {
+        std::vector<String> input_names{"data", "indices", "updates"};
+        net.setInputsNames(input_names);
+        net.setInput(data, input_names[0]);
+        net.setInput(indices, input_names[1]);
+        net.setInput(updates, input_names[2]);
+
+        net.setPreferableBackend(backend_id);
+        net.setPreferableTarget(target_id);
+        Mat out = net.forward();
     }
 
-    int N = 8;
-    int C = 256;
-    int H = 128;
-    int W = 100;
-};
+    TEST_CYCLE()
+    {
+        Mat res = net.forward();
+    }
 
-PERF_TEST_P_(Layer_ScatterND, DISABLED_ScatterND)
-{
-    test_layer({N, C, H ,W});
+    SANITY_CHECK_NOTHING();
 }
 
-PERF_TEST_P_(Layer_ScatterND, DISABLED_ScatterND_add)
-{
-    test_layer({N, C, H , W}, "add");
-}
+INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, Combine(
+    Values(std::vector<int>{2, 128, 64, 50}),
+    Values(std::string("none"), std::string("add")),
+    dnnBackendsAndTargets(/* withInferenceEngine= */ false,
+                          /* withHalide= */          false,
+                          /* withCpuOCV= */          true,
+                          /* withVkCom= */           false,
+                          /* withCUDA= */            false,
+                          /* withNgraph= */          false,
+                          /* withWebnn= */           false,
+                          /* withCann= */            false) // only test on CPU
+));
 
 struct Layer_LayerNorm : public TestBaseWithParam<tuple<Backend, Target> >
 {
@@ -633,14 +621,358 @@ PERF_TEST_P_(Layer_LayerNormExpanded, DISABLED_LayerNormExpanded)
     test_layer({N, H ,W});
 }
 
+struct Layer_GatherElements : public TestBaseWithParam<tuple<Backend, Target> >
+{
+    void test_layer(const std::vector<int>& data_shape, const std::vector<int>& indices_shape, int axis = 0)
+    {
+        int backendId = get<0>(GetParam());
+        int targetId = get<1>(GetParam());
+
+        Mat data(data_shape, CV_32FC1);
+        Mat indices(indices_shape, CV_32FC1);
+
+        randu(data, 0.f, 1.f);
+        randu(indices, 0, data_shape[axis]);
+
+        Net net;
+        LayerParams lp;
+        lp.type = "GatherElements";
+        lp.name = "testLayer";
+        lp.set("axis", axis);
+        int id = net.addLayerToPrev(lp.name, lp.type, lp);
+        net.connect(0, 0, id, 0);
+        net.connect(0, 1, id, 1);
+
+        // warmup
+        {
+            std::vector<String> inpNames(3);
+            inpNames[0] = "data";
+            inpNames[1] = "indices";
+            net.setInputsNames(inpNames);
+            net.setInput(data, inpNames[0]);
+            net.setInput(indices, inpNames[1]);
+
+            net.setPreferableBackend(backendId);
+            net.setPreferableTarget(targetId);
+            Mat out = net.forward();
+        }
+
+        TEST_CYCLE()
+        {
+            Mat res = net.forward();
+        }
+
+        SANITY_CHECK_NOTHING();
+    }
+};
+
+PERF_TEST_P_(Layer_GatherElements, GatherElements)
+{
+    test_layer({2700, 1, 2914}, {2700, 1, 81}, 2);
+}
+
+struct Layer_InstanceNorm : public TestBaseWithParam<tuple<Backend, Target> >
+{
+    void test_layer(const std::vector<int>& x_shape)
+    {
+        int backendId = get<0>(GetParam());
+        int targetId = get<1>(GetParam());
+
+        Mat x(x_shape, CV_32FC1);
+        Mat scale(x_shape[1], 1, CV_32FC1);
+        Mat b(x_shape[1], 1, CV_32FC1);
+
+        randu(x, 0.f, 1.f);
+        randu(scale, 0.f, 1.f);
+        randu(b, 0.f, 1.f);
+
+        Net net;
+        LayerParams lp;
+        lp.type = "InstanceNormalization";
+        lp.name = "testLayer";
+        int id = net.addLayerToPrev(lp.name, lp.type, lp);
+        net.connect(0, 0, id, 0);
+        net.connect(0, 1, id, 1);
+        net.connect(0, 2, id, 2);
+
+        // warmup
+        {
+            std::vector<String> inpNames{"x", "scale", "b"};
+            net.setInputsNames(inpNames);
+            net.setInput(x, inpNames[0]);
+            net.setInput(scale, inpNames[1]);
+            net.setInput(b, inpNames[2]);
+
+            net.setPreferableBackend(backendId);
+            net.setPreferableTarget(targetId);
+            Mat out = net.forward();
+        }
+
+        TEST_CYCLE()
+        {
+            Mat res = net.forward();
+        }
+
+        SANITY_CHECK_NOTHING();
+    }
+
+    int N = 2;
+    int C = 64;
+    int H = 180;
+    int W = 240;
+};
+
+PERF_TEST_P_(Layer_InstanceNorm, InstanceNorm)
+{
+    test_layer({N, C, H, W});
+}
+
+struct Layer_Attention : public TestBaseWithParam<tuple<Backend, Target>> {
+    void test_layer(const std::vector<int> x_shape, const std::vector<int> qkv_hidden_sizes, const int num_heads) {
+        int backendId = get<0>(GetParam());
+        int targetId = get<1>(GetParam());
+
+        auto qk_hidden_size = qkv_hidden_sizes[0];
+        auto v_hidden_size = qkv_hidden_sizes[2];
+
+        auto input_hidden_size = x_shape[2];
+        auto hidden_size = qk_hidden_size + qk_hidden_size + v_hidden_size;
+
+        Mat x(x_shape, CV_32F);
+        Mat weight(std::vector<int>{input_hidden_size, hidden_size}, CV_32F);
+        Mat bias(std::vector<int>{hidden_size}, CV_32F);
+
+        randu(x, 0.f, 1.f);
+        randu(weight, 0.f, 1.f);
+        randu(bias, 0.f, 1.f);
+
+        LayerParams lp;
+        lp.type = "Attention";
+        lp.name = "testLayer";
+        lp.set("num_heads", num_heads);
+        lp.set("qkv_hidden_sizes", DictValue::arrayInt(qkv_hidden_sizes.data(), qkv_hidden_sizes.size()));
+
+        Net net;
+        int id = net.addLayerToPrev(lp.name, lp.type, lp);
+        net.connect(0, 0, id, 0);
+        net.connect(0, 1, id, 1);
+        net.connect(0, 2, id, 2);
+
+        {
+            std::vector<std::string> input_names{"x", "weight", "bias"};
+            net.setInputsNames(input_names);
+            net.setInput(x, input_names[0]);
+            net.setInput(weight, input_names[1]);
+            net.setInput(bias, input_names[2]);
+
+            net.setPreferableBackend(backendId);
+            net.setPreferableTarget(targetId);
+            Mat out = net.forward();
+        }
+
+        TEST_CYCLE()
+        {
+            Mat out = net.forward();
+        }
+
+        SANITY_CHECK_NOTHING();
+    }
+};
+
+PERF_TEST_P_(Layer_Attention, VisionTransformer) {
+    test_layer({1, 197, 768}, {768, 768, 768}, 12);
+}
+
+struct Layer_GroupNorm : public TestBaseWithParam<tuple<Backend, Target> >
+{
+    void test_layer(const std::vector<int>& x_shape, int num_groups)
+    {
+        int backendId = get<0>(GetParam());
+        int targetId = get<1>(GetParam());
+
+        Mat x(x_shape, CV_32FC1);
+        Mat scale(x_shape[1], 1, CV_32FC1);
+        Mat b(x_shape[1], 1, CV_32FC1);
+
+        randu(x, 0.f, 1.f);
+        randu(scale, 0.f, 1.f);
+        randu(b, 0.f, 1.f);
+
+        Net net;
+        LayerParams lp;
+        lp.type = "GroupNormalization";
+        lp.name = "testLayer";
+        lp.set("num_groups", num_groups);
+
+        int id = net.addLayerToPrev(lp.name, lp.type, lp);
+        net.connect(0, 0, id, 0);
+        net.connect(0, 1, id, 1);
+        net.connect(0, 2, id, 2);
+
+        // warmup
+        {
+            std::vector<String> inpNames{"x", "scale", "b"};
+            net.setInputsNames(inpNames);
+            net.setInput(x, inpNames[0]);
+            net.setInput(scale, inpNames[1]);
+            net.setInput(b, inpNames[2]);
+
+            net.setPreferableBackend(backendId);
+            net.setPreferableTarget(targetId);
+            Mat out = net.forward();
+        }
+
+        TEST_CYCLE()
+        {
+            Mat res = net.forward();
+        }
+
+        SANITY_CHECK_NOTHING();
+    }
+
+    int N = 2;
+    int C = 64;
+    int H = 180;
+    int W = 240;
+    int num_groups = 16;
+};
+
+PERF_TEST_P_(Layer_GroupNorm, GroupNorm)
+{
+    test_layer({N, C, H, W}, num_groups);
+}
+
+
 INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false));
 INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
 #ifdef HAVE_CUDA
 INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA)));
 #endif
-INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
-INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
+#ifdef HAVE_VULKAN
+INSTANTIATE_TEST_CASE_P(VULKAN, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN)));
+#endif
 INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
 INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
+INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
+INSTANTIATE_TEST_CASE_P(/**/, Layer_InstanceNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
+INSTANTIATE_TEST_CASE_P(/**/, Layer_Attention, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
+INSTANTIATE_TEST_CASE_P(/**/, Layer_GroupNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
+
+typedef TestBaseWithParam<tuple<Vec4i, int, bool, tuple<Backend, Target> > > Layer_FullyConnected;
+PERF_TEST_P_(Layer_FullyConnected, fc)
+{
+    std::vector<int> inpShape;
+    inpShape.reserve(4);
+    for (int i = 0; i < 4; ++i) {
+        int dim = get<0>(GetParam())[i];
+        if (dim == 0)
+            break;
+        inpShape.push_back(dim);
+    }
+    Mat input(inpShape, CV_32F);
+    randn(input, 0, 1);
+
+    int axis = input.dims - 1;
+    int outDims = get<1>(GetParam());
+    bool isMatMul = get<2>(GetParam());
+    int backendId = get<0>(get<3>(GetParam()));
+    int targetId = get<1>(get<3>(GetParam()));
+
+    if (inpShape.size() == 4 && inpShape[0] == 5 && inpShape[1] == 16 && inpShape[2] == 512 && inpShape[3] == 128 && outDims >= 512)
+        applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+
+    std::vector<int> weightShape;
+    if (isMatMul) {
+        weightShape = inpShape;
+        weightShape[weightShape.size() - 2] = outDims;
+    } else {
+        weightShape = {outDims, (int)input.total(axis, input.dims)};
+    }
+    Mat weights(weightShape, CV_32F);
+    randn(weights, 0, 1);
+
+    LayerParams lp;
+    lp.set("axis", input.dims - 1);
+    lp.set("is_matmul", weights.dims > 2);
+    lp.set("bias_term", false);
+    lp.set("num_output", (int)weights.total(0, weights.dims - 1));
+    lp.blobs.resize(1, weights);
+
+    Net net;
+    net.addLayerToPrev("matmul", "InnerProduct", lp);
+
+    net.setInput(input);
+    net.setPreferableBackend(backendId);
+    net.setPreferableTarget(targetId);
+
+    // warmup
+    Mat output = net.forward();
+
+    TEST_CYCLE()
+    {
+        net.forward();
+    }
+    SANITY_CHECK_NOTHING();
+}
+INSTANTIATE_TEST_CASE_P(/**/, Layer_FullyConnected, Combine(
+    Values(                // input size
+        Vec4i(5, 512, 384),
+        Vec4i(5, 16, 512, 128)
+    ),
+    Values(256, 512, 1024),  // output dimension
+    testing::Bool(),         // is_matmul
+    dnnBackendsAndTargets()
+));
+
+typedef TestBaseWithParam<tuple<std::vector<int>, int, tuple<Backend, Target> > > Layer_Softmax;
+PERF_TEST_P_(Layer_Softmax, softmax_3d) {
+    std::vector<int> shape = get<0>(GetParam());
+    int axis = get<1>(GetParam());
+    int backendId = get<0>(get<2>(GetParam()));
+    int targetId = get<1>(get<2>(GetParam()));
+
+    Mat data(shape, CV_32FC1);
+    Scalar mean = 0.f;
+    Scalar std = 1.f;
+    randn(data, mean, std);
+
+    Net net;
+    LayerParams lp;
+    lp.type = "Softmax";
+    lp.name = "testLayer";
+    lp.set("axis", axis);
+
+    net.addLayerToPrev(lp.name, lp.type, lp);
+    // warmup
+    {
+        net.setInput(data);
+        net.setPreferableBackend(backendId);
+        net.setPreferableTarget(targetId);
+        Mat out = net.forward();
+    }
+
+    TEST_CYCLE() {
+        Mat res = net.forward();
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Layer_Softmax, Combine(
+    Values(                // input size
+            std::vector<int>({16, 50, 50}),
+            std::vector<int>({16, 197, 197}),
+            std::vector<int>({16, 1024, 1024})
+    ),
+    Values(0, 1, 2),  // axis
+    dnnBackendsAndTargets(/* withInferenceEngine= */ false,
+                          /* withHalide= */          false,
+                          /* withCpuOCV= */          true,
+                          /* withVkCom= */           false,
+                          /* withCUDA= */            false,
+                          /* withNgraph= */          false,
+                          /* withWebnn= */           false,
+                          /* withCann= */            false) // only test on CPU
+));
 
 } // namespace
diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp
index cfbb45b17372..a94775791122 100644
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@@ -29,10 +29,7 @@ class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple<Backend, Target>
     }
 
     void processNet(std::string weights, std::string proto, std::string halide_scheduler,
-                    const Mat& input, const std::string& outputLayer = "")
-    {
-        randu(input, 0.0f, 1.0f);
-
+                    const std::vector<std::tuple<Mat, std::string>>& inputs, const std::string& outputLayer = ""){
         weights = findDataFile(weights, false);
         if (!proto.empty())
             proto = findDataFile(proto);
@@ -43,8 +40,12 @@ class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple<Backend, Target>
             if (!halide_scheduler.empty())
                 halide_scheduler = findDataFile(std::string("dnn/halide_scheduler_") + (target == DNN_TARGET_OPENCL ? "opencl_" : "") + halide_scheduler, true);
         }
-        net = readNet(proto, weights);
-        net.setInput(blobFromImage(input, 1.0, Size(), Scalar(), false));
+        net = readNet(weights, proto);
+        // Set multiple inputs
+        for(auto &inp: inputs){
+            net.setInput(std::get<0>(inp), std::get<1>(inp));
+        }
+
         net.setPreferableBackend(backend);
         net.setPreferableTarget(target);
         if (backend == DNN_BACKEND_HALIDE)
@@ -52,10 +53,14 @@ class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple<Backend, Target>
             net.setHalideScheduler(halide_scheduler);
         }
 
-        MatShape netInputShape = shape(1, 3, input.rows, input.cols);
+        // Calculate multiple inputs memory consumption
+        std::vector<MatShape> netMatShapes;
+        for(auto &inp: inputs){
+            netMatShapes.push_back(shape(std::get<0>(inp)));
+        }
         size_t weightsMemory = 0, blobsMemory = 0;
-        net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
-        int64 flops = net.getFLOPS(netInputShape);
+        net.getMemoryConsumption(netMatShapes, weightsMemory, blobsMemory);
+        int64 flops = net.getFLOPS(netMatShapes);
         CV_Assert(flops > 0);
 
         net.forward(outputLayer); // warmup
@@ -71,31 +76,45 @@ class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple<Backend, Target>
 
         SANITY_CHECK_NOTHING();
     }
-};
 
+    void processNet(std::string weights, std::string proto, std::string halide_scheduler,
+                    Mat &input, const std::string& outputLayer = "")
+    {
+        processNet(weights, proto, halide_scheduler, {std::make_tuple(input, "")}, outputLayer);
+    }
+
+    void processNet(std::string weights, std::string proto, std::string halide_scheduler,
+                    Size inpSize, const std::string& outputLayer = "")
+    {
+        Mat input_data(inpSize, CV_32FC3);
+        randu(input_data, 0.0f, 1.0f);
+        Mat input = blobFromImage(input_data, 1.0, Size(), Scalar(), false);
+        processNet(weights, proto, halide_scheduler, input, outputLayer);
+    }
+};
 
 PERF_TEST_P_(DNNTestNetwork, AlexNet)
 {
     processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt",
-            "alexnet.yml", Mat(cv::Size(227, 227), CV_32FC3));
+            "alexnet.yml", cv::Size(227, 227));
 }
 
 PERF_TEST_P_(DNNTestNetwork, GoogLeNet)
 {
     processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
-            "", Mat(cv::Size(224, 224), CV_32FC3));
+            "", cv::Size(224, 224));
 }
 
 PERF_TEST_P_(DNNTestNetwork, ResNet_50)
 {
     processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
-            "resnet_50.yml", Mat(cv::Size(224, 224), CV_32FC3));
+            "resnet_50.yml", cv::Size(224, 224));
 }
 
 PERF_TEST_P_(DNNTestNetwork, SqueezeNet_v1_1)
 {
     processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt",
-            "squeezenet_v1_1.yml", Mat(cv::Size(227, 227), CV_32FC3));
+            "squeezenet_v1_1.yml", cv::Size(227, 227));
 }
 
 PERF_TEST_P_(DNNTestNetwork, Inception_5h)
@@ -103,7 +122,7 @@ PERF_TEST_P_(DNNTestNetwork, Inception_5h)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) throw SkipTestException("");
     processNet("dnn/tensorflow_inception_graph.pb", "",
             "inception_5h.yml",
-            Mat(cv::Size(224, 224), CV_32FC3), "softmax2");
+            cv::Size(224, 224), "softmax2");
 }
 
 PERF_TEST_P_(DNNTestNetwork, ENet)
@@ -116,13 +135,15 @@ PERF_TEST_P_(DNNTestNetwork, ENet)
         throw SkipTestException("");
 #endif
     processNet("dnn/Enet-model-best.net", "", "enet.yml",
-            Mat(cv::Size(512, 256), CV_32FC3));
+            cv::Size(512, 256));
 }
 
 PERF_TEST_P_(DNNTestNetwork, SSD)
 {
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+
     processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel", "dnn/ssd_vgg16.prototxt", "disabled",
-            Mat(cv::Size(300, 300), CV_32FC3));
+            cv::Size(300, 300));
 }
 
 PERF_TEST_P_(DNNTestNetwork, OpenFace)
@@ -134,15 +155,15 @@ PERF_TEST_P_(DNNTestNetwork, OpenFace)
         throw SkipTestException("");
 #endif
     processNet("dnn/openface_nn4.small2.v1.t7", "", "",
-            Mat(cv::Size(96, 96), CV_32FC3));
+            cv::Size(96, 96));
 }
 
 PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
 {
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
-    processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", "",
-            Mat(cv::Size(300, 300), CV_32FC3));
+    processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt", "",
+            cv::Size(300, 300));
 }
 
 PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
@@ -150,7 +171,7 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
     processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "ssd_mobilenet_v1_coco_2017_11_17.pbtxt", "",
-            Mat(cv::Size(300, 300), CV_32FC3));
+            cv::Size(300, 300));
 }
 
 PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
@@ -158,7 +179,7 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
     processNet("dnn/ssd_mobilenet_v2_coco_2018_03_29.pb", "ssd_mobilenet_v2_coco_2018_03_29.pbtxt", "",
-            Mat(cv::Size(300, 300), CV_32FC3));
+            cv::Size(300, 300));
 }
 
 PERF_TEST_P_(DNNTestNetwork, DenseNet_121)
@@ -166,18 +187,20 @@ PERF_TEST_P_(DNNTestNetwork, DenseNet_121)
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
     processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", "",
-               Mat(cv::Size(224, 224), CV_32FC3));
+               cv::Size(224, 224));
 }
 
 PERF_TEST_P_(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages)
 {
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+
     if (backend == DNN_BACKEND_HALIDE ||
         (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_HDDL)))
         throw SkipTestException("");
     // The same .caffemodel but modified .prototxt
     // See https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/pose/poseParameters.cpp
     processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt", "",
-               Mat(cv::Size(368, 368), CV_32FC3));
+               cv::Size(368, 368));
 }
 
 PERF_TEST_P_(DNNTestNetwork, opencv_face_detector)
@@ -185,20 +208,25 @@ PERF_TEST_P_(DNNTestNetwork, opencv_face_detector)
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
     processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt", "",
-               Mat(cv::Size(300, 300), CV_32FC3));
+               cv::Size(300, 300));
 }
 
 PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
 {
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
     processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", "",
-            Mat(cv::Size(300, 300), CV_32FC3));
+            cv::Size(300, 300));
 }
 
 PERF_TEST_P_(DNNTestNetwork, YOLOv3)
 {
-    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+    applyTestTag(
+        CV_TEST_TAG_MEMORY_2GB,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
@@ -213,15 +241,16 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3)
 #endif
 
     Mat sample = imread(findDataFile("dnn/dog416.png"));
-    cvtColor(sample, sample, COLOR_BGR2RGB);
-    Mat inp;
-    sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
+    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), true);
     processNet("dnn/yolov3.weights", "dnn/yolov3.cfg", "", inp);
 }
 
 PERF_TEST_P_(DNNTestNetwork, YOLOv4)
 {
-    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+    applyTestTag(
+        CV_TEST_TAG_MEMORY_2GB,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
     if (target == DNN_TARGET_MYRIAD)  // not enough resources
@@ -233,9 +262,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4)
         throw SkipTestException("Test is disabled in OpenVINO 2020.4");
 #endif
     Mat sample = imread(findDataFile("dnn/dog416.png"));
-    cvtColor(sample, sample, COLOR_BGR2RGB);
-    Mat inp;
-    sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
+    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), true);
     processNet("dnn/yolov4.weights", "dnn/yolov4.cfg", "", inp);
 }
 
@@ -248,28 +275,61 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4_tiny)
         throw SkipTestException("");
 #endif
     Mat sample = imread(findDataFile("dnn/dog416.png"));
-    cvtColor(sample, sample, COLOR_BGR2RGB);
-    Mat inp;
-    sample.convertTo(inp, CV_32FC3, 1.0f / 255, 0);
+    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(), Scalar(), true);
     processNet("dnn/yolov4-tiny-2020-12.weights", "dnn/yolov4-tiny-2020-12.cfg", "", inp);
 }
 
+PERF_TEST_P_(DNNTestNetwork, YOLOv5) {
+    applyTestTag(CV_TEST_TAG_MEMORY_512MB);
+    Mat sample = imread(findDataFile("dnn/dog416.png"));
+    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
+    processNet("dnn/yolov5n.onnx", "", "", inp);
+}
+
+PERF_TEST_P_(DNNTestNetwork, YOLOv8)
+{
+    applyTestTag(
+        CV_TEST_TAG_MEMORY_512MB,
+        CV_TEST_TAG_DEBUG_LONG
+    );
+
+    Mat sample = imread(findDataFile("dnn/dog416.png"));
+    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
+    processNet("dnn/yolov8n.onnx", "", "", inp);
+}
+
+PERF_TEST_P_(DNNTestNetwork, YOLOX) {
+    applyTestTag(
+        CV_TEST_TAG_MEMORY_512MB,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
+    Mat sample = imread(findDataFile("dnn/dog416.png"));
+    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(640, 640), Scalar(), true);
+    processNet("dnn/yolox_s.onnx", "", "", inp);
+}
+
 PERF_TEST_P_(DNNTestNetwork, EAST_text_detection)
 {
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
-    processNet("dnn/frozen_east_text_detection.pb", "", "", Mat(cv::Size(320, 320), CV_32FC3));
+    processNet("dnn/frozen_east_text_detection.pb", "", "", cv::Size(320, 320));
 }
 
 PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16)
 {
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
-    processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", "", Mat(cv::Size(320, 240), CV_32FC3));
+    processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", "", cv::Size(320, 240));
 }
 
 PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)
 {
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019010000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
         throw SkipTestException("Test is disabled in OpenVINO 2019R1");
@@ -288,7 +348,7 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)
         throw SkipTestException("");
     processNet("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb",
                "dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", "",
-               Mat(cv::Size(800, 600), CV_32FC3));
+               cv::Size(800, 600));
 }
 
 PERF_TEST_P_(DNNTestNetwork, EfficientDet)
@@ -296,12 +356,95 @@ PERF_TEST_P_(DNNTestNetwork, EfficientDet)
     if (backend == DNN_BACKEND_HALIDE || target != DNN_TARGET_CPU)
         throw SkipTestException("");
     Mat sample = imread(findDataFile("dnn/dog416.png"));
-    resize(sample, sample, Size(512, 512));
-    Mat inp;
-    sample.convertTo(inp, CV_32FC3, 1.0/255);
+    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(512, 512), Scalar(), true);
     processNet("dnn/efficientdet-d0.pb", "dnn/efficientdet-d0.pbtxt", "", inp);
 }
 
+PERF_TEST_P_(DNNTestNetwork, EfficientNet)
+{
+    Mat sample = imread(findDataFile("dnn/dog416.png"));
+    Mat inp = blobFromImage(sample, 1.0 / 255.0, Size(224, 224), Scalar(), true);
+    transposeND(inp, {0, 2, 3, 1}, inp);
+    processNet("dnn/efficientnet-lite4.onnx", "", "", inp);
+}
+
+PERF_TEST_P_(DNNTestNetwork, YuNet) {
+    processNet("dnn/onnx/models/yunet-202303.onnx", "", "", cv::Size(640, 640));
+}
+
+PERF_TEST_P_(DNNTestNetwork, SFace) {
+    processNet("dnn/face_recognition_sface_2021dec.onnx", "", "", cv::Size(112, 112));
+}
+
+PERF_TEST_P_(DNNTestNetwork, MPPalm) {
+    Mat inp(cv::Size(192, 192), CV_32FC3);
+    randu(inp, 0.0f, 1.0f);
+    inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
+    transposeND(inp, {0, 2, 3, 1}, inp);
+    processNet("dnn/palm_detection_mediapipe_2023feb.onnx", "", "", inp);
+}
+
+PERF_TEST_P_(DNNTestNetwork, MPHand) {
+    Mat inp(cv::Size(224, 224), CV_32FC3);
+    randu(inp, 0.0f, 1.0f);
+    inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
+    transposeND(inp, {0, 2, 3, 1}, inp);
+    processNet("dnn/handpose_estimation_mediapipe_2023feb.onnx", "", "", inp);
+}
+
+PERF_TEST_P_(DNNTestNetwork, MPPose) {
+    Mat inp(cv::Size(256, 256), CV_32FC3);
+    randu(inp, 0.0f, 1.0f);
+    inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
+    transposeND(inp, {0, 2, 3, 1}, inp);
+    processNet("dnn/pose_estimation_mediapipe_2023mar.onnx", "", "", inp);
+}
+
+PERF_TEST_P_(DNNTestNetwork, PPOCRv3) {
+    applyTestTag(CV_TEST_TAG_MEMORY_512MB);
+    processNet("dnn/onnx/models/PP_OCRv3_DB_text_det.onnx", "", "", cv::Size(736, 736));
+}
+
+PERF_TEST_P_(DNNTestNetwork, PPHumanSeg) {
+    processNet("dnn/human_segmentation_pphumanseg_2023mar.onnx", "", "", cv::Size(192, 192));
+}
+
+PERF_TEST_P_(DNNTestNetwork, CRNN) {
+    Mat inp(cv::Size(100, 32), CV_32FC1);
+    randu(inp, 0.0f, 1.0f);
+    inp = blobFromImage(inp, 1.0, Size(), Scalar(), false);
+    processNet("dnn/text_recognition_CRNN_EN_2021sep.onnx", "", "", inp);
+}
+
+PERF_TEST_P_(DNNTestNetwork, VitTrack) {
+    Mat inp1(cv::Size(128, 128), CV_32FC3);
+    Mat inp2(cv::Size(256, 256), CV_32FC3);
+    randu(inp1, 0.0f, 1.0f);
+    randu(inp2, 0.0f, 1.0f);
+    inp1 = blobFromImage(inp1, 1.0, Size(), Scalar(), false);
+    inp2 = blobFromImage(inp2, 1.0, Size(), Scalar(), false);
+    processNet("dnn/onnx/models/object_tracking_vittrack_2023sep.onnx", "", "",
+               {std::make_tuple(inp1, "template"), std::make_tuple(inp2, "search")});
+}
+
+PERF_TEST_P_(DNNTestNetwork, EfficientDet_int8)
+{
+    if (target != DNN_TARGET_CPU || (backend != DNN_BACKEND_OPENCV &&
+        backend != DNN_BACKEND_TIMVX && backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)) {
+        throw SkipTestException("");
+    }
+    Mat inp = imread(findDataFile("dnn/dog416.png"));
+    inp = blobFromImage(inp, 1.0 / 255.0, Size(320, 320), Scalar(), true);
+    processNet("dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", "", "", inp);
+}
+
+PERF_TEST_P_(DNNTestNetwork, VIT_B_32)
+{
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+
+    processNet("dnn/onnx/models/vit_b_32.onnx", "", "", cv::Size(224, 224));
+}
+
 INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets());
 
 } // namespace
diff --git a/modules/dnn/src/caffe/caffe_importer.cpp b/modules/dnn/src/caffe/caffe_importer.cpp
index 3c08b92a75e6..50e1fbe93fa2 100644
--- a/modules/dnn/src/caffe/caffe_importer.cpp
+++ b/modules/dnn/src/caffe/caffe_importer.cpp
@@ -125,6 +125,7 @@ class CaffeImporter
         {
             const google::protobuf::UnknownField& field = unknownFields.field(i);
             CV_Assert(field.type() == google::protobuf::UnknownField::TYPE_GROUP);
+            CV_CheckGE(field.group().field_count(), 2, "UnknownField should have at least 2 items: name and value");
             std::string fieldName = field.group().field(0).length_delimited();
             std::string fieldValue = field.group().field(1).length_delimited();
             params.set(fieldName, fieldValue);
@@ -278,8 +279,8 @@ class CaffeImporter
                 // Half precision floats.
                 CV_Assert(raw_data.size() / 2 == (int)dstBlob.total());
 
-                Mat halfs((int)shape.size(), &shape[0], CV_16SC1, (void*)raw_data.c_str());
-                convertFp16(halfs, dstBlob);
+                Mat halfs((int)shape.size(), &shape[0], CV_16FC1, (void*)raw_data.c_str());
+                halfs.convertTo(dstBlob, CV_32F);
             }
             else if (pbBlob.raw_data_type() == caffe::FLOAT)
             {
@@ -498,6 +499,11 @@ class CaffeImporter
             {
                 type = "Convolution";
             }
+            else if (type == "Softmax"){
+                // set default axis to 1
+                if(!layerParams.has("axis"))
+                    layerParams.set("axis", 1);
+            }
 
             int id = dstNet.addLayer(name, type, layerParams);
 
diff --git a/modules/dnn/src/caffe/caffe_shrinker.cpp b/modules/dnn/src/caffe/caffe_shrinker.cpp
index 99e0ef85c126..a23ff5deb30a 100644
--- a/modules/dnn/src/caffe/caffe_shrinker.cpp
+++ b/modules/dnn/src/caffe/caffe_shrinker.cpp
@@ -44,8 +44,8 @@ void shrinkCaffeModel(const String& src, const String& dst, const std::vector<St
             CV_Assert(blob->data_size() != 0);  // float32 array.
 
             Mat floats(1, blob->data_size(), CV_32FC1, (void*)blob->data().data());
-            Mat halfs(1, blob->data_size(), CV_16SC1);
-            convertFp16(floats, halfs);  // Convert to float16.
+            Mat halfs(1, blob->data_size(), CV_16FC1);
+            floats.convertTo(halfs, CV_16F);  // Convert to float16.
 
             blob->clear_data();  // Clear float32 data.
 
diff --git a/modules/dnn/src/cuda/activations.cu b/modules/dnn/src/cuda/activations.cu
index e12457a164ce..e983c95a9161 100644
--- a/modules/dnn/src/cuda/activations.cu
+++ b/modules/dnn/src/cuda/activations.cu
@@ -248,6 +248,11 @@ void selu(const Stream& stream, Span<T> output, View<T> input, T alpha, T gamma)
     generic_op<T, SeluFunctor<T>>(stream, output, input, {alpha, gamma});
 }
 
+template <class T>
+void gelu(const Stream& stream, Span<T> output, View<T> input) {
+    generic_op<T, GeluFunctor<T>>(stream, output, input);
+}
+
 template <class T>
 void sign(const Stream& stream, Span<T> output, View<T> input) {
     generic_op<T, SignFunctor<T>>(stream, output, input);
@@ -324,6 +329,7 @@ template void tan<__half>(const Stream&, Span<__half>, View<__half>);
 template void celu<__half>(const Stream&, Span<__half>, View<__half>, __half);
 template void hardsigmoid<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
 template void selu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
+template void gelu<__half>(const Stream&, Span<__half>, View<__half>);
 template void thresholdedrelu<__half>(const Stream&, Span<__half>, View<__half>, __half);
 template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
 template void exp<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
@@ -366,6 +372,7 @@ template void tan<float>(const Stream&, Span<float>, View<float>);
 template void celu<float>(const Stream&, Span<float>, View<float>, float);
 template void hardsigmoid<float>(const Stream&, Span<float>, View<float>, float, float);
 template void selu<float>(const Stream&, Span<float>, View<float>, float, float);
+template void gelu<float>(const Stream&, Span<float>, View<float>);
 template void thresholdedrelu<float>(const Stream&, Span<float>, View<float>, float);
 template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
 template void exp<float>(const Stream&, Span<float>, View<float>, float, float);
diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu
index f94bdb811346..e2a7cc9a674d 100644
--- a/modules/dnn/src/cuda/eltwise_ops.cu
+++ b/modules/dnn/src/cuda/eltwise_ops.cu
@@ -132,8 +132,23 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
     }
     else
     {
-        CV_Assert(is_shape_compatible(output, x));
-        CV_Assert(is_shape_compatible(output, y));
+        auto inShape1 = x.shape_as_vector();
+        auto inShape2 = y.shape_as_vector();
+        auto outShape = output.shape_as_vector();
+
+        std::size_t x_ndims = inShape1.size(), y_ndims = inShape2.size();
+        if (x_ndims >= y_ndims) {
+            for (std::size_t i = 0; i < (x_ndims - y_ndims); i++) {
+               inShape2.insert(inShape2.begin(), 1);
+            }
+        } else {
+            for (std::size_t i = 0; i < (y_ndims - x_ndims); i++) {
+               inShape1.insert(inShape1.begin(), 1);
+            }
+        }
+
+        CV_Assert(is_shape_compatible1(outShape, inShape1));
+        CV_Assert(is_shape_compatible1(outShape, inShape2));
 
         /* matching singleton axes in both input tensors can be eliminated
          *
@@ -148,20 +163,21 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
          * x: [1, 256, 32, 32] -> [256, 32, 32]
          * y: [1, 256, 1, 1] -> [256, 1, 1]
          */
-        for (int r = 0; r < output.rank(); r++)
-        {
-            while (x.rank() > r && y.rank() > r && x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) {
-                CV_Assert(output.get_axis_size(r) == 1);
-
-                x.squeeze(r);
-                y.squeeze(r);
-                output.squeeze(r);
+        int eliminate_times = 0;
+        for (std::size_t i = 0; i < outShape.size(); i++) {
+            if (inShape1[i] == 1 && inShape2[i] == 1 && outShape[i] == 1 && i != (outShape.size() - 1)) {
+                eliminate_times++;
+            } else {
+                break;
+            }
+        }
+        if (eliminate_times > 0) {
+            for (int i = 0; i < eliminate_times; i++) {
+                inShape1.erase(inShape1.begin());
+                inShape2.erase(inShape2.begin());
+                outShape.erase(outShape.begin());
             }
         }
-
-        auto inShape1 = x.shape_as_vector();
-        auto inShape2 = y.shape_as_vector();
-        auto outShape = output.shape_as_vector();
 
         /* contiguous axes that do not broadcast can be merged into one axis
          *
@@ -319,7 +335,25 @@ void eltwise_div_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
     eltwise_op<T, DivFunctor<T>>(stream, output, x, y);
 }
 
+template <class T>
+void eltwise_sub_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
+    eltwise_op<T, SubFunctor<T>>(stream, output, x, y);
+}
+
+template <class T>
+void eltwise_mod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
+    eltwise_op<T, ModFunctor<T>>(stream, output, x, y);
+}
+
+template <class T>
+void eltwise_fmod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
+    eltwise_op<T, FModFunctor<T>>(stream, output, x, y);
+}
+
 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+    template void eltwise_mod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
+    template void eltwise_fmod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
+    template void eltwise_sub_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
     template void eltwise_div_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
     template void eltwise_prod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
     template void eltwise_sum_coeff_2(const Stream&, TensorSpan<__half>, __half, TensorView<__half>, __half, TensorView<__half>);
@@ -327,6 +361,9 @@ void eltwise_div_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
     template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
     template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
 #endif
+    template void eltwise_mod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
+    template void eltwise_fmod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
+    template void eltwise_sub_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_div_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_prod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_sum_coeff_2(const Stream&, TensorSpan<float>, float, TensorView<float>, float, TensorView<float>);
diff --git a/modules/dnn/src/cuda/functors.hpp b/modules/dnn/src/cuda/functors.hpp
index 83a949f8e7eb..cada43387e0e 100644
--- a/modules/dnn/src/cuda/functors.hpp
+++ b/modules/dnn/src/cuda/functors.hpp
@@ -588,6 +588,21 @@ struct SeluFunctor {
     T alpha, gamma;
 };
 
+template <class T>
+struct GeluFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() { }
+    };
+
+    CUDA4DNN_DEVICE GeluFunctor() { }
+    CUDA4DNN_DEVICE GeluFunctor(const Params& params) { }
+
+    CUDA4DNN_DEVICE T operator()(T value) {
+        using csl::device::erf;
+        return static_cast<T>(0.5f) * value * (static_cast<T>(1.f) + erf(value * static_cast<T>(M_SQRT1_2)));
+    }
+};
+
 template <class T>
 struct ThresholdedReluFunctor {
     struct Params {
@@ -726,6 +741,18 @@ struct DivFunctor {
     CUDA4DNN_DEVICE T operator()(T x, T y) { return x / y; }
 };
 
+template <class T>
+struct SubFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() { }
+    };
+
+    CUDA4DNN_DEVICE SubFunctor() { }
+    CUDA4DNN_DEVICE SubFunctor(const Params& params) { }
+
+    CUDA4DNN_DEVICE T operator()(T x, T y) { return x - y; }
+};
+
 template <class T>
 struct SignFunctor {
     struct Params {
@@ -772,6 +799,40 @@ struct ReciprocalFunctor {
     }
 };
 
+template <class T>
+struct ModFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() {}
+    };
+
+    CUDA4DNN_DEVICE ModFunctor() { }
+    CUDA4DNN_DEVICE ModFunctor(const Params& params) { }
+
+    CUDA4DNN_DEVICE T operator()(T x, T y) {
+        int res = (int)x % (int)y;
+        T zero = T(0);
+        if ((res > (int)zero && y < zero) || (res < (int)zero && y > zero)) {
+            res += (int)y;
+        }
+        return res;
+    }
+};
+
+template <class T>
+struct FModFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() {}
+    };
+
+    CUDA4DNN_DEVICE FModFunctor() { }
+    CUDA4DNN_DEVICE FModFunctor(const Params& params) { }
+
+    CUDA4DNN_DEVICE T operator()(T x, T y) {
+        using csl::device::fmod;
+        return fmod(x, y);
+    }
+};
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
 
 #endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */
diff --git a/modules/dnn/src/cuda/math.hpp b/modules/dnn/src/cuda/math.hpp
index 0a312a250d74..8e4f091f4f42 100644
--- a/modules/dnn/src/cuda/math.hpp
+++ b/modules/dnn/src/cuda/math.hpp
@@ -36,6 +36,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace de
     template <> inline __device__ float min(float x, float y) { return fminf(x, y); }
     template <> inline __device__ double min(double x, double y) { return fmin(x, y); }
 
+    template <class T> __device__ T fmod(T x, T y) { return x % y; }
+    template <> inline __device__ float fmod(float x, float y) { return fmodf(x, y); }
+    template <> inline __device__ double fmod(double x, double y) { return fmod(x, y); }
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+    template <> inline __device__ half fmod(half x, half y) { return fmodf((float)x, (float)y); }
+#endif
+
     template <class T> __device__ T log1p(T val);
 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
     template <> inline __device__ __half log1p(__half val) { return hlog(__half(1) + val); }
diff --git a/modules/dnn/src/cuda/mvn.cu b/modules/dnn/src/cuda/mvn.cu
index adf997c0b0c6..d6db7c4fb46d 100644
--- a/modules/dnn/src/cuda/mvn.cu
+++ b/modules/dnn/src/cuda/mvn.cu
@@ -66,6 +66,50 @@ namespace raw {
             output[idx] = (static_cast<float>(input[idx]) - means[outer_idx]) * scale[outer_idx];
         }
     }
+
+    template <class T>
+    __global__ void normalize_mean_variance_channelwise(Span<T> output, View<T> input, View<T> scale, View<T> bias, View<float> means, View<float> inv_stddev, size_type inner_size, size_type C) {
+        for (auto idx : grid_stride_range(output.size())) {
+            const index_type outer_idx = idx / inner_size;
+            const index_type c = outer_idx % C;
+            auto s = static_cast<float>(scale[c]) * inv_stddev[outer_idx];
+            auto b = static_cast<float>(bias[c]);
+            output[idx] = (static_cast<float>(input[idx]) - means[outer_idx]) * s + b;
+        }
+    }
+
+    template <class T>
+    __global__ void normalize_mean_variance_groupwise(Span<T> output, View<T> input, View<T> scale, View<T> bias, View<float> means, View<float> inv_stddev, size_type inner_size, size_type C, size_type num_groups, size_type group_size) {
+        for (auto idx : grid_stride_range(output.size())) {
+            const index_type outer_idx = idx / inner_size;
+            const index_type c = outer_idx % C;
+            const index_type group_idx = outer_idx / group_size;
+            auto s = static_cast<float>(scale[c]) * inv_stddev[group_idx];
+            auto b = static_cast<float>(bias[c]);
+            output[idx] = (static_cast<float>(input[idx]) - means[group_idx]) * s + b;
+        }
+    }
+
+    template <class T>
+    __global__ void normalize_mean_variance_layernorm(Span<T> output, View<T> input, View<T> scale, View<float> means, View<float> inv_stddev, size_type inner_size) {
+        for (auto idx : grid_stride_range(output.size())) {
+            const index_type outer_idx = idx / inner_size;
+            const index_type inner_idx = idx % inner_size;
+            auto s = static_cast<float>(scale[inner_idx]) * inv_stddev[outer_idx];
+            output[idx] = (static_cast<float>(input[idx]) - means[outer_idx]) * s;
+        }
+    }
+
+    template <class T>
+    __global__ void normalize_mean_variance_layernorm_with_bias(Span<T> output, View<T> input, View<T> scale, View<T> bias, View<float> means, View<float> inv_stddev, size_type inner_size) {
+        for (auto idx : grid_stride_range(output.size())) {
+            const index_type outer_idx = idx / inner_size;
+            const index_type inner_idx = idx % inner_size;
+            auto s = static_cast<float>(scale[inner_idx]) * inv_stddev[outer_idx];
+            auto b = static_cast<float>(bias[inner_idx]);
+            output[idx] = (static_cast<float>(input[idx]) - means[outer_idx]) * s + b;
+        }
+    }
 }
 
 template <class T>
@@ -142,4 +186,73 @@ template void normalize_mean_variance(const Stream&, Span<__half>, View<__half>,
 #endif
 template void normalize_mean_variance(const Stream&, Span<float>, View<float>, View<float>, View<float>, std::size_t);
 
+template <class T>
+void normalize_mean_variance_channelwise(const Stream& stream, Span<T> output, View<T> input, View<T> scale, View<T> bias, View<float> means, View<float> inv_stddev, std::size_t inner_size, std::size_t C)
+{
+    CV_Assert(input.size() == output.size());
+    CV_Assert(input.size() / inner_size == means.size());
+    CV_Assert(means.size() == inv_stddev.size());
+
+    auto kernel = raw::normalize_mean_variance_channelwise<T>;
+    auto policy = make_policy(kernel, output.size(), 0, stream);
+    launch_kernel(kernel, policy, output, input, scale, bias, means, inv_stddev, inner_size, C);
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+template void normalize_mean_variance_channelwise(const Stream&, Span<__half> /*output*/, View<__half> /*input*/, View<__half> /*scale*/, View<__half> /*bias*/, View<float> /*means*/, View<float> /*inv_stddev*/, std::size_t, std::size_t);
+#endif
+template void normalize_mean_variance_channelwise(const Stream&, Span<float> /*output*/, View<float> /*input*/, View<float> /*scale*/, View<float> /*bias*/, View<float> /*means*/, View<float> /*inv_stddev*/, std::size_t, std::size_t);
+
+template <class T>
+void normalize_mean_variance_groupwise(const Stream& stream, Span<T> output, View<T> input, View<T> scale, View<T> bias, View<float> means, View<float> inv_stddev, std::size_t inner_size, std::size_t C, std::size_t num_groups, std::size_t group_size)
+{
+    CV_Assert(input.size() == output.size());
+    CV_Assert(input.size() / inner_size == means.size() * group_size);
+    CV_Assert(means.size() == inv_stddev.size());
+
+    auto kernel = raw::normalize_mean_variance_groupwise<T>;
+    auto policy = make_policy(kernel, output.size(), 0, stream);
+    launch_kernel(kernel, policy, output, input, scale, bias, means, inv_stddev, inner_size, C, num_groups, group_size);
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+template void normalize_mean_variance_groupwise(const Stream&, Span<__half> /*output*/, View<__half> /*input*/, View<__half> /*scale*/, View<__half> /*bias*/, View<float> /*means*/, View<float> /*inv_stddev*/, std::size_t, std::size_t, std::size_t, std::size_t);
+#endif
+template void normalize_mean_variance_groupwise(const Stream&, Span<float> /*output*/, View<float> /*input*/, View<float> /*scale*/, View<float> /*bias*/, View<float> /*means*/, View<float> /*inv_stddev*/, std::size_t, std::size_t, std::size_t, std::size_t);
+
+
+template <class T>
+void normalize_mean_variance_layernorm(const Stream& stream, Span<T> output, View<T> input, View<T> scale, View<float> means, View<float> inv_stddev, std::size_t inner_size)
+{
+    CV_Assert(input.size() == output.size());
+    CV_Assert(input.size() / inner_size == means.size());
+    CV_Assert(means.size() == inv_stddev.size());
+
+    auto kernel = raw::normalize_mean_variance_layernorm<T>;
+    auto policy = make_policy(kernel, output.size(), 0, stream);
+    launch_kernel(kernel, policy, output, input, scale, means, inv_stddev, inner_size);
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+template void normalize_mean_variance_layernorm(const Stream&, Span<__half> /*output*/, View<__half> /*input*/, View<__half> /*scale*/, View<float> /*means*/, View<float> /*inv_stddev*/, std::size_t);
+#endif
+template void normalize_mean_variance_layernorm(const Stream&, Span<float> /*output*/, View<float> /*input*/, View<float> /*scale*/, View<float> /*means*/, View<float> /*inv_stddev*/, std::size_t);
+
+template <class T>
+void normalize_mean_variance_layernorm(const Stream& stream, Span<T> output, View<T> input, View<T> scale, View<T> bias, View<float> means, View<float> inv_stddev, std::size_t inner_size)
+{
+    CV_Assert(input.size() == output.size());
+    CV_Assert(input.size() / inner_size == means.size());
+    CV_Assert(means.size() == inv_stddev.size());
+
+    auto kernel = raw::normalize_mean_variance_layernorm_with_bias<T>;
+    auto policy = make_policy(kernel, output.size(), 0, stream);
+    launch_kernel(kernel, policy, output, input, scale, bias, means, inv_stddev, inner_size);
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+template void normalize_mean_variance_layernorm(const Stream&, Span<__half> /*output*/, View<__half> /*input*/, View<__half> /*scale*/, View<__half> /*bias*/, View<float> /*means*/, View<float> /*inv_stddev*/, std::size_t);
+#endif
+template void normalize_mean_variance_layernorm(const Stream&, Span<float> /*output*/, View<float> /*input*/, View<float> /*scale*/, View<float> /*bias*/, View<float> /*means*/, View<float> /*inv_stddev*/, std::size_t);
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
diff --git a/modules/dnn/src/cuda4dnn/csl/cublas.hpp b/modules/dnn/src/cuda4dnn/csl/cublas.hpp
index 760e3824fdbe..96cf70fab9cd 100644
--- a/modules/dnn/src/cuda4dnn/csl/cublas.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/cublas.hpp
@@ -8,6 +8,7 @@
 #include "error.hpp"
 #include "stream.hpp"
 #include "pointer.hpp"
+#include "memory.hpp"
 
 #include <opencv2/core.hpp>
 
@@ -363,6 +364,145 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu
         );
     }
 
+    /** @brief Strided batched GEMM for colummn-major matrices
+     *
+     * \f$ C_i = \alpha A_i B_i + \beta C_i \f$ for a stack of matrices A, B and C indexed by i
+     *
+     * @tparam          T           matrix element type (must be `half` or `float`)
+     *
+     * @param           handle      valid cuBLAS Handle
+     * @param           trans_a     use transposed matrix of A_i for computation
+     * @param           trans_b     use transposed matrix of B_i for computation
+     * @param           M           number of rows in C
+     * @param           N           number of columns in C
+     * @param           K           common dimension of A (or trans A) and B (or trans B)
+     * @param           alpha       scale factor for A B
+     * @param[in]       A           pointer to stack of column-major matrices A in device memory
+     * @param           lda         leading dimension of matrix A
+     * @param           A_offsets   offsets to get A slices
+     * @param[in]       B           pointer to stack of column-major matrices B in device memory
+     * @param           ldb         leading dimension of matrix B
+     * @param           B_offsets   offsets to get B slices
+     * @param           beta        scale factor for C
+     * @param[in,out]   C           pointer to stack of column-major matrices C in device memory
+     * @param           ldc         leading dimension of matrix C
+     * @param           C_offsets   offsets to get C slices
+     * @param           batchCount  number of matrices in the batch
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void gemmBatched(const Handle &handle,
+                     bool trans_a, bool trans_b,
+                     std::size_t M, std::size_t N, std::size_t K,
+                     T alpha,
+                     const DevicePtr<const T> A, std::size_t lda, std::vector<std::size_t> A_offsets,
+                     const DevicePtr<const T> B, std::size_t ldb, std::vector<std::size_t> B_offsets,
+                     T beta,
+                     const DevicePtr<T> C, std::size_t ldc, std::vector<std::size_t> C_offsets,
+                     std::size_t batchCount);
+
+    template <> inline
+    void gemmBatched<half>(const Handle &handle,
+                           bool trans_a, bool trans_b,
+                           std::size_t M, std::size_t N, std::size_t K,
+                           half alpha,
+                           const DevicePtr<const half> A, std::size_t lda, std::vector<std::size_t> A_offsets,
+                           const DevicePtr<const half> B, std::size_t ldb, std::vector<std::size_t> B_offsets,
+                           half beta,
+                           const DevicePtr<half> C, std::size_t ldc, std::vector<std::size_t> C_offsets,
+                           std::size_t batchCount) {
+        CV_Assert(handle);
+
+        const auto opa = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N,
+                   opb = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
+        const auto iM = static_cast<int>(M),
+                   iN = static_cast<int>(N),
+                   iK = static_cast<int>(K),
+                   ilda = static_cast<int>(lda),
+                   ildb = static_cast<int>(ldb),
+                   ildc = static_cast<int>(ldc);
+
+        const auto batch_count = static_cast<int>(batchCount);
+
+        AutoBuffer<half> buffer(3 * batch_count);
+        auto A_slices = (half**)(buffer.data());
+        auto B_slices = A_slices + batch_count;
+        auto C_slices = B_slices + batch_count;
+        // collect A, B and C slices
+        for (int i = 0; i < batch_count; i++) {
+            A_slices[i] = (half*)(A.get()) + A_offsets[i];
+            B_slices[i] = (half*)(B.get()) + B_offsets[i];
+            C_slices[i] = (half*)(C.get()) + C_offsets[i];
+        }
+
+        const half **dev_A_slices = 0, **dev_B_slices = 0;
+        half **dev_C_slices = 0;
+        cudaMalloc((void**)&dev_A_slices, batch_count * sizeof(half*));
+        cudaMalloc((void**)&dev_B_slices, batch_count * sizeof(half*));
+        cudaMalloc((void**)&dev_C_slices, batch_count * sizeof(half*));
+        cudaMemcpy(dev_A_slices, A_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_B_slices, B_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_C_slices, C_slices, batch_count * sizeof(half*), cudaMemcpyHostToDevice);
+
+        CUDA4DNN_CHECK_CUBLAS(cublasHgemmBatched(handle.get(), opa, opb, iM, iN, iK, &alpha, dev_A_slices, ilda, dev_B_slices, ildb, &beta, dev_C_slices, ildc, batch_count));
+
+        cudaFree(dev_A_slices);
+        cudaFree(dev_B_slices);
+        cudaFree(dev_C_slices);
+    }
+
+    template <> inline
+    void gemmBatched<float>(const Handle &handle,
+                           bool trans_a, bool trans_b,
+                           std::size_t M, std::size_t N, std::size_t K,
+                           float alpha,
+                           const DevicePtr<const float> A, std::size_t lda, std::vector<std::size_t> A_offsets,
+                           const DevicePtr<const float> B, std::size_t ldb, std::vector<std::size_t> B_offsets,
+                           float beta,
+                           const DevicePtr<float> C, std::size_t ldc, std::vector<std::size_t> C_offsets,
+                           std::size_t batchCount) {
+        CV_Assert(handle);
+
+        const auto opa = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N,
+                   opb = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
+        const auto iM = static_cast<int>(M),
+                   iN = static_cast<int>(N),
+                   iK = static_cast<int>(K),
+                   ilda = static_cast<int>(lda),
+                   ildb = static_cast<int>(ldb),
+                   ildc = static_cast<int>(ldc);
+
+        const auto batch_count = static_cast<int>(batchCount);
+
+        AutoBuffer<float> buffer(3 * batch_count);
+        auto A_slices = (float**)(buffer.data());
+        auto B_slices = A_slices + batch_count;
+        auto C_slices = B_slices + batch_count;
+        // collect A, B and C slices
+        for (int i = 0; i < batch_count; i++) {
+            A_slices[i] = (float*)(A.get()) + A_offsets[i];
+            B_slices[i] = (float*)(B.get()) + B_offsets[i];
+            C_slices[i] = (float*)(C.get()) + C_offsets[i];
+        }
+
+        const float **dev_A_slices = 0, **dev_B_slices = 0;
+        float **dev_C_slices = 0;
+        cudaMalloc((void**)&dev_A_slices, batch_count * sizeof(float*));
+        cudaMalloc((void**)&dev_B_slices, batch_count * sizeof(float*));
+        cudaMalloc((void**)&dev_C_slices, batch_count * sizeof(float*));
+        cudaMemcpy(dev_A_slices, A_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_B_slices, B_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_C_slices, C_slices, batch_count * sizeof(float*), cudaMemcpyHostToDevice);
+
+        // cuBLAS is column-major
+        CUDA4DNN_CHECK_CUBLAS(cublasSgemmBatched(handle.get(), opa, opb, iM, iN, iK, &alpha, dev_A_slices, ilda, dev_B_slices, ildb, &beta, dev_C_slices, ildc, batch_count));
+
+        cudaFree(dev_A_slices);
+        cudaFree(dev_B_slices);
+        cudaFree(dev_C_slices);
+    }
+
 }}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */
 
 #endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP */
diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/recurrent.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/recurrent.hpp
index 7ba6acdf173c..8006dca62b96 100644
--- a/modules/dnn/src/cuda4dnn/csl/cudnn/recurrent.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/cudnn/recurrent.hpp
@@ -97,7 +97,7 @@ class RNNDescriptor
 
     /**
     */
-    RNNDescriptor(const Handle &handle, RNNMode mode, int hidden_size, int num_layers,
+    RNNDescriptor(const Handle &handle, RNNMode mode, int input_size, int hidden_size, int num_layers,
                   bool bidirectional, const DropoutDescriptor &dropoutDesc)
     {
         CUDA4DNN_CHECK_CUDNN(cudnnCreateRNNDescriptor(&descriptor));
@@ -119,12 +119,35 @@ class RNNDescriptor
 
         try
         {
+#if CUDNN_MAJOR >= 9
+            CUDA4DNN_CHECK_CUDNN(cudnnSetRNNDescriptor_v8(
+                                    descriptor,
+                                    algo,
+                                    rnn_mode,
+                                    CUDNN_RNN_DOUBLE_BIAS,
+                                    bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL,
+                                    CUDNN_LINEAR_INPUT, detail::get_data_type<T>(),
+                                    detail::get_data_type<T>(),
+                                    detail::get_data_type<T>() == CUDNN_DATA_HALF ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH,
+                                    input_size,
+                                    hidden_size,
+                                    hidden_size,
+                                    num_layers,
+                                    dropoutDesc.get(),
+                                    0)); // What other flags do we might want here?
+#else
             CUDA4DNN_CHECK_CUDNN(cudnnSetRNNDescriptor_v6(
-                handle.get(), descriptor, hidden_size, num_layers, dropoutDesc.get(),
-                CUDNN_LINEAR_INPUT, bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL,
-                rnn_mode,
-                algo, //CUDNN_RNN_ALGO_STANDARD,
-                detail::get_data_type<T>()));
+                                    handle.get(),
+                                    descriptor,
+                                    hidden_size,
+                                    num_layers,
+                                    dropoutDesc.get(),
+                                    CUDNN_LINEAR_INPUT,
+                                    bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL,
+                                    rnn_mode,
+                                    algo,
+                                    detail::get_data_type<T>()));
+#endif
         }
         catch (...)
         {
@@ -158,16 +181,34 @@ class RNNDescriptor
     cudnnRNNAlgo_t algo{CUDNN_RNN_ALGO_STANDARD};
 };
 
-template<class T>
-size_t getRNNWorkspaceSize(const Handle &handle, const RNNDescriptor<T> &rnnDesc,
-                           const int seqLength, const TensorDescriptorsArray<T> &inputDesc)
+#if CUDNN_MAJOR >= 9
+template <class T>
+void LSTMForward(const Handle &handle, const RNNDescriptor<T> &rnnDesc,
+                 cudnnRNNDataDescriptor_t xDesc, DevicePtr<const T> x,
+                 cudnnRNNDataDescriptor_t yDesc, DevicePtr<T> y,
+                 cudnnTensorDescriptor_t hDesc, DevicePtr<const T> hx, DevicePtr<T> hy,
+                 cudnnTensorDescriptor_t cDesc, DevicePtr<const T> cx, DevicePtr<T> cy,
+                 size_t weightSpaceSize, DevicePtr<const T> weightSpace,
+                 size_t cudnn_WorkspaceSize, DevicePtr<T> cudnn_Workspace,
+                 size_t reserveSpaceSize, DevicePtr<T> reserveSpace)
 {
-    size_t workSize;
-    CUDA4DNN_CHECK_CUDNN(cudnnGetRNNWorkspaceSize(handle.get(), rnnDesc.get(), seqLength,
-                                                  inputDesc.get().data(), &workSize));
-    return workSize;
+    CV_Assert(handle);
+
+    std::cout << "cudnn_WorkspaceSize: " << cudnn_WorkspaceSize << std::endl;
+    std::cout << "reserveSpaceSize: " << reserveSpaceSize << std::endl;
+
+    CUDA4DNN_CHECK_CUDNN(cudnnRNNForward(
+        handle.get(), rnnDesc.get(), CUDNN_FWD_MODE_INFERENCE,
+        nullptr, // docs say use this as null on >= 8.9.1
+        xDesc, x.get(), yDesc, y.get(),
+        hDesc, hx.get(), hy.get(),
+        cDesc, cx.get(), cy.get(),
+        weightSpaceSize, weightSpace.get(),
+        cudnn_WorkspaceSize, cudnn_Workspace.get(),
+        reserveSpaceSize, reserveSpace.get()));
 }
 
+#else
 template<class T>
 void LSTMForward(const Handle &handle, const RNNDescriptor<T> &rnnDesc,
                  const FilterDescriptor<T> &filterDesc, DevicePtr<const T> filterPtr,
@@ -189,7 +230,8 @@ void LSTMForward(const Handle &handle, const RNNDescriptor<T> &rnnDesc,
                                                   initialCDesc.get(), ycOutputPtr.get(),
                                                   static_cast<void*>(workspace.get()), workspace.size_in_bytes()));
 }
+#endif
 
 }}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
 
-#endif //OPENCV_DNN_CUDA4DNN_CSL_CUDNN_RECURRENT_HPP
\ No newline at end of file
+#endif //OPENCV_DNN_CUDA4DNN_CSL_CUDNN_RECURRENT_HPP
diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp
index 5a1286de99d6..8f495ac8071c 100644
--- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp
@@ -1187,6 +1187,23 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
         return true;
     }
 
+    template <typename ShapeType>
+    bool is_shape_compatible1(const ShapeType &x_shape, const ShapeType &y_shape) noexcept {
+        const auto x_ndims = x_shape.size(), y_ndims = y_shape.size();
+
+        if (x_ndims != y_ndims) {
+            return false;
+        }
+
+        for (int i = 0; i < x_ndims; i++) {
+            if (x_shape[i] != y_shape[i] && x_shape[i] != 1 && y_shape[i] != 1) {
+                 return false;
+            }
+        }
+
+        return true;
+    }
+
     /** returns the rank to which the given tensor can be squeezed to */
     template <class TensorType>
     std::size_t get_effective_rank(const TensorType& x) noexcept {
diff --git a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp
index 27f8306bf337..1c439fb3d6ad 100644
--- a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp
@@ -152,6 +152,31 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
                 batch_size);
         }
 
+        /** @brief performs generalized matrix-multiplication for a strided batch of matrices
+         *
+         * Pre-conditions:
+         * - A, B and C must be rank three tensors with dimensions (batch, rows, cols)
+         * - the last two axes of \p A and \p B must meet the mathematical requirements for matrix multiplication
+         * - \p C must be large enough to hold the result and the matrices must not overlap in memory
+         *
+         * Exception Guarantee: Basic
+         */
+        template <class T> inline
+        void gemmBatched(const cublas::Handle& handle, std::size_t batch,
+                         T beta, TensorSpan<T> C, const std::vector<std::size_t> C_offsets, T alpha,
+                         bool trans_a, TensorView<T> A, const std::vector<std::size_t> A_offsets,
+                         bool trans_b, TensorView<T> B, const std::vector<std::size_t> B_offsets) {
+            const auto M = C.get_axis_size(-2),
+                       N = C.get_axis_size(-1),
+                       K = A.get_axis_size(trans_a ? -2 : -1);
+            const auto lda = A.get_axis_size(-1),
+                       ldb = B.get_axis_size(-1),
+                       ldc = N;
+
+            // collect pointers and run cublasSgemmBatched / cublasHgemmBatched
+            csl::cublas::gemmBatched<T>(handle, trans_b, trans_a, N, M, K, 1.f, B.get(), ldb, B_offsets, A.get(), lda, A_offsets, 0.f, C.get(), ldc, C_offsets, batch);
+        }
+
         /** @brief performs element-wise addition with broadcasting
          *
          * Pre-conditions:
@@ -503,6 +528,46 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
         LSTM() = default;
         LSTM(const LSTM&) = delete;
         LSTM(LSTM&&) = default;
+
+#if CUDNN_MAJOR >= 9
+        LSTM(cudnn::Handle handle, const params_type &params)
+            : cudnnHandle(std::move(handle)), seqLength(params.seqLength)
+        {
+            std::vector<int> seqLenArr(params.miniBatch, seqLength);
+            cudnnCreateRNNDataDescriptor(&xDesc);
+            cudnnSetRNNDataDescriptor(xDesc, cudnn::detail::get_data_type<T>(),
+                                    CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, seqLength,
+                                    params.miniBatch, params.inputSize, seqLenArr.data(),
+                                    nullptr);
+            cudnnCreateRNNDataDescriptor(&cyDesc);
+            cudnnSetRNNDataDescriptor(
+                cyDesc, cudnn::detail::get_data_type<T>(),
+                CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED,
+                seqLength, params.miniBatch,
+                params.bidirectional ? params.hiddenSize * 2 : params.hiddenSize,
+                seqLenArr.data(),
+                nullptr);
+
+            dropoutDesc = DropoutDescriptor(cudnnHandle, params.dropout);
+            rnnDesc = RNNDescriptor(cudnnHandle, params.type, params.inputSize, params.hiddenSize,
+                                    params.numLayers, params.bidirectional, dropoutDesc);
+
+            int num_direction = params.bidirectional ? 2 : 1;
+            h0TensorDesc = TensorDescriptor(num_direction, params.miniBatch, params.hiddenSize);
+            c0TensorDesc = TensorDescriptor(num_direction, params.miniBatch, params.hiddenSize);
+
+            // Get amount of work space required to execute the RNN described by rnnDesc
+            // with input dimensions defined by inputDesc
+            CUDA4DNN_CHECK_CUDNN(cudnnGetRNNTempSpaceSizes(
+                                    cudnnHandle.get(), rnnDesc.get(), CUDNN_FWD_MODE_INFERENCE,
+                                    xDesc, &workSpaceSize, &reserveSpaceSize));
+
+            csl::WorkspaceBuilder builder;
+            builder.require<T>(workSpaceSize);
+            builder.require<T>(reserveSpaceSize);
+            scratch_mem_in_bytes = builder.required_workspace_size();
+        }
+#else
         LSTM(cudnn::Handle handle, const params_type& params)
             : cudnnHandle(std::move(handle)), seqLength{params.seqLength},
               inputDesc(seqLength, {params.miniBatch, params.inputSize, 1}),
@@ -513,7 +578,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
         {
             dropoutDesc = DropoutDescriptor(cudnnHandle, params.dropout);
             filterDesc = FilterDescriptor(params.weights_shape);
-            rnnDesc = RNNDescriptor(cudnnHandle, params.type, params.hiddenSize,
+            rnnDesc = RNNDescriptor(cudnnHandle, params.type, params.inputSize, params.hiddenSize,
                                     params.numLayers, params.bidirectional, dropoutDesc);
 
             int num_direction = params.bidirectional ? 2 : 1;
@@ -525,19 +590,44 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
             // Get amount of work space required to execute the RNN described by rnnDesc
             // with input dimensions defined by inputDesc
             csl::WorkspaceBuilder builder;
-            builder.require(cudnn::getRNNWorkspaceSize<T>(cudnnHandle, rnnDesc, seqLength, inputDesc));
+            size_t workSize;
+            CUDA4DNN_CHECK_CUDNN(cudnnGetRNNWorkspaceSize(cudnnHandle.get(), rnnDesc.get(), seqLength,
+                                                          inputDesc.get().data(), &workSize));
+            builder.require(workSize);
             scratch_mem_in_bytes = builder.required_workspace_size();
         }
+#endif
 
         LSTM& operator=(const LSTM&) = delete;
         LSTM& operator=(LSTM&&) = default;
 
         void inference(TensorView<T> input, TensorSpan<T> y_output, TensorSpan<T> yc_output, TensorView<T> filters,
-                       TensorView<T> h0, TensorView<T> c0, WorkspaceInstance workspace)
+                       TensorView<T> h0, TensorView<T> c0, csl::Workspace& workspace)
         {
+            auto ws_allocator = csl::WorkspaceAllocator(workspace);
+
+#if CUDNN_MAJOR >= 9
+            size_t weightSpaceSize = sizeof(typename TensorView<T>::value_type) * filters.size();
+            auto workspaceData = ws_allocator.get_span<T>(workSpaceSize);
+            auto reserveSpaceData = ws_allocator.get_span<T>(reserveSpaceSize);
+            cudnn::LSTMForward<T>(cudnnHandle, rnnDesc, xDesc, input.get(), cyDesc,
+                                  y_output.get(), h0TensorDesc.get(), h0.get(),
+                                  DevicePtr<T>(nullptr), // hy, final state
+                                  c0TensorDesc.get(),    // maps to cxDesc
+                                  c0.get(),              // maps to cx
+                                  yc_output.get(),       // maps to cy
+                                  weightSpaceSize,
+                                  filters.get(),          // maps to weightSpace
+                                  workSpaceSize,
+                                  workspaceData.data(),   // workSpaceSize and workSpace
+                                  reserveSpaceSize,       // reserveSpaceSize
+                                  reserveSpaceData.data()
+                                 );
+#else
             cudnn::LSTMForward<T>(cudnnHandle, rnnDesc, filterDesc, filters.get(), inputDesc,
                                   input.get(), h0TensorDesc, h0.get(), c0TensorDesc, c0.get(),
-                                  seqLength, outputDesc, y_output.get(), yc_output.get(), workspace);
+                                  seqLength, outputDesc, y_output.get(), yc_output.get(), ws_allocator.get_instance());
+#endif
         }
 
         std::size_t get_workspace_memory_in_bytes() const noexcept { return scratch_mem_in_bytes; }
@@ -550,11 +640,17 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
         RNNDescriptor rnnDesc;
         DropoutDescriptor dropoutDesc;
 
-        FilterDescriptor filterDesc;
         TensorDescriptor h0TensorDesc, c0TensorDesc;
 
+#if CUDNN_MAJOR >= 9
+        size_t weightSpaceSize, workSpaceSize, reserveSpaceSize;
+        cudnnRNNDataDescriptor_t xDesc;
+        cudnnRNNDataDescriptor_t cyDesc; // represents cyDesc or cDesc(now reps both final and beginning)
+#else
+        FilterDescriptor filterDesc;
         TensorDescriptorsArray inputDesc;
         TensorDescriptorsArray outputDesc;
+#endif
     };
 
 }}}} /* namespace cv::dnn::cuda4dnn::csl */
diff --git a/modules/dnn/src/cuda4dnn/kernels/activations.hpp b/modules/dnn/src/cuda4dnn/kernels/activations.hpp
index 6958b93d5efc..fad549a08399 100644
--- a/modules/dnn/src/cuda4dnn/kernels/activations.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/activations.hpp
@@ -114,6 +114,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template <class T>
     void selu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha, T gamma);
 
+    template <class T>
+    void gelu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
     template <class T>
     void thresholdedrelu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha);
 
diff --git a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
index 0e44372fee3d..e80db943ae45 100644
--- a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
@@ -30,6 +30,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template <class T>
     void eltwise_div_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
 
+    template <class T>
+    void eltwise_sub_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
+
+    template <class T>
+    void eltwise_mod_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
+
+    template <class T>
+    void eltwise_fmod_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
 
 #endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */
diff --git a/modules/dnn/src/cuda4dnn/kernels/mvn.hpp b/modules/dnn/src/cuda4dnn/kernels/mvn.hpp
index b5a573e92122..a09dafb76d13 100644
--- a/modules/dnn/src/cuda4dnn/kernels/mvn.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/mvn.hpp
@@ -26,6 +26,19 @@ void normalize_mean(const csl::Stream& stream, csl::Span<T> output, csl::View<T>
 template <class T>
 void normalize_mean_variance(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<float> means, csl::View<float> scale, std::size_t inner_size);
 
+template <class T>
+void normalize_mean_variance_channelwise(const csl::Stream &stream, csl::Span<T> output, csl::View<T> input, csl::View<T> scale, csl::View<T> bias, csl::View<float> means, csl::View<float> inv_stddev, std::size_t inner_size, std::size_t C);
+
+template <class T>
+void normalize_mean_variance_layernorm(const csl::Stream &stream, csl::Span<T> output, csl::View<T> input, csl::View<T> scale, csl::View<float> means, csl::View<float> inv_stddev, std::size_t inner_size);
+
+template <class T>
+void normalize_mean_variance_layernorm(const csl::Stream &stream, csl::Span<T> output, csl::View<T> input, csl::View<T> scale, csl::View<T> bias, csl::View<float> means, csl::View<float> inv_stddev, std::size_t inner_size);
+
+template <class T>
+void normalize_mean_variance_groupwise(const csl::Stream &stream, csl::Span<T> output, csl::View<T> input, csl::View<T> scale, csl::View<T> bias, csl::View<float> means, csl::View<float> inv_stddev, std::size_t inner_size, std::size_t C, std::size_t num_groups, std::size_t group_size);
+
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
 
 #endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP */
diff --git a/modules/dnn/src/cuda4dnn/primitives/activation.hpp b/modules/dnn/src/cuda4dnn/primitives/activation.hpp
index 564202e8c0bc..c10f9014a59b 100644
--- a/modules/dnn/src/cuda4dnn/primitives/activation.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/activation.hpp
@@ -537,6 +537,20 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         const T alpha, gamma;
     };
 
+    template <class T>
+    class GeluOp final : public BaseOp<GeluOp, T> {
+    public:
+        GeluOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        void calculate(csl::TensorSpan<T> output, csl::TensorView<T> input) const
+        {
+            kernels::gelu<T>(stream, output, input);
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
     template <class T>
     class ThresholdedReluOp final : public BaseOp<ThresholdedReluOp, T> {
     public:
diff --git a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
index b46f0d870f4d..5822f48061d6 100644
--- a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
@@ -27,6 +27,9 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         PRODUCT,
         DIV,
         MIN,
+        SUB,
+        MOD,
+        FMOD,
     };
 
     class EltwiseOpBase : public CUDABackendNode {
@@ -88,6 +91,9 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                     else
                         kernels::eltwise_sum_coeff_2<T>(stream, output, coeffs[0], input_x, coeffs[1], input_y);
                     break;
+                case EltwiseOpType::SUB: kernels::eltwise_sub_2<T>(stream, output, input_x, input_y); break;
+                case EltwiseOpType::MOD: kernels::eltwise_mod_2<T>(stream, output, input_x, input_y); break;
+                case EltwiseOpType::FMOD: kernels::eltwise_fmod_2<T>(stream, output, input_x, input_y); break;
                 }
             }
             else
@@ -119,6 +125,9 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                             kernels::eltwise_sum_coeff_2<T>(stream, output, coeff_x, output, coeffs[i], input);
                         }
                         break;
+                    case EltwiseOpType::SUB: kernels::eltwise_sub_2<T>(stream, output, output, input); break;
+                    case EltwiseOpType::MOD: kernels::eltwise_mod_2<T>(stream, output, output, input); break;
+                    case EltwiseOpType::FMOD: kernels::eltwise_fmod_2<T>(stream, output, output, input); break;
                     }
                 }
             }
diff --git a/modules/dnn/src/cuda4dnn/primitives/group_norm.hpp b/modules/dnn/src/cuda4dnn/primitives/group_norm.hpp
new file mode 100644
index 000000000000..bb3e162a3305
--- /dev/null
+++ b/modules/dnn/src/cuda4dnn/primitives/group_norm.hpp
@@ -0,0 +1,87 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_GROUP_NORM_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_GROUP_NORM_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/workspace.hpp"
+
+#include "../kernels/fill_copy.hpp"
+#include "../kernels/mvn.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    template <class T>
+    class GroupNormOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        GroupNormOp(csl::Stream stream_, float epsilon_, size_t loops, size_t num_groups)
+            : stream(std::move(stream_)), epsilon(epsilon_), num_groups(num_groups) {
+            csl::WorkspaceBuilder builder;
+            builder.require<float>(loops * num_groups); // mean and stdev for each group
+            builder.require<float>(loops * num_groups);
+            scratch_mem_in_bytes = builder.required_workspace_size();
+        }
+
+        void forward(const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+                     const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+                     csl::Workspace& workspace) override {
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto scale_wrapper = inputs[1].dynamicCast<wrapper_type>();
+            auto bias_wrapper = inputs[2].dynamicCast<wrapper_type>();
+
+            auto input = input_wrapper->getView();
+            auto scale = scale_wrapper->getView();
+            auto bias = bias_wrapper->getView();
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            auto C = input.get_axis_size(1);
+            auto loops = input.size_range(0, 2);
+            auto norm_size = input.size_range(2, input.rank());
+            auto num_groups = this->num_groups;
+            auto group_size = C / num_groups;
+            if (norm_size == 1) {
+                kernels::fill<T>(stream, output, 0.f);
+                return;
+            } else {
+                auto ws_allocator = csl::WorkspaceAllocator(workspace);
+
+                auto mean = ws_allocator.get_span<float>(loops / group_size);
+                kernels::fill<float>(stream, mean, 0.f);
+
+                auto stdev = ws_allocator.get_span<float>(loops / group_size);
+                kernels::fill<float>(stream, stdev, 0.f);
+
+                kernels::reduce_mean_sqr_sum<T>(stream, mean, stdev, input, norm_size * group_size);
+                kernels::compute_normalization_scale(stream, stdev, mean, stdev, norm_size * group_size, epsilon);
+                kernels::normalize_mean_variance_groupwise<T>(stream, output, input, scale, bias, mean, stdev, norm_size, C, num_groups, group_size);
+            }
+        }
+
+        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
+
+    private:
+        csl::Stream stream;
+        float epsilon;
+        std::size_t num_groups;
+        std::size_t scratch_mem_in_bytes;
+    };
+
+}}} // cv::dnn::cuda4dnn
+
+#endif // OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_GROUP_NORM_HPP
diff --git a/modules/dnn/src/cuda4dnn/primitives/instance_norm.hpp b/modules/dnn/src/cuda4dnn/primitives/instance_norm.hpp
new file mode 100644
index 000000000000..0a32e40fc06a
--- /dev/null
+++ b/modules/dnn/src/cuda4dnn/primitives/instance_norm.hpp
@@ -0,0 +1,86 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INSTANCE_NORM_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INSTANCE_NORM_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/workspace.hpp"
+
+#include "../kernels/fill_copy.hpp"
+#include "../kernels/mvn.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    template <class T>
+    class InstanceNormOp final : public CUDABackendNode {
+     public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        InstanceNormOp(csl::Stream stream_, float epsilon_, size_t loops)
+            : stream(std::move(stream_)), epsilon(epsilon_) {
+            csl::WorkspaceBuilder builder;
+            builder.require<float>(loops);
+            builder.require<float>(loops);
+            scratch_mem_in_bytes = builder.required_workspace_size();
+        }
+
+        void forward(const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+                     const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+                     csl::Workspace& workspace) override {
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto scale_wrapper = inputs[1].dynamicCast<wrapper_type>();
+            auto bias_wrapper = inputs[2].dynamicCast<wrapper_type>();
+
+            auto input = input_wrapper->getView();
+            auto scale = scale_wrapper->getView();
+            auto bias = bias_wrapper->getView();
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            auto C = input.get_axis_size(1);
+            auto loops = input.size_range(0, 2);
+            auto norm_size = input.size_range(2, input.rank());
+            if (norm_size == 1) {
+                kernels::fill<T>(stream, output, 0.f);
+                return;
+            } else {
+                auto ws_allocator = csl::WorkspaceAllocator(workspace);
+
+                auto mean = ws_allocator.get_span<float>(loops);
+                kernels::fill<float>(stream, mean, 0.f);
+
+                auto stdev = ws_allocator.get_span<float>(loops);
+                kernels::fill<float>(stream, stdev, 0.f);
+
+                kernels::reduce_mean_sqr_sum<T>(stream, mean, stdev, input, norm_size);
+                kernels::compute_normalization_scale(stream, stdev, mean, stdev, norm_size, epsilon);
+                kernels::normalize_mean_variance_channelwise<T>(stream, output, input, scale, bias, mean, stdev, norm_size, C);
+            }
+        }
+
+        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
+
+     private:
+        csl::Stream stream;
+
+        float epsilon;
+
+        std::size_t scratch_mem_in_bytes;
+    };
+
+}}} // cv::dnn::cuda4dnn
+
+#endif // OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INSTANCE_NORM_HPP
diff --git a/modules/dnn/src/cuda4dnn/primitives/layer_norm.hpp b/modules/dnn/src/cuda4dnn/primitives/layer_norm.hpp
new file mode 100644
index 000000000000..baf7691c466c
--- /dev/null
+++ b/modules/dnn/src/cuda4dnn/primitives/layer_norm.hpp
@@ -0,0 +1,117 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LAYER_NORM_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LAYER_NORM_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/workspace.hpp"
+
+#include "../kernels/fill_copy.hpp"
+#include "../kernels/mvn.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    template <class T>
+    class LayerNormOp final : public CUDABackendNode {
+     public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        LayerNormOp(csl::Stream stream_, const Mat &scale, const Mat &bias, int normalized_axis, float epsilon_, size_t loops)
+            : stream(std::move(stream_)), epsilon(epsilon_) {
+            CV_CheckGE(normalized_axis, 0, "LayerNorm/CUDA: axis needs to be normalized");
+            axis = static_cast<size_t>(normalized_axis);
+
+            if (!scale.empty()) {
+                input_scale_tensor = csl::makeTensorHeader<T>(scale);
+                csl::copyMatToTensor<T>(scale, input_scale_tensor, stream);
+            }
+            if (!bias.empty()) {
+                input_bias_tensor = csl::makeTensorHeader<T>(bias);
+                csl::copyMatToTensor<T>(bias, input_bias_tensor, stream);
+            }
+
+            csl::WorkspaceBuilder builder;
+            builder.require<float>(loops);
+            builder.require<float>(loops);
+            scratch_mem_in_bytes = builder.required_workspace_size();
+        }
+
+        void forward(const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+                     const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+                     csl::Workspace& workspace) override {
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input = input_wrapper->getView();
+
+            csl::TensorView<T> scale;
+            if (input_scale_tensor.empty()) {
+                auto scale_wrapper = inputs[1].dynamicCast<wrapper_type>();
+                scale = scale_wrapper->getView();
+            } else {
+                scale = csl::TensorView<T>(input_scale_tensor);
+            }
+
+            csl::TensorView<T> bias;
+            if (input_bias_tensor.empty()) {
+                if (inputs.size() >= 3) {
+                    auto bias_wrapper = inputs[2].dynamicCast<wrapper_type>();
+                    bias = bias_wrapper->getView();
+                }
+            } else {
+                bias = csl::TensorView<T>(input_bias_tensor);
+            }
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            auto loops = input.size_range(0, axis);
+            auto norm_size = input.size_range(axis, input.rank());
+            if (norm_size == 1) {
+                kernels::fill<T>(stream, output, 0.f);
+                return;
+            } else {
+                auto ws_allocator = csl::WorkspaceAllocator(workspace);
+
+                auto mean = ws_allocator.get_span<float>(loops);
+                kernels::fill<float>(stream, mean, 0.f);
+
+                auto inv_stddev = ws_allocator.get_span<float>(loops);
+                kernels::fill<float>(stream, inv_stddev, 0.f);
+
+                kernels::reduce_mean_sqr_sum<T>(stream, mean, inv_stddev, input, norm_size);
+                kernels::compute_normalization_scale(stream, inv_stddev, mean, inv_stddev, norm_size, epsilon);
+                if (!bias.empty()) {
+                    kernels::normalize_mean_variance_layernorm<T>(stream, output, input, scale, bias, mean, inv_stddev, norm_size);
+                } else {
+                    kernels::normalize_mean_variance_layernorm<T>(stream, output, input, scale, mean, inv_stddev, norm_size);
+                }
+            }
+        }
+
+        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
+
+     private:
+        csl::Stream stream;
+        csl::Tensor<T> input_scale_tensor;
+        csl::Tensor<T> input_bias_tensor;
+
+        float epsilon;
+        size_t axis;
+
+        std::size_t scratch_mem_in_bytes;
+    };
+
+}}} // cv::dnn::cuda4dnn
+
+#endif // OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LAYER_NORM_HPP
diff --git a/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp b/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp
new file mode 100644
index 000000000000..c99a1b5f3a43
--- /dev/null
+++ b/modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp
@@ -0,0 +1,97 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_BROADCAST_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_BROADCAST_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/cublas.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/eltwise_ops.hpp" // for adding bias
+
+#include <opencv2/core.hpp>
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    template <class T>
+    class MatMulBroadcastOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        MatMulBroadcastOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat &B, const Mat &bias, bool _transA, bool _transB,
+                 const std::vector<size_t> &A_offsets_, const std::vector<size_t> &B_offsets_, std::vector<size_t> &C_offsets_,
+                 size_t batch_)
+            : stream(std::move(stream_)), cublasHandle(std::move(handle)), A_offsets(A_offsets_), B_offsets(B_offsets_), C_offsets(C_offsets_), batch(batch_)
+        {
+            if (!B.empty()) {
+                input_B_tensor = csl::makeTensorHeader<T>(B);
+                csl::copyMatToTensor<T>(B, input_B_tensor, stream);
+            }
+
+            if (!bias.empty()) {
+                bias_tensor = csl::makeTensorHeader<T>(bias);
+                csl::copyMatToTensor<T>(bias, bias_tensor, stream);
+            }
+
+            transA = _transA;
+            transB = _transB;
+        }
+
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            auto input_A_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input_A = input_A_wrapper->getView();
+
+            csl::TensorView<T> input_B;
+            if (input_B_tensor.empty()) {
+                auto input_B_wrapper = inputs[1].dynamicCast<wrapper_type>();
+                input_B = input_B_wrapper->getView();
+            } else {
+                input_B = csl::TensorView<T>(input_B_tensor);
+            }
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            csl::tensor_ops::gemmBatched<T>(cublasHandle, batch, 0.f, output, C_offsets, 1.f, transA, input_A, A_offsets, transB, input_B, B_offsets);
+
+            // add bias if exists
+            if (!bias_tensor.empty() || inputs.size() >= 3) {
+                csl::TensorView<T> bias;
+                if (bias_tensor.empty()) {
+                    auto bias_wrapper = inputs[2].dynamicCast<wrapper_type>();
+                    bias = bias_wrapper->getView();
+                } else {
+                    bias = csl::TensorView<T>(bias_tensor);
+                }
+
+                kernels::eltwise_sum_2<T>(stream, output, output, bias);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+        csl::cublas::Handle cublasHandle;
+        csl::Tensor<T> input_B_tensor;
+        csl::Tensor<T> bias_tensor;
+        bool transA, transB;
+
+        std::vector<size_t> A_offsets;
+        std::vector<size_t> B_offsets;
+        std::vector<size_t> C_offsets;
+        size_t batch;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MATMUL_BROADCAST_HPP */
diff --git a/modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp b/modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp
index f067dddaa701..91ff33f81718 100644
--- a/modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp
@@ -111,7 +111,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
              * or there might be several weights
              * or we don't have to scale
              */
-            if (weight != 1.0)
+            if (weight != static_cast<T>(1.0f))
             {
                 kernels::scale1_with_bias1<T>(stream, output, input, weight, 1.0);
             }
diff --git a/modules/dnn/src/cuda4dnn/primitives/recurrent_cells.hpp b/modules/dnn/src/cuda4dnn/primitives/recurrent_cells.hpp
index 5cba78800812..67f1aff28546 100644
--- a/modules/dnn/src/cuda4dnn/primitives/recurrent_cells.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/recurrent_cells.hpp
@@ -55,9 +55,6 @@ class LSTMOp final : public CUDABackendNode
 
         c0Tensor = csl::makeTensorHeader<T>(c0);
         csl::copyMatToTensor<T>(c0, c0Tensor, stream);
-
-        csl::WorkspaceBuilder builder;
-        builder.require<T>(lstm.get_workspace_memory_in_bytes());
     }
 
     void forward(const std::vector<cv::Ptr<BackendWrapper>>& inputs,
@@ -75,8 +72,7 @@ class LSTMOp final : public CUDABackendNode
         Ptr<wrapper_type> yc_output_wrapper = outputs.size() == 2 ? outputs[1].dynamicCast<wrapper_type>() : Ptr<wrapper_type>();
         csl::TensorSpan<T> yc_output = yc_output_wrapper.empty() ? csl::TensorSpan<T>() : yc_output_wrapper->getSpan();
 
-        csl::WorkspaceAllocator allocator(workspace);
-        lstm.inference(input, y_output, yc_output, filtersTensor, h0Tensor, c0Tensor, allocator.get_instance());
+        lstm.inference(input, y_output, yc_output, filtersTensor, h0Tensor, c0Tensor, workspace);
     }
 
     std::size_t get_workspace_memory_in_bytes() const noexcept override
@@ -94,4 +90,4 @@ class LSTMOp final : public CUDABackendNode
 
 }}} /* namespace cv::dnn::cuda4dnn */
 
-#endif //OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RECURRENT_CELLS_HPP
\ No newline at end of file
+#endif //OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RECURRENT_CELLS_HPP
diff --git a/modules/dnn/src/cuda4dnn/primitives/region.hpp b/modules/dnn/src/cuda4dnn/primitives/region.hpp
index d22d44214e7b..3af05155feea 100644
--- a/modules/dnn/src/cuda4dnn/primitives/region.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/region.hpp
@@ -121,7 +121,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
                 new_coords
             );
 
-            if (nms_iou_threshold > 0) {
+            if (nms_iou_threshold > static_cast<T>(0.0f)) {
                 auto output_mat = output_wrapper->getMutableHostMat();
                 CV_Assert(output_mat.type() == CV_32F);
                 for (int i = 0; i < input.get_axis_size(0); i++) {
diff --git a/modules/dnn/src/cuda4dnn/primitives/scale_shift.hpp b/modules/dnn/src/cuda4dnn/primitives/scale_shift.hpp
index 9da7ec332698..abe2615590ce 100644
--- a/modules/dnn/src/cuda4dnn/primitives/scale_shift.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/scale_shift.hpp
@@ -128,6 +128,9 @@ namespace cv { namespace dnn { namespace cuda4dnn {
 
             /* the scale shift operation might require broadcasting */
             const int end_axis = [&] {
+                if (num_parameters == 1) {
+                    return static_cast<int>(axis + 1);
+                }
                 for (int endAxis = axis + 1; endAxis <= input.rank(); endAxis++) {
                     if (input.size_range(axis, endAxis) == mid_size)
                         return endAxis;
diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp
index 54140f8dc0c2..a3c7b37a73dd 100644
--- a/modules/dnn/src/darknet/darknet_io.cpp
+++ b/modules/dnn/src/darknet/darknet_io.cpp
@@ -309,11 +309,56 @@ namespace cv {
                     fused_layer_names.push_back(last_layer);
                 }
 
+                void setCrop(int crop_height, int crop_width, int inp_height, int inp_width, bool noadjust)
+                {
+                    cv::dnn::LayerParams crop_param;
+                    crop_param.name = "CropLayer-name";
+                    std::vector<int> begin = {0, 0, (inp_height - crop_height) / 2, (inp_width - crop_width) / 2};
+                    std::vector<int> sizes = {-1, -1, crop_height, crop_width};
+                    crop_param.set("begin", DictValue::arrayInt(&begin[0], begin.size()));
+                    crop_param.set("size", DictValue::arrayInt(&sizes[0], sizes.size()));
+                    crop_param.type = "Slice";
+
+                    darknet::LayerParameter lp;
+                    std::string layer_name = cv::format("crop_%d", layer_id);
+                    lp.layer_name = layer_name;
+                    lp.layer_type = crop_param.type;
+                    lp.layerParams = crop_param;
+                    lp.bottom_indexes.push_back(last_layer);
+                    last_layer = layer_name;
+                    net->layers.push_back(lp);
+                    layer_id++;
+
+                    if (!noadjust)
+                    {
+                        cv::dnn::LayerParams params;
+                        params.set("bias_term", true);
+                        params.blobs = {
+                            Mat(1, 1, CV_32F, Scalar(2)),
+                            Mat(1, 1, CV_32F, Scalar(-1))
+                        };
+
+                        darknet::LayerParameter lp;
+                        std::string layer_name = cv::format("adjust_crop_%d", layer_id);
+                        lp.layer_name = layer_name;
+                        lp.layer_type = "Scale";
+                        lp.layerParams = params;
+                        lp.bottom_indexes.push_back(last_layer);
+                        last_layer = layer_name;
+                        net->layers.push_back(lp);
+                        layer_id++;
+                    }
+                    fused_layer_names.push_back(last_layer);
+                }
+
                 void setSoftmax()
                 {
                     cv::dnn::LayerParams softmax_param;
                     softmax_param.name = "Softmax-name";
                     softmax_param.type = "Softmax";
+                    // set default axis to 1
+                    if(!softmax_param.has("axis"))
+                        softmax_param.set("axis", 1);
                     darknet::LayerParameter lp;
 
                     std::string layer_name = cv::format("softmax_%d", layer_id);
@@ -682,8 +727,8 @@ namespace cv {
 
                 MatShape tensor_shape(3);
                 tensor_shape[0] = net->channels;
-                tensor_shape[1] = net->width;
-                tensor_shape[2] = net->height;
+                tensor_shape[1] = net->height;
+                tensor_shape[2] = net->width;
                 net->out_channels_vec.resize(net->layers_cfg.size());
 
                 layers_counter = -1;
@@ -760,6 +805,19 @@ namespace cv {
                         tensor_shape[1] = 1;
                         tensor_shape[2] = 1;
                     }
+                    else if (layer_type == "crop")
+                    {
+                        int crop_height = getParam<int>(layer_params, "crop_height", 0);
+                        int crop_width = getParam<int>(layer_params, "crop_width", 0);
+                        bool noadjust = getParam<int>(layer_params, "noadjust", false);
+                        CV_CheckGT(crop_height, 0, "");
+                        CV_CheckGT(crop_width, 0, "");
+
+                        setParams.setCrop(crop_height, crop_width, tensor_shape[1], tensor_shape[2], noadjust);
+
+                        tensor_shape[1] = crop_height;
+                        tensor_shape[2] = crop_width;
+                    }
                     else if (layer_type == "softmax")
                     {
                         int groups = getParam<int>(layer_params, "groups", 1);
@@ -934,8 +992,8 @@ namespace cv {
 
                 MatShape tensor_shape(3);
                 tensor_shape[0] = net->channels;
-                tensor_shape[1] = net->width;
-                tensor_shape[2] = net->height;
+                tensor_shape[1] = net->height;
+                tensor_shape[2] = net->width;
                 int cv_layers_counter = -1;
                 int darknet_layers_counter = -1;
 
diff --git a/modules/dnn/src/dnn_params.cpp b/modules/dnn/src/dnn_params.cpp
index 86a43db75754..a76f4cd512f7 100644
--- a/modules/dnn/src/dnn_params.cpp
+++ b/modules/dnn/src/dnn_params.cpp
@@ -36,7 +36,11 @@ bool getParam_DNN_OPENCL_ALLOW_ALL_DEVICES()
 int getParam_DNN_BACKEND_DEFAULT()
 {
     static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT("OPENCV_DNN_BACKEND_DEFAULT",
+#ifdef OPENCV_DNN_BACKEND_DEFAULT
+            (size_t)OPENCV_DNN_BACKEND_DEFAULT
+#else
             (size_t)DNN_BACKEND_OPENCV
+#endif
     );
     return PARAM_DNN_BACKEND_DEFAULT;
 }
diff --git a/modules/dnn/src/dnn_read.cpp b/modules/dnn/src/dnn_read.cpp
index 9c06ced3c4e9..2b02f462d3c5 100644
--- a/modules/dnn/src/dnn_read.cpp
+++ b/modules/dnn/src/dnn_read.cpp
@@ -43,9 +43,11 @@ Net readNet(const String& _model, const String& _config, const String& _framewor
             std::swap(model, config);
         return readNetFromDarknet(config, model);
     }
-    if (framework == "dldt" || modelExt == "bin" || configExt == "bin" || modelExt == "xml" || configExt == "xml")
+    if (framework == "dldt" || framework == "openvino" ||
+        modelExt == "bin" || configExt == "bin" ||
+        modelExt == "xml" || configExt == "xml")
     {
-        if (modelExt == "xml" || configExt == "bin")
+        if (modelExt == "xml" || configExt == "bin" || modelExt == "onnx")
             std::swap(model, config);
         return readNetFromModelOptimizer(config, model);
     }
@@ -60,7 +62,9 @@ Net readNet(const String& _framework, const std::vector<uchar>& bufferModel,
         const std::vector<uchar>& bufferConfig)
 {
     String framework = toLowerCase(_framework);
-    if (framework == "caffe")
+    if (framework == "onnx")
+        return readNetFromONNX(bufferModel);
+    else if (framework == "caffe")
         return readNetFromCaffe(bufferConfig, bufferModel);
     else if (framework == "tensorflow")
         return readNetFromTensorflow(bufferModel, bufferConfig);
@@ -68,7 +72,7 @@ Net readNet(const String& _framework, const std::vector<uchar>& bufferModel,
         return readNetFromDarknet(bufferConfig, bufferModel);
     else if (framework == "torch")
         CV_Error(Error::StsNotImplemented, "Reading Torch models from buffers");
-    else if (framework == "dldt")
+    else if (framework == "dldt" || framework == "openvino")
         return readNetFromModelOptimizer(bufferConfig, bufferModel);
     else if (framework == "tflite")
         return readNetFromTFLite(bufferModel);
diff --git a/modules/dnn/src/dnn_utils.cpp b/modules/dnn/src/dnn_utils.cpp
index 18c7e975ebfd..3ef75fad3683 100644
--- a/modules/dnn/src/dnn_utils.cpp
+++ b/modules/dnn/src/dnn_utils.cpp
@@ -5,6 +5,7 @@
 #include "precomp.hpp"
 
 #include <opencv2/imgproc.hpp>
+#include <opencv2/core/utils/logger.hpp>
 
 
 namespace cv {
@@ -16,11 +17,53 @@ Image2BlobParams::Image2BlobParams():scalefactor(Scalar::all(1.0)), size(Size())
 {}
 
 Image2BlobParams::Image2BlobParams(const Scalar& scalefactor_, const Size& size_, const Scalar& mean_, bool swapRB_,
-                         int ddepth_, DataLayout datalayout_, ImagePaddingMode mode_):
-        scalefactor(scalefactor_), size(size_), mean(mean_), swapRB(swapRB_), ddepth(ddepth_),
-        datalayout(datalayout_), paddingmode(mode_)
+    int ddepth_, DataLayout datalayout_, ImagePaddingMode mode_, Scalar borderValue_):
+    scalefactor(scalefactor_), size(size_), mean(mean_), swapRB(swapRB_), ddepth(ddepth_),
+    datalayout(datalayout_), paddingmode(mode_), borderValue(borderValue_)
 {}
 
+void getVector(InputArrayOfArrays images_, std::vector<Mat>& images) {
+    images_.getMatVector(images);
+}
+
+void getVector(InputArrayOfArrays images_, std::vector<UMat>& images) {
+    images_.getUMatVector(images);
+}
+
+void getMat(UMat& blob, InputArray blob_, AccessFlag flag) {
+    if(blob_.kind() == _InputArray::UMAT)
+        blob = blob_.getUMat();
+    else if(blob_.kind() == _InputArray::MAT) {
+        blob = blob_.getUMat();
+    }
+}
+
+void getMat(Mat& blob, InputArray blob_, AccessFlag flag) {
+    if(blob_.kind() == _InputArray::UMAT)
+        blob = blob_.getMat();
+    else if(blob_.kind() == _InputArray::MAT) {
+        blob = blob_.getMat();
+    }
+}
+
+void getChannelFromBlob(Mat& m, InputArray blob, int i, int j, int rows, int cols, int type) {
+    m = Mat(rows, cols, type, blob.getMat().ptr(i, j));
+}
+
+void getChannelFromBlob(UMat& m, InputArray blob, int i, int j, int rows, int cols, int type) {
+    UMat ublob = blob.getUMat();
+    int offset = (i * ublob.step.p[0] + j * ublob.step.p[1]) / ublob.elemSize();
+    int length = 1;
+    for(int i = 0; i < ublob.dims; ++i) {
+        length *= ublob.size[i];
+    }
+
+    const int newShape[1] { length };
+    UMat reshaped = ublob.reshape(1, 1, newShape);
+    UMat roi = reshaped(Rect(0, offset, 1, rows * cols));
+    m = roi.reshape(CV_MAT_CN(type), rows);
+}
+
 Mat blobFromImage(InputArray image, const double scalefactor, const Size& size,
         const Scalar& mean, bool swapRB, bool crop, int ddepth)
 {
@@ -34,8 +77,13 @@ void blobFromImage(InputArray image, OutputArray blob, double scalefactor,
         const Size& size, const Scalar& mean, bool swapRB, bool crop, int ddepth)
 {
     CV_TRACE_FUNCTION();
-    std::vector<Mat> images(1, image.getMat());
-    blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
+    if (image.kind() == _InputArray::UMAT) {
+        std::vector<UMat> images(1, image.getUMat());
+        blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
+    } else {
+        std::vector<Mat> images(1, image.getMat());
+        blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
+    }
 }
 
 Mat blobFromImages(InputArrayOfArrays images, double scalefactor, Size size,
@@ -51,9 +99,9 @@ void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalef
         Size size, const Scalar& mean_, bool swapRB, bool crop, int ddepth)
 {
     CV_TRACE_FUNCTION();
-    if (images_.kind() != _InputArray::STD_VECTOR_MAT && images_.kind() != _InputArray::STD_ARRAY_MAT &&
+    if (images_.kind() != _InputArray::STD_VECTOR_UMAT  && images_.kind() != _InputArray::STD_VECTOR_MAT && images_.kind() != _InputArray::STD_ARRAY_MAT &&
         images_.kind() != _InputArray::STD_VECTOR_VECTOR) {
-        String error_message = "The data is expected as vectors of vectors or vectors of matrices.";
+        String error_message = "The data is expected as vectors of vectors, vectors of Mats or vectors of UMats.";
         CV_Error(Error::StsBadArg, error_message);
     }
     Image2BlobParams param(Scalar::all(scalefactor), size, mean_, swapRB, ddepth);
@@ -70,13 +118,6 @@ Mat blobFromImageWithParams(InputArray image, const Image2BlobParams& param)
     return blob;
 }
 
-void blobFromImageWithParams(InputArray image, OutputArray blob, const Image2BlobParams& param)
-{
-    CV_TRACE_FUNCTION();
-    std::vector<Mat> images(1, image.getMat());
-    blobFromImagesWithParams(images, blob, param);
-}
-
 Mat blobFromImagesWithParams(InputArrayOfArrays images, const Image2BlobParams& param)
 {
     CV_TRACE_FUNCTION();
@@ -85,28 +126,45 @@ Mat blobFromImagesWithParams(InputArrayOfArrays images, const Image2BlobParams&
     return blob;
 }
 
-void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, const Image2BlobParams& param)
+template<class Tmat>
+void blobFromImagesWithParamsImpl(InputArrayOfArrays images_, Tmat& blob_, const Image2BlobParams& param)
 {
     CV_TRACE_FUNCTION();
-    if (images_.kind() != _InputArray::STD_VECTOR_MAT && images_.kind() != _InputArray::STD_ARRAY_MAT &&
-        images_.kind() != _InputArray::STD_VECTOR_VECTOR) {
-        String error_message = "The data is expected as vectors of vectors or vectors of matrices.";
+    if(!std::is_same<Tmat, UMat>::value && !std::is_same<Tmat, Mat>::value) {
+        String error_message = "The template parameter is expected to be either a cv::Mat or a cv::UMat";
         CV_Error(Error::StsBadArg, error_message);
     }
+
     CV_CheckType(param.ddepth, param.ddepth == CV_32F || param.ddepth == CV_8U,
                  "Blob depth should be CV_32F or CV_8U");
     Size size = param.size;
-    std::vector<Mat> images;
-    images_.getMatVector(images);
+
+    std::vector<Tmat> images;
+    getVector(images_, images);
+
     CV_Assert(!images.empty());
 
+    if (param.ddepth == CV_8U)
+    {
+        CV_Assert(param.scalefactor == Scalar::all(1.0) && "Scaling is not supported for CV_8U blob depth");
+        CV_Assert(param.mean == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
+    }
+
     int nch = images[0].channels();
     Scalar scalefactor = param.scalefactor;
+    Scalar mean = param.mean;
 
-    if (param.ddepth == CV_8U)
+    if (param.swapRB)
     {
-        CV_Assert(scalefactor == Scalar::all(1.0) && "Scaling is not supported for CV_8U blob depth");
-        CV_Assert(param.mean == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
+        if (nch > 2)
+        {
+            std::swap(mean[0], mean[2]);
+            std::swap(scalefactor[0], scalefactor[2]);
+        }
+        else
+        {
+            CV_LOG_WARNING(NULL, "Red/blue color swapping requires at least three image channels.");
+        }
     }
 
     for (size_t i = 0; i < images.size(); i++)
@@ -126,43 +184,35 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
                           size);
                 images[i] = images[i](crop);
             }
+            else if (param.paddingmode == DNN_PMODE_LETTERBOX)
+            {
+                float resizeFactor = std::min(size.width / (float)imgSize.width,
+                                              size.height / (float)imgSize.height);
+                int rh = int(imgSize.height * resizeFactor);
+                int rw = int(imgSize.width * resizeFactor);
+                resize(images[i], images[i], Size(rw, rh), INTER_LINEAR);
+
+                int top = (size.height - rh)/2;
+                int bottom = size.height - top - rh;
+                int left = (size.width - rw)/2;
+                int right = size.width - left - rw;
+                copyMakeBorder(images[i], images[i], top, bottom, left, right, BORDER_CONSTANT, param.borderValue);
+            }
             else
             {
-                if (param.paddingmode == DNN_PMODE_LETTERBOX)
-                {
-                    float resizeFactor = std::min(size.width / (float)imgSize.width,
-                                                  size.height / (float)imgSize.height);
-                    int rh = int(imgSize.height * resizeFactor);
-                    int rw = int(imgSize.width * resizeFactor);
-                    resize(images[i], images[i], Size(rw, rh), INTER_LINEAR);
-
-                    int top = (size.height - rh)/2;
-                    int bottom = size.height - top - rh;
-                    int left = (size.width - rw)/2;
-                    int right = size.width - left - rw;
-                    copyMakeBorder(images[i], images[i], top, bottom, left, right, BORDER_CONSTANT);
-                }
-                else
-                    resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
+                resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
             }
         }
 
-        Scalar mean = param.mean;
-        if (param.swapRB)
-        {
-            std::swap(mean[0], mean[2]);
-            std::swap(scalefactor[0], scalefactor[2]);
-        }
-
         if (images[i].depth() == CV_8U && param.ddepth == CV_32F)
             images[i].convertTo(images[i], CV_32F);
 
-        images[i] -= mean;
+        subtract(images[i], mean, images[i]);
         multiply(images[i], scalefactor, images[i]);
     }
 
     size_t nimages = images.size();
-    Mat image0 = images[0];
+    Tmat image0 = images[0];
     CV_Assert(image0.dims == 2);
 
     if (param.datalayout == DNN_LAYOUT_NCHW)
@@ -171,21 +221,22 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
         {
             int sz[] = { (int)nimages, nch, image0.rows, image0.cols };
             blob_.create(4, sz, param.ddepth);
-            Mat blob = blob_.getMat();
-            Mat ch[4];
+            std::vector<Tmat> ch(4);
 
             for (size_t i = 0; i < nimages; i++)
             {
-                const Mat& image = images[i];
+                const Tmat& image = images[i];
                 CV_Assert(image.depth() == blob_.depth());
                 nch = image.channels();
                 CV_Assert(image.dims == 2 && (nch == 3 || nch == 4));
                 CV_Assert(image.size() == image0.size());
 
-                for (int j = 0; j < nch; j++)
-                    ch[j] = Mat(image.rows, image.cols, param.ddepth, blob.ptr((int)i, j));
+                for (int j = 0; j < nch; j++) {
+                    getChannelFromBlob(ch[j], blob_, i, j ,image.rows, image.cols, param.ddepth);
+                }
                 if (param.swapRB)
                     std::swap(ch[0], ch[2]);
+
                 split(image, ch);
             }
         }
@@ -194,11 +245,12 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
             CV_Assert(nch == 1);
             int sz[] = { (int)nimages, 1, image0.rows, image0.cols };
             blob_.create(4, sz, param.ddepth);
-            Mat blob = blob_.getMat();
+            Mat blob;
+            getMat(blob, blob_, ACCESS_RW);
 
             for (size_t i = 0; i < nimages; i++)
             {
-                const Mat& image = images[i];
+                const Tmat& image = images[i];
                 CV_Assert(image.depth() == blob_.depth());
                 nch = image.channels();
                 CV_Assert(image.dims == 2 && (nch == 1));
@@ -212,26 +264,97 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
     {
         int sz[] = { (int)nimages, image0.rows, image0.cols, nch};
         blob_.create(4, sz, param.ddepth);
-        Mat blob = blob_.getMat();
+        Mat blob;
+        getMat(blob, blob_, ACCESS_RW);
         int subMatType = CV_MAKETYPE(param.ddepth, nch);
         for (size_t i = 0; i < nimages; i++)
         {
-            const Mat& image = images[i];
+            const Tmat& image = images[i];
             CV_Assert(image.depth() == blob_.depth());
             CV_Assert(image.channels() == image0.channels());
             CV_Assert(image.size() == image0.size());
-            if (param.swapRB)
+            if (nch > 2 && param.swapRB)
             {
                 Mat tmpRB;
                 cvtColor(image, tmpRB, COLOR_BGR2RGB);
                 tmpRB.copyTo(Mat(tmpRB.rows, tmpRB.cols, subMatType, blob.ptr((int)i, 0)));
             }
             else
+            {
                 image.copyTo(Mat(image.rows, image.cols, subMatType, blob.ptr((int)i, 0)));
+            }
         }
     }
     else
+    {
         CV_Error(Error::StsUnsupportedFormat, "Unsupported data layout in blobFromImagesWithParams function.");
+    }
+    CV_Assert(blob_.total());
+}
+
+void blobFromImagesWithParams(InputArrayOfArrays images, OutputArray blob, const Image2BlobParams& param) {
+    CV_TRACE_FUNCTION();
+
+    if (images.kind() == _InputArray::STD_VECTOR_UMAT) {
+        if(blob.kind() == _InputArray::UMAT) {
+            UMat& u = blob.getUMatRef();
+            blobFromImagesWithParamsImpl<cv::UMat>(images, u, param);
+            return;
+        } else if(blob.kind() == _InputArray::MAT) {
+            UMat u = blob.getMatRef().getUMat(ACCESS_WRITE);
+            blobFromImagesWithParamsImpl<cv::UMat>(images, u, param);
+            u.copyTo(blob);
+            return;
+        }
+    } else if (images.kind() == _InputArray::STD_VECTOR_MAT) {
+        if(blob.kind() == _InputArray::UMAT) {
+            Mat m = blob.getUMatRef().getMat(ACCESS_WRITE);
+            blobFromImagesWithParamsImpl<cv::Mat>(images, m, param);
+            m.copyTo(blob);
+            return;
+        } else if(blob.kind() == _InputArray::MAT) {
+            Mat& m = blob.getMatRef();
+            blobFromImagesWithParamsImpl<cv::Mat>(images, m, param);
+            return;
+        }
+    }
+
+    CV_Error(Error::StsBadArg, "Images are expected to be a vector of either a Mat or UMat and Blob is expected to be either a Mat or UMat");
+}
+
+void blobFromImageWithParams(InputArray image, OutputArray blob, const Image2BlobParams& param)
+{
+    CV_TRACE_FUNCTION();
+
+    if (image.kind() == _InputArray::UMAT) {
+        if(blob.kind() == _InputArray::UMAT) {
+            UMat& u = blob.getUMatRef();
+            std::vector<UMat> images(1, image.getUMat());
+            blobFromImagesWithParamsImpl<cv::UMat>(images, u, param);
+            return;
+        } else if(blob.kind() == _InputArray::MAT) {
+            UMat u = blob.getMatRef().getUMat(ACCESS_RW);
+            std::vector<UMat> images(1, image.getUMat());
+            blobFromImagesWithParamsImpl<cv::UMat>(images, u, param);
+            u.copyTo(blob);
+            return;
+        }
+    } else if (image.kind() == _InputArray::MAT) {
+        if(blob.kind() == _InputArray::UMAT) {
+            Mat m = blob.getUMatRef().getMat(ACCESS_RW);
+            std::vector<Mat> images(1, image.getMat());
+            blobFromImagesWithParamsImpl<cv::Mat>(images, m, param);
+            m.copyTo(blob);
+            return;
+        } else if(blob.kind() == _InputArray::MAT) {
+            Mat& m = blob.getMatRef();
+            std::vector<Mat> images(1, image.getMat());
+            blobFromImagesWithParamsImpl<cv::Mat>(images, m, param);
+            return;
+        }
+    }
+
+    CV_Error(Error::StsBadArg, "Image an Blob are expected to be either a Mat or UMat");
 }
 
 void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
@@ -259,6 +382,66 @@ void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
     }
 }
 
+Rect Image2BlobParams::blobRectToImageRect(const Rect &r, const Size &oriImage)
+{
+    CV_Assert(!oriImage.empty());
+    std::vector<Rect> rImg, rBlob;
+    rBlob.push_back(Rect(r));
+    rImg.resize(1);
+    this->blobRectsToImageRects(rBlob, rImg, oriImage);
+    return Rect(rImg[0]);
+}
+
+void Image2BlobParams::blobRectsToImageRects(const std::vector<Rect> &rBlob, std::vector<Rect>& rImg, const Size& imgSize)
+{
+    Size size = this->size;
+    rImg.resize(rBlob.size());
+    if (size != imgSize)
+    {
+        if (this->paddingmode == DNN_PMODE_CROP_CENTER)
+        {
+            float resizeFactor = std::max(size.width / (float)imgSize.width,
+                size.height / (float)imgSize.height);
+            for (int i = 0; i < rBlob.size(); i++)
+            {
+                rImg[i] = Rect((rBlob[i].x + 0.5 * (imgSize.width * resizeFactor - size.width)) / resizeFactor,
+                               (rBlob[i].y + 0.5 * (imgSize.height * resizeFactor - size.height)) / resizeFactor,
+                               rBlob[i].width / resizeFactor,
+                               rBlob[i].height / resizeFactor);
+            }
+        }
+        else if (this->paddingmode == DNN_PMODE_LETTERBOX)
+        {
+            float resizeFactor = std::min(size.width / (float)imgSize.width,
+                size.height / (float)imgSize.height);
+            int rh = int(imgSize.height * resizeFactor);
+            int rw = int(imgSize.width * resizeFactor);
+
+            int top = (size.height - rh) / 2;
+            int left = (size.width - rw) / 2;
+            for (int i = 0; i < rBlob.size(); i++)
+            {
+                rImg[i] = Rect((rBlob[i].x - left) / resizeFactor,
+                               (rBlob[i].y - top) / resizeFactor,
+                               rBlob[i].width / resizeFactor,
+                               rBlob[i].height / resizeFactor);
+            }
+        }
+        else if (this->paddingmode == DNN_PMODE_NULL)
+        {
+            for (int i = 0; i < rBlob.size(); i++)
+            {
+                rImg[i] = Rect(rBlob[i].x * (float)imgSize.width / size.width,
+                               rBlob[i].y * (float)imgSize.height / size.height,
+                               rBlob[i].width * (float)imgSize.width / size.width,
+                               rBlob[i].height * (float)imgSize.height / size.height);
+            }
+        }
+        else
+            CV_Error(cv::Error::StsBadArg, "Unknown padding mode");
+    }
+}
+
 
 CV__DNN_INLINE_NS_END
 }}  // namespace cv::dnn
diff --git a/modules/dnn/src/graph_simplifier.cpp b/modules/dnn/src/graph_simplifier.cpp
index e58e0e38e853..2e1dc400be83 100644
--- a/modules/dnn/src/graph_simplifier.cpp
+++ b/modules/dnn/src/graph_simplifier.cpp
@@ -77,68 +77,107 @@ int Subgraph::getInputNodeId(const Ptr<ImportGraphWrapper>& net,
 }
 
 bool Subgraph::match(const Ptr<ImportGraphWrapper>& net, int nodeId,
-                     std::vector<int>& matchedNodesIds,
-                     std::vector<int>& targetNodesIds)
+                     std::vector<int>& matchedNodesIds)
 {
     matchedNodesIds.clear();
-    targetNodesIds.clear();
 
-    std::queue<int> nodesToMatch;
-    std::queue<int> targetNodes;
-    nodesToMatch.push(nodeId);
-    targetNodes.push(nodes.size() - 1);
-    while (!nodesToMatch.empty())
+    // Collection of all matchings states across branching.
+    // If there is no commutative ops in the subgraph - there would be just a single map.
+    std::vector<std::shared_ptr<std::map<int, int>>> matchCandidates;
+    matchCandidates.push_back(makePtr<std::map<int, int>>());
+
+    struct State
     {
-        int nodeToMatch = nodesToMatch.front();
-        int targetNodeId = targetNodes.front();
-        nodesToMatch.pop();
-        targetNodes.pop();
+        int nodeToMatch;
+        int targetNodeId;
+        // Every state refers to current matchings pairs as well as
+        // matchings from parent branches produced by commutative ops.
+        std::vector<std::shared_ptr<std::map<int, int>>> matchings;
+
+        // When we register a matching pair we should register it in every parent branch.
+        // This is actual for branching in case of commutative ops only.
+        void addMatch(std::pair<int, int> match)
+        {
+            for (auto& m : matchings)
+                m->insert(match);
+        }
+    };
+
+    std::queue<State> states;
+    states.push({nodeId, (int)nodes.size() - 1, matchCandidates});
+
+    while (!states.empty())
+    {
+        auto state = states.front();
+        states.pop();
+        int nodeToMatch = state.nodeToMatch;
+        int targetNodeId = state.targetNodeId;
+        auto matchings = state.matchings.back();
+
+        if (matchings->find(targetNodeId) != matchings->end())
+            continue;
 
-        if (std::find(matchedNodesIds.begin(), matchedNodesIds.end(), nodeToMatch) !=
-            matchedNodesIds.end())
+        // Empty placeholder matches with any input type
+        if (nodes[targetNodeId].empty()) {
+            state.addMatch({targetNodeId, nodeToMatch});
             continue;
+        }
 
         const Ptr<ImportNodeWrapper> node = net->getNode(nodeToMatch);
         if (node->getType() != nodes[targetNodeId])
-            return false;
+            continue;
 
         std::vector<int>& inputNodes = inputs[targetNodeId];
         if (inputNodes.size() != node->getNumInputs())
-            return false;
+            continue;
 
-        for (int j = 0; j < inputNodes.size(); ++j)
+        state.addMatch({targetNodeId, nodeToMatch});
+
+        bool isCommutative = net->isCommutativeOp(node->getType());
+        if (isCommutative)
         {
-            if (nodes[inputNodes[j]].empty() || node->getInputName(j).empty())  // Unknown input node type.
-                continue;
-            nodeId = getInputNodeId(net, node, j);
-            const Ptr<ImportNodeWrapper> inpNode = net->getNode(nodeId);
-            if (inpNode->getType() != "Const" && inpNode->getType() != "Constant")
+            if (inputNodes.size() != 2)
+                CV_Error(Error::StsNotImplemented, "Commutative op fusion with more than 2 inputs");
+
+            auto newMatchings = makePtr<std::map<int, int>>(*matchings);
+            matchCandidates.push_back(newMatchings);
+            state.matchings.push_back(newMatchings);
+            states.push({getInputNodeId(net, node, 0), inputNodes[0], state.matchings});
+            states.push({getInputNodeId(net, node, 1), inputNodes[1], state.matchings});
+            state.matchings.pop_back();
+
+            newMatchings = makePtr<std::map<int, int>>(*matchings);
+            matchCandidates.push_back(newMatchings);
+            state.matchings.push_back(newMatchings);
+            states.push({getInputNodeId(net, node, 0), inputNodes[1], state.matchings});
+            states.push({getInputNodeId(net, node, 1), inputNodes[0], state.matchings});
+            state.matchings.pop_back();
+        }
+        else
+        {
+            for (int j = 0; j < inputNodes.size(); ++j)
             {
-                nodesToMatch.push(nodeId);
-                targetNodes.push(inputNodes[j]);
+                nodeId = getInputNodeId(net, node, j);
+                states.push({nodeId, inputNodes[j], state.matchings});
             }
-            else if (nodes[inputNodes[j]] != "Const" && nodes[inputNodes[j]] != "Constant")
-                return false;
         }
-        matchedNodesIds.push_back(nodeToMatch);
-        targetNodesIds.push_back(targetNodeId);
     }
-
-    const int n = matchedNodesIds.size();
-    std::vector<std::pair<int, int> > elements(n);
-    for (int i = 0; i < n; ++i)
-        elements[i] = std::make_pair(matchedNodesIds[i], targetNodesIds[i]);
-    std::sort(elements.begin(), elements.end());
-    for (int i = 0; i < n; ++i)
+    for (auto& matchings : matchCandidates)
     {
-        matchedNodesIds[i] = elements[i].first;
-        targetNodesIds[i] = elements[i].second;
+        if (matchings->size() != nodes.size())
+            continue;
+        matchedNodesIds.resize(matchings->size());
+        for (int i = 0; i < matchings->size(); ++i)
+        {
+            CV_Assert(matchings->find(i) != matchings->end());
+            matchedNodesIds[i] = matchings->at(i);
+        }
+        return true;
     }
-    return true;
+    return false;
 }
 
-void Subgraph::replace(const Ptr<ImportGraphWrapper>& net, const std::vector<int>& matchedNodesIds,
-                       const std::vector<int>& targetNodesIds)
+void Subgraph::replace(const Ptr<ImportGraphWrapper>& net, const std::vector<int>& matchedNodesIds)
 {
     // Extract names of input nodes.
     std::vector<std::string> inputsNames(fusedNodeInputs.size());
@@ -149,9 +188,9 @@ void Subgraph::replace(const Ptr<ImportGraphWrapper>& net, const std::vector<int
         for (int j = 0; j < matchedNodesIds.size() && inpName.empty(); ++j)
         {
             Ptr<ImportNodeWrapper> node = net->getNode(matchedNodesIds[j]);
-            std::vector<int>& inpIndices = inputs[targetNodesIds[j]];
+            std::vector<int>& inpIndices = inputs[j];
 
-            CV_Assert(node->getNumInputs() == inpIndices.size());
+            CV_Assert(inpIndices.empty() || node->getNumInputs() == inpIndices.size());
             for (int k = 0; k < inpIndices.size(); ++k)
             {
                 if (inpIndices[k] == fusedNodeInputs[i])
@@ -165,10 +204,7 @@ void Subgraph::replace(const Ptr<ImportGraphWrapper>& net, const std::vector<int
         inputsNames[i] = inpName;
     }
 
-    // Remove matched nodes except the last one. Indices in ascending order are expected.
     Ptr<ImportNodeWrapper> node = net->getNode(matchedNodesIds.back());
-    for (int i = matchedNodesIds.size() - 2; i >= 0; --i)
-        net->removeNode(matchedNodesIds[i]);
 
     // Modify the last node to be a fused one.
     node->setType(fusedNodeOp);
@@ -190,16 +226,61 @@ void simplifySubgraphs(const Ptr<ImportGraphWrapper>& net,
                        const std::vector<Ptr<Subgraph> >& patterns)
 {
     int numNodes = net->getNumNodes();
-    std::vector<int> matchedNodesIds, targetNodesIds;
+    std::vector<int> matchedNodesIds;
+    std::vector<int> nodesToRemove;
     for (int j = 0; j < patterns.size(); ++j)
     {
         for (int i = 0; i < numNodes; ++i)
         {
-            if (patterns[j]->match(net, i, matchedNodesIds, targetNodesIds))
+            if (patterns[j]->match(net, i, matchedNodesIds))
             {
-                patterns[j]->replace(net, matchedNodesIds, targetNodesIds);
-                numNodes -= matchedNodesIds.size() - 1;  // #matchedNodes removed and one added.
+                patterns[j]->replace(net, matchedNodesIds);
+                // Remove matched nodes except the last one.
+                nodesToRemove.insert(nodesToRemove.end(), matchedNodesIds.begin(), matchedNodesIds.end() - 1);
+            }
+        }
+    }
+
+    if (nodesToRemove.empty())
+        return;
+
+    // Collect reference counts for every node
+    std::vector<int> refcounts(net->getNumNodes(), 0);
+    std::map<std::string, int> nodeIds;
+
+    // Register node outputs.
+    // Every usage of one of the node's outputs should be counted.
+    for (int nodeId = 0; nodeId < refcounts.size(); ++nodeId) {
+        for (int i = 0; i < net->getNumOutputs(nodeId); ++i) {
+            std::string name = net->getOutputName(nodeId, i);
+            nodeIds[name] = nodeId;
+        }
+    }
+
+    for (int nodeId = 0; nodeId < refcounts.size(); ++nodeId) {
+        // Increase counters for node's inputs
+        auto node = net->getNode(nodeId);
+        for (int i = 0; i < node->getNumInputs(); ++i) {
+            std::string inpName = node->getInputName(i);
+            if (inpName.empty())
+                continue;
+            CV_Assert(nodeIds.find(inpName) != nodeIds.end());
+            refcounts[nodeIds[inpName]] += 1;
+        }
+    }
+
+    // Remove all fused nodes. Indices expected to be in descending order.
+    std::sort(nodesToRemove.begin(), nodesToRemove.end(), [](int a, int b) { return a > b; });
+    for (int nodeId : nodesToRemove) {
+        if (refcounts[nodeId] == 0) {
+            // Decrease references to node's inputs and remove node itself
+            auto node = net->getNode(nodeId);
+            for (int i = 0; i < node->getNumInputs(); ++i) {
+                std::string inpName = node->getInputName(i);
+                refcounts[nodeIds[inpName]] -= 1;
             }
+            net->removeNode(nodeId);
+            refcounts[nodeId] = -1;  // Same node cannot be removed twice
         }
     }
 }
diff --git a/modules/dnn/src/graph_simplifier.hpp b/modules/dnn/src/graph_simplifier.hpp
index 39d6262c1b50..aa9be32a9110 100644
--- a/modules/dnn/src/graph_simplifier.hpp
+++ b/modules/dnn/src/graph_simplifier.hpp
@@ -17,7 +17,7 @@ namespace cv { namespace dnn {
 class ImportNodeWrapper
 {
 public:
-    virtual ~ImportNodeWrapper() {};
+    virtual ~ImportNodeWrapper() {}
 
     virtual int getNumInputs() const = 0;
 
@@ -33,7 +33,7 @@ class ImportNodeWrapper
 class ImportGraphWrapper
 {
 public:
-    virtual ~ImportGraphWrapper() {};
+    virtual ~ImportGraphWrapper() {}
 
     virtual Ptr<ImportNodeWrapper> getNode(int idx) const = 0;
 
@@ -44,6 +44,8 @@ class ImportGraphWrapper
     virtual std::string getOutputName(int nodeId, int outId) const = 0;
 
     virtual void removeNode(int idx) = 0;
+
+    virtual bool isCommutativeOp(const std::string& type) const = 0;
 };
 
 class Subgraph  // Interface to match and replace subgraphs.
@@ -75,12 +77,10 @@ class Subgraph  // Interface to match and replace subgraphs.
     // Match TensorFlow subgraph starting from <nodeId> with a set of nodes to be fused.
     // Const nodes are skipped during matching. Returns true if nodes are matched and can be fused.
     virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
-                       std::vector<int>& matchedNodesIds,
-                       std::vector<int>& targetNodesIds);
+                       std::vector<int>& matchedNodesIds);
 
     // Fuse matched subgraph.
-    void replace(const Ptr<ImportGraphWrapper>& net, const std::vector<int>& matchedNodesIds,
-                 const std::vector<int>& targetNodesIds);
+    void replace(const Ptr<ImportGraphWrapper>& net, const std::vector<int>& matchedNodesIds);
 
     virtual void finalize(const Ptr<ImportGraphWrapper>& net,
                           const Ptr<ImportNodeWrapper>& fusedNode,
diff --git a/modules/dnn/src/ie_ngraph.cpp b/modules/dnn/src/ie_ngraph.cpp
index a49976de742b..6e7b9f9be5aa 100644
--- a/modules/dnn/src/ie_ngraph.cpp
+++ b/modules/dnn/src/ie_ngraph.cpp
@@ -14,7 +14,7 @@
 #include <opencv2/dnn/shape_utils.hpp>
 
 #ifdef HAVE_DNN_NGRAPH
-#include <ie_extension.h>
+#include <openvino/core/extension.hpp>
 #endif  // HAVE_DNN_NGRAPH
 
 #include <opencv2/core/utils/configuration.private.hpp>
@@ -35,36 +35,6 @@ static bool DNN_IE_SERIALIZE = utils::getConfigurationParameterBool("OPENCV_DNN_
 static std::string kDefaultInpLayerName = "opencv_ngraph_empty_inp_layer_name";
 static constexpr const char* kOpenCVLayersType = "opencv_ngraph_layer";
 
-#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2022_1)
-static std::string shapesToStr(const std::vector<Mat>& mats)
-{
-    std::ostringstream shapes;
-    shapes << mats.size() << " ";
-    for (const Mat& m : mats)
-    {
-        shapes << m.dims << " ";
-        for (int i = 0; i < m.dims; ++i)
-            shapes << m.size[i] << " ";
-    }
-    return shapes.str();
-}
-
-static void strToShapes(const std::string& str, std::vector<std::vector<size_t> >& shapes)
-{
-    std::istringstream ss(str);
-    int num, dims;
-    ss >> num;
-    shapes.resize(num);
-    for (int i = 0; i < num; ++i)
-    {
-        ss >> dims;
-        shapes[i].resize(dims);
-        for (int j = 0; j < dims; ++j)
-            ss >> shapes[i][j];
-    }
-}
-#endif // OpenVINO < 2022.1
-
 static std::vector<Ptr<NgraphBackendWrapper> >
 ngraphWrappers(const std::vector<Ptr<BackendWrapper> >& ptrs)
 {
@@ -78,13 +48,11 @@ ngraphWrappers(const std::vector<Ptr<BackendWrapper> >& ptrs)
     return wrappers;
 }
 
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
-
 class NgraphCustomOp: public ov::op::Op {
 public:
     OPENVINO_OP(kOpenCVLayersType);
 
-    NgraphCustomOp(const ngraph::OutputVector& inputs, Ptr<Layer>& cvLayer, const std::vector<Mat>& outputs, const std::vector<Mat>& internals):
+    NgraphCustomOp(const ov::OutputVector& inputs, Ptr<Layer>& cvLayer, const std::vector<Mat>& outputs, const std::vector<Mat>& internals):
         Op(inputs), cvLayer(cvLayer), outputs(outputs), internals(internals)
     {
         constructor_validate_and_infer_types();
@@ -103,7 +71,7 @@ class NgraphCustomOp: public ov::op::Op {
         }
     }
 
-    std::shared_ptr<ngraph::Node> clone_with_new_inputs(const ngraph::OutputVector& new_args) const override
+    std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override
     {
         return std::make_shared<NgraphCustomOp>(new_args, cvLayer, outputs, internals);
     }
@@ -131,296 +99,34 @@ class NgraphCustomOp: public ov::op::Op {
     std::vector<Mat> outputs, internals;
 };
 
-#else
-
-class NgraphCustomOp: public ngraph::op::Op {
-public:
-    const ngraph::NodeTypeInfo& get_type_info() const override
-    {
-        static constexpr ngraph::NodeTypeInfo type_info{kOpenCVLayersType, static_cast<uint64_t>(0)};
-        return type_info;
-    }
-
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_3)
-    NgraphCustomOp(const ngraph::OutputVector& inputs,
-#else
-    NgraphCustomOp(const ngraph::NodeVector& inputs,
-#endif
-                   const std::map<std::string, InferenceEngine::Parameter>& params = {}):
-        Op(inputs), params(params)
-    {
-        constructor_validate_and_infer_types();
-    }
-
-    ~NgraphCustomOp()
-    {
-        // nothing
-    }
-
-    void validate_and_infer_types() override
-    {
-        std::vector<std::vector<size_t> > shapes;
-        strToShapes(params["outputs"], shapes);
-        set_output_size(shapes.size());
-        for (size_t i = 0; i < shapes.size(); ++i)
-        {
-            ngraph::Shape output_shape(shapes[i]);
-            set_output_type(i, get_input_element_type(0), output_shape);
-        }
-    }
-
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_4)
-    std::shared_ptr<ngraph::Node> clone_with_new_inputs(const ngraph::OutputVector& new_args) const override
-    {
-        return std::make_shared<NgraphCustomOp>(new_args, params);
-    }
-#else
-    std::shared_ptr<ngraph::Node> copy_with_new_args(const ngraph::NodeVector& new_args) const override
-    {
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_3)
-        return std::make_shared<NgraphCustomOp>(ngraph::as_output_vector(new_args), params);
-#else
-        return std::make_shared<NgraphCustomOp>(new_args, params);
-#endif
-    }
-#endif
-
-    bool visit_attributes(ngraph::AttributeVisitor& visitor) override
-    {
-        for (auto& attr : params)
-        {
-            if (attr.second.is<std::string>())
-                visitor.on_attribute(attr.first, attr.second.as<std::string>());
-        }
-        return true;
-    }
-
-    std::map<std::string, InferenceEngine::Parameter> params;
-};
-
-
-class InfEngineNgraphCustomLayer : public InferenceEngine::ILayerExecImpl
-{
-public:
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_2)
-    explicit InfEngineNgraphCustomLayer(const std::shared_ptr<ngraph::Node>& _node)
-    {
-        node = std::dynamic_pointer_cast<NgraphCustomOp>(_node);
-        CV_Assert(node);
-        std::string implStr = node->params["impl"];
-        std::istringstream iss(implStr);
-#else
-    explicit InfEngineNgraphCustomLayer(const InferenceEngine::CNNLayer& layer) : cnnLayer(layer)
-    {
-        std::istringstream iss(layer.GetParamAsString("impl"));
-#endif
-        size_t ptr;
-        iss >> ptr;
-        cvLayer = (Layer*)ptr;
-
-        std::vector<std::vector<size_t> > shapes;
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_2)
-        strToShapes(node->params["internals"], shapes);
-#else
-        strToShapes(layer.GetParamAsString("internals"), shapes);
-#endif
-        internals.resize(shapes.size());
-        for (int i = 0; i < shapes.size(); ++i)
-            internals[i].create(std::vector<int>(shapes[i].begin(), shapes[i].end()), CV_32F);
-    }
-
-    ~InfEngineNgraphCustomLayer()
-    {
-        // nothing
-    }
-
-    virtual InferenceEngine::StatusCode execute(std::vector<InferenceEngine::Blob::Ptr>& inputs,
-                                                std::vector<InferenceEngine::Blob::Ptr>& outputs,
-                                                InferenceEngine::ResponseDesc *resp) noexcept
-    {
-        std::vector<Mat> inpMats, outMats;
-        infEngineBlobsToMats(inputs, inpMats);
-        infEngineBlobsToMats(outputs, outMats);
-
-        try
-        {
-            cvLayer->forward(inpMats, outMats, internals);
-            return InferenceEngine::StatusCode::OK;
-        }
-        catch (...)
-        {
-            return InferenceEngine::StatusCode::GENERAL_ERROR;
-        }
-    }
-
-    virtual InferenceEngine::StatusCode
-    getSupportedConfigurations(std::vector<InferenceEngine::LayerConfig>& conf,
-                               InferenceEngine::ResponseDesc* resp) noexcept
-    {
-        std::vector<InferenceEngine::DataConfig> inDataConfig;
-        std::vector<InferenceEngine::DataConfig> outDataConfig;
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_2)
-        InferenceEngine::SizeVector order;
-        for (int i = 0; i < node->get_input_size(); ++i)
-        {
-            InferenceEngine::DataConfig conf;
-            auto shape = node->input_value(i).get_shape();
-            order.resize(shape.size());
-            std::iota(order.begin(), order.end(), 0);
-            conf.desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, shape, {shape, order});
-            inDataConfig.push_back(conf);
-        }
-
-        for (int i = 0; i < node->get_output_size(); ++i)
-        {
-            InferenceEngine::DataConfig conf;
-            auto shape = node->output(i).get_shape();
-            order.resize(shape.size());
-            std::iota(order.begin(), order.end(), 0);
-            conf.desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, shape, {shape, order});
-            outDataConfig.push_back(conf);
-        }
-#else
-        for (auto& it : cnnLayer.insData)
-        {
-            InferenceEngine::DataConfig conf;
-            conf.desc = it.lock()->getTensorDesc();
-            inDataConfig.push_back(conf);
-        }
-
-        for (auto& it : cnnLayer.outData)
-        {
-            InferenceEngine::DataConfig conf;
-            conf.desc = it->getTensorDesc();
-            outDataConfig.push_back(conf);
-        }
-#endif
-
-        InferenceEngine::LayerConfig layerConfig;
-        layerConfig.inConfs = inDataConfig;
-        layerConfig.outConfs = outDataConfig;
-
-        conf.push_back(layerConfig);
-        return InferenceEngine::StatusCode::OK;
-    }
-
-    InferenceEngine::StatusCode init(InferenceEngine::LayerConfig& config,
-                                     InferenceEngine::ResponseDesc *resp) noexcept
-    {
-        return InferenceEngine::StatusCode::OK;
-    }
-
-private:
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_2)
-    std::shared_ptr<NgraphCustomOp> node;
-#else
-    InferenceEngine::CNNLayer cnnLayer;
-#endif
-    dnn::Layer* cvLayer;
-    std::vector<Mat> internals;
-};
-
-#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2020_2)
-class InfEngineNgraphCustomLayerFactory : public InferenceEngine::ILayerImplFactory {
-public:
-    explicit InfEngineNgraphCustomLayerFactory(const InferenceEngine::CNNLayer* layer) : cnnLayer(*layer)
-    {
-        // nothing
-    }
-
-    InferenceEngine::StatusCode
-    getImplementations(std::vector<InferenceEngine::ILayerImpl::Ptr>& impls,
-                       InferenceEngine::ResponseDesc* resp) noexcept override
-    {
-        impls.push_back(std::make_shared<InfEngineNgraphCustomLayer>(cnnLayer));
-        return InferenceEngine::StatusCode::OK;
-    }
-
-private:
-    InferenceEngine::CNNLayer cnnLayer;
-};
-#endif
-
-
-class InfEngineNgraphExtension : public InferenceEngine::IExtension
-{
-public:
-    void Unload() noexcept override {}
-    void Release() noexcept override { delete this; }
-    void GetVersion(const InferenceEngine::Version*&) const noexcept override {}
-
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_2)
-    std::vector<std::string> getImplTypes(const std::shared_ptr<ngraph::Node>& node) override {
-        return {"CPU"};
-    }
-
-    InferenceEngine::ILayerImpl::Ptr getImplementation(const std::shared_ptr<ngraph::Node>& node, const std::string& implType) override {
-        if (std::dynamic_pointer_cast<NgraphCustomOp>(node) && implType == "CPU") {
-            return std::make_shared<InfEngineNgraphCustomLayer>(node);
-        }
-        return nullptr;
-    }
-#else
-    virtual void SetLogCallback(InferenceEngine::IErrorListener&) noexcept {}
-
-    virtual InferenceEngine::StatusCode getPrimitiveTypes(char**&, unsigned int&,
-                                                          InferenceEngine::ResponseDesc*) noexcept
-    {
-        return InferenceEngine::StatusCode::OK;
-    }
-
-    InferenceEngine::StatusCode getFactoryFor(InferenceEngine::ILayerImplFactory*& factory,
-                                              const InferenceEngine::CNNLayer* cnnLayer,
-                                              InferenceEngine::ResponseDesc* resp) noexcept
-    {
-        if (cnnLayer->type != kOpenCVLayersType)
-            return InferenceEngine::StatusCode::NOT_IMPLEMENTED;
-        factory = new InfEngineNgraphCustomLayerFactory(cnnLayer);
-        return InferenceEngine::StatusCode::OK;
-    }
-#endif
-};
-
-#endif // OpenVINO >= 2022.1
-
-InfEngineNgraphNode::InfEngineNgraphNode(std::shared_ptr<ngraph::Node>&& _node)
-    : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(std::move(_node)) {}
+InfEngineNgraphNode::InfEngineNgraphNode(ov::Output<ov::Node>&& _node)
+    : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(std::move(_node)) {
+    CV_Assert(node.get_node());
+    CV_Assert(node.get_node_shared_ptr());
+}
 
-InfEngineNgraphNode::InfEngineNgraphNode(const std::shared_ptr<ngraph::Node>& _node)
-    : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(_node) {}
+InfEngineNgraphNode::InfEngineNgraphNode(const ov::Output<ov::Node>& _node)
+    : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(_node) {
+    CV_Assert(node.get_node());
+    CV_Assert(node.get_node_shared_ptr());
+}
 
 InfEngineNgraphNode::InfEngineNgraphNode(const std::vector<Ptr<BackendNode> >& nodes,
                                          Ptr<Layer>& cvLayer_, std::vector<Mat*>& inputs,
                                          std::vector<Mat>& outputs, std::vector<Mat>& internals)
     : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), cvLayer(cvLayer_)
 {
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_3)
-    ngraph::OutputVector inp_nodes;
-#else
-    ngraph::NodeVector inp_nodes;
-#endif
+    ov::OutputVector inp_nodes;
     for (const auto& node : nodes)
         inp_nodes.emplace_back(node.dynamicCast<InfEngineNgraphNode>()->node);
 
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
     node = std::make_shared<NgraphCustomOp>(inp_nodes, cvLayer, outputs, internals);
-#else
-    std::ostringstream oss;
-    oss << (size_t)cvLayer.get();
-    std::map<std::string, InferenceEngine::Parameter> params = {
-        {"impl", oss.str()},
-        {"outputs", shapesToStr(outputs)},
-        {"internals", shapesToStr(internals)}
-    };
-    node = std::make_shared<NgraphCustomOp>(inp_nodes, params);
-#endif
-
     CV_Assert(!cvLayer->name.empty());
     setName(cvLayer->name);
 }
 
 void InfEngineNgraphNode::setName(const std::string& name) {
-    node->set_friendly_name(name);
+    node.get_node()->set_friendly_name(name);
 }
 
 InfEngineNgraphNet::InfEngineNgraphNet(detail::NetImplBase& netImpl)
@@ -430,7 +136,7 @@ InfEngineNgraphNet::InfEngineNgraphNet(detail::NetImplBase& netImpl)
     device_name = "CPU";
 }
 
-InfEngineNgraphNet::InfEngineNgraphNet(detail::NetImplBase& netImpl, InferenceEngine::CNNNetwork& net)
+InfEngineNgraphNet::InfEngineNgraphNet(detail::NetImplBase& netImpl, std::shared_ptr<ov::Model>& net)
     : netImpl_(netImpl)
     , cnn(net)
 {
@@ -441,172 +147,39 @@ InfEngineNgraphNet::InfEngineNgraphNet(detail::NetImplBase& netImpl, InferenceEn
 void InfEngineNgraphNet::addOutput(const Ptr<InfEngineNgraphNode>& node)
 {
     CV_Assert(node);
-    CV_Assert(node->node);
-    const std::string& name = node->node->get_friendly_name();
+    const std::string& name = node->node.get_node()->get_friendly_name();
     requestedOutputs.insert({name, node.get()});
 }
 
-void InfEngineNgraphNet::setNodePtr(std::shared_ptr<ngraph::Node>* ptr) {
-    all_nodes.emplace((*ptr)->get_friendly_name(), ptr);
-}
-
- void InfEngineNgraphNet::release()
- {
-     // FIXIT release should not be conditional, release ALL
-     for (auto& node : components.back()) {
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-         if (!(ngraph::op::is_parameter(node) || ngraph::op::is_output(node) || ngraph::op::is_constant(node)) ) {
-#else
-         if (!(node->is_parameter() || node->is_output() || node->is_constant()) ) {
-#endif
-             auto it = all_nodes.find(node->get_friendly_name());
-             if (it != all_nodes.end()) {
-                 it->second->reset();
-                 all_nodes.erase(it);
-             }
-         }
-     }
- }
-
-void InfEngineNgraphNet::dfs(std::shared_ptr<ngraph::Node>& node,
-                             std::vector<std::shared_ptr<ngraph::Node>>& comp,
-                             std::unordered_map<std::string, bool>& used) {
-    used[node->get_friendly_name()] = true;
-    comp.push_back(node);
-    auto inputs = node->get_users();
-    for (size_t i = 0; i < node->get_input_size(); ++i) {
-        inputs.push_back(node->input_value(i).get_node()->shared_from_this());
-    }
-
-    for (auto& to : inputs) {
-        if (!used[to->get_friendly_name()]) {
-            dfs(to, comp, used);
-        }
-    }
-}
-
-int InfEngineNgraphNet::getNumComponents()
-{
-    if (!components.empty()) {
-        return components.size();
-    }
-    std::unordered_map<std::string, bool> used;
-    auto inputs = ngraph_function->get_ordered_ops();
-    for (auto& node : inputs) {
-        used.emplace(node->get_friendly_name(), false);
-    }
-
-    for (auto& node : inputs) {
-        if (!used[node->get_friendly_name()]) {
-            std::vector<std::shared_ptr<ngraph::Node>> current_comp;
-            dfs(node, current_comp, used);
-            components.push_back(current_comp);
-        }
-    }
-    return components.size();
-}
-
 void InfEngineNgraphNet::createNet(Target targetId) {
     if (!hasNetOwner)
     {
         CV_Assert(!requestedOutputs.empty());
-        ngraph::ResultVector outs;
+        ov::ResultVector outs;
 
         for (auto output_node_it = requestedOutputs.begin(); output_node_it != requestedOutputs.end(); ++output_node_it)
         {
             CV_LOG_DEBUG(NULL, "DNN/NGRAPH: Add 'Result' output: " << output_node_it->first);
             CV_Assert(output_node_it->second);
-            auto out = std::make_shared<ngraph::op::Result>(output_node_it->second->node);
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
-            out->set_friendly_name(output_node_it->first + (output_node_it->second->node->get_output_size() == 1 ? "" : ".0"));
-#endif
+            auto out = std::make_shared<ov::op::v0::Result>(output_node_it->second->node);
+            out->set_friendly_name(output_node_it->first + (output_node_it->second->node.get_node()->get_output_size() == 1 ? "" : ".0"));
             outs.push_back(out);
         }
         CV_Assert_N(!inputs_vec.empty(), !outs.empty());
-        ngraph_function = std::make_shared<ngraph::Function>(outs, inputs_vec);
-
-        int num_comp = getNumComponents();
-        CV_LOG_DEBUG(NULL, "DNN/IE: number of subgraphs: " << num_comp);
-        if (num_comp > 1) {
-            for (int i = num_comp - 1; i >= 0; --i) {
-                ngraph::ResultVector outputs;
-                ngraph::ParameterVector inps;
-                for (auto& node : components.back()) {
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-                    if (ngraph::op::is_parameter(node)) {
-#else
-                    if (node->is_parameter()) {
-#endif
-                        CV_LOG_DEBUG(NULL, "DNN/IE: subgraph[" << i << "]: +input[" << inps.size() << "] = '" << node->get_friendly_name() << "'");
-                        auto parameter = std::dynamic_pointer_cast<ngraph::op::Parameter>(node);
-                        inps.push_back(parameter);
-                    }
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-                    else if (ngraph::op::is_output(node)) {
-#else
-                    else if (node->is_output()) {
-#endif
-                        CV_LOG_DEBUG(NULL, "DNN/IE: subgraph[" << i << "]: +output[" << outputs.size() << "] = '" << node->get_friendly_name() << "'");
-                        auto result = std::dynamic_pointer_cast<ngraph::op::Result>(node);
-                        outputs.push_back(result);
-                    }
-                }
-                CV_LOG_DEBUG(NULL, "DNN/IE: subgraph[" << i << ": nodes=" << components.back().size() << " inputs=" << inps.size() << " outputs=" << outputs.size());
-                isInit = false;
-                CV_Assert_N(!inps.empty(), !outputs.empty());
-                ngraph_function = std::make_shared<ngraph::Function>(outputs, inps);
-                release();
-                components.pop_back();
-                init(targetId);
-            }
-        } else {
-            release();
-            components.clear();
-            init(targetId);
-        }
+        ngraph_function = std::make_shared<ov::Model>(outs, inputs_vec);
+        init(targetId);
     }
 }
 
-#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2022_1)
-static inline
-InferenceEngine::Layout estimateLayout(size_t dims);
-#endif
-
 void InfEngineNgraphNet::init(Target targetId)
 {
     if (!hasNetOwner)
     {
         if (targetId == DNN_TARGET_OPENCL_FP16)
         {
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
             ov::pass::ConvertFP32ToFP16().run_on_model(ngraph_function);
-#else
-            auto nodes = ngraph_function->get_ordered_ops();
-            for (auto& node : nodes)
-            {
-                auto parameter = std::dynamic_pointer_cast<ngraph::op::Parameter>(node);
-                if (parameter && parameter->get_element_type() == ngraph::element::f32)
-                {
-                    parameter->set_element_type(ngraph::element::f16);
-                }
-                auto constant = std::dynamic_pointer_cast<ngraph::op::Constant>(node);
-                if (constant && constant->get_element_type() == ngraph::element::f32)
-                {
-                    const float* floatsData = constant->get_data_ptr<float>();
-                    size_t total = ngraph::shape_size(constant->get_shape());
-                    Mat floats(1, total, CV_32F, (void*)floatsData);
-                    Mat halfs;
-                    cv::convertFp16(floats, halfs);
-
-                    auto new_const = std::make_shared<ngraph::op::Constant>(ngraph::element::f16, constant->get_shape(), halfs.data);
-                    new_const->set_friendly_name(constant->get_friendly_name());
-                    ngraph::replace_node(constant, new_const);
-                }
-            }
-            ngraph_function->validate_nodes_and_infer_types();
-#endif  // OpenVINO >= 2022.1
         }
-        cnn = InferenceEngine::CNNNetwork(ngraph_function);
+        cnn = ngraph_function;
 
         if (DNN_IE_SERIALIZE)
         {
@@ -614,7 +187,7 @@ void InfEngineNgraphNet::init(Target targetId)
             std::string dumpFileNameBase = netImpl_.getDumpFileNameBase();
             try
             {
-                cnn.serialize(dumpFileNameBase + "_ngraph.xml", dumpFileNameBase + "_ngraph.bin");
+                ov::pass::Serialize(dumpFileNameBase + "_ngraph.xml", dumpFileNameBase + "_ngraph.bin").run_on_model(cnn);
             }
             catch (const std::exception& e)
             {
@@ -652,11 +225,9 @@ void InfEngineNgraphNet::init(Target targetId)
             CV_Error(Error::StsNotImplemented, "Unknown target");
     };
 
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
-    auto model = cnn.getFunction();
-    ov::preprocess::PrePostProcessor ppp(model);
+    ov::preprocess::PrePostProcessor ppp(cnn);
     int i = 0;
-    for (const auto& inp : model->inputs()) {  // TODO: not sure why but ngraph_function->inputs() here causes segfault.
+    for (const auto& inp : cnn->inputs()) {  // TODO: not sure why but ngraph_function->inputs() here causes segfault.
         const std::string& name = inp.get_node()->get_friendly_name();
         auto blobIt = allBlobs.find(name);
         CV_Assert(blobIt != allBlobs.end());
@@ -668,7 +239,7 @@ void InfEngineNgraphNet::init(Target targetId)
     }
 
     i = 0;
-    for (const auto& it : model->outputs())
+    for (const auto& it : cnn->outputs())
     {
         const std::string& name = it.get_node()->get_friendly_name();
         auto blobIt = allBlobs.find(name);
@@ -684,52 +255,26 @@ void InfEngineNgraphNet::init(Target targetId)
             allBlobs[name] = ov::Tensor(src.get_element_type(), outShape, src.data());
         }
 
-        ppp.output(i++).tensor().set_element_type(ov::element::f32);  // Should be always FP32
+        ppp.output(i++).tensor().set_element_type(src.get_element_type());
     }
 
     ppp.build();
 
-#else
-
-    for (const auto& it : cnn.getInputsInfo())
-    {
-        const std::string& name = it.first;
-        auto blobIt = allBlobs.find(name);
-        CV_Assert(blobIt != allBlobs.end());
-        it.second->setPrecision(blobIt->second->getTensorDesc().getPrecision());
-    }
-
-    for (const auto& it : cnn.getOutputsInfo())
-    {
-        const std::string& name = it.first;
-        auto blobIt = allBlobs.find(name);
-        CV_Assert(blobIt != allBlobs.end());
-        InferenceEngine::TensorDesc& desc = blobIt->second->getTensorDesc();
-
-        auto outShape = it.second->getDims();
-        if (outShape != desc.getDims()) {
-            desc.reshape(outShape, estimateLayout(outShape.size()));
-        }
-
-        it.second->setPrecision(blobIt->second->getTensorDesc().getPrecision());  // Should be always FP32
-    }
-#endif // OpenVINO >= 2022.1
-
     initPlugin(cnn);
 }
 
-ngraph::ParameterVector InfEngineNgraphNet::setInputs(const std::vector<cv::Mat>& inputs,
+ov::ParameterVector InfEngineNgraphNet::setInputs(const std::vector<cv::Mat>& inputs,
                                    const std::vector<std::string>& names) {
     CV_Assert_N(inputs.size() == names.size());
-    ngraph::ParameterVector current_inp;
+    ov::ParameterVector current_inp;
     for (size_t i = 0; i < inputs.size(); i++)
     {
         std::vector<size_t> shape = getShape<size_t>(inputs[i]);
-        auto inp = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape(shape));
+        auto inp = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape(shape));
         inp->set_friendly_name(names[i]);
 
         auto it = std::find_if(inputs_vec.begin(), inputs_vec.end(),
-                                [&inp](const std::shared_ptr<ngraph::op::Parameter>& a) {
+                                [&inp](const std::shared_ptr<ov::op::v0::Parameter>& a) {
                                 return a->get_friendly_name() == inp->get_friendly_name();
                   });
         if (it == inputs_vec.end()) {
@@ -743,14 +288,14 @@ ngraph::ParameterVector InfEngineNgraphNet::setInputs(const std::vector<cv::Mat>
 }
 
 
-void InfEngineNgraphNet::initPlugin(InferenceEngine::CNNNetwork& net)
+void InfEngineNgraphNet::initPlugin(std::shared_ptr<ov::Model>& net)
 {
     CV_Assert(!isInitialized());
 
     try
     {
         AutoLock lock(getInitializationMutex());
-        InferenceEngine::Core& ie = getCore(device_name);
+        ov::Core& ie = getCore(device_name);
         {
             isInit = true;
             std::vector<std::string> candidates;
@@ -765,18 +310,7 @@ void InfEngineNgraphNet::initPlugin(InferenceEngine::CNNNetwork& net)
                 const std::string& libName = candidates[i];
                 try
                 {
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
                     ie.add_extension(libName);
-#else
-                    InferenceEngine::IExtensionPtr extension =
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2021_4)
-                        std::make_shared<InferenceEngine::Extension>(libName);
-#else
-                        InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(libName);
-#endif
-
-                    ie.AddExtension(extension, "CPU");
-#endif
                     CV_LOG_INFO(NULL, "DNN-IE: Loaded extension plugin: " << libName);
                     found = true;
                     break;
@@ -787,30 +321,11 @@ void InfEngineNgraphNet::initPlugin(InferenceEngine::CNNNetwork& net)
             {
                 CV_LOG_WARNING(NULL, "DNN-IE: Can't load extension plugin (extra layers for some networks). Specify path via OPENCV_DNN_IE_EXTRA_PLUGIN_PATH parameter");
             }
-#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2022_1)
-            // Some of networks can work without a library of extra layers.
-            // OpenCV fallbacks as extensions.
-            try
-            {
-                ie.AddExtension(std::make_shared<InfEngineNgraphExtension>(), "CPU");
-            }
-            catch(const std::exception& e)
-            {
-                CV_LOG_INFO(NULL, "DNN-IE: Can't register OpenCV custom layers nGraph extension: " << e.what());
-            }
-#endif // OpenVINO < 2022.1
 #ifndef _WIN32
             // Limit the number of CPU threads.
             if (device_name == "CPU")
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
                 ie.set_property(device_name, ov::inference_num_threads(getNumThreads()));
-#else
-                ie.SetConfig({{
-                    InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM, format("%d", getNumThreads()),
-                }}, device_name);
-#endif // OpenVINO >= 2022.1
 #endif
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2021_2)
             if (device_name.find("GPU") == 0)
             {
 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
@@ -821,24 +336,13 @@ void InfEngineNgraphNet::initPlugin(InferenceEngine::CNNNetwork& net)
                 if (!cache_path.empty() && cache_path != "disabled")
                 {
                     CV_LOG_INFO(NULL, "OpenCV/nGraph: using GPU kernels cache: " << cache_path);
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
                     ie.set_property(device_name, ov::cache_dir(cache_path));
-#else
-                    ie.SetConfig({{
-                        InferenceEngine::PluginConfigParams::KEY_CACHE_DIR, cache_path,
-                    }}, device_name);
-#endif // OpenVINO >= 2022.1
                 }
             }
-#endif
         }
-        std::map<std::string, std::string> config;
+        ov::AnyMap config;
         if (device_name == "MYRIAD" || device_name == "HDDL") {
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
             config.emplace("MYRIAD_DETECT_NETWORK_BATCH", "NO");
-#else
-            config.emplace("VPU_DETECT_NETWORK_BATCH", "NO");
-#endif
         }
 
         bool isHetero = device_name == "FPGA";
@@ -846,7 +350,7 @@ void InfEngineNgraphNet::initPlugin(InferenceEngine::CNNNetwork& net)
         // We do not check IR models because they can be with version less than IRv10
         if (!isHetero && device_name != "CPU" && !hasNetOwner)
         {
-            for (auto& node : net.getFunction()->get_ops())
+            for (auto& node : net->get_ops())
             {
                 if (node->description() == kOpenCVLayersType)
                 {
@@ -858,7 +362,7 @@ void InfEngineNgraphNet::initPlugin(InferenceEngine::CNNNetwork& net)
 
         std::string ieDevice = isHetero ? ("HETERO:" + device_name + ",CPU") : device_name;
         CV_LOG_INFO(NULL, "DNN/IE: Calling LoadNetwork(device=" << ieDevice << ")...");
-        netExec = ie.LoadNetwork(net, ieDevice, config);
+        netExec = ie.compile_model(net, ieDevice, config);
     }
     catch (const std::exception& ex)
     {
@@ -876,14 +380,13 @@ bool NgraphBackendLayer::getMemoryShapes(const std::vector<MatShape> &inputs,
                                             std::vector<MatShape> &outputs,
                                             std::vector<MatShape> &internals) const
 {
-    auto ngraphFunction = t_net.getFunction();
     bool equal_flag = true;
-    std::map<std::string, std::vector<size_t> > inShapes;
+    std::map<std::string, ov::PartialShape> inShapes;
     int i = 0;
-    for (const auto& inp : ngraphFunction->get_parameters())
+    for (const auto& inp : t_net->get_parameters())
     {
-        std::vector<size_t> oldShape = inp->get_shape();
-        std::vector<size_t> newShape(inputs[i].begin(), inputs[i].end());
+        ov::Shape oldShape = inp->get_shape();
+        ov::Shape newShape(inputs[i].begin(), inputs[i].end());
         inShapes.insert({inp->get_friendly_name(), newShape});
         if (oldShape != newShape)
         {
@@ -894,21 +397,17 @@ bool NgraphBackendLayer::getMemoryShapes(const std::vector<MatShape> &inputs,
 
     if (!equal_flag)
     {
-        InferenceEngine::CNNNetwork curr_t_net(t_net);
-        curr_t_net.reshape(inShapes);
+        std::shared_ptr<ov::Model> curr_t_net(t_net);
+        curr_t_net->reshape(inShapes);
     }
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
     std::vector<size_t> dims;
-    for (const auto& it : ngraphFunction->outputs()) {
+    for (const auto& it : t_net->outputs()) {
         if (it.get_node()->get_friendly_name() == name) {
             dims = it.get_partial_shape().get_max_shape();
         }
     }
     if (dims.empty())
         CV_Error(Error::StsError, format("Unable find result with name %s", name.c_str()));
-#else
-    std::vector<size_t> dims = t_net.getOutputsInfo()[name]->getDims();
-#endif
     outputs.push_back(MatShape(dims.begin(), dims.end()));
     return false;
 }
@@ -926,74 +425,20 @@ void NgraphBackendLayer::forward(InputArrayOfArrays inputs, OutputArrayOfArrays
     CV_Error(Error::StsInternal, "Choose Inference Engine as a preferable backend.");
 }
 
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
-
 ov::Tensor wrapToNgraphBlob(const Mat& m) {
     std::vector<size_t> shape = getShape<size_t>(m);
     if (m.type() == CV_32F)
         return ov::Tensor(ov::element::f32, shape, m.data);
     else if (m.type() == CV_8U)
         return ov::Tensor(ov::element::u8, shape, m.data);
+    else if (m.type() == CV_8SC1)
+        return ov::Tensor(ov::element::i8, shape, m.data);
     else if (m.type() == CV_32SC1)
         return ov::Tensor(ov::element::i32, shape, m.data);
     else
         CV_Error(Error::StsNotImplemented, format("Unsupported data type %s", typeToString(m.type()).c_str()));
 }
 
-#else
-
-static InferenceEngine::Layout estimateLayout(int dims)
-{
-    if (dims == 4)
-        return InferenceEngine::Layout::NCHW;
-    else if (dims == 3)
-        return InferenceEngine::Layout::CHW;
-    else if (dims == 2)
-        return InferenceEngine::Layout::NC;
-    else if (dims == 1)
-        return InferenceEngine::Layout::C;
-    else if (dims == 5)
-        return InferenceEngine::Layout::NCDHW;
-    else
-        return InferenceEngine::Layout::ANY;
-}
-static inline
-InferenceEngine::Layout estimateLayout(size_t dims)
-{
-    return estimateLayout((int)dims);
-}
-
-static inline
-InferenceEngine::Layout estimateLayout(const Mat& m)
-{
-    return estimateLayout(m.dims);
-}
-
-InferenceEngine::Blob::Ptr wrapToNgraphBlob(const Mat& m, const std::vector<size_t>& shape,
-                                               InferenceEngine::Layout layout)
-{
-    if (m.type() == CV_32F)
-        return InferenceEngine::make_shared_blob<float>(
-               {InferenceEngine::Precision::FP32, shape, layout}, (float*)m.data);
-    else if (m.type() == CV_8U)
-        return InferenceEngine::make_shared_blob<uint8_t>(
-               {InferenceEngine::Precision::U8, shape, layout}, (uint8_t*)m.data);
-    else if (m.type() == CV_32SC1)
-        return InferenceEngine::make_shared_blob<int32_t>(
-               {InferenceEngine::Precision::I32, shape, layout}, (int32_t*)m.data);
-    else
-        CV_Error(Error::StsNotImplemented, format("Unsupported data type %s", typeToString(m.type()).c_str()));
-}
-
-InferenceEngine::Blob::Ptr wrapToNgraphBlob(const Mat& m, InferenceEngine::Layout layout)
-{
-    std::vector<size_t> shape = getShape<size_t>(m);
-    return wrapToNgraphBlob(m, shape, layout);
-}
-
-InferenceEngine::Blob::Ptr wrapToNgraphBlob(const Mat& m) { return wrapToNgraphBlob(m, estimateLayout(m)); }
-
-#endif // OpenVINO >= 2022.1
 
 NgraphBackendWrapper::NgraphBackendWrapper(int targetId, const cv::Mat& m)
     : BackendWrapper(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, targetId)
@@ -1033,36 +478,10 @@ void NgraphBackendWrapper::setHostDirty()
     //CV_Error(Error::StsNotImplemented, "");
 }
 
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
 ov::Tensor copyBlob(const ov::Tensor& blob)
 {
     return ov::Tensor(blob.get_element_type(), blob.get_shape());
 }
-#else
-InferenceEngine::Blob::Ptr copyBlob(const InferenceEngine::Blob::Ptr& blob)
-{
-    InferenceEngine::Blob::Ptr copy;
-    auto description = blob->getTensorDesc();
-    InferenceEngine::Precision precision = description.getPrecision();
-    if (precision == InferenceEngine::Precision::FP32)
-    {
-        copy = InferenceEngine::make_shared_blob<float>(description);
-    }
-    else if (precision == InferenceEngine::Precision::U8)
-    {
-        copy = InferenceEngine::make_shared_blob<uint8_t>(description);
-    }
-    else
-    {
-        std::ostringstream msg;
-        msg << precision;
-        CV_Error_(Error::StsNotImplemented, ("Unsupported blob precision: %s", msg.str().c_str()));
-    }
-    copy->allocate();
-    return copy;
-}
-
-#endif // OpenVINO < 2022.1
 
 void InfEngineNgraphNet::reset()
 {
@@ -1114,7 +533,7 @@ void InfEngineNgraphNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlo
         reqWrapper = Ptr<NgraphReqWrapper>(new NgraphReqWrapper());
         try
         {
-            reqWrapper->req = netExec.CreateInferRequest();
+            reqWrapper->req = netExec.create_infer_request();
         }
         catch (const std::exception& ex)
         {
@@ -1122,7 +541,6 @@ void InfEngineNgraphNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlo
         }
         infRequests.push_back(reqWrapper);
 
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
         int i = 0;
         for (const auto& it : netExec.inputs())
         {
@@ -1140,27 +558,7 @@ void InfEngineNgraphNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlo
             CV_Assert(blobIt != allBlobs.end());
             reqWrapper->req.set_output_tensor(i++, isAsync ? copyBlob(blobIt->second) : blobIt->second);
         }
-#else
-        InferenceEngine::BlobMap inpBlobs, outBlobs;
-        for (const auto& it : cnn.getInputsInfo())
-        {
-            const std::string& name = it.first;
-            auto blobIt = allBlobs.find(name);
-            CV_Assert(blobIt != allBlobs.end());
-            inpBlobs[name] = isAsync ? copyBlob(blobIt->second) : blobIt->second;
-        }
-        for (const auto& it : cnn.getOutputsInfo())
-        {
-            const std::string& name = it.first;
-            auto blobIt = allBlobs.find(name);
-            CV_Assert(blobIt != allBlobs.end());
-            outBlobs[name] = isAsync ? copyBlob(blobIt->second) : blobIt->second;
-        }
-        reqWrapper->req.SetInput(inpBlobs);
-        reqWrapper->req.SetOutput(outBlobs);
-#endif
 
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
     if (isAsync) {
         bool* isReady = &reqWrapper->isReady;
         auto* promises = &reqWrapper->outProms;
@@ -1204,86 +602,13 @@ void InfEngineNgraphNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlo
             *isReady = true;
         });
     }
-#else // OpenVINO >= 2022.1
-
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2021_4)
-        InferenceEngine::InferRequest infRequest = reqWrapper->req;
-        NgraphReqWrapper* wrapperPtr = reqWrapper.get();
-        CV_Assert(wrapperPtr && "Internal error");
-#else
-        InferenceEngine::IInferRequest::Ptr infRequestPtr = reqWrapper->req;
-        CV_Assert(infRequestPtr);
-        InferenceEngine::IInferRequest& infRequest = *infRequestPtr.get();
-        infRequest.SetUserData(reqWrapper.get(), 0);
-#endif
-
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2021_4)
-        // do NOT capture 'reqWrapper' (smart ptr) in the lambda callback
-        infRequest.SetCompletionCallback<std::function<void(InferenceEngine::InferRequest, InferenceEngine::StatusCode)>>(
-            [wrapperPtr](InferenceEngine::InferRequest /*request*/, InferenceEngine::StatusCode status)
-#else
-        infRequest.SetCompletionCallback(
-            [](InferenceEngine::IInferRequest::Ptr requestPtr, InferenceEngine::StatusCode status)
-#endif
-            {
-                CV_LOG_DEBUG(NULL, "DNN(nGraph): completionCallback(" << (int)status << ")");
-#if !INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2021_4)
-                CV_Assert(requestPtr);
-                InferenceEngine::IInferRequest& request = *requestPtr.get();
-
-                NgraphReqWrapper* wrapperPtr;
-                request.GetUserData((void**)&wrapperPtr, 0);
-                CV_Assert(wrapperPtr && "Internal error");
-#endif
-                NgraphReqWrapper& wrapper = *wrapperPtr;
-
-                size_t processedOutputs = 0;
-                try
-                {
-                    for (; processedOutputs < wrapper.outProms.size(); ++processedOutputs)
-                    {
-                        const std::string& name = wrapper.outsNames[processedOutputs];
-                        Mat m = infEngineBlobToMat(wrapper.req.GetBlob(name));
-
-                        try
-                        {
-                            CV_Assert(status == InferenceEngine::StatusCode::OK);
-                            wrapper.outProms[processedOutputs].setValue(m.clone());
-                        }
-                        catch (...)
-                        {
-                            try {
-                                wrapper.outProms[processedOutputs].setException(std::current_exception());
-                            } catch(...) {
-                                CV_LOG_ERROR(NULL, "DNN: Exception occurred during async inference exception propagation");
-                            }
-                        }
-                    }
-                }
-                catch (...)
-                {
-                    std::exception_ptr e = std::current_exception();
-                    for (; processedOutputs < wrapper.outProms.size(); ++processedOutputs)
-                    {
-                        try {
-                            wrapper.outProms[processedOutputs].setException(e);
-                        } catch(...) {
-                            CV_LOG_ERROR(NULL, "DNN: Exception occurred during async inference exception propagation");
-                        }
-                    }
-                }
-                wrapper.isReady = true;
-            }
-        );
-#endif // OpenVINO >= 2022.1
     }
 
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
     if (isAsync)
     {
         // Copy actual data to infer request's input blobs.
         int i = 0;
-        for (const auto& it : cnn.getFunction()->get_parameters())
+        for (const auto& it : cnn->get_parameters())
         {
             const std::string& name = it->get_friendly_name();
             auto blobIt = allBlobs.find(name);
@@ -1302,30 +627,32 @@ void InfEngineNgraphNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlo
     {
         reqWrapper->req.infer();
     }
-#else
-    if (isAsync)
-    {
-        // Copy actual data to infer request's input blobs.
-        for (const auto& it : cnn.getInputsInfo())
-        {
-            const std::string& name = it.first;
-            auto blobIt = allBlobs.find(name);
-            Mat srcMat = infEngineBlobToMat(blobIt->second);
-            Mat dstMat = infEngineBlobToMat(reqWrapper->req.GetBlob(name));
-            srcMat.copyTo(dstMat);
-        }
-
-        // Set promises to output blobs wrappers.
-        reqWrapper->makePromises(outBlobsWrappers);
+}
 
-        reqWrapper->isReady = false;
-        reqWrapper->req.StartAsync();
-    }
-    else
-    {
-        reqWrapper->req.Infer();
-    }
-#endif // OpenVINO >= 2022.1
+ov::Output<ov::Node> ngraphQuantize(ov::Output<ov::Node> input, float output_sc, float output_zp) {
+    float outLow = -128, outHigh = 127;
+    float inpLow = output_sc * (outLow - output_zp);
+    float inpHigh = output_sc * (outHigh - output_zp);
+    return std::make_shared<ov::op::v0::FakeQuantize>(input,
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &inpLow),
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &inpHigh),
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &outLow),
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &outHigh),
+        256 // levels
+    );
+}
+
+ov::Output<ov::Node> ngraphDequantize(ov::Output<ov::Node> input, float input_sc, float input_zp) {
+    float inpLow = -128, inpHigh = 127;
+    float outLow = input_sc * (inpLow - input_zp);
+    float outHigh = input_sc * (inpHigh - input_zp);
+    return std::make_shared<ov::op::v0::FakeQuantize>(input,
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &inpLow),
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &inpHigh),
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &outLow),
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &outHigh),
+        256 // levels
+    );
 }
 
 #endif
diff --git a/modules/dnn/src/ie_ngraph.hpp b/modules/dnn/src/ie_ngraph.hpp
index 09afc7f11700..19e07d62ac32 100644
--- a/modules/dnn/src/ie_ngraph.hpp
+++ b/modules/dnn/src/ie_ngraph.hpp
@@ -17,7 +17,8 @@
 #pragma warning(disable : 4245)
 #pragma warning(disable : 4268)
 #endif
-#include <ngraph/ngraph.hpp>
+#include <openvino/openvino.hpp>
+#include <openvino/op/ops.hpp>
 #ifdef _MSC_VER
 #pragma warning(pop)
 #endif
@@ -30,12 +31,11 @@ namespace cv { namespace dnn {
 
 class InfEngineNgraphNode;
 
-
 class InfEngineNgraphNet
 {
 public:
     InfEngineNgraphNet(detail::NetImplBase& netImpl);
-    InfEngineNgraphNet(detail::NetImplBase& netImpl, InferenceEngine::CNNNetwork& net);
+    InfEngineNgraphNet(detail::NetImplBase& netImpl, std::shared_ptr<ov::Model>& net);
 
     void addOutput(const Ptr<InfEngineNgraphNode>& node);
 
@@ -44,35 +44,23 @@ class InfEngineNgraphNet
 
     void forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers, bool isAsync);
 
-    void initPlugin(InferenceEngine::CNNNetwork& net);
-    ngraph::ParameterVector setInputs(const std::vector<cv::Mat>& inputs, const std::vector<std::string>& names);
+    void initPlugin(std::shared_ptr<ov::Model>& net);
+    ov::ParameterVector setInputs(const std::vector<cv::Mat>& inputs, const std::vector<std::string>& names);
 
     void addBlobs(const std::vector<cv::Ptr<BackendWrapper> >& ptrs);
 
     void createNet(Target targetId);
-    void setNodePtr(std::shared_ptr<ngraph::Node>* ptr);
 
     void reset();
 
 //private:
     detail::NetImplBase& netImpl_;
 
-    void release();
-    int getNumComponents();
-    void dfs(std::shared_ptr<ngraph::Node>& node, std::vector<std::shared_ptr<ngraph::Node>>& comp,
-             std::unordered_map<std::string, bool>& used);
-
-    ngraph::ParameterVector inputs_vec;
-    std::shared_ptr<ngraph::Function> ngraph_function;
-    std::vector<std::vector<std::shared_ptr<ngraph::Node>>> components;
-    std::unordered_map<std::string, std::shared_ptr<ngraph::Node>* > all_nodes;
+    ov::ParameterVector inputs_vec;
+    std::shared_ptr<ov::Model> ngraph_function;
 
-    InferenceEngine::ExecutableNetwork netExec;
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
+    ov::CompiledModel netExec;
     std::map<std::string, ov::Tensor> allBlobs;
-#else
-    InferenceEngine::BlobMap allBlobs;
-#endif
     std::string device_name;
     bool isInit = false;
 
@@ -82,14 +70,14 @@ class InfEngineNgraphNet
 
         void makePromises(const std::vector<Ptr<BackendWrapper> >& outs);
 
-        InferenceEngine::InferRequest req;
+        ov::InferRequest req;
         std::vector<cv::AsyncPromise> outProms;
         std::vector<std::string> outsNames;
         bool isReady;
     };
     std::vector<Ptr<NgraphReqWrapper> > infRequests;
 
-    InferenceEngine::CNNNetwork cnn;
+    std::shared_ptr<ov::Model> cnn;
     bool hasNetOwner;
     std::unordered_map<std::string, InfEngineNgraphNode*> requestedOutputs;
 };
@@ -101,13 +89,13 @@ class InfEngineNgraphNode : public BackendNode
                         std::vector<Mat*>& inputs, std::vector<Mat>& outputs,
                         std::vector<Mat>& internals);
 
-    InfEngineNgraphNode(std::shared_ptr<ngraph::Node>&& _node);
-    InfEngineNgraphNode(const std::shared_ptr<ngraph::Node>& _node);
+    InfEngineNgraphNode(ov::Output<ov::Node>&& _node);
+    InfEngineNgraphNode(const ov::Output<ov::Node>& _node);
 
     void setName(const std::string& name);
 
     // Inference Engine network object that allows to obtain the outputs of this layer.
-    std::shared_ptr<ngraph::Node> node;
+    ov::Output<ov::Node> node;
     Ptr<InfEngineNgraphNet> net;
     Ptr<dnn::Layer> cvLayer;
 };
@@ -126,11 +114,7 @@ class NgraphBackendWrapper : public BackendWrapper
 
     Mat* host;
     std::string name;
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
     ov::Tensor blob;
-#else
-    InferenceEngine::Blob::Ptr blob;
-#endif
     AsyncArray futureMat;
 };
 
@@ -140,7 +124,7 @@ class NgraphBackendWrapper : public BackendWrapper
 class NgraphBackendLayer : public Layer
 {
 public:
-    NgraphBackendLayer(const InferenceEngine::CNNNetwork &t_net_) : t_net(t_net_) {};
+    NgraphBackendLayer(const std::shared_ptr<ov::Model> &t_net_) : t_net(t_net_) {};
 
     virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
                                  const int requiredOutputs,
@@ -153,9 +137,12 @@ class NgraphBackendLayer : public Layer
     virtual bool supportBackend(int backendId) CV_OVERRIDE;
 
 private:
-    InferenceEngine::CNNNetwork t_net;
+    std::shared_ptr<ov::Model> t_net;
 };
 
+ov::Output<ov::Node> ngraphQuantize(ov::Output<ov::Node> input, float output_sc, float output_zp);
+ov::Output<ov::Node> ngraphDequantize(ov::Output<ov::Node> input, float input_sc, float input_zp);
+
 #endif  // HAVE_DNN_NGRAPH
 
 }}  // namespace cv::dnn
diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp
index 2ce54ac0bbe6..e8450c18f9dc 100644
--- a/modules/dnn/src/init.cpp
+++ b/modules/dnn/src/init.cpp
@@ -101,6 +101,8 @@ void initializeLayerFactory()
     CV_DNN_REGISTER_LAYER_CLASS(Reduce,         ReduceLayer);
     CV_DNN_REGISTER_LAYER_CLASS(LRN,            LRNLayer);
     CV_DNN_REGISTER_LAYER_CLASS(InnerProduct,   InnerProductLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(Gemm,           GemmLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(MatMul,         MatMulLayer);
     CV_DNN_REGISTER_LAYER_CLASS(Softmax,        SoftmaxLayer);
     CV_DNN_REGISTER_LAYER_CLASS(SoftMax,        SoftmaxLayer);  // For compatibility. See https://github.com/opencv/opencv/issues/16877
     CV_DNN_REGISTER_LAYER_CLASS(MVN,            MVNLayer);
@@ -156,7 +158,12 @@ void initializeLayerFactory()
     CV_DNN_REGISTER_LAYER_CLASS(Arg,            ArgLayer);
     CV_DNN_REGISTER_LAYER_CLASS(Reciprocal,     ReciprocalLayer);
     CV_DNN_REGISTER_LAYER_CLASS(Gather,         GatherLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(GatherElements, GatherElementsLayer);
     CV_DNN_REGISTER_LAYER_CLASS(LayerNormalization, LayerNormLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(Expand,         ExpandLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(InstanceNormalization, InstanceNormLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(Attention,      AttentionLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(GroupNormalization, GroupNormLayer);
 
     CV_DNN_REGISTER_LAYER_CLASS(Crop,           CropLayer);
     CV_DNN_REGISTER_LAYER_CLASS(Eltwise,        EltwiseLayer);
@@ -183,6 +190,7 @@ void initializeLayerFactory()
     CV_DNN_REGISTER_LAYER_CLASS(LSTM,           LSTMLayer);
     CV_DNN_REGISTER_LAYER_CLASS(GRU,            GRULayer);
     CV_DNN_REGISTER_LAYER_CLASS(CumSum,         CumSumLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(Einsum,         EinsumLayer);
 
     CV_DNN_REGISTER_LAYER_CLASS(Scatter,        ScatterLayer);
     CV_DNN_REGISTER_LAYER_CLASS(ScatterND,      ScatterNDLayer);
@@ -204,6 +212,7 @@ void initializeLayerFactory()
     CV_DNN_REGISTER_LAYER_CLASS(SigmoidInt8,      ActivationLayerInt8);
     CV_DNN_REGISTER_LAYER_CLASS(TanHInt8,         ActivationLayerInt8);
     CV_DNN_REGISTER_LAYER_CLASS(SwishInt8,        ActivationLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(HardSwishInt8,    ActivationLayerInt8);
     CV_DNN_REGISTER_LAYER_CLASS(MishInt8,         ActivationLayerInt8);
     CV_DNN_REGISTER_LAYER_CLASS(ELUInt8,          ActivationLayerInt8);
     CV_DNN_REGISTER_LAYER_CLASS(BNLLInt8,         ActivationLayerInt8);
diff --git a/modules/dnn/src/int8layers/batch_norm_layer.cpp b/modules/dnn/src/int8layers/batch_norm_layer.cpp
index a3a9ebb261f1..3fbf8cd19177 100644
--- a/modules/dnn/src/int8layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/int8layers/batch_norm_layer.cpp
@@ -5,8 +5,7 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "../op_timvx.hpp"
-
-#include <opencv2/dnn/shape_utils.hpp>
+#include "../ie_ngraph.hpp"
 
 namespace cv
 {
@@ -110,7 +109,8 @@ class BatchNormLayerInt8Impl CV_FINAL : public BatchNormLayerInt8
             return true;
         }
 
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
@@ -238,6 +238,27 @@ class BatchNormLayerInt8Impl CV_FINAL : public BatchNormLayerInt8
         return Ptr<BackendNode>();
     }
 
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+
+        input = ngraphDequantize(input, input_sc, input_zp);
+
+        std::vector<size_t> shape(input.get_shape().size(), 1);
+        shape[1] = origin_weights.total();
+
+        ov::Output<ov::Node> res;
+        auto ieWeights = std::make_shared<ov::op::v0::Constant>(ov::element::f32, shape, origin_weights.data);
+        auto ieBias = std::make_shared<ov::op::v0::Constant>(ov::element::f32, shape, origin_bias.data);
+        res = std::make_shared<ov::op::v1::Multiply>(input, ieWeights);
+        res = std::make_shared<ov::op::v1::Add>(res, ieBias);
+
+        res = ngraphQuantize(res, output_sc, output_zp);
+        return new InfEngineNgraphNode(res);
+    }
+#endif  // HAVE_DNN_NGRAPH
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp
index 3d6f6bc8248b..25132542cde0 100644
--- a/modules/dnn/src/int8layers/convolution_layer.cpp
+++ b/modules/dnn/src/int8layers/convolution_layer.cpp
@@ -10,6 +10,7 @@
 #include "opencv2/core/hal/hal.hpp"
 #include "opencv2/core/hal/intrin.hpp"
 #include "../op_timvx.hpp"
+#include "../ie_ngraph.hpp"
 #include <iostream>
 #include <numeric>
 
@@ -18,7 +19,7 @@ namespace cv
 namespace dnn
 {
 
-#if CV_SIMD
+#if CV_SIMD128
 static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b,
                                     v_int32x4& out0, v_int32x4& out1, v_int32x4& out2, v_int32x4& out3)
 {
@@ -28,10 +29,10 @@ static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b,
 
     v_int32x4 t0, t1;
     v_mul_expand(a0, b0, t0, t1);
-    out0 += t0; out1 += t1;
+    out0 = v_add(out0, t0); out1 = v_add(out1, t1);
 
     v_mul_expand(a1, b1, t0, t1);
-    out2 += t0; out3 += t1;
+    out2 = v_add(out2, t0); out3 = v_add(out3, t1);
 }
 #endif
 
@@ -195,7 +196,8 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
         }
 #endif
         // Only default backend and Conv1D/Conv2D/Conv3D are supported
-        return backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3;
+        return (backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3) ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -546,7 +548,7 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
         {
             // for Conv1d
             if (group != 1)
-                CV_Error( CV_StsNotImplemented, " Grouped Conv1d or Depth-Wise Conv1d are not supported by "
+                CV_Error( cv::Error::StsNotImplemented, " Grouped Conv1d or Depth-Wise Conv1d are not supported by "
                                                 "TimVX Backend. Please try OpenCV Backend.");
             tvConv = graph->CreateOperation<tim::vx::ops::Conv1d>(
                     tvConvWeightShape[2], tvPadType, (uint32_t)kernel_size[0],
@@ -561,6 +563,126 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
         return Ptr<BackendNode>();
     }
 
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        CV_Assert(!blobs.empty());
+        CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1);
+        CV_CheckTypeEQ(weightsMat.type(), CV_8S, "");
+        auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        std::vector<size_t> dims = ieInpNode.get_shape();
+        CV_Check(dims.size(), dims.size() >= 3 && dims.size() <= 5, "");
+        CV_Assert(ieInpNode.get_element_type() == ov::element::f32);
+        ov::Output<ov::Node> ieWeights;
+        if (nodes.size() > 1)
+            ieWeights = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
+        const int inpCn = dims[1];
+        const int inpGroupCn = nodes.size() > 1 ? ieWeights.get_shape()[1] : blobs[0].size[1];
+        const int group = inpCn / inpGroupCn;
+
+        std::vector<size_t> kernel_shape;
+        if (group != 1)
+        {
+            kernel_shape.push_back(group);
+        }
+        kernel_shape.push_back(numOutput / group);
+        kernel_shape.push_back(inpCn / group);
+        std::copy(kernel_size.begin(), kernel_size.end(), back_inserter(kernel_shape));
+
+        if (nodes.size() == 1)
+        {
+            ieWeights = std::make_shared<ov::op::v0::Constant>(ov::element::i8, kernel_shape, blobs[0].data);
+        }
+        else
+        {
+            auto shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                             ov::Shape{kernel_shape.size()}, std::vector<int64_t>(kernel_shape.begin(), kernel_shape.end()));
+            ieWeights  = std::make_shared<ov::op::v1::Reshape>(ieWeights, shape, true);
+        }
+
+        ov::op::PadType pad_type = ov::op::PadType::EXPLICIT;
+        if (!padMode.empty())
+            pad_type = padMode == "VALID" ? ov::op::PadType::VALID : ov::op::PadType::SAME_UPPER;
+
+        ieInpNode = ngraphDequantize(ieInpNode, input_sc, input_zp);
+
+        const float low = -128, high = 127;
+        std::vector<float> inpLows(numOutput, low);
+        std::vector<float> inpHighs(numOutput, high);
+        std::vector<float> outLows(numOutput);
+        std::vector<float> outHighs(numOutput);
+        std::vector<size_t> quantShape(kernel_shape.size(), 1);
+        if (group != 1)
+        {
+            quantShape[0] = group;
+            quantShape[1] = numOutput / group;
+        }
+        else
+        {
+            quantShape[0] = numOutput;
+        }
+
+        for (int i = 0; i < numOutput; ++i) {
+            outLows[i] = low * outputMultiplier[i] * output_sc / input_sc;
+            outHighs[i] = high * outputMultiplier[i] * output_sc / input_sc;
+        }
+        ieWeights = std::make_shared<ov::op::v0::Convert>(ieWeights, ov::element::f32);
+        ieWeights = std::make_shared<ov::op::v0::FakeQuantize>(ieWeights,
+            std::make_shared<ov::op::v0::Constant>(ov::element::f32, quantShape, inpLows.data()),
+            std::make_shared<ov::op::v0::Constant>(ov::element::f32, quantShape, inpHighs.data()),
+            std::make_shared<ov::op::v0::Constant>(ov::element::f32, quantShape, outLows.data()),
+            std::make_shared<ov::op::v0::Constant>(ov::element::f32, quantShape, outHighs.data()),
+            256 // levels
+        );
+
+        ov::Output<ov::Node> conv_node;
+        if (group != 1) {
+            conv_node = std::make_shared<ov::op::v1::GroupConvolution>(
+                                ieInpNode, ieWeights,
+                                ov::Strides(strides),
+                                ov::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
+                                ov::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_end.begin(),   pads_end.end())),
+                                ov::Strides(dilations),
+                                pad_type);
+        } else {
+            conv_node = std::make_shared<ov::op::v1::Convolution>(
+                                ieInpNode, ieWeights,
+                                ov::Strides(strides),
+                                ov::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
+                                ov::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_end.begin(), pads_end.end())),
+                                ov::Strides(dilations),
+                                pad_type);
+        }
+
+        std::vector<size_t> shape(conv_node.get_shape().size(), 1);
+        shape[1] = conv_node.get_shape()[1];
+        if (biasvec.size() || nodes.size() == 3)
+        {
+            std::shared_ptr<ov::Node> bias;
+            if (nodes.size() == 3)
+            {
+                auto bias_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                                    ov::Shape{shape.size()}, std::vector<int64_t>(shape.begin(), shape.end()));
+                bias = std::make_shared<ov::op::v1::Reshape>(nodes[2].dynamicCast<InfEngineNgraphNode>()->node, bias_shape, true);
+            }
+            else
+            {
+                std::vector<float> ovBias(numOutput);
+                for (int i = 0; i < numOutput; ++i) {
+                    ovBias[i] = (biasvec[i] + input_zp * cv::sum(blobs[0].row(i))[0]) * outputMultiplier[i] * output_sc;
+                }
+                bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape(shape), ovBias.data());
+            }
+            conv_node = std::make_shared<ov::op::v1::Add>(conv_node, bias, ov::op::AutoBroadcastType::NUMPY);
+        }
+
+        conv_node = ngraphQuantize(conv_node, output_sc, output_zp);
+
+        return new InfEngineNgraphNode(conv_node);
+    }
+#endif  // HAVE_DNN_NGRAPH
+
     class ParallelConv : public cv::ParallelLoopBody
     {
     public:
@@ -580,13 +702,14 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
         bool useAVX2;
         bool useAVX512;
         bool useLASX;
+        bool useRVV;
         int blk_size_cn;
         int inpZp, outZp;
         const std::vector<float>* multiplier;
 
         ParallelConv()
             : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
-              biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false)
+              biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false), useRVV(false)
             , blk_size_cn(0), inpZp(0), outZp(0), multiplier(0)
         {}
 
@@ -643,6 +766,7 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
             p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX  && isConv2D;
 
             p.useLASX   = checkHardwareSupport(CPU_LASX) && isConv2D;
+            p.useRVV   = checkHardwareSupport(CPU_RVV) && isConv2D;
 
             int kernel_d = isConv3D? kernel_size[0] : 1;
             int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
@@ -847,6 +971,20 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
                                     stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
                                     biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
                             else
+                        #endif
+                        #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+                            if(useRVV)
+                                opt_RVV::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                                    stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                                    biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
+                            else
+                        #endif
+                        #if CV_RVP052
+                            if(isConv2D)
+                                opt_RVP052::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                                    stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                                    biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
+                            else
                         #endif
                             {
                                 const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@@ -893,7 +1031,7 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
                                         outptr[0] = std::min(std::max(out1, -128), 127);
                                         out_j = 1;
                                     }
-                                #if CV_SIMD
+                                #if CV_SIMD128
                                     if( stride_w == 1 )
                                     {
                                         const int out_delta = 16;
@@ -933,10 +1071,10 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
                                             v_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
                                             v_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
 
-                                            vout0 = voutzp + v_round(v_cvt_f32(vout0)*vmult);
-                                            vout1 = voutzp + v_round(v_cvt_f32(vout1)*vmult);
-                                            vout2 = voutzp + v_round(v_cvt_f32(vout2)*vmult);
-                                            vout3 = voutzp + v_round(v_cvt_f32(vout3)*vmult);
+                                            vout0 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout0), vmult)));
+                                            vout1 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout1), vmult)));
+                                            vout2 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout2), vmult)));
+                                            vout3 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout3), vmult)));
 
                                             vout0 = v_min(v_max(vout0, outmin), outmax);
                                             vout1 = v_min(v_max(vout1, outmin), outmax);
@@ -1226,6 +1364,18 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
                             opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
                                           outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
                         else
+                    #endif
+                    #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+                        if(useRVV)
+                            opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+                                          outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
+                        else
+                    #endif
+                    #if CV_RVP052
+                        if(isConv2D)
+                            opt_RVP052::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+                                          outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
+                        else
                     #endif
                         for( int i = 0; i < outCn; i += 2 )
                         {
@@ -1286,12 +1436,12 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
                                     vs12 = v_dotprod_expand_fast(w1, r2, vs12);
                                     vs13 = v_dotprod_expand_fast(w1, r3, vs13);
                                 }
-                                s0 += v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03));
-                                s1 += v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13));
+                                s0 = v_add(s0, v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03)));
+                                s1 = v_add(s1, v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13)));
                                 if( cn1 == inpCn )
                                 {
-                                    s0 = voutzp + v_round(v_cvt_f32(s0)*vmult0);
-                                    s1 = voutzp + v_round(v_cvt_f32(s1)*vmult1);
+                                    s0 = v_add(voutzp, v_round(v_mul(v_cvt_f32(s0), vmult0)));
+                                    s1 = v_add(voutzp, v_round(v_mul(v_cvt_f32(s1), vmult1)));
 
                                     s0 = v_min(v_max(s0, outmin), outmax);
                                     s1 = v_min(v_max(s1, outmin), outmax);
diff --git a/modules/dnn/src/int8layers/elementwise_layers.cpp b/modules/dnn/src/int8layers/elementwise_layers.cpp
index f1b78f48fb6d..f522efa0c1e4 100644
--- a/modules/dnn/src/int8layers/elementwise_layers.cpp
+++ b/modules/dnn/src/int8layers/elementwise_layers.cpp
@@ -5,6 +5,7 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "../op_timvx.hpp"
+#include "../ie_ngraph.hpp"
 
 #include <opencv2/dnn/shape_utils.hpp>
 #include <iostream>
@@ -56,7 +57,7 @@ class ActivationLayerInt8Impl CV_FINAL : public ActivationLayerInt8
             return tvActType != tvActNotSupported;
         }
 #endif
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -244,6 +245,44 @@ class ActivationLayerInt8Impl CV_FINAL : public ActivationLayerInt8
         return Ptr<BackendNode>();
     }
 
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+
+        input = ngraphDequantize(input, input_sc, input_zp);
+
+        ov::Output<ov::Node> res;
+        if (type == "ReLU6Int8") {
+            res = std::make_shared<ov::op::v0::Clamp>(input, 0.0f, 6.0f);
+        } else if (type == "ReLUInt8") {
+            if (slope) {
+                auto param = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &slope);
+                res = std::make_shared<ov::op::v0::PRelu>(input, param);
+            } else {
+                res = std::make_shared<ov::op::v0::Relu>(input);
+            }
+        } else if (type == "ELUInt8") {
+            res = std::make_shared<ov::op::v0::Elu>(input, 1.0f);
+        } else if (type == "MishInt8") {
+            res = std::make_shared<ov::op::v4::Mish>(input);
+        } else if (type == "HardSwishInt8") {
+            res = std::make_shared<ov::op::v4::HSwish>(input);
+        } else if (type == "AbsValInt8") {
+            res = std::make_shared<ov::op::v0::Abs>(input);
+        } else if (type == "SigmoidInt8") {
+            res = std::make_shared<ov::op::v0::Sigmoid>(input);
+        } else {
+            CV_Error(Error::StsNotImplemented, type + " activation with OpenVINO");
+        }
+
+        res = ngraphQuantize(res, output_sc, output_zp);
+
+        return new InfEngineNgraphNode(res);
+    }
+#endif  // HAVE_DNN_NGRAPH
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
     {
         CV_TRACE_FUNCTION();
diff --git a/modules/dnn/src/int8layers/eltwise_layer.cpp b/modules/dnn/src/int8layers/eltwise_layer.cpp
index e0a8d4787cd5..214d11525a38 100644
--- a/modules/dnn/src/int8layers/eltwise_layer.cpp
+++ b/modules/dnn/src/int8layers/eltwise_layer.cpp
@@ -5,6 +5,7 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "../op_timvx.hpp"
+#include "../ie_ngraph.hpp"
 #include <opencv2/dnn/shape_utils.hpp>
 
 namespace cv
@@ -138,7 +139,7 @@ class EltwiseLayerInt8Impl CV_FINAL : public EltwiseLayerInt8
         // For TimVX Backend, only ELTWISE_CHANNNELS_SAME was supported.
         if (backendId == DNN_BACKEND_TIMVX && haveTimVX())
             return channelsModeInput == ELTWISE_CHANNNELS_SAME;
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -369,6 +370,38 @@ class EltwiseLayerInt8Impl CV_FINAL : public EltwiseLayerInt8
         return Ptr<BackendNode>();
     }
 
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        CV_Assert(nodes.size() >= 2);
+        std::vector<ov::Output<ov::Node>> ieInpNodes(nodes.size());
+        for (size_t i = 0; i < nodes.size(); i++)
+        {
+            ieInpNodes[i] = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
+
+            float input_sc = !coeffs.empty() ? coeffs[i] : 1.0f;
+            float input_zp = op == PROD ? zeropoints[i] : 0.0f;
+            ieInpNodes[i] = ngraphDequantize(ieInpNodes[i], input_sc, input_zp);
+        }
+
+        auto res = ieInpNodes[0];
+        for (size_t i = 1; i < ieInpNodes.size(); i++)
+        {
+            switch (op) {
+                case SUM:  res = std::make_shared<ov::op::v1::Add>(res, ieInpNodes[i]); break;
+                case PROD: res = std::make_shared<ov::op::v1::Multiply>(res, ieInpNodes[i]); break;
+                case MAX:  res = std::make_shared<ov::op::v1::Maximum>(res, ieInpNodes[i]); break;
+                default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
+            }
+        }
+
+        res = ngraphQuantize(res, 1.0f, offset);
+
+        return new InfEngineNgraphNode(res);
+    }
+#endif  // HAVE_DNN_NGRAPH
+
     class EltwiseInvoker : public ParallelLoopBody
     {
         EltwiseLayerInt8Impl& self;
diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp
index 867f002dd4ba..105b2dbaac28 100644
--- a/modules/dnn/src/int8layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp
@@ -5,6 +5,7 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "../op_timvx.hpp"
+#include "../ie_ngraph.hpp"
 
 #include <opencv2/dnn/shape_utils.hpp>
 
@@ -86,7 +87,8 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
                return false;
         }
 
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
@@ -226,7 +228,7 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
     {
     public:
         FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0),
-                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false) {}
+                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false), useRVV(false) {}
 
         static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier,
                         const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp)
@@ -251,6 +253,7 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
             p.useAVX2 = checkHardwareSupport(CPU_AVX2);
             p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
             p.useLASX = checkHardwareSupport(CPU_LASX);
+            p.useRVV = checkHardwareSupport(CPU_RVV);
 
             parallel_for_(Range(0, nstripes), p, nstripes);
         }
@@ -300,10 +303,20 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
                 if( useLASX )
                     opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
                 else
+            #endif
+            #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+                if( useRVV)
+                    opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
+                else
+            #endif
+            #if CV_RVP052
+                if( 1 )
+                    opt_RVP052::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
+                else
             #endif
                 {
                     int i = 0;
-            #if CV_SIMD
+            #if CV_SIMD128
                     for( ; i  <= nw - 4; i += 4, wptr += 4*wstep )
                     {
                         v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(),
@@ -321,8 +334,8 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
                             vs3 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*3 + k), vs3);
                         }
 
-                        s += v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3));
-                        v_int32x4 out = outzp + v_round(v_cvt_f32(s)*mult);
+                        s = v_add(s, v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3)));
+                        v_int32x4 out = v_add(outzp, v_round(v_mul(v_cvt_f32(s), mult)));
                         v_store(dptr + i, v_min(v_max(out, outmin), outmax));
                     }
             #endif
@@ -356,6 +369,7 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
         bool useAVX2;
         bool useAVX512;
         bool useLASX;
+        bool useRVV;
     };
 
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
@@ -395,6 +409,77 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
 
     }
 
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        CV_CheckTypeEQ(blobs[0].type(), CV_8S, "");  // weights
+        CV_CheckTypeEQ(blobs[1].type(), CV_32S, "");  // bias
+        CV_CheckTypeEQ(outputMultiplier.type(), CV_32F, "");
+
+        ov::Output<ov::Node> input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        ov::Output<ov::Node> ieWeights, ieBias, matmul;
+        bool transA = false, transB = true;
+        size_t numOutput = blobs[0].size[0];
+
+        if (nodes.size() == 2)
+        {
+            CV_Error(Error::StsNotImplemented, "");
+            // auto inp2 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
+            // matmul = std::make_shared<ov::op::v0::MatMul>(ieInpNode, inp2, transA, transB);
+        }
+        else
+        {
+            std::vector<int> shape(1 + normalize_axis(axis, input.get_shape().size()), 0);
+            shape[shape.size() - 1] = -1;
+            input = std::make_shared<ov::op::v1::Reshape>(
+                input,
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{shape.size()}, shape.data()),
+                true
+            );
+
+            input = ngraphDequantize(input, input_sc, input_zp);
+
+            const float low = -128, high = 127;
+            std::vector<float> inpLows(numOutput, low);
+            std::vector<float> inpHighs(numOutput, high);
+            std::vector<float> outLows(numOutput);
+            std::vector<float> outHighs(numOutput);
+            for (int i = 0; i < numOutput; ++i) {
+                outLows[i] = low * outputMultiplier.ptr<float>()[i] * output_sc / input_sc;
+                outHighs[i] = high * outputMultiplier.ptr<float>()[i] * output_sc / input_sc;
+            }
+
+            std::vector<size_t> weight_shape{(size_t)blobs[0].size[0], (size_t)blobs[0].size[1]};
+            ieWeights = std::make_shared<ov::op::v0::Constant>(ov::element::i8, weight_shape, blobs[0].data);
+            ieWeights = std::make_shared<ov::op::v0::Convert>(ieWeights, ov::element::f32);
+            ieWeights = std::make_shared<ov::op::v0::FakeQuantize>(ieWeights,
+                std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{numOutput, 1}, inpLows.data()),
+                std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{numOutput, 1}, inpHighs.data()),
+                std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{numOutput, 1}, outLows.data()),
+                std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{numOutput, 1}, outHighs.data()),
+                256 // levels
+            );
+            matmul = std::make_shared<ov::op::v0::MatMul>(input, ieWeights, transA, transB);
+        }
+
+        if (blobs.size() > 1) {
+            int32_t* bias = blobs[1].ptr<int32_t>();
+            std::vector<float> ovBias(blobs[1].total());
+            for (int i = 0; i < ovBias.size(); ++i) {
+                ovBias[i] = (bias[i] + input_zp * cv::sum(blobs[0].row(i))[0]) * outputMultiplier.ptr<float>()[i] * output_sc;
+            }
+            auto bias_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                                            ov::Shape{blobs[1].total()}, ovBias.data());
+            matmul = std::make_shared<ov::op::v1::Add>(matmul, bias_node);
+        }
+
+        matmul = ngraphQuantize(matmul, output_sc, output_zp);
+
+        return new InfEngineNgraphNode(matmul);
+    }
+#endif  // HAVE_DNN_NGRAPH
+
     Mat weightsMat, biasMat, outputMultiplier, activationLUT;
     Ptr<ActivationLayerInt8> activ;
 };
diff --git a/modules/dnn/src/int8layers/layers_common.hpp b/modules/dnn/src/int8layers/layers_common.hpp
index 5fdafbeab830..4612feed48d8 100644
--- a/modules/dnn/src/int8layers/layers_common.hpp
+++ b/modules/dnn/src/int8layers/layers_common.hpp
@@ -13,6 +13,8 @@
 #include "int8layers/layers_common.simd_declarations.hpp"
 #undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
+#include "./layers_rvp052.hpp"
+
 #ifdef HAVE_OPENCL
 #include "../ocl4dnn/include/ocl4dnn.hpp"
 #endif
diff --git a/modules/dnn/src/int8layers/layers_common.simd.hpp b/modules/dnn/src/int8layers/layers_common.simd.hpp
index 1b3ac7a4b81a..7f9dca505e27 100644
--- a/modules/dnn/src/int8layers/layers_common.simd.hpp
+++ b/modules/dnn/src/int8layers/layers_common.simd.hpp
@@ -1257,5 +1257,440 @@ void fastGEMM1T( const int8_t* vec, const int8_t* weights,
 }
 #endif // CV_LASX
 
+#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+
+static const size_t __cv_rvv_e8m1_max = __riscv_vsetvlmax_e8m1();
+static const size_t __cv_rvv_e16m1_max = __riscv_vsetvlmax_e16m1();
+static const size_t __cv_rvv_e32m2_max = __riscv_vsetvlmax_e32m2();
+
+inline vint32m2_t __riscv_vwmacc_vv_i32m2(vint32m2_t& dst, const vint8m1_t& a, const vint8m1_t& b, size_t vl) {
+    vint16m2_t tmp = __riscv_vwmul(a, b, vl);
+    dst = __riscv_vwadd_wv_i32m2_tu(dst, dst, __riscv_vget_i16m1(tmp, 0), vl);
+    dst = __riscv_vwadd_wv_i32m2_tu(dst, dst, __riscv_vget_i16m1(tmp, 1), vl > __cv_rvv_e16m1_max ? vl - __cv_rvv_e16m1_max : 0);
+    return dst;
+}
+
+enum { FASCONV_BASE_VECSZ = 4 };
+void fastConv( const int8_t* weights, size_t wstep, const int* bias,
+               const int8_t* rowbuf, int* output, const int* outShape,
+               int blockSize, int vecsize, int vecsize_aligned, int outZp,
+               const float* multiplier, bool initOutput, bool finalOutput )
+{
+    const size_t e8m1 = __cv_rvv_e8m1_max;
+    int outCn = outShape[1];
+    size_t outPlaneSize = outShape[2]*outShape[3];
+    // now compute dot product of the weights
+    // and im2row-transformed part of the tensor
+    for( int i = 0; i < outCn; i += 3 )
+    {
+        int unroll_tail = FASCONV_BASE_VECSZ;
+        const int8_t* wptr0 = weights + i*wstep;
+        const int8_t* wptr1 = wptr0 + wstep;
+        const int8_t* wptr2 = wptr1 + wstep;
+        int* outptr0 = output + i*outPlaneSize;
+        int* outptr1 = outptr0 + outPlaneSize;
+        int* outptr2 = outptr1 + outPlaneSize;
+        int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
+        float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2];
+
+        if( i+2 >= outCn )
+        {
+            wptr2 = wptr1;
+            outptr2 = outptr1;
+            bias2 = bias1;
+            mult2 = mult1;
+            if( i+1 >= outCn )
+            {
+                wptr2 = wptr1 = wptr0;
+                outptr2 = outptr1 = outptr0;
+                bias2 = bias1 = bias0;
+                mult2 = mult1 = mult0;
+            }
+        }
+
+        int j = 0;
+        for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
+        {
+            const int8_t* rptr = rowbuf + j*vecsize_aligned;
+            const int8_t *rptr1 = rptr + vecsize_aligned*1,
+                        *rptr2 = rptr + vecsize_aligned*2,
+                        *rptr3 = rptr + vecsize_aligned*3;
+
+            if (j + FASCONV_BASE_VECSZ > blockSize)
+            {
+                unroll_tail = blockSize - j;
+                rptr1 = rptr + vecsize_aligned*std::min(1, unroll_tail-1);
+                rptr2 = rptr + vecsize_aligned*std::min(2, unroll_tail-1);
+                rptr3 = rptr + vecsize_aligned*std::min(3, unroll_tail-1);
+            }
+
+            int vl, avl = vecsize;
+
+            vint32m2_t
+                vs00 = __riscv_vmv_v_x_i32m2(0, e8m1), vs10 = __riscv_vmv_v_x_i32m2(0, e8m1), vs20 = __riscv_vmv_v_x_i32m2(0, e8m1),
+                vs01 = __riscv_vmv_v_x_i32m2(0, e8m1), vs11 = __riscv_vmv_v_x_i32m2(0, e8m1), vs21 = __riscv_vmv_v_x_i32m2(0, e8m1),
+                vs02 = __riscv_vmv_v_x_i32m2(0, e8m1), vs12 = __riscv_vmv_v_x_i32m2(0, e8m1), vs22 = __riscv_vmv_v_x_i32m2(0, e8m1),
+                vs03 = __riscv_vmv_v_x_i32m2(0, e8m1), vs13 = __riscv_vmv_v_x_i32m2(0, e8m1), vs23 = __riscv_vmv_v_x_i32m2(0, e8m1);
+            for (int k = 0; k < vecsize; k += vl, avl -= vl)
+            {
+                vl = __riscv_vsetvl_e8m1(avl);
+
+                vint8m1_t w0 = (__riscv_vle8_v_i8m1(wptr0 + k, vl));
+                vint8m1_t w1 = (__riscv_vle8_v_i8m1(wptr1 + k, vl));
+                vint8m1_t w2 = (__riscv_vle8_v_i8m1(wptr2 + k, vl));
+                vint8m1_t r0 = (__riscv_vle8_v_i8m1(rptr, vl));
+
+
+                vs00 = __riscv_vwmacc_vv_i32m2(vs00, w0, r0, vl);
+                vs10 = __riscv_vwmacc_vv_i32m2(vs10, w1, r0, vl);
+                vs20 = __riscv_vwmacc_vv_i32m2(vs20, w2, r0, vl);
+
+                r0 = (__riscv_vle8_v_i8m1(rptr1, vl));
+                vs01 = __riscv_vwmacc_vv_i32m2(vs01, w0, r0, vl);
+                vs11 = __riscv_vwmacc_vv_i32m2(vs11, w1, r0, vl);
+                vs21 = __riscv_vwmacc_vv_i32m2(vs21, w2, r0, vl);
+
+                r0 = (__riscv_vle8_v_i8m1(rptr2, vl));
+                vs02 = __riscv_vwmacc_vv_i32m2(vs02, w0, r0, vl);
+                vs12 = __riscv_vwmacc_vv_i32m2(vs12, w1, r0, vl);
+                vs22 = __riscv_vwmacc_vv_i32m2(vs22, w2, r0, vl);
+
+                r0 = (__riscv_vle8_v_i8m1(rptr3, vl));
+                vs03 = __riscv_vwmacc_vv_i32m2(vs03, w0, r0, vl);
+                vs13 = __riscv_vwmacc_vv_i32m2(vs13, w1, r0, vl);
+                vs23 = __riscv_vwmacc_vv_i32m2(vs23, w2, r0, vl);
+
+                rptr += vl;  rptr1 += vl; rptr2 += vl; rptr3 += vl;
+            }
+
+            // compute sum of each vs
+            vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, e8m1);
+            int sum0[FASCONV_BASE_VECSZ], sum1[FASCONV_BASE_VECSZ], sum2[FASCONV_BASE_VECSZ];
+
+            sum0[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs00, zero, e8m1));
+            sum0[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs01, zero, e8m1));
+            sum0[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs02, zero, e8m1));
+            sum0[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs03, zero, e8m1));
+
+            sum1[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs10, zero, e8m1));
+            sum1[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs11, zero, e8m1));
+            sum1[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs12, zero, e8m1));
+            sum1[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs13, zero, e8m1));
+
+            sum2[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs20, zero, e8m1));
+            sum2[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs21, zero, e8m1));
+            sum2[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs22, zero, e8m1));
+            sum2[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs23, zero, e8m1));
+
+            vint32m1_t s0, s1, s2;
+            if( initOutput )
+            {
+                s0 = __riscv_vmv_v_x_i32m1(bias0, unroll_tail);
+                s1 = __riscv_vmv_v_x_i32m1(bias1, unroll_tail);
+                s2 = __riscv_vmv_v_x_i32m1(bias2, unroll_tail);
+            }
+            else
+            {
+                s0 = __riscv_vle32_v_i32m1(outptr0 + j, unroll_tail);
+                s1 = __riscv_vle32_v_i32m1(outptr1 + j, unroll_tail);
+                s2 = __riscv_vle32_v_i32m1(outptr2 + j, unroll_tail);
+            }
+            s0 = __riscv_vadd(__riscv_vle32_v_i32m1(sum0, unroll_tail), s0, unroll_tail);
+            s1 = __riscv_vadd(__riscv_vle32_v_i32m1(sum1, unroll_tail), s1, unroll_tail);
+            s2 = __riscv_vadd(__riscv_vle32_v_i32m1(sum2, unroll_tail), s2, unroll_tail);
+
+            if( finalOutput )
+            {
+                s0 = __riscv_vadd(__riscv_vfcvt_x_f_v_i32m1(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m1(s0, unroll_tail), mult0, unroll_tail), unroll_tail), outZp, unroll_tail);
+                s1 = __riscv_vadd(__riscv_vfcvt_x_f_v_i32m1(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m1(s1, unroll_tail), mult1, unroll_tail), unroll_tail), outZp, unroll_tail);
+                s2 = __riscv_vadd(__riscv_vfcvt_x_f_v_i32m1(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m1(s2, unroll_tail), mult2, unroll_tail), unroll_tail), outZp, unroll_tail);
+
+                s0 = __riscv_vmin(__riscv_vmax(s0, -128, unroll_tail), 127, unroll_tail);
+                s1 = __riscv_vmin(__riscv_vmax(s1, -128, unroll_tail), 127, unroll_tail);
+                s2 = __riscv_vmin(__riscv_vmax(s2, -128, unroll_tail), 127, unroll_tail);
+            }
+
+            __riscv_vse32(outptr0 + j, s0, unroll_tail);
+            __riscv_vse32(outptr1 + j, s1, unroll_tail);
+            __riscv_vse32(outptr2 + j, s2, unroll_tail);
+        }
+    }
+}
+
+void fastDepthwiseConv( const int8_t* wptr,
+                     int kernel_h, int kernel_w,
+                     int stride_h, int stride_w,
+                     int dilation_h, int dilation_w,
+                     int pad_t, int pad_l,
+                     const int* biasptr, const float* multptr,
+                     const int8_t* inptr_,
+                     int height, int width,
+                     int* outptr_,
+                     int out_d, int outH, int outW,
+                     int inpZp, int outZp)
+{
+    int vl;
+    const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
+                w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
+                w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
+    int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
+    float mult = multptr[out_d];
+    int bias = biasptr[out_d];
+    int biasCopy;
+
+    for (int out_i = 0; out_i < outH; out_i++)
+    {
+        int in_i = out_i * stride_h - pad_t, out_j = 0;
+        const int8_t* imgptr0 = inptr_ + in_i*width;
+        const int8_t* imgptr1 = imgptr0 + dilation_h*width;
+        const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width;
+        int8_t  w00 = w00_, w01 = w01_, w02 = w02_;
+        int8_t w20 = w20_, w21 = w21_, w22 = w22_;
+        int out, out1;
+        biasCopy = bias;
+        if (in_i < 0)
+        {
+            biasCopy += inpZp * (w00 + w01 + w02);
+            w00 = w01 = w02 = 0;
+            imgptr0 = imgptr1;
+        }
+        else if (in_i + dilation_h*(kernel_h-1) >= height)
+        {
+            biasCopy += inpZp * (w20 + w21 + w22);
+            w20 = w21 = w22 = 0;
+            imgptr2 = imgptr1;
+        }
+        int* outptr = outptr_ + out_i*outW;
+        if (pad_l > 0)
+        {
+            out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 +
+                  (int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 +
+                  (int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 +
+                  biasCopy + inpZp*(w00 + w10 + w20);
+            out1 = outZp + (int)std::round(out*mult);
+            outptr[0] = std::min(std::max(out1, -128), 127);
+            out_j = 1;
+        }
+        if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
+        {
+            int avl = outW1 - out_j;
+            if( stride_w == 1 )
+                for( ; out_j < outW1; out_j += vl, avl -= vl)
+                {
+                    vl = __riscv_vsetvl_e8m2(avl);
+                    int in_j = out_j * stride_w - pad_l;
+
+                    vint32m8_t vout = __riscv_vmv_v_x_i32m8(biasCopy, vl);
+                    vout = __riscv_vwmacc(vout, w00, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr0 + in_j               , vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w01, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr0 + in_j + dilation_w  , vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w02, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr0 + in_j + dilation_w*2, vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w10, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr1 + in_j               , vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w11, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr1 + in_j + dilation_w  , vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w12, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr1 + in_j + dilation_w*2, vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w20, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr2 + in_j               , vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w21, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr2 + in_j + dilation_w  , vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w22, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr2 + in_j + dilation_w*2, vl), vl), vl);
+
+                    vout = __riscv_vfcvt_x(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m8(vout, vl), mult, vl), vl);
+                    vout = __riscv_vadd(vout, outZp, vl);
+                    vout = __riscv_vmin(__riscv_vmax(vout, -128, vl), 127, vl);
+
+                    __riscv_vse32_v_i32m8(outptr + out_j, vout, vl);
+
+                }
+            else //stride_w == 2 && dilation_w == 1;
+            {
+                for( ; out_j < outW1; out_j += vl, avl -= vl)
+                {
+                    vl = __riscv_vsetvl_e8m2(avl);
+                    int in_j = out_j * stride_w - pad_l;
+
+                    vint32m8_t vout = __riscv_vmv_v_x_i32m8(biasCopy, vl);
+
+                    vout = __riscv_vwmacc(vout, w00, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr0+in_j  , 2, vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w01, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr0+in_j+1, 2, vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w02, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr0+in_j+2, 2, vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w10, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr1+in_j  , 2, vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w11, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr1+in_j+1, 2, vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w12, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr1+in_j+2, 2, vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w20, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr2+in_j  , 2, vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w21, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr2+in_j+1, 2, vl), vl), vl);
+                    vout = __riscv_vwmacc(vout, w22, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr2+in_j+2, 2, vl), vl), vl);
+
+                    vout = __riscv_vfcvt_x(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m8(vout, vl), mult, vl), vl);
+                    vout = __riscv_vadd(vout, outZp, vl);
+                    vout = __riscv_vmin(__riscv_vmax(vout, -128, vl), 127, vl);
+
+                    __riscv_vse32_v_i32m8(outptr + out_j, vout, vl);
+                }
+            }
+        }
+
+        for (; out_j < outW1; out_j++)
+        {
+            int in_j = out_j * stride_w - pad_l;
+            out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 +
+                  (int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 +
+                  (int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy;
+            outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
+        }
+
+        for (; out_j < outW; out_j++ )
+        {
+            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
+            int s0 = 1, s1 = 1, s2 = 1;
+            if (in_j0 >= width)
+            {
+                in_j0 = 0;
+                s0 = 0;
+                biasCopy += inpZp*(w00 + w10 + w20);
+            }
+            if (in_j1 >= width)
+            {
+                in_j1 = 0;
+                s1 = 0;
+                biasCopy += inpZp*(w01 + w11 + w21);
+            }
+            if (in_j2 >= width)
+            {
+                in_j2 = 0;
+                s2 = 0;
+                biasCopy += inpZp*(w02 + w12 + w22);
+            }
+            out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 +
+                  (int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 +
+                  (int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy;
+            outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
+        }
+    }
+}
+
+void fastGEMM1T( const int8_t* vec, const int8_t* weights,
+                 size_t wstep, const int* bias, const float* multiplier,
+                 int* dst, int nvecs, int vecsize, int outZp )
+{
+    int i = 0;
+    for( ; i <= nvecs - 15; i += 15 )
+    {
+        const int8_t* wptr = weights + i*wstep;
+        vint32m2_t
+               vs0 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs1 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs2 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs3 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs4 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs5 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs6 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs7 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs8 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs9 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs10 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs11 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs12 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs13 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs14 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max);
+        int avl = vecsize, vl;
+        for(int k = 0 ; k < vecsize; k += vl, wptr += vl, avl -= vl)
+        {
+            vl = __riscv_vsetvl_e8m1(avl);
+            vint8m1_t v = __riscv_vle8_v_i8m1(vec + k, vl);
+
+            vs0 = __riscv_vwmacc_vv_i32m2(vs0, __riscv_vle8_v_i8m1(wptr, vl), v, vl);
+            vs1 = __riscv_vwmacc_vv_i32m2(vs1, __riscv_vle8_v_i8m1(wptr + wstep, vl), v, vl);
+            vs2 = __riscv_vwmacc_vv_i32m2(vs2, __riscv_vle8_v_i8m1(wptr + wstep*2, vl), v, vl);
+            vs3 = __riscv_vwmacc_vv_i32m2(vs3, __riscv_vle8_v_i8m1(wptr + wstep*3, vl), v, vl);
+            vs4 = __riscv_vwmacc_vv_i32m2(vs4, __riscv_vle8_v_i8m1(wptr + wstep*4, vl), v, vl);
+            vs5 = __riscv_vwmacc_vv_i32m2(vs5, __riscv_vle8_v_i8m1(wptr + wstep*5, vl), v, vl);
+            vs6 = __riscv_vwmacc_vv_i32m2(vs6, __riscv_vle8_v_i8m1(wptr + wstep*6, vl), v, vl);
+            vs7 = __riscv_vwmacc_vv_i32m2(vs7, __riscv_vle8_v_i8m1(wptr + wstep*7, vl), v, vl);
+            vs8 = __riscv_vwmacc_vv_i32m2(vs8, __riscv_vle8_v_i8m1(wptr + wstep*8, vl), v, vl);
+            vs9 = __riscv_vwmacc_vv_i32m2(vs9, __riscv_vle8_v_i8m1(wptr + wstep*9, vl), v, vl);
+            vs10 = __riscv_vwmacc_vv_i32m2(vs10, __riscv_vle8_v_i8m1(wptr + wstep*10, vl), v, vl);
+            vs11 = __riscv_vwmacc_vv_i32m2(vs11, __riscv_vle8_v_i8m1(wptr + wstep*11, vl), v, vl);
+            vs12 = __riscv_vwmacc_vv_i32m2(vs12, __riscv_vle8_v_i8m1(wptr + wstep*12, vl), v, vl);
+            vs13 = __riscv_vwmacc_vv_i32m2(vs13, __riscv_vle8_v_i8m1(wptr + wstep*13, vl), v, vl);
+            vs14 = __riscv_vwmacc_vv_i32m2(vs14, __riscv_vle8_v_i8m1(wptr + wstep*14, vl), v, vl);
+        }
+
+        int sum[15];
+        vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, __cv_rvv_e32m2_max);
+        sum[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs0, zero, __cv_rvv_e32m2_max));
+        sum[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs1, zero, __cv_rvv_e32m2_max));
+        sum[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs2, zero, __cv_rvv_e32m2_max));
+        sum[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs3, zero, __cv_rvv_e32m2_max));
+        sum[4] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs4, zero, __cv_rvv_e32m2_max));
+        sum[5] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs5, zero, __cv_rvv_e32m2_max));
+        sum[6] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs6, zero, __cv_rvv_e32m2_max));
+        sum[7] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs7, zero, __cv_rvv_e32m2_max));
+        sum[8] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs8, zero, __cv_rvv_e32m2_max));
+        sum[9] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs9, zero, __cv_rvv_e32m2_max));
+        sum[10] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs10, zero, __cv_rvv_e32m2_max));
+        sum[11] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs11, zero, __cv_rvv_e32m2_max));
+        sum[12] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs12, zero, __cv_rvv_e32m2_max));
+        sum[13] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs13, zero, __cv_rvv_e32m2_max));
+        sum[14] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs14, zero, __cv_rvv_e32m2_max));
+
+        vint32m4_t s0 = __riscv_vadd(__riscv_vle32_v_i32m4(sum, 15), __riscv_vle32_v_i32m4(bias + i, 15), 15);
+
+        s0 = __riscv_vfcvt_x(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m4(s0, 15), __riscv_vle32_v_f32m4(multiplier + i, 15), 15), 15);
+        s0 = __riscv_vadd(s0, outZp, 15);
+        s0 = __riscv_vmin(__riscv_vmax(s0, -128, 15), 127, 15);
+        __riscv_vse32_v_i32m4(dst + i, s0, 15);
+    }
+    int unroll_tail = nvecs - i;
+    if (unroll_tail > 0)
+    {
+        const int8_t* wptr = weights + i*wstep;
+        vint32m2_t
+               vs0 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs1 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs2 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs3 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs4 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs5 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs6 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs7 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs8 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs9 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs10 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs11 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max),
+               vs12 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs13 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max);
+        int avl = vecsize, vl;
+        for(int k = 0 ; k < vecsize; k += vl, wptr += vl, avl -= vl)
+        {
+            vl = __riscv_vsetvl_e8m1(avl);
+            vint8m1_t v = __riscv_vle8_v_i8m1(vec + k, vl);
+
+            vs0 = __riscv_vwmacc_vv_i32m2(vs0, __riscv_vle8_v_i8m1(wptr, vl), v, vl);
+            vs1 = __riscv_vwmacc_vv_i32m2(vs1, __riscv_vle8_v_i8m1(wptr + wstep*std::min(1, unroll_tail-1), vl), v, vl);
+            vs2 = __riscv_vwmacc_vv_i32m2(vs2, __riscv_vle8_v_i8m1(wptr + wstep*std::min(2, unroll_tail-1), vl), v, vl);
+            vs3 = __riscv_vwmacc_vv_i32m2(vs3, __riscv_vle8_v_i8m1(wptr + wstep*std::min(3, unroll_tail-1), vl), v, vl);
+            vs4 = __riscv_vwmacc_vv_i32m2(vs4, __riscv_vle8_v_i8m1(wptr + wstep*std::min(4, unroll_tail-1), vl), v, vl);
+            vs5 = __riscv_vwmacc_vv_i32m2(vs5, __riscv_vle8_v_i8m1(wptr + wstep*std::min(5, unroll_tail-1), vl), v, vl);
+            vs6 = __riscv_vwmacc_vv_i32m2(vs6, __riscv_vle8_v_i8m1(wptr + wstep*std::min(6, unroll_tail-1), vl), v, vl);
+            vs7 = __riscv_vwmacc_vv_i32m2(vs7, __riscv_vle8_v_i8m1(wptr + wstep*std::min(7, unroll_tail-1), vl), v, vl);
+            vs8 = __riscv_vwmacc_vv_i32m2(vs8, __riscv_vle8_v_i8m1(wptr + wstep*std::min(8, unroll_tail-1), vl), v, vl);
+            vs9 = __riscv_vwmacc_vv_i32m2(vs9, __riscv_vle8_v_i8m1(wptr + wstep*std::min(9, unroll_tail-1), vl), v, vl);
+            vs10 = __riscv_vwmacc_vv_i32m2(vs10, __riscv_vle8_v_i8m1(wptr + wstep*std::min(10, unroll_tail-1), vl), v, vl);
+            vs11 = __riscv_vwmacc_vv_i32m2(vs11, __riscv_vle8_v_i8m1(wptr + wstep*std::min(11, unroll_tail-1), vl), v, vl);
+            vs13 = __riscv_vwmacc_vv_i32m2(vs13, __riscv_vle8_v_i8m1(wptr + wstep*std::min(12, unroll_tail-1), vl), v, vl);
+            vs12 = __riscv_vwmacc_vv_i32m2(vs12, __riscv_vle8_v_i8m1(wptr + wstep*std::min(13, unroll_tail-1), vl), v, vl);
+        }
+
+        int sum[14];
+        vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, __cv_rvv_e32m2_max);
+        sum[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs0, zero, __cv_rvv_e32m2_max));
+        sum[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs1, zero, __cv_rvv_e32m2_max));
+        sum[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs2, zero, __cv_rvv_e32m2_max));
+        sum[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs3, zero, __cv_rvv_e32m2_max));
+        sum[4] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs4, zero, __cv_rvv_e32m2_max));
+        sum[5] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs5, zero, __cv_rvv_e32m2_max));
+        sum[6] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs6, zero, __cv_rvv_e32m2_max));
+        sum[7] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs7, zero, __cv_rvv_e32m2_max));
+        sum[8] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs8, zero, __cv_rvv_e32m2_max));
+        sum[9] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs9, zero, __cv_rvv_e32m2_max));
+        sum[10] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs10, zero, __cv_rvv_e32m2_max));
+        sum[11] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs11, zero, __cv_rvv_e32m2_max));
+        sum[12] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs12, zero, __cv_rvv_e32m2_max));
+        sum[13] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs13, zero, __cv_rvv_e32m2_max));
+
+        vint32m4_t s0 = __riscv_vadd(__riscv_vle32_v_i32m4(sum, unroll_tail), __riscv_vle32_v_i32m4(bias + i, unroll_tail), unroll_tail);
+
+        s0 = __riscv_vfcvt_x(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m4(s0, unroll_tail), __riscv_vle32_v_f32m4(multiplier + i, unroll_tail), unroll_tail), unroll_tail);
+        s0 = __riscv_vadd(s0, outZp, unroll_tail);
+        s0 = __riscv_vmin(__riscv_vmax(s0, -128, unroll_tail), 127, unroll_tail);
+        __riscv_vse32_v_i32m4(dst + i, s0, unroll_tail);
+    }
+}
+
+#endif // CV_RVV
+
 CV_CPU_OPTIMIZATION_NAMESPACE_END
 }} // namespace
diff --git a/modules/dnn/src/int8layers/layers_rvp052.cpp b/modules/dnn/src/int8layers/layers_rvp052.cpp
new file mode 100644
index 000000000000..628882a43fa9
--- /dev/null
+++ b/modules/dnn/src/int8layers/layers_rvp052.cpp
@@ -0,0 +1,221 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "./layers_rvp052.hpp"
+
+#if CV_RVP052
+
+namespace cv {
+namespace dnn {
+namespace opt_RVP052 {
+
+void fastConv(const int8_t *weights, size_t wstep, const int *bias,
+              const int8_t *rowbuf, int *output, const int *outShape,
+              int blockSize, int vecsize, int vecsize_aligned, int outZp,
+              const float *multiplier, bool initOutput, bool finalOutput)
+{
+    int outCn = outShape[1];
+    size_t outPlaneSize = outShape[2] * outShape[3];
+    for (int i = 0; i < outCn; i += 2)
+    {
+        const int8_t *wptr0 = weights + i * wstep;
+        const int8_t *wptr1 = wptr0 + wstep;
+        int *outptr0 = output + i * outPlaneSize;
+        int *outptr1 = outptr0 + outPlaneSize;
+        int bias0 = bias[i], bias1 = bias[i + 1];
+        float mult0 = multiplier[i], mult1 = multiplier[i + 1];
+
+        if (i + 1 >= outCn)
+        {
+            wptr1 = wptr0;
+            outptr1 = outptr0;
+            bias1 = bias0;
+            mult1 = mult0;
+        }
+        int j = 0;
+        for (; j < blockSize; j++)
+        {
+            const int8_t *rptr = rowbuf + j * vecsize_aligned;
+            int s00 = initOutput ? bias0 : outptr0[j];
+            int s10 = initOutput ? bias1 : outptr1[j];
+
+            int32x2_t vsx0 = {s00, s10};
+
+            for (int k = 0; k < vecsize; k += 4)
+            {
+                int8x4_t vrptr[2] = {*(int8x4_t*)(rptr + k), *(int8x4_t*)(rptr + k)};
+                int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)};
+                vsx0 = __nds__v_smaqa(vsx0, *(int8x8_t*)vwptr, *(int8x8_t*)vrptr);
+            }
+
+            if (finalOutput)
+            {
+                vsx0[0] = outZp + (int)std::round(vsx0[0] * mult0);
+                vsx0[1] = outZp + (int)std::round(vsx0[1] * mult1);
+                vsx0 = __nds__v_sclip32(vsx0, 7);
+            }
+
+            outptr0[j] = vsx0[0];
+            outptr1[j] = vsx0[1];
+        }
+    }
+}
+
+void fastDepthwiseConv(const int8_t *wptr,
+                       int kernel_h, int kernel_w,
+                       int stride_h, int stride_w,
+                       int dilation_h, int dilation_w,
+                       int pad_t, int pad_l,
+                       const int *biasptr, const float *multptr,
+                       const int8_t *inptr_,
+                       int height, int width,
+                       int *outptr_,
+                       int out_d, int outH, int outW,
+                       int inpZp, int outZp)
+{
+    const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
+                 w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
+                 w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
+    int outW1 = min(outW, (width - dilation_w * (kernel_w - 1) + pad_l) / stride_w);
+    int bias = biasptr[out_d], biasCopy;
+    float mult = multptr[out_d];
+
+    for (int out_i = 0; out_i < outH; out_i++)
+    {
+        int in_i = out_i * stride_h - pad_t, out_j = 0;
+        const int8_t *imgptr0 = inptr_ + in_i * width;
+        const int8_t *imgptr1 = imgptr0 + dilation_h * width;
+        const int8_t *imgptr2 = imgptr0 + (dilation_h * 2) * width;
+        int8_t w00 = w00_, w01 = w01_, w02 = w02_;
+        int8_t w20 = w20_, w21 = w21_, w22 = w22_;
+        int out;
+        biasCopy = bias;
+
+        if (in_i < 0)
+        {
+            biasCopy += inpZp * (w00 + w01 + w02);
+            w00 = w01 = w02 = 0;
+            imgptr0 = imgptr1;
+        }
+        else if (in_i + dilation_h * (kernel_h - 1) >= height)
+        {
+            biasCopy += inpZp * (w20 + w21 + w22);
+            w20 = w21 = w22 = 0;
+            imgptr2 = imgptr1;
+        }
+        int *outptr = outptr_ + out_i * outW;
+        if (pad_l > 0)
+        {
+            out = (int)imgptr0[0] * w01 + (int)imgptr0[dilation_w] * w02 +
+                  (int)imgptr1[0] * w11 + (int)imgptr1[dilation_w] * w12 +
+                  (int)imgptr2[0] * w21 + (int)imgptr2[dilation_w] * w22 +
+                  biasCopy + inpZp * (w00 + w10 + w20);
+            outptr[0] = __nds__sclip32(outZp + (int)std::round(out * mult), 7);
+            out_j = 1;
+        }
+
+        int8x8_t vwx0 = (int8x8_t){w00, w10, w20, 0, w00, w10, w20, 0};
+        int8x8_t vwx1 = (int8x8_t){w01, w11, w21, 0, w01, w11, w21, 0};
+        int8x8_t vwx2 = (int8x8_t){w02, w12, w22, 0, w02, w12, w22, 0};
+        int8x8_t vimgx0, vimgx1, vimgx2;
+        int32x2_t vout = {0, 0};
+        for (; out_j < outW1; out_j+=2)
+        {
+            int in_j = out_j * stride_w - pad_l;
+            vimgx0 = (int8x8_t){imgptr0[in_j], imgptr1[in_j], imgptr2[in_j], 0,
+                                imgptr0[in_j + stride_w], imgptr1[in_j + stride_w], imgptr2[in_j + stride_w], 0};
+            vimgx1 = (int8x8_t){imgptr0[in_j + dilation_w], imgptr1[in_j + dilation_w], imgptr2[in_j + dilation_w], 0,
+                                imgptr0[in_j + dilation_w + stride_w], imgptr1[in_j + dilation_w + stride_w], imgptr2[in_j + dilation_w + stride_w], 0};
+            vimgx2 = (int8x8_t){imgptr0[in_j + dilation_w * 2], imgptr1[in_j + dilation_w * 2], imgptr2[in_j + dilation_w * 2], 0,
+                                imgptr0[in_j + dilation_w * 2 + stride_w], imgptr1[in_j + dilation_w * 2 + stride_w], imgptr2[in_j + dilation_w * 2 + stride_w], 0};
+
+            vout = (int32x2_t){biasCopy, biasCopy};
+            vout = __nds__v_smaqa(vout, vwx0, vimgx0);
+            vout = __nds__v_smaqa(vout, vwx1, vimgx1);
+            vout = __nds__v_smaqa(vout, vwx2, vimgx2);
+
+            outptr[out_j] = __nds__sclip32(outZp + (int)std::round(vout[0] * mult), 7);
+            outptr[out_j + 1] = __nds__sclip32(outZp + (int)std::round(vout[1] * mult), 7);
+        }
+
+        while (out_j > outW1) out_j--;
+
+        for (; out_j < outW; out_j++)
+        {
+            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w * 2;
+            int s0 = 1, s1 = 1, s2 = 1;
+            if (in_j0 >= width)
+            {
+                in_j0 = 0;
+                s0 = 0;
+                biasCopy += inpZp * (w00 + w10 + w20);
+            }
+            if (in_j1 >= width)
+            {
+                in_j1 = 0;
+                s1 = 0;
+                biasCopy += inpZp * (w01 + w11 + w21);
+            }
+            if (in_j2 >= width)
+            {
+                in_j2 = 0;
+                s2 = 0;
+                biasCopy += inpZp * (w02 + w12 + w22);
+            }
+            out = (int)imgptr0[in_j0] * w00 * s0 + (int)imgptr0[in_j1] * w01 * s1 + (int)imgptr0[in_j2] * w02 * s2 +
+                  (int)imgptr1[in_j0] * w10 * s0 + (int)imgptr1[in_j1] * w11 * s1 + (int)imgptr1[in_j2] * w12 * s2 +
+                  (int)imgptr2[in_j0] * w20 * s0 + (int)imgptr2[in_j1] * w21 * s1 + (int)imgptr2[in_j2] * w22 * s2 + biasCopy;
+            outptr[out_j] = __nds__sclip32(outZp + (int)std::round(out * mult), 7);
+        }
+    }
+}
+
+// dst = vec * weights^t + bias
+void fastGEMM1T( const int8_t* vec, const int8_t* weights,
+                 size_t wstep, const int* bias, const float* multiplier,
+                 int* dst, int nvecs, int vecsize, int outZp )
+{
+    int i = 0;
+
+    for( ; i <= nvecs - 2; i += 2 )
+    {
+        const int8_t* wptr0 = weights + i * wstep;
+        const int8_t* wptr1 = weights + (i + 1) * wstep;
+
+        int32x2_t vs0 = *(int32x2_t*)(bias + i);
+
+        for( int k = 0; k < vecsize; k += 4 )
+        {
+            int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), *(int8x4_t*)(vec + k)};
+            int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)};
+            vs0 = __nds__v_smaqa(vs0, *(int8x8_t*)vwptr, *(int8x8_t*)vvec);
+        }
+
+        int32x2_t vdst = {(int)std::round(vs0[0] * multiplier[i]), (int)std::round(vs0[1] * multiplier[i + 1])};
+
+        vdst = __nds__v_sclip32(vdst + outZp, 7);
+
+        *(int32x2_t*)(dst + i) = vdst;
+    }
+
+    for( ; i < nvecs; i++ )
+    {
+        const int8_t* wptr = weights + i * wstep;
+        int s0 = bias[i];
+
+        for( int k = 0; k < vecsize; k += 4 )
+        {
+            int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), 0};
+            int8x4_t vwptr[2] = {*(int8x4_t*)(wptr + k), 0};
+            s0 = __nds__smaqa(s0, *(unsigned long*)vwptr, *(unsigned long*)vvec);
+        }
+
+        dst[i] = __nds__sclip32(outZp + (int)std::round(s0 * multiplier[i]), 7);
+    }
+}
+
+}}} // namespace
+
+#endif
diff --git a/modules/dnn/src/int8layers/layers_rvp052.hpp b/modules/dnn/src/int8layers/layers_rvp052.hpp
new file mode 100644
index 000000000000..c956caf20c5d
--- /dev/null
+++ b/modules/dnn/src/int8layers/layers_rvp052.hpp
@@ -0,0 +1,36 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#if defined(__riscv) && defined(__riscv_dsp) && defined(__ANDES)
+# include <nds_intrinsic.h>
+# define CV_RVP052 1
+
+namespace cv {
+namespace dnn {
+namespace opt_RVP052 {
+
+void fastConv( const int8_t* weights, size_t wstep, const int* bias,
+               const int8_t* rowbuf, int* output, const int* outShape,
+               int blockSize, int vecsize, int vecsize_aligned, int outZp,
+               const float* multiplier, bool initOutput, bool finalOutput );
+void fastDepthwiseConv( const int8_t* wptr,
+                        int kernel_h, int kernel_w,
+                        int stride_h, int stride_w,
+                        int dilation_h, int dilation_w,
+                        int pad_t, int pad_l,
+                        const int* biasptr, const float* multptr,
+                        const int8_t* inptr_,
+                        int height, int width,
+                        int* outptr_,
+                        int out_d, int outH, int outW,
+                        int inpZp, int outZp );
+void fastGEMM1T( const int8_t* vec, const int8_t* weights,
+                 size_t wstep, const int* bias, const float* multiplier,
+                 int* dst, int nvecs, int vecsize, int outZp );
+
+}}}
+
+#else
+# define CV_RVP052 0
+#endif
diff --git a/modules/dnn/src/int8layers/pooling_layer.cpp b/modules/dnn/src/int8layers/pooling_layer.cpp
index a2dda5eb07ab..cfd04bd2f46e 100644
--- a/modules/dnn/src/int8layers/pooling_layer.cpp
+++ b/modules/dnn/src/int8layers/pooling_layer.cpp
@@ -5,11 +5,13 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "../op_timvx.hpp"
+#include "../ie_ngraph.hpp"
 #include "opencv2/core/hal/intrin.hpp"
 
 #include <float.h>
 #include <algorithm>
 #include <numeric>
+
 using std::max;
 using std::min;
 
@@ -124,6 +126,10 @@ class PoolingLayerInt8Impl CV_FINAL : public PoolingLayerInt8
                 return type == MAX || type == AVE;
             return false;
         }
+        else if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        {
+            return true;
+        }
 
         return false;
     }
@@ -271,6 +277,49 @@ class PoolingLayerInt8Impl CV_FINAL : public PoolingLayerInt8
         return Ptr<BackendNode>();
     }
 
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+
+        input = ngraphDequantize(input, input_sc, input_zp);
+
+        ov::op::PadType pad_type = ov::op::PadType::EXPLICIT;
+        if (!padMode.empty())
+            pad_type = padMode == "VALID" ? ov::op::PadType::VALID : ov::op::PadType::SAME_UPPER;
+
+        auto rounding_type = ceilMode ? ov::op::RoundingType::CEIL : ov::op::RoundingType::FLOOR;
+        ov::Output<ov::Node> pool;
+        if (type == MAX) {
+            pool = std::make_shared<ov::op::v1::MaxPool>(input, ov::Strides(strides),
+                        ov::Shape(pads_begin), ov::Shape(pads_end), ov::Shape(kernel_size),
+                        rounding_type, pad_type);
+        } else if (type == AVE) {
+            pool = std::make_shared<ov::op::v1::AvgPool>(input, ov::Strides(strides),
+                        ov::Shape(pads_begin), ov::Shape(pads_end), ov::Shape(kernel_size),
+                        !avePoolPaddedArea, rounding_type, pad_type);
+        } else if (type == SUM) {
+            ov::Shape inpShape = input.get_shape();
+            CV_Assert(inpShape.size() == 2 + kernel_size.size());
+            std::vector<int64_t> axes;
+            for (size_t i = 0; i < kernel_size.size(); i++)
+            {
+                if (inpShape[2 + i] == kernel_size[i])
+                    axes.push_back(2 + i);
+            }
+            auto reduction_axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{axes.size()}, axes);
+            pool = std::make_shared<ov::op::v1::ReduceSum>(input, reduction_axes, true);
+        } else {
+            CV_Error(Error::StsNotImplemented, format("INT8 Pooling type: %d", type));
+        }
+
+        pool = ngraphQuantize(pool, output_sc, output_zp);
+
+        return new InfEngineNgraphNode(pool);
+    }
+#endif  // HAVE_DNN_NGRAPH
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
     {
         CV_TRACE_FUNCTION();
@@ -583,17 +632,17 @@ class PoolingLayerInt8Impl CV_FINAL : public PoolingLayerInt8
                                                  (int)srcData[index + stride_w*10], (int)srcData[index + stride_w*11]);
                                     v_int32x4 v3((int)srcData[index + stride_w*12], (int)srcData[index + stride_w*13],
                                                  (int)srcData[index + stride_w*14], (int)srcData[index + stride_w*15]);
-                                    sum_val0 += v0;
-                                    sum_val1 += v1;
-                                    sum_val2 += v2;
-                                    sum_val3 += v3;
+                                    sum_val0 = v_add(sum_val0, v0);
+                                    sum_val1 = v_add(sum_val1, v1);
+                                    sum_val2 = v_add(sum_val2, v2);
+                                    sum_val3 = v_add(sum_val3, v3);
                                 }
                             }
 
-                            sum_val0 = v_round(v_cvt_f32(sum_val0)*ikarea) + voutzp;
-                            sum_val1 = v_round(v_cvt_f32(sum_val1)*ikarea) + voutzp;
-                            sum_val2 = v_round(v_cvt_f32(sum_val2)*ikarea) + voutzp;
-                            sum_val3 = v_round(v_cvt_f32(sum_val3)*ikarea) + voutzp;
+                            sum_val0 = v_add(v_round(v_mul(v_cvt_f32(sum_val0), ikarea)), voutzp);
+                            sum_val1 = v_add(v_round(v_mul(v_cvt_f32(sum_val1), ikarea)), voutzp);
+                            sum_val2 = v_add(v_round(v_mul(v_cvt_f32(sum_val2), ikarea)), voutzp);
+                            sum_val3 = v_add(v_round(v_mul(v_cvt_f32(sum_val3), ikarea)), voutzp);
 
                             v_store(dstData + x0, v_pack(v_pack(sum_val0, sum_val1), v_pack(sum_val2, sum_val3)));
                             x0 += 15;
diff --git a/modules/dnn/src/int8layers/quantization_utils.cpp b/modules/dnn/src/int8layers/quantization_utils.cpp
index a4a822efdd12..146ad68257c4 100644
--- a/modules/dnn/src/int8layers/quantization_utils.cpp
+++ b/modules/dnn/src/int8layers/quantization_utils.cpp
@@ -5,6 +5,7 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "../op_timvx.hpp"
+#include "../ie_ngraph.hpp"
 
 namespace cv
 {
@@ -98,7 +99,8 @@ class QuantizeLayerImpl CV_FINAL : public QuantizeLayer
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -133,10 +135,10 @@ class QuantizeLayerImpl CV_FINAL : public QuantizeLayer
         inputs_.getUMatVector(inputs);
         outputs_.getUMatVector(outputs);
 
-        if (inputs_.depth() == CV_16S)
+        if (inputs_.depth() == CV_16F)
         {
             UMat inputFp32;
-            convertFp16(inputs[0], inputFp32);
+            inputs[0].convertTo(inputFp32, CV_32F);
             inputs[0] = inputFp32;  // replace
         }
 
@@ -171,6 +173,16 @@ class QuantizeLayerImpl CV_FINAL : public QuantizeLayer
         else
             inputs[0].convertTo(outputs[0], CV_8S, 1.f/scales[0], zeropoints[0]);
     }
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        const auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        auto quantized = ngraphQuantize(input, scales[0], zeropoints[0]);
+        return Ptr<BackendNode>(new InfEngineNgraphNode(quantized));
+    }
+#endif  // HAVE_DNN_NGRAPH
 };
 
 // Dequantize INT8 Inputs to FP32/FP16
@@ -214,7 +226,7 @@ class DequantizeLayerImpl CV_FINAL : public DequantizeLayer
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -252,10 +264,7 @@ class DequantizeLayerImpl CV_FINAL : public DequantizeLayer
         UMat outputFp32;
         inputs[0].convertTo(outputFp32, CV_32F, scales[0], -(scales[0]*zeropoints[0]));
 
-        if (outputs_.depth() == CV_16S)
-            convertFp16(outputFp32, outputs[0]);
-        else
-            outputFp32.copyTo(outputs[0]);
+        outputFp32.convertTo(outputs[0], outputs_.depth());
         return true;
     }
 #endif
@@ -285,6 +294,16 @@ class DequantizeLayerImpl CV_FINAL : public DequantizeLayer
         else
             inputs[0].convertTo(outputs[0], CV_32F, scales[0], -(scales[0]*zeropoints[0]));
     }
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        const auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        auto quantized = ngraphDequantize(input, scales[0], zeropoints[0]);
+        return new InfEngineNgraphNode(quantized);
+    }
+#endif  // HAVE_DNN_NGRAPH
 };
 
 // Rescale/Requantize INT8 Inputs from (scale1, zeropoint1) to (scale2, zeropoint2)
diff --git a/modules/dnn/src/int8layers/scale_layer.cpp b/modules/dnn/src/int8layers/scale_layer.cpp
index d7f676d047ab..25d48e3d1794 100644
--- a/modules/dnn/src/int8layers/scale_layer.cpp
+++ b/modules/dnn/src/int8layers/scale_layer.cpp
@@ -6,6 +6,7 @@
 #include "layers_common.hpp"
 #include <opencv2/imgproc.hpp>
 #include <opencv2/dnn/shape_utils.hpp>
+#include "../ie_ngraph.hpp"
 
 namespace cv
 {
@@ -72,7 +73,8 @@ class ScaleLayerInt8Impl CV_FINAL : public ScaleLayerInt8
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
@@ -186,6 +188,59 @@ class ScaleLayerInt8Impl CV_FINAL : public ScaleLayerInt8
         return flops;
     }
 
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        std::vector<ov::Output<ov::Node>> ieInpNodes(nodes.size());
+        for (int i = 0; i < nodes.size(); ++i) {
+            ieInpNodes[i] = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
+        }
+
+        ieInpNodes[0] = ngraphDequantize(ieInpNodes[0], inp_sc[0], inp_zp[0]);
+
+        CV_Assert(!blobs.empty() || ieInpNodes.size() == 1 + (int)hasWeights + (int)hasBias);
+
+        ov::Output<ov::Node> weights, bias;
+        if (blobs.empty()) {
+            if (hasWeights)
+                weights = ieInpNodes[1];
+            if (hasBias)
+                bias = ieInpNodes[1 + (int)hasWeights];
+        } else {
+            std::vector<size_t> shape = ieInpNodes[0].get_shape();
+            int cAxis = normalize_axis(axis, shape.size());
+
+            size_t numWeights = blobs[0].total();
+            for (int i = 0; i < cAxis; ++i) {
+                shape[i] = 1;
+            }
+            for (int i = cAxis; i < shape.size(); ++i) {
+                if (numWeights == 1) {
+                    shape[i] = 1;
+                }
+                numWeights = std::max(numWeights / shape[i], (size_t)1);
+            }
+
+            if (hasWeights)
+                weights = std::make_shared<ov::op::v0::Constant>(ov::element::f32, shape, blobs[0].data);
+            if (hasBias)
+                bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32, shape, blobs[(int)hasWeights].data);
+        }
+
+        ov::Output<ov::Node> res = ieInpNodes[0];
+        if (hasWeights) {
+            res = std::make_shared<ov::op::v1::Multiply>(res, weights);
+        }
+        if (hasBias) {
+            res = std::make_shared<ov::op::v1::Add>(res, bias);
+        }
+
+        res = ngraphQuantize(res, output_sc, output_zp);
+
+        return new InfEngineNgraphNode(res);
+    }
+#endif  // HAVE_DNN_NGRAPH
+
 private:
     bool hasWeights;
     std::vector<float> inp_sc;
diff --git a/modules/dnn/src/int8layers/softmax_layer.cpp b/modules/dnn/src/int8layers/softmax_layer.cpp
index 5096e541e620..e81b82b99fea 100644
--- a/modules/dnn/src/int8layers/softmax_layer.cpp
+++ b/modules/dnn/src/int8layers/softmax_layer.cpp
@@ -5,6 +5,7 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "../op_timvx.hpp"
+#include "../ie_ngraph.hpp"
 
 #include <algorithm>
 #include <stdlib.h>
@@ -90,7 +91,8 @@ class SoftMaxLayerInt8Impl CV_FINAL : public SoftmaxLayerInt8
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
         return backendId == DNN_BACKEND_OPENCV ||
-            (backendId == DNN_BACKEND_TIMVX && haveTimVX());
+            (backendId == DNN_BACKEND_TIMVX && haveTimVX()) ||
+            backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
@@ -194,6 +196,26 @@ class SoftMaxLayerInt8Impl CV_FINAL : public SoftmaxLayerInt8
         return Ptr<BackendNode>();
     }
 
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+
+        input = ngraphDequantize(input, input_sc, input_zp);
+
+        ov::Output<ov::Node> res;
+        if (logSoftMax) {
+            res = std::make_shared<ov::op::v5::LogSoftmax>(input, axis);
+        } else {
+            res = std::make_shared<ov::op::v1::Softmax>(input, axis);
+        }
+
+        res = ngraphQuantize(res, output_sc, output_zp);
+        return new InfEngineNgraphNode(res);
+    }
+#endif  // HAVE_DNN_NGRAPH
+
     template <bool with_log>
     class SoftmaxInt8Invoker : public ParallelLoopBody {
     public:
diff --git a/modules/dnn/src/layer.cpp b/modules/dnn/src/layer.cpp
index 17de43db8ea8..e988166c22d4 100644
--- a/modules/dnn/src/layer.cpp
+++ b/modules/dnn/src/layer.cpp
@@ -176,7 +176,7 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays
     CV_TRACE_FUNCTION();
     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-    if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
+    if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16F)
     {
         std::vector<UMat> inputs;
         std::vector<UMat> outputs;
@@ -192,7 +192,7 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays
 
         inputs.resize(orig_inputs.size());
         for (size_t i = 0; i < orig_inputs.size(); i++)
-            convertFp16(orig_inputs[i], inputs[i]);
+            orig_inputs[i].convertTo(inputs[i], CV_32F);
 
         outputs.resize(orig_outputs.size());
         for (size_t i = 0; i < orig_outputs.size(); i++)
@@ -205,7 +205,7 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays
         forward(inputs, outputs, internals);
 
         for (size_t i = 0; i < outputs.size(); i++)
-            convertFp16(outputs[i], orig_outputs[i]);
+            outputs[i].convertTo(orig_outputs[i], CV_16F);
 
         // sync results back
         outputs_arr.assign(orig_outputs);
diff --git a/modules/dnn/src/layer_internals.hpp b/modules/dnn/src/layer_internals.hpp
index f19b99f26078..149fb14866b7 100644
--- a/modules/dnn/src/layer_internals.hpp
+++ b/modules/dnn/src/layer_internals.hpp
@@ -146,7 +146,7 @@ struct DataLayer : public Layer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                 forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        bool isFP16 = outputs_arr.depth() == CV_16S;
+        bool isFP16 = outputs_arr.depth() == CV_16F;
 
         std::vector<Mat> outputs, internals;
         outputs_arr.getMatVector(outputs);
@@ -159,7 +159,7 @@ struct DataLayer : public Layer
 
             CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
             if (isFP16)
-                CV_CheckTypeEQ(outputs[i].type(), CV_16SC1, "");
+                CV_CheckTypeEQ(outputs[i].type(), CV_16FC1, "");
             else
                 CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");
 
@@ -175,7 +175,7 @@ struct DataLayer : public Layer
                 {
                     Mat input_f32;
                     inputsData[i].convertTo(input_f32, CV_32F, scale, -mean[0] * scale);
-                    convertFp16(input_f32, outputs[i]);
+                    input_f32.convertTo(outputs[i], CV_16F);
                 }
                 else
                 {
@@ -194,7 +194,7 @@ struct DataLayer : public Layer
                         {
                             Mat input_f32;
                             inp.convertTo(input_f32, CV_32F, scale, -mean[c] * scale);
-                            convertFp16(input_f32, out);
+                            input_f32.convertTo(out, CV_16F);
                         }
                         else
                         {
@@ -209,7 +209,7 @@ struct DataLayer : public Layer
 #ifdef HAVE_OPENCL
     bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
     {
-        bool isFP16 = outputs_.depth() == CV_16S;
+        bool isFP16 = outputs_.depth() == CV_16F;
 
         std::vector<UMat> outputs;
         outputs_.getUMatVector(outputs);
@@ -223,7 +223,7 @@ struct DataLayer : public Layer
 
             CV_Assert(mean == Scalar() || inputData.size[1] <= 4);
             if (isFP16)
-                CV_CheckTypeEQ(outputs[i].type(), CV_16SC1, "");
+                CV_CheckTypeEQ(outputs[i].type(), CV_16FC1, "");
             else
                 CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");
 
@@ -239,7 +239,7 @@ struct DataLayer : public Layer
                 {
                     UMat input_i;
                     inputData.convertTo(input_i, CV_32F, scale, -mean[0] * scale);
-                    convertFp16(input_i, outputs[i]);
+                    input_i.convertTo(outputs[i], CV_16F);
                 }
                 else
                 {
@@ -263,7 +263,7 @@ struct DataLayer : public Layer
                         {
                             UMat input_i;
                             inp.convertTo(input_i, CV_32F, scale, -mean[c] * scale);
-                            convertFp16(input_i, out);
+                            input_i.convertTo(out, CV_16F);
                         }
                         else
                         {
diff --git a/modules/dnn/src/layers/attention_layer.cpp b/modules/dnn/src/layers/attention_layer.cpp
new file mode 100644
index 000000000000..2bda1f3b1881
--- /dev/null
+++ b/modules/dnn/src/layers/attention_layer.cpp
@@ -0,0 +1,296 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "cpu_kernels/fast_gemm.hpp"
+#include "cpu_kernels/softmax.hpp"
+
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace cv { namespace dnn {
+
+static void packWeight(size_t num_heads, size_t head_size, size_t input_hidden_size,
+                       const float *weight_data, size_t hidden_size, std::vector<float> &packed_weight, const FastGemmOpt &opt) {
+    // num_heads * pack(head_size, input_hidden_size)
+    size_t pack_size = fastGemmPackBSize(head_size, input_hidden_size, opt);
+    size_t packed_weight_size = num_heads * pack_size;
+    packed_weight.resize(packed_weight_size, 0.f);
+    auto *packed_weight_data = packed_weight.data();
+    for (size_t i = 0; i < num_heads; i++) {
+        fastGemmPackB(false, head_size, input_hidden_size, weight_data, hidden_size, packed_weight_data, opt);
+        packed_weight_data += pack_size;
+        weight_data += head_size;
+    }
+}
+
+// Operator spec: https://github.com/microsoft/onnxruntime/blob/v1.16.1/docs/ContribOperators.md#com.microsoft.Attention
+class AttentionLayerImpl CV_FINAL : public AttentionLayer {
+ public:
+    AttentionLayerImpl(const LayerParams &params) {
+        setParamsFrom(params);
+
+        CV_CheckTrue(params.has("num_heads"), "DNN/Attention: num_heads is required but missing");
+        num_heads = params.get<int>("num_heads"); // required, no default value
+
+        CV_CheckTrue(params.has("qkv_hidden_sizes"), "DNN/Attention: qkv_hidden_sizes is required but missing");
+        auto param_qkv_hidden_sizes = params.get("qkv_hidden_sizes");
+        CV_CheckEQ(param_qkv_hidden_sizes.size(), 3, "DNN/Attention: qkv_hidden_sizes must and only have three elements");
+
+        qkv_hidden_sizes.clear();
+        qkv_hidden_sizes.resize(3);
+        qkv_hidden_sizes[0] = static_cast<size_t>(param_qkv_hidden_sizes.get<int>(0));
+        qkv_hidden_sizes[1] = static_cast<size_t>(param_qkv_hidden_sizes.get<int>(1));
+        /* v_hidden_size needs to be initialized in finalize in case v_slice_end=INT_MAX */
+
+        qkv_head_sizes.clear();
+        qkv_head_sizes.resize(3);
+        qkv_head_sizes[0] = static_cast<size_t>(qkv_hidden_sizes[0] / num_heads);
+        qkv_head_sizes[1] = static_cast<size_t>(qkv_hidden_sizes[1] / num_heads);
+
+        scale = 1.f / params.get<float>("scale", sqrt(qkv_head_sizes[0]));
+
+        output_ndims = params.get<int>("output_ndims", 3);
+
+        is_prepacked = false;
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE {
+        return backendId == DNN_BACKEND_OPENCV;
+    }
+
+    virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                                 const int requiredOutputs,
+                                 std::vector<MatShape> &outputs,
+                                 std::vector<MatShape> &internals) const CV_OVERRIDE {
+        int num_inputs = inputs.size() + blobs.size();
+        CV_CheckEQ(num_inputs, 3, "DNN/Attention: three inputs are required");
+        const auto &input_shape = inputs[0];
+        const auto &weight_shape = blobs.empty() ? inputs[1] : shape(blobs.front());
+        const auto &bias_shape = blobs.empty() ? inputs[2] : shape(blobs.back());
+
+        CV_CheckEQ(input_shape.size(), static_cast<size_t>(3), "DNN/Attention: invalid input dimension");
+        CV_CheckEQ(weight_shape.size(), static_cast<size_t>(2), "DNN/Attention: invalid weight dimension");
+
+        CV_CheckEQ(input_shape[2], weight_shape[0], "DNN/Attention: invalid input shape");
+        CV_CheckEQ(weight_shape[1], bias_shape[0], "DNN/Attention: invalid weight or bias shape");
+
+        if (output_ndims == 3) {
+            outputs.assign(1, inputs[0]);
+        } else if (output_ndims == 2) {
+            int batch = input_shape[0], seq_len = input_shape[1], input_hidden_size = input_shape[2];
+            MatShape output_shape{batch * seq_len, input_hidden_size};
+            outputs.assign(1, output_shape);
+        } else {
+            CV_Error(Error::StsBadArg, format("DNN/Attention: invalid output dimension %zu, valid value is 2 or 3", output_ndims));
+        }
+
+        const int batch_size_ = input_shape[0], seq_len_ = input_shape[1],
+                  hidden_size_ = weight_shape.back(),
+                  num_heads_ = static_cast<int>(num_heads),
+                  v_head_size_ = static_cast<int>((hidden_size_ - qkv_hidden_sizes[0] - qkv_hidden_sizes[1]) / num_heads);
+
+        MatShape gemm_buffer_shape{batch_size_, seq_len_, hidden_size_},
+                 attention_prob_shape{batch_size_ * num_heads_, seq_len_, seq_len_},
+                 output_buffer_shape{batch_size_ * num_heads_, seq_len_, v_head_size_};
+        internals.assign(1, gemm_buffer_shape);
+        internals.push_back(attention_prob_shape);
+        internals.push_back(output_buffer_shape);
+
+        return false;
+    }
+
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE {
+        opt.init();
+
+        std::vector<Mat> inputs;
+        inputs_arr.getMatVector(inputs);
+        const auto input_shape = shape(inputs[0]);
+        batch_size = static_cast<size_t>(input_shape[0]);
+        seq_len = static_cast<size_t>(input_shape[1]);
+        input_hidden_size = static_cast<size_t>(input_shape[2]);
+
+        const auto &weight = blobs.empty() ? inputs[1] : blobs.front();
+        const auto weight_shape = shape(weight);
+        hidden_size = weight_shape[1];
+        qkv_hidden_sizes[2] = hidden_size - qkv_hidden_sizes[0] - qkv_hidden_sizes[1];
+        qkv_head_sizes[2] = static_cast<size_t>(qkv_hidden_sizes[2] / num_heads);
+
+        if (!blobs.empty()) {
+            const auto *weight_data = weight.ptr<const float>();
+            packWeight(num_heads, qkv_head_sizes[0], input_hidden_size, weight_data,                                             hidden_size, packed_weight_q, opt);
+            packWeight(num_heads, qkv_head_sizes[1], input_hidden_size, weight_data + qkv_hidden_sizes[0],                       hidden_size, packed_weight_k, opt);
+            packWeight(num_heads, qkv_head_sizes[2], input_hidden_size, weight_data + qkv_hidden_sizes[0] + qkv_hidden_sizes[1], hidden_size, packed_weight_v, opt);
+
+            is_prepacked = true;
+        }
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        if (inputs_arr.depth() == CV_16F)
+        {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        std::vector<Mat> inputs, outputs, internals;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+        internals_arr.getMatVector(internals);
+
+        // prepack weights
+        if (!is_prepacked) {
+            const auto &weight = blobs.empty() ? inputs[1] : blobs.front();
+            const auto *weight_data = weight.ptr<const float>();
+            packWeight(num_heads, qkv_head_sizes[0], input_hidden_size, weight_data,                                             hidden_size, packed_weight_q, opt);
+            packWeight(num_heads, qkv_head_sizes[1], input_hidden_size, weight_data + qkv_hidden_sizes[0],                       hidden_size, packed_weight_k, opt);
+            packWeight(num_heads, qkv_head_sizes[2], input_hidden_size, weight_data + qkv_hidden_sizes[0] + qkv_hidden_sizes[1], hidden_size, packed_weight_v, opt);
+
+            is_prepacked = true;
+        }
+
+        float *packed_weights[3] = {packed_weight_q.data(), packed_weight_k.data(), packed_weight_v.data()};
+        size_t packed_weights_size[3] = {packed_weight_q.size() / num_heads, packed_weight_k.size() / num_heads, packed_weight_v.size() / num_heads};
+
+        // Compute Q/K/V
+        auto &gemm_buffer = internals[0];
+        auto *Q = gemm_buffer.ptr<float>();
+        auto *K = Q + batch_size * seq_len * qkv_hidden_sizes[0];
+        auto *V = K + batch_size * seq_len * qkv_hidden_sizes[1];
+        float *QKV[3] = {Q, K, V}; // Q, K, V: [B, N, S, H]
+        {
+            const auto &input = inputs[0];
+            const auto &bias = blobs.empty() ? inputs[2] : blobs.back();
+            const auto *input_data = input.ptr<const float>();
+            const auto *bias_data = bias.ptr<const float>();
+
+            opt.multi_thread = false;
+            auto fn = [&](const Range &r) {
+                for (int i = r.start; i < r.end; i++) {
+                    const int batch_index = static_cast<int>((i / 3) / num_heads);
+                    const int head_index = static_cast<int>((i / 3) % num_heads);
+                    const int qkv_index = static_cast<int>(i % 3);
+
+                    auto *dst = QKV[qkv_index];
+                    size_t head_size = qkv_head_sizes[qkv_index];
+
+                    int input_offset = batch_index * seq_len * input_hidden_size;
+                    int bias_offset = qkv_index * qkv_hidden_sizes[0] + head_index * head_size;
+                    int dst_offset = (batch_index * num_heads + head_index) * (seq_len * head_size);
+
+                    // broadcast bias ([NH] -> [BN, SH]) and make copy to dst
+                    const auto *bias_data_src = bias_data + bias_offset;
+                    auto *dst_data = dst + dst_offset;
+                    for (size_t seq_len_idx = 0; seq_len_idx < seq_len; seq_len_idx++) {
+                        std::memcpy(dst_data, bias_data_src, head_size * sizeof(float));
+                        dst_data += head_size;
+                    }
+
+                    auto *packed_weight = packed_weights[qkv_index] + packed_weights_size[qkv_index] * head_index;
+                    // single-thread gemm kernel
+                    fastGemm(false, seq_len, head_size, input_hidden_size,
+                            1.f, input_data + input_offset, input_hidden_size,
+                            packed_weight, 1.f, dst + dst_offset, head_size, opt);
+                }
+            };
+
+            size_t loops = 3 * batch_size * num_heads;
+            double nstripes = loops * seq_len * qkv_head_sizes[0] * input_hidden_size * (1 / 1024.0);
+            parallel_for_(Range(0, loops), fn, nstripes);
+        }
+
+        // Compute Softmax(scale * MatMul(Q, K))
+        auto &attention_prob = internals[1];
+        {
+            auto *output = attention_prob.ptr<float>();
+
+            auto loops = batch_size * num_heads;
+            auto seq_len_square = seq_len * seq_len;
+            auto qk_head_size = qkv_head_sizes[0];
+            auto qk_inner_size = seq_len * qk_head_size;
+
+            // Compute scale * matmul(Q, K)
+            opt.multi_thread = false;
+            parallel_for_(Range(0, loops), [&] (const Range r) {
+                for (int i = r.start; i < r.end; i++) {
+                    const int output_offset = i * seq_len_square;
+
+                    const auto *q = Q + qk_inner_size * i, *k = K + qk_inner_size * i;
+                    fastGemm(false, true, seq_len, qk_head_size, seq_len, qk_head_size,
+                             scale, q, qk_head_size, 1,
+                             k, qk_head_size, 1, 0.f,
+                             output + output_offset, seq_len, opt);
+                }
+            }, loops * seq_len * qk_head_size * seq_len * (1 / 1024.0));
+
+            // Compute softmax on the last dimension
+            softmax(attention_prob, attention_prob, shape(attention_prob).size() - 1);
+        }
+
+        // Compute MatMul(attention_prob, V)
+        auto &output_buffer = internals[2];
+        {
+            auto *output = outputs[0].ptr<float>();
+            auto *output_buff = output_buffer.ptr<float>();
+            const auto *prob = attention_prob.ptr<const float>();
+
+            auto loops = batch_size * num_heads;
+            auto prob_inner_size = seq_len * seq_len;
+            auto v_head_size = qkv_head_sizes[2];
+            auto v_inner_size = seq_len * v_head_size;
+
+            opt.multi_thread = false;
+            parallel_for_(Range(0, loops), [&] (const Range &r) {
+                for (int i = r.start; i < r.end; i++) {
+                    const int output_offset = i * v_inner_size;
+
+                    const auto *p = prob + i * prob_inner_size, *v = V + i * v_inner_size;
+                    fastGemm(false, false, seq_len, seq_len, seq_len, v_head_size,
+                             1.f, p, seq_len, 1,
+                             v, v_head_size, 1, 0.f,
+                             output_buff + output_offset, v_head_size, opt);
+
+                    // tranpose on the fly
+                    const int batch_index = static_cast<int>(i / num_heads);
+                    const int head_index = static_cast<int>(i % num_heads);
+                    auto *src = output_buff + output_offset;
+                    auto *dst = output + (batch_index * seq_len * num_heads + head_index) * v_head_size;
+                    for (int j = 0; j < seq_len; j++) {
+                        std::memcpy(dst, src, v_head_size * sizeof(float));
+                        src += v_head_size;
+                        dst += qkv_hidden_sizes[2];
+                    }
+                }
+            }, loops * seq_len * seq_len * v_head_size * (1 / 1024.0));
+        }
+    }
+
+ private:
+    size_t num_heads;
+    std::vector<size_t> qkv_hidden_sizes; // order: {qk_hidden_size, qk_hidden_size, v_hidden_size}
+    float scale;
+    size_t output_ndims;
+
+    std::vector<size_t> qkv_head_sizes; // order: {qk_head_size, qk_head_size, v_head_size}
+
+    size_t batch_size;
+    size_t seq_len;
+    size_t input_hidden_size;
+    size_t hidden_size;
+
+    bool is_prepacked;
+    std::vector<float> packed_weight_q;
+    std::vector<float> packed_weight_k;
+    std::vector<float> packed_weight_v;
+
+    FastGemmOpt opt;
+};
+
+Ptr<AttentionLayer> AttentionLayer::create(const LayerParams &params) {
+    return makePtr<AttentionLayerImpl>(params);
+}
+
+}} // cv::dnn
diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp
index b90ee934ef51..3cdbdc222b9c 100644
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@@ -18,8 +18,6 @@ Implementation of Batch Normalization layer.
 #include "../op_webnn.hpp"
 #include "../op_cann.hpp"
 
-#include <opencv2/dnn/shape_utils.hpp>
-
 #ifdef HAVE_OPENCL
 #include "opencl_kernels_dnn.hpp"
 #endif
@@ -192,7 +190,7 @@ class BatchNormLayerImpl CV_FINAL : public BatchNormLayer
         std::vector<UMat> inputs;
         std::vector<UMat> outputs;
 
-        bool use_half = (inputs_.depth() == CV_16S);
+        bool use_half = (inputs_.depth() == CV_16F);
         inputs_.getUMatVector(inputs);
         outputs_.getUMatVector(outputs);
 
@@ -266,7 +264,7 @@ class BatchNormLayerImpl CV_FINAL : public BatchNormLayer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -457,16 +455,12 @@ class BatchNormLayerImpl CV_FINAL : public BatchNormLayer
     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
+        std::vector<size_t> shape(ieInpNode.get_shape().size(), 1);
         shape[1] = weights_.total();
-        auto weight = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), weights_.data);
-        auto bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), bias_.data);
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2021_2)
-        auto scale_node = std::make_shared<ngraph::op::v1::Multiply>(ieInpNode, weight, ngraph::op::AutoBroadcastType::NUMPY);
-#else
-        auto scale_node = std::make_shared<ngraph::op::v0::Multiply>(ieInpNode, weight, ngraph::op::AutoBroadcastType::NUMPY);
-#endif
-        auto scale_shift = std::make_shared<ngraph::op::v1::Add>(scale_node, bias, ngraph::op::AutoBroadcastType::NUMPY);
+        auto weight = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape(shape), weights_.data);
+        auto bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape(shape), bias_.data);
+        auto scale_node = std::make_shared<ov::op::v1::Multiply>(ieInpNode, weight, ov::op::AutoBroadcastType::NUMPY);
+        auto scale_shift = std::make_shared<ov::op::v1::Add>(scale_node, bias, ov::op::AutoBroadcastType::NUMPY);
         return Ptr<BackendNode>(new InfEngineNgraphNode(scale_shift));
     }
 #endif  // HAVE_DNN_NGRAPH
diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp
index 3095e2d6c907..9723975723c3 100644
--- a/modules/dnn/src/layers/blank_layer.cpp
+++ b/modules/dnn/src/layers/blank_layer.cpp
@@ -148,9 +148,9 @@ class BlankLayerImpl CV_FINAL : public BlankLayer
     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
-        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        ngraph::OutputVector inp{ieInpNode};
-        auto blank = std::make_shared<ngraph::op::Concat>(inp, 0);
+        auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        ov::OutputVector inp{ieInpNode};
+        auto blank = std::make_shared<ov::op::v0::Concat>(inp, 0);
         return Ptr<BackendNode>(new InfEngineNgraphNode(blank));
     }
 #endif  // HAVE_DNN_NGRAPH
diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp
index 6bd3dcdea532..3a6466bd804d 100644
--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@@ -59,7 +59,6 @@
 #include "../cuda4dnn/primitives/concat.hpp"
 using namespace cv::dnn::cuda4dnn;
 #endif
-
 namespace cv
 {
 namespace dnn
@@ -165,14 +164,14 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer
             for( i = 0; i < ninputs; i++ )
             {
                 Mat& inp = inputs[i];
-                CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S || inp.type() == CV_8S) &&
+                CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16F || inp.type() == CV_8S) &&
                            inp.dims == 4 && inp.size[0] == output.size[0] &&
                            inp.size[2] == output.size[2] &&
                            inp.size[3] == output.size[3] );
                 nchannels += inp.size[1];
             }
             CV_Assert( nchannels == output.size[1] );
-            CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S || output.type() == CV_8S) );
+            CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16F || output.type() == CV_8S) );
 
             cc.chptrs.resize(nchannels*batchsz);
 
@@ -223,7 +222,7 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer
         std::vector<UMat> inputs;
         std::vector<UMat> outputs;
 
-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
         inps.getUMatVector(inputs);
         outs.getUMatVector(outputs);
 
@@ -392,18 +391,18 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer
     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
-        const int numDims = nodes[0].dynamicCast<InfEngineNgraphNode>()->node->get_shape().size();
+        const int numDims = nodes[0].dynamicCast<InfEngineNgraphNode>()->node.get_shape().size();
         const int cAxis = normalize_axis(axis, numDims);
         std::vector<size_t> maxDims(numDims, 0);
 
         CV_Assert(inputs.size() == nodes.size());
-        ngraph::OutputVector inp_nodes;
+        ov::OutputVector inp_nodes;
         for (int i = 0; i < nodes.size(); ++i)
         {
             auto inp = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
             inp_nodes.push_back(inp);
 
-            std::vector<size_t> inpShape = inp->get_shape();
+            std::vector<size_t> inpShape = inp.get_shape();
             for (int i = 0; i < numDims; ++i)
                 maxDims[i] = std::max(maxDims[i], inpShape[i]);
         }
@@ -423,14 +422,14 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer
             }
             if (needPadding)
             {
-                inp_nodes[i] = std::make_shared<ngraph::op::v1::Pad>(
+                inp_nodes[i] = std::make_shared<ov::op::v1::Pad>(
                     inp_nodes[i],
-                    std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{begins.size()}, begins.data()),
-                    std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{ends.size()}, ends.data()),
-                    ngraph::op::PadMode::CONSTANT);
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{begins.size()}, begins.data()),
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{ends.size()}, ends.data()),
+                    ov::op::PadMode::CONSTANT);
             }
         }
-        auto concat = std::make_shared<ngraph::op::Concat>(inp_nodes, cAxis);
+        auto concat = std::make_shared<ov::op::v0::Concat>(inp_nodes, cAxis);
         return Ptr<BackendNode>(new InfEngineNgraphNode(concat));
     }
 #endif  // HAVE_DNN_NGRAPH
diff --git a/modules/dnn/src/layers/const_layer.cpp b/modules/dnn/src/layers/const_layer.cpp
index 34f958782514..2246c1632071 100644
--- a/modules/dnn/src/layers/const_layer.cpp
+++ b/modules/dnn/src/layers/const_layer.cpp
@@ -62,10 +62,15 @@ class ConstLayerImpl CV_FINAL : public ConstLayer
     {
         std::vector<UMat> outputs;
         outs.getUMatVector(outputs);
-        if (outs.depth() == CV_16S)
-            convertFp16(blobs[0], outputs[0]);
+        if (outs.depth() == CV_16F) {
+            auto blob = blobs[0];
+            if (blob.type() != CV_32F) {
+                blob.convertTo(blob, CV_32F);
+            }
+            blob.convertTo(outputs[0], CV_16F);
+        }
         else
-            blobs[0].copyTo(outputs[0]);
+            blobs[0].convertTo(outputs[0], outputs[0].type());
         return true;
     }
 #endif
@@ -80,7 +85,7 @@ class ConstLayerImpl CV_FINAL : public ConstLayer
 
         std::vector<Mat> outputs;
         outputs_arr.getMatVector(outputs);
-        blobs[0].copyTo(outputs[0]);
+        blobs[0].convertTo(outputs[0], outputs[0].type());
     }
 
 #ifdef HAVE_CANN
@@ -123,9 +128,23 @@ class ConstLayerImpl CV_FINAL : public ConstLayer
     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
-        auto node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
+        ov::element::Type dType;
+        if (blobs[0].depth() == CV_32F) {
+            dType = ov::element::f32;
+        } else if (blobs[0].depth() == CV_32S) {
+            dType = ov::element::i32;
+        } else if (blobs[0].depth() == CV_8S) {
+            dType = ov::element::i8;
+        } else {
+            CV_Error(Error::StsNotImplemented, format("Unexpected Const data depth: %d", blobs[0].depth()));
+        }
+        std::shared_ptr<ov::Node> node =
+                    std::make_shared<ov::op::v0::Constant>(dType,
                                                            getShape<size_t>(blobs[0]),
                                                            blobs[0].data);
+        if (node->get_element_type() != ov::element::f32) {
+            node = std::make_shared<ov::op::v0::Convert>(node, ov::element::f32);
+        }
         return Ptr<BackendNode>(new InfEngineNgraphNode(node));
     }
 #endif  // HAVE_DNN_NGRAPH
@@ -151,7 +170,11 @@ class ConstLayerImpl CV_FINAL : public ConstLayer
         auto context = reinterpret_cast<csl::CSLContext*>(context_);
 
         CV_Assert(blobs.size() == 1);
-        return make_cuda_node<cuda4dnn::ConstOp>(preferableTarget, std::move(context->stream), blobs[0]);
+        Mat blob = blobs[0];
+        if (blob.type() != CV_32F) {
+            blob.convertTo(blob, CV_32F);
+        }
+        return make_cuda_node<cuda4dnn::ConstOp>(preferableTarget, std::move(context->stream), blob);
     }
 #endif
 
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 2787d64880d6..d0791ecddd96 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -62,9 +62,6 @@
 #include "opencl_kernels_dnn.hpp"
 using namespace cv::dnn::ocl4dnn;
 #endif
-#ifdef HAVE_TENGINE
-#include "../tengine4dnn/include/tengine_graph_convolution.hpp"
-#endif
 
 #ifdef HAVE_CUDA
 #include "../cuda4dnn/primitives/convolution.hpp"
@@ -143,7 +140,7 @@ class BaseConvolutionLayerImpl : public ConvolutionLayer
         }
 
         const Mat &input = inputs[0];
-        CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || input.dims == 4 || input.dims == 5) && (input.type() == CV_32F || input.type() == CV_16S));
+        CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || input.dims == 4 || input.dims == 5) && (input.type() == CV_32F || input.type() == CV_16F));
         for (size_t i = 0; i < outputs.size(); i++)
         {
             CV_Assert(inputs[i].type() == input.type());
@@ -245,8 +242,6 @@ class BaseConvolutionLayerImpl : public ConvolutionLayer
 };
 
 
-#define IS_POWER_LAYER(layer) \
-            (!layer.empty() && !layer->type.compare("Power"))
 //TODO: simultaneously convolution and bias addition for cache optimization
 class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
 {
@@ -267,10 +262,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
     float power;
 #endif
 
-#ifdef HAVE_TENGINE
-    teng_graph_t tengine_graph;
-#endif
-
 #ifdef HAVE_CUDA
     cuda4dnn::ConvolutionConfiguration::FusionMode cudaFusionMode;
     cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType;
@@ -289,20 +280,8 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
 #ifdef HAVE_CUDA
         cudaFusionMode = cuda4dnn::ConvolutionConfiguration::FusionMode::NONE;
         cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY;
-#endif
-#ifdef HAVE_TENGINE
-        tengine_graph=NULL;
 #endif
     }
-#ifdef HAVE_TENGINE
-    ~ConvolutionLayerImpl()
-    {
-        if(NULL != tengine_graph )
-        {
-            tengine_release(tengine_graph);
-        }
-    }
-#endif
 
     MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE
     {
@@ -466,13 +445,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
             for(int i = 0; i < numOutput; i++ )
                 biasvec[i] = biasMat.at<float>(i);
         }
-#ifdef HAVE_TENGINE
-        if(NULL != tengine_graph )
-        {
-            tengine_release(tengine_graph);
-            tengine_graph = NULL ;
-        }
-#endif
 #ifdef HAVE_OPENCL
         convolutionOp.release();
 #endif
@@ -848,13 +820,13 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         CV_Assert(!blobs.empty());
         CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1);
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        std::vector<size_t> dims = ieInpNode->get_shape();
+        std::vector<size_t> dims = ieInpNode.get_shape();
         CV_Check(dims.size(), dims.size() >= 3 && dims.size() <= 5, "");
-        std::shared_ptr<ngraph::Node> ieWeights = nodes.size() > 1 ? nodes[1].dynamicCast<InfEngineNgraphNode>()->node : nullptr;
+        ov::Output<ov::Node> ieWeights;
         if (nodes.size() > 1)
-            CV_Assert(ieWeights);  // dynamic_cast should not fail
+            ieWeights = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
         const int inpCn = dims[1];
-        const int inpGroupCn = nodes.size() > 1 ? ieWeights->get_shape()[1] : blobs[0].size[1];
+        const int inpGroupCn = nodes.size() > 1 ? ieWeights.get_shape()[1] : blobs[0].size[1];
         const int group = inpCn / inpGroupCn;
 
         std::vector<size_t> kernel_shape;
@@ -868,49 +840,49 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
 
         if (nodes.size() == 1)
         {
-            ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, blobs[0].data);
+            ieWeights = std::make_shared<ov::op::v0::Constant>(ov::element::f32, kernel_shape, blobs[0].data);
             if (fusedWeights)
             {
                 if (weightsMat.isContinuous())
                 {
-                    ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, weightsMat.data);
+                    ieWeights = std::make_shared<ov::op::v0::Constant>(ov::element::f32, kernel_shape, weightsMat.data);
                 }
                 else
                 {
                     Mat newWeights;
                     Mat cvWeights = weightsMat.colRange(0, blobs[0].total() / numOutput);
                     cvWeights.copyTo(newWeights);
-                    ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, newWeights.data);
+                    ieWeights = std::make_shared<ov::op::v0::Constant>(ov::element::f32, kernel_shape, newWeights.data);
                 }
             }
         }
         else
         {
-            auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                             ngraph::Shape{kernel_shape.size()}, std::vector<int64_t>(kernel_shape.begin(), kernel_shape.end()));
-            ieWeights  = std::make_shared<ngraph::op::v1::Reshape>(ieWeights, shape, true);
+            auto shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                             ov::Shape{kernel_shape.size()}, std::vector<int64_t>(kernel_shape.begin(), kernel_shape.end()));
+            ieWeights  = std::make_shared<ov::op::v1::Reshape>(ieWeights, shape, true);
         }
 
-        ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT;
+        ov::op::PadType pad_type = ov::op::PadType::EXPLICIT;
         if (!padMode.empty())
-            pad_type = padMode == "VALID" ? ngraph::op::PadType::VALID : ngraph::op::PadType::SAME_UPPER;
+            pad_type = padMode == "VALID" ? ov::op::PadType::VALID : ov::op::PadType::SAME_UPPER;
 
-        std::shared_ptr<ngraph::Node> conv_node;
+        std::shared_ptr<ov::Node> conv_node;
         if (group != 1) {
-            conv_node = std::make_shared<ngraph::op::v1::GroupConvolution>(
+            conv_node = std::make_shared<ov::op::v1::GroupConvolution>(
                                 ieInpNode, ieWeights,
-                                ngraph::Strides(strides),
-                                ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
-                                ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_end.begin(),   pads_end.end())),
-                                ngraph::Strides(dilations),
+                                ov::Strides(strides),
+                                ov::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
+                                ov::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_end.begin(),   pads_end.end())),
+                                ov::Strides(dilations),
                                 pad_type);
         } else {
-            conv_node = std::make_shared<ngraph::op::v1::Convolution>(
+            conv_node = std::make_shared<ov::op::v1::Convolution>(
                                 ieInpNode, ieWeights,
-                                ngraph::Strides(strides),
-                                ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
-                                ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_end.begin(), pads_end.end())),
-                                ngraph::Strides(dilations),
+                                ov::Strides(strides),
+                                ov::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
+                                ov::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_end.begin(), pads_end.end())),
+                                ov::Strides(dilations),
                                 pad_type);
         }
 
@@ -918,18 +890,18 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         {
             std::vector<size_t> shape(conv_node->get_shape().size(), 1);
             shape[1] = conv_node->get_shape()[1];
-            std::shared_ptr<ngraph::Node> bias;
+            std::shared_ptr<ov::Node> bias;
             if (nodes.size() == 3)
             {
-                auto bias_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                                    ngraph::Shape{shape.size()}, std::vector<int64_t>(shape.begin(), shape.end()));
-                bias = std::make_shared<ngraph::op::v1::Reshape>(nodes[2].dynamicCast<InfEngineNgraphNode>()->node, bias_shape, true);
+                auto bias_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                                    ov::Shape{shape.size()}, std::vector<int64_t>(shape.begin(), shape.end()));
+                bias = std::make_shared<ov::op::v1::Reshape>(nodes[2].dynamicCast<InfEngineNgraphNode>()->node, bias_shape, true);
             }
             else
             {
-                bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), biasvec.data());
+                bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape(shape), biasvec.data());
             }
-            auto conv_bias = std::make_shared<ngraph::op::v1::Add>(conv_node, bias, ngraph::op::AutoBroadcastType::NUMPY);
+            auto conv_bias = std::make_shared<ov::op::v1::Add>(conv_node, bias, ov::op::AutoBroadcastType::NUMPY);
             return Ptr<BackendNode>(new InfEngineNgraphNode(conv_bias));
         }
         return Ptr<BackendNode>(new InfEngineNgraphNode(conv_node));
@@ -1051,7 +1023,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         std::vector<UMat> inputs;
         std::vector<UMat> outputs;
 
-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
         inps.getUMatVector(inputs);
         outs.getUMatVector(outputs);
 
@@ -1065,6 +1037,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
             umat_blobs.resize(n);
             for (size_t i = 0; i < n; i++)
             {
+                CV_Assert(!use_half);  // TODO: not implemented
                 inputs[i + 1].copyTo(umat_blobs[i]);
             }
             inputs.resize(1);
@@ -1077,7 +1050,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
             for (size_t i = 0; i < n; i++)
             {
                 if (use_half)
-                    convertFp16(blobs[i], umat_blobs[i]);
+                    blobs[i].convertTo(umat_blobs[i], CV_16F);
                 else
                     blobs[i].copyTo(umat_blobs[i]);
             }
@@ -1095,7 +1068,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
             config.pads = pads;
             config.stride = stride;
             config.dilation = dilation;
-            if (inputs[0].dims != 4 && inputs[0].dims != umat_blobs[0].dims)
+            if (inputs[0].dims != 4 && inputs[0].dims != (blobs.empty() ? umat_blobs[0].dims : blobs[0].dims))
             {
                 static bool bypassCheck = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_CONVOLUTION_IGNORE_INPUT_DIMS_4_CHECK", false);
                 if (!bypassCheck)
@@ -1107,7 +1080,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
                     return false;
                 }
             }
-            config.group = inputs[0].size[1] / umat_blobs[0].size[1];
+            config.group = inputs[0].size[1] / (blobs.empty() ? umat_blobs[0].size[1] : blobs[0].size[1]);
             if (config.group < 1)  // config.group == 0 causes div by zero in ocl4dnn code
             {
                 CV_LOG_WARNING(NULL, "DNN/OpenCL: Unsupported config.group=" << config.group
@@ -1158,7 +1131,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         if (fusedWeights)
         {
             if (use_half)
-                convertFp16(weightsMat, umat_blobs[0]);
+                weightsMat.convertTo(umat_blobs[0], CV_16F);
             else
                 weightsMat.copyTo(umat_blobs[0]);
             fusedWeights = false;
@@ -1168,7 +1141,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
             if ( umat_blobs.size() < 2 )
                 umat_blobs.resize(2);
             if (use_half)
-                convertFp16(Mat(biasvec, true), umat_blobs[1]);
+                Mat(biasvec, true).convertTo(umat_blobs[1], CV_16F);
             else
                 Mat(biasvec, true).copyTo(umat_blobs[1]);
             convolutionOp->setBias(true);
@@ -1231,7 +1204,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -1305,65 +1278,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
             }
         }
 
-#ifdef HAVE_TENGINE
-        bool tengine_ret = false;
-
-        std::vector<Mat> teng_in, teng_out;
-        inputs_arr.getMatVector(teng_in);
-        outputs_arr.getMatVector(teng_out);
-
-        int inch = teng_in[0].size[1];    // inch
-        int in_h = teng_in[0].size[2];    // in_h
-        int in_w = teng_in[0].size[3];    // in_w
-
-        int out_b = teng_out[0].size[0];  // out batch size
-        int outch = teng_out[0].size[1];  // outch
-        int out_h = teng_out[0].size[2];  // out_h
-        int out_w = teng_out[0].size[3];  // out_w
-
-        float *input_  = teng_in[0].ptr<float>();
-        float *output_ = teng_out[0].ptr<float>();
-        float *kernel_ = weightsMat.ptr<float>();
-        float *teg_bias = &biasvec[0];
-
-        int nstripes = std::max(getNumThreads(), 1);
-
-        /* tengine_init will run when first time. */
-        if(NULL == tengine_graph)
-        {
-            // pads_begin: 0 - pad_top,    1 - pad_left
-            // pads_end:   0 - pad_bottom, 1 - pad_right
-            // pad_h0: pad_top,  pad_h1: pad_bottom
-            // pad_w0: pad_left, pad_w1: pad_right
-            tengine_graph = tengine_init(name.c_str(), input_, inch, ngroups, in_h, in_w,
-                                         output_, out_b, outch, out_h, out_w,
-                                         kernel_, kernel_size.size(), kernel.height, kernel.width,
-                                         teg_bias, stride.height, stride.width,
-                                         pads_begin[0], pads_end[0], pads_begin[1], pads_end[1], dilation.height, dilation.width,
-                                         weightsMat.step1(), padMode, tengine_graph, nstripes);
-            // printf("Init(%s):  input=%p(%d %d %d %d ),output=%p(%d %d %d %d ),kernel=%p(%ld %d %d ), bias=%p ,"
-            //        "stride(%d %d), pad(%d %d %d %d), dilation(%d %d) ,weightsMat=%ld, padMode=%s ,tengine_graph = %p \n",
-            //        name.c_str(),input_, inch, ngroups, in_h, in_w,
-            //        output_, out_b, outch, out_h, out_w,
-            //        kernel_, kernel_size.size(), kernel.height, kernel.width,
-            //        teg_bias, stride.height, stride.width,
-            //        pads_begin[0], pads_end[0], pads_begin[1], pads_end[1], dilation.height, dilation.width,
-            //        weightsMat.step1(), padMode.c_str() ,tengine_graph);
-        }
-        if(NULL != tengine_graph)
-        {
-            tengine_ret = tengine_forward(tengine_graph);
-        }
-        /* activation */
-        if((true == tengine_ret) && activ )
-        {
-            int out_cstep = out_h * out_w;	    // out_cstep
-
-            ActivationLayer* activ_ = activ.get();
-            activ_->forwardSlice(output_, output_, out_cstep, out_cstep, 0, outch);
-        }
-        if(false == tengine_ret)
-#endif
         {
             int nstripes = std::max(getNumThreads(), 1);
             int conv_dim = CONV_2D;
@@ -1385,6 +1299,10 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
                 fastConvImpl = initFastConv(weightsMat, &biasvec[0], ngroups, K, C, kernel_size, strides,
                                             dilations, pads_begin, pads_end, conv_dim,
                                             preferableTarget == DNN_TARGET_CPU_FP16, canUseWinograd);
+                // This is legal to release weightsMat here as this is not used anymore for
+                // OpenCV inference. If network needs to be reinitialized (new shape, new backend)
+                // a new version of weightsMat is created at .finalize() from original weights
+                weightsMat.release();
             }
 
             runFastConv(inputs[0], outputs[0], fastConvImpl, nstripes, activ, reluslope, fusedAdd);
@@ -1491,6 +1409,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         params.set("input_zeropoint", inputZp);
         params.set("input_scale", inputScale);
 
+        Mat weightsMat = blobs[0].reshape(1, numOutput);
         Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S);
         Mat biasQuantized(1, numOutput, CV_32S);
         Mat outputMultiplier(1, numOutput, CV_32F);
@@ -1970,7 +1889,7 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         std::vector<UMat> outputs;
         std::vector<UMat> internals;
 
-        if (inputs_.depth() == CV_16S)
+        if (inputs_.depth() == CV_16F)
             return false;
 
         inputs_.getUMatVector(inputs);
@@ -2077,7 +1996,7 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr));
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -2338,13 +2257,13 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
 
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        std::vector<size_t> kernel_shape = getShape<size_t>(blobs[0]);
-       auto ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, blobs[0].data);
+       auto ieWeights = std::make_shared<ov::op::v0::Constant>(ov::element::f32, kernel_shape, blobs[0].data);
 
         if (fusedWeights)
         {
             Mat newWeights;
             transpose(weightsMat, newWeights);
-            ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, newWeights.data);
+            ieWeights = std::make_shared<ov::op::v0::Constant>(ov::element::f32, kernel_shape, newWeights.data);
         }
         std::vector<size_t> paddings_end;
         if (padMode == "SAME")
@@ -2356,24 +2275,24 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         } else {
             paddings_end = pads_end;
         }
-        ngraph::op::PadType pad_type = padMode == "VALID" ? ngraph::op::PadType::VALID : ngraph::op::PadType::EXPLICIT;
+        ov::op::PadType pad_type = padMode == "VALID" ? ov::op::PadType::VALID : ov::op::PadType::EXPLICIT;
 
-        auto deconv = std::make_shared<ngraph::op::v1::ConvolutionBackpropData>(
+        auto deconv = std::make_shared<ov::op::v1::ConvolutionBackpropData>(
                           ieInpNode,
                           ieWeights,
-                          ngraph::Strides(strides),
-                          ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
-                          ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(paddings_end.begin(), paddings_end.end())),
-                          ngraph::Strides(dilations),
+                          ov::Strides(strides),
+                          ov::CoordinateDiff(std::vector<std::ptrdiff_t>(pads_begin.begin(), pads_begin.end())),
+                          ov::CoordinateDiff(std::vector<std::ptrdiff_t>(paddings_end.begin(), paddings_end.end())),
+                          ov::Strides(dilations),
                           pad_type,
-                          ngraph::CoordinateDiff(std::vector<std::ptrdiff_t>(adjust_pads.begin(), adjust_pads.end())));
+                          ov::CoordinateDiff(std::vector<std::ptrdiff_t>(adjust_pads.begin(), adjust_pads.end())));
 
         if (hasBias() || fusedBias)
         {
             std::vector<size_t> shape(deconv->get_shape().size(), 1);
             shape[1] = numOutput;
-            auto bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), blobs[1].data);
-            auto deconv_bias = std::make_shared<ngraph::op::v1::Add>(deconv, bias, ngraph::op::AutoBroadcastType::NUMPY);
+            auto bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape(shape), blobs[1].data);
+            auto deconv_bias = std::make_shared<ov::op::v1::Add>(deconv, bias, ov::op::AutoBroadcastType::NUMPY);
             return Ptr<BackendNode>(new InfEngineNgraphNode(deconv_bias));
         }
 
diff --git a/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp
index 27b0d4ba1f45..1734dccc636e 100644
--- a/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp
@@ -8,16 +8,26 @@ namespace cv {
 namespace dnn {
 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 
-void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR);
+void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR);
 
-#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
+
+// FP 16 branch.
+void convBlock_F16(int np, const char * _a, const char * _b, char * _c, int ldc, bool init_c, int width,
+                    const int convMR_fp16, const int convNR_fp16);
+
+void convBlockMR1_F16(int np, const char* _a, const char* _b, float *c, const float _bias, bool init_c,
+                       const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR_FP16);
+
+#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY)
+
+#if CV_AVX
 
 #if !CV_FMA3 // AVX workaround
 #undef _mm256_fmadd_ps
 #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
 #endif
 
-void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR)
+void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR)
 {
     CV_Assert(convMR == 4 && convNR == 24);
     __m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00;
@@ -121,16 +131,11 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
     _mm256_zeroupper();
 }
 
-#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+#endif
 
-CV_CPU_OPTIMIZATION_NAMESPACE_END
+#if CV_NEON
 
-// NEON code work around.
-namespace opt_NEON
-{
-#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON
-
-void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR)
+void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR)
 {
 #if CV_NEON_AARCH64
     if (convMR == 4 && convNR == 28) // AARCH64
@@ -298,104 +303,104 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i
     }
     else
 #endif
-    if (convMR == 4 && convNR == 12) // ARMv7
-    {
-        float32x4_t c0 = vdupq_n_f32(0.f), c1 = c0, c2 = c0;
-        float32x4_t c3 = vdupq_n_f32(0.f), c4 = c3, c5 = c3;
-        float32x4_t c6 = vdupq_n_f32(0.f), c7 = c6, c8 = c6;
-        float32x4_t c9 = vdupq_n_f32(0.f), c10 = c9, c11 = c9;
-
-        float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0;
-        float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);
-
-        if (width > 8)
+        if (convMR == 4 && convNR == 12) // ARMv7
         {
-            for (int p = 0; p < np; p++, a += convMR, b += convNR)
-            {
-                a0 = vld1_f32(a), a1 = vld1_f32(a+2);
-                b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
-
-                c0 = vmlaq_lane_f32(c0, b0, a0, 0);
-                c1 = vmlaq_lane_f32(c1, b1, a0, 0);
-                c2 = vmlaq_lane_f32(c2, b2, a0, 0);
-
-                c3 = vmlaq_lane_f32(c3, b0, a0, 1);
-                c4 = vmlaq_lane_f32(c4, b1, a0, 1);
-                c5 = vmlaq_lane_f32(c5, b2, a0, 1);
+            float32x4_t c0 = vdupq_n_f32(0.f), c1 = c0, c2 = c0;
+            float32x4_t c3 = vdupq_n_f32(0.f), c4 = c3, c5 = c3;
+            float32x4_t c6 = vdupq_n_f32(0.f), c7 = c6, c8 = c6;
+            float32x4_t c9 = vdupq_n_f32(0.f), c10 = c9, c11 = c9;
 
-                c6 = vmlaq_lane_f32(c6, b0, a1, 0);
-                c7 = vmlaq_lane_f32(c7, b1, a1, 0);
-                c8 = vmlaq_lane_f32(c8, b2, a1, 0);
+            float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0;
+            float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);
 
-                c9  = vmlaq_lane_f32(c9 , b0, a1, 1);
-                c10 = vmlaq_lane_f32(c10, b1, a1, 1);
-                c11 = vmlaq_lane_f32(c11, b2, a1, 1);
+            if (width > 8)
+            {
+                for (int p = 0; p < np; p++, a += convMR, b += convNR)
+                {
+                    a0 = vld1_f32(a), a1 = vld1_f32(a+2);
+                    b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
+
+                    c0 = vmlaq_lane_f32(c0, b0, a0, 0);
+                    c1 = vmlaq_lane_f32(c1, b1, a0, 0);
+                    c2 = vmlaq_lane_f32(c2, b2, a0, 0);
+
+                    c3 = vmlaq_lane_f32(c3, b0, a0, 1);
+                    c4 = vmlaq_lane_f32(c4, b1, a0, 1);
+                    c5 = vmlaq_lane_f32(c5, b2, a0, 1);
+
+                    c6 = vmlaq_lane_f32(c6, b0, a1, 0);
+                    c7 = vmlaq_lane_f32(c7, b1, a1, 0);
+                    c8 = vmlaq_lane_f32(c8, b2, a1, 0);
+
+                    c9  = vmlaq_lane_f32(c9 , b0, a1, 1);
+                    c10 = vmlaq_lane_f32(c10, b1, a1, 1);
+                    c11 = vmlaq_lane_f32(c11, b2, a1, 1);
+                }
             }
-        }
-        else if (width > 4)
-        {
-            for (int p = 0; p < np; p++, a += convMR, b += convNR)
+            else if (width > 4)
             {
-                a0 = vld1_f32(a), a1 = vld1_f32(a+2);
-                b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4);
+                for (int p = 0; p < np; p++, a += convMR, b += convNR)
+                {
+                    a0 = vld1_f32(a), a1 = vld1_f32(a+2);
+                    b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4);
 
-                c0 = vmlaq_lane_f32(c0, b0, a0, 0);
-                c1 = vmlaq_lane_f32(c1, b1, a0, 0);
+                    c0 = vmlaq_lane_f32(c0, b0, a0, 0);
+                    c1 = vmlaq_lane_f32(c1, b1, a0, 0);
 
-                c3 = vmlaq_lane_f32(c3, b0, a0, 1);
-                c4 = vmlaq_lane_f32(c4, b1, a0, 1);
+                    c3 = vmlaq_lane_f32(c3, b0, a0, 1);
+                    c4 = vmlaq_lane_f32(c4, b1, a0, 1);
 
-                c6 = vmlaq_lane_f32(c6, b0, a1, 0);
-                c7 = vmlaq_lane_f32(c7, b1, a1, 0);
+                    c6 = vmlaq_lane_f32(c6, b0, a1, 0);
+                    c7 = vmlaq_lane_f32(c7, b1, a1, 0);
 
-                c9  = vmlaq_lane_f32(c9 , b0, a1, 1);
-                c10 = vmlaq_lane_f32(c10, b1, a1, 1);
+                    c9  = vmlaq_lane_f32(c9 , b0, a1, 1);
+                    c10 = vmlaq_lane_f32(c10, b1, a1, 1);
+                }
             }
-        }
-        else
-        {
-            for (int p = 0; p < np; p++, a += convMR, b += convNR)
+            else
             {
-                a0 = vld1_f32(a), a1 = vld1_f32(a+2);
-                b0 = vld1q_f32(b);
-
-                c0 = vmlaq_lane_f32(c0, b0, a0, 0);
-                c3 = vmlaq_lane_f32(c3, b0, a0, 1);
-                c6 = vmlaq_lane_f32(c6, b0, a1, 0);
-                c9  = vmlaq_lane_f32(c9 , b0, a1, 1);
+                for (int p = 0; p < np; p++, a += convMR, b += convNR)
+                {
+                    a0 = vld1_f32(a), a1 = vld1_f32(a+2);
+                    b0 = vld1q_f32(b);
+
+                    c0 = vmlaq_lane_f32(c0, b0, a0, 0);
+                    c3 = vmlaq_lane_f32(c3, b0, a0, 1);
+                    c6 = vmlaq_lane_f32(c6, b0, a1, 0);
+                    c9  = vmlaq_lane_f32(c9 , b0, a1, 1);
+                }
             }
-        }
 
-        if (!init_c)
-        {
-            c0 = vaddq_f32(c0, vld1q_f32(c));
-            c1 = vaddq_f32(c1, vld1q_f32(c + 4));
-            c2 = vaddq_f32(c2, vld1q_f32(c + 8));
+            if (!init_c)
+            {
+                c0 = vaddq_f32(c0, vld1q_f32(c));
+                c1 = vaddq_f32(c1, vld1q_f32(c + 4));
+                c2 = vaddq_f32(c2, vld1q_f32(c + 8));
 
-            c3 = vaddq_f32(c3, vld1q_f32(c + ldc));
-            c4 = vaddq_f32(c4, vld1q_f32(c + ldc + 4));
-            c5 = vaddq_f32(c5, vld1q_f32(c + ldc + 8));
+                c3 = vaddq_f32(c3, vld1q_f32(c + ldc));
+                c4 = vaddq_f32(c4, vld1q_f32(c + ldc + 4));
+                c5 = vaddq_f32(c5, vld1q_f32(c + ldc + 8));
 
-            c6 = vaddq_f32(c6, vld1q_f32(c + ldc * 2));
-            c7 = vaddq_f32(c7, vld1q_f32(c + ldc * 2 + 4));
-            c8 = vaddq_f32(c8, vld1q_f32(c + ldc * 2 + 8));
+                c6 = vaddq_f32(c6, vld1q_f32(c + ldc * 2));
+                c7 = vaddq_f32(c7, vld1q_f32(c + ldc * 2 + 4));
+                c8 = vaddq_f32(c8, vld1q_f32(c + ldc * 2 + 8));
 
-            c9  = vaddq_f32(c9 , vld1q_f32(c + ldc * 3));
-            c10 = vaddq_f32(c10, vld1q_f32(c + ldc * 3 + 4));
-            c11 = vaddq_f32(c11, vld1q_f32(c + ldc * 3 + 8));
-        }
+                c9  = vaddq_f32(c9 , vld1q_f32(c + ldc * 3));
+                c10 = vaddq_f32(c10, vld1q_f32(c + ldc * 3 + 4));
+                c11 = vaddq_f32(c11, vld1q_f32(c + ldc * 3 + 8));
+            }
 
-        vst1q_f32(c, c0), vst1q_f32(c+4, c1), vst1q_f32(c+8, c2);
-        vst1q_f32(c + ldc, c3), vst1q_f32(c + ldc + 4, c4), vst1q_f32(c + ldc + 8, c5);
-        vst1q_f32(c + ldc*2, c6), vst1q_f32(c + ldc*2 + 4, c7), vst1q_f32(c + ldc*2 + 8, c8);
-        vst1q_f32(c + ldc*3, c9), vst1q_f32(c + ldc*3 + 4, c10), vst1q_f32(c + ldc*3 + 8, c11);
-    }
-    else
-        CV_Error(Error::StsNotImplemented, "Unsupported convMR and/or convNR in opt_NEON::convBlock");
+            vst1q_f32(c, c0), vst1q_f32(c+4, c1), vst1q_f32(c+8, c2);
+            vst1q_f32(c + ldc, c3), vst1q_f32(c + ldc + 4, c4), vst1q_f32(c + ldc + 8, c5);
+            vst1q_f32(c + ldc*2, c6), vst1q_f32(c + ldc*2 + 4, c7), vst1q_f32(c + ldc*2 + 8, c8);
+            vst1q_f32(c + ldc*3, c9), vst1q_f32(c + ldc*3 + 4, c10), vst1q_f32(c + ldc*3 + 8, c11);
+        }
+        else
+            CV_Error(Error::StsNotImplemented, "Unsupported convMR and/or convNR in opt_NEON::convBlock");
 }
 
 void convBlockMR1_F32(int np, const float * a, const float * b, float *c, const float bias, bool init_c,
-                  const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR)
+                      const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR)
 {
     CV_Assert(convNR == 28);
     float32x4_t c0 = vdupq_n_f32(bias), c1 = c0, c2 = c0;
@@ -452,13 +457,13 @@ void convBlockMR1_F32(int np, const float * a, const float * b, float *c, const
 
     if (init_c)
     {
-        c0 += vld1q_f32(c);
-        c1 += vld1q_f32(c + 4);
-        c2 += vld1q_f32(c + 8);
-        c3 += vld1q_f32(c + 12);
-        c4 += vld1q_f32(c + 16);
-        c5 += vld1q_f32(c + 20);
-        c6 += vld1q_f32(c + 24);
+        c0 = vaddq_f32(c0, vld1q_f32(c));
+        c1 = vaddq_f32(c1, vld1q_f32(c + 4));
+        c2 = vaddq_f32(c2, vld1q_f32(c + 8));
+        c3 = vaddq_f32(c3, vld1q_f32(c + 12));
+        c4 = vaddq_f32(c4, vld1q_f32(c + 16));
+        c5 = vaddq_f32(c5, vld1q_f32(c + 20));
+        c6 = vaddq_f32(c6, vld1q_f32(c + 24));
     }
 
     if (ifMinMaxAct)
@@ -482,22 +487,16 @@ void convBlockMR1_F32(int np, const float * a, const float * b, float *c, const
     vst1q_f32(c + 20, c5);
     vst1q_f32(c + 24, c6);
 }
+#endif
 
-#if CV_NEON_AARCH64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-// Fix conflict between float16_t in arm_neon.h and float16_t in cvdef.h.
-typedef __fp16 float16_t;
+#if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 
-#ifndef __ARM_FEATURE_FMA // Work around without FMA support.
-#define vfmaq_f16(a, b, c) (a + b * c)
-#endif
-void convBlock_FP16(int np, const char * _a, const char * _b, char * _c, int ldc, bool init_c, int width,
+void convBlock_F16(int np, const char * _a, const char * _b, char * _c, int ldc, bool init_c, int width,
                     const int convMR_fp16, const int convNR_fp16)
 {
-#if 1
-    const float16_t* a = (const float16_t*)_a;
-    const float16_t* b = (const float16_t*)_b;
-    float16_t* c = (float16_t*)_c;
-
+    const __fp16* a = (const __fp16*)_a;
+    const __fp16* b = (const __fp16*)_b;
+    __fp16* c = (__fp16*)_c;
     CV_Assert(convMR_fp16 == 8 && convNR_fp16 == 24);
 
     float16x8_t c00 = vdupq_n_f16(0), c01 = c00, c02 = c00;
@@ -603,8 +602,8 @@ void convBlock_FP16(int np, const char * _a, const char * _b, char * _c, int ldc
 
     if (!init_c)
     {
-#undef _FX_UPDATE_CBUF_ROW
-#define _FX_UPDATE_CBUF_ROW(row) \
+        #undef _FX_UPDATE_CBUF_ROW
+        #define _FX_UPDATE_CBUF_ROW(row) \
         c##row##0 = c##row##0 + vld1q_f16(c + row*ldc); \
         c##row##1 = c##row##1 + vld1q_f16(c + row*ldc + 8); \
         c##row##2 = c##row##2 + vld1q_f16(c + row*ldc + 16)
@@ -619,8 +618,8 @@ void convBlock_FP16(int np, const char * _a, const char * _b, char * _c, int ldc
         _FX_UPDATE_CBUF_ROW(7);
     }
 
-#undef _FX_STORE_CBUF_ROW
-#define _FX_STORE_CBUF_ROW(row) \
+    #undef _FX_STORE_CBUF_ROW
+    #define _FX_STORE_CBUF_ROW(row) \
     vst1q_f16(c + row*ldc, c##row##0); \
     vst1q_f16(c + row*ldc + 8, c##row##1); \
     vst1q_f16(c + row*ldc + 16, c##row##2)
@@ -633,51 +632,16 @@ void convBlock_FP16(int np, const char * _a, const char * _b, char * _c, int ldc
     _FX_STORE_CBUF_ROW(5);
     _FX_STORE_CBUF_ROW(6);
     _FX_STORE_CBUF_ROW(7);
-#else
-    // reference only.
-    const float16_t* a = (const float16_t*)_a;
-    const float16_t* b = (const float16_t*)_b;
-    float16_t* c = (float16_t*)_c;
-    float cbuf[convMR_fp16*convNR_fp16];
-    memset(cbuf, 0, sizeof(cbuf));
-
-    for( int p = 0; p < np; p++ )
-    {
-        for( int i = 0; i < convMR_fp16; i++ )
-        {
-            float ai = float(a[convMR_fp16*p + i]);
-                for( int j = 0; j < convNR_fp16; j++ )
-                    cbuf[i*convNR_fp16+j] += float(b[convNR_fp16*p + j]) * ai;
-        }
-    }
-
-    if (!init_c)
-    {
-    for(int i = 0; i < convMR_fp16; i++)
-        {
-            for(int j = 0; j < convNR_fp16; j++)
-                c[i*ldc + j] = float16_t(float(c[i*ldc + j]) + cbuf[i*convNR_fp16 + j]);
-        }
-    }
-    else
-    {
-        for(int i = 0; i < convMR_fp16; i++)
-        {
-            for(int j = 0; j < convNR_fp16; j++)
-                c[i*ldc + j] = (float16_t)(cbuf[i*convNR_fp16 + j]);
-        }
-    }
-#endif
 }
 
-void convBlockMR1_FP16(int np, const char* _a, const char* _b, float *c, const float _bias, bool init_c,
-                            const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR_FP16)
+void convBlockMR1_F16(int np, const char* _a, const char* _b, float *c, const float _bias, bool init_c,
+                       const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR_FP16)
 {
     CV_Assert(convNR_FP16 == 24); // CONV_NR_FP16 = 24
-    const float16_t* a = (const float16_t*)_a;
-    const float16_t* b = (const float16_t*)_b;
+    const __fp16* a = (const __fp16*)_a;
+    const __fp16* b = (const __fp16*)_b;
 
-    const float16_t bias = (float16_t)_bias;
+    const __fp16 bias = (__fp16)_bias;
 
     float16x8_t c0 = vdupq_n_f16(bias), c1 = c0, c2 = c0;
 
@@ -685,7 +649,7 @@ void convBlockMR1_FP16(int np, const char* _a, const char* _b, float *c, const f
     {
         for (int p = 0; p < np; p++, a++, b += convNR_FP16)
         {
-            float16x8_t a0= vdupq_n_f16(a[0]);
+            float16x8_t a0 = vdupq_n_f16(a[0]);
             float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8), b2 = vld1q_f16(b + 16);
 
             c0 = vfmaq_f16(c0, a0, b0);
@@ -754,6 +718,7 @@ void convBlockMR1_FP16(int np, const char* _a, const char* _b, float *c, const f
 }
 #endif
 
-#endif
-}
+#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
 }} // namespace cv::dnn
diff --git a/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp
index 3e969336ad57..8c1c643abeb2 100644
--- a/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp
@@ -92,7 +92,7 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& c
         ofstab[k] = dy * Wi + dx;
     }
 
-    const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data();
+    const float *weights0 = conv->getWeights(), *bias = conv->biasBuf.data();
     const float* relu = reluslope.data();
     CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0));
 
@@ -236,13 +236,11 @@ void depthWiseBlockConv2D(const float* wptr,
                             v21 = v_load(imgptr2 + in_j + dilation_w),
                             v22 = v_load(imgptr2 + in_j + dilation_w*2);
 
-                    v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 +
-                                     v10*vw10 + v11*vw11 + v12*vw12 +
-                                     v20*vw20 + v21*vw21 + v22*vw22 + vbias;
+                    v_float32x4 vout = v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), v_mul(v10, vw10)), v_mul(v11, vw11)), v_mul(v12, vw12)), v_mul(v20, vw20)), v_mul(v21, vw21)), v_mul(v22, vw22)), vbias);
                     if (fusedAdd)
-                        vout = v_load(outptr + out_j) + vout;
+                        vout = v_add(v_load(outptr + out_j), vout);
                     if (relu)
-                        vout = v_select(vout > z, vout, vout*vrc);
+                        vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc));
                     v_store(outptr + out_j, vout);
                 }
             }
@@ -268,14 +266,12 @@ void depthWiseBlockConv2D(const float* wptr,
                     v_load_deinterleave(imgptr2 + in_j, v20, v21);
                     v_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
 
-                    v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 +
-                            v10 * vw10 + v11 * vw11 + v12 * vw12 +
-                            v20 * vw20 + v21 * vw21 + v22 * vw22 + vbias;
+                    v_float32x4 vout = v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), v_mul(v10, vw10)), v_mul(v11, vw11)), v_mul(v12, vw12)), v_mul(v20, vw20)), v_mul(v21, vw21)), v_mul(v22, vw22)), vbias);
 
                     if (fusedAdd)
-                        vout = v_load(outptr + out_j) + vout;
+                        vout = v_add(v_load(outptr + out_j), vout);
                     if (relu)
-                        vout = v_select(vout > z, vout, vout*vrc);
+                        vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc));
                     v_store(outptr + out_j, vout);
                 }
             }
@@ -381,11 +377,11 @@ void depthWiseBlockConv1D(const float* wptr,
                         v01 = v_load(imgptr0 + in_j + dilation_w),
                         v02 = v_load(imgptr0 + in_j + dilation_w*2);
 
-                v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + vbias;
+                v_float32x4 vout = v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), vbias);
                 if (fusedAdd)
-                    vout = v_load(outptr + out_j) + vout;
+                    vout = v_add(v_load(outptr + out_j), vout);
                 if (relu)
-                    vout = v_select(vout > z, vout, vout*vrc);
+                    vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc));
                 v_store(outptr + out_j, vout);
             }
         }
@@ -407,13 +403,13 @@ void depthWiseBlockConv1D(const float* wptr,
                 v_load_deinterleave(imgptr0 + in_j, v00, v01);
                 v_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
 
-                v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 + vbias;
+                v_float32x4 vout = v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), vbias);
 
                 if (fusedAdd)
-                    vout = v_load(outptr + out_j) + vout;
+                    vout = v_add(v_load(outptr + out_j), vout);
 
                 if (relu)
-                    vout = v_select(vout > z, vout, vout*vrc);
+                    vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc));
                 v_store(outptr + out_j, vout);
             }
         }
diff --git a/modules/dnn/src/layers/cpu_kernels/conv_depthwise.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.simd.hpp
index 1d561e986425..6d4b211b8ce5 100644
--- a/modules/dnn/src/layers/cpu_kernels/conv_depthwise.simd.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.simd.hpp
@@ -209,34 +209,6 @@ void fastDepthwiseConv( const float* wptr,
 
 #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV
 
-/*
-Example for load_deinterleave:
-    input: ptr[16] = {1,2,3, ... ,14,15,16}
-    output: a = {1, 3, 5, 7, 9, 11, 13, 15}
-    output: b = {2, 4, 6, 8,10, 12, 14, 16}
-*/
-static inline void vfloat32m2_load_deinterleave(const float* ptr, vfloat32m2_t& a, vfloat32m2_t& b, int vl)
-{
-    vuint64m4_t mask = vmv_v_x_u64m4(1,vl*2);
-    vuint32m4_t mask_re = vreinterpret_v_u64m4_u32m4(mask);
-    vbool8_t mask0 = vmseq_vx_u32m4_b8 (mask_re, 1, vl*2);
-    vbool8_t mask1 = vmseq_vx_u32m4_b8 (mask_re, 0, vl*2);
-    vfloat32m4_t tempa = vundefined_f32m4(), tempb = vundefined_f32m4();
-    vfloat32m4_t vw = vle32_v_f32m4(ptr, vl*2);
-    tempa = vcompress_vm_f32m4(mask0, tempa, vw, vl*2);
-    tempb = vcompress_vm_f32m4(mask1, tempb, vw, vl*2);
-    /* The following instructions have not to be supported by the GNU toolchain.
-       So we temporarily use store and load instead.
-    // a = vlmul_trunc_v_f32m4_f32m2(tempa);
-    // b = vlmul_trunc_v_f32m4_f32m2(tempb);
-    */
-    cv::AutoBuffer<float> cvBuffer(sizeof(float)*vl*2);
-    float* buffer = (float*)cvBuffer.data();
-    vse32_v_f32m4(buffer, tempa, vl);
-    a = vle32_v_f32m2(buffer, vl);
-    vse32_v_f32m4(buffer, tempb, vl);
-    b = vle32_v_f32m2(buffer, vl);
-}
 
 void fastDepthwiseConv( const float* wptr,
                      int kernel_h, int kernel_w,
@@ -292,64 +264,40 @@ void fastDepthwiseConv( const float* wptr,
             if( stride_w == 1 )
                 for( ; out_j < outW1; out_j += vl, avl -= vl)
                 {
-                    vl = vsetvl_e32m2(avl);
+                    vl = vsetvl_e32m8(avl);
                     int in_j = out_j * stride_w - pad_l;
-                    vfloat32m2_t v00 = vle32_v_f32m2(imgptr0 + in_j, vl),
-                           v01 = vle32_v_f32m2(imgptr0 + in_j + dilation_w, vl),
-                           v02 = vle32_v_f32m2(imgptr0 + in_j + dilation_w*2, vl),
-                           v10 = vle32_v_f32m2(imgptr1 + in_j, vl),
-                           v11 = vle32_v_f32m2(imgptr1 + in_j + dilation_w, vl),
-                           v12 = vle32_v_f32m2(imgptr1 + in_j + dilation_w*2, vl),
-                           v20 = vle32_v_f32m2(imgptr2 + in_j, vl),
-                           v21 = vle32_v_f32m2(imgptr2 + in_j + dilation_w, vl),
-                           v22 = vle32_v_f32m2(imgptr2 + in_j + dilation_w*2, vl);
-
-                    vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl);
-                    vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl);
-                    vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl);
-                    vout0 = vfadd_vf_f32m2(vout0, bias, vl);
-
-                    vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl);
-                    vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl);
-                    vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl);
-
-                    vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl);
-                    vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl);
-                    vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl);
-
-                    vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
+                    vfloat32m8_t vout0 = vfmacc_vf_f32m8(vfmv_v_f_f32m8(bias, vl), w00, vle32_v_f32m8(imgptr0 + in_j, vl), vl);
+                    vout0 = vfmacc_vf_f32m8(vout0, w01, vle32_v_f32m8(imgptr0 + in_j + dilation_w, vl), vl);
+                    vout0 = vfmacc_vf_f32m8(vout0, w02, vle32_v_f32m8(imgptr0 + in_j + dilation_w*2, vl), vl);
+                    vout0 = vfmacc_vf_f32m8(vout0, w10, vle32_v_f32m8(imgptr1 + in_j, vl),vl);
+                    vout0 = vfmacc_vf_f32m8(vout0, w11, vle32_v_f32m8(imgptr1 + in_j + dilation_w, vl),vl);
+                    vout0 = vfmacc_vf_f32m8(vout0, w12, vle32_v_f32m8(imgptr1 + in_j + dilation_w*2, vl),vl);
+                    vout0 = vfmacc_vf_f32m8(vout0, w20, vle32_v_f32m8(imgptr2 + in_j, vl), vl);
+                    vout0 = vfmacc_vf_f32m8(vout0, w21, vle32_v_f32m8(imgptr2 + in_j + dilation_w, vl), vl);
+                    vout0 = vfmacc_vf_f32m8(vout0, w22, vle32_v_f32m8(imgptr2 + in_j + dilation_w*2, vl), vl);
                     if (relu)
                     {
-                        vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl);
-                        vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl);
+                        vbool4_t m = vmfgt_vf_f32m8_b4(vout0, 0, vl);
+                        vout0 = vmerge_vvm_f32m8(m, vfmul_vf_f32m8(vout0, relu_coeff, vl), vout0, vl);
                     }
-                    vse32_v_f32m2(outptr + out_j, vout0, vl);
+                    vse32_v_f32m8(outptr + out_j, vout0, vl);
                 }
             else //stride_w == 2 && dilation_w == 1
                 for( ; out_j < outW1; out_j += vl, avl -= vl)
                 {
                     vl = vsetvl_e32m2(avl);
                     int in_j = out_j * stride_w - pad_l;
-                    vfloat32m2_t v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
-                    vfloat32m2_load_deinterleave(imgptr0 + in_j, v00, v01, vl);
-                    vfloat32m2_load_deinterleave(imgptr0 + in_j + 2, v02, unused, vl);
-                    vfloat32m2_load_deinterleave(imgptr1 + in_j, v10, v11, vl);
-                    vfloat32m2_load_deinterleave(imgptr1 + in_j + 2, v12, unused, vl);
-                    vfloat32m2_load_deinterleave(imgptr2 + in_j, v20, v21, vl);
-                    vfloat32m2_load_deinterleave(imgptr2 + in_j + 2, v22, unused, vl);
-
-                    vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl);
-                    vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl);
-                    vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl);
-                    vout0 = vfadd_vf_f32m2(vout0, bias, vl);
-
-                    vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl);
-                    vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl);
-                    vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl);
-
-                    vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl);
-                    vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl);
-                    vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl);
+                    vfloat32m2_t vout0 = vfmacc_vf_f32m2(vfmv_v_f_f32m2(bias, vl), w00, vlse32_v_f32m2(imgptr0+in_j  , 8, vl), vl);
+                    vfloat32m2_t vout1 = vfmul_vf_f32m2(vlse32_v_f32m2(imgptr0+in_j+1, 8, vl), w01, vl);
+                    vfloat32m2_t vout2 = vfmul_vf_f32m2(vlse32_v_f32m2(imgptr0+in_j+2, 8, vl), w02, vl);
+
+                    vout0 = vfmacc_vf_f32m2(vout0, w10, vlse32_v_f32m2(imgptr1+in_j  , 8, vl), vl);
+                    vout1 = vfmacc_vf_f32m2(vout1, w11, vlse32_v_f32m2(imgptr1+in_j+1, 8, vl), vl);
+                    vout2 = vfmacc_vf_f32m2(vout2, w12, vlse32_v_f32m2(imgptr1+in_j+2, 8, vl), vl);
+
+                    vout0 = vfmacc_vf_f32m2(vout0, w20, vlse32_v_f32m2(imgptr2+in_j  , 8, vl), vl);
+                    vout1 = vfmacc_vf_f32m2(vout1, w21, vlse32_v_f32m2(imgptr2+in_j+1, 8, vl), vl);
+                    vout2 = vfmacc_vf_f32m2(vout2, w22, vlse32_v_f32m2(imgptr2+in_j+2, 8, vl), vl);
 
                     vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
                     if (relu)
diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
index a18943994ca7..46e220e69f3a 100644
--- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
@@ -20,15 +20,15 @@ namespace cv { namespace dnn {
 #if CV_NEON || CV_SIMD128 || CV_TRY_AVX2
 enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.
 
-void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
+void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
                             const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);
 
 /*Input transform*/
-void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
+void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
                           float* outptr, int Cg, const int winoIblock, const int winoAtomF32);
 
 /*Output transform*/
-void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
+void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
                           float bias, float minval, float maxval, bool ifMinMaxAct);
 
 int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
@@ -67,6 +67,28 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
 #endif
     const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
 
+    int CONV_WINO_ATOM = CONV_WINO_ATOM_F32;
+    int CONV_WINO_NATOMS = CONV_WINO_NATOMS_F32;
+
+#ifdef CONV_ARM_FP16
+    // FP 16
+    const int CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2;
+    const int CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16;
+#endif
+
+    int esz = sizeof(float );
+
+#ifdef CONV_ARM_FP16
+    const bool useFP16 = conv->useFP16;
+    if (useFP16)
+    {
+        // works at FP 16.
+        CONV_WINO_ATOM = CONV_WINO_ATOM_F16;
+        CONV_WINO_NATOMS = CONV_WINO_NATOMS_F16;
+        esz = sizeof(__fp16);
+    }
+#endif
+
     int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK;
     const size_t inp_planesize = (size_t)Hi*Wi;
     const size_t out_planesize = (size_t)H0*W0;
@@ -78,9 +100,9 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
 
     size_t totalbufsize = (size_t)N*C*blocks_per_plane_aligned*CONV_WINO_AREA;
 
-    AutoBuffer<float> _buf;
-    _buf.allocate(totalbufsize + VEC_ALIGN);
-    float* wbuf_all = alignPtr(_buf.data(), VEC_ALIGN);
+    AutoBuffer<char> _buf;
+    _buf.allocate((totalbufsize + VEC_ALIGN) * esz);
+    char* wbuf_all = alignPtr(_buf.data(), VEC_ALIGN * esz);
 
     float* inp = input.ptr<float>();
     float* out = output.ptr<float>();
@@ -104,14 +126,15 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
             int c = nc0 - n*C;
             int g = c / Cg;
             c -= g*Cg;
+
             for (int block_id = 0; block_id < blocks_per_plane; block_id += CONV_WINO_IBLOCK)
             {
                 for (int db = 0; db < CONV_WINO_IBLOCK; db++)
                 {
                     size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned +
                                      block_id)*Cg*CONV_WINO_AREA +
-                                    (c*CONV_WINO_IBLOCK + db)*CONV_WINO_ATOM_F32;
-                    float* inwptr = (float*)wbuf_all + inwofs;
+                                    (c*CONV_WINO_IBLOCK + db) * CONV_WINO_ATOM;
+                    char* inwptr = wbuf_all + inwofs * esz;
 
                     if (block_id + db < blocks_per_plane)
                     {
@@ -152,27 +175,40 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
                             inptr = inpbuf;
                             inpstep = CONV_WINO_SIZE;
                         }
+
 #if CV_TRY_AVX2
                         if (conv->useAVX2)
-                            opt_AVX2::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32);
+                            opt_AVX2::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
                         else
 #endif
 #if CV_TRY_AVX
                         if (conv->useAVX)
-                            opt_AVX::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32);
+                            opt_AVX::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
                         else
 #endif
 #if CV_NEON && CV_NEON_AARCH64
                         if (conv->useNEON)
-                            opt_NEON::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32);
+                        {
+#ifdef CONV_ARM_FP16
+                            if (useFP16)
+                            {
+                                opt_NEON_FP16::winofunc_BtXB_8x8_F16(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK,
+                                                                CONV_WINO_ATOM);
+                            }
+                            else
+#endif
+                            opt_NEON::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK,
+                                                            CONV_WINO_ATOM);
+                        }
                         else
 #endif
-                        winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32);
+                        winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM);
+
                     }
                     else
                     {
-                        for (int i = 0; i < CONV_WINO_NATOMS_F32; i++, inwptr += CONV_WINO_IBLOCK*CONV_WINO_ATOM_F32)
-                            memset(inwptr, 0, CONV_WINO_ATOM_F32*sizeof(inwptr[0]));
+                        for (int i = 0; i < CONV_WINO_NATOMS; i++, inwptr += CONV_WINO_IBLOCK * CONV_WINO_ATOM * esz)
+                            memset(inwptr, 0, CONV_WINO_ATOM * esz);
                     }
                 }
             }
@@ -182,19 +218,37 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
     // Phase 2. compute elemwise-weighted sums of transformed blocks,
     // apply inverse Winograd transforms to the sums,
     // add bias, apply activation function if any and store the results.
+    char* wptr0 = nullptr;
+#ifdef CONV_ARM_FP16
+    if (useFP16)
+    {
+        CV_Assert(!conv->weightsWinoBuf_FP16.empty());
+        wptr0 = (char *)conv->getWeightsWinoFP16();
+    }
+    else
+#endif
+    {
+        CV_Assert(!conv->weightsWinoBuf.empty());
+        wptr0 = (char *)conv->getWeightsWino();
+    }
+
     parallel_for_(Range(0, ntasks), [&](const Range& r0) {
     for (int task_id = r0.start; task_id < r0.end; task_id++)
     {
-        size_t out_wbuf_size = CONV_WINO_AREA*CONV_WINO_KBLOCK*CONV_WINO_IBLOCK;
+        size_t out_wbuf_size = CONV_WINO_AREA * CONV_WINO_KBLOCK * CONV_WINO_IBLOCK;
         size_t outbuf_size = CONV_WINO_AREA;
-        AutoBuffer<float> out_wbuf_, outbuf_;
-        out_wbuf_.allocate(out_wbuf_size + VEC_ALIGN);
-        float* out_wbuf = alignPtr(out_wbuf_.data(), VEC_ALIGN);
+
+        // For saving the accumulation output.
+        AutoBuffer<char> out_wbuf_;
+        out_wbuf_.allocate((out_wbuf_size + VEC_ALIGN) * esz);
+        char* out_wbuf = alignPtr(out_wbuf_.data(), VEC_ALIGN * esz);
+        memset(out_wbuf, 0, out_wbuf_size * esz);
+
+        // For saving the fuse_Add data.
+        AutoBuffer<float> outbuf_;
         outbuf_.allocate(outbuf_size + VEC_ALIGN);
         float* outbuf = alignPtr(outbuf_.data(), VEC_ALIGN);
-
-        memset(out_wbuf, 0, out_wbuf_size * sizeof(float));
-        memset(outbuf, 0, outbuf_size * sizeof(float));
+        memset(outbuf, 0, outbuf_size * sizeof(outbuf[0]));
 
         int ngk0 = (int)(((int64_t)N*Kg_nblocks*ngroups)*task_id/ntasks);
         int ngk1 = (int)(((int64_t)N*Kg_nblocks*ngroups)*(task_id+1)/ntasks);
@@ -214,30 +268,40 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
                 size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned + block_id0)*Cg*CONV_WINO_AREA;
                 size_t wofs = (g*Kg_nblocks*CONV_WINO_KBLOCK + k0)*Cg*CONV_WINO_AREA;
 
-                float* inwptr = wbuf_all + inwofs;
-                const float* wptr = conv->weightsWinoBufPtr + wofs;
+                char* inwptr = wbuf_all + inwofs * esz;
+                char* wptr = wptr0 + wofs * esz;
 
 #if CV_TRY_AVX2
                 if (conv->useAVX2)
-                    opt_AVX2::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
-                                       CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32);
+                    opt_AVX2::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
+                                       CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
                 else
 #endif
 #if CV_TRY_AVX
                 if (conv->useAVX)
-                    opt_AVX::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
-                                       CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32);
+                    opt_AVX::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
+                                       CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
                 else
 #endif
 #if CV_NEON && CV_NEON_AARCH64
                 if (conv->useNEON)
-                    opt_NEON::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
-                                       CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32);
+                {
+#ifdef CONV_ARM_FP16
+                    if (useFP16)
+                    {
+                        opt_NEON_FP16::winofunc_accum_F16(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
+                                                     CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
+                    }
+                    else
+#endif
+                    opt_NEON::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
+                                                 CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
+                }
                 else
 #endif
+                winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
+                                       CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS);
 
-                winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK,
-                                       CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32);
                 for (int k = k0; k < k1; k++)
                 {
                     float biasv = conv->biasBuf[g*Kg + k];
@@ -274,31 +338,42 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
                         }
 #if CV_TRY_AVX2
                         if (conv->useAVX2)
-                            opt_AVX::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
+                            opt_AVX2::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
                                                                 bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
                         else
 #endif
 #if CV_TRY_AVX
                         if (conv->useAVX)
-                            opt_AVX::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
+                            opt_AVX::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
                                                                 bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
                         else
 #endif
 #if CV_NEON && CV_NEON_AARCH64
+                        // NEON optimization is only for ARMv8 device, and for ARMv7 device, we use the Universal intrinsics.
                         if (conv->useNEON)
-                            // NEON optimization is only for ARMv8 device, and for ARMv7 device, we use the Universal intrinsics.
-                            opt_NEON::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
+                        {
+#ifdef CONV_ARM_FP16
+                            if (useFP16)
+                            {
+                                opt_NEON_FP16::winofunc_AtXA_8x8_F16(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA * esz, CONV_WINO_SIZE,
                                                                 bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
+                            }
+                            else
+#endif
+                            opt_NEON::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
+                                                            bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
+                        }
                         else
 #endif
-                        winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
+                        winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
                                                   bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
+
                         if (partial)
                         {
                             if (activ)
                                 activ->forwardSlice(outptr, outptr, CONV_WINO_SIZE*CONV_WINO_STEP, 0, g*Kg + k, g*Kg + k + 1);
                             for (int y = 0; y < dy1; y++)
-                                memcpy(outptr0 + y*W0, outptr + y*CONV_WINO_SIZE,dx1*sizeof(outptr0[0]));
+                                memcpy(outptr0 + y*W0, outptr + y*CONV_WINO_SIZE, dx1*sizeof(outptr0[0]));
                         }
                     }
                 }
@@ -314,7 +389,7 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
 
 #if CV_SIMD128
 
-void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
+void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
                             const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
 {
 #if 1
@@ -411,7 +486,7 @@ void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, i
 }
 
 /*Input transform*/
-void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
+void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
                           float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
 {
     CV_Assert(winoIblock == 3 && winoAtomF32 == 4);
@@ -430,32 +505,32 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
         /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
         v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11;
-        t00 = x40 - x20;
-        t01 = x41 - x21;
-        t10 = x30 - x50;
-        t11 = x31 - x51;
-        v_float32x4 y00 = v_fma(t00, q5_25, x00 - x60);
-        v_float32x4 y01 = v_fma(t01, q5_25, x01 - x61);
-        v_float32x4 y70 = v_fma(t10, q5_25, x70 - x10);
-        v_float32x4 y71 = v_fma(t11, q5_25, x71 - x11);
+        t00 = v_sub(x40, x20);
+        t01 = v_sub(x41, x21);
+        t10 = v_sub(x30, x50);
+        t11 = v_sub(x31, x51);
+        v_float32x4 y00 = v_fma(t00, q5_25, v_sub(x00, x60));
+        v_float32x4 y01 = v_fma(t01, q5_25, v_sub(x01, x61));
+        v_float32x4 y70 = v_fma(t10, q5_25, v_sub(x70, x10));
+        v_float32x4 y71 = v_fma(t11, q5_25, v_sub(x71, x11));
 
         /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
         /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
         v_float32x4 qm4_25 = v_setall_f32(-4.25f);
-        t00 = v_fma(x30, qm4_25, x10 + x50);
-        t01 = v_fma(x31, qm4_25, x11 + x51);
-        t10 = v_fma(x40, qm4_25, x20 + x60);
-        t11 = v_fma(x41, qm4_25, x21 + x61);
+        t00 = v_fma(x30, qm4_25, v_add(x10, x50));
+        t01 = v_fma(x31, qm4_25, v_add(x11, x51));
+        t10 = v_fma(x40, qm4_25, v_add(x20, x60));
+        t11 = v_fma(x41, qm4_25, v_add(x21, x61));
 
-        v_float32x4 y10 = t00 + t10, y11 = t01 + t11;
-        v_float32x4 y20 = t10 - t00, y21 = t11 - t01;
+        v_float32x4 y10 = v_add(t00, t10), y11 = v_add(t01, t11);
+        v_float32x4 y20 = v_sub(t10, t00), y21 = v_sub(t11, t01);
 
         /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
         /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
         v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f);
         v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f);
-        t00 = v_fma(x10, q0_5, x50 + x50);
-        t01 = v_fma(x11, q0_5, x51 + x51);
+        t00 = v_fma(x10, q0_5, v_add(x50, x50));
+        t01 = v_fma(x11, q0_5, v_add(x51, x51));
         t10 = v_fma(x20, q0_25, x60);
         t11 = v_fma(x21, q0_25, x61);
         t00 = v_fma(x30, qm2_5, t00);
@@ -463,14 +538,14 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         t10 = v_fma(x40, qm1_25, t10);
         t11 = v_fma(x41, qm1_25, t11);
 
-        v_float32x4 y30 = t00 + t10, y31 = t01 + t11;
-        v_float32x4 y40 = t10 - t00, y41 = t11 - t01;
+        v_float32x4 y30 = v_add(t00, t10), y31 = v_add(t01, t11);
+        v_float32x4 y40 = v_sub(t10, t00), y41 = v_sub(t11, t01);
 
         /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
         /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
         v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f);
-        t00 = v_fma(x50, q0_5, x10 + x10);
-        t01 = v_fma(x51, q0_5, x11 + x11);
+        t00 = v_fma(x50, q0_5, v_add(x10, x10));
+        t01 = v_fma(x51, q0_5, v_add(x11, x11));
         t10 = v_fma(x20, q4   , x60);
         t11 = v_fma(x21, q4   , x61);
         t00 = v_fma(x30, qm2_5, t00);
@@ -478,8 +553,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         t10 = v_fma(x40, qm5  , t10);
         t11 = v_fma(x41, qm5  , t11);
 
-        v_float32x4 y50 = t00 + t10, y51 = t01 + t11;
-        v_float32x4 y60 = t10 - t00, y61 = t11 - t01;
+        v_float32x4 y50 = v_add(t00, t10), y51 = v_add(t01, t11);
+        v_float32x4 y60 = v_sub(t10, t00), y61 = v_sub(t11, t01);
 
         /* transpose 8x8 matrix with v_transpose4x4 */
 
@@ -491,29 +566,29 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
 
         /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
         /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
-        t00 = y010 - y200;
-        t01 = y410 - y600;
-        t10 = y300 - y110;
-        t11 = y700 - y510;
-        z00 = v_fma(t00, q5_25, y000 - y210);
-        z01 = v_fma(t01, q5_25, y400 - y610);
-        z70 = v_fma(t10, q5_25, y310 - y100);
-        z71 = v_fma(t11, q5_25, y710 - y500);
+        t00 = v_sub(y010, y200);
+        t01 = v_sub(y410, y600);
+        t10 = v_sub(y300, y110);
+        t11 = v_sub(y700, y510);
+        z00 = v_fma(t00, q5_25, v_sub(y000, y210));
+        z01 = v_fma(t01, q5_25, v_sub(y400, y610));
+        z70 = v_fma(t10, q5_25, v_sub(y310, y100));
+        z71 = v_fma(t11, q5_25, v_sub(y710, y500));
 
         /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
         /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
-        t00 = v_fma(y300, qm4_25, y100 + y110);
-        t01 = v_fma(y700, qm4_25, y500 + y510);
-        t10 = v_fma(y010, qm4_25, y200 + y210);
-        t11 = v_fma(y410, qm4_25, y600 + y610);
+        t00 = v_fma(y300, qm4_25, v_add(y100, y110));
+        t01 = v_fma(y700, qm4_25, v_add(y500, y510));
+        t10 = v_fma(y010, qm4_25, v_add(y200, y210));
+        t11 = v_fma(y410, qm4_25, v_add(y600, y610));
 
-        z10 = t00 + t10; z11 = t01 + t11;
-        z20 = t10 - t00; z21 = t11 - t01;
+        z10 = v_add(t00, t10); z11 = v_add(t01, t11);
+        z20 = v_sub(t10, t00); z21 = v_sub(t11, t01);
 
         /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
         /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
-        t00 = v_fma(y100, q0_5, y110 + y110);
-        t01 = v_fma(y500, q0_5, y510 + y510);
+        t00 = v_fma(y100, q0_5, v_add(y110, y110));
+        t01 = v_fma(y500, q0_5, v_add(y510, y510));
         t10 = v_fma(y200, q0_25, y210);
         t11 = v_fma(y600, q0_25, y610);
         t00 = v_fma(y300, qm2_5, t00);
@@ -521,13 +596,13 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         t10 = v_fma(y010, qm1_25, t10);
         t11 = v_fma(y410, qm1_25, t11);
 
-        z30 = t00 + t10; z31 = t01 + t11;
-        z40 = t10 - t00; z41 = t11 - t01;
+        z30 = v_add(t00, t10); z31 = v_add(t01, t11);
+        z40 = v_sub(t10, t00); z41 = v_sub(t11, t01);
 
         /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
         /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
-        t00 = v_fma(y110, q0_5, y100 + y100);
-        t01 = v_fma(y510, q0_5, y500 + y500);
+        t00 = v_fma(y110, q0_5, v_add(y100, y100));
+        t01 = v_fma(y510, q0_5, v_add(y500, y500));
         t10 = v_fma(y200, q4, y210);
         t11 = v_fma(y600, q4, y610);
         t00 = v_fma(y300, qm2_5, t00);
@@ -535,8 +610,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         t10 = v_fma(y010, qm5, t10);
         t11 = v_fma(y410, qm5, t11);
 
-        z50 = t00 + t10; z51 = t01 + t11;
-        z60 = t10 - t00; z61 = t11 - t01;
+        z50 = v_add(t00, t10); z51 = v_add(t01, t11);
+        z60 = v_sub(t10, t00); z61 = v_sub(t11, t01);
     }
 
     const int outstep = winoIblock*winoAtomF32*Cg;
@@ -585,7 +660,7 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
     the Winograd-transformed weights should also be transposed.
     init_conv() (see OpConv.fx) takes care of that.
 */
-void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
+void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
                           float* bpptr, int bpstep, float* outptr, int outstep,
                           float bias, float minval, float maxval, bool ifMinMaxAct)
 {
@@ -601,12 +676,12 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
 
     {
         v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
-        s12_0 = x10 + x20; s12_1 = x11 + x21;
-        s34_0 = x30 + x40; s34_1 = x31 + x41;
-        s56_0 = x50 + x60; s56_1 = x51 + x61;
+        s12_0 = v_add(x10, x20); s12_1 = v_add(x11, x21);
+        s34_0 = v_add(x30, x40); s34_1 = v_add(x31, x41);
+        s56_0 = v_add(x50, x60); s56_1 = v_add(x51, x61);
 
-        v_float32x4 y00 = x00 + s12_0 + s34_0 + s56_0;
-        v_float32x4 y01 = x01 + s12_1 + s34_1 + s56_1;
+        v_float32x4 y00 = v_add(v_add(v_add(x00, s12_0), s34_0), s56_0);
+        v_float32x4 y01 = v_add(v_add(v_add(x01, s12_1), s34_1), s56_1);
 
         v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
         v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
@@ -616,13 +691,13 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
         v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
         v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
 
-        s12_0 = x10 - x20; s12_1 = x11 - x21;
-        s34_0 = x30 - x40; s34_1 = x31 - x41;
-        s56_0 = x50 - x60; s56_1 = x51 - x61;
+        s12_0 = v_sub(x10, x20); s12_1 = v_sub(x11, x21);
+        s34_0 = v_sub(x30, x40); s34_1 = v_sub(x31, x41);
+        s56_0 = v_sub(x50, x60); s56_1 = v_sub(x51, x61);
 
         a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f);
-        v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, x70 + s12_0));
-        v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, x71 + s12_1));
+        v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(x70, s12_0)));
+        v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(x71, s12_1)));
 
         a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f);
         v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
@@ -642,12 +717,12 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
         v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700);
         v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710);
 
-        s12_0 = y100 + y200; s12_1 = y500 + y600;
-        s34_0 = y300 + y010; s34_1 = y700 + y410;
-        s56_0 = y110 + y210; s56_1 = y510 + y610;
+        s12_0 = v_add(y100, y200); s12_1 = v_add(y500, y600);
+        s34_0 = v_add(y300, y010); s34_1 = v_add(y700, y410);
+        s56_0 = v_add(y110, y210); s56_1 = v_add(y510, y610);
 
-        z00 = y000 + s12_0 + s34_0 + s56_0;
-        z01 = y400 + s12_1 + s34_1 + s56_1;
+        z00 = v_add(v_add(v_add(y000, s12_0), s34_0), s56_0);
+        z01 = v_add(v_add(v_add(y400, s12_1), s34_1), s56_1);
 
         a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f);
         z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
@@ -657,13 +732,13 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
         z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
         z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
 
-        s12_0 = y100 - y200; s12_1 = y500 - y600;
-        s34_0 = y300 - y010; s34_1 = y700 - y410;
-        s56_0 = y110 - y210; s56_1 = y510 - y610;
+        s12_0 = v_sub(y100, y200); s12_1 = v_sub(y500, y600);
+        s34_0 = v_sub(y300, y010); s34_1 = v_sub(y700, y410);
+        s56_0 = v_sub(y110, y210); s56_1 = v_sub(y510, y610);
 
         a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f);
-        z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, y310 + s12_0));
-        z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, y710 + s12_1));
+        z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(y310, s12_0)));
+        z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(y710, s12_1)));
         a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f);
         z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0));
         z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
@@ -673,34 +748,34 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
         z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1));
 
         v_float32x4 vbias = v_setall_f32(bias);
-        z00 += vbias;
-        z01 += vbias;
-        z10 += vbias;
-        z11 += vbias;
-        z20 += vbias;
-        z21 += vbias;
-        z30 += vbias;
-        z31 += vbias;
-        z40 += vbias;
-        z41 += vbias;
-        z50 += vbias;
-        z51 += vbias;
+        z00 = v_add(z00, vbias);
+        z01 = v_add(z01, vbias);
+        z10 = v_add(z10, vbias);
+        z11 = v_add(z11, vbias);
+        z20 = v_add(z20, vbias);
+        z21 = v_add(z21, vbias);
+        z30 = v_add(z30, vbias);
+        z31 = v_add(z31, vbias);
+        z40 = v_add(z40, vbias);
+        z41 = v_add(z41, vbias);
+        z50 = v_add(z50, vbias);
+        z51 = v_add(z51, vbias);
     }
 
     if (bpptr)
     {
-        z00 += v_load(bpptr);
-        z01 += v_load_low(bpptr + 4);
-        z10 += v_load(bpptr + bpstep);
-        z11 += v_load_low(bpptr + bpstep + 4);
-        z20 += v_load(bpptr + bpstep*2);
-        z21 += v_load_low(bpptr + bpstep*2 + 4);
-        z30 += v_load(bpptr + bpstep*3);
-        z31 += v_load_low(bpptr + bpstep*3 + 4);
-        z40 += v_load(bpptr + bpstep*4);
-        z41 += v_load_low(bpptr + bpstep*4 + 4);
-        z50 += v_load(bpptr + bpstep*5);
-        z51 += v_load_low(bpptr + bpstep*5 + 4);
+        z00 = v_add(z00, v_load(bpptr));
+        z01 = v_add(z01, v_load_low(bpptr + 4));
+        z10 = v_add(z10, v_load(bpptr + bpstep));
+        z11 = v_add(z11, v_load_low(bpptr + bpstep + 4));
+        z20 = v_add(z20, v_load(bpptr + bpstep * 2));
+        z21 = v_add(z21, v_load_low(bpptr + bpstep * 2 + 4));
+        z30 = v_add(z30, v_load(bpptr + bpstep * 3));
+        z31 = v_add(z31, v_load_low(bpptr + bpstep * 3 + 4));
+        z40 = v_add(z40, v_load(bpptr + bpstep * 4));
+        z41 = v_add(z41, v_load_low(bpptr + bpstep * 4 + 4));
+        z50 = v_add(z50, v_load(bpptr + bpstep * 5));
+        z51 = v_add(z51, v_load_low(bpptr + bpstep * 5 + 4));
     }
 
     if (ifMinMaxAct)
diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.neon.cpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.neon.cpp
new file mode 100644
index 000000000000..70b716f9c757
--- /dev/null
+++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.neon.cpp
@@ -0,0 +1,476 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../../precomp.hpp"
+#include "convolution.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+
+namespace cv {
+namespace dnn {
+
+// NEON code work around.
+namespace opt_NEON
+{
+
+#if CV_NEON && CV_NEON_AARCH64
+
+/* Accumulate */
+void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
+                            const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
+{
+    CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 4);
+    if (iblock > 3)
+    {
+        for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
+                outbuf += winoAtomF32)
+        {
+            float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
+            float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
+            float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
+            float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
+            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
+                                         wptr += winoKblock*winoAtomF32) {
+                float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
+                float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
+                float32x4_t x0, x1;
+                x0 = vld1q_f32(inwptr);
+                x1 = vld1q_f32(inwptr + 4);
+                s00 = vfmaq_f32(s00, w0, x0);
+                s01 = vfmaq_f32(s01, w0, x1);
+                s10 = vfmaq_f32(s10, w1, x0);
+                s11 = vfmaq_f32(s11, w1, x1);
+                s20 = vfmaq_f32(s20, w2, x0);
+                s21 = vfmaq_f32(s21, w2, x1);
+                s30 = vfmaq_f32(s30, w3, x0);
+                s31 = vfmaq_f32(s31, w3, x1);
+                x0 = vld1q_f32(inwptr + 8);
+                x1 = vld1q_f32(inwptr + 12);
+                s02 = vfmaq_f32(s02, w0, x0);
+                s03 = vfmaq_f32(s03, w0, x1);
+                s12 = vfmaq_f32(s12, w1, x0);
+                s13 = vfmaq_f32(s13, w1, x1);
+                s22 = vfmaq_f32(s22, w2, x0);
+                s23 = vfmaq_f32(s23, w2, x1);
+                s32 = vfmaq_f32(s32, w3, x0);
+                s33 = vfmaq_f32(s33, w3, x1);
+                x0 = vld1q_f32(inwptr + 16);
+                x1 = vld1q_f32(inwptr + 20);
+                s04 = vfmaq_f32(s04, w0, x0);
+                s05 = vfmaq_f32(s05, w0, x1);
+                s14 = vfmaq_f32(s14, w1, x0);
+                s15 = vfmaq_f32(s15, w1, x1);
+                s24 = vfmaq_f32(s24, w2, x0);
+                s25 = vfmaq_f32(s25, w2, x1);
+                s34 = vfmaq_f32(s34, w3, x0);
+                s35 = vfmaq_f32(s35, w3, x1);
+            }
+
+            vst1q_f32(outbuf, s00);
+            vst1q_f32(outbuf + 1*64, s01);
+            vst1q_f32(outbuf + 2*64, s02);
+            vst1q_f32(outbuf + 3*64, s03);
+            vst1q_f32(outbuf + 4*64, s04);
+            vst1q_f32(outbuf + 5*64, s05);
+
+            vst1q_f32(outbuf + 6*64, s10);
+            vst1q_f32(outbuf + 7*64, s11);
+            vst1q_f32(outbuf + 8*64, s12);
+            vst1q_f32(outbuf + 9*64, s13);
+            vst1q_f32(outbuf + 10*64, s14);
+            vst1q_f32(outbuf + 11*64, s15);
+
+            vst1q_f32(outbuf + 12*64, s20);
+            vst1q_f32(outbuf + 13*64, s21);
+            vst1q_f32(outbuf + 14*64, s22);
+            vst1q_f32(outbuf + 15*64, s23);
+            vst1q_f32(outbuf + 16*64, s24);
+            vst1q_f32(outbuf + 17*64, s25);
+
+            vst1q_f32(outbuf + 18*64, s30);
+            vst1q_f32(outbuf + 19*64, s31);
+            vst1q_f32(outbuf + 20*64, s32);
+            vst1q_f32(outbuf + 21*64, s33);
+            vst1q_f32(outbuf + 22*64, s34);
+            vst1q_f32(outbuf + 23*64, s35);
+        }
+    }
+    else
+    {
+        for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
+                outbuf += winoAtomF32)
+        {
+            float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00;
+            float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00;
+            float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00;
+            float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00;
+            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
+                                         wptr += winoKblock*winoAtomF32) {
+                float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
+                float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
+                float32x4_t x0, x1, x2;
+                x0 = vld1q_f32(inwptr);
+                x1 = vld1q_f32(inwptr + 4);
+                x2 = vld1q_f32(inwptr + 8);
+                s00 = vfmaq_f32(s00, w0, x0);
+                s01 = vfmaq_f32(s01, w0, x1);
+                s02 = vfmaq_f32(s02, w0, x2);
+                s10 = vfmaq_f32(s10, w1, x0);
+                s11 = vfmaq_f32(s11, w1, x1);
+                s12 = vfmaq_f32(s12, w1, x2);
+                s20 = vfmaq_f32(s20, w2, x0);
+                s21 = vfmaq_f32(s21, w2, x1);
+                s22 = vfmaq_f32(s22, w2, x2);
+                s30 = vfmaq_f32(s30, w3, x0);
+                s31 = vfmaq_f32(s31, w3, x1);
+                s32 = vfmaq_f32(s32, w3, x2);
+            }
+
+            vst1q_f32(outbuf, s00);
+            vst1q_f32(outbuf + 1*64, s01);
+            vst1q_f32(outbuf + 2*64, s02);
+            vst1q_f32(outbuf + 6*64, s10);
+            vst1q_f32(outbuf + 7*64, s11);
+            vst1q_f32(outbuf + 8*64, s12);
+            vst1q_f32(outbuf + 12*64, s20);
+            vst1q_f32(outbuf + 13*64, s21);
+            vst1q_f32(outbuf + 14*64, s22);
+            vst1q_f32(outbuf + 18*64, s30);
+            vst1q_f32(outbuf + 19*64, s31);
+            vst1q_f32(outbuf + 20*64, s32);
+        }
+    }
+}
+
+#undef T4x4
+#define T4x4(a, b, c, d, tr0, tr1) \
+    tr0 = vtrnq_f32(a, b); \
+    tr1 = vtrnq_f32(c, d); \
+    a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \
+    b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \
+    c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \
+    d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1]))
+
+/*Input transform*/
+void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
+                          float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
+{
+    float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
+    float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
+    float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
+    float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
+    float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
+    float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
+    float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
+    float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
+
+    float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;
+
+    {
+        /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
+        /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
+        float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11;
+        t00 = vsubq_f32(x40, x20);
+        t01 = vsubq_f32(x41, x21);
+        t10 = vsubq_f32(x30, x50);
+        t11 = vsubq_f32(x31, x51);
+        float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25);
+        float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25);
+        float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25);
+        float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25);
+
+        /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
+        /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
+        float32x4_t qm4_25 = vdupq_n_f32(-4.25f);
+        t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25);
+        t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, qm4_25);
+        t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25);
+        t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25);
+
+        float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11);
+        float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, t01);
+
+        /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
+        /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
+        float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f);
+        float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f);
+        t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5);
+        t01 = vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5);
+        t10 = vfmaq_f32(x60, x20, q0_25);
+        t11 = vfmaq_f32(x61, x21, q0_25);
+        t00 = vfmaq_f32(t00, x30, qm2_5);
+        t01 = vfmaq_f32(t01, x31, qm2_5);
+        t10 = vfmaq_f32(t10, x40, qm1_25);
+        t11 = vfmaq_f32(t11, x41, qm1_25);
+
+        float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11);
+        float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01);
+
+        /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
+        /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
+        float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f);
+        t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5);
+        t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5);
+        t10 = vfmaq_f32(x60, x20, q4);
+        t11 = vfmaq_f32(x61, x21, q4);
+        t00 = vfmaq_f32(t00, x30, qm2_5);
+        t01 = vfmaq_f32(t01, x31, qm2_5);
+        t10 = vfmaq_f32(t10, x40, qm5);
+        t11 = vfmaq_f32(t11, x41, qm5);
+
+        float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11);
+        float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01);
+
+        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
+        /* Y:              */
+        /*        y00 y01  */
+        /*        y10 y11  */
+        /*        ...      */
+        /*        y70 y71  */
+        /*   Y':           */
+        /*        y00 y40  */
+        /*        y10 y50  */
+        /*        y20 y60  */
+        /*        y30 y70  */
+        /*        y01 y41  */
+        /*        y11 y51  */
+        /*        y21 y61  */
+        /*        y31 y71  */
+        /*    in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
+        float32x4x2_t tr0, tr1;
+
+        T4x4(y00, y10, y20, y30, tr0, tr1);
+        T4x4(y01, y11, y21, y31, tr0, tr1);
+        T4x4(y40, y50, y60, y70, tr0, tr1);
+        T4x4(y41, y51, y61, y71, tr0, tr1);
+
+        /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
+        /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
+        t00 = vsubq_f32(y01, y20);
+        t01 = vsubq_f32(y41, y60);
+        t10 = vsubq_f32(y30, y11);
+        t11 = vsubq_f32(y70, y51);
+        z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25);
+        z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25);
+        z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25);
+        z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25);
+
+        /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
+        /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
+        t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25);
+        t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25);
+        t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25);
+        t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25);
+
+        z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11);
+        z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01);
+
+        /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
+        /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
+        t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5);
+        t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5);
+        t10 = vfmaq_f32(y21, y20, q0_25);
+        t11 = vfmaq_f32(y61, y60, q0_25);
+        t00 = vfmaq_f32(t00, y30, qm2_5);
+        t01 = vfmaq_f32(t01, y70, qm2_5);
+        t10 = vfmaq_f32(t10, y01, qm1_25);
+        t11 = vfmaq_f32(t11, y41, qm1_25);
+
+        z30 = vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11);
+        z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01);
+
+        /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
+        /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
+        t00 = vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5);
+        t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5);
+        t10 = vfmaq_f32(y21, y20, q4);
+        t11 = vfmaq_f32(y61, y60, q4);
+        t00 = vfmaq_f32(t00, y30, qm2_5);
+        t01 = vfmaq_f32(t01, y70, qm2_5);
+        t10 = vfmaq_f32(t10, y01, qm5);
+        t11 = vfmaq_f32(t11, y41, qm5);
+
+        z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11);
+        z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01);
+    }
+
+    const int outstep = winoIblock*winoAtomF32*Cg;
+
+    vst1q_f32(outptr, z00);
+    vst1q_f32(outptr + outstep, z01);
+    vst1q_f32(outptr + outstep*2, z10);
+    vst1q_f32(outptr + outstep*3, z11);
+    vst1q_f32(outptr + outstep*4, z20);
+    vst1q_f32(outptr + outstep*5, z21);
+    vst1q_f32(outptr + outstep*6, z30);
+    vst1q_f32(outptr + outstep*7, z31);
+    vst1q_f32(outptr + outstep*8, z40);
+    vst1q_f32(outptr + outstep*9, z41);
+    vst1q_f32(outptr + outstep*10, z50);
+    vst1q_f32(outptr + outstep*11, z51);
+    vst1q_f32(outptr + outstep*12, z60);
+    vst1q_f32(outptr + outstep*13, z61);
+    vst1q_f32(outptr + outstep*14, z70);
+    vst1q_f32(outptr + outstep*15, z71);
+}
+
+/*Output transform*/
+void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
+                          float* bpptr, int bpstep, float* outptr, int outstep,
+                          float bias, float minval, float maxval, bool ifMinMaxAct)
+{
+    float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
+    float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
+    float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
+    float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
+    float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
+    float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
+    float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
+    float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
+    float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;
+
+    {
+        float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1;
+        s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21);
+        s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41);
+        s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61);
+
+        float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0);
+        float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1);
+        float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
+        float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
+        float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
+        float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);
+
+        s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21);
+        s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41);
+        s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61);
+
+        float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0),
+                                      s34_0, 32.f), s56_0, 1.f/32);
+        float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1),
+                                      s34_1, 32.f), s56_1, 1.f/32);
+        float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
+        float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
+        float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
+        float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
+        float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, y70 = y60, y71 = y60;
+
+        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
+        /*  Y: */
+        /*        y00 y01 */
+        /*        y10 y11 */
+        /*        ... */
+        /*        y50 y51 */
+        /*        0   0 */
+        /*        0   0 */
+        /*   Y': */
+        /*        y00 y40 */
+        /*        y10 y50 */
+        /*        y20 y60 */
+        /*        y30 y70 */
+        /*        y01 y41 */
+        /*        y11 y51 */
+        /*        y21 y61 */
+        /*        y31 y71 */
+        /*    in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
+        float32x4x2_t tr0, tr1;
+
+        T4x4(y00, y10, y20, y30, tr0, tr1);
+        T4x4(y01, y11, y21, y31, tr0, tr1);
+        T4x4(y40, y50, y60, y70, tr0, tr1);
+        T4x4(y41, y51, y61, y71, tr0, tr1);
+
+        s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60);
+        s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41);
+        s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61);
+
+        z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0);
+        z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1);
+        z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f);
+        z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f);
+        z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16);
+        z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16);
+
+        s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60);
+        s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41);
+        s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61);
+
+        z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0),
+                          s34_0, 32.f), s56_0, 1.f/32);
+        z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1),
+                          s34_1, 32.f), s56_1, 1.f/32);
+        z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
+        z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
+        z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
+        z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
+        float32x4_t vbias = vdupq_n_f32(bias);
+
+        z00 = vaddq_f32(z00, vbias);
+        z01 = vaddq_f32(z01, vbias);
+        z10 = vaddq_f32(z10, vbias);
+        z11 = vaddq_f32(z11, vbias);
+        z20 = vaddq_f32(z20, vbias);
+        z21 = vaddq_f32(z21, vbias);
+        z30 = vaddq_f32(z30, vbias);
+        z31 = vaddq_f32(z31, vbias);
+        z40 = vaddq_f32(z40, vbias);
+        z41 = vaddq_f32(z41, vbias);
+        z50 = vaddq_f32(z50, vbias);
+        z51 = vaddq_f32(z51, vbias);
+    }
+
+    if (bpptr)
+    {
+        float32x2_t zhalf = vdup_n_f32(0.f);
+        z00 = vaddq_f32(z00, vld1q_f32(bpptr));
+        z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf));
+        z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep));
+        z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf));
+        z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2));
+        z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf));
+        z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3));
+        z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf));
+        z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4));
+        z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf));
+        z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5));
+        z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf));
+    }
+
+    if (ifMinMaxAct)
+    {
+        float32x4_t vmax = vdupq_n_f32(maxval);
+        float32x4_t vmin = vdupq_n_f32(minval);
+
+        z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax);
+        z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax);
+        z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax);
+        z11 = vminq_f32(vmaxq_f32(z11, vmin), vmax);
+        z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax);
+        z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax);
+        z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax);
+        z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax);
+        z40 = vminq_f32(vmaxq_f32(z40, vmin), vmax);
+        z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax);
+        z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax);
+        z51 = vminq_f32(vmaxq_f32(z51, vmin), vmax);
+    }
+
+    vst1q_f32(outptr, z00);
+    vst1_f32(outptr + 4, vget_low_f32(z01));
+    vst1q_f32(outptr + outstep, z10);
+    vst1_f32(outptr + outstep + 4, vget_low_f32(z11));
+    vst1q_f32(outptr + outstep*2, z20);
+    vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21));
+    vst1q_f32(outptr + outstep*3, z30);
+    vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31));
+    vst1q_f32(outptr + outstep*4, z40);
+    vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41));
+    vst1q_f32(outptr + outstep*5, z50);
+    vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
+}
+
+#endif
+}
+
+}} // namespace
diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp
index 2688c757850d..e44d0f8004a0 100644
--- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp
@@ -9,26 +9,37 @@ namespace dnn {
 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 
 /* Accumulate */
-void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
+void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
                             const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);
 
 /*Input transform*/
-void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
+void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
                                float* outptr, int Cg, const int winoIblock, const int winoAtomF32);
 
 /*Output transform*/
-void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
+void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
                                float* bpptr, int bpstep, float* outptr, int outstep,
                                float bias, float minval, float maxval, bool ifMinMaxAct);
 
-#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
+// FP 16 branch, only ARMv8 supports.
+void winofunc_accum_F16(const char* _inwptr, const char* _wptr, char* _outbuf, int Cg, int iblock,
+                        const int winoIblock, const int winoKblock, const int winoAtomF16, const int winoNatomF16);
+void winofunc_BtXB_8x8_F16(const float * inptr, int inpstep,
+                           char * _outptr, int Cg, const int winoIblock, const int winoAtomF16);
+void winofunc_AtXA_8x8_F16(const char* inptr, int inpstep,
+                           float * bpptr, int bpstep, float* outptr, int outstep,
+                           float bias, float minval, float maxval, bool ifMinMaxAct);
+
+#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY)
+
+#if CV_AVX
 
 #if !CV_FMA3 // AVX workaround
 #undef _mm256_fmadd_ps
 #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
 #endif
 
-void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
+void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
                             const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
 {
     CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 8);
@@ -187,7 +198,7 @@ void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m25
 }
 
 /*Input transform*/
-void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
+void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
                                float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
 {
     __m256 x00 = _mm256_loadu_ps(inptr);
@@ -311,7 +322,7 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
      0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f,
      0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f]
 */
-void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
+void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
                           float* bpptr, int bpstep, float* outptr, int outstep,
                           float bias, float minval, float maxval, bool ifMinMaxAct)
 {
@@ -405,166 +416,181 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
     STORE6_ELE_FROM_16(outptr + outstep * 5, z50, lowM, highM);
     _mm256_zeroupper();
 }
-#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
-CV_CPU_OPTIMIZATION_NAMESPACE_END
+#endif // CV_AVX
 
-// NEON code work around.
-namespace opt_NEON
-{
+// FP16, currently, only ARMv8 may support it
+#if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 
-#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON && CV_NEON_AARCH64
-/* Accumulate */
-void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
-                        const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);
+#undef T4x4
+#define T4x4(a, b, c, d, tr0, tr1) \
+    tr0 = vtrnq_f32(a, b); \
+    tr1 = vtrnq_f32(c, d); \
+    a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \
+    b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \
+    c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \
+    d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1]))
 
-/*Input transform*/
-void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
-                            float* outptr, int Cg, const int winoIblock, const int winoAtomF32);
+/* Accumulate */
+void winofunc_accum_F16(const char* _inwptr, const char* _wptr, char* _outbuf, int Cg, int iblock,
+                        const int winoIblock, const int winoKblock, const int winoAtomF16, const int winoNatomF16)
+{
+    const __fp16* inwptr = (const __fp16*)_inwptr;
+    const __fp16* wptr = (const __fp16*)_wptr;
+    __fp16* outbuf = (__fp16*)_outbuf;
 
-/*Output transform*/
-void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
-                            float* bpptr, int bpstep, float* outptr, int outstep,
-                            float bias, float minval, float maxval, bool ifMinMaxAct);
+    CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF16 == 8);
 
-void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
-                            const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32)
-{
-    CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 4);
     if (iblock > 3)
     {
-        for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
-                outbuf += winoAtomF32)
+        for (int atom_id = 0; atom_id < winoNatomF16; atom_id++, outbuf += winoAtomF16)
         {
-            float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
-            float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
-            float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
-            float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
-            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
-                                         wptr += winoKblock*winoAtomF32) {
-                float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
-                float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
-                float32x4_t x0, x1;
-                x0 = vld1q_f32(inwptr);
-                x1 = vld1q_f32(inwptr + 4);
-                s00 = vfmaq_f32(s00, w0, x0);
-                s01 = vfmaq_f32(s01, w0, x1);
-                s10 = vfmaq_f32(s10, w1, x0);
-                s11 = vfmaq_f32(s11, w1, x1);
-                s20 = vfmaq_f32(s20, w2, x0);
-                s21 = vfmaq_f32(s21, w2, x1);
-                s30 = vfmaq_f32(s30, w3, x0);
-                s31 = vfmaq_f32(s31, w3, x1);
-                x0 = vld1q_f32(inwptr + 8);
-                x1 = vld1q_f32(inwptr + 12);
-                s02 = vfmaq_f32(s02, w0, x0);
-                s03 = vfmaq_f32(s03, w0, x1);
-                s12 = vfmaq_f32(s12, w1, x0);
-                s13 = vfmaq_f32(s13, w1, x1);
-                s22 = vfmaq_f32(s22, w2, x0);
-                s23 = vfmaq_f32(s23, w2, x1);
-                s32 = vfmaq_f32(s32, w3, x0);
-                s33 = vfmaq_f32(s33, w3, x1);
-                x0 = vld1q_f32(inwptr + 16);
-                x1 = vld1q_f32(inwptr + 20);
-                s04 = vfmaq_f32(s04, w0, x0);
-                s05 = vfmaq_f32(s05, w0, x1);
-                s14 = vfmaq_f32(s14, w1, x0);
-                s15 = vfmaq_f32(s15, w1, x1);
-                s24 = vfmaq_f32(s24, w2, x0);
-                s25 = vfmaq_f32(s25, w2, x1);
-                s34 = vfmaq_f32(s34, w3, x0);
-                s35 = vfmaq_f32(s35, w3, x1);
+            float16x8_t s00 = vdupq_n_f16(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00;
+            float16x8_t s10 = vdupq_n_f16(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00;
+            float16x8_t s20 = vdupq_n_f16(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00;
+            float16x8_t s30 = vdupq_n_f16(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00;
+
+            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF16,
+                                         wptr += winoKblock*winoAtomF16)
+            {
+                float16x8_t w0 = vld1q_f16(wptr), w1 = vld1q_f16(wptr + 8);
+                float16x8_t w2 = vld1q_f16(wptr + 16), w3 = vld1q_f16(wptr + 24);
+
+                float16x8_t x0, x1, x2;
+                x0 = vld1q_f16(inwptr);
+                x1 = vld1q_f16(inwptr + 8);
+                x2 = vld1q_f16(inwptr + 16);
+
+                s00 = vfmaq_f16(s00, w0, x0);
+                s01 = vfmaq_f16(s01, w0, x1);
+                s02 = vfmaq_f16(s02, w0, x2);
+
+                s10 = vfmaq_f16(s10, w1, x0);
+                s11 = vfmaq_f16(s11, w1, x1);
+                s12 = vfmaq_f16(s12, w1, x2);
+
+                s20 = vfmaq_f16(s20, w2, x0);
+                s21 = vfmaq_f16(s21, w2, x1);
+                s22 = vfmaq_f16(s22, w2, x2);
+
+                s30 = vfmaq_f16(s30, w3, x0);
+                s31 = vfmaq_f16(s31, w3, x1);
+                s32 = vfmaq_f16(s32, w3, x2);
+
+                x0 = vld1q_f16(inwptr + 24);
+                x1 = vld1q_f16(inwptr + 32);
+                x2 = vld1q_f16(inwptr + 40);
+
+                s03 = vfmaq_f16(s03, w0, x0);
+                s04 = vfmaq_f16(s04, w0, x1);
+                s05 = vfmaq_f16(s05, w0, x2);
+
+                s13 = vfmaq_f16(s13, w1, x0);
+                s14 = vfmaq_f16(s14, w1, x1);
+                s15 = vfmaq_f16(s15, w1, x2);
+
+                s23 = vfmaq_f16(s23, w2, x0);
+                s24 = vfmaq_f16(s24, w2, x1);
+                s25 = vfmaq_f16(s25, w2, x2);
+
+                s33 = vfmaq_f16(s33, w3, x0);
+                s34 = vfmaq_f16(s34, w3, x1);
+                s35 = vfmaq_f16(s35, w3, x2);
             }
 
-            vst1q_f32(outbuf, s00);
-            vst1q_f32(outbuf + 1*64, s01);
-            vst1q_f32(outbuf + 2*64, s02);
-            vst1q_f32(outbuf + 3*64, s03);
-            vst1q_f32(outbuf + 4*64, s04);
-            vst1q_f32(outbuf + 5*64, s05);
-
-            vst1q_f32(outbuf + 6*64, s10);
-            vst1q_f32(outbuf + 7*64, s11);
-            vst1q_f32(outbuf + 8*64, s12);
-            vst1q_f32(outbuf + 9*64, s13);
-            vst1q_f32(outbuf + 10*64, s14);
-            vst1q_f32(outbuf + 11*64, s15);
-
-            vst1q_f32(outbuf + 12*64, s20);
-            vst1q_f32(outbuf + 13*64, s21);
-            vst1q_f32(outbuf + 14*64, s22);
-            vst1q_f32(outbuf + 15*64, s23);
-            vst1q_f32(outbuf + 16*64, s24);
-            vst1q_f32(outbuf + 17*64, s25);
-
-            vst1q_f32(outbuf + 18*64, s30);
-            vst1q_f32(outbuf + 19*64, s31);
-            vst1q_f32(outbuf + 20*64, s32);
-            vst1q_f32(outbuf + 21*64, s33);
-            vst1q_f32(outbuf + 22*64, s34);
-            vst1q_f32(outbuf + 23*64, s35);
+            vst1q_f16(outbuf, s00);
+            vst1q_f16(outbuf + 1*64, s01);
+            vst1q_f16(outbuf + 2*64, s02);
+            vst1q_f16(outbuf + 3*64, s03);
+            vst1q_f16(outbuf + 4*64, s04);
+            vst1q_f16(outbuf + 5*64, s05);
+
+            vst1q_f16(outbuf + 6*64, s10);
+            vst1q_f16(outbuf + 7*64, s11);
+            vst1q_f16(outbuf + 8*64, s12);
+            vst1q_f16(outbuf + 9*64, s13);
+            vst1q_f16(outbuf + 10*64, s14);
+            vst1q_f16(outbuf + 11*64, s15);
+
+            vst1q_f16(outbuf + 12*64, s20);
+            vst1q_f16(outbuf + 13*64, s21);
+            vst1q_f16(outbuf + 14*64, s22);
+            vst1q_f16(outbuf + 15*64, s23);
+            vst1q_f16(outbuf + 16*64, s24);
+            vst1q_f16(outbuf + 17*64, s25);
+
+            vst1q_f16(outbuf + 18*64, s30);
+            vst1q_f16(outbuf + 19*64, s31);
+            vst1q_f16(outbuf + 20*64, s32);
+            vst1q_f16(outbuf + 21*64, s33);
+            vst1q_f16(outbuf + 22*64, s34);
+            vst1q_f16(outbuf + 23*64, s35);
         }
     }
     else
     {
-        for (int atom_id = 0; atom_id < winoNatomF32; atom_id++,
-                outbuf += winoAtomF32)
+        for (int atom_id = 0; atom_id < winoNatomF16; atom_id++,
+                outbuf += winoAtomF16)
         {
-            float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00;
-            float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00;
-            float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00;
-            float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00;
-            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32,
-                                         wptr += winoKblock*winoAtomF32) {
-                float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4);
-                float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12);
-                float32x4_t x0, x1, x2;
-                x0 = vld1q_f32(inwptr);
-                x1 = vld1q_f32(inwptr + 4);
-                x2 = vld1q_f32(inwptr + 8);
-                s00 = vfmaq_f32(s00, w0, x0);
-                s01 = vfmaq_f32(s01, w0, x1);
-                s02 = vfmaq_f32(s02, w0, x2);
-                s10 = vfmaq_f32(s10, w1, x0);
-                s11 = vfmaq_f32(s11, w1, x1);
-                s12 = vfmaq_f32(s12, w1, x2);
-                s20 = vfmaq_f32(s20, w2, x0);
-                s21 = vfmaq_f32(s21, w2, x1);
-                s22 = vfmaq_f32(s22, w2, x2);
-                s30 = vfmaq_f32(s30, w3, x0);
-                s31 = vfmaq_f32(s31, w3, x1);
-                s32 = vfmaq_f32(s32, w3, x2);
+            float16x8_t s00 = vdupq_n_f16(0.f), s01 = s00, s02 = s00;
+            float16x8_t s10 = vdupq_n_f16(0.f), s11 = s00, s12 = s00;
+            float16x8_t s20 = vdupq_n_f16(0.f), s21 = s00, s22 = s00;
+            float16x8_t s30 = vdupq_n_f16(0.f), s31 = s00, s32 = s00;
+
+            for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF16,
+                                         wptr += winoKblock*winoAtomF16)
+            {
+                float16x8_t w0 = vld1q_f16(wptr), w1 = vld1q_f16(wptr + 8);
+                float16x8_t w2 = vld1q_f16(wptr + 16), w3 = vld1q_f16(wptr + 24);
+                float16x8_t x0, x1, x2;
+
+                x0 = vld1q_f16(inwptr);
+                x1 = vld1q_f16(inwptr + 8);
+                x2 = vld1q_f16(inwptr + 16);
+
+                s00 = vfmaq_f16(s00, w0, x0);
+                s01 = vfmaq_f16(s01, w0, x1);
+                s02 = vfmaq_f16(s02, w0, x2);
+
+                s10 = vfmaq_f16(s10, w1, x0);
+                s11 = vfmaq_f16(s11, w1, x1);
+                s12 = vfmaq_f16(s12, w1, x2);
+
+                s20 = vfmaq_f16(s20, w2, x0);
+                s21 = vfmaq_f16(s21, w2, x1);
+                s22 = vfmaq_f16(s22, w2, x2);
+
+                s30 = vfmaq_f16(s30, w3, x0);
+                s31 = vfmaq_f16(s31, w3, x1);
+                s32 = vfmaq_f16(s32, w3, x2);
             }
 
-            vst1q_f32(outbuf, s00);
-            vst1q_f32(outbuf + 1*64, s01);
-            vst1q_f32(outbuf + 2*64, s02);
-            vst1q_f32(outbuf + 6*64, s10);
-            vst1q_f32(outbuf + 7*64, s11);
-            vst1q_f32(outbuf + 8*64, s12);
-            vst1q_f32(outbuf + 12*64, s20);
-            vst1q_f32(outbuf + 13*64, s21);
-            vst1q_f32(outbuf + 14*64, s22);
-            vst1q_f32(outbuf + 18*64, s30);
-            vst1q_f32(outbuf + 19*64, s31);
-            vst1q_f32(outbuf + 20*64, s32);
+            vst1q_f16(outbuf, s00);
+            vst1q_f16(outbuf + 1*64, s01);
+            vst1q_f16(outbuf + 2*64, s02);
+
+            vst1q_f16(outbuf + 6*64, s10);
+            vst1q_f16(outbuf + 7*64, s11);
+            vst1q_f16(outbuf + 8*64, s12);
+
+            vst1q_f16(outbuf + 12*64, s20);
+            vst1q_f16(outbuf + 13*64, s21);
+            vst1q_f16(outbuf + 14*64, s22);
+
+            vst1q_f16(outbuf + 18*64, s30);
+            vst1q_f16(outbuf + 19*64, s31);
+            vst1q_f16(outbuf + 20*64, s32);
         }
     }
 }
 
-#define T4x4(a, b, c, d, tr0, tr1) \
-    tr0 = vtrnq_f32(a, b); \
-    tr1 = vtrnq_f32(c, d); \
-    a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \
-    b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \
-    c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \
-    d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1]))
-
 /*Input transform*/
-void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
-                          float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
+//NOTE: Since we don't have the fully fp16 support. Current work around is that we need packing the data and
+// convert it to FP16 in input transform stage. And at output transform stage we will convert it back to FP32.
+void winofunc_BtXB_8x8_F16(const float * inptr, int inpstep,
+                           char * _outptr, int Cg, const int winoIblock, const int winoAtomF16)
 {
+    __fp16* outptr = (__fp16*)_outptr;
     float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
     float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
     float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
@@ -577,8 +603,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
     float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71;
 
     {
-        /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */
-        /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */
+        // Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X
+        // Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X
         float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11;
         t00 = vsubq_f32(x40, x20);
         t01 = vsubq_f32(x41, x21);
@@ -589,8 +615,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25);
         float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25);
 
-        /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */
-        /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */
+        // Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X
+        // Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X
         float32x4_t qm4_25 = vdupq_n_f32(-4.25f);
         t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25);
         t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, qm4_25);
@@ -600,8 +626,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11);
         float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, t01);
 
-        /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */
-        /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */
+        // Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X
+        // Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X
         float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f);
         float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f);
         t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5);
@@ -616,8 +642,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11);
         float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01);
 
-        /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */
-        /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */
+        // Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X
+        // Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X
         float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f);
         t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5);
         t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5);
@@ -631,22 +657,22 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11);
         float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01);
 
-        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
-        /* Y:              */
-        /*        y00 y01  */
-        /*        y10 y11  */
-        /*        ...      */
-        /*        y70 y71  */
-        /*   Y':           */
-        /*        y00 y40  */
-        /*        y10 y50  */
-        /*        y20 y60  */
-        /*        y30 y70  */
-        /*        y01 y41  */
-        /*        y11 y51  */
-        /*        y21 y61  */
-        /*        y31 y71  */
-        /*    in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
+        // transpose 8x8 matrix in-place with some renumeration of the elements:
+        // Y:
+        //        y00 y01
+        //        y10 y11
+        //        ...
+        //        y70 y71
+        // Y':
+        //        y00 y40
+        //        y10 y50
+        //        y20 y60
+        //        y30 y70
+        //        y01 y41
+        //        y11 y51
+        //        y21 y61
+        //        y31 y71
+        // in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31
         float32x4x2_t tr0, tr1;
 
         T4x4(y00, y10, y20, y30, tr0, tr1);
@@ -654,8 +680,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         T4x4(y40, y50, y60, y70, tr0, tr1);
         T4x4(y41, y51, y61, y71, tr0, tr1);
 
-        /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */
-        /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */
+        // Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y
+        // Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y
         t00 = vsubq_f32(y01, y20);
         t01 = vsubq_f32(y41, y60);
         t10 = vsubq_f32(y30, y11);
@@ -665,8 +691,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25);
         z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25);
 
-        /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */
-        /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */
+        // Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y
+        // Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y
         t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25);
         t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25);
         t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25);
@@ -675,8 +701,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11);
         z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01);
 
-        /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */
-        /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */
+        // Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y
+        // Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y
         t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5);
         t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5);
         t10 = vfmaq_f32(y21, y20, q0_25);
@@ -689,8 +715,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         z30 = vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11);
         z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01);
 
-        /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */
-        /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */
+        // Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y
+        // Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y
         t00 = vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5);
         t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5);
         t10 = vfmaq_f32(y21, y20, q4);
@@ -704,39 +730,41 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
         z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01);
     }
 
-    const int outstep = winoIblock*winoAtomF32*Cg;
-
-    vst1q_f32(outptr, z00);
-    vst1q_f32(outptr + outstep, z01);
-    vst1q_f32(outptr + outstep*2, z10);
-    vst1q_f32(outptr + outstep*3, z11);
-    vst1q_f32(outptr + outstep*4, z20);
-    vst1q_f32(outptr + outstep*5, z21);
-    vst1q_f32(outptr + outstep*6, z30);
-    vst1q_f32(outptr + outstep*7, z31);
-    vst1q_f32(outptr + outstep*8, z40);
-    vst1q_f32(outptr + outstep*9, z41);
-    vst1q_f32(outptr + outstep*10, z50);
-    vst1q_f32(outptr + outstep*11, z51);
-    vst1q_f32(outptr + outstep*12, z60);
-    vst1q_f32(outptr + outstep*13, z61);
-    vst1q_f32(outptr + outstep*14, z70);
-    vst1q_f32(outptr + outstep*15, z71);
+    const int outstep = winoIblock*winoAtomF16*Cg;
+
+    vst1_f16(outptr, vcvt_f16_f32(z00));
+    vst1_f16(outptr + 4, vcvt_f16_f32(z01));
+    vst1_f16(outptr + outstep, vcvt_f16_f32(z10));
+    vst1_f16(outptr + outstep + 4, vcvt_f16_f32(z11));
+    vst1_f16(outptr + outstep*2, vcvt_f16_f32(z20));
+    vst1_f16(outptr + outstep*2 + 4, vcvt_f16_f32(z21));
+    vst1_f16(outptr + outstep*3, vcvt_f16_f32(z30));
+    vst1_f16(outptr + outstep*3 + 4, vcvt_f16_f32(z31));
+    vst1_f16(outptr + outstep*4, vcvt_f16_f32(z40));
+    vst1_f16(outptr + outstep*4 + 4, vcvt_f16_f32(z41));
+    vst1_f16(outptr + outstep*5, vcvt_f16_f32(z50));
+    vst1_f16(outptr + outstep*5 + 4, vcvt_f16_f32(z51));
+    vst1_f16(outptr + outstep*6, vcvt_f16_f32(z60));
+    vst1_f16(outptr + outstep*6 + 4, vcvt_f16_f32(z61));
+    vst1_f16(outptr + outstep*7, vcvt_f16_f32(z70));
+    vst1_f16(outptr + outstep*7 + 4, vcvt_f16_f32(z71));
 }
 
-/*Output transform*/
-void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
-                          float* bpptr, int bpstep, float* outptr, int outstep,
-                          float bias, float minval, float maxval, bool ifMinMaxAct)
+// Output transform
+void winofunc_AtXA_8x8_F16(const char* _inptr, int inpstep,
+                           float * bpptr, int bpstep, float* outptr, int outstep,
+                           float bias, float minval, float maxval, bool ifMinMaxAct)
 {
-    float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4);
-    float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4);
-    float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4);
-    float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4);
-    float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4);
-    float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4);
-    float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4);
-    float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4);
+    const __fp16* inptr = (const __fp16*)_inptr;
+
+    float32x4_t x00 = vcvt_f32_f16(vld1_f16(inptr)), x01 = vcvt_f32_f16(vld1_f16(inptr + 4));
+    float32x4_t x10 = vcvt_f32_f16(vld1_f16(inptr + inpstep)), x11 = vcvt_f32_f16(vld1_f16(inptr + inpstep + 4));
+    float32x4_t x20 = vcvt_f32_f16(vld1_f16(inptr + inpstep*2)), x21 = vcvt_f32_f16(vld1_f16(inptr + inpstep*2 + 4));
+    float32x4_t x30 = vcvt_f32_f16(vld1_f16(inptr + inpstep*3)), x31 = vcvt_f32_f16(vld1_f16(inptr + inpstep*3 + 4));
+    float32x4_t x40 = vcvt_f32_f16(vld1_f16(inptr + inpstep*4)), x41 = vcvt_f32_f16(vld1_f16(inptr + inpstep*4 + 4));
+    float32x4_t x50 = vcvt_f32_f16(vld1_f16(inptr + inpstep*5)), x51 = vcvt_f32_f16(vld1_f16(inptr + inpstep*5 + 4));
+    float32x4_t x60 = vcvt_f32_f16(vld1_f16(inptr + inpstep*6)), x61 = vcvt_f32_f16(vld1_f16(inptr + inpstep*6 + 4));
+    float32x4_t x70 = vcvt_f32_f16(vld1_f16(inptr + inpstep*7)), x71 = vcvt_f32_f16(vld1_f16(inptr + inpstep*7 + 4));
     float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51;
 
     {
@@ -757,33 +785,33 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
         s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61);
 
         float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0),
-                                      s34_0, 32.f), s56_0, 1.f/32);
+                                                  s34_0, 32.f), s56_0, 1.f/32);
         float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1),
-                                      s34_1, 32.f), s56_1, 1.f/32);
+                                                  s34_1, 32.f), s56_1, 1.f/32);
         float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
         float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
         float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
         float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f);
         float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, y70 = y60, y71 = y60;
 
-        /* transpose 8x8 matrix in-place with some renumeration of the elements: */
-        /*  Y: */
-        /*        y00 y01 */
-        /*        y10 y11 */
-        /*        ... */
-        /*        y50 y51 */
-        /*        0   0 */
-        /*        0   0 */
-        /*   Y': */
-        /*        y00 y40 */
-        /*        y10 y50 */
-        /*        y20 y60 */
-        /*        y30 y70 */
-        /*        y01 y41 */
-        /*        y11 y51 */
-        /*        y21 y61 */
-        /*        y31 y71 */
-        /*    in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */
+        // transpose 8x8 matrix in-place with some renumeration of the elements:
+        // Y:
+        //        y00 y01
+        //        y10 y11
+        //        ...
+        //        y50 y51
+        //        0   0
+        //        0   0
+        // Y':
+        //        y00 y40
+        //        y10 y50
+        //        y20 y60
+        //        y30 y70
+        //        y01 y41
+        //        y11 y51
+        //        y21 y61
+        //        y31 y71
+        // in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31
         float32x4x2_t tr0, tr1;
 
         T4x4(y00, y10, y20, y30, tr0, tr1);
@@ -807,9 +835,9 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
         s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61);
 
         z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0),
-                          s34_0, 32.f), s56_0, 1.f/32);
+                                      s34_0, 32.f), s56_0, 1.f/32);
         z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1),
-                          s34_1, 32.f), s56_1, 1.f/32);
+                                      s34_1, 32.f), s56_1, 1.f/32);
         z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f);
         z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f);
         z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f);
@@ -879,8 +907,8 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
     vst1q_f32(outptr + outstep*5, z50);
     vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
 }
-
 #endif
-}
+#endif
 
+CV_CPU_OPTIMIZATION_NAMESPACE_END
 }} // namespace
diff --git a/modules/dnn/src/layers/cpu_kernels/convolution.cpp b/modules/dnn/src/layers/cpu_kernels/convolution.cpp
index c76b3494e2e7..33fb62a47bfd 100644
--- a/modules/dnn/src/layers/cpu_kernels/convolution.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/convolution.cpp
@@ -14,15 +14,76 @@
 
 #include "conv_block.simd.hpp"
 #include "layers/cpu_kernels/conv_block.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
+#include <opencv2/core/utils/logger.hpp>
 
 namespace cv { namespace dnn {
-enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.
+enum { VEC_ALIGN = 32}; // Memory alignment.
 
-void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
+void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
                const int convMR, const int convNR);
-void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
+void convBlockMR1_F32(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
                   const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR);
 
+#ifdef CONV_ARM_FP16
+// Fast convert float 32 to float16
+static inline void _cvt32f16f(const float* src, __fp16* dst, int len)
+{
+    int j = 0;
+    const int VECSZ = 4;
+    __fp16* dst_FP16 = (__fp16 *)dst;
+    if (len > VECSZ * 4)
+    {
+        const int VECSZ4 = 4 * VECSZ;
+        for( ; j + VECSZ4 < len; j += VECSZ4)
+        {
+
+            float32x4_t v0 = vld1q_f32(src + j);
+            float32x4_t v1 = vld1q_f32(src + j + 4);
+            float32x4_t v2 = vld1q_f32(src + j + 8);
+            float32x4_t v3 = vld1q_f32(src + j + 12);
+
+            vst1q_f16(dst_FP16 + j, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
+            vst1q_f16(dst_FP16 + j + 8, vcombine_f16(vcvt_f16_f32(v2), vcvt_f16_f32(v3)));
+        }
+    }
+
+    for( ; j < len; j += VECSZ )
+    {
+        if( j > len - VECSZ )
+        {
+            if( j == 0 )
+                break;
+            j = len - VECSZ;
+        }
+
+        float16x4_t hv = vcvt_f16_f32(vld1q_f32(src + j));
+        vst1_f16(dst_FP16 + j, hv);
+    }
+    for( ; j < len; j++ )
+        dst[j] = __fp16(src[j]);
+}
+#endif
+
+float* FastConv::getWeights()
+{
+    return alignPtr(weightsBuf.data(), VEC_ALIGN);
+}
+
+float* FastConv::getWeightsWino()
+{
+    return alignPtr(weightsWinoBuf.data(), VEC_ALIGN);
+}
+
+hfloat* FastConv::getWeightsFP16()
+{
+    return alignPtr(weightsBuf_FP16.data(), VEC_ALIGN);
+}
+
+hfloat* FastConv::getWeightsWinoFP16()
+{
+    return alignPtr(weightsWinoBuf_FP16.data(), VEC_ALIGN);
+}
+
 Ptr<FastConv> initFastConv(
         InputArray _weightsMat,
         float* srcBias,
@@ -119,9 +180,16 @@ Ptr<FastConv> initFastConv(
 
     conv->useFP16 = false;
 #ifdef CONV_ARM_FP16
-    // TODO: add FP16 support for Winograd.
-    if (_useFP16 && (conv->conv_type == CONV_TYPE_GENERIC || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN))
+    if (_useFP16 && (conv->conv_type == CONV_TYPE_GENERIC || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN
+    || conv->conv_type == CONV_TYPE_WINOGRAD3X3))
         conv->useFP16 = true;
+
+    // Runtime FP16 check.
+    if (conv->useFP16 && !checkHardwareSupport(CPU_NEON_FP16))
+    {
+        conv->useFP16 = false;
+        CV_LOG_ONCE_WARNING(NULL, "DNN: the CPU does not support the instruction set required by FP16, fallback to FP32.");
+    }
 #endif
 
     float *srcWeights = (float *)weightsMat.data;
@@ -141,31 +209,23 @@ Ptr<FastConv> initFastConv(
         if (conv->useFP16)
         {
             conv->weightsBuf_FP16.resize(nweights + VEC_ALIGN);
-            conv->weightsBufPtr_FP16 = alignPtr(conv->weightsBuf_FP16.data(), VEC_ALIGN * sizeof(float16_t ));
-            memset(conv->weightsBufPtr_FP16, 0, nweights * sizeof(float16_t ));
-            auto weightsBufPtr_FP16 = conv->weightsBufPtr_FP16;
+            auto weightsPtr_FP16 = (__fp16*)conv->getWeightsFP16();
 
             parallel_for_(Range(0, C), [&](const Range& r0){
-            for(int c = r0.start; c < r0.end; c++)
-            {
-                for (int k = 0; k < ksize; k++)
-                    weightsBufPtr_FP16[c*padded_ksize + k] = (float16_t)srcWeights[c*wstep + k];
-            }});
+                for(int c = r0.start; c < r0.end; c++)
+                    _cvt32f16f(srcWeights + c*wstep, weightsPtr_FP16 + c*padded_ksize, ksize);
+            });
         }
         else
 #endif
         {
             conv->weightsBuf.resize(nweights + VEC_ALIGN);
-            conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN * sizeof(float ));
-            memset(conv->weightsBufPtr, 0, nweights*sizeof(float ));
-            auto weightsBufPtr = conv->weightsBufPtr;
+            auto weightsPtr = conv->getWeights();
 
-            parallel_for_(Range(0, C), [&](const Range& r0){
-            for(int c = r0.start; c < r0.end; c++)
-            {
-                for (int k = 0; k < ksize; k++)
-                    weightsBufPtr[c*padded_ksize + k] = srcWeights[c*wstep + k];
-            }});
+            parallel_for_(Range(0, C), [&](const Range& r0) {
+                for(int c = r0.start; c < r0.end; c++)
+                    memcpy(weightsPtr + c*padded_ksize, srcWeights + c*wstep, ksize*sizeof(weightsPtr[0]));
+            });
         }
     }
     else if(conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd
@@ -209,21 +269,17 @@ Ptr<FastConv> initFastConv(
 
         float* wptrWino = nullptr;
 #ifdef CONV_ARM_FP16
-        float16_t* wptrWino_FP16 = nullptr;
+        __fp16* wptrWino_FP16 = nullptr;
         if (conv->useFP16)
         {
             conv->weightsWinoBuf_FP16.resize(nweights + VEC_ALIGN);
-            conv->weightsWinoBufPtr_FP16 = alignPtr(conv->weightsWinoBuf_FP16.data(), VEC_ALIGN);
-            wptrWino_FP16 = conv->weightsWinoBufPtr_FP16;
-            memset(wptrWino_FP16, 0, nweights * sizeof(wptrWino_FP16[0]));
+            wptrWino_FP16 = (__fp16*)conv->getWeightsWinoFP16();
         }
         else
 #endif
         {
             conv->weightsWinoBuf.resize(nweights + VEC_ALIGN);
-            conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN);
-            wptrWino = conv->weightsWinoBufPtr;
-            memset(wptrWino, 0, nweights * sizeof(wptrWino[0]));
+            wptrWino = conv->getWeightsWino();
         }
 
         parallel_for_(Range(0, K), [&](const Range& r0){
@@ -267,15 +323,15 @@ Ptr<FastConv> initFastConv(
 #ifdef CONV_ARM_FP16
                 if (conv->useFP16)
                 {
-                    float16_t* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
+                    __fp16* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA +
                                   (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F16;
                     for (int i = 0; i < CONV_WINO_NATOMS_F16; i++,
                             wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F16)
                     {
-                        CV_Assert(conv->weightsWinoBufPtr_FP16 <= wptr && wptr + CONV_WINO_ATOM_F16 <= conv->weightsWinoBufPtr_FP16 + nweights);
+                        CV_Assert(wptrWino_FP16 <= wptr && wptr + CONV_WINO_ATOM_F16 <= wptrWino_FP16 + nweights);
                         for (int j = 0; j < CONV_WINO_ATOM_F16; j++)
                         {
-                            wptr[j] = (float16_t)kernelTm[i * CONV_WINO_ATOM_F16 + j];
+                            wptr[j] = (__fp16)kernelTm[i * CONV_WINO_ATOM_F16 + j];
                         }
                     }
                 }
@@ -287,7 +343,7 @@ Ptr<FastConv> initFastConv(
                     for (int i = 0; i < CONV_WINO_NATOMS_F32; i++,
                             wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F32)
                     {
-                        CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + CONV_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights);
+                        CV_Assert(wptrWino <= wptr && wptr + CONV_WINO_ATOM_F32 <= wptrWino + nweights);
                         memcpy(wptr, kernelTm + i * CONV_WINO_ATOM_F32, CONV_WINO_ATOM_F32*sizeof (wptr[0]));
                     }
                 }
@@ -305,29 +361,24 @@ Ptr<FastConv> initFastConv(
         int numStripsMR = (Kg + CONV_MR_FP32 - 1) / CONV_MR_FP32;
         int Kg_aligned = numStripsMR * CONV_MR_FP32;
         size_t nweights = ngroups*Kg_aligned*DkHkWkCg;
-
-        float* weightsBufPtr = nullptr;
+        float* weightsPtr = nullptr;
 
 #ifdef CONV_ARM_FP16
         int numStripsMR_FP16 = (Kg + CONV_MR_FP16 - 1) / CONV_MR_FP16;
         int Kg_aligned_FP16 = numStripsMR_FP16 * CONV_MR_FP16;
         size_t nweights_FP16 = ngroups * Kg_aligned_FP16 * DkHkWkCg;
+        __fp16* weightsPtr_FP16 = nullptr;
 
-        float16_t* weightsBufPtr_FP16 = nullptr;
         if (conv->useFP16)
         {
             conv->weightsBuf_FP16.resize(nweights_FP16 + VEC_ALIGN);
-            conv->weightsBufPtr_FP16 = alignPtr(conv->weightsBuf_FP16.data(), VEC_ALIGN);
-            weightsBufPtr_FP16 = conv->weightsBufPtr_FP16;
-            memset(weightsBufPtr_FP16, 0, nweights_FP16*sizeof(weightsBufPtr_FP16[0]));
+            weightsPtr_FP16 = (__fp16*)conv->getWeightsFP16();
         }
         else
 #endif
         {
             conv->weightsBuf.resize(nweights + VEC_ALIGN);
-            conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN);
-            weightsBufPtr = conv->weightsBufPtr;
-            memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0]));
+            weightsPtr = conv->getWeights();
         }
 
         // Pack the weight.
@@ -343,7 +394,7 @@ Ptr<FastConv> initFastConv(
                 int startK = si * CONV_MR_FP16;
                 CV_Assert(startK < Kg_aligned_FP16);
 
-                float16_t* packed_wptr = weightsBufPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
+                __fp16* packed_wptr = weightsPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16);
                 int dk = Kg - startK < CONV_MR_FP16 ? Kg - startK : CONV_MR_FP16; // check if we need zero padding.
 
                 int k_idx = g*Kg + startK;
@@ -354,9 +405,9 @@ Ptr<FastConv> initFastConv(
                         const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd;
                         int k = 0;
                         for(; k < dk; k++, wptr += wstep)
-                            packed_wptr[k] = (float16_t)(*wptr);
+                            packed_wptr[k] = (__fp16)(*wptr);
                         for(; k < CONV_MR_FP16; k++)
-                            packed_wptr[k] = (float16_t)0.f;
+                            packed_wptr[k] = (__fp16)0.f;
                     }
                 }
             }});
@@ -373,7 +424,7 @@ Ptr<FastConv> initFastConv(
                 int startK = si * CONV_MR_FP32;
                 CV_Assert(startK < Kg_aligned);
 
-                float* packed_wptr = weightsBufPtr + DkHkWkCg * (startK + g * Kg_aligned);
+                float* packed_wptr = weightsPtr + DkHkWkCg * (startK + g * Kg_aligned);
                 int dk = Kg - startK < CONV_MR_FP32 ? Kg - startK : CONV_MR_FP32; // check if we need zero padding.
 
                 int k_idx = g*Kg + startK;
@@ -393,7 +444,7 @@ Ptr<FastConv> initFastConv(
         }
     }
     else
-        CV_Error(CV_StsUnsupportedFormat, "Unknown convolution type.");
+        CV_Error(cv::Error::StsUnsupportedFormat, "Unknown convolution type.");
 
     // store bias; append some zero's to make sure that
     // we can always read MR elements starting from any valid index
@@ -410,14 +461,14 @@ Ptr<FastConv> initFastConv(
 }
 
 static inline void packData8(char*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab,
-                      const int stride_w, const int ksize, const int esz)
+                             const int stride_w, const int ksize, const int esz)
 {
     char * inpbufC = inpbuf + s0 * esz;
     float* inptrInC = (float* )inptrIn;
 
 #ifdef CONV_ARM_FP16
-    float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
-    if (esz == sizeof(float16_t))
+    __fp16* inpbufC_FP16 = (__fp16 *)inpbufC;
+    if (esz == sizeof(__fp16))
     {
         if (stride_w == 1)
         {
@@ -435,16 +486,8 @@ static inline void packData8(char*& inpbuf, float*& inptrIn, int& in_w, int& x0,
             for (int k = 0; k < ksize; k++)
             {
                 int k1 = ofstab[k];
-                float32x4_t v0, v1;
-
-                v0[0] = inptrInC[k1];
-                v0[1] = inptrInC[k1 + stride_w];
-                v0[2] = inptrInC[k1 + 2*stride_w];
-                v0[3] = inptrInC[k1 + 3*stride_w];
-                v1[0] = inptrInC[k1 + 4*stride_w];
-                v1[1] = inptrInC[k1 + 5*stride_w];
-                v1[2] = inptrInC[k1 + 6*stride_w];
-                v1[3] = inptrInC[k1 + 7*stride_w];
+                float32x4_t v0 = {inptrInC[k1], inptrInC[k1 + stride_w], inptrInC[k1 + 2*stride_w], inptrInC[k1 + 3*stride_w]};
+                float32x4_t v1 = {inptrInC[k1 + 4*stride_w], inptrInC[k1 + 5*stride_w], inptrInC[k1 + 6*stride_w], inptrInC[k1 + 7*stride_w]};
 
                 vst1q_f16((__fp16*)inpbufC_FP16 + k * CONV_NR_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
             }
@@ -516,22 +559,22 @@ static inline void packData8(char*& inpbuf, float*& inptrIn, int& in_w, int& x0,
 }
 
 static inline void packData2(char *& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab,
-                      const int stride_w, const int ksize, const int esz)
+                             const int stride_w, const int ksize, const int esz)
 {
     char* inpbufC = inpbuf + s0 * esz;
     float* inptrInC = inptrIn;
 
 #ifdef CONV_ARM_FP16
-    float16_t* inpbufC_FP16 = (float16_t *)inpbufC;
-    if (esz == sizeof(float16_t))
+    __fp16* inpbufC_FP16 = (__fp16 *)inpbufC;
+    if (esz == sizeof(__fp16))
     {
         for (int k = 0; k < ksize; k++)
         {
             int k1 = ofstab[k];
             float v0 = inptrInC[k1];
             float v1 = inptrInC[k1 + stride_w];
-            inpbufC_FP16[k*CONV_NR_FP16] = (float16_t)v0;
-            inpbufC_FP16[k*CONV_NR_FP16+1] = (float16_t)v1;
+            inpbufC_FP16[k*CONV_NR_FP16] = (__fp16)v0;
+            inpbufC_FP16[k*CONV_NR_FP16+1] = (__fp16)v1;
         }
     } else
 #endif
@@ -553,46 +596,6 @@ static inline void packData2(char *& inpbuf, float*& inptrIn, int& in_w, int& x0
     in_w += stride_w;
 }
 
-#ifdef CONV_ARM_FP16
-// Fast convert float 32 to float16
-static inline void _cvt32f16f( const float* src, float16_t* dst, int len)
-{
-    int j = 0;
-    const int VECSZ = 4;
-    __fp16* dst_FP16 = (__fp16 *)dst;
-    if (len > VECSZ * 4)
-    {
-        const int VECSZ4 = 4 * VECSZ;
-        for( ; j + VECSZ4 < len; j += VECSZ4)
-        {
-
-            float32x4_t v0 = vld1q_f32(src + j);
-            float32x4_t v1 = vld1q_f32(src + j + 4);
-            float32x4_t v2 = vld1q_f32(src + j + 8);
-            float32x4_t v3 = vld1q_f32(src + j + 12);
-
-            vst1q_f16(dst_FP16 + j, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
-            vst1q_f16(dst_FP16 + j + 8, vcombine_f16(vcvt_f16_f32(v2), vcvt_f16_f32(v3)));
-        }
-    }
-
-    for( ; j < len; j += VECSZ )
-    {
-        if( j > len - VECSZ )
-        {
-            if( j == 0 )
-                break;
-            j = len - VECSZ;
-        }
-
-        float16x4_t hv = vcvt_f16_f32(vld1q_f32(src + j));
-        vst1_f16(dst_FP16 + j, hv);
-    }
-    for( ; j < len; j++ )
-        dst[j] = float16_t(src[j]);
-}
-#endif
-
 static inline void packInputData(char* inpbuf_task, float* inp, const int* ofstab, const int* dhwTab, int zyx0, int zyx_limit,
                                  int ksize, int stride_d, int stride_h, int stride_w, int pad_front, int pad_top, int pad_left,
                                  int Dk, int Hk, int Wk, int dilation_d, int dilation_h, int dilation_w, int Di, int Hi, int Wi,
@@ -627,7 +630,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                 if (useFP16)
                 {
                     for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
-                        _cvt32f16f(inptr, (float16_t *)inpbuf, CONV_NR);
+                        _cvt32f16f(inptr, (__fp16 *)inpbuf, CONV_NR);
                 }
                 else
 #endif
@@ -641,8 +644,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                 {
                     for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
                     {
-                        _cvt32f16f(inptr, (float16_t *)inpbuf, slice_len);
-                        memset(inpbuf + slice_len * esz, 0, (CONV_NR - slice_len) * esz);
+                        _cvt32f16f(inptr, (__fp16 *)inpbuf, slice_len);
                     }
                 }
                 else
@@ -650,7 +652,6 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                 for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz)
                 {
                     memcpy(inpbuf, inptr, slice_len * esz);
-                    memset(inpbuf + slice_len * esz, 0, (CONV_NR - slice_len) * esz);
                 }
             }
         }
@@ -703,11 +704,11 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
 #ifdef CONV_ARM_FP16
                             if (useFP16)
                             {
-                                float16_t* inpbufC = (float16_t *)inpbuf + s0;
+                                __fp16* inpbufC = (__fp16 *)inpbuf + s0;
                                 for (int w = w0; w < w1; w++)
                                 {
                                     int imgofs = w*dilation_w;
-                                    inpbufC[w*CONV_NR] = (float16_t)inptrInC[imgofs];
+                                    inpbufC[w*CONV_NR] = (__fp16)inptrInC[imgofs];
                                 }
                             }
                             else
@@ -764,14 +765,14 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
 #ifdef CONV_ARM_FP16
                             if (useFP16)
                             {
-                                float16_t* inpbufC = (float16_t *)inpbuf + s0;
+                                __fp16* inpbufC = (__fp16 *)inpbuf + s0;
 
                                 for (int h = h0; h < h1; h++)
                                 {
                                     for (int w = w0; w < w1; w++)
                                     {
                                         int imgofs = h*(dilation_h*Wi) + w*dilation_w;
-                                        inpbufC[(h*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
+                                        inpbufC[(h*Wk + w)*CONV_NR] = (__fp16)inptrInC[imgofs];
                                     }
                                 }
                             }
@@ -837,7 +838,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
 #ifdef CONV_ARM_FP16
                             if (useFP16)
                             {
-                                float16_t* inpbufC = (float16_t* )inpbuf + s0;
+                                __fp16* inpbufC = (__fp16* )inpbuf + s0;
 
                                 for ( int d = d0; d < d1; d++)
                                 {
@@ -846,7 +847,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                                         for (int w = w0; w < w1; w++)
                                         {
                                             int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w;
-                                            inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs];
+                                            inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (__fp16)inptrInC[imgofs];
                                         }
                                     }
                                 }
@@ -888,7 +889,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                 {
                     float* inpbuf_ki = (float* )inpbuf + k * CONV_NR * Cg + i;
 #ifdef CONV_ARM_FP16
-                    float16_t * inpbuf_ki_FP16 = (float16_t *)inpbuf + k * CONV_NR * Cg + i;
+                    __fp16 * inpbuf_ki_FP16 = (__fp16 *)inpbuf + k * CONV_NR * Cg + i;
 #endif
 
                     int zi = z0 * stride_d + dz - pad_front;
@@ -939,11 +940,8 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                                 {
                                     for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
                                     {
-                                        float32x4_t v0, v1;
-                                        v0[0] = inptr_ki[0], v0[1] = inptr_ki[2];
-                                        v0[2] = inptr_ki[4], v0[3] = inptr_ki[6];
-                                        v1[0] = inptr_ki[8], v1[1] = inptr_ki[10];
-                                        v1[2] = inptr_ki[12], v1[3] = inptr_ki[14];
+                                        float32x4_t v0 = {inptr_ki[0], inptr_ki[2], inptr_ki[4], inptr_ki[6]};
+                                        float32x4_t v1 = {inptr_ki[8], inptr_ki[10], inptr_ki[12], inptr_ki[14]};
                                         vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
                                     }
                                 }
@@ -972,12 +970,8 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                                 {
                                     for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
                                     {
-                                        float32x4_t v0, v1;
-
-                                        v0[0] = inptr_ki[0], v0[1] = inptr_ki[stride_w];
-                                        v0[2] = inptr_ki[stride_w * 2], v0[3] = inptr_ki[stride_w * 3];
-                                        v1[0] = inptr_ki[stride_w * 4], v1[1] = inptr_ki[stride_w * 5];
-                                        v1[2] = inptr_ki[stride_w * 6], v1[3] = inptr_ki[stride_w * 7];
+                                        float32x4_t v0 = {inptr_ki[0], inptr_ki[stride_w], inptr_ki[stride_w * 2], inptr_ki[stride_w * 3]};
+                                        float32x4_t v1 = {inptr_ki[stride_w * 4], inptr_ki[stride_w * 5], inptr_ki[stride_w * 6], inptr_ki[stride_w * 7]};
                                         vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1)));
                                     }
                                 }
@@ -1034,9 +1028,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                                 {
                                     for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
                                     {
-                                        float32x4_t v0;
-                                        v0[0] = inptr_ki[0], v0[1] = inptr_ki[stride_w];
-                                        v0[2] = inptr_ki[stride_w * 2], v0[3] = inptr_ki[stride_w * 3];
+                                        float32x4_t v0 = {inptr_ki[0], inptr_ki[stride_w], inptr_ki[stride_w * 2], inptr_ki[stride_w * 3]};
                                         vst1_f16((__fp16* )inpbuf_ki_FP16, vcvt_f16_f32(v0));
                                     }
                                 }
@@ -1061,7 +1053,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                             if (useFP16)
                             {
                                 for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize)
-                                    inpbuf_ki_FP16[0] = (float16_t)(*inptr_ki);
+                                    inpbuf_ki_FP16[0] = (__fp16)(*inptr_ki);
                             }
                             else
 #endif
@@ -1077,7 +1069,7 @@ static inline void packInputData(char* inpbuf_task, float* inp, const int* ofsta
                         if (useFP16)
                         {
                             for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR)
-                                inpbuf_ki_FP16[0] = (float16_t)0.f;
+                                inpbuf_ki_FP16[0] = (__fp16)0.f;
                         }
                         else
 #endif
@@ -1174,10 +1166,9 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
     else
         activ = nullptr;
 
-    // TODO: support FP16 for winograd.
     if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd
     {
-        CV_Assert(conv->weightsWinoBufPtr && input.dims == 4 && conv_dim == CONV_2D && !useFP16);
+        CV_Assert((!conv->weightsWinoBuf.empty() || !conv->weightsWinoBuf_FP16.empty()) && input.dims == 4 && conv_dim == CONV_2D);
         if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
             return;
     }
@@ -1266,7 +1257,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
         // works at FP 16.
         CONV_NR = CONV_NR_FP16;
         CONV_MR = CONV_MR_FP16;
-        esz = sizeof(float16_t);
+        esz = sizeof(__fp16);
     }
 #endif
 
@@ -1290,7 +1281,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
     else
         Kg_nblocks = 1;
 
-    bool separateIm2col = fast_1x1 || stripes_per_plane == 1;
+    bool separateIm2col = (fast_1x1 || stripes_per_plane == 1) && conv->conv_type != CONV_TYPE_DEPTHWISE_REMAIN;
 
     int Kstripes = Kg_nblocks * stripes_per_plane;
     int nsubtasks = N * ngroups * Kstripes;
@@ -1437,13 +1428,13 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
                 if (useFP16)
                 {
                     CV_Assert(!conv->weightsBuf_FP16.empty());
-                    weights = (char *)conv->weightsBufPtr_FP16;
+                    weights = (char *)conv->getWeightsFP16();
                 }
                 else
 #endif
                 {
                     CV_Assert(!conv->weightsBuf.empty());
-                    weights = (char *)conv->weightsBufPtr;
+                    weights = (char *)conv->getWeights();
                 }
                 // optional branch, only for depth-wise convolution which was implemented by generic convolution.
                 // In this case, CONV_MR is 1, and CONV_NR remains the same.
@@ -1477,7 +1468,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
 #ifdef CONV_ARM_FP16
                             if (useFP16)
                             {
-                                opt_NEON::convBlockMR1_FP16(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
+                                opt_NEON_FP16::convBlockMR1_F16(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
                             }
                             else
 #endif
@@ -1485,7 +1476,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
                         }
                         else
 #endif
-                        convBlockMR1(DkHkWkCg, (const float *)weights, (const float *)inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
+                        convBlockMR1_F32(DkHkWkCg, (const float *)weights, (const float *)inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR);
 
                         if (ifBuffer)
                         {
@@ -1520,18 +1511,18 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
 
                             char *wptr = weights + (k0_block * DkHkWkCg + c0 * CONV_MR) * esz;
                             float *cptr = cbuf_task + stripe * CONV_NR;
-                            float16_t* cptr_f16 = (float16_t*)cbuf_task + stripe*CONV_NR;
+                            hfloat* cptr_f16 = (hfloat*)cbuf_task + stripe*CONV_NR;
                             for (int k = k0_block; k < k1_block; k += CONV_MR,
                                     wptr += DkHkWkCg * CONV_MR * esz, cptr += CONV_MR * ldc, cptr_f16 += CONV_MR * ldc)
                             {
 #if CV_TRY_AVX2
                                 if (conv->useAVX2)
-                                    opt_AVX2::convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
+                                    opt_AVX2::convBlock_F32(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
                                 else
 #endif
 #if CV_TRY_AVX
                                 if (conv->useAVX)
-                                    opt_AVX::convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
+                                    opt_AVX::convBlock_F32(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
                                 else
 #endif
 #if CV_NEON
@@ -1540,23 +1531,23 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
 #ifdef CONV_ARM_FP16
                                     if (useFP16)
                                     {
-                                        opt_NEON::convBlock_FP16(c1 - c0, wptr, inptr, (char *)cptr_f16, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
+                                        opt_NEON_FP16::convBlock_F16(c1 - c0, wptr, inptr, (char *)cptr_f16, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
                                     }
                                     else
 #endif
-                                    opt_NEON::convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
+                                    opt_NEON::convBlock_F32(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
                                 }
                                 else
 #endif
                                 // The possible outLen range is 24 or 8~1.
-                                convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
+                                convBlock_F32(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR);
                             }
                         }
                     }
 
                     size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0;
                     const float *cptr = cbuf_task;
-                    const float16_t *cptr_fp16 = (const float16_t *)cbuf_task;
+                    const hfloat *cptr_fp16 = (const hfloat *)cbuf_task;
                     float *outptr = out + outofs;
                     const float *pbptr = fusedAddPtr0 ? fusedAddPtr0 + outofs : 0;
 
@@ -1838,7 +1829,7 @@ static inline void convBlockMR1x12(int np, const float* a, const float* b, float
 }
 #endif
 
-void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
+void convBlockMR1_F32(int np, const float* a, const float* b, float *c, const float bias, bool init_c,
                   const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR)
 {
 #if CV_SIMD128
@@ -2088,7 +2079,7 @@ static inline void convBlockNoSIMD(int np, const float* a, const float* b, float
     }
 }
 
-void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
+void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen,
                const int convMR, const int convNR)
 {
     // The possible outLen range is [24, 8~1].
diff --git a/modules/dnn/src/layers/cpu_kernels/convolution.hpp b/modules/dnn/src/layers/cpu_kernels/convolution.hpp
index 22ef9a857527..5c8055337c5b 100644
--- a/modules/dnn/src/layers/cpu_kernels/convolution.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/convolution.hpp
@@ -14,7 +14,7 @@
 #define CONV_NR_FP32 28
 
 // The FP16 can only be supported by ARM64 and with FP16 FMA supported.
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // check FP16 FMA.
+#if CV_FP16 && CV_TRY_NEON_FP16 // check FP16 FMA.
 #define CONV_ARM_FP16 1
 #endif
 
@@ -22,7 +22,6 @@
 // Currently, only ARM 64 support FP16.
 #define CONV_MR_FP16 8
 #define CONV_NR_FP16 24
-typedef __fp16 float16_t; // Fix conflict between float16_t in arm_neon.h and float16_t in cvdef.h.
 #endif
 
 #elif CV_NEON              // 16 registers.
@@ -58,17 +57,15 @@ struct FastConv
     int pad_top, pad_bottom, pad_left, pad_right, pad_front, pad_behind;
 
     std::vector<float> weightsBuf;     // For generic Conv 2D
-    float* weightsBufPtr;
     std::vector<float> weightsWinoBuf; // For Winograd F(6x6, 3x3).
-    float* weightsWinoBufPtr;
     std::vector<float> biasBuf;
+    float* getWeights();
+    float* getWeightsWino();
 
-#if CV_NEON && CV_NEON_AARCH64 && CV_FP16
-    std::vector<float16_t> weightsBuf_FP16;
-    float16_t* weightsBufPtr_FP16;
-    std::vector<float16_t> weightsWinoBuf_FP16;
-    float16_t* weightsWinoBufPtr_FP16;
-#endif
+    std::vector<hfloat> weightsBuf_FP16;
+    std::vector<hfloat> weightsWinoBuf_FP16;
+    hfloat* getWeightsFP16();
+    hfloat* getWeightsWinoFP16();
 
     int conv_type;
     int conv_dim;  // Flag for conv1d, conv2d, or conv3d.
@@ -115,6 +112,32 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& c
 int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv, int ntasks,
                   float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);
 
+// Work around of NEON, the following functions are only used internally.
+namespace opt_NEON {
+#if CV_NEON
+void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR);
+
+void convBlockMR1_F32(int np, const float* a, const float* b, float* c, const float bias, bool init_c,
+                      const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR);
+
+#if CV_NEON_AARCH64
+/* Accumulate */
+void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock,
+                    const int winoIblock, const int winoKblock, const int winoAtom, const int winoNatom);
+
+/*Input transform*/
+void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep,
+                       float* outptr, int Cg, const int winoIblock, const int winoAtom);
+
+/*Output transform*/
+void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep,
+                       float* bpptr, int bpstep, float* outptr, int outstep,
+                       float bias, float minval, float maxval, bool ifMinMaxAct);
+#endif // CV_NEON_AARCH64
+#endif // CV_NEON
+} // namespace opt_NEON.
+
+
 } // namespace dnn
 } // namespace cv
 
diff --git a/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp b/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp
new file mode 100644
index 000000000000..f8fe2bb40e1f
--- /dev/null
+++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp
@@ -0,0 +1,402 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/runtime/ficus/impl/gemm.impl.h).
+// Here is the original license:
+/*
+    This file is a part of ficus language project.
+    See ficus/LICENSE for the licensing terms
+*/
+
+#include "../../precomp.hpp"
+#include "fast_gemm.hpp"
+
+#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+#include "fast_gemm_kernels.simd.hpp"
+#include "layers/cpu_kernels/fast_gemm_kernels.simd_declarations.hpp"
+#undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+#include "fast_gemm_kernels.default.hpp"
+
+namespace cv { namespace dnn {
+
+size_t fastGemmPackBSize(size_t N, size_t K, const FastGemmOpt &opt) {
+#if CV_TRY_NEON
+    if (opt.use_neon) {
+        return static_cast<size_t>(opt_NEON::fastGemmPackBSize(N, K));
+    } else
+#endif
+#if CV_TRY_AVX2
+    if (opt.use_avx2) {
+        return static_cast<size_t>(opt_AVX2::fastGemmPackBSize(N, K));
+    } else
+#endif
+#if CV_TRY_AVX
+    if (opt.use_avx) {
+        return static_cast<size_t>(opt_AVX::fastGemmPackBSize(N, K));
+    } else
+#endif
+#if CV_TRY_LASX
+    if (opt.use_lasx) {
+        return static_cast<size_t>(opt_LASX::fastGemmPackBSize(N, K));
+    } else
+#endif
+    {
+        return static_cast<size_t>(cpu_baseline::fastGemmPackBSize(N, K));
+    }
+}
+
+void fastGemmPackB(const Mat &B, std::vector<float> &packed_B, bool trans, FastGemmOpt &opt) {
+    CV_CheckTypeEQ(B.type(), CV_32F, "fastGemmPackB: only float32 is supported for now");
+
+    auto B_shape = shape(B);
+    int batch = total(B_shape, 0, B_shape.size() - 2),
+        K = B_shape[B_shape.size() - 2], N = B_shape.back(), ldb0 = N, ldb1 = 1;
+    if (trans) {
+        std::swap(K, N);
+        std::swap(ldb0, ldb1);
+    }
+
+    const auto *b = B.ptr<const char>();
+    int esz = B.elemSize();
+
+#if CV_TRY_NEON
+    if (opt.use_neon) {
+        int size_packed_B = opt_NEON::fastGemmPackBSize(N, K);
+        packed_B.resize(size_packed_B * batch);
+        auto *packed_b = (char*)packed_B.data();
+        for (int i = 0; i < batch; i++) {
+            opt_NEON::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, esz);
+            b += N * K * esz;
+            packed_b += size_packed_B * esz;
+        }
+    } else
+#endif
+#if CV_TRY_AVX2
+    if (opt.use_avx2) {
+        int size_packed_B = opt_AVX2::fastGemmPackBSize(N, K);
+        packed_B.resize(size_packed_B * batch);
+        auto *packed_b = (char*)packed_B.data();
+        for (int i = 0; i < batch; i++) {
+            opt_AVX2::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, esz);
+            b += N * K * esz;
+            packed_b += size_packed_B * esz;
+        }
+    } else
+#endif
+#if CV_TRY_AVX
+    if (opt.use_avx) {
+        int size_packed_B = opt_AVX::fastGemmPackBSize(N, K);
+        packed_B.resize(size_packed_B * batch);
+        auto *packed_b = (char*)packed_B.data();
+        for (int i = 0; i < batch; i++) {
+            opt_AVX::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, esz);
+            b += N * K * esz;
+            packed_b += size_packed_B * esz;
+        }
+    } else
+#endif
+#if CV_TRY_LASX
+    if (opt.use_lasx) {
+        int size_packed_B = opt_LASX::fastGemmPackBSize(N, K);
+        packed_B.resize(size_packed_B * batch);
+        auto *packed_b = (char*)packed_B.data();
+        for (int i = 0; i < batch; i++) {
+            opt_LASX::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, esz);
+            b += N * K * esz;
+            packed_b += size_packed_B * esz;
+        }
+    } else
+#endif
+    {
+        int size_packed_B = cpu_baseline::fastGemmPackBSize(N, K);
+        packed_B.resize(size_packed_B * batch);
+        auto *packed_b = (char*)packed_B.data();
+        for (int i = 0; i < batch; i++) {
+            cpu_baseline::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, esz);
+            b += N * K * esz;
+            packed_b += size_packed_B * esz;
+        }
+    }
+}
+
+void fastGemmPackB(bool trans, size_t N, size_t K, const float *B, size_t ldb, float *packed_B, const FastGemmOpt &opt) {
+    size_t ldb0 = ldb, ldb1 = 1;
+    if (trans) {
+        std::swap(K, N);
+        std::swap(ldb0, ldb1);
+    }
+
+    const auto &b = (const char *)B;
+    auto *packed_b = (char *)packed_B;
+
+#if CV_TRY_NEON
+    if (opt.use_neon) {
+        opt_NEON::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, sizeof(float));
+    } else
+#endif
+#if CV_TRY_AVX2
+    if (opt.use_avx2) {
+        opt_AVX2::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, sizeof(float));
+    } else
+#endif
+#if CV_TRY_AVX
+    if (opt.use_avx) {
+        opt_AVX::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, sizeof(float));
+    } else
+#endif
+#if CV_TRY_LASX
+    if (opt.use_lasx) {
+        opt_LASX::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, sizeof(float));
+    } else
+#endif
+    {
+        cpu_baseline::fastGemmPackBKernel(b, packed_b, N, K, ldb0, ldb1, sizeof(float));
+    }
+}
+
+static void fast_gemm_thin(float alpha, float beta, int M, int N, int K,
+                           const char *a_, int lda0, int lda1,
+                           const char *b_, int ldb,
+                           char *c_, int ldc, bool multi_thread) {
+    const float* a = (const float*)a_;
+
+    auto fn = [&](const Range &r) {
+        for(int start = r.start ; start < r.end; start++ ) {
+            float* c_i = (float*)c_ + start * ldc;
+            if (beta == 0.f)
+                for(int j = 0; j < N; j++ ) c_i[j] = 0.f;
+            else if (beta != 1.f)
+                for(int j = 0; j < N; j++ ) c_i[j] *= beta;
+            for(int k = 0; k < K; k++ ) {
+                const float* b_k = (const float*)b_ + k * ldb;
+                float aval = alpha * a[start * lda0 + k * lda1];
+                for(int j = 0; j < N; j++ )
+                    c_i[j] += aval * b_k[j];
+            }
+        }
+    };
+
+    if (multi_thread) {
+        int total = M; // outer loops
+        int cost_per_thread = static_cast<int>(K * N); // inner loops
+        double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
+        parallel_for_(Range(0, total), fn, nstripes);
+    } else {
+        fn(Range(0, M));
+    }
+}
+
+void fastGemm(bool trans_a, int M, int N, int K,
+              float alpha, const float *A, int lda,
+              const float *packed_B, float beta,
+              float *C, int ldc, FastGemmOpt &opt) {
+    const char *a = (const char *)A;
+    const char *packed_b = (const char *)packed_B;
+    char *c = (char *)C;
+
+    int lda0 = lda, lda1 = 1;
+    if (trans_a) {
+        std::swap(lda0, lda1);
+    }
+
+#if CV_TRY_NEON
+    if (opt.use_neon) {
+        opt_NEON::fastGemmKernel(M, N, K, alpha, a, lda0, lda1, packed_b, beta, c, ldc, sizeof(float), opt.multi_thread);
+    } else
+#endif
+#if CV_TRY_AVX2
+    if (opt.use_avx2) {
+        opt_AVX2::fastGemmKernel(M, N, K, alpha, a, lda0, lda1, packed_b, beta, c, ldc, sizeof(float), opt.multi_thread);
+    } else
+#endif
+#if CV_TRY_AVX
+    if (opt.use_avx) {
+        opt_AVX::fastGemmKernel(M, N, K, alpha, a, lda0, lda1, packed_b, beta, c, ldc, sizeof(float), opt.multi_thread);
+    } else
+#endif
+#if CV_TRY_LASX
+    if (opt.use_lasx) {
+        opt_LASX::fastGemmKernel(M, N, K, alpha, a, lda0, lda1, packed_b, beta, c, ldc, sizeof(float), opt.multi_thread);
+    } else
+#endif
+    {
+        cpu_baseline::fastGemmKernel(M, N, K, alpha, a, lda0, lda1, packed_b, beta, c, ldc, sizeof(float), opt.multi_thread);
+    }
+}
+
+void fastGemm(bool trans_a, bool trans_b, int ma, int na, int mb, int nb,
+              float alpha, const float *A, int lda0, int lda1, const float *B, int ldb0, int ldb1,
+              float beta, float *C, int ldc, FastGemmOpt &opt) {
+    const char *a = (const char *)A;
+    const char *b = (const char *)B;
+    char *c = (char *)C;
+
+    int M = trans_a ? na : ma;
+    int N = trans_b ? mb : nb;
+    int K = trans_a ? ma : na;
+
+    if (trans_a) {
+        std::swap(lda0, lda1);
+    }
+    if (trans_b) {
+        std::swap(ldb0, ldb1);
+    }
+
+    if (!trans_b && ldb1 == 1 && (M <= 4 || (uint64_t)M * N * K <= 10000)) {
+        return fast_gemm_thin(alpha, beta, M, N, K, a, lda0, lda1, b, ldb0, c, ldc, opt.multi_thread);
+    }
+
+#if CV_TRY_NEON
+    if (opt.use_neon) {
+        opt_NEON::fastGemmKernel(M, N, K, alpha, a, lda0, lda1,
+                                 b, ldb0, ldb1, beta,
+                                 c, ldc, sizeof(float), opt.multi_thread);
+    } else
+#endif
+#if CV_TRY_AVX2
+    if (opt.use_avx2) {
+        opt_AVX2::fastGemmKernel(M, N, K, alpha, a, lda0, lda1,
+                                 b, ldb0, ldb1, beta,
+                                 c, ldc, sizeof(float), opt.multi_thread);
+    } else
+#endif
+#if CV_TRY_AVX
+    if (opt.use_avx) {
+        opt_AVX::fastGemmKernel(M, N, K, alpha, a, lda0, lda1,
+                                 b, ldb0, ldb1, beta,
+                                 c, ldc, sizeof(float), opt.multi_thread);
+    } else
+#endif
+#if CV_TRY_LASX
+    if (opt.use_lasx) {
+        opt_LASX::fastGemmKernel(M, N, K, alpha, a, lda0, lda1,
+                                 b, ldb0, ldb1, beta,
+                                 c, ldc, sizeof(float), opt.multi_thread);
+    } else
+#endif
+    {
+        cpu_baseline::fastGemmKernel(M, N, K, alpha, a, lda0, lda1,
+                                     b, ldb0, ldb1, beta,
+                                     c, ldc, sizeof(float), opt.multi_thread);
+    }
+}
+
+void fastGemm(bool trans_a, bool trans_b,
+              float alpha, const Mat &A, const Mat &B,
+              float beta, Mat &C, FastGemmOpt &opt) {
+    CV_CheckTypeEQ(A.type(), CV_32F, "DNN/fastGemm: only support float32 for now");
+    CV_CheckTypeEQ(A.type(), B.type(), "DNN/fastGemm: A and B should have the same type");
+    CV_CheckTypeEQ(B.type(), C.type(), "DNN/fastGemm: B and C should have the same type");
+
+    const auto shape_a = shape(A);
+    CV_CheckEQ(shape_a.size(), static_cast<size_t>(2), "DNN/fastGemm: A must be 2-dimensional");
+    const auto shape_b = shape(B);
+    CV_CheckEQ(shape_b.size(), static_cast<size_t>(2), "DNN/fastGemm: B must be 2-dimensional");
+    const auto shape_c = shape(C);
+    CV_CheckEQ(shape_c.size(), static_cast<size_t>(2), "DNN/fastGemm: C must be 2-dimensional");
+
+    int ma = shape_a[0], na = shape_a[1];
+    int mb = shape_b[0], nb = shape_b[1];
+
+    int lda0 = na, lda1 = 1, ldb0 = nb, ldb1 = 1, ldc = shape_c[1];
+
+    const float *a = A.ptr<const float>();
+    const float *b = B.ptr<const float>();
+    float *c = C.ptr<float>();
+
+    fastGemm(trans_a, trans_b, ma, na, mb, nb,
+             alpha, a, lda0, lda1, b, ldb0, ldb1,
+             beta, c, ldc, opt);
+}
+
+void fastGemmBatch(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets,
+                   int M, int N, int K, float alpha, const float *A, int lda0, int lda1,
+                   const float *B, int ldb0, int ldb1, float beta, float *C, int ldc, FastGemmOpt &opt) {
+    const char *a = (const char *)A;
+    const char *b = (const char *)B;
+    char *c = (char *)C;
+
+#if CV_TRY_NEON
+    if (opt.use_neon) {
+        opt_NEON::fastGemmBatchKernel(batch, A_offsets, B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, ldb0, ldb1, beta, c, ldc, sizeof(float));
+    } else
+#endif
+#if CV_TRY_AVX2
+    if (opt.use_avx2) {
+        opt_AVX2::fastGemmBatchKernel(batch, A_offsets, B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, ldb0, ldb1, beta, c, ldc, sizeof(float));
+    } else
+#endif
+#if CV_TRY_AVX
+    if (opt.use_avx) {
+        opt_AVX::fastGemmBatchKernel(batch, A_offsets, B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, ldb0, ldb1, beta, c, ldc, sizeof(float));
+    } else
+#endif
+#if CV_TRY_LASX
+    if (opt.use_lasx) {
+        opt_LASX::fastGemmBatchKernel(batch, A_offsets, B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, ldb0, ldb1, beta, c, ldc, sizeof(float));
+    } else
+#endif
+    {
+        cpu_baseline::fastGemmBatchKernel(batch, A_offsets, B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, ldb0, ldb1, beta, c, ldc, sizeof(float));
+    }
+}
+
+void fastGemmBatch(size_t batch, const size_t *A_offsets, const size_t *packed_B_offsets, const size_t *C_offsets,
+                   int M, int N, int K, float alpha, const float *A, int lda0, int lda1,
+                   const float *packed_B, float beta, float *C, int ldc, FastGemmOpt &opt) {
+    const char *a = (const char *)A;
+    const char *b = (const char *)packed_B;
+    char *c = (char *)C;
+
+#if CV_TRY_NEON
+    if (opt.use_neon) {
+        opt_NEON::fastGemmBatchKernel(batch, A_offsets, packed_B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, beta, c, ldc, sizeof(float));
+    } else
+#endif
+#if CV_TRY_AVX2
+    if (opt.use_avx2) {
+        opt_AVX2::fastGemmBatchKernel(batch, A_offsets, packed_B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, beta, c, ldc, sizeof(float));
+    } else
+#endif
+#if CV_TRY_AVX
+    if (opt.use_avx) {
+        opt_AVX::fastGemmBatchKernel(batch, A_offsets, packed_B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, beta, c, ldc, sizeof(float));
+    } else
+#endif
+#if CV_TRY_LASX
+    if (opt.use_lasx) {
+        opt_LASX::fastGemmBatchKernel(batch, A_offsets, packed_B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, beta, c, ldc, sizeof(float));
+    } else
+#endif
+    {
+        cpu_baseline::fastGemmBatchKernel(batch, A_offsets, packed_B_offsets, C_offsets, M, N, K, alpha, a, lda0, lda1, b, beta, c, ldc, sizeof(float));
+    }
+}
+
+void fastGemmBatch(bool trans_a, bool trans_b,
+                   float alpha, const Mat &A, const Mat &B,
+                   float beta, Mat &C, FastGemmOpt &opt) {
+    CV_CheckTypeEQ(A.type(), B.type(), "DNN/fastGemmBatch: A and B should have the same type");
+    CV_CheckTypeEQ(B.type(), C.type(), "DNN/fastGemmBatch: B and C should have the same type");
+    CV_CheckTypeEQ(A.type(), CV_32F, "DNN/fastGemmBatch: only support float32 for now");
+
+    const auto shape_a = shape(A);
+    const auto shape_b = shape(B);
+    const auto shape_c = shape(C);
+    CV_CheckGE(shape_a.size(), static_cast<size_t>(2), "DNN/fastGemmBatch: A must be n-dimensional (n >= 2)");
+    CV_CheckGE(shape_b.size(), static_cast<size_t>(2), "DNN/fastGemmBatch: B must be n-dimensional (n >= 2)");
+
+    const float *a = A.ptr<const float>();
+    const float *b = B.ptr<const float>();
+    float *c = C.ptr<float>();
+
+    MatMulHelper helper;
+    helper.compute(trans_a, trans_b, shape_a, shape_b, shape_c);
+
+    fastGemmBatch(helper.batch, helper.A_offsets.data(), helper.B_offsets.data(), helper.C_offsets.data(),
+                  helper.M, helper.N, helper.K, alpha, a, helper.lda0, helper.lda1, b, helper.ldb0,
+                  helper.ldb1, beta, c, helper.ldc, opt);
+}
+
+}} // cv::dnn
diff --git a/modules/dnn/src/layers/cpu_kernels/fast_gemm.hpp b/modules/dnn/src/layers/cpu_kernels/fast_gemm.hpp
new file mode 100644
index 000000000000..a207c63c3c79
--- /dev/null
+++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm.hpp
@@ -0,0 +1,181 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/runtime/ficus/impl/gemm.impl.h).
+// Here is the original license:
+/*
+    This file is a part of ficus language project.
+    See ficus/LICENSE for the licensing terms
+*/
+
+#ifndef OPENCV_DNN_FAST_GEMM_HPP
+#define OPENCV_DNN_FAST_GEMM_HPP
+
+#include "opencv2/core/hal/intrin.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace cv { namespace dnn {
+
+struct FastGemmOpt {
+    bool use_avx;
+    bool use_avx2;
+    bool use_neon;
+    bool use_lasx;
+    bool multi_thread;
+
+    FastGemmOpt() {
+        use_avx = false;
+        use_avx2 = false;
+        use_neon = false;
+        use_lasx = false;
+        multi_thread = false;
+    }
+
+    void init() {
+        use_avx = checkHardwareSupport(CPU_AVX);
+        use_avx2 = checkHardwareSupport(CPU_AVX2);
+        use_neon = checkHardwareSupport(CPU_NEON);
+        use_lasx = checkHardwareSupport(CPU_LASX);
+        multi_thread = true;
+    }
+
+    bool all() {
+        return use_avx || use_avx2 || use_neon || use_lasx;
+    }
+};
+
+struct MatMulHelper {
+    std::vector<size_t> A_offsets;
+    std::vector<size_t> B_offsets;
+    std::vector<size_t> packed_B_offsets;
+    std::vector<size_t> C_offsets;
+    std::vector<size_t> A_rows;
+    std::vector<size_t> B_rows;
+    std::vector<size_t> C_rows;
+    size_t batch;
+
+    int lda0, lda1;
+    int ldb0, ldb1;
+    int ldc;
+
+    int M, N, K;
+
+    MatMulHelper() {
+        A_offsets = {0};
+        B_offsets = {0};
+        packed_B_offsets = {0};
+        C_offsets = {0};
+        A_rows = {0};
+        B_rows = {0};
+        C_rows = {0};
+
+        batch = 0;
+    }
+
+    bool empty() const {
+        return batch == 0;
+    }
+
+    void compute(bool trans_a, bool trans_b, MatShape A_shape, MatShape B_shape, MatShape C_shape) {
+        auto A_ndims = A_shape.size(), B_ndims = B_shape.size(), C_ndims =  C_shape.size();
+        int ma = A_shape[A_ndims - 2], na = A_shape.back();
+        int mb = B_shape[B_ndims - 2], nb = B_shape.back();
+        lda0 = na, lda1 = 1;
+        ldb0 = nb, ldb1 = 1;
+        ldc = C_shape.back();
+
+        M = trans_a ? na : ma;
+        N = trans_b ? mb : nb;
+        K = trans_a ? ma : na;
+
+        if (trans_a) {
+            std::swap(lda0, lda1);
+        }
+        if (trans_b) {
+            std::swap(ldb0, ldb1);
+        }
+
+        // compute offsets
+        auto batch_ndims = C_ndims - 2;
+
+        batch = total(C_shape, 0, batch_ndims);
+
+        A_offsets.resize(batch, 0);
+        B_offsets.resize(batch, 0);
+        C_offsets.resize(batch, 0);
+        A_rows.resize(batch, 0);
+        B_rows.resize(batch, 0);
+        C_rows.resize(batch, 0);
+
+        // build C_offsets
+        size_t C_step = total(C_shape, C_ndims - 2);
+
+        MatShape A_broadcast_shape(C_ndims, 1);
+        std::memcpy(A_broadcast_shape.data() + (C_ndims - A_ndims), A_shape.data(), A_ndims * sizeof(int));
+        MatShape B_broadcast_shape(C_shape.size(), 1);
+        std::memcpy(B_broadcast_shape.data() + (C_ndims - B_ndims), B_shape.data(), B_shape.size() * sizeof(int));
+        std::vector<size_t> A_steps(C_ndims, 1), B_steps(C_ndims, 1);
+        for (int i = C_ndims - 2; i >= 0; i--) {
+            A_steps[i] = A_steps[i + 1] * A_broadcast_shape[i + 1];
+            B_steps[i] = B_steps[i + 1] * B_broadcast_shape[i + 1];
+        }
+        size_t t, idx;
+        for (size_t i = 0; i < batch; i++) {
+            C_offsets[i] = i * C_step;
+            C_rows[i] = i;
+
+            size_t A_offset = 0, B_offset = 0;
+            t = i;
+            for (int j = batch_ndims - 1; j >= 0; j--) {
+                idx = t / C_shape[j];
+                int idx_offset = (int)(t - idx * C_shape[j]);
+                A_offset += A_broadcast_shape[j] == 1 ? 0 : idx_offset * A_steps[j];
+                B_offset += B_broadcast_shape[j] == 1 ? 0 : idx_offset * B_steps[j];
+                t = idx;
+            }
+            A_offsets[i] = A_offset;
+            B_offsets[i] = B_offset;
+            A_rows[i] = A_offset / (M * K);
+            B_rows[i] = B_offset / (N * K);
+        }
+    }
+
+    // only run after compute
+    void updatePackedBOffsets(size_t packed_B_size) {
+        size_t packed_B_inner_size = packed_B_size / batch;
+        packed_B_offsets.resize(B_offsets.size());
+        for (size_t i = 0; i < packed_B_offsets.size(); i++) {
+            packed_B_offsets[i] = (B_offsets[i] / (N * K)) * packed_B_inner_size;
+        }
+    }
+};
+
+size_t fastGemmPackBSize(size_t N, size_t K, const FastGemmOpt &opt);
+
+void fastGemmPackB(const Mat &m, std::vector<float> &packed_B, bool trans, FastGemmOpt &opt);
+void fastGemmPackB(bool trans, size_t N, size_t K, const float *B, size_t ldb, float *packed_B, const FastGemmOpt &opt);
+
+void fastGemm(bool trans_a, int M, int N, int K,
+              float alpha, const float *A, int lda,
+              const float *packed_B, float beta,
+              float *C, int ldc, FastGemmOpt &opt);
+void fastGemm(bool trans_a, bool trans_b, int ma, int na, int mb, int nb,
+              float alpha, const float *A, int lda0, int lda1, const float *B, int ldb0, int ldb1,
+              float beta, float *C, int ldc, FastGemmOpt &opt);
+void fastGemm(bool trans_a, bool trans_b,
+              float alpha, const Mat &A, const Mat &B,
+              float beta, Mat &C, FastGemmOpt &opt);
+
+void fastGemmBatch(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets,
+                   int M, int N, int K, float alpha, const float *A, int lda0, int lda1,
+                   const float *B, int ldb0, int ldb1, float beta, float *C, int ldc, FastGemmOpt &opt);
+void fastGemmBatch(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets,
+                   int M, int N, int K, float alpha, const float *A, int lda0, int lda1,
+                   const float *packed_B, float beta, float *C, int ldc, FastGemmOpt &opt);
+void fastGemmBatch(bool trans_a, bool trans_b, float alpha, const Mat &A,
+                   const Mat &B, float beta, Mat &C, FastGemmOpt &opt);
+
+}} // cv::dnn
+
+#endif // OPENCV_DNN_FAST_GEMM_HPP
diff --git a/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.default.hpp b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.default.hpp
new file mode 100644
index 000000000000..f6bd7317a238
--- /dev/null
+++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.default.hpp
@@ -0,0 +1,489 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/runtime/ficus/impl/gemm.impl.h).
+// Here is the original license:
+/*
+    This file is a part of ficus language project.
+    See ficus/LICENSE for the licensing terms
+*/
+
+#include <opencv2/core/hal/intrin.hpp>
+#include <opencv2/core/utility.hpp> // parallel_for_
+
+#define FAST_GEMM_STORAGE (1<<20) // 2^20
+#define FAST_GEMM_MAX_STACKBUF (1 << 14)
+
+#define FAST_GEMM_F32_MC 64
+#define FAST_GEMM_F32_NC 240
+#define FAST_GEMM_F32_MR 8
+#define FAST_GEMM_F32_NR 12
+#define FAST_GEMM_F32_PACKED_STRIDE_K 64
+
+#define FAST_GEMM_IMPLEMENT_PACK(N, suffix, styp, dtyp) \
+static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \
+                                      int lda0, int lda1, void* packA_ ) \
+{ \
+    const styp* A = (const styp*)A_; \
+    dtyp* packA = (dtyp*)packA_; \
+    for( int i = 0; i < m; i += N ) { \
+        if (i + N-1 < m) { \
+            const styp* a_ptr = A + lda0*i; \
+            for( int j = 0; j < k*lda1; packA += N, j += lda1 ) \
+            { \
+                FAST_GEMM_LOAD_TO_BUF_##N(styp); \
+                FAST_GEMM_PACK##suffix##_##N(buf, packA); \
+            } \
+        } else { \
+            const styp* a_ptr[N]; \
+            for (int k = 0; k < N; k++) a_ptr[k] = A + lda0*(i+k < m ? i+k : i); \
+            for( int j = 0; j < k*lda1; packA += N, j += lda1 ) \
+            { \
+                FAST_GEMM_LOAD_TO_BUF_BORDERS_##N(styp); \
+                FAST_GEMM_PACK##suffix##_##N(buf, packA); \
+            } \
+        } \
+    } \
+}
+
+#define FAST_GEMM_LOAD_TO_BUF_8(styp) \
+    styp buf[] = { \
+        a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \
+        a_ptr[j+lda0*4], a_ptr[j+lda0*5], a_ptr[j+lda0*6], a_ptr[j+lda0*7] }
+
+#define FAST_GEMM_LOAD_TO_BUF_BORDERS_8(styp) \
+    styp buf[] = { \
+        a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j], \
+        a_ptr[4][j], a_ptr[5][j], a_ptr[6][j], a_ptr[7][j] }
+
+#define FAST_GEMM_LOAD_TO_BUF_12(styp) \
+    styp buf[] = { \
+        a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \
+        a_ptr[j+lda0*4], a_ptr[j+lda0*5], a_ptr[j+lda0*6], a_ptr[j+lda0*7], \
+        a_ptr[j+lda0*8], a_ptr[j+lda0*9], a_ptr[j+lda0*10], a_ptr[j+lda0*11] }
+
+#define FAST_GEMM_LOAD_TO_BUF_BORDERS_12(styp) \
+    styp buf[] = { \
+        a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j], \
+        a_ptr[4][j], a_ptr[5][j], a_ptr[6][j], a_ptr[7][j], \
+        a_ptr[8][j], a_ptr[9][j], a_ptr[10][j], a_ptr[11][j] }
+
+#define FAST_GEMM_PACK_COPY(src, dst, N) \
+    memcpy((dst), (src), N*sizeof(src[0]))
+#define FAST_GEMM_PACK_f32_8(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 8)
+#define FAST_GEMM_PACK_f32_12(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 12)
+
+namespace cv { namespace dnn { namespace cpu_baseline {
+
+int fastGemmPackBSize(int N, int K);
+
+void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz);
+
+void fastGemmKernel(int M, int N, int K,
+                    float alpha, const char *A, int lda0, int lda1,
+                    const char *B, int ldb0, int ldb1,
+                    float beta, char *C, int ldc, int esz, bool multi_thread);
+void fastGemmKernel(int M, int N, int K,
+                    float alpha, const char *A, int lda0, int lda1,
+                    const char *packed_B, float beta, char *C, int ldc, int esz, bool multi_thread);
+
+void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets,
+                         int M, int N, int K, float alpha, const char *A, int lda0, int lda1,
+                         const char *B, int ldb0, int ldb1, float beta, char *C, int ldc, int esz);
+void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets,
+                         int M, int N, int K, float alpha, const char *A, int lda0, int lda1,
+                         const char *packed_B, float beta, char *C, int ldc, int esz);
+
+FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float)
+FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float)
+
+int fastGemmPackBSize(int N, int K) {
+    int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
+    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
+
+    return static_cast<int>((N + NC - 1) / NC) * NC * K;
+}
+
+void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) {
+    int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
+    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
+    int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
+
+    int n_tiles = (N + NC - 1) / NC;
+    for (int r = 0; r < n_tiles; ++r) {
+        int j0 = r * NC;
+        int nc = N - j0 < NC ? N - j0 : NC;
+        int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
+        for (int k = 0; k < K; k += KC) {
+            int kc = K - k < KC ? K - k : KC;
+            fast_gemm_pack12_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
+            packed_B += _nc * kc;
+        }
+    }
+}
+
+static inline void fast_gemm_f32(int k, const char *a_, const char *b_,
+                                 char *c_, int ldc, float alpha) {
+    const float* a = (const float*)a_;
+    const float* b = (const float*)b_;
+    float* c = (float*)c_;
+
+    float sbuf[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR];
+    memset(sbuf, 0, sizeof(sbuf));
+    for(int p = 0; p < k; p++) {
+        for( int i = 0; i < FAST_GEMM_F32_MR; i++ ) {
+            float ai = a[FAST_GEMM_F32_MR * p + i];
+            for( int j = 0; j < FAST_GEMM_F32_NR; j++ )
+                sbuf[i * FAST_GEMM_F32_NR + j] += b[FAST_GEMM_F32_NR * p + j] * ai;
+        }
+    }
+    for (int i = 0; i < FAST_GEMM_F32_MR; i++) {
+        for (int j = 0; j < FAST_GEMM_F32_NR; j++)
+            c[i * ldc + j] += alpha * sbuf[i * FAST_GEMM_F32_NR + j];
+    }
+}
+
+static void fast_gemm_macro_kernel(int m, int n, int k,
+                                   const char *packed_A, const char *packed_B,
+                                   float alpha, char *c, int ldc0, int esz) {
+    int ldc0_esz = ldc0 * esz;
+
+    double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough
+    for(int i = 0; i < m; i += FAST_GEMM_F32_MR) {
+        for(int j = 0; j < n; j += FAST_GEMM_F32_NR) {
+            char* cptr0 = &c[i * ldc0_esz + j * esz];
+            char* cptr = cptr0;
+            int ldc = ldc0;
+            int mr = m - i < FAST_GEMM_F32_MR ? m - i : FAST_GEMM_F32_MR;
+            int nr = n - j < FAST_GEMM_F32_NR ? n - j : FAST_GEMM_F32_NR;
+            int nr_esz = nr * esz;
+            bool partial = (bool)((mr < FAST_GEMM_F32_MR) | (nr < FAST_GEMM_F32_NR));
+            if (partial) {
+                memset(tempC, 0, sizeof(tempC));
+                cptr = (char *)tempC;
+                ldc = FAST_GEMM_F32_NR;
+                for(int p = 0; p < mr; p++)
+                    memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz);
+            }
+            fast_gemm_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
+
+            if (partial) {
+                for(int p = 0; p < mr; p++)
+                    memcpy(cptr0 + p * ldc0_esz, cptr + p * (ldc * esz), nr_esz);
+            }
+        }
+    }
+}
+
+void fastGemmKernel(int M, int N, int K,
+                    float alpha, const char *A, int lda0, int lda1,
+                    const char *B, int ldb0, int ldb1,
+                    float beta, char *C, int ldc, int esz, bool multi_thread) {
+    int GEMM_MC = FAST_GEMM_F32_MC,
+        GEMM_NC = FAST_GEMM_F32_NC,
+        GEMM_MR = FAST_GEMM_F32_MR,
+        GEMM_NR = FAST_GEMM_F32_NR;
+
+    int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
+    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
+    int KC = FAST_GEMM_STORAGE / ((MC + NC) * esz);
+    KC = KC > 8 ? KC : 8;
+    KC = KC < K ? KC : K;
+
+    size_t buff_size = KC * (MC + NC) * esz;
+    bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
+    int m_tiles = (M + MC - 1) / MC;
+    int n_tiles = (N + NC - 1) / NC;
+    int total_tiles = m_tiles * n_tiles;
+
+    auto fn = [&](const Range &r) {
+        char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size));
+        char* packed_b = packed_a + KC * MC * esz;
+        int start = r.start;
+        int end = r.end;
+
+        for (int tile_idx = start; tile_idx < end; tile_idx++) {
+            int i0 = (tile_idx / n_tiles) * MC;
+            int j0 = (tile_idx % n_tiles) * NC;
+            int mc = M - i0 < MC ? M - i0 : MC;
+            int nc = N - j0 < NC ? N - j0 : NC;
+            int ldc_block = ldc;
+            char* c_block = C + (i0 * ldc + j0) * esz;
+
+            if (beta == 0.f) {
+                for(int i = 0; i < mc; i++)
+                    memset(c_block + i * ldc_block * esz, 0, nc * esz);
+            } else if (beta != 1.f) {
+                for(int i = 0; i < mc; i++) {
+                    float* c_i = (float*)c_block + i * ldc_block;
+                    for(int j = 0; j < nc; j++)
+                        c_i[j] *= beta;
+                }
+            }
+
+            for(int k0 = 0; k0 < K; k0 += KC)
+            {
+                int kc = K - k0 < KC ? K - k0 : KC;
+                fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+                fast_gemm_pack12_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
+                fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz);
+            }
+        }
+
+        if (!use_stackbuff) {
+            free(packed_a);
+        }
+    };
+
+    if (multi_thread) {
+        int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
+        double nstripes = (size_t)total_tiles * cost_per_thread * (1 / 1024.0);
+        parallel_for_(Range(0, total_tiles), fn, nstripes);
+    } else {
+        fn(Range(0, total_tiles));
+    }
+}
+
+void fastGemmKernel(int M, int N, int K,
+                    float alpha, const char *A, int lda0, int lda1,
+                    const char *packed_B, float beta, char *C, int ldc, int esz, bool multi_thread) {
+    int GEMM_MC = FAST_GEMM_F32_MC,
+        GEMM_NC = FAST_GEMM_F32_NC,
+        GEMM_MR = FAST_GEMM_F32_MR,
+        GEMM_NR = FAST_GEMM_F32_NR;
+
+    int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
+    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
+    int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
+
+    size_t buff_size = KC * MC * esz;
+    bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
+    int m_tiles = (M + MC - 1) / MC;
+    int n_tiles = (N + NC - 1) / NC;
+    int total_tiles = m_tiles * n_tiles;
+
+    auto fn = [&](const Range &r) {
+        char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); // TODO: use AutoBuffer
+        const char *packed_b_ = packed_B;
+        int start = r.start;
+        int end = r.end;
+
+        for (int tile_idx = start; tile_idx < end; tile_idx++) {
+            int i0 = (tile_idx / n_tiles) * MC;
+            int j0 = (tile_idx % n_tiles) * NC;
+            int mc = M - i0 < MC ? M - i0 : MC;
+            int nc = N - j0 < NC ? N - j0 : NC;
+            int ldc_block = ldc;
+            char* c_block = C + (i0 * ldc + j0) * esz;
+            packed_b_ = packed_B + j0 * K * esz;
+
+            if (beta == 0.f) {
+                for(int i = 0; i < mc; i++)
+                    memset(c_block + i * ldc_block * esz, 0, nc * esz);
+            } else if (beta != 1.f) {
+                for(int i = 0; i < mc; i++) {
+                    float* c_i = (float*)c_block + i * ldc_block;
+                    for(int j = 0; j < nc; j++)
+                        c_i[j] *= beta;
+                }
+            }
+
+            int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
+            for(int k0 = 0; k0 < K; k0 += KC)
+            {
+                int kc = K - k0 < KC ? K - k0 : KC;
+                fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+                fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b_, alpha, c_block, ldc_block, esz);
+                packed_b_ += _nc * kc;
+            }
+        }
+
+        if (!use_stackbuff) {
+            free(packed_a);
+        }
+    };
+
+    if (multi_thread) {
+        int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
+        double nstripes = (size_t)total_tiles * cost_per_thread * (1 / 1024.0);
+        parallel_for_(Range(0, total_tiles), fn, nstripes);
+    } else {
+        fn(Range(0, total_tiles));
+    }
+}
+
+void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets,
+                         int M, int N, int K, float alpha, const char *A, int lda0, int lda1,
+                         const char *B, int ldb0, int ldb1, float beta, char *C, int ldc, int esz) {
+    int GEMM_MC = FAST_GEMM_F32_MC,
+        GEMM_NC = FAST_GEMM_F32_NC,
+        GEMM_MR = FAST_GEMM_F32_MR,
+        GEMM_NR = FAST_GEMM_F32_NR;
+
+    int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
+    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
+    int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
+
+    size_t buff_size = KC * (MC + NC) * esz;
+    bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
+    int m_tiles = (M + MC - 1) / MC;
+    int n_tiles = (N + NC - 1) / NC;
+    int total_tiles = m_tiles * n_tiles;
+
+    auto fn = [&](const Range &r) {
+        char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size));
+        char* packed_b = packed_a + KC * MC * esz;
+        int start = r.start;
+        int end = r.end;
+
+        for (int tile_idx = start; tile_idx < end; tile_idx++) {
+            const int batch_index = static_cast<int>(tile_idx / total_tiles);
+            const int m_tiles_index = static_cast<int>((tile_idx - batch_index * total_tiles) / n_tiles);
+            const int n_tiles_index = static_cast<int>(tile_idx % n_tiles);
+
+            int i0 = m_tiles_index * MC;
+            int j0 = n_tiles_index * NC;
+            int mc = M - i0 < MC ? M - i0 : MC;
+            int nc = N - j0 < NC ? N - j0 : NC;
+            int ldc_block = ldc;
+            const char *a_block = A + A_offsets[batch_index] * esz;
+            const char *b_block = B + B_offsets[batch_index] * esz;
+            char* c_block = C + C_offsets[batch_index] * esz + (i0 * ldc + j0) * esz;
+
+            if (beta == 0.f) {
+                for(int i = 0; i < mc; i++)
+                    memset(c_block + i * ldc_block * esz, 0, nc * esz);
+            } else if (beta != 1.f) {
+                for(int i = 0; i < mc; i++) {
+                    float* c_i = (float*)c_block + i * ldc_block;
+                    for(int j = 0; j < nc; j++)
+                        c_i[j] *= beta;
+                }
+            }
+
+            for(int k0 = 0; k0 < K; k0 += KC)
+            {
+                int kc = K - k0 < KC ? K - k0 : KC;
+                // pack a
+                fast_gemm_pack8_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+
+                // pack b
+                fast_gemm_pack12_f32(nc, kc, b_block + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
+
+                // run kernel
+                fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz);
+            }
+        }
+
+        if (!use_stackbuff) {
+            free(packed_a);
+        }
+    };
+
+    int total = batch * total_tiles;
+    int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
+    double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
+    parallel_for_(Range(0, total), fn, nstripes);
+}
+
+void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets,
+                         int M, int N, int K, float alpha, const char *A, int lda0, int lda1,
+                         const char *packed_B, float beta, char *C, int ldc, int esz) {
+    int GEMM_MC = FAST_GEMM_F32_MC,
+        GEMM_NC = FAST_GEMM_F32_NC,
+        GEMM_MR = FAST_GEMM_F32_MR,
+        GEMM_NR = FAST_GEMM_F32_NR;
+
+    int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
+    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
+    int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
+
+    size_t buff_size = KC * MC * esz;
+    bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
+    int m_tiles = (M + MC - 1) / MC;
+    int n_tiles = (N + NC - 1) / NC;
+    int total_tiles = m_tiles * n_tiles;
+
+    auto fn = [&](const Range &r) {
+        char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size));
+        const char *packed_b = packed_B;
+        int start = r.start;
+        int end = r.end;
+
+        for (int tile_idx = start; tile_idx < end; tile_idx++) {
+            const int batch_index = static_cast<int>(tile_idx / total_tiles);
+            const int m_tiles_index = static_cast<int>((tile_idx - batch_index * total_tiles) / n_tiles);
+            const int n_tiles_index = static_cast<int>(tile_idx % n_tiles);
+
+            int i0 = m_tiles_index * MC;
+            int j0 = n_tiles_index * NC;
+            int mc = M - i0 < MC ? M - i0 : MC;
+            int nc = N - j0 < NC ? N - j0 : NC;
+            int ldc_block = ldc;
+            const char *a_block = A + A_offsets[batch_index] * esz;
+            packed_b = packed_B + B_offsets[batch_index] * esz + j0 * K * esz;
+            char* c_block = C + C_offsets[batch_index] * esz + (i0 * ldc + j0) * esz;
+
+            if (beta == 0.f) {
+                for(int i = 0; i < mc; i++)
+                    memset(c_block + i * ldc_block * esz, 0, nc * esz);
+            } else if (beta != 1.f) {
+                for(int i = 0; i < mc; i++) {
+                    float* c_i = (float*)c_block + i * ldc_block;
+                    for(int j = 0; j < nc; j++)
+                        c_i[j] *= beta;
+                }
+            }
+
+            int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
+            for(int k0 = 0; k0 < K; k0 += KC)
+            {
+                int kc = K - k0 < KC ? K - k0 : KC;
+                // pack a
+                fast_gemm_pack8_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+
+                // run kernel
+                fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz);
+                packed_b += _nc * kc;
+            }
+        }
+
+        if (!use_stackbuff) {
+            free(packed_a);
+        }
+    };
+
+    int total = batch * total_tiles;
+    int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
+    double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
+    parallel_for_(Range(0, total), fn, nstripes);
+}
+
+}}} // cv::dnn::cpu_baseline
+
+#undef FAST_GEMM_STORAGE
+#undef FAST_GEMM_MAX_STACKBUF
+#ifdef FAST_GEMM_F32_MC
+#undef FAST_GEMM_F32_MC
+#endif
+#ifdef FAST_GEMM_F32_NC
+#undef FAST_GEMM_F32_NC
+#endif
+#ifdef FAST_GEMM_F32_MR
+#undef FAST_GEMM_F32_MR
+#endif
+#ifdef FAST_GEMM_F32_NR
+#undef FAST_GEMM_F32_NR
+#endif
+#ifdef FAST_GEMM_F32_PACKED_STRIDE_K
+#undef FAST_GEMM_F32_PACKED_STRIDE_K
+#endif
+#undef FAST_GEMM_IMPLEMENT_PACK
+#undef FAST_GEMM_LOAD_TO_BUF_8
+#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_8
+#undef FAST_GEMM_LOAD_TO_BUF_12
+#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_12
+#undef FAST_GEMM_PACK_COPY
+#undef FAST_GEMM_PACK_f32_8
+#undef FAST_GEMM_PACK_f32_12
diff --git a/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp
new file mode 100644
index 000000000000..39bf6800d7ff
--- /dev/null
+++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm_kernels.simd.hpp
@@ -0,0 +1,941 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/runtime/ficus/impl/gemm.impl.h).
+// Here is the original license:
+/*
+    This file is a part of ficus language project.
+    See ficus/LICENSE for the licensing terms
+*/
+
+#include <opencv2/core/hal/intrin.hpp>
+#include <opencv2/core/utility.hpp> // parallel_for_
+
+#define FAST_GEMM_STORAGE (1<<20) // 2^20
+#define FAST_GEMM_MAX_STACKBUF (1 << 14)
+
+#if CV_AVX
+#define FAST_GEMM_F32_MC 60
+#define FAST_GEMM_F32_NC 320
+#elif CV_LASX
+#define FAST_GEMM_F32_MC 48
+#define FAST_GEMM_F32_NC 128
+#else // CV_NEON_AARCH64, SIMD128
+#define FAST_GEMM_F32_MC 144
+#define FAST_GEMM_F32_NC 72
+#endif
+
+#if CV_AVX
+#define FAST_GEMM_F32_MR 12
+#define FAST_GEMM_F32_NR 8
+#elif CV_LASX
+#define FAST_GEMM_F32_MR 12
+#define FAST_GEMM_F32_NR 16
+#else // CV_NEON_AARCH64, CV_SIMD128
+#define FAST_GEMM_F32_MR 8
+#define FAST_GEMM_F32_NR 12
+#endif
+
+#if CV_AVX
+#define FAST_GEMM_F32_PACKED_STRIDE_K 128
+#else // CV_LASX, CV_NEON_AARCH64, CV_SIMD128
+#define FAST_GEMM_F32_PACKED_STRIDE_K 64
+#endif
+
+#define FAST_GEMM_IMPLEMENT_PACK(N, suffix, styp, dtyp) \
+static void fast_gemm_pack##N##suffix( int m, int k, const void* A_, \
+                                      int lda0, int lda1, void* packA_ ) \
+{ \
+    const styp* A = (const styp*)A_; \
+    dtyp* packA = (dtyp*)packA_; \
+    for( int i = 0; i < m; i += N ) { \
+        if (i + N-1 < m) { \
+            const styp* a_ptr = A + lda0*i; \
+            for( int j = 0; j < k*lda1; packA += N, j += lda1 ) \
+            { \
+                FAST_GEMM_LOAD_TO_BUF_##N(styp); \
+                FAST_GEMM_PACK##suffix##_##N(buf, packA); \
+            } \
+        } else { \
+            const styp* a_ptr[N]; \
+            for (int k = 0; k < N; k++) a_ptr[k] = A + lda0*(i+k < m ? i+k : i); \
+            for( int j = 0; j < k*lda1; packA += N, j += lda1 ) \
+            { \
+                FAST_GEMM_LOAD_TO_BUF_BORDERS_##N(styp); \
+                FAST_GEMM_PACK##suffix##_##N(buf, packA); \
+            } \
+        } \
+    } \
+}
+
+#define FAST_GEMM_LOAD_TO_BUF_8(styp) \
+    styp buf[] = { \
+        a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \
+        a_ptr[j+lda0*4], a_ptr[j+lda0*5], a_ptr[j+lda0*6], a_ptr[j+lda0*7] }
+
+#define FAST_GEMM_LOAD_TO_BUF_BORDERS_8(styp) \
+    styp buf[] = { \
+        a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j], \
+        a_ptr[4][j], a_ptr[5][j], a_ptr[6][j], a_ptr[7][j] }
+
+#define FAST_GEMM_LOAD_TO_BUF_12(styp) \
+    styp buf[] = { \
+        a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \
+        a_ptr[j+lda0*4], a_ptr[j+lda0*5], a_ptr[j+lda0*6], a_ptr[j+lda0*7], \
+        a_ptr[j+lda0*8], a_ptr[j+lda0*9], a_ptr[j+lda0*10], a_ptr[j+lda0*11] }
+
+#define FAST_GEMM_LOAD_TO_BUF_BORDERS_12(styp) \
+    styp buf[] = { \
+        a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j], \
+        a_ptr[4][j], a_ptr[5][j], a_ptr[6][j], a_ptr[7][j], \
+        a_ptr[8][j], a_ptr[9][j], a_ptr[10][j], a_ptr[11][j] }
+
+#define FAST_GEMM_LOAD_TO_BUF_16(styp) \
+    styp buf[] = { \
+        a_ptr[j], a_ptr[j+lda0], a_ptr[j+lda0*2], a_ptr[j+lda0*3], \
+        a_ptr[j+lda0*4], a_ptr[j+lda0*5], a_ptr[j+lda0*6], a_ptr[j+lda0*7], \
+        a_ptr[j+lda0*8], a_ptr[j+lda0*9], a_ptr[j+lda0*10], a_ptr[j+lda0*11], \
+        a_ptr[j+lda0*12], a_ptr[j+lda0*13], a_ptr[j+lda0*14], a_ptr[j+lda0*15] }
+
+#define FAST_GEMM_LOAD_TO_BUF_BORDERS_16(styp) \
+    styp buf[] = { \
+        a_ptr[0][j], a_ptr[1][j], a_ptr[2][j], a_ptr[3][j], \
+        a_ptr[4][j], a_ptr[5][j], a_ptr[6][j], a_ptr[7][j], \
+        a_ptr[8][j], a_ptr[9][j], a_ptr[10][j], a_ptr[11][j], \
+        a_ptr[12][j], a_ptr[13][j], a_ptr[14][j], a_ptr[15][j] }
+
+#define FAST_GEMM_PACK_COPY(src, dst, N) \
+    memcpy((dst), (src), N*sizeof(src[0]))
+#define FAST_GEMM_PACK_f32_8(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 8)
+#define FAST_GEMM_PACK_f32_12(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 12)
+#define FAST_GEMM_PACK_f32_16(src, dst) FAST_GEMM_PACK_COPY((src), (dst), 16)
+
+namespace cv { namespace dnn {
+
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+int fastGemmPackBSize(int N, int K);
+
+void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz);
+
+void fastGemmKernel(int M, int N, int K,
+                    float alpha, const char *A, int lda0, int lda1,
+                    const char *B, int ldb0, int ldb1,
+                    float beta, char *C, int ldc, int esz, bool multi_thread);
+void fastGemmKernel(int M, int N, int K,
+                    float alpha, const char *A, int lda0, int lda1,
+                    const char *packed_B, float beta, char *C, int ldc, int esz, bool multi_thread);
+
+void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets,
+                         int M, int N, int K, float alpha, const char *A, int lda0, int lda1,
+                         const char *B, int ldb0, int ldb1, float beta, char *C, int ldc, int esz);
+void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets,
+                         int M, int N, int K, float alpha, const char *A, int lda0, int lda1,
+                         const char *packed_B, float beta, char *C, int ldc, int esz);
+
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+/*
+    Compute kernels that optimized for different platforms
+*/
+#if CV_NEON && CV_NEON_AARCH64 // AARCH64: 32 x 128-bit registers
+
+FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) // a packer
+FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // b packer
+
+static inline void fast_gemm8x12_f32(int k, const char *a_, const char *b_,
+                                     char *c_, int ldc, float alpha) {
+    const float* a = (const float*)a_;
+    const float* b = (const float*)b_;
+    float* c = (float*)c_;
+
+    float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00;
+    float32x4_t s10 = s00, s11 = s00, s12 = s00;
+    float32x4_t s20 = s00, s21 = s00, s22 = s00;
+    float32x4_t s30 = s00, s31 = s00, s32 = s00;
+    float32x4_t s40 = s00, s41 = s00, s42 = s00;
+    float32x4_t s50 = s00, s51 = s00, s52 = s00;
+    float32x4_t s60 = s00, s61 = s00, s62 = s00;
+    float32x4_t s70 = s00, s71 = s00, s72 = s00;
+
+    for(int p = 0; p < k; p++, a += FAST_GEMM_F32_MR, b += FAST_GEMM_F32_NR)
+    {
+        float32x4_t a0 = vld1q_f32(a);
+        float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
+
+        s00 = vfmaq_laneq_f32(s00, b0, a0, 0);
+        s01 = vfmaq_laneq_f32(s01, b1, a0, 0);
+        s02 = vfmaq_laneq_f32(s02, b2, a0, 0);
+        s10 = vfmaq_laneq_f32(s10, b0, a0, 1);
+        s11 = vfmaq_laneq_f32(s11, b1, a0, 1);
+        s12 = vfmaq_laneq_f32(s12, b2, a0, 1);
+
+        s20 = vfmaq_laneq_f32(s20, b0, a0, 2);
+        s21 = vfmaq_laneq_f32(s21, b1, a0, 2);
+        s22 = vfmaq_laneq_f32(s22, b2, a0, 2);
+        s30 = vfmaq_laneq_f32(s30, b0, a0, 3);
+        s31 = vfmaq_laneq_f32(s31, b1, a0, 3);
+        s32 = vfmaq_laneq_f32(s32, b2, a0, 3);
+
+        a0 = vld1q_f32(a + 4);
+
+        s40 = vfmaq_laneq_f32(s40, b0, a0, 0);
+        s41 = vfmaq_laneq_f32(s41, b1, a0, 0);
+        s42 = vfmaq_laneq_f32(s42, b2, a0, 0);
+        s50 = vfmaq_laneq_f32(s50, b0, a0, 1);
+        s51 = vfmaq_laneq_f32(s51, b1, a0, 1);
+        s52 = vfmaq_laneq_f32(s52, b2, a0, 1);
+
+        s60 = vfmaq_laneq_f32(s60, b0, a0, 2);
+        s61 = vfmaq_laneq_f32(s61, b1, a0, 2);
+        s62 = vfmaq_laneq_f32(s62, b2, a0, 2);
+        s70 = vfmaq_laneq_f32(s70, b0, a0, 3);
+        s71 = vfmaq_laneq_f32(s71, b1, a0, 3);
+        s72 = vfmaq_laneq_f32(s72, b2, a0, 3);
+    }
+
+    float32x4_t c0, c1, c2, c3, c4, c5, v_alpha = vdupq_n_f32(alpha);
+#define FAST_GEMM_FINALE(row0, row1)         \
+    c0 = vld1q_f32(c + row0 * ldc);          \
+    c1 = vld1q_f32(c + row0 * ldc + 4);      \
+    c2 = vld1q_f32(c + row0 * ldc + 8);      \
+    c3 = vld1q_f32(c + row1 * ldc);          \
+    c4 = vld1q_f32(c + row1 * ldc + 4);      \
+    c5 = vld1q_f32(c + row1 * ldc + 8);      \
+    c0 = vfmaq_f32(c0, s##row0##0, v_alpha); \
+    c1 = vfmaq_f32(c1, s##row0##1, v_alpha); \
+    c2 = vfmaq_f32(c2, s##row0##2, v_alpha); \
+    c3 = vfmaq_f32(c3, s##row1##0, v_alpha); \
+    c4 = vfmaq_f32(c4, s##row1##1, v_alpha); \
+    c5 = vfmaq_f32(c5, s##row1##2, v_alpha); \
+    vst1q_f32(c + row0 * ldc, c0);           \
+    vst1q_f32(c + row0 * ldc + 4, c1);       \
+    vst1q_f32(c + row0 * ldc + 8, c2);       \
+    vst1q_f32(c + row1 * ldc, c3);           \
+    vst1q_f32(c + row1 * ldc + 4, c4);       \
+    vst1q_f32(c + row1 * ldc + 8, c5);
+
+    FAST_GEMM_FINALE(0, 1);
+    FAST_GEMM_FINALE(2, 3);
+    FAST_GEMM_FINALE(4, 5);
+    FAST_GEMM_FINALE(6, 7);
+#undef FAST_GEMM_FINALE
+}
+
+#elif CV_AVX // AVX and AVX2 (16 x 256-bit registers)
+
+FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) // a packer
+FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // b packer
+
+#if !CV_FMA3 // AVX workaround for FMA
+#undef _mm256_fmadd_ps
+#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
+#endif
+
+static inline void fast_gemm12x8_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) {
+    const float* a = (const float*)a_;
+    const float* b = (const float*)b_;
+    float* c = (float*)c_;
+
+    __m256 s00 = _mm256_setzero_ps(),
+           s10 = _mm256_setzero_ps(),
+           s20 = _mm256_setzero_ps(),
+           s30 = _mm256_setzero_ps(),
+           s40 = _mm256_setzero_ps(),
+           s50 = _mm256_setzero_ps(),
+           s60 = _mm256_setzero_ps(),
+           s70 = _mm256_setzero_ps(),
+           s80 = _mm256_setzero_ps(),
+           s90 = _mm256_setzero_ps(),
+           s100 = _mm256_setzero_ps(),
+           s110 = _mm256_setzero_ps();
+    for (int p = 0; p < k; p++, a += FAST_GEMM_F32_MR, b += FAST_GEMM_F32_NR) {
+        __m256 b0 = _mm256_loadu_ps(b);
+
+        __m256 a0 = _mm256_set1_ps(*a);
+        s00 = _mm256_fmadd_ps(b0, a0, s00);
+        __m256 a1 = _mm256_set1_ps(*(a + 1));
+        s10 = _mm256_fmadd_ps(b0, a1, s10);
+        __m256 a2 = _mm256_set1_ps(*(a + 2));
+        s20 = _mm256_fmadd_ps(b0, a2, s20);
+
+        a0 = _mm256_set1_ps(*(a + 3));
+        s30 = _mm256_fmadd_ps(b0, a0, s30);
+        a1 = _mm256_set1_ps(*(a + 4));
+        s40 = _mm256_fmadd_ps(b0, a1, s40);
+        a2 = _mm256_set1_ps(*(a + 5));
+        s50 = _mm256_fmadd_ps(b0, a2, s50);
+
+        a0 = _mm256_set1_ps(*(a + 6));
+        s60 = _mm256_fmadd_ps(b0, a0, s60);
+        a1 = _mm256_set1_ps(*(a + 7));
+        s70 = _mm256_fmadd_ps(b0, a1, s70);
+        a2 = _mm256_set1_ps(*(a + 8));
+        s80 = _mm256_fmadd_ps(b0, a2, s80);
+
+        a0 = _mm256_set1_ps(*(a + 9));
+        s90 = _mm256_fmadd_ps(b0, a0, s90);
+        a1 = _mm256_set1_ps(*(a + 10));
+        s100 = _mm256_fmadd_ps(b0, a1, s100);
+        a2 = _mm256_set1_ps(*(a + 11));
+        s110 = _mm256_fmadd_ps(b0, a2, s110);
+    }
+
+    __m256 c0, c1, c2, c3, v_alpha = _mm256_set1_ps(alpha);
+#define FAST_GEMM_FINALE(row0, row1, row2, row3)    \
+    c0 = _mm256_loadu_ps(c + row0 * ldc);   \
+    c1 = _mm256_loadu_ps(c + row1 * ldc);   \
+    c2 = _mm256_loadu_ps(c + row2 * ldc);   \
+    c3 = _mm256_loadu_ps(c + row3 * ldc);   \
+    c0 = _mm256_fmadd_ps(s##row0##0, v_alpha, c0);  \
+    c1 = _mm256_fmadd_ps(s##row1##0, v_alpha, c1);  \
+    c2 = _mm256_fmadd_ps(s##row2##0, v_alpha, c2);  \
+    c3 = _mm256_fmadd_ps(s##row3##0, v_alpha, c3);  \
+    _mm256_storeu_ps(c + row0 * ldc, c0);   \
+    _mm256_storeu_ps(c + row1 * ldc, c1);   \
+    _mm256_storeu_ps(c + row2 * ldc, c2);   \
+    _mm256_storeu_ps(c + row3 * ldc, c3);   \
+
+    FAST_GEMM_FINALE(0, 1,  2,  3);
+    FAST_GEMM_FINALE(4, 5,  6,  7);
+    FAST_GEMM_FINALE(8, 9, 10, 11);
+#undef FAST_GEMM_FINALE
+}
+
+#elif CV_LASX // LASX (32 x 256-bit registers)
+
+FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // a packer
+FAST_GEMM_IMPLEMENT_PACK(16, _f32, float, float) // b packer
+
+static inline void fast_gemm12x16_f32(int k, const char *a_, const char *b_, char *c_, int ldc, float alpha) {
+    const float* a = (const float*)a_;
+    const float* b = (const float*)b_;
+    float* c = (float*)c_;
+
+    __m256 s00  = _v256_setall_ps(0), s01  = s00,
+           s10  = s00, s11  = s00,
+           s20  = s00, s21  = s00,
+           s30  = s00, s31  = s00,
+           s40  = s00, s41  = s00,
+           s50  = s00, s51  = s00,
+           s60  = s00, s61  = s00,
+           s70  = s00, s71  = s00,
+           s80  = s00, s81  = s00,
+           s90  = s00, s91  = s00,
+           s100 = s00, s101 = s00,
+           s110 = s00, s111 = s00;
+    for (int p = 0; p < k; p++, a += FAST_GEMM_F32_MR, b += FAST_GEMM_F32_NR) {
+        __m256 b0 = (__m256)__lasx_xvld(b, 0), b1 = (__m256)__lasx_xvld(b + 8, 0);
+
+        __m256 a0 = _v256_setall_ps(*a);
+        s00 = __lasx_xvfmadd_s(b0, a0, s00);
+        s01 = __lasx_xvfmadd_s(b1, a0, s01);
+        __m256 a1 = _v256_setall_ps(*(a + 1));
+        s10 = __lasx_xvfmadd_s(b0, a1, s10);
+        s11 = __lasx_xvfmadd_s(b1, a1, s11);
+        __m256 a2 = _v256_setall_ps(*(a + 2));
+        s20 = __lasx_xvfmadd_s(b0, a2, s20);
+        s21 = __lasx_xvfmadd_s(b1, a2, s21);
+        __m256 a3 = _v256_setall_ps(*(a + 3));
+        s30 = __lasx_xvfmadd_s(b0, a3, s30);
+        s31 = __lasx_xvfmadd_s(b1, a3, s31);
+
+        a0 = _v256_setall_ps(*(a + 4));
+        s40 = __lasx_xvfmadd_s(b0, a0, s40);
+        s41 = __lasx_xvfmadd_s(b1, a0, s41);
+        a1 = _v256_setall_ps(*(a + 5));
+        s50 = __lasx_xvfmadd_s(b0, a1, s50);
+        s51 = __lasx_xvfmadd_s(b1, a1, s51);
+        a2 = _v256_setall_ps(*(a + 6));
+        s60 = __lasx_xvfmadd_s(b0, a2, s60);
+        s61 = __lasx_xvfmadd_s(b1, a2, s61);
+        a3 = _v256_setall_ps(*(a + 7));
+        s70 = __lasx_xvfmadd_s(b0, a3, s70);
+        s71 = __lasx_xvfmadd_s(b1, a3, s71);
+
+        a0 = _v256_setall_ps(*(a + 8));
+        s80 = __lasx_xvfmadd_s(b0, a0, s80);
+        s81 = __lasx_xvfmadd_s(b1, a0, s81);
+        a1 = _v256_setall_ps(*(a + 9));
+        s90 = __lasx_xvfmadd_s(b0, a1, s90);
+        s91 = __lasx_xvfmadd_s(b1, a1, s91);
+        a2 = _v256_setall_ps(*(a + 10));
+        s100 = __lasx_xvfmadd_s(b0, a2, s100);
+        s101 = __lasx_xvfmadd_s(b1, a2, s101);
+        a3 = _v256_setall_ps(*(a + 11));
+        s110 = __lasx_xvfmadd_s(b0, a3, s110);
+        s111 = __lasx_xvfmadd_s(b1, a3, s111);
+    }
+
+    __m256 c0, c1, c2, c3, c4, c5, c6, c7, v_alpha = _v256_setall_ps(alpha);
+#define FAST_GEMM_FINALE(row0, row1, row2, row3)       \
+    c0 = (__m256)__lasx_xvld(c + row0 * ldc, 0);       \
+    c1 = (__m256)__lasx_xvld(c + row0 * ldc, 8 * 4);   \
+    c2 = (__m256)__lasx_xvld(c + row1 * ldc, 0);       \
+    c3 = (__m256)__lasx_xvld(c + row1 * ldc, 8 * 4);   \
+    c4 = (__m256)__lasx_xvld(c + row2 * ldc, 0);       \
+    c5 = (__m256)__lasx_xvld(c + row2 * ldc, 8 * 4);   \
+    c6 = (__m256)__lasx_xvld(c + row3 * ldc, 0);       \
+    c7 = (__m256)__lasx_xvld(c + row3 * ldc, 8 * 4);   \
+    c0 = __lasx_xvfmadd_s(s##row0##0, v_alpha, c0);    \
+    c1 = __lasx_xvfmadd_s(s##row0##1, v_alpha, c1);    \
+    c2 = __lasx_xvfmadd_s(s##row1##0, v_alpha, c2);    \
+    c3 = __lasx_xvfmadd_s(s##row1##1, v_alpha, c3);    \
+    c4 = __lasx_xvfmadd_s(s##row2##0, v_alpha, c4);    \
+    c5 = __lasx_xvfmadd_s(s##row2##1, v_alpha, c5);    \
+    c6 = __lasx_xvfmadd_s(s##row3##0, v_alpha, c6);    \
+    c7 = __lasx_xvfmadd_s(s##row3##1, v_alpha, c7);    \
+    __lasx_xvst(c0, c + row0 * ldc,     0);            \
+    __lasx_xvst(c1, c + row0 * ldc, 8 * 4);            \
+    __lasx_xvst(c2, c + row1 * ldc,     0);            \
+    __lasx_xvst(c3, c + row1 * ldc, 8 * 4);            \
+    __lasx_xvst(c4, c + row2 * ldc,     0);            \
+    __lasx_xvst(c5, c + row2 * ldc, 8 * 4);            \
+    __lasx_xvst(c6, c + row3 * ldc,     0);            \
+    __lasx_xvst(c7, c + row3 * ldc, 8 * 4);
+
+    FAST_GEMM_FINALE(0, 1,  2,  3);
+    FAST_GEMM_FINALE(4, 5,  6,  7);
+    FAST_GEMM_FINALE(8, 9, 10, 11);
+#undef FAST_GEMM_FINALE
+}
+
+#elif CV_SIMD128 // armv7: 16 x 128-bit registers
+
+FAST_GEMM_IMPLEMENT_PACK(8, _f32, float, float) // a packer
+FAST_GEMM_IMPLEMENT_PACK(12, _f32, float, float) // b packer
+
+static inline void fast_gemm8x12_f32(int k, const char *a_, const char *b_,
+                                     char *c_, int ldc, float alpha) {
+    const float* a = (const float*)a_;
+    const float* b = (const float*)b_;
+    float* c = (float*)c_;
+
+    v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00;
+    v_float32x4 s10 = s00, s11 = s00, s12 = s00;
+    v_float32x4 s20 = s00, s21 = s00, s22 = s00;
+    v_float32x4 s30 = s00, s31 = s00, s32 = s00;
+    v_float32x4 s40 = s00, s41 = s00, s42 = s00;
+    v_float32x4 s50 = s00, s51 = s00, s52 = s00;
+    v_float32x4 s60 = s00, s61 = s00, s62 = s00;
+    v_float32x4 s70 = s00, s71 = s00, s72 = s00;
+
+    for(int p = 0; p < k; p++, a += FAST_GEMM_F32_MR, b += FAST_GEMM_F32_NR) {
+        v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8);
+
+        v_float32x4 a0 = v_setall_f32(*a);
+        s00 = v_fma(b0, a0, s00);
+        s01 = v_fma(b1, a0, s01);
+        s02 = v_fma(b2, a0, s02);
+        v_float32x4 a1 = v_setall_f32(*(a + 1));
+        s10 = v_fma(b0, a1, s10);
+        s11 = v_fma(b1, a1, s11);
+        s12 = v_fma(b2, a1, s12);
+
+        v_float32x4 a2 = v_setall_f32(*(a + 2));
+        s20 = v_fma(b0, a2, s20);
+        s21 = v_fma(b1, a2, s21);
+        s22 = v_fma(b2, a2, s22);
+        v_float32x4 a3 = v_setall_f32(*(a + 3));
+        s30 = v_fma(b0, a3, s30);
+        s31 = v_fma(b1, a3, s31);
+        s32 = v_fma(b2, a3, s32);
+
+        a0 = v_setall_f32(*(a + 4));
+        s40 = v_fma(b0, a0, s40);
+        s41 = v_fma(b1, a0, s41);
+        s42 = v_fma(b2, a0, s42);
+        a1 = v_setall_f32(*(a + 5));
+        s50 = v_fma(b0, a1, s50);
+        s51 = v_fma(b1, a1, s51);
+        s52 = v_fma(b2, a1, s52);
+
+        a2 = v_setall_f32(*(a + 6));
+        s60 = v_fma(b0, a2, s60);
+        s61 = v_fma(b1, a2, s61);
+        s62 = v_fma(b2, a2, s62);
+        a3 = v_setall_f32(*(a + 7));
+        s70 = v_fma(b0, a3, s70);
+        s71 = v_fma(b1, a3, s71);
+        s72 = v_fma(b2, a3, s72);
+    }
+
+    v_float32x4 c0, c1, c2, c3, c4, c5, v_alpha = v_setall_f32(alpha);
+#define FAST_GEMM_FINALE(row0, row1)       \
+    c0 = v_load(c + row0 * ldc);         \
+    c1 = v_load(c + row0 * ldc + 4);     \
+    c2 = v_load(c + row0 * ldc + 8);     \
+    c3 = v_load(c + row1 * ldc);         \
+    c4 = v_load(c + row1 * ldc + 4);     \
+    c5 = v_load(c + row1 * ldc + 8);     \
+    c0 = v_fma(s##row0##0, v_alpha, c0); \
+    c1 = v_fma(s##row0##1, v_alpha, c1); \
+    c2 = v_fma(s##row0##2, v_alpha, c2); \
+    c3 = v_fma(s##row1##0, v_alpha, c3); \
+    c4 = v_fma(s##row1##1, v_alpha, c4); \
+    c5 = v_fma(s##row1##2, v_alpha, c5); \
+    v_store(c + row0 * ldc, c0);         \
+    v_store(c + row0 * ldc + 4, c1);     \
+    v_store(c + row0 * ldc + 8, c2);     \
+    v_store(c + row1 * ldc, c3);         \
+    v_store(c + row1 * ldc + 4, c4);     \
+    v_store(c + row1 * ldc + 8, c5);
+
+    FAST_GEMM_FINALE(0, 1);
+    FAST_GEMM_FINALE(2, 3);
+    FAST_GEMM_FINALE(4, 5);
+    FAST_GEMM_FINALE(6, 7);
+#undef FAST_GEMM_FINALE
+}
+
+#endif
+
+static inline void fast_gemm_macro_kernel(int m, int n, int k,
+                                          const char *packed_A, const char *packed_B,
+                                          float alpha, char *c, int ldc0, int esz) {
+    int ldc0_esz = ldc0 * esz;
+
+    double tempC[FAST_GEMM_F32_MR * FAST_GEMM_F32_NR]; // make sure the buffer is big enough
+    for(int i = 0; i < m; i += FAST_GEMM_F32_MR) {
+        for(int j = 0; j < n; j += FAST_GEMM_F32_NR) {
+            char* cptr0 = &c[i * ldc0_esz + j * esz];
+            char* cptr = cptr0;
+            int ldc = ldc0;
+            int mr = m - i < FAST_GEMM_F32_MR ? m - i : FAST_GEMM_F32_MR;
+            int nr = n - j < FAST_GEMM_F32_NR ? n - j : FAST_GEMM_F32_NR;
+            int nr_esz = nr * esz;
+            bool partial = (bool)((mr < FAST_GEMM_F32_MR) | (nr < FAST_GEMM_F32_NR));
+            if (partial) {
+                memset(tempC, 0, sizeof(tempC));
+                cptr = (char *)tempC;
+                ldc = FAST_GEMM_F32_NR;
+                for(int p = 0; p < mr; p++)
+                    memcpy(cptr + p * (ldc * esz), cptr0 + p * ldc0_esz, nr_esz);
+            }
+#if CV_NEON && CV_NEON_AARCH64
+            fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
+#elif CV_AVX
+            fast_gemm12x8_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
+#elif CV_LASX
+            fast_gemm12x16_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
+#elif CV_SIMD128
+            fast_gemm8x12_f32(k, packed_A + i * k * esz, packed_B + j * k * esz, cptr, ldc, alpha);
+#endif
+
+            if (partial) {
+                for(int p = 0; p < mr; p++)
+                    memcpy(cptr0 + p * ldc0_esz, cptr + p * (ldc * esz), nr_esz);
+            }
+        }
+    }
+}
+
+int fastGemmPackBSize(int N, int K) {
+    int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
+    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
+
+    return static_cast<int>((N + NC - 1) / NC) * NC * K;
+}
+
+void fastGemmPackBKernel(const char *B, char *packed_B, int N, int K, int ldb0, int ldb1, int esz) {
+    int GEMM_NC = FAST_GEMM_F32_NC, GEMM_NR = FAST_GEMM_F32_NR;
+    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
+    int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
+
+    int n_tiles = (N + NC - 1) / NC;
+    for (int r = 0; r < n_tiles; ++r) {
+        int j0 = r * NC;
+        int nc = N - j0 < NC ? N - j0 : NC;
+        int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
+        for (int k = 0; k < K; k += KC) {
+            int kc = K - k < KC ? K - k : KC;
+#if CV_NEON && CV_NEON_AARCH64
+            fast_gemm_pack12_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
+#elif CV_AVX
+            fast_gemm_pack8_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
+#elif CV_LASX
+            fast_gemm_pack16_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
+#elif CV_SIMD128
+            fast_gemm_pack12_f32(nc, kc, B + (k * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_B);
+#endif
+            packed_B += _nc * kc;
+        }
+    }
+}
+
+void fastGemmKernel(int M, int N, int K,
+                    float alpha, const char *A, int lda0, int lda1,
+                    const char *B, int ldb0, int ldb1,
+                    float beta, char *C, int ldc, int esz, bool multi_thread) {
+    int GEMM_MC = FAST_GEMM_F32_MC,
+        GEMM_NC = FAST_GEMM_F32_NC,
+        GEMM_MR = FAST_GEMM_F32_MR,
+        GEMM_NR = FAST_GEMM_F32_NR;
+
+    int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
+    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
+    int KC = FAST_GEMM_STORAGE / ((MC + NC) * esz);
+    KC = KC > 8 ? KC : 8;
+    KC = KC < K ? KC : K;
+
+    size_t buff_size = KC * (MC + NC) * esz;
+    bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
+    int m_tiles = (M + MC - 1) / MC;
+    int n_tiles = (N + NC - 1) / NC;
+    int total_tiles = m_tiles * n_tiles;
+
+    auto fn = [&](const Range &r) {
+        char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size));
+        char* packed_b = packed_a + KC * MC * esz;
+        int start = r.start;
+        int end = r.end;
+
+        for (int tile_idx = start; tile_idx < end; tile_idx++) {
+            int i0 = (tile_idx / n_tiles) * MC;
+            int j0 = (tile_idx % n_tiles) * NC;
+            int mc = M - i0 < MC ? M - i0 : MC;
+            int nc = N - j0 < NC ? N - j0 : NC;
+            int ldc_block = ldc;
+            char* c_block = C + (i0 * ldc + j0) * esz;
+
+            if (beta == 0.f) {
+                for(int i = 0; i < mc; i++)
+                    memset(c_block + i * ldc_block * esz, 0, nc * esz);
+            } else if (beta != 1.f) {
+                for(int i = 0; i < mc; i++) {
+                    float* c_i = (float*)c_block + i * ldc_block;
+                    for(int j = 0; j < nc; j++)
+                        c_i[j] *= beta;
+                }
+            }
+
+            for(int k0 = 0; k0 < K; k0 += KC)
+            {
+                int kc = K - k0 < KC ? K - k0 : KC;
+                // pack a
+#if CV_NEON && CV_NEON_AARCH64
+                fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#elif CV_AVX
+                fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#elif CV_LASX
+                fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#elif CV_SIMD128
+                fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#endif
+
+                // pack b
+#if CV_NEON && CV_NEON_AARCH64
+                fast_gemm_pack12_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
+#elif CV_AVX
+                fast_gemm_pack8_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
+#elif CV_LASX
+                fast_gemm_pack16_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
+#elif CV_SIMD128
+                fast_gemm_pack12_f32(nc, kc, B + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
+#endif
+
+                // run kernel
+                fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz);
+            }
+        }
+
+        if (!use_stackbuff) {
+            free(packed_a);
+        }
+    };
+
+    if (multi_thread) {
+        int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
+        double nstripes = (size_t)total_tiles * cost_per_thread * (1 / 1024.0);
+        parallel_for_(Range(0, total_tiles), fn, nstripes);
+    } else {
+        fn(Range(0, total_tiles));
+    }
+
+}
+
+void fastGemmKernel(int M, int N, int K,
+                    float alpha, const char *A, int lda0, int lda1,
+                    const char *packed_B, float beta, char *C, int ldc, int esz, bool multi_thread) {
+    int GEMM_MC = FAST_GEMM_F32_MC,
+        GEMM_NC = FAST_GEMM_F32_NC,
+        GEMM_MR = FAST_GEMM_F32_MR,
+        GEMM_NR = FAST_GEMM_F32_NR;
+
+    int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
+    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
+    int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
+
+    size_t buff_size = KC * MC * esz;
+    bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
+    int m_tiles = (M + MC - 1) / MC;
+    int n_tiles = (N + NC - 1) / NC;
+    int total_tiles = m_tiles * n_tiles;
+
+    auto fn = [&](const Range &r) {
+        char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size)); // TODO: use AutoBuffer
+        const char *packed_b_ = packed_B;
+        int start = r.start;
+        int end = r.end;
+
+        for (int tile_idx = start; tile_idx < end; tile_idx++) {
+            int i0 = (tile_idx / n_tiles) * MC;
+            int j0 = (tile_idx % n_tiles) * NC;
+            int mc = M - i0 < MC ? M - i0 : MC;
+            int nc = N - j0 < NC ? N - j0 : NC;
+            int ldc_block = ldc;
+            char* c_block = C + (i0 * ldc + j0) * esz;
+            packed_b_ = packed_B + j0 * K * esz;
+
+            if (beta == 0.f) {
+                for(int i = 0; i < mc; i++)
+                    memset(c_block + i * ldc_block * esz, 0, nc * esz);
+            } else if (beta != 1.f) {
+                for(int i = 0; i < mc; i++) {
+                    float* c_i = (float*)c_block + i * ldc_block;
+                    for(int j = 0; j < nc; j++)
+                        c_i[j] *= beta;
+                }
+            }
+
+            int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
+            for(int k0 = 0; k0 < K; k0 += KC)
+            {
+                int kc = K - k0 < KC ? K - k0 : KC;
+                // pack a
+#if CV_NEON && CV_NEON_AARCH64
+                fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#elif CV_AVX
+                fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#elif CV_LASX
+                fast_gemm_pack12_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#elif CV_SIMD128
+                fast_gemm_pack8_f32(mc, kc, A + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#endif
+
+                // run kernel
+                fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b_, alpha, c_block, ldc_block, esz);
+                packed_b_ += _nc * kc;
+            }
+        }
+
+        if (!use_stackbuff) {
+            free(packed_a);
+        }
+    };
+
+    if (multi_thread) {
+        int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
+        double nstripes = (size_t)total_tiles * cost_per_thread * (1 / 1024.0);
+        parallel_for_(Range(0, total_tiles), fn, nstripes);
+    } else {
+        fn(Range(0, total_tiles));
+    }
+}
+
+void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets,
+                         int M, int N, int K, float alpha, const char *A, int lda0, int lda1,
+                         const char *B, int ldb0, int ldb1, float beta, char *C, int ldc, int esz) {
+    int GEMM_MC = FAST_GEMM_F32_MC,
+        GEMM_NC = FAST_GEMM_F32_NC,
+        GEMM_MR = FAST_GEMM_F32_MR,
+        GEMM_NR = FAST_GEMM_F32_NR;
+
+    int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
+    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
+    int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
+
+    size_t buff_size = KC * (MC + NC) * esz;
+    bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
+    int m_tiles = (M + MC - 1) / MC;
+    int n_tiles = (N + NC - 1) / NC;
+    int total_tiles = m_tiles * n_tiles;
+
+    auto fn = [&](const Range &r) {
+        char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size));
+        char* packed_b = packed_a + KC * MC * esz;
+        int start = r.start;
+        int end = r.end;
+
+        for (int tile_idx = start; tile_idx < end; tile_idx++) {
+            const int batch_index = static_cast<int>(tile_idx / total_tiles);
+            const int m_tiles_index = static_cast<int>((tile_idx - batch_index * total_tiles) / n_tiles);
+            const int n_tiles_index = static_cast<int>(tile_idx % n_tiles);
+
+            int i0 = m_tiles_index * MC;
+            int j0 = n_tiles_index * NC;
+            int mc = M - i0 < MC ? M - i0 : MC;
+            int nc = N - j0 < NC ? N - j0 : NC;
+            int ldc_block = ldc;
+            const char *a_block = A + A_offsets[batch_index] * esz;
+            const char *b_block = B + B_offsets[batch_index] * esz;
+            char* c_block = C + C_offsets[batch_index] * esz + (i0 * ldc + j0) * esz;
+
+            if (beta == 0.f) {
+                for(int i = 0; i < mc; i++)
+                    memset(c_block + i * ldc_block * esz, 0, nc * esz);
+            } else if (beta != 1.f) {
+                for(int i = 0; i < mc; i++) {
+                    float* c_i = (float*)c_block + i * ldc_block;
+                    for(int j = 0; j < nc; j++)
+                        c_i[j] *= beta;
+                }
+            }
+
+            for(int k0 = 0; k0 < K; k0 += KC)
+            {
+                int kc = K - k0 < KC ? K - k0 : KC;
+                // pack a
+#if CV_NEON && CV_NEON_AARCH64
+                fast_gemm_pack8_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#elif CV_AVX
+                fast_gemm_pack12_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#elif CV_LASX
+                fast_gemm_pack12_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#elif CV_SIMD128
+                fast_gemm_pack8_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#endif
+
+                // pack b
+#if CV_NEON && CV_NEON_AARCH64
+                fast_gemm_pack12_f32(nc, kc, b_block + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
+#elif CV_AVX
+                fast_gemm_pack8_f32(nc, kc, b_block + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
+#elif CV_LASX
+                fast_gemm_pack16_f32(nc, kc, b_block + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
+#elif CV_SIMD128
+                fast_gemm_pack12_f32(nc, kc, b_block + (k0 * ldb0 + j0 * ldb1) * esz, ldb1, ldb0, packed_b);
+#endif
+
+                // run kernel
+                fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz);
+            }
+        }
+
+        if (!use_stackbuff) {
+            free(packed_a);
+        }
+    };
+
+    int total = batch * total_tiles;
+    int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
+    double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
+    parallel_for_(Range(0, total), fn, nstripes);
+}
+
+void fastGemmBatchKernel(size_t batch, const size_t *A_offsets, const size_t *B_offsets, const size_t *C_offsets,
+                         int M, int N, int K, float alpha, const char *A, int lda0, int lda1,
+                         const char *packed_B, float beta, char *C, int ldc, int esz) {
+    int GEMM_MC = FAST_GEMM_F32_MC,
+        GEMM_NC = FAST_GEMM_F32_NC,
+        GEMM_MR = FAST_GEMM_F32_MR,
+        GEMM_NR = FAST_GEMM_F32_NR;
+
+    int MC = (((GEMM_MC < M ? GEMM_MC : M) + GEMM_MR - 1) / GEMM_MR) * GEMM_MR;
+    int NC = (((GEMM_NC < N ? GEMM_NC : N) + GEMM_NR - 1) / GEMM_NR) * GEMM_NR;
+    int KC = std::min(FAST_GEMM_F32_PACKED_STRIDE_K, K);
+
+    size_t buff_size = KC * MC * esz;
+    bool use_stackbuff = buff_size <= FAST_GEMM_MAX_STACKBUF;
+    int m_tiles = (M + MC - 1) / MC;
+    int n_tiles = (N + NC - 1) / NC;
+    int total_tiles = m_tiles * n_tiles;
+
+    auto fn = [&](const Range &r) {
+        char* packed_a = (char*)(use_stackbuff ? alloca(buff_size) : malloc(buff_size));
+        const char *packed_b = packed_B;
+        int start = r.start;
+        int end = r.end;
+
+        for (int tile_idx = start; tile_idx < end; tile_idx++) {
+            const int batch_index = static_cast<int>(tile_idx / total_tiles);
+            const int m_tiles_index = static_cast<int>((tile_idx - batch_index * total_tiles) / n_tiles);
+            const int n_tiles_index = static_cast<int>(tile_idx % n_tiles);
+
+            int i0 = m_tiles_index * MC;
+            int j0 = n_tiles_index * NC;
+            int mc = M - i0 < MC ? M - i0 : MC;
+            int nc = N - j0 < NC ? N - j0 : NC;
+            int ldc_block = ldc;
+            const char *a_block = A + A_offsets[batch_index] * esz;
+            packed_b = packed_B + B_offsets[batch_index] * esz + j0 * K * esz;
+            char* c_block = C + C_offsets[batch_index] * esz + (i0 * ldc + j0) * esz;
+
+            if (beta == 0.f) {
+                for(int i = 0; i < mc; i++)
+                    memset(c_block + i * ldc_block * esz, 0, nc * esz);
+            } else if (beta != 1.f) {
+                for(int i = 0; i < mc; i++) {
+                    float* c_i = (float*)c_block + i * ldc_block;
+                    for(int j = 0; j < nc; j++)
+                        c_i[j] *= beta;
+                }
+            }
+
+            int _nc = static_cast<int>((nc + GEMM_NR - 1) / GEMM_NR) * GEMM_NR * esz;
+            for(int k0 = 0; k0 < K; k0 += KC)
+            {
+                int kc = K - k0 < KC ? K - k0 : KC;
+                // pack a
+#if CV_NEON && CV_NEON_AARCH64
+                fast_gemm_pack8_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#elif CV_AVX
+                fast_gemm_pack12_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#elif CV_LASX
+                fast_gemm_pack12_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#elif CV_SIMD128
+                fast_gemm_pack8_f32(mc, kc, a_block + (i0 * lda0 + k0 * lda1) * esz, lda0, lda1, packed_a);
+#endif
+
+                // run kernel
+                fast_gemm_macro_kernel(mc, nc, kc, packed_a, packed_b, alpha, c_block, ldc_block, esz);
+                packed_b += _nc * kc;
+            }
+        }
+
+        if (!use_stackbuff) {
+            free(packed_a);
+        }
+    };
+
+    int total = batch * total_tiles;
+    int cost_per_thread = static_cast<int>((K / KC) * (MC / GEMM_MR) * (NC / GEMM_NR));
+    double nstripes = (size_t)total * cost_per_thread * (1 / 1024.0);
+    parallel_for_(Range(0, total), fn, nstripes);
+}
+
+#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+
+}} // cv::dnn
+
+#undef FAST_GEMM_STORAGE
+#undef FAST_GEMM_MAX_STACKBUF
+#ifdef FAST_GEMM_F32_MC
+#undef FAST_GEMM_F32_MC
+#endif
+#ifdef FAST_GEMM_F32_NC
+#undef FAST_GEMM_F32_NC
+#endif
+#ifdef FAST_GEMM_F32_MR
+#undef FAST_GEMM_F32_MR
+#endif
+#ifdef FAST_GEMM_F32_NR
+#undef FAST_GEMM_F32_NR
+#endif
+#ifdef FAST_GEMM_F32_PACKED_STRIDE_K
+#undef FAST_GEMM_F32_PACKED_STRIDE_K
+#endif
+#undef FAST_GEMM_IMPLEMENT_PACK
+#undef FAST_GEMM_LOAD_TO_BUF_8
+#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_8
+#undef FAST_GEMM_LOAD_TO_BUF_12
+#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_12
+#undef FAST_GEMM_LOAD_TO_BUF_16
+#undef FAST_GEMM_LOAD_TO_BUF_BORDERS_16
+#undef FAST_GEMM_PACK_COPY
+#undef FAST_GEMM_PACK_f32_8
+#undef FAST_GEMM_PACK_f32_12
+#undef FAST_GEMM_PACK_f32_16
diff --git a/modules/dnn/src/layers/cpu_kernels/fast_norm.cpp b/modules/dnn/src/layers/cpu_kernels/fast_norm.cpp
new file mode 100644
index 000000000000..35f354ed2969
--- /dev/null
+++ b/modules/dnn/src/layers/cpu_kernels/fast_norm.cpp
@@ -0,0 +1,208 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../../precomp.hpp"
+#include "fast_norm.hpp"
+
+namespace cv { namespace dnn {
+
+void fastNorm(const Mat &input, Mat &output, float epsilon, size_t normalized_axis, bool normalize_variance) {
+    const auto input_shape = shape(input);
+    CV_CheckLT(normalized_axis, input_shape.size(), "fastNorm: axis out of range");
+
+    size_t loops = static_cast<size_t>(total(input_shape, 0, static_cast<int>(normalized_axis))),
+           norm_size = static_cast<size_t>(total(input_shape, static_cast<int>(normalized_axis)));
+    float inv_norm_size = 1.0 / norm_size;
+
+    auto fn = [&](const Range &r) {
+        const auto *input_data = input.ptr<const float>();
+        auto *output_data = output.ptr<float>();
+        for (int i = r.start; i < r.end; i++) {
+            const auto *x = input_data + norm_size * i;
+            auto *y = output_data + norm_size * i;
+
+            float mean = 0.f, mean_square = 0.f;
+            for (int j = 0; j < norm_size; j++) {
+                float v = x[j];
+                mean += v;
+                mean_square += v * v;
+            }
+
+            mean *= inv_norm_size;
+            mean_square = std::sqrt(std::max(0.f, mean_square * inv_norm_size - mean * mean) + epsilon);
+            float inv_stdev = normalize_variance ? 1.f / mean_square : 1.f;
+
+            for (size_t j = 0; j < norm_size; j++) {
+                y[j] = (x[j] - mean) * inv_stdev;
+            }
+        }
+    };
+    double nstripes = loops * norm_size * (1 / 1024.0);
+    parallel_for_(Range(0, loops), fn, nstripes);
+}
+
+void fastNorm(const Mat &input, const Mat &scale, Mat &output, float epsilon, size_t normalized_axis) {
+    const auto input_shape = shape(input);
+    CV_CheckLT(normalized_axis, input_shape.size(), "fastNorm: axis out of range");
+
+    size_t loops = static_cast<size_t>(total(input_shape, 0, static_cast<int>(normalized_axis))),
+           norm_size = static_cast<size_t>(total(input_shape, static_cast<int>(normalized_axis)));
+    float inv_norm_size = 1.0 / norm_size;
+
+    auto fn = [&](const Range &r) {
+        const auto *input_data = input.ptr<const float>();
+        const auto *scale_data = scale.ptr<const float>();
+        auto *output_data = output.ptr<float>();
+        for (int i = r.start; i < r.end; i++) {
+            const auto *x = input_data + norm_size * i;
+            auto *y = output_data + norm_size * i;
+
+            float mean = 0.f, mean_square = 0.f;
+            for (int j = 0; j < norm_size; j++) {
+                float v = x[j];
+                mean += v;
+                mean_square += v * v;
+            }
+
+            mean *= inv_norm_size;
+            mean_square = std::sqrt(std::max(0.f, mean_square * inv_norm_size - mean * mean) + epsilon);
+            float inv_stdev = 1.f / mean_square;
+
+            for (size_t j = 0; j < norm_size; j++) {
+                y[j] = scale_data[j] * (x[j] - mean) * inv_stdev;
+            }
+        }
+    };
+    double nstripes = loops * norm_size * (1 / 1024.0);
+    parallel_for_(Range(0, loops), fn, nstripes);
+}
+
+void fastNorm(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon, size_t normalized_axis) {
+    const auto input_shape = shape(input);
+    CV_CheckLT(normalized_axis, input_shape.size(), "fastNorm: axis out of range");
+    CV_CheckEQ(scale.total(), bias.total(), "fastNorm: scale and bias should have the same shape");
+
+    size_t loops = static_cast<size_t>(total(input_shape, 0, static_cast<int>(normalized_axis))),
+           norm_size = static_cast<size_t>(total(input_shape, static_cast<int>(normalized_axis)));
+    float inv_norm_size = 1.0 / norm_size;
+
+    auto fn = [&](const Range &r) {
+        const auto *input_data = input.ptr<const float>();
+        const auto *scale_data = scale.ptr<const float>();
+        const auto *bias_data = bias.ptr<const float>();
+        auto *output_data = output.ptr<float>();
+        for (int i = r.start; i < r.end; i++) {
+            const auto *x = input_data + norm_size * i;
+            auto *y = output_data + norm_size * i;
+
+            float mean = 0.f, mean_square = 0.f;
+            for (int j = 0; j < norm_size; j++) {
+                float v = x[j];
+                mean += v;
+                mean_square += v * v;
+            }
+
+            mean *= inv_norm_size;
+            mean_square = std::sqrt(std::max(0.f, mean_square * inv_norm_size - mean * mean) + epsilon);
+            float inv_stdev = 1.f / mean_square;
+
+            for (size_t j = 0; j < norm_size; j++) {
+                y[j] = scale_data[j] * (x[j] - mean) * inv_stdev + bias_data[j];
+            }
+        }
+    };
+    double nstripes = loops * norm_size * (1 / 1024.0);
+    parallel_for_(Range(0, loops), fn, nstripes);
+}
+
+void fastNormChannel(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon) {
+    const auto input_shape = shape(input);
+    size_t N = input_shape[0], C = input_shape[1];
+    CV_CheckEQ(scale.total(), bias.total(), "fastNormChannel: scale and bias should have the same shape");
+    CV_CheckEQ(scale.total(), C, "fastNormChannel: scale should be a 1d tensor and match the channel of input");
+    CV_CheckGE(input.dims, 3, "fastNormChannel: input dimension >= 3");
+
+    size_t loops = N * C,
+           norm_size = static_cast<size_t>(total(input_shape, 2));
+    float inv_norm_size = 1.0 / norm_size;
+
+    auto fn = [&](const Range &r) {
+        const auto *input_data = input.ptr<const float>();
+        const auto *scale_data = scale.ptr<const float>();
+        const auto *bias_data = bias.ptr<const float>();
+        auto *output_data = output.ptr<float>();
+        for (int i = r.start; i < r.end; i++) {
+            const auto *x = input_data + norm_size * i;
+            auto *y = output_data + norm_size * i;
+
+            float mean = 0.f, mean_square = 0.f;
+            for (int j = 0; j < norm_size; j++) {
+                float v = x[j];
+                mean += v;
+                mean_square += v * v;
+            }
+
+            mean *= inv_norm_size;
+            mean_square = std::sqrt(std::max(0.f, mean_square * inv_norm_size - mean * mean) + epsilon);
+            float inv_stdev = 1.f / mean_square;
+
+            size_t c = i % C;
+            float s = scale_data[c] * inv_stdev, b = bias_data[c];
+            for (size_t j = 0; j < norm_size; j++) {
+                y[j] = s * (x[j] - mean) + b;
+            }
+        }
+    };
+    double nstripes = loops * norm_size * (1 / 1024.0);
+    parallel_for_(Range(0, loops), fn, nstripes);
+}
+
+void fastNormGroup(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon, size_t num_groups) {
+    const auto input_shape = shape(input);
+    size_t N = input_shape[0], C = input_shape[1];
+    CV_CheckEQ(scale.total(), bias.total(), "fastNormGroup: scale and bias should have the same shape");
+    CV_CheckEQ(scale.total(), C, "fastNormGroup: scale should be a 1d tensor and match the channel of input");
+    CV_CheckGE(input.dims, 3, "fastNormGroup: input dimension >= 3");
+
+    size_t channels_per_group = C / num_groups;
+    size_t loops = N * num_groups;
+    size_t norm_size = static_cast<size_t>(total(input_shape, 2) * channels_per_group);
+    size_t step = norm_size / channels_per_group;
+    float inv_norm_size = 1.0 / norm_size;
+
+    auto fn = [&](const Range &r) {
+        const auto *input_data = input.ptr<const float>();
+        const auto *scale_data = scale.ptr<const float>();
+        const auto *bias_data = bias.ptr<const float>();
+        auto *output_data = output.ptr<float>();
+
+        for (int i = r.start; i < r.end; i++) {
+            const auto *x = input_data + norm_size * i;
+            auto *y = output_data + norm_size * i;
+
+            float mean = 0.f, mean_square = 0.f;
+            for (int j = 0; j < norm_size; j++) {
+                float v = x[j];
+                mean += v;
+                mean_square += v * v;
+            }
+
+            mean *= inv_norm_size;
+            mean_square = std::sqrt(std::max(0.f, mean_square * inv_norm_size - mean * mean) + epsilon);
+            float inv_stdev = 1.f / mean_square;
+
+            size_t group_idx = i % num_groups * channels_per_group;
+            for (size_t j = 0; j < norm_size; j++) {
+                size_t c = group_idx + (j / step);
+                float s = scale_data[c] * inv_stdev, b = bias_data[c];
+                y[j] = s * (x[j] - mean) + b;
+            }
+        }
+    };
+
+    double nstripes = loops * norm_size * (1 / 1024.0);
+    parallel_for_(Range(0, loops), fn, nstripes);
+}
+
+}} // cv::dnn
diff --git a/modules/dnn/src/layers/cpu_kernels/fast_norm.hpp b/modules/dnn/src/layers/cpu_kernels/fast_norm.hpp
new file mode 100644
index 000000000000..72cbdad0a7ca
--- /dev/null
+++ b/modules/dnn/src/layers/cpu_kernels/fast_norm.hpp
@@ -0,0 +1,29 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_FAST_NORM_HPP
+#define OPENCV_DNN_FAST_NORM_HPP
+
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace cv { namespace dnn {
+
+// Normalization speedup by multi-threading, mainly for Caffe MVN layer which has normalize_variance parameter.
+void fastNorm(const Mat &input, Mat &output, float epsilon, size_t normalized_axis = 0, bool normalize_variance = true);
+
+// Normalization speedup by multi-threading with absent bias. Mainly for LayerNormalization.
+void fastNorm(const Mat &input, const Mat &scale, Mat &output, float epsilon, size_t normalized_axis = 0);
+
+// Normalization speedup by multi-threading with scale and bias. Mainly for LayerNormalization.
+void fastNorm(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon, size_t normalized_axis = 0);
+
+// Channel-wise Normalization speedup by multi-threading. Scale and bias should have the same shape (C). Input should have dimension >= 3.
+void fastNormChannel(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon);
+
+// Group-wise Normalization speedup by multi-threading. Scale and bias should have the same shape (C). Input should have dimension >= 3.
+void fastNormGroup(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon, size_t num_groups);
+
+}} // cv::dnn
+
+#endif // OPENCV_DNN_FAST_NORM_HPP
diff --git a/modules/dnn/src/layers/cpu_kernels/softmax.cpp b/modules/dnn/src/layers/cpu_kernels/softmax.cpp
new file mode 100644
index 000000000000..eb258ecfa210
--- /dev/null
+++ b/modules/dnn/src/layers/cpu_kernels/softmax.cpp
@@ -0,0 +1,157 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpNN.fx).
+// Here is the original license:
+/*
+    This file is a part of ficus language project.
+    See ficus/LICENSE for the licensing terms
+*/
+
+#include "../../precomp.hpp"
+#include "softmax.hpp"
+
+namespace cv { namespace dnn {
+
+void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep){
+    CV_Assert(src.type() == CV_32F);
+    CV_Assert(src.isContinuous() && dst.isContinuous());
+    CV_Assert(src.size == dst.size);
+    axis = normalize_axis(axis, src.dims);
+
+    size_t outerSize = src.total(0, axis),
+           innerSize = src.total(axis + 1);
+
+    const float *srcPtr = src.ptr<float>();
+    float *dstPtr = dst.ptr<float>();
+
+    size_t outerStep = src.total(axis);
+    size_t cnStep = src.total(axis + 1);
+
+    // multi-threads
+    size_t totalTasks = outerSize * innerSize;
+    double nstripes = (double) totalTasks / 1024.0;
+    // make the channel axis to be multiple of 8
+    size_t channelAxis = (axisStep + 7) & -8;
+
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int nlanes = VTraits<v_float32>::vlanes();
+    // the number of redundant dimension
+    size_t redundantDim = nlanes - axisStep % nlanes;
+#endif
+
+    parallel_for_(Range(0, (int) totalTasks), [&](const Range &range) {
+        AutoBuffer<float> axisBuf_(channelAxis);
+        float *axisBuf = axisBuf_.data();
+
+        for (size_t i = range.start; i < range.end; i++) {
+            size_t outerDim = i / innerSize;
+            size_t innerDim = i % innerSize;
+            size_t srcOffset = outerDim * outerStep + innerDim;
+            // copy data from src to buf along axis, since the data may not be continuous
+            for (size_t cnDim = 0; cnDim < axisStep; cnDim++)
+                axisBuf[cnDim] = srcPtr[srcOffset + (cnDim + axisBias) * cnStep];
+
+            float s = 0.f;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            // make the value of the redundant dimension to be -FLT_MAX
+            if (redundantDim != nlanes) {
+                for (size_t j = axisStep; j < axisStep + redundantDim; j++)
+                    axisBuf[j] = -FLT_MAX;
+            }
+            // calculate the max value along the axis
+            v_float32 vmax = vx_load(axisBuf);
+            for (size_t cnDim = nlanes; cnDim < axisStep; cnDim += nlanes) {
+                v_float32 val = vx_load(axisBuf + cnDim);
+                vmax = v_max(vmax, val);
+            }
+            float maxVal = v_reduce_max(vmax);
+
+            // calculate the exp value along the axis
+            v_float32 vs = vx_setzero_f32();
+            vmax = vx_setall_f32(maxVal);
+            // initialize vexp constant
+            v_float32 _vexp_lo = vx_setall_f32(-88.3762626647949f);
+            v_float32 _vexp_hi = vx_setall_f32(88.3762626647949f);
+            v_float32 _vexp_half = vx_setall_f32(0.5f);
+            v_float32 _vexp_one = vx_setall_f32(1.f);
+            v_float32 _vexp_LOG2EF = vx_setall_f32(1.44269504088896341f);
+            v_float32 _vexp_C1 = vx_setall_f32(-0.693359375f);
+            v_float32 _vexp_C2 = vx_setall_f32(2.12194440e-4f);
+            v_float32 _vexp_p0 = vx_setall_f32(1.9875691500E-4f);
+            v_float32 _vexp_p1 = vx_setall_f32(1.3981999507E-3f);
+            v_float32 _vexp_p2 = vx_setall_f32(8.3334519073E-3f);
+            v_float32 _vexp_p3 = vx_setall_f32(4.1665795894E-2f);
+            v_float32 _vexp_p4 = vx_setall_f32(1.6666665459E-1f);
+            v_float32 _vexp_p5 = vx_setall_f32(5.0000001201E-1f);
+            // initialize temp vectors for vexp
+            v_float32 val, _vexp_, _vexp_x, _vexp_y, _vexp_z;
+            v_int32 _vexp_mm;
+
+            // calculate and sum all data along axis
+            for (size_t cnDim = 0; cnDim < axisStep; cnDim += nlanes) {
+                val = vx_load(axisBuf + cnDim);
+                val = v_sub(val, vmax);
+
+                // compute vexp of val
+                _vexp_x = v_min(val, _vexp_hi);
+                _vexp_x = v_max(_vexp_x, _vexp_lo);
+                _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF, _vexp_half);
+                _vexp_mm = v_floor(_vexp_);
+                _vexp_ = v_cvt_f32(_vexp_mm);
+                _vexp_mm = v_add(_vexp_mm, vx_setall_s32(0x7f));
+                _vexp_mm = v_shl(_vexp_mm, 23);
+                _vexp_x = v_fma(_vexp_, _vexp_C1, _vexp_x);
+                _vexp_x = v_fma(_vexp_, _vexp_C2, _vexp_x);
+                _vexp_z = v_mul(_vexp_x, _vexp_x);
+                _vexp_y = v_fma(_vexp_x, _vexp_p0, _vexp_p1);
+                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2);
+                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3);
+                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4);
+                _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5);
+                _vexp_y = v_fma(_vexp_y, _vexp_z, _vexp_x);
+                _vexp_y = v_add(_vexp_y, _vexp_one);
+                val = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
+
+                vs = v_add(vs, val);
+                v_store(axisBuf + cnDim, val);
+            }
+
+            s = v_reduce_sum(vs);
+            // subtract the value of the redundant dimension
+            if (redundantDim != nlanes) {
+                float _val[VTraits<v_float32>::max_nlanes];
+                v_store(_val, val);
+                for (size_t j = nlanes - redundantDim; j < nlanes; j++)
+                    s -= _val[j];
+            }
+#else
+            float maxVal = axisBuf[0];
+            for (size_t cnDim = 1; cnDim < axisStep; cnDim++) {
+                maxVal = std::max(maxVal, axisBuf[cnDim]);
+            }
+            for (size_t j = 0; j < axisStep; j++) {
+                axisBuf[j] = expf(axisBuf[j] - maxVal);
+                s += axisBuf[j];
+            }
+#endif
+            s = 1.f / s;
+
+            // copy back the result to src
+            for (size_t cnDim = 0; cnDim < axisStep; cnDim++)
+                dstPtr[srcOffset + (cnDim + axisBias) * cnStep] = axisBuf[cnDim] * s;
+        }
+    }, nstripes);
+}
+
+void softmax(Mat &dst, const Mat &src, int axis) {
+    softmax(dst, src, axis, 0, src.size[axis]);
+}
+
+void logSoftmax(Mat &dst, const Mat &src, int axis) {
+    softmax(dst, src, axis);
+    log(dst, dst);
+}
+
+}} // cv::dnn
diff --git a/modules/dnn/src/layers/cpu_kernels/softmax.hpp b/modules/dnn/src/layers/cpu_kernels/softmax.hpp
new file mode 100644
index 000000000000..19a89fa8786a
--- /dev/null
+++ b/modules/dnn/src/layers/cpu_kernels/softmax.hpp
@@ -0,0 +1,28 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpNN.fx).
+// Here is the original license:
+/*
+    This file is a part of ficus language project.
+    See ficus/LICENSE for the licensing terms
+*/
+
+#ifndef OPENCV_DNN_SOFTMAX_HPP
+#define OPENCV_DNN_SOFTMAX_HPP
+
+#include "opencv2/core/hal/intrin.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace cv { namespace dnn {
+
+void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep);
+
+void softmax(Mat &dst, const Mat &src, int axis);
+
+void logSoftmax(Mat &dst, const Mat &src, int axis);
+
+}} // cv::dnn
+
+#endif // OPENCV_DNN_SOFTMAX_HPP
diff --git a/modules/dnn/src/layers/crop_and_resize_layer.cpp b/modules/dnn/src/layers/crop_and_resize_layer.cpp
index eb8822870fa0..6c29d6b8f493 100644
--- a/modules/dnn/src/layers/crop_and_resize_layer.cpp
+++ b/modules/dnn/src/layers/crop_and_resize_layer.cpp
@@ -55,7 +55,7 @@ class CropAndResizeLayerImpl CV_FINAL : public CropAndResizeLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -133,28 +133,28 @@ class CropAndResizeLayerImpl CV_FINAL : public CropAndResizeLayer
         auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
         auto rois = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
 
-        auto rois_shape = rois->get_shape();
+        auto rois_shape = rois.get_shape();
         std::vector<int64_t> dims(rois_shape.begin(), rois_shape.end()), offsets(4, 0);
         offsets[3] = 2;
         dims[3] = 7;
 
-        auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                                             ngraph::Shape{offsets.size()}, offsets.data());
-        auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                                             ngraph::Shape{dims.size()}, dims.data());
-        auto strides = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                                        ngraph::Shape{dims.size()}, std::vector<int64_t>((int64_t)dims.size(), 1));
-        auto slice = std::make_shared<ngraph::op::v1::StridedSlice>(rois,
+        auto lower_bounds = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                                             ov::Shape{offsets.size()}, offsets.data());
+        auto upper_bounds = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                                             ov::Shape{dims.size()}, dims.data());
+        auto strides = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                                        ov::Shape{dims.size()}, std::vector<int64_t>((int64_t)dims.size(), 1));
+        auto slice = std::make_shared<ov::op::v1::StridedSlice>(rois,
                                       lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
 
         // Reshape rois from 4D to 2D
         std::vector<int64_t> shapeData = {dims[2], 5};
-        auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, shapeData.data());
-        auto reshape = std::make_shared<ngraph::op::v1::Reshape>(slice, shape, true);
+        auto shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, shapeData.data());
+        auto reshape = std::make_shared<ov::op::v1::Reshape>(slice, shape, true);
 
         auto roiPooling =
-            std::make_shared<ngraph::op::v0::ROIPooling>(input, reshape,
-                                                         ngraph::Shape{(size_t)outHeight, (size_t)outWidth},
+            std::make_shared<ov::op::v0::ROIPooling>(input, reshape,
+                                                         ov::Shape{(size_t)outHeight, (size_t)outWidth},
                                                          1.0f, "bilinear");
 
         return Ptr<BackendNode>(new InfEngineNgraphNode(roiPooling));
diff --git a/modules/dnn/src/layers/cumsum_layer.cpp b/modules/dnn/src/layers/cumsum_layer.cpp
index 9c70f306d486..565d09abaac1 100644
--- a/modules/dnn/src/layers/cumsum_layer.cpp
+++ b/modules/dnn/src/layers/cumsum_layer.cpp
@@ -3,6 +3,8 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include "../precomp.hpp"
+#include "../op_inf_engine.hpp"
+#include "../ie_ngraph.hpp"
 #include "layers_common.hpp"
 
 #include <opencv2/dnn/shape_utils.hpp>
@@ -29,7 +31,13 @@ class CumSumLayerImpl CV_FINAL : public CumSumLayer
                          std::vector<MatShape> &internals) const CV_OVERRIDE
     {
         Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
-        return true;
+        return exclusive_raw == 0;
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
@@ -37,7 +45,7 @@ class CumSumLayerImpl CV_FINAL : public CumSumLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -47,75 +55,108 @@ class CumSumLayerImpl CV_FINAL : public CumSumLayer
         inputs_arr.getMatVector(inputs);
         outputs_arr.getMatVector(outputs);
 
-        // Get x tensor.
-        const auto &src_mat = inputs[0];
-        const auto *src_ptr = src_mat.ptr<float>();
+        // Get input tensor.
+        const auto& src_mat = inputs[0];
+        const auto* src_ptr = src_mat.ptr<float>();
 
-        // Get axis.
-        const int axis = normalize_axis(axis_raw, src_mat.dims);
+        // Get target axis.
+        int axis = inputs.size() > 1 ? parseAxis(inputs[1]) : axis_raw;
+        axis = normalize_axis(axis, src_mat.dims);
 
-        // Get y tensor.
-        auto &dst_mat = outputs[0];
-        src_mat.copyTo(dst_mat);
-        auto *dst_ptr = dst_mat.ptr<float>();
+
+        // Get output tensor.
+        auto& dst_mat = outputs[0];
+        auto* dst_ptr = dst_mat.ptr<float>();
 
         // Get flags.
         const auto exclusive = exclusive_raw == 1;
         const auto reverse = reverse_raw == 1;
 
-        // Get parameters to iterate outer dimension.
+        // Data with [dim_1, .. , dim_k-1, target_dim, dim_k+1, .. , dim_n]
+        // dimensions is represented here as [outer_dim, target_dim, inner_dim]
         const size_t outer_size = src_mat.total(0, axis);
-        const size_t outer_step_length = src_mat.total(axis);
-
-        // Get parameters to iterate inner dimension.
-        const size_t inner_size = src_mat.size[axis];
-
-        if (!inner_size)
-            return;
+        const size_t target_size = src_mat.size[axis];
+        const size_t inner_size = src_mat.total(axis + 1);
+        const size_t outer_step_length = target_size * inner_size;
 
-        const size_t inner_step_length = src_mat.total(axis + 1);
-        const int inner_step = (reverse ? -1 : 1) * inner_step_length;
-        const int inner_start = reverse ? inner_size - 1 : 0;
-        const int inner_stop = reverse ? -1 : inner_size;
-        const int inner_delta = reverse ? -1 : 1;
+        // Calculating steps in target dimensions
+        const int target_start = reverse ? target_size - 1 : 0;
+        const int target_stop = reverse ? -1 : target_size;
+        const int target_delta = reverse ? -1 : 1;
+        const int target_step = target_delta * inner_size;
 
-        // Get parameters to populate channels.
-        const size_t num_channels = src_mat.total(axis + 1);
+        // If exclusive, the j-th output element would be the sum of the first (j-1) elements.
+        // Otherwise, it would be the sum of the first j elements.
+        const int exclusive_delta = exclusive ? target_step : 0;
 
-        for (size_t outer_dim = 0; outer_dim < outer_size; outer_dim++)
+        for (size_t outer_idx = 0; outer_idx < outer_size; outer_idx++)
         {
-            const size_t outer_offset = outer_dim * outer_step_length;
-            size_t src_offset = outer_offset + inner_start * inner_step_length;
-
-            // Populate first element of inner dimension.
-            for (size_t channel = 0; channel < num_channels; channel++)
+            const size_t target_offset = outer_idx * outer_step_length;
+
+            // Handle first element of target dimension.
+            size_t first_inner_offset = target_offset + target_start * inner_size;
+            if (exclusive)
+                for (size_t inner_idx = 0; inner_idx < inner_size; inner_idx++)
+                    dst_ptr[first_inner_offset + inner_idx] = 0.0f;
+            else
+                for (size_t inner_idx = 0; inner_idx < inner_size; inner_idx++)
+                    dst_ptr[first_inner_offset + inner_idx] = src_ptr[first_inner_offset + inner_idx];
+
+            // Handle remaining elements of target dimension.
+            for (int target_idx = target_start + target_delta; target_idx != target_stop; target_idx += target_delta)
             {
-                if (exclusive)
-                {
-                    dst_ptr[src_offset + channel] = 0.0f;
-                }
-                else
+                const size_t inner_offset = target_offset + target_idx * inner_size;
+
+                for (size_t inner_idx = 0; inner_idx < inner_size; inner_idx++)
                 {
-                    dst_ptr[src_offset + channel] = src_ptr[src_offset + channel];
-                    src_offset += inner_step;
+                    dst_ptr[inner_offset + inner_idx] = dst_ptr[inner_offset - target_step + inner_idx] +
+                        src_ptr[inner_offset - exclusive_delta + inner_idx];
                 }
             }
+        }
+    }
 
-            // Populate remaining elements of inner dimension.
-            for (int inner_dim = inner_start + inner_delta; inner_dim != inner_stop; inner_dim += inner_delta)
-            {
-                const size_t dst_offset = outer_offset + inner_dim * inner_step_length;
+    int parseAxis(const Mat& axis_mat) {
+        CV_CheckEQ(axis_mat.total(), 1u, "Axis tensor should contain single value");
+        if (axis_mat.type() == CV_32SC1)
+            return axis_mat.at<int32_t>(0);
+        else
+        {
+            Mat axis_mat_int;
+            axis_mat.convertTo(axis_mat_int, CV_32SC1);
+            return axis_mat_int.at<int32_t>(0);
+        }
+    }
 
-                for (size_t channel = 0; channel < num_channels; channel++)
-                {
-                    const size_t previous_dst_offset = dst_offset - inner_step;
-                    dst_ptr[dst_offset + channel] = dst_ptr[previous_dst_offset + channel] +
-                            src_ptr[src_offset + channel];
-                    src_offset += inner_step;
-                }
-            }
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        std::shared_ptr<ov::op::v0::CumSum> cumsum;
+        if (nodes.size() == 2)
+        {
+            int32_t axis_shape = 1;
+            auto axis_scalar = std::make_shared<ov::op::v1::Reshape>(
+                nodes[1].dynamicCast<InfEngineNgraphNode>()->node,
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, &axis_shape),
+                false);
+            cumsum = std::make_shared<ov::op::v0::CumSum>(
+                nodes[0].dynamicCast<InfEngineNgraphNode>()->node,
+                std::make_shared<ov::op::v0::Convert>(axis_scalar, ov::element::i32),
+                exclusive_raw,
+                reverse_raw);
+        }
+        else
+        {
+            cumsum = std::make_shared<ov::op::v0::CumSum>(
+                nodes[0].dynamicCast<InfEngineNgraphNode>()->node,
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, &axis_raw),
+                exclusive_raw,
+                reverse_raw);
         }
+        return Ptr<BackendNode>(new InfEngineNgraphNode(cumsum));
     }
+#endif  // HAVE_DNN_NGRAPH
 
     int axis_raw;
     int exclusive_raw;
diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp
index 61d4f444328c..c7b2272550d1 100644
--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
@@ -55,11 +55,7 @@
 
 #ifdef HAVE_DNN_NGRAPH
 #include "../ie_ngraph.hpp"
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-#include <ngraph/op/detection_output.hpp>
-#else
-#include <ngraph/op/experimental/layers/detection_output.hpp>
-#endif
+#include <openvino/op/detection_output.hpp>
 #endif
 
 #ifdef HAVE_CUDA
@@ -221,7 +217,7 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
     {
         return backendId == DNN_BACKEND_OPENCV ||
                (backendId == DNN_BACKEND_CUDA && !_groupByClasses) ||
-               (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && !_locPredTransposed && _bboxesNormalized);
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -337,7 +333,7 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
         std::vector<UMat> outputs;
         outs.getUMatVector(outputs);
 
-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
         if (use_half)
         {
             std::vector<UMat> orig_inputs;
@@ -345,7 +341,7 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
 
             inputs.resize(orig_inputs.size());
             for (size_t i = 0; i < orig_inputs.size(); i++)
-                convertFp16(orig_inputs[i], inputs[i]);
+                orig_inputs[i].convertTo(inputs[i], CV_32F);
         }
         else
         {
@@ -410,7 +406,7 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
         if (use_half)
         {
             UMat half_umat;
-            convertFp16(umat, half_umat);
+            umat.convertTo(half_umat, CV_16F);
             outs.assign(std::vector<UMat>(1, half_umat));
         }
 
@@ -428,7 +424,7 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
             CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                        forward_ocl(inputs_arr, outputs_arr, internals_arr))
         }
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -1006,11 +1002,32 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         CV_Assert(nodes.size() == 3);
-        auto& box_logits  = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        auto& class_preds = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
-        auto& proposals   = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
+        auto box_logits  = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        auto class_preds = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
+        auto proposals   = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
+
+        if (_locPredTransposed) {
+            // Convert box predictions from yxYX to xyXY
+            box_logits = std::make_shared<ov::op::v1::Reshape>(box_logits,
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, std::vector<int32_t>{0, -1, 2}),
+                true
+            );
+            int axis = 2;
+            box_logits = std::make_shared<ov::op::v1::Reverse>(box_logits,
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, &axis),
+                ov::op::v1::Reverse::Mode::INDEX
+            );
+        }
+
+        auto shape = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, std::vector<int32_t>{0, -1});
+        box_logits = std::make_shared<ov::op::v1::Reshape>(box_logits, shape, true);
+        class_preds = std::make_shared<ov::op::v1::Reshape>(class_preds, shape, true);
+        proposals = std::make_shared<ov::op::v1::Reshape>(proposals,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, std::vector<int32_t>{0, _varianceEncodedInTarget ? 1 : 2, -1}),
+            true
+        );
 
-        ngraph::op::DetectionOutputAttrs attrs;
+        ov::op::v0::DetectionOutput::Attributes attrs;
         attrs.num_classes                = _numClasses;
         attrs.background_label_id        = _backgroundLabelId;
         attrs.top_k                      = _topK > 0 ? _topK : _keepTopK;
@@ -1023,7 +1040,7 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
         attrs.code_type                  = std::string{"caffe.PriorBoxParameter." + _codeType};
         attrs.normalized                 = true;
 
-        auto det_out = std::make_shared<ngraph::op::DetectionOutput>(box_logits, class_preds,
+        auto det_out = std::make_shared<ov::op::v0::DetectionOutput>(box_logits, class_preds,
                        proposals, attrs);
         return Ptr<BackendNode>(new InfEngineNgraphNode(det_out));
     }
diff --git a/modules/dnn/src/layers/einsum_layer.cpp b/modules/dnn/src/layers/einsum_layer.cpp
new file mode 100644
index 000000000000..4e3eca237881
--- /dev/null
+++ b/modules/dnn/src/layers/einsum_layer.cpp
@@ -0,0 +1,1375 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <inttypes.h>
+#include <opencv2/dnn/shape_utils.hpp>
+#include "../precomp.hpp"
+#include "../ie_ngraph.hpp"
+#include "layers_common.hpp"
+#include "cpu_kernels/fast_gemm.hpp"
+
+namespace cv
+{
+namespace dnn
+{
+
+static bool IsTransposeReshapeForEinsum(const std::vector<size_t>& perm,
+                                        std::vector<int> input_dims,
+                                        MatShape& new_shape) {
+    // As long as the dims with values > 1 stay in the same order, it's a reshape.
+    // Example: Shape=(1,1,1024,4096) -> perm=(2,0,3,1).
+    size_t last_permuted_axis = 0;
+    for (size_t i = 0; i < perm.size(); ++i) {
+        if (input_dims[perm[i]] == 1)
+            continue;
+        if (perm[i] < last_permuted_axis)
+            return false;
+        last_permuted_axis = perm[i];
+    }
+    new_shape.assign(input_dims.begin(), input_dims.end());
+    for (size_t i = 0; i < perm.size(); ++i) {
+        new_shape[i] = input_dims[perm[i]];
+    }
+    return true;
+}
+
+
+static Mat Transpose(
+    const Mat& input,
+    const MatShape& input_shape_override,
+    const std::vector<size_t> permutation)
+{
+
+    int input_rank = input_shape_override.size();
+    CV_Assert(input_rank == permutation.size());
+
+    bool reshape = input.dims != input_rank;
+
+    Mat input_reshaped;
+    if(reshape){
+        input_reshaped = input.reshape(1, input_shape_override.size(), input_shape_override.data());
+    }
+
+    MatShape outputDims;
+    outputDims.reserve(input_rank);
+    for (const auto& dim : permutation)
+        outputDims.emplace_back(input_shape_override[dim]);
+
+    Mat output;
+    MatShape order(permutation.begin(), permutation.end());
+
+    cv::transposeND((reshape ? input_reshaped : input), order, output);
+    return output;
+}
+
+
+bool IsTransposeRequired(size_t input_rank, const std::vector<size_t>& permutation) {
+    CV_Assert(input_rank == permutation.size());
+
+    // No transpose required for scalars
+    if (input_rank == 0){
+        return false;
+    }
+
+    // Weeds out cases where permutation is something like [0, 1, 2] for a 3D input and so on
+    bool transpose_required = false;
+    for (size_t i = 0; i < input_rank; ++i) {
+        if (permutation[i] != i) {
+            transpose_required = true;
+            break;
+        }
+    }
+
+  return transpose_required;
+}
+
+
+bool IsTransposeRequiredForDiagonal(int dim1, int dim2, int rank) {
+    // If the input is 2D, we don't need a transpose
+    if (rank == 2)
+        return false;
+
+    // If the two dims are the innermost dims, no transpose is required
+    if ((dim1 == rank - 1 && dim2 == rank - 2) ||
+        (dim1 == rank - 2 && dim2 == rank - 1))
+        return false;
+
+    // Transpose is required
+    return true;
+}
+
+template <typename T>
+Mat DiagonalDataAssignment(Mat input) {
+
+    int rank = input.dims;
+    CV_Assert(rank >= 2);
+    CV_Assert(input.size[rank - 1] == input.size[rank - 2]);
+    MatShape original_dims = shape(input);
+
+    if (rank > 3){
+        //reshape to 3D mat
+        int collapsed_size = 1;
+        for (int i = 0; i < rank - 2; ++i) {
+            collapsed_size *= input.size[i];
+        }
+        std::vector<int> reshaped_dims = {collapsed_size, input.size[rank - 2], input.size[rank - 1]};
+        input = input.reshape(1, reshaped_dims);
+    }
+
+    // Compute total number of higher-dimensional slices
+    int total_slices = input.size[0];
+
+    original_dims[rank - 1] = 1;  // Set the last dimension to 1, as we have extracted the diagonal
+    Mat output = Mat(original_dims, input.type());
+
+    int inner_stride = input.size[input.dims - 1];
+    auto inputPtr = input.ptr<T>();
+    auto outputPtr = output.ptr<T>();
+    for (int slice = 0; slice < total_slices; ++slice) {
+        for (int j = 0; j < inner_stride; ++j) {
+            // Direct memory access using raw pointers
+            outputPtr[slice * inner_stride + j] = inputPtr[slice * inner_stride * inner_stride + j * inner_stride + j];
+        }
+    }
+    return output;
+}
+
+/* Extract the diagonal elements from the last two dimensions of the tensor.
+For instance, given an input_shape of [1, 2, 3, 3]:
+
+The flexibility in this implementation allows one to choose which of the two
+last dimensions retains its value, determined by the `preserve_innermost_dim_val` parameter.
+
+When preserve_innermost_dim_val == true:
+    The resulting shape is [1, 2, 1, 3], indicating the diagonal has 3 elements,
+    and it keeps the dimension value of the innermost dimension.
+
+When preserve_innermost_dim_val == false:
+    The resulting shape is [1, 2, 3, 1], indicating the diagonal also has 3 elements,
+    but it retains the dimension value of the penultimate dimension. */
+Mat DiagonalInnermostDims(const Mat& input, bool preserve_innermost_dim_val) {
+    const MatShape input_dims = shape(input);
+    int rank = input_dims.size();
+
+    // This is an internal method and we already have finished all validations in the calling method.
+    // We proceed without duplicating all validations again here.
+
+    // We have a minimalistic check here to make sure the innermost dims have the same dim value
+    // as the calling method may have done a transpose before calling this method
+    CV_CheckEQ(input.size[rank - 1], input.size[rank - 2],
+        "innermost dims should have the same dim value to parse the diagonal elements");
+
+    MatShape output_dims = input_dims;  // Copy the original dims
+    if (preserve_innermost_dim_val) {
+        output_dims[rank - 2] = 1;
+    } else {
+        output_dims[rank - 1] = 1;
+    }
+
+    // TODO: hande different types
+    Mat output = DiagonalDataAssignment<float>(input);
+
+    if (output_dims != shape(output)){
+        CV_Error(Error::StsError, "Output shape does not match with calculated shape");
+    }
+    return output;
+}
+
+Mat Diagonal(const Mat& input, int dim1, int dim2)
+{
+    const MatShape input_dims = shape(input);
+    int rank = input_dims.size();
+
+    if (!(rank >= 2 && dim1 != dim2 && input_dims[dim1] == input_dims[dim2])){
+        std::string input_dims_str = std::accumulate(std::next(input_dims.begin()), input_dims.end(), std::to_string(input_dims[0]),
+                                                    [](const std::string& a, int b) {
+                                                        return a + ' ' + std::to_string(b);
+                                                    });
+        CV_Error(Error::StsError, cv::format("Cannot parse the diagonal elements along dims %d and %d for input shape %s",dim1, dim2, input_dims_str.c_str()));
+    }
+
+    int first_dim = std::min(dim1, dim2);
+    int second_dim = std::max(dim1, dim2);
+
+    Mat output;
+    bool preserve_innermost_dim_val = false;
+
+    bool is_transpose_required = IsTransposeRequiredForDiagonal(dim1, dim2, rank);
+    if (is_transpose_required)
+    {
+        std::vector<size_t> permutation(rank, 0);
+        int first_dim_axis = -1;  // This is the axis eventually occupied by the first_dim
+
+        // If one of the diagonal dimensions is one of the 2 innermost dims, then leave it as such
+        // so as to avoid transpose overhead
+        if (first_dim == rank - 2) {  // If rank - 2 is occupied by first_dim, keep it there
+            permutation[rank - 2] = first_dim;
+            first_dim_axis = rank - 2;
+        } else {
+            if (second_dim != rank - 2) {  // If rank - 2 is not occupied by second_dim, then put first_dim there
+                permutation[rank - 2] = first_dim;
+                first_dim_axis = rank - 2;
+            } else {  // If rank - 2 is occupied by second_dim, then put first_dim in rank - 1
+                permutation[rank - 1] = first_dim;
+                first_dim_axis = rank - 1;
+                preserve_innermost_dim_val = true;  // We always want to preserve the dim value of the first_dim
+            }
+        }
+
+        // Put the second_dim in the dim not occupied by the first_dim
+        if (first_dim_axis != rank - 1) {
+            permutation[rank - 1] = second_dim;
+        } else {
+            permutation[rank - 2] = second_dim;
+        }
+
+        size_t iter = 0;
+        for (int i = 0; i < rank; ++i) {
+            if (i != first_dim && i != second_dim) {
+                permutation[iter++] = i;
+            }
+        }
+
+        // Permutate the input so that the dims from which we need the diagonal forms the innermost dims
+        Mat transposed = Transpose(input, input_dims, permutation);
+
+        // Parse the diagonal from the innermost dims
+        output = DiagonalInnermostDims(transposed, preserve_innermost_dim_val);
+
+        // Swap back the dimensions to the original axes ordering using a "reverse permutation"
+        // Find the "reverse" permutation
+        iter = 0;
+        std::vector<size_t> reverse_permutation(rank, 0);
+        for (const auto& perm : permutation) {
+            reverse_permutation[perm] = iter++;
+        }
+
+        // Permutate using the reverse permutation to get back the original axes ordering
+        // (Pass in CPU Transpose function here as this Diagonal method will only be used for CPU based diagonal parsing)
+        output = Transpose(output, shape(output), reverse_permutation);
+    } else {
+        // No transposing required
+        output = DiagonalInnermostDims(input, preserve_innermost_dim_val);
+    }
+
+    // Make copy of the output dims
+    MatShape output_dims = shape(output);
+
+    // Unsqueeze the reduced dim
+    auto iter = output_dims.begin() + second_dim;
+    output_dims.erase(iter);
+    output = output.reshape(1, output_dims);
+    return output;
+}
+
+/**
+ * Returns the index associated with the input character.
+ * - Returns a value between 0 and 25 for inputs in the range 'a' to 'z'.
+ * - Returns a value between 26 and 51 for inputs in the range 'A' to 'Z'.
+ * - Returns -1 for invalid input that is not in the range 'a' to 'z' or 'A' to 'Z' (the caller should handle the returned result accordingly).
+ */
+int letterToIndex(const char ch) {
+    if (ch >= 'a' && ch <= 'z') {
+        return static_cast<int>(ch) - 'a';
+    }
+
+    if (ch >= 'A' && ch <= 'Z') {
+        return static_cast<int>('z') + static_cast<int>(ch) - 'A';
+    }
+    // invalid character - return error value
+    return -1;
+}
+
+// Implementation of the Einsum layer is heavily influenced by Onnxruntime at the time of writing.
+// Main logic is borrowed from onnxrutime:
+// https://github.com/microsoft/onnxruntime/blob/eaea34f8e29df9fb21fab675a3a895084407f306/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc#L8
+class LayerEinsumImpl CV_FINAL : public EinsumLayer
+{
+private:
+    Ptr<ReduceLayer> reduce;
+public:
+    // Number of inputs and outputs of the layer
+    int numInputs;
+
+    // inputShapes;
+    std::vector<MatShape> einsumInpShapes;
+
+    // Preprocessed inputs
+    std::vector<Mat> preProcessedInputs;
+
+    // This is container for preporcessed inputs
+    std::vector<MatShape> homogenizedInputDims;
+
+    // Collect outpus dimentions
+    MatShape einsumOutDims; // vector to store output dimentions
+
+    // These hold equation subring, left hand side and right it of
+    String lhs_eq, rhs_eq, equation;
+
+    // Holds token from left hand side of the equation
+    std::vector<String> lhs_eq_tokens;
+
+    // Idicates if equation substring is defined in explit way such as "ij, jk->ik"
+    // as opposed to "ij->"
+    bool explicitEquation = false;
+
+    // Stores the subscript indices for each input in the equation
+    std::vector<std::vector<int>> inputSubscriptIndices;
+
+    // Keeps track of the input index of the last input that had the subscript label
+    // If the value is `-1`, it means the subscript label was never encountered or it appears in the output
+    std::vector<int> subscriptIndicesToLastInput;
+
+    // Holds the dimension value of the index corresponding to the subscript label
+    // `-1` indicates that the corresponding label was not encountered at all
+    std::vector<int> subscriptIndicesToDimValue;
+
+    // Index corresponding to each output dim corresponding to each subscript index
+    // A value of -1 means the corresponding subscript index is not found in the output
+    std::vector<int> subscriptIndicesToOutputIndices;
+
+    // Hold max number of alphabetic numbers
+    static const size_t numOfLetters = 52;
+
+    // Stores the count corresponding to each letter encountered
+    // A value of `0` indicates that the corresponding letter hasn't been seen at all
+    std::array<int, numOfLetters> letter2count;
+
+    // Hold the assigned index corresponding to the letter seen
+    // `-1` means the corresponding letter wasn't seen at all
+    std::array<int, numOfLetters> letter2index;
+
+    // Represents the count of unique subscript labels (subscript indices)
+    // Example 1: For the equation 'ij, jk -> ik', num_subscript_indices_ = 3 (i, j, k)
+    // Example 2: For the equation '...ij', 'jk' -> '...ik',
+    // num_subscript_indices_ = 3 (i, j, k) + number of dimensions specified by an ellipsis (across all inputs)
+    int numLetterIndices = 0;
+
+    // The number of dimensions that are encompassed by an "ellipsis" - "...".
+    size_t numOfEllipsisDims = 0;
+
+    // Backend for fastgemm
+    FastGemmOpt opt;
+
+    void parseEquation(String equation);
+    void processEquation(const std::vector<MatShape>& inputs);
+    void processBroadcastedDims();
+    void validateOutputSubscript();
+    void calculateOutputShape();
+    void preProcessInputs(InputArrayOfArrays& inputs);
+    Mat reduceSum(Mat& src, MatShape& reduceAxis);
+    Mat FinalizeOutput(const Mat& candidateOuput, const MatShape& ordered_subscript_indices_in_candidate);
+    Mat pairwiseOperandProcess(
+        const Mat& left,
+        const MatShape& leftShapeOverride,
+        const Mat& right,
+        const MatShape& rightShapeOverride,
+        const MatShape& reduceDims,
+        bool isFinalPair
+    );
+    Mat batchwiseMatMul(
+        const Mat& input1,
+        const MatShape& input1ShapeOverride,
+        const Mat& input2,
+        const MatShape& input2ShapeOverride
+    );
+
+    // constructor
+    LayerEinsumImpl(const LayerParams& params)
+    {
+        setParamsFrom(params);
+        equation = params.get<String>("equation");
+        int outputSize = params.get<int>("outputSize");
+        numInputs  = params.get<int>("inputSize");
+
+        CV_CheckEQ(outputSize, 1, "Einsum layer should only have one output");
+
+        // get the input shapes from onnx importer
+        for (int i=0; i < numInputs; i++){
+            auto param = params.get("inputShapes" + cv::format("%d", i));
+            int inputDims = param.size();
+            std::vector<int> shape;
+            for (int i = 0; i < inputDims; ++i)
+                shape.emplace_back(param.get<int>(i));
+            einsumInpShapes.emplace_back(shape);
+        }
+
+        opt.init();
+
+        // Maintains a mapping between input indices and their corresponding subscript labels for each input
+        inputSubscriptIndices.reserve(numInputs);
+
+        // We allocate space for 10 values as a precaution,
+        // assuming that we won't encounter any input with a rank greater than 10.
+        // In such cases, the value of num_subscript_indices_ would be greater than 10.
+        subscriptIndicesToLastInput.reserve(10);
+        subscriptIndicesToDimValue.reserve(10);
+
+        // fill in vectors to avoid getting random numbers
+        letter2count.fill(0);
+        letter2index.fill(-1);
+
+        // parser equation and extract tokens from the equation
+        // save token to lhs_eq_tokens variable
+        parseEquation(equation); // TODO: return lhs_eq_tokens
+
+        // Start preprocessing related to equation parsing
+        // and dimention broadcasting
+        processEquation(einsumInpShapes);
+        processBroadcastedDims();
+
+        // calculate output shape
+        validateOutputSubscript();
+        calculateOutputShape();
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE {
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
+    }
+
+    // getMeoryShapes
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        CV_UNUSED(internals);
+
+        // check if passed and parsed inputs match up in number and dimensions
+        CV_CheckEQ(static_cast<int>(inputs.size()), numInputs,
+            "Number of inputs in forward and inputs during graph constructions do not match");
+        for (int i = 0; i < numInputs; i++)
+        {
+            if (inputs[i] != einsumInpShapes[i])
+                CV_Error(Error::StsAssert, "Passed input shapes do not match with parsed input shapes!");
+        }
+
+        outputs.clear();
+        outputs.emplace_back(einsumOutDims);
+        return true;
+
+    } // getMemoryShape
+
+    // forward
+    void forward(InputArrayOfArrays inputs_arr,
+                 OutputArrayOfArrays outputs_arr,
+                 OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        if (inputs_arr.depth() == CV_16F)
+        {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        // homogenize inputs
+        preProcessInputs(inputs_arr);
+
+        std::vector<cv::Mat> rawInputs, outputs;
+        inputs_arr.getMatVector(rawInputs);
+        outputs_arr.getMatVector(outputs);
+        Mat result;
+
+        // Pre-process the first input so as to reduce any dims that only it has
+        {
+            MatShape reducedDims;
+            MatShape preservedDims;
+            MatShape preservedShape;
+
+            reducedDims.reserve(numLetterIndices);    // num_subscript_labels is the upper bound. No harm in over-reserving.
+            preservedDims.reserve(numLetterIndices);  // num_subscript_labels is the upper bound. No harm in over-reserving.
+
+            for (size_t i = 0; i < numLetterIndices; ++i) {
+                if (subscriptIndicesToLastInput[i] == 0) {
+                    reducedDims.push_back(i);
+                } else {
+                    preservedDims.push_back(i);
+                }
+            }
+
+            // Reduce the dims that are last seen in the first input alone
+            if (reducedDims.size() != 0)
+            {
+                result = reduceSum((!preProcessedInputs[0].empty() ? preProcessedInputs[0] : rawInputs[0]), reducedDims);
+            } else {
+                // Check if there is a pre-processed version of this input
+                // If so assign it to result
+                if (!preProcessedInputs[0].empty())
+                {
+                    result = preProcessedInputs[0];
+                }
+            }
+
+            // Finalize the output at this stage if num_inputs == 1
+            if (numInputs == 1) {
+                // Finalize the output by applying any transpose required to get
+                // it to the required output ordering and move it to the op's output
+                result = FinalizeOutput(!result.empty() ? result : rawInputs[0], preservedDims);
+            }
+        }
+
+
+        // Process the operands in a pair-wise fashion
+        {
+            bool isFinalPair = false;
+            // Keep processing each input pair-wise
+            for (int input = 1; input < numInputs; ++input) {
+                MatShape reducedDims;
+                reducedDims.reserve(numLetterIndices);  // num_subscript_labels is the upper bound. No harm in over-reserving by a small margin.
+                for (int dim = 0; dim < numLetterIndices; ++dim)
+                {
+                    if (subscriptIndicesToLastInput[dim] == input)
+                    {
+                        // This is the last input we are seeing this dimension (and it doesn't occur in the output), so reduce along the dimension
+                        reducedDims.push_back(dim);
+                    }
+                }
+
+                if (input == numInputs - 1)
+                    isFinalPair = true;
+
+                // create temporary variable
+                MatShape tmpResult;
+                for (int i = 0; i < result.size.dims(); i++)
+                    tmpResult.emplace_back(result.size[i]);
+
+
+                // Use either the preprocessed inputs (if it is available) or the corresponding raw inputs
+                result = pairwiseOperandProcess(!result.empty() ? result : rawInputs[0],
+                                                !result.empty() ? tmpResult : homogenizedInputDims[0],
+                                                !preProcessedInputs[input].empty() ? preProcessedInputs[input] : rawInputs[input],
+                                                homogenizedInputDims[input],
+                                                reducedDims,
+                                                isFinalPair);
+            }
+        }
+
+        // check of product of output dimentions and computed output dimentions match
+        size_t reqProd = std::accumulate(einsumOutDims.begin(), einsumOutDims.end(), 1, std::multiplies<int>());
+        MatShape realOutputDims = shape(result);
+        size_t realProd = std::accumulate(realOutputDims.begin(), realOutputDims.end(), 1, std::multiplies<int>());
+
+        CV_CheckEQ(reqProd, realProd, "Real output can not be shaped in to required output");
+
+        // reduce dimentions
+        result = result.reshape(1, einsumOutDims.size(), einsumOutDims.data());
+        result.copyTo(outputs[0]);
+    } // forward
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >&,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE {
+        ov::OutputVector inputs(nodes.size());
+        for (size_t i = 0; i < nodes.size(); ++i) {
+            inputs[i] = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
+        }
+        auto einsum = std::make_shared<ov::op::v7::Einsum>(inputs, equation);
+        return new InfEngineNgraphNode(einsum);
+    }
+#endif // HAVE_DNN_NGRAPH
+
+}; // EinsumClass
+
+Mat LayerEinsumImpl::reduceSum(Mat& src, MatShape& reduceAxis)
+{
+    // initialize ReduceLayer
+    LayerParams lp;
+    lp.set("reduce", "SUM");
+    int num_axes = reduceAxis.size();
+    lp.set("axes", DictValue::arrayInt(&reduceAxis[0] , num_axes));
+    reduce = ReduceLayer::create(lp);
+
+    // Compute output shapes
+    std::vector<MatShape> inputShapes{shape(src)};
+    std::vector<MatShape> outputShapes, internalShapes;
+    reduce->getMemoryShapes(inputShapes, 1, outputShapes, internalShapes);
+
+    Mat output(outputShapes[0], CV_32F);
+
+    std::vector<Mat> inputs;
+    std::vector<Mat> outputs;
+    std::vector<Mat> internals;
+    inputs.emplace_back(src);
+    outputs.emplace_back(output);
+
+    reduce->forward(inputs, outputs, internals);
+    return outputs[0];
+}
+
+void LayerEinsumImpl::preProcessInputs(InputArrayOfArrays& inputs_arr)
+{
+    std::vector<cv::Mat> inputs;
+    inputs_arr.getMatVector(inputs);
+
+    preProcessedInputs.reserve(inputs.size());
+    homogenizedInputDims.reserve(inputs.size());
+
+    int inputIter = 0;
+    for(const Mat& input : inputs)
+    {
+        Mat preprocessed;
+
+        // variable to hold processed version of the original input
+        MatShape input_dims = shape(input);
+
+        const auto& currSubscriptIndices = inputSubscriptIndices[inputIter];
+
+        // There should be subscript index (subscript label) for each dim of the input
+        CV_CheckEQ(input_dims.size(), currSubscriptIndices.size(),
+            "Rank of the input must match number of subscript labels corresponding to the input");
+
+        std::vector<int> subscriptIndicesToInputIndex(numLetterIndices, -1);
+        // this will hold input dims after reordering so that all inputs have
+        // same axes order
+        MatShape homogenizedInputDims_(numLetterIndices, 1);
+
+        int dimIndexInIreprocessedInput = 0;
+        int dimIndexInOriginalInput = 0;
+
+        for (const auto& subscriptIndex : currSubscriptIndices)
+        {
+            if(subscriptIndicesToInputIndex[subscriptIndex] == -1){
+                subscriptIndicesToInputIndex[subscriptIndex] = dimIndexInIreprocessedInput++;
+                homogenizedInputDims_[subscriptIndex] = input_dims[dimIndexInOriginalInput];
+            } else {
+                // Call diagonal
+                preprocessed = Diagonal(
+                    !preprocessed.empty() ? preprocessed : inputs[inputIter],
+                    subscriptIndicesToInputIndex[subscriptIndex],
+                    dimIndexInIreprocessedInput);
+            }
+            ++dimIndexInOriginalInput;
+        }
+
+        std::vector<size_t> permutation;
+        for(auto& d : subscriptIndicesToInputIndex)
+        {
+            if (d != -1)
+                permutation.emplace_back(d);
+        }
+
+        if (IsTransposeRequired(
+            !preprocessed.empty() ? preprocessed.size.dims() : inputs[inputIter].size.dims(),
+            permutation))
+        {
+            // call transpose
+            preprocessed = Transpose(
+                !preprocessed.empty() ? preprocessed : inputs[inputIter],
+                !preprocessed.empty() ? shape(preprocessed) : shape(inputs[inputIter]),
+                permutation);
+        }
+
+        if (!preprocessed.empty())
+        {
+            preprocessed = preprocessed.reshape(1, homogenizedInputDims_.size(), homogenizedInputDims_.data());
+        }
+
+        preProcessedInputs.emplace_back(preprocessed);
+        homogenizedInputDims.emplace_back(homogenizedInputDims_);
+        ++inputIter;
+    }
+}
+
+void LayerEinsumImpl::parseEquation(String equation)
+{
+    // remove white spaces in the copy
+    equation.erase(std::remove_if(equation.begin(), equation.end(), ::isspace), equation.end());
+
+    // check if '->' - the output subscript label is present in the equation;
+    std::size_t arrow_idx = equation.find("->");
+    if (arrow_idx != std::string::npos)
+    {
+        // split left and righ hand sides of the equation
+        lhs_eq = equation.substr(0, arrow_idx);
+        rhs_eq = equation.substr(arrow_idx + 2);
+        explicitEquation = true;
+    } else {
+        lhs_eq = equation;
+    }
+
+    // split lhs_eq by ',' - comma and put all created token - splits
+    // into lhs_eq_tokens vector
+    std::stringstream src(lhs_eq);
+    for (std::string token; std::getline(src, token, ',');) {
+        lhs_eq_tokens.emplace_back(token);
+    }
+}
+
+
+void LayerEinsumImpl::calculateOutputShape()
+{
+    // Traverse through each of the subscript labels within the output subscript.
+    bool middleOfEllipsis = false;
+    int ellipsisCharCount = 0;
+
+    subscriptIndicesToOutputIndices.resize(numLetterIndices, -1);
+
+    std::array<int, numOfLetters> outputLetterToCount;
+    outputLetterToCount.fill(0);
+
+    int outputDimCounter = 0;
+    for (auto letter : rhs_eq)
+    {
+        if(letter == '.')
+        {
+            middleOfEllipsis = true;
+            // Make sure there aren't more than 3 '.'s in the current subscript
+            if (++ellipsisCharCount > 3) {
+                CV_Error(Error::StsError, "Found a '.' not part of an ellipsis in the output subscript provided");
+            }
+
+            if (ellipsisCharCount == 3) {  // Ellipsis is complete. Process it.
+                middleOfEllipsis = false;
+                for (size_t i = 0; i < numOfEllipsisDims; ++i) {
+                    einsumOutDims.emplace_back(subscriptIndicesToDimValue[i]);
+                    // The ellipsis is seen in the output and hence the corresponding dims are to not be reduced
+                    subscriptIndicesToLastInput[i] = -1;
+                    subscriptIndicesToOutputIndices[i] = outputDimCounter++;
+                }
+            }
+        } else {
+            CV_CheckEQ(middleOfEllipsis, false,
+                "Encountered '.' character that is not part of output subscript");
+
+            auto letterIndex = letterToIndex(letter);
+
+            CV_CheckNE(letterIndex, -1,
+                "The only permissible subscript labels are lowercase letters (a-z) and uppercase letters (A-Z).");
+            CV_CheckEQ(outputLetterToCount[letterIndex], 0,
+                "Output subscript constains repeated letters");
+
+            ++outputLetterToCount[letterIndex];
+            auto mappedIndex = letter2index[letterIndex];
+
+            CV_CheckNE(mappedIndex, -1,
+                "Output subscript has letters that were not encountered in the inputs");
+
+            // Push output dimention
+            // Einsum layer only has one output vector
+            einsumOutDims.emplace_back(subscriptIndicesToDimValue[mappedIndex]);
+
+            // Reset the last input index for this subscript label
+            // given that it is seen in the output and hence can't be reduced
+            subscriptIndicesToLastInput[mappedIndex] = -1;
+            subscriptIndicesToOutputIndices[mappedIndex] = outputDimCounter++;
+        }
+    }
+}
+
+void LayerEinsumImpl::validateOutputSubscript()
+{
+    // The explicit form requires no operation, as the output
+    // would have already been parsed during the input parsing process.
+    if(explicitEquation)
+    {
+        // Ensure that the provided explicit equation includes an ellipsis if the input contains ellipses.
+        if(numOfEllipsisDims > 0)
+        {
+            if(rhs_eq.find("...") == std::string::npos)
+            {
+                CV_Error(Error::StsError,
+                "Provided output subscript does not include ellipsis while Inputs subscrits constain ellipsis");
+            }
+        }
+    }
+}
+
+void LayerEinsumImpl::processBroadcastedDims()
+{
+    // Only compute this function if ellipsis "..." was found in the equation
+    if (numOfEllipsisDims > 0)
+    {
+        // extend the number of subscript labels to include each ellipsis dim as
+        // theoretically each ellipsis dim does correspond to a "virtual" subscript label
+        numLetterIndices += numOfEllipsisDims;
+
+        // We are going to assign the broadcasted dims outermost subscript indices (i.e.) 0 -> numOfEllipsisDims - 1
+        // as most likely bradcasted dims will be batch dimensions (i.e.) outermost dimensions and hence we don't have to pay
+        // transposing while "homogenizing" the input
+
+        // Hence offset all subscript indices by numOfEllipsisDims
+        for (size_t i = 0; i < numOfLetters; ++i){
+            if (letter2count[i] != -1){
+                letter2index[i] += numOfEllipsisDims;
+            }
+        }
+
+        std::vector<int> tempIndex2LastInput(numLetterIndices, -1);
+        for (int i = 0; i < subscriptIndicesToLastInput.size(); ++i){
+            tempIndex2LastInput[i + numOfEllipsisDims] = subscriptIndicesToLastInput[i];
+        }
+        subscriptIndicesToLastInput = std::move(tempIndex2LastInput);
+
+        std::vector<int> tempIndexToDimValue(numLetterIndices, -1);
+        for (int i = 0; i < subscriptIndicesToDimValue.size(); ++i){
+            tempIndexToDimValue[i + numOfEllipsisDims] = subscriptIndicesToDimValue[i];
+        }
+        subscriptIndicesToDimValue = std::move(tempIndexToDimValue);
+
+        for (size_t i = 0; i < inputSubscriptIndices.size(); ++i)
+        {
+            auto& currentInputDimIndicesToSubscriptIndices = inputSubscriptIndices[i];
+            std::vector<int> tempCurrentInputDimIndicesToSubscriptIndices;
+            tempCurrentInputDimIndicesToSubscriptIndices.reserve(currentInputDimIndicesToSubscriptIndices.size());
+
+            // make sure it is correct
+            const auto& dims = einsumInpShapes[i];
+            auto rank = dims.size();
+
+            size_t dimIter = 0;
+            size_t numBroadcastedIndices = 0;
+            while (dimIter < currentInputDimIndicesToSubscriptIndices.size())
+            {
+                auto value = currentInputDimIndicesToSubscriptIndices[dimIter];
+                if (value == numOfLetters)
+                {  // This is a broadcasted dim
+                    // Shouldn't hit this error - just a sanity check
+                    CV_Assert(numBroadcastedIndices < numOfEllipsisDims);
+                    tempCurrentInputDimIndicesToSubscriptIndices.push_back(static_cast<int>(numBroadcastedIndices));
+                    subscriptIndicesToLastInput[numBroadcastedIndices] = i;
+
+                    // This is the first time we are seeing this broadcasted dim
+                    if (subscriptIndicesToDimValue[numBroadcastedIndices] == -1)
+                    {
+                        subscriptIndicesToDimValue[numBroadcastedIndices] = dims[dimIter];
+                    } else {  // We have seen this broadcasted dim before
+                        // Check if the previous value is equal to the current value
+                        if (subscriptIndicesToDimValue[numBroadcastedIndices] != dims[dimIter])
+                        {
+                            // If they are not equal, one of them needs to be 1
+                            if (subscriptIndicesToDimValue[numBroadcastedIndices] == 1)
+                            {
+                                subscriptIndicesToDimValue[numBroadcastedIndices] = dims[dimIter];
+                            } else {
+                                CV_CheckEQ(dims[dimIter], 1, "The broadcasted dimensions of the inputs are incompatible");
+                            }
+                        }
+                    }
+                    ++numBroadcastedIndices;
+                } else {  // This is a regular dim - offset it by number of broadcasted dims
+                    tempCurrentInputDimIndicesToSubscriptIndices.push_back(value + static_cast<int>(numOfEllipsisDims));
+                }
+                ++dimIter;
+            }
+            // Shouldn't hit this error - just a sanity check
+            CV_Assert(dimIter == rank);
+            currentInputDimIndicesToSubscriptIndices = std::move(tempCurrentInputDimIndicesToSubscriptIndices);
+        }
+    }
+}
+
+
+
+void LayerEinsumImpl::processEquation(const std::vector<MatShape>& inputs)
+{
+
+    // Check if number of tokens in equal to number of inputs.
+    // For install "ij, jk -> ik" needs to have 2 inputs tensors
+    int num_input_tensors = inputs.size();
+    CV_CheckEQ(static_cast<int>(lhs_eq_tokens.size()), num_input_tensors,
+        "Number of input tensors does not match the number of subscripts in the input equation");
+
+    int inputIdx = 0;
+    for (const auto& token : lhs_eq_tokens)
+    {
+        const MatShape shape = inputs[inputIdx];
+        size_t rank = shape.size();
+        size_t dim_count = 0;
+
+        std::vector<int> currTokenIndices;
+        currTokenIndices.reserve(rank);
+
+        // Variable to deal with "ellipsis" - '...' in the input
+        bool middleOfellipsis = false;
+        int ellipsisCharCount = 0;
+        for (auto letter : token)
+        {
+            if (letter == '.')
+            {
+                middleOfellipsis = true;
+
+                // there should not be more than 3 '.'s in the current subscript
+                if (++ellipsisCharCount > 3)
+                {
+                    CV_Error(Error::StsError, cv::format("Found a '.' not part of an ellipsis in input: %d", inputIdx));
+                }
+
+                // We have seen all 3 '.'s. We can safely process the ellipsis now.
+                if (ellipsisCharCount == 3)
+                {
+                    middleOfellipsis = false;
+
+                    // Example for the following line of code
+                    // Subscript "...ij" for an input of rank 6
+                    // numOfEllipsisDims = 6 - 5 + 3 = 4
+                    int currentNumOfEllipsisDims = static_cast<int>(rank) - token.length() + 3;
+                    CV_CheckGE(currentNumOfEllipsisDims, 0,
+                        "Einsum subscripts string contains too many subscript labels when compared to the rank of the input");
+
+                    // Theoretically, currentNumOfEllipsisDims could be 0
+                    // Example: For an input of rank 2 paired with a subscript "...ij"
+                    if (currentNumOfEllipsisDims != 0)
+                    {
+                        // We have seen a ellipsis before - make sure ranks align as per the ONNX spec -
+                        // "Ellipsis must indicate a fixed number of dimensions."
+                        if (numOfEllipsisDims != 0){
+                            CV_CheckEQ(numOfEllipsisDims, static_cast<size_t>(currentNumOfEllipsisDims),
+                                "Ellipsis must indicate a fixed number of dimensions across all inputs");
+                        } else {
+                            numOfEllipsisDims = static_cast<size_t>(currentNumOfEllipsisDims);
+                        }
+
+                        // We reserve 'numOfLetters' for broadcasted dims as we only allow 'a' - 'z'
+                        // and 'A' - 'Z' (0 - 51) for non-broadcasted dims.
+                        // We will assign appropriate indices (based on number of dimensions the ellipsis corresponds to)
+                        // during broadcasting related post-processing.
+                        for (size_t i = 0; i < numOfEllipsisDims; ++i){
+                            currTokenIndices.push_back(numOfLetters);
+                        }
+
+                        // Offset 'dim_count' by number of dimensions the ellipsis corresponds to
+                        dim_count += numOfEllipsisDims;
+                    }
+                }
+            } else {
+                if (middleOfellipsis){
+                    CV_Error(Error::StsAssert,
+                    cv::format(
+                        "Encountered '.' character that is not part of an ellipsis in the input: [%d]",
+                        inputIdx));
+                }
+
+                int letterIdx = letterToIndex(letter);
+                CV_CheckNE(letterIdx, -1,
+                    "The only permissible subscript labels are lowercase letters (a-z) and uppercase letters (A-Z).");
+
+                int dimValue = shape[dim_count];
+
+                // The subscript label was not found in the global subscript label array
+                // Therefore, it is added to both the local and global subscript arrays
+                if(letter2count[letterIdx] == 0){
+                    letter2index[letterIdx] = numLetterIndices++;
+                    subscriptIndicesToDimValue.push_back(dimValue);
+                    subscriptIndicesToLastInput.push_back(inputIdx);
+
+                } else {
+                    // This letter has been seen in at least one other operand's subscript
+                    // It must be equal unless one of them is a 1 (Numpy allows this)
+                    auto mappedIndx = letter2index[letterIdx];
+                    subscriptIndicesToLastInput[mappedIndx] = inputIdx;
+
+                    if (subscriptIndicesToDimValue[mappedIndx] != dimValue) {
+                        if (dimValue != 1) {
+                            CV_Error(Error::StsError, cv::format("Einsum operands can not be broadcasted."
+                                                                "Check input shapes/equation passed."
+                                                                "Input shape of operand [%d]", inputIdx) +
+                                                    cv::format(" is incompatible in the dimention [%zu].", static_cast<size_t>(dim_count)));
+                        }
+                    }
+                }
+                ++letter2count[letterIdx];
+                currTokenIndices.push_back(letter2index[letterIdx]);
+
+                CV_CheckLE(++dim_count, rank,
+                    "The Einsum subscripts string has an excessive number of subscript labels compared to the rank of the input.");
+            }
+        }
+
+        // When no broadcasting is requested, the number of subscript labels (dim_counter) should match the input's rank.
+        CV_Assert(!(numOfEllipsisDims == 0 && dim_count != rank)
+            && "The Einsum subscripts string does not contain required amount of subscript labels and no ellipsis is provided in the input.");
+
+        inputSubscriptIndices.emplace_back(std::move(currTokenIndices));
+        ++inputIdx;
+    }
+}
+
+Mat LayerEinsumImpl::FinalizeOutput(
+    const Mat& candidateOutput,
+    const MatShape& ordered_subscript_indices_in_candidate)
+{
+    const std::vector<int>& subscript_indices_to_output_indices = subscriptIndicesToOutputIndices;
+    const auto output_dims = einsumOutDims;
+
+    MatShape output_shape = output_dims;
+    const auto output_rank = output_dims.size();
+
+    // CV_CheckEQ((int) candidateOutput.dims,  (int) output_shape.size(),
+    //           "Einsum op: The candidate output cannot be reshaped into the op's output");
+
+    const MatShape candidate_output_dims = MatShape(candidateOutput.size.p, candidateOutput.size.p + candidateOutput.dims);
+    const int candidate_output_rank = candidate_output_dims.size();
+
+    // This vector holds the shape of the candidate_output after removing the dims that have
+    // been reduced in the final output
+    MatShape candidate_output_shape_without_reduced_dims;
+    candidate_output_shape_without_reduced_dims.reserve(candidate_output_rank);  // reserve upper bound
+
+    // Identify the permutation required by the op's output
+    std::vector<size_t> output_permutation;
+    output_permutation.resize(output_rank, 0);
+    size_t output_iter = 0;
+
+    for (size_t iter = 0, end = ordered_subscript_indices_in_candidate.size(); iter < end; ++iter)
+    {
+        auto output_index = subscript_indices_to_output_indices[ordered_subscript_indices_in_candidate[iter]];
+
+        // If output_index is -1, then this dimension does not show up in the op's output and has been reduced along the way
+        if (output_index != -1)
+        {
+            output_permutation[output_index] = output_iter++;
+            candidate_output_shape_without_reduced_dims.push_back(candidate_output_dims[iter]);
+        } else {
+            // This dim doesn't show up in the op's output and hence we check if the dim has been reduced in the candidate output
+            CV_CheckEQ(candidate_output_dims[iter], 1,
+            "Not all dimensions to be reduced have been reduced in the candidate output. Candidate output dims: "); //%d", candidateOutput.size));
+        }
+    }
+
+    // Transpose to the required final output order
+    // (Identify no-op transposes and prevent triggering the transpose)
+
+    if (IsTransposeRequired(candidate_output_shape_without_reduced_dims.size(), output_permutation))
+    {
+        auto candidate_output_transposed = Transpose(
+                                            candidateOutput,
+                                            candidate_output_shape_without_reduced_dims,
+                                            output_permutation);
+        return candidate_output_transposed;
+    }
+    return candidateOutput;
+}
+
+Mat LayerEinsumImpl::pairwiseOperandProcess(
+    const Mat& left,
+    const MatShape& leftShapeOverride,
+    const Mat& right,
+    const MatShape& rightShapeOverride,
+    const MatShape& reduceDims,
+    bool isFinalPair
+)
+{
+    size_t matDimSize = left.total();
+    size_t overrideDimSize = total(leftShapeOverride);
+
+    CV_CheckEQ(matDimSize, overrideDimSize, "Override dims are not compatible with left tensor shape");
+
+    matDimSize = right.total();
+    overrideDimSize = total(rightShapeOverride);
+
+    CV_CheckEQ(matDimSize, overrideDimSize, "Override dims are not compatible with right tensor shape");
+
+    // Make copy as this may be overridden downstream
+    const auto& leftDims = leftShapeOverride;
+    const auto& rightDims = rightShapeOverride;
+
+    int leftRank = static_cast<int>(leftDims.size());
+    int rightRank = static_cast<int>(rightDims.size());
+
+    Mat currentLeft;
+    Mat currentRight;
+
+    CV_CheckEQ(leftRank, rightRank, "Raks of pair-wise operands must be equal");
+
+    // Following vectors hold:
+    // lro: dim indices that are present in left, right, and reduce_dims
+    // lo: dim indices that are present in left and reduce_dims
+    // ro: dim indices that are present in right and reduce_dims
+    std::vector<size_t> lro;
+    lro.reserve(5);  // Reserve an arbitrary amount of space for this vector (not bound to see a tensor of rank > kTensorShapeSmallBufferElementsSize)
+
+    std::vector<size_t> lo;
+    lo.reserve(5);  // Reserve an arbitrary amount of space for this vector (not bound to see a tensor of rank > kTensorShapeSmallBufferElementsSize)
+
+    std::vector<size_t> ro;
+    ro.reserve(5);  // Reserve an arbitrary amount of space for this vector (not bound to see a tensor of rank > kTensorShapeSmallBufferElementsSize)
+
+    // Maintain sizes to create reshaped "views"
+    int lro_size = 1;
+    int lo_size = 1;
+    int ro_size = 1;
+    int reduced_size = 1;
+
+    size_t reduceDimsIter = 0;
+    size_t reduceDimsSize = reduceDims.size();
+
+    for (int i = 0; i < leftRank; ++i)
+    {
+        int leftDim = leftDims[i];
+        int rightDim = rightDims[i];
+
+        bool hasLeftDim = leftDim > 1;    // non-trivial dimension (dim_value != 1)
+        bool hasRightDim = rightDim > 1;  // non-trivial dimension (dim_value != 1)
+
+        if (reduceDimsIter < reduceDimsSize && reduceDims[reduceDimsIter] == i)
+        {
+            // This dimension is to be reduced after this pair-wise operation
+            ++reduceDimsIter;
+            if (hasLeftDim && hasRightDim){
+                // Both inputs have non-trivial dim values along this dimension
+                // Both the left and right operands have non-trivial dimension value along this axis
+                CV_CheckEQ(leftDim, rightDim, "Einsum op: Input dimensions must be equal along an axis to be reduced across all inputs");
+                reduced_size *= leftDim;
+
+            } else if (hasLeftDim){
+                // if the dim to be reduced is only in one of left and right, we can reduce right away
+                Mat tensorToReduce = !currentLeft.empty() ? currentLeft : left;
+                MatShape shapeToReduce = !currentLeft.empty() ? shape(currentLeft) : leftDims;
+                currentLeft = reduceSum(tensorToReduce, shapeToReduce);
+
+            } else if (hasRightDim){
+                Mat tensorToReduce = !currentRight.empty() ? currentRight : right;
+                MatShape shapeToReduce = !currentRight.empty() ? shape(currentRight) : rightDims;
+                currentLeft = reduceSum(tensorToReduce, shapeToReduce);
+            }
+
+        } else {
+            // This dimension is not reduced (i.e.) it appears in the output after processing these 2 operands
+            // Both the left and right operands have non-trivial dimension value along this axis
+            // They must be equal
+            if (hasLeftDim && hasRightDim){
+                CV_CheckEQ(leftDim, rightDim, "Input shapes do not align");
+                lro.push_back(i);
+                lro_size *= leftDim;
+
+            } else if (hasLeftDim) {
+                // The left operand has non-trivial dimension value
+                lo.push_back(i);
+                lo_size *= leftDim;
+
+            } else {
+                // The right operand may or may not have non-trivial dim value
+                // If it has trivial dim value (1),
+                // it will just form a trailing dimension for the right operand
+                ro.push_back(i);
+                ro_size *= rightDim;
+            }
+        }
+    }
+
+
+    // Permutate the left operand so that the axes order go like this: [lro, lo, reduce_dims, ro]
+    MatShape reshaped_dims;
+    std::vector<size_t> left_permutation;
+    left_permutation.reserve(lro.size() + lo.size() + reduceDims.size() + ro.size());
+    left_permutation.insert(left_permutation.end(), lro.begin(), lro.end());
+    left_permutation.insert(left_permutation.end(), lo.begin(), lo.end());
+    //  left_permutation.insert(left_permutation.end(), reduce_dims.begin(), reduce_dims.end());
+
+    for (auto& a : reduceDims)
+    {
+        left_permutation.push_back(a);
+    }
+    left_permutation.insert(left_permutation.end(), ro.begin(), ro.end());
+
+    if (IsTransposeRequired(!currentLeft.empty() ? currentLeft.dims : leftDims.size(),
+                                        left_permutation))
+    {
+        if (!currentLeft.empty() && IsTransposeReshapeForEinsum(left_permutation,
+                                                                shape(currentLeft),
+                                                                reshaped_dims))
+        {
+            // This can be done because curent_* tensors (if they exist) and output tensors are
+            // intermediate tensors and cannot be input tensors to the Einsum node itself
+            // (which are immutable).
+            currentLeft = currentLeft.reshape(1, reshaped_dims.size(), reshaped_dims.data());
+        } else {
+            // Covered by ExplicitEinsumAsTensorContraction, DiagonalWithMatmul, ...
+            currentLeft = Transpose(!currentLeft.empty() ? currentLeft: left,
+                                    !currentLeft.empty() ? shape(currentLeft) : leftDims,
+                                    left_permutation);
+        }
+    }
+
+    // Permutate the right operand so that the axes order go like this: [lro, reduce_dims, ro, lo]
+    std::vector<size_t> right_permutation;
+    right_permutation.reserve(lro.size() + lo.size() + reduceDims.size() + ro.size());
+    right_permutation.insert(right_permutation.end(), lro.begin(), lro.end());
+    //  right_permutation.insert(right_permutation.end(), reduce_dims.begin(), reduce_dims.end());
+    for (auto& a : reduceDims) {
+        right_permutation.push_back(a);
+    }
+    right_permutation.insert(right_permutation.end(), ro.begin(), ro.end());
+    right_permutation.insert(right_permutation.end(), lo.begin(), lo.end());
+
+    if (IsTransposeRequired(!currentRight.empty() ? currentRight.dims: rightDims.size(),
+                                        right_permutation))
+    {
+        if (!currentRight.empty() && IsTransposeReshapeForEinsum(right_permutation,
+                                                                shape(currentRight),
+                                                                reshaped_dims))
+        {
+            currentRight = currentRight.reshape(1, reshaped_dims.size(), reshaped_dims.data());
+        } else {
+            currentRight = Transpose(!currentRight.empty() ? currentRight : right,
+                                    !currentRight.empty() ? shape(currentRight) : rightDims,
+                                    right_permutation);
+        }
+    }
+
+    // Calculate output size
+    // Output shape will be determined by rules of MatMul:
+    // because we are multiplying two tensors of shapes [lro, lo, reduce_dims] , [lro, reduce_dims, ro]
+    // [dim_value of `lro` dims,
+    //  dim_value of `lo` dims,
+    // `1` for each of the `reduce_dims`,
+    // dim_value of `ro` dims]
+    MatShape outputDims;
+    outputDims.reserve(lro.size() + lo.size() + reduceDims.size() + ro.size());
+    for (size_t i = 0; i < lro.size(); ++i)
+    {
+        outputDims.emplace_back(leftDims[lro[i]]);
+    }
+
+    for (size_t i = 0; i < lo.size(); ++i)
+    {
+        outputDims.emplace_back(leftDims[lo[i]]);
+    }
+
+    for (size_t i = 0; i < reduceDims.size(); ++i)
+    {
+        outputDims.emplace_back(1);  // reduced dimensions will have a value 1 in it
+    }
+
+    for (size_t i = 0; i < ro.size(); ++i) {
+        outputDims.emplace_back(rightDims[ro[i]]);
+    }
+
+    MatShape currentSubscriptOrder;
+    // Calculate output permutation
+    // After the MatMul op, the because the two operands have been permutated,
+    // the output is permutated as well with respect to the original ordering of the axes.
+    // The permutated order will be the dims in: [lro, lo, reduced_dims, ro]
+    // Hence invert the permutation by a permutation that puts the axes in the same ordering
+    std::vector<size_t> outputPermutation;
+    if (!isFinalPair) {  // If this is not the final pair, we need to permutate the result to match the pre-fixed order for the next iteration
+        outputPermutation.resize(lro.size() + lo.size() + reduceDims.size() + ro.size(), 0);
+        size_t iter = 0;
+        for (size_t i = 0; i < lro.size(); ++i)
+        {
+            outputPermutation[lro[i]] = iter++;
+        }
+
+        for (size_t i = 0; i < lo.size(); ++i)
+        {
+            outputPermutation[lo[i]] = iter++;
+        }
+
+        for (size_t i = 0; i < reduceDims.size(); ++i)
+        {
+            outputPermutation[reduceDims[i]] = iter++;
+        }
+
+        for (size_t i = 0; i < ro.size(); ++i)
+        {
+            outputPermutation[ro[i]] = iter++;
+        }
+
+    } else {
+        currentSubscriptOrder.reserve(lro.size() + lo.size() + reduceDims.size() + ro.size());
+        currentSubscriptOrder.insert(currentSubscriptOrder.end(), lro.begin(), lro.end());
+        currentSubscriptOrder.insert(currentSubscriptOrder.end(), lo.begin(), lo.end());
+        currentSubscriptOrder.insert(currentSubscriptOrder.end(), reduceDims.begin(), reduceDims.end());
+        currentSubscriptOrder.insert(currentSubscriptOrder.end(), ro.begin(), ro.end());
+    }
+
+    Mat output = batchwiseMatMul(
+        !currentLeft.empty() ? currentLeft : left,
+        MatShape({static_cast<int>(lro_size), static_cast<int>(lo_size), static_cast<int>(reduced_size)}),
+        !currentRight.empty() ? currentRight : right,
+        MatShape({static_cast<int>(lro_size), static_cast<int>(reduced_size), static_cast<int>(ro_size)})
+        );
+
+    //reshape
+    output = output.reshape(1, outputDims.size(), outputDims.data());
+
+    if (!isFinalPair)
+    {  // This is not the final pair - so bring the axes order to what the inputs conformed to
+        if (IsTransposeRequired(outputDims.size(), outputPermutation))
+        {
+            if (IsTransposeReshapeForEinsum(outputPermutation,
+                                            outputDims,
+                                            reshaped_dims))
+            {
+                // See note following the previous call of function IsTransposeReshapeForEinsum.
+                // Covered by ExplicitEinsumAsTensorContractionReshapeFinal.
+                output = output.reshape(1, reshaped_dims.size(), reshaped_dims.data());
+            }
+            else {
+                output = Transpose(
+                    output,
+                    outputDims,
+                    outputPermutation);
+            }
+        }
+    } else {  // This is the final pair - Transpose directly to the output ordering required and copy the contents to the op's output
+        // not sure if this finalize shape is needed at all
+        output = FinalizeOutput(output, currentSubscriptOrder);
+    }
+    return output;
+};
+
+Mat LayerEinsumImpl::batchwiseMatMul(
+    const Mat& input1,
+    const MatShape& input1ShapeOverride,
+    const Mat& input2,
+    const MatShape& input2ShapeOverride)
+{
+    // Sanity checks before the actual MatMul
+    CV_CheckType(input1.type(), input2.type(), "Data types of the inputs must match for MatMul");
+    CV_CheckEQ(input1ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul");
+    CV_CheckEQ(input2ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul");
+    CV_CheckEQ((size_t) input1ShapeOverride[0], (size_t) input2ShapeOverride[0], "Batch dimension should match for MatMul;");
+    CV_CheckEQ((size_t) input1ShapeOverride[2], (size_t) input2ShapeOverride[1], "Incompatible matrix dimensions for matMul");
+
+    int batches = input1ShapeOverride[0];
+    int M = input1ShapeOverride[1];
+    int K = input1ShapeOverride[2];
+    int N = input2ShapeOverride[2];
+
+    Mat reshapedInput1 = input1;
+    Mat reshapedInput2 = input2;
+
+    Mat output;
+    if (batches > 1)
+    {
+        // create tmpout with type like input1
+        output = Mat({batches, M, N}, input1.type());
+
+        reshapedInput2 = reshapedInput2.reshape(1, input2ShapeOverride);
+        reshapedInput1 = reshapedInput1.reshape(1, input1ShapeOverride);
+
+        fastGemmBatch(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, output, opt);
+    } else {
+
+        // input1 should of size MxK
+        if (input1.dims > 2 || input1.size[0] != M || input1.size[1] != K)
+        {
+            int shape[] = {M, K};
+            reshapedInput1 = input1.reshape(1, 2, shape);
+        }
+
+        // input2 should be of size KxN
+        if (input2.dims > 2 || input2.size[0] != K || input2.size[1] != N)
+        {
+            int shape2[] = {K, N};
+            reshapedInput2 = input2.reshape(1, 2, shape2);
+        }
+
+        output = Mat(M, N, reshapedInput1.type());
+        fastGemm(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, output, opt);
+
+        output = output.reshape(1, {1, M, N});
+    }
+    return output;
+};
+Ptr<EinsumLayer> EinsumLayer::create(const LayerParams& params)
+{
+    return makePtr<LayerEinsumImpl>(params);
+}
+
+}} // namespace cv::dnn
diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index 4c2e0f3b076c..6b7909b1b7bb 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -138,7 +138,7 @@ class ElementWiseLayer : public Func::Layer
             {
                 const float* srcptr = src_->ptr<float>(i) + stripeStart;
                 float* dstptr = dst_->ptr<float>(i) + stripeStart;
-                func_->apply(srcptr, dstptr, (int)(stripeEnd - stripeStart), planeSize, 0, outCn);
+                func_->apply(srcptr, dstptr, stripeStart, (int)(stripeEnd - stripeStart), planeSize, 0, outCn);
             }
         }
     };
@@ -243,7 +243,7 @@ class ElementWiseLayer : public Func::Layer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget),
                    func.applyOCL(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -268,7 +268,7 @@ class ElementWiseLayer : public Func::Layer
 
     void forwardSlice(const float* src, float* dst, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE
     {
-        func.apply(src, dst, len, planeSize, cn0, cn1);
+        func.apply(src, dst, -1, len, planeSize, cn0, cn1);
     }
 
 #ifdef HAVE_CUDA
@@ -355,8 +355,9 @@ struct ReLUFunctor : public BaseFunctor
                backendId == DNN_BACKEND_CANN;
     }
 
-    void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const
     {
+        CV_UNUSED(stripeStart);
         float s = slope;
         for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
         {
@@ -369,10 +370,10 @@ struct ReLUFunctor : public BaseFunctor
                 v_float32x4 x1 = v_load(srcptr + i + 4);
                 v_float32x4 x2 = v_load(srcptr + i + 8);
                 v_float32x4 x3 = v_load(srcptr + i + 12);
-                x0 = v_select(x0 >= z, x0, x0*s4);
-                x1 = v_select(x1 >= z, x1, x1*s4);
-                x2 = v_select(x2 >= z, x2, x2*s4);
-                x3 = v_select(x3 >= z, x3, x3*s4);
+                x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s4));
+                x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s4));
+                x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s4));
+                x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s4));
                 v_store(dstptr + i, x0);
                 v_store(dstptr + i + 4, x1);
                 v_store(dstptr + i + 8, x2);
@@ -489,13 +490,13 @@ struct ReLUFunctor : public BaseFunctor
 #endif
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
         if (slope) {
-            auto param = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &slope);
-            return std::make_shared<ngraph::op::PRelu>(node, param);
+            auto param = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &slope);
+            return std::make_shared<ov::op::v0::PRelu>(node, param);
         }
-        return std::make_shared<ngraph::op::Relu>(node);
+        return std::make_shared<ov::op::v0::Relu>(node);
     }
 #endif  // HAVE_DNN_NGRAPH
 
@@ -559,8 +560,9 @@ struct ReLU6Functor : public BaseFunctor
                backendId == DNN_BACKEND_CANN;
     }
 
-    void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const
     {
+        CV_UNUSED(stripeStart);
         for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
         {
             int i = 0;
@@ -672,9 +674,9 @@ struct ReLU6Functor : public BaseFunctor
 
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
-        return std::make_shared<ngraph::op::Clamp>(node, minValue, maxValue);
+        return std::make_shared<ov::op::v0::Clamp>(node, minValue, maxValue);
     }
 #endif  // HAVE_DNN_NGRAPH
 
@@ -704,8 +706,9 @@ struct ReLU6Functor : public BaseFunctor
 template <class T>
 struct BaseDefaultFunctor : public BaseFunctor
 {
-    void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const
     {
+        CV_UNUSED(stripeStart);
         for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
         {
             for( int i = 0; i < len; i++ )
@@ -793,7 +796,7 @@ struct BaseDefaultFunctor : public BaseFunctor
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
         CV_Error(Error::StsNotImplemented, "");
     }
@@ -818,7 +821,7 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
 
     bool supportBackend(int backendId, int)
     {
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
     }
 
     inline float calculate(float x) const
@@ -826,6 +829,13 @@ struct GeluFunctor : public BaseDefaultFunctor<GeluFunctor>
         return 0.5f * x * (1.0f + erf(x * M_SQRT1_2));
     }
 
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
+    {
+        return make_cuda_node<cuda4dnn::GeluOp>(target, stream);
+    }
+#endif
+
     int64 getFLOPSPerElement() const { return 100; }
 };
 
@@ -919,9 +929,9 @@ struct TanHFunctor : public BaseDefaultFunctor<TanHFunctor>
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
-        return std::make_shared<ngraph::op::Tanh>(node);
+        return std::make_shared<ov::op::v0::Tanh>(node);
     }
 #endif  // HAVE_DNN_NGRAPH
 
@@ -988,10 +998,10 @@ struct SwishFunctor : public BaseDefaultFunctor<SwishFunctor>
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
-        auto sigmoid = std::make_shared<ngraph::op::Sigmoid>(node);
-        return std::make_shared<ngraph::op::v1::Multiply>(node, sigmoid);
+        auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(node);
+        return std::make_shared<ov::op::v1::Multiply>(node, sigmoid);
     }
 #endif  // HAVE_DNN_NGRAPH
 
@@ -1064,15 +1074,9 @@ struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
-        float one = 1.0f;
-        auto constant = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &one);
-        auto exp_node = std::make_shared<ngraph::op::v0::Exp>(node);
-        auto sum = std::make_shared<ngraph::op::v1::Add>(constant, exp_node, ngraph::op::AutoBroadcastType::NUMPY);
-        auto log_node = std::make_shared<ngraph::op::v0::Log>(sum);
-        auto tanh_node = std::make_shared<ngraph::op::Tanh>(log_node);
-        return std::make_shared<ngraph::op::v1::Multiply>(node, tanh_node);
+        return std::make_shared<ov::op::v4::Mish>(node);
     }
 #endif  // HAVE_DNN_NGRAPH
 
@@ -1147,9 +1151,9 @@ struct SigmoidFunctor : public BaseDefaultFunctor<SigmoidFunctor>
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
-        return std::make_shared<ngraph::op::Sigmoid>(node);
+        return std::make_shared<ov::op::v0::Sigmoid>(node);
     }
 #endif  // HAVE_DNN_NGRAPH
 
@@ -1227,9 +1231,9 @@ struct ELUFunctor : public BaseDefaultFunctor<ELUFunctor>
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
-        return std::make_shared<ngraph::op::Elu>(node, alpha);
+        return std::make_shared<ov::op::v0::Elu>(node, alpha);
     }
 #endif  // HAVE_DNN_NGRAPH
 
@@ -1297,12 +1301,9 @@ struct AbsValFunctor : public BaseDefaultFunctor<AbsValFunctor>
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
-        float coeff = -0.999999f;
-        // float coeff = preferableTarget == DNN_TARGET_MYRIAD ? -0.999f : -0.999999f;
-        auto slope = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeff);
-        return std::make_shared<ngraph::op::PRelu>(node, slope);
+        return std::make_shared<ov::op::v0::Abs>(node);
     }
 #endif  // HAVE_DNN_NGRAPH
 
@@ -1593,9 +1594,9 @@ struct SqrtFunctor : public BaseDefaultFunctor<SqrtFunctor>
 #endif  // HAVE_HALIDE
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
-        return std::make_shared<ngraph::op::v0::Sqrt>(node);
+        return std::make_shared<ov::op::v0::Sqrt>(node);
     }
 #endif  // HAVE_DNN_NGRAPH
 
@@ -1889,7 +1890,9 @@ struct HardSwishFunctor : public BaseDefaultFunctor<HardSwishFunctor>
 
     bool supportBackend(int backendId, int)
     {
-        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_CUDA   ||
+               backendId == DNN_BACKEND_CANN;
     }
 
     inline float calculate(float x) const
@@ -1904,6 +1907,27 @@ struct HardSwishFunctor : public BaseDefaultFunctor<HardSwishFunctor>
     }
 #endif
 
+#ifdef HAVE_CANN
+    Ptr<BackendNode> initCannOp(const std::string& name,
+                                const std::vector<Ptr<BackendWrapper> > &inputs,
+                                const std::vector<Ptr<BackendNode> >& nodes)
+    {
+        auto x = inputs[0].dynamicCast<CannBackendWrapper>();
+
+        auto op = std::make_shared<ge::op::HardSwish>(name);
+
+        auto op_x = nodes[0].dynamicCast<CannBackendNode>()->getOp();
+        op->set_input_x_by_name(*op_x, x->name.c_str());
+        auto x_desc = x->getTensorDesc();
+        op->update_input_desc_x(*x_desc);
+
+        auto output_desc = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
+        op->update_output_desc_y(*output_desc);
+
+        return Ptr<BackendNode>(new CannBackendNode(op));
+    }
+#endif
+
     int64 getFLOPSPerElement() const { return 1; }
 };
 
@@ -2226,8 +2250,9 @@ struct PowerFunctor : public BaseFunctor
         shift = originShift;
     }
 
-    void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const
     {
+        CV_UNUSED(stripeStart);
         float a = scale, b = shift, p = power;
         if( p == 1.f )
         {
@@ -2318,22 +2343,22 @@ struct PowerFunctor : public BaseFunctor
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
-        auto scale_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
-                                                                 ngraph::Shape{1}, &scale);
-        auto shift_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
-                                                                 ngraph::Shape{1}, &shift);
+        auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                                                                 ov::Shape{1}, &scale);
+        auto shift_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                                                                 ov::Shape{1}, &shift);
 
-        auto mul = std::make_shared<ngraph::op::v1::Multiply>(scale_node, node, ngraph::op::AutoBroadcastType::NUMPY);
-        auto scale_shift = std::make_shared<ngraph::op::v1::Add>(mul, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
+        auto mul = std::make_shared<ov::op::v1::Multiply>(scale_node, node, ov::op::AutoBroadcastType::NUMPY);
+        auto scale_shift = std::make_shared<ov::op::v1::Add>(mul, shift_node, ov::op::AutoBroadcastType::NUMPY);
 
         if (power == 1)
             return scale_shift;
 
-        auto power_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
-                                                                 ngraph::Shape{1}, &power);
-        return std::make_shared<ngraph::op::v1::Power>(scale_shift, power_node, ngraph::op::AutoBroadcastType::NUMPY);
+        auto power_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                                                                 ov::Shape{1}, &power);
+        return std::make_shared<ov::op::v1::Power>(scale_shift, power_node, ov::op::AutoBroadcastType::NUMPY);
     }
 #endif  // HAVE_DNN_NGRAPH
 
@@ -2428,15 +2453,15 @@ struct ExpFunctor : public BaseDefaultFunctor<ExpFunctor>
 #endif  // HAVE_HALIDE
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
-        auto scale_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
-                                                                 ngraph::Shape{1}, &normScale);
-        auto shift_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
-                                                                 ngraph::Shape{1}, &normShift);
-        auto mul = std::make_shared<ngraph::op::v1::Multiply>(scale_node, node, ngraph::op::AutoBroadcastType::NUMPY);
-        auto scale_shift = std::make_shared<ngraph::op::v1::Add>(mul, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
-        return std::make_shared<ngraph::op::v0::Exp>(scale_shift);
+        auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                                                                 ov::Shape{1}, &normScale);
+        auto shift_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                                                                 ov::Shape{1}, &normShift);
+        auto mul = std::make_shared<ov::op::v1::Multiply>(scale_node, node, ov::op::AutoBroadcastType::NUMPY);
+        auto scale_shift = std::make_shared<ov::op::v1::Add>(mul, shift_node, ov::op::AutoBroadcastType::NUMPY);
+        return std::make_shared<ov::op::v0::Exp>(scale_shift);
     }
 #endif  // HAVE_DNN_NGRAPH
 
@@ -2452,6 +2477,7 @@ struct ChannelsPReLUFunctor : public BaseFunctor
     Mat scale;
 #ifdef HAVE_OPENCL
     UMat scale_umat;
+    std::string oclKernelName = "ChannelsPReLUForward";
 #endif
 
     explicit ChannelsPReLUFunctor(const Mat& scale_=Mat()) : scale(scale_)
@@ -2470,8 +2496,9 @@ struct ChannelsPReLUFunctor : public BaseFunctor
                backendId == DNN_BACKEND_CANN;
     }
 
-    void apply(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const
     {
+        CV_UNUSED(stripeStart);
         CV_Assert(scale.isContinuous() && scale.type() == CV_32F);
 
         const float* scaleptr = scale.ptr<float>();
@@ -2489,10 +2516,10 @@ struct ChannelsPReLUFunctor : public BaseFunctor
                 v_float32x4 x1 = v_load(srcptr + i + 4);
                 v_float32x4 x2 = v_load(srcptr + i + 8);
                 v_float32x4 x3 = v_load(srcptr + i + 12);
-                x0 = v_select(x0 >= z, x0, x0*s4);
-                x1 = v_select(x1 >= z, x1, x1*s4);
-                x2 = v_select(x2 >= z, x2, x2*s4);
-                x3 = v_select(x3 >= z, x3, x3*s4);
+                x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s4));
+                x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s4));
+                x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s4));
+                x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s4));
                 v_store(dstptr + i, x0);
                 v_store(dstptr + i + 4, x1);
                 v_store(dstptr + i + 8, x2);
@@ -2525,7 +2552,7 @@ struct ChannelsPReLUFunctor : public BaseFunctor
             UMat& src = inputs[i];
             UMat& dst = outputs[i];
 
-            ocl::Kernel kernel("PReLUForward", ocl::dnn::activations_oclsrc, buildopt);
+            ocl::Kernel kernel(oclKernelName.c_str(), ocl::dnn::activations_oclsrc, buildopt);
             kernel.set(0, (int)src.total());
             kernel.set(1, (int)src.size[1]);
             kernel.set(2, (int)total(shape(src), 2));
@@ -2585,11 +2612,11 @@ struct ChannelsPReLUFunctor : public BaseFunctor
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
     {
         const size_t numChannels = scale.total();
-        auto slope = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numChannels}, scale.data);
-        return std::make_shared<ngraph::op::PRelu>(node, slope);
+        auto slope = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{numChannels}, scale.data);
+        return std::make_shared<ov::op::v0::PRelu>(node, slope);
     }
 #endif  // HAVE_DNN_NGRAPH
 
@@ -2605,6 +2632,75 @@ struct ChannelsPReLUFunctor : public BaseFunctor
     int64 getFLOPSPerElement() const { return 1; }
 };
 
+struct PReLUFunctor : public ChannelsPReLUFunctor
+{
+    explicit PReLUFunctor(const Mat& scale_=Mat()) : ChannelsPReLUFunctor(scale_)
+    {
+#ifdef HAVE_OPENCL
+        oclKernelName = "PReLUForward";
+#endif
+    }
+
+    bool supportBackend(int backendId, int)
+    {
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_CANN ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
+    }
+
+    void apply(const float* srcptr, float* dstptr, int stripeStart, int len, size_t planeSize, int cn0, int cn1) const
+    {
+        CV_UNUSED(stripeStart);
+        CV_Assert(scale.isContinuous() && scale.type() == CV_32F);
+
+        if (stripeStart < 0)
+            CV_Error(Error::StsNotImplemented, "PReLUFunctor requires stripe offset parameter");
+
+        const float* scaleptr = scale.ptr<float>() + cn0 * planeSize + stripeStart;
+        for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize, scaleptr += planeSize )
+        {
+            int i = 0;
+        #if CV_SIMD128
+            v_float32x4 z = v_setzero_f32();
+            for( ; i <= len - 16; i += 16 )
+            {
+                v_float32x4 x0 = v_load(srcptr + i);
+                v_float32x4 x1 = v_load(srcptr + i + 4);
+                v_float32x4 x2 = v_load(srcptr + i + 8);
+                v_float32x4 x3 = v_load(srcptr + i + 12);
+                v_float32x4 s0 = v_load(scaleptr + i);
+                v_float32x4 s1 = v_load(scaleptr + i + 4);
+                v_float32x4 s2 = v_load(scaleptr + i + 8);
+                v_float32x4 s3 = v_load(scaleptr + i + 12);
+                x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s0));
+                x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s1));
+                x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s2));
+                x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s3));
+                v_store(dstptr + i, x0);
+                v_store(dstptr + i + 4, x1);
+                v_store(dstptr + i + 8, x2);
+                v_store(dstptr + i + 12, x3);
+            }
+        #endif
+            for( ; i < len; i++ )
+            {
+                float x = srcptr[i];
+                float s = scaleptr[i];
+                dstptr[i] = x >= 0.f ? x : s*x;
+            }
+        }
+    }
+
+#ifdef HAVE_DNN_NGRAPH
+    std::shared_ptr<ov::Node> initNgraphAPI(const ov::Output<ov::Node>& node)
+    {
+        auto shape = getShape<size_t>(scale);
+        auto slope = std::make_shared<ov::op::v0::Constant>(ov::element::f32, shape, scale.ptr<float>());
+        return std::make_shared<ov::op::v0::PRelu>(node, slope);
+    }
+#endif  // HAVE_DNN_NGRAPH
+};
+
 struct SignFunctor : public BaseDefaultFunctor<SignFunctor>
 {
     typedef SignLayer Layer;
@@ -2695,11 +2791,6 @@ template<>
 const char* const ReciprocalFunctor::BaseDefaultFunctor<ReciprocalFunctor>::ocl_kernel_name = "ReciprocalForward";
 
 
-#define ACTIVATION_CREATOR_FOR(_Layer, _Functor, ...) \
-Ptr<_Layer> _Layer::create() { \
-    return return Ptr<_Layer>( new ElementWiseLayer<_Functor>(_Functor()) ); }
-
-
 Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
 {
     float negativeSlope = params.get<float>("negative_slope", 0.f);
@@ -3040,13 +3131,26 @@ Ptr<ExpLayer> ExpLayer::create(const LayerParams& params)
 Ptr<Layer> ChannelsPReLULayer::create(const LayerParams& params)
 {
     CV_Assert(params.blobs.size() == 1);
-    if (params.blobs[0].total() == 1)
+    Mat scale = params.blobs[0];
+    float slope = *scale.ptr<float>();
+    if (scale.total() == 1 || countNonZero(scale != slope) == 0)
     {
         LayerParams reluParams = params;
-        reluParams.set("negative_slope", *params.blobs[0].ptr<float>());
+        reluParams.set("negative_slope", slope);
         return ReLULayer::create(reluParams);
     }
-    Ptr<ChannelsPReLULayer> l(new ElementWiseLayer<ChannelsPReLUFunctor>(ChannelsPReLUFunctor(params.blobs[0])));
+
+    Ptr<Layer> l;
+    // Check first two dimensions of scale (batch, channels)
+    MatShape scaleShape = shape(scale);
+    if (std::count_if(scaleShape.begin(), scaleShape.end(), [](int d){ return d != 1;}) > 1)
+    {
+        l = new ElementWiseLayer<PReLUFunctor>(PReLUFunctor(scale));
+    }
+    else
+    {
+        l = new ElementWiseLayer<ChannelsPReLUFunctor>(ChannelsPReLUFunctor(scale));
+    }
     l->setParamsFrom(params);
 
     return l;
diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp
index 8ed1b799eb95..e9363bcbea8d 100644
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -590,7 +590,7 @@ class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
         std::vector<UMat> inputs;
         std::vector<UMat> outputs;
 
-        if ((inputs_.depth() == CV_16S && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME))
+        if ((inputs_.depth() == CV_16F && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME))
             return false;
 
         if (hasVecInput)
@@ -610,7 +610,7 @@ class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
                         size_t localsize[] = { 128 };
                         size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
                         String opts;
-                        if (inputs_.depth() == CV_16S)
+                        if (inputs_.depth() == CV_16F)
                             opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
                         else
                             opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";
@@ -636,7 +636,7 @@ class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
                     }
                     else
                     {
-                        if (inputs_.depth() == CV_16S)
+                        if (inputs_.depth() == CV_16F)
                             return false;
 
                         float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
@@ -689,7 +689,7 @@ class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -896,29 +896,32 @@ class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
+        CV_Assert(nodes.size() >= 2);
         auto curr_node = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
         if (!coeffs.empty()) {
-            auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[0]);
-            curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
+            auto coeff = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &coeffs[0]);
+            curr_node = std::make_shared<ov::op::v1::Multiply>(curr_node, coeff, ov::op::AutoBroadcastType::NUMPY);
         }
 
+        std::shared_ptr<ov::Node> res;
         for (size_t i = 1; i < nodes.size(); i++)
         {
             auto next_node = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
             if (!coeffs.empty()) {
-                auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[i]);
-                next_node = std::make_shared<ngraph::op::v1::Multiply>(next_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
+                auto coeff = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &coeffs[i]);
+                next_node = std::make_shared<ov::op::v1::Multiply>(next_node, coeff, ov::op::AutoBroadcastType::NUMPY);
             }
             switch (op) {
-                case SUM:  curr_node = std::make_shared<ngraph::op::v1::Add>(curr_node, next_node); break;
-                case PROD: curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, next_node); break;
-                case DIV:  curr_node = std::make_shared<ngraph::op::v1::Divide>(curr_node, next_node); break;
-                case MAX:  curr_node = std::make_shared<ngraph::op::v1::Maximum>(curr_node, next_node); break;
-                case MIN:  curr_node = std::make_shared<ngraph::op::v1::Minimum>(curr_node, next_node); break;
+                case SUM:  res = std::make_shared<ov::op::v1::Add>(curr_node, next_node); break;
+                case PROD: res = std::make_shared<ov::op::v1::Multiply>(curr_node, next_node); break;
+                case DIV:  res = std::make_shared<ov::op::v1::Divide>(curr_node, next_node); break;
+                case MAX:  res = std::make_shared<ov::op::v1::Maximum>(curr_node, next_node); break;
+                case MIN:  res = std::make_shared<ov::op::v1::Minimum>(curr_node, next_node); break;
                 default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
             }
+            curr_node = res;
         }
-        return Ptr<BackendNode>(new InfEngineNgraphNode(curr_node));
+        return Ptr<BackendNode>(new InfEngineNgraphNode(res));
     }
 #endif  // HAVE_DNN_NGRAPH
 
diff --git a/modules/dnn/src/layers/expand_layer.cpp b/modules/dnn/src/layers/expand_layer.cpp
new file mode 100644
index 000000000000..09f2d78c4f10
--- /dev/null
+++ b/modules/dnn/src/layers/expand_layer.cpp
@@ -0,0 +1,172 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "../op_inf_engine.hpp"
+#include "../ie_ngraph.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace cv { namespace dnn {
+
+class ExpandLayerImpl CV_FINAL : public ExpandLayer
+{
+public:
+    ExpandLayerImpl(const LayerParams &params) {
+        setParamsFrom(params);
+
+        // shape as param
+        CV_CheckTrue(params.has("shape"), "DNN/Expand: shape is required in Expand layer initialization");
+        DictValue param_shape = params.get("shape");
+        int ndims_shape = param_shape.size();
+        CV_CheckGT(ndims_shape, 0, "DNN/Expand: ndims of shape must be > 0");
+        target_shape.resize(ndims_shape);
+        for (int i = 0; i < ndims_shape; i++) {
+            target_shape[i] = param_shape.get<int>(i);
+        }
+
+        // FIXME: remove when 0d/1d mat is available
+        const_input_1d = params.get("const_input_1d", false);
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
+    }
+
+    virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                                 const int requiredOutputs,
+                                 std::vector<MatShape> &outputs,
+                                 std::vector<MatShape> &internals) const CV_OVERRIDE {
+        CV_CheckGE(inputs.size(), static_cast<size_t>(1), "DNN/Expand: one input at least");
+        CV_CheckLE(inputs.size(), static_cast<size_t>(2), "DNN/Expand: two input at most");
+        CV_CheckFalse(target_shape.empty(), "DNN/Expand: shape must known before memory is set");
+
+        MatShape input_shape = inputs[0]; // 1d tensor is represented as 2d mat, e.g. [3] -> [3, 1]
+        if (const_input_1d) {
+            input_shape = {inputs[0][0]};
+        }
+
+        auto& moreDimension = input_shape.size() > target_shape.size() ? input_shape : target_shape;
+        auto& lessDimension = input_shape.size() <= target_shape.size() ? input_shape : target_shape;
+
+        /*  Example:
+                             i = 3
+                               |
+            moreDimension: 1 2 3 4 5, assign non-aligned dimensions to output shape
+            lessDimension:     1 1 5, when dimension is aligned, check valid dimension (either equal or one of them is 1) and assign bigger one
+                               |
+                             j = 0 = i - (moreDimension.size() - lessDimension.size());
+        */
+        MatShape outputShape(moreDimension.size(), 1);
+        for (int i = 0; i < moreDimension.size(); i++) {
+            int d = moreDimension[i];
+            int j = i - (moreDimension.size() - lessDimension.size());
+            if (j >= 0) {
+                if (d == 1 || lessDimension[j] == 1 || // broadcast
+                    d == lessDimension[j]) {           // plain copy
+                    outputShape[i] = std::max(d, lessDimension[j]);
+                } else {
+                    CV_Error(Error::StsBadSize, cv::format("DNN/Expand: invalid dimension, d (%d) != d (%d)", moreDimension[i], lessDimension[j]));
+                }
+            } else {
+                outputShape[i] = d;
+            }
+        }
+        outputs.assign(1, outputShape);
+        return false;
+    }
+
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE {
+        std::vector<Mat> inputs;
+        inputs_arr.getMatVector(inputs);
+
+        const auto &input = inputs[0];
+        auto input_shape = shape(input);
+        if (const_input_1d) {
+            input_shape = {input_shape[0]};
+        }
+
+        auto& moreDimension = input_shape.size() > target_shape.size() ? input_shape : target_shape;
+        auto& lessDimension = input_shape.size() <= target_shape.size() ? input_shape : target_shape;
+
+        MatShape final_target_shape(moreDimension.size(), 1);
+        for (int i = 0; i < moreDimension.size(); i++) {
+            int d = moreDimension[i];
+            int j = i - (moreDimension.size() - lessDimension.size());
+            if (j >= 0) {
+                final_target_shape[i] = std::max(lessDimension[j], d);
+            } else {
+                final_target_shape[i] = d;
+            }
+        }
+        target_shape.clear();
+        target_shape = std::move(final_target_shape);
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        if (inputs_arr.depth() == CV_16F)
+        {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        int target_shape_total = std::accumulate(target_shape.begin(), target_shape.end(), 1, std::multiplies<int>());
+        if (target_shape_total == inputs[0].total()) {
+            const char *data = inputs[0].ptr<const char>();
+            char *output = outputs[0].ptr<char>();
+            int step = target_shape_total * outputs[0].elemSize();
+            std::memcpy(output, data, step);
+            return;
+        }
+
+        if (const_input_1d) {
+            const char *data = inputs[0].ptr<const char>();
+            char *output = outputs[0].ptr<char>();
+            int step = target_shape.back() * outputs[0].elemSize();
+            int total = std::accumulate(target_shape.begin(), target_shape.end() - 1, 1, std::multiplies<int>());
+            for (int i = 0; i < total; i++) {
+                std::memcpy(output + i * step, data, step);
+            }
+        } else {
+            cv::broadcast(inputs[0], target_shape, outputs[0]);
+        }
+    }
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        auto input_shape = nodes[0].dynamicCast<InfEngineNgraphNode>()->node.get_shape();
+        CV_CheckGE(target_shape.size(), input_shape.size(), "");
+
+        std::vector<int32_t> output_shape(target_shape.begin(), target_shape.end());
+        for (int i = 1; i < input_shape.size() + 1; ++i)
+            output_shape[output_shape.size() - i] = std::max(
+                (int32_t)input_shape[input_shape.size() - i],
+                output_shape[output_shape.size() - i]);
+
+        auto shape_node = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{output_shape.size()}, output_shape.data());
+        auto expand = std::make_shared<ov::op::v3::Broadcast>(nodes[0].dynamicCast<InfEngineNgraphNode>()->node, shape_node);
+        return Ptr<BackendNode>(new InfEngineNgraphNode(expand));
+    }
+#endif  // HAVE_DNN_NGRAPH
+
+private:
+    MatShape target_shape;
+    bool const_input_1d;
+};
+
+Ptr<ExpandLayer> ExpandLayer::create(const LayerParams &params) {
+    return makePtr<ExpandLayerImpl>(params);
+}
+
+}}  // cv::dnn
diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp
index 6a502af7e927..48950601f2fd 100644
--- a/modules/dnn/src/layers/flatten_layer.cpp
+++ b/modules/dnn/src/layers/flatten_layer.cpp
@@ -56,6 +56,7 @@
 using namespace cv::dnn::cuda4dnn;
 #endif
 
+
 namespace cv
 {
 namespace dnn
@@ -209,7 +210,7 @@ class FlattenLayerImpl CV_FINAL : public FlattenLayer
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        std::vector<size_t> dims = ieInpNode->get_shape();
+        std::vector<size_t> dims = ieInpNode.get_shape();
 
         int numAxes = dims.size();
         int startAxis = normalize_axis(_startAxis, numAxes);
@@ -224,9 +225,9 @@ class FlattenLayerImpl CV_FINAL : public FlattenLayer
         outputShapeVec.push_back(flattenedDimensionSize);
         outputShapeVec.insert(outputShapeVec.end(), dims.begin() + endAxis + 1, dims.end());
 
-        auto shape   = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                       ngraph::Shape({outputShapeVec.size()}), outputShapeVec.data());
-        auto reshape = std::make_shared<ngraph::op::v1::Reshape>(ieInpNode, shape, true);
+        auto shape   = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                       ov::Shape({outputShapeVec.size()}), outputShapeVec.data());
+        auto reshape = std::make_shared<ov::op::v1::Reshape>(ieInpNode, shape, true);
         return Ptr<BackendNode>(new InfEngineNgraphNode(reshape));
     }
 #endif  // HAVE_DNN_NGRAPH
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index e0fdac1039a0..4ff6fc74a420 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -180,15 +180,12 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
         bool tranAorB = transA || transB;
-#ifdef HAVE_INF_ENGINE
-        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
-            return axis == 1 && !tranAorB;
-#endif
         return backendId == DNN_BACKEND_OPENCV ||
                backendId == DNN_BACKEND_CUDA ||
                (backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 && !tranAorB) ||
                (backendId == DNN_BACKEND_WEBNN && axis == 1 && !tranAorB) ||
                backendId == DNN_BACKEND_CANN ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
                (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !tranAorB);
     }
 
@@ -311,7 +308,7 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
                         }
 
                         v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
-                        s += v_load(biasptr + i);
+                        s = v_add(s, v_load(biasptr + i));
                         v_store(dptr + i, s);
                     }
             #endif
@@ -360,7 +357,7 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
         std::vector<UMat> inputs;
         std::vector<UMat> outputs;
 
-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
         inps.getUMatVector(inputs);
         outs.getUMatVector(outputs);
 
@@ -388,9 +385,9 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
 
                 if (use_half)
                 {
-                    convertFp16(A, A_fp32);
-                    convertFp16(B, B_fp32);
-                    convertFp16(C, C_fp32);
+                    A.convertTo(A_fp32, CV_32F);
+                    B.convertTo(B_fp32, CV_32F);
+                    C.convertTo(C_fp32, CV_32F);
                 }
                 else
                 {
@@ -401,9 +398,9 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
                 cv::gemm(A_fp32, B_fp32, 1, noArray(), 0, C_fp32);
                 if (use_half)
                 {
-                    convertFp16(A_fp32, A);
-                    convertFp16(B_fp32, B);
-                    convertFp16(C_fp32, C);
+                    A_fp32.convertTo(A, CV_16F);
+                    B_fp32.convertTo(B, CV_16F);
+                    C_fp32.convertTo(C, CV_16F);
                 }
             }
             return true;
@@ -434,7 +431,7 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
                 for (int i = 0; i < umat_blobs.size(); i++)
                 {
                     if (!umat_blobs[i].empty())
-                        convertFp16(umat_blobs[i], half_blobs[i]);
+                        umat_blobs[i].convertTo(half_blobs[i], CV_16F);
                 }
             }
 
@@ -458,13 +455,6 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
                 ret = false;
                 break;
             }
-
-            if (!use_half && bias && (outerSize > 1))
-            {
-                UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
-                UMat& biases = umat_blobs[1];
-                cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
-            }
         }
 
         if (ret) return true;
@@ -482,8 +472,8 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
 
             if (use_half)
             {
-                convertFp16(srcMat, srcMat_fp32);
-                convertFp16(dstMat, dstMat_fp32);
+                srcMat.convertTo(srcMat_fp32, CV_32F);
+                dstMat.convertTo(dstMat_fp32, CV_32F);
             }
             else
             {
@@ -501,8 +491,8 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
             }
             if (use_half)
             {
-                convertFp16(srcMat_fp32, srcMat);
-                convertFp16(dstMat_fp32, dstMat);
+                srcMat_fp32.convertTo(srcMat, CV_16F);
+                dstMat_fp32.convertTo(dstMat, CV_16F);
             }
         }
 
@@ -518,7 +508,7 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && !isMatMul,
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -630,8 +620,10 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
 
             if(input_wrapper->getRank() == inp2Dim)
                 return make_cuda_node<cuda4dnn::MatMulOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), oriMat, biasMat_, transA, transB);
-            else
+            else {
+                CV_LOG_INFO(NULL, "DNN/CUDA: no implementation for MatMul with rank " << input_wrapper->getRank());
                 return Ptr<BackendNode>();
+            }
         }
 
         auto flatten_start_axis = normalize_axis(axis, input_wrapper->getRank());
@@ -795,28 +787,37 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        std::shared_ptr<ngraph::Node> matmul;
+        std::shared_ptr<ov::Node> matmul;
 
         if (nodes.size() == 2)
         {
             auto& inp2 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
-            matmul = std::make_shared<ngraph::op::MatMul>(ieInpNode, inp2, false, false);
+            matmul = std::make_shared<ov::op::v0::MatMul>(ieInpNode, inp2, transA, transB);
         }
         else
         {
-            std::vector<int64_t> data = {(int64_t)ieInpNode->get_shape()[0], (int64_t)blobs[0].size[1]};
-            auto new_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, data.data());
-            auto inp = std::make_shared<ngraph::op::v1::Reshape>(ieInpNode, new_shape, true);
-
-            std::vector<size_t> weight_shape{(size_t)blobs[0].size[0], (size_t)blobs[0].size[1]};
-            auto ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, weight_shape, blobs[0].data);
-            matmul = std::make_shared<ngraph::op::MatMul>(inp, ieWeights, false, true);
+            std::vector<int> shape(1 + normalize_axis(axis, ieInpNode.get_shape().size()), 0);
+            shape[shape.size() - 1] = -1;
+            auto inp = std::make_shared<ov::op::v1::Reshape>(
+                ieInpNode,
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{shape.size()}, shape.data()),
+                true
+            );
+
+            std::vector<size_t> weight_shape;
+            if (isMatMul) {
+                weight_shape = getShape<size_t>(oriMat);
+            } else {
+                weight_shape = {(size_t)blobs[0].size[0], (size_t)blobs[0].size[1]};
+            }
+            auto ieWeights = std::make_shared<ov::op::v0::Constant>(ov::element::f32, weight_shape, blobs[0].data);
+            matmul = std::make_shared<ov::op::v0::MatMul>(inp, ieWeights, transA, transB);
         }
 
         if (bias) {
-            auto bias_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
-                                              ngraph::Shape{(size_t)blobs[1].size[1]}, blobs[1].data);
-            matmul = std::make_shared<ngraph::op::v1::Add>(matmul, bias_node, ngraph::op::AutoBroadcastType::NUMPY);
+            auto bias_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                                              ov::Shape{(size_t)blobs[1].size[1]}, blobs[1].data);
+            matmul = std::make_shared<ov::op::v1::Add>(matmul, bias_node, ov::op::AutoBroadcastType::NUMPY);
         }
         return Ptr<BackendNode>(new InfEngineNgraphNode(matmul));
     }
diff --git a/modules/dnn/src/layers/gather_elements_layer.cpp b/modules/dnn/src/layers/gather_elements_layer.cpp
new file mode 100644
index 000000000000..de15b439a904
--- /dev/null
+++ b/modules/dnn/src/layers/gather_elements_layer.cpp
@@ -0,0 +1,182 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "../op_inf_engine.hpp"
+#include "../ie_ngraph.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace cv { namespace dnn {
+
+static inline int calculateOffset(int outer_dim, const MatShape &shape_indices, int axis_skip, const MatStep &step_data) {
+    int offset = 0;
+    for (int axis = static_cast<int>(shape_indices.size()) - 2; axis >= 0; axis--) {
+        int dim = shape_indices[axis];
+        if (axis != axis_skip) {
+            offset += (outer_dim % dim) * step_data[axis];
+        }
+        outer_dim /= dim;
+    }
+    return offset;
+}
+
+class GatherElementsLayerImpl CV_FINAL : public GatherElementsLayer
+{
+public:
+    GatherElementsLayerImpl(const LayerParams& params)
+    {
+        setParamsFrom(params);
+        axis = params.get<int>("axis", 0);
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
+    }
+
+    virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                                 const int requiredOutputs,
+                                 std::vector<MatShape> &outputs,
+                                 std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        CV_CheckEQ(inputs.size(), 2ull, "GatherElements: requires two inputs");
+
+        const auto &data = inputs[0];
+        const auto &indices = inputs[1];
+        CV_CheckEQ(data.size(), indices.size(), "GatherElements: data and indices should have the same dimension");
+
+        int normalized_axis = normalize_axis(axis, static_cast<int>(data.size()));
+        CV_CheckGE(normalized_axis, 0, "GatherElements: axis out of range");
+        CV_CheckLT(normalized_axis, static_cast<int>(data.size()), "GatherElements: axis out of range");
+        for (size_t i = 0; i < data.size(); i++) {
+            if (i != normalized_axis) {
+                CV_CheckEQ(data[i], indices[i], "GatherElements: shape mismatched");
+            }
+        }
+
+        outputs.assign(1, inputs[1]); // shape of output is same as indices
+        return false;
+    }
+
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE {
+        std::vector<Mat> inputs;
+        inputs_arr.getMatVector(inputs);
+
+        const auto &data = inputs[0];
+        axis = normalize_axis(axis, data.dims);
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        if (inputs_arr.depth() == CV_16F)
+        {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        const Mat& data = inputs[0];
+        const Mat& indices = inputs[1];
+        Mat& out = outputs[0];
+
+        typeDispatch(outputs[0].type(), data, indices, out);
+    }
+
+    template <typename T>
+    void forward_impl(const Mat& data_, const Mat& indices_,  Mat& out_)
+    {
+        const auto *ptr_data = data_.ptr<const T>();
+        const auto *ptr_indices = indices_.ptr<const T>();
+        auto *ptr_out = out_.ptr<T>();
+
+        const auto shape_data = shape(data_);
+        const auto &step_data = data_.step;
+        const auto shape_indices = shape(indices_);
+
+        int inner_most_dim = shape_indices.back();
+        int axis_dim = shape_data[axis];
+        size_t axis_step = static_cast<size_t>(step_data[axis] / sizeof(T));
+
+        bool innermost_axis = axis == static_cast<int>(shape_data.size() - 1);
+
+        auto fn = [&](const Range &r) {
+            for (int i = r.start; i < r.end; i++) {
+                auto *data = ptr_data + static_cast<size_t>(calculateOffset(i, shape_indices, axis, step_data) / sizeof(T));
+                auto *indices = ptr_indices + i * inner_most_dim;
+                auto *out = ptr_out + i * inner_most_dim;
+
+                if (innermost_axis) {
+                    for (int j = 0; j < inner_most_dim; j++) {
+                        int index = static_cast<int>((indices[j] + axis_dim)) % axis_dim; // TODO: Check out-of-range index
+                        out[j] = data[index];
+                    }
+                } else {
+                    for (int j = 0; j < inner_most_dim; j++) {
+                        int index = static_cast<int>(indices[j] + axis_dim) % axis_dim; // TODO: Check out-of-range index
+                        out[j] = data[index * axis_step + j];
+                    }
+                }
+            }
+        };
+
+        int outer_dims = total(shape_indices, 0, shape_indices.size() - 1);
+        double nstripes = static_cast<size_t>(outer_dims * inner_most_dim * (1 / 1024.0));
+        parallel_for_(Range(0, outer_dims), fn, nstripes);
+    }
+
+    template<typename... Args>
+    inline void typeDispatch(const int type, Args&&... args)
+    {
+        switch (type)
+        {
+            case CV_8U:
+                forward_impl<uint8_t>(std::forward<Args>(args)...);
+                break;
+            case CV_32S:
+                forward_impl<int32_t>(std::forward<Args>(args)...);
+                break;
+            case CV_32F:
+                forward_impl<float>(std::forward<Args>(args)...);
+                break;
+            default:
+                CV_Error(cv::Error::BadDepth, "DNN/GatherElements: Unsupported type.");
+        };
+    }
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        int32_t indicesBoundValue = nodes[0].dynamicCast<InfEngineNgraphNode>()->node.get_shape()[axis];
+        auto indicesBound = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, &indicesBoundValue);
+        auto indices = std::make_shared<ov::op::v0::Convert>(nodes[1].dynamicCast<InfEngineNgraphNode>()->node, ov::element::i32);
+        auto indicesNonNegative = std::make_shared<ov::op::v1::Mod>(
+            std::make_shared<ov::op::v1::Add>(indices, indicesBound),
+            indicesBound);
+
+        auto gatherElements = std::make_shared<ov::op::v6::GatherElements>(
+            nodes[0].dynamicCast<InfEngineNgraphNode>()->node,
+            indicesNonNegative,
+            axis);
+        return Ptr<BackendNode>(new InfEngineNgraphNode(gatherElements));
+    }
+#endif  // HAVE_DNN_NGRAPH
+
+private:
+    int axis;
+};
+
+Ptr<GatherElementsLayer> GatherElementsLayer::create(const LayerParams& params)
+{
+    return makePtr<GatherElementsLayerImpl>(params);
+}
+
+}} // namespace cv::dnn
diff --git a/modules/dnn/src/layers/gather_layer.cpp b/modules/dnn/src/layers/gather_layer.cpp
index 924b5fcbc191..7b731e6d545e 100644
--- a/modules/dnn/src/layers/gather_layer.cpp
+++ b/modules/dnn/src/layers/gather_layer.cpp
@@ -3,6 +3,8 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include "../precomp.hpp"
+#include "../op_inf_engine.hpp"
+#include "../ie_ngraph.hpp"
 #include "layers_common.hpp"
 
 
@@ -20,7 +22,8 @@ class GatherLayerImpl CV_FINAL : public GatherLayer
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -57,12 +60,12 @@ class GatherLayerImpl CV_FINAL : public GatherLayer
         const Mat& inp = inputs[0];
 
         int indicesType = inputs[1].type();
-        CV_CheckType(indicesType, indicesType == CV_32FC1 || indicesType == CV_16SC1, "");
+        CV_CheckType(indicesType, indicesType == CV_32FC1 || indicesType == CV_16FC1, "");
         Mat indices32S;
-        if (indicesType == CV_16S/*FP16*/)
+        if (indicesType == CV_16F/*FP16*/)
         {
             Mat indicesF32;
-            convertFp16(inputs[1], indicesF32);
+            inputs[1].convertTo(indicesF32, CV_32F);
             indicesF32.convertTo(indices32S, CV_32S);
         }
         else
@@ -113,6 +116,19 @@ class GatherLayerImpl CV_FINAL : public GatherLayer
         }
     }
 
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        auto axisNode = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, &m_axis);
+        auto gather = std::make_shared<ov::op::v8::Gather>(
+            nodes[0].dynamicCast<InfEngineNgraphNode>()->node,
+            std::make_shared<ov::op::v0::Convert>(nodes[1].dynamicCast<InfEngineNgraphNode>()->node, ov::element::i32),
+            axisNode);
+        return Ptr<BackendNode>(new InfEngineNgraphNode(gather));
+    }
+#endif  // HAVE_DNN_NGRAPH
+
 private:
     // The axis to gather along
     int m_axis;
diff --git a/modules/dnn/src/layers/gemm_layer.cpp b/modules/dnn/src/layers/gemm_layer.cpp
new file mode 100644
index 000000000000..ac0914c2c210
--- /dev/null
+++ b/modules/dnn/src/layers/gemm_layer.cpp
@@ -0,0 +1,381 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+// backends
+#include "../op_cuda.hpp"
+#ifdef HAVE_CUDA
+// #include "../cuda4dnn/primitives/matmul.hpp"
+#include "../cuda4dnn/primitives/inner_product.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+#include "../op_cann.hpp"
+#include "../ie_ngraph.hpp"
+#include "../op_vkcom.hpp"
+
+#include <opencv2/dnn/shape_utils.hpp>
+#include "cpu_kernels/fast_gemm.hpp"
+
+namespace cv { namespace dnn {
+
+class GemmLayerImpl CV_FINAL : public GemmLayer {
+public:
+    GemmLayerImpl(const LayerParams& params) {
+        setParamsFrom(params);
+
+        trans_a = params.get<bool>("transA", false);
+        trans_b = params.get<bool>("transB", false);
+        alpha = params.get<float>("alpha", 1.0f);
+        beta = params.get<float>("beta", 1.0f);
+
+        const_B = params.get<bool>("constB", false); // true means blobs[0] is B
+        const_C = params.get<bool>("constC", false); // true means blobs.back() is C
+        have_bias = params.get<bool>("have_bias", false); // NOTE: have_bias being true does not mean bias is constant
+
+        real_ndims_C = params.get<int>("real_ndims_C", -1);
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE {
+        return backendId == DNN_BACKEND_OPENCV ||
+               (backendId == DNN_BACKEND_CUDA && const_B && !trans_a) ||
+               backendId == DNN_BACKEND_CANN ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
+               (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !have_bias && !trans_a);
+    }
+
+    virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                                 const int requiredOutputs,
+                                 std::vector<MatShape> &outputs,
+                                 std::vector<MatShape> &internals) const CV_OVERRIDE {
+        int num_inputs = static_cast<int>(inputs.size() + blobs.size());
+        CV_CheckGE(num_inputs, 2, "DNN/Gemm: Gemm takes at least two inputs");
+        CV_CheckLE(num_inputs, 3, "DNN/Gemm: Gemm takes at most three inputs");
+
+        // Check whether A and B are two dimensional
+        const auto shape_A = inputs[0];
+        const auto shape_B = const_B ? shape(blobs[0]) : inputs[1];
+        CV_CheckGE(shape_A.size(), static_cast<size_t>(2), "DNN/Gemm: Tensor A must be n-dimensional (n >= 2)");
+        CV_CheckEQ(shape_B.size(), static_cast<size_t>(2), "DNN/Gemm: Tensor B must be two dimensional");
+
+        // Check legal matrix multiplication
+        size_t dims_A = shape_A.size();
+        int ma = shape_A[dims_A - 2], na = shape_A[dims_A - 1];
+        int mb = shape_B[0], nb = shape_B[1];
+        int M = trans_a ? na : ma;
+        int N = trans_b ? mb : nb;
+        int K_a = trans_a ? ma : na;
+        int K_b = trans_b ? nb : mb;
+        CV_CheckEQ(K_a, K_b, "DNN/Gemm: Invalid dimension of dim K");
+
+        // Check whether C can be unidirectional broadcast to (M, N). Handle carefully with 1D Mat.
+        if (have_bias) {
+            const auto shape_C = const_C ? shape(blobs.back()) : inputs.back();
+
+            auto ndims_C = shape_C.size();
+            CV_CheckLE(ndims_C, static_cast<size_t>(2), "DNN/Gemm: C can only be 0d (scalar) / 1d / 2d tensor");
+
+            if (real_ndims_C == 1) { // (1,) or (N,)
+                CV_Check(shape_C[0], shape_C[0] == 1 || shape_C[0] == N, "DNN/Gemm: invalid dimension of C");
+            } else if (real_ndims_C == 2) { // (1, 1) or (1, N) or (M, 1) or (M, N)
+                // printf("shape_C=[%d, %d]\n", shape_C[0], shape_C[1]);
+                CV_Check(shape_C[0], (shape_C[0] == 1 && shape_C[1] == 1) ||
+                                     (shape_C[0] == 1 && shape_C[1] == N) ||
+                                     (shape_C[0] == M && shape_C[1] == 1) ||
+                                     (shape_C[0] == M && shape_C[1] == N),
+                                     "DNN/Gemm: C must be of shape (1, 1) or (1, N) or (M, 1) or (M, N)");
+                if (shape_C[0] == 1) {
+                    CV_Check(shape_C[1], shape_C[1] == 1 || shape_C[1] == N, "DNN/Gemm: invalid dimension of C");
+                } else if (shape_C[0] == M) {
+                    CV_Check(shape_C[1], shape_C[1] == 1 || shape_C[1] == N, "DNN/Gemm: invalid dimension of C");
+                } else {
+                    CV_Error(Error::StsBadSize, "DNN/Gemm: invalid dimension of C");
+                }
+            }
+        }
+
+        int batches = std::accumulate(shape_A.begin(), shape_A.end() - 2, 1, std::multiplies<int>());
+        MatShape shape_y{M * batches, N};
+        outputs.assign(1, shape_y);
+        return false;
+    }
+
+    // TODO: replace with cv::broadcast() once 1d mat is supported
+    // FIXME: fix if conditions if 1d mat is supported properly
+    void broadcastCWtihBeta(int M, int N, const Mat &C) {
+        if (beta != 0 && !C.empty()) {
+            broadcast_C.clear();
+            broadcast_C.resize(M * N, 0.f);
+
+            const float *ptr_c = C.ptr<const float>();
+            const auto shape_C = shape(C);
+            if ((real_ndims_C == 0) || (real_ndims_C == 1 && shape_C[0] == 1) ||
+                (real_ndims_C == 2 && shape_C[0] == 1 && shape_C[1] == 1)) {
+                // (), (1,), (1, 1)
+                float c = *ptr_c;
+                int total = M * N;
+                for (int i = 0; i < total; ++i) {
+                    broadcast_C[i] = beta * c;
+                }
+            } else if ((real_ndims_C == 1 && shape_C[0] == N) ||
+                       (real_ndims_C == 2 && shape_C[0] == 1 && shape_C[1] == N)) {
+                // (N,), (1, N)
+                for (int i = 0; i < M; ++i) {
+                    int step = i * N;
+                    for (int j = 0; j < N; ++j) {
+                        broadcast_C[step + j] = beta * ptr_c[j];
+                    }
+                }
+            } else if (real_ndims_C == 2 && shape_C[0] == M && shape_C[1] == 1) {
+                // (M, 1)
+                for (int i = 0; i < M; ++i) {
+                    int step = i * N;
+                    for (int j = 0; j < N; ++j) {
+                        broadcast_C[step + j] = beta * ptr_c[i];
+                    }
+                }
+            } else {
+                // (M, N)
+                std::transform(ptr_c, ptr_c + M * N, broadcast_C.begin(), [this] (const float &c) {
+                    return this->beta * c; });
+            }
+        }
+    }
+
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE {
+        opt.init();
+
+        // pack B if it is const
+        if (const_B) {
+            fastGemmPackB(blobs[0], packed_B, trans_b, opt);
+        }
+
+        // also pre-broadcast bias
+        if (const_C) {
+            const auto &C = blobs.back();
+
+            std::vector<Mat> outputs;
+            outputs_arr.getMatVector(outputs);
+            const auto &Y = outputs[0];
+            const auto shape_Y = shape(Y);
+            size_t dims_Y = shape_Y.size();
+            int M = shape_Y[dims_Y - 2], N = shape_Y[dims_Y - 1];
+
+            // broadcast
+            broadcastCWtihBeta(M, N, C);
+        }
+    }
+
+    // Y = A * B + C, note that C is unidirectionaly broadcastable to (A * B).
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        if (inputs_arr.depth() == CV_16F)
+        {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        const auto &A = inputs[0];
+        auto &Y = outputs[0];
+
+        const auto shape_A = shape(A), shape_Y = shape(Y);
+        size_t dims_A = shape_A.size();
+        int ma = shape_A[dims_A - 2], na = shape_A[dims_A - 1];
+        size_t dims_Y = shape_Y.size();
+        int M = shape_Y[dims_Y - 2], N = shape_Y[dims_Y - 1];
+        int K = trans_a ? ma : na;
+
+        // broadcast C and copy C to output
+        if (have_bias) {
+            if (!const_C) {
+                broadcastCWtihBeta(M, N, inputs.back());
+            }
+            int step = M * N;
+            CV_CheckEQ(broadcast_C.size(), static_cast<size_t>(step), "DNN/Gemm: C is not broadcast properly");
+            float *ptr_y = Y.ptr<float>();
+            std::memcpy(ptr_y, broadcast_C.data(), step * sizeof(float));
+        } else { // initialization
+            float *ptr_y = Y.ptr<float>();
+            size_t total = Y.total();
+            std::memset(ptr_y, 0, total * sizeof(float));
+        }
+
+        if (const_B) {
+            CV_CheckGT(packed_B.size(), static_cast<size_t>(0), "DNN/Gemm: constant B is not pre-packed");
+            fastGemm(trans_a, M, N, K, alpha, A.ptr<const float>(), na, packed_B.data(), 1.f, Y.ptr<float>(), N, opt);
+        } else {
+            fastGemmBatch(trans_a, trans_b, alpha, A, inputs[1], 1.f, Y, opt);
+        }
+    }
+
+#ifdef HAVE_CUDA
+    // Y = A * B + C. B should be guaranteed as two dimensional.
+    Ptr<BackendNode> initCUDA(void *context_,
+                              const std::vector<Ptr<BackendWrapper>>& inputs,
+                              const std::vector<Ptr<BackendWrapper>>& outputs) CV_OVERRIDE {
+        CV_CheckFalse(trans_a, "DNN/Gemm/Cuda: does not support transA");
+        CV_CheckTrue(const_B, "DNN/Gemm/Cuda: input B (weight) is required to be constant");
+        auto context = reinterpret_cast<csl::CSLContext*>(context_);
+        auto wrapper_A = inputs[0].dynamicCast<CUDABackendWrapper>();
+        auto B = blobs[0];
+        auto C = have_bias && const_C ? blobs[1] : Mat(); // in most cases C is constant
+
+        if (!trans_b)
+            cv::transpose(B, B);
+        auto flatten_start_axis = normalize_axis(1, wrapper_A->getRank());
+        return make_cuda_node<cuda4dnn::InnerProductOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), flatten_start_axis, B, C);
+    }
+#endif // HAVE_CUDA
+
+#ifdef HAVE_CANN
+    // Y = A * B + C.
+    virtual Ptr<BackendNode> initCann(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                      const std::vector<Ptr<BackendWrapper> > &outputs,
+                                      const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE {
+        auto x1 = inputs[0].dynamicCast<CannBackendWrapper>();
+        auto desc_x1 = x1->getTensorDesc();
+        auto op_x1 = nodes[0].dynamicCast<CannBackendNode>()->getOp();
+
+        auto op = std::make_shared<ge::op::MatMulV2>(name);
+
+        // set attributes
+        op->set_attr_transpose_x1(trans_a);
+        op->set_attr_transpose_x2(trans_b);
+
+        // set inputs
+        // set inputs : x1
+        op->set_input_x1_by_name(*op_x1, x1->name.c_str());
+        op->update_input_desc_x1(*desc_x1);
+        // set inputs : x2
+        if (const_B) {
+            auto B = blobs[0];
+            auto op_const_B = std::make_shared<CannConstOp>(B.data, B.type(), shape(B), cv::format("%s_w", name.c_str()));
+            op->set_input_x2_by_name(*(op_const_B->getOp()), "y");
+            op->update_input_desc_x2(*(op_const_B->getTensorDesc()));
+        } else {
+            CV_CheckGE(inputs.size(), static_cast<size_t>(2), "DNN/Gemm/CANN: input B is required since it is not constant");
+            CV_CheckGE(nodes.size(), static_cast<size_t>(2), "DNN/Gemm/CANN: input B is required since it is not constant");
+            auto op_x2 = nodes[1].dynamicCast<CannBackendNode>()->getOp();
+            auto desc_x2 = inputs[1].dynamicCast<CannBackendWrapper>()->getTensorDesc();
+            op->set_input_x2_by_name(*op_x2, "y");
+            op->update_input_desc_x2(*desc_x2);
+        }
+        // set inputs : bias
+        auto mat_C = have_bias && const_C ? blobs.back() : Mat::zeros(1, 1, CV_32F);
+        auto shape_C = shape(mat_C);
+        if (real_ndims_C == 1) {
+            int dim = static_cast<int>(mat_C.total());
+            shape_C = std::vector<int>{dim};
+        }
+        auto op_const_C = std::make_shared<CannConstOp>(mat_C.data, mat_C.type(), shape_C, cv::format("%s_b", name.c_str()));
+        op->set_input_bias(*(op_const_C->getOp()));
+        op->update_input_desc_bias(*(op_const_C->getTensorDesc()));
+
+        // set outputs
+        auto output_desc = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
+        op->update_output_desc_y(*output_desc);
+        return Ptr<BackendNode>(new CannBackendNode(op));
+    }
+#endif // HAVE_CANN
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        ov::Output<ov::Node> nodeA = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        ov::Output<ov::Node> nodeB;
+        if (const_B)
+            nodeB = std::make_shared<ov::op::v0::Constant>(ov::element::f32, getShape(blobs[0]), blobs[0].data);
+        else
+            nodeB = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
+
+        int flatten_axis = nodeA.get_shape().size() - nodeB.get_shape().size();
+        if (flatten_axis > 0) {
+            std::vector<int> shape(1 + flatten_axis, 0);
+            shape[shape.size() - 1] = -1;
+            nodeA = std::make_shared<ov::op::v1::Reshape>(
+                nodeA,
+                std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{shape.size()}, shape.data()),
+                true);
+        }
+
+        std::shared_ptr<ov::Node> nodeAB = std::make_shared<ov::op::v0::MatMul>(nodeA, nodeB, trans_a, trans_b);
+        if (alpha != 1.0f)
+        {
+            nodeAB = std::make_shared<ov::op::v1::Multiply>(
+                nodeAB,
+                std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &alpha));
+        }
+
+        if (!have_bias)
+            return Ptr<BackendNode>(new InfEngineNgraphNode(nodeAB));
+
+        ov::Output<ov::Node> nodeC;
+        if (const_C)
+        {
+            auto shape_C = blobs.back().total() == blobs.back().size[0] ? ov::Shape{blobs.back().total()} : getShape(blobs.back());
+            nodeC = std::make_shared<ov::op::v0::Constant>(ov::element::f32, shape_C, blobs.back().data);
+        }
+        else
+        {
+            nodeC = nodes.back().dynamicCast<InfEngineNgraphNode>()->node;
+        }
+
+        if (beta != 1.0f)
+        {
+            nodeC = std::make_shared<ov::op::v1::Multiply>(
+                nodeC,
+                std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &beta));
+        }
+
+        auto nodeGemm = std::make_shared<ov::op::v1::Add>(nodeAB, nodeC, ov::op::AutoBroadcastType::NUMPY);
+        return Ptr<BackendNode>(new InfEngineNgraphNode(nodeGemm));
+    }
+#endif // HAVE_DNN_NGRAPH
+
+#ifdef HAVE_VULKAN
+    // Y = A * B + C. Currently support 2d matrix multiplication without bias.
+    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                       std::vector<Ptr<BackendWrapper> > &outputs) CV_OVERRIDE
+    {
+        // does not support with bias; only 2d matmul
+        auto wrapper_Y = outputs[0].dynamicCast<VkComBackendWrapper>();
+        auto shape_Y = shape(*(wrapper_Y->getMat()));
+        if (have_bias || shape_Y.size() > static_cast<size_t>(2)) {
+            return Ptr<BackendNode>();
+        }
+
+        std::vector<Mat> vkBlobs;
+        if (const_B) {
+            vkBlobs.push_back(blobs[0]);
+        }
+
+        auto wrapper_A = inputs[0].dynamicCast<VkComBackendWrapper>();
+        auto shape_A = shape(*wrapper_A->getMat());
+        Ptr<vkcom::OpBase> op = (new vkcom::OpMatMul(vkBlobs, shape_A[0], shape_A[1], shape_Y[1]));
+        return Ptr<BackendNode>(new VkComBackendNode(inputs, op, outputs));
+    }
+#endif
+
+private:
+    bool const_B;
+    bool const_C;
+    bool have_bias;
+    std::vector<float> packed_B;
+    std::vector<float> broadcast_C;
+    int real_ndims_C;
+    FastGemmOpt opt;
+};
+
+Ptr<GemmLayer> GemmLayer::create(const LayerParams& params) {
+    return makePtr<GemmLayerImpl>(params);
+}
+
+}} // namespace cv::dnn
diff --git a/modules/dnn/src/layers/group_norm_layer.cpp b/modules/dnn/src/layers/group_norm_layer.cpp
new file mode 100644
index 000000000000..f8df14b98c2f
--- /dev/null
+++ b/modules/dnn/src/layers/group_norm_layer.cpp
@@ -0,0 +1,190 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+#include "./cpu_kernels/fast_norm.hpp"
+
+// CUDA backend
+#include "../op_cuda.hpp"
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/group_norm.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
+// OpenCL backend
+#ifdef HAVE_OPENCL
+#include "../ocl4dnn/include/math_functions.hpp"
+#include "opencl_kernels_dnn.hpp"
+#endif
+
+namespace cv {
+namespace dnn {
+
+// https://github.com/onnx/onnx/blob/main/docs/Operators.md#GroupNormalization
+class GroupNormLayerImpl CV_FINAL : public GroupNormLayer {
+public:
+    GroupNormLayerImpl(const LayerParams &params) {
+        setParamsFrom(params);
+
+        epsilon = params.get<float>("epsilon", 1e-5);
+        num_groups = params.get<int>("num_groups");
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE {
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_CUDA;
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE {
+        const auto &input = inputs[0];
+        const auto &scale = inputs[1];
+        const auto &bias = inputs[2];
+        CV_CheckGE(input.size(), static_cast<size_t>(3), "DNN/GroupNorm: input dimension >= 3 is required");
+
+        int C = input[1];
+        int scale_dim = std::accumulate(scale.begin(), scale.end(), 1, std::multiplies<int>());
+        CV_CheckEQ(scale_dim, C, "DNN/InstanceNorm: scale must be a 1d tensor and match the channel of input");
+        int bias_dim = std::accumulate(bias.begin(), bias.end(), 1, std::multiplies<int>());
+        CV_CheckEQ(bias_dim, C, "DNN/InstanceNorm: bias must be a 1d tensor and match the channel of input");
+
+        outputs.assign(1, inputs[0]);
+        return false;
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        if (inputs_arr.depth() == CV_16F) {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        const auto& input = inputs[0];
+        const auto& scale = inputs[1];
+        const auto& bias = inputs[2];
+
+        fastNormGroup(input, scale, bias, outputs[0], epsilon, num_groups);
+    }
+
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inputs_.getUMatVector(inputs);
+        outputs_.getUMatVector(outputs);
+
+        const auto &input = inputs[0], &scale = inputs[1], &bias = inputs[2];
+        auto &output = outputs[0];
+
+        const auto input_shape = shape(input);
+        size_t N = input_shape[0], C = input_shape[1];
+        size_t num_groups = this->num_groups;
+        size_t channels_per_group = C / num_groups;
+        size_t loops = N * num_groups, norm_size = static_cast<size_t>(total(input_shape, 2)) * channels_per_group;
+        float inv_norm_size = 1.f / norm_size;
+
+        // no fp16 support
+        if (input.depth() == CV_16F) {
+            return false;
+        }
+
+        String base_opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
+
+        // Calculate mean
+        UMat one = UMat::ones(norm_size, 1, CV_32F);
+        UMat mean = UMat(loops, 1, CV_32F);
+        UMat mean_square = UMat(loops, 1, CV_32F);
+        UMat tmp = UMat(loops, norm_size, CV_32F);
+        bool ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size,
+                                               input, 0, one, 0, 0.f, mean, 0);
+        if (!ret) {
+            return false;
+        }
+        // Calculate mean_square
+        int num_vector = (norm_size % 8 == 0) ? 8 : ((norm_size % 4 == 0) ? 4 : 1);
+        size_t global[] = {loops, static_cast<size_t>(norm_size / num_vector)};
+        String build_opt = format(" -DNUM=%d", num_vector) + base_opts;
+        String mean_square_kernel_name = format("calc_mean%d", num_vector);
+        ocl::Kernel mean_square_kernel(mean_square_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt + " -DKERNEL_MEAN");
+        if (mean_square_kernel.empty()) {
+            return false;
+        }
+        mean_square_kernel.set(0, ocl::KernelArg::PtrReadOnly(input));
+        mean_square_kernel.set(1, (int)loops);
+        mean_square_kernel.set(2, (int)norm_size);
+        mean_square_kernel.set(3, ocl::KernelArg::PtrReadOnly(mean));
+        mean_square_kernel.set(4, ocl::KernelArg::PtrWriteOnly(tmp));
+        ret = mean_square_kernel.run(2, global, NULL, false);
+        if (!ret) {
+            return false;
+        }
+        ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size,
+                                          tmp, 0, one, 0, 0.f, mean_square, 0);
+        if (!ret) {
+            return false;
+        }
+        // Calculate group norm: output = scale * (x - mean) / sqrt(var + eps) + bias
+        String mvn_group_kernel_name = format("mvn_group%d", num_vector);
+        build_opt += " -DNORM_VARIANCE -DKERNEL_MVN_GROUP";
+        ocl::Kernel mvn_group_kernel(mvn_group_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt);
+        if (mvn_group_kernel.empty()) {
+            return false;
+        }
+        mvn_group_kernel.set(0, ocl::KernelArg::PtrReadOnly(input));
+        mvn_group_kernel.set(1, (int)loops);
+        mvn_group_kernel.set(2, (int)norm_size);
+        mvn_group_kernel.set(3, (float)epsilon);
+        mvn_group_kernel.set(4, ocl::KernelArg::PtrReadOnly(mean));
+        mvn_group_kernel.set(5, ocl::KernelArg::PtrReadOnly(mean_square));
+        mvn_group_kernel.set(6, ocl::KernelArg::PtrReadOnly(scale));
+        mvn_group_kernel.set(7, ocl::KernelArg::PtrReadOnly(bias));
+        mvn_group_kernel.set(8, (int)C);
+        mvn_group_kernel.set(9, (int)num_groups);
+        mvn_group_kernel.set(10, (float)0.f);
+        mvn_group_kernel.set(11, ocl::KernelArg::PtrWriteOnly(output));
+        ret = mvn_group_kernel.run(2, global, NULL, false);
+        if (!ret) {
+            return false;
+        }
+
+        return true;
+        }
+#endif
+
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(void *context_,
+                          const std::vector<Ptr<BackendWrapper>>& inputs,
+                          const std::vector<Ptr<BackendWrapper>>& outputs) override {
+    auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+    auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+    auto input_shape = input_wrapper->getShape();
+    size_t N = input_shape[0];
+    size_t num_groups = this->num_groups;
+    size_t loops = N * num_groups;
+
+    return make_cuda_node<cuda4dnn::GroupNormOp>(preferableTarget, std::move(context->stream), epsilon, loops, num_groups);
+}
+#endif // HAVE_CUDA
+
+private:
+    float epsilon;
+    size_t num_groups;
+};
+
+Ptr<GroupNormLayer> GroupNormLayer::create(const LayerParams &params) {
+    return Ptr<GroupNormLayer>(new GroupNormLayerImpl(params));
+}
+
+}} // cv::dnn
diff --git a/modules/dnn/src/layers/instance_norm_layer.cpp b/modules/dnn/src/layers/instance_norm_layer.cpp
new file mode 100644
index 000000000000..ae61f15656c9
--- /dev/null
+++ b/modules/dnn/src/layers/instance_norm_layer.cpp
@@ -0,0 +1,273 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+#include "./cpu_kernels/fast_norm.hpp"
+
+// CANN backend
+#include "../op_cann.hpp"
+
+// OpenVINO backend
+#include "../op_inf_engine.hpp"
+#include "../ie_ngraph.hpp"
+
+// CUDA backend
+#include "../op_cuda.hpp"
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/instance_norm.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
+// OpenCL backend
+#ifdef HAVE_OPENCL
+#include "../ocl4dnn/include/math_functions.hpp"
+#include "opencl_kernels_dnn.hpp"
+#endif
+
+namespace cv { namespace dnn {
+
+// https://github.com/onnx/onnx/blob/main/docs/Operators.md#InstanceNormalization
+class InstanceNormLayerImpl CV_FINAL : public InstanceNormLayer {
+public:
+    InstanceNormLayerImpl(const LayerParams &params) {
+        setParamsFrom(params);
+
+        epsilon = params.get<float>("epsilon", 1e-5);
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE {
+#ifdef HAVE_INF_ENGINE
+        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+            return true;
+#endif
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_CUDA;
+            //    backendId == DNN_BACKEND_CANN; // not supported due to 1d mat shape issue
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE {
+        const auto &input = inputs[0];
+        const auto &scale = inputs[1];
+        const auto &bias = inputs[2];
+        CV_CheckGE(input.size(), static_cast<size_t>(3), "DNN/InstanceNorm: input dimension >= 3 is required");
+
+        int C = input[1];
+        int scale_dim = std::accumulate(scale.begin(), scale.end(), 1, std::multiplies<int>());
+        CV_CheckEQ(scale_dim, C, "DNN/InstanceNorm: scale must be a 1d tensor and match the channel of input");
+        int bias_dim = std::accumulate(bias.begin(), bias.end(), 1, std::multiplies<int>());
+        CV_CheckEQ(bias_dim, C, "DNN/InstanceNorm: bias must be a 1d tensor and match the channel of input");
+
+        outputs.assign(1, inputs[0]);
+        return false;
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        if (inputs_arr.depth() == CV_16F)
+        {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        const auto &input = inputs[0];
+        const auto &scale = inputs[1];
+        const auto &bias = inputs[2];
+
+        fastNormChannel(input, scale, bias, outputs[0], epsilon);
+    }
+
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inputs_.getUMatVector(inputs);
+        outputs_.getUMatVector(outputs);
+
+        const auto &input = inputs[0], &scale = inputs[1], &bias = inputs[2];
+        auto &output = outputs[0];
+
+        const auto input_shape = shape(input);
+        size_t N = input_shape[0], C = input_shape[1],
+               loops = N * C, norm_size = static_cast<size_t>(total(input_shape, 2));
+        float inv_norm_size = 1.f / norm_size;
+
+        // no fp16 support
+        if (input.depth() == CV_16F) {
+            return false;
+        }
+
+        String base_opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
+
+        // Calculate mean
+        UMat one = UMat::ones(norm_size, 1, CV_32F);
+        UMat mean = UMat(loops, 1, CV_32F);
+        UMat mean_square = UMat(loops, 1, CV_32F);
+        UMat tmp = UMat(loops, norm_size, CV_32F);
+        bool ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size,
+                                               input, 0, one, 0, 0.f, mean, 0);
+        if (!ret) {
+            return false;
+        }
+        // Calculate mean_square
+        int num_vector = (norm_size % 8 == 0) ? 8 : ((norm_size % 4 == 0) ? 4 : 1);
+        size_t global[] = {loops, static_cast<size_t>(norm_size / num_vector)};
+        String build_opt = format(" -DNUM=%d", num_vector) + base_opts;
+        String mean_square_kernel_name = format("calc_mean%d", num_vector);
+        ocl::Kernel mean_square_kernel(mean_square_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt + " -DKERNEL_MEAN");
+        if (mean_square_kernel.empty()) {
+            return false;
+        }
+        mean_square_kernel.set(0, ocl::KernelArg::PtrReadOnly(input));
+        mean_square_kernel.set(1, (int)loops);
+        mean_square_kernel.set(2, (int)norm_size);
+        mean_square_kernel.set(3, ocl::KernelArg::PtrReadOnly(mean));
+        mean_square_kernel.set(4, ocl::KernelArg::PtrWriteOnly(tmp));
+        ret = mean_square_kernel.run(2, global, NULL, false);
+        if (!ret) {
+            return false;
+        }
+        ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size,
+                                          tmp, 0, one, 0, 0.f, mean_square, 0);
+        if (!ret) {
+            return false;
+        }
+        // Calculate instance norm: output = scale * (x - mean) / sqrt(var + eps) + bias
+        String mvn_kernel_name = format("mvn%d", num_vector);
+        build_opt += " -DNORM_VARIANCE -DFUSE_BATCH_NORM -DKERNEL_MVN";
+        ocl::Kernel mvn_kernel(mvn_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt);
+        if (mvn_kernel.empty()) {
+            return false;
+        }
+        mvn_kernel.set(0, ocl::KernelArg::PtrReadOnly(input));
+        mvn_kernel.set(1, (int)loops);
+        mvn_kernel.set(2, (int)norm_size);
+        mvn_kernel.set(3, (float)epsilon);
+        mvn_kernel.set(4, ocl::KernelArg::PtrReadOnly(mean));
+        mvn_kernel.set(5, ocl::KernelArg::PtrReadOnly(mean_square));
+        mvn_kernel.set(6, ocl::KernelArg::PtrReadOnly(scale));
+        mvn_kernel.set(7, ocl::KernelArg::PtrReadOnly(bias));
+        mvn_kernel.set(8, (int)C);
+        mvn_kernel.set(9, (float)0.f);
+        mvn_kernel.set(10, ocl::KernelArg::PtrWriteOnly(output));
+        ret = mvn_kernel.run(2, global, NULL, false);
+        if (!ret) {
+            return false;
+        }
+
+        return true;
+    }
+#endif
+
+#ifdef HAVE_CANN
+    virtual Ptr<BackendNode> initCann(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                      const std::vector<Ptr<BackendWrapper> > &outputs,
+                                      const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE {
+        auto input_tensor_wrapper = inputs[0].dynamicCast<CannBackendWrapper>();
+        auto input_tensor_desc = input_tensor_wrapper->getTensorDesc();
+
+        auto scale_tensor_wrapper = inputs[1].dynamicCast<CannBackendWrapper>();
+        auto scale_tensor_desc = scale_tensor_wrapper->getTensorDesc();
+
+        auto bias_tensor_wrapper = inputs[2].dynamicCast<CannBackendWrapper>();
+        auto bias_tensor_desc = bias_tensor_wrapper->getTensorDesc();
+
+        auto last_node = nodes[0].dynamicCast<CannBackendNode>()->getOp();
+        auto scale_node = nodes[1].dynamicCast<CannBackendNode>()->getOp();
+        auto bias_node = nodes[2].dynamicCast<CannBackendNode>()->getOp();
+
+        auto op = std::make_shared<ge::op::InstanceNorm>(name);
+
+        // set attrs
+        op->set_attr_epsilon(epsilon);
+
+        // set inputs
+        // set inputs : x
+        op->set_input_x_by_name(*last_node, input_tensor_wrapper->name.c_str());
+        op->update_input_desc_x(*input_tensor_desc);
+        // set inputs : gamma
+        op->set_input_gamma_by_name((*scale_node), scale_tensor_wrapper->name.c_str());
+        op->update_input_desc_gamma(*scale_tensor_desc);
+        // set inputs : beta
+        op->set_input_beta_by_name(*bias_node, bias_tensor_wrapper->name.c_str());
+        op->update_input_desc_beta(*bias_tensor_desc);
+
+        // set outputs
+        auto output_desc_y = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
+        op->update_output_desc_y(*output_desc_y);
+        auto output_desc_mean = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
+        op->update_output_desc_mean(*output_desc_mean);
+        auto output_desc_var = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
+        op->update_output_desc_variance(*output_desc_var);
+
+        return Ptr<BackendNode>(new CannBackendNode(op));
+    }
+#endif // HAVE_CANN
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE {
+        // onnx to openvino convertion: https://github.com/openvinotoolkit/openvino/blob/2023.1.0/src/frontends/onnx/frontend/src/op/instance_norm.cpp
+
+        auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        const auto &input_shape = ieInpNode.get_shape();
+        std::shared_ptr<ov::Node> mvn, result;
+
+        // mvn
+        // https://docs.openvino.ai/2023.1/openvino_docs_ops_normalization_MVN_6.html
+        std::vector<int64_t> axes_v(input_shape.size() - 2);
+        std::iota(axes_v.begin(), axes_v.end(), 2); // {2, 3, ...} for nd input tensor, n>=3
+        auto axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{axes_v.size()}, axes_v.data());
+        bool normalize_variance = true;
+        mvn = std::make_shared<ov::op::v6::MVN>(ieInpNode, axes, normalize_variance, epsilon, ov::op::MVNEpsMode::INSIDE_SQRT);
+
+        // instance norm = scale * mvn + bias
+        auto scale = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
+        std::vector<int64_t> shared_shape_v(input_shape.size(), 1);
+        shared_shape_v[1] = -1;
+        auto shared_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{shared_shape_v.size()}, shared_shape_v.data());
+        scale  = std::make_shared<ov::op::v1::Reshape>(scale, shared_shape, true);
+        result = std::make_shared<ov::op::v1::Multiply>(mvn, scale);
+        auto bias = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
+        bias  = std::make_shared<ov::op::v1::Reshape>(bias, shared_shape, true);
+        result = std::make_shared<ov::op::v1::Add>(result, bias);
+
+        return Ptr<BackendNode>(new InfEngineNgraphNode(result));
+    }
+#endif // HAVE_DNN_NGRAPH
+
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(void *context_,
+                              const std::vector<Ptr<BackendWrapper>>& inputs,
+                              const std::vector<Ptr<BackendWrapper>>& outputs) override {
+        auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+        auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+        auto input_shape = input_wrapper->getShape();
+        size_t loops = static_cast<size_t>(total(input_shape, 0, 2));
+
+        return make_cuda_node<cuda4dnn::InstanceNormOp>(preferableTarget, std::move(context->stream), epsilon, loops);
+    }
+#endif // HAVE_CUDA
+
+};
+
+Ptr<InstanceNormLayer> InstanceNormLayer::create(const LayerParams &params) {
+    return Ptr<InstanceNormLayer>(new InstanceNormLayerImpl(params));
+}
+
+}} // cv::dnn
diff --git a/modules/dnn/src/layers/layer_norm.cpp b/modules/dnn/src/layers/layer_norm.cpp
index a760766a3f1f..487383efdcc6 100644
--- a/modules/dnn/src/layers/layer_norm.cpp
+++ b/modules/dnn/src/layers/layer_norm.cpp
@@ -4,27 +4,56 @@
 
 #include "../precomp.hpp"
 #include "layers_common.hpp"
+#include "cpu_kernels/fast_norm.hpp"
+
+// CANN backend
+#include "../op_cann.hpp"
+
+// OpenVINO backend
+#include "../op_inf_engine.hpp"
+#include "../ie_ngraph.hpp"
+
+// CUDA backend
+#include "../op_cuda.hpp"
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/layer_norm.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
+// OpenCL backend
+#ifdef HAVE_OPENCL
+#include "../ocl4dnn/include/math_functions.hpp"
+#include "opencl_kernels_dnn.hpp"
+#endif
 
 namespace cv { namespace dnn {
 
+// https://github.com/onnx/onnx/blob/main/docs/Operators.md#LayerNormalization
 class LayerNormLayerImpl CV_FINAL : public LayerNormLayer
 {
+#ifdef HAVE_OPENCL
+    UMat weight_umat, bias_umat;
+#endif
+
 public:
     LayerNormLayerImpl(const LayerParams& params)
     {
         setParamsFrom(params);
 
         // standard attr
-        axis = params.get<int>("axis", 0);
+        axis = params.get<int>("axis", -1);
         epsilon = params.get<float>("epsilon", 1e-5);
-
-        // opencv attr
-        hasBias = params.get<bool>("hasBias", false);
     }
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
-        return backendId == DNN_BACKEND_OPENCV;
+#ifdef HAVE_INF_ENGINE
+        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+            return true;
+#endif
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_CUDA   ||
+               (backendId == DNN_BACKEND_CANN && axis != -1); // axis=-1 not supported due to 1d mat shape problem
     }
 
     virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -33,139 +62,312 @@ class LayerNormLayerImpl CV_FINAL : public LayerNormLayer
                                  std::vector<MatShape> &internals) const CV_OVERRIDE
     {
         // check shapes of weight and bias if existed
-        // inputs >= 2 (X and Weight are requested, bias is optional)
-        CV_Check(inputs.size(), inputs.size() >= 2 && inputs.size() <= 3, "LayerNorm: require two (x, weight) or three (x, weight, bias) inputs");
+        // inputs >= 2 (X and Weight are required, bias is optional)
+        int num_inputs = inputs.size() + blobs.size();
+        CV_Check(num_inputs, num_inputs >= 2 && num_inputs <= 3, "LayerNorm: require two (x, weight) or three (x, weight, bias) inputs");
 
         auto x_shape = inputs[0];
         int x_ndims = static_cast<int>(x_shape.size());
 
-        auto w_shape = inputs[1];
+        // Weight and bias are either constants or variable
+        auto w_shape = blobs.empty() ? inputs[1] : shape(blobs.front());
         // if axis == last_dim, scale and b are both 1d tensor (represented as 2d mat nx1)
         int w_ndims = static_cast<int>(w_shape.size());
         w_ndims = (axis == x_ndims - 1 && w_ndims == 2) ? w_ndims - 1 : w_ndims;
         CV_CheckEQ(x_ndims - axis, w_ndims, "LayerNorm: shape of weight does not match with given axis and shape of input");
         for (int i = 0; i < w_ndims; ++i)
             CV_CheckEQ(x_shape[axis+i], w_shape[i], "LayerNorm: weight dimensions does not match with input dimensions");
-        if (hasBias)
+        if (num_inputs >= 3)
         {
-            CV_CheckEQ(inputs.size(), (size_t)3, "");
-            auto b_shape = inputs[2];
+            auto b_shape = blobs.empty() ? inputs[2] : shape(blobs.back());
             CV_CheckEQ(w_shape.size(), b_shape.size(), "LayerNorm: shape of weight does not match with shape of bias");
             for (size_t i = 0; i < w_shape.size(); ++i)
                 CV_CheckEQ(w_shape[i], b_shape[i], "LayerNorm: bias dimensions does not match with weight dimensions");
         }
 
-        // only one output is needed; Mean & InvStdDev are not needed
-        // in inference and should beomitted in onnx importer
         outputs.assign(1, inputs[0]);
         return false;
     }
 
-    template<bool hasBias>
-    class LayerNormInvoker : public ParallelLoopBody
-    {
-    public:
-        const Mat& src;
-        const float* scaleData;
-        const float* biasData;
-        Mat& dst;
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE {
+        std::vector<Mat> inputs;
+        inputs_arr.getMatVector(inputs);
+
+        const auto input_shape = shape(inputs[0]);
+        axis = normalize_axis(axis, static_cast<int>(input_shape.size()));
+
+#ifdef HAVE_OPENCL
+        weight_umat.release();
+        bias_umat.release();
+#endif
+    }
 
-        float epsilon;
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        int total;
-        int normSize;
-        float invNormSize;
+        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        LayerNormInvoker(const Mat& src_, const Mat& scale, const Mat* b, Mat& dst_, int axis, float epsilon_)
-            : src(src_), scaleData(scale.ptr<float>()), biasData(nullptr), dst(dst_), epsilon(epsilon_)
+        if (inputs_arr.depth() == CV_16F)
         {
-            if (hasBias)
-            {
-                CV_Assert(b != nullptr);
-                CV_Assert(b->isContinuous());
-                biasData = (const float*)b->ptr<float>();
-            }
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
 
-            auto dstShape = shape(dst);
-            total = std::accumulate(dstShape.begin(), dstShape.begin() + axis, 1, std::multiplies<int>());
-            normSize = std::accumulate(dstShape.begin() + axis, dstShape.end(), 1, std::multiplies<int>());
-            invNormSize = 1.0f / normSize;
+        const auto &input = inputs[0];
+        const auto &scale = blobs.empty() ? inputs[1] : blobs.front();
+        auto &output = outputs[0];
+
+        if ((inputs.size() + blobs.size()) >= 3) {
+            const auto &bias = blobs.empty() ? inputs[2] : blobs.back();
+            fastNorm(input, scale, bias, output, epsilon, static_cast<size_t>(axis));
+        } else {
+            fastNorm(input, scale, output, epsilon, static_cast<size_t>(axis));
         }
+    }
 
-        static void run(const Mat& src, const Mat& scale, const Mat* b, Mat& dst, int axis, float epsilon)
-        {
-            CV_Assert(src.isContinuous());
-            CV_Assert(dst.isContinuous());
-            CV_CheckTypeEQ(src.type(), CV_32F, "DNN/LayerNorm: only support float32");
-            CV_CheckTypeEQ(src.type(), dst.type(), "");
-            CV_Assert(scale.isContinuous());
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
 
-            CV_CheckGE(epsilon, 0.0f, "");
+        inputs_.getUMatVector(inputs);
+        outputs_.getUMatVector(outputs);
 
-            LayerNormInvoker p(src, scale, b, dst, axis, epsilon);
+        const auto &input = inputs[0];
 
-            double nstripes = ((size_t)p.total * p.normSize) * (1 / 1024.0);
-            // double nstripes = ((size_t)p.total) * (1 / 1024.0);
-            parallel_for_(Range(0, p.total), p, nstripes);
+        // no fp16 support
+        if (input.depth() == CV_16F) {
+            return false;
         }
 
-        void operator()(const Range& r) const CV_OVERRIDE
-        {
-            int stripeStart = r.start;
-            int stripeEnd = r.end;
-
-            const float* srcData = src.ptr<float>();
-            float* dstData = dst.ptr<float>();
-
-            for (int ofs = stripeStart; ofs < stripeEnd; ++ofs)
-            {
-                const float* first = srcData + ofs * normSize;
-                float* dstFirst = dstData + ofs * normSize;
-
-                float mean = 0;
-                float meanSquare = 0;
-                for (int h = 0; h < normSize; ++h)
-                {
-                    float v = first[h];
-                    mean += v;
-                    meanSquare += v * v;
-                }
-                mean *= invNormSize;
-                meanSquare = std::sqrt(std::max(0.f, meanSquare * invNormSize - mean * mean) + epsilon);
-                float invMeanSquare = 1.0f / meanSquare;
-                for (int h = 0; h < normSize; ++h)
-                {
-                    float v = (first[h] - mean) * invMeanSquare * scaleData[h];
-                    if (hasBias) {
-                        v = v + biasData[h];
-                    }
-                    dstFirst[h] = v;
+        auto &output = outputs[0];
+
+        const auto input_shape = shape(input);
+        size_t loops = static_cast<size_t>(total(input_shape, 0, axis)),
+               norm_size = static_cast<size_t>(total(input_shape, axis));
+        float inv_norm_size = 1.f / norm_size;
+
+        if (weight_umat.empty()) {
+            if (blobs.empty()) {
+                weight_umat = inputs[1];
+            } else {
+                blobs.front().copyTo(weight_umat);
+            }
+        }
+        if (bias_umat.empty()) {
+            if ((inputs.size() + blobs.size()) == 3) {
+                if (blobs.empty()) {
+                    bias_umat = inputs[2];
+                } else {
+                    blobs.back().copyTo(bias_umat);
                 }
+            } else {
+                bias_umat = UMat::zeros(norm_size, 1, CV_32F);
             }
         }
-    };
 
-    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
-    {
-        CV_TRACE_FUNCTION();
-        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+        String base_opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
 
-        if (inputs_arr.depth() == CV_16S)
-        {
-            forward_fallback(inputs_arr, outputs_arr, internals_arr);
-            return;
+        // Calculate mean
+        UMat one = UMat::ones(norm_size, 1, CV_32F);
+        UMat mean = UMat(loops, 1, CV_32F);
+        UMat mean_square = UMat(loops, 1, CV_32F);
+        UMat tmp = UMat(loops, norm_size, CV_32F);
+        bool ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size,
+                                               input, 0, one, 0, 0.f, mean, 0);
+        if (!ret) {
+            return false;
+        }
+        // Calculate mean_square
+        int num_vector = (norm_size % 8 == 0) ? 8 : ((norm_size % 4 == 0) ? 4 : 1);
+        size_t global[] = {loops, static_cast<size_t>(norm_size / num_vector)};
+        String build_opt = format(" -DNUM=%d", num_vector) + base_opts;
+        String mean_square_kernel_name = format("calc_mean%d", num_vector);
+        ocl::Kernel mean_square_kernel(mean_square_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt + " -DKERNEL_MEAN");
+        if (mean_square_kernel.empty()) {
+            return false;
+        }
+        mean_square_kernel.set(0, ocl::KernelArg::PtrReadOnly(input));
+        mean_square_kernel.set(1, (int)loops);
+        mean_square_kernel.set(2, (int)norm_size);
+        mean_square_kernel.set(3, ocl::KernelArg::PtrReadOnly(mean));
+        mean_square_kernel.set(4, ocl::KernelArg::PtrWriteOnly(tmp));
+        ret = mean_square_kernel.run(2, global, NULL, false);
+        if (!ret) {
+            return false;
+        }
+        ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size,
+                                          tmp, 0, one, 0, 0.f, mean_square, 0);
+        if (!ret) {
+            return false;
+        }
+        // Calculate instance norm: output = weight * (x - mean) / sqrt(var + eps) + bias
+        String mvn_kernel_name = format("mvn%d", num_vector);
+        build_opt += " -DNORM_VARIANCE -DLAYER_NORM -DKERNEL_MVN";
+        ocl::Kernel mvn_kernel(mvn_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt);
+        if (mvn_kernel.empty()) {
+            return false;
+        }
+        mvn_kernel.set(0, ocl::KernelArg::PtrReadOnly(input));
+        mvn_kernel.set(1, (int)loops);
+        mvn_kernel.set(2, (int)norm_size);
+        mvn_kernel.set(3, (float)epsilon);
+        mvn_kernel.set(4, ocl::KernelArg::PtrReadOnly(mean));
+        mvn_kernel.set(5, ocl::KernelArg::PtrReadOnly(mean_square));
+        mvn_kernel.set(6, ocl::KernelArg::PtrReadOnly(weight_umat));
+        mvn_kernel.set(7, ocl::KernelArg::PtrReadOnly(bias_umat));
+        mvn_kernel.set(8, (int)1);
+        mvn_kernel.set(9, (float)0.f);
+        mvn_kernel.set(10, ocl::KernelArg::PtrWriteOnly(output));
+        ret = mvn_kernel.run(2, global, NULL, false);
+        if (!ret) {
+            return false;
         }
 
-        std::vector<Mat> inputs, outputs;
-        inputs_arr.getMatVector(inputs);
-        outputs_arr.getMatVector(outputs);
+        return true;
+    }
+#endif
+
+#ifdef HAVE_CANN
+    virtual Ptr<BackendNode> initCann(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                      const std::vector<Ptr<BackendWrapper> > &outputs,
+                                      const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE {
+        CV_CheckEQ(inputs.size(), static_cast<size_t>(3), "LayerNorm/CANN: requires three input wrappers");
+        CV_CheckEQ(nodes.size(), static_cast<size_t>(3), "LayerNorm/CANN: requires three input nodes");
 
-        if (hasBias) {
-            LayerNormInvoker<true>::run(inputs[0], inputs[1], &inputs[2], outputs[0], axis, epsilon);
+        auto input_tensor_wrapper = inputs[0].dynamicCast<CannBackendWrapper>();
+        auto input_tensor_desc = input_tensor_wrapper->getTensorDesc();
+
+        CV_CheckNE(axis, static_cast<int>(input_tensor_desc->GetShape().GetDimNum() - 1), "LayerNorm: CANN does not support axis set as last axis due to 1D mat compatibility issue");
+
+        auto last_node = nodes[0].dynamicCast<CannBackendNode>()->getOp();
+
+        auto op = std::make_shared<ge::op::LayerNorm>(name);
+
+        // set attrs
+        op->set_attr_begin_norm_axis(axis);
+        op->set_attr_begin_params_axis(axis);
+        op->set_attr_epsilon(epsilon);
+
+        // set inputs
+        // set inputs : x
+        op->set_input_x_by_name(*last_node, input_tensor_wrapper->name.c_str());
+        op->update_input_desc_x(*input_tensor_desc);
+        // set inputs : gamma & beta
+        if (blobs.empty()) {
+            auto scale_tensor_wrapper = inputs[1].dynamicCast<CannBackendWrapper>();
+            auto scale_tensor_desc = scale_tensor_wrapper->getTensorDesc();
+            auto scale_node = nodes[1].dynamicCast<CannBackendNode>()->getOp();
+            op->set_input_gamma_by_name(*scale_node, scale_tensor_wrapper->name.c_str());
+            op->update_input_desc_gamma(*scale_tensor_desc);
+
+            if (inputs.size() == 3) {
+                auto bias_tensor_wrapper = inputs[2].dynamicCast<CannBackendWrapper>();
+                auto bias_tensor_desc = bias_tensor_wrapper->getTensorDesc();
+                auto bias_node = nodes[2].dynamicCast<CannBackendNode>()->getOp();
+                op->set_input_beta_by_name(*bias_node, bias_tensor_wrapper->name.c_str());
+                op->update_input_desc_beta(*bias_tensor_desc);
+            }
         } else {
-            LayerNormInvoker<false>::run(inputs[0], inputs[1], nullptr, outputs[0], axis, epsilon);
+            const auto &scale_mat = blobs.front();
+            const auto op_const_scale = std::make_shared<CannConstOp>(scale_mat.data, scale_mat.type(), shape(scale_mat), cv::format("%s_w", name.c_str()));
+            op->set_input_gamma(*(op_const_scale->getOp()));
+            op->update_input_desc_gamma(*(op_const_scale->getTensorDesc()));
+
+            if ((inputs.size() + blobs.size()) >= 3) {
+                const auto &bias_mat = blobs.back();
+                const auto op_const_bias = std::make_shared<CannConstOp>(bias_mat.data, bias_mat.type(), shape(bias_mat), cv::format("%s_b", name.c_str()));
+                op->set_input_beta(*(op_const_bias->getOp()));
+                op->update_input_desc_beta(*(op_const_bias->getTensorDesc()));
+            }
         }
+
+        // set outputs
+        auto output_desc_y = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
+        op->update_output_desc_y(*output_desc_y);
+        auto output_desc_mean = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
+        op->update_output_desc_mean(*output_desc_mean);
+        auto output_desc_var = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
+        op->update_output_desc_variance(*output_desc_var);
+
+        return Ptr<BackendNode>(new CannBackendNode(op));
+    }
+#endif // HAVE_CANN
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE {
+        auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        const auto &input_shape = ieInpNode.get_shape();
+        std::shared_ptr<ov::Node> mvn, result;
+        ov::Output<ov::Node> scale, bias;
+
+        // mvn
+        // https://docs.openvino.ai/2023.1/openvino_docs_ops_normalization_MVN_6.html
+        std::vector<int64_t> axes_v(input_shape.size() - axis);
+        std::iota(axes_v.begin(), axes_v.end(), axis);
+        auto axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{axes_v.size()}, axes_v.data());
+        bool normalize_variance = true;
+        mvn = std::make_shared<ov::op::v6::MVN>(ieInpNode, axes, normalize_variance, epsilon, ov::op::MVNEpsMode::INSIDE_SQRT);
+
+        // layer norm = scale * mvn + bias
+        if (blobs.empty()) {
+            scale = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
+            if (nodes.size() == 3) {
+                bias = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
+            }
+        } else {
+            auto scale_mat = blobs.front();
+            const auto scale_shape = shape(scale_mat);
+            scale = std::make_shared<ov::op::v0::Constant>(ov::element::f32, std::vector<size_t>(scale_shape.begin(), scale_shape.end()), scale_mat.data);
+            if ((nodes.size() + blobs.size()) == 3) {
+                auto bias_mat = blobs.back();
+                const auto bias_shape = shape(bias_mat);
+                bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32, std::vector<size_t>(bias_shape.begin(), bias_shape.end()), bias_mat.data);
+            }
+        }
+        if (axis == -1 || axis == input_shape.size() - 1) { // special case for 1D tensor (2D mat)
+            std::vector<int64_t> shared_shape_v(input_shape.size(), 1);
+            shared_shape_v.back() = -1;
+            auto shared_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{shared_shape_v.size()}, shared_shape_v.data());
+            scale = std::make_shared<ov::op::v1::Reshape>(scale, shared_shape, true);
+            if ((nodes.size() + blobs.size()) == 3) {
+                bias = std::make_shared<ov::op::v1::Reshape>(bias, shared_shape, true);
+            }
+        }
+
+        result = std::make_shared<ov::op::v1::Multiply>(mvn, scale);
+        if ((nodes.size() + blobs.size()) == 3) {
+            result = std::make_shared<ov::op::v1::Add>(result, bias);
+        }
+
+        return Ptr<BackendNode>(new InfEngineNgraphNode(result));
+    }
+#endif // HAVE_DNN_NGRAPH
+
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(void *context_,
+                              const std::vector<Ptr<BackendWrapper>>& inputs,
+                              const std::vector<Ptr<BackendWrapper>>& outputs) override {
+        auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+        auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+        auto input_shape = input_wrapper->getShape();
+        size_t loops = static_cast<size_t>(total(input_shape, 0, axis));
+
+        const auto scale = blobs.empty() ? Mat() : blobs.front(),
+                   bias = blobs.empty() ? Mat() : blobs.back();
+
+        return make_cuda_node<cuda4dnn::LayerNormOp>(preferableTarget, std::move(context->stream), scale, bias, axis, epsilon, loops);
     }
+#endif // HAVE_CUDA
 };
 
 Ptr<LayerNormLayer> LayerNormLayer::create(const LayerParams& params)
diff --git a/modules/dnn/src/layers/layers_common.cpp b/modules/dnn/src/layers/layers_common.cpp
index b128872817d4..3b3a007b0676 100644
--- a/modules/dnn/src/layers/layers_common.cpp
+++ b/modules/dnn/src/layers/layers_common.cpp
@@ -149,10 +149,11 @@ void getPoolingKernelParams(const LayerParams &params, std::vector<size_t>& kern
                             std::vector<size_t>& strides, cv::String &padMode)
 {
     bool is_global = params.get<bool>("global_pooling", false);
-    globalPooling.resize(3);
-    globalPooling[0] = params.get<bool>("global_pooling_d", is_global);
-    globalPooling[1] = params.get<bool>("global_pooling_h", is_global);
-    globalPooling[2] = params.get<bool>("global_pooling_w", is_global);
+    globalPooling.assign({
+        params.get<bool>("global_pooling_d", is_global),
+        params.get<bool>("global_pooling_h", is_global),
+        params.get<bool>("global_pooling_w", is_global)
+    });
 
     if (globalPooling[0] || globalPooling[1] || globalPooling[2])
     {
@@ -194,7 +195,7 @@ void getConvolutionKernelParams(const LayerParams &params, std::vector<size_t>&
     util::getStrideAndPadding(params, pads_begin, pads_end, strides, padMode, kernel.size());
     util::getParameter(params, "dilation", "dilation", dilations, true, std::vector<size_t>(kernel.size(), 1));
     util::getParameter(params, "adj", "adj", adjust_pads, true, std::vector<size_t>(kernel.size(), 0));
-    useWinograd = params.get<bool>("use_winograd", true);
+    useWinograd = params.get<bool>("use_winograd", useWinograd);
 
     for (int i = 0; i < dilations.size(); i++)
         CV_Assert(dilations[i] > 0);
diff --git a/modules/dnn/src/layers/lrn_layer.cpp b/modules/dnn/src/layers/lrn_layer.cpp
index 61c2224e363b..4863be2e3df2 100644
--- a/modules/dnn/src/layers/lrn_layer.cpp
+++ b/modules/dnn/src/layers/lrn_layer.cpp
@@ -121,7 +121,7 @@ class LRNLayerImpl CV_FINAL : public LRNLayer
         std::vector<UMat> inputs;
         std::vector<UMat> outputs;
 
-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
         inps.getUMatVector(inputs);
         outs.getUMatVector(outputs);
 
@@ -166,7 +166,7 @@ class LRNLayerImpl CV_FINAL : public LRNLayer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -480,11 +480,11 @@ class LRNLayerImpl CV_FINAL : public LRNLayer
         if (type != SPATIAL_NRM) {
             axes = {1};
         } else {
-            axes.resize(ieInpNode->get_shape().size() - 2);
+            axes.resize(ieInpNode.get_shape().size() - 2);
             std::iota(axes.begin(), axes.end(), 2);
         }
-        auto ngraph_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes.size()}, axes.data());
-        auto lrn = std::make_shared<ngraph::op::LRN>(ieInpNode, ngraph_axes, alphaSize, beta, bias, size);
+        auto ngraph_axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{axes.size()}, axes.data());
+        auto lrn = std::make_shared<ov::op::v0::LRN>(ieInpNode, ngraph_axes, alphaSize, beta, bias, size);
         return Ptr<BackendNode>(new InfEngineNgraphNode(lrn));
     }
 #endif  // HAVE_DNN_NGRAPH
diff --git a/modules/dnn/src/layers/matmul_layer.cpp b/modules/dnn/src/layers/matmul_layer.cpp
new file mode 100644
index 000000000000..448af27c1879
--- /dev/null
+++ b/modules/dnn/src/layers/matmul_layer.cpp
@@ -0,0 +1,486 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+
+#include <opencv2/dnn/shape_utils.hpp>
+#include "cpu_kernels/fast_gemm.hpp"
+
+// OpenVINO backend
+#include "../op_inf_engine.hpp"
+#include "../ie_ngraph.hpp"
+
+// Vulkan backend
+#include "../op_vkcom.hpp"
+
+// CUDA backend
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/matmul_broadcast.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
+// CANN backend
+#include "../op_cann.hpp"
+
+namespace cv { namespace dnn {
+
+class MatMulLayerImpl CV_FINAL : public MatMulLayer {
+#ifdef HAVE_OPENCL
+    UMat weight_umat, bias_umat;
+#endif
+
+ public:
+    MatMulLayerImpl(const LayerParams& params) {
+        setParamsFrom(params);
+
+        trans_a = params.get<bool>("transA", false);
+        trans_b = params.get<bool>("transB", false);
+        alpha = params.get<float>("alpha", 1.f);
+        beta = params.get<float>("beta", 1.f);
+
+        real_ndims_C = params.get<int>("real_ndims_C", -1);
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE {
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
+               (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !trans_a && !trans_b) ||
+               backendId == DNN_BACKEND_CUDA ||
+               backendId == DNN_BACKEND_CANN;
+    }
+
+    virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                                 const int requiredOutputs,
+                                 std::vector<MatShape> &outputs,
+                                 std::vector<MatShape> &internals) const CV_OVERRIDE {
+        int num_inputs = inputs.size() + blobs.size();
+        CV_CheckGE(num_inputs, 2, "DNN/MatMul: two inputs at least");
+        CV_CheckLE(num_inputs, 3, "DNN/MatMul: three inputs at most");
+
+        const auto shape_A = inputs[0], shape_B = blobs.empty() ? inputs[1] : shape(blobs[0]);
+        CV_CheckGE(shape_A.size(), static_cast<size_t>(2), "DNN/MatMul: invalid shape of input A");
+        CV_CheckGE(shape_B.size(), static_cast<size_t>(2), "DNN/MatMul: invalid shape of input B");
+
+        // Check legal matrix multiplication
+        int mA = shape_A[shape_A.size() - 2], nA = shape_A.back();
+        int mB = shape_B[shape_B.size() - 2], nB = shape_B.back();
+        int M = trans_a ? nA : mA;
+        int N = trans_b ? mB : nB;
+        int K_A = trans_a ? mA : nA;
+        int K_B = trans_b ? nB : mB;
+        CV_CheckEQ(K_A, K_B, "DNN/MatMul: invalid dimension K");
+
+        // Check if inputs are broadcastable.
+        MatShape common_shape;
+        if (shape_A.size() != 2 || shape_B.size() != 2) {
+            const auto &shape_more_dims = shape_A.size() > shape_B.size() ? shape_A : shape_B;
+            const auto &shape_less_dims = shape_A.size() > shape_B.size() ? shape_B : shape_A;
+            size_t diff_dims = shape_more_dims.size() - shape_less_dims.size();
+            common_shape = shape_more_dims;
+            for (size_t i = 0; i < shape_less_dims.size() - 2; i++) {
+                const auto dl = shape_less_dims[i], dm = shape_more_dims[i + diff_dims];
+                if (dl != 1 && dm != 1 && dl != dm) {
+                    CV_Error(Error::StsBadSize, format("DNN/MatMul: invalid shape for broadcasting, shape_A[%zu]=%d, shape_B[%zu]=%d\n", i, shape_less_dims[i], i, shape_more_dims[i + diff_dims]));
+                }
+
+                if (dm == 1) {
+                    common_shape[i + diff_dims] = dl;
+                }
+            }
+            common_shape[common_shape.size() - 2] = M;
+            common_shape[common_shape.size() - 1] = N;
+        } else {
+            common_shape.resize(2);
+            common_shape[0] = M;
+            common_shape[1] = N;
+        }
+
+        // Check if bias is broadcastable
+        if (num_inputs == 3) {
+            const auto shape_C = blobs.empty() ? inputs.back() : shape(blobs.back());
+            if (real_ndims_C == 1) { // (1) or (N)
+                CV_Check(shape_C[0], shape_C[0] == 1 || shape_C[0] == N, "DNN/MatMul: invalid dimension of C");
+            } else if (real_ndims_C >= 2) {
+                const auto &shape_large = common_shape.size() > shape_C.size() ? common_shape : shape_C;
+                const auto &shape_small = common_shape.size() > shape_C.size() ? shape_C : common_shape;
+                size_t diff_dims = shape_large.size() - shape_small.size();
+                for (size_t i = 0; i < shape_small.size(); i++) {
+                    const auto dl = shape_small[i], dm = shape_large[i + diff_dims];
+                    if (dl != 1 && dm != 1 && dl != dm) {
+                        CV_Error(Error::StsBadSize, "DNN/MatMul: invalid shape of C");
+                    }
+                }
+            }
+        }
+
+        outputs.assign(1, common_shape);
+        return false;
+    }
+
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE {
+        opt.init();
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        const auto A_shape = shape(inputs[0]),
+                   B_shape = blobs.empty() ? shape(inputs[1]) : shape(blobs[0]),
+                   C_shape = shape(outputs[0]);
+        helper.compute(trans_a, trans_b, A_shape, B_shape, C_shape);
+
+        if (!blobs.empty()) {
+            fastGemmPackB(blobs[0], packed_input_B, trans_b, opt);
+            helper.updatePackedBOffsets(packed_input_B.size());
+        }
+
+        // broadcast bias if needed
+        if ((inputs.size() + blobs.size()) >= 3 && blobs.size() >= 2) {
+            const auto bias_mat = blobs.back();
+            const auto bias_shape = shape(bias_mat);
+            bool is_broadcast_needed = real_ndims_C == 0 || real_ndims_C == 1 || (total(bias_shape) != total(C_shape) || bias_shape.size() != C_shape.size());
+
+            if (is_broadcast_needed) {
+                broadcast_bias = Mat(C_shape, CV_32F);
+                auto *broadcast_bias_ptr = broadcast_bias.ptr<float>();
+
+                const auto *bias = bias_mat.ptr<const float>();
+                if (bias_mat.total() == 1) { // [], [1], [1, ...]
+                    float b = (*bias) * beta;
+                    for (size_t i = 0; i < broadcast_bias.total(); i++) {
+                        broadcast_bias_ptr[i] = b;
+                    }
+                } else if (real_ndims_C == 1) { // [n]
+                    size_t inner_size = C_shape.back(),
+                        loops = total(C_shape) / inner_size;
+                    for (size_t i = 0; i < loops; i++) {
+                        size_t step = i * inner_size;
+                        for (size_t j = 0; j < inner_size; j++) {
+                            broadcast_bias_ptr[step + j] = beta * bias[j];
+                        }
+                    }
+                } else {
+                    broadcast(bias_mat, C_shape, broadcast_bias);
+                }
+            } else {
+                broadcast_bias = blobs.back();
+            }
+        }
+
+#ifdef HAVE_OPENCL
+        weight_umat.release();
+        bias_umat.release();
+#endif
+    }
+
+    // works like Y = numpy.matmul(A, B)
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        if (inputs_arr.depth() == CV_16F)
+        {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        const auto &A = inputs[0];
+        auto &Y = outputs[0];
+
+        const auto *a = A.ptr<const float>();
+        auto *y = Y.ptr<float>();
+        // add bias if existed
+        if ((inputs.size() + blobs.size()) >= 3) {
+            const auto &shape_Y = shape(Y);
+            if (blobs.empty()) { // bias from input
+                const auto &bias_mat = inputs.back();
+                const auto *bias = bias_mat.ptr<const float>();
+                if (bias_mat.total() == 1) { // [], [1], [1, ...]
+                    float b = (*bias) * beta;
+                    for (size_t i = 0; i < Y.total(); i++) {
+                        y[i] = b;
+                    }
+                } else if (real_ndims_C == 1) { // [n]
+                    const size_t inner_size = shape_Y.back(),
+                                 batches = total(Y) / inner_size;
+                    parallel_for_(Range(0, batches), [&] (const Range &r) {
+                        for (int i = r.start; i < r.end; i++) {
+                            const size_t output_offset = i * inner_size;
+                            for (size_t j = 0; j < inner_size; j++) {
+                                y[output_offset + j] = beta * bias[j];
+                            }
+                        }
+                    }, double(batches * inner_size * (1 / 1024.0)));
+                } else {
+                    broadcast(bias_mat, shape_Y, Y);
+                }
+            } else { // bias from constant
+                const auto *bias = broadcast_bias.ptr<const float>();
+                std::memcpy(y, bias, total(shape_Y) * sizeof(float));
+            }
+        } else {
+            std::memset(y, 0, Y.total() * sizeof(float));
+        }
+
+        if (blobs.empty()) {
+            const auto &B = inputs[1];
+            const auto *b = B.ptr<const float>();
+            fastGemmBatch(helper.batch, helper.A_offsets.data(), helper.B_offsets.data(), helper.C_offsets.data(),
+                          helper.M, helper.N, helper.K, alpha, a, helper.lda0, helper.lda1,
+                          b, helper.ldb0, helper.ldb1, beta, y, helper.ldc, opt);
+        } else {
+            fastGemmBatch(helper.batch, helper.A_offsets.data(), helper.packed_B_offsets.data(), helper.C_offsets.data(),
+                          helper.M, helper.N, helper.K, alpha, a, helper.lda0, helper.lda1,
+                          packed_input_B.data(), beta, y, helper.ldc, opt);
+        }
+    }
+
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, InputArrayOfArrays internals) {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        bool use_half = (inputs_arr.depth() == CV_16F);
+        inputs_arr.getUMatVector(inputs);
+        outputs_arr.getUMatVector(outputs);
+
+        // does not support bias as input
+        if (inputs.size() >= 3) {
+            return false;
+        }
+
+        const auto &input_A = inputs[0];
+        auto &output = outputs[0];
+        const auto output_shape = shape(output);
+
+        if (blobs.empty()) {
+            weight_umat = inputs[1];
+            if ((inputs.size() + blobs.size() >= 3)) {
+                bias_umat = UMat::zeros(output_shape.size(), output_shape.data(), CV_32F);
+            }
+        } else {
+            if (weight_umat.empty()) {
+                blobs.front().copyTo(weight_umat);
+            }
+            if ((inputs.size() + blobs.size() >= 3)) {
+                if (bias_umat.empty()) {
+                    broadcast_bias.copyTo(bias_umat);
+                }
+            } else {
+                if (bias_umat.empty()) {
+                    bias_umat = UMat::zeros(output_shape.size(), output_shape.data(), CV_32F);
+                }
+            }
+        }
+
+        auto &input_B = weight_umat;
+
+        int M = static_cast<int>(helper.M),
+            N = static_cast<int>(helper.N),
+            K = static_cast<int>(helper.K),
+            batch = static_cast<int>(helper.batch);
+        int batch_A = total(shape(input_A)) / (M * K),
+            batch_B = total(shape(input_B)) / (N * K);
+        MatShape new_shape_A{batch_A, M * K}, new_shape_B{batch_B, N * K}, new_shape_output{batch, M * N};
+
+        const auto input_A_2d = input_A.reshape(1, new_shape_A.size(), &new_shape_A[0]),
+                   input_B_2d = input_B.reshape(1, new_shape_B.size(), &new_shape_B[0]);
+        auto output_2d = output.reshape(1, new_shape_output.size(), &new_shape_output[0]);
+        UMat A, B, C, A_fp32, B_fp32, C_fp32;
+        for (int i = 0; i < batch; i++) {
+            A = input_A_2d.row(helper.A_rows[i]).reshape(1, trans_a ? K : M);
+            B = input_B_2d.row(helper.B_rows[i]).reshape(1, trans_b ? N : K);
+            C = output_2d.row(helper.C_rows[i]).reshape(1, M);
+
+            if (trans_a) {
+                A = A.t();
+            }
+            if (trans_b) {
+                B = B.t();
+            }
+
+            if (use_half) {
+                A.convertTo(A_fp32, CV_32F);
+                B.convertTo(B_fp32, CV_32F);
+                C.convertTo(C_fp32, CV_32F);
+            } else {
+                A_fp32 = A;
+                B_fp32 = B;
+                C_fp32 = C;
+            }
+            cv::gemm(A_fp32, B_fp32, 1.f, noArray(), 0.f, C_fp32);
+            if (use_half) {
+                A_fp32.convertTo(A, CV_16F);
+                B_fp32.convertTo(B, CV_16F);
+                C_fp32.convertTo(C, CV_16F);
+            }
+        }
+
+        // add bias
+        if (!bias_umat.empty()) {
+            cv::add(output, bias_umat, output);
+        }
+
+        return true;
+    }
+#endif // HAVE_OPENCL
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE {
+        auto& input_A_node = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        std::shared_ptr<ov::Node> result;
+        ov::Output<ov::Node> bias;
+
+        if (blobs.empty()) {
+            auto &input_B_node = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
+            result = std::make_shared<ov::op::v0::MatMul>(input_A_node, input_B_node, trans_a, trans_b);
+            if (nodes.size() >= 3) {
+                bias = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
+                result = std::make_shared<ov::op::v1::Add>(result, bias);
+            }
+        } else {
+            auto input_B_shape = getShape<size_t>(blobs[0]);
+            auto input_B_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, input_B_shape, blobs[0].data);
+            result = std::make_shared<ov::op::v0::MatMul>(input_A_node, input_B_node, trans_a, trans_b);
+            if ((nodes.size() + blobs.size()) >= 3) {
+                const auto bias_shape = shape(broadcast_bias);
+                bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32, std::vector<size_t>(bias_shape.begin(), bias_shape.end()), broadcast_bias.data);
+                result = std::make_shared<ov::op::v1::Add>(result, bias);
+            }
+        }
+
+        return Ptr<BackendNode>(new InfEngineNgraphNode(result));
+    }
+#endif // HAVE_DNN_NGRAPH
+
+#ifdef HAVE_VULKAN
+    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                       std::vector<Ptr<BackendWrapper> > &outputs) CV_OVERRIDE {
+        auto input_A_wrapper = inputs[0].dynamicCast<VkComBackendWrapper>();
+        auto output_wrapper = outputs[0].dynamicCast<VkComBackendWrapper>();
+
+        const auto input_A_shape = shape(*input_A_wrapper->getMat());
+        const auto output_shape = shape(*output_wrapper->getMat());
+        if ((inputs.size() + blobs.size()) >= 3 || output_shape.size() != 2) {
+            return Ptr<BackendNode>();
+        }
+
+        std::vector<Mat> constants;
+
+        if (!blobs.empty()) {
+            constants.push_back(blobs[0]);
+        }
+
+        Ptr<vkcom::OpBase> op = new vkcom::OpMatMul(constants, input_A_shape[0], input_A_shape[1], output_shape[1]);
+        return Ptr<BackendNode>(new VkComBackendNode(inputs, op, outputs));
+    }
+#endif
+
+#ifdef HAVE_CUDA
+    Ptr<BackendNode> initCUDA(void *context_,
+                              const std::vector<Ptr<BackendWrapper>>& inputs,
+                              const std::vector<Ptr<BackendWrapper>>& outputs) override {
+        auto context = reinterpret_cast<csl::CSLContext*>(context_);
+        auto input_B = Mat(), bias = Mat();
+        if (!blobs.empty()) {
+            input_B = blobs.front();
+            if (blobs.size() >= 2) {
+                bias = broadcast_bias;
+            }
+        }
+
+        CV_CheckFalse(helper.empty(), "DNN/MatMul/CUDA: MatMulHelper is not initialized");
+
+        return make_cuda_node<cuda4dnn::MatMulBroadcastOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), input_B, bias, trans_a, trans_b, helper.A_offsets, helper.B_offsets, helper.C_offsets, helper.batch);
+    }
+#endif // HAVE_CUDA
+
+#ifdef HAVE_CANN
+    virtual Ptr<BackendNode> initCann(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                      const std::vector<Ptr<BackendWrapper> > &outputs,
+                                      const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE {
+        auto input_A_wrapper = inputs[0].dynamicCast<CannBackendWrapper>();
+        auto input_A_desc = input_A_wrapper->getTensorDesc();
+        auto input_A_node = nodes[0].dynamicCast<CannBackendNode>()->getOp();
+
+        auto op = std::make_shared<ge::op::BatchMatMulV2>(name);
+
+        // set attributes
+        op->set_attr_adj_x1(trans_a);
+        op->set_attr_adj_x2(trans_b);
+
+        // set inputs
+        // set inputs : x1
+        op->set_input_x1_by_name(*input_A_node, input_A_wrapper->name.c_str());
+        op->update_input_desc_x1(*input_A_desc);
+        // set inputs : x2
+        if (blobs.empty()) { // varaible input B
+            auto input_B_wrapper = inputs[1].dynamicCast<CannBackendWrapper>();
+            auto input_B_desc = input_B_wrapper->getTensorDesc();
+            auto input_B_node = nodes[1].dynamicCast<CannBackendNode>()->getOp();
+            op->set_input_x2_by_name(*input_B_node, "y");
+            op->update_input_desc_x2(*input_B_desc);
+            if (inputs.size() >= 3) {
+                auto input_bias_wrapper = inputs[2].dynamicCast<CannBackendWrapper>();
+                auto input_bias_desc = input_bias_wrapper->getTensorDesc();
+                auto input_bias_node = nodes[2].dynamicCast<CannBackendNode>()->getOp();
+                op->set_input_bias_by_name(*input_bias_node, "y");
+                op->update_input_desc_bias(*input_bias_desc);
+            }
+        } else { // constant input B
+            auto B = blobs[0];
+            auto const_B_node = std::make_shared<CannConstOp>(B.data, B.type(), shape(B), cv::format("%s_B", name.c_str()));
+            op->set_input_x2_by_name(*(const_B_node->getOp()), "y");
+            op->update_input_desc_x2(*(const_B_node->getTensorDesc()));
+            if ((inputs.size() + blobs.size()) >= 3) { // does not support broadcast bias
+                auto bias_mat = blobs.back();
+                auto bias_shape = shape(bias_mat);
+
+                // reshape if 1d
+                if (real_ndims_C == 1 && bias_shape.front() != 1) {
+                    bias_shape = std::vector<int>{bias_shape.front()};
+                }
+
+                auto const_bias_node = std::make_shared<CannConstOp>(bias_mat.data, bias_mat.type(), bias_shape, cv::format("%s_bias", name.c_str()));
+                op->set_input_bias_by_name(*(const_bias_node->getOp()), "y");
+                op->update_input_desc_bias(*(const_bias_node->getTensorDesc()));
+            }
+        }
+
+        // set outputs
+        auto output_desc = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
+        op->update_output_desc_y(*output_desc);
+        return Ptr<BackendNode>(new CannBackendNode(op));
+    }
+#endif // HAVE_CANN
+
+ private:
+    bool trans_a;
+    bool trans_b;
+    float alpha;
+    float beta;
+
+    int real_ndims_C;
+
+    std::vector<float> packed_input_B;
+    Mat broadcast_bias;
+
+    FastGemmOpt opt;
+    MatMulHelper helper;
+};
+
+Ptr<MatMulLayer> MatMulLayer::create(const LayerParams& params)
+{
+    return makePtr<MatMulLayerImpl>(params);
+}
+
+}} // cv::dnn
diff --git a/modules/dnn/src/layers/max_unpooling_layer.cpp b/modules/dnn/src/layers/max_unpooling_layer.cpp
index a44d25ce899c..aa645c17aee5 100644
--- a/modules/dnn/src/layers/max_unpooling_layer.cpp
+++ b/modules/dnn/src/layers/max_unpooling_layer.cpp
@@ -13,7 +13,9 @@ Implementation of Batch Normalization layer.
 #include "layers_common.hpp"
 #include "../op_cuda.hpp"
 #include "../op_halide.hpp"
+#include "../ie_ngraph.hpp"
 #include <opencv2/dnn/shape_utils.hpp>
+#include <opencv2/core/utils/logger.hpp>
 
 #ifdef HAVE_CUDA
 #include "../cuda4dnn/primitives/max_unpooling.hpp"
@@ -40,6 +42,7 @@ class MaxUnpoolLayerImpl CV_FINAL : public MaxUnpoolLayer
     {
         return backendId == DNN_BACKEND_OPENCV ||
                backendId == DNN_BACKEND_CUDA ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
                (backendId == DNN_BACKEND_HALIDE && haveHalide() && !poolPad.width && !poolPad.height);
     }
 
@@ -72,7 +75,7 @@ class MaxUnpoolLayerImpl CV_FINAL : public MaxUnpoolLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -110,17 +113,12 @@ class MaxUnpoolLayerImpl CV_FINAL : public MaxUnpoolLayer
                     int index = idxptr[i_wh];
                     if (!(0 <= index && index < outPlaneTotal))
                     {
-                        std::cerr
-                            << "i_n=" << i_n << std::endl
-                            << "i_c=" << i_c << std::endl
-                            << "i_wh=" << i_wh << std::endl
-                            << "index=" << index << std::endl
-                            << "maxval=" << inptr[i_wh] << std::endl
-                            << "outPlaneTotal=" << outPlaneTotal << std::endl
-                            << "input.size=" << input.size << std::endl
-                            << "indices.size=" << indices.size << std::endl
-                            << "outBlob=" << outBlob.size << std::endl
-                            ;
+                        CV_LOG_ERROR(NULL, cv::format(
+                            "i_n=%d\ni_c=%d\ni_wh=%d\nindex=%d\nmaxval=%lf\noutPlaneTotal=%d\n",
+                            i_n, i_c, i_wh, index, inptr[i_wh], outPlaneTotal));
+                        CV_LOG_ERROR(NULL, "input.size=" << input.size);
+                        CV_LOG_ERROR(NULL, "indices.size=" << indices.size);
+                        CV_LOG_ERROR(NULL, "outBlob=" << outBlob.size);
                         CV_Assert(0 <= index && index < outPlaneTotal);
                     }
                     outptr[index] = inptr[i_wh];
@@ -185,6 +183,50 @@ class MaxUnpoolLayerImpl CV_FINAL : public MaxUnpoolLayer
 #endif  // HAVE_HALIDE
         return Ptr<BackendNode>();
     }
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        auto features = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        auto indices = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
+
+        std::vector<MatShape> inpShapes(nodes.size());
+        std::vector<MatShape> outShapes, internals;
+        for (int i = 0; i < nodes.size(); ++i) {
+            std::vector<size_t> shape = nodes[i].dynamicCast<InfEngineNgraphNode>()->node.get_shape();
+            inpShapes[i] = std::vector<int>(shape.begin(), shape.end());
+        }
+        getMemoryShapes(inpShapes, 1, outShapes, internals);
+
+        Mat zeros = Mat::zeros(1, total(outShapes[0]), CV_32F);
+        auto zeroInp = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{zeros.total()}, zeros.data);
+
+        int newShape = -1;
+        features = std::make_shared<ov::op::v1::Reshape>(
+            features,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, &newShape),
+            true
+        );
+        indices = std::make_shared<ov::op::v1::Reshape>(
+            indices,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, &newShape),
+            true
+        );
+        if (indices.get_element_type() != ov::element::i32 && indices.get_element_type() != ov::element::i64) {
+            indices = std::make_shared<ov::op::v0::Convert>(indices, ov::element::i64);
+        }
+
+        int axis = 0;
+        std::shared_ptr<ov::Node> unpool = std::make_shared<ov::op::v3::ScatterElementsUpdate>(zeroInp, indices, features,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, &axis));
+
+        auto shape = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{outShapes[0].size()}, outShapes[0].data());
+        unpool = std::make_shared<ov::op::v1::Reshape>(unpool, shape, true);
+
+        return Ptr<BackendNode>(new InfEngineNgraphNode(unpool));
+    }
+#endif  // HAVE_DNN_NGRAPH
 };
 
 Ptr<MaxUnpoolLayer> MaxUnpoolLayer::create(const LayerParams& params)
diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp
index dc23656b7a78..d59e339ac45e 100644
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -46,6 +46,8 @@
 #include "../ie_ngraph.hpp"
 #include "../op_cuda.hpp"
 
+#include "./cpu_kernels/fast_norm.hpp"
+
 #include <opencv2/dnn/shape_utils.hpp>
 
 #ifdef HAVE_OPENCL
@@ -69,9 +71,12 @@ class MVNLayerImpl CV_FINAL : public MVNLayer
     MVNLayerImpl(const LayerParams& params)
     {
         setParamsFrom(params);
+
+        // Caffe params
         normVariance = params.get<bool>("normalize_variance", true);
         acrossChannels = params.get<bool>("across_channels", false);
         eps = params.get<double>("eps", 1e-9);
+
         fuse_batch_norm = false;
         fuse_relu = false;
         relu_slope = 0.f;
@@ -144,7 +149,7 @@ class MVNLayerImpl CV_FINAL : public MVNLayer
         UMat& bnorm_bias = umat_shift;
 
         const unsigned LOCAL_SIZE = 128;
-        bool use_half = (inputs[0].depth() == CV_16S);
+        bool use_half = (inputs[0].depth() == CV_16F);
         String opts = format(" -DT=%s -DT4=%s -Dconvert_T=%s -DLOCAL_SIZE=%u", use_half ? "half" : "float",
                              use_half ? "half4" : "float4", use_half ? "convert_half4" : "convert_float4",
                              LOCAL_SIZE
@@ -159,7 +164,7 @@ class MVNLayerImpl CV_FINAL : public MVNLayer
             CV_Assert(newRows != 0);
 
             MatShape s = shape(newRows, inpMat.total() / newRows);
-            UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16S : CV_32F);
+            UMat meanMat = UMat(s[0], 1, (use_half) ? CV_16F : CV_32F);
             UMat tmpMat  = UMat(s[0], s[1], CV_32F);
             float alpha = 1.0f / s[1];
 
@@ -221,7 +226,7 @@ class MVNLayerImpl CV_FINAL : public MVNLayer
         if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
             return fast_forward_ocl(inputs, outputs);
 
-        if (inputs[0].depth() == CV_16S)
+        if (inputs[0].depth() == CV_16F)
             return false;
 
         String opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");
@@ -304,79 +309,24 @@ class MVNLayerImpl CV_FINAL : public MVNLayer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
         }
 
-        std::vector<Mat> inputs, outputs, internals;
-        inputs_arr.getMatVector(inputs);
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs); // assume only one input
         outputs_arr.getMatVector(outputs);
-        internals_arr.getMatVector(internals);
-
-        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
-        {
-            Mat &inpBlob = inputs[inpIdx];
-            Mat &outBlob = outputs[inpIdx];
 
-            int splitDim = (acrossChannels) ? 1 : 2;
-            int i, newRows = 1;
-            for( i = 0; i < splitDim; i++ )
-                newRows *= inpBlob.size[i];
+        const auto &input = inputs[0];
 
-            Mat inpMat = inpBlob.reshape(1, newRows);
-            Mat outMat = outBlob.reshape(1, newRows);
-
-            if ( inpBlob.total() == newRows )
-            {
-                // MVN is applied to single values at an every row.
-                if (shift.empty())
-                {
-                    outBlob.setTo(0);
-                }
-                else
-                {
-                    for ( i = 0; i < newRows; i++ )
-                    {
-                        outMat.row(i).setTo(((float*)shift.data)[i]);
-                    }
-                }
-                return;
-            }
-
-            Scalar mean, dev;
-            for ( i = 0; i < newRows; i++)
-            {
-                Mat inpRow = inpMat.row(i);
-                Mat outRow = outMat.row(i);
-                float weight = 1.f;
-                float bias = 0.f;
-                if (fuse_batch_norm)
-                {
-                    weight = i < scale.cols ? ((float*)scale.data)[i] : weight;
-                    bias = i < shift.cols ? ((float*)shift.data)[i] : bias;
-                }
-                cv::meanStdDev(inpRow, mean, (normVariance) ? dev : noArray());
-                double alpha = 1;
-                if (normVariance)
-                {
-                    alpha = 1 / std::sqrt(eps + dev[0]*dev[0]);
-                }
-                double normalizationScale = 1.0;
-                double normalizationShift = 0.0;
-                if (fuse_batch_norm)
-                {
-                    normalizationScale = alpha * weight;
-                    normalizationShift = -mean[0] * normalizationScale + bias;
-                }
-                else
-                {
-                    normalizationScale = alpha;
-                    normalizationShift = -mean[0] * alpha;
-                }
-                inpRow.convertTo(outRow, outRow.type(), normalizationScale, normalizationShift);
-            }
+        if (fuse_batch_norm) { // channel-wise scale/bias of shape (C)
+            CV_CheckTrue(normVariance, "DNN/MVN: not supported");
+            fastNormChannel(input, scale, shift, outputs[0], eps);
+        } else {
+            size_t axis = acrossChannels ? 1 : 2;
+            fastNorm(input, outputs[0], eps, axis, normVariance);
         }
     }
 
@@ -386,15 +336,11 @@ class MVNLayerImpl CV_FINAL : public MVNLayer
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2021_2)
-        auto mvn = std::make_shared<ngraph::op::MVN>(ieInpNode, acrossChannels, normVariance, eps);
-#else
         int64_t start_axis = acrossChannels ? 1 : 2;
-        std::vector<int64_t> axes_v(ieInpNode->get_shape().size() - start_axis);
+        std::vector<int64_t> axes_v(ieInpNode.get_shape().size() - start_axis);
         std::iota(axes_v.begin(), axes_v.end(), start_axis);
-        auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_v.size()}, axes_v.data());
-        auto mvn = std::make_shared<ngraph::op::v6::MVN>(ieInpNode, axes, normVariance, eps, ngraph::op::MVNEpsMode::INSIDE_SQRT);
-#endif
+        auto axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{axes_v.size()}, axes_v.data());
+        auto mvn = std::make_shared<ov::op::v6::MVN>(ieInpNode, axes, normVariance, eps, ov::op::MVNEpsMode::INSIDE_SQRT);
         return Ptr<BackendNode>(new InfEngineNgraphNode(mvn));
     }
 #endif  // HAVE_DNN_NGRAPH
diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp b/modules/dnn/src/layers/nary_eltwise_layers.cpp
index fadbf5824417..e3a8b2a583d2 100644
--- a/modules/dnn/src/layers/nary_eltwise_layers.cpp
+++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp
@@ -7,6 +7,7 @@
 #include "../op_cuda.hpp"
 #include "../op_cann.hpp"
 #include "../ie_ngraph.hpp"
+#include "../op_vkcom.hpp"
 
 #include <opencv2/dnn/shape_utils.hpp>
 
@@ -24,8 +25,171 @@ namespace cv
 namespace dnn
 {
 
+namespace {
+static int _mod(int x, int y) {
+    int res = x % y;
+    if ((res < 0 && y > 0) || (res > 0 && y < 0)) {
+        res += y;
+    }
+    return res;
+}
+}
+
+class NaryEltwiseHelper CV_FINAL
+{
+public:
+    int ninputs;
+    int narrays;
+    int max_ndims;
+    std::vector<int> all_ndims;
+    std::vector<std::vector<int>> orig_shapes;
+    std::vector<std::vector<size_t>> orig_steps;
+    std::vector<char*> ptrs;
+    std::vector<std::vector<int>> shapes;
+    std::vector<std::vector<size_t>> steps;
+    std::vector<size_t> elemsize;
+
+    NaryEltwiseHelper() {
+    }
+
+    void init(const std::vector<Mat>& inputs, const std::vector<Mat>& outputs)
+    {
+        narrays = 0;
+        max_ndims = 0;
+        all_ndims.clear();
+        orig_shapes.clear();
+        orig_steps.clear();
+        ptrs.clear();
+        shapes.clear();
+        steps.clear();
+        elemsize.clear();
+
+        ninputs = inputs.size();
+        narrays = ninputs + 1;
+
+        // collect ndims
+        std::vector<int> v_inp_dims;
+        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_dims), [] (const Mat& m) { return m.dims; });
+        const int* inp_ndims = v_inp_dims.data();
+        int out_ndims = outputs[0].dims;
+
+        // find max ndims for broadcasting
+        int i;
+        max_ndims = out_ndims > 2 ? out_ndims : 2;
+        for(i = 0; i < ninputs; i++)
+            max_ndims = max_ndims > inp_ndims[i] ? max_ndims : inp_ndims[i];
+
+        shapes = std::vector<std::vector<int>>(narrays, std::vector<int>(max_ndims, 0));
+        steps = std::vector<std::vector<size_t>>(narrays, std::vector<size_t>(max_ndims, 0));
+        ptrs = std::vector<char*>(narrays, nullptr);
+
+        for(i = 0; i <= ninputs; i++) {
+            all_ndims.push_back(i == 0 ? out_ndims : inp_ndims[i-1]);
+            std::vector<int> _size;
+            std::vector<size_t> _step;
+            if (!i) {
+                std::transform(outputs[0].size.p, outputs[0].size.p + outputs[0].dims, std::back_inserter(_size), [](int s) { return s; });
+                std::transform(outputs[0].step.p, outputs[0].step.p + outputs[0].dims, std::back_inserter(_step), [](size_t s) { return s; });
+            }
+            else {
+                std::transform(inputs[i-1].size.p, inputs[i-1].size.p + inputs[i-1].dims, std::back_inserter(_size), [](int s) { return s; });
+                std::transform(inputs[i-1].step.p, inputs[i-1].step.p + inputs[i-1].dims, std::back_inserter(_step), [](size_t s) { return s; });
+            }
+            orig_shapes.push_back(_size);
+            orig_steps.push_back(_step);
+
+            int esz = i == 0 ? outputs[0].elemSize() : inputs[i - 1].elemSize();
+            elemsize.push_back(esz);
+        }
+    }
+
+    void reInit(size_t newElemSize) {
+        std::vector<size_t> newElemSizes(elemsize.size(), newElemSize);
+        reInit(newElemSizes);
+    }
+
+    void reInit(std::vector<size_t> newElemSizes) {
+        for (size_t array_index = 0; array_index < orig_steps.size(); array_index++) {
+            auto &step = orig_steps[array_index];
+            int esz = elemsize[array_index];
+            int new_esz = newElemSizes[array_index];
+            for (size_t step_index = 0; step_index < step.size(); step_index++) {
+                step[step_index] = static_cast<size_t>(step[step_index] / esz * new_esz);
+            }
+            elemsize[array_index] = newElemSizes[array_index];
+        }
+        prepare_for_broadcast_op();
+    }
+
+    bool prepare_for_broadcast_op()
+    {
+        int i, j, k;
+
+        // step 1.
+        // * make all inputs and the output max_ndims-dimensional.
+        // ** prepend dimension 1 to the mat of less dims
+        // * compute proper step's
+        for (i = this->max_ndims-1; i >= 0; i--) {
+            for (k = 0; k < this->narrays; k++) {
+                j = this->all_ndims[k] - (this->max_ndims - i);
+                int sz_i = j >= 0 ? this->orig_shapes[k][j] : 1;
+                size_t st_i = j >= 0 && this->orig_steps[k][j] > 0 ? this->orig_steps[k][j] :
+                    i == this->max_ndims-1 ? elemsize[k] : this->steps[k][i+1]*this->shapes[k][i+1];
+                assert(st_i % elemsize[k] == 0);
+                this->shapes[k][i] = sz_i;
+                this->steps[k][i] = st_i;
+                if (this->shapes[k][i] == 0)
+                    return false;
+            }
+        }
+
+        // step 3. Let's do the flattening first,
+        // since we'd need proper values of steps to check continuity.
+        // this loop is probably the most tricky part
+        // in the whole implementation of broadcasting.
+        j = this->max_ndims-1;
+        for (i = j - 1; i >= 0; i--) {
+            bool all_contiguous = true, all_scalars = true, all_consistent = true;
+            for(k = 0; k < this->narrays; k++) {
+                size_t st = this->steps[k][j]*this->shapes[k][j];
+                bool prev_scalar = this->shapes[k][j] == 1;
+                bool scalar = this->shapes[k][i] == 1;
+                all_contiguous = all_contiguous && (st == this->steps[k][i]);
+                all_scalars = all_scalars && scalar;
+                all_consistent = all_consistent && (scalar == prev_scalar);
+            }
+            if (all_contiguous && (all_consistent || all_scalars)) {
+                for(k = 0; k < this->narrays; k++)
+                    this->shapes[k][j] *= this->shapes[k][i];
+            } else {
+                j--;
+                if (i < j) {
+                    for(k = 0; k < this->narrays; k++) {
+                        this->shapes[k][j] = this->shapes[k][i];
+                        this->steps[k][j] = this->steps[k][i];
+                    }
+                }
+            }
+        }
+
+        // step 2. Set some step's to 0's.
+        for (i = this->max_ndims-1; i >= j; i--) {
+            for (k = 0; k < this->narrays; k++)
+                this->steps[k][i] = this->shapes[k][i] == 1 ? 0 : this->steps[k][i];
+        }
+        for (; i >= 0; i--) {
+            for (k = 0; k < this->narrays; k++) {
+                this->steps[k][i] = 0;
+                this->shapes[k][i] = 1;
+            }
+        }
+        return true;
+    }
+};
+
 class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
 {
+    NaryEltwiseHelper helper;
 public:
     enum class OPERATION
     {
@@ -42,7 +206,8 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
         MAX,
         MEAN,
         MIN,
-        MOD,
+        MOD,  // Integer Mod. Reminder's sign = Divisor's sign.
+        FMOD, // Floating-point Mod. Reminder's sign = Dividend's sign.
         PROD,
         SUB,
         SUM,
@@ -79,6 +244,8 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
             op = OPERATION::MIN;
         else if (operation == "mod")
             op = OPERATION::MOD;
+        else if (operation == "fmod")
+            op = OPERATION::FMOD;
         else if (operation == "mul")
             op = OPERATION::PROD;
         else if (operation == "sub")
@@ -106,17 +273,29 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
 #ifdef HAVE_CANN
         if (backendId == DNN_BACKEND_CANN)
             return op == OPERATION::ADD || op == OPERATION::PROD || op == OPERATION::SUB ||
-                   op == OPERATION::DIV || op == OPERATION::MAX  || op == OPERATION::MIN;
+                   op == OPERATION::DIV || op == OPERATION::MAX  || op == OPERATION::MIN ||
+                   op == OPERATION::MOD || op == OPERATION::FMOD;
 #endif
         if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
             return (op == OPERATION::ADD ||
                     op == OPERATION::PROD ||
                     op == OPERATION::GREATER_EQUAL ||
-                    op == OPERATION::LESS_EQUAL
+                    op == OPERATION::LESS_EQUAL ||
+                    op == OPERATION::MOD ||
+                    op == OPERATION::FMOD
             );
-        if (op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM ||
-            op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD)
-            return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA;
+
+#ifdef HAVE_VULKAN
+        if (backendId == DNN_BACKEND_VKCOM)
+            return op == OPERATION::ADD || op == OPERATION::PROD || op == OPERATION::SUB ||
+                   op == OPERATION::DIV ;
+#endif
+
+        if (backendId == DNN_BACKEND_CUDA) {
+            return op == OPERATION::MAX  || op == OPERATION::MIN  || op == OPERATION::SUM ||
+                   op == OPERATION::PROD || op == OPERATION::DIV  || op == OPERATION::ADD ||
+                   op == OPERATION::SUB  || op == OPERATION::MOD || op == OPERATION::FMOD;
+        }
         return backendId == DNN_BACKEND_OPENCV;
     }
 
@@ -148,72 +327,14 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
         return outShape;
     }
 
-    static bool prepare_for_broadcast_op(
-        int narrays, int max_ndims, const size_t* elemsize,
-        const int* ndims, const int** shape_, const size_t** step_,
-        int** shape, size_t** step)
-    {
-        int i, j, k;
-
-        // step 1.
-        // * make all inputs and the output max_ndims-dimensional.
-        // ** prepend dimension 1 to the mat of less dims
-        // * compute proper step's
-        for (i = max_ndims-1; i >= 0; i-- ) {
-            for (k = 0; k < narrays; k++) {
-                j = ndims[k] - (max_ndims - i);
-                int sz_i = j >= 0 ? shape_[k][j] : 1;
-                size_t st_i = j >= 0 && step_ && step_[k] && step_[k][j] > 0 ? step_[k][j] :
-                    i == max_ndims-1 ? elemsize[k] : step[k][i+1]*shape[k][i+1];
-                assert(st_i % elemsize[k] == 0);
-                shape[k][i] = sz_i;
-                step[k][i] = st_i;
-                if (shape[k][i] == 0)
-                    return false;
-            }
-        }
 
-        // step 3. Let's do the flattening first,
-        // since we'd need proper values of steps to check continuity.
-        // this loop is probably the most tricky part
-        // in the whole implementation of broadcasting.
-        j = max_ndims-1;
-        for (i = j - 1; i >= 0; i--) {
-            bool all_contiguous = true, all_scalars = true, all_consistent = true;
-            for(k = 0; k < narrays; k++) {
-                size_t st = step[k][j]*shape[k][j];
-                bool prev_scalar = shape[k][j] == 1;
-                bool scalar = shape[k][i] == 1;
-                all_contiguous = all_contiguous && (st == step[k][i]);
-                all_scalars = all_scalars && scalar;
-                all_consistent = all_consistent && (scalar == prev_scalar);
-            }
-            if (all_contiguous && (all_consistent || all_scalars)) {
-                for(k = 0; k < narrays; k++)
-                    shape[k][j] *= shape[k][i];
-            } else {
-                j--;
-                if (i < j) {
-                    for(k = 0; k < narrays; k++) {
-                        shape[k][j] = shape[k][i];
-                        step[k][j] = step[k][i];
-                    }
-                }
-            }
-        }
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE {
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
 
-        // step 2. Set some step's to 0's.
-        for (i = max_ndims-1; i >= j; i--) {
-            for (k = 0; k < narrays; k++)
-                step[k][i] = shape[k][i] == 1 ? 0 : step[k][i];
-        }
-        for (; i >= 0; i--) {
-            for (k = 0; k < narrays; k++) {
-                step[k][i] = 0;
-                shape[k][i] = 1;
-            }
-        }
-        return true;
+        helper.init(inputs, outputs);
+        CV_Assert(helper.prepare_for_broadcast_op());
     }
 
     bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -228,10 +349,10 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
 
     template <typename T, typename Functor>
     void binary_forward_impl(
-            int ndims, const int* shape,
-            const char* data1, const size_t* step1,
-            const char* data2, const size_t* step2,
-            char* data, const size_t* step,
+            int ndims, const std::vector<int>& shape,
+            const char* data1, const std::vector<size_t>& step1,
+            const char* data2, const std::vector<size_t>& step2,
+            char* data, const std::vector<size_t>& step,
             const Functor& op)
     {
         assert(ndims >= 2);
@@ -287,63 +408,18 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
         const Mat& a = inputs[0];
         const Mat& b = inputs[1];
         Mat& out = outputs[0];
-
-        // collect info of inputs and output
-        const int* in_shape[] = {a.size.p, b.size.p};
-        const size_t* in_step[] = {a.step.p, b.step.p};
-        const int* out_shape = out.size.p;
-        const size_t* out_step = out.step.p;
-        const int in_ndims[] = {a.dims, b.dims};
-        int out_ndims = out.dims;
-
-        int max_ndims = std::max(a.dims, std::max(b.dims, out.dims));
-
-        // buf holds the folllowing for a, b & output:
-        //  * orig_shapes, shapes (result_shape), orig_steps, steps (result_step), 3*4 elements in total
-        //  * shape_buf & step_buf, 3*2*max_ndims elements in total
-        //  * all_ndims, 3*1 elements in total
-        //  * all_type_sizes, 3*1 elements in total
-        AutoBuffer<size_t> buf(3 * (2 * max_ndims + 6));
-
-        int** orig_shapes = (int**)(buf.data());
-        int** shapes = orig_shapes + 3;
-        size_t** orig_steps = (size_t**)(shapes + 3);
-        size_t** steps = orig_steps + 3;
-
-        int* shape_buf = (int*)(steps + 3);
-        size_t* step_buf = (size_t*)(shape_buf + 3 * max_ndims);
-
-        int* all_ndims = (int*)(step_buf + 3 * max_ndims);
-        size_t* all_type_sizes = (size_t*)(all_ndims + 3);
-
-        // assign orig_shapes, shapes, orig_steps, steps, all_ndims, all_type_sizes
-        for (int i = 0; i < 3; i++)
-        {
-            orig_shapes[i] = (int*)(i == 0 ? out_shape : in_shape[i-1]);
-            orig_steps[i] = (size_t*)(i == 0 ? out_step : in_step[i-1]);
-            shapes[i] = shape_buf + i * max_ndims;
-            steps[i] = step_buf + i * max_ndims;
-            all_ndims[i] = i == 0 ? out_ndims : in_ndims[i-1];
-            all_type_sizes[i] = sizeof(T);
-        }
-
-        if (!prepare_for_broadcast_op(3, max_ndims, all_type_sizes,
-                                      all_ndims, (const int**)orig_shapes,
-                                      (const size_t**)orig_steps,
-                                      shapes, steps))
-            return;
-
+        CV_Assert(helper.shapes.size() == 3 && helper.steps.size() == 3);
         binary_forward_impl<T, Functor>(
-                max_ndims, shapes[0], a.ptr<char>(), steps[1],
-                b.ptr<char>(), steps[2], out.ptr<char>(), steps[0],
+                helper.max_ndims, helper.shapes[0], a.ptr<char>(), helper.steps[1],
+                b.ptr<char>(), helper.steps[2], out.ptr<char>(), helper.steps[0],
                 f);
     }
 
     template<typename T, typename Functor>
     void nary_forward_impl(
-        const Functor& f, const T scale, int ninputs, int ndims, const int* shape,
+        const Functor& f, const T scale, int ninputs, int ndims, const std::vector<int>& shape,
         const char** inp, char* out,
-        const size_t** steps, char** ptrs)
+        const std::vector<std::vector<size_t>>& steps, std::vector<char*>& ptrs)
     {
         CV_Assert(ndims >= 2);
         size_t dp = steps[0][ndims-1]/sizeof(T);
@@ -428,77 +504,16 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
         const std::vector<Mat>& inputs, std::vector<Mat>& outputs
         )
     {
-        int ninputs = inputs.size();
-
-        // collect all input
+        // collect all input info
         std::vector<const char*> v_inp;
         std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp), [] (const Mat& m) { return m.template ptr<const char>(); });
         const char** inp = v_inp.data();
 
-        // collect ndims of all input
-        std::vector<int> v_inp_dims;
-        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_dims), [] (const Mat& m) { return m.dims; });
-        const int* inp_ndims = v_inp_dims.data();
-
-        // collect shapes of all input
-        std::vector<const int*> v_inp_shape;
-        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_shape), [] (const Mat& m) { return m.size.p; });
-        const int** inp_shape = v_inp_shape.data();
-
-        // collect steps of all input
-        std::vector<const size_t*> v_inp_step;
-        std::transform(inputs.begin(), inputs.end(), std::back_inserter(v_inp_step), [] (const Mat& m) { return m.step.p; });
-        const size_t** inp_step = v_inp_step.data();
-
-        // collect info of output (ndims, shape, step)
+        // collect output info
         char* out = outputs[0].ptr<char>();
-        int out_ndims = outputs[0].dims;
-        const int* out_shape = outputs[0].size.p;
-        const size_t* out_step = outputs[0].step.p;
-
-        // find max ndims for broadcasting
-        int i, max_ndims = out_ndims > 2 ? out_ndims : 2;
-        for(i = 0; i < ninputs; i++)
-            max_ndims = max_ndims > inp_ndims[i] ? max_ndims : inp_ndims[i];
-
-        // buf holds the following buffers for inputs & output:
-        //  * orig_shapes, shapes (result_shape), orig_steps, steps (result_step), (ninputs+1)*4 elements in total
-        //  * ptrs, (ninputs+1)*1 elements in total
-        //  * shape_buf & step_buf, (ninputs+1)*2*max_ndims elements in total
-        //  * all_ndims, (ninputs+1)*1 elements in total
-        //  * all_type_sizes, (ninputs+1)*1 elements in total
-        AutoBuffer<size_t> buf((ninputs + 1) * (2 * max_ndims + 7));
-
-        int** orig_shapes = (int**)buf.data();
-        int** shapes = orig_shapes + ninputs + 1;
-        size_t** orig_steps = (size_t**)(shapes + ninputs + 1);
-        size_t** steps = orig_steps + ninputs + 1;
-
-        char** ptrs = (char**)(steps + ninputs + 1);
-
-        size_t* step_buf = (size_t*)(ptrs + ninputs + 1);
-        int* shape_buf = (int*)(step_buf + (ninputs + 1)*max_ndims);
-
-        int* all_ndims = shape_buf + (ninputs + 1)*max_ndims;
-        size_t* all_type_sizes = (size_t*)(all_ndims + ninputs + 1);
-
-        for(i = 0; i <= ninputs; i++) {
-            all_ndims[i] = i == 0 ? out_ndims : inp_ndims[i-1];
-            all_type_sizes[i] = sizeof(T);
-            orig_shapes[i] = (int*)(i == 0 ? out_shape : inp_shape ? inp_shape[i-1] : 0);
-            orig_steps[i] = (size_t*)(i == 0 ? out_step : inp_step ? inp_step[i-1] : 0);
-            shapes[i] = shape_buf + max_ndims*i;
-            steps[i] = step_buf + max_ndims*i;
-        }
-
-        if (!prepare_for_broadcast_op(ninputs + 1, max_ndims, all_type_sizes,
-                                      all_ndims, (const int**)orig_shapes,
-                                      (const size_t**)orig_steps,
-                                      shapes, steps))
-            return;
 
         nary_forward_impl<T>(
-                f, scale, ninputs, max_ndims, shapes[0], inp, out, (const size_t **) steps, ptrs);
+                f, scale, helper.ninputs, helper.max_ndims, helper.shapes[0], inp, out, helper.steps, helper.ptrs);
     }
 
     template <typename T, typename Functor>
@@ -509,59 +524,21 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
         const Mat& c = inputs[2];
         Mat& out = outputs[0];
 
-        // collect info of inputs and output
-        const int* in_shape[] = {a.size.p, b.size.p, c.size.p};
-        const size_t* in_step[] = {a.step.p, b.step.p, c.step.p};
-        const int* out_shape = out.size.p;
-        const size_t* out_step = out.step.p;
-        const int in_ndims[] = {a.dims, b.dims, c.dims};
-        int out_ndims = out.dims;
-
-        int max_ndims = std::max(a.dims, std::max(b.dims, std::max(c.dims, out.dims)));
-
-        AutoBuffer<size_t> buf(4 * (2 * max_ndims + 6));
-
-        int** orig_shapes = (int**)(buf.data());
-        int** shapes = orig_shapes + 4;
-        size_t** orig_steps = (size_t**)(shapes + 4);
-        size_t** steps = orig_steps + 4;
-
-        int* shape_buf = (int*)(steps + 4);
-        size_t* step_buf = (size_t*)(shape_buf + 4 * max_ndims);
-
-        int* all_ndims = (int*)(step_buf + 4 * max_ndims);
-        size_t* all_type_sizes = (size_t*)(all_ndims + 4);
-
-        // assign orig_shapes, shapes, orig_steps, steps, all_ndims, all_type_sizes
-        for (int i = 0; i < 4; i++)
-        {
-            orig_shapes[i] = (int*)(i == 0 ? out_shape : in_shape[i-1]);
-            orig_steps[i] = (size_t*)(i == 0 ? out_step : in_step[i-1]);
-            shapes[i] = shape_buf + i * max_ndims;
-            steps[i] = step_buf + i * max_ndims;
-            all_ndims[i] = i == 0 ? out_ndims : in_ndims[i-1];
-            all_type_sizes[i] = sizeof(T);
-        }
-
-        if (!prepare_for_broadcast_op(4, max_ndims, all_type_sizes,
-                                      all_ndims, (const int**)orig_shapes,
-                                      (const size_t**)orig_steps,
-                                      shapes, steps))
-            return;
+        CV_Assert(helper.shapes.size() == 4 && helper.steps.size() == 4);
 
         trinary_forward_impl<T, Functor>(
-                max_ndims, shapes[0], a.ptr<char>(), steps[1], b.ptr<char>(), steps[2],
-                c.ptr<char>(), steps[3], out.ptr<char>(), steps[0],
+                helper.max_ndims, helper.shapes[0], a.ptr<char>(), helper.steps[1], b.ptr<char>(), helper.steps[2],
+                c.ptr<char>(), helper.steps[3], out.ptr<char>(), helper.steps[0],
                 f);
     }
 
     template <typename T, typename Functor>
     void trinary_forward_impl(
-            int ndims, const int* shape,
-            const char* data1, const size_t* step1,
-            const char* data2, const size_t* step2,
-            const char* data3, const size_t* step3,
-            char* data, const size_t* step,
+            int ndims, const std::vector<int>& shape,
+            const char* data1, const std::vector<size_t>& step1,
+            const char* data2, const std::vector<size_t>& step2,
+            const char* data3, const std::vector<size_t>& step3,
+            char* data, const std::vector<size_t>& step,
             const Functor& op)
     {
         assert(ndims >= 2);
@@ -620,8 +597,9 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
+            helper.reInit(sizeof(float));
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
         }
@@ -701,10 +679,16 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
             }
             case OPERATION::MOD:
             {
-                auto mod = [](const uint8_t &a, const uint8_t &b) { return a % b; };
+                auto mod = [] (const T &a, const T &b) { return static_cast<T>(_mod(int(a), int(b))); };
                 binary_forward<T>(mod, std::forward<Args>(args)...);
                 break;
             }
+            case OPERATION::FMOD:
+            {
+                auto fmod = [](const T &a, const T &b) { return std::fmod(a, b); };
+                binary_forward<T>(fmod, std::forward<Args>(args)...);
+                break;
+            }
             case OPERATION::PROD:
             {
                 auto prod = [](const T &a, const T &b) { return a * b; };
@@ -770,15 +754,18 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
         switch (type)
         {
             case CV_8U:
+                // TODO: integrate with type inference
+                helper.reInit(sizeof(uint8_t));
                 opDispatch<uint8_t>(std::forward<Args>(args)...);
                 break;
             case CV_32S:
+                // TODO: integrate with type inference
+                helper.reInit(sizeof(int32_t));
                 opDispatch<int32_t>(std::forward<Args>(args)...);
                 break;
             case CV_32F:
-                CV_Assert(op != OPERATION::BITSHIFT && op != OPERATION::MOD &&
-                          op != OPERATION::AND && op != OPERATION::OR &&
-                          op != OPERATION::XOR);
+                CV_Assert(op != OPERATION::BITSHIFT && op != OPERATION::AND &&
+                          op != OPERATION::OR && op != OPERATION::XOR);
                 opDispatch<float>(std::forward<Args>(args)...);
                 break;
             default:
@@ -795,19 +782,6 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
     {
         auto context = reinterpret_cast<csl::CSLContext*>(context_);
 
-        auto input_0_shape = inputs[0].dynamicCast<CUDABackendWrapper>()->getShape();
-        for (int i = 1; i < inputs.size(); i++)
-        {
-            auto input_i_shape = inputs[i].dynamicCast<CUDABackendWrapper>()->getShape();
-            if (input_0_shape.size() != input_i_shape.size())
-                return Ptr<BackendNode>();
-            // check if the shape can be supported by `eltwise_ops.cu`, or return the default BackendNode
-            for (int j = 0; j < input_0_shape.size(); j++)
-                if (input_0_shape[j] != input_i_shape[j] &&
-                    input_0_shape[j] != 1 && input_i_shape[j] != 1)
-                    return Ptr<BackendNode>();
-        }
-
         cuda4dnn::EltwiseOpType op_ = cuda4dnn::EltwiseOpType::SUM;
         switch (op) {
             case OPERATION::MAX:
@@ -828,6 +802,15 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
             case OPERATION::ADD:
                 op_ = cuda4dnn::EltwiseOpType::SUM;
                 break;
+            case OPERATION::SUB:
+                op_ = cuda4dnn::EltwiseOpType::SUB;
+                break;
+            case OPERATION::MOD:
+                op_ = cuda4dnn::EltwiseOpType::MOD;
+                break;
+            case OPERATION::FMOD:
+                op_ = cuda4dnn::EltwiseOpType::FMOD;
+                break;
             default: return Ptr<BackendNode>(); // return empty cuda_node if the EltwiseOpType is unsupported type.
         };
 
@@ -872,6 +855,8 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
             BUILD_CANN_ELTWISE_OP(OPERATION::DIV,  Xdivy,   name);
             BUILD_CANN_ELTWISE_OP(OPERATION::MAX,  Maximum, name);
             BUILD_CANN_ELTWISE_OP(OPERATION::MIN,  Minimum, name);
+            BUILD_CANN_ELTWISE_OP(OPERATION::MOD,  Mod,     name);
+            BUILD_CANN_ELTWISE_OP(OPERATION::FMOD, Mod,     name);
 #undef BUILD_CANN_ELTWISE_OP
             default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
         }
@@ -900,29 +885,49 @@ class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
         auto& inp0 = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
         auto& inp1 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
 
-        if (inp0->get_element_type() != inp1->get_element_type()) {
+        if (inp0.get_element_type() != inp1.get_element_type()) {
             auto dtype = preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD ?
-                        ngraph::element::f16 : ngraph::element::f32;
-            if (inp0->get_element_type() != dtype)
-                inp0 = std::make_shared<ngraph::op::v0::Convert>(inp0, dtype);
-            if (inp1->get_element_type() != dtype)
-                inp1 = std::make_shared<ngraph::op::v0::Convert>(inp1, dtype);
+                        ov::element::f16 : ov::element::f32;
+            if (inp0.get_element_type() != dtype)
+                inp0 = std::make_shared<ov::op::v0::Convert>(inp0, dtype);
+            if (inp1.get_element_type() != dtype)
+                inp1 = std::make_shared<ov::op::v0::Convert>(inp1, dtype);
         }
 
-        std::shared_ptr<ngraph::Node> node;
+        std::shared_ptr<ov::Node> node;
         if (op == OPERATION::ADD)
-            node = std::make_shared<ngraph::op::v1::Add>(inp0, inp1);
+            node = std::make_shared<ov::op::v1::Add>(inp0, inp1);
         else if (op == OPERATION::PROD)
-            node = std::make_shared<ngraph::op::v1::Multiply>(inp0, inp1);
+            node = std::make_shared<ov::op::v1::Multiply>(inp0, inp1);
         else if (op == OPERATION::GREATER_EQUAL)
-            node = std::make_shared<ngraph::op::v1::GreaterEqual>(inp0, inp1);
+            node = std::make_shared<ov::op::v1::GreaterEqual>(inp0, inp1);
         else if (op == OPERATION::LESS_EQUAL)
-            node = std::make_shared<ngraph::op::v1::LessEqual>(inp0, inp1);
+            node = std::make_shared<ov::op::v1::LessEqual>(inp0, inp1);
+        // Ideally we should do this but int32 internal blobs are converted to float32 data type in inference.
+        // TODO: Remove data type convertion when we have type inference.
+        else if (op == OPERATION::MOD) {
+            auto inp0_i64 = std::make_shared<ov::op::v0::Convert>(inp0, ov::element::i64);
+            auto inp1_i64 = std::make_shared<ov::op::v0::Convert>(inp1, ov::element::i64);
+            auto mod = std::make_shared<ov::op::v1::FloorMod>(inp0_i64, inp1_i64);
+            node = std::make_shared<ov::op::v0::Convert>(mod, ov::element::f32);
+        }
+        else if (op == OPERATION::FMOD)
+            node = std::make_shared<ov::op::v1::Mod>(inp0, inp1);
         else
             CV_Error(Error::StsNotImplemented, "Operation is not implemented for nGraph backend");
         return Ptr<BackendNode>(new InfEngineNgraphNode(node));
     }
 #endif
+
+#ifdef HAVE_VULKAN
+    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                       std::vector<Ptr<BackendWrapper> > &outputs) CV_OVERRIDE
+    {
+        Ptr<vkcom::OpBase> op = makePtr<vkcom::OpNary>((vkcom::OpNary::OPERATION) this->op, helper.ninputs, helper.max_ndims, helper.shapes, helper.steps);
+        return Ptr<BackendNode>(makePtr<VkComBackendNode>(inputs, op, outputs));
+    }
+#endif
+
 };
 
 Ptr<NaryEltwiseLayer> NaryEltwiseLayer::create(const LayerParams& params)
diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp
index f0ad6e6f6171..d2324b2a9404 100644
--- a/modules/dnn/src/layers/normalize_bbox_layer.cpp
+++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp
@@ -112,7 +112,7 @@ class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer
         std::vector<UMat> outputs;
         std::vector<UMat> internals;
 
-        if (inputs_.depth() == CV_16S)
+        if (inputs_.depth() == CV_16F)
             return false;
 
         inputs_.getUMatVector(inputs);
@@ -193,7 +193,7 @@ class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -273,32 +273,28 @@ class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        const size_t batch = ieInpNode->get_shape()[0];
-        const size_t numChannels = ieInpNode->get_shape()[1];
+        const size_t batch = ieInpNode.get_shape()[0];
+        const size_t numChannels = ieInpNode.get_shape()[1];
 
         std::vector<int64_t> axes_data;
         if (!acrossSpatial) {
             axes_data.push_back(1);
         } else {
-            axes_data.resize(ieInpNode->get_shape().size() - 1);
+            axes_data.resize(ieInpNode.get_shape().size() - 1);
             std::iota(axes_data.begin(), axes_data.end(), 1);
         }
-        auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_data.size()}, axes_data);
-        auto norm = std::make_shared<ngraph::op::v0::NormalizeL2>(ieInpNode, axes, epsilon, ngraph::op::EpsMode::ADD);
+        auto axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{axes_data.size()}, axes_data);
+        auto norm = std::make_shared<ov::op::v0::NormalizeL2>(ieInpNode, axes, epsilon, ov::op::EpsMode::ADD);
 
         CV_Assert(blobs.empty() || numChannels == blobs[0].total());
-        std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
+        std::vector<size_t> shape(ieInpNode.get_shape().size(), 1);
         shape[0] = blobs.empty() ? 1 : batch;
         shape[1] = numChannels;
         if (!blobs.empty())
         {
-            auto weight = std::make_shared<ngraph::op::Constant>(
-                                      ngraph::element::f32, ngraph::Shape(shape), blobs[0].data);
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2021_2)
-            auto mul = std::make_shared<ngraph::op::v1::Multiply>(norm, weight, ngraph::op::AutoBroadcastType::NUMPY);
-#else
-            auto mul = std::make_shared<ngraph::op::v0::Multiply>(norm, weight, ngraph::op::AutoBroadcastType::NUMPY);
-#endif
+            auto weight = std::make_shared<ov::op::v0::Constant>(
+                                      ov::element::f32, ov::Shape(shape), blobs[0].data);
+            auto mul = std::make_shared<ov::op::v1::Multiply>(norm, weight, ov::op::AutoBroadcastType::NUMPY);
             return Ptr<BackendNode>(new InfEngineNgraphNode(mul));
         }
         return Ptr<BackendNode>(new InfEngineNgraphNode(norm));
diff --git a/modules/dnn/src/layers/padding_layer.cpp b/modules/dnn/src/layers/padding_layer.cpp
index f66d44b222a6..947bca86181e 100644
--- a/modules/dnn/src/layers/padding_layer.cpp
+++ b/modules/dnn/src/layers/padding_layer.cpp
@@ -129,17 +129,7 @@ class PaddingLayerImpl CV_FINAL : public PaddingLayer
 
         if (paddingType == "constant")
         {
-            if (inputs_arr.depth() == CV_16S)
-            {
-                std::vector<float> paddingValue_fp32(1, paddingValue);
-                std::vector<int16_t> paddingValue_fp16(1);
-                cv::convertFp16(paddingValue_fp32, paddingValue_fp16);
-                outputs[0].setTo(paddingValue_fp16[0]);
-            }
-            else if (inputs_arr.depth() == CV_8S)
-                outputs[0].setTo(saturate_cast<int8_t>(paddingValue));
-            else
-                outputs[0].setTo(paddingValue);
+            outputs[0].setTo(paddingValue);
             inputs[0].copyTo(outputs[0](dstRanges));
         }
         else if (paddingType == "reflect" || paddingType == "edge")
@@ -278,14 +268,14 @@ class PaddingLayerImpl CV_FINAL : public PaddingLayer
             begins[i] = static_cast<int64_t>(paddings[i].first);
             ends[i]   = static_cast<int64_t>(paddings[i].second);
         }
-        auto padding_below = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{begins.size()}, begins.data());
-        auto padding_above = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{ends.size()}, ends.data());
-        auto pad_mode = paddingType == "constant" ? ngraph::op::PadMode::CONSTANT : ngraph::op::PadMode::REFLECT; // SYMMETRIC
-        auto arg_pad_value = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{}, &paddingValue);;
+        auto padding_below = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{begins.size()}, begins.data());
+        auto padding_above = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{ends.size()}, ends.data());
+        auto pad_mode = paddingType == "constant" ? ov::op::PadMode::CONSTANT : ov::op::PadMode::REFLECT; // SYMMETRIC
+        auto arg_pad_value = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, &paddingValue);;
 
         auto pad = paddingType == "constant" ?
-             std::make_shared<ngraph::op::v1::Pad>(ieInpNode, padding_below, padding_above, arg_pad_value, pad_mode) :
-             std::make_shared<ngraph::op::v1::Pad>(ieInpNode, padding_below, padding_above, pad_mode);
+             std::make_shared<ov::op::v1::Pad>(ieInpNode, padding_below, padding_above, arg_pad_value, pad_mode) :
+             std::make_shared<ov::op::v1::Pad>(ieInpNode, padding_below, padding_above, pad_mode);
         return Ptr<BackendNode>(new InfEngineNgraphNode(pad));
     }
 #endif
diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp
index 4e6ca2543d39..7304302314a0 100644
--- a/modules/dnn/src/layers/permute_layer.cpp
+++ b/modules/dnn/src/layers/permute_layer.cpp
@@ -319,7 +319,7 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer
             mnew_stride.copyTo(unew_stride);
         }
 
-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
         String opts = format("-DDtype=%s", use_half ? "half" : "float");
         for (size_t i = 0; i < inputs.size(); i++)
         {
@@ -350,7 +350,7 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer
                    inputs_arr.depth() != CV_8S,
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -475,9 +475,9 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer
     {
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
         std::vector<int64_t> order(_order.begin(), _order.end());
-        auto tr_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                       ngraph::Shape({order.size()}), order.data());
-        auto transpose = std::make_shared<ngraph::op::Transpose>(ieInpNode, tr_axes);
+        auto tr_axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                       ov::Shape({order.size()}), order.data());
+        auto transpose = std::make_shared<ov::op::v1::Transpose>(ieInpNode, tr_axes);
         return Ptr<BackendNode>(new InfEngineNgraphNode(transpose));
     }
 #endif  // HAVE_DNN_NGRAPH
diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp
index c58405507e2c..fbf075f7d3f3 100644
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -51,13 +51,8 @@
 
 #ifdef HAVE_DNN_NGRAPH
 #include "../ie_ngraph.hpp"
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-#include <ngraph/op/roi_pooling.hpp>
-#include <ngraph/op/psroi_pooling.hpp>
-#else
-#include <ngraph/op/experimental/layers/roi_pooling.hpp>
-#include <ngraph/op/experimental/layers/psroi_pooling.hpp>
-#endif
+#include <openvino/op/roi_pooling.hpp>
+#include <openvino/op/psroi_pooling.hpp>
 #endif
 
 #include "../op_vkcom.hpp"
@@ -209,7 +204,8 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer
 #ifdef HAVE_INF_ENGINE
         if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         {
-            return !computeMaxIdx && type != STOCHASTIC && kernel_size.size() > 1 && (kernel_size.size() != 3 || !isArmComputePlugin());
+            return type != STOCHASTIC && kernel_size.size() > 1 && (kernel_size.size() != 3 || !isArmComputePlugin()) &&
+                   (!computeMaxIdx || INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1));
         }
 #endif
         if (backendId == DNN_BACKEND_OPENCV)
@@ -292,7 +288,7 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer
         std::vector<UMat> inputs;
         std::vector<UMat> outputs;
 
-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
         inps.getUMatVector(inputs);
         outs.getUMatVector(outputs);
 
@@ -352,7 +348,7 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer
             CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                        forward_ocl(inputs_arr, outputs_arr, internals_arr))
         }
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -587,20 +583,20 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer
         CV_Assert_N((inputs.size() == 1 && (type == MAX || type == AVE || type == SUM)) || inputs.size() == 2, nodes.size() == inputs.size());
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
 
-        ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT;
+        ov::op::PadType pad_type = ov::op::PadType::EXPLICIT;
         if (!padMode.empty())
-            pad_type = padMode == "VALID" ? ngraph::op::PadType::VALID : ngraph::op::PadType::SAME_UPPER;
+            pad_type = padMode == "VALID" ? ov::op::PadType::VALID : ov::op::PadType::SAME_UPPER;
 
-        auto rounding_type = ceilMode ? ngraph::op::RoundingType::CEIL : ngraph::op::RoundingType::FLOOR;
+        auto rounding_type = ceilMode ? ov::op::RoundingType::CEIL : ov::op::RoundingType::FLOOR;
         if (type == AVE) {
             auto exclude_pad = !avePoolPaddedArea;
-            auto ave_pool = std::make_shared<ngraph::op::v1::AvgPool>(ieInpNode, ngraph::Strides(strides),
-                            ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
+            auto ave_pool = std::make_shared<ov::op::v1::AvgPool>(ieInpNode, ov::Strides(strides),
+                            ov::Shape(pads_begin), ov::Shape(pads_end), ov::Shape(kernel_size),
                             exclude_pad, rounding_type, pad_type);
             return Ptr<BackendNode>(new InfEngineNgraphNode(ave_pool));
         }
         else if (type == SUM) {
-            ngraph::Shape inpShape = ieInpNode->get_shape();
+            ov::Shape inpShape = ieInpNode.get_shape();
             CV_Assert(inpShape.size() == 2 + kernel_size.size());
             std::vector<int64_t> axes;
             for (size_t i = 0; i < kernel_size.size(); i++)
@@ -608,25 +604,33 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer
                 if (inpShape[2 + i] == kernel_size[i])
                     axes.push_back(2 + i);
             }
-            auto reduction_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes.size()}, axes);
-            auto reduce_sum = std::make_shared<ngraph::op::v1::ReduceSum>(ieInpNode, reduction_axes, true);
+            auto reduction_axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{axes.size()}, axes);
+            auto reduce_sum = std::make_shared<ov::op::v1::ReduceSum>(ieInpNode, reduction_axes, true);
             return Ptr<BackendNode>(new InfEngineNgraphNode(reduce_sum));
         }
         else if (type == MAX) {
-            auto max_pool = std::make_shared<ngraph::op::v1::MaxPool>(ieInpNode, ngraph::Strides(strides),
-                            ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
-                            rounding_type, pad_type);
+            std::shared_ptr<ov::Node> max_pool;
+            if (computeMaxIdx) {
+                std::vector<size_t> dilations(kernel_size.size(), 1);
+                max_pool = std::make_shared<ov::op::v8::MaxPool>(ieInpNode, ov::Strides(strides), ov::Strides(dilations),
+                                ov::Shape(pads_begin), ov::Shape(pads_end), ov::Shape(kernel_size),
+                                rounding_type, pad_type);
+            } else {
+                max_pool = std::make_shared<ov::op::v1::MaxPool>(ieInpNode, ov::Strides(strides),
+                                ov::Shape(pads_begin), ov::Shape(pads_end), ov::Shape(kernel_size),
+                                rounding_type, pad_type);
+            }
             return Ptr<BackendNode>(new InfEngineNgraphNode(max_pool));
         }
         else if (type == ROI) {
             auto& coords = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
-            auto roi = std::make_shared<ngraph::op::ROIPooling>(ieInpNode, coords,
-                       ngraph::Shape{(size_t)pooledSize.height, (size_t)pooledSize.width}, spatialScale, "max");
+            auto roi = std::make_shared<ov::op::v0::ROIPooling>(ieInpNode, coords,
+                       ov::Shape{(size_t)pooledSize.height, (size_t)pooledSize.width}, spatialScale, "max");
             return Ptr<BackendNode>(new InfEngineNgraphNode(roi));
         }
         else if (type == PSROI) {
             auto& coords = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
-            auto psroi = std::make_shared<ngraph::op::PSROIPooling>(ieInpNode, coords,
+            auto psroi = std::make_shared<ov::op::v0::PSROIPooling>(ieInpNode, coords,
                          (size_t)psRoiOutChannels, (size_t)pooledSize.width, spatialScale, 1, 1, "average");
             return Ptr<BackendNode>(new InfEngineNgraphNode(psroi));
         }
@@ -885,25 +889,25 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer
                                 v_float32x4 max_idx0 = v_setall_f32(-1.f);
                                 v_float32x4 max_idx1 = max_idx0;
                                 int index0 = ystart * inp_width + xstart;
-                                v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
-                                v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
+                                v_float32x4 idx0 = v_add(idx00, v_setall_f32((float)index0));
+                                v_float32x4 idx1 = v_add(idx0, v_setall_f32((float)(stride_w * 4)));
 
                                 for (int y = ystart; y < yend; ++y)
                                 {
-                                    for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
+                                    for (int x = xstart; x < xend; ++x, idx0 = v_add(idx0, ones), idx1 = v_add(idx1, ones))
                                     {
                                         const int index = y * inp_width + x;
                                         v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                        srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                         v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                        srcData[index + stride_w*6], srcData[index + stride_w*7]);
-                                        max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
-                                        max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
+                                        max_idx0 = v_select(v_gt(v0, max_val0), idx0, max_idx0);
+                                        max_idx1 = v_select(v_gt(v1, max_val1), idx1, max_idx1);
                                         max_val0 = v_max(max_val0, v0);
                                         max_val1 = v_max(max_val1, v1);
                                     }
-                                    idx0 += idx_delta;
-                                    idx1 += idx_delta;
+                                    idx0 = v_add(idx0, idx_delta);
+                                    idx1 = v_add(idx1, idx_delta);
                                 }
                                 v_store(dstData + x0, max_val0);
                                 v_store(dstData + x0 + 4, max_val1);
@@ -1056,12 +1060,12 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer
                                                    srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                     v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                    srcData[index + stride_w*6], srcData[index + stride_w*7]);
-                                    sum_val0 += v0;
-                                    sum_val1 += v1;
+                                    sum_val0 = v_add(sum_val0, v0);
+                                    sum_val1 = v_add(sum_val1, v1);
                                 }
                             }
-                            v_store(dstData + x0, sum_val0*ikarea);
-                            v_store(dstData + x0 + 4, sum_val1*ikarea);
+                            v_store(dstData + x0, v_mul(sum_val0, ikarea));
+                            v_store(dstData + x0 + 4, v_mul(sum_val1, ikarea));
                             x0 += 7;
                         }
                         else
diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp
index bf25927480d5..bb3aa99cd31f 100644
--- a/modules/dnn/src/layers/prior_box_layer.cpp
+++ b/modules/dnn/src/layers/prior_box_layer.cpp
@@ -47,13 +47,8 @@
 
 #ifdef HAVE_DNN_NGRAPH
 #include "../ie_ngraph.hpp"
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-#include <ngraph/op/prior_box.hpp>
-#include <ngraph/op/prior_box_clustered.hpp>
-#else
-#include <ngraph/op/experimental/layers/prior_box.hpp>
-#include <ngraph/op/experimental/layers/prior_box_clustered.hpp>
-#endif
+#include <openvino/op/prior_box.hpp>
+#include <openvino/op/prior_box_clustered.hpp>
 #endif
 
 #include "../op_vkcom.hpp"
@@ -346,7 +341,7 @@ class PriorBoxLayerImpl CV_FINAL : public PriorBoxLayer
         std::vector<UMat> inputs;
         std::vector<UMat> outputs;
 
-        bool use_half = (inps.depth() == CV_16S);
+        bool use_half = (inps.depth() == CV_16F);
         inps.getUMatVector(inputs);
         outs.getUMatVector(outputs);
 
@@ -431,7 +426,7 @@ class PriorBoxLayerImpl CV_FINAL : public PriorBoxLayer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -513,23 +508,23 @@ class PriorBoxLayerImpl CV_FINAL : public PriorBoxLayer
         CV_Assert(nodes.size() == 2);
         auto layer = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
         auto image = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
-        auto layer_shape = std::make_shared<ngraph::op::ShapeOf>(layer);
-        auto image_shape = std::make_shared<ngraph::op::ShapeOf>(image);
+        auto layer_shape = std::make_shared<ov::op::v3::ShapeOf>(layer);
+        auto image_shape = std::make_shared<ov::op::v3::ShapeOf>(image);
 
-        auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{2});
-        auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{4});
-        auto strides      = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{1});
+        auto lower_bounds = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{2});
+        auto upper_bounds = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{4});
+        auto strides      = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{1});
 
-        auto slice_layer = std::make_shared<ngraph::op::v1::StridedSlice>(layer_shape,
+        auto slice_layer = std::make_shared<ov::op::v1::StridedSlice>(layer_shape,
                                             lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
-        auto slice_image = std::make_shared<ngraph::op::v1::StridedSlice>(image_shape,
+        auto slice_image = std::make_shared<ov::op::v1::StridedSlice>(image_shape,
                                             lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
 
         if (_explicitSizes)
         {
             CV_Assert_N(!_boxWidths.empty(), !_boxHeights.empty(), !_variance.empty());
             CV_Assert(_boxWidths.size() == _boxHeights.size());
-            ngraph::op::PriorBoxClusteredAttrs attrs;
+            ov::op::v0::PriorBoxClustered::Attributes attrs;
             attrs.widths = _boxWidths;
             attrs.heights = _boxHeights;
             attrs.clip = _clip;
@@ -539,14 +534,14 @@ class PriorBoxLayerImpl CV_FINAL : public PriorBoxLayer
             attrs.step_widths = _stepX;
             attrs.variances = _variance;
 
-            auto priorBox = std::make_shared<ngraph::op::PriorBoxClustered>(slice_layer, slice_image, attrs);
-            auto axis = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{0});
-            auto unsqueeze = std::make_shared<ngraph::op::v0::Unsqueeze>(priorBox, axis);
+            auto priorBox = std::make_shared<ov::op::v0::PriorBoxClustered>(slice_layer, slice_image, attrs);
+            auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{0});
+            auto unsqueeze = std::make_shared<ov::op::v0::Unsqueeze>(priorBox, axis);
             return Ptr<BackendNode>(new InfEngineNgraphNode(unsqueeze));
         }
         else
         {
-            ngraph::op::PriorBoxAttrs attrs;
+            ov::op::v0::PriorBox::Attributes attrs;
             attrs.min_size = _minSize;
             attrs.max_size = _maxSize;
             // doesn't work with empty aspectRatio
@@ -560,9 +555,9 @@ class PriorBoxLayerImpl CV_FINAL : public PriorBoxLayer
             attrs.step = _stepX;
             attrs.scale_all_sizes = !_aspectRatios.empty();
 
-            auto priorBox = std::make_shared<ngraph::op::PriorBox>(slice_layer, slice_image, attrs);
-            auto axis = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{1}, std::vector<int64_t>{0});
-            auto unsqueeze = std::make_shared<ngraph::op::v0::Unsqueeze>(priorBox, axis);
+            auto priorBox = std::make_shared<ov::op::v0::PriorBox>(slice_layer, slice_image, attrs);
+            auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{0});
+            auto unsqueeze = std::make_shared<ov::op::v0::Unsqueeze>(priorBox, axis);
             return Ptr<BackendNode>(new InfEngineNgraphNode(unsqueeze));
         }
     }
diff --git a/modules/dnn/src/layers/proposal_layer.cpp b/modules/dnn/src/layers/proposal_layer.cpp
index e9edcf1547cc..66559ab9ffc7 100644
--- a/modules/dnn/src/layers/proposal_layer.cpp
+++ b/modules/dnn/src/layers/proposal_layer.cpp
@@ -10,11 +10,7 @@
 
 #ifdef HAVE_DNN_NGRAPH
 #include "../ie_ngraph.hpp"
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-#include <ngraph/op/proposal.hpp>
-#else
-#include <ngraph/op/experimental/layers/proposal.hpp>
-#endif
+#include <openvino/op/proposal.hpp>
 #endif
 
 namespace cv { namespace dnn {
@@ -186,7 +182,7 @@ class ProposalLayerImpl CV_FINAL : public ProposalLayer
         std::vector<UMat> outputs;
         std::vector<UMat> internals;
 
-        if (inputs_.depth() == CV_16S)
+        if (inputs_.depth() == CV_16F)
             return false;
 
         inputs_.getUMatVector(inputs);
@@ -269,7 +265,7 @@ class ProposalLayerImpl CV_FINAL : public ProposalLayer
                    OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -344,7 +340,7 @@ class ProposalLayerImpl CV_FINAL : public ProposalLayer
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         CV_Assert(nodes.size() == 3);
-        ngraph::op::ProposalAttrs attr;
+        ov::op::v0::Proposal::Attributes attr;
         attr.base_size     = baseSize;
         attr.nms_thresh    = nmsThreshold;
         attr.feat_stride   = featStride;
@@ -366,13 +362,13 @@ class ProposalLayerImpl CV_FINAL : public ProposalLayer
         auto& class_logits = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
         auto& image_shape  = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
 
-        CV_Assert_N(image_shape->get_shape().size() == 2, image_shape->get_shape().front() == 1);
-        auto shape   = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                       ngraph::Shape{1},
-                       std::vector<int64_t>{(int64_t)image_shape->get_shape().back()});
-        auto reshape = std::make_shared<ngraph::op::v1::Reshape>(image_shape, shape, true);
+        CV_Assert_N(image_shape.get_shape().size() == 2, image_shape.get_shape().front() == 1);
+        auto shape   = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                       ov::Shape{1},
+                       std::vector<int64_t>{(int64_t)image_shape.get_shape().back()});
+        auto reshape = std::make_shared<ov::op::v1::Reshape>(image_shape, shape, true);
 
-        auto proposal = std::make_shared<ngraph::op::Proposal>(class_probs, class_logits, reshape, attr);
+        auto proposal = std::make_shared<ov::op::v0::Proposal>(class_probs, class_logits, reshape, attr);
         return Ptr<BackendNode>(new InfEngineNgraphNode(proposal));
     }
 #endif  // HAVE_DNN_NGRAPH
diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index dc973816ef76..7448511816c0 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -390,7 +390,7 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -906,7 +906,7 @@ class RNNLayerImpl : public RNNLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -1066,7 +1066,7 @@ class GRULayerImpl CV_FINAL : public GRULayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
diff --git a/modules/dnn/src/layers/reduce_layer.cpp b/modules/dnn/src/layers/reduce_layer.cpp
index b983a791c563..d9d8b111fde2 100644
--- a/modules/dnn/src/layers/reduce_layer.cpp
+++ b/modules/dnn/src/layers/reduce_layer.cpp
@@ -380,9 +380,10 @@ class ReduceLayerImpl CV_FINAL : public ReduceLayer
                         if (unprojected_indices[j] < shape_src[unreduced_axes[j]]) {
                             break;
                         }
-                        unprojected_indices[j] = 0;
+                        unprojected_indices[j] -= shape_src[unreduced_axes[j]];
+                        current_step -= shape_src[unreduced_axes[j]] * steps_src[unreduced_axes[j]];
                         ++unprojected_indices[j - 1];
-                        current_step = steps_src[unreduced_axes[j - 1]];
+                        current_step += steps_src[unreduced_axes[j - 1]];
                     }
                 }
             }
@@ -425,7 +426,7 @@ class ReduceLayerImpl CV_FINAL : public ReduceLayer
             dtype* p_dst = dst.ptr<dtype>();
 
             size_t main_index = start / last_unreduced_dim;
-            size_t loop = start / last_unreduced_dim;
+            size_t loop = start % last_unreduced_dim;
             size_t origin = unprojected_steps[main_index] + loop * last_unreduced_step;
             for (int i = start; i < end; ++i) {
                 Op accumulator(n_reduce, p_src[origin + projected_steps[0]]);
@@ -456,7 +457,7 @@ class ReduceLayerImpl CV_FINAL : public ReduceLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
diff --git a/modules/dnn/src/layers/region_layer.cpp b/modules/dnn/src/layers/region_layer.cpp
index 7ab8cdd93ff9..409cdfa38b4e 100644
--- a/modules/dnn/src/layers/region_layer.cpp
+++ b/modules/dnn/src/layers/region_layer.cpp
@@ -45,6 +45,7 @@
 #include <opencv2/dnn/shape_utils.hpp>
 #include <opencv2/dnn/all_layers.hpp>
 #include "../nms.inl.hpp"
+#include "cpu_kernels/softmax.hpp"
 
 #ifdef HAVE_OPENCL
 #include "opencl_kernels_dnn.hpp"
@@ -59,7 +60,6 @@
 using namespace cv::dnn::cuda4dnn;
 #endif
 
-
 namespace cv
 {
 namespace dnn
@@ -121,7 +121,7 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
     {
 #ifdef HAVE_DNN_NGRAPH
     if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
-        return INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_2) && preferableTarget != DNN_TARGET_MYRIAD && new_coords == 0;
+        return preferableTarget != DNN_TARGET_MYRIAD && new_coords == 0;
 #endif
 #ifdef HAVE_CUDA
         if (backendId == DNN_BACKEND_CUDA)
@@ -160,7 +160,7 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
         std::vector<UMat> outputs;
 
         // TODO: implement a logistic activation to classification scores.
-        if (useLogistic || inps.depth() == CV_16S)
+        if (useLogistic || inps.depth() == CV_16F)
             return false;
 
         inps.getUMatVector(inputs);
@@ -231,7 +231,7 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -280,10 +280,8 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
                 }
 
                 if (useSoftmax) {  // Yolo v2
-                    for (int i = 0; i < batch_size*rows*cols*anchors; ++i) {
-                        int index = cell_size*i;
-                        softmax_activate(srcData + index + 5, classes, 1, dstData + index + 5);
-                    }
+                    Mat _inpBlob = inpBlob.reshape(0, outBlob.dims, outBlob.size);
+                    softmax(outBlob, _inpBlob, -1, 5, classes);
                 }
                 else if (useLogistic) {  // Yolo v3
                     for (int i = 0; i < batch_size*rows*cols*anchors; ++i){
@@ -466,7 +464,7 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto& input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        auto parent_shape = input->get_shape();
+        auto parent_shape = input.get_shape();
         int64_t b = parent_shape[0];
         int64_t h = parent_shape[1];
         int64_t w = parent_shape[2];
@@ -474,55 +472,55 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
 
         int64_t cols = b * h * w * anchors;
         int64_t rows = c / anchors;
-        auto shape_node = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2},  std::vector<int64_t>{cols, rows});
-        auto tr_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{1, 0});
+        auto shape_node = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2},  std::vector<int64_t>{cols, rows});
+        auto tr_axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, std::vector<int64_t>{1, 0});
 
-        std::shared_ptr<ngraph::Node> input2d;
+        std::shared_ptr<ov::Node> input2d;
         {
-            input2d = std::make_shared<ngraph::op::v1::Reshape>(input, shape_node, true);
-            input2d = std::make_shared<ngraph::op::Transpose>(input2d, tr_axes);
+            input2d = std::make_shared<ov::op::v1::Reshape>(input, shape_node, true);
+            input2d = std::make_shared<ov::op::v1::Transpose>(input2d, tr_axes);
         }
 
-        std::shared_ptr<ngraph::Node> region;
+        std::shared_ptr<ov::Node> region;
         {
-            auto new_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{4}, std::vector<int64_t>{0, 3, 1, 2});
-            auto tr_input = std::make_shared<ngraph::op::Transpose>(input, new_axes);
+            auto new_axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 3, 1, 2});
+            auto tr_input = std::make_shared<ov::op::v1::Transpose>(input, new_axes);
 
             std::vector<float> anchors_vec(blobs[0].ptr<float>(), blobs[0].ptr<float>() + blobs[0].total());
             std::vector<int64_t> mask(anchors, 1);
-            region = std::make_shared<ngraph::op::RegionYolo>(tr_input, coords, classes, anchors, useSoftmax, mask, 1, 3, anchors_vec);
+            region = std::make_shared<ov::op::v0::RegionYolo>(tr_input, coords, classes, anchors, useSoftmax, mask, 1, 3, anchors_vec);
 
             auto tr_shape = tr_input->get_shape();
-            auto shape_as_inp = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                                                                       ngraph::Shape{tr_shape.size()},
+            auto shape_as_inp = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                                                                       ov::Shape{tr_shape.size()},
                                                                        std::vector<int64_t>(tr_shape.begin(), tr_shape.end()));
 
-            region = std::make_shared<ngraph::op::v1::Reshape>(region, shape_as_inp, true);
-            new_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{4}, std::vector<int64_t>{0, 2, 3, 1});
-            region = std::make_shared<ngraph::op::Transpose>(region, new_axes);
+            region = std::make_shared<ov::op::v1::Reshape>(region, shape_as_inp, true);
+            new_axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 2, 3, 1});
+            region = std::make_shared<ov::op::v1::Transpose>(region, new_axes);
 
-            region = std::make_shared<ngraph::op::v1::Reshape>(region, shape_node, true);
-            region = std::make_shared<ngraph::op::Transpose>(region, tr_axes);
+            region = std::make_shared<ov::op::v1::Reshape>(region, shape_node, true);
+            region = std::make_shared<ov::op::v1::Transpose>(region, tr_axes);
         }
 
-        auto strides = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{1, 1});
+        auto strides = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, std::vector<int64_t>{1, 1});
         std::vector<int64_t> boxes_shape{b, anchors, h, w};
-        auto shape_3d = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{boxes_shape.size()}, boxes_shape.data());
+        auto shape_3d = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{boxes_shape.size()}, boxes_shape.data());
 
-        ngraph::Shape box_broad_shape{1, (size_t)anchors, (size_t)h, (size_t)w};
-        auto scale_x_y_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &scale_x_y);
-        auto shift_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, std::vector<float>{0.5});
+        ov::Shape box_broad_shape{1, (size_t)anchors, (size_t)h, (size_t)w};
+        auto scale_x_y_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &scale_x_y);
+        auto shift_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, std::vector<float>{0.5});
 
-        auto axis = ngraph::op::Constant::create<int64_t>(ngraph::element::i64, ngraph::Shape{}, {0});
-        auto splits = ngraph::op::Constant::create<int64_t>(ngraph::element::i64, ngraph::Shape{5}, {1, 1, 1, 1, rows - 4});
-        auto split = std::make_shared<ngraph::op::v1::VariadicSplit>(input2d, axis, splits);
-        std::shared_ptr<ngraph::Node> box_x;
+        auto axis = ov::op::v0::Constant::create<int64_t>(ov::element::i64, ov::Shape{}, {0});
+        auto splits = ov::op::v0::Constant::create<int64_t>(ov::element::i64, ov::Shape{5}, {1, 1, 1, 1, rows - 4});
+        auto split = std::make_shared<ov::op::v1::VariadicSplit>(input2d, axis, splits);
+        std::shared_ptr<ov::Node> box_x;
         {
-            box_x = std::make_shared<ngraph::op::Sigmoid>(split->output(0));
-            box_x = std::make_shared<ngraph::op::v1::Subtract>(box_x, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
-            box_x = std::make_shared<ngraph::op::v1::Multiply>(box_x, scale_x_y_node, ngraph::op::AutoBroadcastType::NUMPY);
-            box_x = std::make_shared<ngraph::op::v1::Add>(box_x, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
-            box_x = std::make_shared<ngraph::op::v1::Reshape>(box_x, shape_3d, true);
+            box_x = std::make_shared<ov::op::v0::Sigmoid>(split->output(0));
+            box_x = std::make_shared<ov::op::v1::Subtract>(box_x, shift_node, ov::op::AutoBroadcastType::NUMPY);
+            box_x = std::make_shared<ov::op::v1::Multiply>(box_x, scale_x_y_node, ov::op::AutoBroadcastType::NUMPY);
+            box_x = std::make_shared<ov::op::v1::Add>(box_x, shift_node, ov::op::AutoBroadcastType::NUMPY);
+            box_x = std::make_shared<ov::op::v1::Reshape>(box_x, shape_3d, true);
 
             std::vector<float> x_indices(w * h * anchors);
             auto begin = x_indices.begin();
@@ -535,20 +533,20 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
             {
                 std::copy(begin, begin + w * anchors, begin + j * w * anchors);
             }
-            auto horiz = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, box_broad_shape, x_indices.data());
-            box_x = std::make_shared<ngraph::op::v1::Add>(box_x, horiz, ngraph::op::AutoBroadcastType::NUMPY);
+            auto horiz = std::make_shared<ov::op::v0::Constant>(ov::element::f32, box_broad_shape, x_indices.data());
+            box_x = std::make_shared<ov::op::v1::Add>(box_x, horiz, ov::op::AutoBroadcastType::NUMPY);
 
-            auto cols_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, std::vector<float>{float(w)});
-            box_x = std::make_shared<ngraph::op::v1::Divide>(box_x, cols_node, ngraph::op::AutoBroadcastType::NUMPY);
+            auto cols_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, std::vector<float>{float(w)});
+            box_x = std::make_shared<ov::op::v1::Divide>(box_x, cols_node, ov::op::AutoBroadcastType::NUMPY);
         }
 
-        std::shared_ptr<ngraph::Node> box_y;
+        std::shared_ptr<ov::Node> box_y;
         {
-            box_y = std::make_shared<ngraph::op::Sigmoid>(split->output(1));
-            box_y = std::make_shared<ngraph::op::v1::Subtract>(box_y, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
-            box_y = std::make_shared<ngraph::op::v1::Multiply>(box_y, scale_x_y_node, ngraph::op::AutoBroadcastType::NUMPY);
-            box_y = std::make_shared<ngraph::op::v1::Add>(box_y, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
-            box_y = std::make_shared<ngraph::op::v1::Reshape>(box_y, shape_3d, true);
+            box_y = std::make_shared<ov::op::v0::Sigmoid>(split->output(1));
+            box_y = std::make_shared<ov::op::v1::Subtract>(box_y, shift_node, ov::op::AutoBroadcastType::NUMPY);
+            box_y = std::make_shared<ov::op::v1::Multiply>(box_y, scale_x_y_node, ov::op::AutoBroadcastType::NUMPY);
+            box_y = std::make_shared<ov::op::v1::Add>(box_y, shift_node, ov::op::AutoBroadcastType::NUMPY);
+            box_y = std::make_shared<ov::op::v1::Reshape>(box_y, shape_3d, true);
 
             std::vector<float> y_indices(h * anchors);
             for (int i = 0; i < h; i++)
@@ -556,18 +554,18 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
                 std::fill(y_indices.begin() + i * anchors, y_indices.begin() + (i + 1) * anchors, i);
             }
 
-            auto vert = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1, (size_t)anchors, (size_t)h, 1}, y_indices.data());
-            box_y = std::make_shared<ngraph::op::v1::Add>(box_y, vert, ngraph::op::AutoBroadcastType::NUMPY);
-            auto rows_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, std::vector<float>{float(h)});
-            box_y = std::make_shared<ngraph::op::v1::Divide>(box_y, rows_node, ngraph::op::AutoBroadcastType::NUMPY);
+            auto vert = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, (size_t)anchors, (size_t)h, 1}, y_indices.data());
+            box_y = std::make_shared<ov::op::v1::Add>(box_y, vert, ov::op::AutoBroadcastType::NUMPY);
+            auto rows_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, std::vector<float>{float(h)});
+            box_y = std::make_shared<ov::op::v1::Divide>(box_y, rows_node, ov::op::AutoBroadcastType::NUMPY);
         }
 
-        std::shared_ptr<ngraph::Node> box_w, box_h;
+        std::shared_ptr<ov::Node> box_w, box_h;
         {
             int hNorm, wNorm;
             if (nodes.size() > 1)
             {
-                auto node_1_shape = nodes[1].dynamicCast<InfEngineNgraphNode>()->node->get_shape();
+                auto node_1_shape = nodes[1].dynamicCast<InfEngineNgraphNode>()->node.get_shape();
                 hNorm = node_1_shape[2];
                 wNorm = node_1_shape[3];
             }
@@ -597,53 +595,53 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
                 std::copy(bias_h.begin(), bias_h.begin() + h * anchors, bias_h.begin() + i * h * anchors);
             }
 
-            box_w = std::make_shared<ngraph::op::v0::Exp>(split->output(2));
-            box_w = std::make_shared<ngraph::op::v1::Reshape>(box_w, shape_3d, true);
-            auto anchor_w_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, box_broad_shape, bias_w.data());
-            box_w = std::make_shared<ngraph::op::v1::Multiply>(box_w, anchor_w_node, ngraph::op::AutoBroadcastType::NUMPY);
+            box_w = std::make_shared<ov::op::v0::Exp>(split->output(2));
+            box_w = std::make_shared<ov::op::v1::Reshape>(box_w, shape_3d, true);
+            auto anchor_w_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, box_broad_shape, bias_w.data());
+            box_w = std::make_shared<ov::op::v1::Multiply>(box_w, anchor_w_node, ov::op::AutoBroadcastType::NUMPY);
 
-            box_h = std::make_shared<ngraph::op::v0::Exp>(split->output(3));
-            box_h = std::make_shared<ngraph::op::v1::Reshape>(box_h, shape_3d, true);
-            auto anchor_h_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, box_broad_shape, bias_h.data());
-            box_h = std::make_shared<ngraph::op::v1::Multiply>(box_h, anchor_h_node, ngraph::op::AutoBroadcastType::NUMPY);
+            box_h = std::make_shared<ov::op::v0::Exp>(split->output(3));
+            box_h = std::make_shared<ov::op::v1::Reshape>(box_h, shape_3d, true);
+            auto anchor_h_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, box_broad_shape, bias_h.data());
+            box_h = std::make_shared<ov::op::v1::Multiply>(box_h, anchor_h_node, ov::op::AutoBroadcastType::NUMPY);
         }
 
-        auto region_splits = ngraph::op::Constant::create<int64_t>(ngraph::element::i64, ngraph::Shape{3}, {4, 1, rows - 5});
-        auto region_split = std::make_shared<ngraph::op::v1::VariadicSplit>(region, axis, region_splits);
+        auto region_splits = ov::op::v0::Constant::create<int64_t>(ov::element::i64, ov::Shape{3}, {4, 1, rows - 5});
+        auto region_split = std::make_shared<ov::op::v1::VariadicSplit>(region, axis, region_splits);
 
-        std::shared_ptr<ngraph::Node> scale;
+        std::shared_ptr<ov::Node> scale;
         {
             float thr = classfix == -1 ? 0.5 : 0;
-            auto thresh_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, std::vector<float>{thr});
-            auto mask = std::make_shared<ngraph::op::v1::Less>(region_split->output(1), thresh_node);
-            auto zero_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, mask->get_shape(), std::vector<float>(cols, 0));
-            scale = std::make_shared<ngraph::op::v1::Select>(mask, zero_node, region_split->output(1));
+            auto thresh_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, std::vector<float>{thr});
+            auto mask = std::make_shared<ov::op::v1::Less>(region_split->output(1), thresh_node);
+            auto zero_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, mask->get_shape(), std::vector<float>(cols, 0));
+            scale = std::make_shared<ov::op::v1::Select>(mask, zero_node, region_split->output(1));
         }
 
-        std::shared_ptr<ngraph::Node> probs;
+        std::shared_ptr<ov::Node> probs;
         {
-            probs = std::make_shared<ngraph::op::v1::Multiply>(region_split->output(2), scale, ngraph::op::AutoBroadcastType::NUMPY);
-            auto thresh_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &thresh);
-            auto mask = std::make_shared<ngraph::op::v1::Greater>(probs, thresh_node);
-            auto zero_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, mask->get_shape(), std::vector<float>((rows - 5) * cols, 0));
-            probs = std::make_shared<ngraph::op::v1::Select>(mask, probs, zero_node);
+            probs = std::make_shared<ov::op::v1::Multiply>(region_split->output(2), scale, ov::op::AutoBroadcastType::NUMPY);
+            auto thresh_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, &thresh);
+            auto mask = std::make_shared<ov::op::v1::Greater>(probs, thresh_node);
+            auto zero_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, mask->get_shape(), std::vector<float>((rows - 5) * cols, 0));
+            probs = std::make_shared<ov::op::v1::Select>(mask, probs, zero_node);
         }
 
 
-        auto concat_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{1, cols});
-        box_x = std::make_shared<ngraph::op::v1::Reshape>(box_x, concat_shape, true);
-        box_y = std::make_shared<ngraph::op::v1::Reshape>(box_y, concat_shape, true);
-        box_w = std::make_shared<ngraph::op::v1::Reshape>(box_w, concat_shape, true);
-        box_h = std::make_shared<ngraph::op::v1::Reshape>(box_h, concat_shape, true);
+        auto concat_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, std::vector<int64_t>{1, cols});
+        box_x = std::make_shared<ov::op::v1::Reshape>(box_x, concat_shape, true);
+        box_y = std::make_shared<ov::op::v1::Reshape>(box_y, concat_shape, true);
+        box_w = std::make_shared<ov::op::v1::Reshape>(box_w, concat_shape, true);
+        box_h = std::make_shared<ov::op::v1::Reshape>(box_h, concat_shape, true);
 
-        ngraph::NodeVector inp_nodes{box_x, box_y, box_w, box_h, scale, probs};
-        std::shared_ptr<ngraph::Node> result = std::make_shared<ngraph::op::Concat>(inp_nodes, 0);
-        result = std::make_shared<ngraph::op::Transpose>(result, tr_axes);
+        ov::NodeVector inp_nodes{box_x, box_y, box_w, box_h, scale, probs};
+        std::shared_ptr<ov::Node> result = std::make_shared<ov::op::v0::Concat>(inp_nodes, 0);
+        result = std::make_shared<ov::op::v1::Transpose>(result, tr_axes);
         if (b > 1)
         {
             std::vector<int64_t> sizes{b, static_cast<int64_t>(result->get_shape()[0]) / b, static_cast<int64_t>(result->get_shape()[1])};
-            auto shape_node = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{sizes.size()}, sizes.data());
-            result = std::make_shared<ngraph::op::v1::Reshape>(result, shape_node, true);
+            auto shape_node = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{sizes.size()}, sizes.data());
+            result = std::make_shared<ov::op::v1::Reshape>(result, shape_node, true);
         }
 
         return Ptr<BackendNode>(new InfEngineNgraphNode(result));
diff --git a/modules/dnn/src/layers/reorg_layer.cpp b/modules/dnn/src/layers/reorg_layer.cpp
index ac7d1abfb1f2..986bb64c829b 100644
--- a/modules/dnn/src/layers/reorg_layer.cpp
+++ b/modules/dnn/src/layers/reorg_layer.cpp
@@ -52,11 +52,7 @@
 #include "../op_inf_engine.hpp"
 #ifdef HAVE_DNN_NGRAPH
 #include "../ie_ngraph.hpp"
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-#include <ngraph/op/reorg_yolo.hpp>
-#else
-#include <ngraph/op/experimental/layers/reorg_yolo.hpp>
-#endif
+#include <openvino/op/reorg_yolo.hpp>
 #endif
 
 #include "../op_cuda.hpp"
@@ -184,7 +180,7 @@ class ReorgLayerImpl CV_FINAL : public ReorgLayer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -205,7 +201,7 @@ class ReorgLayerImpl CV_FINAL : public ReorgLayer
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        auto reorg = std::make_shared<ngraph::op::ReorgYolo>(ieInpNode, ngraph::Strides{(size_t)reorgStride});
+        auto reorg = std::make_shared<ov::op::v0::ReorgYolo>(ieInpNode, ov::Strides{(size_t)reorgStride});
         return Ptr<BackendNode>(new InfEngineNgraphNode(reorg));
     }
 #endif  // HAVE_DNN_NGRAPH
diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp
index a72236c47268..f259629e9625 100644
--- a/modules/dnn/src/layers/reshape_layer.cpp
+++ b/modules/dnn/src/layers/reshape_layer.cpp
@@ -369,9 +369,9 @@ class ReshapeLayerImpl CV_FINAL : public ReshapeLayer
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
 
         std::vector<int64_t> out(outShapes[0].begin(), outShapes[0].end());
-        auto shape   = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                       ngraph::Shape{out.size()}, out.data());
-        auto reshape = std::make_shared<ngraph::op::v1::Reshape>(ieInpNode, shape, true);
+        auto shape   = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                       ov::Shape{out.size()}, out.data());
+        auto reshape = std::make_shared<ov::op::v1::Reshape>(ieInpNode, shape, true);
         return Ptr<BackendNode>(new InfEngineNgraphNode(reshape));
     }
 #endif  // HAVE_DNN_NGRAPH
diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp
index 02ac29de8d6e..bf8f1b674c4d 100644
--- a/modules/dnn/src/layers/resize_layer.cpp
+++ b/modules/dnn/src/layers/resize_layer.cpp
@@ -13,11 +13,7 @@
 
 #ifdef HAVE_DNN_NGRAPH
 #include "../ie_ngraph.hpp"
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-#include <ngraph/op/interpolate.hpp>
-#else
-#include <ngraph/op/experimental/layers/interpolate.hpp>
-#endif
+#include <openvino/op/interpolate.hpp>
 #endif
 
 #ifdef HAVE_CUDA
@@ -115,7 +111,7 @@ class ResizeLayerImpl : public ResizeLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -376,75 +372,39 @@ class ResizeLayerImpl : public ResizeLayer
     {
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
 
-#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2021_2)
-        ngraph::op::InterpolateAttrs attrs;
-        attrs.pads_begin.push_back(0);
-        attrs.pads_end.push_back(0);
-        attrs.axes = ngraph::AxisSet{2, 3};
-        attrs.align_corners = alignCorners;
-
-        if (interpolation == "nearest") {
-            attrs.mode = "nearest";
-            attrs.antialias = false;
-        } else if (interpolation == "bilinear") {
-            attrs.mode = "linear";
-        } else {
-            CV_Error(Error::StsNotImplemented, "Unsupported interpolation: " + interpolation);
-        }
-
-        std::vector<int64_t> shape = {outHeight, outWidth};
-        auto out_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, shape.data());
-        auto interp = std::make_shared<ngraph::op::Interpolate>(ieInpNode, out_shape, attrs);
-#else
-        ngraph::op::v4::Interpolate::InterpolateAttrs attrs;
+        ov::op::v4::Interpolate::InterpolateAttrs attrs;
 
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
         if (interpolation == "nearest") {
-            attrs.mode = ngraph::op::v4::Interpolate::InterpolateMode::NEAREST;
-            attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::HALF_PIXEL;
+            attrs.mode = ov::op::v4::Interpolate::InterpolateMode::NEAREST;
+            attrs.coordinate_transformation_mode = ov::op::v4::Interpolate::CoordinateTransformMode::HALF_PIXEL;
         } else if (interpolation == "bilinear") {
-            attrs.mode = ngraph::op::v4::Interpolate::InterpolateMode::LINEAR_ONNX;
-            attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::ASYMMETRIC;
+            attrs.mode = ov::op::v4::Interpolate::InterpolateMode::LINEAR_ONNX;
+            attrs.coordinate_transformation_mode = ov::op::v4::Interpolate::CoordinateTransformMode::ASYMMETRIC;
         } else {
             CV_Error(Error::StsNotImplemented, format("Unsupported interpolation: %s", interpolation.c_str()));
         }
-        attrs.shape_calculation_mode = ngraph::op::v4::Interpolate::ShapeCalcMode::SIZES;
+        attrs.shape_calculation_mode = ov::op::v4::Interpolate::ShapeCalcMode::SIZES;
 
-        if (alignCorners) {
-            attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::ALIGN_CORNERS;
+        CV_Assert(!halfPixelCenters || !alignCorners);
+        if (halfPixelCenters) {
+            attrs.coordinate_transformation_mode = ov::op::v4::Interpolate::CoordinateTransformMode::HALF_PIXEL;
+        } else if (alignCorners) {
+            attrs.coordinate_transformation_mode = ov::op::v4::Interpolate::CoordinateTransformMode::ALIGN_CORNERS;
         }
 
-        attrs.nearest_mode = ngraph::op::v4::Interpolate::NearestMode::ROUND_PREFER_FLOOR;
-#else
-        if (interpolation == "nearest") {
-            attrs.mode = ngraph::op::v4::Interpolate::InterpolateMode::nearest;
-            attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::half_pixel;
-        } else if (interpolation == "bilinear") {
-            attrs.mode = ngraph::op::v4::Interpolate::InterpolateMode::linear_onnx;
-            attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::asymmetric;
-        } else {
-            CV_Error(Error::StsNotImplemented, format("Unsupported interpolation: %s", interpolation.c_str()));
-        }
-        attrs.shape_calculation_mode = ngraph::op::v4::Interpolate::ShapeCalcMode::sizes;
-
-        if (alignCorners) {
-            attrs.coordinate_transformation_mode = ngraph::op::v4::Interpolate::CoordinateTransformMode::align_corners;
-        }
+        attrs.nearest_mode = ov::op::v4::Interpolate::NearestMode::ROUND_PREFER_FLOOR;
 
-        attrs.nearest_mode = ngraph::op::v4::Interpolate::NearestMode::round_prefer_floor;
-#endif // OpenVINO >= 2022.1
 
         std::vector<int64_t> shape = {outHeight, outWidth};
-        auto out_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, shape.data());
+        auto out_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, shape.data());
 
-        auto& input_shape = ieInpNode->get_shape();
+        auto& input_shape = ieInpNode.get_shape();
         CV_Assert_N(input_shape[2] != 0, input_shape[3] != 0);
         std::vector<float> scales = {static_cast<float>(outHeight) / input_shape[2], static_cast<float>(outWidth) / input_shape[3]};
-        auto scales_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{2}, scales.data());
+        auto scales_shape = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2}, scales.data());
 
-        auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{2, 3});
-        auto interp = std::make_shared<ngraph::op::v4::Interpolate>(ieInpNode, out_shape, scales_shape, axes, attrs);
-#endif
+        auto axes = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, std::vector<int64_t>{2, 3});
+        auto interp = std::make_shared<ov::op::v4::Interpolate>(ieInpNode, out_shape, scales_shape, axes, attrs);
         return Ptr<BackendNode>(new InfEngineNgraphNode(interp));
     }
 #endif  // HAVE_DNN_NGRAPH
diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp
index 5338ab2215b6..4c2d585ce019 100644
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@@ -107,7 +107,7 @@ class ScaleLayerImpl CV_FINAL : public ScaleLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -331,44 +331,41 @@ class ScaleLayerImpl CV_FINAL : public ScaleLayer
     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto ieInpNode0 = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        auto ieInpNode1 = nodes.size() > 1 ? nodes[1].dynamicCast<InfEngineNgraphNode>()->node : nullptr;
+        ov::Output<ov::Node> ieInpNode1;
+        if (nodes.size() > 1)
+            ieInpNode1 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
 
         size_t numChannels = 1;
         if (blobs.empty())
-            for (const size_t& dim : ieInpNode1->get_shape())
+            for (const size_t& dim : ieInpNode1.get_shape())
                 numChannels *= dim;
         else
             numChannels = blobs[0].total();
 
-        std::vector<size_t> shape(ieInpNode0->get_shape().size(), 1);
+        std::vector<size_t> shape(ieInpNode0.get_shape().size(), 1);
         int cAxis = normalize_axis(axis, shape.size());
         shape[cAxis] = numChannels;
 
-        auto node = ieInpNode0;
+        std::shared_ptr<ov::Node> node;
         if (hasWeights)
         {
-            auto weight = blobs.empty() ? ieInpNode1 :
-                          std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), blobs[0].data);
-
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2021_2)
-            node = std::make_shared<ngraph::op::v1::Multiply>(node, weight, ngraph::op::AutoBroadcastType::NUMPY);
-#else
-            node = std::make_shared<ngraph::op::v0::Multiply>(node, weight, ngraph::op::AutoBroadcastType::NUMPY);
-#endif
+            ov::Output<ov::Node> weight = blobs.empty() ? ieInpNode1 :
+                          std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape(shape), blobs[0].data);
+            node = std::make_shared<ov::op::v1::Multiply>(ieInpNode0, weight, ov::op::AutoBroadcastType::NUMPY);
         }
         if (hasBias || !hasWeights)
         {
-            std::shared_ptr<ngraph::Node> bias;
+            ov::Output<ov::Node> bias;
             if (hasBias)
             {
                 bias = blobs.empty() ? ieInpNode1 :
-                       std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
-                                                              ngraph::Shape(shape), blobs.back().data);
+                       std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                                                              ov::Shape(shape), blobs.back().data);
             }
             else
-                bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
-                                                              ngraph::Shape(shape), std::vector<float>(numChannels, 0).data());
-            node = std::make_shared<ngraph::op::v1::Add>(node, bias, ngraph::op::AutoBroadcastType::NUMPY);
+                bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                                                              ov::Shape(shape), std::vector<float>(numChannels, 0).data());
+            node = std::make_shared<ov::op::v1::Add>(node, bias, ov::op::AutoBroadcastType::NUMPY);
         }
         return Ptr<BackendNode>(new InfEngineNgraphNode(node));
     }
diff --git a/modules/dnn/src/layers/scatterND_layer.cpp b/modules/dnn/src/layers/scatterND_layer.cpp
index 648d35fc0c69..e64cbfae3e06 100644
--- a/modules/dnn/src/layers/scatterND_layer.cpp
+++ b/modules/dnn/src/layers/scatterND_layer.cpp
@@ -3,6 +3,8 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include "../precomp.hpp"
+#include "../op_inf_engine.hpp"
+#include "../ie_ngraph.hpp"
 #include "layers_common.hpp"
 
 #include <algorithm> // for std::max & std::min
@@ -42,7 +44,8 @@ class ScatterNDLayerImpl CV_FINAL : public ScatterNDLayer
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV ||
+               (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && reduction == REDUCTION::NONE);
     }
 
     virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -74,6 +77,11 @@ class ScatterNDLayerImpl CV_FINAL : public ScatterNDLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
+        if (inputs_arr.depth() == CV_16F) {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
         std::vector<Mat> inputs, outputs;
         inputs_arr.getMatVector(inputs);
         outputs_arr.getMatVector(outputs);
@@ -89,49 +97,59 @@ class ScatterNDLayerImpl CV_FINAL : public ScatterNDLayer
     // NOTE: This impl does not check whether indices have duplicate entries.
     //       The last duplicate entry will overwrite the previous.
     template<typename T, typename Functor>
-    void forward_impl(const Functor& rd, const Mat& data, const Mat& indices, const Mat& updates, Mat& out)
-    {
-        data.copyTo(out);
-
-        const int* shape = data.size.p;
-        const size_t* step = data.step.p;
+    void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat& output_mat) {
+        input_mat.copyTo(output_mat);
 
-        const int ind_ndims = indices.dims;
-        const int* ind_shape = indices.size.p;
-        const T* p_indices = indices.ptr<const T>();
+        const auto &input_mat_shape = shape(input_mat);
+        std::vector<size_t> input_mat_step(input_mat_shape.size());
+        for (int i = 0; i < input_mat.dims; i++) {
+            input_mat_step[i] = static_cast<size_t>(input_mat.step.p[i] / sizeof(T));
+        }
 
-        const int upd_ndims = updates.dims;
-        const int* upd_shape = updates.size.p;
-        const T* p_updates = updates.ptr<const T>();
+        const int indices_mat_ndims = indices_mat.dims;
+        const auto &indices_mat_shape = shape(indices_mat);
 
-        T* p_out = out.ptr<T>();
+        const int updates_mat_ndims = updates_mat.dims;
+        const auto &updates_mat_shape = shape(updates_mat);
 
-        int k = ind_shape[ind_ndims - 1]; // last dim of indices
-        size_t total = (size_t)(indices.total() / k);
+        int indices_last_dim = indices_mat_shape[indices_mat_ndims - 1]; // last dim of indices
 
         size_t updates_size = 1;
-        for (int i = ind_ndims - 1; i < upd_ndims; i++)
-            updates_size *= upd_shape[i];
-
-        size_t inp_start_offset = 0;
-        size_t ind_start_offset = 0;
-        size_t upd_start_offset = 0;
-        for (size_t i = 0; i < total; i++, ind_start_offset += k, upd_start_offset += updates_size)
-        {
-            const T* tmp_p_indices = p_indices + ind_start_offset;
-            inp_start_offset = 0;
-            for (int j = 0; j < k; j++)
-            {
-                CV_Assert(tmp_p_indices[j] < shape[j] && tmp_p_indices[j] > -shape[j]);
-                inp_start_offset += (((int)tmp_p_indices[j] + shape[j]) % shape[j]) * step[j];
+        for (int i = indices_mat_ndims - 1; i < updates_mat_ndims; i++)
+            updates_size *= updates_mat_shape[i];
+
+        auto fn = [&](const Range &r) {
+            size_t input_offset = 0,
+                   indices_offset = r.start * indices_last_dim,
+                   updates_offset = r.start * updates_size;
+            for (int i = r.start; i < r.end; i++) {
+                const T* indices = indices_mat.ptr<const T>();
+                const T* updates = updates_mat.ptr<const T>();
+                T* output = output_mat.ptr<T>();
+
+                input_offset = 0;
+                indices += indices_offset;
+                for (int j = 0; j < indices_last_dim; j++) {
+                    int index = static_cast<int>(*(indices + j));
+                    index = (index + input_mat_shape[j]) % input_mat_shape[j];
+                    CV_Assert(index < input_mat_shape[j] && index >= 0);
+                    input_offset += index * input_mat_step[j];
+                }
+
+                updates += updates_offset;
+                output += input_offset;
+                for (int j = 0; j < updates_size; j++) {
+                    output[j] = reduce_operation(output[j], updates[j]);
+                }
+
+                indices_offset += indices_last_dim;
+                updates_offset += updates_size;
             }
-            inp_start_offset /= sizeof(T);
+        };
 
-            const T* tmp_p_updates = p_updates + upd_start_offset;
-            T* tmp_p_out = p_out + inp_start_offset;
-            for (int j = 0; j < updates_size; j++)
-                tmp_p_out[j] = rd(tmp_p_out[j], tmp_p_updates[j]);
-        }
+        size_t total = (size_t)(indices_mat.total() / indices_last_dim);
+        double nstripes = (size_t)total * (indices_last_dim + updates_size) * (1 / 1024.0);
+        parallel_for_(Range(0, total), fn, nstripes);
     }
 
     template<typename... Args>
@@ -192,6 +210,18 @@ class ScatterNDLayerImpl CV_FINAL : public ScatterNDLayer
                 CV_Error(Error::StsBadArg, "Unsupported reduction.");
         };
     }
+
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        auto scatterND = std::make_shared<ov::op::v3::ScatterNDUpdate>(
+            nodes[0].dynamicCast<InfEngineNgraphNode>()->node,
+            std::make_shared<ov::op::v0::Convert>(nodes[1].dynamicCast<InfEngineNgraphNode>()->node, ov::element::i32),
+            nodes[2].dynamicCast<InfEngineNgraphNode>()->node);
+        return Ptr<BackendNode>(new InfEngineNgraphNode(scatterND));
+    }
+#endif  // HAVE_DNN_NGRAPH
 };
 
 Ptr<ScatterNDLayer> ScatterNDLayer::create(const LayerParams& params)
diff --git a/modules/dnn/src/layers/scatter_layer.cpp b/modules/dnn/src/layers/scatter_layer.cpp
index 084eecb03c48..58d2c2daead7 100644
--- a/modules/dnn/src/layers/scatter_layer.cpp
+++ b/modules/dnn/src/layers/scatter_layer.cpp
@@ -3,6 +3,8 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include "../precomp.hpp"
+#include "../op_inf_engine.hpp"
+#include "../ie_ngraph.hpp"
 #include "layers_common.hpp"
 
 #include <algorithm> // for std::max & std::min
@@ -43,7 +45,8 @@ class ScatterLayerImpl CV_FINAL : public ScatterLayer
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV ||
+               (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && reduction == REDUCTION::NONE);
     }
 
     virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -68,6 +71,11 @@ class ScatterLayerImpl CV_FINAL : public ScatterLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
+        if (inputs_arr.depth() == CV_16F) {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
         std::vector<Mat> inputs, outputs;
         inputs_arr.getMatVector(inputs);
         outputs_arr.getMatVector(outputs);
@@ -81,59 +89,62 @@ class ScatterLayerImpl CV_FINAL : public ScatterLayer
     }
 
     template<typename T, typename Functor>
-    void forward_impl(const Functor& rd, const Mat& data, const Mat& indices, const Mat& updates, Mat& out)
-    {
-        data.copyTo(out);
+    void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat &output_mat) {
+        input_mat.copyTo(output_mat);
 
-        const int ndims = data.dims;
-        const int* shape = data.size.p;
-        const size_t* step = data.step.p;
+        const int ndims = input_mat.dims;
 
-        const int* ind_shape = indices.size.p;
-        const size_t* ind_step = indices.step.p;
+        const auto &input_mat_shape = shape(input_mat);
+        std::vector<size_t> input_mat_step(ndims);
 
-        size_t inp_offset = 0;
-        size_t ind_offset = 0;
-        const T* p_index = indices.ptr<const T>();
-        const T* p_update = updates.ptr<const T>();
-        T* p_out = out.ptr<T>();
+        const auto &indices_mat_shape = shape(indices_mat);
+        std::vector<size_t> indices_mat_step(ndims);
 
-        size_t total = indices.total();
+        for (int i = 0; i < ndims; i++) {
+            input_mat_step[i] = static_cast<size_t>(input_mat.step.p[i] / sizeof(T));
+            indices_mat_step[i] = static_cast<size_t>(indices_mat.step.p[i] / sizeof(T));
+        }
 
-        int j, offset_at_idx, index;
-        size_t t, idx;
-        for (size_t i = 0; i < total; i++)
-        {
-            t = i;
-            inp_offset = 0;
-            ind_offset = 0;
-            int offset_at_axis = 0;
-            for (j = ndims - 1; j >= 0; j--)
-            {
-                idx = t / ind_shape[j];
-                offset_at_idx = (int)(t - idx * ind_shape[j]);
-                ind_offset += offset_at_idx * ind_step[j];
-                inp_offset += offset_at_idx * step[j];
-                t = idx;
-                if (j == axis)
-                {
-                    offset_at_axis = offset_at_idx * step[j];
+        auto fn = [&](const Range &r) {
+            size_t input_offset = 0, indices_offset = 0;
+
+            int indices_index, index;
+            size_t axis_offset, tmp_index, j_index;
+            for (int i = r.start; i < r.end; i++) {
+                const T* indices = indices_mat.ptr<const T>();
+                const T* updates = updates_mat.ptr<const T>();
+                T* output = output_mat.ptr<T>();
+
+                input_offset = 0;
+                indices_offset = 0;
+                indices_index = i;
+                axis_offset = 0;
+                for (int j = ndims - 1; j >= 0; j--) {
+                    tmp_index = indices_index / indices_mat_shape[j];
+                    j_index = (size_t)(indices_index - tmp_index * indices_mat_shape[j]);
+                    input_offset += j_index * input_mat_step[j];
+                    indices_offset += j_index * indices_mat_step[j];
+                    indices_index = tmp_index;
+                    if (j == axis) {
+                        axis_offset = j_index * input_mat_step[j];
+                    }
                 }
-            }
-            ind_offset /= sizeof(T);
 
-            // get index and overwrite current indices
-            const T* tmp_p_index = p_index + ind_offset;
-            index = (int)(*tmp_p_index);
-            CV_Assert(index < shape[axis] && index > -shape[axis]);
+                // get index and overwrite current indices
+                index = static_cast<int>(*(indices + indices_offset));
+                index = (index + input_mat_shape[axis]) % input_mat_shape[axis];
+                CV_Assert(index < input_mat_shape[axis] && index >= 0);
+                input_offset = input_offset - axis_offset + index * input_mat_step[axis];
 
-            inp_offset = inp_offset - offset_at_axis + ((index + shape[axis]) % shape[axis]) * step[axis];
-            inp_offset /= sizeof(T);
+                updates += indices_offset;
+                output += input_offset;
+                *output = reduce_operation(*output, *updates);
+            }
+        };
 
-            const T* tmp_p_update = p_update + ind_offset;
-            T* tmp_p_out = p_out + inp_offset;
-            *tmp_p_out = rd(*tmp_p_out, *tmp_p_update);
-        }
+        size_t total = indices_mat.total();
+        double nstripes = (size_t)total * ndims * (1 / 1024.0);
+        parallel_for_(Range(0, total), fn, nstripes);
     }
 
     template<typename... Args>
@@ -195,6 +206,27 @@ class ScatterLayerImpl CV_FINAL : public ScatterLayer
         };
     }
 
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        int32_t indicesBoundValue = nodes[0].dynamicCast<InfEngineNgraphNode>()->node.get_shape()[axis];
+        auto indicesBound = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, &indicesBoundValue);
+        auto indices = std::make_shared<ov::op::v0::Convert>(nodes[1].dynamicCast<InfEngineNgraphNode>()->node, ov::element::i32);
+        auto indicesNonNegative = std::make_shared<ov::op::v1::Mod>(
+            std::make_shared<ov::op::v1::Add>(indices, indicesBound),
+            indicesBound);
+
+        auto axis_node = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, &axis);
+        auto scatterElements = std::make_shared<ov::op::v3::ScatterElementsUpdate>(
+            nodes[0].dynamicCast<InfEngineNgraphNode>()->node,
+            indicesNonNegative,
+            nodes[2].dynamicCast<InfEngineNgraphNode>()->node,
+            axis_node);
+        return Ptr<BackendNode>(new InfEngineNgraphNode(scatterElements));
+    }
+#endif  // HAVE_DNN_NGRAPH
+
 private:
     // Attributes
     int axis;
diff --git a/modules/dnn/src/layers/shuffle_channel_layer.cpp b/modules/dnn/src/layers/shuffle_channel_layer.cpp
index 2a698d270fa8..0d0ee2dfef5b 100644
--- a/modules/dnn/src/layers/shuffle_channel_layer.cpp
+++ b/modules/dnn/src/layers/shuffle_channel_layer.cpp
@@ -107,7 +107,7 @@ class ShuffleChannelLayerImpl CV_FINAL : public ShuffleChannelLayer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index d3675e23a536..de302ec29194 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -621,7 +621,7 @@ class SliceLayerImpl : public SliceLayer
             {
                 std::vector<int> inpIdx(dimsNum, 0);
                 std::vector<int> outIdx(dimsNum, 0);
-                if (inpMat.type() == CV_16S)
+                if (inpMat.type() == CV_16F)
                     getSliceRecursive<int16_t>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
                 else if (inpMat.type() == CV_8S)
                     getSliceRecursive<int8_t>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
@@ -759,7 +759,7 @@ class SliceLayerImpl : public SliceLayer
     {
         CV_Assert_N(nodes.size() <= 2);
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        CV_Assert(finalSliceRanges[0].size() == ieInpNode->get_shape().size());
+        CV_Assert(finalSliceRanges[0].size() == ieInpNode.get_shape().size());
 
         std::vector<int64_t> offsets, dims;
         for (int i = 0; i < finalSliceRanges[0].size(); ++i)
@@ -768,14 +768,14 @@ class SliceLayerImpl : public SliceLayer
             dims.push_back(finalSliceRanges[0][i].end);
         }
 
-        auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                                             ngraph::Shape{offsets.size()}, offsets.data());
-        auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                                             ngraph::Shape{dims.size()}, dims.data());
-        auto strides = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                                        ngraph::Shape{dims.size()}, std::vector<int64_t>((int64_t)dims.size(), 1));
+        auto lower_bounds = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                                             ov::Shape{offsets.size()}, offsets.data());
+        auto upper_bounds = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                                             ov::Shape{dims.size()}, dims.data());
+        auto strides = std::make_shared<ov::op::v0::Constant>(ov::element::i64,
+                                        ov::Shape{dims.size()}, std::vector<int64_t>((int64_t)dims.size(), 1));
 
-        auto slice = std::make_shared<ngraph::op::v1::StridedSlice>(ieInpNode,
+        auto slice = std::make_shared<ov::op::v1::StridedSlice>(ieInpNode,
                                       lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
 
         return Ptr<BackendNode>(new InfEngineNgraphNode(slice));
diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp
index b74f2b6791f8..239ad1574bb1 100644
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@@ -52,6 +52,7 @@
 #include <algorithm>
 #include <stdlib.h>
 #include <opencv2/core/utils/logger.hpp>
+#include "cpu_kernels/softmax.hpp"
 using std::max;
 
 #ifdef HAVE_OPENCL
@@ -75,7 +76,7 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer
 
     SoftMaxLayerImpl(const LayerParams& params)
     {
-        axisRaw = params.get<int>("axis", 1);
+        axisRaw = params.get<int>("axis", -1);
         logSoftMax = params.get<bool>("log_softmax", false);
         setParamsFrom(params);
     }
@@ -131,7 +132,7 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer
         std::vector<UMat> outputs;
         std::vector<UMat> internals;
 
-        bool use_half = (inputs_.depth() == CV_16S);
+        bool use_half = (inputs_.depth() == CV_16F);
         inputs_.getUMatVector(inputs);
         outputs_.getUMatVector(outputs);
         internals_.getUMatVector(internals);
@@ -216,7 +217,7 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer
         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -225,89 +226,15 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer
         std::vector<Mat> inputs, outputs, internals;
         inputs_arr.getMatVector(inputs);
         outputs_arr.getMatVector(outputs);
-        internals_arr.getMatVector(internals);
 
         const Mat &src = inputs[0];
         Mat &dst = outputs[0];
-
         int axis = normalize_axis(axisRaw, src.dims);
-        size_t outerSize = src.total(0, axis), channels = src.size[axis],
-                innerSize = src.total(axis + 1);
-
-        CV_Assert(src.type() == CV_32F);
-        CV_Assert(src.isContinuous() && dst.isContinuous());
-
-        const float *srcPtr = src.ptr<float>();
-        float *dstPtr = dst.ptr<float>();
-        float *bufPtr = internals[0].ptr<float>();
-
-        size_t outerStep = src.total(axis);
-        size_t cnStep = src.total(axis + 1);
-
-        //compute max along axis
-        for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
-        {
-            size_t srcOffset = outerDim * outerStep;
-            size_t bufOffset = outerDim * cnStep;
-
-            memcpy(bufPtr + bufOffset, srcPtr + srcOffset, innerSize * sizeof(float));
-
-            for (size_t cnDim = 1; cnDim < channels; cnDim++)
-            {
-                for (size_t i = 0; i < innerSize; i++)
-                    bufPtr[bufOffset + i] = std::max(bufPtr[bufOffset + i], srcPtr[srcOffset + cnDim * cnStep + i]);
-            }
-        }
-
-        //subtract max
-        for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
-        {
-            size_t srcOffset = outerDim * outerStep;
-            size_t bufOffset = outerDim * cnStep;
-
-            for (size_t cnDim = 0; cnDim < channels; cnDim++)
-            {
-                const int offset = srcOffset + cnDim * cnStep;
-                for (size_t i = 0; i < innerSize; i++)
-                    dstPtr[offset + i] = srcPtr[offset + i] - bufPtr[bufOffset + i];
-            }
-        }
-
-        cv::exp(dst, dst);
-
-        for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
-        {
-            size_t srcOffset = outerDim * outerStep;
-            size_t bufOffset = outerDim * cnStep;
-
-            //sum exp along axis
-            for (size_t i = 0; i < innerSize; i++)
-                bufPtr[bufOffset + i] = 0.f;
-
-            for (size_t cnDim = 0; cnDim < channels; cnDim++)
-            {
-                const int offset = srcOffset + cnDim * cnStep;
-                for (size_t i = 0; i < innerSize; i++)
-                    bufPtr[bufOffset + i] += dstPtr[offset + i];
-            }
 
-            //divide by computed sum
-            for (size_t cnDim = 0; cnDim < channels; cnDim++)
-            {
-                const int offset = srcOffset + cnDim * cnStep;
-                for (size_t i = 0; i < innerSize; i++)
-                    dstPtr[offset + i] /= bufPtr[bufOffset + i];
-            }
-            if (logSoftMax)
-            {
-                for (size_t cnDim = 0; cnDim < channels; cnDim++)
-                {
-                    const int offset = srcOffset + cnDim * cnStep;
-                    for (size_t i = 0; i < innerSize; i++)
-                        dstPtr[offset + i] = log(dstPtr[offset + i]);
-                }
-            }
-        }
+        if(logSoftMax)
+            logSoftmax(dst, src, axis);
+        else
+            softmax(dst, src, axis);
     }
 
 #ifdef HAVE_CUDA
@@ -385,12 +312,12 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        int axis = normalize_axis(axisRaw, ieInpNode->get_shape().size());
-        auto softmax = std::make_shared<ngraph::op::v1::Softmax>(ieInpNode, axis);
-        if (logSoftMax)
-            return Ptr<BackendNode>(new InfEngineNgraphNode(std::make_shared<ngraph::op::v0::Log>(softmax)));
-
-        return Ptr<BackendNode>(new InfEngineNgraphNode(softmax));
+        int axis = normalize_axis(axisRaw, ieInpNode.get_shape().size());
+        if (logSoftMax) {
+            return new InfEngineNgraphNode(std::make_shared<ov::op::v5::LogSoftmax>(ieInpNode, axis));
+        } else {
+            return new InfEngineNgraphNode(std::make_shared<ov::op::v1::Softmax>(ieInpNode, axis));
+        }
     }
 #endif  // HAVE_DNN_NGRAPH
 
diff --git a/modules/dnn/src/layers/tile_layer.cpp b/modules/dnn/src/layers/tile_layer.cpp
index abaf96bd4afb..1357b9e89e46 100644
--- a/modules/dnn/src/layers/tile_layer.cpp
+++ b/modules/dnn/src/layers/tile_layer.cpp
@@ -4,6 +4,8 @@
 
 #include "../precomp.hpp"
 #include "layers_common.hpp"
+#include "../op_inf_engine.hpp"
+#include "../ie_ngraph.hpp"
 
 #include <opencv2/dnn/shape_utils.hpp>
 
@@ -31,7 +33,8 @@ class TileLayerImpl CV_FINAL : public TileLayer
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
-        return backendId == DNN_BACKEND_OPENCV;
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;
     }
 
     virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -85,6 +88,16 @@ class TileLayerImpl CV_FINAL : public TileLayer
         tmp.copyTo(out);
     }
 
+#ifdef HAVE_DNN_NGRAPH
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        auto repeats_node = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{repeats.size()}, repeats.data());
+        auto tile = std::make_shared<ov::op::v0::Tile>(nodes[0].dynamicCast<InfEngineNgraphNode>()->node, repeats_node);
+        return Ptr<BackendNode>(new InfEngineNgraphNode(tile));
+    }
+#endif  // HAVE_DNN_NGRAPH
+
 private:
     std::vector<int> repeats;
 };
diff --git a/modules/dnn/src/legacy_backend.cpp b/modules/dnn/src/legacy_backend.cpp
index b092fb057c0a..12ccdf690490 100644
--- a/modules/dnn/src/legacy_backend.cpp
+++ b/modules/dnn/src/legacy_backend.cpp
@@ -24,7 +24,7 @@ BackendNode::BackendNode(int backendId)
     : backendId(backendId)
 {}
 
-BackendNode::~BackendNode() {};
+BackendNode::~BackendNode() {}
 
 BackendWrapper::BackendWrapper(int backendId, int targetId)
     : backendId(backendId)
diff --git a/modules/dnn/src/model.cpp b/modules/dnn/src/model.cpp
index 8d1a7889561f..6d045e494520 100644
--- a/modules/dnn/src/model.cpp
+++ b/modules/dnn/src/model.cpp
@@ -37,6 +37,7 @@ struct Model::Impl
 
     virtual void setPreferableBackend(Backend backendId) { net.setPreferableBackend(backendId); }
     virtual void setPreferableTarget(Target targetId) { net.setPreferableTarget(targetId); }
+    virtual void enableWinograd(bool useWinograd) { net.enableWinograd(useWinograd); }
 
     virtual
     void initNet(const Net& network)
@@ -89,6 +90,11 @@ struct Model::Impl
     {
         swapRB = swapRB_;
     }
+    /*virtual*/
+    void setOutputNames(const std::vector<String>& outNames_)
+    {
+        outNames = outNames_;
+    }
 
     /*virtual*/
     void processFrame(InputArray frame, OutputArrayOfArrays outs)
@@ -151,6 +157,7 @@ Model& Model::setPreferableBackend(Backend backendId)
     impl->setPreferableBackend(backendId);
     return *this;
 }
+
 Model& Model::setPreferableTarget(Target targetId)
 {
     CV_DbgAssert(impl);
@@ -158,6 +165,13 @@ Model& Model::setPreferableTarget(Target targetId)
     return *this;
 }
 
+Model& Model::enableWinograd(bool useWinograd)
+{
+    CV_DbgAssert(impl);
+    impl->enableWinograd(useWinograd);
+    return *this;
+}
+
 Model& Model::setInputSize(const Size& size)
 {
     CV_DbgAssert(impl);
@@ -195,6 +209,13 @@ Model& Model::setInputSwapRB(bool swapRB)
     return *this;
 }
 
+Model& Model::setOutputNames(const std::vector<String>& outNames)
+{
+    CV_DbgAssert(impl);
+    impl->setOutputNames(outNames);
+    return *this;
+}
+
 void Model::setInputParams(double scale, const Size& size, const Scalar& mean,
                            bool swapRB, bool crop)
 {
@@ -306,9 +327,9 @@ void ClassificationModel::classify(InputArray frame, int& classId, float& conf)
 }
 
 KeypointsModel::KeypointsModel(const String& model, const String& config)
-    : Model(model, config) {};
+    : Model(model, config) {}
 
-KeypointsModel::KeypointsModel(const Net& network) : Model(network) {};
+KeypointsModel::KeypointsModel(const Net& network) : Model(network) {}
 
 std::vector<Point2f> KeypointsModel::estimate(InputArray frame, float thresh)
 {
@@ -364,15 +385,17 @@ std::vector<Point2f> KeypointsModel::estimate(InputArray frame, float thresh)
 }
 
 SegmentationModel::SegmentationModel(const String& model, const String& config)
-    : Model(model, config) {};
+    : Model(model, config) {}
 
-SegmentationModel::SegmentationModel(const Net& network) : Model(network) {};
+SegmentationModel::SegmentationModel(const Net& network) : Model(network) {}
 
 void SegmentationModel::segment(InputArray frame, OutputArray mask)
 {
     std::vector<Mat> outs;
     impl->processFrame(frame, outs);
-    CV_Assert(outs.size() == 1);
+    // default output is the first one
+    if(outs.size() > 1)
+        outs.resize(1);
     Mat score = outs[0];
 
     const int chns = score.size[1];
diff --git a/modules/dnn/src/net.cpp b/modules/dnn/src/net.cpp
index 3b200a108e26..b4ed3570f881 100644
--- a/modules/dnn/src/net.cpp
+++ b/modules/dnn/src/net.cpp
@@ -13,6 +13,7 @@ CV__DNN_INLINE_NS_BEGIN
 Net::Net()
     : impl(makePtr<Net::Impl>())
 {
+    setPreferableBackend(DNN_BACKEND_DEFAULT);
 }
 
 Net::~Net()
@@ -215,6 +216,16 @@ void Net::dumpToFile(const String& path)
     file.close();
 }
 
+void Net::dumpToPbtxt(const String& path)
+{
+    CV_TRACE_FUNCTION();
+    CV_Assert(impl);
+    CV_Assert(!empty());
+    std::ofstream file(path.c_str());
+    file << impl->dumpToPbtxt(true);
+    file.close();
+}
+
 Ptr<Layer> Net::getLayer(int layerId) const
 {
     CV_Assert(impl);
diff --git a/modules/dnn/src/net_cann.cpp b/modules/dnn/src/net_cann.cpp
index a3eb52200f44..103c7c8dd275 100644
--- a/modules/dnn/src/net_cann.cpp
+++ b/modules/dnn/src/net_cann.cpp
@@ -304,9 +304,9 @@ std::shared_ptr<ge::ModelBufferData> compileCannGraph(std::shared_ptr<ge::Graph>
         bool ok;
         if ((child=fork()) == 0)
         {
-            // initialize engine
+            // initialize engine   Ascend310/Ascend310P3/Ascend910B/Ascend310B
             std::map<ge::AscendString, ge::AscendString> options = {
-                {ge::AscendString(ge::ir_option::SOC_VERSION), ge::AscendString("Ascend310")},
+                {ge::AscendString(ge::ir_option::SOC_VERSION), ge::AscendString(aclrtGetSocName())},
             };
             ACL_CHECK_GRAPH_RET(ge::aclgrphBuildInitialize(options));
 
diff --git a/modules/dnn/src/net_impl.cpp b/modules/dnn/src/net_impl.cpp
index c8341e4c6f7c..28db8c0566fa 100644
--- a/modules/dnn/src/net_impl.cpp
+++ b/modules/dnn/src/net_impl.cpp
@@ -514,7 +514,7 @@ void Net::Impl::allocateLayer(int lid, const LayersShapesMap& layersShapes)
     CV_Assert(layerShapesIt != layersShapes.end());
 
     if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_OPENCL_FP16 && ld.dtype == CV_32F)
-        ld.dtype = CV_16S;
+        ld.dtype = CV_16F;
 
     std::vector<LayerPin> pinsForInternalBlobs;
     blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
@@ -572,7 +572,7 @@ void Net::Impl::allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
             preferableTarget == DNN_TARGET_OPENCL_FP16 &&
             layers[0].dtype == CV_32F)
         {
-            layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);
+            layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16F);
         }
         inputShapes.push_back(shape(inp));
     }
@@ -656,20 +656,20 @@ void Net::Impl::forwardLayer(LayerData& ld)
                     {
                         UMat& u = umat_outputBlobs[i];
                         Mat m;
-                        if (u.depth() == CV_16S)  // FP16
-                            convertFp16(u, m);
+                        if (u.depth() == CV_16F)  // FP16
+                            u.convertTo(m, CV_32F);
                         else
                             m = u.getMat(ACCESS_READ);
                         if (!checkRange(m))
                         {
-                            std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
-                            std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
+                            CV_LOG_WARNING(NULL, "NaN detected in layer output: id=" << ld.id << " name=" << layer->name
+                                           << " output id=" << i << " output shape=" << shape(m));
                             fail = true;
                         }
                         else if (!checkRange(m, true, NULL, -1e6, 1e6))
                         {
-                            std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
-                            std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
+                            CV_LOG_WARNING(NULL, "Inf detected in layer output: id=" << ld.id << " name=" << layer->name
+                                           << " output id=" << i << " output shape=" << shape(m));
                             fail = true;
                         }
                     }
@@ -679,8 +679,8 @@ void Net::Impl::forwardLayer(LayerData& ld)
                         {
                             UMat& u = umat_inputBlobs[i];
                             Mat m;
-                            if (u.depth() == CV_16S)  // FP16
-                                convertFp16(u, m);
+                            if (u.depth() == CV_16F)  // FP16
+                                u.convertTo(m, CV_32F);
                             else
                                 m = u.getMat(ACCESS_READ);
                             std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
@@ -690,8 +690,8 @@ void Net::Impl::forwardLayer(LayerData& ld)
                         {
                             UMat& u = umat_outputBlobs[i];
                             Mat m;
-                            if (u.depth() == CV_16S)  // FP16
-                                convertFp16(u, m);
+                            if (u.depth() == CV_16F)  // FP16
+                                u.convertTo(m, CV_32F);
                             else
                                 m = u.getMat(ACCESS_READ);
                             std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
@@ -701,8 +701,8 @@ void Net::Impl::forwardLayer(LayerData& ld)
                         {
                             UMat& u = umat_internalBlobs[i];
                             Mat m;
-                            if (u.depth() == CV_16S)  // FP16
-                                convertFp16(u, m);
+                            if (u.depth() == CV_16F)  // FP16
+                                u.convertTo(m, CV_32F);
                             else
                                 m = u.getMat(ACCESS_READ);
                             std::cout << "INTERNAL " << i << " " << shape(m) << std::endl;
@@ -738,14 +738,14 @@ void Net::Impl::forwardLayer(LayerData& ld)
                         const Mat& m = ld.outputBlobs[i];
                         if (!checkRange(m))
                         {
-                            std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
-                            std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
+                            CV_LOG_WARNING(NULL, "NaN detected in layer output: "
+                                << cv::format("id=%d name=%s output id=%zu output shape=", ld.id, layer->name.c_str(), i) << shape(m));
                             fail = true;
                         }
                         else if (!checkRange(m, true, NULL, -1e6, 1e6))
                         {
-                            std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
-                            std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
+                            CV_LOG_WARNING(NULL, "Inf detected in layer output: "
+                                << cv::format("id=%d name=%s output id=%zu output shape=", ld.id, layer->name.c_str(), i) << shape(m));
                             fail = true;
                         }
                     }
@@ -918,7 +918,6 @@ AsyncArray Net::Impl::forwardAsync(const String& outputName)
     CV_Assert(!empty());
     FPDenormalsIgnoreHintScope fp_denormals_ignore_scope;
 
-#ifdef CV_CXX11
     String layerName = outputName;
 
     if (layerName.empty())
@@ -939,9 +938,6 @@ AsyncArray Net::Impl::forwardAsync(const String& outputName)
     isAsync = false;
 
     return getBlobAsync(layerName);
-#else
-    CV_Error(Error::StsNotImplemented, "DNN: Asynchronous forward requires build with enabled C++11");
-#endif  // CV_CXX11
 }
 
 
@@ -985,12 +981,12 @@ void Net::Impl::forward(OutputArrayOfArrays outputBlobs, const String& outputNam
                 ld.outputBlobsWrappers[i]->copyToHost();
             }
         }
-        if (ld.outputBlobs[0].depth() == CV_16S)
+        if (ld.outputBlobs[0].depth() == CV_16F)
         {
             std::vector<Mat>& outputvec = *(std::vector<Mat>*)outputBlobs.getObj();
             outputvec.resize(ld.outputBlobs.size());
             for (int i = 0; i < outputvec.size(); i++)
-                convertFp16(ld.outputBlobs[i], outputvec[i]);
+                ld.outputBlobs[i].convertTo(outputvec[i], CV_32F);
         }
         else
         {
@@ -1013,7 +1009,7 @@ void Net::Impl::forward(OutputArrayOfArrays outputBlobs, const String& outputNam
                 std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
                 outputvec.resize(out_vec.size());
                 for (int i = 0; i < out_vec.size(); i++)
-                    convertFp16(out_vec[i], outputvec[i]);
+                    out_vec[i].convertTo(outputvec[i], CV_32F);
             }
         }
         else
@@ -1279,7 +1275,7 @@ void Net::Impl::updateLayersShapes()
             preferableTarget == DNN_TARGET_OPENCL_FP16 &&
             inputLayerData.dtype == CV_32F)
         {
-            inp.create(inp.dims, inp.size, CV_16S);
+            inp.create(inp.dims, inp.size, CV_16F);
         }
         inputShapes.push_back(shape(inp));
     }
@@ -1348,10 +1344,10 @@ Mat Net::Impl::getBlob(const LayerPin& pin) const
         ld.outputBlobsWrappers[pin.oid]->copyToHost();
     }
 
-    if (ld.outputBlobs[pin.oid].depth() == CV_16S)
+    if (ld.outputBlobs[pin.oid].depth() == CV_16F)
     {
         Mat output_blob;
-        convertFp16(ld.outputBlobs[pin.oid], output_blob);
+        ld.outputBlobs[pin.oid].convertTo(output_blob, CV_32F);
         return output_blob;
     }
     else
@@ -1834,15 +1830,278 @@ string Net::Impl::dump(bool forceAllocation) const
     return out.str();
 }
 
+static void dumpTensorToString(std::ostringstream &out, const Mat &m, const int num_indent_spaces = 4) {
+    string indent_spaces(num_indent_spaces, ' ');
+
+    int type = 1;
+    /* Check TensorProto::DataType from https://github.com/onnx/onnx/blob/main/onnx/onnx.proto */
+    switch (m.type()) {
+        case CV_32F: break;
+        case CV_8U:  type = 2; break;
+        case CV_8S:  type = 3; break;
+        case CV_16U: type = 4; break;
+        case CV_16S: type = 5; break;
+        case CV_32S: type = 6; break;
+#if CV_VERSION_MAJOR > 4
+        case CV_64S: type = 7; break;
+        // STRING: 8
+        case CV_BOOL: type = 9; break;
+#endif
+        case CV_16F: type = 10; break;
+        case CV_64F: type = 11; break;
+#if CV_VERSION_MAJOR > 4
+        case CV_32U: type = 12; break;
+        case CV_64U: type = 13; break;
+        // COMPLEX64: 14
+        // COMPLEX128: 15
+        case CV_16BF: type = 16; break;
+#endif
+        default: CV_Error(Error::StsUnsupportedFormat, "Type of mat is not supported");
+    }
+    const auto &mshape = shape(m);
+
+    out << indent_spaces << "type {\n"
+        << indent_spaces << "  tensor_type {\n"
+        << indent_spaces << "    elem_type: " << type << "\n";
+    out << indent_spaces << "    shape {\n";
+    for (size_t i = 0; i < mshape.size(); i++) {
+        out << indent_spaces << format("      dim { dim_value: %d }\n", mshape[i]);
+    }
+    out << indent_spaces << "    }\n" // shape{}
+        << indent_spaces << "  }\n" // tensor_type{}
+        << indent_spaces << "}\n"; // type{}
+}
+
+
+static void dumpParamToString(std::ostringstream &out, const std::string &key, const DictValue &value, const int num_indent_spaces = 2) {
+    std::string indent_spaces(num_indent_spaces, ' ');
+
+    out << indent_spaces << "attribute {\n"
+        << indent_spaces << format("  name: \"%s\"\n", key.c_str());
+    if (value.size() == 1) {
+        if (value.isString()) {
+            out << indent_spaces << format("  type: STRING\n")
+                << indent_spaces << format("  s: \"%s\"\n", value.getStringValue(0).c_str());
+        } else if (value.isInt()) {
+            out << indent_spaces << format("  type: INT\n")
+                << indent_spaces << format("  i: %d\n", value.getIntValue(0));
+        } else if (value.isReal()) {
+            out << indent_spaces << format("  type: FLOAT\n")
+                << indent_spaces << format("  f: %f\n", value.getRealValue(0));
+        } else {
+            out << indent_spaces << format("  type: UNKNOWN-SCALAR\n");
+        }
+    } else {
+        if (value.isString()) {
+            out << indent_spaces << format("  type: STRINGS\n");
+        } else if (value.isInt()) {
+            out << indent_spaces << format("  type: INTS\n");
+        } else if (value.isReal()) {
+            out << indent_spaces << format("  type: FLOATS\n");
+        } else {
+            out << indent_spaces << format("  type: UNKNOWN-ARRAY\n");
+        }
+        for (int i = 0; i < value.size(); i++) {
+            if (value.isString()) {
+                out << indent_spaces << format("  strings: \"%s\"\n", value.getStringValue(i).c_str());
+            } else if (value.isInt()) {
+                out << indent_spaces << format("  ints: %d\n", value.getIntValue(i));
+            } else if (value.isReal()) {
+                out << indent_spaces << format("  floats: %f\n", value.getRealValue());
+            }
+        }
+    }
+    out << indent_spaces << "}\n"; // attribute{}
+}
+
+static void dumpLayerToString(std::ostringstream &out,
+                              const std::vector<std::string> &inputs,
+                              const std::vector<std::string> &outputs,
+                              const std::string &name,
+                              const std::string &op_type,
+                              const LayerParams &params,
+                              const std::string &backend_name,
+                              const std::string &target_name,
+                              const int num_indent_spaces = 2) {
+    std::string indent_spaces(num_indent_spaces, ' ');
+
+    for (size_t i = 0; i < inputs.size(); i++) {
+        out << indent_spaces << format("input: \"%s\"\n", inputs[i].c_str());
+    }
+    for (size_t i = 0; i < outputs.size(); i++) {
+        out << indent_spaces << format("output: \"%s\"\n", outputs[i].c_str());
+    }
+    if (!name.empty()) {
+        out << indent_spaces << format("name: \"%s\"\n", name.c_str());
+    }
+    if (!op_type.empty()) {
+        out << indent_spaces << format("op_type: \"%s\"\n", op_type.c_str());
+    }
+    if (!params.name.empty()) {
+        for (auto param_iter = params.begin(); param_iter != params.end(); param_iter++) {
+            auto key = param_iter->first;
+            auto value = param_iter->second;
+            dumpParamToString(out, key, value, num_indent_spaces);
+        }
+    }
+    if (!backend_name.empty()) {
+        DictValue dvb(backend_name);
+        dumpParamToString(out, "Backend", dvb, num_indent_spaces);
+    }
+    if (!target_name.empty()) {
+        DictValue dvt(target_name);
+        dumpParamToString(out, "Target", dvt, num_indent_spaces);
+    }
+}
+
+string Net::Impl::dumpToPbtxt(bool forceAllocation) const {
+    if (forceAllocation && !netWasAllocated) {
+        const_cast<Net::Impl*>(this)->setUpNet();
+    }
+
+    std::ostringstream out;
+    const std::map<int, LayerData> &map = layers;
+    std::map<String, Mat*> value_info;
+
+    Backend prefBackend = (Backend)preferableBackend;
+    Target prefTarget = (Target)preferableTarget;
+
+    auto GetBackendName = [] (int backendId) {
+        std::string backend = "Unknown";
+        switch (backendId) {
+            case DNN_BACKEND_DEFAULT:   backend = "DEFAULT"; break;
+            #if CV_VERSION_MAJOR <= 4
+            case DNN_BACKEND_HALIDE:    backend = "HALIDE"; break;
+            #endif
+            case DNN_BACKEND_INFERENCE_ENGINE:  // fallthru
+            case DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019:  // fallthru
+            case DNN_BACKEND_INFERENCE_ENGINE_NGRAPH: backend = "OpenVINO"; break;
+            case DNN_BACKEND_OPENCV:    backend = "OCV"; break;
+            case DNN_BACKEND_VKCOM:     backend = "VULKAN"; break;
+            case DNN_BACKEND_CUDA:      backend = "CUDA"; break;
+            case DNN_BACKEND_WEBNN:     backend = "WEBNN"; break;
+            case DNN_BACKEND_TIMVX:     backend = "TIMVX"; break;
+            case DNN_BACKEND_CANN:      backend = "CANN"; break;
+        }
+        return backend;
+    };
+    auto GetTargetName = [] (int targetId) {
+        std::string target = "Unknown";
+        switch (targetId) {
+            case DNN_TARGET_CPU:         target = "CPU"; break;
+            case DNN_TARGET_OPENCL:      target = "OCL"; break;
+            case DNN_TARGET_OPENCL_FP16: target = "OCL_FP16"; break;
+            case DNN_TARGET_MYRIAD:      target = "MYRIAD"; break;
+            case DNN_TARGET_VULKAN:      target = "VULKAN"; break;
+            case DNN_TARGET_FPGA:        target = "FPGA"; break;
+            case DNN_TARGET_CUDA:        target = "CUDA"; break;
+            case DNN_TARGET_CUDA_FP16:   target = "CUDA_FP16"; break;
+            case DNN_TARGET_HDDL:        target = "HDDL"; break;
+            case DNN_TARGET_NPU:         target = "NPU"; break;
+            case DNN_TARGET_CPU_FP16:    target = "CPU_FP16"; break;
+        }
+        return target;
+    };
+
+    const int num_indent_spaces = 2;
+    std::string indent_spaces(num_indent_spaces, ' ');
+    out << "producer_name: \"opencv dnn\"\n"
+        << "producer_version: \"" << getVersionString() << "\"\n"
+        << "graph {\n";
+    // Add nodes, inputs and outputs
+    for (std::map<int, LayerData>::const_iterator iter = map.begin(); iter != map.end(); iter++) {
+        auto &ld = iter->second;
+        if (ld.id == 0) {
+            for (int i = 0; i < ld.outputBlobs.size(); i++) {
+                const auto &name = netInputLayer->outNames.empty() ? cv::format("%s_%d", ld.name.c_str(), i) : netInputLayer->outNames[i];
+                out << indent_spaces << "input {\n"
+                    << indent_spaces << format("  name: \"%s\"\n", name.c_str());
+                // Add shape
+                if (!ld.outputBlobs.empty()) {
+                    dumpTensorToString(out, ld.outputBlobs[i], num_indent_spaces + 2);
+                }
+                out << indent_spaces << "}\n"; // input{}
+            }
+        } else if (ld.consumers.size() == 0) {
+            out << indent_spaces << "output {\n"
+                << indent_spaces << format("  name: \"%s\"\n", ld.name.c_str());
+            // Add shape
+            if (!ld.outputBlobs.empty()) {
+                dumpTensorToString(out, ld.outputBlobs.front(), num_indent_spaces + 2);
+            }
+            out << indent_spaces << "}\n"; // output{}
+        } else {
+            out << indent_spaces << "node {\n";
+            const auto &name = ld.name;
+            const auto &op_type = "cv::dnn::" + ld.type;
+            std::vector<std::string> inputs, outputs;
+            // Collect names of inputs
+            for (size_t i = 0; i < ld.inputBlobsId.size(); i++) {
+                int lid = ld.inputBlobsId[i].lid;
+                int oid = ld.inputBlobsId[i].oid;
+                std::string name;
+                if (lid == 0) {
+                    name = netInputLayer->outNames.empty() ? cv::format("%s_%d", ld.name.c_str(), oid) : netInputLayer->outNames[oid];
+                } else {
+                    name = format("%s_output%d", map.find(lid)->second.name.c_str(), oid);
+                    if (!ld.inputBlobs.empty()) {
+                        value_info.insert({name, ld.inputBlobs[i]});
+                    }
+                }
+                inputs.push_back(name);
+            }
+            // Collect names of outputs
+            for (size_t i = 0; i < ld.consumers.size(); i++) {
+                int lid = ld.consumers[i].lid;
+                const auto &layer_output_layer = map.find(lid)->second;
+                std::string name;
+                if (layer_output_layer.consumers.size() == 0) {
+                    name = layer_output_layer.name;
+                } else {
+                    name = format("%s_output%zu", ld.name.c_str(), i);
+                }
+                outputs.push_back(name);
+            }
+            const auto &params = ld.params;
+            // Collect backend and target
+            const Backend backend = ld.backendNodes.find(prefBackend) == ld.backendNodes.end() ? DNN_BACKEND_OPENCV : prefBackend;
+            const std::string backend_name = GetBackendName(backend);
+            const Target target = ld.layerInstance.empty() ? DNN_TARGET_CPU : (Target)(ld.layerInstance->preferableTarget);
+            const std::string target_name = GetTargetName(target);
+            dumpLayerToString(out, inputs, outputs, name, op_type, params, backend_name, target_name, num_indent_spaces + 2);
+            out << indent_spaces << "}\n"; // node{}
+        }
+    }
+    // Add value_info
+    for (std::map<String, Mat*>::const_iterator iter = value_info.begin(); iter != value_info.end(); iter++) {
+        out << indent_spaces << "value_info {\n"
+            << indent_spaces << format("  name: \"%s\"\n", iter->first.c_str());
+        dumpTensorToString(out, *(iter->second), num_indent_spaces + 2);
+        out << indent_spaces << "}\n"; // value_info{}
+    }
+    out << "}\n"; // graph{}
+
+    // Add preferable backend and target as metadata
+    out << "metadata_props {\n";
+    out << indent_spaces << format("  key: \"%s\"", "Preferable Backend")
+        << indent_spaces << format("  value: \"%s\"", GetBackendName(prefBackend).c_str());
+    out << "}\n"; // metadata_props{}
+    out << "metadata_props {\n";
+    out << indent_spaces << format("  key: \"%s\"", "Preferable Target")
+        << indent_spaces << format("  value: \"%s\"", GetTargetName(prefTarget).c_str());
+    out << "}\n"; // metadata_props{}
+
+    return out.str();
+}
 
 void Net::Impl::dumpNetworkToFile() const
 {
 #ifndef OPENCV_DNN_DISABLE_NETWORK_AUTO_DUMP
     string dumpFileNameBase = getDumpFileNameBase();
-    string dumpFileName = dumpFileNameBase + ".dot";
+    string dumpFileName = dumpFileNameBase + ".pbtxt";
     try
     {
-        string dumpStr = dump();
+        string dumpStr = dumpToPbtxt();
         std::ofstream out(dumpFileName.c_str(), std::ios::out | std::ios::binary);
         out << dumpStr;
     }
diff --git a/modules/dnn/src/net_impl.hpp b/modules/dnn/src/net_impl.hpp
index d935655c4aee..ba907e14b788 100644
--- a/modules/dnn/src/net_impl.hpp
+++ b/modules/dnn/src/net_impl.hpp
@@ -273,13 +273,12 @@ struct Net::Impl : public detail::NetImplBase
 
     Mat getBlob(String outputName) const;
 
-#ifdef CV_CXX11
     virtual AsyncArray getBlobAsync(const LayerPin& pin);
 
     AsyncArray getBlobAsync(String outputName);
-#endif  // CV_CXX11
 
     string dump(bool forceAllocation = false) const;
+    string dumpToPbtxt(bool forceAllocation = false) const;
 
     void dumpNetworkToFile() const;
 
diff --git a/modules/dnn/src/net_impl_backend.cpp b/modules/dnn/src/net_impl_backend.cpp
index d29b6934a280..b53908f8ec9e 100644
--- a/modules/dnn/src/net_impl_backend.cpp
+++ b/modules/dnn/src/net_impl_backend.cpp
@@ -170,11 +170,19 @@ void Net::Impl::setPreferableBackend(Net& net, int backendId)
     if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
         backendId = DNN_BACKEND_INFERENCE_ENGINE_NGRAPH;  // = getInferenceEngineBackendTypeParam();
 
-    if (netWasQuantized && backendId != DNN_BACKEND_OPENCV && backendId != DNN_BACKEND_TIMVX)
+    if (netWasQuantized && backendId != DNN_BACKEND_OPENCV && backendId != DNN_BACKEND_TIMVX &&
+        backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
     {
-        CV_LOG_WARNING(NULL, "DNN: Only default and TIMVX backends support quantized networks");
+        CV_LOG_WARNING(NULL, "DNN: Only default, TIMVX and OpenVINO backends support quantized networks");
         backendId = DNN_BACKEND_OPENCV;
     }
+#ifdef HAVE_DNN_NGRAPH
+    if (netWasQuantized && backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2023_0))
+    {
+        CV_LOG_WARNING(NULL, "DNN: OpenVINO 2023.0 and higher is required to supports quantized networks");
+        backendId = DNN_BACKEND_OPENCV;
+    }
+#endif
 
     if (preferableBackend != backendId)
     {
@@ -243,6 +251,14 @@ void Net::Impl::setPreferableTarget(int targetId)
 #endif
 
         clear();
+
+        if (targetId == DNN_TARGET_CPU_FP16)
+        {
+            if (useWinograd) {
+                CV_LOG_INFO(NULL, "DNN: DNN_TARGET_CPU_FP16 is set => Winograd convolution is disabled by default to preserve accuracy. If needed, enable it explicitly using enableWinograd(true).");
+                enableWinograd(false);
+            }
+        }
     }
 }
 
diff --git a/modules/dnn/src/net_impl_fuse.cpp b/modules/dnn/src/net_impl_fuse.cpp
index 4570d2b36082..b81bf14accf0 100644
--- a/modules/dnn/src/net_impl_fuse.cpp
+++ b/modules/dnn/src/net_impl_fuse.cpp
@@ -210,7 +210,7 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
                 if (!nextData->params.has("operation") || toLowerCase(nextData->params.get<String>("operation")) != "add")
                 {
                     CV_LOG_DEBUG(NULL, "DNN/CPU: fusion with NaryEltwise or Eltwise Layer operation is not supported: "
-                        << nextData->params.get<String>("operation"));
+                        << toLowerCase(nextData->params.get<String>("operation", "sum")));
                     break;
                 }
 
@@ -728,6 +728,10 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
                     if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
                         break;
 #ifdef HAVE_CUDA
+                    /* Risk: Not every operation in "NaryEltwise" is supported in the CUDA backend. There is a chance
+                             that Concat's output is filled with data in both host and device, leading to data missing.
+                             See https://github.com/opencv/opencv/issues/24721 for more details.
+                    */
                     if (preferableBackend == DNN_BACKEND_CUDA &&
                         (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
                          (inp_i_data->layerInstance->type != "Convolution" &&
diff --git a/modules/dnn/src/net_openvino.cpp b/modules/dnn/src/net_openvino.cpp
index 5704cb9b6495..501a596e5d0a 100644
--- a/modules/dnn/src/net_openvino.cpp
+++ b/modules/dnn/src/net_openvino.cpp
@@ -9,6 +9,12 @@
 #include <opencv2/core/utils/configuration.private.hpp>
 #include <opencv2/core/utils/logger.hpp>
 
+#include "op_inf_engine.hpp"
+
+#ifdef HAVE_INF_ENGINE
+#include <openvino/op/util/op_types.hpp>
+#endif
+
 #include "net_impl.hpp"
 
 #include "backend.hpp"
@@ -48,7 +54,6 @@ class NetImplOpenVINO CV_FINAL : public Net::Impl
         CV_Assert(basePtr_);
         Net::Impl& base = *basePtr_;
         CV_Assert(!base.netWasAllocated);
-        CV_Assert(!base.netWasQuantized);
         netInputLayer = base.netInputLayer;
         blobsToKeep = base.blobsToKeep;
         layers = base.layers;
@@ -147,7 +152,7 @@ class NetImplOpenVINO CV_FINAL : public Net::Impl
     //string dump(bool forceAllocation = false) const override;
 
     static
-    Net createNetworkFromModelOptimizer(InferenceEngine::CNNNetwork& ieNet);
+    Net createNetworkFromModelOptimizer(std::shared_ptr<ov::Model>& ieNet);
 
 };  // NetImplOpenVINO
 
@@ -252,7 +257,7 @@ void NetImplOpenVINO::addNgraphOutputs(LayerData& ld)
             CV_Assert(!ieInpNode->net.empty());
             if (layerNet != ieInpNode->net)
             {
-                CV_LOG_DEBUG(NULL, "DNN/IE: pin output between subnets: " << ieInpNode->node->get_friendly_name());
+                CV_LOG_DEBUG(NULL, "DNN/IE: pin output between subnets: " << ieInpNode->node.get_node()->get_friendly_name());
                 ieInpNode->net->addOutput(ieInpNode);
             }
         }
@@ -321,9 +326,6 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
         return;
     }
 
-    bool supportsCPUFallback = !isArmComputePlugin() && (preferableTarget == DNN_TARGET_CPU ||
-                               openvino::checkTarget(DNN_TARGET_CPU));
-
     // Build Inference Engine networks from sets of layers that support this
     // backend. Split a whole model on several Inference Engine networks if
     // some of layers are not implemented.
@@ -341,55 +343,8 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
 
         bool fused = ld.skip;
         Ptr<Layer> layer = ld.layerInstance;
-        if (!fused && !layer->supportBackend(preferableBackend))
-        {
-            CV_LOG_DEBUG(NULL, "DNN/IE:    NOT supported!");
-            bool customizable = ld.id != 0 && supportsCPUFallback;
-
-            // TODO: there is a bug in Myriad plugin with custom layers shape infer.
-            if (preferableTarget == DNN_TARGET_MYRIAD || preferableTarget == DNN_TARGET_HDDL)
-            {
-                for (int i = 0; customizable && i < ld.inputBlobs.size(); ++i)
-                {
-                    customizable = ld.inputBlobs[i]->size[0] == 1;
-                }
-            }
-
-            // TODO: fix these workarounds
-            if (preferableTarget == DNN_TARGET_MYRIAD ||
-                preferableTarget == DNN_TARGET_HDDL ||
-                preferableTarget == DNN_TARGET_OPENCL ||
-                preferableTarget == DNN_TARGET_OPENCL_FP16)
-                customizable &= ld.type != "Concat";
-
-            if (preferableTarget == DNN_TARGET_OPENCL ||
-                preferableTarget == DNN_TARGET_OPENCL_FP16)
-                customizable &= ld.type != "Power";
-
-            if (preferableTarget == DNN_TARGET_OPENCL)
-                customizable &= ld.type != "Eltwise";
-
-            if (!customizable)
-            {
-                CV_LOG_DEBUG(NULL, "DNN/IE:    NOT customizable!");
-                addNgraphOutputs(ld);
-                net = Ptr<InfEngineNgraphNet>();
-                layer->preferableTarget = DNN_TARGET_CPU;
-
-                for (int i = 0; i < ld.inputBlobsId.size(); ++i)
-                {
-                    LayerData& inpLd = layers[ld.inputBlobsId[i].lid];
-                    Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
-                    if (!inpNode.empty())
-                    {
-                        Ptr<InfEngineNgraphNode> ieNode = inpNode.dynamicCast<InfEngineNgraphNode>();
-                        CV_Assert(!ieNode.empty());
-                        ieNode->net->addOutput(ieNode);
-                    }
-                }
-                continue;
-            }
-        }
+        if (ld.id == 0)
+            continue;
         ld.skip = true;  // Initially skip all Inference Engine supported layers.
 
         // Create a new network if one of inputs from different Inference Engine graph.
@@ -476,36 +431,23 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
             {
                 int lid = ld.inputBlobsId[i].lid;
                 int oid = ld.inputBlobsId[i].oid;
-                if (oid == 0 || lid == 0)
-                    continue;
 
                 auto ieInpNode = inputNodes[i].dynamicCast<InfEngineNgraphNode>();
-                const auto& ngraph_input_node = ieInpNode->node;
+                const auto& ngraph_input_node = ieInpNode->node.get_node_shared_ptr();
                 CV_LOG_DEBUG(NULL, "DNN/IE: bind output port " << lid << ":" << oid << " (" << ngraph_input_node->get_friendly_name() << ":" << ngraph_input_node->get_type_info().name << ")");
 
-                // Handle parameters from other subnets. Output port is not used in this case
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-                if ((ngraph::op::is_parameter(ngraph_input_node) || ngraph::op::is_constant(ngraph_input_node)) &&
-#else
-                if ((ngraph_input_node->is_parameter() || ngraph_input_node->is_constant()) &&
-#endif
+                if ((oid == 0 && ngraph_input_node->get_output_size() == 1) || lid == 0)
+                    continue;
 
-                        ngraph_input_node->get_output_size() == 1)
+                // Handle parameters from other subnets. Output port is not used in this case
+                if ((ov::op::util::is_parameter(ngraph_input_node) || ov::op::util::is_constant(ngraph_input_node)) &&
+                    ngraph_input_node->get_output_size() == 1)
                 {
                     inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(ngraph_input_node));
                     continue;
                 }
                 CV_CheckLT((size_t)oid, ngraph_input_node->get_output_size(), "");
-#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-                // FIXIT refactor ".initNgraph()" API to use Output<Node>
-                // WA: use Concat to emulate Identity operation with requested output port
-                auto oid_node = std::make_shared<ngraph::op::Concat>(ngraph::OutputVector { ngraph_input_node->output(oid) }, 0);
-                inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(oid_node));
-#elif INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_3)
-                inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(ieInpNode->node->get_output_as_single_output_node(oid)));
-#else
-                inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(ieInpNode->node->get_output_as_single_output_node(oid, false)));
-#endif
+                inputNodes[i] = new InfEngineNgraphNode(ngraph_input_node->output(oid));
             }
 
             if (layer->supportBackend(preferableBackend))
@@ -549,13 +491,42 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
                 break;
             }
         }
-        ieNode->net->setNodePtr(&ieNode->node);
 
         net->addBlobs(ld.inputBlobsWrappers);
         net->addBlobs(ld.outputBlobsWrappers);
         addNgraphOutputs(ld);
     }
 
+    // User may choose to return only intermediate blobs but not network's result (see Test_TFLite.max_unpooling)
+    // Such layers should not be skipped when forwardLayer is called.
+    // Also, perform a sanity check that there is no double inferred networks (a single skip=false per unique net instance)
+    std::set<Ptr<InfEngineNgraphNet>> uniqueNets;
+    if (!blobsToKeep_.empty())
+    {
+        LayerPin latestLayerPin = getLatestLayerPin(blobsToKeep_);
+        for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it)
+        {
+            LayerData& ld = it->second;
+            auto iter = ld.backendNodes.find(preferableBackend);
+            if (iter == ld.backendNodes.end())
+                continue;
+
+            Ptr<BackendNode>& node = iter->second;
+            if (node.empty())
+                continue;
+
+            Ptr<InfEngineNgraphNode> ieNode = node.dynamicCast<InfEngineNgraphNode>();
+            if (ieNode.empty())
+                continue;
+
+            if (ld.id == latestLayerPin.lid) {
+                ld.skip = false;
+                uniqueNets.insert(ieNode->net);
+                break;
+            }
+        }
+    }
+
     // Initialize all networks.
     for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
     {
@@ -578,9 +549,13 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
         {
             ieNode->net->addOutput(ieNode);
             ieNode->net->createNet((Target)preferableTarget);
-            ld.skip = false;
+            if (uniqueNets.find(ieNode->net) == uniqueNets.end()) {
+                ld.skip = false;
+                uniqueNets.insert(ieNode->net);
+            }
         }
     }
+    CV_Assert(uniqueNets.size() == 1);
 }
 
 
@@ -680,18 +655,15 @@ void switchToOpenVINOBackend(Net& net)
 
 
 /*static*/
-Net NetImplOpenVINO::createNetworkFromModelOptimizer(InferenceEngine::CNNNetwork& ieNet)
+Net NetImplOpenVINO::createNetworkFromModelOptimizer(std::shared_ptr<ov::Model>& ieNet)
 {
     CV_TRACE_FUNCTION();
 
     CV_TRACE_REGION("register_inputs");
 
-    auto ngraphFunction = ieNet.getFunction();
-    CV_Assert(ngraphFunction);
-
     std::vector<String> inputsNames;
     std::vector<MatShape> inp_shapes;
-    for (auto& it : ngraphFunction->get_parameters())
+    for (auto& it : ieNet->get_parameters())
     {
         inputsNames.push_back(it->get_friendly_name());
         std::vector<size_t> dims = it->get_shape();
@@ -700,16 +672,9 @@ Net NetImplOpenVINO::createNetworkFromModelOptimizer(InferenceEngine::CNNNetwork
     // nGraph models produce output "Result" layers which have "/sink_port" suffix in their names.
     // Their inputs are actual model outputs and we change friendly name to it.
     // By this workaround, we produce similar outputs names comparing to ieNet.getOutputsInfo()
-    for (int i = 0; i < ngraphFunction->get_output_size(); ++i) {
-        auto res = ngraphFunction->output(i);
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
+    for (int i = 0; i < ieNet->get_output_size(); ++i) {
+        auto res = ieNet->output(i);
         const std::string& name = res.get_any_name();
-#else
-        auto out = res.get_node()->input(0).get_source_output();
-        std::string name = out.get_node()->get_friendly_name();
-        if (out.get_node()->get_output_size() > 1)
-            name += "." + std::to_string(out.get_index());
-#endif
         if (res.get_node()->get_friendly_name() != name)
             res.get_node()->set_friendly_name(name);
     }
@@ -731,7 +696,7 @@ Net NetImplOpenVINO::createNetworkFromModelOptimizer(InferenceEngine::CNNNetwork
 
     Ptr<BackendNode> backendNode;
     {
-        auto fake_node = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape {});
+        auto fake_node = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape {});
         Ptr<InfEngineNgraphNode> backendNodeNGraph(new InfEngineNgraphNode(fake_node));
         backendNodeNGraph->net = Ptr<InfEngineNgraphNet>(new InfEngineNgraphNet(openvino_impl, ieNet));
         backendNode = backendNodeNGraph;
@@ -739,9 +704,9 @@ Net NetImplOpenVINO::createNetworkFromModelOptimizer(InferenceEngine::CNNNetwork
 
     CV_TRACE_REGION_NEXT("register_outputs");
 
-    std::vector<std::shared_ptr<ngraph::Node>> ngraphOperations = ngraphFunction->get_ops();
+    std::vector<std::shared_ptr<ov::Node>> ngraphOperations = ieNet->get_ops();
 
-    for (auto& it : ngraphFunction->get_results())
+    for (auto& it : ieNet->get_results())
     {
         CV_TRACE_REGION("output");
         const auto& outputName = it->get_friendly_name();
@@ -806,11 +771,11 @@ Net openvino_readNetwork(const String& modelPath, const String& binPath)
 {
     FPDenormalsIgnoreHintScope fp_denormals_ignore_scope;
 
-    InferenceEngine::Core& ie = getCore("");
-    InferenceEngine::CNNNetwork ieNet;
+    ov::Core& ie = getCore("");
+    std::shared_ptr<ov::Model> ieNet;
     try
     {
-        ieNet = ie.ReadNetwork(modelPath, binPath);
+        ieNet = ie.read_model(modelPath, binPath);
     }
     catch (const std::exception& e)
     {
@@ -829,22 +794,15 @@ Net openvino_readNetwork(
 {
     FPDenormalsIgnoreHintScope fp_denormals_ignore_scope;
 
-    InferenceEngine::Core& ie = getCore("");
+    ov::Core& ie = getCore("");
 
     std::string model; model.assign((char*)bufferModelConfigPtr, bufferModelConfigSize);
 
-    InferenceEngine::CNNNetwork ieNet;
+    std::shared_ptr<ov::Model> ieNet;
     try
     {
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
         ov::Tensor weights_blob(ov::element::u8, {bufferWeightsSize}, (void*)bufferWeightsPtr);
         ieNet = ie.read_model(model, weights_blob);
-#else
-        InferenceEngine::TensorDesc tensorDesc(InferenceEngine::Precision::U8, { bufferWeightsSize }, InferenceEngine::Layout::C);
-        InferenceEngine::Blob::CPtr weights_blob = InferenceEngine::make_shared_blob<uint8_t>(tensorDesc, (uint8_t*)bufferWeightsPtr, bufferWeightsSize);
-
-        ieNet = ie.ReadNetwork(model, weights_blob);
-#endif
     }
     catch (const std::exception& e)
     {
diff --git a/modules/dnn/src/ocl4dnn/src/math_functions.cpp b/modules/dnn/src/ocl4dnn/src/math_functions.cpp
index c924d66b12b4..1da14c4c6341 100644
--- a/modules/dnn/src/ocl4dnn/src/math_functions.cpp
+++ b/modules/dnn/src/ocl4dnn/src/math_functions.cpp
@@ -156,7 +156,7 @@ static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
     CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2 ||
              gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl;
 
-    bool halfPrecisionMode = (A.depth() == CV_16S);
+    bool halfPrecisionMode = (A.depth() == CV_16F);
 
     if (is_image_a)
     {
@@ -439,7 +439,7 @@ static bool ocl4dnnFastBufferGEMM(const CBLAS_TRANSPOSE TransA,
     CHECK_EQ(gemm_type == GEMM_TYPE_FAST_BUFFER, true)
              << "Invalid fast buffer gemm type." << std::endl;
 
-    bool halfPrecisionMode = (A.depth() == CV_16S);
+    bool halfPrecisionMode = (A.depth() == CV_16F);
 
     size_t sub_group_size = 8;
     bool is_small_batch = (M == 2 || M == 4 || M == 8);
@@ -544,7 +544,7 @@ bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
                        const UMat B_image, UMat C,
                        const size_t max_image_size)
 {
-    bool halfPrecisionMode = (A.depth() == CV_16S);
+    bool halfPrecisionMode = (A.depth() == CV_16F);
     gemm_type_t gemm_type = halfPrecisionMode ? GEMM_TYPE_FAST_BUFFER : GEMM_TYPE_FAST_IMAGE_32_1;
 
     if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 ||
@@ -594,7 +594,7 @@ bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,
                  const int32_t offy)
 {
     bool ret = false;
-    bool use_half = (A.depth() == CV_16S);
+    bool use_half = (A.depth() == CV_16F);
     String opts;
     if (use_half)
         opts = format("-DDtype=%s -DDtype4=%s -Dconvert_Dtype=convert_%s", "half", "half4", "half");
@@ -665,7 +665,7 @@ bool ocl4dnnAXPY(const int32_t N, const Dtype alpha,
                  const UMat X, const int32_t offX, UMat Y,
                  const int32_t offY)
 {
-    bool use_half = (X.depth() == CV_16S);
+    bool use_half = (X.depth() == CV_16F);
     String opts;
     if (use_half)
         opts = "-DDtype=half -DDtype4=half4 -Dconvert_Dtype=convert_half";
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index 90cc2108d673..5df82b24e4c1 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -582,10 +582,10 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
     }
 
     if (use_half_ && !bias.empty())
-        CV_CheckTypeEQ(bias.type(), CV_16SC1, "");
+        CV_CheckTypeEQ(bias.type(), CV_16FC1, "");
 
     if (use_half_)
-        CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
+        CV_CheckTypeEQ(weight.type(), CV_16FC1, "");
 
     prepareKernel(bottom, top, weight, bias, numImages);
     if (bestKernelConfig.empty())
@@ -740,7 +740,7 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
     if (swizzled_weights_umat.empty())
         swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ *
                                      kernel_h_ * (int)alignSize(kernel_w_, 2),
-                                     (use_half_) ? CV_16SC1 : CV_32FC1);
+                                     (use_half_) ? CV_16FC1 : CV_32FC1);
 
     if (!interleave) {
         int32_t channels = channels_ / group_;
@@ -777,8 +777,8 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
         UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack
         if (use_half_)
         {
-            CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
-            convertFp16(weight, weight_tmp);
+            CV_CheckTypeEQ(weight.type(), CV_16FC1, "");
+            weight.convertTo(weight_tmp, CV_32F);
             weightMat = weight_tmp.getMat(ACCESS_READ);
             swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F);
         }
@@ -817,7 +817,7 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
         weightMat.release();
 
         if (use_half_)
-            convertFp16(swizzledWeightMat, swizzled_weights_umat);
+            swizzledWeightMat.convertTo(swizzled_weights_umat, CV_16F);
     }
 
     return true;
@@ -1140,7 +1140,7 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
 
     //int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
     CV_CheckEQ(top.total(), (size_t)numImages * num_output_ * output_h_ * output_w_, "");
-    CV_CheckTypeEQ(top.type(), (use_half_) ? CV_16SC1 : CV_32FC1, "");
+    CV_CheckTypeEQ(top.type(), (use_half_) ? CV_16FC1 : CV_32FC1, "");
     top.setTo(Scalar::all(0));
 
     bool saved_tuned = tuned_;
@@ -1154,8 +1154,8 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
     Mat mat_top, mat_verify_top;
     if (use_half_)
     {
-        convertFp16(top, new_top);
-        convertFp16(verifyTop, new_verify_top);
+        top.convertTo(new_top, CV_32F);
+        verifyTop.convertTo(new_verify_top, CV_32F);
 
         mat_top = new_top.getMat(ACCESS_READ);
         mat_verify_top = new_verify_top.getMat(ACCESS_READ);
@@ -1462,6 +1462,16 @@ void OCL4DNNConvSpatial<float>::generate_gemmlike_tuneritems(std::vector< cv::Pt
             return;
     }
 
+    // issue #24734
+    // OpenCL 1.2: https://registry.khronos.org/OpenCL/specs/opencl-1.2.pdf
+    // section 6.1.2 page 200: "Supported values of n are 2, 3, 4, 8, and 16 for all vector data types."
+    // besides of builtin types, kernel code defines extra types up to float15 (see float15 definition)
+    if (kernel_w_ > 16)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/OCL: skip KERNEL_TYPE_GEMM_LIKE with blockMKN=[" << blockM << ", " << blockK << ", " << blockN << "] kernel=" << kernel_w_ << " x " << kernel_h_);
+        return;
+    }
+
     tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN));
 }
 
@@ -1817,7 +1827,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
     if (loadTunedConfig())  // check external storage
         return;
 
-    UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
+    UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16FC1 : CV_32FC1);
 
     calculateBenchmark(bottom, benchData, weight, bias, numImages);
 
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp
index ee7a2c7b0121..51b459ea1e29 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp
@@ -97,15 +97,19 @@ bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom,
                                            max_image_size);
         }
 
-        if (use_half_ && bias_term_)
-        {
-            UMat biasOneMat = UMat::ones(M_, 1, CV_32F);
-            UMat newbias, tmpTop;
+        if (bias_term_) {
+            if (use_half_) {
+                UMat biasOneMat = UMat::ones(M_, 1, CV_32F);
+                UMat newbias, tmpTop;
 
-            convertFp16(bias, newbias);
-            convertFp16(top, tmpTop);
-            cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0);
-            convertFp16(tmpTop, top);
+                bias.convertTo(newbias, CV_32F);
+                top.convertTo(tmpTop, CV_32F);
+                cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0);
+                tmpTop.convertTo(top, CV_16F);
+            } else {
+                UMat biasOnesMat = UMat::ones(M_, 1, CV_32F);
+                cv::gemm(biasOnesMat, bias, 1, top, 1, top, 0);
+            }
         }
 
         return ret;
diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
index 8f9be6e96137..4b857ebc16f2 100644
--- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
+++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@@ -13,6 +13,7 @@
 
 #include <opencv2/core/utils/logger.hpp>
 #include <queue>
+#include <limits>
 
 namespace cv { namespace dnn {
 CV__DNN_INLINE_NS_BEGIN
@@ -64,6 +65,12 @@ class ONNXGraphWrapper : public ImportGraphWrapper
 public:
     ONNXGraphWrapper(opencv_onnx::GraphProto& _net) : net(_net)
     {
+        // Add a fake initializer with empty name.
+        // Some ONNX models skip their inputs. For example,
+        // Resize which has 4 inputs but 2 of them have empty names.
+        // So we add a fake empty node to which such ops may refer as input.
+        net.add_initializer();
+
         numInputs = net.input_size();
         numInitializers = net.initializer_size();
     }
@@ -76,6 +83,36 @@ class ONNXGraphWrapper : public ImportGraphWrapper
         return makePtr<ONNXNodeWrapper>(node);
     }
 
+    int getTensorShapeSize(int node_id, int node_input_id) {
+        const auto node = getNode(node_id);
+        const auto &input_name = node->getInputName(node_input_id);
+        // try to get from value_info
+        for (int i = 0; i < net.value_info_size(); i++) {
+            const auto value_info = net.value_info(i);
+            if (value_info.name() == input_name) {
+                if (value_info.has_type() && value_info.type().has_tensor_type() &&
+                    value_info.type().tensor_type().has_shape()) {
+                    return value_info.type().tensor_type().shape().dim_size();
+                } else {
+                    return -1;
+                }
+            }
+        }
+        // try to get from input
+        for (int i = 0; i < net.input_size(); i++) {
+            const auto input = net.input(i);
+            if (input.name() == input_name) {
+                if (input.has_type() && input.type().has_tensor_type() &&
+                    input.type().tensor_type().has_shape()) {
+                    return input.type().tensor_type().shape().dim_size();
+                } else {
+                    return -1;
+                }
+            }
+        }
+        return -1;
+    }
+
     int getInputInitializerId(int node_id, int node_input_id)
     {
         auto node = getNode(node_id);
@@ -125,8 +162,13 @@ class ONNXGraphWrapper : public ImportGraphWrapper
 
     virtual void removeNode(int idx) CV_OVERRIDE
     {
-        CV_Assert(idx >= numInputs + numInitializers);
-        net.mutable_node()->DeleteSubrange(idx - numInputs - numInitializers, 1);
+        if (idx >= numInputs + numInitializers)
+            net.mutable_node()->DeleteSubrange(idx - numInputs - numInitializers, 1);
+    }
+
+    virtual inline bool isCommutativeOp(const std::string& type) const CV_OVERRIDE
+    {
+        return type == "Add" || type == "Mul" || type == "Equal" || type == "Max";
     }
 
 private:
@@ -134,6 +176,476 @@ class ONNXGraphWrapper : public ImportGraphWrapper
     opencv_onnx::GraphProto& net;
 };
 
+static Mat extractConstant(const Ptr<ImportGraphWrapper>& net, int node_id, int input_id)
+{
+    auto onnx_net = net.dynamicCast<ONNXGraphWrapper>();
+    int initializer_id = onnx_net->getInputInitializerId(node_id, input_id);
+    if (initializer_id != -1)
+    {
+        return onnx_net->getMatFromInitializer(initializer_id);
+    }
+    else
+    {
+        const Ptr<ImportNodeWrapper> node = net->getNode(node_id);
+        int constant_id = Subgraph::getInputNodeId(net, node, input_id);
+        Ptr<ImportNodeWrapper> constant_ptr = net->getNode(constant_id);
+        opencv_onnx::NodeProto* constant_node = constant_ptr.dynamicCast<ONNXNodeWrapper>()->node;
+        opencv_onnx::TensorProto constant_proto = constant_node->attribute(0).t();
+        return getMatFromTensor(constant_proto);
+    }
+}
+
+static std::string getInputName(const Ptr<ImportGraphWrapper>& net, int node_id, int input_id) {
+    auto onnx_net = net.dynamicCast<ONNXGraphWrapper>();
+    int initializer_id = onnx_net->getInputInitializerId(node_id, input_id);
+    if (initializer_id != -1) {
+        return onnx_net->getNameOfInitializer(initializer_id);
+    } else {
+        const auto node = net->getNode(node_id);
+        return node->getInputName(input_id);
+    }
+}
+
+/*  Slice operator has two optional inputs "axes" and "steps". Some models may be set to have
+    Slice with optional inputs of default values, some of them don't. This Subgraph adjusts
+    all optional inputs of Slice up to 5.
+*/
+class AdjustSliceAllOptionalInputsSubgraph : public Subgraph {
+ public:
+    AdjustSliceAllOptionalInputsSubgraph(size_t num_inputs = 4) {
+        num_inputs_ = num_inputs;
+
+        int input = addNodeToMatch("");
+        int starts = addNodeToMatch("");
+        int ends = addNodeToMatch("");
+        std::vector<int> inputs{input, starts, ends};
+        for (size_t i = 3; i < num_inputs_; i++) { // axes and steps
+            inputs.push_back(addNodeToMatch(""));
+        }
+
+        slice_id = addNodeToMatch("Slice", inputs);
+
+        setFusedNode("Slice", inputs);
+    }
+
+    virtual void finalize(const Ptr<ImportGraphWrapper>&,
+                          const Ptr<ImportNodeWrapper>& fusedNode,
+                          std::vector<Ptr<ImportNodeWrapper> >&) CV_OVERRIDE
+    {
+        opencv_onnx::NodeProto* node = fusedNode.dynamicCast<ONNXNodeWrapper>()->node;
+        for (int i = num_inputs_; i < 5; ++i) {
+            node->add_input("");
+        }
+    }
+ private:
+    int slice_id;
+    size_t num_inputs_;
+};
+
+/* Fusion for biased MatMul.
+
+   Graph before fusion: [Input] -> MatMul -> Add -> [Output]
+
+   Graph after fusion:  [Input] -> MatMul -> [Output]
+                                     \
+                                     bias
+*/
+
+class BiasedMatmulSubgraph : public Subgraph {
+ public:
+    BiasedMatmulSubgraph() {
+        int input = addNodeToMatch("");
+        matmul_id = addNodeToMatch("MatMul", input, addNodeToMatch(""));
+        add_id = addNodeToMatch("Add", addNodeToMatch(""), matmul_id);
+
+        setFusedNode("MatMul", input);
+    }
+
+    virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE {
+        if (Subgraph::match(net, nodeId, matchedNodesIds)) {
+            auto onnx_net = net.dynamicCast<ONNXGraphWrapper>();
+
+            // get input weight from MatMul
+            {
+                // make sure that input A is not Constant
+                if (onnx_net->getInputInitializerId(matchedNodesIds[matmul_id], 0) >= 0) {
+                    return false;
+                } else {
+                    const Ptr<ImportNodeWrapper> node = net->getNode(matchedNodesIds[matmul_id]);
+
+                    int constant_id = Subgraph::getInputNodeId(net, node, 0);
+                    auto constant_node = net->getNode(constant_id);
+                    if (constant_node->getType() == "Constant") {
+                        return false;
+                    }
+                }
+
+                bool is_weight_const = false;
+                int initializer_id = onnx_net->getInputInitializerId(matchedNodesIds[matmul_id], 1);
+                if (initializer_id != -1) { // Initializer
+                    weight_name = onnx_net->getNameOfInitializer(initializer_id);
+                    is_weight_const = true;
+                } else { // Constant layer
+                    const Ptr<ImportNodeWrapper> node = net->getNode(matchedNodesIds[matmul_id]);
+
+                    int constant_id = Subgraph::getInputNodeId(net, node, 1);
+                    auto constant_node = net->getNode(constant_id);
+                    if (constant_node->getType() == "Constant") {
+                        weight_name = node->getInputName(1);
+                        is_weight_const = true;
+                    }
+                }
+
+                if (!is_weight_const) {
+                    return false;
+                }
+            }
+
+            // get input bias from Add
+            {
+                bool is_bias_const = false;
+                int initializer_id = std::max(onnx_net->getInputInitializerId(matchedNodesIds[add_id], 0),
+                                              onnx_net->getInputInitializerId(matchedNodesIds[add_id], 1));
+                if (initializer_id != -1) {
+                    bias_name = onnx_net->getNameOfInitializer(initializer_id);
+                    is_bias_const = true;
+                } else { // Constant layer
+                    const Ptr<ImportNodeWrapper> node = net->getNode(matchedNodesIds[add_id]);
+
+                    int constant_id = Subgraph::getInputNodeId(net, node, 0);
+                    auto constant_node = net->getNode(constant_id);
+                    if (constant_node->getType() == "Constant") {
+                        bias_name = node->getInputName(0);
+                        is_bias_const = true;
+                    } else {
+                        constant_id = Subgraph::getInputNodeId(net, node, 1);
+                        constant_node = net->getNode(constant_id);
+                        if (constant_node->getType() == "Constant") {
+                            bias_name = node->getInputName(1);
+                            is_bias_const = true;
+                        }
+                    }
+                }
+                if (!is_bias_const) {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+        return false;
+    }
+
+    virtual void finalize(const Ptr<ImportGraphWrapper>& net,
+                          const Ptr<ImportNodeWrapper>& fusedNode,
+                          std::vector<Ptr<ImportNodeWrapper> >&) CV_OVERRIDE {
+        opencv_onnx::NodeProto* node = fusedNode.dynamicCast<ONNXNodeWrapper>()->node;
+        // add inputs
+        node->add_input(weight_name);
+        node->add_input(bias_name);
+    }
+
+ private:
+    int matmul_id, add_id;
+    std::string weight_name, bias_name;
+};
+
+/*  The fusion for the multi-head attention from vision transformer.
+
+    Abbreviations:
+        B - batch_size, symbolic;
+        S - sequence_length, symbolic;
+        W - hidden_size, W = N * H;
+        N - num_heads;
+        H - head_size;
+
+    Graph before fusion:
+                    [Input](BxSxW)
+                      |
+                   LayerNorm
+                      |
+                   Transpose(perm=[1, 0, 2])
+                      |
+                      | (SxBxW)
+                      |
+                    Matmul[Weight(Wx3W)]
+                      |
+                     Add[Bias(3W)]
+          /           |           \
+      q_Slice      k_Slice      v_Slice   (output(SxBxW))
+         |            |            |
+     q_Reshape    k_Reshape    v_Reshape  (output(Sx(BxN)xH), could be optional if N=1)
+         |            |            |
+    q_Transpose  k_Transpose  v_Transpose
+      (1,0,2)      (1,2,0)    (perm=1,0,2)
+         |((BxN)xSxH) |((BxN)xHxS) |
+       q_Div         /            /
+         \          /            /
+          qk_MatMul             /
+              |                /
+         qk_Softmax           /
+              | ((BxN)xSxS)  / ((BxN)xSxH)
+               \            /
+                 qkv_MatMul  (output((BxN)xSxH))
+                     |
+                 Transpose(perm=1,2,0)
+                     |
+                  Reshape  (output(SxH))
+                     |
+                   MatMul
+                     |
+                    Add
+                     |
+                  [Output](BxSxW)
+
+
+    Attributes:
+        num_heads - number of attention heads
+        qkv_hidden_sizes - hidden size of qkv respectively, [qk_hidden_size, qk_hidden_size, v_hidden_size],
+                          assume qk_hidden_size = v_hidden_size for now. TODO: support qk_hidden_size != v_hidden_size
+        scale - scale factor of q, defaults to sqrt(1/num_heads)
+    Inputs:
+        weight - merged Q, K, V weights of shape [input_hidden_size, qk_hidden_size + qk_hidden_size + v_hidden_size]
+        bias - bias of shape [qk_hidden_size + qk_hidden_size + v_hidden_size]
+
+    Graph after fusion:
+            [Input](BxSxW)
+               |
+            LayerNorm
+               |
+           Transpose
+               |
+           Attention[weight, bias]
+               |
+             MatMul
+               |
+              Add
+               |
+            [Output](BxSxW)
+
+    More details see See https://github.com/microsoft/onnxruntime/blob/v1.16.1/docs/ContribOperators.md#com.microsoft.Attention.
+*/
+class AttentionSubGraph : public Subgraph {
+ public:
+    AttentionSubGraph() {
+        int input = addNodeToMatch("");
+        int transpose = addNodeToMatch("Transpose", input); // tranpose does not make any differences to the accuracy here in this subgraph
+        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""), addNodeToMatch("")); // add is fused into matmul via BiasedMatMulSubgraph
+
+        // v_path
+        slice_v = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        int reshape_v = addNodeToMatch("Reshape", slice_v, addNodeToMatch(""));
+        int transpose_v = addNodeToMatch("Transpose", reshape_v);
+
+        // q_path
+        slice_q = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        reshape_q = addNodeToMatch("Reshape", slice_q, addNodeToMatch(""));
+        int transpose_q = addNodeToMatch("Transpose", reshape_q);
+        div_q = addNodeToMatch("Div", transpose_q, addNodeToMatch(""));
+
+        // k_path
+        slice_k = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        int reshape_k = addNodeToMatch("Reshape", slice_k, addNodeToMatch(""));
+        int transpose_k = addNodeToMatch("Transpose", reshape_k);
+
+        // qk
+        int matmul_qk = addNodeToMatch("MatMul", div_q, transpose_k);
+        int softmax_qk = addNodeToMatch("Softmax", matmul_qk);
+
+        // qkv
+        int matmul_qkv = addNodeToMatch("MatMul", softmax_qk, transpose_v);
+        int transpose_qkv = addNodeToMatch("Transpose", matmul_qkv);
+        last_reshape = addNodeToMatch("Reshape", transpose_qkv, addNodeToMatch(""));
+
+        setFusedNode("Attention", input);
+    }
+
+    virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE {
+        if (Subgraph::match(net, nodeId, matchedNodesIds)) {
+            // get attrs - qkv_hidden_sizes
+            qkv_hidden_sizes.clear();
+            auto fill_qkv_hidden_sizes = [&] (const int slice_node_id) {
+                int slice_start = extractConstant(net, matchedNodesIds[slice_node_id], 1).at<int>(0);
+                int slice_end = extractConstant(net, matchedNodesIds[slice_node_id], 2).at<int>(0);
+                if (slice_end == std::numeric_limits<int>::max()) {
+                    qkv_hidden_sizes.push_back(0); // workaround for Slice with end=INT_MAX
+                } else {
+                    int64_t hidden_size = static_cast<int64_t>(slice_end - slice_start);
+                    qkv_hidden_sizes.push_back(hidden_size);
+                }
+            };
+            fill_qkv_hidden_sizes(slice_q);
+            fill_qkv_hidden_sizes(slice_k);
+            fill_qkv_hidden_sizes(slice_v); // TODO: take care of INT64_MAX
+            CV_CheckEQ(qkv_hidden_sizes.size(), static_cast<size_t>(3), "ONNXSimplifier/Attention: invalid qkv hidden sizes");
+            CV_CheckEQ(int(qkv_hidden_sizes[0]), int(qkv_hidden_sizes[1]), "ONNXSimplifier/Attention: invalid qkv hidden sizes, q_hidden_size == v_hidden_size is required");
+            // get attrs - num_heads, scale
+            num_heads = extractConstant(net, matchedNodesIds[reshape_q], 1).at<int>(1);
+            scale = extractConstant(net, matchedNodesIds[div_q], 1).at<float>(0);
+            output_ndims = extractConstant(net, matchedNodesIds[last_reshape], 1).size[0];
+
+            // get names
+            weight_name = getInputName(net, matchedNodesIds[att_matmul], 1);
+            bias_name = getInputName(net, matchedNodesIds[att_matmul], 2);
+            return true;
+        }
+        return false;
+    }
+
+    virtual void finalize(const Ptr<ImportGraphWrapper>& net,
+                          const Ptr<ImportNodeWrapper>& fusedNode,
+                          std::vector<Ptr<ImportNodeWrapper> >&) CV_OVERRIDE {
+        // add attrs
+        opencv_onnx::NodeProto* node = fusedNode.dynamicCast<ONNXNodeWrapper>()->node;
+        opencv_onnx::AttributeProto* attr_num_heads = node->add_attribute();
+        attr_num_heads->set_name("num_heads");
+        attr_num_heads->set_i(num_heads);
+        opencv_onnx::AttributeProto* attr_qkv_hidden_sizes = node->add_attribute();
+        attr_qkv_hidden_sizes->set_name("qkv_hidden_sizes");
+        attr_qkv_hidden_sizes->add_ints(qkv_hidden_sizes[0]);
+        attr_qkv_hidden_sizes->add_ints(qkv_hidden_sizes[1]);
+        attr_qkv_hidden_sizes->add_ints(qkv_hidden_sizes[2]);
+        opencv_onnx::AttributeProto* attr_scale = node->add_attribute();
+        attr_scale->set_name("scale");
+        attr_scale->set_f(scale);
+
+        // add customized attrs
+        opencv_onnx::AttributeProto* attr_output_ndims = node->add_attribute();
+        attr_output_ndims->set_name("output_ndims");
+        attr_output_ndims->set_i(output_ndims);
+
+        // add inputs
+        node->add_input(weight_name);
+        node->add_input(bias_name);
+    }
+
+ private:
+    int att_matmul;
+    int slice_q, slice_k, slice_v;
+    int reshape_q, div_q, last_reshape;
+
+    std::vector<int64_t> qkv_hidden_sizes; // order: [qk_hidden_size, qk_hidden_size, v_hidden_size]
+    int64_t num_heads;
+    float scale;
+
+    int64_t output_ndims;
+
+    std::string weight_name;
+    std::string bias_name;
+};
+
+/*  Attention subgraph with single head.
+    No Reshape operator is appended after each Slice operator.
+*/
+class AttentionSingleHeadSubGraph : public Subgraph {
+ public:
+    AttentionSingleHeadSubGraph() {
+        int input = addNodeToMatch("");
+        int transpose = addNodeToMatch("Transpose", input); // tranpose does not make any differences to the accuracy here in this subgraph
+        att_matmul = addNodeToMatch("MatMul", transpose, addNodeToMatch(""), addNodeToMatch("")); // add is fused into matmul via BiasedMatMulSubgraph
+
+        // v_path
+        slice_v = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        int transpose_v = addNodeToMatch("Transpose", slice_v);
+
+        // q_path
+        slice_q = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        int transpose_q = addNodeToMatch("Transpose", slice_q);
+        div_q = addNodeToMatch("Div", transpose_q, addNodeToMatch(""));
+
+        // k_path
+        slice_k = addNodeToMatch("Slice", std::vector<int>{att_matmul, addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch(""), addNodeToMatch("")});
+        int transpose_k = addNodeToMatch("Transpose", slice_k);
+
+        // qk
+        int matmul_qk = addNodeToMatch("MatMul", div_q, transpose_k);
+        int softmax_qk = addNodeToMatch("Softmax", matmul_qk);
+
+        // qkv
+        int matmul_qkv = addNodeToMatch("MatMul", softmax_qk, transpose_v);
+        int transpose_qkv = addNodeToMatch("Transpose", matmul_qkv);
+        last_reshape = addNodeToMatch("Reshape", transpose_qkv, addNodeToMatch(""));
+
+        setFusedNode("Attention", input);
+    }
+
+    virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE {
+        if (Subgraph::match(net, nodeId, matchedNodesIds)) {
+            // get attrs - qkv_hidden_sizes
+            qkv_hidden_sizes.clear();
+            auto fill_qkv_hidden_sizes = [&] (const int slice_node_id) {
+                int slice_start = extractConstant(net, matchedNodesIds[slice_node_id], 1).at<int>(0);
+                int slice_end = extractConstant(net, matchedNodesIds[slice_node_id], 2).at<int>(0);
+                if (slice_end == std::numeric_limits<int>::max()) {
+                    qkv_hidden_sizes.push_back(0); // workaround for Slice with end=INT_MAX
+                } else {
+                    int64_t hidden_size = static_cast<int64_t>(slice_end - slice_start);
+                    qkv_hidden_sizes.push_back(hidden_size);
+                }
+            };
+            fill_qkv_hidden_sizes(slice_q);
+            fill_qkv_hidden_sizes(slice_k);
+            fill_qkv_hidden_sizes(slice_v);
+            CV_CheckEQ(qkv_hidden_sizes.size(), static_cast<size_t>(3), "ONNXSimplifier/Attention: invalid qkv hidden sizes");
+            CV_CheckEQ(int(qkv_hidden_sizes[0]), int(qkv_hidden_sizes[1]), "ONNXSimplifier/Attention: invalid qkv hidden sizes, q_hidden_size == v_hidden_size is required");
+            // get attrs - num_heads, scale
+            num_heads = 1;
+            scale = extractConstant(net, matchedNodesIds[div_q], 1).at<float>(0);
+            output_ndims = extractConstant(net, matchedNodesIds[last_reshape], 1).size[0];
+
+            // get names
+            weight_name = getInputName(net, matchedNodesIds[att_matmul], 1);
+            bias_name = getInputName(net, matchedNodesIds[att_matmul], 2);
+            return true;
+        }
+        return false;
+    }
+
+    virtual void finalize(const Ptr<ImportGraphWrapper>& net,
+                          const Ptr<ImportNodeWrapper>& fusedNode,
+                          std::vector<Ptr<ImportNodeWrapper> >&) CV_OVERRIDE {
+        // add attrs
+        opencv_onnx::NodeProto* node = fusedNode.dynamicCast<ONNXNodeWrapper>()->node;
+        opencv_onnx::AttributeProto* attr_num_heads = node->add_attribute();
+        attr_num_heads->set_name("num_heads");
+        attr_num_heads->set_i(num_heads);
+        opencv_onnx::AttributeProto* attr_qkv_hidden_sizes = node->add_attribute();
+        attr_qkv_hidden_sizes->set_name("qkv_hidden_sizes");
+        attr_qkv_hidden_sizes->add_ints(qkv_hidden_sizes[0]);
+        attr_qkv_hidden_sizes->add_ints(qkv_hidden_sizes[1]);
+        attr_qkv_hidden_sizes->add_ints(qkv_hidden_sizes[2]);
+        opencv_onnx::AttributeProto* attr_scale = node->add_attribute();
+        attr_scale->set_name("scale");
+        attr_scale->set_f(scale);
+
+        // add customized attrs
+        opencv_onnx::AttributeProto* attr_output_ndims = node->add_attribute();
+        attr_output_ndims->set_name("output_ndims");
+        attr_output_ndims->set_i(output_ndims);
+
+        // add inputs
+        node->add_input(weight_name);
+        node->add_input(bias_name);
+    }
+
+ protected:
+    int att_matmul;
+    int slice_q, slice_k, slice_v;
+    int div_q, last_reshape;
+
+    std::vector<int64_t> qkv_hidden_sizes; // order: [qk_hidden_size, qk_hidden_size, v_hidden_size]
+    int64_t num_heads;
+    float scale;
+
+    int64_t output_ndims;
+
+    std::string weight_name;
+    std::string bias_name;
+};
+
 /*  Fusion for Gelu.
 
     Graph before fusion:
@@ -151,54 +663,32 @@ class GeluSubGraph : public Subgraph
     GeluSubGraph()
     {
         int input = addNodeToMatch("");
-        int div = addNodeToMatch("Div", input, addNodeToMatch("") /* B=sqrt(2) */ );
+        div = addNodeToMatch("Div", input, addNodeToMatch("") /* B=sqrt(2) */ );
         int erf = addNodeToMatch("Erf", div);
-        int add = addNodeToMatch("Add", erf, addNodeToMatch("") /* B=1 */ );
+        add = addNodeToMatch("Add", erf, addNodeToMatch("") /* B=1 */ );
         int mul = addNodeToMatch("Mul", input, add);
-        addNodeToMatch("Mul", mul, addNodeToMatch("") /* B=0.5 */) ;
+        mul2 = addNodeToMatch("Mul", mul, addNodeToMatch("") /* B=0.5 */) ;
 
         setFusedNode("Gelu", input);
     }
 
-    static float extractConstant(const Ptr<ImportGraphWrapper>& net, int node_id, int input_id)
-    {
-        auto onnx_net = net.dynamicCast<ONNXGraphWrapper>();
-        int initializer_id = onnx_net->getInputInitializerId(node_id, input_id);
-        if (initializer_id != -1)
-        {
-            Mat const_mat = onnx_net->getMatFromInitializer(initializer_id);
-            return *const_mat.ptr<float>();
-        }
-        else
-        {
-            const Ptr<ImportNodeWrapper> node = net->getNode(node_id);
-            int constant_id = getInputNodeId(net, node, input_id);
-            Ptr<ImportNodeWrapper> constant_ptr = net->getNode(constant_id);
-            opencv_onnx::NodeProto* constant_node = constant_ptr.dynamicCast<ONNXNodeWrapper>()->node;
-            opencv_onnx::TensorProto constant_proto = constant_node->attribute(0).t();
-            Mat constant_mat = getMatFromTensor(constant_proto);
-            return *constant_mat.ptr<float>();
-        }
-    }
-
     virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
-                       std::vector<int>& matchedNodesIds,
-                       std::vector<int>& targetNodesIds) CV_OVERRIDE
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE
     {
-        if (Subgraph::match(net, nodeId, matchedNodesIds, targetNodesIds))
+        if (Subgraph::match(net, nodeId, matchedNodesIds))
         {
             // Check Div[B=sqrt(2)]
-            float divisor = extractConstant(net, matchedNodesIds[0], 1);
+            float divisor = extractConstant(net, matchedNodesIds[div], 1).at<float>(0);
             if (std::fabs(divisor - M_SQRT2) >= std::numeric_limits<float>::epsilon())
                 return false;
 
             // Check Add[B=1]
-            float add_const = extractConstant(net, matchedNodesIds[2], 1);
+            float add_const = extractConstant(net, matchedNodesIds[add], 1).at<float>(0);
             if (std::fabs(add_const - 1.f) >= std::numeric_limits<float>::epsilon())
                 return false;
 
             // Check Mul[B=0.5]
-            float mul_const = extractConstant(net, matchedNodesIds[4], 1);
+            float mul_const = extractConstant(net, matchedNodesIds[mul2], 1).at<float>(0);
             if (std::fabs(mul_const - 0.5f) >= std::numeric_limits<float>::epsilon())
                 return false;
 
@@ -206,6 +696,9 @@ class GeluSubGraph : public Subgraph
         }
         return false;
     }
+
+private:
+    int div, add, mul2;
 };
 
 /*  Fusion for GeluApproximation.
@@ -229,61 +722,39 @@ class GeluApproximationSubGraph : public Subgraph
         int input = addNodeToMatch("");
         int mul0 = addNodeToMatch("Mul", input, input);
         int mul1 = addNodeToMatch("Mul", input, mul0);
-        int mul2 = addNodeToMatch("Mul", addNodeToMatch("") /* A=0.044714998453855515 */, mul1);
+        mul2 = addNodeToMatch("Mul", addNodeToMatch("") /* A=0.044714998453855515 */, mul1);
         int add0 = addNodeToMatch("Add", input, mul2);
-        int mul3 = addNodeToMatch("Mul", addNodeToMatch("") /* A=sqrt(2/pie) */, add0);
+        mul3 = addNodeToMatch("Mul", addNodeToMatch("") /* A=sqrt(2/pie) */, add0);
         int tanh = addNodeToMatch("Tanh", mul3);
-        int add1 = addNodeToMatch("Add", addNodeToMatch("") /* A=1 */, tanh);
+        add1 = addNodeToMatch("Add", addNodeToMatch("") /* A=1 */, tanh);
         int mul4 = addNodeToMatch("Mul", input, add1);
-        addNodeToMatch("Mul", addNodeToMatch("") /* A=0.5 */, mul4);
+        mul5 = addNodeToMatch("Mul", addNodeToMatch("") /* A=0.5 */, mul4);
 
         setFusedNode("GeluApproximation", input);
     }
 
-    static float extractConstant(const Ptr<ImportGraphWrapper>& net, int node_id, int input_id)
-    {
-        auto onnx_net = net.dynamicCast<ONNXGraphWrapper>();
-        int initializer_id = onnx_net->getInputInitializerId(node_id, input_id);
-        if (initializer_id != -1)
-        {
-            Mat const_mat = onnx_net->getMatFromInitializer(initializer_id);
-            return *const_mat.ptr<float>();
-        }
-        else
-        {
-            const Ptr<ImportNodeWrapper> node = net->getNode(node_id);
-            int constant_id = getInputNodeId(net, node, input_id);
-            Ptr<ImportNodeWrapper> constant_ptr = net->getNode(constant_id);
-            opencv_onnx::NodeProto* constant_node = constant_ptr.dynamicCast<ONNXNodeWrapper>()->node;
-            opencv_onnx::TensorProto constant_proto = constant_node->attribute(0).t();
-            Mat constant_mat = getMatFromTensor(constant_proto);
-            return *constant_mat.ptr<float>();
-        }
-    }
-
     virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
-                       std::vector<int>& matchedNodesIds,
-                       std::vector<int>& targetNodesIds) CV_OVERRIDE
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE
     {
-        if (Subgraph::match(net, nodeId, matchedNodesIds, targetNodesIds))
+        if (Subgraph::match(net, nodeId, matchedNodesIds))
         {
             // Check Mul[A=0.044714998453855515]
-            float coef = extractConstant(net, matchedNodesIds[2], 0);
+            float coef = extractConstant(net, matchedNodesIds[mul2], 0).at<float>(0);
             if (coef - 0.044714998453855515 >= 1e-6)
                 return false;
 
             // Check Mul[A=sqrt(2/pie)]
-            float sqrt_2_pie = extractConstant(net, matchedNodesIds[4], 0);
+            float sqrt_2_pie = extractConstant(net, matchedNodesIds[mul3], 0).at<float>(0);
             if (sqrt_2_pie - 0.7978845834732056 >= 1e-6)
                 return false;
 
             // Check Add[A=1]
-            float add_const = extractConstant(net, matchedNodesIds[6], 0);
+            float add_const = extractConstant(net, matchedNodesIds[add1], 0).at<float>(0);
             if (add_const - 1.f >= 1e-6)
                 return false;
 
             // Check Mul[A=0.5]
-            float mul_const = extractConstant(net, matchedNodesIds[8], 0);
+            float mul_const = extractConstant(net, matchedNodesIds[mul5], 0).at<float>(0);
             if (mul_const - 0.5f >= 1e-6)
                 return false;
 
@@ -291,6 +762,9 @@ class GeluApproximationSubGraph : public Subgraph
         }
         return false;
     }
+
+private:
+    int mul2, mul3, add1, mul5;
 };
 
 /*  Fusion for LayerNormalization.
@@ -306,6 +780,10 @@ class GeluApproximationSubGraph : public Subgraph
         [Input] -> LayerNorm -> [Output]
                         \
                     [weight], [bias]
+
+    Note: axes of ReduceMean must be:
+          - last element is the axis of last dimension (-1 or (input_ndims - 1))
+          - a list of adjacent axes, e.g. [1, 2, 3, ..., input_ndims - 1]
 */
 class LayerNormSubGraph : public Subgraph
 {
@@ -313,93 +791,79 @@ class LayerNormSubGraph : public Subgraph
     LayerNormSubGraph() : axis(-1), epsilon(1e-5)
     {
         int input = addNodeToMatch("");
-        int mean = addNodeToMatch("ReduceMean", input);
+        mean = addNodeToMatch("ReduceMean", input);
 
         int sub = addNodeToMatch("Sub", input, mean);
 
-        int pow = addNodeToMatch("Pow", sub, addNodeToMatch(""));
-        int mean1 = addNodeToMatch("ReduceMean", pow);
-        int add = addNodeToMatch("Add", mean1, addNodeToMatch(""));
+        pow = addNodeToMatch("Pow", sub, addNodeToMatch(""));
+        mean1 = addNodeToMatch("ReduceMean", pow);
+        add = addNodeToMatch("Add", mean1, addNodeToMatch(""));
         int sqrt = addNodeToMatch("Sqrt", add);
 
         int div = addNodeToMatch("Div", sub, sqrt);
-        int mul = addNodeToMatch("Mul", div, addNodeToMatch(""));
-        addNodeToMatch("Add", mul, addNodeToMatch(""));
+        mul = addNodeToMatch("Mul", div, addNodeToMatch(""));
+        bias = addNodeToMatch("Add", mul, addNodeToMatch(""));
 
         setFusedNode("LayerNormalization", input);
     }
 
-    static float extractConstant(const Ptr<ImportGraphWrapper>& net, int node_id, int input_id)
-    {
-        auto onnx_net = net.dynamicCast<ONNXGraphWrapper>();
-        int initializer_id = onnx_net->getInputInitializerId(node_id, input_id);
-        if (initializer_id != -1) // initializer
-        {
-            Mat const_mat = onnx_net->getMatFromInitializer(initializer_id);
-            return *const_mat.ptr<float>();
-        }
-        else
-        {
-            const Ptr<ImportNodeWrapper> node = net->getNode(node_id);
-            int constant_id = getInputNodeId(net, node, input_id);
-            Ptr<ImportNodeWrapper> constant_ptr = net->getNode(constant_id);
-            opencv_onnx::NodeProto* constant_node = constant_ptr.dynamicCast<ONNXNodeWrapper>()->node;
-            opencv_onnx::TensorProto constant_proto = constant_node->attribute(0).t();
-            Mat constant_mat = getMatFromTensor(constant_proto);
-            return *constant_mat.ptr<float>();
-        }
-    }
-
-    static float extractAxis(const Ptr<ImportGraphWrapper>& net, int node_id)
+    static std::vector<int64_t> extractAxis(const Ptr<ImportGraphWrapper>& net, int node_id)
     {
+        // TODO: consider ReduceMean-18 which has axes as one of the inputs instead of attributes
         Ptr<ImportNodeWrapper> mean_ptr = net->getNode(node_id);
         opencv_onnx::NodeProto* mean_node = mean_ptr.dynamicCast<ONNXNodeWrapper>()->node;
-        int axis_ = -1;
+        std::vector<int64_t> axes;
         for (int i = 0; i < mean_node->attribute_size(); i++)
         {
             opencv_onnx::AttributeProto attr = mean_node->attribute(i);
             if (attr.name() != "axes")
                 continue;
-            axis_ = static_cast<int>(attr.ints(0));
-        }
-        return axis_;
-    }
-
-    static std::string getInputName(const Ptr<ImportGraphWrapper>& net, int node_id, int input_id)
-    {
-        auto onnx_net = net.dynamicCast<ONNXGraphWrapper>();
-        int initializer_id = onnx_net->getInputInitializerId(node_id, input_id);
-        if (initializer_id != -1)
-        {
-            return onnx_net->getNameOfInitializer(initializer_id);
-        }
-        else
-        {
-            const auto node = net->getNode(node_id);
-            return node->getInputName(input_id);
+            for (int j = 0; j < attr.ints_size(); j++) {
+                axes.push_back(attr.ints(j));
+            }
         }
+        return axes;
     }
 
     virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
-                       std::vector<int>& matchedNodesIds,
-                       std::vector<int>& targetNodesIds) CV_OVERRIDE
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE
     {
-        if (Subgraph::match(net, nodeId, matchedNodesIds, targetNodesIds))
+        if (Subgraph::match(net, nodeId, matchedNodesIds))
         {
-            float pow_exp = extractConstant(net, matchedNodesIds[2], 1);
+            float pow_exp = extractConstant(net, matchedNodesIds[pow], 1).at<float>(0);
             if (pow_exp - 2 > 1e-5) // not pow(2)
                 return false;
 
-            int axis_mean1 = extractAxis(net, matchedNodesIds[0]);
-            int axis_mean2 = extractAxis(net, matchedNodesIds[3]);
-            if (axis_mean1 != axis_mean2)
+            std::vector<int64_t> axes = extractAxis(net, matchedNodesIds[mean]);
+            // check whether it is -1 or last_axis or [axis, ..., last_axis]
+            int64_t input_ndims = static_cast<int64_t>(net.dynamicCast<ONNXGraphWrapper>()->getTensorShapeSize(matchedNodesIds[mean], 0));
+            if (input_ndims == -1) {
+                return false; // input shape unknown
+            }
+            // assume that axes are sorted in ascending order, e.g. [0, 1, 2, 3] or [-3, -2, -1]
+            if (axes.back() != -1 && axes.back() != (input_ndims - 1)) {
+                return false;
+            }
+            for (size_t i = 0; i < axes.size() - 1; i++) {
+                if (axes[i] - axes[i + 1] != -1) {
+                    return false;
+                }
+            }
+
+            std::vector<int64_t> axes1 = extractAxis(net, matchedNodesIds[mean1]);
+            if (axes.size() != axes1.size())
                 return false;
-            axis = axis_mean1;
+            for (size_t i = 0; i < axes.size(); i++) {
+                if (((axes[i] + input_ndims) % input_ndims) != ((axes1[i] + input_ndims) % input_ndims)) {
+                    return false;
+                }
+            }
+            axis = axes[0];
 
-            epsilon = extractConstant(net, matchedNodesIds[4], 1);
+            epsilon = extractConstant(net, matchedNodesIds[add], 1).at<float>(0);
 
-            weight_name = getInputName(net, matchedNodesIds[7], 1);
-            bias_name = getInputName(net, matchedNodesIds[8], 1);
+            weight_name = getInputName(net, matchedNodesIds[mul], 1);
+            bias_name = getInputName(net, matchedNodesIds[bias], 1);
 
             return true;
         }
@@ -429,6 +893,7 @@ class LayerNormSubGraph : public Subgraph
     float epsilon;
     std::string weight_name;
     std::string bias_name;
+    int pow, mean, mean1, add, mul, bias;
 };
 
 class SoftMaxSubgraphBase : public Subgraph
@@ -437,10 +902,9 @@ class SoftMaxSubgraphBase : public Subgraph
     SoftMaxSubgraphBase() : axis(1), id(-1) {}
 
     virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
-                       std::vector<int>& matchedNodesIds,
-                       std::vector<int>& targetNodesIds) CV_OVERRIDE
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE
     {
-        if (Subgraph::match(net, nodeId, matchedNodesIds, targetNodesIds))
+        if (Subgraph::match(net, nodeId, matchedNodesIds))
         {
             CV_Assert(id >= 0 && id < matchedNodesIds.size());
             Ptr<ImportNodeWrapper> sum = net->getNode(matchedNodesIds[id]);
@@ -485,7 +949,7 @@ class SoftMaxSubgraph : public SoftMaxSubgraphBase
         int inpExp = addNodeToMatch("Exp", input);
 
         int sum = addNodeToMatch("ReduceSum", inpExp);
-        id = 1;
+        id = sum;
 
         addNodeToMatch("Div", inpExp, sum);
         setFusedNode("Softmax", input);
@@ -498,7 +962,7 @@ class SoftMaxSubgraph2 : public SoftMaxSubgraphBase {
         int input = addNodeToMatch("");
 
         int reducemax = addNodeToMatch("ReduceMax", input);
-        id = 0;
+        id = reducemax;
 
         int sub = addNodeToMatch("Sub", input, reducemax);
         int exp = addNodeToMatch("Exp", sub);
@@ -516,7 +980,7 @@ class LogSoftMaxSubgraph : public SoftMaxSubgraphBase
         int input = addNodeToMatch("");
 
         int reducemax = addNodeToMatch("ReduceMax", input);
-        id = 0;
+        id = reducemax;
 
         int sub_1 = addNodeToMatch("Sub", input, reducemax);
         int exp = addNodeToMatch("Exp", sub_1);
@@ -533,18 +997,17 @@ class HardSwishSubgraph : public Subgraph
     HardSwishSubgraph()
     {
         int input = addNodeToMatch("");
-        int hardSigmoid = addNodeToMatch("HardSigmoid", input);
-        addNodeToMatch("Mul", input, hardSigmoid);
+        hardSigmoidId = addNodeToMatch("HardSigmoid", input);
+        addNodeToMatch("Mul", input, hardSigmoidId);
         setFusedNode("HardSwish", input);
     }
 
     virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
-                       std::vector<int>& matchedNodesIds,
-                       std::vector<int>& targetNodesIds) CV_OVERRIDE
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE
     {
-        if (Subgraph::match(net, nodeId, matchedNodesIds, targetNodesIds))
+        if (Subgraph::match(net, nodeId, matchedNodesIds))
         {
-            Ptr<ImportNodeWrapper> hardSigmoid = net->getNode(matchedNodesIds[0]);
+            Ptr<ImportNodeWrapper> hardSigmoid = net->getNode(matchedNodesIds[hardSigmoidId]);
             opencv_onnx::NodeProto* node = hardSigmoid.dynamicCast<ONNXNodeWrapper>()->node;
 
             uint8_t matched = 0;
@@ -561,6 +1024,9 @@ class HardSwishSubgraph : public Subgraph
         }
         return false;
     }
+
+private:
+    int hardSigmoidId;
 };
 
 class CeluSubgraph : public Subgraph
@@ -569,9 +1035,9 @@ class CeluSubgraph : public Subgraph
     CeluSubgraph() : alpha(1.f)
     {
         int input = addNodeToMatch("");
-        int div = addNodeToMatch("Div", input, addNodeToMatch(""));
-        int elu = addNodeToMatch("Elu", div);
-        addNodeToMatch("Mul", addNodeToMatch(""), elu);
+        div = addNodeToMatch("Div", input, addNodeToMatch(""));
+        elu = addNodeToMatch("Elu", div);
+        mul = addNodeToMatch("Mul", addNodeToMatch(""), elu);
         setFusedNode("Celu", input);
     }
 
@@ -587,16 +1053,15 @@ class CeluSubgraph : public Subgraph
     }
 
     virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
-                       std::vector<int>& matchedNodesIds,
-                       std::vector<int>& targetNodesIds) CV_OVERRIDE
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE
     {
-        if (Subgraph::match(net, nodeId, matchedNodesIds, targetNodesIds))
+        if (Subgraph::match(net, nodeId, matchedNodesIds))
         {
-            float alpha_div = extractAlpha(net, matchedNodesIds[0], 1);
-            float alpha_mul = extractAlpha(net, matchedNodesIds[2], 0);
+            float alpha_div = extractAlpha(net, matchedNodesIds[div], 1);
+            float alpha_mul = extractAlpha(net, matchedNodesIds[mul], 0);
             float alpha_elu = 1.f;
 
-            Ptr<ImportNodeWrapper> elu_ptr = net->getNode(matchedNodesIds[1]);
+            Ptr<ImportNodeWrapper> elu_ptr = net->getNode(matchedNodesIds[elu]);
             opencv_onnx::NodeProto* elu_node = elu_ptr.dynamicCast<ONNXNodeWrapper>()->node;
 
             for (int i = 0; i < elu_node->attribute_size(); i++)
@@ -625,18 +1090,18 @@ class CeluSubgraph : public Subgraph
 
 protected:
     float alpha;
+    int div, mul, elu;
 };
 
 class NormalizeSubgraphBase : public Subgraph
 {
 public:
-    NormalizeSubgraphBase(int _normNodeOrder = 0) : axis(1), normNodeOrder(_normNodeOrder) {}
+    NormalizeSubgraphBase(int _normNodeOrder = 1) : axis(1), normNodeOrder(_normNodeOrder) {}
 
     virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
-                       std::vector<int>& matchedNodesIds,
-                       std::vector<int>& targetNodesIds) CV_OVERRIDE
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE
     {
-        if (Subgraph::match(net, nodeId, matchedNodesIds, targetNodesIds))
+        if (Subgraph::match(net, nodeId, matchedNodesIds))
         {
             Ptr<ImportNodeWrapper> norm = net->getNode(matchedNodesIds[normNodeOrder]);
             opencv_onnx::NodeProto* node = norm.dynamicCast<ONNXNodeWrapper>()->node;
@@ -725,7 +1190,7 @@ class NormalizeSubgraph2_2 : public NormalizeSubgraphBase
 class NormalizeSubgraph3 : public NormalizeSubgraphBase
 {
 public:
-    NormalizeSubgraph3() : NormalizeSubgraphBase(1)
+    NormalizeSubgraph3() : NormalizeSubgraphBase(3)
     {
         int input = addNodeToMatch("");
         int power = addNodeToMatch("Constant");
@@ -743,7 +1208,7 @@ class NormalizeSubgraph3 : public NormalizeSubgraphBase
 class NormalizeSubgraph4 : public NormalizeSubgraphBase
 {
 public:
-    NormalizeSubgraph4() : NormalizeSubgraphBase(1)
+    NormalizeSubgraph4() : NormalizeSubgraphBase(2)
     {
         int input = addNodeToMatch("");
         int mul = addNodeToMatch("Mul", input, input);
@@ -760,7 +1225,7 @@ class NormalizeSubgraph4 : public NormalizeSubgraphBase
 class NormalizeSubgraph5 : public NormalizeSubgraphBase
 {
 public:
-    NormalizeSubgraph5() : NormalizeSubgraphBase(1)
+    NormalizeSubgraph5() : NormalizeSubgraphBase(2)
     {
         int input = addNodeToMatch("");
         int mul = addNodeToMatch("Mul", input, input);
@@ -781,25 +1246,24 @@ class GatherCastSubgraph : public Subgraph
     {
         int input = addNodeToMatch("");
         int index = addNodeToMatch("Constant");
-        int gather = addNodeToMatch("Gather", input, index);
-        addNodeToMatch("Cast", gather);
+        gather = addNodeToMatch("Gather", input, index);
+        cast = addNodeToMatch("Cast", gather);
         setFusedNode("Gather", input, index);
     }
 
     virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
-                       std::vector<int>& matchedNodesIds,
-                       std::vector<int>& targetNodesIds) CV_OVERRIDE
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE
     {
-        bool retVal = Subgraph::match(net, nodeId, matchedNodesIds, targetNodesIds);
+        bool retVal = Subgraph::match(net, nodeId, matchedNodesIds);
         size_t matchedNodesNum = matchedNodesIds.size();
         // Now we check if merging can be made for these Gather and Cast nodes
         if (!retVal || matchedNodesNum < 2)
             return retVal;
         else {
-            int nodeToMatch = matchedNodesIds[matchedNodesNum - 1];
+            int nodeToMatch = matchedNodesIds[cast];
             const Ptr<ImportNodeWrapper> node = net->getNode(nodeToMatch);
             if (node->getType() == "Cast") {
-                int inpNodeId = matchedNodesIds[matchedNodesNum - 2];
+                int inpNodeId = matchedNodesIds[gather];
                 const Ptr<ImportNodeWrapper> inpNode = net->getNode(inpNodeId);
                 if (inpNode->getType() == "Gather") {
                     int numNodes = net->getNumNodes();
@@ -819,8 +1283,21 @@ class GatherCastSubgraph : public Subgraph
         }
         return retVal;
     }
+
+private:
+    int cast, gather;
 };
 
+/*  Constant folding shape for Expand.
+
+    Before fusion:
+             +--------------------------------------------------------------+ (X)
+             |                                                              |
+    ConstantOfShape[input=[4]] -> Mul[B=-1] -> Equal[A=[2, -1, -1, -1]] -> Where[Y=[2, -1, -1, -1]] -> Expand
+             \                                                           \
+             value=[1]                                                   (condition)
+
+*/
 class ExpandSubgraph : public Subgraph
 {
 public:
@@ -828,15 +1305,115 @@ class ExpandSubgraph : public Subgraph
     {
         int input = addNodeToMatch("");
         int values = addNodeToMatch("");
-        int init = addNodeToMatch("ConstantOfShape", values);
+        init = addNodeToMatch("ConstantOfShape", values);
         int coeff = addNodeToMatch("Constant");
-        int mul = addNodeToMatch("Mul", init, coeff);
+        mul = addNodeToMatch("Mul", init, coeff);
         int shape = addNodeToMatch("Constant");
-        int condition = addNodeToMatch("Equal", shape, mul);
-        int where = addNodeToMatch("Where", condition, init, addNodeToMatch("Constant"));
+        condition = addNodeToMatch("Equal", shape, mul);
+        where = addNodeToMatch("Where", condition, init, addNodeToMatch("Constant"));
         addNodeToMatch("Expand", input, where);
         setFusedNode("Expand", input, shape);
     }
+
+    static int extractValue(const Ptr<ImportGraphWrapper>& net, int node_id, int64_t &val) {
+        Ptr<ImportNodeWrapper> node_wrapper = net->getNode(node_id);
+        opencv_onnx::NodeProto* node = node_wrapper.dynamicCast<ONNXNodeWrapper>()->node;
+
+        if (node->attribute_size() == 0) {
+            val = 0;
+            return 1;
+        } else if (node->attribute_size() == 1) {
+            opencv_onnx::AttributeProto attr = node->attribute(0);
+            if (attr.name() != "value") {
+                return 0;
+            }
+            Mat mat_value = getMatFromTensor(attr.t());
+            switch (mat_value.type()) {
+                case CV_32S: {
+                    val = static_cast<int64_t>(mat_value.at<int>());
+                } break;
+                default: return 0;
+            }
+            return 1;
+        }
+        return 0;
+    }
+
+    virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE {
+        if (Subgraph::match(net, nodeId, matchedNodesIds)) {
+            int64_t value_ConstantOfShape;
+            if (!extractValue(net, matchedNodesIds[init], value_ConstantOfShape)) {
+                return false;
+            }
+            std::vector<int> input_ConstantOfShape = extractConstant(net, matchedNodesIds[init], 0);
+            if (input_ConstantOfShape.size() != static_cast<size_t>(1)) {
+                return false;
+            }
+            std::vector<int> B_Mul = extractConstant(net, matchedNodesIds[mul], 1);
+            if (B_Mul.size() != static_cast<size_t>(1)) {
+                return false;
+            }
+
+            std::vector<int> A_Equal = extractConstant(net, matchedNodesIds[condition], 0);
+            if (A_Equal.size() != static_cast<size_t>(input_ConstantOfShape[0])) {
+                return false;
+            }
+
+            std::vector<int> Y_Where = extractConstant(net, matchedNodesIds[where], 2);
+            if (Y_Where.size() != A_Equal.size()) {
+                return false;
+            }
+
+            // run ConstantOfShape
+            std::vector<int64_t> output_ConstantOfShape(std::accumulate(input_ConstantOfShape.begin(), input_ConstantOfShape.end(), static_cast<int64_t>(1), std::multiplies<int64_t>()), value_ConstantOfShape);
+            // run Mul
+            std::vector<int64_t> output_Mul = output_ConstantOfShape;
+            for (size_t i = 0; i < output_Mul.size(); i++) {
+                int64_t b = B_Mul[0];
+                output_Mul[i] *= b;
+            }
+            // run Equal
+            std::vector<bool> output_Equal(output_Mul.size());
+            for (int i = 0; i < output_Equal.size(); i++) {
+                if (A_Equal[i] == output_Mul[i]) {
+                    output_Equal[i] = true;
+                } else {
+                    output_Equal[i] = false;
+                }
+            }
+            // run Where
+            std::vector<int64_t> output_Where(output_Equal.size());
+            for (int i = 0; i < output_Where.size(); i++) {
+                if (output_Equal[i]) {
+                    output_Where[i] = output_ConstantOfShape[i];
+                } else {
+                    output_Where[i] = Y_Where[i];
+                }
+            }
+            shape = output_Where;
+
+            return true;
+        }
+        return false;
+    }
+
+    virtual void finalize(const Ptr<ImportGraphWrapper>& graph,
+                          const Ptr<ImportNodeWrapper>& fusedNode,
+                          std::vector<Ptr<ImportNodeWrapper> >& inputs) CV_OVERRIDE {
+        // replace values
+        opencv_onnx::NodeProto* node_shape = inputs[1].dynamicCast<ONNXNodeWrapper>()->node;
+        auto attr = node_shape->mutable_attribute()->Mutable(0);
+        auto tensor = attr->mutable_t();
+        tensor->clear_raw_data();
+        tensor->set_raw_data(std::string((const char*)(shape.data()), shape.size() * sizeof(int64_t)));
+    }
+
+protected:
+    std::vector<int64_t> shape;
+
+private:
+    int init, mul, condition, where;
 };
 
 class MishSubgraph : public Subgraph
@@ -847,7 +1424,7 @@ class MishSubgraph : public Subgraph
         int input = addNodeToMatch("");
         int softplus = addNodeToMatch("Softplus", input);
         int tanh = addNodeToMatch("Tanh", softplus);
-        addNodeToMatch("Mul", input, tanh);
+        addNodeToMatch("Mul", tanh, input);
         setFusedNode("Mish", input);
     }
 };
@@ -867,20 +1444,6 @@ class SoftplusSubgraph: public Subgraph
     }
 };
 
-class SoftplusSubgraph2: public Subgraph
-{
-public:
-    SoftplusSubgraph2()
-    {
-        int input = addNodeToMatch("");
-        int exp = addNodeToMatch("Exp", input);
-        int addVal = addNodeToMatch("");
-        int add = addNodeToMatch("Add", exp, addVal);
-        addNodeToMatch("Log", add);
-        setFusedNode("Softplus", input);
-    }
-};
-
 class MulCastSubgraph : public Subgraph
 {
 public:
@@ -978,7 +1541,11 @@ class ResizeSubgraph1 : public ExtractScalesSubgraph
     ResizeSubgraph1() : ExtractScalesSubgraph()
     {
         int shape = addNodeToMatch("Shape", input);
-        int slice = addNodeToMatch("Slice", shape, addNodeToMatch("Constant"), addNodeToMatch("Constant"), addNodeToMatch("Constant"));
+        int slice = addNodeToMatch("Slice", {shape,
+                                             addNodeToMatch(""),
+                                             addNodeToMatch(""),
+                                             addNodeToMatch(""),
+                                             addNodeToMatch("")});
 
         int castConcat = addNodeToMatch("Cast", concatId);
         int concat = addNodeToMatch("Concat", slice, castConcat);
@@ -1004,6 +1571,37 @@ class ResizeSubgraph2 : public ExtractScalesSubgraph
     }
 };
 
+class ResizeSubgraph3 : public Subgraph
+{
+public:
+    ResizeSubgraph3() : Subgraph()
+    {
+        int shapeSrc = addNodeToMatch("");
+        int input = addNodeToMatch("");
+
+        int shape_h = addNodeToMatch("Shape", shapeSrc);
+        int shape_w = addNodeToMatch("Shape", shapeSrc);
+        int gather_h = addNodeToMatch("Gather", shape_h, addNodeToMatch("Constant"));
+        int gather_w = addNodeToMatch("Gather", shape_w, addNodeToMatch("Constant"));
+        int unsqueeze_h = addNodeToMatch("Unsqueeze", gather_h);
+        int unsqueeze_w = addNodeToMatch("Unsqueeze", gather_w);
+        int concat1 = addNodeToMatch("Concat", unsqueeze_h, unsqueeze_w);
+        int cast = addNodeToMatch("Cast", concat1);
+
+        int shape2 = addNodeToMatch("Shape", input);
+        int slice = addNodeToMatch("Slice", {shape2,
+                                             addNodeToMatch(""),
+                                             addNodeToMatch(""),
+                                             addNodeToMatch(""),
+                                             addNodeToMatch("")});
+        int concat2 = addNodeToMatch("Concat", slice, cast);
+        addNodeToMatch("Resize", input, addNodeToMatch("Constant"), addNodeToMatch("Constant"), concat2);
+
+        setFusedNode("Upsample", input, shapeSrc);
+    }
+};
+
+
 class BatchNormalizationSubgraphBase : public Subgraph
 {
 public:
@@ -1067,6 +1665,9 @@ class BatchNormalizationSubgraph2 : public BatchNormalizationSubgraphBase
 void simplifySubgraphs(opencv_onnx::GraphProto& net)
 {
     std::vector<Ptr<Subgraph> > subgraphs;
+    subgraphs.push_back(makePtr<BiasedMatmulSubgraph>());
+    subgraphs.push_back(makePtr<AdjustSliceAllOptionalInputsSubgraph>(3));
+    subgraphs.push_back(makePtr<AdjustSliceAllOptionalInputsSubgraph>(4));
     subgraphs.push_back(makePtr<GeluSubGraph>());
     subgraphs.push_back(makePtr<GeluApproximationSubGraph>());
     subgraphs.push_back(makePtr<LayerNormSubGraph>());
@@ -1075,6 +1676,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
     subgraphs.push_back(makePtr<UpsampleSubgraph>());
     subgraphs.push_back(makePtr<ResizeSubgraph1>());
     subgraphs.push_back(makePtr<ResizeSubgraph2>());
+    subgraphs.push_back(makePtr<ResizeSubgraph3>());
     subgraphs.push_back(makePtr<SoftMaxSubgraph>());
     subgraphs.push_back(makePtr<SoftMaxSubgraph2>());
     subgraphs.push_back(makePtr<LogSoftMaxSubgraph>());
@@ -1088,10 +1690,13 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
     subgraphs.push_back(makePtr<BatchNormalizationSubgraph2>());
     subgraphs.push_back(makePtr<ExpandSubgraph>());
     subgraphs.push_back(makePtr<SoftplusSubgraph>());
-    subgraphs.push_back(makePtr<SoftplusSubgraph2>());
     subgraphs.push_back(makePtr<MishSubgraph>());
     subgraphs.push_back(makePtr<NormalizeSubgraph4>());
     subgraphs.push_back(makePtr<NormalizeSubgraph5>());
+    if (getParam_DNN_BACKEND_DEFAULT() == DNN_BACKEND_OPENCV) {
+        subgraphs.push_back(makePtr<AttentionSubGraph>());
+        subgraphs.push_back(makePtr<AttentionSingleHeadSubGraph>());
+    }
 
     simplifySubgraphs(Ptr<ImportGraphWrapper>(new ONNXGraphWrapper(net)), subgraphs);
 }
@@ -1137,12 +1742,12 @@ Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto)
 #endif
             const ::google::protobuf::RepeatedField<int32_t> field = tensor_proto.int32_data();
 
-            AutoBuffer<float16_t, 16> aligned_val;
+            AutoBuffer<hfloat, 16> aligned_val;
             size_t sz = tensor_proto.int32_data().size();
             aligned_val.allocate(sz);
-            float16_t* bufPtr = aligned_val.data();
+            hfloat* bufPtr = aligned_val.data();
 
-            float16_t *fp16Ptr = (float16_t *)field.data();
+            hfloat *fp16Ptr = (hfloat *)field.data();
             for (int i = 0; i < sz; i++)
             {
                 bufPtr[i] = fp16Ptr[i*2 + offset];
@@ -1154,11 +1759,11 @@ Mat getMatFromTensor(const opencv_onnx::TensorProto& tensor_proto)
             char* val = const_cast<char*>(tensor_proto.raw_data().c_str());
 #if CV_STRONG_ALIGNMENT
             // Aligned pointer is required.
-            AutoBuffer<float16_t, 16> aligned_val;
-            if (!isAligned<sizeof(float16_t)>(val))
+            AutoBuffer<hfloat, 16> aligned_val;
+            if (!isAligned<sizeof(hfloat)>(val))
             {
                 size_t sz = tensor_proto.raw_data().size();
-                aligned_val.allocate(divUp(sz, sizeof(float16_t)));
+                aligned_val.allocate(divUp(sz, sizeof(hfloat)));
                 memcpy(aligned_val.data(), val, sz);
                 val = (char*)aligned_val.data();
             }
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 5cd22057ad77..7b63e39a3abd 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -22,6 +22,7 @@
 
 #ifdef HAVE_PROTOBUF
 
+#include <array>
 #include <iostream>
 #include <fstream>
 #include <string>
@@ -90,11 +91,10 @@ class ONNXImporter
 
     void addConstant(const std::string& name, const Mat& blob);
     void addLayer(LayerParams& layerParams,
-                  const opencv_onnx::NodeProto& node_proto);
+                  const opencv_onnx::NodeProto& node_proto,
+                  int num_inputs = std::numeric_limits<int>::max());
     void setParamsDtype(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
 
-    void expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto,
-                   const std::string& input, size_t n);
     void lstm_extractConsts(LayerParams& layerParams, const opencv_onnx::NodeProto& lstm_proto, size_t idx, int* blobShape_, int size);
     void lstm_add_reshape(const std::string& input_name, const std::string& output_name, int* layerShape, size_t n);
     std::string lstm_add_slice(int index, const std::string& input_name, int* begin, int* end, size_t n);
@@ -113,7 +113,7 @@ class ONNXImporter
     std::unique_ptr<ONNXLayerHandler> layerHandler;
     Net& dstNet;
 
-    opencv_onnx::GraphProto graph_proto;
+    opencv_onnx::GraphProto* graph_proto;
     std::string framework_name;
 
     std::map<std::string, Mat> constBlobs;
@@ -181,6 +181,7 @@ class ONNXImporter
     void parseCast                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseConstantFill         (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseGather               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseGatherElements       (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseConcat               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseResize               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseUpsample             (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
@@ -194,6 +195,7 @@ class ONNXImporter
     void parseTile                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseLayerNorm            (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseSimpleLayers         (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseEinsum               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
 
     // Domain: com.microsoft
     // URL: https://github.com/microsoft/onnxruntime/blob/master/docs/ContribOperators.md
@@ -207,6 +209,7 @@ class ONNXImporter
     void parseQConcat              (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseQGemm                (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
     void parseQSoftmax             (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseAttention            (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
 
     // '???' domain or '???' layer type
     void parseCustomLayer          (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
@@ -384,7 +387,7 @@ void runLayer(LayerParams& params, const std::vector<Mat>& inputs,
     {
         inpShapes[i] = shape(inputs[i]);
         if (i > 0 && ddepth != inputs[i].depth())
-            CV_Error(Error::StsNotImplemented, "Mixed input data types.");
+            CV_Error(Error::StsNotImplemented, cv::format("Mixed input data types. Required type: %d, actual type: %d", ddepth, inputs[i].depth()));
 
         // Quantize and Dequantize layer have different output type than input.
         if (params.type != "Quantize" && params.type != "Dequantize")
@@ -615,7 +618,8 @@ ONNXImporter::TensorInfo ONNXImporter::getBlobExtraInfo(const std::string& input
 }
 
 void ONNXImporter::addLayer(LayerParams& layerParams,
-                            const opencv_onnx::NodeProto& node_proto)
+                            const opencv_onnx::NodeProto& node_proto,
+                            int num_inputs)
 {
     int depth = layerParams.get<int>("depth", CV_32F);
     int id = dstNet.addLayer(layerParams.name, layerParams.type, depth, layerParams);
@@ -630,7 +634,8 @@ void ONNXImporter::addLayer(LayerParams& layerParams,
 
     std::vector<MatShape> layerInpShapes, layerOutShapes, layerInternalShapes;
     int inpNum = 0;
-    for (int j = 0; j < node_proto.input_size(); j++)
+    num_inputs = std::min(node_proto.input_size(), num_inputs);
+    for (int j = 0; j < num_inputs; j++)
     {
         const std::string& input_name = node_proto.input(j);
         IterLayerId_t layerId = layer_id.find(input_name);
@@ -656,37 +661,6 @@ void ONNXImporter::addLayer(LayerParams& layerParams,
     }
 }
 
-/** @brief Make N copies of input layer and set them as input to node_proto.
- * @param prefix prefix of new layers' names
- * @param node_proto node which will contain all copies as inputs
- * @param input name of the node to copy
- * @param n number of copies
- */
-void ONNXImporter::expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto,
-                             const std::string& input, size_t n)
-{
-    std::vector<std::string> input_names;
-    input_names.reserve(n);
-    for (size_t j = 0; j < n; j++)
-    {
-        LayerParams copyLP;
-        copyLP.name = format("%s/copy_%zu", prefix.c_str(), j);
-        copyLP.type = "Identity";
-        CV_Assert((layer_id.find(copyLP.name) == layer_id.end()) &&
-            "Couldn't copy the node: generated name already exists in the graph.");
-        input_names.push_back(copyLP.name);
-
-        node_proto.set_input(0, input);
-        node_proto.set_output(0, copyLP.name);
-        addLayer(copyLP, node_proto);
-    }
-    node_proto.clear_input();
-    for (size_t i = 0; i < input_names.size(); i++)
-    {
-        node_proto.add_input(input_names[i]);
-    }
-}
-
 void ONNXImporter::addConstant(const std::string& name, const Mat& blob)
 {
     CV_LOG_DEBUG(NULL, "DNN/ONNX: add constant '" << name << "' shape=" << toString(shape(blob)) << ": " << toString(blob));
@@ -816,7 +790,7 @@ void ONNXImporter::setParamsDtype(LayerParams& layerParams, const opencv_onnx::N
 void ONNXImporter::populateNet()
 {
     CV_Assert(model_proto.has_graph());
-    graph_proto = model_proto.graph();
+    graph_proto = model_proto.mutable_graph();
 
     std::string framework_version;
     if (model_proto.has_producer_name())
@@ -828,25 +802,25 @@ void ONNXImporter::populateNet()
             << (model_proto.has_ir_version() ? cv::format(" v%d", (int)model_proto.ir_version()) : cv::String())
             << " model produced by '" << framework_name << "'"
             << (framework_version.empty() ? cv::String() : cv::format(":%s", framework_version.c_str()))
-            << ". Number of nodes = " << graph_proto.node_size()
-            << ", initializers = " << graph_proto.initializer_size()
-            << ", inputs = " << graph_proto.input_size()
-            << ", outputs = " << graph_proto.output_size()
+            << ". Number of nodes = " << graph_proto->node_size()
+            << ", initializers = " << graph_proto->initializer_size()
+            << ", inputs = " << graph_proto->input_size()
+            << ", outputs = " << graph_proto->output_size()
             );
 
     parseOperatorSet();
 
-    simplifySubgraphs(graph_proto);
+    simplifySubgraphs(*graph_proto);
 
-    const int layersSize = graph_proto.node_size();
+    const int layersSize = graph_proto->node_size();
     CV_LOG_DEBUG(NULL, "DNN/ONNX: graph simplified to " << layersSize << " nodes");
 
-    constBlobs = getGraphTensors(graph_proto);  // scan GraphProto.initializer
+    constBlobs = getGraphTensors(*graph_proto);  // scan GraphProto.initializer
     std::vector<String> netInputs;  // map with network inputs (without const blobs)
     // Add all the inputs shapes. It includes as constant blobs as network's inputs shapes.
-    for (int i = 0; i < graph_proto.input_size(); ++i)
+    for (int i = 0; i < graph_proto->input_size(); ++i)
     {
-        const opencv_onnx::ValueInfoProto& valueInfoProto = graph_proto.input(i);
+        const opencv_onnx::ValueInfoProto& valueInfoProto = graph_proto->input(i);
         CV_Assert(valueInfoProto.has_name());
         const std::string& name = valueInfoProto.name();
         CV_Assert(valueInfoProto.has_type());
@@ -902,26 +876,26 @@ void ONNXImporter::populateNet()
     }
 
     // dump outputs
-    for (int i = 0; i < graph_proto.output_size(); ++i)
+    for (int i = 0; i < graph_proto->output_size(); ++i)
     {
-        dumpValueInfoProto(i, graph_proto.output(i), "output");
+        dumpValueInfoProto(i, graph_proto->output(i), "output");
     }
 
     if (DNN_DIAGNOSTICS_RUN) {
         CV_LOG_INFO(NULL, "DNN/ONNX: start diagnostic run!");
-        layerHandler->fillRegistry(graph_proto);
+        layerHandler->fillRegistry(*graph_proto);
     }
 
     for(int li = 0; li < layersSize; li++)
     {
-        const opencv_onnx::NodeProto& node_proto = graph_proto.node(li);
+        const opencv_onnx::NodeProto& node_proto = graph_proto->node(li);
         handleNode(node_proto);
     }
 
     // register outputs
-    for (int i = 0; i < graph_proto.output_size(); ++i)
+    for (int i = 0; i < graph_proto->output_size(); ++i)
     {
-        const std::string& output_name = graph_proto.output(i).name();
+        const std::string& output_name = graph_proto->output(i).name();
         if (output_name.empty())
         {
             CV_LOG_ERROR(NULL, "DNN/ONNX: can't register output without name: " << i);
@@ -1228,7 +1202,12 @@ void ONNXImporter::parseReduce(LayerParams& layerParams, const opencv_onnx::Node
 
 void ONNXImporter::parseSlice(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
 {
-    MatShape inpShape = outShapes[node_proto.input(0)];
+    MatShape inpShape;
+    if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
+        inpShape = shape(getBlob(node_proto, 0));
+    else {
+        inpShape = outShapes[node_proto.input(0)];
+    }
     int dims = inpShape.size();
     std::vector<int> begin(dims, 0);
     std::vector<int> end(dims, INT_MAX);
@@ -1266,7 +1245,7 @@ void ONNXImporter::parseSlice(LayerParams& layerParams, const opencv_onnx::NodeP
         starts_ = DictValue::arrayInt(start_blob.begin<int>(), start_blob.total());
         ends_ = DictValue::arrayInt(end_blob.begin<int>(), end_blob.total());
 
-        if (inp_size > 3)
+        if (inp_size > 3 && !getBlob(node_proto, 3).empty())
         {
             Mat axes_blob = getBlob(node_proto, 3);
             CV_Assert(axes_blob.total() == start_blob.total());
@@ -1275,7 +1254,7 @@ void ONNXImporter::parseSlice(LayerParams& layerParams, const opencv_onnx::NodeP
             has_axes = true;
         }
 
-        if (inp_size == 5)
+        if (inp_size == 5 && !getBlob(node_proto, 4).empty())
         {
             Mat step_blob = getBlob(node_proto, 4);
             CV_Assert(step_blob.total() == start_blob.total());
@@ -1385,13 +1364,19 @@ void ONNXImporter::parseSplit(LayerParams& layerParams, const opencv_onnx::NodeP
         CV_Assert(constBlobs.find(node_proto.input(1)) != constBlobs.end());
         Mat splitsBlob = getBlob(node_proto, 1);
         int splitSize = splitsBlob.total();
-
-        std::vector<int> slicePoints(splitSize - 1, splitsBlob.at<int>(0));
-        for (int i = 1; i < splitSize - 1; ++i)
+        if (splitSize == 1)
+        {
+            layerParams.set("num_split", 1);
+        }
+        else
         {
-            slicePoints[i] = slicePoints[i - 1] + splitsBlob.at<int>(i);
+            std::vector<int> slicePoints(splitSize - 1, splitsBlob.at<int>(0));
+            for (int i = 1; i < splitSize - 1; ++i)
+            {
+                slicePoints[i] = slicePoints[i - 1] + splitsBlob.at<int>(i);
+            }
+            layerParams.set("slice_point", DictValue::arrayInt(&slicePoints[0], slicePoints.size()));
         }
-        layerParams.set("slice_point", DictValue::arrayInt(&slicePoints[0], slicePoints.size()));
     }
     else
     {
@@ -1524,7 +1509,7 @@ void ONNXImporter::lstm_extractConsts(LayerParams& layerParams, const opencv_onn
             blob = Mat(blobShape, CV_32FC1, 0.);
         }
         layerParams.blobs.push_back(blob);
-};
+}
 
 void ONNXImporter::lstm_add_reshape(const std::string& input_name, const std::string& output_name, int* layerShape, size_t n)
 {
@@ -1539,7 +1524,7 @@ void ONNXImporter::lstm_add_reshape(const std::string& input_name, const std::st
     reshape_proto.add_input(input_name);
     reshape_proto.add_output(output_name);
     addLayer(reshapeLp, reshape_proto);
-};
+}
 
 std::string ONNXImporter::lstm_add_slice(int index, const std::string& input_name, int* begin, int* end, size_t n)
 {
@@ -1558,7 +1543,7 @@ std::string ONNXImporter::lstm_add_slice(int index, const std::string& input_nam
     addLayer(sliceLP, slice_proto);
 
     return slice_proto.output(0);
-};
+}
 
 std::string ONNXImporter::lstm_fix_dims(LayerParams& layerParams, const opencv_onnx::NodeProto& lstm_proto,
                                         int batch_size, int num_directions, int hidden_size, bool need_y, const std::string& y_name,
@@ -1586,7 +1571,7 @@ std::string ONNXImporter::lstm_fix_dims(LayerParams& layerParams, const opencv_o
     addLayer(permuteLP, permute_proto);
 
     return permute_proto.output(0);
-};
+}
 
 void ONNXImporter::lstm_add_transform(int num_directions, int batch_size, int hidden_size,
                                       int index, const std::string& input_name, const std::string& output_name)
@@ -1628,7 +1613,7 @@ void ONNXImporter::lstm_add_transform(int num_directions, int batch_size, int hi
         int layerShape[] = {2, batch_size, hidden_size};
         lstm_add_reshape(concat_proto.output(0), output_name, layerShape, sizeof(layerShape) / sizeof(layerShape[0]));
     }
-};
+}
 
 void ONNXImporter::parseLSTM(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
 {
@@ -1822,7 +1807,7 @@ void ONNXImporter::parseClip(LayerParams& layerParams, const opencv_onnx::NodePr
 
     layerParams.set("min_value", layerParams.get<float>("min", min_value));
     layerParams.set("max_value", layerParams.get<float>("max", max_value));
-    addLayer(layerParams, node_proto);
+    addLayer(layerParams, node_proto, 1);
 }
 
 void ONNXImporter::parseLeakyRelu(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
@@ -1869,44 +1854,43 @@ void ONNXImporter::parseLRN(LayerParams& layerParams, const opencv_onnx::NodePro
     addLayer(layerParams, node_proto);
 }
 
-void ONNXImporter::parseInstanceNormalization(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
-{
-    opencv_onnx::NodeProto node_proto = node_proto_;
-    if (node_proto.input_size() != 3)
-        CV_Error(Error::StsNotImplemented,
-                 "Expected input, scale, bias");
-
-    layerParams.blobs.resize(4);
-    layerParams.blobs[2] = getBlob(node_proto, 1);  // weightData
-    layerParams.blobs[3] = getBlob(node_proto, 2);  // biasData
-    layerParams.set("has_bias", true);
-    layerParams.set("has_weight", true);
-
-    // Get number of channels in input
-    int size = layerParams.blobs[2].total();
-    layerParams.blobs[0] = Mat::zeros(size, 1, CV_32F); // mean
-    layerParams.blobs[1] = Mat::ones(size, 1, CV_32F); // std
-
-    LayerParams mvnParams;
-    mvnParams.name = layerParams.name + "/MVN";
-    mvnParams.type = "MVN";
-    mvnParams.set("eps", layerParams.get<float>("epsilon"));
-    layerParams.erase("epsilon");
-
-    //Create MVN layer
-    int id = dstNet.addLayer(mvnParams.name, mvnParams.type, mvnParams);
-    //Connect to input
-    IterLayerId_t layerId = layer_id.find(node_proto.input(0));
-    CV_Assert(layerId != layer_id.end());
-    dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0);
-    //Add shape
-    layer_id.insert(std::make_pair(mvnParams.name, LayerInfo(id, 0)));
-    outShapes[mvnParams.name] = outShapes[node_proto.input(0)];
-
-    //Replace Batch Norm's input to MVN
-    node_proto.set_input(0, mvnParams.name);
-    layerParams.type = "BatchNorm";
-    addLayer(layerParams, node_proto);
+void ONNXImporter::parseInstanceNormalization(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto) {
+    int num_inputs = node_proto.input_size();
+    CV_CheckEQ(num_inputs, 3, "DNN/ONNXImporter - InstanceNorm: three inputs are required");
+
+    bool found_input = constBlobs.find(node_proto.input(0)) != constBlobs.end();
+    bool found_scale = constBlobs.find(node_proto.input(1)) != constBlobs.end();
+    bool found_bias = constBlobs.find(node_proto.input(2)) != constBlobs.end();
+
+    if (found_input && found_scale && found_bias) {
+        std::vector<Mat> inputs, output;
+
+        Mat input = getBlob(node_proto, 0);
+        Mat scale = getBlob(node_proto, 1);
+        Mat bias = getBlob(node_proto, 2);
+        inputs.push_back(input);
+        inputs.push_back(scale);
+        inputs.push_back(bias);
+
+        runLayer(layerParams, inputs, output);
+        addConstant(node_proto.output(0), output[0]);
+    } else {
+        auto add_const_node = [&] (int i) {
+            LayerParams const_params;
+            const_params.name = node_proto.input(i);
+            const_params.type = "Const";
+            Mat blob = getBlob(node_proto, i);
+            const_params.blobs.push_back(blob);
+
+            opencv_onnx::NodeProto proto;
+            proto.add_output(const_params.name);
+            addLayer(const_params, proto);
+        };
+        if (found_input && layer_id.find(node_proto.input(0)) == layer_id.end()) { add_const_node(0); }
+        if (found_scale && layer_id.find(node_proto.input(1)) == layer_id.end()) { add_const_node(1); }
+        if (found_bias && layer_id.find(node_proto.input(2)) == layer_id.end()) { add_const_node(2); }
+        addLayer(layerParams, node_proto);
+    }
 }
 
 void ONNXImporter::parseBatchNormalization(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
@@ -1941,115 +1925,80 @@ void ONNXImporter::parseBatchNormalization(LayerParams& layerParams, const openc
     addLayer(layerParams, node_proto);
 }
 
-// A * B + C = Y, we require that the dimension of A is [m, k], and the dimension of B is [n, k].
-// And the dim of output Y is [m, n]
-void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
 {
-    CV_Assert(node_proto.input_size() >= 2);
-    layerParams.type = "InnerProduct";
-    int transA = layerParams.get<int>("transA", 0);
-    layerParams.set("transA", transA == 1);
-
-    if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
-    {
-        Mat inputBuf = getBlob(node_proto, 0);
+    auto node_proto = node_proto_;
+    layerParams.type = "Gemm";
+    CV_CheckGE(node_proto.input_size(), 2, "DNN/ONNXImporter: Gemm requires at least two inputs");
+    CV_CheckLE(node_proto.input_size(), 3, "DNN/ONNXImporter: Gemm have at most three inputs.");
+
+    for (int i = 0; i < node_proto.input_size(); ++i) {
+        if (i == 2) {
+            layerParams.set("have_bias", true);
+        }
+        if (constBlobs.find(node_proto.input(i)) == constBlobs.end()) {
+            continue;
+        }
 
-        LayerParams constParams;
-        constParams.name = node_proto.input(0);
-        constParams.type = "Const";
-        constParams.blobs.push_back(inputBuf);
+        if (i == 2 && constBlobsExtraInfo.find(node_proto.input(2)) != constBlobsExtraInfo.end()) {
+            layerParams.set("real_ndims_C", getBlobExtraInfo(node_proto, 2).real_ndims);
+        }
 
-        opencv_onnx::NodeProto proto;
-        proto.add_output(constParams.name);
-        addLayer(constParams, proto);
-    }
+        Mat blob = getBlob(node_proto, i);
 
-    int transB = layerParams.get<int>("transB", 0);
-    if (constBlobs.find(node_proto.input(1)) != constBlobs.end())
-    {
-        Mat weights = getBlob(node_proto, 1);
+        if (i == 0) { // A, always as inputs without prepacking
+            LayerParams const_A_params;
+            const_A_params.name = layerParams.name + "/const_A";
+            const_A_params.type = "Const";
+            const_A_params.blobs.push_back(blob);
 
-        if (transA == 0) // optimized barnch, for now, we can only optimize the Gemm when transA = 0.
-        {
-            if (transB == 0)
-            {
-                transpose(weights, weights);
-            }
-            layerParams.set("transB", false);
-            layerParams.blobs.push_back(weights);
-            layerParams.set("num_output", layerParams.blobs[0].size[0]);
-        }
-        else // no optimized branch, TODO! optimize when the transA==1.
-        {
-            LayerParams constParams;
-            constParams.name = node_proto.input(1);
-            constParams.type = "Const";
-            constParams.blobs.push_back(weights);
+            opencv_onnx::NodeProto const_node_proto;
+            const_node_proto.add_output(const_A_params.name);
+            addLayer(const_A_params, const_node_proto);
+            node_proto.set_input(0, const_A_params.name);
+        } else { // B or C
+            std::string const_params_name = i == 1 ? "B" : "C";
 
-            opencv_onnx::NodeProto proto;
-            proto.add_output(constParams.name);
-            addLayer(constParams, proto);
-            layerParams.set("transB", transB == 1);
+            layerParams.blobs.push_back(blob);
+            layerParams.set(cv::format("const%s", const_params_name.c_str()), true);
         }
     }
-    else
-        layerParams.set("transB", transB == 1);
-
-    if (node_proto.input_size() == 3)
-    {
-        Mat bias = getBlob(node_proto, 2);
-        layerParams.blobs.push_back(bias);
-    }
 
-    layerParams.set("bias_term", node_proto.input_size() == 3);
-    layerParams.set("is_matmul", true);
     addLayer(layerParams, node_proto);
 }
 
-void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
-{
-    opencv_onnx::NodeProto node_proto = node_proto_;
-    CV_Assert(node_proto.input_size() == 2);
-    layerParams.type = "InnerProduct";
-    layerParams.set("bias_term", false);
-    int firstInpDims, secondInpDims;
+void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_) {
+    auto node_proto = node_proto_;
+    CV_CheckGE(node_proto.input_size(), 2, "ONNXImporter/MatMul: two inputs required at least");
+    CV_CheckLE(node_proto.input_size(), 3, "ONNXImporter/MatMul: three inputs required at most");
 
-    if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
-    {
-        Mat blob = getBlob(node_proto, 0);
-        firstInpDims = blob.dims;
-        LayerParams constParams;
-        constParams.name = layerParams.name + "/const_0";
-        constParams.type = "Const";
-        constParams.blobs.push_back(blob);
+    for (int i = 0; i < node_proto.input_size(); i++) {
+        if (constBlobs.find(node_proto.input(i)) == constBlobs.end()) {
+            continue;
+        }
+
+        Mat blob = getBlob(node_proto, i);
 
-        opencv_onnx::NodeProto tmpProto;
-        tmpProto.add_output(constParams.name);
-        addLayer(constParams, tmpProto);
+        if (i == 0) {
+            LayerParams const_params;
+            const_params.name = node_proto.input(i);
+            const_params.type = "Const";
+            const_params.blobs.push_back(blob);
+
+            opencv_onnx::NodeProto const_node_proto;
+            const_node_proto.add_output(const_params.name);
+            addLayer(const_params, const_node_proto);
+
+            node_proto.set_input(i, const_params.name);
+        } else {
+            layerParams.blobs.push_back(blob);
+        }
 
-        node_proto.set_input(0, constParams.name);
+        if (i == 2 && constBlobsExtraInfo.find(node_proto.input(2)) != constBlobsExtraInfo.end()) {
+            layerParams.set("real_ndims_C", getBlobExtraInfo(node_proto, 2).real_ndims);
+        }
     }
-    else
-        firstInpDims = outShapes[node_proto.input(0)].size();
 
-    if (constBlobs.find(node_proto.input(1)) != constBlobs.end())
-    {
-        Mat blob = getBlob(node_proto, 1);
-        Mat transBlob;
-        secondInpDims = blob.dims;
-        // create order transposing last 2 dimensions
-        std::vector<int> order(secondInpDims);
-        std::iota(order.begin(), order.end(), 0);
-        std::swap(order[secondInpDims - 2], order[secondInpDims - 1]);
-        transposeND(blob, order, transBlob);
-        layerParams.blobs.push_back(transBlob);
-        int numOutput = layerParams.blobs[0].total(0, secondInpDims - 1);
-        layerParams.set("num_output", numOutput);
-        layerParams.set("is_matmul", true);
-    } else
-        secondInpDims = outShapes[node_proto.input(1)].size();
-
-    layerParams.set("axis", firstInpDims - 1);
     addLayer(layerParams, node_proto);
 }
 
@@ -2357,137 +2306,38 @@ void ONNXImporter::parseUnsqueeze(LayerParams& layerParams, const opencv_onnx::N
     addLayer(layerParams, node_proto);
 }
 
-void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
 {
-    opencv_onnx::NodeProto node_proto = node_proto_;
-    CV_CheckEQ(node_proto.input_size(), 2, "");
-    const std::string& input0 = node_proto.input(0);
-    const std::string& input1 = node_proto.input(1);
-    const std::string output_name = node_proto.output(0);
-    Mat newShapeMat = getBlob(input1);
-    MatShape targetShape(newShapeMat.ptr<int>(), newShapeMat.ptr<int>() + newShapeMat.total());
-
-    MatShape inpShape;
-    bool haveVariables = constBlobs.find(input0) == constBlobs.end();
-    if (haveVariables)
-    {
-        IterShape_t shapeIt = outShapes.find(input0);
-        CV_Assert(shapeIt != outShapes.end());
-        inpShape = shapeIt->second;
-    }
-    else
-    {
-        Mat blob = getBlob(input0);
-        if (constBlobsExtraInfo.find(node_proto.input(0)) != constBlobsExtraInfo.end() &&
-            getBlobExtraInfo(node_proto, 0).real_ndims == 1) {
-            inpShape = {(int)blob.total()};
-        } else {
-            inpShape = shape(blob);
-        }
-    }
-
-    String srcName = input0;
-    // Unsqueeze and repeat along new axis
-    if (targetShape.size() > inpShape.size())
-    {
-        inpShape.insert(inpShape.begin(), targetShape.size() - inpShape.size(), 1);
-        for (int i = 0; i < targetShape.size(); i++)
-        {
-            if (abs(targetShape[i]) == 1)
-                targetShape[i] = inpShape[i];
-        }
-        if (haveVariables)
-        {
-            LayerParams reshapeLp;
-            reshapeLp.name = layerParams.name + "/reshape";
-            reshapeLp.type = "Reshape";
-            CV_Assert(layer_id.find(reshapeLp.name) == layer_id.end());
-            reshapeLp.set("dim", DictValue::arrayInt(&inpShape[0], inpShape.size()));
-
-            opencv_onnx::NodeProto proto;
-            proto.add_input(node_proto.input(0));
-            proto.add_output(reshapeLp.name);
-            addLayer(reshapeLp, proto);
-            srcName = reshapeLp.name;
-        }
-    }
-    CV_CheckEQ(inpShape.size(), targetShape.size(), "Unsupported Expand op with different dims");
-
-    std::vector<int> broadcast_axes;
-    // shapes aren't right-aligned here because targetShape.size() == inpShape.size()
-    for (int i = 0; i < targetShape.size(); i++)
-    {
-        if (targetShape[i] != inpShape[i])
-        {
-            if (inpShape[i] == 1)
-            {
-                broadcast_axes.push_back(i);
-            }
-            else if (targetShape[i] != 1)
-            {
-                CV_Error(Error::StsError, format("Could not be broadcast by axis: %d", i));
+    CV_CheckEQ(node_proto.input_size(), 2, "DNN/ONNXImporter-Expand: two inputs are required");
+    // input shape must be constant and it is passed as param to the layer
+    CV_CheckTrue(constBlobs.find(node_proto.input(1)) != constBlobs.end(),
+                 "DNN/ONNXImporter-Expand: input shape must be constant");
+
+    Mat mat_input_shape = getBlob(node_proto, 1);
+    CV_CheckTypeEQ(mat_input_shape.depth(), CV_32S, "DNN/ONNXImporter-Expand: data type of input shape must be CV_32S");
+    for (int i = 0; i < mat_input_shape.total(); ++i) {
+        CV_Check(i, *(mat_input_shape.ptr<int>() + i) >= 0, "DNN/ONNXImporter-Expand: invalid shape dimension");
+    }
+    layerParams.set("shape", DictValue::arrayInt(mat_input_shape.ptr<int>(), mat_input_shape.total()));
+
+    if (constBlobs.find(node_proto.input(0)) != constBlobs.end()) {
+        bool const_input_1d = false;
+        if (constBlobsExtraInfo.find(node_proto.input(0)) != constBlobsExtraInfo.end()) {
+            if (getBlobExtraInfo(node_proto, 0).real_ndims == 1) {
+                const_input_1d = true;
             }
         }
-    }
-
-    if (!haveVariables)
-    {
-        if (broadcast_axes.empty())
-        {
-            addConstant(output_name, getBlob(node_proto, 0).reshape(1, targetShape));
-            return;
-        }
+        layerParams.set("const_input_1d", const_input_1d);
 
         Mat input = getBlob(node_proto, 0);
-        MatShape subTargetShape = inpShape;
-        for (auto broadcast_axis : broadcast_axes)
-        {
-            subTargetShape[broadcast_axis] = targetShape[broadcast_axis];
-            input = input.reshape(0, total(inpShape, 0, broadcast_axis));
-            Mat output = cv::repeat(input, 1, subTargetShape[broadcast_axis]);
-            input = output.reshape(0, subTargetShape);
-        }
-        addConstant(output_name, input);
+        std::vector<Mat> inputs, expanded;
+        inputs.push_back(input);
+        runLayer(layerParams, inputs, expanded);
+        CV_CheckEQ(expanded.size(), static_cast<size_t>(1), "DNN/Expand: only one output is expected when folding constant");
+        addConstant(node_proto.output(0), expanded[0]);
         return;
     }
 
-    if (broadcast_axes.size() == 2 &&
-        broadcast_axes[0] == broadcast_axes[1] - 1 && broadcast_axes[1] == inpShape.size() - 1)
-    {
-        LayerParams constParams;
-        constParams.name = layerParams.name + "/const";
-        CV_Assert(layer_id.find(constParams.name) == layer_id.end());
-        constParams.type = "Const";
-
-        Mat inp = Mat::ones(newShapeMat.total(), newShapeMat.ptr<int>(), CV_32F);
-        constParams.blobs.push_back(inp);
-
-        opencv_onnx::NodeProto proto;
-        proto.add_output(constParams.name);
-        addLayer(constParams, proto);
-
-        layerParams.type = "Scale";
-        layerParams.set("bias_term", false);
-        node_proto.set_input(0, constParams.name);
-        node_proto.set_input(1, srcName);
-    }
-    else if (broadcast_axes.size() == 1)
-    {
-        // FIXME: this will end up creating massive amount of Identity nodes for broadcasting,
-        //        for example, broadcast 1 to 256 needs 256 Identity nodes and 1 Concat node.
-        //        Possible improvement is to use "Scale".
-        expandMid(layerParams.name, node_proto, srcName, targetShape[broadcast_axes[0]]);
-
-        layerParams.set("axis", broadcast_axes[0]);
-        layerParams.type = "Concat";
-        node_proto.set_output(0, output_name);
-    }
-    else if (broadcast_axes.empty())
-    {
-        layerParams.type = "Identity";
-    }
-    else
-        CV_Error(Error::StsNotImplemented, "Unsupported Expand op");
     addLayer(layerParams, node_proto);
 }
 
@@ -2606,7 +2456,7 @@ void ONNXImporter::parseCast(LayerParams& layerParams, const opencv_onnx::NodePr
             case opencv_onnx::TensorProto_DataType_FLOAT:   type = CV_32F; break;
             case opencv_onnx::TensorProto_DataType_UINT8:   type = CV_8U; break;
             case opencv_onnx::TensorProto_DataType_UINT16:  type = CV_16U; break;
-            case opencv_onnx::TensorProto_DataType_FLOAT16: type = CV_16S; break;
+            case opencv_onnx::TensorProto_DataType_FLOAT16: type = CV_16F; break;
             case opencv_onnx::TensorProto_DataType_INT8:
             case opencv_onnx::TensorProto_DataType_INT16:
             case opencv_onnx::TensorProto_DataType_INT32:
@@ -2651,11 +2501,11 @@ void ONNXImporter::parseGather(LayerParams& layerParams, const opencv_onnx::Node
     CV_CheckEQ(node_proto.input_size(), 2, "");
 
     // TODO: get rid of the type conversions and 1-d/0-d special-casing when the time comes
-    if (layer_id.find(node_proto.input(1)) == layer_id.end())
+    if (constBlobs.find(node_proto.input(1)) != constBlobs.end())
     {
         int real_ndims = getBlobExtraInfo(node_proto.input(1)).real_ndims;
         layerParams.set("real_ndims", real_ndims);
-        if (layer_id.find(node_proto.input(0)) == layer_id.end())
+        if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
         {
             std::vector<Mat> inputs, output;
 
@@ -2701,6 +2551,53 @@ void ONNXImporter::parseGather(LayerParams& layerParams, const opencv_onnx::Node
     addLayer(layerParams, node_proto);
 }
 
+void ONNXImporter::parseGatherElements(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    CV_CheckEQ(node_proto.input_size(), 2, "GatherElements: two inputs are required");
+
+    size_t num_const = 0;
+    for (size_t i = 0; i < node_proto.input_size(); ++i){
+        if (constBlobs.find(node_proto.input(i)) != constBlobs.end())
+            ++num_const;
+    }
+
+    if (num_const == node_proto.input_size())
+    {
+        std::vector<Mat> inputs, output;
+        for (size_t i = 0; i < node_proto.input_size(); i++) {
+            Mat blob = getBlob(node_proto, i);
+            if (i == 1) { // indices, from int32/int64 to float32 for compatibility
+                blob.convertTo(blob, CV_32F);
+            }
+            inputs.push_back(blob);
+        }
+        runLayer(layerParams, inputs, output);
+        CV_Assert(output.size() == 1);
+        addConstant(node_proto.output(0), output[0]);
+        return;
+    } else if (num_const > 0) {
+        for (size_t i = 0; i < node_proto.input_size(); i++) {
+            if (constBlobs.find(node_proto.input(i)) != constBlobs.end()) {
+                Mat blob = getBlob(node_proto, i);
+                if (i == 1) { // indices, from int32/int64 to float32 for compatibility
+                    blob.convertTo(blob, CV_32F);
+                }
+
+                LayerParams constParams;
+                constParams.name = node_proto.input(i);
+                constParams.type = "Const";
+                constParams.blobs.push_back(blob);
+
+                opencv_onnx::NodeProto proto;
+                proto.add_output(constParams.name);
+                addLayer(constParams, proto);
+            }
+        }
+    }
+
+    addLayer(layerParams, node_proto);
+}
+
 void ONNXImporter::parseConcat(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
 {
     bool hasVariableInps = false;
@@ -2734,6 +2631,7 @@ void ONNXImporter::parseConcat(LayerParams& layerParams, const opencv_onnx::Node
 
         // Concat-1 has default value for axis is 1: https://github.com/onnx/onnx/blob/master/docs/Changelog.md#Concat-1
         int axis = layerParams.get<int>("axis", 1);
+        axis = normalize_axis(axis, inputShape.size());
         for (size_t i = 0; i < inputs.size(); ++i)
         {
             MatShape targetShape = inputShape;
@@ -2888,6 +2786,13 @@ void ONNXImporter::parseUpsample(LayerParams& layerParams, const opencv_onnx::No
 void ONNXImporter::parseSoftMax(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
 {
     const std::string& layer_type = node_proto.op_type();
+    int axis;
+    if (onnx_opset != 0 && onnx_opset <= 11) {
+        axis = layerParams.get<int>("axis", 1);
+    } else {
+        axis = layerParams.get<int>("axis", -1);
+    }
+    layerParams.set<int>("axis", axis);
     layerParams.type = "Softmax";
     layerParams.set("log_softmax", layer_type == "LogSoftmax");
     addLayer(layerParams, node_proto);
@@ -2940,6 +2845,11 @@ void ONNXImporter::parseElementWise(LayerParams& layerParams, const opencv_onnx:
 
     layerParams.type = "NaryEltwise";
     layerParams.set("operation", toLowerCase(node_proto.op_type()));
+    if (node_proto.op_type() == "Mod") {
+        if (layerParams.get<int>("fmod", 0)) {
+            layerParams.set("operation", "fmod");
+        };
+    }
 
     // element-wise layers that can have >=1 inputs but actually have one input
     if (node_proto.input_size() == 1 && (op_type == "max" || op_type == "min" || op_type == "mean" || op_type == "sum"))
@@ -3100,22 +3010,22 @@ void ONNXImporter::parseRange(LayerParams& layerParams, const opencv_onnx::NodeP
     CV_Assert(const_id.size() == 3);
 
     Mat startMat = getBlob(node_proto, 0);
-    CV_Assert(startMat.type() == CV_32SC1);
-    int start = startMat.at<int>(0);
+    startMat.convertTo(startMat, CV_32F);
+    float start = startMat.at<float>(0);
 
     Mat limitMat = getBlob(node_proto, 1);
-    CV_Assert(limitMat.type() == CV_32SC1);
-    int limit = limitMat.at<int>(0);
+    limitMat.convertTo(limitMat, CV_32F);
+    float limit = limitMat.at<float>(0);
 
     Mat deltaMat = getBlob(node_proto, 2);
-    CV_Assert(deltaMat.type() == CV_32SC1);
-    int delta = deltaMat.at<int>(0);
+    deltaMat.convertTo(deltaMat, CV_32F);
+    float delta = deltaMat.at<float>(0);
 
     int number_of_elements = std::max(int(std::ceil((limit - start) / delta)), 0);
-    Mat r(number_of_elements, 1, CV_32SC1);
+    Mat r(1, number_of_elements, CV_32FC1); // should be 1d tensor, but Mat doesn't support it
     for (int i = 0; i < number_of_elements; i++)
     {
-        r.at<int>(i) = start + (i * delta);
+        r.at<float>(i) = start + (i * delta);
     }
     addConstant(node_proto.output(0), r);
     constBlobsExtraInfo.insert(std::make_pair(node_proto.output(0), TensorInfo(1)));
@@ -3260,37 +3170,31 @@ void ONNXImporter::parseLayerNorm(LayerParams& layerParams, const opencv_onnx::N
     axis = (axis + inputDims) % inputDims;
     layerParams.set("axis", axis);
 
-    // check if bias existed
-    bool hasBias = false;
-    if (node_proto.input_size() > 2)
-        hasBias = true;
-    layerParams.set("hasBias", hasBias);
-
     // constants as constant inputs
     for (size_t i = 1; i < node_proto.input_size(); i++)
     {
-        if (layer_id.find(node_proto.input(i)) == layer_id.end())
-        {
+        if (constBlobs.find(node_proto.input(i)) != constBlobs.end()) {
             Mat blob = getBlob(node_proto, i);
-
-            LayerParams constParams;
-            constParams.name = node_proto.input(i);
-            constParams.type = "Const";
-            constParams.blobs.push_back(blob);
-
-            opencv_onnx::NodeProto proto;
-            proto.add_output(constParams.name);
-            addLayer(constParams, proto);
+            layerParams.blobs.push_back(blob);
         }
     }
 
     // Remove additional outputs (Mean, InvStdDev)
     if (node_proto.output_size() > 1)
     {
+        // remove from graph proto
+        for (size_t i = 1; i < node_proto.output_size(); i++) {
+            for (int j = graph_proto->output_size() - 1; j >= 0; j--) {
+                if (graph_proto->output(j).name() == node_proto.output(i)) {
+                    graph_proto->mutable_output()->DeleteSubrange(j, 1);
+                    break;
+                }
+            }
+        }
+        // remove from node proto
         auto outputName = node_proto.output(0);
         opencv_onnx::NodeProto node_proto_ = node_proto;
-        node_proto_.clear_output();
-        node_proto_.add_output(outputName);
+        node_proto_.mutable_output()->DeleteSubrange(1, node_proto_.output_size() - 1);
         addLayer(layerParams, node_proto_);
     }
     else
@@ -3327,6 +3231,65 @@ void ONNXImporter::parseSimpleLayers(LayerParams& layerParams, const opencv_onnx
     addLayer(layerParams, node_proto);
 }
 
+void ONNXImporter::parseEinsum(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    std::vector<MatShape> einsumInpShapes;
+    for (int j = 0; j < node_proto.input_size(); j++)
+    {
+        // create Const layer for constants and mark its shape
+        std::vector<int> input_shape;
+        if (layer_id.find(node_proto.input(j)) == layer_id.end()) {
+            Mat blob = getBlob(node_proto, j);
+
+            LayerParams const_params;
+            const_params.name = node_proto.input(j);
+            const_params.type = "Const";
+            const_params.blobs.push_back(blob);
+
+            opencv_onnx::NodeProto proto;
+            proto.add_output(const_params.name);
+            addLayer(const_params, proto);
+
+            input_shape.resize(blob.dims);
+            for (size_t i = 0; i < input_shape.size(); i++) {
+                input_shape[i] = blob.size[i];
+            }
+        }
+
+        // also try getting shape from inferred shapes
+        if (input_shape.empty()) {
+            const auto& inputLayerName = node_proto.input(j);
+            auto it = outShapes.find(inputLayerName);
+            if (it != outShapes.end()) {
+                input_shape = it->second;
+            }
+        }
+
+        if (input_shape.empty()) {
+            CV_Error(Error::StsAssert, format("ERROR input shape of %s not found", node_proto.input(j).c_str()));
+        } else {
+            einsumInpShapes.emplace_back(input_shape);
+        }
+    }
+
+    CV_CheckFalse(einsumInpShapes.empty(), "ERROR no inputs shapes");
+    for (int i = 0; i < einsumInpShapes.size(); i++) {
+        layerParams.set("inputShapes" + cv::format("%d", i), DictValue::arrayInt(einsumInpShapes[i].begin(), einsumInpShapes[i].size()));
+    }
+
+    // Check if of eqution is valid
+    std::string equation = layerParams.get<std::string>("equation");
+    CV_CheckFalse(equation.empty(), "Equation is empty");
+
+    // Save number of inputs. We need it in layer initialization
+    layerParams.set("inputSize", node_proto.input_size());
+
+    // Save number of outputs. We need it in layer initialization
+    layerParams.set("outputSize", node_proto.output_size());
+
+    addLayer(layerParams, node_proto);
+}
+
 void ONNXImporter::parseCustomLayer(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
 {
     const std::string& name = layerParams.name;
@@ -3596,7 +3559,7 @@ void ONNXImporter::parseQGemm(LayerParams& layerParams, const opencv_onnx::NodeP
     Mat bias;
     if (constBlobs.find(node_proto.input(6)) != constBlobs.end())
         bias = getBlob(node_proto, 6);
-    else
+    if (bias.empty())
         bias = Mat::zeros(1, outCn, CV_32S);
 
     Mat biasFused(1, outCn, CV_32S);
@@ -3724,9 +3687,9 @@ void ONNXImporter::parseQEltwise(LayerParams& layerParams, const opencv_onnx::No
                 layerParams.type = "ScaleInt8";
                 layerParams.set("bias_term", op == "sum");
                 int axis = 1;
-                for (int i = 0; i < graph_proto.initializer_size(); i++)
+                for (int i = 0; i < graph_proto->initializer_size(); i++)
                 {
-                    opencv_onnx::TensorProto tensor_proto = graph_proto.initializer(i);
+                    opencv_onnx::TensorProto tensor_proto = graph_proto->initializer(i);
                     if (tensor_proto.name() == node_proto.input(constId))
                     {
                         axis = inpShape.size() - tensor_proto.dims_size();
@@ -3968,6 +3931,23 @@ void ONNXImporter::parseQSoftmax(LayerParams& layerParams, const opencv_onnx::No
     addLayer(layerParams, node_proto);
 }
 
+void ONNXImporter::parseAttention(LayerParams& params, const opencv_onnx::NodeProto& node_proto) {
+    CV_CheckTrue(params.has("num_heads"), "ONNXImporter/parseAttention: num_heads is required but missing");
+    CV_CheckTrue(params.has("qkv_hidden_sizes"), "ONNXImporter/parseAttention: qkv_hidden_sizes is required but missing");
+
+    auto param_qkv_hidden_sizes = params.get("qkv_hidden_sizes");
+    CV_CheckEQ(param_qkv_hidden_sizes.size(), 3, "ONNXImporter/parseAttention: qkv_hidden_sizes is must and only have three elements");
+
+    for (size_t i = 1; i < node_proto.input_size(); i++) {
+        if (constBlobs.find(node_proto.input(i)) != constBlobs.end()) {
+            Mat blob = getBlob(node_proto, i);
+            params.blobs.push_back(blob);
+        }
+    }
+
+    addLayer(params, node_proto);
+}
+
 // Domain: ai.onnx (default)
 // URL: https://github.com/onnx/onnx/blob/master/docs/Operators.md
 void ONNXImporter::buildDispatchMap_ONNX_AI(int opset_version)
@@ -4015,28 +3995,31 @@ void ONNXImporter::buildDispatchMap_ONNX_AI(int opset_version)
     dispatch["Cast"] = &ONNXImporter::parseCast;
     dispatch["ConstantFill"] = dispatch["ConstantOfShape"] = &ONNXImporter::parseConstantFill;
     dispatch["Gather"] = &ONNXImporter::parseGather;
+    dispatch["GatherElements"] = &ONNXImporter::parseGatherElements;
     dispatch["Concat"] = &ONNXImporter::parseConcat;
     dispatch["Resize"] = &ONNXImporter::parseResize;
     dispatch["Upsample"] = &ONNXImporter::parseUpsample;
-    dispatch["SoftMax"] = dispatch["LogSoftmax"] = &ONNXImporter::parseSoftMax;
+    dispatch["SoftMax"] = dispatch["Softmax"] = dispatch["LogSoftmax"] = &ONNXImporter::parseSoftMax;
     dispatch["DetectionOutput"] = &ONNXImporter::parseDetectionOutput;
     dispatch["CumSum"] = &ONNXImporter::parseCumSum;
     dispatch["SpaceToDepth"] = dispatch["DepthToSpace"] = &ONNXImporter::parseDepthToSpace;
     dispatch["ScatterElements"] = dispatch["Scatter"] = dispatch["ScatterND"] = &ONNXImporter::parseScatter;
     dispatch["Tile"] = &ONNXImporter::parseTile;
     dispatch["LayerNormalization"] = &ONNXImporter::parseLayerNorm;
+    dispatch["GroupNormalization"] = &ONNXImporter::parseInstanceNormalization;
 
     dispatch["Equal"] = dispatch["Greater"] = dispatch["Less"] = dispatch["Pow"] = dispatch["Add"] =
             dispatch["Sub"] = dispatch["Mul"] = dispatch["Div"] = dispatch["GreaterOrEqual"] =
-            dispatch["LessOrEqual"] = &ONNXImporter::parseElementWise;
+            dispatch["LessOrEqual"] = dispatch["Mod"] = &ONNXImporter::parseElementWise;
 
     dispatch["Sum"] = dispatch["Min"] = dispatch["Max"] = &ONNXImporter::parseElementWise;
     dispatch["Where"] = &ONNXImporter::parseElementWise;
     dispatch["Range"] = &ONNXImporter::parseRange;
+    dispatch["Einsum"] = &ONNXImporter::parseEinsum;
 
     std::vector<std::string> simpleLayers{"Acos", "Acosh", "Asin", "Asinh", "Atan", "Atanh", "Ceil", "Celu", "Cos",
                                           "Cosh", "Dropout", "Erf", "Exp", "Floor", "HardSigmoid", "HardSwish",
-                                          "Identity", "Log", "Round", "Reciprocal", "Selu", "Sign", "Sigmoid", "Sin", "Sinh", "Softmax",
+                                          "Identity", "Log", "Round", "Reciprocal", "Selu", "Sign", "Sigmoid", "Sin", "Sinh",
                                           "Softplus", "Softsign", "Shrink", "Sqrt", "Tan", "ThresholdedRelu", "Gelu",
                                           "GeluApproximation"};
     for (const auto& name : simpleLayers)
@@ -4049,6 +4032,11 @@ void ONNXImporter::buildDispatchMap_ONNX_AI(int opset_version)
     dispatch["QLinearConv"] = &ONNXImporter::parseQConv;
     dispatch["QLinearMatMul"] = &ONNXImporter::parseQMatMul;
 
+    // com.microsft: This operator is added for compatibility via onnx graph simplifier.
+    //               Opset domain cannot be modified from onnx_graph_simplifier.cpp so this
+    //               operator cannot be parsed if only added in buildDispatchMap_COM_MICROSOFT
+    dispatch["Attention"] = &ONNXImporter::parseAttention;
+
     domain_dispatch_map[str_domain_ai_onnx] = dispatch;
 }
 
@@ -4066,6 +4054,7 @@ void ONNXImporter::buildDispatchMap_COM_MICROSOFT(int opset_version)
     dispatch["QLinearConcat"] = &ONNXImporter::parseQConcat;
     dispatch["QGemm"] = &ONNXImporter::parseQGemm;
     dispatch["QLinearSoftmax"] = &ONNXImporter::parseQSoftmax;
+    dispatch["Attention"] = &ONNXImporter::parseAttention;
 
     domain_dispatch_map["com.microsoft"] = dispatch;
 }
diff --git a/modules/dnn/src/op_cann.hpp b/modules/dnn/src/op_cann.hpp
index c60c311b7f31..1d15eab6a3ef 100644
--- a/modules/dnn/src/op_cann.hpp
+++ b/modules/dnn/src/op_cann.hpp
@@ -10,7 +10,11 @@
 #include "graph/graph.h" // ge::Graph; ge::Operator from operator.h
 #include "graph/ge_error_codes.h" // GRAPH_SUCCESS, ...
 
-#include "op_proto/built-in/inc/all_ops.h" // ge::Conv2D, ...
+#ifdef CANN_VERSION_BELOW_6_3_ALPHA002
+    #include "op_proto/built-in/inc/all_ops.h" // ge::Conv2D, ...
+#else
+    #include "built-in/op_proto/inc/all_ops.h" // ge::Conv2D, ...
+#endif
 #include "graph/tensor.h" // ge::Shape, ge::Tensor, ge::TensorDesc
 #include "graph/types.h" // DT_FLOAT, ... ; FORMAT_NCHW, ...
 
diff --git a/modules/dnn/src/op_halide.cpp b/modules/dnn/src/op_halide.cpp
index 653de36146cc..db1a72278e70 100644
--- a/modules/dnn/src/op_halide.cpp
+++ b/modules/dnn/src/op_halide.cpp
@@ -14,6 +14,7 @@
 #include "halide_scheduler.hpp"
 
 #include <HalideRuntimeOpenCL.h>
+#include <thread>
 #endif  // HAVE_HALIDE
 
 namespace cv {
diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp
index f9e3993d206b..04f1da7c7150 100644
--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@@ -10,7 +10,7 @@
 #include <opencv2/dnn/shape_utils.hpp>
 
 #ifdef HAVE_INF_ENGINE
-#include <ie_extension.h>
+#include <openvino/core/extension.hpp>
 #elif defined(ENABLE_PLUGINS)
 // using plugin API
 #include "backend.hpp"
@@ -39,60 +39,6 @@ cv::String setInferenceEngineBackendType(const cv::String& newBackendType)
 
 CV__DNN_INLINE_NS_END
 
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
-namespace InferenceEngine {
-
-CNNNetwork::CNNNetwork() {}
-
-CNNNetwork::CNNNetwork(std::shared_ptr<ov::Model> model) : model(model) {}
-
-std::shared_ptr<ov::Model> CNNNetwork::getFunction() const {
-    return model;
-}
-
-void CNNNetwork::serialize(const std::string& xmlPath, const std::string& binPath) {
-    ov::pass::Serialize(xmlPath, binPath).run_on_model(model);
-}
-
-void CNNNetwork::reshape(const std::map<std::string, std::vector<size_t> >& shapes) {
-    std::map<std::string, ov::PartialShape> partialShapes;
-    for (const auto& it : shapes) {
-        ov::PartialShape shape;
-        shape.insert(shape.begin(), it.second.begin(), it.second.end());
-        partialShapes.insert({it.first, shape});
-    }
-    model->reshape(partialShapes);
-}
-
-std::vector<std::string> Core::GetAvailableDevices() {
-    return get_available_devices();
-}
-
-void Core::UnregisterPlugin(const std::string& id) {
-    unload_plugin(id);
-}
-
-CNNNetwork Core::ReadNetwork(const std::string& xmlPath, const std::string& binPath) {
-    return read_model(xmlPath, binPath);
-}
-
-ExecutableNetwork Core::LoadNetwork(CNNNetwork net, const std::string& device,
-                                    const std::map<std::string, std::string>& config) {
-    ov::AnyMap props;
-    for (const auto& it : config) {
-        props.insert(it);
-    }
-    return compile_model(net.getFunction(), device, props);
-}
-
-ExecutableNetwork::ExecutableNetwork() {}
-
-ExecutableNetwork::ExecutableNetwork(const ov::CompiledModel& copy) : CompiledModel(copy) {}
-
-ov::InferRequest ExecutableNetwork::CreateInferRequest() { return create_infer_request(); }
-
-}  // namespace InferenceEngine
-
 Mat infEngineBlobToMat(const ov::Tensor& blob)
 {
     std::vector<size_t> dims = blob.get_shape();
@@ -118,67 +64,40 @@ void infEngineBlobsToMats(const ov::TensorVector& blobs,
         mats[i] = infEngineBlobToMat(blobs[i]);
 }
 
-#else
-
-Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
-{
-    // NOTE: Inference Engine sizes are reversed.
-    std::vector<size_t> dims = blob->getTensorDesc().getDims();
-    std::vector<int> size(dims.begin(), dims.end());
-    auto precision = blob->getTensorDesc().getPrecision();
-
-    int type = -1;
-    switch (precision)
-    {
-        case InferenceEngine::Precision::FP32: type = CV_32F; break;
-        case InferenceEngine::Precision::U8: type = CV_8U; break;
-        default:
-            CV_Error(Error::StsNotImplemented, "Unsupported blob precision");
-    }
-    return Mat(size, type, (void*)blob->buffer());
-}
-
-void infEngineBlobsToMats(const std::vector<InferenceEngine::Blob::Ptr>& blobs,
-                          std::vector<Mat>& mats)
-{
-    mats.resize(blobs.size());
-    for (int i = 0; i < blobs.size(); ++i)
-        mats[i] = infEngineBlobToMat(blobs[i]);
-}
-#endif // OpenVINO >= 2022.1
 
 static bool init_IE_plugins()
 {
     // load and hold IE plugins
-    static InferenceEngine::Core* init_core = new InferenceEngine::Core();  // 'delete' is never called
-    (void)init_core->GetAvailableDevices();
+    static ov::Core* init_core = new ov::Core();  // 'delete' is never called
+    (void)init_core->get_available_devices();
     return true;
 }
-static InferenceEngine::Core& retrieveIECore(const std::string& id, std::map<std::string, std::shared_ptr<InferenceEngine::Core> >& cores)
+static ov::Core& retrieveIECore(const std::string& id, std::map<std::string, std::shared_ptr<ov::Core> >& cores)
 {
     AutoLock lock(getInitializationMutex());
-    std::map<std::string, std::shared_ptr<InferenceEngine::Core> >::iterator i = cores.find(id);
+    std::map<std::string, std::shared_ptr<ov::Core> >::iterator i = cores.find(id);
     if (i == cores.end())
     {
-        std::shared_ptr<InferenceEngine::Core> core = std::make_shared<InferenceEngine::Core>();
+        std::shared_ptr<ov::Core> core = std::make_shared<ov::Core>();
         cores[id] = core;
         return *core.get();
     }
     return *(i->second).get();
 }
-static InferenceEngine::Core& create_IE_Core_instance(const std::string& id)
+static ov::Core& create_IE_Core_instance(const std::string& id)
 {
-    static std::map<std::string, std::shared_ptr<InferenceEngine::Core> > cores;
+    static std::map<std::string, std::shared_ptr<ov::Core> > cores;
     return retrieveIECore(id, cores);
 }
-static InferenceEngine::Core& create_IE_Core_pointer(const std::string& id)
+static ov::Core& create_IE_Core_pointer(const std::string& id)
 {
     // load and hold IE plugins
-    static std::map<std::string, std::shared_ptr<InferenceEngine::Core> >* cores =
-            new std::map<std::string, std::shared_ptr<InferenceEngine::Core> >();
+    static std::map<std::string, std::shared_ptr<ov::Core> >* cores =
+            new std::map<std::string, std::shared_ptr<ov::Core> >();
     return retrieveIECore(id, *cores);
 }
-InferenceEngine::Core& getCore(const std::string& id)
+
+ov::Core& getCore(const std::string& id)
 {
     // to make happy memory leak tools use:
     // - OPENCV_DNN_INFERENCE_ENGINE_HOLD_PLUGINS=0
@@ -195,7 +114,7 @@ InferenceEngine::Core& getCore(const std::string& id)
 #endif
             );
 
-    InferenceEngine::Core& core = param_DNN_INFERENCE_ENGINE_CORE_LIFETIME_WORKAROUND
+    ov::Core& core = param_DNN_INFERENCE_ENGINE_CORE_LIFETIME_WORKAROUND
             ? create_IE_Core_pointer(id)
             : create_IE_Core_instance(id);
     return core;
@@ -204,17 +123,13 @@ InferenceEngine::Core& getCore(const std::string& id)
 
 static bool detectArmPlugin_()
 {
-    InferenceEngine::Core& ie = getCore("CPU");
-    const std::vector<std::string> devices = ie.GetAvailableDevices();
+    ov::Core& ie = getCore("CPU");
+    const std::vector<std::string> devices = ie.get_available_devices();
     for (std::vector<std::string>::const_iterator i = devices.begin(); i != devices.end(); ++i)
     {
         if (i->find("CPU") != std::string::npos)
         {
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
             const std::string name = ie.get_property(*i, ov::device::full_name);
-#else
-            const std::string name = ie.GetMetric(*i, METRIC_KEY(FULL_DEVICE_NAME)).as<std::string>();
-#endif
             CV_LOG_INFO(NULL, "CPU plugin: " << name);
             return name.find("arm_compute::NEON") != std::string::npos;
         }
@@ -228,17 +143,13 @@ static bool detectMyriadX_(const std::string& device)
     AutoLock lock(getInitializationMutex());
 
     // Lightweight detection
-    InferenceEngine::Core& ie = getCore(device);
-    const std::vector<std::string> devices = ie.GetAvailableDevices();
+    ov::Core& ie = getCore(device);
+    const std::vector<std::string> devices = ie.get_available_devices();
     for (std::vector<std::string>::const_iterator i = devices.begin(); i != devices.end(); ++i)
     {
         if (i->find(device) != std::string::npos)
         {
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
             const std::string name = ie.get_property(*i, ov::device::full_name);
-#else
-            const std::string name = ie.GetMetric(*i, METRIC_KEY(FULL_DEVICE_NAME)).as<std::string>();
-#endif
             CV_LOG_INFO(NULL, "Myriad device: " << name);
             return name.find("MyriadX") != std::string::npos || name.find("Myriad X") != std::string::npos || name.find("HDDL") != std::string::npos;
         }
@@ -259,11 +170,11 @@ void resetMyriadDevice()
 
     AutoLock lock(getInitializationMutex());
 
-    InferenceEngine::Core& ie = getCore("MYRIAD");
+    ov::Core& ie = getCore("MYRIAD");
     try
     {
-        ie.UnregisterPlugin("MYRIAD");
-        ie.UnregisterPlugin("HETERO");
+        ie.unload_plugin("MYRIAD");
+        ie.unload_plugin("HETERO");
     }
     catch (...) {}
 #endif  // HAVE_INF_ENGINE
@@ -276,11 +187,11 @@ void releaseHDDLPlugin()
 
     AutoLock lock(getInitializationMutex());
 
-    InferenceEngine::Core& ie = getCore("HDDL");
+    ov::Core& ie = getCore("HDDL");
     try
     {
-        ie.UnregisterPlugin("HDDL");
-        ie.UnregisterPlugin("HETERO");
+        ie.unload_plugin("HDDL");
+        ie.unload_plugin("HETERO");
     }
     catch (...) {}
 #endif  // HAVE_INF_ENGINE
@@ -351,7 +262,7 @@ namespace openvino {
 bool checkTarget(Target target)
 {
     // Lightweight detection
-    const std::vector<std::string> devices = getCore("").GetAvailableDevices();
+    const std::vector<std::string> devices = getCore("").get_available_devices();
     for (std::vector<std::string>::const_iterator i = devices.begin(); i != devices.end(); ++i)
     {
         if (std::string::npos != i->find("MYRIAD") && target == DNN_TARGET_MYRIAD)
diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp
index 45913d3b3188..236b21b1a3a6 100644
--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@@ -19,18 +19,13 @@
 
 #ifdef HAVE_INF_ENGINE
 
-#define INF_ENGINE_RELEASE_2020_2 2020020000
-#define INF_ENGINE_RELEASE_2020_3 2020030000
-#define INF_ENGINE_RELEASE_2020_4 2020040000
-#define INF_ENGINE_RELEASE_2021_1 2021010000
-#define INF_ENGINE_RELEASE_2021_2 2021020000
-#define INF_ENGINE_RELEASE_2021_3 2021030000
-#define INF_ENGINE_RELEASE_2021_4 2021040000
 #define INF_ENGINE_RELEASE_2022_1 2022010000
+#define INF_ENGINE_RELEASE_2023_0 2023000000
+#define INF_ENGINE_RELEASE_2024_0 2024000000
 
 #ifndef INF_ENGINE_RELEASE
-#warning("IE version have not been provided via command-line. Using 2021.4 by default")
-#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2021_4
+#warning("IE version have not been provided via command-line. Using 2022.1 by default")
+#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2022_1
 #endif
 
 #define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000))
@@ -44,13 +39,9 @@
 #pragma GCC diagnostic ignored "-Wsuggest-override"
 #endif
 
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
 #include <openvino/openvino.hpp>
 #include <openvino/pass/serialize.hpp>
 #include <openvino/pass/convert_fp32_to_fp16.hpp>
-#else
-#include <inference_engine.hpp>
-#endif
 
 #if defined(__GNUC__) && __GNUC__ >= 5
 //#pragma GCC diagnostic pop
@@ -75,18 +66,10 @@ CV__DNN_INLINE_NS_END
 
 Backend& getInferenceEngineBackendTypeParam();
 
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
 Mat infEngineBlobToMat(const ov::Tensor& blob);
 
 void infEngineBlobsToMats(const ov::TensorVector& blobs,
                           std::vector<Mat>& mats);
-#else
-Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob);
-
-void infEngineBlobsToMats(const std::vector<InferenceEngine::Blob::Ptr>& blobs,
-                          std::vector<Mat>& mats);
-#endif  // OpenVINO >= 2022.1
-
 
 CV__DNN_INLINE_NS_BEGIN
 
@@ -98,54 +81,7 @@ bool isArmComputePlugin();
 
 CV__DNN_INLINE_NS_END
 
-// A series of wrappers for classes from OpenVINO API 2.0.
-// Need just for less conditional compilation inserts.
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
-namespace InferenceEngine {
-
-class CNNNetwork {
-public:
-    CNNNetwork();
-
-    CNNNetwork(std::shared_ptr<ov::Model> model);
-
-    std::shared_ptr<ov::Model> getFunction() const;
-
-    void serialize(const std::string& xmlPath, const std::string& binPath);
-
-    void reshape(const std::map<std::string, std::vector<size_t> >& shapes);
-
-private:
-    std::shared_ptr<ov::Model> model = nullptr;
-};
-
-typedef ov::InferRequest InferRequest;
-
-class ExecutableNetwork : public ov::CompiledModel {
-public:
-    ExecutableNetwork();
-
-    ExecutableNetwork(const ov::CompiledModel& copy);
-
-    ov::InferRequest CreateInferRequest();
-};
-
-class Core : public ov::Core {
-public:
-    std::vector<std::string> GetAvailableDevices();
-
-    void UnregisterPlugin(const std::string& id);
-
-    CNNNetwork ReadNetwork(const std::string& xmlPath, const std::string& binPath);
-
-    ExecutableNetwork LoadNetwork(CNNNetwork net, const std::string& device,
-                                  const std::map<std::string, std::string>& config);
-};
-
-}
-#endif // OpenVINO >= 2022.1
-
-InferenceEngine::Core& getCore(const std::string& id);
+ov::Core& getCore(const std::string& id);
 
 template<typename T = size_t>
 static inline std::vector<T> getShape(const Mat& mat)
diff --git a/modules/dnn/src/op_webnn.hpp b/modules/dnn/src/op_webnn.hpp
index 5b77b1082766..6f96289b8052 100644
--- a/modules/dnn/src/op_webnn.hpp
+++ b/modules/dnn/src/op_webnn.hpp
@@ -111,7 +111,7 @@ class WebnnNet
     void addBlobs(const std::vector<cv::Ptr<BackendWrapper> >& ptrs);
 
     void createNet(Target targetId);
-    // void setNodePtr(std::shared_ptr<ngraph::Node>* ptr);
+    // void setNodePtr(std::shared_ptr<ov::Node>* ptr);
 
     void reset();
 
diff --git a/modules/dnn/src/opencl/activations.cl b/modules/dnn/src/opencl/activations.cl
index 317d2c1e6240..96b56725fb76 100644
--- a/modules/dnn/src/opencl/activations.cl
+++ b/modules/dnn/src/opencl/activations.cl
@@ -73,14 +73,23 @@ __kernel void ReLU6Forward(const int count, __global const T* in, __global T* ou
   }
 }
 
+__kernel void ChannelsPReLUForward(const int count, const int channels, const int plane_size,
+                                   __global const T* in, __global T* out,
+                                   __global const KERNEL_ARG_DTYPE* slope_data)
+{
+  int index = get_global_id(0);
+  int c = (index / plane_size) % channels;
+  if(index < count)
+    out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
+}
+
 __kernel void PReLUForward(const int count, const int channels, const int plane_size,
                            __global const T* in, __global T* out,
                            __global const KERNEL_ARG_DTYPE* slope_data)
 {
   int index = get_global_id(0);
-  int c = (index / plane_size) % channels;
   if(index < count)
-  out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
+    out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[index];
 }
 
 __kernel void TanHForward(const int count, __global T* in, __global T* out) {
@@ -352,4 +361,4 @@ __kernel void ReciprocalForward(const int n, __global T* in, __global T* out)
     int index = get_global_id(0);
     if(index < n)
         out[index] = 1.0f/in[index];
-}
\ No newline at end of file
+}
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index c9ddacfb8e50..0ac074b26bf0 100644
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -95,23 +95,23 @@
 #define __CAT(x, y) x##y
 #define CAT(x, y) __CAT(x, y)
 #define LOOP0(VAR, STMT)
-#define LOOP1(VAR, STMT) (STMT); (VAR)++;
-#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
-#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
-#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
-#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
-#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
-#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
-#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
-#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
-#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
-#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
-#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
-#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
-#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;
-#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;
-#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;
-#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
+#define LOOP1(VAR, STMT) STMT; (VAR)++;
+#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); STMT; (VAR)++;
+#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); STMT; (VAR)++;
+#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); STMT; (VAR)++;
+#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); STMT; (VAR)++;
+#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); STMT; (VAR)++;
+#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); STMT; (VAR)++;
+#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); STMT; (VAR)++;
+#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); STMT; (VAR)++;
+#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); STMT; (VAR)++;
+#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); STMT; (VAR)++;
+#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); STMT; (VAR)++;
+#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); STMT; (VAR)++;
+#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); STMT; (VAR)++;
+#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); STMT; (VAR)++;
+#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); STMT; (VAR)++;
+#define LOOP(N, VAR, STMT) CAT(LOOP, N)(VAR, STMT)
 
 #if defined(convolve_simd) || defined(Conv_Interleaved)
 #if TYPE == TYPE_HALF
diff --git a/modules/dnn/src/opencl/gemm_buffer.cl b/modules/dnn/src/opencl/gemm_buffer.cl
index b345983aee6f..70028b0eecf3 100644
--- a/modules/dnn/src/opencl/gemm_buffer.cl
+++ b/modules/dnn/src/opencl/gemm_buffer.cl
@@ -453,14 +453,14 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
     int w;
     for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {
         barrier(CLK_LOCAL_MEM_FENCE);
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(0, K, local_index))), 0, (__local float *)(slm_brow + mad24(0, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(1, K, local_index))), 0, (__local float *)(slm_brow + mad24(1, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(2, K, local_index))), 0, (__local float *)(slm_brow + mad24(2, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(3, K, local_index))), 0, (__local float *)(slm_brow + mad24(3, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(4, K, local_index))), 0, (__local float *)(slm_brow + mad24(4, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(5, K, local_index))), 0, (__local float *)(slm_brow + mad24(5, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(6, K, local_index))), 0, (__local float *)(slm_brow + mad24(6, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(7, K, local_index))), 0, (__local float *)(slm_brow + mad24(7, SLM_BLOCK, local_index)));
+        vstore8(vload8(0, src1_read0 + mad24(0, K, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(1, K, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(2, K, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(3, K, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(4, K, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(5, K, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(6, K, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(7, K, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index));
         barrier(CLK_LOCAL_MEM_FENCE);
 
         slm_brow0 = slm_brow + local_x * (TILE_K / 8);
@@ -469,17 +469,17 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
         while( w + TILE_K <= end_w ) {
             Dtype8 arow;
 
-            brow0 = as_half8(vload4(0, (__local float *)(slm_brow0 + 0 * SLM_BLOCK)));
-            brow1 = as_half8(vload4(0, (__local float *)(slm_brow0 + 1 * SLM_BLOCK)));
-            brow2 = as_half8(vload4(0, (__local float *)(slm_brow0 + 2 * SLM_BLOCK)));
-            brow3 = as_half8(vload4(0, (__local float *)(slm_brow0 + 3 * SLM_BLOCK)));
-            brow4 = as_half8(vload4(0, (__local float *)(slm_brow0 + 4 * SLM_BLOCK)));
-            brow5 = as_half8(vload4(0, (__local float *)(slm_brow0 + 5 * SLM_BLOCK)));
-            brow6 = as_half8(vload4(0, (__local float *)(slm_brow0 + 6 * SLM_BLOCK)));
-            brow7 = as_half8(vload4(0, (__local float *)(slm_brow0 + 7 * SLM_BLOCK)));
+            brow0 = vload8(0, slm_brow0 + 0 * SLM_BLOCK);
+            brow1 = vload8(0, slm_brow0 + 1 * SLM_BLOCK);
+            brow2 = vload8(0, slm_brow0 + 2 * SLM_BLOCK);
+            brow3 = vload8(0, slm_brow0 + 3 * SLM_BLOCK);
+            brow4 = vload8(0, slm_brow0 + 4 * SLM_BLOCK);
+            brow5 = vload8(0, slm_brow0 + 5 * SLM_BLOCK);
+            brow6 = vload8(0, slm_brow0 + 6 * SLM_BLOCK);
+            brow7 = vload8(0, slm_brow0 + 7 * SLM_BLOCK);
 
 #define MM_DOT_PRODUCT( _row, _dot )   \
-            arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K)));                           \
+            arow = vload8(0, src0_read + _row * K); \
             _dot = mad( (Dtype8)(arow.s0), (Dtype8)(brow0.s0, brow1.s0, brow2.s0, brow3.s0, brow4.s0, brow5.s0, brow6.s0, brow7.s0), _dot ); \
             _dot = mad( (Dtype8)(arow.s1), (Dtype8)(brow0.s1, brow1.s1, brow2.s1, brow3.s1, brow4.s1, brow5.s1, brow6.s1, brow7.s1), _dot ); \
             _dot = mad( (Dtype8)(arow.s2), (Dtype8)(brow0.s2, brow1.s2, brow2.s2, brow3.s2, brow4.s2, brow5.s2, brow6.s2, brow7.s2), _dot ); \
@@ -510,7 +510,7 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
         Dtype8 arow;
 
 #define READ_BROW(_brow, _row) \
-        _brow = as_half8(vload4(0, (__local float *)(slm_brow0 + _row * SLM_BLOCK))); \
+        _brow = vload8(0, slm_brow0 + _row * SLM_BLOCK); \
         _brow.s0 = (mad24(local_x, 8, w) < K) ? _brow.s0 : 0.0f; \
         _brow.s1 = (mad24(local_x, 8, w + 1) < K) ? _brow.s1 : 0.0f; \
         _brow.s2 = (mad24(local_x, 8, w + 2) < K) ? _brow.s2 : 0.0f; \
@@ -532,7 +532,7 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
 #undef READ_BROW
 
 #define MM_DOT_PRODUCT( _row, _dot )   \
-        arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K)));                           \
+        arow = vload8(0, src0_read + _row * K);                           \
         arow.s0 = (mad24(local_x, 8, w) < K) ? arow.s0 : 0.0f; \
         arow.s1 = (mad24(local_x, 8, w + 1) < K) ? arow.s1 : 0.0f; \
         arow.s2 = (mad24(local_x, 8, w + 2) < K) ? arow.s2 : 0.0f; \
diff --git a/modules/dnn/src/opencl/mvn.cl b/modules/dnn/src/opencl/mvn.cl
index f84d04502c81..053749b483fe 100644
--- a/modules/dnn/src/opencl/mvn.cl
+++ b/modules/dnn/src/opencl/mvn.cl
@@ -54,6 +54,7 @@
     #define vec_type Dtype8
     #define CALC_MEAN calc_mean8
     #define MVN mvn8
+    #define MVN_GROUP mvn_group8
     #define MEAN_FUSE mean_fuse8
     #define MVN_FUSE mvn_fuse8
 #elif NUM == 4
@@ -62,6 +63,7 @@
     #define vec_type Dtype4
     #define CALC_MEAN calc_mean4
     #define MVN mvn4
+    #define MVN_GROUP mvn_group4
     #define MEAN_FUSE mean_fuse4
     #define MVN_FUSE mvn_fuse4
 #elif NUM == 1
@@ -70,6 +72,7 @@
     #define vec_type Dtype
     #define CALC_MEAN calc_mean1
     #define MVN mvn1
+    #define MVN_GROUP mvn_group1
     #define MEAN_FUSE mean_fuse1
     #define MVN_FUSE mvn_fuse1
 #endif
@@ -126,12 +129,66 @@ __kernel void MVN(__global const Dtype* src,
     alpha = 1;
 #endif
 
+#ifdef LAYER_NORM
+    vec_type w = load(bnorm_weight, y), b = load(bnorm_bias, y);
+#else
+
     Dtype w = 1.f, b = 0.f;
 #ifdef FUSE_BATCH_NORM
     w = bnorm_weight[x % channels];
     b = bnorm_bias[x % channels];
 #endif
 
+#endif // LAYER_NORM
+
+    vec_type src_vec = load(src, index) - (vec_type)mean_val;
+    vec_type dst_vec = src_vec * alpha;
+    dst_vec = dst_vec * w + (vec_type)b;
+
+#ifdef FUSE_RELU
+    vec_type new_val = dst_vec * relu_slope;
+    dst_vec = select(new_val, dst_vec, dst_vec > (vec_type)0.f);
+#endif
+
+    store(dst_vec, dst, index);
+}
+
+#elif defined KERNEL_MVN_GROUP
+
+__kernel void MVN_GROUP(__global const Dtype* src,
+                            const int rows,
+                            const int cols,
+                            const Dtype eps,
+                            __global const Dtype* mean,
+                            __global const Dtype* dev,
+                            __global const Dtype* weight,
+                            __global const Dtype* bias,
+                            const int channels,
+                            const int num_groups,
+                            const float relu_slope,
+                            __global Dtype* dst)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * NUM;
+    int index = x * cols + y;
+
+    if (x >= rows || y >= cols)
+        return;
+
+    int group_size = channels / num_groups;
+    int step = norm_size / group_size;
+    int channel_index = x % num_groups * group_size + y / step
+    Dtype mean_val = mean[x];
+    Dtype dev_val = dev[x];
+    Dtype alpha;
+#ifdef NORM_VARIANCE
+    alpha = 1 / sqrt(eps + dev_val);
+#else
+    alpha = 1;
+#endif
+
+    Dtype w = weight[channel_index], b = bias[channel_index];
+
     vec_type src_vec = load(src, index) - (vec_type)mean_val;
     vec_type dst_vec = src_vec * alpha;
     dst_vec = dst_vec * w + (vec_type)b;
diff --git a/modules/dnn/src/tengine4dnn/include/tengine_graph_convolution.hpp b/modules/dnn/src/tengine4dnn/include/tengine_graph_convolution.hpp
deleted file mode 100644
index 8ec99c9685da..000000000000
--- a/modules/dnn/src/tengine4dnn/include/tengine_graph_convolution.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-#ifndef TENGINE_GRAPH_CONVOLUTION_HPP
-#define TENGINE_GRAPH_CONVOLUTION_HPP
-
-#define FLOAT_TO_REALSIZE (4)
-#ifdef HAVE_TENGINE
-
-#include "tengine_c_api.h"
-
-namespace cv
-{
-namespace dnn
-{
-// pad_h0: pad_top
-// pad_h1: pad_bottom
-// pad_w0: pad_left
-// pad_w1: pad_right
-teng_graph_t  tengine_init(const char* name , float* input_, int inch, int group, int in_h, int in_w,
-                        float *output_, int out_b, int outch, int out_h, int out_w,
-                        float *kernel_,int kernel_s , int kernel_h, int kernel_w,
-                        float *teg_bias, int stride_h, int stride_w,
-                        int pad_h0, int pad_h1, int pad_w0, int pad_w1, int dilation_h, int dilation_w,
-                        size_t wstep, const std::string padMode , teng_graph_t& graph, int nstripes) ;
-
-bool tengine_forward(teng_graph_t& graph) ;
-bool tengine_release(teng_graph_t& graph) ;
-}
-}
-#endif
-#endif /* TENGINE_GRAPH_CONVOLUTION_HPP */
\ No newline at end of file
diff --git a/modules/dnn/src/tengine4dnn/src/tengine_graph_convolution.cpp b/modules/dnn/src/tengine4dnn/src/tengine_graph_convolution.cpp
deleted file mode 100644
index d35937006cd7..000000000000
--- a/modules/dnn/src/tengine4dnn/src/tengine_graph_convolution.cpp
+++ /dev/null
@@ -1,370 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-#include "../../precomp.hpp"
-#include <iostream>
-#include <vector>
-
-#include <opencv2/core/utils/configuration.private.hpp>
-#include <opencv2/core/utils/logger.hpp>
-
-#include "../include/tengine_graph_convolution.hpp"
-
-#ifdef HAVE_TENGINE
-
-#include "tengine_c_api.h"
-
-
-namespace cv
-{
-namespace dnn
-{
-static int create_input_node(teng_graph_t graph, const char* node_name, int inch, int in_h, int in_w)
-{
-    node_t node     = teng_create_graph_node(graph, node_name, "InputOp");
-    tensor_t tensor = teng_create_graph_tensor(graph, node_name, TENGINE_DT_FP32);
-    teng_set_node_output_tensor(node, 0, tensor, TENSOR_TYPE_INPUT);
-
-    int dims[4] = {1, inch, in_h, in_w};
-    teng_set_tensor_shape(tensor, dims, 4);
-
-    teng_release_graph_tensor(tensor);
-    teng_release_graph_node(node);
-
-    return 0;
-}
-
-static int create_conv_node(teng_graph_t graph, const char* node_name, const char* input_name, int in_h, int in_w, int out_h, int out_w,
-    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h0, int pad_h1, int pad_w0, int pad_w1, int inch, int outch, int group,
-    int dilation_h, int dilation_w, int activation, std::string padMode)
-{
-    node_t conv_node      = teng_create_graph_node(graph, node_name, "Convolution");
-    tensor_t input_tensor = teng_get_graph_tensor(graph, input_name);
-
-    if (input_tensor == NULL)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: input_tensor is NULL." );
-        return -1;
-    }
-
-    teng_set_node_input_tensor(conv_node, 0, input_tensor);
-    teng_release_graph_tensor(input_tensor);
-
-    /* output */
-    tensor_t output_tensor = teng_create_graph_tensor(graph, node_name, TENGINE_DT_FP32);
-
-    teng_set_node_output_tensor(conv_node, 0, output_tensor, TENSOR_TYPE_VAR);
-    teng_release_graph_tensor(output_tensor);
-
-    /* weight */
-    std::string weight_name(node_name);
-    weight_name += "/weight";
-
-    node_t w_node = teng_create_graph_node(graph, weight_name.c_str(), "Const");
-    tensor_t w_tensor = teng_create_graph_tensor(graph, weight_name.c_str(), TENGINE_DT_FP32);
-    teng_set_node_output_tensor(w_node, 0, w_tensor, TENSOR_TYPE_CONST);
-    teng_set_node_input_tensor(conv_node, 1, w_tensor);
-    int w_dims[] = {outch, inch / group, kernel_h, kernel_w};
-
-    teng_set_tensor_shape(w_tensor, w_dims, 4);
-
-    teng_release_graph_node(w_node);
-    teng_release_graph_tensor(w_tensor);
-
-    /* bias */
-    std::string bias_name(node_name);
-    bias_name += "/bias";
-
-    node_t b_node = teng_create_graph_node(graph, bias_name.c_str(), "Const");
-    tensor_t b_tensor = teng_create_graph_tensor(graph, bias_name.c_str(), TENGINE_DT_FP32);
-    teng_set_node_output_tensor(b_node, 0, b_tensor, TENSOR_TYPE_CONST);
-    int b_dims[] = {outch};
-
-    teng_set_tensor_shape(b_tensor, b_dims, 1);
-
-    teng_set_node_input_tensor(conv_node, 2, b_tensor);
-    teng_release_graph_node(b_node);
-    teng_release_graph_tensor(b_tensor);
-
-    if (!padMode.empty())
-    {
-        if (padMode == "SAME")
-        {
-            int out_h_temp = (in_h-kernel_h + 2*pad_h0)/stride_h + 1;
-            int out_w_temp = (in_w-kernel_w + 2*pad_w0)/stride_w + 1;
-
-            if (out_h_temp < out_h)
-                pad_h1 += 1;
-            if (out_w_temp < out_w)
-                pad_w1 += 1;
-        }
-    }
-
-    /* attr */
-    teng_set_node_attr_int(conv_node, "kernel_h", &kernel_h);
-    teng_set_node_attr_int(conv_node, "kernel_w", &kernel_w);
-    teng_set_node_attr_int(conv_node, "stride_h", &stride_h);
-    teng_set_node_attr_int(conv_node, "stride_w", &stride_w);
-    teng_set_node_attr_int(conv_node, "pad_h0", &pad_h0);
-    teng_set_node_attr_int(conv_node, "pad_w0", &pad_w0);
-    teng_set_node_attr_int(conv_node, "pad_h1", &pad_h1);
-    teng_set_node_attr_int(conv_node, "pad_w1", &pad_w1);
-    teng_set_node_attr_int(conv_node, "output_channel", &outch);
-    teng_set_node_attr_int(conv_node, "input_channel", &inch);
-    teng_set_node_attr_int(conv_node, "group", &group);
-    teng_set_node_attr_int(conv_node, "dilation_h", &dilation_h);
-    teng_set_node_attr_int(conv_node, "dilation_w", &dilation_w);
-  //  set_node_attr_int(conv_node, "activation", &activation);
-
-    teng_release_graph_node(conv_node);
-
-    return 0;
-}
-
-static teng_graph_t create_conv_graph(const char* layer_name, float* input_data, int inch, int group, int in_h, int in_w,
-                        float* output_data, int outch, int out_h, int out_w,
-                        int kernel_h, int kernel_w,
-                        int stride_h,int stride_w,
-                        int pad_h0, int pad_h1, int pad_w0, int pad_w1, int dilation_h, int dilation_w, int activation,
-                        float* teg_weight, float* teg_bias, std::string padMode, int nstripes)
-{
-    node_t    conv_node     = NULL;
-
-    tensor_t  input_tensor  = NULL;
-    tensor_t  output_tensor = NULL;
-    tensor_t  weight_tensor = NULL;
-    tensor_t  bias_tensor   = NULL;
-
-    /* create graph for convolution */
-    int in_size  = in_h * in_w * inch;
-    int out_size  = out_h * out_w * outch;
-    int weight_size = outch * (inch / group) * kernel_w * kernel_h;
-    int bias_size = outch;
-
-    int buf_size  = 0;
-    int input_num = 0;
-
-    /* create graph */
-    teng_graph_t graph = teng_create_graph(NULL, NULL, NULL);
-    bool ok = true;
-
-    if(graph == NULL)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: create_graph failed." );
-        ok = false;
-    }
-
-    const char* input_name = "data";
-    const char* conv_name  = layer_name;
-
-    if (ok && create_input_node(graph, input_name, inch, in_h, in_w) < 0)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: create_input_node failed." );
-        ok = false;
-    }
-
-    if (ok && create_conv_node(graph, conv_name, input_name, in_h, in_w, out_h, out_w, kernel_h, kernel_w,
-        stride_h, stride_w, pad_h0, pad_h1, pad_w0, pad_w1, inch, outch, group, dilation_h, dilation_w, activation, padMode) < 0)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: create conv node failed." );
-        ok = false;
-    }
-
-    /* set input/output node */
-    const char* inputs_name[]  = {input_name};
-    const char* outputs_name[] = {conv_name};
-
-    if (ok && teng_set_graph_input_node(graph, inputs_name, sizeof(inputs_name) / sizeof(char*)) < 0)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: set inputs failed." );
-        ok = false;
-    }
-
-    if (ok && teng_set_graph_output_node(graph, outputs_name, sizeof(outputs_name) / sizeof(char*)) < 0)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: set outputs failed." );
-        ok = false;
-    }
-
-    /* set input data */
-    if (ok)
-    {
-        input_tensor = teng_get_graph_input_tensor(graph, 0, 0);
-        buf_size     = teng_get_tensor_buffer_size(input_tensor);
-        if (buf_size != in_size * FLOAT_TO_REALSIZE)
-        {
-            CV_LOG_WARNING(NULL,"Tengine: Input data size check failed.");
-            ok = false;
-        }
-    }
-
-    if (ok)
-    {
-        teng_set_tensor_buffer(input_tensor, (float *)input_data, buf_size);
-        teng_release_graph_tensor(input_tensor);
-
-        /* create convolution node */
-        /* set weight node */
-        conv_node     = teng_get_graph_node(graph, conv_name);
-        weight_tensor = teng_get_node_input_tensor(conv_node, 1);
-        buf_size      = teng_get_tensor_buffer_size(weight_tensor);
-
-        if (buf_size != weight_size * FLOAT_TO_REALSIZE)
-        {
-            CV_LOG_WARNING(NULL,"Tengine: Input weight size check failed.");
-            ok = false;
-        }
-    }
-
-    if (ok)
-    {
-        teng_set_tensor_buffer(weight_tensor, teg_weight, buf_size);
-
-        /* set bias node */
-        input_num = teng_get_node_input_number(conv_node);
-        if (input_num > 2)
-        {
-            bias_tensor = teng_get_node_input_tensor(conv_node, 2);
-            buf_size    = teng_get_tensor_buffer_size(bias_tensor);
-            if (buf_size != bias_size * FLOAT_TO_REALSIZE)
-            {
-                CV_LOG_WARNING(NULL,"Tengine: Input bias size check failed.");
-                ok = false;
-            }
-            else teng_set_tensor_buffer(bias_tensor, teg_bias, buf_size);
-        }
-    }
-
-    /* prerun */
-    if (ok && teng_prerun_graph_multithread(graph, TENGINE_CLUSTER_BIG, nstripes) < 0)
-    {
-        CV_LOG_WARNING(NULL, "Tengine: prerun_graph failed.");
-        ok = false;
-    }
-
-    if (ok)
-    {
-        /* set output data */
-        output_tensor = teng_get_node_output_tensor(conv_node, 0);
-        int ret = teng_set_tensor_buffer(output_tensor, output_data, out_size * FLOAT_TO_REALSIZE);
-        if(ret)
-        {
-            CV_LOG_WARNING(NULL,"Tengine: Set output tensor buffer failed." );
-            ok = false;
-        }
-    }
-
-    if (false == ok)
-    {
-        teng_destroy_graph(graph) ;
-        return NULL ;
-    }
-    return graph;
-}
-static bool tengine_init_flag = false;
-teng_graph_t tengine_init(const char* layer_name, float* input_, int inch, int group, int in_h, int in_w,
-                        float *output_, int out_b, int outch, int out_h, int out_w,
-                        float *kernel_, int kernel_s ,int kernel_h, int kernel_w,
-                        float *teg_bias, int stride_h, int stride_w,
-                        int pad_h0, int pad_h1, int pad_w0, int pad_w1, int dilation_h, int dilation_w,
-                        size_t wstep, const std::string padMode, teng_graph_t &graph, int nstripes)
-{
-    std::vector<float> teg_weight_vec;
-    float *teg_weight = NULL;
-    int kernel_inwh = (inch / group) * kernel_w * kernel_h;
-    // Do not using the activation fuse mode, just convolution only.
-    int activation = -1;
-
-    if (!(kernel_s == 2 && kernel_h == kernel_w
-        && dilation_h == dilation_w && stride_h == stride_w
-        && out_b == 1 && pad_h0 < 10 && pad_h1 < 10 && pad_w0 < 10 && pad_w1 < 10)) // just for Conv2D
-    {
-       // printf("return : just for Conv2D\n");
-        return NULL;
-    }
-
-    {
-      /*   printf("Tengine(%s): input (1 x %d x %d x %d),output (%d x %d x %d x %d), kernel (%d x %d), stride (%d x %d), dilation (%d x %d), pad (%d x %d).\n",
-               layer_name, inch, in_h, in_w,
-               out_b, outch, out_h, out_w,
-               kernel_w, kernel_h,
-               stride_w, stride_h,
-               dilation_w, dilation_h,
-               pad_h0, pad_h1, pad_w0, pad_w1);
-     */
-        // weight
-        if (kernel_inwh != wstep)
-        {
-            teg_weight_vec.resize(kernel_inwh * outch);
-            teg_weight = &teg_weight_vec[0];
-            for (int i=0; i<outch; i++)
-            {
-                memcpy(teg_weight+i*kernel_inwh, kernel_+i*wstep, kernel_inwh*FLOAT_TO_REALSIZE);
-            }
-        }
-        else
-        {
-            teg_weight = kernel_;
-        }
-
-        /* initial the resource of tengine */
-        if(false == tengine_init_flag)
-        {
-            init_tengine();
-            tengine_init_flag = true;
-        }
-
-        /* create the convolution graph */
-        graph = create_conv_graph(layer_name, input_, inch, group, in_h, in_w,
-                                    output_, outch, out_h, out_w,
-                                    kernel_h, kernel_w, stride_h,stride_w,
-                                    pad_h0, pad_h1, pad_w0, pad_w1, dilation_h, dilation_w, activation,
-                                    teg_weight, teg_bias, padMode, nstripes);
-        if(NULL == graph )
-        {
-            return NULL;
-        }
-    }
-    return graph ;
-}
-
-bool tengine_forward(teng_graph_t &graph)
-{
-    /* run */
-    if(teng_run_graph(graph, 1) < 0)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: run_graph failed.");
-        return false ;
-    }
-    return true;
-}
-bool tengine_release(teng_graph_t &graph)
-{
-    teng_postrun_graph(graph);
-    teng_destroy_graph(graph);
-    return true;
-}
-}
-}
-#endif
diff --git a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
index 2e8cde27998b..45fcacbe3488 100644
--- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
+++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp
@@ -98,6 +98,14 @@ class TFGraphWrapper : public ImportGraphWrapper
         net.mutable_node()->DeleteSubrange(idx, 1);
     }
 
+    virtual inline bool isCommutativeOp(const std::string& type) const CV_OVERRIDE
+    {
+        return type == "Add" || type == "Sum" ||
+               type == "Mul" || type == "Prod" ||
+               type == "Max" || type == "Maximum" || type == "Minimum" ||
+               type == "Mean" || type == "SquaredDifference";
+    }
+
     tensorflow::GraphDef& net;
 };
 
@@ -282,24 +290,26 @@ class ReLU6KerasSubgraph : public Subgraph
     {
         int input = addNodeToMatch("");
         int relu = addNodeToMatch("Relu", input);
-        int maxValue = addNodeToMatch("Const");
+        maxValueId = addNodeToMatch("Const");
         int clipValue = addNodeToMatch("Const");
-        int minimum = addNodeToMatch("Minimum", relu, maxValue);
+        int minimum = addNodeToMatch("Minimum", relu, maxValueId);
         addNodeToMatch("Maximum", minimum, clipValue);
 
         setFusedNode("Relu6", input);
     }
 
     virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
-                       std::vector<int>& matchedNodesIds,
-                       std::vector<int>& targetNodesIds) CV_OVERRIDE
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE
     {
-        if (!Subgraph::match(net, nodeId, matchedNodesIds, targetNodesIds))
+        if (!Subgraph::match(net, nodeId, matchedNodesIds))
             return false;
-        tensorflow::NodeDef* node = net->getNode(matchedNodesIds.front() + 1).dynamicCast<TFNodeWrapper>()->node;
+        tensorflow::NodeDef* node = net->getNode(matchedNodesIds[maxValueId]).dynamicCast<TFNodeWrapper>()->node;
         Mat maxValue = getTensorContent(node->attr().at("value").tensor());
         return maxValue.type() == CV_32FC1 && maxValue.total() == 1 && maxValue.at<float>(0) == 6;
     }
+
+private:
+    int maxValueId;
 };
 
 // Keras' reshape stores output shape in separate Const nodes by one value.
@@ -328,15 +338,14 @@ class ReshapeKerasSubgraph : public TFSubgraph
     }
 
     virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
-                       std::vector<int>& matchedNodesIds,
-                       std::vector<int>& targetNodesIds) CV_OVERRIDE
+                       std::vector<int>& matchedNodesIds) CV_OVERRIDE
     {
         Ptr<ImportNodeWrapper> node = net->getNode(nodeId);
         if (node->getNumInputs() == 0)
             return false;
 
         inpName = node->getInputName(0);
-        return Subgraph::match(net, nodeId, matchedNodesIds, targetNodesIds);
+        return Subgraph::match(net, nodeId, matchedNodesIds);
     }
 
 
@@ -906,22 +915,22 @@ Mat getTensorContentRef_(const tensorflow::TensorProto& tensor)
         }
         case tensorflow::DT_HALF:
         {
-            Mat halfs;
             if (!content.empty())
             {
                 static const int kHalfSize = 2;
-                halfs = Mat(1, content.size() / kHalfSize, CV_16UC1, (void*)content.c_str());
+                Mat halfs(1, content.size() / kHalfSize, CV_16FC1, (void*)content.c_str());
+                halfs.convertTo(m, CV_32F);
             }
             else
             {
                 const RepeatedField<int32_t>& field = tensor.half_val();
                 CV_Assert(!field.empty());
                 Mat ints(1, field.size(), CV_32SC1, (void*)field.data());
+                Mat halfs;
                 ints.convertTo(halfs, CV_16UC1);
+                Mat halfsSigned(halfs.size(), CV_16FC1, halfs.data);
+                halfsSigned.convertTo(m, CV_32F);
             }
-            // Reinterpret as a signed shorts just for a convertFp16 call.
-            Mat halfsSigned(halfs.size(), CV_16SC1, halfs.data);
-            convertFp16(halfsSigned, m);
             break;
         }
         case tensorflow::DT_QUINT8:
@@ -1120,15 +1129,16 @@ void removePhaseSwitches(tensorflow::GraphDef& net)
             inpName = inpName.substr(1 + (int)inpName.find('^'), inpName.rfind(':'));
             nodesMapIt = nodesMap.find(inpName);
             CV_Assert(nodesMapIt != nodesMap.end());
-
             int inpNodeId = nodesMapIt->second;
+
+            CV_CheckGT(numConsumers[inpNodeId], 0,
+                       "Input node of the current node should have at least one output node");
             if (numConsumers[inpNodeId] == 1)
             {
                 mergeOpSubgraphNodes.push(inpNodeId);
                 nodesToRemove.push_back(inpNodeId);
             }
-            else if (numConsumers[inpNodeId] > 0)
-                numConsumers[inpNodeId] -= 1;
+            numConsumers[inpNodeId] -= 1;
         }
     }
     std::sort(nodesToRemove.begin(), nodesToRemove.end());
diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index 8e9970528a33..3e73037a6017 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -260,6 +260,11 @@ const tensorflow::AttrValue& getLayerAttr(const tensorflow::NodeDef &layer, cons
     return layer.attr().at(name);
 }
 
+#if defined(__GNUC__) && (__GNUC__ >= 13)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdangling-reference"
+#endif
+
 static DataLayout getDataLayout(const tensorflow::NodeDef& layer)
 {
     if (hasLayerAttr(layer, "data_format"))
@@ -589,6 +594,7 @@ class TFImporter
     void parsePack               (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
     void parseClipByValue        (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
     void parseLeakyRelu          (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parsePReLU              (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
     void parseActivation         (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
     void parseExpandDims         (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
     void parseSquare             (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
@@ -668,6 +674,7 @@ TFImporter::DispatchMap TFImporter::buildDispatchMap()
     dispatch["Pack"] = &TFImporter::parsePack;
     dispatch["ClipByValue"] = &TFImporter::parseClipByValue;
     dispatch["LeakyRelu"] = &TFImporter::parseLeakyRelu;
+    dispatch["PReLU"] = &TFImporter::parsePReLU;
     dispatch["Abs"] = dispatch["Tanh"] = dispatch["Sigmoid"] = dispatch["Relu"] =
             dispatch["Elu"] = dispatch["Exp"] = dispatch["Identity"] = dispatch["Relu6"] = &TFImporter::parseActivation;
     dispatch["ExpandDims"] = &TFImporter::parseExpandDims;
@@ -2293,6 +2300,12 @@ void TFImporter::parseSoftmax(tensorflow::GraphDef& net, const tensorflow::NodeD
     CV_CheckGT(num_inputs, 0, "");
     if (hasLayerAttr(layer, "axis"))
         layerParams.set("axis", getLayerAttr(layer, "axis").i());
+    // if tf version is 2.x, use axis -1 as default
+    else if(netBin.has_versions() && (int)netBin.versions().producer() >= 2)
+        layerParams.set("axis", -1);
+    // else use axis 1 as default
+    else
+        layerParams.set("axis", 1);
 
     int id = dstNet.addLayer(name, "Softmax", layerParams);
     layer_id[name] = id;
@@ -2622,6 +2635,27 @@ void TFImporter::parseLeakyRelu(tensorflow::GraphDef& net, const tensorflow::Nod
     connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
 }
 
+void TFImporter::parsePReLU(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
+
+    Mat scales;
+    blobFromTensor(getConstBlob(layer, value_id, 1), scales);
+
+    layerParams.blobs.resize(1);
+
+    if (scales.dims == 3) {
+        // Considering scales from Keras wih HWC layout;
+        transposeND(scales, {2, 0, 1}, layerParams.blobs[0]);
+    } else {
+        layerParams.blobs[0] = scales;
+    }
+
+    int id = dstNet.addLayer(name, "PReLU", layerParams);
+    layer_id[name] = id;
+    connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+}
+
 // "Abs" "Tanh" "Sigmoid" "Relu" "Elu" "Exp" "Identity" "Relu6"
 void TFImporter::parseActivation(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
 {
@@ -2642,14 +2676,20 @@ void TFImporter::parseActivation(tensorflow::GraphDef& net, const tensorflow::No
     connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
 }
 
+// ArgMin or ArgMax node
 void TFImporter::parseArg(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
 {
     const std::string& name = layer.name();
     const std::string& type = layer.op();
 
-    Mat dimension = getTensorContent(getConstBlob(layer, value_id, 1));
-    CV_Assert(dimension.total() == 1 && dimension.type() == CV_32SC1);
-    layerParams.set("axis", *dimension.ptr<int>());
+    if (layer.input_size() < 2)
+        layerParams.set("axis", 0); // default dimension is 0
+    else
+    {
+        Mat dimension = getTensorContent(getConstBlob(layer, value_id, 1));
+        CV_Assert(dimension.total() == 1 && dimension.type() == CV_32SC1);
+        layerParams.set("axis", dimension.at<int>(0));
+    }
     layerParams.set("op", type == "ArgMax" ? "max" : "min");
     layerParams.set("keepdims", false); //tensorflow doesn't have this atrr, the output's dims minus one(default);
 
@@ -2843,6 +2883,7 @@ const tensorflow::TensorProto& TFImporter::getConstBlob(const tensorflow::NodeDe
 
     if (input_blob_index == -1)
         CV_Error(Error::StsError, "Const input blob for weights not found");
+    CV_CheckLT(input_blob_index, layer.input_size(), "Input index is out of range");
 
     Pin kernel_inp = parsePin(layer.input(input_blob_index));
     if (const_layers.find(kernel_inp.name) == const_layers.end())
@@ -2948,6 +2989,10 @@ static void addConstNodes(tensorflow::GraphDef& net, std::map<String, int>& cons
     CV_LOG_DEBUG(NULL, "DNN/TF: layers_to_ignore.size() = " << layers_to_ignore.size());
 }
 
+#if defined(__GNUC__) && (__GNUC__ == 13)
+#pragma GCC diagnostic pop
+#endif
+
 // If all inputs of specific layer have the same data layout we can say that
 // this layer's output has this data layout too. Returns DNN_LAYOUT_UNKNOWN otherwise.
 DataLayout TFImporter::predictOutputDataLayout(const tensorflow::NodeDef& layer)
@@ -3197,7 +3242,7 @@ void TFLayerHandler::fillRegistry(const tensorflow::GraphDef& net)
         }
     }
     printMissing();
-};
+}
 
 bool TFLayerHandler::handleMissing(const tensorflow::NodeDef& layer)
 {
diff --git a/modules/dnn/src/tflite/tflite_importer.cpp b/modules/dnn/src/tflite/tflite_importer.cpp
index 4a186eaee00e..92bfeeef65bf 100644
--- a/modules/dnn/src/tflite/tflite_importer.cpp
+++ b/modules/dnn/src/tflite/tflite_importer.cpp
@@ -59,17 +59,28 @@ class TFLiteImporter {
     void parseUnpooling(const Operator& op, const std::string& opcode, LayerParams& layerParams);
     void parseReshape(const Operator& op, const std::string& opcode, LayerParams& layerParams);
     void parseConcat(const Operator& op, const std::string& opcode, LayerParams& layerParams);
+    void parsePack(const Operator& op, const std::string& opcode, LayerParams& layerParams);
     void parseResize(const Operator& op, const std::string& opcode, LayerParams& layerParams);
     void parseDeconvolution(const Operator& op, const std::string& opcode, LayerParams& layerParams);
     void parseQuantize(const Operator& op, const std::string& opcode, LayerParams& layerParams);
     void parseDequantize(const Operator& op, const std::string& opcode, LayerParams& layerParams);
     void parseDetectionPostProcess(const Operator& op, const std::string& opcode, LayerParams& layerParams);
     void parseActivation(const Operator& op, const std::string& opcode, LayerParams& layerParams);
+    void parseSplit(const Operator& op, const std::string& opcode, LayerParams& layerParams);
+    void parseFullyConnected(const Operator& op, const std::string& opcode, LayerParams& layerParams);
+    void parseSoftmax(const Operator& op, const std::string& opcode, LayerParams& layerParams);
+    void parseCast(const Operator& op, const std::string& opcode, LayerParams& layerParams);
+    void parseTranspose(const Operator& op, const std::string& opcode, LayerParams& layerParams);
+    void parseGlobalPooling(const Operator& op, const std::string& opcode, LayerParams& layerParams);
 
     void parseFusedActivation(const Operator& op, ActivationFunctionType activ);
     void parseActivation(const Operator& op, const std::string& opcode, LayerParams& layerParams, bool isFused);
     void addLayer(LayerParams& layerParams, const Operator& op);
     int addPermuteLayer(const std::vector<int>& order, const std::string& permName, const std::pair<int, int>& inpId, int dtype);
+    int addReshapeLayer(const std::vector<int>& shape, int axis, int num_axes,
+                        const std::string& name, const std::pair<int, int>& inpId, int dtype);
+    int addFlattenLayer(int axis, int end_axis, const std::string& name, const std::pair<int, int>& inpId, int dtype);
+
     inline bool isInt8(const Operator& op);
     inline void getQuantParams(const Operator& op, float& inpScale, int& inpZero, float& outScale, int& outZero);
 };
@@ -98,7 +109,7 @@ Mat TFLiteImporter::parseTensor(const Tensor& tensor)
         dtype = CV_32S;
         break;
     case TensorType_FLOAT16:
-        dtype = CV_16S;
+        dtype = CV_16F;
         break;
     case TensorType_INT8:
         dtype = CV_8S;
@@ -106,7 +117,7 @@ Mat TFLiteImporter::parseTensor(const Tensor& tensor)
     default:
         CV_Error(Error::StsNotImplemented, format("Parse tensor with type %s", EnumNameTensorType(tensor.type())));
     }
-    return Mat(shape, dtype, const_cast<void*>(data));
+    return shape.empty() ? Mat() : Mat(shape, dtype, const_cast<void*>(data));
 }
 
 TFLiteImporter::TFLiteImporter(Net& dstNet, const char* modelBuffer, size_t bufSize)
@@ -224,7 +235,7 @@ void TFLiteImporter::populateNet()
                 if (!data.empty()) {
                     // Dequantize a buffer
                     Mat dataFP32;
-                    convertFp16(data, dataFP32);
+                    data.convertTo(dataFP32, CV_32F);
                     allTensors[op_outputs->Get(0)] = dataFP32;
                     continue;
                 }
@@ -267,11 +278,18 @@ TFLiteImporter::DispatchMap TFLiteImporter::buildDispatchMap()
     dispatch["PAD"] = &TFLiteImporter::parsePadding;
     dispatch["RESHAPE"] = &TFLiteImporter::parseReshape;
     dispatch["CONCATENATION"] = &TFLiteImporter::parseConcat;
+    dispatch["PACK"] = &TFLiteImporter::parsePack;
     dispatch["RESIZE_BILINEAR"] = dispatch["RESIZE_NEAREST_NEIGHBOR"] = &TFLiteImporter::parseResize;
     dispatch["Convolution2DTransposeBias"] = &TFLiteImporter::parseDeconvolution;
     dispatch["QUANTIZE"] = &TFLiteImporter::parseQuantize;
     dispatch["DEQUANTIZE"] = &TFLiteImporter::parseDequantize;
+    dispatch["SPLIT"] = &TFLiteImporter::parseSplit;
+    dispatch["FULLY_CONNECTED"] = &TFLiteImporter::parseFullyConnected;
+    dispatch["SOFTMAX"] = &TFLiteImporter::parseSoftmax;
+    dispatch["CAST"] = &TFLiteImporter::parseCast;
     dispatch["TFLite_Detection_PostProcess"] = &TFLiteImporter::parseDetectionPostProcess;
+    dispatch["TRANSPOSE"] = &TFLiteImporter::parseTranspose;
+    dispatch["MEAN"] = dispatch["REDUCE_MAX"] = &TFLiteImporter::parseGlobalPooling;
     return dispatch;
 }
 
@@ -288,6 +306,10 @@ void TFLiteImporter::addLayer(LayerParams& layerParams, const Operator& op) {
             Mat blob = allTensors[idx];
             layerParams.blobs.push_back(blob.u ? blob : blob.clone());  // some tensors are owned by OpenCV
         }
+    } else {
+        for (auto& blob : layerParams.blobs) {
+            CV_Assert(blob.u);
+        }
     }
 
     int dtype = CV_32F;
@@ -596,16 +618,6 @@ void TFLiteImporter::parseUnpooling(const Operator& op, const std::string& opcod
 void TFLiteImporter::parseReshape(const Operator& op, const std::string& opcode, LayerParams& layerParams) {
     DataLayout inpLayout = layouts[op.inputs()->Get(0)];
 
-    if (inpLayout == DNN_LAYOUT_NHWC) {
-        // Permute to NCHW
-        std::vector<int> order = {0, 2, 3, 1};
-        const std::string name = layerParams.name + "/permute";
-        auto inpId = layerIds[op.inputs()->Get(0)];
-        int permId = addPermuteLayer(order, name, inpId, isInt8(op) ? CV_8S : CV_32F);  // NCHW -> NHWC
-        layerIds[op.inputs()->Get(0)] = std::make_pair(permId, 0);
-        layouts[op.outputs()->Get(0)] = DNN_LAYOUT_NCHW;
-    }
-
     layerParams.type = "Reshape";
     std::vector<int> shape;
     if (op.inputs()->size() > 1) {
@@ -615,6 +627,22 @@ void TFLiteImporter::parseReshape(const Operator& op, const std::string& opcode,
         CV_Assert(options);
         shape.assign(options->new_shape()->begin(), options->new_shape()->end());
     }
+
+    if (inpLayout == DNN_LAYOUT_NHWC) {
+        if (shape.size() == 4) {
+            // Keep data but change a shape to OpenCV's NCHW order
+            std::swap(shape[2], shape[3]);
+            std::swap(shape[1], shape[2]);
+        } else {
+            // Permute to NCHW entire data and reshape to given a shape
+            std::vector<int> order = {0, 2, 3, 1};
+            const std::string name = layerParams.name + "/permute";
+            auto inpId = layerIds[op.inputs()->Get(0)];
+            int permId = addPermuteLayer(order, name, inpId, isInt8(op) ? CV_8S : CV_32F);  // NCHW -> NHWC
+            layerIds[op.inputs()->Get(0)] = std::make_pair(permId, 0);
+            layouts[op.outputs()->Get(0)] = DNN_LAYOUT_NCHW;
+        }
+    }
     layerParams.set("dim", DictValue::arrayInt<int*>(shape.data(), shape.size()));
     addLayer(layerParams, op);
 }
@@ -636,6 +664,47 @@ void TFLiteImporter::parseConcat(const Operator& op, const std::string& opcode,
     parseFusedActivation(op, options->fused_activation_function());
 }
 
+void TFLiteImporter::parsePack(const Operator& op, const std::string& opcode, LayerParams& layerParams) {
+    auto options = reinterpret_cast<const PackOptions*>(op.builtin_options());
+    int axis = options->axis();
+
+    DataLayout inpLayout = layouts[op.inputs()->Get(0)];
+    if (inpLayout == DNN_LAYOUT_NHWC) {
+        // OpenCV works in NCHW data layout. So change the axis correspondingly.
+        axis = normalize_axis(axis, 5);  // 5 because Pack adds a new axis so -1 would mean 4
+        static const int remap[] = {0, 1, 3, 4, 2};
+        axis = remap[axis];
+    }
+
+    // Replace Pack layer to Reshape + Concat
+    // Use a set because there are models which replicate single layer data by Pack.
+    std::set<int> op_inputs(op.inputs()->begin(), op.inputs()->end());
+    std::map<int, std::pair<int, int> > originLayerIds;
+    for (int inp : op_inputs) {
+        auto inpId = layerIds[inp];
+        int dims = modelTensors->Get(inp)->shape()->size();
+
+        std::vector<int> shape{1, -1};
+        if (axis == dims) {
+            std::swap(shape[0], shape[1]);
+        }
+        const auto name = modelTensors->Get(inp)->name()->str() + "/reshape";
+        int reshapeId = addReshapeLayer(shape, axis == dims ? dims - 1 : axis, 1,
+                                        name, inpId, isInt8(op) ? CV_8S : CV_32F);
+
+        originLayerIds[inp] = layerIds[inp];
+        layerIds[inp] = std::make_pair(reshapeId, 0);
+    }
+    layerParams.type = "Concat";
+    layerParams.set("axis", axis);
+    addLayer(layerParams, op);
+
+    // Restore origin layer inputs
+    for (const auto& ids : originLayerIds) {
+        layerIds[ids.first] = ids.second;
+    }
+}
+
 void TFLiteImporter::parseResize(const Operator& op, const std::string& opcode, LayerParams& layerParams) {
     layerParams.type = "Resize";
 
@@ -656,6 +725,80 @@ void TFLiteImporter::parseResize(const Operator& op, const std::string& opcode,
     addLayer(layerParams, op);
 }
 
+void TFLiteImporter::parseTranspose(const Operator& op, const std::string& opcode, LayerParams& layerParams)
+{
+    layerParams.type = "Permute";
+    std::vector<int> perm = allTensors[op.inputs()->Get(1)];
+
+    DataLayout inpLayout = layouts[op.inputs()->Get(0)];
+    if (inpLayout == DNN_LAYOUT_NHWC && perm.size() == 4) {
+
+        // OpenCV operates under the assumption that NCHW format, whereas TFLite defaults to NHWC.
+        // Therfore, to align these layouts, the axes of the permutation vector should be adjusted accordingly.
+        // For implementation details, please refer to the disscusion:
+        // https://github.com/opencv/opencv/pull/25297#issuecomment-2049762298
+
+        if (perm[0] != 0) {
+            CV_Error(Error::StsParseError, "The first axis should not be permuted.");
+        }
+        if (perm[1] == 1 && perm[2] == 2 && perm[3] == 3) {
+            std::vector<int> orderLP = {0, 1, 2, 3};
+            layerParams.set("order", DictValue::arrayInt<int*>(orderLP.data(), orderLP.size()));
+            layouts[op.outputs()->Get(0)] = DNN_LAYOUT_NCHW;
+        }
+        else if (perm[1] == 1 && perm[2] == 3 && perm[3] == 2) {
+            std::vector<int> orderLP = {0, 3, 2, 1};
+            layerParams.set("order", DictValue::arrayInt<int*>(orderLP.data(), orderLP.size()));
+        }
+        else if (perm[1] == 2 && perm[2] == 1 && perm[3] == 3) {
+            std::vector<int> orderLP = {0, 1, 3, 2};
+            layerParams.set("order", DictValue::arrayInt<int*>(orderLP.data(), orderLP.size()));
+            layouts[op.outputs()->Get(0)] = DNN_LAYOUT_NCHW;
+        }
+        else if (perm[1] == 2 && perm[2] == 3 && perm[3] == 1) {
+            std::vector<int> orderLP = {0, 2, 3, 1};
+            layerParams.set("order", DictValue::arrayInt<int*>(orderLP.data(), orderLP.size()));
+        }
+
+    }
+    else {
+        layerParams.set("order", DictValue::arrayInt<int*>(perm.data(), perm.size()));
+    }
+
+    addLayer(layerParams, op);
+}
+
+void TFLiteImporter::parseGlobalPooling(const Operator& op, const std::string& opcode, LayerParams& layerParams)
+{
+    layerParams.type = "Pooling";
+    if(opcode == "MEAN") {
+        layerParams.set("pool", "ave");
+    }
+    else if (opcode == "REDUCE_MAX") {
+        layerParams.set("pool", "max");
+    }
+    else {
+        CV_Error(Error::StsNotImplemented, "Unsupported pooling " + opcode);
+    }
+    layerParams.set("global_pooling", true);
+    auto options = op.builtin_options_as_ReducerOptions();
+    bool keep_dims = options->keep_dims();
+
+    if (!keep_dims) {
+        const auto name = layerParams.name;
+        layerParams.name += "/global_pooling";
+        addLayer(layerParams, op);
+
+        int out = op.outputs()->Get(0);
+        auto outId = layerIds[out];
+        int flattenId = addFlattenLayer(1, -1, name, outId, isInt8(op) ? CV_8S : CV_32F);
+        layerIds[out] = std::make_pair(flattenId, 0);
+    }
+    else {
+        addLayer(layerParams, op);
+    }
+}
+
 int TFLiteImporter::addPermuteLayer(const std::vector<int>& order, const std::string& permName,
                                     const std::pair<int, int>& inpId, int dtype)
 {
@@ -666,6 +809,28 @@ int TFLiteImporter::addPermuteLayer(const std::vector<int>& order, const std::st
     return permId;
 }
 
+int TFLiteImporter::addReshapeLayer(const std::vector<int>& shape, int axis, int num_axes,
+                                    const std::string& name, const std::pair<int, int>& inpId, int dtype)
+{
+    LayerParams lp;
+    lp.set("axis", axis);
+    lp.set("dim", DictValue::arrayInt<const int*>(shape.data(), shape.size()));
+    lp.set("num_axes", num_axes);
+    int id = dstNet.addLayer(name, "Reshape", dtype, lp);
+    dstNet.connect(inpId.first, inpId.second, id, 0);
+    return id;
+}
+
+int TFLiteImporter::addFlattenLayer(int axis, int end_axis, const std::string& name, const std::pair<int, int>& inpId, int dtype)
+{
+    LayerParams lp;
+    lp.set("axis", axis);
+    lp.set("end_axis", end_axis);
+    int id = dstNet.addLayer(name, "Flatten", dtype, lp);
+    dstNet.connect(inpId.first, inpId.second, id, 0);
+    return id;
+}
+
 void TFLiteImporter::parseDeconvolution(const Operator& op, const std::string& opcode, LayerParams& layerParams) {
     layerParams.type = "Deconvolution";
 
@@ -746,6 +911,35 @@ void TFLiteImporter::parseDequantize(const Operator& op, const std::string& opco
     addLayer(layerParams, op);
 }
 
+void TFLiteImporter::parseSplit(const Operator& op, const std::string& opcode, LayerParams& layerParams) {
+    layerParams.type = "Slice";
+    auto options = op.builtin_options_as_SplitOptions();
+    CV_Assert(options);
+    layerParams.set("num_split", options->num_splits());
+    addLayer(layerParams, op);
+}
+
+void TFLiteImporter::parseFullyConnected(const Operator& op, const std::string& opcode, LayerParams& layerParams) {
+    layerParams.type = "Gemm";
+    auto options = op.builtin_options_as_FullyConnectedOptions();
+    CV_Assert(options);
+
+    layerParams.set("transB", true);
+    layerParams.set("constB", true);
+    addLayer(layerParams, op);
+    parseFusedActivation(op, options->fused_activation_function());
+}
+
+void TFLiteImporter::parseSoftmax(const Operator& op, const std::string& opcode, LayerParams& layerParams) {
+    layerParams.type = "Softmax";
+    addLayer(layerParams, op);
+}
+
+void TFLiteImporter::parseCast(const Operator& op, const std::string& opcode, LayerParams& layerParams) {
+    layerParams.type = "Identity";
+    addLayer(layerParams, op);
+}
+
 void TFLiteImporter::parseDetectionPostProcess(const Operator& op, const std::string& opcode, LayerParams& layerParams) {
     // Parse parameters;
     std::vector<std::string> keys(1, "");
@@ -771,6 +965,8 @@ void TFLiteImporter::parseDetectionPostProcess(const Operator& op, const std::st
         parameters[keys[i]] = *reinterpret_cast<const uint32_t*>(data + offset + i * 4);
     }
 
+    parameters["num_classes"] = modelTensors->Get(op.inputs()->Get(1))->shape()->Get(2);
+
     layerParams.type = "DetectionOutput";
     layerParams.set("num_classes", parameters["num_classes"]);
     layerParams.set("share_location", true);
@@ -780,7 +976,6 @@ void TFLiteImporter::parseDetectionPostProcess(const Operator& op, const std::st
     layerParams.set("top_k", parameters["max_detections"]);
     layerParams.set("keep_top_k", parameters["max_detections"]);
     layerParams.set("code_type", "CENTER_SIZE");
-    layerParams.set("variance_encoded_in_target", true);
     layerParams.set("loc_pred_transposed", true);
 
     // Replace third input from tensor to Const layer with the priors
@@ -796,10 +991,27 @@ void TFLiteImporter::parseDetectionPostProcess(const Operator& op, const std::st
     priors.col(2) = priors.col(0) + priors.col(3);
     priors.col(3) = priors.col(1) + tmp;
 
+    float x_scale = *(float*)&parameters["x_scale"];
+    float y_scale = *(float*)&parameters["y_scale"];
+    float w_scale = *(float*)&parameters["w_scale"];
+    float h_scale = *(float*)&parameters["h_scale"];
+    if (x_scale != 1.0f || y_scale != 1.0f || w_scale != 1.0f || h_scale != 1.0f) {
+        int numPriors = priors.rows;
+        priors.resize(numPriors * 2);
+        Mat_<float> scales({1, 4}, {1.f / x_scale, 1.f / y_scale,
+                                    1.f / w_scale, 1.f / h_scale});
+        repeat(scales, numPriors, 1, priors.rowRange(numPriors, priors.rows));
+        priors = priors.reshape(1, {1, 2, (int)priors.total() / 2});
+        layerParams.set("variance_encoded_in_target", false);
+    } else {
+        priors = priors.reshape(1, {1, 1, (int)priors.total()});
+        layerParams.set("variance_encoded_in_target", true);
+    }
+
     LayerParams priorsLP;
     priorsLP.name = layerParams.name + "/priors";
     priorsLP.type = "Const";
-    priorsLP.blobs.resize(1, priors.reshape(1, {1, 1, (int)priors.total()}));
+    priorsLP.blobs.resize(1, priors);
 
     int priorsId = dstNet.addLayer(priorsLP.name, priorsLP.type, priorsLP);
     layerIds[op.inputs()->Get(2)] = std::make_pair(priorsId, 0);
@@ -858,6 +1070,8 @@ void TFLiteImporter::parseActivation(const Operator& op, const std::string& opco
                 y = std::min(std::max(x, 0.f), 6.f);
             else if (opcode == "LOGISTIC")
                 y = 1.0f / (1.0f + std::exp(-x));
+            else if (opcode == "HARD_SWISH")
+                y = x * max(0.f, min(1.f, x / 6.f + 0.5f));
             else
                 CV_Error(Error::StsNotImplemented, "Lookup table for " + opcode);
 
diff --git a/modules/dnn/src/torch/THDiskFile.cpp b/modules/dnn/src/torch/THDiskFile.cpp
index 84b6b23e8124..bede95e02154 100644
--- a/modules/dnn/src/torch/THDiskFile.cpp
+++ b/modules/dnn/src/torch/THDiskFile.cpp
@@ -375,15 +375,21 @@ static long THDiskFile_readString(THFile *self, const char *format, char **str_)
     long total = TBRS_BSZ;
     long pos = 0L;
 
+    if (p == NULL)
+        THError("read error: failed to allocate buffer");
     for (;;)
     {
       if(total-pos == 0) /* we need more space! */
       {
         total += TBRS_BSZ;
-        p = (char*)THRealloc(p, total);
+        char *new_p = (char*)THRealloc(p, total);
+        if (new_p == NULL)
+        {
+          THFree(p);
+          THError("read error: failed to reallocate buffer");
+        }
+        p = new_p;
       }
-      if (p == NULL)
-        THError("read error: failed to allocate buffer");
       pos += fread(p+pos, 1, total-pos, dfself->handle);
       if (pos < total) /* eof? */
       {
@@ -409,15 +415,21 @@ static long THDiskFile_readString(THFile *self, const char *format, char **str_)
     long pos = 0L;
     long size;
 
+    if (p == NULL)
+        THError("read error: failed to allocate buffer");
     for (;;)
     {
       if(total-pos <= 1) /* we can only write '\0' in there! */
       {
         total += TBRS_BSZ;
-        p = (char*)THRealloc(p, total);
+        char *new_p = (char*)THRealloc(p, total);
+        if (new_p == NULL)
+        {
+          THFree(p);
+          THError("read error: failed to reallocate buffer");
+        }
+        p = new_p;
       }
-      if (p == NULL)
-        THError("read error: failed to allocate buffer");
       if (fgets(p+pos, total-pos, dfself->handle) == NULL) /* eof? */
       {
         if(pos == 0L)
diff --git a/modules/dnn/src/torch/torch_importer.cpp b/modules/dnn/src/torch/torch_importer.cpp
index 3a46c8f7c0dc..08822102c7f3 100644
--- a/modules/dnn/src/torch/torch_importer.cpp
+++ b/modules/dnn/src/torch/torch_importer.cpp
@@ -84,7 +84,7 @@ enum TorchType
     TYPE_FLOAT  = CV_32F,
     TYPE_BYTE   = CV_8U,
     TYPE_CHAR   = CV_8S,
-    TYPE_SHORT  = CV_16S,
+    TYPE_SHORT  = CV_16F,
     TYPE_INT    = CV_32S,
     TYPE_LONG   = CV_32SC2
 };
@@ -276,7 +276,7 @@ struct TorchImporter
             THFile_readByteRaw(file, (uchar*)storageMat.data, size);
             break;
         case TYPE_SHORT:
-            storageMat.create(1, size, CV_16S);
+            storageMat.create(1, size, CV_16F);
             THFile_readShortRaw(file, (short*)storageMat.data, size);
             break;
         case TYPE_INT:
@@ -874,6 +874,9 @@ struct TorchImporter
             {
                 newModule->apiType = "Softmax";
                 layerParams.set("log_softmax", nnName == "LogSoftMax");
+                // set default axis to 1
+                if(!layerParams.has("axis"))
+                    layerParams.set("axis", 1);
                 curModule->modules.push_back(newModule);
             }
             else if (nnName == "SpatialCrossMapLRN")
@@ -899,7 +902,7 @@ struct TorchImporter
             {
                 readTorchTable(scalarParams, tensorParams);
 
-                float power;
+                float power = 1.0f;
                 if (nnName == "Square") power = 2.0f;
                 else if (nnName == "Sqrt") power = 0.5f;
                 else if (nnName == "Power") power = scalarParams.get<float>("pow", 1.0f);
diff --git a/modules/dnn/src/vkcom/include/op_naryeltwise.hpp b/modules/dnn/src/vkcom/include/op_naryeltwise.hpp
new file mode 100644
index 000000000000..1d108298bfe8
--- /dev/null
+++ b/modules/dnn/src/vkcom/include/op_naryeltwise.hpp
@@ -0,0 +1,87 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_OP_NARY_HPP
+#define OPENCV_OP_NARY_HPP
+
+#include "vkcom.hpp"
+#include "op_base.hpp"
+
+namespace cv { namespace dnn { namespace vkcom {
+
+#ifdef HAVE_VULKAN
+
+enum NaryShaderType
+{
+    kNaryShaderTypeBinary,
+    kNaryShaderTypeTrinary,
+    kNaryShaderTypeNary,
+    kNaryShaderTest,
+};
+
+struct NaryShaderConfig
+{
+    int local_size_x;
+    int local_size_y;
+    int local_size_z;
+};
+
+
+class OpNary : public OpBase
+{
+public:
+    // Copied from nary_eltwise_layers.cpp
+    enum class OPERATION
+    {
+        AND = 0,
+        EQUAL,
+        GREATER,
+        GREATER_EQUAL,
+        LESS,
+        LESS_EQUAL,
+        OR,
+        POW,
+        XOR,
+        BITSHIFT,
+        MAX,
+        MEAN,
+        MIN,
+        MOD,
+        PROD,
+        SUB,
+        SUM,
+        ADD,
+        DIV,
+        WHERE,
+    };
+
+    OpNary(const OPERATION naryOpType, int ninputs, int max_ndims, const std::vector<std::vector<int>> shapes, const std::vector<std::vector<size_t>> steps);
+
+    void firstForward(); // Execute only in the first forward.
+    virtual bool forward(std::vector<Tensor>& ins, std::vector<Tensor>& outs) CV_OVERRIDE;
+    Ptr<Tensor> weightTensorPtr;
+private:
+    bool computeGroupCount();
+    bool binaryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs);
+    bool trinaryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs);
+    bool naryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs);
+
+    const OPERATION naryOpType;
+    NaryShaderType shaderType;
+    NaryShaderConfig config;
+    int ninputs;
+    int max_ndims;
+    AutoBuffer<int32_t> shapesBuf;
+    AutoBuffer<int32_t> stepsBuf;
+    int nplanes; // number of planes computations are to be performed on
+    int N2; // value of shape[ndims - 2]
+    int N1; // value of shape[ndims - 1]
+
+    bool firstForwardFinsh = false;
+};
+
+#endif // HAVE_VULKAN
+
+}}} // namespace cv::dnn::vkcom
+#endif //OPENCV_OP_MATMUL_HPP
diff --git a/modules/dnn/src/vkcom/include/vkcom.hpp b/modules/dnn/src/vkcom/include/vkcom.hpp
index 4c774abfb06f..c152a74a1f45 100644
--- a/modules/dnn/src/vkcom/include/vkcom.hpp
+++ b/modules/dnn/src/vkcom/include/vkcom.hpp
@@ -51,5 +51,6 @@ bool isAvailable();
 #include "op_base.hpp"
 #include "op_conv.hpp"
 #include "op_matmul.hpp"
+#include "op_naryeltwise.hpp"
 
 #endif // OPENCV_DNN_VKCOM_HPP
diff --git a/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward.comp b/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward.comp
new file mode 100644
index 000000000000..295f157a884a
--- /dev/null
+++ b/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward.comp
@@ -0,0 +1,116 @@
+#version 450
+// #extension GL_EXT_debug_printf : enable
+#define ALL_THREAD 1024
+// #define ALL_THREAD 128 // Experimental batched operation
+#define STEP_SIZE 65536
+
+layout(binding = 0) readonly buffer Input1{
+    float matA[];
+};
+
+layout(binding = 1) readonly buffer Input2{
+    float matB[];
+};
+
+layout(binding = 2) writeonly buffer Output{
+    float matOut[];
+};
+
+layout(binding = 3) uniform Params {
+    int opType;
+    int ndims;
+} params;
+
+layout(binding = 4) readonly buffer Shape {
+    int shape[];
+};
+
+layout(binding = 5) readonly buffer Step {
+    int matStep[];
+};
+
+/* local_size_x, local_size_y, local_size_z there defines the number of invocations
+   of this compute shader in the current work group. */
+// TODO: Check if this makes any sense
+// TODO: Check if it is required to fetch PhysicalDeviceLimit from Context
+// TODO: here we shall assume that maxGroupInvocation is 1024.
+layout(local_size_x = ALL_THREAD, local_size_y = 1, local_size_z = 1) in; // TODO: Check if this makes any sense
+
+const int AND = 0;
+const int EQUAL = 1;
+const int GREATER = 2;
+const int GREATER_EQUAL = 3;
+const int LESS = 4;
+const int LESS_EQUAL = 5;
+const int OR = 6;
+const int POW = 7;
+const int XOR = 8;
+const int BITSHIFT = 9;
+const int MAX = 10;
+const int MEAN = 11;
+const int MIN = 12;
+const int MOD = 13;
+const int FMOD = 14;
+const int PROD = 15;
+const int SUB = 16;
+const int SUM = 17;
+const int ADD = 18;
+const int DIV = 19;
+const int WHERE = 20;
+
+void binary_forward()
+{
+    int ndims = params.ndims;
+    int dp1 = matStep[2 * ndims - 1];
+    int dp2 = matStep[3 * ndims - 1];
+    int dp = matStep[ndims - 1];
+    int n1 = shape[ndims - 1], n2 = shape[ndims - 2];
+
+    int plane_idx = int(gl_WorkGroupID.x);
+
+    int ptr1 = 0;
+    int ptr2 = 0;
+    int ptr = 0;
+    int idx = plane_idx;
+
+    for (int k = ndims - 3; k >= 0; --k) {
+        int next_idx = idx / shape[k];
+        int i_k = idx - next_idx * shape[k]; // i_k = idx % shape[k]
+        ptr1 += i_k * matStep[ndims + k];
+        ptr2 += i_k * matStep[2 * ndims + k];
+        ptr += i_k * matStep[k];
+        idx = next_idx;
+    }
+
+    int i2_offset = int(gl_WorkGroupID.y);
+    int i1_offset = int(gl_LocalInvocationID.x);
+
+    ptr1 += i2_offset * matStep[2 * ndims - 2];
+    ptr2 += i2_offset * matStep[3 * ndims - 2];
+    ptr += i2_offset * matStep[ndims - 2];
+
+    for (int i1 = i1_offset; i1 < n1; i1 += ALL_THREAD) {
+        switch (params.opType) {
+            case int(ADD):
+                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] + matB[ptr2 + i1 * dp2];
+                break;
+            case int(SUB):
+                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] - matB[ptr2 + i1 * dp2];
+                break;
+            case int(PROD):
+                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] * matB[ptr2 + i1 * dp2];
+                break;
+            case int(DIV):
+                matOut[ptr + i1 * dp] = matA[ptr1 + i1 * dp1] / matB[ptr2 + i1 * dp2];
+                break;
+        }
+    }
+}
+
+
+void main()
+{
+    // debugPrintfEXT("nary_eltwise_binary_forward.comp loaded\n");
+    binary_forward();
+    return;
+}
diff --git a/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward_spv.cpp b/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward_spv.cpp
new file mode 100644
index 000000000000..e4c994a8539b
--- /dev/null
+++ b/modules/dnn/src/vkcom/shader/nary_eltwise_binary_forward_spv.cpp
@@ -0,0 +1,232 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../../precomp.hpp"
+
+namespace cv { namespace dnn { namespace vkcom {
+
+extern const unsigned int nary_eltwise_binary_forward_spv[1757] = {
+    0x07230203,0x00010000,0x0008000b,0x00000131,0x00000000,0x00020011,0x00000001,0x0006000b,
+    0x00000001,0x4c534c47,0x6474732e,0x3035342e,0x00000000,0x0003000e,0x00000000,0x00000001,
+    0x0007000f,0x00000005,0x00000004,0x6e69616d,0x00000000,0x0000003c,0x00000083,0x00060010,
+    0x00000004,0x00000011,0x00000400,0x00000001,0x00000001,0x00030003,0x00000002,0x000001c2,
+    0x00040005,0x00000004,0x6e69616d,0x00000000,0x00060005,0x00000006,0x616e6962,0x665f7972,
+    0x6177726f,0x00286472,0x00040005,0x0000000a,0x6d69646e,0x00000073,0x00040005,0x0000000b,
+    0x61726150,0x0000736d,0x00050006,0x0000000b,0x00000000,0x7954706f,0x00006570,0x00050006,
+    0x0000000b,0x00000001,0x6d69646e,0x00000073,0x00040005,0x0000000d,0x61726170,0x0000736d,
+    0x00030005,0x00000012,0x00317064,0x00040005,0x00000014,0x70657453,0x00000000,0x00050006,
+    0x00000014,0x00000000,0x5374616d,0x00706574,0x00030005,0x00000016,0x00000000,0x00030005,
+    0x0000001e,0x00327064,0x00030005,0x00000025,0x00007064,0x00030005,0x0000002a,0x0000316e,
+    0x00040005,0x0000002c,0x70616853,0x00000065,0x00050006,0x0000002c,0x00000000,0x70616873,
+    0x00000065,0x00030005,0x0000002e,0x00000000,0x00030005,0x00000033,0x0000326e,0x00050005,
+    0x00000038,0x6e616c70,0x64695f65,0x00000078,0x00060005,0x0000003c,0x575f6c67,0x476b726f,
+    0x70756f72,0x00004449,0x00040005,0x00000042,0x31727470,0x00000000,0x00040005,0x00000043,
+    0x32727470,0x00000000,0x00030005,0x00000044,0x00727470,0x00030005,0x00000045,0x00786469,
+    0x00030005,0x00000047,0x0000006b,0x00050005,0x00000052,0x7478656e,0x7864695f,0x00000000,
+    0x00030005,0x00000058,0x006b5f69,0x00050005,0x0000007d,0x6f5f3269,0x65736666,0x00000074,
+    0x00050005,0x00000082,0x6f5f3169,0x65736666,0x00000074,0x00080005,0x00000083,0x4c5f6c67,
+    0x6c61636f,0x6f766e49,0x69746163,0x44496e6f,0x00000000,0x00030005,0x000000a1,0x00003169,
+    0x00040005,0x000000b4,0x7074754f,0x00007475,0x00050006,0x000000b4,0x00000000,0x4f74616d,
+    0x00007475,0x00030005,0x000000b6,0x00000000,0x00040005,0x000000bd,0x75706e49,0x00003174,
+    0x00050006,0x000000bd,0x00000000,0x4174616d,0x00000000,0x00030005,0x000000bf,0x00000000,
+    0x00040005,0x000000c9,0x75706e49,0x00003274,0x00050006,0x000000c9,0x00000000,0x4274616d,
+    0x00000000,0x00030005,0x000000cb,0x00000000,0x00050048,0x0000000b,0x00000000,0x00000023,
+    0x00000000,0x00050048,0x0000000b,0x00000001,0x00000023,0x00000004,0x00030047,0x0000000b,
+    0x00000002,0x00040047,0x0000000d,0x00000022,0x00000000,0x00040047,0x0000000d,0x00000021,
+    0x00000003,0x00040047,0x00000013,0x00000006,0x00000004,0x00040048,0x00000014,0x00000000,
+    0x00000018,0x00050048,0x00000014,0x00000000,0x00000023,0x00000000,0x00030047,0x00000014,
+    0x00000003,0x00040047,0x00000016,0x00000022,0x00000000,0x00040047,0x00000016,0x00000021,
+    0x00000005,0x00040047,0x0000002b,0x00000006,0x00000004,0x00040048,0x0000002c,0x00000000,
+    0x00000018,0x00050048,0x0000002c,0x00000000,0x00000023,0x00000000,0x00030047,0x0000002c,
+    0x00000003,0x00040047,0x0000002e,0x00000022,0x00000000,0x00040047,0x0000002e,0x00000021,
+    0x00000004,0x00040047,0x0000003c,0x0000000b,0x0000001a,0x00040047,0x00000083,0x0000000b,
+    0x0000001b,0x00040047,0x000000b3,0x00000006,0x00000004,0x00040048,0x000000b4,0x00000000,
+    0x00000019,0x00050048,0x000000b4,0x00000000,0x00000023,0x00000000,0x00030047,0x000000b4,
+    0x00000003,0x00040047,0x000000b6,0x00000022,0x00000000,0x00040047,0x000000b6,0x00000021,
+    0x00000002,0x00040047,0x000000bc,0x00000006,0x00000004,0x00040048,0x000000bd,0x00000000,
+    0x00000018,0x00050048,0x000000bd,0x00000000,0x00000023,0x00000000,0x00030047,0x000000bd,
+    0x00000003,0x00040047,0x000000bf,0x00000022,0x00000000,0x00040047,0x000000bf,0x00000021,
+    0x00000000,0x00040047,0x000000c8,0x00000006,0x00000004,0x00040048,0x000000c9,0x00000000,
+    0x00000018,0x00050048,0x000000c9,0x00000000,0x00000023,0x00000000,0x00030047,0x000000c9,
+    0x00000003,0x00040047,0x000000cb,0x00000022,0x00000000,0x00040047,0x000000cb,0x00000021,
+    0x00000001,0x00040047,0x0000011f,0x0000000b,0x00000019,0x00020013,0x00000002,0x00030021,
+    0x00000003,0x00000002,0x00040015,0x00000008,0x00000020,0x00000001,0x00040020,0x00000009,
+    0x00000007,0x00000008,0x0004001e,0x0000000b,0x00000008,0x00000008,0x00040020,0x0000000c,
+    0x00000002,0x0000000b,0x0004003b,0x0000000c,0x0000000d,0x00000002,0x0004002b,0x00000008,
+    0x0000000e,0x00000001,0x00040020,0x0000000f,0x00000002,0x00000008,0x0003001d,0x00000013,
+    0x00000008,0x0003001e,0x00000014,0x00000013,0x00040020,0x00000015,0x00000002,0x00000014,
+    0x0004003b,0x00000015,0x00000016,0x00000002,0x0004002b,0x00000008,0x00000017,0x00000000,
+    0x0004002b,0x00000008,0x00000018,0x00000002,0x0004002b,0x00000008,0x0000001f,0x00000003,
+    0x0003001d,0x0000002b,0x00000008,0x0003001e,0x0000002c,0x0000002b,0x00040020,0x0000002d,
+    0x00000002,0x0000002c,0x0004003b,0x0000002d,0x0000002e,0x00000002,0x00040015,0x00000039,
+    0x00000020,0x00000000,0x00040017,0x0000003a,0x00000039,0x00000003,0x00040020,0x0000003b,
+    0x00000001,0x0000003a,0x0004003b,0x0000003b,0x0000003c,0x00000001,0x0004002b,0x00000039,
+    0x0000003d,0x00000000,0x00040020,0x0000003e,0x00000001,0x00000039,0x00020014,0x00000050,
+    0x0004002b,0x00000039,0x0000007e,0x00000001,0x0004003b,0x0000003b,0x00000083,0x00000001,
+    0x00030016,0x000000b2,0x00000020,0x0003001d,0x000000b3,0x000000b2,0x0003001e,0x000000b4,
+    0x000000b3,0x00040020,0x000000b5,0x00000002,0x000000b4,0x0004003b,0x000000b5,0x000000b6,
+    0x00000002,0x0003001d,0x000000bc,0x000000b2,0x0003001e,0x000000bd,0x000000bc,0x00040020,
+    0x000000be,0x00000002,0x000000bd,0x0004003b,0x000000be,0x000000bf,0x00000002,0x00040020,
+    0x000000c5,0x00000002,0x000000b2,0x0003001d,0x000000c8,0x000000b2,0x0003001e,0x000000c9,
+    0x000000c8,0x00040020,0x000000ca,0x00000002,0x000000c9,0x0004003b,0x000000ca,0x000000cb,
+    0x00000002,0x0004002b,0x00000008,0x00000119,0x00000400,0x0004002b,0x00000039,0x0000011e,
+    0x00000400,0x0006002c,0x0000003a,0x0000011f,0x0000011e,0x0000007e,0x0000007e,0x0004002b,
+    0x00000008,0x00000120,0x00000004,0x0004002b,0x00000008,0x00000121,0x00000005,0x0004002b,
+    0x00000008,0x00000122,0x00000006,0x0004002b,0x00000008,0x00000123,0x00000007,0x0004002b,
+    0x00000008,0x00000124,0x00000008,0x0004002b,0x00000008,0x00000125,0x00000009,0x0004002b,
+    0x00000008,0x00000126,0x0000000a,0x0004002b,0x00000008,0x00000127,0x0000000b,0x0004002b,
+    0x00000008,0x00000128,0x0000000c,0x0004002b,0x00000008,0x00000129,0x0000000d,0x0004002b,
+    0x00000008,0x0000012a,0x0000000e,0x0004002b,0x00000008,0x0000012b,0x0000000f,0x0004002b,
+    0x00000008,0x0000012c,0x00000010,0x0004002b,0x00000008,0x0000012d,0x00000011,0x0004002b,
+    0x00000008,0x0000012e,0x00000012,0x0004002b,0x00000008,0x0000012f,0x00000013,0x0004002b,
+    0x00000008,0x00000130,0x00000014,0x00050036,0x00000002,0x00000004,0x00000000,0x00000003,
+    0x000200f8,0x00000005,0x00040039,0x00000002,0x0000011c,0x00000006,0x000100fd,0x00010038,
+    0x00050036,0x00000002,0x00000006,0x00000000,0x00000003,0x000200f8,0x00000007,0x0004003b,
+    0x00000009,0x0000000a,0x00000007,0x0004003b,0x00000009,0x00000012,0x00000007,0x0004003b,
+    0x00000009,0x0000001e,0x00000007,0x0004003b,0x00000009,0x00000025,0x00000007,0x0004003b,
+    0x00000009,0x0000002a,0x00000007,0x0004003b,0x00000009,0x00000033,0x00000007,0x0004003b,
+    0x00000009,0x00000038,0x00000007,0x0004003b,0x00000009,0x00000042,0x00000007,0x0004003b,
+    0x00000009,0x00000043,0x00000007,0x0004003b,0x00000009,0x00000044,0x00000007,0x0004003b,
+    0x00000009,0x00000045,0x00000007,0x0004003b,0x00000009,0x00000047,0x00000007,0x0004003b,
+    0x00000009,0x00000052,0x00000007,0x0004003b,0x00000009,0x00000058,0x00000007,0x0004003b,
+    0x00000009,0x0000007d,0x00000007,0x0004003b,0x00000009,0x00000082,0x00000007,0x0004003b,
+    0x00000009,0x000000a1,0x00000007,0x00050041,0x0000000f,0x00000010,0x0000000d,0x0000000e,
+    0x0004003d,0x00000008,0x00000011,0x00000010,0x0003003e,0x0000000a,0x00000011,0x0004003d,
+    0x00000008,0x00000019,0x0000000a,0x00050084,0x00000008,0x0000001a,0x00000018,0x00000019,
+    0x00050082,0x00000008,0x0000001b,0x0000001a,0x0000000e,0x00060041,0x0000000f,0x0000001c,
+    0x00000016,0x00000017,0x0000001b,0x0004003d,0x00000008,0x0000001d,0x0000001c,0x0003003e,
+    0x00000012,0x0000001d,0x0004003d,0x00000008,0x00000020,0x0000000a,0x00050084,0x00000008,
+    0x00000021,0x0000001f,0x00000020,0x00050082,0x00000008,0x00000022,0x00000021,0x0000000e,
+    0x00060041,0x0000000f,0x00000023,0x00000016,0x00000017,0x00000022,0x0004003d,0x00000008,
+    0x00000024,0x00000023,0x0003003e,0x0000001e,0x00000024,0x0004003d,0x00000008,0x00000026,
+    0x0000000a,0x00050082,0x00000008,0x00000027,0x00000026,0x0000000e,0x00060041,0x0000000f,
+    0x00000028,0x00000016,0x00000017,0x00000027,0x0004003d,0x00000008,0x00000029,0x00000028,
+    0x0003003e,0x00000025,0x00000029,0x0004003d,0x00000008,0x0000002f,0x0000000a,0x00050082,
+    0x00000008,0x00000030,0x0000002f,0x0000000e,0x00060041,0x0000000f,0x00000031,0x0000002e,
+    0x00000017,0x00000030,0x0004003d,0x00000008,0x00000032,0x00000031,0x0003003e,0x0000002a,
+    0x00000032,0x0004003d,0x00000008,0x00000034,0x0000000a,0x00050082,0x00000008,0x00000035,
+    0x00000034,0x00000018,0x00060041,0x0000000f,0x00000036,0x0000002e,0x00000017,0x00000035,
+    0x0004003d,0x00000008,0x00000037,0x00000036,0x0003003e,0x00000033,0x00000037,0x00050041,
+    0x0000003e,0x0000003f,0x0000003c,0x0000003d,0x0004003d,0x00000039,0x00000040,0x0000003f,
+    0x0004007c,0x00000008,0x00000041,0x00000040,0x0003003e,0x00000038,0x00000041,0x0003003e,
+    0x00000042,0x00000017,0x0003003e,0x00000043,0x00000017,0x0003003e,0x00000044,0x00000017,
+    0x0004003d,0x00000008,0x00000046,0x00000038,0x0003003e,0x00000045,0x00000046,0x0004003d,
+    0x00000008,0x00000048,0x0000000a,0x00050082,0x00000008,0x00000049,0x00000048,0x0000001f,
+    0x0003003e,0x00000047,0x00000049,0x000200f9,0x0000004a,0x000200f8,0x0000004a,0x000400f6,
+    0x0000004c,0x0000004d,0x00000000,0x000200f9,0x0000004e,0x000200f8,0x0000004e,0x0004003d,
+    0x00000008,0x0000004f,0x00000047,0x000500af,0x00000050,0x00000051,0x0000004f,0x00000017,
+    0x000400fa,0x00000051,0x0000004b,0x0000004c,0x000200f8,0x0000004b,0x0004003d,0x00000008,
+    0x00000053,0x00000045,0x0004003d,0x00000008,0x00000054,0x00000047,0x00060041,0x0000000f,
+    0x00000055,0x0000002e,0x00000017,0x00000054,0x0004003d,0x00000008,0x00000056,0x00000055,
+    0x00050087,0x00000008,0x00000057,0x00000053,0x00000056,0x0003003e,0x00000052,0x00000057,
+    0x0004003d,0x00000008,0x00000059,0x00000045,0x0004003d,0x00000008,0x0000005a,0x00000052,
+    0x0004003d,0x00000008,0x0000005b,0x00000047,0x00060041,0x0000000f,0x0000005c,0x0000002e,
+    0x00000017,0x0000005b,0x0004003d,0x00000008,0x0000005d,0x0000005c,0x00050084,0x00000008,
+    0x0000005e,0x0000005a,0x0000005d,0x00050082,0x00000008,0x0000005f,0x00000059,0x0000005e,
+    0x0003003e,0x00000058,0x0000005f,0x0004003d,0x00000008,0x00000060,0x00000058,0x0004003d,
+    0x00000008,0x00000061,0x0000000a,0x0004003d,0x00000008,0x00000062,0x00000047,0x00050080,
+    0x00000008,0x00000063,0x00000061,0x00000062,0x00060041,0x0000000f,0x00000064,0x00000016,
+    0x00000017,0x00000063,0x0004003d,0x00000008,0x00000065,0x00000064,0x00050084,0x00000008,
+    0x00000066,0x00000060,0x00000065,0x0004003d,0x00000008,0x00000067,0x00000042,0x00050080,
+    0x00000008,0x00000068,0x00000067,0x00000066,0x0003003e,0x00000042,0x00000068,0x0004003d,
+    0x00000008,0x00000069,0x00000058,0x0004003d,0x00000008,0x0000006a,0x0000000a,0x00050084,
+    0x00000008,0x0000006b,0x00000018,0x0000006a,0x0004003d,0x00000008,0x0000006c,0x00000047,
+    0x00050080,0x00000008,0x0000006d,0x0000006b,0x0000006c,0x00060041,0x0000000f,0x0000006e,
+    0x00000016,0x00000017,0x0000006d,0x0004003d,0x00000008,0x0000006f,0x0000006e,0x00050084,
+    0x00000008,0x00000070,0x00000069,0x0000006f,0x0004003d,0x00000008,0x00000071,0x00000043,
+    0x00050080,0x00000008,0x00000072,0x00000071,0x00000070,0x0003003e,0x00000043,0x00000072,
+    0x0004003d,0x00000008,0x00000073,0x00000058,0x0004003d,0x00000008,0x00000074,0x00000047,
+    0x00060041,0x0000000f,0x00000075,0x00000016,0x00000017,0x00000074,0x0004003d,0x00000008,
+    0x00000076,0x00000075,0x00050084,0x00000008,0x00000077,0x00000073,0x00000076,0x0004003d,
+    0x00000008,0x00000078,0x00000044,0x00050080,0x00000008,0x00000079,0x00000078,0x00000077,
+    0x0003003e,0x00000044,0x00000079,0x0004003d,0x00000008,0x0000007a,0x00000052,0x0003003e,
+    0x00000045,0x0000007a,0x000200f9,0x0000004d,0x000200f8,0x0000004d,0x0004003d,0x00000008,
+    0x0000007b,0x00000047,0x00050082,0x00000008,0x0000007c,0x0000007b,0x0000000e,0x0003003e,
+    0x00000047,0x0000007c,0x000200f9,0x0000004a,0x000200f8,0x0000004c,0x00050041,0x0000003e,
+    0x0000007f,0x0000003c,0x0000007e,0x0004003d,0x00000039,0x00000080,0x0000007f,0x0004007c,
+    0x00000008,0x00000081,0x00000080,0x0003003e,0x0000007d,0x00000081,0x00050041,0x0000003e,
+    0x00000084,0x00000083,0x0000003d,0x0004003d,0x00000039,0x00000085,0x00000084,0x0004007c,
+    0x00000008,0x00000086,0x00000085,0x0003003e,0x00000082,0x00000086,0x0004003d,0x00000008,
+    0x00000087,0x0000007d,0x0004003d,0x00000008,0x00000088,0x0000000a,0x00050084,0x00000008,
+    0x00000089,0x00000018,0x00000088,0x00050082,0x00000008,0x0000008a,0x00000089,0x00000018,
+    0x00060041,0x0000000f,0x0000008b,0x00000016,0x00000017,0x0000008a,0x0004003d,0x00000008,
+    0x0000008c,0x0000008b,0x00050084,0x00000008,0x0000008d,0x00000087,0x0000008c,0x0004003d,
+    0x00000008,0x0000008e,0x00000042,0x00050080,0x00000008,0x0000008f,0x0000008e,0x0000008d,
+    0x0003003e,0x00000042,0x0000008f,0x0004003d,0x00000008,0x00000090,0x0000007d,0x0004003d,
+    0x00000008,0x00000091,0x0000000a,0x00050084,0x00000008,0x00000092,0x0000001f,0x00000091,
+    0x00050082,0x00000008,0x00000093,0x00000092,0x00000018,0x00060041,0x0000000f,0x00000094,
+    0x00000016,0x00000017,0x00000093,0x0004003d,0x00000008,0x00000095,0x00000094,0x00050084,
+    0x00000008,0x00000096,0x00000090,0x00000095,0x0004003d,0x00000008,0x00000097,0x00000043,
+    0x00050080,0x00000008,0x00000098,0x00000097,0x00000096,0x0003003e,0x00000043,0x00000098,
+    0x0004003d,0x00000008,0x00000099,0x0000007d,0x0004003d,0x00000008,0x0000009a,0x0000000a,
+    0x00050082,0x00000008,0x0000009b,0x0000009a,0x00000018,0x00060041,0x0000000f,0x0000009c,
+    0x00000016,0x00000017,0x0000009b,0x0004003d,0x00000008,0x0000009d,0x0000009c,0x00050084,
+    0x00000008,0x0000009e,0x00000099,0x0000009d,0x0004003d,0x00000008,0x0000009f,0x00000044,
+    0x00050080,0x00000008,0x000000a0,0x0000009f,0x0000009e,0x0003003e,0x00000044,0x000000a0,
+    0x0004003d,0x00000008,0x000000a2,0x00000082,0x0003003e,0x000000a1,0x000000a2,0x000200f9,
+    0x000000a3,0x000200f8,0x000000a3,0x000400f6,0x000000a5,0x000000a6,0x00000000,0x000200f9,
+    0x000000a7,0x000200f8,0x000000a7,0x0004003d,0x00000008,0x000000a8,0x000000a1,0x0004003d,
+    0x00000008,0x000000a9,0x0000002a,0x000500b1,0x00000050,0x000000aa,0x000000a8,0x000000a9,
+    0x000400fa,0x000000aa,0x000000a4,0x000000a5,0x000200f8,0x000000a4,0x00050041,0x0000000f,
+    0x000000ab,0x0000000d,0x00000017,0x0004003d,0x00000008,0x000000ac,0x000000ab,0x000300f7,
+    0x000000b1,0x00000000,0x000b00fb,0x000000ac,0x000000b1,0x00000012,0x000000ad,0x00000010,
+    0x000000ae,0x0000000f,0x000000af,0x00000013,0x000000b0,0x000200f8,0x000000ad,0x0004003d,
+    0x00000008,0x000000b7,0x00000044,0x0004003d,0x00000008,0x000000b8,0x000000a1,0x0004003d,
+    0x00000008,0x000000b9,0x00000025,0x00050084,0x00000008,0x000000ba,0x000000b8,0x000000b9,
+    0x00050080,0x00000008,0x000000bb,0x000000b7,0x000000ba,0x0004003d,0x00000008,0x000000c0,
+    0x00000042,0x0004003d,0x00000008,0x000000c1,0x000000a1,0x0004003d,0x00000008,0x000000c2,
+    0x00000012,0x00050084,0x00000008,0x000000c3,0x000000c1,0x000000c2,0x00050080,0x00000008,
+    0x000000c4,0x000000c0,0x000000c3,0x00060041,0x000000c5,0x000000c6,0x000000bf,0x00000017,
+    0x000000c4,0x0004003d,0x000000b2,0x000000c7,0x000000c6,0x0004003d,0x00000008,0x000000cc,
+    0x00000043,0x0004003d,0x00000008,0x000000cd,0x000000a1,0x0004003d,0x00000008,0x000000ce,
+    0x0000001e,0x00050084,0x00000008,0x000000cf,0x000000cd,0x000000ce,0x00050080,0x00000008,
+    0x000000d0,0x000000cc,0x000000cf,0x00060041,0x000000c5,0x000000d1,0x000000cb,0x00000017,
+    0x000000d0,0x0004003d,0x000000b2,0x000000d2,0x000000d1,0x00050081,0x000000b2,0x000000d3,
+    0x000000c7,0x000000d2,0x00060041,0x000000c5,0x000000d4,0x000000b6,0x00000017,0x000000bb,
+    0x0003003e,0x000000d4,0x000000d3,0x000200f9,0x000000b1,0x000200f8,0x000000ae,0x0004003d,
+    0x00000008,0x000000d6,0x00000044,0x0004003d,0x00000008,0x000000d7,0x000000a1,0x0004003d,
+    0x00000008,0x000000d8,0x00000025,0x00050084,0x00000008,0x000000d9,0x000000d7,0x000000d8,
+    0x00050080,0x00000008,0x000000da,0x000000d6,0x000000d9,0x0004003d,0x00000008,0x000000db,
+    0x00000042,0x0004003d,0x00000008,0x000000dc,0x000000a1,0x0004003d,0x00000008,0x000000dd,
+    0x00000012,0x00050084,0x00000008,0x000000de,0x000000dc,0x000000dd,0x00050080,0x00000008,
+    0x000000df,0x000000db,0x000000de,0x00060041,0x000000c5,0x000000e0,0x000000bf,0x00000017,
+    0x000000df,0x0004003d,0x000000b2,0x000000e1,0x000000e0,0x0004003d,0x00000008,0x000000e2,
+    0x00000043,0x0004003d,0x00000008,0x000000e3,0x000000a1,0x0004003d,0x00000008,0x000000e4,
+    0x0000001e,0x00050084,0x00000008,0x000000e5,0x000000e3,0x000000e4,0x00050080,0x00000008,
+    0x000000e6,0x000000e2,0x000000e5,0x00060041,0x000000c5,0x000000e7,0x000000cb,0x00000017,
+    0x000000e6,0x0004003d,0x000000b2,0x000000e8,0x000000e7,0x00050083,0x000000b2,0x000000e9,
+    0x000000e1,0x000000e8,0x00060041,0x000000c5,0x000000ea,0x000000b6,0x00000017,0x000000da,
+    0x0003003e,0x000000ea,0x000000e9,0x000200f9,0x000000b1,0x000200f8,0x000000af,0x0004003d,
+    0x00000008,0x000000ec,0x00000044,0x0004003d,0x00000008,0x000000ed,0x000000a1,0x0004003d,
+    0x00000008,0x000000ee,0x00000025,0x00050084,0x00000008,0x000000ef,0x000000ed,0x000000ee,
+    0x00050080,0x00000008,0x000000f0,0x000000ec,0x000000ef,0x0004003d,0x00000008,0x000000f1,
+    0x00000042,0x0004003d,0x00000008,0x000000f2,0x000000a1,0x0004003d,0x00000008,0x000000f3,
+    0x00000012,0x00050084,0x00000008,0x000000f4,0x000000f2,0x000000f3,0x00050080,0x00000008,
+    0x000000f5,0x000000f1,0x000000f4,0x00060041,0x000000c5,0x000000f6,0x000000bf,0x00000017,
+    0x000000f5,0x0004003d,0x000000b2,0x000000f7,0x000000f6,0x0004003d,0x00000008,0x000000f8,
+    0x00000043,0x0004003d,0x00000008,0x000000f9,0x000000a1,0x0004003d,0x00000008,0x000000fa,
+    0x0000001e,0x00050084,0x00000008,0x000000fb,0x000000f9,0x000000fa,0x00050080,0x00000008,
+    0x000000fc,0x000000f8,0x000000fb,0x00060041,0x000000c5,0x000000fd,0x000000cb,0x00000017,
+    0x000000fc,0x0004003d,0x000000b2,0x000000fe,0x000000fd,0x00050085,0x000000b2,0x000000ff,
+    0x000000f7,0x000000fe,0x00060041,0x000000c5,0x00000100,0x000000b6,0x00000017,0x000000f0,
+    0x0003003e,0x00000100,0x000000ff,0x000200f9,0x000000b1,0x000200f8,0x000000b0,0x0004003d,
+    0x00000008,0x00000102,0x00000044,0x0004003d,0x00000008,0x00000103,0x000000a1,0x0004003d,
+    0x00000008,0x00000104,0x00000025,0x00050084,0x00000008,0x00000105,0x00000103,0x00000104,
+    0x00050080,0x00000008,0x00000106,0x00000102,0x00000105,0x0004003d,0x00000008,0x00000107,
+    0x00000042,0x0004003d,0x00000008,0x00000108,0x000000a1,0x0004003d,0x00000008,0x00000109,
+    0x00000012,0x00050084,0x00000008,0x0000010a,0x00000108,0x00000109,0x00050080,0x00000008,
+    0x0000010b,0x00000107,0x0000010a,0x00060041,0x000000c5,0x0000010c,0x000000bf,0x00000017,
+    0x0000010b,0x0004003d,0x000000b2,0x0000010d,0x0000010c,0x0004003d,0x00000008,0x0000010e,
+    0x00000043,0x0004003d,0x00000008,0x0000010f,0x000000a1,0x0004003d,0x00000008,0x00000110,
+    0x0000001e,0x00050084,0x00000008,0x00000111,0x0000010f,0x00000110,0x00050080,0x00000008,
+    0x00000112,0x0000010e,0x00000111,0x00060041,0x000000c5,0x00000113,0x000000cb,0x00000017,
+    0x00000112,0x0004003d,0x000000b2,0x00000114,0x00000113,0x00050088,0x000000b2,0x00000115,
+    0x0000010d,0x00000114,0x00060041,0x000000c5,0x00000116,0x000000b6,0x00000017,0x00000106,
+    0x0003003e,0x00000116,0x00000115,0x000200f9,0x000000b1,0x000200f8,0x000000b1,0x000200f9,
+    0x000000a6,0x000200f8,0x000000a6,0x0004003d,0x00000008,0x0000011a,0x000000a1,0x00050080,
+    0x00000008,0x0000011b,0x0000011a,0x00000119,0x0003003e,0x000000a1,0x0000011b,0x000200f9,
+    0x000000a3,0x000200f8,0x000000a5,0x000100fd,0x00010038
+};
+
+}}} // namespace cv::dnn::vkcom
diff --git a/modules/dnn/src/vkcom/shader/spv_shader.cpp b/modules/dnn/src/vkcom/shader/spv_shader.cpp
index 7f6b9d3ab48f..42285e5f776c 100644
--- a/modules/dnn/src/vkcom/shader/spv_shader.cpp
+++ b/modules/dnn/src/vkcom/shader/spv_shader.cpp
@@ -12,10 +12,11 @@ std::map<std::string, std::pair<const unsigned int *, size_t> > SPVMaps;
 void initSPVMaps()
 {
     SPVMaps.insert(std::make_pair("conv_1x1_fast_spv", std::make_pair(conv_1x1_fast_spv, 3134)));
-    SPVMaps.insert(std::make_pair("gemm_spv", std::make_pair(gemm_spv, 2902)));
+    SPVMaps.insert(std::make_pair("conv_depthwise_spv", std::make_pair(conv_depthwise_spv, 2092)));
     SPVMaps.insert(std::make_pair("conv_depthwise_3x3_spv", std::make_pair(conv_depthwise_3x3_spv, 1977)));
     SPVMaps.insert(std::make_pair("conv_implicit_gemm_spv", std::make_pair(conv_implicit_gemm_spv, 3565)));
-    SPVMaps.insert(std::make_pair("conv_depthwise_spv", std::make_pair(conv_depthwise_spv, 2092)));
+    SPVMaps.insert(std::make_pair("gemm_spv", std::make_pair(gemm_spv, 2902)));
+    SPVMaps.insert(std::make_pair("nary_eltwise_binary_forward_spv", std::make_pair(nary_eltwise_binary_forward_spv, 1757)));
 }
 
 }}} // namespace cv::dnn::vkcom
diff --git a/modules/dnn/src/vkcom/shader/spv_shader.hpp b/modules/dnn/src/vkcom/shader/spv_shader.hpp
index e90cf605c486..1573a926252d 100644
--- a/modules/dnn/src/vkcom/shader/spv_shader.hpp
+++ b/modules/dnn/src/vkcom/shader/spv_shader.hpp
@@ -9,10 +9,11 @@
 namespace cv { namespace dnn { namespace vkcom {
 
 extern const unsigned int conv_1x1_fast_spv[3134];
-extern const unsigned int gemm_spv[2902];
+extern const unsigned int conv_depthwise_spv[2092];
 extern const unsigned int conv_depthwise_3x3_spv[1977];
 extern const unsigned int conv_implicit_gemm_spv[3565];
-extern const unsigned int conv_depthwise_spv[2092];
+extern const unsigned int gemm_spv[2902];
+extern const unsigned int nary_eltwise_binary_forward_spv[1757];
 
 extern std::map<std::string, std::pair<const unsigned int *, size_t> > SPVMaps;
 
diff --git a/modules/dnn/src/vkcom/src/context.cpp b/modules/dnn/src/vkcom/src/context.cpp
index 2cb355a247f4..4432d58aa052 100644
--- a/modules/dnn/src/vkcom/src/context.cpp
+++ b/modules/dnn/src/vkcom/src/context.cpp
@@ -225,7 +225,7 @@ void Context::createInstance()
 
         if (result != VK_SUCCESS)
         {
-            CV_Error(CV_StsError, "Vulkan: vkEnumerateInstanceLayerProperties failed!");
+            CV_Error(cv::Error::StsError, "Vulkan: vkEnumerateInstanceLayerProperties failed!");
             return;
         }
 
@@ -234,7 +234,7 @@ void Context::createInstance()
 
         if (result != VK_SUCCESS)
         {
-            CV_Error(CV_StsError, "Vulkan: vkEnumerateInstanceLayerProperties failed!");
+            CV_Error(cv::Error::StsError, "Vulkan: vkEnumerateInstanceLayerProperties failed!");
             return;
         }
 
@@ -388,7 +388,7 @@ Context::Context()
     vkEnumeratePhysicalDevices(kInstance, &deviceCount, NULL);
     if (deviceCount == 0)
     {
-        CV_Error(CV_StsError, "Vulkan Backend: could not find a device with vulkan support!");
+        CV_Error(cv::Error::StsError, "Vulkan Backend: could not find a device with vulkan support!");
     }
 
     std::vector<VkPhysicalDevice> devices(deviceCount);
@@ -442,7 +442,7 @@ Context::Context()
     if (!cmdPoolPtr)
         cmdPoolPtr = CommandPool::create(kQueue, kQueueFamilyIndex);
     else
-        CV_Error(CV_StsError, "cmdPoolPtr has been created before!!");
+        CV_Error(cv::Error::StsError, "cmdPoolPtr has been created before!!");
 
     pipelineFactoryPtr = PipelineFactory::create();
 }
diff --git a/modules/dnn/src/vkcom/src/op_conv.cpp b/modules/dnn/src/vkcom/src/op_conv.cpp
index 9c84ffccdf60..22aa53711100 100644
--- a/modules/dnn/src/vkcom/src/op_conv.cpp
+++ b/modules/dnn/src/vkcom/src/op_conv.cpp
@@ -244,7 +244,7 @@ bool OpConv::computeGroupCount()
         group_z_ = 1;
     }
     else
-        CV_Error(CV_StsNotImplemented, "shader type is not supported at compute GroupCount.");
+        CV_Error(cv::Error::StsNotImplemented, "shader type is not supported at compute GroupCount.");
 
     CV_Assert(group_x_ <= MAX_GROUP_COUNT_X);
     CV_Assert(group_y_ <= MAX_GROUP_COUNT_Y);
diff --git a/modules/dnn/src/vkcom/src/op_naryEltwise.cpp b/modules/dnn/src/vkcom/src/op_naryEltwise.cpp
new file mode 100644
index 000000000000..bcc5cd7e2e00
--- /dev/null
+++ b/modules/dnn/src/vkcom/src/op_naryEltwise.cpp
@@ -0,0 +1,197 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../../precomp.hpp"
+#include "internal.hpp"
+#include "../include/op_naryeltwise.hpp"
+
+namespace cv { namespace dnn { namespace vkcom {
+
+#ifdef HAVE_VULKAN
+
+#define STEP_SIZE 65536
+
+#define MAX_GROUP_COUNT_X 65535
+#define MAX_GROUP_COUNT_Y 65535
+#define MAX_GROUP_COUNT_Z 65535
+
+OpNary::OpNary(const OpNary::OPERATION _naryOpType, int _ninputs, int _max_ndims,
+               const std::vector<std::vector<int>> shapes, const std::vector<std::vector<size_t>> steps)
+                : naryOpType(_naryOpType), ninputs(_ninputs), max_ndims(_max_ndims)
+{
+    CV_Assert(ninputs > 1);
+
+    shapesBuf.resize((ninputs + 1) * max_ndims);
+    stepsBuf.resize((ninputs + 1) * max_ndims);
+    for (int i = 0; i <= ninputs; i++)
+    {
+        std::copy(shapes[i].begin(), shapes[i].end(), shapesBuf.data() + i * max_ndims);
+        std::copy(steps[i].begin(), steps[i].end(), stepsBuf.data() + i * max_ndims);
+    }
+
+    // TODO(VK): support more types of operation
+    switch(naryOpType) {
+        // case OPERATION::EQUAL:
+        // case OPERATION::GREATER:
+        // case OPERATION::GREATER_EQUAL:
+        // case OPERATION::LESS:
+        // case OPERATION::LESS_EQUAL:
+        // case OPERATION::POW:
+        // case OPERATION::BITSHIFT:
+        // case OPERATION::MOD:
+        case OPERATION::PROD:
+        case OPERATION::SUB:
+        case OPERATION::ADD:
+        case OPERATION::DIV:
+        // case OPERATION::AND:
+        // case OPERATION::OR:
+        // case OPERATION::XOR:
+        {
+            CV_Assert(ninputs == 2);
+            CV_Assert(max_ndims >= 2);
+            shaderType = kNaryShaderTypeBinary;
+            shader_name = "nary_eltwise_binary_forward_spv";
+
+            // TODO(VK): confirm if this makes any sense
+            nplanes = std::accumulate(shapesBuf.data(), shapesBuf.data() + max_ndims - 2, 1, [](int32_t a, int32_t b) { return a * b; } );
+            N2 = shapesBuf.data()[max_ndims - 2];
+            N1 = shapesBuf.data()[max_ndims - 1];
+            CV_LOG_DEBUG(NULL, "max_ndims="<<max_ndims<<", nplanes="<<nplanes<<", N2="<<N2<<", N1="<<N1);
+            break;
+        }
+        case OPERATION::WHERE:
+        {
+            CV_Assert(ninputs == 3);
+            CV_Assert(max_ndims >= 2);
+            shaderType = kNaryShaderTypeTrinary;
+            shader_name = "nary_eltwise_trinary_forward_spv";
+            break;
+        }
+        // case OPERATION::MAX:
+        // case OPERATION::MEAN:
+        // case OPERATION::MIN:
+        case OPERATION::SUM:
+        {
+            CV_Assert(max_ndims >= 2);
+            shaderType = kNaryShaderTypeNary;
+            shader_name = "nary_eltwise_nary_forward_spv";
+            break;
+        }
+        //TODO(VK) add other cases
+        default:
+            CV_Error(Error::StsNotImplemented, "Unsupported nary operation type");
+    }
+    // TODO(VK): initialize OpNary class
+}
+
+void OpNary::firstForward()
+{
+    if (!firstForwardFinsh)
+    {
+        config.local_size_x = 1; // TODO(vk) determine local_size_y if necessary
+        config.local_size_y = 1; // TODO(vk) determine local_size_y if necessary
+        config.local_size_z = 1; // TODO(vk) determine local_size_z if necessary
+        computeGroupCount();
+        firstForwardFinsh = true;
+    }
+    else
+        return;
+}
+
+bool OpNary::binaryForward(std::vector<Tensor>& ins, std::vector<Tensor>& outs)
+{
+    std::vector<int32_t> param = {(int32_t)naryOpType, max_ndims};
+    std::vector<int32_t> paramSize = {(int32_t)param.size()};
+    std::vector<int32_t> dimSizes = {(ninputs + 1) * max_ndims};
+    std::vector<int32_t> actualSteps;
+
+    // TODO(VK): compute step for different dtype. Currently this is for kFormatFp32.
+    actualSteps.resize(stepsBuf.size());
+    std::transform(stepsBuf.data(), stepsBuf.data() + dimSizes[0], actualSteps.begin(), [](int32_t sz){ return sz / 4; });
+
+    Tensor paramTensor = Tensor(reinterpret_cast<const char *>(param.data()), paramSize, kFormatInt32, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
+    Tensor shapeTensor = Tensor(reinterpret_cast<const char *>(shapesBuf.data()), dimSizes, kFormatInt32, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
+    Tensor stepTensor = Tensor(reinterpret_cast<const char *>(actualSteps.data()), dimSizes, kFormatInt32, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
+
+    destTypes = {
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // input1
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // input2
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // out
+            VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, // param
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // shape
+            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // step
+    };
+
+
+    Ptr<Pipeline> pipeline = pipelineFactoryPtr->getPipeline(shader_name, destTypes);
+    Ptr<CommandBuffer> cmdBuffer = cmdPoolPtr->allocBuffer();
+    Ptr<Descriptor> desSet = pipeline->createSet();
+    VkCommandBuffer cmdBufferReal = cmdBuffer->get();
+
+    desSet->writeTensor(ins[0], 0);
+    desSet->writeTensor(ins[1], 1);
+    desSet->writeTensor(outs[0], 2);
+    desSet->writeTensor(paramTensor, 3);
+    desSet->writeTensor(shapeTensor, 4);
+    desSet->writeTensor(stepTensor, 5);
+
+    cmdBuffer->beginRecord();
+    pipeline->bind(cmdBufferReal, desSet->get());
+    vkCmdDispatch(cmdBufferReal, group_x_, group_y_, group_z_);
+    cmdBuffer->endRecord();
+    cmdPoolPtr->submitAndWait(cmdBufferReal);
+
+    return true;
+}
+
+bool OpNary::forward(std::vector<Tensor>& ins, std::vector<Tensor>& outs)
+{
+
+    firstForward();
+
+    // TODO(VK): Support more dtypes. Currently only kFormatFp32 is supported.
+    for (auto &tensor: ins)
+    {
+        CV_Assert(tensor.getFormat() == kFormatFp32);
+    }
+    for (auto &tensor: outs)
+    {
+        CV_Assert(tensor.getFormat() == kFormatFp32);
+    }
+
+    switch(shaderType) {
+        case kNaryShaderTypeBinary: {
+            return binaryForward(ins, outs);
+            break;
+        }
+        default:
+            CV_Error(Error::StsNotImplemented, "Unsupported shader type invoked.");
+    }
+
+    return true;
+}
+
+bool OpNary::computeGroupCount()
+{
+    if (shaderType == kNaryShaderTypeBinary)
+    {
+        group_x_ = nplanes; // parallelism at plane level
+        group_y_ = N2;
+        group_z_ = 1;
+    }
+    else
+    {
+        CV_Error(cv::Error::StsNotImplemented, "shader type is not supported at compute GroupCount.");
+    }
+
+    CV_Assert(group_x_ <= MAX_GROUP_COUNT_X);
+    CV_Assert(group_y_ <= MAX_GROUP_COUNT_Y);
+    CV_Assert(group_z_ <= MAX_GROUP_COUNT_Z);
+
+    return true;
+}
+
+#endif // HAVE_VULKAN
+
+}}} // namespace cv::dnn::vkcom
diff --git a/modules/dnn/src/vkcom/src/pipeline.cpp b/modules/dnn/src/vkcom/src/pipeline.cpp
index ea29d8824a9e..240102fba4ab 100644
--- a/modules/dnn/src/vkcom/src/pipeline.cpp
+++ b/modules/dnn/src/vkcom/src/pipeline.cpp
@@ -279,7 +279,7 @@ Ptr<Pipeline> PipelineFactory::getPipeline(const std::string& key, const std::ve
     // retrieve spv from SPVMaps with given key
     auto iterSPV  = SPVMaps.find(key);
     if (iterSPV == SPVMaps.end())
-        CV_Error(CV_StsError, "Can not create SPV with the given name:"+key+"!");
+        CV_Error(cv::Error::StsError, "Can not create SPV with the given name:"+key+"!");
 
     const uint32_t* spv = iterSPV->second.first;
     size_t length = iterSPV->second.second;
@@ -292,7 +292,7 @@ Ptr<Pipeline> PipelineFactory::getPipeline(const std::string& key, const std::ve
     }
     else
     {
-        CV_Error(CV_StsError, "Can not Created the VkPipeline "+key);
+        CV_Error(cv::Error::StsError, "Can not Created the VkPipeline "+key);
     }
 
     return pipeline;
diff --git a/modules/dnn/test/imagenet_cls_test_alexnet.py b/modules/dnn/test/imagenet_cls_test_alexnet.py
index 0d2564c1b7a0..30910cbb4484 100644
--- a/modules/dnn/test/imagenet_cls_test_alexnet.py
+++ b/modules/dnn/test/imagenet_cls_test_alexnet.py
@@ -155,6 +155,21 @@ def get_output(self, input_blob):
         self.net.setInput(input_blob, self.in_blob_name)
         return self.net.forward(self.out_blob_name)
 
+class DNNOnnxModel(Framework):
+    net = object
+
+    def __init__(self, onnx_file, in_blob_name, out_blob_name):
+        self.net = cv.dnn.readNetFromONNX(onnx_file)
+        self.in_blob_name = in_blob_name
+        self.out_blob_name = out_blob_name
+
+    def get_name(self):
+        return 'DNN (ONNX)'
+
+    def get_output(self, input_blob):
+        self.net.setInput(input_blob, self.in_blob_name)
+        return self.net.forward(self.out_blob_name)
+
 
 class ClsAccEvaluation:
     log = sys.stdout
diff --git a/modules/dnn/test/pascal_semsegm_test_fcn.py b/modules/dnn/test/pascal_semsegm_test_fcn.py
index d79f6be13bb7..8754d3b7f9e1 100644
--- a/modules/dnn/test/pascal_semsegm_test_fcn.py
+++ b/modules/dnn/test/pascal_semsegm_test_fcn.py
@@ -5,7 +5,7 @@
 import argparse
 import time
 
-from imagenet_cls_test_alexnet import CaffeModel, DnnCaffeModel
+from imagenet_cls_test_alexnet import CaffeModel, DNNOnnxModel
 try:
     import cv2 as cv
 except ImportError:
@@ -58,14 +58,14 @@ def __init__(self):
         pass
 
     @staticmethod
-    def process(img):
-        image_data = np.array(img).transpose(2, 0, 1).astype(np.float32)
-        mean = np.ones(image_data.shape)
-        mean[0] *= 104
-        mean[1] *= 117
-        mean[2] *= 123
-        image_data -= mean
-        image_data = np.expand_dims(image_data, 0)
+    def process(img, framework):
+        image_data = None
+        if framework == "Caffe":
+            image_data = cv.dnn.blobFromImage(img, scalefactor=1.0, mean=(123.0, 117.0, 104.0), swapRB=True)
+        elif framework == "DNN (ONNX)":
+            image_data = cv.dnn.blobFromImage(img, scalefactor=0.019, mean=(123.675, 116.28, 103.53), swapRB=True)
+        else:
+            raise ValueError("Unknown framework")
         return image_data
 
 
@@ -105,10 +105,10 @@ class PASCALDataFetch(DatasetImageFetch):
     colors = []
     i = 0
 
-    def __init__(self, img_dir, segm_dir, names_file, segm_cls_colors_file, preproc):
+    def __init__(self, img_dir, segm_dir, names_file, segm_cls_colors, preproc):
         self.img_dir = img_dir
         self.segm_dir = segm_dir
-        self.colors = self.read_colors(segm_cls_colors_file)
+        self.colors = self.read_colors(segm_cls_colors)
         self.data_prepoc = preproc
         self.i = 0
 
@@ -117,26 +117,30 @@ def __init__(self, img_dir, segm_dir, names_file, segm_cls_colors_file, preproc)
                 self.names.append(l.rstrip())
 
     @staticmethod
-    def read_colors(img_classes_file):
+    def read_colors(colors):
         result = []
-        with open(img_classes_file) as f:
-            for l in f.readlines():
-                color = np.array(map(int, l.split()[1:]))
-                result.append(DatasetImageFetch.pix_to_c(color))
+        for color in colors:
+            result.append(DatasetImageFetch.pix_to_c(color))
         return result
 
     def __iter__(self):
         return self
 
-    def next(self):
+    def __next__(self):
         if self.i < len(self.names):
             name = self.names[self.i]
             self.i += 1
             segm_file = self.segm_dir + name + ".png"
             img_file = self.img_dir + name + ".jpg"
             gt = self.color_to_gt(cv.imread(segm_file, cv.IMREAD_COLOR)[:, :, ::-1], self.colors)
-            img = self.data_prepoc.process(cv.imread(img_file, cv.IMREAD_COLOR)[:, :, ::-1])
-            return img, gt
+            img = cv.imread(img_file, cv.IMREAD_COLOR)
+            img_caffe = self.data_prepoc.process(img[:, :, ::-1], "Caffe")
+            img_dnn = self.data_prepoc.process(img[:, :, ::-1], "DNN (ONNX)")
+            img_dict = {
+                "Caffe": img_caffe,
+                "DNN (ONNX)": img_dnn
+            }
+            return img_dict, gt
         else:
             self.i = 0
             raise StopIteration
@@ -160,12 +164,13 @@ def process(self, frameworks, data_fetcher):
         blobs_l_inf_diff = [sys.float_info.min] * len(frameworks)
         inference_time = [0.0] * len(frameworks)
 
-        for in_blob, gt in data_fetcher:
+        for in_blob_dict, gt in data_fetcher:
             frameworks_out = []
             samples_handled += 1
             for i in range(len(frameworks)):
                 start = time.time()
-                out = frameworks[i].get_output(in_blob)
+                framework_name = frameworks[i].get_name()
+                out = frameworks[i].get_output(in_blob_dict[framework_name])
                 end = time.time()
                 segm = eval_segm_result(out)
                 conf_mats[i] += get_conf_mat(gt, segm[0])
@@ -198,28 +203,53 @@ def process(self, frameworks, data_fetcher):
             log_str = frameworks[0].get_name() + " vs " + frameworks[i].get_name() + ':'
             print('Final l1 diff', log_str, blobs_l1_diff[i] / blobs_l1_diff_count[i], file=self.log)
 
+# PASCAL VOC 2012 classes colors
+colors_pascal_voc_2012 = [
+    [0, 0, 0],
+    [128, 0, 0],
+    [0, 128, 0],
+    [128, 128, 0],
+    [0, 0, 128],
+    [128, 0, 128],
+    [0, 128, 128],
+    [128, 128, 128],
+    [64, 0, 0],
+    [192, 0, 0],
+    [64, 128, 0],
+    [192, 128, 0],
+    [64, 0, 128],
+    [192, 0, 128],
+    [64, 128, 128],
+    [192, 128, 128],
+    [0, 64, 0],
+    [128, 64, 0],
+    [0, 192, 0],
+    [128, 192, 0],
+    [0, 64, 128],
+]
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--imgs_dir", help="path to PASCAL VOC 2012 images dir, data/VOC2012/JPEGImages")
     parser.add_argument("--segm_dir", help="path to PASCAL VOC 2012 segmentation dir, data/VOC2012/SegmentationClass/")
     parser.add_argument("--val_names", help="path to file with validation set image names, download it here: "
                         "https://github.com/shelhamer/fcn.berkeleyvision.org/blob/master/data/pascal/seg11valid.txt")
-    parser.add_argument("--cls_file", help="path to file with colors for classes, download it here: "
-                        "https://github.com/opencv/opencv/blob/4.x/samples/data/dnn/pascal-classes.txt")
     parser.add_argument("--prototxt", help="path to caffe prototxt, download it here: "
                         "https://github.com/opencv/opencv/blob/4.x/samples/data/dnn/fcn8s-heavy-pascal.prototxt")
     parser.add_argument("--caffemodel", help="path to caffemodel file, download it here: "
                                              "http://dl.caffe.berkeleyvision.org/fcn8s-heavy-pascal.caffemodel")
-    parser.add_argument("--log", help="path to logging file")
+    parser.add_argument("--onnxmodel", help="path to onnx model file, download it here: "
+                                             "https://github.com/onnx/models/raw/491ce05590abb7551d7fae43c067c060eeb575a6/validated/vision/object_detection_segmentation/fcn/model/fcn-resnet50-12.onnx")
+    parser.add_argument("--log", help="path to logging file", default='log.txt')
     parser.add_argument("--in_blob", help="name for input blob", default='data')
     parser.add_argument("--out_blob", help="name for output blob", default='score')
     args = parser.parse_args()
 
     prep = MeanChannelsPreproc()
-    df = PASCALDataFetch(args.imgs_dir, args.segm_dir, args.val_names, args.cls_file, prep)
+    df = PASCALDataFetch(args.imgs_dir, args.segm_dir, args.val_names, colors_pascal_voc_2012, prep)
 
     fw = [CaffeModel(args.prototxt, args.caffemodel, args.in_blob, args.out_blob, True),
-          DnnCaffeModel(args.prototxt, args.caffemodel, '', args.out_blob)]
+        DNNOnnxModel(args.onnxmodel, args.in_blob, args.out_blob)]
 
     segm_eval = SemSegmEvaluation(args.log)
     segm_eval.process(fw, df)
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index da666ace017a..591ec63515c9 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -49,7 +49,10 @@ class DNNTestNetwork : public DNNTestLayer
         net.setInput(inp);
         net.setPreferableBackend(backend);
         net.setPreferableTarget(target);
-        net.enableWinograd(useWinograd);
+
+        if (target == DNN_TARGET_CPU_FP16)
+            net.enableWinograd(false);
+
         if (backend == DNN_BACKEND_HALIDE && !halideScheduler.empty())
         {
             halideScheduler = findDataFile(halideScheduler);
@@ -99,6 +102,12 @@ class DNNTestNetwork : public DNNTestLayer
     Net net;
 };
 
+TEST_P(DNNTestNetwork, DISABLED_YOLOv8n) {
+    processNet("dnn/onnx/models/yolov8n.onnx", "", Size(640, 640), "output0");
+    expectNoFallbacksFromIE(net);
+    expectNoFallbacksFromCUDA(net);
+}
+
 TEST_P(DNNTestNetwork, AlexNet)
 {
     applyTestTag(CV_TEST_TAG_MEMORY_1GB);
@@ -114,7 +123,7 @@ TEST_P(DNNTestNetwork, ResNet_50)
 {
     applyTestTag(
         (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB),
-        CV_TEST_TAG_DEBUG_LONG
+        CV_TEST_TAG_DEBUG_VERYLONG
     );
 
     processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
@@ -167,10 +176,12 @@ TEST_P(DNNTestNetwork, ENet)
 {
     applyTestTag(target == DNN_TARGET_CPU ? "" : CV_TEST_TAG_MEMORY_512MB);
 
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
@@ -194,7 +205,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
     float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 1.5e-2 : 0.0;
     float iouDiff = (target == DNN_TARGET_MYRIAD) ? 0.063  : 0.0;
     float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.262  : FLT_MIN;
-         processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
+         processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
                     inp, "detection_out", "", scoreDiff, iouDiff, detectionConfThresh);
     expectNoFallbacksFromIE(net);
 }
@@ -237,7 +248,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height)
         scoreDiff = 0.03;
         iouDiff = 0.08;
     }
-    processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
+    processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
                 inp, "detection_out", "", scoreDiff, iouDiff);
     expectNoFallbacksFromIE(net);
 }
@@ -325,8 +336,11 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
 
 TEST_P(DNNTestNetwork, SSD_VGG16)
 {
-    applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
-                 CV_TEST_TAG_DEBUG_VERYLONG);
+    applyTestTag(
+        CV_TEST_TAG_MEMORY_2GB,
+        CV_TEST_TAG_LONG,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
     if (backend == DNN_BACKEND_HALIDE && target == DNN_TARGET_CPU)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE);  // TODO HALIDE_CPU
 
@@ -447,7 +461,7 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
 {
     applyTestTag(
         (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB),
-        CV_TEST_TAG_DEBUG_LONG
+        CV_TEST_TAG_DEBUG_VERYLONG
     );
 #if defined(INF_ENGINE_RELEASE)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD
@@ -529,7 +543,7 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
     Mat img = imread(findDataFile("dnn/googlenet_1.png"));
     Mat inp = blobFromImage(img, 1.0, Size(320, 240), Scalar(103.939, 116.779, 123.68), false, false);
     // Output image has values in range [-143.526, 148.539].
-    float l1 = 2e-4, lInf = 2e-3;
+    float l1 = 2e-4, lInf = 2.4e-3;
     if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
     {
         l1 = 0.4;
@@ -543,7 +557,7 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
     else if (target == DNN_TARGET_CPU_FP16)
     {
         l1 = 0.4;
-        lInf = 19.;
+        lInf = 22.;
     }
     else if (target == DNN_TARGET_VULKAN)
     {
@@ -573,4 +587,1008 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16)
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets(true, true, false, true, true));
 
+/*
+    Backend tests of layers
+*/
+
+static void testLayer(Mat& input, Net& net, Backend backendId, Target targetId, bool skipCheck = false, bool randInput = true, double l1 = 0.0, double lInf = 0.0)
+{
+    DNNTestLayer::checkBackend(backendId, targetId);
+    if (randInput)
+        randu(input, -1.0f, 1.0f);
+
+    net.setInput(input);
+    net.setPreferableBackend(DNN_BACKEND_OPENCV);
+    Mat outputDefault = net.forward().clone();
+
+    net.setPreferableBackend(backendId);
+    net.setPreferableTarget(targetId);
+    Mat output = net.forward().clone();
+
+    if (skipCheck)
+        return;
+
+    double default_l1, default_lInf;
+    DNNTestLayer::getDefaultThresholds(backendId, targetId, &default_l1, &default_lInf);
+    if (l1 == 0.0)
+        l1 = default_l1;
+    if (lInf == 0.0)
+        lInf = default_lInf;
+    normAssert(outputDefault, output, "", l1, lInf);
+    if (cvtest::debugLevel > 0 || testing::Test::HasFailure())
+    {
+        std::cout << "l1=" << l1 << "  lInf=" << lInf << std::endl;
+        std::cout << outputDefault.reshape(1, outputDefault.total()).t() << std::endl;
+        std::cout << output.reshape(1, outputDefault.total()).t() << std::endl;
+    }
+}
+
+static void testLayer(LayerParams& params, Mat& input, Backend backendId, Target targetId, bool skipCheck = false, double l1 = 0.0, double lInf = 0.0)
+{
+    Net net;
+    net.addLayerToPrev(params.name, params.type, params);
+    testLayer(input, net, backendId, targetId, skipCheck, true, l1, lInf);
+}
+
+class Test_layers_backends : public DNNTestLayer {};
+
+////////////////////////////////////////////////////////////////////////////////
+// Padding
+////////////////////////////////////////////////////////////////////////////////
+TEST_P(Test_layers_backends, Padding)
+{
+    static const int kNumRuns = 10;
+    std::vector<int> paddings(8);
+    cv::RNG& rng = cv::theRNG();
+    for (int t = 0; t < kNumRuns; ++t)
+    {
+        for (int i = 0; i < paddings.size(); ++i)
+            paddings[i] = rng(5);
+
+        LayerParams lp;
+        lp.set("paddings", DictValue::arrayInt<int*>(&paddings[0], paddings.size()));
+        lp.type = "Padding";
+        lp.name = "testLayer";
+
+        int sz[] = {1 + (int)rng(10), 1 + (int)rng(10), 1 + (int)rng(10), 1 + (int)rng(10)};
+        Mat input(4, &sz[0], CV_32F);
+        testLayer(lp, input, backend, target);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Convolution
+////////////////////////////////////////////////////////////////////////////////
+typedef TestWithParam<tuple<Vec3i, Size, Size, Size, Size, Size, bool, tuple<Backend, Target> > > Convolution;
+TEST_P(Convolution, Accuracy)
+{
+    int inChannels = get<0>(GetParam())[0];
+    int outChannels = get<0>(GetParam())[1];
+    int group = get<0>(GetParam())[2];
+    Size inSize = get<1>(GetParam());
+    Size kernel = get<2>(GetParam());
+    Size stride = get<3>(GetParam());
+    Size pad = get<4>(GetParam());
+    Size dilation = get<5>(GetParam());
+    bool hasBias = get<6>(GetParam());
+    Backend backendId = get<0>(get<7>(GetParam()));
+    Target targetId = get<1>(get<7>(GetParam()));
+
+    bool skipCheck = false;
+
+    int sz[] = {outChannels, inChannels / group, kernel.height, kernel.width};
+    Mat weights(4, &sz[0], CV_32F);
+    randu(weights, -1.0f, 1.0f);
+
+    LayerParams lp;
+    lp.set("kernel_w", kernel.width);
+    lp.set("kernel_h", kernel.height);
+    lp.set("pad_w", pad.width);
+    lp.set("pad_h", pad.height);
+    lp.set("stride_w", stride.width);
+    lp.set("stride_h", stride.height);
+    lp.set("dilation_w", dilation.width);
+    lp.set("dilation_h", dilation.height);
+    lp.set("num_output", outChannels);
+    lp.set("group", group);
+    lp.set("bias_term", hasBias);
+    lp.type = "Convolution";
+    lp.name = "testLayer";
+    lp.blobs.push_back(weights);
+    if (hasBias)
+    {
+        Mat bias(1, outChannels, CV_32F);
+        randu(bias, -1.0f, 1.0f);
+        lp.blobs.push_back(bias);
+    }
+    int inpSz[] = {1, inChannels, inSize.height, inSize.width};
+    Mat input(4, &inpSz[0], CV_32F);
+    testLayer(lp, input, backendId, targetId, skipCheck);
+    if (skipCheck)
+        throw SkipTestException("Skip checks in unstable test");
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Convolution, testing::Combine(
+/*in channels, out channels, group*/
+             testing::Values(Vec3i(6, 4, 1), Vec3i(6, 9, 1),
+                    Vec3i(6, 4, 2), Vec3i(6, 9, 3)),
+/*in size*/  testing::Values(Size(5, 6)),
+/*kernel*/   testing::Values(Size(3, 1), Size(1, 3)),
+/*stride*/   testing::Values(Size(1, 1), Size(2, 2)),
+/*pad*/      testing::Values(Size(1, 0), Size(0, 1)),
+/*dilation*/ testing::Values(Size(1, 1), Size(2, 2)),
+/*has bias*/ testing::Bool(),
+             dnnBackendsAndTargets()
+));
+
+////////////////////////////////////////////////////////////////////////////////
+// Deconvolution
+////////////////////////////////////////////////////////////////////////////////
+typedef TestWithParam<tuple<Vec3i, Size, Size, Size, Size, Vec4i, bool, tuple<Backend, Target> > > Deconvolution;
+TEST_P(Deconvolution, Accuracy)
+{
+    int inChannels = get<0>(GetParam())[0];
+    int outChannels = get<0>(GetParam())[1];
+    int group = get<0>(GetParam())[2];
+    Size inSize = get<1>(GetParam());
+    Size kernel = get<2>(GetParam());
+    Size pad = get<3>(GetParam());
+    Size dilation = get<4>(GetParam());
+    Size stride = Size(get<5>(GetParam())[0], get<5>(GetParam())[1]);
+    Size adjPad = Size(get<5>(GetParam())[2], get<5>(GetParam())[3]);
+    bool hasBias = get<6>(GetParam());
+    Backend backendId = get<0>(get<7>(GetParam()));
+    Target targetId = get<1>(get<7>(GetParam()));
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
+            && inChannels == 6 && outChannels == 4 && group == 1
+            && kernel == Size(3, 1) && pad == Size(0, 1)
+            && stride == Size(1, 1) && dilation == Size(1, 1))
+        applyTestTag(targetId == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16,
+            CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION
+        );
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
+            && inChannels == 6 && outChannels == 4 && group == 1
+            && kernel == Size(1, 3) && pad == Size(1, 0)
+            && stride == Size(1, 1) && dilation == Size(1, 1))
+        applyTestTag(targetId == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16,
+            CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION
+        );
+#endif
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
+            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
+            && inChannels == 6 && outChannels == 4 && group == 1
+            && kernel == Size(1, 3) && pad == Size(1, 0)
+            && stride == Size(1, 1) && dilation == Size(1, 1))
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
+#endif
+
+    if (targetId == DNN_TARGET_CUDA_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
+
+    int sz[] = {inChannels, outChannels / group, kernel.height, kernel.width};
+    Mat weights(4, &sz[0], CV_32F);
+    randu(weights, -1.0f, 1.0f);
+
+    LayerParams lp;
+    lp.set("kernel_w", kernel.width);
+    lp.set("kernel_h", kernel.height);
+    lp.set("pad_w", pad.width);
+    lp.set("pad_h", pad.height);
+    lp.set("stride_w", stride.width);
+    lp.set("stride_h", stride.height);
+    lp.set("dilation_w", dilation.width);
+    lp.set("dilation_h", dilation.height);
+    lp.set("adj_w", adjPad.width);
+    lp.set("adj_h", adjPad.height);
+    lp.set("num_output", outChannels);
+    lp.set("group", group);
+    lp.set("bias_term", hasBias);
+    lp.type = "Deconvolution";
+    lp.name = "testLayer";
+    lp.blobs.push_back(weights);
+    if (hasBias)
+    {
+        Mat bias(1, outChannels, CV_32F);
+        randu(bias, -1.0f, 1.0f);
+        lp.blobs.push_back(bias);
+    }
+    int inpSz[] = {1, inChannels, inSize.height, inSize.width};
+    Mat input(4, &inpSz[0], CV_32F);
+    testLayer(lp, input, backendId, targetId);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Deconvolution, testing::Combine(
+/*in channels, out channels, group*/
+             testing::Values(Vec3i(6, 4, 1), Vec3i(6, 9, 3)),
+/*in size*/  testing::Values(Size(5, 6)),
+/*kernel*/   testing::Values(Size(3, 1), Size(1, 3)),
+/*pad*/      testing::Values(Size(1, 0), Size(0, 1)),
+/*dilation*/ testing::Values(Size(1, 1)),
+/*stride, adj. pad*/ testing::Values(Vec4i(1,1, 0,0), Vec4i(2,2, 1,0), Vec4i(1,2, 0,1)),
+/*has bias*/ testing::Bool(),
+             dnnBackendsAndTargets()
+));
+
+////////////////////////////////////////////////////////////////////////////////
+// LRN
+////////////////////////////////////////////////////////////////////////////////
+typedef TestWithParam<tuple<Vec3i, int, Vec3f, bool, std::string, tuple<Backend, Target> > > LRN;
+TEST_P(LRN, Accuracy)
+{
+    int inChannels = get<0>(GetParam())[0];
+    Size inSize = Size(get<0>(GetParam())[1], get<0>(GetParam())[2]);
+    int localSize = get<1>(GetParam());
+    float alpha = get<2>(GetParam())[0];
+    float beta = get<2>(GetParam())[1];
+    float bias = get<2>(GetParam())[2];
+    bool normBySize = get<3>(GetParam());
+    std::string nrmType = get<4>(GetParam());
+    Backend backendId = get<0>(get<5>(GetParam()));
+    Target targetId = get<1>(get<5>(GetParam()));
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
+    if ((inSize.width == 5 || inSize.height == 5) && targetId == DNN_TARGET_MYRIAD &&
+        nrmType == "ACROSS_CHANNELS")
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
+#endif
+
+    LayerParams lp;
+    lp.set("norm_region", nrmType);
+    lp.set("local_size", localSize);
+    lp.set("alpha", alpha);
+    lp.set("beta", beta);
+    lp.set("bias", bias);
+    lp.set("norm_by_size", normBySize);
+    lp.type = "LRN";
+    lp.name = "testLayer";
+
+    int sz[] = {1, inChannels, inSize.height, inSize.width};
+    Mat input(4, &sz[0], CV_32F);
+
+    double l1 = 0.0, lInf = 0.0;
+    // The OpenCL kernels use the native_ math functions which have
+    // implementation defined accuracy, so we use relaxed thresholds. See
+    // https://github.com/opencv/opencv/issues/9821 for more details.
+    if (targetId == DNN_TARGET_OPENCL)
+    {
+        l1 = 0.01;
+        lInf = 0.01;
+    }
+    testLayer(lp, input, backendId, targetId, false, l1, lInf);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, LRN, testing::Combine(
+/*input ch,w,h*/ testing::Values(Vec3i(6, 5, 8), Vec3i(7, 11, 6)),
+/*local size*/   testing::Values(3, 5),
+                 testing::Values(Vec3f(0.9f, 1.0f, 1.1f), Vec3f(0.9f, 1.1f, 1.0f),
+/*alpha, beta, bias*/   Vec3f(1.0f, 0.9f, 1.1f), Vec3f(1.0f, 1.1f, 0.9f),
+                        Vec3f(1.1f, 0.9f, 1.0f), Vec3f(1.1f, 1.0f, 0.9f)),
+/*norm_by_size*/ testing::Bool(),
+/*norm_type*/    testing::Values("ACROSS_CHANNELS", "WITHIN_CHANNEL"),
+                 dnnBackendsAndTargets()
+));
+
+////////////////////////////////////////////////////////////////////////////////
+// Average pooling
+////////////////////////////////////////////////////////////////////////////////
+typedef TestWithParam<tuple<int, Size, Size, Size, tuple<Backend, Target> > > AvePooling;
+TEST_P(AvePooling, Accuracy)
+{
+    int inChannels = get<0>(GetParam());
+    Size outSize = get<1>(GetParam());;  // Input size will be computed from parameters.
+    Size kernel = get<2>(GetParam());
+    Size stride = get<3>(GetParam());
+    Backend backendId = get<0>(get<4>(GetParam()));
+    Target targetId = get<1>(get<4>(GetParam()));
+
+#if defined(INF_ENGINE_RELEASE)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
+            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
+            && kernel == Size(1, 1) && (stride == Size(1, 1) || stride == Size(2, 2)))
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
+#endif
+
+    const int inWidth = (outSize.width - 1) * stride.width + kernel.width;
+    const int inHeight = (outSize.height - 1) * stride.height + kernel.height;
+
+    LayerParams lp;
+    lp.set("pool", "ave");
+    lp.set("kernel_w", kernel.width);
+    lp.set("kernel_h", kernel.height);
+    lp.set("stride_w", stride.width);
+    lp.set("stride_h", stride.height);
+    lp.type = "Pooling";
+    lp.name = "testLayer";
+
+    int sz[] = {1, inChannels, inHeight, inWidth};
+    Mat input(4, &sz[0], CV_32F);
+    testLayer(lp, input, backendId, targetId);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, AvePooling, testing::Combine(
+/*in channels*/ testing::Values(3, 4),
+/*out size*/    testing::Values(Size(1, 1), Size(2, 2), Size(3, 2), Size(4, 7)),
+/*kernel*/      testing::Values(Size(1, 1), Size(2, 2), Size(3, 3), Size(3, 2)),
+/*stride*/      testing::Values(Size(1, 1), Size(2, 2), Size(3, 2)),
+                dnnBackendsAndTargets()
+));
+
+////////////////////////////////////////////////////////////////////////////////
+// Maximum pooling
+////////////////////////////////////////////////////////////////////////////////
+typedef TestWithParam<tuple<int, Size, Size, Size, Size, tuple<Backend, Target> > > MaxPooling;
+TEST_P(MaxPooling, Accuracy)
+{
+    int inChannels = get<0>(GetParam());
+    Size inSize = get<1>(GetParam());
+    Size kernel = get<2>(GetParam());
+    Size stride = get<3>(GetParam());
+    Size pad = get<4>(GetParam());
+    Backend backendId = get<0>(get<5>(GetParam()));
+    Target targetId = get<1>(get<5>(GetParam()));
+
+    // https://github.com/openvinotoolkit/openvino/issues/18731
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && stride != Size(1, 1)) {
+        int ow = ceil(static_cast<float>(inSize.width + 2 * pad.width - kernel.width) / stride.width);
+        int oh = ceil(static_cast<float>(inSize.height + 2 * pad.height - kernel.height) / stride.height);
+        if (ow * stride.width >= inSize.width + pad.width || oh * stride.height >= inSize.height + pad.height)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+    }
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
+            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
+            && (stride == Size(1, 1) || stride == Size(2, 2))
+            && (pad == Size(0, 1) || pad == Size(1, 1))
+    )
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2020020000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+    LayerParams lp;
+    lp.set("pool", "max");
+    lp.set("kernel_w", kernel.width);
+    lp.set("kernel_h", kernel.height);
+    lp.set("stride_w", stride.width);
+    lp.set("stride_h", stride.height);
+    lp.set("pad_w", pad.width);
+    lp.set("pad_h", pad.height);
+    lp.type = "Pooling";
+    lp.name = "testLayer";
+
+    int sz[] = {1, inChannels, inSize.height, inSize.width};
+    Mat input(4, &sz[0], CV_32F);
+    testLayer(lp, input, backendId, targetId);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, MaxPooling, testing::Combine(
+/*in channels*/ testing::Values(3, 4),
+/*in size*/     testing::Values(Size(5, 5), Size(7, 6)),
+/*kernel*/      testing::Values(Size(2, 2), Size(3, 3), Size(3, 2)),
+/*stride*/      testing::Values(Size(1, 1), Size(2, 2), Size(3, 2)),
+/*pad*/         testing::Values(Size(0, 0), Size(1, 1), Size(0, 1)),
+                dnnBackendsAndTargets()
+));
+
+////////////////////////////////////////////////////////////////////////////////
+// Fully-connected
+////////////////////////////////////////////////////////////////////////////////
+typedef TestWithParam<tuple<int, int, Size, int, bool, tuple<Backend, Target> > > FullyConnected;
+TEST_P(FullyConnected, Accuracy)
+{
+    int batch = get<0>(GetParam());
+    int inChannels = get<1>(GetParam());
+    Size inSize = get<2>(GetParam());
+    int outChannels = get<3>(GetParam());
+    bool hasBias = get<4>(GetParam());
+    Backend backendId = get<0>(get<5>(GetParam()));
+    Target targetId = get<1>(get<5>(GetParam()));
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
+    if ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
+         backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && (targetId == DNN_TARGET_OPENCL_FP16 ||
+       (targetId == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X))) {
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
+    }
+#endif
+    // https://github.com/openvinotoolkit/openvino/issues/19436
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL_FP16 && batch == 16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2023000000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL && batch == 16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL);
+#endif
+
+    Mat weights(outChannels, inChannels * inSize.height * inSize.width, CV_32F);
+    randu(weights, -1.0f, 1.0f);
+
+    Mat bias(1, outChannels, CV_32F);
+    randu(bias, -1.0f, 1.0f);
+
+    LayerParams lp;
+    lp.set("num_output", outChannels);
+    lp.set("bias_term", hasBias);
+    lp.blobs.push_back(weights);
+    lp.blobs.push_back(bias);
+    lp.type = "InnerProduct";
+    lp.name = "testLayer";
+
+    int sz[] = {batch, inChannels, inSize.height, inSize.width};
+    Mat input(4, &sz[0], CV_32F);
+
+    double l1 = 0.0;
+    double lInf = 0.0;
+#if defined(INF_ENGINE_RELEASE)
+    if (targetId == DNN_TARGET_MYRIAD)
+    {
+        l1 = 0.015;
+        lInf = 0.025;
+    }
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL_FP16)
+    {
+        l1 = 0.01;
+        if (INF_ENGINE_VER_MAJOR_GE(2023000000))
+            lInf = 0.016;
+    }
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL)
+    {
+        l1 = 5e-3;
+        lInf = INF_ENGINE_VER_MAJOR_GE(2023000000) ? 0.016 : 7e-3;
+    }
+#endif
+    if (targetId == DNN_TARGET_CUDA_FP16)
+        l1 = 0.015;
+
+    testLayer(lp, input, backendId, targetId, false, l1, lInf);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, FullyConnected, testing::Combine(
+/*batch*/        testing::Values(1, 2, 4, 8, 16),
+/*in channels*/  testing::Values(3, 4),
+/*in size*/      testing::Values(Size(5, 4), Size(4, 5), Size(1, 1)),
+/*out channels*/ testing::Values(3, 4),
+/*has bias*/     testing::Bool(),
+                 dnnBackendsAndTargets()
+));
+
+////////////////////////////////////////////////////////////////////////////////
+// SoftMax
+////////////////////////////////////////////////////////////////////////////////
+typedef TestWithParam<tuple<int,  tuple<Backend, Target> > > SoftMax;
+TEST_P(SoftMax, Accuracy)
+{
+    int inChannels = get<0>(GetParam());
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));
+    LayerParams lp;
+    lp.type = "Softmax";
+    lp.name = "testLayer";
+
+    int sz[] = {1, inChannels, 1, 1};
+    Mat input(4, &sz[0], CV_32F);
+    testLayer(lp, input, backendId, targetId);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, SoftMax, testing::Combine(
+    testing::Values(3, 4, 5, 1024),
+    dnnBackendsAndTargets()
+));
+
+//////////////////////////////////////////////////////////////////////////////
+// Max pooling - unpooling
+//////////////////////////////////////////////////////////////////////////////
+TEST_P(Test_layers_backends, MaxPoolUnpool)
+{
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
+
+    LayerParams pool;
+    pool.set("pool", "max");
+    pool.set("kernel_w", 2);
+    pool.set("kernel_h", 2);
+    pool.set("stride_w", 2);
+    pool.set("stride_h", 2);
+    pool.set("pad_w", 0);
+    pool.set("pad_h", 0);
+    pool.type = "Pooling";
+    pool.name = "testPool";
+
+    LayerParams unpool;
+    unpool.set("pool_k_w", 2);
+    unpool.set("pool_k_h", 2);
+    unpool.set("pool_stride_w", 2);
+    unpool.set("pool_stride_h", 2);
+    unpool.set("pool_pad_w", 0);
+    unpool.set("pool_pad_h", 0);
+    unpool.type = "MaxUnpool";
+    unpool.name = "testUnpool";
+
+    Net net;
+    int poolId = net.addLayer(pool.name, pool.type, pool);
+    net.connect(0, 0, poolId, 0);
+
+    int unpoolId = net.addLayer(unpool.name, unpool.type, unpool);
+    net.connect(poolId, 0, unpoolId, 0);
+    net.connect(poolId, 1, unpoolId, 1);
+
+    int sz[] = {1, 1, 4, 4};
+    Mat input(4, &sz[0], CV_32F);
+    testLayer(input, net, backend, target);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// AvePooling + in-place layers
+////////////////////////////////////////////////////////////////////////////////
+static const int kNumChannels = 3;
+
+void testInPlaceActivation(LayerParams& lp, Backend backendId, Target targetId, double l1 = 0.0, double lInf = 0.0)
+{
+    EXPECT_FALSE(lp.name.empty());
+
+    LayerParams pool;
+    pool.set("pool", "ave");
+    pool.set("kernel_w", 2);
+    pool.set("kernel_h", 2);
+    pool.set("stride_w", 2);
+    pool.set("stride_h", 2);
+    pool.type = "Pooling";
+    pool.name = "ave_pool";
+
+    Net net;
+    int poolId = net.addLayer(pool.name, pool.type, pool);
+    net.connect(0, 0, poolId, 0);
+    net.addLayerToPrev(lp.name, lp.type, lp);
+
+    int sz[] = {1, kNumChannels, 10, 10};
+    Mat input(4, &sz[0], CV_32F);
+    testLayer(input, net, backendId, targetId, false, true, l1, lInf);
+}
+
+typedef TestWithParam<tuple<bool, bool, float, tuple<Backend, Target> > > BatchNorm;
+TEST_P(BatchNorm, Accuracy)
+{
+    bool hasWeights = get<0>(GetParam());
+    bool hasBias = get<1>(GetParam());
+    float epsilon = get<2>(GetParam());
+    Backend backendId = get<0>(get<3>(GetParam()));
+    Target targetId = get<1>(get<3>(GetParam()));
+
+    LayerParams lp;
+    lp.set("has_weight", hasWeights);
+    lp.set("has_bias", hasBias);
+    lp.set("eps", epsilon);
+    lp.type = "BatchNorm";
+    lp.name = "testLayer";
+
+    lp.blobs.reserve(4);
+    for (int i = 0; i < 3; ++i)
+        lp.blobs.push_back(Mat(1, kNumChannels, CV_32F));
+    if (hasBias || hasWeights)
+        lp.blobs.push_back(Mat(1, kNumChannels, CV_32F));
+
+    for (int i = 0; i < lp.blobs.size(); ++i)
+        randu(lp.blobs[i], 0.0f, 1.0f);
+
+    testInPlaceActivation(lp, backendId, targetId);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, BatchNorm, testing::Combine(
+/*has weights*/ testing::Bool(),
+/*has bias*/    testing::Bool(),
+/*epsilon*/     testing::Values(1e-3f, 1e-5f),
+                dnnBackendsAndTargets()
+));
+
+typedef TestWithParam<tuple<float, tuple<Backend, Target> > > ReLU;
+TEST_P(ReLU, Accuracy)
+{
+    float negativeSlope = get<0>(GetParam());
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019020000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD && negativeSlope < 0)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+    LayerParams lp;
+    lp.set("negative_slope", negativeSlope);
+    lp.type = "ReLU";
+    lp.name = "testLayer";
+    testInPlaceActivation(lp, backendId, targetId);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, ReLU, testing::Combine(
+/*negative slope*/ testing::Values(2.0f, 0.3f, -0.1f, 0.0f),
+                   dnnBackendsAndTargets()
+));
+
+typedef TestWithParam<tuple<std::string, tuple<Backend, Target> > > NoParamActivation;
+TEST_P(NoParamActivation, Accuracy)
+{
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));
+    std::string layer_type = get<0>(GetParam());
+
+    LayerParams lp;
+    lp.type = layer_type;
+    lp.name = "testLayer";
+    testInPlaceActivation(lp, backendId, targetId);
+}
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, NoParamActivation, testing::Combine(
+/*type*/ testing::Values("TanH", "Sigmoid", "AbsVal", "BNLL", "Swish", "Mish"),
+         dnnBackendsAndTargets()
+));
+
+typedef TestWithParam<tuple<Vec3f, tuple<Backend, Target> > > Power;
+TEST_P(Power, Accuracy)
+{
+    float power = get<0>(GetParam())[0];
+    float scale = get<0>(GetParam())[1];
+    float shift = get<0>(GetParam())[2];
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));
+
+    LayerParams lp;
+    lp.set("power", power);
+    lp.set("scale", scale);
+    lp.set("shift", shift);
+    lp.type = "Power";
+    lp.name = "testLayer";
+    testInPlaceActivation(lp, backendId, targetId);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Power, testing::Combine(
+/*power, scale, shift*/ testing::Values(Vec3f(0.9f, 1.0f, 1.1f), Vec3f(0.9f, 1.1f, 1.0f),
+                               Vec3f(1.0f, 0.9f, 1.1f), Vec3f(1.0f, 1.1f, 0.9f),
+                               Vec3f(1.1f, 0.9f, 1.0f), Vec3f(1.1f, 1.0f, 0.9f)),
+                        dnnBackendsAndTargets()
+));
+
+typedef TestWithParam<tuple<Vec3f, tuple<Backend, Target> > > Exp;
+TEST_P(Exp, Accuracy)
+{
+    float base = get<0>(GetParam())[0];
+    float scale = get<0>(GetParam())[1];
+    float shift = get<0>(GetParam())[2];
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));
+
+    LayerParams lp;
+    lp.set("base", base);
+    lp.set("scale", scale);
+    lp.set("shift", shift);
+    lp.type = "Exp";
+    lp.name = "testLayer";
+    testInPlaceActivation(lp, backendId, targetId);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Exp, testing::Combine(
+/*base, scale, shift*/ testing::Values(Vec3f(0.9f, -1.0f, 1.1f), Vec3f(0.9f, 1.1f, -1.0f),
+                              Vec3f(-1.0f, 0.9f, 1.1f), Vec3f(-1.0f, 1.1f, 0.9f),
+                              Vec3f(1.1f, 0.9f, -1.0f), Vec3f(1.1f, -1.0f, 0.9f)),
+                       dnnBackendsAndTargets()
+));
+
+TEST_P(Test_layers_backends, ChannelsPReLU)
+{
+    LayerParams lp;
+    lp.type = "ChannelsPReLU";
+    lp.name = "testLayer";
+    lp.blobs.push_back(Mat(1, kNumChannels, CV_32F));
+    randu(lp.blobs[0], -1.0f, 1.0f);
+
+    testInPlaceActivation(lp, backend, target);
+}
+
+typedef TestWithParam<tuple<bool, tuple<Backend, Target> > > Scale;
+TEST_P(Scale, Accuracy)
+{
+    bool hasBias = get<0>(GetParam());
+    Backend backendId = get<0>(get<1>(GetParam()));
+    Target targetId = get<1>(get<1>(GetParam()));
+
+    LayerParams lp;
+    lp.set("bias_term", hasBias);
+    lp.type = "Scale";
+    lp.name = "testLayer";
+    lp.blobs.push_back(Mat(1, kNumChannels, CV_32F));
+    randu(lp.blobs[0], -1.0f, 1.0f);
+    if (hasBias)
+    {
+        lp.blobs.push_back(Mat(1, kNumChannels, CV_32F));
+        randu(lp.blobs[1], -1.0f, 1.0f);
+    }
+    testInPlaceActivation(lp, backendId, targetId);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Scale, testing::Combine(
+    testing::Bool(),
+    dnnBackendsAndTargets()
+));
+
+////////////////////////////////////////////////////////////////////////////////
+// Concat layer
+////////////////////////////////////////////////////////////////////////////////
+//
+// input --- conv --- concat --- output
+//      `--- conv ----^ ^ ^
+//      `---- ... ------' '
+//      `-----------------'
+typedef TestWithParam<tuple<Vec3i, Vec3i, tuple<Backend, Target> > > Concat;
+TEST_P(Concat, Accuracy)
+{
+    Vec3i inSize = get<0>(GetParam());
+    Vec3i numChannels = get<1>(GetParam());
+    Backend backendId = get<0>(get<2>(GetParam()));
+    Target targetId = get<1>(get<2>(GetParam()));
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2018050000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
+            && inSize == Vec3i(1, 4, 5) && numChannels == Vec3i(1, 6, 2)
+    )
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);  // crash
+#endif
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_CPU
+            && inSize == Vec3i(1, 4, 5) && numChannels == Vec3i(1, 6, 2)
+    )
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);  // TODO: IE_CPU
+#endif
+
+    Net net;
+
+    std::vector<int> convLayerIds;
+    convLayerIds.reserve(numChannels.channels);
+    for (int i = 0, n = numChannels.channels; i < n; ++i)
+    {
+        if (!numChannels[i])
+            break;
+
+        int sz[] = {numChannels[i], inSize[0], 1, 1};
+        Mat weights(4, &sz[0], CV_32F);
+        randu(weights, -1.0f, 1.0f);
+
+        LayerParams convParam;
+        convParam.set("kernel_w", 1);
+        convParam.set("kernel_h", 1);
+        convParam.set("num_output", numChannels[i]);
+        convParam.set("bias_term", false);
+        convParam.type = "Convolution";
+        std::ostringstream ss;
+        ss << "convLayer" << i;
+        convParam.name = ss.str();
+        convParam.blobs.push_back(weights);
+
+        int layerId = net.addLayer(convParam.name, convParam.type, convParam);
+        convLayerIds.push_back(layerId);
+        net.connect(0, 0, layerId, 0);
+    }
+
+    LayerParams concatParam;
+    concatParam.type = "Concat";
+    concatParam.name = "testLayer";
+    int concatId = net.addLayer(concatParam.name, concatParam.type, concatParam);
+    net.connect(0, 0, concatId, 0);
+    for (int i = 0; i < convLayerIds.size(); ++i)
+    {
+        net.connect(convLayerIds[i], 0, concatId, i + 1);
+    }
+
+    int sz[] = {1, inSize[0], inSize[1], inSize[2]};
+    Mat input(4, &sz[0], CV_32F);
+    testLayer(input, net, backendId, targetId);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Concat, testing::Combine(
+/*input size*/ testing::Values(Vec3i(1, 4, 5), Vec3i(2, 8, 6)),
+/*channels*/   testing::Values(Vec3i(2, 0, 0), Vec3i(3, 4, 0), Vec3i(1, 6, 2)),
+               dnnBackendsAndTargets()
+));
+
+////////////////////////////////////////////////////////////////////////////////
+// Element-wise layers
+////////////////////////////////////////////////////////////////////////////////
+//
+// input --- conv --- eltwise --- output
+//      `--- conv ----^ ^ ^
+//      `---- ... ------' '
+//      `-----------------'
+typedef TestWithParam<tuple<Vec3i, std::string, int, bool, tuple<Backend, Target> > > Eltwise;
+TEST_P(Eltwise, Accuracy)
+{
+    Vec3i inSize = get<0>(GetParam());
+    std::string op = get<1>(GetParam());
+    int numConv = get<2>(GetParam());
+    bool weighted = get<3>(GetParam());
+    Backend backendId = get<0>(get<4>(GetParam()));
+    Target targetId = get<1>(get<4>(GetParam()));
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021040000)
+    // accuracy
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL &&
+        inSize == Vec3i(1, 4, 5) && op == "sum" && numConv == 1 && !weighted)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL &&
+        inSize == Vec3i(2, 8, 6) && op == "sum" && numConv == 1 && !weighted)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2018050000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD &&
+        inSize == Vec3i(1, 4, 5))
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000) && INF_ENGINE_VER_MAJOR_LT(2021040000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && numConv > 1)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_OPENCL &&
+        op == "sum" && numConv == 1 && !weighted)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+#endif
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && numConv > 1)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+    bool convInputShift = 1;
+    int numEltwiseInputs = numConv;
+    if (op == "div")
+    {
+        numConv = 1;
+        convInputShift = 0; // first input is convolution
+    }
+
+    Net net;
+
+    std::vector<int> convLayerIds(numConv);
+    for (int i = 0; i < numConv; ++i)
+    {
+        int sz[] = {inSize[0], inSize[0], 1, 1};
+        Mat weights(4, &sz[0], CV_32F);
+        randu(weights, -1.0f, 1.0f);
+
+        LayerParams convParam;
+        convParam.set("kernel_w", 1);
+        convParam.set("kernel_h", 1);
+        convParam.set("num_output", inSize[0]);
+        convParam.set("bias_term", false);
+        convParam.type = "Convolution";
+        std::ostringstream ss;
+        ss << "convLayer" << i;
+        convParam.name = ss.str();
+        convParam.blobs.push_back(weights);
+
+        convLayerIds[i] = net.addLayer(convParam.name, convParam.type, convParam);
+        net.connect(0, 0, convLayerIds[i], 0);
+    }
+
+    LayerParams eltwiseParam;
+    eltwiseParam.set("operation", op);
+    if (op == "sum" && weighted)
+    {
+        RNG& rng = cv::theRNG();
+        std::vector<float> coeff(1 + numConv);
+        for (int i = 0; i < coeff.size(); ++i)
+        {
+            coeff[i] = rng.uniform(-2.0f, 2.0f);
+        }
+        eltwiseParam.set("coeff", DictValue::arrayReal<float*>(&coeff[0], coeff.size()));
+    }
+    eltwiseParam.type = "Eltwise";
+    eltwiseParam.name = "testLayer";
+    int eltwiseId = net.addLayer(eltwiseParam.name, eltwiseParam.type, eltwiseParam);
+    if (convInputShift == 1)
+        net.connect(0, 0, eltwiseId, 0);
+    for (int i = 0; i < numConv; ++i)
+    {
+        net.connect(convLayerIds[i], 0, eltwiseId, i + convInputShift);
+    }
+    if (convInputShift == 0)
+        net.connect(0, 0, eltwiseId, numConv);
+    for (int i = numConv; i < numEltwiseInputs; ++i)
+    {
+        net.connect(0, 0, eltwiseId, i + 1);
+    }
+
+    int sz[] = {1, inSize[0], inSize[1], inSize[2]};
+    Mat input(4, &sz[0], CV_32F);
+    if (op == "div")
+        randu(input, 1.0f, 1.0f);  // ensure no divisor value has absouluate value of less than 0.5
+    testLayer(input, net, backendId, targetId, /*skipCheck*/false, (op == "div") ? false : true);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Eltwise, testing::Combine(
+/*input size*/ testing::Values(Vec3i(1, 4, 5), Vec3i(2, 8, 6)),
+/*operation*/  testing::Values("prod", "sum", "div", "max", "min"),
+/*num convs*/  testing::Values(1, 2, 3),
+/*weighted(for sum only)*/ testing::Bool(),
+               dnnBackendsAndTargets()
+));
+
+////////////////////////////////////////////////////////////////////////////////
+// Element-wise layers
+////////////////////////////////////////////////////////////////////////////////
+using NaryEltwiseConcat = TestWithParam<tuple<std::vector<int>, tuple<Backend, Target>>>;
+TEST_P(NaryEltwiseConcat, Accuracy) {
+    auto param = GetParam();
+    std::vector<int> input_shape = get<0>(param);
+    auto backend_id = get<0>(get<1>(param));
+    auto target_id = get<1>(get<1>(param));
+
+    /* Build the following net:
+
+           <1x4x84>
+           /
+        [Input] -+-> Mul(B<1x84>) -> Concat(axis=1) -> [Output]
+                 |                     |
+                 +-> Sigmoid ----------+
+
+    */
+    Net net;
+
+    std::vector<int> mul_B_shape(input_shape.size() - 1, 1);
+    mul_B_shape.back() = input_shape.back();
+    Mat mul_B(mul_B_shape, CV_32FC1);
+    randn(mul_B, 0.f, 1.f);
+    LayerParams mul_B_lp;
+    mul_B_lp.name = "mul_B";
+    mul_B_lp.type = "Const";
+    mul_B_lp.blobs.push_back(mul_B);
+    int id_mul_B = net.addLayer(mul_B_lp.name, mul_B_lp.type, mul_B_lp);
+
+    LayerParams mul_lp;
+    mul_lp.name = "mul";
+    mul_lp.type = "NaryEltwise";
+    mul_lp.set("operation", "mul");
+    int id_mul = net.addLayer(mul_lp.name, mul_lp.type, mul_lp);
+    net.connect(0, 0, id_mul, 0);
+    net.connect(id_mul_B, 0, id_mul, 1);
+
+    LayerParams sigmoid_lp;
+    sigmoid_lp.name = "sigmoid";
+    sigmoid_lp.type = "Sigmoid";
+    int id_sigmoid = net.addLayer(sigmoid_lp.name, sigmoid_lp.type, sigmoid_lp);
+    net.connect(0, 0, id_sigmoid, 0);
+
+    LayerParams concat_lp;
+    concat_lp.name = "concat";
+    concat_lp.type = "Concat";
+    concat_lp.set("axis", 1);
+    int id_concat = net.addLayer(concat_lp.name, concat_lp.type, concat_lp);
+    net.connect(id_mul, 0, id_concat, 0);
+    net.connect(id_sigmoid, 0, id_concat, 1);
+
+    // Run test
+    Mat input(input_shape, CV_32FC1);
+    testLayer(input, net, backend_id, target_id, false);
+}
+
+INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, NaryEltwiseConcat, testing::Combine(
+    testing::Values(std::vector<int>{1, 4, 84}),
+    dnnBackendsAndTargets())
+);
+
+
+
+INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_layers_backends, dnnBackendsAndTargets());
+
 }} // namespace
diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp
index 809b959a210b..4db7796e5841 100644
--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@@ -62,6 +62,10 @@ class Test_Caffe_nets : public DNNTestLayer
                                    findDataFile("dnn/" + model, false));
         net.setPreferableBackend(backend);
         net.setPreferableTarget(target);
+
+        if (target == DNN_TARGET_CPU_FP16)
+            net.enableWinograd(false);
+
         Mat img = imread(findDataFile("dnn/dog416.png"));
         resize(img, img, Size(800, 600));
         Mat blob = blobFromImage(img, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
@@ -219,6 +223,9 @@ TEST_P(Reproducibility_AlexNet, Accuracy)
     net.setPreferableBackend(DNN_BACKEND_OPENCV);
     net.setPreferableTarget(targetId);
 
+    if (targetId == DNN_TARGET_CPU_FP16)
+        net.enableWinograd(false);
+
     Mat sample = imread(_tf("grace_hopper_227.png"));
     ASSERT_TRUE(!sample.empty());
 
@@ -263,7 +270,11 @@ TEST(Reproducibility_FCN, Accuracy)
 
 TEST(Reproducibility_SSD, Accuracy)
 {
-    applyTestTag(CV_TEST_TAG_MEMORY_512MB, CV_TEST_TAG_DEBUG_LONG);
+    applyTestTag(
+        CV_TEST_TAG_MEMORY_512MB,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
+
     Net net;
     {
         const string proto = findDataFile("dnn/ssd_vgg16.prototxt");
@@ -290,8 +301,8 @@ TEST(Reproducibility_SSD, Accuracy)
 typedef testing::TestWithParam<tuple<Backend, Target> > Reproducibility_MobileNet_SSD;
 TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
 {
-    const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false);
-    const string model = findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false);
+    const string proto = findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", false);
+    const string model = findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", false);
     Net net = readNetFromCaffe(proto, model);
     int backendId = get<0>(GetParam());
     int targetId = get<1>(GetParam());
@@ -383,6 +394,9 @@ TEST_P(Reproducibility_ResNet50, Accuracy)
     net.setPreferableBackend(DNN_BACKEND_OPENCV);
     net.setPreferableTarget(targetId);
 
+    if (targetId == DNN_TARGET_CPU_FP16)
+        net.enableWinograd(false);
+
     float l1 = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 3e-5 : 1e-5;
     float lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 6e-3 : 1e-4;
 
@@ -490,7 +504,10 @@ TEST(Reproducibility_GoogLeNet_fp16, Accuracy)
 // https://github.com/richzhang/colorization
 TEST_P(Test_Caffe_nets, Colorization)
 {
-    applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+    applyTestTag(
+        target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
     checkBackend();
 
     Mat inp = blobFromNPY(_tf("colorization_inp.npy"));
@@ -503,6 +520,10 @@ TEST_P(Test_Caffe_nets, Colorization)
     net.setPreferableBackend(backend);
     net.setPreferableTarget(target);
 
+    // This model has bad accuracy when the FP16 and Winograd are enable at same time.
+    if (target == DNN_TARGET_CPU_FP16)
+        net.enableWinograd(false);
+
     net.getLayer(net.getLayerId("class8_ab"))->blobs.push_back(kernel);
     net.getLayer(net.getLayerId("conv8_313_rh"))->blobs.push_back(Mat(1, 313, CV_32F, 2.606));
 
@@ -568,10 +589,15 @@ TEST_P(Test_Caffe_nets, DenseNet_121)
     {
         l1 = 0.11; lInf = 0.5;
     }
-    else if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
+    else if (target == DNN_TARGET_CUDA_FP16)
     {
         l1 = 0.04; lInf = 0.2;
     }
+    else if (target == DNN_TARGET_CPU_FP16)
+    {
+        l1 = 0.06; lInf = 0.3;
+    }
+
     normAssert(outs[0], ref, "", l1, lInf);
     if (target != DNN_TARGET_MYRIAD || getInferenceEngineVPUType() != CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
         expectNoFallbacksFromIE(model.getNetwork_());
@@ -731,21 +757,23 @@ TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
 #endif
 
-    double scoreDiff = 0.0;
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
-    // Check 'backward_compatible_check || in_out_elements_equal' failed at core/src/op/reshape.cpp:427:
-    // While validating node 'v1::Reshape bbox_pred_reshape (bbox_pred[0]:f32{1,84}, Constant_265242[0]:i64{4}) -> (f32{?,?,?,?})' with friendly_name 'bbox_pred_reshape':
-    // Requested output shape {1,6300,4,1} is incompatible with input shape {1, 84}
+    double scoreDiff = 0.0012, iouDiff = 0.03;
+#if defined(INF_ENGINE_RELEASE)
     if (target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-    if (target == DNN_TARGET_OPENCL_FP16)
-        scoreDiff = 0.02;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
+        iouDiff = 0.02;
+        if (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16) {
+            scoreDiff = 0.04;
+            iouDiff = 0.06;
+        }
+    }
 #endif
 
     static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849,
                                            0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
                                            0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166);
-    testFaster("faster_rcnn_vgg16.prototxt", "VGG16_faster_rcnn_final.caffemodel", ref, scoreDiff);
+    testFaster("faster_rcnn_vgg16.prototxt", "VGG16_faster_rcnn_final.caffemodel", ref, scoreDiff, iouDiff);
 }
 
 TEST_P(Test_Caffe_nets, FasterRCNN_zf)
@@ -756,7 +784,7 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf)
 #else
         (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB),
 #endif
-        CV_TEST_TAG_DEBUG_LONG
+        CV_TEST_TAG_DEBUG_VERYLONG
     );
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021040000)
     // IE exception: Ngraph operation Reshape with name rpn_cls_score_reshape has dynamic output shape on 0 port, but CPU plug-in supports only static shape
@@ -766,9 +794,6 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf)
         );
 #endif
 
-    if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
-         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
     if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
          backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
@@ -779,7 +804,14 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf)
     static Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
                                            0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
                                            0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176);
-    testFaster("faster_rcnn_zf.prototxt", "ZF_faster_rcnn_final.caffemodel", ref);
+
+    double scoreDiff = 0.003, iouDiff = 0.07;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
+        scoreDiff = 0.02;
+        iouDiff = 0.13;
+    }
+
+    testFaster("faster_rcnn_zf.prototxt", "ZF_faster_rcnn_final.caffemodel", ref, scoreDiff, iouDiff);
 }
 
 TEST_P(Test_Caffe_nets, RFCN)
@@ -802,8 +834,8 @@ TEST_P(Test_Caffe_nets, RFCN)
         iouDiff = 0.12;
     }
 
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+#if defined(INF_ENGINE_RELEASE)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
     {
         scoreDiff = 0.1f;
         iouDiff = 0.2f;
diff --git a/modules/dnn/test/test_common.hpp b/modules/dnn/test/test_common.hpp
index 6262a0f7a495..435f481566b0 100644
--- a/modules/dnn/test/test_common.hpp
+++ b/modules/dnn/test/test_common.hpp
@@ -232,6 +232,8 @@ class DNNTestLayer : public TestWithParam<tuple<Backend, Target> >
             expectNoFallbacks(net);
     }
 
+    size_t getTopMemoryUsageMB();
+
 protected:
     void checkBackend(Mat* inp = 0, Mat* ref = 0)
     {
diff --git a/modules/dnn/test/test_common.impl.hpp b/modules/dnn/test/test_common.impl.hpp
index 2a4ee34b023c..e78dc6c1e4f7 100644
--- a/modules/dnn/test/test_common.impl.hpp
+++ b/modules/dnn/test/test_common.impl.hpp
@@ -15,6 +15,14 @@
 #include <opencv2/core/utils/configuration.private.hpp>
 #include <opencv2/core/utils/logger.hpp>
 
+#ifdef _WIN32
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <psapi.h>
+#endif  // _WIN32
+
 namespace cv { namespace dnn {
 CV__DNN_INLINE_NS_BEGIN
 
@@ -502,4 +510,28 @@ void initDNNTests()
     );
 }
 
+size_t DNNTestLayer::getTopMemoryUsageMB()
+{
+#ifdef _WIN32
+    PROCESS_MEMORY_COUNTERS proc;
+    GetProcessMemoryInfo(GetCurrentProcess(), &proc, sizeof(proc));
+    return proc.PeakWorkingSetSize / pow(1024, 2);  // bytes to megabytes
+#else
+    std::ifstream status("/proc/self/status");
+    std::string line, title;
+    while (std::getline(status, line))
+    {
+        std::istringstream iss(line);
+        iss >> title;
+        if (title == "VmHWM:")
+        {
+            size_t mem;
+            iss >> mem;
+            return mem / 1024;
+        }
+    }
+    return 0l;
+#endif
+}
+
 } // namespace
diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp
index 2d614267693d..ba2a7f14c6ac 100644
--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@@ -102,11 +102,14 @@ TEST(Test_Darknet, read_yolo_voc_stream)
 class Test_Darknet_layers : public DNNTestLayer
 {
 public:
-    void testDarknetLayer(const std::string& name, bool hasWeights = false, bool testBatchProcessing = true)
+    void testDarknetLayer(const std::string& name, bool hasWeights = false, bool testBatchProcessing = true,
+                          double l1 = 0.0, double lInf = 0.0)
     {
         SCOPED_TRACE(name);
         Mat inp = blobFromNPY(findDataFile("dnn/darknet/" + name + "_in.npy"));
         Mat ref = blobFromNPY(findDataFile("dnn/darknet/" + name + "_out.npy"));
+        l1 = l1 ? l1 : default_l1;
+        lInf = lInf ? lInf : default_lInf;
 
         std::string cfg = findDataFile("dnn/darknet/" + name + ".cfg");
         std::string model = "";
@@ -120,7 +123,7 @@ class Test_Darknet_layers : public DNNTestLayer
         net.setPreferableTarget(target);
         net.setInput(inp);
         Mat out = net.forward();
-        normAssert(out, ref, "", default_l1, default_lInf);
+        normAssert(out, ref, "", l1, lInf);
 
         if (inp.size[0] == 1 && testBatchProcessing)  // test handling of batch size
         {
@@ -166,8 +169,8 @@ class Test_Darknet_layers : public DNNTestLayer
             }*/
             ASSERT_EQ(out2.dims, ref2.dims) << ref.dims;
 
-            normAssert(out2(ranges0), ref2, "", default_l1, default_lInf);
-            normAssert(out2(ranges1), ref2, "", default_l1, default_lInf);
+            normAssert(out2(ranges0), ref2, "", l1, lInf);
+            normAssert(out2(ranges1), ref2, "", l1, lInf);
         }
     }
 };
@@ -354,7 +357,8 @@ TEST_P(Test_Darknet_nets, YoloVoc)
 #else
         CV_TEST_TAG_MEMORY_1GB,
 #endif
-        CV_TEST_TAG_LONG
+        CV_TEST_TAG_LONG,
+        CV_TEST_TAG_DEBUG_VERYLONG
     );
 
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
@@ -470,7 +474,8 @@ TEST_P(Test_Darknet_nets, TinyYoloVoc)
                                     1, 6,  0.928758f, 0.651024f, 0.463539f, 0.823784f, 0.654998f); // a car
 
     double scoreDiff = 8e-5, iouDiff = 3e-4;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
+    bool useWinograd = true;
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
     {
         scoreDiff = 8e-3;
         iouDiff = 0.018;
@@ -480,18 +485,24 @@ TEST_P(Test_Darknet_nets, TinyYoloVoc)
         scoreDiff = 0.008;
         iouDiff = 0.02;
     }
+    else if (target == DNN_TARGET_CPU_FP16)
+    {
+        useWinograd = false;
+        scoreDiff = 8e-3;
+        iouDiff = 0.018;
+    }
 
     std::string config_file = "tiny-yolo-voc.cfg";
     std::string weights_file = "tiny-yolo-voc.weights";
 
     {
     SCOPED_TRACE("batch size 1");
-    testDarknetModel(config_file, weights_file, ref.rowRange(0, 2), scoreDiff, iouDiff);
+    testDarknetModel(config_file, weights_file, ref.rowRange(0, 2), scoreDiff, iouDiff, 0.24, 0.4, useWinograd);
     }
 
     {
     SCOPED_TRACE("batch size 2");
-    testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff);
+    testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, 0.24, 0.4, useWinograd);
     }
 }
 
@@ -887,12 +898,12 @@ TEST_P(Test_Darknet_nets, YOLOv4_tiny)
 
     {
         SCOPED_TRACE("batch size 1");
-        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, confThreshold);
+        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, confThreshold, 0.4, false);
     }
 
     {
         SCOPED_TRACE("batch size 2");
-        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, confThreshold);
+        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, confThreshold, 0.4, false);
     }
 
 #if defined(INF_ENGINE_RELEASE)
@@ -909,10 +920,10 @@ TEST_P(Test_Darknet_nets, YOLOv4_tiny)
 TEST_P(Test_Darknet_nets, YOLOv4x_mish)
 {
     applyTestTag(
-            CV_TEST_TAG_LONG,
-            CV_TEST_TAG_MEMORY_2GB,
-            CV_TEST_TAG_DEBUG_VERYLONG
-            );
+        CV_TEST_TAG_MEMORY_2GB,
+        CV_TEST_TAG_LONG,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
 
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021040000)
     // IE exception: Ngraph operation Transpose with name permute_168 has dynamic output shape on 0 port, but CPU plug-in supports only static shape
@@ -1039,6 +1050,11 @@ TEST_P(Test_Darknet_layers, avgpool_softmax)
     testDarknetLayer("avgpool_softmax");
 }
 
+TEST_P(Test_Darknet_layers, crop)
+{
+    testDarknetLayer("crop");
+}
+
 TEST_P(Test_Darknet_layers, region)
 {
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
@@ -1046,7 +1062,7 @@ TEST_P(Test_Darknet_layers, region)
        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
 #endif
 
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2022010000)
     // accuracy on CPU, OpenCL
     // Expected: (normL1) <= (l1), actual: 0.000358148 vs 1e-05
     //   |ref| = 1.207319974899292
@@ -1116,7 +1132,12 @@ TEST_P(Test_Darknet_layers, connected)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
-    testDarknetLayer("connected", true);
+    double l1 = 0.0;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+    {
+        l1 = 3e-5;
+    }
+    testDarknetLayer("connected", true, true, l1);
 }
 
 TEST_P(Test_Darknet_layers, relu)
diff --git a/modules/dnn/test/test_graph_simplifier.cpp b/modules/dnn/test/test_graph_simplifier.cpp
new file mode 100644
index 000000000000..24da7e65b008
--- /dev/null
+++ b/modules/dnn/test/test_graph_simplifier.cpp
@@ -0,0 +1,153 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+class Test_Graph_Simplifier : public ::testing::Test {
+ public:
+    bool required;
+
+    Test_Graph_Simplifier() : required(true) {}
+
+    void test_conformance(const std::string &basename, const std::string &expected_layer) {
+        test(basename + std::string("/model"), std::vector<std::string>{expected_layer}, std::string("dnn/onnx/conformance/node/"));
+    }
+
+    void test(const std::string &basename, const std::string &expected_layer) {
+        test(basename, std::vector<std::string>{expected_layer});
+    }
+
+    void test(const std::string &basename, const std::vector<std::string> &expected_layers, const std::string &model_path_prefix = std::string("dnn/onnx/models/")) {
+        std::string model_path = findDataFile(model_path_prefix + basename + std::string(".onnx"), required);
+        auto net = readNet(model_path);
+        std::vector<std::string> layers;
+        net.getLayerTypes(layers);
+
+        // remove Const, Identity (output layer), __NetInputLayer__ (input layer)
+        layers.erase(std::remove_if(layers.begin(), layers.end(), [] (const std::string l) { return l == "Const" || l == "Identity" || l == "__NetInputLayer__"; }), layers.end());
+
+        EXPECT_EQ(layers, expected_layers);
+    }
+};
+
+TEST_F(Test_Graph_Simplifier, GeluSubGraph) {
+    test("gelu", "Gelu");
+    test("bias_gelu", std::vector<std::string>{"Gelu", "NaryEltwise"});
+}
+
+TEST_F(Test_Graph_Simplifier, GeluApproximationSubGraph) {
+    test("gelu_approximation", "GeluApproximation");
+}
+
+TEST_F(Test_Graph_Simplifier, LayerNormSubGraph) {
+    test("layer_norm_expanded", "LayerNormalization");
+    test("layer_norm_expanded_with_initializers", "LayerNormalization");
+}
+
+TEST_F(Test_Graph_Simplifier, LayerNormNoFusionSubGraph) {
+    test("layer_norm_no_fusion", std::vector<std::string>{"NaryEltwise", "Reduce", "Sqrt"});
+}
+
+TEST_F(Test_Graph_Simplifier, ResizeSubgraph) {
+    /* Test for 6 subgraphs:
+        - GatherCastSubgraph
+        - MulCastSubgraph
+        - UpsampleSubgraph
+        - ResizeSubgraph1
+        - ResizeSubgraph2
+        - ResizeSubgraph3
+    */
+    test("upsample_unfused_torch1.2", std::vector<std::string>{"BatchNorm", "Resize"});
+    test("resize_nearest_unfused_opset11_torch1.3", std::vector<std::string>{"BatchNorm", "Convolution", "Resize"});
+    test("resize_nearest_unfused_opset11_torch1.4", std::vector<std::string>{"BatchNorm", "Convolution", "Resize"});
+    test("upsample_unfused_opset9_torch1.4", std::vector<std::string>{"BatchNorm", "Convolution", "Resize"});
+    test("two_resizes_with_shared_subgraphs", std::vector<std::string>{"NaryEltwise", "Resize"});
+}
+
+TEST_F(Test_Graph_Simplifier, SoftmaxSubgraph) {
+    /* Test for 3 subgraphs
+        - SoftMaxSubgraph
+        - SoftMaxSubgraph2 (conformance)
+        - LogSoftMaxSubgraph (conformance)
+    */
+    test("softmax_unfused", "Softmax");
+    test_conformance("test_softmax_example_expanded", "Softmax");
+    test_conformance("test_softmax_axis_2_expanded", "Softmax");
+    test_conformance("test_softmax_default_axis_expanded", "Softmax");
+    test_conformance("test_softmax_axis_0_expanded", "Softmax");
+    test_conformance("test_softmax_axis_1_expanded", "Softmax");
+    test_conformance("test_softmax_large_number_expanded", "Softmax");
+    test_conformance("test_softmax_negative_axis_expanded", "Softmax");
+    test_conformance("test_logsoftmax_axis_2_expanded", "Softmax");
+    test_conformance("test_logsoftmax_example_1_expanded", "Softmax");
+    test_conformance("test_logsoftmax_negative_axis_expanded", "Softmax");
+    test_conformance("test_logsoftmax_axis_0_expanded", "Softmax");
+    test_conformance("test_logsoftmax_axis_1_expanded", "Softmax");
+    test_conformance("test_logsoftmax_large_number_expanded", "Softmax");
+    test_conformance("test_logsoftmax_default_axis_expanded", "Softmax");
+}
+
+TEST_F(Test_Graph_Simplifier, HardSwishSubgraph) {
+    test_conformance("test_hardswish_expanded", "HardSwish");
+}
+
+TEST_F(Test_Graph_Simplifier, CeluSubgraph) {
+    test_conformance("test_celu_expanded", "Celu");
+}
+
+TEST_F(Test_Graph_Simplifier, NormalizeSubgraph) {
+    /* Test for 6 subgraphs
+        - NormalizeSubgraph1
+        - NormalizeSubgraph2
+        - NormalizeSubgraph2_2
+        - NormalizeSubgraph3
+        - NormalizeSubgraph4
+        - NormalizeSubgraph5
+    */
+    test("reduceL2_subgraph_2", "Normalize");
+    test("reduceL2_subgraph", "Normalize");
+    test("normalize_fusion", "Normalize");
+}
+
+TEST_F(Test_Graph_Simplifier, BatchNormalizationSubgraph) {
+    /* Test for 2 subgraphs
+        - BatchNormalizationSubgraph1
+        - BatchNormalizationSubgraph2
+    */
+    test("frozenBatchNorm2d", "BatchNorm");
+    test("batch_norm_subgraph", "BatchNorm");
+}
+
+TEST_F(Test_Graph_Simplifier, ExpandSubgraph) {
+    test("expand_neg_batch", "Expand");
+}
+
+TEST_F(Test_Graph_Simplifier, MishSubgraph) {
+    /* Test for 2 subgraphs
+        - SoftplusSubgraph
+        - MishSubgraph
+    */
+    test("mish_no_softplus", "Mish");
+    test("mish", "Mish");
+}
+
+TEST_F(Test_Graph_Simplifier, AttentionSubgraph) {
+    /* Test for 2 subgraphs
+        - AttentionSubgraph
+        - AttentionSingleHeadSubgraph
+    */
+    test("attention", "Attention");
+    test("attention_single_head", "Attention");
+}
+
+TEST_F(Test_Graph_Simplifier, BiasedMatMulSubgraph) {
+    /* Test for 1 subgraphs
+        - BiasedMatMulSubgraph
+    */
+    test("biased_matmul", "MatMul");
+}
+
+}}
diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp
deleted file mode 100644
index 6a7958eceee0..000000000000
--- a/modules/dnn/test/test_halide_layers.cpp
+++ /dev/null
@@ -1,996 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-//
-// Copyright (C) 2017-2019, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-
-// This tests doesn't require any external data. They just compare outputs of
-// layers using different computation backends. Input and parameters are random.
-
-#include "test_precomp.hpp"
-
-namespace opencv_test { namespace {
-
-using namespace cv;
-using namespace cv::dnn;
-using namespace testing;
-
-static void test(Mat& input, Net& net, Backend backendId, Target targetId, bool skipCheck = false, bool randInput = true, double l1 = 0.0, double lInf = 0.0)
-{
-    DNNTestLayer::checkBackend(backendId, targetId);
-    if (randInput)
-        randu(input, -1.0f, 1.0f);
-
-    net.setInput(input);
-    net.setPreferableBackend(DNN_BACKEND_OPENCV);
-    Mat outputDefault = net.forward().clone();
-
-    net.setPreferableBackend(backendId);
-    net.setPreferableTarget(targetId);
-    Mat outputHalide = net.forward().clone();
-
-    if (skipCheck)
-        return;
-
-    double default_l1, default_lInf;
-    DNNTestLayer::getDefaultThresholds(backendId, targetId, &default_l1, &default_lInf);
-    if (l1 == 0.0)
-        l1 = default_l1;
-    if (lInf == 0.0)
-        lInf = default_lInf;
-    normAssert(outputDefault, outputHalide, "", l1, lInf);
-    if (cvtest::debugLevel > 0 || testing::Test::HasFailure())
-    {
-        std::cout << "l1=" << l1 << "  lInf=" << lInf << std::endl;
-        std::cout << outputDefault.reshape(1, outputDefault.total()).t() << std::endl;
-        std::cout << outputHalide.reshape(1, outputDefault.total()).t() << std::endl;
-    }
-}
-
-static void test(LayerParams& params, Mat& input, Backend backendId, Target targetId, bool skipCheck = false, double l1 = 0.0, double lInf = 0.0)
-{
-    Net net;
-    net.addLayerToPrev(params.name, params.type, params);
-    test(input, net, backendId, targetId, skipCheck, true, l1, lInf);
-}
-
-static inline testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargetsWithHalide()
-{
-    return dnnBackendsAndTargets(true, true, false); // OpenCV/CPU is used as reference
-}
-
-class Test_Halide_layers : public DNNTestLayer {};
-
-////////////////////////////////////////////////////////////////////////////////
-// Padding
-////////////////////////////////////////////////////////////////////////////////
-TEST_P(Test_Halide_layers, Padding)
-{
-    static const int kNumRuns = 10;
-    std::vector<int> paddings(8);
-    cv::RNG& rng = cv::theRNG();
-    for (int t = 0; t < kNumRuns; ++t)
-    {
-        for (int i = 0; i < paddings.size(); ++i)
-            paddings[i] = rng(5);
-
-        LayerParams lp;
-        lp.set("paddings", DictValue::arrayInt<int*>(&paddings[0], paddings.size()));
-        lp.type = "Padding";
-        lp.name = "testLayer";
-
-        int sz[] = {1 + (int)rng(10), 1 + (int)rng(10), 1 + (int)rng(10), 1 + (int)rng(10)};
-        Mat input(4, &sz[0], CV_32F);
-        test(lp, input, backend, target);
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Convolution
-////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<Vec3i, Size, Size, Size, Size, Size, bool, tuple<Backend, Target> > > Convolution;
-TEST_P(Convolution, Accuracy)
-{
-    int inChannels = get<0>(GetParam())[0];
-    int outChannels = get<0>(GetParam())[1];
-    int group = get<0>(GetParam())[2];
-    Size inSize = get<1>(GetParam());
-    Size kernel = get<2>(GetParam());
-    Size stride = get<3>(GetParam());
-    Size pad = get<4>(GetParam());
-    Size dilation = get<5>(GetParam());
-    bool hasBias = get<6>(GetParam());
-    Backend backendId = get<0>(get<7>(GetParam()));
-    Target targetId = get<1>(get<7>(GetParam()));
-
-    bool skipCheck = false;
-
-    int sz[] = {outChannels, inChannels / group, kernel.height, kernel.width};
-    Mat weights(4, &sz[0], CV_32F);
-    randu(weights, -1.0f, 1.0f);
-
-    LayerParams lp;
-    lp.set("kernel_w", kernel.width);
-    lp.set("kernel_h", kernel.height);
-    lp.set("pad_w", pad.width);
-    lp.set("pad_h", pad.height);
-    lp.set("stride_w", stride.width);
-    lp.set("stride_h", stride.height);
-    lp.set("dilation_w", dilation.width);
-    lp.set("dilation_h", dilation.height);
-    lp.set("num_output", outChannels);
-    lp.set("group", group);
-    lp.set("bias_term", hasBias);
-    lp.type = "Convolution";
-    lp.name = "testLayer";
-    lp.blobs.push_back(weights);
-    if (hasBias)
-    {
-        Mat bias(1, outChannels, CV_32F);
-        randu(bias, -1.0f, 1.0f);
-        lp.blobs.push_back(bias);
-    }
-    int inpSz[] = {1, inChannels, inSize.height, inSize.width};
-    Mat input(4, &inpSz[0], CV_32F);
-    test(lp, input, backendId, targetId, skipCheck);
-    if (skipCheck)
-        throw SkipTestException("Skip checks in unstable test");
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Convolution, Combine(
-/*in channels, out channels, group*/
-             Values(Vec3i(6, 4, 1), Vec3i(6, 9, 1),
-                    Vec3i(6, 4, 2), Vec3i(6, 9, 3)),
-/*in size*/  Values(Size(5, 6)),
-/*kernel*/   Values(Size(3, 1), Size(1, 3)),
-/*stride*/   Values(Size(1, 1), Size(2, 2)),
-/*pad*/      Values(Size(1, 0), Size(0, 1)),
-/*dilation*/ Values(Size(1, 1), Size(2, 2)),
-/*has bias*/ Bool(),
-             dnnBackendsAndTargetsWithHalide()
-));
-
-////////////////////////////////////////////////////////////////////////////////
-// Deconvolution
-////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<Vec3i, Size, Size, Size, Size, Vec4i, bool, tuple<Backend, Target> > > Deconvolution;
-TEST_P(Deconvolution, Accuracy)
-{
-    int inChannels = get<0>(GetParam())[0];
-    int outChannels = get<0>(GetParam())[1];
-    int group = get<0>(GetParam())[2];
-    Size inSize = get<1>(GetParam());
-    Size kernel = get<2>(GetParam());
-    Size pad = get<3>(GetParam());
-    Size dilation = get<4>(GetParam());
-    Size stride = Size(get<5>(GetParam())[0], get<5>(GetParam())[1]);
-    Size adjPad = Size(get<5>(GetParam())[2], get<5>(GetParam())[3]);
-    bool hasBias = get<6>(GetParam());
-    Backend backendId = get<0>(get<7>(GetParam()));
-    Target targetId = get<1>(get<7>(GetParam()));
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
-            && inChannels == 6 && outChannels == 4 && group == 1
-            && kernel == Size(3, 1) && pad == Size(0, 1)
-            && stride == Size(1, 1) && dilation == Size(1, 1))
-        applyTestTag(targetId == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16,
-            CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION
-        );
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
-            && inChannels == 6 && outChannels == 4 && group == 1
-            && kernel == Size(1, 3) && pad == Size(1, 0)
-            && stride == Size(1, 1) && dilation == Size(1, 1))
-        applyTestTag(targetId == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16,
-            CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION
-        );
-#endif
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
-            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
-            && inChannels == 6 && outChannels == 4 && group == 1
-            && kernel == Size(1, 3) && pad == Size(1, 0)
-            && stride == Size(1, 1) && dilation == Size(1, 1))
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-#endif
-
-    if (targetId == DNN_TARGET_CUDA_FP16)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
-
-    int sz[] = {inChannels, outChannels / group, kernel.height, kernel.width};
-    Mat weights(4, &sz[0], CV_32F);
-    randu(weights, -1.0f, 1.0f);
-
-    LayerParams lp;
-    lp.set("kernel_w", kernel.width);
-    lp.set("kernel_h", kernel.height);
-    lp.set("pad_w", pad.width);
-    lp.set("pad_h", pad.height);
-    lp.set("stride_w", stride.width);
-    lp.set("stride_h", stride.height);
-    lp.set("dilation_w", dilation.width);
-    lp.set("dilation_h", dilation.height);
-    lp.set("adj_w", adjPad.width);
-    lp.set("adj_h", adjPad.height);
-    lp.set("num_output", outChannels);
-    lp.set("group", group);
-    lp.set("bias_term", hasBias);
-    lp.type = "Deconvolution";
-    lp.name = "testLayer";
-    lp.blobs.push_back(weights);
-    if (hasBias)
-    {
-        Mat bias(1, outChannels, CV_32F);
-        randu(bias, -1.0f, 1.0f);
-        lp.blobs.push_back(bias);
-    }
-    int inpSz[] = {1, inChannels, inSize.height, inSize.width};
-    Mat input(4, &inpSz[0], CV_32F);
-    test(lp, input, backendId, targetId);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Deconvolution, Combine(
-/*in channels, out channels, group*/
-             Values(Vec3i(6, 4, 1), Vec3i(6, 9, 3)),
-/*in size*/  Values(Size(5, 6)),
-/*kernel*/   Values(Size(3, 1), Size(1, 3)),
-/*pad*/      Values(Size(1, 0), Size(0, 1)),
-/*dilation*/ Values(Size(1, 1)),
-/*stride, adj. pad*/ Values(Vec4i(1,1, 0,0), Vec4i(2,2, 1,0), Vec4i(1,2, 0,1)),
-/*has bias*/ Bool(),
-             dnnBackendsAndTargetsWithHalide()
-));
-
-////////////////////////////////////////////////////////////////////////////////
-// LRN
-////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<Vec3i, int, Vec3f, bool, std::string, tuple<Backend, Target> > > LRN;
-TEST_P(LRN, Accuracy)
-{
-    int inChannels = get<0>(GetParam())[0];
-    Size inSize = Size(get<0>(GetParam())[1], get<0>(GetParam())[2]);
-    int localSize = get<1>(GetParam());
-    float alpha = get<2>(GetParam())[0];
-    float beta = get<2>(GetParam())[1];
-    float bias = get<2>(GetParam())[2];
-    bool normBySize = get<3>(GetParam());
-    std::string nrmType = get<4>(GetParam());
-    Backend backendId = get<0>(get<5>(GetParam()));
-    Target targetId = get<1>(get<5>(GetParam()));
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
-    if ((inSize.width == 5 || inSize.height == 5) && targetId == DNN_TARGET_MYRIAD &&
-        nrmType == "ACROSS_CHANNELS")
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
-#endif
-
-    LayerParams lp;
-    lp.set("norm_region", nrmType);
-    lp.set("local_size", localSize);
-    lp.set("alpha", alpha);
-    lp.set("beta", beta);
-    lp.set("bias", bias);
-    lp.set("norm_by_size", normBySize);
-    lp.type = "LRN";
-    lp.name = "testLayer";
-
-    int sz[] = {1, inChannels, inSize.height, inSize.width};
-    Mat input(4, &sz[0], CV_32F);
-
-    double l1 = 0.0, lInf = 0.0;
-    // The OpenCL kernels use the native_ math functions which have
-    // implementation defined accuracy, so we use relaxed thresholds. See
-    // https://github.com/opencv/opencv/issues/9821 for more details.
-    if (targetId == DNN_TARGET_OPENCL)
-    {
-        l1 = 0.01;
-        lInf = 0.01;
-    }
-    test(lp, input, backendId, targetId, false, l1, lInf);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, LRN, Combine(
-/*input ch,w,h*/ Values(Vec3i(6, 5, 8), Vec3i(7, 11, 6)),
-/*local size*/   Values(3, 5),
-                 Values(Vec3f(0.9f, 1.0f, 1.1f), Vec3f(0.9f, 1.1f, 1.0f),
-/*alpha, beta, bias*/   Vec3f(1.0f, 0.9f, 1.1f), Vec3f(1.0f, 1.1f, 0.9f),
-                        Vec3f(1.1f, 0.9f, 1.0f), Vec3f(1.1f, 1.0f, 0.9f)),
-/*norm_by_size*/ Bool(),
-/*norm_type*/    Values("ACROSS_CHANNELS", "WITHIN_CHANNEL"),
-                 dnnBackendsAndTargetsWithHalide()
-));
-
-////////////////////////////////////////////////////////////////////////////////
-// Average pooling
-////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int, Size, Size, Size, tuple<Backend, Target> > > AvePooling;
-TEST_P(AvePooling, Accuracy)
-{
-    int inChannels = get<0>(GetParam());
-    Size outSize = get<1>(GetParam());;  // Input size will be computed from parameters.
-    Size kernel = get<2>(GetParam());
-    Size stride = get<3>(GetParam());
-    Backend backendId = get<0>(get<4>(GetParam()));
-    Target targetId = get<1>(get<4>(GetParam()));
-
-#if defined(INF_ENGINE_RELEASE)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
-            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
-            && kernel == Size(1, 1) && (stride == Size(1, 1) || stride == Size(2, 2)))
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-#endif
-
-    const int inWidth = (outSize.width - 1) * stride.width + kernel.width;
-    const int inHeight = (outSize.height - 1) * stride.height + kernel.height;
-
-    LayerParams lp;
-    lp.set("pool", "ave");
-    lp.set("kernel_w", kernel.width);
-    lp.set("kernel_h", kernel.height);
-    lp.set("stride_w", stride.width);
-    lp.set("stride_h", stride.height);
-    lp.type = "Pooling";
-    lp.name = "testLayer";
-
-    int sz[] = {1, inChannels, inHeight, inWidth};
-    Mat input(4, &sz[0], CV_32F);
-    test(lp, input, backendId, targetId);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, AvePooling, Combine(
-/*in channels*/ Values(3, 4),
-/*out size*/    Values(Size(1, 1), Size(2, 2), Size(3, 2), Size(4, 7)),
-/*kernel*/      Values(Size(1, 1), Size(2, 2), Size(3, 3), Size(3, 2)),
-/*stride*/      Values(Size(1, 1), Size(2, 2), Size(3, 2)),
-                dnnBackendsAndTargetsWithHalide()
-));
-
-////////////////////////////////////////////////////////////////////////////////
-// Maximum pooling
-////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int, Size, Size, Size, Size, tuple<Backend, Target> > > MaxPooling;
-TEST_P(MaxPooling, Accuracy)
-{
-    int inChannels = get<0>(GetParam());
-    Size inSize = get<1>(GetParam());
-    Size kernel = get<2>(GetParam());
-    Size stride = get<3>(GetParam());
-    Size pad = get<4>(GetParam());
-    Backend backendId = get<0>(get<5>(GetParam()));
-    Target targetId = get<1>(get<5>(GetParam()));
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2018050000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
-            && inSize == Size(7, 6) && kernel == Size(3, 2)
-            && (stride == Size(1, 1) || stride == Size(2, 2))
-            && (pad == Size(0, 1) || pad == Size(1, 1))
-    )
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-#endif
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
-            && (kernel == Size(2, 2) || kernel == Size(3, 2))
-            && stride == Size(1, 1) && (pad == Size(0, 0) || pad == Size(0, 1))
-    )
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-#endif
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
-            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
-            && (stride == Size(1, 1) || stride == Size(2, 2))
-            && (pad == Size(0, 1) || pad == Size(1, 1))
-    )
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-#endif
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2020020000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_MYRIAD)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-#endif
-
-    LayerParams lp;
-    lp.set("pool", "max");
-    lp.set("kernel_w", kernel.width);
-    lp.set("kernel_h", kernel.height);
-    lp.set("stride_w", stride.width);
-    lp.set("stride_h", stride.height);
-    lp.set("pad_w", pad.width);
-    lp.set("pad_h", pad.height);
-    lp.type = "Pooling";
-    lp.name = "testLayer";
-
-    int sz[] = {1, inChannels, inSize.height, inSize.width};
-    Mat input(4, &sz[0], CV_32F);
-    test(lp, input, backendId, targetId);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, MaxPooling, Combine(
-/*in channels*/ Values(3, 4),
-/*in size*/     Values(Size(5, 5), Size(7, 6)),
-/*kernel*/      Values(Size(2, 2), Size(3, 3), Size(3, 2)),
-/*stride*/      Values(Size(1, 1), Size(2, 2), Size(3, 2)),
-/*pad*/         Values(Size(0, 0), Size(1, 1), Size(0, 1)),
-                dnnBackendsAndTargetsWithHalide()
-));
-
-////////////////////////////////////////////////////////////////////////////////
-// Fully-connected
-////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int, Size, int, bool, tuple<Backend, Target> > > FullyConnected;
-TEST_P(FullyConnected, Accuracy)
-{
-    int inChannels = get<0>(GetParam());
-    Size inSize = get<1>(GetParam());
-    int outChannels = get<2>(GetParam());
-    bool hasBias = get<3>(GetParam());
-    Backend backendId = get<0>(get<4>(GetParam()));
-    Target targetId = get<1>(get<4>(GetParam()));
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
-    if ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
-         backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && (targetId == DNN_TARGET_OPENCL_FP16 ||
-       (targetId == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X))) {
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-    }
-#endif
-
-    Mat weights(outChannels, inChannels * inSize.height * inSize.width, CV_32F);
-    randu(weights, -1.0f, 1.0f);
-
-    Mat bias(1, outChannels, CV_32F);
-    randu(bias, -1.0f, 1.0f);
-
-    LayerParams lp;
-    lp.set("num_output", outChannels);
-    lp.set("bias_term", hasBias);
-    lp.blobs.push_back(weights);
-    lp.blobs.push_back(bias);
-    lp.type = "InnerProduct";
-    lp.name = "testLayer";
-
-    int sz[] = {1, inChannels, inSize.height, inSize.width};
-    Mat input(4, &sz[0], CV_32F);
-
-    double l1 = 0.0;
-    double lInf = 0.0;
-#if defined(INF_ENGINE_RELEASE)
-    if (targetId == DNN_TARGET_MYRIAD)
-    {
-        l1 = 0.015;
-        lInf = 0.025;
-    }
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL_FP16)
-    {
-        l1 = 0.01;
-    }
-#endif
-    if (targetId == DNN_TARGET_CUDA_FP16)
-        l1 = 0.015;
-
-    test(lp, input, backendId, targetId, false, l1, lInf);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, FullyConnected, Combine(
-/*in channels*/  Values(3, 4),
-/*in size*/      Values(Size(5, 4), Size(4, 5), Size(1, 1)),
-/*out channels*/ Values(3, 4),
-/*has bias*/     Bool(),
-                 dnnBackendsAndTargetsWithHalide()
-));
-
-////////////////////////////////////////////////////////////////////////////////
-// SoftMax
-////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int,  tuple<Backend, Target> > > SoftMax;
-TEST_P(SoftMax, Accuracy)
-{
-    int inChannels = get<0>(GetParam());
-    Backend backendId = get<0>(get<1>(GetParam()));
-    Target targetId = get<1>(get<1>(GetParam()));
-    LayerParams lp;
-    lp.type = "Softmax";
-    lp.name = "testLayer";
-
-    int sz[] = {1, inChannels, 1, 1};
-    Mat input(4, &sz[0], CV_32F);
-    test(lp, input, backendId, targetId);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, SoftMax, Combine(
-    Values(3, 4, 5, 1024),
-    dnnBackendsAndTargetsWithHalide()
-));
-
-//////////////////////////////////////////////////////////////////////////////
-// Max pooling - unpooling
-//////////////////////////////////////////////////////////////////////////////
-TEST_P(Test_Halide_layers, MaxPoolUnpool)
-{
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
-
-    LayerParams pool;
-    pool.set("pool", "max");
-    pool.set("kernel_w", 2);
-    pool.set("kernel_h", 2);
-    pool.set("stride_w", 2);
-    pool.set("stride_h", 2);
-    pool.set("pad_w", 0);
-    pool.set("pad_h", 0);
-    pool.type = "Pooling";
-    pool.name = "testPool";
-
-    LayerParams unpool;
-    unpool.set("pool_k_w", 2);
-    unpool.set("pool_k_h", 2);
-    unpool.set("pool_stride_w", 2);
-    unpool.set("pool_stride_h", 2);
-    unpool.set("pool_pad_w", 0);
-    unpool.set("pool_pad_h", 0);
-    unpool.type = "MaxUnpool";
-    unpool.name = "testUnpool";
-
-    Net net;
-    int poolId = net.addLayer(pool.name, pool.type, pool);
-    net.connect(0, 0, poolId, 0);
-
-    int unpoolId = net.addLayer(unpool.name, unpool.type, unpool);
-    net.connect(poolId, 0, unpoolId, 0);
-    net.connect(poolId, 1, unpoolId, 1);
-
-    int sz[] = {1, 1, 4, 4};
-    Mat input(4, &sz[0], CV_32F);
-    test(input, net, backend, target);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// AvePooling + in-place layers
-////////////////////////////////////////////////////////////////////////////////
-static const int kNumChannels = 3;
-
-void testInPlaceActivation(LayerParams& lp, Backend backendId, Target targetId, double l1 = 0.0, double lInf = 0.0)
-{
-    EXPECT_FALSE(lp.name.empty());
-
-    LayerParams pool;
-    pool.set("pool", "ave");
-    pool.set("kernel_w", 2);
-    pool.set("kernel_h", 2);
-    pool.set("stride_w", 2);
-    pool.set("stride_h", 2);
-    pool.type = "Pooling";
-    pool.name = "ave_pool";
-
-    Net net;
-    int poolId = net.addLayer(pool.name, pool.type, pool);
-    net.connect(0, 0, poolId, 0);
-    net.addLayerToPrev(lp.name, lp.type, lp);
-
-    int sz[] = {1, kNumChannels, 10, 10};
-    Mat input(4, &sz[0], CV_32F);
-    test(input, net, backendId, targetId, false, true, l1, lInf);
-}
-
-typedef TestWithParam<tuple<bool, bool, float, tuple<Backend, Target> > > BatchNorm;
-TEST_P(BatchNorm, Accuracy)
-{
-    bool hasWeights = get<0>(GetParam());
-    bool hasBias = get<1>(GetParam());
-    float epsilon = get<2>(GetParam());
-    Backend backendId = get<0>(get<3>(GetParam()));
-    Target targetId = get<1>(get<3>(GetParam()));
-
-    LayerParams lp;
-    lp.set("has_weight", hasWeights);
-    lp.set("has_bias", hasBias);
-    lp.set("eps", epsilon);
-    lp.type = "BatchNorm";
-    lp.name = "testLayer";
-
-    lp.blobs.reserve(4);
-    for (int i = 0; i < 3; ++i)
-        lp.blobs.push_back(Mat(1, kNumChannels, CV_32F));
-    if (hasBias || hasWeights)
-        lp.blobs.push_back(Mat(1, kNumChannels, CV_32F));
-
-    for (int i = 0; i < lp.blobs.size(); ++i)
-        randu(lp.blobs[i], 0.0f, 1.0f);
-
-    testInPlaceActivation(lp, backendId, targetId);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, BatchNorm, Combine(
-/*has weights*/ Bool(),
-/*has bias*/    Bool(),
-/*epsilon*/     Values(1e-3f, 1e-5f),
-                dnnBackendsAndTargetsWithHalide()
-));
-
-typedef TestWithParam<tuple<float, tuple<Backend, Target> > > ReLU;
-TEST_P(ReLU, Accuracy)
-{
-    float negativeSlope = get<0>(GetParam());
-    Backend backendId = get<0>(get<1>(GetParam()));
-    Target targetId = get<1>(get<1>(GetParam()));
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019020000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD && negativeSlope < 0)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-#endif
-
-    LayerParams lp;
-    lp.set("negative_slope", negativeSlope);
-    lp.type = "ReLU";
-    lp.name = "testLayer";
-    testInPlaceActivation(lp, backendId, targetId);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, ReLU, Combine(
-/*negative slope*/ Values(2.0f, 0.3f, -0.1f, 0.0f),
-                   dnnBackendsAndTargetsWithHalide()
-));
-
-typedef TestWithParam<tuple<std::string, tuple<Backend, Target> > > NoParamActivation;
-TEST_P(NoParamActivation, Accuracy)
-{
-    Backend backendId = get<0>(get<1>(GetParam()));
-    Target targetId = get<1>(get<1>(GetParam()));
-    std::string layer_type = get<0>(GetParam());
-
-    LayerParams lp;
-    lp.type = layer_type;
-    lp.name = "testLayer";
-    testInPlaceActivation(lp, backendId, targetId);
-}
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, NoParamActivation, Combine(
-/*type*/ Values("TanH", "Sigmoid", "AbsVal", "BNLL", "Swish", "Mish"),
-         dnnBackendsAndTargetsWithHalide()
-));
-
-typedef TestWithParam<tuple<Vec3f, tuple<Backend, Target> > > Power;
-TEST_P(Power, Accuracy)
-{
-    float power = get<0>(GetParam())[0];
-    float scale = get<0>(GetParam())[1];
-    float shift = get<0>(GetParam())[2];
-    Backend backendId = get<0>(get<1>(GetParam()));
-    Target targetId = get<1>(get<1>(GetParam()));
-
-    LayerParams lp;
-    lp.set("power", power);
-    lp.set("scale", scale);
-    lp.set("shift", shift);
-    lp.type = "Power";
-    lp.name = "testLayer";
-    testInPlaceActivation(lp, backendId, targetId);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Power, Combine(
-/*power, scale, shift*/ Values(Vec3f(0.9f, 1.0f, 1.1f), Vec3f(0.9f, 1.1f, 1.0f),
-                               Vec3f(1.0f, 0.9f, 1.1f), Vec3f(1.0f, 1.1f, 0.9f),
-                               Vec3f(1.1f, 0.9f, 1.0f), Vec3f(1.1f, 1.0f, 0.9f)),
-                        dnnBackendsAndTargetsWithHalide()
-));
-
-typedef TestWithParam<tuple<Vec3f, tuple<Backend, Target> > > Exp;
-TEST_P(Exp, Accuracy)
-{
-    float base = get<0>(GetParam())[0];
-    float scale = get<0>(GetParam())[1];
-    float shift = get<0>(GetParam())[2];
-    Backend backendId = get<0>(get<1>(GetParam()));
-    Target targetId = get<1>(get<1>(GetParam()));
-
-    LayerParams lp;
-    lp.set("base", base);
-    lp.set("scale", scale);
-    lp.set("shift", shift);
-    lp.type = "Exp";
-    lp.name = "testLayer";
-    testInPlaceActivation(lp, backendId, targetId);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Exp, Combine(
-/*base, scale, shift*/ Values(Vec3f(0.9f, -1.0f, 1.1f), Vec3f(0.9f, 1.1f, -1.0f),
-                              Vec3f(-1.0f, 0.9f, 1.1f), Vec3f(-1.0f, 1.1f, 0.9f),
-                              Vec3f(1.1f, 0.9f, -1.0f), Vec3f(1.1f, -1.0f, 0.9f)),
-                       dnnBackendsAndTargetsWithHalide()
-));
-
-TEST_P(Test_Halide_layers, ChannelsPReLU)
-{
-    LayerParams lp;
-    lp.type = "ChannelsPReLU";
-    lp.name = "testLayer";
-    lp.blobs.push_back(Mat(1, kNumChannels, CV_32F));
-    randu(lp.blobs[0], -1.0f, 1.0f);
-
-    testInPlaceActivation(lp, backend, target);
-}
-
-typedef TestWithParam<tuple<bool, tuple<Backend, Target> > > Scale;
-TEST_P(Scale, Accuracy)
-{
-    bool hasBias = get<0>(GetParam());
-    Backend backendId = get<0>(get<1>(GetParam()));
-    Target targetId = get<1>(get<1>(GetParam()));
-
-    LayerParams lp;
-    lp.set("bias_term", hasBias);
-    lp.type = "Scale";
-    lp.name = "testLayer";
-    lp.blobs.push_back(Mat(1, kNumChannels, CV_32F));
-    randu(lp.blobs[0], -1.0f, 1.0f);
-    if (hasBias)
-    {
-        lp.blobs.push_back(Mat(1, kNumChannels, CV_32F));
-        randu(lp.blobs[1], -1.0f, 1.0f);
-    }
-    testInPlaceActivation(lp, backendId, targetId);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Scale, Combine(
-    Bool(),
-    dnnBackendsAndTargetsWithHalide()
-));
-
-////////////////////////////////////////////////////////////////////////////////
-// Concat layer
-////////////////////////////////////////////////////////////////////////////////
-//
-// input --- conv --- concat --- output
-//      `--- conv ----^ ^ ^
-//      `---- ... ------' '
-//      `-----------------'
-typedef TestWithParam<tuple<Vec3i, Vec3i, tuple<Backend, Target> > > Concat;
-TEST_P(Concat, Accuracy)
-{
-    Vec3i inSize = get<0>(GetParam());
-    Vec3i numChannels = get<1>(GetParam());
-    Backend backendId = get<0>(get<2>(GetParam()));
-    Target targetId = get<1>(get<2>(GetParam()));
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2018050000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD
-            && inSize == Vec3i(1, 4, 5) && numChannels == Vec3i(1, 6, 2)
-    )
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);  // crash
-#endif
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_CPU
-            && inSize == Vec3i(1, 4, 5) && numChannels == Vec3i(1, 6, 2)
-    )
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);  // TODO: IE_CPU
-#endif
-
-    Net net;
-
-    std::vector<int> convLayerIds;
-    convLayerIds.reserve(numChannels.channels);
-    for (int i = 0, n = numChannels.channels; i < n; ++i)
-    {
-        if (!numChannels[i])
-            break;
-
-        int sz[] = {numChannels[i], inSize[0], 1, 1};
-        Mat weights(4, &sz[0], CV_32F);
-        randu(weights, -1.0f, 1.0f);
-
-        LayerParams convParam;
-        convParam.set("kernel_w", 1);
-        convParam.set("kernel_h", 1);
-        convParam.set("num_output", numChannels[i]);
-        convParam.set("bias_term", false);
-        convParam.type = "Convolution";
-        std::ostringstream ss;
-        ss << "convLayer" << i;
-        convParam.name = ss.str();
-        convParam.blobs.push_back(weights);
-
-        int layerId = net.addLayer(convParam.name, convParam.type, convParam);
-        convLayerIds.push_back(layerId);
-        net.connect(0, 0, layerId, 0);
-    }
-
-    LayerParams concatParam;
-    concatParam.type = "Concat";
-    concatParam.name = "testLayer";
-    int concatId = net.addLayer(concatParam.name, concatParam.type, concatParam);
-    net.connect(0, 0, concatId, 0);
-    for (int i = 0; i < convLayerIds.size(); ++i)
-    {
-        net.connect(convLayerIds[i], 0, concatId, i + 1);
-    }
-
-    int sz[] = {1, inSize[0], inSize[1], inSize[2]};
-    Mat input(4, &sz[0], CV_32F);
-    test(input, net, backendId, targetId);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Concat, Combine(
-/*input size*/ Values(Vec3i(1, 4, 5), Vec3i(2, 8, 6)),
-/*channels*/   Values(Vec3i(2, 0, 0), Vec3i(3, 4, 0), Vec3i(1, 6, 2)),
-               dnnBackendsAndTargetsWithHalide()
-));
-
-////////////////////////////////////////////////////////////////////////////////
-// Element-wise layers
-////////////////////////////////////////////////////////////////////////////////
-//
-// input --- conv --- eltwise --- output
-//      `--- conv ----^ ^ ^
-//      `---- ... ------' '
-//      `-----------------'
-typedef TestWithParam<tuple<Vec3i, std::string, int, bool, tuple<Backend, Target> > > Eltwise;
-TEST_P(Eltwise, Accuracy)
-{
-    Vec3i inSize = get<0>(GetParam());
-    std::string op = get<1>(GetParam());
-    int numConv = get<2>(GetParam());
-    bool weighted = get<3>(GetParam());
-    Backend backendId = get<0>(get<4>(GetParam()));
-    Target targetId = get<1>(get<4>(GetParam()));
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021040000)
-    // accuracy
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL &&
-        inSize == Vec3i(1, 4, 5) && op == "sum" && numConv == 1 && !weighted)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL &&
-        inSize == Vec3i(2, 8, 6) && op == "sum" && numConv == 1 && !weighted)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-#endif
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2018050000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD &&
-        inSize == Vec3i(1, 4, 5))
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-#endif
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000) && INF_ENGINE_VER_MAJOR_LT(2021040000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && numConv > 1)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-#endif
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_OPENCL &&
-        op == "sum" && numConv == 1 && !weighted)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
-#endif
-
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
-    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && numConv > 1)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
-#endif
-
-    bool convInputShift = 1;
-    int numEltwiseInputs = numConv;
-    if (op == "div")
-    {
-        numConv = 1;
-        convInputShift = 0; // first input is convolution
-    }
-
-    Net net;
-
-    std::vector<int> convLayerIds(numConv);
-    for (int i = 0; i < numConv; ++i)
-    {
-        int sz[] = {inSize[0], inSize[0], 1, 1};
-        Mat weights(4, &sz[0], CV_32F);
-        randu(weights, -1.0f, 1.0f);
-
-        LayerParams convParam;
-        convParam.set("kernel_w", 1);
-        convParam.set("kernel_h", 1);
-        convParam.set("num_output", inSize[0]);
-        convParam.set("bias_term", false);
-        convParam.type = "Convolution";
-        std::ostringstream ss;
-        ss << "convLayer" << i;
-        convParam.name = ss.str();
-        convParam.blobs.push_back(weights);
-
-        convLayerIds[i] = net.addLayer(convParam.name, convParam.type, convParam);
-        net.connect(0, 0, convLayerIds[i], 0);
-    }
-
-    LayerParams eltwiseParam;
-    eltwiseParam.set("operation", op);
-    if (op == "sum" && weighted)
-    {
-        RNG& rng = cv::theRNG();
-        std::vector<float> coeff(1 + numConv);
-        for (int i = 0; i < coeff.size(); ++i)
-        {
-            coeff[i] = rng.uniform(-2.0f, 2.0f);
-        }
-        eltwiseParam.set("coeff", DictValue::arrayReal<float*>(&coeff[0], coeff.size()));
-    }
-    eltwiseParam.type = "Eltwise";
-    eltwiseParam.name = "testLayer";
-    int eltwiseId = net.addLayer(eltwiseParam.name, eltwiseParam.type, eltwiseParam);
-    if (convInputShift == 1)
-        net.connect(0, 0, eltwiseId, 0);
-    for (int i = 0; i < numConv; ++i)
-    {
-        net.connect(convLayerIds[i], 0, eltwiseId, i + convInputShift);
-    }
-    if (convInputShift == 0)
-        net.connect(0, 0, eltwiseId, numConv);
-    for (int i = numConv; i < numEltwiseInputs; ++i)
-    {
-        net.connect(0, 0, eltwiseId, i + 1);
-    }
-
-    int sz[] = {1, inSize[0], inSize[1], inSize[2]};
-    Mat input(4, &sz[0], CV_32F);
-    if (op == "div")
-        randu(input, 1.0f, 1.0f);  // ensure no divisor value has absouluate value of less than 0.5
-    test(input, net, backendId, targetId, /*skipCheck*/false, (op == "div") ? false : true);
-}
-
-INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Eltwise, Combine(
-/*input size*/ Values(Vec3i(1, 4, 5), Vec3i(2, 8, 6)),
-/*operation*/  Values("prod", "sum", "div", "max", "min"),
-/*num convs*/  Values(1, 2, 3),
-/*weighted(for sum only)*/ Bool(),
-               dnnBackendsAndTargetsWithHalide()
-));
-
-////////////////////////////////////////////////////////////////////////////
-// Mixed backends
-////////////////////////////////////////////////////////////////////////////
-#ifdef HAVE_HALIDE
-TEST(MixedBackends_Halide_Default_Halide, Accuracy)
-{
-    // Just a layer that supports Halide backend.
-    LayerParams lrn;
-    lrn.type = "LRN";
-    lrn.name = "testLRN";
-
-    // Some of layers that doesn't supports Halide backend yet.
-    LayerParams mvn;
-    mvn.type = "MVN";
-    mvn.name = "testMVN";
-
-    // Halide layer again.
-    LayerParams lrn2;
-    lrn2.type = "LRN";
-    lrn2.name = "testLRN2";
-
-    Net net;
-    int lrnId = net.addLayer(lrn.name, lrn.type, lrn);
-    net.connect(0, 0, lrnId, 0);
-    net.addLayerToPrev(mvn.name, mvn.type, mvn);
-    net.addLayerToPrev(lrn2.name, lrn2.type, lrn2);
-
-    int sz[] = {4, 3, 5, 6};
-    Mat input(4, &sz[0], CV_32F);
-    randu(input, -1.0f, 1.0f);
-    net.setInput(input);
-    net.setPreferableBackend(DNN_BACKEND_OPENCV);
-    Mat outputDefault = net.forward().clone();
-
-    net.setPreferableBackend(DNN_BACKEND_HALIDE);
-    net.setInput(input);
-    Mat outputHalide = net.forward().clone();
-    normAssert(outputDefault, outputHalide);
-
-    net.setPreferableTarget(DNN_TARGET_OPENCL);
-    net.setInput(input);
-    outputHalide = net.forward().clone();
-    normAssert(outputDefault, outputHalide);
-}
-#endif  // HAVE_HALIDE
-
-INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_Halide_layers, dnnBackendsAndTargetsWithHalide());
-
-}} // namespace
diff --git a/modules/dnn/test/test_ie_models.cpp b/modules/dnn/test/test_ie_models.cpp
index 135caa906495..eff389035d10 100644
--- a/modules/dnn/test/test_ie_models.cpp
+++ b/modules/dnn/test/test_ie_models.cpp
@@ -9,7 +9,6 @@
 #ifdef HAVE_INF_ENGINE
 #include <opencv2/core/utils/filesystem.hpp>
 
-
 //
 // Synchronize headers include statements with src/op_inf_engine.hpp
 //
@@ -26,14 +25,11 @@
 #pragma GCC visibility push(default)
 #endif
 
-#include <inference_engine.hpp>
-#include <ie_icnn_network.hpp>
-#include <ie_extension.h>
-
 #if defined(__GNUC__)
 #pragma GCC visibility pop
 #endif
 
+#include <openvino/runtime/core.hpp>
 
 namespace opencv_test { namespace {
 
@@ -62,7 +58,6 @@ static void initDLDTDataPath()
 
 using namespace cv;
 using namespace cv::dnn;
-using namespace InferenceEngine;
 
 struct OpenVINOModelTestCaseInfo
 {
@@ -161,27 +156,6 @@ inline static std::string getOpenVINOModel(const std::string &modelName, bool is
     return std::string();
 }
 
-static inline void genData(const InferenceEngine::TensorDesc& desc, Mat& m, Blob::Ptr& dataPtr)
-{
-    const std::vector<size_t>& dims = desc.getDims();
-    if (desc.getPrecision() == InferenceEngine::Precision::FP32)
-    {
-        m.create(std::vector<int>(dims.begin(), dims.end()), CV_32F);
-        randu(m, -1, 1);
-        dataPtr = make_shared_blob<float>(desc, (float*)m.data);
-    }
-    else if (desc.getPrecision() == InferenceEngine::Precision::I32)
-    {
-        m.create(std::vector<int>(dims.begin(), dims.end()), CV_32S);
-        randu(m, -100, 100);
-        dataPtr = make_shared_blob<int>(desc, (int*)m.data);
-    }
-    else
-    {
-        FAIL() << "Unsupported precision: " << desc.getPrecision();
-    }
-}
-
 void runIE(Target target, const std::string& xmlPath, const std::string& binPath,
            std::map<std::string, cv::Mat>& inputsMap, std::map<std::string, cv::Mat>& outputsMap)
 {
@@ -189,25 +163,12 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
 
     std::string device_name;
 
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
-    Core ie;
-#else
-    InferenceEnginePluginPtr enginePtr;
-    InferencePlugin plugin;
-#endif
+    ov::Core core;
 
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019030000)
-    CNNNetwork net = ie.ReadNetwork(xmlPath, binPath);
-#else
-    CNNNetReader reader;
-    reader.ReadNetwork(xmlPath);
-    reader.ReadWeights(binPath);
-
-    CNNNetwork net = reader.getNetwork();
-#endif
+    auto model = core.read_model(xmlPath, binPath);
 
-    ExecutableNetwork netExec;
-    InferRequest infRequest;
+    ov::CompiledModel compiledModel;
+    ov::InferRequest infRequest;
 
     try
     {
@@ -230,10 +191,6 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
                 CV_Error(Error::StsNotImplemented, "Unknown target");
         };
 
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
-        auto dispatcher = InferenceEngine::PluginDispatcher({""});
-        enginePtr = dispatcher.getPluginByDevice(device_name);
-#endif
         if (target == DNN_TARGET_CPU || target == DNN_TARGET_FPGA)
         {
             std::string suffixes[] = {"_avx2", "_sse4", ""};
@@ -255,68 +212,90 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
 #endif  // _WIN32
                 try
                 {
-                    IExtensionPtr extension = make_so_pointer<IExtension>(libName);
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
-                    ie.AddExtension(extension, device_name);
-#else
-                    enginePtr->AddExtension(extension, 0);
-#endif
+                    core.add_extension(libName);
                     break;
                 }
                 catch(...) {}
             }
             // Some of networks can work without a library of extra layers.
         }
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
-        netExec = ie.LoadNetwork(net, device_name);
-#else
-        plugin = InferencePlugin(enginePtr);
-        netExec = plugin.LoadNetwork(net, {});
-#endif
-        infRequest = netExec.CreateInferRequest();
+        compiledModel = core.compile_model(model, device_name);
+        infRequest = compiledModel.create_infer_request();
     }
     catch (const std::exception& ex)
     {
         CV_Error(Error::StsAssert, format("Failed to initialize Inference Engine backend: %s", ex.what()));
     }
 
-    // Fill input blobs.
+    // Fill input tensors.
     inputsMap.clear();
-    BlobMap inputBlobs;
-    for (auto& it : net.getInputsInfo())
+    for (auto&& it : model->inputs())
     {
-        const InferenceEngine::TensorDesc& desc = it.second->getTensorDesc();
-        genData(desc, inputsMap[it.first], inputBlobs[it.first]);
+        auto type = it.get_element_type();
+        auto shape = it.get_shape();
+        auto& m = inputsMap[it.get_any_name()];
+
+        auto tensor = ov::Tensor(type, shape);
+        if (type == ov::element::f32)
+        {
+            m.create(std::vector<int>(shape.begin(), shape.end()), CV_32F);
+            randu(m, -1, 1);
+        }
+        else if (type == ov::element::i32)
+        {
+            m.create(std::vector<int>(shape.begin(), shape.end()), CV_32S);
+            randu(m, -100, 100);
+        }
+        else
+        {
+            FAIL() << "Unsupported precision: " << type;
+        }
+        std::memcpy(tensor.data(), m.data, tensor.get_byte_size());
+
         if (cvtest::debugLevel > 0)
         {
-            const std::vector<size_t>& dims = desc.getDims();
-            std::cout << "Input: '" << it.first << "' precision=" << desc.getPrecision() << " dims=" << dims.size() << " [";
-            for (auto d : dims)
+            std::cout << "Input: '" << it.get_any_name() << "' precision=" << type << " dims=" << shape << " [";
+            for (auto d : shape)
                 std::cout << " " << d;
-            std::cout << "]  ocv_mat=" << inputsMap[it.first].size << " of " << typeToString(inputsMap[it.first].type()) << std::endl;
+            std::cout << "]  ocv_mat=" << inputsMap[it.get_any_name()].size << " of " << typeToString(inputsMap[it.get_any_name()].type()) << std::endl;
         }
+        infRequest.set_tensor(it, tensor);
     }
-    infRequest.SetInput(inputBlobs);
+    infRequest.infer();
+
 
-    // Fill output blobs.
+    // Fill output tensors.
     outputsMap.clear();
-    BlobMap outputBlobs;
-    for (auto& it : net.getOutputsInfo())
+    for (const auto& it : model->outputs())
     {
-        const InferenceEngine::TensorDesc& desc = it.second->getTensorDesc();
-        genData(desc, outputsMap[it.first], outputBlobs[it.first]);
+        auto type = it.get_element_type();
+        auto shape = it.get_shape();
+        auto& m = outputsMap[it.get_any_name()];
+
+        auto tensor = infRequest.get_tensor(it);
+        if (type == ov::element::f32)
+        {
+            m.create(std::vector<int>(shape.begin(), shape.end()), CV_32F);
+        }
+        else if (type == ov::element::i32)
+        {
+            m.create(std::vector<int>(shape.begin(), shape.end()), CV_32S);
+        }
+        else
+        {
+            FAIL() << "Unsupported precision: " << type;
+        }
+        std::memcpy(m.data, tensor.data(), tensor.get_byte_size());
+
         if (cvtest::debugLevel > 0)
         {
-            const std::vector<size_t>& dims = desc.getDims();
-            std::cout << "Output: '" << it.first << "' precision=" << desc.getPrecision() << " dims=" << dims.size() << " [";
-            for (auto d : dims)
+            std::cout << "Output: '" << it.get_any_name() << "' precision=" << type << " dims=" << shape << " [";
+            for (auto d : shape)
                 std::cout << " " << d;
-            std::cout << "]  ocv_mat=" << outputsMap[it.first].size << " of " << typeToString(outputsMap[it.first].type()) << std::endl;
+            std::cout << "]  ocv_mat=" << outputsMap[it.get_any_name()].size << " of " << typeToString(outputsMap[it.get_any_name()].type()) << std::endl;
         }
-    }
-    infRequest.SetOutput(outputBlobs);
 
-    infRequest.Infer();
+    }
 }
 
 void runCV(Backend backendId, Target targetId, const std::string& xmlPath, const std::string& binPath,
@@ -465,8 +444,8 @@ TEST_P(DNNTestHighLevelAPI, predict)
     const std::string modelPath = getOpenVINOModel(modelName, isFP16);
     ASSERT_FALSE(modelPath.empty()) << modelName;
 
-    std::string xmlPath = findDataFile(modelPath + ".xml");
-    std::string binPath = findDataFile(modelPath + ".bin");
+    std::string xmlPath = findDataFile(modelPath + ".xml", false);
+    std::string binPath = findDataFile(modelPath + ".bin", false);
 
     Model model(xmlPath, binPath);
     Mat frame = imread(findDataFile("dnn/googlenet_1.png"));
diff --git a/modules/dnn/test/test_int8_layers.cpp b/modules/dnn/test/test_int8_layers.cpp
index 8b3cd01f2989..bc5d9388a98d 100644
--- a/modules/dnn/test/test_int8_layers.cpp
+++ b/modules/dnn/test/test_int8_layers.cpp
@@ -14,6 +14,9 @@ testing::internal::ParamGenerator< tuple<Backend, Target> > dnnBackendsAndTarget
     targets.push_back(make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU));
 #ifdef HAVE_TIMVX
     targets.push_back(make_tuple(DNN_BACKEND_TIMVX, DNN_TARGET_NPU));
+#endif
+#ifdef HAVE_INF_ENGINE
+    targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_CPU));
 #endif
     return testing::ValuesIn(targets);
 }
@@ -66,8 +69,6 @@ class Test_Int8_layers : public DNNTestLayer
             outPath = _tf("onnx/data/output_" + basename);
         }
         ASSERT_FALSE(net.empty());
-        net.setPreferableBackend(backend);
-        net.setPreferableTarget(target);
 
         for (int i = 0; i < numInps; i++)
             inps[i] = blobFromNPY(inpPath + ((numInps > 1) ? cv::format("_%d.npy", i) : ".npy"));
@@ -78,6 +79,8 @@ class Test_Int8_layers : public DNNTestLayer
         qnet = net.quantize(inps, CV_8S, CV_8S, perChannel);
         qnet.getInputDetails(inputScale, inputZp);
         qnet.getOutputDetails(outputScale, outputZp);
+        qnet.setPreferableBackend(backend);
+        qnet.setPreferableTarget(target);
 
         // Quantize inputs to int8
         // int8_value = float_value/scale + zero-point
@@ -94,7 +97,7 @@ class Test_Int8_layers : public DNNTestLayer
         for (int i = 0; i < numOuts; i++)
         {
             outs_int8[i].convertTo(outs_dequantized[i], CV_32F, outputScale[i], -(outputScale[i] * outputZp[i]));
-            normAssert(refs[i], outs_dequantized[i], "", l1, lInf);
+            normAssert(refs[i], outs_dequantized[i], basename.c_str(), l1, lInf);
         }
     }
 };
@@ -197,10 +200,13 @@ TEST_P(Test_Int8_layers, Padding)
 
 TEST_P(Test_Int8_layers, AvePooling)
 {
-    testLayer("layer_pooling_ave", "Caffe", 0.0021, 0.0075);
+    // Some tests failed with OpenVINO due to wrong padded area calculation
+    if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        testLayer("layer_pooling_ave", "Caffe", 0.0021, 0.0075);
     testLayer("ave_pool_same", "TensorFlow", 0.00153, 0.0041);
     testLayer("average_pooling_1d", "ONNX", 0.002, 0.0048);
-    testLayer("average_pooling", "ONNX", 0.0014, 0.0032);
+    if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        testLayer("average_pooling", "ONNX", 0.0014, 0.0032);
     testLayer("average_pooling_dynamic_axes", "ONNX", 0.0014, 0.006);
 
     if (target != DNN_TARGET_CPU)
@@ -216,8 +222,6 @@ TEST_P(Test_Int8_layers, MaxPooling)
         throw SkipTestException("Only CPU is supported");
     testLayer("pool_conv_3d", "ONNX", 0.0033, 0.0124);
 
-    /* All the below tests have MaxPooling as last layer, so computeMaxIdx is set to true
-       which is not supported by int8 maxpooling
     testLayer("layer_pooling_max", "Caffe", 0.0021, 0.004);
     testLayer("max_pool_even", "TensorFlow", 0.0048, 0.0139);
     testLayer("max_pool_odd_valid", "TensorFlow", 0.0043, 0.012);
@@ -227,7 +231,7 @@ TEST_P(Test_Int8_layers, MaxPooling)
     testLayer("two_maxpooling_1d", "ONNX", 0.0037, 0.0052);
     testLayer("maxpooling", "ONNX", 0.0034, 0.0065);
     testLayer("two_maxpooling", "ONNX", 0.0025, 0.0052);
-    testLayer("max_pool3d", "ONNX", 0.0028, 0.0069);*/
+    testLayer("max_pool3d", "ONNX", 0.0028, 0.0069);
 }
 
 TEST_P(Test_Int8_layers, Reduce)
@@ -322,7 +326,10 @@ TEST_P(Test_Int8_layers, DISABLED_Softmax_unfused_ONNX)  // FIXIT Support 'Ident
 TEST_P(Test_Int8_layers, Concat)
 {
     testLayer("layer_concat_shared_input", "Caffe", 0.0076, 0.029, 1, 1, true, false);
-    testLayer("concat_axis_1", "TensorFlow", 0.0056, 0.017);
+    if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
+        // Crashes with segfault
+        testLayer("concat_axis_1", "TensorFlow", 0.0056, 0.017);
+    }
     testLayer("keras_pad_concat", "TensorFlow", 0.0032, 0.0089);
     testLayer("concat_3d", "TensorFlow", 0.005, 0.014);
     testLayer("concatenation", "ONNX", 0.0032, 0.009);
@@ -366,7 +373,7 @@ TEST_P(Test_Int8_layers, InnerProduct)
     testLayer("matmul_layout", "TensorFlow", 0.035, 0.06);
     testLayer("tf2_dense", "TensorFlow", 0, 0);
     testLayer("matmul_add", "ONNX", 0.041, 0.082);
-    testLayer("linear", "ONNX", 0.0018, 0.0029);
+    testLayer("linear", "ONNX", 0.0027, 0.0046);
 
     if (backend == DNN_BACKEND_TIMVX)
         testLayer("constant", "ONNX", 0.00048, 0.0013);
@@ -384,7 +391,7 @@ TEST_P(Test_Int8_layers, InnerProduct)
         testLayer("matmul_layout", "TensorFlow", 0.035, 0.095, 1, 1, false, true, false, false);
         testLayer("tf2_dense", "TensorFlow", 0, 0, 1, 1, false, true, false, false);
         testLayer("matmul_add", "ONNX", 0.041, 0.082, 1, 1, false, true, false, false);
-        testLayer("linear", "ONNX", 0.0022, 0.004, 1, 1, false, true, false, false);
+        testLayer("linear", "ONNX", 0.0027, 0.005, 1, 1, false, true, false, false);
         testLayer("constant", "ONNX", 0.00038, 0.0012, 1, 1, false, true, false, false);
         testLayer("lin_with_constant", "ONNX", 0.0011, 0.0016, 1, 1, false, true, false, false);
     }
@@ -400,10 +407,13 @@ TEST_P(Test_Int8_layers, Reshape)
         testLayer("reshape_nchw", "TensorFlow", 0.0089, 0.029);
 
     testLayer("reshape_conv", "TensorFlow", 0.035, 0.054);
-    testLayer("reshape_reduce", "TensorFlow", 0.0042, 0.0078);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        testLayer("reshape_reduce", "TensorFlow", 0.0053, 0.011);
+    else
+        testLayer("reshape_reduce", "TensorFlow", 0.0042, 0.0078);
     testLayer("reshape_as_shape", "TensorFlow", 0.0014, 0.0028);
     testLayer("reshape_no_reorder", "TensorFlow", 0.0014, 0.0028);
-    testLayer("shift_reshape_no_reorder", "TensorFlow", 0.0063, 0.014);
+    testLayer("shift_reshape_no_reorder", "TensorFlow", 0.0063, backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.016 : 0.014);
     testLayer("dynamic_reshape", "ONNX", 0.0047, 0.0079);
     testLayer("dynamic_reshape_opset_11", "ONNX", 0.0048, 0.0081);
     testLayer("flatten_by_prod", "ONNX", 0.0048, 0.0081);
@@ -491,10 +501,10 @@ TEST_P(Test_Int8_layers, Eltwise)
 
     testLayer("conv_2_inps", "Caffe", 0.0086, 0.0232, 2, 1, true, false);
     testLayer("eltwise_sub", "TensorFlow", 0.015, 0.047);
-    testLayer("eltwise_add_vec", "TensorFlow", 0.037, 0.21); // tflite 0.0095, 0.0365
+    testLayer("eltwise_add_vec", "TensorFlow", 0.037, backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.24 : 0.21); // tflite 0.0095, 0.0365
     testLayer("eltwise_mul_vec", "TensorFlow", 0.173, 1.14); // tflite 0.0028, 0.017
     testLayer("channel_broadcast", "TensorFlow", 0.0025, 0.0063);
-    testLayer("split_equals", "TensorFlow", 0.02, 0.065);
+    testLayer("split_equals", "TensorFlow", backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.021 : 0.02, 0.065);
     testLayer("mul", "ONNX", 0.0039, 0.014);
     testLayer("split_max", "ONNX", 0.004, 0.012);
 }
@@ -551,10 +561,10 @@ class Test_Int8_nets : public DNNTestLayer
         Mat blob = readTensorFromONNX(findDataFile("dnn/onnx/data/input_" + basename + ".pb"));
         Mat ref = readTensorFromONNX(findDataFile("dnn/onnx/data/output_" + basename + ".pb"));
         Net baseNet = readNetFromONNX(onnxmodel);
-        baseNet.setPreferableBackend(backend);
-        baseNet.setPreferableTarget(target);
 
         Net qnet = baseNet.quantize(blob, CV_32F, CV_32F, perChannel);
+        qnet.setPreferableBackend(backend);
+        qnet.setPreferableTarget(target);
         qnet.setInput(blob);
         Mat out = qnet.forward();
 
@@ -699,9 +709,6 @@ TEST_P(Test_Int8_nets, AlexNet)
 #else
     applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
 #endif
-    if (backend != DNN_BACKEND_OPENCV)
-        throw SkipTestException("Only OpenCV backend is supported");
-
     if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
@@ -741,9 +748,10 @@ TEST_P(Test_Int8_nets, GoogLeNet)
 
 TEST_P(Test_Int8_nets, ResNet50)
 {
-    applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
-    if (backend != DNN_BACKEND_OPENCV)
-        throw SkipTestException("Only OpenCV backend is supported");
+    applyTestTag(
+        target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
 
     if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
@@ -774,6 +782,8 @@ TEST_P(Test_Int8_nets, DenseNet121)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
 
     Net net = readNetFromCaffe(findDataFile("dnn/DenseNet_121.prototxt", false),
                                findDataFile("dnn/DenseNet_121.caffemodel", false));
@@ -837,7 +847,7 @@ TEST_P(Test_Int8_nets, RCNN_ILSVRC13)
     if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
 
-    float l1 = 0.02, lInf = 0.042;
+    float l1 = 0.02, lInf = 0.047;
     testONNXNet("rcnn_ilsvrc13", l1, lInf);
 }
 
@@ -878,14 +888,14 @@ TEST_P(Test_Int8_nets, MobileNet_SSD)
     if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
 
-    Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy.prototxt", false),
-                               findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false));
+    Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", false),
+                               findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", false));
 
     Mat inp = imread(_tf("street.png"));
     Mat blob = blobFromImage(inp, 1.0 / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
     Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy"));
 
-    float confThreshold = FLT_MIN, scoreDiff = 0.059, iouDiff = 0.11;
+    float confThreshold = FLT_MIN, scoreDiff = 0.084, iouDiff = 0.43;
     testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
 }
 
@@ -955,6 +965,8 @@ TEST_P(Test_Int8_nets, opencv_face_detector)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
 
     Net net = readNetFromCaffe(findDataFile("dnn/opencv_face_detector.prototxt"),
                                findDataFile("dnn/opencv_face_detector.caffemodel", false));
@@ -1021,7 +1033,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_resnet50)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
-
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
 
@@ -1048,7 +1061,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_inceptionv2)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
-
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
 
@@ -1079,6 +1093,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_vgg16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
 
     Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_vgg16.prototxt"),
                                findDataFile("dnn/VGG16_faster_rcnn_final.caffemodel", false));
@@ -1106,6 +1122,8 @@ TEST_P(Test_Int8_nets, FasterRCNN_zf)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
 
     Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_zf.prototxt"),
                                findDataFile("dnn/ZF_faster_rcnn_final.caffemodel", false));
@@ -1138,6 +1156,9 @@ TEST_P(Test_Int8_nets, RFCN)
                                     0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16);
 
     float confThreshold = 0.8, scoreDiff = 0.15, iouDiff = 0.11;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
+        iouDiff = 0.12;
+    }
     testFaster(net, ref, confThreshold, scoreDiff, iouDiff);
 }
 
@@ -1182,7 +1203,10 @@ TEST_P(Test_Int8_nets, YoloVoc)
 
 TEST_P(Test_Int8_nets, TinyYoloVoc)
 {
-    applyTestTag(CV_TEST_TAG_MEMORY_512MB);
+    applyTestTag(
+        CV_TEST_TAG_MEMORY_512MB,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
 
     if (target == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
@@ -1317,6 +1341,8 @@ TEST_P(Test_Int8_nets, YOLOv4_tiny)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
 
     const float confThreshold = 0.6;
 
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 763d94b99cf2..48c968fb6a42 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -215,7 +215,13 @@ TEST_P(Test_Caffe_layers, InnerProduct)
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
 
-    testLayerUsingCaffeModels("layer_inner_product", true);
+    double l1 = 0.0, lInf = 0.0;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
+    {
+        l1 = 5e-3;
+        lInf = 2e-2;
+    }
+    testLayerUsingCaffeModels("layer_inner_product", true, true, l1, lInf);
 }
 
 TEST_P(Test_Caffe_layers, Pooling_max)
@@ -301,6 +307,11 @@ TEST_P(Test_Caffe_layers, Dropout)
 
 TEST_P(Test_Caffe_layers, Concat)
 {
+    if (cvtest::skipUnstableTests && (backend == DNN_BACKEND_VKCOM))
+    {
+        throw SkipTestException("Test_Caffe_layers.Concat test produces unstable result with Vulkan");
+    }
+
 #if defined(INF_ENGINE_RELEASE)
 #if INF_ENGINE_VER_MAJOR_GE(2019010000) && INF_ENGINE_VER_MAJOR_LT(2019020000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)
@@ -407,10 +418,12 @@ TEST_P(Test_Caffe_layers, layer_prelu_fc)
 
 TEST_P(Test_Caffe_layers, Reshape_Split_Slice)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
 
     Net net = readNetFromCaffe(_tf("reshape_and_slice_routines.prototxt"));
     ASSERT_FALSE(net.empty());
@@ -756,11 +769,15 @@ TEST_F(Layer_RNN_Test, get_set_test)
 
 TEST_P(Test_Caffe_layers, Accum)
 {
+#ifdef OPENCV_DNN_EXTERNAL_PROTOBUF
+    throw SkipTestException("Requires patched protobuf");
+#else
     if (backend == DNN_BACKEND_OPENCV && target != DNN_TARGET_CPU)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL, CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
 
     testLayerUsingCaffeModels("accum", false, false, 0.0, 0.0, 2);
     testLayerUsingCaffeModels("accum_ref", false, false, 0.0, 0.0, 2);
+#endif
 }
 
 TEST_P(Test_Caffe_layers, FlowWarp)
@@ -780,27 +797,41 @@ TEST_P(Test_Caffe_layers, ChannelNorm)
 
 TEST_P(Test_Caffe_layers, DataAugmentation)
 {
+#ifdef OPENCV_DNN_EXTERNAL_PROTOBUF
+    throw SkipTestException("Requires patched protobuf");
+#else
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     testLayerUsingCaffeModels("data_augmentation", true, false);
     testLayerUsingCaffeModels("data_augmentation_2x1", true, false);
     testLayerUsingCaffeModels("data_augmentation_8x6", true, false);
+#endif
 }
 
 TEST_P(Test_Caffe_layers, Resample)
 {
+#ifdef OPENCV_DNN_EXTERNAL_PROTOBUF
+    throw SkipTestException("Requires patched protobuf");
+#else
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend != DNN_BACKEND_OPENCV)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
     testLayerUsingCaffeModels("nearest_2inps", false, false, 0.0, 0.0, 2);
     testLayerUsingCaffeModels("nearest", false, false);
+#endif
 }
 
 TEST_P(Test_Caffe_layers, Correlation)
 {
+#ifdef OPENCV_DNN_EXTERNAL_PROTOBUF
+    throw SkipTestException("Requires patched protobuf");
+#else
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER,
                      CV_TEST_TAG_DNN_SKIP_OPENCL, CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     testLayerUsingCaffeModels("correlation", false, false, 0.0, 0.0, 2);
+#endif
 }
 
 TEST_P(Test_Caffe_layers, Convolution2Inputs)
@@ -1587,7 +1618,7 @@ class CustomInterpLayer CV_FINAL : public Layer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        if (inputs_arr.depth() == CV_16S)
+        if (inputs_arr.depth() == CV_16F)
         {
             forward_fallback(inputs_arr, outputs_arr, internals_arr);
             return;
@@ -1641,12 +1672,11 @@ class CustomInterpLayer CV_FINAL : public Layer
     int outWidth, outHeight, zoomFactor;
 };
 
-#ifndef OPENCV_DNN_EXTERNAL_PROTOBUF
 TEST_P(Test_Caffe_layers, Interp)
-#else
-TEST_P(Test_Caffe_layers, DISABLED_Interp)  // requires patched protobuf (available in OpenCV source tree only)
-#endif
 {
+#ifdef OPENCV_DNN_EXTERNAL_PROTOBUF
+    throw SkipTestException("Requires patched protobuf");
+#else
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);  // exception
@@ -1670,6 +1700,7 @@ TEST_P(Test_Caffe_layers, DISABLED_Interp)  // requires patched protobuf (availa
 
     // Test an implemented layer.
     testLayerUsingCaffeModels("layer_interp", false, false);
+#endif
 }
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_Caffe_layers, dnnBackendsAndTargets());
@@ -1769,6 +1800,50 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_ShuffleChannel, Combine(
 /*group*/        Values(1, 2, 3, 6), dnnBackendsAndTargets(/*with IE*/ false)
 ));
 
+TEST(Layer_Test_ReduceMean, accuracy_input_0)
+{
+    vector<int> szData = { 2, 1, 2, 1 ,2 };
+    std::vector<float> initData = { 0, 1, 2, 3, 4, 5, 6, 7 };
+    Mat inpInitA(szData, CV_32FC1, Mat(initData).data);
+    std::vector<float> resAxes0 = { 2, 3, 4, 5 };
+    std::vector<float> resAxes1 = { 0, 1, 2, 3, 4, 5, 6, 7 };
+    std::vector<float> resAxes2 = { 1, 2, 5, 6 };
+    std::vector<float> resAxes3 = { 0, 1, 2, 3, 4, 5, 6, 7 };
+    std::vector<float> resAxes4 = { 0.5, 2.5, 4.5, 6.5 };
+    std::vector < vector<float>> resReduceMean = { resAxes0, resAxes1, resAxes2, resAxes3, resAxes4 };
+
+
+    for (int i = 0; i < resReduceMean.size(); i++)
+    {
+        Net net;
+        LayerParams lp;
+        lp.set("keepdims", 0);
+        lp.type = "Reduce";
+        lp.set("reduce", "MEAN");
+        lp.name = "testReduceMean";
+        lp.set("axes", i);
+        lp.blobs.push_back(inpInitA);
+
+        net.addLayerToPrev(lp.name, lp.type, lp);
+        net.setInput(inpInitA);
+        net.setPreferableBackend(DNN_BACKEND_OPENCV);
+
+        Mat output = net.forward();
+        MatShape gt_shape;
+        for (int j = 0; j < szData.size(); j++)
+        {
+            if (i == j) continue;
+            gt_shape.push_back(szData[j]);
+        }
+
+        EXPECT_EQ(gt_shape, shape(output));
+
+        Mat a = output.reshape(1, output.total());
+        normAssert(a, Mat(resReduceMean[i]));
+    }
+}
+
+
 // Check if relu is not fused to convolution if we requested it's output
 TEST(Layer_Test_Convolution, relu_fusion)
 {
@@ -2024,7 +2099,7 @@ struct Layer_Test_Eltwise_bcast : testing::TestWithParam<tuple<string, int, tupl
         net.setPreferableTarget(target);
 
         Mat re;
-        ASSERT_NO_THROW(re = net.forward()); // runtime error
+        re = net.forward();
         auto ptr_re = (float *) re.data;
         for (int i = 0; i < re.total(); i++)
             if (op == "sum"){
diff --git a/modules/dnn/test/test_main.cpp b/modules/dnn/test/test_main.cpp
index a0d876b08754..b7b95d24855f 100644
--- a/modules/dnn/test/test_main.cpp
+++ b/modules/dnn/test/test_main.cpp
@@ -4,4 +4,4 @@
     #include <hpx/hpx_main.hpp>
 #endif
 
-CV_TEST_MAIN("", initDNNTests());
+CV_TEST_MAIN("", initDNNTests())
diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp
index 4ee3e013cb7d..b4b691e3183d 100644
--- a/modules/dnn/test/test_misc.cpp
+++ b/modules/dnn/test/test_misc.cpp
@@ -6,23 +6,90 @@
 // Third party copyrights are property of their respective owners.
 
 #include "test_precomp.hpp"
+#include "npy_blob.hpp"
 #include <opencv2/core/ocl.hpp>
 #include <opencv2/core/opencl/ocl_defs.hpp>
 #include <opencv2/dnn/layer.details.hpp>  // CV_DNN_REGISTER_LAYER_CLASS
 
 namespace opencv_test { namespace {
 
+TEST(blobRectToImageRect, DNN_PMODE_NULL)
+{
+    Size inputSize(50 + (rand() % 100) / 4 * 4, 50 + (rand() % 100) / 4 * 4);
+    Size imgSize(200 + (rand() % 100) / 4 * 4, 200 + (rand() % 100) / 4 * 4);
+    Rect rBlob(inputSize.width / 2 - inputSize.width / 4, inputSize.height / 2 - inputSize.height / 4, inputSize.width / 2, inputSize.height / 2);
+    Image2BlobParams paramNet;
+    paramNet.scalefactor = Scalar::all(1.f);
+    paramNet.size = inputSize;
+    paramNet.ddepth = CV_32F;
+    paramNet.mean = Scalar();
+    paramNet.swapRB = false;
+    paramNet.datalayout = DNN_LAYOUT_NHWC;
+    paramNet.paddingmode = DNN_PMODE_NULL;
+    Rect rOri = paramNet.blobRectToImageRect(rBlob, imgSize);
+    Rect rImg = Rect(rBlob.x * (float)imgSize.width / inputSize.width, rBlob.y * (float)imgSize.height / inputSize.height,
+        rBlob.width * (float)imgSize.width / inputSize.width, rBlob.height * (float)imgSize.height / inputSize.height);
+    ASSERT_EQ(rImg, rOri);
+}
+
+TEST(blobRectToImageRect, DNN_PMODE_CROP_CENTER)
+{
+    Size inputSize(50 + (rand() % 100) / 4 * 4, 50 + (rand() % 100) / 4 * 4);
+    Size imgSize(200 + (rand() % 100) / 4 * 4, 200 + (rand() % 100) / 4 * 4);
+    Rect rBlob(inputSize.width / 2 - inputSize.width / 4, inputSize.height / 2 - inputSize.height / 4, inputSize.width / 2, inputSize.height / 2);
+    Image2BlobParams paramNet;
+    paramNet.scalefactor = Scalar::all(1.f);
+    paramNet.size = inputSize;
+    paramNet.ddepth = CV_32F;
+    paramNet.mean = Scalar();
+    paramNet.swapRB = false;
+    paramNet.datalayout = DNN_LAYOUT_NHWC;
+    paramNet.paddingmode = DNN_PMODE_CROP_CENTER;
+    Rect rOri = paramNet.blobRectToImageRect(rBlob, imgSize);
+    float resizeFactor = std::max(inputSize.width / (float)imgSize.width,
+        inputSize.height / (float)imgSize.height);
+    Rect rImg = Rect((rBlob.x + 0.5 * (imgSize.width * resizeFactor - inputSize.width)) / resizeFactor, (rBlob.y + 0.5 * (imgSize.height * resizeFactor - inputSize.height)) / resizeFactor,
+        rBlob.width / resizeFactor, rBlob.height / resizeFactor);
+    ASSERT_EQ(rImg, rOri);
+}
+
+TEST(blobRectToImageRect, DNN_PMODE_LETTERBOX)
+{
+    Size inputSize(50 + (rand() % 100) / 4 * 4, 50 + (rand() % 100) / 4 * 4);
+    Size imgSize(200 + (rand() % 100) / 4 * 4, 200 + (rand() % 100) / 4 * 4);
+    Rect rBlob(inputSize.width / 2 - inputSize.width / 4, inputSize.height / 2 - inputSize.height / 4, inputSize.width / 2, inputSize.height / 2);
+    Image2BlobParams paramNet;
+    paramNet.scalefactor = Scalar::all(1.f);
+    paramNet.size = inputSize;
+    paramNet.ddepth = CV_32F;
+    paramNet.mean = Scalar();
+    paramNet.swapRB = false;
+    paramNet.datalayout = DNN_LAYOUT_NHWC;
+    paramNet.paddingmode = DNN_PMODE_LETTERBOX;
+    Rect rOri = paramNet.blobRectToImageRect(rBlob, imgSize);
+    float resizeFactor = std::min(inputSize.width / (float)imgSize.width,
+        inputSize.height / (float)imgSize.height);
+    int rh = int(imgSize.height * resizeFactor);
+    int rw = int(imgSize.width * resizeFactor);
+
+    int top = (inputSize.height - rh) / 2;
+    int left = (inputSize.width - rw) / 2;
+    Rect rImg = Rect((rBlob.x - left) / resizeFactor, (rBlob.y - top) / resizeFactor, rBlob.width / resizeFactor, rBlob.height / resizeFactor);
+    ASSERT_EQ(rImg, rOri);
+}
+
+
 TEST(blobFromImage_4ch, Regression)
 {
     Mat ch[4];
-    for(int i = 0; i < 4; i++)
-        ch[i] = Mat::ones(10, 10, CV_8U)*i;
+    for (int i = 0; i < 4; i++)
+        ch[i] = Mat::ones(10, 10, CV_8U) * i;
 
     Mat img;
     merge(ch, 4, img);
     Mat blob = dnn::blobFromImage(img, 1., Size(), Scalar(), false, false);
 
-    for(int i = 0; i < 4; i++)
+    for (int i = 0; i < 4; i++)
     {
         ch[i] = Mat(img.rows, img.cols, CV_32F, blob.ptr(0, i));
         ASSERT_DOUBLE_EQ(cvtest::norm(ch[i], cv::NORM_INF), i);
@@ -31,7 +98,7 @@ TEST(blobFromImage_4ch, Regression)
 
 TEST(blobFromImage, allocated)
 {
-    int size[] = {1, 3, 4, 5};
+    int size[] = { 1, 3, 4, 5 };
     Mat img(size[2], size[3], CV_32FC(size[1]));
     Mat blob(4, size, CV_32F);
     void* blobData = blob.data;
@@ -65,8 +132,8 @@ TEST(imagesFromBlob, Regression)
 
 TEST(blobFromImageWithParams_4ch, NHWC_scalar_scale)
 {
-    Mat img(10, 10, CV_8UC4, cv::Scalar(0,1,2,3));
-    std::vector<double> factorVec = {0.1, 0.2, 0.3, 0.4};
+    Mat img(10, 10, CV_8UC4, cv::Scalar(0, 1, 2, 3));
+    std::vector<double> factorVec = { 0.1, 0.2, 0.3, 0.4 };
 
     Scalar scalefactor(factorVec[0], factorVec[1], factorVec[2], factorVec[3]);
 
@@ -76,7 +143,7 @@ TEST(blobFromImageWithParams_4ch, NHWC_scalar_scale)
     Mat blob = dnn::blobFromImageWithParams(img, param); // [1, 10, 10, 4]
 
     float* blobPtr = blob.ptr<float>(0);
-    std::vector<float> targetVec = {(float )factorVec[0] * 0, (float )factorVec[1] * 1, (float )factorVec[2] * 2, (float )factorVec[3] * 3}; // Target Value.
+    std::vector<float> targetVec = { (float)factorVec[0] * 0, (float)factorVec[1] * 1, (float)factorVec[2] * 2, (float)factorVec[3] * 3 }; // Target Value.
     for (int hi = 0; hi < 10; hi++)
     {
         for (int wi = 0; wi < 10; wi++)
@@ -92,18 +159,51 @@ TEST(blobFromImageWithParams_4ch, NHWC_scalar_scale)
     }
 }
 
+TEST(blobFromImageWithParams_CustomPadding, letter_box)
+{
+    Mat img(40, 20, CV_8UC4, Scalar(0, 1, 2, 3));
+
+    // Custom padding value that you have added
+    Scalar customPaddingValue(5, 6, 7, 8); // Example padding value
+
+    Size targetSize(20, 20);
+
+    Mat targetImg = img.clone();
+
+    cv::copyMakeBorder(
+        targetImg, targetImg, 0, 0,
+        targetSize.width / 2,
+        targetSize.width / 2,
+        BORDER_CONSTANT,
+        customPaddingValue);
+
+    // Set up Image2BlobParams with your new functionality
+    Image2BlobParams param;
+    param.size = targetSize;
+    param.paddingmode = DNN_PMODE_LETTERBOX;
+    param.borderValue = customPaddingValue; // Use your new feature here
+
+    // Create blob with custom padding
+    Mat blob = dnn::blobFromImageWithParams(img, param);
+
+    // Create target blob for comparison
+    Mat targetBlob = dnn::blobFromImage(targetImg, 1.0, targetSize);
+
+    EXPECT_EQ(0, cvtest::norm(targetBlob, blob, NORM_INF));
+}
+
 TEST(blobFromImageWithParams_4ch, letter_box)
 {
-    Mat img(40, 20, CV_8UC4, cv::Scalar(0,1,2,3));
+    Mat img(40, 20, CV_8UC4, cv::Scalar(0, 1, 2, 3));
 
     // Construct target mat.
     Mat targetCh[4];
     // The letterbox will add zero at the left and right of output blob.
     // After the letterbox, every row data would have same value showing as valVec.
-    std::vector<uint8_t> valVec = {0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0};
+    std::vector<uint8_t> valVec = { 0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0 };
     Mat rowM(1, 20, CV_8UC1, valVec.data());
 
-    for(int i = 0; i < 4; i++)
+    for (int i = 0; i < 4; i++)
     {
         targetCh[i] = rowM * i;
     }
@@ -120,6 +220,28 @@ TEST(blobFromImageWithParams_4ch, letter_box)
     EXPECT_EQ(0, cvtest::norm(targetBlob, blob, NORM_INF));
 }
 
+TEST(blobFromImagesWithParams_4ch, multi_image)
+{
+    Mat img(10, 10, CV_8UC4, cv::Scalar(0, 1, 2, 3));
+    Scalar scalefactor(0.1, 0.2, 0.3, 0.4);
+
+    Image2BlobParams param;
+    param.scalefactor = scalefactor;
+    param.datalayout = DNN_LAYOUT_NHWC;
+
+    Mat blobs = blobFromImagesWithParams(std::vector<Mat> { img, 2 * img }, param);
+    vector<Range> ranges;
+    ranges.push_back(Range(0, 1));
+    ranges.push_back(Range(0, blobs.size[1]));
+    ranges.push_back(Range(0, blobs.size[2]));
+    ranges.push_back(Range(0, blobs.size[3]));
+    Mat blob0 = blobs(ranges);
+    ranges[0] = Range(1, 2);
+    Mat blob1 = blobs(ranges);
+
+    EXPECT_EQ(0, cvtest::norm(2 * blob0, blob1, NORM_INF));
+}
+
 TEST(readNet, Regression)
 {
     Net net = readNet(findDataFile("dnn/squeezenet_v1.1.prototxt"),
@@ -849,6 +971,35 @@ TEST_P(Test_Model_Optimizer, flexible_inputs)
     normAssert(ref, out, 0, 0);
 }
 
+TEST_P(Test_Model_Optimizer, readONNX)
+{
+    const Backend backendId = get<0>(GetParam());
+    const Target targetId = get<1>(GetParam());
+
+    ASSERT_EQ(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, backendId);
+
+    const std::string& model = findDataFile("dnn/onnx/models/convolution.onnx");
+
+    std::vector<Net> nets = {
+        // Old API
+        readNetFromModelOptimizer(model, ""),
+        readNet("", model, "dldt"),
+        // New API
+        readNetFromModelOptimizer(model),
+        readNet(model, "", "openvino")
+    };
+
+    Mat inp = blobFromNPY(findDataFile("dnn/onnx/data/input_convolution.npy"));
+    Mat ref = blobFromNPY(findDataFile("dnn/onnx/data/output_convolution.npy"));
+
+    for (int i = 0; i < nets.size(); ++i) {
+        nets[i].setPreferableTarget(targetId);
+        nets[i].setInput(inp);
+        Mat out = nets[i].forward();
+        normAssert(out, ref, format("Index: %d", i).c_str());
+    }
+}
+
 INSTANTIATE_TEST_CASE_P(/**/, Test_Model_Optimizer,
     dnnBackendsAndTargetsIE()
 );
@@ -884,14 +1035,10 @@ TEST_P(Test_two_inputs, basic)
     randu(firstInp, 0, 100);
     randu(secondInp, 0, 100);
 
-#ifndef CV_CXX11
     std::vector<String> input_names;
     input_names.push_back("data");
     input_names.push_back("second_input");
     net.setInputsNames(input_names);
-#else
-    net.setInputsNames({"data", "second_input"});
-#endif
     net.setInput(firstInp, "data", kScale);
     net.setInput(secondInp, "second_input", kScaleInv);
     net.setPreferableBackend(backendId);
diff --git a/modules/dnn/test/test_model.cpp b/modules/dnn/test/test_model.cpp
index bd03551ab853..902526930fe4 100644
--- a/modules/dnn/test/test_model.cpp
+++ b/modules/dnn/test/test_model.cpp
@@ -40,6 +40,8 @@ class Test_Model : public DNNTestLayer
         model.setPreferableTarget(target);
 
         model.setNmsAcrossClasses(nmsAcrossClasses);
+        if (target == DNN_TARGET_CPU_FP16)
+            model.enableWinograd(false);
 
         std::vector<int> classIds;
         std::vector<float> confidences;
@@ -98,7 +100,7 @@ class Test_Model : public DNNTestLayer
     void testSegmentationModel(const std::string& weights_file, const std::string& config_file,
                                const std::string& inImgPath, const std::string& outImgPath,
                                float norm, const Size& size = {-1, -1}, Scalar mean = Scalar(),
-                               double scale = 1.0, bool swapRB = false, bool crop = false)
+                               double scale = 1.0, bool swapRB = false, bool crop = false, const std::string outname = "")
     {
         checkBackend();
 
@@ -113,6 +115,9 @@ class Test_Model : public DNNTestLayer
         model.setPreferableBackend(backend);
         model.setPreferableTarget(target);
 
+        if(!outname.empty())
+            model.setOutputNames({outname});
+
         model.segment(frame, mask);
         normAssert(mask, exp, "", norm, norm);
     }
@@ -286,8 +291,9 @@ TEST_P(Test_Model, Classify)
 TEST_P(Test_Model, DetectRegion)
 {
     applyTestTag(
+        CV_TEST_TAG_MEMORY_2GB,
         CV_TEST_TAG_LONG,
-        CV_TEST_TAG_MEMORY_2GB
+        CV_TEST_TAG_DEBUG_VERYLONG
     );
 
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
@@ -346,8 +352,9 @@ TEST_P(Test_Model, DetectRegion)
 TEST_P(Test_Model, DetectRegionWithNmsAcrossClasses)
 {
     applyTestTag(
+        CV_TEST_TAG_MEMORY_2GB,
         CV_TEST_TAG_LONG,
-        CV_TEST_TAG_MEMORY_2GB
+        CV_TEST_TAG_DEBUG_VERYLONG
     );
 
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
@@ -406,6 +413,8 @@ TEST_P(Test_Model, DetectRegionWithNmsAcrossClasses)
 
 TEST_P(Test_Model, DetectionOutput)
 {
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
     // Check 'backward_compatible_check || in_out_elements_equal' failed at core/src/op/reshape.cpp:427:
     // While validating node 'v1::Reshape bbox_pred_reshape (ave_bbox_pred_rois[0]:f32{1,8,1,1}, Constant_388[0]:i64{4}) -> (f32{?,?,?,?})' with friendly_name 'bbox_pred_reshape':
@@ -447,14 +456,17 @@ TEST_P(Test_Model, DetectionOutput)
     {
         if (backend == DNN_BACKEND_OPENCV)
             scoreDiff = 4e-3;
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2022010000)
-        else if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
-            scoreDiff = 4e-2;
-#endif
         else
             scoreDiff = 2e-2;
         iouDiff = 1.8e-1;
     }
+#if defined(INF_ENGINE_RELEASE)
+        if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        {
+            scoreDiff = 0.05;
+            iouDiff = 0.08;
+        }
+#endif
 
     testDetectModel(weights_file, config_file, img_path, refClassIds, refConfidences, refBoxes,
                     scoreDiff, iouDiff, confThreshold, nmsThreshold, size, mean);
@@ -487,8 +499,8 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
         refBoxes.emplace_back(left, top, width, height);
     }
 
-    std::string weights_file = _tf("MobileNetSSD_deploy.caffemodel", false);
-    std::string config_file = _tf("MobileNetSSD_deploy.prototxt");
+    std::string weights_file = _tf("MobileNetSSD_deploy_19e3ec3.caffemodel", false);
+    std::string config_file = _tf("MobileNetSSD_deploy_19e3ec3.prototxt");
 
     Scalar mean = Scalar(127.5, 127.5, 127.5);
     double scale = 1.0 / 127.5;
@@ -508,7 +520,7 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
     }
     else if (target == DNN_TARGET_CUDA_FP16)
     {
-        scoreDiff = 0.0021;
+        scoreDiff = 0.0028;
         iouDiff = 1e-2;
     }
     float confThreshold = FLT_MIN;
@@ -592,8 +604,8 @@ TEST_P(Test_Model, Detection_normalized)
     std::vector<float> refConfidences = {0.999222f};
     std::vector<Rect2d> refBoxes = {Rect2d(0, 4, 227, 222)};
 
-    std::string weights_file = _tf("MobileNetSSD_deploy.caffemodel", false);
-    std::string config_file = _tf("MobileNetSSD_deploy.prototxt");
+    std::string weights_file = _tf("MobileNetSSD_deploy_19e3ec3.caffemodel", false);
+    std::string config_file = _tf("MobileNetSSD_deploy_19e3ec3.prototxt");
 
     Scalar mean = Scalar(127.5, 127.5, 127.5);
     double scale = 1.0 / 127.5;
@@ -626,7 +638,8 @@ TEST_P(Test_Model, Detection_normalized)
 TEST_P(Test_Model, Segmentation)
 {
     applyTestTag(
-        CV_TEST_TAG_MEMORY_2GB
+        CV_TEST_TAG_MEMORY_2GB,
+        CV_TEST_TAG_DEBUG_VERYLONG
     );
 
     float norm = 0;
@@ -659,20 +672,19 @@ TEST_P(Test_Model, Segmentation)
     if ((backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16))
         || (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16))
     {
-        norm = 2.0f;  // l1 = 0.01 lInf = 2
+        norm = 7.0f;  // l1 = 0.01 lInf = 7
     }
 
     std::string inp = _tf("dog416.png");
-    std::string weights_file = _tf("fcn8s-heavy-pascal.prototxt");
-    std::string config_file = _tf("fcn8s-heavy-pascal.caffemodel", false);
+    std::string weights_file = _tf("onnx/models/fcn-resnet50-12.onnx", false);
     std::string exp = _tf("segmentation_exp.png");
 
     Size size{128, 128};
-    double scale = 1.0;
-    Scalar mean = Scalar();
-    bool swapRB = false;
+    double scale = 0.019;
+    Scalar mean = Scalar(0.485*255, 0.456*255, 0.406*255);
+    bool swapRB = true;
 
-    testSegmentationModel(weights_file, config_file, inp, exp, norm, size, mean, scale, swapRB);
+    testSegmentationModel(weights_file, "", inp, exp, norm, size, mean, scale, swapRB, false, "out");
 }
 
 TEST_P(Test_Model, TextRecognition)
@@ -741,6 +753,8 @@ TEST_P(Test_Model, TextRecognitionWithCTCPrefixBeamSearch)
 
 TEST_P(Test_Model, TextDetectionByDB)
 {
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+
     if (target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (target == DNN_TARGET_CPU_FP16)
@@ -783,6 +797,8 @@ TEST_P(Test_Model, TextDetectionByDB)
 
 TEST_P(Test_Model, TextDetectionByEAST)
 {
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+
     std::string imgPath = _tf("text_det_test2.jpg");
     std::string weightPath = _tf("frozen_east_text_detection.pb", false);
 
diff --git a/modules/dnn/test/test_onnx_conformance.cpp b/modules/dnn/test/test_onnx_conformance.cpp
index b238427dfb21..1ca3f2f75bcd 100644
--- a/modules/dnn/test/test_onnx_conformance.cpp
+++ b/modules/dnn/test/test_onnx_conformance.cpp
@@ -311,6 +311,8 @@ static const TestCase testConformanceConfig[] = {
     {"test_gridsample_nearest", 2, 1},
     {"test_gridsample_reflection_padding", 2, 1},
     {"test_gridsample_zeros_padding", 2, 1},
+    {"test_group_normalization_epsilon", 3, 1},
+    {"test_group_normalization_example", 3, 1},
     {"test_gru_batchwise", 3, 2},
     {"test_gru_defaults", 3, 1},
     {"test_gru_seq_length", 4, 1},
@@ -339,6 +341,25 @@ static const TestCase testConformanceConfig[] = {
     {"test_isinf_negative", 1, 1},
     {"test_isinf_positive", 1, 1},
     {"test_isnan", 1, 1},
+    {"test_layer_normalization_2d_axis0", 3, 1},
+    {"test_layer_normalization_2d_axis1", 3, 1},
+    {"test_layer_normalization_2d_axis_negative_1", 3, 1},
+    {"test_layer_normalization_2d_axis_negative_2", 3, 1},
+    {"test_layer_normalization_3d_axis0_epsilon", 3, 1},
+    {"test_layer_normalization_3d_axis1_epsilon", 3, 1},
+    {"test_layer_normalization_3d_axis2_epsilon", 3, 1},
+    {"test_layer_normalization_3d_axis_negative_1_epsilon", 3, 1},
+    {"test_layer_normalization_3d_axis_negative_2_epsilon", 3, 1},
+    {"test_layer_normalization_3d_axis_negative_3_epsilon", 3, 1},
+    {"test_layer_normalization_4d_axis0", 3, 1},
+    {"test_layer_normalization_4d_axis1", 3, 1},
+    {"test_layer_normalization_4d_axis2", 3, 1},
+    {"test_layer_normalization_4d_axis3", 3, 1},
+    {"test_layer_normalization_4d_axis_negative_1", 3, 1},
+    {"test_layer_normalization_4d_axis_negative_2", 3, 1},
+    {"test_layer_normalization_4d_axis_negative_3", 3, 1},
+    {"test_layer_normalization_4d_axis_negative_4", 3, 1},
+    {"test_layer_normalization_default_axis", 3, 1},
     {"test_leakyrelu", 1, 1},
     {"test_leakyrelu_default", 1, 1},
     {"test_leakyrelu_example", 1, 1},
@@ -1257,4 +1278,4 @@ INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_conformance,
     printOnnxConfParams
 );
 
-};
+}
diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__cuda_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__cuda_denylist.inl.hpp
index 4c05f1030524..96778ef5d484 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_filter__cuda_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter__cuda_denylist.inl.hpp
@@ -46,6 +46,13 @@
 "test_conv_with_strides_and_asymmetric_padding",
 "test_conv_with_strides_no_padding",
 "test_conv_with_strides_padding",
+"test_cumsum_1d",
+"test_cumsum_1d_exclusive",
+"test_cumsum_1d_reverse",
+"test_cumsum_1d_reverse_exclusive",
+"test_cumsum_2d_axis_0",
+"test_cumsum_2d_axis_1",
+"test_cumsum_2d_negative_axis",
 "test_div_bcast",
 "test_div_uint8",
 "test_dropout_default_ratio",
diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__halide_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__halide_denylist.inl.hpp
index 4924aaf9dac0..da7170c6952e 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_filter__halide_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter__halide_denylist.inl.hpp
@@ -45,6 +45,13 @@
 "test_castlike_FLOAT_to_STRING_expanded",
 "test_castlike_STRING_to_FLOAT_expanded",
 "test_ceil",
+"test_cumsum_1d",
+"test_cumsum_1d_exclusive",
+"test_cumsum_1d_reverse",
+"test_cumsum_1d_reverse_exclusive",
+"test_cumsum_2d_axis_0",
+"test_cumsum_2d_axis_1",
+"test_cumsum_2d_negative_axis",
 "test_concat_1d_axis_negative_1",
 "test_concat_3d_axis_1",
 "test_div",
diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp
index e6a35dfab9a6..509cf6007dbe 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp
@@ -579,9 +579,7 @@ CASE(test_dropout_default_mask_ratio)
 CASE(test_dropout_default_old)
     // no filter
 CASE(test_dropout_default_ratio)
-#if SKIP_SET_1
-    SKIP;
-#endif
+    // no filter
 CASE(test_dropout_random_old)
     // no filter
 CASE(test_dynamicquantizelinear)
@@ -599,7 +597,7 @@ CASE(test_dynamicquantizelinear_min_adjusted_expanded)
 CASE(test_edge_pad)
     // no filter
 CASE(test_einsum_batch_diagonal)
-    // no filter
+    SKIP;
 CASE(test_einsum_batch_matmul)
     // no filter
 CASE(test_einsum_inner_prod)
@@ -738,6 +736,10 @@ CASE(test_gridsample_reflection_padding)
     // no filter
 CASE(test_gridsample_zeros_padding)
     // no filter
+CASE(test_group_normalization_epsilon)
+    // no filter
+CASE(test_group_normalization_example)
+    // no filter
 CASE(test_gru_batchwise)
     // no filter
 CASE(test_gru_defaults)
@@ -794,6 +796,44 @@ CASE(test_isinf_positive)
     // no filter
 CASE(test_isnan)
     // no filter
+CASE(test_layer_normalization_2d_axis0)
+    // no filter
+CASE(test_layer_normalization_2d_axis1)
+    // no filter
+CASE(test_layer_normalization_2d_axis_negative_1)
+    // no filter
+CASE(test_layer_normalization_2d_axis_negative_2)
+    // no filter
+CASE(test_layer_normalization_3d_axis0_epsilon)
+    // no filter
+CASE(test_layer_normalization_3d_axis1_epsilon)
+    // no filter
+CASE(test_layer_normalization_3d_axis2_epsilon)
+    // no filter
+CASE(test_layer_normalization_3d_axis_negative_1_epsilon)
+    // no filter
+CASE(test_layer_normalization_3d_axis_negative_2_epsilon)
+    // no filter
+CASE(test_layer_normalization_3d_axis_negative_3_epsilon)
+    // no filter
+CASE(test_layer_normalization_4d_axis0)
+    // no filter
+CASE(test_layer_normalization_4d_axis1)
+    // no filter
+CASE(test_layer_normalization_4d_axis2)
+    // no filter
+CASE(test_layer_normalization_4d_axis3)
+    // no filter
+CASE(test_layer_normalization_4d_axis_negative_1)
+    // no filter
+CASE(test_layer_normalization_4d_axis_negative_2)
+    // no filter
+CASE(test_layer_normalization_4d_axis_negative_3)
+    // no filter
+CASE(test_layer_normalization_4d_axis_negative_4)
+    // no filter
+CASE(test_layer_normalization_default_axis)
+    // no filter
 CASE(test_leakyrelu)
     // no filter
 CASE(test_leakyrelu_default)
@@ -1020,10 +1060,25 @@ CASE(test_mod_int64_fmod)
     // no filter
 CASE(test_mod_mixed_sign_float16)
     // no filter
+    if (target == DNN_TARGET_OPENCL)
+    {
+        default_l1 = 0.0011;  // Expected: (normL1) <= (l1), actual: 0.00104141 vs 1e-05
+        default_lInf = 0.0016;  // Expected: (normInf) <= (lInf), actual: 0.00156212 vs 0.0001
+    }
 CASE(test_mod_mixed_sign_float32)
     // no filter
+    if (target == DNN_TARGET_OPENCL)
+    {
+        default_l1 = 0.0011;  // Expected: (normL1) <= (l1), actual: 0.00104141 vs 1e-05
+        default_lInf = 0.0016;  // Expected: (normInf) <= (lInf), actual: 0.00156212 vs 0.0001
+    }
 CASE(test_mod_mixed_sign_float64)
     // no filter
+    if (target == DNN_TARGET_OPENCL)
+    {
+        default_l1 = 0.0011;  // Expected: (normL1) <= (l1), actual: 0.00104167 vs 1e-05
+        default_lInf = 0.0016;  // Expected: (normInf) <= (lInf), actual: 0.00156251 vs 0.0001
+    }
 CASE(test_mod_mixed_sign_int16)
     // no filter
 CASE(test_mod_mixed_sign_int32)
diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp
index 8156686428e9..f87e16a42fbc 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter__vulkan_denylist.inl.hpp
@@ -40,6 +40,13 @@
 "test_cast_STRING_to_FLOAT",
 "test_castlike_FLOAT_to_STRING_expanded",
 "test_castlike_STRING_to_FLOAT_expanded",
+"test_cumsum_1d",
+"test_cumsum_1d_exclusive",
+"test_cumsum_1d_reverse",
+"test_cumsum_1d_reverse_exclusive",
+"test_cumsum_2d_axis_0",
+"test_cumsum_2d_axis_1",
+"test_cumsum_2d_negative_axis",
 "test_concat_1d_axis_negative_1",
 "test_div_uint8",
 "test_flatten_axis0",
@@ -48,6 +55,9 @@
 "test_flatten_negative_axis1",
 "test_flatten_negative_axis2",
 "test_flatten_negative_axis4",
+"test_gather_elements_0",
+"test_gather_elements_1",
+"test_gather_elements_negative_indices",
 "test_logsoftmax_default_axis",
 "test_maxpool_2d_dilations",
 "test_maxpool_2d_same_lower",
diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp
index 292cd2a06672..0da0111990f5 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp
@@ -41,9 +41,8 @@
 "test_cast_STRING_to_FLOAT",
 "test_castlike_FLOAT_to_STRING_expanded",
 "test_castlike_STRING_to_FLOAT_expanded",
-"test_concat_1d_axis_negative_1",
+"test_concat_1d_axis_negative_1", // 1d support is required
 "test_div_uint8",  // output type mismatch
-"test_logsoftmax_default_axis",
 "test_maxpool_2d_dilations",
 "test_maxpool_2d_same_lower",
 "test_maxpool_2d_uint8",  // output type mismatch
@@ -51,7 +50,6 @@
 "test_maxpool_with_argmax_2d_precomputed_strides",
 "test_maxunpool_export_with_output_shape",  // exception during net.forward() call
 "test_mul_uint8",  // output type mismatch
-"test_softmax_default_axis",
 "test_sub_bcast",
 "test_sub_uint8",  // output type mismatch
 "test_upsample_nearest",
diff --git a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
index 0630833b1ff3..cb008e9670c4 100644
--- a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
+++ b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp
@@ -89,13 +89,6 @@
 "test_convtranspose_pad",
 "test_convtranspose_pads",
 "test_convtranspose_with_kernel",
-"test_cumsum_1d",
-"test_cumsum_1d_exclusive",
-"test_cumsum_1d_reverse",
-"test_cumsum_1d_reverse_exclusive",
-"test_cumsum_2d_axis_0",
-"test_cumsum_2d_axis_1",
-"test_cumsum_2d_negative_axis",
 "test_dequantizelinear",
 "test_dequantizelinear_axis",
 "test_det_2d",
@@ -110,11 +103,7 @@
 "test_dynamicquantizelinear_min_adjusted",
 "test_dynamicquantizelinear_min_adjusted_expanded",
 "test_edge_pad",
-"test_einsum_batch_diagonal",
-"test_einsum_batch_matmul",
 "test_einsum_inner_prod",
-"test_einsum_sum",
-"test_einsum_transpose",
 "test_equal",
 "test_equal_bcast",
 "test_expand_dim_changed",
@@ -125,9 +114,6 @@
 "test_gather_0",
 "test_gather_1",
 "test_gather_2d_indices",
-"test_gather_elements_0",
-"test_gather_elements_1",
-"test_gather_elements_negative_indices",
 "test_gather_negative_indices",
 "test_gathernd_example_float32",
 "test_gathernd_example_int32",
@@ -135,8 +121,6 @@
 "test_gemm_all_attributes",
 "test_gemm_alpha",
 "test_gemm_beta",
-"test_gemm_default_matrix_bias",
-"test_gemm_default_no_bias",
 "test_gemm_default_scalar_bias",
 "test_gemm_default_single_elem_vector_bias",
 "test_gemm_default_vector_bias",
@@ -173,8 +157,6 @@
 "test_if",
 "test_if_opt",
 "test_if_seq",
-"test_instancenorm_epsilon",
-"test_instancenorm_example",
 "test_isinf",
 "test_isinf_negative",
 "test_isinf_positive",
@@ -226,9 +208,6 @@
 "test_min_uint8",
 "test_mod_broadcast",
 "test_mod_int64_fmod",
-"test_mod_mixed_sign_float16",
-"test_mod_mixed_sign_float32",
-"test_mod_mixed_sign_float64",
 "test_mod_mixed_sign_int16",
 "test_mod_mixed_sign_int32",
 "test_mod_mixed_sign_int64",
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 49908e7ff1aa..82b10fb1ba64 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -9,8 +9,18 @@
 #include "test_precomp.hpp"
 #include "npy_blob.hpp"
 #include <opencv2/dnn/shape_utils.hpp>
+#include <numeric>
 namespace opencv_test { namespace {
 
+void yoloPostProcessing(
+    std::vector<Mat>& outs,
+    std::vector<int>& keep_classIds,
+    std::vector<float>& keep_confidences,
+    std::vector<Rect2d>& keep_boxes,
+    float conf_threshold,
+    float iou_threshold,
+    const std::string& test_name);
+
 template<typename TString>
 static std::string _tf(TString filename, bool required = true)
 {
@@ -52,8 +62,9 @@ class Test_ONNX_layers : public DNNTestLayer
     }
 
     void testONNXModels(const String& basename, const Extension ext = npy,
-                        const double l1 = 0, const float lInf = 0, const bool useSoftmax = false,
-                        bool checkNoFallbacks = true, int numInps = 1)
+                        double l1 = 0, double lInf = 0, const bool useSoftmax = false,
+                        bool checkNoFallbacks = true, int numInps = 1,
+                        bool testShapes = true, bool useWinograd = true)
     {
         String onnxmodel = _tf("models/" + basename + ".onnx", required);
         std::vector<Mat> inps(numInps);
@@ -75,10 +86,12 @@ class Test_ONNX_layers : public DNNTestLayer
         Net net = readNetFromONNX(onnxmodel);
         ASSERT_FALSE(net.empty());
 
-        testInputShapes(net, inps);
+        if (testShapes)
+            testInputShapes(net, inps);
 
         net.setPreferableBackend(backend);
         net.setPreferableTarget(target);
+        net.enableWinograd(useWinograd);
 
         std::vector<String> inputNames;
         for (int i = 0; i < numInps; ++i)
@@ -102,7 +115,12 @@ class Test_ONNX_layers : public DNNTestLayer
             netSoftmax.setInput(ref);
             ref = netSoftmax.forward();
         }
-        normAssert(ref, out, "", l1 ? l1 : default_l1, lInf ? lInf : default_lInf);
+        if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        {
+            l1 = std::max(l1, 1.4e-3);
+            lInf = std::max(lInf, 8e-3);
+        }
+        normAssert(ref, out, basename.c_str(), l1 ? l1 : default_l1, lInf ? lInf : default_lInf);
         if (checkNoFallbacks)
             expectNoFallbacksFromIE(net);
     }
@@ -110,9 +128,6 @@ class Test_ONNX_layers : public DNNTestLayer
 
 TEST_P(Test_ONNX_layers, InstanceNorm)
 {
-    if(backend == DNN_BACKEND_CUDA)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA); /* MVN is not supported */
-
     if (target == DNN_TARGET_MYRIAD)
         testONNXModels("instancenorm", npy, 0, 0, false, false);
     else
@@ -238,6 +253,14 @@ TEST_P(Test_ONNX_layers, GatherMulti)
     testONNXModels("gather_multi", npy, 0, 0, false, false);
 }
 
+TEST_P(Test_ONNX_layers, Gather_shared_indices) {
+    testONNXModels("gather_shared_indices", npy, 0, 0, false, false, 1);
+}
+
+TEST_P(Test_ONNX_layers, Two_resizes_with_shared_subgraphs) {
+    testONNXModels("two_resizes_with_shared_subgraphs", npy, 0, 0, false, false, 3, /*testShapes*/ false);
+}
+
 TEST_P(Test_ONNX_layers, Convolution3D)
 {
     if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
@@ -666,6 +689,9 @@ TEST_P(Test_ONNX_layers, Compare_GT)
 
     testONNXModels("greater");
 }
+TEST_P(Test_ONNX_layers, Greater_input_dtype_int64) {
+    testONNXModels("greater_input_dtype_int64");
+}
 
 TEST_P(Test_ONNX_layers, Compare_LT)
 {
@@ -773,6 +799,17 @@ TEST_P(Test_ONNX_layers, Concatenation)
     testONNXModels("concat_const_blobs");
 }
 
+TEST_P(Test_ONNX_layers, CumSumExclusiveInplace)
+{
+    testONNXModels("cumsum_exclusive_inplace");
+}
+
+TEST_P(Test_ONNX_layers, Range)
+{
+    testONNXModels("range_float");
+    testONNXModels("range_float_negative");
+}
+
 TEST_P(Test_ONNX_layers, Eltwise3D)
 {
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
@@ -982,9 +1019,21 @@ TEST_P(Test_ONNX_layers, MatMulAdd)
 TEST_P(Test_ONNX_layers, Expand)
 {
     testONNXModels("expand");
+}
+
+TEST_P(Test_ONNX_layers, ExpandIdentity) {
     testONNXModels("expand_identity");
+}
+
+TEST_P(Test_ONNX_layers, ExpandBatch) {
     testONNXModels("expand_batch");
+}
+
+TEST_P(Test_ONNX_layers, ExpandChannels) {
     testONNXModels("expand_channels");
+}
+
+TEST_P(Test_ONNX_layers, ExpandNegBatch) {
     testONNXModels("expand_neg_batch");
 }
 
@@ -1036,10 +1085,12 @@ TEST_P(Test_ONNX_layers, ResizeUnfused)
 
 TEST_P(Test_ONNX_layers, ResizeUnfusedTwoInputs)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
     testONNXModels("upsample_unfused_two_inputs_opset9_torch1.4", npy, 0, 0, false, true, 2);
     testONNXModels("upsample_unfused_two_inputs_opset11_torch1.4", npy, 0, 0, false, true, 2);
 }
@@ -1143,10 +1194,12 @@ TEST_P(Test_ONNX_layers, ReduceL2)
 
 TEST_P(Test_ONNX_layers, Split)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
     testONNXModels("split_0");
     testONNXModels("split_1");
     testONNXModels("split_2");
@@ -1222,10 +1275,12 @@ TEST_P(Test_ONNX_layers, Softmax)
 
 TEST_P(Test_ONNX_layers, Split_EltwiseMax)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
     testONNXModels("split_max");
 }
 
@@ -1410,6 +1465,69 @@ TEST_P(Test_ONNX_layers, LSTM_layout_batch)
     testONNXModels("lstm_layout_1", npy, 0.005, 0.005, false, false, 3);
 }
 
+TEST_P(Test_ONNX_layers, DISABLED_Einsum_1D)
+{
+    testONNXModels("einsum_1d", npy, 0, 0, false, false, 2);
+}
+
+TEST_P(Test_ONNX_layers, Einsum_2D)
+{
+    testONNXModels("einsum_2d", npy, 0, 0, false, false, 2);
+}
+
+TEST_P(Test_ONNX_layers, Einsum_2D_Ellipses)
+{
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+    testONNXModels("einsum_2d_ellipses", npy, 0, 0, false, false, 2);
+}
+
+TEST_P(Test_ONNX_layers, Einsum_3D)
+{
+    testONNXModels("einsum_3d", npy, 0, 0, false, false, 2);
+}
+
+TEST_P(Test_ONNX_layers, Einsum_4D)
+{
+    testONNXModels("einsum_4d", npy, 0, 0, false, false, 2);
+}
+
+TEST_P(Test_ONNX_layers, Einsum_5D)
+{
+    testONNXModels("einsum_5d", npy, 0, 0, false, false, 2);
+}
+
+TEST_P(Test_ONNX_layers, DISABLED_Einsum_InnerProduct)
+{
+    testONNXModels("einsum_inner", npy, 0, 0, false, false, 2);
+}
+
+TEST_P(Test_ONNX_layers, DISABLED_Einsum_HadamardProduct)
+{
+    testONNXModels("einsum_hadamard", npy, 0, 0, false, false, 2);
+}
+
+TEST_P(Test_ONNX_layers, Einsum_Batch_Diagonal)
+{
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+    testONNXModels("einsum_batch_diagonal", npy, 0, 0, false, false, 1);
+}
+
+TEST_P(Test_ONNX_layers, Einsum_Sum)
+{
+    testONNXModels("einsum_sum", npy, 0, 0, false, false, 1);
+}
+
+TEST_P(Test_ONNX_layers, Einsum_transpose)
+{
+    testONNXModels("einsum_transpose", npy, 0, 0, false, false, 1);
+}
+
+TEST_P(Test_ONNX_layers, Einsum_const_inputs) {
+    testONNXModels("einsum_const_inputs", npy, 0, 0, false, false, 1);
+}
+
 TEST_P(Test_ONNX_layers, Pad2d_Unfused)
 {
     testONNXModels("ReflectionPad2d");
@@ -1835,7 +1953,9 @@ TEST_P(Test_ONNX_layers, ConvResizePool1d)
 #endif
     }
 #endif
-    testONNXModels("conv_resize_pool_1d");
+
+    const double lInf = (target == DNN_TARGET_CPU_FP16) ? 0.024 : default_lInf;
+    testONNXModels("conv_resize_pool_1d", npy, default_l1, lInf);
 }
 
 TEST_P(Test_ONNX_layers, DepthWiseAdd)
@@ -1981,12 +2101,16 @@ TEST_P(Test_ONNX_layers, Quantized_Unsqueeze)
 TEST_P(Test_ONNX_layers, Quantized_Resize)
 {
     testONNXModels("quantized_resize_nearest");
-    testONNXModels("quantized_resize_bilinear", npy, 2e-4, 0.003);
-    testONNXModels("quantized_resize_bilinear_align", npy, 3e-4, 0.003);
+    double l1 = backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.0013 : 2e-4;
+    testONNXModels("quantized_resize_bilinear", npy, l1, 0.003);
+    l1 = backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ? 0.0013 : 3e-4;
+    testONNXModels("quantized_resize_bilinear_align", npy, l1, 0.003);
 }
 
 TEST_P(Test_ONNX_layers, Quantized_Concat)
 {
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
     testONNXModels("quantized_concat");
     testONNXModels("quantized_concat_const_blob");
 }
@@ -2003,6 +2127,8 @@ TEST_P(Test_ONNX_layers, OutputRegistration)
 
 TEST_P(Test_ONNX_layers, QLinearSoftmax)
 {
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
     testONNXModels("qlinearsoftmax_v11", npy, 0.002, 0.002); // 2D coerced
     testONNXModels("qlinearsoftmax_v13", npy, 0.002, 0.002);
 }
@@ -2030,6 +2156,7 @@ TEST_P(Test_ONNX_nets, Alexnet)
 
     net.setPreferableBackend(backend);
     net.setPreferableTarget(target);
+    net.enableWinograd(false);
 
     Mat inp = imread(_tf("../grace_hopper_227.png"));
     Mat ref = blobFromNPY(_tf("../caffe_alexnet_prob.npy"));
@@ -2043,6 +2170,34 @@ TEST_P(Test_ONNX_nets, Alexnet)
     expectNoFallbacksFromIE(net);
 }
 
+TEST_P(Test_ONNX_nets, RAFT)
+{
+    applyTestTag(CV_TEST_TAG_LONG, CV_TEST_TAG_DEBUG_VERYLONG, CV_TEST_TAG_MEMORY_2GB);
+
+    std::string weight_path = _tf("models/optical_flow_estimation_raft_2023aug.onnx", false);
+    std::string img0_path = findDataFile(std::string("gpu/opticalflow/frame0.png"));
+    std::string img1_path = findDataFile(std::string("gpu/opticalflow/frame1.png"));
+
+    Size target_size{480, 360};
+    auto img0 = imread(img0_path);
+    auto img1 = imread(img1_path);
+    auto blob0 = blobFromImage(img0, 1.0, target_size, 0, true);
+    auto blob1 = blobFromImage(img1, 1.0, target_size, 0, true);
+
+    auto net = readNet(weight_path);
+    net.setInput(blob0, "0");
+    net.setInput(blob1, "1");
+    std::vector<std::string> outnames{"12007", "12006"};
+    std::vector<Mat> outs;
+    net.forward(outs, outnames);
+
+    // output 12006 is not checked to save space in opencv_extra since its ref is > 1MB,
+    // and output 12006 is calculated from 12007 so checking 12007 is sufficient.
+    std::string ref_12700_path = _tf("data/output_optical_flow_estimation_raft_2023aug.npy");
+    auto ref0 = blobFromNPY(ref_12700_path);
+    normAssert(ref0, outs[0], "", 1e-5, 1.8e-4);
+}
+
 TEST_P(Test_ONNX_nets, Squeezenet)
 {
     testONNXModels("squeezenet", pb);
@@ -2074,6 +2229,9 @@ TEST_P(Test_ONNX_nets, Googlenet)
     net.setPreferableBackend(backend);
     net.setPreferableTarget(target);
 
+    if (target == DNN_TARGET_CPU_FP16)
+        net.enableWinograd(false);
+
     std::vector<Mat> images;
     images.push_back( imread(_tf("../googlenet_0.png")) );
     images.push_back( imread(_tf("../googlenet_1.png")) );
@@ -2150,7 +2308,13 @@ TEST_P(Test_ONNX_nets, ResNet50v1)
     applyTestTag(CV_TEST_TAG_MEMORY_512MB);
 
     // output range: [-67; 75], after Softmax [0, 0.98]
+    size_t hwm0 = getTopMemoryUsageMB();
     testONNXModels("resnet50v1", pb, default_l1, default_lInf, true, target != DNN_TARGET_MYRIAD);
+    size_t hwm1 = getTopMemoryUsageMB();
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU)
+    {
+        EXPECT_LE(hwm1 - hwm0, 350) << "Top allocated memory";
+    }
 }
 
 TEST_P(Test_ONNX_nets, ResNet50_Int8)
@@ -2218,7 +2382,7 @@ TEST_P(Test_ONNX_nets, TinyYolov2)
     }
 #endif
 
-    testONNXModels("tiny_yolo2", pb, l1, lInf);
+    testONNXModels("tiny_yolo2", pb, l1, lInf, false, true, 1, true, false);
 }
 
 TEST_P(Test_ONNX_nets, CNN_MNIST)
@@ -2246,7 +2410,7 @@ TEST_P(Test_ONNX_nets, LResNet100E_IR)
 #else
         (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB),
 #endif
-        CV_TEST_TAG_DEBUG_LONG
+        CV_TEST_TAG_DEBUG_VERYLONG
     );
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
     {
@@ -2263,6 +2427,7 @@ TEST_P(Test_ONNX_nets, LResNet100E_IR)
 
     double l1 = default_l1, lInf = default_lInf;
     // output range: [-3; 3]
+    bool useWinograd = true;
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
     {
         l1 = 0.009;
@@ -2278,7 +2443,14 @@ TEST_P(Test_ONNX_nets, LResNet100E_IR)
         l1 = 0.009;
         lInf = 0.04;
     }
-    testONNXModels("LResNet100E_IR", pb, l1, lInf);
+    else if (target == DNN_TARGET_CPU_FP16)
+    {
+        useWinograd = false;
+        l1 = 0.009;
+        lInf = 0.035;
+    }
+
+    testONNXModels("LResNet100E_IR", pb, l1, lInf, false, true, 1, true, useWinograd);
 }
 
 TEST_P(Test_ONNX_nets, Emotion_ferplus)
@@ -2293,7 +2465,7 @@ TEST_P(Test_ONNX_nets, Emotion_ferplus)
 
     double l1 = default_l1;
     double lInf = default_lInf;
-
+    bool useWinograd = true;
     // Output values are in range [-2.011, 2.111]
     if ((backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) || (target == DNN_TARGET_CUDA_FP16))
         l1 = 0.007;
@@ -2306,6 +2478,11 @@ TEST_P(Test_ONNX_nets, Emotion_ferplus)
         l1 = 2.4e-4;
         lInf = 6e-4;
     }
+    else if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+    {
+        useWinograd = false;
+        l1 = 0.007;
+    }
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2020040000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
     {
@@ -2313,7 +2490,7 @@ TEST_P(Test_ONNX_nets, Emotion_ferplus)
     }
 #endif
 
-    testONNXModels("emotion_ferplus", pb, l1, lInf);
+    testONNXModels("emotion_ferplus", pb, l1, lInf, false, true, 1, true, useWinograd);
 }
 
 TEST_P(Test_ONNX_nets, Inception_v2)
@@ -2457,30 +2634,16 @@ TEST_P(Test_ONNX_layers, CumSum)
     testONNXModels("cumsum_3d_dim_2");
 }
 
-// This test is mainly to test:
-//  1. identity node with constant input
-//  2. limited support to range operator (all inputs are constant)
-//  3. parseExpand with multiple broadcast axes
-//  4. 1D mat dimension issue with the output of range operator
-TEST_P(Test_ONNX_layers, YOLOv7)
+static void testYOLO(const std::string& weightPath, const std::vector<int>& refClassIds,
+                     const std::vector<float>& refScores, const std::vector<Rect2d>& refBoxes,
+                     Image2BlobParams imgParams, float conf_threshold = 0.3, float iou_threshold = 0.5,
+                     double scores_diff = 1e-5, double boxes_iou_diff = 1e-4, const std::string test_name = "")
 {
-    std::string weightPath = _tf("models/yolov7_not_simplified.onnx", false);
     std::string imgPath = _tf("../dog_orig_size.png");
 
-    Size targetSize{640, 640};
-    float conf_threshold = 0.3;
-    float iou_threshold = 0.5;
-
-    // Reference, which is collected with input size of 640x640
-    std::vector<int> refClassIds{1, 16, 7};
-    std::vector<float> refScores{0.9614331f, 0.9589417f, 0.8679074f};
-    // [x1, y1, x2, y2] x 3
-    std::vector<Rect2d> refBoxes{Rect2d(105.973236f, 150.16716f,  472.59012f, 466.48834f),
-                                  Rect2d(109.97953f,  246.17862f, 259.83676f, 600.76624f),
-                                  Rect2d(385.96185f, 83.02809f,  576.07355f,  189.82793f)};
-
     Mat img = imread(imgPath);
-    Mat inp = blobFromImage(img, 1/255.0, targetSize, Scalar(0, 0, 0), true, false);
+
+    Mat inp = blobFromImageWithParams(img, imgParams);
 
     Net net = readNet(weightPath);
 
@@ -2488,92 +2651,310 @@ TEST_P(Test_ONNX_layers, YOLOv7)
     std::vector<Mat> outs;
     net.forward(outs, net.getUnconnectedOutLayersNames());
 
-    Mat preds = outs[3].reshape(1, outs[3].size[1]); // [1, 25200, 85]
+    // Retrieve
+    std::vector<int> keep_classIds;
+    std::vector<float> keep_confidences;
+    std::vector<Rect2d> keep_boxes;
+    yoloPostProcessing(outs, keep_classIds, keep_confidences, keep_boxes, conf_threshold, iou_threshold, test_name);
+
+    normAssertDetections(
+        refClassIds, refScores, refBoxes,
+        keep_classIds, keep_confidences, keep_boxes,
+        "", 0.0, scores_diff, boxes_iou_diff);
+}
+
+void yoloPostProcessing(
+    std::vector<Mat>& outs,
+    std::vector<int>& keep_classIds,
+    std::vector<float>& keep_confidences,
+    std::vector<Rect2d>& keep_boxes,
+    float conf_threshold,
+    float iou_threshold,
+    const std::string& test_name
+){
 
     // Retrieve
     std::vector<int> classIds;
     std::vector<float> confidences;
     std::vector<Rect2d> boxes;
-    // each row is [cx, cy, w, h, conf_obj, conf_class1, ..., conf_class80]
-    for (int i = 0; i < preds.rows; ++i)
-    {
-        // filter out non objects
-        float obj_conf = preds.row(i).at<float>(4);
-        if (obj_conf < conf_threshold)
-            continue;
-
-        // get class id and conf
-        Mat scores = preds.row(i).colRange(5, preds.cols);
-        double conf;
-        Point maxLoc;
-        minMaxLoc(scores, 0, &conf, 0, &maxLoc);
-        conf *= obj_conf;
-        if (conf < conf_threshold)
-            continue;
-
-        // get bbox coords
-        float* det = preds.ptr<float>(i);
-        double cx = det[0];
-        double cy = det[1];
-        double w = det[2];
-        double h = det[3];
-        // [x1, y1, x2, y2]
-        boxes.push_back(Rect2d(cx - 0.5 * w, cy - 0.5 * h,
-                                cx + 0.5 * w, cy + 0.5 * h));
-        classIds.push_back(maxLoc.x);
-        confidences.push_back(conf);
+
+    if (test_name == "yolov8"){
+        cv::transposeND(outs[0], {0, 2, 1}, outs[0]);
+    }
+
+    if (test_name == "yolonas"){
+        // outs contains 2 elemets of shape [1, 8400, 80] and [1, 8400, 4]. Concat them to get [1, 8400, 84]
+        Mat concat_out;
+        // squeeze the first dimension
+        outs[0] = outs[0].reshape(1, outs[0].size[1]);
+        outs[1] = outs[1].reshape(1, outs[1].size[1]);
+        cv::hconcat(outs[1], outs[0], concat_out);
+        outs[0] = concat_out;
+        // remove the second element
+        outs.pop_back();
+        // unsqueeze the first dimension
+        outs[0] = outs[0].reshape(0, std::vector<int>{1, 8400, 84});
+    }
+
+    for (auto preds : outs){
+
+        preds = preds.reshape(1, preds.size[1]); // [1, 8400, 85] -> [8400, 85]
+        for (int i = 0; i < preds.rows; ++i)
+        {
+            // filter out non object
+            float obj_conf = (test_name == "yolov8" || test_name == "yolonas") ? 1.0f : preds.at<float>(i, 4) ;
+            if (obj_conf < conf_threshold)
+                continue;
+
+            Mat scores = preds.row(i).colRange((test_name == "yolov8" || test_name == "yolonas") ? 4 : 5, preds.cols);
+            double conf;
+            Point maxLoc;
+            minMaxLoc(scores, 0, &conf, 0, &maxLoc);
+
+            conf = (test_name == "yolov8" || test_name == "yolonas") ? conf : conf * obj_conf;
+            if (conf < conf_threshold)
+                continue;
+
+            // get bbox coords
+            float* det = preds.ptr<float>(i);
+            double cx = det[0];
+            double cy = det[1];
+            double w = det[2];
+            double h = det[3];
+
+            // std::cout << "cx: " << cx << " cy: " << cy << " w: " << w << " h: " << h << " conf: " << conf << " idx: " << maxLoc.x << std::endl;
+            // [x1, y1, x2, y2]
+            if (test_name == "yolonas"){
+                boxes.push_back(Rect2d(cx, cy, w, h));
+            } else {
+                boxes.push_back(Rect2d(cx - 0.5 * w, cy - 0.5 * h,
+                                        cx + 0.5 * w, cy + 0.5 * h));
+            }
+           classIds.push_back(maxLoc.x);
+            confidences.push_back(conf);
+        }
     }
 
     // NMS
     std::vector<int> keep_idx;
     NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, keep_idx);
 
-    std::vector<int> keep_classIds;
-    std::vector<float> keep_confidences;
-    std::vector<Rect2d> keep_boxes;
     for (auto i : keep_idx)
     {
         keep_classIds.push_back(classIds[i]);
         keep_confidences.push_back(confidences[i]);
         keep_boxes.push_back(boxes[i]);
     }
+}
+
+
+TEST_P(Test_ONNX_nets, YOLOX)
+{
+    applyTestTag(CV_TEST_TAG_DEBUG_VERYLONG);
+
+    std::string weightPath = _tf("models/yolox_s_inf_decoder.onnx", false);
+
+    Size targetSize{640, 640};
+    float conf_threshold = 0.50;
+    float iou_threshold = 0.50;
 
-    normAssertDetections(refClassIds, refScores, refBoxes, keep_classIds, keep_confidences, keep_boxes);
+    std::vector<int> refClassIds{1, 16, 7};
+    std::vector<float> refScores{0.9649f, 0.9163f, 0.6879f};
+
+    std::vector<Rect2d> refBoxes{
+        Rect2d(105.5384, 179.4100, 470.6339, 428.5553),
+        Rect2d(111.4482, 263.4098, 258.7438, 526.1140),
+        Rect2d(389.1421, 143.9286, 577.9495, 222.0294)
+        };
+
+    Image2BlobParams imgParams(
+        Scalar::all(1),
+        targetSize,
+        Scalar::all(0),
+        true,
+        CV_32F,
+        DNN_LAYOUT_NCHW,
+        DNN_PMODE_LETTERBOX,
+        Scalar::all(114)
+        );
+
+    testYOLO(
+        weightPath, refClassIds, refScores, refBoxes,
+        imgParams, conf_threshold, iou_threshold,
+        1.0e-4, 1.0e-4);
 }
 
-TEST_P(Test_ONNX_layers, Tile)
+TEST_P(Test_ONNX_nets, YOLONas)
 {
-    testONNXModels("tile", pb);
+    // model information: https://dl.opencv.org/models/yolo-nas/Readme.md
+    std::string weightPath = _tf("models/yolo_nas_s.onnx", false);
+
+    Size targetSize{640, 640};
+    float conf_threshold = 0.50;
+    float iou_threshold = 0.50;
+
+    std::vector<int> refClassIds{1, 16, 7};
+    std::vector<float> refScores{0.9720f, 0.9283f, 0.8990f};
+    // [x1, y1, x2, y2]
+    std::vector<Rect2d> refBoxes{
+        Rect2d(105.516, 173.696, 471.323, 430.433),
+        Rect2d(109.241, 263.406, 259.872, 531.858),
+        Rect2d(390.153, 142.492, 574.932, 222.709)
+        };
+
+    Image2BlobParams imgParams(
+        Scalar::all(1/255.0),
+        targetSize,
+        Scalar::all(0),
+        false,
+        CV_32F,
+        DNN_LAYOUT_NCHW,
+        DNN_PMODE_LETTERBOX,
+        Scalar::all(114)
+        );
+
+    testYOLO(
+        weightPath, refClassIds, refScores, refBoxes,
+        imgParams, conf_threshold, iou_threshold,
+        1.0e-4, 1.0e-4, "yolonas");
 }
 
-TEST_P(Test_ONNX_layers, LayerNorm)
+TEST_P(Test_ONNX_nets, YOLOv8)
 {
-    testONNXModels("test_layer_normalization_2d_axis0", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_2d_axis1", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_2d_axis_negative_1", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_2d_axis_negative_2", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_3d_axis0_epsilon", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_3d_axis1_epsilon", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_3d_axis2_epsilon", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_3d_axis_negative_1_epsilon", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_3d_axis_negative_2_epsilon", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_3d_axis_negative_3_epsilon", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_4d_axis0", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_4d_axis1", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_4d_axis2", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_4d_axis3", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_4d_axis_negative_1", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_4d_axis_negative_2", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_4d_axis_negative_3", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_4d_axis_negative_4", pb, 0, 0, false, true, 3);
-    testONNXModels("test_layer_normalization_default_axis", pb, 0, 0, false, true, 3);
+    std::string weightPath = _tf("models/yolov8n.onnx", false);
+
+    Size targetSize{640, 640};
+    float conf_threshold = 0.25;
+    float iou_threshold = 0.50;
+
+    std::vector<int> refClassIds{16, 1, 2};
+    std::vector<float> refScores{0.9332f, 0.8959f, 0.6157f};
+    // [x1, y1, x2, y2]
+    std::vector<Rect2d> refBoxes{
+        Rect2d(108.8965, 261.9094, 257.1633, 530.3049),
+        Rect2d(110.4020, 192.9843, 473.4418, 429.5965),
+        Rect2d(389.1603, 143.2506, 577.3542, 223.0615),
+        };
+
+    Image2BlobParams imgParams(
+        Scalar::all(1/255.0),
+        targetSize,
+        Scalar::all(0),
+        true,
+        CV_32F,
+        DNN_LAYOUT_NCHW,
+        DNN_PMODE_LETTERBOX,
+        Scalar::all(114)
+        );
+
+    testYOLO(
+        weightPath, refClassIds, refScores, refBoxes,
+        imgParams, conf_threshold, iou_threshold,
+        1.0e-4, 1.0e-4, "yolov8");
+}
+
+// This test is mainly to test:
+//  1. identity node with constant input
+//  2. limited support to range operator (all inputs are constant)
+//  3. parseExpand with multiple broadcast axes
+//  4. 1D mat dimension issue with the output of range operator
+TEST_P(Test_ONNX_nets, YOLOv7)
+{
+    applyTestTag(
+        CV_TEST_TAG_MEMORY_2GB,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
+
+    std::string weightPath = _tf("models/yolov7.onnx", false);
+    // Reference, which is collected with input size of 640x640
+    std::vector<int> refClassIds{1, 16, 7};
+    std::vector<float> refScores{0.9614331f, 0.9589417f, 0.8679074f};
+    // [x1, y1, x2, y2] x 3
+    std::vector<Rect2d> refBoxes{Rect2d(105.973236f, 150.16716f,  472.59012f, 466.48834f),
+                                 Rect2d(109.97953f,  246.17862f, 259.83676f, 600.76624f),
+                                 Rect2d(385.96185f, 83.02809f,  576.07355f,  189.82793f)};
+
+    Size targetSize{640, 640};
+
+    Image2BlobParams imgParams(
+        Scalar::all(1/255.0),
+        targetSize,
+        Scalar::all(0),
+        true,
+        CV_32F,
+        DNN_LAYOUT_NCHW,
+        DNN_PMODE_NULL,
+        Scalar::all(0)
+        );
+
+    testYOLO(weightPath, refClassIds, refScores, refBoxes, imgParams);
+}
+
+TEST_P(Test_ONNX_nets, YOLOv6)
+{
+    std::string weightPath = _tf("models/yolov6n.onnx", false);
+
+    Size targetSize{640, 640};
+    float conf_threshold = 0.30;
+    float iou_threshold = 0.50;
+
+    std::vector<int> refClassIds{1, 16, 7, 1};
+    std::vector<float> refScores{0.95031f, 0.87123f,  0.65453f, 0.34142f};
+    // [x1, y1, x2, y2] x 3
+    std::vector<Rect2d> refBoxes{Rect2d(98.84, 177.91, 473.29, 431.19),
+                                 Rect2d(109.80, 265.50, 258.86, 531.97),
+                                 Rect2d(387.79, 141.61, 576.98, 223.52),
+                                 Rect2d(105.62, 199.24, 218.37, 389.84),
+                                 };
+
+    Image2BlobParams imgParams(
+        Scalar::all(1/255.0),
+        targetSize,
+        Scalar::all(0),
+        true,
+        CV_32F,
+        DNN_LAYOUT_NCHW,
+        DNN_PMODE_LETTERBOX,
+        Scalar::all(114)
+        );
+
+    testYOLO(
+        weightPath, refClassIds, refScores, refBoxes,
+        imgParams, conf_threshold, iou_threshold,
+        1.0e-4, 1.0e-3);
+}
+
+TEST_P(Test_ONNX_nets, YOLOv5n)
+{
+    std::string weightPath = findDataFile("dnn/yolov5n.onnx", false);
+    // Reference, which is collected with input size of 640x640
+    std::vector<int> refClassIds{16, 2, 1};
+    std::vector<float> refScores{0.749053f, 0.616853f, 0.32506f};
+    // [x1, y1, x2, y2] x 4
+
+    std::vector<Rect2d> refBoxes{Rect2d(108.088f, 239.293f, 266.196f, 607.658f),
+                                 Rect2d(392.028f, 89.9233f, 579.152f, 190.447f),
+                                 Rect2d(120.278f, 159.76, 214.481f, 241.473f)};
+
+    Size targetSize{640, 640};
+
+    Image2BlobParams imgParams(
+        Scalar::all(1/255.0),
+        targetSize,
+        Scalar::all(0),
+        true,
+        CV_32F,
+        DNN_LAYOUT_NCHW,
+        DNN_PMODE_NULL,
+        Scalar::all(0)
+        );
+
+    testYOLO(weightPath, refClassIds, refScores, refBoxes, imgParams);
 }
 
-// for testing graph simplification
-TEST_P(Test_ONNX_layers, LayerNormExpanded)
+TEST_P(Test_ONNX_layers, Tile)
 {
-    testONNXModels("layer_norm_expanded");
-    testONNXModels("layer_norm_expanded_with_initializers");
+    testONNXModels("tile", pb);
 }
 
 TEST_P(Test_ONNX_layers, Gelu)
@@ -2592,6 +2973,139 @@ TEST_P(Test_ONNX_layers, where_node)
     testONNXModels("where_layer");
 }
 
+TEST_P(Test_ONNX_layers, Gemm_all_attributes) {
+    testONNXModels("test_gemm_all_attributes", pb, 0, 0, false, true, 2);
+}
+TEST_P(Test_ONNX_layers, Gemm_alpha) {
+    testONNXModels("test_gemm_alpha", pb, 0, 0, false, true, 2);
+}
+TEST_P(Test_ONNX_layers, Gemm_beta) {
+    testONNXModels("test_gemm_beta", pb, 0, 0, false, true, 2);
+}
+TEST_P(Test_ONNX_layers, Gemm_default_matrix_bias) {
+    testONNXModels("test_gemm_default_matrix_bias", pb, 0, 0, false, true, 2);
+}
+TEST_P(Test_ONNX_layers, Gemm_default_no_bias) {
+    testONNXModels("test_gemm_default_no_bias", pb, 0, 0, false, true, 2);
+}
+TEST_P(Test_ONNX_layers, Gemm_default_scalar_bias) {
+    testONNXModels("test_gemm_default_scalar_bias", pb, 0, 0, false, true, 2);
+}
+TEST_P(Test_ONNX_layers, Gemm_default_single_elem_vector_bias) {
+    testONNXModels("test_gemm_default_single_elem_vector_bias", pb, 0, 0, false, true, 2);
+}
+TEST_P(Test_ONNX_layers, Gemm_default_vector_bias) {
+    testONNXModels("test_gemm_default_vector_bias", pb, 0, 0, false, true, 2);
+}
+TEST_P(Test_ONNX_layers, Gemm_default_zero_bias) {
+    testONNXModels("test_gemm_default_zero_bias", pb, 0, 0, false, true, 2);
+}
+TEST_P(Test_ONNX_layers, Gemm_transposeA) {
+    testONNXModels("test_gemm_transposeA", pb, 0, 0, false, true, 2);
+}
+TEST_P(Test_ONNX_layers, Gemm_transposeB) {
+    testONNXModels("test_gemm_transposeB", pb, 0, 0, false, true, 2);
+}
+
+// Note: These tests are converted from onnx/onnx so that they have constant shape as input.
+// TODO: They can be moved into conformance tests once dynamic input is properly supported.
+TEST_P(Test_ONNX_layers, Expand_dim_changed) {
+    testONNXModels("test_expand_dim_changed", pb, 0, 0, false, true, 1);
+}
+TEST_P(Test_ONNX_layers, Expand_dim_unchanged) {
+    testONNXModels("test_expand_dim_unchanged", pb, 0, 0, false, true, 1);
+}
+TEST_P(Test_ONNX_layers, Expand_shape_model1) {
+    testONNXModels("test_expand_shape_model1", pb, 0, 0, false, true, 1);
+}
+TEST_P(Test_ONNX_layers, Expand_shape_model2) {
+    testONNXModels("test_expand_shape_model2", pb, 0, 0, false, true, 1);
+}
+TEST_P(Test_ONNX_layers, Expand_shape_model3) {
+    testONNXModels("test_expand_shape_model3", pb, 0, 0, false, true, 1);
+}
+TEST_P(Test_ONNX_layers, Expand_shape_model4) {
+    testONNXModels("test_expand_shape_model4", pb, 0, 0, false, true, 1);
+}
+
+TEST_P(Test_ONNX_layers, Attention) {
+    testONNXModels("attention");
+}
+TEST_P(Test_ONNX_layers, AttentionSingleHead) {
+    testONNXModels("attention_single_head");
+}
+
+TEST_P(Test_ONNX_nets, ViT_B_32) {
+    applyTestTag(CV_TEST_TAG_LONG, CV_TEST_TAG_DEBUG_LONG);
+
+    const std::string model_path = _tf("models/vit_b_32.onnx", false);
+
+    auto net = readNet(model_path);
+    ASSERT_FALSE(net.empty());
+
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);
+
+    auto image = imread(_tf("../googlenet_0.png"));
+    auto blob = blobFromImage(image, 1.f, Size(224, 224));
+    auto ref = blobFromNPY(_tf("data/output_vit_b_32.npy"));
+    checkBackend(&blob, &ref);
+
+    net.setInput(blob);
+    auto out = net.forward();
+
+    double l1 = default_l1;
+    double lInf = default_lInf;
+    if (target == DNN_TARGET_CUDA_FP16)
+    {
+        l1 = 0.01;
+        lInf = 0.06;
+    }
+    if (target == DNN_TARGET_OPENCL_FP16)
+    {
+        l1 = 0.008;
+        lInf = 0.04;
+    }
+
+    normAssert(ref, out, "ViTB_32", l1, lInf);
+}
+
+TEST_P(Test_ONNX_nets, VitTrack) {
+    auto image = imread(_tf("../dog_orig_size.png"));
+    auto input0 = blobFromImage(image, 1.f, Size(128, 128));
+    auto input1 = blobFromImage(image, 1.f, Size(256, 256));
+
+    auto net = readNet(_tf("models/object_tracking_vittrack_2023sep.onnx", false));
+    net.setInput(input0, "template");
+    net.setInput(input1, "search");
+
+    std::vector<std::string> output_names{"output1", "output2", "output3"};
+    std::vector<Mat> outputs;
+    net.forward(outputs, output_names);
+
+    auto ref_output1 = blobFromNPY(_tf("data/output_object_tracking_vittrack_2023sep_0.npy"));
+    auto ref_output2 = blobFromNPY(_tf("data/output_object_tracking_vittrack_2023sep_1.npy"));
+    auto ref_output3 = blobFromNPY(_tf("data/output_object_tracking_vittrack_2023sep_2.npy"));
+
+    normAssert(ref_output1, outputs[0], "VitTrack output1");
+    normAssert(ref_output2, outputs[1], "VitTrack output2");
+    normAssert(ref_output3, outputs[2], "VitTrack output3");
+}
+
+TEST_P(Test_ONNX_layers, LayerNormNoFusion) {
+    testONNXModels("layer_norm_no_fusion");
+}
+
+TEST_P(Test_ONNX_layers, MatMulAddFusion) {
+    double l1 = (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) ? 0.0018 : default_l1;
+    double lInf = (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) ? 0.011 : default_lInf;
+    testONNXModels("biased_matmul", npy, l1, lInf);
+}
+
+TEST_P(Test_ONNX_layers, ClipDivSharedConstant) {
+    testONNXModels("clip_div_shared_constant");
+}
+
 INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_nets, dnnBackendsAndTargets());
 
 }} // namespace
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index b795076f55a8..964fcbbbe8b5 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -619,10 +619,12 @@ TEST_P(Test_TensorFlow_layers, pooling_reduce_sum_1_2_true)
 
 TEST_P(Test_TensorFlow_layers, max_pool_grad)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
     runTensorFlowNet("max_pool_grad");
 }
 
@@ -972,6 +974,9 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
     net.setPreferableBackend(backend);
     net.setPreferableTarget(target);
 
+    if (target == DNN_TARGET_CPU_FP16)
+        net.enableWinograd(false);
+
     net.setInput(blob);
     // Output has shape 1x1xNx7 where N - number of detections.
     // An every detection is a vector of values [id, classId, confidence, left, top, right, bottom]
@@ -1277,7 +1282,7 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection)
 {
     applyTestTag(
         (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB),
-        CV_TEST_TAG_DEBUG_LONG
+        CV_TEST_TAG_DEBUG_VERYLONG
     );
 
 #if defined(INF_ENGINE_RELEASE)
@@ -1305,6 +1310,8 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection)
 
     net.setPreferableBackend(backend);
     net.setPreferableTarget(target);
+    if (target == DNN_TARGET_CPU_FP16)
+        net.enableWinograd(false);
 
     Mat img = imread(imgPath);
     Mat inp = blobFromImage(img, 1.0, Size(), Scalar(123.68, 116.78, 103.94), true, false);
@@ -1339,8 +1346,9 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection)
     }
     else if (target == DNN_TARGET_CPU_FP16)
     {
-        lInf_scores = 0.1;
-        l1_geometry = 0.28; lInf_geometry = 5.94;
+        lInf_scores = 0.17;
+        l1_geometry = 0.28;
+        lInf_geometry = 5.94;
     }
     else
     {
@@ -1496,17 +1504,21 @@ TEST_P(Test_TensorFlow_layers, split)
 
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
     runTensorFlowNet("split");
 }
 
 TEST_P(Test_TensorFlow_layers, split_equals)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
     runTensorFlowNet("split_equals");
 }
 
@@ -1581,7 +1593,7 @@ TEST_P(Test_TensorFlow_layers, relu6)
 
 TEST_P(Test_TensorFlow_layers, subpixel)
 {
-#if defined(INF_ENGINE_RELEASE)
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
@@ -1621,8 +1633,10 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_align_corners)
 // TF case: align_corners=False, half_pixel_centers=True
 TEST_P(Test_TensorFlow_layers, resize_bilinear_half_pixel)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
 
     runTensorFlowNet("resize_bilinear", false, 0.0, 0.0, false, "_half_pixel");
 }
@@ -1636,8 +1650,10 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_factor)
 // TF case: align_corners=False, half_pixel_centers=True
 TEST_P(Test_TensorFlow_layers, resize_bilinear_factor_half_pixel)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
 
     runTensorFlowNet("resize_bilinear_factor", false, 0.0, 0.0, false, "_half_pixel");
 }
@@ -1675,6 +1691,7 @@ TEST_P(Test_TensorFlow_layers, clip_by_value)
 
 TEST_P(Test_TensorFlow_layers, tf2_prelu)
 {
+    double l1 = 0, lInf = 0;
     if (backend == DNN_BACKEND_CUDA)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA); // not supported; only across channels is supported
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
@@ -1686,6 +1703,11 @@ TEST_P(Test_TensorFlow_layers, tf2_prelu)
         applyTestTag(target == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16,
             CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION
         );
+#elif defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2023000000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) {
+        l1 = 1e-4;
+        lInf = 1e-3;
+    }
 #elif defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021040000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
     {
@@ -1705,7 +1727,7 @@ TEST_P(Test_TensorFlow_layers, tf2_prelu)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
 #endif
 
-    runTensorFlowNet("tf2_prelu");
+    runTensorFlowNet("tf2_prelu", false, l1, lInf);
 }
 
 TEST_P(Test_TensorFlow_layers, tf2_permute_nhwc_ncwh)
@@ -1782,7 +1804,10 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
     if (target == DNN_TARGET_CUDA_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
 
-    applyTestTag(CV_TEST_TAG_MEMORY_1GB, CV_TEST_TAG_DEBUG_VERYLONG);
+    applyTestTag(
+        CV_TEST_TAG_MEMORY_2GB,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
     Mat img = imread(findDataFile("dnn/street.png"));
     std::string proto = findDataFile("dnn/mask_rcnn_inception_v2_coco_2018_01_28.pbtxt");
     std::string model = findDataFile("dnn/mask_rcnn_inception_v2_coco_2018_01_28.pb", false);
@@ -1794,6 +1819,8 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
 
     net.setPreferableBackend(backend);
     net.setPreferableTarget(target);
+    if (target == DNN_TARGET_CPU_FP16)
+        net.enableWinograd(false);
 
     net.setInput(blob);
 
@@ -1810,6 +1837,11 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
 
     double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.2 : 2e-5;
     double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.018 : default_lInf;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+    {
+        scoreDiff = std::max(scoreDiff, 0.06);
+        iouDiff = std::max(iouDiff, 0.01);
+    }
     normAssertDetections(refDetections, outDetections, "", /*threshold for zero confidence*/1e-5, scoreDiff, iouDiff);
 
     // Output size of masks is NxCxHxW where
diff --git a/modules/dnn/test/test_tflite_importer.cpp b/modules/dnn/test/test_tflite_importer.cpp
index 5a1742ed9739..8d374dc0505d 100644
--- a/modules/dnn/test/test_tflite_importer.cpp
+++ b/modules/dnn/test/test_tflite_importer.cpp
@@ -20,6 +20,14 @@ namespace opencv_test { namespace {
 using namespace cv;
 using namespace cv::dnn;
 
+class Test_TFLite : public DNNTestLayer {
+public:
+    void testModel(Net& net, const std::string& modelName, const Mat& input, double l1 = 0, double lInf = 0);
+    void testModel(const std::string& modelName, const Mat& input, double l1 = 0, double lInf = 0);
+    void testModel(const std::string& modelName, const Size& inpSize, double l1 = 0, double lInf = 0);
+    void testLayer(const std::string& modelName, double l1 = 0, double lInf = 0);
+};
+
 void testInputShapes(const Net& net, const std::vector<Mat>& inps) {
     std::vector<MatShape> inLayerShapes;
     std::vector<MatShape> outLayerShapes;
@@ -31,9 +39,14 @@ void testInputShapes(const Net& net, const std::vector<Mat>& inps) {
     }
 }
 
-void testModel(const std::string& modelName, const Mat& input, double l1 = 1e-5, double lInf = 1e-4)
+void Test_TFLite::testModel(Net& net, const std::string& modelName, const Mat& input, double l1, double lInf)
 {
-    Net net = readNet(findDataFile("dnn/tflite/" + modelName + ".tflite", false));
+    l1 = l1 ? l1 : default_l1;
+    lInf = lInf ? lInf : default_lInf;
+
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);
+
     testInputShapes(net, {input});
     net.setInput(input);
 
@@ -45,41 +58,102 @@ void testModel(const std::string& modelName, const Mat& input, double l1 = 1e-5,
     ASSERT_EQ(outs.size(), outNames.size());
     for (int i = 0; i < outNames.size(); ++i) {
         Mat ref = blobFromNPY(findDataFile(format("dnn/tflite/%s_out_%s.npy", modelName.c_str(), outNames[i].c_str())));
-        normAssert(ref.reshape(1, 1), outs[i].reshape(1, 1), outNames[i].c_str(), l1, lInf);
+        // A workaround solution for the following cases due to inconsistent shape definitions.
+        // The details please see: https://github.com/opencv/opencv/pull/25297#issuecomment-2039081369
+        if (modelName == "face_landmark" || modelName == "selfie_segmentation") {
+            ref = ref.reshape(1, 1);
+            outs[i] = outs[i].reshape(1, 1);
+        }
+        normAssert(ref, outs[i], outNames[i].c_str(), l1, lInf);
     }
 }
 
-void testModel(const std::string& modelName, const Size& inpSize, double l1 = 1e-5, double lInf = 1e-4)
+void Test_TFLite::testModel(const std::string& modelName, const Mat& input, double l1, double lInf)
+{
+    Net net = readNet(findDataFile("dnn/tflite/" + modelName + ".tflite", false));
+    testModel(net, modelName, input, l1, lInf);
+}
+
+void Test_TFLite::testModel(const std::string& modelName, const Size& inpSize, double l1, double lInf)
 {
     Mat input = imread(findDataFile("cv/shared/lena.png"));
     input = blobFromImage(input, 1.0 / 255, inpSize, 0, true);
     testModel(modelName, input, l1, lInf);
 }
 
+void Test_TFLite::testLayer(const std::string& modelName, double l1, double lInf)
+{
+    Mat inp = blobFromNPY(findDataFile("dnn/tflite/" + modelName + "_inp.npy"));
+    Net net = readNet(findDataFile("dnn/tflite/" + modelName + ".tflite"));
+    testModel(net, modelName, inp, l1, lInf);
+}
+
 // https://google.github.io/mediapipe/solutions/face_mesh
-TEST(Test_TFLite, face_landmark)
+TEST_P(Test_TFLite, face_landmark)
 {
-    testModel("face_landmark", Size(192, 192), 2e-5, 2e-4);
+    if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
+    double l1 = 2e-5, lInf = 2e-4;
+    if (target == DNN_TARGET_CPU_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD ||
+        (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL))
+    {
+        l1 = 0.15;
+        lInf = 0.82;
+    }
+    testModel("face_landmark", Size(192, 192), l1, lInf);
 }
 
 // https://google.github.io/mediapipe/solutions/face_detection
-TEST(Test_TFLite, face_detection_short_range)
+TEST_P(Test_TFLite, face_detection_short_range)
 {
-    testModel("face_detection_short_range", Size(128, 128));
+    double l1 = 0, lInf = 2e-4;
+    if (target == DNN_TARGET_CPU_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD ||
+        (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL))
+    {
+        l1 = 0.04;
+        lInf = 0.8;
+    }
+    testModel("face_detection_short_range", Size(128, 128), l1, lInf);
 }
 
 // https://google.github.io/mediapipe/solutions/selfie_segmentation
-TEST(Test_TFLite, selfie_segmentation)
+TEST_P(Test_TFLite, selfie_segmentation)
 {
-    testModel("selfie_segmentation", Size(256, 256));
+    double l1 = 0, lInf = 0;
+    if (target == DNN_TARGET_CPU_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD ||
+        (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL))
+    {
+        l1 = 0.01;
+        lInf = 0.48;
+    }
+    testModel("selfie_segmentation", Size(256, 256), l1, lInf);
 }
 
-TEST(Test_TFLite, max_unpooling)
+TEST_P(Test_TFLite, max_unpooling)
 {
+    if (backend == DNN_BACKEND_CUDA)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2022010000)
+        if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target != DNN_TARGET_CPU) {
+        if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+        if (target == DNN_TARGET_OPENCL)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+        if (target == DNN_TARGET_MYRIAD)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+    }
+
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+
     // Due Max Unpoling is a numerically unstable operation and small difference between frameworks
     // might lead to positional difference of maximal elements in the tensor, this test checks
     // behavior of Max Unpooling layer only.
     Net net = readNet(findDataFile("dnn/tflite/hair_segmentation.tflite", false));
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);
 
     Mat input = imread(findDataFile("cv/shared/lena.png"));
     cvtColor(input, input, COLOR_BGR2RGBA);
@@ -90,6 +164,7 @@ TEST(Test_TFLite, max_unpooling)
 
     std::vector<std::vector<Mat> > outs;
     net.forward(outs, {"p_re_lu_1", "max_pooling_with_argmax2d", "conv2d_86", "max_unpooling2d_2"});
+
     ASSERT_EQ(outs.size(), 4);
     ASSERT_EQ(outs[0].size(), 1);
     ASSERT_EQ(outs[1].size(), 2);
@@ -105,6 +180,8 @@ TEST(Test_TFLite, max_unpooling)
     ASSERT_EQ(poolOut.size, poolIds.size);
     ASSERT_EQ(poolOut.size, unpoolInp.size);
 
+    ASSERT_EQ(countNonZero(poolInp), poolInp.total());
+
     for (int c = 0; c < 32; ++c) {
         float *poolInpData = poolInp.ptr<float>(0, c);
         float *poolOutData = poolOut.ptr<float>(0, c);
@@ -123,15 +200,23 @@ TEST(Test_TFLite, max_unpooling)
                     }
                 }
                 EXPECT_EQ(poolInpData[maxIdx], poolOutData[y * 64 + x]) << errMsg;
-                EXPECT_EQ(poolIdsData[y * 64 + x], (float)maxIdx) << errMsg;
+                if (backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
+                    EXPECT_EQ(poolIdsData[y * 64 + x], (float)maxIdx) << errMsg;
+                }
                 EXPECT_EQ(unpoolOutData[maxIdx], unpoolInpData[y * 64 + x]) << errMsg;
             }
         }
     }
 }
 
-TEST(Test_TFLite, EfficientDet_int8) {
+TEST_P(Test_TFLite, EfficientDet_int8) {
+    if (target != DNN_TARGET_CPU || (backend != DNN_BACKEND_OPENCV &&
+        backend != DNN_BACKEND_TIMVX && backend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)) {
+        throw SkipTestException("Only OpenCV, TimVX and OpenVINO targets support INT8 on CPU");
+    }
     Net net = readNet(findDataFile("dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", false));
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);
 
     Mat img = imread(findDataFile("dnn/dog416.png"));
     Mat blob = blobFromImage(img, 1.0, Size(320, 320));
@@ -146,6 +231,45 @@ TEST(Test_TFLite, EfficientDet_int8) {
     normAssertDetections(ref, out, "", 0.5, 0.05, 0.1);
 }
 
+TEST_P(Test_TFLite, replicate_by_pack) {
+    double l1 = 0, lInf = 0;
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+    {
+        l1 = 4e-4;
+        lInf = 2e-3;
+    }
+    testLayer("replicate_by_pack", l1, lInf);
+}
+
+TEST_P(Test_TFLite, split) {
+    testLayer("split");
+}
+
+TEST_P(Test_TFLite, fully_connected) {
+    if (backend == DNN_BACKEND_VKCOM)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_VULKAN);
+    testLayer("fully_connected");
+}
+
+TEST_P(Test_TFLite, permute) {
+    testLayer("permutation_3d");
+    // Temporarily disabled as TFLiteConverter produces a incorrect graph in this case
+    //testLayer("permutation_4d_0123");
+    testLayer("permutation_4d_0132");
+    testLayer("permutation_4d_0213");
+    testLayer("permutation_4d_0231");
+}
+
+TEST_P(Test_TFLite, global_average_pooling_2d) {
+    testLayer("global_average_pooling_2d");
+}
+
+TEST_P(Test_TFLite, global_max_pooling_2d) {
+    testLayer("global_max_pooling_2d");
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Test_TFLite, dnnBackendsAndTargets());
+
 }}  // namespace
 
 #endif  // OPENCV_TEST_DNN_TFLITE
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index 8510ec4e64c0..f1d7521e7b0a 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -358,6 +358,8 @@ TEST_P(Test_Torch_nets, OpenFace_accuracy)
 
     net.setPreferableBackend(backend);
     net.setPreferableTarget(target);
+    if (target == DNN_TARGET_CPU_FP16)
+        net.enableWinograd(false);
 
     Mat sample = imread(findDataFile("cv/shared/lena.png"));
     Mat sampleF32(sample.size(), CV_32FC3);
@@ -449,7 +451,7 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
         throw SkipTestException("");
     }
 #endif
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000)
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2023000000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
 #endif
@@ -542,6 +544,9 @@ TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy)
         Mat img = imread(findDataFile("dnn/googlenet_1.png"));
         Mat inputBlob = blobFromImage(img, 1.0, Size(), Scalar(103.939, 116.779, 123.68), false);
 
+        if (target == DNN_TARGET_CPU_FP16)
+            net.enableWinograd(false);
+
         net.setInput(inputBlob);
         Mat out = net.forward();
 
@@ -566,14 +571,14 @@ TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy)
         }
         else if(target == DNN_TARGET_CUDA_FP16)
         {
-            normAssert(out, refBlob, "", 0.6, 25);
+            normAssert(out, refBlob, "", 0.6, 26);
         }
         else if (target == DNN_TARGET_CPU_FP16)
         {
-            normAssert(out, refBlob, "", 0.62, 25);
+            normAssert(out, refBlob, "", 0.7, 25);
         }
         else
-            normAssert(out, refBlob, "", 0.5, 1.1);
+            normAssert(out, refBlob, "", 0.5, 1.16);
     }
 }
 
diff --git a/modules/features2d/3rdparty/mscr/chi_table.h b/modules/features2d/3rdparty/mscr/chi_table.h
new file mode 100644
index 000000000000..c0e9bae046d2
--- /dev/null
+++ b/modules/features2d/3rdparty/mscr/chi_table.h
@@ -0,0 +1,135 @@
+/*
+**
+**                           License Agreement
+**                           For chi_table.h
+**
+** Copyright (C) 2007 Per-Erik Forssen, all rights reserved.
+**
+** Redistribution and use in source and binary forms, with or without modification,
+** are permitted provided that the following conditions are met:
+**
+**   * Redistribution's of source code must retain the above copyright notice,
+**     this list of conditions and the following disclaimer.
+**
+**   * Redistribution's in binary form must reproduce the above copyright notice,
+**     this list of conditions and the following disclaimer in the documentation
+**     and/or other materials provided with the distribution.
+**
+**   * The name of the copyright holders may not be used to endorse or promote products
+**     derived from this software without specific prior written permission.
+**
+** This software is provided by the copyright holders and contributors "as is" and
+** any express or implied warranties, including, but not limited to, the implied
+** warranties of merchantability and fitness for a particular purpose are disclaimed.
+** In no event shall the Intel Corporation or contributors be liable for any direct,
+** indirect, incidental, special, exemplary, or consequential damages
+** (including, but not limited to, procurement of substitute goods or services;
+** loss of use, data, or profits; or business interruption) however caused
+** and on any theory of liability, whether in contract, strict liability,
+** or tort (including negligence or otherwise) arising in any way out of
+** the use of this software, even if advised of the possibility of such damage.
+**
+** Content origin: http://users.isy.liu.se/cvl/perfo/software/chi_table.h
+*/
+#define TABLE_SIZE 400
+
+static double chitab3[]={0,  0.0150057,  0.0239478,  0.0315227,
+                  0.0383427,  0.0446605,  0.0506115,  0.0562786,
+                  0.0617174,  0.0669672,  0.0720573,  0.0770099,
+                  0.081843,  0.0865705,  0.0912043,  0.0957541,
+                  0.100228,  0.104633,  0.108976,  0.113261,
+                  0.117493,  0.121676,  0.125814,  0.12991,
+                  0.133967,  0.137987,  0.141974,  0.145929,
+                  0.149853,  0.15375,  0.15762,  0.161466,
+                  0.165287,  0.169087,  0.172866,  0.176625,
+                  0.180365,  0.184088,  0.187794,  0.191483,
+                  0.195158,  0.198819,  0.202466,  0.2061,
+                  0.209722,  0.213332,  0.216932,  0.220521,
+                  0.2241,  0.22767,  0.231231,  0.234783,
+                  0.238328,  0.241865,  0.245395,  0.248918,
+                  0.252435,  0.255947,  0.259452,  0.262952,
+                  0.266448,  0.269939,  0.273425,  0.276908,
+                  0.280386,  0.283862,  0.287334,  0.290803,
+                  0.29427,  0.297734,  0.301197,  0.304657,
+                  0.308115,  0.311573,  0.315028,  0.318483,
+                  0.321937,  0.32539,  0.328843,  0.332296,
+                  0.335749,  0.339201,  0.342654,  0.346108,
+                  0.349562,  0.353017,  0.356473,  0.35993,
+                  0.363389,  0.366849,  0.37031,  0.373774,
+                  0.377239,  0.380706,  0.384176,  0.387648,
+                  0.391123,  0.3946,  0.39808,  0.401563,
+                  0.405049,  0.408539,  0.412032,  0.415528,
+                  0.419028,  0.422531,  0.426039,  0.429551,
+                  0.433066,  0.436586,  0.440111,  0.44364,
+                  0.447173,  0.450712,  0.454255,  0.457803,
+                  0.461356,  0.464915,  0.468479,  0.472049,
+                  0.475624,  0.479205,  0.482792,  0.486384,
+                  0.489983,  0.493588,  0.4972,  0.500818,
+                  0.504442,  0.508073,  0.511711,  0.515356,
+                  0.519008,  0.522667,  0.526334,  0.530008,
+                  0.533689,  0.537378,  0.541075,  0.54478,
+                  0.548492,  0.552213,  0.555942,  0.55968,
+                  0.563425,  0.56718,  0.570943,  0.574715,
+                  0.578497,  0.582287,  0.586086,  0.589895,
+                  0.593713,  0.597541,  0.601379,  0.605227,
+                  0.609084,  0.612952,  0.61683,  0.620718,
+                  0.624617,  0.628526,  0.632447,  0.636378,
+                  0.64032,  0.644274,  0.648239,  0.652215,
+                  0.656203,  0.660203,  0.664215,  0.668238,
+                  0.672274,  0.676323,  0.680384,  0.684457,
+                  0.688543,  0.692643,  0.696755,  0.700881,
+                  0.70502,  0.709172,  0.713339,  0.717519,
+                  0.721714,  0.725922,  0.730145,  0.734383,
+                  0.738636,  0.742903,  0.747185,  0.751483,
+                  0.755796,  0.760125,  0.76447,  0.768831,
+                  0.773208,  0.777601,  0.782011,  0.786438,
+                  0.790882,  0.795343,  0.799821,  0.804318,
+                  0.808831,  0.813363,  0.817913,  0.822482,
+                  0.827069,  0.831676,  0.836301,  0.840946,
+                  0.84561,  0.850295,  0.854999,  0.859724,
+                  0.864469,  0.869235,  0.874022,  0.878831,
+                  0.883661,  0.888513,  0.893387,  0.898284,
+                  0.903204,  0.908146,  0.913112,  0.918101,
+                  0.923114,  0.928152,  0.933214,  0.938301,
+                  0.943413,  0.94855,  0.953713,  0.958903,
+                  0.964119,  0.969361,  0.974631,  0.979929,
+                  0.985254,  0.990608,  0.99599,  1.0014,
+                  1.00684,  1.01231,  1.01781,  1.02335,
+                  1.02891,  1.0345,  1.04013,  1.04579,
+                  1.05148,  1.05721,  1.06296,  1.06876,
+                  1.07459,  1.08045,  1.08635,  1.09228,
+                  1.09826,  1.10427,  1.11032,  1.1164,
+                  1.12253,  1.1287,  1.1349,  1.14115,
+                  1.14744,  1.15377,  1.16015,  1.16656,
+                  1.17303,  1.17954,  1.18609,  1.19269,
+                  1.19934,  1.20603,  1.21278,  1.21958,
+                  1.22642,  1.23332,  1.24027,  1.24727,
+                  1.25433,  1.26144,  1.26861,  1.27584,
+                  1.28312,  1.29047,  1.29787,  1.30534,
+                  1.31287,  1.32046,  1.32812,  1.33585,
+                  1.34364,  1.3515,  1.35943,  1.36744,
+                  1.37551,  1.38367,  1.39189,  1.4002,
+                  1.40859,  1.41705,  1.42561,  1.43424,
+                  1.44296,  1.45177,  1.46068,  1.46967,
+                  1.47876,  1.48795,  1.49723,  1.50662,
+                  1.51611,  1.52571,  1.53541,  1.54523,
+                  1.55517,  1.56522,  1.57539,  1.58568,
+                  1.59611,  1.60666,  1.61735,  1.62817,
+                  1.63914,  1.65025,  1.66152,  1.67293,
+                  1.68451,  1.69625,  1.70815,  1.72023,
+                  1.73249,  1.74494,  1.75757,  1.77041,
+                  1.78344,  1.79669,  1.81016,  1.82385,
+                  1.83777,  1.85194,  1.86635,  1.88103,
+                  1.89598,  1.91121,  1.92674,  1.94257,
+                  1.95871,  1.97519,  1.99201,  2.0092,
+                  2.02676,  2.04471,  2.06309,  2.08189,
+                  2.10115,  2.12089,  2.14114,  2.16192,
+                  2.18326,  2.2052,  2.22777,  2.25101,
+                  2.27496,  2.29966,  2.32518,  2.35156,
+                  2.37886,  2.40717,  2.43655,  2.46709,
+                  2.49889,  2.53206,  2.56673,  2.60305,
+                  2.64117,  2.6813,  2.72367,  2.76854,
+                  2.81623,  2.86714,  2.92173,  2.98059,
+                  3.04446,  3.1143,  3.19135,  3.27731,
+                  3.37455,  3.48653,  3.61862,  3.77982,
+                  3.98692,  4.2776,  4.77167,  133.333 };
diff --git a/modules/features2d/3rdparty/mscr/chi_table_LICENSE.txt b/modules/features2d/3rdparty/mscr/chi_table_LICENSE.txt
new file mode 100644
index 000000000000..66b272dd2d09
--- /dev/null
+++ b/modules/features2d/3rdparty/mscr/chi_table_LICENSE.txt
@@ -0,0 +1,28 @@
+                          License Agreement
+                          For chi_table.h
+
+Copyright (C) 2007 Per-Erik Forssen, all rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+  * Redistribution's of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+  * Redistribution's in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+  * The name of the copyright holders may not be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+This software is provided by the copyright holders and contributors "as is" and
+any express or implied warranties, including, but not limited to, the implied
+warranties of merchantability and fitness for a particular purpose are disclaimed.
+In no event shall the Intel Corporation or contributors be liable for any direct,
+indirect, incidental, special, exemplary, or consequential damages
+(including, but not limited to, procurement of substitute goods or services;
+loss of use, data, or profits; or business interruption) however caused
+and on any theory of liability, whether in contract, strict liability,
+or tort (including negligence or otherwise) arising in any way out of
+the use of this software, even if advised of the possibility of such damage.
diff --git a/modules/features2d/CMakeLists.txt b/modules/features2d/CMakeLists.txt
index a586d4606e79..91fea8bcc8d3 100644
--- a/modules/features2d/CMakeLists.txt
+++ b/modules/features2d/CMakeLists.txt
@@ -7,3 +7,5 @@ if(DEBUG_opencv_features2d)
   list(APPEND debug_modules opencv_highgui)
 endif()
 ocv_define_module(features2d opencv_imgproc ${debug_modules} OPTIONAL opencv_flann WRAP java objc python js)
+
+ocv_install_3rdparty_licenses(mscr "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/mscr/chi_table_LICENSE.txt")
diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index 98ae85f35351..b4c4dde7121f 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -56,15 +56,15 @@
     @defgroup features2d_main Feature Detection and Description
     @defgroup features2d_match Descriptor Matchers
 
-Matchers of keypoint descriptors in OpenCV have wrappers with a common interface that enables you to
-easily switch between different algorithms solving the same problem. This section is devoted to
-matching descriptors that are represented as vectors in a multidimensional space. All objects that
-implement vector descriptor matchers inherit the DescriptorMatcher interface.
+    Matchers of keypoint descriptors in OpenCV have wrappers with a common interface that enables
+    you to easily switch between different algorithms solving the same problem. This section is
+    devoted to matching descriptors that are represented as vectors in a multidimensional space.
+    All objects that implement vector descriptor matchers inherit the DescriptorMatcher interface.
 
     @defgroup features2d_draw Drawing Function of Keypoints and Matches
     @defgroup features2d_category Object Categorization
 
-This section describes approaches based on local 2D features and used to categorize objects.
+    This section describes approaches based on local 2D features and used to categorize objects.
 
     @defgroup feature2d_hal Hardware Acceleration Layer
     @{
@@ -567,10 +567,6 @@ class CV_EXPORTS_W MSER : public Feature2D
     CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
 };
 
-//! @} features2d_main
-
-//! @addtogroup features2d_main
-//! @{
 
 /** @brief Wrapping class for feature detection using the FAST method. :
  */
@@ -627,10 +623,6 @@ detection, use cv.FAST.detect() method.
 CV_EXPORTS void FAST( InputArray image, CV_OUT std::vector<KeyPoint>& keypoints,
                       int threshold, bool nonmaxSuppression, FastFeatureDetector::DetectorType type );
 
-//! @} features2d_main
-
-//! @addtogroup features2d_main
-//! @{
 
 /** @brief Wrapping class for feature detection using the AGAST method. :
  */
@@ -793,10 +785,6 @@ class CV_EXPORTS_W SimpleBlobDetector : public Feature2D
   CV_WRAP virtual const std::vector<std::vector<cv::Point> >& getBlobContours() const;
 };
 
-//! @} features2d_main
-
-//! @addtogroup features2d_main
-//! @{
 
 /** @brief Class implementing the KAZE keypoint detector and descriptor extractor, described in @cite ABD12 .
 
@@ -889,11 +877,15 @@ class CV_EXPORTS_W AKAZE : public Feature2D
     @param nOctaveLayers Default number of sublevels per scale level
     @param diffusivity Diffusivity type. DIFF_PM_G1, DIFF_PM_G2, DIFF_WEICKERT or
     DIFF_CHARBONNIER
+    @param max_points Maximum amount of returned points. In case if image contains
+    more features, then the features with highest response are returned.
+    Negative value means no limitation.
      */
     CV_WRAP static Ptr<AKAZE> create(AKAZE::DescriptorType descriptor_type = AKAZE::DESCRIPTOR_MLDB,
                                      int descriptor_size = 0, int descriptor_channels = 3,
                                      float threshold = 0.001f, int nOctaves = 4,
-                                     int nOctaveLayers = 4, KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2);
+                                     int nOctaveLayers = 4, KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2,
+                                     int max_points = -1);
 
     CV_WRAP virtual void setDescriptorType(AKAZE::DescriptorType dtype) = 0;
     CV_WRAP virtual AKAZE::DescriptorType getDescriptorType() const = 0;
@@ -916,9 +908,11 @@ class CV_EXPORTS_W AKAZE : public Feature2D
     CV_WRAP virtual void setDiffusivity(KAZE::DiffusivityType diff) = 0;
     CV_WRAP virtual KAZE::DiffusivityType getDiffusivity() const = 0;
     CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+
+    CV_WRAP virtual void setMaxPoints(int max_points) = 0;
+    CV_WRAP virtual int getMaxPoints() const = 0;
 };
 
-//! @} features2d_main
 
 /****************************************************************************************\
 *                                      Distance                                          *
@@ -983,6 +977,8 @@ struct L1
     }
 };
 
+//! @} features2d_main
+
 /****************************************************************************************\
 *                                  DescriptorMatcher                                     *
 \****************************************************************************************/
@@ -1424,6 +1420,9 @@ CV_EXPORTS_AS(drawMatchesKnn) void drawMatches( InputArray img1, const std::vect
 *   Functions to evaluate the feature detectors and [generic] descriptor extractors      *
 \****************************************************************************************/
 
+//! @addtogroup features2d_main
+//! @{
+
 CV_EXPORTS void evaluateFeatureDetector( const Mat& img1, const Mat& img2, const Mat& H1to2,
                                          std::vector<KeyPoint>* keypoints1, std::vector<KeyPoint>* keypoints2,
                                          float& repeatability, int& correspCount,
@@ -1436,6 +1435,8 @@ CV_EXPORTS void computeRecallPrecisionCurve( const std::vector<std::vector<DMatc
 CV_EXPORTS float getRecall( const std::vector<Point2f>& recallPrecisionCurve, float l_precision );
 CV_EXPORTS int getNearestPoint( const std::vector<Point2f>& recallPrecisionCurve, float l_precision );
 
+//! @}
+
 /****************************************************************************************\
 *                                     Bag of visual words                                *
 \****************************************************************************************/
@@ -1537,8 +1538,8 @@ class CV_EXPORTS_W BOWImgDescriptorExtractor
     @param dmatcher Descriptor matcher that is used to find the nearest word of the trained vocabulary
     for each keypoint descriptor of the image.
      */
-    CV_WRAP BOWImgDescriptorExtractor( const Ptr<DescriptorExtractor>& dextractor,
-                               const Ptr<DescriptorMatcher>& dmatcher );
+    CV_WRAP BOWImgDescriptorExtractor( const Ptr<Feature2D>& dextractor,
+                                       const Ptr<DescriptorMatcher>& dmatcher );
     /** @overload */
     BOWImgDescriptorExtractor( const Ptr<DescriptorMatcher>& dmatcher );
     virtual ~BOWImgDescriptorExtractor();
@@ -1596,8 +1597,6 @@ class CV_EXPORTS_W BOWImgDescriptorExtractor
 
 //! @} features2d_category
 
-//! @} features2d
-
 } /* namespace cv */
 
 #endif
diff --git a/modules/features2d/misc/java/test/AKAZEDescriptorExtractorTest.java b/modules/features2d/misc/java/test/AKAZEDescriptorExtractorTest.java
index fd98cddee11f..a64b6ae4ad87 100644
--- a/modules/features2d/misc/java/test/AKAZEDescriptorExtractorTest.java
+++ b/modules/features2d/misc/java/test/AKAZEDescriptorExtractorTest.java
@@ -58,7 +58,7 @@ public void testWriteYml() {
 
         extractor.write(filename);
 
-        String truth = "%YAML:1.0\n---\nformat: 3\nname: \"Feature2D.AKAZE\"\ndescriptor: 5\ndescriptor_channels: 3\ndescriptor_size: 0\nthreshold: 1.0000000474974513e-03\noctaves: 4\nsublevels: 4\ndiffusivity: 1\n";
+        String truth = "%YAML:1.0\n---\nformat: 3\nname: \"Feature2D.AKAZE\"\ndescriptor: 5\ndescriptor_channels: 3\ndescriptor_size: 0\nthreshold: 0.0010000000474974513\noctaves: 4\nsublevels: 4\ndiffusivity: 1\nmax_points: -1\n";
         String actual = readFile(filename);
         actual = actual.replaceAll("e([+-])0(\\d\\d)", "e$1$2"); // NOTE: workaround for different platforms double representation
         assertEquals(truth, actual);
diff --git a/modules/features2d/misc/java/test/BOWImgDescriptorExtractorTest.java b/modules/features2d/misc/java/test/BOWImgDescriptorExtractorTest.java
new file mode 100644
index 000000000000..a8ff001e858b
--- /dev/null
+++ b/modules/features2d/misc/java/test/BOWImgDescriptorExtractorTest.java
@@ -0,0 +1,48 @@
+package org.opencv.test.features2d;
+
+import org.opencv.core.Core;
+import org.opencv.core.CvType;
+import org.opencv.core.Mat;
+import org.opencv.core.MatOfKeyPoint;
+import org.opencv.core.Point;
+import org.opencv.core.Scalar;
+import org.opencv.core.KeyPoint;
+import org.opencv.features2d.ORB;
+import org.opencv.features2d.DescriptorMatcher;
+import org.opencv.features2d.BOWImgDescriptorExtractor;
+import org.opencv.test.OpenCVTestCase;
+import org.opencv.test.OpenCVTestRunner;
+import org.opencv.imgproc.Imgproc;
+
+public class BOWImgDescriptorExtractorTest extends OpenCVTestCase {
+
+    ORB extractor;
+    DescriptorMatcher matcher;
+    int matSize;
+
+    public static void assertDescriptorsClose(Mat expected, Mat actual, int allowedDistance) {
+        double distance = Core.norm(expected, actual, Core.NORM_HAMMING);
+        assertTrue("expected:<" + allowedDistance + "> but was:<" + distance + ">", distance <= allowedDistance);
+    }
+
+    private Mat getTestImg() {
+        Mat cross = new Mat(matSize, matSize, CvType.CV_8U, new Scalar(255));
+        Imgproc.line(cross, new Point(20, matSize / 2), new Point(matSize - 21, matSize / 2), new Scalar(100), 2);
+        Imgproc.line(cross, new Point(matSize / 2, 20), new Point(matSize / 2, matSize - 21), new Scalar(100), 2);
+
+        return cross;
+    }
+
+    @Override
+    protected void setUp() throws Exception {
+        super.setUp();
+        extractor = ORB.create();
+        matcher = DescriptorMatcher.create(DescriptorMatcher.BRUTEFORCE);
+        matSize = 100;
+    }
+
+    public void testCreate() {
+        BOWImgDescriptorExtractor bow = new BOWImgDescriptorExtractor(extractor, matcher);
+    }
+
+}
diff --git a/modules/features2d/misc/java/test/GFTTFeatureDetectorTest.java b/modules/features2d/misc/java/test/GFTTFeatureDetectorTest.java
index 86e42cbc1d0f..d21d4f2475ae 100644
--- a/modules/features2d/misc/java/test/GFTTFeatureDetectorTest.java
+++ b/modules/features2d/misc/java/test/GFTTFeatureDetectorTest.java
@@ -58,7 +58,7 @@ public void testWriteYml() {
 
         detector.write(filename);
 
-        String truth = "%YAML:1.0\n---\nname: \"Feature2D.GFTTDetector\"\nnfeatures: 1000\nqualityLevel: 1.0000000000000000e-02\nminDistance: 1.\nblockSize: 3\ngradSize: 3\nuseHarrisDetector: 0\nk: 4.0000000000000001e-02\n";
+        String truth = "%YAML:1.0\n---\nname: \"Feature2D.GFTTDetector\"\nnfeatures: 1000\nqualityLevel: 0.01\nminDistance: 1.\nblockSize: 3\ngradSize: 3\nuseHarrisDetector: 0\nk: 0.040000000000000001\n";
         String actual = readFile(filename);
         actual = actual.replaceAll("e([+-])0(\\d\\d)", "e$1$2"); // NOTE: workaround for different platforms double representation
         assertEquals(truth, actual);
diff --git a/modules/features2d/misc/java/test/KAZEDescriptorExtractorTest.java b/modules/features2d/misc/java/test/KAZEDescriptorExtractorTest.java
index 69ca35e01597..d33ee24f49ce 100644
--- a/modules/features2d/misc/java/test/KAZEDescriptorExtractorTest.java
+++ b/modules/features2d/misc/java/test/KAZEDescriptorExtractorTest.java
@@ -57,7 +57,7 @@ public void testWriteYml() {
 
         extractor.write(filename);
 
-        String truth = "%YAML:1.0\n---\nformat: 3\nname: \"Feature2D.KAZE\"\nextended: 0\nupright: 0\nthreshold: 1.0000000474974513e-03\noctaves: 4\nsublevels: 4\ndiffusivity: 1\n";
+        String truth = "%YAML:1.0\n---\nformat: 3\nname: \"Feature2D.KAZE\"\nextended: 0\nupright: 0\nthreshold: 0.0010000000474974513\noctaves: 4\nsublevels: 4\ndiffusivity: 1\n";
         String actual = readFile(filename);
         actual = actual.replaceAll("e([+-])0(\\d\\d)", "e$1$2"); // NOTE: workaround for different platforms double representation
         assertEquals(truth, actual);
diff --git a/modules/features2d/misc/java/test/MSERFeatureDetectorTest.java b/modules/features2d/misc/java/test/MSERFeatureDetectorTest.java
index 7f5f1c1849db..956e0600e373 100644
--- a/modules/features2d/misc/java/test/MSERFeatureDetectorTest.java
+++ b/modules/features2d/misc/java/test/MSERFeatureDetectorTest.java
@@ -61,7 +61,7 @@ public void testWriteYml() {
 
         detector.write(filename);
 
-        String truth = "%YAML:1.0\n---\nname: \"Feature2D.MSER\"\ndelta: 5\nminArea: 60\nmaxArea: 14400\nmaxVariation: 2.5000000000000000e-01\nminDiversity: 2.0000000000000001e-01\nmaxEvolution: 200\nareaThreshold: 1.0100000000000000e+00\nminMargin: 3.0000000000000001e-03\nedgeBlurSize: 5\npass2Only: 0\n";
+        String truth = "%YAML:1.0\n---\nname: \"Feature2D.MSER\"\ndelta: 5\nminArea: 60\nmaxArea: 14400\nmaxVariation: 0.25\nminDiversity: 0.20000000000000001\nmaxEvolution: 200\nareaThreshold: 1.01\nminMargin: 0.0030000000000000001\nedgeBlurSize: 5\npass2Only: 0\n";
         String actual = readFile(filename);
         actual = actual.replaceAll("e([+-])0(\\d\\d)", "e$1$2"); // NOTE: workaround for different platforms double representation
         assertEquals(truth, actual);
diff --git a/modules/features2d/misc/java/test/ORBDescriptorExtractorTest.java b/modules/features2d/misc/java/test/ORBDescriptorExtractorTest.java
index 6bc9bb6299dc..a1a96491f515 100644
--- a/modules/features2d/misc/java/test/ORBDescriptorExtractorTest.java
+++ b/modules/features2d/misc/java/test/ORBDescriptorExtractorTest.java
@@ -111,7 +111,7 @@ public void testWriteYml() {
 
         extractor.write(filename);
 
-        String truth = "%YAML:1.0\n---\nname: \"Feature2D.ORB\"\nnfeatures: 500\nscaleFactor: 1.2000000476837158e+00\nnlevels: 8\nedgeThreshold: 31\nfirstLevel: 0\nwta_k: 2\nscoreType: 0\npatchSize: 31\nfastThreshold: 20\n";
+        String truth = "%YAML:1.0\n---\nname: \"Feature2D.ORB\"\nnfeatures: 500\nscaleFactor: 1.2000000476837158\nnlevels: 8\nedgeThreshold: 31\nfirstLevel: 0\nwta_k: 2\nscoreType: 0\npatchSize: 31\nfastThreshold: 20\n";
 //        String truth = "%YAML:1.0\n---\n";
         String actual = readFile(filename);
         actual = actual.replaceAll("e\\+000", "e+00"); // NOTE: workaround for different platforms double representation
diff --git a/modules/features2d/misc/java/test/SIFTDescriptorExtractorTest.java b/modules/features2d/misc/java/test/SIFTDescriptorExtractorTest.java
index 63a59aa58c76..dcd8564c3cd1 100644
--- a/modules/features2d/misc/java/test/SIFTDescriptorExtractorTest.java
+++ b/modules/features2d/misc/java/test/SIFTDescriptorExtractorTest.java
@@ -100,7 +100,7 @@ public void testWriteYml() {
 
         extractor.write(filename);
 
-        String truth = "%YAML:1.0\n---\nname: \"Feature2D.SIFT\"\nnfeatures: 0\nnOctaveLayers: 3\ncontrastThreshold: 4.0000000000000001e-02\nedgeThreshold: 10.\nsigma: 1.6000000000000001e+00\ndescriptorType: 5\n";
+        String truth = "%YAML:1.0\n---\nname: \"Feature2D.SIFT\"\nnfeatures: 0\nnOctaveLayers: 3\ncontrastThreshold: 0.040000000000000001\nedgeThreshold: 10.\nsigma: 1.6000000000000001\ndescriptorType: 5\n";
         String actual = readFile(filename);
         actual = actual.replaceAll("e([+-])0(\\d\\d)", "e$1$2"); // NOTE: workaround for different platforms double representation
         assertEquals(truth, actual);
diff --git a/modules/features2d/misc/java/test/SIMPLEBLOBFeatureDetectorTest.java b/modules/features2d/misc/java/test/SIMPLEBLOBFeatureDetectorTest.java
index a67a0e8c3ae4..75817ca6b177 100644
--- a/modules/features2d/misc/java/test/SIMPLEBLOBFeatureDetectorTest.java
+++ b/modules/features2d/misc/java/test/SIMPLEBLOBFeatureDetectorTest.java
@@ -133,8 +133,7 @@ public void testWrite() {
         String filename = OpenCVTestRunner.getTempFileName("xml");
 
         detector.write(filename);
-
-        String truth = "<?xml version=\"1.0\"?>\n<opencv_storage>\n<format>3</format>\n<thresholdStep>10.</thresholdStep>\n<minThreshold>50.</minThreshold>\n<maxThreshold>220.</maxThreshold>\n<minRepeatability>2</minRepeatability>\n<minDistBetweenBlobs>10.</minDistBetweenBlobs>\n<filterByColor>1</filterByColor>\n<blobColor>0</blobColor>\n<filterByArea>1</filterByArea>\n<minArea>25.</minArea>\n<maxArea>5000.</maxArea>\n<filterByCircularity>0</filterByCircularity>\n<minCircularity>8.0000001192092896e-01</minCircularity>\n<maxCircularity>3.4028234663852886e+38</maxCircularity>\n<filterByInertia>1</filterByInertia>\n<minInertiaRatio>1.0000000149011612e-01</minInertiaRatio>\n<maxInertiaRatio>3.4028234663852886e+38</maxInertiaRatio>\n<filterByConvexity>1</filterByConvexity>\n<minConvexity>9.4999998807907104e-01</minConvexity>\n<maxConvexity>3.4028234663852886e+38</maxConvexity>\n<collectContours>0</collectContours>\n</opencv_storage>\n";
+        String truth = "<?xml version=\"1.0\"?>\n<opencv_storage>\n<format>3</format>\n<thresholdStep>10.</thresholdStep>\n<minThreshold>50.</minThreshold>\n<maxThreshold>220.</maxThreshold>\n<minRepeatability>2</minRepeatability>\n<minDistBetweenBlobs>10.</minDistBetweenBlobs>\n<filterByColor>1</filterByColor>\n<blobColor>0</blobColor>\n<filterByArea>1</filterByArea>\n<minArea>25.</minArea>\n<maxArea>5000.</maxArea>\n<filterByCircularity>0</filterByCircularity>\n<minCircularity>0.80000001192092896</minCircularity>\n<maxCircularity>3.4028234663852886e+38</maxCircularity>\n<filterByInertia>1</filterByInertia>\n<minInertiaRatio>0.10000000149011612</minInertiaRatio>\n<maxInertiaRatio>3.4028234663852886e+38</maxInertiaRatio>\n<filterByConvexity>1</filterByConvexity>\n<minConvexity>0.94999998807907104</minConvexity>\n<maxConvexity>3.4028234663852886e+38</maxConvexity>\n<collectContours>0</collectContours>\n</opencv_storage>\n";
         assertEquals(truth, readFile(filename));
     }
 }
diff --git a/modules/features2d/src/akaze.cpp b/modules/features2d/src/akaze.cpp
index 7aa97dae36ef..a41ee552004a 100644
--- a/modules/features2d/src/akaze.cpp
+++ b/modules/features2d/src/akaze.cpp
@@ -61,7 +61,7 @@ namespace cv
     {
     public:
         AKAZE_Impl(DescriptorType _descriptor_type, int _descriptor_size, int _descriptor_channels,
-                 float _threshold, int _octaves, int _sublevels, KAZE::DiffusivityType _diffusivity)
+                 float _threshold, int _octaves, int _sublevels, KAZE::DiffusivityType _diffusivity, int _max_points)
         : descriptor(_descriptor_type)
         , descriptor_channels(_descriptor_channels)
         , descriptor_size(_descriptor_size)
@@ -69,6 +69,7 @@ namespace cv
         , octaves(_octaves)
         , sublevels(_sublevels)
         , diffusivity(_diffusivity)
+        , max_points(_max_points)
         {
         }
 
@@ -98,6 +99,9 @@ namespace cv
         void setDiffusivity(KAZE::DiffusivityType diff_) CV_OVERRIDE{ diffusivity = diff_; }
         KAZE::DiffusivityType getDiffusivity() const CV_OVERRIDE{ return diffusivity; }
 
+        void setMaxPoints(int max_points_) CV_OVERRIDE { max_points = max_points_; }
+        int getMaxPoints() const CV_OVERRIDE { return max_points; }
+
         // returns the descriptor size in bytes
         int descriptorSize() const CV_OVERRIDE
         {
@@ -195,6 +199,12 @@ namespace cv
                 KeyPointsFilter::runByPixelsMask(keypoints, mask.getMat());
             }
 
+            if (max_points > 0 && (int)keypoints.size() > max_points) {
+                std::partial_sort(keypoints.begin(), keypoints.begin() + max_points, keypoints.end(),
+                    [](const cv::KeyPoint& k1, const cv::KeyPoint& k2) {return k1.response > k2.response;});
+                keypoints.erase(keypoints.begin() + max_points, keypoints.end());
+            }
+
             if(descriptors.needed())
             {
                 impl.Compute_Descriptors(keypoints, descriptors);
@@ -215,6 +225,7 @@ namespace cv
             fs << "octaves" << octaves;
             fs << "sublevels" << sublevels;
             fs << "diffusivity" << diffusivity;
+            fs << "max_points" << max_points;
         }
 
         void read(const FileNode& fn) CV_OVERRIDE
@@ -234,6 +245,8 @@ namespace cv
                 sublevels = (int)fn["sublevels"];
             if (!fn["diffusivity"].empty())
                 diffusivity = static_cast<KAZE::DiffusivityType>((int)fn["diffusivity"]);
+            if (!fn["max_points"].empty())
+                max_points = (int)fn["max_points"];
         }
 
         DescriptorType descriptor;
@@ -243,15 +256,16 @@ namespace cv
         int octaves;
         int sublevels;
         KAZE::DiffusivityType diffusivity;
+        int max_points;
     };
 
     Ptr<AKAZE> AKAZE::create(DescriptorType descriptor_type,
                              int descriptor_size, int descriptor_channels,
                              float threshold, int octaves,
-                             int sublevels, KAZE::DiffusivityType diffusivity)
+                             int sublevels, KAZE::DiffusivityType diffusivity, int max_points)
     {
         return makePtr<AKAZE_Impl>(descriptor_type, descriptor_size, descriptor_channels,
-                                   threshold, octaves, sublevels, diffusivity);
+                                   threshold, octaves, sublevels, diffusivity, max_points);
     }
 
     String AKAZE::getDefaultName() const
diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp
index 163f02717e09..cb088eb5351a 100644
--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@@ -120,8 +120,8 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
                         for (; j < img.cols - 16 - 3; j += 16, ptr += 16)
                         {
                             v_uint8x16 v = v_load(ptr);
-                            v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
-                            v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
+                            v_int8x16 v0 = v_reinterpret_as_s8(v_xor(v_add(v, t), delta));
+                            v_int8x16 v1 = v_reinterpret_as_s8(v_xor(v_sub(v, t), delta));
 
                             v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
                             v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
@@ -129,15 +129,15 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
                             v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));
 
                             v_int8x16 m0, m1;
-                            m0 = (v0 < x0) & (v0 < x1);
-                            m1 = (x0 < v1) & (x1 < v1);
-                            m0 = m0 | ((v0 < x1) & (v0 < x2));
-                            m1 = m1 | ((x1 < v1) & (x2 < v1));
-                            m0 = m0 | ((v0 < x2) & (v0 < x3));
-                            m1 = m1 | ((x2 < v1) & (x3 < v1));
-                            m0 = m0 | ((v0 < x3) & (v0 < x0));
-                            m1 = m1 | ((x3 < v1) & (x0 < v1));
-                            m0 = m0 | m1;
+                            m0 = v_and(v_lt(v0, x0), v_lt(v0, x1));
+                            m1 = v_and(v_lt(x0, v1), v_lt(x1, v1));
+                            m0 = v_or(m0, v_and(v_lt(v0, x1), v_lt(v0, x2)));
+                            m1 = v_or(m1, v_and(v_lt(x1, v1), v_lt(x2, v1)));
+                            m0 = v_or(m0, v_and(v_lt(v0, x2), v_lt(v0, x3)));
+                            m1 = v_or(m1, v_and(v_lt(x2, v1), v_lt(x3, v1)));
+                            m0 = v_or(m0, v_and(v_lt(v0, x3), v_lt(v0, x0)));
+                            m1 = v_or(m1, v_and(v_lt(x3, v1), v_lt(x0, v1)));
+                            m0 = v_or(m0, m1);
 
                             if( !v_check_any(m0) )
                                 continue;
@@ -154,18 +154,18 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
                             v_uint8x16 max1 = v_setzero_u8();
                             for( k = 0; k < N; k++ )
                             {
-                                v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
-                                m0 = v0 < x;
-                                m1 = x < v1;
+                                v_int8x16 x = v_reinterpret_as_s8(v_xor(v_load((ptr + pixel[k])), delta));
+                                m0 = v_lt(v0, x);
+                                m1 = v_lt(x, v1);
 
-                                c0 = v_sub_wrap(c0, m0) & m0;
-                                c1 = v_sub_wrap(c1, m1) & m1;
+                                c0 = v_and(v_sub_wrap(c0, m0), m0);
+                                c1 = v_and(v_sub_wrap(c1, m1), m1);
 
                                 max0 = v_max(max0, v_reinterpret_as_u8(c0));
                                 max1 = v_max(max1, v_reinterpret_as_u8(c1));
                             }
 
-                            max0 = K16 < v_max(max0, max1);
+                            max0 = v_lt(K16, v_max(max0, max1));
                             unsigned int m = v_signmask(v_reinterpret_as_s8(max0));
 
                             for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
@@ -190,7 +190,7 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
                                             a1 = v_min(a1, v_nms);
                                             b1 = v_max(b1, v_nms);
                                         }
-                                        curr[j + k] = (uchar)(v_reduce_max(v_max(v_max(a0, a1), v_setzero_s16() - v_min(b0, b1))) - 1);
+                                        curr[j + k] = (uchar)(v_reduce_max(v_max(v_max(a0, a1), v_sub(v_setzero_s16(), v_min(b0, b1)))) - 1);
                                     }
                                 }
                             }
diff --git a/modules/features2d/src/fast_score.cpp b/modules/features2d/src/fast_score.cpp
index 0bc011af491f..0c43ad55528d 100644
--- a/modules/features2d/src/fast_score.cpp
+++ b/modules/features2d/src/fast_score.cpp
@@ -160,7 +160,7 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold)
             q0 = v_max(q0, v_min(a, v0));
             q1 = v_min(q1, v_max(b, v0));
         }
-        q0 = v_max(q0, v_setzero_s16() - q1);
+        q0 = v_max(q0, v_sub(v_setzero_s16(), q1));
         threshold = v_reduce_max(q0) - 1;
     }
     else
@@ -251,7 +251,7 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold)
             q0 = v_max(q0, v_min(a, v0));
             q1 = v_min(q1, v_max(b, v0));
         }
-        q0 = v_max(q0, v_setzero_s16() - q1);
+        q0 = v_max(q0, v_sub(v_setzero_s16(), q1));
         threshold = v_reduce_max(q0) - 1;
     }
     else
@@ -323,7 +323,7 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
         v0 = v_load(d + 5);
         q0 = v_max(q0, v_min(a, v0));
         q1 = v_min(q1, v_max(b, v0));
-        q0 = v_max(q0, v_setzero_s16() - q1);
+        q0 = v_max(q0, v_sub(v_setzero_s16(), q1));
         threshold = v_reduce_max(q0) - 1;
     }
     else
diff --git a/modules/features2d/src/hal_replacement.hpp b/modules/features2d/src/hal_replacement.hpp
index f9fbf96daacd..6476d216515c 100644
--- a/modules/features2d/src/hal_replacement.hpp
+++ b/modules/features2d/src/hal_replacement.hpp
@@ -64,9 +64,12 @@
 //! @{
 /**
    @brief Detects corners using the FAST algorithm, returns mask.
-   @param src_data,src_step Source image
-   @param dst_data,dst_step Destination mask
-   @param width,height Source image dimensions
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination mask data
+   @param dst_step Destination mask step
+   @param width Source image width
+   @param height Source image height
    @param type FAST type
 */
 inline int hal_ni_FAST_dense(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, cv::FastFeatureDetector::DetectorType type) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
@@ -89,8 +92,10 @@ inline int hal_ni_FAST_NMS(const uchar* src_data, size_t src_step, uchar* dst_da
 
 /**
    @brief Detects corners using the FAST algorithm.
-   @param src_data,src_step Source image
-   @param width,height Source image dimensions
+   @param src_data Source image data
+   @param src_step Source image step
+   @param width Source image width
+   @param height Source image height
    @param keypoints_data Pointer to keypoints
    @param keypoints_count Count of keypoints
    @param threshold Threshold for keypoint
diff --git a/modules/features2d/src/kaze/nldiffusion_functions.cpp b/modules/features2d/src/kaze/nldiffusion_functions.cpp
index 59939a2bbf48..942b8d787513 100644
--- a/modules/features2d/src/kaze/nldiffusion_functions.cpp
+++ b/modules/features2d/src/kaze/nldiffusion_functions.cpp
@@ -86,9 +86,9 @@ void image_derivatives_scharr(const cv::Mat& src, cv::Mat& dst, int xorder, int
 /**
  * @brief This function computes the Perona and Malik conductivity coefficient g1
  * g1 = exp(-|dL|^2/k^2)
- * @param Lx First order image derivative in X-direction (horizontal)
- * @param Ly First order image derivative in Y-direction (vertical)
- * @param dst Output image
+ * @param _Lx First order image derivative in X-direction (horizontal)
+ * @param _Ly First order image derivative in Y-direction (vertical)
+ * @param _dst Output image
  * @param k Contrast factor parameter
  */
 void pm_g1(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) {
@@ -117,9 +117,9 @@ void pm_g1(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) {
 /**
  * @brief This function computes the Perona and Malik conductivity coefficient g2
  * g2 = 1 / (1 + dL^2 / k^2)
- * @param Lx First order image derivative in X-direction (horizontal)
- * @param Ly First order image derivative in Y-direction (vertical)
- * @param dst Output image
+ * @param _Lx First order image derivative in X-direction (horizontal)
+ * @param _Ly First order image derivative in Y-direction (vertical)
+ * @param _dst Output image
  * @param k Contrast factor parameter
  */
 void pm_g2(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) {
@@ -146,9 +146,9 @@ void pm_g2(InputArray _Lx, InputArray _Ly, OutputArray _dst, float k) {
 /* ************************************************************************* */
 /**
  * @brief This function computes Weickert conductivity coefficient gw
- * @param Lx First order image derivative in X-direction (horizontal)
- * @param Ly First order image derivative in Y-direction (vertical)
- * @param dst Output image
+ * @param _Lx First order image derivative in X-direction (horizontal)
+ * @param _Ly First order image derivative in Y-direction (vertical)
+ * @param _dst Output image
  * @param k Contrast factor parameter
  * @note For more information check the following paper: J. Weickert
  * Applications of nonlinear diffusion in image processing and computer vision,
@@ -183,9 +183,9 @@ void weickert_diffusivity(InputArray _Lx, InputArray _Ly, OutputArray _dst, floa
 /**
 * @brief This function computes Charbonnier conductivity coefficient gc
 * gc = 1 / sqrt(1 + dL^2 / k^2)
-* @param Lx First order image derivative in X-direction (horizontal)
-* @param Ly First order image derivative in Y-direction (vertical)
-* @param dst Output image
+* @param _Lx First order image derivative in X-direction (horizontal)
+* @param _Ly First order image derivative in Y-direction (vertical)
+* @param _dst Output image
 * @param k Contrast factor parameter
 * @note For more information check the following paper: J. Weickert
 * Applications of nonlinear diffusion in image processing and computer vision,
@@ -323,7 +323,7 @@ void compute_scharr_derivatives(const cv::Mat& src, cv::Mat& dst, int xorder, in
  * @param _ky Vertical kernel values
  * @param dx Derivative order in X-direction (horizontal)
  * @param dy Derivative order in Y-direction (vertical)
- * @param scale_ Scale factor or derivative size
+ * @param scale Scale factor or derivative size
  */
 void compute_derivative_kernels(cv::OutputArray _kx, cv::OutputArray _ky, int dx, int dy, int scale) {
     CV_INSTRUMENT_REGION();
@@ -415,7 +415,7 @@ class Nld_Step_Scalar_Invoker : public cv::ParallelLoopBody
 /* ************************************************************************* */
 /**
 * @brief This function performs a scalar non-linear diffusion step
-* @param Ld2 Output image in the evolution
+* @param Ld Output image in the evolution
 * @param c Conductivity image
 * @param Lstep Previous image in the evolution
 * @param stepsize The step size in time units
@@ -490,7 +490,7 @@ void nld_step_scalar(cv::Mat& Ld, const cv::Mat& c, cv::Mat& Lstep, float stepsi
 /* ************************************************************************* */
 /**
 * @brief This function downsamples the input image using OpenCV resize
-* @param img Input image to be downsampled
+* @param src Input image to be downsampled
 * @param dst Output image with half of the resolution of the input image
 */
 void halfsample_image(const cv::Mat& src, cv::Mat& dst) {
diff --git a/modules/features2d/src/kaze/utils.h b/modules/features2d/src/kaze/utils.h
index 44e5b7693514..63199430628b 100644
--- a/modules/features2d/src/kaze/utils.h
+++ b/modules/features2d/src/kaze/utils.h
@@ -6,7 +6,7 @@
  * @brief This function computes the value of a 2D Gaussian function
  * @param x X Position
  * @param y Y Position
- * @param sig Standard Deviation
+ * @param sigma Standard Deviation
  */
 inline float gaussian(float x, float y, float sigma) {
   return expf(-(x*x + y*y) / (2.0f*sigma*sigma));
diff --git a/modules/features2d/src/mser.cpp b/modules/features2d/src/mser.cpp
index d59ed3957416..5c8db481b1c3 100644
--- a/modules/features2d/src/mser.cpp
+++ b/modules/features2d/src/mser.cpp
@@ -30,18 +30,23 @@
  * OpenCV functions for MSER extraction
  *
  * 1. there are two different implementation of MSER, one for gray image, one for color image
- * 2. the gray image algorithm is taken from: Linear Time Maximally Stable Extremal Regions;
+ * 2. the gray image algorithm is taken from:
+ *      Linear Time Maximally Stable Extremal Regions;
  *    the paper claims to be faster than union-find method;
  *    it actually get 1.5~2m/s on my centrino L7200 1.2GHz laptop.
- * 3. the color image algorithm is taken from: Maximally Stable Colour Regions for Recognition and Match;
+ * 3. the color image algorithm is taken from:
+ *      Maximally Stable Colour Regions for Recognition and Match;
  *    it should be much slower than gray image method ( 3~4 times );
- *    the chi_table.h file is taken directly from paper's source code which is distributed under permissive BSD-like license: http://users.isy.liu.se/cvl/perfo/software/chi_table.h
+ *    the chi_table.h file is taken directly from the paper's source code:
+ *    http://users.isy.liu.se/cvl/perfo/software/chi_table.h
+ *    license (BSD-like) is located in the file: 3rdparty/mscr/chi_table_LICENSE.txt
  * 4. though the name is *contours*, the result actually is a list of point set.
  */
 
 #include "precomp.hpp"
 #include "opencv2/imgproc/imgproc_c.h"
 #include <limits>
+#include "../3rdparty/mscr/chi_table.h"
 
 namespace cv
 {
@@ -613,113 +618,6 @@ the color MSER has not been completely refactored yet. We leave it mostly as-is,
 with just enough changes to convert C structures to C++ ones and
 add support for color images into MSER_Impl::detectAndLabel.
 */
-
-const int TABLE_SIZE = 400;
-
-static const float chitab3[]=
-{
-    0.f,  0.0150057f,  0.0239478f,  0.0315227f,
-    0.0383427f,  0.0446605f,  0.0506115f,  0.0562786f,
-    0.0617174f,  0.0669672f,  0.0720573f,  0.0770099f,
-    0.081843f,  0.0865705f,  0.0912043f,  0.0957541f,
-    0.100228f,  0.104633f,  0.108976f,  0.113261f,
-    0.117493f,  0.121676f,  0.125814f,  0.12991f,
-    0.133967f,  0.137987f,  0.141974f,  0.145929f,
-    0.149853f,  0.15375f,  0.15762f,  0.161466f,
-    0.165287f,  0.169087f,  0.172866f,  0.176625f,
-    0.180365f,  0.184088f,  0.187794f,  0.191483f,
-    0.195158f,  0.198819f,  0.202466f,  0.2061f,
-    0.209722f,  0.213332f,  0.216932f,  0.220521f,
-    0.2241f,  0.22767f,  0.231231f,  0.234783f,
-    0.238328f,  0.241865f,  0.245395f,  0.248918f,
-    0.252435f,  0.255947f,  0.259452f,  0.262952f,
-    0.266448f,  0.269939f,  0.273425f,  0.276908f,
-    0.280386f,  0.283862f,  0.287334f,  0.290803f,
-    0.29427f,  0.297734f,  0.301197f,  0.304657f,
-    0.308115f,  0.311573f,  0.315028f,  0.318483f,
-    0.321937f,  0.32539f,  0.328843f,  0.332296f,
-    0.335749f,  0.339201f,  0.342654f,  0.346108f,
-    0.349562f,  0.353017f,  0.356473f,  0.35993f,
-    0.363389f,  0.366849f,  0.37031f,  0.373774f,
-    0.377239f,  0.380706f,  0.384176f,  0.387648f,
-    0.391123f,  0.3946f,  0.39808f,  0.401563f,
-    0.405049f,  0.408539f,  0.412032f,  0.415528f,
-    0.419028f,  0.422531f,  0.426039f,  0.429551f,
-    0.433066f,  0.436586f,  0.440111f,  0.44364f,
-    0.447173f,  0.450712f,  0.454255f,  0.457803f,
-    0.461356f,  0.464915f,  0.468479f,  0.472049f,
-    0.475624f,  0.479205f,  0.482792f,  0.486384f,
-    0.489983f,  0.493588f,  0.4972f,  0.500818f,
-    0.504442f,  0.508073f,  0.511711f,  0.515356f,
-    0.519008f,  0.522667f,  0.526334f,  0.530008f,
-    0.533689f,  0.537378f,  0.541075f,  0.54478f,
-    0.548492f,  0.552213f,  0.555942f,  0.55968f,
-    0.563425f,  0.56718f,  0.570943f,  0.574715f,
-    0.578497f,  0.582287f,  0.586086f,  0.589895f,
-    0.593713f,  0.597541f,  0.601379f,  0.605227f,
-    0.609084f,  0.612952f,  0.61683f,  0.620718f,
-    0.624617f,  0.628526f,  0.632447f,  0.636378f,
-    0.64032f,  0.644274f,  0.648239f,  0.652215f,
-    0.656203f,  0.660203f,  0.664215f,  0.668238f,
-    0.672274f,  0.676323f,  0.680384f,  0.684457f,
-    0.688543f,  0.692643f,  0.696755f,  0.700881f,
-    0.70502f,  0.709172f,  0.713339f,  0.717519f,
-    0.721714f,  0.725922f,  0.730145f,  0.734383f,
-    0.738636f,  0.742903f,  0.747185f,  0.751483f,
-    0.755796f,  0.760125f,  0.76447f,  0.768831f,
-    0.773208f,  0.777601f,  0.782011f,  0.786438f,
-    0.790882f,  0.795343f,  0.799821f,  0.804318f,
-    0.808831f,  0.813363f,  0.817913f,  0.822482f,
-    0.827069f,  0.831676f,  0.836301f,  0.840946f,
-    0.84561f,  0.850295f,  0.854999f,  0.859724f,
-    0.864469f,  0.869235f,  0.874022f,  0.878831f,
-    0.883661f,  0.888513f,  0.893387f,  0.898284f,
-    0.903204f,  0.908146f,  0.913112f,  0.918101f,
-    0.923114f,  0.928152f,  0.933214f,  0.938301f,
-    0.943413f,  0.94855f,  0.953713f,  0.958903f,
-    0.964119f,  0.969361f,  0.974631f,  0.979929f,
-    0.985254f,  0.990608f,  0.99599f,  1.0014f,
-    1.00684f,  1.01231f,  1.01781f,  1.02335f,
-    1.02891f,  1.0345f,  1.04013f,  1.04579f,
-    1.05148f,  1.05721f,  1.06296f,  1.06876f,
-    1.07459f,  1.08045f,  1.08635f,  1.09228f,
-    1.09826f,  1.10427f,  1.11032f,  1.1164f,
-    1.12253f,  1.1287f,  1.1349f,  1.14115f,
-    1.14744f,  1.15377f,  1.16015f,  1.16656f,
-    1.17303f,  1.17954f,  1.18609f,  1.19269f,
-    1.19934f,  1.20603f,  1.21278f,  1.21958f,
-    1.22642f,  1.23332f,  1.24027f,  1.24727f,
-    1.25433f,  1.26144f,  1.26861f,  1.27584f,
-    1.28312f,  1.29047f,  1.29787f,  1.30534f,
-    1.31287f,  1.32046f,  1.32812f,  1.33585f,
-    1.34364f,  1.3515f,  1.35943f,  1.36744f,
-    1.37551f,  1.38367f,  1.39189f,  1.4002f,
-    1.40859f,  1.41705f,  1.42561f,  1.43424f,
-    1.44296f,  1.45177f,  1.46068f,  1.46967f,
-    1.47876f,  1.48795f,  1.49723f,  1.50662f,
-    1.51611f,  1.52571f,  1.53541f,  1.54523f,
-    1.55517f,  1.56522f,  1.57539f,  1.58568f,
-    1.59611f,  1.60666f,  1.61735f,  1.62817f,
-    1.63914f,  1.65025f,  1.66152f,  1.67293f,
-    1.68451f,  1.69625f,  1.70815f,  1.72023f,
-    1.73249f,  1.74494f,  1.75757f,  1.77041f,
-    1.78344f,  1.79669f,  1.81016f,  1.82385f,
-    1.83777f,  1.85194f,  1.86635f,  1.88103f,
-    1.89598f,  1.91121f,  1.92674f,  1.94257f,
-    1.95871f,  1.97519f,  1.99201f,  2.0092f,
-    2.02676f,  2.04471f,  2.06309f,  2.08189f,
-    2.10115f,  2.12089f,  2.14114f,  2.16192f,
-    2.18326f,  2.2052f,  2.22777f,  2.25101f,
-    2.27496f,  2.29966f,  2.32518f,  2.35156f,
-    2.37886f,  2.40717f,  2.43655f,  2.46709f,
-    2.49889f,  2.53206f,  2.56673f,  2.60305f,
-    2.64117f,  2.6813f,  2.72367f,  2.76854f,
-    2.81623f,  2.86714f,  2.92173f,  2.98059f,
-    3.04446f,  3.1143f,  3.19135f,  3.27731f,
-    3.37455f,  3.48653f,  3.61862f,  3.77982f,
-    3.98692f,  4.2776f,  4.77167f,  133.333f
-};
-
 struct MSCRNode;
 
 struct TempMSCR
diff --git a/modules/features2d/src/sift.simd.hpp b/modules/features2d/src/sift.simd.hpp
index 8589a0225c34..2c5cf9f9978e 100644
--- a/modules/features2d/src/sift.simd.hpp
+++ b/modules/features2d/src/sift.simd.hpp
@@ -210,24 +210,24 @@ float calcOrientationHist(
     cv::hal::magnitude32f(X, Y, Mag, len);
 
     k = 0;
-#if CV_SIMD
-    const int vecsize = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int vecsize = VTraits<v_float32>::vlanes();
     v_float32 nd360 = vx_setall_f32(n/360.f);
     v_int32 __n = vx_setall_s32(n);
-    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[vecsize];
-    float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[vecsize];
+    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[VTraits<v_float32>::max_nlanes];
+    float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[VTraits<v_float32>::max_nlanes];
 
     for( ; k <= len - vecsize; k += vecsize )
     {
         v_float32 w = vx_load_aligned( W + k );
         v_float32 mag = vx_load_aligned( Mag + k );
         v_float32 ori = vx_load_aligned( Ori + k );
-        v_int32 bin = v_round( nd360 * ori );
+        v_int32 bin = v_round( v_mul(nd360, ori) );
 
-        bin = v_select(bin >= __n, bin - __n, bin);
-        bin = v_select(bin < vx_setzero_s32(), bin + __n, bin);
+        bin = v_select(v_ge(bin, __n), v_sub(bin, __n), bin);
+        bin = v_select(v_lt(bin, vx_setzero_s32()), v_add(bin, __n), bin);
 
-        w = w * mag;
+        w = v_mul(w, mag);
         v_store_aligned(bin_buf, bin);
         v_store_aligned(w_mul_mag_buf, w);
         for(int vi = 0; vi < vecsize; vi++)
@@ -253,19 +253,19 @@ float calcOrientationHist(
     temphist[n+1] = temphist[1];
 
     i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 d_1_16 = vx_setall_f32(1.f/16.f);
     v_float32 d_4_16 = vx_setall_f32(4.f/16.f);
     v_float32 d_6_16 = vx_setall_f32(6.f/16.f);
-    for( ; i <= n - v_float32::nlanes; i += v_float32::nlanes )
+    for( ; i <= n - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
     {
         v_float32 tn2 = vx_load_aligned(temphist + i-2);
         v_float32 tn1 = vx_load(temphist + i-1);
         v_float32 t0 = vx_load(temphist + i);
         v_float32 t1 = vx_load(temphist + i+1);
         v_float32 t2 = vx_load(temphist + i+2);
-        v_float32 _hist = v_fma(tn2 + t2, d_1_16,
-            v_fma(tn1 + t1, d_4_16, t0 * d_6_16));
+        v_float32 _hist = v_fma(v_add(tn2, t2), d_1_16,
+            v_fma(v_add(tn1, t1), d_4_16, v_mul(t0, d_6_16)));
         v_store(hist + i, _hist);
     }
 #endif
@@ -452,8 +452,8 @@ class findScaleSpaceExtremaT
             const sift_wt* nextptr = next.ptr<sift_wt>(r);
             int c = SIFT_IMG_BORDER;
 
-#if CV_SIMD && !(DoG_TYPE_SHORT)
-            const int vecsize = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE) && !(DoG_TYPE_SHORT)
+            const int vecsize = VTraits<v_float32>::vlanes();
             for( ; c <= cols-SIFT_IMG_BORDER - vecsize; c += vecsize)
             {
                 v_float32 val = vx_load(&currptr[c]);
@@ -464,7 +464,7 @@ class findScaleSpaceExtremaT
                 v_float32 vmin,vmax;
 
 
-                v_float32 cond = v_abs(val) > vx_setall_f32((float)threshold);
+                v_float32 cond = v_gt(v_abs(val), vx_setall_f32((float)this->threshold));
                 if (!v_check_any(cond))
                 {
                     continue;
@@ -477,10 +477,10 @@ class findScaleSpaceExtremaT
                 vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
                 vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
 
-                v_float32 condp = cond & (val > vx_setall_f32(0)) & (val >= vmax);
-                v_float32 condm = cond & (val < vx_setall_f32(0)) & (val <= vmin);
+                v_float32 condp = v_and(v_and(cond, v_gt(val, vx_setall_f32(0))), v_ge(val, vmax));
+                v_float32 condm = v_and(v_and(cond, v_lt(val, vx_setall_f32(0))), v_le(val, vmin));
 
-                cond = condp | condm;
+                cond = v_or(condp, condm);
                 if (!v_check_any(cond))
                 {
                     continue;
@@ -493,10 +493,10 @@ class findScaleSpaceExtremaT
                 vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
                 vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
 
-                condp &= (val >= vmax);
-                condm &= (val <= vmin);
+                condp = v_and(condp, v_ge(val, vmax));
+                condm = v_and(condm, v_le(val, vmin));
 
-                cond = condp | condm;
+                cond = v_or(condp, condm);
                 if (!v_check_any(cond))
                 {
                     continue;
@@ -515,10 +515,10 @@ class findScaleSpaceExtremaT
                 vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
                 vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
 
-                condp &= (val >= v_max(vmax,max_middle));
-                condm &= (val <= v_min(vmin,min_middle));
+                condp = v_and(condp, v_ge(val, v_max(vmax, max_middle)));
+                condm = v_and(condm, v_le(val, v_min(vmin, min_middle)));
 
-                cond = condp | condm;
+                cond = v_or(condp, condm);
                 if (!v_check_any(cond))
                 {
                     continue;
@@ -777,11 +777,11 @@ void calcSIFTDescriptor(
     cv::hal::exp32f(W, W, len);
 
     k = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     {
-        const int vecsize = v_float32::nlanes;
-        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[vecsize];
-        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*vecsize];
+        const int vecsize = VTraits<v_float32>::vlanes();
+        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[VTraits<v_float32>::max_nlanes];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*VTraits<v_float32>::max_nlanes];
         const v_float32 __ori  = vx_setall_f32(ori);
         const v_float32 __bins_per_rad = vx_setall_f32(bins_per_rad);
         const v_int32 __n = vx_setall_s32(n);
@@ -792,28 +792,28 @@ void calcSIFTDescriptor(
         {
             v_float32 rbin = vx_load_aligned(RBin + k);
             v_float32 cbin = vx_load_aligned(CBin + k);
-            v_float32 obin = (vx_load_aligned(Ori + k) - __ori) * __bins_per_rad;
-            v_float32 mag = vx_load_aligned(Mag + k) * vx_load_aligned(W + k);
+            v_float32 obin = v_mul(v_sub(vx_load_aligned(Ori + k), __ori), __bins_per_rad);
+            v_float32 mag = v_mul(vx_load_aligned(Mag + k), vx_load_aligned(W + k));
 
             v_int32 r0 = v_floor(rbin);
             v_int32 c0 = v_floor(cbin);
             v_int32 o0 = v_floor(obin);
-            rbin -= v_cvt_f32(r0);
-            cbin -= v_cvt_f32(c0);
-            obin -= v_cvt_f32(o0);
-
-            o0 = v_select(o0 < vx_setzero_s32(), o0 + __n, o0);
-            o0 = v_select(o0 >= __n, o0 - __n, o0);
-
-            v_float32 v_r1 = mag*rbin, v_r0 = mag - v_r1;
-            v_float32 v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
-            v_float32 v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
-            v_float32 v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
-            v_float32 v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
-            v_float32 v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
-            v_float32 v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
-
-            v_int32 idx = v_muladd(v_muladd(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0);
+            rbin = v_sub(rbin, v_cvt_f32(r0));
+            cbin = v_sub(cbin, v_cvt_f32(c0));
+            obin = v_sub(obin, v_cvt_f32(o0));
+
+            o0 = v_select(v_lt(o0, vx_setzero_s32()), v_add(o0, __n), o0);
+            o0 = v_select(v_ge(o0, __n), v_sub(o0, __n), o0);
+
+            v_float32 v_r1 = v_mul(mag, rbin), v_r0 = v_sub(mag, v_r1);
+            v_float32 v_rc11 = v_mul(v_r1, cbin), v_rc10 = v_sub(v_r1, v_rc11);
+            v_float32 v_rc01 = v_mul(v_r0, cbin), v_rc00 = v_sub(v_r0, v_rc01);
+            v_float32 v_rco111 = v_mul(v_rc11, obin), v_rco110 = v_sub(v_rc11, v_rco111);
+            v_float32 v_rco101 = v_mul(v_rc10, obin), v_rco100 = v_sub(v_rc10, v_rco101);
+            v_float32 v_rco011 = v_mul(v_rc01, obin), v_rco010 = v_sub(v_rc01, v_rco011);
+            v_float32 v_rco001 = v_mul(v_rc00, obin), v_rco000 = v_sub(v_rc00, v_rco001);
+
+            v_int32 idx = v_muladd(v_muladd(v_add(r0, __1), __d_plus_2, v_add(c0, __1)), __n_plus_2, o0);
             v_store_aligned(idx_buf, idx);
 
             v_store_aligned(rco_buf,           v_rco000);
@@ -894,11 +894,11 @@ void calcSIFTDescriptor(
     float nrm2 = 0;
     len = d*d*n;
     k = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     {
         v_float32 __nrm2 = vx_setzero_f32();
         v_float32 __rawDst;
-        for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
+        for( ; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
         {
             __rawDst = vx_load_aligned(rawDst + k);
             __nrm2 = v_fma(__rawDst, __rawDst, __nrm2);
@@ -949,15 +949,15 @@ void calcSIFTDescriptor(
 if( dstMat.type() == CV_32F )
 {
     float* dst = dstMat.ptr<float>(row);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 __dst;
     v_float32 __min = vx_setzero_f32();
     v_float32 __max = vx_setall_f32(255.0f); // max of uchar
     v_float32 __nrm2 = vx_setall_f32(nrm2);
-    for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes )
+    for( k = 0; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
     {
         __dst = vx_load_aligned(rawDst + k);
-        __dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max);
+        __dst = v_min(v_max(v_cvt_f32(v_round(v_mul(__dst, __nrm2))), __min), __max);
         v_store(dst + k, __dst);
     }
 #endif
@@ -976,16 +976,16 @@ if( dstMat.type() == CV_32F )
 else // CV_8U
 {
     uint8_t* dst = dstMat.ptr<uint8_t>(row);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 __dst0, __dst1;
     v_uint16 __pack01;
     v_float32 __nrm2 = vx_setall_f32(nrm2);
-    for( k = 0; k <= len - v_float32::nlanes * 2; k += v_float32::nlanes * 2 )
+    for( k = 0; k <= len - VTraits<v_float32>::vlanes() * 2; k += VTraits<v_float32>::vlanes() * 2 )
     {
         __dst0 = vx_load_aligned(rawDst + k);
-        __dst1 = vx_load_aligned(rawDst + k + v_float32::nlanes);
+        __dst1 = vx_load_aligned(rawDst + k + VTraits<v_float32>::vlanes());
 
-        __pack01 = v_pack_u(v_round(__dst0 * __nrm2), v_round(__dst1 * __nrm2));
+        __pack01 = v_pack_u(v_round(v_mul(__dst0, __nrm2)), v_round(v_mul(__dst1, __nrm2)));
         v_pack_store(dst + k, __pack01);
     }
 #endif
diff --git a/modules/features2d/test/ocl/test_feature2d.cpp b/modules/features2d/test/ocl/test_feature2d.cpp
index 7024854ce11d..ad4008bb818a 100644
--- a/modules/features2d/test/ocl/test_feature2d.cpp
+++ b/modules/features2d/test/ocl/test_feature2d.cpp
@@ -5,6 +5,7 @@
 #include "../test_precomp.hpp"
 #include "cvconfig.h"
 #include "opencv2/ts/ocl_test.hpp"
+#include <functional>
 
 #ifdef HAVE_OPENCL
 
@@ -16,7 +17,7 @@ namespace ocl {
     "../stitching/a3.png", \
     "../stitching/s2.jpg")
 
-PARAM_TEST_CASE(Feature2DFixture, Ptr<Feature2D>, std::string)
+PARAM_TEST_CASE(Feature2DFixture, std::function<Ptr<Feature2D>()>, std::string)
 {
     std::string filename;
     Mat image, descriptors;
@@ -27,7 +28,7 @@ PARAM_TEST_CASE(Feature2DFixture, Ptr<Feature2D>, std::string)
 
     virtual void SetUp()
     {
-        feature = GET_PARAM(0);
+        feature = GET_PARAM(0)();
         filename = GET_PARAM(1);
 
         image = readImage(filename);
@@ -61,10 +62,10 @@ OCL_TEST_P(Feature2DFixture, DescriptorsSame)
 }
 
 OCL_INSTANTIATE_TEST_CASE_P(AKAZE, Feature2DFixture,
-    testing::Combine(testing::Values(AKAZE::create()), TEST_IMAGES));
+    testing::Combine(testing::Values([]() { return AKAZE::create(); }), TEST_IMAGES));
 
 OCL_INSTANTIATE_TEST_CASE_P(AKAZE_DESCRIPTOR_KAZE, Feature2DFixture,
-    testing::Combine(testing::Values(AKAZE::create(AKAZE::DESCRIPTOR_KAZE)), TEST_IMAGES));
+    testing::Combine(testing::Values([]() { return AKAZE::create(AKAZE::DESCRIPTOR_KAZE); }), TEST_IMAGES));
 
 }//ocl
 }//cvtest
diff --git a/modules/features2d/test/test_descriptors_invariance.cpp b/modules/features2d/test/test_descriptors_invariance.cpp
index 9faf56a1c8aa..0ac381f667fb 100644
--- a/modules/features2d/test/test_descriptors_invariance.cpp
+++ b/modules/features2d/test/test_descriptors_invariance.cpp
@@ -18,31 +18,31 @@ const static std::string IMAGE_BIKES = "detectors_descriptors_evaluation/images_
  */
 
 INSTANTIATE_TEST_CASE_P(SIFT, DescriptorRotationInvariance,
-                        Value(IMAGE_TSUKUBA, SIFT::create(), SIFT::create(), 0.98f));
+                        Value(IMAGE_TSUKUBA, []() { return SIFT::create(); }, []() { return SIFT::create(); }, 0.98f));
 
 INSTANTIATE_TEST_CASE_P(BRISK, DescriptorRotationInvariance,
-                        Value(IMAGE_TSUKUBA, BRISK::create(), BRISK::create(), 0.99f));
+                        Value(IMAGE_TSUKUBA, []() { return BRISK::create(); }, []() { return BRISK::create(); }, 0.99f));
 
 INSTANTIATE_TEST_CASE_P(ORB, DescriptorRotationInvariance,
-                        Value(IMAGE_TSUKUBA, ORB::create(), ORB::create(), 0.99f));
+                        Value(IMAGE_TSUKUBA, []() { return ORB::create(); }, []() { return ORB::create(); }, 0.99f));
 
 INSTANTIATE_TEST_CASE_P(AKAZE, DescriptorRotationInvariance,
-                        Value(IMAGE_TSUKUBA, AKAZE::create(), AKAZE::create(), 0.99f));
+                        Value(IMAGE_TSUKUBA, []() { return AKAZE::create(); }, []() { return AKAZE::create(); }, 0.99f));
 
 INSTANTIATE_TEST_CASE_P(AKAZE_DESCRIPTOR_KAZE, DescriptorRotationInvariance,
-                        Value(IMAGE_TSUKUBA, AKAZE::create(AKAZE::DESCRIPTOR_KAZE), AKAZE::create(AKAZE::DESCRIPTOR_KAZE), 0.99f));
+                        Value(IMAGE_TSUKUBA, []() { return AKAZE::create(AKAZE::DESCRIPTOR_KAZE); }, []() { return AKAZE::create(AKAZE::DESCRIPTOR_KAZE); }, 0.99f));
 
 /*
  * Descriptor's scale invariance check
  */
 
 INSTANTIATE_TEST_CASE_P(SIFT, DescriptorScaleInvariance,
-                        Value(IMAGE_BIKES, SIFT::create(0, 3, 0.09), SIFT::create(0, 3, 0.09), 0.78f));
+                        Value(IMAGE_BIKES, []() { return SIFT::create(0, 3, 0.09); }, []() { return SIFT::create(0, 3, 0.09); }, 0.78f));
 
 INSTANTIATE_TEST_CASE_P(AKAZE, DescriptorScaleInvariance,
-                        Value(IMAGE_BIKES, AKAZE::create(), AKAZE::create(), 0.6f));
+                        Value(IMAGE_BIKES, []() { return AKAZE::create(); }, []() { return AKAZE::create(); }, 0.6f));
 
 INSTANTIATE_TEST_CASE_P(AKAZE_DESCRIPTOR_KAZE, DescriptorScaleInvariance,
-                        Value(IMAGE_BIKES, AKAZE::create(AKAZE::DESCRIPTOR_KAZE), AKAZE::create(AKAZE::DESCRIPTOR_KAZE), 0.55f));
+                        Value(IMAGE_BIKES, []() { return AKAZE::create(AKAZE::DESCRIPTOR_KAZE); }, []() { return AKAZE::create(AKAZE::DESCRIPTOR_KAZE); }, 0.55f));
 
 }} // namespace
diff --git a/modules/features2d/test/test_descriptors_invariance.impl.hpp b/modules/features2d/test/test_descriptors_invariance.impl.hpp
index 780e86e72480..055500b1f9a2 100644
--- a/modules/features2d/test/test_descriptors_invariance.impl.hpp
+++ b/modules/features2d/test/test_descriptors_invariance.impl.hpp
@@ -3,12 +3,17 @@
 // of this distribution and at http://opencv.org/license.html
 
 #include "test_invariance_utils.hpp"
+#include <functional>
 
 namespace opencv_test { namespace {
 
 #define SHOW_DEBUG_LOG 1
 
-typedef tuple<std::string, Ptr<FeatureDetector>, Ptr<DescriptorExtractor>, float>
+// NOTE: using factory function (function<Ptr<Type>()>) instead of object instance (Ptr<Type>) as a
+// test parameter, because parameters exist during whole test program run and consume a lot of memory
+typedef std::function<cv::Ptr<cv::FeatureDetector>()> DetectorFactory;
+typedef std::function<cv::Ptr<cv::DescriptorExtractor>()> ExtractorFactory;
+typedef tuple<std::string, DetectorFactory, ExtractorFactory, float>
     String_FeatureDetector_DescriptorExtractor_Float_t;
 
 
@@ -61,8 +66,8 @@ class DescriptorInvariance : public TestWithParam<String_FeatureDetector_Descrip
         image0 = imread(filename);
         ASSERT_FALSE(image0.empty()) << "couldn't read input image";
 
-        featureDetector = get<1>(GetParam());
-        descriptorExtractor = get<2>(GetParam());
+        featureDetector = get<1>(GetParam())();
+        descriptorExtractor = get<2>(GetParam())();
         minInliersRatio = get<3>(GetParam());
     }
 
diff --git a/modules/features2d/test/test_descriptors_regression.cpp b/modules/features2d/test/test_descriptors_regression.cpp
index 0258fea0f392..e44edb076919 100644
--- a/modules/features2d/test/test_descriptors_regression.cpp
+++ b/modules/features2d/test/test_descriptors_regression.cpp
@@ -142,7 +142,7 @@ TEST_P(DescriptorImage, no_crash)
 {
     vector<String> fnames;
     glob(cvtest::TS::ptr()->get_data_path() + pattern, fnames, false);
-    sort(fnames.begin(), fnames.end());
+    std::sort(fnames.begin(), fnames.end());
 
     Ptr<AKAZE> akaze_mldb = AKAZE::create(AKAZE::DESCRIPTOR_MLDB);
     Ptr<AKAZE> akaze_mldb_upright = AKAZE::create(AKAZE::DESCRIPTOR_MLDB_UPRIGHT);
diff --git a/modules/features2d/test/test_detectors_invariance.cpp b/modules/features2d/test/test_detectors_invariance.cpp
index 11e7ecfedc08..72fe87f63eb3 100644
--- a/modules/features2d/test/test_detectors_invariance.cpp
+++ b/modules/features2d/test/test_detectors_invariance.cpp
@@ -18,40 +18,40 @@ const static std::string IMAGE_BIKES = "detectors_descriptors_evaluation/images_
  */
 
 INSTANTIATE_TEST_CASE_P(SIFT, DetectorRotationInvariance,
-                        Value(IMAGE_TSUKUBA, SIFT::create(), 0.45f, 0.70f));
+                        Value(IMAGE_TSUKUBA, []() { return SIFT::create(); }, 0.45f, 0.70f));
 
 INSTANTIATE_TEST_CASE_P(BRISK, DetectorRotationInvariance,
-                        Value(IMAGE_TSUKUBA, BRISK::create(), 0.45f, 0.76f));
+                        Value(IMAGE_TSUKUBA, []() { return BRISK::create(); }, 0.45f, 0.76f));
 
 INSTANTIATE_TEST_CASE_P(ORB, DetectorRotationInvariance,
-                        Value(IMAGE_TSUKUBA, ORB::create(), 0.5f, 0.76f));
+                        Value(IMAGE_TSUKUBA, []() { return ORB::create(); }, 0.5f, 0.76f));
 
 INSTANTIATE_TEST_CASE_P(AKAZE, DetectorRotationInvariance,
-                        Value(IMAGE_TSUKUBA, AKAZE::create(), 0.5f, 0.71f));
+                        Value(IMAGE_TSUKUBA, []() { return AKAZE::create(); }, 0.5f, 0.71f));
 
 INSTANTIATE_TEST_CASE_P(AKAZE_DESCRIPTOR_KAZE, DetectorRotationInvariance,
-                        Value(IMAGE_TSUKUBA, AKAZE::create(AKAZE::DESCRIPTOR_KAZE), 0.5f, 0.71f));
+                        Value(IMAGE_TSUKUBA, []() { return AKAZE::create(AKAZE::DESCRIPTOR_KAZE); }, 0.5f, 0.71f));
 
 /*
  * Detector's scale invariance check
  */
 
 INSTANTIATE_TEST_CASE_P(SIFT, DetectorScaleInvariance,
-                        Value(IMAGE_BIKES, SIFT::create(0, 3, 0.09), 0.60f, 0.98f));
+                        Value(IMAGE_BIKES, []() { return SIFT::create(0, 3, 0.09); }, 0.60f, 0.98f));
 
 INSTANTIATE_TEST_CASE_P(BRISK, DetectorScaleInvariance,
-                        Value(IMAGE_BIKES, BRISK::create(), 0.08f, 0.49f));
+                        Value(IMAGE_BIKES, []() { return BRISK::create(); }, 0.08f, 0.49f));
 
 INSTANTIATE_TEST_CASE_P(ORB, DetectorScaleInvariance,
-                        Value(IMAGE_BIKES, ORB::create(), 0.08f, 0.49f));
+                        Value(IMAGE_BIKES, []() { return ORB::create(); }, 0.08f, 0.49f));
 
 INSTANTIATE_TEST_CASE_P(KAZE, DetectorScaleInvariance,
-                        Value(IMAGE_BIKES, KAZE::create(), 0.08f, 0.49f));
+                        Value(IMAGE_BIKES, []() { return KAZE::create(); }, 0.08f, 0.49f));
 
 INSTANTIATE_TEST_CASE_P(AKAZE, DetectorScaleInvariance,
-                        Value(IMAGE_BIKES, AKAZE::create(), 0.08f, 0.49f));
+                        Value(IMAGE_BIKES, []() { return AKAZE::create(); }, 0.08f, 0.49f));
 
 INSTANTIATE_TEST_CASE_P(AKAZE_DESCRIPTOR_KAZE, DetectorScaleInvariance,
-                        Value(IMAGE_BIKES, AKAZE::create(AKAZE::DESCRIPTOR_KAZE), 0.08f, 0.49f));
+                        Value(IMAGE_BIKES, []() { return AKAZE::create(AKAZE::DESCRIPTOR_KAZE); }, 0.08f, 0.49f));
 
 }} // namespace
diff --git a/modules/features2d/test/test_detectors_invariance.impl.hpp b/modules/features2d/test/test_detectors_invariance.impl.hpp
index e50316fed95d..a60233350966 100644
--- a/modules/features2d/test/test_detectors_invariance.impl.hpp
+++ b/modules/features2d/test/test_detectors_invariance.impl.hpp
@@ -3,12 +3,16 @@
 // of this distribution and at http://opencv.org/license.html
 
 #include "test_invariance_utils.hpp"
+#include <functional>
 
 namespace opencv_test { namespace {
 
 #define SHOW_DEBUG_LOG 1
 
-typedef tuple<std::string, Ptr<FeatureDetector>, float, float> String_FeatureDetector_Float_Float_t;
+// NOTE: using factory function (function<Ptr<Type>()>) instead of object instance (Ptr<Type>) as a
+// test parameter, because parameters exist during whole test program run and consume a lot of memory
+typedef std::function<cv::Ptr<cv::FeatureDetector>()> DetectorFactory;
+typedef tuple<std::string, DetectorFactory, float, float> String_FeatureDetector_Float_Float_t;
 
 
 static
@@ -56,7 +60,7 @@ class DetectorInvariance : public TestWithParam<String_FeatureDetector_Float_Flo
         image0 = imread(filename);
         ASSERT_FALSE(image0.empty()) << "couldn't read input image";
 
-        featureDetector = get<1>(GetParam());
+        featureDetector = get<1>(GetParam())();
         minKeyPointMatchesRatio = get<2>(GetParam());
         minInliersRatio = get<3>(GetParam());
     }
diff --git a/modules/flann/include/opencv2/flann/any.h b/modules/flann/include/opencv2/flann/any.h
index 4906fec081e3..2228bd1cfcbf 100644
--- a/modules/flann/include/opencv2/flann/any.h
+++ b/modules/flann/include/opencv2/flann/any.h
@@ -19,16 +19,39 @@
 #include <ostream>
 #include <typeinfo>
 
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/utility.hpp"
+
 namespace cvflann
 {
 
 namespace anyimpl
 {
 
-struct bad_any_cast
+struct bad_any_cast : public std::exception
 {
+    bad_any_cast() = default;
+
+    bad_any_cast(const char* src, const char* dst)
+        : message_(cv::format("cvflann::bad_any_cast(from %s to %s)", src, dst)) {}
+
+
+    const char* what() const noexcept override
+    {
+        return message_.c_str();
+    }
+
+private:
+    std::string message_{"cvflann::bad_any_cast"};
 };
 
+#ifndef CV_THROW_IF_TYPE_MISMATCH
+#define CV_THROW_IF_TYPE_MISMATCH(src_type_info, dst_type_info) \
+    if ((src_type_info) != (dst_type_info)) \
+        throw cvflann::anyimpl::bad_any_cast((src_type_info).name(), \
+                                             (dst_type_info).name())
+#endif
+
 struct empty_any
 {
 };
@@ -271,7 +294,7 @@ struct any
     template<typename T>
     T& cast()
     {
-        if (policy->type() != typeid(T)) throw anyimpl::bad_any_cast();
+        CV_THROW_IF_TYPE_MISMATCH(policy->type(), typeid(T));
         T* r = reinterpret_cast<T*>(policy->get_value(&object));
         return *r;
     }
@@ -280,7 +303,7 @@ struct any
     template<typename T>
     const T& cast() const
     {
-        if (policy->type() != typeid(T)) throw anyimpl::bad_any_cast();
+        CV_THROW_IF_TYPE_MISMATCH(policy->type(), typeid(T));
         const T* r = reinterpret_cast<const T*>(policy->get_value(&object));
         return *r;
     }
diff --git a/modules/flann/include/opencv2/flann/composite_index.h b/modules/flann/include/opencv2/flann/composite_index.h
index f1af41ac2622..37a6223f882d 100644
--- a/modules/flann/include/opencv2/flann/composite_index.h
+++ b/modules/flann/include/opencv2/flann/composite_index.h
@@ -80,7 +80,6 @@ class CompositeIndex : public NNIndex<Distance>
      * @param inputData dataset containing the points to index
      * @param params Index parameters
      * @param d Distance functor
-     * @return
      */
     CompositeIndex(const Matrix<ElementType>& inputData, const IndexParams& params = CompositeIndexParams(),
                    Distance d = Distance()) : index_params_(params)
diff --git a/modules/flann/include/opencv2/flann/dist.h b/modules/flann/include/opencv2/flann/dist.h
index 2d7cbf13de99..3029ebb5ef8e 100644
--- a/modules/flann/include/opencv2/flann/dist.h
+++ b/modules/flann/include/opencv2/flann/dist.h
@@ -1,4 +1,4 @@
-﻿/***********************************************************************
+/***********************************************************************
  * Software License Agreement (BSD License)
  *
  * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
@@ -49,7 +49,7 @@ typedef unsigned __int64 uint64_t;
 # include <Intrin.h>
 #endif
 
-#if defined(__ARM_NEON__) && !defined(__CUDACC__)
+#if defined(__ARM_NEON) && !defined(__CUDACC__)
 # include "arm_neon.h"
 #endif
 
@@ -559,7 +559,7 @@ struct Hamming
     ResultType operator()(const Iterator1 a, const Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
     {
         ResultType result = 0;
-#if defined(__ARM_NEON__) && !defined(__CUDACC__)
+#if defined(__ARM_NEON) && !defined(__CUDACC__)
         {
             const unsigned char* a2 = reinterpret_cast<const unsigned char*> (a);
             const unsigned char* b2 = reinterpret_cast<const unsigned char*> (b);
@@ -611,7 +611,7 @@ struct Hamming
     {
         (void)b;
         ResultType result = 0;
-#if defined(__ARM_NEON__) && !defined(__CUDACC__)
+#if defined(__ARM_NEON) && !defined(__CUDACC__)
         {
             const unsigned char* a2 = reinterpret_cast<const unsigned char*> (a);
             uint32x4_t bits = vmovq_n_u32(0);
diff --git a/modules/flann/include/opencv2/flann/dynamic_bitset.h b/modules/flann/include/opencv2/flann/dynamic_bitset.h
index a00ce1bb7e0e..676cb0b71ebd 100644
--- a/modules/flann/include/opencv2/flann/dynamic_bitset.h
+++ b/modules/flann/include/opencv2/flann/dynamic_bitset.h
@@ -97,7 +97,6 @@ class DynamicBitset
     }
 
     /** @brief set one bit to 0
-     * @param index
      */
     void reset(size_t index)
     {
@@ -108,7 +107,6 @@ class DynamicBitset
      * This function is useful when resetting a given set of bits so that the
      * whole bitset ends up being 0: if that's the case, we don't care about setting
      * other bits to 0
-     * @param index
      */
     void reset_block(size_t index)
     {
@@ -116,7 +114,6 @@ class DynamicBitset
     }
 
     /** resize the bitset so that it contains at least sz bits
-     * @param sz
      */
     void resize(size_t sz)
     {
diff --git a/modules/flann/include/opencv2/flann/general.h b/modules/flann/include/opencv2/flann/general.h
index 29fa8be12114..e65cba2f8af4 100644
--- a/modules/flann/include/opencv2/flann/general.h
+++ b/modules/flann/include/opencv2/flann/general.h
@@ -31,6 +31,8 @@
 #ifndef OPENCV_FLANN_GENERAL_H_
 #define OPENCV_FLANN_GENERAL_H_
 
+#include "opencv2/core/version.hpp"
+
 #if CV_VERSION_MAJOR <= 4
 
 //! @cond IGNORED
diff --git a/modules/flann/include/opencv2/flann/logger.h b/modules/flann/include/opencv2/flann/logger.h
index 8911812a7744..31f9bbd77fa2 100644
--- a/modules/flann/include/opencv2/flann/logger.h
+++ b/modules/flann/include/opencv2/flann/logger.h
@@ -101,7 +101,6 @@ class Logger
      * Print log message
      * @param level Log level
      * @param fmt Message format
-     * @return
      */
     static int log(int level, const char* fmt, ...)
     {
diff --git a/modules/flann/include/opencv2/flann/lsh_table.h b/modules/flann/include/opencv2/flann/lsh_table.h
index a189562d3af4..3f51457cbb09 100644
--- a/modules/flann/include/opencv2/flann/lsh_table.h
+++ b/modules/flann/include/opencv2/flann/lsh_table.h
@@ -214,8 +214,6 @@ class LshTable
     }
 
     /** Get a bucket given the key
-     * @param key
-     * @return
      */
     inline const Bucket* getBucketFromKey(BucketKey key) const
     {
@@ -253,7 +251,6 @@ class LshTable
     }
 
     /** Get statistics about the table
-     * @return
      */
     LshStats getStats() const;
 
@@ -427,7 +424,7 @@ inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) cons
         size_t mask_block = mask_[i / sizeof(size_t)];
         while (mask_block) {
             // Get the lowest set bit in the mask block
-            size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
+            size_t lowest_bit = mask_block & ~(mask_block - 1);
             // Add it to the current subsignature if necessary
             subsignature += (feature_block & lowest_bit) ? bit_index : 0;
             // Reset the bit in the mask block
diff --git a/modules/flann/include/opencv2/flann/matrix.h b/modules/flann/include/opencv2/flann/matrix.h
index fb871bd73ced..bfbf91ef5cd0 100644
--- a/modules/flann/include/opencv2/flann/matrix.h
+++ b/modules/flann/include/opencv2/flann/matrix.h
@@ -35,6 +35,9 @@
 
 #include <stdio.h>
 
+#include "opencv2/core/cvdef.h"
+#include "opencv2/flann/defines.h"
+
 namespace cvflann
 {
 
diff --git a/modules/flann/include/opencv2/flann/params.h b/modules/flann/include/opencv2/flann/params.h
index c9093cde8cf4..1a8e127035a0 100644
--- a/modules/flann/include/opencv2/flann/params.h
+++ b/modules/flann/include/opencv2/flann/params.h
@@ -72,11 +72,16 @@ struct SearchParams : public IndexParams
 
 
 template<typename T>
-T get_param(const IndexParams& params, cv::String name, const T& default_value)
+T get_param(const IndexParams& params, const cv::String& name, const T& default_value)
 {
     IndexParams::const_iterator it = params.find(name);
     if (it != params.end()) {
-        return it->second.cast<T>();
+        try {
+            return it->second.cast<T>();
+        } catch (const std::exception& e) {
+            CV_Error_(cv::Error::StsBadArg,
+                      ("FLANN '%s' param type mismatch: %s", name.c_str(), e.what()));
+        }
     }
     else {
         return default_value;
@@ -84,11 +89,16 @@ T get_param(const IndexParams& params, cv::String name, const T& default_value)
 }
 
 template<typename T>
-T get_param(const IndexParams& params, cv::String name)
+T get_param(const IndexParams& params, const cv::String& name)
 {
     IndexParams::const_iterator it = params.find(name);
     if (it != params.end()) {
-        return it->second.cast<T>();
+        try {
+            return it->second.cast<T>();
+        } catch (const std::exception& e) {
+            CV_Error_(cv::Error::StsBadArg,
+                      ("FLANN '%s' param type mismatch: %s", name.c_str(), e.what()));
+        }
     }
     else {
         FLANN_THROW(cv::Error::StsBadArg, cv::String("Missing parameter '")+name+cv::String("' in the parameters given"));
diff --git a/modules/flann/include/opencv2/flann/random.h b/modules/flann/include/opencv2/flann/random.h
index 2c1809c3a93a..5a12ef3046a6 100644
--- a/modules/flann/include/opencv2/flann/random.h
+++ b/modules/flann/include/opencv2/flann/random.h
@@ -106,7 +106,6 @@ class UniqueRandom
     /**
      * Constructor.
      * @param n Size of the interval from which to generate
-     * @return
      */
     UniqueRandom(int n)
     {
diff --git a/modules/flann/include/opencv2/flann/result_set.h b/modules/flann/include/opencv2/flann/result_set.h
index 47ad23110568..aa679df71c21 100644
--- a/modules/flann/include/opencv2/flann/result_set.h
+++ b/modules/flann/include/opencv2/flann/result_set.h
@@ -40,6 +40,9 @@
 #include <set>
 #include <vector>
 
+#include "opencv2/core/base.hpp"
+#include "opencv2/core/cvdef.h"
+
 namespace cvflann
 {
 
@@ -357,7 +360,6 @@ class UniqueResultSet : public ResultSet<DistanceType>
     }
 
     /** The number of neighbors in the set
-     * @return
      */
     size_t size() const
     {
@@ -366,7 +368,6 @@ class UniqueResultSet : public ResultSet<DistanceType>
 
     /** The distance of the furthest neighbor
      * If we don't have enough neighbors, it returns the max possible value
-     * @return
      */
     inline DistanceType worstDist() const CV_OVERRIDE
     {
@@ -487,7 +488,6 @@ class RadiusUniqueResultSet : public UniqueResultSet<DistanceType>
 
     /** The distance of the furthest neighbor
      * If we don't have enough neighbors, it returns the max possible value
-     * @return
      */
     inline DistanceType worstDist() const CV_OVERRIDE
     {
diff --git a/modules/flann/misc/python/pyopencv_flann.hpp b/modules/flann/misc/python/pyopencv_flann.hpp
index 3d97edbb5930..086ca5f09f80 100644
--- a/modules/flann/misc/python/pyopencv_flann.hpp
+++ b/modules/flann/misc/python/pyopencv_flann.hpp
@@ -17,57 +17,89 @@ PyObject* pyopencv_from(const cvflann_flann_distance_t& value)
 template<>
 bool pyopencv_to(PyObject *o, cv::flann::IndexParams& p, const ArgInfo& info)
 {
-    CV_UNUSED(info);
-    bool ok = true;
-    PyObject* key = NULL;
-    PyObject* item = NULL;
-    Py_ssize_t pos = 0;
-
     if (!o || o == Py_None)
+    {
         return true;
+    }
+
+    if(!PyDict_Check(o))
+    {
+        failmsg("Argument '%s' is not a dictionary", info.name);
+        return false;
+    }
+
+    PyObject* key_obj = NULL;
+    PyObject* value_obj = NULL;
+    Py_ssize_t key_pos = 0;
 
-    if(PyDict_Check(o)) {
-        while(PyDict_Next(o, &pos, &key, &item))
+    while(PyDict_Next(o, &key_pos, &key_obj, &value_obj))
+    {
+        // get key
+        std::string key;
+        if (!getUnicodeString(key_obj, key))
         {
-            // get key
-            std::string k;
-            if (!getUnicodeString(key, k))
+            failmsg("Key at pos %lld is not a string", static_cast<int64_t>(key_pos));
+            return false;
+        }
+        // key_arg_info.name is bound to key lifetime
+        const ArgInfo key_arg_info(key.c_str(), false);
+
+        // get value
+        if (isBool(value_obj))
+        {
+            npy_bool npy_value = NPY_FALSE;
+            if (PyArray_BoolConverter(value_obj, &npy_value) >= 0)
             {
-                ok = false;
-                break;
+                p.setBool(key, npy_value == NPY_TRUE);
+                continue;
             }
-            // get value
-            if( !!PyBool_Check(item) )
+            PyErr_Clear();
+        }
+
+        int int_value = 0;
+        if (pyopencv_to(value_obj, int_value, key_arg_info))
+        {
+            if (key == "algorithm")
             {
-                p.setBool(k, item == Py_True);
+                p.setAlgorithm(int_value);
             }
-            else if( PyInt_Check(item) )
+            else
             {
-                int value = (int)PyInt_AsLong(item);
-                if( strcmp(k.c_str(), "algorithm") == 0 )
-                    p.setAlgorithm(value);
-                else
-                    p.setInt(k, value);
+                p.setInt(key, int_value);
             }
-            else if( PyFloat_Check(item) )
+            continue;
+        }
+        PyErr_Clear();
+
+        double flt_value = 0.0;
+        if (pyopencv_to(value_obj, flt_value, key_arg_info))
+        {
+            if (key == "eps")
             {
-                double value = PyFloat_AsDouble(item);
-                p.setDouble(k, value);
+                p.setFloat(key, static_cast<float>(flt_value));
             }
             else
             {
-                std::string val_str;
-                if (!getUnicodeString(item, val_str))
-                {
-                    ok = false;
-                    break;
-                }
-                p.setString(k, val_str);
+                p.setDouble(key, flt_value);
             }
+            continue;
         }
-    }
+        PyErr_Clear();
 
-    return ok && !PyErr_Occurred();
+        std::string str_value;
+        if (getUnicodeString(value_obj, str_value))
+        {
+            p.setString(key, str_value);
+            continue;
+        }
+        PyErr_Clear();
+        // All conversions are failed
+        failmsg("Failed to parse IndexParam with key '%s'. "
+                "Supported types: [bool, int, float, str]", key.c_str());
+        return false;
+
+    }
+    return true;
 }
 
 template<>
diff --git a/modules/flann/misc/python/test/test_flann_based_matcher.py b/modules/flann/misc/python/test/test_flann_based_matcher.py
new file mode 100644
index 000000000000..cf8c2ededd97
--- /dev/null
+++ b/modules/flann/misc/python/test/test_flann_based_matcher.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# Python 2/3 compatibility
+from __future__ import print_function
+
+import cv2
+import numpy as np
+
+from tests_common import NewOpenCVTests
+
+
+class FlannBasedMatcher(NewOpenCVTests):
+    def test_all_parameters_can_be_passed(self):
+        img1 = self.get_sample("samples/data/right01.jpg")
+        img2 = self.get_sample("samples/data/right02.jpg")
+
+        orb = cv2.ORB.create()
+
+        kp1, des1 = orb.detectAndCompute(img1, None)
+        kp2, des2 = orb.detectAndCompute(img2, None)
+        FLANN_INDEX_KDTREE = 1
+        index_param = dict(algorithm=FLANN_INDEX_KDTREE, trees=4)
+        search_param = dict(checks=32, sorted=True, eps=0.5,
+                            explore_all_trees=False)
+        matcher = cv2.FlannBasedMatcher(index_param, search_param)
+        matches = matcher.knnMatch(np.float32(des1), np.float32(des2), k=2)
+        self.assertGreater(len(matches), 0)
+
+
+if __name__ == '__main__':
+    NewOpenCVTests.bootstrap()
diff --git a/modules/flann/src/flann.cpp b/modules/flann/src/flann.cpp
index 388418f8894e..b7930c548a44 100644
--- a/modules/flann/src/flann.cpp
+++ b/modules/flann/src/flann.cpp
@@ -35,7 +35,7 @@ namespace cvflann
      * \deprecated Provided for backward compatibility
     */
     flann_distance_t flann_distance_type_ = FLANN_DIST_L2;
-    flann_distance_t flann_distance_type() { return flann_distance_type_; }
+    CV_DEPRECATED flann_distance_t flann_distance_type() { return flann_distance_type_; }
 
     /**
      * Set distance type to used
diff --git a/modules/gapi/CMakeLists.txt b/modules/gapi/CMakeLists.txt
index 31322d533a77..f18290ca7dde 100644
--- a/modules/gapi/CMakeLists.txt
+++ b/modules/gapi/CMakeLists.txt
@@ -41,7 +41,7 @@ if(MSVC)
     # and IE deprecated code warning C4996
     ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4503 /wd4996)
   endif()
-  if(MSVC_VERSION LESS 1920)  # MSVS 2015/2017
+  if((MSVC_VERSION LESS 1920) OR ARM OR AARCH64) # MSVS 2015/2017 on x86 and ARM
     ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4702)  # 'unreachable code'
   endif()
 endif()
@@ -79,6 +79,7 @@ set(gapi_srcs
     src/api/gframe.cpp
     src/api/gkernel.cpp
     src/api/gbackend.cpp
+    src/api/gcommon.cpp
     src/api/gproto.cpp
     src/api/gnode.cpp
     src/api/gcall.cpp
@@ -88,6 +89,7 @@ set(gapi_srcs
     src/api/kernels_imgproc.cpp
     src/api/kernels_video.cpp
     src/api/kernels_nnparsers.cpp
+    src/api/kernels_ot.cpp
     src/api/kernels_streaming.cpp
     src/api/kernels_stereo.cpp
     src/api/render.cpp
@@ -120,8 +122,10 @@ set(gapi_srcs
     src/executor/gabstractstreamingexecutor.cpp
     src/executor/gexecutor.cpp
     src/executor/gtbbexecutor.cpp
+    src/executor/gthreadedexecutor.cpp
     src/executor/gstreamingexecutor.cpp
     src/executor/gasync.cpp
+    src/executor/thread_pool.cpp
 
     # CPU Backend (currently built-in)
     src/backends/cpu/gcpubackend.cpp
@@ -130,6 +134,7 @@ set(gapi_srcs
     src/backends/cpu/gcpustereo.cpp
     src/backends/cpu/gcpuvideo.cpp
     src/backends/cpu/gcpucore.cpp
+    src/backends/cpu/gcpuot.cpp
     src/backends/cpu/gnnparsers.cpp
 
     # Fluid Backend (also built-in, FIXME:move away)
@@ -162,6 +167,8 @@ set(gapi_srcs
 
     # ONNX backend
     src/backends/onnx/gonnxbackend.cpp
+    src/backends/onnx/dml_ep.cpp
+    src/backends/onnx/coreml_ep.cpp
 
     # Render backend
     src/backends/render/grenderocv.cpp
@@ -189,6 +196,9 @@ set(gapi_srcs
     src/backends/ov/bindings_ov.cpp
     src/backends/python/gpythonbackend.cpp
 
+    # Queue Streaming source
+    src/streaming/queue_source.cpp
+
     # OpenVPL Streaming source
     src/streaming/onevpl/source.cpp
     src/streaming/onevpl/source_priv.cpp
@@ -236,18 +246,25 @@ set(gapi_srcs
     src/utils/itt.cpp
     )
 
+file(GLOB_RECURSE gapi_3rdparty_srcs
+    "${CMAKE_CURRENT_LIST_DIR}/src/3rdparty/vasot/src/*.cpp"
+)
+
 ocv_add_dispatched_file(backends/fluid/gfluidimgproc_func SSE4_1 AVX2)
 ocv_add_dispatched_file(backends/fluid/gfluidcore_func SSE4_1 AVX2)
 
 ocv_list_add_prefix(gapi_srcs "${CMAKE_CURRENT_LIST_DIR}/")
 
 # For IDE users
-ocv_source_group("Src"     FILES ${gapi_srcs})
+ocv_source_group("Src"     FILES ${gapi_srcs} ${gapi_3rdparty_srcs})
 ocv_source_group("Include" FILES ${gapi_ext_hdrs})
 
-ocv_set_module_sources(HEADERS ${gapi_ext_hdrs} SOURCES ${gapi_srcs})
+ocv_set_module_sources(HEADERS ${gapi_ext_hdrs} SOURCES ${gapi_srcs} ${gapi_3rdparty_srcs})
 ocv_module_include_directories("${CMAKE_CURRENT_LIST_DIR}/src")
 
+# VAS Object Tracking includes
+ocv_module_include_directories(${CMAKE_CURRENT_LIST_DIR}/src/3rdparty/vasot/include)
+
 ocv_create_module()
 
 ocv_target_link_libraries(${the_module} PRIVATE ade)
@@ -363,19 +380,27 @@ if(WIN32)
   ocv_target_link_libraries(${the_module} PRIVATE wsock32 ws2_32)
 endif()
 
+if(HAVE_DIRECTML)
+  ocv_target_compile_definitions(${the_module} PRIVATE HAVE_DIRECTML=1)
+endif()
+
 if(HAVE_ONNX)
   ocv_target_link_libraries(${the_module} PRIVATE ${ONNX_LIBRARY})
   ocv_target_compile_definitions(${the_module} PRIVATE HAVE_ONNX=1)
+  if(HAVE_ONNX_DML)
+    ocv_target_compile_definitions(${the_module} PRIVATE HAVE_ONNX_DML=1)
+  endif()
   if(TARGET opencv_test_gapi)
     ocv_target_compile_definitions(opencv_test_gapi PRIVATE HAVE_ONNX=1)
     ocv_target_link_libraries(opencv_test_gapi PRIVATE ${ONNX_LIBRARY})
   endif()
 endif()
 
+ocv_install_3rdparty_licenses(vasot "${CMAKE_CURRENT_SOURCE_DIR}/src/3rdparty/vasot/LICENSE.txt")
+
 ocv_add_perf_tests()
 ocv_add_samples()
 
-
 # Required for sample with inference on host
 if(TARGET example_gapi_onevpl_infer_with_advanced_device_selection)
   if(TARGET ocv.3rdparty.openvino AND OPENCV_GAPI_WITH_OPENVINO)
diff --git a/modules/gapi/cmake/DownloadADE.cmake b/modules/gapi/cmake/DownloadADE.cmake
index e22c4f1a32b6..871f99b419c1 100644
--- a/modules/gapi/cmake/DownloadADE.cmake
+++ b/modules/gapi/cmake/DownloadADE.cmake
@@ -1,7 +1,7 @@
 set(ade_src_dir "${OpenCV_BINARY_DIR}/3rdparty/ade")
-set(ade_filename "v0.1.2a.zip")
-set(ade_subdir "ade-0.1.2a")
-set(ade_md5 "fa4b3e25167319cb0fa9432ef8281945")
+set(ade_filename "v0.1.2d.zip")
+set(ade_subdir "ade-0.1.2d")
+set(ade_md5 "dbb095a8bf3008e91edbbf45d8d34885")
 ocv_download(FILENAME ${ade_filename}
              HASH ${ade_md5}
              URL
diff --git a/modules/gapi/doc/00-root.markdown b/modules/gapi/doc/00-root.markdown
index b28fbc6619db..cb99495c1b36 100644
--- a/modules/gapi/doc/00-root.markdown
+++ b/modules/gapi/doc/00-root.markdown
@@ -41,6 +41,10 @@ G-API documentation is organized into the following chapters:
 
 - API Reference: functions and classes
 
+    - @subpage gapi_ref
+
+      Core G-API classes, data types, backends, etc.
+
     - @subpage gapi_core
 
       Core G-API operations - arithmetic, boolean, and other matrix
@@ -51,6 +55,14 @@ G-API documentation is organized into the following chapters:
       Image processing functions: color space conversions, various
       filters, etc.
 
+    - @subpage gapi_video
+
+      Video processing functionality.
+
+    - @subpage gapi_draw
+
+      Drawing and composition functionality
+
 # API Example {#gapi_example}
 
 A very basic example of G-API pipeline is shown below:
diff --git a/modules/gapi/doc/30-implementation.markdown b/modules/gapi/doc/30-implementation.markdown
index 7ce4b8012e0c..cdb5df413beb 100644
--- a/modules/gapi/doc/30-implementation.markdown
+++ b/modules/gapi/doc/30-implementation.markdown
@@ -4,7 +4,7 @@
 
 # G-API Implementation details
 
-Note -- this section is still in progress.
+@note this section is still in progress.
 
 # API layer {#gapi_detail_api}
 
diff --git a/modules/gapi/include/opencv2/gapi.hpp b/modules/gapi/include/opencv2/gapi.hpp
index f10dfd471dbf..2087641023c8 100644
--- a/modules/gapi/include/opencv2/gapi.hpp
+++ b/modules/gapi/include/opencv2/gapi.hpp
@@ -10,7 +10,7 @@
 
 #include <memory>
 
-/** \defgroup gapi G-API framework
+/** \defgroup gapi_ref G-API framework
 @{
     @defgroup gapi_main_classes G-API Main Classes
     @defgroup gapi_data_objects G-API Data Types
diff --git a/modules/gapi/include/opencv2/gapi/cpu/ot.hpp b/modules/gapi/include/opencv2/gapi/cpu/ot.hpp
new file mode 100644
index 000000000000..03dbe904cc9d
--- /dev/null
+++ b/modules/gapi/include/opencv2/gapi/cpu/ot.hpp
@@ -0,0 +1,29 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CPU_OT_API_HPP
+#define OPENCV_GAPI_CPU_OT_API_HPP
+
+#include <opencv2/core/cvdef.h>     // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+
+namespace cv {
+namespace gapi {
+/**
+ * @brief This namespace contains G-API Operation Types for
+ * VAS Object Tracking module functionality.
+ */
+namespace ot {
+namespace cpu {
+GAPI_EXPORTS_W GKernelPackage kernels();
+} // namespace cpu
+} // namespace ot
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_CPU_OT_API_HPP
diff --git a/modules/gapi/include/opencv2/gapi/garg.hpp b/modules/gapi/include/opencv2/gapi/garg.hpp
index bfe147f8f065..2a8315f9d83e 100644
--- a/modules/gapi/include/opencv2/gapi/garg.hpp
+++ b/modules/gapi/include/opencv2/gapi/garg.hpp
@@ -241,6 +241,7 @@ namespace gapi
  *
  * @brief G-API functions and classes for serialization and deserialization.
  */
+
 /** @brief Wraps deserialized output GRunArgs to GRunArgsP which can be used by GCompiled.
  *
  * Since it's impossible to get modifiable output arguments from deserialization
@@ -254,6 +255,7 @@ namespace gapi
  * @see deserialize
  */
 GAPI_EXPORTS cv::GRunArgsP bind(cv::GRunArgs &out_args);
+
 /** @brief Wraps output GRunArgsP available during graph execution to GRunArgs which can be serialized.
  *
  * GRunArgsP is pointer-to-value, so to be serialized they need to be binded to real values
diff --git a/modules/gapi/include/opencv2/gapi/garray.hpp b/modules/gapi/include/opencv2/gapi/garray.hpp
index b6aa71551804..a2951993f2b2 100644
--- a/modules/gapi/include/opencv2/gapi/garray.hpp
+++ b/modules/gapi/include/opencv2/gapi/garray.hpp
@@ -102,17 +102,17 @@ namespace detail
         GAPI_Assert(m_hint != nullptr);
         using U = typename std::decay<T>::type;
         return dynamic_cast<TypeHint<U>*>(m_hint.get()) != nullptr;
-    };
+    }
 
     template <typename T>
     void GArrayU::specifyType(){
         m_hint.reset(new TypeHint<typename std::decay<T>::type>);
-    };
+    }
 
     template <typename T>
     void GArrayU::storeKind(){
         setKind(cv::detail::GOpaqueTraits<T>::kind);
-    };
+    }
 
     // This class represents a typed STL vector reference.
     // Depending on origins, this reference may be either "just a" reference to
diff --git a/modules/gapi/include/opencv2/gapi/gcommon.hpp b/modules/gapi/include/opencv2/gapi/gcommon.hpp
index b08baaa36597..c61110e4d5b5 100644
--- a/modules/gapi/include/opencv2/gapi/gcommon.hpp
+++ b/modules/gapi/include/opencv2/gapi/gcommon.hpp
@@ -249,6 +249,8 @@ template<typename T> struct wrap_serialize
 } // namespace s11n
 } // namespace gapi
 
+/** @} gapi_compile_args */
+
 /**
  * @brief Ask G-API to dump compiled graph in Graphviz format under
  * the given file name.
@@ -261,7 +263,20 @@ struct graph_dump_path
 {
     std::string m_dump_path;
 };
-/** @} */
+
+/**
+ * @brief Ask G-API to use threaded executor when cv::GComputation
+ * is compiled via cv::GComputation::compile method.
+ *
+ * Specifies a number of threads that should be used by executor.
+ */
+struct GAPI_EXPORTS use_threaded_executor
+{
+    use_threaded_executor();
+    explicit use_threaded_executor(const uint32_t nthreads);
+
+    uint32_t num_threads;
+};
 
 namespace detail
 {
@@ -269,6 +284,11 @@ namespace detail
     {
         static const char* tag() { return "gapi.graph_dump_path"; }
     };
+
+    template<> struct CompileArgTag<cv::use_threaded_executor>
+    {
+        static const char* tag() { return "gapi.threaded_executor"; }
+    };
 }
 
 } // namespace cv
diff --git a/modules/gapi/include/opencv2/gapi/gcomputation.hpp b/modules/gapi/include/opencv2/gapi/gcomputation.hpp
index 13944c78528f..196eb37c6b4f 100644
--- a/modules/gapi/include/opencv2/gapi/gcomputation.hpp
+++ b/modules/gapi/include/opencv2/gapi/gcomputation.hpp
@@ -50,6 +50,7 @@ namespace s11n {
  *
  * @brief G-API classes for constructed and compiled graphs.
  */
+
 /**
  * @brief GComputation class represents a captured computation
  * graph. GComputation objects form boundaries for expression code
diff --git a/modules/gapi/include/opencv2/gapi/gkernel.hpp b/modules/gapi/include/opencv2/gapi/gkernel.hpp
index 1b910adc826c..6ec6bf573d0a 100644
--- a/modules/gapi/include/opencv2/gapi/gkernel.hpp
+++ b/modules/gapi/include/opencv2/gapi/gkernel.hpp
@@ -430,7 +430,7 @@ namespace gapi {
 
         virtual ~GFunctor() = default;
     protected:
-        GFunctor(const char* id) : m_id(id) { };
+        GFunctor(const char* id) : m_id(id) { }
     private:
         const char* m_id;
     };
@@ -692,7 +692,7 @@ namespace gapi {
         int unused[] = { 0, (pkg.include<KK>(), 0)... };
         cv::util::suppress_unused_warning(unused);
         return pkg;
-    };
+    }
 
     template<typename... FF>
     GKernelPackage kernels(FF&... functors)
@@ -701,7 +701,7 @@ namespace gapi {
         int unused[] = { 0, (pkg.include(functors), 0)... };
         cv::util::suppress_unused_warning(unused);
         return pkg;
-    };
+    }
 
     /** @} */
 
diff --git a/modules/gapi/include/opencv2/gapi/gmat.hpp b/modules/gapi/include/opencv2/gapi/gmat.hpp
index 7bea97bbc52b..6d6f74ff7f4d 100644
--- a/modules/gapi/include/opencv2/gapi/gmat.hpp
+++ b/modules/gapi/include/opencv2/gapi/gmat.hpp
@@ -48,6 +48,7 @@ struct GOrigin;
  *    `cv::GOpaque<T>`   | T
  *    cv::GFrame         | cv::MediaFrame
  */
+
 /**
  * @brief GMat class represents image or tensor data in the
  * graph.
@@ -76,6 +77,18 @@ class GAPI_EXPORTS_W_SIMPLE GMat
      */
     GAPI_WRAP GMat();                       // Empty constructor
 
+    /**
+     * @brief Constructs a value-initialized GMat
+     *
+     * GMat may be associated with a buffer at graph construction time.
+     * It is useful when some operation has a Mat input which doesn't
+     * change during the program execution, and is set only once.
+     * In this case, there's no need to declare such GMat as graph input.
+     *
+     * @param m a cv::Mat buffer to associate with this GMat object.
+     */
+    GAPI_WRAP explicit GMat(cv::Mat m);     // Value-initialization constructor
+
     /// @private
     GMat(const GNode &n, std::size_t out);  // Operation result constructor
     /// @private
diff --git a/modules/gapi/include/opencv2/gapi/gopaque.hpp b/modules/gapi/include/opencv2/gapi/gopaque.hpp
index 1d12f127da1b..a3f98a9867e8 100644
--- a/modules/gapi/include/opencv2/gapi/gopaque.hpp
+++ b/modules/gapi/include/opencv2/gapi/gopaque.hpp
@@ -98,18 +98,18 @@ namespace detail
         GAPI_Assert(m_hint != nullptr);
         using U = util::decay_t<T>;
         return dynamic_cast<TypeHint<U>*>(m_hint.get()) != nullptr;
-    };
+    }
 
     template <typename T>
     void GOpaqueU::specifyType(){
         m_hint.reset(new TypeHint<util::decay_t<T>>);
-    };
+    }
 
     template <typename T>
     void GOpaqueU::storeKind(){
         // FIXME: Add assert here on cv::Mat and cv::Scalar?
         setKind(cv::detail::GOpaqueTraits<T>::kind);
-    };
+    }
 
     // This class represents a typed object reference.
     // Depending on origins, this reference may be either "just a" reference to
diff --git a/modules/gapi/include/opencv2/gapi/gscalar.hpp b/modules/gapi/include/opencv2/gapi/gscalar.hpp
index 43d12c782a5d..de0dfe1383c7 100644
--- a/modules/gapi/include/opencv2/gapi/gscalar.hpp
+++ b/modules/gapi/include/opencv2/gapi/gscalar.hpp
@@ -54,12 +54,11 @@ class GAPI_EXPORTS_W_SIMPLE GScalar
     /**
      * @brief Constructs a value-initialized GScalar
      *
-     * In contrast with GMat (which can be either an explicit graph input
-     * or a result of some operation), GScalars may have their values
-     * be associated at graph construction time. It is useful when
-     * some operation has a GScalar input which doesn't change during
-     * the program execution, and is set only once. In this case,
-     * there is no need to declare such GScalar as a graph input.
+     * GScalars may have their values be associated at graph
+     * construction time. It is useful when some operation has a
+     * GScalar input which doesn't change during the program
+     * execution, and is set only once. In this case, there is no need
+     * to declare such GScalar as a graph input.
      *
      * @note The value of GScalar may be overwritten by assigning some
      * other GScalar to the object using `operator=` -- on the
diff --git a/modules/gapi/include/opencv2/gapi/gstreaming.hpp b/modules/gapi/include/opencv2/gapi/gstreaming.hpp
index 5677768a96fe..d413195b8178 100644
--- a/modules/gapi/include/opencv2/gapi/gstreaming.hpp
+++ b/modules/gapi/include/opencv2/gapi/gstreaming.hpp
@@ -388,7 +388,6 @@ class GAPI_EXPORTS_W_SIMPLE GStreamingCompiled
     /// @private
     std::shared_ptr<Priv> m_priv;
 };
-/** @} */
 
 namespace gapi {
 
@@ -409,11 +408,10 @@ namespace streaming {
 struct GAPI_EXPORTS_W_SIMPLE queue_capacity
 {
     GAPI_WRAP
-    explicit queue_capacity(size_t cap = 1) : capacity(cap) { };
+    explicit queue_capacity(size_t cap = 1) : capacity(cap) { }
     GAPI_PROP_RW
     size_t capacity;
 };
-/** @} */
 } // namespace streaming
 } // namespace gapi
 
@@ -425,6 +423,8 @@ template<> struct CompileArgTag<cv::gapi::streaming::queue_capacity>
 };
 }
 
+/** @} gapi_main_classes */
+
 }
 
 #endif // OPENCV_GAPI_GSTREAMING_COMPILED_HPP
diff --git a/modules/gapi/include/opencv2/gapi/gtransform.hpp b/modules/gapi/include/opencv2/gapi/gtransform.hpp
index 109bc87b7f04..ce88c894d7f3 100644
--- a/modules/gapi/include/opencv2/gapi/gtransform.hpp
+++ b/modules/gapi/include/opencv2/gapi/gtransform.hpp
@@ -91,7 +91,7 @@ class GTransformImpl<K, std::function<R(Args...)>> : public cv::detail::TransHel
     {                                                           \
     struct G_DESCR_HELPER_CLASS(Class)                          \
     {                                                           \
-        static constexpr const char *descr() { return Descr; }; \
+        static constexpr const char *descr() { return Descr; }  \
     };                                                          \
     }
 
diff --git a/modules/gapi/include/opencv2/gapi/gtype_traits.hpp b/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
index b56175788f2a..c42d64a7617c 100644
--- a/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
+++ b/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
@@ -141,8 +141,10 @@ namespace detail
     template<typename U> struct GTypeOf<std::vector<U> >       { using type = cv::GArray<U>; };
     template<typename U> struct GTypeOf                        { using type = cv::GOpaque<U>;};
     template<>           struct GTypeOf<cv::MediaFrame>        { using type = cv::GFrame;    };
-    // FIXME: This is not quite correct since IStreamSource may produce not only Mat but also Scalar
-    // and vector data. TODO: Extend the type dispatching on these types too.
+
+    // FIXME: This is not quite correct since IStreamSource may
+    // produce not only Mat but also MediaFrame, Scalar and vector
+    // data. TODO: Extend the type dispatching on these types too.
     template<>           struct GTypeOf<cv::gapi::wip::IStreamSource::Ptr> { using type = cv::GMat;};
     template<class T> using g_type_of_t = typename GTypeOf<T>::type;
 
@@ -229,10 +231,10 @@ template<typename T> struct GObtainCtor {
     static HostCtor get() { return HostCtor{}; }
 };
 template<typename T> struct GObtainCtor<GArray<T> > {
-    static HostCtor get() { return HostCtor{ConstructVec{&GArray<T>::VCtor}}; };
+    static HostCtor get() { return HostCtor{ConstructVec{&GArray<T>::VCtor}}; }
 };
 template<typename T> struct GObtainCtor<GOpaque<T> > {
-    static HostCtor get() { return HostCtor{ConstructOpaque{&GOpaque<T>::Ctor}}; };
+    static HostCtor get() { return HostCtor{ConstructOpaque{&GOpaque<T>::Ctor}}; }
 };
 } // namespace detail
 } // namespace cv
diff --git a/modules/gapi/include/opencv2/gapi/gtyped.hpp b/modules/gapi/include/opencv2/gapi/gtyped.hpp
index c1c16d17670c..2acc2f7ffbf0 100644
--- a/modules/gapi/include/opencv2/gapi/gtyped.hpp
+++ b/modules/gapi/include/opencv2/gapi/gtyped.hpp
@@ -40,7 +40,7 @@ namespace detail
     //workaround for MSVC 19.0 bug
     template <typename T>
     auto make_default()->decltype(T{}) {return {};}
-}; // detail
+} // detail
 
 /**
  * @brief This class is a typed wrapper over a regular GComputation.
diff --git a/modules/gapi/include/opencv2/gapi/infer/bindings_onnx.hpp b/modules/gapi/include/opencv2/gapi/infer/bindings_onnx.hpp
index af9f3c6f6f7d..fb2376ece881 100644
--- a/modules/gapi/include/opencv2/gapi/infer/bindings_onnx.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/bindings_onnx.hpp
@@ -33,6 +33,24 @@ class GAPI_EXPORTS_W_SIMPLE PyParams {
     GAPI_WRAP
     PyParams& cfgNormalize(const std::string &layer_name, bool flag);
 
+    GAPI_WRAP
+    PyParams& cfgAddExecutionProvider(ep::OpenVINO ep);
+
+    GAPI_WRAP
+    PyParams& cfgAddExecutionProvider(ep::DirectML ep);
+
+    GAPI_WRAP
+    PyParams& cfgAddExecutionProvider(ep::CoreML ep);
+
+    GAPI_WRAP
+    PyParams& cfgAddExecutionProvider(ep::CUDA ep);
+
+    GAPI_WRAP
+    PyParams& cfgAddExecutionProvider(ep::TensorRT ep);
+
+    GAPI_WRAP
+    PyParams& cfgDisableMemPattern();
+
     GBackend backend() const;
     std::string tag() const;
     cv::util::any params() const;
diff --git a/modules/gapi/include/opencv2/gapi/infer/ie.hpp b/modules/gapi/include/opencv2/gapi/infer/ie.hpp
index b403479ca2da..9f9518d0b8e0 100644
--- a/modules/gapi/include/opencv2/gapi/infer/ie.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/ie.hpp
@@ -173,7 +173,7 @@ template<typename Net> class Params {
               , {}
               , {}
               , {} } {
-    };
+    }
 
     /** @overload
     Use this constructor to work with pre-compiled network.
@@ -202,7 +202,7 @@ template<typename Net> class Params {
               , {}
               , {}
               , {} } {
-    };
+    }
 
     /** @brief Specifies sequence of network input layers names for inference.
 
@@ -547,7 +547,7 @@ class Params<cv::gapi::Generic> {
                 detail::ParamDesc::Kind::Load, true, {}, {}, {}, 1u,
                 {}, {}, {}, {}, InferMode::Async, {}, {}, {}, {} },
           m_tag(tag) {
-    };
+    }
 
     /** @overload
 
@@ -565,7 +565,7 @@ class Params<cv::gapi::Generic> {
                 detail::ParamDesc::Kind::Import, true, {}, {}, {}, 1u,
                 {}, {}, {}, {}, InferMode::Async, {}, {}, {}, {} },
           m_tag(tag) {
-    };
+    }
 
     /** @see ie::Params::pluginConfig. */
     Params& pluginConfig(const IEConfig& cfg) {
diff --git a/modules/gapi/include/opencv2/gapi/infer/onnx.hpp b/modules/gapi/include/opencv2/gapi/infer/onnx.hpp
index dc9a51e541c7..f985b41d71bf 100644
--- a/modules/gapi/include/opencv2/gapi/infer/onnx.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/onnx.hpp
@@ -11,6 +11,7 @@
 #include <string>
 #include <array>
 #include <tuple> // tuple, tuple_size
+#include <map>
 
 #include <opencv2/gapi/opencv_includes.hpp>
 #include <opencv2/gapi/util/any.hpp>
@@ -27,6 +28,277 @@ namespace gapi {
  */
 namespace onnx {
 
+/**
+ * @brief This namespace contains Execution Providers structures for G-API ONNX Runtime backend.
+ */
+namespace ep {
+
+/**
+ * @brief This structure provides functions
+ * that fill inference options for ONNX CoreML Execution Provider.
+ * Please follow https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#coreml-execution-provider
+ */
+struct GAPI_EXPORTS_W_SIMPLE CoreML {
+    /** @brief Class constructor.
+
+    Constructs CoreML parameters.
+
+    */
+    GAPI_WRAP
+    CoreML() = default;
+
+    /** @brief Limit CoreML Execution Provider to run on CPU only.
+
+    This function is used to limit CoreML to run on CPU only.
+    Please follow: https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#coreml_flag_use_cpu_only
+
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    CoreML& cfgUseCPUOnly() {
+        use_cpu_only = true;
+        return *this;
+    }
+
+    /** @brief Enable CoreML EP to run on a subgraph in the body of a control flow ONNX operator (i.e. a Loop, Scan or If operator).
+
+    This function is used to enable CoreML EP to run on
+    a subgraph of a control flow of ONNX operation.
+    Please follow: https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#coreml_flag_enable_on_subgraph
+
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    CoreML& cfgEnableOnSubgraph() {
+        enable_on_subgraph = true;
+        return *this;
+    }
+
+    /** @brief Enable CoreML EP to run only on Apple Neural Engine.
+
+    This function is used to enable CoreML EP to run only on Apple Neural Engine.
+    Please follow: https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#coreml_flag_only_enable_device_with_ane
+
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    CoreML& cfgEnableOnlyNeuralEngine() {
+        enable_only_ane = true;
+        return *this;
+    }
+
+    bool use_cpu_only = false;
+    bool enable_on_subgraph = false;
+    bool enable_only_ane = false;
+};
+
+/**
+ * @brief This structure provides functions
+ * that fill inference options for CUDA Execution Provider.
+ * Please follow https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#cuda-execution-provider
+ */
+struct GAPI_EXPORTS_W_SIMPLE CUDA {
+    // NB: Used from python.
+    /// @private -- Exclude this constructor from OpenCV documentation
+    GAPI_WRAP
+    CUDA() = default;
+
+    /** @brief Class constructor.
+
+    Constructs CUDA parameters based on device type information.
+
+    @param dev_id Target device id to use.
+    */
+    GAPI_WRAP
+    explicit CUDA(const int dev_id)
+        : device_id(dev_id) {
+    }
+
+    int device_id;
+};
+
+/**
+ * @brief This structure provides functions
+ * that fill inference options for TensorRT Execution Provider.
+ * Please follow https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html#tensorrt-execution-provider
+ */
+struct GAPI_EXPORTS_W_SIMPLE TensorRT {
+    // NB: Used from python.
+    /// @private -- Exclude this constructor from OpenCV documentation
+    GAPI_WRAP
+    TensorRT() = default;
+
+    /** @brief Class constructor.
+
+    Constructs TensorRT parameters based on device type information.
+
+    @param dev_id Target device id to use.
+    */
+    GAPI_WRAP
+    explicit TensorRT(const int dev_id)
+        : device_id(dev_id) {
+    }
+
+    int device_id;
+};
+
+/**
+ * @brief This structure provides functions
+ * that fill inference options for ONNX OpenVINO Execution Provider.
+ * Please follow https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#summary-of-options
+ */
+struct GAPI_EXPORTS_W_SIMPLE OpenVINO {
+    // NB: Used from python.
+    /// @private -- Exclude this constructor from OpenCV documentation
+    GAPI_WRAP
+    OpenVINO() = default;
+
+    /** @brief Class constructor.
+
+    Constructs OpenVINO parameters based on device type information.
+
+    @param dev_type Target device type to use. ("CPU", "GPU", "GPU.0" etc)
+    */
+    GAPI_WRAP
+    explicit OpenVINO(const std::string &dev_type)
+        : device_type(dev_type) {
+    }
+
+    /** @brief Class constructor.
+
+    Constructs OpenVINO parameters based on map of options passed.
+
+    * @param params A map of parameter names and their corresponding string values.
+    */
+    GAPI_WRAP
+    explicit OpenVINO(const std::map<std::string, std::string>& params)
+        : params_map(params) {
+    }
+
+    /** @brief Specifies OpenVINO Execution Provider cache dir.
+
+    This function is used to explicitly specify the path to save and load
+    the blobs enabling model caching feature.
+
+    @param dir Path to the directory what will be used as cache.
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    OpenVINO& cfgCacheDir(const std::string &dir) {
+        if (!params_map.empty()) {
+            cv::util::throw_error(std::logic_error("ep::OpenVINO cannot be changed if"
+                                                   "created from the parameters map."));
+        }
+        cache_dir = dir;
+        return *this;
+    }
+
+    /** @brief Specifies OpenVINO Execution Provider number of threads.
+
+    This function is used to override the accelerator default value
+    of number of threads with this value at runtime.
+
+    @param nthreads Number of threads.
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    OpenVINO& cfgNumThreads(size_t nthreads) {
+        if (!params_map.empty()) {
+            cv::util::throw_error(std::logic_error("ep::OpenVINO cannot be changed if"
+                                                   "created from the parameters map."));
+        }
+        num_of_threads = nthreads;
+        return *this;
+    }
+
+    /** @brief Enables OpenVINO Execution Provider opencl throttling.
+
+    This function is used to enable OpenCL queue throttling for GPU devices
+    (reduces CPU utilization when using GPU).
+
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    OpenVINO& cfgEnableOpenCLThrottling() {
+        if (!params_map.empty()) {
+            cv::util::throw_error(std::logic_error("ep::OpenVINO cannot be changed if"
+                                                   "created from the parameters map."));
+        }
+        enable_opencl_throttling = true;
+        return *this;
+    }
+
+    /** @brief Enables OpenVINO Execution Provider dynamic shapes.
+
+    This function is used to enable OpenCL queue throttling for GPU devices
+    (reduces CPU utilization when using GPU).
+    This function is used to enable work with dynamic shaped models
+    whose shape will be set dynamically based on the infer input
+    image/data shape at run time in CPU.
+
+    @return reference to this parameter structure.
+    */
+    GAPI_WRAP
+    OpenVINO& cfgEnableDynamicShapes() {
+        if (!params_map.empty()) {
+            cv::util::throw_error(std::logic_error("ep::OpenVINO cannot be changed if"
+                                                   "created from the parameters map."));
+        }
+        enable_dynamic_shapes = true;
+        return *this;
+    }
+
+    std::string device_type;
+    std::string cache_dir;
+    size_t num_of_threads = 0;
+    bool enable_opencl_throttling = false;
+    bool enable_dynamic_shapes = false;
+    std::map<std::string, std::string> params_map;
+};
+
+/**
+ * @brief This structure provides functions
+ * that fill inference options for ONNX DirectML Execution Provider.
+ * Please follow https://onnxruntime.ai/docs/execution-providers/DirectML-ExecutionProvider.html#directml-execution-provider
+ */
+class GAPI_EXPORTS_W_SIMPLE DirectML {
+public:
+    // NB: Used from python.
+    /// @private -- Exclude this constructor from OpenCV documentation
+    GAPI_WRAP
+    DirectML() = default;
+
+    /** @brief Class constructor.
+
+    Constructs DirectML parameters based on device id.
+
+    @param device_id Target device id to use. ("0", "1", etc)
+    */
+    GAPI_WRAP
+    explicit DirectML(const int device_id) : ddesc(device_id) { };
+
+    /** @brief Class constructor.
+
+    Constructs DirectML parameters based on adapter name.
+
+    @param adapter_name Target adapter_name to use.
+    */
+    GAPI_WRAP
+    explicit DirectML(const std::string &adapter_name) : ddesc(adapter_name) { };
+
+    using DeviceDesc = cv::util::variant<int, std::string>;
+    DeviceDesc ddesc;
+};
+
+using EP = cv::util::variant< cv::util::monostate
+                            , OpenVINO
+                            , DirectML
+                            , CoreML
+                            , CUDA
+                            , TensorRT>;
+
+} // namespace ep
+
 GAPI_EXPORTS cv::gapi::GBackend backend();
 
 enum class TraitAs: int {
@@ -78,6 +350,9 @@ struct ParamDesc {
     // when the generic infer parameters are unpacked (see GONNXBackendImpl::unpackKernel)
     std::unordered_map<std::string, std::pair<cv::Scalar, cv::Scalar> > generic_mstd;
     std::unordered_map<std::string, bool> generic_norm;
+
+    std::vector<cv::gapi::onnx::ep::EP> execution_providers;
+    bool disable_mem_pattern;
 };
 } // namespace detail
 
@@ -115,7 +390,8 @@ template<typename Net> class Params {
         desc.num_in  = std::tuple_size<typename Net::InArgs>::value;
         desc.num_out = std::tuple_size<typename Net::OutArgs>::value;
         desc.is_generic = false;
-    };
+        desc.disable_mem_pattern = false;
+    }
 
     /** @brief Specifies sequence of network input layers names for inference.
 
@@ -279,6 +555,85 @@ template<typename Net> class Params {
         return *this;
     }
 
+    /** @brief Adds execution provider for runtime.
+
+    The function is used to add ONNX Runtime OpenVINO Execution Provider options.
+
+    @param ep OpenVINO Execution Provider options.
+    @see cv::gapi::onnx::ep::OpenVINO.
+
+    @return the reference on modified object.
+    */
+    Params<Net>& cfgAddExecutionProvider(ep::OpenVINO&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+        return *this;
+    }
+
+    /** @brief Adds execution provider for runtime.
+
+    The function is used to add ONNX Runtime DirectML Execution Provider options.
+
+    @param ep DirectML Execution Provider options.
+    @see cv::gapi::onnx::ep::DirectML.
+
+    @return the reference on modified object.
+    */
+    Params<Net>& cfgAddExecutionProvider(ep::DirectML&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+        return *this;
+    }
+
+    /** @brief Adds execution provider for runtime.
+
+    The function is used to add ONNX Runtime CoreML Execution Provider options.
+
+    @param ep CoreML Execution Provider options.
+    @see cv::gapi::onnx::ep::CoreML.
+
+    @return the reference on modified object.
+    */
+    Params<Net>& cfgAddExecutionProvider(ep::CoreML&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+        return *this;
+    }
+
+    /** @brief Adds execution provider for runtime.
+
+    The function is used to add ONNX Runtime CUDA Execution Provider options.
+
+    @param ep CUDA Execution Provider options.
+    @see cv::gapi::onnx::ep::CUDA.
+
+    @return the reference on modified object.
+    */
+    Params<Net>& cfgAddExecutionProvider(ep::CUDA&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+        return *this;
+    }
+
+    /** @brief Adds execution provider for runtime.
+
+    The function is used to add ONNX Runtime TensorRT Execution Provider options.
+
+    @param ep TensorRT Execution Provider options.
+    @see cv::gapi::onnx::ep::TensorRT.
+
+    @return the reference on modified object.
+    */
+    Params<Net>& cfgAddExecutionProvider(ep::TensorRT&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+        return *this;
+    }
+
+    /** @brief Disables the memory pattern optimization.
+
+    @return the reference on modified object.
+    */
+    Params<Net>& cfgDisableMemPattern() {
+        desc.disable_mem_pattern = true;
+        return *this;
+    }
+
     // BEGIN(G-API's network parametrization API)
     GBackend      backend() const { return cv::gapi::onnx::backend(); }
     std::string   tag()     const { return Net::tag(); }
@@ -306,18 +661,50 @@ class Params<cv::gapi::Generic> {
     @param model_path path to model file (.onnx file).
     */
     Params(const std::string& tag, const std::string& model_path)
-        : desc{model_path, 0u, 0u, {}, {}, {}, {}, {}, {}, {}, {}, {}, true, {}, {} }, m_tag(tag) {}
+        : desc{model_path, 0u, 0u, {}, {}, {}, {}, {}, {}, {}, {}, {}, true, {}, {}, {}, false }, m_tag(tag) {}
 
+    /** @see onnx::Params::cfgMeanStdDev. */
     void cfgMeanStdDev(const std::string &layer,
                        const cv::Scalar &m,
                        const cv::Scalar &s) {
         desc.generic_mstd[layer] = std::make_pair(m, s);
     }
 
+    /** @see onnx::Params::cfgNormalize. */
     void cfgNormalize(const std::string &layer, bool flag) {
         desc.generic_norm[layer] = flag;
     }
 
+    /** @see onnx::Params::cfgAddExecutionProvider. */
+    void cfgAddExecutionProvider(ep::OpenVINO&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+    }
+
+    /** @see onnx::Params::cfgAddExecutionProvider. */
+    void cfgAddExecutionProvider(ep::DirectML&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+    }
+
+    /** @see onnx::Params::cfgAddExecutionProvider. */
+    void cfgAddExecutionProvider(ep::CoreML&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+    }
+
+    /** @see onnx::Params::cfgAddExecutionProvider. */
+    void cfgAddExecutionProvider(ep::CUDA&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+    }
+
+    /** @see onnx::Params::cfgAddExecutionProvider. */
+    void cfgAddExecutionProvider(ep::TensorRT&& ep) {
+        desc.execution_providers.emplace_back(std::move(ep));
+    }
+
+    /** @see onnx::Params::cfgDisableMemPattern. */
+    void cfgDisableMemPattern() {
+        desc.disable_mem_pattern = true;
+    }
+
     // BEGIN(G-API's network parametrization API)
     GBackend      backend() const { return cv::gapi::onnx::backend(); }
     std::string   tag()     const { return m_tag; }
diff --git a/modules/gapi/include/opencv2/gapi/infer/ov.hpp b/modules/gapi/include/opencv2/gapi/infer/ov.hpp
index 99d701f9372a..782792489bac 100644
--- a/modules/gapi/include/opencv2/gapi/infer/ov.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/ov.hpp
@@ -679,7 +679,31 @@ class Params<cv::gapi::Generic> {
 };
 
 } // namespace ov
+
+namespace wip { namespace ov {
+/**
+ * @brief Ask G-API OpenVINO backend to run only inference of model provided.
+ *
+ * G-API OpenVINO backend will perform only the inference of the model provided
+ * without populating input and copying back output data.
+ * This mode is used to evaluate the pure inference performance of the model without
+ * taking into account the i/o data transfer.
+ */
+struct benchmark_mode { };
+
+} // namespace ov
+} // namespace wip
+
 } // namespace gapi
+
+namespace detail
+{
+    template<> struct CompileArgTag<cv::gapi::wip::ov::benchmark_mode>
+    {
+        static const char* tag() { return "gapi.wip.ov.benchmark_mode"; }
+    };
+}
+
 } // namespace cv
 
 #endif // OPENCV_GAPI_INFER_OV_HPP
diff --git a/modules/gapi/include/opencv2/gapi/media.hpp b/modules/gapi/include/opencv2/gapi/media.hpp
index 5da8eeab48a2..1470f00d042e 100644
--- a/modules/gapi/include/opencv2/gapi/media.hpp
+++ b/modules/gapi/include/opencv2/gapi/media.hpp
@@ -33,6 +33,7 @@ namespace cv {
  * @brief Extra G-API data structures used to pass input/output data
  * to the graph for processing.
  */
+
 /**
  * @brief cv::MediaFrame class represents an image/media frame
  * obtained from an external source.
diff --git a/modules/gapi/include/opencv2/gapi/ot.hpp b/modules/gapi/include/opencv2/gapi/ot.hpp
new file mode 100644
index 000000000000..b73d7e6ee003
--- /dev/null
+++ b/modules/gapi/include/opencv2/gapi/ot.hpp
@@ -0,0 +1,194 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#ifndef OPENCV_GAPI_OT_HPP
+#define OPENCV_GAPI_OT_HPP
+
+#include <opencv2/gapi.hpp>
+#include <opencv2/gapi/s11n.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+
+namespace cv {
+namespace gapi {
+/**
+ * @brief This namespace contains G-API Operation Types for
+ * VAS Object Tracking module functionality.
+ */
+namespace ot {
+
+/**
+ * @enum TrackingStatus
+ *
+ * Tracking status twin for vas::ot::TrackingStatus
+ */
+enum TrackingStatus
+{
+    NEW = 0,     /**< The object is newly added. */
+    TRACKED,     /**< The object is being tracked. */
+    LOST         /**< The object gets lost now. The object can be tracked again
+                      by specifying detected object manually. */
+};
+
+struct GAPI_EXPORTS_W_SIMPLE ObjectTrackerParams
+{
+    /**
+     * Maximum number of trackable objects in a frame.
+     * Valid range: 1 <= max_num_objects. Or it can be -1 if there is no limitation
+     * of maximum number in X86. KMB/TBH has limitation up to 1024.
+     * Default value is -1 which means there is no limitation in X86. KMB/TBH is -1 means 200.
+     */
+    GAPI_PROP_RW int32_t max_num_objects = -1;
+
+    /**
+     * Input color format. Supports 0(BGR), 1(NV12), 2(BGRX) and 4(I420)
+     */
+    GAPI_PROP_RW int32_t input_image_format = 0;
+
+    /**
+     * Specifies whether tracker to use detection class for keeping id of an object.
+     * If it is true, new detection will be associated from previous tracking only when
+     * those two have same class.
+     * class id of an object is fixed across video frames.
+     * If it is false, new detection can be associated across different-class objects.
+     * In this case, the class id of an object may change across video frames depending on the tracker input.
+     * It is recommended to turn this option off when it is likely that detector confuses the class of object.
+     * For example, when detector confuses bicycle and motorbike. Turning this option off will increase
+     * the tracking reliability as tracker will ignore the class label of detector.
+     * @n
+     * Default value is true.
+     */
+    GAPI_PROP_RW bool tracking_per_class = true;
+
+    bool operator==(const ObjectTrackerParams& other) const
+    {
+        return max_num_objects == other.max_num_objects
+            && input_image_format == other.input_image_format
+            && tracking_per_class == other.tracking_per_class;
+    }
+};
+
+using GTrackedInfo = std::tuple<cv::GArray<cv::Rect>, cv::GArray<int32_t>, cv::GArray<uint64_t>, cv::GArray<int>>;
+
+G_API_OP(GTrackFromMat, <GTrackedInfo(cv::GMat, cv::GArray<cv::Rect>, cv::GArray<int32_t>, float)>, "com.intel.track_from_mat")
+{
+    static std::tuple<cv::GArrayDesc, cv::GArrayDesc,
+                      cv::GArrayDesc, cv::GArrayDesc> outMeta(cv::GMatDesc, cv::GArrayDesc, cv::GArrayDesc, float)
+    {
+        return std::make_tuple(cv::empty_array_desc(), cv::empty_array_desc(),
+                               cv::empty_array_desc(), cv::empty_array_desc());
+    }
+};
+
+G_API_OP(GTrackFromFrame, <GTrackedInfo(cv::GFrame, cv::GArray<cv::Rect>, cv::GArray<int32_t>, float)>, "com.intel.track_from_frame")
+{
+    static std::tuple<cv::GArrayDesc, cv::GArrayDesc,
+                      cv::GArrayDesc, cv::GArrayDesc> outMeta(cv::GFrameDesc, cv::GArrayDesc, cv::GArrayDesc, float)
+    {
+       return std::make_tuple(cv::empty_array_desc(), cv::empty_array_desc(),
+                              cv::empty_array_desc(), cv::empty_array_desc());
+    }
+};
+
+/**
+ * @brief   Tracks objects with video frames.
+ *          If a detected object is overlapped enough with one of tracked object, the tracked object's
+ *          informationis updated with the input detected object.
+ *          On the other hand, if a detected object is overlapped with none of tracked objects,
+ *          the detected object is newly added and ObjectTracker starts to track the object.
+ *          In zero term tracking type, ObjectTracker clears tracked objects in case that empty
+ *          list of detected objects is passed in.
+ *
+ * @param mat                       Input frame.
+ * @param detected_rects            Detected objects rectangles in the input frame.
+ * @param detected_class_labels     Detected objects class labels in the input frame.
+ * @param delta                     Frame_delta_t Delta time between two consecutive tracking in seconds.
+ *                                  The valid range is [0.005 ~ 0.5].
+ * @return                          Tracking results of target objects.
+ *                                  cv::GArray<cv::Rect>  Array of rectangles for tracked objects.
+ *                                  cv::GArray<int32_t>   Array of detected objects labels.
+ *                                  cv::GArray<uint64_t>  Array of tracking IDs for objects.
+ *                                                        Numbering sequence starts from 1.
+ *                                                        The value 0 means the tracking ID of this object has
+ *                                                        not been assigned.
+ *                                  cv::GArray<int>       Array of tracking statuses for objects.
+ */
+GAPI_EXPORTS_W std::tuple<cv::GArray<cv::Rect>,
+                          cv::GArray<int>,
+                          cv::GArray<uint64_t>,
+                          cv::GArray<int>>
+    track(const cv::GMat& mat,
+          const cv::GArray<cv::Rect>& detected_rects,
+          const cv::GArray<int>& detected_class_labels,
+          float delta);
+
+
+/**
+   @overload
+ * @brief   Tracks objects with video frames. Overload of track(...) for frame as GFrame.
+ *
+ * @param frame                     Input frame.
+ * @param detected_rects            Detected objects rectangles in the input frame.
+ * @param detected_class_labels     Detected objects class labels in the input frame.
+ * @param delta                     Frame_delta_t Delta time between two consecutive tracking in seconds.
+ *                                  The valid range is [0.005 ~ 0.5].
+ * @return                          Tracking results of target objects.
+ * @return                          Tracking results of target objects.
+ *                                  cv::GArray<cv::Rect>          Array of rectangles for tracked objects.
+ *                                  cv::GArray<int32_t>           Array of detected objects labels.
+ *                                  cv::GArray<uint64_t>          Array of tracking IDs for objects.
+ *                                                                Numbering sequence starts from 1.
+ *                                                                The value 0 means the tracking ID of this object has
+ *                                                                not been assigned.
+ *                                  cv::GArray<int>    Array of tracking statuses for objects.
+ */
+GAPI_EXPORTS_W std::tuple<cv::GArray<cv::Rect>,
+                         cv::GArray<int>,
+                         cv::GArray<uint64_t>,
+                         cv::GArray<int>>
+    track(const cv::GFrame& frame,
+          const cv::GArray<cv::Rect>& detected_rects,
+          const cv::GArray<int>& detected_class_labels,
+          float delta);
+} // namespace ot
+} // namespace gapi
+} // namespace cv
+
+// FIXME: move to a separate file?
+namespace cv
+{
+namespace detail
+{
+template<> struct CompileArgTag<cv::gapi::ot::ObjectTrackerParams>
+{
+    static const char* tag()
+    {
+        return "cv.gapi.ot.object_tracker_params";
+    }
+};
+} // namespace detail
+
+namespace gapi
+{
+namespace s11n
+{
+namespace detail
+{
+template<> struct S11N<cv::gapi::ot::ObjectTrackerParams> {
+    static void serialize(IOStream &os, const cv::gapi::ot::ObjectTrackerParams &p) {
+        os << p. max_num_objects << p.input_image_format << p.tracking_per_class;
+    }
+    static cv::gapi::ot::ObjectTrackerParams deserialize(IIStream &is) {
+        cv::gapi::ot::ObjectTrackerParams p;
+        is >> p. max_num_objects >> p.input_image_format >> p.tracking_per_class;
+        return p;
+    }
+};
+} // namespace detail
+} // namespace s11n
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_OT_HPP
diff --git a/modules/gapi/include/opencv2/gapi/own/convert.hpp b/modules/gapi/include/opencv2/gapi/own/convert.hpp
index 1a8ecd8bc677..f587e2478702 100644
--- a/modules/gapi/include/opencv2/gapi/own/convert.hpp
+++ b/modules/gapi/include/opencv2/gapi/own/convert.hpp
@@ -31,7 +31,7 @@ namespace cv
         return (m.dims == 2)
             ?  cv::gapi::own::Mat{m.rows, m.cols, m.type(), m.data, m.step}
             :  cv::gapi::own::Mat{to_own<int>(m.size), m.type(), m.data};
-    };
+    }
 
 namespace gapi
 {
diff --git a/modules/gapi/include/opencv2/gapi/own/scalar.hpp b/modules/gapi/include/opencv2/gapi/own/scalar.hpp
index bda91c83b58a..3b107befccaa 100644
--- a/modules/gapi/include/opencv2/gapi/own/scalar.hpp
+++ b/modules/gapi/include/opencv2/gapi/own/scalar.hpp
@@ -21,7 +21,7 @@ class GAPI_EXPORTS Scalar
 {
 public:
     Scalar() = default;
-    explicit Scalar(double v0) { val[0] = v0; };
+    explicit Scalar(double v0) { val[0] = v0; }
     Scalar(double v0, double v1, double v2 = 0, double v3 = 0)
         : val{v0, v1, v2, v3}
     {
diff --git a/modules/gapi/include/opencv2/gapi/s11n.hpp b/modules/gapi/include/opencv2/gapi/s11n.hpp
index 0bf368a8562d..a94f55c249af 100644
--- a/modules/gapi/include/opencv2/gapi/s11n.hpp
+++ b/modules/gapi/include/opencv2/gapi/s11n.hpp
@@ -337,7 +337,7 @@ namespace detail {
 template<typename V>
 IOStream& put_v(IOStream&, const V&, std::size_t) {
     GAPI_Error("variant>>: requested index is invalid");
-};
+}
 
 template<typename V, typename X, typename... Xs>
 IOStream& put_v(IOStream& os, const V& v, std::size_t x) {
diff --git a/modules/gapi/include/opencv2/gapi/streaming/queue_source.hpp b/modules/gapi/include/opencv2/gapi/streaming/queue_source.hpp
new file mode 100644
index 000000000000..bd385ed16e73
--- /dev/null
+++ b/modules/gapi/include/opencv2/gapi/streaming/queue_source.hpp
@@ -0,0 +1,67 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_QUEUE_SOURCE_HPP
+#define OPENCV_GAPI_STREAMING_QUEUE_SOURCE_HPP
+
+#include <memory>                      // shared_ptr
+#include <type_traits>                 // is_base_of
+
+#include <opencv2/gapi/garg.hpp>       // GRunArgs
+#include <opencv2/gapi/gmetaarg.hpp>   // GMetaArg + all descr_of
+#include <opencv2/gapi/streaming/source.hpp> // IStreamSource
+
+namespace cv {
+namespace gapi {
+namespace wip {
+struct Data; // fwd-declare to avoid circular? header dependencies
+
+class GAPI_EXPORTS QueueSourceBase: public cv::gapi::wip::IStreamSource {
+    class Priv;
+    std::shared_ptr<Priv> m_priv;
+    // FIXME: Need to understand how it works with IStreamSource's shared_from_this
+    // Can we avoid having too many shared_ptrs here?
+
+public:
+    explicit QueueSourceBase(const cv::GMetaArg &m);
+    void push(Data &&data);
+    virtual bool pull(Data &data) override;
+    virtual void halt() override;
+    virtual GMetaArg descr_of() const override;
+    virtual ~QueueSourceBase() = default;
+};
+
+/**
+ * @brief Queued streaming pipeline source.
+ *
+ */
+template<class T>
+class QueueSource final: public QueueSourceBase
+{
+public:
+    using Meta = decltype(cv::descr_of(T{}));
+    explicit QueueSource(Meta m) : QueueSourceBase(GMetaArg{m}) {
+    }
+    void push(T t) {
+        QueueSourceBase::push(Data{t});
+    }
+};
+
+class GAPI_EXPORTS QueueInput {
+    std::vector<std::shared_ptr<QueueSourceBase> > m_sources;
+
+public:
+    explicit QueueInput(const cv::GMetaArgs &args);
+
+    void push(cv::GRunArgs &&ins);
+    operator cv::GRunArgs();
+};
+
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_STREAMING_SOURCE_HPP
diff --git a/modules/gapi/include/opencv2/gapi/streaming/source.hpp b/modules/gapi/include/opencv2/gapi/streaming/source.hpp
index 6597cad8f808..267469ad1b30 100644
--- a/modules/gapi/include/opencv2/gapi/streaming/source.hpp
+++ b/modules/gapi/include/opencv2/gapi/streaming/source.hpp
@@ -16,7 +16,7 @@
 namespace cv {
 namespace gapi {
 namespace wip {
-    struct Data; // "forward-declaration" of GRunArg
+struct Data; // forward-declaration of Data to avoid circular dependencies
 
 /**
  * @brief Abstract streaming pipeline source.
@@ -43,6 +43,11 @@ class IStreamSource: public std::enable_shared_from_this<IStreamSource>
     Ptr ptr() { return shared_from_this(); }
     virtual bool pull(Data &data) = 0;
     virtual GMetaArg descr_of() const = 0;
+    virtual void halt() {
+        // Do nothing by default to maintain compatibility with the existing sources...
+        // In fact needs to be decorated atop of the child classes to maintain the behavior
+        // FIXME: Make it mandatory in OpenCV 5.0
+    };
     virtual ~IStreamSource() = default;
 };
 
diff --git a/modules/gapi/include/opencv2/gapi/util/variant.hpp b/modules/gapi/include/opencv2/gapi/util/variant.hpp
index f412110deb76..48b55646c53d 100644
--- a/modules/gapi/include/opencv2/gapi/util/variant.hpp
+++ b/modules/gapi/include/opencv2/gapi/util/variant.hpp
@@ -509,6 +509,11 @@ namespace util
         return v.index() == util::variant<Types...>::template index_of<T>();
     }
 
+#if defined(__GNUC__) && (__GNUC__ == 11 || __GNUC__ == 12)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
     template<typename... Us> bool operator==(const variant<Us...> &lhs,
                                              const variant<Us...> &rhs)
     {
@@ -524,6 +529,10 @@ namespace util
         return (eqs[lhs.index()])(lhs.memory, rhs.memory);
     }
 
+#if defined(__GNUC__) && (__GNUC__ == 11 || __GNUC__ == 12)
+#pragma GCC diagnostic pop
+#endif
+
     template<typename... Us> bool operator!=(const variant<Us...> &lhs,
                                              const variant<Us...> &rhs)
     {
diff --git a/modules/gapi/misc/python/package/gapi/__init__.py b/modules/gapi/misc/python/package/gapi/__init__.py
index 992fa2aee253..2b21e54e4195 100644
--- a/modules/gapi/misc/python/package/gapi/__init__.py
+++ b/modules/gapi/misc/python/package/gapi/__init__.py
@@ -56,6 +56,14 @@ class Int():
         def __new__(self):
             return cv.GOpaqueT(cv.gapi.CV_INT)
 
+    class Int64():
+        def __new__(self):
+            return cv.GOpaqueT(cv.gapi.CV_INT64)
+
+    class UInt64():
+        def __new__(self):
+            return cv.GOpaqueT(cv.gapi.CV_UINT64)
+
     class Double():
         def __new__(self):
             return cv.GOpaqueT(cv.gapi.CV_DOUBLE)
@@ -111,6 +119,14 @@ class Int():
         def __new__(self):
             return cv.GArrayT(cv.gapi.CV_INT)
 
+    class Int64():
+        def __new__(self):
+            return cv.GArrayT(cv.gapi.CV_INT64)
+
+    class UInt64():
+        def __new__(self):
+            return cv.GArrayT(cv.gapi.CV_UINT64)
+
     class Double():
         def __new__(self):
             return cv.GArrayT(cv.gapi.CV_DOUBLE)
@@ -170,6 +186,8 @@ def op(op_id, in_types, out_types):
     garray_types= {
             cv.GArray.Bool:    cv.gapi.CV_BOOL,
             cv.GArray.Int:     cv.gapi.CV_INT,
+            cv.GArray.Int64:   cv.gapi.CV_INT64,
+            cv.GArray.UInt64:  cv.gapi.CV_UINT64,
             cv.GArray.Double:  cv.gapi.CV_DOUBLE,
             cv.GArray.Float:   cv.gapi.CV_FLOAT,
             cv.GArray.String:  cv.gapi.CV_STRING,
@@ -190,6 +208,8 @@ def op(op_id, in_types, out_types):
             cv.GOpaque.Rect:    cv.gapi.CV_RECT,
             cv.GOpaque.Bool:    cv.gapi.CV_BOOL,
             cv.GOpaque.Int:     cv.gapi.CV_INT,
+            cv.GOpaque.Int64:   cv.gapi.CV_INT64,
+            cv.GOpaque.UInt64:  cv.gapi.CV_UINT64,
             cv.GOpaque.Double:  cv.gapi.CV_DOUBLE,
             cv.GOpaque.Float:   cv.gapi.CV_FLOAT,
             cv.GOpaque.String:  cv.gapi.CV_STRING,
@@ -205,6 +225,8 @@ def op(op_id, in_types, out_types):
     type2str = {
         cv.gapi.CV_BOOL:      'cv.gapi.CV_BOOL' ,
         cv.gapi.CV_INT:       'cv.gapi.CV_INT' ,
+        cv.gapi.CV_INT64:     'cv.gapi.CV_INT64' ,
+        cv.gapi.CV_UINT64:    'cv.gapi.CV_UINT64' ,
         cv.gapi.CV_DOUBLE:    'cv.gapi.CV_DOUBLE' ,
         cv.gapi.CV_FLOAT:     'cv.gapi.CV_FLOAT' ,
         cv.gapi.CV_STRING:    'cv.gapi.CV_STRING' ,
diff --git a/modules/gapi/misc/python/pyopencv_gapi.hpp b/modules/gapi/misc/python/pyopencv_gapi.hpp
index 70698ffd48f8..66c3910756b0 100644
--- a/modules/gapi/misc/python/pyopencv_gapi.hpp
+++ b/modules/gapi/misc/python/pyopencv_gapi.hpp
@@ -29,6 +29,11 @@ using map_string_and_string         = std::map<std::string, std::string>;
 using map_string_and_vector_size_t  = std::map<std::string, std::vector<size_t>>;
 using map_string_and_vector_float   = std::map<std::string, std::vector<float>>;
 using map_int_and_double            = std::map<int, double>;
+using ep_OpenVINO                   = cv::gapi::onnx::ep::OpenVINO;
+using ep_DirectML                   = cv::gapi::onnx::ep::DirectML;
+using ep_CoreML                     = cv::gapi::onnx::ep::CoreML;
+using ep_CUDA                       = cv::gapi::onnx::ep::CUDA;
+using ep_TensorRT                   = cv::gapi::onnx::ep::TensorRT;
 
 // NB: Python wrapper generate T_U for T<U>
 // This behavior is only observed for inputs
@@ -242,6 +247,7 @@ PyObject* pyopencv_from(const cv::GArg& value)
         HANDLE_CASE(BOOL,      bool);
         HANDLE_CASE(INT,       int);
         HANDLE_CASE(INT64,     int64_t);
+        HANDLE_CASE(UINT64,    uint64_t);
         HANDLE_CASE(DOUBLE,    double);
         HANDLE_CASE(FLOAT,     float);
         HANDLE_CASE(STRING,    std::string);
@@ -254,7 +260,6 @@ PyObject* pyopencv_from(const cv::GArg& value)
         HANDLE_CASE(MAT,       cv::Mat);
         HANDLE_CASE(UNKNOWN,   cv::detail::PyObjectHolder);
         HANDLE_CASE(DRAW_PRIM, cv::gapi::wip::draw::Prim);
-        UNSUPPORTED(UINT64);
 #undef HANDLE_CASE
 #undef UNSUPPORTED
     }
@@ -300,6 +305,7 @@ PyObject* pyopencv_from(const cv::detail::OpaqueRef& o)
         case cv::detail::OpaqueKind::CV_BOOL      : return pyopencv_from(o.rref<bool>());
         case cv::detail::OpaqueKind::CV_INT       : return pyopencv_from(o.rref<int>());
         case cv::detail::OpaqueKind::CV_INT64     : return pyopencv_from(o.rref<int64_t>());
+        case cv::detail::OpaqueKind::CV_UINT64    : return pyopencv_from(o.rref<uint64_t>());
         case cv::detail::OpaqueKind::CV_DOUBLE    : return pyopencv_from(o.rref<double>());
         case cv::detail::OpaqueKind::CV_FLOAT     : return pyopencv_from(o.rref<float>());
         case cv::detail::OpaqueKind::CV_STRING    : return pyopencv_from(o.rref<std::string>());
@@ -310,14 +316,13 @@ PyObject* pyopencv_from(const cv::detail::OpaqueRef& o)
         case cv::detail::OpaqueKind::CV_RECT      : return pyopencv_from(o.rref<cv::Rect>());
         case cv::detail::OpaqueKind::CV_UNKNOWN   : return pyopencv_from(o.rref<cv::GArg>());
         case cv::detail::OpaqueKind::CV_DRAW_PRIM : return pyopencv_from(o.rref<cv::gapi::wip::draw::Prim>());
-        case cv::detail::OpaqueKind::CV_UINT64    : break;
         case cv::detail::OpaqueKind::CV_SCALAR    : break;
         case cv::detail::OpaqueKind::CV_MAT       : break;
     }
 
     PyErr_SetString(PyExc_TypeError, "Unsupported GOpaque type");
     return NULL;
-};
+}
 
 template <>
 PyObject* pyopencv_from(const cv::detail::VectorRef& v)
@@ -327,6 +332,7 @@ PyObject* pyopencv_from(const cv::detail::VectorRef& v)
         case cv::detail::OpaqueKind::CV_BOOL      : return pyopencv_from_generic_vec(v.rref<bool>());
         case cv::detail::OpaqueKind::CV_INT       : return pyopencv_from_generic_vec(v.rref<int>());
         case cv::detail::OpaqueKind::CV_INT64     : return pyopencv_from_generic_vec(v.rref<int64_t>());
+        case cv::detail::OpaqueKind::CV_UINT64    : return pyopencv_from_generic_vec(v.rref<uint64_t>());
         case cv::detail::OpaqueKind::CV_DOUBLE    : return pyopencv_from_generic_vec(v.rref<double>());
         case cv::detail::OpaqueKind::CV_FLOAT     : return pyopencv_from_generic_vec(v.rref<float>());
         case cv::detail::OpaqueKind::CV_STRING    : return pyopencv_from_generic_vec(v.rref<std::string>());
@@ -339,7 +345,6 @@ PyObject* pyopencv_from(const cv::detail::VectorRef& v)
         case cv::detail::OpaqueKind::CV_MAT       : return pyopencv_from_generic_vec(v.rref<cv::Mat>());
         case cv::detail::OpaqueKind::CV_UNKNOWN   : return pyopencv_from_generic_vec(v.rref<cv::GArg>());
         case cv::detail::OpaqueKind::CV_DRAW_PRIM : return pyopencv_from_generic_vec(v.rref<cv::gapi::wip::draw::Prim>());
-        case cv::detail::OpaqueKind::CV_UINT64    : break;
     }
 
     PyErr_SetString(PyExc_TypeError, "Unsupported GArray type");
@@ -498,6 +503,8 @@ static cv::detail::OpaqueRef extract_opaque_ref(PyObject* from, cv::detail::Opaq
     {
         HANDLE_CASE(BOOL,    bool);
         HANDLE_CASE(INT,     int);
+        HANDLE_CASE(INT64,   int64_t);
+        HANDLE_CASE(UINT64,  uint64_t);
         HANDLE_CASE(DOUBLE,  double);
         HANDLE_CASE(FLOAT,   float);
         HANDLE_CASE(STRING,  std::string);
@@ -507,8 +514,6 @@ static cv::detail::OpaqueRef extract_opaque_ref(PyObject* from, cv::detail::Opaq
         HANDLE_CASE(SIZE,    cv::Size);
         HANDLE_CASE(RECT,    cv::Rect);
         HANDLE_CASE(UNKNOWN, cv::GArg);
-        UNSUPPORTED(UINT64);
-        UNSUPPORTED(INT64);
         UNSUPPORTED(SCALAR);
         UNSUPPORTED(MAT);
         UNSUPPORTED(DRAW_PRIM);
@@ -531,6 +536,8 @@ static cv::detail::VectorRef extract_vector_ref(PyObject* from, cv::detail::Opaq
     {
         HANDLE_CASE(BOOL,      bool);
         HANDLE_CASE(INT,       int);
+        HANDLE_CASE(INT64,     int64_t);
+        HANDLE_CASE(UINT64,    uint64_t);
         HANDLE_CASE(DOUBLE,    double);
         HANDLE_CASE(FLOAT,     float);
         HANDLE_CASE(STRING,    std::string);
@@ -543,8 +550,6 @@ static cv::detail::VectorRef extract_vector_ref(PyObject* from, cv::detail::Opaq
         HANDLE_CASE(MAT,       cv::Mat);
         HANDLE_CASE(UNKNOWN,   cv::GArg);
         HANDLE_CASE(DRAW_PRIM, cv::gapi::wip::draw::Prim);
-        UNSUPPORTED(UINT64);
-        UNSUPPORTED(INT64);
 #undef HANDLE_CASE
 #undef UNSUPPORTED
     }
diff --git a/modules/gapi/misc/python/python_bridge.hpp b/modules/gapi/misc/python/python_bridge.hpp
index 53edf38b30d9..f384b6907bd6 100644
--- a/modules/gapi/misc/python/python_bridge.hpp
+++ b/modules/gapi/misc/python/python_bridge.hpp
@@ -31,6 +31,7 @@ using cv::gapi::wip::draw::Prim;
 WRAP_ARGS(bool        , cv::gapi::ArgType::CV_BOOL,      G)  \
 WRAP_ARGS(int         , cv::gapi::ArgType::CV_INT,       G)  \
 WRAP_ARGS(int64_t     , cv::gapi::ArgType::CV_INT64,     G)  \
+WRAP_ARGS(uint64_t    , cv::gapi::ArgType::CV_UINT64,    G)  \
 WRAP_ARGS(double      , cv::gapi::ArgType::CV_DOUBLE,    G)  \
 WRAP_ARGS(float       , cv::gapi::ArgType::CV_FLOAT,     G)  \
 WRAP_ARGS(std::string , cv::gapi::ArgType::CV_STRING,    G)  \
@@ -49,6 +50,7 @@ WRAP_ARGS(cv::GMat    , cv::gapi::ArgType::CV_GMAT,      G2) \
 WRAP_ARGS(bool        , cv::gapi::ArgType::CV_BOOL,    G)  \
 WRAP_ARGS(int         , cv::gapi::ArgType::CV_INT,     G)  \
 WRAP_ARGS(int64_t     , cv::gapi::ArgType::CV_INT64,   G)  \
+WRAP_ARGS(uint64_t    , cv::gapi::ArgType::CV_UINT64,  G)  \
 WRAP_ARGS(double      , cv::gapi::ArgType::CV_DOUBLE,  G)  \
 WRAP_ARGS(float       , cv::gapi::ArgType::CV_FLOAT,   G)  \
 WRAP_ARGS(std::string , cv::gapi::ArgType::CV_STRING,  G)  \
@@ -67,6 +69,7 @@ enum ArgType {
     CV_BOOL,
     CV_INT,
     CV_INT64,
+    CV_UINT64,
     CV_DOUBLE,
     CV_FLOAT,
     CV_STRING,
@@ -137,7 +140,7 @@ class GAPI_EXPORTS_W_SIMPLE GOpaqueT
     using Storage = cv::detail::MakeVariantType<cv::GOpaque, GOPAQUE_TYPE_LIST_G(ID_, ID)>;
 
     template<typename T>
-    GOpaqueT(cv::GOpaque<T> arg) : m_type(cv::detail::ArgTypeTraits<T>::type), m_arg(arg) { };
+    GOpaqueT(cv::GOpaque<T> arg) : m_type(cv::detail::ArgTypeTraits<T>::type), m_arg(arg) { }
 
     GAPI_WRAP GOpaqueT(gapi::ArgType type) : m_type(type)
     {
@@ -175,7 +178,7 @@ class GAPI_EXPORTS_W_SIMPLE GArrayT
     using Storage = cv::detail::MakeVariantType<cv::GArray, GARRAY_TYPE_LIST_G(ID_, ID)>;
 
     template<typename T>
-    GArrayT(cv::GArray<T> arg) : m_type(cv::detail::ArgTypeTraits<T>::type), m_arg(arg) { };
+    GArrayT(cv::GArray<T> arg) : m_type(cv::detail::ArgTypeTraits<T>::type), m_arg(arg) { }
 
     GAPI_WRAP GArrayT(gapi::ArgType type) : m_type(type)
     {
diff --git a/modules/gapi/misc/python/shadow_gapi.hpp b/modules/gapi/misc/python/shadow_gapi.hpp
index c0c1e38136a8..a87115753a47 100644
--- a/modules/gapi/misc/python/shadow_gapi.hpp
+++ b/modules/gapi/misc/python/shadow_gapi.hpp
@@ -8,6 +8,7 @@ struct GAPI_EXPORTS_W_SIMPLE GCompileArg
     GAPI_WRAP GCompileArg(GKernelPackage arg);
     GAPI_WRAP GCompileArg(gapi::GNetPackage arg);
     GAPI_WRAP GCompileArg(gapi::streaming::queue_capacity arg);
+    GAPI_WRAP GCompileArg(gapi::ot::ObjectTrackerParams arg);
 };
 
 class GAPI_EXPORTS_W_SIMPLE GInferInputs
diff --git a/modules/gapi/misc/python/test/test_gapi_infer.py b/modules/gapi/misc/python/test/test_gapi_infer.py
index 8ecc957e416d..d075651e8706 100644
--- a/modules/gapi/misc/python/test/test_gapi_infer.py
+++ b/modules/gapi/misc/python/test/test_gapi_infer.py
@@ -38,8 +38,8 @@ def test_age_gender_infer(self):
                 return
 
             root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id    = 'CPU'
 
             img_path  = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
@@ -73,8 +73,8 @@ def test_age_gender_infer_roi(self):
                 return
 
             root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id    = 'CPU'
 
             img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
@@ -112,8 +112,8 @@ def test_age_gender_infer_roi_list(self):
                 return
 
             root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id    = 'CPU'
 
             rois = [(10, 15, 62, 62), (23, 50, 62, 62), (14, 100, 62, 62), (80, 50, 62, 62)]
@@ -161,8 +161,8 @@ def test_age_gender_infer2_roi(self):
                 return
 
             root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id    = 'CPU'
 
             rois = [(10, 15, 62, 62), (23, 50, 62, 62), (14, 100, 62, 62), (80, 50, 62, 62)]
@@ -211,8 +211,8 @@ def test_person_detection_retail_0013(self):
                 return
 
             root_path    = '/omz_intel_models/intel/person-detection-retail-0013/FP32/person-detection-retail-0013'
-            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             img_path     = self.find_file('gpu/lbpcascade/er.png', [os.environ.get('OPENCV_TEST_DATA_PATH')])
             device_id    = 'CPU'
             img          = cv.resize(cv.imread(img_path), (544, 320))
@@ -270,8 +270,8 @@ def test_person_detection_retail_0013(self):
                 return
 
             root_path    = '/omz_intel_models/intel/person-detection-retail-0013/FP32/person-detection-retail-0013'
-            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             img_path     = self.find_file('gpu/lbpcascade/er.png', [os.environ.get('OPENCV_TEST_DATA_PATH')])
             device_id    = 'CPU'
             img          = cv.resize(cv.imread(img_path), (544, 320))
diff --git a/modules/gapi/misc/python/test/test_gapi_infer_ov.py b/modules/gapi/misc/python/test/test_gapi_infer_ov.py
index b4022b6e2d38..f48ec9636997 100644
--- a/modules/gapi/misc/python/test/test_gapi_infer_ov.py
+++ b/modules/gapi/misc/python/test/test_gapi_infer_ov.py
@@ -86,8 +86,8 @@ def test_age_gender_infer_image(self):
             skip_if_openvino_not_available()
 
             root_path  = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id  = 'CPU'
 
             img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
@@ -119,8 +119,8 @@ def test_age_gender_infer_tensor(self):
             skip_if_openvino_not_available()
 
             root_path  = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id  = 'CPU'
 
             img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
@@ -148,8 +148,8 @@ def test_age_gender_infer_batch(self):
             skip_if_openvino_not_available()
 
             root_path  = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id  = 'CPU'
 
             img_path1 = self.find_file('cv/face/david1.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
@@ -190,8 +190,8 @@ def test_age_gender_infer_planar(self):
             skip_if_openvino_not_available()
 
             root_path  = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id  = 'CPU'
 
             img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
diff --git a/modules/gapi/misc/python/test/test_gapi_ot.py b/modules/gapi/misc/python/test/test_gapi_ot.py
new file mode 100644
index 000000000000..794ed018e3f1
--- /dev/null
+++ b/modules/gapi/misc/python/test/test_gapi_ot.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+
+import numpy as np
+import cv2 as cv
+import os
+import sys
+import unittest
+
+from tests_common import NewOpenCVTests
+
+
+try:
+
+    if sys.version_info[:2] < (3, 0):
+        raise unittest.SkipTest('Python 2.x is not supported')
+
+    class gapi_ot_test(NewOpenCVTests):
+
+        def test_ot_smoke(self):
+            # Input
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            in_image = cv.cvtColor(cv.imread(img_path), cv.COLOR_RGB2BGR)
+            in_rects = [ (138, 89, 71, 64) ]
+            in_rects_cls = [ 0 ]
+
+            # G-API
+            g_in = cv.GMat()
+            g_in_rects = cv.GArray.Rect()
+            g_in_rects_cls = cv.GArray.Int()
+            delta = 0.5
+
+            g_out_rects, g_out_rects_cls, g_track_ids, g_track_sts = \
+                cv.gapi.ot.track(g_in, g_in_rects, g_in_rects_cls, delta)
+
+
+            comp = cv.GComputation(cv.GIn(g_in, g_in_rects, g_in_rects_cls),
+                                   cv.GOut(g_out_rects, g_out_rects_cls,
+                                           g_track_ids, g_track_sts))
+
+            __, __, __, sts = comp.apply(cv.gin(in_image, in_rects, in_rects_cls),
+                args=cv.gapi.compile_args(cv.gapi.ot.cpu.kernels()))
+
+            self.assertEqual(cv.gapi.ot.NEW, sts[0])
+
+except unittest.SkipTest as e:
+
+    message = str(e)
+
+    class TestSkip(unittest.TestCase):
+        def setUp(self):
+            self.skipTest('Skip tests: ' + message)
+
+        def test_skip():
+            pass
+
+
+if __name__ == '__main__':
+    NewOpenCVTests.bootstrap()
diff --git a/modules/gapi/misc/python/test/test_gapi_types.py b/modules/gapi/misc/python/test/test_gapi_types.py
index 41bfbabd6171..cdf8a089fddf 100644
--- a/modules/gapi/misc/python/test/test_gapi_types.py
+++ b/modules/gapi/misc/python/test/test_gapi_types.py
@@ -17,10 +17,10 @@
     class gapi_types_test(NewOpenCVTests):
 
         def test_garray_type(self):
-            types = [cv.gapi.CV_BOOL  , cv.gapi.CV_INT   , cv.gapi.CV_DOUBLE , cv.gapi.CV_FLOAT,
-                     cv.gapi.CV_STRING, cv.gapi.CV_POINT , cv.gapi.CV_POINT2F, cv.gapi.CV_POINT3F ,
-                     cv.gapi.CV_SIZE  , cv.gapi.CV_RECT  , cv.gapi.CV_SCALAR , cv.gapi.CV_MAT  ,
-                     cv.gapi.CV_GMAT]
+            types = [cv.gapi.CV_BOOL   , cv.gapi.CV_INT    , cv.gapi.CV_INT64 , cv.gapi.CV_UINT64,
+                     cv.gapi.CV_DOUBLE , cv.gapi.CV_FLOAT  , cv.gapi.CV_STRING, cv.gapi.CV_POINT ,
+                     cv.gapi.CV_POINT2F, cv.gapi.CV_POINT3F, cv.gapi.CV_SIZE  , cv.gapi.CV_RECT  ,
+                     cv.gapi.CV_SCALAR , cv.gapi.CV_MAT    , cv.gapi.CV_GMAT]
 
             for t in types:
                 g_array = cv.GArrayT(t)
@@ -28,9 +28,9 @@ def test_garray_type(self):
 
 
         def test_gopaque_type(self):
-            types = [cv.gapi.CV_BOOL  , cv.gapi.CV_INT  , cv.gapi.CV_DOUBLE , cv.gapi.CV_FLOAT  ,
-                     cv.gapi.CV_STRING, cv.gapi.CV_POINT, cv.gapi.CV_POINT2F, cv.gapi.CV_POINT3F,
-                     cv.gapi.CV_SIZE  , cv.gapi.CV_RECT]
+            types = [cv.gapi.CV_BOOL   , cv.gapi.CV_INT    ,  cv.gapi.CV_INT64 , cv.gapi.CV_UINT64,
+                     cv.gapi.CV_DOUBLE , cv.gapi.CV_FLOAT  ,  cv.gapi.CV_STRING, cv.gapi.CV_POINT ,
+                     cv.gapi.CV_POINT2F, cv.gapi.CV_POINT3F,  cv.gapi.CV_SIZE  , cv.gapi.CV_RECT]
 
             for t in types:
                 g_opaque = cv.GOpaqueT(t)
diff --git a/modules/gapi/perf/common/gapi_render_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_render_perf_tests_inl.hpp
index 66e8c3731909..04d814eac740 100644
--- a/modules/gapi/perf/common/gapi_render_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_render_perf_tests_inl.hpp
@@ -16,7 +16,7 @@ void create_rand_mats(const cv::Size &size, MatType type, cv::Mat &ref_mat, cv::
     ref_mat.create(size, type);
     cv::randu(ref_mat, cv::Scalar::all(0), cv::Scalar::all(255));
     ref_mat.copyTo(gapi_mat);
-};
+}
 
 } // namespace
 
diff --git a/modules/gapi/src/3rdparty/vasot/LICENSE.txt b/modules/gapi/src/3rdparty/vasot/LICENSE.txt
new file mode 100644
index 000000000000..8a60c284c6b0
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/LICENSE.txt
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2018-2019 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/modules/gapi/src/3rdparty/vasot/include/vas/common.hpp b/modules/gapi/src/3rdparty/vasot/include/vas/common.hpp
new file mode 100644
index 000000000000..ae6303dbb156
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/include/vas/common.hpp
@@ -0,0 +1,83 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_COMMON_HPP
+#define VAS_COMMON_HPP
+
+#include <cstdint>
+
+#define OT_VERSION_MAJOR 1
+#define OT_VERSION_MINOR 0
+#define OT_VERSION_PATCH 0
+
+#define VAS_EXPORT //__attribute__((visibility("default")))
+
+namespace vas {
+
+/**
+ * @class Version
+ *
+ * Contains version information.
+ */
+class Version {
+  public:
+    /**
+     * Constructor.
+     *
+     * @param[in] major Major version.
+     * @param[in] minor Minor version.
+     * @param[in] patch Patch version.
+     */
+    explicit Version(uint32_t major, uint32_t minor, uint32_t patch) : major_(major), minor_(minor), patch_(patch) {
+    }
+
+    /**
+     * Returns major version.
+     */
+    uint32_t GetMajor() const noexcept {
+        return major_;
+    }
+
+    /**
+     * Returns minor version.
+     */
+    uint32_t GetMinor() const noexcept {
+        return minor_;
+    }
+
+    /**
+     * Returns patch version.
+     */
+    uint32_t GetPatch() const noexcept {
+        return patch_;
+    }
+
+  private:
+    uint32_t major_;
+    uint32_t minor_;
+    uint32_t patch_;
+};
+
+/**
+ * @enum BackendType
+ *
+ * Represents HW backend types.
+ */
+enum class BackendType {
+    CPU,  /**< CPU */
+    GPU  /**< GPU */
+};
+
+/**
+ * @enum ColorFormat
+ *
+ * Represents Color formats.
+ */
+enum class ColorFormat { BGR, NV12, BGRX, GRAY, I420 };
+
+}; // namespace vas
+
+#endif // VAS_COMMON_HPP
diff --git a/modules/gapi/src/3rdparty/vasot/include/vas/ot.hpp b/modules/gapi/src/3rdparty/vasot/include/vas/ot.hpp
new file mode 100644
index 000000000000..ef5e1f4b80d2
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/include/vas/ot.hpp
@@ -0,0 +1,440 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_OT_HPP
+#define VAS_OT_HPP
+
+#include <vas/common.hpp>
+
+#include <opencv2/core.hpp>
+
+#include <iostream>
+#include <map>
+#include <memory>
+#include <vector>
+
+namespace vas {
+
+/**
+ * @namespace vas::ot
+ * @brief %vas::ot namespace.
+ *
+ * The ot namespace has classes, functions, and definitions for object tracker.
+ * It is a general tracker, and an object is represented as a rectangular box.
+ * Thus, you can use any kind of detector if it generates a rectangular box as output.
+ * Once an object is added to object tracker, the object is started to be tracked.
+ */
+namespace ot {
+
+/**
+ * Returns current version.
+ */
+VAS_EXPORT vas::Version GetVersion() noexcept;
+
+/**
+ * @enum TrackingType
+ *
+ * Tracking type.
+ */
+enum class TrackingType {
+    LONG_TERM,
+    SHORT_TERM,
+    ZERO_TERM,
+    SHORT_TERM_KCFVAR,
+    SHORT_TERM_IMAGELESS,
+    ZERO_TERM_IMAGELESS,
+    ZERO_TERM_COLOR_HISTOGRAM
+};
+
+/**
+ * @enum TrackingStatus
+ *
+ * Tracking status.
+ */
+enum class TrackingStatus {
+    NEW = 0, /**< The object is newly added. */
+    TRACKED, /**< The object is being tracked. */
+    LOST     /**< The object gets lost now. The object can be tracked again automatically(long term tracking) or by
+                specifying detected object manually(short term and zero term tracking). */
+};
+
+/**
+ * @class DetectedObject
+ * @brief Represents an input object.
+ *
+ * In order to track an object, detected object should be added one or more times to ObjectTracker.
+ * When an object is required to be added to ObjectTracker, you can create an instance of this class and fill its
+ * values.
+ */
+class DetectedObject {
+  public:
+    /**
+     * Default constructor.
+     */
+    DetectedObject() : rect(), class_label() {
+    }
+
+    /**
+     * Constructor with specific values.
+     *
+     * @param[in] input_rect Rectangle of input object.
+     * @param[in] input_class_label Class label of input object.
+     */
+    DetectedObject(const cv::Rect &input_rect, int32_t input_class_label)
+        : rect(input_rect), class_label(input_class_label) {
+    }
+
+  public:
+    /**
+     * Object rectangle.
+     */
+    cv::Rect rect;
+
+    /**
+     * Input class label.
+     * It is an arbitrary value that is specified by user.
+     * You can utilize this value to categorize input objects.
+     * Same value will be assigned to the class_label in Object class.
+     */
+    int32_t class_label;
+};
+
+/**
+ * @class Object
+ * @brief Represents tracking result of a target object.
+ *
+ * It contains tracking information of a target object.
+ * ObjectTracker generates an instance of this class per tracked object when Track method is called.
+ */
+class Object {
+  public:
+    /**
+     * Object rectangle.
+     */
+    cv::Rect rect;
+
+    /**
+     * Tracking ID.
+     * Numbering sequence starts from 1.
+     * The value 0 means the tracking ID of this object has not been assigned.
+     */
+    uint64_t tracking_id;
+
+    /**
+     * Class label.
+     * This is specified by DetectedObject.
+     */
+    int32_t class_label;
+
+    /**
+     * Tracking status.
+     */
+    TrackingStatus status;
+
+    /**
+     * Index in the DetectedObject vector.
+     * If the Object was not in detection input at this frame, then it will be -1.
+     */
+    int32_t association_idx;
+};
+
+VAS_EXPORT std::ostream &operator<<(std::ostream &os, TrackingStatus ts);
+VAS_EXPORT std::ostream &operator<<(std::ostream &os, const Object &object);
+
+/**
+ * @class ObjectTracker
+ * @brief Tracks objects from video frames.
+ *
+ * This class tracks objects from the input frames.
+ * In order to create an instance of this class, you need to use ObjectTracker::Builder class.
+ * @n
+ * ObjectTracker can run in three different ways as TrackingType defines.
+ * @n
+ * In short term tracking, an object is added at the beginning, and the object is tracked with consecutive input frames.
+ * It is recommended to update the tracked object's information for every 10-20 frames.
+ * @n
+ * Zero term tracking can be thought as association between a detected object and tracked object.
+ * Detected objects should always be added when Track method is invoked.
+ * For each frame, detected objects are mapped to tracked objects with this tracking type, which enables ID tracking of
+ detected objects.
+ * @n
+ * Long term tracking is deprecated.
+ * In long term tracking, an object is added at the beginning, and the object is tracked with consecutive input frames.
+ * User doesn't need to update manually the object's information.
+ * Long term tracking takes relatively long time to track objects.
+ * @n
+ * You can specify tracking type by setting attributes of Builder class when you create instances of this class.
+ * It is not possible to run ObjectTracker with two or more different tracking types in one instance.
+ * You can also limit the number of tracked objects by setting attributes of Builder class.
+ * @n
+ * Currently, ObjectTracker does not support HW offloading.
+ * It is possible to run ObjectTracker only on CPU.
+ * @n@n
+ * Following sample code shows how to use short term tracking type.
+ * Objects are added to ObjectTracker at the beginnning of tracking and in the middle of tracking periodically as well.
+ * @code
+    cv::VideoCapture video("/path/to/video/source");
+    cv::Mat frame;
+    cv::Mat first_frame;
+    video >> first_frame;
+
+    vas::ot::ObjectTracker::Builder ot_builder;
+    auto ot = ot_builder.Build(vas::ot::TrackingType::SHORT_TERM);
+
+    vas::pvd::PersonVehicleDetector::Builder pvd_builder;
+    auto pvd = pvd_builder.Build("/path/to/directory/of/fd/model/files");
+
+    std::vector<vas::pvd::PersonVehicle> person_vehicles;
+    std::vector<vas::ot::DetectedObject> detected_objects;
+
+    // Assume that there're objects in the first frame
+    person_vehicles = pvd->Detect(first_frame);
+    for (const auto& pv : person_vehicles)
+        detected_objects.emplace_back(pv.rect, static_cast<int32_t>(pv.type));
+
+    ot->Track(first_frame, detected_objects);
+
+    // Assume that now pvd is running in another thread
+    StartThread(pvd);
+
+    while (video.read(frame))
+    {
+        detected_objects.clear();
+
+        // Assume that frames are forwarded to the thread on which pvd is running
+        EnqueueFrame(frame);
+
+        // Assumes that pvd is adding its result into a queue in another thread.
+        // Assumes also that latency from the last pvd frame to current frame is ignorable.
+        person_vehicles = DequeuePersonVehicles();
+        if (!person_vehicles.empty())
+        {
+            detected_objects.clear();
+            for (const auto& pv : person_vehicles)
+                detected_objects.emplace_back(pv.rect, static_cast<int32_t>(pv.type));
+        }
+
+        auto objects = ot->Track(frame, detected_objects);
+        for (const auto& object : objects)
+        {
+            // Handle tracked object
+        }
+    }
+ * @endcode
+ * @n
+ * Following sample code shows how to use zero term tracking type.
+ * In this sample, pvd runs for each input frame.
+ * After pvd generates results, ot runs with the results and object IDs are preserved.
+ * @code
+    cv::VideoCapture video("/path/to/video/source");
+    cv::Mat frame;
+
+    vas::ot::ObjectTracker::Builder ot_builder;
+    auto ot = ot_builder.Build(vas::ot::TrackingType::ZERO_TERM);
+
+    vas::pvd::PersonVehicleDetector::Builder pvd_builder;
+    auto pvd = pvd_builder.Build("/path/to/directory/of/fd/model/files");
+
+    std::vector<vas::ot::DetectedObject> detected_objects;
+
+    ot->SetFrameDeltaTime(0.033f);
+    while (video.read(frame))
+    {
+        detected_objects.clear();
+
+        auto person_vehicles = pvd->Detect(first_frame);
+        for (const auto& pv : person_vehicles)
+            detected_objects.emplace_back(pv.rect, static_cast<int32_t>(pv.type));
+
+        auto objects = ot->Track(frame, detected_objects);
+        for (const auto& object : objects)
+        {
+            // Handle tracked object
+        }
+    }
+ * @endcode
+ */
+class ObjectTracker {
+  public:
+    class Builder;
+
+  public:
+    ObjectTracker() = delete;
+    ObjectTracker(const ObjectTracker &) = delete;
+    ObjectTracker(ObjectTracker &&) = delete;
+
+    /**
+     * Destructor.
+     */
+    VAS_EXPORT ~ObjectTracker();
+
+  public:
+    ObjectTracker &operator=(const ObjectTracker &) = delete;
+    ObjectTracker &operator=(ObjectTracker &&) = delete;
+
+  public:
+    /**
+     * Tracks objects with video frames.
+     * Also, this method is used to add detected objects.
+     * If a detected object is overlapped enough with one of tracked object, the tracked object's information is updated
+     * with the input detected object. On the other hand, if a detected object is overlapped with none of tracked
+     * objects, the detected object is newly added and ObjectTracker starts to track the object. In long term and short
+     * term tracking type, ObjectTracker continues to track objects in case that empty list of detected objects is
+     * passed in. In zero term tracking type, however, ObjectTracker clears tracked objects in case that empty list of
+     * detected objects is passed in.
+     * @n
+     * The supported color formats are BGR, NV12, BGRx and I420.
+     *
+     * @param[in] frame Input frame.
+     * @param[in] detected_objects Detected objects in the input frame. Default value is an empty vector.
+     * @return Information of tracked objects.
+     * @exception std::invalid_argument Input frame is invalid.
+     */
+    VAS_EXPORT std::vector<Object>
+    Track(const cv::Mat &frame, const std::vector<DetectedObject> &detected_objects = std::vector<DetectedObject>());
+
+    /**
+     * This function is to set a parameter indicating 'delta time' between now and last call to Track() in seconds.
+     * The default value of the delta time is 0.033f which is tuned for 30 fps video frame rate.
+     * It is to achieve improved tracking quality for other frame rates or inconstant frame rate by frame drops.
+     * If input frames come from a video stream of constant frame rate, then a user needs to set this value as 1.0/fps
+     * just after video open. For example, 60 fps video stream should set 0.0167f. If input frames have inconstant frame
+     * rate, then a user needs to call this function before the Track() function.
+     *
+     * @param[in] frame_delta_t Delta time between two consecutive tracking in seconds. The valid range is [0.005 ~
+     * 0.5].
+     */
+    VAS_EXPORT void SetFrameDeltaTime(float frame_delta_t);
+
+    /**
+     * Returns the tracking type of current instance.
+     */
+    VAS_EXPORT TrackingType GetTrackingType() const noexcept;
+
+    /**
+     * Returns the currently set maximum number of trackable objects.
+     */
+    VAS_EXPORT int32_t GetMaxNumObjects() const noexcept;
+
+    /**
+     * Returns the currently set frame delta time.
+     */
+    VAS_EXPORT float GetFrameDeltaTime() const noexcept;
+
+    /**
+     * Returns the currently set color format.
+     */
+    VAS_EXPORT vas::ColorFormat GetInputColorFormat() const noexcept;
+
+    /**
+     * Returns the backend type of current instance.
+     */
+    VAS_EXPORT vas::BackendType GetBackendType() const noexcept;
+
+    /**
+     * Returns the current set tracking per class.
+     */
+    VAS_EXPORT bool GetTrackingPerClass() const noexcept;
+
+  private:
+    class Impl;
+
+  private:
+    explicit ObjectTracker(Impl *impl);
+
+  private:
+    std::unique_ptr<Impl> impl_;
+    friend class Builder;
+};
+
+/**
+ * @class ObjectTracker::Builder
+ * @brief Creates ObjectTracker instances.
+ *
+ * This class is used to build ObjectTracker instances.
+ * All the attributes of this class affects how ObjectTracker is initialized.
+ */
+class ObjectTracker::Builder {
+  public:
+    /**
+     * Default constructor.
+     */
+    VAS_EXPORT Builder();
+
+    /**
+     * Destructor.
+     */
+    VAS_EXPORT ~Builder();
+
+  public:
+    /**
+     * Creates an instance of ObjectTracker based on tracking type and attributes you set.
+     * In case that you set valid values for all attributes, an instance of ObjectTracker is created successfully.
+     *
+     * @param[in] tracking_type Tracking type for newly created ObjectTracker instance.
+     * @exception std::invalid_argument One or more attributes you set are invalid.
+     * @return ObjectTracker instance.
+     */
+    VAS_EXPORT std::unique_ptr<ObjectTracker> Build(TrackingType tracking_type) const;
+
+  public:
+    /**
+     * Specifies HW backend on which object tracker runs.
+     * @n
+     * Default value is vas::BackendType::CPU.
+     */
+    vas::BackendType backend_type;
+
+    /**
+     * Maximum number of trackable objects in a frame.
+     * @n
+     * Valid range: 1 <= max_num_objects. Or it can be -1 if there is no limitation of maximum number in X86.
+     * @n
+     * Default value is -1 which means there is no limitation in X86.
+     */
+    int32_t max_num_objects;
+
+    /**
+     * Input color format vas::ColorFormat. Supports BGR, BGRX, NV12 and I420
+     * @n
+     * Default value is BGR.
+     */
+    vas::ColorFormat input_image_format;
+
+    /**
+     * Specifies whether tracker to use detection class for keeping id of an object.
+     * If it is true, new detection will be associated from previous tracking only when those two have same class.
+     * class id of an object is fixed across video frames.
+     * If it is false, new detection can be associated across different-class objects.
+     * In this case, the class id of an object may change across video frames depending on the tracker input.
+     * It is recommended to turn this option off when it is likely that detector confuses the class of object.
+     * For example, when detector confuses bicycle and motorbike. Turning this option off will increase the tracking
+     * reliability as tracker will ignore the class label of detector.
+     * @n
+     * Default value is true.
+     */
+    bool tracking_per_class;
+
+    /**
+     * Platform configuration
+     * You can set various configuraions for each platform using predefined configurations
+     * @n
+     * For Parallelization in KCFVAR mode, use key "max_num_threads" to set the maximum number of threads. Consult the
+     * following format
+     * @code platform_config["max_num_threads"] = "2"; // set maximum number of threads(concurrency level) to 2 @endcode
+     * @n
+     * Default value is 1
+     * if value >=1, set value as the number of threads to process OT in parallel mode
+     * if value >= Number of available cores OR value is -1, limit concurrency level to maximum available logical cores
+     * otherwise: @exception Invalid input
+     */
+    std::map<std::string, std::string> platform_config;
+};
+
+}; // namespace ot
+}; // namespace vas
+
+#endif // VAS_OT_HPP
diff --git a/modules/gapi/src/3rdparty/vasot/src/common/exception.hpp b/modules/gapi/src/3rdparty/vasot/src/common/exception.hpp
new file mode 100644
index 000000000000..72c8e510a3fe
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/common/exception.hpp
@@ -0,0 +1,24 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_COMMON_EXCEPTION_HPP
+#define VAS_COMMON_EXCEPTION_HPP
+
+#include <vas/common.hpp>
+
+#include <exception>
+#include <stdexcept>
+
+#define ETHROW(condition, exception_class, message, ...)                                                               \
+    {                                                                                                                  \
+        if (!(condition)) {                                                                                            \
+            throw std::exception_class(message);                                                                       \
+        }                                                                                                              \
+    }
+
+#define TRACE(fmt, ...)
+
+#endif // VAS_COMMON_EXCEPTION_HPP
diff --git a/modules/gapi/src/3rdparty/vasot/src/common/prof.hpp b/modules/gapi/src/3rdparty/vasot/src/common/prof.hpp
new file mode 100644
index 000000000000..7584841584b2
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/common/prof.hpp
@@ -0,0 +1,144 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_COMMON_PROF_HPP
+#define VAS_COMMON_PROF_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <list>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <stack>
+#include <vector>
+
+#define PROF_COMP_NAME(comp) vas::Prof::Component::comp
+
+#ifdef BUILD_OPTION_PROFILING
+#define PROF_INIT(component) vas::Prof::Init(PROF_COMP_NAME(component))
+#define PROF_START(tag) vas::Prof::Start(tag, __FUNCTION__, __LINE__)
+#define PROF_END(tag) vas::Prof::End(tag)
+#define PROF_EXTRA(tag, value) vas::Prof::SetExtra(tag, value)
+#define PROF_FLUSH(component) vas::Prof::GetInstance(PROF_COMP_NAME(component)).Flush()
+#else
+#define PROF_INIT(tag)
+#define PROF_START(tag)
+#define PROF_END(tag)
+#define PROF_EXTRA(tag, value)
+#define PROF_FLUSH(component)
+#endif
+
+#define PROF_TAG_GENERATE(component, group_id, description)                                                            \
+    { PROF_COMP_NAME(component), group_id, description }
+
+namespace vas {
+
+/**
+ * @class Prof
+ *
+ * Global Prof instance accumulates all ProfData in a Tree structure.
+ * Parallel codes within sigle vas component (ex. STKCF TBB) creates wrong profile result.
+ */
+class Prof {
+  public:
+    enum class Component : int32_t { FD, FR, PVD, CD, FAC, OT, PAC, HD, REID, BASE, KN, kCount };
+
+    typedef uint64_t GroupId;
+    typedef uint64_t UniqueId;
+
+    /**
+     * @class Prof::ProfData
+     *
+     * Data Node withtin Prof class
+     * Accumulates elapsed times between PROF_START / PROF_END
+     */
+    class ProfData {
+      public:
+        ProfData(UniqueId id, GroupId group_id, size_t depth, const char *function_name, const int64_t line,
+                 const char *description);
+        ProfData(const ProfData &other);
+        ~ProfData() = default;
+        ProfData &operator=(const ProfData &) = delete;
+        bool operator==(const ProfData &) const;
+        ProfData *clone();
+
+        std::vector<int64_t> accum_time;
+        std::list<ProfData *> children;
+
+        const UniqueId id;
+        const GroupId group_id;
+        const size_t depth;
+
+        const char *function_name;
+        const int64_t line;
+        const char *description;
+        int64_t start_time;
+    };
+
+    typedef struct _ProfTag {
+        vas::Prof::Component component;
+        vas::Prof::GroupId group_id;
+        const char *description;
+    } ProfTag;
+
+  public:
+    Prof();
+    ~Prof() = default;
+
+    static void Init(Component comp);
+    static void Start(const ProfTag &tag, const char *function_name, int64_t line);
+    static void End(const ProfTag &tag);
+
+    static Prof &GetInstance(Component comp);
+
+    static void SetExtra(const ProfTag &tag, int32_t value);
+
+    void StartProfile(GroupId group_id, const char *function_name, int64_t line, const char *description);
+    void EndProfile();
+    void SetExtraData(const std::string &key, int32_t value);
+    void Flush();
+
+    void MergeToMainInstance(Prof *in);
+
+  private:
+    const char *GetComponentName(Component comp);
+    void Clear();
+
+    // Print detailed prof data.
+    void PrintSummary1(std::ostream *out);
+
+    // Print prof data merged in same stack.
+    void PrintSummary2(std::ostream *out);
+
+    // Print prof data merged with the same group-id.
+    void PrintSummary3(std::ostream *out);
+
+    void PrintSummary1ToCSV(std::ostream *out);
+
+    void PrintExtra(std::ostream *out);
+
+    void PrintAllData(std::ostream *out);
+
+    void Traverse(const ProfData *root, const std::list<ProfData *> &data_list,
+                  void (*print_function)(const ProfData *, const ProfData &, std::ostream *), std::ostream *out);
+    void TraverseMergeSameStackGroups(const std::list<ProfData *> &in_data_list, std::list<ProfData *> *out_data_list);
+    void TraverseMergeAllGroups(const std::list<ProfData *> &in_data_list, std::list<ProfData *> *out_data_list);
+
+    void MergeProfDataList(std::list<Prof::ProfData *> *mergeList, const std::list<Prof::ProfData *> &addList);
+
+  private:
+    std::string outdir_;
+    std::string out_prof_file_;
+    Component component_;
+    std::list<ProfData *> root_data_list_;
+    std::stack<ProfData *> current_data_;
+    std::map<std::string, std::vector<int32_t>> extra_data_;
+};
+
+} // namespace vas
+
+#endif // VAS_COMMON_PROF_HPP
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/kalman_filter/kalman_filter_no_opencv.cpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/kalman_filter/kalman_filter_no_opencv.cpp
new file mode 100644
index 000000000000..31aa8cb17eb6
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/kalman_filter/kalman_filter_no_opencv.cpp
@@ -0,0 +1,292 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#include "kalman_filter_no_opencv.hpp"
+#include "../../../common/exception.hpp"
+
+#include <cstring>
+
+#define KALMAN_FILTER_NSHIFT 4
+#define KF_USE_PARTIAL_64F
+
+namespace vas {
+
+// R
+const int32_t kNoiseCovarFactor = 8;
+const float kDefaultDeltaT = 0.033f;
+const int32_t kDefaultErrCovFactor = 1;
+
+// lower noise = slowly changed
+KalmanFilterNoOpencv::KalmanFilterNoOpencv(const cv::Rect2f &initial_rect) : delta_t_(kDefaultDeltaT) {
+    int32_t left = static_cast<int32_t>(initial_rect.x);
+    int32_t right = static_cast<int32_t>(initial_rect.x + initial_rect.width);
+    int32_t top = static_cast<int32_t>(initial_rect.y);
+    int32_t bottom = static_cast<int32_t>(initial_rect.y + initial_rect.height);
+    int32_t cX = (left + right) << (KALMAN_FILTER_NSHIFT - 1);
+    int32_t cY = (top + bottom) << (KALMAN_FILTER_NSHIFT - 1);
+    int32_t cRX = (right - left) << (KALMAN_FILTER_NSHIFT - 1);
+    int32_t cRY = (bottom - top) << (KALMAN_FILTER_NSHIFT - 1);
+    kalmanfilter1d32i_init(&kfX, &cX, 0);
+    kalmanfilter1d32i_init(&kfY, &cY, 0);
+    kalmanfilter1d32i_init(&kfRX, &cRX, 0);
+    kalmanfilter1d32i_init(&kfRY, &cRY, 0);
+
+    int32_t object_size = std::max(64, (cRX * cRY));
+    noise_ratio_coordinates_ = kMeasurementNoiseCoordinate;
+    noise_ratio_rect_size_ = kMeasurementNoiseRectSize;
+
+    // Set default Q
+    int32_t cood_cov = static_cast<int32_t>(object_size * noise_ratio_coordinates_);
+    int32_t size_cov = static_cast<int32_t>(object_size * noise_ratio_rect_size_);
+    kfX.Q[0][0] = cood_cov;
+    kfX.Q[1][1] = cood_cov;
+    kfY.Q[0][0] = cood_cov;
+    kfY.Q[1][1] = cood_cov;
+    kfRX.Q[0][0] = size_cov;
+    kfRY.Q[0][0] = size_cov;
+}
+
+cv::Rect2f KalmanFilterNoOpencv::Predict(float delta_tf) {
+    delta_t_ = delta_tf;
+
+    kalmanfilter1d32i_predict_phase(&kfX, delta_tf);
+    kalmanfilter1d32i_predict_phase(&kfY, delta_tf);
+    kalmanfilter1d32i_predict_phase(&kfRX, 0);
+    kalmanfilter1d32i_predict_phase(&kfRY, 0);
+
+    int32_t cp_x = kfX.Xk[0] >> KALMAN_FILTER_NSHIFT;
+    int32_t cp_y = kfY.Xk[0] >> KALMAN_FILTER_NSHIFT;
+    int32_t rx = kfRX.Xk[0] >> KALMAN_FILTER_NSHIFT;
+    int32_t ry = kfRY.Xk[0] >> KALMAN_FILTER_NSHIFT;
+
+    int32_t pre_x = cp_x - rx;
+    int32_t pre_y = cp_y - ry;
+    auto width = 2 * rx;
+    auto height = 2 * ry;
+
+    // printf(" - In Predict: result (%d, %d  %dx%d)\n", pre_x, pre_y, width, height);
+    return cv::Rect2f(float(pre_x), float(pre_y),
+                      float(width), float(height));
+}
+
+cv::Rect2f KalmanFilterNoOpencv::Correct(const cv::Rect2f &measured_region) {
+    int32_t pX = static_cast<int32_t>(measured_region.x + (measured_region.x + measured_region.width))
+                 << (KALMAN_FILTER_NSHIFT - 1);
+    int32_t pY = static_cast<int32_t>(measured_region.y + (measured_region.y + measured_region.height))
+                 << (KALMAN_FILTER_NSHIFT - 1);
+    int32_t pRX = static_cast<int32_t>(measured_region.width) << (KALMAN_FILTER_NSHIFT - 1);
+    int32_t pRY = static_cast<int32_t>(measured_region.height) << (KALMAN_FILTER_NSHIFT - 1);
+    int32_t cX = 0;
+    int32_t cY = 0;
+    int32_t cRX = 0;
+    int32_t cRY = 0;
+
+    int32_t delta_t = static_cast<int32_t>(delta_t_ * 31.3f);
+    if (delta_t < kDefaultErrCovFactor)
+        delta_t = kDefaultErrCovFactor;
+
+    // Set rect-size-adaptive process/observation noise covariance
+    int32_t object_size = std::max(64, (pRX * pRY));
+    // Q
+    int32_t cood_cov = static_cast<int32_t>(object_size * noise_ratio_coordinates_ * delta_t);
+    int32_t size_cov = static_cast<int32_t>(object_size * noise_ratio_rect_size_ * delta_t);
+
+    kfX.Q[0][0] = cood_cov;
+    kfX.Q[1][1] = cood_cov;
+    kfY.Q[0][0] = cood_cov;
+    kfY.Q[1][1] = cood_cov;
+    kfRX.Q[0][0] = size_cov;
+    kfRY.Q[0][0] = size_cov;
+
+    if (kfX.Xk[0] == 0 && kfY.Xk[0] == 0) {
+        kalmanfilter1d32i_predict_phase(&kfX, delta_t_);
+        kalmanfilter1d32i_predict_phase(&kfY, delta_t_);
+        kalmanfilter1d32i_predict_phase(&kfRX, 0);
+        kalmanfilter1d32i_predict_phase(&kfRY, 0);
+    }
+
+    // R
+    int32_t noise_covariance = object_size >> (kNoiseCovarFactor + delta_t);
+    kfX.R = noise_covariance;
+    kfY.R = noise_covariance;
+    kfRX.R = noise_covariance;
+    kfRY.R = noise_covariance;
+
+    kalmanfilter1d32i_update_phase(&kfX, pX, &cX);
+    kalmanfilter1d32i_update_phase(&kfY, pY, &cY);
+    kalmanfilter1d32i_update_phase(&kfRX, pRX, &cRX);
+    kalmanfilter1d32i_update_phase(&kfRY, pRY, &cRY);
+
+    auto x = (cX - cRX) >> KALMAN_FILTER_NSHIFT;
+    auto y = (cY - cRY) >> KALMAN_FILTER_NSHIFT;
+    auto width = (cRX >> (KALMAN_FILTER_NSHIFT - 1));
+    auto height = (cRY >> (KALMAN_FILTER_NSHIFT - 1));
+
+    // printf(" - In Correct: result (%d, %d  %dx%d)\n", x, y, width, height);
+    return cv::Rect2f(float(x), float(y),
+                      float(width), float(height));
+}
+
+void KalmanFilterNoOpencv::kalmanfilter1d32i_init(kalmanfilter1d32i *kf, int32_t *z, int32_t var) {
+    std::memset(kf, 0, sizeof(kalmanfilter1d32i));
+    if (z) {
+        kf->X[0] = *z;
+    }
+
+    kf->P[0][0] = var;
+    kf->P[1][1] = 0;
+}
+
+static void mul_matvec_32f(int32_t Ab[2], float A[2][2], int32_t b[2]) {
+    Ab[0] = static_cast<int32_t>(A[0][0] * b[0] + A[0][1] * b[1]); // b[0] + dt * b[1]
+    Ab[1] = static_cast<int32_t>(A[1][0] * b[0] + A[1][1] * b[1]); // b[1] ( A[1][0] == 0)
+}
+
+static void mul_matmat_32f(int32_t AB[2][2], float trans_mat[2][2], int32_t B[2][2]) {
+    AB[0][0] =
+        static_cast<int32_t>(trans_mat[0][0] * B[0][0] + trans_mat[0][1] * B[1][0]); // kf->P[0][0] + dt * kf->P[1][0]
+    AB[0][1] =
+        static_cast<int32_t>(trans_mat[0][0] * B[0][1] + trans_mat[0][1] * B[1][1]); // kf->P[0][1] + dt * kf->P[1][1]
+    AB[1][0] = static_cast<int32_t>(trans_mat[1][1] * B[1][0]);
+    AB[1][1] = static_cast<int32_t>(trans_mat[1][1] * B[1][1]);
+}
+
+#ifndef KF_USE_PARTIAL_64F
+static void mul_matmat_32i(int32_t AB[2][2], int32_t A[2][2], int32_t B[2][2]) {
+    AB[0][0] = A[0][0] * B[0][0] + A[0][1] * B[1][0];
+    AB[0][1] = A[0][0] * B[0][1] + A[0][1] * B[1][1];
+    AB[1][0] = A[1][0] * B[0][0] + A[1][1] * B[1][0];
+    AB[1][1] = A[1][0] * B[0][1] + A[1][1] * B[1][1];
+}
+
+static void mul_matmatT_32i(int32_t ABt[2][2], int32_t A[2][2], int32_t B[2][2]) {
+    ABt[0][0] = A[0][0] * B[0][0] + A[0][1] * B[0][1];
+    ABt[0][1] = A[0][0] * B[1][0] + A[0][1] * B[1][1];
+    ABt[1][0] = A[1][0] * B[0][0] + A[1][1] * B[0][1];
+    ABt[1][1] = A[1][0] * B[1][0] + A[1][1] * B[1][1];
+}
+#endif
+
+static void mul_matmatT_32f(int32_t ABt[2][2], int32_t A[2][2], float B[2][2]) {
+    ABt[0][0] = static_cast<int32_t>(A[0][0] * B[0][0] + A[0][1] * B[0][1]);
+    ABt[0][1] = static_cast<int32_t>(A[0][0] * B[1][0] + A[0][1] * B[1][1]);
+    ABt[1][0] = static_cast<int32_t>(A[1][0] * B[0][0] + A[1][1] * B[0][1]);
+    ABt[1][1] = static_cast<int32_t>(A[1][0] * B[1][0] + A[1][1] * B[1][1]);
+}
+
+static void add_matmat_32i(int32_t A_B[2][2], int32_t A[2][2], int32_t B[2][2]) {
+    A_B[0][0] = A[0][0] + B[0][0];
+    A_B[0][1] = A[0][1] + B[0][1];
+    A_B[1][0] = A[1][0] + B[1][0];
+    A_B[1][1] = A[1][1] + B[1][1];
+}
+
+void KalmanFilterNoOpencv::kalmanfilter1d32i_predict_phase(kalmanfilter1d32i *kf, float dt) {
+    float F[2][2] = {{1.f, 1.f}, {0.f, 1.f}};
+    float A[2][2] = {{1.f, 1.f}, {0.f, 1.f}};
+    int32_t AP[2][2];
+    int32_t APAt[2][2];
+
+    float weight = 8.f; // 2^(KALMAN_FILTER_NSHIFT - 1)
+    float delta_t = dt * weight;
+
+    F[0][1] = delta_t;
+
+    // Predict state
+    //  - [x(k) = F x(k-1)]
+    mul_matvec_32f(kf->Xk, F, kf->X);
+
+    // Predict error estimate covariance matrix (Predicted estimate covariance) : P(k)
+    //  - [P(k) = F P(k-1) Ft + Q]
+    mul_matmat_32f(AP, A, kf->P);
+    mul_matmatT_32f(APAt, AP, A);
+    add_matmat_32i(kf->Pk, APAt, kf->Q);
+
+    // Update kf->x from x(k-1) to x(k)
+    kf->X[0] = kf->Xk[0];
+    kf->X[1] = kf->Xk[1];
+
+    // Update kf->P from P(k-1) to P(k)
+    kf->P[0][0] = kf->Pk[0][0];
+    kf->P[0][1] = kf->Pk[0][1];
+    kf->P[1][0] = kf->Pk[1][0];
+    kf->P[1][1] = kf->Pk[1][1];
+}
+
+void KalmanFilterNoOpencv::kalmanfilter1d32i_update_phase(kalmanfilter1d32i *kf, int32_t z, int32_t *x) {
+    int32_t y;
+    int32_t S;
+    int32_t K[2];
+    int32_t I_KH[2][2];
+
+    if (kf->Xk[0] == 0 && kf->Pk[0][0] == 0) {
+        (*x) = z;
+        return;
+    }
+
+    // Compute measurement pre-fit residual : Y
+    // H    : measurement matrix
+    // z(k) : actual reading(observed) result of k
+    //  - [ Y(k) = z(k) - H * X(k) ]
+    y = z - kf->Xk[0];
+
+    // Compute residual covariance : S
+    //  - [ S = H*P(k)*Ht + R]
+    S = kf->Pk[0][0] + kf->R;
+
+    if (S == 0) {
+        (*x) = z;
+        return;
+    }
+
+    // Compute optimal kalman gain : K(k)
+    //  - [ K(k) = P(k)*Ht*inv(S)]
+    // K[0] = kf->P[0][0]/S;
+    // K[1] = kf->P[1][0]/S;
+    K[0] = kf->Pk[0][0];
+    K[1] = kf->Pk[1][0];
+
+    // Get updated state
+    //  - [ x'(k) = x(k) + K'*Y )]
+    kf->X[0] = kf->Xk[0] + K[0] * y / S;
+    kf->X[1] = kf->Xk[1] + K[1] * y / S;
+
+    // 7. Get updated estimate covariance : P'(k)
+    //  - [ P'(k) = (I - K(k) * H) * P(k)]
+    I_KH[0][0] = S - K[0];
+    I_KH[0][1] = 0;
+    I_KH[1][0] = -K[1];
+    I_KH[1][1] = S;
+
+    // modified by chan - 20110329 - start
+    // Here, INTEGER is 32bit.
+    // To avoid overflow in the below matrix multiplecation, this code is modified.
+#ifdef KF_USE_PARTIAL_64F
+    {
+        kf->P[0][0] = static_cast<int32_t>(
+            (I_KH[0][0] * static_cast<double>(kf->Pk[0][0]) + I_KH[0][1] * static_cast<double>(kf->Pk[1][0])) / S);
+        kf->P[0][1] = static_cast<int32_t>(
+            (I_KH[0][0] * static_cast<double>(kf->Pk[0][1]) + I_KH[0][1] * static_cast<double>(kf->Pk[1][1])) / S);
+        kf->P[1][0] = static_cast<int32_t>(
+            (I_KH[1][0] * static_cast<double>(kf->Pk[0][0]) + I_KH[1][1] * static_cast<double>(kf->Pk[1][0])) / S);
+        kf->P[1][1] = static_cast<int32_t>(
+            (I_KH[1][0] * static_cast<double>(kf->Pk[0][1]) + I_KH[1][1] * static_cast<double>(kf->Pk[1][1])) / S);
+    }
+#else  // KF_USE_PARTIAL_64F
+    {
+        mul_matmat_32i(kf->P, I_KH, kf->Pk);
+        kf->P[0][0] /= S;
+        kf->P[0][1] /= S;
+        kf->P[1][0] /= S;
+        kf->P[1][1] /= S;
+    }
+#endif // KF_USE_PARTIAL_64F
+
+    // 9. return result
+    (*x) = kf->X[0];
+}
+
+}; // namespace vas
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/kalman_filter/kalman_filter_no_opencv.hpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/kalman_filter/kalman_filter_no_opencv.hpp
new file mode 100644
index 000000000000..1f8cd4cf3517
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/kalman_filter/kalman_filter_no_opencv.hpp
@@ -0,0 +1,91 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_OT_KALMAN_FILTER_NO_OPENCV_HPP
+#define VAS_OT_KALMAN_FILTER_NO_OPENCV_HPP
+
+#include <vas/common.hpp>
+
+#include <opencv2/core.hpp>
+
+const float kMeasurementNoiseCoordinate = 0.001f;
+
+const float kMeasurementNoiseRectSize = 0.002f;
+
+namespace vas {
+
+/*
+ * This class implements a kernel of a standard kalman filter without using of OpenCV.
+ * It supplies simple and common APIs to be use by all components.
+ *
+ */
+class KalmanFilterNoOpencv {
+  public:
+    /** @brief Create & initialize KalmanFilterNoOpencv
+     *      This function initializes Kalman filter with a spectific value of the ratio for measurement noise covariance
+     * matrix. If you consider the detection method is enough reliable, it is recommended to use lower ratio value than
+     * the default value.
+     * @code
+     *      cv::Rect2f input_rect(50.f, 50.f, 100.f, 100.f);
+     *      cv::Rect2f predicted, corrected;
+     *      vas::KalmanFilter kalman_filter = new vas::KalmanFilter(input_rect);
+     *      predicted = kalman_filter->Predict();
+     *      corrected = kalman_filter->Correct(cv::Rect(52, 52, 105, 105));
+     *      delete kalman_filter;
+     * @endcode
+     * @param
+     *      initial_rect                        Initial rectangular coordinates
+     */
+    explicit KalmanFilterNoOpencv(const cv::Rect2f &initial_rect);
+    KalmanFilterNoOpencv() = delete;
+
+    KalmanFilterNoOpencv(const KalmanFilterNoOpencv &) = delete;
+    KalmanFilterNoOpencv &operator=(const KalmanFilterNoOpencv &) = delete;
+
+    /* @brief Destroy Kalman filter kernel
+     */
+    ~KalmanFilterNoOpencv() = default;
+
+    /*
+     * This function computes a predicted state.
+     * input 'delta_t' is not used.
+     */
+    cv::Rect2f Predict(float delta_t = 0.033f);
+
+    /*
+     * This function updates the predicted state from the measurement.
+     */
+    cv::Rect2f Correct(const cv::Rect2f &detect_rect);
+
+  private:
+    struct kalmanfilter1d32i {
+        int32_t X[2];
+        int32_t P[2][2];
+        int32_t Q[2][2];
+        int32_t R;
+
+        int32_t Pk[2][2]; // buffer to copy from Pk-1 to Pk
+        int32_t Xk[2];    // buffer to copy form Xk-1 to Xk
+    };
+
+    void kalmanfilter1d32i_init(kalmanfilter1d32i *kf, int32_t *z, int32_t var);
+    void kalmanfilter1d32i_predict_phase(kalmanfilter1d32i *kf, float dt);
+    void kalmanfilter1d32i_update_phase(kalmanfilter1d32i *kf, int32_t z, int32_t *x);
+    void kalmanfilter1d32i_filter(kalmanfilter1d32i *kf, int32_t *z, int32_t dt, int32_t *x);
+
+    kalmanfilter1d32i kfX;
+    kalmanfilter1d32i kfY;
+    kalmanfilter1d32i kfRX;
+    kalmanfilter1d32i kfRY;
+
+    float noise_ratio_coordinates_;
+    float noise_ratio_rect_size_;
+    float delta_t_;
+};
+
+}; // namespace vas
+
+#endif // VAS_OT_KALMAN_FILTER_NO_OPENCV_HPP
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/hungarian_wrap.cpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/hungarian_wrap.cpp
new file mode 100644
index 000000000000..ee6273e4f9de
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/hungarian_wrap.cpp
@@ -0,0 +1,325 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#include "hungarian_wrap.hpp"
+#include "../../../common/exception.hpp"
+
+#include <vas/common.hpp>
+
+#include <stdint.h>
+#include <stdio.h>
+
+const float kHungarianValueScale = 1024.0f;
+namespace vas {
+namespace ot {
+
+HungarianAlgo::HungarianAlgo(const cv::Mat_<float> &cost_map)
+    : size_width_(cost_map.cols), size_height_(cost_map.rows), int_cost_map_rows_(), int_cost_map_(), problem_() {
+    // Convert float 2D cost matrix into int32_t** 2D array with scaling
+    int_cost_map_rows_.resize(size_height_, nullptr);
+    int_cost_map_.create(size_height_, size_width_);
+    for (int32_t r = 0; r < size_height_; ++r) {
+        int_cost_map_rows_[r] = reinterpret_cast<int32_t *>(int_cost_map_.ptr(r));
+
+        for (int32_t c = 0; c < size_width_; ++c)
+            int_cost_map_(r, c) = static_cast<int32_t>(cost_map(r, c) * kHungarianValueScale);
+    }
+}
+
+HungarianAlgo::~HungarianAlgo() {
+    FreeHungarian();
+}
+
+cv::Mat_<uint8_t> HungarianAlgo::Solve() {
+    ETHROW(size_height_ > 0 && size_width_ > 0, invalid_argument, "Initialized with invalid cost_map size in Solve");
+
+    // Initialize the gungarian_problem using the cost matrix
+    cv::Mat_<uint8_t> assignment_map;
+    int32_t matrix_size = InitHungarian(kHungarianModeMinimizeCost);
+
+    // Solve the assignement problem
+    SolveHungarian();
+
+    // Copy assignment map
+    assignment_map.create(matrix_size, matrix_size);
+    for (int32_t r = 0; r < matrix_size; ++r) {
+        for (int32_t c = 0; c < matrix_size; ++c) {
+            (assignment_map)(r, c) = static_cast<uint8_t>(problem_.assignment[r][c]);
+        }
+    }
+
+    // Free used memory
+    FreeHungarian();
+
+    return assignment_map;
+}
+
+// Returns the row size of the assignment matrix
+int32_t HungarianAlgo::InitHungarian(int32_t mode) {
+    int32_t max_cost = 0;
+    int32_t **cost_matrix = &int_cost_map_rows_[0];
+
+    // Is the number of cols  not equal to number of size_height_ : if yes, expand with 0-size_width_ / 0-size_width_
+    ETHROW(size_height_ > 0 && size_width_ > 0, invalid_argument,
+           "Initialized with invalid cost_map size in InitHungarian");
+    problem_.num_rows = (size_width_ < size_height_) ? size_height_ : size_width_;
+    problem_.num_cols = problem_.num_rows;
+
+    problem_.cost.resize(problem_.num_rows);
+    problem_.assignment.resize(problem_.num_rows);
+
+    for (int32_t i = 0; i < problem_.num_rows; ++i) {
+        problem_.cost[i].resize(problem_.num_cols, 0);
+        problem_.assignment[i].resize(problem_.num_cols, 0);
+    }
+
+    for (int32_t i = 0; i < problem_.num_rows; ++i) {
+        for (int32_t j = 0; j < problem_.num_cols; ++j) {
+            problem_.cost[i][j] = (i < size_height_ && j < size_width_) ? cost_matrix[i][j] : 0;
+            problem_.assignment[i][j] = 0;
+
+            if (max_cost < problem_.cost[i][j])
+                max_cost = problem_.cost[i][j];
+        }
+    }
+
+    if (mode == kHungarianModeMaximizeUtil) {
+        for (int32_t i = 0; i < problem_.num_rows; ++i) {
+            for (int32_t j = 0; j < problem_.num_cols; ++j) {
+                problem_.cost[i][j] = max_cost - problem_.cost[i][j];
+            }
+        }
+    } else if (mode == kHungarianModeMinimizeCost) {
+        // Nothing to do
+    } else {
+        TRACE(" Unknown mode. Mode was set to HUNGARIAN_MODE_MINIMIZE_COST");
+    }
+
+    return problem_.num_rows;
+}
+
+//
+//
+void HungarianAlgo::FreeHungarian() {
+}
+
+void HungarianAlgo::SolveHungarian() {
+    int32_t k = 0;
+    int32_t l = 0;
+    int32_t unmatched = 0;
+
+    ETHROW(problem_.cost.size() != 0 && problem_.assignment.size() != 0, logic_error, "Unexpected solve");
+
+    std::unique_ptr<int32_t[]> vert_col(new int32_t[problem_.num_rows]);
+    std::unique_ptr<int32_t[]> row_unselected(new int32_t[problem_.num_rows]);
+    std::unique_ptr<int32_t[]> row_dec(new int32_t[problem_.num_rows]);
+    std::unique_ptr<int32_t[]> row_slack(new int32_t[problem_.num_rows]);
+
+    std::unique_ptr<int32_t[]> vert_row(new int32_t[problem_.num_cols]);
+    std::unique_ptr<int32_t[]> parent_row(new int32_t[problem_.num_cols]);
+    std::unique_ptr<int32_t[]> col_inc(new int32_t[problem_.num_cols]);
+    std::unique_ptr<int32_t[]> slack(new int32_t[problem_.num_cols]);
+
+    for (int32_t i = 0; i < problem_.num_rows; ++i) {
+        vert_col[i] = 0;
+        row_unselected[i] = 0;
+        row_dec[i] = 0;
+        row_slack[i] = 0;
+    }
+
+    for (int32_t i = 0; i < problem_.num_cols; ++i) {
+        vert_row[i] = 0;
+        parent_row[i] = 0;
+        col_inc[i] = 0;
+        slack[i] = 0;
+    }
+
+    for (int32_t i = 0; i < problem_.num_rows; ++i)
+        for (int32_t j = 0; j < problem_.num_cols; ++j)
+            problem_.assignment[i][j] = kHungarianNotAssigned;
+
+    // Begin subtract column minima in order to start with lots of zeroes 12
+    TRACE(" Using heuristic");
+
+    for (int32_t i = 0; i < problem_.num_cols; ++i) {
+        int32_t s = problem_.cost[0][i];
+        for (int32_t j = 1; j < problem_.num_rows; ++j) {
+            if (problem_.cost[j][i] < s)
+                s = problem_.cost[j][i];
+        }
+
+        if (s != 0) {
+            for (int32_t j = 0; j < problem_.num_rows; ++j)
+                problem_.cost[j][i] -= s;
+        }
+    }
+    // End subtract column minima in order to start with lots of zeroes 12
+
+    // Begin initial state 16
+    int32_t t = 0;
+    for (int32_t i = 0; i < problem_.num_cols; ++i) {
+        vert_row[i] = -1;
+        parent_row[i] = -1;
+        col_inc[i] = 0;
+        slack[i] = kIntMax;
+    }
+
+    for (k = 0; k < problem_.num_rows; ++k) {
+        bool is_row_done = false;
+        int32_t s = problem_.cost[k][0];
+        for (l = 1; l < problem_.num_cols; ++l) {
+            if (problem_.cost[k][l] < s)
+                s = problem_.cost[k][l];
+        }
+        row_dec[k] = s;
+
+        for (l = 0; l < problem_.num_cols; ++l) {
+            if (s == problem_.cost[k][l] && vert_row[l] < 0) {
+                vert_col[k] = l;
+                vert_row[l] = k;
+                TRACE(" Matching col (%d)==row (%d)", l, k);
+
+                is_row_done = true;
+                break;
+            }
+        }
+
+        if (is_row_done == true) {
+            continue;
+        } else {
+            vert_col[k] = -1;
+            TRACE(" Node %d: unmatched row %d", t, k);
+            row_unselected[t++] = k;
+        }
+    }
+    // End initial state 16
+
+    // Begin Hungarian algorithm 18
+    if (t == 0)
+        goto done;
+
+    unmatched = t;
+    while (1) {
+        TRACE("Matched %d rows.", problem_.num_rows - t);
+        int32_t q = 0;
+        while (1) {
+            while (q < t) {
+                // Begin explore node q of the forest 19
+                k = row_unselected[q];
+                int32_t s = row_dec[k];
+                for (l = 0; l < problem_.num_cols; ++l) {
+                    if (slack[l] == 0)
+                        continue;
+
+                    int32_t del = problem_.cost[k][l] - s + col_inc[l];
+                    if (del >= slack[l])
+                        continue;
+
+                    if (del == 0) {
+                        if (vert_row[l] < 0)
+                            goto leave_break_thru;
+                        slack[l] = 0;
+                        parent_row[l] = k;
+                        TRACE("node %d: row %d==col %d--row %d", t, vert_row[l], l, k);
+
+                        row_unselected[t++] = vert_row[l];
+                    } else {
+                        slack[l] = del;
+                        row_slack[l] = k;
+                    }
+                }
+                // End explore node q of the forest 19
+
+                q++;
+            }
+
+            // Begin introduce a new zero into the matrix 21
+            int32_t s = kIntMax;
+            for (int32_t i = 0; i < problem_.num_cols; ++i) {
+                if (slack[i] && slack[i] < s)
+                    s = slack[i];
+            }
+
+            for (q = 0; q < t; ++q) {
+                row_dec[row_unselected[q]] += s;
+            }
+
+            for (l = 0; l < problem_.num_cols; ++l) {
+                if (slack[l]) {
+                    slack[l] -= s;
+                    if (slack[l] == 0) {
+                        // Begin look at a new zero 22
+                        k = row_slack[l];
+                        TRACE("Decreasing uncovered elements by %d produces zero at [%d,%d]", s, k, l);
+                        if (vert_row[l] < 0) {
+                            for (int32_t j = l + 1; j < problem_.num_cols; ++j) {
+                                if (slack[j] == 0)
+                                    col_inc[j] += s;
+                            }
+
+                            goto leave_break_thru;
+                        } else {
+                            parent_row[l] = k;
+                            TRACE("node %d: row %d==col %d--row %d", t, vert_row[l], l, k);
+                            row_unselected[t++] = vert_row[l];
+                        }
+                        // End look at a new zero 22
+                    }
+                } else {
+                    col_inc[l] += s;
+                }
+            }
+            // End introduce a new zero into the matrix 21
+        } // while (1)
+
+    leave_break_thru:
+        TRACE("Breakthrough at node %d of %d!", q, t);
+        while (1) {
+            int32_t j = vert_col[k];
+            vert_col[k] = l;
+            vert_row[l] = k;
+            TRACE("rematching col %d==row %d", l, k);
+            if (j < 0)
+                break;
+
+            k = parent_row[j];
+            l = j;
+        }
+
+        // End update the matching 20
+        if (--unmatched == 0)
+            goto done;
+
+        // Begin get ready for another stage 17
+        t = 0;
+        for (int32_t i = 0; i < problem_.num_cols; ++i) {
+            parent_row[i] = -1;
+            slack[i] = kIntMax;
+        }
+
+        for (int32_t i = 0; i < problem_.num_rows; ++i) {
+            if (vert_col[i] < 0) {
+                TRACE(" Node %d: unmatched row %d", t, i);
+                row_unselected[t++] = i;
+            }
+        }
+        // End get ready for another stage 17
+    }
+
+done:
+    for (int32_t i = 0; i < problem_.num_rows; ++i) {
+        problem_.assignment[i][vert_col[i]] = kHungarianAssigned;
+    }
+
+    for (int32_t i = 0; i < problem_.num_rows; ++i) {
+        for (int32_t j = 0; j < problem_.num_cols; ++j) {
+            problem_.cost[i][j] = problem_.cost[i][j] - row_dec[i] + col_inc[j];
+        }
+    }
+}
+
+}; // namespace ot
+}; // namespace vas
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/hungarian_wrap.hpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/hungarian_wrap.hpp
new file mode 100644
index 000000000000..80a9dddaffb4
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/hungarian_wrap.hpp
@@ -0,0 +1,71 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_OT_HUNGARIAN_WRAP_HPP
+#define VAS_OT_HUNGARIAN_WRAP_HPP
+
+#include <opencv2/core.hpp>
+
+#include <cstdint>
+#include <vector>
+
+namespace vas {
+namespace ot {
+
+const int32_t kHungarianModeMinimizeCost = 0;
+const int32_t kHungarianModeMaximizeUtil = 1;
+
+typedef struct {
+    int32_t num_rows;
+    int32_t num_cols;
+
+    std::vector<std::vector<int32_t>> cost;
+    std::vector<std::vector<int32_t>> assignment;
+} hungarian_problem_t;
+
+class HungarianAlgo {
+  public:
+    explicit HungarianAlgo(const cv::Mat_<float> &cost_map);
+    ~HungarianAlgo();
+
+    cv::Mat_<uint8_t> Solve();
+
+    HungarianAlgo() = delete;
+    HungarianAlgo(const HungarianAlgo &) = delete;
+    HungarianAlgo(HungarianAlgo &&) = delete;
+    HungarianAlgo &operator=(const HungarianAlgo &) = delete;
+    HungarianAlgo &operator=(HungarianAlgo &&) = delete;
+
+  protected:
+    /*  This method initializes the hungarian_problem structure and the  cost matrices (missing lines or columns are
+     *filled with 0). It returns the size of the quadratic(!) assignment matrix.
+     **/
+    int32_t InitHungarian(int32_t mode);
+
+    // Computes the optimal assignment
+    void SolveHungarian();
+
+    // Free the memory allocated by Init
+    void FreeHungarian();
+
+    int32_t size_width_;
+    int32_t size_height_;
+
+  private:
+    const int32_t kHungarianNotAssigned = 0;
+    const int32_t kHungarianAssigned = 1;
+    const int32_t kIntMax = INT_MAX;
+
+    std::vector<int32_t *> int_cost_map_rows_;
+    cv::Mat_<int32_t> int_cost_map_;
+
+    hungarian_problem_t problem_;
+};
+
+}; // namespace ot
+}; // namespace vas
+
+#endif // VAS_OT_HUNGARIAN_WRAP_HPP
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/objects_associator.cpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/objects_associator.cpp
new file mode 100644
index 000000000000..f2ad032dcbef
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/objects_associator.cpp
@@ -0,0 +1,183 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#include "objects_associator.hpp"
+#include "hungarian_wrap.hpp"
+#include "rgb_histogram.hpp"
+#include "../prof_def.hpp"
+#include "../../../common/exception.hpp"
+
+namespace vas {
+namespace ot {
+
+const float kAssociationCostThreshold = 1.0f;
+const float kRgbHistDistScale = 0.25f;
+const float kNormCenterDistScale = 0.5f;
+const float kNormShapeDistScale = 0.75f;
+
+ObjectsAssociator::ObjectsAssociator(bool tracking_per_class) : tracking_per_class_(tracking_per_class) {
+}
+
+ObjectsAssociator::~ObjectsAssociator() {
+}
+
+std::pair<std::vector<bool>, std::vector<int32_t>>
+ObjectsAssociator::Associate(const std::vector<Detection> &detections,
+                             const std::vector<std::shared_ptr<Tracklet>> &tracklets,
+                             const std::vector<cv::Mat> *detection_rgb_features) {
+    PROF_START(PROF_COMPONENTS_OT_ASSOCIATE_COMPUTE_DIST_TABLE);
+    std::vector<std::vector<float>> d2t_rgb_dist_table;
+
+    if (detection_rgb_features != nullptr) {
+        d2t_rgb_dist_table = ComputeRgbDistance(detections, tracklets, detection_rgb_features);
+    }
+
+    auto n_detections = detections.size();
+    auto n_tracklets = tracklets.size();
+
+    std::vector<bool> d_is_associated(n_detections, false);
+    std::vector<int32_t> t_associated_d_index(n_tracklets, -1);
+
+    // Compute detection-tracklet normalized position distance table
+    std::vector<std::vector<float>> d2t_pos_dist_table(n_detections, std::vector<float>(n_tracklets, 1000.0f));
+    for (std::size_t d = 0; d < n_detections; ++d) {
+        TRACE("input detect(%.0f,%.0f %.0fx%.0f)", detections[d].rect.x, detections[d].rect.y, detections[d].rect.width,
+              detections[d].rect.height);
+        for (std::size_t t = 0; t < n_tracklets; ++t) {
+            if (tracking_per_class_ && (detections[d].class_label != tracklets[t]->label))
+                continue;
+
+            d2t_pos_dist_table[d][t] = NormalizedCenterDistance(detections[d].rect, tracklets[t]->trajectory.back());
+        }
+    }
+
+    // Compute detection-tracklet normalized shape distance table
+    std::vector<std::vector<float>> d2t_shape_dist_table(n_detections, std::vector<float>(n_tracklets, 1000.0f));
+    for (std::size_t d = 0; d < n_detections; ++d) {
+        for (std::size_t t = 0; t < n_tracklets; ++t) {
+            if (tracking_per_class_ && (detections[d].class_label != tracklets[t]->label))
+                continue;
+
+            d2t_shape_dist_table[d][t] = NormalizedShapeDistance(detections[d].rect, tracklets[t]->trajectory.back());
+        }
+    }
+    PROF_END(PROF_COMPONENTS_OT_ASSOCIATE_COMPUTE_DIST_TABLE);
+
+    PROF_START(PROF_COMPONENTS_OT_ASSOCIATE_COMPUTE_COST_TABLE);
+    // Compute detection-tracklet association cost table
+    cv::Mat_<float> d2t_cost_table;
+    d2t_cost_table.create(static_cast<int32_t>(detections.size()),
+                          static_cast<int32_t>(tracklets.size() + detections.size()));
+    d2t_cost_table = kAssociationCostThreshold + 1.0f;
+
+    for (std::size_t t = 0; t < n_tracklets; ++t) {
+        const auto &tracklet = tracklets[t];
+        float rgb_hist_dist_scale = kRgbHistDistScale;
+
+        float const_ratio = 0.95f;
+        float norm_center_dist_scale =
+            (1.0f - const_ratio) * kNormCenterDistScale * tracklet->association_delta_t / 0.033f +
+            const_ratio * kNormCenterDistScale; // adaptive to delta_t
+        float norm_shape_dist_scale =
+            (1.0f - const_ratio) * kNormShapeDistScale * tracklet->association_delta_t / 0.033f +
+            const_ratio * kNormShapeDistScale; // adaptive to delta_t
+        float log_term = logf(rgb_hist_dist_scale * norm_center_dist_scale * norm_shape_dist_scale);
+
+        for (std::size_t d = 0; d < n_detections; ++d) {
+            if (tracking_per_class_ && (detections[d].class_label != tracklets[t]->label))
+                continue;
+
+            d2t_cost_table(static_cast<int32_t>(d), static_cast<int32_t>(t)) =
+                log_term + d2t_pos_dist_table[d][t] / norm_center_dist_scale +
+                d2t_shape_dist_table[d][t] / norm_shape_dist_scale;
+
+            if (d2t_rgb_dist_table.empty() == false) {
+                d2t_cost_table(static_cast<int32_t>(d), static_cast<int32_t>(t)) +=
+                    d2t_rgb_dist_table[d][t] / kRgbHistDistScale;
+            }
+        }
+    }
+
+    for (std::size_t d = 0; d < n_detections; ++d) {
+        d2t_cost_table(static_cast<int32_t>(d), static_cast<int32_t>(d + n_tracklets)) =
+            kAssociationCostThreshold;
+    }
+    PROF_END(PROF_COMPONENTS_OT_ASSOCIATE_COMPUTE_COST_TABLE);
+
+    // Solve detection-tracking association using Hungarian algorithm
+    PROF_START(PROF_COMPONENTS_OT_ASSOCIATE_WITH_HUNGARIAN);
+    HungarianAlgo hungarian(d2t_cost_table);
+    cv::Mat_<uint8_t> d2t_assign_table = hungarian.Solve();
+    PROF_END(PROF_COMPONENTS_OT_ASSOCIATE_WITH_HUNGARIAN);
+
+    for (std::size_t d = 0; d < n_detections; ++d) {
+        for (std::size_t t = 0; t < n_tracklets; ++t) {
+            if (d2t_assign_table(static_cast<int32_t>(d), static_cast<int32_t>(t))) {
+                d_is_associated[d] = true;
+                t_associated_d_index[t] = static_cast<int32_t>(d);
+                break;
+            }
+        }
+    }
+
+    return std::make_pair(d_is_associated, t_associated_d_index);
+}
+
+std::vector<std::vector<float>>
+ObjectsAssociator::ComputeRgbDistance(const std::vector<Detection> &detections,
+                                      const std::vector<std::shared_ptr<Tracklet>> &tracklets,
+                                      const std::vector<cv::Mat> *detection_rgb_features) {
+    auto n_detections = detections.size();
+    auto n_tracklets = tracklets.size();
+
+    // Compute detection-tracklet RGB feature distance table
+    std::vector<std::vector<float>> d2t_rgb_dist_table(n_detections, std::vector<float>(n_tracklets, 1000.0f));
+    for (std::size_t d = 0; d < n_detections; ++d) {
+        const auto &d_rgb_feature = (*detection_rgb_features)[d];
+        for (std::size_t t = 0; t < n_tracklets; ++t) {
+            if (tracking_per_class_ && (detections[d].class_label != tracklets[t]->label))
+                continue;
+
+            // Find best match in rgb feature history
+            float min_dist = 1000.0f;
+            for (const auto &t_rgb_feature : *(tracklets[t]->GetRgbFeatures())) {
+                min_dist = std::min(min_dist, 1.0f - RgbHistogram::ComputeSimilarity(d_rgb_feature, t_rgb_feature));
+            }
+            d2t_rgb_dist_table[d][t] = min_dist;
+        }
+    }
+
+    return d2t_rgb_dist_table;
+}
+
+float ObjectsAssociator::NormalizedCenterDistance(const cv::Rect2f &r1, const cv::Rect2f &r2) {
+    float normalizer = std::min(0.5f * (r1.width + r1.height), 0.5f * (r2.width + r2.height));
+
+    float r1x = r1.x + 0.5f * r1.width;
+    float r1y = r1.y + 0.5f * r1.height;
+    float r2x = r2.x + 0.5f * r2.width;
+    float r2y = r2.y + 0.5f * r2.height;
+    float dx = (r2x - r1x) / normalizer;
+    float dy = (r2y - r1y) / normalizer;
+    return std::sqrt(dx * dx + dy * dy);
+}
+
+float ObjectsAssociator::NormalizedShapeDistance(const cv::Rect2f &r1, const cv::Rect2f &r2) {
+    int32_t normalize_w = int32_t(r1.width);
+    int32_t normalize_h = int32_t(r1.height);
+
+    if (r2.width + r2.height < r1.width + r1.height) {
+        normalize_w = int32_t(r2.width);
+        normalize_h = int32_t(r2.height);
+    }
+
+    float dw = (r2.width - r1.width) / normalize_w;
+    float dh = (r2.height - r1.height) / normalize_h;
+    return std::sqrt(dw * dw + dh * dh);
+}
+
+}; // namespace ot
+}; // namespace vas
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/objects_associator.hpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/objects_associator.hpp
new file mode 100644
index 000000000000..ebdb573ffbe7
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/objects_associator.hpp
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_OT_OBJECTS_ASSOCIATOR_HPP
+#define VAS_OT_OBJECTS_ASSOCIATOR_HPP
+
+#include "../tracklet.hpp"
+
+namespace vas {
+namespace ot {
+
+class ObjectsAssociator {
+  public:
+    explicit ObjectsAssociator(bool tracking_per_class);
+    virtual ~ObjectsAssociator();
+    ObjectsAssociator() = delete;
+
+  public:
+    std::pair<std::vector<bool>, std::vector<int32_t>>
+    Associate(const std::vector<Detection> &detections, const std::vector<std::shared_ptr<Tracklet>> &tracklets,
+              const std::vector<cv::Mat> *detection_rgb_features = nullptr);
+
+  private:
+    std::vector<std::vector<float>> ComputeRgbDistance(const std::vector<Detection> &detections,
+                                                       const std::vector<std::shared_ptr<Tracklet>> &tracklets,
+                                                       const std::vector<cv::Mat> *detection_rgb_features);
+
+    static float NormalizedCenterDistance(const cv::Rect2f &r1, const cv::Rect2f &r2);
+    static float NormalizedShapeDistance(const cv::Rect2f &r1, const cv::Rect2f &r2);
+
+  private:
+    bool tracking_per_class_;
+};
+
+}; // namespace ot
+}; // namespace vas
+
+#endif // VAS_OT_OBJECTS_ASSOCIATOR_HPP
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/rgb_histogram.cpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/rgb_histogram.cpp
new file mode 100644
index 000000000000..bd121fbf4940
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/rgb_histogram.cpp
@@ -0,0 +1,126 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#include "rgb_histogram.hpp"
+
+namespace vas {
+namespace ot {
+
+RgbHistogram::RgbHistogram(int32_t rgb_bin_size)
+    : rgb_bin_size_(rgb_bin_size), rgb_num_bins_(256 / rgb_bin_size),
+      rgb_hist_size_(static_cast<int32_t>(pow(rgb_num_bins_, 3))) {
+}
+
+RgbHistogram::~RgbHistogram(void) {
+}
+
+void RgbHistogram::Compute(const cv::Mat &image, cv::Mat *hist) {
+    // Init output buffer
+    hist->create(1, rgb_hist_size_, CV_32FC1);
+    (*hist) = cv::Scalar(0);
+    float *hist_data = hist->ptr<float>();
+
+    // Compute quantized RGB histogram
+    AccumulateRgbHistogram(image, hist_data);
+}
+
+void RgbHistogram::ComputeFromBgra32(const cv::Mat &image, cv::Mat *hist) {
+    // Init output buffer
+    hist->create(1, rgb_hist_size_, CV_32FC1);
+    (*hist) = cv::Scalar(0);
+    float *hist_data = hist->ptr<float>();
+
+    // Compute quantized RGB histogram
+    AccumulateRgbHistogramFromBgra32(image, hist_data);
+}
+
+int32_t RgbHistogram::FeatureSize(void) const {
+    return rgb_hist_size_;
+}
+
+float RgbHistogram::ComputeSimilarity(const cv::Mat &hist1, const cv::Mat &hist2) {
+    // PROF_START(PROF_COMPONENTS_OT_SHORTTERM_HIST_SIMILARITY);
+    // Bhattacharyya coeff (w/o weights)
+    const float eps = 0.0001f;
+    const int32_t hist_size = hist1.cols;
+    const float *hist_data1 = hist1.ptr<float>();
+    const float *hist_data2 = hist2.ptr<float>();
+    float corr = 0.0f;
+    float sum1 = 0.0f;
+    float sum2 = 0.0f;
+    for (int32_t i = 0; i < hist_size; ++i) {
+        float v1 = hist_data1[i];
+        float v2 = hist_data2[i];
+        corr += sqrtf(v1 * v2);
+        sum1 += v1;
+        sum2 += v2;
+    }
+
+    // PROF_END(PROF_COMPONENTS_OT_SHORTTERM_HIST_SIMILARITY);
+    if (sum1 > eps && sum2 > eps) {
+        return corr / sqrtf(sum1 * sum2);
+    } else {
+        return 0.0f;
+    }
+}
+
+void RgbHistogram::AccumulateRgbHistogram(const cv::Mat &patch, float *rgb_hist) const {
+    for (int32_t y = 0; y < patch.rows; ++y) {
+        const cv::Vec3b *patch_ptr = patch.ptr<cv::Vec3b>(y);
+        for (int32_t x = 0; x < patch.cols; ++x) {
+            int32_t index0 = patch_ptr[x][0] / rgb_bin_size_;
+            int32_t index1 = patch_ptr[x][1] / rgb_bin_size_;
+            int32_t index2 = patch_ptr[x][2] / rgb_bin_size_;
+            int32_t hist_index = rgb_num_bins_ * (rgb_num_bins_ * index0 + index1) + index2;
+            rgb_hist[hist_index] += 1.0f;
+        }
+    }
+}
+
+void RgbHistogram::AccumulateRgbHistogram(const cv::Mat &patch, const cv::Mat &weight, float *rgb_hist) const {
+    for (int32_t y = 0; y < patch.rows; ++y) {
+        const cv::Vec3b *patch_ptr = patch.ptr<cv::Vec3b>(y);
+        const float *weight_ptr = weight.ptr<float>(y);
+        for (int32_t x = 0; x < patch.cols; ++x) {
+            int32_t index0 = patch_ptr[x][0] / rgb_bin_size_;
+            int32_t index1 = patch_ptr[x][1] / rgb_bin_size_;
+            int32_t index2 = patch_ptr[x][2] / rgb_bin_size_;
+            int32_t hist_index = rgb_num_bins_ * (rgb_num_bins_ * index0 + index1) + index2;
+            rgb_hist[hist_index] += weight_ptr[x];
+        }
+    }
+}
+
+void RgbHistogram::AccumulateRgbHistogramFromBgra32(const cv::Mat &patch, float *rgb_hist) const {
+    for (int32_t y = 0; y < patch.rows; ++y) {
+        const cv::Vec4b *patch_ptr = patch.ptr<cv::Vec4b>(y);
+        for (int32_t x = 0; x < patch.cols; ++x) {
+            int32_t index0 = patch_ptr[x][0] / rgb_bin_size_;
+            int32_t index1 = patch_ptr[x][1] / rgb_bin_size_;
+            int32_t index2 = patch_ptr[x][2] / rgb_bin_size_;
+            int32_t hist_index = rgb_num_bins_ * (rgb_num_bins_ * index0 + index1) + index2;
+            rgb_hist[hist_index] += 1.0f;
+        }
+    }
+}
+
+void RgbHistogram::AccumulateRgbHistogramFromBgra32(const cv::Mat &patch, const cv::Mat &weight,
+                                                    float *rgb_hist) const {
+    for (int32_t y = 0; y < patch.rows; ++y) {
+        const cv::Vec4b *patch_ptr = patch.ptr<cv::Vec4b>(y);
+        const float *weight_ptr = weight.ptr<float>(y);
+        for (int32_t x = 0; x < patch.cols; ++x) {
+            int32_t index0 = patch_ptr[x][0] / rgb_bin_size_;
+            int32_t index1 = patch_ptr[x][1] / rgb_bin_size_;
+            int32_t index2 = patch_ptr[x][2] / rgb_bin_size_;
+            int32_t hist_index = rgb_num_bins_ * (rgb_num_bins_ * index0 + index1) + index2;
+            rgb_hist[hist_index] += weight_ptr[x];
+        }
+    }
+}
+
+}; // namespace ot
+}; // namespace vas
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/rgb_histogram.hpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/rgb_histogram.hpp
new file mode 100644
index 000000000000..9a7847509b16
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/mtt/rgb_histogram.hpp
@@ -0,0 +1,42 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_OT_RGB_HISTOGRAM_HPP
+#define VAS_OT_RGB_HISTOGRAM_HPP
+
+#include <opencv2/core.hpp>
+#include <cstdint>
+
+namespace vas {
+namespace ot {
+
+class RgbHistogram {
+  public:
+    explicit RgbHistogram(int32_t rgb_bin_size);
+    virtual ~RgbHistogram(void);
+
+    virtual void Compute(const cv::Mat &image, cv::Mat *hist);
+    virtual void ComputeFromBgra32(const cv::Mat &image, cv::Mat *hist);
+    virtual int32_t FeatureSize(void) const; // currently 512 * float32
+
+    static float ComputeSimilarity(const cv::Mat &hist1, const cv::Mat &hist2);
+
+  protected:
+    int32_t rgb_bin_size_;
+    int32_t rgb_num_bins_;
+    int32_t rgb_hist_size_;
+
+    void AccumulateRgbHistogram(const cv::Mat &patch, float *rgb_hist) const;
+    void AccumulateRgbHistogram(const cv::Mat &patch, const cv::Mat &weight, float *rgb_hist) const;
+
+    void AccumulateRgbHistogramFromBgra32(const cv::Mat &patch, float *rgb_hist) const;
+    void AccumulateRgbHistogramFromBgra32(const cv::Mat &patch, const cv::Mat &weight, float *rgb_hist) const;
+};
+
+}; // namespace ot
+}; // namespace vas
+
+#endif // VAS_OT_RGB_HISTOGRAM_HPP
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/object_tracker.cpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/object_tracker.cpp
new file mode 100644
index 000000000000..6e55ce5ddddd
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/object_tracker.cpp
@@ -0,0 +1,363 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#include "prof_def.hpp"
+#include "tracker.hpp"
+#include "../../common/exception.hpp"
+
+#include <vas/ot.hpp>
+#include <vas/common.hpp>
+
+#include <memory>
+
+namespace vas {
+namespace ot {
+const float kDefaultDeltaTime = 0.033f;
+const int kDefaultNumThreads = 1;
+const char kNameMaxNumThreads[] = "max_num_threads";
+
+vas::Version GetVersion() noexcept {
+    vas::Version version(OT_VERSION_MAJOR, OT_VERSION_MINOR, OT_VERSION_PATCH);
+    return version;
+}
+
+std::ostream &operator<<(std::ostream &os, TrackingStatus ts) {
+    if (ts == TrackingStatus::NEW)
+        os << "NEW";
+    else if (ts == TrackingStatus::TRACKED)
+        os << "TRACKED";
+    // else if (ts == TrackingStatus::LOST)
+    else
+        os << "LOST";
+
+    return os;
+}
+
+std::ostream &operator<<(std::ostream &os, const Object &object) {
+    os << "Object:" << std::endl;
+    os << "    rect            -> " << object.rect << std::endl;
+    os << "    tracking id     -> " << object.tracking_id << std::endl;
+    os << "    class label     -> " << object.class_label << std::endl;
+    os << "    tracking status -> " << object.status;
+
+    return os;
+}
+
+// Internal implementation: includes OT component
+class ObjectTracker::Impl {
+  public:
+    class InitParameters : public vas::ot::Tracker::InitParameters {
+      public:
+        TrackingType tracking_type;
+        vas::BackendType backend_type;
+    };
+
+  public:
+    explicit Impl(const InitParameters &param);
+
+    Impl() = delete;
+    ~Impl();
+    Impl(const Impl &) = delete;
+    Impl(Impl &&) = delete;
+    Impl &operator=(const Impl &) = delete;
+    Impl &operator=(Impl &&) = delete;
+
+  public:
+    int32_t GetMaxNumObjects() const noexcept;
+    TrackingType GetTrackingType() const noexcept;
+    vas::ColorFormat GetInputColorFormat() const noexcept;
+    float GetDeltaTime() const noexcept;
+    vas::BackendType GetBackendType() const noexcept;
+    bool GetTrackingPerClass() const noexcept;
+    void SetDeltaTime(float delta_t);
+    std::vector<Object> Track(const cv::Mat &frame, const std::vector<DetectedObject> &objects);
+
+  private:
+    std::unique_ptr<vas::ot::Tracker> tracker_;
+    std::vector<std::shared_ptr<Tracklet>> produced_tracklets_;
+
+    int32_t max_num_objects_;
+    float delta_t_;
+    TrackingType tracking_type_;
+    vas::BackendType backend_type_;
+    vas::ColorFormat input_color_format_;
+    bool tracking_per_class_;
+#ifdef DUMP_OTAV
+    Otav otav_;
+#endif
+
+    friend class ObjectTracker::Builder;
+};
+
+namespace {
+void vas_exit() {
+}
+} // anonymous namespace
+
+ObjectTracker::ObjectTracker(ObjectTracker::Impl *impl) : impl_(impl) {
+    atexit(vas_exit);
+}
+
+ObjectTracker::~ObjectTracker() = default;
+
+int32_t ObjectTracker::GetMaxNumObjects() const noexcept {
+    return impl_->GetMaxNumObjects();
+}
+
+TrackingType ObjectTracker::GetTrackingType() const noexcept {
+    return impl_->GetTrackingType();
+}
+
+vas::ColorFormat ObjectTracker::GetInputColorFormat() const noexcept {
+    return impl_->GetInputColorFormat();
+}
+
+float ObjectTracker::GetFrameDeltaTime() const noexcept {
+    return impl_->GetDeltaTime();
+}
+
+vas::BackendType ObjectTracker::GetBackendType() const noexcept {
+    return impl_->GetBackendType();
+}
+
+bool ObjectTracker::GetTrackingPerClass() const noexcept {
+    return impl_->GetTrackingPerClass();
+}
+
+void ObjectTracker::SetFrameDeltaTime(float frame_delta_t) {
+    impl_->SetDeltaTime(frame_delta_t);
+}
+
+std::vector<Object> ObjectTracker::Track(const cv::Mat &frame, const std::vector<DetectedObject> &objects) {
+    return impl_->Track(frame, objects);
+}
+
+ObjectTracker::Impl::Impl(const InitParameters &param)
+    : max_num_objects_(param.max_num_objects), delta_t_(kDefaultDeltaTime), tracking_type_(param.tracking_type),
+      backend_type_(param.backend_type), input_color_format_(param.format),
+      tracking_per_class_(param.tracking_per_class) {
+    PROF_INIT(OT);
+    TRACE("BEGIN");
+    if ((param.max_num_objects) != -1 && (param.max_num_objects <= 0)) {
+        std::cout << "Error: Invalid maximum number of objects: " << param.max_num_objects << std::endl;
+        ETHROW(false, invalid_argument, "Invalid maximum number of objects");
+    }
+
+    TRACE("tracking_type: %d, backend_type: %d, color_format: %d, max_num_object: %d, tracking_per_class: %d",
+          static_cast<int32_t>(tracking_type_), static_cast<int32_t>(backend_type_),
+          static_cast<int32_t>(input_color_format_), max_num_objects_, tracking_per_class_);
+
+    if (param.backend_type == vas::BackendType::CPU) {
+        tracker_.reset(vas::ot::Tracker::CreateInstance(param));
+    } else {
+        std::cout << "Error: Unexpected backend type" << std::endl;
+        ETHROW(false, invalid_argument, "Unexpected backend type");
+    }
+
+    produced_tracklets_.clear();
+
+    TRACE("END");
+}
+
+ObjectTracker::Impl::~Impl() {
+    PROF_FLUSH(OT);
+}
+
+void ObjectTracker::Impl::SetDeltaTime(float delta_t) {
+    if (delta_t < 0.005f || delta_t > 0.5f) {
+        std::cout << "Error: Invalid argument for SetFrameDeltaTime " << delta_t << std::endl;
+        ETHROW(false, invalid_argument, "Invalid argument for SetFrameDeltaTime");
+    }
+
+    delta_t_ = delta_t;
+    return;
+}
+
+int32_t ObjectTracker::Impl::GetMaxNumObjects() const noexcept {
+    return max_num_objects_;
+}
+
+TrackingType ObjectTracker::Impl::GetTrackingType() const noexcept {
+    return tracking_type_;
+}
+
+vas::ColorFormat ObjectTracker::Impl::GetInputColorFormat() const noexcept {
+    return input_color_format_;
+}
+
+float ObjectTracker::Impl::GetDeltaTime() const noexcept {
+    return delta_t_;
+}
+
+vas::BackendType ObjectTracker::Impl::GetBackendType() const noexcept {
+    return backend_type_;
+}
+
+bool ObjectTracker::Impl::GetTrackingPerClass() const noexcept {
+    return tracking_per_class_;
+}
+
+std::vector<Object> ObjectTracker::Impl::Track(const cv::Mat &frame,
+                                               const std::vector<DetectedObject> &detected_objects) {
+    if (frame.cols <= 0 || frame.rows <= 0) {
+        std::cout << "Error: Invalid frame size(" << frame.cols << "x" << frame.rows << ") empty("
+                  << frame.empty() << ")" << std::endl;
+        ETHROW(false, invalid_argument, "Invalid frame size(%dx%d) empty(%d)\n", frame.cols, frame.rows, frame.empty());
+    }
+    int32_t frame_w = frame.cols;
+    int32_t frmae_h = (input_color_format_ == vas::ColorFormat::NV12) ? frame.rows * 2 / 3 : frame.rows;
+    cv::Rect frame_rect(0, 0, frame_w, frmae_h);
+
+    TRACE("START");
+    PROF_START(PROF_COMPONENTS_OT_RUN_TRACK);
+    std::vector<vas::ot::Detection> detections;
+
+    TRACE("+ Number: Detected objects (%d)", static_cast<int32_t>(detected_objects.size()));
+    int32_t index = 0;
+    for (const auto &object : detected_objects) {
+        vas::ot::Detection detection;
+
+        detection.class_label = object.class_label;
+        detection.rect = static_cast<cv::Rect2f>(object.rect);
+        detection.index = index;
+
+        detections.emplace_back(detection);
+        index++;
+    }
+
+    std::vector<Object> objects;
+    if (backend_type_ == vas::BackendType::CPU) {
+        tracker_->TrackObjects(frame, detections, &produced_tracklets_, delta_t_);
+        TRACE("+ Number: Tracking objects (%d)", static_cast<int32_t>(produced_tracklets_.size()));
+
+        for (const auto &tracklet : produced_tracklets_) // result 'Tracklet'
+        {
+            cv::Rect rect = static_cast<cv::Rect>(tracklet->trajectory_filtered.back());
+            if ((rect & frame_rect).area() > 0) {
+                Object object;
+                // TRACE("     - ID(%d) Status(%d)", tracklet.id, tracklet.status);
+                object.rect = static_cast<cv::Rect>(tracklet->trajectory_filtered.back());
+                object.tracking_id = tracklet->id;
+                object.class_label = tracklet->label;
+                object.association_idx = tracklet->association_idx;
+                object.status = vas::ot::TrackingStatus::LOST;
+                switch (tracklet->status) {
+                case ST_NEW:
+                    object.status = vas::ot::TrackingStatus::NEW;
+                    break;
+                case ST_TRACKED:
+                    object.status = vas::ot::TrackingStatus::TRACKED;
+                    break;
+                case ST_LOST:
+                default:
+                    object.status = vas::ot::TrackingStatus::LOST;
+                }
+                objects.emplace_back(object);
+            } else {
+                TRACE("[ %d, %d, %d, %d ] is out of the image bound! -> Filtered out.", rect.x, rect.y, rect.width,
+                      rect.height);
+            }
+        }
+    } else {
+        ETHROW(false, invalid_argument, "Unexpected input backend type for VAS-OT.")
+    }
+    TRACE("+ Number: Result objects (%d)", static_cast<int32_t>(objects.size()));
+
+    PROF_END(PROF_COMPONENTS_OT_RUN_TRACK);
+
+#ifdef DUMP_OTAV
+    otav_.Dump(frame, detections, produced_tracklets_, tracker_->GetFrameCount() - 1);
+#endif
+
+    TRACE("END");
+    return objects;
+}
+
+ObjectTracker::Builder::Builder()
+    : backend_type(vas::BackendType::CPU), max_num_objects(kDefaultMaxNumObjects),
+      input_image_format(vas::ColorFormat::BGR), tracking_per_class(true) {
+}
+
+ObjectTracker::Builder::~Builder() {
+}
+
+std::unique_ptr<ObjectTracker> ObjectTracker::Builder::Build(TrackingType tracking_type) const {
+    TRACE("BEGIN");
+
+    ObjectTracker::Impl *ot_impl = nullptr;
+    ObjectTracker::Impl::InitParameters param;
+
+    param.max_num_objects = max_num_objects;
+    param.format = input_image_format;
+    param.backend_type = backend_type;
+    param.tracking_type = tracking_type;
+    param.tracking_per_class = tracking_per_class;
+
+    if (static_cast<int32_t>(vas::ColorFormat::BGR) > static_cast<int32_t>(input_image_format) ||
+        static_cast<int32_t>(vas::ColorFormat::I420) < static_cast<int32_t>(input_image_format)) {
+        ETHROW(false, invalid_argument, "Invalid color format(%d)", static_cast<int32_t>(input_image_format));
+    }
+
+    switch (tracking_type) {
+    case vas::ot::TrackingType::LONG_TERM:
+        param.profile = vas::ot::Tracker::PROFILE_LONG_TERM;
+        break;
+    case vas::ot::TrackingType::SHORT_TERM:
+        param.profile = vas::ot::Tracker::PROFILE_SHORT_TERM;
+        break;
+    case vas::ot::TrackingType::SHORT_TERM_KCFVAR:
+        param.profile = vas::ot::Tracker::PROFILE_SHORT_TERM_KCFVAR;
+        break;
+    case vas::ot::TrackingType::SHORT_TERM_IMAGELESS:
+        param.profile = vas::ot::Tracker::PROFILE_SHORT_TERM_IMAGELESS;
+        break;
+    case vas::ot::TrackingType::ZERO_TERM:
+        param.profile = vas::ot::Tracker::PROFILE_ZERO_TERM;
+        break;
+    case vas::ot::TrackingType::ZERO_TERM_COLOR_HISTOGRAM:
+        param.profile = vas::ot::Tracker::PROFILE_ZERO_TERM_COLOR_HISTOGRAM;
+        break;
+    case vas::ot::TrackingType::ZERO_TERM_IMAGELESS:
+        param.profile = vas::ot::Tracker::PROFILE_ZERO_TERM_IMAGELESS;
+        break;
+    default:
+        std::cout << "Error: Invalid tracker type vas::ot::Tracker" << std::endl;
+        ETHROW(false, invalid_argument, "Invalid tracker type vas::ot::Tracker");
+        return nullptr;
+    }
+
+    // Not exposed to external parameter
+    param.min_region_ratio_in_boundary =
+        kMinRegionRatioInImageBoundary; // Ratio threshold of size: used by zttchist, zttimgless, sttkcfvar, sttimgless
+
+    for (const auto &item : platform_config) {
+        (void)item; // resolves ununsed warning when LOG_TRACE is OFF
+        TRACE("platform_config[%s] = %s", item.first.c_str(), item.second.c_str());
+    }
+
+    int max_num_threads = kDefaultNumThreads;
+    auto max_num_threads_iter = platform_config.find(kNameMaxNumThreads);
+    if (max_num_threads_iter != platform_config.end()) {
+        try {
+            max_num_threads = std::stoi(max_num_threads_iter->second);
+        } catch (const std::exception &) {
+            ETHROW(false, invalid_argument, "max_num_threads should be integer");
+        }
+
+        if (max_num_threads == 0 || max_num_threads < -1)
+            ETHROW(false, invalid_argument, "max_num_threads cannot be 0 or smaller than -1");
+    }
+    param.max_num_threads = max_num_threads;
+
+    ot_impl = new ObjectTracker::Impl(param);
+    std::unique_ptr<ObjectTracker> ot(new ObjectTracker(ot_impl));
+
+    TRACE("END");
+    return ot;
+}
+
+}; // namespace ot
+}; // namespace vas
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/prof_def.hpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/prof_def.hpp
new file mode 100644
index 000000000000..e16f901613a6
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/prof_def.hpp
@@ -0,0 +1,30 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_OT_PROF_DEF_HPP
+#define VAS_OT_PROF_DEF_HPP
+
+#include "../../common/prof.hpp"
+
+// 0 ~ 999 : Reserved group id for UnitTests.
+// 1000 ~  : User Defined id range under modules.
+
+#define PROF_COMPONENTS_OT_RUN_TRACK PROF_TAG_GENERATE(OT, 1000, " ObjectTracker::Track")
+
+#define PROF_COMPONENTS_OT_ZEROTERM_RUN_TRACKER PROF_TAG_GENERATE(OT, 1400, " ZeroTermTracker::TrackObjects")
+#define PROF_COMPONENTS_OT_ZEROTERM_KALMAN_PREDICTION PROF_TAG_GENERATE(OT, 1411, " ZeroTermTracker::KalmanPrediction")
+#define PROF_COMPONENTS_OT_ZEROTERM_RUN_ASSOCIATION PROF_TAG_GENERATE(OT, 1421, " ZeroTermTracker::Association")
+#define PROF_COMPONENTS_OT_ZEROTERM_UPDATE_STATUS PROF_TAG_GENERATE(OT, 1441, " ZeroTermTracker::UpdateTrackedStatus")
+#define PROF_COMPONENTS_OT_ZEROTERM_COMPUTE_OCCLUSION PROF_TAG_GENERATE(OT, 1461, " ZeroTermTracker::ComputeOcclusion")
+#define PROF_COMPONENTS_OT_ZEROTERM_UPDATE_MODEL PROF_TAG_GENERATE(OT, 1481, " ZeroTermTracker::UpdateModel")
+#define PROF_COMPONENTS_OT_ZEROTERM_REGISTER_OBJECT PROF_TAG_GENERATE(OT, 1491, " ZeroTermTracker::RegisterObject")
+
+#define PROF_COMPONENTS_OT_ASSOCIATE_COMPUTE_DIST_TABLE                                                                \
+    PROF_TAG_GENERATE(OT, 1600, " Association::ComputeDistanceTable")
+#define PROF_COMPONENTS_OT_ASSOCIATE_COMPUTE_COST_TABLE PROF_TAG_GENERATE(OT, 1610, " Association::ComputeCostTable")
+#define PROF_COMPONENTS_OT_ASSOCIATE_WITH_HUNGARIAN PROF_TAG_GENERATE(OT, 1620, " Association::AssociateWithHungarian")
+
+#endif // __OT_PROF_DEF_H__
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/short_term_imageless_tracker.cpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/short_term_imageless_tracker.cpp
new file mode 100644
index 000000000000..be788bba6261
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/short_term_imageless_tracker.cpp
@@ -0,0 +1,256 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#include "short_term_imageless_tracker.hpp"
+#include "prof_def.hpp"
+#include "../../common/exception.hpp"
+
+#include <memory>
+
+namespace vas {
+namespace ot {
+
+const int32_t kMaxAssociationLostCount = 2;    // ST_TRACKED -> ST_LOST
+const int32_t kMaxAssociationFailCount = 20;   // ST_LOST -> ST_DEAD
+const int32_t kMaxOutdatedCountInTracked = 30; // ST_TRACKED -> ST_LOST
+const int32_t kMaxOutdatedCountInLost = 20;    // ST_LOST -> ST_DEAD
+const int32_t kMaxTrajectorySize = 30;
+
+/**
+ *
+ * ShortTermImagelessTracker
+ *
+ **/
+ShortTermImagelessTracker::ShortTermImagelessTracker(vas::ot::Tracker::InitParameters init_param)
+    : Tracker(init_param.max_num_objects, init_param.min_region_ratio_in_boundary, init_param.format,
+              init_param.tracking_per_class),
+      image_sz(0, 0) {
+    TRACE(" - Created tracker = ShortTermImagelessTracker");
+}
+
+ShortTermImagelessTracker::~ShortTermImagelessTracker() {
+}
+
+int32_t ShortTermImagelessTracker::TrackObjects(const cv::Mat &mat, const std::vector<Detection> &detections,
+                                                std::vector<std::shared_ptr<Tracklet>> *tracklets, float delta_t) {
+    PROF_START(PROF_COMPONENTS_OT_SHORTTERM_RUN_TRACKER);
+
+    int32_t input_img_width = mat.cols;
+    int32_t input_img_height = mat.rows;
+
+    if (input_image_format_ == vas::ColorFormat::NV12 || input_image_format_ == vas::ColorFormat::I420) {
+        input_img_height = mat.rows / 3 * 2;
+    }
+
+    const cv::Rect2f image_boundary(0.0f, 0.0f, static_cast<float>(input_img_width),
+                                    static_cast<float>(input_img_height));
+
+    TRACE("Start TrackObjects frame_count_: %d, detection: %d, tracklet: %d ----------------", frame_count_,
+          detections.size(), tracklets_.size());
+    bool is_dead = false;
+    if (image_sz.width != input_img_width || image_sz.height != input_img_height) {
+        if (image_sz.width != 0 || image_sz.height != 0) {
+            is_dead = true;
+        }
+        image_sz.width = input_img_width;
+        image_sz.height = input_img_height;
+    }
+
+    PROF_START(PROF_COMPONENTS_OT_SHORTTERM_KALMAN_PREDICTION);
+    // Predict tracklets state
+    for (auto &tracklet : tracklets_) {
+        auto sttimgless_tracklet = std::dynamic_pointer_cast<ShortTermImagelessTracklet>(tracklet);
+        cv::Rect2f predicted_rect = sttimgless_tracklet->kalman_filter->Predict(delta_t);
+        sttimgless_tracklet->predicted = predicted_rect;
+        sttimgless_tracklet->trajectory.push_back(predicted_rect);
+        sttimgless_tracklet->trajectory_filtered.push_back(predicted_rect);
+        sttimgless_tracklet->association_delta_t += delta_t;
+        // Reset association index every frame for new detection input
+        sttimgless_tracklet->association_idx = kNoMatchDetection;
+    }
+
+    PROF_END(PROF_COMPONENTS_OT_SHORTTERM_KALMAN_PREDICTION);
+
+    PROF_START(PROF_COMPONENTS_OT_SHORTTERM_UPDATE_STATUS);
+    // Conduct tracking of SOT for each tracklet
+    TRACE(" Update status");
+    for (auto &tracklet : tracklets_) {
+        if (is_dead) {
+            tracklet->status = ST_DEAD;
+            continue;
+        }
+
+        // tracklet->association_delta_t = 0.0f;  // meaning updated by SOT
+    }
+
+    if (is_dead) {
+        RemoveDeadTracklets();
+    }
+    PROF_END(PROF_COMPONENTS_OT_SHORTTERM_UPDATE_STATUS);
+
+    PROF_START(PROF_COMPONENTS_OT_SHORTTERM_RUN_ASSOCIATION);
+
+    // Tracklet-detection association
+    int32_t n_detections = static_cast<int32_t>(detections.size());
+    int32_t n_tracklets = static_cast<int32_t>(tracklets_.size());
+
+    std::vector<bool> d_is_associated(n_detections, false);
+    std::vector<int32_t> t_associated_d_index(n_tracklets, -1);
+
+    if (n_detections > 0) {
+        auto result = associator_.Associate(detections, tracklets_);
+        d_is_associated = result.first;
+        t_associated_d_index = result.second;
+    }
+
+    PROF_END(PROF_COMPONENTS_OT_SHORTTERM_RUN_ASSOCIATION);
+
+    PROF_START(PROF_COMPONENTS_OT_SHORTTERM_UPDATE_STATUS);
+    // Update tracklets' state
+    if (n_detections > 0) {
+        for (int32_t t = 0; t < n_tracklets; ++t) {
+            auto &tracklet = tracklets_[t];
+            if (t_associated_d_index[t] >= 0) {
+                tracklet->association_delta_t = 0.0f;
+                int32_t associated_d_index = t_associated_d_index[t];
+                const cv::Rect2f &d_bounding_box = detections[associated_d_index].rect & image_boundary;
+
+                // Apply associated detection to tracklet
+                tracklet->association_idx = detections[associated_d_index].index;
+                tracklet->association_fail_count = 0;
+                tracklet->age = 0;
+                tracklet->label = detections[associated_d_index].class_label;
+
+                auto sttimgless_tracklet = std::dynamic_pointer_cast<ShortTermImagelessTracklet>(tracklet);
+                if (!sttimgless_tracklet)
+                    continue;
+
+                if (sttimgless_tracklet->status == ST_NEW) {
+                    sttimgless_tracklet->trajectory.back() = d_bounding_box;
+                    sttimgless_tracklet->trajectory_filtered.back() =
+                        sttimgless_tracklet->kalman_filter->Correct(sttimgless_tracklet->trajectory.back());
+                    sttimgless_tracklet->status = ST_TRACKED;
+                } else if (sttimgless_tracklet->status == ST_TRACKED) {
+                    sttimgless_tracklet->trajectory.back() = d_bounding_box;
+                    sttimgless_tracklet->trajectory_filtered.back() =
+                        sttimgless_tracklet->kalman_filter->Correct(sttimgless_tracklet->trajectory.back());
+                } else if (sttimgless_tracklet->status == ST_LOST) {
+                    sttimgless_tracklet->RenewTrajectory(d_bounding_box);
+                    sttimgless_tracklet->status = ST_TRACKED;
+                }
+            } else // Association failure
+            {
+                tracklet->association_fail_count++;
+                if (tracklet->status == ST_NEW) {
+                    tracklet->status = ST_DEAD; // regard non-consecutive association as false alarm
+                } else if (tracklet->status == ST_TRACKED) {
+                    if (tracklet->association_fail_count > kMaxAssociationLostCount) {
+                        // # association fail > threshold while tracking -> MISSING
+                        tracklet->status = ST_LOST;
+                        tracklet->association_fail_count = 0;
+                        tracklet->age = 0;
+                    }
+                } else if (tracklet->status == ST_LOST) {
+                    if (tracklet->association_fail_count > kMaxAssociationFailCount) {
+                        // # association fail > threshold while missing -> DEAD
+                        tracklet->status = ST_DEAD;
+                    }
+                }
+            }
+        }
+    } else // detections.size() == 0
+    {
+        for (int32_t t = 0; t < static_cast<int32_t>(tracklets_.size()); ++t) {
+            auto &tracklet = tracklets_[t];
+            // Always change ST_NEW to ST_TRACKED: no feature tracking from previous detection input.
+            if (tracklet->status == ST_NEW) {
+                tracklet->status = ST_TRACKED;
+            }
+
+            auto sttimgless_tracklet = std::dynamic_pointer_cast<ShortTermImagelessTracklet>(tracklet);
+            if (!sttimgless_tracklet)
+                continue;
+
+            if (sttimgless_tracklet->status == ST_TRACKED) {
+                if (sttimgless_tracklet->age > kMaxOutdatedCountInTracked) {
+                    sttimgless_tracklet->status = ST_LOST;
+                    sttimgless_tracklet->association_fail_count = 0;
+                    sttimgless_tracklet->age = 0;
+                } else {
+                    sttimgless_tracklet->trajectory_filtered.back() =
+                        sttimgless_tracklet->kalman_filter->Correct(sttimgless_tracklet->trajectory.back());
+                }
+            }
+
+            if (sttimgless_tracklet->status == ST_LOST) {
+                if (sttimgless_tracklet->age >= kMaxOutdatedCountInLost) {
+                    // # association fail > threshold while missing -> DEAD
+                    sttimgless_tracklet->status = ST_DEAD;
+                }
+            }
+        }
+    }
+    PROF_END(PROF_COMPONENTS_OT_SHORTTERM_UPDATE_STATUS);
+
+    PROF_START(PROF_COMPONENTS_OT_SHORTTERM_COMPUTE_OCCLUSION);
+    ComputeOcclusion();
+    PROF_END(PROF_COMPONENTS_OT_SHORTTERM_COMPUTE_OCCLUSION);
+
+    PROF_START(PROF_COMPONENTS_OT_SHORTTERM_REGISTER_OBJECT);
+    // Register remaining detections as new objects
+    for (int32_t d = 0; d < static_cast<int32_t>(detections.size()); ++d) {
+        if (d_is_associated[d] == false) {
+            if (static_cast<int32_t>(tracklets_.size()) >= max_objects_ && max_objects_ != -1)
+                continue;
+
+            std::unique_ptr<ShortTermImagelessTracklet> tracklet(new ShortTermImagelessTracklet());
+
+            tracklet->status = ST_NEW;
+            tracklet->id = GetNextTrackingID();
+            tracklet->label = detections[d].class_label;
+            tracklet->association_idx = detections[d].index;
+
+            const cv::Rect2f &bounding_box = detections[d].rect & image_boundary;
+            tracklet->InitTrajectory(bounding_box);
+            tracklet->kalman_filter.reset(new KalmanFilterNoOpencv(bounding_box));
+            tracklets_.push_back(std::move(tracklet));
+        }
+    }
+    PROF_END(PROF_COMPONENTS_OT_SHORTTERM_REGISTER_OBJECT);
+
+    RemoveDeadTracklets();
+    RemoveOutOfBoundTracklets(input_img_width, input_img_height);
+    TrimTrajectories();
+
+    *tracklets = tracklets_;
+
+    // Increase age
+    for (auto &tracklet : tracklets_) {
+        tracklet->age++;
+    }
+
+    IncreaseFrameCount();
+    PROF_END(PROF_COMPONENTS_OT_SHORTTERM_RUN_TRACKER);
+    return 0;
+}
+
+void ShortTermImagelessTracker::TrimTrajectories() {
+    for (auto &tracklet : tracklets_) {
+        auto &trajectory = tracklet->trajectory;
+        while (trajectory.size() > kMaxTrajectorySize) {
+            trajectory.pop_front();
+        }
+
+        //
+        auto &trajectory_filtered = tracklet->trajectory_filtered;
+        while (trajectory_filtered.size() > kMaxTrajectorySize) {
+            trajectory_filtered.pop_front();
+        }
+    }
+}
+
+}; // namespace ot
+}; // namespace vas
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/short_term_imageless_tracker.hpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/short_term_imageless_tracker.hpp
new file mode 100644
index 000000000000..ac04936deb37
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/short_term_imageless_tracker.hpp
@@ -0,0 +1,39 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_OT_SHORT_TERM_IMAGELESS_TRACKER_HPP
+#define VAS_OT_SHORT_TERM_IMAGELESS_TRACKER_HPP
+
+#include "tracker.hpp"
+
+#include <deque>
+#include <vector>
+
+namespace vas {
+namespace ot {
+
+class ShortTermImagelessTracker : public Tracker {
+  public:
+    explicit ShortTermImagelessTracker(vas::ot::Tracker::InitParameters init_param);
+    virtual ~ShortTermImagelessTracker();
+
+    virtual int32_t TrackObjects(const cv::Mat &mat, const std::vector<Detection> &detections,
+            std::vector<std::shared_ptr<Tracklet>> *tracklets, float delta_t) override;
+
+    ShortTermImagelessTracker(const ShortTermImagelessTracker &) = delete;
+    ShortTermImagelessTracker &operator=(const ShortTermImagelessTracker &) = delete;
+
+  private:
+    void TrimTrajectories();
+
+  private:
+    cv::Size image_sz;
+};
+
+}; // namespace ot
+}; // namespace vas
+
+#endif // VAS_OT_SHORT_TERM_IMAGELESS_TRACKER_HPP
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/tracker.cpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/tracker.cpp
new file mode 100644
index 000000000000..cba960d595dd
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/tracker.cpp
@@ -0,0 +1,132 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#include "short_term_imageless_tracker.hpp"
+#include "zero_term_imageless_tracker.hpp"
+
+#include "../../common/exception.hpp"
+
+namespace vas {
+namespace ot {
+
+Tracker::Tracker(int32_t max_objects, float min_region_ratio_in_boundary, vas::ColorFormat format, bool class_per_class)
+    : max_objects_(max_objects), next_id_(1), frame_count_(0),
+      min_region_ratio_in_boundary_(min_region_ratio_in_boundary), input_image_format_(format),
+      associator_(ObjectsAssociator(class_per_class)) {
+}
+
+Tracker::~Tracker() {
+}
+
+Tracker *Tracker::CreateInstance(InitParameters init_parameters) {
+    TRACE("START CreateInstance Tracker");
+
+    Tracker *tracker = nullptr;
+    if (init_parameters.profile == PROFILE_SHORT_TERM_IMAGELESS) {
+        tracker = new ShortTermImagelessTracker(init_parameters);
+    } else if (init_parameters.profile == PROFILE_ZERO_TERM_IMAGELESS) {
+        tracker = new ZeroTermImagelessTracker(init_parameters);
+    } else {
+        throw std::runtime_error("Unsupported tracking type");
+    }
+
+    TRACE(" - max_num_objects(%d)", init_parameters.max_num_objects);
+
+    TRACE("END");
+    return tracker;
+}
+
+int32_t Tracker::RemoveObject(const int32_t id) {
+    if (id == 0)
+        return -1;
+
+    for (auto tracklet = tracklets_.begin(); tracklet != tracklets_.end(); ++tracklet) {
+        if ((*tracklet)->id == id) {
+            tracklet = tracklets_.erase(tracklet);
+            return 0;
+        }
+    }
+    return -1;
+}
+
+void Tracker::Reset(void) {
+    frame_count_ = 0;
+    tracklets_.clear();
+}
+
+int32_t Tracker::GetFrameCount(void) const {
+    return frame_count_;
+}
+
+int32_t Tracker::GetNextTrackingID() {
+    return next_id_++;
+}
+
+void Tracker::IncreaseFrameCount() {
+    frame_count_++;
+}
+
+void Tracker::ComputeOcclusion() {
+    // Compute occlusion ratio
+    for (int32_t t0 = 0; t0 < static_cast<int32_t>(tracklets_.size()); ++t0) {
+        auto &tracklet0 = tracklets_[t0];
+        if (tracklet0->status != ST_TRACKED)
+            continue;
+
+        const cv::Rect2f &r0 = tracklet0->trajectory.back();
+        float max_occlusion_ratio = 0.0f;
+        for (int32_t t1 = 0; t1 < static_cast<int32_t>(tracklets_.size()); ++t1) {
+            const auto &tracklet1 = tracklets_[t1];
+            if (t0 == t1 || tracklet1->status == ST_LOST)
+                continue;
+
+            const cv::Rect2f &r1 = tracklet1->trajectory.back();
+            max_occlusion_ratio = std::max(max_occlusion_ratio, (r0 & r1).area() / r0.area()); // different from IoU
+        }
+        tracklets_[t0]->occlusion_ratio = max_occlusion_ratio;
+    }
+}
+
+void Tracker::RemoveOutOfBoundTracklets(int32_t input_width, int32_t input_height, bool is_filtered) {
+    const cv::Rect2f image_region(0.0f, 0.0f, static_cast<float>(input_width), static_cast<float>(input_height));
+    for (auto tracklet = tracklets_.begin(); tracklet != tracklets_.end();) {
+        const cv::Rect2f &object_region =
+            is_filtered ? (*tracklet)->trajectory_filtered.back() : (*tracklet)->trajectory.back();
+        if ((image_region & object_region).area() / object_region.area() <
+            min_region_ratio_in_boundary_) { // only 10% is in image boundary
+            tracklet = tracklets_.erase(tracklet);
+        } else {
+            ++tracklet;
+        }
+    }
+}
+
+void Tracker::RemoveDeadTracklets() {
+    for (auto tracklet = tracklets_.begin(); tracklet != tracklets_.end();) {
+        if ((*tracklet)->status == ST_DEAD) {
+            tracklet = tracklets_.erase(tracklet);
+        } else {
+            ++tracklet;
+        }
+    }
+}
+
+bool Tracker::RemoveOneLostTracklet() {
+    for (auto tracklet = tracklets_.begin(); tracklet != tracklets_.end();) {
+        if ((*tracklet)->status == ST_LOST) {
+            // The first tracklet is the oldest
+            tracklet = tracklets_.erase(tracklet);
+            return true;
+        } else {
+            ++tracklet;
+        }
+    }
+
+    return false;
+}
+
+}; // namespace ot
+}; // namespace vas
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/tracker.hpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/tracker.hpp
new file mode 100644
index 000000000000..424bd1b45dc8
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/tracker.hpp
@@ -0,0 +1,123 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_OT_TRACKER_HPP
+#define VAS_OT_TRACKER_HPP
+
+#include "mtt/objects_associator.hpp"
+#include "tracklet.hpp"
+
+#include <vas/common.hpp>
+
+#include <cstdint>
+#include <deque>
+
+namespace vas {
+namespace ot {
+
+const int32_t kDefaultMaxNumObjects = -1;
+const float kMaxTargetAreaFactor = 0.8f;
+const float kMinRegionRatioInImageBoundary = 0.75f; // MIN_REGION_RATIO_IN_IMAGE_BOUNDARY
+
+class Tracker {
+  public:
+    enum Profile {
+        PROFILE_LONG_TERM = 0,        // for long-term tracking usage
+        PROFILE_SHORT_TERM,           // for short-term tracking usage (suitable for using with an object detector)
+        PROFILE_SHORT_TERM_KCFVAR,    // alias of 'PROFILE_SHORT_TERM'. 'PROFILE_SHORT_TERM' will be deprecated
+        PROFILE_SHORT_TERM_IMAGELESS, // for short-term tracking usage with kalman tracking
+        PROFILE_ZERO_TERM, // for zero-term tracking usage (only works with object association algorithm, not tracking)
+        PROFILE_ZERO_TERM_IMAGELESS,       // for zero-term tracking usage with kalman tracking
+        PROFILE_ZERO_TERM_COLOR_HISTOGRAM, // alias of 'PROFILE_ZERO_TERM'. 'PROFILE_ZERO_TERM' will be deprecated
+    };
+
+    class InitParameters {
+      public:
+        Profile profile; // tracking type
+        int32_t max_num_objects;
+        int32_t max_num_threads; // for Parallelization
+        vas::ColorFormat format;
+        bool tracking_per_class;
+
+        // Won't be exposed to the external
+        float min_region_ratio_in_boundary; // For ST, ZT
+    };
+
+  public:
+    virtual ~Tracker();
+
+    /**
+     * create new object tracker instance
+     * @param InitParameters
+     */
+    static Tracker *CreateInstance(InitParameters init_parameters);
+
+    /**
+     * perform tracking
+     *
+     * @param[in] mat Input frame
+     * @param[in] detection Newly detected object data vector which will be added to the tracker. put zero length vector
+     *            if there is no new object in the frame.
+     * @param[in] delta_t Time passed after the latest call to TrackObjects() in seconds. Use 1.0/FPS in case of
+     * constant frame rate
+     * @param[out] tracklets Tracked object data vector.
+     * @return 0 for success. negative value for failure
+     */
+    virtual int32_t TrackObjects(const cv::Mat &mat, const std::vector<Detection> &detections,
+                                 std::vector<std::shared_ptr<Tracklet>> *tracklets, float delta_t = 0.033f) = 0;
+
+    /**
+     * remove object
+     *
+     * @param[in] id Object id for removing. it should be the 'id' value of the Tracklet
+     * @return 0 for success. negative value for failure.
+     */
+    int32_t RemoveObject(const int32_t id);
+
+    /**
+     * reset all internal state to its initial.
+     *
+     * @return 0 for success. negative value for failure.
+     */
+    void Reset(void);
+
+    /**
+     * get cumulated frame number
+     *
+     * @return 0
+     */
+    int32_t GetFrameCount(void) const;
+
+  protected:
+    explicit Tracker(int32_t max_objects, float min_region_ratio_in_boundary, vas::ColorFormat format,
+                     bool class_per_class = true);
+    Tracker() = delete;
+
+    int32_t GetNextTrackingID();
+    void IncreaseFrameCount();
+
+    void ComputeOcclusion();
+
+    void RemoveOutOfBoundTracklets(int32_t input_width, int32_t input_height, bool is_filtered = false);
+    void RemoveDeadTracklets();
+    bool RemoveOneLostTracklet();
+
+  protected:
+    int32_t max_objects_; // -1 means no limitation
+    int32_t next_id_;
+    int32_t frame_count_;
+
+    float min_region_ratio_in_boundary_;
+    vas::ColorFormat input_image_format_;
+
+    ObjectsAssociator associator_;
+    std::vector<std::shared_ptr<Tracklet>> tracklets_;
+};
+
+}; // namespace ot
+}; // namespace vas
+
+#endif // VAS_OT_TRACKER_HPP
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.cpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.cpp
new file mode 100644
index 000000000000..62e8d10cf673
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.cpp
@@ -0,0 +1,147 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#include "tracklet.hpp"
+
+#include <sstream>
+
+namespace vas {
+namespace ot {
+
+Tracklet::Tracklet()
+    : id(0), label(-1), association_idx(kNoMatchDetection), status(ST_DEAD), age(0), confidence(0.f),
+      occlusion_ratio(0.f), association_delta_t(0.f), association_fail_count(0) {
+}
+
+Tracklet::~Tracklet() {
+}
+
+void Tracklet::ClearTrajectory() {
+    trajectory.clear();
+    trajectory_filtered.clear();
+}
+
+void Tracklet::InitTrajectory(const cv::Rect2f &bounding_box) {
+    trajectory.push_back(bounding_box);
+    trajectory_filtered.push_back(bounding_box);
+}
+
+void Tracklet::AddUpdatedTrajectory(const cv::Rect2f &bounding_box, const cv::Rect2f &corrected_box) {
+    trajectory.push_back(bounding_box);
+    trajectory_filtered.push_back(corrected_box);
+}
+
+void Tracklet::UpdateLatestTrajectory(const cv::Rect2f &bounding_box, const cv::Rect2f &corrected_box) {
+    trajectory.back() = bounding_box;
+    trajectory_filtered.back() = corrected_box;
+}
+
+void Tracklet::RenewTrajectory(const cv::Rect2f &bounding_box) {
+    ClearTrajectory();
+    trajectory.push_back(bounding_box);
+    trajectory_filtered.push_back(bounding_box);
+}
+
+#define DEFINE_STRING_VAR(var_name, value)                                                                             \
+    std::stringstream __##var_name;                                                                                    \
+    __##var_name << value;                                                                                             \
+    std::string var_name = __##var_name.str();
+
+#define ROUND_F(value, scale) (round((value)*scale) / scale)
+
+std::string Tracklet::Serialize() const {
+#ifdef DUMP_OTAV
+    DEFINE_STRING_VAR(s_id, id);
+    DEFINE_STRING_VAR(s_label, label);
+    DEFINE_STRING_VAR(s_association_idx, association_idx);
+    DEFINE_STRING_VAR(s_status, static_cast<int>(status));
+    DEFINE_STRING_VAR(s_age, age);
+    DEFINE_STRING_VAR(s_confidence, ROUND_F(confidence, 100.0));
+    DEFINE_STRING_VAR(s_occlusion_ratio, ROUND_F(occlusion_ratio, 100.0));
+    DEFINE_STRING_VAR(s_association_delta_t, association_delta_t);
+    DEFINE_STRING_VAR(s_association_fail_count, association_fail_count);
+    DEFINE_STRING_VAR(t_x, ROUND_F(trajectory.back().x, 10.0));
+    DEFINE_STRING_VAR(t_y, ROUND_F(trajectory.back().y, 10.0));
+    DEFINE_STRING_VAR(t_w, ROUND_F(trajectory.back().width, 10.0));
+    DEFINE_STRING_VAR(t_h, ROUND_F(trajectory.back().height, 10.0));
+    DEFINE_STRING_VAR(tf_x, ROUND_F(trajectory_filtered.back().x, 10.0));
+    DEFINE_STRING_VAR(tf_y, ROUND_F(trajectory_filtered.back().y, 10.0));
+    DEFINE_STRING_VAR(tf_w, ROUND_F(trajectory_filtered.back().width, 10.0));
+    DEFINE_STRING_VAR(tf_h, ROUND_F(trajectory_filtered.back().height, 10.0));
+    DEFINE_STRING_VAR(p_x, ROUND_F(predicted.x, 10.0));
+    DEFINE_STRING_VAR(p_y, ROUND_F(predicted.y, 10.0));
+    DEFINE_STRING_VAR(p_w, ROUND_F(predicted.width, 10.0));
+    DEFINE_STRING_VAR(p_h, ROUND_F(predicted.height, 10.0));
+    std::string formatted_msg = "meta:\"" + s_id + "," + s_label + "," + s_association_idx + "," + s_status + "," +
+                                s_age + "," + s_confidence + "," + s_occlusion_ratio + "," + s_association_delta_t +
+                                "," + s_association_fail_count + "\", roi:\"" + t_x + "," + t_y + "," + t_w + "," +
+                                t_h + "\", roif:\"" + tf_x + "," + tf_y + "," + tf_w + "," + tf_h + "\", roip:\"" +
+                                p_x + "," + p_y + "," + p_w + "," + p_h + "\"";
+
+    std::string free_msg;
+    if (otav_msg.size() > 0) {
+        free_msg = ", msg: [";
+        for (auto line : otav_msg) {
+            if (line.size() > 0)
+                free_msg += "\n\"" + line + "\",";
+        }
+        free_msg += "]";
+        otav_msg.clear();
+    }
+    return formatted_msg + free_msg;
+#else
+    return "";
+#endif
+}
+
+std::deque<cv::Mat> *Tracklet::GetRgbFeatures() {
+    return nullptr;
+}
+
+ZeroTermImagelessTracklet::ZeroTermImagelessTracklet() : Tracklet(), birth_count(1) {
+}
+
+ZeroTermImagelessTracklet::~ZeroTermImagelessTracklet() {
+}
+
+void ZeroTermImagelessTracklet::RenewTrajectory(const cv::Rect2f &bounding_box) {
+    float velo_x = bounding_box.x - trajectory.back().x;
+    float velo_y = bounding_box.y - trajectory.back().y;
+    cv::Rect rect_predict(int(bounding_box.x + velo_x / 3), int(bounding_box.y + velo_y / 3),
+                          int(bounding_box.width), int(bounding_box.height));
+
+    ClearTrajectory();
+    kalman_filter.reset(new KalmanFilterNoOpencv(bounding_box));
+    kalman_filter->Predict();
+    kalman_filter->Correct(rect_predict);
+
+    trajectory.push_back(bounding_box);
+    trajectory_filtered.push_back(bounding_box);
+}
+
+ShortTermImagelessTracklet::ShortTermImagelessTracklet() : Tracklet() {
+}
+
+ShortTermImagelessTracklet::~ShortTermImagelessTracklet() {
+}
+
+void ShortTermImagelessTracklet::RenewTrajectory(const cv::Rect2f &bounding_box) {
+    float velo_x = bounding_box.x - trajectory.back().x;
+    float velo_y = bounding_box.y - trajectory.back().y;
+    cv::Rect rect_predict(int(bounding_box.x + velo_x / 3), int(bounding_box.y + velo_y / 3),
+                          int(bounding_box.width), int(bounding_box.height));
+
+    ClearTrajectory();
+    kalman_filter.reset(new KalmanFilterNoOpencv(bounding_box));
+    kalman_filter->Predict();
+    kalman_filter->Correct(rect_predict);
+
+    trajectory.push_back(bounding_box);
+    trajectory_filtered.push_back(bounding_box);
+}
+
+}; // namespace ot
+}; // namespace vas
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.hpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.hpp
new file mode 100644
index 000000000000..762e3f6ea652
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/tracklet.hpp
@@ -0,0 +1,94 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_OT_TRACKET_HPP
+#define VAS_OT_TRACKET_HPP
+
+#include "kalman_filter/kalman_filter_no_opencv.hpp"
+
+#include <vas/common.hpp>
+
+#include <cstdint>
+#include <deque>
+
+namespace vas {
+namespace ot {
+
+const int32_t kNoMatchDetection = -1;
+
+enum Status {
+    ST_DEAD = -1,   // dead
+    ST_NEW = 0,     // new
+    ST_TRACKED = 1, // tracked
+    ST_LOST = 2     // lost but still alive (in the detection phase if it configured)
+};
+
+struct Detection {
+    cv::Rect2f rect;
+    int32_t class_label = -1;
+    int32_t index = -1;
+};
+
+class Tracklet {
+  public:
+    Tracklet();
+    virtual ~Tracklet();
+
+  public:
+    void ClearTrajectory();
+    void InitTrajectory(const cv::Rect2f &bounding_box);
+    void AddUpdatedTrajectory(const cv::Rect2f &bounding_box, const cv::Rect2f &corrected_box);
+    void UpdateLatestTrajectory(const cv::Rect2f &bounding_box, const cv::Rect2f &corrected_box);
+    virtual void RenewTrajectory(const cv::Rect2f &bounding_box);
+
+    virtual std::deque<cv::Mat> *GetRgbFeatures();
+    virtual std::string Serialize() const; // Returns key:value with comma separated format
+
+  public:
+    int32_t id; // If hasnot been assigned : -1 to 0
+    int32_t label;
+    int32_t association_idx;
+    Status status;
+    int32_t age;
+    float confidence;
+
+    float occlusion_ratio;
+    float association_delta_t;
+    int32_t association_fail_count;
+
+    std::deque<cv::Rect2f> trajectory;
+    std::deque<cv::Rect2f> trajectory_filtered;
+    cv::Rect2f predicted;                      // Result from Kalman prediction. It is for debugging (OTAV)
+    mutable std::vector<std::string> otav_msg; // Messages for OTAV
+};
+
+class ZeroTermImagelessTracklet : public Tracklet {
+  public:
+    ZeroTermImagelessTracklet();
+    virtual ~ZeroTermImagelessTracklet();
+
+    void RenewTrajectory(const cv::Rect2f &bounding_box) override;
+
+  public:
+    int32_t birth_count;
+    std::unique_ptr<KalmanFilterNoOpencv> kalman_filter;
+};
+
+class ShortTermImagelessTracklet : public Tracklet {
+  public:
+    ShortTermImagelessTracklet();
+    virtual ~ShortTermImagelessTracklet();
+
+    void RenewTrajectory(const cv::Rect2f &bounding_box) override;
+
+  public:
+    std::unique_ptr<KalmanFilterNoOpencv> kalman_filter;
+};
+
+}; // namespace ot
+}; // namespace vas
+
+#endif // VAS_OT_TRACKET_HPP
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/zero_term_imageless_tracker.cpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/zero_term_imageless_tracker.cpp
new file mode 100644
index 000000000000..c56c7813fc15
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/zero_term_imageless_tracker.cpp
@@ -0,0 +1,185 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#include "zero_term_imageless_tracker.hpp"
+#include "prof_def.hpp"
+#include "../../common/exception.hpp"
+
+#include <cmath>
+#include <memory>
+
+namespace vas {
+namespace ot {
+
+// const int32_t kMaxAssociationFailCount = 600;  // about 20 seconds
+const int32_t kMaxAssociationFailCount = 120; // about 4 seconds
+const int32_t kMaxTrajectorySize = 30;
+
+const int32_t kMinBirthCount = 3;
+
+/**
+ *
+ * ZeroTermImagelessTracker
+ *
+ **/
+ZeroTermImagelessTracker::ZeroTermImagelessTracker(vas::ot::Tracker::InitParameters init_param)
+    : Tracker(init_param.max_num_objects, init_param.min_region_ratio_in_boundary, init_param.format,
+              init_param.tracking_per_class) {
+    TRACE(" - Created tracker = ZeroTermImagelessTracker");
+}
+
+ZeroTermImagelessTracker::~ZeroTermImagelessTracker() {
+}
+
+int32_t ZeroTermImagelessTracker::TrackObjects(const cv::Mat &mat, const std::vector<Detection> &detections,
+                                               std::vector<std::shared_ptr<Tracklet>> *tracklets, float delta_t) {
+    PROF_START(PROF_COMPONENTS_OT_ZEROTERM_RUN_TRACKER);
+
+    int32_t input_img_width = mat.cols;
+    int32_t input_img_height = mat.rows;
+
+    if (input_image_format_ == vas::ColorFormat::NV12 || input_image_format_ == vas::ColorFormat::I420) {
+        input_img_height = mat.rows / 3 * 2;
+    }
+    const cv::Rect2f image_boundary(0.0f, 0.0f, static_cast<float>(input_img_width),
+                                    static_cast<float>(input_img_height));
+
+    PROF_START(PROF_COMPONENTS_OT_ZEROTERM_KALMAN_PREDICTION);
+    // Predict tracklets state
+    for (auto &tracklet : tracklets_) {
+        auto zttimgless_tracklet = std::dynamic_pointer_cast<ZeroTermImagelessTracklet>(tracklet);
+        cv::Rect2f predicted_rect = zttimgless_tracklet->kalman_filter->Predict(delta_t);
+        zttimgless_tracklet->predicted = predicted_rect;
+        zttimgless_tracklet->trajectory.push_back(predicted_rect);
+        zttimgless_tracklet->trajectory_filtered.push_back(predicted_rect);
+        zttimgless_tracklet->association_delta_t += delta_t;
+        // Reset association index every frame for new detection input
+        zttimgless_tracklet->association_idx = kNoMatchDetection;
+    }
+
+    PROF_END(PROF_COMPONENTS_OT_ZEROTERM_KALMAN_PREDICTION);
+
+    PROF_START(PROF_COMPONENTS_OT_ZEROTERM_RUN_ASSOCIATION);
+
+    // Tracklet-detection association
+    int32_t n_detections = static_cast<int32_t>(detections.size());
+    int32_t n_tracklets = static_cast<int32_t>(tracklets_.size());
+
+    std::vector<bool> d_is_associated(n_detections, false);
+    std::vector<int32_t> t_associated_d_index(n_tracklets, -1);
+
+    if (n_detections > 0) {
+        auto result = associator_.Associate(detections, tracklets_);
+        d_is_associated = result.first;
+        t_associated_d_index = result.second;
+    }
+
+    PROF_END(PROF_COMPONENTS_OT_ZEROTERM_RUN_ASSOCIATION);
+
+    PROF_START(PROF_COMPONENTS_OT_ZEROTERM_UPDATE_STATUS);
+    // Update tracklets' state
+    for (int32_t t = 0; t < n_tracklets; ++t) {
+        auto &tracklet = tracklets_[t];
+        if (t_associated_d_index[t] >= 0) {
+            tracklet->association_delta_t = 0.0f;
+            int32_t associated_d_index = t_associated_d_index[t];
+            const cv::Rect2f &d_bounding_box = detections[associated_d_index].rect & image_boundary;
+
+            // Apply associated detection to tracklet
+            tracklet->association_idx = detections[associated_d_index].index;
+            tracklet->association_fail_count = 0;
+            tracklet->label = detections[associated_d_index].class_label;
+
+            auto zttimgless_tracklet = std::dynamic_pointer_cast<ZeroTermImagelessTracklet>(tracklet);
+            if (!zttimgless_tracklet)
+                continue;
+
+            if (zttimgless_tracklet->status == ST_NEW) {
+                zttimgless_tracklet->trajectory.back() = d_bounding_box;
+                zttimgless_tracklet->trajectory_filtered.back() =
+                    zttimgless_tracklet->kalman_filter->Correct(zttimgless_tracklet->trajectory.back());
+                zttimgless_tracklet->birth_count += 1;
+                if (zttimgless_tracklet->birth_count >= kMinBirthCount) {
+                    zttimgless_tracklet->status = ST_TRACKED;
+                }
+            } else if (zttimgless_tracklet->status == ST_TRACKED) {
+                zttimgless_tracklet->trajectory.back() = d_bounding_box;
+                zttimgless_tracklet->trajectory_filtered.back() =
+                    zttimgless_tracklet->kalman_filter->Correct(zttimgless_tracklet->trajectory.back());
+            } else if (zttimgless_tracklet->status == ST_LOST) {
+                zttimgless_tracklet->RenewTrajectory(d_bounding_box);
+                zttimgless_tracklet->status = ST_TRACKED;
+            }
+        } else {
+            if (tracklet->status == ST_NEW) {
+                tracklet->status = ST_DEAD; // regard non-consecutive association as false alarm
+            } else if (tracklet->status == ST_TRACKED) {
+                tracklet->status = ST_LOST;
+                tracklet->association_fail_count = 0;
+            } else if (tracklet->status == ST_LOST) {
+                if (++tracklet->association_fail_count >= kMaxAssociationFailCount) {
+                    // # association fail > threshold while missing -> DEAD
+                    tracklet->status = ST_DEAD;
+                }
+            }
+        }
+    }
+    PROF_END(PROF_COMPONENTS_OT_ZEROTERM_UPDATE_STATUS);
+
+    PROF_START(PROF_COMPONENTS_OT_ZEROTERM_COMPUTE_OCCLUSION);
+    ComputeOcclusion();
+    PROF_END(PROF_COMPONENTS_OT_ZEROTERM_COMPUTE_OCCLUSION);
+
+    PROF_START(PROF_COMPONENTS_OT_ZEROTERM_REGISTER_OBJECT);
+    // Register remaining detections as new objects
+    for (int32_t d = 0; d < static_cast<int32_t>(detections.size()); ++d) {
+        if (d_is_associated[d] == false) {
+            if (static_cast<int32_t>(tracklets_.size()) >= max_objects_ && max_objects_ != -1)
+                continue;
+
+            std::unique_ptr<ZeroTermImagelessTracklet> tracklet(new ZeroTermImagelessTracklet());
+
+            tracklet->status = ST_NEW;
+            tracklet->id = GetNextTrackingID();
+            tracklet->label = detections[d].class_label;
+            tracklet->association_idx = detections[d].index;
+
+            const cv::Rect2f &bounding_box = detections[d].rect & image_boundary;
+            tracklet->InitTrajectory(bounding_box);
+            tracklet->kalman_filter.reset(new KalmanFilterNoOpencv(bounding_box));
+            tracklets_.push_back(std::move(tracklet));
+        }
+    }
+    PROF_END(PROF_COMPONENTS_OT_ZEROTERM_REGISTER_OBJECT);
+
+    RemoveDeadTracklets();
+    RemoveOutOfBoundTracklets(input_img_width, input_img_height);
+    TrimTrajectories();
+
+    *tracklets = tracklets_;
+
+    IncreaseFrameCount();
+    PROF_END(PROF_COMPONENTS_OT_ZEROTERM_RUN_TRACKER);
+    return 0;
+}
+
+void ZeroTermImagelessTracker::TrimTrajectories() {
+    for (auto &tracklet : tracklets_) {
+        auto &trajectory = tracklet->trajectory;
+        while (trajectory.size() > kMaxTrajectorySize) {
+            trajectory.pop_front();
+        }
+
+        //
+        auto &trajectory_filtered = tracklet->trajectory_filtered;
+        while (trajectory_filtered.size() > kMaxTrajectorySize) {
+            trajectory_filtered.pop_front();
+        }
+    }
+}
+
+}; // namespace ot
+}; // namespace vas
diff --git a/modules/gapi/src/3rdparty/vasot/src/components/ot/zero_term_imageless_tracker.hpp b/modules/gapi/src/3rdparty/vasot/src/components/ot/zero_term_imageless_tracker.hpp
new file mode 100644
index 000000000000..1f5f760e62b4
--- /dev/null
+++ b/modules/gapi/src/3rdparty/vasot/src/components/ot/zero_term_imageless_tracker.hpp
@@ -0,0 +1,37 @@
+/*******************************************************************************
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ ******************************************************************************/
+
+#ifndef VAS_OT_ZERO_TERM_IMAGELESS_TRACKER_HPP
+#define VAS_OT_ZERO_TERM_IMAGELESS_TRACKER_HPP
+
+#include "tracker.hpp"
+
+#include <deque>
+#include <vector>
+
+namespace vas {
+namespace ot {
+
+class ZeroTermImagelessTracker : public Tracker {
+  public:
+    explicit ZeroTermImagelessTracker(vas::ot::Tracker::InitParameters init_param);
+    virtual ~ZeroTermImagelessTracker();
+
+    virtual int32_t TrackObjects(const cv::Mat &mat, const std::vector<Detection> &detections,
+            std::vector<std::shared_ptr<Tracklet>> *tracklets, float delta_t) override;
+
+    ZeroTermImagelessTracker() = delete;
+    ZeroTermImagelessTracker(const ZeroTermImagelessTracker &) = delete;
+    ZeroTermImagelessTracker &operator=(const ZeroTermImagelessTracker &) = delete;
+
+  private:
+    void TrimTrajectories();
+};
+
+}; // namespace ot
+}; // namespace vas
+
+#endif // VAS_OT_ZERO_TERM_IMAGELESS_TRACKER_HPP
diff --git a/modules/gapi/src/api/gbackend.cpp b/modules/gapi/src/api/gbackend.cpp
index efbe17a3059d..dc51eef76df3 100644
--- a/modules/gapi/src/api/gbackend.cpp
+++ b/modules/gapi/src/api/gbackend.cpp
@@ -36,7 +36,6 @@ cv::gapi::GBackend::Priv::compile(const ade::Graph&,
 {
     // ...and this method is here for the same reason!
     GAPI_Error("InternalError");
-    return {};
 }
 
 std::unique_ptr<cv::gimpl::GIslandExecutable>
@@ -81,6 +80,9 @@ bool cv::gapi::GBackend::Priv::allowsMerge(const cv::gimpl::GIslandModel::Graph
     return true;
 }
 
+bool cv::gapi::GBackend::Priv::supportsConst(cv::GShape) const {
+    return false;
+}
 
 // GBackend public implementation //////////////////////////////////////////////
 cv::gapi::GBackend::GBackend()
@@ -224,7 +226,6 @@ void bindOutArg(Mag& mag, const RcDesc &rc, const GRunArgP &arg, HandleRMat hand
 
     default:
         util::throw_error(std::logic_error("Unsupported GShape type"));
-        break;
     }
 }
 
@@ -256,7 +257,6 @@ void resetInternalData(Mag& mag, const Data &d)
 
     default:
         util::throw_error(std::logic_error("Unsupported GShape type"));
-        break;
     }
 }
 
@@ -284,7 +284,6 @@ cv::GRunArg getArg(const Mag& mag, const RcDesc &ref)
                        mag.meta<cv::MediaFrame>().at(ref.id));
     default:
         util::throw_error(std::logic_error("Unsupported GShape type"));
-        break;
     }
 }
 
@@ -327,7 +326,6 @@ cv::GRunArgP getObjPtr(Mag& mag, const RcDesc &rc, bool is_umat)
 
     default:
         util::throw_error(std::logic_error("Unsupported GShape type"));
-        break;
     }
 }
 
@@ -359,7 +357,6 @@ void writeBack(const Mag& mag, const RcDesc &rc, GRunArgP &g_arg)
 
     default:
         util::throw_error(std::logic_error("Unsupported GShape type"));
-        break;
     }
 }
 
diff --git a/modules/gapi/src/api/gbackend_priv.hpp b/modules/gapi/src/api/gbackend_priv.hpp
index 5609b304aa59..80436abde80d 100644
--- a/modules/gapi/src/api/gbackend_priv.hpp
+++ b/modules/gapi/src/api/gbackend_priv.hpp
@@ -84,6 +84,14 @@ class GAPI_EXPORTS cv::gapi::GBackend::Priv
                              const ade::NodeHandle &slot_nh,
                              const ade::NodeHandle &b_nh) const;
 
+    // Ask backend if it supports CONST_VAL data of the given shape or not.
+    // If the backend does support this data type, a Data node with such
+    // value can be fused into the backend's Island body.
+    // If the backend doesn't support this data type, a Data node won't
+    // be fused into the Islands's body -- will be marked as an in-graph
+    // input connection for this Island.
+    virtual bool supportsConst(cv::GShape shape) const;
+
     virtual ~Priv() = default;
 };
 
diff --git a/modules/gapi/src/api/gcommon.cpp b/modules/gapi/src/api/gcommon.cpp
new file mode 100644
index 000000000000..68c9e54001e5
--- /dev/null
+++ b/modules/gapi/src/api/gcommon.cpp
@@ -0,0 +1,18 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+#include "precomp.hpp"
+
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/core/utility.hpp>
+
+cv::use_threaded_executor::use_threaded_executor()
+    : num_threads(cv::getNumThreads()) {
+}
+
+cv::use_threaded_executor::use_threaded_executor(const uint32_t nthreads)
+    : num_threads(nthreads) {
+}
diff --git a/modules/gapi/src/api/gcomputation.cpp b/modules/gapi/src/api/gcomputation.cpp
index c57c66b8791c..076c1acee325 100644
--- a/modules/gapi/src/api/gcomputation.cpp
+++ b/modules/gapi/src/api/gcomputation.cpp
@@ -191,8 +191,8 @@ void cv::GComputation::recompile(GMetaArgs&& in_metas, GCompileArgs &&args)
     if (m_priv->m_lastMetas != in_metas)
     {
         if (m_priv->m_lastCompiled &&
-                m_priv->m_lastCompiled.canReshape() &&
-                formats_are_same(m_priv->m_lastMetas, in_metas))
+            m_priv->m_lastCompiled.canReshape() &&
+            formats_are_same(m_priv->m_lastMetas, in_metas))
         {
             m_priv->m_lastCompiled.reshape(in_metas, args);
         }
@@ -203,6 +203,11 @@ void cv::GComputation::recompile(GMetaArgs&& in_metas, GCompileArgs &&args)
         }
         m_priv->m_lastMetas = in_metas;
     }
+    else if (in_metas.size() == 0) {
+        // Happens when the graph is head-less (e.g. starts with const-vals only)
+        // always compile ad-hoc
+        m_priv->m_lastCompiled = compile(GMetaArgs(in_metas), std::move(args));
+    }
 }
 
 void cv::GComputation::apply(GRunArgs &&ins, GRunArgsP &&outs, GCompileArgs &&args)
diff --git a/modules/gapi/src/api/gmat.cpp b/modules/gapi/src/api/gmat.cpp
index 03f2e736be9b..c6bcb288ba41 100644
--- a/modules/gapi/src/api/gmat.cpp
+++ b/modules/gapi/src/api/gmat.cpp
@@ -26,6 +26,10 @@ cv::GMat::GMat(const GNode &n, std::size_t out)
 {
 }
 
+cv::GMat::GMat(cv::Mat m)
+    : m_priv(new GOrigin(GShape::GMAT, cv::gimpl::ConstVal(m))) {
+}
+
 cv::GOrigin& cv::GMat::priv()
 {
     return *m_priv;
diff --git a/modules/gapi/src/api/gproto.cpp b/modules/gapi/src/api/gproto.cpp
index 43bcfb9c14a9..bb60f4e51f3e 100644
--- a/modules/gapi/src/api/gproto.cpp
+++ b/modules/gapi/src/api/gproto.cpp
@@ -80,6 +80,7 @@ cv::GRunArg cv::value_of(const cv::GOrigin &origin)
     {
     case GShape::GSCALAR: return GRunArg(util::get<cv::Scalar>(origin.value));
     case GShape::GARRAY:  return GRunArg(util::get<cv::detail::VectorRef>(origin.value));
+    case GShape::GMAT:    return GRunArg(util::get<cv::Mat>(origin.value));
     default: util::throw_error(std::logic_error("Unsupported shape for constant"));
     }
 }
diff --git a/modules/gapi/src/api/kernels_ot.cpp b/modules/gapi/src/api/kernels_ot.cpp
new file mode 100644
index 000000000000..412b07d151d6
--- /dev/null
+++ b/modules/gapi/src/api/kernels_ot.cpp
@@ -0,0 +1,43 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#include <opencv2/gapi/ot.hpp>
+#include <opencv2/gapi/cpu/gcpukernel.hpp>
+
+#include <vas/ot.hpp>
+
+namespace cv
+{
+namespace gapi
+{
+namespace ot
+{
+GAPI_EXPORTS_W std::tuple<cv::GArray<cv::Rect>,
+                          cv::GArray<int32_t>,
+                          cv::GArray<uint64_t>,
+                          cv::GArray<int>>
+    track(const cv::GMat& mat,
+          const cv::GArray<cv::Rect>& detected_rects,
+          const cv::GArray<int>& detected_class_labels,
+          float delta)
+{
+    return GTrackFromMat::on(mat, detected_rects, detected_class_labels, delta);
+}
+
+GAPI_EXPORTS_W std::tuple<cv::GArray<cv::Rect>,
+                          cv::GArray<int32_t>,
+                          cv::GArray<uint64_t>,
+                          cv::GArray<int>>
+    track(const cv::GFrame& frame,
+          const cv::GArray<cv::Rect>& detected_rects,
+          const cv::GArray<int>& detected_class_labels,
+          float delta)
+{
+    return GTrackFromFrame::on(frame, detected_rects, detected_class_labels, delta);
+}
+}   // namespace ot
+}   // namespace gapi
+}   // namespace cv
diff --git a/modules/gapi/src/api/render_ocv.cpp b/modules/gapi/src/api/render_ocv.cpp
index f1e9be4b4893..e15f56bfebfc 100644
--- a/modules/gapi/src/api/render_ocv.cpp
+++ b/modules/gapi/src/api/render_ocv.cpp
@@ -67,7 +67,7 @@ inline void mosaic(cv::Mat& mat, const cv::Rect &rect, int cellSz)
             cell_roi = cv::mean(cell_roi);
         }
     }
-};
+}
 
 inline void blendImage(const cv::Mat& img,
                        const cv::Mat& alpha,
@@ -120,7 +120,7 @@ inline void poly(cv::Mat& mat,
 {
     std::vector<std::vector<cv::Point>> points{pp.points};
     cv::fillPoly(mat, points, pp.color, pp.lt, pp.shift);
-};
+}
 
 struct BGR2YUVConverter
 {
@@ -133,13 +133,13 @@ struct BGR2YUVConverter
         return {y, u, v};
     }
 
-    void cvtImg(const cv::Mat& in, cv::Mat& out) { cv::cvtColor(in, out, cv::COLOR_BGR2YUV); };
+    void cvtImg(const cv::Mat& in, cv::Mat& out) { cv::cvtColor(in, out, cv::COLOR_BGR2YUV); }
 };
 
 struct EmptyConverter
 {
-    cv::Scalar cvtColor(const cv::Scalar& bgr)   const { return bgr; };
-    void cvtImg(const cv::Mat& in, cv::Mat& out) const { out = in;   };
+    cv::Scalar cvtColor(const cv::Scalar& bgr)   const { return bgr; }
+    void cvtImg(const cv::Mat& in, cv::Mat& out) const { out = in;   }
 };
 
 // FIXME util::visitor ?
diff --git a/modules/gapi/src/backends/common/serialization.cpp b/modules/gapi/src/backends/common/serialization.cpp
index 2a71a782b0ad..6fe924e61ba5 100644
--- a/modules/gapi/src/backends/common/serialization.cpp
+++ b/modules/gapi/src/backends/common/serialization.cpp
@@ -8,9 +8,8 @@
 #include <map> // map
 #include <ade/util/zip_range.hpp> // indexed
 
-#define NOMINMAX
-
 #ifdef _WIN32
+#define NOMINMAX
 #include <winsock.h>      // htonl, ntohl
 #else
 #include <netinet/in.h>   // htonl, ntohl
diff --git a/modules/gapi/src/backends/common/serialization.hpp b/modules/gapi/src/backends/common/serialization.hpp
index a64805e25c94..384004c72553 100644
--- a/modules/gapi/src/backends/common/serialization.hpp
+++ b/modules/gapi/src/backends/common/serialization.hpp
@@ -195,7 +195,7 @@ class GAPI_EXPORTS ByteMemoryInStream final: public IIStream {
     size_t m_idx = 0u;
 
     void check(std::size_t n) { (void) n; GAPI_DbgAssert(m_idx+n-1 < m_storage.size()); }
-    uint32_t getU32() { uint32_t v{}; *this >> v; return v; };
+    uint32_t getU32() { uint32_t v{}; *this >> v; return v; }
 
     //virtual IIStream& operator>> (uint32_t &) final;
 
diff --git a/modules/gapi/src/backends/cpu/gcpubackend.cpp b/modules/gapi/src/backends/cpu/gcpubackend.cpp
index f50f8ecd2816..c6a35ead2753 100644
--- a/modules/gapi/src/backends/cpu/gcpubackend.cpp
+++ b/modules/gapi/src/backends/cpu/gcpubackend.cpp
@@ -65,6 +65,17 @@ namespace
         {
             return EPtr{new cv::gimpl::GCPUExecutable(graph, compileArgs, nodes)};
         }
+
+        virtual bool supportsConst(cv::GShape shape) const override
+        {
+            // Supports all types of const values
+            return shape == cv::GShape::GOPAQUE
+                || shape == cv::GShape::GSCALAR
+                || shape == cv::GShape::GARRAY;
+            // yes, value-initialized GMats are not supported currently
+            // as in-island data -- compiler will lift these values to the
+            // GIslandModel's SLOT level (will be handled uniformly)
+        }
    };
 }
 
diff --git a/modules/gapi/src/backends/cpu/gcpuot.cpp b/modules/gapi/src/backends/cpu/gcpuot.cpp
new file mode 100644
index 000000000000..abe7327be8c9
--- /dev/null
+++ b/modules/gapi/src/backends/cpu/gcpuot.cpp
@@ -0,0 +1,163 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#include <opencv2/gapi/ot.hpp>
+#include <opencv2/gapi/cpu/ot.hpp>
+#include <opencv2/gapi/cpu/gcpukernel.hpp>
+
+#include <vas/ot.hpp>
+
+namespace cv
+{
+namespace gapi
+{
+namespace ot
+{
+
+// Helper functions for OT kernels
+namespace {
+void GTrackImplSetup(cv::GArrayDesc, cv::GArrayDesc, float,
+                     std::shared_ptr<vas::ot::ObjectTracker>& state,
+                     const ObjectTrackerParams& params) {
+    vas::ot::ObjectTracker::Builder ot_builder;
+    ot_builder.max_num_objects = params.max_num_objects;
+    ot_builder.input_image_format = vas::ColorFormat(params.input_image_format);
+    ot_builder.tracking_per_class = params.tracking_per_class;
+
+    state = ot_builder.Build(vas::ot::TrackingType::ZERO_TERM_IMAGELESS);
+}
+
+void GTrackImplPrepare(const std::vector<cv::Rect>& in_rects,
+                       const std::vector<int32_t>& in_class_labels,
+                       float delta,
+                       std::vector<vas::ot::DetectedObject>& detected_objs,
+                       vas::ot::ObjectTracker& state)
+{
+    if (in_rects.size() != in_class_labels.size())
+    {
+        cv::util::throw_error(std::invalid_argument("Track() implementation run() method: in_rects and in_class_labels "
+                                                    "sizes are different."));
+    }
+
+    detected_objs.reserve(in_rects.size());
+
+    for (std::size_t i = 0; i < in_rects.size(); ++i)
+    {
+        detected_objs.emplace_back(in_rects[i], in_class_labels[i]);
+    }
+
+    state.SetFrameDeltaTime(delta);
+}
+} // anonymous namespace
+
+GAPI_OCV_KERNEL_ST(GTrackFromMatImpl, cv::gapi::ot::GTrackFromMat, vas::ot::ObjectTracker)
+{
+    static void setup(cv::GMatDesc, cv::GArrayDesc rects_desc,
+                      cv::GArrayDesc labels_desc, float delta,
+                      std::shared_ptr<vas::ot::ObjectTracker>& state,
+                      const cv::GCompileArgs& compile_args)
+    {
+        auto params = cv::gapi::getCompileArg<ObjectTrackerParams>(compile_args)
+            .value_or(ObjectTrackerParams{});
+
+        GAPI_Assert(params.input_image_format == 0 && "Only BGR input as cv::GMat is supported for now");
+        GTrackImplSetup(rects_desc, labels_desc, delta, state, params);
+    }
+
+    static void run(const cv::Mat& in_mat, const std::vector<cv::Rect>& in_rects,
+                    const std::vector<int32_t>& in_class_labels, float delta,
+                    std::vector<cv::Rect>& out_tr_rects,
+                    std::vector<int32_t>& out_rects_classes,
+                    std::vector<uint64_t>& out_tr_ids,
+                    std::vector<int>& out_tr_statuses,
+                    vas::ot::ObjectTracker& state)
+    {
+        std::vector<vas::ot::DetectedObject> detected_objs;
+        GTrackImplPrepare(in_rects, in_class_labels, delta, detected_objs, state);
+
+        GAPI_Assert(in_mat.type() == CV_8UC3 && "Input mat is not in BGR format");
+
+        auto objects = state.Track(in_mat, detected_objs);
+
+        for (auto&& object : objects)
+        {
+            out_tr_rects.push_back(object.rect);
+            out_rects_classes.push_back(object.class_label);
+            out_tr_ids.push_back(object.tracking_id);
+            out_tr_statuses.push_back(static_cast<int>(object.status));
+        }
+    }
+};
+
+GAPI_OCV_KERNEL_ST(GTrackFromFrameImpl, cv::gapi::ot::GTrackFromFrame, vas::ot::ObjectTracker)
+{
+    static void setup(cv::GFrameDesc, cv::GArrayDesc rects_desc,
+                      cv::GArrayDesc labels_desc, float delta,
+                      std::shared_ptr<vas::ot::ObjectTracker>& state,
+                      const cv::GCompileArgs& compile_args)
+    {
+        auto params = cv::gapi::getCompileArg<ObjectTrackerParams>(compile_args)
+            .value_or(ObjectTrackerParams{});
+
+        GAPI_Assert(params.input_image_format == 1 && "Only NV12 input as cv::GFrame is supported for now");
+        GTrackImplSetup(rects_desc, labels_desc, delta, state, params);
+    }
+
+    static void run(const cv::MediaFrame& in_frame, const std::vector<cv::Rect>& in_rects,
+                    const std::vector<int32_t>& in_class_labels, float delta,
+                    std::vector<cv::Rect>& out_tr_rects,
+                    std::vector<int32_t>& out_rects_classes,
+                    std::vector<uint64_t>& out_tr_ids,
+                    std::vector<int>& out_tr_statuses,
+                    vas::ot::ObjectTracker& state)
+    {
+        std::vector<vas::ot::DetectedObject> detected_objs;
+        GTrackImplPrepare(in_rects, in_class_labels, delta, detected_objs, state);
+
+        // Extract metadata from MediaFrame and construct cv::Mat atop of it
+        cv::MediaFrame::View view = in_frame.access(cv::MediaFrame::Access::R);
+        auto ptrs = view.ptr;
+        auto strides = view.stride;
+        auto desc = in_frame.desc();
+
+        GAPI_Assert((desc.fmt == cv::MediaFormat::NV12 || desc.fmt == cv::MediaFormat::BGR) \
+                    && "Input frame is not in NV12 or BGR format");
+
+        cv::Mat in;
+        if (desc.fmt == cv::MediaFormat::NV12) {
+            GAPI_Assert(ptrs[0] != nullptr && "Y plane pointer is empty");
+            GAPI_Assert(ptrs[1] != nullptr && "UV plane pointer is empty");
+            if (strides[0] > 0) {
+                in = cv::Mat(desc.size, CV_8UC1, ptrs[0], strides[0]);
+            } else {
+                in = cv::Mat(desc.size, CV_8UC1, ptrs[0]);
+            }
+        }
+
+        auto objects = state.Track(in, detected_objs);
+
+        for (auto&& object : objects)
+        {
+            out_tr_rects.push_back(object.rect);
+            out_rects_classes.push_back(object.class_label);
+            out_tr_ids.push_back(object.tracking_id);
+            out_tr_statuses.push_back(static_cast<int>(object.status));
+        }
+    }
+};
+
+cv::gapi::GKernelPackage cpu::kernels()
+{
+    return cv::gapi::kernels
+        <
+          GTrackFromFrameImpl,
+          GTrackFromMatImpl
+        >();
+}
+
+}   // namespace ot
+}   // namespace gapi
+}   // namespace cv
diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
index c2686c7bd34a..50615b26523d 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -13,7 +13,7 @@
 #include <opencv2/core/hal/hal.hpp>
 #include <opencv2/core/hal/intrin.hpp>
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 #include "gfluidcore_func.hpp"
 #endif
 
@@ -113,7 +113,7 @@ static inline DST divr(SRC1 x, SRC2 y, float scale=1)
 // Fluid kernels: addWeighted
 //
 //---------------------------
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 CV_ALWAYS_INLINE v_float32 v_load_f32(const ushort* in)
 {
     return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in)));
@@ -150,8 +150,8 @@ CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], DST out[],
                   ((std::is_same<SRC, short>::value) && (std::is_same<DST, short>::value)),
                   "This templated overload is only for short and ushort type combinations.");
 
-    constexpr int nlanes = (std::is_same<DST, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
-                                                                static_cast<int>(v_int16::nlanes);
+    const int nlanes = (std::is_same<DST, ushort>::value) ? static_cast<int>(VTraits<v_uint16>::vlanes()) :
+                                                                static_cast<int>(VTraits<v_int16>::vlanes());
 
     if (length < nlanes)
         return 0;
@@ -189,7 +189,7 @@ CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], uchar out[],
                                const float _alpha, const float _beta,
                                const float _gamma, int length)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     if (length < nlanes)
         return 0;
@@ -298,7 +298,7 @@ GAPI_FLUID_KERNEL(GFluidAddW, cv::gapi::core::GAddW, false)
 
 enum Arithm { ARITHM_ABSDIFF, ARITHM_ADD, ARITHM_SUBTRACT, ARITHM_MULTIPLY, ARITHM_DIVIDE };
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 CV_ALWAYS_INLINE void absdiff_store(short out[], const v_int16& a, const v_int16& b, int x)
 {
     vx_store(&out[x], v_absdiffs(a, b));
@@ -322,7 +322,7 @@ CV_ALWAYS_INLINE void absdiff_store(float out[], const v_float32& a, const v_flo
 template<typename T, typename VT>
 CV_ALWAYS_INLINE int absdiff_impl(const T in1[], const T in2[], T out[], int length)
 {
-    constexpr int nlanes = static_cast<int>(VT::nlanes);
+    const int nlanes = static_cast<int>(VTraits<VT>::vlanes());
 
     if (length < nlanes)
         return 0;
@@ -403,7 +403,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
     {
         case ARITHM_ADD:
         {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             x = add_simd(in1, in2, out, length);
 #endif
             for (; x < length; ++x)
@@ -412,7 +412,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
         }
         case ARITHM_SUBTRACT:
         {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             x = sub_simd(in1, in2, out, length);
 #endif
             for (; x < length; ++x)
@@ -421,7 +421,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
         }
         case ARITHM_MULTIPLY:
         {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             x = mul_simd(in1, in2, out, length, scale);
 #endif
             for (; x < length; ++x)
@@ -430,7 +430,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
         }
         case ARITHM_DIVIDE:
         {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             x = div_simd(in1, in2, out, length, scale);
 #endif
             for (; x < length; ++x)
@@ -569,7 +569,7 @@ static void run_absdiff(Buffer &dst, const View &src1, const View &src2)
 
     int x = 0;
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     x = absdiff_simd(in1, in2, out, length);
 #endif
     for (; x < length; ++x)
@@ -660,7 +660,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
     case ARITHM_ADD:
     {
         int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         w = addc_simd(in, scalar, out, length, chan);
 #endif
         for (; w < length; ++w)
@@ -671,7 +671,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
     case ARITHM_SUBTRACT:
     {
         int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         w = subc_simd(in, scalar, out, length, chan);
 #endif
         for (; w < length; ++w)
@@ -681,7 +681,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
     case ARITHM_MULTIPLY:
     {
         int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         w = mulc_simd(in, scalar, out, length, chan, scale);
 #endif
         for (; w < width; ++w)
@@ -709,7 +709,7 @@ CV_ALWAYS_INLINE void run_arithm_rs(Buffer &dst, const View &src, const float sc
     case ARITHM_SUBTRACT:
     {
         int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         w = subrc_simd(scalar, in, out, length, chan);
 #endif
         for (; w < length; ++w)
@@ -721,7 +721,7 @@ CV_ALWAYS_INLINE void run_arithm_rs(Buffer &dst, const View &src, const float sc
     case ARITHM_DIVIDE:
     {
         int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         w = divrc_simd(scalar, in, out, length, chan, scale);
 #endif
         for (; w < length; ++w)
@@ -744,7 +744,7 @@ CV_ALWAYS_INLINE void setScratchSize(Buffer& scratch, const int buflen)
 
 CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch)
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     // 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector.
     constexpr int maxNlanes = 16;
 
@@ -783,7 +783,7 @@ CV_ALWAYS_INLINE void run_absdiffc(Buffer& dst, const View& src, const float sca
     const int length = width * chan;
 
     int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     w = absdiffc_simd(in, scalar, out, length, chan);
 #endif
 
@@ -1076,7 +1076,7 @@ CV_ALWAYS_INLINE void run_divc(Buffer& dst, const View& src, Buffer& scratch,
     const int length = width * chan;
 
     int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int scratch_length = scratch.length();
     int indicator_offset = scratch_length - 1;
     const int set_mask_indicator = static_cast<int>(*(scratch.OutLine<float>() + (indicator_offset)));
@@ -1143,7 +1143,7 @@ GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, true)
 
     static void initScratch(const GMatDesc&, const GScalarDesc&, double, int, Buffer& scratch)
     {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             // 512 bits / 32 bits = 16 elements of float32 a AVX512 SIMD vector can contain.
             constexpr int maxNlanes = 16;
 
@@ -1565,7 +1565,7 @@ template<typename SRC, typename DST>
 CV_ALWAYS_INLINE void convertto_impl(const SRC in[], DST out[], const int length)
 {
     int x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     x = convertto_simd(in, out, length);
 #endif
     // tail of SIMD cycle
@@ -1580,7 +1580,7 @@ CV_ALWAYS_INLINE void convertto_impl(const SRC *in, DST* out, const float alpha,
                                      const int length)
 {
     int x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     x = convertto_scaled_simd(in, out, alpha, beta, length);
 #endif
 
@@ -2096,9 +2096,7 @@ static void run_inrange3(uchar out[], const uchar in[], int width,
         v_load_deinterleave(&in[3*w], i0, i1, i2);
 
         v_uint8x16 o;
-        o = (i0 >= v_setall_u8(lower[0])) & (i0 <= v_setall_u8(upper[0])) &
-            (i1 >= v_setall_u8(lower[1])) & (i1 <= v_setall_u8(upper[1])) &
-            (i2 >= v_setall_u8(lower[2])) & (i2 <= v_setall_u8(upper[2]));
+        o = v_and(v_and(v_and(v_and(v_and(v_ge(i0, v_setall_u8(lower[0])), v_le(i0, v_setall_u8(upper[0]))), v_ge(i1, v_setall_u8(lower[1]))), v_le(i1, v_setall_u8(upper[1]))), v_ge(i2, v_setall_u8(lower[2]))), v_le(i2, v_setall_u8(upper[2])));
 
         v_store(&out[w], o);
     }
@@ -2226,7 +2224,7 @@ static void run_select_row3(int width, uchar out[], uchar in1[], uchar in2[], uc
         v_load_deinterleave(&in2[3*w], a2, b2, c2);
 
         mask = v_load(&in3[w]);
-        mask = mask != v_setzero_u8();
+        mask = v_ne(mask, v_setzero_u8());
 
         a = v_select(mask, a1, a2);
         b = v_select(mask, b1, b2);
@@ -2332,7 +2330,7 @@ GAPI_FLUID_KERNEL(GFluidSplit3, cv::gapi::core::GSplit3, false)
         int width = src.length();
         int w = 0;
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         w = split3_simd(in, out1, out2, out3, width);
 #endif
 
@@ -2364,7 +2362,7 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
         int width = src.length();
         int w = 0;
 
-    #if CV_SIMD
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
         w = split4_simd(in, out1, out2, out3, out4, width);
     #endif
 
@@ -2389,7 +2387,7 @@ CV_ALWAYS_INLINE void run_merge3(Buffer& dst, const View& src1, const View& src2
     int width = dst.length();
     int w = 0;
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         w = merge3_simd(in1, in2, in3, out, width);
 #endif
 
@@ -2442,7 +2440,7 @@ GAPI_FLUID_KERNEL(GFluidMerge4, cv::gapi::core::GMerge4, false)
 
         int w = 0; // cycle counter
 
-    #if CV_SIMD
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
         w = merge4_simd(in1, in2, in3, in4, out, width);
     #endif
 
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
index 05d34170249b..a0ef4b1479b0 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -7,7 +7,7 @@
 #if !defined(GAPI_STANDALONE)
 
 #include <opencv2/core/hal/intrin.hpp>
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 #include "gfluidcore_func.hpp"
 #include "gfluidcore_func.simd.hpp"
 
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
index 0511f4e09588..0186ea020ef4 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -6,7 +6,7 @@
 
 #pragma once
 
-#if !defined(GAPI_STANDALONE) && CV_SIMD
+#if !defined(GAPI_STANDALONE) && (CV_SIMD || CV_SIMD_SCALABLE)
 
 #include <opencv2/core.hpp>
 
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
index aed0ee97d864..6191e9ab05b4 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -402,22 +402,22 @@ CV_ALWAYS_INLINE v_float32 vg_load_f32(const uchar* in)
 
 CV_ALWAYS_INLINE v_float32 mul_op(scale_tag, const v_float32& a, const v_float32& b, const v_float32& scale)
 {
-    return (scale*a * b);
+    return (v_mul(v_mul(scale, a), b));
 }
 
 CV_ALWAYS_INLINE v_float32 mul_op(not_scale_tag, const v_float32& a, const v_float32& b, const v_float32&)
 {
-    return a * b;
+    return v_mul(a, b);
 }
 
 CV_ALWAYS_INLINE v_float32 div_op(scale_tag, const v_float32& a, const v_float32& div, const v_float32& scale)
 {
-    return (a*scale/div);
+    return (v_div(v_mul(a, scale), div));
 }
 
 CV_ALWAYS_INLINE v_float32 div_op(not_scale_tag, const v_float32& a, const v_float32& div, const v_float32&)
 {
-    return a / div;
+    return v_div(a, div);
 }
 
 CV_ALWAYS_INLINE void v_store_i16(short* dst, const v_int32& res1, const v_int32& res2)
@@ -433,13 +433,13 @@ CV_ALWAYS_INLINE void v_store_i16(ushort* dst, const v_int32& res1, const v_int3
 CV_ALWAYS_INLINE void v_store_select(short* dst, const v_int16& div, const v_int16& v_zero,
                                      const v_int32& res1, const v_int32& res2)
 {
-    vx_store(dst, v_select(div == v_zero, v_zero, v_pack(res1, res2)));
+    vx_store(dst, v_select(v_eq(div, v_zero), v_zero, v_pack(res1, res2)));
 }
 
 CV_ALWAYS_INLINE void v_store_select(ushort* dst, const v_int16& div, const v_int16& v_zero,
                                      const v_int32& res1, const v_int32& res2)
 {
-    vx_store(dst, v_select(v_reinterpret_as_u16(div == v_zero),
+    vx_store(dst, v_select(v_reinterpret_as_u16(v_eq(div, v_zero)),
                            v_reinterpret_as_u16(v_zero), v_pack_u(res1, res2)));
 }
 
@@ -451,7 +451,7 @@ void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
                    const v_float32& a3, const v_float32& a4, const uchar* in2x,
                    uchar* outx, const v_float32& v_scale, const v_int16& v_zero)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     v_int16 div1 = v_reinterpret_as_s16(vx_load_expand(in2x));
     v_int16 div2 = v_reinterpret_as_s16(vx_load_expand(&in2x[nlanes/2]));
@@ -466,8 +466,8 @@ void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
             sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)),
             sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale));
 
-    v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2));
-    v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4));
+    v_int16 res1 = v_select((v_eq(div1, v_zero)), v_zero, v_pack(sum1, sum2));
+    v_int16 res2 = v_select((v_eq(div2, v_zero)), v_zero, v_pack(sum3, sum4));
 
     vx_store(outx, v_pack_u(res1, res2));
 }
@@ -480,7 +480,7 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
               const v_float32& a3, const v_float32& a4, const SRC* in2x,
               uchar* outx, const v_float32& v_scale, const v_int16& v_zero)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     v_int16 div1 = v_reinterpret_as_s16(vx_load(in2x));
     v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2x[nlanes/2]));
@@ -495,8 +495,8 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
             sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)),
             sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale));
 
-    v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2));
-    v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4));
+    v_int16 res1 = v_select((v_eq(div1, v_zero)), v_zero, v_pack(sum1, sum2));
+    v_int16 res2 = v_select((v_eq(div2, v_zero)), v_zero, v_pack(sum3, sum4));
 
     vx_store(outx, v_pack_u(res1, res2));
 }
@@ -507,7 +507,7 @@ CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1,
                                     const v_float32& a4, const float* in2x, uchar* outx,
                                     const v_float32& v_scale, const v_float32& v_zero)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     v_float32 div1 = vg_load_f32(in2x);
     v_float32 div2 = vg_load_f32(&in2x[nlanes / 4]);
@@ -519,10 +519,10 @@ CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1,
     v_float32 r3 = div_op(s_tag, a3, div3, v_scale);
     v_float32 r4 = div_op(s_tag, a4, div4, v_scale);
 
-    v_float32 sel1 = v_select((div1 == v_zero), v_zero, r1);
-    v_float32 sel2 = v_select((div2 == v_zero), v_zero, r2);
-    v_float32 sel3 = v_select((div3 == v_zero), v_zero, r3);
-    v_float32 sel4 = v_select((div4 == v_zero), v_zero, r4);
+    v_float32 sel1 = v_select((v_eq(div1, v_zero)), v_zero, r1);
+    v_float32 sel2 = v_select((v_eq(div2, v_zero)), v_zero, r2);
+    v_float32 sel3 = v_select((v_eq(div3, v_zero)), v_zero, r3);
+    v_float32 sel4 = v_select((v_eq(div4, v_zero)), v_zero, r4);
 
     v_int32 res1 = v_round(sel1);
     v_int32 res2 = v_round(sel2);
@@ -536,7 +536,7 @@ template<typename scale_tag_t, typename SRC, typename Vtype>
 CV_ALWAYS_INLINE void div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, uchar* outx,
                               const v_float32& v_scale, const Vtype& v_zero)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     v_float32 a1 = vg_load_f32(in1x);
     v_float32 a2 = vg_load_f32(&in1x[nlanes / 4]);
@@ -595,7 +595,7 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
               const float* in2x, DST* outx, const v_float32& v_scale,
               const v_float32& v_zero)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     v_float32 fdiv1 = vg_load_f32(in2x);
     v_float32 fdiv2 = vg_load_f32(&in2x[nlanes / 2]);
@@ -603,8 +603,8 @@ div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
     v_float32 r1 = div_op(s_tag, a1, fdiv1, v_scale);
     v_float32 r2 = div_op(s_tag, a2, fdiv2, v_scale);
 
-    v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1));
-    v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2));
+    v_int32 res1 = v_round(v_select((v_eq(fdiv1, v_zero)), v_zero, r1));
+    v_int32 res2 = v_round(v_select((v_eq(fdiv2, v_zero)), v_zero, r2));
 
     v_store_i16(outx, res1, res2);
 }
@@ -616,7 +616,7 @@ typename std::enable_if<std::is_same<DST, short>::value ||
 div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, DST* outx,
         const v_float32& v_scale, const Vtype& v_zero)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     v_float32 a1 = vg_load_f32(in1x);
     v_float32 a2 = vg_load_f32(&in1x[nlanes / 2]);
@@ -648,12 +648,12 @@ template<typename scale_tag_t, typename SRC, typename DST>
 CV_ALWAYS_INLINE int div_simd_common(scale_tag_t s_tag, const SRC in1[], const SRC in2[],
                                      DST out[], const int length, float scale)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     if (length < nlanes)
         return 0;
 
-    const zero_vec_type_of_t<SRC> v_zero = vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
+    const zero_vec_type_of_t<SRC> v_zero = vx_setall<typename VTraits< zero_vec_type_of_t<SRC> >::lane_type>(0);
     v_float32 v_scale = vx_setall_f32(scale);
 
     int x = 0;
@@ -724,7 +724,7 @@ typename std::enable_if<(std::is_same<SRC, short>::value && std::is_same<DST, us
                         (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value), int>::type
 mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     if (length < nlanes)
         return 0;
@@ -769,7 +769,7 @@ typename std::enable_if<std::is_same<SRC, short>::value ||
                         std::is_same<SRC, ushort>::value, int>::type
 mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     if (length < nlanes)
         return 0;
@@ -824,7 +824,7 @@ template<typename scale_tag_t>
 CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[],
                              const int length, double _scale)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     if (length < nlanes)
         return 0;
@@ -869,7 +869,7 @@ typename std::enable_if<std::is_same<DST, short>::value ||
                         std::is_same<DST, ushort>::value, int>::type
 mul_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     if (length < nlanes)
         return 0;
@@ -914,7 +914,7 @@ typename std::enable_if<std::is_same<DST, short>::value ||
                         std::is_same<DST, ushort>::value, int>::type
 mul_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     if (length < nlanes)
         return 0;
@@ -954,7 +954,7 @@ template<typename scale_tag_t, typename SRC>
 CV_ALWAYS_INLINE int mul_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[],
                              const int length, double _scale)
 {
-    constexpr int nlanes = v_float32::nlanes;
+    const int nlanes = VTraits<v_float32>::vlanes();
 
     if (length < nlanes)
         return 0;
@@ -1049,7 +1049,7 @@ CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx,       const v_in
                                                    const v_int32& c4, const v_int32& c5,
                                                    const v_int32& c6)
 {
-    constexpr int nlanes = v_int16::nlanes;
+    const int nlanes = VTraits<v_int16>::vlanes();
     vx_store(outx,           v_pack(c1, c2));
     vx_store(&outx[nlanes],   v_pack(c3, c4));
     vx_store(&outx[2*nlanes], v_pack(c5, c6));
@@ -1060,7 +1060,7 @@ CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(ushort* outx,      const v_in
                                                    const v_int32& c4, const v_int32& c5,
                                                    const v_int32& c6)
 {
-    constexpr int nlanes = v_uint16::nlanes;
+    const int nlanes = VTraits<v_uint16>::vlanes();
     vx_store(outx,            v_pack_u(c1, c2));
     vx_store(&outx[nlanes],   v_pack_u(c3, c4));
     vx_store(&outx[2*nlanes], v_pack_u(c5, c6));
@@ -1068,37 +1068,37 @@ CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(ushort* outx,      const v_in
 
 CV_ALWAYS_INLINE v_float32 oper(add_tag, const v_float32& a, const v_float32& sc)
 {
-    return a + sc;
+    return v_add(a, sc);
 }
 
 CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const v_float32& sc)
 {
-    return a - sc;
+    return v_sub(a, sc);
 }
 
 CV_ALWAYS_INLINE v_float32 oper(subr_tag, const v_float32& a, const v_float32& sc)
 {
-    return sc - a;
+    return v_sub(sc, a);
 }
 
 CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc)
 {
-    return a * sc;
+    return v_mul(a, sc);
 }
 
 CV_ALWAYS_INLINE v_float32 oper_scaled(mul_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale)
 {
-    return v_scale * a * v_scalar;
+    return v_mul(v_mul(v_scale, a), v_scalar);
 }
 
 CV_ALWAYS_INLINE v_float32 oper(div_tag, const v_float32& a, const v_float32& sc)
 {
-    return a / sc;
+    return v_div(a, sc);
 }
 
 CV_ALWAYS_INLINE v_float32 oper_scaled(div_tag, const v_float32& a, const v_float32& v_scalar, const v_float32& v_scale)
 {
-    return a*v_scale / v_scalar;
+    return v_div(v_mul(a, v_scale), v_scalar);
 }
 
 CV_ALWAYS_INLINE v_float32 oper(absdiff_tag, const v_float32& a, const v_float32& sc)
@@ -1223,8 +1223,8 @@ CV_ALWAYS_INLINE int arithmOpScalar_simd_c3(oper_tag t, const SRC in[],
                                             const int length)
 {
     constexpr int chan = 3;
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
-    constexpr int lanes = chan * nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
+    const int lanes = chan * nlanes;
 
     if (length < lanes)
         return 0;
@@ -1263,7 +1263,7 @@ CV_ALWAYS_INLINE int arithmOpScalar_simd_common(oper_tag t, const SRC in[],
                                                 const float scalar[], DST out[],
                                                 const int length)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     if (length < nlanes)
         return 0;
@@ -1489,8 +1489,8 @@ CV_ALWAYS_INLINE int arithmOpScalarScaled_simd_c3(oper_tag op, const SRC in[],
                                                   const int length, const float scale)
 {
     constexpr int chan = 3;
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
-    constexpr int lanes = chan * nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
+    const int lanes = chan * nlanes;
 
     if (length < lanes)
         return 0;
@@ -1576,7 +1576,7 @@ CV_ALWAYS_INLINE int arithmOpScalarScaled_simd_common(oper_tag op, const SRC in[
                                                       const float scalar[], DST out[],
                                                       const int length, const float scale)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     if (length < nlanes)
         return 0;
@@ -1675,10 +1675,10 @@ divc_simd_common_impl(scale_tag_t s_tag, const SRC in[], DST out[],
                       const v_float32& v_scalar, const v_float32& v_scale,
                       const int length)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     v_float32 v_zero = vx_setzero_f32();
-    v_float32 v_mask = (v_scalar == v_zero);
+    v_float32 v_mask = (v_eq(v_scalar, v_zero));
 
     int x = 0;
     for (;;)
@@ -1709,10 +1709,10 @@ CV_ALWAYS_INLINE int divc_simd_common_impl(scale_tag_t s_tag, const SRC in[],
                                            uchar out[], const v_float32& v_scalar,
                                            const v_float32& v_scale, const int length)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     v_float32 v_zero = vx_setzero_f32();
-    v_float32 v_mask = (v_scalar == v_zero);
+    v_float32 v_mask = (v_eq(v_scalar, v_zero));
 
     int x = 0;
     for (;;)
@@ -1747,7 +1747,7 @@ CV_ALWAYS_INLINE int divc_simd_common_impl(scale_tag_t s_tag, const SRC in[],
                                            float out[], const v_float32& v_scalar,
                                            const v_float32& v_scale, const int length)
 {
-    constexpr int nlanes = v_float32::nlanes;
+    const int nlanes = VTraits<v_float32>::vlanes();
     int x = 0;
     for (;;)
     {
@@ -1774,7 +1774,7 @@ CV_ALWAYS_INLINE int divc_mask_simd_common(scale_tag_t tag, const SRC in[],
                                            const float scalar[], DST out[],
                                            const int length, const float scale)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     if (length < nlanes)
         return 0;
@@ -1796,9 +1796,9 @@ divc_simd_c3_impl(scale_tag_t s_tag, SRC in[], DST out[], const v_float32& s1,
                   const int nlanes, const int lanes)
 {
     v_float32 v_zero = vx_setzero_f32();
-    v_float32 v_mask1 = (s1 == v_zero);
-    v_float32 v_mask2 = (s2 == v_zero);
-    v_float32 v_mask3 = (s3 == v_zero);
+    v_float32 v_mask1 = (v_eq(s1, v_zero));
+    v_float32 v_mask2 = (v_eq(s2, v_zero));
+    v_float32 v_mask3 = (v_eq(s3, v_zero));
 
     int x = 0;
     for (;;)
@@ -1839,9 +1839,9 @@ CV_ALWAYS_INLINE int divc_simd_c3_impl(scale_tag_t s_tag, const SRC* in, uchar*
                                        const int length, const int nlanes, const int lanes)
 {
     v_float32 v_zero = vx_setzero_f32();
-    v_float32 v_mask1 = (s1 == v_zero);
-    v_float32 v_mask2 = (s2 == v_zero);
-    v_float32 v_mask3 = (s3 == v_zero);
+    v_float32 v_mask1 = (v_eq(s1, v_zero));
+    v_float32 v_mask2 = (v_eq(s2, v_zero));
+    v_float32 v_mask3 = (v_eq(s3, v_zero));
 
     int x = 0;
     for (;;)
@@ -1917,8 +1917,8 @@ CV_ALWAYS_INLINE int divc_mask_simd_c3(scale_tag_t s_tag, const SRC in[],
                                        const int length, const float scale)
 {
     constexpr int chan = 3;
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
-    constexpr int lanes = chan * nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
+    const int lanes = chan * nlanes;
 
     if (length < lanes)
         return 0;
@@ -2084,7 +2084,7 @@ CV_ALWAYS_INLINE int divrc_simd_common(scale_tag_t s_tag, const SRC in[],
                                        const float scalar[], DST out[],
                                        const int length, const float scale)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     if (length < nlanes)
         return 0;
@@ -2092,7 +2092,7 @@ CV_ALWAYS_INLINE int divrc_simd_common(scale_tag_t s_tag, const SRC in[],
     v_float32 v_scalar = vx_load(scalar);
     v_float32 v_scale = vx_setall_f32(scale);
     zero_vec_type_of_t<SRC> v_zero =
-                         vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
+                         vx_setall<typename VTraits<zero_vec_type_of_t<SRC>>::lane_type>(0);
 
     int x = 0;
     for (;;)
@@ -2121,7 +2121,7 @@ CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, uc
                                          const v_uint8& v_zero)
 {
     v_uint8 div = vx_load(inx);
-    v_uint8 v_mask = (div == v_zero);
+    v_uint8 v_mask = (v_eq(div, v_zero));
 
     v_uint16 div1 = v_expand_low(div);
     v_uint16 div2 = v_expand_high(div);
@@ -2147,13 +2147,13 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, uchar* outx,
                    const v_float32& s3, const v_float32& v_scale,
                    const v_int16& v_zero)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     v_int16 div1 = v_reinterpret_as_s16(vx_load(inx));
     v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2]));
 
-    v_int16 v_mask1 = (div1 == v_zero);
-    v_int16 v_mask2 = (div2 == v_zero);
+    v_int16 v_mask1 = (v_eq(div1, v_zero));
+    v_int16 v_mask2 = (v_eq(div2, v_zero));
 
     v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
     v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
@@ -2175,17 +2175,17 @@ CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, uc
                                          const v_float32& s3, const v_float32& v_scale,
                                          const v_float32& v_zero)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     v_float32 fdiv1 = vg_load_f32(inx);
     v_float32 fdiv2 = vg_load_f32(&inx[nlanes / 4]);
     v_float32 fdiv3 = vg_load_f32(&inx[nlanes / 2]);
     v_float32 fdiv4 = vg_load_f32(&inx[3 * nlanes / 4]);
 
-    v_float32 v_mask1 = (fdiv1 == v_zero);
-    v_float32 v_mask2 = (fdiv2 == v_zero);
-    v_float32 v_mask3 = (fdiv3 == v_zero);
-    v_float32 v_mask4 = (fdiv4 == v_zero);
+    v_float32 v_mask1 = (v_eq(fdiv1, v_zero));
+    v_float32 v_mask2 = (v_eq(fdiv2, v_zero));
+    v_float32 v_mask3 = (v_eq(fdiv3, v_zero));
+    v_float32 v_mask4 = (v_eq(fdiv4, v_zero));
 
     vx_store(outx,
              v_pack_u(v_pack(v_round(v_select(v_mask1, v_zero, div_op(s_tag, s1, fdiv1, v_scale))),
@@ -2202,7 +2202,7 @@ CV_ALWAYS_INLINE int divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], uchar
                                         const int length, const int nlanes, const int lanes)
 {
     univ_zero_vec_type_of_t<SRC> v_zero =
-        vx_setall<typename univ_zero_vec_type_of_t<SRC>::lane_type>(0);
+        vx_setall<typename VTraits<univ_zero_vec_type_of_t<SRC>>::lane_type>(0);
 
     int x = 0;
     for (;;)
@@ -2235,7 +2235,7 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, DST* outx,
                    const v_float32& s3, const v_float32& v_scale,
                    const v_int16& v_zero)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
     v_uint8 div = vx_load(inx);
 
     v_int16 div1 = v_reinterpret_as_s16(v_expand_low(div));
@@ -2268,7 +2268,7 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, DST* outx,
                    const v_float32& s3, const v_float32& v_scale,
                    const v_int16& v_zero)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     v_int16 div1 = v_reinterpret_as_s16(vx_load(inx));
     v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes]));
@@ -2298,7 +2298,7 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, DST* outx,
                    const v_float32& s3, const v_float32& v_scale,
                    const v_float32& v_zero)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     v_float32 fdiv1 = vg_load_f32(inx);
     v_float32 fdiv2 = vg_load_f32(&inx[nlanes/2]);
@@ -2307,12 +2307,12 @@ divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, DST* outx,
     v_float32 fdiv5 = vg_load_f32(&inx[2*nlanes]);
     v_float32 fdiv6 = vg_load_f32(&inx[5*nlanes/2]);
 
-    v_store_i16(outx, v_round(v_select(fdiv1 == v_zero, v_zero, div_op(s_tag, s1, fdiv1, v_scale))),
-                      v_round(v_select(fdiv2 == v_zero, v_zero, div_op(s_tag, s2, fdiv2, v_scale))));
-    v_store_i16(&outx[nlanes], v_round(v_select(fdiv3 == v_zero, v_zero, div_op(s_tag, s3, fdiv3, v_scale))),
-                               v_round(v_select(fdiv4 == v_zero, v_zero, div_op(s_tag, s1, fdiv4, v_scale))));
-    v_store_i16(&outx[2*nlanes], v_round(v_select(fdiv5 == v_zero, v_zero, div_op(s_tag, s2, fdiv5, v_scale))),
-                                 v_round(v_select(fdiv6 == v_zero, v_zero, div_op(s_tag, s3, fdiv6, v_scale))));
+    v_store_i16(outx, v_round(v_select(v_eq(fdiv1, v_zero), v_zero, div_op(s_tag, s1, fdiv1, v_scale))),
+                      v_round(v_select(v_eq(fdiv2, v_zero), v_zero, div_op(s_tag, s2, fdiv2, v_scale))));
+    v_store_i16(&outx[nlanes], v_round(v_select(v_eq(fdiv3, v_zero), v_zero, div_op(s_tag, s3, fdiv3, v_scale))),
+                               v_round(v_select(v_eq(fdiv4, v_zero), v_zero, div_op(s_tag, s1, fdiv4, v_scale))));
+    v_store_i16(&outx[2*nlanes], v_round(v_select(v_eq(fdiv5, v_zero), v_zero, div_op(s_tag, s2, fdiv5, v_scale))),
+                                 v_round(v_select(v_eq(fdiv6, v_zero), v_zero, div_op(s_tag, s3, fdiv6, v_scale))));
 }
 
 template<typename scale_tag_t, typename SRC, typename DST>
@@ -2325,7 +2325,7 @@ divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], DST out[], const v_float32
                    const int, const int lanes)
 {
     zero_vec_type_of_t<SRC> v_zero =
-        vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
+        vx_setall<typename VTraits<zero_vec_type_of_t<SRC>>::lane_type>(0);
 
     int x = 0;
     for (;;)
@@ -2385,8 +2385,8 @@ CV_ALWAYS_INLINE int divrc_simd_c3(scale_tag_t s_tag, const SRC in[],
                                    const int length, const float scale)
 {
     constexpr int chan = 3;
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
-    constexpr int lanes = chan * nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
+    const int lanes = chan * nlanes;
 
     if (length < lanes)
         return 0;
@@ -2473,7 +2473,7 @@ DIVRC_SIMD(float, float)
 int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[],
                 const int width)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
     if (width < nlanes)
         return 0;
 
@@ -2507,7 +2507,7 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[],
 int split4_simd(const uchar in[], uchar out1[], uchar out2[],
                 uchar out3[], uchar out4[], const int width)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
     if (width < nlanes)
         return 0;
 
@@ -2543,7 +2543,7 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[],
 int merge3_simd(const T in1[], const T in2[], const T in3[],        \
                 T out[], const int width)                           \
 {                                                                   \
-    constexpr int nlanes = vector_type_of_t<T>::nlanes;             \
+    const int nlanes = VTraits<vector_type_of_t<T>>::vlanes();      \
     if (width < nlanes)                                             \
         return 0;                                                   \
                                                                     \
@@ -2584,7 +2584,7 @@ MERGE3_SIMD(float)
 int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
                 const uchar in4[], uchar out[], const int width)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
     if (width < nlanes)
         return 0;
 
@@ -2618,13 +2618,13 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
 template<typename VT>
 CV_ALWAYS_INLINE VT oper(add_tag, const VT& a, const VT& b)
 {
-    return a + b;
+    return v_add(a, b);
 }
 
 template<typename VT>
 CV_ALWAYS_INLINE VT oper(sub_tag, const VT& a, const VT& b)
 {
-    return a - b;
+    return v_sub(a, b);
 }
 
 CV_ALWAYS_INLINE void pack_store_uchar(uchar* outx, const v_uint16& c1, const v_uint16& c2)
@@ -2653,7 +2653,7 @@ typename std::enable_if<std::is_same<SRC, short>::value ||
                         std::is_same<SRC, ushort>::value, void>::type
 arithmOp_simd_impl(oper_tag op, const SRC* in1x, const SRC* in2x, uchar* outx)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     vector_type_of_t<SRC> a1 = vx_load(in1x);
     vector_type_of_t<SRC> a2 = vx_load(&in1x[nlanes / 2]);
@@ -2667,7 +2667,7 @@ template<typename oper_tag>
 CV_ALWAYS_INLINE void arithmOp_simd_impl(oper_tag op, const float* in1x,
                                          const float* in2x, uchar* outx)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     v_float32 a1 = vx_load(in1x);
     v_float32 a2 = vx_load(&in1x[nlanes / 4]);
@@ -2709,7 +2709,7 @@ typename std::enable_if<std::is_same<DST, short>::value ||
                         std::is_same<DST, ushort>::value, void>::type
 arithmOp_simd_impl(oper_tag op, const float* in1x, const float* in2x, DST* outx)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
     v_float32 a1 = vx_load(in1x);
     v_float32 a2 = vx_load(&in1x[nlanes/2]);
     v_float32 b1 = vx_load(in2x);
@@ -2761,7 +2761,7 @@ template<typename oper_tag, typename SRC, typename DST>
 CV_ALWAYS_INLINE int arithmOp_simd(oper_tag op, const SRC in1[], const SRC in2[],
                                    DST out[], const int length)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     if (length < nlanes)
         return 0;
@@ -2869,7 +2869,7 @@ CV_ALWAYS_INLINE void store_i16(short* outx, const v_int16& res)
 
 CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const float* inx, uchar* outx)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     v_int32 a1 = v_round(vx_load(inx));
     v_int32 a2 = v_round(vx_load(&inx[nlanes/4]));
@@ -2887,7 +2887,7 @@ CV_ALWAYS_INLINE
 typename std::enable_if<SRC_SHORT_OR_USHORT, void>::type
 convertto_simd_nocoeff_impl(const SRC* inx, uchar* outx)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     vector_type_of_t<SRC> a1 = vx_load(inx);
     vector_type_of_t<SRC> a2 = vx_load(&inx[nlanes/2]);
@@ -2902,7 +2902,7 @@ CV_ALWAYS_INLINE
 typename std::enable_if<DST_SHORT_OR_USHORT, void>::type
 convertto_simd_nocoeff_impl(const float* inx, DST* outx)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     v_int32 a1 = v_round(vx_load(inx));
     v_int32 a2 = v_round(vx_load(&inx[nlanes/2]));
@@ -2942,7 +2942,7 @@ CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const SRC* inx, float* outx)
 #define CONVERTTO_NOCOEF_SIMD(SRC, DST)                            \
 int convertto_simd(const SRC in[], DST out[], const int length)    \
 {                                                                  \
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;          \
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();   \
     if (length < nlanes)                                           \
         return 0;                                                  \
                                                                    \
@@ -2982,7 +2982,7 @@ CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const float* inx, uchar* outx,
                                                  const v_float32& v_alpha,
                                                  const v_float32& v_beta)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     v_float32 a1 = vx_load(inx);
     v_float32 a2 = vx_load(&inx[nlanes / 4]);
@@ -3003,7 +3003,7 @@ typename std::enable_if<SRC_SHORT_OR_USHORT, void>::type
 convertto_scaled_simd_impl(const SRC* inx, uchar* outx, const v_float32& v_alpha,
                            const v_float32& v_beta)
 {
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     v_int16 a = v_reinterpret_as_s16(vx_load(inx));
     v_int16 b = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2]));
@@ -3050,7 +3050,7 @@ convertto_scaled_simd_impl(const float* inx, DST* outx,
                            const v_float32& v_alpha,
                            const v_float32& v_beta)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();
 
     v_float32 a1 = vx_load(inx);
     v_float32 a2 = vx_load(&inx[nlanes / 2]);
@@ -3111,7 +3111,7 @@ CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const SRC* inx, float* outx,
 int convertto_scaled_simd(const SRC in[], DST out[], const float alpha,     \
                           const float beta, const int length)               \
 {                                                                           \
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;                   \
+    const int nlanes = VTraits<vector_type_of_t<DST>>::vlanes();            \
     if (length < nlanes)                                                    \
         return 0;                                                           \
                                                                             \
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
index 9766cf7cc6f0..927f08d30a60 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
@@ -175,7 +175,7 @@ RUN_MEDBLUR3X3_IMPL( float)
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 template<typename SRC>
 static inline v_float32 vx_load_f32(const SRC* ptr)
 {
@@ -228,8 +228,8 @@ void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
     GAPI_Assert(rc + gc + bc <= unity);
     GAPI_Assert(rc + gc + bc >= USHRT_MAX);
 
-#if CV_SIMD
-    constexpr int nlanes = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int nlanes = VTraits<v_uint8>::vlanes();
     if (width >= nlanes)
     {
         for (int w=0; w < width; )
@@ -247,14 +247,8 @@ void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
 
                 v_uint16 y0, y1;
                 static const ushort half = 1 << 7; // Q0.8.8
-                y0 = (v_mul_hi(r0 << 8, vx_setall_u16(rc)) +
-                      v_mul_hi(g0 << 8, vx_setall_u16(gc)) +
-                      v_mul_hi(b0 << 8, vx_setall_u16(bc)) +
-                                        vx_setall_u16(half)) >> 8;
-                y1 = (v_mul_hi(r1 << 8, vx_setall_u16(rc)) +
-                      v_mul_hi(g1 << 8, vx_setall_u16(gc)) +
-                      v_mul_hi(b1 << 8, vx_setall_u16(bc)) +
-                                        vx_setall_u16(half)) >> 8;
+                y0 = v_shr<8>(v_add(v_add(v_add(v_mul_hi(v_shl<8>(r0), vx_setall_u16(rc)), v_mul_hi(v_shl<8>(g0), vx_setall_u16(gc))), v_mul_hi(v_shl<8>(b0), vx_setall_u16(bc))), vx_setall_u16(half)));
+                y1 = v_shr<8>(v_add(v_add(v_add(v_mul_hi(v_shl<8>(r1), vx_setall_u16(rc)), v_mul_hi(v_shl<8>(g1), vx_setall_u16(gc))), v_mul_hi(v_shl<8>(b1), vx_setall_u16(bc))), vx_setall_u16(half)));
 
                 v_uint8 y;
                 y = v_pack(y0, y1);
@@ -316,10 +310,10 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[],
             v_uint8x16 v_min_rgb = v_min(v_min(r, g), b);
             v_uint8x16 v_max_rgb = v_max(v_max(r, g), b);
 
-            v_uint8x16 v_diff = v_max_rgb - v_min_rgb;
+            v_uint8x16 v_diff = v_sub(v_max_rgb, v_min_rgb);
 
-            v_uint8x16 v_r_eq_max = (r == v_max_rgb);
-            v_uint8x16 v_g_eq_max = (g == v_max_rgb);
+            v_uint8x16 v_r_eq_max = (v_eq(r, v_max_rgb));
+            v_uint8x16 v_g_eq_max = (v_eq(g, v_max_rgb));
 
             v_uint8x16 v;
             // get V-ch
@@ -327,10 +321,10 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[],
 
             // divide v into 4x4 vectors because later int32 required
             v_uint32x4 v_idx[4];
-            v_idx[0] = v_reinterpret_as_u32(v & mask1);
-            v_idx[1] = v_reinterpret_as_u32(v & mask2) >> 8;
-            v_idx[2] = v_reinterpret_as_u32(v & mask3) >> 16;
-            v_idx[3] = v_reinterpret_as_u32(v & mask4) >> 24;
+            v_idx[0] = v_reinterpret_as_u32(v_and(v, mask1));
+            v_idx[1] = v_shr<8>(v_reinterpret_as_u32(v_and(v, mask2)));
+            v_idx[2] = v_shr<16>(v_reinterpret_as_u32(v_and(v, mask3)));
+            v_idx[3] = v_shr<24>(v_reinterpret_as_u32(v_and(v, mask4)));
 
             v_uint32x4 sv_elems_32[4];
             sv_elems_32[0] = v_reinterpret_as_u32(v_lut(sdiv_table, v_reinterpret_as_s32(v_idx[0])));
@@ -341,19 +335,19 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[],
             // divide and calculate s according to above feature
             v_uint32x4 ss[4];
 
-            v_uint32x4 v_add = v_setall_u32(1) << (hsv_shift - 1);
+            v_uint32x4 vadd = v_shl(v_setall_u32(1), (hsv_shift - 1));
 
             v_uint32x4 v_diff_exp[4];
-            v_diff_exp[0] = v_reinterpret_as_u32(v_reinterpret_as_u8(v_diff) & mask1);
-            v_diff_exp[1] = v_reinterpret_as_u32(v_reinterpret_as_u8(v_diff) & mask2) >> 8;
-            v_diff_exp[2] = v_reinterpret_as_u32(v_reinterpret_as_u8(v_diff) & mask3) >> 16;
-            v_diff_exp[3] = v_reinterpret_as_u32(v_reinterpret_as_u8(v_diff) & mask4) >> 24;
+            v_diff_exp[0] = v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask1));
+            v_diff_exp[1] = v_shr<8>(v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask2)));
+            v_diff_exp[2] = v_shr<16>(v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask3)));
+            v_diff_exp[3] = v_shr<24>(v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask4)));
 
             // s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
-            ss[0] = (v_diff_exp[0] * sv_elems_32[0] + v_add) >> hsv_shift;
-            ss[1] = (v_diff_exp[1] * sv_elems_32[1] + v_add) >> hsv_shift;
-            ss[2] = (v_diff_exp[2] * sv_elems_32[2] + v_add) >> hsv_shift;
-            ss[3] = (v_diff_exp[3] * sv_elems_32[3] + v_add) >> hsv_shift;
+            ss[0] = v_shr<hsv_shift>(v_add(v_mul(v_diff_exp[0], sv_elems_32[0]), vadd));
+            ss[1] = v_shr<hsv_shift>(v_add(v_mul(v_diff_exp[1], sv_elems_32[1]), vadd));
+            ss[2] = v_shr<hsv_shift>(v_add(v_mul(v_diff_exp[2], sv_elems_32[2]), vadd));
+            ss[3] = v_shr<hsv_shift>(v_add(v_mul(v_diff_exp[3], sv_elems_32[3]), vadd));
 
             // reconstruct order of S-ch
             v_uint32x4 zip[8];
@@ -412,18 +406,18 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[],
             // start computing H-ch
             //h = (_vr & (g - b)) + (~_vr & ((_vg & (b - r + 2 * diff)) + ((~_vg) & (r - g + 4 * diff))));
             v_int32x4 hh[4];
-            hh[0] = v_reinterpret_as_s32(v_select(e[0], v_reinterpret_as_s32(gg[0] - bb[0]),
-                                         v_select(p[0], v_reinterpret_as_s32(bb[0] - rr[0] + v_setall_u32(2) * vdd[0]),
-                                                        v_reinterpret_as_s32(rr[0] - gg[0] + v_setall_u32(4) * vdd[0]))));
-            hh[1] = v_reinterpret_as_s32(v_select(e[1], v_reinterpret_as_s32(gg[1] - bb[1]),
-                                         v_select(p[1], v_reinterpret_as_s32(bb[1] - rr[1] + v_setall_u32(2) * vdd[1]),
-                                                        v_reinterpret_as_s32(rr[1] - gg[1] + v_setall_u32(4) * vdd[1]))));
-            hh[2] = v_reinterpret_as_s32(v_select(e[2], v_reinterpret_as_s32(gg[2] - bb[2]),
-                                         v_select(p[2], v_reinterpret_as_s32(bb[2] - rr[2] + v_setall_u32(2) * vdd[2]),
-                                                        v_reinterpret_as_s32(rr[2] - gg[2] + v_setall_u32(4) * vdd[2]))));
-            hh[3] = v_reinterpret_as_s32(v_select(e[3], v_reinterpret_as_s32(gg[3] - bb[3]),
-                                         v_select(p[3], v_reinterpret_as_s32(bb[3] - rr[3] + v_setall_u32(2) * vdd[3]),
-                                                        v_reinterpret_as_s32(rr[3] - gg[3] + v_setall_u32(4) * vdd[3]))));
+            hh[0] = v_reinterpret_as_s32(v_select(e[0], v_reinterpret_as_s32(v_sub(gg[0], bb[0])),
+                                         v_select(p[0], v_reinterpret_as_s32(v_add(v_sub(bb[0], rr[0]), v_mul(v_setall_u32(2), vdd[0]))),
+                                                        v_reinterpret_as_s32(v_add(v_sub(rr[0], gg[0]), v_mul(v_setall_u32(4), vdd[0]))))));
+            hh[1] = v_reinterpret_as_s32(v_select(e[1], v_reinterpret_as_s32(v_sub(gg[1], bb[1])),
+                                         v_select(p[1], v_reinterpret_as_s32(v_add(v_sub(bb[1], rr[1]), v_mul(v_setall_u32(2), vdd[1]))),
+                                                        v_reinterpret_as_s32(v_add(v_sub(rr[1], gg[1]), v_mul(v_setall_u32(4), vdd[1]))))));
+            hh[2] = v_reinterpret_as_s32(v_select(e[2], v_reinterpret_as_s32(v_sub(gg[2], bb[2])),
+                                         v_select(p[2], v_reinterpret_as_s32(v_add(v_sub(bb[2], rr[2]), v_mul(v_setall_u32(2), vdd[2]))),
+                                                        v_reinterpret_as_s32(v_add(v_sub(rr[2], gg[2]), v_mul(v_setall_u32(4), vdd[2]))))));
+            hh[3] = v_reinterpret_as_s32(v_select(e[3], v_reinterpret_as_s32(v_sub(gg[3], bb[3])),
+                                         v_select(p[3], v_reinterpret_as_s32(v_add(v_sub(bb[3], rr[3]), v_mul(v_setall_u32(2), vdd[3]))),
+                                                        v_reinterpret_as_s32(v_add(v_sub(rr[3], gg[3]), v_mul(v_setall_u32(4), vdd[3]))))));
 
             //h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
             v_uint32x4 h_elems_32[4];
@@ -432,23 +426,23 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[],
             h_elems_32[2] = v_reinterpret_as_u32(v_lut(hdiv_table, v_reinterpret_as_s32(vdd[2])));
             h_elems_32[3] = v_reinterpret_as_u32(v_lut(hdiv_table, v_reinterpret_as_s32(vdd[3])));
 
-            hh[0] = (hh[0] * v_reinterpret_as_s32(h_elems_32[0]) + v_reinterpret_as_s32(v_add)) >> hsv_shift;
-            hh[1] = (hh[1] * v_reinterpret_as_s32(h_elems_32[1]) + v_reinterpret_as_s32(v_add)) >> hsv_shift;
-            hh[2] = (hh[2] * v_reinterpret_as_s32(h_elems_32[2]) + v_reinterpret_as_s32(v_add)) >> hsv_shift;
-            hh[3] = (hh[3] * v_reinterpret_as_s32(h_elems_32[3]) + v_reinterpret_as_s32(v_add)) >> hsv_shift;
+            hh[0] = v_shr(v_add(v_mul(hh[0], v_reinterpret_as_s32(h_elems_32[0])), v_reinterpret_as_s32(vadd)), hsv_shift);
+            hh[1] = v_shr(v_add(v_mul(hh[1], v_reinterpret_as_s32(h_elems_32[1])), v_reinterpret_as_s32(vadd)), hsv_shift);
+            hh[2] = v_shr(v_add(v_mul(hh[2], v_reinterpret_as_s32(h_elems_32[2])), v_reinterpret_as_s32(vadd)), hsv_shift);
+            hh[3] = v_shr(v_add(v_mul(hh[3], v_reinterpret_as_s32(h_elems_32[3])), v_reinterpret_as_s32(vadd)), hsv_shift);
 
             // check for negative H
             v_int32x4 v_h_less_0[4];
-            v_h_less_0[0] = (hh[0] < v_setall_s32(0));
-            v_h_less_0[1] = (hh[1] < v_setall_s32(0));
-            v_h_less_0[2] = (hh[2] < v_setall_s32(0));
-            v_h_less_0[3] = (hh[3] < v_setall_s32(0));
+            v_h_less_0[0] = (v_lt(hh[0], v_setall_s32(0)));
+            v_h_less_0[1] = (v_lt(hh[1], v_setall_s32(0)));
+            v_h_less_0[2] = (v_lt(hh[2], v_setall_s32(0)));
+            v_h_less_0[3] = (v_lt(hh[3], v_setall_s32(0)));
 
             v_int32x4 v_h_180[4];
-            v_h_180[0] = hh[0] + v_setall_s32(180);
-            v_h_180[1] = hh[1] + v_setall_s32(180);
-            v_h_180[2] = hh[2] + v_setall_s32(180);
-            v_h_180[3] = hh[3] + v_setall_s32(180);
+            v_h_180[0] = v_add(hh[0], v_setall_s32(180));
+            v_h_180[1] = v_add(hh[1], v_setall_s32(180));
+            v_h_180[2] = v_add(hh[2], v_setall_s32(180));
+            v_h_180[3] = v_add(hh[3], v_setall_s32(180));
 
             hh[0] = v_select(v_h_less_0[0], v_h_180[0], hh[0]);
             hh[1] = v_select(v_h_less_0[1], v_h_180[1], hh[1]);
@@ -534,7 +528,7 @@ void run_bayergr2rgb_bg_impl(uchar out[], const uchar **in, int width)
             // calculate b-channel
             v_expand(b2, l_1, r_1);
             v_expand(b2_offset, l_2, r_2);
-            v_uint8x16 b2_sum = v_rshr_pack<1>(l_1 + l_2, r_1 + r_2);
+            v_uint8x16 b2_sum = v_rshr_pack<1>(v_add(l_1, l_2), v_add(r_1, r_2));
 
             v_uint8x16 b_low, b_high;
             v_zip(b2_sum, b2_offset, b_low, b_high);
@@ -547,9 +541,9 @@ void run_bayergr2rgb_bg_impl(uchar out[], const uchar **in, int width)
             v_expand(r3_offset, l_4, r_4);
 
             v_uint8x16 r13offset_sum, r13_sum;
-            r13offset_sum = v_rshr_pack<2>(l_1 + l_2 + l_3 + l_4,
-                                           r_1 + r_2 + r_3 + r_4);
-            r13_sum = v_rshr_pack<1>(l_1 + l_3, r_1 + r_3);
+            r13offset_sum = v_rshr_pack<2>(v_add(v_add(v_add(l_1, l_2), l_3), l_4),
+                                           v_add(v_add(v_add(r_1, r_2), r_3), r_4));
+            r13_sum = v_rshr_pack<1>(v_add(l_1, l_3), v_add(r_1, r_3));
 
             v_uint8x16 r_low, r_high;
             v_zip(r13_sum, r13offset_sum, r_low, r_high);
@@ -561,8 +555,8 @@ void run_bayergr2rgb_bg_impl(uchar out[], const uchar **in, int width)
             v_expand(g2, l_3, r_3);
             v_expand(g2_offset, l_4, r_4);
 
-            v_uint8x16 g_out_sum = v_rshr_pack<2>(l_1 + l_2 + l_3 + l_4,
-                                                  r_1 + r_2 + r_3 + r_4);
+            v_uint8x16 g_out_sum = v_rshr_pack<2>(v_add(v_add(v_add(l_1, l_2), l_3), l_4),
+                                                  v_add(v_add(v_add(r_1, r_2), r_3), r_4));
 
             v_uint8x16 g_low, g_high;
             v_zip(g2, g_out_sum, g_low, g_high);
@@ -646,7 +640,7 @@ void run_bayergr2rgb_gr_impl(uchar out[], const uchar **in, int width)
             // calculate r-channel
             v_expand(r2, l_1, r_1);
             v_expand(r2_offset, l_2, r_2);
-            v_uint8x16 r2_sum = v_rshr_pack<1>(l_1 + l_2, r_1 + r_2);
+            v_uint8x16 r2_sum = v_rshr_pack<1>(v_add(l_1, l_2), v_add(r_1, r_2));
 
             v_uint8x16 r_low, r_high;
             v_zip(r2, r2_sum, r_low, r_high);
@@ -659,9 +653,9 @@ void run_bayergr2rgb_gr_impl(uchar out[], const uchar **in, int width)
             v_expand(b3_offset, l_4, r_4);
 
             v_uint8x16 b13offset_sum, b13_sum;
-            b13offset_sum = v_rshr_pack<2>(l_1 + l_2 + l_3 + l_4,
-                                           r_1 + r_2 + r_3 + r_4);
-            b13_sum = v_rshr_pack<1>(l_2 + l_4, r_2 + r_4);
+            b13offset_sum = v_rshr_pack<2>(v_add(v_add(v_add(l_1, l_2), l_3), l_4),
+                                           v_add(v_add(v_add(r_1, r_2), r_3), r_4));
+            b13_sum = v_rshr_pack<1>(v_add(l_2, l_4), v_add(r_2, r_4));
 
             v_uint8x16 b_low, b_high;
             v_zip(b13offset_sum, b13_sum, b_low, b_high);
@@ -673,8 +667,8 @@ void run_bayergr2rgb_gr_impl(uchar out[], const uchar **in, int width)
             v_expand(g2, l_3, r_3);
             v_expand(g2_offset, l_4, r_4);
 
-            v_uint8x16 g_out_sum = v_rshr_pack<2>(l_1 + l_2 + l_3 + l_4,
-                                                  r_1 + r_2 + r_3 + r_4);
+            v_uint8x16 g_out_sum = v_rshr_pack<2>(v_add(v_add(v_add(l_1, l_2), l_3), l_4),
+                                                  v_add(v_add(v_add(r_1, r_2), r_3), r_4));
 
             v_uint8x16 g_low, g_high;
             v_zip(g_out_sum, g2_offset, g_low, g_high);
@@ -749,8 +743,8 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef
 
     int w = 0;
 
-#if CV_SIMD
-    static const int nlanes = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    static const int nlanes = VTraits<v_uint8>::vlanes();
     for ( ; w <= width - nlanes; w += nlanes)
     {
         v_uint8 r, g, b;
@@ -761,20 +755,16 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef
         v_expand(g, _g0, _g1);
         v_expand(b, _b0, _b1);
 
-        _r0 = _r0 << 7;                         // Q0.9.7 un-signed
-        _r1 = _r1 << 7;
-        _g0 = _g0 << 7;
-        _g1 = _g1 << 7;
-        _b0 = _b0 << 7;
-        _b1 = _b1 << 7;
+        _r0 = v_shl<7>(_r0);                         // Q0.9.7 un-signed
+        _r1 = v_shl<7>(_r1);
+        _g0 = v_shl<7>(_g0);
+        _g1 = v_shl<7>(_g1);
+        _b0 = v_shl<7>(_b0);
+        _b1 = v_shl<7>(_b1);
 
         v_uint16 _y0, _y1;
-        _y0 = v_mul_hi(vx_setall_u16(c0), _r0)  // Q0.9.7
-            + v_mul_hi(vx_setall_u16(c1), _g0)
-            + v_mul_hi(vx_setall_u16(c2), _b0);
-        _y1 = v_mul_hi(vx_setall_u16(c0), _r1)
-            + v_mul_hi(vx_setall_u16(c1), _g1)
-            + v_mul_hi(vx_setall_u16(c2), _b1);
+        _y0 = v_add(v_add(v_mul_hi(vx_setall_u16(c0), _r0), v_mul_hi(vx_setall_u16(c1), _g0)), v_mul_hi(vx_setall_u16(c2), _b0));
+        _y1 = v_add(v_add(v_mul_hi(vx_setall_u16(c0), _r1), v_mul_hi(vx_setall_u16(c1), _g1)), v_mul_hi(vx_setall_u16(c2), _b1));
 
         v_int16 r0, r1, b0, b1, y0, y1;
         r0 = v_reinterpret_as_s16(_r0);         // Q1.8.7 signed
@@ -785,18 +775,18 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef
         y1 = v_reinterpret_as_s16(_y1);
 
         v_int16 u0, u1, v0, v1;
-        u0 = v_mul_hi(vx_setall_s16(c3), b0 - y0);  // Q1.12.3
-        u1 = v_mul_hi(vx_setall_s16(c3), b1 - y1);
-        v0 = v_mul_hi(vx_setall_s16(c4), r0 - y0);
-        v1 = v_mul_hi(vx_setall_s16(c4), r1 - y1);
+        u0 = v_mul_hi(vx_setall_s16(c3), v_sub(b0, y0));  // Q1.12.3
+        u1 = v_mul_hi(vx_setall_s16(c3), v_sub(b1, y1));
+        v0 = v_mul_hi(vx_setall_s16(c4), v_sub(r0, y0));
+        v1 = v_mul_hi(vx_setall_s16(c4), v_sub(r1, y1));
 
         v_uint8 y, u, v;
-        y = v_pack((_y0 + vx_setall_u16(1 << 6)) >> 7,
-                   (_y1 + vx_setall_u16(1 << 6)) >> 7);
-        u = v_pack_u((u0 + vx_setall_s16(257 << 2)) >> 3,  // 257 << 2 = 128.5 * (1 << 3)
-                     (u1 + vx_setall_s16(257 << 2)) >> 3);
-        v = v_pack_u((v0 + vx_setall_s16(257 << 2)) >> 3,
-                     (v1 + vx_setall_s16(257 << 2)) >> 3);
+        y = v_pack(v_shr<7>(v_add(_y0, vx_setall_u16(1 << 6))),
+                   v_shr<7>(v_add(_y1, vx_setall_u16(1 << 6))));
+        u = v_pack_u(v_shr<3>(v_add(u0, vx_setall_s16(257 << 2))),  // 257 << 2 = 128.5 * (1 << 3)
+                     v_shr<3>(v_add(u1, vx_setall_s16(257 << 2))));
+        v = v_pack_u(v_shr<3>(v_add(v0, vx_setall_s16(257 << 2))),
+                     v_shr<3>(v_add(v1, vx_setall_s16(257 << 2))));
 
         v_store_interleave(&out[3*w], y, u, v);
     }
@@ -825,8 +815,8 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef
 
     int w = 0;
 
-#if CV_SIMD
-    static const int nlanes = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    static const int nlanes = VTraits<v_uint8>::vlanes();
     for ( ; w <= width - nlanes; w += nlanes)
     {
         v_uint8 y, u, v;
@@ -845,30 +835,28 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef
         v0 = v_reinterpret_as_s16(_v0);
         v1 = v_reinterpret_as_s16(_v1);
 
-        y0 =  y0 << 3;                              // Q1.12.3
-        y1 =  y1 << 3;
-        u0 = (u0 - vx_setall_s16(128)) << 7;        // Q1.8.7
-        u1 = (u1 - vx_setall_s16(128)) << 7;
-        v0 = (v0 - vx_setall_s16(128)) << 7;
-        v1 = (v1 - vx_setall_s16(128)) << 7;
+        y0 =  v_shl<3>(y0);                              // Q1.12.3
+        y1 =  v_shl<3>(y1);
+        u0 = v_shl<7>(v_sub(u0, vx_setall_s16(128)));        // Q1.8.7
+        u1 = v_shl<7>(v_sub(u1, vx_setall_s16(128)));
+        v0 = v_shl<7>(v_sub(v0, vx_setall_s16(128)));
+        v1 = v_shl<7>(v_sub(v1, vx_setall_s16(128)));
 
         v_int16 r0, r1, g0, g1, b0, b1;
-        r0 = y0 + v_mul_hi(vx_setall_s16(c0), v0);  // Q1.12.3
-        r1 = y1 + v_mul_hi(vx_setall_s16(c0), v1);
-        g0 = y0 + v_mul_hi(vx_setall_s16(c1), u0)
-                + v_mul_hi(vx_setall_s16(c2), v0);
-        g1 = y1 + v_mul_hi(vx_setall_s16(c1), u1)
-                + v_mul_hi(vx_setall_s16(c2), v1);
-        b0 = y0 + v_mul_hi(vx_setall_s16(c3), u0);
-        b1 = y1 + v_mul_hi(vx_setall_s16(c3), u1);
+        r0 = v_add(y0, v_mul_hi(vx_setall_s16(c0), v0));  // Q1.12.3
+        r1 = v_add(y1, v_mul_hi(vx_setall_s16(c0), v1));
+        g0 = v_add(v_add(y0, v_mul_hi(vx_setall_s16(c1), u0)), v_mul_hi(vx_setall_s16(c2), v0));
+        g1 = v_add(v_add(y1, v_mul_hi(vx_setall_s16(c1), u1)), v_mul_hi(vx_setall_s16(c2), v1));
+        b0 = v_add(y0, v_mul_hi(vx_setall_s16(c3), u0));
+        b1 = v_add(y1, v_mul_hi(vx_setall_s16(c3), u1));
 
         v_uint8 r, g, b;
-        r = v_pack_u((r0 + vx_setall_s16(1 << 2)) >> 3,
-                     (r1 + vx_setall_s16(1 << 2)) >> 3);
-        g = v_pack_u((g0 + vx_setall_s16(1 << 2)) >> 3,
-                     (g1 + vx_setall_s16(1 << 2)) >> 3);
-        b = v_pack_u((b0 + vx_setall_s16(1 << 2)) >> 3,
-                     (b1 + vx_setall_s16(1 << 2)) >> 3);
+        r = v_pack_u(v_shr<3>(v_add(r0, vx_setall_s16(1 << 2))),
+                     v_shr<3>(v_add(r1, vx_setall_s16(1 << 2))));
+        g = v_pack_u(v_shr<3>(v_add(g0, vx_setall_s16(1 << 2))),
+                     v_shr<3>(v_add(g1, vx_setall_s16(1 << 2))));
+        b = v_pack_u(v_shr<3>(v_add(b0, vx_setall_s16(1 << 2))),
+                     v_shr<3>(v_add(b1, vx_setall_s16(1 << 2))));
 
         v_store_interleave(&out[3*w], r, g, b);
     }
@@ -920,41 +908,37 @@ void run_rgb2yuv422_impl(uchar out[], const uchar in[], int width)
             v_expand(g, gg1, gg2);
             v_expand(b, bb1, bb2);
 
-            rr1 = rr1 << 7;
-            rr2 = rr2 << 7;
-            gg1 = gg1 << 7;
-            gg2 = gg2 << 7;
-            bb1 = bb1 << 7;
-            bb2 = bb2 << 7;
+            rr1 = v_shl<7>(rr1);
+            rr2 = v_shl<7>(rr2);
+            gg1 = v_shl<7>(gg1);
+            gg2 = v_shl<7>(gg2);
+            bb1 = v_shl<7>(bb1);
+            bb2 = v_shl<7>(bb2);
 
             v_uint16x8 yy1, yy2;
 
-            yy1 = v_mul_hi(v_setall_u16(c0), rr1) +
-                  v_mul_hi(v_setall_u16(c1), gg1) +
-                  v_mul_hi(v_setall_u16(c2), bb1);
+            yy1 = v_add(v_add(v_mul_hi(v_setall_u16(c0), rr1), v_mul_hi(v_setall_u16(c1), gg1)), v_mul_hi(v_setall_u16(c2), bb1));
 
-            yy2 = v_mul_hi(v_setall_u16(c0), rr2) +
-                  v_mul_hi(v_setall_u16(c1), gg2) +
-                  v_mul_hi(v_setall_u16(c2), bb2);
+            yy2 = v_add(v_add(v_mul_hi(v_setall_u16(c0), rr2), v_mul_hi(v_setall_u16(c1), gg2)), v_mul_hi(v_setall_u16(c2), bb2));
 
             v_int16x8 u1, u2, v1, v2;
 
-            u1 = v_mul_hi(v_setall_s16(c3), v_reinterpret_as_s16(bb1) - v_reinterpret_as_s16(yy1));
-            u2 = v_mul_hi(v_setall_s16(c3), v_reinterpret_as_s16(bb2) - v_reinterpret_as_s16(yy2));
-            v1 = v_mul_hi(v_setall_s16(c4), v_reinterpret_as_s16(rr1) - v_reinterpret_as_s16(yy1));
-            v2 = v_mul_hi(v_setall_s16(c4), v_reinterpret_as_s16(rr2) - v_reinterpret_as_s16(yy2));
+            u1 = v_mul_hi(v_setall_s16(c3), v_sub(v_reinterpret_as_s16(bb1), v_reinterpret_as_s16(yy1)));
+            u2 = v_mul_hi(v_setall_s16(c3), v_sub(v_reinterpret_as_s16(bb2), v_reinterpret_as_s16(yy2)));
+            v1 = v_mul_hi(v_setall_s16(c4), v_sub(v_reinterpret_as_s16(rr1), v_reinterpret_as_s16(yy1)));
+            v2 = v_mul_hi(v_setall_s16(c4), v_sub(v_reinterpret_as_s16(rr2), v_reinterpret_as_s16(yy2)));
 
-            y = v_pack((yy1 + v_setall_u16(1 << 6)) >> 7,
-                       (yy2 + v_setall_u16(1 << 6)) >> 7);
-            u = v_pack_u((u1 + v_setall_s16(257 << 2)) >> 3,
-                         (u2 + v_setall_s16(257 << 2)) >> 3);
-            v = v_pack_u((v1 + v_setall_s16(257 << 2)) >> 3,
-                         (v2 + v_setall_s16(257 << 2)) >> 3);
+            y = v_pack(v_shr<7>(v_add(yy1, v_setall_u16(1 << 6))),
+                       v_shr<7>(v_add(yy2, v_setall_u16(1 << 6))));
+            u = v_pack_u(v_shr<3>(v_add(u1, v_setall_s16(257 << 2))),
+                         v_shr<3>(v_add(u2, v_setall_s16(257 << 2))));
+            v = v_pack_u(v_shr<3>(v_add(v1, v_setall_s16(257 << 2))),
+                         v_shr<3>(v_add(v2, v_setall_s16(257 << 2))));
 
             uint8_t ff = 0xff;
             v_uint8x16 mask(ff, 0, ff, 0, ff, 0, ff, 0, ff, 0, ff, 0, ff, 0, ff, 0);
-            v_uint8x16 uu = u & mask;
-            v_uint8x16 vv = v & mask;
+            v_uint8x16 uu = v_and(u, mask);
+            v_uint8x16 vv = v_and(v, mask);
             // extract even u and v
             v_uint8x16 u_low = v_pack(v_reinterpret_as_u16(uu), v_reinterpret_as_u16(uu));
             v_uint8x16 v_low = v_pack(v_reinterpret_as_u16(vv), v_reinterpret_as_u16(vv));
@@ -1001,7 +985,7 @@ void run_rgb2yuv422_impl(uchar out[], const uchar in[], int width)
 //
 //-----------------------------
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 // this variant not using buf[] appears 15% faster than reference any-2-float code below
 template<bool noscale, typename SRC>
 static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, int chan,
@@ -1016,7 +1000,7 @@ static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width,
 
     for (int l=0; l < length; )
     {
-        static const int nlanes = v_float32::nlanes;
+        static const int nlanes = VTraits<v_float32>::vlanes();
 
         // main part
         for ( ; l <= length - nlanes; l += nlanes)
@@ -1026,7 +1010,7 @@ static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width,
                 v_float32 t0 = vx_load_f32(&i[l - shift]);
                 v_float32 t1 = vx_load_f32(&i[l        ]);
                 v_float32 t2 = vx_load_f32(&i[l + shift]);
-                v_float32 t = t0 * vx_setall_f32(kx0);
+                v_float32 t = v_mul(t0, vx_setall_f32(kx0));
                     t = v_fma(t1,  vx_setall_f32(kx1), t);
                     t = v_fma(t2,  vx_setall_f32(kx2), t);
                 return t;
@@ -1035,7 +1019,7 @@ static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width,
             v_float32 s0 = xsum(in[0]);
             v_float32 s1 = xsum(in[1]);
             v_float32 s2 = xsum(in[2]);
-            v_float32 s = s0 * vx_setall_f32(ky0);
+            v_float32 s = v_mul(s0, vx_setall_f32(ky0));
                 s = v_fma(s1,  vx_setall_f32(ky1), s);
                 s = v_fma(s2,  vx_setall_f32(ky2), s);
 
@@ -1097,16 +1081,16 @@ static void run_sepfilter3x3_any2short(DST out[], const SRC *in[], int width, in
 
     for (int l=0; l < length;)
     {
-        constexpr int nlanes = v_int16::nlanes;
+        const int nlanes = VTraits<v_int16>::vlanes();
 
         // main part of row
         for (; l <= length - nlanes; l += nlanes)
         {
-            v_float32 sum0 = vx_load(&buf[r0][l])            * vx_setall_f32(ky0);
+            v_float32 sum0 = v_mul(vx_load(&buf[r0][l]), vx_setall_f32(ky0));
                 sum0 = v_fma(vx_load(&buf[r1][l]),             vx_setall_f32(ky1), sum0);
                 sum0 = v_fma(vx_load(&buf[r2][l]),             vx_setall_f32(ky2), sum0);
 
-            v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
+            v_float32 sum1 = v_mul(vx_load(&buf[r0][l + nlanes / 2]), vx_setall_f32(ky0));
                 sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]),  vx_setall_f32(ky1), sum1);
                 sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]),  vx_setall_f32(ky2), sum1);
 
@@ -1181,24 +1165,24 @@ static void run_sepfilter3x3_any2char(uchar out[], const SRC *in[], int width, i
 
     for (int l=0; l < length;)
     {
-        constexpr int nlanes = v_uint8::nlanes;
+        const int nlanes = VTraits<v_uint8>::vlanes();
 
         // main part of row
         for (; l <= length - nlanes; l += nlanes)
         {
-            v_float32 sum0 = vx_load(&buf[r0][l])              * vx_setall_f32(ky0);
+            v_float32 sum0 = v_mul(vx_load(&buf[r0][l]), vx_setall_f32(ky0));
                 sum0 = v_fma(vx_load(&buf[r1][l]),               vx_setall_f32(ky1), sum0);
                 sum0 = v_fma(vx_load(&buf[r2][l]),               vx_setall_f32(ky2), sum0);
 
-            v_float32 sum1 = vx_load(&buf[r0][l +   nlanes/4]) * vx_setall_f32(ky0);
+            v_float32 sum1 = v_mul(vx_load(&buf[r0][l + nlanes / 4]), vx_setall_f32(ky0));
                 sum1 = v_fma(vx_load(&buf[r1][l +   nlanes/4]),  vx_setall_f32(ky1), sum1);
                 sum1 = v_fma(vx_load(&buf[r2][l +   nlanes/4]),  vx_setall_f32(ky2), sum1);
 
-            v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
+            v_float32 sum2 = v_mul(vx_load(&buf[r0][l + 2 * nlanes / 4]), vx_setall_f32(ky0));
                 sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]),  vx_setall_f32(ky1), sum2);
                 sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]),  vx_setall_f32(ky2), sum2);
 
-            v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
+            v_float32 sum3 = v_mul(vx_load(&buf[r0][l + 3 * nlanes / 4]), vx_setall_f32(ky0));
                 sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]),  vx_setall_f32(ky1), sum3);
                 sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]),  vx_setall_f32(ky2), sum3);
 
@@ -1284,7 +1268,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt
     {
         for (int l=0; l < length;)
         {
-            constexpr int nlanes = v_int16::nlanes;
+            const int nlanes = VTraits<v_int16>::vlanes();
 
             // main part of output row
             for (; l <= length - nlanes; l += nlanes)
@@ -1292,9 +1276,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt
                 v_uint16 t0 = vx_load_expand(&in[k][l - shift]);  // previous
                 v_uint16 t1 = vx_load_expand(&in[k][l        ]);  // current
                 v_uint16 t2 = vx_load_expand(&in[k][l + shift]);  // next pixel
-                v_int16 t = v_reinterpret_as_s16(t0) * vx_setall_s16(ikx0) +
-                            v_reinterpret_as_s16(t1) * vx_setall_s16(ikx1) +
-                            v_reinterpret_as_s16(t2) * vx_setall_s16(ikx2);
+                v_int16 t = v_add(v_add(v_mul(v_reinterpret_as_s16(t0), vx_setall_s16(ikx0)), v_mul(v_reinterpret_as_s16(t1), vx_setall_s16(ikx1))), v_mul(v_reinterpret_as_s16(t2), vx_setall_s16(ikx2)));
                 v_store(&ibuf[r[k]][l], t);
             }
 
@@ -1311,7 +1293,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt
 
     for (int l=0; l < length;)
     {
-        constexpr int nlanes = v_int16::nlanes;
+        const int nlanes = VTraits<v_int16>::vlanes();
 
         // main part of output row
         for (; l <= length - nlanes; l += nlanes)
@@ -1319,13 +1301,11 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt
             v_int16 s0 = vx_load(&ibuf[r[0]][l]);  // previous
             v_int16 s1 = vx_load(&ibuf[r[1]][l]);  // current
             v_int16 s2 = vx_load(&ibuf[r[2]][l]);  // next row
-            v_int16 s = s0 * vx_setall_s16(iky0) +
-                        s1 * vx_setall_s16(iky1) +
-                        s2 * vx_setall_s16(iky2);
+            v_int16 s = v_add(v_add(v_mul(s0, vx_setall_s16(iky0)), v_mul(s1, vx_setall_s16(iky1))), v_mul(s2, vx_setall_s16(iky2)));
 
             if (!noscale)
             {
-                s = v_mul_hi(s << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta);
+                s = v_add(v_mul_hi(v_shl<1>(s), vx_setall_s16(iscale)), vx_setall_s16(idelta));
             }
 
             v_store(&out[l], s);
@@ -1399,7 +1379,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha
                                   float scale, float delta,
                                   float *buf[], int y, int y0)
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int length = width * chan;
 
     // length variable may be unused if types do not match at 'if' statements below
@@ -1407,7 +1387,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha
 
 #if USE_SEPFILTER3X3_CHAR2SHORT
     if (std::is_same<DST, short>::value && std::is_same<SRC, uchar>::value &&
-        length >= v_int16::nlanes)
+        length >= VTraits<v_int16>::vlanes())
     {
         // only slightly faster than more generic any-to-short (see below)
         run_sepfilter3x3_char2short<noscale>(reinterpret_cast<short*>(out),
@@ -1419,7 +1399,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha
 #endif
 
     if (std::is_same<DST, float>::value && std::is_same<SRC, float>::value &&
-        length >= v_float32::nlanes)
+        length >= VTraits<v_float32>::vlanes())
     {
         // appears 15% faster than reference any-to-float code (called below)
         run_sepfilter3x3_any2float<noscale>(reinterpret_cast<float*>(out), in,
@@ -1427,7 +1407,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha
         return;
     }
 
-    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
+    if (std::is_same<DST, short>::value && length >= VTraits<v_int16>::vlanes())
     {
         // appears 10-40x faster than reference due to much faster rounding
         run_sepfilter3x3_any2short<noscale>(reinterpret_cast<short*>(out), in,
@@ -1436,7 +1416,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha
         return;
     }
 
-    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
+    if (std::is_same<DST, ushort>::value && length >= VTraits<v_uint16>::vlanes())
     {
         // appears 10-40x faster than reference due to much faster rounding
         run_sepfilter3x3_any2short<noscale>(reinterpret_cast<ushort*>(out), in,
@@ -1445,7 +1425,7 @@ static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int cha
         return;
     }
 
-    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
+    if (std::is_same<DST, uchar>::value && length >= VTraits<v_uint8>::vlanes())
     {
         // appears 10-40x faster than reference due to much faster rounding
         run_sepfilter3x3_any2char<noscale>(reinterpret_cast<uchar*>(out), in,
@@ -1499,7 +1479,7 @@ RUN_SEPFILTER3X3_IMPL(float, float)
 //
 //-----------------------------
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 // this code with manually vectored rounding to uchar
 template<bool noscale, typename SRC>
@@ -1549,17 +1529,17 @@ static void run_sepfilter5x5_any2char(uchar out[], const SRC *in[], int width, i
 
     // vertical pass
 
-    constexpr int nlanes = v_uint8::nlanes;
+    const int nlanes = VTraits<v_uint8>::vlanes();
 
     for (int l = 0; l < length;)
     {
         // main part of row
         for (; l <= length - nlanes; l += nlanes)
         {
-            v_float32 sum0 = vx_load(&buf[r[0]][l]) * vx_setall_f32(ky[0]);
-            v_float32 sum1 = vx_load(&buf[r[0]][l + nlanes / 4]) * vx_setall_f32(ky[0]);
-            v_float32 sum2 = vx_load(&buf[r[0]][l + 2 * nlanes / 4]) * vx_setall_f32(ky[0]);
-            v_float32 sum3 = vx_load(&buf[r[0]][l + 3 * nlanes / 4]) * vx_setall_f32(ky[0]);
+            v_float32 sum0 = v_mul(vx_load(&buf[r[0]][l]), vx_setall_f32(ky[0]));
+            v_float32 sum1 = v_mul(vx_load(&buf[r[0]][l + nlanes / 4]), vx_setall_f32(ky[0]));
+            v_float32 sum2 = v_mul(vx_load(&buf[r[0]][l + 2 * nlanes / 4]), vx_setall_f32(ky[0]));
+            v_float32 sum3 = v_mul(vx_load(&buf[r[0]][l + 3 * nlanes / 4]), vx_setall_f32(ky[0]));
 
             for (int n = 1; n < kyLen; ++n)
             {
@@ -1647,15 +1627,15 @@ static void run_sepfilter5x5_any2short(DST out[], const SRC *in[], int width, in
 
     // vertical pass
 
-    constexpr int nlanes = v_int16::nlanes;
+    const int nlanes = VTraits<v_int16>::vlanes();
     for (int l = 0; l < length;)
     {
         //GAPI_Assert(length >= nlanes);
         // main part of row
         for (; l <= length - nlanes; l += nlanes)
         {
-            v_float32 sum0 = vx_load(&buf[r[0]][l]) * vx_setall_f32(ky[0]);
-            v_float32 sum1 = vx_load(&buf[r[0]][l + nlanes / 2]) * vx_setall_f32(ky[0]);
+            v_float32 sum0 = v_mul(vx_load(&buf[r[0]][l]), vx_setall_f32(ky[0]));
+            v_float32 sum1 = v_mul(vx_load(&buf[r[0]][l + nlanes / 2]), vx_setall_f32(ky[0]));
 
             for (int j = 1; j < kyLen; ++j)
             {
@@ -1702,14 +1682,10 @@ static void run_sepfilter5x5_any2float(float out[], const SRC *in[], int width,
                                        const float kx[], const float ky[], int border,
                                        float scale, float delta)
 {
-    constexpr int kxLen = 5;
-    constexpr int kyLen = kxLen;
-    constexpr int buffSize = 5;
-
     const int length = width * chan;
     const int shift = chan;
 
-    static const int nlanes = v_float32::nlanes;
+    static const int nlanes = VTraits<v_float32>::vlanes();
     for (int l = 0; l < length; )
     {
         //GAPI_Assert(length >= nlanes);
@@ -1717,33 +1693,33 @@ static void run_sepfilter5x5_any2float(float out[], const SRC *in[], int width,
         for (; l <= length - nlanes; l += nlanes)
         {
             auto xsum = [l, border, shift, kx](const SRC inp[])
-            {
-                v_float32 t[5];
-                for (int i = 0; i < 5; ++i)
-                {
-                    t[i] = vx_load_f32(&inp[l + (i - border)*shift]);
-                }
-
-                v_float32 sum = t[0] * vx_setall_f32(kx[0]);
-                for (int j = 1; j < 5; ++j)
-                {
-                    sum = v_fma(t[j], vx_setall_f32(kx[j]), sum);
-                }
+            { //buffSize = 5
+                v_float32 t0 = vx_load_f32(&inp[l + (0 - border)*shift]);
+                v_float32 t1 = vx_load_f32(&inp[l + (1 - border)*shift]);
+                v_float32 t2 = vx_load_f32(&inp[l + (2 - border)*shift]);
+                v_float32 t3 = vx_load_f32(&inp[l + (3 - border)*shift]);
+                v_float32 t4 = vx_load_f32(&inp[l + (4 - border)*shift]);
+
+                v_float32 sum = v_mul(t0, vx_setall_f32(kx[0]));
+                sum = v_fma(t1, vx_setall_f32(kx[1]), sum);
+                sum = v_fma(t2, vx_setall_f32(kx[2]), sum);
+                sum = v_fma(t3, vx_setall_f32(kx[3]), sum);
+                sum = v_fma(t4, vx_setall_f32(kx[4]), sum);
 
                 return sum;
             };
 
-            v_float32 s[buffSize];
-            for (int m = 0; m < buffSize; ++m)
-            {
-                s[m] = xsum(in[m]);
-            }
+            v_float32 s0 = xsum(in[0]);
+            v_float32 s1 = xsum(in[1]);
+            v_float32 s2 = xsum(in[2]);
+            v_float32 s3 = xsum(in[3]);
+            v_float32 s4 = xsum(in[4]);
 
-            v_float32 sum = s[0] * vx_setall_f32(ky[0]);
-            for (int n = 1; n < kyLen; ++n)
-            {
-                sum = v_fma(s[n], vx_setall_f32(ky[n]), sum);
-            }
+            v_float32 sum = v_mul(s0, vx_setall_f32(ky[0]));
+            sum = v_fma(s1, vx_setall_f32(ky[1]), sum);
+            sum = v_fma(s2, vx_setall_f32(ky[2]), sum);
+            sum = v_fma(s3, vx_setall_f32(ky[3]), sum);
+            sum = v_fma(s4, vx_setall_f32(ky[4]), sum);
 
             if (!noscale)
             {
@@ -1819,7 +1795,7 @@ static void run_sepfilter5x5_char2short(short out[], const uchar *in[], int widt
     // this kernel (Fluid does rows consequently: y=y0, y0+1, ...)
     int k0 = (y == y0) ? 0 : 4;
 
-    constexpr int nlanes = v_int16::nlanes;
+    const int nlanes = VTraits<v_int16>::vlanes();
 
     for (int k = k0; k < kyLen; ++k)
     {
@@ -1830,16 +1806,18 @@ static void run_sepfilter5x5_char2short(short out[], const uchar *in[], int widt
             // main part of output row
             for (; l <= length - nlanes; l += nlanes)
             {
-                v_uint16 t[kxLen];
                 v_int16 sum = vx_setzero_s16();
 
-                for (int i = 0; i < kxLen; ++i)
-                {
-                    // previous, current, next pixels
-                    t[i] = vx_load_expand(&in[k][l + (i - border)*shift]);
+                auto process = [&](int i) {
+                    v_uint16 t = vx_load_expand(&in[k][l + (i - border)*shift]);
+                    return v_add(sum, v_mul(v_reinterpret_as_s16(t), vx_setall_s16(ikx[i])));
+                };
 
-                    sum += v_reinterpret_as_s16(t[i]) * vx_setall_s16(ikx[i]);
-                }
+                sum = process(0);
+                sum = process(1);
+                sum = process(2);
+                sum = process(3);
+                sum = process(4);
 
                 v_store(&ibuf[r[k]][l], sum);
             }
@@ -1861,20 +1839,21 @@ static void run_sepfilter5x5_char2short(short out[], const uchar *in[], int widt
         // main part of output row
         for (; l <= length - nlanes; l += nlanes)
         {
-            v_int16 s[buffSize];
             v_int16 sum = vx_setzero_s16();
 
-            for (int i = 0; i < kyLen; ++i)
-            {
-                // previous, current, next rows
-                s[i] = vx_load(&ibuf[r[i]][l]);
-
-                sum += s[i] * vx_setall_s16(iky[i]);
-            }
+            auto process = [&](int i) {
+                v_int16 s = vx_load(&ibuf[r[i]][l]);
+                return v_add(sum, v_mul(s, vx_setall_s16(iky[i])));
+            };
+            sum = process(0);
+            sum = process(1);
+            sum = process(2);
+            sum = process(3);
+            sum = process(4);
 
             if (!noscale)
             {
-                sum = v_mul_hi(sum << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta);
+                sum = v_add(v_mul_hi(v_shl<1>(sum), vx_setall_s16(iscale)), vx_setall_s16(idelta));
             }
 
             v_store(&out[l], sum);
@@ -1965,14 +1944,14 @@ static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int cha
                                   const float kx[], const float ky[], int border,
                                   float scale, float delta, float *buf[], int y, int y0)
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int length = width * chan;
 
     // length variable may be unused if types do not match at 'if' statements below
     (void)length;
 
     if (std::is_same<DST, short>::value && std::is_same<SRC, uchar>::value &&
-        length >= v_int16::nlanes)
+        length >= VTraits<v_int16>::vlanes())
     {
         run_sepfilter5x5_char2short<noscale>(reinterpret_cast<short*>(out),
                                              reinterpret_cast<const uchar**>(in),
@@ -1982,14 +1961,14 @@ static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int cha
     }
 
     if (std::is_same<DST, float>::value && std::is_same<SRC, float>::value &&
-        length >= v_float32::nlanes)
+        length >= VTraits<v_float32>::vlanes())
     {
         run_sepfilter5x5_any2float<noscale>(reinterpret_cast<float*>(out), in, width,
                                             chan, kx, ky, border, scale, delta);
         return;
     }
 
-    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
+    if (std::is_same<DST, short>::value && length >= VTraits<v_int16>::vlanes())
     {
         run_sepfilter5x5_any2short<noscale>(reinterpret_cast<short*>(out), in, width,
                                             chan, kx, ky, border, scale, delta,
@@ -1997,7 +1976,7 @@ static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int cha
         return;
     }
 
-    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
+    if (std::is_same<DST, ushort>::value && length >= VTraits<v_uint16>::vlanes())
     {
         run_sepfilter5x5_any2short<noscale>(reinterpret_cast<ushort*>(out), in, width,
                                             chan, kx, ky, border, scale, delta,
@@ -2005,7 +1984,7 @@ static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int cha
         return;
     }
 
-    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
+    if (std::is_same<DST, uchar>::value && length >= VTraits<v_uint8>::vlanes())
     {
         run_sepfilter5x5_any2char<noscale>(reinterpret_cast<uchar*>(out), in, width,
                                            chan, kx, ky, border, scale, delta,
@@ -2086,7 +2065,7 @@ static void run_filter2d_3x3_reference(DST out[], const SRC *in[], int width, in
     }
 }
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 // assume DST is short or ushort
 template<bool noscale, typename DST, typename SRC>
 static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, int chan,
@@ -2106,14 +2085,14 @@ static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, in
 
     for (int l=0; l < length;)
     {
-        static constexpr int nlanes = v_int16::nlanes;
+        static const int nlanes = VTraits<v_int16>::vlanes();
 
         // main part of output row
         for (; l <= length - nlanes; l += nlanes)
         {
             auto sumx = [in, shift, &k](int i, int j)
             {
-                v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]);
+                v_float32 s = v_mul(vx_load_f32(&in[i][j - shift]), vx_setall_f32(k[i][0]));
                     s = v_fma(vx_load_f32(&in[i][j        ]),  vx_setall_f32(k[i][1]), s);
                     s = v_fma(vx_load_f32(&in[i][j + shift]),  vx_setall_f32(k[i][2]), s);
                 return s;
@@ -2121,8 +2100,8 @@ static void run_filter2d_3x3_any2short(DST out[], const SRC *in[], int width, in
 
             int l0 = l;
             int l1 = l + nlanes/2;
-            v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0);
-            v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1);
+            v_float32 sum0 = v_add(sumx(0, l0), sumx(1, l0), sumx(2, l0));
+            v_float32 sum1 = v_add(sumx(0, l1), sumx(1, l1), sumx(2, l1));
 
             if (!noscale)
             {
@@ -2172,14 +2151,14 @@ static void run_filter2d_3x3_any2char(uchar out[], const SRC *in[], int width, i
 
     for (int l=0; l < length;)
     {
-        static constexpr int nlanes = v_uint8::nlanes;
+        static const int nlanes = VTraits<v_uint8>::vlanes();
 
         // main part of output row
         for (; l <= length - nlanes; l += nlanes)
         {
             auto sumx = [in, shift, &k](int i, int j)
             {
-                v_float32 s = vx_load_f32(&in[i][j - shift]) * vx_setall_f32(k[i][0]);
+                v_float32 s = v_mul(vx_load_f32(&in[i][j - shift]), vx_setall_f32(k[i][0]));
                     s = v_fma(vx_load_f32(&in[i][j        ]),  vx_setall_f32(k[i][1]), s);
                     s = v_fma(vx_load_f32(&in[i][j + shift]),  vx_setall_f32(k[i][2]), s);
                 return s;
@@ -2189,10 +2168,10 @@ static void run_filter2d_3x3_any2char(uchar out[], const SRC *in[], int width, i
             int l1 = l +   nlanes/4;
             int l2 = l + 2*nlanes/4;
             int l3 = l + 3*nlanes/4;
-            v_float32 sum0 = sumx(0, l0) + sumx(1, l0) + sumx(2, l0);
-            v_float32 sum1 = sumx(0, l1) + sumx(1, l1) + sumx(2, l1);
-            v_float32 sum2 = sumx(0, l2) + sumx(1, l2) + sumx(2, l2);
-            v_float32 sum3 = sumx(0, l3) + sumx(1, l3) + sumx(2, l3);
+            v_float32 sum0 = v_add(sumx(0, l0), sumx(1, l0), sumx(2, l0));
+            v_float32 sum1 = v_add(sumx(0, l1), sumx(1, l1), sumx(2, l1));
+            v_float32 sum2 = v_add(sumx(0, l2), sumx(1, l2), sumx(2, l2));
+            v_float32 sum3 = v_add(sumx(0, l3), sumx(1, l3), sumx(2, l3));
 
             if (!noscale)
             {
@@ -2228,20 +2207,20 @@ template<bool noscale, typename DST, typename SRC>
 static void run_filter2d_3x3_code(DST out[], const SRC *in[], int width, int chan,
                                   const float kernel[], float scale, float delta)
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int length = width * chan;
 
     // length variable may be unused if types do not match at 'if' statements below
     (void) length;
 
-    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
+    if (std::is_same<DST, short>::value && length >= VTraits<v_int16>::vlanes())
     {
         run_filter2d_3x3_any2short<noscale>(reinterpret_cast<short*>(out), in,
                                             width, chan, kernel, scale, delta);
         return;
     }
 
-    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
+    if (std::is_same<DST, ushort>::value && length >= VTraits<v_uint16>::vlanes())
     {
         run_filter2d_3x3_any2short<noscale>(reinterpret_cast<ushort*>(out), in,
                                             width, chan, kernel, scale, delta);
@@ -2249,7 +2228,7 @@ static void run_filter2d_3x3_code(DST out[], const SRC *in[], int width, int cha
     }
 
 
-    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
+    if (std::is_same<DST, uchar>::value && length >= VTraits<v_uint8>::vlanes())
     {
         run_filter2d_3x3_any2char<noscale>(reinterpret_cast<uchar*>(out), in,
                                            width, chan, kernel, scale, delta);
@@ -2446,7 +2425,7 @@ static void run_morphology3x3_reference(T out[], const T *in[], int width, int c
     CV_Error(cv::Error::StsBadArg, "unsupported morphology");
 }
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 template<typename T, typename VT, typename S>
 static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
                                    const uchar k[], MorphShape k_type,
@@ -2467,7 +2446,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
         {
             for (int l=0; l < length;)
             {
-                constexpr int nlanes = VT::nlanes;
+                const int nlanes = VTraits<VT>::vlanes();
 
                 // main part of output row
                 for (; l <= length - nlanes; l += nlanes)
@@ -2503,7 +2482,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
         {
             for (int l=0; l < length;)
             {
-                constexpr int nlanes = VT::nlanes;
+                const int nlanes = VTraits<VT>::vlanes();
 
                 // main part of output row
                 for (; l <= length - nlanes; l += nlanes)
@@ -2537,7 +2516,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
 
         for (int l=0; l < length;)
         {
-            constexpr int nlanes = VT::nlanes;
+            const int nlanes = VTraits<VT>::vlanes();
 
             // main part of output row
             for (; l <= length - nlanes; l += nlanes)
@@ -2575,7 +2554,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
         {
             for (int l=0; l < length;)
             {
-                constexpr int nlanes = VT::nlanes;
+                const int nlanes = VTraits<VT>::vlanes();
 
                 // main part of output row
                 for (; l <= length - nlanes; l += nlanes)
@@ -2611,7 +2590,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
         {
             for (int l=0; l < length;)
             {
-                constexpr int nlanes = VT::nlanes;
+                const int nlanes = VTraits<VT>::vlanes();
 
                 // main part of output row
                 for (; l <= length - nlanes; l += nlanes)
@@ -2645,7 +2624,7 @@ static void run_morphology3x3_simd(T out[], const T *in[], int width, int chan,
 
         for (int l=0; l < length;)
         {
-            constexpr int nlanes = VT::nlanes;
+            const int nlanes = VTraits<VT>::vlanes();
 
             // main part of output row
             for (; l <= length - nlanes; l += nlanes)
@@ -2686,13 +2665,13 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
                                    const uchar k[], MorphShape k_type,
                                    Morphology morphology)
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int length = width * chan;
 
     // length variable may be unused if types do not match at 'if' statements below
     (void) length;
 
-    if (std::is_same<T, float>::value && length >= v_float32::nlanes)
+    if (std::is_same<T, float>::value && length >= VTraits<v_float32>::vlanes())
     {
         run_morphology3x3_simd<float, v_float32>(reinterpret_cast<float*>(out),
                                                  reinterpret_cast<const float**>(in),
@@ -2701,7 +2680,7 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
         return;
     }
 
-    if (std::is_same<T, short>::value && length >= v_int16::nlanes)
+    if (std::is_same<T, short>::value && length >= VTraits<v_int16>::vlanes())
     {
         run_morphology3x3_simd<short, v_int16>(reinterpret_cast<short*>(out),
                                                reinterpret_cast<const short**>(in),
@@ -2710,7 +2689,7 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
         return;
     }
 
-    if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)
+    if (std::is_same<T, ushort>::value && length >= VTraits<v_uint16>::vlanes())
     {
         run_morphology3x3_simd<ushort, v_uint16>(reinterpret_cast<ushort*>(out),
                                                  reinterpret_cast<const ushort**>(in),
@@ -2719,7 +2698,7 @@ static void run_morphology3x3_code(T out[], const T *in[], int width, int chan,
         return;
     }
 
-    if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)
+    if (std::is_same<T, uchar>::value && length >= VTraits<v_uint8>::vlanes())
     {
         run_morphology3x3_simd<uchar, v_uint8>(reinterpret_cast<uchar*>(out),
                                                reinterpret_cast<const uchar**>(in),
@@ -2796,7 +2775,7 @@ static void run_medblur3x3_reference(T out[], const T *in[], int width, int chan
     }
 }
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 template<typename VT, typename T>
 static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)
 {
@@ -2808,7 +2787,7 @@ static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)
 
     for (int l=0; l < length;)
     {
-        constexpr int nlanes = VT::nlanes;
+        const int nlanes = VTraits<VT>::vlanes();
 
         // main part of output row
         for (; l <= length - nlanes; l += nlanes)
@@ -2866,13 +2845,13 @@ static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)
 template<typename T>
 static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int length = width * chan;
 
     // length variable may be unused if types do not match at 'if' statements below
     (void) length;
 
-    if (std::is_same<T, float>::value && length >= v_float32::nlanes)
+    if (std::is_same<T, float>::value && length >= VTraits<v_float32>::vlanes())
     {
         run_medblur3x3_simd<v_float32>(reinterpret_cast<float*>(out),
                                        reinterpret_cast<const float**>(in),
@@ -2880,7 +2859,7 @@ static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
         return;
     }
 
-    if (std::is_same<T, short>::value && length >= v_int16::nlanes)
+    if (std::is_same<T, short>::value && length >= VTraits<v_int16>::vlanes())
     {
         run_medblur3x3_simd<v_int16>(reinterpret_cast<short*>(out),
                                      reinterpret_cast<const short**>(in),
@@ -2888,7 +2867,7 @@ static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
         return;
     }
 
-    if (std::is_same<T, ushort>::value && length >= v_uint16::nlanes)
+    if (std::is_same<T, ushort>::value && length >= VTraits<v_uint16>::vlanes())
     {
         run_medblur3x3_simd<v_uint16>(reinterpret_cast<ushort*>(out),
                                       reinterpret_cast<const ushort**>(in),
@@ -2896,7 +2875,7 @@ static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
         return;
     }
 
-    if (std::is_same<T, uchar>::value && length >= v_uint8::nlanes)
+    if (std::is_same<T, uchar>::value && length >= VTraits<v_uint8>::vlanes())
     {
         run_medblur3x3_simd<v_uint8>(reinterpret_cast<uchar*>(out),
                                      reinterpret_cast<const uchar**>(in),
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp
index e246f0613bab..f7a502f1504b 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp
@@ -64,7 +64,7 @@ CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(float *dst[],
     bool xRatioEq1 = inSz.width == outSz.width;
     bool yRatioEq1 = inSz.height == outSz.height;
 
-    constexpr int nlanes = v_float32x8::nlanes;
+    const int nlanes = VTraits<v_float32x8>::vlanes();
 
     if (!xRatioEq1 && !yRatioEq1)
     {
diff --git a/modules/gapi/src/backends/ie/giebackend.cpp b/modules/gapi/src/backends/ie/giebackend.cpp
index 6026f29ae584..4fc4fe9a8ddd 100644
--- a/modules/gapi/src/backends/ie/giebackend.cpp
+++ b/modules/gapi/src/backends/ie/giebackend.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018-2023 Intel Corporation
+// Copyright (C) 2018-2024 Intel Corporation
 
 #include "precomp.hpp"
 
@@ -10,7 +10,7 @@
 // (cv::gapi::ie::backend() is still there and is defined always)
 #include "backends/ie/giebackend.hpp"
 
-#ifdef HAVE_INF_ENGINE
+#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2023010000
 
 #if INF_ENGINE_RELEASE <= 2019010000
 #   error G-API IE module supports only OpenVINO IE >= 2019 R1
@@ -381,7 +381,7 @@ inline void copyFromIE(const IE::Blob::Ptr &blob, MatType &mat) {
         HANDLE(U8, uint8_t);
         HANDLE(FP32, float);
         HANDLE(I32, int);
-        HANDLE(FP16, cv::float16_t);
+        HANDLE(FP16, cv::hfloat);
 #undef HANDLE
         case IE::Precision::I64: {
             GAPI_LOG_WARNING(NULL, "INT64 isn't supported for cv::Mat. Conversion to INT32 is used.");
@@ -2082,15 +2082,8 @@ struct InferList2: public cv::detail::KernelTag {
              ? uu.net.getInputsInfo().at(input_name_0)->getTensorDesc()
              : uu.this_network.GetInputsInfo().at(input_name_0)->getTensorDesc();
 
-        if (cv::util::holds_alternative<cv::GMatDesc>(mm_0) ||
-            cv::util::holds_alternative<cv::GFrameDesc>(mm_0)) {
-            const auto trait = clarifyTrait(mm_0, tensor_desc_0.getDims());
-            if (trait != cv::gapi::ie::TraitAs::IMAGE) {
-                util::throw_error(std::runtime_error(
-                            "IE Backend: Only images is"
-                            " supported as the 0th argument"));
-            }
-        } else {
+        if (!(cv::util::holds_alternative<cv::GMatDesc>(mm_0) ||
+              cv::util::holds_alternative<cv::GFrameDesc>(mm_0))) {
             util::throw_error(std::runtime_error(
                         "IE Backend: Unsupported input meta"
                         " for 0th argument in IE backend"));
@@ -2107,7 +2100,10 @@ struct InferList2: public cv::detail::KernelTag {
                         && "Non-array inputs are not supported");
 
             if (op.k.inKinds[idx] == cv::detail::OpaqueKind::CV_RECT) {
-                const auto input_trait = cv::gapi::ie::TraitAs::IMAGE;
+                const auto input_trait = clarifyTrait(mm_0, tensor_desc_0.getDims());
+                GAPI_Assert(input_trait == cv::gapi::ie::TraitAs::IMAGE
+                            && "IE Backend: Only image is supported as the 0th argument for an input array of cv::Rect");
+
                 // NB: Configuring input precision and network reshape must be done
                 // only in the loadNetwork case.
                 if (uu.params.kind == cv::gapi::ie::detail::ParamDesc::Kind::Load) {
diff --git a/modules/gapi/src/backends/ie/giebackend.hpp b/modules/gapi/src/backends/ie/giebackend.hpp
index c7d938878dfa..1f000600dcd3 100644
--- a/modules/gapi/src/backends/ie/giebackend.hpp
+++ b/modules/gapi/src/backends/ie/giebackend.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2024 Intel Corporation
 
 #ifndef OPENCV_GAPI_GIEBACKEND_HPP
 #define OPENCV_GAPI_GIEBACKEND_HPP
@@ -10,7 +10,7 @@
 // Include anyway - cv::gapi::ie::backend() still needs to be defined
 #include "opencv2/gapi/infer/ie.hpp"
 
-#ifdef HAVE_INF_ENGINE
+#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2023010000
 
 #include <ade/util/algorithm.hpp> // type_list_index
 #include <condition_variable>
diff --git a/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp b/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp
index a185e7b8ce24..e3537edf8fb3 100644
--- a/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp
+++ b/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp
@@ -2,9 +2,9 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2020 Intel Corporation
+// Copyright (C) 2020-2024 Intel Corporation
 
-#ifdef HAVE_INF_ENGINE
+#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2023010000
 
 #include <vector>
 #include <string>
diff --git a/modules/gapi/src/backends/ocl/goclbackend.cpp b/modules/gapi/src/backends/ocl/goclbackend.cpp
index 9c6d7154e4cf..eb63cec35be1 100644
--- a/modules/gapi/src/backends/ocl/goclbackend.cpp
+++ b/modules/gapi/src/backends/ocl/goclbackend.cpp
@@ -62,6 +62,17 @@ namespace
         {
             return EPtr{new cv::gimpl::GOCLExecutable(graph, nodes)};
         }
+
+        virtual bool supportsConst(cv::GShape shape) const override
+        {
+            // Supports all types of const values
+            return shape == cv::GShape::GOPAQUE
+                || shape == cv::GShape::GSCALAR
+                || shape == cv::GShape::GARRAY;
+            // yes, value-initialized GMats are not supported currently
+            // as in-island data -- compiler will lift these values to the
+            // GIslandModel's SLOT level (will be handled uniformly)
+        }
    };
 }
 
diff --git a/modules/gapi/src/backends/onnx/bindings_onnx.cpp b/modules/gapi/src/backends/onnx/bindings_onnx.cpp
index c9c5fc58fa59..0703f1753dd8 100644
--- a/modules/gapi/src/backends/onnx/bindings_onnx.cpp
+++ b/modules/gapi/src/backends/onnx/bindings_onnx.cpp
@@ -21,6 +21,42 @@ cv::gapi::onnx::PyParams& cv::gapi::onnx::PyParams::cfgNormalize(const std::stri
     return *this;
 }
 
+cv::gapi::onnx::PyParams&
+cv::gapi::onnx::PyParams::cfgAddExecutionProvider(cv::gapi::onnx::ep::OpenVINO ep) {
+    m_priv->cfgAddExecutionProvider(std::move(ep));
+    return *this;
+}
+
+cv::gapi::onnx::PyParams&
+cv::gapi::onnx::PyParams::cfgAddExecutionProvider(cv::gapi::onnx::ep::DirectML ep) {
+    m_priv->cfgAddExecutionProvider(std::move(ep));
+    return *this;
+}
+
+cv::gapi::onnx::PyParams&
+cv::gapi::onnx::PyParams::cfgAddExecutionProvider(cv::gapi::onnx::ep::CoreML ep) {
+    m_priv->cfgAddExecutionProvider(std::move(ep));
+    return *this;
+}
+
+cv::gapi::onnx::PyParams&
+cv::gapi::onnx::PyParams::cfgAddExecutionProvider(cv::gapi::onnx::ep::CUDA ep) {
+    m_priv->cfgAddExecutionProvider(std::move(ep));
+    return *this;
+}
+
+cv::gapi::onnx::PyParams&
+cv::gapi::onnx::PyParams::cfgAddExecutionProvider(cv::gapi::onnx::ep::TensorRT ep) {
+    m_priv->cfgAddExecutionProvider(std::move(ep));
+    return *this;
+}
+
+cv::gapi::onnx::PyParams&
+cv::gapi::onnx::PyParams::cfgDisableMemPattern() {
+    m_priv->cfgDisableMemPattern();
+    return *this;
+}
+
 cv::gapi::GBackend cv::gapi::onnx::PyParams::backend() const {
     return m_priv->backend();
 }
diff --git a/modules/gapi/src/backends/onnx/coreml_ep.cpp b/modules/gapi/src/backends/onnx/coreml_ep.cpp
new file mode 100644
index 000000000000..3c9507863d32
--- /dev/null
+++ b/modules/gapi/src/backends/onnx/coreml_ep.cpp
@@ -0,0 +1,50 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#include "backends/onnx/coreml_ep.hpp"
+#include "logger.hpp"
+
+#ifdef HAVE_ONNX
+#include <onnxruntime_cxx_api.h>
+
+#ifdef HAVE_ONNX_COREML
+#include "../providers/coreml/coreml_provider_factory.h"
+
+void cv::gimpl::onnx::addCoreMLExecutionProvider(Ort::SessionOptions *session_options,
+                                                 const cv::gapi::onnx::ep::CoreML &coreml_ep) {
+    uint32_t flags = 0u;
+    if (coreml_ep.use_cpu_only) {
+        flags |= COREML_FLAG_USE_CPU_ONLY;
+    }
+
+    if (coreml_ep.enable_on_subgraph) {
+        flags |= COREML_FLAG_ENABLE_ON_SUBGRAPH;
+    }
+
+    if (coreml_ep.enable_only_ane) {
+        flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE;
+    }
+
+    try {
+        OrtSessionOptionsAppendExecutionProvider_CoreML(*session_options, flags);
+    } catch (const std::exception &e) {
+        std::stringstream ss;
+        ss << "ONNX Backend: Failed to enable CoreML"
+           << " Execution Provider: " << e.what();
+        cv::util::throw_error(std::runtime_error(ss.str()));
+    }
+}
+
+#else  // HAVE_ONNX_COREML
+
+void cv::gimpl::onnx::addCoreMLExecutionProvider(Ort::SessionOptions*,
+                                                 const cv::gapi::onnx::ep::CoreML&) {
+     util::throw_error(std::runtime_error("G-API has been compiled with ONNXRT"
+                                          " without CoreML support"));
+}
+
+#endif  // HAVE_ONNX_COREML
+#endif  // HAVE_ONNX
diff --git a/modules/gapi/src/backends/onnx/coreml_ep.hpp b/modules/gapi/src/backends/onnx/coreml_ep.hpp
new file mode 100644
index 000000000000..ddc2baeae9a5
--- /dev/null
+++ b/modules/gapi/src/backends/onnx/coreml_ep.hpp
@@ -0,0 +1,23 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#ifndef OPENCV_GAPI_COREML_EP_HPP
+#define OPENCV_GAPI_COREML_EP_HPP
+
+#include "opencv2/gapi/infer/onnx.hpp"
+#ifdef HAVE_ONNX
+
+#include <onnxruntime_cxx_api.h>
+
+namespace cv {
+namespace gimpl {
+namespace onnx {
+void addCoreMLExecutionProvider(Ort::SessionOptions *session_options,
+                                const cv::gapi::onnx::ep::CoreML &coreml_ep);
+}}}
+
+#endif  // HAVE_ONNX
+#endif  // OPENCV_GAPI_COREML_EP_HPP
diff --git a/modules/gapi/src/backends/onnx/dml_ep.cpp b/modules/gapi/src/backends/onnx/dml_ep.cpp
new file mode 100644
index 000000000000..671fa2dbcb8a
--- /dev/null
+++ b/modules/gapi/src/backends/onnx/dml_ep.cpp
@@ -0,0 +1,277 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#include "backends/onnx/dml_ep.hpp"
+#include "logger.hpp"
+
+#ifdef HAVE_ONNX
+#include <onnxruntime_cxx_api.h>
+
+#ifdef HAVE_ONNX_DML
+#include "../providers/dml/dml_provider_factory.h"
+
+#ifdef HAVE_DIRECTML
+
+#undef WINVER
+#define WINVER 0x0A00
+#undef _WIN32_WINNT
+#define _WIN32_WINNT 0x0A00
+
+#include <initguid.h>
+
+#include <d3d11.h>
+#include <dxgi1_2.h>
+#include <dxgi1_4.h>
+#include <dxgi.h>
+#include <dxcore.h>
+#include <dxcore_interface.h>
+#include <d3d12.h>
+#include <directml.h>
+
+#pragma comment (lib, "d3d11.lib")
+#pragma comment (lib, "d3d12.lib")
+#pragma comment (lib, "dxgi.lib")
+#pragma comment (lib, "dxcore.lib")
+#pragma comment (lib, "directml.lib")
+
+#endif  // HAVE_DIRECTML
+
+static void addDMLExecutionProviderWithAdapterName(Ort::SessionOptions *session_options,
+                                                   const std::string &adapter_name);
+
+void cv::gimpl::onnx::addDMLExecutionProvider(Ort::SessionOptions *session_options,
+                                              const cv::gapi::onnx::ep::DirectML &dml_ep) {
+    namespace ep = cv::gapi::onnx::ep;
+    switch (dml_ep.ddesc.index()) {
+        case ep::DirectML::DeviceDesc::index_of<int>(): {
+            const int device_id = cv::util::get<int>(dml_ep.ddesc);
+            try {
+                OrtSessionOptionsAppendExecutionProvider_DML(*session_options, device_id);
+            } catch (const std::exception &e) {
+                std::stringstream ss;
+                ss << "ONNX Backend: Failed to enable DirectML"
+                   << " Execution Provider: " << e.what();
+                cv::util::throw_error(std::runtime_error(ss.str()));
+            }
+            break;
+        }
+        case ep::DirectML::DeviceDesc::index_of<std::string>(): {
+            const std::string adapter_name = cv::util::get<std::string>(dml_ep.ddesc);
+            addDMLExecutionProviderWithAdapterName(session_options, adapter_name);
+            break;
+        }
+        default:
+            GAPI_Assert(false && "Invalid DirectML device description");
+    }
+}
+
+#ifdef HAVE_DIRECTML
+
+#define THROW_IF_FAILED(hr, error_msg)       \
+{                                            \
+    if ((hr) != S_OK)                        \
+        throw std::runtime_error(error_msg); \
+}
+
+template <typename T>
+void release(T *ptr) {
+    if (ptr) {
+        ptr->Release();
+    }
+}
+
+template <typename T>
+using ComPtrGuard = std::unique_ptr<T, decltype(&release<T>)>;
+
+template <typename T>
+ComPtrGuard<T> make_com_ptr(T *ptr) {
+    return ComPtrGuard<T>{ptr, &release<T>};
+}
+
+struct AdapterDesc {
+    ComPtrGuard<IDXCoreAdapter> ptr;
+    std::string description;
+};
+
+static std::vector<AdapterDesc> getAvailableAdapters() {
+        std::vector<AdapterDesc> all_adapters;
+
+        IDXCoreAdapterFactory* factory_ptr;
+        GAPI_LOG_DEBUG(nullptr, "Create IDXCoreAdapterFactory");
+        THROW_IF_FAILED(
+            DXCoreCreateAdapterFactory(
+                __uuidof(IDXCoreAdapterFactory), (void**)&factory_ptr),
+            "Failed to create IDXCoreAdapterFactory");
+        auto factory = make_com_ptr<IDXCoreAdapterFactory>(factory_ptr);
+
+        IDXCoreAdapterList* adapter_list_ptr;
+        const GUID dxGUIDs[] = { DXCORE_ADAPTER_ATTRIBUTE_D3D12_CORE_COMPUTE };
+        GAPI_LOG_DEBUG(nullptr, "CreateAdapterList");
+        THROW_IF_FAILED(
+            factory->CreateAdapterList(
+                ARRAYSIZE(dxGUIDs), dxGUIDs, __uuidof(IDXCoreAdapterList), (void**)&adapter_list_ptr),
+            "Failed to create IDXCoreAdapterList");
+        auto adapter_list = make_com_ptr<IDXCoreAdapterList>(adapter_list_ptr);
+
+        for (UINT i = 0; i < adapter_list->GetAdapterCount(); i++)
+        {
+            IDXCoreAdapter* curr_adapter_ptr;
+            GAPI_LOG_DEBUG(nullptr, "GetAdapter");
+            THROW_IF_FAILED(
+                adapter_list->GetAdapter(
+                    i, __uuidof(IDXCoreAdapter), (void**)&curr_adapter_ptr),
+                "Failed to obtain IDXCoreAdapter"
+            );
+            auto curr_adapter = make_com_ptr<IDXCoreAdapter>(curr_adapter_ptr);
+
+            bool is_hardware = false;
+            curr_adapter->GetProperty(DXCoreAdapterProperty::IsHardware, &is_hardware);
+            // NB: Filter out if not hardware adapter.
+            if (!is_hardware) {
+                continue;
+            }
+
+            size_t desc_size = 0u;
+            char description[256];
+            curr_adapter->GetPropertySize(DXCoreAdapterProperty::DriverDescription, &desc_size);
+            curr_adapter->GetProperty(DXCoreAdapterProperty::DriverDescription, desc_size, &description);
+            all_adapters.push_back(AdapterDesc{std::move(curr_adapter), description});
+        }
+        return all_adapters;
+};
+
+struct DMLDeviceInfo {
+    ComPtrGuard<IDMLDevice> device;
+    ComPtrGuard<ID3D12CommandQueue> cmd_queue;
+};
+
+static DMLDeviceInfo createDMLInfo(IDXCoreAdapter* adapter) {
+    auto pAdapter = make_com_ptr<IUnknown>(adapter);
+    D3D_FEATURE_LEVEL d3dFeatureLevel = D3D_FEATURE_LEVEL_1_0_CORE;
+    if (adapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D12_GRAPHICS))
+    {
+        GAPI_LOG_INFO(nullptr, "DXCORE_ADAPTER_ATTRIBUTE_D3D12_GRAPHICS is supported");
+        d3dFeatureLevel = D3D_FEATURE_LEVEL::D3D_FEATURE_LEVEL_11_0;
+
+        IDXGIFactory4* dxgiFactory4;
+        GAPI_LOG_DEBUG(nullptr, "CreateDXGIFactory2");
+        THROW_IF_FAILED(
+            CreateDXGIFactory2(0, __uuidof(IDXGIFactory4), (void**)&dxgiFactory4),
+            "Failed to create IDXGIFactory4"
+        );
+        // If DXGI factory creation was successful then get the IDXGIAdapter from the LUID
+        // acquired from the selectedAdapter
+        LUID adapterLuid;
+        IDXGIAdapter* spDxgiAdapter;
+
+        GAPI_LOG_DEBUG(nullptr, "Get DXCoreAdapterProperty::InstanceLuid property");
+        THROW_IF_FAILED(
+            adapter->GetProperty(DXCoreAdapterProperty::InstanceLuid, &adapterLuid),
+            "Failed to get DXCoreAdapterProperty::InstanceLuid property");
+
+        GAPI_LOG_DEBUG(nullptr, "Get IDXGIAdapter by luid");
+        THROW_IF_FAILED(
+            dxgiFactory4->EnumAdapterByLuid(
+                adapterLuid, __uuidof(IDXGIAdapter), (void**)&spDxgiAdapter),
+            "Failed to get IDXGIAdapter");
+        pAdapter = make_com_ptr<IUnknown>(spDxgiAdapter);
+    } else {
+        GAPI_LOG_INFO(nullptr, "DXCORE_ADAPTER_ATTRIBUTE_D3D12_GRAPHICS isn't supported");
+    }
+
+    ID3D12Device* d3d12_device_ptr;
+    GAPI_LOG_DEBUG(nullptr, "Create D3D12Device");
+    THROW_IF_FAILED(
+        D3D12CreateDevice(
+            pAdapter.get(), d3dFeatureLevel, __uuidof(ID3D12Device), (void**)&d3d12_device_ptr),
+        "Failed to create ID3D12Device");
+    auto d3d12_device = make_com_ptr<ID3D12Device>(d3d12_device_ptr);
+
+    D3D12_COMMAND_LIST_TYPE commandQueueType = D3D12_COMMAND_LIST_TYPE_COMPUTE;
+    ID3D12CommandQueue* cmd_queue_ptr;
+    D3D12_COMMAND_QUEUE_DESC commandQueueDesc = {};
+    commandQueueDesc.Type = commandQueueType;
+    GAPI_LOG_DEBUG(nullptr, "Create D3D12CommandQueue");
+    THROW_IF_FAILED(
+        d3d12_device->CreateCommandQueue(
+            &commandQueueDesc, __uuidof(ID3D12CommandQueue), (void**)&cmd_queue_ptr),
+        "Failed to create D3D12CommandQueue"
+    );
+    GAPI_LOG_DEBUG(nullptr, "Create D3D12CommandQueue - successful");
+    auto cmd_queue = make_com_ptr<ID3D12CommandQueue>(cmd_queue_ptr);
+
+    IDMLDevice* dml_device_ptr;
+    GAPI_LOG_DEBUG(nullptr, "Create DirectML device");
+    THROW_IF_FAILED(
+        DMLCreateDevice(
+            d3d12_device.get(), DML_CREATE_DEVICE_FLAG_NONE, IID_PPV_ARGS(&dml_device_ptr)),
+        "Failed to create IDMLDevice");
+    GAPI_LOG_DEBUG(nullptr, "Create DirectML device - successful");
+    auto dml_device = make_com_ptr<IDMLDevice>(dml_device_ptr);
+
+    return {std::move(dml_device), std::move(cmd_queue)};
+};
+
+static void addDMLExecutionProviderWithAdapterName(Ort::SessionOptions *session_options,
+                                                   const std::string &adapter_name) {
+    auto all_adapters = getAvailableAdapters();
+
+    std::vector<AdapterDesc> selected_adapters;
+    std::stringstream log_msg;
+    for (auto&& adapter : all_adapters) {
+        log_msg << adapter.description << std::endl;
+        if (std::strstr(adapter.description.c_str(), adapter_name.c_str())) {
+            selected_adapters.emplace_back(std::move(adapter));
+        }
+    }
+    GAPI_LOG_INFO(NULL, "\nAvailable DirectML adapters:\n" << log_msg.str());
+
+    if (selected_adapters.empty()) {
+        std::stringstream error_msg;
+        error_msg << "ONNX Backend: No DirectML adapters found match to \"" << adapter_name << "\"";
+        cv::util::throw_error(std::runtime_error(error_msg.str()));
+    } else if (selected_adapters.size() > 1) {
+        std::stringstream error_msg;
+        error_msg << "ONNX Backend: More than one adapter matches to \"" << adapter_name << "\":\n";
+        for (const auto &selected_adapter : selected_adapters) {
+            error_msg << selected_adapter.description << "\n";
+        }
+        cv::util::throw_error(std::runtime_error(error_msg.str()));
+    }
+
+    GAPI_LOG_INFO(NULL, "Selected device: " << selected_adapters.front().description);
+    auto dml = createDMLInfo(selected_adapters.front().ptr.get());
+    try {
+        OrtSessionOptionsAppendExecutionProviderEx_DML(
+            *session_options, dml.device.release(), dml.cmd_queue.release());
+    } catch (const std::exception &e) {
+        std::stringstream ss;
+        ss << "ONNX Backend: Failed to enable DirectML"
+           << " Execution Provider: " << e.what();
+        cv::util::throw_error(std::runtime_error(ss.str()));
+    }
+}
+
+#else  // HAVE_DIRECTML
+
+static void addDMLExecutionProviderWithAdapterName(Ort::SessionOptions*, const std::string&) {
+    std::stringstream ss;
+    ss << "ONNX Backend: Failed to add DirectML Execution Provider with adapter name."
+       << " DirectML support is required.";
+    cv::util::throw_error(std::runtime_error(ss.str()));
+}
+
+#endif  // HAVE_DIRECTML
+#else  // HAVE_ONNX_DML
+
+void cv::gimpl::onnx::addDMLExecutionProvider(Ort::SessionOptions*,
+                                              const cv::gapi::onnx::ep::DirectML&) {
+     util::throw_error(std::runtime_error("G-API has been compiled with ONNXRT"
+                                          " without DirectML support"));
+}
+
+#endif  // HAVE_ONNX_DML
+#endif  // HAVE_ONNX
diff --git a/modules/gapi/src/backends/onnx/dml_ep.hpp b/modules/gapi/src/backends/onnx/dml_ep.hpp
new file mode 100644
index 000000000000..d7e43dc8885a
--- /dev/null
+++ b/modules/gapi/src/backends/onnx/dml_ep.hpp
@@ -0,0 +1,23 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#ifndef OPENCV_GAPI_DML_EP_HPP
+#define OPENCV_GAPI_DML_EP_HPP
+
+#include "opencv2/gapi/infer/onnx.hpp"
+#ifdef HAVE_ONNX
+
+#include <onnxruntime_cxx_api.h>
+
+namespace cv {
+namespace gimpl {
+namespace onnx {
+void addDMLExecutionProvider(Ort::SessionOptions *session_options,
+                             const cv::gapi::onnx::ep::DirectML &dml_ep);
+}}}
+
+#endif  // HAVE_ONNX
+#endif  // OPENCV_GAPI_DML_EP_HPP
diff --git a/modules/gapi/src/backends/onnx/gonnxbackend.cpp b/modules/gapi/src/backends/onnx/gonnxbackend.cpp
index 1194caeeb3c4..92b908da700d 100644
--- a/modules/gapi/src/backends/onnx/gonnxbackend.cpp
+++ b/modules/gapi/src/backends/onnx/gonnxbackend.cpp
@@ -9,6 +9,9 @@
 
 #ifdef HAVE_ONNX
 
+#include "backends/onnx/dml_ep.hpp"
+#include "backends/onnx/coreml_ep.hpp"
+
 #include <ade/util/algorithm.hpp> // any_of
 #include <ade/util/zip_range.hpp>
 #include <opencv2/gapi/infer.hpp>
@@ -143,6 +146,106 @@ class ONNXCompiled {
     void run();
 };
 
+static void addCUDAExecutionProvider(Ort::SessionOptions *session_options,
+                                     const cv::gapi::onnx::ep::CUDA &cuda_ep) {
+     OrtCUDAProviderOptions options{};
+     options.device_id = cuda_ep.device_id;
+
+     try {
+        session_options->AppendExecutionProvider_CUDA(options);
+     } catch (const std::exception &e) {
+         std::stringstream ss;
+         ss << "ONNX Backend: Failed to enable CUDA"
+            << " Execution Provider: " << e.what();
+         cv::util::throw_error(std::runtime_error(ss.str()));
+     }
+}
+
+static void addTensorRTExecutionProvider(Ort::SessionOptions *session_options,
+                                         const cv::gapi::onnx::ep::TensorRT &trt_ep) {
+     OrtTensorRTProviderOptions options{};
+     options.device_id = trt_ep.device_id;
+
+     try {
+        session_options->AppendExecutionProvider_TensorRT(options);
+     } catch (const std::exception &e) {
+         std::stringstream ss;
+         ss << "ONNX Backend: Failed to enable TensorRT"
+            << " Execution Provider: " << e.what();
+         cv::util::throw_error(std::runtime_error(ss.str()));
+     }
+}
+
+static void addOpenVINOExecutionProvider(Ort::SessionOptions *session_options,
+                                         const cv::gapi::onnx::ep::OpenVINO &ov_ep) {
+     std::unordered_map<std::string, std::string> options;
+
+     try {
+        // If the OpenVINO Execution Provider object was initialized with a parameters map,
+        // those parameters are used directly.
+        // Otherwise, the function constructs the options map from the individual member
+        // variables of the OpenVINO object.
+        if (ov_ep.params_map.empty()) {
+            options = {
+                {"device_type", ov_ep.device_type},
+                {"cache_dir", ov_ep.cache_dir},
+                {"num_of_threads", ov_ep.num_of_threads > 0 ? std::to_string(ov_ep.num_of_threads) : ""},
+                {"enable_opencl_throttling", ov_ep.enable_opencl_throttling ? "True" : "False"},
+                {"enable_dynamic_shapes", ov_ep.enable_dynamic_shapes ? "True" : "False"},
+            };
+        } else {
+            options.insert(ov_ep.params_map.begin(), ov_ep.params_map.end());
+        }
+        //  AppendExecutionProvider function expects a const std::unordered_map as its second argument
+        session_options->AppendExecutionProvider("OpenVINO", options);
+     } catch (const std::exception &e) {
+         std::stringstream ss;
+         ss << "ONNX Backend: Failed to enable OpenVINO"
+            << " Execution Provider: " << e.what();
+         cv::util::throw_error(std::runtime_error(ss.str()));
+     }
+}
+
+static void addExecutionProvider(Ort::SessionOptions          *session_options,
+                                 const cv::gapi::onnx::ep::EP &execution_provider) {
+    namespace ep = cv::gapi::onnx::ep;
+    switch (execution_provider.index()) {
+        case ep::EP::index_of<ep::OpenVINO>(): {
+             GAPI_LOG_INFO(NULL, "OpenVINO Execution Provider is added.");
+             const auto &ov_ep = cv::util::get<ep::OpenVINO>(execution_provider);
+             addOpenVINOExecutionProvider(session_options, ov_ep);
+             break;
+        }
+        case ep::EP::index_of<ep::DirectML>(): {
+            GAPI_LOG_INFO(NULL, "DirectML Execution Provider is added.");
+            const auto &dml_ep = cv::util::get<ep::DirectML>(execution_provider);
+            addDMLExecutionProvider(session_options, dml_ep);
+            break;
+        }
+        case ep::EP::index_of<ep::CoreML>(): {
+            GAPI_LOG_INFO(NULL, "CoreML Execution Provider is added.");
+            const auto &coreml_ep = cv::util::get<ep::CoreML>(execution_provider);
+            addCoreMLExecutionProvider(session_options, coreml_ep);
+            break;
+        }
+        case ep::EP::index_of<ep::CUDA>(): {
+            GAPI_LOG_INFO(NULL, "CUDA Execution Provider is added.");
+            const auto &cuda_ep = cv::util::get<ep::CUDA>(execution_provider);
+            addCUDAExecutionProvider(session_options, cuda_ep);
+            break;
+        }
+        case ep::EP::index_of<ep::TensorRT>(): {
+            GAPI_LOG_INFO(NULL, "TensorRT Execution Provider is added.");
+            const auto &trt_ep = cv::util::get<ep::TensorRT>(execution_provider);
+            addTensorRTExecutionProvider(session_options, trt_ep);
+            break;
+        }
+        default:
+            GAPI_LOG_INFO(NULL, "CPU Execution Provider is added.");
+            break;
+    }
+}
+
 } // namespace onnx
 } // namespace gimpl
 } // namespace cv
@@ -592,9 +695,16 @@ ONNXCompiled::ONNXCompiled(const gapi::onnx::detail::ParamDesc &pp)
         cv::util::throw_error(std::logic_error("Please specify output layer names for "
                                                + params.model_path));
     }
-
     // Create and initialize the ONNX session
     Ort::SessionOptions session_options;
+    GAPI_LOG_INFO(NULL, "Adding Execution Providers for \"" << pp.model_path << "\"");
+    for (const auto &ep : pp.execution_providers) {
+        cv::gimpl::onnx::addExecutionProvider(&session_options, ep);
+    }
+
+    if (pp.disable_mem_pattern) {
+        session_options.DisableMemPattern();
+    }
     this_env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "");
 #ifndef _WIN32
     this_session = Ort::Session(this_env, params.model_path.data(), session_options);
diff --git a/modules/gapi/src/backends/ov/govbackend.cpp b/modules/gapi/src/backends/ov/govbackend.cpp
index 8384a9d18831..dde4da2bb768 100644
--- a/modules/gapi/src/backends/ov/govbackend.cpp
+++ b/modules/gapi/src/backends/ov/govbackend.cpp
@@ -129,7 +129,7 @@ static int toCV(const ov::element::Type &type) {
 static void copyFromOV(const ov::Tensor &tensor, cv::Mat &mat) {
     const auto total = mat.total() * mat.channels();
     if (toCV(tensor.get_element_type()) != mat.depth() ||
-        tensor.get_size()               != total ) {
+        tensor.get_size()               != total) {
         std::stringstream ss;
         ss << "Failed to copy data from ov::Tensor to cv::Mat."
            << " Data type or number of elements mismatch."
@@ -151,6 +151,30 @@ static void copyFromOV(const ov::Tensor &tensor, cv::Mat &mat) {
     }
 }
 
+static cv::Mat wrapOV(const cv::MediaFrame::View& view,
+               const cv::GFrameDesc& desc) {
+    cv::Mat out;
+    switch (desc.fmt) {
+        case cv::MediaFormat::BGR: {
+            out = cv::Mat(desc.size, CV_8UC3, view.ptr[0], view.stride[0]);
+            return out;
+        }
+        case cv::MediaFormat::NV12: {
+            auto y_plane  = cv::Mat(desc.size, CV_8UC1, view.ptr[0], view.stride[0]);
+            auto uv_plane = cv::Mat(desc.size / 2, CV_8UC2, view.ptr[1], view.stride[1]);
+            cvtColorTwoPlane(y_plane, uv_plane, out, cv::COLOR_YUV2BGR_NV12);
+            return out;
+        }
+        case cv::MediaFormat::GRAY: {
+            out = cv::Mat(desc.size, CV_8UC1, view.ptr[0], view.stride[0]);
+            return out;
+        }
+        default:
+            GAPI_Error("OV Backend: Unsupported media format");
+    }
+    return out;
+}
+
 static void copyToOV(const cv::Mat &mat, ov::Tensor &tensor) {
     // TODO: Ideally there should be check that mat and tensor
     // dimensions are compatible.
@@ -177,6 +201,12 @@ static void copyToOV(const cv::Mat &mat, ov::Tensor &tensor) {
     }
 }
 
+static void copyToOV(const cv::MediaFrame &frame, ov::Tensor &tensor) {
+    const auto view = cv::MediaFrame::View(frame.access(cv::MediaFrame::Access::R));
+    auto matFromFrame = wrapOV(view, frame.desc());
+    copyToOV(matFromFrame, tensor);
+}
+
 std::vector<int> cv::gapi::ov::util::to_ocv(const ::ov::Shape &shape) {
     return toCV(shape);
 }
@@ -252,7 +282,8 @@ class OVCallContext
                   const std::vector<cv::gimpl::RcDesc>              &  outs,
                   cv::GRunArg::Meta                                 && meta,
                   std::vector<cv::gimpl::GIslandExecutable::InObj>  && input_objs,
-                  std::vector<cv::gimpl::GIslandExecutable::OutObj> && output_objs);
+                  std::vector<cv::gimpl::GIslandExecutable::OutObj> && output_objs,
+                  const cv::gimpl::ov::Options                      &  options);
 
     const cv::GArgs& inArgs() const;
 
@@ -268,8 +299,9 @@ class OVCallContext
     }
 
     // Syntax sugar
-          cv::GShape      inShape(std::size_t input) const;
-    const cv::Mat&        inMat  (std::size_t input) const;
+          cv::GShape      inShape (std::size_t input) const;
+    const cv::Mat&        inMat   (std::size_t input) const;
+    const cv::MediaFrame& inFrame (std::size_t input) const;
 
     cv::GRunArgP output (std::size_t idx);
     cv::Mat&     outMatR(std::size_t idx);
@@ -281,6 +313,9 @@ class OVCallContext
     std::exception_ptr eptr;
 
     const cv::GRunArg::Meta& getMeta() { return m_meta; };
+
+    const cv::gimpl::ov::Options& getOptions() const { return m_options; };
+
 private:
     cv::detail::VectorRef& outVecRef(std::size_t idx);
 
@@ -301,6 +336,8 @@ class OVCallContext
     // Input parameters passed to an inference operation.
     cv::GArgs m_args;
     cv::GShapes m_in_shapes;
+
+    cv::gimpl::ov::Options m_options;
 };
 
 OVCallContext::OVCallContext(const OVUnit                                      &  unit,
@@ -309,9 +346,11 @@ OVCallContext::OVCallContext(const OVUnit                                      &
                              const std::vector<cv::gimpl::RcDesc>              &  outs,
                              cv::GRunArg::Meta                                 && meta,
                              std::vector<cv::gimpl::GIslandExecutable::InObj>  && input_objs,
-                             std::vector<cv::gimpl::GIslandExecutable::OutObj> && output_objs)
+                             std::vector<cv::gimpl::GIslandExecutable::OutObj> && output_objs,
+                             const cv::gimpl::ov::Options                      &  options)
 : uu(unit), out(output), m_meta(std::move(meta)),
-  m_input_objs(std::move(input_objs)), m_output_objs(std::move(output_objs))
+  m_input_objs(std::move(input_objs)), m_output_objs(std::move(output_objs)),
+  m_options(options)
 {
     for (auto& it : m_input_objs)  cv::gimpl::magazine::bindInArg (m_res, it.first, it.second);
     for (auto& it : m_output_objs) cv::gimpl::magazine::bindOutArg(m_res, it.first, it.second);
@@ -347,6 +386,10 @@ const cv::Mat& OVCallContext::inMat(std::size_t input) const {
     return inArg<cv::Mat>(input);
 }
 
+const cv::MediaFrame& OVCallContext::inFrame(std::size_t input) const {
+    return inArg<cv::MediaFrame>(input);
+}
+
 cv::Mat& OVCallContext::outMatR(std::size_t idx) {
     return *cv::util::get<cv::Mat*>(m_results.at(idx));
 }
@@ -386,6 +429,8 @@ cv::GArg OVCallContext::packArg(const cv::GArg &arg) {
     //   (and constructed by either bindIn/Out or resetInternal)
     case cv::GShape::GOPAQUE:  return cv::GArg(m_res.slot<cv::detail::OpaqueRef>().at(ref.id));
 
+    case cv::GShape::GFRAME:  return cv::GArg(m_res.slot<cv::MediaFrame>()[ref.id]);
+
     default:
         cv::util::throw_error(std::logic_error("Unsupported GShape type"));
         break;
@@ -577,9 +622,10 @@ static void PostOutputs(::ov::InferRequest             &infer_request,
 
     ctx->eptr = std::move(eptr);
     for (auto i : ade::util::iota(ctx->uu.params.num_out)) {
-        // NB: Copy data back only if execution finished sucessfuly.
-        // Otherwise just post outputs to keep streaming executor contract.
-        if (!ctx->eptr) {
+        // NB: Copy data back only if execution finished sucessfuly
+        // and inference only mode is disabled.
+        // Otherwise just post outputs to maintain streaming executor contract.
+        if (!ctx->eptr && !ctx->getOptions().inference_only) {
             const auto& out_name = ctx->uu.params.output_names[i];
             copyFromOV(infer_request.get_tensor(out_name),
                        ctx->outMatR(i));
@@ -646,6 +692,19 @@ void PostOutputsList::operator()(::ov::InferRequest &infer_request,
     }
 }
 
+static void copyToOV(std::shared_ptr<OVCallContext> ctx, uint32_t input_idx, ov::Tensor &tensor) {
+    switch (ctx->inShape(input_idx)) {
+        case cv::GShape::GMAT:
+            copyToOV(ctx->inMat(input_idx), tensor);
+            break;
+        case cv::GShape::GFRAME:
+            copyToOV(ctx->inFrame(input_idx), tensor);
+            break;
+        default:
+            GAPI_Assert("Unsupported input shape for OV backend");
+    }
+}
+
 namespace cv {
 namespace gimpl {
 namespace ov {
@@ -721,6 +780,38 @@ static cv::Mat preprocess(const cv::Mat     &in_mat,
     return out;
 }
 
+// NB: This function is used to preprocess input image
+// for InferROI, InferList, InferList2 kernels.
+static cv::Mat preprocess(MediaFrame::View&     view,
+                          const cv::GFrameDesc& desc,
+                          const cv::Rect&       roi,
+                          const ::ov::Shape     &model_shape) {
+    return preprocess(wrapOV(view, desc), roi, model_shape);
+}
+
+static void preprocess_and_copy(std::shared_ptr<OVCallContext> ctx,
+                                uint32_t input_idx,
+                                const cv::Rect &roi,
+                                const ::ov::Shape &model_shape,
+                                ::ov::Tensor& tensor) {
+    switch (ctx->inShape(input_idx)) {
+        case cv::GShape::GMAT: {
+            auto roi_mat = preprocess(ctx->inMat(input_idx), roi, model_shape);
+            copyToOV(roi_mat, tensor);
+            break;
+        }
+        case cv::GShape::GFRAME: {
+            auto currentFrame = ctx->inFrame(input_idx);
+            auto view = cv::MediaFrame::View(currentFrame.access(cv::MediaFrame::Access::R));
+            auto roi_mat = preprocess(view, currentFrame.desc(), roi, model_shape);
+            copyToOV(roi_mat, tensor);
+            break;
+        }
+        default:
+            GAPI_Assert("Unsupported input shape for OV backend");
+    }
+}
+
 static bool isImage(const cv::GMatDesc &desc,
                     const ::ov::Shape  &model_shape) {
     return (model_shape.size() == 4u)                      &&
@@ -730,6 +821,16 @@ static bool isImage(const cv::GMatDesc &desc,
            (desc.depth == CV_8U);
 }
 
+static bool isImage(const cv::GMetaArg &meta,
+                    const ::ov::Shape  &shape) {
+    if (cv::util::holds_alternative<GFrameDesc>(meta)) {
+        return true;
+    }
+    GAPI_Assert(cv::util::holds_alternative<GMatDesc>(meta));
+    auto matdesc = cv::util::get<GMatDesc>(meta);
+    return isImage(matdesc, shape);
+}
+
 class PrePostProcWrapper {
 public:
     PrePostProcWrapper(std::shared_ptr<::ov::Model>   &model,
@@ -765,14 +866,19 @@ class PrePostProcWrapper {
         if (explicit_in_model_layout) {
             input_info.model().set_layout(::ov::Layout(*explicit_in_model_layout));
         } else if (m_model->input(input_name).get_shape().size() == 4u) {
-            // NB: Back compatibility with IR's without any layout information.
-            // Note that default is only applicable for 4D inputs in order to
-            // support auto resize for image use cases.
-            GAPI_LOG_WARNING(NULL, "Failed to find layout for input layer \""
-                    << input_name << "\" - NCHW is set by default");
-            const std::string default_layout = "NCHW";
-            input_info.model().set_layout(::ov::Layout(default_layout));
-            m_input_model_layout.emplace(input_name, default_layout);
+            const auto& input_layout = ::ov::layout::get_layout(m_model->input(input_name));
+            if (!input_layout.empty()) {
+                GAPI_LOG_INFO(NULL, "Model input layout " << input_name << " found: " << input_layout.to_string() << ".");
+            } else {
+                // NB: Back compatibility with IR's without any layout information.
+                // Note that default is only applicable for 4D inputs in order to
+                // support auto resize for image use cases.
+                GAPI_LOG_WARNING(NULL, "Failed to find layout for input layer \""
+                        << input_name << "\" - NCHW is set by default");
+                const std::string default_layout = "NCHW";
+                input_info.model().set_layout(::ov::Layout(default_layout));
+                m_input_model_layout.emplace(input_name, default_layout);
+            }
         }
         const auto explicit_in_tensor_layout = lookUp(m_input_tensor_layout, input_name);
         if (explicit_in_tensor_layout) {
@@ -780,13 +886,24 @@ class PrePostProcWrapper {
         }
     }
 
-    void cfgScaleMean(const std::string &input_name) {
+    void cfgScaleMean(const std::string &input_name,
+                      const GMetaArg &input_meta) {
         auto &input_info = m_ppp.input(input_name);
+
         const auto mean_vec = lookUp(m_mean_values, input_name);
+        const auto scale_vec = lookUp(m_scale_values, input_name);
+
+        if (mean_vec || scale_vec) {
+            GAPI_Assert(cv::util::holds_alternative<cv::GMatDesc>(input_meta));
+            const auto depth = cv::util::get<cv::GMatDesc>(input_meta).depth;
+            const bool depth_is_real = (depth == CV_32F) || (depth == CV_16F);
+            if (!depth_is_real) {
+                input_info.preprocess().convert_element_type(toOV(CV_32F));
+            }
+        }
         if (mean_vec) {
             input_info.preprocess().mean(*mean_vec);
         }
-        const auto scale_vec = lookUp(m_scale_values, input_name);
         if (scale_vec) {
             input_info.preprocess().scale(*scale_vec);
         }
@@ -796,9 +913,8 @@ class PrePostProcWrapper {
     void cfgPreProcessing(const std::string  &input_name,
                           const cv::GMetaArg &input_meta,
                           const bool         disable_img_resize = false) {
-        GAPI_Assert(cv::util::holds_alternative<cv::GMatDesc>(input_meta));
-        const auto &matdesc = cv::util::get<cv::GMatDesc>(input_meta);
-
+        GAPI_Assert(cv::util::holds_alternative<cv::GMatDesc>(input_meta) ||
+                    cv::util::holds_alternative<cv::GFrameDesc>(input_meta));
         const auto explicit_in_tensor_layout = lookUp(m_input_tensor_layout, input_name);
         const auto explicit_in_model_layout  = lookUp(m_input_model_layout, input_name);
         const auto explicit_resize = lookUp(m_interpolation, input_name);
@@ -813,24 +929,35 @@ class PrePostProcWrapper {
         const auto &input_shape = m_model->input(input_name).get_shape();
         auto &input_info = m_ppp.input(input_name);
 
-        m_ppp.input(input_name).tensor().set_element_type(toOV(matdesc.depth));
-        if (isImage(matdesc, input_shape)) {
+        auto isMat = cv::util::holds_alternative<cv::GMatDesc>(input_meta);
+        auto prec  = isMat ? cv::util::get<cv::GMatDesc>(input_meta).depth : CV_8U;
+        m_ppp.input(input_name).tensor().set_element_type(toOV(prec));
+
+        const auto &matdesc   = isMat ? cv::util::get<cv::GMatDesc>(input_meta) : cv::GMatDesc();
+        const auto &framedesc = !isMat ? cv::util::get<cv::GFrameDesc>(input_meta) : cv::GFrameDesc();
+        if (isImage(input_meta, input_shape)) {
             // NB: Image case - all necessary preprocessng is configured automatically.
             GAPI_LOG_DEBUG(NULL, "OV Backend: Input: \"" << input_name << "\" is image.");
-            if (explicit_in_tensor_layout &&
-                *explicit_in_tensor_layout != "NHWC") {
+            if (explicit_in_tensor_layout && *explicit_in_tensor_layout != "NHWC") {
+                std::stringstream desc_str;
+                if (isMat) {
+                    desc_str << matdesc;
+                } else {
+                    desc_str << framedesc;
+                }
                 std::stringstream ss;
                 ss << "OV Backend: Provided tensor layout " << *explicit_in_tensor_layout
-                   << " is not compatible with input data " << matdesc << " for layer \""
-                   << input_name << "\". Expecting NHWC";
+                << " is not compatible with input data " << desc_str.str() << " for layer \""
+                << input_name << "\". Expecting NHWC";
                 util::throw_error(std::logic_error(ss.str()));
             } else {
                 input_info.tensor().set_layout(::ov::Layout("NHWC"));
             }
 
             if (!disable_img_resize) {
-                input_info.tensor().set_spatial_static_shape(matdesc.size.height,
-                                                             matdesc.size.width);
+                const auto size = isMat ? cv::util::get<cv::GMatDesc>(input_meta).size : cv::util::get<cv::GFrameDesc>(input_meta).size;
+                input_info.tensor().set_spatial_static_shape(size.height,
+                                                             size.width);
                 // NB: Even though resize is automatically configured
                 // user have an opportunity to specify the interpolation algorithm.
                 auto interp = explicit_resize
@@ -852,8 +979,8 @@ class PrePostProcWrapper {
                     if (!explicit_in_tensor_layout && model_layout.empty()) {
                         std::stringstream ss;
                         ss << "Resize for input layer: " << input_name
-                           << "can't be configured."
-                           << " Failed to extract H and W positions from layout.";
+                        << "can't be configured."
+                        << " Failed to extract H and W positions from layout.";
                         util::throw_error(std::logic_error(ss.str()));
                     } else {
                         const auto layout = explicit_in_tensor_layout
@@ -957,10 +1084,9 @@ struct Infer: public cv::detail::KernelTag {
                                             ade::util::toRange(in_metas))) {
                 const auto &input_name = std::get<0>(it);
                 const auto &mm = std::get<1>(it);
-
                 ppp.cfgLayouts(input_name);
                 ppp.cfgPreProcessing(input_name, mm);
-                ppp.cfgScaleMean(input_name);
+                ppp.cfgScaleMean(input_name, mm);
             }
             ppp.cfgPostProcessing();
             ppp.finalize();
@@ -990,12 +1116,17 @@ struct Infer: public cv::detail::KernelTag {
         reqPool.getIdleRequest()->execute(
                 IInferExecutor::Task {
                     [ctx](::ov::InferRequest &infer_request) {
+                        // NB: No need to populate model inputs with data
+                        // if it's inference only mode.
+                        if (ctx->getOptions().inference_only) {
+                            return;
+                        }
                         for (auto i : ade::util::iota(ctx->uu.params.num_in)) {
                             const auto& input_name = ctx->uu.params.input_names[i];
                             auto input_tensor = infer_request.get_tensor(input_name);
                             // TODO: In some cases wrapping existing data pointer
                             // might be faster than copy. Make it a strategy.
-                            copyToOV(ctx->inMat(i), input_tensor);
+                            copyToOV(ctx, i, input_tensor);
                         }
                     },
                     std::bind(PostOutputs, _1, _2, ctx)
@@ -1024,13 +1155,13 @@ struct InferROI: public cv::detail::KernelTag {
 
         const auto &input_name = uu.params.input_names.at(0);
         const auto &mm = in_metas.at(1u);
-        GAPI_Assert(cv::util::holds_alternative<cv::GMatDesc>(mm));
-        const auto &matdesc = cv::util::get<cv::GMatDesc>(mm);
-
+        GAPI_Assert(cv::util::holds_alternative<cv::GMatDesc>(mm) ||
+                    cv::util::holds_alternative<cv::GFrameDesc>(mm));
         const bool is_model = cv::util::holds_alternative<ParamDesc::Model>(uu.params.kind);
         const auto &input_shape = is_model ? uu.model->input(input_name).get_shape()
                                            : uu.compiled_model.input(input_name).get_shape();
-        if (!isImage(matdesc, input_shape)) {
+
+        if (!isImage(mm, input_shape)) {
             util::throw_error(std::runtime_error(
                 "OV Backend: InferROI supports only image as the 1th argument"));
         }
@@ -1043,7 +1174,7 @@ struct InferROI: public cv::detail::KernelTag {
 
             ppp.cfgLayouts(input_name);
             ppp.cfgPreProcessing(input_name, mm, true /*disable_img_resize*/);
-            ppp.cfgScaleMean(input_name);
+            ppp.cfgScaleMean(input_name, mm);
             ppp.cfgPostProcessing();
             ppp.finalize();
         }
@@ -1069,6 +1200,10 @@ struct InferROI: public cv::detail::KernelTag {
     static void run(std::shared_ptr<OVCallContext> ctx,
                     cv::gimpl::ov::RequestPool     &reqPool) {
         using namespace std::placeholders;
+        if (ctx->getOptions().inference_only) {
+            cv::util::throw_error(
+                    std::logic_error("OV Backend: Inference only mode is not supported for InferROI!"));
+        }
         reqPool.getIdleRequest()->execute(
             IInferExecutor::Task {
                 [ctx](::ov::InferRequest &infer_request) {
@@ -1077,8 +1212,7 @@ struct InferROI: public cv::detail::KernelTag {
                     auto input_tensor = infer_request.get_tensor(input_name);
                     const auto &shape = input_tensor.get_shape();
                     const auto &roi = ctx->inArg<cv::detail::OpaqueRef>(0).rref<cv::Rect>();
-                    const auto roi_mat = preprocess(ctx->inMat(1), roi, shape);
-                    copyToOV(roi_mat, input_tensor);
+                    preprocess_and_copy(ctx, 1, roi, shape, input_tensor);
                 },
                 std::bind(PostOutputs, _1, _2, ctx)
             }
@@ -1113,11 +1247,11 @@ struct InferList: public cv::detail::KernelTag {
             size_t idx = 1u;
             for (auto &&input_name : uu.params.input_names) {
                 const auto &mm = in_metas[idx++];
-                GAPI_Assert(cv::util::holds_alternative<cv::GMatDesc>(mm));
-                const auto &matdesc = cv::util::get<cv::GMatDesc>(mm);
+                GAPI_Assert(cv::util::holds_alternative<cv::GMatDesc>(mm) ||
+                            cv::util::holds_alternative<cv::GFrameDesc>(mm));
                 const auto &input_shape = uu.model->input(input_name).get_shape();
 
-                if (!isImage(matdesc, input_shape)) {
+                if (!isImage(mm, input_shape)) {
                     util::throw_error(std::runtime_error(
                         "OV Backend: Only image is supported"
                         " as the " + std::to_string(idx) + "th argument for InferList"));
@@ -1125,7 +1259,7 @@ struct InferList: public cv::detail::KernelTag {
 
                 ppp.cfgLayouts(input_name);
                 ppp.cfgPreProcessing(input_name, mm, true /*disable_img_resize*/);
-                ppp.cfgScaleMean(input_name);
+                ppp.cfgScaleMean(input_name, mm);
             }
             ppp.cfgPostProcessing();
             ppp.finalize();
@@ -1141,6 +1275,10 @@ struct InferList: public cv::detail::KernelTag {
 
     static void run(std::shared_ptr<OVCallContext> ctx,
                     cv::gimpl::ov::RequestPool     &reqPool) {
+        if (ctx->getOptions().inference_only) {
+            cv::util::throw_error(
+                    std::logic_error("OV Backend: Inference only mode is not supported for InferList!"));
+        }
         const auto& in_roi_vec = ctx->inArg<cv::detail::VectorRef>(0u).rref<cv::Rect>();
         // NB: In case there is no input data need to post output anyway
         if (in_roi_vec.empty()) {
@@ -1170,8 +1308,7 @@ struct InferList: public cv::detail::KernelTag {
                         const auto &input_name = ctx->uu.params.input_names[0];
                         auto input_tensor = infer_request.get_tensor(input_name);
                         const auto &shape = input_tensor.get_shape();
-                        const auto roi_mat = preprocess(ctx->inMat(1), rc, shape);
-                        copyToOV(roi_mat, input_tensor);
+                        preprocess_and_copy(ctx, 1, rc, shape, input_tensor);
                     },
                     std::bind(callback, std::placeholders::_1, std::placeholders::_2, pos)
                 }
@@ -1209,12 +1346,18 @@ struct InferList2: public cv::detail::KernelTag {
 
         const auto &input_name_0 = uu.params.input_names.front();
         const auto &mm_0 = in_metas[0u];
-        const auto &matdesc = cv::util::get<cv::GMatDesc>(mm_0);
+
+        if (!(cv::util::holds_alternative<cv::GMatDesc>(mm_0) ||
+              cv::util::holds_alternative<cv::GFrameDesc>(mm_0))) {
+            util::throw_error(std::runtime_error(
+                        "OV Backend: Unsupported input meta"
+                        " for 0th argument in OV backend"));
+        }
 
         const bool is_model = cv::util::holds_alternative<ParamDesc::Model>(uu.params.kind);
         const auto &input_shape = is_model ? uu.model->input(input_name_0).get_shape()
                                            : uu.compiled_model.input(input_name_0).get_shape();
-        if (!isImage(matdesc, input_shape)) {
+        if (!isImage(mm_0, input_shape)) {
             util::throw_error(std::runtime_error(
                 "OV Backend: InferList2 supports only image as the 0th argument"));
         }
@@ -1240,7 +1383,7 @@ struct InferList2: public cv::detail::KernelTag {
                     GAPI_Assert(op.k.inKinds[idx] == cv::detail::OpaqueKind::CV_MAT);
                 }
 
-                ppp.cfgScaleMean(input_name);
+                ppp.cfgScaleMean(input_name, mm_0);
                 idx++; // NB: Never forget to increment the counter
             }
             ppp.cfgPostProcessing();
@@ -1257,6 +1400,10 @@ struct InferList2: public cv::detail::KernelTag {
 
     static void run(std::shared_ptr<OVCallContext> ctx,
                     cv::gimpl::ov::RequestPool     &reqPool) {
+        if (ctx->getOptions().inference_only) {
+            cv::util::throw_error(
+                    std::logic_error("OV Backend: Inference only mode is not supported for InferList2!"));
+        }
         GAPI_Assert(ctx->inArgs().size() > 1u
                 && "This operation must have at least two arguments");
         // NB: This blob will be used to make roi from its, so
@@ -1348,9 +1495,9 @@ class GOVBackendImpl final: public cv::gapi::GBackend::Priv {
     }
 
     virtual EPtr compile(const ade::Graph &graph,
-                         const cv::GCompileArgs &,
+                         const cv::GCompileArgs &compileArgs,
                          const std::vector<ade::NodeHandle> &nodes) const override {
-        return EPtr{new cv::gimpl::ov::GOVExecutable(graph, nodes)};
+        return EPtr{new cv::gimpl::ov::GOVExecutable(graph, compileArgs, nodes)};
     }
 
     virtual cv::GKernelPackage auxiliaryKernels() const override {
@@ -1391,9 +1538,12 @@ createInferRequests(::ov::CompiledModel &compiled_model,
 
 // GOVExecutable implementation //////////////////////////////////////////////
 cv::gimpl::ov::GOVExecutable::GOVExecutable(const ade::Graph &g,
+                                            const cv::GCompileArgs &compileArgs,
                                             const std::vector<ade::NodeHandle> &nodes)
     : m_g(g), m_gm(m_g) {
 
+    m_options.inference_only =
+        cv::gapi::getCompileArg<cv::gapi::wip::ov::benchmark_mode>(compileArgs).has_value();
     // FIXME: Currently this backend is capable to run a single inference node only.
     // Need to extend our island fusion with merge/not-to-merge decision making parametrization
     GConstGOVModel ovm(g);
@@ -1471,7 +1621,7 @@ void cv::gimpl::ov::GOVExecutable::run(cv::gimpl::GIslandExecutable::IInput  &in
     const auto &op = m_gm.metadata(this_nh).get<Op>();
 
     auto ctx = std::make_shared<OVCallContext>(uu, out, op.args, op.outs,
-            std::move(stub_meta), std::move(input_objs), std::move(output_objs));
+            std::move(stub_meta), std::move(input_objs), std::move(output_objs), m_options);
 
     const auto &kk = giem.metadata(this_nh).get<OVCallable>();
 
diff --git a/modules/gapi/src/backends/ov/govbackend.hpp b/modules/gapi/src/backends/ov/govbackend.hpp
index 0ac858dc52cb..ff9793afadf2 100644
--- a/modules/gapi/src/backends/ov/govbackend.hpp
+++ b/modules/gapi/src/backends/ov/govbackend.hpp
@@ -26,6 +26,12 @@ struct OVCompiled {
 
 class RequestPool;
 
+struct Options {
+    // Only performs inference of the model
+    // without i/o data transfer if enabled.
+    bool inference_only = false;
+};
+
 class GOVExecutable final: public GIslandExecutable
 {
     const ade::Graph &m_g;
@@ -42,8 +48,12 @@ class GOVExecutable final: public GIslandExecutable
     // To manage multiple async requests
     std::unique_ptr<RequestPool> m_reqPool;
 
+    // To manage additional execution options
+    Options m_options;
+
 public:
     GOVExecutable(const ade::Graph                   &graph,
+                  const cv::GCompileArgs             &compileArgs,
                   const std::vector<ade::NodeHandle> &nodes);
 
     virtual inline bool canReshape() const override { return false; }
diff --git a/modules/gapi/src/backends/streaming/gstreamingbackend.cpp b/modules/gapi/src/backends/streaming/gstreamingbackend.cpp
index ae7125f2e502..0f966648d82c 100644
--- a/modules/gapi/src/backends/streaming/gstreamingbackend.cpp
+++ b/modules/gapi/src/backends/streaming/gstreamingbackend.cpp
@@ -159,7 +159,7 @@ struct Copy: public cv::detail::KernelTag
         return cv::gapi::streaming::IActor::Ptr(new Actor(args));
     }
 
-    static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; };
+    static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; }
 };
 
 void Copy::Actor::run(cv::gimpl::GIslandExecutable::IInput  &in,
@@ -249,7 +249,7 @@ struct GOCVBGR: public cv::detail::KernelTag
     {
         return cv::gapi::streaming::IActor::Ptr(new Actor(args));
     }
-    static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; };
+    static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; }
 };
 
 void GOCVBGR::Actor::extractRMat(const cv::MediaFrame& frame, cv::RMat& rmat)
@@ -323,7 +323,7 @@ struct GOCVY: public cv::detail::KernelTag
     {
         return cv::gapi::streaming::IActor::Ptr(new Actor(args));
     }
-    static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; };
+    static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; }
 };
 
 void GOCVY::Actor::extractRMat(const cv::MediaFrame& frame, cv::RMat& rmat)
@@ -389,7 +389,7 @@ struct GOCVUV: public cv::detail::KernelTag
     {
         return cv::gapi::streaming::IActor::Ptr(new Actor(args));
     }
-    static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; };
+    static cv::gapi::streaming::GStreamingKernel kernel() { return {&create}; }
 };
 
 void GOCVUV::Actor::extractRMat(const cv::MediaFrame& frame, cv::RMat& rmat)
diff --git a/modules/gapi/src/compiler/gcompiled_priv.hpp b/modules/gapi/src/compiler/gcompiled_priv.hpp
index 3f873aba2378..20b76781a5f4 100644
--- a/modules/gapi/src/compiler/gcompiled_priv.hpp
+++ b/modules/gapi/src/compiler/gcompiled_priv.hpp
@@ -27,7 +27,7 @@ namespace cv {
 namespace gimpl
 {
     struct GRuntimeArgs;
-};
+}
 
 // FIXME: GAPI_EXPORTS is here only due to tests and Windows linker issues
 class GAPI_EXPORTS GCompiled::Priv
diff --git a/modules/gapi/src/compiler/gcompiler.cpp b/modules/gapi/src/compiler/gcompiler.cpp
index 526b2746dcd6..568251f19eed 100644
--- a/modules/gapi/src/compiler/gcompiler.cpp
+++ b/modules/gapi/src/compiler/gcompiler.cpp
@@ -33,6 +33,7 @@
 #include "compiler/passes/pattern_matching.hpp"
 
 #include "executor/gexecutor.hpp"
+#include "executor/gthreadedexecutor.hpp"
 #include "executor/gstreamingexecutor.hpp"
 #include "backends/common/gbackend.hpp"
 #include "backends/common/gmetabackend.hpp"
@@ -452,8 +453,16 @@ cv::GCompiled cv::gimpl::GCompiler::produceCompiled(GPtr &&pg)
         .get<OutputMeta>().outMeta;
     // FIXME: select which executor will be actually used,
     // make GExecutor abstract.
-    std::unique_ptr<GExecutor> pE(new GExecutor(std::move(pg)));
 
+    auto use_threaded_exec = cv::gapi::getCompileArg<cv::use_threaded_executor>(m_args);
+    std::unique_ptr<GAbstractExecutor> pE;
+    if (use_threaded_exec) {
+        const auto num_threads = use_threaded_exec.value().num_threads;
+        GAPI_LOG_INFO(NULL, "Threaded executor with " << num_threads << " thread(s) will be used");
+        pE.reset(new GThreadedExecutor(num_threads, std::move(pg)));
+    } else {
+        pE.reset(new GExecutor(std::move(pg)));
+    }
     GCompiled compiled;
     compiled.priv().setup(m_metas, outMetas, std::move(pE));
 
diff --git a/modules/gapi/src/compiler/gislandmodel.hpp b/modules/gapi/src/compiler/gislandmodel.hpp
index 3a1a8d5ab9de..ade13a6f3338 100644
--- a/modules/gapi/src/compiler/gislandmodel.hpp
+++ b/modules/gapi/src/compiler/gislandmodel.hpp
@@ -192,6 +192,7 @@ class GIslandEmitter
 public:
     // Obtain next value from the emitter
     virtual bool pull(GRunArg &) = 0;
+    virtual void halt() = 0;
     virtual ~GIslandEmitter() = default;
 };
 
diff --git a/modules/gapi/src/compiler/gobjref.hpp b/modules/gapi/src/compiler/gobjref.hpp
index bca6fa525e47..8f68142cc59f 100644
--- a/modules/gapi/src/compiler/gobjref.hpp
+++ b/modules/gapi/src/compiler/gobjref.hpp
@@ -24,6 +24,7 @@ namespace gimpl
     < util::monostate
     , cv::Scalar
     , cv::detail::VectorRef
+    , cv::Mat
     >;
 
     struct RcDesc
diff --git a/modules/gapi/src/compiler/gstreaming_priv.hpp b/modules/gapi/src/compiler/gstreaming_priv.hpp
index 0fd5fc7b7f78..fc5ba73be0d1 100644
--- a/modules/gapi/src/compiler/gstreaming_priv.hpp
+++ b/modules/gapi/src/compiler/gstreaming_priv.hpp
@@ -16,7 +16,7 @@ namespace cv {
 namespace gimpl
 {
     struct GRuntimeArgs;
-};
+}
 
 // FIXME: GAPI_EXPORTS is here only due to tests and Windows linker issues
 // FIXME: It seems it clearly duplicates the GStreamingCompiled and
diff --git a/modules/gapi/src/compiler/passes/exec.cpp b/modules/gapi/src/compiler/passes/exec.cpp
index 93d833d6024d..f5cb48162de1 100644
--- a/modules/gapi/src/compiler/passes/exec.cpp
+++ b/modules/gapi/src/compiler/passes/exec.cpp
@@ -9,7 +9,7 @@
 
 #include <string>
 #include <list> // list
-#include <iomanip>  // setw, etc
+#include <iomanip> // setw, etc
 #include <fstream> // ofstream
 #include <memory>
 #include <functional>
@@ -85,7 +85,7 @@ namespace
 
         const auto& backend = *src_g.metadata().get<ActiveBackends>().backends.cbegin();
         const auto& proto = src_g.metadata().get<Protocol>();
-        GIsland::node_set all, in_ops, out_ops;
+        GIsland::node_set all, in_ops, out_ops, in_cvals;
 
         all.insert(src_g.nodes().begin(), src_g.nodes().end());
 
@@ -99,7 +99,22 @@ namespace
             all.erase(nh);
             out_ops.insert(nh->inNodes().begin(), nh->inNodes().end());
         }
-
+        for (const auto& nh : src_g.nodes())
+        {
+            if (src_g.metadata(nh).get<NodeType>().t == NodeType::DATA)
+            {
+                const auto &d = src_g.metadata(nh).get<Data>();
+                if (d.storage == Data::Storage::CONST_VAL
+                    && !backend.priv().supportsConst(d.shape)) {
+                    // don't put this node into the island's graph - so the island
+                    // executable don't need to handle value-initialized G-type manually.
+                    // Still mark its readers as inputs
+                    all.erase(nh);
+                    in_cvals.insert(nh);
+                    in_ops.insert(nh->outNodes().begin(), nh->outNodes().end());
+                }
+            }
+        }
         auto isl = std::make_shared<GIsland>(backend,
                                              std::move(all),
                                              std::move(in_ops),
@@ -108,7 +123,8 @@ namespace
 
         auto ih = GIslandModel::mkIslandNode(g, std::move(isl));
 
-        for (const auto& nh : proto.in_nhs)
+        for (const auto& nh : ade::util::chain(ade::util::toRange(proto.in_nhs),
+                                               ade::util::toRange(in_cvals)))
         {
             auto slot = GIslandModel::mkSlotNode(g, nh);
             g.link(slot, ih);
@@ -142,6 +158,11 @@ namespace
         std::unordered_set<CycleCausers, CycleHasher> cycle_causers;
     };
 
+#if defined(__GNUC__) && (__GNUC__ >= 13)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdangling-reference"
+#endif
+
     bool canMerge(const GIslandModel::Graph &g,
                   const ade::NodeHandle &a_nh,
                   const ade::NodeHandle &slot_nh,
@@ -191,6 +212,10 @@ namespace
         return true;
     }
 
+#if defined(__GNUC__) && (__GNUC__ == 13)
+#pragma GCC diagnostic pop
+#endif
+
     inline bool isProducedBy(const ade::NodeHandle &slot,
                              const ade::NodeHandle &island)
     {
diff --git a/modules/gapi/src/compiler/passes/pattern_matching.cpp b/modules/gapi/src/compiler/passes/pattern_matching.cpp
index d52b48a63130..71ed859413ee 100644
--- a/modules/gapi/src/compiler/passes/pattern_matching.cpp
+++ b/modules/gapi/src/compiler/passes/pattern_matching.cpp
@@ -73,7 +73,7 @@ bool compareDataNodes(const ade::NodeHandle& first, const std::vector<std::size_
     // check that first and second nodes have the same type of DATA::Storage.
 
     return true;
-};
+}
 
 // Returns true if two OP nodes semantically and structurally identical:
 //    - both nodes have the same kernel name
@@ -130,7 +130,7 @@ bool compareOpNodes(const VisitedMatchings& matchedVisitedNodes,
     }
 
     return true;
-};
+}
 
 // Retrieves and return sample from the cartesian product of candidates sets
 VisitedMatchings sampleFromProduct(std::size_t sampleIdx, // index of the sample in the product
@@ -168,7 +168,7 @@ std::size_t labelOf (const ade::NodeHandle& node, // reader node
     else {
         return graph.metadata(edge).get<cv::gimpl::Output>().port;
     }
-};
+}
 
 inline bool IS_STARTPOINT(const ade::NodeHandle& nh){
     return nh->inEdges().empty();
diff --git a/modules/gapi/src/executor/gexecutor.cpp b/modules/gapi/src/executor/gexecutor.cpp
index bf25302b758e..f68aa18fbbef 100644
--- a/modules/gapi/src/executor/gexecutor.cpp
+++ b/modules/gapi/src/executor/gexecutor.cpp
@@ -208,6 +208,12 @@ void cv::gimpl::GExecutor::initResource(const ade::NodeHandle & nh, const ade::N
     switch (d.shape)
     {
     case GShape::GMAT:
+        if (d.storage == Data::Storage::CONST_VAL)
+        {
+            auto rc = RcDesc{d.rc, d.shape, d.ctor};
+            magazine::bindInArgExec(m_res, rc, m_gm.metadata(orig_nh).get<ConstValue>().arg);
+        }
+        else
         {
             // Let island allocate it's outputs if it can,
             // allocate cv::Mat and wrap it with RMat otherwise
diff --git a/modules/gapi/src/executor/gstreamingexecutor.cpp b/modules/gapi/src/executor/gstreamingexecutor.cpp
index 124b27f39ca6..6a397faca63a 100644
--- a/modules/gapi/src/executor/gstreamingexecutor.cpp
+++ b/modules/gapi/src/executor/gstreamingexecutor.cpp
@@ -41,6 +41,10 @@ using namespace cv::gimpl::stream;
 class VideoEmitter final: public cv::gimpl::GIslandEmitter {
     cv::gapi::wip::IStreamSource::Ptr src;
 
+    virtual void halt() override {
+        src->halt();
+    }
+
     virtual bool pull(cv::GRunArg &arg) override {
         // FIXME: probably we can maintain a pool of (then) pre-allocated
         // buffers to avoid runtime allocations.
@@ -62,6 +66,10 @@ class VideoEmitter final: public cv::gimpl::GIslandEmitter {
 class ConstEmitter final: public cv::gimpl::GIslandEmitter {
     cv::GRunArg m_arg;
 
+    virtual void halt() override {
+        // Not used here, but in fact can be used.
+    }
+
     virtual bool pull(cv::GRunArg &arg) override {
         arg = const_cast<const cv::GRunArg&>(m_arg); // FIXME: variant workaround
         return true;
@@ -1918,6 +1926,11 @@ void cv::gimpl::GStreamingExecutor::stop()
     for (auto &q : m_emitter_queues) {
         q.push(stream::Cmd{stream::Stop{}});
     }
+    // Also kindly ask emitter object to halt to break the blocking src->pull()
+    // loop
+    for (auto &nh : m_emitters) {
+        m_gim.metadata(nh).get<Emitter>().object->halt();
+    }
 
     // Pull messages from the final queue to ensure completion
     Cmd cmd;
diff --git a/modules/gapi/src/executor/gthreadedexecutor.cpp b/modules/gapi/src/executor/gthreadedexecutor.cpp
new file mode 100644
index 000000000000..ecd53eb9528a
--- /dev/null
+++ b/modules/gapi/src/executor/gthreadedexecutor.cpp
@@ -0,0 +1,515 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <ade/util/zip_range.hpp>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+
+#include "api/gproto_priv.hpp" // ptr(GRunArgP)
+#include "executor/gthreadedexecutor.hpp"
+#include "compiler/passes/passes.hpp"
+
+namespace cv {
+namespace gimpl {
+namespace magazine {
+namespace {
+
+void bindInArgExec(Mag& mag, const RcDesc &rc, const GRunArg &arg) {
+    if (rc.shape != GShape::GMAT) {
+        bindInArg(mag, rc, arg);
+        return;
+    }
+    auto& mag_rmat = mag.template slot<cv::RMat>()[rc.id];
+    switch (arg.index()) {
+    case GRunArg::index_of<Mat>() :
+        mag_rmat = make_rmat<RMatOnMat>(util::get<Mat>(arg));
+        break;
+    case GRunArg::index_of<cv::RMat>() :
+        mag_rmat = util::get<cv::RMat>(arg);
+        break;
+    default: util::throw_error(std::logic_error("content type of the runtime argument does not match to resource description ?"));
+    }
+    // FIXME: has to take extra care about meta here for this particuluar
+    // case, just because this function exists at all
+    mag.meta<cv::RMat>()[rc.id] = arg.meta;
+}
+
+void bindOutArgExec(Mag& mag, const RcDesc &rc, const GRunArgP &arg) {
+    if (rc.shape != GShape::GMAT) {
+        bindOutArg(mag, rc, arg);
+        return;
+    }
+    auto& mag_rmat = mag.template slot<cv::RMat>()[rc.id];
+    switch (arg.index()) {
+    case GRunArgP::index_of<Mat*>() :
+        mag_rmat = make_rmat<RMatOnMat>(*util::get<Mat*>(arg)); break;
+    case GRunArgP::index_of<cv::RMat*>() :
+        mag_rmat = *util::get<cv::RMat*>(arg); break;
+    default: util::throw_error(std::logic_error("content type of the runtime argument does not match to resource description ?"));
+    }
+}
+
+cv::GRunArgP getObjPtrExec(Mag& mag, const RcDesc &rc) {
+    if (rc.shape != GShape::GMAT) {
+        return getObjPtr(mag, rc);
+    }
+    return GRunArgP(&mag.slot<cv::RMat>()[rc.id]);
+}
+
+void writeBackExec(const Mag& mag, const RcDesc &rc, GRunArgP &g_arg) {
+    if (rc.shape != GShape::GMAT) {
+        writeBack(mag, rc, g_arg);
+        return;
+    }
+
+    switch (g_arg.index()) {
+    case GRunArgP::index_of<cv::Mat*>() : {
+        // If there is a copy intrinsic at the end of the graph
+        // we need to actually copy the data to the user buffer
+        // since output runarg was optimized to simply point
+        // to the input of the copy kernel
+        // FIXME:
+        // Rework, find a better way to check if there should be
+        // a real copy (add a pass to StreamingBackend?)
+        // NB: In case RMat adapter not equal to "RMatOnMat" need to
+        // copy data back to the host as well.
+        auto& out_mat = *util::get<cv::Mat*>(g_arg);
+        const auto& rmat = mag.template slot<cv::RMat>().at(rc.id);
+        auto* adapter = rmat.get<RMatOnMat>();
+        if ((adapter != nullptr && out_mat.data != adapter->data()) ||
+            (adapter == nullptr)) {
+            auto view = rmat.access(RMat::Access::R);
+            asMat(view).copyTo(out_mat);
+        }
+        break;
+    }
+    case GRunArgP::index_of<cv::RMat*>() : /* do nothing */ break;
+    default: util::throw_error(std::logic_error("content type of the runtime argument does not match to resource description ?"));
+    }
+}
+
+void assignMetaStubExec(Mag& mag, const RcDesc &rc, const cv::GRunArg::Meta &meta) {
+    switch (rc.shape) {
+    case GShape::GARRAY:  mag.meta<cv::detail::VectorRef>()[rc.id] = meta; break;
+    case GShape::GOPAQUE: mag.meta<cv::detail::OpaqueRef>()[rc.id] = meta; break;
+    case GShape::GSCALAR: mag.meta<cv::Scalar>()[rc.id]            = meta; break;
+    case GShape::GFRAME:  mag.meta<cv::MediaFrame>()[rc.id]        = meta; break;
+    case GShape::GMAT:
+        mag.meta<cv::Mat>() [rc.id] = meta;
+        mag.meta<cv::RMat>()[rc.id] = meta;
+#if !defined(GAPI_STANDALONE)
+        mag.meta<cv::UMat>()[rc.id] = meta;
+#endif
+        break;
+    default: util::throw_error(std::logic_error("Unsupported GShape type")); break;
+    }
+}
+
+} // anonymous namespace
+}}} // namespace cv::gimpl::magazine
+
+cv::gimpl::StreamMsg cv::gimpl::GThreadedExecutor::Input::get() {
+    std::lock_guard<std::mutex> lock{m_state.m};
+    cv::GRunArgs res;
+    for (const auto &rc : desc()) { res.emplace_back(magazine::getArg(m_state.mag, rc)); }
+    return cv::gimpl::StreamMsg{std::move(res)};
+}
+
+cv::gimpl::GThreadedExecutor::Input::Input(cv::gimpl::GraphState &state,
+                                           const std::vector<RcDesc> &rcs)
+    : m_state(state) {
+    set(rcs);
+};
+
+cv::GRunArgP cv::gimpl::GThreadedExecutor::Output::get(int idx) {
+    std::lock_guard<std::mutex> lock{m_state.m};
+    auto r = magazine::getObjPtrExec(m_state.mag, desc()[idx]);
+    // Remember the output port for this output object
+    m_out_idx[cv::gimpl::proto::ptr(r)] = idx;
+    return r;
+}
+
+void cv::gimpl::GThreadedExecutor::Output::post(cv::GRunArgP&&, const std::exception_ptr& e) {
+    if (e) {
+        m_eptr = e;
+    }
+}
+
+void cv::gimpl::GThreadedExecutor::Output::post(Exception&& ex) {
+    m_eptr = std::move(ex.eptr);
+}
+
+void cv::gimpl::GThreadedExecutor::Output::meta(const GRunArgP &out, const GRunArg::Meta &m) {
+    const auto idx = m_out_idx.at(cv::gimpl::proto::ptr(out));
+    std::lock_guard<std::mutex> lock{m_state.m};
+    magazine::assignMetaStubExec(m_state.mag, desc()[idx], m);
+}
+
+cv::gimpl::GThreadedExecutor::Output::Output(cv::gimpl::GraphState &state,
+                                             const std::vector<RcDesc> &rcs)
+    : m_state(state) {
+    set(rcs);
+}
+
+void cv::gimpl::GThreadedExecutor::Output::verify() {
+    if (m_eptr) {
+        std::rethrow_exception(m_eptr);
+    }
+}
+
+void cv::gimpl::GThreadedExecutor::initResource(const ade::NodeHandle &nh, const ade::NodeHandle &orig_nh) {
+    const Data &d = m_gm.metadata(orig_nh).get<Data>();
+
+    if (   d.storage != Data::Storage::INTERNAL
+        && d.storage != Data::Storage::CONST_VAL) {
+        return;
+    }
+
+    // INTERNALS+CONST only! no need to allocate/reset output objects
+    // to as it is bound externally (e.g. already in the m_state.mag)
+
+    switch (d.shape) {
+    case GShape::GMAT:
+        if (d.storage == Data::Storage::CONST_VAL) {
+            auto rc = RcDesc{d.rc, d.shape, d.ctor};
+            magazine::bindInArgExec(m_state.mag, rc, m_gm.metadata(orig_nh).get<ConstValue>().arg);
+        } else {
+            // Let island allocate it's outputs if it can,
+            // allocate cv::Mat and wrap it with RMat otherwise
+            GAPI_Assert(!nh->inNodes().empty());
+            const auto desc = util::get<cv::GMatDesc>(d.meta);
+            auto& exec = m_gim.metadata(nh->inNodes().front()).get<IslandExec>().object;
+            auto& rmat = m_state.mag.slot<cv::RMat>()[d.rc];
+            if (exec->allocatesOutputs()) {
+                rmat = exec->allocate(desc);
+            } else {
+                Mat mat;
+                createMat(desc, mat);
+                rmat = make_rmat<RMatOnMat>(mat);
+            }
+        }
+        break;
+
+    case GShape::GSCALAR:
+        if (d.storage == Data::Storage::CONST_VAL) {
+            auto rc = RcDesc{d.rc, d.shape, d.ctor};
+            magazine::bindInArg(m_state.mag, rc, m_gm.metadata(orig_nh).get<ConstValue>().arg);
+        }
+        break;
+
+    case GShape::GARRAY:
+        if (d.storage == Data::Storage::CONST_VAL) {
+            auto rc = RcDesc{d.rc, d.shape, d.ctor};
+            magazine::bindInArg(m_state.mag, rc, m_gm.metadata(orig_nh).get<ConstValue>().arg);
+        }
+        break;
+    case GShape::GOPAQUE:
+        // Constructed on Reset, do nothing here
+        break;
+    case GShape::GFRAME: {
+        // Should be defined by backend, do nothing here
+        break;
+    }
+    default:
+        GAPI_Error("InternalError");
+    }
+}
+
+cv::gimpl::IslandActor::IslandActor(const std::vector<RcDesc>          &in_objects,
+                                    const std::vector<RcDesc>          &out_objects,
+                                    std::shared_ptr<GIslandExecutable> isl_exec,
+                                    cv::gimpl::GraphState              &state)
+    : m_isl_exec(isl_exec),
+      m_inputs(state, in_objects),
+      m_outputs(state, out_objects) {
+}
+
+void cv::gimpl::IslandActor::run() {
+    m_isl_exec->run(m_inputs, m_outputs);
+}
+
+void cv::gimpl::IslandActor::verify() {
+    m_outputs.verify();
+};
+
+class cv::gimpl::Task {
+    friend class TaskManager;
+public:
+    using Ptr = std::shared_ptr<Task>;
+    Task(TaskManager::F&& f, std::vector<Task::Ptr> &&producers);
+
+    struct ExecutionState {
+        cv::gapi::own::ThreadPool& tp;
+        cv::gapi::own::Latch& latch;
+    };
+
+    void run(ExecutionState& state);
+    bool isLast() const { return m_consumers.empty();  }
+    void reset()        { m_ready_producers.store(0u); }
+
+private:
+    TaskManager::F           m_f;
+    const uint32_t           m_num_producers;
+    std::atomic<uint32_t>    m_ready_producers;
+    std::vector<Task*>       m_consumers;
+};
+
+cv::gimpl::Task::Task(TaskManager::F         &&f,
+                      std::vector<Task::Ptr> &&producers)
+    : m_f(std::move(f)),
+      m_num_producers(static_cast<uint32_t>(producers.size())) {
+    for (auto producer : producers) {
+        producer->m_consumers.push_back(this);
+    }
+}
+
+void cv::gimpl::Task::run(ExecutionState& state) {
+    // Execute the task
+    m_f();
+    // Notify every consumer about completion one of its dependencies
+    for (auto* consumer : m_consumers) {
+        const auto num_ready =
+            consumer->m_ready_producers.fetch_add(1, std::memory_order_relaxed) + 1;
+        // The last completed producer schedule the consumer for execution
+        if (num_ready == consumer->m_num_producers) {
+            state.tp.schedule([&state, consumer](){
+                consumer->run(state);
+            });
+        }
+    }
+    // If tasks has no consumers this is the last task
+    // Execution lasts until all last tasks are completed
+    // Decrement the latch to notify about completion
+    if (isLast()) {
+        state.latch.count_down();
+    }
+}
+
+std::shared_ptr<cv::gimpl::Task>
+cv::gimpl::TaskManager::createTask(cv::gimpl::TaskManager::F &&f,
+                                   std::vector<std::shared_ptr<cv::gimpl::Task>> &&producers) {
+    const bool is_initial = producers.empty();
+    auto task = std::make_shared<cv::gimpl::Task>(std::move(f),
+                                                  std::move(producers));
+    m_all_tasks.emplace_back(task);
+    if (is_initial) {
+        m_initial_tasks.emplace_back(task);
+    }
+    return task;
+}
+
+void cv::gimpl::TaskManager::scheduleAndWait(cv::gapi::own::ThreadPool& tp) {
+    // Reset the number of ready dependencies for all tasks
+    for (auto& task : m_all_tasks) { task->reset(); }
+
+    // Count the number of last tasks
+    auto isLast = [](const std::shared_ptr<Task>& task) { return task->isLast(); };
+    const auto kNumLastsTasks =
+        std::count_if(m_all_tasks.begin(), m_all_tasks.end(), isLast);
+
+    // Initialize the latch, schedule initial tasks
+    // and wait until all lasts tasks are done
+    cv::gapi::own::Latch latch(kNumLastsTasks);
+    Task::ExecutionState state{tp, latch};
+    for (auto task : m_initial_tasks) {
+        state.tp.schedule([&state, task](){ task->run(state); });
+    }
+    latch.wait();
+}
+
+cv::gimpl::GThreadedExecutor::GThreadedExecutor(const uint32_t num_threads,
+                                                std::unique_ptr<ade::Graph> &&g_model)
+    : GAbstractExecutor(std::move(g_model)),
+      m_thread_pool(num_threads) {
+    auto sorted = m_gim.metadata().get<ade::passes::TopologicalSortData>();
+
+    std::unordered_map< ade::NodeHandle
+                       , std::shared_ptr<Task>
+                       , ade::HandleHasher<ade::Node>> m_tasks_map;
+    for (auto nh : sorted.nodes())
+    {
+        switch (m_gim.metadata(nh).get<NodeKind>().k)
+        {
+        case NodeKind::ISLAND:
+            {
+                std::vector<RcDesc> input_rcs;
+                std::vector<RcDesc> output_rcs;
+                input_rcs.reserve(nh->inNodes().size());
+                output_rcs.reserve(nh->outNodes().size());
+
+                auto xtract = [&](ade::NodeHandle slot_nh, std::vector<RcDesc> &vec) {
+                    const auto orig_data_nh
+                        = m_gim.metadata(slot_nh).get<DataSlot>().original_data_node;
+                    const auto &orig_data_info
+                        = m_gm.metadata(orig_data_nh).get<Data>();
+                    vec.emplace_back(RcDesc{ orig_data_info.rc
+                                           , orig_data_info.shape
+                                           , orig_data_info.ctor});
+                };
+                for (auto in_slot_nh  : nh->inNodes())  xtract(in_slot_nh,  input_rcs);
+                for (auto out_slot_nh : nh->outNodes()) xtract(out_slot_nh, output_rcs);
+
+                auto actor = std::make_shared<IslandActor>(std::move(input_rcs),
+                                                           std::move(output_rcs),
+                                                           m_gim.metadata(nh).get<IslandExec>().object,
+                                                           m_state);
+                m_actors.push_back(actor);
+
+                std::unordered_set<ade::NodeHandle, ade::HandleHasher<ade::Node>> producer_nhs;
+                for (auto slot_nh : nh->inNodes()) {
+                    for (auto island_nh : slot_nh->inNodes()) {
+                        GAPI_Assert(m_gim.metadata(island_nh).get<NodeKind>().k == NodeKind::ISLAND);
+                        producer_nhs.emplace(island_nh);
+                    }
+                }
+                std::vector<std::shared_ptr<Task>> producers;
+                producers.reserve(producer_nhs.size());
+                for (auto producer_nh : producer_nhs) {
+                    producers.push_back(m_tasks_map.at(producer_nh));
+                }
+                auto task = m_task_manager.createTask(
+                        [actor](){actor->run();}, std::move(producers));
+                m_tasks_map.emplace(nh, task);
+            }
+            break;
+
+        case NodeKind::SLOT:
+            {
+                const auto orig_data_nh
+                    = m_gim.metadata(nh).get<DataSlot>().original_data_node;
+                initResource(nh, orig_data_nh);
+                m_slots.emplace_back(DataDesc{nh, orig_data_nh});
+            }
+            break;
+
+        default:
+            GAPI_Error("InternalError");
+            break;
+        } // switch(kind)
+    } // for(gim nodes)
+
+    prepareForNewStream();
+}
+
+void cv::gimpl::GThreadedExecutor::run(cv::gimpl::GRuntimeArgs &&args) {
+    const auto proto = m_gm.metadata().get<Protocol>();
+
+    // Basic check if input/output arguments are correct
+    // FIXME: Move to GCompiled (do once for all GExecutors)
+    if (proto.inputs.size() != args.inObjs.size()) { // TODO: Also check types
+        util::throw_error(std::logic_error
+                          ("Computation's input protocol doesn\'t "
+                           "match actual arguments!"));
+    }
+    if (proto.outputs.size() != args.outObjs.size()) { // TODO: Also check types
+        util::throw_error(std::logic_error
+                          ("Computation's output protocol doesn\'t "
+                           "match actual arguments!"));
+    }
+
+    namespace util = ade::util;
+
+    // ensure that output Mat parameters are correctly allocated
+    // FIXME: avoid copy of NodeHandle and GRunRsltComp ?
+    for (auto index : util::iota(proto.out_nhs.size())) {
+        auto& nh = proto.out_nhs.at(index);
+        const Data &d = m_gm.metadata(nh).get<Data>();
+        if (d.shape == GShape::GMAT) {
+            using cv::util::get;
+            const auto desc = get<cv::GMatDesc>(d.meta);
+
+            auto check_rmat = [&desc, &args, &index]() {
+                auto& out_mat = *get<cv::RMat*>(args.outObjs.at(index));
+                GAPI_Assert(desc.canDescribe(out_mat));
+            };
+
+#if !defined(GAPI_STANDALONE)
+            // Building as part of OpenCV - follow OpenCV behavior In
+            // the case of cv::Mat if output buffer is not enough to
+            // hold the result, reallocate it
+            if (cv::util::holds_alternative<cv::Mat*>(args.outObjs.at(index))) {
+                auto& out_mat = *get<cv::Mat*>(args.outObjs.at(index));
+                createMat(desc, out_mat);
+            }
+            // In the case of RMat check to fit required meta
+            else {
+                check_rmat();
+            }
+#else
+            // Building standalone - output buffer should always exist,
+            // and _exact_ match our inferred metadata
+            if (cv::util::holds_alternative<cv::Mat*>(args.outObjs.at(index))) {
+                auto& out_mat = *get<cv::Mat*>(args.outObjs.at(index));
+                GAPI_Assert(out_mat.data != nullptr &&
+                        desc.canDescribe(out_mat));
+            }
+            // In the case of RMat check to fit required meta
+            else {
+                check_rmat();
+            }
+#endif // !defined(GAPI_STANDALONE)
+        }
+    }
+    // Update storage with user-passed objects
+    for (auto it : ade::util::zip(ade::util::toRange(proto.inputs),
+                                  ade::util::toRange(args.inObjs))) {
+        magazine::bindInArgExec(m_state.mag, std::get<0>(it), std::get<1>(it));
+    }
+    for (auto it : ade::util::zip(ade::util::toRange(proto.outputs),
+                                  ade::util::toRange(args.outObjs))) {
+        magazine::bindOutArgExec(m_state.mag, std::get<0>(it), std::get<1>(it));
+    }
+
+    // Reset internal data
+    for (auto &sd : m_slots) {
+        const auto& data = m_gm.metadata(sd.data_nh).get<Data>();
+        magazine::resetInternalData(m_state.mag, data);
+    }
+
+    m_task_manager.scheduleAndWait(m_thread_pool);
+    for (auto actor : m_actors) {
+        actor->verify();
+    }
+    for (auto it : ade::util::zip(ade::util::toRange(proto.outputs),
+                                  ade::util::toRange(args.outObjs))) {
+        magazine::writeBackExec(m_state.mag, std::get<0>(it), std::get<1>(it));
+    }
+}
+
+bool cv::gimpl::GThreadedExecutor::canReshape() const {
+    for (auto actor : m_actors) {
+        if (actor->exec()->canReshape()) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void cv::gimpl::GThreadedExecutor::reshape(const GMetaArgs& inMetas, const GCompileArgs& args) {
+    GAPI_Assert(canReshape());
+    auto& g = *m_orig_graph.get();
+    ade::passes::PassContext ctx{g};
+    passes::initMeta(ctx, inMetas);
+    passes::inferMeta(ctx, true);
+
+    // NB: Before reshape islands need to re-init resources for every slot.
+    for (auto slot : m_slots) {
+        initResource(slot.slot_nh, slot.data_nh);
+    }
+
+    for (auto actor : m_actors) {
+        actor->exec()->reshape(g, args);
+    }
+}
+
+void cv::gimpl::GThreadedExecutor::prepareForNewStream() {
+    for (auto actor : m_actors) {
+        actor->exec()->handleNewStream();
+    }
+}
diff --git a/modules/gapi/src/executor/gthreadedexecutor.hpp b/modules/gapi/src/executor/gthreadedexecutor.hpp
new file mode 100644
index 000000000000..9792b70f63f4
--- /dev/null
+++ b/modules/gapi/src/executor/gthreadedexecutor.hpp
@@ -0,0 +1,123 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GTHREADEDEXECUTOR_HPP
+#define OPENCV_GAPI_GTHREADEDEXECUTOR_HPP
+
+#include <utility> // tuple, required by magazine
+#include <unordered_map> // required by magazine
+
+#include "executor/gabstractexecutor.hpp"
+#include "executor/thread_pool.hpp"
+
+namespace cv {
+namespace gimpl {
+
+class Task;
+class TaskManager {
+public:
+    using F = std::function<void()>;
+
+    std::shared_ptr<Task> createTask(F &&f, std::vector<std::shared_ptr<Task>> &&producers);
+    void scheduleAndWait(cv::gapi::own::ThreadPool& tp);
+
+private:
+    std::vector<std::shared_ptr<Task>> m_all_tasks;
+    std::vector<std::shared_ptr<Task>> m_initial_tasks;
+};
+
+struct GraphState {
+    Mag mag;
+    std::mutex m;
+};
+
+class IslandActor;
+class GThreadedExecutor final: public GAbstractExecutor {
+public:
+    class Input;
+    class Output;
+
+    explicit GThreadedExecutor(const uint32_t num_threads,
+                               std::unique_ptr<ade::Graph> &&g_model);
+    void run(cv::gimpl::GRuntimeArgs &&args) override;
+
+    bool canReshape() const override;
+    void reshape(const GMetaArgs& inMetas, const GCompileArgs& args) override;
+
+    void prepareForNewStream() override;
+
+private:
+    struct DataDesc
+    {
+        ade::NodeHandle slot_nh;
+        ade::NodeHandle data_nh;
+    };
+
+    void initResource(const ade::NodeHandle &nh, const ade::NodeHandle &orig_nh);
+
+    GraphState                                m_state;
+    std::vector<DataDesc>                     m_slots;
+    cv::gapi::own::ThreadPool                 m_thread_pool;
+    TaskManager                               m_task_manager;
+    std::vector<std::shared_ptr<IslandActor>> m_actors;
+};
+
+class GThreadedExecutor::Input final: public GIslandExecutable::IInput
+{
+public:
+    Input(GraphState& state, const std::vector<RcDesc> &rcs);
+
+private:
+    virtual StreamMsg get() override;
+    virtual StreamMsg try_get() override { return get(); }
+
+private:
+    GraphState& m_state;
+};
+
+class GThreadedExecutor::Output final: public GIslandExecutable::IOutput
+{
+public:
+    Output(GraphState &state, const std::vector<RcDesc> &rcs);
+    void verify();
+
+private:
+    GRunArgP get(int idx) override;
+    void post(cv::GRunArgP&&, const std::exception_ptr& e) override;
+    void post(Exception&& ex) override;
+    void post(EndOfStream&&) override {};
+    void meta(const GRunArgP &out, const GRunArg::Meta &m) override;
+
+private:
+    GraphState& m_state;
+    std::unordered_map<const void*, int> m_out_idx;
+    std::exception_ptr m_eptr;
+};
+
+class IslandActor {
+public:
+    using Ptr = std::shared_ptr<IslandActor>;
+    IslandActor(const std::vector<RcDesc>          &in_objects,
+                const std::vector<RcDesc>          &out_objects,
+                std::shared_ptr<GIslandExecutable> isl_exec,
+                GraphState                         &state);
+
+    void run();
+    void verify();
+    std::shared_ptr<GIslandExecutable> exec() { return m_isl_exec; }
+
+private:
+    std::shared_ptr<GIslandExecutable> m_isl_exec;
+    GThreadedExecutor::Input           m_inputs;
+    GThreadedExecutor::Output          m_outputs;
+};
+
+
+} // namespace gimpl
+} // namespace cv
+
+#endif // OPENCV_GAPI_GTHREADEDEXECUTOR_HPP
diff --git a/modules/gapi/src/executor/thread_pool.cpp b/modules/gapi/src/executor/thread_pool.cpp
new file mode 100644
index 000000000000..a666d7a52f21
--- /dev/null
+++ b/modules/gapi/src/executor/thread_pool.cpp
@@ -0,0 +1,67 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+
+#include "thread_pool.hpp"
+
+#include <opencv2/gapi/util/throw.hpp>
+
+cv::gapi::own::Latch::Latch(const uint64_t expected)
+    : m_expected(expected) {
+}
+
+void cv::gapi::own::Latch::count_down() {
+    std::lock_guard<std::mutex> lk{m_mutex};
+    --m_expected;
+    if (m_expected == 0) {
+        m_all_done.notify_all();
+    }
+}
+
+void cv::gapi::own::Latch::wait() {
+    std::unique_lock<std::mutex> lk{m_mutex};
+    while (m_expected != 0u) {
+        m_all_done.wait(lk);
+    }
+}
+
+cv::gapi::own::ThreadPool::ThreadPool(const uint32_t num_workers) {
+    m_workers.reserve(num_workers);
+    for (uint32_t i = 0; i < num_workers; ++i) {
+        m_workers.emplace_back(
+                cv::gapi::own::ThreadPool::worker, std::ref(m_queue));
+    }
+}
+
+void cv::gapi::own::ThreadPool::worker(QueueClass<Task>& queue) {
+    while (true) {
+        cv::gapi::own::ThreadPool::Task task;
+        queue.pop(task);
+        if (!task) {
+            break;
+        }
+        task();
+    }
+}
+
+void cv::gapi::own::ThreadPool::schedule(cv::gapi::own::ThreadPool::Task&& task) {
+    m_queue.push(std::move(task));
+};
+
+void cv::gapi::own::ThreadPool::shutdown() {
+    for (size_t i = 0; i < m_workers.size(); ++i) {
+        // NB: Empty task - is an indicator for workers to stop their loops
+        m_queue.push({});
+    }
+    for (auto& worker : m_workers) {
+        worker.join();
+    }
+    m_workers.clear();
+}
+
+cv::gapi::own::ThreadPool::~ThreadPool() {
+    shutdown();
+}
diff --git a/modules/gapi/src/executor/thread_pool.hpp b/modules/gapi/src/executor/thread_pool.hpp
new file mode 100644
index 000000000000..71997bd84fd0
--- /dev/null
+++ b/modules/gapi/src/executor/thread_pool.hpp
@@ -0,0 +1,71 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+#ifndef OPENCV_GAPI_THREAD_POOL_HPP
+#define OPENCV_GAPI_THREAD_POOL_HPP
+
+#include <functional>
+#include <vector>
+#include <thread>
+#include <mutex>
+#include <atomic>
+#include <condition_variable>
+
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+
+#if defined(HAVE_TBB)
+#  include <tbb/concurrent_queue.h> // FIXME: drop it from here!
+template<typename T> using QueueClass = tbb::concurrent_bounded_queue<T>;
+#else
+#  include "executor/conc_queue.hpp"
+template<typename T> using QueueClass = cv::gapi::own::concurrent_bounded_queue<T>;
+#endif // TBB
+
+namespace cv {
+namespace gapi {
+namespace own {
+
+// NB: Only for tests
+class GAPI_EXPORTS Latch {
+public:
+    explicit Latch(const uint64_t expected);
+
+    Latch(const Latch&) = delete;
+    Latch& operator=(const Latch&) = delete;
+
+    void count_down();
+    void wait();
+
+private:
+    uint64_t                m_expected;
+    std::mutex              m_mutex;
+    std::condition_variable m_all_done;
+};
+
+// NB: Only for tests
+class GAPI_EXPORTS ThreadPool {
+public:
+    using Task = std::function<void()>;
+    explicit ThreadPool(const uint32_t num_workers);
+
+    ThreadPool(const ThreadPool&) = delete;
+    ThreadPool& operator=(const ThreadPool&) = delete;
+
+    void schedule(Task&& task);
+    ~ThreadPool();
+
+private:
+    static void worker(QueueClass<Task>& queue);
+    void shutdown();
+
+private:
+    std::vector<std::thread> m_workers;
+    QueueClass<Task>         m_queue;
+};
+
+}}} // namespace cv::gapi::own
+
+#endif // OPENCV_GAPI_THREAD_POOL_HPP
diff --git a/modules/gapi/src/streaming/onevpl/file_data_provider.hpp b/modules/gapi/src/streaming/onevpl/file_data_provider.hpp
index 10171999a0cc..129c6a9d6845 100644
--- a/modules/gapi/src/streaming/onevpl/file_data_provider.hpp
+++ b/modules/gapi/src/streaming/onevpl/file_data_provider.hpp
@@ -18,6 +18,14 @@ namespace cv {
 namespace gapi {
 namespace wip {
 namespace onevpl {
+
+// With gcc13, std::unique_ptr(FILE, decltype(&fclose)> causes ignored-attributes warning.
+// See https://stackoverflow.com/questions/76849365/can-we-add-attributes-to-standard-function-declarations-without-breaking-standar
+#if defined(__GNUC__) && (__GNUC__ >= 13)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+
 struct GAPI_EXPORTS FileDataProvider : public IDataProvider {
 
     using file_ptr = std::unique_ptr<FILE, decltype(&fclose)>;
@@ -34,6 +42,11 @@ struct GAPI_EXPORTS FileDataProvider : public IDataProvider {
     mfx_codec_id_type codec;
     const uint32_t bitstream_data_size;
 };
+
+#if defined(__GNUC__) && (__GNUC__ == 13)
+#pragma GCC diagnostic pop
+#endif
+
 } // namespace onevpl
 } // namespace wip
 } // namespace gapi
diff --git a/modules/gapi/src/streaming/queue_source.cpp b/modules/gapi/src/streaming/queue_source.cpp
new file mode 100644
index 000000000000..59fde09c44cf
--- /dev/null
+++ b/modules/gapi/src/streaming/queue_source.cpp
@@ -0,0 +1,98 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#include <chrono>
+#include <atomic>
+
+#include <ade/util/zip_range.hpp>
+
+#include <opencv2/gapi/streaming/queue_source.hpp>
+#include <opencv2/gapi/streaming/meta.hpp>
+
+#include "executor/conc_queue.hpp"
+
+namespace cv {
+namespace gapi {
+namespace wip {
+
+class QueueSourceBase::Priv {
+public:
+    explicit Priv(const cv::GMetaArg &meta) {
+        m = meta;
+        halted = false;
+    }
+
+    cv::GMetaArg m;
+    cv::gapi::own::concurrent_bounded_queue<cv::GRunArg> q;
+    int64_t c = 0;
+    std::atomic<bool> halted;
+};
+
+QueueSourceBase::QueueSourceBase(const cv::GMetaArg &m)
+    : m_priv(new Priv(m)) {
+}
+
+void QueueSourceBase::push(Data &&data) {
+
+    // Tag data with seq_id/ts
+    const auto now = std::chrono::system_clock::now();
+    const auto dur = std::chrono::duration_cast<std::chrono::microseconds>
+        (now.time_since_epoch());
+    data.meta[cv::gapi::streaming::meta_tag::timestamp] = int64_t{dur.count()};
+    data.meta[cv::gapi::streaming::meta_tag::seq_id]    = int64_t{m_priv->c++};
+
+    m_priv->q.push(data);
+}
+
+bool QueueSourceBase::pull(Data &data) {
+    m_priv->q.pop(data);
+
+    if (m_priv->halted) {
+        return false;
+    }
+    return true;
+}
+
+void QueueSourceBase::halt() {
+    m_priv->halted.store(true);
+    m_priv->q.push(cv::GRunArg{});
+}
+
+cv::GMetaArg QueueSourceBase::descr_of() const {
+    return m_priv->m;
+}
+
+QueueInput::QueueInput(const cv::GMetaArgs &args) {
+    for (auto &&m : args) {
+        m_sources.emplace_back(new cv::gapi::wip::QueueSourceBase(m));
+    }
+}
+
+void QueueInput::push(cv::GRunArgs &&args) {
+    GAPI_Assert(m_sources.size() == args.size());
+    for (auto && it : ade::util::zip(ade::util::toRange(m_sources),
+                                     ade::util::toRange(args)))
+    {
+        auto &src = std::get<0>(it);
+        auto &obj = std::get<1>(it);
+
+        Data d;
+        d = obj;
+        src->push(std::move(d));
+    }
+}
+
+QueueInput::operator cv::GRunArgs () {
+    cv::GRunArgs args;
+    for (auto &&s : m_sources) {
+        args.push_back(s->ptr());
+    }
+    return args;
+}
+
+} // wip
+} // gapi
+} // cv
diff --git a/modules/gapi/test/common/gapi_core_tests_inl.hpp b/modules/gapi/test/common/gapi_core_tests_inl.hpp
index 11b6e066a64e..ae81ca205514 100644
--- a/modules/gapi/test/common/gapi_core_tests_inl.hpp
+++ b/modules/gapi/test/common/gapi_core_tests_inl.hpp
@@ -1699,7 +1699,7 @@ namespace {
             return cv::MediaFrame::View(std::move(pp), std::move(ss));
         }
     };
-};
+}
 
 namespace {
     class TestMediaGray final : public cv::MediaFrame::IAdapter {
@@ -1718,7 +1718,7 @@ namespace {
             return cv::MediaFrame::View(std::move(pp), std::move(ss));
         }
     };
-};
+}
 
 TEST_P(SizeMFTest, ParseTest)
 {
diff --git a/modules/gapi/test/common/gapi_render_tests.cpp b/modules/gapi/test/common/gapi_render_tests.cpp
index e29406d7835c..abfef991212a 100644
--- a/modules/gapi/test/common/gapi_render_tests.cpp
+++ b/modules/gapi/test/common/gapi_render_tests.cpp
@@ -92,6 +92,6 @@ void blendImageRef(cv::Mat& mat, const cv::Point& org, const cv::Mat& img, const
     roi32f += img32f;
 
     roi32f.convertTo(roi, CV_8U, 255.0);
-};
+}
 
 } // namespace opencv_test
diff --git a/modules/gapi/test/common/gapi_render_tests.hpp b/modules/gapi/test/common/gapi_render_tests.hpp
index 30caca9e6dfe..73924d96ac94 100644
--- a/modules/gapi/test/common/gapi_render_tests.hpp
+++ b/modules/gapi/test/common/gapi_render_tests.hpp
@@ -115,7 +115,7 @@ struct Fixture : public RenderNV12TestBase API {                  \
     __WRAP_VAARGS(DEFINE_SPECIFIC_PARAMS_##Number(__VA_ARGS__))   \
     Fixture() {                                                   \
         Init(sz_);                                                \
-    };                                                            \
+    }                                                             \
 };
 
 #define GAPI_RENDER_TEST_FIXTURE_BGR(Fixture, API, Number, ...)  \
@@ -123,7 +123,7 @@ struct Fixture : public RenderBGRTestBase API {                  \
     __WRAP_VAARGS(DEFINE_SPECIFIC_PARAMS_##Number(__VA_ARGS__))   \
     Fixture() {                                                   \
         Init(sz_);                                                \
-    };                                                            \
+    }                                                             \
 };
 
 #define GET_VA_ARGS(...) __VA_ARGS__
diff --git a/modules/gapi/test/common/gapi_tests_common.hpp b/modules/gapi/test/common/gapi_tests_common.hpp
index f84ee05f4984..2b8ee25512a0 100644
--- a/modules/gapi/test/common/gapi_tests_common.hpp
+++ b/modules/gapi/test/common/gapi_tests_common.hpp
@@ -370,7 +370,7 @@ class TestFunctional
             initMatByPointsVectorRandU<Pt<double>>(sz_in);
             break;
         case CV_16F:
-            initMatByPointsVectorRandU<Pt<cv::float16_t>>(sz_in);
+            initMatByPointsVectorRandU<Pt<cv::hfloat>>(sz_in);
             break;
         default:
             GAPI_Error("Unsupported depth");
diff --git a/modules/gapi/test/cpu/gapi_ocv_stateful_kernel_tests.cpp b/modules/gapi/test/cpu/gapi_ocv_stateful_kernel_tests.cpp
index b462e701f236..b9985e1377a4 100644
--- a/modules/gapi/test/cpu/gapi_ocv_stateful_kernel_tests.cpp
+++ b/modules/gapi/test/cpu/gapi_ocv_stateful_kernel_tests.cpp
@@ -165,7 +165,7 @@ namespace
             out = true;
         }
     };
-};
+}
 
 TEST(StatefulKernel, StateInitOnceInRegularMode)
 {
@@ -190,7 +190,7 @@ TEST(StatefulKernel, StateInitOnceInRegularMode)
         EXPECT_TRUE(params.pSetupsCount != nullptr);
         EXPECT_EQ(1, *params.pSetupsCount);
     }
-};
+}
 
 struct StateInitOnce : public ::testing::TestWithParam<bool>{};
 TEST_P(StateInitOnce, StreamingCompiledWithMeta)
@@ -454,6 +454,13 @@ namespace
 
 TEST(StatefulKernel, StateIsInitViaCompArgsInStreaming)
 {
+    // This test is long as it runs BG subtractor (a) twice
+    // (in G-API + for reference) over (b) two files. In fact
+    // it is one more BG Subtractor accuracy test, but not
+    // a stateful initialization test -- the latter must be
+    // done through a light-weight mock object. So for now:
+    applyTestTag(CV_TEST_TAG_VERYLONG);
+
     // G-API graph declaration
     cv::GMat in;
     cv::GMat out = GBackSub::on(in);
diff --git a/modules/gapi/test/cpu/gapi_ot_tests_cpu.cpp b/modules/gapi/test/cpu/gapi_ot_tests_cpu.cpp
new file mode 100644
index 000000000000..c02185e6211e
--- /dev/null
+++ b/modules/gapi/test/cpu/gapi_ot_tests_cpu.cpp
@@ -0,0 +1,287 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+
+#include "../test_precomp.hpp"
+
+#include <opencv2/gapi/ot.hpp>
+#include <opencv2/gapi/cpu/ot.hpp>
+
+#include "opencv2/gapi/streaming/meta.hpp"
+#include "opencv2/gapi/streaming/cap.hpp"
+
+namespace {
+cv::gapi::ot::TrackingStatus from_string(const std::string& status) {
+    if (status == "NEW") {
+        return cv::gapi::ot::TrackingStatus::NEW;
+    }
+    else if (status == "TRACKED") {
+        return cv::gapi::ot::TrackingStatus::TRACKED;
+    }
+    else if (status == "LOST") {
+        return cv::gapi::ot::TrackingStatus::LOST;
+    }
+
+    throw std::runtime_error("String representation for cv::gapi::ot::TrackingStatus: \""
+        + status + "\" contains incorrect value!");
+}
+} // anonymous namespace
+
+namespace opencv_test {
+struct FrameDetections {
+    std::size_t frame_no{};
+    std::vector<std::vector<cv::Rect>> boxes;
+    std::vector<std::vector<int32_t>> box_ids;
+    FrameDetections() {}
+    FrameDetections(std::size_t in_frame_no, const std::vector<std::vector<cv::Rect>>& in_boxes,
+                    const std::vector<std::vector<int32_t>>& in_box_ids) :
+        frame_no(in_frame_no),
+        boxes(in_boxes),
+        box_ids(in_box_ids) {}
+};
+
+struct TrackerReference {
+    std::size_t frame_no{};
+    std::vector<std::vector<cv::Rect>> tracked_boxes;
+    std::vector<std::vector<int32_t>> tracked_box_ids;
+    std::vector<std::vector<uint64_t>> tracking_ids;
+    std::vector<std::vector<cv::gapi::ot::TrackingStatus>> tracking_statuses;
+    TrackerReference() {}
+    TrackerReference(std::size_t in_frame_no,
+                     const std::vector<std::vector<cv::Rect>>& in_tracked_boxes,
+                     const std::vector<std::vector<int32_t>>& in_tracked_box_ids,
+                     const std::vector<std::vector<uint64_t>>& in_tracking_ids,
+                     const std::vector<std::vector<cv::gapi::ot::TrackingStatus>>&
+                         in_tracking_statuses) :
+        frame_no(in_frame_no),
+        tracked_boxes(in_tracked_boxes),
+        tracked_box_ids(in_tracked_box_ids),
+        tracking_ids(in_tracking_ids),
+        tracking_statuses(in_tracking_statuses) {}
+};
+
+struct FrameDetectionsParams {
+    FrameDetections value;
+};
+
+struct TrackerReferenceParams {
+    TrackerReference value;
+};
+} // namespace opencv_test
+
+namespace cv {
+    namespace detail {
+        template<> struct CompileArgTag<opencv_test::FrameDetectionsParams> {
+            static const char* tag() {
+                return "org.opencv.test.frame_detections_params";
+            }
+        };
+    } // namespace detail
+
+    namespace detail {
+        template<> struct CompileArgTag<opencv_test::TrackerReferenceParams> {
+            static const char* tag() {
+                return "org.opencv.test.tracker_reference_params";
+            }
+        };
+    } // namespace detail
+} // namespace cv
+
+namespace opencv_test {
+G_API_OP(CvVideo768x576_Detect, <std::tuple<cv::GArray<cv::Rect>, cv::GArray<int32_t>>(cv::GMat)>,
+    "test.custom.cv_video_768x576_detect") {
+    static std::tuple<cv::GArrayDesc, cv::GArrayDesc> outMeta(cv::GMatDesc) {
+        return std::make_tuple(cv::empty_array_desc(), cv::empty_array_desc());
+    }
+};
+
+GAPI_OCV_KERNEL_ST(OCV_CvVideo768x576_Detect, CvVideo768x576_Detect, FrameDetections) {
+    static void setup(cv::GMatDesc,
+                      std::shared_ptr<FrameDetections> &state,
+                      const cv::GCompileArgs &compileArgs) {
+        auto params = cv::gapi::getCompileArg<opencv_test::FrameDetectionsParams>(compileArgs)
+            .value_or(opencv_test::FrameDetectionsParams{ });
+        state = std::make_shared<FrameDetections>(params.value);
+    }
+
+    static void run(const cv::Mat&,
+                    std::vector<cv::Rect> &out_boxes,
+                    std::vector<int32_t> &out_box_ids,
+                    FrameDetections &state) {
+        if (state.frame_no < state.boxes.size()) {
+            out_boxes = state.boxes[state.frame_no];
+            out_box_ids = state.box_ids[state.frame_no];
+            ++state.frame_no;
+        }
+    }
+};
+
+G_API_OP(CheckTrackerResults, <cv::GOpaque<bool>(cv::GArray<cv::Rect>, cv::GArray<int32_t>,
+                                                 cv::GArray<uint64_t>, cv::GArray<int>)>,
+    "test.custom.check_tracker_results") {
+    static cv::GOpaqueDesc outMeta(cv::GArrayDesc, cv::GArrayDesc, cv::GArrayDesc, cv::GArrayDesc) {
+        return cv::empty_gopaque_desc();
+    }
+};
+
+GAPI_OCV_KERNEL_ST(OCVCheckTrackerResults, CheckTrackerResults, TrackerReference) {
+    static void setup(cv::GArrayDesc, cv::GArrayDesc,
+                      cv::GArrayDesc, cv::GArrayDesc,
+                      std::shared_ptr<TrackerReference> &state,
+                      const cv::GCompileArgs &compileArgs) {
+        auto params = cv::gapi::getCompileArg<opencv_test::TrackerReferenceParams>(compileArgs)
+            .value_or(opencv_test::TrackerReferenceParams{ });
+        state = std::make_shared<TrackerReference>(params.value);
+    }
+
+    static void run(const std::vector<cv::Rect> &in_tr_rcts,
+                    const std::vector<int32_t> &in_det_ids,
+                    const std::vector<uint64_t> &in_tr_ids,
+                    const std::vector<int> &in_tr_statuses,
+                    bool& success,
+                    TrackerReference& state) {
+
+        if (state.frame_no < state.tracked_boxes.size()) {
+            auto reference_boxes = state.tracked_boxes[state.frame_no];
+            auto reference_box_ids = state.tracked_box_ids[state.frame_no];
+            auto reference_tr_ids = state.tracking_ids[state.frame_no];
+            auto reference_tr_statuses = state.tracking_statuses[state.frame_no];
+
+            success = true;
+            GAPI_Assert(in_tr_rcts.size() == reference_boxes.size());
+            GAPI_Assert(in_det_ids.size() == reference_box_ids.size());
+            GAPI_Assert(in_tr_ids.size() == reference_tr_ids.size());
+            GAPI_Assert(in_tr_statuses.size() == reference_tr_statuses.size());
+            for (uint32_t i = 0; (i < in_tr_rcts.size() && success); ++i) {
+                const cv::Rect& reference_rc = reference_boxes[i];
+                const cv::Rect& in_rc = in_tr_rcts[i];
+
+                success &= (reference_rc == in_rc);
+                success &= (reference_box_ids[i] == in_det_ids[i]);
+                success &= (reference_tr_ids[i] == in_tr_ids[i]);
+                success &= (reference_tr_statuses[i] == in_tr_statuses[i]);
+            }
+
+            ++state.frame_no;
+        }
+        else {
+            success = true;
+        }
+    }
+};
+
+TEST(VASObjectTracker, PipelineTest)
+{
+    constexpr int32_t frames_to_handle = 30;
+    std::string pathToVideo = opencv_test::findDataFile("cv/video/768x576.avi");
+
+    std::vector<std::vector<cv::Rect>> input_boxes(frames_to_handle);
+    std::vector<std::vector<int32_t>> input_det_ids(frames_to_handle);
+
+    std::string path_to_boxes = opencv_test::findDataFile("cv/video/vas_object_tracking/detections_30_frames.yml");
+
+    cv::FileStorage fs_input_boxes(path_to_boxes, cv::FileStorage::READ);
+    cv::FileNode fn_input_boxes = fs_input_boxes.root();
+    for (auto it = fn_input_boxes.begin(); it != fn_input_boxes.end(); ++it) {
+        cv::FileNode fn_frame = *it;
+        std::string frame_name = fn_frame.name();
+        int frame_no = std::stoi(frame_name.substr(frame_name.find("_") + 1));
+
+        for (auto fit = fn_frame.begin(); fit != fn_frame.end(); ++fit) {
+            cv::FileNode fn_box = *fit;
+
+            cv::Rect box((int)fn_box["x"], (int)fn_box["y"],
+                (int)fn_box["width"], (int)fn_box["height"]);
+            input_boxes[frame_no].push_back(box);
+            input_det_ids[frame_no].push_back(fn_box["id"]);
+        }
+    }
+
+    std::vector<std::vector<cv::Rect>> reference_trackings(frames_to_handle);
+    std::vector<std::vector<int32_t>> reference_trackings_det_ids(frames_to_handle);
+    std::vector<std::vector<uint64_t>> reference_trackings_tr_ids(frames_to_handle);
+    std::vector<std::vector<cv::gapi::ot::TrackingStatus>> reference_trackings_tr_statuses(frames_to_handle);
+
+    std::string path_to_trackings = opencv_test::findDataFile("cv/video/vas_object_tracking/trackings_30_frames.yml");
+
+    cv::FileStorage fs_reference_trackings(path_to_trackings, cv::FileStorage::READ);
+    cv::FileNode fn_reference_trackings = fs_reference_trackings.root();
+    for (auto it =  fn_reference_trackings.begin(); it != fn_reference_trackings.end(); ++it) {
+        cv::FileNode fn_frame = *it;
+        std::string frame_name = fn_frame.name();
+        int frame_no = std::stoi(frame_name.substr(frame_name.find("_") + 1));
+
+        for (auto fit = fn_frame.begin(); fit != fn_frame.end(); ++fit) {
+            cv::FileNode fn_tracked_box = *fit;
+
+            cv::Rect tracked_box((int)fn_tracked_box["x"], (int)fn_tracked_box["y"],
+                (int)fn_tracked_box["width"], (int)fn_tracked_box["height"]);
+            reference_trackings[frame_no].push_back(tracked_box);
+            reference_trackings_det_ids[frame_no].push_back(fn_tracked_box["id"]);
+            reference_trackings_tr_ids[frame_no].push_back(int(fn_tracked_box["tracking_id"]));
+            reference_trackings_tr_statuses[frame_no].push_back(
+                from_string(fn_tracked_box["tracking_status"]));
+        }
+    }
+
+    cv::GMat in;
+
+    cv::GArray<cv::Rect> detections;
+    cv::GArray<int> det_ids;
+    std::tie(detections, det_ids) = CvVideo768x576_Detect::on(in);
+
+    constexpr float delta_time = 0.055f;
+    cv::GArray<cv::Rect> tracking_rects;
+    cv::GArray<int32_t> tracking_det_ids;
+    cv::GArray<uint64_t> tracking_ids;
+    cv::GArray<int> tracking_statuses;
+    std::tie(tracking_rects, tracking_det_ids, tracking_ids, tracking_statuses) =
+        cv::gapi::ot::track(in, detections, det_ids, delta_time);
+
+    cv::GOpaque<bool> check_result =
+        CheckTrackerResults::on(tracking_rects, tracking_det_ids, tracking_ids, tracking_statuses);
+
+    cv::GComputation ccomp(cv::GIn(in), cv::GOut(check_result));
+
+
+    opencv_test::FrameDetections fds { 0, input_boxes, input_det_ids };
+    opencv_test::TrackerReference tr { 0, reference_trackings,
+                                       reference_trackings_det_ids,
+                                       reference_trackings_tr_ids,
+                                       reference_trackings_tr_statuses };
+
+    // Graph compilation for streaming mode:
+    auto compiled =
+        ccomp.compileStreaming(cv::compile_args(
+            cv::gapi::combine(cv::gapi::kernels<OCV_CvVideo768x576_Detect,
+                                                OCVCheckTrackerResults>(),
+                              cv::gapi::ot::cpu::kernels()),
+            opencv_test::FrameDetectionsParams{ fds },
+            opencv_test::TrackerReferenceParams{ tr }));
+
+    EXPECT_TRUE(compiled);
+    EXPECT_FALSE(compiled.running());
+
+    compiled.setSource<cv::gapi::wip::GCaptureSource>(pathToVideo);
+
+    // Start of streaming:
+    compiled.start();
+    EXPECT_TRUE(compiled.running());
+
+    // Streaming:
+    bool success;
+
+    std::size_t counter { }, limit { 30 };
+    while(compiled.pull(cv::gout(success)) && (counter < limit)) {
+         ++counter;
+     }
+
+     compiled.stop();
+
+     EXPECT_TRUE(success);
+     EXPECT_FALSE(compiled.running());
+}
+} // namespace opencv_test
diff --git a/modules/gapi/test/gapi_async_test.cpp b/modules/gapi/test/gapi_async_test.cpp
index 5a7194a17f8c..7086f47c5c60 100644
--- a/modules/gapi/test/gapi_async_test.cpp
+++ b/modules/gapi/test/gapi_async_test.cpp
@@ -207,7 +207,7 @@ struct CallBack: crtp_cast<crtp_final_t> {
             mtx.unlock();
             cv.notify_one();
         };
-    };
+    }
 
     template<typename... Args >
     void start_async(Args&&... args){
diff --git a/modules/gapi/test/gapi_fluid_test.cpp b/modules/gapi/test/gapi_fluid_test.cpp
index 03c98e3ef386..22884934ed86 100644
--- a/modules/gapi/test/gapi_fluid_test.cpp
+++ b/modules/gapi/test/gapi_fluid_test.cpp
@@ -28,12 +28,12 @@ namespace
     void WriteFunction(uint8_t* row, int nr, int w) {
         for (int i = 0; i < w; i++)
             row[i] = static_cast<uint8_t>(nr+i);
-    };
+    }
     void ReadFunction1x1(const uint8_t* row, int w) {
         for (int i = 0; i < w; i++)
             std::cout << std::setw(4) << static_cast<int>(row[i]) << " ";
         std::cout << "\n";
-    };
+    }
     void ReadFunction3x3(const uint8_t* rows[3], int w) {
         for (int i = 0; i < 3; i++) {
             for (int j = -1; j < w+1; j++) {
@@ -42,7 +42,7 @@ namespace
             std::cout << "\n";
         }
         std::cout << "\n";
-    };
+    }
 }
 
 TEST(FluidBuffer, InputTest)
diff --git a/modules/gapi/test/gapi_kernel_tests.cpp b/modules/gapi/test/gapi_kernel_tests.cpp
index dbb0a7f26989..5adb66875276 100644
--- a/modules/gapi/test/gapi_kernel_tests.cpp
+++ b/modules/gapi/test/gapi_kernel_tests.cpp
@@ -215,7 +215,7 @@ TEST(KernelPackage, RemoveBackend)
     EXPECT_FALSE(pkg.includes<J::Foo>());
     EXPECT_FALSE(pkg.includes<J::Bar>());
     EXPECT_TRUE(pkg.includes<S::Baz>());
-};
+}
 
 TEST(KernelPackage, RemoveAPI)
 {
@@ -228,7 +228,7 @@ TEST(KernelPackage, RemoveAPI)
     pkg.remove<I::Foo>();
     EXPECT_TRUE(pkg.includes<J::Bar>());
     EXPECT_FALSE(pkg.includes<J::Foo>());
-};
+}
 
 TEST(KernelPackage, CreateHetero)
 {
diff --git a/modules/gapi/test/gapi_mat_tests.cpp b/modules/gapi/test/gapi_mat_tests.cpp
new file mode 100644
index 000000000000..0f2d1cc7a20b
--- /dev/null
+++ b/modules/gapi/test/gapi_mat_tests.cpp
@@ -0,0 +1,114 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include <opencv2/gapi/cpu/core.hpp>
+#include <opencv2/gapi/ocl/core.hpp>
+#include <opencv2/gapi/fluid/core.hpp>
+
+namespace opencv_test
+{
+namespace
+{
+enum class KernelPackage: int
+{
+    OCV,
+    OCL,
+    FLUID,
+};
+std::ostream& operator<< (std::ostream &os, const KernelPackage &e)
+{
+    switch (e)
+    {
+#define _C(X) case KernelPackage::X: os << #X; break
+        _C(OCV);
+        _C(OCL);
+        _C(FLUID);
+#undef _C
+    default: GAPI_Error("Unknown package");
+    }
+    return os;
+}
+} // namespace
+
+struct GMatWithValue : public TestWithParam <KernelPackage> {
+    cv::GKernelPackage getKernelPackage() {
+        switch (GetParam()) {
+        case KernelPackage::OCV: return cv::gapi::core::cpu::kernels();
+        case KernelPackage::OCL: return cv::gapi::core::ocl::kernels();
+        case KernelPackage::FLUID: return cv::gapi::core::fluid::kernels();
+        default: GAPI_Error("Unknown package");
+        }
+    }
+};
+
+TEST_P(GMatWithValue, SingleIsland)
+{
+    cv::Size sz(2, 2);
+    cv::Mat in_mat = cv::Mat::eye(sz, CV_8U);
+
+    cv::GComputationT<cv::GMat(cv::GMat)> addEye([&](cv::GMat in) {
+        return in + cv::GMat(cv::Mat::eye(sz, CV_8U));
+    });
+
+    cv::Mat out_mat;
+    addEye.apply(in_mat, out_mat, cv::compile_args(cv::gapi::use_only{getKernelPackage()}));
+
+    cv::Mat out_mat_ref = in_mat*2;
+    EXPECT_EQ(0, cvtest::norm(out_mat, out_mat_ref, NORM_INF));
+}
+
+TEST_P(GMatWithValue, GraphWithNoInput)
+{
+    cv::Mat cval = cv::Mat::eye(cv::Size(2, 2), CV_8U);
+    cv::GMat gval = cv::GMat(cval);
+    cv::GMat out = cv::gapi::bitwise_not(gval);
+
+    cv::Mat out_mat;
+    cv::GComputation f(cv::GIn(), cv::GOut(out));
+
+    // Compiling this isn't supported for now
+    EXPECT_ANY_THROW(f.compile(cv::descr_of(cval),
+                               cv::compile_args(cv::gapi::use_only{getKernelPackage()})));
+}
+
+INSTANTIATE_TEST_CASE_P(GAPI_GMat, GMatWithValue,
+                        Values(KernelPackage::OCV,
+                               KernelPackage::OCL,
+                               KernelPackage::FLUID));
+
+TEST(GAPI_MatWithValue, MultipleIslands)
+{
+    // This test employs a non-trivial island fusion process
+    // as there's multiple backends in the graph
+
+    cv::Size sz(2, 2);
+    cv::Mat cval2 = cv::Mat::eye(sz, CV_8U) * 2;
+    cv::Mat cval1 = cv::Mat::eye(sz, CV_8U);
+
+    cv::GMat in;
+    cv::GMat tmp = in  + cv::GMat(cval2); // Will be a Fluid operation
+    cv::GMat out = tmp - cv::GMat(cval1); // Will be an OCV operation
+
+    cv::GKernelPackage fluid_kernels = cv::gapi::core::fluid::kernels();
+    cv::GKernelPackage opencv_kernels = cv::gapi::core::cpu::kernels();
+    fluid_kernels.remove<cv::gapi::core::GSub>();
+    opencv_kernels.remove<cv::gapi::core::GAdd>();
+    auto kernels = cv::gapi::combine(fluid_kernels, opencv_kernels);
+
+    cv::Mat in_mat = cv::Mat::zeros(sz, CV_8U);
+    cv::Mat out_mat;
+    auto cc = cv::GComputation(in, out)
+        .compile(cv::descr_of(in_mat),
+                 cv::compile_args(cv::gapi::use_only{kernels}));
+    cc(cv::gin(in_mat), cv::gout(out_mat));
+
+    EXPECT_EQ(0, cvtest::norm(out_mat, cv::Mat::eye(sz, CV_8U), NORM_INF));
+}
+
+} // namespace opencv_test
diff --git a/modules/gapi/test/gapi_sample_pipelines.cpp b/modules/gapi/test/gapi_sample_pipelines.cpp
index da71cd0ab03b..4b5520ca42c7 100644
--- a/modules/gapi/test/gapi_sample_pipelines.cpp
+++ b/modules/gapi/test/gapi_sample_pipelines.cpp
@@ -13,6 +13,8 @@
 
 #include <opencv2/gapi/core.hpp>
 
+#include "executor/thread_pool.hpp"
+
 namespace opencv_test
 {
 
@@ -67,6 +69,38 @@ namespace
         }
     };
 
+    G_TYPED_KERNEL(GBusyWait, <GMat(GMat, uint32_t)>, "org.busy_wait") {
+        static GMatDesc outMeta(GMatDesc in, uint32_t)
+        {
+            return in;
+        }
+    };
+
+    GAPI_OCV_KERNEL(GOCVBusyWait, GBusyWait)
+    {
+        static void run(const cv::Mat& in,
+                        const uint32_t time_in_ms,
+                        cv::Mat&       out)
+        {
+            using namespace std::chrono;
+            auto s = high_resolution_clock::now();
+            in.copyTo(out);
+            auto e = high_resolution_clock::now();
+
+            const auto elapsed_in_ms =
+                static_cast<int32_t>(duration_cast<milliseconds>(e-s).count());
+
+            int32_t diff = time_in_ms - elapsed_in_ms;
+            const auto need_to_wait_in_ms = static_cast<uint32_t>(std::max(0, diff));
+
+            s = high_resolution_clock::now();
+            e = s;
+            while (duration_cast<milliseconds>(e-s).count() < need_to_wait_in_ms) {
+                e = high_resolution_clock::now();
+            }
+        }
+    };
+
     // These definitions test the correct macro work if the kernel has multiple output values
     G_TYPED_KERNEL(GRetGArrayTupleOfGMat2Kernel,  <GArray<std::tuple<GMat, GMat>>(GMat, Scalar)>,                                         "org.opencv.test.retarrayoftupleofgmat2kernel")  {};
     G_TYPED_KERNEL(GRetGArraTupleyOfGMat3Kernel,  <GArray<std::tuple<GMat, GMat, GMat>>(GMat)>,                                           "org.opencv.test.retarrayoftupleofgmat3kernel")  {};
@@ -513,4 +547,29 @@ TEST(GAPI_Pipeline, 1DMatWithinSingleIsland)
     EXPECT_EQ(0, cv::norm(out_mat, ref_mat));
 }
 
+TEST(GAPI_Pipeline, BranchesExecutedInParallel)
+{
+    cv::GMat in;
+    // NB: cv::gapi::copy used to prevent fusing OCV backend operations
+    // into the single island where they will be executed in turn
+    auto out0 = GBusyWait::on(cv::gapi::copy(in), 1000u /*1sec*/);
+    auto out1 = GBusyWait::on(cv::gapi::copy(in), 1000u /*1sec*/);
+    auto out2 = GBusyWait::on(cv::gapi::copy(in), 1000u /*1sec*/);
+    auto out3 = GBusyWait::on(cv::gapi::copy(in), 1000u /*1sec*/);
+
+    cv::GComputation comp(cv::GIn(in), cv::GOut(out0,out1,out2,out3));
+    cv::Mat in_mat = cv::Mat::eye(32, 32, CV_8UC1);
+    cv::Mat out_mat0, out_mat1, out_mat2, out_mat3;
+
+    using namespace std::chrono;
+    auto s = high_resolution_clock::now();
+    comp.apply(cv::gin(in_mat), cv::gout(out_mat0, out_mat1, out_mat2, out_mat3),
+               cv::compile_args(cv::use_threaded_executor(4u),
+                                cv::gapi::kernels<GOCVBusyWait>()));
+    auto e = high_resolution_clock::now();
+    const auto elapsed_in_ms = duration_cast<milliseconds>(e-s).count();;
+
+    EXPECT_GE(1200u, elapsed_in_ms);
+}
+
 } // namespace opencv_test
diff --git a/modules/gapi/test/infer/gapi_infer_ie_test.cpp b/modules/gapi/test/infer/gapi_infer_ie_test.cpp
index 58e37040e899..3998d68099fd 100644
--- a/modules/gapi/test/infer/gapi_infer_ie_test.cpp
+++ b/modules/gapi/test/infer/gapi_infer_ie_test.cpp
@@ -6,7 +6,7 @@
 
 #include "../test_precomp.hpp"
 
-#ifdef HAVE_INF_ENGINE
+#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2023010000
 
 #include <stdexcept>
 #include <mutex>
@@ -187,8 +187,8 @@ std::string compileAgeGenderBlob(const std::string& device) {
         cv::gapi::ie::detail::ParamDesc params;
         const std::string model_name = "age-gender-recognition-retail-0013";
         const std::string output  = model_name + ".blob";
-        params.model_path   = findDataFile(SUBDIR + model_name + ".xml");
-        params.weights_path = findDataFile(SUBDIR + model_name + ".bin");
+        params.model_path   = findDataFile(SUBDIR + model_name + ".xml", false);
+        params.weights_path = findDataFile(SUBDIR + model_name + ".bin", false);
         params.device_id    = device;
         compileBlob(params, output, IE::Precision::U8);
         return output;
@@ -205,8 +205,8 @@ TEST(TestAgeGenderIE, InferBasicTensor)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -256,8 +256,8 @@ TEST(TestAgeGenderIE, InferBasicImage)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // FIXME: Ideally it should be an image from disk
@@ -334,8 +334,8 @@ struct InferWithReshape: public ::testing::Test {
         reshape_dims = {1, 3, 70, 70};
 
         initDLDTDataPath();
-        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
 
         params.device_id = "CPU";
 
@@ -432,8 +432,8 @@ struct ROIList: public ::testing::Test {
 
     void SetUp() {
         initDLDTDataPath();
-        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         params.device_id = "CPU";
 
         // FIXME: it must be cv::imread(findDataFile("../dnn/grace_hopper_227.png", false));
@@ -505,8 +505,8 @@ struct ROIListNV12: public ::testing::Test {
 
     void SetUp() {
         initDLDTDataPath();
-        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         params.device_id = "CPU";
 
         cv::Size sz{320, 240};
@@ -585,8 +585,8 @@ struct SingleROI: public ::testing::Test {
 
     void SetUp() {
         initDLDTDataPath();
-        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         params.device_id = "CPU";
 
         // FIXME: it must be cv::imread(findDataFile("../dnn/grace_hopper_227.png", false));
@@ -644,8 +644,8 @@ struct SingleROINV12: public ::testing::Test {
 
     void SetUp() {
         initDLDTDataPath();
-        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         params.device_id = "CPU";
 
         cv::Size sz{320, 240};
@@ -809,8 +809,8 @@ TEST(TestAgeGenderIE, GenericInfer)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     cv::Mat in_mat(cv::Size(320, 240), CV_8UC3);
@@ -859,8 +859,8 @@ TEST(TestAgeGenderIE, InvalidConfigGeneric)
 {
     initDLDTDataPath();
 
-    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     std::string device_id    = "CPU";
 
     // Configure & run G-API
@@ -885,8 +885,8 @@ TEST(TestAgeGenderIE, CPUConfigGeneric)
 {
     initDLDTDataPath();
 
-    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     std::string device_id    = "CPU";
 
     // Configure & run G-API
@@ -912,8 +912,8 @@ TEST(TestAgeGenderIE, InvalidConfig)
 {
     initDLDTDataPath();
 
-    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     std::string device_id    = "CPU";
 
     using AGInfo = std::tuple<cv::GMat, cv::GMat>;
@@ -937,8 +937,8 @@ TEST(TestAgeGenderIE, CPUConfig)
 {
     initDLDTDataPath();
 
-    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     std::string device_id    = "CPU";
 
     using AGInfo = std::tuple<cv::GMat, cv::GMat>;
@@ -1017,8 +1017,8 @@ TEST(TestAgeGenderIE, MediaInputNV12)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     cv::Size sz{320, 240};
@@ -1082,8 +1082,8 @@ TEST(TestAgeGenderIE, MediaInputBGR)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     cv::Size sz{320, 240};
@@ -1134,8 +1134,8 @@ TEST(InferROI, MediaInputBGR)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     cv::Size sz{320, 240};
@@ -1196,8 +1196,8 @@ TEST(InferROI, MediaInputNV12)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     cv::Size sz{320, 240};
@@ -1582,13 +1582,16 @@ TEST(Infer, SetInvalidNumberOfRequests)
 
 TEST(Infer, TestStreamingInfer)
 {
+    if (cvtest::skipUnstableTests)
+        throw SkipTestException("Skip InferROI.TestStreamingInfer as it hangs sporadically");
+
     initDLDTDataPath();
 
     std::string filepath = findDataFile("cv/video/768x576.avi");
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -1649,13 +1652,16 @@ TEST(Infer, TestStreamingInfer)
 
 TEST(InferROI, TestStreamingInfer)
 {
+    if (cvtest::skipUnstableTests)
+        throw SkipTestException("Skip InferROI.TestStreamingInfer as it hangs sporadically");
+
     initDLDTDataPath();
 
     std::string filepath = findDataFile("cv/video/768x576.avi");
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -1727,13 +1733,16 @@ TEST(InferROI, TestStreamingInfer)
 
 TEST(InferList, TestStreamingInfer)
 {
+    if (cvtest::skipUnstableTests)
+        throw SkipTestException("Skip InferList.TestStreamingInfer as it hangs sporadically");
+
     initDLDTDataPath();
 
     std::string filepath = findDataFile("cv/video/768x576.avi");
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -1821,8 +1830,8 @@ TEST(Infer2, TestStreamingInfer)
     std::string filepath = findDataFile("cv/video/768x576.avi");
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -1911,8 +1920,8 @@ TEST(InferEmptyList, TestStreamingInfer)
     std::string filepath = findDataFile("cv/video/768x576.avi");
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -1965,8 +1974,8 @@ TEST(Infer2EmptyList, TestStreamingInfer)
     std::string filepath = findDataFile("cv/video/768x576.avi");
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -2294,8 +2303,8 @@ struct LimitedSourceInfer: public ::testing::Test {
 
     GStreamingCompiled compileStreaming(int nireq) {
         cv::gapi::ie::detail::ParamDesc params;
-        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         params.device_id = "CPU";
 
         auto pp = cv::gapi::ie::Params<AgeGender> {
@@ -2348,8 +2357,8 @@ TEST(TestAgeGenderIE, InferWithBatch)
 
     constexpr int batch_size = 4;
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     cv::Mat in_mat({batch_size, 3, 62, 62}, CV_8U);
@@ -3091,8 +3100,8 @@ struct AgeGenderInferTest: public ::testing::Test {
 
     void SetUp() {
         initDLDTDataPath();
-        m_params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        m_params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        m_params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        m_params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         m_params.device_id = "CPU";
 
         m_plugin = cv::gimpl::ie::wrap::getPlugin(m_params);
@@ -3191,8 +3200,8 @@ TEST(TestAgeGenderIE, InferTensorWithPreproc) {
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
diff --git a/modules/gapi/test/infer/gapi_infer_ov_tests.cpp b/modules/gapi/test/infer/gapi_infer_ov_tests.cpp
index 09b54c1a46d5..49652db387f7 100644
--- a/modules/gapi/test/infer/gapi_infer_ov_tests.cpp
+++ b/modules/gapi/test/infer/gapi_infer_ov_tests.cpp
@@ -255,8 +255,8 @@ class AGNetOVComp {
 struct BaseAgeGenderOV: public ::testing::Test {
     BaseAgeGenderOV() {
         initDLDTDataPath();
-        xml_path  = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        bin_path  = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        xml_path  = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        bin_path  = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         device    = "CPU";
         blob_path = "age-gender-recognition-retail-0013.blob";
     }
@@ -319,8 +319,174 @@ struct TestAgeGenderListOV : public BaseAgeGenderOV {
     }
 };
 
+class TestMediaBGR final: public cv::MediaFrame::IAdapter {
+    cv::Mat m_mat;
+    using Cb = cv::MediaFrame::View::Callback;
+    Cb m_cb;
+
+public:
+    explicit TestMediaBGR(cv::Mat m, Cb cb = [](){})
+        : m_mat(m), m_cb(cb) {
+    }
+    cv::GFrameDesc meta() const override {
+        return cv::GFrameDesc{cv::MediaFormat::BGR, cv::Size(m_mat.cols, m_mat.rows)};
+    }
+    cv::MediaFrame::View access(cv::MediaFrame::Access) override {
+        cv::MediaFrame::View::Ptrs pp = { m_mat.ptr(), nullptr, nullptr, nullptr };
+        cv::MediaFrame::View::Strides ss = { m_mat.step, 0u, 0u, 0u };
+        return cv::MediaFrame::View(std::move(pp), std::move(ss), Cb{m_cb});
+    }
+};
+
+struct MediaFrameTestAgeGenderOV: public ::testing::Test {
+    MediaFrameTestAgeGenderOV() {
+        initDLDTDataPath();
+        xml_path  = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        bin_path  = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
+        device    = "CPU";
+        blob_path = "age-gender-recognition-retail-0013.blob";
+
+        cv::Size sz{62, 62};
+        m_in_mat = cv::Mat(sz, CV_8UC3);
+        cv::resize(m_in_mat, m_in_mat, sz);
+
+        m_in_y = cv::Mat{sz, CV_8UC1};
+        cv::randu(m_in_y, 0, 255);
+        m_in_uv = cv::Mat{sz / 2, CV_8UC2};
+        cv::randu(m_in_uv, 0, 255);
+    }
+
+    cv::Mat m_in_y;
+    cv::Mat m_in_uv;
+
+    cv::Mat m_in_mat;
+
+    cv::Mat m_out_ov_age;
+    cv::Mat m_out_ov_gender;
+
+    cv::Mat m_out_gapi_age;
+    cv::Mat m_out_gapi_gender;
+
+    std::string xml_path;
+    std::string bin_path;
+    std::string blob_path;
+    std::string device;
+    std::string image_path;
+
+    using AGInfo = std::tuple<cv::GMat, cv::GMat>;
+    G_API_NET(AgeGender, <AGInfo(cv::GMat)>, "typed-age-gender");
+
+    void validate() {
+        normAssert(m_out_ov_age,    m_out_gapi_age,    "0: Test age output");
+        normAssert(m_out_ov_gender, m_out_gapi_gender, "0: Test gender output");
+    }
+}; // MediaFrameTestAgeGenderOV
+
 } // anonymous namespace
 
+TEST_F(MediaFrameTestAgeGenderOV, InferMediaInputBGR)
+{
+    // OpenVINO
+    AGNetOVComp ref(xml_path, bin_path, device);
+    ref.cfgPrePostProcessing([](ov::preprocess::PrePostProcessor &ppp) {
+        ppp.input().tensor().set_element_type(ov::element::u8);
+        ppp.input().tensor().set_layout("NHWC");
+    });
+    ref.compile()(m_in_mat, m_out_ov_age, m_out_ov_gender);
+
+    // G-API
+    cv::GFrame in;
+    cv::GMat age, gender;
+    std::tie(age, gender) = cv::gapi::infer<AgeGender>(in);
+    cv::GComputation comp{cv::GIn(in), cv::GOut(age, gender)};
+
+    auto frame = MediaFrame::Create<TestMediaBGR>(m_in_mat);
+    auto pp = cv::gapi::ov::Params<AgeGender> {
+        xml_path, bin_path, device
+    }.cfgOutputLayers({ "age_conv3", "prob" });
+
+    comp.apply(cv::gin(frame),
+               cv::gout(m_out_gapi_age, m_out_gapi_gender),
+               cv::compile_args(cv::gapi::networks(pp)));
+
+    validate();
+}
+
+TEST_F(MediaFrameTestAgeGenderOV, InferROIGenericMediaInputBGR) {
+    // OpenVINO
+    cv::Rect roi(cv::Rect(cv::Point{20, 25}, cv::Size{16, 16}));
+    auto frame = MediaFrame::Create<TestMediaBGR>(m_in_mat);
+    static constexpr const char* tag = "age-gender-generic";
+
+    // OpenVINO
+    AGNetOVComp ref(xml_path, bin_path, device);
+    ref.cfgPrePostProcessing([](ov::preprocess::PrePostProcessor &ppp) {
+        ppp.input().tensor().set_element_type(ov::element::u8);
+        ppp.input().tensor().set_layout("NHWC");
+    });
+    ref.compile()(m_in_mat, roi, m_out_ov_age, m_out_ov_gender);
+
+    // G-API
+    cv::GFrame in;
+    cv::GOpaque<cv::Rect> rr;
+    GInferInputs inputs;
+    inputs["data"] = in;
+    auto outputs = cv::gapi::infer<cv::gapi::Generic>(tag, rr, inputs);
+    auto age = outputs.at("age_conv3");
+    auto gender = outputs.at("prob");
+    cv::GComputation comp{cv::GIn(in, rr), cv::GOut(age, gender)};
+
+    auto pp = AGNetROIGenComp::params(xml_path, bin_path, device);
+
+    comp.apply(cv::gin(frame, roi), cv::gout(m_out_gapi_age, m_out_gapi_gender),
+               cv::compile_args(cv::gapi::networks(pp)));
+
+    validate();
+}
+
+class TestMediaNV12 final: public cv::MediaFrame::IAdapter {
+    cv::Mat m_y;
+    cv::Mat m_uv;
+
+public:
+    TestMediaNV12(cv::Mat y, cv::Mat uv) : m_y(y), m_uv(uv) {
+    }
+    cv::GFrameDesc meta() const override {
+        return cv::GFrameDesc{cv::MediaFormat::NV12, cv::Size(m_y.cols, m_y.rows)};
+    }
+    cv::MediaFrame::View access(cv::MediaFrame::Access) override {
+        cv::MediaFrame::View::Ptrs pp = {
+            m_y.ptr(), m_uv.ptr(), nullptr, nullptr
+        };
+        cv::MediaFrame::View::Strides ss = {
+            m_y.step, m_uv.step, 0u, 0u
+        };
+        return cv::MediaFrame::View(std::move(pp), std::move(ss));
+    }
+};
+
+TEST_F(MediaFrameTestAgeGenderOV, TestMediaNV12AgeGenderOV)
+{
+    cv::GFrame in;
+    cv::GOpaque<cv::Rect> rr;
+    GInferInputs inputs;
+    inputs["data"] = in;
+    static constexpr const char* tag = "age-gender-generic";
+    auto outputs = cv::gapi::infer<cv::gapi::Generic>(tag, rr, inputs);
+    auto age = outputs.at("age_conv3");
+    auto gender = outputs.at("prob");
+    cv::GComputation comp{cv::GIn(in, rr), cv::GOut(age, gender)};
+
+    auto frame = MediaFrame::Create<TestMediaNV12>(m_in_y, m_in_uv);
+    auto pp = AGNetROIGenComp::params(xml_path, bin_path, device);
+
+    cv::Rect roi(cv::Rect(cv::Point{20, 25}, cv::Size{16, 16}));
+
+    EXPECT_NO_THROW(comp.apply(cv::gin(frame, roi),
+                    cv::gout(m_out_gapi_age, m_out_gapi_gender),
+                    cv::compile_args(cv::gapi::networks(pp))));
+}
+
 // TODO: Make all of tests below parmetrized to avoid code duplication
 TEST_F(TestAgeGenderOV, Infer_Tensor) {
     const auto in_mat = getRandomTensor({1, 3, 62, 62}, CV_32F);
@@ -657,6 +823,187 @@ TEST_F(TestAgeGenderListOV, InferList2Generic_Image) {
     validate();
 }
 
+static ov::element::Type toOV(int depth) {
+    switch (depth) {
+    case CV_8U:  return ov::element::u8;
+    case CV_32S: return ov::element::i32;
+    case CV_32F: return ov::element::f32;
+    case CV_16F: return ov::element::f16;
+    default: GAPI_Error("OV Backend: Unsupported data type");
+    }
+    return ov::element::undefined;
+}
+
+struct TestMeanScaleOV : public ::testing::TestWithParam<int>{
+    G_API_NET(IdentityNet, <cv::GMat(cv::GMat)>, "test-identity-net");
+
+    static cv::GComputation create() {
+        cv::GMat in;
+        cv::GMat out;
+        out = cv::gapi::infer<IdentityNet>(in);
+
+        return cv::GComputation{cv::GIn(in), cv::GOut(out)};
+    }
+
+    using Params = cv::gapi::ov::Params<IdentityNet>;
+    static Params params(const std::string &xml_path,
+                         const std::string &bin_path,
+                         const std::string &device) {
+        return Params {
+            xml_path, bin_path, device
+        }.cfgInputModelLayout("NHWC")
+         .cfgOutputLayers({ "output" });
+    }
+
+    TestMeanScaleOV() {
+        initDLDTDataPath();
+
+        m_model_path = findDataFile("gapi/ov/identity_net_100x100.xml");
+        m_weights_path = findDataFile("gapi/ov/identity_net_100x100.bin");
+        m_device_id = "CPU";
+
+        m_ov_model = cv::gapi::ov::wrap::getCore()
+            .read_model(m_model_path, m_weights_path);
+
+        auto input_depth = GetParam();
+        auto input = cv::imread(findDataFile("gapi/gapi_logo.jpg"));
+        input.convertTo(m_in_mat, input_depth);
+    }
+
+    void addPreprocToOV(
+        std::function<void(ov::preprocess::PrePostProcessor&)> f) {
+
+        auto input_depth = GetParam();
+
+        ov::preprocess::PrePostProcessor ppp(m_ov_model);
+        ppp.input().tensor().set_layout(ov::Layout("NHWC"))
+                            .set_element_type(toOV(input_depth))
+                            .set_shape({ 1, 100, 100, 3 });
+        ppp.input().model().set_layout(ov::Layout("NHWC"));
+        f(ppp);
+        m_ov_model = ppp.build();
+    }
+
+    void runOV() {
+        auto compiled_model = cv::gapi::ov::wrap::getCore()
+            .compile_model(m_ov_model, m_device_id);
+        auto infer_request = compiled_model.create_infer_request();
+
+        auto input_tensor = infer_request.get_input_tensor();
+        cv::gapi::ov::util::to_ov(m_in_mat, input_tensor);
+
+        infer_request.infer();
+
+        auto out_tensor = infer_request.get_tensor("output");
+        m_out_mat_ov.create(cv::gapi::ov::util::to_ocv(out_tensor.get_shape()),
+                            cv::gapi::ov::util::to_ocv(out_tensor.get_element_type()));
+        cv::gapi::ov::util::to_ocv(out_tensor, m_out_mat_ov);
+    }
+
+    std::string m_model_path;
+    std::string m_weights_path;
+    std::string m_device_id;
+
+    std::shared_ptr<ov::Model> m_ov_model;
+
+    cv::Mat m_in_mat;
+    cv::Mat m_out_mat_gapi;
+    cv::Mat m_out_mat_ov;
+};
+
+TEST_P(TestMeanScaleOV, Mean)
+{
+    int input_depth = GetParam();
+
+    std::vector<float> mean_values{ 220.1779, 218.9857, 217.8986 };
+
+    // Run OV reference pipeline:
+    {
+        addPreprocToOV([&](ov::preprocess::PrePostProcessor& ppp) {
+            if (input_depth == CV_8U || input_depth == CV_32S) {
+                ppp.input().preprocess().convert_element_type(ov::element::f32);
+            }
+            ppp.input().preprocess().mean(mean_values);
+            });
+        runOV();
+    }
+
+    // Run G-API
+    GComputation comp = create();
+    auto pp = params(m_model_path, m_weights_path, m_device_id);
+    pp.cfgMean(mean_values);
+
+    comp.apply(cv::gin(m_in_mat), cv::gout(m_out_mat_gapi),
+               cv::compile_args(cv::gapi::networks(pp)));
+
+    // Validate OV results against G-API ones:
+    normAssert(m_out_mat_ov, m_out_mat_gapi, "Test output");
+}
+
+TEST_P(TestMeanScaleOV, Scale)
+{
+    int input_depth = GetParam();
+
+    std::vector<float> scale_values{ 2., 2., 2. };
+
+    // Run OV reference pipeline:
+    {
+        addPreprocToOV([&](ov::preprocess::PrePostProcessor& ppp) {
+            if (input_depth == CV_8U || input_depth == CV_32S) {
+                ppp.input().preprocess().convert_element_type(ov::element::f32);
+            }
+            ppp.input().preprocess().scale(scale_values);
+            });
+        runOV();
+    }
+
+    // Run G-API
+    GComputation comp = create();
+    auto pp = params(m_model_path, m_weights_path, m_device_id);
+    pp.cfgScale(scale_values);
+
+    comp.apply(cv::gin(m_in_mat), cv::gout(m_out_mat_gapi),
+        cv::compile_args(cv::gapi::networks(pp)));
+
+    // Validate OV results against G-API ones:
+    normAssert(m_out_mat_ov, m_out_mat_gapi, "Test output");
+}
+
+TEST_P(TestMeanScaleOV, MeanAndScale)
+{
+    int input_depth = GetParam();
+
+    std::vector<float> mean_values{ 220.1779, 218.9857, 217.8986 };
+    std::vector<float> scale_values{ 2., 2., 2. };
+
+    // Run OV reference pipeline:
+    {
+        addPreprocToOV([&](ov::preprocess::PrePostProcessor& ppp) {
+            if (input_depth == CV_8U || input_depth == CV_32S) {
+                ppp.input().preprocess().convert_element_type(ov::element::f32);
+            }
+            ppp.input().preprocess().mean(mean_values);
+            ppp.input().preprocess().scale(scale_values);
+            });
+        runOV();
+    }
+
+    // Run G-API
+    GComputation comp = create();
+    auto pp = params(m_model_path, m_weights_path, m_device_id);
+    pp.cfgMean(mean_values);
+    pp.cfgScale(scale_values);
+
+    comp.apply(cv::gin(m_in_mat), cv::gout(m_out_mat_gapi),
+        cv::compile_args(cv::gapi::networks(pp)));
+
+    // Validate OV results against G-API ones:
+    normAssert(m_out_mat_ov, m_out_mat_gapi, "Test output");
+}
+
+INSTANTIATE_TEST_CASE_P(Instantiation, TestMeanScaleOV,
+                        Values(CV_8U, CV_32S, CV_16F, CV_32F));
+
 } // namespace opencv_test
 
 #endif // HAVE_INF_ENGINE && INF_ENGINE_RELEASE >= 2022010000
diff --git a/modules/gapi/test/internal/gapi_int_executor_tests.cpp b/modules/gapi/test/internal/gapi_int_executor_tests.cpp
index 79117aebf3c1..9bed7b505829 100644
--- a/modules/gapi/test/internal/gapi_int_executor_tests.cpp
+++ b/modules/gapi/test/internal/gapi_int_executor_tests.cpp
@@ -55,7 +55,7 @@ class GMockExecutable final: public cv::gimpl::GIslandExecutable
     GMockExecutable(bool can_reshape = true)
         : m_priv(new Priv{can_reshape, 0, 0})
     {
-    };
+    }
 
     void setReshape(bool can_reshape) { m_priv->m_can_reshape = can_reshape; }
 
@@ -92,7 +92,7 @@ class GMockBackendImpl final: public cv::gapi::GBackend::Priv
     }
 
 public:
-    GMockBackendImpl(const GMockExecutable& exec) : m_exec(exec) { };
+    GMockBackendImpl(const GMockExecutable& exec) : m_exec(exec) { }
     int getCompileCounter() const { return m_compile_counter; }
 };
 
@@ -124,8 +124,8 @@ GMockFunctor mock_kernel(const cv::gapi::GBackend& backend, Callable c)
                        };
 }
 
-void dummyFooImpl(const cv::Mat&, cv::Mat&)                 { };
-void dummyBarImpl(const cv::Mat&, const cv::Mat&, cv::Mat&) { };
+void dummyFooImpl(const cv::Mat&, cv::Mat&)                 { }
+void dummyBarImpl(const cv::Mat&, const cv::Mat&, cv::Mat&) { }
 
 struct GExecutorReshapeTest: public ::testing::Test
 {
@@ -155,7 +155,7 @@ struct GExecutorReshapeTest: public ::testing::Test
     std::shared_ptr<GMockBackendImpl> backend_impl2;
     cv::gapi::GBackend                backend2;
     cv::GKernelPackage                pkg;
-    cv::Mat                           in_mat1, in_mat2, out_mat;;
+    cv::Mat                           in_mat1, in_mat2, out_mat;
 };
 
 } // anonymous namespace
diff --git a/modules/gapi/test/internal/gapi_int_island_tests.cpp b/modules/gapi/test/internal/gapi_int_island_tests.cpp
index 7da1670ecc18..dbc6ad12f49a 100644
--- a/modules/gapi/test/internal/gapi_int_island_tests.cpp
+++ b/modules/gapi/test/internal/gapi_int_island_tests.cpp
@@ -627,7 +627,7 @@ namespace
         void assignIsland(const std::string &s)
         {
             cv::gapi::island(s, cv::GIn(tmp[0]), cv::GOut(tmp[2]));
-        };
+        }
     };
     TEST_P(CheckName, Test)
     {
diff --git a/modules/gapi/test/own/thread_pool_tests.cpp b/modules/gapi/test/own/thread_pool_tests.cpp
new file mode 100644
index 000000000000..ce92c7eab6ca
--- /dev/null
+++ b/modules/gapi/test/own/thread_pool_tests.cpp
@@ -0,0 +1,124 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2024 Intel Corporation
+
+#include "../test_precomp.hpp"
+
+#include <chrono>
+#include <thread>
+
+#include "executor/thread_pool.hpp"
+
+namespace opencv_test
+{
+
+using namespace cv::gapi;
+
+TEST(ThreadPool, ScheduleNotBlock)
+{
+    own::Latch latch(1u);
+    std::atomic<uint32_t> counter{0u};
+
+    own::ThreadPool tp(4u);
+    tp.schedule([&](){
+        std::this_thread::sleep_for(std::chrono::milliseconds{500u});
+        counter++;
+        latch.count_down();
+    });
+
+    EXPECT_EQ(0u, counter);
+    latch.wait();
+    EXPECT_EQ(1u, counter);
+}
+
+TEST(ThreadPool, MultipleTasks)
+{
+    const uint32_t kNumTasks = 100u;
+    own::Latch latch(kNumTasks);
+    std::atomic<uint32_t> completed{0u};
+
+    own::ThreadPool tp(4u);
+    for (uint32_t i = 0; i < kNumTasks; ++i) {
+        tp.schedule([&]() {
+            ++completed;
+            latch.count_down();
+        });
+    }
+    latch.wait();
+
+    EXPECT_EQ(kNumTasks, completed.load());
+}
+
+struct ExecutionState {
+    ExecutionState(const uint32_t num_threads,
+                   const uint32_t num_tasks)
+        : guard(0u),
+          critical(0u),
+          limit(num_tasks),
+          latch(num_threads),
+          tp(num_threads) {
+    }
+
+    std::atomic<uint32_t> guard;
+    std::atomic<uint32_t> critical;
+    const uint32_t        limit;
+    own::Latch            latch;
+    own::ThreadPool       tp;
+};
+
+static void doRecursive(ExecutionState& state) {
+    // NB: Protects function to be executed no more than limit number of times
+    if (state.guard.fetch_add(1u) >= state.limit) {
+        state.latch.count_down();
+        return;
+    }
+    // NB: This simulates critical section
+    std::this_thread::sleep_for(std::chrono::milliseconds{50});
+    ++state.critical;
+    // NB: Schedule the new one recursively
+    state.tp.schedule([&](){ doRecursive(state); });
+}
+
+TEST(ThreadPool, ScheduleRecursively)
+{
+    const int kNumThreads = 5u;
+    const uint32_t kNumTasks = 100u;
+
+    ExecutionState state(kNumThreads, kNumTasks);
+    for (uint32_t i = 0; i < kNumThreads; ++i) {
+        state.tp.schedule([&](){
+            doRecursive(state);
+        });
+    }
+    state.latch.wait();
+
+    EXPECT_EQ(kNumTasks, state.critical.load());
+}
+
+TEST(ThreadPool, ExecutionIsParallel)
+{
+    const uint32_t kNumThreads = 4u;
+    std::atomic<uint32_t> counter{0};
+    own::Latch latch{kNumThreads};
+
+    own::ThreadPool tp(kNumThreads);
+    auto start = std::chrono::high_resolution_clock::now();
+    for (uint32_t i = 0; i < kNumThreads; ++i) {
+      tp.schedule([&]() {
+        std::this_thread::sleep_for(std::chrono::milliseconds{800u});
+        ++counter;
+        latch.count_down();
+      });
+    }
+    latch.wait();
+
+    auto end = std::chrono::high_resolution_clock::now();
+    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+
+    EXPECT_GE(1000u, elapsed);
+    EXPECT_EQ(kNumThreads, counter.load());
+}
+
+} // namespace opencv_test
diff --git a/modules/gapi/test/streaming/gapi_streaming_queue_source_tests.cpp b/modules/gapi/test/streaming/gapi_streaming_queue_source_tests.cpp
new file mode 100644
index 000000000000..093e65471529
--- /dev/null
+++ b/modules/gapi/test/streaming/gapi_streaming_queue_source_tests.cpp
@@ -0,0 +1,127 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+
+#include "../test_precomp.hpp"
+
+#include <opencv2/gapi/gstreaming.hpp>
+#include <opencv2/gapi/streaming/queue_source.hpp>
+#include <opencv2/gapi/streaming/cap.hpp>
+
+namespace opencv_test
+{
+
+TEST(GAPI_Streaming_Queue_Source, SmokeTest) {
+    // This is more like an example on G-API Queue Source
+
+    cv::GMat in;
+    cv::GMat out = in + 1;
+    cv::GStreamingCompiled comp = cv::GComputation(in, out).compileStreaming();
+
+    // Queue source needs to know format information to maintain contracts
+    auto src = std::make_shared<cv::gapi::wip::QueueSource<cv::Mat> >
+        (cv::GMatDesc{CV_8U, 1, cv::Size{128, 128}});
+
+    comp.setSource(cv::gin(src->ptr()));
+    comp.start();
+
+    // It is perfectly legal to start a pipeline at this point - the source was passed.
+    // Now we can push data through the source and get the pipeline results.
+
+    cv::Mat eye = cv::Mat::eye(cv::Size{128, 128}, CV_8UC1);
+    src->push(eye);    // Push I (identity matrix)
+    src->push(eye*2);  // Push I*2
+
+    // Now its time to pop. The data could be already processed at this point.
+    // Note the queue source queues are unbounded to avoid deadlocks
+
+    cv::Mat result;
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(eye + 1, result, NORM_INF));
+
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(eye*2 + 1, result, NORM_INF));
+}
+
+TEST(GAPI_Streaming_Queue_Source, Mixed) {
+    // Mixing a regular "live" source (which runs on its own) with a
+    // manually controlled queue source may make a little sense, but
+    // is perfectly legal and possible.
+
+    cv::GMat in1;
+    cv::GMat in2;
+    cv::GMat out = in2 - in1;
+    cv::GStreamingCompiled comp = cv::GComputation(in1, in2, out).compileStreaming();
+
+    // Queue source needs to know format information to maintain contracts
+    auto src1 = std::make_shared<cv::gapi::wip::QueueSource<cv::Mat> >
+        (cv::GMatDesc{CV_8U, 3, cv::Size{768, 576}});
+
+    std::shared_ptr<cv::gapi::wip::IStreamSource> src2;
+    auto path = findDataFile("cv/video/768x576.avi");
+    try {
+        src2 = cv::gapi::wip::make_src<cv::gapi::wip::GCaptureSource>(path);
+    } catch(...) {
+        throw SkipTestException("Video file can not be opened");
+    }
+
+    comp.setSource(cv::gin(src1->ptr(), src2)); // FIXME: quite inconsistent
+    comp.start();
+
+    cv::Mat eye = cv::Mat::eye(cv::Size{768, 576}, CV_8UC3);
+    src1->push(eye);    // Push I (identity matrix)
+    src1->push(eye);    // Push I (again)
+
+    cv::Mat ref, result;
+    cv::VideoCapture cap(path);
+
+    cap >> ref;
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(ref - eye, result, NORM_INF));
+
+    cap >> ref;
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(ref - eye, result, NORM_INF));
+}
+
+TEST(GAPI_Streaming_Queue_Input, SmokeTest) {
+
+    // Queue Input: a tiny wrapper atop of multiple queue sources.
+    // Allows users to pass all input data at once.
+
+    cv::GMat in1;
+    cv::GScalar in2;
+    cv::GMat out = in1 + in2;
+    cv::GStreamingCompiled comp = cv::GComputation(cv::GIn(in1, in2), cv::GOut(out))
+        .compileStreaming();
+
+    // FIXME: This API is too raw
+    cv::gapi::wip::QueueInput input({
+            cv::GMetaArg{ cv::GMatDesc{CV_8U, 1, cv::Size{64,64} } },
+            cv::GMetaArg{ cv::empty_scalar_desc() }
+        });
+    comp.setSource(input); // Implicit conversion allows it to be passed as-is.
+    comp.start();
+
+    // Push data via queue input
+    cv::Mat eye = cv::Mat::eye(cv::Size{64, 64}, CV_8UC1);
+    input.push(cv::gin(eye, cv::Scalar(1)));
+    input.push(cv::gin(eye, cv::Scalar(2)));
+    input.push(cv::gin(eye, cv::Scalar(3)));
+
+    // Pop data and validate
+    cv::Mat result;
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(eye+1, result, NORM_INF));
+
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(eye+2, result, NORM_INF));
+
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(eye+3, result, NORM_INF));
+}
+
+} // namespace opencv_test
diff --git a/modules/gapi/test/streaming/gapi_streaming_tests.cpp b/modules/gapi/test/streaming/gapi_streaming_tests.cpp
index bdb0ae9cd93f..e50e11d5c88d 100644
--- a/modules/gapi/test/streaming/gapi_streaming_tests.cpp
+++ b/modules/gapi/test/streaming/gapi_streaming_tests.cpp
@@ -716,14 +716,16 @@ TEST_P(GAPI_Streaming, SmokeTest_AutoMeta_VideoScalar)
     EXPECT_EQ(165u, test_frames);
 }
 
+// Instantiate tests with different backends, but default queue capacity
 INSTANTIATE_TEST_CASE_P(TestStreaming, GAPI_Streaming,
-                        Combine(Values(  KernelPackage::OCV
-                                    //, KernelPackage::OCL // FIXME: Fails bit-exactness check, maybe relax it?
-                                      , KernelPackage::OCV_FLUID
-                                    //, KernelPackage::OCL // FIXME: Fails bit-exactness check, maybe relax it?
-                                ),
-                                Values(cv::optional<size_t>{}, 1u, 4u))
-                        );
+                        Combine(Values( KernelPackage::OCV
+                                      , KernelPackage::OCV_FLUID),
+                                Values(cv::optional<size_t>{})));
+
+// Instantiate tests with the same backend but various queue capacity
+INSTANTIATE_TEST_CASE_P(TestStreaming_QC, GAPI_Streaming,
+                        Combine(Values(KernelPackage::OCV_FLUID),
+                                Values(1u, 4u)));
 
 namespace TypesTest
 {
diff --git a/modules/gapi/test/streaming/gapi_streaming_utils_test.cpp b/modules/gapi/test/streaming/gapi_streaming_utils_test.cpp
index 5599b8826f8c..764e2c3a51a3 100644
--- a/modules/gapi/test/streaming/gapi_streaming_utils_test.cpp
+++ b/modules/gapi/test/streaming/gapi_streaming_utils_test.cpp
@@ -245,6 +245,7 @@ TEST(OneVPL_ElasticBarrier, single_thread_visit)
 
 TEST(OneVPL_ElasticBarrier, multi_thread_visit)
 {
+    applyTestTag(CV_TEST_TAG_VERYLONG);
     TestBarrier tested_barrier;
 
     static const size_t max_visit_count = 10000000;
diff --git a/modules/highgui/cmake/detect_wayland.cmake b/modules/highgui/cmake/detect_wayland.cmake
index 24e70d1b85b7..7fcb5bd7a997 100644
--- a/modules/highgui/cmake/detect_wayland.cmake
+++ b/modules/highgui/cmake/detect_wayland.cmake
@@ -9,7 +9,7 @@ macro(ocv_wayland_generate protocol_file output_file)
     list(APPEND WAYLAND_PROTOCOL_SOURCES ${output_file}.h ${output_file}.c)
 endmacro()
 
-ocv_clear_vars(HAVE_WAYLAND_CLIENT HAVE_WAYLAND_CURSOR HAVE_XKBCOMMON HAVE_WAYLAND_PROTOCOLS)
+ocv_clear_vars(HAVE_WAYLAND_CLIENT HAVE_WAYLAND_CURSOR HAVE_XKBCOMMON HAVE_WAYLAND_PROTOCOLS HAVE_WAYLAND_EGL)
 if(WITH_WAYLAND)
     ocv_check_modules(WAYLAND_CLIENT wayland-client)
     if(WAYLAND_CLIENT_FOUND)
@@ -32,4 +32,10 @@ if(WITH_WAYLAND)
     if(HAVE_WAYLAND_CLIENT AND HAVE_WAYLAND_CURSOR AND HAVE_XKBCOMMON AND HAVE_WAYLAND_PROTOCOLS)
         set(HAVE_WAYLAND TRUE)
     endif()
+
+    # WAYLAND_EGL is option
+    ocv_check_modules(WAYLAND_EGL wayland-egl)
+    if(WAYLAND_EGL_FOUND)
+        set(HAVE_WAYLAND_EGL ON)
+    endif()
 endif()
diff --git a/modules/highgui/doc/highgui_qt.cpp b/modules/highgui/doc/highgui_qt.cpp
new file mode 100644
index 000000000000..a3a1f938b42c
--- /dev/null
+++ b/modules/highgui/doc/highgui_qt.cpp
@@ -0,0 +1,39 @@
+#include "opencv2/highgui.hpp"
+
+int main(int argc, char *argv[])
+{
+    int value = 50;
+    int value2 = 0;
+
+    namedWindow("main1",WINDOW_NORMAL);
+    namedWindow("main2",WINDOW_AUTOSIZE | WINDOW_GUI_NORMAL);
+    createTrackbar( "track1", "main1", &value, 255,  NULL);
+
+    String nameb1 = "button1";
+    String nameb2 = "button2";
+
+    createButton(nameb1,callbackButton,&nameb1,QT_CHECKBOX,1);
+    createButton(nameb2,callbackButton,NULL,QT_CHECKBOX,0);
+    createTrackbar( "track2", NULL, &value2, 255, NULL);
+    createButton("button5",callbackButton1,NULL,QT_RADIOBOX,0);
+    createButton("button6",callbackButton2,NULL,QT_RADIOBOX,1);
+
+    setMouseCallback( "main2",on_mouse,NULL );
+
+    Mat img1 = imread("files/flower.jpg");
+    VideoCapture video;
+    video.open("files/hockey.avi");
+
+    Mat img2,img3;
+    while( waitKey(33) != 27 )
+    {
+        img1.convertTo(img2,-1,1,value);
+        video >> img3;
+
+        imshow("main1",img2);
+        imshow("main2",img3);
+    }
+
+    destroyAllWindows();
+    return 0;
+}
diff --git a/modules/highgui/include/opencv2/highgui.hpp b/modules/highgui/include/opencv2/highgui.hpp
index 71c0cf6e85bb..35b64bceae90 100644
--- a/modules/highgui/include/opencv2/highgui.hpp
+++ b/modules/highgui/include/opencv2/highgui.hpp
@@ -85,50 +85,8 @@ It provides easy interface to:
         created. Then, a new button is attached to it.
 
     See below the example used to generate the figure:
-    @code
-        int main(int argc, char *argv[])
-        {
-
-            int value = 50;
-            int value2 = 0;
-
-
-            namedWindow("main1",WINDOW_NORMAL);
-            namedWindow("main2",WINDOW_AUTOSIZE | WINDOW_GUI_NORMAL);
-            createTrackbar( "track1", "main1", &value, 255,  NULL);
-
-            String nameb1 = "button1";
-            String nameb2 = "button2";
-
-            createButton(nameb1,callbackButton,&nameb1,QT_CHECKBOX,1);
-            createButton(nameb2,callbackButton,NULL,QT_CHECKBOX,0);
-            createTrackbar( "track2", NULL, &value2, 255, NULL);
-            createButton("button5",callbackButton1,NULL,QT_RADIOBOX,0);
-            createButton("button6",callbackButton2,NULL,QT_RADIOBOX,1);
-
-            setMouseCallback( "main2",on_mouse,NULL );
-
-            Mat img1 = imread("files/flower.jpg");
-            VideoCapture video;
-            video.open("files/hockey.avi");
-
-            Mat img2,img3;
-
-            while( waitKey(33) != 27 )
-            {
-                img1.convertTo(img2,-1,1,value);
-                video >> img3;
-
-                imshow("main1",img2);
-                imshow("main2",img3);
-            }
-
-            destroyAllWindows();
-
-            return 0;
-        }
-    @endcode
 
+    @include highgui_qt.cpp
 
     @defgroup highgui_winrt WinRT support
 
@@ -139,36 +97,34 @@ It provides easy interface to:
 
     See below the example used to generate the figure:
     @code
-        void sample_app::MainPage::ShowWindow()
+    void sample_app::MainPage::ShowWindow()
+    {
+        static cv::String windowName("sample");
+        cv::winrt_initContainer(this->cvContainer);
+        cv::namedWindow(windowName); // not required
+
+        cv::Mat image = cv::imread("Assets/sample.jpg");
+        cv::Mat converted = cv::Mat(image.rows, image.cols, CV_8UC4);
+        cv::cvtColor(image, converted, COLOR_BGR2BGRA);
+        cv::imshow(windowName, converted); // this will create window if it hasn't been created before
+
+        int state = 42;
+        cv::TrackbarCallback callback = [](int pos, void* userdata)
         {
-            static cv::String windowName("sample");
-            cv::winrt_initContainer(this->cvContainer);
-            cv::namedWindow(windowName); // not required
-
-            cv::Mat image = cv::imread("Assets/sample.jpg");
-            cv::Mat converted = cv::Mat(image.rows, image.cols, CV_8UC4);
-            cv::cvtColor(image, converted, COLOR_BGR2BGRA);
-            cv::imshow(windowName, converted); // this will create window if it hasn't been created before
-
-            int state = 42;
-            cv::TrackbarCallback callback = [](int pos, void* userdata)
-            {
-                if (pos == 0) {
-                    cv::destroyWindow(windowName);
-                }
-            };
-            cv::TrackbarCallback callbackTwin = [](int pos, void* userdata)
-            {
-                if (pos >= 70) {
-                    cv::destroyAllWindows();
-                }
-            };
-            cv::createTrackbar("Sample trackbar", windowName, &state, 100, callback);
-            cv::createTrackbar("Twin brother", windowName, &state, 100, callbackTwin);
-        }
+            if (pos == 0) {
+                cv::destroyWindow(windowName);
+            }
+        };
+        cv::TrackbarCallback callbackTwin = [](int pos, void* userdata)
+        {
+            if (pos >= 70) {
+                cv::destroyAllWindows();
+            }
+        };
+        cv::createTrackbar("Sample trackbar", windowName, &state, 100, callback);
+        cv::createTrackbar("Twin brother", windowName, &state, 100, callbackTwin);
+    }
     @endcode
-
-    @defgroup highgui_c C API
 @}
 */
 
@@ -300,9 +256,7 @@ You can call cv::destroyWindow or cv::destroyAllWindows to close the window and
 memory usage. For a simple program, you do not really have to call these functions because all the
 resources and windows of the application are closed automatically by the operating system upon exit.
 
-@note
-
-Qt backend supports additional flags:
+@note Qt backend supports additional flags:
  -   **WINDOW_NORMAL or WINDOW_AUTOSIZE:** WINDOW_NORMAL enables you to resize the
      window, whereas WINDOW_AUTOSIZE adjusts automatically the window size to fit the
      displayed image (see imshow ), and you cannot change the window size manually.
@@ -331,13 +285,20 @@ The function destroyAllWindows destroys all of the opened HighGUI windows.
  */
 CV_EXPORTS_W void destroyAllWindows();
 
+
+/** @brief HighGUI backend used.
+
+The function returns HighGUI backend name used: could be COCOA, GTK2/3, QT, WAYLAND or WIN32.
+Returns empty string if there is no available UI backend.
+ */
+CV_EXPORTS_W const std::string currentUIFramework();
+
+
 CV_EXPORTS_W int startWindowThread();
 
 /** @brief Similar to #waitKey, but returns full key code.
 
-@note
-
-Key code is implementation specific and depends on used backend: QT/GTK/Win32/etc
+@note Key code is implementation specific and depends on used backend: QT/GTK/Win32/etc
 
 */
 CV_EXPORTS_W int waitKeyEx(int delay = 0);
@@ -404,11 +365,12 @@ For example, **waitKey(0)** will display the window infinitely until any keypres
 for image display). **waitKey(25)** will display a frame and wait approximately 25 ms for a key
 press (suitable for displaying a video frame-by-frame). To remove the window, use cv::destroyWindow.
 
-@note
-
-[__Windows Backend Only__] Pressing Ctrl+C will copy the image to the clipboard.
-
-[__Windows Backend Only__] Pressing Ctrl+S will show a dialog to save the image.
+@note [__Windows Backend Only__] Pressing Ctrl+C will copy the image to the clipboard. Pressing Ctrl+S will show a dialog to save the image.
+@note [__Wayland Backend Only__] Supoorting format is extended.
+-   If the image is 8-bit signed, the pixels are biased by 128. That is, the
+    value range [-128,127] is mapped to [0,255].
+-   If the image is 16-bit signed, the pixels are divided by 256 and biased by 128. That is, the
+    value range [-32768,32767] is mapped to [0,255].
 
 @param winname Name of the window.
 @param mat Image to be shown.
@@ -417,10 +379,8 @@ CV_EXPORTS_W void imshow(const String& winname, InputArray mat);
 
 /** @brief Resizes the window to the specified size
 
-@note
-
--   The specified window size is for the image area. Toolbars are not counted.
--   Only windows created without cv::WINDOW_AUTOSIZE flag can be resized.
+@note The specified window size is for the image area. Toolbars are not counted.
+Only windows created without cv::WINDOW_AUTOSIZE flag can be resized.
 
 @param winname Window name.
 @param width The new window width.
@@ -439,6 +399,8 @@ CV_EXPORTS_W void resizeWindow(const String& winname, const cv::Size& size);
 @param winname Name of the window.
 @param x The new x-coordinate of the window.
 @param y The new y-coordinate of the window.
+
+@note [__Wayland Backend Only__] This function is not supported by the Wayland protocol limitation.
  */
 CV_EXPORTS_W void moveWindow(const String& winname, int x, int y);
 
@@ -449,6 +411,8 @@ The function setWindowProperty enables changing properties of a window.
 @param winname Name of the window.
 @param prop_id Window property to edit. The supported operation flags are: (cv::WindowPropertyFlags)
 @param prop_value New value of the window property. The supported flags are: (cv::WindowFlags)
+
+@note [__Wayland Backend Only__] This function is not supported.
  */
 CV_EXPORTS_W void setWindowProperty(const String& winname, int prop_id, double prop_value);
 
@@ -466,6 +430,8 @@ The function getWindowProperty returns properties of a window.
 @param prop_id Window property to retrieve. The following operation flags are available: (cv::WindowPropertyFlags)
 
 @sa setWindowProperty
+
+@note [__Wayland Backend Only__] This function is not supported.
  */
 CV_EXPORTS_W double getWindowProperty(const String& winname, int prop_id);
 
@@ -476,6 +442,8 @@ The function getWindowImageRect returns the client screen coordinates, width and
 @param winname Name of the window.
 
 @sa resizeWindow moveWindow
+
+@note [__Wayland Backend Only__] This function is not supported by the Wayland protocol limitation.
  */
 CV_EXPORTS_W Rect getWindowImageRect(const String& winname);
 
@@ -502,9 +470,7 @@ For cv::EVENT_MOUSEWHEEL positive and negative values mean forward and backward
 respectively. For cv::EVENT_MOUSEHWHEEL, where available, positive and negative values mean right and
 left scrolling, respectively.
 
-@note
-
-Mouse-wheel events are currently supported only on Windows and Cocoa
+@note Mouse-wheel events are currently supported only on Windows and Cocoa.
 
 @param flags The mouse callback flags parameter.
  */
@@ -559,9 +525,7 @@ and range, assigns a variable value to be a position synchronized with the track
 the callback function onChange to be called on the trackbar position change. The created trackbar is
 displayed in the specified window winname.
 
-@note
-
-[__Qt Backend Only__] winname can be empty if the trackbar should be attached to the
+@note [__Qt Backend Only__] winname can be empty if the trackbar should be attached to the
 control panel.
 
 Clicking the label of each trackbar enables editing the trackbar values manually.
@@ -587,9 +551,7 @@ CV_EXPORTS int createTrackbar(const String& trackbarname, const String& winname,
 
 The function returns the current position of the specified trackbar.
 
-@note
-
-[__Qt Backend Only__] winname can be empty if the trackbar is attached to the control
+@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control
 panel.
 
 @param trackbarname Name of the trackbar.
@@ -601,9 +563,7 @@ CV_EXPORTS_W int getTrackbarPos(const String& trackbarname, const String& winnam
 
 The function sets the position of the specified trackbar in the specified window.
 
-@note
-
-[__Qt Backend Only__] winname can be empty if the trackbar is attached to the control
+@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control
 panel.
 
 @param trackbarname Name of the trackbar.
@@ -616,9 +576,7 @@ CV_EXPORTS_W void setTrackbarPos(const String& trackbarname, const String& winna
 
 The function sets the maximum position of the specified trackbar in the specified window.
 
-@note
-
-[__Qt Backend Only__] winname can be empty if the trackbar is attached to the control
+@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control
 panel.
 
 @param trackbarname Name of the trackbar.
@@ -631,9 +589,7 @@ CV_EXPORTS_W void setTrackbarMax(const String& trackbarname, const String& winna
 
 The function sets the minimum position of the specified trackbar in the specified window.
 
-@note
-
-[__Qt Backend Only__] winname can be empty if the trackbar is attached to the control
+@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control
 panel.
 
 @param trackbarname Name of the trackbar.
diff --git a/modules/highgui/misc/java/src/java/highgui+HighGui.java b/modules/highgui/misc/java/src/java/highgui+HighGui.java
index 87a0ec127acd..7a2509516665 100644
--- a/modules/highgui/misc/java/src/java/highgui+HighGui.java
+++ b/modules/highgui/misc/java/src/java/highgui+HighGui.java
@@ -62,14 +62,9 @@ public static Image toBufferedImage(Mat m) {
         if (m.channels() > 1) {
             type = BufferedImage.TYPE_3BYTE_BGR;
         }
-
-        int bufferSize = m.channels() * m.cols() * m.rows();
-        byte[] b = new byte[bufferSize];
-        m.get(0, 0, b); // get all the pixels
         BufferedImage image = new BufferedImage(m.cols(), m.rows(), type);
-
         final byte[] targetPixels = ((DataBufferByte) image.getRaster().getDataBuffer()).getData();
-        System.arraycopy(b, 0, targetPixels, 0, b.length);
+        m.get(0, 0, targetPixels);
 
         return image;
     }
diff --git a/modules/highgui/src/backend.hpp b/modules/highgui/src/backend.hpp
index 7c32846ce4a3..93d51da11966 100644
--- a/modules/highgui/src/backend.hpp
+++ b/modules/highgui/src/backend.hpp
@@ -106,6 +106,7 @@ class CV_EXPORTS UIBackend
 
     virtual int waitKeyEx(int delay /*= 0*/) = 0;
     virtual int pollKey() = 0;
+    virtual const std::string getName() const = 0;
 };
 
 std::shared_ptr<UIBackend>& getCurrentUIBackend();
diff --git a/modules/highgui/src/precomp.hpp b/modules/highgui/src/precomp.hpp
index 9f0ea59b6044..2bbfd6c14a0d 100644
--- a/modules/highgui/src/precomp.hpp
+++ b/modules/highgui/src/precomp.hpp
@@ -98,6 +98,7 @@ void cvSetModeWindow_WinRT(const char* name, double prop_value);
 CvRect cvGetWindowRect_W32(const char* name);
 CvRect cvGetWindowRect_GTK(const char* name);
 CvRect cvGetWindowRect_COCOA(const char* name);
+CvRect cvGetWindowRect_WAYLAND(const char* name);
 
 double cvGetModeWindow_W32(const char* name);
 double cvGetModeWindow_GTK(const char* name);
diff --git a/modules/highgui/src/registry.impl.hpp b/modules/highgui/src/registry.impl.hpp
index 66693f1b07e0..23f4e9f4e1a0 100644
--- a/modules/highgui/src/registry.impl.hpp
+++ b/modules/highgui/src/registry.impl.hpp
@@ -61,7 +61,7 @@ std::vector<BackendInfo>& getBuiltinBackendsInfo()
 #endif
     };
     return g_backends;
-};
+}
 
 static
 bool sortByPriority(const BackendInfo &lhs, const BackendInfo &rhs)
diff --git a/modules/highgui/src/roiSelector.cpp b/modules/highgui/src/roiSelector.cpp
index 1bbd246c0584..56881a97f480 100644
--- a/modules/highgui/src/roiSelector.cpp
+++ b/modules/highgui/src/roiSelector.cpp
@@ -118,7 +118,7 @@ class ROISelector
         bool drawFromCenter;
 
         // initializer list
-        handlerT() : isDrawing(false), drawFromCenter(true){};
+        handlerT() : isDrawing(false), drawFromCenter(true){}
     } selectorParams;
 
   private:
diff --git a/modules/highgui/src/window.cpp b/modules/highgui/src/window.cpp
index 2e528fe8e583..e6972b300b02 100644
--- a/modules/highgui/src/window.cpp
+++ b/modules/highgui/src/window.cpp
@@ -217,9 +217,9 @@ CV_IMPL void cvSetWindowProperty(const char* name, int prop_id, double prop_valu
     switch(prop_id)
     {
     //change between fullscreen or not.
-    case CV_WND_PROP_FULLSCREEN:
+    case cv::WND_PROP_FULLSCREEN:
 
-        if (prop_value != CV_WINDOW_NORMAL && prop_value != CV_WINDOW_FULLSCREEN)  // bad argument
+        if (prop_value != cv::WINDOW_NORMAL && prop_value != cv::WINDOW_FULLSCREEN)  // bad argument
             break;
 
         #if defined (HAVE_QT)
@@ -236,13 +236,13 @@ CV_IMPL void cvSetWindowProperty(const char* name, int prop_id, double prop_valu
 
     break;
 
-    case CV_WND_PROP_AUTOSIZE:
+    case cv::WND_PROP_AUTOSIZE:
         #if defined (HAVE_QT)
             cvSetPropWindow_QT(name,prop_value);
         #endif
     break;
 
-    case CV_WND_PROP_ASPECTRATIO:
+    case cv::WND_PROP_ASPECT_RATIO:
         #if defined (HAVE_QT)
             cvSetRatioWindow_QT(name,prop_value);
         #endif
@@ -305,7 +305,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
 #else
     switch(prop_id)
     {
-    case CV_WND_PROP_FULLSCREEN:
+    case cv::WND_PROP_FULLSCREEN:
 
         #if defined (HAVE_QT)
             return cvGetModeWindow_QT(name);
@@ -322,7 +322,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
         #endif
     break;
 
-    case CV_WND_PROP_AUTOSIZE:
+    case cv::WND_PROP_AUTOSIZE:
 
         #if defined (HAVE_QT)
             return cvGetPropWindow_QT(name);
@@ -335,7 +335,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
         #endif
     break;
 
-    case CV_WND_PROP_ASPECTRATIO:
+    case cv::WND_PROP_ASPECT_RATIO:
 
         #if defined (HAVE_QT)
             return cvGetRatioWindow_QT(name);
@@ -348,7 +348,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
         #endif
     break;
 
-    case CV_WND_PROP_OPENGL:
+    case cv::WND_PROP_OPENGL:
 
         #if defined (HAVE_QT)
             return cvGetOpenGlProp_QT(name);
@@ -361,7 +361,7 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
         #endif
     break;
 
-    case CV_WND_PROP_VISIBLE:
+    case cv::WND_PROP_VISIBLE:
         #if defined (HAVE_QT)
             return cvGetPropVisible_QT(name);
         #elif defined(HAVE_WIN32UI)
@@ -436,6 +436,8 @@ cv::Rect cv::getWindowImageRect(const String& winname)
         return cvGetWindowRect_GTK(winname.c_str());
     #elif defined (HAVE_COCOA)
         return cvGetWindowRect_COCOA(winname.c_str());
+    #elif defined (HAVE_WAYLAND)
+        return cvGetWindowRect_WAYLAND(winname.c_str());
     #else
         return Rect(-1, -1, -1, -1);
     #endif
@@ -1091,23 +1093,50 @@ void cv::imshow(const String& winname, const ogl::Texture2D& _tex)
 #endif
 }
 
+const std::string cv::currentUIFramework()
+{
+    CV_TRACE_FUNCTION();
+
+    // plugin and backend-compatible implementations
+    auto backend = getCurrentUIBackend();
+    if (backend)
+    {
+        return backend->getName();
+    }
+
+    // builtin backends
+#if defined(HAVE_WIN32UI)
+    CV_Assert(false); // backend-compatible
+#elif defined (HAVE_GTK)
+    CV_Assert(false); // backend-compatible
+#elif defined (HAVE_QT)
+    return std::string("QT");
+#elif defined (HAVE_COCOA)
+    return std::string("COCOA");
+#elif defined (HAVE_WAYLAND)
+    return std::string("WAYLAND");
+#else
+    return std::string();
+#endif
+}
+
 // Without OpenGL
 
 #ifndef HAVE_OPENGL
 
 CV_IMPL void cvSetOpenGlDrawCallback(const char*, CvOpenGlDrawCallback, void*)
 {
-    CV_Error(CV_OpenGlNotSupported, "The library is compiled without OpenGL support");
+    CV_Error(cv::Error::OpenGlNotSupported, "The library is compiled without OpenGL support");
 }
 
 CV_IMPL void cvSetOpenGlContext(const char*)
 {
-    CV_Error(CV_OpenGlNotSupported, "The library is compiled without OpenGL support");
+    CV_Error(cv::Error::OpenGlNotSupported, "The library is compiled without OpenGL support");
 }
 
 CV_IMPL void cvUpdateWindow(const char*)
 {
-    CV_Error(CV_OpenGlNotSupported, "The library is compiled without OpenGL support");
+    CV_Error(cv::Error::OpenGlNotSupported, "The library is compiled without OpenGL support");
 }
 
 #endif // !HAVE_OPENGL
@@ -1176,52 +1205,52 @@ static const char* NO_QT_ERR_MSG = "The library is compiled without QT support";
 
 cv::QtFont cv::fontQt(const String&, int, Scalar, int,  int, int)
 {
-    CV_Error(CV_StsNotImplemented, NO_QT_ERR_MSG);
+    CV_Error(cv::Error::StsNotImplemented, NO_QT_ERR_MSG);
 }
 
 void cv::addText( const Mat&, const String&, Point, const QtFont&)
 {
-    CV_Error(CV_StsNotImplemented, NO_QT_ERR_MSG);
+    CV_Error(cv::Error::StsNotImplemented, NO_QT_ERR_MSG);
 }
 
 void cv::addText(const Mat&, const String&, Point, const String&, int, Scalar, int, int, int)
 {
-    CV_Error(CV_StsNotImplemented, NO_QT_ERR_MSG);
+    CV_Error(cv::Error::StsNotImplemented, NO_QT_ERR_MSG);
 }
 
 void cv::displayStatusBar(const String&,  const String&, int)
 {
-    CV_Error(CV_StsNotImplemented, NO_QT_ERR_MSG);
+    CV_Error(cv::Error::StsNotImplemented, NO_QT_ERR_MSG);
 }
 
 void cv::displayOverlay(const String&,  const String&, int )
 {
-    CV_Error(CV_StsNotImplemented, NO_QT_ERR_MSG);
+    CV_Error(cv::Error::StsNotImplemented, NO_QT_ERR_MSG);
 }
 
 int cv::startLoop(int (*)(int argc, char *argv[]), int , char**)
 {
-    CV_Error(CV_StsNotImplemented, NO_QT_ERR_MSG);
+    CV_Error(cv::Error::StsNotImplemented, NO_QT_ERR_MSG);
 }
 
 void cv::stopLoop()
 {
-    CV_Error(CV_StsNotImplemented, NO_QT_ERR_MSG);
+    CV_Error(cv::Error::StsNotImplemented, NO_QT_ERR_MSG);
 }
 
 void cv::saveWindowParameters(const String&)
 {
-    CV_Error(CV_StsNotImplemented, NO_QT_ERR_MSG);
+    CV_Error(cv::Error::StsNotImplemented, NO_QT_ERR_MSG);
 }
 
 void cv::loadWindowParameters(const String&)
 {
-    CV_Error(CV_StsNotImplemented, NO_QT_ERR_MSG);
+    CV_Error(cv::Error::StsNotImplemented, NO_QT_ERR_MSG);
 }
 
 int cv::createButton(const String&, ButtonCallback, void*, int , bool )
 {
-    CV_Error(CV_StsNotImplemented, NO_QT_ERR_MSG);
+    CV_Error(cv::Error::StsNotImplemented, NO_QT_ERR_MSG);
 }
 
 #endif
diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp
index ecf8b61df019..726cb6969509 100644
--- a/modules/highgui/src/window_QT.cpp
+++ b/modules/highgui/src/window_QT.cpp
@@ -147,7 +147,7 @@ CV_IMPL CvFont cvFontQt(const char* nameFont, int pointSize,CvScalar color,int w
 CV_IMPL void cvAddText(const CvArr* img, const char* text, CvPoint org, CvFont* font)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "putText",
@@ -162,7 +162,7 @@ CV_IMPL void cvAddText(const CvArr* img, const char* text, CvPoint org, CvFont*
 double cvGetRatioWindow_QT(const char* name)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     double result = -1;
     QMetaObject::invokeMethod(guiMainThread,
@@ -176,7 +176,7 @@ double cvGetRatioWindow_QT(const char* name)
 
 double cvGetPropVisible_QT(const char* name) {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     double result = 0;
 
@@ -193,7 +193,7 @@ void cvSetRatioWindow_QT(const char* name,double prop_value)
 {
 
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "setRatioWindow",
@@ -205,7 +205,7 @@ void cvSetRatioWindow_QT(const char* name,double prop_value)
 double cvGetPropWindow_QT(const char* name)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     double result = -1;
     QMetaObject::invokeMethod(guiMainThread,
@@ -221,7 +221,7 @@ double cvGetPropWindow_QT(const char* name)
 void cvSetPropWindow_QT(const char* name,double prop_value)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "setPropWindow",
@@ -246,7 +246,7 @@ void setWindowTitle_QT(const String& winname, const String& title)
 void cvSetModeWindow_QT(const char* name, double prop_value)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "toggleFullScreen",
@@ -258,7 +258,7 @@ void cvSetModeWindow_QT(const char* name, double prop_value)
 CvRect cvGetWindowRect_QT(const char* name)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     CvRect result = cvRect(-1, -1, -1, -1);
 
@@ -274,7 +274,7 @@ CvRect cvGetWindowRect_QT(const char* name)
 double cvGetModeWindow_QT(const char* name)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     double result = -1;
 
@@ -291,7 +291,7 @@ double cvGetModeWindow_QT(const char* name)
 CV_IMPL void cvDisplayOverlay(const char* name, const char* text, int delayms)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "displayInfo",
@@ -305,7 +305,7 @@ CV_IMPL void cvDisplayOverlay(const char* name, const char* text, int delayms)
 CV_IMPL void cvSaveWindowParameters(const char* name)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "saveWindowParameters",
@@ -317,7 +317,7 @@ CV_IMPL void cvSaveWindowParameters(const char* name)
 CV_IMPL void cvLoadWindowParameters(const char* name)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "loadWindowParameters",
@@ -329,7 +329,7 @@ CV_IMPL void cvLoadWindowParameters(const char* name)
 CV_IMPL void cvDisplayStatusBar(const char* name, const char* text, int delayms)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "displayStatusBar",
@@ -492,7 +492,7 @@ static CvTrackbar* icvFindTrackBarByName(const char* name_trackbar, const char*
         QPointer<CvWindow> w = icvFindWindowByName(nameWinQt);
 
         if (!w)
-            CV_Error(CV_StsNullPtr, "NULL window handler");
+            CV_Error(cv::Error::StsNullPtr, "NULL window handler");
 
         if (w->param_gui_mode == CV_GUI_NORMAL)
             return (CvTrackbar*) icvFindBarByName(w->myBarLayout, nameQt, type_CvTrackbar);
@@ -575,7 +575,7 @@ CV_IMPL int cvNamedWindow(const char* name, int flags)
 CV_IMPL void cvDestroyWindow(const char* name)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "destroyWindow",
@@ -598,7 +598,7 @@ CV_IMPL void cvDestroyAllWindows()
 CV_IMPL void* cvGetWindowHandle(const char* name)
 {
     if (!name)
-        CV_Error( CV_StsNullPtr, "NULL name string" );
+        CV_Error( cv::Error::StsNullPtr, "NULL name string" );
 
     return (void*) icvFindWindowByName(QLatin1String(name));
 }
@@ -607,7 +607,7 @@ CV_IMPL void* cvGetWindowHandle(const char* name)
 CV_IMPL const char* cvGetWindowName(void* window_handle)
 {
     if( !window_handle )
-        CV_Error( CV_StsNullPtr, "NULL window handler" );
+        CV_Error( cv::Error::StsNullPtr, "NULL window handler" );
 
     return ((CvWindow*)window_handle)->objectName().toLatin1().data();
 }
@@ -616,7 +616,7 @@ CV_IMPL const char* cvGetWindowName(void* window_handle)
 CV_IMPL void cvMoveWindow(const char* name, int x, int y)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
     QMetaObject::invokeMethod(guiMainThread,
         "moveWindow",
         autoBlockingConnection(),
@@ -628,7 +628,7 @@ CV_IMPL void cvMoveWindow(const char* name, int x, int y)
 CV_IMPL void cvResizeWindow(const char* name, int width, int height)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
     QMetaObject::invokeMethod(guiMainThread,
         "resizeWindow",
         autoBlockingConnection(),
@@ -641,7 +641,7 @@ CV_IMPL void cvResizeWindow(const char* name, int width, int height)
 CV_IMPL int cvCreateTrackbar2(const char* name_bar, const char* window_name, int* val, int count, CvTrackbarCallback2 on_notify, void* userdata)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "addSlider2",
@@ -666,7 +666,7 @@ CV_IMPL int cvStartWindowThread()
 CV_IMPL int cvCreateTrackbar(const char* name_bar, const char* window_name, int* value, int count, CvTrackbarCallback on_change)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "addSlider",
@@ -684,7 +684,7 @@ CV_IMPL int cvCreateTrackbar(const char* name_bar, const char* window_name, int*
 CV_IMPL int cvCreateButton(const char* button_name, CvButtonCallback on_change, void* userdata, int button_type, int initial_button_state)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     if (initial_button_state < 0 || initial_button_state > 1)
         return 0;
@@ -750,7 +750,7 @@ CV_IMPL void cvSetMouseCallback(const char* window_name, CvMouseCallback on_mous
     QPointer<CvWindow> w = icvFindWindowByName(QLatin1String(window_name));
 
     if (!w)
-        CV_Error(CV_StsNullPtr, "NULL window handler");
+        CV_Error(cv::Error::StsNullPtr, "NULL window handler");
 
     w->setMouseCallBack(on_mouse, param);
 
@@ -780,7 +780,7 @@ CV_IMPL void cvShowImage(const char* name, const CvArr* arr)
 CV_IMPL void cvSetOpenGlDrawCallback(const char* window_name, CvOpenGlDrawCallback callback, void* userdata)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "setOpenGlDrawCallback",
@@ -794,7 +794,7 @@ CV_IMPL void cvSetOpenGlDrawCallback(const char* window_name, CvOpenGlDrawCallba
 CV_IMPL void cvSetOpenGlContext(const char* window_name)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "setOpenGlContext",
@@ -806,7 +806,7 @@ CV_IMPL void cvSetOpenGlContext(const char* window_name)
 CV_IMPL void cvUpdateWindow(const char* window_name)
 {
     if (!guiMainThread)
-        CV_Error( CV_StsNullPtr, "NULL guiReceiver (please create a window)" );
+        CV_Error( cv::Error::StsNullPtr, "NULL guiReceiver (please create a window)" );
 
     QMetaObject::invokeMethod(guiMainThread,
         "updateWindow",
@@ -1016,7 +1016,7 @@ double GuiReceiver::isFullScreen(QString name)
     if (!w)
         return -1;
 
-    return w->isFullScreen() ? CV_WINDOW_FULLSCREEN : CV_WINDOW_NORMAL;
+    return w->isFullScreen() ? cv::WINDOW_FULLSCREEN : cv::WINDOW_NORMAL;
 }
 
 
@@ -1036,7 +1036,7 @@ void GuiReceiver::toggleFullScreen(QString name, double arg2)
 void GuiReceiver::createWindow(QString name, int flags)
 {
     if (!qApp)
-        CV_Error(CV_StsNullPtr, "NULL session handler" );
+        CV_Error(cv::Error::StsNullPtr, "NULL session handler" );
 
     // Check the name in the storage
     if (icvFindWindowByName(name.toLatin1().data()))
@@ -1127,7 +1127,7 @@ void GuiReceiver::destroyWindow(QString name)
 void GuiReceiver::destroyAllWindow()
 {
     if (!qApp)
-        CV_Error(CV_StsNullPtr, "NULL session handler" );
+        CV_Error(cv::Error::StsNullPtr, "NULL session handler" );
 
     if (multiThreads)
     {
@@ -1256,7 +1256,7 @@ void GuiReceiver::addSlider2(QString bar_name, QString window_name, void* value,
         return;
 
     if (count <= 0) //count is the max value of the slider, so must be bigger than 0
-        CV_Error(CV_StsNullPtr, "Max value of the slider must be bigger than 0" );
+        CV_Error(cv::Error::StsNullPtr, "Max value of the slider must be bigger than 0" );
 
     CvWindow::addSlider2(w, bar_name, (int*)value, count, (CvTrackbarCallback2) on_change, userdata);
 }
@@ -1286,10 +1286,10 @@ void GuiReceiver::addSlider(QString bar_name, QString window_name, void* value,
         return;
 
     if (!value)
-        CV_Error(CV_StsNullPtr, "NULL value pointer" );
+        CV_Error(cv::Error::StsNullPtr, "NULL value pointer" );
 
     if (count <= 0) //count is the max value of the slider, so must be bigger than 0
-        CV_Error(CV_StsNullPtr, "Max value of the slider must be bigger than 0" );
+        CV_Error(cv::Error::StsNullPtr, "Max value of the slider must be bigger than 0" );
 
     CvWindow::addSlider(w, bar_name, (int*)value, count, (CvTrackbarCallback) on_change);
 }
@@ -1675,6 +1675,7 @@ CvWinProperties::~CvWinProperties()
 
 CvWindow::CvWindow(QString name, int arg2)
 {
+    Q_INIT_RESOURCE(window_QT);
     type = type_CvWindow;
 
     param_flags = arg2 & 0x0000000F;
@@ -1701,11 +1702,11 @@ CvWindow::CvWindow(QString name, int arg2)
 
     //3: my view
 #ifndef HAVE_QT_OPENGL
-    if (arg2 & CV_WINDOW_OPENGL)
-        CV_Error( CV_OpenGlNotSupported, "Library was built without OpenGL support" );
+    if (arg2 & cv::WINDOW_OPENGL)
+        CV_Error( cv::Error::OpenGlNotSupported, "Library was built without OpenGL support" );
     mode_display = CV_MODE_NORMAL;
 #else
-    mode_display = arg2 & CV_WINDOW_OPENGL ? CV_MODE_OPENGL : CV_MODE_NORMAL;
+    mode_display = arg2 & cv::WINDOW_OPENGL ? CV_MODE_OPENGL : CV_MODE_NORMAL;
     if (mode_display == CV_MODE_OPENGL)
         param_gui_mode = CV_GUI_NORMAL;
 #endif
@@ -1724,14 +1725,14 @@ CvWindow::CvWindow(QString name, int arg2)
 
     //Now attach everything
     if (myToolBar)
-        myGlobalLayout->addWidget(myToolBar, Qt::AlignCenter);
+        myGlobalLayout->addWidget(myToolBar, 0, Qt::AlignLeft);
 
-    myGlobalLayout->addWidget(myView->getWidget(), Qt::AlignCenter);
+    myGlobalLayout->addWidget(myView->getWidget(), 0, Qt::AlignCenter);
 
-    myGlobalLayout->addLayout(myBarLayout, Qt::AlignCenter);
+    myGlobalLayout->addLayout(myBarLayout);
 
     if (myStatusBar)
-        myGlobalLayout->addWidget(myStatusBar, Qt::AlignCenter);
+        myGlobalLayout->addWidget(myStatusBar, 0, Qt::AlignLeft);
 
     setLayout(myGlobalLayout);
     show();
@@ -1836,13 +1837,13 @@ void CvWindow::setPropWindow(int flags)
 
     switch(flags)
     {
-    case CV_WINDOW_NORMAL:
+    case cv::WINDOW_NORMAL:
         myGlobalLayout->setSizeConstraint(QLayout::SetMinAndMaxSize);
         param_flags = flags;
 
         break;
 
-    case CV_WINDOW_AUTOSIZE:
+    case cv::WINDOW_AUTOSIZE:
         myGlobalLayout->setSizeConstraint(QLayout::SetFixedSize);
         param_flags = flags;
 
@@ -1855,14 +1856,14 @@ void CvWindow::setPropWindow(int flags)
 
 void CvWindow::toggleFullScreen(int flags)
 {
-    if (isFullScreen() && flags == CV_WINDOW_NORMAL)
+    if (isFullScreen() && flags == cv::WINDOW_NORMAL)
     {
         showTools();
         showNormal();
         return;
     }
 
-    if (!isFullScreen() && flags == CV_WINDOW_FULLSCREEN)
+    if (!isFullScreen() && flags == cv::WINDOW_FULLSCREEN)
     {
         hideTools();
         showFullScreen();
@@ -2012,9 +2013,9 @@ void CvWindow::createGlobalLayout()
 #endif
     setMinimumSize(1, 1);
 
-    if (param_flags == CV_WINDOW_AUTOSIZE)
+    if (param_flags == cv::WINDOW_AUTOSIZE)
         myGlobalLayout->setSizeConstraint(QLayout::SetFixedSize);
-    else if (param_flags == CV_WINDOW_NORMAL)
+    else if (param_flags == cv::WINDOW_NORMAL)
         myGlobalLayout->setSizeConstraint(QLayout::SetMinAndMaxSize);
 }
 
@@ -2141,7 +2142,6 @@ void CvWindow::createStatusBar()
 {
     myStatusBar = new QStatusBar(this);
     myStatusBar->setSizeGripEnabled(false);
-    myStatusBar->setFixedHeight(20);
     myStatusBar->setMinimumWidth(1);
     myStatusBar_msg = new QLabel;
 
@@ -2543,6 +2543,10 @@ DefaultViewPort::DefaultViewPort(CvWindow* arg, int arg2) : QGraphicsView(arg),
 
     setInteractive(false);
     setMouseTracking(true); //receive mouse event everytime
+
+    // #13657 Tab key disables arrow keys
+    // #20215 QT backend: cv::waitKey() and cv::waitKeyEx() do not capture arrow keys once you click on the image or press TAB
+    setFocusPolicy(Qt::NoFocus);
 }
 
 
@@ -2601,7 +2605,7 @@ void DefaultViewPort::setRatio(int flags)
         return;
 
     //if valid flags
-    if (flags == CV_WINDOW_FREERATIO || flags == CV_WINDOW_KEEPRATIO)
+    if (flags == cv::WINDOW_FREERATIO || flags == cv::WINDOW_KEEPRATIO)
     {
         centralWidget->param_ratio_mode = flags;
         param_keepRatio = flags;
@@ -2658,19 +2662,19 @@ void DefaultViewPort::startDisplayInfo(QString text, int delayms)
 
 void DefaultViewPort::setOpenGlDrawCallback(CvOpenGlDrawCallback /*callback*/, void* /*userdata*/)
 {
-    CV_Error(CV_OpenGlNotSupported, "Window doesn't support OpenGL");
+    CV_Error(cv::Error::OpenGlNotSupported, "Window doesn't support OpenGL");
 }
 
 
 void DefaultViewPort::makeCurrentOpenGlContext()
 {
-    CV_Error(CV_OpenGlNotSupported, "Window doesn't support OpenGL");
+    CV_Error(cv::Error::OpenGlNotSupported, "Window doesn't support OpenGL");
 }
 
 
 void DefaultViewPort::updateGl()
 {
-    CV_Error(CV_OpenGlNotSupported, "Window doesn't support OpenGL");
+    CV_Error(cv::Error::OpenGlNotSupported, "Window doesn't support OpenGL");
 }
 
 
@@ -2773,7 +2777,7 @@ void DefaultViewPort::saveView()
             return;
         }
 
-        CV_Error(CV_StsNullPtr, "file extension not recognized, please choose between JPG, JPEG, BMP or PNG");
+        CV_Error(cv::Error::StsNullPtr, "file extension not recognized, please choose between JPG, JPEG, BMP or PNG");
     }
 }
 
@@ -2812,7 +2816,7 @@ void DefaultViewPort::resizeEvent(QResizeEvent* evnt)
     ratioX = width() / float(image2Draw_mat->cols);
     ratioY = height() / float(image2Draw_mat->rows);
 
-    if (param_keepRatio == CV_WINDOW_KEEPRATIO)//to keep the same aspect ratio
+    if (param_keepRatio == cv::WINDOW_KEEPRATIO)//to keep the same aspect ratio
     {
         QSize newSize = QSize(image2Draw_mat->cols, image2Draw_mat->rows);
         newSize.scale(evnt->size(), Qt::KeepAspectRatio);
diff --git a/modules/highgui/src/window_cocoa.mm b/modules/highgui/src/window_cocoa.mm
index 86f38d0ae809..7e364220fadf 100644
--- a/modules/highgui/src/window_cocoa.mm
+++ b/modules/highgui/src/window_cocoa.mm
@@ -195,6 +195,9 @@ CV_IMPL void cvDestroyWindow( const char* name)
     //cout << "cvDestroyWindow" << endl;
     CVWindow *window = cvGetWindow(name);
     if(window) {
+        if ([window styleMask] & NSFullScreenWindowMask) {
+            [window toggleFullScreen:nil];
+        }
         [window close];
         [windows removeObjectForKey:[NSString stringWithFormat:@"%s", name]];
     }
@@ -701,7 +704,11 @@ CvRect cvGetWindowRect_COCOA( const char* name )
 void cvSetModeWindow_COCOA( const char* name, double prop_value )
 {
     CVWindow *window = nil;
+
+#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_7
     NSDictionary *fullscreenOptions = nil;
+#endif
+
     NSAutoreleasePool* localpool = nil;
 
     CV_FUNCNAME( "cvSetModeWindow_COCOA" );
@@ -725,6 +732,31 @@ void cvSetModeWindow_COCOA( const char* name, double prop_value )
 
     localpool = [[NSAutoreleasePool alloc] init];
 
+#if MAC_OS_X_VERSION_MAX_ALLOWED > MAC_OS_X_VERSION_10_6
+    if ( ([window styleMask] & NSFullScreenWindowMask) && prop_value==CV_WINDOW_NORMAL )
+    {
+        [window toggleFullScreen:nil];
+
+        window.status=CV_WINDOW_NORMAL;
+    }
+    else if( !([window styleMask] & NSFullScreenWindowMask) && prop_value==CV_WINDOW_FULLSCREEN )
+    {
+        [window setCollectionBehavior:NSWindowCollectionBehaviorFullScreenPrimary];
+
+        NSScreen* screen = [window screen];
+
+        NSRect frame = [screen frame];
+        [window setFrame:frame display:YES];
+
+        [window setContentSize:frame.size];
+
+        [window toggleFullScreen:nil];
+
+        [window setFrameTopLeftPoint: frame.origin];
+
+        window.status=CV_WINDOW_FULLSCREEN;
+    }
+#else
     fullscreenOptions = [NSDictionary dictionaryWithObject:[NSNumber numberWithBool:YES] forKey:NSFullScreenModeSetting];
     if ( [[window contentView] isInFullScreenMode] && prop_value==CV_WINDOW_NORMAL )
     {
@@ -736,7 +768,7 @@ void cvSetModeWindow_COCOA( const char* name, double prop_value )
         [[window contentView] enterFullScreenMode:[NSScreen mainScreen] withOptions:fullscreenOptions];
         window.status=CV_WINDOW_FULLSCREEN;
     }
-
+#endif
     [localpool drain];
 
     __END__;
@@ -810,7 +842,7 @@ void cvSetPropTopmost_COCOA( const char* name, const bool topmost )
         CV_ERROR( CV_StsNullPtr, "NULL window" );
     }
 
-    if ([[window contentView] isInFullScreenMode])
+    if (([window styleMask] & NSFullScreenWindowMask))
     {
         EXIT;
     }
diff --git a/modules/highgui/src/window_gtk.cpp b/modules/highgui/src/window_gtk.cpp
index ee590bd9c41f..88421be8b812 100644
--- a/modules/highgui/src/window_gtk.cpp
+++ b/modules/highgui/src/window_gtk.cpp
@@ -271,7 +271,7 @@ cvImageWidget_get_preferred_width (GtkWidget *widget, gint *minimal_width, gint
   CvImageWidget * image_widget = CV_IMAGE_WIDGET( widget );
 
   if(image_widget->original_image != NULL) {
-    *minimal_width = (image_widget->flags & CV_WINDOW_AUTOSIZE) != CV_WINDOW_AUTOSIZE ?
+    *minimal_width = (image_widget->flags & cv::WINDOW_AUTOSIZE) != cv::WINDOW_AUTOSIZE ?
       gdk_window_get_width(gtk_widget_get_window(widget)) : image_widget->original_image->cols;
   }
   else {
@@ -295,7 +295,7 @@ cvImageWidget_get_preferred_height (GtkWidget *widget, gint *minimal_height, gin
   CvImageWidget * image_widget = CV_IMAGE_WIDGET( widget );
 
   if(image_widget->original_image != NULL) {
-    *minimal_height = (image_widget->flags & CV_WINDOW_AUTOSIZE) != CV_WINDOW_AUTOSIZE ?
+    *minimal_height = (image_widget->flags & cv::WINDOW_AUTOSIZE) != cv::WINDOW_AUTOSIZE ?
       gdk_window_get_height(gtk_widget_get_window(widget)) : image_widget->original_image->rows;
   }
   else {
@@ -322,7 +322,7 @@ cvImageWidget_size_request (GtkWidget      *widget,
     //printf("cvImageWidget_size_request ");
     // the case the first time cvShowImage called or when AUTOSIZE
     if( image_widget->original_image &&
-        ((image_widget->flags & CV_WINDOW_AUTOSIZE) ||
+        ((image_widget->flags & cv::WINDOW_AUTOSIZE) ||
          (image_widget->flags & CV_WINDOW_NO_IMAGE)))
     {
         //printf("original ");
@@ -351,7 +351,7 @@ static void cvImageWidget_set_size(GtkWidget * widget, int max_width, int max_he
     //printf("cvImageWidget_set_size %d %d\n", max_width, max_height);
 
     // don't allow to set the size
-    if(image_widget->flags & CV_WINDOW_AUTOSIZE) return;
+    if(image_widget->flags & cv::WINDOW_AUTOSIZE) return;
     if(!image_widget->original_image) return;
 
     CvSize scaled_image_size = cvImageWidget_calc_size( image_widget->original_image->cols,
@@ -390,7 +390,7 @@ cvImageWidget_size_allocate (GtkWidget     *widget,
   image_widget = CV_IMAGE_WIDGET (widget);
 
 
-  if( (image_widget->flags & CV_WINDOW_AUTOSIZE)==0 && image_widget->original_image ){
+  if( (image_widget->flags & cv::WINDOW_AUTOSIZE)==0 && image_widget->original_image ){
       // (re) allocated scaled image
       if( image_widget->flags & CV_WINDOW_NO_IMAGE ){
           cvImageWidget_set_size( widget, image_widget->original_image->cols,
@@ -407,7 +407,7 @@ cvImageWidget_size_allocate (GtkWidget     *widget,
       image_widget = CV_IMAGE_WIDGET (widget);
 
       if( image_widget->original_image &&
-              ((image_widget->flags & CV_WINDOW_AUTOSIZE) ||
+              ((image_widget->flags & cv::WINDOW_AUTOSIZE) ||
                (image_widget->flags & CV_WINDOW_NO_IMAGE)) )
       {
 #if defined (GTK_VERSION3)
@@ -744,7 +744,7 @@ CvRect cvGetWindowRect_GTK(const char* name)
     CV_LOCK_MUTEX();
     const auto window = icvFindWindowByName(name);
     if (!window)
-        CV_Error( CV_StsNullPtr, "NULL window" );
+        CV_Error( cv::Error::StsNullPtr, "NULL window" );
 
     return cvRect(getImageRect_(window));
 }
@@ -786,7 +786,7 @@ double cvGetModeWindow_GTK(const char* name)//YV
     CV_LOCK_MUTEX();
     const auto window = icvFindWindowByName(name);
     if (!window)
-        CV_Error( CV_StsNullPtr, "NULL window" );
+        CV_Error( cv::Error::StsNullPtr, "NULL window" );
 
     double result = window->status;
     return result;
@@ -801,14 +801,14 @@ void cvSetModeWindow_GTK( const char* name, double prop_value)//Yannick Verdie
 
     const auto window = icvFindWindowByName(name);
     if (!window)
-        CV_Error( CV_StsNullPtr, "NULL window" );
+        CV_Error( cv::Error::StsNullPtr, "NULL window" );
 
     setModeWindow_(window, (int)prop_value);
 }
 
 static bool setModeWindow_(const std::shared_ptr<CvWindow>& window, int mode)
 {
-    if (window->flags & CV_WINDOW_AUTOSIZE) //if the flag CV_WINDOW_AUTOSIZE is set
+    if (window->flags & cv::WINDOW_AUTOSIZE) //if the flag cv::WINDOW_AUTOSIZE is set
         return false;
 
     //so easy to do fullscreen here, Linux rocks !
@@ -816,17 +816,17 @@ static bool setModeWindow_(const std::shared_ptr<CvWindow>& window, int mode)
     if (window->status == mode)
         return true;
 
-    if (window->status==CV_WINDOW_FULLSCREEN && mode==CV_WINDOW_NORMAL)
+    if (window->status==cv::WINDOW_FULLSCREEN && mode==cv::WINDOW_NORMAL)
     {
         gtk_window_unfullscreen(GTK_WINDOW(window->frame));
-        window->status=CV_WINDOW_NORMAL;
+        window->status=cv::WINDOW_NORMAL;
         return true;
     }
 
-    if (window->status==CV_WINDOW_NORMAL && mode==CV_WINDOW_FULLSCREEN)
+    if (window->status==cv::WINDOW_NORMAL && mode==cv::WINDOW_FULLSCREEN)
     {
         gtk_window_fullscreen(GTK_WINDOW(window->frame));
-        window->status=CV_WINDOW_FULLSCREEN;
+        window->status=cv::WINDOW_FULLSCREEN;
         return true;
     }
 
@@ -859,7 +859,7 @@ double cvGetPropWindowAutoSize_GTK(const char* name)
     if (!window)
         return -1; // keep silence here
 
-    double result = window->flags & CV_WINDOW_AUTOSIZE;
+    double result = window->flags & cv::WINDOW_AUTOSIZE;
     return result;
 }
 
@@ -917,11 +917,11 @@ namespace
         // Try double-buffered visual
         glconfig = gdk_gl_config_new_by_mode((GdkGLConfigMode)(GDK_GL_MODE_RGB | GDK_GL_MODE_DEPTH | GDK_GL_MODE_DOUBLE));
         if (!glconfig)
-            CV_Error( CV_OpenGlApiCallError, "Can't Create A GL Device Context" );
+            CV_Error( cv::Error::OpenGlApiCallError, "Can't Create A GL Device Context" );
 
         // Set OpenGL-capability to the widget
         if (!gtk_widget_set_gl_capability(window->widget, glconfig, NULL, TRUE, GDK_GL_RGBA_TYPE))
-            CV_Error( CV_OpenGlApiCallError, "Can't Create A GL Device Context" );
+            CV_Error( cv::Error::OpenGlApiCallError, "Can't Create A GL Device Context" );
 
         window->useGl = true;
     }
@@ -932,7 +932,7 @@ namespace
         GdkGLDrawable* gldrawable = gtk_widget_get_gl_drawable(window->widget);
 
         if (!gdk_gl_drawable_gl_begin (gldrawable, glcontext))
-            CV_Error( CV_OpenGlApiCallError, "Can't Activate The GL Rendering Context" );
+            CV_Error( cv::Error::OpenGlApiCallError, "Can't Activate The GL Rendering Context" );
 
         glViewport(0, 0, gtk_widget_get_allocated_width(window->widget), gtk_widget_get_allocated_height(window->widget));
 
@@ -1037,7 +1037,7 @@ static std::shared_ptr<CvWindow> namedWindow_(const std::string& name, int flags
     auto window_ptr = std::make_shared<CvWindow>(name);
     CvWindow* window = window_ptr.get();
     window->flags = flags;
-    window->status = CV_WINDOW_NORMAL;//YV
+    window->status = cv::WINDOW_NORMAL;//YV
 
     window->frame = gtk_window_new( GTK_WINDOW_TOPLEVEL );
 
@@ -1049,10 +1049,10 @@ static std::shared_ptr<CvWindow> namedWindow_(const std::string& name, int flags
     gtk_widget_show( window->paned );
 
 #ifndef HAVE_OPENGL
-    if (flags & CV_WINDOW_OPENGL)
-        CV_Error( CV_OpenGlNotSupported, "Library was built without OpenGL support" );
+    if (flags & cv::WINDOW_OPENGL)
+        CV_Error( cv::Error::OpenGlNotSupported, "Library was built without OpenGL support" );
 #else
-    if (flags & CV_WINDOW_OPENGL)
+    if (flags & cv::WINDOW_OPENGL)
         createGlContext(window);
 
     window->glDrawCallback = 0;
@@ -1097,7 +1097,7 @@ static std::shared_ptr<CvWindow> namedWindow_(const std::string& name, int flags
         getGTKWindows().push_back(window_ptr);
     }
 
-    bool b_nautosize = ((flags & CV_WINDOW_AUTOSIZE) == 0);
+    bool b_nautosize = ((flags & cv::WINDOW_AUTOSIZE) == 0);
     gtk_window_set_resizable( GTK_WINDOW(window->frame), b_nautosize );
 
     // allow window to be resized
@@ -1131,16 +1131,16 @@ CV_IMPL void cvSetOpenGlContext(const char* name)
 
     auto window = icvFindWindowByName(name);
     if (!window)
-        CV_Error( CV_StsNullPtr, "NULL window" );
+        CV_Error( cv::Error::StsNullPtr, "NULL window" );
 
     if (!window->useGl)
-        CV_Error( CV_OpenGlNotSupported, "Window doesn't support OpenGL" );
+        CV_Error( cv::Error::OpenGlNotSupported, "Window doesn't support OpenGL" );
 
     glcontext = gtk_widget_get_gl_context(window->widget);
     gldrawable = gtk_widget_get_gl_drawable(window->widget);
 
     if (!gdk_gl_drawable_make_current(gldrawable, glcontext))
-        CV_Error( CV_OpenGlApiCallError, "Can't Activate The GL Rendering Context" );
+        CV_Error( cv::Error::OpenGlApiCallError, "Can't Activate The GL Rendering Context" );
 }
 
 CV_IMPL void cvUpdateWindow(const char* name)
@@ -1168,7 +1168,7 @@ CV_IMPL void cvSetOpenGlDrawCallback(const char* name, CvOpenGlDrawCallback call
         return;
 
     if (!window->useGl)
-        CV_Error( CV_OpenGlNotSupported, "Window was created without OpenGL context" );
+        CV_Error( cv::Error::OpenGlNotSupported, "Window was created without OpenGL context" );
 
     window->glDrawCallback = callback;
     window->glDrawData = userdata;
@@ -1333,7 +1333,7 @@ void resizeWindow_(const std::shared_ptr<CvWindow>& window, int width, int heigh
 {
     CV_Assert(window);
     CvImageWidget* image_widget = CV_IMAGE_WIDGET( window->widget );
-    //if(image_widget->flags & CV_WINDOW_AUTOSIZE)
+    //if(image_widget->flags & cv::WINDOW_AUTOSIZE)
         //EXIT;
 
     gtk_window_set_resizable( GTK_WINDOW(window->frame), 1 );
@@ -1384,7 +1384,7 @@ icvCreateTrackbar( const char* trackbar_name, const char* window_name,
     CV_Assert(trackbar_name && "NULL trackbar name");
 
     if( count <= 0 )
-        CV_Error( CV_StsOutOfRange, "Bad trackbar maximal value" );
+        CV_Error( cv::Error::StsOutOfRange, "Bad trackbar maximal value" );
 
     CV_LOCK_MUTEX();
 
@@ -1557,7 +1557,7 @@ CV_IMPL void cvSetTrackbarPos( const char* trackbar_name, const char* window_nam
     const auto trackbar = icvFindTrackbarByName(window, trackbar_name);
     if (!trackbar)
     {
-        CV_Error( CV_StsNullPtr, "No trackbar found" );
+        CV_Error( cv::Error::StsNullPtr, "No trackbar found" );
     }
 
     return setTrackbarPos_(trackbar, pos);
@@ -1916,7 +1916,7 @@ static gboolean icvOnMouse( GtkWidget *widget, GdkEvent *event, gpointer user_da
     if( cv_event >= 0 )
     {
         // scale point if image is scaled
-        if( (image_widget->flags & CV_WINDOW_AUTOSIZE)==0 &&
+        if( (image_widget->flags & cv::WINDOW_AUTOSIZE)==0 &&
              image_widget->original_image &&
              image_widget->scaled_image )
         {
@@ -2080,17 +2080,17 @@ class GTKWindow
         // see cvGetWindowProperty
         switch (prop)
         {
-        case CV_WND_PROP_FULLSCREEN:
+        case cv::WND_PROP_FULLSCREEN:
             return (double)window->status;
 
-        case CV_WND_PROP_AUTOSIZE:
-            return (window->flags & CV_WINDOW_AUTOSIZE) ? 1.0 : 0.0;
+        case cv::WND_PROP_AUTOSIZE:
+            return (window->flags & cv::WINDOW_AUTOSIZE) ? 1.0 : 0.0;
 
-        case CV_WND_PROP_ASPECTRATIO:
+        case cv::WND_PROP_ASPECT_RATIO:
             return getRatioWindow_(window);
 
 #ifdef HAVE_OPENGL
-        case CV_WND_PROP_OPENGL:
+        case cv::WND_PROP_OPENGL:
             return window->useGl ? 1.0 : 0.0;
 #endif
 
@@ -2107,8 +2107,8 @@ class GTKWindow
         // see cvSetWindowProperty
         switch (prop)
         {
-        case CV_WND_PROP_FULLSCREEN:
-            if (value != CV_WINDOW_NORMAL && value != CV_WINDOW_FULLSCREEN)  // bad arg
+        case cv::WND_PROP_FULLSCREEN:
+            if (value != cv::WINDOW_NORMAL && value != cv::WINDOW_FULLSCREEN)  // bad arg
                 break;
             setModeWindow_(window, value);
             return true;
@@ -2286,6 +2286,19 @@ class GTKBackendUI : public UIBackend
     {
         return cvWaitKey(1);  // TODO
     }
+
+    const std::string getName() const CV_OVERRIDE
+    {
+#if GTK_MAJOR_VERSION == 2
+        return "GTK2";
+#elif GTK_MAJOR_VERSION == 3
+        return "GTK3";
+#elif GTK_MAJOR_VERSION == 4
+        return "GTK4";
+#else
+#error "Unsupported GTK version"
+#endif
+    }
 };  // GTKBackendUI
 
 static
diff --git a/modules/highgui/src/window_w32.cpp b/modules/highgui/src/window_w32.cpp
index a3a71a6cc465..32b239290407 100644
--- a/modules/highgui/src/window_w32.cpp
+++ b/modules/highgui/src/window_w32.cpp
@@ -587,7 +587,7 @@ void cvSetModeWindow_W32(const char* name, double prop_value)//Yannick Verdie
 
 static bool setModeWindow_(CvWindow& window, int mode)
 {
-    if (window.flags & CV_WINDOW_AUTOSIZE)//if the flag CV_WINDOW_AUTOSIZE is set
+    if (window.flags & cv::WINDOW_AUTOSIZE)//if the flag cv::WINDOW_AUTOSIZE is set
         return false;
 
     if (window.status == mode)
@@ -597,18 +597,18 @@ static bool setModeWindow_(CvWindow& window, int mode)
         DWORD dwStyle = (DWORD)GetWindowLongPtr(window.frame, GWL_STYLE);
         CvRect position;
 
-        if (window.status == CV_WINDOW_FULLSCREEN && mode == CV_WINDOW_NORMAL)
+        if (window.status == cv::WINDOW_FULLSCREEN && mode == cv::WINDOW_NORMAL)
         {
             icvLoadWindowPos(window.name.c_str(), position);
             SetWindowLongPtr(window.frame, GWL_STYLE, dwStyle | WS_CAPTION | WS_THICKFRAME);
 
             SetWindowPos(window.frame, HWND_TOP, position.x, position.y , position.width,position.height, SWP_NOZORDER | SWP_FRAMECHANGED);
-            window.status=CV_WINDOW_NORMAL;
+            window.status=cv::WINDOW_NORMAL;
 
             return true;
         }
 
-        if (window.status == CV_WINDOW_NORMAL && mode == CV_WINDOW_FULLSCREEN)
+        if (window.status == cv::WINDOW_NORMAL && mode == cv::WINDOW_FULLSCREEN)
         {
             //save dimension
             RECT rect = { 0 };
@@ -630,7 +630,7 @@ static bool setModeWindow_(CvWindow& window, int mode)
             SetWindowLongPtr(window.frame, GWL_STYLE, dwStyle & ~WS_CAPTION & ~WS_THICKFRAME);
 
             SetWindowPos(window.frame, HWND_TOP, position.x, position.y , position.width,position.height, SWP_NOZORDER | SWP_FRAMECHANGED);
-            window.status=CV_WINDOW_FULLSCREEN;
+            window.status=cv::WINDOW_FULLSCREEN;
 
             return true;
         }
@@ -836,7 +836,7 @@ double cvGetPropWindowAutoSize_W32(const char* name)
     if (!window)
         CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
-    result = window->flags & CV_WINDOW_AUTOSIZE;
+    result = window->flags & cv::WINDOW_AUTOSIZE;
 
     return result;
 }
@@ -1055,11 +1055,11 @@ static std::shared_ptr<CvWindow> namedWindow_(const std::string& name, int flags
     CvRect rect;
     icvLoadWindowPos(name.c_str(), rect);
 
-    if (!(flags & CV_WINDOW_AUTOSIZE))//YV add border in order to resize the window
+    if (!(flags & cv::WINDOW_AUTOSIZE))//YV add border in order to resize the window
        defStyle |= WS_SIZEBOX;
 
 #ifdef HAVE_OPENGL
-    if (flags & CV_WINDOW_OPENGL)
+    if (flags & cv::WINDOW_OPENGL)
         defStyle |= WS_CLIPCHILDREN | WS_CLIPSIBLINGS;
 #endif
 
@@ -1076,14 +1076,14 @@ static std::shared_ptr<CvWindow> namedWindow_(const std::string& name, int flags
         CV_Error(Error::StsError, "Frame window can not be created");
 
 #ifndef HAVE_OPENGL
-    if (flags & CV_WINDOW_OPENGL)
+    if (flags & cv::WINDOW_OPENGL)
         CV_Error(Error::OpenGlNotSupported, "Library was built without OpenGL support");
 #else
     useGl = false;
     hGLDC = 0;
     hGLRC = 0;
 
-    if (flags & CV_WINDOW_OPENGL)
+    if (flags & cv::WINDOW_OPENGL)
         createGlContext(hWnd, hGLDC, hGLRC, useGl);
 #endif
 
@@ -1117,7 +1117,7 @@ static std::shared_ptr<CvWindow> namedWindow_(const std::string& name, int flags
 #endif
 
     window->last_key = 0;
-    window->status = CV_WINDOW_NORMAL;//YV
+    window->status = cv::WINDOW_NORMAL;//YV
 
     window->on_mouse = 0;
     window->on_mouse_param = 0;
@@ -1338,7 +1338,7 @@ static void icvUpdateWindowPos(CvWindow& window)
 {
     RECT rect = { 0 };
 
-    if ((window.flags & CV_WINDOW_AUTOSIZE) && window.image)
+    if ((window.flags & cv::WINDOW_AUTOSIZE) && window.image)
     {
         int i;
         SIZE size = {0,0};
@@ -1383,7 +1383,7 @@ cvShowImage(const char* name, const CvArr* arr)
         window = icvFindWindowByName(name);
         if (!window)
         {
-            cvNamedWindow(name, CV_WINDOW_AUTOSIZE);
+            cvNamedWindow(name, cv::WINDOW_AUTOSIZE);
             window = icvFindWindowByName(name);
         }
     }
@@ -1548,7 +1548,7 @@ MainWindowProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam)
         break;
 
     case WM_GETMINMAXINFO:
-        if (!(window.flags & CV_WINDOW_AUTOSIZE))
+        if (!(window.flags & cv::WINDOW_AUTOSIZE))
         {
             MINMAXINFO* minmax = (MINMAXINFO*)lParam;
             RECT rect = { 0 };
@@ -1579,7 +1579,7 @@ MainWindowProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam)
                 MoveWindow(window.toolbar.toolbar, 0, 0, pos->cx, rect.bottom - rect.top, TRUE);
             }
 
-            if (!(window.flags & CV_WINDOW_AUTOSIZE))
+            if (!(window.flags & cv::WINDOW_AUTOSIZE))
                 icvUpdateWindowPos(window);
 
             break;
@@ -1846,7 +1846,7 @@ static LRESULT CALLBACK HighGUIProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
             pt.x = GET_X_LPARAM(lParam);
             pt.y = GET_Y_LPARAM(lParam);
 
-            if (window.flags & CV_WINDOW_AUTOSIZE)
+            if (window.flags & cv::WINDOW_AUTOSIZE)
             {
                 // As user can't change window size, do not scale window coordinates. Underlying windowing system
                 // may prevent full window from being displayed and in this case coordinates should not be scaled.
@@ -1908,7 +1908,7 @@ static LRESULT CALLBACK HighGUIProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
                 SetDIBColorTable(window.dc, 0, 255, table);
             }
 
-            if (window.flags & CV_WINDOW_AUTOSIZE)
+            if (window.flags & cv::WINDOW_AUTOSIZE)
             {
                 BitBlt(hdc, 0, 0, size.cx, size.cy, window.dc, 0, 0, SRCCOPY);
             }
@@ -2161,7 +2161,7 @@ static void showSaveDialog(CvWindow& window)
 #ifdef HAVE_TIFF
                       "TIFF Files (*.tiff;*.tif)\0*.tiff;*.tif\0"
 #endif
-#ifdef HAVE_JASPER
+#if defined(HAVE_JASPER) || defined(HAVE_OPENJPEG)
                       "JPEG-2000 files (*.jp2)\0*.jp2\0"
 #endif
 #ifdef HAVE_WEBP
@@ -3004,6 +3004,11 @@ class Win32BackendUI : public UIBackend
     {
         return pollKey_W32();
     }
+
+    const std::string getName() const CV_OVERRIDE
+    {
+        return "WIN32";
+    }
 };  // Win32BackendUI
 
 static
diff --git a/modules/highgui/src/window_wayland.cpp b/modules/highgui/src/window_wayland.cpp
index 69231c00724a..3bf87c06ec22 100644
--- a/modules/highgui/src/window_wayland.cpp
+++ b/modules/highgui/src/window_wayland.cpp
@@ -84,20 +84,73 @@ static int xkb_keysym_to_ascii(xkb_keysym_t keysym) {
     return static_cast<int>(keysym & 0xff);
 }
 
+static void write_mat_to_xrgb8888(cv::Mat const &img_, void *data) {
+    // Validate destination data.
+    CV_CheckFalse((data == nullptr), "Destination Address must not be nullptr.");
+
+    // Validate source img parameters.
+    CV_CheckFalse(img_.empty(), "Source Mat must not be empty.");
+    const int ncn   = img_.channels();
+    CV_CheckType(img_.type(),
+        ( (ncn == 1) || (ncn == 3) || (ncn == 4)),
+        "Unsupported channels, please convert to 1, 3 or 4 channels"
+    );
 
-static void draw_xrgb8888(void *d, uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
-    *((uint32_t *) d) = ((a << 24) | (r << 16) | (g << 8) | b);
-}
-
-static void write_mat_to_xrgb8888(cv::Mat const &img, void *data) {
-    CV_Assert(data != nullptr);
-    CV_Assert(img.isContinuous());
+    // The supported Mat depth is according to imshow() specification.
+    const int depth = CV_MAT_DEPTH(img_.type());
+    CV_CheckDepth(img_.type(),
+        ( (depth == CV_8U)  || (depth == CV_8S)  ||
+          (depth == CV_16U) || (depth == CV_16S) ||
+          (depth == CV_32F) || (depth == CV_64F) ),
+        "Unsupported depth, please convert to CV_8U"
+    );
 
-    for (int y = 0; y < img.rows; y++) {
-        for (int x = 0; x < img.cols; x++) {
-            auto p = img.at<cv::Vec3b>(y, x);
-            draw_xrgb8888((char *) data + (y * img.cols + x) * 4, 0x00, p[2], p[1], p[0]);
-        }
+    // Convert to CV_8U
+    cv::Mat img;
+    const int mtype = CV_MAKE_TYPE(CV_8U, ncn);
+    switch(CV_MAT_DEPTH(depth))
+    {
+    case CV_8U:
+        img = img_; // do nothing.
+        break;
+    case CV_8S:
+        // [-128,127] -> [0,255]
+        img_.convertTo(img, mtype, 1.0, 128);
+        break;
+    case CV_16U:
+        // [0,65535] -> [0,255]
+        img_.convertTo(img, mtype, 1.0/255. );
+        break;
+    case CV_16S:
+        // [-32768,32767] -> [0,255]
+        img_.convertTo(img, mtype, 1.0/255. , 128);
+        break;
+    case CV_32F:
+    case CV_64F:
+        // [0, 1] -> [0,255]
+        img_.convertTo(img, mtype, 255.);
+        break;
+    default:
+        // it cannot be reachable.
+        break;
+    }
+    CV_CheckDepthEQ(CV_MAT_DEPTH(img.type()), CV_8U, "img should be CV_8U");
+
+    // XRGB8888 in Little Endian(Wayland Request) = [B8:G8:R8:X8] in data array.
+    // X is not used to show. So we can use cvtColor() with GRAY2BGRA or BGR2BGRA or copyTo().
+    cv::Mat dst(img.size(), CV_MAKE_TYPE(CV_8U, 4), (uint8_t*)data);
+    if(ncn == 1)
+    {
+        cvtColor(img, dst, cv::COLOR_GRAY2BGRA);
+    }
+    else if(ncn == 3)
+    {
+        cvtColor(img, dst, cv::COLOR_BGR2BGRA);
+    }
+    else
+    {
+        CV_CheckTrue(ncn==4, "Unexpected channels");
+        img.copyTo(dst);
     }
 }
 
@@ -231,7 +284,13 @@ class cv_wl_mouse {
             &handle_pointer_motion, &handle_pointer_button,
             &handle_pointer_axis, &handle_pointer_frame,
             &handle_pointer_axis_source, &handle_pointer_axis_stop,
-            &handle_pointer_axis_discrete
+            &handle_pointer_axis_discrete,
+#ifdef WL_POINTER_AXIS_VALUE120_SINCE_VERSION
+            &handle_axis_value120,
+#endif
+#ifdef WL_POINTER_AXIS_RELATIVE_DIRECTION_SINCE_VERSION
+            &handle_axis_relative_direction,
+#endif
     };
     cv_wl_window *focus_window_{};
 
@@ -277,6 +336,27 @@ class cv_wl_mouse {
         CV_UNUSED(axis);
         CV_UNUSED(discrete);
     }
+
+#ifdef WL_POINTER_AXIS_VALUE120_SINCE_VERSION
+    static void
+    handle_axis_value120(void *data, struct wl_pointer *wl_pointer, uint32_t axis, int32_t value120) {
+        CV_UNUSED(data);
+        CV_UNUSED(wl_pointer);
+        CV_UNUSED(axis);
+        CV_UNUSED(value120);
+    }
+#endif
+
+#ifdef WL_POINTER_AXIS_RELATIVE_DIRECTION_SINCE_VERSION
+    static void
+    handle_axis_relative_direction(void *data, struct wl_pointer *wl_pointer, uint32_t axis, uint32_t direction) {
+        CV_UNUSED(data);
+        CV_UNUSED(wl_pointer);
+        CV_UNUSED(axis);
+        CV_UNUSED(direction);
+    }
+#endif
+
 };
 
 class cv_wl_keyboard {
@@ -543,6 +623,9 @@ class cv_wl_viewer : public cv_wl_widget {
     cv::Rect last_img_area_;
     bool image_changed_ = false;
 
+    int real_img_width = 0;
+    cv::Scalar const outarea_color_ = CV_RGB(0, 0, 0);
+
     void *param_ = nullptr;
     CvMouseCallback callback_ = nullptr;
 };
@@ -659,7 +742,7 @@ class cv_wl_window {
 
     void show_image(cv::Mat const &image);
 
-    void create_trackbar(std::string const &name, int *value, int count, CvTrackbarCallback2 on_change, void *userdata);
+    int create_trackbar(std::string const &name, int *value, int count, CvTrackbarCallback2 on_change, void *userdata);
 
     weak_ptr<cv_wl_trackbar> get_trackbar(std::string const &) const;
 
@@ -695,7 +778,13 @@ class cv_wl_window {
     };
     struct xdg_toplevel *xdg_toplevel_;
     struct xdg_toplevel_listener xdgtop_listener_{
-            &handle_toplevel_configure, &handle_toplevel_close
+            &handle_toplevel_configure, &handle_toplevel_close,
+#ifdef XDG_TOPLEVEL_CONFIGURE_BOUNDS_SINCE_VERSION
+            &handle_toplevel_configure_bounds,
+#endif
+#ifdef XDG_TOPLEVEL_WM_CAPABILITIES_SINCE_VERSION
+            &handle_toplevel_wm_capabilities,
+#endif
     };
     bool wait_for_configure_ = true;
 
@@ -742,6 +831,27 @@ class cv_wl_window {
 
     static void handle_toplevel_close(void *data, struct xdg_toplevel *surface);
 
+#ifdef XDG_TOPLEVEL_CONFIGURE_BOUNDS_SINCE_VERSION
+    static void
+    handle_toplevel_configure_bounds(void *data, struct xdg_toplevel *xdg_toplevel, int32_t width, int32_t height)
+    {
+        CV_UNUSED(data);
+        CV_UNUSED(xdg_toplevel);
+        CV_UNUSED(width);
+        CV_UNUSED(height);
+    }
+#endif
+
+#ifdef XDG_TOPLEVEL_WM_CAPABILITIES_SINCE_VERSION
+    static void
+    handle_toplevel_wm_capabilities(void *data, struct xdg_toplevel *xdg_toplevel, struct wl_array *capabilities)
+    {
+        CV_UNUSED(data);
+        CV_UNUSED(xdg_toplevel);
+        CV_UNUSED(capabilities);
+    }
+#endif
+
     static void handle_frame_callback(void *data, struct wl_callback *cb, uint32_t time);
 };
 
@@ -1055,7 +1165,7 @@ void cv_wl_keyboard::handle_kb_keymap(void *data, struct wl_keyboard *kb, uint32
     } catch (std::exception &e) {
         if (keyboard->xkb_.keymap)
             xkb_keymap_unref(keyboard->xkb_.keymap);
-        std::cerr << "OpenCV Error: " << e.what() << std::endl;
+        CV_LOG_ERROR(NULL, "OpenCV Error: " << e.what());
     }
 
     close(fd);
@@ -1442,18 +1552,18 @@ cv::Rect cv_wl_titlebar::draw(void *data, cv::Size const &size, bool force) {
             cv::putText(
                     buf_, window_->get_title(),
                     origin, title_.face, title_.scale,
-                    CV_RGB(0xff, 0xff, 0xff), title_.thickness, CV_AA
+                    CV_RGB(0xff, 0xff, 0xff), title_.thickness, cv::LINE_AA
             );
         }
 
         buf_(cv::Rect(btn_min_.tl(), cv::Size(titlebar_min_width, size.height))) = bg_color_;
-        cv::line(buf_, btn_cls.tl(), btn_cls.br(), line_color_, 1, CV_AA);
+        cv::line(buf_, btn_cls.tl(), btn_cls.br(), line_color_, 1, cv::LINE_AA);
         cv::line(buf_, btn_cls.tl() + cv::Point(btn_cls.width, 0), btn_cls.br() - cv::Point(btn_cls.width, 0),
-                 line_color_, 1, CV_AA);
-        cv::rectangle(buf_, btn_max.tl(), btn_max.br(), line_color_, 1, CV_AA);
+                 line_color_, 1, cv::LINE_AA);
+        cv::rectangle(buf_, btn_max.tl(), btn_max.br(), line_color_, 1, cv::LINE_AA);
         cv::line(buf_, cv::Point(btn_min_.x + 8, btn_min_.height / 2),
-                 cv::Point(btn_min_.x + btn_min_.width - 8, btn_min_.height / 2), line_color_, 1, CV_AA);
-        cv::line(buf_, cv::Point(0, 0), cv::Point(buf_.size().width, 0), border_color_, 1, CV_AA);
+                 cv::Point(btn_min_.x + btn_min_.width - 8, btn_min_.height / 2), line_color_, 1, cv::LINE_AA);
+        cv::line(buf_, cv::Point(0, 0), cv::Point(buf_.size().width, 0), border_color_, 1, cv::LINE_AA);
 
         write_mat_to_xrgb8888(buf_, data);
         last_size_ = size;
@@ -1473,14 +1583,30 @@ cv_wl_viewer::cv_wl_viewer(cv_wl_window *window, int flags)
 }
 
 void cv_wl_viewer::set_image(cv::Mat const &image) {
-    if (image.type() == CV_8UC1) {
-        cv::Mat bgr;
-        cv::cvtColor(image, bgr, CV_GRAY2BGR);
-        image_ = bgr.clone();
-    } else {
-        image_ = image.clone();
-    }
+    image_ = image.clone();
     image_changed_ = true;
+
+    // See https://github.com/opencv/opencv/issues/25560
+    // If image_ width is too small enough to show title and buttons, expand it.
+
+    // Keep real image width to limit x position for callback functions
+    real_img_width = image_.size().width;
+
+    // Minimum width of title is not defined, so use button width * 3 instead of it.
+    const int view_min_width = cv_wl_titlebar::btn_width * 3 + cv_wl_titlebar::titlebar_min_width;
+
+    const int margin = view_min_width - real_img_width;
+    if(margin > 0)
+    {
+        copyMakeBorder(image_,               // src
+                       image_,               // dst
+                       0,                    // top
+                       0,                    // bottom
+                       0,                    // left
+                       margin,               // right
+                       cv::BORDER_CONSTANT,  // borderType
+                       outarea_color_ );     // value(color)
+    }
 }
 
 void cv_wl_viewer::set_mouse_callback(CvMouseCallback callback, void *param) {
@@ -1509,7 +1635,7 @@ void cv_wl_viewer::get_preferred_height_for_width(int width, int &minimum, int &
         minimum = natural = image_.size().height;
     } else {
         natural = static_cast<int>(width * aspect_ratio(image_.size()));
-        minimum = (flags_ & CV_WINDOW_FREERATIO ? 0 : natural);
+        minimum = (flags_ & cv::WINDOW_FREERATIO ? 0 : natural);
     }
 }
 
@@ -1533,6 +1659,8 @@ void cv_wl_viewer::on_mouse(int event, cv::Point const &p, int flag) {
             int x = static_cast<int>((p.x - last_img_area_.x) * ((double) image_.size().width / last_img_area_.width));
             int y = static_cast<int>((p.y - last_img_area_.y) *
                                      ((double) image_.size().height / last_img_area_.height));
+
+            x = cv::min(x, real_img_width);
             callback_(event, x, y, flag, param_);
         }
     }
@@ -1548,11 +1676,11 @@ cv::Rect cv_wl_viewer::draw(void *data, cv::Size const &size, bool force) {
         CV_Assert(image_.size() == size);
         write_mat_to_xrgb8888(image_, data);
     } else {
-        if (flags_ & CV_WINDOW_FREERATIO) {
+        if (flags_ & cv::WINDOW_FREERATIO) {
             cv::Mat resized;
             cv::resize(image_, resized, size);
             write_mat_to_xrgb8888(resized, data);
-        } else /* CV_WINDOW_KEEPRATIO */ {
+        } else /* cv::WINDOW_KEEPRATIO */ {
             auto rect = cv::Rect(cv::Point(0, 0), size);
             if (aspect_ratio(size) >= aspect_ratio(image_.size())) {
                 rect.height = static_cast<int>(image_.size().height * ((double) rect.width / image_.size().width));
@@ -1588,6 +1716,11 @@ cv_wl_trackbar::cv_wl_trackbar(cv_wl_window *window, std::string name,
     on_change_.value = value;
     on_change_.data = data;
     on_change_.callback = on_change;
+
+    // initilize slider_.value if value is not nullptr.
+    if (value != nullptr){
+        set_pos(*value);
+    }
 }
 
 std::string const &cv_wl_trackbar::name() const {
@@ -1603,6 +1736,12 @@ void cv_wl_trackbar::set_pos(int value) {
         slider_.value = value;
         slider_moved_ = true;
         window_->show();
+
+        // Update user-ptr value and call on_change() function if cv_wl_trackbar::draw() is not called.
+        if(slider_moved_) {
+            on_change_.update(slider_.value);
+            on_change_.call(slider_.value);
+        }
     }
 }
 
@@ -1612,6 +1751,12 @@ void cv_wl_trackbar::set_max(int maxval) {
         slider_.value = maxval;
         slider_moved_ = true;
         window_->show();
+
+        // Update user-ptr and call on_change() function if cv_wl_trackbar::draw() is not called.
+        if(slider_moved_) {
+            on_change_.update(slider_.value);
+            on_change_.call(slider_.value);
+        }
     }
 }
 
@@ -1657,12 +1802,12 @@ cv::Rect cv_wl_trackbar::draw(void *data, cv::Size const &size, bool force) {
                 data_,
                 (name_ + ": " + std::to_string(slider_.value)),
                 bar_.text_orig, bar_.fontface, bar_.fontscale,
-                CV_RGB(0x00, 0x00, 0x00), bar_.font_thickness, CV_AA);
+                CV_RGB(0x00, 0x00, 0x00), bar_.font_thickness, cv::LINE_AA);
 
-        cv::line(data_, bar_.left, bar_.right, color_.bg, bar_.thickness + 3, CV_AA);
-        cv::line(data_, bar_.left, bar_.right, color_.fg, bar_.thickness, CV_AA);
-        cv::circle(data_, slider_.pos, slider_.radius, color_.fg, -1, CV_AA);
-        cv::circle(data_, slider_.pos, slider_.radius, color_.bg, 1, CV_AA);
+        cv::line(data_, bar_.left, bar_.right, color_.bg, bar_.thickness + 3, cv::LINE_AA);
+        cv::line(data_, bar_.left, bar_.right, color_.fg, bar_.thickness, cv::LINE_AA);
+        cv::circle(data_, slider_.pos, slider_.radius, color_.fg, -1, cv::LINE_AA);
+        cv::circle(data_, slider_.pos, slider_.radius, color_.bg, 1, cv::LINE_AA);
 
         write_mat_to_xrgb8888(data_, data);
         damage = cv::Rect(cv::Point(0, 0), size);
@@ -1782,8 +1927,9 @@ void cv_wl_window::show_image(cv::Mat const &image) {
     this->show();
 }
 
-void cv_wl_window::create_trackbar(std::string const &name, int *value, int count, CvTrackbarCallback2 on_change,
+int cv_wl_window::create_trackbar(std::string const &name, int *value, int count, CvTrackbarCallback2 on_change,
                                    void *userdata) {
+    int ret = 0;
     auto exists = this->get_trackbar(name).lock();
     if (!exists) {
         auto trackbar =
@@ -1792,7 +1938,9 @@ void cv_wl_window::create_trackbar(std::string const &name, int *value, int coun
                 );
         widgets_.emplace_back(trackbar);
         widget_geometries_.emplace_back(0, 0, 0, 0);
+        ret = 1;
     }
+    return ret;
 }
 
 weak_ptr<cv_wl_trackbar> cv_wl_window::get_trackbar(std::string const &trackbar_name) const {
@@ -2270,6 +2418,7 @@ std::string const &cv_wl_core::get_window_name(void *handle) {
 }
 
 bool cv_wl_core::create_window(std::string const &name, int flags) {
+    CV_CheckTrue(display_ != nullptr, "Display is not connected.");
     auto window = std::make_shared<cv_wl_window>(display_, name, flags);
     auto result = windows_.insert(std::make_pair(name, window));
     handles_[window.get()] = window->get_title();
@@ -2348,6 +2497,13 @@ CV_IMPL void cvResizeWindow(const char *name, int width, int height) {
         throw_system_error("Could not get window name", errno)
 }
 
+CvRect cvGetWindowRect_WAYLAND(const char* name)
+{
+    CV_UNUSED(name);
+    CV_LOG_ONCE_WARNING(nullptr, "Function not implemented: User cannot get window rect in Wayland");
+    return cvRect(-1, -1, -1, -1);
+}
+
 CV_IMPL int cvCreateTrackbar(const char *name_bar, const char *window_name, int *value, int count,
                              CvTrackbarCallback on_change) {
     CV_UNUSED(name_bar);
@@ -2362,10 +2518,11 @@ CV_IMPL int cvCreateTrackbar(const char *name_bar, const char *window_name, int
 
 CV_IMPL int cvCreateTrackbar2(const char *trackbar_name, const char *window_name, int *val, int count,
                               CvTrackbarCallback2 on_notify, void *userdata) {
+    int ret = 0;
     if (auto window = CvWlCore::getInstance().get_window(window_name))
-        window->create_trackbar(trackbar_name, val, count, on_notify, userdata);
+        ret = window->create_trackbar(trackbar_name, val, count, on_notify, userdata);
 
-    return 0;
+    return ret;
 }
 
 CV_IMPL int cvGetTrackbarPos(const char *trackbar_name, const char *window_name) {
@@ -2406,11 +2563,17 @@ CV_IMPL void cvSetMouseCallback(const char *window_name, CvMouseCallback on_mous
 }
 
 CV_IMPL void cvShowImage(const char *name, const CvArr *arr) {
-    auto cv_core = CvWlCore::getInstance();
-    auto window = cv_core.get_window(name);
+    // see https://github.com/opencv/opencv/issues/25497
+    /*
+     * To reuse the result of getInstance() repeatedly looks like better efficient implementation.
+     * However, it defined as static shared_ptr member variable in CvWlCore.
+     * If it reaches out of scope, cv_wl_core::~cv_wl_core() is called and all windows will be destroyed.
+     * For workaround, avoid it.
+     */
+    auto window = CvWlCore::getInstance().get_window(name);
     if (!window) {
-        cv_core.create_window(name, cv::WINDOW_AUTOSIZE);
-        if (!(window = cv_core.get_window(name)))
+        CvWlCore::getInstance().create_window(name, cv::WINDOW_AUTOSIZE);
+        if (!(window = CvWlCore::getInstance().get_window(name)))
             CV_Error_(StsNoMem, ("Failed to create window: %s", name));
     }
 
@@ -2425,11 +2588,16 @@ void setWindowTitle_WAYLAND(const cv::String &winname, const cv::String &title)
 
 CV_IMPL int cvWaitKey(int delay) {
     int key = -1;
-    auto limit = ch::duration_cast<ch::nanoseconds>(ch::milliseconds(delay));
+    auto limit = ch::duration_cast<ch::nanoseconds>(ch::milliseconds(delay)).count();
     auto start_time = ch::duration_cast<ch::nanoseconds>(
             ch::steady_clock::now().time_since_epoch())
             .count();
 
+    // See https://github.com/opencv/opencv/issues/25501
+    // Too long sleep_for() makes no response to Wayland ping-pong mechanism
+    // So interval is limited to 33ms (1000ms / 30fps).
+    auto sleep_time_min = ch::duration_cast<ch::nanoseconds>(ch::milliseconds(33)).count();
+
     while (true) {
 
         auto res = CvWlCore::getInstance().display().run_once();
@@ -2448,11 +2616,11 @@ CV_IMPL int cvWaitKey(int delay) {
                 .count();
 
         auto elapsed = end_time - start_time;
-        if (limit.count() > 0 && elapsed >= limit.count()) {
+        if (limit > 0 && elapsed >= limit) {
             break;
         }
 
-        auto sleep_time = 64000 - elapsed;
+        auto sleep_time = std::min(limit - elapsed, sleep_time_min);
         if (sleep_time > 0) {
             std::this_thread::sleep_for(ch::nanoseconds(sleep_time));
         }
diff --git a/modules/highgui/src/window_winrt.cpp b/modules/highgui/src/window_winrt.cpp
index af771bd00b9c..93d14e3aef56 100644
--- a/modules/highgui/src/window_winrt.cpp
+++ b/modules/highgui/src/window_winrt.cpp
@@ -36,7 +36,7 @@
 
 #define CV_WINRT_NO_GUI_ERROR( funcname )       \
 {                                               \
-    cvError( CV_StsNotImplemented, funcname,    \
+    cvError( cv::Error::StsNotImplemented, funcname,    \
     "The function is not implemented. ",        \
     __FILE__, __LINE__ );                       \
 }
@@ -65,7 +65,7 @@ CV_IMPL void cvShowImage(const char* name, const CvArr* arr)
     CvMat stub, *image;
 
     if (!name)
-        CV_ERROR(CV_StsNullPtr, "NULL name");
+        CV_ERROR(cv::Error::StsNullPtr, "NULL name");
 
     CvWindow* window = HighguiBridge::getInstance().namedWindow(name);
 
@@ -89,7 +89,7 @@ CV_IMPL int cvNamedWindow(const char* name, int flags)
     CV_FUNCNAME("cvNamedWindow");
 
     if (!name)
-        CV_ERROR(CV_StsNullPtr, "NULL name");
+        CV_ERROR(cv::Error::StsNullPtr, "NULL name");
 
     HighguiBridge::getInstance().namedWindow(name);
 
@@ -101,7 +101,7 @@ CV_IMPL void cvDestroyWindow(const char* name)
     CV_FUNCNAME("cvDestroyWindow");
 
     if (!name)
-        CV_ERROR(CV_StsNullPtr, "NULL name string");
+        CV_ERROR(cv::Error::StsNullPtr, "NULL name string");
 
     HighguiBridge::getInstance().destroyWindow(name);
 }
@@ -119,16 +119,16 @@ CV_IMPL int cvCreateTrackbar2(const char* trackbar_name, const char* window_name
     int pos = 0;
 
     if (!window_name || !trackbar_name)
-        CV_ERROR(CV_StsNullPtr, "NULL window or trackbar name");
+        CV_ERROR(cv::Error::StsNullPtr, "NULL window or trackbar name");
 
     if (count < 0)
-        CV_ERROR(CV_StsOutOfRange, "Bad trackbar max value");
+        CV_ERROR(cv::Error::StsOutOfRange, "Bad trackbar max value");
 
     CvWindow* window = HighguiBridge::getInstance().namedWindow(window_name);
 
     if (!window)
     {
-        CV_ERROR(CV_StsNullPtr, "NULL window");
+        CV_ERROR(cv::Error::StsNullPtr, "NULL window");
     }
 
     window->createSlider(trackbar_name, val, count, on_notify, userdata);
@@ -143,7 +143,7 @@ CV_IMPL void cvSetTrackbarPos(const char* trackbar_name, const char* window_name
     CvTrackbar* trackbar = 0;
 
     if (trackbar_name == 0 || window_name == 0)
-        CV_ERROR(CV_StsNullPtr, "NULL trackbar or window name");
+        CV_ERROR(cv::Error::StsNullPtr, "NULL trackbar or window name");
 
     CvWindow* window = HighguiBridge::getInstance().findWindowByName(window_name);
     if (window)
@@ -160,7 +160,7 @@ CV_IMPL void cvSetTrackbarMax(const char* trackbar_name, const char* window_name
     if (maxval >= 0)
     {
         if (trackbar_name == 0 || window_name == 0)
-            CV_ERROR(CV_StsNullPtr, "NULL trackbar or window name");
+            CV_ERROR(cv::Error::StsNullPtr, "NULL trackbar or window name");
 
         CvTrackbar* trackbar = HighguiBridge::getInstance().findTrackbarByName(trackbar_name, window_name);
 
@@ -176,7 +176,7 @@ CV_IMPL void cvSetTrackbarMin(const char* trackbar_name, const char* window_name
     if (minval >= 0)
     {
         if (trackbar_name == 0 || window_name == 0)
-            CV_ERROR(CV_StsNullPtr, "NULL trackbar or window name");
+            CV_ERROR(cv::Error::StsNullPtr, "NULL trackbar or window name");
 
         CvTrackbar* trackbar = HighguiBridge::getInstance().findTrackbarByName(trackbar_name, window_name);
 
@@ -192,7 +192,7 @@ CV_IMPL int cvGetTrackbarPos(const char* trackbar_name, const char* window_name)
     CV_FUNCNAME("cvGetTrackbarPos");
 
     if (trackbar_name == 0 || window_name == 0)
-        CV_ERROR(CV_StsNullPtr, "NULL trackbar or window name");
+        CV_ERROR(cv::Error::StsNullPtr, "NULL trackbar or window name");
 
     CvTrackbar* trackbar = HighguiBridge::getInstance().findTrackbarByName(trackbar_name, window_name);
 
@@ -229,7 +229,7 @@ CV_IMPL void cvSetMouseCallback(const char* window_name, CvMouseCallback on_mous
     CV_FUNCNAME("cvSetMouseCallback");
 
     if (!window_name)
-        CV_ERROR(CV_StsNullPtr, "NULL window name");
+        CV_ERROR(cv::Error::StsNullPtr, "NULL window name");
 
     CvWindow* window = HighguiBridge::getInstance().findWindowByName(window_name);
     if (!window)
@@ -253,19 +253,19 @@ CV_IMPL void cvResizeWindow(const char* name, int width, int height)
 CV_IMPL int cvInitSystem(int, char**)
 {
     CV_WINRT_NO_GUI_ERROR("cvInitSystem");
-    return CV_StsNotImplemented;
+    return cv::Error::StsNotImplemented;
 }
 
 CV_IMPL void* cvGetWindowHandle(const char*)
 {
     CV_WINRT_NO_GUI_ERROR("cvGetWindowHandle");
-    return (void*) CV_StsNotImplemented;
+    return (void*) cv::Error::StsNotImplemented;
 }
 
 CV_IMPL const char* cvGetWindowName(void*)
 {
     CV_WINRT_NO_GUI_ERROR("cvGetWindowName");
-    return (const char*) CV_StsNotImplemented;
+    return (const char*) cv::Error::StsNotImplemented;
 }
 
 void cvSetModeWindow_WinRT(const char* name, double prop_value) {
@@ -274,10 +274,10 @@ void cvSetModeWindow_WinRT(const char* name, double prop_value) {
 
 double cvGetModeWindow_WinRT(const char* name) {
     CV_WINRT_NO_GUI_ERROR("cvGetModeWindow");
-    return CV_StsNotImplemented;
+    return cv::Error::StsNotImplemented;
 }
 
 CV_IMPL int cvStartWindowThread() {
     CV_WINRT_NO_GUI_ERROR("cvStartWindowThread");
-    return CV_StsNotImplemented;
+    return cv::Error::StsNotImplemented;
 }
diff --git a/modules/highgui/test/test_gui.cpp b/modules/highgui/test/test_gui.cpp
index de40e80ede7d..5b72545faf5f 100644
--- a/modules/highgui/test/test_gui.cpp
+++ b/modules/highgui/test/test_gui.cpp
@@ -48,6 +48,16 @@ inline void verify_size(const std::string &nm, const cv::Mat &img)
 {
     EXPECT_NO_THROW(imshow(nm, img));
     EXPECT_EQ(-1, waitKey(200));
+
+    // see https://github.com/opencv/opencv/issues/25550
+    // Wayland backend is not supported getWindowImageRect().
+    string framework;
+    EXPECT_NO_THROW(framework = currentUIFramework());
+    if(framework == "WAYLAND")
+    {
+       return;
+    }
+
     Rect rc;
     EXPECT_NO_THROW(rc = getWindowImageRect(nm));
     EXPECT_EQ(rc.size(), img.size());
@@ -205,4 +215,42 @@ TEST(Highgui_GUI, trackbar)
     EXPECT_NO_THROW(destroyAllWindows());
 }
 
+// See https://github.com/opencv/opencv/issues/25560
+#if (!defined(ENABLE_PLUGINS) \
+        && !defined HAVE_GTK \
+        && !defined HAVE_QT \
+        && !defined HAVE_WIN32UI \
+        && !defined HAVE_WAYLAND)
+TEST(Highgui_GUI, DISABLED_small_width_image)
+#else
+TEST(Highgui_GUI, small_width_image)
+#endif
+{
+    const std::string window_name("trackbar_test_window");
+    cv::Mat src(1,1,CV_8UC3,cv::Scalar(0));
+    EXPECT_NO_THROW(destroyAllWindows());
+    ASSERT_NO_THROW(namedWindow(window_name));
+    ASSERT_NO_THROW(imshow(window_name, src));
+    EXPECT_NO_THROW(waitKey(10));
+    EXPECT_NO_THROW(destroyAllWindows());
+}
+
+TEST(Highgui_GUI, currentUIFramework)
+{
+    auto framework = currentUIFramework();
+    std::cout << "UI framework: \"" << framework << "\"" << std::endl;
+#if (!defined(ENABLE_PLUGINS) \
+        && !defined HAVE_GTK \
+        && !defined HAVE_QT \
+        && !defined HAVE_WIN32UI \
+        && !defined HAVE_COCOA \
+        && !defined HAVE_WAYLAND \
+    )
+    EXPECT_TRUE(framework.empty());
+#elif !defined(ENABLE_PLUGINS)
+    EXPECT_GT(framework.size(), 0);  // builtin backends
+#endif
+}
+
+
 }} // namespace
diff --git a/modules/imgcodecs/CMakeLists.txt b/modules/imgcodecs/CMakeLists.txt
index 8183837c4374..1468d4d73b1c 100644
--- a/modules/imgcodecs/CMakeLists.txt
+++ b/modules/imgcodecs/CMakeLists.txt
@@ -137,11 +137,11 @@ if(APPLE OR APPLE_FRAMEWORK)
   list(APPEND imgcodecs_srcs ${CMAKE_CURRENT_LIST_DIR}/src/apple_conversions.h)
   list(APPEND imgcodecs_srcs ${CMAKE_CURRENT_LIST_DIR}/src/apple_conversions.mm)
 endif()
-if(IOS)
+if(IOS OR XROS)
   list(APPEND imgcodecs_srcs ${CMAKE_CURRENT_LIST_DIR}/src/ios_conversions.mm)
   list(APPEND IMGCODECS_LIBRARIES "-framework UIKit")
 endif()
-if(APPLE AND (NOT IOS))
+if(APPLE AND (NOT IOS) AND (NOT XROS))
   list(APPEND imgcodecs_srcs ${CMAKE_CURRENT_LIST_DIR}/src/macosx_conversions.mm)
   list(APPEND IMGCODECS_LIBRARIES "-framework AppKit")
 endif()
diff --git a/modules/imgcodecs/include/opencv2/imgcodecs.hpp b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
index c1bdf722913d..eba25ce1cfb8 100644
--- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp
+++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
@@ -48,7 +48,6 @@
 /**
   @defgroup imgcodecs Image file reading and writing
   @{
-    @defgroup imgcodecs_c C API
     @defgroup imgcodecs_flags Flags used for image file reading and writing
     @defgroup imgcodecs_ios iOS glue
     @defgroup imgcodecs_macosx MacOS(OSX) glue
@@ -88,23 +87,25 @@ enum ImwriteFlags {
        IMWRITE_JPEG_PROGRESSIVE    = 2,  //!< Enable JPEG features, 0 or 1, default is False.
        IMWRITE_JPEG_OPTIMIZE       = 3,  //!< Enable JPEG features, 0 or 1, default is False.
        IMWRITE_JPEG_RST_INTERVAL   = 4,  //!< JPEG restart interval, 0 - 65535, default is 0 - no restart.
-       IMWRITE_JPEG_LUMA_QUALITY   = 5,  //!< Separate luma quality level, 0 - 100, default is -1 - don't use.
-       IMWRITE_JPEG_CHROMA_QUALITY = 6,  //!< Separate chroma quality level, 0 - 100, default is -1 - don't use.
+       IMWRITE_JPEG_LUMA_QUALITY   = 5,  //!< Separate luma quality level, 0 - 100, default is -1 - don't use. If JPEG_LIB_VERSION < 70, Not supported.
+       IMWRITE_JPEG_CHROMA_QUALITY = 6,  //!< Separate chroma quality level, 0 - 100, default is -1 - don't use. If JPEG_LIB_VERSION < 70, Not supported.
        IMWRITE_JPEG_SAMPLING_FACTOR = 7, //!< For JPEG, set sampling factor. See cv::ImwriteJPEGSamplingFactorParams.
        IMWRITE_PNG_COMPRESSION     = 16, //!< For PNG, it can be the compression level from 0 to 9. A higher value means a smaller size and longer compression time. If specified, strategy is changed to IMWRITE_PNG_STRATEGY_DEFAULT (Z_DEFAULT_STRATEGY). Default value is 1 (best speed setting).
        IMWRITE_PNG_STRATEGY        = 17, //!< One of cv::ImwritePNGFlags, default is IMWRITE_PNG_STRATEGY_RLE.
        IMWRITE_PNG_BILEVEL         = 18, //!< Binary level PNG, 0 or 1, default is 0.
        IMWRITE_PXM_BINARY          = 32, //!< For PPM, PGM, or PBM, it can be a binary format flag, 0 or 1. Default value is 1.
-       IMWRITE_EXR_TYPE            = (3 << 4) + 0, /* 48 */ //!< override EXR storage type (FLOAT (FP32) is default)
-       IMWRITE_EXR_COMPRESSION     = (3 << 4) + 1, /* 49 */ //!< override EXR compression type (ZIP_COMPRESSION = 3 is default)
-       IMWRITE_EXR_DWA_COMPRESSION_LEVEL = (3 << 4) + 2, /* 50 */ //!< override EXR DWA compression level (45 is default)
+       IMWRITE_EXR_TYPE            = (3 << 4) + 0 /* 48 */, //!< override EXR storage type (FLOAT (FP32) is default)
+       IMWRITE_EXR_COMPRESSION     = (3 << 4) + 1 /* 49 */, //!< override EXR compression type (ZIP_COMPRESSION = 3 is default)
+       IMWRITE_EXR_DWA_COMPRESSION_LEVEL = (3 << 4) + 2 /* 50 */, //!< override EXR DWA compression level (45 is default)
        IMWRITE_WEBP_QUALITY        = 64, //!< For WEBP, it can be a quality from 1 to 100 (the higher is the better). By default (without any parameter) and for quality above 100 the lossless compression is used.
-       IMWRITE_HDR_COMPRESSION     = (5 << 4) + 0, /* 80 */ //!< specify HDR compression
+       IMWRITE_HDR_COMPRESSION     = (5 << 4) + 0 /* 80 */, //!< specify HDR compression
        IMWRITE_PAM_TUPLETYPE       = 128,//!< For PAM, sets the TUPLETYPE field to the corresponding string value that is defined for the format
        IMWRITE_TIFF_RESUNIT        = 256,//!< For TIFF, use to specify which DPI resolution unit to set; see libtiff documentation for valid values
        IMWRITE_TIFF_XDPI           = 257,//!< For TIFF, use to specify the X direction DPI
        IMWRITE_TIFF_YDPI           = 258,//!< For TIFF, use to specify the Y direction DPI
-       IMWRITE_TIFF_COMPRESSION    = 259,//!< For TIFF, use to specify the image compression scheme. See libtiff for integer constants corresponding to compression formats. Note, for images whose depth is CV_32F, only libtiff's SGILOG compression scheme is used. For other supported depths, the compression scheme can be specified by this flag; LZW compression is the default.
+       IMWRITE_TIFF_COMPRESSION    = 259,//!< For TIFF, use to specify the image compression scheme. See cv::ImwriteTiffCompressionFlags. Note, for images whose depth is CV_32F, only libtiff's SGILOG compression scheme is used. For other supported depths, the compression scheme can be specified by this flag; LZW compression is the default.
+       IMWRITE_TIFF_ROWSPERSTRIP   = 278,//!< For TIFF, use to specify the number of rows per strip.
+       IMWRITE_TIFF_PREDICTOR      = 317,//!< For TIFF, use to specify predictor. See cv::ImwriteTiffPredictorFlags.
        IMWRITE_JPEG2000_COMPRESSION_X1000 = 272,//!< For JPEG2000, use to specify the target compression rate (multiplied by 1000). The value can be from 0 to 1000. Default is 1000.
        IMWRITE_AVIF_QUALITY        = 512,//!< For AVIF, it can be a quality between 0 and 100 (the higher the better). Default is 95.
        IMWRITE_AVIF_DEPTH          = 513,//!< For AVIF, it can be 8, 10 or 12. If >8, it is stored/read as CV_32F. Default is 8.
@@ -119,6 +120,48 @@ enum ImwriteJPEGSamplingFactorParams {
        IMWRITE_JPEG_SAMPLING_FACTOR_444 = 0x111111  //!< 1x1,1x1,1x1(No subsampling)
      };
 
+enum ImwriteTiffCompressionFlags {
+        IMWRITE_TIFF_COMPRESSION_NONE = 1,            //!< dump mode
+        IMWRITE_TIFF_COMPRESSION_CCITTRLE = 2,        //!< CCITT modified Huffman RLE
+        IMWRITE_TIFF_COMPRESSION_CCITTFAX3 = 3,       //!< CCITT Group 3 fax encoding
+        IMWRITE_TIFF_COMPRESSION_CCITT_T4 = 3,        //!< CCITT T.4 (TIFF 6 name)
+        IMWRITE_TIFF_COMPRESSION_CCITTFAX4 = 4,       //!< CCITT Group 4 fax encoding
+        IMWRITE_TIFF_COMPRESSION_CCITT_T6 = 4,        //!< CCITT T.6 (TIFF 6 name)
+        IMWRITE_TIFF_COMPRESSION_LZW = 5,             //!< Lempel-Ziv  & Welch
+        IMWRITE_TIFF_COMPRESSION_OJPEG = 6,           //!< !6.0 JPEG
+        IMWRITE_TIFF_COMPRESSION_JPEG = 7,            //!< %JPEG DCT compression
+        IMWRITE_TIFF_COMPRESSION_T85 = 9,             //!< !TIFF/FX T.85 JBIG compression
+        IMWRITE_TIFF_COMPRESSION_T43 = 10,            //!< !TIFF/FX T.43 colour by layered JBIG compression
+        IMWRITE_TIFF_COMPRESSION_NEXT = 32766,        //!< NeXT 2-bit RLE
+        IMWRITE_TIFF_COMPRESSION_CCITTRLEW = 32771,   //!< #1 w/ word alignment
+        IMWRITE_TIFF_COMPRESSION_PACKBITS = 32773,    //!< Macintosh RLE
+        IMWRITE_TIFF_COMPRESSION_THUNDERSCAN = 32809, //!< ThunderScan RLE
+        IMWRITE_TIFF_COMPRESSION_IT8CTPAD = 32895,    //!< IT8 CT w/padding
+        IMWRITE_TIFF_COMPRESSION_IT8LW = 32896,       //!< IT8 Linework RLE
+        IMWRITE_TIFF_COMPRESSION_IT8MP = 32897,       //!< IT8 Monochrome picture
+        IMWRITE_TIFF_COMPRESSION_IT8BL = 32898,       //!< IT8 Binary line art
+        IMWRITE_TIFF_COMPRESSION_PIXARFILM = 32908,   //!< Pixar companded 10bit LZW
+        IMWRITE_TIFF_COMPRESSION_PIXARLOG = 32909,    //!< Pixar companded 11bit ZIP
+        IMWRITE_TIFF_COMPRESSION_DEFLATE = 32946,     //!< Deflate compression, legacy tag
+        IMWRITE_TIFF_COMPRESSION_ADOBE_DEFLATE = 8,   //!< Deflate compression, as recognized by Adobe
+        IMWRITE_TIFF_COMPRESSION_DCS = 32947,         //!< Kodak DCS encoding
+        IMWRITE_TIFF_COMPRESSION_JBIG = 34661,        //!< ISO JBIG
+        IMWRITE_TIFF_COMPRESSION_SGILOG = 34676,      //!< SGI Log Luminance RLE
+        IMWRITE_TIFF_COMPRESSION_SGILOG24 = 34677,    //!< SGI Log 24-bit packed
+        IMWRITE_TIFF_COMPRESSION_JP2000 = 34712,      //!< Leadtools JPEG2000
+        IMWRITE_TIFF_COMPRESSION_LERC = 34887,        //!< ESRI Lerc codec: https://github.com/Esri/lerc
+        IMWRITE_TIFF_COMPRESSION_LZMA = 34925,        //!< LZMA2
+        IMWRITE_TIFF_COMPRESSION_ZSTD = 50000,        //!< ZSTD: WARNING not registered in Adobe-maintained registry
+        IMWRITE_TIFF_COMPRESSION_WEBP = 50001,        //!< WEBP: WARNING not registered in Adobe-maintained registry
+        IMWRITE_TIFF_COMPRESSION_JXL = 50002          //!< JPEGXL: WARNING not registered in Adobe-maintained registry
+};
+
+enum ImwriteTiffPredictorFlags {
+        IMWRITE_TIFF_PREDICTOR_NONE = 1,              //!< no prediction scheme used
+        IMWRITE_TIFF_PREDICTOR_HORIZONTAL = 2,        //!< horizontal differencing
+        IMWRITE_TIFF_PREDICTOR_FLOATINGPOINT = 3      //!< floating point predictor
+
+};
 
 enum ImwriteEXRTypeFlags {
        /*IMWRITE_EXR_TYPE_UNIT = 0, //!< not supported */
@@ -227,6 +270,17 @@ Currently, the following file formats are supported:
 */
 CV_EXPORTS_W Mat imread( const String& filename, int flags = IMREAD_COLOR );
 
+/** @brief Loads an image from a file.
+
+This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts and the return value.
+@param filename Name of file to be loaded.
+@param dst object in which the image will be loaded.
+@param flags Flag that can take values of cv::ImreadModes
+@note
+The image passing through the img parameter can be pre-allocated. The memory is reused if the shape and the type match with the load image.
+ */
+CV_EXPORTS_W void imread( const String& filename, OutputArray dst, int flags = IMREAD_COLOR );
+
 /** @brief Loads a multi-page image from a file.
 
 The function imreadmulti loads a multi-page image from the specified file into a vector of Mat objects.
@@ -297,7 +351,7 @@ It also demonstrates how to save multiple images in a TIFF file:
 CV_EXPORTS_W bool imwrite( const String& filename, InputArray img,
               const std::vector<int>& params = std::vector<int>());
 
-/// @overload multi-image overload for bindings
+//! @brief multi-image overload for bindings
 CV_WRAP static inline
 bool imwritemulti(const String& filename, InputArrayOfArrays img,
                   const std::vector<int>& params = std::vector<int>())
@@ -319,10 +373,11 @@ See cv::imread for the list of supported formats and flags description.
 CV_EXPORTS_W Mat imdecode( InputArray buf, int flags );
 
 /** @overload
-@param buf
-@param flags
+@param buf Input array or vector of bytes.
+@param flags The same flags as in cv::imread, see cv::ImreadModes.
 @param dst The optional output placeholder for the decoded matrix. It can save the image
-reallocations when the function is called repeatedly for images of the same size.
+reallocations when the function is called repeatedly for images of the same size. In case of decoder
+failure the function returns empty cv::Mat object, but does not release user-provided dst buffer.
 */
 CV_EXPORTS Mat imdecode( InputArray buf, int flags, Mat* dst);
 
@@ -337,8 +392,9 @@ See cv::imreadmulti for the list of supported formats and flags description.
 @param buf Input array or vector of bytes.
 @param flags The same flags as in cv::imread, see cv::ImreadModes.
 @param mats A vector of Mat objects holding each page, if more than one.
+@param range A continuous selection of pages.
 */
-CV_EXPORTS_W bool imdecodemulti(InputArray buf, int flags, CV_OUT std::vector<Mat>& mats);
+CV_EXPORTS_W bool imdecodemulti(InputArray buf, int flags, CV_OUT std::vector<Mat>& mats, const cv::Range& range = Range::all());
 
 /** @brief Encodes an image into a memory buffer.
 
diff --git a/modules/imgcodecs/misc/objc/gen_dict.json b/modules/imgcodecs/misc/objc/gen_dict.json
new file mode 100644
index 000000000000..cfcd5889e7f3
--- /dev/null
+++ b/modules/imgcodecs/misc/objc/gen_dict.json
@@ -0,0 +1,5 @@
+{
+    "SourceMap" : {
+        "visionos" : "ios"
+    }
+}
diff --git a/modules/imgcodecs/perf/perf_jpeg.cpp b/modules/imgcodecs/perf/perf_jpeg.cpp
index 694e2e698e6a..7063ca909c9c 100644
--- a/modules/imgcodecs/perf/perf_jpeg.cpp
+++ b/modules/imgcodecs/perf/perf_jpeg.cpp
@@ -5,6 +5,9 @@
 
 namespace opencv_test
 {
+
+#ifdef HAVE_JPEG
+
 using namespace perf;
 
 PERF_TEST(JPEG, Decode)
@@ -35,4 +38,6 @@ PERF_TEST(JPEG, Encode)
     SANITY_CHECK_NOTHING();
 }
 
+#endif // HAVE_JPEG
+
 } // namespace
\ No newline at end of file
diff --git a/modules/imgcodecs/perf/perf_png.cpp b/modules/imgcodecs/perf/perf_png.cpp
index 7893d841c487..1af4780882fb 100644
--- a/modules/imgcodecs/perf/perf_png.cpp
+++ b/modules/imgcodecs/perf/perf_png.cpp
@@ -6,6 +6,9 @@
 
 namespace opencv_test
 {
+
+#if defined(HAVE_PNG) || defined(HAVE_SPNG)
+
 using namespace perf;
 
 typedef perf::TestBaseWithParam<std::string> PNG;
@@ -38,4 +41,6 @@ PERF_TEST(PNG, encode)
     SANITY_CHECK_NOTHING();
 }
 
+#endif // HAVE_PNG
+
 } // namespace
diff --git a/modules/imgcodecs/src/exif.cpp b/modules/imgcodecs/src/exif.cpp
index 28d52047d828..8ed976055646 100644
--- a/modules/imgcodecs/src/exif.cpp
+++ b/modules/imgcodecs/src/exif.cpp
@@ -133,7 +133,7 @@ bool ExifReader::parseExif(unsigned char* data, const size_t size)
  * @brief Filling m_exif member with exif directory elements
  *          This is internal function and is not exposed to client
  *
- *  @return The function doesn't return any value. In case of unsuccessful parsing
+ *  The function doesn't return any value. In case of unsuccessful parsing
  *      the m_exif member is not filled up
  */
 void ExifReader::parseExif()
@@ -165,7 +165,7 @@ void ExifReader::parseExif()
  *
  * @return INTEL, MOTO or NONE
  */
-Endianess_t ExifReader::getFormat() const
+Endianness_t ExifReader::getFormat() const
 {
     if (m_data.size() < 1)
         return NONE;
diff --git a/modules/imgcodecs/src/exif.hpp b/modules/imgcodecs/src/exif.hpp
index 6cc95afb1abb..a8914bec039f 100644
--- a/modules/imgcodecs/src/exif.hpp
+++ b/modules/imgcodecs/src/exif.hpp
@@ -79,7 +79,7 @@ enum ExifTagName
     INVALID_TAG             = 0xFFFF    ///< Shows that the tag was not recognized
 };
 
-enum Endianess_t
+enum Endianness_t
 {
     INTEL = 0x49,
     MOTO = 0x4D,
@@ -179,7 +179,7 @@ class ExifReader
 private:
     std::vector<unsigned char> m_data;
     std::map<int, ExifEntry_t > m_exif;
-    Endianess_t m_format;
+    Endianness_t m_format;
 
     void parseExif();
     bool checkTagMark() const;
@@ -193,7 +193,7 @@ class ExifReader
     uint16_t getResolutionUnit( const size_t offset ) const;
     uint16_t getYCbCrPos( const size_t offset ) const;
 
-    Endianess_t getFormat() const;
+    Endianness_t getFormat() const;
 
     ExifEntry_t parseExifEntry( const size_t offset );
 
diff --git a/modules/imgcodecs/src/grfmt_avif.cpp b/modules/imgcodecs/src/grfmt_avif.cpp
index e8d1446cbe6e..4b39ada60a9f 100644
--- a/modules/imgcodecs/src/grfmt_avif.cpp
+++ b/modules/imgcodecs/src/grfmt_avif.cpp
@@ -8,6 +8,7 @@
 
 #include <avif/avif.h>
 #include <fstream>
+#include <memory>
 
 #include <opencv2/core/utils/configuration.private.hpp>
 #include "opencv2/imgproc.hpp"
@@ -147,15 +148,6 @@ AvifDecoder::~AvifDecoder() {
 
 size_t AvifDecoder::signatureLength() const { return kAvifSignatureSize; }
 
-bool AvifDecoder::checkSignature(const String &signature) const {
-  avifDecoderSetIOMemory(decoder_,
-                         reinterpret_cast<const uint8_t *>(signature.c_str()),
-                         signature.size());
-  decoder_->io->sizeHint = 1e9;
-  const avifResult status = avifDecoderParse(decoder_);
-  return (status == AVIF_RESULT_OK || status == AVIF_RESULT_TRUNCATED_DATA);
-}
-
 #define OPENCV_AVIF_CHECK_STATUS(X, ENCDEC)               \
   {                                                       \
     const avifResult status = (X);                        \
@@ -167,6 +159,20 @@ bool AvifDecoder::checkSignature(const String &signature) const {
     }                                                     \
   }
 
+bool AvifDecoder::checkSignature(const String &signature) const {
+  std::unique_ptr<avifDecoder, decltype(&avifDecoderDestroy)> decoder(
+      avifDecoderCreate(), avifDecoderDestroy);
+  if (!decoder) return false;
+  OPENCV_AVIF_CHECK_STATUS(
+      avifDecoderSetIOMemory(
+          decoder.get(), reinterpret_cast<const uint8_t *>(signature.c_str()),
+          signature.size()),
+      decoder);
+  decoder->io->sizeHint = 1e9;
+  const avifResult status = avifDecoderParse(decoder.get());
+  return (status == AVIF_RESULT_OK || status == AVIF_RESULT_TRUNCATED_DATA);
+}
+
 ImageDecoder AvifDecoder::newDecoder() const { return makePtr<AvifDecoder>(); }
 
 bool AvifDecoder::readHeader() {
diff --git a/modules/imgcodecs/src/grfmt_exr.cpp b/modules/imgcodecs/src/grfmt_exr.cpp
index 786b9d176b2b..65a0e5e03bfe 100644
--- a/modules/imgcodecs/src/grfmt_exr.cpp
+++ b/modules/imgcodecs/src/grfmt_exr.cpp
@@ -729,7 +729,7 @@ bool  ExrEncoder::write( const Mat& img, const std::vector<int>& params )
     Mat exrMat;
     if( type == HALF )
     {
-        convertFp16(img, exrMat);
+        img.convertTo(exrMat, CV_16F);
         buffer = (char *)const_cast<uchar *>( exrMat.ptr() );
         bufferstep = exrMat.step;
         size = 2;
diff --git a/modules/imgcodecs/src/grfmt_jpeg.cpp b/modules/imgcodecs/src/grfmt_jpeg.cpp
index 506cebdf496a..4e3b1df48d98 100644
--- a/modules/imgcodecs/src/grfmt_jpeg.cpp
+++ b/modules/imgcodecs/src/grfmt_jpeg.cpp
@@ -402,14 +402,12 @@ int my_jpeg_load_dht (struct jpeg_decompress_struct *info, unsigned char *dht,
 bool  JpegDecoder::readData( Mat& img )
 {
     volatile bool result = false;
-    size_t step = img.step;
-    bool color = img.channels() > 1;
+    const bool color = img.channels() > 1;
 
     if( m_state && m_width && m_height )
     {
         jpeg_decompress_struct* cinfo = &((JpegState*)m_state)->cinfo;
         JpegErrorMgr* jerr = &((JpegState*)m_state)->jerr;
-        JSAMPARRAY buffer = 0;
 
         if( setjmp( jerr->setjmp_buffer ) == 0 )
         {
@@ -429,17 +427,30 @@ bool  JpegDecoder::readData( Mat& img )
             }
 #endif
 
+            // See https://github.com/opencv/opencv/issues/25274
+            // Conversion CMYK->BGR is not supported in libjpeg-turbo.
+            // So supporting both directly and indirectly is necessary.
+            bool doDirectRead = false;
+
             if( color )
             {
                 if( cinfo->num_components != 4 )
                 {
+#ifdef JCS_EXTENSIONS
+                    cinfo->out_color_space = JCS_EXT_BGR;
+                    cinfo->out_color_components = 3;
+                    doDirectRead = true; // BGR -> BGR
+#else
                     cinfo->out_color_space = JCS_RGB;
                     cinfo->out_color_components = 3;
+                    doDirectRead = false; // RGB -> BGR
+#endif
                 }
                 else
                 {
                     cinfo->out_color_space = JCS_CMYK;
                     cinfo->out_color_components = 4;
+                    doDirectRead = false; // CMYK -> BGR
                 }
             }
             else
@@ -448,11 +459,13 @@ bool  JpegDecoder::readData( Mat& img )
                 {
                     cinfo->out_color_space = JCS_GRAYSCALE;
                     cinfo->out_color_components = 1;
+                    doDirectRead = true; // GRAY -> GRAY
                 }
                 else
                 {
                     cinfo->out_color_space = JCS_CMYK;
                     cinfo->out_color_components = 4;
+                    doDirectRead = false; // CMYK -> GRAY
                 }
             }
 
@@ -481,26 +494,38 @@ bool  JpegDecoder::readData( Mat& img )
 
             jpeg_start_decompress( cinfo );
 
-            buffer = (*cinfo->mem->alloc_sarray)((j_common_ptr)cinfo,
-                                              JPOOL_IMAGE, m_width*4, 1 );
-
-            uchar* data = img.ptr();
-            for( ; m_height--; data += step )
+            if( doDirectRead)
             {
-                jpeg_read_scanlines( cinfo, buffer, 1 );
-                if( color )
+                for( int iy = 0 ; iy < m_height; iy ++ )
                 {
-                    if( cinfo->out_color_components == 3 )
-                        icvCvt_RGB2BGR_8u_C3R( buffer[0], 0, data, 0, Size(m_width,1) );
-                    else
-                        icvCvt_CMYK2BGR_8u_C4C3R( buffer[0], 0, data, 0, Size(m_width,1) );
+                    uchar* data = img.ptr<uchar>(iy);
+                    jpeg_read_scanlines( cinfo, &data, 1 );
                 }
-                else
+            }
+            else
+            {
+                JSAMPARRAY buffer = (*cinfo->mem->alloc_sarray)((j_common_ptr)cinfo,
+                                                                 JPOOL_IMAGE, m_width*4, 1 );
+
+                for( int iy = 0 ; iy < m_height; iy ++ )
                 {
-                    if( cinfo->out_color_components == 1 )
-                        memcpy( data, buffer[0], m_width );
+                    uchar* data = img.ptr<uchar>(iy);
+                    jpeg_read_scanlines( cinfo, buffer, 1 );
+
+                    if( color )
+                    {
+                        if( cinfo->out_color_components == 3 )
+                            icvCvt_RGB2BGR_8u_C3R( buffer[0], 0, data, 0, Size(m_width,1) );
+                        else
+                            icvCvt_CMYK2BGR_8u_C4C3R( buffer[0], 0, data, 0, Size(m_width,1) );
+                    }
                     else
-                        icvCvt_CMYK2Gray_8u_C4C1R( buffer[0], 0, data, 0, Size(m_width,1) );
+                    {
+                        if( cinfo->out_color_components == 1 )
+                            memcpy( data, buffer[0], m_width );
+                        else
+                            icvCvt_CMYK2Gray_8u_C4C1R( buffer[0], 0, data, 0, Size(m_width,1) );
+                    }
                 }
             }
 
@@ -593,8 +618,6 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
     int width = img.cols, height = img.rows;
 
     std::vector<uchar> out_buf(1 << 12);
-    AutoBuffer<uchar> _buffer;
-    uchar* buffer;
 
     struct jpeg_compress_struct cinfo;
     JpegErrorMgr jerr;
@@ -629,8 +652,41 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
 
         int _channels = img.channels();
         int channels = _channels > 1 ? 3 : 1;
-        cinfo.input_components = channels;
-        cinfo.in_color_space = channels > 1 ? JCS_RGB : JCS_GRAYSCALE;
+
+        bool doDirectWrite = false;
+        switch( _channels )
+        {
+            case 1:
+                cinfo.input_components = 1;
+                cinfo.in_color_space = JCS_GRAYSCALE;
+                doDirectWrite = true; // GRAY -> GRAY
+                break;
+            case 3:
+#ifdef JCS_EXTENSIONS
+                cinfo.input_components = 3;
+                cinfo.in_color_space = JCS_EXT_BGR;
+                doDirectWrite = true; // BGR -> BGR
+#else
+                cinfo.input_components = 3;
+                cinfo.in_color_space = JCS_RGB;
+                doDirectWrite = false; // BGR -> RGB
+#endif
+                break;
+            case 4:
+#ifdef JCS_EXTENSIONS
+                cinfo.input_components = 4;
+                cinfo.in_color_space = JCS_EXT_BGRX;
+                doDirectWrite = true; // BGRX -> BGRX
+#else
+                cinfo.input_components = 3;
+                cinfo.in_color_space = JCS_RGB;
+                doDirectWrite = false; // BGRA -> RGB
+#endif
+                break;
+            default:
+                CV_Error(cv::Error::StsError, cv::format("Unsupported number of _channels: %06d", _channels) );
+                break;
+        }
 
         int quality = 95;
         int progressive = 0;
@@ -727,9 +783,9 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
             cinfo.comp_info[1].h_samp_factor = 1;
         }
 
-#if JPEG_LIB_VERSION >= 70
         if (luma_quality >= 0 && chroma_quality >= 0)
         {
+#if JPEG_LIB_VERSION >= 70
             cinfo.q_scale_factor[0] = jpeg_quality_scaling(luma_quality);
             cinfo.q_scale_factor[1] = jpeg_quality_scaling(chroma_quality);
             if ( luma_quality != chroma_quality )
@@ -741,31 +797,43 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
                 cinfo.comp_info[1].h_samp_factor = 1;
             }
             jpeg_default_qtables( &cinfo, TRUE );
-        }
+#else
+            // See https://github.com/opencv/opencv/issues/25646
+            CV_LOG_ONCE_WARNING(NULL, cv::format("IMWRITE_JPEG_LUMA/CHROMA_QUALITY are not supported bacause JPEG_LIB_VERSION < 70."));
 #endif // #if JPEG_LIB_VERSION >= 70
+        }
 
         jpeg_start_compress( &cinfo, TRUE );
 
-        if( channels > 1 )
-            _buffer.allocate(width*channels);
-        buffer = _buffer.data();
-
-        for( int y = 0; y < height; y++ )
+        if( doDirectWrite )
         {
-            uchar *data = img.data + img.step*y, *ptr = data;
-
-            if( _channels == 3 )
+            for( int y = 0; y < height; y++ )
             {
-                icvCvt_BGR2RGB_8u_C3R( data, 0, buffer, 0, Size(width,1) );
-                ptr = buffer;
+                uchar *data = const_cast<uchar*>(img.ptr<uchar>(y));
+                jpeg_write_scanlines( &cinfo, &data, 1 );
             }
-            else if( _channels == 4 )
+        }
+        else
+        {
+            CV_Check(_channels, (_channels == 3) || (_channels == 4), "Unsupported number of channels(indirect write)");
+
+            AutoBuffer<uchar> _buffer;
+            _buffer.allocate(width*channels);
+            uchar *buffer = _buffer.data();
+
+            for( int y = 0; y < height; y++ )
             {
-                icvCvt_BGRA2BGR_8u_C4C3R( data, 0, buffer, 0, Size(width,1), 2 );
-                ptr = buffer;
+                uchar *data = const_cast<uchar*>(img.ptr<uchar>(y));
+                if( _channels == 3 )
+                {
+                    icvCvt_BGR2RGB_8u_C3R( data, 0, buffer, 0, Size(width,1) );
+                }
+                else // if( _channels == 4 )
+                {
+                    icvCvt_BGRA2BGR_8u_C4C3R( data, 0, buffer, 0, Size(width,1), 2 );
+                }
+                jpeg_write_scanlines( &cinfo, &buffer, 1 );
             }
-
-            jpeg_write_scanlines( &cinfo, &ptr, 1 );
         }
 
         jpeg_finish_compress( &cinfo );
diff --git a/modules/imgcodecs/src/grfmt_pfm.cpp b/modules/imgcodecs/src/grfmt_pfm.cpp
index 97cf07b27e33..addae34b4f0d 100644
--- a/modules/imgcodecs/src/grfmt_pfm.cpp
+++ b/modules/imgcodecs/src/grfmt_pfm.cpp
@@ -27,7 +27,7 @@ bool is_byte_order_swapped(double scale)
   #endif
 }
 
-void swap_endianess(uint32_t& ui)
+void swap_endianness(uint32_t& ui)
 {
   static const uint32_t A(0x000000ffU);
   static const uint32_t B(0x0000ff00U);
@@ -137,7 +137,7 @@ bool PFMDecoder::readData(Mat& mat)
       for (int i = 0; i < m_width * buffer.channels(); ++i) {
         static_assert( sizeof(uint32_t) == sizeof(float),
                        "uint32_t and float must have same size." );
-        swap_endianess(buffer.ptr<uint32_t>(y)[i]);
+        swap_endianness(buffer.ptr<uint32_t>(y)[i]);
       }
     }
   }
diff --git a/modules/imgcodecs/src/grfmt_png.cpp b/modules/imgcodecs/src/grfmt_png.cpp
index 388a3fcaf42e..aca73bd000fc 100644
--- a/modules/imgcodecs/src/grfmt_png.cpp
+++ b/modules/imgcodecs/src/grfmt_png.cpp
@@ -58,11 +58,7 @@
 #  define _FILE_OFFSET_BITS 0
 #endif
 
-#ifdef HAVE_LIBPNG_PNG_H
-#include <libpng/png.h>
-#else
 #include <png.h>
-#endif
 #include <zlib.h>
 
 #include "grfmt_png.hpp"
diff --git a/modules/imgcodecs/src/grfmt_pxm.cpp b/modules/imgcodecs/src/grfmt_pxm.cpp
index 8da23487282d..76290c43de45 100644
--- a/modules/imgcodecs/src/grfmt_pxm.cpp
+++ b/modules/imgcodecs/src/grfmt_pxm.cpp
@@ -43,7 +43,7 @@
 #include "precomp.hpp"
 #include "utils.hpp"
 #include "grfmt_pxm.hpp"
-#include <iostream>
+#include <opencv2/core/utils/logger.hpp>
 
 #ifdef HAVE_IMGCODEC_PXM
 
@@ -191,7 +191,7 @@ bool PxMDecoder::readHeader()
     }
     catch (...)
     {
-        std::cerr << "PXM::readHeader(): unknown C++ exception" << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "PXM::readHeader(): unknown C++ exception");
         throw;
     }
 
@@ -364,7 +364,7 @@ bool PxMDecoder::readData( Mat& img )
     }
     catch (...)
     {
-        std::cerr << "PXM::readData(): unknown exception" << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "PXM::readData(): unknown exception");
         throw;
     }
 
diff --git a/modules/imgcodecs/src/grfmt_spng.cpp b/modules/imgcodecs/src/grfmt_spng.cpp
index e3c61164e4e0..fa15bd46c79f 100644
--- a/modules/imgcodecs/src/grfmt_spng.cpp
+++ b/modules/imgcodecs/src/grfmt_spng.cpp
@@ -521,7 +521,6 @@ int SPngEncoder::writeDataToBuf(void *ctx, void *user, void *dst_src, size_t len
 
 bool SPngEncoder::write(const Mat &img, const std::vector<int> &params)
 {
-    int fmt;
     spng_ctx *ctx = spng_ctx_new(SPNG_CTX_ENCODER);
     FILE *volatile f = 0;
     int width = img.cols, height = img.rows;
@@ -558,11 +557,10 @@ bool SPngEncoder::write(const Mat &img, const std::vector<int> &params)
                 isBilevel = params[i + 1] != 0;
             }
         }
-        fmt = channels == 1 ? SPNG_COLOR_TYPE_GRAYSCALE : channels == 3 ? SPNG_COLOR_TYPE_TRUECOLOR
-                                                                        : SPNG_COLOR_TYPE_TRUECOLOR_ALPHA;
 
         ihdr.bit_depth = depth == CV_8U ? isBilevel ? 1 : 8 : 16;
-        ihdr.color_type = fmt;
+        ihdr.color_type = (uint8_t)(channels == 1 ? SPNG_COLOR_TYPE_GRAYSCALE : channels == 3 ? SPNG_COLOR_TYPE_TRUECOLOR
+                                                                                              : SPNG_COLOR_TYPE_TRUECOLOR_ALPHA);
         ihdr.interlace_method = SPNG_INTERLACE_NONE;
         ihdr.filter_method = SPNG_FILTER_NONE;
         ihdr.compression_method = 0;
@@ -597,7 +595,7 @@ bool SPngEncoder::write(const Mat &img, const std::vector<int> &params)
             ret = spng_encode_image(ctx, nullptr, 0, SPNG_FMT_PNG, SPNG_ENCODE_PROGRESSIVE);
             if (channels > 1)
             {
-                int error;
+                int error = SPNG_OK;
                 if (ret == SPNG_OK)
                 {
                     if (depth == CV_16U)
@@ -659,7 +657,7 @@ bool SPngEncoder::write(const Mat &img, const std::vector<int> &params)
             }
             else
             {
-                int error;
+                int error = SPNG_OK;
                 for (int y = 0; y < height; y++)
                 {
                     error = spng_encode_row(ctx, img.data + y * img.step, width * channels * (depth == CV_16U ? 2 : 1));
@@ -711,17 +709,17 @@ void spngCvt_BGRA2Gray_8u_C4C1R(const uchar *bgra, int rgba_step,
                                 uchar *gray, int gray_step,
                                 cv::Size size, int _swap_rb)
 {
-    int i;
     for (; size.height--; gray += gray_step)
     {
         double cBGR0 = 0.1140441895;
+        double cBGR1 = 0.5869750977;
         double cBGR2 = 0.2989807129;
+
         if (_swap_rb)
             std::swap(cBGR0, cBGR2);
-        for (i = 0; i < size.width; i++, bgra += 4)
+        for (int i = 0; i < size.width; i++, bgra += 4)
         {
-            int t = cBGR0 * bgra[0] + 0.5869750977 * bgra[1] + cBGR2 * bgra[2];
-            gray[i] = (uchar)t;
+            gray[i] = cv::saturate_cast<uchar>(cBGR0 * bgra[0] + cBGR1 * bgra[1] + cBGR2 * bgra[2]);
         }
 
         bgra += rgba_step - size.width * 4;
@@ -732,17 +730,17 @@ void spngCvt_BGRA2Gray_16u_CnC1R(const ushort *bgr, int bgr_step,
                                  ushort *gray, int gray_step,
                                  cv::Size size, int ncn, int _swap_rb)
 {
-    int i;
     for (; size.height--; gray += gray_step)
     {
         double cBGR0 = 0.1140441895;
+        double cBGR1 = 0.5869750977;
         double cBGR2 = 0.2989807129;
+
         if (_swap_rb)
             std::swap(cBGR0, cBGR2);
-        for (i = 0; i < size.width; i++, bgr += ncn)
+        for (int i = 0; i < size.width; i++, bgr += ncn)
         {
-            int t = cBGR0 * bgr[0] + 0.5869750977 * bgr[1] + cBGR2 * bgr[2];
-            gray[i] = (ushort)t;
+            gray[i] = (ushort)(cBGR0 * bgr[0] + cBGR1 * bgr[1] + cBGR2 * bgr[2]);
         }
 
         bgr += bgr_step - size.width * ncn;
diff --git a/modules/imgcodecs/src/grfmt_tiff.cpp b/modules/imgcodecs/src/grfmt_tiff.cpp
index ed21f3f14c9d..3890df96bd0b 100644
--- a/modules/imgcodecs/src/grfmt_tiff.cpp
+++ b/modules/imgcodecs/src/grfmt_tiff.cpp
@@ -53,12 +53,8 @@
 #include "grfmt_tiff.hpp"
 #include <limits>
 
-// TODO FIXIT Conflict declarations for common types like int64/uint64
-namespace tiff_dummy_namespace {
 #include "tiff.h"
 #include "tiffio.h"
-}
-using namespace tiff_dummy_namespace;
 
 namespace cv
 {
@@ -72,11 +68,6 @@ static void extend_cvtColor( InputArray _src, OutputArray _dst, int code );
         CV_Error(Error::StsError, "OpenCV TIFF: failed " #call); \
     }
 
-#define CV_TIFF_CHECK_CALL_INFO(call) \
-    if (0 == (call)) { \
-        CV_LOG_INFO(NULL, "OpenCV TIFF(line " << __LINE__ << "): failed optional call: " #call ", ignoring"); \
-    }
-
 #define CV_TIFF_CHECK_CALL_DEBUG(call) \
     if (0 == (call)) { \
         CV_LOG_DEBUG(NULL, "OpenCV TIFF(line " << __LINE__ << "): failed optional call: " #call ", ignoring"); \
@@ -245,7 +236,7 @@ bool TiffDecoder::readHeader()
     if (!tif)
     {
         // TIFFOpen() mode flags are different to fopen().  A 'b' in mode "rb" has no effect when reading.
-        // http://www.remotesensing.org/libtiff/man/TIFFOpen.3tiff.html
+        // http://www.simplesystems.org/libtiff/functions/TIFFOpen.html
         if ( !m_buf.empty() )
         {
             m_buf_pos = 0;
@@ -269,8 +260,8 @@ bool TiffDecoder::readHeader()
 
     if (tif)
     {
-        uint32 wdth = 0, hght = 0;
-        uint16 photometric = 0;
+        uint32_t wdth = 0, hght = 0;
+        uint16_t photometric = 0;
 
         CV_TIFF_CHECK_CALL(TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &wdth));
         CV_TIFF_CHECK_CALL(TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &hght));
@@ -278,7 +269,7 @@ bool TiffDecoder::readHeader()
 
         {
             bool isGrayScale = photometric == PHOTOMETRIC_MINISWHITE || photometric == PHOTOMETRIC_MINISBLACK;
-            uint16 bpp = 8, ncn = isGrayScale ? 1 : 3;
+            uint16_t bpp = 8, ncn = isGrayScale ? 1 : 3;
             if (0 == TIFFGetField(tif, TIFFTAG_BITSPERSAMPLE, &bpp))
             {
                 // TIFF bi-level images don't require TIFFTAG_BITSPERSAMPLE tag
@@ -301,7 +292,7 @@ bool TiffDecoder::readHeader()
                 (ncn != 1 && ncn != 3 && ncn != 4)))
                 bpp = 8;
 
-            uint16 sample_format = SAMPLEFORMAT_UINT;
+            uint16_t sample_format = SAMPLEFORMAT_UINT;
             TIFFGetField(tif, TIFFTAG_SAMPLEFORMAT, &sample_format);
             int wanted_channels = normalizeChannelsNumber(ncn);
             switch (bpp)
@@ -383,7 +374,7 @@ bool TiffDecoder::nextPage()
            readHeader();
 }
 
-static void fixOrientationPartial(Mat &img, uint16 orientation)
+static void fixOrientationPartial(Mat &img, uint16_t orientation)
 {
     switch(orientation) {
         case ORIENTATION_RIGHTTOP:
@@ -439,7 +430,7 @@ static void fixOrientationFull(Mat &img, int orientation)
  * For 8 bit some corrections are done by TIFFReadRGBAStrip/Tile already.
  * Not so for 16/32/64 bit.
  */
-static void fixOrientation(Mat &img, uint16 orientation, bool isOrientationFull)
+static void fixOrientation(Mat &img, uint16_t orientation, bool isOrientationFull)
 {
     if( isOrientationFull )
     {
@@ -600,7 +591,7 @@ bool  TiffDecoder::readData( Mat& img )
     CV_Assert(!m_tif.empty());
     TIFF* tif = (TIFF*)m_tif.get();
 
-    uint16 photometric = (uint16)-1;
+    uint16_t photometric = (uint16_t)-1;
     CV_TIFF_CHECK_CALL(TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometric));
 
     if (m_hdr && depth >= CV_32F)
@@ -616,14 +607,14 @@ bool  TiffDecoder::readData( Mat& img )
     {
         int is_tiled = TIFFIsTiled(tif) != 0;
         bool isGrayScale = photometric == PHOTOMETRIC_MINISWHITE || photometric == PHOTOMETRIC_MINISBLACK;
-        uint16 bpp = 8, ncn = isGrayScale ? 1 : 3;
+        uint16_t bpp = 8, ncn = isGrayScale ? 1 : 3;
         if (0 == TIFFGetField(tif, TIFFTAG_BITSPERSAMPLE, &bpp))
         {
             // TIFF bi-level images don't require TIFFTAG_BITSPERSAMPLE tag
             bpp = 1;
         }
         CV_TIFF_CHECK_CALL_DEBUG(TIFFGetField(tif, TIFFTAG_SAMPLESPERPIXEL, &ncn));
-        uint16 img_orientation = ORIENTATION_TOPLEFT;
+        uint16_t img_orientation = ORIENTATION_TOPLEFT;
         CV_TIFF_CHECK_CALL_DEBUG(TIFFGetField(tif, TIFFTAG_ORIENTATION, &img_orientation));
         constexpr const int bitsPerByte = 8;
         int dst_bpp = (int)(img.elemSize1() * bitsPerByte);
@@ -633,7 +624,7 @@ bool  TiffDecoder::readData( Mat& img )
         int wanted_channels = normalizeChannelsNumber(img.channels());
         bool doReadScanline = false;
 
-        uint32 tile_width0 = m_width, tile_height0 = 0;
+        uint32_t tile_width0 = m_width, tile_height0 = 0;
 
         if (is_tiled)
         {
@@ -651,7 +642,7 @@ bool  TiffDecoder::readData( Mat& img )
                 tile_width0 = m_width;
 
             if (tile_height0 == 0 ||
-                    (!is_tiled && tile_height0 == std::numeric_limits<uint32>::max()) )
+                    (!is_tiled && tile_height0 == std::numeric_limits<uint32_t>::max()) )
                 tile_height0 = m_height;
 
             const int TILE_MAX_WIDTH = (1 << 24);
@@ -676,7 +667,7 @@ bool  TiffDecoder::readData( Mat& img )
                     ( (uint64_t) MAX_TILE_SIZE * 95 / 100)
                 )
                 {
-                    uint16_t planerConfig = (uint16)-1;
+                    uint16_t planerConfig = (uint16_t)-1;
                     CV_TIFF_CHECK_CALL(TIFFGetField(tif, TIFFTAG_PLANARCONFIG, &planerConfig));
 
                     doReadScanline = (!is_tiled) // no tile
@@ -732,7 +723,7 @@ bool  TiffDecoder::readData( Mat& img )
                     MAX_TILE_SIZE * 95 / 100
                 )
                 {
-                    uint16_t planerConfig = (uint16)-1;
+                    uint16_t planerConfig = (uint16_t)-1;
                     CV_TIFF_CHECK_CALL(TIFFGetField(tif, TIFFTAG_PLANARCONFIG, &planerConfig));
 
                     doReadScanline = (!is_tiled) // no tile
@@ -816,7 +807,7 @@ bool  TiffDecoder::readData( Mat& img )
                             uchar* bstart = src_buffer;
                             if (doReadScanline)
                             {
-                                CV_TIFF_CHECK_CALL((int)TIFFReadScanline(tif, (uint32*)src_buffer, y) >= 0);
+                                CV_TIFF_CHECK_CALL((int)TIFFReadScanline(tif, (uint32_t*)src_buffer, y) >= 0);
 
                                 if ( isNeedConvert16to8 )
                                 {
@@ -838,11 +829,11 @@ bool  TiffDecoder::readData( Mat& img )
                             }
                             else if (!is_tiled)
                             {
-                                CV_TIFF_CHECK_CALL(TIFFReadRGBAStrip(tif, y, (uint32*)src_buffer));
+                                CV_TIFF_CHECK_CALL(TIFFReadRGBAStrip(tif, y, (uint32_t*)src_buffer));
                             }
                             else
                             {
-                                CV_TIFF_CHECK_CALL(TIFFReadRGBATile(tif, x, y, (uint32*)src_buffer));
+                                CV_TIFF_CHECK_CALL(TIFFReadRGBATile(tif, x, y, (uint32_t*)src_buffer));
                                 // Tiles fill the buffer from the bottom up
                                 bstart += (tile_height0 - tile_height) * tile_width0 * 4;
                             }
@@ -936,15 +927,15 @@ bool  TiffDecoder::readData( Mat& img )
                         {
                             if (doReadScanline)
                             {
-                                CV_TIFF_CHECK_CALL((int)TIFFReadScanline(tif, (uint32*)src_buffer, y) >= 0);
+                                CV_TIFF_CHECK_CALL((int)TIFFReadScanline(tif, (uint32_t*)src_buffer, y) >= 0);
                             }
                             else if (!is_tiled)
                             {
-                                CV_TIFF_CHECK_CALL((int)TIFFReadEncodedStrip(tif, tileidx, (uint32*)src_buffer, src_buffer_size) >= 0);
+                                CV_TIFF_CHECK_CALL((int)TIFFReadEncodedStrip(tif, tileidx, (uint32_t*)src_buffer, src_buffer_size) >= 0);
                             }
                             else
                             {
-                                CV_TIFF_CHECK_CALL((int)TIFFReadEncodedTile(tif, tileidx, (uint32*)src_buffer, src_buffer_size) >= 0);
+                                CV_TIFF_CHECK_CALL((int)TIFFReadEncodedTile(tif, tileidx, (uint32_t*)src_buffer, src_buffer_size) >= 0);
                             }
 
                             for (int i = 0; i < tile_height; i++)
@@ -1118,7 +1109,7 @@ class TiffEncoderBufHelper
     TIFF* open ()
     {
         // do NOT put "wb" as the mode, because the b means "big endian" mode, not "binary" mode.
-        // http://www.remotesensing.org/libtiff/man/TIFFOpen.3tiff.html
+        // http://www.simplesystems.org/libtiff/functions/TIFFOpen.html
         return TIFFClientOpen( "", "w", reinterpret_cast<thandle_t>(this), &TiffEncoderBufHelper::read,
                                &TiffEncoderBufHelper::write, &TiffEncoderBufHelper::seek,
                                &TiffEncoderBufHelper::close, &TiffEncoderBufHelper::size,
@@ -1200,7 +1191,7 @@ static bool readParam(const std::vector<int>& params, int key, int& value)
 bool TiffEncoder::writeLibTiff( const std::vector<Mat>& img_vec, const std::vector<int>& params)
 {
     // do NOT put "wb" as the mode, because the b means "big endian" mode, not "binary" mode.
-    // http://www.remotesensing.org/libtiff/man/TIFFOpen.3tiff.html
+    // http://www.simplesystems.org/libtiff/functions/TIFFOpen.html
     TIFF* tif = NULL;
 
     TiffEncoderBufHelper buf_helper(m_buf);
@@ -1224,7 +1215,7 @@ bool TiffEncoder::writeLibTiff( const std::vector<Mat>& img_vec, const std::vect
     int resUnit = -1, dpiX = -1, dpiY = -1;
 
     readParam(params, IMWRITE_TIFF_COMPRESSION, compression);
-    readParam(params, TIFFTAG_PREDICTOR, predictor);
+    readParam(params, IMWRITE_TIFF_PREDICTOR, predictor);
     readParam(params, IMWRITE_TIFF_RESUNIT, resUnit);
     readParam(params, IMWRITE_TIFF_XDPI, dpiX);
     readParam(params, IMWRITE_TIFF_YDPI, dpiY);
@@ -1261,7 +1252,7 @@ bool TiffEncoder::writeLibTiff( const std::vector<Mat>& img_vec, const std::vect
         int page_compression = compression;
 
         int bitsPerChannel = -1;
-        uint16 sample_format = SAMPLEFORMAT_INT;
+        uint16_t sample_format = SAMPLEFORMAT_INT;
         switch (depth)
         {
             case CV_8U:
@@ -1313,7 +1304,7 @@ bool TiffEncoder::writeLibTiff( const std::vector<Mat>& img_vec, const std::vect
         CV_Assert(fileStep > 0);
 
         int rowsPerStrip = (int)((1 << 13) / fileStep);
-        readParam(params, TIFFTAG_ROWSPERSTRIP, rowsPerStrip);
+        readParam(params, IMWRITE_TIFF_ROWSPERSTRIP, rowsPerStrip);
         rowsPerStrip = std::max(1, std::min(height, rowsPerStrip));
 
         int colorspace = channels > 1 ? PHOTOMETRIC_RGB : PHOTOMETRIC_MINISBLACK;
diff --git a/modules/imgcodecs/src/ios_conversions.mm b/modules/imgcodecs/src/ios_conversions.mm
index 2aba323a2dec..5fea68664331 100644
--- a/modules/imgcodecs/src/ios_conversions.mm
+++ b/modules/imgcodecs/src/ios_conversions.mm
@@ -39,6 +39,8 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+#include <TargetConditionals.h>
+#if (TARGET_OS_IOS || TARGET_OS_VISION) && !TARGET_OS_MACCATALYST
 
 #import <UIKit/UIKit.h>
 #include "apple_conversions.h"
@@ -61,3 +63,4 @@ void UIImageToMat(const UIImage* image, cv::Mat& m, bool alphaExist) {
     CGImageRef imageRef = image.CGImage;
     CGImageToMat(imageRef, m, alphaExist);
 }
+#endif
diff --git a/modules/imgcodecs/src/loadsave.cpp b/modules/imgcodecs/src/loadsave.cpp
index d0413c1ade29..2586fc1fa4b1 100644
--- a/modules/imgcodecs/src/loadsave.cpp
+++ b/modules/imgcodecs/src/loadsave.cpp
@@ -210,15 +210,8 @@ struct ImageCodecInitializer
 static
 ImageCodecInitializer& getCodecs()
 {
-#ifdef CV_CXX11
     static ImageCodecInitializer g_codecs;
     return g_codecs;
-#else
-    // C++98 doesn't guarantee correctness of multi-threaded initialization of static global variables
-    // (memory leak here is not critical, use C++11 to avoid that)
-    static ImageCodecInitializer* g_codecs = new ImageCodecInitializer();
-    return *g_codecs;
-#endif
 }
 
 /**
@@ -437,12 +430,12 @@ imread_( const String& filename, int flags, Mat& mat )
     }
     catch (const cv::Exception& e)
     {
-        std::cerr << "imread_('" << filename << "'): can't read header: " << e.what() << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imread_('" << filename << "'): can't read header: " << e.what());
         return 0;
     }
     catch (...)
     {
-        std::cerr << "imread_('" << filename << "'): can't read header: unknown exception" << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imread_('" << filename << "'): can't read header: unknown exception");
         return 0;
     }
 
@@ -464,7 +457,16 @@ imread_( const String& filename, int flags, Mat& mat )
             type = CV_MAKETYPE(CV_MAT_DEPTH(type), 1);
     }
 
-    mat.create( size.height, size.width, type );
+    if (mat.empty())
+    {
+        mat.create( size.height, size.width, type );
+    }
+    else
+    {
+        CV_CheckEQ(size, mat.size(), "");
+        CV_CheckTypeEQ(type, mat.type(), "");
+        CV_Assert(mat.isContinuous());
+    }
 
     // read the image data
     bool success = false;
@@ -475,11 +477,11 @@ imread_( const String& filename, int flags, Mat& mat )
     }
     catch (const cv::Exception& e)
     {
-        std::cerr << "imread_('" << filename << "'): can't read data: " << e.what() << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imread_('" << filename << "'): can't read data: " << e.what());
     }
     catch (...)
     {
-        std::cerr << "imread_('" << filename << "'): can't read data: unknown exception" << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imread_('" << filename << "'): can't read data: unknown exception");
     }
     if (!success)
     {
@@ -542,12 +544,12 @@ imreadmulti_(const String& filename, int flags, std::vector<Mat>& mats, int star
     }
     catch (const cv::Exception& e)
     {
-        std::cerr << "imreadmulti_('" << filename << "'): can't read header: " << e.what() << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imreadmulti_('" << filename << "'): can't read header: " << e.what());
         return 0;
     }
     catch (...)
     {
-        std::cerr << "imreadmulti_('" << filename << "'): can't read header: unknown exception" << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imreadmulti_('" << filename << "'): can't read header: unknown exception");
         return 0;
     }
 
@@ -591,11 +593,11 @@ imreadmulti_(const String& filename, int flags, std::vector<Mat>& mats, int star
         }
         catch (const cv::Exception& e)
         {
-            std::cerr << "imreadmulti_('" << filename << "'): can't read data: " << e.what() << std::endl << std::flush;
+            CV_LOG_ERROR(NULL, "imreadmulti_('" << filename << "'): can't read data: " << e.what());
         }
         catch (...)
         {
-            std::cerr << "imreadmulti_('" << filename << "'): can't read data: unknown exception" << std::endl << std::flush;
+            CV_LOG_ERROR(NULL, "imreadmulti_('" << filename << "'): can't read data: unknown exception");
         }
         if (!success)
             break;
@@ -639,6 +641,16 @@ Mat imread( const String& filename, int flags )
     return img;
 }
 
+void imread( const String& filename, OutputArray dst, int flags )
+{
+    CV_TRACE_FUNCTION();
+
+    Mat img = dst.getMat();
+
+    /// load the data
+    imread_(filename, flags, img);
+}
+
 /**
 * Read a multi-page image
 *
@@ -672,7 +684,7 @@ size_t imcount_(const String& filename, int flags)
         return collection.size();
     } catch(cv::Exception const& e) {
         // Reading header or finding decoder for the filename is failed
-        std::cerr << "imcount_('" << filename << "'): can't read header or can't find decoder: " << e.what() << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imcount_('" << filename << "'): can't read header or can't find decoder: " << e.what());
     }
     return 0;
 }
@@ -768,14 +780,13 @@ static bool imwrite_( const String& filename, const std::vector<Mat>& img_vec,
     }
     catch (const cv::Exception& e)
     {
-        std::cerr << "imwrite_('" << filename << "'): can't write data: " << e.what() << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imwrite_('" << filename << "'): can't write data: " << e.what());
     }
     catch (...)
     {
-        std::cerr << "imwrite_('" << filename << "'): can't write data: unknown exception" << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imwrite_('" << filename << "'): can't write data: unknown exception");
     }
 
-    //    CV_Assert( code );
     return code;
 }
 
@@ -808,7 +819,7 @@ imdecode_( const Mat& buf, int flags, Mat& mat )
 
     ImageDecoder decoder = findDecoder(buf_row);
     if( !decoder )
-        return 0;
+        return false;
 
     int scale_denom = 1;
     if( flags > IMREAD_LOAD_GDAL )
@@ -829,7 +840,7 @@ imdecode_( const Mat& buf, int flags, Mat& mat )
         filename = tempfile();
         FILE* f = fopen( filename.c_str(), "wb" );
         if( !f )
-            return 0;
+            return false;
         size_t bufSize = buf_row.total()*buf.elemSize();
         if (fwrite(buf_row.ptr(), 1, bufSize, f) != bufSize)
         {
@@ -851,11 +862,11 @@ imdecode_( const Mat& buf, int flags, Mat& mat )
     }
     catch (const cv::Exception& e)
     {
-        std::cerr << "imdecode_('" << filename << "'): can't read header: " << e.what() << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imdecode_('" << filename << "'): can't read header: " << e.what());
     }
     catch (...)
     {
-        std::cerr << "imdecode_('" << filename << "'): can't read header: unknown exception" << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imdecode_('" << filename << "'): can't read header: unknown exception");
     }
     if (!success)
     {
@@ -864,10 +875,10 @@ imdecode_( const Mat& buf, int flags, Mat& mat )
         {
             if (0 != remove(filename.c_str()))
             {
-                std::cerr << "unable to remove temporary file:" << filename << std::endl << std::flush;
+                CV_LOG_WARNING(NULL, "unable to remove temporary file:" << filename);
             }
         }
-        return 0;
+        return false;
     }
 
     // established the required input image size
@@ -896,24 +907,23 @@ imdecode_( const Mat& buf, int flags, Mat& mat )
     }
     catch (const cv::Exception& e)
     {
-        std::cerr << "imdecode_('" << filename << "'): can't read data: " << e.what() << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imdecode_('" << filename << "'): can't read data: " << e.what());
     }
     catch (...)
     {
-        std::cerr << "imdecode_('" << filename << "'): can't read data: unknown exception" << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imdecode_('" << filename << "'): can't read data: unknown exception");
     }
 
     if (!filename.empty())
     {
         if (0 != remove(filename.c_str()))
         {
-            std::cerr << "unable to remove temporary file:" << filename << std::endl << std::flush;
+            CV_LOG_WARNING(NULL, "unable to remove temporary file: " << filename);
         }
     }
 
     if (!success)
     {
-        mat.release();
         return false;
     }
 
@@ -937,7 +947,8 @@ Mat imdecode( InputArray _buf, int flags )
     CV_TRACE_FUNCTION();
 
     Mat buf = _buf.getMat(), img;
-    imdecode_( buf, flags, img );
+    if (!imdecode_(buf, flags, img))
+        img.release();
 
     return img;
 }
@@ -948,9 +959,10 @@ Mat imdecode( InputArray _buf, int flags, Mat* dst )
 
     Mat buf = _buf.getMat(), img;
     dst = dst ? dst : &img;
-    imdecode_( buf, flags, *dst );
-
-    return *dst;
+    if (imdecode_(buf, flags, *dst))
+        return *dst;
+    else
+        return cv::Mat();
 }
 
 static bool
@@ -1000,11 +1012,11 @@ imdecodemulti_(const Mat& buf, int flags, std::vector<Mat>& mats, int start, int
     }
     catch (const cv::Exception& e)
     {
-        std::cerr << "imreadmulti_('" << filename << "'): can't read header: " << e.what() << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imreadmulti_('" << filename << "'): can't read header: " << e.what());
     }
     catch (...)
     {
-        std::cerr << "imreadmulti_('" << filename << "'): can't read header: unknown exception" << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "imreadmulti_('" << filename << "'): can't read header: unknown exception");
     }
 
     int current = start;
@@ -1025,7 +1037,7 @@ imdecodemulti_(const Mat& buf, int flags, std::vector<Mat>& mats, int start, int
         {
             if (0 != remove(filename.c_str()))
             {
-                std::cerr << "unable to remove temporary file:" << filename << std::endl << std::flush;
+                CV_LOG_WARNING(NULL, "unable to remove temporary file: " << filename);
             }
         }
         return 0;
@@ -1060,11 +1072,11 @@ imdecodemulti_(const Mat& buf, int flags, std::vector<Mat>& mats, int start, int
         }
         catch (const cv::Exception& e)
         {
-            std::cerr << "imreadmulti_('" << filename << "'): can't read data: " << e.what() << std::endl << std::flush;
+            CV_LOG_ERROR(NULL, "imreadmulti_('" << filename << "'): can't read data: " << e.what());
         }
         catch (...)
         {
-            std::cerr << "imreadmulti_('" << filename << "'): can't read data: unknown exception" << std::endl << std::flush;
+            CV_LOG_ERROR(NULL, "imreadmulti_('" << filename << "'): can't read data: unknown exception");
         }
         if (!success)
             break;
@@ -1087,7 +1099,7 @@ imdecodemulti_(const Mat& buf, int flags, std::vector<Mat>& mats, int start, int
     {
         if (0 != remove(filename.c_str()))
         {
-            std::cerr << "unable to remove temporary file:" << filename << std::endl << std::flush;
+            CV_LOG_WARNING(NULL, "unable to remove temporary file: " << filename);
         }
     }
 
@@ -1096,12 +1108,21 @@ imdecodemulti_(const Mat& buf, int flags, std::vector<Mat>& mats, int start, int
     return !mats.empty();
 }
 
-bool imdecodemulti(InputArray _buf, int flags, CV_OUT std::vector<Mat>& mats)
+bool imdecodemulti(InputArray _buf, int flags, CV_OUT std::vector<Mat>& mats, const Range& range)
 {
     CV_TRACE_FUNCTION();
 
     Mat buf = _buf.getMat();
-    return imdecodemulti_(buf, flags, mats, 0, -1);
+    if (range == Range::all())
+    {
+        return imdecodemulti_(buf, flags, mats, 0, -1);
+    }
+    else
+    {
+        CV_CheckGE(range.start, 0, "Range start cannot be negative.");
+        CV_CheckGT(range.size(), 0, "Range cannot be empty.");
+        return imdecodemulti_(buf, flags, mats, range.start, range.size());
+    }
 }
 
 bool imencode( const String& ext, InputArray _image,
@@ -1317,10 +1338,10 @@ Mat ImageCollection::Impl::readData() {
             success = true;
     }
     catch (const cv::Exception &e) {
-        std::cerr << "ImageCollection class: can't read data: " << e.what() << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "ImageCollection class: can't read data: " << e.what());
     }
     catch (...) {
-        std::cerr << "ImageCollection class:: can't read data: unknown exception" << std::endl << std::flush;
+        CV_LOG_ERROR(NULL, "ImageCollection class:: can't read data: unknown exception");
     }
     if (!success)
         return cv::Mat();
diff --git a/modules/imgcodecs/test/test_avif.cpp b/modules/imgcodecs/test/test_avif.cpp
index 99b8f7769c8e..72b7f54feaa5 100644
--- a/modules/imgcodecs/test/test_avif.cpp
+++ b/modules/imgcodecs/test/test_avif.cpp
@@ -166,7 +166,7 @@ TEST_P(Imgcodecs_Avif_Image_EncodeDecodeSuite, imencode_imdecode) {
                  cv::Exception);
     return;
   }
-  bool result;
+  bool result = true;
   EXPECT_NO_THROW(
       result = cv::imencode(".avif", img_original, buf, encoding_params_););
   EXPECT_TRUE(result);
diff --git a/modules/imgcodecs/test/test_grfmt.cpp b/modules/imgcodecs/test/test_grfmt.cpp
index 4ea3716d320e..1e0bf47b4713 100644
--- a/modules/imgcodecs/test/test_grfmt.cpp
+++ b/modules/imgcodecs/test/test_grfmt.cpp
@@ -87,11 +87,17 @@ const string all_images[] =
     "readwrite/uint16-mono2.dcm",
     "readwrite/uint8-rgb.dcm",
 #endif
+#if defined(HAVE_PNG) || defined(HAVE_SPNG)
     "readwrite/color_palette_alpha.png",
+#endif
+#ifdef HAVE_TIFF
     "readwrite/multipage.tif",
+#endif
     "readwrite/ordinary.bmp",
     "readwrite/rle8.bmp",
+#ifdef HAVE_JPEG
     "readwrite/test_1_c1.jpg",
+#endif
 #ifdef HAVE_IMGCODEC_HDR
     "readwrite/rle.hdr"
 #endif
@@ -482,6 +488,19 @@ TEST(Imgcodecs, write_parameter_type)
     EXPECT_EQ(0, remove(tmp_file.c_str()));
 }
 
+TEST(Imgcodecs, imdecode_user_buffer)
+{
+    cv::Mat encoded = cv::Mat::zeros(1, 1024, CV_8UC1);
+    cv::Mat user_buffer(1, 1024, CV_8UC1);
+    cv::Mat result = cv::imdecode(encoded, IMREAD_ANYCOLOR, &user_buffer);
+    EXPECT_TRUE(result.empty());
+    // the function does not release user-provided buffer
+    EXPECT_FALSE(user_buffer.empty());
+
+    result = cv::imdecode(encoded, IMREAD_ANYCOLOR);
+    EXPECT_TRUE(result.empty());
+}
+
 }} // namespace
 
 #if defined(HAVE_OPENEXR) && defined(OPENCV_IMGCODECS_ENABLE_OPENEXR_TESTS)
diff --git a/modules/imgcodecs/test/test_jpeg.cpp b/modules/imgcodecs/test/test_jpeg.cpp
index b7932a00200d..ee9da01aa79e 100644
--- a/modules/imgcodecs/test/test_jpeg.cpp
+++ b/modules/imgcodecs/test/test_jpeg.cpp
@@ -7,6 +7,10 @@ namespace opencv_test { namespace {
 
 #ifdef HAVE_JPEG
 
+extern "C" {
+#include "jpeglib.h"
+}
+
 /**
  * Test for check whether reading exif orientation tag was processed successfully or not
  * The test info is the set of 8 images named testExifRotate_{1 to 8}.jpg
@@ -178,6 +182,44 @@ TEST(Imgcodecs_Jpeg, encode_decode_rst_jpeg)
     EXPECT_EQ(0, remove(output_normal.c_str()));
 }
 
+// See https://github.com/opencv/opencv/issues/25274
+typedef testing::TestWithParam<int> Imgcodecs_Jpeg_decode_cmyk;
+TEST_P(Imgcodecs_Jpeg_decode_cmyk, regression25274)
+{
+    const int imread_flag = GetParam();
+
+    /*
+     * "test_1_c4.jpg" is CMYK-JPEG.
+     * $ convert test_1_c3.jpg -colorspace CMYK test_1_c4.jpg
+     * $ identify test_1_c4.jpg
+     * test_1_c4.jpg JPEG 480x640 480x640+0+0 8-bit CMYK 11240B 0.000u 0:00.000
+     */
+
+    cvtest::TS& ts = *cvtest::TS::ptr();
+
+    string  rgb_filename  = string(ts.get_data_path()) + "readwrite/test_1_c3.jpg";
+    cv::Mat rgb_img       = cv::imread(rgb_filename, imread_flag);
+    ASSERT_FALSE(rgb_img.empty());
+
+    string  cmyk_filename = string(ts.get_data_path()) + "readwrite/test_1_c4.jpg";
+    cv::Mat cmyk_img      = cv::imread(cmyk_filename, imread_flag);
+    ASSERT_FALSE(cmyk_img.empty());
+
+    EXPECT_EQ(rgb_img.size(), cmyk_img.size());
+    EXPECT_EQ(rgb_img.type(), cmyk_img.type());
+
+    // Jpeg is lossy compression.
+    // There may be small differences in decoding results by environments.
+    // -> 255 * 1% = 2.55 .
+    EXPECT_LE(cvtest::norm(rgb_img, cmyk_img, NORM_INF), 3); // norm() <= 3
+}
+
+INSTANTIATE_TEST_CASE_P( /* nothing */,
+                        Imgcodecs_Jpeg_decode_cmyk,
+                        testing::Values(cv::IMREAD_COLOR,
+                                        cv::IMREAD_GRAYSCALE,
+                                        cv::IMREAD_ANYCOLOR));
+
 //==================================================================================================
 
 static const uint32_t default_sampling_factor = static_cast<uint32_t>(0x221111);
@@ -270,6 +312,64 @@ TEST(Imgcodecs_Jpeg, encode_subsamplingfactor_usersetting_invalid)
     }
 }
 
+//==================================================================================================
+// See https://github.com/opencv/opencv/issues/25646
+typedef testing::TestWithParam<std::tuple<int, int>> Imgcodecs_Jpeg_encode_withLumaChromaQuality;
+
+TEST_P(Imgcodecs_Jpeg_encode_withLumaChromaQuality, basic)
+{
+    const int luma   = get<0>(GetParam());
+    const int chroma = get<1>(GetParam());
+
+    cvtest::TS& ts = *cvtest::TS::ptr();
+    string fname = string(ts.get_data_path()) + "../cv/shared/lena.png";
+
+    cv::Mat src = imread(fname, cv::IMREAD_COLOR);
+    ASSERT_FALSE(src.empty());
+
+    std::vector<uint8_t> jpegNormal;
+    ASSERT_NO_THROW(cv::imencode(".jpg", src, jpegNormal));
+
+    std::vector<int> param;
+    param.push_back(IMWRITE_JPEG_LUMA_QUALITY);
+    param.push_back(luma);
+    param.push_back(IMWRITE_JPEG_CHROMA_QUALITY);
+    param.push_back(chroma);
+
+    std::vector<uint8_t> jpegCustom;
+    ASSERT_NO_THROW(cv::imencode(".jpg", src, jpegCustom, param));
+
+#if JPEG_LIB_VERSION >= 70
+    // For jpeg7+, we can support IMWRITE_JPEG_LUMA_QUALITY and IMWRITE_JPEG_CHROMA_QUALITY.
+    if( (luma == 95 /* Default Luma Quality */ ) && ( chroma == 95 /* Default Chroma Quality */))
+    {
+        EXPECT_EQ(jpegNormal, jpegCustom);
+    }
+    else
+    {
+        EXPECT_NE(jpegNormal, jpegCustom);
+    }
+#else
+    // For jpeg6-, we cannot support IMWRITE_JPEG_LUMA/CHROMA_QUALITY because jpeg_default_qtables() is missing.
+    // - IMWRITE_JPEG_LUMA_QUALITY updates internal parameter of IMWRITE_JPEG_QUALITY.
+    // - IMWRITE_JPEG_CHROMA_QUALITY updates nothing.
+    if( luma == 95 /* Default Jpeg Quality */ )
+    {
+        EXPECT_EQ(jpegNormal, jpegCustom);
+    }
+    else
+    {
+        EXPECT_NE(jpegNormal, jpegCustom);
+    }
+#endif
+}
+
+INSTANTIATE_TEST_CASE_P( /* nothing */,
+                        Imgcodecs_Jpeg_encode_withLumaChromaQuality,
+                        testing::Combine(
+                            testing::Values(70, 95, 100),    // IMWRITE_JPEG_LUMA_QUALITY
+                            testing::Values(70, 95, 100) )); // IMWRITE_JPEG_CHROMA_QUALITY
+
 #endif // HAVE_JPEG
 
 }} // namespace
diff --git a/modules/imgcodecs/test/test_png.cpp b/modules/imgcodecs/test/test_png.cpp
index cdc7da39b282..655c59430d4b 100644
--- a/modules/imgcodecs/test/test_png.cpp
+++ b/modules/imgcodecs/test/test_png.cpp
@@ -7,6 +7,19 @@ namespace opencv_test { namespace {
 
 #if defined(HAVE_PNG) || defined(HAVE_SPNG)
 
+TEST(Imgcodecs_Png, imread_passing_mat)
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string imgName = root + "../cv/shared/lena.png";
+
+    Mat ref = imread(imgName);
+    Mat img(ref.size(), ref.type());
+    void* ptr = img.data;
+    imread(imgName, img);
+    EXPECT_EQ(cv::norm(ref, img, NORM_INF), 0);
+    EXPECT_EQ(img.data, ptr);
+}
+
 TEST(Imgcodecs_Png, write_big)
 {
     const string root = cvtest::TS::ptr()->get_data_path();
diff --git a/modules/imgcodecs/test/test_read_write.cpp b/modules/imgcodecs/test/test_read_write.cpp
index 2320147a4ecc..39c02ca95cea 100644
--- a/modules/imgcodecs/test/test_read_write.cpp
+++ b/modules/imgcodecs/test/test_read_write.cpp
@@ -303,6 +303,7 @@ TEST(Imgcodecs_Image, write_umat)
     EXPECT_EQ(0, remove(dst_name.c_str()));
 }
 
+#ifdef HAVE_TIFF
 TEST(Imgcodecs_Image, multipage_collection_size)
 {
     const string root = cvtest::TS::ptr()->get_data_path();
@@ -479,6 +480,7 @@ TEST(ImgCodecs, multipage_collection_two_iterator_operatorpp)
          EXPECT_TRUE(cv::norm(img1, img[i], NORM_INF) == 0);
     }
 }
+#endif
 
 
 TEST(Imgcodecs_Params, imwrite_regression_22752)
diff --git a/modules/imgcodecs/test/test_tiff.cpp b/modules/imgcodecs/test/test_tiff.cpp
index 3aea5b85d5aa..ee40c54b59dd 100644
--- a/modules/imgcodecs/test/test_tiff.cpp
+++ b/modules/imgcodecs/test/test_tiff.cpp
@@ -9,43 +9,6 @@ namespace opencv_test { namespace {
 
 #ifdef HAVE_TIFF
 
-// these defines are used to resolve conflict between tiff.h and opencv2/core/types_c.h
-#define uint64 uint64_hack_
-#define int64 int64_hack_
-#include "tiff.h"
-
-// Re-define Mat type as enum for showing on Google Test.
-enum CV_ddtCn{
-  _CV_8UC1  = CV_8UC1,  _CV_8UC3  = CV_8UC3,  _CV_8UC4  = CV_8UC4,
-  _CV_8SC1  = CV_8SC1,  _CV_8SC3  = CV_8SC3,  _CV_8SC4  = CV_8SC4,
-  _CV_16UC1 = CV_16UC1, _CV_16UC3 = CV_16UC3, _CV_16UC4 = CV_16UC4,
-  _CV_16SC1 = CV_16SC1, _CV_16SC3 = CV_16SC3, _CV_16SC4 = CV_16SC4,
-  _CV_32SC1 = CV_32SC1, _CV_32SC3 = CV_32SC3, _CV_32SC4 = CV_32SC4,
-  _CV_16FC1 = CV_16FC1, _CV_16FC3 = CV_16FC3, _CV_16FC4 = CV_16FC4,
-  _CV_32FC1 = CV_32FC1, _CV_32FC3 = CV_32FC3, _CV_32FC4 = CV_32FC4,
-  _CV_64FC1 = CV_64FC1, _CV_64FC3 = CV_64FC3, _CV_64FC4 = CV_64FC4,
-};
-
-static inline
-void PrintTo(const CV_ddtCn& val, std::ostream* os)
-{
-    const int val_type = static_cast<int>(val);
-
-    switch ( CV_MAT_DEPTH(val_type) )
-    {
-    case CV_8U  : *os << "CV_8U" ; break;
-    case CV_16U : *os << "CV_16U" ; break;
-    case CV_8S  : *os << "CV_8S" ; break;
-    case CV_16S : *os << "CV_16S" ; break;
-    case CV_32S : *os << "CV_32S" ; break;
-    case CV_16F : *os << "CV_16F" ; break;
-    case CV_32F : *os << "CV_32F" ; break;
-    case CV_64F : *os << "CV_64F" ; break;
-    default     : *os << "CV_???" ; break;
-    }
-    *os << "C" << CV_MAT_CN(val_type);
-}
-
 #ifdef __ANDROID__
 // Test disabled as it uses a lot of memory.
 // It is killed with SIGKILL by out of memory killer.
@@ -60,7 +23,7 @@ TEST(Imgcodecs_Tiff, decode_tile16384x16384)
     string file4 = cv::tempfile(".tiff");
 
     std::vector<int> params;
-    params.push_back(TIFFTAG_ROWSPERSTRIP);
+    params.push_back(IMWRITE_TIFF_ROWSPERSTRIP);
     params.push_back(big.rows);
     EXPECT_NO_THROW(cv::imwrite(file4, big, params));
     EXPECT_NO_THROW(cv::imwrite(file3, big.colRange(0, big.cols - 1), params));
@@ -722,15 +685,22 @@ TEST(Imgcodecs_Tiff, readWrite_unsigned)
     const string root = cvtest::TS::ptr()->get_data_path();
     const string filenameInput = root + "readwrite/gray_8u.tif";
     const string filenameOutput = cv::tempfile(".tiff");
-    const Mat img = cv::imread(filenameInput, IMREAD_UNCHANGED);
+
+    Mat img;
+    ASSERT_NO_THROW(img = cv::imread(filenameInput, IMREAD_UNCHANGED));
     ASSERT_FALSE(img.empty());
     ASSERT_EQ(CV_8UC1, img.type());
 
     Mat matS8;
     img.convertTo(matS8, CV_8SC1);
 
-    ASSERT_TRUE(cv::imwrite(filenameOutput, matS8));
-    const Mat img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED);
+    bool ret_imwrite = false;
+    ASSERT_NO_THROW(ret_imwrite = cv::imwrite(filenameOutput, matS8));
+    ASSERT_TRUE(ret_imwrite);
+
+    Mat img2;
+    ASSERT_NO_THROW(img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED));
+    ASSERT_FALSE(img2.empty());
     ASSERT_EQ(img2.type(), matS8.type());
     ASSERT_EQ(img2.size(), matS8.size());
     EXPECT_LE(cvtest::norm(matS8, img2, NORM_INF | NORM_RELATIVE), 1e-3);
@@ -742,12 +712,19 @@ TEST(Imgcodecs_Tiff, readWrite_32FC1)
     const string root = cvtest::TS::ptr()->get_data_path();
     const string filenameInput = root + "readwrite/test32FC1.tiff";
     const string filenameOutput = cv::tempfile(".tiff");
-    const Mat img = cv::imread(filenameInput, IMREAD_UNCHANGED);
+
+    Mat img;
+    ASSERT_NO_THROW(img = cv::imread(filenameInput, IMREAD_UNCHANGED));
     ASSERT_FALSE(img.empty());
     ASSERT_EQ(CV_32FC1,img.type());
 
-    ASSERT_TRUE(cv::imwrite(filenameOutput, img));
-    const Mat img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED);
+    bool ret_imwrite = false;
+    ASSERT_NO_THROW(ret_imwrite = cv::imwrite(filenameOutput, img));
+    ASSERT_TRUE(ret_imwrite);
+
+    Mat img2;
+    ASSERT_NO_THROW(img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED));
+    ASSERT_FALSE(img2.empty());
     ASSERT_EQ(img2.type(), img.type());
     ASSERT_EQ(img2.size(), img.size());
     EXPECT_LE(cvtest::norm(img, img2, NORM_INF | NORM_RELATIVE), 1e-3);
@@ -759,12 +736,19 @@ TEST(Imgcodecs_Tiff, readWrite_64FC1)
     const string root = cvtest::TS::ptr()->get_data_path();
     const string filenameInput = root + "readwrite/test64FC1.tiff";
     const string filenameOutput = cv::tempfile(".tiff");
-    const Mat img = cv::imread(filenameInput, IMREAD_UNCHANGED);
+
+    Mat img;
+    ASSERT_NO_THROW(img = cv::imread(filenameInput, IMREAD_UNCHANGED));
     ASSERT_FALSE(img.empty());
     ASSERT_EQ(CV_64FC1, img.type());
 
-    ASSERT_TRUE(cv::imwrite(filenameOutput, img));
-    const Mat img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED);
+    bool ret_imwrite = false;
+    ASSERT_NO_THROW(ret_imwrite = cv::imwrite(filenameOutput, img));
+    ASSERT_TRUE(ret_imwrite);
+
+    Mat img2;
+    ASSERT_NO_THROW(img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED));
+    ASSERT_FALSE(img2.empty());
     ASSERT_EQ(img2.type(), img.type());
     ASSERT_EQ(img2.size(), img.size());
     EXPECT_LE(cvtest::norm(img, img2, NORM_INF | NORM_RELATIVE), 1e-3);
@@ -776,12 +760,19 @@ TEST(Imgcodecs_Tiff, readWrite_32FC3_SGILOG)
     const string root = cvtest::TS::ptr()->get_data_path();
     const string filenameInput = root + "readwrite/test32FC3_sgilog.tiff";
     const string filenameOutput = cv::tempfile(".tiff");
-    const Mat img = cv::imread(filenameInput, IMREAD_UNCHANGED);
+
+    Mat img;
+    ASSERT_NO_THROW(img = cv::imread(filenameInput, IMREAD_UNCHANGED));
     ASSERT_FALSE(img.empty());
     ASSERT_EQ(CV_32FC3, img.type());
 
-    ASSERT_TRUE(cv::imwrite(filenameOutput, img));
-    const Mat img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED);
+    bool ret_imwrite = false;
+    ASSERT_NO_THROW(ret_imwrite = cv::imwrite(filenameOutput, img));
+    ASSERT_TRUE(ret_imwrite);
+
+    Mat img2;
+    ASSERT_NO_THROW(img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED));
+    ASSERT_FALSE(img2.empty());
     ASSERT_EQ(img2.type(), img.type());
     ASSERT_EQ(img2.size(), img.size());
     EXPECT_LE(cvtest::norm(img, img2, NORM_INF | NORM_RELATIVE), 0.01);
@@ -793,16 +784,23 @@ TEST(Imgcodecs_Tiff, readWrite_32FC3_RAW)
     const string root = cvtest::TS::ptr()->get_data_path();
     const string filenameInput = root + "readwrite/test32FC3_raw.tiff";
     const string filenameOutput = cv::tempfile(".tiff");
-    const Mat img = cv::imread(filenameInput, IMREAD_UNCHANGED);
+
+    Mat img;
+    ASSERT_NO_THROW(img = cv::imread(filenameInput, IMREAD_UNCHANGED));
     ASSERT_FALSE(img.empty());
     ASSERT_EQ(CV_32FC3, img.type());
 
     std::vector<int> params;
     params.push_back(IMWRITE_TIFF_COMPRESSION);
-    params.push_back(1/*COMPRESSION_NONE*/);
+    params.push_back(IMWRITE_TIFF_COMPRESSION_NONE);
 
-    ASSERT_TRUE(cv::imwrite(filenameOutput, img, params));
-    const Mat img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED);
+    bool ret_imwrite = false;
+    ASSERT_NO_THROW(ret_imwrite = cv::imwrite(filenameOutput, img, params));
+    ASSERT_TRUE(ret_imwrite);
+
+    Mat img2;
+    ASSERT_NO_THROW(img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED));
+    ASSERT_FALSE(img2.empty());
     ASSERT_EQ(img2.type(), img.type());
     ASSERT_EQ(img2.size(), img.size());
     EXPECT_LE(cvtest::norm(img, img2, NORM_INF | NORM_RELATIVE), 1e-3);
@@ -814,7 +812,8 @@ TEST(Imgcodecs_Tiff, read_palette_color_image)
     const string root = cvtest::TS::ptr()->get_data_path();
     const string filenameInput = root + "readwrite/test_palette_color_image.tif";
 
-    const Mat img = cv::imread(filenameInput, IMREAD_UNCHANGED);
+    Mat img;
+    ASSERT_NO_THROW(img = cv::imread(filenameInput, IMREAD_UNCHANGED));
     ASSERT_FALSE(img.empty());
     ASSERT_EQ(CV_8UC3, img.type());
 }
@@ -824,7 +823,8 @@ TEST(Imgcodecs_Tiff, read_4_bit_palette_color_image)
     const string root = cvtest::TS::ptr()->get_data_path();
     const string filenameInput = root + "readwrite/4-bit_palette_color.tif";
 
-    const Mat img = cv::imread(filenameInput, IMREAD_UNCHANGED);
+    Mat img;
+    ASSERT_NO_THROW(img = cv::imread(filenameInput, IMREAD_UNCHANGED));
     ASSERT_FALSE(img.empty());
     ASSERT_EQ(CV_8UC3, img.type());
 }
@@ -848,22 +848,26 @@ TEST(Imgcodecs_Tiff, readWrite_predictor)
 
     cv::Mat mat(10, 16, CV_8UC1, (void*)sample_data);
     int methods[] = {
-        COMPRESSION_NONE,     COMPRESSION_LZW,
-        COMPRESSION_PACKBITS, COMPRESSION_DEFLATE,  COMPRESSION_ADOBE_DEFLATE
+        IMWRITE_TIFF_COMPRESSION_NONE,     IMWRITE_TIFF_COMPRESSION_LZW,
+        IMWRITE_TIFF_COMPRESSION_PACKBITS, IMWRITE_TIFF_COMPRESSION_DEFLATE,
+        IMWRITE_TIFF_COMPRESSION_ADOBE_DEFLATE
     };
     for (size_t i = 0; i < sizeof(methods) / sizeof(int); i++)
     {
         string out = cv::tempfile(".tif");
 
         std::vector<int> params;
-        params.push_back(TIFFTAG_COMPRESSION);
+        params.push_back(IMWRITE_TIFF_COMPRESSION);
         params.push_back(methods[i]);
-        params.push_back(TIFFTAG_PREDICTOR);
-        params.push_back(PREDICTOR_HORIZONTAL);
+        params.push_back(IMWRITE_TIFF_PREDICTOR);
+        params.push_back(IMWRITE_TIFF_PREDICTOR_HORIZONTAL);
 
-        EXPECT_NO_THROW(cv::imwrite(out, mat, params));
+        bool ret_imwrite = false;
+        ASSERT_NO_THROW(ret_imwrite = cv::imwrite(out, mat, params));
+        ASSERT_TRUE(ret_imwrite);
 
-        const Mat img = cv::imread(out, IMREAD_UNCHANGED);
+        Mat img;
+        ASSERT_NO_THROW(img = cv::imread(out, IMREAD_UNCHANGED));
         ASSERT_FALSE(img.empty());
 
         ASSERT_EQ(0, cv::norm(mat, img, cv::NORM_INF));
@@ -874,7 +878,7 @@ TEST(Imgcodecs_Tiff, readWrite_predictor)
 
 // See https://github.com/opencv/opencv/issues/23416
 
-typedef std::pair<CV_ddtCn,bool> Imgcodes_Tiff_TypeAndComp;
+typedef std::pair<perf::MatType,bool> Imgcodes_Tiff_TypeAndComp;
 typedef testing::TestWithParam< Imgcodes_Tiff_TypeAndComp > Imgcodecs_Tiff_Types;
 
 TEST_P(Imgcodecs_Tiff_Types, readWrite_alltypes)
@@ -895,7 +899,7 @@ TEST_P(Imgcodecs_Tiff_Types, readWrite_alltypes)
     {
         std::vector<int> params;
         params.push_back(IMWRITE_TIFF_COMPRESSION);
-        params.push_back(COMPRESSION_LZW);
+        params.push_back(IMWRITE_TIFF_COMPRESSION_LZW);
         ASSERT_NO_THROW(cv::imencode(".tiff", src, bufLZW, params));
 
         Mat dstLZW;
@@ -910,7 +914,7 @@ TEST_P(Imgcodecs_Tiff_Types, readWrite_alltypes)
     {
         std::vector<int> params;
         params.push_back(IMWRITE_TIFF_COMPRESSION);
-        params.push_back(COMPRESSION_NONE);
+        params.push_back(IMWRITE_TIFF_COMPRESSION_NONE);
         ASSERT_NO_THROW(cv::imencode(".tiff", src, bufRAW, params));
 
         Mat dstRAW;
@@ -925,13 +929,13 @@ TEST_P(Imgcodecs_Tiff_Types, readWrite_alltypes)
 }
 
 Imgcodes_Tiff_TypeAndComp all_types[] = {
-    { _CV_8UC1,  true  }, { _CV_8UC3,  true  }, { _CV_8UC4,  true  },
-    { _CV_8SC1,  true  }, { _CV_8SC3,  true  }, { _CV_8SC4,  true  },
-    { _CV_16UC1, true  }, { _CV_16UC3, true  }, { _CV_16UC4, true  },
-    { _CV_16SC1, true  }, { _CV_16SC3, true  }, { _CV_16SC4, true  },
-    { _CV_32SC1, true  }, { _CV_32SC3, true  }, { _CV_32SC4, true  },
-    { _CV_32FC1, false }, { _CV_32FC3, false }, { _CV_32FC4, false }, // No compression
-    { _CV_64FC1, false }, { _CV_64FC3, false }, { _CV_64FC4, false }  // No compression
+    { CV_8UC1,  true  }, { CV_8UC3,  true  }, { CV_8UC4,  true  },
+    { CV_8SC1,  true  }, { CV_8SC3,  true  }, { CV_8SC4,  true  },
+    { CV_16UC1, true  }, { CV_16UC3, true  }, { CV_16UC4, true  },
+    { CV_16SC1, true  }, { CV_16SC3, true  }, { CV_16SC4, true  },
+    { CV_32SC1, true  }, { CV_32SC3, true  }, { CV_32SC4, true  },
+    { CV_32FC1, false }, { CV_32FC3, false }, { CV_32FC4, false }, // No compression
+    { CV_64FC1, false }, { CV_64FC3, false }, { CV_64FC4, false }  // No compression
 };
 
 INSTANTIATE_TEST_CASE_P(AllTypes, Imgcodecs_Tiff_Types, testing::ValuesIn(all_types));
@@ -965,7 +969,7 @@ TEST_P(Imgcodecs_Tiff_Modes, decode_multipage)
     }
 }
 
-TEST_P(Imgcodecs_Tiff_Modes, decode_multipage_use_memory_buffer)
+TEST_P(Imgcodecs_Tiff_Modes, decode_multipage_use_memory_buffer_all_pages)
 {
     const int mode = GetParam();
     const string root = cvtest::TS::ptr()->get_data_path();
@@ -984,13 +988,14 @@ TEST_P(Imgcodecs_Tiff_Modes, decode_multipage_use_memory_buffer)
     FILE* fp = fopen(filename.c_str(), "rb");
     ASSERT_TRUE(fp != NULL);
     fseek(fp, 0, SEEK_END);
-    long pos = ftell(fp);
-
-    std::vector<uchar> buf;
-    buf.resize((size_t)pos);
+    const size_t file_size = ftell(fp);
     fseek(fp, 0, SEEK_SET);
-    buf.resize(fread(&buf[0], 1, buf.size(), fp));
+
+    std::vector<uchar> buf(file_size);
+    const size_t actual_read = fread(&buf[0], 1, file_size, fp);
     fclose(fp);
+    ASSERT_EQ(file_size, actual_read);
+    ASSERT_EQ(file_size, static_cast<size_t>(buf.size()));
 
     bool res = imdecodemulti(buf, mode, pages);
     ASSERT_TRUE(res == true);
@@ -1002,6 +1007,60 @@ TEST_P(Imgcodecs_Tiff_Modes, decode_multipage_use_memory_buffer)
     }
 }
 
+TEST_P(Imgcodecs_Tiff_Modes, decode_multipage_use_memory_buffer_selected_pages)
+{
+    const int mode = GetParam();
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string filename = root + "readwrite/multipage.tif";
+    const string page_files[] = {
+        "readwrite/multipage_p1.tif",
+        "readwrite/multipage_p2.tif",
+        "readwrite/multipage_p3.tif",
+        "readwrite/multipage_p4.tif",
+        "readwrite/multipage_p5.tif",
+        "readwrite/multipage_p6.tif"
+    };
+    const size_t page_count = sizeof(page_files) / sizeof(page_files[0]);
+
+    FILE* fp = fopen(filename.c_str(), "rb");
+    ASSERT_TRUE(fp != NULL);
+    fseek(fp, 0, SEEK_END);
+    const size_t file_size = ftell(fp);
+    fseek(fp, 0, SEEK_SET);
+
+    std::vector<uchar> buf(file_size);
+    const size_t actual_read = fread(&buf[0], 1, file_size, fp);
+    fclose(fp);
+    ASSERT_EQ(file_size, actual_read);
+    ASSERT_EQ(file_size, static_cast<size_t>(buf.size()));
+
+    const Range range(1, page_count - 1);
+    ASSERT_GE(range.size(), 1);
+
+    vector<Mat> middle_pages_from_imread;
+    for (int page_i = range.start; page_i < range.end; page_i++)
+    {
+        const Mat page = imread(root + page_files[page_i], mode);
+        middle_pages_from_imread.push_back(page);
+    }
+    ASSERT_EQ(
+        static_cast<size_t>(range.size()),
+        static_cast<size_t>(middle_pages_from_imread.size())
+    );
+
+    vector<Mat> middle_pages_from_imdecodemulti;
+    const bool res = imdecodemulti(buf, mode, middle_pages_from_imdecodemulti, range);
+    ASSERT_TRUE(res == true);
+    EXPECT_EQ(middle_pages_from_imread.size(), middle_pages_from_imdecodemulti.size());
+
+    for (int i = 0, e = range.size(); i < e; i++)
+    {
+        EXPECT_PRED_FORMAT2(cvtest::MatComparator(0, 0),
+            middle_pages_from_imread[i],
+            middle_pages_from_imdecodemulti[i]);
+    }
+}
+
 const int all_modes[] =
 {
     IMREAD_UNCHANGED,
@@ -1045,6 +1104,7 @@ TEST(Imgcodecs_Tiff_Modes, write_multipage)
     {
         EXPECT_PRED_FORMAT2(cvtest::MatComparator(0, 0), read_pages[i], pages[i]);
     }
+    EXPECT_EQ(0, remove(tmp_filename.c_str()));
 }
 
 //==================================================================================================
diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt
index 8ee300c32043..10aed6bedd8d 100644
--- a/modules/imgproc/CMakeLists.txt
+++ b/modules/imgproc/CMakeLists.txt
@@ -12,8 +12,10 @@ ocv_add_dispatched_file(smooth SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(sumpixels SSE2 AVX2 AVX512_SKX)
 ocv_define_module(imgproc opencv_core WRAP java objc python js)
 
-ocv_check_environment_variables(OPENCV_IPP_GAUSSIAN_BLUR)
-option(OPENCV_IPP_GAUSSIAN_BLUR "Enable IPP optimizations for GaussianBlur (+8Mb in binary size)" OFF)
-if(OPENCV_IPP_GAUSSIAN_BLUR)
-  ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/smooth.dispatch.cpp "ENABLE_IPP_GAUSSIAN_BLUR=1")
+if(HAVE_IPP)
+  # OPENCV_IPP_ENABLE_ALL is defined in modules/core/CMakeList.txt
+  OCV_OPTION(OPENCV_IPP_GAUSSIAN_BLUR "Enable IPP optimizations for GaussianBlur (+8Mb in binary size)" OPENCV_IPP_ENABLE_ALL)
+  if(OPENCV_IPP_GAUSSIAN_BLUR)
+    ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/smooth.dispatch.cpp "ENABLE_IPP_GAUSSIAN_BLUR=1")
+  endif()
 endif()
diff --git a/modules/imgproc/doc/colors.markdown b/modules/imgproc/doc/colors.markdown
index c1c2abfce735..97d0907a6248 100644
--- a/modules/imgproc/doc/colors.markdown
+++ b/modules/imgproc/doc/colors.markdown
@@ -1,11 +1,13 @@
 Color conversions {#imgproc_color_conversions}
 =================
+
 See cv::cvtColor and cv::ColorConversionCodes
+
 @todo document other conversion modes
 
 @anchor color_convert_rgb_gray
-RGB \f$\leftrightarrow\f$ GRAY
-------------------------------
+RGB <-> GRAY
+------------
 Transformations within RGB space like adding/removing the alpha channel, reversing the channel
 order, conversion to/from 16-bit RGB color (R5:G6:B5 or R5:G5:B5), as well as conversion
 to/from grayscale using:
@@ -20,8 +22,8 @@ More advanced channel reordering can also be done with cv::mixChannels.
 @see cv::COLOR_BGR2GRAY, cv::COLOR_RGB2GRAY, cv::COLOR_GRAY2BGR, cv::COLOR_GRAY2RGB
 
 @anchor color_convert_rgb_xyz
-RGB \f$\leftrightarrow\f$ CIE XYZ.Rec 709 with D65 white point
---------------------------------------------------------------
+RGB <-> CIE XYZ.Rec 709 with D65 white point
+--------------------------------------------
 \f[\begin{bmatrix} X  \\ Y  \\ Z
   \end{bmatrix} \leftarrow \begin{bmatrix} 0.412453 & 0.357580 & 0.180423 \\ 0.212671 & 0.715160 & 0.072169 \\ 0.019334 & 0.119193 & 0.950227
   \end{bmatrix} \cdot \begin{bmatrix} R  \\ G  \\ B
@@ -35,8 +37,8 @@ RGB \f$\leftrightarrow\f$ CIE XYZ.Rec 709 with D65 white point
 @see cv::COLOR_BGR2XYZ, cv::COLOR_RGB2XYZ, cv::COLOR_XYZ2BGR, cv::COLOR_XYZ2RGB
 
 @anchor color_convert_rgb_ycrcb
-RGB \f$\leftrightarrow\f$ YCrCb JPEG (or YCC)
----------------------------------------------
+RGB <-> YCrCb JPEG (or YCC)
+---------------------------
 \f[Y  \leftarrow 0.299  \cdot R + 0.587  \cdot G + 0.114  \cdot B\f]
 \f[Cr  \leftarrow (R-Y)  \cdot 0.713 + delta\f]
 \f[Cb  \leftarrow (B-Y)  \cdot 0.564 + delta\f]
@@ -48,9 +50,40 @@ where
 Y, Cr, and Cb cover the whole value range.
 @see cv::COLOR_BGR2YCrCb, cv::COLOR_RGB2YCrCb, cv::COLOR_YCrCb2BGR, cv::COLOR_YCrCb2RGB
 
+@anchor color_convert_rgb_yuv_42x
+RGB <-> YUV with subsampling
+------------------------------
+Only 8-bit values are supported.
+The coefficients correspond to BT.601 standard with resulting values Y [16, 235], U and V [16, 240] centered at 128.
+
+Two subsampling schemes are supported: 4:2:0 (Fourcc codes NV12, NV21, YV12, I420 and synonimic)
+and 4:2:2 (Fourcc codes UYVY, YUY2, YVYU and synonimic).
+
+In both subsampling schemes Y values are written for each pixel so that Y plane is in fact a scaled and biased gray version
+of a source image.
+
+In 4:2:0 scheme U and V values are averaged over 2x2 squares, i.e. only 1 U and 1 V value is saved per each 4 pixels.
+U and V values are saved interleaved into a separate plane (NV12, NV21) or into two separate semi-planes (YV12, I420).
+
+In 4:2:2 scheme U and V values are averaged horizontally over each pair of pixels, i.e. only 1 U and 1 V value is saved
+per each 2 pixels. U and V values are saved interleaved with Y values for both pixels according to its Fourcc code.
+
+Note that different conversions are perfomed with different precision for speed or compatibility purposes. For example,
+RGB to YUV 4:2:2 is converted using 14-bit fixed-point arithmetics while other conversions use 20 bits.
+
+\f[R \leftarrow 1.164 \cdot (Y - 16) + 1.596 \cdot (V - 128)\f]
+\f[G \leftarrow 1.164 \cdot (Y - 16) - 0.813 \cdot (V - 128) - 0.391 \cdot (U - 128)\f]
+\f[B \leftarrow 1.164 \cdot (Y - 16) + 2.018 \cdot (U - 128)\f]
+
+\f[Y \leftarrow (R \cdot 0.299 + G \cdot 0.587 + B \cdot 0.114) \cdot \frac{236 - 16}{256} + 16 \f]
+\f[U \leftarrow -0.148 \cdot R_{avg} - 0.291 \cdot G_{avg} + 0.439 \cdot B_{avg} + 128 \f]
+\f[V \leftarrow  0.439 \cdot R_{avg} - 0.368 \cdot G_{avg} - 0.071 \cdot B_{avg} + 128 \f]
+
+@see cv::COLOR_YUV2RGB_NV12, cv::COLOR_YUV2RGBA_YUY2, cv::COLOR_BGR2YUV_YV12 and similar ones
+
 @anchor color_convert_rgb_hsv
-RGB \f$\leftrightarrow\f$ HSV
------------------------------
+RGB <-> HSV
+-----------
 In case of 8-bit and 16-bit images, R, G, and B are converted to the floating-point format and
 scaled to fit the 0 to 1 range.
 
@@ -65,14 +98,14 @@ If \f$H<0\f$ then \f$H \leftarrow H+360\f$ . On output \f$0 \leq V \leq 1\f$, \f
 
 The values are then converted to the destination data type:
 - 8-bit images: \f$V  \leftarrow 255 V, S  \leftarrow 255 S, H  \leftarrow H/2  \text{(to fit to 0 to 255)}\f$
-- 16-bit images: (currently not supported) \f$V <- 65535 V, S <- 65535 S, H <- H\f$
+- 16-bit images: (currently not supported) \f$V \leftarrow 65535 V, S \leftarrow 65535 S, H \leftarrow H\f$
 - 32-bit images: H, S, and V are left as is
 
 @see cv::COLOR_BGR2HSV, cv::COLOR_RGB2HSV, cv::COLOR_HSV2BGR, cv::COLOR_HSV2RGB
 
 @anchor color_convert_rgb_hls
-RGB \f$\leftrightarrow\f$ HLS
------------------------------
+RGB <-> HLS
+-----------
 In case of 8-bit and 16-bit images, R, G, and B are converted to the floating-point format and
 scaled to fit the 0 to 1 range.
 
@@ -90,14 +123,14 @@ If \f$H<0\f$ then \f$H \leftarrow H+360\f$ . On output \f$0 \leq L \leq 1\f$, \f
 
 The values are then converted to the destination data type:
 - 8-bit images:  \f$V  \leftarrow 255 \cdot V, S  \leftarrow 255 \cdot S, H  \leftarrow H/2 \; \text{(to fit to 0 to 255)}\f$
-- 16-bit images: (currently not supported)  \f$V <- 65535 \cdot V, S <- 65535 \cdot S, H <- H\f$
+- 16-bit images: (currently not supported)  \f$V \leftarrow 65535 \cdot V, S \leftarrow 65535 \cdot S, H \leftarrow H\f$
 - 32-bit images: H, S, V are left as is
 
 @see cv::COLOR_BGR2HLS, cv::COLOR_RGB2HLS, cv::COLOR_HLS2BGR, cv::COLOR_HLS2RGB
 
 @anchor color_convert_rgb_lab
-RGB \f$\leftrightarrow\f$ CIE L\*a\*b\*
----------------------------------------
+RGB <-> CIE L\*a\*b\*
+---------------------
 In case of 8-bit and 16-bit images, R, G, and B are converted to the floating-point format and
 scaled to fit the 0 to 1 range.
 
@@ -121,8 +154,8 @@ are then converted to the destination data type:
 @see cv::COLOR_BGR2Lab, cv::COLOR_RGB2Lab, cv::COLOR_Lab2BGR, cv::COLOR_Lab2RGB
 
 @anchor color_convert_rgb_luv
-RGB \f$\leftrightarrow\f$ CIE L\*u\*v\*
----------------------------------------
+RGB <-> CIE L\*u\*v\*
+---------------------
 In case of 8-bit and 16-bit images, R, G, and B are converted to the floating-point format and
 scaled to fit 0 to 1 range.
 
@@ -148,8 +181,8 @@ sources on the web, primarily from the Charles Poynton site <http://www.poynton.
 @see cv::COLOR_BGR2Luv, cv::COLOR_RGB2Luv, cv::COLOR_Luv2BGR, cv::COLOR_Luv2RGB
 
 @anchor color_convert_bayer
-Bayer \f$\rightarrow\f$ RGB
----------------------------
+Bayer -> RGB
+------------
 The Bayer pattern is widely used in CCD and CMOS cameras. It enables you to get color pictures
 from a single plane where R, G, and B pixels (sensors of a particular component) are interleaved
 as follows:
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 13ce0fcac230..471a857f63fc 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -46,143 +46,143 @@
 #include "opencv2/core.hpp"
 
 /**
-  @defgroup imgproc Image Processing
+@defgroup imgproc Image Processing
 
 This module includes image-processing functions.
 
-  @{
+@{
     @defgroup imgproc_filter Image Filtering
 
-Functions and classes described in this section are used to perform various linear or non-linear
-filtering operations on 2D images (represented as Mat's). It means that for each pixel location
-\f$(x,y)\f$ in the source image (normally, rectangular), its neighborhood is considered and used to
-compute the response. In case of a linear filter, it is a weighted sum of pixel values. In case of
-morphological operations, it is the minimum or maximum values, and so on. The computed response is
-stored in the destination image at the same location \f$(x,y)\f$. It means that the output image
-will be of the same size as the input image. Normally, the functions support multi-channel arrays,
-in which case every channel is processed independently. Therefore, the output image will also have
-the same number of channels as the input one.
-
-Another common feature of the functions and classes described in this section is that, unlike
-simple arithmetic functions, they need to extrapolate values of some non-existing pixels. For
-example, if you want to smooth an image using a Gaussian \f$3 \times 3\f$ filter, then, when
-processing the left-most pixels in each row, you need pixels to the left of them, that is, outside
-of the image. You can let these pixels be the same as the left-most image pixels ("replicated
-border" extrapolation method), or assume that all the non-existing pixels are zeros ("constant
-border" extrapolation method), and so on. OpenCV enables you to specify the extrapolation method.
-For details, see #BorderTypes
-
-@anchor filter_depths
-### Depth combinations
-Input depth (src.depth()) | Output depth (ddepth)
---------------------------|----------------------
-CV_8U                     | -1/CV_16S/CV_32F/CV_64F
-CV_16U/CV_16S             | -1/CV_32F/CV_64F
-CV_32F                    | -1/CV_32F
-CV_64F                    | -1/CV_64F
-
-@note when ddepth=-1, the output image will have the same depth as the source.
-
-@note if you need double floating-point accuracy and using single floating-point input data
-(CV_32F input and CV_64F output depth combination), you can use @ref Mat.convertTo to convert
-the input data to the desired precision.
+    Functions and classes described in this section are used to perform various linear or non-linear
+    filtering operations on 2D images (represented as Mat's). It means that for each pixel location
+    \f$(x,y)\f$ in the source image (normally, rectangular), its neighborhood is considered and used to
+    compute the response. In case of a linear filter, it is a weighted sum of pixel values. In case of
+    morphological operations, it is the minimum or maximum values, and so on. The computed response is
+    stored in the destination image at the same location \f$(x,y)\f$. It means that the output image
+    will be of the same size as the input image. Normally, the functions support multi-channel arrays,
+    in which case every channel is processed independently. Therefore, the output image will also have
+    the same number of channels as the input one.
+
+    Another common feature of the functions and classes described in this section is that, unlike
+    simple arithmetic functions, they need to extrapolate values of some non-existing pixels. For
+    example, if you want to smooth an image using a Gaussian \f$3 \times 3\f$ filter, then, when
+    processing the left-most pixels in each row, you need pixels to the left of them, that is, outside
+    of the image. You can let these pixels be the same as the left-most image pixels ("replicated
+    border" extrapolation method), or assume that all the non-existing pixels are zeros ("constant
+    border" extrapolation method), and so on. OpenCV enables you to specify the extrapolation method.
+    For details, see #BorderTypes
+
+    @anchor filter_depths
+    ### Depth combinations
+    Input depth (src.depth()) | Output depth (ddepth)
+    --------------------------|----------------------
+    CV_8U                     | -1/CV_16S/CV_32F/CV_64F
+    CV_16U/CV_16S             | -1/CV_32F/CV_64F
+    CV_32F                    | -1/CV_32F
+    CV_64F                    | -1/CV_64F
+
+    @note when ddepth=-1, the output image will have the same depth as the source.
+
+    @note if you need double floating-point accuracy and using single floating-point input data
+    (CV_32F input and CV_64F output depth combination), you can use @ref Mat.convertTo to convert
+    the input data to the desired precision.
 
     @defgroup imgproc_transform Geometric Image Transformations
 
-The functions in this section perform various geometrical transformations of 2D images. They do not
-change the image content but deform the pixel grid and map this deformed grid to the destination
-image. In fact, to avoid sampling artifacts, the mapping is done in the reverse order, from
-destination to the source. That is, for each pixel \f$(x, y)\f$ of the destination image, the
-functions compute coordinates of the corresponding "donor" pixel in the source image and copy the
-pixel value:
-
-\f[\texttt{dst} (x,y)= \texttt{src} (f_x(x,y), f_y(x,y))\f]
-
-In case when you specify the forward mapping \f$\left<g_x, g_y\right>: \texttt{src} \rightarrow
-\texttt{dst}\f$, the OpenCV functions first compute the corresponding inverse mapping
-\f$\left<f_x, f_y\right>: \texttt{dst} \rightarrow \texttt{src}\f$ and then use the above formula.
-
-The actual implementations of the geometrical transformations, from the most generic remap and to
-the simplest and the fastest resize, need to solve two main problems with the above formula:
-
-- Extrapolation of non-existing pixels. Similarly to the filtering functions described in the
-previous section, for some \f$(x,y)\f$, either one of \f$f_x(x,y)\f$, or \f$f_y(x,y)\f$, or both
-of them may fall outside of the image. In this case, an extrapolation method needs to be used.
-OpenCV provides the same selection of extrapolation methods as in the filtering functions. In
-addition, it provides the method #BORDER_TRANSPARENT. This means that the corresponding pixels in
-the destination image will not be modified at all.
-
-- Interpolation of pixel values. Usually \f$f_x(x,y)\f$ and \f$f_y(x,y)\f$ are floating-point
-numbers. This means that \f$\left<f_x, f_y\right>\f$ can be either an affine or perspective
-transformation, or radial lens distortion correction, and so on. So, a pixel value at fractional
-coordinates needs to be retrieved. In the simplest case, the coordinates can be just rounded to the
-nearest integer coordinates and the corresponding pixel can be used. This is called a
-nearest-neighbor interpolation. However, a better result can be achieved by using more
-sophisticated [interpolation methods](http://en.wikipedia.org/wiki/Multivariate_interpolation) ,
-where a polynomial function is fit into some neighborhood of the computed pixel \f$(f_x(x,y),
-f_y(x,y))\f$, and then the value of the polynomial at \f$(f_x(x,y), f_y(x,y))\f$ is taken as the
-interpolated pixel value. In OpenCV, you can choose between several interpolation methods. See
-#resize for details.
-
-@note The geometrical transformations do not work with `CV_8S` or `CV_32S` images.
+    The functions in this section perform various geometrical transformations of 2D images. They do not
+    change the image content but deform the pixel grid and map this deformed grid to the destination
+    image. In fact, to avoid sampling artifacts, the mapping is done in the reverse order, from
+    destination to the source. That is, for each pixel \f$(x, y)\f$ of the destination image, the
+    functions compute coordinates of the corresponding "donor" pixel in the source image and copy the
+    pixel value:
+
+    \f[\texttt{dst} (x,y)= \texttt{src} (f_x(x,y), f_y(x,y))\f]
+
+    In case when you specify the forward mapping \f$\left<g_x, g_y\right>: \texttt{src} \rightarrow
+    \texttt{dst}\f$, the OpenCV functions first compute the corresponding inverse mapping
+    \f$\left<f_x, f_y\right>: \texttt{dst} \rightarrow \texttt{src}\f$ and then use the above formula.
+
+    The actual implementations of the geometrical transformations, from the most generic remap and to
+    the simplest and the fastest resize, need to solve two main problems with the above formula:
+
+    - Extrapolation of non-existing pixels. Similarly to the filtering functions described in the
+    previous section, for some \f$(x,y)\f$, either one of \f$f_x(x,y)\f$, or \f$f_y(x,y)\f$, or both
+    of them may fall outside of the image. In this case, an extrapolation method needs to be used.
+    OpenCV provides the same selection of extrapolation methods as in the filtering functions. In
+    addition, it provides the method #BORDER_TRANSPARENT. This means that the corresponding pixels in
+    the destination image will not be modified at all.
+
+    - Interpolation of pixel values. Usually \f$f_x(x,y)\f$ and \f$f_y(x,y)\f$ are floating-point
+    numbers. This means that \f$\left<f_x, f_y\right>\f$ can be either an affine or perspective
+    transformation, or radial lens distortion correction, and so on. So, a pixel value at fractional
+    coordinates needs to be retrieved. In the simplest case, the coordinates can be just rounded to the
+    nearest integer coordinates and the corresponding pixel can be used. This is called a
+    nearest-neighbor interpolation. However, a better result can be achieved by using more
+    sophisticated [interpolation methods](http://en.wikipedia.org/wiki/Multivariate_interpolation) ,
+    where a polynomial function is fit into some neighborhood of the computed pixel \f$(f_x(x,y),
+    f_y(x,y))\f$, and then the value of the polynomial at \f$(f_x(x,y), f_y(x,y))\f$ is taken as the
+    interpolated pixel value. In OpenCV, you can choose between several interpolation methods. See
+    #resize for details.
+
+    @note The geometrical transformations do not work with `CV_8S` or `CV_32S` images.
 
     @defgroup imgproc_misc Miscellaneous Image Transformations
     @defgroup imgproc_draw Drawing Functions
 
-Drawing functions work with matrices/images of arbitrary depth. The boundaries of the shapes can be
-rendered with antialiasing (implemented only for 8-bit images for now). All the functions include
-the parameter color that uses an RGB value (that may be constructed with the Scalar constructor )
-for color images and brightness for grayscale images. For color images, the channel ordering is
-normally *Blue, Green, Red*. This is what imshow, imread, and imwrite expect. So, if you form a
-color using the Scalar constructor, it should look like:
+    Drawing functions work with matrices/images of arbitrary depth. The boundaries of the shapes can be
+    rendered with antialiasing (implemented only for 8-bit images for now). All the functions include
+    the parameter color that uses an RGB value (that may be constructed with the Scalar constructor )
+    for color images and brightness for grayscale images. For color images, the channel ordering is
+    normally *Blue, Green, Red*. This is what imshow, imread, and imwrite expect. So, if you form a
+    color using the Scalar constructor, it should look like:
 
-\f[\texttt{Scalar} (blue \_ component, green \_ component, red \_ component[, alpha \_ component])\f]
+    \f[\texttt{Scalar} (blue \_ component, green \_ component, red \_ component[, alpha \_ component])\f]
 
-If you are using your own image rendering and I/O functions, you can use any channel ordering. The
-drawing functions process each channel independently and do not depend on the channel order or even
-on the used color space. The whole image can be converted from BGR to RGB or to a different color
-space using cvtColor .
+    If you are using your own image rendering and I/O functions, you can use any channel ordering. The
+    drawing functions process each channel independently and do not depend on the channel order or even
+    on the used color space. The whole image can be converted from BGR to RGB or to a different color
+    space using cvtColor .
 
-If a drawn figure is partially or completely outside the image, the drawing functions clip it. Also,
-many drawing functions can handle pixel coordinates specified with sub-pixel accuracy. This means
-that the coordinates can be passed as fixed-point numbers encoded as integers. The number of
-fractional bits is specified by the shift parameter and the real point coordinates are calculated as
-\f$\texttt{Point}(x,y)\rightarrow\texttt{Point2f}(x*2^{-shift},y*2^{-shift})\f$ . This feature is
-especially effective when rendering antialiased shapes.
+    If a drawn figure is partially or completely outside the image, the drawing functions clip it. Also,
+    many drawing functions can handle pixel coordinates specified with sub-pixel accuracy. This means
+    that the coordinates can be passed as fixed-point numbers encoded as integers. The number of
+    fractional bits is specified by the shift parameter and the real point coordinates are calculated as
+    \f$\texttt{Point}(x,y)\rightarrow\texttt{Point2f}(x*2^{-shift},y*2^{-shift})\f$ . This feature is
+    especially effective when rendering antialiased shapes.
 
-@note The functions do not support alpha-transparency when the target image is 4-channel. In this
-case, the color[3] is simply copied to the repainted pixels. Thus, if you want to paint
-semi-transparent shapes, you can paint them in a separate buffer and then blend it with the main
-image.
+    @note The functions do not support alpha-transparency when the target image is 4-channel. In this
+    case, the color[3] is simply copied to the repainted pixels. Thus, if you want to paint
+    semi-transparent shapes, you can paint them in a separate buffer and then blend it with the main
+    image.
 
     @defgroup imgproc_color_conversions Color Space Conversions
     @defgroup imgproc_colormap ColorMaps in OpenCV
 
-The human perception isn't built for observing fine changes in grayscale images. Human eyes are more
-sensitive to observing changes between colors, so you often need to recolor your grayscale images to
-get a clue about them. OpenCV now comes with various colormaps to enhance the visualization in your
-computer vision application.
+    The human perception isn't built for observing fine changes in grayscale images. Human eyes are more
+    sensitive to observing changes between colors, so you often need to recolor your grayscale images to
+    get a clue about them. OpenCV now comes with various colormaps to enhance the visualization in your
+    computer vision application.
 
-In OpenCV you only need applyColorMap to apply a colormap on a given image. The following sample
-code reads the path to an image from command line, applies a Jet colormap on it and shows the
-result:
+    In OpenCV you only need applyColorMap to apply a colormap on a given image. The following sample
+    code reads the path to an image from command line, applies a Jet colormap on it and shows the
+    result:
 
-@include snippets/imgproc_applyColorMap.cpp
+    @include snippets/imgproc_applyColorMap.cpp
 
-@see #ColormapTypes
+    @see #ColormapTypes
 
     @defgroup imgproc_subdiv2d Planar Subdivision
 
-The Subdiv2D class described in this section is used to perform various planar subdivision on
-a set of 2D points (represented as vector of Point2f). OpenCV subdivides a plane into triangles
-using the Delaunay's algorithm, which corresponds to the dual graph of the Voronoi diagram.
-In the figure below, the Delaunay's triangulation is marked with black lines and the Voronoi
-diagram with red lines.
+    The Subdiv2D class described in this section is used to perform various planar subdivision on
+    a set of 2D points (represented as vector of Point2f). OpenCV subdivides a plane into triangles
+    using the Delaunay's algorithm, which corresponds to the dual graph of the Voronoi diagram.
+    In the figure below, the Delaunay's triangulation is marked with black lines and the Voronoi
+    diagram with red lines.
 
-![Delaunay triangulation (black) and Voronoi (red)](pics/delaunay_voronoi.png)
+    ![Delaunay triangulation (black) and Voronoi (red)](pics/delaunay_voronoi.png)
 
-The subdivisions can be used for the 3D piece-wise transformation of a plane, morphing, fast
-location of points on the plane, building special graphs (such as NNG,RNG), and so forth.
+    The subdivisions can be used for the 3D piece-wise transformation of a plane, morphing, fast
+    location of points on the plane, building special graphs (such as NNG,RNG), and so forth.
 
     @defgroup imgproc_hist Histograms
     @defgroup imgproc_shape Structural Analysis and Shape Descriptors
@@ -190,7 +190,6 @@ location of points on the plane, building special graphs (such as NNG,RNG), and
     @defgroup imgproc_feature Feature Detection
     @defgroup imgproc_object Object Detection
     @defgroup imgproc_segmentation Image Segmentation
-    @defgroup imgproc_c C API
     @defgroup imgproc_hal Hardware Acceleration Layer
     @{
         @defgroup imgproc_hal_functions Functions
@@ -275,7 +274,8 @@ enum InterpolationFlags{
     - flag is __not__ set: \f$dst( \rho , \phi ) = src(x,y)\f$
     - flag is set: \f$dst(x,y) = src( \rho , \phi )\f$
     */
-    WARP_INVERSE_MAP     = 16
+    WARP_INVERSE_MAP     = 16,
+    WARP_RELATIVE_MAP    = 32
 };
 
 /** \brief Specify the polar mapping mode
@@ -641,112 +641,109 @@ enum ColorConversionCodes {
     COLOR_YUV2BGR      = 84,
     COLOR_YUV2RGB      = 85,
 
-    //! YUV 4:2:0 family to RGB
-    COLOR_YUV2RGB_NV12  = 90,
-    COLOR_YUV2BGR_NV12  = 91,
-    COLOR_YUV2RGB_NV21  = 92,
-    COLOR_YUV2BGR_NV21  = 93,
-    COLOR_YUV420sp2RGB  = COLOR_YUV2RGB_NV21,
-    COLOR_YUV420sp2BGR  = COLOR_YUV2BGR_NV21,
-
-    COLOR_YUV2RGBA_NV12 = 94,
-    COLOR_YUV2BGRA_NV12 = 95,
-    COLOR_YUV2RGBA_NV21 = 96,
-    COLOR_YUV2BGRA_NV21 = 97,
-    COLOR_YUV420sp2RGBA = COLOR_YUV2RGBA_NV21,
-    COLOR_YUV420sp2BGRA = COLOR_YUV2BGRA_NV21,
-
-    COLOR_YUV2RGB_YV12  = 98,
-    COLOR_YUV2BGR_YV12  = 99,
-    COLOR_YUV2RGB_IYUV  = 100,
-    COLOR_YUV2BGR_IYUV  = 101,
-    COLOR_YUV2RGB_I420  = COLOR_YUV2RGB_IYUV,
-    COLOR_YUV2BGR_I420  = COLOR_YUV2BGR_IYUV,
-    COLOR_YUV420p2RGB   = COLOR_YUV2RGB_YV12,
-    COLOR_YUV420p2BGR   = COLOR_YUV2BGR_YV12,
-
-    COLOR_YUV2RGBA_YV12 = 102,
-    COLOR_YUV2BGRA_YV12 = 103,
-    COLOR_YUV2RGBA_IYUV = 104,
-    COLOR_YUV2BGRA_IYUV = 105,
-    COLOR_YUV2RGBA_I420 = COLOR_YUV2RGBA_IYUV,
-    COLOR_YUV2BGRA_I420 = COLOR_YUV2BGRA_IYUV,
-    COLOR_YUV420p2RGBA  = COLOR_YUV2RGBA_YV12,
-    COLOR_YUV420p2BGRA  = COLOR_YUV2BGRA_YV12,
-
-    COLOR_YUV2GRAY_420  = 106,
-    COLOR_YUV2GRAY_NV21 = COLOR_YUV2GRAY_420,
-    COLOR_YUV2GRAY_NV12 = COLOR_YUV2GRAY_420,
-    COLOR_YUV2GRAY_YV12 = COLOR_YUV2GRAY_420,
-    COLOR_YUV2GRAY_IYUV = COLOR_YUV2GRAY_420,
-    COLOR_YUV2GRAY_I420 = COLOR_YUV2GRAY_420,
-    COLOR_YUV420sp2GRAY = COLOR_YUV2GRAY_420,
-    COLOR_YUV420p2GRAY  = COLOR_YUV2GRAY_420,
-
-    //! YUV 4:2:2 family to RGB
-    COLOR_YUV2RGB_UYVY = 107,
-    COLOR_YUV2BGR_UYVY = 108,
-    //COLOR_YUV2RGB_VYUY = 109,
-    //COLOR_YUV2BGR_VYUY = 110,
-    COLOR_YUV2RGB_Y422 = COLOR_YUV2RGB_UYVY,
-    COLOR_YUV2BGR_Y422 = COLOR_YUV2BGR_UYVY,
-    COLOR_YUV2RGB_UYNV = COLOR_YUV2RGB_UYVY,
-    COLOR_YUV2BGR_UYNV = COLOR_YUV2BGR_UYVY,
-
-    COLOR_YUV2RGBA_UYVY = 111,
-    COLOR_YUV2BGRA_UYVY = 112,
-    //COLOR_YUV2RGBA_VYUY = 113,
-    //COLOR_YUV2BGRA_VYUY = 114,
-    COLOR_YUV2RGBA_Y422 = COLOR_YUV2RGBA_UYVY,
-    COLOR_YUV2BGRA_Y422 = COLOR_YUV2BGRA_UYVY,
-    COLOR_YUV2RGBA_UYNV = COLOR_YUV2RGBA_UYVY,
-    COLOR_YUV2BGRA_UYNV = COLOR_YUV2BGRA_UYVY,
-
-    COLOR_YUV2RGB_YUY2 = 115,
-    COLOR_YUV2BGR_YUY2 = 116,
-    COLOR_YUV2RGB_YVYU = 117,
-    COLOR_YUV2BGR_YVYU = 118,
-    COLOR_YUV2RGB_YUYV = COLOR_YUV2RGB_YUY2,
-    COLOR_YUV2BGR_YUYV = COLOR_YUV2BGR_YUY2,
-    COLOR_YUV2RGB_YUNV = COLOR_YUV2RGB_YUY2,
-    COLOR_YUV2BGR_YUNV = COLOR_YUV2BGR_YUY2,
-
-    COLOR_YUV2RGBA_YUY2 = 119,
-    COLOR_YUV2BGRA_YUY2 = 120,
-    COLOR_YUV2RGBA_YVYU = 121,
-    COLOR_YUV2BGRA_YVYU = 122,
-    COLOR_YUV2RGBA_YUYV = COLOR_YUV2RGBA_YUY2,
-    COLOR_YUV2BGRA_YUYV = COLOR_YUV2BGRA_YUY2,
-    COLOR_YUV2RGBA_YUNV = COLOR_YUV2RGBA_YUY2,
-    COLOR_YUV2BGRA_YUNV = COLOR_YUV2BGRA_YUY2,
-
-    COLOR_YUV2GRAY_UYVY = 123,
-    COLOR_YUV2GRAY_YUY2 = 124,
-    //CV_YUV2GRAY_VYUY    = CV_YUV2GRAY_UYVY,
-    COLOR_YUV2GRAY_Y422 = COLOR_YUV2GRAY_UYVY,
-    COLOR_YUV2GRAY_UYNV = COLOR_YUV2GRAY_UYVY,
-    COLOR_YUV2GRAY_YVYU = COLOR_YUV2GRAY_YUY2,
-    COLOR_YUV2GRAY_YUYV = COLOR_YUV2GRAY_YUY2,
-    COLOR_YUV2GRAY_YUNV = COLOR_YUV2GRAY_YUY2,
+    COLOR_YUV2RGB_NV12  = 90, //!< convert between 4:2:0-subsampled YUV NV12 and RGB, two planes (in one or separate arrays): Y and U/V interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_NV12  = 91, //!< convert between 4:2:0-subsampled YUV NV12 and BGR, two planes (in one or separate arrays): Y and U/V interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGB_NV21  = 92, //!< convert between 4:2:0-subsampled YUV NV21 and RGB, two planes (in one or separate arrays): Y and V/U interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_NV21  = 93, //!< convert between 4:2:0-subsampled YUV NV21 and BGR, two planes (in one or separate arrays): Y and V/U interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV420sp2RGB  = COLOR_YUV2RGB_NV21, //!< synonym to NV21
+    COLOR_YUV420sp2BGR  = COLOR_YUV2BGR_NV21, //!< synonym to NV21
+
+    COLOR_YUV2RGBA_NV12 = 94, //!< convert between 4:2:0-subsampled YUV NV12 and RGBA, two planes (in one or separate arrays): Y and U/V interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_NV12 = 95, //!< convert between 4:2:0-subsampled YUV NV12 and BGRA, two planes (in one or separate arrays): Y and U/V interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGBA_NV21 = 96, //!< convert between 4:2:0-subsampled YUV NV21 and RGBA, two planes (in one or separate arrays): Y and V/U interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_NV21 = 97, //!< convert between 4:2:0-subsampled YUV NV21 and BGRA, two planes (in one or separate arrays): Y and V/U interleaved, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV420sp2RGBA = COLOR_YUV2RGBA_NV21, //!< synonym to NV21
+    COLOR_YUV420sp2BGRA = COLOR_YUV2BGRA_NV21, //!< synonym to NV21
+
+    COLOR_YUV2RGB_YV12  =  98, //!< convert between 4:2:0-subsampled YUV YV12 and RGB, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_YV12  =  99, //!< convert between 4:2:0-subsampled YUV YV12 and BGR, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGB_IYUV  = 100, //!< convert between 4:2:0-subsampled YUV IYUV and RGB, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_IYUV  = 101, //!< convert between 4:2:0-subsampled YUV IYUV and BGR, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGB_I420  = COLOR_YUV2RGB_IYUV, //!< synonym to IYUV
+    COLOR_YUV2BGR_I420  = COLOR_YUV2BGR_IYUV, //!< synonym to IYUV
+    COLOR_YUV420p2RGB   = COLOR_YUV2RGB_YV12, //!< synonym to YV12
+    COLOR_YUV420p2BGR   = COLOR_YUV2BGR_YV12, //!< synonym to YV12
+
+    COLOR_YUV2RGBA_YV12 = 102, //!< convert between 4:2:0-subsampled YUV YV12 and RGBA, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_YV12 = 103, //!< convert between 4:2:0-subsampled YUV YV12 and BGRA, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGBA_IYUV = 104, //!< convert between 4:2:0-subsampled YUV YV12 and RGBA, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_IYUV = 105, //!< convert between 4:2:0-subsampled YUV YV12 and BGRA, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGBA_I420 = COLOR_YUV2RGBA_IYUV, //!< synonym to IYUV
+    COLOR_YUV2BGRA_I420 = COLOR_YUV2BGRA_IYUV, //!< synonym to IYUV
+    COLOR_YUV420p2RGBA  = COLOR_YUV2RGBA_YV12, //!< synonym to YV12
+    COLOR_YUV420p2BGRA  = COLOR_YUV2BGRA_YV12, //!< synonym to YV12
+
+    COLOR_YUV2GRAY_420  = 106, //!< extract Y channel from YUV 4:2:0 image
+    COLOR_YUV2GRAY_NV21 = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+    COLOR_YUV2GRAY_NV12 = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+    COLOR_YUV2GRAY_YV12 = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+    COLOR_YUV2GRAY_IYUV = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+    COLOR_YUV2GRAY_I420 = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+    COLOR_YUV420sp2GRAY = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+    COLOR_YUV420p2GRAY  = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420
+
+    COLOR_YUV2RGB_UYVY = 107, //!< convert between YUV UYVY and RGB, YUV is 4:2:2-subsampled and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_UYVY = 108, //!< convert between YUV UYVY and BGR, YUV is 4:2:2-subsampled and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    //COLOR_YUV2RGB_VYUY = 109, //!< convert between YUV VYUY and RGB, YUV is 4:2:2-subsampled and interleaved as V/Y1/U/Y2, see @ref color_convert_rgb_yuv_42x
+    //COLOR_YUV2BGR_VYUY = 110, //!< convert between YUV VYUY and BGR, YUV is 4:2:2-subsampled and interleaved as V/Y1/U/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGB_Y422 = COLOR_YUV2RGB_UYVY, //!< synonym to UYVY
+    COLOR_YUV2BGR_Y422 = COLOR_YUV2BGR_UYVY, //!< synonym to UYVY
+    COLOR_YUV2RGB_UYNV = COLOR_YUV2RGB_UYVY, //!< synonym to UYVY
+    COLOR_YUV2BGR_UYNV = COLOR_YUV2BGR_UYVY, //!< synonym to UYVY
+
+    COLOR_YUV2RGBA_UYVY = 111, //!< convert between YUV UYVY and RGBA, YUV is 4:2:2-subsampled and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_UYVY = 112, //!< convert between YUV UYVY and BGRA, YUV is 4:2:2-subsampled and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    //COLOR_YUV2RGBA_VYUY = 113, //!< convert between YUV VYUY and RGBA, YUV is 4:2:2-subsampled and interleaved as V/Y1/U/Y2, see @ref color_convert_rgb_yuv_42x
+    //COLOR_YUV2BGRA_VYUY = 114, //!< convert between YUV VYUY and BGRA, YUV is 4:2:2-subsampled and interleaved as V/Y1/U/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGBA_Y422 = COLOR_YUV2RGBA_UYVY, //!< synonym to UYVY
+    COLOR_YUV2BGRA_Y422 = COLOR_YUV2BGRA_UYVY, //!< synonym to UYVY
+    COLOR_YUV2RGBA_UYNV = COLOR_YUV2RGBA_UYVY, //!< synonym to UYVY
+    COLOR_YUV2BGRA_UYNV = COLOR_YUV2BGRA_UYVY, //!< synonym to UYVY
+
+    COLOR_YUV2RGB_YUY2 = 115, //!< convert between YUV YUY2 and RGB, YUV is 4:2:2-subsampled and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_YUY2 = 116, //!< convert between YUV YUY2 and BGR, YUV is 4:2:2-subsampled and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGB_YVYU = 117, //!< convert between YUV YVYU and RGB, YUV is 4:2:2-subsampled and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGR_YVYU = 118, //!< convert between YUV YVYU and BGR, YUV is 4:2:2-subsampled and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGB_YUYV = COLOR_YUV2RGB_YUY2, //!< synonym to YUY2
+    COLOR_YUV2BGR_YUYV = COLOR_YUV2BGR_YUY2, //!< synonym to YUY2
+    COLOR_YUV2RGB_YUNV = COLOR_YUV2RGB_YUY2, //!< synonym to YUY2
+    COLOR_YUV2BGR_YUNV = COLOR_YUV2BGR_YUY2, //!< synonym to YUY2
+
+    COLOR_YUV2RGBA_YUY2 = 119, //!< convert between YUV YUY2 and RGBA, YUV is 4:2:2-subsampled and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_YUY2 = 120, //!< convert between YUV YUY2 and BGRA, YUV is 4:2:2-subsampled and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGBA_YVYU = 121, //!< convert between YUV YVYU and RGBA, YUV is 4:2:2-subsampled and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2BGRA_YVYU = 122, //!< convert between YUV YVYU and BGRA, YUV is 4:2:2-subsampled and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_YUV2RGBA_YUYV = COLOR_YUV2RGBA_YUY2, //!< synonym to YUY2
+    COLOR_YUV2BGRA_YUYV = COLOR_YUV2BGRA_YUY2, //!< synonym to YUY2
+    COLOR_YUV2RGBA_YUNV = COLOR_YUV2RGBA_YUY2, //!< synonym to YUY2
+    COLOR_YUV2BGRA_YUNV = COLOR_YUV2BGRA_YUY2, //!< synonym to YUY2
+
+    COLOR_YUV2GRAY_UYVY = 123, //!< extract Y channel from YUV 4:2:2 image
+    COLOR_YUV2GRAY_YUY2 = 124, //!< extract Y channel from YUV 4:2:2 image
+    //CV_YUV2GRAY_VYUY  = CV_YUV2GRAY_UYVY, //!< synonym to COLOR_YUV2GRAY_UYVY
+    COLOR_YUV2GRAY_Y422 = COLOR_YUV2GRAY_UYVY, //!< synonym to COLOR_YUV2GRAY_UYVY
+    COLOR_YUV2GRAY_UYNV = COLOR_YUV2GRAY_UYVY, //!< synonym to COLOR_YUV2GRAY_UYVY
+    COLOR_YUV2GRAY_YVYU = COLOR_YUV2GRAY_YUY2, //!< synonym to COLOR_YUV2GRAY_YUY2
+    COLOR_YUV2GRAY_YUYV = COLOR_YUV2GRAY_YUY2, //!< synonym to COLOR_YUV2GRAY_YUY2
+    COLOR_YUV2GRAY_YUNV = COLOR_YUV2GRAY_YUY2, //!< synonym to COLOR_YUV2GRAY_YUY2
 
     //! alpha premultiplication
     COLOR_RGBA2mRGBA    = 125,
     COLOR_mRGBA2RGBA    = 126,
 
-    //! RGB to YUV 4:2:0 family
-    COLOR_RGB2YUV_I420  = 127,
-    COLOR_BGR2YUV_I420  = 128,
-    COLOR_RGB2YUV_IYUV  = COLOR_RGB2YUV_I420,
-    COLOR_BGR2YUV_IYUV  = COLOR_BGR2YUV_I420,
-
-    COLOR_RGBA2YUV_I420 = 129,
-    COLOR_BGRA2YUV_I420 = 130,
-    COLOR_RGBA2YUV_IYUV = COLOR_RGBA2YUV_I420,
-    COLOR_BGRA2YUV_IYUV = COLOR_BGRA2YUV_I420,
-    COLOR_RGB2YUV_YV12  = 131,
-    COLOR_BGR2YUV_YV12  = 132,
-    COLOR_RGBA2YUV_YV12 = 133,
-    COLOR_BGRA2YUV_YV12 = 134,
+    COLOR_RGB2YUV_I420  = 127, //!< convert between RGB and 4:2:0-subsampled YUV I420, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGR2YUV_I420  = 128, //!< convert between BGR and 4:2:0-subsampled YUV I420, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGB2YUV_IYUV  = COLOR_RGB2YUV_I420, //!< synonym to I420
+    COLOR_BGR2YUV_IYUV  = COLOR_BGR2YUV_I420, //!< synonym to I420
+
+    COLOR_RGBA2YUV_I420 = 129, //!< convert between RGBA and 4:2:0-subsampled YUV I420, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGRA2YUV_I420 = 130, //!< convert between BGRA and 4:2:0-subsampled YUV I420, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGBA2YUV_IYUV = COLOR_RGBA2YUV_I420, //!< synonym to I420
+    COLOR_BGRA2YUV_IYUV = COLOR_BGRA2YUV_I420, //!< synonym to I420
+    COLOR_RGB2YUV_YV12  = 131, //!< convert between RGB and 4:2:0-subsampled YUV YV12, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGR2YUV_YV12  = 132, //!< convert between BGR and 4:2:0-subsampled YUV YV12, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGBA2YUV_YV12 = 133, //!< convert between RGBA and 4:2:0-subsampled YUV YV12, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGRA2YUV_YV12 = 134, //!< convert between BGRA and 4:2:0-subsampled YUV YV12, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x
 
     //! Demosaicing, see @ref color_convert_bayer "color conversions" for additional information
     COLOR_BayerBG2BGR = 46, //!< equivalent to RGGB Bayer pattern
@@ -842,7 +839,39 @@ enum ColorConversionCodes {
     COLOR_BayerRG2RGBA = COLOR_BayerBG2BGRA, //!< equivalent to BGGR Bayer pattern
     COLOR_BayerGR2RGBA = COLOR_BayerGB2BGRA, //!< equivalent to GBRG Bayer pattern
 
-    COLOR_COLORCVT_MAX  = 143
+    COLOR_RGB2YUV_UYVY = 143, //!< convert between RGB and YUV UYVU, YUV is 4:2:2 and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGR2YUV_UYVY = 144, //!< convert between BGR and YUV UYVU, YUV is 4:2:2 and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGB2YUV_Y422 = COLOR_RGB2YUV_UYVY, //!< synonym to UYVY
+    COLOR_BGR2YUV_Y422 = COLOR_BGR2YUV_UYVY, //!< synonym to UYVY
+    COLOR_RGB2YUV_UYNV = COLOR_RGB2YUV_UYVY, //!< synonym to UYVY
+    COLOR_BGR2YUV_UYNV = COLOR_BGR2YUV_UYVY, //!< synonym to UYVY
+
+    COLOR_RGBA2YUV_UYVY = 145, //!< convert between RGBA and YUV UYVU, YUV is 4:2:2 and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGRA2YUV_UYVY = 146, //!< convert between BGRA and YUV UYVU, YUV is 4:2:2 and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGBA2YUV_Y422 = COLOR_RGBA2YUV_UYVY, //!< synonym to UYVY
+    COLOR_BGRA2YUV_Y422 = COLOR_BGRA2YUV_UYVY, //!< synonym to UYVY
+    COLOR_RGBA2YUV_UYNV = COLOR_RGBA2YUV_UYVY, //!< synonym to UYVY
+    COLOR_BGRA2YUV_UYNV = COLOR_BGRA2YUV_UYVY, //!< synonym to UYVY
+
+    COLOR_RGB2YUV_YUY2 = 147, //!< convert between RGB and YUV YUY2, YUV is 4:2:2 and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGR2YUV_YUY2 = 148, //!< convert between BGR and YUV YUY2, YUV is 4:2:2 and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGB2YUV_YVYU = 149, //!< convert between RGB and YUV YVYU, YUV is 4:2:2 and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGR2YUV_YVYU = 150, //!< convert between BGR and YUV YVYU, YUV is 4:2:2 and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGB2YUV_YUYV = COLOR_RGB2YUV_YUY2, //!< synonym to YUY2
+    COLOR_BGR2YUV_YUYV = COLOR_BGR2YUV_YUY2, //!< synonym to YUY2
+    COLOR_RGB2YUV_YUNV = COLOR_RGB2YUV_YUY2, //!< synonym to YUY2
+    COLOR_BGR2YUV_YUNV = COLOR_BGR2YUV_YUY2, //!< synonym to YUY2
+
+    COLOR_RGBA2YUV_YUY2 = 151, //!< convert between RGBA and YUV YUY2, YUV is 4:2:2 and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGRA2YUV_YUY2 = 152, //!< convert between BGRA and YUV YUY2, YUV is 4:2:2 and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGBA2YUV_YVYU = 153, //!< convert between RGBA and YUV YVYU, YUV is 4:2:2 and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_BGRA2YUV_YVYU = 154, //!< convert between BGRA and YUV YVYU, YUV is 4:2:2 and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x
+    COLOR_RGBA2YUV_YUYV = COLOR_RGBA2YUV_YUY2, //!< synonym to YUY2
+    COLOR_BGRA2YUV_YUYV = COLOR_BGRA2YUV_YUY2, //!< synonym to YUY2
+    COLOR_RGBA2YUV_YUNV = COLOR_RGBA2YUV_YUY2, //!< synonym to YUY2
+    COLOR_BGRA2YUV_YUNV = COLOR_BGRA2YUV_YUY2, //!< synonym to YUY2
+
+    COLOR_COLORCVT_MAX  = 155
 };
 
 //! @addtogroup imgproc_shape
@@ -2227,7 +2256,7 @@ too large, some circles may be missed.
 @param param1 First method-specific parameter. In case of #HOUGH_GRADIENT and #HOUGH_GRADIENT_ALT,
 it is the higher threshold of the two passed to the Canny edge detector (the lower one is twice smaller).
 Note that #HOUGH_GRADIENT_ALT uses #Scharr algorithm to compute image derivatives, so the threshold value
-shough normally be higher, such as 300 or normally exposed and contrasty images.
+should normally be higher, such as 300 or normally exposed and contrasty images.
 @param param2 Second method-specific parameter. In case of #HOUGH_GRADIENT, it is the
 accumulator threshold for the circle centers at the detection stage. The smaller it is, the more
 false circles may be detected. Circles, corresponding to the larger accumulator values, will be
@@ -2456,6 +2485,7 @@ CV_EXPORTS_W void warpPerspective( InputArray src, OutputArray dst,
 The function remap transforms the source image using the specified map:
 
 \f[\texttt{dst} (x,y) =  \texttt{src} (map_x(x,y),map_y(x,y))\f]
+\f[\texttt{dst} (x,y) =  \texttt{src} (x+map_x(x,y),y+map_y(x,y))\f] with WARP_RELATIVE_MAP
 
 where values of pixels with non-integer coordinates are computed using one of available
 interpolation methods. \f$map_x\f$ and \f$map_y\f$ can be encoded as separate floating-point maps
@@ -2475,7 +2505,9 @@ representation to fixed-point for speed.
 @param map2 The second map of y values having the type CV_16UC1, CV_32FC1, or none (empty map
 if map1 is (x,y) points), respectively.
 @param interpolation Interpolation method (see #InterpolationFlags). The methods #INTER_AREA
-and #INTER_LINEAR_EXACT are not supported by this function.
+#INTER_LINEAR_EXACT and #INTER_NEAREST_EXACT are not supported by this function.
+The extra flag WARP_RELATIVE_MAP that can be ORed to the interpolation method
+(e.g. INTER_LINEAR | WARP_RELATIVE_MAP)
 @param borderMode Pixel extrapolation method (see #BorderTypes). When
 borderMode=#BORDER_TRANSPARENT, it means that the pixels in the destination image that
 corresponds to the "outliers" in the source image are not modified by the function.
@@ -3247,7 +3279,7 @@ images[0].channels() + images[1].channels()-1, and so on.
 size and depth as images[0] .
 @param ranges Array of arrays of the histogram bin boundaries in each dimension. See #calcHist .
 @param scale Optional scale factor for the output back projection.
-@param uniform Flag indicating whether the histogram is uniform or not (see above).
+@param uniform Flag indicating whether the histogram is uniform or not (see #calcHist).
 
 @sa calcHist, compareHist
  */
@@ -3699,10 +3731,10 @@ stored in two planes.
 
 This function only supports YUV420 to RGB conversion as of now.
 
-@param src1: 8-bit image (#CV_8U) of the Y plane.
-@param src2: image containing interleaved U/V plane.
-@param dst: output image.
-@param code: Specifies the type of conversion. It can take any of the following values:
+@param src1 8-bit image (#CV_8U) of the Y plane.
+@param src2 image containing interleaved U/V plane.
+@param dst output image.
+@param code Specifies the type of conversion. It can take any of the following values:
 - #COLOR_YUV2BGR_NV12
 - #COLOR_YUV2RGB_NV12
 - #COLOR_YUV2BGRA_NV12
@@ -3756,8 +3788,8 @@ CV_EXPORTS_W void demosaicing(InputArray src, OutputArray dst, int code, int dst
 The function computes moments, up to the 3rd order, of a vector shape or a rasterized shape. The
 results are returned in the structure cv::Moments.
 
-@param array Raster image (single-channel, 8-bit or floating-point 2D array) or an array (
-\f$1 \times N\f$ or \f$N \times 1\f$ ) of 2D points (Point or Point2f ).
+@param array Single chanel raster image (CV_8U, CV_16U, CV_16S, CV_32F, CV_64F) or an array (
+\f$1 \times N\f$ or \f$N \times 1\f$ ) of 2D points (Point or Point2f).
 @param binaryImage If it is true, all non-zero image pixels are treated as 1's. The parameter is
 used for images only.
 @returns moments.
@@ -3993,15 +4025,18 @@ CV_EXPORTS_W void findContours( InputArray image, OutputArrayOfArrays contours,
 CV_EXPORTS void findContours( InputArray image, OutputArrayOfArrays contours,
                               int mode, int method, Point offset = Point());
 
-/** @example samples/cpp/squares.cpp
-A program using pyramid scaling, Canny, contours and contour simplification to find
-squares in a list of images (pic1-6.png). Returns sequence of squares detected on the image.
-*/
+//! @brief Find contours using link runs algorithm
+//!
+//! This function implements an algorithm different from cv::findContours:
+//! - doesn't allocate temporary image internally, thus it has reduced memory consumption
+//! - supports CV_8UC1 images only
+//! - outputs 2-level hierarhy only (RETR_CCOMP mode)
+//! - doesn't support approximation change other than CHAIN_APPROX_SIMPLE
+//! In all other aspects this function is compatible with cv::findContours.
+CV_EXPORTS_W void findContoursLinkRuns(InputArray image, OutputArrayOfArrays contours, OutputArray hierarchy);
 
-/** @example samples/tapi/squares.cpp
-A program using pyramid scaling, Canny, contours and contour simplification to find
-squares in the input image.
-*/
+//! @overload
+CV_EXPORTS_W void findContoursLinkRuns(InputArray image, OutputArrayOfArrays contours);
 
 /** @brief Approximates a polygonal curve(s) with the specified precision.
 
@@ -4436,7 +4471,7 @@ An example using applyColorMap function
 
 /** @brief Applies a GNU Octave/MATLAB equivalent colormap on a given image.
 
-@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3.
+@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. If CV_8UC3, then the CV_8UC1 image is generated internally using cv::COLOR_BGR2GRAY.
 @param dst The result is the colormapped source image. Note: Mat::create is called on dst.
 @param colormap The colormap to apply, see #ColormapTypes
 */
@@ -4444,8 +4479,8 @@ CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, int colormap);
 
 /** @brief Applies a user colormap on a given image.
 
-@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3.
-@param dst The result is the colormapped source image. Note: Mat::create is called on dst.
+@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. If CV_8UC3, then the CV_8UC1 image is generated internally using cv::COLOR_BGR2GRAY.
+@param dst The result is the colormapped source image of the same number of channels as userColor. Note: Mat::create is called on dst.
 @param userColor The colormap to apply of type CV_8UC1 or CV_8UC3 and size 256
 */
 CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, InputArray userColor);
diff --git a/modules/imgproc/include/opencv2/imgproc/detail/legacy.hpp b/modules/imgproc/include/opencv2/imgproc/detail/legacy.hpp
new file mode 100644
index 000000000000..029d9c90e83a
--- /dev/null
+++ b/modules/imgproc/include/opencv2/imgproc/detail/legacy.hpp
@@ -0,0 +1,38 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_IMGPROC_DETAIL_LEGACY_HPP
+#define OPENCV_IMGPROC_DETAIL_LEGACY_HPP
+
+#include "opencv2/imgproc.hpp"
+
+namespace cv {
+
+#ifdef __OPENCV_BUILD
+
+CV_EXPORTS void findContours_legacy(InputArray _image,
+                                    OutputArrayOfArrays _contours,
+                                    OutputArray _hierarchy,
+                                    int mode,
+                                    int method,
+                                    Point offset = Point());
+CV_EXPORTS void findContours_legacy(InputArray image,
+                                    OutputArrayOfArrays contours,
+                                    int mode,
+                                    int method,
+                                    Point offset = Point());
+
+CV_EXPORTS float EMD_legacy( InputArray _signature1, InputArray _signature2,
+               int distType, InputArray _cost,
+               float* lowerBound, OutputArray _flow );
+
+CV_EXPORTS float wrapperEMD_legacy(InputArray _signature1, InputArray _signature2,
+               int distType, InputArray _cost,
+               Ptr<float> lowerBound, OutputArray _flow);
+
+#endif
+
+}  // namespace cv
+
+#endif  // OPENCV_IMGPROC_DETAIL_LEGACY_HPP
diff --git a/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp b/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp
index f129012ba69a..48851ece073c 100644
--- a/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp
+++ b/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp
@@ -224,6 +224,11 @@ CV_EXPORTS void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
                                     int width, int height,
                                     int dcn, bool swapBlue, int uIdx, int ycn);
 
+CV_EXPORTS void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step,
+                                    uchar * dst_data, size_t dst_step,
+                                    int width, int height,
+                                    int scn, bool swapBlue, int uIdx, int ycn);
+
 CV_EXPORTS void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step,
                                         uchar * dst_data, size_t dst_step,
                                         int width, int height);
diff --git a/modules/imgproc/include/opencv2/imgproc/imgproc_c.h b/modules/imgproc/include/opencv2/imgproc/imgproc_c.h
index 86dc119fdd37..e97b802e6919 100644
--- a/modules/imgproc/include/opencv2/imgproc/imgproc_c.h
+++ b/modules/imgproc/include/opencv2/imgproc/imgproc_c.h
@@ -209,6 +209,10 @@ CVAPI(void)  cvCvtColor( const CvArr* src, CvArr* dst, int code );
 CVAPI(void)  cvResize( const CvArr* src, CvArr* dst,
                        int interpolation CV_DEFAULT( CV_INTER_LINEAR ));
 
+#ifdef _MSC_VER
+#pragma warning( push )
+#pragma warning( disable: 5054 )
+#endif
 /** @brief Warps image with affine transform
 @note ::cvGetQuadrangleSubPix is similar to ::cvWarpAffine, but the outliers are extrapolated using
 replication border mode.
@@ -273,6 +277,10 @@ CVAPI(void)  cvLinearPolar( const CvArr* src, CvArr* dst,
                          CvPoint2D32f center, double maxRadius,
                          int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS));
 
+#ifdef _MSC_VER
+#pragma warning( pop )
+#endif
+
 /** @brief Returns a structuring element of the specified size and shape for morphological operations.
 
 @note the created structuring element IplConvKernel\* element must be released in the end using
diff --git a/modules/imgproc/include/opencv2/imgproc/types_c.h b/modules/imgproc/include/opencv2/imgproc/types_c.h
index d3e55f576f6e..255ed0c37f65 100644
--- a/modules/imgproc/include/opencv2/imgproc/types_c.h
+++ b/modules/imgproc/include/opencv2/imgproc/types_c.h
@@ -376,8 +376,9 @@ enum
 /** ... and other image warping flags */
 enum
 {
-    CV_WARP_FILL_OUTLIERS =8,
-    CV_WARP_INVERSE_MAP  =16
+    CV_WARP_FILL_OUTLIERS = 8,
+    CV_WARP_INVERSE_MAP   = 16,
+    CV_WARP_RELATIVE_MAP  = 32
 };
 
 /** Shapes of a structuring element for morphological operations
diff --git a/modules/imgproc/misc/java/test/ImgprocTest.java b/modules/imgproc/misc/java/test/ImgprocTest.java
index 3b30d4eddb63..c9c5753da7f0 100644
--- a/modules/imgproc/misc/java/test/ImgprocTest.java
+++ b/modules/imgproc/misc/java/test/ImgprocTest.java
@@ -639,7 +639,7 @@ public void testDistanceTransformWithLabels() {
         Imgproc.distanceTransformWithLabels(gray128, dst, labels, Imgproc.DIST_L2, 3);
 
         assertMatEqual(dstLables, labels);
-        assertMatEqual(getMat(CvType.CV_32FC1, 8192), dst, EPS);
+        assertMatEqual(getMat(CvType.CV_32FC1, 65533.805), dst, EPS);
     }
 
     public void testDrawContoursMatListOfMatIntScalar() {
diff --git a/modules/imgproc/perf/opencl/perf_color.cpp b/modules/imgproc/perf/opencl/perf_color.cpp
index 9525e1f275ae..dce588879b66 100644
--- a/modules/imgproc/perf/opencl/perf_color.cpp
+++ b/modules/imgproc/perf/opencl/perf_color.cpp
@@ -58,7 +58,7 @@ CV_ENUM(ConversionTypes, COLOR_RGB2GRAY, COLOR_RGB2BGR, COLOR_RGB2YUV, COLOR_YUV
         COLOR_YCrCb2RGB, COLOR_RGB2XYZ, COLOR_XYZ2RGB, COLOR_RGB2HSV, COLOR_HSV2RGB, COLOR_RGB2HLS,
         COLOR_HLS2RGB, COLOR_BGR5652BGR, COLOR_BGR2BGR565, COLOR_RGBA2mRGBA, COLOR_mRGBA2RGBA,
         COLOR_RGB2Lab, COLOR_Lab2BGR, COLOR_RGB2Luv, COLOR_Luv2LBGR, COLOR_YUV2RGB_NV12, COLOR_YUV2RGB_IYUV,
-        COLOR_YUV2GRAY_420, COLOR_RGB2YUV_IYUV, COLOR_YUV2RGB_YUY2, COLOR_YUV2GRAY_YUY2)
+        COLOR_YUV2GRAY_420, COLOR_RGB2YUV_IYUV, COLOR_YUV2RGB_YUY2, COLOR_RGB2YUV_YUY2, COLOR_YUV2GRAY_YUY2)
 
 typedef tuple<Size, tuple<ConversionTypes, int, int> > CvtColorParams;
 typedef TestBaseWithParam<CvtColorParams> CvtColorFixture;
@@ -91,6 +91,7 @@ OCL_PERF_TEST_P(CvtColorFixture, CvtColor, testing::Combine(
                     make_tuple(ConversionTypes(COLOR_YUV2GRAY_420), 1, 1),
                     make_tuple(ConversionTypes(COLOR_RGB2YUV_IYUV), 3, 1),
                     make_tuple(ConversionTypes(COLOR_YUV2RGB_YUY2), 2, 3),
+                    make_tuple(ConversionTypes(COLOR_RGB2YUV_YUY2), 3, 2),
                     make_tuple(ConversionTypes(COLOR_YUV2GRAY_YUY2), 2, 1)
                     )))
 {
diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp
index ab169ecfca0f..5915b507ceb0 100644
--- a/modules/imgproc/perf/perf_cvt_color.cpp
+++ b/modules/imgproc/perf/perf_cvt_color.cpp
@@ -178,7 +178,9 @@ CV_ENUM(CvtModeBayer,
 CV_ENUM(CvtMode2, COLOR_YUV2BGR_NV12, COLOR_YUV2BGRA_NV12, COLOR_YUV2RGB_NV12, COLOR_YUV2RGBA_NV12, COLOR_YUV2BGR_NV21, COLOR_YUV2BGRA_NV21, COLOR_YUV2RGB_NV21, COLOR_YUV2RGBA_NV21,
                   COLOR_YUV2BGR_YV12, COLOR_YUV2BGRA_YV12, COLOR_YUV2RGB_YV12, COLOR_YUV2RGBA_YV12, COLOR_YUV2BGR_IYUV, COLOR_YUV2BGRA_IYUV, COLOR_YUV2RGB_IYUV, COLOR_YUV2RGBA_IYUV,
                   COLOR_YUV2GRAY_420, COLOR_YUV2RGB_UYVY, COLOR_YUV2BGR_UYVY, COLOR_YUV2RGBA_UYVY, COLOR_YUV2BGRA_UYVY, COLOR_YUV2RGB_YUY2, COLOR_YUV2BGR_YUY2, COLOR_YUV2RGB_YVYU,
-                  COLOR_YUV2BGR_YVYU, COLOR_YUV2RGBA_YUY2, COLOR_YUV2BGRA_YUY2, COLOR_YUV2RGBA_YVYU, COLOR_YUV2BGRA_YVYU)
+                  COLOR_YUV2BGR_YVYU, COLOR_YUV2RGBA_YUY2, COLOR_YUV2BGRA_YUY2, COLOR_YUV2RGBA_YVYU, COLOR_YUV2BGRA_YVYU,
+                  COLOR_RGB2YUV_UYVY, COLOR_BGR2YUV_UYVY, COLOR_RGBA2YUV_UYVY, COLOR_BGRA2YUV_UYVY, COLOR_RGB2YUV_YUY2, COLOR_BGR2YUV_YUY2, COLOR_RGB2YUV_YVYU,
+                  COLOR_BGR2YUV_YVYU, COLOR_RGBA2YUV_YUY2, COLOR_BGRA2YUV_YUY2, COLOR_RGBA2YUV_YVYU, COLOR_BGRA2YUV_YVYU)
 
 CV_ENUM(CvtMode3, COLOR_RGB2YUV_IYUV, COLOR_BGR2YUV_IYUV, COLOR_RGBA2YUV_IYUV, COLOR_BGRA2YUV_IYUV,
                   COLOR_RGB2YUV_YV12, COLOR_BGR2YUV_YV12, COLOR_RGBA2YUV_YV12, COLOR_BGRA2YUV_YV12)
@@ -225,12 +227,20 @@ static ChPair getConversionInfo(int cvtMode)
     case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2:
     case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU:
         return ChPair(2,3);
+    case COLOR_RGB2YUV_UYVY: case COLOR_BGR2YUV_UYVY:
+    case COLOR_RGB2YUV_YUY2: case COLOR_BGR2YUV_YUY2:
+    case COLOR_RGB2YUV_YVYU: case COLOR_BGR2YUV_YVYU:
+        return ChPair(3,2);
     case COLOR_BGR5552BGRA: case COLOR_BGR5552RGBA:
     case COLOR_BGR5652BGRA: case COLOR_BGR5652RGBA:
     case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
     case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2:
     case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
         return ChPair(2,4);
+    case COLOR_RGBA2YUV_UYVY: case COLOR_BGRA2YUV_UYVY:
+    case COLOR_RGBA2YUV_YUY2: case COLOR_BGRA2YUV_YUY2:
+    case COLOR_RGBA2YUV_YVYU: case COLOR_BGRA2YUV_YVYU:
+        return ChPair(4,2);
     case COLOR_BGR2GRAY: case COLOR_RGB2GRAY:
     case COLOR_RGB2YUV_IYUV: case COLOR_RGB2YUV_YV12:
     case COLOR_BGR2YUV_IYUV: case COLOR_BGR2YUV_YV12:
diff --git a/modules/imgproc/perf/perf_integral.cpp b/modules/imgproc/perf/perf_integral.cpp
index 2b1ab381e7ed..0a4fc4932981 100644
--- a/modules/imgproc/perf/perf_integral.cpp
+++ b/modules/imgproc/perf/perf_integral.cpp
@@ -13,7 +13,7 @@ enum PerfSqMatDepth{
     DEPTH_32F_64F,
     DEPTH_64F_64F};
 
-CV_ENUM(IntegralOutputDepths, DEPTH_32S_32S, DEPTH_32S_32F, DEPTH_32S_64F, DEPTH_32F_32F, DEPTH_32F_64F, DEPTH_64F_64F);
+CV_ENUM(IntegralOutputDepths, DEPTH_32S_32S, DEPTH_32S_32F, DEPTH_32S_64F, DEPTH_32F_32F, DEPTH_32F_64F, DEPTH_64F_64F)
 
 static int extraOutputDepths[6][2] = {{CV_32S, CV_32S}, {CV_32S, CV_32F}, {CV_32S, CV_64F}, {CV_32F, CV_32F}, {CV_32F, CV_64F}, {CV_64F, CV_64F}};
 
diff --git a/modules/imgproc/perf/perf_moments.cpp b/modules/imgproc/perf/perf_moments.cpp
index 7f52e70b7690..5d9c0366a174 100644
--- a/modules/imgproc/perf/perf_moments.cpp
+++ b/modules/imgproc/perf/perf_moments.cpp
@@ -35,7 +35,7 @@ PERF_TEST_P(MomentsFixture_val, Moments1,
     mat += 1;
 
 
-    SANITY_CHECK_MOMENTS(m, 2e-4, ERROR_RELATIVE);
+    SANITY_CHECK_MOMENTS(m, 3.3e-4, ERROR_RELATIVE);
 }
 
 } // namespace
diff --git a/modules/imgproc/perf/perf_resize.cpp b/modules/imgproc/perf/perf_resize.cpp
index a0ea0804cc87..0f470a5f8156 100644
--- a/modules/imgproc/perf/perf_resize.cpp
+++ b/modules/imgproc/perf/perf_resize.cpp
@@ -202,9 +202,9 @@ typedef TestBaseWithParam<tuple<MatType, Size, double> > MatInfo_Size_Scale_Area
 
 PERF_TEST_P(MatInfo_Size_Scale_Area, ResizeArea,
             testing::Combine(
-                testing::Values(CV_8UC1, CV_8UC4),
-                testing::Values(szVGA, szqHD, sz720p),
-                testing::Values(2.4, 3.4, 1.3)
+                testing::Values(CV_8UC1, CV_8UC3, CV_8UC4),
+                testing::Values(szVGA, szqHD, sz720p, sz1080p, sz2160p),
+                testing::Values(0.1, 0.25, 0.81)
                 )
             )
 {
diff --git a/modules/imgproc/perf/perf_warp.cpp b/modules/imgproc/perf/perf_warp.cpp
index 4e7de8d2fecd..3716e663f96d 100644
--- a/modules/imgproc/perf/perf_warp.cpp
+++ b/modules/imgproc/perf/perf_warp.cpp
@@ -9,14 +9,15 @@ enum{HALF_SIZE=0, UPSIDE_DOWN, REFLECTION_X, REFLECTION_BOTH};
 
 CV_ENUM(BorderMode, BORDER_CONSTANT, BORDER_REPLICATE)
 CV_ENUM(InterType, INTER_NEAREST, INTER_LINEAR)
+CV_ENUM(InterTypeExtended, INTER_NEAREST, INTER_LINEAR, WARP_RELATIVE_MAP)
 CV_ENUM(RemapMode, HALF_SIZE, UPSIDE_DOWN, REFLECTION_X, REFLECTION_BOTH)
 
 typedef TestBaseWithParam< tuple<Size, InterType, BorderMode> > TestWarpAffine;
 typedef TestBaseWithParam< tuple<Size, InterType, BorderMode> > TestWarpPerspective;
 typedef TestBaseWithParam< tuple<Size, InterType, BorderMode, MatType> > TestWarpPerspectiveNear_t;
-typedef TestBaseWithParam< tuple<MatType, Size, InterType, BorderMode, RemapMode> > TestRemap;
+typedef TestBaseWithParam< tuple<MatType, Size, InterTypeExtended, BorderMode, RemapMode> > TestRemap;
 
-void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode );
+void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode, bool relative = false );
 
 PERF_TEST_P( TestWarpAffine, WarpAffine,
              Combine(
@@ -204,7 +205,7 @@ PERF_TEST_P( TestRemap, remap,
              Combine(
                  Values( CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 ),
                  Values( szVGA, sz1080p ),
-                 InterType::all(),
+                 InterTypeExtended::all(),
                  BorderMode::all(),
                  RemapMode::all()
                  )
@@ -224,7 +225,7 @@ PERF_TEST_P( TestRemap, remap,
 
     declare.in(source, WARMUP_RNG);
 
-    update_map(source, map_x, map_y, remapMode);
+    update_map(source, map_x, map_y, remapMode, ((interpolationType & WARP_RELATIVE_MAP) != 0));
 
     TEST_CYCLE()
     {
@@ -234,7 +235,7 @@ PERF_TEST_P( TestRemap, remap,
     SANITY_CHECK_NOTHING();
 }
 
-void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode )
+void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode, bool relative )
 {
     for( int j = 0; j < src.rows; j++ )
     {
@@ -267,6 +268,12 @@ void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode )
                 map_y.at<float>(j,i) = static_cast<float>(src.rows - j) ;
                 break;
             } // end of switch
+
+            if( relative )
+            {
+                map_x.at<float>(j,i) -= static_cast<float>(i);
+                map_y.at<float>(j,i) -= static_cast<float>(j);
+            }
         }
     }
 }
diff --git a/modules/imgproc/src/accum.dispatch.cpp b/modules/imgproc/src/accum.dispatch.cpp
index 8bbf37cc4acc..4d2e04493347 100644
--- a/modules/imgproc/src/accum.dispatch.cpp
+++ b/modules/imgproc/src/accum.dispatch.cpp
@@ -17,4 +17,4 @@ DEF_ACC_FLT_FUNCS(32f, float, float)
 DEF_ACC_FLT_FUNCS(32f64f, float, double)
 DEF_ACC_FLT_FUNCS(64f, double, double)
 
-} //cv::hal
\ No newline at end of file
+} //cv::hal
diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp
index 6b0e6d6fbe73..13363026135d 100644
--- a/modules/imgproc/src/accum.simd.hpp
+++ b/modules/imgproc/src/accum.simd.hpp
@@ -139,7 +139,7 @@ void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int
     }
 #if CV_AVX && !CV_AVX2
     _mm256_zeroupper();
-#elif CV_SIMD
+#elif (CV_SIMD || CV_SIMD_SCALABLE)
     vx_cleanup();
 #endif
 }
@@ -187,7 +187,7 @@ accSqr_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, int
     }
 #if CV_AVX && !CV_AVX2
     _mm256_zeroupper();
-#elif CV_SIMD
+#elif (CV_SIMD || CV_SIMD_SCALABLE)
     vx_cleanup();
 #endif
 }
@@ -236,7 +236,7 @@ accProd_general_( const T* src1, const T* src2, AT* dst, const uchar* mask, int
     }
 #if CV_AVX && !CV_AVX2
     _mm256_zeroupper();
-#elif CV_SIMD
+#elif (CV_SIMD || CV_SIMD_SCALABLE)
     vx_cleanup();
 #endif
 }
@@ -285,16 +285,16 @@ accW_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, double
     }
 #if CV_AVX && !CV_AVX2
     _mm256_zeroupper();
-#elif CV_SIMD
+#elif (CV_SIMD || CV_SIMD_SCALABLE)
     vx_cleanup();
 #endif
 }
 void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD
-    const int cVectorWidth = v_uint8::nlanes;
-    const int step = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int cVectorWidth = VTraits<v_uint8>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();
 
     if (!mask)
     {
@@ -309,10 +309,10 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
             v_expand(v_src0, v_src00, v_src01);
             v_expand(v_src1, v_src10, v_src11);
 
-            v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
-            v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
-            v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
-            v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+            v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00))));
+            v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01))));
+            v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10))));
+            v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11))));
         }
     }
     else
@@ -323,9 +323,9 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
             for ( ; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint8 v_mask = vx_load(mask + x);
-                v_mask = ~(v_0 == v_mask);
+                v_mask = v_not(v_eq(v_0, v_mask));
                 v_uint8 v_src = vx_load(src + x);
-                v_src = v_src & v_mask;
+                v_src = v_and(v_src, v_mask);
                 v_uint16 v_src0, v_src1;
                 v_expand(v_src, v_src0, v_src1);
 
@@ -333,10 +333,10 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
                 v_expand(v_src0, v_src00, v_src01);
                 v_expand(v_src1, v_src10, v_src11);
 
-                v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
-                v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
-                v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
-                v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+                v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00))));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01))));
+                v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10))));
+                v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11))));
             }
         }
         else if (cn == 3)
@@ -344,12 +344,12 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
             for ( ; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint8 v_mask = vx_load(mask + x);
-                v_mask = ~(v_0 == v_mask);
+                v_mask = v_not(v_eq(v_0, v_mask));
                 v_uint8 v_src0, v_src1, v_src2;
                 v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2);
-                v_src0 = v_src0 & v_mask;
-                v_src1 = v_src1 & v_mask;
-                v_src2 = v_src2 & v_mask;
+                v_src0 = v_and(v_src0, v_mask);
+                v_src1 = v_and(v_src1, v_mask);
+                v_src2 = v_and(v_src2, v_mask);
                 v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                 v_expand(v_src0, v_src00, v_src01);
                 v_expand(v_src1, v_src10, v_src11);
@@ -373,18 +373,18 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
                 v_load_deinterleave(dst + ((x + step * 2) * cn), v_dst010, v_dst110, v_dst210);
                 v_load_deinterleave(dst + ((x + step * 3) * cn), v_dst011, v_dst111, v_dst211);
 
-                v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000));
-                v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100));
-                v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200));
-                v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001));
-                v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101));
-                v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201));
-                v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010));
-                v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110));
-                v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210));
-                v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011));
-                v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111));
-                v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211));
+                v_dst000 = v_add(v_dst000, v_cvt_f32(v_reinterpret_as_s32(v_src000)));
+                v_dst100 = v_add(v_dst100, v_cvt_f32(v_reinterpret_as_s32(v_src100)));
+                v_dst200 = v_add(v_dst200, v_cvt_f32(v_reinterpret_as_s32(v_src200)));
+                v_dst001 = v_add(v_dst001, v_cvt_f32(v_reinterpret_as_s32(v_src001)));
+                v_dst101 = v_add(v_dst101, v_cvt_f32(v_reinterpret_as_s32(v_src101)));
+                v_dst201 = v_add(v_dst201, v_cvt_f32(v_reinterpret_as_s32(v_src201)));
+                v_dst010 = v_add(v_dst010, v_cvt_f32(v_reinterpret_as_s32(v_src010)));
+                v_dst110 = v_add(v_dst110, v_cvt_f32(v_reinterpret_as_s32(v_src110)));
+                v_dst210 = v_add(v_dst210, v_cvt_f32(v_reinterpret_as_s32(v_src210)));
+                v_dst011 = v_add(v_dst011, v_cvt_f32(v_reinterpret_as_s32(v_src011)));
+                v_dst111 = v_add(v_dst111, v_cvt_f32(v_reinterpret_as_s32(v_src111)));
+                v_dst211 = v_add(v_dst211, v_cvt_f32(v_reinterpret_as_s32(v_src211)));
 
                 v_store_interleave(dst + (x * cn), v_dst000, v_dst100, v_dst200);
                 v_store_interleave(dst + ((x + step) * cn), v_dst001, v_dst101, v_dst201);
@@ -400,9 +400,9 @@ void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
 void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();
 
     if (!mask)
     {
@@ -413,8 +413,8 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn
             v_uint32 v_src0, v_src1;
             v_expand(v_src, v_src0, v_src1);
 
-            v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
-            v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
+            v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src0))));
+            v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src1))));
         }
     }
     else
@@ -425,14 +425,14 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn
             for ( ; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_uint16 v_src = vx_load(src + x);
-                v_src = v_src & v_mask;
+                v_src = v_and(v_src, v_mask);
                 v_uint32 v_src0, v_src1;
                 v_expand(v_src, v_src0, v_src1);
 
-                v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0)));
-                v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src1)));
+                v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src0))));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src1))));
             }
         }
         else if (cn == 3)
@@ -441,12 +441,12 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn
             for ( ; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_uint16 v_src0, v_src1, v_src2;
                 v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                v_src0 = v_src0 & v_mask;
-                v_src1 = v_src1 & v_mask;
-                v_src2 = v_src2 & v_mask;
+                v_src0 = v_and(v_src0, v_mask);
+                v_src1 = v_and(v_src1, v_mask);
+                v_src2 = v_and(v_src2, v_mask);
                 v_uint32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                 v_expand(v_src0, v_src00, v_src01);
                 v_expand(v_src1, v_src10, v_src11);
@@ -456,12 +456,12 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn
                 v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                 v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
 
-                v_dst00 += v_cvt_f32(v_reinterpret_as_s32(v_src00));
-                v_dst01 += v_cvt_f32(v_reinterpret_as_s32(v_src01));
-                v_dst10 += v_cvt_f32(v_reinterpret_as_s32(v_src10));
-                v_dst11 += v_cvt_f32(v_reinterpret_as_s32(v_src11));
-                v_dst20 += v_cvt_f32(v_reinterpret_as_s32(v_src20));
-                v_dst21 += v_cvt_f32(v_reinterpret_as_s32(v_src21));
+                v_dst00 = v_add(v_dst00, v_cvt_f32(v_reinterpret_as_s32(v_src00)));
+                v_dst01 = v_add(v_dst01, v_cvt_f32(v_reinterpret_as_s32(v_src01)));
+                v_dst10 = v_add(v_dst10, v_cvt_f32(v_reinterpret_as_s32(v_src10)));
+                v_dst11 = v_add(v_dst11, v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+                v_dst20 = v_add(v_dst20, v_cvt_f32(v_reinterpret_as_s32(v_src20)));
+                v_dst21 = v_add(v_dst21, v_cvt_f32(v_reinterpret_as_s32(v_src21)));
 
                 v_store_interleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                 v_store_interleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
@@ -475,9 +475,9 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn
 void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();
 
     if (!mask)
     {
@@ -493,8 +493,8 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
         #else
         for (; x <= size - cVectorWidth; x += cVectorWidth)
         {
-            v_store(dst + x, vx_load(dst + x) + vx_load(src + x));
-            v_store(dst + x + step, vx_load(dst + x + step) + vx_load(src + x + step));
+            v_store(dst + x, v_add(vx_load(dst + x), vx_load(src + x)));
+            v_store(dst + x + step, v_add(vx_load(dst + x + step), vx_load(src + x + step)));
         }
         #endif // CV_AVX && !CV_AVX2
     }
@@ -508,11 +508,11 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
                 v_uint16 v_masku16 = vx_load_expand(mask + x);
                 v_uint32 v_masku320, v_masku321;
                 v_expand(v_masku16, v_masku320, v_masku321);
-                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
-                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));
+                v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_masku320, v_reinterpret_as_u32(v_0))));
+                v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_masku321, v_reinterpret_as_u32(v_0))));
 
-                v_store(dst + x, vx_load(dst + x) + (vx_load(src + x) & v_mask0));
-                v_store(dst + x + step, vx_load(dst + x + step) + (vx_load(src + x + step) & v_mask1));
+                v_store(dst + x, v_add(vx_load(dst + x), v_and(vx_load(src + x), v_mask0)));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(vx_load(src + x + step), v_mask1)));
             }
         }
         else if (cn == 3)
@@ -522,25 +522,25 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
                 v_uint16 v_masku16 = vx_load_expand(mask + x);
                 v_uint32 v_masku320, v_masku321;
                 v_expand(v_masku16, v_masku320, v_masku321);
-                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0)));
-                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0)));
+                v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_masku320, v_reinterpret_as_u32(v_0))));
+                v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_masku321, v_reinterpret_as_u32(v_0))));
 
                 v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                 v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
                 v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
-                v_src00 = v_src00 & v_mask0;
-                v_src01 = v_src01 & v_mask1;
-                v_src10 = v_src10 & v_mask0;
-                v_src11 = v_src11 & v_mask1;
-                v_src20 = v_src20 & v_mask0;
-                v_src21 = v_src21 & v_mask1;
+                v_src00 = v_and(v_src00, v_mask0);
+                v_src01 = v_and(v_src01, v_mask1);
+                v_src10 = v_and(v_src10, v_mask0);
+                v_src11 = v_and(v_src11, v_mask1);
+                v_src20 = v_and(v_src20, v_mask0);
+                v_src21 = v_and(v_src21, v_mask1);
 
                 v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                 v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                 v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
 
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
+                v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
             }
         }
     }
@@ -551,9 +551,9 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
 void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_uint8::nlanes;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_uint8>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -586,14 +586,14 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn
             v_float64 v_dst6 = vx_load(dst + x + step * 6);
             v_float64 v_dst7 = vx_load(dst + x + step * 7);
 
-            v_dst0 = v_dst0 + v_src0;
-            v_dst1 = v_dst1 + v_src1;
-            v_dst2 = v_dst2 + v_src2;
-            v_dst3 = v_dst3 + v_src3;
-            v_dst4 = v_dst4 + v_src4;
-            v_dst5 = v_dst5 + v_src5;
-            v_dst6 = v_dst6 + v_src6;
-            v_dst7 = v_dst7 + v_src7;
+            v_dst0 = v_add(v_dst0, v_src0);
+            v_dst1 = v_add(v_dst1, v_src1);
+            v_dst2 = v_add(v_dst2, v_src2);
+            v_dst3 = v_add(v_dst3, v_src3);
+            v_dst4 = v_add(v_dst4, v_src4);
+            v_dst5 = v_add(v_dst5, v_src5);
+            v_dst6 = v_add(v_dst6, v_src6);
+            v_dst7 = v_add(v_dst7, v_src7);
 
             v_store(dst + x, v_dst0);
             v_store(dst + x + step, v_dst1);
@@ -613,9 +613,9 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn
             for ( ; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint8 v_mask = vx_load(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_uint8 v_src  = vx_load(src + x);
-                v_src = v_src & v_mask;
+                v_src = v_and(v_src, v_mask);
                 v_uint16 v_int0, v_int1;
                 v_expand(v_src, v_int0, v_int1);
 
@@ -641,14 +641,14 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn
                 v_float64 v_dst6 = vx_load(dst + x + step * 6);
                 v_float64 v_dst7 = vx_load(dst + x + step * 7);
 
-                v_dst0 = v_dst0 + v_src0;
-                v_dst1 = v_dst1 + v_src1;
-                v_dst2 = v_dst2 + v_src2;
-                v_dst3 = v_dst3 + v_src3;
-                v_dst4 = v_dst4 + v_src4;
-                v_dst5 = v_dst5 + v_src5;
-                v_dst6 = v_dst6 + v_src6;
-                v_dst7 = v_dst7 + v_src7;
+                v_dst0 = v_add(v_dst0, v_src0);
+                v_dst1 = v_add(v_dst1, v_src1);
+                v_dst2 = v_add(v_dst2, v_src2);
+                v_dst3 = v_add(v_dst3, v_src3);
+                v_dst4 = v_add(v_dst4, v_src4);
+                v_dst5 = v_add(v_dst5, v_src5);
+                v_dst6 = v_add(v_dst6, v_src6);
+                v_dst7 = v_add(v_dst7, v_src7);
 
                 v_store(dst + x, v_dst0);
                 v_store(dst + x + step, v_dst1);
@@ -665,12 +665,12 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn
             for ( ; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint8 v_mask = vx_load(mask + x);
-                v_mask = ~(v_0 == v_mask);
+                v_mask = v_not(v_eq(v_0, v_mask));
                 v_uint8 v_src0, v_src1, v_src2;
                 v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2);
-                v_src0 = v_src0 & v_mask;
-                v_src1 = v_src1 & v_mask;
-                v_src2 = v_src2 & v_mask;
+                v_src0 = v_and(v_src0, v_mask);
+                v_src1 = v_and(v_src1, v_mask);
+                v_src2 = v_and(v_src2, v_mask);
                 v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                 v_expand(v_src0, v_src00, v_src01);
                 v_expand(v_src1, v_src10, v_src11);
@@ -726,14 +726,14 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn
                 v_load_deinterleave(dst + ((x + step * 6) * cn), v_dst0110, v_dst1110, v_dst2110);
                 v_load_deinterleave(dst + ((x + step * 7) * cn), v_dst0111, v_dst1111, v_dst2111);
 
-                v_store_interleave(dst + (x * cn), v_dst0000 + v_src0000, v_dst1000 + v_src1000, v_dst2000 + v_src2000);
-                v_store_interleave(dst + ((x + step) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001);
-                v_store_interleave(dst + ((x + step * 2) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010);
-                v_store_interleave(dst + ((x + step * 3) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011);
-                v_store_interleave(dst + ((x + step * 4) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100);
-                v_store_interleave(dst + ((x + step * 5) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101);
-                v_store_interleave(dst + ((x + step * 6) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110);
-                v_store_interleave(dst + ((x + step * 7) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111);
+                v_store_interleave(dst + (x * cn), v_add(v_dst0000, v_src0000), v_add(v_dst1000, v_src1000), v_add(v_dst2000, v_src2000));
+                v_store_interleave(dst + ((x + step) * cn), v_add(v_dst0001, v_src0001), v_add(v_dst1001, v_src1001), v_add(v_dst2001, v_src2001));
+                v_store_interleave(dst + ((x + step * 2) * cn), v_add(v_dst0010, v_src0010), v_add(v_dst1010, v_src1010), v_add(v_dst2010, v_src2010));
+                v_store_interleave(dst + ((x + step * 3) * cn), v_add(v_dst0011, v_src0011), v_add(v_dst1011, v_src1011), v_add(v_dst2011, v_src2011));
+                v_store_interleave(dst + ((x + step * 4) * cn), v_add(v_dst0100, v_src0100), v_add(v_dst1100, v_src1100), v_add(v_dst2100, v_src2100));
+                v_store_interleave(dst + ((x + step * 5) * cn), v_add(v_dst0101, v_src0101), v_add(v_dst1101, v_src1101), v_add(v_dst2101, v_src2101));
+                v_store_interleave(dst + ((x + step * 6) * cn), v_add(v_dst0110, v_src0110), v_add(v_dst1110, v_src1110), v_add(v_dst2110, v_src2110));
+                v_store_interleave(dst + ((x + step * 7) * cn), v_add(v_dst0111, v_src0111), v_add(v_dst1111, v_src1111), v_add(v_dst2111, v_src2111));
             }
         }
     }
@@ -744,9 +744,9 @@ void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn
 void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -767,10 +767,10 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
             v_float64 v_dst2 = vx_load(dst + x + step * 2);
             v_float64 v_dst3 = vx_load(dst + x + step * 3);
 
-            v_dst0 = v_dst0 + v_src0;
-            v_dst1 = v_dst1 + v_src1;
-            v_dst2 = v_dst2 + v_src2;
-            v_dst3 = v_dst3 + v_src3;
+            v_dst0 = v_add(v_dst0, v_src0);
+            v_dst1 = v_add(v_dst1, v_src1);
+            v_dst2 = v_add(v_dst2, v_src2);
+            v_dst3 = v_add(v_dst3, v_src3);
 
             v_store(dst + x, v_dst0);
             v_store(dst + x + step, v_dst1);
@@ -786,9 +786,9 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
             for ( ; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_uint16 v_src  = vx_load(src + x);
-                v_src = v_src & v_mask;
+                v_src = v_and(v_src, v_mask);
                 v_uint32 v_int0, v_int1;
                 v_expand(v_src, v_int0, v_int1);
 
@@ -802,10 +802,10 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
                 v_float64 v_dst2 = vx_load(dst + x + step * 2);
                 v_float64 v_dst3 = vx_load(dst + x + step * 3);
 
-                v_dst0 = v_dst0 + v_src0;
-                v_dst1 = v_dst1 + v_src1;
-                v_dst2 = v_dst2 + v_src2;
-                v_dst3 = v_dst3 + v_src3;
+                v_dst0 = v_add(v_dst0, v_src0);
+                v_dst1 = v_add(v_dst1, v_src1);
+                v_dst2 = v_add(v_dst2, v_src2);
+                v_dst3 = v_add(v_dst3, v_src3);
 
                 v_store(dst + x, v_dst0);
                 v_store(dst + x + step, v_dst1);
@@ -818,12 +818,12 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
             for ( ; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_uint16 v_src0, v_src1, v_src2;
                 v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                v_src0 = v_src0 & v_mask;
-                v_src1 = v_src1 & v_mask;
-                v_src2 = v_src2 & v_mask;
+                v_src0 = v_and(v_src0, v_mask);
+                v_src1 = v_and(v_src1, v_mask);
+                v_src2 = v_and(v_src2, v_mask);
                 v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
                 v_expand(v_src0, v_int00, v_int01);
                 v_expand(v_src1, v_int10, v_int11);
@@ -848,10 +848,10 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
                 v_load_deinterleave(dst + (x + step * 2) * cn, v_dst02, v_dst12, v_dst22);
                 v_load_deinterleave(dst + (x + step * 3) * cn, v_dst03, v_dst13, v_dst23);
 
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
-                v_store_interleave(dst + (x + step * 2) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22);
-                v_store_interleave(dst + (x + step * 3) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23);
+                v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
+                v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
+                v_store_interleave(dst + (x + step * 2) * cn, v_add(v_dst02, v_src02), v_add(v_dst12, v_src12), v_add(v_dst22, v_src22));
+                v_store_interleave(dst + (x + step * 3) * cn, v_add(v_dst03, v_src03), v_add(v_dst13, v_src13), v_add(v_dst23, v_src23));
             }
         }
     }
@@ -862,9 +862,9 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c
 void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_float32::nlanes;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_float32>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -889,8 +889,8 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
             v_float64 v_src0 = v_cvt_f64(v_src);
             v_float64 v_src1 = v_cvt_f64_high(v_src);
 
-            v_store(dst + x, vx_load(dst + x) + v_src0);
-            v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
+            v_store(dst + x, v_add(vx_load(dst + x), v_src0));
+            v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1));
         }
         #endif // CV_AVX && !CV_AVX2
     }
@@ -904,15 +904,15 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
                 v_uint32 v_masku32 = vx_load_expand_q(mask + x);
                 v_uint64 v_masku640, v_masku641;
                 v_expand(v_masku32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
 
                 v_float32 v_src = vx_load(src + x);
-                v_float64 v_src0 = v_cvt_f64(v_src) & v_mask0;
-                v_float64 v_src1 = v_cvt_f64_high(v_src) & v_mask1;
+                v_float64 v_src0 = v_and(v_cvt_f64(v_src), v_mask0);
+                v_float64 v_src1 = v_and(v_cvt_f64_high(v_src), v_mask1);
 
-                v_store(dst + x, vx_load(dst + x) + v_src0);
-                v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
+                v_store(dst + x, v_add(vx_load(dst + x), v_src0));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1));
             }
         }
         else if (cn == 3)
@@ -922,24 +922,24 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
                 v_uint32 v_masku32 = vx_load_expand_q(mask + x);
                 v_uint64 v_masku640, v_masku641;
                 v_expand(v_masku32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
 
                 v_float32 v_src0, v_src1, v_src2;
                 v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                v_float64 v_src00 = v_cvt_f64(v_src0) & v_mask0;
-                v_float64 v_src01 = v_cvt_f64_high(v_src0) & v_mask1;
-                v_float64 v_src10 = v_cvt_f64(v_src1) & v_mask0;
-                v_float64 v_src11 = v_cvt_f64_high(v_src1) & v_mask1;
-                v_float64 v_src20 = v_cvt_f64(v_src2) & v_mask0;
-                v_float64 v_src21 = v_cvt_f64_high(v_src2) & v_mask1;
+                v_float64 v_src00 = v_and(v_cvt_f64(v_src0), v_mask0);
+                v_float64 v_src01 = v_and(v_cvt_f64_high(v_src0), v_mask1);
+                v_float64 v_src10 = v_and(v_cvt_f64(v_src1), v_mask0);
+                v_float64 v_src11 = v_and(v_cvt_f64_high(v_src1), v_mask1);
+                v_float64 v_src20 = v_and(v_cvt_f64(v_src2), v_mask0);
+                v_float64 v_src21 = v_and(v_cvt_f64_high(v_src2), v_mask1);
 
                 v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                 v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                 v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
 
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
+                v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
             }
         }
     }
@@ -950,9 +950,9 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn
 void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_float64::nlanes * 2;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_float64>::vlanes() * 2;
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -971,8 +971,8 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
             v_float64 v_src0 = vx_load(src + x);
             v_float64 v_src1 = vx_load(src + x + step);
 
-            v_store(dst + x, vx_load(dst + x) + v_src0);
-            v_store(dst + x + step, vx_load(dst + x + step) + v_src1);
+            v_store(dst + x, v_add(vx_load(dst + x), v_src0));
+            v_store(dst + x + step, v_add(vx_load(dst + x + step), v_src1));
         }
         #endif // CV_AVX && !CV_AVX2
     }
@@ -986,14 +986,14 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
                 v_uint32 v_masku32 = vx_load_expand_q(mask + x);
                 v_uint64 v_masku640, v_masku641;
                 v_expand(v_masku32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
 
                 v_float64 v_src0 = vx_load(src + x);
                 v_float64 v_src1 = vx_load(src + x + step);
 
-                v_store(dst + x, vx_load(dst + x) + (v_src0 & v_mask0));
-                v_store(dst + x + step, vx_load(dst + x + step) + (v_src1 & v_mask1));
+                v_store(dst + x, v_add(vx_load(dst + x), v_and(v_src0, v_mask0)));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_src1, v_mask1)));
             }
         }
         else if (cn == 3)
@@ -1003,25 +1003,25 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
                 v_uint32 v_masku32 = vx_load_expand_q(mask + x);
                 v_uint64 v_masku640, v_masku641;
                 v_expand(v_masku32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
 
                 v_float64 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
                 v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
                 v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
-                v_src00 = v_src00 & v_mask0;
-                v_src01 = v_src01 & v_mask1;
-                v_src10 = v_src10 & v_mask0;
-                v_src11 = v_src11 & v_mask1;
-                v_src20 = v_src20 & v_mask0;
-                v_src21 = v_src21 & v_mask1;
+                v_src00 = v_and(v_src00, v_mask0);
+                v_src01 = v_and(v_src01, v_mask1);
+                v_src10 = v_and(v_src10, v_mask0);
+                v_src11 = v_and(v_src11, v_mask1);
+                v_src20 = v_and(v_src20, v_mask0);
+                v_src21 = v_and(v_src21, v_mask1);
 
                 v_float64 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
                 v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                 v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
 
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
+                v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
             }
         }
     }
@@ -1033,9 +1033,9 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c
 void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD
-    const int cVectorWidth = v_uint8::nlanes;
-    const int step = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int cVectorWidth = VTraits<v_uint8>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();
 
     if (!mask)
     {
@@ -1052,10 +1052,10 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int
             v_expand(v_src0, v_src00, v_src01);
             v_expand(v_src1, v_src10, v_src11);
 
-            v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
-            v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
-            v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
-            v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+            v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00))));
+            v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01))));
+            v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10))));
+            v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11))));
         }
     }
     else
@@ -1066,9 +1066,9 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int
             for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
             {
                 v_uint8 v_mask = vx_load(mask + x);
-                v_mask = ~(v_0 == v_mask);
+                v_mask = v_not(v_eq(v_0, v_mask));
                 v_uint8 v_src = vx_load(src + x);
-                v_src = v_src & v_mask;
+                v_src = v_and(v_src, v_mask);
                 v_uint16 v_src0, v_src1;
                 v_expand(v_src, v_src0, v_src1);
                 v_src0 = v_mul_wrap(v_src0, v_src0);
@@ -1078,10 +1078,10 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int
                 v_expand(v_src0, v_src00, v_src01);
                 v_expand(v_src1, v_src10, v_src11);
 
-                v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
-                v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
-                v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
-                v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+                v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00))));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01))));
+                v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10))));
+                v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11))));
             }
         }
         else if (cn == 3)
@@ -1089,13 +1089,13 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int
             for ( ; x <= len - cVectorWidth ; x += cVectorWidth)
             {
                 v_uint8 v_mask = vx_load(mask + x);
-                v_mask = ~(v_0 == v_mask);
+                v_mask = v_not(v_eq(v_0, v_mask));
 
                 v_uint8 v_src0, v_src1, v_src2;
                 v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                v_src0 = v_src0 & v_mask;
-                v_src1 = v_src1 & v_mask;
-                v_src2 = v_src2 & v_mask;
+                v_src0 = v_and(v_src0, v_mask);
+                v_src1 = v_and(v_src1, v_mask);
+                v_src2 = v_and(v_src2, v_mask);
 
                 v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                 v_expand(v_src0, v_src00, v_src01);
@@ -1126,20 +1126,20 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int
                 v_load_deinterleave(dst + (x + step * 2) * cn, v_dst010, v_dst110, v_dst210);
                 v_load_deinterleave(dst + (x + step * 3) * cn, v_dst011, v_dst111, v_dst211);
 
-                v_dst000 += v_cvt_f32(v_reinterpret_as_s32(v_src000));
-                v_dst001 += v_cvt_f32(v_reinterpret_as_s32(v_src001));
-                v_dst010 += v_cvt_f32(v_reinterpret_as_s32(v_src010));
-                v_dst011 += v_cvt_f32(v_reinterpret_as_s32(v_src011));
+                v_dst000 = v_add(v_dst000, v_cvt_f32(v_reinterpret_as_s32(v_src000)));
+                v_dst001 = v_add(v_dst001, v_cvt_f32(v_reinterpret_as_s32(v_src001)));
+                v_dst010 = v_add(v_dst010, v_cvt_f32(v_reinterpret_as_s32(v_src010)));
+                v_dst011 = v_add(v_dst011, v_cvt_f32(v_reinterpret_as_s32(v_src011)));
 
-                v_dst100 += v_cvt_f32(v_reinterpret_as_s32(v_src100));
-                v_dst101 += v_cvt_f32(v_reinterpret_as_s32(v_src101));
-                v_dst110 += v_cvt_f32(v_reinterpret_as_s32(v_src110));
-                v_dst111 += v_cvt_f32(v_reinterpret_as_s32(v_src111));
+                v_dst100 = v_add(v_dst100, v_cvt_f32(v_reinterpret_as_s32(v_src100)));
+                v_dst101 = v_add(v_dst101, v_cvt_f32(v_reinterpret_as_s32(v_src101)));
+                v_dst110 = v_add(v_dst110, v_cvt_f32(v_reinterpret_as_s32(v_src110)));
+                v_dst111 = v_add(v_dst111, v_cvt_f32(v_reinterpret_as_s32(v_src111)));
 
-                v_dst200 += v_cvt_f32(v_reinterpret_as_s32(v_src200));
-                v_dst201 += v_cvt_f32(v_reinterpret_as_s32(v_src201));
-                v_dst210 += v_cvt_f32(v_reinterpret_as_s32(v_src210));
-                v_dst211 += v_cvt_f32(v_reinterpret_as_s32(v_src211));
+                v_dst200 = v_add(v_dst200, v_cvt_f32(v_reinterpret_as_s32(v_src200)));
+                v_dst201 = v_add(v_dst201, v_cvt_f32(v_reinterpret_as_s32(v_src201)));
+                v_dst210 = v_add(v_dst210, v_cvt_f32(v_reinterpret_as_s32(v_src210)));
+                v_dst211 = v_add(v_dst211, v_cvt_f32(v_reinterpret_as_s32(v_src211)));
 
                 v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
                 v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
@@ -1155,9 +1155,9 @@ void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int
 void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();
 
     if (!mask)
     {
@@ -1186,13 +1186,13 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int
                 v_uint16 v_mask16 = vx_load_expand(mask + x);
                 v_uint32 v_mask0, v_mask1;
                 v_expand(v_mask16, v_mask0, v_mask1);
-                v_mask0 = ~(v_mask0 == v_0);
-                v_mask1 = ~(v_mask1 == v_0);
+                v_mask0 = v_not(v_eq(v_mask0, v_0));
+                v_mask1 = v_not(v_eq(v_mask1, v_0));
                 v_uint16 v_src = vx_load(src + x);
                 v_uint32 v_src0, v_src1;
                 v_expand(v_src, v_src0, v_src1);
-                v_src0 = v_src0 & v_mask0;
-                v_src1 = v_src1 & v_mask1;
+                v_src0 = v_and(v_src0, v_mask0);
+                v_src1 = v_and(v_src1, v_mask1);
 
                 v_float32 v_float0, v_float1;
                 v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0));
@@ -1209,8 +1209,8 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int
                 v_uint16 v_mask16 = vx_load_expand(mask + x);
                 v_uint32 v_mask0, v_mask1;
                 v_expand(v_mask16, v_mask0, v_mask1);
-                v_mask0 = ~(v_mask0 == v_0);
-                v_mask1 = ~(v_mask1 == v_0);
+                v_mask0 = v_not(v_eq(v_mask0, v_0));
+                v_mask1 = v_not(v_eq(v_mask1, v_0));
 
                 v_uint16 v_src0, v_src1, v_src2;
                 v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
@@ -1218,12 +1218,12 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int
                 v_expand(v_src0, v_int00, v_int01);
                 v_expand(v_src1, v_int10, v_int11);
                 v_expand(v_src2, v_int20, v_int21);
-                v_int00 = v_int00 & v_mask0;
-                v_int01 = v_int01 & v_mask1;
-                v_int10 = v_int10 & v_mask0;
-                v_int11 = v_int11 & v_mask1;
-                v_int20 = v_int20 & v_mask0;
-                v_int21 = v_int21 & v_mask1;
+                v_int00 = v_and(v_int00, v_mask0);
+                v_int01 = v_and(v_int01, v_mask1);
+                v_int10 = v_and(v_int10, v_mask0);
+                v_int11 = v_and(v_int11, v_mask1);
+                v_int20 = v_and(v_int20, v_mask0);
+                v_int21 = v_and(v_int21, v_mask1);
 
                 v_float32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                 v_src00 = v_cvt_f32(v_reinterpret_as_s32(v_int00));
@@ -1256,9 +1256,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int
 void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();
 
     if (!mask)
     {
@@ -1293,12 +1293,12 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int
                 v_uint16 v_mask16 = vx_load_expand(mask + x);
                 v_uint32 v_mask_0, v_mask_1;
                 v_expand(v_mask16, v_mask_0, v_mask_1);
-                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
-                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
+                v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask_0, v_0)));
+                v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask_1, v_0)));
                 v_float32 v_src0 = vx_load(src + x);
                 v_float32 v_src1 = vx_load(src + x + step);
-                v_src0 = v_src0 & v_mask0;
-                v_src1 = v_src1 & v_mask1;
+                v_src0 = v_and(v_src0, v_mask0);
+                v_src1 = v_and(v_src1, v_mask1);
 
                 v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
                 v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
@@ -1311,18 +1311,18 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int
                 v_uint16 v_mask16 = vx_load_expand(mask + x);
                 v_uint32 v_mask_0, v_mask_1;
                 v_expand(v_mask16, v_mask_0, v_mask_1);
-                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0));
-                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0));
+                v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask_0, v_0)));
+                v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask_1, v_0)));
 
                 v_float32 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21;
                 v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
                 v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
-                v_src00 = v_src00 & v_mask0;
-                v_src01 = v_src01 & v_mask1;
-                v_src10 = v_src10 & v_mask0;
-                v_src11 = v_src11 & v_mask1;
-                v_src20 = v_src20 & v_mask0;
-                v_src21 = v_src21 & v_mask1;
+                v_src00 = v_and(v_src00, v_mask0);
+                v_src01 = v_and(v_src01, v_mask1);
+                v_src10 = v_and(v_src10, v_mask0);
+                v_src11 = v_and(v_src11, v_mask1);
+                v_src20 = v_and(v_src20, v_mask0);
+                v_src21 = v_and(v_src21, v_mask1);
 
                 v_float32 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21;
                 v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
@@ -1347,9 +1347,9 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int
 void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -1390,9 +1390,9 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_uint16 v_src = vx_load_expand(src + x);
-                v_uint16 v_int = v_src & v_mask;
+                v_uint16 v_int = v_and(v_src, v_mask);
 
                 v_uint32 v_int0, v_int1;
                 v_expand(v_int, v_int0, v_int1);
@@ -1430,10 +1430,10 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int
                 v_uint16 v_int2 = v_expand_low(v_src2);
 
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_mask == v_0);
-                v_int0 = v_int0 & v_mask;
-                v_int1 = v_int1 & v_mask;
-                v_int2 = v_int2 & v_mask;
+                v_mask = v_not(v_eq(v_mask, v_0));
+                v_int0 = v_and(v_int0, v_mask);
+                v_int1 = v_and(v_int1, v_mask);
+                v_int2 = v_and(v_int2, v_mask);
 
                 v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
                 v_expand(v_int0, v_int00, v_int01);
@@ -1486,9 +1486,9 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int
 void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -1531,9 +1531,9 @@ void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, in
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_uint16 v_src = vx_load(src + x);
-                v_src = v_src & v_mask;
+                v_src = v_and(v_src, v_mask);
                 v_uint32 v_int_0, v_int_1;
                 v_expand(v_src, v_int_0, v_int_1);
 
@@ -1566,12 +1566,12 @@ void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, in
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_uint16 v_src0, v_src1, v_src2;
                 v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                v_src0 = v_src0 & v_mask;
-                v_src1 = v_src1 & v_mask;
-                v_src2 = v_src2 & v_mask;
+                v_src0 = v_and(v_src0, v_mask);
+                v_src1 = v_and(v_src1, v_mask);
+                v_src2 = v_and(v_src2, v_mask);
                 v_uint32 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21;
                 v_expand(v_src0, v_int00, v_int01);
                 v_expand(v_src1, v_int10, v_int11);
@@ -1625,9 +1625,9 @@ void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, in
 void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_float32::nlanes;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_float32>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -1667,9 +1667,9 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint32 v_mask = vx_load_expand_q(mask + x);;
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_float32 v_src = vx_load(src + x);
-                v_src = v_src & v_reinterpret_as_f32(v_mask);
+                v_src = v_and(v_src, v_reinterpret_as_f32(v_mask));
                 v_float64 v_src0 = v_cvt_f64(v_src);
                 v_float64 v_src1 = v_cvt_f64_high(v_src);
 
@@ -1682,13 +1682,13 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint32 v_mask = vx_load_expand_q(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
 
                 v_float32 v_src0, v_src1, v_src2;
                 v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2);
-                v_src0 = v_src0 & v_reinterpret_as_f32(v_mask);
-                v_src1 = v_src1 & v_reinterpret_as_f32(v_mask);
-                v_src2 = v_src2 & v_reinterpret_as_f32(v_mask);
+                v_src0 = v_and(v_src0, v_reinterpret_as_f32(v_mask));
+                v_src1 = v_and(v_src1, v_reinterpret_as_f32(v_mask));
+                v_src2 = v_and(v_src2, v_reinterpret_as_f32(v_mask));
 
                 v_float64 v_src00 = v_cvt_f64(v_src0);
                 v_float64 v_src01 = v_cvt_f64_high(v_src0);
@@ -1720,9 +1720,9 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int
 void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_float64::nlanes * 2;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_float64>::vlanes() * 2;
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -1756,12 +1756,12 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in
                 v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                 v_uint64 v_masku640, v_masku641;
                 v_expand(v_mask32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
                 v_float64 v_src0 = vx_load(src + x);
                 v_float64 v_src1 = vx_load(src + x + step);
-                v_src0 = v_src0 & v_mask0;
-                v_src1 = v_src1 & v_mask1;
+                v_src0 = v_and(v_src0, v_mask0);
+                v_src1 = v_and(v_src1, v_mask1);
                 v_store(dst + x, v_fma(v_src0, v_src0, vx_load(dst + x)));
                 v_store(dst + x + step, v_fma(v_src1, v_src1, vx_load(dst + x + step)));
             }
@@ -1773,18 +1773,18 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in
                 v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                 v_uint64 v_masku640, v_masku641;
                 v_expand(v_mask32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
 
                 v_float64 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                 v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20);
                 v_load_deinterleave(src + (x + step) * cn, v_src01, v_src11, v_src21);
-                v_src00 = v_src00 & v_mask0;
-                v_src01 = v_src01 & v_mask1;
-                v_src10 = v_src10 & v_mask0;
-                v_src11 = v_src11 & v_mask1;
-                v_src20 = v_src20 & v_mask0;
-                v_src21 = v_src21 & v_mask1;
+                v_src00 = v_and(v_src00, v_mask0);
+                v_src01 = v_and(v_src01, v_mask1);
+                v_src10 = v_and(v_src10, v_mask0);
+                v_src11 = v_and(v_src11, v_mask1);
+                v_src20 = v_and(v_src20, v_mask0);
+                v_src21 = v_and(v_src21, v_mask1);
 
                 v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                 v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
@@ -1810,9 +1810,9 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in
 void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD
-    const int cVectorWidth = v_uint8::nlanes;
-    const int step = v_uint32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int cVectorWidth = VTraits<v_uint8>::vlanes();
+    const int step = VTraits<v_uint32>::vlanes();
 
     if (!mask)
     {
@@ -1829,10 +1829,10 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar
             v_expand(v_src0, v_src00, v_src01);
             v_expand(v_src1, v_src10, v_src11);
 
-            v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
-            v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
-            v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
-            v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+            v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00))));
+            v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01))));
+            v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10))));
+            v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11))));
         }
     }
     else
@@ -1843,11 +1843,11 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint8 v_mask = vx_load(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_uint8 v_1src = vx_load(src1 + x);
                 v_uint8 v_2src = vx_load(src2 + x);
-                v_1src = v_1src & v_mask;
-                v_2src = v_2src & v_mask;
+                v_1src = v_and(v_1src, v_mask);
+                v_2src = v_and(v_2src, v_mask);
 
                 v_uint16 v_src0, v_src1;
                 v_mul_expand(v_1src, v_2src, v_src0, v_src1);
@@ -1856,10 +1856,10 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar
                 v_expand(v_src0, v_src00, v_src01);
                 v_expand(v_src1, v_src10, v_src11);
 
-                v_store(dst + x, vx_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00)));
-                v_store(dst + x + step, vx_load(dst + x + step) + v_cvt_f32(v_reinterpret_as_s32(v_src01)));
-                v_store(dst + x + step * 2, vx_load(dst + x + step * 2) + v_cvt_f32(v_reinterpret_as_s32(v_src10)));
-                v_store(dst + x + step * 3, vx_load(dst + x + step * 3) + v_cvt_f32(v_reinterpret_as_s32(v_src11)));
+                v_store(dst + x, v_add(vx_load(dst + x), v_cvt_f32(v_reinterpret_as_s32(v_src00))));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_cvt_f32(v_reinterpret_as_s32(v_src01))));
+                v_store(dst + x + step * 2, v_add(vx_load(dst + x + step * 2), v_cvt_f32(v_reinterpret_as_s32(v_src10))));
+                v_store(dst + x + step * 3, v_add(vx_load(dst + x + step * 3), v_cvt_f32(v_reinterpret_as_s32(v_src11))));
             }
         }
         else if (cn == 3)
@@ -1867,16 +1867,16 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint8 v_mask = vx_load(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_uint8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                 v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                 v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
-                v_1src0 = v_1src0 & v_mask;
-                v_1src1 = v_1src1 & v_mask;
-                v_1src2 = v_1src2 & v_mask;
-                v_2src0 = v_2src0 & v_mask;
-                v_2src1 = v_2src1 & v_mask;
-                v_2src2 = v_2src2 & v_mask;
+                v_1src0 = v_and(v_1src0, v_mask);
+                v_1src1 = v_and(v_1src1, v_mask);
+                v_1src2 = v_and(v_1src2, v_mask);
+                v_2src0 = v_and(v_2src0, v_mask);
+                v_2src1 = v_and(v_2src1, v_mask);
+                v_2src2 = v_and(v_2src2, v_mask);
 
                 v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                 v_mul_expand(v_1src0, v_2src0, v_src00, v_src01);
@@ -1896,18 +1896,18 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar
                 v_load_deinterleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
                 v_load_deinterleave(dst + (x + step * 2) * cn, v_dst002, v_dst102, v_dst202);
                 v_load_deinterleave(dst + (x + step * 3) * cn, v_dst003, v_dst103, v_dst203);
-                v_dst000 = v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000));
-                v_dst001 = v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001));
-                v_dst002 = v_dst002 + v_cvt_f32(v_reinterpret_as_s32(v_src002));
-                v_dst003 = v_dst003 + v_cvt_f32(v_reinterpret_as_s32(v_src003));
-                v_dst100 = v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100));
-                v_dst101 = v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101));
-                v_dst102 = v_dst102 + v_cvt_f32(v_reinterpret_as_s32(v_src102));
-                v_dst103 = v_dst103 + v_cvt_f32(v_reinterpret_as_s32(v_src103));
-                v_dst200 = v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200));
-                v_dst201 = v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201));
-                v_dst202 = v_dst202 + v_cvt_f32(v_reinterpret_as_s32(v_src202));
-                v_dst203 = v_dst203 + v_cvt_f32(v_reinterpret_as_s32(v_src203));
+                v_dst000 = v_add(v_dst000, v_cvt_f32(v_reinterpret_as_s32(v_src000)));
+                v_dst001 = v_add(v_dst001, v_cvt_f32(v_reinterpret_as_s32(v_src001)));
+                v_dst002 = v_add(v_dst002, v_cvt_f32(v_reinterpret_as_s32(v_src002)));
+                v_dst003 = v_add(v_dst003, v_cvt_f32(v_reinterpret_as_s32(v_src003)));
+                v_dst100 = v_add(v_dst100, v_cvt_f32(v_reinterpret_as_s32(v_src100)));
+                v_dst101 = v_add(v_dst101, v_cvt_f32(v_reinterpret_as_s32(v_src101)));
+                v_dst102 = v_add(v_dst102, v_cvt_f32(v_reinterpret_as_s32(v_src102)));
+                v_dst103 = v_add(v_dst103, v_cvt_f32(v_reinterpret_as_s32(v_src103)));
+                v_dst200 = v_add(v_dst200, v_cvt_f32(v_reinterpret_as_s32(v_src200)));
+                v_dst201 = v_add(v_dst201, v_cvt_f32(v_reinterpret_as_s32(v_src201)));
+                v_dst202 = v_add(v_dst202, v_cvt_f32(v_reinterpret_as_s32(v_src202)));
+                v_dst203 = v_add(v_dst203, v_cvt_f32(v_reinterpret_as_s32(v_src203)));
 
                 v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200);
                 v_store_interleave(dst + (x + step) * cn, v_dst001, v_dst101, v_dst201);
@@ -1923,9 +1923,9 @@ void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar
 void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();
 
     if (!mask)
     {
@@ -1956,10 +1956,10 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_0 == v_mask);
+                v_mask = v_not(v_eq(v_0, v_mask));
 
-                v_uint16 v_1src = vx_load(src1 + x) & v_mask;
-                v_uint16 v_2src = vx_load(src2 + x) & v_mask;
+                v_uint16 v_1src = v_and(vx_load(src1 + x), v_mask);
+                v_uint16 v_2src = v_and(vx_load(src2 + x), v_mask);
 
                 v_uint32 v_1src0, v_1src1, v_2src0, v_2src1;
                 v_expand(v_1src, v_1src0, v_1src1);
@@ -1979,17 +1979,17 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_0 == v_mask);
+                v_mask = v_not(v_eq(v_0, v_mask));
 
                 v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                 v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                 v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
-                v_1src0 = v_1src0 & v_mask;
-                v_1src1 = v_1src1 & v_mask;
-                v_1src2 = v_1src2 & v_mask;
-                v_2src0 = v_2src0 & v_mask;
-                v_2src1 = v_2src1 & v_mask;
-                v_2src2 = v_2src2 & v_mask;
+                v_1src0 = v_and(v_1src0, v_mask);
+                v_1src1 = v_and(v_1src1, v_mask);
+                v_1src2 = v_and(v_1src2, v_mask);
+                v_2src0 = v_and(v_2src0, v_mask);
+                v_2src1 = v_and(v_2src1, v_mask);
+                v_2src2 = v_and(v_2src2, v_mask);
 
                 v_uint32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
                 v_expand(v_1src0, v_1src00, v_1src01);
@@ -2035,9 +2035,9 @@ void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uch
 void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();
 
     if (!mask)
     {
@@ -2069,11 +2069,11 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
             {
                 v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
                 v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
-                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
-                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
+                v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_0, v_0)));
+                v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_1, v_0)));
 
-                v_store(dst + x, vx_load(dst + x) + ((vx_load(src1 + x) * vx_load(src2 + x)) & v_mask0));
-                v_store(dst + x + step, vx_load(dst + x + step) + ((vx_load(src1 + x + step) * vx_load(src2 + x + step)) & v_mask1));
+                v_store(dst + x, v_add(vx_load(dst + x), v_and(v_mul(vx_load(src1 + x), vx_load(src2 + x)), v_mask0)));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_mul(vx_load(src1 + x + step), vx_load(src2 + x + step)), v_mask1)));
             }
         }
         else if (cn == 3)
@@ -2082,8 +2082,8 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
             {
                 v_uint32 v_mask32_0 = vx_load_expand_q(mask + x);
                 v_uint32 v_mask32_1 = vx_load_expand_q(mask + x + step);
-                v_float32 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0));
-                v_float32 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0));
+                v_float32 v_mask0 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_0, v_0)));
+                v_float32 v_mask1 = v_reinterpret_as_f32(v_not(v_eq(v_mask32_1, v_0)));
 
                 v_float32 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
                 v_float32 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
@@ -2096,8 +2096,8 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
                 v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                 v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
 
-                v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0));
-                v_store_interleave(dst + (x + step) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1));
+                v_store_interleave(dst + x * cn, v_add(v_dst00, v_and(v_mul(v_1src00, v_2src00), v_mask0)), v_add(v_dst10, v_and(v_mul(v_1src10, v_2src10), v_mask0)), v_add(v_dst20, v_and(v_mul(v_1src20, v_2src20), v_mask0)));
+                v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_and(v_mul(v_1src01, v_2src01), v_mask1)), v_add(v_dst11, v_and(v_mul(v_1src11, v_2src11), v_mask1)), v_add(v_dst21, v_and(v_mul(v_1src21, v_2src21), v_mask1)));
             }
         }
     }
@@ -2108,9 +2108,9 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar
 void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -2153,9 +2153,9 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_mask == v_0);
-                v_uint16 v_1int = vx_load_expand(src1 + x) & v_mask;
-                v_uint16 v_2int = vx_load_expand(src2 + x) & v_mask;
+                v_mask = v_not(v_eq(v_mask, v_0));
+                v_uint16 v_1int = v_and(vx_load_expand(src1 + x), v_mask);
+                v_uint16 v_2int = v_and(vx_load_expand(src2 + x), v_mask);
 
                 v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
                 v_expand(v_1int, v_1int_0, v_1int_1);
@@ -2198,13 +2198,13 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
                 v_uint16 v_2int2 = v_expand_low(v_2src2);
 
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_mask == v_0);
-                v_1int0 = v_1int0 & v_mask;
-                v_1int1 = v_1int1 & v_mask;
-                v_1int2 = v_1int2 & v_mask;
-                v_2int0 = v_2int0 & v_mask;
-                v_2int1 = v_2int1 & v_mask;
-                v_2int2 = v_2int2 & v_mask;
+                v_mask = v_not(v_eq(v_mask, v_0));
+                v_1int0 = v_and(v_1int0, v_mask);
+                v_1int1 = v_and(v_1int1, v_mask);
+                v_1int2 = v_and(v_1int2, v_mask);
+                v_2int0 = v_and(v_2int0, v_mask);
+                v_2int1 = v_and(v_2int1, v_mask);
+                v_2int2 = v_and(v_2int2, v_mask);
 
                 v_uint32 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21;
                 v_uint32 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21;
@@ -2248,9 +2248,9 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha
 void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -2293,11 +2293,11 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_uint16 v_1src = vx_load(src1 + x);
                 v_uint16 v_2src = vx_load(src2 + x);
-                v_1src = v_1src & v_mask;
-                v_2src = v_2src & v_mask;
+                v_1src = v_and(v_1src, v_mask);
+                v_2src = v_and(v_2src, v_mask);
 
                 v_uint32 v_1int_0, v_1int_1, v_2int_0, v_2int_1;
                 v_expand(v_1src, v_1int_0, v_1int_1);
@@ -2329,16 +2329,16 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint16 v_mask = vx_load_expand(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_uint16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                 v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                 v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
-                v_1src0 = v_1src0 & v_mask;
-                v_1src1 = v_1src1 & v_mask;
-                v_1src2 = v_1src2 & v_mask;
-                v_2src0 = v_2src0 & v_mask;
-                v_2src1 = v_2src1 & v_mask;
-                v_2src2 = v_2src2 & v_mask;
+                v_1src0 = v_and(v_1src0, v_mask);
+                v_1src1 = v_and(v_1src1, v_mask);
+                v_1src2 = v_and(v_1src2, v_mask);
+                v_2src0 = v_and(v_2src0, v_mask);
+                v_2src1 = v_and(v_2src1, v_mask);
+                v_2src2 = v_and(v_2src2, v_mask);
 
                 v_uint32 v_1int_00, v_1int_01, v_2int_00, v_2int_01;
                 v_uint32 v_1int_10, v_1int_11, v_2int_10, v_2int_11;
@@ -2398,9 +2398,9 @@ void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uc
 void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_float32::nlanes;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_float32>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -2447,11 +2447,11 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint32 v_mask = vx_load_expand_q(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_float32 v_1src = vx_load(src1 + x);
                 v_float32 v_2src = vx_load(src2 + x);
-                v_1src = v_1src & v_reinterpret_as_f32(v_mask);
-                v_2src = v_2src & v_reinterpret_as_f32(v_mask);
+                v_1src = v_and(v_1src, v_reinterpret_as_f32(v_mask));
+                v_2src = v_and(v_2src, v_reinterpret_as_f32(v_mask));
 
                 v_float64 v_1src0 = v_cvt_f64(v_1src);
                 v_float64 v_1src1 = v_cvt_f64_high(v_1src);
@@ -2467,16 +2467,16 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
             for (; x <= len - cVectorWidth; x += cVectorWidth)
             {
                 v_uint32 v_mask = vx_load_expand_q(mask + x);
-                v_mask = ~(v_mask == v_0);
+                v_mask = v_not(v_eq(v_mask, v_0));
                 v_float32 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2;
                 v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2);
                 v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2);
-                v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask);
-                v_1src1 = v_1src1 & v_reinterpret_as_f32(v_mask);
-                v_1src2 = v_1src2 & v_reinterpret_as_f32(v_mask);
-                v_2src0 = v_2src0 & v_reinterpret_as_f32(v_mask);
-                v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask);
-                v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask);
+                v_1src0 = v_and(v_1src0, v_reinterpret_as_f32(v_mask));
+                v_1src1 = v_and(v_1src1, v_reinterpret_as_f32(v_mask));
+                v_1src2 = v_and(v_1src2, v_reinterpret_as_f32(v_mask));
+                v_2src0 = v_and(v_2src0, v_reinterpret_as_f32(v_mask));
+                v_2src1 = v_and(v_2src1, v_reinterpret_as_f32(v_mask));
+                v_2src2 = v_and(v_2src2, v_reinterpret_as_f32(v_mask));
 
                 v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                 v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
@@ -2501,9 +2501,9 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha
 void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn)
 {
     int x = 0;
-#if CV_SIMD_64F
-    const int cVectorWidth = v_float64::nlanes * 2;
-    const int step = v_float64::nlanes;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+    const int cVectorWidth = VTraits<v_float64>::vlanes() * 2;
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -2542,16 +2542,16 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
                 v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                 v_uint64 v_masku640, v_masku641;
                 v_expand(v_mask32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
 
                 v_float64 v_src00 = vx_load(src1 + x);
                 v_float64 v_src01 = vx_load(src1 + x + step);
                 v_float64 v_src10 = vx_load(src2 + x);
                 v_float64 v_src11 = vx_load(src2 + x + step);
 
-                v_store(dst + x, vx_load(dst + x) + ((v_src00 * v_src10) & v_mask0));
-                v_store(dst + x + step, vx_load(dst + x + step) + ((v_src01 * v_src11) & v_mask1));
+                v_store(dst + x, v_add(vx_load(dst + x), v_and(v_mul(v_src00, v_src10), v_mask0)));
+                v_store(dst + x + step, v_add(vx_load(dst + x + step), v_and(v_mul(v_src01, v_src11), v_mask1)));
             }
         }
         else if (cn == 3)
@@ -2561,8 +2561,8 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
                 v_uint32 v_mask32 = vx_load_expand_q(mask + x);
                 v_uint64 v_masku640, v_masku641;
                 v_expand(v_mask32, v_masku640, v_masku641);
-                v_float64 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0));
-                v_float64 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0));
+                v_float64 v_mask0 = v_reinterpret_as_f64(v_not(v_eq(v_masku640, v_0)));
+                v_float64 v_mask1 = v_reinterpret_as_f64(v_not(v_eq(v_masku641, v_0)));
 
                 v_float64 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21;
                 v_float64 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21;
@@ -2570,19 +2570,19 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
                 v_load_deinterleave(src1 + (x + step) * cn, v_1src01, v_1src11, v_1src21);
                 v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20);
                 v_load_deinterleave(src2 + (x + step) * cn, v_2src01, v_2src11, v_2src21);
-                v_float64 v_src00 = (v_1src00 & v_mask0) * v_2src00;
-                v_float64 v_src01 = (v_1src01 & v_mask1) * v_2src01;
-                v_float64 v_src10 = (v_1src10 & v_mask0) * v_2src10;
-                v_float64 v_src11 = (v_1src11 & v_mask1) * v_2src11;
-                v_float64 v_src20 = (v_1src20 & v_mask0) * v_2src20;
-                v_float64 v_src21 = (v_1src21 & v_mask1) * v_2src21;
+                v_float64 v_src00 = v_mul(v_and(v_1src00, v_mask0), v_2src00);
+                v_float64 v_src01 = v_mul(v_and(v_1src01, v_mask1), v_2src01);
+                v_float64 v_src10 = v_mul(v_and(v_1src10, v_mask0), v_2src10);
+                v_float64 v_src11 = v_mul(v_and(v_1src11, v_mask1), v_2src11);
+                v_float64 v_src20 = v_mul(v_and(v_1src20, v_mask0), v_2src20);
+                v_float64 v_src21 = v_mul(v_and(v_1src21, v_mask1), v_2src21);
 
                 v_float64 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21;
                 v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20);
                 v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21);
 
-                v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20);
-                v_store_interleave(dst + (x + step) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21);
+                v_store_interleave(dst + x * cn, v_add(v_dst00, v_src00), v_add(v_dst10, v_src10), v_add(v_dst20, v_src20));
+                v_store_interleave(dst + (x + step) * cn, v_add(v_dst01, v_src01), v_add(v_dst11, v_src11), v_add(v_dst21, v_src21));
             }
         }
     }
@@ -2594,11 +2594,11 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc
 void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha)
 {
     int x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     const v_float32 v_alpha = vx_setall_f32((float)alpha);
     const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
-    const int cVectorWidth = v_uint8::nlanes;
-    const int step = v_float32::nlanes;
+    const int cVectorWidth = VTraits<v_uint8>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();
 
     if (!mask)
     {
@@ -2619,10 +2619,10 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn
             v_float32 v_dst10 = vx_load(dst + x + step * 2);
             v_float32 v_dst11 = vx_load(dst + x + step * 3);
 
-            v_dst00 = v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha);
-            v_dst01 = v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha);
-            v_dst10 = v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha);
-            v_dst11 = v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha);
+            v_dst00 = v_fma(v_dst00, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_alpha));
+            v_dst01 = v_fma(v_dst01, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_alpha));
+            v_dst10 = v_fma(v_dst10, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_alpha));
+            v_dst11 = v_fma(v_dst11, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_alpha));
 
             v_store(dst + x           , v_dst00);
             v_store(dst + x + step    , v_dst01);
@@ -2663,15 +2663,15 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn
                 v_float32 v_dst10 = vx_load(dst + x + step * 2);
                 v_float32 v_dst11 = vx_load(dst + x + step * 3);
 
-                v_mf00 = v_mf00 != zero;
-                v_mf01 = v_mf01 != zero;
-                v_mf10 = v_mf10 != zero;
-                v_mf11 = v_mf11 != zero;
+                v_mf00 = v_ne(v_mf00, zero);
+                v_mf01 = v_ne(v_mf01, zero);
+                v_mf10 = v_ne(v_mf10, zero);
+                v_mf11 = v_ne(v_mf11, zero);
 
-                v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha), v_dst00);
-                v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha), v_dst01);
-                v_dst10 = v_select(v_mf10, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha), v_dst10);
-                v_dst11 = v_select(v_mf11, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha), v_dst11);
+                v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_alpha)), v_dst00);
+                v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_alpha)), v_dst01);
+                v_dst10 = v_select(v_mf10, v_fma(v_dst10, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_alpha)), v_dst10);
+                v_dst11 = v_select(v_mf11, v_fma(v_dst11, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_alpha)), v_dst11);
 
                 v_store(dst + x           , v_dst00);
                 v_store(dst + x + step    , v_dst01);
@@ -2719,25 +2719,25 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn
                 v_mf10 = v_cvt_f32(v_reinterpret_as_s32(v_m10));
                 v_mf11 = v_cvt_f32(v_reinterpret_as_s32(v_m11));
 
-                v_mf00 = v_mf00 != zero;
-                v_mf01 = v_mf01 != zero;
-                v_mf10 = v_mf10 != zero;
-                v_mf11 = v_mf11 != zero;
+                v_mf00 = v_ne(v_mf00, zero);
+                v_mf01 = v_ne(v_mf01, zero);
+                v_mf10 = v_ne(v_mf10, zero);
+                v_mf11 = v_ne(v_mf11, zero);
 
-                v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src000)) * v_alpha), v_dst00);
-                v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src001)) * v_alpha), v_dst01);
-                v_dst02 = v_select(v_mf10, v_fma(v_dst02, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src010)) * v_alpha), v_dst02);
-                v_dst03 = v_select(v_mf11, v_fma(v_dst03, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src011)) * v_alpha), v_dst03);
+                v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src000)), v_alpha)), v_dst00);
+                v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src001)), v_alpha)), v_dst01);
+                v_dst02 = v_select(v_mf10, v_fma(v_dst02, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src010)), v_alpha)), v_dst02);
+                v_dst03 = v_select(v_mf11, v_fma(v_dst03, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src011)), v_alpha)), v_dst03);
 
-                v_dst10 = v_select(v_mf00, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src100)) * v_alpha), v_dst10);
-                v_dst11 = v_select(v_mf01, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src101)) * v_alpha), v_dst11);
-                v_dst12 = v_select(v_mf10, v_fma(v_dst12, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src110)) * v_alpha), v_dst12);
-                v_dst13 = v_select(v_mf11, v_fma(v_dst13, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src111)) * v_alpha), v_dst13);
+                v_dst10 = v_select(v_mf00, v_fma(v_dst10, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src100)), v_alpha)), v_dst10);
+                v_dst11 = v_select(v_mf01, v_fma(v_dst11, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src101)), v_alpha)), v_dst11);
+                v_dst12 = v_select(v_mf10, v_fma(v_dst12, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src110)), v_alpha)), v_dst12);
+                v_dst13 = v_select(v_mf11, v_fma(v_dst13, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src111)), v_alpha)), v_dst13);
 
-                v_dst20 = v_select(v_mf00, v_fma(v_dst20, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src200)) * v_alpha), v_dst20);
-                v_dst21 = v_select(v_mf01, v_fma(v_dst21, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src201)) * v_alpha), v_dst21);
-                v_dst22 = v_select(v_mf10, v_fma(v_dst22, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src210)) * v_alpha), v_dst22);
-                v_dst23 = v_select(v_mf11, v_fma(v_dst23, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src211)) * v_alpha), v_dst23);
+                v_dst20 = v_select(v_mf00, v_fma(v_dst20, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src200)), v_alpha)), v_dst20);
+                v_dst21 = v_select(v_mf01, v_fma(v_dst21, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src201)), v_alpha)), v_dst21);
+                v_dst22 = v_select(v_mf10, v_fma(v_dst22, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src210)), v_alpha)), v_dst22);
+                v_dst23 = v_select(v_mf11, v_fma(v_dst23, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src211)), v_alpha)), v_dst23);
 
                 v_store_interleave(dst + x * cn               , v_dst00, v_dst10, v_dst20);
                 v_store_interleave(dst + ( x + step     ) * cn, v_dst01, v_dst11, v_dst21);
@@ -2753,11 +2753,11 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn
 void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha)
 {
     int x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     const v_float32 v_alpha = vx_setall_f32((float)alpha);
     const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float32::nlanes;
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();
 
     if (!mask)
     {
@@ -2770,8 +2770,8 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c
 
             v_float32 v_dst0 = vx_load(dst + x);
             v_float32 v_dst1 = vx_load(dst + x + step);
-            v_dst0 = v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int0)) * v_alpha);
-            v_dst1 = v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int1)) * v_alpha);
+            v_dst0 = v_fma(v_dst0, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_int0)), v_alpha));
+            v_dst1 = v_fma(v_dst1, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_int1)), v_alpha));
 
             v_store(dst + x       , v_dst0);
             v_store(dst + x + step, v_dst1);
@@ -2799,11 +2799,11 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c
                 v_float32 v_dst0 = vx_load(dst + x);
                 v_float32 v_dst1 = vx_load(dst + x + step);
 
-                v_mf0 = v_mf0 != zero;
-                v_mf1 = v_mf1 != zero;
+                v_mf0 = v_ne(v_mf0, zero);
+                v_mf1 = v_ne(v_mf1, zero);
 
-                v_dst0 = v_select(v_mf0, v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src0)) * v_alpha), v_dst0);
-                v_dst1 = v_select(v_mf1, v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src1)) * v_alpha), v_dst1);
+                v_dst0 = v_select(v_mf0, v_fma(v_dst0, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src0)), v_alpha)), v_dst0);
+                v_dst1 = v_select(v_mf1, v_fma(v_dst1, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src1)), v_alpha)), v_dst1);
 
                 v_store(dst + x       , v_dst0);
                 v_store(dst + x + step, v_dst1);
@@ -2833,16 +2833,16 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c
                 v_mf0 = v_cvt_f32(v_reinterpret_as_s32(v_m0));
                 v_mf1 = v_cvt_f32(v_reinterpret_as_s32(v_m1));
 
-                v_mf0 = v_mf0 != zero;
-                v_mf1 = v_mf1 != zero;
+                v_mf0 = v_ne(v_mf0, zero);
+                v_mf1 = v_ne(v_mf1, zero);
 
-                v_dst00 = v_select(v_mf0, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha), v_dst00);
-                v_dst10 = v_select(v_mf0, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha), v_dst10);
-                v_dst20 = v_select(v_mf0, v_fma(v_dst20, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src20)) * v_alpha), v_dst20);
+                v_dst00 = v_select(v_mf0, v_fma(v_dst00, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_alpha)), v_dst00);
+                v_dst10 = v_select(v_mf0, v_fma(v_dst10, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_alpha)), v_dst10);
+                v_dst20 = v_select(v_mf0, v_fma(v_dst20, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src20)), v_alpha)), v_dst20);
 
-                v_dst01 = v_select(v_mf1, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha), v_dst01);
-                v_dst11 = v_select(v_mf1, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha), v_dst11);
-                v_dst21 = v_select(v_mf1, v_fma(v_dst21, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src21)) * v_alpha), v_dst21);
+                v_dst01 = v_select(v_mf1, v_fma(v_dst01, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_alpha)), v_dst01);
+                v_dst11 = v_select(v_mf1, v_fma(v_dst11, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_alpha)), v_dst11);
+                v_dst21 = v_select(v_mf1, v_fma(v_dst21, v_beta, v_mul(v_cvt_f32(v_reinterpret_as_s32(v_src21)), v_alpha)), v_dst21);
 
                 v_store_interleave(dst + x * cn               , v_dst00, v_dst10, v_dst20);
                 v_store_interleave(dst + ( x + step     ) * cn, v_dst01, v_dst11, v_dst21);
@@ -2870,11 +2870,11 @@ void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn
             _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha)));
         }
     }
-#elif CV_SIMD
+#elif (CV_SIMD || CV_SIMD_SCALABLE)
     const v_float32 v_alpha = vx_setall_f32((float)alpha);
     const v_float32 v_beta = vx_setall_f32((float)(1.0f - alpha));
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float32::nlanes;
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float32>::vlanes();
 
     if (!mask)
     {
@@ -2884,8 +2884,8 @@ void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn
             v_float32 v_dst0 = vx_load(dst + x);
             v_float32 v_dst1 = vx_load(dst + x + step);
 
-            v_dst0 = v_fma(v_dst0, v_beta, vx_load(src + x) * v_alpha);
-            v_dst1 = v_fma(v_dst1, v_beta, vx_load(src + x + step) * v_alpha);
+            v_dst0 = v_fma(v_dst0, v_beta, v_mul(vx_load(src + x), v_alpha));
+            v_dst1 = v_fma(v_dst1, v_beta, v_mul(vx_load(src + x + step), v_alpha));
 
             v_store(dst + x, v_dst0);
             v_store(dst + x + step, v_dst1);
@@ -2898,11 +2898,11 @@ void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn
 void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha)
 {
     int x = 0;
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     const v_float64 v_alpha = vx_setall_f64(alpha);
     const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float64::nlanes;
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -2927,10 +2927,10 @@ void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int c
             v_float64 v_dst2 = vx_load(dst + x + step * 2);
             v_float64 v_dst3 = vx_load(dst + x + step * 3);
 
-            v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha);
-            v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha);
-            v_dst2 = v_fma(v_dst2, v_beta, v_src2 * v_alpha);
-            v_dst3 = v_fma(v_dst3, v_beta, v_src3 * v_alpha);
+            v_dst0 = v_fma(v_dst0, v_beta, v_mul(v_src0, v_alpha));
+            v_dst1 = v_fma(v_dst1, v_beta, v_mul(v_src1, v_alpha));
+            v_dst2 = v_fma(v_dst2, v_beta, v_mul(v_src2, v_alpha));
+            v_dst3 = v_fma(v_dst3, v_beta, v_mul(v_src3, v_alpha));
 
             v_store(dst + x, v_dst0);
             v_store(dst + x + step, v_dst1);
@@ -2945,11 +2945,11 @@ void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int c
 void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha)
 {
     int x = 0;
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     const v_float64 v_alpha = vx_setall_f64(alpha);
     const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
-    const int cVectorWidth = v_uint16::nlanes;
-    const int step = v_float64::nlanes;
+    const int cVectorWidth = VTraits<v_uint16>::vlanes();
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -2973,10 +2973,10 @@ void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int
             v_float64 v_dst10 = vx_load(dst + x + step * 2);
             v_float64 v_dst11 = vx_load(dst + x + step * 3);
 
-            v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha);
-            v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha);
-            v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha);
-            v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha);
+            v_dst00 = v_fma(v_dst00, v_beta, v_mul(v_src00, v_alpha));
+            v_dst01 = v_fma(v_dst01, v_beta, v_mul(v_src01, v_alpha));
+            v_dst10 = v_fma(v_dst10, v_beta, v_mul(v_src10, v_alpha));
+            v_dst11 = v_fma(v_dst11, v_beta, v_mul(v_src11, v_alpha));
 
             v_store(dst + x, v_dst00);
             v_store(dst + x + step, v_dst01);
@@ -3014,11 +3014,11 @@ void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int c
             _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha)));
         }
     }
-#elif CV_SIMD_64F
+#elif (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     const v_float64 v_alpha = vx_setall_f64(alpha);
     const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
-    const int cVectorWidth = v_float32::nlanes * 2;
-    const int step = v_float64::nlanes;
+    const int cVectorWidth = VTraits<v_float32>::vlanes() * 2;
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -3026,7 +3026,7 @@ void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int c
         for (; x <= size - cVectorWidth; x += cVectorWidth)
         {
             v_float32 v_src0 = vx_load(src + x);
-            v_float32 v_src1 = vx_load(src + x + v_float32::nlanes);
+            v_float32 v_src1 = vx_load(src + x + VTraits<v_float32>::vlanes());
             v_float64 v_src00 = v_cvt_f64(v_src0);
             v_float64 v_src01 = v_cvt_f64_high(v_src0);
             v_float64 v_src10 = v_cvt_f64(v_src1);
@@ -3037,10 +3037,10 @@ void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int c
             v_float64 v_dst10 = vx_load(dst + x + step * 2);
             v_float64 v_dst11 = vx_load(dst + x + step * 3);
 
-            v_dst00 = v_fma(v_dst00, v_beta, v_src00 * v_alpha);
-            v_dst01 = v_fma(v_dst01, v_beta, v_src01 * v_alpha);
-            v_dst10 = v_fma(v_dst10, v_beta, v_src10 * v_alpha);
-            v_dst11 = v_fma(v_dst11, v_beta, v_src11 * v_alpha);
+            v_dst00 = v_fma(v_dst00, v_beta, v_mul(v_src00, v_alpha));
+            v_dst01 = v_fma(v_dst01, v_beta, v_mul(v_src01, v_alpha));
+            v_dst10 = v_fma(v_dst10, v_beta, v_mul(v_src10, v_alpha));
+            v_dst11 = v_fma(v_dst11, v_beta, v_mul(v_src11, v_alpha));
 
             v_store(dst + x, v_dst00);
             v_store(dst + x + step, v_dst01);
@@ -3072,11 +3072,11 @@ void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int
             _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha)));
         }
     }
-#elif CV_SIMD_64F
+#elif (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     const v_float64 v_alpha = vx_setall_f64(alpha);
     const v_float64 v_beta = vx_setall_f64(1.0f - alpha);
-    const int cVectorWidth = v_float64::nlanes * 2;
-    const int step = v_float64::nlanes;
+    const int cVectorWidth = VTraits<v_float64>::vlanes() * 2;
+    const int step = VTraits<v_float64>::vlanes();
 
     if (!mask)
     {
@@ -3089,8 +3089,8 @@ void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int
             v_float64 v_dst0 = vx_load(dst + x);
             v_float64 v_dst1 = vx_load(dst + x + step);
 
-            v_dst0 = v_fma(v_dst0, v_beta, v_src0 * v_alpha);
-            v_dst1 = v_fma(v_dst1, v_beta, v_src1 * v_alpha);
+            v_dst0 = v_fma(v_dst0, v_beta, v_mul(v_src0, v_alpha));
+            v_dst1 = v_fma(v_dst1, v_beta, v_mul(v_src1, v_alpha));
 
             v_store(dst + x, v_dst0);
             v_store(dst + x + step, v_dst1);
diff --git a/modules/imgproc/src/approx.cpp b/modules/imgproc/src/approx.cpp
index efbc6d33a5ac..f05a6bcf3c1a 100644
--- a/modules/imgproc/src/approx.cpp
+++ b/modules/imgproc/src/approx.cpp
@@ -103,9 +103,9 @@ CvSeq* icvApproximateChainTC89( CvChain* chain, int header_size,
         /* calc 1-curvature */
         s = abs_diff[reader.code - prev_code + 7];
 
-        if( method <= CV_CHAIN_APPROX_SIMPLE )
+        if( method <= cv::CHAIN_APPROX_SIMPLE )
         {
-            if( method == CV_CHAIN_APPROX_NONE || s != 0 )
+            if( method == cv::CHAIN_APPROX_NONE || s != 0 )
             {
                 CV_WRITE_SEQ_ELEM( pt, writer );
             }
@@ -121,7 +121,7 @@ CvSeq* icvApproximateChainTC89( CvChain* chain, int header_size,
 
     //CV_Assert( pt.x == chain->origin.x && pt.y == chain->origin.y );
 
-    if( method <= CV_CHAIN_APPROX_SIMPLE )
+    if( method <= cv::CHAIN_APPROX_SIMPLE )
         return cvEndWriteSeq( &writer );
 
     current->next = 0;
@@ -176,7 +176,7 @@ CvSeq* icvApproximateChainTC89( CvChain* chain, int header_size,
         current->k = --k;
 
         /* determine cosine curvature if it should be used */
-        if( method == CV_CHAIN_APPROX_TC89_KCOS )
+        if( method == cv::CHAIN_APPROX_TC89_KCOS )
         {
             /* calc k-cosine curvature */
             for( j = k, s = 0; j > 0; j-- )
@@ -288,7 +288,7 @@ CvSeq* icvApproximateChainTC89( CvChain* chain, int header_size,
     }
     while( current != 0 );
 
-    if( method == CV_CHAIN_APPROX_TC89_KCOS )
+    if( method == cv::CHAIN_APPROX_TC89_KCOS )
         goto copy_vect;
 
     /* Pass 4.
@@ -389,9 +389,9 @@ cvApproxChains( CvSeq*              src_seq,
     CvSeq *dst_seq = 0;
 
     if( !src_seq || !storage )
-        CV_Error( CV_StsNullPtr, "" );
-    if( method > CV_CHAIN_APPROX_TC89_KCOS || method <= 0 || minimal_perimeter < 0 )
-        CV_Error( CV_StsOutOfRange, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
+    if( method > cv::CHAIN_APPROX_TC89_KCOS || method <= 0 || minimal_perimeter < 0 )
+        CV_Error( cv::Error::StsOutOfRange, "" );
 
     while( src_seq != 0 )
     {
@@ -403,14 +403,14 @@ cvApproxChains( CvSeq*              src_seq,
 
             switch( method )
             {
-            case CV_CHAIN_APPROX_NONE:
-            case CV_CHAIN_APPROX_SIMPLE:
-            case CV_CHAIN_APPROX_TC89_L1:
-            case CV_CHAIN_APPROX_TC89_KCOS:
+            case cv::CHAIN_APPROX_NONE:
+            case cv::CHAIN_APPROX_SIMPLE:
+            case cv::CHAIN_APPROX_TC89_L1:
+            case cv::CHAIN_APPROX_TC89_KCOS:
                 contour = icvApproximateChainTC89( (CvChain *) src_seq, sizeof( CvContour ), storage, method );
                 break;
             default:
-                CV_Error( CV_StsOutOfRange, "" );
+                CV_Error( cv::Error::StsOutOfRange, "" );
             }
 
             if( contour->total > 0 )
@@ -681,7 +681,7 @@ void cv::approxPolyDP( InputArray _curve, OutputArray _approxCurve,
     //from being used.
     if (epsilon < 0.0 || !(epsilon < 1e30))
     {
-        CV_Error(CV_StsOutOfRange, "Epsilon not valid.");
+        CV_Error(cv::Error::StsOutOfRange, "Epsilon not valid.");
     }
 
     Mat curve = _curve.getMat();
@@ -704,7 +704,7 @@ void cv::approxPolyDP( InputArray _curve, OutputArray _approxCurve,
     else if( depth == CV_32F )
         nout = approxPolyDP_(curve.ptr<Point2f>(), npoints, (Point2f*)buf, closed, epsilon, _stack);
     else
-        CV_Error( CV_StsUnsupportedFormat, "" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "" );
 
     Mat(nout, 1, CV_MAKETYPE(depth, 2), buf).copyTo(_approxCurve);
 }
@@ -728,7 +728,7 @@ cvApproxPoly( const void* array, int header_size,
     {
         src_seq = (CvSeq*)array;
         if( !CV_IS_SEQ_POLYLINE( src_seq ))
-            CV_Error( CV_StsBadArg, "Unsupported sequence type" );
+            CV_Error( cv::Error::StsBadArg, "Unsupported sequence type" );
 
         recursive = parameter2;
 
@@ -743,10 +743,10 @@ cvApproxPoly( const void* array, int header_size,
     }
 
     if( !storage )
-        CV_Error( CV_StsNullPtr, "NULL storage pointer " );
+        CV_Error( cv::Error::StsNullPtr, "NULL storage pointer " );
 
     if( header_size < 0 )
-        CV_Error( CV_StsOutOfRange, "header_size is negative. "
+        CV_Error( cv::Error::StsOutOfRange, "header_size is negative. "
                  "Pass 0 to make the destination header_size == input header_size" );
 
     if( header_size == 0 )
@@ -756,12 +756,12 @@ cvApproxPoly( const void* array, int header_size,
     {
         if( CV_IS_SEQ_CHAIN( src_seq ))
         {
-            CV_Error( CV_StsBadArg, "Input curves are not polygonal. "
+            CV_Error( cv::Error::StsBadArg, "Input curves are not polygonal. "
                      "Use cvApproxChains first" );
         }
         else
         {
-            CV_Error( CV_StsBadArg, "Input curves have unknown type" );
+            CV_Error( cv::Error::StsBadArg, "Input curves have unknown type" );
         }
     }
 
@@ -769,10 +769,10 @@ cvApproxPoly( const void* array, int header_size,
         header_size = src_seq->header_size;
 
     if( header_size < (int)sizeof(CvContour) )
-        CV_Error( CV_StsBadSize, "New header size must be non-less than sizeof(CvContour)" );
+        CV_Error( cv::Error::StsBadSize, "New header size must be non-less than sizeof(CvContour)" );
 
     if( method != CV_POLY_APPROX_DP )
-        CV_Error( CV_StsOutOfRange, "Unknown approximation method" );
+        CV_Error( cv::Error::StsOutOfRange, "Unknown approximation method" );
 
     while( src_seq != 0 )
     {
@@ -782,7 +782,7 @@ cvApproxPoly( const void* array, int header_size,
         {
         case CV_POLY_APPROX_DP:
             if( parameter < 0 )
-                CV_Error( CV_StsOutOfRange, "Accuracy must be non-negative" );
+                CV_Error( cv::Error::StsOutOfRange, "Accuracy must be non-negative" );
 
             CV_Assert( CV_SEQ_ELTYPE(src_seq) == CV_32SC2 ||
                       CV_SEQ_ELTYPE(src_seq) == CV_32FC2 );
@@ -804,7 +804,7 @@ cvApproxPoly( const void* array, int header_size,
                 nout = cv::approxPolyDP_((cv::Point2f*)src, npoints,
                                          (cv::Point2f*)dst, closed, parameter, stack);
             else
-                CV_Error( CV_StsUnsupportedFormat, "" );
+                CV_Error( cv::Error::StsUnsupportedFormat, "" );
 
             contour = cvCreateSeq( src_seq->flags, header_size,
                                   src_seq->elem_size, storage );
@@ -812,7 +812,7 @@ cvApproxPoly( const void* array, int header_size,
             }
             break;
         default:
-            CV_Error( CV_StsBadArg, "Invalid approximation method" );
+            CV_Error( cv::Error::StsBadArg, "Invalid approximation method" );
         }
 
         CV_Assert( contour );
diff --git a/modules/imgproc/src/bilateral_filter.dispatch.cpp b/modules/imgproc/src/bilateral_filter.dispatch.cpp
index e2a9dcba3964..4ccec1249646 100644
--- a/modules/imgproc/src/bilateral_filter.dispatch.cpp
+++ b/modules/imgproc/src/bilateral_filter.dispatch.cpp
@@ -415,6 +415,9 @@ void bilateralFilter( InputArray _src, OutputArray _dst, int d,
 
     Mat src = _src.getMat(), dst = _dst.getMat();
 
+    CALL_HAL(bilateralFilter, cv_hal_bilateralFilter, src.data, src.step, dst.data, dst.step, src.cols, src.rows, src.depth(),
+             src.channels(), d, sigmaColor, sigmaSpace, borderType);
+
     CV_IPP_RUN_FAST(ipp_bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, borderType));
 
     if( src.depth() == CV_8U )
@@ -422,7 +425,7 @@ void bilateralFilter( InputArray _src, OutputArray _dst, int d,
     else if( src.depth() == CV_32F )
         bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType );
     else
-        CV_Error( CV_StsUnsupportedFormat,
+        CV_Error( cv::Error::StsUnsupportedFormat,
         "Bilateral filtering is only implemented for 8u and 32f images" );
 }
 
diff --git a/modules/imgproc/src/bilateral_filter.simd.hpp b/modules/imgproc/src/bilateral_filter.simd.hpp
index 0d2c39436810..77e0328678b4 100644
--- a/modules/imgproc/src/bilateral_filter.simd.hpp
+++ b/modules/imgproc/src/bilateral_filter.simd.hpp
@@ -99,33 +99,33 @@ class BilateralFilter_8u_Invoker :
                     const uchar* ksptr2 = sptr + space_ofs[k+2];
                     const uchar* ksptr3 = sptr + space_ofs[k+3];
                     j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_float32 kweight0 = vx_setall_f32(space_weight[k]);
                     v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
                     v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
                     v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+                    for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
                     {
                         v_uint32 rval = vx_load_expand_q(sptr + j);
 
                         v_uint32 val = vx_load_expand_q(ksptr0 + j);
-                        v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                        v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+                        v_float32 w = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+                        v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w);
                         v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j));
 
                         val = vx_load_expand_q(ksptr1 + j);
-                        w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                        v_wsum += w;
+                        w = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+                        v_wsum = v_add(v_wsum, w);
                         v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
 
                         val = vx_load_expand_q(ksptr2 + j);
-                        w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                        v_wsum += w;
+                        w = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+                        v_wsum = v_add(v_wsum, w);
                         v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
 
                         val = vx_load_expand_q(ksptr3 + j);
-                        w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
-                        v_wsum += w;
+                        w = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
+                        v_wsum = v_add(v_wsum, w);
                         v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
 
                         v_store_aligned(wsum + j, v_wsum);
@@ -140,9 +140,9 @@ class BilateralFilter_8u_Invoker :
 #if CV_SIMD128
                         v_uint32x4 rval = v_setall_u32(sptr[j]);
                         v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
-                        v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                        v_float32x4 w = v_mul(kweight4, v_lut(this->color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))));
                         wsum[j] += v_reduce_sum(w);
-                        sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w);
+                        sum[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(val)), w));
 #else
                         int rval = sptr[j];
 
@@ -172,13 +172,13 @@ class BilateralFilter_8u_Invoker :
                 {
                     const uchar* ksptr = sptr + space_ofs[k];
                     j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_float32 kweight = vx_setall_f32(space_weight[k]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+                    for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
                     {
                         v_uint32 val = vx_load_expand_q(ksptr + j);
-                        v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j))));
-                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
+                        v_float32 w = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j)))));
+                        v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w));
                         v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j)));
                     }
 #endif
@@ -191,10 +191,10 @@ class BilateralFilter_8u_Invoker :
                     }
                 }
                 j = 0;
-#if CV_SIMD
-                for (; j <= size.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes)
-                    v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j                    ) / vx_load_aligned(wsum + j                    )),
-                                                    v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes))));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for (; j <= size.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes())
+                    v_pack_u_store(dptr + j, v_pack(v_round(v_div(vx_load_aligned(sum + j), vx_load_aligned(wsum + j))),
+                                                    v_round(v_div(vx_load_aligned(sum + j + VTraits<v_float32>::vlanes()), vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())))));
 #endif
                 for (; j < size.width; j++)
                 {
@@ -221,13 +221,13 @@ class BilateralFilter_8u_Invoker :
                     const uchar* ksptr3 = sptr + space_ofs[k+3];
                     const uchar* rsptr = sptr;
                     j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_float32 kweight0 = vx_setall_f32(space_weight[k]);
                     v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
                     v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
                     v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-                    for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3*v_uint8::nlanes,
-                                                              ksptr0 += 3*v_uint8::nlanes, ksptr1 += 3*v_uint8::nlanes, ksptr2 += 3*v_uint8::nlanes, ksptr3 += 3*v_uint8::nlanes)
+                    for (; j <= size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes(), rsptr += 3*VTraits<v_uint8>::vlanes(),
+                                                              ksptr0 += 3*VTraits<v_uint8>::vlanes(), ksptr1 += 3*VTraits<v_uint8>::vlanes(), ksptr2 += 3*VTraits<v_uint8>::vlanes(), ksptr3 += 3*VTraits<v_uint8>::vlanes())
                     {
                         v_uint8 kb, kg, kr, rb, rg, rr;
                         v_load_deinterleave(rsptr, rb, rg, rr);
@@ -236,163 +236,163 @@ class BilateralFilter_8u_Invoker :
                         v_uint16 val0, val1, val2, val3, val4;
                         v_expand(v_absdiff(kb, rb), val0, val1);
                         v_expand(v_absdiff(kg, rg), val2, val3);
-                        val0 += val2; val1 += val3;
+                        val0 = v_add(val0, val2); val1 = v_add(val1, val3);
                         v_expand(v_absdiff(kr, rr), val2, val3);
-                        val0 += val2; val1 += val3;
+                        val0 = v_add(val0, val2); val1 = v_add(val1, val3);
 
                         v_uint32 vall, valh;
                         v_expand(val0, vall, valh);
-                        v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                        v_float32 w0 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+                        v_float32 w1 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+                        v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j)));
+                        v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
                         v_expand(kb, val0, val2);
                         v_expand(val0, vall, valh);
                         v_store_aligned(sum_b + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_b + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                        v_store_aligned(sum_b + j +   VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
                         v_expand(kg, val0, val3);
                         v_expand(val0, vall, valh);
                         v_store_aligned(sum_g + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_g + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                        v_store_aligned(sum_g + j +   VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
                         v_expand(kr, val0, val4);
                         v_expand(val0, vall, valh);
                         v_store_aligned(sum_r + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-                        v_store_aligned(sum_r + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                        v_store_aligned(sum_r + j +   VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));
 
                         v_expand(val1, vall, valh);
-                        w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-                        v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                        w0 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+                        w1 = v_mul(kweight0, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+                        v_store_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(val2, vall, valh);
-                        v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(val3, vall, valh);
-                        v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(val4, vall, valh);
-                        v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + 2*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_r + j + 3*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*VTraits<v_float32>::vlanes())));
 
                         v_load_deinterleave(ksptr1, kb, kg, kr);
                         v_expand(v_absdiff(kb, rb), val0, val1);
                         v_expand(v_absdiff(kg, rg), val2, val3);
-                        val0 += val2; val1 += val3;
+                        val0 = v_add(val0, val2); val1 = v_add(val1, val3);
                         v_expand(v_absdiff(kr, rr), val2, val3);
-                        val0 += val2; val1 += val3;
+                        val0 = v_add(val0, val2); val1 = v_add(val1, val3);
 
                         v_expand(val0, vall, valh);
-                        w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                        w0 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+                        w1 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+                        v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j)));
+                        v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
                         v_expand(kb, val0, val2);
                         v_expand(val0, vall, valh);
                         v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
                         v_expand(kg, val0, val3);
                         v_expand(val0, vall, valh);
                         v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
                         v_expand(kr, val0, val4);
                         v_expand(val0, vall, valh);
                         v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-                        v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));
 
                         v_expand(val1, vall, valh);
-                        w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-                        v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                        w0 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+                        w1 = v_mul(kweight1, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+                        v_store_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(val2, vall, valh);
-                        v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(val3, vall, valh);
-                        v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(val4, vall, valh);
-                        v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes())));
 
                         v_load_deinterleave(ksptr2, kb, kg, kr);
                         v_expand(v_absdiff(kb, rb), val0, val1);
                         v_expand(v_absdiff(kg, rg), val2, val3);
-                        val0 += val2; val1 += val3;
+                        val0 = v_add(val0, val2); val1 = v_add(val1, val3);
                         v_expand(v_absdiff(kr, rr), val2, val3);
-                        val0 += val2; val1 += val3;
+                        val0 = v_add(val0, val2); val1 = v_add(val1, val3);
 
                         v_expand(val0, vall, valh);
-                        w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                        w0 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+                        w1 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+                        v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j)));
+                        v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
                         v_expand(kb, val0, val2);
                         v_expand(val0, vall, valh);
                         v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
                         v_expand(kg, val0, val3);
                         v_expand(val0, vall, valh);
                         v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
                         v_expand(kr, val0, val4);
                         v_expand(val0, vall, valh);
                         v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-                        v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));
 
                         v_expand(val1, vall, valh);
-                        w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-                        v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                        w0 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+                        w1 = v_mul(kweight2, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+                        v_store_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(val2, vall, valh);
-                        v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(val3, vall, valh);
-                        v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(val4, vall, valh);
-                        v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes())));
 
                         v_load_deinterleave(ksptr3, kb, kg, kr);
                         v_expand(v_absdiff(kb, rb), val0, val1);
                         v_expand(v_absdiff(kg, rg), val2, val3);
-                        val0 += val2; val1 += val3;
+                        val0 = v_add(val0, val2); val1 = v_add(val1, val3);
                         v_expand(v_absdiff(kr, rr), val2, val3);
-                        val0 += val2; val1 += val3;
+                        val0 = v_add(val0, val2); val1 = v_add(val1, val3);
 
                         v_expand(val0, vall, valh);
-                        w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
-                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                        w0 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+                        w1 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+                        v_store_aligned(wsum + j, v_add(w0, vx_load_aligned(wsum + j)));
+                        v_store_aligned(wsum + j + VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
                         v_expand(kb, val0, val2);
                         v_expand(val0, vall, valh);
                         v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
                         v_expand(kg, val0, val3);
                         v_expand(val0, vall, valh);
                         v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
                         v_expand(kr, val0, val4);
                         v_expand(val0, vall, valh);
                         v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
-                        v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));
 
                         v_expand(val1, vall, valh);
-                        w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
-                        w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
-                        v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
-                        v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                        w0 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(vall)));
+                        w1 = v_mul(kweight3, v_lut(color_weight, v_reinterpret_as_s32(valh)));
+                        v_store_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes(), v_add(w0, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(val2, vall, valh);
-                        v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(val3, vall, valh);
-                        v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(val4, vall, valh);
-                        v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes())));
                     }
 #endif
 #if CV_SIMD128
@@ -407,11 +407,11 @@ class BilateralFilter_8u_Invoker :
                             v_uint32x4 b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
                             v_uint32x4 g(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
                             v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
-                            v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr)));
+                            v_float32x4 w = v_mul(kweight4, v_lut(this->color_weight, v_reinterpret_as_s32(v_add(v_add(v_absdiff(b, rb), v_absdiff(g, rg)), v_absdiff(r, rr)))));
                             wsum[j] += v_reduce_sum(w);
-                            sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w);
-                            sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w);
-                            sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w);
+                            sum_b[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(b)), w));
+                            sum_g[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(g)), w));
+                            sum_r[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(r)), w));
 #else
                         int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
 
@@ -442,9 +442,9 @@ class BilateralFilter_8u_Invoker :
                     const uchar* ksptr = sptr + space_ofs[k];
                     const uchar* rsptr = sptr;
                     j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_float32 kweight = vx_setall_f32(space_weight[k]);
-                    for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes)
+                    for (; j <= size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes(), ksptr += 3*VTraits<v_uint8>::vlanes(), rsptr += 3*VTraits<v_uint8>::vlanes())
                     {
                         v_uint8 kb, kg, kr, rb, rg, rr;
                         v_load_deinterleave(ksptr, kb, kg, kr);
@@ -456,39 +456,39 @@ class BilateralFilter_8u_Invoker :
                         v_expand(v_absdiff(kr, rr), r_l, r_h);
 
                         v_uint32 val0, val1, val2, val3;
-                        v_expand(b_l + g_l + r_l, val0, val1);
-                        v_expand(b_h + g_h + r_h, val2, val3);
+                        v_expand(v_add(v_add(b_l, g_l), r_l), val0, val1);
+                        v_expand(v_add(v_add(b_h, g_h), r_h), val2, val3);
 
                         v_expand(kb, b_l, b_h);
                         v_expand(kg, g_l, g_h);
                         v_expand(kr, r_l, r_h);
 
-                        v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0));
-                        v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1));
-                        v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2));
-                        v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3));
-                        v_store_aligned(wsum + j                      , w0 + vx_load_aligned(wsum + j));
-                        v_store_aligned(wsum + j +   v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
-                        v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes));
-                        v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes));
+                        v_float32 w0 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val0)));
+                        v_float32 w1 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val1)));
+                        v_float32 w2 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val2)));
+                        v_float32 w3 = v_mul(kweight, v_lut(color_weight, v_reinterpret_as_s32(val3)));
+                        v_store_aligned(wsum + j                      , v_add(w0, vx_load_aligned(wsum + j)));
+                        v_store_aligned(wsum + j +   VTraits<v_float32>::vlanes(), v_add(w1, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes())));
+                        v_store_aligned(wsum + j + 2*VTraits<v_float32>::vlanes(), v_add(w2, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes())));
+                        v_store_aligned(wsum + j + 3*VTraits<v_float32>::vlanes(), v_add(w3, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes())));
                         v_expand(b_l, val0, val1);
                         v_expand(b_h, val2, val3);
                         v_store_aligned(sum_b + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_b + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes)));
-                        v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes)));
+                        v_store_aligned(sum_b + j +   VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_b + j + 2*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_b + j + 3*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*VTraits<v_float32>::vlanes())));
                         v_expand(g_l, val0, val1);
                         v_expand(g_h, val2, val3);
                         v_store_aligned(sum_g + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_g + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes)));
-                        v_store_aligned(sum_g + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes)));
+                        v_store_aligned(sum_g + j +   VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_g + j + 2*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_g + j + 3*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*VTraits<v_float32>::vlanes())));
                         v_expand(r_l, val0, val1);
                         v_expand(r_h, val2, val3);
                         v_store_aligned(sum_r + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j)));
-                        v_store_aligned(sum_r + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
-                        v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
+                        v_store_aligned(sum_r + j +   VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_r + j + 2*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*VTraits<v_float32>::vlanes())));
+                        v_store_aligned(sum_r + j + 3*VTraits<v_float32>::vlanes(), v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*VTraits<v_float32>::vlanes())));
                     }
 #endif
                     for(; j < size.width; j++, ksptr += 3, rsptr += 3)
@@ -500,27 +500,27 @@ class BilateralFilter_8u_Invoker :
                     }
                 }
                 j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_float32 v_one = vx_setall_f32(1.f);
-                for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes)
+                for(; j <= size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes(), dptr += 3*VTraits<v_uint8>::vlanes())
                 {
-                    v_float32 w0 = v_one / vx_load_aligned(wsum + j);
-                    v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes);
-                    v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes);
-                    v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes);
-
-                    v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)),
-                                                             v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))),
-                                                      v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)),
-                                                             v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))),
-                                             v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)),
-                                                             v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))),
-                                                      v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)),
-                                                             v_round(w3 * vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))),
-                                             v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)),
-                                                             v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))),
-                                                      v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)),
-                                                             v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes)))));
+                    v_float32 w0 = v_div(v_one, vx_load_aligned(wsum + j));
+                    v_float32 w1 = v_div(v_one, vx_load_aligned(wsum + j + VTraits<v_float32>::vlanes()));
+                    v_float32 w2 = v_div(v_one, vx_load_aligned(wsum + j + 2 * VTraits<v_float32>::vlanes()));
+                    v_float32 w3 = v_div(v_one, vx_load_aligned(wsum + j + 3 * VTraits<v_float32>::vlanes()));
+
+                    v_store_interleave(dptr, v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_b + j))),
+                                                             v_round(v_mul(w1, vx_load_aligned(sum_b + j + VTraits<v_float32>::vlanes())))),
+                                                      v_pack(v_round(v_mul(w2, vx_load_aligned(sum_b + j + 2 * VTraits<v_float32>::vlanes()))),
+                                                             v_round(v_mul(w3, vx_load_aligned(sum_b + j + 3 * VTraits<v_float32>::vlanes()))))),
+                                             v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_g + j))),
+                                                             v_round(v_mul(w1, vx_load_aligned(sum_g + j + VTraits<v_float32>::vlanes())))),
+                                                      v_pack(v_round(v_mul(w2, vx_load_aligned(sum_g + j + 2 * VTraits<v_float32>::vlanes()))),
+                                                             v_round(v_mul(w3, vx_load_aligned(sum_g + j + 3 * VTraits<v_float32>::vlanes()))))),
+                                             v_pack_u(v_pack(v_round(v_mul(w0, vx_load_aligned(sum_r + j))),
+                                                             v_round(v_mul(w1, vx_load_aligned(sum_r + j + VTraits<v_float32>::vlanes())))),
+                                                      v_pack(v_round(v_mul(w2, vx_load_aligned(sum_r + j + 2 * VTraits<v_float32>::vlanes()))),
+                                                             v_round(v_mul(w3, vx_load_aligned(sum_r + j + 3 * VTraits<v_float32>::vlanes()))))));
                 }
 #endif
                 for(; j < size.width; j++)
@@ -533,7 +533,7 @@ class BilateralFilter_8u_Invoker :
                 }
             }
         }
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         vx_cleanup();
 #endif
     }
@@ -589,7 +589,7 @@ class BilateralFilter_32f_Invoker :
                 memset(buf.data(), 0, buf.size() * sizeof(float));
                 float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
                 float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_float32 v_one = vx_setall_f32(1.f);
                 v_float32 sindex = vx_setall_f32(scale_index);
 #endif
@@ -601,50 +601,50 @@ class BilateralFilter_32f_Invoker :
                     const float* ksptr2 = sptr + space_ofs[k + 2];
                     const float* ksptr3 = sptr + space_ofs[k + 3];
                     j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_float32 kweight0 = vx_setall_f32(space_weight[k]);
                     v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
                     v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
                     v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+                    for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
                     {
                         v_float32 rval = vx_load(sptr + j);
 
                         v_float32 val = vx_load(ksptr0 + j);
                         v_float32 knan = v_not_nan(val);
-                        v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                        v_float32 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
                         v_int32 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
-                        v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
-                        v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j));
+                        alpha = v_sub(alpha, v_cvt_f32(idx));
+                        v_float32 w = v_and(v_mul(kweight0, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+                        v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w);
+                        v_float32 v_sum = v_muladd(v_and(val, knan), w, vx_load_aligned(sum + j));
 
                         val = vx_load(ksptr1 + j);
                         knan = v_not_nan(val);
-                        alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                        alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
                         idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_wsum += w;
-                        v_sum = v_muladd(val & knan, w, v_sum);
+                        alpha = v_sub(alpha, v_cvt_f32(idx));
+                        w = v_and(v_mul(kweight1, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+                        v_wsum = v_add(v_wsum, w);
+                        v_sum = v_muladd(v_and(val, knan), w, v_sum);
 
                         val = vx_load(ksptr2 + j);
                         knan = v_not_nan(val);
-                        alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                        alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
                         idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_wsum += w;
-                        v_sum = v_muladd(val & knan, w, v_sum);
+                        alpha = v_sub(alpha, v_cvt_f32(idx));
+                        w = v_and(v_mul(kweight2, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+                        v_wsum = v_add(v_wsum, w);
+                        v_sum = v_muladd(v_and(val, knan), w, v_sum);
 
                         val = vx_load(ksptr3 + j);
                         knan = v_not_nan(val);
-                        alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                        alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
                         idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_wsum += w;
-                        v_sum = v_muladd(val & knan, w, v_sum);
+                        alpha = v_sub(alpha, v_cvt_f32(idx));
+                        w = v_and(v_mul(kweight3, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+                        v_wsum = v_add(v_wsum, w);
+                        v_sum = v_muladd(v_and(val, knan), w, v_sum);
 
                         v_store_aligned(wsum + j, v_wsum);
                         v_store_aligned(sum + j, v_sum);
@@ -661,12 +661,12 @@ class BilateralFilter_32f_Invoker :
                         v_float32x4 rval = v_setall_f32(sptr[j]);
                         v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
                         v_float32x4 knan = v_not_nan(val);
-                        v_float32x4 alpha = (v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan;
+                        v_float32x4 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex4), v_not_nan(rval)), knan);
                         v_int32x4 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
+                        alpha = v_sub(alpha, v_cvt_f32(idx));
+                        v_float32x4 w = v_and(v_mul(kweight4, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one4, alpha)))), knan);
                         wsum[j] += v_reduce_sum(w);
-                        sum[j] += v_reduce_sum((val & knan) * w);
+                        sum[j] += v_reduce_sum(v_mul(v_and(val, knan), w));
 #else
                         float rval = sptr[j];
 
@@ -720,20 +720,20 @@ class BilateralFilter_32f_Invoker :
                 {
                     const float* ksptr = sptr + space_ofs[k];
                     j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_float32 kweight = vx_setall_f32(space_weight[k]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+                    for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
                     {
                         v_float32 val = vx_load(ksptr + j);
                         v_float32 rval = vx_load(sptr + j);
                         v_float32 knan = v_not_nan(val);
-                        v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                        v_float32 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex), v_not_nan(rval)), knan);
                         v_int32 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
+                        alpha = v_sub(alpha, v_cvt_f32(idx));
 
-                        v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
-                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                        v_store_aligned(sum + j, v_muladd(val & knan, w, vx_load_aligned(sum + j)));
+                        v_float32 w = v_and(v_mul(kweight, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+                        v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w));
+                        v_store_aligned(sum + j, v_muladd(v_and(val, knan), w, vx_load_aligned(sum + j)));
                     }
 #endif
                     for (; j < size.width; j++)
@@ -752,11 +752,11 @@ class BilateralFilter_32f_Invoker :
                     }
                 }
                 j = 0;
-#if CV_SIMD
-                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
                 {
                     v_float32 v_val = vx_load(sptr + j);
-                    v_store(dptr + j, (vx_load_aligned(sum + j) + (v_val & v_not_nan(v_val))) / (vx_load_aligned(wsum + j) + (v_one & v_not_nan(v_val))));
+                    v_store(dptr + j, v_div(v_add(vx_load_aligned(sum + j), v_and(v_val, v_not_nan(v_val))), v_add(vx_load_aligned(wsum + j), v_and(v_one, v_not_nan(v_val)))));
                 }
 #endif
                 for (; j < size.width; j++)
@@ -774,7 +774,7 @@ class BilateralFilter_32f_Invoker :
                 float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
                 float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
                 float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_float32 v_one = vx_setall_f32(1.f);
                 v_float32 sindex = vx_setall_f32(scale_index);
 #endif
@@ -787,60 +787,60 @@ class BilateralFilter_32f_Invoker :
                     const float* ksptr3 = sptr + space_ofs[k+3];
                     const float* rsptr = sptr;
                     j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_float32 kweight0 = vx_setall_f32(space_weight[k]);
                     v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
                     v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
                     v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes,
-                                                                ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes)
+                    for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes(), rsptr += 3 * VTraits<v_float32>::vlanes(),
+                                                                ksptr0 += 3 * VTraits<v_float32>::vlanes(), ksptr1 += 3 * VTraits<v_float32>::vlanes(), ksptr2 += 3 * VTraits<v_float32>::vlanes(), ksptr3 += 3 * VTraits<v_float32>::vlanes())
                     {
                         v_float32 kb, kg, kr, rb, rg, rr;
                         v_load_deinterleave(rsptr, rb, rg, rr);
 
                         v_load_deinterleave(ksptr0, kb, kg, kr);
-                        v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-                        v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                        v_float32 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+                        v_float32 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
                         v_int32 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
-                        v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j));
-                        v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j));
-                        v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j));
+                        alpha = v_sub(alpha, v_cvt_f32(idx));
+                        v_float32 w = v_and(v_mul(kweight0, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+                        v_float32 v_wsum = v_add(vx_load_aligned(wsum + j), w);
+                        v_float32 v_sum_b = v_muladd(v_and(kb, knan), w, vx_load_aligned(sum_b + j));
+                        v_float32 v_sum_g = v_muladd(v_and(kg, knan), w, vx_load_aligned(sum_g + j));
+                        v_float32 v_sum_r = v_muladd(v_and(kr, knan), w, vx_load_aligned(sum_r + j));
 
                         v_load_deinterleave(ksptr1, kb, kg, kr);
-                        knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-                        alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                        knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+                        alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
                         idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_wsum += w;
-                        v_sum_b = v_muladd(kb & knan, w, v_sum_b);
-                        v_sum_g = v_muladd(kg & knan, w, v_sum_g);
-                        v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+                        alpha = v_sub(alpha, v_cvt_f32(idx));
+                        w = v_and(v_mul(kweight1, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+                        v_wsum = v_add(v_wsum, w);
+                        v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b);
+                        v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g);
+                        v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r);
 
                         v_load_deinterleave(ksptr2, kb, kg, kr);
-                        knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-                        alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                        knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+                        alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
                         idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_wsum += w;
-                        v_sum_b = v_muladd(kb & knan, w, v_sum_b);
-                        v_sum_g = v_muladd(kg & knan, w, v_sum_g);
-                        v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+                        alpha = v_sub(alpha, v_cvt_f32(idx));
+                        w = v_and(v_mul(kweight2, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+                        v_wsum = v_add(v_wsum, w);
+                        v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b);
+                        v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g);
+                        v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r);
 
                         v_load_deinterleave(ksptr3, kb, kg, kr);
-                        knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-                        alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                        knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+                        alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
                         idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_wsum += w;
-                        v_sum_b = v_muladd(kb & knan, w, v_sum_b);
-                        v_sum_g = v_muladd(kg & knan, w, v_sum_g);
-                        v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+                        alpha = v_sub(alpha, v_cvt_f32(idx));
+                        w = v_and(v_mul(kweight3, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+                        v_wsum = v_add(v_wsum, w);
+                        v_sum_b = v_muladd(v_and(kb, knan), w, v_sum_b);
+                        v_sum_g = v_muladd(v_and(kg, knan), w, v_sum_g);
+                        v_sum_r = v_muladd(v_and(kr, knan), w, v_sum_r);
 
                         v_store_aligned(wsum + j, v_wsum);
                         v_store_aligned(sum_b + j, v_sum_b);
@@ -862,15 +862,15 @@ class BilateralFilter_32f_Invoker :
                         v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
                         v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
                         v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
-                        v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-                        v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                        v_float32x4 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+                        v_float32x4 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex4), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
                         v_int32x4 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
-                        v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
+                        alpha = v_sub(alpha, v_cvt_f32(idx));
+                        v_float32x4 w = v_and(v_mul(kweight4, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one4, alpha)))), knan);
                         wsum[j] += v_reduce_sum(w);
-                        sum_b[j] += v_reduce_sum((kb & knan) * w);
-                        sum_g[j] += v_reduce_sum((kg & knan) * w);
-                        sum_r[j] += v_reduce_sum((kr & knan) * w);
+                        sum_b[j] += v_reduce_sum(v_mul(v_and(kb, knan), w));
+                        sum_g[j] += v_reduce_sum(v_mul(v_and(kg, knan), w));
+                        sum_r[j] += v_reduce_sum(v_mul(v_and(kr, knan), w));
 #else
                         float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
                         bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
@@ -938,24 +938,24 @@ class BilateralFilter_32f_Invoker :
                     const float* ksptr = sptr + space_ofs[k];
                     const float* rsptr = sptr;
                     j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_float32 kweight = vx_setall_f32(space_weight[k]);
-                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes)
+                    for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes(), ksptr += 3*VTraits<v_float32>::vlanes(), rsptr += 3*VTraits<v_float32>::vlanes())
                     {
                         v_float32 kb, kg, kr, rb, rg, rr;
                         v_load_deinterleave(ksptr, kb, kg, kr);
                         v_load_deinterleave(rsptr, rb, rg, rr);
 
-                        v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
-                        v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                        v_float32 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr));
+                        v_float32 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan);
                         v_int32 idx = v_trunc(alpha);
-                        alpha -= v_cvt_f32(idx);
+                        alpha = v_sub(alpha, v_cvt_f32(idx));
 
-                        v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
-                        v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                        v_store_aligned(sum_b + j, v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_g + j, v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_r + j, v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)));
+                        v_float32 w = v_and(v_mul(kweight, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one, alpha)))), knan);
+                        v_store_aligned(wsum + j, v_add(vx_load_aligned(wsum + j), w));
+                        v_store_aligned(sum_b + j, v_muladd(v_and(kb, knan), w, vx_load_aligned(sum_b + j)));
+                        v_store_aligned(sum_g + j, v_muladd(v_and(kg, knan), w, vx_load_aligned(sum_g + j)));
+                        v_store_aligned(sum_r + j, v_muladd(v_and(kr, knan), w, vx_load_aligned(sum_r + j)));
                     }
 #endif
                     for (; j < size.width; j++, ksptr += 3, rsptr += 3)
@@ -978,14 +978,14 @@ class BilateralFilter_32f_Invoker :
                     }
                 }
                 j = 0;
-#if CV_SIMD
-                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, sptr += 3*v_float32::nlanes, dptr += 3*v_float32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for (; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes(), sptr += 3*VTraits<v_float32>::vlanes(), dptr += 3*VTraits<v_float32>::vlanes())
                 {
                     v_float32 b, g, r;
                     v_load_deinterleave(sptr, b, g, r);
-                    v_float32 mask = v_not_nan(b) & v_not_nan(g) & v_not_nan(r);
-                    v_float32 w = v_one / (vx_load_aligned(wsum + j) + (v_one & mask));
-                    v_store_interleave(dptr, (vx_load_aligned(sum_b + j) + (b & mask)) * w, (vx_load_aligned(sum_g + j) + (g & mask)) * w, (vx_load_aligned(sum_r + j) + (r & mask)) * w);
+                    v_float32 mask = v_and(v_and(v_not_nan(b), v_not_nan(g)), v_not_nan(r));
+                    v_float32 w = v_div(v_one, v_add(vx_load_aligned(wsum + j), v_and(v_one, mask)));
+                    v_store_interleave(dptr, v_mul(v_add(vx_load_aligned(sum_b + j), v_and(b, mask)), w), v_mul(v_add(vx_load_aligned(sum_g + j), v_and(g, mask)), w), v_mul(v_add(vx_load_aligned(sum_r + j), v_and(r, mask)), w));
                 }
 #endif
                 for (; j < size.width; j++)
@@ -1011,7 +1011,7 @@ class BilateralFilter_32f_Invoker :
                 }
             }
         }
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         vx_cleanup();
 #endif
     }
diff --git a/modules/imgproc/src/blend.cpp b/modules/imgproc/src/blend.cpp
index 5a1296b50958..accb45e7ad87 100644
--- a/modules/imgproc/src/blend.cpp
+++ b/modules/imgproc/src/blend.cpp
@@ -48,12 +48,12 @@
 #include "opencv2/core/hal/intrin.hpp"
 
 namespace cv {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const v_float32& v_w1, const v_float32& v_w2)
 {
     const v_float32 v_eps = vx_setall_f32(1e-5f);
-    v_float32 v_denom = v_w1 + v_w2 + v_eps;
-    return (v_src1 * v_w1 + v_src2 * v_w2) / v_denom;
+    v_float32 v_denom = v_add(v_add(v_w1, v_w2), v_eps);
+    return v_div(v_add(v_mul(v_src1, v_w1), v_mul(v_src2, v_w2)), v_denom);
 }
 static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const float* w_ptr1, const float* w_ptr2, int offset)
 {
@@ -105,7 +105,7 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,
     switch(cn)
     {
     case 1:
-        for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_uint8::nlanes)
+        for(int weight_offset = 0 ; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes(), weight_offset += VTraits<v_uint8>::vlanes())
         {
             v_float32 v_src10, v_src11, v_src12, v_src13;
             v_float32 v_src20, v_src21, v_src22, v_src23;
@@ -113,15 +113,15 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,
             load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);
 
             v_float32 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset);
-            v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + v_float32::nlanes);
-            v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*v_float32::nlanes);
-            v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*v_float32::nlanes);
+            v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + VTraits<v_float32>::vlanes());
+            v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*VTraits<v_float32>::vlanes());
+            v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*VTraits<v_float32>::vlanes());
 
             store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
         }
         break;
     case 2:
-        for(int weight_offset = 0 ; x <= width - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes, weight_offset += v_uint8::nlanes)
+        for(int weight_offset = 0 ; x <= width - 2*VTraits<v_uint8>::vlanes(); x += 2*VTraits<v_uint8>::vlanes(), weight_offset += VTraits<v_uint8>::vlanes())
         {
             v_uint8 v_src10, v_src11, v_src20, v_src21;
             v_load_deinterleave(src1 + x, v_src10, v_src11);
@@ -135,12 +135,12 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,
 
             v_float32 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset);
             v_float32 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset);
-            v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + v_float32::nlanes);
-            v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + v_float32::nlanes);
-            v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*v_float32::nlanes);
-            v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*v_float32::nlanes);
-            v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*v_float32::nlanes);
-            v_float32 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 3*v_float32::nlanes);
+            v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + VTraits<v_float32>::vlanes());
+            v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + VTraits<v_float32>::vlanes());
+            v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*VTraits<v_float32>::vlanes());
+            v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*VTraits<v_float32>::vlanes());
+            v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*VTraits<v_float32>::vlanes());
+            v_float32 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 3*VTraits<v_float32>::vlanes());
 
             v_uint8 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6);
             v_uint8 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7);
@@ -148,7 +148,7 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,
         }
         break;
     case 3:
-        for(int weight_offset = 0 ; x <= width - 3*v_uint8::nlanes; x += 3*v_uint8::nlanes, weight_offset += v_uint8::nlanes)
+        for(int weight_offset = 0 ; x <= width - 3*VTraits<v_uint8>::vlanes(); x += 3*VTraits<v_uint8>::vlanes(), weight_offset += VTraits<v_uint8>::vlanes())
         {
             v_uint8 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
             v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
@@ -164,13 +164,13 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,
             expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223);
 
             v_float32 v_w10 = vx_load(weights1 + weight_offset);
-            v_float32 v_w11 = vx_load(weights1 + weight_offset + v_float32::nlanes);
-            v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*v_float32::nlanes);
-            v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*v_float32::nlanes);
+            v_float32 v_w11 = vx_load(weights1 + weight_offset + VTraits<v_float32>::vlanes());
+            v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*VTraits<v_float32>::vlanes());
+            v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*VTraits<v_float32>::vlanes());
             v_float32 v_w20 = vx_load(weights2 + weight_offset);
-            v_float32 v_w21 = vx_load(weights2 + weight_offset + v_float32::nlanes);
-            v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*v_float32::nlanes);
-            v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*v_float32::nlanes);
+            v_float32 v_w21 = vx_load(weights2 + weight_offset + VTraits<v_float32>::vlanes());
+            v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*VTraits<v_float32>::vlanes());
+            v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*VTraits<v_float32>::vlanes());
             v_src100 = blend(v_src100, v_src200, v_w10, v_w20);
             v_src110 = blend(v_src110, v_src210, v_w10, v_w20);
             v_src120 = blend(v_src120, v_src220, v_w10, v_w20);
@@ -192,7 +192,7 @@ int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1,
         }
         break;
     case 4:
-        for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_float32::nlanes)
+        for(int weight_offset = 0 ; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
         {
             v_float32 v_src10, v_src11, v_src12, v_src13;
             v_float32 v_src20, v_src21, v_src22, v_src23;
@@ -229,7 +229,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1,
     switch(cn)
     {
     case 1:
-        for(int weight_offset = 0 ; x <= width - v_float32::nlanes; x += v_float32::nlanes, weight_offset += v_float32::nlanes)
+        for(int weight_offset = 0 ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
         {
             v_float32 v_src1 = vx_load(src1 + x);
             v_float32 v_src2 = vx_load(src2 + x);
@@ -242,7 +242,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1,
         }
         break;
     case 2:
-        for(int weight_offset = 0 ; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, weight_offset += v_float32::nlanes)
+        for(int weight_offset = 0 ; x <= width - 2*VTraits<v_float32>::vlanes(); x += 2*VTraits<v_float32>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
         {
             v_float32 v_src10, v_src11, v_src20, v_src21;
             v_load_deinterleave(src1 + x, v_src10, v_src11);
@@ -257,7 +257,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1,
         }
         break;
     case 3:
-        for(int weight_offset = 0 ; x <= width - 3*v_float32::nlanes; x += 3*v_float32::nlanes, weight_offset += v_float32::nlanes)
+        for(int weight_offset = 0 ; x <= width - 3*VTraits<v_float32>::vlanes(); x += 3*VTraits<v_float32>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
         {
             v_float32 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
             v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
@@ -273,7 +273,7 @@ int blendLinearSimd(const float* src1, const float* src2, const float* weights1,
         }
         break;
     case 4:
-        for(int weight_offset = 0 ; x <= width - 4*v_float32::nlanes; x += 4*v_float32::nlanes, weight_offset += v_float32::nlanes)
+        for(int weight_offset = 0 ; x <= width - 4*VTraits<v_float32>::vlanes(); x += 4*VTraits<v_float32>::vlanes(), weight_offset += VTraits<v_float32>::vlanes())
         {
             v_float32 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
             v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13);
@@ -320,7 +320,7 @@ class BlendLinearInvoker :
             T * const dst_row = dst->ptr<T>(y);
 
             int x = 0;
-            #if CV_SIMD
+            #if (CV_SIMD || CV_SIMD_SCALABLE)
             x = blendLinearSimd(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn);
             #endif
 
diff --git a/modules/imgproc/src/box_filter.simd.hpp b/modules/imgproc/src/box_filter.simd.hpp
index 4a4d205216fb..ac0b0f1b5626 100644
--- a/modules/imgproc/src/box_filter.simd.hpp
+++ b/modules/imgproc/src/box_filter.simd.hpp
@@ -309,15 +309,15 @@ struct ColumnSum<int, uchar> :
             {
                 const int* Sp = (const int*)src[0];
                 int i = 0;
-#if CV_SIMD
-                for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
                 {
-                    v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                    v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
                 }
-#if CV_SIMD_WIDTH > 16
-                for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
+                for (; i <= width - VTraits<v_int32x4>::vlanes(); i += VTraits<v_int32x4>::vlanes())
                 {
-                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                    v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
                 }
 #endif
 #endif
@@ -339,37 +339,37 @@ struct ColumnSum<int, uchar> :
             if( haveScale )
             {
                 int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_float32 _v_scale = vx_setall_f32((float)_scale);
-                for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+                for( ; i <= width - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes() )
                 {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                    v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                    v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
-                    v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-                    v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+                    v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), _v_scale)));
+                    v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), _v_scale)));
 
                     v_uint16 v_dst = v_pack(v_s0d, v_s01d);
                     v_pack_store(D + i, v_dst);
 
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                    v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                    v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                    v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
                 }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
                 v_float32x4 v_scale = v_setall_f32((float)_scale);
-                for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
+                for( ; i <= width-VTraits<v_uint16x8>::vlanes(); i+=VTraits<v_uint16x8>::vlanes() )
                 {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                    v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                    v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));
 
-                    v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-                    v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+                    v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
+                    v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
 
                     v_uint16x8 v_dst = v_pack(v_s0d, v_s01d);
                     v_pack_store(D + i, v_dst);
 
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                    v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                    v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                    v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
             }
 #endif
 #endif
@@ -383,29 +383,29 @@ struct ColumnSum<int, uchar> :
             else
             {
                 int i = 0;
-#if CV_SIMD
-                for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
                 {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                    v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                    v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
                     v_uint16 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
                     v_pack_store(D + i, v_dst);
 
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                    v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                    v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                    v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
                 }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
+                for( ; i <= width-VTraits<v_uint16x8>::vlanes(); i+=VTraits<v_uint16x8>::vlanes() )
                 {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                    v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                    v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));
 
                     v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
                     v_pack_store(D + i, v_dst);
 
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                    v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                    v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                    v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
                 }
 #endif
 #endif
@@ -480,15 +480,15 @@ public BaseColumnFilter
             {
                 const ushort* Sp = (const ushort*)src[0];
                 int i = 0;
-#if CV_SIMD
-                for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( ; i <= width - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes() )
                 {
-                    v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                    v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
                 }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes )
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
+                for( ; i <= width - VTraits<v_uint16x8>::vlanes(); i += VTraits<v_uint16x8>::vlanes() )
                 {
-                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                    v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
                 }
 #endif
 #endif
@@ -510,27 +510,27 @@ public BaseColumnFilter
             if( haveScale )
             {
                 int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_uint32 _ds4 = vx_setall_u32((unsigned)ds);
                 v_uint16 _dd8 = vx_setall_u16((ushort)dd);
 
-                for( ; i <= width-v_uint8::nlanes; i+=v_uint8::nlanes )
+                for( ; i <= width-VTraits<v_uint8>::vlanes(); i+=VTraits<v_uint8>::vlanes() )
                 {
                     v_uint16 _sm0 = vx_load(Sm + i);
-                    v_uint16 _sm1 = vx_load(Sm + i + v_uint16::nlanes);
+                    v_uint16 _sm1 = vx_load(Sm + i + VTraits<v_uint16>::vlanes());
 
                     v_uint16 _s0 = v_add_wrap(vx_load(SUM + i), vx_load(Sp + i));
-                    v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + v_uint16::nlanes), vx_load(Sp + i + v_uint16::nlanes));
+                    v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + VTraits<v_uint16>::vlanes()), vx_load(Sp + i + VTraits<v_uint16>::vlanes()));
 
                     v_uint32 _s00, _s01, _s10, _s11;
 
-                    v_expand(_s0 + _dd8, _s00, _s01);
-                    v_expand(_s1 + _dd8, _s10, _s11);
+                    v_expand(v_add(_s0, _dd8), _s00, _s01);
+                    v_expand(v_add(_s1, _dd8), _s10, _s11);
 
-                    _s00 = v_shr<SHIFT>(_s00*_ds4);
-                    _s01 = v_shr<SHIFT>(_s01*_ds4);
-                    _s10 = v_shr<SHIFT>(_s10*_ds4);
-                    _s11 = v_shr<SHIFT>(_s11*_ds4);
+                    _s00 = v_shr<SHIFT>(v_mul(_s00, _ds4));
+                    _s01 = v_shr<SHIFT>(v_mul(_s01, _ds4));
+                    _s10 = v_shr<SHIFT>(v_mul(_s10, _ds4));
+                    _s11 = v_shr<SHIFT>(v_mul(_s11, _ds4));
 
                     v_int16 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
                     v_int16 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
@@ -540,29 +540,29 @@ public BaseColumnFilter
 
                     v_store(D + i, v_pack_u(r0, r1));
                     v_store(SUM + i, _s0);
-                    v_store(SUM + i + v_uint16::nlanes, _s1);
+                    v_store(SUM + i + VTraits<v_uint16>::vlanes(), _s1);
                 }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
                 v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
                 v_uint16x8 dd8 = v_setall_u16((ushort)dd);
 
-                for( ; i <= width-v_uint8x16::nlanes; i+=v_uint8x16::nlanes )
+                for( ; i <= width-VTraits<v_uint8x16>::vlanes(); i+=VTraits<v_uint8x16>::vlanes() )
                 {
                     v_uint16x8 _sm0 = v_load(Sm + i);
-                    v_uint16x8 _sm1 = v_load(Sm + i + v_uint16x8::nlanes);
+                    v_uint16x8 _sm1 = v_load(Sm + i + VTraits<v_uint16x8>::vlanes());
 
                     v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i));
-                    v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + v_uint16x8::nlanes), v_load(Sp + i + v_uint16x8::nlanes));
+                    v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + VTraits<v_uint16x8>::vlanes()), v_load(Sp + i + VTraits<v_uint16x8>::vlanes()));
 
                     v_uint32x4 _s00, _s01, _s10, _s11;
 
-                    v_expand(_s0 + dd8, _s00, _s01);
-                    v_expand(_s1 + dd8, _s10, _s11);
+                    v_expand(v_add(_s0, dd8), _s00, _s01);
+                    v_expand(v_add(_s1, dd8), _s10, _s11);
 
-                    _s00 = v_shr<SHIFT>(_s00*ds4);
-                    _s01 = v_shr<SHIFT>(_s01*ds4);
-                    _s10 = v_shr<SHIFT>(_s10*ds4);
-                    _s11 = v_shr<SHIFT>(_s11*ds4);
+                    _s00 = v_shr<SHIFT>(v_mul(_s00, ds4));
+                    _s01 = v_shr<SHIFT>(v_mul(_s01, ds4));
+                    _s10 = v_shr<SHIFT>(v_mul(_s10, ds4));
+                    _s11 = v_shr<SHIFT>(v_mul(_s11, ds4));
 
                     v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
                     v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
@@ -572,7 +572,7 @@ public BaseColumnFilter
 
                     v_store(D + i, v_pack_u(r0, r1));
                     v_store(SUM + i, _s0);
-                    v_store(SUM + i + v_uint16x8::nlanes, _s1);
+                    v_store(SUM + i + VTraits<v_uint16x8>::vlanes(), _s1);
                 }
 #endif
 #endif
@@ -643,15 +643,15 @@ struct ColumnSum<int, short> :
             {
                 const int* Sp = (const int*)src[0];
                 i = 0;
-#if CV_SIMD
-                for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
                 {
-                    v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                    v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
                 }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
+                for( ; i <= width - VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
                 {
-                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                    v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
                 }
 #endif
 #endif
@@ -673,33 +673,33 @@ struct ColumnSum<int, short> :
             if( haveScale )
             {
                 i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_float32 _v_scale = vx_setall_f32((float)_scale);
-                for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+                for( ; i <= width-VTraits<v_int16>::vlanes(); i+=VTraits<v_int16>::vlanes() )
                 {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                    v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                    v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
-                    v_int32 v_s0d =  v_round(v_cvt_f32(v_s0) * _v_scale);
-                    v_int32 v_s01d = v_round(v_cvt_f32(v_s01) * _v_scale);
+                    v_int32 v_s0d =  v_round(v_mul(v_cvt_f32(v_s0), _v_scale));
+                    v_int32 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), _v_scale));
                     v_store(D + i, v_pack(v_s0d, v_s01d));
 
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                    v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                    v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                    v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
                 }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
                 v_float32x4 v_scale = v_setall_f32((float)_scale);
-                for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
+                for( ; i <= width-VTraits<v_int16x8>::vlanes(); i+=VTraits<v_int16x8>::vlanes() )
                 {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                    v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                    v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));
 
-                    v_int32x4 v_s0d =  v_round(v_cvt_f32(v_s0) * v_scale);
-                    v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale);
+                    v_int32x4 v_s0d =  v_round(v_mul(v_cvt_f32(v_s0), v_scale));
+                    v_int32x4 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), v_scale));
                     v_store(D + i, v_pack(v_s0d, v_s01d));
 
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                    v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                    v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                    v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
                 }
 #endif
 #endif
@@ -713,27 +713,27 @@ struct ColumnSum<int, short> :
             else
             {
                 i = 0;
-#if CV_SIMD
-                for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
+                for( ; i <= width-VTraits<v_int16>::vlanes(); i+=VTraits<v_int16>::vlanes() )
                 {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                    v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                    v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
                     v_store(D + i, v_pack(v_s0, v_s01));
 
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                    v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                    v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                    v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
                 }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
+                for( ; i <= width-VTraits<v_int16x8>::vlanes(); i+=VTraits<v_int16x8>::vlanes() )
                 {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                    v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                    v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));
 
                     v_store(D + i, v_pack(v_s0, v_s01));
 
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                    v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                    v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                    v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
                 }
 #endif
 #endif
@@ -792,15 +792,15 @@ struct ColumnSum<int, ushort> :
             {
                 const int* Sp = (const int*)src[0];
                 int i = 0;
-#if CV_SIMD
-                for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
                 {
-                    v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                    v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
                 }
-#if CV_SIMD_WIDTH > 16
-                for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
+                for (; i <= width - VTraits<v_int32x4>::vlanes(); i += VTraits<v_int32x4>::vlanes())
                 {
-                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                    v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
                 }
 #endif
 #endif
@@ -822,33 +822,33 @@ struct ColumnSum<int, ushort> :
             if( haveScale )
             {
                 int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_float32 _v_scale = vx_setall_f32((float)_scale);
-                for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+                for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
                 {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                    v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                    v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
-                    v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-                    v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+                    v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), _v_scale)));
+                    v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), _v_scale)));
                     v_store(D + i, v_pack(v_s0d, v_s01d));
 
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                    v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                    v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                    v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
                 }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
                 v_float32x4 v_scale = v_setall_f32((float)_scale);
-                for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
+                for( ; i <= width-VTraits<v_uint16x8>::vlanes(); i+=VTraits<v_uint16x8>::vlanes() )
                 {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                    v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                    v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));
 
-                    v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-                    v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+                    v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
+                    v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
                     v_store(D + i, v_pack(v_s0d, v_s01d));
 
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                    v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                    v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                    v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
                 }
 #endif
 #endif
@@ -862,27 +862,27 @@ struct ColumnSum<int, ushort> :
             else
             {
                 int i = 0;
-#if CV_SIMD
-                for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
                 {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                    v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                    v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
                     v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
 
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                    v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                    v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                    v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
                 }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
+                for( ; i <= width-VTraits<v_uint16x8>::vlanes(); i+=VTraits<v_uint16x8>::vlanes() )
                 {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                    v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                    v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits<v_int32x4>::vlanes()), v_load(Sp + i + VTraits<v_int32x4>::vlanes()));
 
                     v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
 
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
-                    v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                    v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                    v_store(SUM + i + VTraits<v_int32x4>::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits<v_int32x4>::vlanes())));
                 }
 #endif
 #endif
@@ -939,15 +939,15 @@ struct ColumnSum<int, int> :
             {
                 const int* Sp = (const int*)src[0];
                 int i = 0;
-#if CV_SIMD
-                for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
                 {
-                    v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                    v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
                 }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
+                for( ; i <= width - VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
                 {
-                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                    v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
                 }
 #endif
 #endif
@@ -969,25 +969,25 @@ struct ColumnSum<int, int> :
             if( haveScale )
             {
                 int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_float32 _v_scale = vx_setall_f32((float)_scale);
-                for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+                for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
                 {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
+                    v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                    v_int32 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), _v_scale));
 
                     v_store(D + i, v_s0d);
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                    v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
                 }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
                 v_float32x4 v_scale = v_setall_f32((float)_scale);
-                for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
+                for( ; i <= width-VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
                 {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
+                    v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                    v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
 
                     v_store(D + i, v_s0d);
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
+                    v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
                 }
 #endif
 #endif
@@ -1001,21 +1001,21 @@ struct ColumnSum<int, int> :
             else
             {
                 int i = 0;
-#if CV_SIMD
-                for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
                 {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+                    v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
 
                     v_store(D + i, v_s0);
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                    v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
                 }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
+                for( ; i <= width-VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
                 {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                    v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
 
                     v_store(D + i, v_s0);
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
+                    v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
                 }
 #endif
 #endif
@@ -1073,15 +1073,15 @@ struct ColumnSum<int, float> :
             {
                 const int* Sp = (const int*)src[0];
                 int i = 0;
-#if CV_SIMD
-                for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
                 {
-                    v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                    v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
                 }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
+                for( ; i <= width - VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
                 {
-                    v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                    v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
                 }
 #endif
 #endif
@@ -1105,21 +1105,21 @@ struct ColumnSum<int, float> :
             {
                 int i = 0;
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_float32 _v_scale = vx_setall_f32((float)_scale);
-                for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+                for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
                 {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                    v_store(D + i, v_cvt_f32(v_s0) * _v_scale);
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                    v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                    v_store(D + i, v_mul(v_cvt_f32(v_s0), _v_scale));
+                    v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
                 }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
                 v_float32x4 v_scale = v_setall_f32((float)_scale);
-                for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
+                for (; i <= width - VTraits<v_int32x4>::vlanes(); i += VTraits<v_int32x4>::vlanes())
                 {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                    v_store(D + i, v_cvt_f32(v_s0) * v_scale);
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
+                    v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                    v_store(D + i, v_mul(v_cvt_f32(v_s0), v_scale));
+                    v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
                 }
 #endif
 #endif
@@ -1134,19 +1134,19 @@ struct ColumnSum<int, float> :
             {
                 int i = 0;
 
-#if CV_SIMD
-                for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
                 {
-                    v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+                    v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
                     v_store(D + i, v_cvt_f32(v_s0));
-                    v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                    v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
                 }
-#if CV_SIMD_WIDTH > 16
-                for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
+                for( ; i <= width-VTraits<v_int32x4>::vlanes(); i+=VTraits<v_int32x4>::vlanes() )
                 {
-                    v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                    v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
                     v_store(D + i, v_cvt_f32(v_s0));
-                    v_store(SUM + i, v_s0 - v_load(Sm + i));
+                    v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
                 }
 #endif
 #endif
@@ -1200,7 +1200,7 @@ Ptr<BaseRowFilter> getRowSumFilter(int srcType, int sumType, int ksize, int anch
     if( sdepth == CV_64F && ddepth == CV_64F )
         return makePtr<RowSum<double, double> >(ksize, anchor);
 
-    CV_Error_( CV_StsNotImplemented,
+    CV_Error_( cv::Error::StsNotImplemented,
         ("Unsupported combination of source format (=%d), and buffer format (=%d)",
         srcType, sumType));
 }
@@ -1241,7 +1241,7 @@ Ptr<BaseColumnFilter> getColumnSumFilter(int sumType, int dstType, int ksize, in
     if( ddepth == CV_64F && sdepth == CV_64F )
         return makePtr<ColumnSum<double, double> >(ksize, anchor, scale);
 
-    CV_Error_( CV_StsNotImplemented,
+    CV_Error_( cv::Error::StsNotImplemented,
         ("Unsupported combination of sum format (=%d), and destination format (=%d)",
         sumType, dstType));
 }
@@ -1339,7 +1339,7 @@ Ptr<BaseRowFilter> getSqrRowSumFilter(int srcType, int sumType, int ksize, int a
     if( sdepth == CV_64F && ddepth == CV_64F )
         return makePtr<SqrRowSum<double, double> >(ksize, anchor);
 
-    CV_Error_( CV_StsNotImplemented,
+    CV_Error_( cv::Error::StsNotImplemented,
               ("Unsupported combination of source format (=%d), and buffer format (=%d)",
                srcType, sumType));
 }
diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp
index 9c14929dc8aa..c3229594417f 100644
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@@ -306,11 +306,11 @@ class parallelCanny : public ParallelLoopBody
         src(_src), src2(_src), map(_map), _borderPeaksParallel(borderPeaksParallel),
         low(_low), high(_high), aperture_size(_aperture_size), L2gradient(_L2gradient)
     {
-#if CV_SIMD
-        for(int i = 0; i < v_int8::nlanes; ++i)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for(int i = 0; i < VTraits<v_int8>::vlanes(); ++i)
         {
             smask[i] = 0;
-            smask[i + v_int8::nlanes] = (schar)-1;
+            smask[i + VTraits<v_int8>::vlanes()] = (schar)-1;
         }
         if (true)
             _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1);
@@ -330,11 +330,11 @@ class parallelCanny : public ParallelLoopBody
         src(_dx), src2(_dy), map(_map), _borderPeaksParallel(borderPeaksParallel),
         low(_low), high(_high), aperture_size(0), L2gradient(_L2gradient)
     {
-#if CV_SIMD
-        for(int i = 0; i < v_int8::nlanes; ++i)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for(int i = 0; i < VTraits<v_int8>::vlanes(); ++i)
         {
             smask[i] = 0;
-            smask[i + v_int8::nlanes] = (schar)-1;
+            smask[i + VTraits<v_int8>::vlanes()] = (schar)-1;
         }
         if (true)
             _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1);
@@ -396,7 +396,7 @@ class parallelCanny : public ParallelLoopBody
         }
 
         // _mag_p: previous row, _mag_a: actual row, _mag_n: next row
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         AutoBuffer<int> buffer(3 * (mapstep * cn + CV_SIMD_WIDTH));
         _mag_p = alignPtr(buffer.data() + 1, CV_SIMD_WIDTH);
         _mag_a = alignPtr(_mag_p + mapstep * cn, CV_SIMD_WIDTH);
@@ -436,8 +436,8 @@ class parallelCanny : public ParallelLoopBody
                 if (L2gradient)
                 {
                     int j = 0, width = src.cols * cn;
-#if CV_SIMD
-                    for ( ; j <= width - v_int16::nlanes; j += v_int16::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                    for ( ; j <= width - VTraits<v_int16>::vlanes(); j += VTraits<v_int16>::vlanes())
                     {
                         v_int16 v_dx = vx_load((const short*)(_dx + j));
                         v_int16 v_dy = vx_load((const short*)(_dy + j));
@@ -447,8 +447,8 @@ class parallelCanny : public ParallelLoopBody
                         v_expand(v_dx, v_dxp_low, v_dxp_high);
                         v_expand(v_dy, v_dyp_low, v_dyp_high);
 
-                        v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
-                        v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
+                        v_store_aligned((int *)(_mag_n + j), v_add(v_mul(v_dxp_low, v_dxp_low), v_mul(v_dyp_low, v_dyp_low)));
+                        v_store_aligned((int *)(_mag_n + j + VTraits<v_int32>::vlanes()), v_add(v_mul(v_dxp_high, v_dxp_high), v_mul(v_dyp_high, v_dyp_high)));
                     }
 #endif
                     for ( ; j < width; ++j)
@@ -457,8 +457,8 @@ class parallelCanny : public ParallelLoopBody
                 else
                 {
                     int j = 0, width = src.cols * cn;
-#if CV_SIMD
-                    for(; j <= width - v_int16::nlanes; j += v_int16::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                    for(; j <= width - VTraits<v_int16>::vlanes(); j += VTraits<v_int16>::vlanes())
                     {
                         v_int16 v_dx = vx_load((const short *)(_dx + j));
                         v_int16 v_dy = vx_load((const short *)(_dy + j));
@@ -470,8 +470,8 @@ class parallelCanny : public ParallelLoopBody
                         v_expand(v_dx, v_dx_ml, v_dx_mh);
                         v_expand(v_dy, v_dy_ml, v_dy_mh);
 
-                        v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml);
-                        v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dx_mh + v_dy_mh);
+                        v_store_aligned((int *)(_mag_n + j), v_add(v_dx_ml, v_dy_ml));
+                        v_store_aligned((int *)(_mag_n + j + VTraits<v_int32>::vlanes()), v_add(v_dx_mh, v_dy_mh));
                     }
 #endif
                     for ( ; j < width; ++j)
@@ -515,7 +515,7 @@ class parallelCanny : public ParallelLoopBody
 
             // From here actual src row is (i - 1)
             // Set left and right border to 1
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             if (true)
                 _pmap = map.ptr<uchar>(i) + CV_SIMD_WIDTH;
             else
@@ -537,22 +537,22 @@ class parallelCanny : public ParallelLoopBody
 
             const int TG22 = 13573;
             int j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
                 const v_int32 v_low = vx_setall_s32(low);
                 const v_int8 v_one = vx_setall_s8(1);
 
-                for (; j <= src.cols - v_int8::nlanes; j += v_int8::nlanes)
+                for (; j <= src.cols - VTraits<v_int8>::vlanes(); j += VTraits<v_int8>::vlanes())
                 {
                     v_store_aligned((signed char*)(_pmap + j), v_one);
-                    v_int8 v_cmp = v_pack(v_pack(vx_load_aligned((const int*)(_mag_a + j                    )) > v_low,
-                                                 vx_load_aligned((const int*)(_mag_a + j +   v_int32::nlanes)) > v_low),
-                                          v_pack(vx_load_aligned((const int*)(_mag_a + j + 2*v_int32::nlanes)) > v_low,
-                                                 vx_load_aligned((const int*)(_mag_a + j + 3*v_int32::nlanes)) > v_low));
+                    v_int8 v_cmp = v_pack(v_pack(v_gt(vx_load_aligned((const int *)(_mag_a + j)), v_low),
+                                                 v_gt(vx_load_aligned((const int *)(_mag_a + j + VTraits<v_int32>::vlanes())), v_low)),
+                                          v_pack(v_gt(vx_load_aligned((const int *)(_mag_a + j + 2 * VTraits<v_int32>::vlanes())), v_low),
+                                                 v_gt(vx_load_aligned((const int *)(_mag_a + j + 3 * VTraits<v_int32>::vlanes())), v_low)));
                     while (v_check_any(v_cmp))
                     {
                         int l = v_scan_forward(v_cmp);
-                        v_cmp &= vx_load(smask + v_int8::nlanes - 1 - l);
+                        v_cmp = v_and(v_cmp, vx_load(smask + VTraits<v_int8>::vlanes() - 1 - l));
                         int k = j + l;
 
                         int m = _mag_a[k];
@@ -693,8 +693,8 @@ class parallelCanny : public ParallelLoopBody
     ptrdiff_t mapstep;
     int cn;
     mutable Mutex mutex;
-#if CV_SIMD
-    schar smask[2*v_int8::nlanes];
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    schar smask[2*VTraits<v_int8>::max_nlanes];
 #endif
 };
 
@@ -718,31 +718,31 @@ class finalPass : public ParallelLoopBody
             int j = 0;
             uchar *pdst = dst.ptr<uchar>(i);
             const uchar *pmap = map.ptr<uchar>(i + 1);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             if (true)
                 pmap += CV_SIMD_WIDTH;
             else
 #endif
                 pmap += 1;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
                 const v_uint8 v_zero = vx_setzero_u8();
-                const v_uint8 v_ff = ~v_zero;
+                const v_uint8 v_ff = v_not(v_zero);
                 const v_uint8 v_two = vx_setall_u8(2);
 
-                for (; j <= dst.cols - v_uint8::nlanes; j += v_uint8::nlanes)
+                for (; j <= dst.cols - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
                 {
                     v_uint8 v_pmap = vx_load_aligned((const unsigned char*)(pmap + j));
-                    v_pmap = v_select(v_pmap == v_two, v_ff, v_zero);
+                    v_pmap = v_select(v_eq(v_pmap, v_two), v_ff, v_zero);
                     v_store((pdst + j), v_pmap);
                 }
 
-                if (j <= dst.cols - v_uint8::nlanes/2)
+                if (j <= dst.cols - VTraits<v_uint8>::vlanes()/2)
                 {
                     v_uint8 v_pmap = vx_load_low((const unsigned char*)(pmap + j));
-                    v_pmap = v_select(v_pmap == v_two, v_ff, v_zero);
+                    v_pmap = v_select(v_eq(v_pmap, v_two), v_ff, v_zero);
                     v_store_low((pdst + j), v_pmap);
-                    j += v_uint8::nlanes/2;
+                    j += VTraits<v_uint8>::vlanes()/2;
                 }
             }
 #endif
@@ -844,7 +844,7 @@ void Canny( InputArray _src, OutputArray _dst,
     }
 
     if ((aperture_size & 1) == 0 || (aperture_size != -1 && (aperture_size < 3 || aperture_size > 7)))
-        CV_Error(CV_StsBadFlag, "Aperture size should be odd between 3 and 7");
+        CV_Error(cv::Error::StsBadFlag, "Aperture size should be odd between 3 and 7");
 
     if (aperture_size == 7)
     {
diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp
index 677b6a0738d9..b787378ee3e0 100644
--- a/modules/imgproc/src/clahe.cpp
+++ b/modules/imgproc/src/clahe.cpp
@@ -415,7 +415,7 @@ namespace
         else if (_src.type() == CV_16UC1)
             calcLutBody = cv::makePtr<CLAHE_CalcLut_Body<ushort, 65536, 0> >(srcForLut, lut_, tileSize, tilesX_, clipLimit, lutScale);
         else
-            CV_Error( CV_StsBadArg, "Unsupported type" );
+            CV_Error( cv::Error::StsBadArg, "Unsupported type" );
 
         cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), *calcLutBody);
 
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index cc4dd13c9216..dde8e1344cb6 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -90,6 +90,20 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 
         return oclCvtColorOnePlaneYUV2BGR(_src, _dst, dcn, bidx, uidx, yidx);
     }
+    case COLOR_RGB2YUV_UYVY: case COLOR_BGR2YUV_UYVY: case COLOR_RGBA2YUV_UYVY: case COLOR_BGRA2YUV_UYVY:
+    case COLOR_RGB2YUV_YUY2: case COLOR_BGR2YUV_YUY2: case COLOR_RGB2YUV_YVYU: case COLOR_BGR2YUV_YVYU:
+    case COLOR_RGBA2YUV_YUY2: case COLOR_BGRA2YUV_YUY2: case COLOR_RGBA2YUV_YVYU: case COLOR_BGRA2YUV_YVYU:
+    {
+        int yidx = (code==COLOR_RGB2YUV_UYVY || code==COLOR_RGBA2YUV_UYVY ||
+                    code==COLOR_BGR2YUV_UYVY || code==COLOR_BGRA2YUV_UYVY) ? 1 : 0;
+        int uidx = (code==COLOR_RGB2YUV_YVYU || code==COLOR_RGBA2YUV_YVYU ||
+                    code==COLOR_BGR2YUV_YVYU || code==COLOR_BGRA2YUV_YVYU) ? 2 : 0;
+        uidx = 1 - yidx + uidx;
+
+        bool res = oclCvtColorOnePlaneBGR2YUV(_src, _dst, dcn, bidx, uidx, yidx);
+
+        return res;
+    }
     case COLOR_BGR2YCrCb:
     case COLOR_RGB2YCrCb:
         return oclCvtColorBGR2YCrCb(_src, _dst, bidx);
@@ -163,7 +177,7 @@ void cvtColorTwoPlane( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, in
         case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12:
             break;
         default:
-            CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
+            CV_Error( cv::Error::StsBadFlag, "Unknown/unsupported color conversion code" );
             return;
     }
 
@@ -339,6 +353,19 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
                 break;
             }
 
+        case COLOR_RGB2YUV_UYVY: case COLOR_BGR2YUV_UYVY: case COLOR_RGBA2YUV_UYVY: case COLOR_BGRA2YUV_UYVY:
+        case COLOR_RGB2YUV_YUY2: case COLOR_BGR2YUV_YUY2: case COLOR_RGB2YUV_YVYU: case COLOR_BGR2YUV_YVYU:
+        case COLOR_RGBA2YUV_YUY2: case COLOR_BGRA2YUV_YUY2: case COLOR_RGBA2YUV_YVYU: case COLOR_BGRA2YUV_YVYU:
+            //http://www.fourcc.org/yuv.php#UYVY
+            //http://www.fourcc.org/yuv.php#YUY2
+            //http://www.fourcc.org/yuv.php#YVYU
+            {
+                int ycn  = (code==COLOR_RGB2YUV_UYVY ||  code==COLOR_BGR2YUV_UYVY ||
+                            code==COLOR_RGBA2YUV_UYVY || code==COLOR_BGRA2YUV_UYVY) ? 1 : 0;
+                cvtColorOnePlaneBGR2YUV(_src, _dst, swapBlue(code), uIndex(code), ycn);
+                break;
+            }
+
         case COLOR_YUV2GRAY_UYVY:
         case COLOR_YUV2GRAY_YUY2:
             cvtColorYUV2Gray_ch(_src, _dst, code == COLOR_YUV2GRAY_UYVY ? 1 : 0);
@@ -352,7 +379,7 @@ void cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
             cvtColormRGBA2RGBA(_src, _dst);
             break;
         default:
-            CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
+            CV_Error( cv::Error::StsBadFlag, "Unknown/unsupported color conversion code" );
     }
 }
 } //namespace cv
diff --git a/modules/imgproc/src/color.hpp b/modules/imgproc/src/color.hpp
index abbd65ec06c3..6ebca26a2c11 100644
--- a/modules/imgproc/src/color.hpp
+++ b/modules/imgproc/src/color.hpp
@@ -71,6 +71,8 @@ inline bool swapBlue(int code)
     case COLOR_YUV2BGR_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2BGR_YUY2:
     case COLOR_YUV2BGRA_YUY2:  case COLOR_YUV2BGR_YVYU: case COLOR_YUV2BGRA_YVYU:
     case COLOR_BGR2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: case COLOR_BGR2YUV_YV12: case COLOR_BGRA2YUV_YV12:
+    case COLOR_BGR2YUV_UYVY:   case COLOR_BGRA2YUV_UYVY: case COLOR_BGR2YUV_YUY2:
+    case COLOR_BGRA2YUV_YUY2:  case COLOR_BGR2YUV_YVYU:  case COLOR_BGRA2YUV_YVYU:
         return false;
     default:
         return true;
@@ -124,6 +126,13 @@ inline int dstChannels(int code)
 
             return 3;
 
+        case COLOR_RGB2YUV_UYVY: case COLOR_BGR2YUV_UYVY: case COLOR_RGB2YUV_YVYU: case COLOR_BGR2YUV_YVYU:
+        case COLOR_RGB2YUV_YUY2: case COLOR_BGR2YUV_YUY2:
+        case COLOR_RGBA2YUV_UYVY: case COLOR_BGRA2YUV_UYVY: case COLOR_RGBA2YUV_YVYU: case COLOR_BGRA2YUV_YVYU:
+        case COLOR_RGBA2YUV_YUY2: case COLOR_BGRA2YUV_YUY2:
+
+            return 2;
+
         default:
             return 0;
     }
@@ -159,6 +168,7 @@ inline int uIndex(int code)
             return 2;
 
         case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
+        case COLOR_RGB2YUV_YVYU: case COLOR_BGR2YUV_YVYU: case COLOR_RGBA2YUV_YVYU: case COLOR_BGRA2YUV_YVYU:
         case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
         case COLOR_YUV2BGR_NV21:  case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21:
         case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
@@ -169,6 +179,8 @@ inline int uIndex(int code)
         case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
         case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
         case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2:
+        case COLOR_RGB2YUV_UYVY: case COLOR_BGR2YUV_UYVY: case COLOR_RGBA2YUV_UYVY: case COLOR_BGRA2YUV_UYVY:
+        case COLOR_RGB2YUV_YUY2: case COLOR_BGR2YUV_YUY2: case COLOR_RGBA2YUV_YUY2: case COLOR_BGRA2YUV_YUY2:
 
             return 0;
 
@@ -265,7 +277,7 @@ struct OclHelper
         int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
         int pxPerWIx = 1;
 
-        cv::String baseOptions = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
+        cv::String baseOptions = format("-D SRC_DEPTH=%d -D SCN=%d -D PIX_PER_WI_Y=%d ",
                                         src.depth(), src.channels(), pxPerWIy);
 
         switch (sizePolicy)
@@ -529,6 +541,7 @@ bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx );
 bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx );
 
 bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx );
+bool oclCvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx );
 bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx );
 bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx );
 bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx );
@@ -547,6 +560,7 @@ void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, bool crcb);
 void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb);
 
 void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn);
+void cvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, int uidx, int ycn);
 void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
 void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx );
 void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx );
diff --git a/modules/imgproc/src/color.simd_helpers.hpp b/modules/imgproc/src/color.simd_helpers.hpp
index 6642ff69c4bd..06b9ba3d0622 100644
--- a/modules/imgproc/src/color.simd_helpers.hpp
+++ b/modules/imgproc/src/color.simd_helpers.hpp
@@ -76,7 +76,7 @@ struct Set<i0, -1, -1>
 
 enum SizePolicy
 {
-    TO_YUV, FROM_YUV, FROM_UYVY, NONE
+    TO_YUV, FROM_YUV, FROM_UYVY, TO_UYVY, NONE
 };
 
 template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE >
@@ -109,6 +109,7 @@ struct CvtHelper
             dstSz = Size(sz.width, sz.height * 2 / 3);
             break;
         case FROM_UYVY:
+        case TO_UYVY:
             CV_Assert( sz.width % 2 == 0);
             dstSz = sz;
             break;
diff --git a/modules/imgproc/src/color_hsv.dispatch.cpp b/modules/imgproc/src/color_hsv.dispatch.cpp
index f1678f5deb68..8639784927ed 100644
--- a/modules/imgproc/src/color_hsv.dispatch.cpp
+++ b/modules/imgproc/src/color_hsv.dispatch.cpp
@@ -219,7 +219,7 @@ bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, b
     int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 255);
 
     if(!h.createKernel("HSV2RGB", ocl::imgproc::color_hsv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", dcn, bidx, hrange, 6.f/hrange)))
+                       format("-D DCN=%d -D BIDX=%d -D HRANGE=%d -D HSCALE=%ff", dcn, bidx, hrange, 6.f/hrange)))
     {
         return false;
     }
@@ -234,7 +234,7 @@ bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, b
     int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 255);
 
     if(!h.createKernel("HLS2RGB", ocl::imgproc::color_hsv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", dcn, bidx, hrange, 6.f/hrange)))
+                       format("-D DCN=%d -D BIDX=%d -D HRANGE=%d -D HSCALE=%ff", dcn, bidx, hrange, 6.f/hrange)))
     {
         return false;
     }
@@ -249,7 +249,7 @@ bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full
     float hscale = (_src.depth() == CV_32F ? 360.f : (!full ? 180.f : 256.f))/360.f;
 
     if(!h.createKernel("RGB2HLS", ocl::imgproc::color_hsv_oclsrc,
-                       format("-D hscale=%ff -D bidx=%d -D dcn=3", hscale, bidx)))
+                       format("-D HSCALE=%ff -D BIDX=%d -D DCN=3", hscale, bidx)))
     {
         return false;
     }
@@ -264,8 +264,8 @@ bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full
     int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 256);
 
     cv::String options = (_src.depth() == CV_8U ?
-                          format("-D hrange=%d -D bidx=%d -D dcn=3", hrange, bidx) :
-                          format("-D hscale=%ff -D bidx=%d -D dcn=3", hrange*(1.f/360.f), bidx));
+                          format("-D HRANGE=%d -D BIDX=%d -D DCN=3", hrange, bidx) :
+                          format("-D HSCALE=%ff -D BIDX=%d -D DCN=3", hrange*(1.f/360.f), bidx));
 
     if(!h.createKernel("RGB2HSV", ocl::imgproc::color_hsv_oclsrc, options))
     {
diff --git a/modules/imgproc/src/color_hsv.simd.hpp b/modules/imgproc/src/color_hsv.simd.hpp
index bea1decc3ae6..bef9497760db 100644
--- a/modules/imgproc/src/color_hsv.simd.hpp
+++ b/modules/imgproc/src/color_hsv.simd.hpp
@@ -98,7 +98,7 @@ struct RGB2HSV_b
 
         int i = 0;
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         const int vsize = VTraits<v_uint8>::vlanes();
         for ( ; i <= n - vsize;
               i += vsize, src += scn*vsize, dst += 3*vsize)
@@ -274,7 +274,7 @@ struct RGB2HSV_f
     : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
     { }
 
-    #if CV_SIMD || CV_SIMD_SCALABLE
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
     inline void process(const v_float32& v_r, const v_float32& v_g, const v_float32& v_b,
                         v_float32& v_h, v_float32& v_s, v_float32& v_v,
                         float hscale) const
@@ -308,7 +308,7 @@ struct RGB2HSV_f
         float hscale = hrange*(1.f/360.f);
         n *= 3;
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         const int vsize = VTraits<v_float32>::vlanes();
         for ( ; i <= n - 3*vsize; i += 3*vsize, src += scn * vsize)
         {
@@ -368,7 +368,7 @@ struct RGB2HSV_f
 };
 
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 inline void HSV2RGB_simd(const v_float32& h, const v_float32& s, const v_float32& v,
                          v_float32& b, v_float32& g, v_float32& r, float hscale)
 {
@@ -473,7 +473,7 @@ struct HSV2RGB_f
         float hs = hscale;
         n *= 3;
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         const int vsize = VTraits<v_float32>::vlanes();
         v_float32 valpha = vx_setall_f32(alpha);
         for (; i <= n - vsize*3; i += vsize*3, dst += dcn * vsize)
@@ -530,7 +530,7 @@ struct HSV2RGB_b
         int j = 0, dcn = dstcn;
         uchar alpha = ColorChannel<uchar>::max();
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         const int vsize = VTraits<v_float32>::vlanes();
 
         for (j = 0; j <= (n - vsize*4) * 3; j += 3 * 4 * vsize, dst += dcn * 4 * vsize)
@@ -679,7 +679,7 @@ struct RGB2HLS_f
     {
     }
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     inline void process(const v_float32& r, const v_float32& g, const v_float32& b,
                         const v_float32& vhscale,
                         v_float32& h, v_float32& l, v_float32& s) const
@@ -718,7 +718,7 @@ struct RGB2HLS_f
 
         int i = 0, bidx = blueIdx, scn = srccn;
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         const int vsize = VTraits<v_float32>::vlanes();
         v_float32 vhscale = vx_setall_f32(hscale);
 
@@ -802,13 +802,13 @@ struct RGB2HLS_b
 
         int scn = srccn;
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[bufChannels*BLOCK_SIZE];
 #else
         float CV_DECL_ALIGNED(16) buf[bufChannels*BLOCK_SIZE];
 #endif
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         static const int fsize = VTraits<v_float32>::vlanes();
         //TODO: fix that when v_interleave is available
         float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3];
@@ -823,7 +823,7 @@ struct RGB2HLS_b
         {
             int dn = std::min(n - i, (int)BLOCK_SIZE);
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 v255inv = vx_setall_f32(1.f/255.f);
             if (scn == 3)
             {
@@ -902,7 +902,7 @@ struct RGB2HLS_b
             cvt(buf, buf, dn);
 
             int j = 0;
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             for( ; j <= dn*3 - fsize*3*4; j += fsize*3*4)
             {
                 v_float32 f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11;
@@ -973,7 +973,7 @@ struct HLS2RGB_f
     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange)
     { }
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     inline void process(const v_float32& h, const v_float32& l, const v_float32& s,
                         v_float32& b, v_float32& g, v_float32& r) const
     {
@@ -1016,7 +1016,7 @@ struct HLS2RGB_f
         int i = 0, bidx = blueIdx, dcn = dstcn;
         float alpha = ColorChannel<float>::max();
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         static const int vsize = VTraits<v_float32>::vlanes();
         for (; i <= n - vsize; i += vsize, src += 3*vsize, dst += dcn*vsize)
         {
@@ -1099,13 +1099,13 @@ struct HLS2RGB_b
         int i, j, dcn = dstcn;
         uchar alpha = ColorChannel<uchar>::max();
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[bufChannels*BLOCK_SIZE];
 #else
         float CV_DECL_ALIGNED(16) buf[bufChannels*BLOCK_SIZE];
 #endif
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         static const int fsize = VTraits<v_float32>::vlanes();
         //TODO: fix that when v_interleave is available
         float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3];
@@ -1122,7 +1122,7 @@ struct HLS2RGB_b
             int dn = std::min(n - i, (int)BLOCK_SIZE);
             j = 0;
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             for( ; j <= dn*3 - 3*4*fsize; j += 3*4*fsize)
             {
                 // 3x uchar -> 3*4 float
@@ -1179,7 +1179,7 @@ struct HLS2RGB_b
             }
             cvt(buf, buf, dn);
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 v255 = vx_setall_f32(255.f);
             if(dcn == 3)
             {
diff --git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp
index 3b18944a0c6b..fdf797808aa8 100644
--- a/modules/imgproc/src/color_lab.cpp
+++ b/modules/imgproc/src/color_lab.cpp
@@ -56,40 +56,38 @@ template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab
     return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
 }
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 template<typename _Tp> static inline cv::v_float32 splineInterpolate(const cv::v_float32& x, const _Tp* tab, int n)
 {
     using namespace cv;
     v_int32 ix = v_min(v_max(v_trunc(x), vx_setzero_s32()), vx_setall_s32(n-1));
-    cv::v_float32 xx = x - v_cvt_f32(ix);
-    ix = ix << 2;
+    cv::v_float32 xx = v_sub(x, v_cvt_f32(ix));
+    ix = v_shl<2>(ix);
 
-    v_float32 t[4];
-    // assume that v_float32::nlanes == v_int32::nlanes
-    if(v_float32::nlanes == 4)
+    v_float32 t0, t1, t2, t3;
+    // assume that VTraits<v_float32>::vlanes() == VTraits<v_int32>::vlanes()
+    if(VTraits<v_float32>::vlanes() == 4)
     {
-#if CV_SIMD_WIDTH == 16
         int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx[4];
         v_store_aligned(idx, ix);
-        v_float32x4 tt[4];
-        tt[0] = v_load(tab + idx[0]);
-        tt[1] = v_load(tab + idx[1]);
-        tt[2] = v_load(tab + idx[2]);
-        tt[3] = v_load(tab + idx[3]);
-        v_transpose4x4(tt[0], tt[1], tt[2], tt[3],
-                        t[0],  t[1],  t[2],  t[3]);
-#endif
+        v_float32 tt0, tt1, tt2, tt3;
+        tt0 = vx_load(tab + idx[0]);
+        tt1 = vx_load(tab + idx[1]);
+        tt2 = vx_load(tab + idx[2]);
+        tt3 = vx_load(tab + idx[3]);
+        v_transpose4x4(tt0, tt1, tt2, tt3,
+                        t0,  t1,  t2,  t3);
     }
     else
     {
-        t[0] = v_lut(tab + 0, ix);
-        t[1] = v_lut(tab + 1, ix);
-        t[2] = v_lut(tab + 2, ix);
-        t[3] = v_lut(tab + 3, ix);
+        t0 = v_lut(tab + 0, ix);
+        t1 = v_lut(tab + 1, ix);
+        t2 = v_lut(tab + 2, ix);
+        t3 = v_lut(tab + 3, ix);
     }
 
-    return v_fma(v_fma(v_fma(t[3], xx, t[2]), xx, t[1]), xx, t[0]);
+    return v_fma(v_fma(v_fma(t3, xx, t2), xx, t1), xx, t0);
 }
 
 #endif
@@ -207,8 +205,8 @@ struct RGB2XYZ_f<float>
               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
         int i = 0;
-#if CV_SIMD
-        const int vsize = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_float32>::vlanes();
         v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
         v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
         v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
@@ -226,9 +224,9 @@ struct RGB2XYZ_f<float>
             }
 
             v_float32 x, y, z;
-            x = v_fma(b, vc0, v_fma(g, vc1, r*vc2));
-            y = v_fma(b, vc3, v_fma(g, vc4, r*vc5));
-            z = v_fma(b, vc6, v_fma(g, vc7, r*vc8));
+            x = v_fma(b, vc0, v_fma(g, vc1, v_mul(r, vc2)));
+            y = v_fma(b, vc3, v_fma(g, vc4, v_mul(r, vc5)));
+            z = v_fma(b, vc6, v_fma(g, vc7, v_mul(r, vc8)));
 
             v_store_interleave(dst, x, y, z);
         }
@@ -313,8 +311,8 @@ struct RGB2XYZ_i<uchar>
             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
 
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint8>::vlanes();
         int descaleShift = 1 << (shift-1);
         v_int16 vdescale = vx_setall_s16((short)descaleShift);
         v_int16 cxbg, cxr1, cybg, cyr1, czbg, czr1;
@@ -349,27 +347,36 @@ struct RGB2XYZ_i<uchar>
             sg0 = v_reinterpret_as_s16(g0); sg1 = v_reinterpret_as_s16(g1);
             sb0 = v_reinterpret_as_s16(b0); sb1 = v_reinterpret_as_s16(b1);
 
-            v_int16 bg[4], rd[4];
-            v_zip(sb0, sg0, bg[0], bg[1]);
-            v_zip(sb1, sg1, bg[2], bg[3]);
-            v_zip(sr0, vdescale, rd[0], rd[1]);
-            v_zip(sr1, vdescale, rd[2], rd[3]);
-
-            v_uint32 vx[4], vy[4], vz[4];
-            for(int j = 0; j < 4; j++)
-            {
-                vx[j] = v_reinterpret_as_u32(v_dotprod(bg[j], cxbg) + v_dotprod(rd[j], cxr1)) >> shift;
-                vy[j] = v_reinterpret_as_u32(v_dotprod(bg[j], cybg) + v_dotprod(rd[j], cyr1)) >> shift;
-                vz[j] = v_reinterpret_as_u32(v_dotprod(bg[j], czbg) + v_dotprod(rd[j], czr1)) >> shift;
-            }
+            v_int16 bg0, bg1, bg2, bg3, rd0, rd1, rd2, rd3;
+            v_zip(sb0, sg0, bg0, bg1);
+            v_zip(sb1, sg1, bg2, bg3);
+            v_zip(sr0, vdescale, rd0, rd1);
+            v_zip(sr1, vdescale, rd2, rd3);
+
+            v_uint32 vx0, vx1, vx2, vx3;
+            v_uint32 vy0, vy1, vy2, vy3;
+            v_uint32 vz0, vz1, vz2, vz3;
+
+            vx0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg0, cxbg), v_dotprod(rd0, cxr1))));
+            vy0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg0, cybg), v_dotprod(rd0, cyr1))));
+            vz0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg0, czbg), v_dotprod(rd0, czr1))));
+            vx1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg1, cxbg), v_dotprod(rd1, cxr1))));
+            vy1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg1, cybg), v_dotprod(rd1, cyr1))));
+            vz1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg1, czbg), v_dotprod(rd1, czr1))));
+            vx2 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg2, cxbg), v_dotprod(rd2, cxr1))));
+            vy2 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg2, cybg), v_dotprod(rd2, cyr1))));
+            vz2 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg2, czbg), v_dotprod(rd2, czr1))));
+            vx3 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg3, cxbg), v_dotprod(rd3, cxr1))));
+            vy3 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg3, cybg), v_dotprod(rd3, cyr1))));
+            vz3 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg3, czbg), v_dotprod(rd3, czr1))));
 
             v_uint16 x0, x1, y0, y1, z0, z1;
-            x0 = v_pack(vx[0], vx[1]);
-            x1 = v_pack(vx[2], vx[3]);
-            y0 = v_pack(vy[0], vy[1]);
-            y1 = v_pack(vy[2], vy[3]);
-            z0 = v_pack(vz[0], vz[1]);
-            z1 = v_pack(vz[2], vz[3]);
+            x0 = v_pack(vx0, vx1);
+            x1 = v_pack(vx2, vx3);
+            y0 = v_pack(vy0, vy1);
+            y1 = v_pack(vy2, vy3);
+            z0 = v_pack(vz0, vz1);
+            z1 = v_pack(vz2, vz3);
 
             v_uint8 x, y, z;
             x = v_pack(x0, x1);
@@ -424,8 +431,8 @@ struct RGB2XYZ_i<ushort>
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-#if CV_SIMD
-        const int vsize = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint16>::vlanes();
         const int descaleShift = 1 << (shift-1);
         v_int16 vdescale = vx_setall_s16(descaleShift);
         v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2);
@@ -464,29 +471,29 @@ struct RGB2XYZ_i<ushort>
             v_int16 ymr, ymg, ymb;
             v_int16 zmr, zmg, zmb;
 
-            v_int16 mr = sr < zero, mg = sg < zero, mb = sb < zero;
+            v_int16 mr = v_lt(sr, zero), mg = v_lt(sg, zero), mb = v_lt(sb, zero);
 
-            xmb = mb & vc0;
-            xmg = mg & vc1;
-            xmr = mr & vc2;
-            ymb = mb & vc3;
-            ymg = mg & vc4;
-            ymr = mr & vc5;
-            zmb = mb & vc6;
-            zmg = mg & vc7;
-            zmr = mr & vc8;
+            xmb = v_and(mb, vc0);
+            xmg = v_and(mg, vc1);
+            xmr = v_and(mr, vc2);
+            ymb = v_and(mb, vc3);
+            ymg = v_and(mg, vc4);
+            ymr = v_and(mr, vc5);
+            zmb = v_and(mb, vc6);
+            zmg = v_and(mg, vc7);
+            zmr = v_and(mr, vc8);
 
             v_int32 xfix0, xfix1, yfix0, yfix1, zfix0, zfix1;
-            v_expand(xmr + xmg + xmb, xfix0, xfix1);
-            v_expand(ymr + ymg + ymb, yfix0, yfix1);
-            v_expand(zmr + zmg + zmb, zfix0, zfix1);
+            v_expand(v_add(v_add(xmr, xmg), xmb), xfix0, xfix1);
+            v_expand(v_add(v_add(ymr, ymg), ymb), yfix0, yfix1);
+            v_expand(v_add(v_add(zmr, zmg), zmb), zfix0, zfix1);
 
-            xfix0 = xfix0 << 16;
-            xfix1 = xfix1 << 16;
-            yfix0 = yfix0 << 16;
-            yfix1 = yfix1 << 16;
-            zfix0 = zfix0 << 16;
-            zfix1 = zfix1 << 16;
+            xfix0 = v_shl<16>(xfix0);
+            xfix1 = v_shl<16>(xfix1);
+            yfix0 = v_shl<16>(yfix0);
+            yfix1 = v_shl<16>(yfix1);
+            zfix0 = v_shl<16>(zfix0);
+            zfix1 = v_shl<16>(zfix1);
 
             v_int16 bg0, bg1, rd0, rd1;
             v_zip(sb, sg, bg0, bg1);
@@ -494,12 +501,12 @@ struct RGB2XYZ_i<ushort>
 
             v_uint32 x0, x1, y0, y1, z0, z1;
 
-            x0 = v_reinterpret_as_u32(v_dotprod(bg0, cxbg) + v_dotprod(rd0, cxr1) + xfix0) >> shift;
-            x1 = v_reinterpret_as_u32(v_dotprod(bg1, cxbg) + v_dotprod(rd1, cxr1) + xfix1) >> shift;
-            y0 = v_reinterpret_as_u32(v_dotprod(bg0, cybg) + v_dotprod(rd0, cyr1) + yfix0) >> shift;
-            y1 = v_reinterpret_as_u32(v_dotprod(bg1, cybg) + v_dotprod(rd1, cyr1) + yfix1) >> shift;
-            z0 = v_reinterpret_as_u32(v_dotprod(bg0, czbg) + v_dotprod(rd0, czr1) + zfix0) >> shift;
-            z1 = v_reinterpret_as_u32(v_dotprod(bg1, czbg) + v_dotprod(rd1, czr1) + zfix1) >> shift;
+            x0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, cxbg), v_dotprod(rd0, cxr1)), xfix0)));
+            x1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, cxbg), v_dotprod(rd1, cxr1)), xfix1)));
+            y0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, cybg), v_dotprod(rd0, cyr1)), yfix0)));
+            y1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, cybg), v_dotprod(rd1, cyr1)), yfix1)));
+            z0 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg0, czbg), v_dotprod(rd0, czr1)), zfix0)));
+            z1 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_add(v_dotprod(bg1, czbg), v_dotprod(rd1, czr1)), zfix1)));
 
             v_uint16 x, y, z;
             x = v_pack(x0, x1);
@@ -593,8 +600,8 @@ struct XYZ2RGB_f<float>
               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
         int i = 0;
-#if CV_SIMD
-        const int vsize = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_float32>::vlanes();
         v_float32 valpha = vx_setall_f32(alpha);
         v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
         v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
@@ -606,9 +613,9 @@ struct XYZ2RGB_f<float>
             v_load_deinterleave(src, x, y, z);
 
             v_float32 b, g, r;
-            b = v_fma(x, vc0, v_fma(y, vc1, z*vc2));
-            g = v_fma(x, vc3, v_fma(y, vc4, z*vc5));
-            r = v_fma(x, vc6, v_fma(y, vc7, z*vc8));
+            b = v_fma(x, vc0, v_fma(y, vc1, v_mul(z, vc2)));
+            g = v_fma(x, vc3, v_fma(y, vc4, v_mul(z, vc5)));
+            r = v_fma(x, vc6, v_fma(y, vc7, v_mul(z, vc8)));
 
             if(dcn == 4)
             {
@@ -707,8 +714,8 @@ struct XYZ2RGB_i<uchar>
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint8>::vlanes();
         const int descaleShift = 1 << (shift - 1);
         v_uint8 valpha = vx_setall_u8(alpha);
         v_int16 vdescale = vx_setall_s16(descaleShift);
@@ -739,25 +746,35 @@ struct XYZ2RGB_i<uchar>
             z0 = v_reinterpret_as_s16(uz0);
             z1 = v_reinterpret_as_s16(uz1);
 
-            v_int32 b[4], g[4], r[4];
-
-            v_int16 xy[4], zd[4];
-            v_zip(x0, y0, xy[0], xy[1]);
-            v_zip(x1, y1, xy[2], xy[3]);
-            v_zip(z0, vdescale, zd[0], zd[1]);
-            v_zip(z1, vdescale, zd[2], zd[3]);
-
-            for(int j = 0; j < 4; j++)
-            {
-                b[j] = (v_dotprod(xy[j], cbxy) + v_dotprod(zd[j], cbz1)) >> shift;
-                g[j] = (v_dotprod(xy[j], cgxy) + v_dotprod(zd[j], cgz1)) >> shift;
-                r[j] = (v_dotprod(xy[j], crxy) + v_dotprod(zd[j], crz1)) >> shift;
-            }
+            v_int32 bb0, bb1, bb2, bb3,
+                    gg0, gg1, gg2, gg3,
+                    rr0, rr1, rr2, rr3;
+
+            v_int16 xy0, xy1, xy2, xy3;
+            v_int16 zd0, zd1, zd2, zd3;
+
+            v_zip(x0, y0, xy0, xy1);
+            v_zip(x1, y1, xy2, xy3);
+            v_zip(z0, vdescale, zd0, zd1);
+            v_zip(z1, vdescale, zd2, zd3);
+
+            bb0 = v_shr<shift>(v_add(v_dotprod(xy0, cbxy), v_dotprod(zd0, cbz1)));
+            gg0 = v_shr<shift>(v_add(v_dotprod(xy0, cgxy), v_dotprod(zd0, cgz1)));
+            rr0 = v_shr<shift>(v_add(v_dotprod(xy0, crxy), v_dotprod(zd0, crz1)));
+            bb1 = v_shr<shift>(v_add(v_dotprod(xy1, cbxy), v_dotprod(zd1, cbz1)));
+            gg1 = v_shr<shift>(v_add(v_dotprod(xy1, cgxy), v_dotprod(zd1, cgz1)));
+            rr1 = v_shr<shift>(v_add(v_dotprod(xy1, crxy), v_dotprod(zd1, crz1)));
+            bb2 = v_shr<shift>(v_add(v_dotprod(xy2, cbxy), v_dotprod(zd2, cbz1)));
+            gg2 = v_shr<shift>(v_add(v_dotprod(xy2, cgxy), v_dotprod(zd2, cgz1)));
+            rr2 = v_shr<shift>(v_add(v_dotprod(xy2, crxy), v_dotprod(zd2, crz1)));
+            bb3 = v_shr<shift>(v_add(v_dotprod(xy3, cbxy), v_dotprod(zd3, cbz1)));
+            gg3 = v_shr<shift>(v_add(v_dotprod(xy3, cgxy), v_dotprod(zd3, cgz1)));
+            rr3 = v_shr<shift>(v_add(v_dotprod(xy3, crxy), v_dotprod(zd3, crz1)));
 
             v_uint16 b0, b1, g0, g1, r0, r1;
-            b0 = v_pack_u(b[0], b[1]); b1 = v_pack_u(b[2], b[3]);
-            g0 = v_pack_u(g[0], g[1]); g1 = v_pack_u(g[2], g[3]);
-            r0 = v_pack_u(r[0], r[1]); r1 = v_pack_u(r[2], r[3]);
+            b0 = v_pack_u(bb0, bb1); b1 = v_pack_u(bb2, bb3);
+            g0 = v_pack_u(gg0, gg1); g1 = v_pack_u(gg2, gg3);
+            r0 = v_pack_u(rr0, rr1); r1 = v_pack_u(rr2, rr3);
 
             v_uint8 bb, gg, rr;
             bb = v_pack(b0, b1);
@@ -820,8 +837,8 @@ struct XYZ2RGB_i<ushort>
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-#if CV_SIMD
-        const int vsize = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint16>::vlanes();
         const int descaleShift = 1 << (shift-1);
         v_uint16 valpha = vx_setall_u16(alpha);
         v_int16 vdescale = vx_setall_s16(descaleShift);
@@ -850,30 +867,30 @@ struct XYZ2RGB_i<ushort>
             sz = v_reinterpret_as_s16(z);
 
             // fixing 16bit signed multiplication
-            v_int16 mx = sx < zero, my = sy < zero, mz = sz < zero;
+            v_int16 mx = v_lt(sx, zero), my = v_lt(sy, zero), mz = v_lt(sz, zero);
 
             v_int16 bmx, bmy, bmz;
             v_int16 gmx, gmy, gmz;
             v_int16 rmx, rmy, rmz;
 
-            bmx = mx & vc0;
-            bmy = my & vc1;
-            bmz = mz & vc2;
-            gmx = mx & vc3;
-            gmy = my & vc4;
-            gmz = mz & vc5;
-            rmx = mx & vc6;
-            rmy = my & vc7;
-            rmz = mz & vc8;
+            bmx = v_and(mx, vc0);
+            bmy = v_and(my, vc1);
+            bmz = v_and(mz, vc2);
+            gmx = v_and(mx, vc3);
+            gmy = v_and(my, vc4);
+            gmz = v_and(mz, vc5);
+            rmx = v_and(mx, vc6);
+            rmy = v_and(my, vc7);
+            rmz = v_and(mz, vc8);
 
             v_int32 bfix0, bfix1, gfix0, gfix1, rfix0, rfix1;
-            v_expand(bmx + bmy + bmz, bfix0, bfix1);
-            v_expand(gmx + gmy + gmz, gfix0, gfix1);
-            v_expand(rmx + rmy + rmz, rfix0, rfix1);
+            v_expand(v_add(v_add(bmx, bmy), bmz), bfix0, bfix1);
+            v_expand(v_add(v_add(gmx, gmy), gmz), gfix0, gfix1);
+            v_expand(v_add(v_add(rmx, rmy), rmz), rfix0, rfix1);
 
-            bfix0 = bfix0 << 16; bfix1 = bfix1 << 16;
-            gfix0 = gfix0 << 16; gfix1 = gfix1 << 16;
-            rfix0 = rfix0 << 16; rfix1 = rfix1 << 16;
+            bfix0 = v_shl<16>(bfix0); bfix1 = v_shl<16>(bfix1);
+            gfix0 = v_shl<16>(gfix0); gfix1 = v_shl<16>(gfix1);
+            rfix0 = v_shl<16>(rfix0); rfix1 = v_shl<16>(rfix1);
 
             v_int16 xy0, xy1, zd0, zd1;
             v_zip(sx, sy, xy0, xy1);
@@ -881,12 +898,12 @@ struct XYZ2RGB_i<ushort>
 
             v_int32 b0, b1, g0, g1, r0, r1;
 
-            b0 = (v_dotprod(xy0, cbxy) + v_dotprod(zd0, cbz1) + bfix0) >> shift;
-            b1 = (v_dotprod(xy1, cbxy) + v_dotprod(zd1, cbz1) + bfix1) >> shift;
-            g0 = (v_dotprod(xy0, cgxy) + v_dotprod(zd0, cgz1) + gfix0) >> shift;
-            g1 = (v_dotprod(xy1, cgxy) + v_dotprod(zd1, cgz1) + gfix1) >> shift;
-            r0 = (v_dotprod(xy0, crxy) + v_dotprod(zd0, crz1) + rfix0) >> shift;
-            r1 = (v_dotprod(xy1, crxy) + v_dotprod(zd1, crz1) + rfix1) >> shift;
+            b0 = v_shr<shift>(v_add(v_add(v_dotprod(xy0, cbxy), v_dotprod(zd0, cbz1)), bfix0));
+            b1 = v_shr<shift>(v_add(v_add(v_dotprod(xy1, cbxy), v_dotprod(zd1, cbz1)), bfix1));
+            g0 = v_shr<shift>(v_add(v_add(v_dotprod(xy0, cgxy), v_dotprod(zd0, cgz1)), gfix0));
+            g1 = v_shr<shift>(v_add(v_add(v_dotprod(xy1, cgxy), v_dotprod(zd1, cgz1)), gfix1));
+            r0 = v_shr<shift>(v_add(v_add(v_dotprod(xy0, crxy), v_dotprod(zd0, crz1)), rfix0));
+            r1 = v_shr<shift>(v_add(v_add(v_dotprod(xy1, crxy), v_dotprod(zd1, crz1)), rfix1));
 
             v_uint16 b, g, r;
             b = v_pack_u(b0, b1); g = v_pack_u(g0, g1); r = v_pack_u(r0, r1);
@@ -1206,119 +1223,120 @@ static LABLUVLUT_s16_t initLUTforLABLUVs16(const softfloat & un, const softfloat
 }
 
 
-static void initLabTabs()
+static bool createLabTabs()
 {
-    static bool initialized = false;
-    if(!initialized)
+    softfloat f[LAB_CBRT_TAB_SIZE+1], g[GAMMA_TAB_SIZE+1], ig[GAMMA_TAB_SIZE+1];
+    softfloat scale = softfloat::one()/softfloat(LabCbrtTabScale);
+    int i;
+    for(i = 0; i <= LAB_CBRT_TAB_SIZE; i++)
     {
-        softfloat f[LAB_CBRT_TAB_SIZE+1], g[GAMMA_TAB_SIZE+1], ig[GAMMA_TAB_SIZE+1];
-        softfloat scale = softfloat::one()/softfloat(LabCbrtTabScale);
-        int i;
-        for(i = 0; i <= LAB_CBRT_TAB_SIZE; i++)
-        {
-            softfloat x = scale*softfloat(i);
-            f[i] = x < lthresh ? mulAdd(x, lscale, lbias) : cbrt(x);
-        }
-        LabCbrtTab = splineBuild(f, LAB_CBRT_TAB_SIZE);
+        softfloat x = scale*softfloat(i);
+        f[i] = x < lthresh ? mulAdd(x, lscale, lbias) : cbrt(x);
+    }
+    LabCbrtTab = splineBuild(f, LAB_CBRT_TAB_SIZE);
 
-        scale = softfloat::one()/softfloat(GammaTabScale);
-        for(i = 0; i <= GAMMA_TAB_SIZE; i++)
-        {
-            softfloat x = scale*softfloat(i);
-            g[i] = applyGamma(x);
-            ig[i] = applyInvGamma(x);
-        }
+    scale = softfloat::one()/softfloat(GammaTabScale);
+    for(i = 0; i <= GAMMA_TAB_SIZE; i++)
+    {
+        softfloat x = scale*softfloat(i);
+        g[i] = applyGamma(x);
+        ig[i] = applyInvGamma(x);
+    }
 
-        sRGBGammaTab = splineBuild(g, GAMMA_TAB_SIZE);
-        sRGBInvGammaTab = splineBuild(ig, GAMMA_TAB_SIZE);
+    sRGBGammaTab = splineBuild(g, GAMMA_TAB_SIZE);
+    sRGBInvGammaTab = splineBuild(ig, GAMMA_TAB_SIZE);
 
-        static const softfloat intScale(255*(1 << gamma_shift));
-        for(i = 0; i < 256; i++)
+    static const softfloat intScale(255*(1 << gamma_shift));
+    for(i = 0; i < 256; i++)
+    {
+        softfloat x = softfloat(i)/f255;
+        sRGBGammaTab_b[i] = (ushort)(cvRound(intScale*applyGamma(x)));
+        linearGammaTab_b[i] = (ushort)(i*(1 << gamma_shift));
+    }
+    static const softfloat invScale = softfloat::one()/softfloat((int)INV_GAMMA_TAB_SIZE);
+    for(i = 0; i < INV_GAMMA_TAB_SIZE; i++)
+    {
+        softfloat x = invScale*softfloat(i);
+        sRGBInvGammaTab_b[i] = (ushort)(cvRound(f255*applyInvGamma(x)));
+        linearInvGammaTab_b[i] = (ushort)(cvTrunc(f255*x));
+    }
+
+    static const softfloat cbTabScale(softfloat::one()/(f255*(1 << gamma_shift)));
+    static const softfloat lshift2(1 << lab_shift2);
+    for(i = 0; i < LAB_CBRT_TAB_SIZE_B; i++)
+    {
+        softfloat x = cbTabScale*softfloat(i);
+        LabCbrtTab_b[i] = (ushort)(cvRound(lshift2 * (x < lthresh ? mulAdd(x, lscale, lbias) : cbrt(x))));
+    }
+
+    //Lookup table for L to y and ify calculations
+    for(i = 0; i < 256; i++)
+    {
+        int y, ify;
+        //8 * 255.0 / 100.0 == 20.4
+        if( i <= 20)
         {
-            softfloat x = softfloat(i)/f255;
-            sRGBGammaTab_b[i] = (ushort)(cvRound(intScale*applyGamma(x)));
-            linearGammaTab_b[i] = (ushort)(i*(1 << gamma_shift));
+            //yy = li / 903.3f;
+            //y = L*100/903.3f; 903.3f = (29/3)^3, 255 = 17*3*5
+            y = cvRound(softfloat(i*LUT_BASE*20*9)/softfloat(17*29*29*29));
+            //fy = 7.787f * yy + 16.0f / 116.0f; 7.787f = (29/3)^3/(29*4)
+            ify = cvRound(softfloat((int)LUT_BASE)*(softfloat(16)/softfloat(116) + softfloat(i*5)/softfloat(3*17*29)));
         }
-        static const softfloat invScale = softfloat::one()/softfloat((int)INV_GAMMA_TAB_SIZE);
-        for(i = 0; i < INV_GAMMA_TAB_SIZE; i++)
+        else
         {
-            softfloat x = invScale*softfloat(i);
-            sRGBInvGammaTab_b[i] = (ushort)(cvRound(f255*applyInvGamma(x)));
-            linearInvGammaTab_b[i] = (ushort)(cvTrunc(f255*x));
+            //fy = (li + 16.0f) / 116.0f;
+            softfloat fy = (softfloat(i*100*LUT_BASE)/softfloat(255*116) +
+                            softfloat(16*LUT_BASE)/softfloat(116));
+            ify = cvRound(fy);
+            //yy = fy * fy * fy;
+            y = cvRound(fy*fy*fy/softfloat(LUT_BASE*LUT_BASE));
         }
 
-        static const softfloat cbTabScale(softfloat::one()/(f255*(1 << gamma_shift)));
-        static const softfloat lshift2(1 << lab_shift2);
-        for(i = 0; i < LAB_CBRT_TAB_SIZE_B; i++)
-        {
-            softfloat x = cbTabScale*softfloat(i);
-            LabCbrtTab_b[i] = (ushort)(cvRound(lshift2 * (x < lthresh ? mulAdd(x, lscale, lbias) : cbrt(x))));
-        }
+        LabToYF_b[i*2  ] = (ushort)y;   // 0 <= y <= BASE
+        LabToYF_b[i*2+1] = (ushort)ify; // 2260 <= ify <= BASE
+    }
 
-        //Lookup table for L to y and ify calculations
-        for(i = 0; i < 256; i++)
-        {
-            int y, ify;
-            //8 * 255.0 / 100.0 == 20.4
-            if( i <= 20)
-            {
-                //yy = li / 903.3f;
-                //y = L*100/903.3f; 903.3f = (29/3)^3, 255 = 17*3*5
-                y = cvRound(softfloat(i*LUT_BASE*20*9)/softfloat(17*29*29*29));
-                //fy = 7.787f * yy + 16.0f / 116.0f; 7.787f = (29/3)^3/(29*4)
-                ify = cvRound(softfloat((int)LUT_BASE)*(softfloat(16)/softfloat(116) + softfloat(i*5)/softfloat(3*17*29)));
-            }
-            else
-            {
-                //fy = (li + 16.0f) / 116.0f;
-                softfloat fy = (softfloat(i*100*LUT_BASE)/softfloat(255*116) +
-                                softfloat(16*LUT_BASE)/softfloat(116));
-                ify = cvRound(fy);
-                //yy = fy * fy * fy;
-                y = cvRound(fy*fy*fy/softfloat(LUT_BASE*LUT_BASE));
-            }
+    //Lookup table for a,b to x,z conversion
+    abToXZ_b = initLUTforABXZ();
 
-            LabToYF_b[i*2  ] = (ushort)y;   // 0 <= y <= BASE
-            LabToYF_b[i*2+1] = (ushort)ify; // 2260 <= ify <= BASE
-        }
+    softfloat dd = D65[0] + D65[1]*softdouble(15) + D65[2]*softdouble(3);
+    dd = softfloat::one()/max(dd, softfloat::eps());
+    softfloat un = dd*softfloat(13*4)*D65[0];
+    softfloat vn = dd*softfloat(13*9)*D65[1];
 
-        //Lookup table for a,b to x,z conversion
-        abToXZ_b = initLUTforABXZ();
+    //Luv LUT
+    LUVLUT = initLUTforLUV(un, vn);
 
-        softfloat dd = D65[0] + D65[1]*softdouble(15) + D65[2]*softdouble(3);
-        dd = softfloat::one()/max(dd, softfloat::eps());
-        softfloat un = dd*softfloat(13*4)*D65[0];
-        softfloat vn = dd*softfloat(13*9)*D65[1];
+    //try to suppress warning
+    static const bool calcLUT = enableRGB2LabInterpolation || enableRGB2LuvInterpolation;
+    if(calcLUT)
+    {
 
-        //Luv LUT
-        LUVLUT = initLUTforLUV(un, vn);
+        LABLUVLUTs16 = initLUTforLABLUVs16(un, vn);
 
-        //try to suppress warning
-        static const bool calcLUT = enableRGB2LabInterpolation || enableRGB2LuvInterpolation;
-        if(calcLUT)
+        for(int16_t p = 0; p < TRILINEAR_BASE; p++)
         {
-
-            LABLUVLUTs16 = initLUTforLABLUVs16(un, vn);
-
-            for(int16_t p = 0; p < TRILINEAR_BASE; p++)
+            int16_t pp = TRILINEAR_BASE - p;
+            for(int16_t q = 0; q < TRILINEAR_BASE; q++)
             {
-                int16_t pp = TRILINEAR_BASE - p;
-                for(int16_t q = 0; q < TRILINEAR_BASE; q++)
+                int16_t qq = TRILINEAR_BASE - q;
+                for(int16_t r = 0; r < TRILINEAR_BASE; r++)
                 {
-                    int16_t qq = TRILINEAR_BASE - q;
-                    for(int16_t r = 0; r < TRILINEAR_BASE; r++)
-                    {
-                        int16_t rr = TRILINEAR_BASE - r;
-                        int16_t* w = &trilinearLUT[8*p + 8*TRILINEAR_BASE*q + 8*TRILINEAR_BASE*TRILINEAR_BASE*r];
-                        w[0]  = pp * qq * rr; w[1]  = pp * qq * r ; w[2]  = pp * q  * rr; w[3]  = pp * q  * r ;
-                        w[4]  = p  * qq * rr; w[5]  = p  * qq * r ; w[6]  = p  * q  * rr; w[7]  = p  * q  * r ;
-                    }
+                    int16_t rr = TRILINEAR_BASE - r;
+                    int16_t* w = &trilinearLUT[8*p + 8*TRILINEAR_BASE*q + 8*TRILINEAR_BASE*TRILINEAR_BASE*r];
+                    w[0]  = pp * qq * rr; w[1]  = pp * qq * r ; w[2]  = pp * q  * rr; w[3]  = pp * q  * r ;
+                    w[4]  = p  * qq * rr; w[5]  = p  * qq * r ; w[6]  = p  * q  * rr; w[7]  = p  * q  * r ;
                 }
             }
         }
-
-        initialized = true;
     }
+    return true;
+}
+
+static bool initLabTabs()
+{
+    static bool initialized = createLabTabs();
+    return initialized;
 }
 
 
@@ -1371,16 +1389,16 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
                                               v_uint16x8& outA, v_uint16x8& outB, v_uint16x8& outC)
 {
     //LUT idx of origin pt of cube
-    v_uint16x8 idxsX = inX >> (lab_base_shift - lab_lut_shift);
-    v_uint16x8 idxsY = inY >> (lab_base_shift - lab_lut_shift);
-    v_uint16x8 idxsZ = inZ >> (lab_base_shift - lab_lut_shift);
+    v_uint16x8 idxsX = v_shr<lab_base_shift - lab_lut_shift>(inX);
+    v_uint16x8 idxsY = v_shr<lab_base_shift - lab_lut_shift>(inY);
+    v_uint16x8 idxsZ = v_shr<lab_base_shift - lab_lut_shift>(inZ);
 
     //x, y, z are [0; TRILINEAR_BASE)
     const uint16_t bitMask = (1 << trilinear_shift) - 1;
     v_uint16x8 bitMaskReg = v_setall_u16(bitMask);
-    v_uint16x8 fracX = (inX >> (lab_base_shift - 8 - 1)) & bitMaskReg;
-    v_uint16x8 fracY = (inY >> (lab_base_shift - 8 - 1)) & bitMaskReg;
-    v_uint16x8 fracZ = (inZ >> (lab_base_shift - 8 - 1)) & bitMaskReg;
+    v_uint16x8 fracX = v_and(v_shr<lab_base_shift - 8 - 1>(inX), bitMaskReg);
+    v_uint16x8 fracY = v_and(v_shr<lab_base_shift - 8 - 1>(inY), bitMaskReg);
+    v_uint16x8 fracZ = v_and(v_shr<lab_base_shift - 8 - 1>(inZ), bitMaskReg);
 
     //load values to interpolate for pix0, pix1, .., pix7
     v_int16x8 a0, a1, a2, a3, a4, a5, a6, a7;
@@ -1390,9 +1408,9 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
     v_uint32x4 addrDw0, addrDw1, addrDw10, addrDw11;
     v_mul_expand(v_setall_u16(3*8), idxsX, addrDw0, addrDw1);
     v_mul_expand(v_setall_u16(3*8*LAB_LUT_DIM), idxsY, addrDw10, addrDw11);
-    addrDw0 += addrDw10; addrDw1 += addrDw11;
+    addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11);
     v_mul_expand(v_setall_u16(3*8*LAB_LUT_DIM*LAB_LUT_DIM), idxsZ, addrDw10, addrDw11);
-    addrDw0 += addrDw10; addrDw1 += addrDw11;
+    addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11);
 
     uint32_t CV_DECL_ALIGNED(16) addrofs[8];
     v_store_aligned(addrofs, addrDw0);
@@ -1414,9 +1432,9 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
     v_int16x8 w0, w1, w2, w3, w4, w5, w6, w7;
     v_mul_expand(v_setall_u16(8), fracX, addrDw0, addrDw1);
     v_mul_expand(v_setall_u16(8*TRILINEAR_BASE), fracY, addrDw10, addrDw11);
-    addrDw0 += addrDw10; addrDw1 += addrDw11;
+    addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11);
     v_mul_expand(v_setall_u16(8*TRILINEAR_BASE*TRILINEAR_BASE), fracZ, addrDw10, addrDw11);
-    addrDw0 += addrDw10; addrDw1 += addrDw11;
+    addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11);
 
     v_store_aligned(addrofs, addrDw0);
     v_store_aligned(addrofs + 4, addrDw1);
@@ -1452,19 +1470,20 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin
 #undef DOT_SHIFT_PACK
 }
 
-#elif CV_SIMD
+#elif CV_SIMD // Fixed size v_int16x8 used below, CV_SIMD_SCALABLE is disabled.
 
 // inValues are in [0; LAB_BASE]
 static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint16& inY, const v_uint16& inZ,
                                               const int16_t* LUT,
                                               v_uint16& outA, v_uint16& outB, v_uint16& outC)
 {
-    const int vsize = v_uint16::nlanes;
+    const int vsize = VTraits<v_uint16>::vlanes();
+    const int vsize_max = VTraits<v_uint16>::max_nlanes;
 
     // LUT idx of origin pt of cube
-    v_uint16 tx = inX >> (lab_base_shift - lab_lut_shift);
-    v_uint16 ty = inY >> (lab_base_shift - lab_lut_shift);
-    v_uint16 tz = inZ >> (lab_base_shift - lab_lut_shift);
+    v_uint16 tx = v_shr<lab_base_shift - lab_lut_shift>(inX);
+    v_uint16 ty = v_shr<lab_base_shift - lab_lut_shift>(inY);
+    v_uint16 tz = v_shr<lab_base_shift - lab_lut_shift>(inZ);
 
     v_uint32 btmp00, btmp01, btmp10, btmp11, btmp20, btmp21;
     v_uint32 baseIdx0, baseIdx1;
@@ -1472,19 +1491,19 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
     v_mul_expand(tx, vx_setall_u16(3*8), btmp00, btmp01);
     v_mul_expand(ty, vx_setall_u16(3*8*LAB_LUT_DIM), btmp10, btmp11);
     v_mul_expand(tz, vx_setall_u16(3*8*LAB_LUT_DIM*LAB_LUT_DIM), btmp20, btmp21);
-    baseIdx0 = btmp00 + btmp10 + btmp20;
-    baseIdx1 = btmp01 + btmp11 + btmp21;
+    baseIdx0 = v_add(v_add(btmp00, btmp10), btmp20);
+    baseIdx1 = v_add(v_add(btmp01, btmp11), btmp21);
 
-    uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize];
+    uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize_max];
     v_store_aligned(vbaseIdx + 0*vsize/2, baseIdx0);
     v_store_aligned(vbaseIdx + 1*vsize/2, baseIdx1);
 
     // fracX, fracY, fracZ are [0; TRILINEAR_BASE)
     const uint16_t bitMask = (1 << trilinear_shift) - 1;
     v_uint16 bitMaskReg = vx_setall_u16(bitMask);
-    v_uint16 fracX = (inX >> (lab_base_shift - 8 - 1)) & bitMaskReg;
-    v_uint16 fracY = (inY >> (lab_base_shift - 8 - 1)) & bitMaskReg;
-    v_uint16 fracZ = (inZ >> (lab_base_shift - 8 - 1)) & bitMaskReg;
+    v_uint16 fracX = v_and(v_shr<lab_base_shift - 8 - 1>(inX), bitMaskReg);
+    v_uint16 fracY = v_and(v_shr<lab_base_shift - 8 - 1>(inY), bitMaskReg);
+    v_uint16 fracZ = v_and(v_shr<lab_base_shift - 8 - 1>(inZ), bitMaskReg);
 
     // trilinearIdx = 8*x + 8*TRILINEAR_BASE*y + 8*TRILINEAR_BASE*TRILINEAR_BASE*z
     v_uint32 trilinearIdx0, trilinearIdx1;
@@ -1493,16 +1512,16 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
     v_expand(fracY, fracY0, fracY1);
     v_expand(fracZ, fracZ0, fracZ1);
 
-    trilinearIdx0 = (fracX0 << 3) + (fracY0 << (3+trilinear_shift)) + (fracZ0 << (3+trilinear_shift*2));
-    trilinearIdx1 = (fracX1 << 3) + (fracY1 << (3+trilinear_shift)) + (fracZ1 << (3+trilinear_shift*2));
+    trilinearIdx0 = v_add(v_add(v_shl<3>(fracX0), v_shl<3 + trilinear_shift>(fracY0)), v_shl<3 + trilinear_shift * 2>(fracZ0));
+    trilinearIdx1 = v_add(v_add(v_shl<3>(fracX1), v_shl<3 + trilinear_shift>(fracY1)), v_shl<3 + trilinear_shift * 2>(fracZ1));
 
-    uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize];
+    uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize_max];
     v_store_aligned(vtrilinearIdx + 0*vsize/2, trilinearIdx0);
     v_store_aligned(vtrilinearIdx + 1*vsize/2, trilinearIdx1);
 
     v_uint32 a0, a1, b0, b1, c0, c1;
 
-    uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) va[vsize], vb[vsize], vc[vsize];
+    uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) va[vsize_max], vb[vsize_max], vc[vsize_max];
     for(int j = 0; j < vsize; j++)
     {
         const int16_t* baseLUT = LUT + vbaseIdx[j];
@@ -1528,12 +1547,12 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
 
     // CV_DESCALE
     const v_uint32 descaleShift = vx_setall_u32(1 << (trilinear_shift*3 - 1));
-    a0 = (a0 + descaleShift) >> (trilinear_shift*3);
-    a1 = (a1 + descaleShift) >> (trilinear_shift*3);
-    b0 = (b0 + descaleShift) >> (trilinear_shift*3);
-    b1 = (b1 + descaleShift) >> (trilinear_shift*3);
-    c0 = (c0 + descaleShift) >> (trilinear_shift*3);
-    c1 = (c1 + descaleShift) >> (trilinear_shift*3);
+    a0 = v_shr<trilinear_shift * 3>(v_add(a0, descaleShift));
+    a1 = v_shr<trilinear_shift * 3>(v_add(a1, descaleShift));
+    b0 = v_shr<trilinear_shift * 3>(v_add(b0, descaleShift));
+    b1 = v_shr<trilinear_shift * 3>(v_add(b1, descaleShift));
+    c0 = v_shr<trilinear_shift * 3>(v_add(c0, descaleShift));
+    c1 = v_shr<trilinear_shift * 3>(v_add(c1, descaleShift));
 
     outA = v_pack(a0, a1); outB = v_pack(b0, b1); outC = v_pack(c0, c1);
 }
@@ -1632,11 +1651,11 @@ struct RGB2Lab_b
         vL = v_shr<lab_shift2>(vL);
 
         /* int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );*/
-        va = v_fma(vfX - vfY, v_setall_s32(500), v_setall_s32(abShift+labDescaleShift));
+        va = v_fma(v_sub(vfX, vfY), v_setall_s32(500), v_setall_s32(abShift+labDescaleShift));
         va = v_shr<lab_shift2>(va);
 
         /* int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );*/
-        vb = v_fma(vfY - vfZ, v_setall_s32(200), v_setall_s32(abShift+labDescaleShift));
+        vb = v_fma(v_sub(vfY, vfZ), v_setall_s32(200), v_setall_s32(abShift+labDescaleShift));
         vb = v_shr<lab_shift2>(vb);
     }
 #endif // CV_NEON
@@ -1658,8 +1677,8 @@ struct RGB2Lab_b
 #if CV_NEON
         // On each loop, we load nlanes of RGB/A v_uint8s and store nlanes of
         // Lab v_uint8s
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes,
-                src += scn*v_uint8::nlanes, dst += 3*v_uint8::nlanes )
+        for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(),
+                src += scn*VTraits<v_uint8>::vlanes(), dst += 3*VTraits<v_uint8>::vlanes() )
         {
             // Load 4 batches of 4 src
             v_uint8 vRi, vGi, vBi;
@@ -1695,7 +1714,7 @@ struct RGB2Lab_b
 #endif // CV_NEON
 
 #if CV_SIMD
-        const int vsize = v_uint8::nlanes;
+        const int vsize = VTraits<v_uint8>::vlanes();
         const int xyzDescaleShift = 1 << (lab_shift - 1);
         v_int16 vXYZdescale = vx_setall_s16(xyzDescaleShift);
         v_int16 cxrg, cxb1, cyrg, cyb1, czrg, czb1;
@@ -1735,7 +1754,7 @@ struct RGB2Lab_b
                 v_expand(drgb[k], qrgb[k*2+0], qrgb[k*2+1]);
             }
 
-            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vdrgb[vsize*3];
+            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vdrgb[VTraits<v_uint8>::max_nlanes*3];
             for(int k = 0; k < 12; k++)
             {
                 v_store_aligned(vdrgb + k*vsize/4, qrgb[k]);
@@ -1767,14 +1786,14 @@ struct RGB2Lab_b
             v_uint32 x[4], y[4], z[4];
             for(int j = 0; j < 4; j++)
             {
-                x[j] = v_reinterpret_as_u32(v_dotprod(rg[j], cxrg) + v_dotprod(bd[j], cxb1)) >> lab_shift;
-                y[j] = v_reinterpret_as_u32(v_dotprod(rg[j], cyrg) + v_dotprod(bd[j], cyb1)) >> lab_shift;
-                z[j] = v_reinterpret_as_u32(v_dotprod(rg[j], czrg) + v_dotprod(bd[j], czb1)) >> lab_shift;
+                x[j] = v_shr<xyz_shift>(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], cxrg), v_dotprod(bd[j], cxb1))));
+                y[j] = v_shr<xyz_shift>(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], cyrg), v_dotprod(bd[j], cyb1))));
+                z[j] = v_shr<xyz_shift>(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], czrg), v_dotprod(bd[j], czb1))));
             }
 
             // [fX, fY, fZ] = LabCbrtTab_b[vx, vy, vz]
             // [4 per X, 4 per Y, 4 per Z]
-            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vxyz[vsize*3];
+            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vxyz[VTraits<v_uint8>::max_nlanes*3];
             for(int j = 0; j < 4; j++)
             {
                 v_store_aligned(vxyz + (0*4+j)*vsize/4, x[j]);
@@ -1805,7 +1824,7 @@ struct RGB2Lab_b
             v_uint32 vLshift = vx_setall_u32((uint32_t)(Lshift + labDescaleShift));
             for(int k = 0; k < 4; k++)
             {
-                vL[k] = (vL[k] + vLshift) >> lab_shift2;
+                vL[k] = v_shr<lab_shift2>(v_add(vL[k], vLshift));
             }
             v_uint16 L0, L1;
             L0 = v_pack(vL[0], vL[1]);
@@ -1829,7 +1848,7 @@ struct RGB2Lab_b
             v_int32 abShift = vx_setall_s32(128*(1 << lab_shift2) + labDescaleShift);
             for(int k = 0; k < 8; k++)
             {
-                ab[k] = (ab[k] + abShift) >> lab_shift2;
+                ab[k] = v_shr<lab_shift2>(v_add(ab[k], abShift));
             }
             v_int16 a0, a1, b0, b1;
             a0 = v_pack(ab[0], ab[1]); a1 = v_pack(ab[2], ab[3]);
@@ -1924,7 +1943,7 @@ struct RGB2Lab_f
 #if CV_SIMD
             if(enablePackedLab)
             {
-                const int vsize = v_float32::nlanes;
+                const int vsize = VTraits<v_float32>::vlanes();
                 static const int nPixels = vsize*2;
                 for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
                 {
@@ -1956,8 +1975,8 @@ struct RGB2Lab_f
                     #undef clipv
                     /* int iR = R*LAB_BASE, iG = G*LAB_BASE, iB = B*LAB_BASE, iL, ia, ib; */
                     v_float32 basef = vx_setall_f32(LAB_BASE);
-                    rvec0 *= basef, gvec0 *= basef, bvec0 *= basef;
-                    rvec1 *= basef, gvec1 *= basef, bvec1 *= basef;
+                    rvec0 = v_mul(rvec0, basef), gvec0 = v_mul(gvec0, basef), bvec0 = v_mul(bvec0, basef);
+                    rvec1 = v_mul(rvec1, basef), gvec1 = v_mul(gvec1, basef), bvec1 = v_mul(bvec1, basef);
 
                     v_int32 irvec0, igvec0, ibvec0, irvec1, igvec1, ibvec1;
                     irvec0 = v_round(rvec0); irvec1 = v_round(rvec1);
@@ -1987,8 +2006,8 @@ struct RGB2Lab_f
 
                     /* dst[i] = L*100.0f */
                     v_float32 v100dBase = vx_setall_f32(100.0f/LAB_BASE);
-                    l_vec0 = l_vec0*v100dBase;
-                    l_vec1 = l_vec1*v100dBase;
+                    l_vec0 = v_mul(l_vec0, v100dBase);
+                    l_vec1 = v_mul(l_vec1, v100dBase);
                     /*
                     dst[i + 1] = a*256.0f - 128.0f;
                     dst[i + 2] = b*256.0f - 128.0f;
@@ -2026,8 +2045,8 @@ struct RGB2Lab_f
             static const float _a = (softfloat(16) / softfloat(116));
             int i = 0;
 #if CV_SIMD
-            const int vsize = v_float32::nlanes;
-            const int nrepeats = vsize == 4 ? 2 : 1;
+            const int vsize = VTraits<v_float32>::vlanes();
+            const int nrepeats = VTraits<v_float32>::nlanes == 4 ? 2 : 1;
             v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
             v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5);
             v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
@@ -2063,9 +2082,9 @@ struct RGB2Lab_f
                     v_float32 vgscale = vx_setall_f32(gscale);
                     for (int k = 0; k < nrepeats; k++)
                     {
-                        R[k] = splineInterpolate(R[k]*vgscale, gammaTab, GAMMA_TAB_SIZE);
-                        G[k] = splineInterpolate(G[k]*vgscale, gammaTab, GAMMA_TAB_SIZE);
-                        B[k] = splineInterpolate(B[k]*vgscale, gammaTab, GAMMA_TAB_SIZE);
+                        R[k] = splineInterpolate(v_mul(R[k], vgscale), gammaTab, GAMMA_TAB_SIZE);
+                        G[k] = splineInterpolate(v_mul(G[k], vgscale), gammaTab, GAMMA_TAB_SIZE);
+                        B[k] = splineInterpolate(v_mul(B[k], vgscale), gammaTab, GAMMA_TAB_SIZE);
                     }
                 }
 
@@ -2073,26 +2092,26 @@ struct RGB2Lab_f
                 v_float32 FX[nrepeats], FY[nrepeats], FZ[nrepeats];
                 for (int k = 0; k < nrepeats; k++)
                 {
-                    X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, B[k]*vc2));
-                    Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, B[k]*vc5));
-                    Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, B[k]*vc8));
+                    X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, v_mul(B[k], vc2)));
+                    Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, v_mul(B[k], vc5)));
+                    Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, v_mul(B[k], vc8)));
 
                     // use spline interpolation instead of direct calculation
                     v_float32 vTabScale = vx_setall_f32(LabCbrtTabScale);
-                    FX[k] = splineInterpolate(X[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
-                    FY[k] = splineInterpolate(Y[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
-                    FZ[k] = splineInterpolate(Z[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
+                    FX[k] = splineInterpolate(v_mul(X[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
+                    FY[k] = splineInterpolate(v_mul(Y[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
+                    FZ[k] = splineInterpolate(v_mul(Z[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
                 }
 
                 v_float32 L[nrepeats], a[nrepeats], b[nrepeats];
                 for (int k = 0; k < nrepeats; k++)
                 {
                     // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
-                    v_float32 mask = Y[k] > (vx_setall_f32(0.008856f));
+                    v_float32 mask = v_gt(Y[k], (vx_setall_f32(0.008856f)));
                     v_float32 v116 = vx_setall_f32(116.f), vm16 = vx_setall_f32(-16.f);
-                    L[k] = v_select(mask, v_fma(v116, FY[k], vm16), vx_setall_f32(903.3f)*Y[k]);
-                    a[k] = vx_setall_f32(500.f) * (FX[k] - FY[k]);
-                    b[k] = vx_setall_f32(200.f) * (FY[k] - FZ[k]);
+                    L[k] = v_select(mask, v_fma(v116, FY[k], vm16), v_mul(vx_setall_f32(903.3f),Y[k]));
+                    a[k] = v_mul(vx_setall_f32(500.F), v_sub(FX[k], FY[k]));
+                    b[k] = v_mul(vx_setall_f32(200.F), v_sub(FY[k], FZ[k]));
 
                     v_store_interleave(dst + k*3*vsize, L[k], a[k], b[k]);
                 }
@@ -2187,7 +2206,7 @@ struct Lab2RGBfloat
         float alpha = ColorChannel<float>::max();
 
 #if CV_SIMD
-        const int vsize = v_float32::nlanes;
+        const int vsize = VTraits<v_float32>::vlanes();
         const int nrepeats = 2;
         v_float32 v16_116 = vx_setall_f32(16.0f / 116.0f);
         for( ; i <= n-vsize*nrepeats;
@@ -2204,14 +2223,14 @@ struct Lab2RGBfloat
             v_float32 vlThresh = vx_setall_f32(lThresh);
             for(int k = 0; k < nrepeats; k++)
             {
-                limask[k] = li[k] <= vlThresh;
+                limask[k] = v_le(li[k], vlThresh);
             }
             v_float32 ylo[nrepeats], yhi[nrepeats], fylo[nrepeats], fyhi[nrepeats];
             // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
             v_float32 vinv903 = vx_setall_f32(1.f/903.3f);
             for(int k = 0; k < nrepeats; k++)
             {
-                ylo[k] = li[k] * vinv903;
+                ylo[k] = v_mul(li[k], vinv903);
             }
             v_float32 v7787 = vx_setall_f32(7.787f);
             for(int k = 0; k < nrepeats; k++)
@@ -2221,11 +2240,11 @@ struct Lab2RGBfloat
             v_float32 v16 = vx_setall_f32(16.0f), vinv116 = vx_setall_f32(1.f/116.0f);
             for(int k = 0; k < nrepeats; k++)
             {
-                fyhi[k] = (li[k] + v16) * vinv116;
+                fyhi[k] = v_mul(v_add(li[k], v16), vinv116);
             }
             for(int k = 0; k < nrepeats; k++)
             {
-                yhi[k] = fyhi[k] * fyhi[k] * fyhi[k];
+                yhi[k] = v_mul(fyhi[k], fyhi[k], fyhi[k]);
             }
             for(int k = 0; k < nrepeats; k++)
             {
@@ -2248,9 +2267,9 @@ struct Lab2RGBfloat
                 for (int j = 0; j < 2; j++)
                 {
                     v_float32 f = fxz[k*2+j];
-                    v_float32 fmask = f <= vfTresh;
-                    v_float32 flo = (f - v16_116) * vinv7787;
-                    v_float32 fhi = f*f*f;
+                    v_float32 fmask = v_le(f, vfTresh);
+                    v_float32 flo = v_mul(v_sub(f, v16_116), vinv7787);
+                    v_float32 fhi = v_mul(v_mul(f, f), f);
                     fxz[k*2+j] = v_select(fmask, flo, fhi);
                 }
             }
@@ -2264,9 +2283,9 @@ struct Lab2RGBfloat
             v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
             for(int k = 0; k < nrepeats; k++)
             {
-                ro[k] = v_fma(vc0, x[k], v_fma(vc1, y[k], vc2 * z[k]));
-                go[k] = v_fma(vc3, x[k], v_fma(vc4, y[k], vc5 * z[k]));
-                bo[k] = v_fma(vc6, x[k], v_fma(vc7, y[k], vc8 * z[k]));
+                ro[k] = v_fma(vc0, x[k], v_fma(vc1, y[k], v_mul(vc2, z[k])));
+                go[k] = v_fma(vc3, x[k], v_fma(vc4, y[k], v_mul(vc5, z[k])));
+                bo[k] = v_fma(vc6, x[k], v_fma(vc7, y[k], v_mul(vc8, z[k])));
             }
             v_float32 one = vx_setall_f32(1.f), zero = vx_setzero_f32();
             for(int k = 0; k < nrepeats; k++)
@@ -2281,9 +2300,9 @@ struct Lab2RGBfloat
                 v_float32 vgscale = vx_setall_f32(gscale);
                 for(int k = 0; k < nrepeats; k++)
                 {
-                    ro[k] *= vgscale;
-                    go[k] *= vgscale;
-                    bo[k] *= vgscale;
+                    ro[k] = v_mul(ro[k], vgscale);
+                    go[k] = v_mul(go[k], vgscale);
+                    bo[k] = v_mul(bo[k], vgscale);
                 }
 
                 for(int k = 0; k < nrepeats; k++)
@@ -2483,8 +2502,8 @@ struct Lab2RGBinteger
         for(int k = 0; k < 4; k++)
         {
             yf[k] = v_lut((const int*)LabToYF_b, lq[k]);
-            y[k]   = yf[k] & mask16;
-            ify[k] = v_reinterpret_as_s32(v_reinterpret_as_u32(yf[k]) >> 16);
+            y[k]   = v_and(yf[k], mask16);
+            ify[k] = v_reinterpret_as_s32(v_shr(v_reinterpret_as_u32(yf[k]), 16));
         }
 
         v_int16 ify0, ify1;
@@ -2499,18 +2518,18 @@ struct Lab2RGBinteger
         v_uint16 mulA = vx_setall_u16(53687);
         v_uint32 ma[4];
         v_uint32 addA = vx_setall_u32(1 << 7);
-        v_mul_expand((a0 + (a0 << 2)), mulA, ma[0], ma[1]);
-        v_mul_expand((a1 + (a1 << 2)), mulA, ma[2], ma[3]);
-        adiv0 = v_reinterpret_as_s16(v_pack(((ma[0] + addA) >> 13), ((ma[1] + addA) >> 13)));
-        adiv1 = v_reinterpret_as_s16(v_pack(((ma[2] + addA) >> 13), ((ma[3] + addA) >> 13)));
+        v_mul_expand((v_add(a0, v_shl<2>(a0))), mulA, ma[0], ma[1]);
+        v_mul_expand((v_add(a1, v_shl<2>(a1))), mulA, ma[2], ma[3]);
+        adiv0 = v_reinterpret_as_s16(v_pack((v_shr<13>(v_add(ma[0], addA))), (v_shr<13>(v_add(ma[1], addA)))));
+        adiv1 = v_reinterpret_as_s16(v_pack((v_shr<13>(v_add(ma[2], addA))), (v_shr<13>(v_add(ma[3], addA)))));
 
         v_uint16 mulB = vx_setall_u16(41943);
         v_uint32 mb[4];
         v_uint32 addB = vx_setall_u32(1 << 4);
         v_mul_expand(b0, mulB, mb[0], mb[1]);
         v_mul_expand(b1, mulB, mb[2], mb[3]);
-        bdiv0 = v_reinterpret_as_s16(v_pack((mb[0] + addB) >> 9, (mb[1] + addB) >> 9));
-        bdiv1 = v_reinterpret_as_s16(v_pack((mb[2] + addB) >> 9, (mb[3] + addB) >> 9));
+        bdiv0 = v_reinterpret_as_s16(v_pack(v_shr<9>(v_add(mb[0], addB)), v_shr<9>(v_add(mb[1], addB))));
+        bdiv1 = v_reinterpret_as_s16(v_pack(v_shr<9>(v_add(mb[2], addB)), v_shr<9>(v_add(mb[3], addB))));
 
         // 0 <= adiv <= 8356, 0 <= bdiv <= 20890
         /* x = ifxz[0]; y = y; z = ifxz[1]; */
@@ -2553,7 +2572,7 @@ struct Lab2RGBinteger
         {
             bool srgb = issRGB;
             ushort* tab = sRGBInvGammaTab_b;
-            const int vsize = v_uint8::nlanes;
+            const int vsize = VTraits<v_uint8>::vlanes();
             v_uint8 valpha = vx_setall_u8(alpha);
             v_int32 vc[9];
             for(int k = 0; k < 9; k++)
@@ -2575,9 +2594,9 @@ struct Lab2RGBinteger
                 v_int32 rq[4], gq[4], bq[4];
                 for(int k = 0; k < 4; k++)
                 {
-                    rq[k] = (vc[0] * xq[k] + vc[1] * yq[k] + vc[2] * zq[k] + vdescale) >> shift;
-                    gq[k] = (vc[3] * xq[k] + vc[4] * yq[k] + vc[5] * zq[k] + vdescale) >> shift;
-                    bq[k] = (vc[6] * xq[k] + vc[7] * yq[k] + vc[8] * zq[k] + vdescale) >> shift;
+                    rq[k] = v_shr<shift>(v_add(v_add(v_add(v_mul(vc[0], xq[k]), v_mul(vc[1], yq[k])), v_mul(vc[2], zq[k])), vdescale));
+                    gq[k] = v_shr<shift>(v_add(v_add(v_add(v_mul(vc[3], xq[k]), v_mul(vc[4], yq[k])), v_mul(vc[5], zq[k])), vdescale));
+                    bq[k] = v_shr<shift>(v_add(v_add(v_add(v_mul(vc[6], xq[k]), v_mul(vc[7], yq[k])), v_mul(vc[8], zq[k])), vdescale));
                 }
 
                 //limit indices in table and then substitute
@@ -2594,7 +2613,7 @@ struct Lab2RGBinteger
                 if(srgb)
                 {
                     // [RRR... , GGG... , BBB...]
-                    int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vidx[vsize*3];
+                    int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vidx[VTraits<v_uint8>::max_nlanes*3];
                     for (int k = 0; k < 4; k++)
                         v_store_aligned(vidx + 0*vsize + k*vsize/4, rq[k]);
                     for (int k = 0; k < 4; k++)
@@ -2614,9 +2633,9 @@ struct Lab2RGBinteger
                     // rgb = (rgb*255) >> inv_gamma_shift
                     for(int k = 0; k < 4; k++)
                     {
-                        rq[k] = ((rq[k] << 8) - rq[k]) >> inv_gamma_shift;
-                        gq[k] = ((gq[k] << 8) - gq[k]) >> inv_gamma_shift;
-                        bq[k] = ((bq[k] << 8) - bq[k]) >> inv_gamma_shift;
+                        rq[k] = v_shr((v_sub(v_shl(rq[k], 8), rq[k])), inv_gamma_shift);
+                        gq[k] = v_shr((v_sub(v_shl(gq[k], 8), gq[k])), inv_gamma_shift);
+                        bq[k] = v_shr((v_sub(v_shl(bq[k], 8), bq[k])), inv_gamma_shift);
                     }
                     rgb[0] = v_reinterpret_as_u16(v_pack(rq[0], rq[1]));
                     rgb[1] = v_reinterpret_as_u16(v_pack(rq[2], rq[3]));
@@ -2713,13 +2732,13 @@ struct Lab2RGB_b
         static const softfloat fl = softfloat(100)/f255;
 
 #if CV_SIMD
-        const int fsize = v_float32::nlanes;
+        const int fsize = VTraits<v_float32>::vlanes();
         v_float32 vl = vx_setall_f32((float)fl);
         v_float32 va = vx_setall_f32(1.f);
         v_float32 vb = vx_setall_f32(1.f);
         v_float32 vaLow = vx_setall_f32(-128.f), vbLow = vx_setall_f32(-128.f);
         //TODO: fix that when v_interleave is available
-        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3], interTmpA[VTraits<v_float32>::max_nlanes*3];
         v_store_interleave(interTmpM, vl, va, vb);
         v_store_interleave(interTmpA, vx_setzero_f32(), vaLow, vbLow);
         v_float32 mluv[3], aluv[3];
@@ -2737,7 +2756,7 @@ struct Lab2RGB_b
             j = 0;
 
 #if CV_SIMD
-            const int vsize = v_uint8::nlanes;
+            const int vsize = VTraits<v_uint8>::vlanes();
             for( ; j <= (dn - vsize)*3; j += 3*vsize )
             {
                 v_uint8 s0, s1, s2;
@@ -2791,7 +2810,7 @@ struct Lab2RGB_b
                     v_int32 vi[4*3];
                     for(int k = 0; k < 4*3; k++)
                     {
-                        vi[k] = v_round(vf[k]*v255);
+                        vi[k] = v_round(v_mul(vf[k], v255));
                     }
 
                     v_uint8 rgb[3];
@@ -2813,7 +2832,7 @@ struct Lab2RGB_b
                     for(int k = 0; k < 4; k++)
                     {
                         vf[k] = vx_load_aligned(buf + j + k*fsize);
-                        vi[k] = v_round(vf[k]*v255);
+                        vi[k] = v_round(v_mul(vf[k], v255));
                     }
                     v_store(dst, v_pack_u(v_pack(vi[0], vi[1]), v_pack(vi[2], vi[3])));
                 }
@@ -2893,8 +2912,8 @@ struct RGB2Luvfloat
               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
 
 #if CV_SIMD
-        const int vsize = v_float32::nlanes;
-        const int nrepeats = vsize == 4 ? 2 : 1;
+        const int vsize = VTraits<v_float32>::vlanes();
+        const int nrepeats = VTraits<v_float32>::nlanes == 4 ? 2 : 1;
         for( ; i <= n-vsize*nrepeats;
              i+= vsize*nrepeats, src += scn*vsize*nrepeats, dst += 3*vsize*nrepeats)
         {
@@ -2927,9 +2946,9 @@ struct RGB2Luvfloat
                 v_float32 vgscale = vx_setall_f32(gscale);
                 for (int k = 0; k < nrepeats; k++)
                 {
-                    R[k] *= vgscale;
-                    G[k] *= vgscale;
-                    B[k] *= vgscale;
+                    R[k] = v_mul(R[k], vgscale);
+                    G[k] = v_mul(G[k], vgscale);
+                    B[k] = v_mul(B[k], vgscale);
                 }
 
                 for (int k = 0; k < nrepeats; k++)
@@ -2946,27 +2965,27 @@ struct RGB2Luvfloat
             v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8);
             for (int k = 0; k < nrepeats; k++)
             {
-                X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, B[k]*vc2));
-                Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, B[k]*vc5));
-                Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, B[k]*vc8));
+                X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, v_mul(B[k], vc2)));
+                Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, v_mul(B[k], vc5)));
+                Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, v_mul(B[k], vc8)));
             }
 
             v_float32 L[nrepeats], u[nrepeats], v[nrepeats];
             v_float32 vmun = vx_setall_f32(-un), vmvn = vx_setall_f32(-vn);
             for (int k = 0; k < nrepeats; k++)
             {
-                L[k] = splineInterpolate(Y[k]*vx_setall_f32(LabCbrtTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE);
+                L[k] = splineInterpolate(v_mul(Y[k], vx_setall_f32(LabCbrtTabScale)), LabCbrtTab, LAB_CBRT_TAB_SIZE);
                 // L = 116.f*L - 16.f;
                 L[k] = v_fma(L[k], vx_setall_f32(116.f), vx_setall_f32(-16.f));
 
                 v_float32 d;
                 // d = (4*13) / max(X + 15 * Y + 3 * Z, FLT_EPSILON)
                 d = v_fma(Y[k], vx_setall_f32(15.f), v_fma(Z[k], vx_setall_f32(3.f), X[k]));
-                d = vx_setall_f32(4.f*13.f) / v_max(d, vx_setall_f32(FLT_EPSILON));
+                d = v_div(vx_setall_f32(4.F * 13.F), v_max(d, vx_setall_f32(FLT_EPSILON)));
                 // u = L*(X*d - un)
-                u[k] = L[k]*v_fma(X[k], d, vmun);
+                u[k] = v_mul(L[k], v_fma(X[k], d, vmun));
                 // v = L*((9*0.25f)*Y*d - vn);
-                v[k] = L[k]*v_fma(vx_setall_f32(9.f*0.25f)*Y[k], d, vmvn);
+                v[k] = v_mul(L[k], v_fma(v_mul(vx_setall_f32(9.F * 0.25F), Y[k]), d, vmvn));
             }
 
             for (int k = 0; k < nrepeats; k++)
@@ -3082,8 +3101,8 @@ struct Luv2RGBfloat
         float _un = un, _vn = vn;
 
 #if CV_SIMD
-        const int vsize = v_float32::nlanes;
-        const int nrepeats = vsize == 4 ? 2 : 1;
+        const int vsize = VTraits<v_float32>::vlanes();
+        const int nrepeats = VTraits<v_float32>::nlanes == 4 ? 2 : 1;
         for( ; i <= n - vsize*nrepeats;
              i += vsize*nrepeats, src += vsize*3*nrepeats, dst += dcn*vsize*nrepeats)
         {
@@ -3103,13 +3122,13 @@ struct Luv2RGBfloat
                 v_float32 Ylo, Yhi;
 
                 // ((L + 16)/116)^3
-                Ylo = (L[k] + v16) * v116inv;
-                Ylo = Ylo*Ylo*Ylo;
+                Ylo = v_mul(v_add(L[k], v16), v116inv);
+                Ylo = v_mul(v_mul(Ylo, Ylo), Ylo);
                 // L*(3./29.)^3
-                Yhi = L[k] * v903inv;
+                Yhi = v_mul(L[k], v903inv);
 
                 // Y = (L <= 8) ? Y0 : Y1;
-                Y[k] = v_select(L[k] >= vx_setall_f32(8.f), Ylo, Yhi);
+                Y[k] = v_select(v_ge(L[k], vx_setall_f32(8.f)), Ylo, Yhi);
             }
 
             v_float32 v4inv = vx_setall_f32(0.25f), v3 = vx_setall_f32(3.f);
@@ -3118,18 +3137,18 @@ struct Luv2RGBfloat
                 v_float32 up, vp;
 
                 // up = 3*(u + L*_un);
-                up = v3*(v_fma(L[k], vx_setall_f32(_un), u[k]));
+                up = v_mul(v3, v_fma(L[k], vx_setall_f32(_un), u[k]));
                 // vp = 0.25/(v + L*_vn);
-                vp = v4inv/(v_fma(L[k], vx_setall_f32(_vn), v[k]));
+                vp = v_div(v4inv, v_fma(L[k], vx_setall_f32(_vn), v[k]));
 
                 // vp = max(-0.25, min(0.25, vp));
                 vp = v_max(vx_setall_f32(-0.25f), v_min(v4inv, vp));
 
                 //X = 3*up*vp; // (*Y) is done later
-                X[k] = v3*up*vp;
+                X[k] = v_mul(v_mul(v3, up), vp);
                 //Z = ((12*13*L - up)*vp - 5); // (*Y) is done later
                 // xor flips the sign, works like unary minus
-                Z[k] = v_fma(v_fma(L[k], vx_setall_f32(12.f*13.f), (vx_setall_f32(-0.f) ^ up)), vp, vx_setall_f32(-5.f));
+                Z[k] = v_fma(v_fma(L[k], vx_setall_f32(12.f*13.f), (v_xor(vx_setall_f32(-0.F), up))), vp, vx_setall_f32(-5.f));
             }
 
             v_float32 R[nrepeats], G[nrepeats], B[nrepeats];
@@ -3139,9 +3158,9 @@ struct Luv2RGBfloat
             for(int k = 0; k < nrepeats; k++)
             {
                 // R = (X*C0 + C1 + Z*C2)*Y; // here (*Y) is done
-                R[k] = v_fma(Z[k], vc2, v_fma(X[k], vc0, vc1))*Y[k];
-                G[k] = v_fma(Z[k], vc5, v_fma(X[k], vc3, vc4))*Y[k];
-                B[k] = v_fma(Z[k], vc8, v_fma(X[k], vc6, vc7))*Y[k];
+                R[k] = v_mul(v_fma(Z[k], vc2, v_fma(X[k], vc0, vc1)), Y[k]);
+                G[k] = v_mul(v_fma(Z[k], vc5, v_fma(X[k], vc3, vc4)), Y[k]);
+                B[k] = v_mul(v_fma(Z[k], vc8, v_fma(X[k], vc6, vc7)), Y[k]);
             }
 
             v_float32 vzero = vx_setzero_f32(), v1 = vx_setall_f32(1.f);
@@ -3157,9 +3176,9 @@ struct Luv2RGBfloat
                 v_float32 vgscale = vx_setall_f32(gscale);
                 for(int k = 0; k < nrepeats; k++)
                 {
-                    R[k] *= vgscale;
-                    G[k] *= vgscale;
-                    B[k] *= vgscale;
+                    R[k] = v_mul(R[k], vgscale);
+                    G[k] = v_mul(G[k], vgscale);
+                    B[k] = v_mul(B[k], vgscale);
                 }
                 for(int k = 0; k < nrepeats; k++)
                 {
@@ -3268,7 +3287,7 @@ struct RGB2Luvinterpolate
 #if CV_SIMD
         if(enablePackedRGB2Luv)
         {
-            const int vsize = v_uint16::nlanes;
+            const int vsize = VTraits<v_uint16>::vlanes();
             static const int nPixels = vsize*2;
             for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels)
             {
@@ -3298,9 +3317,9 @@ struct RGB2Luvinterpolate
                 v_expand(r, r0, r1);
                 v_expand(g, g0, g1);
                 v_expand(b, b0, b1);
-                r0 = r0 << (lab_base_shift - 8); r1 = r1 << (lab_base_shift - 8);
-                g0 = g0 << (lab_base_shift - 8); g1 = g1 << (lab_base_shift - 8);
-                b0 = b0 << (lab_base_shift - 8); b1 = b1 << (lab_base_shift - 8);
+                r0 = v_shl<lab_base_shift - 8>(r0); r1 = v_shl<lab_base_shift - 8>(r1);
+                g0 = v_shl<lab_base_shift - 8>(g0); g1 = v_shl<lab_base_shift - 8>(g1);
+                b0 = v_shl<lab_base_shift - 8>(b0); b1 = v_shl<lab_base_shift - 8>(b1);
 
                 /*
                     int L, u, v;
@@ -3315,9 +3334,9 @@ struct RGB2Luvinterpolate
                     dst[i+1] = saturate_cast<uchar>(u/baseDiv);
                     dst[i+2] = saturate_cast<uchar>(v/baseDiv);
                  */
-                l0 = l0 >> (lab_base_shift - 8); l1 = l1 >> (lab_base_shift - 8);
-                u0 = u0 >> (lab_base_shift - 8); u1 = u1 >> (lab_base_shift - 8);
-                v0 = v0 >> (lab_base_shift - 8); v1 = v1 >> (lab_base_shift - 8);
+                l0 = v_shr<lab_base_shift - 8>(l0); l1 = v_shr<lab_base_shift - 8>(l1);
+                u0 = v_shr<lab_base_shift - 8>(u0); u1 = v_shr<lab_base_shift - 8>(u1);
+                v0 = v_shr<lab_base_shift - 8>(v0); v1 = v_shr<lab_base_shift - 8>(v1);
                 v_uint8 l = v_pack(l0, l1);
                 v_uint8 u = v_pack(u0, u1);
                 v_uint8 v = v_pack(v0, v1);
@@ -3388,12 +3407,12 @@ struct RGB2Luv_b
         static const softfloat su = -uLow*f255/uRange;
         static const softfloat sv = -vLow*f255/vRange;
 #if CV_SIMD
-        const int fsize = v_float32::nlanes;
+        const int fsize = VTraits<v_float32>::vlanes();
         v_float32 ml = vx_setall_f32((float)fL), al = vx_setzero_f32();
         v_float32 mu = vx_setall_f32((float)fu), au = vx_setall_f32((float)su);
         v_float32 mv = vx_setall_f32((float)fv), av = vx_setall_f32((float)sv);
         //TODO: fix that when v_interleave is available
-        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3], interTmpA[VTraits<v_float32>::max_nlanes*3];
         v_store_interleave(interTmpM, ml, mu, mv);
         v_store_interleave(interTmpA, al, au, av);
         v_float32 mluv[3], aluv[3];
@@ -3435,7 +3454,7 @@ struct RGB2Luv_b
                     v_float32 f[3*4];
                     for(int k = 0; k < 3*4; k++)
                     {
-                        f[k] = v_cvt_f32(q[k])*v255inv;
+                        f[k] = v_mul(v_cvt_f32(q[k]), v255inv);
                     }
 
                     for(int k = 0; k < 4; k++)
@@ -3461,8 +3480,8 @@ struct RGB2Luv_b
                     v_int32 q0, q1;
                     v_expand(v_reinterpret_as_s16(d), q0, q1);
 
-                    v_store_aligned(buf + j + 0*fsize, v_cvt_f32(q0)*v255inv);
-                    v_store_aligned(buf + j + 1*fsize, v_cvt_f32(q1)*v255inv);
+                    v_store_aligned(buf + j + 0*fsize, v_mul(v_cvt_f32(q0), v255inv));
+                    v_store_aligned(buf + j + 1*fsize, v_mul(v_cvt_f32(q1), v255inv));
                 }
                 for( ; j < dn*bufChannels; j++, src++ )
                 {
@@ -3616,7 +3635,8 @@ struct Luv2RGBinteger
     inline void processLuvToXYZ(const v_uint8& lv, const v_uint8& uv, const v_uint8& vv,
                                 v_int32 (&x)[4], v_int32 (&y)[4], v_int32 (&z)[4]) const
     {
-        const int vsize = v_uint8::nlanes;
+        const int vsize = VTraits<v_uint8>::vlanes();
+        const int vsize_max = VTraits<v_uint8>::max_nlanes;
 
         v_uint16 lv0, lv1;
         v_expand(lv, lv0, lv1);
@@ -3629,7 +3649,7 @@ struct Luv2RGBinteger
         v_int32 mask16 = vx_setall_s32(0xFFFF);
         for(int k = 0; k < 4; k++)
         {
-            y[k] = v_lut((const int*)LabToYF_b, v_reinterpret_as_s32(lq[k])) & mask16;
+            y[k] = v_and(v_lut((const int *)LabToYF_b, v_reinterpret_as_s32(lq[k])), mask16);
         }
 
         v_int32 up[4], vp[4];
@@ -3640,10 +3660,10 @@ struct Luv2RGBinteger
         v_expand(vv, vv0, vv1);
         // LL*256
         v_uint16 ll0, ll1;
-        ll0 = lv0 << 8; ll1 = lv1 << 8;
+        ll0 = v_shl<8>(lv0); ll1 = v_shl<8>(lv1);
         v_uint16 upidx0, upidx1, vpidx0, vpidx1;
-        upidx0 = ll0 + uv0; upidx1 = ll1 + uv1;
-        vpidx0 = ll0 + vv0; vpidx1 = ll1 + vv1;
+        upidx0 = v_add(ll0, uv0); upidx1 = v_add(ll1, uv1);
+        vpidx0 = v_add(ll0, vv0); vpidx1 = v_add(ll1, vv1);
         v_uint32 upidx[4], vpidx[4];
         v_expand(upidx0, upidx[0], upidx[1]); v_expand(upidx1, upidx[2], upidx[3]);
         v_expand(vpidx0, vpidx[0], vpidx[1]); v_expand(vpidx1, vpidx[2], vpidx[3]);
@@ -3655,7 +3675,7 @@ struct Luv2RGBinteger
 
         // long long int vpl = LUVLUT.LvToVpl_b[LL*256+v];
         v_int64 vpl[8];
-        int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vpidxstore[vsize];
+        int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vpidxstore[vsize_max];
         for(int k = 0; k < 4; k++)
         {
             v_store_aligned(vpidxstore + k*vsize/4, v_reinterpret_as_s32(vpidx[k]));
@@ -3667,12 +3687,13 @@ struct Luv2RGBinteger
 
         // not all 64-bit arithmetic is available in univ. intrinsics
         // need to handle it with scalar code
-        int64_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vvpl[vsize];
+        int64_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vvpl[vsize_max];
         for(int k = 0; k < 8; k++)
         {
             v_store_aligned(vvpl + k*vsize/8, vpl[k]);
         }
-        int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vup[vsize], vvp[vsize], vx[vsize], vy[vsize], vzm[vsize];
+        int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vup[vsize_max], vvp[vsize_max],
+                                               vx[vsize_max], vy[vsize_max], vzm[vsize_max];
         for(int k = 0; k < 4; k++)
         {
             v_store_aligned(vup + k*vsize/4, up[k]);
@@ -3707,7 +3728,7 @@ struct Luv2RGBinteger
         // z = zm/256 + zm/65536;
         for (int k = 0; k < 4; k++)
         {
-            z[k] = (zm[k] >> 8) + (zm[k] >> 16);
+            z[k] = v_add(v_shr<8>(zm[k]), v_shr<16>(zm[k]));
         }
 
         // (x, z) = clip((x, z), min=0, max=2*BASE)
@@ -3734,7 +3755,7 @@ struct Luv2RGBinteger
         {
             ushort* tab = sRGBInvGammaTab_b;
             bool srgb = issRGB;
-            static const int vsize = v_uint8::nlanes;
+            static const int vsize = VTraits<v_uint8>::vlanes();
             const int descaleShift = 1 << (shift-1);
             v_int16 vdescale = vx_setall_s16(descaleShift);
             v_int16 vc[9];
@@ -3754,12 +3775,12 @@ struct Luv2RGBinteger
             // fixing 16bit signed multiplication
             // by subtracting 2^(base_shift-1) and then adding result back
             v_int32 dummy32, fm[3];
-            v_expand(vc[0]+vc[1]+vc[2], fm[0], dummy32);
-            v_expand(vc[3]+vc[4]+vc[5], fm[1], dummy32);
-            v_expand(vc[6]+vc[7]+vc[8], fm[2], dummy32);
-            fm[0] = fm[0] << (base_shift-1);
-            fm[1] = fm[1] << (base_shift-1);
-            fm[2] = fm[2] << (base_shift-1);
+            v_expand(v_add(vc[0],vc[1],vc[2]), fm[0], dummy32);
+            v_expand(v_add(vc[3],vc[4],vc[5]), fm[1], dummy32);
+            v_expand(v_add(vc[6],vc[7],vc[8]), fm[2], dummy32);
+            fm[0] = v_shl(fm[0], (base_shift-1));
+            fm[1] = v_shl(fm[1], (base_shift-1));
+            fm[2] = v_shl(fm[2], (base_shift-1));
 
             for (; i <= n-vsize; i += vsize, src += 3*vsize, dst += dcn*vsize)
             {
@@ -3799,15 +3820,15 @@ struct Luv2RGBinteger
                 // a bit faster than one loop for all
                 for(int k = 0; k < 4; k++)
                 {
-                    i_rgb[k+4*0] = (v_dotprod(xy[k], crxy) + v_dotprod(zd[k], crz1) + fm[0]) >> shift;
+                    i_rgb[k+4*0] = v_shr<shift>(v_add(v_add(v_dotprod(xy[k], crxy), v_dotprod(zd[k], crz1)), fm[0]));
                 }
                 for(int k = 0; k < 4; k++)
                 {
-                    i_rgb[k+4*1] = (v_dotprod(xy[k], cgxy) + v_dotprod(zd[k], cgz1) + fm[1]) >> shift;
+                    i_rgb[k+4*1] = v_shr<shift>(v_add(v_add(v_dotprod(xy[k], cgxy), v_dotprod(zd[k], cgz1)), fm[1]));
                 }
                 for(int k = 0; k < 4; k++)
                 {
-                    i_rgb[k+4*2] = (v_dotprod(xy[k], cbxy) + v_dotprod(zd[k], cbz1) + fm[2]) >> shift;
+                    i_rgb[k+4*2] = v_shr<shift>(v_add(v_add(v_dotprod(xy[k], cbxy), v_dotprod(zd[k], cbz1)), fm[2]));
                 }
 
                 // [rrggbb]
@@ -3825,7 +3846,7 @@ struct Luv2RGBinteger
                 if(srgb)
                 {
                     // [rr.., gg.., bb..]
-                    int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) rgbshifts[3*vsize];
+                    int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) rgbshifts[3*VTraits<v_uint8>::max_nlanes];
                     for(int k = 0; k < 12; k++)
                     {
                         v_store_aligned(rgbshifts + k*vsize/4, i_rgb[k]);
@@ -3840,7 +3861,7 @@ struct Luv2RGBinteger
                     // rgb = (rgb*255) >> inv_gamma_shift
                     for(int k = 0; k < 12; k++)
                     {
-                        i_rgb[k] = ((i_rgb[k] << 8) - i_rgb[k]) >> inv_gamma_shift;
+                        i_rgb[k] = v_shr((v_sub((v_shl(i_rgb[k], 8)), i_rgb[k])), inv_gamma_shift);
                     }
 
                     for(int k = 0; k < 6; k++)
@@ -3923,13 +3944,13 @@ struct Luv2RGB_b
         static const softfloat fv = vRange/f255;
 
 #if CV_SIMD
-        const int fsize = v_float32::nlanes;
+        const int fsize = VTraits<v_float32>::vlanes();
         v_float32 vl = vx_setall_f32((float)fl);
         v_float32 vu = vx_setall_f32((float)fu);
         v_float32 vv = vx_setall_f32((float)fv);
         v_float32 vuLow = vx_setall_f32((float)uLow), vvLow = vx_setall_f32((float)vLow);
         //TODO: fix that when v_interleave is available
-        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits<v_float32>::max_nlanes*3], interTmpA[VTraits<v_float32>::max_nlanes*3];
         v_store_interleave(interTmpM, vl, vu, vv);
         v_store_interleave(interTmpA, vx_setzero_f32(), vuLow, vvLow);
         v_float32 mluv[3], aluv[3];
@@ -3947,7 +3968,7 @@ struct Luv2RGB_b
             j = 0;
 
 #if CV_SIMD
-            const int vsize = v_uint8::nlanes;
+            const int vsize = VTraits<v_uint8>::vlanes();
             for( ; j <= (dn - vsize)*3; j += 3*vsize )
             {
                 v_uint8 s0, s1, s2;
@@ -4000,7 +4021,7 @@ struct Luv2RGB_b
                     v_int32 vi[4*3];
                     for(int k = 0; k < 4*3; k++)
                     {
-                        vi[k] = v_round(vf[k]*v255);
+                        vi[k] = v_round(v_mul(vf[k], v255));
                     }
 
                     v_uint8 rgb[3];
@@ -4022,7 +4043,7 @@ struct Luv2RGB_b
                     for(int k = 0; k < 4; k++)
                     {
                         vf[k] = vx_load_aligned(buf + j + k*fsize);
-                        vi[k] = v_round(vf[k]*v255);
+                        vi[k] = v_round(v_mul(vf[k], v255));
                     }
                     v_store(dst, v_pack_u(v_pack(vi[0], vi[1]), v_pack(vi[2], vi[3])));
                 }
@@ -4403,7 +4424,7 @@ bool oclCvtColorBGR2Luv( InputArray _src, OutputArray _dst, int bidx, bool srgb)
     OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > h(_src, _dst, 3);
 
     if(!h.createKernel("BGR2Luv", ocl::imgproc::color_lab_oclsrc,
-                       format("-D dcn=3 -D bidx=%d%s", bidx, srgb ? " -D SRGB" : "")))
+                       format("-D DCN=3 -D BIDX=%d%s", bidx, srgb ? " -D SRGB" : "")))
     {
         return false;
     }
@@ -4471,7 +4492,7 @@ bool oclCvtColorBGR2Lab( InputArray _src, OutputArray _dst, int bidx, bool srgb
     OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > h(_src, _dst, 3);
 
     if(!h.createKernel("BGR2Lab", ocl::imgproc::color_lab_oclsrc,
-                       format("-D dcn=3 -D bidx=%d%s", bidx, srgb ? " -D SRGB" : "")))
+                       format("-D DCN=3 -D BIDX=%d%s", bidx, srgb ? " -D SRGB" : "")))
     {
         return false;
     }
@@ -4566,7 +4587,7 @@ bool oclCvtColorLab2BGR(InputArray _src, OutputArray _dst, int dcn, int bidx, bo
     OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > h(_src, _dst, dcn);
 
     if(!h.createKernel("Lab2BGR", ocl::imgproc::color_lab_oclsrc,
-                       format("-D dcn=%d -D bidx=%d%s", dcn, bidx, srgb ? " -D SRGB" : "")))
+                       format("-D DCN=%d -D BIDX=%d%s", dcn, bidx, srgb ? " -D SRGB" : "")))
     {
         return false;
     }
@@ -4617,7 +4638,7 @@ bool oclCvtColorLuv2BGR(InputArray _src, OutputArray _dst, int dcn, int bidx, bo
     OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > h(_src, _dst, dcn);
 
     if(!h.createKernel("Luv2BGR", ocl::imgproc::color_lab_oclsrc,
-                       format("-D dcn=%d -D bidx=%d%s", dcn, bidx, srgb ? " -D SRGB" : "")))
+                       format("-D DCN=%d -D BIDX=%d%s", dcn, bidx, srgb ? " -D SRGB" : "")))
     {
         return false;
     }
@@ -4671,7 +4692,7 @@ bool oclCvtColorBGR2XYZ( InputArray _src, OutputArray _dst, int bidx )
     OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 3);
 
     if(!h.createKernel("RGB2XYZ", ocl::imgproc::color_lab_oclsrc,
-                       format("-D dcn=3 -D bidx=%d", bidx)))
+                       format("-D DCN=3 -D BIDX=%d", bidx)))
     {
         return false;
     }
@@ -4719,7 +4740,7 @@ bool oclCvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx )
     OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
 
     if(!h.createKernel("XYZ2RGB", ocl::imgproc::color_lab_oclsrc,
-                       format("-D dcn=%d -D bidx=%d", dcn, bidx)))
+                       format("-D DCN=%d -D BIDX=%d", dcn, bidx)))
     {
         return false;
     }
diff --git a/modules/imgproc/src/color_rgb.dispatch.cpp b/modules/imgproc/src/color_rgb.dispatch.cpp
index ed2961f0fb6d..efe6c9d6cb60 100644
--- a/modules/imgproc/src/color_rgb.dispatch.cpp
+++ b/modules/imgproc/src/color_rgb.dispatch.cpp
@@ -428,7 +428,7 @@ bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool revers
     OclHelper< Set<3, 4>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
 
     if(!h.createKernel("RGB", ocl::imgproc::color_rgb_oclsrc,
-                       format("-D dcn=%d -D bidx=0 -D %s", dcn, reverse ? "REVERSE" : "ORDER")))
+                       format("-D DCN=%d -D BIDX=0 -D %s", dcn, reverse ? "REVERSE" : "ORDER")))
     {
         return false;
     }
@@ -441,7 +441,7 @@ bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits
     OclHelper< Set<3, 4>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);
 
     if(!h.createKernel("RGB2RGB5x5", ocl::imgproc::color_rgb_oclsrc,
-                       format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, gbits)))
+                       format("-D DCN=2 -D BIDX=%d -D GREENBITS=%d", bidx, gbits)))
     {
         return false;
     }
@@ -454,7 +454,7 @@ bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, i
     OclHelper< Set<2>, Set<3, 4>, Set<CV_8U> > h(_src, _dst, dcn);
 
     if(!h.createKernel("RGB5x52RGB", ocl::imgproc::color_rgb_oclsrc,
-                       format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, gbits)))
+                       format("-D DCN=%d -D BIDX=%d -D GREENBITS=%d", dcn, bidx, gbits)))
     {
         return false;
     }
@@ -467,7 +467,7 @@ bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits)
     OclHelper< Set<2>, Set<1>, Set<CV_8U> > h(_src, _dst, 1);
 
     if(!h.createKernel("BGR5x52Gray", ocl::imgproc::color_rgb_oclsrc,
-                       format("-D dcn=1 -D bidx=0 -D greenbits=%d", gbits)))
+                       format("-D DCN=1 -D BIDX=0 -D GREENBITS=%d", gbits)))
     {
         return false;
     }
@@ -480,7 +480,7 @@ bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits)
     OclHelper< Set<1>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);
 
     if(!h.createKernel("Gray2BGR5x5", ocl::imgproc::color_rgb_oclsrc,
-                        format("-D dcn=2 -D bidx=0 -D greenbits=%d", gbits)))
+                        format("-D DCN=2 -D BIDX=0 -D GREENBITS=%d", gbits)))
     {
         return false;
     }
@@ -494,7 +494,7 @@ bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx)
 
     int stripeSize = 1;
     if(!h.createKernel("RGB2Gray", ocl::imgproc::color_rgb_oclsrc,
-                       format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d", bidx, stripeSize)))
+                       format("-D DCN=1 -D BIDX=%d -D STRIPE_SIZE=%d", bidx, stripeSize)))
     {
         return false;
     }
@@ -507,7 +507,7 @@ bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn)
 {
     OclHelper< Set<1>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
     if(!h.createKernel("Gray2RGB", ocl::imgproc::color_rgb_oclsrc,
-                       format("-D bidx=0 -D dcn=%d", dcn)))
+                       format("-D BIDX=0 -D DCN=%d", dcn)))
     {
         return false;
     }
@@ -520,7 +520,7 @@ bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst)
     OclHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);
 
     if(!h.createKernel("RGBA2mRGBA", ocl::imgproc::color_rgb_oclsrc,
-                       "-D dcn=4 -D bidx=3"))
+                       "-D DCN=4 -D BIDX=3"))
     {
         return false;
     }
@@ -533,7 +533,7 @@ bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst)
     OclHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);
 
     if(!h.createKernel("mRGBA2RGBA", ocl::imgproc::color_rgb_oclsrc,
-                       "-D dcn=4 -D bidx=3"))
+                       "-D DCN=4 -D BIDX=3"))
     {
         return false;
     }
diff --git a/modules/imgproc/src/color_rgb.simd.hpp b/modules/imgproc/src/color_rgb.simd.hpp
index 6e1102019749..ca39d8a9083e 100644
--- a/modules/imgproc/src/color_rgb.simd.hpp
+++ b/modules/imgproc/src/color_rgb.simd.hpp
@@ -122,8 +122,8 @@ struct RGB2RGB
         int i = 0;
         _Tp alphav = ColorChannel<_Tp>::max();
 
-#if CV_SIMD
-        const int vsize = vt::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<vt>::vlanes();
 
         for(; i <= n-vsize;
             i += vsize, src += vsize*scn, dst += vsize*dcn)
@@ -138,8 +138,13 @@ struct RGB2RGB
                 v_load_deinterleave(src, a, b, c);
                 d = v_set<_Tp>::set(alphav);
             }
-            if(bi == 2)
+            if(bi == 2) {
+                #if CV_SIMD_SCALABLE
+                auto t = a; a = c; c = t; // swap(a, c);
+                #else
                 swap(a, c);
+                #endif
+            }
 
             if(dcn == 4)
             {
@@ -185,53 +190,57 @@ struct RGB5x52RGB
         int dcn = dstcn, bidx = blueIdx, gb = greenBits;
         int i = 0;
 
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint8>::vlanes();
         v_uint8 vz = vx_setzero_u8(), vn0 = vx_setall_u8(255);
         for(; i <= n-vsize;
             i += vsize, src += vsize*sizeof(ushort), dst += vsize*dcn)
         {
             v_uint16 t0 = v_reinterpret_as_u16(vx_load(src));
             v_uint16 t1 = v_reinterpret_as_u16(vx_load(src +
-                                                       sizeof(ushort)*v_uint16::nlanes));
+                                                       sizeof(ushort)*VTraits<v_uint16>::vlanes()));
 
             //TODO: shorten registers use when v_interleave is available
             v_uint8 r, g, b, a;
-            v_uint16 b0 = (t0 << 11) >> 8;
-            v_uint16 b1 = (t1 << 11) >> 8;
+            v_uint16 b0 = v_shr<8>(v_shl<11>(t0));
+            v_uint16 b1 = v_shr<8>(v_shl<11>(t1));
             b = v_pack(b0, b1);
 
             v_uint16 g0, g1, r0, r1, a0, a1;
 
             if( gb == 6 )
             {
-                g0 = ((t0 >> 5) << 10) >> 8;
-                g1 = ((t1 >> 5) << 10) >> 8;
+                g0 = v_shr<8>(v_shl<10>(v_shr<5>(t0)));
+                g1 = v_shr<8>(v_shl<10>(v_shr<5>(t1)));
 
-                r0 = (t0 >> 11) << 3;
-                r1 = (t1 >> 11) << 3;
+                r0 = v_shl<3>(v_shr<11>(t0));
+                r1 = v_shl<3>(v_shr<11>(t1));
 
                 a = vn0;
             }
             else
             {
-                g0 = ((t0 >> 5) << 11) >> 8;
-                g1 = ((t1 >> 5) << 11) >> 8;
+                g0 = v_shr<8>(v_shl<11>(v_shr<5>(t0)));
+                g1 = v_shr<8>(v_shl<11>(v_shr<5>(t1)));
 
-                r0 = ((t0 >> 10) << 11) >> 8;
-                r1 = ((t1 >> 10) << 11) >> 8;
+                r0 = v_shr<8>(v_shl<11>(v_shr<10>(t0)));
+                r1 = v_shr<8>(v_shl<11>(v_shr<10>(t1)));
 
-                a0 = t0 >> 15;
-                a1 = t1 >> 15;
+                a0 = v_shr<15>(t0);
+                a1 = v_shr<15>(t1);
                 a = v_pack(a0, a1);
-                a = a != vz;
+                a = v_ne(a, vz);
             }
             g = v_pack(g0, g1);
             r = v_pack(r0, r1);
 
-            if(bidx == 2)
+            if(bidx == 2) {
+                #if CV_SIMD_SCALABLE
+                auto t = r; r = b; b = t; // swap(b, r);
+                #else
                 swap(b, r);
-
+                #endif
+            }
             if(dcn == 4)
             {
                 v_store_interleave(dst, b, g, r, a);
@@ -289,8 +298,8 @@ struct RGB2RGB5x5
         int scn = srccn, bidx = blueIdx, gb = greenBits;
         int i = 0;
 
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint8>::vlanes();
         v_uint16 vn3 = vx_setall_u16((ushort)(~3));
         v_uint16 vn7 = vx_setall_u16((ushort)(~7));
         v_uint16 vz = vx_setzero_u16();
@@ -308,10 +317,15 @@ struct RGB2RGB5x5
             {
                 v_load_deinterleave(src, b, g, r, a);
             }
-            if(bidx == 2)
+            if(bidx == 2){
+                #if CV_SIMD_SCALABLE
+                auto t = r; r = b; b = t; // swap(b, r);
+                #else
                 swap(b, r);
+                #endif
+            }
 
-            r = r & v7;
+            r = v_and(r, v7);
 
             //TODO: shorten registers use when v_deinterleave is available
             v_uint16 r0, r1, g0, g1, b0, b1, a0, a1;
@@ -322,20 +336,20 @@ struct RGB2RGB5x5
 
             v_uint16 d0, d1;
 
-            b0 = b0 >> 3;
-            b1 = b1 >> 3;
-            a0 = (a0 != vz) << 15;
-            a1 = (a1 != vz) << 15;
+            b0 = v_shr<3>(b0);
+            b1 = v_shr<3>(b1);
+            a0 = v_shl<15>(v_ne(a0, vz));
+            a1 = v_shl<15>(v_ne(a1, vz));
 
             if(gb == 6)
             {
-                d0 = b0 | ((g0 & vn3) << 3) | (r0 << 8);
-                d1 = b1 | ((g1 & vn3) << 3) | (r1 << 8);
+                d0 = v_or(v_or(b0, v_shl<3>(v_and(g0, vn3))), v_shl<8>(r0));
+                d1 = v_or(v_or(b1, v_shl<3>(v_and(g1, vn3))), v_shl<8>(r1));
             }
             else
             {
-                d0 = b0 | ((g0 & vn7) << 2) | (r0 << 7) | a0;
-                d1 = b1 | ((g1 & vn7) << 2) | (r1 << 7) | a1;
+                d0 = v_or(v_or(v_or(b0, v_shl<2>(v_and(g0, vn7))), v_shl<7>(r0)), a0);
+                d1 = v_or(v_or(v_or(b1, v_shl<2>(v_and(g1, vn7))), v_shl<7>(r1)), a1);
             }
 
             v_store((ushort*)dst, d0);
@@ -382,8 +396,8 @@ struct Gray2RGB
         int i = 0;
         _Tp alpha = ColorChannel<_Tp>::max();
 
-#if CV_SIMD
-        const int vsize = vt::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<vt>::vlanes();
         vt valpha = v_set<_Tp>::set(alpha);
         for(; i <= n-vsize;
             i += vsize, src += vsize, dst += vsize*dcn)
@@ -424,8 +438,8 @@ struct Gray2RGB5x5
     {
         int gb = greenBits;
         int i = 0;
-#if CV_SIMD
-        const int vsize = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint16>::vlanes();
         v_uint16 v3 = vx_setall_u16((ushort)(~3));
         for(; i <= n-vsize;
             i += vsize, src += vsize, dst += vsize*sizeof(ushort))
@@ -433,16 +447,16 @@ struct Gray2RGB5x5
             v_uint8 t8 = vx_load_low(src);
             v_uint16 t = v_expand_low(t8);
 
-            v_uint16 t3 = t >> 3;
+            v_uint16 t3 = v_shr<3>(t);
 
             v_uint16 d = t3;
             if(gb == 6)
             {
-                d |= ((t & v3) << 3) | (t3 << 11);
+                d = v_or(d, v_or(v_shl<3>(v_and(t, v3)), v_shl<11>(t3)));
             }
             else
             {
-                d |= (t3 << 5) | (t3 << 10);
+                d = v_or(d, v_or(v_shl<5>(t3), v_shl<10>(t3)));
             }
 
             v_store((ushort*)dst, d);
@@ -488,8 +502,8 @@ struct RGB5x52Gray
     {
         int gb = greenBits;
         int i = 0;
-#if CV_SIMD
-        const int vsize = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint16>::vlanes();
 
         v_int16 bg2y;
         v_int16 r12y;
@@ -504,17 +518,17 @@ struct RGB5x52Gray
             v_uint16 t = vx_load((ushort*)src);
 
             v_uint16 r, g, b;
-            b = (t << 11) >> 8;
+            b = v_shr<8>(v_shl<11>(t));
 
             if(gb == 5)
             {
-                g = ((t >> 5) << 11) >> 8;
-                r = ((t >> 10) << 11) >> 8;
+                g = v_shr<8>(v_shl<11>(v_shr<5>(t)));
+                r = v_shr<8>(v_shl<11>(v_shr<10>(t)));
             }
             else
             {
-                g = ((t >> 5) << 10) >> 8;
-                r = (t >> 11) << 3;
+                g = v_shr<8>(v_shl<10>(v_shr<5>(t)));
+                r = v_shl<3>(v_shr<11>(t));
             }
 
             v_uint8 d;
@@ -530,11 +544,11 @@ struct RGB5x52Gray
             v_zip(sr, delta, rd0, rd1);
 
             v_uint32 d0, d1;
-            d0 = v_reinterpret_as_u32(v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y));
-            d1 = v_reinterpret_as_u32(v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y));
+            d0 = v_reinterpret_as_u32(v_add(v_dotprod(bg0, bg2y), v_dotprod(rd0, r12y)));
+            d1 = v_reinterpret_as_u32(v_add(v_dotprod(bg1, bg2y), v_dotprod(rd1, r12y)));
 
-            d0 = d0 >> shift;
-            d1 = d1 >> shift;
+            d0 = v_shr<shift>(d0);
+            d1 = v_shr<shift>(d1);
 
             dx = v_pack(d0, d1);
             // high part isn't used
@@ -611,8 +625,8 @@ struct RGB2Gray<float>
         int scn = srccn, i = 0;
         float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
 
-#if CV_SIMD
-        const int vsize = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_float32>::vlanes();
         v_float32 rv = vx_setall_f32(cr), gv = vx_setall_f32(cg), bv = vx_setall_f32(cb);
         for(; i <= n-vsize;
             i += vsize, src += vsize*scn, dst += vsize)
@@ -627,7 +641,7 @@ struct RGB2Gray<float>
                 v_load_deinterleave(src, b, g, r, a);
             }
 
-            v_float32 d = v_fma(r, rv, v_fma(g, gv, b*bv));
+            v_float32 d = v_fma(r, rv, v_fma(g, gv, v_mul(b, bv)));
 
             v_store(dst, d);
         }
@@ -669,8 +683,8 @@ struct RGB2Gray<uchar>
         short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
         int i = 0;
 
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint8>::vlanes();
         v_int16 bg2y;
         v_int16 r12y;
         v_int16 dummy;
@@ -706,10 +720,10 @@ struct RGB2Gray<uchar>
             v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11);
 
             v_uint32 y00, y01, y10, y11;
-            y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift;
-            y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift;
-            y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift;
-            y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift;
+            y00 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y))));
+            y01 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y))));
+            y10 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y))));
+            y11 = v_shr<shift>(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y))));
 
             v_uint16 y0, y1;
             y0 = v_pack(y00, y01);
@@ -762,8 +776,8 @@ struct RGB2Gray<ushort>
         short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
         int i = 0;
 
-#if CV_SIMD
-        const int vsize = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint16>::vlanes();
 
         v_int16 b2y = vx_setall_s16(cb);
         v_int16 g2y = vx_setall_s16(cg);
@@ -802,13 +816,13 @@ struct RGB2Gray<ushort>
 
             // fixing 16bit signed multiplication
             v_int16 mr, mg, mb;
-            mr = (sr < z) & r2y;
-            mg = (sg < z) & g2y;
-            mb = (sb < z) & b2y;
-            v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift;
+            mr = v_and(v_lt(sr, z), r2y);
+            mg = v_and(v_lt(sg, z), g2y);
+            mb = v_and(v_lt(sb, z), b2y);
+            v_int16 fixmul = v_shl<fix_shift>(v_add_wrap(mr, v_add_wrap(mg, mb)));
 
-            v_int32 sy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift;
-            v_int32 sy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift;
+            v_int32 sy0 = v_shr<shift>(v_add(v_dotprod(bg0, bg2y), v_dotprod(rd0, r12y)));
+            v_int32 sy1 = v_shr<shift>(v_add(v_dotprod(bg1, bg2y), v_dotprod(rd1, r12y)));
 
             v_int16 y = v_add_wrap(v_pack(sy0, sy1), fixmul);
 
@@ -868,7 +882,7 @@ struct RGBA2mRGBA<uchar>
 
         int i = 0;
 #if CV_SIMD
-        const int vsize = v_uint8::nlanes;
+        const int vsize = VTraits<v_uint8>::vlanes();
         v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000));
         v_uint16 vh = vx_setall_u16(half_val+1);
 
@@ -887,27 +901,27 @@ struct RGBA2mRGBA<uchar>
 
             v_uint16 a16[4];
             for(int j = 0; j < 4; j++)
-                a16[j] = v_reinterpret_as_u16(v[j] & amask);
+                a16[j] = v_reinterpret_as_u16(v_and(v[j], amask));
 
             v_uint32 a32[4];
             for(int j = 0; j < 4; j++)
-                a32[j] = v_reinterpret_as_u32(a16[j] | (a16[j] >> 8));
+                a32[j] = v_reinterpret_as_u32(v_or(a16[j], (v_shr(a16[j], 8))));
 
             v_uint8 a[4];
             for(int j = 0; j < 4; j++)
-                a[j] = v_reinterpret_as_u8(a32[j] | (a32[j] >> 16));
+                a[j] = v_reinterpret_as_u8(v_or(a32[j], (v_shr(a32[j], 16))));
 
             v_uint16 m[8];
             for(int j = 0; j < 4; j++)
                 v_mul_expand(v[j], a[j], m[j], m[j+4]);
 
             for(int j = 0; j < 8; j++)
-                m[j] += vh;
+                m[j] = v_add(m[j], vh);
 
             // div 255: (v+1+(v>>8))>8
             // +1 is in vh, has no effect on (v>>8)
             for(int j = 0; j < 8; j++)
-                m[j] = (m[j] + (m[j] >> 8)) >> 8;
+                m[j] = v_shr((v_add(m[j], (v_shr(m[j], 8)))), 8);
 
             v_uint8 d[4];
             for(int j = 0; j < 4; j++)
@@ -973,8 +987,8 @@ struct mRGBA2RGBA<uchar>
         uchar max_val = ColorChannel<uchar>::max();
         int i = 0;
 
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint8>::vlanes();
         v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000));
         v_uint8 vmax = vx_setall_u8(max_val);
 
@@ -989,9 +1003,9 @@ struct mRGBA2RGBA<uchar>
             v_uint8 a;
             v_uint16 a16;
             v_uint32 a32;
-            a16 = v_reinterpret_as_u16(s & amask);
-            a32 = v_reinterpret_as_u32(a16 | (a16 >> 8));
-            a = v_reinterpret_as_u8(a32 | (a32 >> 16));
+            a16 = v_reinterpret_as_u16(v_and(s, amask));
+            a32 = v_reinterpret_as_u32(v_or(a16, v_shr<8>(a16)));
+            a = v_reinterpret_as_u8(v_or(a32, v_shr<16>(a32)));
 
             // s *= max_val
             v_uint16 s0, s1;
@@ -1000,7 +1014,7 @@ struct mRGBA2RGBA<uchar>
             // s += a/2
             v_uint16 ae0, ae1;
             v_expand(a, ae0, ae1);
-            s0 += ae0 >> 1; s1 += ae1 >> 1;
+            s0 = v_add(s0, v_shr<1>(ae0)); s1 = v_add(s1, v_shr<1>(ae1));
 
             // s, a -> u32 -> float
             v_uint32 u00, u01, u10, u11;
@@ -1035,10 +1049,10 @@ struct mRGBA2RGBA<uchar>
 
             // float d = (float)s/(float)a
             v_float32 fd00, fd01, fd10, fd11;
-            fd00 = fs00/fa00;
-            fd01 = fs01/fa01;
-            fd10 = fs10/fa10;
-            fd11 = fs11/fa11;
+            fd00 = v_div(fs00, fa00);
+            fd01 = v_div(fs01, fa01);
+            fd10 = v_div(fs10, fa10);
+            fd11 = v_div(fs11, fa11);
 
             // d -> u32 -> u8
             v_uint32 ud00, ud01, ud10, ud11;
@@ -1054,8 +1068,8 @@ struct mRGBA2RGBA<uchar>
 
             // if a == 0 then d = 0
             v_uint8 am;
-            am = a != vx_setzero_u8();
-            d = d & am;
+            am = v_ne(a, vx_setzero_u8());
+            d = v_and(d, am);
 
             // put alpha values
             d = v_select(amask, a, d);
diff --git a/modules/imgproc/src/color_yuv.dispatch.cpp b/modules/imgproc/src/color_yuv.dispatch.cpp
index 559005e07f70..71d840d85767 100644
--- a/modules/imgproc/src/color_yuv.dispatch.cpp
+++ b/modules/imgproc/src/color_yuv.dispatch.cpp
@@ -115,6 +115,9 @@ void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
         CV_CPU_DISPATCH_MODES_ALL);
 }
 
+// 4:2:0, two planes in one array: Y, UV interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step,
                          uchar * dst_data, size_t dst_step,
                          int dst_width, int dst_height,
@@ -129,6 +132,9 @@ void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step,
             dst_width, dst_height, dcn, swapBlue, uIdx);
 }
 
+// 4:2:0, two planes: Y, UV interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step,
                          uchar * dst_data, size_t dst_step,
                          int dst_width, int dst_height,
@@ -139,6 +145,9 @@ void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src
     cvtTwoPlaneYUVtoBGR(y_data, src_step, uv_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx);
 }
 
+// 4:2:0, two planes: Y, UV interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtTwoPlaneYUVtoBGR(const uchar * y_data, size_t y_step, const uchar * uv_data, size_t uv_step,
                          uchar * dst_data, size_t dst_step,
                          int dst_width, int dst_height,
@@ -153,6 +162,9 @@ void cvtTwoPlaneYUVtoBGR(const uchar * y_data, size_t y_step, const uchar * uv_d
         CV_CPU_DISPATCH_MODES_ALL);
 }
 
+// 4:2:0, three planes in one array: Y, U, V
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
                            uchar * dst_data, size_t dst_step,
                            int dst_width, int dst_height,
@@ -166,6 +178,9 @@ void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
         CV_CPU_DISPATCH_MODES_ALL);
 }
 
+// 4:2:0, three planes in one array: Y, U, V
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
                            uchar * dst_data, size_t dst_step,
                            int width, int height,
@@ -179,6 +194,9 @@ void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
         CV_CPU_DISPATCH_MODES_ALL);
 }
 
+// 4:2:0, two planes: Y, UV interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
                          uchar * y_data, uchar * uv_data, size_t dst_step,
                          int width, int height,
@@ -193,6 +211,9 @@ void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
         CV_CPU_DISPATCH_MODES_ALL);
 }
 
+// 4:2:2 interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
                          uchar * dst_data, size_t dst_step,
                          int width, int height,
@@ -206,6 +227,22 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
         CV_CPU_DISPATCH_MODES_ALL);
 }
 
+// 4:2:2 interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 14-bit fixed-point arithmetics is used
+void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step,
+                         uchar * dst_data, size_t dst_step,
+                         int width, int height,
+                         int scn, bool swapBlue, int uIdx, int ycn)
+{
+    CV_INSTRUMENT_REGION();
+
+    CALL_HAL(cvtOnePlaneBGRtoYUV, cv_hal_cvtOnePlaneBGRtoYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx, ycn);
+
+    CV_CPU_DISPATCH(cvtOnePlaneBGRtoYUV, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx, ycn),
+        CV_CPU_DISPATCH_MODES_ALL);
+}
+
 } // namespace hal
 
 //
@@ -219,7 +256,7 @@ bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx )
     OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
 
     if(!h.createKernel("YUV2RGB", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d", dcn, bidx)))
+                       format("-D DCN=%d -D BIDX=%d", dcn, bidx)))
     {
         return false;
     }
@@ -232,7 +269,7 @@ bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx )
     OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 3);
 
     if(!h.createKernel("RGB2YUV", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=3 -D bidx=%d", bidx)))
+                       format("-D DCN=3 -D BIDX=%d", bidx)))
     {
         return false;
     }
@@ -245,7 +282,7 @@ bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx)
     OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
 
     if(!h.createKernel("YCrCb2RGB", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d", dcn, bidx)))
+                       format("-D DCN=%d -D BIDX=%d", dcn, bidx)))
     {
         return false;
     }
@@ -258,7 +295,7 @@ bool oclCvtColorBGR2YCrCb( InputArray _src, OutputArray _dst, int bidx)
     OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 3);
 
     if(!h.createKernel("RGB2YCrCb", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=3 -D bidx=%d", bidx)))
+                       format("-D DCN=3 -D BIDX=%d", bidx)))
     {
         return false;
     }
@@ -272,7 +309,7 @@ bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int
 
     bool optimized = _src.offset() % 4 == 0 && _src.step() % 4 == 0;
     if(!h.createKernel("YUV2RGB_422", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d%s", dcn, bidx, uidx, yidx,
+                       format("-D DCN=%d -D BIDX=%d -D UIDX=%d -D YIDX=%d%s", dcn, bidx, uidx, yidx,
                        optimized ? " -D USE_OPTIMIZED_LOAD" : "")))
     {
         return false;
@@ -281,6 +318,20 @@ bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int
     return h.run();
 }
 
+bool oclCvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx )
+{
+    OclHelper< Set<3, 4>, Set<2>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
+
+    if(!h.createKernel("RGB2YUV_422", ocl::imgproc::color_yuv_oclsrc,
+                       format("-D DCN=%d -D BIDX=%d -D UIDX=%d -D YIDX=%d", dcn, bidx, uidx, yidx
+                       )))
+    {
+        return false;
+    }
+
+    return h.run();
+}
+
 bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst )
 {
     OclHelper< Set<1>, Set<1>, Set<CV_8U>, FROM_YUV> h(_src, _dst, 1);
@@ -294,7 +345,7 @@ bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int
     OclHelper< Set<1>, Set<3, 4>, Set<CV_8U>, FROM_YUV > h(_src, _dst, dcn);
 
     if(!h.createKernel("YUV2RGB_NVx", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx)))
+                       format("-D DCN=%d -D BIDX=%d -D UIDX=%d", dcn, bidx, uidx)))
     {
         return false;
     }
@@ -307,7 +358,7 @@ bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, i
     OclHelper< Set<1>, Set<3, 4>, Set<CV_8U>, FROM_YUV > h(_src, _dst, dcn);
 
     if(!h.createKernel("YUV2RGB_YV12_IYUV", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=%d -D bidx=%d -D uidx=%d%s", dcn, bidx, uidx,
+                       format("-D DCN=%d -D BIDX=%d -D UIDX=%d%s", dcn, bidx, uidx,
                        _src.isContinuous() ? " -D SRC_CONT" : "")))
     {
         return false;
@@ -321,7 +372,7 @@ bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx,
     OclHelper< Set<3, 4>, Set<1>, Set<CV_8U>, TO_YUV > h(_src, _dst, 1);
 
     if(!h.createKernel("RGB2YUV_YV12_IYUV", ocl::imgproc::color_yuv_oclsrc,
-                       format("-D dcn=1 -D bidx=%d -D uidx=%d", bidx, uidx)))
+                       format("-D DCN=1 -D BIDX=%d -D UIDX=%d", bidx, uidx)))
     {
         return false;
     }
@@ -352,6 +403,9 @@ void cvtColorYUV2BGR(InputArray _src, OutputArray _dst, int dcn, bool swapb, boo
                      h.depth, dcn, swapb, crcb);
 }
 
+// 4:2:2 interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn)
 {
     CvtHelper< Set<2>, Set<3, 4>, Set<CV_8U>, FROM_UYVY > h(_src, _dst, dcn);
@@ -360,6 +414,17 @@ void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool s
                              dcn, swapb, uidx, ycn);
 }
 
+// 4:2:2 interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 14-bit fixed-point arithmetics is used
+void cvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, int uidx, int ycn)
+{
+    CvtHelper< Set<3, 4>, Set<2>, Set<CV_8U>, TO_UYVY > h(_src, _dst, 2);
+
+    hal::cvtOnePlaneBGRtoYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows,
+                             h.scn, swapb, uidx, ycn);
+}
+
 void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi )
 {
     CV_Assert( _src.channels() == 2 && _src.depth() == CV_8U );
@@ -367,6 +432,9 @@ void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi )
     extractChannel(_src, _dst, coi);
 }
 
+// 4:2:0, three planes in one array: Y, U, V
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx)
 {
     CvtHelper< Set<3, 4>, Set<1>, Set<CV_8U>, TO_YUV > h(_src, _dst, 1);
@@ -389,6 +457,9 @@ void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst )
     h.src(Range(0, h.dstSz.height), Range::all()).copyTo(h.dst);
 }
 
+// 4:2:0, three planes in one array: Y, U, V
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx)
 {
     if(dcn <= 0) dcn = 3;
@@ -398,9 +469,10 @@ void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool
                                dcn, swapb, uidx);
 }
 
-// http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples
-// http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples
-
+// 4:2:0, two planes in one array: Y, UV interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
+// see also: http://www.fourcc.org/yuv.php#NV21, http://www.fourcc.org/yuv.php#NV12
 void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx )
 {
     if(dcn <= 0) dcn = 3;
@@ -410,6 +482,9 @@ void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool s
                              dcn, swapb, uidx);
 }
 
+// 4:2:0, two planes: Y, UV interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx )
 {
     int stype = _ysrc.type();
diff --git a/modules/imgproc/src/color_yuv.simd.hpp b/modules/imgproc/src/color_yuv.simd.hpp
index b5f73d873a73..44f38a3418e0 100644
--- a/modules/imgproc/src/color_yuv.simd.hpp
+++ b/modules/imgproc/src/color_yuv.simd.hpp
@@ -37,6 +37,10 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
                          uchar * dst_data, size_t dst_step,
                          int width, int height,
                          int dcn, bool swapBlue, int uIdx, int ycn);
+void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step,
+                         uchar * dst_data, size_t dst_step,
+                         int width, int height,
+                         int scn, bool swapBlue, int uIdx, int ycn);
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
@@ -49,6 +53,15 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
 namespace {
 //constants for conversion from/to RGB and YUV, YCrCb according to BT.601
 
+#if CV_SIMD_SCALABLE
+template <class T>
+static void swap(T&a, T&b) {
+    T t = a;
+    a = b;
+    b = t;
+}
+#endif
+
 //to YCbCr
 static const float YCBF = 0.564f; // == 1/2/(1-B2YF)
 static const float YCRF = 0.713f; // == 1/2/(1-R2YF)
@@ -143,11 +156,11 @@ struct RGB2YCrCb_f<float>
         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
 
         int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2);
         v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4);
         v_float32 vdelta = vx_setall_f32(delta);
-        const int vsize = v_float32::nlanes;
+        const int vsize = VTraits<v_float32>::vlanes();
         for( ; i <= n-vsize;
              i += vsize, src += vsize*scn, dst += vsize*3)
         {
@@ -162,13 +175,13 @@ struct RGB2YCrCb_f<float>
             }
 
             v_float32 y, cr, cb;
-            y = v_fma(b, vc0, v_fma(g, vc1, r*vc2));
+            y = v_fma(b, vc0, v_fma(g, vc1, v_mul(r, vc2)));
 
             if(bidx)
-                std::swap(r, b);
+                swap(r, b);
 
-            cr = v_fma(r - y, vc3, vdelta);
-            cb = v_fma(b - y, vc4, vdelta);
+            cr = v_fma(v_sub(r, y), vc3, vdelta);
+            cb = v_fma(v_sub(b, y), vc4, vdelta);
 
             if(yuvOrder)
             {
@@ -266,8 +279,8 @@ struct RGB2YCrCb_i<ushort>
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
         int sdelta = ColorChannel<ushort>::half()*(1 << shift);
         int i = 0;
-#if CV_SIMD
-        const int vsize = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint16>::vlanes();
         const int descale = 1 << (shift-1);
 
         v_int16 b2y = vx_setall_s16((short)C0);
@@ -312,13 +325,13 @@ struct RGB2YCrCb_i<ushort>
 
             // fixing 16bit signed multiplication
             v_int16 mr, mg, mb;
-            mr = (sr < z) & r2y;
-            mg = (sg < z) & g2y;
-            mb = (sb < z) & b2y;
-            v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift;
+            mr = v_and(v_lt(sr, z), r2y);
+            mg = v_and(v_lt(sg, z), g2y);
+            mb = v_and(v_lt(sb, z), b2y);
+            v_int16 fixmul = v_shl(v_add_wrap(mr, v_add_wrap(mg, mb)), fix_shift);
 
-            v_int32 ssy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift;
-            v_int32 ssy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift;
+            v_int32 ssy0 = v_shr(v_add(v_dotprod(bg0, bg2y), v_dotprod(rd0, r12y)), shift);
+            v_int32 ssy1 = v_shr(v_add(v_dotprod(bg1, bg2y), v_dotprod(rd1, r12y)), shift);
 
             y = v_reinterpret_as_u16(v_add_wrap(v_pack(ssy0, ssy1), fixmul));
 
@@ -340,15 +353,15 @@ struct RGB2YCrCb_i<ushort>
             v_int32 sy0 = v_reinterpret_as_s32(uy0);
             v_int32 sy1 = v_reinterpret_as_s32(uy1);
 
-            sr0 = sr0 - sy0; sr1 = sr1 - sy1;
-            sb0 = sb0 - sy0; sb1 = sb1 - sy1;
+            sr0 = v_sub(sr0, sy0); sr1 = v_sub(sr1, sy1);
+            sb0 = v_sub(sb0, sy0); sb1 = v_sub(sb1, sy1);
 
             v_int32 v_scr0, v_scr1, v_scb0, v_scb1;
 
-            v_scr0 = (sr0*vc3 + vdd) >> shift;
-            v_scr1 = (sr1*vc3 + vdd) >> shift;
-            v_scb0 = (sb0*vc4 + vdd) >> shift;
-            v_scb1 = (sb1*vc4 + vdd) >> shift;
+            v_scr0 = v_shr(v_add(v_mul(sr0, vc3), vdd), shift);
+            v_scr1 = v_shr(v_add(v_mul(sr1, vc3), vdd), shift);
+            v_scb0 = v_shr(v_add(v_mul(sb0, vc4), vdd), shift);
+            v_scb1 = v_shr(v_add(v_mul(sb1, vc4), vdd), shift);
 
             // saturate and pack
             cr = v_pack_u(v_scr0, v_scr1);
@@ -407,8 +420,8 @@ struct RGB2YCrCb_i<uchar>
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
         int delta = ColorChannel<uchar>::half()*(1 << shift);
 
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint8>::vlanes();
         const int descaleShift = 1 << (shift-1);
         v_int16 bg2y;
         v_int16 r12y;
@@ -458,10 +471,10 @@ struct RGB2YCrCb_i<uchar>
                 v_zip(sr0, vdescale, rd00, rd01);
                 v_zip(sr1, vdescale, rd10, rd11);
 
-                y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift;
-                y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift;
-                y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift;
-                y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift;
+                y00 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg00, bg2y), v_dotprod(rd00, r12y))), shift);
+                y01 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg01, bg2y), v_dotprod(rd01, r12y))), shift);
+                y10 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg10, bg2y), v_dotprod(rd10, r12y))), shift);
+                y11 = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(bg11, bg2y), v_dotprod(rd11, r12y))), shift);
             }
 
             v_uint16 y0, y1;
@@ -512,15 +525,15 @@ struct RGB2YCrCb_i<uchar>
 
             v_uint8 cr, cb;
 
-            cr00 = cr00 >> shift;
-            cr01 = cr01 >> shift;
-            cr10 = cr10 >> shift;
-            cr11 = cr11 >> shift;
+            cr00 = v_shr(cr00, shift);
+            cr01 = v_shr(cr01, shift);
+            cr10 = v_shr(cr10, shift);
+            cr11 = v_shr(cr11, shift);
 
-            cb00 = cb00 >> shift;
-            cb01 = cb01 >> shift;
-            cb10 = cb10 >> shift;
-            cb11 = cb11 >> shift;
+            cb00 = v_shr(cb00, shift);
+            cb01 = v_shr(cb01, shift);
+            cb10 = v_shr(cb10, shift);
+            cb11 = v_shr(cb11, shift);
 
             v_int16 cr0, cr1, cb0, cb1;
             cr0 = v_pack(cr00, cr01); cr1 = v_pack(cr10, cr11);
@@ -623,12 +636,12 @@ struct YCrCb2RGB_f<float>
         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
 
         int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1);
         v_float32 vc2 = vx_setall_f32(C2), vc3 = vx_setall_f32(C3);
         v_float32 vdelta = vx_setall_f32(delta);
         v_float32 valpha = vx_setall_f32(alpha);
-        const int vsize = v_float32::nlanes;
+        const int vsize = VTraits<v_float32>::vlanes();
         for( ; i <= n-vsize;
              i += vsize, src += vsize*3, dst += vsize*dcn)
         {
@@ -640,7 +653,7 @@ struct YCrCb2RGB_f<float>
 
             v_float32 b, g, r;
 
-            cb -= vdelta; cr -= vdelta;
+            cb = v_sub(cb, vdelta); cr = v_sub(cr, vdelta);
             b = v_fma(cb, vc3, y);
             g = v_fma(cr, vc1, v_fma(cb, vc2, y));
             r = v_fma(cr, vc0, y);
@@ -746,8 +759,8 @@ struct YCrCb2RGB_i<uchar>
         const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
 
-#if CV_SIMD
-        const int vsize = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint8>::vlanes();
         v_uint8 valpha = vx_setall_u8(alpha);
         v_uint8 vdelta = vx_setall_u8(delta);
         const int descaleShift = 1 << (shift - 1);
@@ -794,8 +807,8 @@ struct YCrCb2RGB_i<uchar>
                 v_int32 cb00, cb01, cb10, cb11;
                 v_expand(v_scb0, cb00, cb01);
                 v_expand(v_scb1, cb10, cb11);
-                b00 += cb00 << 15; b01 += cb01 << 15;
-                b10 += cb10 << 15; b11 += cb11 << 15;
+                b00 = v_add(b00, v_shl<15>(cb00)); b01 = v_add(b01, v_shl<15>(cb01));
+                b10 = v_add(b10, v_shl<15>(cb10)); b11 = v_add(b11, v_shl<15>(cb11));
             }
 
             v_int32 t00, t01, t10, t11;
@@ -803,17 +816,17 @@ struct YCrCb2RGB_i<uchar>
             v_mul_expand(v_scb1, vc2, t10, t11);
             v_mul_expand(v_scr0, vc1, g00, g01);
             v_mul_expand(v_scr1, vc1, g10, g11);
-            g00 += t00; g01 += t01;
-            g10 += t10; g11 += t11;
+            g00 = v_add(g00, t00); g01 = v_add(g01, t01);
+            g10 = v_add(g10, t10); g11 = v_add(g11, t11);
             v_mul_expand(v_scr0, vc0, r00, r01);
             v_mul_expand(v_scr1, vc0, r10, r11);
 
-            b00 = (b00 + vdescale) >> shift; b01 = (b01 + vdescale) >> shift;
-            b10 = (b10 + vdescale) >> shift; b11 = (b11 + vdescale) >> shift;
-            g00 = (g00 + vdescale) >> shift; g01 = (g01 + vdescale) >> shift;
-            g10 = (g10 + vdescale) >> shift; g11 = (g11 + vdescale) >> shift;
-            r00 = (r00 + vdescale) >> shift; r01 = (r01 + vdescale) >> shift;
-            r10 = (r10 + vdescale) >> shift; r11 = (r11 + vdescale) >> shift;
+            b00 = v_shr(v_add(b00, vdescale), shift); b01 = v_shr(v_add(b01, vdescale), shift);
+            b10 = v_shr(v_add(b10, vdescale), shift); b11 = v_shr(v_add(b11, vdescale), shift);
+            g00 = v_shr(v_add(g00, vdescale), shift); g01 = v_shr(v_add(g01, vdescale), shift);
+            g10 = v_shr(v_add(g10, vdescale), shift); g11 = v_shr(v_add(g11, vdescale), shift);
+            r00 = v_shr(v_add(r00, vdescale), shift); r01 = v_shr(v_add(r01, vdescale), shift);
+            r10 = v_shr(v_add(r10, vdescale), shift); r11 = v_shr(v_add(r11, vdescale), shift);
 
             v_int16 b0, b1, g0, g1, r0, r1;
             b0 = v_pack(b00, b01); b1 = v_pack(b10, b11);
@@ -897,8 +910,8 @@ struct YCrCb2RGB_i<ushort>
         const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
 
-#if CV_SIMD
-        const int vsize = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int vsize = VTraits<v_uint16>::vlanes();
         const int descaleShift = 1 << (shift-1);
         v_uint16 valpha = vx_setall_u16(alpha);
         v_uint16 vdelta = vx_setall_u16(delta);
@@ -939,22 +952,22 @@ struct YCrCb2RGB_i<ushort>
                 // so we fix the multiplication
                 v_int32 cb0, cb1;
                 v_expand(scb, cb0, cb1);
-                b0 += cb0 << 15;
-                b1 += cb1 << 15;
+                b0 = v_add(b0, v_shl<15>(cb0));
+                b1 = v_add(b1, v_shl<15>(cb1));
             }
             v_int32 t0, t1;
             v_mul_expand(scb, vc2, t0, t1);
             v_mul_expand(scr, vc1, g0, g1);
-            g0 += t0; g1 += t1;
+            g0 = v_add(g0, t0); g1 = v_add(g1, t1);
             v_mul_expand(scr, vc0, r0, r1);
 
             // shifted term doesn't fit into 16 bits, addition is to be done in 32 bits
-            b0 = ((b0 + vdescale) >> shift) + y0;
-            b1 = ((b1 + vdescale) >> shift) + y1;
-            g0 = ((g0 + vdescale) >> shift) + y0;
-            g1 = ((g1 + vdescale) >> shift) + y1;
-            r0 = ((r0 + vdescale) >> shift) + y0;
-            r1 = ((r1 + vdescale) >> shift) + y1;
+            b0 = v_add(v_shr(v_add(b0, vdescale), shift), y0);
+            b1 = v_add(v_shr(v_add(b1, vdescale), shift), y1);
+            g0 = v_add(v_shr(v_add(g0, vdescale), shift), y0);
+            g1 = v_add(v_shr(v_add(g1, vdescale), shift), y1);
+            r0 = v_add(v_shr(v_add(r0, vdescale), shift), y0);
+            r1 = v_add(v_shr(v_add(r1, vdescale), shift), y1);
 
             // saturate and pack
             v_uint16 b, g, r;
@@ -1002,7 +1015,7 @@ struct YCrCb2RGB_i<ushort>
 
 ///////////////////////////////////// YUV420 -> RGB /////////////////////////////////////
 
-static const int ITUR_BT_601_CY = 1220542;
+static const int ITUR_BT_601_CY  = 1220542;
 static const int ITUR_BT_601_CUB = 2116026;
 static const int ITUR_BT_601_CUG = -409993;
 static const int ITUR_BT_601_CVG = -852492;
@@ -1010,14 +1023,14 @@ static const int ITUR_BT_601_CVR = 1673527;
 static const int ITUR_BT_601_SHIFT = 20;
 
 // Coefficients for RGB to YUV420p conversion
-static const int ITUR_BT_601_CRY =  269484;
-static const int ITUR_BT_601_CGY =  528482;
-static const int ITUR_BT_601_CBY =  102760;
-static const int ITUR_BT_601_CRU = -155188;
-static const int ITUR_BT_601_CGU = -305135;
-static const int ITUR_BT_601_CBU =  460324;
-static const int ITUR_BT_601_CGV = -385875;
-static const int ITUR_BT_601_CBV = -74448;
+static const int ITUR_BT_601_CRY =  269484; // 0.299055 * (236-16)/256 * (1 << ITUR_BT_601_SHIFT)
+static const int ITUR_BT_601_CGY =  528482; // 0.586472 * (236-16)/256 * (1 << ITUR_BT_601_SHIFT)
+static const int ITUR_BT_601_CBY =  102760; // 0.114035 * (236-16)/256 * (1 << ITUR_BT_601_SHIFT)
+static const int ITUR_BT_601_CRU = -155188; // -0.148 * (1 << (ITUR_BT_601_SHIFT-1))
+static const int ITUR_BT_601_CGU = -305135; // -0.291 * (1 << (ITUR_BT_601_SHIFT-1))
+static const int ITUR_BT_601_CBU =  460324; //  0.439 * (1 << (ITUR_BT_601_SHIFT-1))
+static const int ITUR_BT_601_CGV = -385875; // -0.368 * (1 << (ITUR_BT_601_SHIFT-1))
+static const int ITUR_BT_601_CBV =  -74448; // -0.071 * (1 << (ITUR_BT_601_SHIFT-1))
 
 //R = 1.164(Y - 16) + 1.596(V - 128)
 //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
@@ -1038,11 +1051,11 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i
     buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
 }
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
-                             v_int32 (&ruv)[4],
-                             v_int32 (&guv)[4],
-                             v_int32 (&buv)[4])
+                             v_int32 &ruv0, v_int32 &ruv1, v_int32 &ruv2, v_int32 &ruv3,
+                             v_int32 &guv0, v_int32 &guv1, v_int32 &guv2, v_int32 &guv3,
+                             v_int32 &buv0, v_int32 &buv1, v_int32 &buv2, v_int32 &buv3)
 {
     v_uint8 v128 = vx_setall_u8(128);
     v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128));
@@ -1051,9 +1064,10 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
     v_int16 uu0, uu1, vv0, vv1;
     v_expand(su, uu0, uu1);
     v_expand(sv, vv0, vv1);
-    v_int32 uu[4], vv[4];
-    v_expand(uu0, uu[0], uu[1]); v_expand(uu1, uu[2], uu[3]);
-    v_expand(vv0, vv[0], vv[1]); v_expand(vv1, vv[2], vv[3]);
+    v_int32 uuu0, uuu1, uuu2, uuu3;
+    v_int32 vvv0, vvv1, vvv2, vvv3;
+    v_expand(uu0, uuu0, uuu1); v_expand(uu1, uuu2, uuu3);
+    v_expand(vv0, vvv0, vvv1); v_expand(vv1, vvv2, vvv3);
 
     v_int32 vshift = vx_setall_s32(1 << (ITUR_BT_601_SHIFT - 1));
     v_int32 vr = vx_setall_s32(ITUR_BT_601_CVR);
@@ -1061,12 +1075,15 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
     v_int32 ug = vx_setall_s32(ITUR_BT_601_CUG);
     v_int32 ub = vx_setall_s32(ITUR_BT_601_CUB);
 
-    for (int k = 0; k < 4; k++)
-    {
-        ruv[k] = vshift + vr * vv[k];
-        guv[k] = vshift + vg * vv[k] + ug * uu[k];
-        buv[k] = vshift + ub * uu[k];
-    }
+    auto process_uv = [&](v_int32& ruv, v_int32& guv, v_int32& buv, const v_int32& vv, const v_int32& uu) {
+        ruv = v_add(vshift, v_mul(vr, vv));
+        guv = v_add(v_add(vshift, v_mul(vg, vv)), v_mul(ug, uu));
+        buv = v_add(vshift, v_mul(ub, uu));
+    };
+    process_uv(ruv0, guv0, buv0, vvv0, uuu0);
+    process_uv(ruv1, guv1, buv1, vvv1, uuu1);
+    process_uv(ruv2, guv2, buv2, vvv2, uuu2);
+    process_uv(ruv3, guv3, buv3, vvv3, uuu3);
 }
 #endif
 
@@ -1081,44 +1098,48 @@ static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, co
     a = uchar(0xff);
 }
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 static inline void yRGBuvToRGBA(const v_uint8& vy,
-                                const v_int32 (&ruv)[4],
-                                const v_int32 (&guv)[4],
-                                const v_int32 (&buv)[4],
+                                const v_int32 &ruv0, const v_int32 &ruv1, const v_int32 &ruv2, const v_int32 &ruv3,
+                                const v_int32 &guv0, const v_int32 &guv1, const v_int32 &guv2, const v_int32 &guv3,
+                                const v_int32 &buv0, const v_int32 &buv1, const v_int32 &buv2, const v_int32 &buv3,
                                 v_uint8& rr, v_uint8& gg, v_uint8& bb)
 {
     v_uint8 v16 = vx_setall_u8(16);
-    v_uint8 posY = vy - v16;
+    v_uint8 posY = v_sub(vy, v16);
     v_uint16 yy0, yy1;
     v_expand(posY, yy0, yy1);
-    v_int32 yy[4];
-    v_int32 yy00, yy01, yy10, yy11;
-    v_expand(v_reinterpret_as_s16(yy0), yy[0], yy[1]);
-    v_expand(v_reinterpret_as_s16(yy1), yy[2], yy[3]);
+    v_int32 yyy0, yyy1, yyy2, yyy3;
+    v_expand(v_reinterpret_as_s16(yy0), yyy0, yyy1);
+    v_expand(v_reinterpret_as_s16(yy1), yyy2, yyy3);
 
     v_int32 vcy = vx_setall_s32(ITUR_BT_601_CY);
 
-    v_int32 y[4], r[4], g[4], b[4];
-    for(int k = 0; k < 4; k++)
-    {
-        y[k] = yy[k]*vcy;
-        r[k] = (y[k] + ruv[k]) >> ITUR_BT_601_SHIFT;
-        g[k] = (y[k] + guv[k]) >> ITUR_BT_601_SHIFT;
-        b[k] = (y[k] + buv[k]) >> ITUR_BT_601_SHIFT;
-    }
+    v_int32 y0, y1, y2, y3, r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3;
 
-    v_int16 r0, r1, g0, g1, b0, b1;
-    r0 = v_pack(r[0], r[1]);
-    r1 = v_pack(r[2], r[3]);
-    g0 = v_pack(g[0], g[1]);
-    g1 = v_pack(g[2], g[3]);
-    b0 = v_pack(b[0], b[1]);
-    b1 = v_pack(b[2], b[3]);
-
-    rr = v_pack_u(r0, r1);
-    gg = v_pack_u(g0, g1);
-    bb = v_pack_u(b0, b1);
+    auto process_yrgb = [&](const v_int32& yy, v_int32& y, v_int32& r, v_int32& g, v_int32& b,
+                            const v_int32& ruv, const v_int32& guv, const v_int32& buv) {
+        y = v_mul(yy, vcy);
+        r = v_shr(v_add(y, ruv), ITUR_BT_601_SHIFT);
+        g = v_shr(v_add(y, guv), ITUR_BT_601_SHIFT);
+        b = v_shr(v_add(y, buv), ITUR_BT_601_SHIFT);
+    };
+    process_yrgb(yyy0, y0, r0, g0, b0, ruv0, guv0, buv0);
+    process_yrgb(yyy1, y1, r1, g1, b1, ruv1, guv1, buv1);
+    process_yrgb(yyy2, y2, r2, g2, b2, ruv2, guv2, buv2);
+    process_yrgb(yyy3, y3, r3, g3, b3, ruv3, guv3, buv3);
+
+    v_int16 _r0, _r1, _g0, _g1, _b0, _b1;
+    _r0 = v_pack(r0, r1);
+    _r1 = v_pack(r2, r3);
+    _g0 = v_pack(g0, g1);
+    _g1 = v_pack(g2, g3);
+    _b0 = v_pack(b0, b1);
+    _b1 = v_pack(b2, b3);
+
+    rr = v_pack_u(_r0, _r1);
+    gg = v_pack_u(_g0, _g1);
+    bb = v_pack_u(_b0, _b1);
 }
 #endif
 
@@ -1201,8 +1222,8 @@ struct YUV420sp2RGB8Invoker : ParallelLoopBody
             const uchar* y2 = y1 + my1_step;
 
             int i = 0;
-#if CV_SIMD
-            const int vsize = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            const int vsize = VTraits<v_uint8>::vlanes();
             v_uint8 a = vx_setall_u8(uchar(0xff));
             for( ; i <= width - 2*vsize;
                  i += 2*vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2)
@@ -1215,36 +1236,50 @@ struct YUV420sp2RGB8Invoker : ParallelLoopBody
                     swap(u, v);
                 }
 
-                v_uint8 vy[4];
-                v_load_deinterleave(y1 + i, vy[0], vy[1]);
-                v_load_deinterleave(y2 + i, vy[2], vy[3]);
-
-                v_int32 ruv[4], guv[4], buv[4];
-                uvToRGBuv(u, v, ruv, guv, buv);
-
-                v_uint8 r[4], g[4], b[4];
-
-                for(int k = 0; k < 4; k++)
-                {
-                    yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]);
-                }
+                v_uint8 vy0, vy1, vy2, vy3;
+                v_load_deinterleave(y1 + i, vy0, vy1);
+                v_load_deinterleave(y2 + i, vy2, vy3);
+
+                v_int32 ruv0, ruv1, ruv2, ruv3,
+                        guv0, guv1, guv2, guv3,
+                        buv0, buv1, buv2, buv3;
+                uvToRGBuv(u, v,
+                        ruv0, ruv1, ruv2, ruv3,
+                        guv0, guv1, guv2, guv3,
+                        buv0, buv1, buv2, buv3);
+
+                v_uint8 r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3;
+
+                auto call_yRGBuvToRGBA = [&](const v_uint8& vy, v_uint8& r, v_uint8& g, v_uint8& b) {
+                    yRGBuvToRGBA(vy,
+                        ruv0, ruv1, ruv2, ruv3,
+                        guv0, guv1, guv2, guv3,
+                        buv0, buv1, buv2, buv3,
+                        r, g, b);
+                };
+                call_yRGBuvToRGBA(vy0, r0, g0, b0);
+                call_yRGBuvToRGBA(vy1, r1, g1, b1);
+                call_yRGBuvToRGBA(vy2, r2, g2, b2);
+                call_yRGBuvToRGBA(vy3, r3, g3, b3);
 
                 if(bIdx)
                 {
-                    for(int k = 0; k < 4; k++)
-                        swap(r[k], b[k]);
+                    swap(r0, b0);
+                    swap(r1, b1);
+                    swap(r2, b2);
+                    swap(r3, b3);
                 }
 
                 // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
                 v_uint8 r0_0, r0_1, r1_0, r1_1;
-                v_zip(r[0], r[1], r0_0, r0_1);
-                v_zip(r[2], r[3], r1_0, r1_1);
+                v_zip(r0, r1, r0_0, r0_1);
+                v_zip(r2, r3, r1_0, r1_1);
                 v_uint8 g0_0, g0_1, g1_0, g1_1;
-                v_zip(g[0], g[1], g0_0, g0_1);
-                v_zip(g[2], g[3], g1_0, g1_1);
+                v_zip(g0, g1, g0_0, g0_1);
+                v_zip(g2, g3, g1_0, g1_1);
                 v_uint8 b0_0, b0_1, b1_0, b1_1;
-                v_zip(b[0], b[1], b0_0, b0_1);
-                v_zip(b[2], b[3], b1_0, b1_1);
+                v_zip(b0, b1, b0_0, b0_1);
+                v_zip(b2, b3, b1_0, b1_1);
 
                 if(dcn == 4)
                 {
@@ -1319,8 +1354,8 @@ struct YUV420p2RGB8Invoker : ParallelLoopBody
             const uchar* y2 = y1 + stride;
             int i = 0;
 
-#if CV_SIMD
-            const int vsize = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            const int vsize = VTraits<v_uint8>::vlanes();
             v_uint8 a = vx_setall_u8(uchar(0xff));
             for( ; i <= width/2 - vsize;
                  i += vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2)
@@ -1329,36 +1364,50 @@ struct YUV420p2RGB8Invoker : ParallelLoopBody
                 u = vx_load(u1 + i);
                 v = vx_load(v1 + i);
 
-                v_uint8 vy[4];
-                v_load_deinterleave(y1 + 2*i, vy[0], vy[1]);
-                v_load_deinterleave(y2 + 2*i, vy[2], vy[3]);
-
-                v_int32 ruv[4], guv[4], buv[4];
-                uvToRGBuv(u, v, ruv, guv, buv);
-
-                v_uint8 r[4], g[4], b[4];
-
-                for(int k = 0; k < 4; k++)
-                {
-                    yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]);
-                }
+                v_uint8 vy0, vy1, vy2, vy3;
+                v_load_deinterleave(y1 + 2*i, vy0, vy1);
+                v_load_deinterleave(y2 + 2*i, vy2, vy3);
+
+                v_int32 ruv0, ruv1, ruv2, ruv3,
+                        guv0, guv1, guv2, guv3,
+                        buv0, buv1, buv2, buv3;
+                uvToRGBuv(u, v,
+                        ruv0, ruv1, ruv2, ruv3,
+                        guv0, guv1, guv2, guv3,
+                        buv0, buv1, buv2, buv3);
+
+                v_uint8 r0, r1, r2, r3, g0, g1, g2, g3, b0, b1, b2, b3;
+
+                auto call_yRGBuvToRGBA = [&](const v_uint8& vy, v_uint8& r, v_uint8& g, v_uint8& b) {
+                    yRGBuvToRGBA(vy,
+                        ruv0, ruv1, ruv2, ruv3,
+                        guv0, guv1, guv2, guv3,
+                        buv0, buv1, buv2, buv3,
+                        r, g, b);
+                };
+                call_yRGBuvToRGBA(vy0, r0, g0, b0);
+                call_yRGBuvToRGBA(vy1, r1, g1, b1);
+                call_yRGBuvToRGBA(vy2, r2, g2, b2);
+                call_yRGBuvToRGBA(vy3, r3, g3, b3);
 
                 if(bIdx)
                 {
-                    for(int k = 0; k < 4; k++)
-                        swap(r[k], b[k]);
+                    swap(r0, b0);
+                    swap(r1, b1);
+                    swap(r2, b2);
+                    swap(r3, b3);
                 }
 
                 // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
                 v_uint8 r0_0, r0_1, r1_0, r1_1;
-                v_zip(r[0], r[1], r0_0, r0_1);
-                v_zip(r[2], r[3], r1_0, r1_1);
+                v_zip(r0, r1, r0_0, r0_1);
+                v_zip(r2, r3, r1_0, r1_1);
                 v_uint8 g0_0, g0_1, g1_0, g1_1;
-                v_zip(g[0], g[1], g0_0, g0_1);
-                v_zip(g[2], g[3], g1_0, g1_1);
+                v_zip(g0, g1, g0_0, g0_1);
+                v_zip(g2, g3, g1_0, g1_1);
                 v_uint8 b0_0, b0_1, b1_0, b1_1;
-                v_zip(b[0], b[1], b0_0, b0_1);
-                v_zip(b[2], b[3], b1_0, b1_1);
+                v_zip(b0, b1, b0_0, b0_1);
+                v_zip(b2, b3, b1_0, b1_1);
 
                 if(dcn == 4)
                 {
@@ -1430,7 +1479,7 @@ static inline uchar rgbToY42x(uchar r, uchar g, uchar b)
     return saturate_cast<uchar>(yy >> ITUR_BT_601_SHIFT);
 }
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b)
 {
     const int shifted16 = (16 << ITUR_BT_601_SHIFT);
@@ -1440,25 +1489,25 @@ static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint
     v_expand(g, g0, g1);
     v_expand(b, b0, b1);
 
-    v_uint32 rq[4], gq[4], bq[4];
-    v_expand(r0, rq[0], rq[1]); v_expand(r1, rq[2], rq[3]);
-    v_expand(g0, gq[0], gq[1]); v_expand(g1, gq[2], gq[3]);
-    v_expand(b0, bq[0], bq[1]); v_expand(b1, bq[2], bq[3]);
+    v_uint32 rq0, rq1, rq2, rq3, gq0, gq1, gq2, gq3, bq0, bq1, bq2, bq3;
+    v_expand(r0, rq0, rq1); v_expand(r1, rq2, rq3);
+    v_expand(g0, gq0, gq1); v_expand(g1, gq2, gq3);
+    v_expand(b0, bq0, bq1); v_expand(b1, bq2, bq3);
 
     v_uint32 ry = vx_setall_u32(ITUR_BT_601_CRY), gy = vx_setall_u32(ITUR_BT_601_CGY);
     v_uint32 by = vx_setall_u32(ITUR_BT_601_CBY), shift = vx_setall_u32(halfShift + shifted16);
 
-    v_uint32 y[4];
-    for(int k = 0; k < 4; k++)
-    {
-        y[k] = (rq[k]*ry + gq[k]*gy + bq[k]*by + shift) >> ITUR_BT_601_SHIFT;
-    }
+    v_uint32 y0, y1, y2, y3;
+    y0 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(rq0, ry), v_mul(gq0, gy)), v_mul(bq0, by)), shift));
+    y1 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(rq1, ry), v_mul(gq1, gy)), v_mul(bq1, by)), shift));
+    y2 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(rq2, ry), v_mul(gq2, gy)), v_mul(bq2, by)), shift));
+    y3 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(rq3, ry), v_mul(gq3, gy)), v_mul(bq3, by)), shift));
 
-    v_uint16 y0, y1;
-    y0 = v_pack(y[0], y[1]);
-    y1 = v_pack(y[2], y[3]);
+    v_uint16 _y0, _y1;
+    _y0 = v_pack(y0, y1);
+    _y1 = v_pack(y2, y3);
 
-    return v_pack(y0, y1);
+    return v_pack(_y0, _y1);
 }
 #endif
 
@@ -1473,27 +1522,27 @@ static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v)
     v = saturate_cast<uchar>(vv >> ITUR_BT_601_SHIFT);
 }
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1,
                               const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v)
 {
     // [r0, r1, r2, r3,..] => [r0, 0, r2, 0,..]
     v_int16 vlowByte = vx_setall_s16(0x00ff);
     v_int16 rd0, rd1, gd0, gd1, bd0, bd1;
-    rd0 = v_reinterpret_as_s16(r0) & vlowByte;
-    rd1 = v_reinterpret_as_s16(r1) & vlowByte;
-    gd0 = v_reinterpret_as_s16(g0) & vlowByte;
-    gd1 = v_reinterpret_as_s16(g1) & vlowByte;
-    bd0 = v_reinterpret_as_s16(b0) & vlowByte;
-    bd1 = v_reinterpret_as_s16(b1) & vlowByte;
-
-    v_int32 rq[4], gq[4], bq[4];
-    v_expand(rd0, rq[0], rq[1]);
-    v_expand(rd1, rq[2], rq[3]);
-    v_expand(gd0, gq[0], gq[1]);
-    v_expand(gd1, gq[2], gq[3]);
-    v_expand(bd0, bq[0], bq[1]);
-    v_expand(bd1, bq[2], bq[3]);
+    rd0 = v_and(v_reinterpret_as_s16(r0), vlowByte);
+    rd1 = v_and(v_reinterpret_as_s16(r1), vlowByte);
+    gd0 = v_and(v_reinterpret_as_s16(g0), vlowByte);
+    gd1 = v_and(v_reinterpret_as_s16(g1), vlowByte);
+    bd0 = v_and(v_reinterpret_as_s16(b0), vlowByte);
+    bd1 = v_and(v_reinterpret_as_s16(b1), vlowByte);
+
+    v_int32 rq0, rq1, rq2, rq3, gq0, gq1, gq2, gq3, bq0, bq1, bq2, bq3;
+    v_expand(rd0, rq0, rq1);
+    v_expand(rd1, rq2, rq3);
+    v_expand(gd0, gq0, gq1);
+    v_expand(gd1, gq2, gq3);
+    v_expand(bd0, bq0, bq1);
+    v_expand(bd1, bq2, bq3);
 
     const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
     const int shifted128 = (128 << ITUR_BT_601_SHIFT);
@@ -1505,18 +1554,21 @@ static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint
     bu = vx_setall_s32(ITUR_BT_601_CBU);
     bv = vx_setall_s32(ITUR_BT_601_CBV);
 
-    v_int32 uq[4], vq[4];
-    for(int k = 0; k < 4; k++)
-    {
-        uq[k] = (ru*rq[k] + gu*gq[k] + bu*bq[k] + shift) >> ITUR_BT_601_SHIFT;
-        vq[k] = (bu*rq[k] + gv*gq[k] + bv*bq[k] + shift) >> ITUR_BT_601_SHIFT;
-    }
+    v_int32 uq0, uq1, uq2, uq3, vq0, vq1, vq2, vq3;
+    uq0 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(ru, rq0), v_mul(gu, gq0)), v_mul(bu, bq0)), shift));
+    vq0 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(bu, rq0), v_mul(gv, gq0)), v_mul(bv, bq0)), shift));
+    uq1 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(ru, rq1), v_mul(gu, gq1)), v_mul(bu, bq1)), shift));
+    vq1 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(bu, rq1), v_mul(gv, gq1)), v_mul(bv, bq1)), shift));
+    uq2 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(ru, rq2), v_mul(gu, gq2)), v_mul(bu, bq2)), shift));
+    vq2 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(bu, rq2), v_mul(gv, gq2)), v_mul(bv, bq2)), shift));
+    uq3 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(ru, rq3), v_mul(gu, gq3)), v_mul(bu, bq3)), shift));
+    vq3 = v_shr<ITUR_BT_601_SHIFT>(v_add(v_add(v_add(v_mul(bu, rq3), v_mul(gv, gq3)), v_mul(bv, bq3)), shift));
 
     v_int16 u0, u1, v0, v1;
-    u0 = v_pack(uq[0], uq[1]);
-    u1 = v_pack(uq[2], uq[3]);
-    v0 = v_pack(vq[0], vq[1]);
-    v1 = v_pack(vq[2], vq[3]);
+    u0 = v_pack(uq0, uq1);
+    u1 = v_pack(uq2, uq3);
+    v0 = v_pack(vq0, vq1);
+    v1 = v_pack(vq2, vq3);
 
     u = v_pack_u(u0, u1);
     v = v_pack_u(v0, v1);
@@ -1559,8 +1611,8 @@ struct RGB8toYUV420pInvoker: public ParallelLoopBody
                 }
             }
             int i = 0;
-#if CV_SIMD
-            const int vsize = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            const int vsize = VTraits<v_uint8>::vlanes();
 
             for( ; i <= w/2 - vsize;
                  i += vsize)
@@ -1708,47 +1760,61 @@ struct YUV422toRGB8Invoker : ParallelLoopBody
         {
             uchar* row = dst_data + dst_step * j;
             int i = 0;
-#if CV_SIMD
-            const int vsize = v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            const int vsize = VTraits<v_uint8>::vlanes();
             v_uint8 a = vx_setall_u8(uchar(0xff));
             for(; i <= 2*width - 4*vsize;
                 i += 4*vsize, row += vsize*dcn*2)
             {
-                v_uint8 u, v, vy[2];
+                v_uint8 u, v, vy0, vy1;
                 if(yIdx == 1) // UYVY
                 {
-                    v_load_deinterleave(yuv_src + i, u, vy[0], v, vy[1]);
+                    v_load_deinterleave(yuv_src + i, u, vy0, v, vy1);
                 }
                 else // YUYV or YVYU
                 {
-                    v_load_deinterleave(yuv_src + i, vy[0], u, vy[1], v);
+                    v_load_deinterleave(yuv_src + i, vy0, u, vy1, v);
                     if(uIdx == 1) // YVYU
                     {
                         swap(u, v);
                     }
                 }
 
-                v_int32 ruv[4], guv[4], buv[4];
-                uvToRGBuv(u, v, ruv, guv, buv);
+                v_int32 ruv0, ruv1, ruv2, ruv3,
+                        guv0, guv1, guv2, guv3,
+                        buv0, buv1, buv2, buv3;
+                uvToRGBuv(u, v,
+                        ruv0, ruv1, ruv2, ruv3,
+                        guv0, guv1, guv2, guv3,
+                        buv0, buv1, buv2, buv3);
+
+                v_uint8 r0, r1, g0, g1, b0, b1;
 
-                v_uint8 r[2], g[2], b[2];
 
-                yRGBuvToRGBA(vy[0], ruv, guv, buv, r[0], g[0], b[0]);
-                yRGBuvToRGBA(vy[1], ruv, guv, buv, r[1], g[1], b[1]);
+                yRGBuvToRGBA(vy0,
+                        ruv0, ruv1, ruv2, ruv3,
+                        guv0, guv1, guv2, guv3,
+                        buv0, buv1, buv2, buv3,
+                        r0, g0, b0);
+                yRGBuvToRGBA(vy1,
+                        ruv0, ruv1, ruv2, ruv3,
+                        guv0, guv1, guv2, guv3,
+                        buv0, buv1, buv2, buv3,
+                        r1, g1, b1);
 
                 if(bIdx)
                 {
-                    swap(r[0], b[0]);
-                    swap(r[1], b[1]);
+                    swap(r0, b0);
+                    swap(r1, b1);
                 }
 
                 // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...]
                 v_uint8 r0_0, r0_1;
-                v_zip(r[0], r[1], r0_0, r0_1);
+                v_zip(r0, r1, r0_0, r0_1);
                 v_uint8 g0_0, g0_1;
-                v_zip(g[0], g[1], g0_0, g0_1);
+                v_zip(g0, g1, g0_0, g0_1);
                 v_uint8 b0_0, b0_1;
-                v_zip(b[0], b[1], b0_0, b0_1);
+                v_zip(b0, b1, b0_0, b0_1);
 
                 if(dcn == 4)
                 {
@@ -1790,6 +1856,119 @@ inline void cvtYUV422toRGB(uchar * dst_data, size_t dst_step, const uchar * src_
         converter(Range(0, height));
 }
 
+
+///////////////////////////////////// RGB -> YUV422 /////////////////////////////////////
+
+static const int RGB2YUV422_SHIFT = 14;
+
+// Coefficients based on ITU.BT-601, ISBN 1-878707-09-4 (https://fourcc.org/fccyvrgb.php)
+// The conversion coefficients for RGB to YUV422 are based on the ones for RGB to YUV.
+// For both Y components, the coefficients are applied as given in the link to each input RGB pixel
+// separately. For U and V, they are reduced by half to account for two RGB pixels contributing
+// to the same U and V values. In other words, the U and V contributions from the two RGB pixels
+// are averaged. The integer versions are obtained by multiplying the float versions by 16384
+// and rounding to the nearest integer so that resulting values are in these bounds:
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+
+static const int R2Y422 =  4211; // 0.299077 * (236 - 16) / 256 * 16384
+static const int G2Y422 =  8258; // 0.586506 * (236 - 16) / 256 * 16384
+static const int B2Y422 =  1606; // 0.114062 * (236 - 16) / 256 * 16384
+
+static const int R2U422 = -1212; // -0.148 * 8192
+static const int G2U422 = -2384; // -0.291 * 8192
+static const int B2U422 =  3596; //  0.439 * 8192
+static const int G2V422 = -3015; // -0.368 * 8192
+static const int B2V422 =  -582; // -0.071 * 8192
+
+static inline void RGB2Y(const uchar r, const uchar g, const uchar b, uchar& y)
+{
+    int y_ = r * R2Y422 + g * G2Y422 + b * B2Y422 + (1 << RGB2YUV422_SHIFT) * 16;
+    y = saturate_cast<uchar>(((1 << (RGB2YUV422_SHIFT-1)) + y_) >> RGB2YUV422_SHIFT);
+}
+
+static inline void RGB2UV(const uchar r1, const uchar g1, const uchar b1,
+                          const uchar r2, const uchar g2, const uchar b2,
+                          uchar& u, uchar& v)
+{
+    int sr = r1 + r2, sg = g1 + g2, sb = b1 + b2;
+
+    int u_ = sr * R2U422 + sg * G2U422 + sb * B2U422 + (1 << (RGB2YUV422_SHIFT-1)) * 256;
+    u = saturate_cast<uchar>(((1 << (RGB2YUV422_SHIFT-1)) + u_) >> RGB2YUV422_SHIFT);
+
+    int v_ = sr * B2U422 + sg * G2V422 + sb * B2V422 + (1 << (RGB2YUV422_SHIFT-1)) * 256;
+    v = saturate_cast<uchar>(((1 << (RGB2YUV422_SHIFT-1)) + v_) >> RGB2YUV422_SHIFT);
+}
+
+template<int yidx, int uidx, int vidx>
+static inline void cvtRGB82Yuv422(const uchar r1, const uchar g1, const uchar b1,
+                                    const uchar r2, const uchar g2, const uchar b2,
+                                    uchar* row)
+{
+    uchar &u = row[uidx], &v = row[vidx], &y1 = row[yidx], &y2 = row[yidx+2];
+
+    RGB2Y(r1, g1, b1, y1);
+    RGB2Y(r2, g2, b2, y2);
+
+    RGB2UV(r1, g1, b1, r2, g2, b2, u, v);
+}
+
+// bIdx is 0 or 2; [uIdx, yIdx] is [0, 0], [0, 1], [1, 0]; scn is 3 or 4
+template<int bIdx, int uIdx, int yIdx, int scn>
+struct RGB8toYUV422Invoker : ParallelLoopBody
+{
+    uchar * dst_data;
+    size_t dst_step;
+    const uchar * src_data;
+    size_t src_step;
+    int width;
+
+    RGB8toYUV422Invoker(uchar * _dst_data, size_t _dst_step,
+                        const uchar * _src_data, size_t _src_step,
+                        int _width)
+        : dst_data(_dst_data), dst_step(_dst_step), src_data(_src_data), src_step(_src_step), width(_width) {}
+
+    void operator()(const Range& range) const CV_OVERRIDE
+    {
+        int rangeBegin = range.start;
+        int rangeEnd = range.end;
+
+        // [yIdx, uIdx] | [uidx, vidx]:
+        //     0, 0     |     1, 3
+        //     0, 1     |     3, 1
+        //     1, 0     |     0, 2
+        const int uidx = 1 - yIdx + uIdx * 2;
+        const int vidx = (2 + uidx) % 4;
+        const int ridx = (2-bIdx);
+        const uchar* rgb_src = src_data + rangeBegin * (src_step);
+        const uchar* rgb_src2 = rgb_src+scn;
+
+        for (int j = rangeBegin; j < rangeEnd; j++, rgb_src += src_step, rgb_src2 = rgb_src+scn)
+        {
+            uchar* row = dst_data + (dst_step) * j;
+            int i = 0;
+            for (; i < scn * width; i += (scn << 1), row += 4)
+            {
+                const uchar r1 = rgb_src[i+ridx], g1 = rgb_src[i+1], b1 = rgb_src[i+bIdx];
+                const uchar r2 = rgb_src2[i+ridx], g2 = rgb_src2[i+1], b2 = rgb_src2[i+bIdx];
+
+                cvtRGB82Yuv422<yIdx, uidx, vidx>(r1, g1, b1, r2, g2, b2, row);
+            }
+        }
+    }
+};
+
+template<int bIdx, int uIdx, int yIdx, int scn>
+inline void cvtRGBtoYUV422(uchar * dst_data, size_t dst_step, const uchar * src_data, size_t src_step,
+                           int width, int height)
+{
+    RGB8toYUV422Invoker<bIdx, uIdx, yIdx, scn> converter(dst_data, dst_step, src_data, src_step, width);
+    if (width * height >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
+        parallel_for_(Range(0, height), converter);
+    else
+        converter(Range(0, height));
+}
+
+
 } // namespace anon
 
 
@@ -1826,14 +2005,17 @@ void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
         CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_f<float>(dcn, blueIdx, isCbCr));
 }
 
-typedef void (*cvt_2plane_yuv_ptr_t)(uchar * /* dst_data*/,
-                       size_t /* dst_step */,
-                       int /* dst_width */,
-                       int /* dst_height */,
-                       const uchar* /* _y1 */,
-                       size_t /* _y1_step */,
-                       const uchar* /* _uv */,
-                       size_t /* _uv_step */);
+// 4:2:0, two planes: Y, UV interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
+typedef void (*cvt_2plane_yuv_ptr_t)(uchar *      /* dst_data   */,
+                                     size_t       /* dst_step   */,
+                                     int          /* dst_width  */,
+                                     int          /* dst_height */,
+                                     const uchar* /* _y1        */,
+                                     size_t       /* _y1_step   */,
+                                     const uchar* /* _uv        */,
+                                     size_t       /* _uv_step   */);
 
 void cvtTwoPlaneYUVtoBGR(const uchar * y_data, size_t y_step, const uchar * uv_data, size_t uv_step,
                          uchar * dst_data, size_t dst_step,
@@ -1855,27 +2037,30 @@ void cvtTwoPlaneYUVtoBGR(const uchar * y_data, size_t y_step, const uchar * uv_d
     case 401: cvtPtr = cvtYUV420sp2RGB<0, 1, 4>; break;
     case 420: cvtPtr = cvtYUV420sp2RGB<2, 0, 4>; break;
     case 421: cvtPtr = cvtYUV420sp2RGB<2, 1, 4>; break;
-    default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
+    default: CV_Error( cv::Error::StsBadFlag, "Unknown/unsupported color conversion code" ); break;
     };
 
     cvtPtr(dst_data, dst_step, dst_width, dst_height, y_data, y_step, uv_data, uv_step);
 }
 
-typedef void (*cvt_3plane_yuv_ptr_t)(uchar * /* dst_data */,
-                                     size_t /* dst_step */,
-                                     int /* dst_width */,
-                                     int /* dst_height */,
-                                     size_t /* _stride */,
-                                     const uchar* /* _y1 */,
-                                     const uchar* /* _u */,
-                                     const uchar* /* _v */,
-                                     int /* ustepIdx */,
-                                     int /* vstepIdx */);
+// 4:2:0, three planes in one array: Y, U, V
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
+typedef void (*cvt_3plane_yuv_ptr_t)(uchar *      /* dst_data   */,
+                                     size_t       /* dst_step   */,
+                                     int          /* dst_width  */,
+                                     int          /* dst_height */,
+                                     size_t       /* _stride    */,
+                                     const uchar* /* _y1        */,
+                                     const uchar* /* _u         */,
+                                     const uchar* /* _v         */,
+                                     int          /* ustepIdx   */,
+                                     int          /* vstepIdx   */);
 
 void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
-                                  uchar * dst_data, size_t dst_step,
-                                  int dst_width, int dst_height,
-                                  int dcn, bool swapBlue, int uIdx)
+                                 uchar * dst_data, size_t dst_step,
+                                 int dst_width, int dst_height,
+                                 int dcn, bool swapBlue, int uIdx)
 {
     CV_INSTRUMENT_REGION();
 
@@ -1895,12 +2080,15 @@ void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
     case 32: cvtPtr = cvtYUV420p2RGB<2, 3>; break;
     case 40: cvtPtr = cvtYUV420p2RGB<0, 4>; break;
     case 42: cvtPtr = cvtYUV420p2RGB<2, 4>; break;
-    default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
+    default: CV_Error( cv::Error::StsBadFlag, "Unknown/unsupported color conversion code" ); break;
     };
 
     cvtPtr(dst_data, dst_step, dst_width, dst_height, src_step, src_data, u, v, ustepIdx, vstepIdx);
 }
 
+// 4:2:0, three planes in one array: Y, U, V
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
                            uchar * dst_data, size_t dst_step,
                            int width, int height,
@@ -1919,6 +2107,9 @@ void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
         cvt(Range(0, height/2));
 }
 
+// 4:2:0, two planes: Y, UV interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
 void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
                          uchar * y_data, uchar * uv_data, size_t dst_step,
                          int width, int height,
@@ -1935,12 +2126,15 @@ void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
         cvt(Range(0, height/2));
 }
 
-typedef void (*cvt_1plane_yuv_ptr_t)(uchar * /* dst_data */,
-                                     size_t /* dst_step */,
+// 4:2:2 interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 20-bit fixed-point arithmetics
+typedef void (*cvt_1plane_yuv_ptr_t)(uchar *       /* dst_data */,
+                                     size_t        /* dst_step */,
                                      const uchar * /* src_data */,
-                                     size_t /* src_step */,
-                                     int /* width */,
-                                     int /* height */);
+                                     size_t        /* src_step */,
+                                     int           /* width    */,
+                                     int           /* height   */);
 
 void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
                          uchar * dst_data, size_t dst_step,
@@ -1965,7 +2159,39 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
     case 4200: cvtPtr = cvtYUV422toRGB<2,0,0,4>; break;
     case 4201: cvtPtr = cvtYUV422toRGB<2,0,1,4>; break;
     case 4210: cvtPtr = cvtYUV422toRGB<2,1,0,4>; break;
-    default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
+    default: CV_Error( cv::Error::StsBadFlag, "Unknown/unsupported color conversion code" ); break;
+    };
+
+    cvtPtr(dst_data, dst_step, src_data, src_step, width, height);
+}
+
+// 4:2:2 interleaved
+// Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+// 14-bit fixed-point arithmetics is used
+void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step,
+                         uchar * dst_data, size_t dst_step,
+                         int width, int height,
+                         int scn, bool swapBlue, int uIdx, int ycn)
+{
+    CV_INSTRUMENT_REGION();
+
+    cvt_1plane_yuv_ptr_t cvtPtr;
+    int blueIdx = swapBlue ? 2 : 0;
+    switch(scn*1000 + blueIdx*100 + uIdx*10 + ycn)
+    {
+    case 3000: cvtPtr = cvtRGBtoYUV422<0,0,0,3>; break;
+    case 3001: cvtPtr = cvtRGBtoYUV422<0,0,1,3>; break;
+    case 3010: cvtPtr = cvtRGBtoYUV422<0,1,0,3>; break;
+    case 3200: cvtPtr = cvtRGBtoYUV422<2,0,0,3>; break;
+    case 3201: cvtPtr = cvtRGBtoYUV422<2,0,1,3>; break;
+    case 3210: cvtPtr = cvtRGBtoYUV422<2,1,0,3>; break;
+    case 4000: cvtPtr = cvtRGBtoYUV422<0,0,0,4>; break;
+    case 4001: cvtPtr = cvtRGBtoYUV422<0,0,1,4>; break;
+    case 4010: cvtPtr = cvtRGBtoYUV422<0,1,0,4>; break;
+    case 4200: cvtPtr = cvtRGBtoYUV422<2,0,0,4>; break;
+    case 4201: cvtPtr = cvtRGBtoYUV422<2,0,1,4>; break;
+    case 4210: cvtPtr = cvtRGBtoYUV422<2,1,0,4>; break;
+    default: CV_Error( cv::Error::StsBadFlag, "Unknown/unsupported color conversion code" ); break;
     };
 
     cvtPtr(dst_data, dst_step, src_data, src_step, width, height);
diff --git a/modules/imgproc/src/connectedcomponents.cpp b/modules/imgproc/src/connectedcomponents.cpp
index a2f4b6e890dc..d402ea91c37a 100644
--- a/modules/imgproc/src/connectedcomponents.cpp
+++ b/modules/imgproc/src/connectedcomponents.cpp
@@ -5716,7 +5716,7 @@ namespace cv{
             }
         }
 
-        CV_Error(CV_StsUnsupportedFormat, "unsupported label/image type");
+        CV_Error(cv::Error::StsUnsupportedFormat, "unsupported label/image type");
     }
 
 }
@@ -5738,7 +5738,7 @@ int cv::connectedComponents(InputArray img_, OutputArray _labels, int connectivi
         return connectedComponents_sub1(img, labels, connectivity, ccltype, sop);
     }
     else{
-        CV_Error(CV_StsUnsupportedFormat, "the type of labels must be 16u or 32s");
+        CV_Error(cv::Error::StsUnsupportedFormat, "the type of labels must be 16u or 32s");
     }
 }
 
@@ -5763,7 +5763,7 @@ int cv::connectedComponentsWithStats(InputArray img_, OutputArray _labels, Outpu
         return connectedComponents_sub1(img, labels, connectivity, ccltype, sop);
     }
     else{
-        CV_Error(CV_StsUnsupportedFormat, "the type of labels must be 16u or 32s");
+        CV_Error(cv::Error::StsUnsupportedFormat, "the type of labels must be 16u or 32s");
         return 0;
     }
 }
diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp
index d8823206f29c..0577ca16c1a0 100644
--- a/modules/imgproc/src/contours.cpp
+++ b/modules/imgproc/src/contours.cpp
@@ -40,6 +40,7 @@
 //M*/
 #include "precomp.hpp"
 #include "opencv2/core/hal/intrin.hpp"
+#include "opencv2/imgproc/detail/legacy.hpp"
 
 using namespace cv;
 
@@ -59,10 +60,10 @@ cvStartReadChainPoints( CvChain * chain, CvChainPtReader * reader )
     int i;
 
     if( !chain || !reader )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( chain->elem_size != 1 || chain->header_size < (int)sizeof(CvChain))
-        CV_Error( CV_StsBadSize, "" );
+        CV_Error( cv::Error::StsBadSize, "" );
 
     cvStartReadSeq( (CvSeq *) chain, (CvSeqReader *) reader, 0 );
 
@@ -80,7 +81,7 @@ CV_IMPL CvPoint
 cvReadChainPoint( CvChainPtReader * reader )
 {
     if( !reader )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     cv::Point2i pt = reader->pt;
 
@@ -170,12 +171,9 @@ typedef struct _CvContourScanner
 }
 _CvContourScanner;
 
-#define _CV_FIND_CONTOURS_FLAGS_EXTERNAL_ONLY    1
-#define _CV_FIND_CONTOURS_FLAGS_HIERARCHIC       2
-
 /*
    Initializes scanner structure.
-   Prepare image for scanning ( clear borders and convert all pixels to 0-1.
+   Prepare image for scanning ( clear borders and convert all pixels to 0-1 ).
 */
 static CvContourScanner
 cvStartFindContours_Impl( void* _img, CvMemStorage* storage,
@@ -183,7 +181,7 @@ cvStartFindContours_Impl( void* _img, CvMemStorage* storage,
                      int  method, CvPoint offset, int needFillBorder )
 {
     if( !storage )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     CvMat stub, *mat = cvGetMat( _img, &stub );
 
@@ -192,7 +190,7 @@ cvStartFindContours_Impl( void* _img, CvMemStorage* storage,
 
     if( !((CV_IS_MASK_ARR( mat ) && mode < CV_RETR_FLOODFILL) ||
           (CV_MAT_TYPE(mat->type) == CV_32SC1 && mode == CV_RETR_FLOODFILL)) )
-        CV_Error( CV_StsUnsupportedFormat,
+        CV_Error( cv::Error::StsUnsupportedFormat,
                   "[Start]FindContours supports only CV_8UC1 images when mode != CV_RETR_FLOODFILL "
                   "otherwise supports CV_32SC1 images only" );
 
@@ -201,10 +199,10 @@ cvStartFindContours_Impl( void* _img, CvMemStorage* storage,
     uchar* img = (uchar*)(mat->data.ptr);
 
     if( method < 0 || method > CV_CHAIN_APPROX_TC89_KCOS )
-        CV_Error( CV_StsOutOfRange, "" );
+        CV_Error( cv::Error::StsOutOfRange, "" );
 
     if( header_size < (int) (method == CV_CHAIN_CODE ? sizeof( CvChain ) : sizeof( CvContour )))
-        CV_Error( CV_StsBadSize, "" );
+        CV_Error( cv::Error::StsBadSize, "" );
 
     CvContourScanner scanner = (CvContourScanner)cvAlloc( sizeof( *scanner ));
     memset( scanner, 0, sizeof(*scanner) );
@@ -214,7 +212,7 @@ cvStartFindContours_Impl( void* _img, CvMemStorage* storage,
     scanner->img = (schar *) (img + step);
     scanner->img_step = step;
     scanner->img_size.width = size.width - 1;   /* exclude rightest column */
-    scanner->img_size.height = size.height - 1; /* exclude bottomost row */
+    scanner->img_size.height = size.height - 1; /* exclude bottommost row */
     scanner->mode = mode;
     scanner->offset = offset;
     scanner->pt.x = scanner->pt.y = 1;
@@ -304,7 +302,7 @@ cvStartFindContours_Impl( void* _img, CvMemStorage* storage,
 
     /* converts all pixels to 0 or 1 */
     if( CV_MAT_TYPE(mat->type) != CV_32S )
-        cvThreshold( mat, mat, 0, 1, CV_THRESH_BINARY );
+        cvThreshold( mat, mat, 0, 1, cv::THRESH_BINARY );
 
     return scanner;
 }
@@ -490,7 +488,7 @@ cvSubstituteContour( CvContourScanner scanner, CvSeq * new_contour )
     _CvContourInfo *l_cinfo;
 
     if( !scanner )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     l_cinfo = scanner->l_cinfo;
     if( l_cinfo && l_cinfo->contour && l_cinfo->contour != new_contour )
@@ -1032,7 +1030,7 @@ CvSeq *
 cvFindNextContour( CvContourScanner scanner )
 {
     if( !scanner )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     CV_Assert(scanner->img_step >= 0);
 
@@ -1080,7 +1078,7 @@ cvFindNextContour( CvContourScanner scanner )
             }
             else
             {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 if ((p = img[x]) != prev)
                 {
                     goto _next_contour;
@@ -1088,9 +1086,9 @@ cvFindNextContour( CvContourScanner scanner )
                 else
                 {
                     v_uint8 v_prev = vx_setall_u8((uchar)prev);
-                    for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
+                    for (; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes())
                     {
-                        v_uint8 vmask = (vx_load((uchar*)(img + x)) != v_prev);
+                        v_uint8 vmask = (v_ne(vx_load((uchar *)(img + x)), v_prev));
                         if (v_check_any(vmask))
                         {
                             p = img[(x += v_scan_forward(vmask))];
@@ -1105,7 +1103,7 @@ cvFindNextContour( CvContourScanner scanner )
 
             if( x >= width )
                 break;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         _next_contour:
 #endif
             {
@@ -1316,7 +1314,7 @@ cvEndFindContours( CvContourScanner * _scanner )
     CvSeq *first = 0;
 
     if( !_scanner )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
     scanner = *_scanner;
 
     if( scanner )
@@ -1353,11 +1351,11 @@ CvLinkedRunPoint;
 
 inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j)
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_uint8 v_zero = vx_setzero_u8();
-    for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
+    for (; j <= img_size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
     {
-        v_uint8 vmask = (vx_load((uchar*)(src_data + j)) != v_zero);
+        v_uint8 vmask = (v_ne(vx_load((uchar *)(src_data + j)), v_zero));
         if (v_check_any(vmask))
         {
             j += v_scan_forward(vmask);
@@ -1372,7 +1370,7 @@ inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j)
 
 inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j)
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     if (j < img_size.width && !src_data[j])
     {
         return j;
@@ -1380,9 +1378,9 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j)
     else
     {
         v_uint8 v_zero = vx_setzero_u8();
-        for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes)
+        for (; j <= img_size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
         {
-            v_uint8 vmask = (vx_load((uchar*)(src_data + j)) == v_zero);
+            v_uint8 vmask = (v_eq(vx_load((uchar *)(src_data + j)), v_zero));
             if (v_check_any(vmask))
             {
                 j += v_scan_forward(vmask);
@@ -1441,13 +1439,13 @@ icvFindContoursInInterval( const CvArr* src,
     CvSeq* prev = 0;
 
     if( !storage )
-        CV_Error( CV_StsNullPtr, "NULL storage pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL storage pointer" );
 
     if( !result )
-        CV_Error( CV_StsNullPtr, "NULL double CvSeq pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL double CvSeq pointer" );
 
     if( contourHeaderSize < (int)sizeof(CvContour))
-        CV_Error( CV_StsBadSize, "Contour header size must be >= sizeof(CvContour)" );
+        CV_Error( cv::Error::StsBadSize, "Contour header size must be >= sizeof(CvContour)" );
 
     storage00.reset(cvCreateChildMemStorage(storage));
     storage01.reset(cvCreateChildMemStorage(storage));
@@ -1456,7 +1454,7 @@ icvFindContoursInInterval( const CvArr* src,
 
     mat = cvGetMat( src, &stub );
     if( !CV_IS_MASK_ARR(mat))
-        CV_Error( CV_StsBadArg, "Input array must be 8uC1 or 8sC1" );
+        CV_Error( cv::Error::StsBadArg, "Input array must be 8uC1 or 8sC1" );
     src_data = mat->data.ptr;
     img_step = mat->step;
     img_size = cvGetMatSize(mat);
@@ -1748,14 +1746,14 @@ cvFindContours_Impl( void*  img,  CvMemStorage*  storage,
     int count = -1;
 
     if( !firstContour )
-        CV_Error( CV_StsNullPtr, "NULL double CvSeq pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL double CvSeq pointer" );
 
     *firstContour = 0;
 
     if( method == CV_LINK_RUNS )
     {
         if( offset.x != 0 || offset.y != 0 )
-            CV_Error( CV_StsOutOfRange,
+            CV_Error( cv::Error::StsOutOfRange,
             "Nonzero offset is not supported in CV_LINK_RUNS yet" );
 
         count = icvFindContoursInInterval( img, storage, firstContour, cntHeaderSize );
@@ -1816,7 +1814,7 @@ cvFindContours( void*  img,  CvMemStorage*  storage,
     return cvFindContours_Impl(img, storage, firstContour, cntHeaderSize, mode, method, offset, 1);
 }
 
-void cv::findContours( InputArray _image, OutputArrayOfArrays _contours,
+void cv::findContours_legacy( InputArray _image, OutputArrayOfArrays _contours,
                    OutputArray _hierarchy, int mode, int method, Point offset )
 {
     CV_INSTRUMENT_REGION();
@@ -1881,12 +1879,12 @@ void cv::findContours( InputArray _image, OutputArrayOfArrays _contours,
     }
 }
 
-void cv::findContours( InputArray _image, OutputArrayOfArrays _contours,
+void cv::findContours_legacy( InputArray _image, OutputArrayOfArrays _contours,
                        int mode, int method, Point offset)
 {
     CV_INSTRUMENT_REGION();
 
-    findContours(_image, _contours, noArray(), mode, method, offset);
+    findContours_legacy(_image, _contours, noArray(), mode, method, offset);
 }
 
 /* End of file. */
diff --git a/modules/imgproc/src/contours_approx.cpp b/modules/imgproc/src/contours_approx.cpp
new file mode 100644
index 000000000000..176c4904681f
--- /dev/null
+++ b/modules/imgproc/src/contours_approx.cpp
@@ -0,0 +1,353 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "opencv2/core/base.hpp"
+#include "opencv2/core/types.hpp"
+#include "opencv2/imgproc.hpp"
+#include "contours_common.hpp"
+#include <vector>
+
+using namespace std;
+using namespace cv;
+
+namespace {
+
+struct ApproxItem
+{
+    Point pt;
+    size_t k;  // support region
+    int s;  // 1-curvature
+    bool removed;
+    ApproxItem() : k(0), s(0), removed(false) {}
+    ApproxItem(const Point& pt_, int s_) : pt(pt_), k(0), s(s_), removed(false) {}
+};
+
+static const schar abs_diff[16] = {1, 2, 3, 4, 3, 2, 1, 0, 1, 2, 3, 4, 3, 2, 1};
+static const Point chainCodeDeltas[8] =
+    {{1, 0}, {1, -1}, {0, -1}, {-1, -1}, {-1, 0}, {-1, 1}, {0, 1}, {1, 1}};
+
+// Pass 0.
+// Restores all the digital curve points from the chain code.
+// Removes the points (from the resultant polygon)
+// that have zero 1-curvature
+static vector<ApproxItem> pass_0(const vector<schar>& chain, Point pt, bool isApprox, bool isFull)
+{
+    vector<ApproxItem> res;
+    const size_t len = chain.size();
+    res.reserve(len / 2);
+    for (size_t i = 0; i < len; ++i)
+    {
+        const schar prev = (i == 0) ? chain[len - 1] : chain[i - 1];
+        const schar cur = chain[i];
+        const schar s = abs_diff[cur - prev + 7];
+        if ((!isApprox && (isFull || s != 0)) || isApprox)
+        {
+            res.push_back(ApproxItem(pt, s));
+            if (s == 0)
+                (res.end() - 1)->removed = true;
+        }
+        pt += chainCodeDeltas[cur];
+    }
+    return res;
+}
+
+static vector<Point> gatherPoints(const vector<ApproxItem>& ares)
+{
+    vector<Point> res;
+    res.reserve(ares.size() / 2);
+    for (const ApproxItem& item : ares)
+    {
+        if (item.removed)
+            continue;
+        res.push_back(item.pt);
+    }
+    return res;
+}
+
+static size_t calc_support(const vector<ApproxItem>& ares, size_t i)
+{
+    const size_t len = ares.size();
+    /* determine support region */
+    int d_num = 0;
+    int l = 0;
+    size_t k = 1;
+    for (;; k++)
+    {
+        CV_Assert(k <= len);
+        /* calc indices */
+        const size_t i1 = (i >= k) ? (i - k) : (len - k + i);
+        const size_t i2 = (i + k < len) ? (i + k) : (i + k - len);
+
+        const int dx = ares[i2].pt.x - ares[i1].pt.x;
+        const int dy = ares[i2].pt.y - ares[i1].pt.y;
+
+        /* distance between p_(i - k) and p_(i + k) */
+        const int lk = dx * dx + dy * dy;
+
+        /* distance between p_i and the line (p_(i-k), p_(i+k)) */
+        const int dk_num =
+            (ares[i].pt.x - ares[i1].pt.x) * dy - (ares[i].pt.y - ares[i1].pt.y) * dx;
+
+        union
+        {
+            int i;
+            float f;
+        } d;
+        d.f = (float)(((double)d_num) * lk - ((double)dk_num) * l);
+
+        if (k > 1 && (l >= lk || ((d_num > 0 && d.i <= 0) || (d_num < 0 && d.i >= 0))))
+            break;
+
+        d_num = dk_num;
+        l = lk;
+    }
+    return k - 1;
+}
+
+static int calc_cosine(const vector<ApproxItem>& ares, size_t i)
+{
+    const size_t k = ares[i].k;
+    size_t j;
+    int s;
+    const size_t len = ares.size();
+    /* calc k-cosine curvature */
+    for (j = k, s = 0; j > 0; j--)
+    {
+        const size_t i1 = (i >= j) ? (i - j) : (len - j + i);
+        const size_t i2 = (i + j < len) ? (i + j) : (i + j - len);
+
+        const int dx1 = ares[i1].pt.x - ares[i].pt.x;
+        const int dy1 = ares[i1].pt.y - ares[i].pt.y;
+        const int dx2 = ares[i2].pt.x - ares[i].pt.x;
+        const int dy2 = ares[i2].pt.y - ares[i].pt.y;
+
+        if ((dx1 | dy1) == 0 || (dx2 | dy2) == 0)
+            break;
+
+        double temp_num = dx1 * dx2 + dy1 * dy2;
+        temp_num = (float)(temp_num / sqrt(((double)dx1 * dx1 + (double)dy1 * dy1) *
+                                           ((double)dx2 * dx2 + (double)dy2 * dy2)));
+        Cv32suf sk;
+        sk.f = (float)(temp_num + 1.1);
+
+        CV_Assert(0 <= sk.f && sk.f <= 2.2);
+        if (j < k && sk.i <= s)
+            break;
+
+        s = sk.i;
+    }
+    return s;
+}
+
+static bool calc_nms_cleanup(const vector<ApproxItem>& ares, size_t i)
+{
+    const size_t k2 = ares[i].k >> 1;
+    const int s = ares[i].s;
+    const size_t len = ares.size();
+    size_t j;
+    for (j = 1; j <= k2; j++)
+    {
+        const size_t i1 = (i >= j) ? (i - j) : (len - j + i);
+        const size_t i2 = (i + j < len) ? (i + j) : (i + j - len);
+        if (ares[i1].s > s || ares[i2].s > s)
+            break;
+    }
+    return j <= k2;
+}
+
+static bool calc_dominance(const vector<ApproxItem>& ares, size_t i)
+{
+    const size_t len = ares.size();
+    CV_Assert(len > 0);
+    const size_t i1 = (i >= 1) ? (i - 1) : (len - 1 + i);
+    const size_t i2 = (i + 1 < len) ? (i + 1) : (i + 1 - len);
+    return ares[i].s <= ares[i1].s || ares[i].s <= ares[i2].s;
+}
+
+inline size_t get_next_idx(const vector<ApproxItem>& ares, const size_t start)
+{
+    const size_t len = ares.size();
+    size_t res = start + 1;
+    for (; res < len; ++res)
+    {
+        if (!ares[res].removed)
+            break;
+    }
+    return res;
+}
+
+inline void clear_until(vector<ApproxItem>& ares, const size_t start, const size_t finish)
+{
+    const size_t len = ares.size();
+    for (size_t i = start + 1; i < finish && i < len; ++i)
+    {
+        ares[i].removed = true;
+    }
+}
+
+static bool calc_new_start(vector<ApproxItem>& ares, size_t& res)
+{
+    const size_t len = ares.size();
+    CV_Assert(len > 0);
+    size_t i1;
+    // remove all previous items from the beginning
+    for (i1 = 1; i1 < len && ares[i1].s != 0; i1++)
+    {
+        ares[i1 - 1].s = 0;
+    }
+    if (i1 == len)
+    {
+        // all points survived - skip to the end
+        return false;
+    }
+    i1--;
+
+    size_t i2;
+    // remove all following items from the end
+    for (i2 = len - 2; i2 > 0 && ares[i2].s != 0; i2--)
+    {
+        clear_until(ares, i2, len);
+        ares[i2 + 1].s = 0;
+    }
+    i2++;
+
+    // only two points left
+    if (i1 == 0 && i2 == len - 1)
+    {
+        // find first non-removed element from the start
+        i1 = get_next_idx(ares, 0);
+        // append first item to the end
+        ares.push_back(ares[0]);
+        (ares.end() - 1)->removed = false;
+    }
+    res = i1;
+    return true;
+}
+
+static void pass_cleanup(vector<ApproxItem>& ares, size_t start_idx)
+{
+    int count = 1;
+
+    const size_t len = ares.size();
+    size_t first = start_idx;
+    for (size_t i = start_idx, prev = start_idx; i < len; ++i)
+    {
+        ApproxItem& item = ares[i];
+        if (item.removed)
+            continue;
+        size_t next_idx = get_next_idx(ares, i);
+        if (next_idx == len || next_idx - i != 1)
+        {
+            if (count >= 2)
+            {
+                if (count == 2)
+                {
+                    const int s1 = ares[prev].s;
+                    const int s2 = ares[i].s;
+
+                    if (s1 > s2 || (s1 == s2 && ares[prev].k <= ares[i].k))
+                        /* remove second */
+                        ares[i].removed = true;
+                    else
+                        /* remove first */
+                        ares[prev].removed = true;
+                }
+                else
+                {
+                    first = get_next_idx(ares, first);
+                    clear_until(ares, first, i);
+                }
+            }
+            first = i;
+            count = 1;
+        }
+        else
+        {
+            ++count;
+        }
+        prev = i;
+    }
+}
+
+}  // namespace
+
+
+vector<Point> cv::approximateChainTC89(vector<schar> chain, const Point& origin, const int method)
+{
+    if (chain.size() == 0)
+    {
+        return vector<Point>({origin});
+    }
+
+    const bool isApprox = method == CHAIN_APPROX_TC89_L1 || method == CHAIN_APPROX_TC89_KCOS;
+
+    ApproxItem root;
+    vector<ApproxItem> ares = pass_0(chain, origin, isApprox, method == CHAIN_APPROX_NONE);
+
+    if (isApprox)
+    {
+        CV_DbgAssert(ares.size() < (size_t)numeric_limits<int>::max());
+
+        // Pass 1.
+        // Determines support region for all the remained points */
+        for (size_t i = 0; i < ares.size(); ++i)
+        {
+            ApproxItem& item = ares[i];
+            if (item.removed)
+                continue;
+            item.k = calc_support(ares, i);
+
+            if (method == CHAIN_APPROX_TC89_KCOS)
+                item.s = calc_cosine(ares, i);
+        }
+
+        // Pass 2.
+        // Performs non-maxima suppression
+        for (size_t i = 0; i < ares.size(); ++i)
+        {
+            ApproxItem& item = ares[i];
+            if (calc_nms_cleanup(ares, i))
+            {
+                item.s = 0;  // "clear"
+                item.removed = true;
+            }
+        }
+
+        // Pass 3.
+        // Removes non-dominant points with 1-length support region */
+        for (size_t i = 0; i < ares.size(); ++i)
+        {
+            ApproxItem& item = ares[i];
+            if (item.removed)
+                continue;
+            if (item.k == 1 && calc_dominance(ares, i))
+            {
+                item.s = 0;
+                item.removed = true;
+            }
+        }
+
+        if (method == cv::CHAIN_APPROX_TC89_L1)
+        {
+            // Pass 4.
+            // Cleans remained couples of points
+            bool skip = false;
+            size_t new_start_idx = 0;
+            const size_t len = ares.size();
+            if (ares[0].s != 0 && ares[len - 1].s != 0)
+            {
+                if (!calc_new_start(ares, new_start_idx))
+                {
+                    skip = true;
+                }
+            }
+            if (!skip)
+            {
+                pass_cleanup(ares, new_start_idx);
+            }
+        }
+    }
+
+    return gatherPoints(ares);
+}
diff --git a/modules/imgproc/src/contours_common.cpp b/modules/imgproc/src/contours_common.cpp
new file mode 100644
index 000000000000..a8cb12c1a2f3
--- /dev/null
+++ b/modules/imgproc/src/contours_common.cpp
@@ -0,0 +1,75 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "precomp.hpp"
+#include "contours_common.hpp"
+#include <map>
+#include <limits>
+
+using namespace std;
+using namespace cv;
+
+void cv::contourTreeToResults(CTree& tree,
+                              int res_type,
+                              OutputArrayOfArrays& _contours,
+                              OutputArray& _hierarchy)
+{
+    // check if there are no results
+    if (tree.isEmpty() || (tree.elem(0).body.isEmpty() && (tree.elem(0).first_child == -1)))
+    {
+        _contours.clear();
+        return;
+    }
+
+    // mapping for indexes (original -> resulting)
+    map<int, int> index_mapping;
+    index_mapping[-1] = -1;
+    index_mapping[0] = -1;
+
+    CV_Assert(tree.size() < (size_t)numeric_limits<int>::max());
+    const int total = (int)tree.size() - 1;
+    _contours.create(total, 1, 0, -1, true);
+    {
+        int i = 0;
+        CIterator it(tree);
+        while (!it.isDone())
+        {
+            const CNode& elem = it.getNext_s();
+            CV_Assert(elem.self() != -1);
+            if (elem.self() == 0)
+                continue;
+            index_mapping[elem.self()] = i;
+            CV_Assert(elem.body.size() < (size_t)numeric_limits<int>::max());
+            const int sz = (int)elem.body.size();
+            _contours.create(sz, 1, res_type, i, true);
+            if (sz > 0)
+            {
+                Mat cmat = _contours.getMat(i);
+                CV_Assert(cmat.isContinuous());
+                elem.body.copyTo(cmat.data);
+            }
+            ++i;
+        }
+    }
+
+    if (_hierarchy.needed())
+    {
+        _hierarchy.create(1, total, CV_32SC4, -1, true);
+        Mat h_mat = _hierarchy.getMat();
+        int i = 0;
+        CIterator it(tree);
+        while (!it.isDone())
+        {
+            const CNode& elem = it.getNext_s();
+            if (elem.self() == 0)
+                continue;
+            Vec4i& h_vec = h_mat.at<Vec4i>(i);
+            h_vec = Vec4i(index_mapping.at(elem.next),
+                          index_mapping.at(elem.prev),
+                          index_mapping.at(elem.first_child),
+                          index_mapping.at(elem.parent));
+            ++i;
+        }
+    }
+}
diff --git a/modules/imgproc/src/contours_common.hpp b/modules/imgproc/src/contours_common.hpp
new file mode 100644
index 000000000000..b22c5cfd0b69
--- /dev/null
+++ b/modules/imgproc/src/contours_common.hpp
@@ -0,0 +1,219 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_CONTOURS_COMMON_HPP
+#define OPENCV_CONTOURS_COMMON_HPP
+
+#include "precomp.hpp"
+#include <stack>
+
+namespace cv {
+
+static const schar MAX_SIZE = 16;
+
+static const cv::Point chainCodeDeltas[8] =
+    {{1, 0}, {1, -1}, {0, -1}, {-1, -1}, {-1, 0}, {-1, 1}, {0, 1}, {1, 1}};
+
+static inline int getDelta(schar s, size_t step)
+{
+    CV_DbgAssert(s >= 0 && s < 16);
+    const cv::Point res = chainCodeDeltas[s % 8];
+    return res.x + res.y * (int)step;
+}
+
+inline schar clamp_direction(schar dir)
+{
+    return std::min(dir, (schar)15);
+}
+
+template <typename T>
+class TreeNode
+{
+private:
+    int self_;
+
+public:
+    // tree hierarchy (parent - children)
+    int parent;
+    int first_child;
+    // 1st linked list - bidirectional - sibling children
+    int prev;
+    int next;
+    // 2nd linked list - unidirectional - not related to 1st list
+    int ctable_next;
+    T body;
+
+public:
+    TreeNode(int self) :
+        self_(self), parent(-1), first_child(-1), prev(-1), next(-1), ctable_next(-1)
+    {
+        CV_Assert(self >= 0);
+    }
+    int self() const
+    {
+        return self_;
+    }
+};
+
+template <typename T>
+class Tree
+{
+private:
+    std::vector<TreeNode<T>> nodes;
+
+public:
+    TreeNode<T>& newElem()
+    {
+        const size_t idx = nodes.size();
+        CV_DbgAssert(idx < (size_t)std::numeric_limits<int>::max());
+        nodes.push_back(TreeNode<T>((int)idx));
+        return nodes[idx];
+    }
+    TreeNode<T>& elem(int idx)
+    {
+        CV_DbgAssert(idx >= 0 && (size_t)idx < nodes.size());
+        return nodes[(size_t)idx];
+    }
+    const TreeNode<T>& elem(int idx) const
+    {
+        CV_DbgAssert(idx >= 0 && (size_t)idx < nodes.size());
+        return nodes[(size_t)idx];
+    }
+    int lastSibling(int e) const
+    {
+        if (e != -1)
+        {
+            while (true)
+            {
+                const TreeNode<T>& cur_elem = elem(e);
+                if (cur_elem.next == -1)
+                    break;
+                e = cur_elem.next;
+            }
+        }
+        return e;
+    }
+    void addSiblingAfter(int prev, int idx)
+    {
+        TreeNode<T>& prev_item = nodes[prev];
+        TreeNode<T>& child = nodes[idx];
+        child.parent = prev_item.parent;
+        if (prev_item.next != -1)
+        {
+            nodes[prev_item.next].prev = idx;
+            child.next = prev_item.next;
+        }
+        child.prev = prev;
+        prev_item.next = idx;
+    }
+    void addChild(int parent_idx, int child_idx)
+    {
+        TreeNode<T>& parent = nodes[parent_idx];
+        TreeNode<T>& child = nodes[child_idx];
+        if (parent.first_child != -1)
+        {
+            TreeNode<T>& fchild_ = nodes[parent.first_child];
+            fchild_.prev = child_idx;
+            child.next = parent.first_child;
+        }
+        parent.first_child = child_idx;
+        child.parent = parent_idx;
+        child.prev = -1;
+    }
+    bool isEmpty() const
+    {
+        return nodes.size() == 0;
+    }
+    size_t size() const
+    {
+        return nodes.size();
+    }
+};
+
+template <typename T>
+class TreeIterator
+{
+public:
+    TreeIterator(Tree<T>& tree_) : tree(tree_)
+    {
+        CV_Assert(!tree.isEmpty());
+        levels.push(0);
+    }
+    bool isDone() const
+    {
+        return levels.empty();
+    }
+    const TreeNode<T>& getNext_s()
+    {
+        int idx = levels.top();
+        levels.pop();
+        const TreeNode<T>& res = tree.elem(idx);
+        int cur = tree.lastSibling(res.first_child);
+        while (cur != -1)
+        {
+            levels.push(cur);
+            cur = tree.elem(cur).prev;
+        }
+        return res;
+    }
+
+private:
+    std::stack<int> levels;
+    Tree<T>& tree;
+};
+
+//==============================================================================
+
+class Contour
+{
+public:
+    cv::Rect brect;
+    cv::Point origin;
+    std::vector<cv::Point> pts;
+    std::vector<schar> codes;
+    bool isHole;
+    bool isChain;
+
+    Contour() : isHole(false), isChain(false) {}
+    void updateBoundingRect() {}
+    bool isEmpty() const
+    {
+        return pts.size() == 0 && codes.size() == 0;
+    }
+    size_t size() const
+    {
+        return isChain ? codes.size() : pts.size();
+    }
+    void copyTo(void* data) const
+    {
+        // NOTE: Mat::copyTo doesn't work because it creates new Mat object
+        //       instead of reusing existing vector data
+        if (isChain)
+        {
+            memcpy(data, &codes[0], codes.size() * sizeof(codes[0]));
+        }
+        else
+        {
+            memcpy(data, &pts[0], pts.size() * sizeof(pts[0]));
+        }
+    }
+};
+
+typedef TreeNode<Contour> CNode;
+typedef Tree<Contour> CTree;
+typedef TreeIterator<Contour> CIterator;
+
+
+void contourTreeToResults(CTree& tree,
+                          int res_type,
+                          cv::OutputArrayOfArrays& _contours,
+                          cv::OutputArray& _hierarchy);
+
+
+std::vector<Point>
+    approximateChainTC89(std::vector<schar> chain, const Point& origin, const int method);
+
+}  // namespace cv
+
+#endif  // OPENCV_CONTOURS_COMMON_HPP
diff --git a/modules/imgproc/src/contours_link.cpp b/modules/imgproc/src/contours_link.cpp
new file mode 100644
index 000000000000..8df88fc1238c
--- /dev/null
+++ b/modules/imgproc/src/contours_link.cpp
@@ -0,0 +1,415 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "precomp.hpp"
+#include "contours_common.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+
+using namespace cv;
+using namespace std;
+
+//==============================================================================
+
+namespace {
+
+inline static int findStartContourPoint(uchar* src_data, Size img_size, int j)
+{
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    v_uint8 v_zero = vx_setzero_u8();
+    for (; j <= img_size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
+    {
+        v_uint8 vmask = (v_ne(vx_load((uchar*)(src_data + j)), v_zero));
+        if (v_check_any(vmask))
+        {
+            j += v_scan_forward(vmask);
+            return j;
+        }
+    }
+#endif
+    for (; j < img_size.width && !src_data[j]; ++j)
+        ;
+    return j;
+}
+
+inline static int findEndContourPoint(uchar* src_data, Size img_size, int j)
+{
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if (j < img_size.width && !src_data[j])
+    {
+        return j;
+    }
+    else
+    {
+        v_uint8 v_zero = vx_setzero_u8();
+        for (; j <= img_size.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
+        {
+            v_uint8 vmask = (v_eq(vx_load((uchar*)(src_data + j)), v_zero));
+            if (v_check_any(vmask))
+            {
+                j += v_scan_forward(vmask);
+                return j;
+            }
+        }
+    }
+#endif
+    for (; j < img_size.width && src_data[j]; ++j)
+        ;
+
+    return j;
+}
+
+//==============================================================================
+
+struct LinkRunPoint
+{
+    int link;
+    int next;
+    Point pt;
+    LinkRunPoint() : link(-1), next(-1) {}
+    LinkRunPoint(const Point& pt_) : link(-1), next(-1), pt(pt_) {}
+};
+
+typedef LinkRunPoint LRP;
+
+//==============================================================================
+
+class LinkRunner
+{
+public:
+    enum LinkConnectionDirection
+    {
+        ICV_SINGLE = 0,
+        ICV_CONNECTING_ABOVE = 1,
+        ICV_CONNECTING_BELOW = -1,
+    };
+
+    CTree tree;
+
+    vector<LRP> rns;
+    vector<int> ext_rns;
+    vector<int> int_rns;
+
+public:
+    LinkRunner()
+    {
+        tree.newElem();
+        rns.reserve(100);
+    }
+    void process(Mat& image);
+    void convertLinks(int& first, int& prev, bool isHole);
+    void establishLinks(int& prev_point,
+                        int upper_run,
+                        int lower_run,
+                        const int upper_total,
+                        const int lower_total);
+};
+
+void LinkRunner::convertLinks(int& first, int& prev, bool isHole)
+{
+    const vector<int>& contours = isHole ? int_rns : ext_rns;
+    int count = 0;
+    for (int j = 0; j < (int)contours.size(); j++, count++)
+    {
+        int start = contours[j];
+        int cur = start;
+
+        if (rns[cur].link == -1)
+            continue;
+
+        CNode& node = tree.newElem();
+        node.body.isHole = isHole;
+
+        do
+        {
+            node.body.pts.push_back(rns[cur].pt);
+            int p_temp = cur;
+            cur = rns[cur].link;
+            rns[p_temp].link = -1;
+        }
+        while (cur != start);
+
+        if (first == 0)
+        {
+            tree.addChild(0, node.self());
+            prev = first = node.self();
+        }
+        else
+        {
+            tree.addSiblingAfter(prev, node.self());
+            prev = node.self();
+        }
+    }
+}
+void LinkRunner::establishLinks(int& prev_point,
+                                int upper_run,
+                                int lower_run,
+                                const int upper_total,
+                                const int lower_total)
+{
+    int k, n;
+    int connect_flag = ICV_SINGLE;
+    for (k = 0, n = 0; k < upper_total / 2 && n < lower_total / 2;)
+    {
+        switch (connect_flag)
+        {
+            case ICV_SINGLE:
+                if (rns[rns[upper_run].next].pt.x < rns[rns[lower_run].next].pt.x)
+                {
+                    if (rns[rns[upper_run].next].pt.x >= rns[lower_run].pt.x - 1)
+                    {
+                        rns[lower_run].link = upper_run;
+                        connect_flag = ICV_CONNECTING_ABOVE;
+                        prev_point = rns[upper_run].next;
+                    }
+                    else
+                        rns[rns[upper_run].next].link = upper_run;
+                    k++;
+                    upper_run = rns[rns[upper_run].next].next;
+                }
+                else
+                {
+                    if (rns[upper_run].pt.x <= rns[rns[lower_run].next].pt.x + 1)
+                    {
+                        rns[lower_run].link = upper_run;
+                        connect_flag = ICV_CONNECTING_BELOW;
+                        prev_point = rns[lower_run].next;
+                    }
+                    else
+                    {
+                        rns[lower_run].link = rns[lower_run].next;
+                        // First point of contour
+                        ext_rns.push_back(lower_run);
+                    }
+                    n++;
+                    lower_run = rns[rns[lower_run].next].next;
+                }
+                break;
+            case ICV_CONNECTING_ABOVE:
+                if (rns[upper_run].pt.x > rns[rns[lower_run].next].pt.x + 1)
+                {
+                    rns[prev_point].link = rns[lower_run].next;
+                    connect_flag = ICV_SINGLE;
+                    n++;
+                    lower_run = rns[rns[lower_run].next].next;
+                }
+                else
+                {
+                    rns[prev_point].link = upper_run;
+                    if (rns[rns[upper_run].next].pt.x < rns[rns[lower_run].next].pt.x)
+                    {
+                        k++;
+                        prev_point = rns[upper_run].next;
+                        upper_run = rns[rns[upper_run].next].next;
+                    }
+                    else
+                    {
+                        connect_flag = ICV_CONNECTING_BELOW;
+                        prev_point = rns[lower_run].next;
+                        n++;
+                        lower_run = rns[rns[lower_run].next].next;
+                    }
+                }
+                break;
+            case ICV_CONNECTING_BELOW:
+                if (rns[lower_run].pt.x > rns[rns[upper_run].next].pt.x + 1)
+                {
+                    rns[rns[upper_run].next].link = prev_point;
+                    connect_flag = ICV_SINGLE;
+                    k++;
+                    upper_run = rns[rns[upper_run].next].next;
+                }
+                else
+                {
+                    // First point of contour
+                    int_rns.push_back(lower_run);
+
+                    rns[lower_run].link = prev_point;
+                    if (rns[rns[lower_run].next].pt.x < rns[rns[upper_run].next].pt.x)
+                    {
+                        n++;
+                        prev_point = rns[lower_run].next;
+                        lower_run = rns[rns[lower_run].next].next;
+                    }
+                    else
+                    {
+                        connect_flag = ICV_CONNECTING_ABOVE;
+                        k++;
+                        prev_point = rns[upper_run].next;
+                        upper_run = rns[rns[upper_run].next].next;
+                    }
+                }
+                break;
+        }
+    }  // k, n
+
+    for (; n < lower_total / 2; n++)
+    {
+        if (connect_flag != ICV_SINGLE)
+        {
+            rns[prev_point].link = rns[lower_run].next;
+            connect_flag = ICV_SINGLE;
+            lower_run = rns[rns[lower_run].next].next;
+            continue;
+        }
+        rns[lower_run].link = rns[lower_run].next;
+
+        // First point of contour
+        ext_rns.push_back(lower_run);
+        lower_run = rns[rns[lower_run].next].next;
+    }
+
+    for (; k < upper_total / 2; k++)
+    {
+        if (connect_flag != ICV_SINGLE)
+        {
+            rns[rns[upper_run].next].link = prev_point;
+            connect_flag = ICV_SINGLE;
+            upper_run = rns[rns[upper_run].next].next;
+            continue;
+        }
+        rns[rns[upper_run].next].link = upper_run;
+        upper_run = rns[rns[upper_run].next].next;
+    }
+}
+
+
+void LinkRunner::process(Mat& image)
+{
+    const Size sz = image.size();
+    int j;
+    int lower_total;
+    int upper_total;
+    int all_total;
+
+    Point cur_point;
+
+    rns.reserve(sz.height);  // optimization, assuming some contours exist
+
+    // First line. None of runs is binded
+    rns.push_back(LRP());
+    int upper_line = (int)rns.size() - 1;
+    int cur = upper_line;
+    for (j = 0; j < sz.width;)
+    {
+        j = findStartContourPoint(image.ptr<uchar>(), sz, j);
+
+        if (j == sz.width)
+            break;
+
+        cur_point.x = j;
+
+        rns.push_back(LRP(cur_point));
+        rns[cur].next = (int)rns.size() - 1;
+        cur = rns[cur].next;
+
+        j = findEndContourPoint(image.ptr<uchar>(), sz, j + 1);
+
+        cur_point.x = j - 1;
+
+        rns.push_back(LRP(cur_point));
+        rns[cur].next = (int)rns.size() - 1;
+        rns[cur].link = rns[cur].next;
+
+        // First point of contour
+        ext_rns.push_back(cur);
+        cur = rns[cur].next;
+    }
+    upper_line = rns[upper_line].next;
+    upper_total = (int)rns.size() - 1;  // runs->total - 1;
+
+    int last_elem = cur;
+    rns[cur].next = -1;
+    int prev_point = -1;
+    int lower_line = -1;
+    for (int i = 1; i < sz.height; i++)
+    {
+        // Find runs in next line
+        cur_point.y = i;
+        all_total = (int)rns.size();  // runs->total;
+        for (j = 0; j < sz.width;)
+        {
+            j = findStartContourPoint(image.ptr<uchar>(i), sz, j);
+
+            if (j == sz.width)
+                break;
+
+            cur_point.x = j;
+
+            rns.push_back(LRP(cur_point));
+            rns[cur].next = (int)rns.size() - 1;
+            cur = rns[cur].next;
+
+            j = findEndContourPoint(image.ptr<uchar>(i), sz, j + 1);
+
+            cur_point.x = j - 1;
+            rns.push_back(LRP(cur_point));
+            cur = rns[cur].next = (int)rns.size() - 1;
+        }  // j
+        lower_line = rns[last_elem].next;
+        lower_total = (int)rns.size() - all_total;  // runs->total - all_total;
+        last_elem = cur;
+        rns[cur].next = -1;
+
+        CV_DbgAssert(rns.size() < (size_t)numeric_limits<int>::max());
+
+        // Find links between runs of lower_line and upper_line
+        establishLinks(prev_point, upper_line, lower_line, upper_total, lower_total);
+
+        upper_line = lower_line;
+        upper_total = lower_total;
+    }  // i
+
+    // the last line of image
+    int upper_run = upper_line;
+    for (int k = 0; k < upper_total / 2; k++)
+    {
+        rns[rns[upper_run].next].link = upper_run;
+        upper_run = rns[rns[upper_run].next].next;
+    }
+
+    int first = 0;
+    int prev = 0;
+    convertLinks(first, prev, false);
+    convertLinks(first, prev, true);
+}
+
+}  // namespace
+
+//==============================================================================
+
+void cv::findContoursLinkRuns(InputArray _image,
+                              OutputArrayOfArrays _contours,
+                              OutputArray _hierarchy)
+{
+    CV_INSTRUMENT_REGION();
+
+    CV_CheckType(_image.type(),
+                 _image.type() == CV_8UC1 || _image.type() == CV_8SC1,
+                 "Bad input image type, must be CV_8UC1 or CV_8SC1");
+
+    // Sanity check: output must be of type vector<vector<Point>>
+    CV_Assert(_contours.kind() == _InputArray::STD_VECTOR_VECTOR ||
+              _contours.kind() == _InputArray::STD_VECTOR_MAT ||
+              _contours.kind() == _InputArray::STD_VECTOR_UMAT);
+
+    if (!_contours.empty())
+        CV_CheckTypeEQ(_contours.type(), CV_32SC2, "Contours must have type CV_32SC2");
+
+    if (_hierarchy.needed())
+        _hierarchy.clear();
+
+    Mat image = _image.getMat();
+
+    LinkRunner runner;
+    runner.process(image);
+
+    contourTreeToResults(runner.tree, CV_32SC2, _contours, _hierarchy);
+}
+
+
+void cv::findContoursLinkRuns(InputArray _image, OutputArrayOfArrays _contours)
+{
+    CV_INSTRUMENT_REGION();
+    findContoursLinkRuns(_image, _contours, noArray());
+}
diff --git a/modules/imgproc/src/contours_new.cpp b/modules/imgproc/src/contours_new.cpp
new file mode 100644
index 000000000000..0b82496aaaed
--- /dev/null
+++ b/modules/imgproc/src/contours_new.cpp
@@ -0,0 +1,697 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "opencv2/imgproc.hpp"
+#include "precomp.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+#include "opencv2/core/check.hpp"
+#include "opencv2/core/utils/logger.hpp"
+#include <iostream>
+#include <array>
+#include <limits>
+#include <map>
+
+#include "contours_common.hpp"
+
+using namespace std;
+using namespace cv;
+
+//==============================================================================
+
+namespace {
+
+template <typename T>
+struct Trait
+{
+};
+
+static const schar MASK8_RIGHT = '\x80';  // 1000 0000
+static const schar MASK8_NEW = '\x02';  // 0000 0010 (+2)
+static const schar MASK8_FLAGS = '\xFE';  // 1111 1110 (-2)
+static const schar MASK8_BLACK = '\x01';  // 0000 0001 - black pixel
+
+static const schar MASK8_LVAL = '\x7F';  // 0111 1111 (for table)
+
+template <>
+struct Trait<schar>
+{
+    static inline bool checkValue(const schar* elem, const schar*)
+    {
+        return *elem != 0;
+    }
+    static inline bool isVal(const schar* elem, const schar*)
+    {
+        return *elem == MASK8_BLACK;
+    }
+    static inline bool isRight(const schar* elem, const schar*)
+    {
+        return (*elem & MASK8_RIGHT) != 0;
+    }
+    static inline void setRightFlag(schar* elem, const schar*, schar nbd)
+    {
+        *elem = nbd | MASK8_RIGHT;
+    }
+    static inline void setNewFlag(schar* elem, const schar*, schar nbd)
+    {
+        *elem = nbd;
+    }
+};
+
+static const int MASK_RIGHT = 0x80000000;  // 100..000
+static const int MASK_NEW = 0x40000000;  // 010..000
+static const int MASK_FLAGS = 0xC0000000;  // right + new
+static const int MASK_VAL = 0x3FFFFFFF;  // ~flags - pixel label
+
+template <>
+struct Trait<int>
+{
+    static inline bool checkValue(const int* elem, const int* elem0)
+    {
+        return (*elem & MASK_VAL) == (*elem0 & MASK_VAL);
+    }
+    static inline bool isVal(const int* elem, const int* elem0)
+    {
+        return *elem == (*elem0 & MASK_VAL);
+    }
+    static inline bool isRight(const int* elem, const int* elem0)
+    {
+        return (*elem & MASK_RIGHT) == (*elem0 & MASK8_RIGHT);
+    }
+    static inline void setRightFlag(int* elem, const int* elem0, int)
+    {
+        *elem = (*elem0 & MASK_VAL) | MASK_NEW | MASK_RIGHT;
+    }
+    static inline void setNewFlag(int* elem, const int* elem0, int)
+    {
+        *elem = (*elem0 & MASK_VAL) | MASK_NEW;
+    }
+};
+
+}  // namespace
+
+
+//==============================================================================
+
+
+namespace {
+
+template <typename T>
+static bool icvTraceContour(Mat& image, const Point& start, const Point& end, bool isHole)
+{
+    const T* stop_ptr = image.ptr<T>(end.y, end.x);
+    const size_t step = image.step1();
+    const T *i0 = image.ptr<T>(start.y, start.x), *i1, *i3, *i4 = NULL;
+    const schar s_end = isHole ? 0 : 4;
+
+    schar s = s_end;
+    do
+    {
+        s = (s - 1) & 7;
+        i1 = i0 + getDelta(s, step);
+    }
+    while (!Trait<T>::checkValue(i1, i0) && s != s_end);
+
+    i3 = i0;
+
+    // check single pixel domain
+    if (s != s_end)
+    {
+        // follow border
+        for (;;)
+        {
+            CV_Assert(i3 != NULL);
+            s = clamp_direction(s);
+            while (s < MAX_SIZE - 1)
+            {
+                ++s;
+                i4 = i3 + getDelta(s, step);
+                CV_Assert(i4 != NULL);
+                if (Trait<T>::checkValue(i4, i0))
+                    break;
+            }
+
+            if (i3 == stop_ptr)
+            {
+                if (!Trait<T>::isRight(i3, i0))
+                {
+                    // it's the only contour
+                    return true;
+                }
+
+                // check if this is the last contour
+                // encountered during a raster scan
+                const T* i5;
+                schar t = s;
+                while (true)
+                {
+                    t = (t - 1) & 7;
+                    i5 = i3 + getDelta(t, step);
+                    if (*i5 != 0)
+                        break;
+                    if (t == 0)
+                        return true;
+                }
+            }
+
+            if ((i4 == i0 && i3 == i1))
+                break;
+
+            i3 = i4;
+            s = (s + 4) & 7;
+        }  // end of border following loop
+    }
+    else
+    {
+        return i3 == stop_ptr;
+    }
+
+    return false;
+}
+
+template <typename T>
+static void icvFetchContourEx(Mat& image,
+                              const Point& start,
+                              T nbd,
+                              Contour& res_contour,
+                              const bool isDirect)
+{
+    const size_t step = image.step1();
+    T *i0 = image.ptr<T>(start.y, start.x), *i1, *i3, *i4 = NULL;
+
+    Point pt = res_contour.origin;
+
+    cv::Rect rect(pt.x, pt.y, pt.x, pt.y);
+
+    schar s_end = res_contour.isHole ? 0 : 4;
+    schar s = s_end;
+    do
+    {
+        s = (s - 1) & 7;
+        i1 = i0 + getDelta(s, step);
+    }
+    while (!Trait<T>::checkValue(i1, i0) && s != s_end);
+
+    if (s == s_end)
+    {
+        Trait<T>::setRightFlag(i0, i0, nbd);
+        if (!res_contour.isChain)
+        {
+            res_contour.pts.push_back(pt);
+        }
+    }
+    else
+    {
+        i3 = i0;
+        schar prev_s = s ^ 4;
+
+        // follow border
+        for (;;)
+        {
+            s_end = s;
+            s = clamp_direction(s);
+            while (s < MAX_SIZE - 1)
+            {
+                ++s;
+                i4 = i3 + getDelta(s, step);
+                CV_Assert(i4 != NULL);
+                if (Trait<T>::checkValue(i4, i0))
+                    break;
+            }
+            s &= 7;
+
+            // check "right" bound
+            if ((unsigned)(s - 1) < (unsigned)s_end)
+            {
+                Trait<T>::setRightFlag(i3, i0, nbd);
+            }
+            else if (Trait<T>::isVal(i3, i0))
+            {
+                Trait<T>::setNewFlag(i3, i0, nbd);
+            }
+
+            if (res_contour.isChain)
+            {
+                res_contour.codes.push_back(s);
+            }
+            else if (s != prev_s || isDirect)
+            {
+                res_contour.pts.push_back(pt);
+            }
+
+            if (s != prev_s)
+            {
+                // update bounds
+                if (pt.x < rect.x)
+                    rect.x = pt.x;
+                else if (pt.x > rect.width)
+                    rect.width = pt.x;
+
+                if (pt.y < rect.y)
+                    rect.y = pt.y;
+                else if (pt.y > rect.height)
+                    rect.height = pt.y;
+            }
+
+            prev_s = s;
+            pt += chainCodeDeltas[s];
+
+            if (i4 == i0 && i3 == i1)
+                break;
+
+            i3 = i4;
+            s = (s + 4) & 7;
+        }
+    }
+    rect.width -= rect.x - 1;
+    rect.height -= rect.y - 1;
+    res_contour.brect = rect;
+}
+
+}  // namespace
+
+
+//==============================================================================
+
+//
+// Raster->Chain Tree (Suzuki algorithms)
+//
+
+// Structure that is used for sequential retrieving contours from the image.
+// It supports both hierarchical and plane variants of Suzuki algorithm.
+struct ContourScanner_
+{
+    Mat image;
+    Point offset;  // ROI offset: coordinates, added to each contour point
+    Point pt;  // current scanner position
+    Point lnbd;  // position of the last met contour
+    schar nbd;  // current mark val
+    int approx_method1;  // approx method when tracing
+    int approx_method2;  // final approx method
+    int mode;
+    CTree tree;
+    array<int, 128> ctable;
+
+public:
+    ContourScanner_() {}
+    ~ContourScanner_() {}
+    inline bool isInt() const
+    {
+        return (this->mode == RETR_FLOODFILL);
+    }
+    inline bool isSimple() const
+    {
+        return (this->mode == RETR_EXTERNAL || this->mode == RETR_LIST);
+    }
+
+    CNode& makeContour(schar& nbd_, const bool is_hole, const int x, const int y);
+    bool contourScan(const int prev, int& p, Point& last_pos, const int x, const int y);
+    int findFirstBoundingContour(const Point& last_pos, const int y, const int lval, int par);
+    int findNextX(int x, int y, int& prev, int& p);
+    bool findNext();
+
+    static shared_ptr<ContourScanner_> create(Mat img, int mode, int method, Point offset);
+};  // class ContourScanner_
+
+typedef shared_ptr<ContourScanner_> ContourScanner;
+
+
+shared_ptr<ContourScanner_> ContourScanner_::create(Mat img, int mode, int method, Point offset)
+{
+    if (mode == RETR_CCOMP && img.type() == CV_32SC1)
+        mode = RETR_FLOODFILL;
+
+    if (mode == RETR_FLOODFILL)
+        CV_CheckTypeEQ(img.type(), CV_32SC1, "RETR_FLOODFILL mode supports only CV_32SC1 images");
+    else
+        CV_CheckTypeEQ(img.type(),
+                       CV_8UC1,
+                       "Modes other than RETR_FLOODFILL and RETR_CCOMP support only CV_8UC1 "
+                       "images");
+
+    CV_Check(mode,
+             mode == RETR_EXTERNAL || mode == RETR_LIST || mode == RETR_CCOMP ||
+                 mode == RETR_TREE || mode == RETR_FLOODFILL,
+             "Wrong extraction mode");
+
+    CV_Check(method,
+             method == 0 || method == CHAIN_APPROX_NONE || method == CHAIN_APPROX_SIMPLE ||
+                 method == CHAIN_APPROX_TC89_L1 || method == CHAIN_APPROX_TC89_KCOS,
+             "Wrong approximation method");
+
+    Size size = img.size();
+    CV_Assert(size.height >= 1);
+
+    shared_ptr<ContourScanner_> scanner = make_shared<ContourScanner_>();
+    scanner->image = img;
+    scanner->mode = mode;
+    scanner->offset = offset;
+    scanner->pt = Point(1, 1);
+    scanner->lnbd = Point(0, 1);
+    scanner->nbd = 2;
+    CNode& root = scanner->tree.newElem();
+    CV_Assert(root.self() == 0);
+    root.body.isHole = true;
+    root.body.brect = Rect(Point(0, 0), size);
+    scanner->ctable.fill(-1);
+    scanner->approx_method2 = scanner->approx_method1 = method;
+    if (method == CHAIN_APPROX_TC89_L1 || method == CHAIN_APPROX_TC89_KCOS)
+        scanner->approx_method1 = CV_CHAIN_CODE;
+    return scanner;
+}
+
+CNode& ContourScanner_::makeContour(schar& nbd_, const bool is_hole, const int x, const int y)
+{
+    const bool isChain = (this->approx_method1 == CV_CHAIN_CODE);  // TODO: get rid of old constant
+    const bool isDirect = (this->approx_method1 == CHAIN_APPROX_NONE);
+
+    const Point start_pt(x - (is_hole ? 1 : 0), y);
+
+    CNode& res = tree.newElem();
+    if (isChain)
+        res.body.codes.reserve(200);
+    else
+        res.body.pts.reserve(200);
+    res.body.isHole = is_hole;
+    res.body.isChain = isChain;
+    res.body.origin = start_pt + offset;
+    if (isSimple())
+    {
+        icvFetchContourEx<schar>(this->image, start_pt, MASK8_NEW, res.body, isDirect);
+    }
+    else
+    {
+        schar lval;
+        if (isInt())
+        {
+            const int start_val = this->image.at<int>(start_pt);
+            lval = start_val & MASK8_LVAL;
+            icvFetchContourEx<int>(this->image, start_pt, 0, res.body, isDirect);
+        }
+        else
+        {
+            lval = nbd_;
+            // change nbd
+            nbd_ = (nbd_ + 1) & MASK8_LVAL;
+            if (nbd_ == 0)
+                nbd_ = MASK8_BLACK | MASK8_NEW;
+            icvFetchContourEx<schar>(this->image, start_pt, lval, res.body, isDirect);
+        }
+        res.body.brect.x -= this->offset.x;
+        res.body.brect.y -= this->offset.y;
+        res.ctable_next = this->ctable[lval];
+        this->ctable[lval] = res.self();
+    }
+    const Point prev_origin = res.body.origin;
+    res.body.origin = start_pt;
+    if (this->approx_method1 != this->approx_method2)
+    {
+        CV_Assert(res.body.isChain);
+        res.body.pts = approximateChainTC89(res.body.codes, prev_origin, this->approx_method2);
+        res.body.isChain = false;
+    }
+    return res;
+}
+
+bool ContourScanner_::contourScan(const int prev, int& p, Point& last_pos, const int x, const int y)
+{
+    bool is_hole = false;
+
+    /* if not external contour */
+    if (isInt())
+    {
+        if (!(((prev & MASK_FLAGS) != 0 || prev == 0) && (p & MASK_FLAGS) == 0))
+        {
+            if ((prev & MASK_FLAGS) != 0 || ((p & MASK_FLAGS) != 0))
+                return false;
+
+            if (prev & MASK_FLAGS)
+            {
+                last_pos.x = x - 1;
+            }
+            is_hole = true;
+        }
+    }
+    else
+    {
+        if (!(prev == 0 && p == 1))
+        {
+            if (p != 0 || prev < 1)
+                return false;
+
+            if (prev & MASK8_FLAGS)
+            {
+                last_pos.x = x - 1;
+            }
+            is_hole = true;
+        }
+    }
+
+    if (mode == RETR_EXTERNAL && (is_hole || this->image.at<schar>(last_pos) > 0))
+    {
+        return false;
+    }
+
+    /* find contour parent */
+    int main_parent = -1;
+    if (isSimple() || (!is_hole && (mode == RETR_CCOMP || mode == RETR_FLOODFILL)) ||
+        last_pos.x <= 0)
+    {
+        main_parent = 0;
+    }
+    else
+    {
+        int lval;
+        if (isInt())
+            lval = this->image.at<int>(last_pos.y, last_pos.x) & MASK8_LVAL;
+        else
+            lval = this->image.at<schar>(last_pos.y, last_pos.x) & MASK8_LVAL;
+
+        main_parent = findFirstBoundingContour(last_pos, y, lval, main_parent);
+
+        // if current contour is a hole and previous contour is a hole or
+        // current contour is external and previous contour is external then
+        // the parent of the contour is the parent of the previous contour else
+        // the parent is the previous contour itself.
+        {
+            CNode& main_parent_elem = tree.elem(main_parent);
+            if (main_parent_elem.body.isHole == is_hole)
+            {
+                if (main_parent_elem.parent != -1)
+                {
+                    main_parent = main_parent_elem.parent;
+                }
+                else
+                {
+                    main_parent = 0;
+                }
+            }
+        }
+
+        // hole flag of the parent must differ from the flag of the contour
+        {
+            CNode& main_parent_elem = tree.elem(main_parent);
+            CV_Assert(main_parent_elem.body.isHole != is_hole);
+        }
+    }
+
+    last_pos.x = x - (is_hole ? 1 : 0);
+
+    schar nbd_ = this->nbd;
+    CNode& new_contour = makeContour(nbd_, is_hole, x, y);
+    if (new_contour.parent == -1)
+    {
+        tree.addChild(main_parent, new_contour.self());
+    }
+    this->pt.x = !isInt() ? (x + 1) : (x + 1 - (is_hole ? 1 : 0));
+    this->pt.y = y;
+    this->nbd = nbd_;
+    return true;
+}
+
+int ContourScanner_::findFirstBoundingContour(const Point& last_pos,
+                                              const int y,
+                                              const int lval,
+                                              int par)
+{
+    const Point end_point(last_pos.x, y);
+    int res = par;
+    int cur = ctable[lval];
+    while (cur != -1)
+    {
+        CNode& cur_elem = tree.elem(cur);
+        if (((last_pos.x - cur_elem.body.brect.x) < cur_elem.body.brect.width) &&
+            ((last_pos.y - cur_elem.body.brect.y) < cur_elem.body.brect.height))
+        {
+            if (res != -1)
+            {
+                CNode& res_elem = tree.elem(res);
+                const Point origin = res_elem.body.origin;
+                const bool isHole = res_elem.body.isHole;
+                if (isInt())
+                {
+                    if (icvTraceContour<int>(this->image, origin, end_point, isHole))
+                        break;
+                }
+                else
+                {
+                    if (icvTraceContour<schar>(this->image, origin, end_point, isHole))
+                        break;
+                }
+            }
+            res = cur;
+        }
+        cur = cur_elem.ctable_next;
+    }
+    return res;
+}
+
+int ContourScanner_::findNextX(int x, int y, int& prev, int& p)
+{
+    const int width = this->image.size().width - 1;
+    if (isInt())
+    {
+        for (; x < width &&
+               ((p = this->image.at<int>(y, x)) == prev || (p & MASK_VAL) == (prev & MASK_VAL));
+             x++)
+            prev = p;
+    }
+    else
+    {
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        if ((p = this->image.at<schar>(y, x)) != prev)
+        {
+            return x;
+        }
+        else
+        {
+            v_uint8 v_prev = vx_setall_u8((uchar)prev);
+            for (; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes())
+            {
+                v_uint8 vmask = (v_ne(vx_load(this->image.ptr<uchar>(y, x)), v_prev));
+                if (v_check_any(vmask))
+                {
+                    x += v_scan_forward(vmask);
+                    p = this->image.at<schar>(y, x);
+                    return x;
+                }
+            }
+        }
+#endif
+        for (; x < width && (p = this->image.at<schar>(y, x)) == prev; x++)
+            ;
+    }
+    return x;
+}
+
+bool ContourScanner_::findNext()
+{
+    int x = this->pt.x;
+    int y = this->pt.y;
+    int width = this->image.size().width - 1;
+    int height = this->image.size().height - 1;
+    Point last_pos = this->lnbd;
+    int prev = isInt() ? this->image.at<int>(y, x - 1) : this->image.at<schar>(y, x - 1);
+
+    for (; y < height; y++)
+    {
+        int p = 0;
+        for (; x < width; x++)
+        {
+            x = findNextX(x, y, prev, p);
+            if (x >= width)
+                break;
+            if (contourScan(prev, p, last_pos, x, y))
+            {
+                this->lnbd = last_pos;
+                return true;
+            }
+            else
+            {
+                prev = p;
+                if ((isInt() && (prev & MASK_FLAGS)) || (!isInt() && (prev & MASK8_FLAGS)))
+                {
+                    last_pos.x = x;
+                }
+            }
+        }
+        last_pos = Point(0, y + 1);
+        x = 1;
+        prev = 0;
+    }
+
+    return false;
+}
+
+//==============================================================================
+
+void cv::findContours(InputArray _image,
+                      OutputArrayOfArrays _contours,
+                      OutputArray _hierarchy,
+                      int mode,
+                      int method,
+                      Point offset)
+{
+    CV_INSTRUMENT_REGION();
+
+    // TODO: remove this block in future
+    if (method == 5 /*CV_LINK_RUNS*/)
+    {
+        CV_LOG_ONCE_WARNING(NULL,
+                            "LINK_RUNS mode has been extracted to separate function: "
+                            "cv::findContoursLinkRuns. "
+                            "Calling through cv::findContours will be removed in future.");
+        CV_CheckTrue(!_hierarchy.needed() || mode == RETR_CCOMP,
+                     "LINK_RUNS mode supports only simplified hierarchy output (mode=RETR_CCOMP)");
+        findContoursLinkRuns(_image, _contours, _hierarchy);
+        return;
+    }
+
+    // TODO: need enum value, need way to return contour starting points with chain codes
+    if (method == 0 /*CV_CHAIN_CODE*/)
+    {
+        CV_LOG_ONCE_WARNING(NULL,
+                            "Chain code output is an experimental feature and might change in "
+                            "future!");
+    }
+
+    // Sanity check: output must be of type vector<vector<Point>>
+    CV_Assert((_contours.kind() == _InputArray::STD_VECTOR_VECTOR) ||
+              (_contours.kind() == _InputArray::STD_VECTOR_MAT) ||
+              (_contours.kind() == _InputArray::STD_VECTOR_UMAT));
+
+    const int res_type = (method == 0 /*CV_CHAIN_CODE*/) ? CV_8SC1 : CV_32SC2;
+    if (!_contours.empty())
+    {
+        CV_CheckTypeEQ(_contours.type(),
+                       res_type,
+                       "Contours must have type CV_8SC1 (chain code) or CV_32SC2 (other methods)");
+    }
+
+    if (_hierarchy.needed())
+        _hierarchy.clear();
+
+    // preprocess
+    Mat image;
+    copyMakeBorder(_image, image, 1, 1, 1, 1, BORDER_CONSTANT | BORDER_ISOLATED, Scalar(0));
+    if (image.type() != CV_32SC1)
+        threshold(image, image, 0, 1, THRESH_BINARY);
+
+    // find contours
+    ContourScanner scanner = ContourScanner_::create(image, mode, method, offset + Point(-1, -1));
+    while (scanner->findNext())
+    {
+    }
+
+    contourTreeToResults(scanner->tree, res_type, _contours, _hierarchy);
+}
+
+void cv::findContours(InputArray _image,
+                      OutputArrayOfArrays _contours,
+                      int mode,
+                      int method,
+                      Point offset)
+{
+    CV_INSTRUMENT_REGION();
+    findContours(_image, _contours, noArray(), mode, method, offset);
+}
diff --git a/modules/imgproc/src/convhull.cpp b/modules/imgproc/src/convhull.cpp
index d6e02cc0fb9b..f16618ccb68f 100644
--- a/modules/imgproc/src/convhull.cpp
+++ b/modules/imgproc/src/convhull.cpp
@@ -471,7 +471,7 @@ cvConvexHull2( const CvArr* array, void* hull_storage,
     {
         ptseq = (CvSeq*)array;
         if( !CV_IS_SEQ_POINT_SET( ptseq ))
-            CV_Error( CV_StsBadArg, "Unsupported sequence type" );
+            CV_Error( cv::Error::StsBadArg, "Unsupported sequence type" );
         if( hull_storage == 0 )
             hull_storage = ptseq->storage;
     }
@@ -503,15 +503,15 @@ cvConvexHull2( const CvArr* array, void* hull_storage,
         mat = (CvMat*)hull_storage;
 
         if( (mat->cols != 1 && mat->rows != 1) || !CV_IS_MAT_CONT(mat->type))
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
                      "The hull matrix should be continuous and have a single row or a single column" );
 
         if( mat->cols + mat->rows - 1 < ptseq->total )
-            CV_Error( CV_StsBadSize, "The hull matrix size might be not enough to fit the hull" );
+            CV_Error( cv::Error::StsBadSize, "The hull matrix size might be not enough to fit the hull" );
 
         if( CV_MAT_TYPE(mat->type) != CV_SEQ_ELTYPE(ptseq) &&
            CV_MAT_TYPE(mat->type) != CV_32SC1 )
-            CV_Error( CV_StsUnsupportedFormat,
+            CV_Error( cv::Error::StsUnsupportedFormat,
                      "The hull matrix must have the same type as input or 32sC1 (integers)" );
 
         hullseq = cvMakeSeqHeaderForArray(
@@ -526,7 +526,7 @@ cvConvexHull2( const CvArr* array, void* hull_storage,
     if( total == 0 )
     {
         if( !isStorage )
-            CV_Error( CV_StsBadSize,
+            CV_Error( cv::Error::StsBadSize,
                      "Point sequence can not be empty if the output is matrix" );
         return 0;
     }
@@ -592,7 +592,7 @@ CV_IMPL CvSeq* cvConvexityDefects( const CvArr* array,
     if( CV_IS_SEQ( ptseq ))
     {
         if( !CV_IS_SEQ_POINT_SET( ptseq ))
-            CV_Error( CV_StsUnsupportedFormat,
+            CV_Error( cv::Error::StsUnsupportedFormat,
                      "Input sequence is not a sequence of points" );
         if( !storage )
             storage = ptseq->storage;
@@ -603,13 +603,13 @@ CV_IMPL CvSeq* cvConvexityDefects( const CvArr* array,
     }
 
     if( CV_SEQ_ELTYPE( ptseq ) != CV_32SC2 )
-        CV_Error( CV_StsUnsupportedFormat, "Floating-point coordinates are not supported here" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "Floating-point coordinates are not supported here" );
 
     if( CV_IS_SEQ( hull ))
     {
         int hulltype = CV_SEQ_ELTYPE( hull );
         if( hulltype != CV_SEQ_ELTYPE_PPOINT && hulltype != CV_SEQ_ELTYPE_INDEX )
-            CV_Error( CV_StsUnsupportedFormat,
+            CV_Error( cv::Error::StsUnsupportedFormat,
                      "Convex hull must represented as a sequence "
                      "of indices or sequence of pointers" );
         if( !storage )
@@ -620,15 +620,15 @@ CV_IMPL CvSeq* cvConvexityDefects( const CvArr* array,
         CvMat* mat = (CvMat*)hull;
 
         if( !CV_IS_MAT( hull ))
-            CV_Error(CV_StsBadArg, "Convex hull is neither sequence nor matrix");
+            CV_Error(cv::Error::StsBadArg, "Convex hull is neither sequence nor matrix");
 
         if( (mat->cols != 1 && mat->rows != 1) ||
            !CV_IS_MAT_CONT(mat->type) || CV_MAT_TYPE(mat->type) != CV_32SC1 )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
                      "The matrix should be 1-dimensional and continuous array of int's" );
 
         if( mat->cols + mat->rows - 1 > ptseq->total )
-            CV_Error( CV_StsBadSize, "Convex hull is larger than the point sequence" );
+            CV_Error( cv::Error::StsBadSize, "Convex hull is larger than the point sequence" );
 
         hull = cvMakeSeqHeaderForArray(
                                        CV_SEQ_KIND_CURVE|CV_MAT_TYPE(mat->type)|CV_SEQ_FLAG_CLOSED,
@@ -639,13 +639,13 @@ CV_IMPL CvSeq* cvConvexityDefects( const CvArr* array,
     is_index = CV_SEQ_ELTYPE(hull) == CV_SEQ_ELTYPE_INDEX;
 
     if( !storage )
-        CV_Error( CV_StsNullPtr, "NULL storage pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL storage pointer" );
 
     defects = cvCreateSeq( CV_SEQ_KIND_GENERIC, sizeof(CvSeq), sizeof(CvConvexityDefect), storage );
 
     if( ptseq->total < 4 || hull->total < 3)
     {
-        //CV_ERROR( CV_StsBadSize,
+        //CV_ERROR( cv::Error::StsBadSize,
         //    "point seq size must be >= 4, convex hull size must be >= 3" );
         return defects;
     }
@@ -779,7 +779,7 @@ cvCheckContourConvexity( const CvArr* array )
     if( CV_IS_SEQ(contour) )
     {
         if( !CV_IS_SEQ_POINT_SET(contour))
-            CV_Error( CV_StsUnsupportedFormat,
+            CV_Error( cv::Error::StsUnsupportedFormat,
                      "Input sequence must be polygon (closed 2d curve)" );
     }
     else
diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp
index f0ea0b5bb5e1..1d6ee1ac04d5 100644
--- a/modules/imgproc/src/corner.cpp
+++ b/modules/imgproc/src/corner.cpp
@@ -74,21 +74,21 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
 #endif // CV_TRY_AVX
             j = 0;
 
-#if CV_SIMD128
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         {
-            v_float32x4 half = v_setall_f32(0.5f);
-            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
+            v_float32 half = vx_setall_f32(0.5f);
+            for( ; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes() )
             {
-                v_float32x4 v_a, v_b, v_c, v_t;
+                v_float32 v_a, v_b, v_c, v_t;
                 v_load_deinterleave(cov + j*3, v_a, v_b, v_c);
-                v_a *= half;
-                v_c *= half;
-                v_t = v_a - v_c;
-                v_t = v_muladd(v_b, v_b, (v_t * v_t));
-                v_store(dst + j, (v_a + v_c) - v_sqrt(v_t));
+                v_a = v_mul(v_a, half);
+                v_c = v_mul(v_c, half);
+                v_t = v_sub(v_a, v_c);
+                v_t = v_muladd(v_b, v_b, (v_mul(v_t, v_t)));
+                v_store(dst + j, v_sub(v_add(v_a, v_c), v_sqrt(v_t)));
             }
         }
-#endif // CV_SIMD128
+#endif // CV_SIMD
 
         for( ; j < size.width; j++ )
         {
@@ -127,18 +127,18 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k )
 #endif // CV_TRY_AVX
             j = 0;
 
-#if CV_SIMD128
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         {
-            v_float32x4 v_k = v_setall_f32((float)k);
+            v_float32 v_k = vx_setall_f32((float)k);
 
-            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
+            for( ; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes() )
             {
-                v_float32x4 v_a, v_b, v_c;
+                v_float32 v_a, v_b, v_c;
                 v_load_deinterleave(cov + j * 3, v_a, v_b, v_c);
 
-                v_float32x4 v_ac_bb = v_a * v_c - v_b * v_b;
-                v_float32x4 v_ac = v_a + v_c;
-                v_float32x4 v_dst = v_ac_bb - v_k * v_ac * v_ac;
+                v_float32 v_ac_bb = v_sub(v_mul(v_a, v_c), v_mul(v_b, v_b));
+                v_float32 v_ac = v_add(v_a, v_c);
+                v_float32 v_dst = v_sub(v_ac_bb, v_mul(v_mul(v_k, v_ac), v_ac));
                 v_store(dst + j, v_dst);
             }
         }
@@ -282,22 +282,22 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
 #endif // CV_TRY_AVX
             j = 0;
 
-#if CV_SIMD128
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         {
-            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
+            for( ; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes() )
             {
-                v_float32x4 v_dx = v_load(dxdata + j);
-                v_float32x4 v_dy = v_load(dydata + j);
+                v_float32 v_dx = vx_load(dxdata + j);
+                v_float32 v_dy = vx_load(dydata + j);
 
-                v_float32x4 v_dst0, v_dst1, v_dst2;
-                v_dst0 = v_dx * v_dx;
-                v_dst1 = v_dx * v_dy;
-                v_dst2 = v_dy * v_dy;
+                v_float32 v_dst0, v_dst1, v_dst2;
+                v_dst0 = v_mul(v_dx, v_dx);
+                v_dst1 = v_mul(v_dx, v_dy);
+                v_dst2 = v_mul(v_dy, v_dy);
 
                 v_store_interleave(cov_data + j * 3, v_dst0, v_dst1, v_dst2);
             }
         }
-#endif // CV_SIMD128
+#endif // CV_SIMD
 
         for( ; j < size.width; j++ )
         {
@@ -693,9 +693,9 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
     if( src.depth() == CV_8U )
         factor *= 255;
     factor = 1./(factor * factor * factor);
-#if CV_SIMD128
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     float factor_f = (float)factor;
-    v_float32x4 v_factor = v_setall_f32(factor_f), v_m2 = v_setall_f32(-2.0f);
+    v_float32 v_factor = vx_setall_f32(factor_f), v_m2 = vx_setall_f32(-2.0f);
 #endif
 
     Size size = src.size();
@@ -711,18 +711,18 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
 
         j = 0;
 
-#if CV_SIMD128
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         {
-            for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes )
+            for( ; j <= size.width - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes() )
             {
-                v_float32x4 v_dx = v_load(dxdata + j);
-                v_float32x4 v_dy = v_load(dydata + j);
+                v_float32 v_dx = vx_load(dxdata + j);
+                v_float32 v_dy = vx_load(dydata + j);
 
-                v_float32x4 v_s1 = (v_dx * v_dx) * v_load(d2ydata + j);
-                v_float32x4 v_s2 = v_muladd((v_dy * v_dy),  v_load(d2xdata + j), v_s1);
-                v_float32x4 v_s3 = v_muladd((v_dy * v_dx) * v_load(dxydata + j), v_m2, v_s2);
+                v_float32 v_s1 = v_mul(v_mul(v_dx, v_dx), vx_load(d2ydata + j));
+                v_float32 v_s2 = v_muladd((v_mul(v_dy, v_dy)),  vx_load(d2xdata + j), v_s1);
+                v_float32 v_s3 = v_muladd(v_mul(v_mul(v_dy, v_dx), vx_load(dxydata + j)), v_m2, v_s2);
 
-                v_store(dstdata + j, v_s3 * v_factor);
+                v_store(dstdata + j, v_mul(v_s3, v_factor));
             }
         }
 #endif
diff --git a/modules/imgproc/src/cornersubpix.cpp b/modules/imgproc/src/cornersubpix.cpp
index 1e0841271f5f..af5bd999dce2 100644
--- a/modules/imgproc/src/cornersubpix.cpp
+++ b/modules/imgproc/src/cornersubpix.cpp
@@ -49,8 +49,8 @@ void cv::cornerSubPix( InputArray _image, InputOutputArray _corners,
     const int MAX_ITERS = 100;
     int win_w = win.width * 2 + 1, win_h = win.height * 2 + 1;
     int i, j, k;
-    int max_iters = (criteria.type & CV_TERMCRIT_ITER) ? MIN(MAX(criteria.maxCount, 1), MAX_ITERS) : MAX_ITERS;
-    double eps = (criteria.type & CV_TERMCRIT_EPS) ? MAX(criteria.epsilon, 0.) : 0;
+    int max_iters = (criteria.type & TermCriteria::MAX_ITER) ? MIN(MAX(criteria.maxCount, 1), MAX_ITERS) : MAX_ITERS;
+    double eps = (criteria.type & TermCriteria::EPS) ? MAX(criteria.epsilon, 0.) : 0;
     eps *= eps; // use square of error in comparison operations
 
     cv::Mat src = _image.getMat(), cornersmat = _corners.getMat();
@@ -96,6 +96,7 @@ void cv::cornerSubPix( InputArray _image, InputOutputArray _corners,
     for( int pt_i = 0; pt_i < count; pt_i++ )
     {
         Point2f cT = corners[pt_i], cI = cT;
+        CV_Assert( Rect(0, 0, src.cols, src.rows).contains(cT) );
         int iter = 0;
         double err = 0;
 
@@ -140,9 +141,10 @@ void cv::cornerSubPix( InputArray _image, InputOutputArray _corners,
             cI2.x = (float)(cI.x + c*scale*bb1 - b*scale*bb2);
             cI2.y = (float)(cI.y - b*scale*bb1 + a*scale*bb2);
             err = (cI2.x - cI.x) * (cI2.x - cI.x) + (cI2.y - cI.y) * (cI2.y - cI.y);
-            cI = cI2;
-            if( cI.x < 0 || cI.x >= src.cols || cI.y < 0 || cI.y >= src.rows )
+            // if new point is out of image, leave previous point as the result
+            if( !Rect(0, 0, src.cols, src.rows).contains(cI2) )
                 break;
+            cI = cI2;
         }
         while( ++iter < max_iters && err > eps );
 
diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp
index 27dfc1520c5f..2bfa705b8abc 100644
--- a/modules/imgproc/src/demosaicing.cpp
+++ b/modules/imgproc/src/demosaicing.cpp
@@ -184,25 +184,25 @@ class SIMDBayerInterpolator_8u
 
         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
         {
-            v_uint16x8 r0 = v_load((ushort*)bayer);
-            v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
-            v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
+            v_uint16x8 r0 = v_reinterpret_as_u16(v_load(bayer));
+            v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step));
+            v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2));
 
-            v_uint16x8 b1 = ((r0 << 8) >> 7) + ((r2 << 8) >> 7);
-            v_uint16x8 b0 = v_rotate_right<1>(b1) + b1;
-            b1 = v_rotate_right<1>(b1) << 1;
+            v_uint16x8 b1 = v_add(v_shr<7>(v_shl<8>(r0)), v_shr<7>(v_shl<8>(r2)));
+            v_uint16x8 b0 = v_add(v_rotate_right<1>(b1), b1);
+            b1 = v_shl<1>(v_rotate_right<1>(b1));
 
-            v_uint16x8 g0 = (r0 >> 7) + (r2 >> 7);
-            v_uint16x8 g1 = (r1 << 8) >> 7;
-            g0 += v_rotate_right<1>(g1) + g1;
-            g1 = v_rotate_right<1>(g1) << 2;
+            v_uint16x8 g0 = v_add(v_shr<7>(r0), v_shr<7>(r2));
+            v_uint16x8 g1 = v_shr<7>(v_shl<8>(r1));
+            g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1));
+            g1 = v_shl<2>(v_rotate_right<1>(g1));
 
-            r0 = r1 >> 8;
-            r1 = (v_rotate_right<1>(r0) + r0) << 2;
-            r0 = r0 << 3;
+            r0 = v_shr<8>(r1);
+            r1 = v_shl<2>(v_add(v_rotate_right<1>(r0), r0));
+            r0 = v_shl<3>(r0);
 
-            g0 = (v_mul_hi(b0, _b2y) + v_mul_hi(g0, _g2y) + v_mul_hi(r0, _r2y)) >> 2;
-            g1 = (v_mul_hi(b1, _b2y) + v_mul_hi(g1, _g2y) + v_mul_hi(r1, _r2y)) >> 2;
+            g0 = v_shr<2>(v_add(v_add(v_mul_hi(b0, _b2y), v_mul_hi(g0, _g2y)), v_mul_hi(r0, _r2y)));
+            g1 = v_shr<2>(v_add(v_add(v_mul_hi(b1, _b2y), v_mul_hi(g1, _g2y)), v_mul_hi(r1, _r2y)));
             v_uint8x16 pack_lo, pack_hi;
             v_zip(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g0)),
                   v_pack_u(v_reinterpret_as_s16(g1), v_reinterpret_as_s16(g1)),
@@ -265,35 +265,35 @@ class SIMDBayerInterpolator_8u
 
         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
         {
-            v_uint16x8 r0 = v_load((ushort*)bayer);
-            v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
-            v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
+            v_uint16x8 r0 = v_reinterpret_as_u16(v_load(bayer));
+            v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step));
+            v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2));
 
-            v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo);
+            v_uint16x8 b1 = v_add(v_and(r0, masklo), v_and(r2, masklo));
             v_uint16x8 nextb1 = v_rotate_right<1>(b1);
-            v_uint16x8 b0 = b1 + nextb1;
-            b1 = (nextb1 + delta1) >> 1;
-            b0 = (b0 + delta2) >> 2;
+            v_uint16x8 b0 = v_add(b1, nextb1);
+            b1 = v_shr<1>(v_add(nextb1, delta1));
+            b0 = v_shr<2>(v_add(b0, delta2));
             // b0 b2 ... b14 b1 b3 ... b15
             b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
 
-            v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8);
-            v_uint16x8 g1 = r1 & masklo;
-            g0 += v_rotate_right<1>(g1) + g1;
+            v_uint16x8 g0 = v_add(v_shr<8>(r0), v_shr<8>(r2));
+            v_uint16x8 g1 = v_and(r1, masklo);
+            g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1));
             g1 = v_rotate_right<1>(g1);
-            g0 = (g0 + delta2) >> 2;
+            g0 = v_shr<2>(v_add(g0, delta2));
             // g0 g2 ... g14 g1 g3 ... g15
             g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1)));
 
-            r0 = r1 >> 8;
-            r1 = v_rotate_right<1>(r0) + r0;
-            r1 = (r1 + delta1) >> 1;
+            r0 = v_shr<8>(r1);
+            r1 = v_add(v_rotate_right<1>(r0), r0);
+            r1 = v_shr<1>(v_add(r1, delta1));
             // r0 r2 ... r14 r1 r3 ... r15
             r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
 
-            b1 = (b0 ^ r0) & mask;
-            b0 = b0 ^ b1;
-            r0 = r0 ^ b1;
+            b1 = v_and(v_xor(b0, r0), mask);
+            b0 = v_xor(b0, b1);
+            r0 = v_xor(r0, b1);
 
             // b1 g1 b3 g3 b5 g5...
             v_uint8x16 pack_lo, pack_hi;
@@ -398,35 +398,35 @@ class SIMDBayerInterpolator_8u
 
         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 )
         {
-            v_uint16x8 r0 = v_load((ushort*)bayer);
-            v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
-            v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
+            v_uint16x8 r0 = v_reinterpret_as_u16(v_load(bayer));
+            v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step));
+            v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2));
 
-            v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo);
+            v_uint16x8 b1 = v_add(v_and(r0, masklo), v_and(r2, masklo));
             v_uint16x8 nextb1 = v_rotate_right<1>(b1);
-            v_uint16x8 b0 = b1 + nextb1;
-            b1 = (nextb1 + delta1) >> 1;
-            b0 = (b0 + delta2) >> 2;
+            v_uint16x8 b0 = v_add(b1, nextb1);
+            b1 = v_shr<1>(v_add(nextb1, delta1));
+            b0 = v_shr<2>(v_add(b0, delta2));
             // b0 b2 ... b14 b1 b3 ... b15
             b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
 
-            v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8);
-            v_uint16x8 g1 = r1 & masklo;
-            g0 += v_rotate_right<1>(g1) + g1;
+            v_uint16x8 g0 = v_add(v_shr<8>(r0), v_shr<8>(r2));
+            v_uint16x8 g1 = v_and(r1, masklo);
+            g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1));
             g1 = v_rotate_right<1>(g1);
-            g0 = (g0 + delta2) >> 2;
+            g0 = v_shr<2>(v_add(g0, delta2));
             // g0 g2 ... g14 g1 g3 ... g15
             g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1)));
 
-            r0 = r1 >> 8;
-            r1 = v_rotate_right<1>(r0) + r0;
-            r1 = (r1 + delta1) >> 1;
+            r0 = v_shr<8>(r1);
+            r1 = v_add(v_rotate_right<1>(r0), r0);
+            r1 = v_shr<1>(v_add(r1, delta1));
             // r0 r2 ... r14 r1 r3 ... r15
             r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
 
-            b1 = (b0 ^ r0) & mask;
-            b0 = b0 ^ b1;
-            r0 = r0 ^ b1;
+            b1 = v_and(v_xor(b0, r0), mask);
+            b0 = v_xor(b0, b1);
+            r0 = v_xor(r0, b1);
 
             // b1 g1 b3 g3 b5 g5...
             v_uint8x16 pack_lo, pack_hi;
@@ -494,44 +494,44 @@ class SIMDBayerInterpolator_8u
              B G B G | B G B G | B G B G | B G B G
              */
 
-            v_uint16x8 r0 = v_load((ushort*)bayer);
-            v_uint16x8 r1 = v_load((ushort*)(bayer+bayer_step));
-            v_uint16x8 r2 = v_load((ushort*)(bayer+bayer_step*2));
+            v_uint16x8 r0 = v_reinterpret_as_u16(v_load(bayer));
+            v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step));
+            v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2));
 
-            v_uint16x8 b1 = (r0 & masklow) + (r2 & masklow);
+            v_uint16x8 b1 = v_add(v_and(r0, masklow), v_and(r2, masklow));
             v_uint16x8 nextb1 = v_rotate_right<1>(b1);
-            v_uint16x8 b0 = b1 + nextb1;
-            b1 = (nextb1 + delta1) >> 1;
-            b0 = (b0 + delta2) >> 2;
+            v_uint16x8 b0 = v_add(b1, nextb1);
+            b1 = v_shr<1>(v_add(nextb1, delta1));
+            b0 = v_shr<2>(v_add(b0, delta2));
             // b0 b2 ... b14 b1 b3 ... b15
             b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1)));
 
             // vertical sum
-            v_uint16x8 r0g = r0 >> 8;
-            v_uint16x8 r2g = r2 >> 8;
-            v_uint16x8 sumv = ((r0g + r2g) + delta1) >> 1;
+            v_uint16x8 r0g = v_shr<8>(r0);
+            v_uint16x8 r2g = v_shr<8>(r2);
+            v_uint16x8 sumv = v_shr<1>(v_add(v_add(r0g, r2g), delta1));
             // horizontal sum
-            v_uint16x8 g1 = r1 & masklow;
+            v_uint16x8 g1 = v_and(r1, masklow);
             v_uint16x8 nextg1 = v_rotate_right<1>(g1);
-            v_uint16x8 sumg = (g1 + nextg1 + delta1) >> 1;
+            v_uint16x8 sumg = v_shr<1>(v_add(v_add(g1, nextg1), delta1));
 
             // gradients
-            v_uint16x8 gradv = (r0g - r2g) + (r2g - r0g);
-            v_uint16x8 gradg = (nextg1 - g1) + (g1 - nextg1);
-            v_uint16x8 gmask = gradg > gradv;
-            v_uint16x8 g0 = (gmask & sumv) + (sumg & (gmask ^ full));
+            v_uint16x8 gradv = v_add(v_sub(r0g, r2g), v_sub(r2g, r0g));
+            v_uint16x8 gradg = v_add(v_sub(nextg1, g1), v_sub(g1, nextg1));
+            v_uint16x8 gmask = v_gt(gradg, gradv);
+            v_uint16x8 g0 = v_add(v_and(gmask, sumv), v_and(sumg, v_xor(gmask, full)));
             // g0 g2 ... g14 g1 g3 ...
             g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(nextg1)));
 
-            r0 = r1 >> 8;
-            r1 = v_rotate_right<1>(r0) + r0;
-            r1 = (r1 + delta1) >> 1;
+            r0 = v_shr<8>(r1);
+            r1 = v_add(v_rotate_right<1>(r0), r0);
+            r1 = v_shr<1>(v_add(r1, delta1));
             // r0 r2 ... r14 r1 r3 ... r15
             r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1)));
 
-            b1 = (b0 ^ r0) & mask;
-            b0 = b0 ^ b1;
-            r0 = r0 ^ b1;
+            b1 = v_and(v_xor(b0, r0), mask);
+            b0 = v_xor(b0, b1);
+            r0 = v_xor(r0, b1);
 
             // b1 g1 b3 g3 b5 g5...
             v_uint8x16 pack_lo, pack_hi;
@@ -1060,19 +1060,19 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
 
                 v_uint16x8 b0, b1, b2, b3, b4, b5, b6;
 
-                b0 = (v_absdiff(s2, s8)<<1) + v_absdiff(s1, s7) + v_absdiff(s3, s9);
-                b1 = (v_absdiff(s4, s6)<<1) + v_absdiff(s1, s3) + v_absdiff(s7, s9);
-                b2 = v_absdiff(s3, s7)<<1;
-                b3 = v_absdiff(s1, s9)<<1;
+                b0 = v_add(v_add(v_shl<1>(v_absdiff(s2, s8)), v_absdiff(s1, s7)), v_absdiff(s3, s9));
+                b1 = v_add(v_add(v_shl<1>(v_absdiff(s4, s6)), v_absdiff(s1, s3)), v_absdiff(s7, s9));
+                b2 = v_shl<1>(v_absdiff(s3, s7));
+                b3 = v_shl<1>(v_absdiff(s1, s9));
 
                 v_store(brow, b0);
                 v_store(brow + N, b1);
                 v_store(brow + N2, b2);
                 v_store(brow + N3, b3);
 
-                b4 = b2 + v_absdiff(s2, s4) + v_absdiff(s6, s8);
-                b5 = b3 + v_absdiff(s2, s6) + v_absdiff(s4, s8);
-                b6 = (s2 + s4 + s6 + s8)>>1;
+                b4 = v_add(v_add(b2, v_absdiff(s2, s4)), v_absdiff(s6, s8));
+                b5 = v_add(v_add(b3, v_absdiff(s2, s6)), v_absdiff(s4, s8));
+                b6 = v_shr<1>(v_add(v_add(v_add(s2, s4), s6), s8));
 
                 v_store(brow + N4, b4);
                 v_store(brow + N5, b5);
@@ -1279,7 +1279,7 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
             v_uint16x8 one = v_setall_u16(1), z = v_setzero_u16();
             v_float32x4 _0_5 = v_setall_f32(0.5f);
 
-            #define v_merge_u16(a, b) (((a) & v_reinterpret_as_u16(emask)) | ((b) & v_reinterpret_as_u16(omask))) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
+            #define v_merge_u16(a, b) (v_or((v_and((a), v_reinterpret_as_u16(emask))), (v_and((b), v_reinterpret_as_u16(omask))))) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
             #define v_cvt_s16f32_lo(a)  v_cvt_f32(v_expand_low(v_reinterpret_as_s16(a)))   //(1,2,3,4,5,6,7,8) => (1f,2f,3f,4f)
             #define v_cvt_s16f32_hi(a)  v_cvt_f32(v_expand_high(v_reinterpret_as_s16(a)))   //(1,2,3,4,5,6,7,8) => (5f,6f,7f,8f)
 
@@ -1287,16 +1287,16 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
             for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 )
             {
                 //int gradN = brow0[0] + brow1[0];
-                v_uint16x8 gradN = v_load(brow0) + v_load(brow1);
+                v_uint16x8 gradN = v_add(v_load(brow0), v_load(brow1));
 
                 //int gradS = brow1[0] + brow2[0];
-                v_uint16x8 gradS = v_load(brow1) + v_load(brow2);
+                v_uint16x8 gradS = v_add(v_load(brow1), v_load(brow2));
 
                 //int gradW = brow1[N-1] + brow1[N];
-                v_uint16x8 gradW = v_load(brow1+N-1) + v_load(brow1+N);
+                v_uint16x8 gradW = v_add(v_load(brow1 + N - 1), v_load(brow1 + N));
 
                 //int gradE = brow1[N+1] + brow1[N];
-                v_uint16x8 gradE = v_load(brow1+N+1) + v_load(brow1+N);
+                v_uint16x8 gradE = v_add(v_load(brow1 + N + 1), v_load(brow1 + N));
 
                 //int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
                 //int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
@@ -1307,14 +1307,14 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
 
                 //int gradNE = brow0[N4+1] + brow1[N4];
                 //int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];
-                grad0 = v_load(brow0+N4+1) + v_load(brow1+N4);
-                grad1 = v_load(brow0+N2) + v_load(brow0+N2+1) + v_load(brow1+N2) + v_load(brow1+N2+1);
+                grad0 = v_add(v_load(brow0 + N4 + 1), v_load(brow1 + N4));
+                grad1 = v_add(v_add(v_add(v_load(brow0 + N2), v_load(brow0 + N2 + 1)), v_load(brow1 + N2)), v_load(brow1 + N2 + 1));
                 v_uint16x8 gradNE = v_merge_u16(grad0, grad1);
 
                 //int gradSW = brow1[N4] + brow2[N4-1];
                 //int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
-                grad0 = v_load(brow2+N4-1) + v_load(brow1+N4);
-                grad1 = v_load(brow2+N2) + v_load(brow2+N2-1) + v_load(brow1+N2) + v_load(brow1+N2-1);
+                grad0 = v_add(v_load(brow2 + N4 - 1), v_load(brow1 + N4));
+                grad1 = v_add(v_add(v_add(v_load(brow2 + N2), v_load(brow2 + N2 - 1)), v_load(brow1 + N2)), v_load(brow1 + N2 - 1));
                 v_uint16x8 gradSW = v_merge_u16(grad0, grad1);
 
                 minGrad = v_min(v_min(minGrad, gradNE), gradSW);
@@ -1322,21 +1322,21 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
 
                 //int gradNW = brow0[N5-1] + brow1[N5];
                 //int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
-                grad0 = v_load(brow0+N5-1) + v_load(brow1+N5);
-                grad1 = v_load(brow0+N3) + v_load(brow0+N3-1) + v_load(brow1+N3) + v_load(brow1+N3-1);
+                grad0 = v_add(v_load(brow0 + N5 - 1), v_load(brow1 + N5));
+                grad1 = v_add(v_add(v_add(v_load(brow0 + N3), v_load(brow0 + N3 - 1)), v_load(brow1 + N3)), v_load(brow1 + N3 - 1));
                 v_uint16x8 gradNW = v_merge_u16(grad0, grad1);
 
                 //int gradSE = brow1[N5] + brow2[N5+1];
                 //int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
-                grad0 = v_load(brow2+N5+1) + v_load(brow1+N5);
-                grad1 = v_load(brow2+N3) + v_load(brow2+N3+1) + v_load(brow1+N3) + v_load(brow1+N3+1);
+                grad0 = v_add(v_load(brow2 + N5 + 1), v_load(brow1 + N5));
+                grad1 = v_add(v_add(v_add(v_load(brow2 + N3), v_load(brow2 + N3 + 1)), v_load(brow1 + N3)), v_load(brow1 + N3 + 1));
                 v_uint16x8 gradSE = v_merge_u16(grad0, grad1);
 
                 minGrad = v_min(v_min(minGrad, gradNW), gradSE);
                 maxGrad = v_max(v_max(maxGrad, gradNW), gradSE);
 
                 //int T = minGrad + maxGrad/2;
-                v_uint16x8 T = v_max((maxGrad >> 1), one) + minGrad;
+                v_uint16x8 T = v_add(v_max((v_shr<1>(maxGrad)), one), minGrad);
 
                 v_uint16x8 RGs = z, GRs = z, Bs = z, ng = z;
 
@@ -1361,133 +1361,135 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
                 v_uint16x8 t0, t1, mask;
 
                 // gradN ***********************************************
-                mask = (T > gradN); // mask = T>gradN
-                ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask));     // ng += (T>gradN)
+                mask = (v_gt(T, gradN)); // mask = T>gradN
+                ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask)));     // ng += (T>gradN)
 
-                t0 = (x3 << 1);                                 // srow[-bstep]*2
-                t1 = v_load_expand(srow - bstep*2) + x0;  // srow[-bstep*2] + srow[0]
+                t0 = (v_shl<1>(x3));                                 // srow[-bstep]*2
+                t1 = v_add(v_load_expand(srow - bstep * 2), x0);  // srow[-bstep*2] + srow[0]
 
                 // RGs += (srow[-bstep*2] + srow[0]) * (T>gradN)
-                RGs += (t1 & mask);
+                RGs = v_add(RGs, v_and(t1, mask));
                 // GRs += {srow[-bstep]*2; (srow[-bstep*2-1] + srow[-bstep*2+1])} * (T>gradN)
-                GRs += (v_merge_u16(t0, x2 + x4) & mask);
+                GRs = v_add(GRs, (v_and(v_merge_u16(t0, v_add(x2, x4)), mask)));
                 // Bs  += {(srow[-bstep-1]+srow[-bstep+1]); srow[-bstep]*2 } * (T>gradN)
-                Bs  += (v_merge_u16(x1 + x5, t0) & mask);
+                Bs = v_add(Bs, v_and(v_merge_u16(v_add(x1, x5), t0), mask));
 
                 // gradNE **********************************************
-                mask = (T > gradNE); // mask = T>gradNE
-                ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask));     // ng += (T>gradNE)
+                mask = (v_gt(T, gradNE)); // mask = T>gradNE
+                ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask)));     // ng += (T>gradNE)
 
-                t0 = (x5 << 1);                                    // srow[-bstep+1]*2
-                t1 = v_load_expand(srow - bstep*2+2) + x0;   // srow[-bstep*2+2] + srow[0]
+                t0 = (v_shl<1>(x5));                                    // srow[-bstep+1]*2
+                t1 = v_add(v_load_expand(srow - bstep * 2 + 2), x0);   // srow[-bstep*2+2] + srow[0]
 
                 // RGs += {(srow[-bstep*2+2] + srow[0]); srow[-bstep+1]*2} * (T>gradNE)
-                RGs += (v_merge_u16(t1, t0) & mask);
+                RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask));
                 // GRs += {brow0[N6+1]; (srow[-bstep*2+1] + srow[1])} * (T>gradNE)
-                GRs += (v_merge_u16(v_load(brow0+N6+1), x4 + x7) & mask);
+                GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow0+N6+1), v_add(x4, x7)), mask));
                 // Bs  += {srow[-bstep+1]*2; (srow[-bstep] + srow[-bstep+2])}  * (T>gradNE)
-                Bs  += (v_merge_u16(t0, x3 + x6) & mask);
+                Bs = v_add(Bs, v_and(v_merge_u16(t0, v_add(x3, x6)), mask));
 
                 // gradE ***********************************************
-                mask = (T > gradE);  // mask = T>gradE
-                ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask));     // ng += (T>gradE)
+                mask = (v_gt(T, gradE));  // mask = T>gradE
+                ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask)));     // ng += (T>gradE)
 
-                t0 = (x7 << 1);                         // srow[1]*2
-                t1 = v_load_expand(srow +2) + x0; // srow[2] + srow[0]
+                t0 = (v_shl<1>(x7));                         // srow[1]*2
+                t1 = v_add(v_load_expand(srow + 2), x0); // srow[2] + srow[0]
 
                 // RGs += (srow[2] + srow[0]) * (T>gradE)
-                RGs += (t1 & mask);
+                RGs = v_add(RGs, v_and(t1, mask));
                 // GRs += (srow[1]*2) * (T>gradE)
-                GRs += (t0 & mask);
+                GRs = v_add(GRs, v_and(t0, mask));
                 // Bs  += {(srow[-bstep+1]+srow[bstep+1]); (srow[-bstep+2]+srow[bstep+2])} * (T>gradE)
-                Bs  += (v_merge_u16(x5 + x9, x6 + x8) & mask);
+                Bs = v_add(Bs, v_and(v_merge_u16(v_add(x5, x9), v_add(x6, x8)), mask));
 
                 // gradSE **********************************************
-                mask = (T > gradSE);  // mask = T>gradSE
-                ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask));     // ng += (T>gradSE)
+                mask = (v_gt(T, gradSE));  // mask = T>gradSE
+                ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask)));     // ng += (T>gradSE)
 
-                t0 = (x9 << 1);                                 // srow[bstep+1]*2
-                t1 = v_load_expand(srow + bstep*2+2) + x0; // srow[bstep*2+2] + srow[0]
+                t0 = (v_shl<1>(x9));                                 // srow[bstep+1]*2
+                t1 = v_add(v_load_expand(srow + bstep * 2 + 2), x0); // srow[bstep*2+2] + srow[0]
 
                 // RGs += {(srow[bstep*2+2] + srow[0]); srow[bstep+1]*2} * (T>gradSE)
-                RGs += (v_merge_u16(t1, t0) & mask);
+                RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask));
                 // GRs += {brow2[N6+1]; (srow[1]+srow[bstep*2+1])} * (T>gradSE)
-                GRs += (v_merge_u16(v_load(brow2+N6+1), x7 + x10) & mask);
+                GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow2+N6+1), v_add(x7, x10)), mask));
                 // Bs  += {srow[bstep+1]*2; (srow[bstep+2]+srow[bstep])} * (T>gradSE)
-                Bs  += (v_merge_u16((x9 << 1), x8 + x11) & mask);
+                Bs = v_add(Bs, v_and(v_merge_u16((v_shl<1>(x9)), v_add(x8, x11)), mask));
 
                 // gradS ***********************************************
-                mask = (T > gradS);  // mask = T>gradS
-                ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask));     // ng += (T>gradS)
+                mask = (v_gt(T, gradS));  // mask = T>gradS
+                ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask)));     // ng += (T>gradS)
 
-                t0 = (x11 << 1);                             // srow[bstep]*2
-                t1 = v_load_expand(srow + bstep*2) + x0; // srow[bstep*2]+srow[0]
+                t0 = (v_shl<1>(x11));                             // srow[bstep]*2
+                t1 = v_add(v_load_expand(srow + bstep * 2), x0); // srow[bstep*2]+srow[0]
 
                 // RGs += (srow[bstep*2]+srow[0]) * (T>gradS)
-                RGs += (t1 & mask);
+                RGs = v_add(RGs, v_and(t1, mask));
                 // GRs += {srow[bstep]*2; (srow[bstep*2+1]+srow[bstep*2-1])} * (T>gradS)
-                GRs += (v_merge_u16(t0, x10 + x12) & mask);
+                GRs = v_add(GRs, v_and(v_merge_u16(t0, v_add(x10, x12)), mask));
                 // Bs  += {(srow[bstep+1]+srow[bstep-1]); srow[bstep]*2} * (T>gradS)
-                Bs  += (v_merge_u16(x9 + x13, t0) & mask);
+                Bs = v_add(Bs, v_and(v_merge_u16(v_add(x9, x13), t0), mask));
 
                 // gradSW **********************************************
-                mask = (T > gradSW);  // mask = T>gradSW
-                ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask));     // ng += (T>gradSW)
+                mask = (v_gt(T, gradSW));  // mask = T>gradSW
+                ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask)));     // ng += (T>gradSW)
 
-                t0 = (x13 << 1);                                // srow[bstep-1]*2
-                t1 = v_load_expand(srow + bstep*2-2) + x0; // srow[bstep*2-2]+srow[0]
+                t0 = (v_shl<1>(x13));                                // srow[bstep-1]*2
+                t1 = v_add(v_load_expand(srow + bstep * 2 - 2), x0); // srow[bstep*2-2]+srow[0]
 
                 // RGs += {(srow[bstep*2-2]+srow[0]); srow[bstep-1]*2} * (T>gradSW)
-                RGs += (v_merge_u16(t1, t0) & mask);
+                RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask));
                 // GRs += {brow2[N6-1]; (srow[bstep*2-1]+srow[-1])} * (T>gradSW)
-                GRs += (v_merge_u16(v_load(brow2+N6-1), x12 + x15) & mask);
+                GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow2+N6-1), v_add(x12, x15)), mask));
                 // Bs  += {srow[bstep-1]*2; (srow[bstep]+srow[bstep-2])} * (T>gradSW)
-                Bs  += (v_merge_u16(t0, x11 + x14) & mask);
+                Bs = v_add(Bs, v_and(v_merge_u16(t0, v_add(x11, x14)), mask));
 
                 // gradW ***********************************************
-                mask = (T > gradW);  // mask = T>gradW
-                ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask));     // ng += (T>gradW)
+                mask = (v_gt(T, gradW));  // mask = T>gradW
+                ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask)));     // ng += (T>gradW)
 
-                t0 = (x15 << 1);                         // srow[-1]*2
-                t1 = v_load_expand(srow -2) + x0; // srow[-2]+srow[0]
+                t0 = (v_shl<1>(x15));                         // srow[-1]*2
+                t1 = v_add(v_load_expand(srow - 2), x0); // srow[-2]+srow[0]
 
                 // RGs += (srow[-2]+srow[0]) * (T>gradW)
-                RGs += (t1 & mask);
+                RGs = v_add(RGs, v_and(t1, mask));
                 // GRs += (srow[-1]*2) * (T>gradW)
-                GRs += (t0 & mask);
+                GRs = v_add(GRs, v_and(t0, mask));
                 // Bs  += {(srow[-bstep-1]+srow[bstep-1]); (srow[bstep-2]+srow[-bstep-2])} * (T>gradW)
-                Bs  += (v_merge_u16(x1 + x13, x14 + x16) & mask);
+                Bs = v_add(Bs, v_and(v_merge_u16(v_add(x1, x13), v_add(x14, x16)), mask));
 
                 // gradNW **********************************************
-                mask = (T > gradNW);  // mask = T>gradNW
-                ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask));     // ng += (T>gradNW)
+                mask = (v_gt(T, gradNW));  // mask = T>gradNW
+                ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask)));     // ng += (T>gradNW)
 
-                t0 = (x1 << 1);                                 // srow[-bstep-1]*2
-                t1 = v_load_expand(srow -bstep*2-2) + x0; // srow[-bstep*2-2]+srow[0]
+                t0 = (v_shl<1>(x1));                                 // srow[-bstep-1]*2
+                t1 = v_add(v_load_expand(srow - bstep * 2 - 2), x0); // srow[-bstep*2-2]+srow[0]
 
                 // RGs += {(srow[-bstep*2-2]+srow[0]); srow[-bstep-1]*2} * (T>gradNW)
-                RGs += (v_merge_u16(t1, t0) & mask);
+                RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask));
                 // GRs += {brow0[N6-1]; (srow[-bstep*2-1]+srow[-1])} * (T>gradNW)
-                GRs += (v_merge_u16(v_load(brow0+N6-1), x2 + x15) & mask);
+                GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow0+N6-1), v_add(x2, x15)), mask));
                 // Bs  += {srow[-bstep-1]*2; (srow[-bstep]+srow[-bstep-2])} * (T>gradNW)
-                Bs  += (v_merge_u16((x1 << 1), x3 + x16) & mask);
+                Bs = v_add(Bs, v_and(v_merge_u16(v_shl<1>(x1), v_add(x3, x16)), mask));
 
-                v_float32x4 ngf0 = _0_5 / v_cvt_s16f32_lo(ng);
-                v_float32x4 ngf1 = _0_5 / v_cvt_s16f32_hi(ng);
+                v_float32x4 ngf0 = v_div(_0_5, v_cvt_s16f32_lo(ng));
+                v_float32x4 ngf1 = v_div(_0_5, v_cvt_s16f32_hi(ng));
 
                 // now interpolate r, g & b
-                t0 = v_reinterpret_as_u16(v_reinterpret_as_s16(GRs) - v_reinterpret_as_s16(RGs));
-                t1 = v_reinterpret_as_u16(v_reinterpret_as_s16(Bs) -  v_reinterpret_as_s16(RGs));
+                t0 = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(GRs), v_reinterpret_as_s16(RGs)));
+                t1 = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(Bs), v_reinterpret_as_s16(RGs)));
 
-                t0 = v_reinterpret_as_u16(v_reinterpret_as_s16(x0) +
+                t0 = v_reinterpret_as_u16(
+                    v_add(v_reinterpret_as_s16(x0),
                         v_pack(
-                          v_round(v_cvt_s16f32_lo(t0) * ngf0),
-                          v_round(v_cvt_s16f32_hi(t0) * ngf1)));
+                            v_round(v_mul(v_cvt_s16f32_lo(t0), ngf0)),
+                            v_round(v_mul(v_cvt_s16f32_hi(t0), ngf1)))));
 
-                t1 = v_reinterpret_as_u16(v_reinterpret_as_s16(x0) +
+                t1 = v_reinterpret_as_u16(
+                    v_add(v_reinterpret_as_s16(x0),
                         v_pack(
-                          v_round(v_cvt_s16f32_lo(t1) * ngf0),
-                          v_round(v_cvt_s16f32_hi(t1) * ngf1)));
+                            v_round(v_mul(v_cvt_s16f32_lo(t1), ngf0)),
+                            v_round(v_mul(v_cvt_s16f32_hi(t1), ngf1)))));
 
                 x1 = v_merge_u16(x0, t0);
                 x2 = v_merge_u16(t0, x0);
@@ -1706,7 +1708,7 @@ void cv::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn)
         else if( depth == CV_16U )
             Bayer2Gray_<ushort, SIMDBayerStubInterpolator_<ushort> >(src, dst, code);
         else
-            CV_Error(CV_StsUnsupportedFormat, "Bayer->Gray demosaicing only supports 8u and 16u types");
+            CV_Error(cv::Error::StsUnsupportedFormat, "Bayer->Gray demosaicing only supports 8u and 16u types");
         break;
 
     case COLOR_BayerBG2BGRA: case COLOR_BayerGB2BGRA: case COLOR_BayerRG2BGRA: case COLOR_BayerGR2BGRA:
@@ -1733,7 +1735,7 @@ void cv::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn)
                 else if( depth == CV_16U )
                     Bayer2RGB_<ushort, SIMDBayerStubInterpolator_<ushort> >(src, dst_, code);
                 else
-                    CV_Error(CV_StsUnsupportedFormat, "Bayer->RGB demosaicing only supports 8u and 16u types");
+                    CV_Error(cv::Error::StsUnsupportedFormat, "Bayer->RGB demosaicing only supports 8u and 16u types");
             }
             else
             {
@@ -1756,11 +1758,11 @@ void cv::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn)
         else if (depth == CV_16U)
             Bayer2RGB_EdgeAware_T<ushort, SIMDBayerStubInterpolator_<ushort> >(src, dst, code);
         else
-            CV_Error(CV_StsUnsupportedFormat, "Bayer->RGB Edge-Aware demosaicing only currently supports 8u and 16u types");
+            CV_Error(cv::Error::StsUnsupportedFormat, "Bayer->RGB Edge-Aware demosaicing only currently supports 8u and 16u types");
 
         break;
 
     default:
-        CV_Error( CV_StsBadFlag, "Unknown / unsupported color conversion code" );
+        CV_Error( cv::Error::StsBadFlag, "Unknown / unsupported color conversion code" );
     }
 }
diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp
index 31803036ce85..cc9db282fd0b 100644
--- a/modules/imgproc/src/deriv.cpp
+++ b/modules/imgproc/src/deriv.cpp
@@ -101,7 +101,7 @@ static void getSobelKernels( OutputArray _kx, OutputArray _ky,
     Mat ky = _ky.getMat();
 
     if( _ksize % 2 == 0 || _ksize > 31 )
-        CV_Error( CV_StsOutOfRange, "The kernel size must be odd and not larger than 31" );
+        CV_Error( cv::Error::StsOutOfRange, "The kernel size must be odd and not larger than 31" );
     std::vector<int> kerI(std::max(ksizeX, ksizeY) + 1);
 
     CV_Assert( dx >= 0 && dy >= 0 && dx+dy > 0 );
@@ -583,9 +583,9 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
                                            "BORDER_REFLECT_101" };
 
         String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUS=%d%s%s"
-                                 " -D convertToWT=%s -D convertToDT=%s"
-                                 " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s"
-                                 " -D srcT=%s -D dstT=%s -D WT=%s"
+                                 " -D CONVERT_TO_WT=%s -D CONVERT_TO_DT=%s"
+                                 " -D %s -D SRC_T1=%s -D DST_T1=%s -D WT1=%s"
+                                 " -D SRC_T=%s -D DST_T=%s -D WT=%s"
                                  " -D CN=%d ",
                                  (int)lt2[0], (int)lt2[1], kernelX.cols / 2,
                                  ocl::kernelToStr(kernelX, wdepth, "KERNEL_MATRIX_X").c_str(),
@@ -627,8 +627,8 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
     char cvt[2][50];
     ocl::Kernel k("sumConvert", ocl::imgproc::laplacian5_oclsrc,
                   format("-D ONLY_SUM_CONVERT "
-                         "-D srcT=%s -D WT=%s -D dstT=%s -D coeffT=%s -D wdepth=%d "
-                         "-D convertToWT=%s -D convertToDT=%s%s",
+                         "-D SRC_T=%s -D WT=%s -D DST_T=%s -D COEFF_T=%s -D WDEPTH=%d "
+                         "-D CONVERT_TO_WT=%s -D CONVERT_TO_DT=%s%s",
                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
                          ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
                          ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
diff --git a/modules/imgproc/src/distransform.cpp b/modules/imgproc/src/distransform.cpp
index e88ba44cfbfc..6a7026c8c802 100644
--- a/modules/imgproc/src/distransform.cpp
+++ b/modules/imgproc/src/distransform.cpp
@@ -45,24 +45,23 @@ namespace cv
 {
 
 static const int DIST_SHIFT = 16;
-static const int INIT_DIST0 = INT_MAX;
-static const int DIST_MAX   = (INT_MAX >> 2);
 #define  CV_FLT_TO_FIX(x,n)  cvRound((x)*(1<<(n)))
 
 static void
-initTopBottom( Mat& temp, int border )
+initTopBottom( Mat& temp, int border, unsigned int value )
 {
     Size size = temp.size();
+    unsigned int* ttop = (unsigned int*)temp.ptr<int>(0);
+    unsigned int* tbottom = (unsigned int*)temp.ptr<int>(size.height - 1);
     for( int i = 0; i < border; i++ )
     {
-        int* ttop = temp.ptr<int>(i);
-        int* tbottom = temp.ptr<int>(size.height - i - 1);
-
         for( int j = 0; j < size.width; j++ )
         {
-            ttop[j] = INIT_DIST0;
-            tbottom[j] = INIT_DIST0;
+            ttop[j] = value;
+            tbottom[j] = value;
         }
+        ttop += size.width;
+        tbottom -= size.width;
     }
 }
 
@@ -74,26 +73,26 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
     int i, j;
     const unsigned int HV_DIST = CV_FLT_TO_FIX( metrics[0], DIST_SHIFT );
     const unsigned int DIAG_DIST = CV_FLT_TO_FIX( metrics[1], DIST_SHIFT );
+    const unsigned int DIST_MAX = UINT_MAX - DIAG_DIST;
     const float scale = 1.f/(1 << DIST_SHIFT);
 
     const uchar* src = _src.ptr();
     int* temp = _temp.ptr<int>();
-    float* dist = _dist.ptr<float>();
+    float* dist = _dist.ptr<float>(_dist.rows - 1);
     int srcstep = (int)(_src.step/sizeof(src[0]));
     int step = (int)(_temp.step/sizeof(temp[0]));
     int dststep = (int)(_dist.step/sizeof(dist[0]));
     Size size = _src.size();
 
-    initTopBottom( _temp, BORDER );
+    initTopBottom( _temp, BORDER, DIST_MAX );
 
     // forward pass
+    unsigned int* tmp = (unsigned int*)(temp + BORDER*step) + BORDER;
+    const uchar* s = src;
     for( i = 0; i < size.height; i++ )
     {
-        const uchar* s = src + i*srcstep;
-        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
-
         for( j = 0; j < BORDER; j++ )
-            tmp[-j-1] = tmp[size.width + j] = INIT_DIST0;
+            tmp[-j-1] = tmp[size.width + j] = DIST_MAX;
 
         for( j = 0; j < size.width; j++ )
         {
@@ -108,16 +107,18 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
                 if( t0 > t ) t0 = t;
                 t = tmp[j-1] + HV_DIST;
                 if( t0 > t ) t0 = t;
-                tmp[j] = t0;
+                tmp[j] = (t0 > DIST_MAX) ? DIST_MAX : t0;
             }
         }
+        tmp += step;
+        s += srcstep;
     }
 
     // backward pass
+    float* d = (float*)dist;
     for( i = size.height - 1; i >= 0; i-- )
     {
-        float* d = (float*)(dist + i*dststep);
-        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
+        tmp -= step;
 
         for( j = size.width - 1; j >= 0; j-- )
         {
@@ -134,9 +135,9 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
                 if( t0 > t ) t0 = t;
                 tmp[j] = t0;
             }
-            t0 = (t0 > DIST_MAX) ? DIST_MAX : t0;
             d[j] = (float)(t0 * scale);
         }
+        d -= dststep;
     }
 }
 
@@ -149,26 +150,26 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
     const unsigned int HV_DIST = CV_FLT_TO_FIX( metrics[0], DIST_SHIFT );
     const unsigned int DIAG_DIST = CV_FLT_TO_FIX( metrics[1], DIST_SHIFT );
     const unsigned int LONG_DIST = CV_FLT_TO_FIX( metrics[2], DIST_SHIFT );
+    const unsigned int DIST_MAX = UINT_MAX - LONG_DIST;
     const float scale = 1.f/(1 << DIST_SHIFT);
 
     const uchar* src = _src.ptr();
     int* temp = _temp.ptr<int>();
-    float* dist = _dist.ptr<float>();
+    float* dist = _dist.ptr<float>(_dist.rows - 1);
     int srcstep = (int)(_src.step/sizeof(src[0]));
     int step = (int)(_temp.step/sizeof(temp[0]));
     int dststep = (int)(_dist.step/sizeof(dist[0]));
     Size size = _src.size();
 
-    initTopBottom( _temp, BORDER );
+    initTopBottom( _temp, BORDER, DIST_MAX );
 
     // forward pass
+    unsigned int* tmp = (unsigned int*)(temp + BORDER*step) + BORDER;
+    const uchar* s = src;
     for( i = 0; i < size.height; i++ )
     {
-        const uchar* s = src + i*srcstep;
-        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
-
         for( j = 0; j < BORDER; j++ )
-            tmp[-j-1] = tmp[size.width + j] = INIT_DIST0;
+            tmp[-j-1] = tmp[size.width + j] = DIST_MAX;
 
         for( j = 0; j < size.width; j++ )
         {
@@ -191,16 +192,18 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
                 if( t0 > t ) t0 = t;
                 t = tmp[j-1] + HV_DIST;
                 if( t0 > t ) t0 = t;
-                tmp[j] = t0;
+                tmp[j] = (t0 > DIST_MAX) ? DIST_MAX : t0;
             }
         }
+        tmp += step;
+        s += srcstep;
     }
 
     // backward pass
+    float* d = (float*)dist;
     for( i = size.height - 1; i >= 0; i-- )
     {
-        float* d = (float*)(dist + i*dststep);
-        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
+        tmp -= step;
 
         for( j = size.width - 1; j >= 0; j-- )
         {
@@ -225,9 +228,9 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
                 if( t0 > t ) t0 = t;
                 tmp[j] = t0;
             }
-            t0 = (t0 > DIST_MAX) ? DIST_MAX : t0;
             d[j] = (float)(t0 * scale);
         }
+        d -= dststep;
     }
 }
 
@@ -241,11 +244,12 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
     const unsigned int HV_DIST = CV_FLT_TO_FIX( metrics[0], DIST_SHIFT );
     const unsigned int DIAG_DIST = CV_FLT_TO_FIX( metrics[1], DIST_SHIFT );
     const unsigned int LONG_DIST = CV_FLT_TO_FIX( metrics[2], DIST_SHIFT );
+    const unsigned int DIST_MAX = UINT_MAX - LONG_DIST;
     const float scale = 1.f/(1 << DIST_SHIFT);
 
     const uchar* src = _src.ptr();
     int* temp = _temp.ptr<int>();
-    float* dist = _dist.ptr<float>();
+    float* dist = _dist.ptr<float>(_dist.rows - 1);
     int* labels = _labels.ptr<int>();
     int srcstep = (int)(_src.step/sizeof(src[0]));
     int step = (int)(_temp.step/sizeof(temp[0]));
@@ -253,17 +257,16 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
     int lstep = (int)(_labels.step/sizeof(labels[0]));
     Size size = _src.size();
 
-    initTopBottom( _temp, BORDER );
+    initTopBottom( _temp, BORDER, DIST_MAX );
 
     // forward pass
+    const uchar* s = src;
+    unsigned int* tmp = (unsigned int*)(temp + BORDER*step) + BORDER;
+    int* lls = (int*)labels;
     for( i = 0; i < size.height; i++ )
     {
-        const uchar* s = src + i*srcstep;
-        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
-        int* lls = (int*)(labels + i*lstep);
-
         for( j = 0; j < BORDER; j++ )
-            tmp[-j-1] = tmp[size.width + j] = INIT_DIST0;
+            tmp[-j-1] = tmp[size.width + j] = DIST_MAX;
 
         for( j = 0; j < size.width; j++ )
         {
@@ -274,7 +277,7 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
             }
             else
             {
-                unsigned int t0 = INIT_DIST0, t;
+                unsigned int t0 = DIST_MAX, t;
                 int l0 = 0;
 
                 t = tmp[j-step*2-1] + LONG_DIST;
@@ -330,14 +333,17 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
                 lls[j] = l0;
             }
         }
+        s += srcstep;
+        tmp += step;
+        lls += lstep;
     }
 
     // backward pass
+    float* d = (float*)dist;
     for( i = size.height - 1; i >= 0; i-- )
     {
-        float* d = (float*)(dist + i*dststep);
-        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
-        int* lls = (int*)(labels + i*lstep);
+        tmp -= step;
+        lls -= lstep;
 
         for( j = size.width - 1; j >= 0; j-- )
         {
@@ -396,9 +402,9 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
                 tmp[j] = t0;
                 lls[j] = l0;
             }
-            t0 = (t0 > DIST_MAX) ? DIST_MAX : t0;
             d[j] = (float)(t0 * scale);
         }
+        d -= dststep;
     }
 }
 
@@ -442,13 +448,13 @@ static void getDistanceTransformMask( int maskType, float *metrics )
         metrics[2] = 2.1969f;
         break;
     default:
-        CV_Error(CV_StsBadArg, "Unknown metric type");
+        CV_Error(cv::Error::StsBadArg, "Unknown metric type");
     }
 }
 
 struct DTColumnInvoker : ParallelLoopBody
 {
-    DTColumnInvoker( const Mat* _src, Mat* _dst, const int* _sat_tab, const int* _sqr_tab)
+    DTColumnInvoker( const Mat* _src, Mat* _dst, const int* _sat_tab, const unsigned int* _sqr_tab)
     {
         src = _src;
         dst = _dst;
@@ -489,12 +495,14 @@ struct DTColumnInvoker : ParallelLoopBody
     const Mat* src;
     Mat* dst;
     const int* sat_tab;
-    const int* sqr_tab;
+    const unsigned int* sqr_tab;
 };
 
+static const int PRECISE_DIST_MAX = 1 << 16;
+
 struct DTRowInvoker : ParallelLoopBody
 {
-    DTRowInvoker( Mat* _dst, const int* _sqr_tab, const float* _inv_tab )
+    DTRowInvoker( Mat* _dst, const unsigned int* _sqr_tab, const float* _inv_tab )
     {
         dst = _dst;
         sqr_tab = _sqr_tab;
@@ -521,7 +529,7 @@ struct DTRowInvoker : ParallelLoopBody
             z[1] = inf;
             f[0] = d[0];
 
-            for( q = 1, k = 0; q < n; q++ )
+            for( q = 1, k = 0; q < std::min(PRECISE_DIST_MAX, n); q++ )
             {
                 float fq = d[q];
                 f[q] = fq;
@@ -540,6 +548,25 @@ struct DTRowInvoker : ParallelLoopBody
                     }
                 }
             }
+            for(; q < n; q++ )
+            {
+                float fq = d[q];
+                f[q] = fq;
+
+                for(;;k--)
+                {
+                    p = v[k];
+                    float s = (fq - d[p] + static_cast<float>(q + p) * (q - p))*inv_tab[q - p];
+                    if( s > z[k] )
+                    {
+                        k++;
+                        v[k] = q;
+                        z[k] = s;
+                        z[k+1] = inf;
+                        break;
+                    }
+                }
+            }
 
             for( q = 0, k = 0; q < n; q++ )
             {
@@ -552,14 +579,14 @@ struct DTRowInvoker : ParallelLoopBody
     }
 
     Mat* dst;
-    const int* sqr_tab;
+    const unsigned int* sqr_tab;
     const float* inv_tab;
 };
 
 static void
 trueDistTrans( const Mat& src, Mat& dst )
 {
-    const int inf = INT_MAX;
+    const unsigned int inf = UINT_MAX;
 
     CV_Assert( src.size() == dst.size() );
 
@@ -568,12 +595,12 @@ trueDistTrans( const Mat& src, Mat& dst )
 
     cv::AutoBuffer<uchar> _buf(std::max(m*2*sizeof(int) + (m*3+1)*sizeof(int), n*2*sizeof(float)));
     // stage 1: compute 1d distance transform of each column
-    int* sqr_tab = (int*)_buf.data();
+    unsigned int* sqr_tab = (unsigned int*)_buf.data();
     int* sat_tab = cv::alignPtr((int*)(sqr_tab + m*2), sizeof(int));
     int shift = m*2;
 
     for( i = 0; i < m; i++ )
-        sqr_tab[i] = i*i;
+        sqr_tab[i] = i >= PRECISE_DIST_MAX ? inf : static_cast<unsigned int>(i) * i;
     for( i = m; i < m*2; i++ )
         sqr_tab[i] = inf;
     for( i = 0; i < shift; i++ )
@@ -591,7 +618,7 @@ trueDistTrans( const Mat& src, Mat& dst )
     for( i = 1; i < n; i++ )
     {
         inv_tab[i] = (float)(0.5/i);
-        sqr_tab[i] = i*i;
+        sqr_tab[i] = i >= PRECISE_DIST_MAX ? inf : static_cast<unsigned int>(i) * i;
     }
 
     cv::parallel_for_(cv::Range(0, m), cv::DTRowInvoker(&dst, sqr_tab, inv_tab));
@@ -733,18 +760,18 @@ void cv::distanceTransform( InputArray _src, OutputArray _dst, OutputArray _labe
 
         _labels.create(src.size(), CV_32S);
         labels = _labels.getMat();
-        maskSize = CV_DIST_MASK_5;
+        maskSize = cv::DIST_MASK_5;
     }
 
     float _mask[5] = {0};
 
-    if( maskSize != CV_DIST_MASK_3 && maskSize != CV_DIST_MASK_5 && maskSize != CV_DIST_MASK_PRECISE )
-        CV_Error( CV_StsBadSize, "Mask size should be 3 or 5 or 0 (precise)" );
+    if( maskSize != cv::DIST_MASK_3 && maskSize != cv::DIST_MASK_5 && maskSize != cv::DIST_MASK_PRECISE )
+        CV_Error( cv::Error::StsBadSize, "Mask size should be 3 or 5 or 0 (precise)" );
 
-    if ((distType == CV_DIST_C || distType == CV_DIST_L1) && !need_labels)
-        maskSize = CV_DIST_MASK_3;
+    if ((distType == cv::DIST_C || distType == cv::DIST_L1) && !need_labels)
+        maskSize = cv::DIST_MASK_3;
 
-    if( maskSize == CV_DIST_MASK_PRECISE )
+    if( maskSize == cv::DIST_MASK_PRECISE )
     {
 
 #ifdef HAVE_IPP
@@ -782,22 +809,23 @@ void cv::distanceTransform( InputArray _src, OutputArray _dst, OutputArray _labe
         return;
     }
 
-    CV_Assert( distType == CV_DIST_C || distType == CV_DIST_L1 || distType == CV_DIST_L2 );
+    CV_Assert( distType == cv::DIST_C || distType == cv::DIST_L1 || distType == cv::DIST_L2 );
 
-    getDistanceTransformMask( (distType == CV_DIST_C ? 0 :
-        distType == CV_DIST_L1 ? 1 : 2) + maskSize*10, _mask );
+    getDistanceTransformMask( (distType == cv::DIST_C ? 0 :
+        distType == cv::DIST_L1 ? 1 : 2) + maskSize*10, _mask );
 
     Size size = src.size();
 
-    int border = maskSize == CV_DIST_MASK_3 ? 1 : 2;
-    Mat temp( size.height + border*2, size.width + border*2, CV_32SC1 );
+    int border = maskSize == cv::DIST_MASK_3 ? 1 : 2;
+    Mat temp;
 
     if( !need_labels )
     {
-        if( maskSize == CV_DIST_MASK_3 )
+        if( maskSize == cv::DIST_MASK_3 )
         {
-#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700) && 0  // disabled: https://github.com/opencv/opencv/issues/15904
-            CV_IPP_CHECK()
+#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700)
+            bool has_int_overflow = (int64)src.cols * src.rows >= INT_MAX;
+            if (!has_int_overflow && CV_IPP_CHECK_COND)
             {
                 IppiSize roi = { src.cols, src.rows };
                 if (CV_INSTRUMENT_FUN_IPP(ippiDistanceTransform_3x3_8u32f_C1R, src.ptr<uchar>(), (int)src.step, dst.ptr<float>(), (int)dst.step, roi, _mask) >= 0)
@@ -809,12 +837,14 @@ void cv::distanceTransform( InputArray _src, OutputArray _dst, OutputArray _labe
             }
 #endif
 
+            temp.create(size.height + border*2, size.width + border*2, CV_32SC1);
             distanceTransform_3x3(src, temp, dst, _mask);
         }
         else
         {
 #if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700)
-            CV_IPP_CHECK()
+            bool has_int_overflow = (int64)src.cols * src.rows >= INT_MAX;
+            if (!has_int_overflow && CV_IPP_CHECK_COND)
             {
                 IppiSize roi = { src.cols, src.rows };
                 if (CV_INSTRUMENT_FUN_IPP(ippiDistanceTransform_5x5_8u32f_C1R, src.ptr<uchar>(), (int)src.step, dst.ptr<float>(), (int)dst.step, roi, _mask) >= 0)
@@ -826,6 +856,7 @@ void cv::distanceTransform( InputArray _src, OutputArray _dst, OutputArray _labe
             }
 #endif
 
+            temp.create(size.height + border*2, size.width + border*2, CV_32SC1);
             distanceTransform_5x5(src, temp, dst, _mask);
         }
     }
@@ -833,7 +864,7 @@ void cv::distanceTransform( InputArray _src, OutputArray _dst, OutputArray _labe
     {
         labels.setTo(Scalar::all(0));
 
-        if( labelType == CV_DIST_LABEL_CCOMP )
+        if( labelType == cv::DIST_LABEL_CCOMP )
         {
             Mat zpix = src == 0;
             connectedComponents(zpix, labels, 8, CV_32S, CCL_WU);
@@ -852,7 +883,8 @@ void cv::distanceTransform( InputArray _src, OutputArray _dst, OutputArray _labe
             }
         }
 
-       distanceTransformEx_5x5( src, temp, dst, labels, _mask );
+        temp.create(size.height + border*2, size.width + border*2, CV_32SC1);
+        distanceTransformEx_5x5( src, temp, dst, labels, _mask );
     }
 }
 
@@ -861,7 +893,7 @@ void cv::distanceTransform( InputArray _src, OutputArray _dst,
 {
     CV_INSTRUMENT_REGION();
 
-    if (distanceType == CV_DIST_L1 && dstType==CV_8U)
+    if (distanceType == cv::DIST_L1 && dstType==CV_8U)
         distanceTransform_L1_8U(_src, _dst);
     else
         distanceTransform(_src, _dst, noArray(), distanceType, maskSize, DIST_LABEL_PIXEL);
diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp
old mode 100755
new mode 100644
index a0cd8ae7727f..4d0f26f7a09b
--- a/modules/imgproc/src/drawing.cpp
+++ b/modules/imgproc/src/drawing.cpp
@@ -39,6 +39,7 @@
 //
 //M*/
 #include "precomp.hpp"
+using namespace cv;
 
 namespace cv
 {
@@ -939,6 +940,7 @@ void ellipse2Poly( Point center, Size axes, int angle,
     }
 
     // If there are no points, it's a zero-size polygon
+    CV_Assert( !pts.empty() );
     if (pts.size() == 1) {
         pts.assign(2, center);
     }
@@ -1001,6 +1003,7 @@ void ellipse2Poly( Point2d center, Size2d axes, int angle,
     }
 
     // If there are no points, it's a zero-size polygon
+    CV_Assert( !pts.empty() );
     if( pts.size() == 1) {
         pts.assign(2,center);
     }
@@ -1021,7 +1024,6 @@ EllipseEx( Mat& img, Point2l center, Size2l axes,
 
     std::vector<Point2l> v;
     Point2l prevPt(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF);
-    v.resize(0);
     for (unsigned int i = 0; i < _v.size(); ++i)
     {
         Point2l pt;
@@ -1036,7 +1038,7 @@ EllipseEx( Mat& img, Point2l center, Size2l axes,
     }
 
     // If there are no points, it's a zero-size polygon
-    if (v.size() == 1) {
+    if (v.size() <= 1) {
         v.assign(2, center);
     }
 
@@ -1358,7 +1360,7 @@ FillEdgeCollection( Mat& img, std::vector<PolyEdge>& edges, const void* color, i
     int pix_size = (int)img.elemSize();
     int delta;
 
-    if (line_type < CV_AA)
+    if (line_type < cv::LINE_AA)
         delta = 0;
     else
         delta = XY_ONE - 1;
@@ -1556,7 +1558,7 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
                 ICV_HLINE( tptr1, x21, x22, color, pix_size );
             }
         }
-        else if( x11 < size.width && x12 >= 0 && y21 < size.height && y22 >= 0 )
+        else if( x11 < size.width && x12 >= 0 && y21 < size.height && y22 >= 0)
         {
             if( fill )
             {
@@ -1564,7 +1566,7 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
                 x12 = MIN( x12, size.width - 1 );
             }
 
-            if( (unsigned)y11 < (unsigned)size.height )
+            if( y11 >= 0 && y11 < size.height )
             {
                 uchar *tptr = ptr + y11 * step;
 
@@ -1579,7 +1581,7 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
                     ICV_HLINE( tptr, x11, x12, color, pix_size );
             }
 
-            if( (unsigned)y12 < (unsigned)size.height )
+            if( y12 >= 0 && y12 < size.height )
             {
                 uchar *tptr = ptr + y12 * step;
 
@@ -1602,7 +1604,7 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
                     x22 = MIN( x22, size.width - 1 );
                 }
 
-                if( (unsigned)y21 < (unsigned)size.height )
+                if( y21 >= 0 && y21 < size.height )
                 {
                     uchar *tptr = ptr + y21 * step;
 
@@ -1617,7 +1619,7 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
                         ICV_HLINE( tptr, x21, x22, color, pix_size );
                 }
 
-                if( (unsigned)y22 < (unsigned)size.height )
+                if( y22 >= 0 && y22 < size.height )
                 {
                     uchar *tptr = ptr + y22 * step;
 
@@ -2238,7 +2240,7 @@ static const int* getFontData(int fontFace)
         ascii = HersheyScriptComplex;
         break;
     default:
-        CV_Error( CV_StsOutOfRange, "Unknown font type" );
+        CV_Error( cv::Error::StsOutOfRange, "Unknown font type" );
     }
     return ascii;
 }
@@ -2309,7 +2311,7 @@ void putText( InputOutputArray _img, const String& text, Point org,
     int base_line = -(ascii[0] & 15);
     int hscale = cvRound(fontScale*XY_ONE), vscale = hscale;
 
-    if( line_type == CV_AA && img.depth() != CV_8U )
+    if( line_type == cv::LINE_AA && img.depth() != CV_8U )
         line_type = 8;
 
     if( bottomLeftOrigin )
@@ -2467,35 +2469,6 @@ void cv::polylines(InputOutputArray img, InputArrayOfArrays pts,
     polylines(img, (const Point**)ptsptr, npts, (int)ncontours, isClosed, color, thickness, lineType, shift);
 }
 
-namespace
-{
-using namespace cv;
-
-static void addChildContour(InputArrayOfArrays contours,
-                            size_t ncontours,
-                            const Vec4i* hierarchy,
-                            int i, std::vector<CvSeq>& seq,
-                            std::vector<CvSeqBlock>& block)
-{
-    for( ; i >= 0; i = hierarchy[i][0] )
-    {
-        Mat ci = contours.getMat(i);
-        cvMakeSeqHeaderForArray(CV_SEQ_POLYGON, sizeof(CvSeq), sizeof(Point),
-                                !ci.empty() ? (void*)ci.ptr() : 0, (int)ci.total(),
-                                &seq[i], &block[i] );
-
-        int h_next = hierarchy[i][0], h_prev = hierarchy[i][1],
-            v_next = hierarchy[i][2], v_prev = hierarchy[i][3];
-        seq[i].h_next = (0 <= h_next && h_next < (int)ncontours) ? &seq[h_next] : 0;
-        seq[i].h_prev = (0 <= h_prev && h_prev < (int)ncontours) ? &seq[h_prev] : 0;
-        seq[i].v_next = (0 <= v_next && v_next < (int)ncontours) ? &seq[v_next] : 0;
-        seq[i].v_prev = (0 <= v_prev && v_prev < (int)ncontours) ? &seq[v_prev] : 0;
-
-        if( v_next >= 0 )
-            addChildContour(contours, ncontours, hierarchy, v_next, seq, block);
-    }
-}
-}
 
 void cv::drawContours( InputOutputArray _image, InputArrayOfArrays _contours,
                    int contourIdx, const Scalar& color, int thickness,
@@ -2503,89 +2476,102 @@ void cv::drawContours( InputOutputArray _image, InputArrayOfArrays _contours,
                    int maxLevel, Point offset )
 {
     CV_INSTRUMENT_REGION();
-
-    Mat image = _image.getMat(), hierarchy = _hierarchy.getMat();
-    CvMat _cimage = cvMat(image);
-
-    size_t ncontours = _contours.total();
-    size_t i = 0, first = 0, last = ncontours;
-    std::vector<CvSeq> seq;
-    std::vector<CvSeqBlock> block;
-
-    if( !last )
+    CV_Assert( thickness <= MAX_THICKNESS );
+    const size_t ncontours = _contours.total();
+    if (!ncontours)
         return;
+    CV_Assert(ncontours <= (size_t)std::numeric_limits<int>::max());
+    if (lineType == cv::LINE_AA && _image.depth() != CV_8U)
+        lineType = 8;
+    Mat image = _image.getMat(), hierarchy = _hierarchy.getMat();
 
-    seq.resize(last);
-    block.resize(last);
-
-    for( i = first; i < last; i++ )
-        seq[i].first = 0;
-
-    if( contourIdx >= 0 )
-    {
-        CV_Assert( 0 <= contourIdx && contourIdx < (int)last );
-        first = contourIdx;
-        last = contourIdx + 1;
-    }
-
-    for( i = first; i < last; i++ )
+    if (thickness >= 0) // contour lines
     {
-        Mat ci = _contours.getMat((int)i);
-        if( ci.empty() )
-            continue;
-        int npoints = ci.checkVector(2, CV_32S);
-        CV_Assert( npoints > 0 );
-        cvMakeSeqHeaderForArray( CV_SEQ_POLYGON, sizeof(CvSeq), sizeof(Point),
-                                 ci.ptr(), npoints, &seq[i], &block[i] );
-    }
-
-    if( hierarchy.empty() || maxLevel == 0 )
-        for( i = first; i < last; i++ )
+        double color_buf[4] {};
+        scalarToRawData(color, color_buf, _image.type(), 0 );
+        int i = 0, end = (int)ncontours;
+        if (contourIdx >= 0)
         {
-            seq[i].h_next = i < last-1 ? &seq[i+1] : 0;
-            seq[i].h_prev = i > first ? &seq[i-1] : 0;
+            i = contourIdx;
+            end = i + 1;
         }
-    else
-    {
-        size_t count = last - first;
-        CV_Assert(hierarchy.total() == ncontours && hierarchy.type() == CV_32SC4 );
-        const Vec4i* h = hierarchy.ptr<Vec4i>();
-
-        if( count == ncontours )
+        for (; i < end; ++i)
         {
-            for( i = first; i < last; i++ )
+            Mat cnt = _contours.getMat(i);
+            if (cnt.empty())
+                continue;
+            const int npoints = cnt.checkVector(2, CV_32S);
+            CV_Assert(npoints > 0);
+            for (int j = 0; j < npoints; ++j)
             {
-                int h_next = h[i][0], h_prev = h[i][1],
-                    v_next = h[i][2], v_prev = h[i][3];
-                seq[i].h_next = (size_t)h_next < count ? &seq[h_next] : 0;
-                seq[i].h_prev = (size_t)h_prev < count ? &seq[h_prev] : 0;
-                seq[i].v_next = (size_t)v_next < count ? &seq[v_next] : 0;
-                seq[i].v_prev = (size_t)v_prev < count ? &seq[v_prev] : 0;
+                const bool isLastIter = j == npoints - 1;
+                const Point pt1 = cnt.at<Point>(j);
+                const Point pt2 = cnt.at<Point>(isLastIter ? 0 : j + 1);
+                cv::ThickLine(image, pt1 + offset, pt2 + offset, color_buf, thickness, lineType, 2, 0);
             }
         }
+    }
+    else // filled polygons
+    {
+        int i = 0, end = (int)ncontours;
+        if (contourIdx >= 0)
+        {
+            i = contourIdx;
+            end = i + 1;
+        }
+        std::vector<int> indexesToFill;
+        if (hierarchy.empty() || maxLevel == 0)
+        {
+            for (; i != end; ++i)
+                indexesToFill.push_back(i);
+        }
         else
         {
-            int child = h[first][2];
-            if( child >= 0 )
+            std::stack<int> indexes;
+            for (; i != end; ++i)
             {
-                addChildContour(_contours, ncontours, h, child, seq, block);
-                seq[first].v_next = &seq[child];
+                // either all from the top level or a single contour
+                if (hierarchy.at<Vec4i>(i)[3] < 0 || contourIdx >= 0)
+                    indexes.push(i);
+            }
+            while (!indexes.empty())
+            {
+                // get current element
+                const int cur = indexes.top();
+                indexes.pop();
+
+                //  check current element depth
+                int curLevel = -1;
+                int par = cur;
+                while (par >= 0)
+                {
+                    par = hierarchy.at<Vec4i>(par)[3]; // parent
+                    ++curLevel;
+                }
+                if (curLevel <= maxLevel)
+                {
+                    indexesToFill.push_back(cur);
+                }
+
+                int next = hierarchy.at<Vec4i>(cur)[2]; // first child
+                while (next > 0)
+                {
+                    indexes.push(next);
+                    next = hierarchy.at<Vec4i>(next)[0]; // next sibling
+                }
             }
         }
+        std::vector<Mat> contoursToFill;
+        for (const int & idx : indexesToFill)
+            contoursToFill.push_back(_contours.getMat(idx));
+        fillPoly(image, contoursToFill, color, lineType, 0, offset);
     }
-
-    cvDrawContours( &_cimage, &seq[first], cvScalar(color), cvScalar(color), contourIdx >= 0 ?
-                   -maxLevel : maxLevel, thickness, lineType, cvPoint(offset) );
 }
 
 
-
 static const int CodeDeltas[8][2] =
 { {1, 0}, {1, -1}, {0, -1}, {-1, -1}, {-1, 0}, {-1, 1}, {0, 1}, {1, 1} };
 
-#define CV_ADJUST_EDGE_COUNT( count, seq )  \
-    ((count) -= ((count) == (seq)->total && !CV_IS_SEQ_CLOSED(seq)))
-
 CV_IMPL void
 cvDrawContours( void* _img, CvSeq* contour,
                 CvScalar _externalColor, CvScalar _holeColor,
diff --git a/modules/imgproc/src/emd.cpp b/modules/imgproc/src/emd.cpp
index 3e065b0404cd..c4cdd51a41a7 100644
--- a/modules/imgproc/src/emd.cpp
+++ b/modules/imgproc/src/emd.cpp
@@ -57,6 +57,7 @@
     ==========================================================================
 */
 #include "precomp.hpp"
+#include "opencv2/imgproc/detail/legacy.hpp"
 
 #define MAX_ITERATIONS 500
 #define CV_EMD_INF   ((float)1e20)
@@ -174,28 +175,28 @@ CV_IMPL float cvCalcEMD2( const CvArr* signature_arr1,
     signature2 = cvGetMat( signature2, &sign_stub2 );
 
     if( signature1->cols != signature2->cols )
-        CV_Error( CV_StsUnmatchedSizes, "The arrays must have equal number of columns (which is number of dimensions but 1)" );
+        CV_Error( cv::Error::StsUnmatchedSizes, "The arrays must have equal number of columns (which is number of dimensions but 1)" );
 
     dims = signature1->cols - 1;
     size1 = signature1->rows;
     size2 = signature2->rows;
 
     if( !CV_ARE_TYPES_EQ( signature1, signature2 ))
-        CV_Error( CV_StsUnmatchedFormats, "The array must have equal types" );
+        CV_Error( cv::Error::StsUnmatchedFormats, "The array must have equal types" );
 
     if( CV_MAT_TYPE( signature1->type ) != CV_32FC1 )
-        CV_Error( CV_StsUnsupportedFormat, "The signatures must be 32fC1" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "The signatures must be 32fC1" );
 
     if( flow )
     {
         flow = cvGetMat( flow, &flow_stub );
 
         if( flow->rows != size1 || flow->cols != size2 )
-            CV_Error( CV_StsUnmatchedSizes,
+            CV_Error( cv::Error::StsUnmatchedSizes,
             "The flow matrix size does not match to the signatures' sizes" );
 
         if( CV_MAT_TYPE( flow->type ) != CV_32FC1 )
-            CV_Error( CV_StsUnsupportedFormat, "The flow matrix must be 32fC1" );
+            CV_Error( cv::Error::StsUnsupportedFormat, "The flow matrix must be 32fC1" );
     }
 
     cost->data.fl = 0;
@@ -206,43 +207,43 @@ CV_IMPL float cvCalcEMD2( const CvArr* signature_arr1,
         if( cost_matrix )
         {
             if( dist_func )
-                CV_Error( CV_StsBadArg,
+                CV_Error( cv::Error::StsBadArg,
                 "Only one of cost matrix or distance function should be non-NULL in case of user-defined distance" );
 
             if( lower_bound )
-                CV_Error( CV_StsBadArg,
+                CV_Error( cv::Error::StsBadArg,
                 "The lower boundary can not be calculated if the cost matrix is used" );
 
             cost = cvGetMat( cost_matrix, &cost_stub );
             if( cost->rows != size1 || cost->cols != size2 )
-                CV_Error( CV_StsUnmatchedSizes,
+                CV_Error( cv::Error::StsUnmatchedSizes,
                 "The cost matrix size does not match to the signatures' sizes" );
 
             if( CV_MAT_TYPE( cost->type ) != CV_32FC1 )
-                CV_Error( CV_StsUnsupportedFormat, "The cost matrix must be 32fC1" );
+                CV_Error( cv::Error::StsUnsupportedFormat, "The cost matrix must be 32fC1" );
         }
         else if( !dist_func )
-            CV_Error( CV_StsNullPtr, "In case of user-defined distance Distance function is undefined" );
+            CV_Error( cv::Error::StsNullPtr, "In case of user-defined distance Distance function is undefined" );
     }
     else
     {
         if( dims == 0 )
-            CV_Error( CV_StsBadSize,
+            CV_Error( cv::Error::StsBadSize,
             "Number of dimensions can be 0 only if a user-defined metric is used" );
         user_param = (void *) (size_t)dims;
         switch (dist_type)
         {
-        case CV_DIST_L1:
+        case cv::DIST_L1:
             dist_func = icvDistL1;
             break;
-        case CV_DIST_L2:
+        case cv::DIST_L2:
             dist_func = icvDistL2;
             break;
-        case CV_DIST_C:
+        case cv::DIST_C:
             dist_func = icvDistC;
             break;
         default:
-            CV_Error( CV_StsBadFlag, "Bad or unsupported metric type" );
+            CV_Error( cv::Error::StsBadFlag, "Bad or unsupported metric type" );
         }
     }
 
@@ -279,7 +280,7 @@ CV_IMPL float cvCalcEMD2( const CvArr* signature_arr1,
                                       state.ssize, state.dsize, state.enter_x );
 
             if( min_delta == CV_EMD_INF )
-                CV_Error( CV_StsNoConv, "" );
+                CV_Error( cv::Error::StsNoConv, "" );
 
             /* if no negative deltamin, we found the optimal solution */
             if( min_delta >= -eps )
@@ -287,7 +288,7 @@ CV_IMPL float cvCalcEMD2( const CvArr* signature_arr1,
 
             /* improve solution */
             if(!icvNewSolution( &state ))
-                CV_Error( CV_StsNoConv, "" );
+                CV_Error( cv::Error::StsNoConv, "" );
         }
     }
 
@@ -387,7 +388,7 @@ static int icvInitEMD( const float* signature1, int size1,
 
         }
         else if( weight < 0 )
-            CV_Error(CV_StsBadArg, "signature1 must not contain negative weights");
+            CV_Error(cv::Error::StsBadArg, "signature1 must not contain negative weights");
     }
 
     for( i = 0; i < size2; i++ )
@@ -401,13 +402,13 @@ static int icvInitEMD( const float* signature1, int size1,
             state->idx2[dsize++] = i;
         }
         else if( weight < 0 )
-            CV_Error(CV_StsBadArg, "signature2 must not contain negative weights");
+            CV_Error(cv::Error::StsBadArg, "signature2 must not contain negative weights");
     }
 
     if( ssize == 0 )
-        CV_Error(CV_StsBadArg, "signature1 must contain at least one non-zero value");
+        CV_Error(cv::Error::StsBadArg, "signature1 must contain at least one non-zero value");
     if( dsize == 0 )
-        CV_Error(CV_StsBadArg, "signature2 must contain at least one non-zero value");
+        CV_Error(cv::Error::StsBadArg, "signature2 must contain at least one non-zero value");
 
     /* if supply different than the demand, add a zero-cost dummy cluster */
     diff = s_sum - d_sum;
@@ -1147,7 +1148,7 @@ icvDistC( const float *x, const float *y, void *user_param )
 }
 
 
-float cv::EMD( InputArray _signature1, InputArray _signature2,
+float cv::EMD_legacy( InputArray _signature1, InputArray _signature2,
                int distType, InputArray _cost,
                float* lowerBound, OutputArray _flow )
 {
@@ -1171,7 +1172,7 @@ float cv::EMD( InputArray _signature1, InputArray _signature2,
                        _flow.needed() ? &_cflow : 0, lowerBound, 0 );
 }
 
-float cv::wrapperEMD(InputArray _signature1, InputArray _signature2,
+float cv::wrapperEMD_legacy(InputArray _signature1, InputArray _signature2,
                int distType, InputArray _cost,
                Ptr<float> lowerBound, OutputArray _flow)
 {
diff --git a/modules/imgproc/src/emd_new.cpp b/modules/imgproc/src/emd_new.cpp
new file mode 100644
index 000000000000..59a6b2434276
--- /dev/null
+++ b/modules/imgproc/src/emd_new.cpp
@@ -0,0 +1,1011 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+/*
+    Partially based on Yossi Rubner code:
+    =========================================================================
+    emd.c
+
+    Last update: 3/14/98
+
+    An implementation of the Earth Movers Distance.
+    Based of the solution for the Transportation problem as described in
+    "Introduction to Mathematical Programming" by F. S. Hillier and
+    G. J. Lieberman, McGraw-Hill, 1990.
+
+    Copyright (C) 1998 Yossi Rubner
+    Computer Science Department, Stanford University
+    E-Mail: rubner@cs.stanford.edu   URL: http://vision.stanford.edu/~rubner
+    ==========================================================================
+*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+
+namespace {
+
+
+//==============================================================================
+// Distance functions
+
+typedef float (*DistFunc)(const float* a, const float* b, int dims);
+
+static float distL1(const float* x, const float* y, int dims)
+{
+    double s = 0;
+    for (int i = 0; i < dims; i++)
+    {
+        const double t = x[i] - y[i];
+        s += fabs(t);
+    }
+    return (float)s;
+}
+
+static float distL2(const float* x, const float* y, int dims)
+{
+    double s = 0;
+    for (int i = 0; i < dims; i++)
+    {
+        const double t = x[i] - y[i];
+        s += t * t;
+    }
+    return sqrt((float)s);
+}
+
+static float distC(const float* x, const float* y, int dims)
+{
+    double s = 0;
+    for (int i = 0; i < dims; i++)
+    {
+        const double t = fabs(x[i] - y[i]);
+        if (s < t)
+            s = t;
+    }
+    return (float)s;
+}
+
+
+//==============================================================================
+// Data structures
+
+/* Node1D is used for lists, representing 1D sparse array */
+struct Node1D
+{
+    float val;
+    Node1D* next;
+};
+
+/* Node2D is used for lists, representing 2D sparse matrix */
+struct Node2D
+{
+    float val;
+    int i, j;
+    Node2D* next[2]; /* next row & next column */
+};
+
+
+//==============================================================================
+// Main class
+
+struct EMDSolver
+{
+    static constexpr int MAX_ITERATIONS = 500;
+    static constexpr float CV_EMD_INF = 1e20f;
+    static constexpr float CV_EMD_EPS = 1e-5f;
+
+    int ssize, dsize;
+
+    float* cost_buf;
+    AutoBuffer<Node2D, 0> data_x;
+    Node2D* end_x;
+    Node2D* enter_x;
+    char* is_x;
+
+    Node2D** rows_x;
+    Node2D** cols_x;
+
+    Node1D* u;
+    Node1D* v;
+
+    int* idx1;
+    int* idx2;
+
+    /* find_loop buffers */
+    Node2D** loop;
+    char* is_used;
+
+    /* russel buffers */
+    float* s;
+    float* d;
+    float* delta;
+
+    float weight, max_cost;
+
+    utils::BufferArea area, area2;
+
+public:
+    float getWeight() const
+    {
+        return weight;
+    }
+
+    float& getCost(int i, int j)
+    {
+        return *(this->cost_buf + i * dsize + j);
+    }
+    const float& getCost(int i, int j) const
+    {
+        return *(this->cost_buf + i * dsize + j);
+    }
+    char& getIsX(int i, int j)
+    {
+        return *(this->is_x + i * dsize + j);
+    }
+    const char& getIsX(int i, int j) const
+    {
+        return *(this->is_x + i * dsize + j);
+    }
+
+    EMDSolver() :
+        ssize(0), dsize(0), cost_buf(0), end_x(0), enter_x(0), is_x(0), rows_x(0),
+        cols_x(0), u(0), v(0), idx1(0), idx2(0), loop(0), is_used(0), s(0), d(0), delta(0),
+        weight(0), max_cost(0)
+    {
+    }
+
+public:
+    bool init(const Mat& sign1,
+              const Mat& sign2,
+              int dims,
+              DistFunc dfunc,
+              const Mat& cost,
+              float* lowerBound);
+    bool checkLowerBound(const Mat& sign1,
+                         const Mat& sign2,
+                         int dims,
+                         DistFunc dfunc,
+                         float& lowerBound);
+    bool calcSums(const Mat& sign1, const Mat& sign2);
+    float calcCost(const Mat& sign1, const Mat& sign2, int dims, DistFunc dfunc, const Mat& cost);
+    void solve();
+    double calcFlow(Mat* flow_) const;
+    int findBasicVars() const;
+    float checkOptimal() const;
+    void callRussel();
+    bool checkNewSolution();
+    int findLoop() const;
+    void addBasicVar(int min_i,
+                     int min_j,
+                     Node1D* prev_u_min_i,
+                     Node1D* prev_v_min_j,
+                     Node1D* u_head);
+};
+
+
+//==============================================================================
+// Implementations
+
+bool EMDSolver::init(const Mat& sign1,
+                     const Mat& sign2,
+                     int dims,
+                     DistFunc dfunc,
+                     const Mat& cost,
+                     float* lowerBound)
+{
+    const int size1 = sign1.size().height;
+    const int size2 = sign2.size().height;
+
+    area.allocate(this->idx1, size1 + 1);
+    area.allocate(this->idx2, size2 + 1);
+    area.allocate(this->s, size1 + 1);
+    area.allocate(this->d, size2 + 1);
+    area.commit();
+    area.zeroFill();
+
+    const bool areSumsEqual = calcSums(sign1, sign2);
+    if (areSumsEqual && lowerBound)
+    {
+        if (checkLowerBound(sign1, sign2, dims, dfunc, *lowerBound))
+            return false;
+    }
+
+    area2.allocate(this->u, ssize, 64);
+    area2.allocate(this->v, dsize, 64);
+    area2.allocate(this->is_used, ssize + dsize);
+    area2.allocate(this->delta, ssize * dsize);
+    area2.allocate(this->cost_buf, ssize * dsize);
+    area2.allocate(this->is_x, ssize * dsize);
+    area2.allocate(this->rows_x, ssize, 64);
+    area2.allocate(this->cols_x, dsize, 64);
+    area2.allocate(this->loop, ssize + dsize + 1, 64);
+    area2.commit();
+    area2.zeroFill();
+
+    this->data_x.allocate(ssize + dsize);
+
+    this->end_x = this->data_x.data();
+    this->max_cost = calcCost(sign1, sign2, dims, dfunc, cost);
+    callRussel();
+    this->enter_x = (this->end_x)++;
+    return true;
+}
+
+
+bool EMDSolver::checkLowerBound(const Mat& sign1,
+                                const Mat& sign2,
+                                int dims,
+                                DistFunc dfunc,
+                                float& lowerBound)
+{
+    AutoBuffer<float> buf;
+    buf.allocate(dims * 2);
+    memset(buf.data(), 0, dims * 2 * sizeof(float));
+
+    float* xs = buf.data();
+    float* xd = buf.data() + dims;
+
+    for (int j = 0; j < sign1.rows; ++j)
+    {
+        const float weight_ = sign1.at<float>(j, 0);
+        for (int i = 0; i < dims; i++)
+            xs[i] += sign1.at<float>(j, i + 1) * weight_;
+    }
+
+    for (int j = 0; j < sign2.rows; ++j)
+    {
+        const float weight_ = sign2.at<float>(j, 0);
+        for (int i = 0; i < dims; i++)
+            xd[i] += sign2.at<float>(j, i + 1) * weight_;
+    }
+
+    const float lb = dfunc(xs, xd, dims) / this->weight;
+    const bool result = (lowerBound <= lb);
+    lowerBound = lb;
+    return result;
+}
+
+
+// return true if total sums of signatures are equal, false otherwise
+bool EMDSolver::calcSums(const Mat& sign1, const Mat& sign2)
+{
+    bool result = true;
+    /* sum up the supply and demand */
+    int ssize_ = 0, dsize_ = 0;
+    float s_sum = 0, d_sum = 0, diff;
+    for (int i = 0; i < sign1.size().height; i++)
+    {
+        const float weight_ = sign1.at<float>(i, 0);
+
+        if (weight_ > 0)
+        {
+            s_sum += weight_;
+            this->s[ssize_] = weight_;
+            this->idx1[ssize_++] = i;
+        }
+        else if (weight_ < 0)
+            CV_Error(cv::Error::StsBadArg, "sign1 must not contain negative weights");
+    }
+
+    for (int i = 0; i < sign2.size().height; i++)
+    {
+        const float weight_ = sign2.at<float>(i, 0);
+
+        if (weight_ > 0)
+        {
+            d_sum += weight_;
+            this->d[dsize_] = weight_;
+            this->idx2[dsize_++] = i;
+        }
+        else if (weight_ < 0)
+            CV_Error(cv::Error::StsBadArg, "sign2 must not contain negative weights");
+    }
+
+    if (ssize_ == 0)
+        CV_Error(cv::Error::StsBadArg, "sign1 must contain at least one non-zero value");
+    if (dsize_ == 0)
+        CV_Error(cv::Error::StsBadArg, "sign2 must contain at least one non-zero value");
+
+    /* if supply different than the demand, add a zero-cost dummy cluster */
+    diff = s_sum - d_sum;
+    if (fabs(diff) >= CV_EMD_EPS * s_sum)
+    {
+        result = false;
+        if (diff < 0)
+        {
+            this->s[ssize_] = -diff;
+            this->idx1[ssize_++] = -1;
+        }
+        else
+        {
+            this->d[dsize_] = diff;
+            this->idx2[dsize_++] = -1;
+        }
+    }
+
+    this->ssize = ssize_;
+    this->dsize = dsize_;
+    this->weight = s_sum > d_sum ? s_sum : d_sum;
+    return result;
+}
+
+
+// returns maximum cost over all possible s->d combinations
+float EMDSolver::calcCost(const Mat& sign1,
+                          const Mat& sign2,
+                          int dims,
+                          DistFunc dfunc,
+                          const Mat& cost)
+{
+    if (!dfunc)
+    {
+        CV_Assert(!cost.empty());
+    }
+    float result = 0;
+
+    /* compute the distance matrix */
+    for (int i = 0; i < ssize; i++)
+    {
+        const int ci = this->idx1[i];
+        if (ci >= 0)
+        {
+            for (int j = 0; j < dsize; j++)
+            {
+                const int cj = this->idx2[j];
+                if (cj < 0)
+                    getCost(i, j) = 0;
+                else
+                {
+                    float val;
+                    if (dfunc)
+                    {
+                        val = dfunc(sign1.ptr<float>(ci, 1), sign2.ptr<float>(cj, 1), dims);
+                    }
+                    else
+                    {
+                        val = cost.at<float>(ci, cj);
+                    }
+                    getCost(i, j) = val;
+                    if (result < val)
+                        result = val;
+                }
+            }
+        }
+        else
+        {
+            for (int j = 0; j < dsize; j++)
+                getCost(i, j) = 0;
+        }
+    }
+    return result;
+}
+
+
+// runs solver iterations
+void EMDSolver::solve()
+{
+    if (ssize > 1 && dsize > 1)
+    {
+        const float eps = CV_EMD_EPS * max_cost;
+        for (int itr = 1; itr < MAX_ITERATIONS; itr++)
+        {
+            /* find basic variables */
+            if (findBasicVars() < 0)
+                break;
+
+            /* check for optimality */
+            const float min_delta = checkOptimal();
+
+            if (min_delta == CV_EMD_INF)
+                CV_Error(cv::Error::StsNoConv, "");
+
+            /* if no negative deltamin, we found the optimal solution */
+            if (min_delta >= -eps)
+                break;
+
+            /* improve solution */
+            if (!checkNewSolution())
+                CV_Error(cv::Error::StsNoConv, "");
+        }
+    }
+}
+
+double EMDSolver::calcFlow(Mat* flow_) const
+{
+    double result = 0.;
+    const Node2D* xp = 0;
+    for (xp = data_x.data(); xp < end_x; xp++)
+    {
+        float val = xp->val;
+        const int i = xp->i;
+        const int j = xp->j;
+
+        if (xp == enter_x)
+            continue;
+
+        const int ci = idx1[i];
+        const int cj = idx2[j];
+
+        if (ci >= 0 && cj >= 0)
+        {
+            result += (double)val * getCost(i, j);
+            if (flow_)
+            {
+                flow_->at<float>(ci, cj) = val;
+            }
+        }
+    }
+    return result;
+}
+
+
+int EMDSolver::findBasicVars() const
+{
+    int i, j;
+    int u_cfound, v_cfound;
+    Node1D u0_head, u1_head, *cur_u, *prev_u;
+    Node1D v0_head, v1_head, *cur_v, *prev_v;
+    bool found;
+
+    CV_Assert(u != 0 && v != 0);
+
+    /* initialize the rows list (u) and the columns list (v) */
+    u0_head.next = u;
+    for (i = 0; i < ssize; i++)
+    {
+        u[i].next = u + i + 1;
+    }
+    u[ssize - 1].next = 0;
+    u1_head.next = 0;
+
+    v0_head.next = ssize > 1 ? v + 1 : 0;
+    for (i = 1; i < dsize; i++)
+    {
+        v[i].next = v + i + 1;
+    }
+    v[dsize - 1].next = 0;
+    v1_head.next = 0;
+
+    /* there are ssize+dsize variables but only ssize+dsize-1 independent equations,
+       so set v[0]=0 */
+    v[0].val = 0;
+    v1_head.next = v;
+    v1_head.next->next = 0;
+
+    /* loop until all variables are found */
+    u_cfound = v_cfound = 0;
+    while (u_cfound < ssize || v_cfound < dsize)
+    {
+        found = false;
+        if (v_cfound < dsize)
+        {
+            /* loop over all marked columns */
+            prev_v = &v1_head;
+            cur_v = v1_head.next;
+            found = found || (cur_v != 0);
+            for (; cur_v != 0; cur_v = cur_v->next)
+            {
+                float cur_v_val = cur_v->val;
+
+                j = (int)(cur_v - v);
+                /* find the variables in column j */
+                prev_u = &u0_head;
+                for (cur_u = u0_head.next; cur_u != 0;)
+                {
+                    i = (int)(cur_u - u);
+                    if (getIsX(i, j))
+                    {
+                        /* compute u[i] */
+                        cur_u->val = getCost(i, j) - cur_v_val;
+                        /* ...and add it to the marked list */
+                        prev_u->next = cur_u->next;
+                        cur_u->next = u1_head.next;
+                        u1_head.next = cur_u;
+                        cur_u = prev_u->next;
+                    }
+                    else
+                    {
+                        prev_u = cur_u;
+                        cur_u = cur_u->next;
+                    }
+                }
+                prev_v->next = cur_v->next;
+                v_cfound++;
+            }
+        }
+
+        if (u_cfound < ssize)
+        {
+            /* loop over all marked rows */
+            prev_u = &u1_head;
+            cur_u = u1_head.next;
+            found = found || (cur_u != 0);
+            for (; cur_u != 0; cur_u = cur_u->next)
+            {
+                float cur_u_val = cur_u->val;
+                i = (int)(cur_u - u);
+                /* find the variables in rows i */
+                prev_v = &v0_head;
+                for (cur_v = v0_head.next; cur_v != 0;)
+                {
+                    j = (int)(cur_v - v);
+                    if (getIsX(i, j))
+                    {
+                        /* compute v[j] */
+                        cur_v->val = getCost(i, j) - cur_u_val;
+                        /* ...and add it to the marked list */
+                        prev_v->next = cur_v->next;
+                        cur_v->next = v1_head.next;
+                        v1_head.next = cur_v;
+                        cur_v = prev_v->next;
+                    }
+                    else
+                    {
+                        prev_v = cur_v;
+                        cur_v = cur_v->next;
+                    }
+                }
+                prev_u->next = cur_u->next;
+                u_cfound++;
+            }
+        }
+
+        if (!found)
+            return -1;
+    }
+
+    return 0;
+}
+
+float EMDSolver::checkOptimal() const
+{
+    int i, j, min_i = 0, min_j = 0;
+
+    float min_delta = CV_EMD_INF;
+    /* find the minimal cij-ui-vj over all i,j */
+    for (i = 0; i < ssize; i++)
+    {
+        float u_val = u[i].val;
+        for (j = 0; j < dsize; j++)
+        {
+            if (!getIsX(i, j))
+            {
+                const float delta_ = getCost(i, j) - u_val - v[j].val;
+                if (min_delta > delta_)
+                {
+                    min_delta = delta_;
+                    min_i = i;
+                    min_j = j;
+                }
+            }
+        }
+    }
+
+    enter_x->i = min_i;
+    enter_x->j = min_j;
+
+    return min_delta;
+}
+
+bool EMDSolver::checkNewSolution()
+{
+    int i, j;
+    float min_val = CV_EMD_INF;
+    int steps;
+    Node2D head {0, 0, 0, {0, 0}}, *cur_x, *next_x, *leave_x = 0;
+    Node2D* enter_x_ = this->enter_x;
+    Node2D** loop_ = this->loop;
+
+    /* enter the new basic variable */
+    i = enter_x_->i;
+    j = enter_x_->j;
+    getIsX(i, j) = 1;
+    enter_x_->next[0] = this->rows_x[i];
+    enter_x->next[1] = this->cols_x[j];
+    enter_x_->val = 0;
+    this->rows_x[i] = enter_x_;
+    this->cols_x[j] = enter_x_;
+
+    /* find a chain reaction */
+    steps = findLoop();
+
+    if (steps == 0)
+        return false;
+
+    /* find the largest value in the loop */
+    for (i = 1; i < steps; i += 2)
+    {
+        float temp = loop_[i]->val;
+
+        if (min_val > temp)
+        {
+            leave_x = loop_[i];
+            min_val = temp;
+        }
+    }
+
+    /* update the loop */
+    for (i = 0; i < steps; i += 2)
+    {
+        float temp0 = loop_[i]->val + min_val;
+        float temp1 = loop_[i + 1]->val - min_val;
+
+        loop_[i]->val = temp0;
+        loop_[i + 1]->val = temp1;
+    }
+
+    /* remove the leaving basic variable */
+    CV_Assert(leave_x != NULL);
+    i = leave_x->i;
+    j = leave_x->j;
+    getIsX(i, j) = 0;
+
+    head.next[0] = this->rows_x[i];
+    cur_x = &head;
+    while ((next_x = cur_x->next[0]) != leave_x)
+    {
+        cur_x = next_x;
+        CV_Assert(cur_x);
+    }
+    cur_x->next[0] = next_x->next[0];
+    this->rows_x[i] = head.next[0];
+
+    head.next[1] = this->cols_x[j];
+    cur_x = &head;
+    while ((next_x = cur_x->next[1]) != leave_x)
+    {
+        cur_x = next_x;
+        CV_Assert(cur_x);
+    }
+    cur_x->next[1] = next_x->next[1];
+    this->cols_x[j] = head.next[1];
+
+    /* set enter_x to be the new empty slot */
+    this->enter_x = leave_x;
+
+    return true;
+}
+
+int EMDSolver::findLoop() const
+{
+    int i;
+
+    memset(is_used, 0, this->ssize + this->dsize);
+
+    Node2D* new_x = loop[0] = enter_x;
+    is_used[enter_x - data_x.data()] = 1;
+    int steps = 1;
+
+    do
+    {
+        if ((steps & 1) == 1)
+        {
+            /* find an unused x in the row */
+            new_x = this->rows_x[new_x->i];
+            while (new_x != 0 && is_used[new_x - data_x.data()])
+                new_x = new_x->next[0];
+        }
+        else
+        {
+            /* find an unused x in the column, or the entering x */
+            new_x = this->cols_x[new_x->j];
+            while (new_x != 0 && is_used[new_x - data_x.data()] && new_x != enter_x)
+                new_x = new_x->next[1];
+            if (new_x == enter_x)
+                break;
+        }
+
+        if (new_x != 0) /* found the next x */
+        {
+            /* add x to the loop */
+            loop[steps++] = new_x;
+            is_used[new_x - data_x.data()] = 1;
+        }
+        else /* didn't find the next x */
+        {
+            /* backtrack */
+            do
+            {
+                i = steps & 1;
+                new_x = loop[steps - 1];
+                do
+                {
+                    new_x = new_x->next[i];
+                }
+                while (new_x != 0 && is_used[new_x - data_x.data()]);
+
+                if (new_x == 0)
+                {
+                    is_used[loop[--steps] - data_x.data()] = 0;
+                }
+            }
+            while (new_x == 0 && steps > 0);
+
+            is_used[loop[steps - 1] - data_x.data()] = 0;
+            loop[steps - 1] = new_x;
+            is_used[new_x - data_x.data()] = 1;
+        }
+    }
+    while (steps > 0);
+
+    return steps;
+}
+
+void EMDSolver::callRussel()
+{
+    int i, j, min_i = -1, min_j = -1;
+    float min_delta, diff;
+    Node1D u_head, *cur_u, *prev_u;
+    Node1D v_head, *cur_v, *prev_v;
+    Node1D *prev_u_min_i = 0, *prev_v_min_j = 0, *remember;
+    float eps = CV_EMD_EPS * this->max_cost;
+
+    /* initialize the rows list (ur), and the columns list (vr) */
+    u_head.next = u;
+    for (i = 0; i < ssize; i++)
+    {
+        u[i].next = u + i + 1;
+    }
+    u[ssize - 1].next = 0;
+
+    v_head.next = v;
+    for (i = 0; i < dsize; i++)
+    {
+        v[i].val = -CV_EMD_INF;
+        v[i].next = v + i + 1;
+    }
+    v[dsize - 1].next = 0;
+
+    /* find the maximum row and column values (ur[i] and vr[j]) */
+    for (i = 0; i < ssize; i++)
+    {
+        float u_val = -CV_EMD_INF;
+        for (j = 0; j < dsize; j++)
+        {
+            float temp = getCost(i, j);
+
+            if (u_val < temp)
+                u_val = temp;
+            if (v[j].val < temp)
+                v[j].val = temp;
+        }
+        u[i].val = u_val;
+    }
+
+    /* compute the delta matrix */
+    for (i = 0; i < ssize; i++)
+    {
+        float u_val = u[i].val;
+        float* delta_row = delta + i * dsize;
+        for (j = 0; j < dsize; j++)
+        {
+            delta_row[j] = getCost(i, j) - u_val - v[j].val;
+        }
+    }
+
+    /* find the basic variables */
+    do
+    {
+        /* find the smallest delta[i][j] */
+        min_i = -1;
+        min_delta = CV_EMD_INF;
+        prev_u = &u_head;
+        for (cur_u = u_head.next; cur_u != 0; cur_u = cur_u->next)
+        {
+            i = (int)(cur_u - u);
+            float* delta_row = delta + i * dsize;
+
+            prev_v = &v_head;
+            for (cur_v = v_head.next; cur_v != 0; cur_v = cur_v->next)
+            {
+                j = (int)(cur_v - v);
+                if (min_delta > delta_row[j])
+                {
+                    min_delta = delta_row[j];
+                    min_i = i;
+                    min_j = j;
+                    prev_u_min_i = prev_u;
+                    prev_v_min_j = prev_v;
+                }
+                prev_v = cur_v;
+            }
+            prev_u = cur_u;
+        }
+
+        if (min_i < 0)
+            break;
+
+        /* add x[min_i][min_j] to the basis, and adjust supplies and cost */
+        remember = prev_u_min_i->next;
+        addBasicVar(min_i, min_j, prev_u_min_i, prev_v_min_j, &u_head);
+
+        /* update the necessary delta[][] */
+        if (remember == prev_u_min_i->next) /* line min_i was deleted */
+        {
+            for (cur_v = v_head.next; cur_v != 0; cur_v = cur_v->next)
+            {
+                j = (int)(cur_v - v);
+                if (cur_v->val == getCost(min_i, j)) /* column j needs updating */
+                {
+                    float max_val = -CV_EMD_INF;
+
+                    /* find the new maximum value in the column */
+                    for (cur_u = u_head.next; cur_u != 0; cur_u = cur_u->next)
+                    {
+                        float temp = getCost((int)(cur_u - u), j);
+
+                        if (max_val < temp)
+                            max_val = temp;
+                    }
+
+                    /* if needed, adjust the relevant delta[*][j] */
+                    diff = max_val - cur_v->val;
+                    cur_v->val = max_val;
+                    if (fabs(diff) < eps)
+                    {
+                        for (cur_u = u_head.next; cur_u != 0; cur_u = cur_u->next)
+                            *(delta + (cur_u - u) * dsize + j) += diff;
+                    }
+                }
+            }
+        }
+        else /* column min_j was deleted */
+        {
+            for (cur_u = u_head.next; cur_u != 0; cur_u = cur_u->next)
+            {
+                i = (int)(cur_u - u);
+                if (cur_u->val == getCost(i, min_j)) /* row i needs updating */
+                {
+                    float max_val = -CV_EMD_INF;
+
+                    /* find the new maximum value in the row */
+                    for (cur_v = v_head.next; cur_v != 0; cur_v = cur_v->next)
+                    {
+                        float temp = getCost(i, (int)(cur_v - v));
+
+                        if (max_val < temp)
+                            max_val = temp;
+                    }
+
+                    /* if needed, adjust the relevant delta[i][*] */
+                    diff = max_val - cur_u->val;
+                    cur_u->val = max_val;
+
+                    if (fabs(diff) < eps)
+                    {
+                        for (cur_v = v_head.next; cur_v != 0; cur_v = cur_v->next)
+                            *(delta + i * dsize + (cur_v - v)) += diff;
+                    }
+                }
+            }
+        }
+    }
+    while (u_head.next != 0 || v_head.next != 0);
+}
+
+void EMDSolver::addBasicVar(int min_i,
+                            int min_j,
+                            Node1D* prev_u_min_i,
+                            Node1D* prev_v_min_j,
+                            Node1D* u_head)
+{
+    float temp;
+
+    if (this->s[min_i] < this->d[min_j] + this->weight * CV_EMD_EPS)
+    { /* supply exhausted */
+        temp = this->s[min_i];
+        this->s[min_i] = 0;
+        this->d[min_j] -= temp;
+    }
+    else /* demand exhausted */
+    {
+        temp = this->d[min_j];
+        this->d[min_j] = 0;
+        this->s[min_i] -= temp;
+    }
+
+    /* x(min_i,min_j) is a basic variable */
+    getIsX(min_i, min_j) = 1;
+
+    end_x->val = temp;
+    end_x->i = min_i;
+    end_x->j = min_j;
+    end_x->next[0] = this->rows_x[min_i];
+    end_x->next[1] = this->cols_x[min_j];
+    this->rows_x[min_i] = end_x;
+    this->cols_x[min_j] = end_x;
+    this->end_x = end_x + 1;
+
+    /* delete supply row only if the empty, and if not last row */
+    if (this->s[min_i] == 0 && u_head->next->next != 0)
+        prev_u_min_i->next = prev_u_min_i->next->next; /* remove row from list */
+    else
+        prev_v_min_j->next = prev_v_min_j->next->next; /* remove column from list */
+}
+
+}  // namespace
+
+
+//==============================================================================
+// External interface
+
+float cv::EMD(InputArray _sign1,
+              InputArray _sign2,
+              int distType,
+              InputArray _cost,
+              float* lowerBound,
+              OutputArray _flow)
+{
+    CV_INSTRUMENT_REGION();
+
+    Mat sign1 = _sign1.getMat();
+    Mat sign2 = _sign2.getMat();
+    Mat cost = _cost.getMat();
+
+    CV_CheckEQ(sign1.cols, sign2.cols, "Signatures must have equal number of columns");
+    CV_CheckEQ(sign1.type(), CV_32FC1, "The sign1 must be 32FC1");
+    CV_CheckEQ(sign2.type(), CV_32FC1, "The sign2 must be 32FC1");
+
+    const int dims = sign1.cols - 1;
+    const int size1 = sign1.rows;
+    const int size2 = sign2.rows;
+
+    Mat flow;
+    if (_flow.needed())
+    {
+        _flow.create(sign1.rows, sign2.rows, CV_32F);
+        flow = _flow.getMat();
+        flow = Scalar::all(0);
+        CV_CheckEQ(flow.type(), CV_32FC1, "Flow matrix must have type 32FC1");
+        CV_CheckTrue(flow.rows == size1 && flow.cols == size2,
+                     "Flow matrix size does not match signatures");
+    }
+
+    DistFunc dfunc = 0;
+    if (distType == DIST_USER)
+    {
+        if (!cost.empty())
+        {
+            CV_CheckEQ(cost.type(), CV_32FC1, "Cost matrix must have type 32FC1");
+            CV_CheckTrue(cost.rows == size1 && cost.cols == size2,
+                         "Cost matrix size does not match signatures");
+            CV_CheckTrue(lowerBound == NULL,
+                         "Lower boundary can not be calculated if the cost matrix is used");
+        }
+        else
+        {
+            CV_CheckTrue(dfunc == NULL, "Dist function must be set if cost matrix is empty");
+        }
+    }
+    else
+    {
+        CV_CheckNE(dims, 0, "Number of dimensions can be 0 only if a user-defined metric is used");
+        switch (distType)
+        {
+            case cv::DIST_L1: dfunc = distL1; break;
+            case cv::DIST_L2: dfunc = distL2; break;
+            case cv::DIST_C: dfunc = distC; break;
+            default: CV_Error(cv::Error::StsBadFlag, "Bad or unsupported metric type");
+        }
+    }
+
+    EMDSolver state;
+    const bool result = state.init(sign1, sign2, dims, dfunc, cost, lowerBound);
+    if (!result && lowerBound)
+    {
+        return *lowerBound;
+    }
+    state.solve();
+    return (float)(state.calcFlow(_flow.needed() ? &flow : 0) / state.getWeight());
+}
+
+float cv::wrapperEMD(InputArray _sign1,
+                     InputArray _sign2,
+                     int distType,
+                     InputArray _cost,
+                     Ptr<float> lowerBound,
+                     OutputArray _flow)
+{
+    return EMD(_sign1, _sign2, distType, _cost, lowerBound.get(), _flow);
+}
diff --git a/modules/imgproc/src/filter.dispatch.cpp b/modules/imgproc/src/filter.dispatch.cpp
index 90f2e36c2a45..b5acbc866d1e 100644
--- a/modules/imgproc/src/filter.dispatch.cpp
+++ b/modules/imgproc/src/filter.dispatch.cpp
@@ -163,8 +163,6 @@ void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
     wholeSize = Size(-1,-1);
 }
 
-#define VEC_ALIGN CV_MALLOC_ALIGN
-
 int FilterEngine::start(const Size& _wholeSize, const Size& sz, const Point& ofs)
 {
     CV_INSTRUMENT_REGION();
diff --git a/modules/imgproc/src/filter.simd.hpp b/modules/imgproc/src/filter.simd.hpp
index 8dcf5235af51..c25345f40715 100644
--- a/modules/imgproc/src/filter.simd.hpp
+++ b/modules/imgproc/src/filter.simd.hpp
@@ -86,7 +86,6 @@ Ptr<BaseFilter> getLinearFilter(
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
-typedef int CV_DECL_ALIGNED(1) unaligned_int;
 #define VEC_ALIGN CV_MALLOC_ALIGN
 
 int FilterEngine__start(FilterEngine& this_, const Size &_wholeSize, const Size &sz, const Point &ofs)
@@ -349,7 +348,7 @@ struct FilterNoVec
 };
 
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 ///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////
 
@@ -383,7 +382,7 @@ struct RowVec_8u32s
 
         if( smallValues )
         {
-            for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+            for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes() )
             {
                 const uchar* src = _src + i;
                 v_int32 s0 = vx_setzero_s32();
@@ -396,27 +395,27 @@ struct RowVec_8u32s
                     v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
                     v_uint8 x0, x1;
                     v_zip(vx_load(src), vx_load(src + cn), x0, x1);
-                    s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f));
-                    s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f));
-                    s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f));
-                    s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f));
+                    s0 = v_add(s0, v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f)));
+                    s1 = v_add(s1, v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f)));
+                    s2 = v_add(s2, v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f)));
+                    s3 = v_add(s3, v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f)));
                 }
                 if (k < _ksize)
                 {
                     v_int32 f = vx_setall_s32(_kx[k]);
                     v_uint16 x0, x1;
                     v_expand(vx_load(src), x0, x1);
-                    s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f));
-                    s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f));
-                    s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f));
-                    s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f));
+                    s0 = v_add(s0, v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f)));
+                    s1 = v_add(s1, v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f)));
+                    s2 = v_add(s2, v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f)));
+                    s3 = v_add(s3, v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f)));
                 }
                 v_store(dst + i, s0);
-                v_store(dst + i + v_int32::nlanes, s1);
-                v_store(dst + i + 2*v_int32::nlanes, s2);
-                v_store(dst + i + 3*v_int32::nlanes, s3);
+                v_store(dst + i + VTraits<v_int32>::vlanes(), s1);
+                v_store(dst + i + 2*VTraits<v_int32>::vlanes(), s2);
+                v_store(dst + i + 3*VTraits<v_int32>::vlanes(), s3);
             }
-            if( i <= width - v_uint16::nlanes )
+            if( i <= width - VTraits<v_uint16>::vlanes() )
             {
                 const uchar* src = _src + i;
                 v_int32 s0 = vx_setzero_s32();
@@ -427,22 +426,22 @@ struct RowVec_8u32s
                     v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
                     v_uint16 x0, x1;
                     v_zip(vx_load_expand(src), vx_load_expand(src + cn), x0, x1);
-                    s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f));
-                    s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f));
+                    s0 = v_add(s0, v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f)));
+                    s1 = v_add(s1, v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f)));
                 }
                 if( k < _ksize )
                 {
                     v_int32 f = vx_setall_s32(_kx[k]);
                     v_uint32 x0, x1;
                     v_expand(vx_load_expand(src), x0, x1);
-                    s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f));
-                    s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f));
+                    s0 = v_add(s0, v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f)));
+                    s1 = v_add(s1, v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f)));
                 }
                 v_store(dst + i, s0);
-                v_store(dst + i + v_int32::nlanes, s1);
-                i += v_uint16::nlanes;
+                v_store(dst + i + VTraits<v_int32>::vlanes(), s1);
+                i += VTraits<v_uint16>::vlanes();
             }
-            if( i <= width - v_uint32::nlanes )
+            if( i <= width - VTraits<v_uint32>::vlanes() )
             {
                 v_int32 d = vx_setzero_s32();
                 k = 0;
@@ -452,12 +451,12 @@ struct RowVec_8u32s
                     v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
                     v_uint32 x0, x1;
                     v_zip(vx_load_expand_q(src), vx_load_expand_q(src + cn), x0, x1);
-                    d += v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f));
+                    d = v_add(d, v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f)));
                 }
                 if (k < _ksize)
-                    d += v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k])));
+                    d = v_add(d, v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k]))));
                 v_store(dst + i, d);
-                i += v_uint32::nlanes;
+                i += VTraits<v_uint32>::vlanes();
             }
         }
         return i;
@@ -480,7 +479,7 @@ struct RowVec_8u32f
         float* dst = (float*)_dst;
         const float* _kx = kernel.ptr<float>();
         width *= cn;
-        for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+        for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes() )
         {
             v_float32 s0 = vx_setzero_f32();
             v_float32 s1 = vx_setzero_f32();
@@ -492,18 +491,18 @@ struct RowVec_8u32f
                 v_float32 f = vx_setall_f32(_kx[k]);
                 const uchar* src = (const uchar*)_src + i + k * cn;
                 v_float32 vs_ll = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src)));
-                v_float32 vs_lh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + v_float32::nlanes)));
-                v_float32 vs_hl = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 2*v_float32::nlanes)));
-                v_float32 vs_hh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 3*v_float32::nlanes)));
+                v_float32 vs_lh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + VTraits<v_float32>::vlanes())));
+                v_float32 vs_hl = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 2*VTraits<v_float32>::vlanes())));
+                v_float32 vs_hh = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src + 3*VTraits<v_float32>::vlanes())));
                 s0 = v_muladd(vs_ll, f, s0);
                 s1 = v_muladd(vs_lh, f, s1);
                 s2 = v_muladd(vs_hl, f, s2);
                 s3 = v_muladd(vs_hh, f, s3);
             }
             v_store(dst + i, s0);
-            v_store(dst + i + v_float32::nlanes, s1);
-            v_store(dst + i + 2*v_float32::nlanes, s2);
-            v_store(dst + i + 3*v_float32::nlanes, s3);
+            v_store(dst + i + VTraits<v_float32>::vlanes(), s1);
+            v_store(dst + i + 2*VTraits<v_float32>::vlanes(), s2);
+            v_store(dst + i + 3*VTraits<v_float32>::vlanes(), s3);
         }
         return i;
     }
@@ -553,7 +552,7 @@ struct SymmRowSmallVec_8u32s
             {
                 if( kx[0] == 2 && kx[1] == 1 )
                 {
-                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                    for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(), src += VTraits<v_uint8>::vlanes() )
                     {
                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
                         v_expand(vx_load(src - cn), x0l, x0h);
@@ -562,29 +561,29 @@ struct SymmRowSmallVec_8u32s
                         x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l));
                         x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h));
                         v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x1l)));
-                        v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1l)));
-                        v_store(dst + i + 2*v_int32::nlanes, v_reinterpret_as_s32(v_expand_low(x1h)));
-                        v_store(dst + i + 3*v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1h)));
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), v_reinterpret_as_s32(v_expand_high(x1l)));
+                        v_store(dst + i + 2*VTraits<v_int32>::vlanes(), v_reinterpret_as_s32(v_expand_low(x1h)));
+                        v_store(dst + i + 3*VTraits<v_int32>::vlanes(), v_reinterpret_as_s32(v_expand_high(x1h)));
                     }
-                    if( i <= width - v_uint16::nlanes )
+                    if( i <= width - VTraits<v_uint16>::vlanes() )
                     {
                         v_uint16 x = vx_load_expand(src);
                         x = v_add_wrap(v_add_wrap(x, x), v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)));
                         v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x)));
-                        v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x)));
-                        i += v_uint16::nlanes; src += v_uint16::nlanes;
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), v_reinterpret_as_s32(v_expand_high(x)));
+                        i += VTraits<v_uint16>::vlanes(); src += VTraits<v_uint16>::vlanes();
                     }
-                    if( i <= width - v_uint32::nlanes )
+                    if( i <= width - VTraits<v_uint32>::vlanes() )
                     {
                         v_uint32 x = vx_load_expand_q(src);
-                        x = (x + x) + vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn);
+                        x = v_add(v_add(v_add(x, x), vx_load_expand_q(src - cn)), vx_load_expand_q(src + cn));
                         v_store(dst + i, v_reinterpret_as_s32(x));
-                        i += v_uint32::nlanes;
+                        i += VTraits<v_uint32>::vlanes();
                     }
                 }
                 else if( kx[0] == -2 && kx[1] == 1 )
                 {
-                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                    for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(), src += VTraits<v_uint8>::vlanes() )
                     {
                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
                         v_expand(vx_load(src - cn), x0l, x0h);
@@ -593,31 +592,31 @@ struct SymmRowSmallVec_8u32s
                         x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l));
                         x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h));
                         v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l)));
-                        v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l)));
-                        v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h)));
-                        v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h)));
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), v_expand_high(v_reinterpret_as_s16(x1l)));
+                        v_store(dst + i + 2*VTraits<v_int32>::vlanes(), v_expand_low(v_reinterpret_as_s16(x1h)));
+                        v_store(dst + i + 3*VTraits<v_int32>::vlanes(), v_expand_high(v_reinterpret_as_s16(x1h)));
                     }
-                    if( i <= width - v_uint16::nlanes )
+                    if( i <= width - VTraits<v_uint16>::vlanes() )
                     {
                         v_uint16 x = vx_load_expand(src);
                         x = v_sub_wrap(v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_add_wrap(x, x));
                         v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x)));
-                        v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x)));
-                        i += v_uint16::nlanes; src += v_uint16::nlanes;
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), v_expand_high(v_reinterpret_as_s16(x)));
+                        i += VTraits<v_uint16>::vlanes(); src += VTraits<v_uint16>::vlanes();
                     }
-                    if( i <= width - v_uint32::nlanes )
+                    if( i <= width - VTraits<v_uint32>::vlanes() )
                     {
                         v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src));
-                        x = v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) - (x + x);
+                        x = v_sub(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - cn), vx_load_expand_q(src + cn))), v_add(x, x));
                         v_store(dst + i, x);
-                        i += v_uint32::nlanes;
+                        i += VTraits<v_uint32>::vlanes();
                     }
                 }
                 else
                 {
                     v_int16 k0 = vx_setall_s16((short)kx[0]);
                     v_int16 k1 = vx_setall_s16((short)kx[1]);
-                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                    for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(), src += VTraits<v_uint8>::vlanes() )
                     {
                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
                         v_expand(vx_load(src - cn), x0l, x0h);
@@ -628,34 +627,34 @@ struct SymmRowSmallVec_8u32s
                         v_int16 x0, x1;
                         v_mul_expand(v_reinterpret_as_s16(x1l), k0, dl, dh);
                         v_zip(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x2l), x0, x1);
-                        dl += v_dotprod(x0, k1);
-                        dh += v_dotprod(x1, k1);
+                        dl = v_add(dl, v_dotprod(x0, k1));
+                        dh = v_add(dh, v_dotprod(x1, k1));
                         v_store(dst + i, dl);
-                        v_store(dst + i + v_int32::nlanes, dh);
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), dh);
 
                         v_mul_expand(v_reinterpret_as_s16(x1h), k0, dl, dh);
                         v_zip(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x2h), x0, x1);
-                        dl += v_dotprod(x0, k1);
-                        dh += v_dotprod(x1, k1);
-                        v_store(dst + i + 2*v_int32::nlanes, dl);
-                        v_store(dst + i + 3*v_int32::nlanes, dh);
+                        dl = v_add(dl, v_dotprod(x0, k1));
+                        dh = v_add(dh, v_dotprod(x1, k1));
+                        v_store(dst + i + 2*VTraits<v_int32>::vlanes(), dl);
+                        v_store(dst + i + 3*VTraits<v_int32>::vlanes(), dh);
                     }
-                    if ( i <= width - v_uint16::nlanes )
+                    if ( i <= width - VTraits<v_uint16>::vlanes() )
                     {
                         v_int32 dl, dh;
                         v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, dl, dh);
                         v_int16 x0, x1;
                         v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn)), v_reinterpret_as_s16(vx_load_expand(src + cn)), x0, x1);
-                        dl += v_dotprod(x0, k1);
-                        dh += v_dotprod(x1, k1);
+                        dl = v_add(dl, v_dotprod(x0, k1));
+                        dh = v_add(dh, v_dotprod(x1, k1));
                         v_store(dst + i, dl);
-                        v_store(dst + i + v_int32::nlanes, dh);
-                        i += v_uint16::nlanes; src += v_uint16::nlanes;
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), dh);
+                        i += VTraits<v_uint16>::vlanes(); src += VTraits<v_uint16>::vlanes();
                     }
-                    if ( i <= width - v_uint32::nlanes )
+                    if ( i <= width - VTraits<v_uint32>::vlanes() )
                     {
-                        v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) * vx_setall_s32(kx[1])));
-                        i += v_uint32::nlanes;
+                        v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_mul(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - cn), vx_load_expand_q(src + cn))), vx_setall_s32(kx[1]))));
+                        i += VTraits<v_uint32>::vlanes();
                     }
                 }
             }
@@ -663,7 +662,7 @@ struct SymmRowSmallVec_8u32s
             {
                 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                 {
-                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                    for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(), src += VTraits<v_uint8>::vlanes() )
                     {
                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
                         v_expand(vx_load(src - 2*cn), x0l, x0h);
@@ -672,31 +671,31 @@ struct SymmRowSmallVec_8u32s
                         x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l));
                         x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h));
                         v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l)));
-                        v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l)));
-                        v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h)));
-                        v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h)));
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), v_expand_high(v_reinterpret_as_s16(x1l)));
+                        v_store(dst + i + 2*VTraits<v_int32>::vlanes(), v_expand_low(v_reinterpret_as_s16(x1h)));
+                        v_store(dst + i + 3*VTraits<v_int32>::vlanes(), v_expand_high(v_reinterpret_as_s16(x1h)));
                     }
-                    if( i <= width - v_uint16::nlanes )
+                    if( i <= width - VTraits<v_uint16>::vlanes() )
                     {
                         v_uint16 x = vx_load_expand(src);
                         x = v_sub_wrap(v_add_wrap(vx_load_expand(src - 2*cn), vx_load_expand(src + 2*cn)), v_add_wrap(x, x));
                         v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x)));
-                        v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x)));
-                        i += v_uint16::nlanes; src += v_uint16::nlanes;
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), v_expand_high(v_reinterpret_as_s16(x)));
+                        i += VTraits<v_uint16>::vlanes(); src += VTraits<v_uint16>::vlanes();
                     }
-                    if( i <= width - v_uint32::nlanes )
+                    if( i <= width - VTraits<v_uint32>::vlanes() )
                     {
                         v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src));
-                        x = v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) - (x + x);
+                        x = v_sub(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - 2 * cn), vx_load_expand_q(src + 2 * cn))), v_add(x, x));
                         v_store(dst + i, x);
-                        i += v_uint32::nlanes;
+                        i += VTraits<v_uint32>::vlanes();
                     }
                 }
                 else
                 {
                     v_int16 k0 = vx_setall_s16((short)(kx[0]));
                     v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16)));
-                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                    for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(), src += VTraits<v_uint8>::vlanes() )
                     {
                         v_int32 x0, x1, x2, x3;
                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h;
@@ -710,45 +709,45 @@ struct SymmRowSmallVec_8u32s
                         v_expand(vx_load(src + cn), x1l, x1h);
                         v_expand(vx_load(src - 2*cn), x2l, x2h);
                         v_expand(vx_load(src + 2*cn), x3l, x3h);
-                        v_zip(v_reinterpret_as_s16(x0l + x1l), v_reinterpret_as_s16(x2l + x3l), xl, xh);
-                        x0 += v_dotprod(xl, k12);
-                        x1 += v_dotprod(xh, k12);
-                        v_zip(v_reinterpret_as_s16(x0h + x1h), v_reinterpret_as_s16(x2h + x3h), xl, xh);
-                        x2 += v_dotprod(xl, k12);
-                        x3 += v_dotprod(xh, k12);
+                        v_zip(v_reinterpret_as_s16(v_add(x0l, x1l)), v_reinterpret_as_s16(v_add(x2l, x3l)), xl, xh);
+                        x0 = v_add(x0, v_dotprod(xl, k12));
+                        x1 = v_add(x1, v_dotprod(xh, k12));
+                        v_zip(v_reinterpret_as_s16(v_add(x0h, x1h)), v_reinterpret_as_s16(v_add(x2h, x3h)), xl, xh);
+                        x2 = v_add(x2, v_dotprod(xl, k12));
+                        x3 = v_add(x3, v_dotprod(xh, k12));
 
                         v_store(dst + i, x0);
-                        v_store(dst + i + v_int32::nlanes, x1);
-                        v_store(dst + i + 2*v_int32::nlanes, x2);
-                        v_store(dst + i + 3*v_int32::nlanes, x3);
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), x1);
+                        v_store(dst + i + 2*VTraits<v_int32>::vlanes(), x2);
+                        v_store(dst + i + 3*VTraits<v_int32>::vlanes(), x3);
                     }
-                    if( i <= width - v_uint16::nlanes )
+                    if( i <= width - VTraits<v_uint16>::vlanes() )
                     {
                         v_int32 x1, x2;
                         v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, x1, x2);
 
                         v_int16 xl, xh;
-                        v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn) + vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)), xl, xh);
-                        x1 += v_dotprod(xl, k12);
-                        x2 += v_dotprod(xh, k12);
+                        v_zip(v_reinterpret_as_s16(v_add(vx_load_expand(src - cn), vx_load_expand(src + cn))), v_reinterpret_as_s16(v_add(vx_load_expand(src - 2 * cn), vx_load_expand(src + 2 * cn))), xl, xh);
+                        x1 = v_add(x1, v_dotprod(xl, k12));
+                        x2 = v_add(x2, v_dotprod(xh, k12));
 
                         v_store(dst + i, x1);
-                        v_store(dst + i + v_int32::nlanes, x2);
-                        i += v_uint16::nlanes, src += v_uint16::nlanes;
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), x2);
+                        i += VTraits<v_uint16>::vlanes(), src += VTraits<v_uint16>::vlanes();
                     }
-                    if( i <= width - v_uint32::nlanes )
+                    if( i <= width - VTraits<v_uint32>::vlanes() )
                     {
                         v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]),
-                                         v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]),
-                                                  v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) * vx_setall_s32(kx[2]))));
-                        i += v_uint32::nlanes;
+                                         v_muladd(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - cn), vx_load_expand_q(src + cn))), vx_setall_s32(kx[1]),
+                                                  v_mul(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - 2 * cn), vx_load_expand_q(src + 2 * cn))), vx_setall_s32(kx[2])))));
+                        i += VTraits<v_uint32>::vlanes();
                     }
                 }
             }
             else
             {
                 v_int16 k0 = vx_setall_s16((short)(kx[0]));
-                for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(), src += VTraits<v_uint8>::vlanes() )
                 {
                     v_uint8 v_src = vx_load(src);
                     v_int32 s0, s1, s2, s3;
@@ -764,12 +763,12 @@ struct SymmRowSmallVec_8u32s
                         v_uint8 v_src3 = vx_load(src + j + cn);
 
                         v_int16 xl, xh;
-                        v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh);
-                        s0 += v_dotprod(xl, k12);
-                        s1 += v_dotprod(xh, k12);
-                        v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh);
-                        s2 += v_dotprod(xl, k12);
-                        s3 += v_dotprod(xh, k12);
+                        v_zip(v_reinterpret_as_s16(v_add(v_expand_low(v_src0), v_expand_low(v_src2))), v_reinterpret_as_s16(v_add(v_expand_low(v_src1), v_expand_low(v_src3))), xl, xh);
+                        s0 = v_add(s0, v_dotprod(xl, k12));
+                        s1 = v_add(s1, v_dotprod(xh, k12));
+                        v_zip(v_reinterpret_as_s16(v_add(v_expand_high(v_src0), v_expand_high(v_src2))), v_reinterpret_as_s16(v_add(v_expand_high(v_src1), v_expand_high(v_src3))), xl, xh);
+                        s2 = v_add(s2, v_dotprod(xl, k12));
+                        s3 = v_add(s3, v_dotprod(xh, k12));
                     }
                     if( k < _ksize / 2 + 1 )
                     {
@@ -780,48 +779,48 @@ struct SymmRowSmallVec_8u32s
 
                         v_int16 xl, xh;
                         v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh);
-                        s0 += v_dotprod(xl, k1);
-                        s1 += v_dotprod(xh, k1);
+                        s0 = v_add(s0, v_dotprod(xl, k1));
+                        s1 = v_add(s1, v_dotprod(xh, k1));
                         v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh);
-                        s2 += v_dotprod(xl, k1);
-                        s3 += v_dotprod(xh, k1);
+                        s2 = v_add(s2, v_dotprod(xl, k1));
+                        s3 = v_add(s3, v_dotprod(xh, k1));
                     }
                     v_store(dst + i, s0);
-                    v_store(dst + i + v_int32::nlanes, s1);
-                    v_store(dst + i + 2*v_int32::nlanes, s2);
-                    v_store(dst + i + 3*v_int32::nlanes, s3);
+                    v_store(dst + i + VTraits<v_int32>::vlanes(), s1);
+                    v_store(dst + i + 2*VTraits<v_int32>::vlanes(), s2);
+                    v_store(dst + i + 3*VTraits<v_int32>::vlanes(), s3);
                 }
-                if( i <= width - v_uint16::nlanes )
+                if( i <= width - VTraits<v_uint16>::vlanes() )
                 {
                     v_int32 s0, s1;
                     v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1);
                     for (k = 1, j = cn; k <= _ksize / 2 - 1; k+=2, j += 2*cn)
                     {
                         v_int16 xl, xh;
-                        v_zip(v_reinterpret_as_s16(vx_load_expand(src - j) + vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j - cn) + vx_load_expand(src + j + cn)), xl, xh);
+                        v_zip(v_reinterpret_as_s16(v_add(vx_load_expand(src - j), vx_load_expand(src + j))), v_reinterpret_as_s16(v_add(vx_load_expand(src - j - cn), vx_load_expand(src + j + cn))), xl, xh);
                         v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k+1] << 16)));
-                        s0 += v_dotprod(xl, k12);
-                        s1 += v_dotprod(xh, k12);
+                        s0 = v_add(s0, v_dotprod(xl, k12));
+                        s1 = v_add(s1, v_dotprod(xh, k12));
                     }
                     if ( k < _ksize / 2 + 1 )
                     {
                         v_int16 xl, xh;
                         v_zip(v_reinterpret_as_s16(vx_load_expand(src - j)), v_reinterpret_as_s16(vx_load_expand(src + j)), xl, xh);
                         v_int16 k1 = vx_setall_s16((short)(kx[k]));
-                        s0 += v_dotprod(xl, k1);
-                        s1 += v_dotprod(xh, k1);
+                        s0 = v_add(s0, v_dotprod(xl, k1));
+                        s1 = v_add(s1, v_dotprod(xh, k1));
                     }
                     v_store(dst + i, s0);
-                    v_store(dst + i + v_int32::nlanes, s1);
-                    i += v_uint16::nlanes; src += v_uint16::nlanes;
+                    v_store(dst + i + VTraits<v_int32>::vlanes(), s1);
+                    i += VTraits<v_uint16>::vlanes(); src += VTraits<v_uint16>::vlanes();
                 }
-                if( i <= width - v_uint32::nlanes )
+                if( i <= width - VTraits<v_uint32>::vlanes() )
                 {
-                    v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]);
+                    v_int32 s0 = v_mul(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]));
                     for( k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn )
-                        s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - j) + vx_load_expand_q(src + j)), vx_setall_s32(kx[k]), s0);
+                        s0 = v_muladd(v_reinterpret_as_s32(v_add(vx_load_expand_q(src - j), vx_load_expand_q(src + j))), vx_setall_s32(kx[k]), s0);
                     v_store(dst + i, s0);
-                    i += v_uint32::nlanes;
+                    i += VTraits<v_uint32>::vlanes();
                 }
             }
         }
@@ -831,7 +830,7 @@ struct SymmRowSmallVec_8u32s
             {
                 if( kx[0] == 0 && kx[1] == 1 )
                 {
-                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                    for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(), src += VTraits<v_uint8>::vlanes() )
                     {
                         v_uint16 x0l, x0h, x2l, x2h;
                         v_expand(vx_load(src - cn), x0l, x0h);
@@ -839,27 +838,27 @@ struct SymmRowSmallVec_8u32s
                         v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(x2l, x0l));
                         v_int16 dh = v_reinterpret_as_s16(v_sub_wrap(x2h, x0h));
                         v_store(dst + i, v_expand_low(dl));
-                        v_store(dst + i + v_int32::nlanes, v_expand_high(dl));
-                        v_store(dst + i + 2*v_int32::nlanes, v_expand_low(dh));
-                        v_store(dst + i + 3*v_int32::nlanes, v_expand_high(dh));
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), v_expand_high(dl));
+                        v_store(dst + i + 2*VTraits<v_int32>::vlanes(), v_expand_low(dh));
+                        v_store(dst + i + 3*VTraits<v_int32>::vlanes(), v_expand_high(dh));
                     }
-                    if( i <= width - v_uint16::nlanes )
+                    if( i <= width - VTraits<v_uint16>::vlanes() )
                     {
                         v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn)));
                         v_store(dst + i, v_expand_low(dl));
-                        v_store(dst + i + v_int32::nlanes, v_expand_high(dl));
-                        i += v_uint16::nlanes; src += v_uint16::nlanes;
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), v_expand_high(dl));
+                        i += VTraits<v_uint16>::vlanes(); src += VTraits<v_uint16>::vlanes();
                     }
-                    if (i <= width - v_uint32::nlanes)
+                    if (i <= width - VTraits<v_uint32>::vlanes())
                     {
-                        v_store(dst + i, v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)));
-                        i += v_uint32::nlanes;
+                        v_store(dst + i, v_sub(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), v_reinterpret_as_s32(vx_load_expand_q(src - cn))));
+                        i += VTraits<v_uint32>::vlanes();
                     }
                 }
                 else
                 {
                     v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (-kx[1] << 16)));
-                    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                    for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(), src += VTraits<v_uint8>::vlanes() )
                     {
                         v_uint16 x0l, x0h, x2l, x2h;
                         v_expand(vx_load(src - cn), x0l, x0h);
@@ -867,30 +866,30 @@ struct SymmRowSmallVec_8u32s
                         v_int16 xl, xh;
                         v_zip(v_reinterpret_as_s16(x2l), v_reinterpret_as_s16(x0l), xl, xh);
                         v_store(dst + i, v_dotprod(xl, k0));
-                        v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0));
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), v_dotprod(xh, k0));
                         v_zip(v_reinterpret_as_s16(x2h), v_reinterpret_as_s16(x0h), xl, xh);
-                        v_store(dst + i + 2*v_int32::nlanes, v_dotprod(xl, k0));
-                        v_store(dst + i + 3*v_int32::nlanes, v_dotprod(xh, k0));
+                        v_store(dst + i + 2*VTraits<v_int32>::vlanes(), v_dotprod(xl, k0));
+                        v_store(dst + i + 3*VTraits<v_int32>::vlanes(), v_dotprod(xh, k0));
                     }
-                    if( i <= width - v_uint16::nlanes )
+                    if( i <= width - VTraits<v_uint16>::vlanes() )
                     {
                         v_int16 xl, xh;
                         v_zip(v_reinterpret_as_s16(vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - cn)), xl, xh);
                         v_store(dst + i, v_dotprod(xl, k0));
-                        v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0));
-                        i += v_uint16::nlanes; src += v_uint16::nlanes;
+                        v_store(dst + i + VTraits<v_int32>::vlanes(), v_dotprod(xh, k0));
+                        i += VTraits<v_uint16>::vlanes(); src += VTraits<v_uint16>::vlanes();
                     }
-                    if (i <= width - v_uint32::nlanes)
+                    if (i <= width - VTraits<v_uint32>::vlanes())
                     {
-                        v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_reinterpret_as_s32(vx_load_expand_q(src - cn)) * vx_setall_s32(-kx[1])));
-                        i += v_uint32::nlanes;
+                        v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_mul(v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(-kx[1]))));
+                        i += VTraits<v_uint32>::vlanes();
                     }
                 }
             }
             else if( _ksize == 5 )
             {
                 v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16)));
-                for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(), src += VTraits<v_uint8>::vlanes() )
                 {
                     v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h;
                     v_expand(vx_load(src - cn), x0l, x0h);
@@ -900,31 +899,31 @@ struct SymmRowSmallVec_8u32s
                     v_int16 x0, x1;
                     v_zip(v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)), v_reinterpret_as_s16(v_sub_wrap(x3l, x1l)), x0, x1);
                     v_store(dst + i, v_dotprod(x0, k0));
-                    v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0));
+                    v_store(dst + i + VTraits<v_int32>::vlanes(), v_dotprod(x1, k0));
                     v_zip(v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)), v_reinterpret_as_s16(v_sub_wrap(x3h, x1h)), x0, x1);
-                    v_store(dst + i + 2*v_int32::nlanes, v_dotprod(x0, k0));
-                    v_store(dst + i + 3*v_int32::nlanes, v_dotprod(x1, k0));
+                    v_store(dst + i + 2*VTraits<v_int32>::vlanes(), v_dotprod(x0, k0));
+                    v_store(dst + i + 3*VTraits<v_int32>::vlanes(), v_dotprod(x1, k0));
                 }
-                if( i <= width - v_uint16::nlanes )
+                if( i <= width - VTraits<v_uint16>::vlanes() )
                 {
                     v_int16 x0, x1;
                     v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))),
                           v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + 2*cn), vx_load_expand(src - 2*cn))), x0, x1);
                     v_store(dst + i, v_dotprod(x0, k0));
-                    v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0));
-                    i += v_uint16::nlanes; src += v_uint16::nlanes;
+                    v_store(dst + i + VTraits<v_int32>::vlanes(), v_dotprod(x1, k0));
+                    i += VTraits<v_uint16>::vlanes(); src += VTraits<v_uint16>::vlanes();
                 }
-                if( i <= width - v_uint32::nlanes )
+                if( i <= width - VTraits<v_uint32>::vlanes() )
                 {
-                    v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(kx[1]),
-                                             (v_reinterpret_as_s32(vx_load_expand_q(src + 2*cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn))) * vx_setall_s32(kx[2])));
-                    i += v_uint32::nlanes;
+                    v_store(dst + i, v_muladd(v_sub(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), v_reinterpret_as_s32(vx_load_expand_q(src - cn))), vx_setall_s32(kx[1]),
+                                             v_mul(v_sub(v_reinterpret_as_s32(vx_load_expand_q(src + 2 * cn)), v_reinterpret_as_s32(vx_load_expand_q(src - 2 * cn))), vx_setall_s32(kx[2]))));
+                    i += VTraits<v_uint32>::vlanes();
                 }
             }
             else
             {
                 v_int16 k0 = vx_setall_s16((short)(kx[0]));
-                for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes(), src += VTraits<v_uint8>::vlanes() )
                 {
                     v_uint8 v_src = vx_load(src);
                     v_int32 s0, s1, s2, s3;
@@ -941,11 +940,11 @@ struct SymmRowSmallVec_8u32s
 
                         v_int16 xl, xh;
                         v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src2), v_expand_low(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src3), v_expand_low(v_src1))), xl, xh);
-                        s0 += v_dotprod(xl, k12);
-                        s1 += v_dotprod(xh, k12);
+                        s0 = v_add(s0, v_dotprod(xl, k12));
+                        s1 = v_add(s1, v_dotprod(xh, k12));
                         v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src2), v_expand_high(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src3), v_expand_high(v_src1))), xl, xh);
-                        s2 += v_dotprod(xl, k12);
-                        s3 += v_dotprod(xh, k12);
+                        s2 = v_add(s2, v_dotprod(xl, k12));
+                        s3 = v_add(s3, v_dotprod(xh, k12));
                     }
                     if( k < _ksize / 2 + 1 )
                     {
@@ -955,18 +954,18 @@ struct SymmRowSmallVec_8u32s
 
                         v_int16 xl, xh;
                         v_zip(v_reinterpret_as_s16(v_expand_low(v_src1)), v_reinterpret_as_s16(v_expand_low(v_src0)), xl, xh);
-                        s0 += v_dotprod(xl, k12);
-                        s1 += v_dotprod(xh, k12);
+                        s0 = v_add(s0, v_dotprod(xl, k12));
+                        s1 = v_add(s1, v_dotprod(xh, k12));
                         v_zip(v_reinterpret_as_s16(v_expand_high(v_src1)), v_reinterpret_as_s16(v_expand_high(v_src0)), xl, xh);
-                        s2 += v_dotprod(xl, k12);
-                        s3 += v_dotprod(xh, k12);
+                        s2 = v_add(s2, v_dotprod(xl, k12));
+                        s3 = v_add(s3, v_dotprod(xh, k12));
                     }
                     v_store(dst + i, s0);
-                    v_store(dst + i + v_int32::nlanes, s1);
-                    v_store(dst + i + 2*v_int32::nlanes, s2);
-                    v_store(dst + i + 3*v_int32::nlanes, s3);
+                    v_store(dst + i + VTraits<v_int32>::vlanes(), s1);
+                    v_store(dst + i + 2*VTraits<v_int32>::vlanes(), s2);
+                    v_store(dst + i + 3*VTraits<v_int32>::vlanes(), s3);
                 }
-                if( i <= width - v_uint16::nlanes )
+                if( i <= width - VTraits<v_uint16>::vlanes() )
                 {
                     v_int32 s0, s1;
                     v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1);
@@ -975,28 +974,28 @@ struct SymmRowSmallVec_8u32s
                         v_int16 xl, xh;
                         v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j), vx_load_expand(src - j))), v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j + cn), vx_load_expand(src - j - cn))), xl, xh);
                         v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
-                        s0 += v_dotprod(xl, k12);
-                        s1 += v_dotprod(xh, k12);
+                        s0 = v_add(s0, v_dotprod(xl, k12));
+                        s1 = v_add(s1, v_dotprod(xh, k12));
                     }
                     if( k < _ksize / 2 + 1 )
                     {
                         v_int16 k1 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16)));
                         v_int16 xl, xh;
                         v_zip(v_reinterpret_as_s16(vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j)), xl, xh);
-                        s0 += v_dotprod(xl, k1);
-                        s1 += v_dotprod(xh, k1);
+                        s0 = v_add(s0, v_dotprod(xl, k1));
+                        s1 = v_add(s1, v_dotprod(xh, k1));
                     }
                     v_store(dst + i, s0);
-                    v_store(dst + i + v_int32::nlanes, s1);
-                    i += v_uint16::nlanes; src += v_uint16::nlanes;
+                    v_store(dst + i + VTraits<v_int32>::vlanes(), s1);
+                    i += VTraits<v_uint16>::vlanes(); src += VTraits<v_uint16>::vlanes();
                 }
-                if( i <= width - v_uint32::nlanes )
+                if( i <= width - VTraits<v_uint32>::vlanes() )
                 {
-                    v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]);
+                    v_int32 s0 = v_mul(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]));
                     for (k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn)
-                        s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + j)) - v_reinterpret_as_s32(vx_load_expand_q(src - j)), vx_setall_s32(kx[k]), s0);
+                        s0 = v_muladd(v_sub(v_reinterpret_as_s32(vx_load_expand_q(src + j)), v_reinterpret_as_s32(vx_load_expand_q(src - j))), vx_setall_s32(kx[k]), s0);
                     v_store(dst + i, s0);
-                    i += v_uint32::nlanes;
+                    i += VTraits<v_uint32>::vlanes();
                 }
             }
         }
@@ -1038,120 +1037,91 @@ struct SymmColumnVec_32s8u
         {
             v_float32 f0 = vx_setall_f32(ky[0]);
             v_float32 f1 = vx_setall_f32(ky[1]);
-            for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+            for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes() )
             {
                 const int* S = src[0] + i;
                 v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
-                v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
-                v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4);
-                v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4);
+                v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + VTraits<v_int32>::vlanes())), f0, d4);
+                v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*VTraits<v_int32>::vlanes())), f0, d4);
+                v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*VTraits<v_int32>::vlanes())), f0, d4);
                 const int* S0 = src[1] + i;
                 const int* S1 = src[-1] + i;
-                s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0);
-                s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1);
-                s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) + vx_load(S1 + 2 * v_int32::nlanes)), f1, s2);
-                s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) + vx_load(S1 + 3 * v_int32::nlanes)), f1, s3);
+                s0 = v_muladd(v_cvt_f32(v_add(vx_load(S0), vx_load(S1))), f1, s0);
+                s1 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f1, s1);
+                s2 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + 2 * VTraits<v_int32>::vlanes()), vx_load(S1 + 2 * VTraits<v_int32>::vlanes()))), f1, s2);
+                s3 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + 3 * VTraits<v_int32>::vlanes()), vx_load(S1 + 3 * VTraits<v_int32>::vlanes()))), f1, s3);
                 for( k = 2; k <= ksize2; k++ )
                 {
                     v_float32 f = vx_setall_f32(ky[k]);
                     S0 = src[k] + i;
                     S1 = src[-k] + i;
-                    s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
-                    s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
-                    s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2);
-                    s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) + vx_load(S1 + 3*v_int32::nlanes)), f, s3);
+                    s0 = v_muladd(v_cvt_f32(v_add(vx_load(S0), vx_load(S1))), f, s0);
+                    s1 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f, s1);
+                    s2 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + 2 * VTraits<v_int32>::vlanes()), vx_load(S1 + 2 * VTraits<v_int32>::vlanes()))), f, s2);
+                    s3 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + 3 * VTraits<v_int32>::vlanes()), vx_load(S1 + 3 * VTraits<v_int32>::vlanes()))), f, s3);
                 }
                 v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
             }
-            if( i <= width - v_uint16::nlanes )
+            if( i <= width - VTraits<v_uint16>::vlanes() )
             {
                 const int* S = src[0] + i;
                 v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
-                v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
+                v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + VTraits<v_int32>::vlanes())), f0, d4);
                 const int* S0 = src[1] + i;
                 const int* S1 = src[-1] + i;
-                s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0);
-                s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1);
+                s0 = v_muladd(v_cvt_f32(v_add(vx_load(S0), vx_load(S1))), f1, s0);
+                s1 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f1, s1);
                 for( k = 2; k <= ksize2; k++ )
                 {
                     v_float32 f = vx_setall_f32(ky[k]);
                     S0 = src[k] + i;
                     S1 = src[-k] + i;
-                    s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
-                    s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
+                    s0 = v_muladd(v_cvt_f32(v_add(vx_load(S0), vx_load(S1))), f, s0);
+                    s1 = v_muladd(v_cvt_f32(v_add(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f, s1);
                 }
                 v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
-                i += v_uint16::nlanes;
-            }
-#if CV_SIMD_WIDTH > 16
-            while( i <= width - v_int32x4::nlanes )
-#else
-            if( i <= width - v_int32x4::nlanes )
-#endif
-            {
-                v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta));
-                s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) + v_load(src[-1] + i)), v_setall_f32(ky[1]), s0);
-                for( k = 2; k <= ksize2; k++ )
-                    s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
-                v_int32x4 s32 = v_round(s0);
-                v_int16x8 s16 = v_pack(s32, s32);
-                *(unaligned_int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
-                i += v_int32x4::nlanes;
+                i += VTraits<v_uint16>::vlanes();
             }
         }
         else
         {
             v_float32 f1 = vx_setall_f32(ky[1]);
-            for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+            for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes() )
             {
                 const int* S0 = src[1] + i;
                 const int* S1 = src[-1] + i;
-                v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4);
-                v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4);
-                v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) - vx_load(S1 + 2 * v_int32::nlanes)), f1, d4);
-                v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) - vx_load(S1 + 3 * v_int32::nlanes)), f1, d4);
+                v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(S0), vx_load(S1))), f1, d4);
+                v_float32 s1 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f1, d4);
+                v_float32 s2 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + 2 * VTraits<v_int32>::vlanes()), vx_load(S1 + 2 * VTraits<v_int32>::vlanes()))), f1, d4);
+                v_float32 s3 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + 3 * VTraits<v_int32>::vlanes()), vx_load(S1 + 3 * VTraits<v_int32>::vlanes()))), f1, d4);
                 for ( k = 2; k <= ksize2; k++ )
                 {
                     v_float32 f = vx_setall_f32(ky[k]);
                     S0 = src[k] + i;
                     S1 = src[-k] + i;
-                    s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
-                    s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
-                    s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2);
-                    s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) - vx_load(S1 + 3*v_int32::nlanes)), f, s3);
+                    s0 = v_muladd(v_cvt_f32(v_sub(vx_load(S0), vx_load(S1))), f, s0);
+                    s1 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f, s1);
+                    s2 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + 2 * VTraits<v_int32>::vlanes()), vx_load(S1 + 2 * VTraits<v_int32>::vlanes()))), f, s2);
+                    s3 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + 3 * VTraits<v_int32>::vlanes()), vx_load(S1 + 3 * VTraits<v_int32>::vlanes()))), f, s3);
                 }
                 v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
             }
-            if( i <= width - v_uint16::nlanes )
+            if( i <= width - VTraits<v_uint16>::vlanes() )
             {
                 const int* S0 = src[1] + i;
                 const int* S1 = src[-1] + i;
-                v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4);
-                v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4);
+                v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(S0), vx_load(S1))), f1, d4);
+                v_float32 s1 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f1, d4);
                 for ( k = 2; k <= ksize2; k++ )
                 {
                     v_float32 f = vx_setall_f32(ky[k]);
                     S0 = src[k] + i;
                     S1 = src[-k] + i;
-                    s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
-                    s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
+                    s0 = v_muladd(v_cvt_f32(v_sub(vx_load(S0), vx_load(S1))), f, s0);
+                    s1 = v_muladd(v_cvt_f32(v_sub(vx_load(S0 + VTraits<v_int32>::vlanes()), vx_load(S1 + VTraits<v_int32>::vlanes()))), f, s1);
                 }
                 v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
-                i += v_uint16::nlanes;
-            }
-#if CV_SIMD_WIDTH > 16
-            while( i <= width - v_int32x4::nlanes )
-#else
-            if( i <= width - v_int32x4::nlanes )
-#endif
-            {
-                v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) - v_load(src[-1] + i)), v_setall_f32(ky[1]), v_setall_f32(delta));
-                for (k = 2; k <= ksize2; k++)
-                    s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
-                v_int32x4 s32 = v_round(s0);
-                v_int16x8 s16 = v_pack(s32, s32);
-                *(unaligned_int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
-                i += v_int32x4::nlanes;
+                i += VTraits<v_uint16>::vlanes();
             }
         }
         return i;
@@ -1187,31 +1157,31 @@ struct SymmColumnVec_32f8u
 
         if( symmetrical )
         {
-            for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+            for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes() )
             {
                 v_float32 v_ky0 = vx_setall_f32(ky[0]);
                 v_float32 v32_delta = vx_setall_f32(delta);
                 const float* S = src[0] + i;
                 v_float32 s0 = v_muladd(v_ky0, vx_load(S), v32_delta);
-                v_float32 s1 = v_muladd(v_ky0, vx_load(S + v_float32::nlanes), v32_delta);
-                v_float32 s2 = v_muladd(v_ky0, vx_load(S + 2*v_float32::nlanes), v32_delta);
-                v_float32 s3 = v_muladd(v_ky0, vx_load(S + 3*v_float32::nlanes), v32_delta);
+                v_float32 s1 = v_muladd(v_ky0, vx_load(S + VTraits<v_float32>::vlanes()), v32_delta);
+                v_float32 s2 = v_muladd(v_ky0, vx_load(S + 2*VTraits<v_float32>::vlanes()), v32_delta);
+                v_float32 s3 = v_muladd(v_ky0, vx_load(S + 3*VTraits<v_float32>::vlanes()), v32_delta);
                 for( k = 1; k <= ksize2; k++ )
                 {
                     v_float32 v_kyk = vx_setall_f32(ky[k]);
                     const float* S0 = src[k] + i;
                     const float* S1 = src[-k] + i;
-                    s0 = v_muladd(v_kyk, vx_load(S0) + vx_load(S1), s0);
-                    s1 = v_muladd(v_kyk, vx_load(S0 + v_float32::nlanes) + vx_load(S1 + v_float32::nlanes), s1);
-                    s2 = v_muladd(v_kyk, vx_load(S0 + 2*v_float32::nlanes) + vx_load(S1 + 2*v_float32::nlanes), s2);
-                    s3 = v_muladd(v_kyk, vx_load(S0 + 3*v_float32::nlanes) + vx_load(S1 + 3*v_float32::nlanes), s3);
+                    s0 = v_muladd(v_kyk, v_add(vx_load(S0), vx_load(S1)), s0);
+                    s1 = v_muladd(v_kyk, v_add(vx_load(S0 + VTraits<v_float32>::vlanes()), vx_load(S1 + VTraits<v_float32>::vlanes())), s1);
+                    s2 = v_muladd(v_kyk, v_add(vx_load(S0 + 2 * VTraits<v_float32>::vlanes()), vx_load(S1 + 2 * VTraits<v_float32>::vlanes())), s2);
+                    s3 = v_muladd(v_kyk, v_add(vx_load(S0 + 3 * VTraits<v_float32>::vlanes()), vx_load(S1 + 3 * VTraits<v_float32>::vlanes())), s3);
                 }
                 v_store(_dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
             }
         }
         else
         {
-            for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+            for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes() )
             {
                 v_float32 s0 = vx_setall_f32(delta);
                 v_float32 s1 = vx_setall_f32(delta);
@@ -1222,10 +1192,10 @@ struct SymmColumnVec_32f8u
                     v_float32 v_kyk = vx_setall_f32(ky[k]);
                     const float* S0 = src[k] + i;
                     const float* S1 = src[-k] + i;
-                    s0 = v_muladd(v_kyk, vx_load(S0) - vx_load(S1), s0);
-                    s1 = v_muladd(v_kyk, vx_load(S0 + v_float32::nlanes) - vx_load(S1 + v_float32::nlanes), s1);
-                    s2 = v_muladd(v_kyk, vx_load(S0 + 2*v_float32::nlanes) - vx_load(S1 + 2*v_float32::nlanes), s2);
-                    s3 = v_muladd(v_kyk, vx_load(S0 + 3*v_float32::nlanes) - vx_load(S1 + 3*v_float32::nlanes), s3);
+                    s0 = v_muladd(v_kyk, v_sub(vx_load(S0), vx_load(S1)), s0);
+                    s1 = v_muladd(v_kyk, v_sub(vx_load(S0 + VTraits<v_float32>::vlanes()), vx_load(S1 + VTraits<v_float32>::vlanes())), s1);
+                    s2 = v_muladd(v_kyk, v_sub(vx_load(S0 + 2 * VTraits<v_float32>::vlanes()), vx_load(S1 + 2 * VTraits<v_float32>::vlanes())), s2);
+                    s3 = v_muladd(v_kyk, v_sub(vx_load(S0 + 3 * VTraits<v_float32>::vlanes()), vx_load(S1 + 3 * VTraits<v_float32>::vlanes())), s3);
                 }
                 v_store(_dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
             }
@@ -1268,55 +1238,52 @@ struct SymmColumnSmallVec_32s16s
         {
             if( ky[0] == 2 && ky[1] == 1 )
             {
-                for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+                for( ; i <= width - 2*VTraits<v_int16>::vlanes(); i += 2*VTraits<v_int16>::vlanes() )
                 {
                     v_int32 s0 = vx_load(S1 + i);
-                    v_int32 s1 = vx_load(S1 + i + v_int32::nlanes);
-                    v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes);
-                    v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes);
-                    v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (s0 + s0), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (s1 + s1)) + d8);
-                    v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) + (s2 + s2),
-                                                              vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) + (s3 + s3)) + d8);
+                    v_int32 s1 = vx_load(S1 + i + VTraits<v_int32>::vlanes());
+                    v_int32 s2 = vx_load(S1 + i + 2*VTraits<v_int32>::vlanes());
+                    v_int32 s3 = vx_load(S1 + i + 3*VTraits<v_int32>::vlanes());
+                    v_store(dst + i, v_add(v_pack(v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), v_add(s0, s0)), v_add(v_add(vx_load(S0 + i + VTraits<v_int32>::vlanes()), vx_load(S2 + i + VTraits<v_int32>::vlanes())), v_add(s1, s1))), d8));
+                    v_store(dst + i + VTraits<v_int16>::vlanes(), v_add(v_pack(v_add(v_add(vx_load(S0 + i + 2 * VTraits<v_int32>::vlanes()), vx_load(S2 + i + 2 * VTraits<v_int32>::vlanes())), v_add(s2, s2)), v_add(v_add(vx_load(S0 + i + 3 * VTraits<v_int32>::vlanes()), vx_load(S2 + i + 3 * VTraits<v_int32>::vlanes())), v_add(s3, s3))), d8));
                 }
-                if( i <= width - v_int16::nlanes )
+                if( i <= width - VTraits<v_int16>::vlanes() )
                 {
                     v_int32 sl = vx_load(S1 + i);
-                    v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
-                    v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (sh + sh)) + d8);
-                    i += v_int16::nlanes;
+                    v_int32 sh = vx_load(S1 + i + VTraits<v_int32>::vlanes());
+                    v_store(dst + i, v_add(v_pack(v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), v_add(sl, sl)), v_add(v_add(vx_load(S0 + i + VTraits<v_int32>::vlanes()), vx_load(S2 + i + VTraits<v_int32>::vlanes())), v_add(sh, sh))), d8));
+                    i += VTraits<v_int16>::vlanes();
                 }
-                if( i <= width - v_int32::nlanes )
+                if( i <= width - VTraits<v_int32>::vlanes() )
                 {
                     v_int32 s = vx_load(S1 + i);
-                    v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) + (s + s));
-                    i += v_int32::nlanes;
+                    v_pack_store(dst + i, v_add(v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), vx_setall_s32(d)), v_add(s, s)));
+                    i += VTraits<v_int32>::vlanes();
                 }
             }
             else if( ky[0] == -2 && ky[1] == 1 )
             {
-                for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+                for( ; i <= width - 2*VTraits<v_int16>::vlanes(); i += 2*VTraits<v_int16>::vlanes() )
                 {
                     v_int32 s0 = vx_load(S1 + i);
-                    v_int32 s1 = vx_load(S1 + i + v_int32::nlanes);
-                    v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes);
-                    v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes);
-                    v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (s0 + s0),
-                                            vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (s1 + s1)) + d8);
-                    v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) - (s2 + s2),
-                                                              vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) - (s3 + s3)) + d8);
+                    v_int32 s1 = vx_load(S1 + i + VTraits<v_int32>::vlanes());
+                    v_int32 s2 = vx_load(S1 + i + 2*VTraits<v_int32>::vlanes());
+                    v_int32 s3 = vx_load(S1 + i + 3*VTraits<v_int32>::vlanes());
+                    v_store(dst + i, v_add(v_pack(v_sub(v_add(vx_load(S0 + i), vx_load(S2 + i)), v_add(s0, s0)), v_sub(v_add(vx_load(S0 + i + VTraits<v_int32>::vlanes()), vx_load(S2 + i + VTraits<v_int32>::vlanes())), v_add(s1, s1))), d8));
+                    v_store(dst + i + VTraits<v_int16>::vlanes(), v_add(v_pack(v_sub(v_add(vx_load(S0 + i + 2 * VTraits<v_int32>::vlanes()), vx_load(S2 + i + 2 * VTraits<v_int32>::vlanes())), v_add(s2, s2)), v_sub(v_add(vx_load(S0 + i + 3 * VTraits<v_int32>::vlanes()), vx_load(S2 + i + 3 * VTraits<v_int32>::vlanes())), v_add(s3, s3))), d8));
                 }
-                if( i <= width - v_int16::nlanes )
+                if( i <= width - VTraits<v_int16>::vlanes() )
                 {
                     v_int32 sl = vx_load(S1 + i);
-                    v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
-                    v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (sh + sh)) + d8);
-                    i += v_int16::nlanes;
+                    v_int32 sh = vx_load(S1 + i + VTraits<v_int32>::vlanes());
+                    v_store(dst + i, v_add(v_pack(v_sub(v_add(vx_load(S0 + i), vx_load(S2 + i)), v_add(sl, sl)), v_sub(v_add(vx_load(S0 + i + VTraits<v_int32>::vlanes()), vx_load(S2 + i + VTraits<v_int32>::vlanes())), v_add(sh, sh))), d8));
+                    i += VTraits<v_int16>::vlanes();
                 }
-                if( i <= width - v_int32::nlanes )
+                if( i <= width - VTraits<v_int32>::vlanes() )
                 {
                     v_int32 s = vx_load(S1 + i);
-                    v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) - (s + s));
-                    i += v_int32::nlanes;
+                    v_pack_store(dst + i, v_sub(v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), vx_setall_s32(d)), v_add(s, s)));
+                    i += VTraits<v_int32>::vlanes();
                 }
             }
 #if CV_NEON
@@ -1324,46 +1291,46 @@ struct SymmColumnSmallVec_32s16s
             {
                 v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]);
                 v_int32 d4 = vx_setall_s32(d);
-                for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+                for( ; i <= width - 2*VTraits<v_int16>::vlanes(); i += 2*VTraits<v_int16>::vlanes() )
                 {
-                    v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)),
-                                            v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
-                    v_store(dst + i + v_int16::nlanes, v_pack(v_muladd(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 2*v_int32::nlanes), k0, d4)),
-                                                              v_muladd(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 3*v_int32::nlanes), k0, d4))));
+                    v_store(dst + i, v_pack(v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)),
+                                            v_muladd(v_add(vx_load(S0 + i + VTraits<v_int32>::vlanes()), vx_load(S2 + i + VTraits<v_int32>::vlanes())), k1, v_muladd(vx_load(S1 + i + VTraits<v_int32>::vlanes()), k0, d4))));
+                    v_store(dst + i + VTraits<v_int16>::vlanes(), v_pack(v_muladd(v_add(vx_load(S0 + i + 2 * VTraits<v_int32>::vlanes()), vx_load(S2 + i + 2 * VTraits<v_int32>::vlanes())), k1, v_muladd(vx_load(S1 + i + 2*VTraits<v_int32>::vlanes()), k0, d4)),
+                                                              v_muladd(v_add(vx_load(S0 + i + 3 * VTraits<v_int32>::vlanes()), vx_load(S2 + i + 3 * VTraits<v_int32>::vlanes())), k1, v_muladd(vx_load(S1 + i + 3*VTraits<v_int32>::vlanes()), k0, d4))));
                 }
-                if( i <= width - v_int16::nlanes )
+                if( i <= width - VTraits<v_int16>::vlanes() )
                 {
-                    v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)),
-                                            v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
-                    i += v_int16::nlanes;
+                    v_store(dst + i, v_pack(v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)),
+                                            v_muladd(v_add(vx_load(S0 + i + VTraits<v_int32>::vlanes()), vx_load(S2 + i + VTraits<v_int32>::vlanes())), k1, v_muladd(vx_load(S1 + i + VTraits<v_int32>::vlanes()), k0, d4))));
+                    i += VTraits<v_int16>::vlanes();
                 }
-                if( i <= width - v_int32::nlanes )
+                if( i <= width - VTraits<v_int32>::vlanes() )
                 {
-                    v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
-                    i += v_int32::nlanes;
+                    v_pack_store(dst + i, v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)));
+                    i += VTraits<v_int32>::vlanes();
                 }
             }
 #endif
             else
             {
                 v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]);
-                for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+                for( ; i <= width - 2*VTraits<v_int16>::vlanes(); i += 2*VTraits<v_int16>::vlanes() )
                 {
-                    v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))),
-                                            v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4)))));
-                    v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*v_int32::nlanes)), k0, df4))),
-                                                              v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*v_int32::nlanes)), k0, df4)))));
+                    v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i), vx_load(S2 + i))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))),
+                                            v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i + VTraits<v_int32>::vlanes()), vx_load(S2 + i + VTraits<v_int32>::vlanes()))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + VTraits<v_int32>::vlanes())), k0, df4)))));
+                    v_store(dst + i + VTraits<v_int16>::vlanes(), v_pack(v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i + 2 * VTraits<v_int32>::vlanes()), vx_load(S2 + i + 2 * VTraits<v_int32>::vlanes()))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*VTraits<v_int32>::vlanes())), k0, df4))),
+                                                              v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i + 3 * VTraits<v_int32>::vlanes()), vx_load(S2 + i + 3 * VTraits<v_int32>::vlanes()))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*VTraits<v_int32>::vlanes())), k0, df4)))));
                 }
-                if( i <= width - v_int16::nlanes )
+                if( i <= width - VTraits<v_int16>::vlanes() )
                 {
-                    v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))),
-                                            v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4)))));
-                    i += v_int16::nlanes;
+                    v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i), vx_load(S2 + i))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))),
+                                            v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i + VTraits<v_int32>::vlanes()), vx_load(S2 + i + VTraits<v_int32>::vlanes()))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + VTraits<v_int32>::vlanes())), k0, df4)))));
+                    i += VTraits<v_int16>::vlanes();
                 }
-                if( i <= width - v_int32::nlanes )
+                if( i <= width - VTraits<v_int32>::vlanes() )
                 {
-                    v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))));
-                    i += v_int32::nlanes;
+                    v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(v_add(vx_load(S0 + i), vx_load(S2 + i))), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))));
+                    i += VTraits<v_int32>::vlanes();
                 }
             }
         }
@@ -1373,42 +1340,42 @@ struct SymmColumnSmallVec_32s16s
             {
                 if( ky[1] < 0 )
                     std::swap(S0, S2);
-                for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+                for( ; i <= width - 2*VTraits<v_int16>::vlanes(); i += 2*VTraits<v_int16>::vlanes() )
                 {
-                    v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8);
-                    v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes), vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)) + d8);
+                    v_store(dst + i, v_add(v_pack(v_sub(vx_load(S2 + i), vx_load(S0 + i)), v_sub(vx_load(S2 + i + VTraits<v_int32>::vlanes()), vx_load(S0 + i + VTraits<v_int32>::vlanes()))), d8));
+                    v_store(dst + i + VTraits<v_int16>::vlanes(), v_add(v_pack(v_sub(vx_load(S2 + i + 2 * VTraits<v_int32>::vlanes()), vx_load(S0 + i + 2 * VTraits<v_int32>::vlanes())), v_sub(vx_load(S2 + i + 3 * VTraits<v_int32>::vlanes()), vx_load(S0 + i + 3 * VTraits<v_int32>::vlanes()))), d8));
                 }
-                if( i <= width - v_int16::nlanes )
+                if( i <= width - VTraits<v_int16>::vlanes() )
                 {
-                    v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8);
-                    i += v_int16::nlanes;
+                    v_store(dst + i, v_add(v_pack(v_sub(vx_load(S2 + i), vx_load(S0 + i)), v_sub(vx_load(S2 + i + VTraits<v_int32>::vlanes()), vx_load(S0 + i + VTraits<v_int32>::vlanes()))), d8));
+                    i += VTraits<v_int16>::vlanes();
                 }
-                if( i <= width - v_int32::nlanes )
+                if( i <= width - VTraits<v_int32>::vlanes() )
                 {
-                    v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + vx_setall_s32(d));
-                    i += v_int32::nlanes;
+                    v_pack_store(dst + i, v_add(v_sub(vx_load(S2 + i), vx_load(S0 + i)), vx_setall_s32(d)));
+                    i += VTraits<v_int32>::vlanes();
                 }
             }
             else
             {
                 v_float32 k1 = vx_setall_f32(ky[1]);
-                for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+                for( ; i <= width - 2*VTraits<v_int16>::vlanes(); i += 2*VTraits<v_int16>::vlanes() )
                 {
-                    v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)),
-                                            v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4))));
-                    v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes)), k1, df4)),
-                                                              v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)), k1, df4))));
+                    v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i), vx_load(S0 + i))), k1, df4)),
+                                            v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i + VTraits<v_int32>::vlanes()), vx_load(S0 + i + VTraits<v_int32>::vlanes()))), k1, df4))));
+                    v_store(dst + i + VTraits<v_int16>::vlanes(), v_pack(v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i + 2 * VTraits<v_int32>::vlanes()), vx_load(S0 + i + 2 * VTraits<v_int32>::vlanes()))), k1, df4)),
+                                                              v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i + 3 * VTraits<v_int32>::vlanes()), vx_load(S0 + i + 3 * VTraits<v_int32>::vlanes()))), k1, df4))));
                 }
-                if( i <= width - v_int16::nlanes )
+                if( i <= width - VTraits<v_int16>::vlanes() )
                 {
-                    v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)),
-                                            v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4))));
-                    i += v_int16::nlanes;
+                    v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i), vx_load(S0 + i))), k1, df4)),
+                                            v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i + VTraits<v_int32>::vlanes()), vx_load(S0 + i + VTraits<v_int32>::vlanes()))), k1, df4))));
+                    i += VTraits<v_int16>::vlanes();
                 }
-                if( i <= width - v_int32::nlanes )
+                if( i <= width - VTraits<v_int32>::vlanes() )
                 {
-                    v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)));
-                    i += v_int32::nlanes;
+                    v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(v_sub(vx_load(S2 + i), vx_load(S0 + i))), k1, df4)));
+                    i += VTraits<v_int32>::vlanes();
                 }
             }
         }
@@ -1440,7 +1407,7 @@ struct RowVec_16s32f
         const float* _kx = kernel.ptr<float>();
         width *= cn;
 
-        for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+        for( ; i <= width - 2*VTraits<v_int16>::vlanes(); i += 2*VTraits<v_int16>::vlanes() )
         {
             const short* src = (const short*)_src + i;
             v_float32 s0 = vx_setzero_f32();
@@ -1451,18 +1418,18 @@ struct RowVec_16s32f
             {
                 v_float32 f = vx_setall_f32(_kx[k]);
                 v_int16 xl = vx_load(src);
-                v_int16 xh = vx_load(src + v_int16::nlanes);
+                v_int16 xh = vx_load(src + VTraits<v_int16>::vlanes());
                 s0 = v_muladd(v_cvt_f32(v_expand_low(xl)), f, s0);
                 s1 = v_muladd(v_cvt_f32(v_expand_high(xl)), f, s1);
                 s2 = v_muladd(v_cvt_f32(v_expand_low(xh)), f, s2);
                 s3 = v_muladd(v_cvt_f32(v_expand_high(xh)), f, s3);
             }
             v_store(dst + i, s0);
-            v_store(dst + i + v_float32::nlanes, s1);
-            v_store(dst + i + 2*v_float32::nlanes, s2);
-            v_store(dst + i + 3*v_float32::nlanes, s3);
+            v_store(dst + i + VTraits<v_float32>::vlanes(), s1);
+            v_store(dst + i + 2*VTraits<v_float32>::vlanes(), s2);
+            v_store(dst + i + 3*VTraits<v_float32>::vlanes(), s3);
         }
-        if( i <= width - v_int16::nlanes )
+        if( i <= width - VTraits<v_int16>::vlanes() )
         {
             const short* src = (const short*)_src + i;
             v_float32 s0 = vx_setzero_f32();
@@ -1475,17 +1442,17 @@ struct RowVec_16s32f
                 s1 = v_muladd(v_cvt_f32(v_expand_high(x)), f, s1);
             }
             v_store(dst + i, s0);
-            v_store(dst + i + v_float32::nlanes, s1);
-            i += v_int16::nlanes;
+            v_store(dst + i + VTraits<v_float32>::vlanes(), s1);
+            i += VTraits<v_int16>::vlanes();
         }
-        if( i <= width - v_float32::nlanes )
+        if( i <= width - VTraits<v_float32>::vlanes() )
         {
             const short* src = (const short*)_src + i;
             v_float32 s0 = vx_setzero_f32();
             for( k = 0; k < _ksize; k++, src += cn )
                 s0 = v_muladd(v_cvt_f32(vx_load_expand(src)), vx_setall_f32(_kx[k]), s0);
             v_store(dst + i, s0);
-            i += v_float32::nlanes;
+            i += VTraits<v_float32>::vlanes();
         }
         return i;
     }
@@ -1524,92 +1491,92 @@ struct SymmColumnVec_32f16s
         {
             v_float32 k0 = vx_setall_f32(ky[0]);
             v_float32 k1 = vx_setall_f32(ky[1]);
-            for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+            for( ; i <= width - 2*VTraits<v_int16>::vlanes(); i += 2*VTraits<v_int16>::vlanes() )
             {
                 v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
-                v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
-                v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4);
-                v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4);
-                s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0);
-                s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1);
-                s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) + vx_load(src[-1] + i + 2*v_float32::nlanes), k1, s2);
-                s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) + vx_load(src[-1] + i + 3*v_float32::nlanes), k1, s3);
+                v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits<v_float32>::vlanes()), k0, d4);
+                v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*VTraits<v_float32>::vlanes()), k0, d4);
+                v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*VTraits<v_float32>::vlanes()), k0, d4);
+                s0 = v_muladd(v_add(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, s0);
+                s1 = v_muladd(v_add(vx_load(src[1] + i + VTraits<v_float32>::vlanes()), vx_load(src[-1] + i + VTraits<v_float32>::vlanes())), k1, s1);
+                s2 = v_muladd(v_add(vx_load(src[1] + i + 2 * VTraits<v_float32>::vlanes()), vx_load(src[-1] + i + 2 * VTraits<v_float32>::vlanes())), k1, s2);
+                s3 = v_muladd(v_add(vx_load(src[1] + i + 3 * VTraits<v_float32>::vlanes()), vx_load(src[-1] + i + 3 * VTraits<v_float32>::vlanes())), k1, s3);
                 for( k = 2; k <= ksize2; k++ )
                 {
                     v_float32 k2 = vx_setall_f32(ky[k]);
-                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0);
-                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
-                    s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2);
-                    s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3);
+                    s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0);
+                    s1 = v_muladd(v_add(vx_load(src[k] + i + VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + VTraits<v_float32>::vlanes())), k2, s1);
+                    s2 = v_muladd(v_add(vx_load(src[k] + i + 2 * VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + 2 * VTraits<v_float32>::vlanes())), k2, s2);
+                    s3 = v_muladd(v_add(vx_load(src[k] + i + 3 * VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + 3 * VTraits<v_float32>::vlanes())), k2, s3);
                 }
                 v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
-                v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3)));
+                v_store(dst + i + VTraits<v_int16>::vlanes(), v_pack(v_round(s2), v_round(s3)));
             }
-            if( i <= width - v_int16::nlanes )
+            if( i <= width - VTraits<v_int16>::vlanes() )
             {
                 v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
-                v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
-                s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0);
-                s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1);
+                v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits<v_float32>::vlanes()), k0, d4);
+                s0 = v_muladd(v_add(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, s0);
+                s1 = v_muladd(v_add(vx_load(src[1] + i + VTraits<v_float32>::vlanes()), vx_load(src[-1] + i + VTraits<v_float32>::vlanes())), k1, s1);
                 for( k = 2; k <= ksize2; k++ )
                 {
                     v_float32 k2 = vx_setall_f32(ky[k]);
-                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0);
-                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
+                    s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0);
+                    s1 = v_muladd(v_add(vx_load(src[k] + i + VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + VTraits<v_float32>::vlanes())), k2, s1);
                 }
                 v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
-                i += v_int16::nlanes;
+                i += VTraits<v_int16>::vlanes();
             }
-            if( i <= width - v_float32::nlanes )
+            if( i <= width - VTraits<v_float32>::vlanes() )
             {
                 v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
-                s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0);
+                s0 = v_muladd(v_add(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, s0);
                 for( k = 2; k <= ksize2; k++ )
-                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                    s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), vx_setall_f32(ky[k]), s0);
                 v_pack_store(dst + i, v_round(s0));
-                i += v_float32::nlanes;
+                i += VTraits<v_float32>::vlanes();
             }
         }
         else
         {
             v_float32 k1 = vx_setall_f32(ky[1]);
-            for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
+            for( ; i <= width - 2*VTraits<v_int16>::vlanes(); i += 2*VTraits<v_int16>::vlanes() )
             {
-                v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
-                v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4);
-                v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4);
-                v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4);
+                v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4);
+                v_float32 s1 = v_muladd(v_sub(vx_load(src[1] + i + VTraits<v_float32>::vlanes()), vx_load(src[-1] + i + VTraits<v_float32>::vlanes())), k1, d4);
+                v_float32 s2 = v_muladd(v_sub(vx_load(src[1] + i + 2 * VTraits<v_float32>::vlanes()), vx_load(src[-1] + i + 2 * VTraits<v_float32>::vlanes())), k1, d4);
+                v_float32 s3 = v_muladd(v_sub(vx_load(src[1] + i + 3 * VTraits<v_float32>::vlanes()), vx_load(src[-1] + i + 3 * VTraits<v_float32>::vlanes())), k1, d4);
                 for( k = 2; k <= ksize2; k++ )
                 {
                     v_float32 k2 = vx_setall_f32(ky[k]);
-                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0);
-                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
-                    s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2);
-                    s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3);
+                    s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0);
+                    s1 = v_muladd(v_sub(vx_load(src[k] + i + VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + VTraits<v_float32>::vlanes())), k2, s1);
+                    s2 = v_muladd(v_sub(vx_load(src[k] + i + 2 * VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + 2 * VTraits<v_float32>::vlanes())), k2, s2);
+                    s3 = v_muladd(v_sub(vx_load(src[k] + i + 3 * VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + 3 * VTraits<v_float32>::vlanes())), k2, s3);
                 }
                 v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
-                v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3)));
+                v_store(dst + i + VTraits<v_int16>::vlanes(), v_pack(v_round(s2), v_round(s3)));
             }
-            if( i <= width - v_int16::nlanes )
+            if( i <= width - VTraits<v_int16>::vlanes() )
             {
-                v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
-                v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4);
+                v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4);
+                v_float32 s1 = v_muladd(v_sub(vx_load(src[1] + i + VTraits<v_float32>::vlanes()), vx_load(src[-1] + i + VTraits<v_float32>::vlanes())), k1, d4);
                 for( k = 2; k <= ksize2; k++ )
                 {
                     v_float32 k2 = vx_setall_f32(ky[k]);
-                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0);
-                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
+                    s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0);
+                    s1 = v_muladd(v_sub(vx_load(src[k] + i + VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + VTraits<v_float32>::vlanes())), k2, s1);
                 }
                 v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
-                i += v_int16::nlanes;
+                i += VTraits<v_int16>::vlanes();
             }
-            if( i <= width - v_float32::nlanes )
+            if( i <= width - VTraits<v_float32>::vlanes() )
             {
-                v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
+                v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4);
                 for( k = 2; k <= ksize2; k++ )
-                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                    s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), vx_setall_f32(ky[k]), s0);
                 v_pack_store(dst + i, v_round(s0));
-                i += v_float32::nlanes;
+                i += VTraits<v_float32>::vlanes();
             }
         }
 
@@ -1682,52 +1649,52 @@ struct RowVec_32f
         }
 #endif
         v_float32 k0 = vx_setall_f32(_kx[0]);
-        for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes )
+        for( ; i <= width - 4*VTraits<v_float32>::vlanes(); i += 4*VTraits<v_float32>::vlanes() )
         {
             const float* src = src0 + i;
-            v_float32 s0 = vx_load(src) * k0;
-            v_float32 s1 = vx_load(src + v_float32::nlanes) * k0;
-            v_float32 s2 = vx_load(src + 2*v_float32::nlanes) * k0;
-            v_float32 s3 = vx_load(src + 3*v_float32::nlanes) * k0;
+            v_float32 s0 = v_mul(vx_load(src), k0);
+            v_float32 s1 = v_mul(vx_load(src + VTraits<v_float32>::vlanes()), k0);
+            v_float32 s2 = v_mul(vx_load(src + 2 * VTraits<v_float32>::vlanes()), k0);
+            v_float32 s3 = v_mul(vx_load(src + 3 * VTraits<v_float32>::vlanes()), k0);
             src += cn;
             for( k = 1; k < _ksize; k++, src += cn )
             {
                 v_float32 k1 = vx_setall_f32(_kx[k]);
                 s0 = v_muladd(vx_load(src), k1, s0);
-                s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1);
-                s2 = v_muladd(vx_load(src + 2*v_float32::nlanes), k1, s2);
-                s3 = v_muladd(vx_load(src + 3*v_float32::nlanes), k1, s3);
+                s1 = v_muladd(vx_load(src + VTraits<v_float32>::vlanes()), k1, s1);
+                s2 = v_muladd(vx_load(src + 2*VTraits<v_float32>::vlanes()), k1, s2);
+                s3 = v_muladd(vx_load(src + 3*VTraits<v_float32>::vlanes()), k1, s3);
             }
             v_store(dst + i, s0);
-            v_store(dst + i + v_float32::nlanes, s1);
-            v_store(dst + i + 2*v_float32::nlanes, s2);
-            v_store(dst + i + 3*v_float32::nlanes, s3);
+            v_store(dst + i + VTraits<v_float32>::vlanes(), s1);
+            v_store(dst + i + 2*VTraits<v_float32>::vlanes(), s2);
+            v_store(dst + i + 3*VTraits<v_float32>::vlanes(), s3);
         }
-        if( i <= width - 2*v_float32::nlanes )
+        if( i <= width - 2*VTraits<v_float32>::vlanes() )
         {
             const float* src = src0 + i;
-            v_float32 s0 = vx_load(src) * k0;
-            v_float32 s1 = vx_load(src + v_float32::nlanes) * k0;
+            v_float32 s0 = v_mul(vx_load(src), k0);
+            v_float32 s1 = v_mul(vx_load(src + VTraits<v_float32>::vlanes()), k0);
             src += cn;
             for( k = 1; k < _ksize; k++, src += cn )
             {
                 v_float32 k1 = vx_setall_f32(_kx[k]);
                 s0 = v_muladd(vx_load(src), k1, s0);
-                s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1);
+                s1 = v_muladd(vx_load(src + VTraits<v_float32>::vlanes()), k1, s1);
             }
             v_store(dst + i, s0);
-            v_store(dst + i + v_float32::nlanes, s1);
-            i += 2*v_float32::nlanes;
+            v_store(dst + i + VTraits<v_float32>::vlanes(), s1);
+            i += 2*VTraits<v_float32>::vlanes();
         }
-        if( i <= width - v_float32::nlanes )
+        if( i <= width - VTraits<v_float32>::vlanes() )
         {
             const float* src = src0 + i;
-            v_float32 s0 = vx_load(src) * k0;
+            v_float32 s0 = v_mul(vx_load(src), k0);
             src += cn;
             for( k = 1; k < _ksize; k++, src += cn )
                 s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0);
             v_store(dst + i, s0);
-            i += v_float32::nlanes;
+            i += VTraits<v_float32>::vlanes();
         }
         return i;
     }
@@ -1806,28 +1773,28 @@ struct SymmRowSmallVec_32f
                 {
 #if CV_FMA3 || CV_AVX2
                     v_float32 k0 = vx_setall_f32(kx[0]);
-                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
-                        v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn)));
+                    for( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes(), src += VTraits<v_float32>::vlanes() )
+                        v_store(dst + i, v_muladd(vx_load(src), k0, v_add(vx_load(src - cn), vx_load(src + cn))));
 #else
                     if( kx[0] > 0 )
-                        for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                        for( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes(), src += VTraits<v_float32>::vlanes() )
                         {
                             v_float32 x = vx_load(src);
-                            v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) + (x + x));
+                            v_store(dst + i, v_add(vx_load(src - cn), vx_load(src + cn), x , x));
                         }
                     else
-                        for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                        for( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes(), src += VTraits<v_float32>::vlanes() )
                         {
                             v_float32 x = vx_load(src);
-                            v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) - (x + x));
+                            v_store(dst + i, v_sub(v_add(vx_load(src - cn), vx_load(src + cn)), v_add(x, x)));
                         }
 #endif
                 }
                 else
                 {
                     v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]);
-                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
-                        v_store(dst + i, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1));
+                    for( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes(), src += VTraits<v_float32>::vlanes() )
+                        v_store(dst + i, v_muladd(vx_load(src), k0, v_mul(v_add(vx_load(src - cn), vx_load(src + cn)), k1)));
                 }
             }
             else if( _ksize == 5 )
@@ -1836,21 +1803,21 @@ struct SymmRowSmallVec_32f
                 {
 #if CV_FMA3 || CV_AVX2
                     v_float32 k0 = vx_setall_f32(-2);
-                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
-                        v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn)));
+                    for( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes(), src += VTraits<v_float32>::vlanes() )
+                        v_store(dst + i, v_muladd(vx_load(src), k0, v_add(vx_load(src - 2 * cn), vx_load(src + 2 * cn))));
 #else
-                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                    for( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes(), src += VTraits<v_float32>::vlanes() )
                     {
                         v_float32 x = vx_load(src);
-                        v_store(dst + i, vx_load(src - 2*cn) + vx_load(src + 2*cn) - (x + x));
+                        v_store(dst + i, v_sub(v_add(vx_load(src - 2*cn), vx_load(src + 2*cn)), v_add(x, x)));
                     }
 #endif
                 }
                 else
                 {
                     v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]);
-                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
-                        v_store(dst + i, v_muladd(vx_load(src + 2*cn) + vx_load(src - 2*cn), k2, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1)));
+                    for( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes(), src += VTraits<v_float32>::vlanes() )
+                        v_store(dst + i, v_muladd(v_add(vx_load(src + 2 * cn), vx_load(src - 2 * cn)), k2, v_muladd(vx_load(src), k0, v_mul(v_add(vx_load(src - cn), vx_load(src + cn)), k1))));
                 }
             }
         }
@@ -1859,20 +1826,20 @@ struct SymmRowSmallVec_32f
             if( _ksize == 3 )
             {
                 if( kx[0] == 0 && kx[1] == 1 )
-                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
-                        v_store(dst + i, vx_load(src + cn) - vx_load(src - cn));
+                    for( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes(), src += VTraits<v_float32>::vlanes() )
+                        v_store(dst + i, v_sub(vx_load(src + cn), vx_load(src - cn)));
                 else
                 {
                     v_float32 k1 = vx_setall_f32(kx[1]);
-                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
-                        v_store(dst + i, (vx_load(src + cn) - vx_load(src - cn)) * k1);
+                    for( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes(), src += VTraits<v_float32>::vlanes() )
+                        v_store(dst + i, v_mul(v_sub(vx_load(src + cn), vx_load(src - cn)), k1));
                 }
             }
             else if( _ksize == 5 )
             {
                 v_float32 k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]);
-                for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
-                    v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src - 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1));
+                for ( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes(), src += VTraits<v_float32>::vlanes() )
+                    v_store(dst + i, v_muladd(v_sub(vx_load(src + 2 * cn), vx_load(src - 2 * cn)), k2, v_mul(v_sub(vx_load(src + cn), vx_load(src - cn)), k1)));
             }
         }
         return i;
@@ -1961,46 +1928,46 @@ struct SymmColumnVec_32f
 #endif
             const v_float32 d4 = vx_setall_f32(delta);
             const v_float32 k0 = vx_setall_f32(ky[0]);
-            for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes )
+            for( ; i <= width - 4*VTraits<v_float32>::vlanes(); i += 4*VTraits<v_float32>::vlanes() )
             {
                 v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
-                v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
-                v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4);
-                v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4);
+                v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits<v_float32>::vlanes()), k0, d4);
+                v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*VTraits<v_float32>::vlanes()), k0, d4);
+                v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*VTraits<v_float32>::vlanes()), k0, d4);
                 for( k = 1; k <= ksize2; k++ )
                 {
                     v_float32 k1 = vx_setall_f32(ky[k]);
-                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0);
-                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
-                    s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k1, s2);
-                    s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k1, s3);
+                    s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), k1, s0);
+                    s1 = v_muladd(v_add(vx_load(src[k] + i + VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + VTraits<v_float32>::vlanes())), k1, s1);
+                    s2 = v_muladd(v_add(vx_load(src[k] + i + 2 * VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + 2 * VTraits<v_float32>::vlanes())), k1, s2);
+                    s3 = v_muladd(v_add(vx_load(src[k] + i + 3 * VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + 3 * VTraits<v_float32>::vlanes())), k1, s3);
                 }
                 v_store(dst + i, s0);
-                v_store(dst + i + v_float32::nlanes, s1);
-                v_store(dst + i + 2*v_float32::nlanes, s2);
-                v_store(dst + i + 3*v_float32::nlanes, s3);
+                v_store(dst + i + VTraits<v_float32>::vlanes(), s1);
+                v_store(dst + i + 2*VTraits<v_float32>::vlanes(), s2);
+                v_store(dst + i + 3*VTraits<v_float32>::vlanes(), s3);
             }
-            if( i <= width - 2*v_float32::nlanes )
+            if( i <= width - 2*VTraits<v_float32>::vlanes() )
             {
                 v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
-                v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
+                v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits<v_float32>::vlanes()), k0, d4);
                 for( k = 1; k <= ksize2; k++ )
                 {
                     v_float32 k1 = vx_setall_f32(ky[k]);
-                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0);
-                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
+                    s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), k1, s0);
+                    s1 = v_muladd(v_add(vx_load(src[k] + i + VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + VTraits<v_float32>::vlanes())), k1, s1);
                 }
                 v_store(dst + i, s0);
-                v_store(dst + i + v_float32::nlanes, s1);
-                i += 2*v_float32::nlanes;
+                v_store(dst + i + VTraits<v_float32>::vlanes(), s1);
+                i += 2*VTraits<v_float32>::vlanes();
             }
-            if( i <= width - v_float32::nlanes )
+            if( i <= width - VTraits<v_float32>::vlanes() )
             {
                 v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
                 for( k = 1; k <= ksize2; k++ )
-                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                    s0 = v_muladd(v_add(vx_load(src[k] + i), vx_load(src[-k] + i)), vx_setall_f32(ky[k]), s0);
                 v_store(dst + i, s0);
-                i += v_float32::nlanes;
+                i += VTraits<v_float32>::vlanes();
             }
         }
         else
@@ -2042,46 +2009,46 @@ struct SymmColumnVec_32f
 #endif
             const v_float32 d4 = vx_setall_f32(delta);
             const v_float32 k1 = vx_setall_f32(ky[1]);
-            for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes )
+            for( ; i <= width - 4*VTraits<v_float32>::vlanes(); i += 4*VTraits<v_float32>::vlanes() )
             {
-                v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
-                v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4);
-                v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4);
-                v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4);
+                v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4);
+                v_float32 s1 = v_muladd(v_sub(vx_load(src[1] + i + VTraits<v_float32>::vlanes()), vx_load(src[-1] + i + VTraits<v_float32>::vlanes())), k1, d4);
+                v_float32 s2 = v_muladd(v_sub(vx_load(src[1] + i + 2 * VTraits<v_float32>::vlanes()), vx_load(src[-1] + i + 2 * VTraits<v_float32>::vlanes())), k1, d4);
+                v_float32 s3 = v_muladd(v_sub(vx_load(src[1] + i + 3 * VTraits<v_float32>::vlanes()), vx_load(src[-1] + i + 3 * VTraits<v_float32>::vlanes())), k1, d4);
                 for( k = 2; k <= ksize2; k++ )
                 {
                     v_float32 k2 = vx_setall_f32(ky[k]);
-                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0);
-                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
-                    s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2);
-                    s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3);
+                    s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0);
+                    s1 = v_muladd(v_sub(vx_load(src[k] + i + VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + VTraits<v_float32>::vlanes())), k2, s1);
+                    s2 = v_muladd(v_sub(vx_load(src[k] + i + 2 * VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + 2 * VTraits<v_float32>::vlanes())), k2, s2);
+                    s3 = v_muladd(v_sub(vx_load(src[k] + i + 3 * VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + 3 * VTraits<v_float32>::vlanes())), k2, s3);
                 }
                 v_store(dst + i, s0);
-                v_store(dst + i + v_float32::nlanes, s1);
-                v_store(dst + i + 2*v_float32::nlanes, s2);
-                v_store(dst + i + 3*v_float32::nlanes, s3);
+                v_store(dst + i + VTraits<v_float32>::vlanes(), s1);
+                v_store(dst + i + 2*VTraits<v_float32>::vlanes(), s2);
+                v_store(dst + i + 3*VTraits<v_float32>::vlanes(), s3);
             }
-            if( i <= width - 2*v_float32::nlanes )
+            if( i <= width - 2*VTraits<v_float32>::vlanes() )
             {
-                v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
-                v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4);
+                v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4);
+                v_float32 s1 = v_muladd(v_sub(vx_load(src[1] + i + VTraits<v_float32>::vlanes()), vx_load(src[-1] + i + VTraits<v_float32>::vlanes())), k1, d4);
                 for( k = 2; k <= ksize2; k++ )
                 {
                     v_float32 k2 = vx_setall_f32(ky[k]);
-                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0);
-                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
+                    s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), k2, s0);
+                    s1 = v_muladd(v_sub(vx_load(src[k] + i + VTraits<v_float32>::vlanes()), vx_load(src[-k] + i + VTraits<v_float32>::vlanes())), k2, s1);
                 }
                 v_store(dst + i, s0);
-                v_store(dst + i + v_float32::nlanes, s1);
-                i += 2*v_float32::nlanes;
+                v_store(dst + i + VTraits<v_float32>::vlanes(), s1);
+                i += 2*VTraits<v_float32>::vlanes();
             }
-            if( i <= width - v_float32::nlanes )
+            if( i <= width - VTraits<v_float32>::vlanes() )
             {
-                v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
+                v_float32 s0 = v_muladd(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i)), k1, d4);
                 for( k = 2; k <= ksize2; k++ )
-                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                    s0 = v_muladd(v_sub(vx_load(src[k] + i), vx_load(src[-k] + i)), vx_setall_f32(ky[k]), s0);
                 v_store(dst + i, s0);
-                i += v_float32::nlanes;
+                i += VTraits<v_float32>::vlanes();
             }
         }
         return i;
@@ -2123,28 +2090,28 @@ struct SymmColumnSmallVec_32f
             {
 #if CV_FMA3 || CV_AVX2
                 v_float32 k0 = vx_setall_f32(ky[0]);
-                for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
-                    v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4));
+                for( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
+                    v_store(dst + i, v_muladd(vx_load(S1 + i), k0, v_add(v_add(vx_load(S0 + i), vx_load(S2 + i)), d4)));
 #else
                 if(ky[0] > 0)
-                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                    for( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
                     {
                         v_float32 x = vx_load(S1 + i);
-                        v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (x + x));
+                        v_store(dst + i, v_add(vx_load(S0 + i), vx_load(S2 + i), d4, x, x));
                     }
                 else
-                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                    for( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
                     {
                         v_float32 x = vx_load(S1 + i);
-                        v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (x + x));
+                        v_store(dst + i, v_sub(v_add(vx_load(S0 + i), vx_load(S2 + i), d4), v_add(x, x)));
                     }
 #endif
             }
             else
             {
                 v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]);
-                for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
-                    v_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
+                for ( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
+                    v_store(dst + i, v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)));
             }
         }
         else
@@ -2153,14 +2120,14 @@ struct SymmColumnSmallVec_32f
             {
                 if( ky[1] < 0 )
                     std::swap(S0, S2);
-                for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
-                    v_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4);
+                for ( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
+                    v_store(dst + i, v_add(v_sub(vx_load(S2 + i), vx_load(S0 + i)), d4));
             }
             else
             {
                 v_float32 k1 = vx_setall_f32(ky[1]);
-                for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
-                    v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4));
+                for ( ; i <= width - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
+                    v_store(dst + i, v_muladd(v_sub(vx_load(S2 + i), vx_load(S0 + i)), k1, d4));
             }
         }
         return i;
@@ -2199,7 +2166,7 @@ struct FilterVec_8u
 
         v_float32 d4 = vx_setall_f32(delta);
         v_float32 f0 = vx_setall_f32(kf[0]);
-        for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+        for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes() )
         {
             v_uint16 xl, xh;
             v_expand(vx_load(src[0] + i), xl, xh);
@@ -2223,7 +2190,7 @@ struct FilterVec_8u
             }
             v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
         }
-        if( i <= width - v_uint16::nlanes )
+        if( i <= width - VTraits<v_uint16>::vlanes() )
         {
             v_uint32 x0, x1;
             v_expand(vx_load_expand(src[0] + i), x0, x1);
@@ -2237,21 +2204,7 @@ struct FilterVec_8u
                 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1);
             }
             v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
-            i += v_uint16::nlanes;
-        }
-#if CV_SIMD_WIDTH > 16
-        while( i <= width - v_int32x4::nlanes )
-#else
-        if( i <= width - v_int32x4::nlanes )
-#endif
-        {
-            v_float32x4 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[0] + i))), v_setall_f32(kf[0]), v_setall_f32(delta));
-            for( k = 1; k < nz; k++ )
-                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0);
-            v_int32x4 s32 = v_round(s0);
-            v_int16x8 s16 = v_pack(s32, s32);
-            *(unaligned_int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
-            i += v_int32x4::nlanes;
+            i += VTraits<v_uint16>::vlanes();
         }
         return i;
     }
@@ -2286,7 +2239,7 @@ struct FilterVec_8u16s
 
         v_float32 d4 = vx_setall_f32(delta);
         v_float32 f0 = vx_setall_f32(kf[0]);
-        for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+        for( ; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes() )
         {
             v_uint16 xl, xh;
             v_expand(vx_load(src[0] + i), xl, xh);
@@ -2304,9 +2257,9 @@ struct FilterVec_8u16s
                 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f, s3);
             }
             v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
-            v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3)));
+            v_store(dst + i + VTraits<v_int16>::vlanes(), v_pack(v_round(s2), v_round(s3)));
         }
-        if( i <= width - v_uint16::nlanes )
+        if( i <= width - VTraits<v_uint16>::vlanes() )
         {
             v_uint16 x = vx_load_expand(src[0] + i);
             v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f0, d4);
@@ -2319,15 +2272,15 @@ struct FilterVec_8u16s
                 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1);
             }
             v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
-            i += v_uint16::nlanes;
+            i += VTraits<v_uint16>::vlanes();
         }
-        if( i <= width - v_int32::nlanes )
+        if( i <= width - VTraits<v_int32>::vlanes() )
         {
             v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), f0, d4);
             for( k = 1; k < nz; k++ )
                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0);
             v_pack_store(dst + i, v_round(s0));
-            i += v_int32::nlanes;
+            i += VTraits<v_int32>::vlanes();
         }
         return i;
     }
@@ -2360,46 +2313,46 @@ struct FilterVec_32f
 
         v_float32 d4 = vx_setall_f32(delta);
         v_float32 f0 = vx_setall_f32(kf[0]);
-        for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes )
+        for( ; i <= width - 4*VTraits<v_float32>::vlanes(); i += 4*VTraits<v_float32>::vlanes() )
         {
             v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4);
-            v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4);
-            v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), f0, d4);
-            v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), f0, d4);
+            v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits<v_float32>::vlanes()), f0, d4);
+            v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*VTraits<v_float32>::vlanes()), f0, d4);
+            v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*VTraits<v_float32>::vlanes()), f0, d4);
             for( k = 1; k < nz; k++ )
             {
                 v_float32 f1 = vx_setall_f32(kf[k]);
                 s0 = v_muladd(vx_load(src[k] + i), f1, s0);
-                s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1);
-                s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes), f1, s2);
-                s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes), f1, s3);
+                s1 = v_muladd(vx_load(src[k] + i + VTraits<v_float32>::vlanes()), f1, s1);
+                s2 = v_muladd(vx_load(src[k] + i + 2*VTraits<v_float32>::vlanes()), f1, s2);
+                s3 = v_muladd(vx_load(src[k] + i + 3*VTraits<v_float32>::vlanes()), f1, s3);
             }
             v_store(dst + i, s0);
-            v_store(dst + i + v_float32::nlanes, s1);
-            v_store(dst + i + 2*v_float32::nlanes, s2);
-            v_store(dst + i + 3*v_float32::nlanes, s3);
+            v_store(dst + i + VTraits<v_float32>::vlanes(), s1);
+            v_store(dst + i + 2*VTraits<v_float32>::vlanes(), s2);
+            v_store(dst + i + 3*VTraits<v_float32>::vlanes(), s3);
         }
-        if( i <= width - 2*v_float32::nlanes )
+        if( i <= width - 2*VTraits<v_float32>::vlanes() )
         {
             v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4);
-            v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4);
+            v_float32 s1 = v_muladd(vx_load(src[0] + i + VTraits<v_float32>::vlanes()), f0, d4);
             for( k = 1; k < nz; k++ )
             {
                 v_float32 f1 = vx_setall_f32(kf[k]);
                 s0 = v_muladd(vx_load(src[k] + i), f1, s0);
-                s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1);
+                s1 = v_muladd(vx_load(src[k] + i + VTraits<v_float32>::vlanes()), f1, s1);
             }
             v_store(dst + i, s0);
-            v_store(dst + i + v_float32::nlanes, s1);
-            i += 2*v_float32::nlanes;
+            v_store(dst + i + VTraits<v_float32>::vlanes(), s1);
+            i += 2*VTraits<v_float32>::vlanes();
         }
-        if( i <= width - v_float32::nlanes )
+        if( i <= width - VTraits<v_float32>::vlanes() )
         {
             v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4);
             for( k = 1; k < nz; k++ )
                 s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0);
             v_store(dst + i, s0);
-            i += v_float32::nlanes;
+            i += VTraits<v_float32>::vlanes();
         }
         return i;
     }
@@ -3042,7 +2995,7 @@ Ptr<BaseRowFilter> getLinearRowFilter(
     if( sdepth == CV_64F && ddepth == CV_64F )
         return makePtr<RowFilter<double, double, RowNoVec> >(kernel, anchor);
 
-    CV_Error_( CV_StsNotImplemented,
+    CV_Error_( cv::Error::StsNotImplemented,
         ("Unsupported combination of source format (=%d), and buffer format (=%d)",
         srcType, bufType));
 }
@@ -3140,7 +3093,7 @@ Ptr<BaseColumnFilter> getLinearColumnFilter(
                 (kernel, anchor, delta, symmetryType);
     }
 
-    CV_Error_( CV_StsNotImplemented,
+    CV_Error_( cv::Error::StsNotImplemented,
         ("Unsupported combination of buffer format (=%d), and destination format (=%d)",
         bufType, dstType));
 }
@@ -3294,7 +3247,7 @@ Ptr<BaseFilter> getLinearFilter(
         return makePtr<Filter2D<double,
             Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);
 
-    CV_Error_( CV_StsNotImplemented,
+    CV_Error_( cv::Error::StsNotImplemented,
         ("Unsupported combination of source format (=%d), and destination format (=%d)",
         srcType, dstType));
 }
diff --git a/modules/imgproc/src/fixedpoint.inl.hpp b/modules/imgproc/src/fixedpoint.inl.hpp
index f5f433fec652..7303e06ad384 100644
--- a/modules/imgproc/src/fixedpoint.inl.hpp
+++ b/modules/imgproc/src/fixedpoint.inl.hpp
@@ -370,7 +370,7 @@ class ufixedpoint16
     static CV_ALWAYS_INLINE ufixedpoint16 one() { return ufixedpoint16((uint16_t)(1 << fixedShift)); }
 
     static CV_ALWAYS_INLINE ufixedpoint16 fromRaw(uint16_t v) { return ufixedpoint16(v); }
-    CV_ALWAYS_INLINE uint16_t raw() { return val; }
+    CV_ALWAYS_INLINE uint16_t raw() const { return val; }
 };
 
 }
diff --git a/modules/imgproc/src/floodfill.cpp b/modules/imgproc/src/floodfill.cpp
index 926c48e65dd2..273f220bb045 100644
--- a/modules/imgproc/src/floodfill.cpp
+++ b/modules/imgproc/src/floodfill.cpp
@@ -487,12 +487,12 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask,
 
     if ( (cn != 1) && (cn != 3) )
     {
-        CV_Error( CV_StsBadArg, "Number of channels in input image must be 1 or 3" );
+        CV_Error( cv::Error::StsBadArg, "Number of channels in input image must be 1 or 3" );
     }
 
     const int connectivity = flags & 255;
     if( connectivity != 0 && connectivity != 4 && connectivity != 8 )
-        CV_Error( CV_StsBadFlag, "Connectivity must be 4, 0(=4) or 8" );
+        CV_Error( cv::Error::StsBadFlag, "Connectivity must be 4, 0(=4) or 8" );
 
     if( _mask.empty() )
     {
@@ -513,13 +513,13 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask,
     for( i = 0; i < cn; i++ )
     {
         if( loDiff[i] < 0 || upDiff[i] < 0 )
-            CV_Error( CV_StsBadArg, "lo_diff and up_diff must be non-negative" );
+            CV_Error( cv::Error::StsBadArg, "lo_diff and up_diff must be non-negative" );
         is_simple = is_simple && fabs(loDiff[i]) < DBL_EPSILON && fabs(upDiff[i]) < DBL_EPSILON;
     }
 
     if( (unsigned)seedPoint.x >= (unsigned)size.width ||
        (unsigned)seedPoint.y >= (unsigned)size.height )
-        CV_Error( CV_StsOutOfRange, "Seed point is outside of image" );
+        CV_Error( cv::Error::StsOutOfRange, "Seed point is outside of image" );
 
     scalarToRawData( newVal, &nv_buf, type, 0);
     size_t buffer_size = MAX( size.width, size.height ) * 2;
@@ -550,7 +550,7 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask,
             else if( type == CV_32FC3 )
                 floodFill_CnIR(img, seedPoint, Vec3f(nv_buf.f), &comp, flags, &buffer);
             else
-                CV_Error( CV_StsUnsupportedFormat, "" );
+                CV_Error( cv::Error::StsUnsupportedFormat, "" );
             if( rect )
                 *rect = comp.rect;
             return comp.area;
@@ -560,8 +560,15 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask,
     if( depth == CV_8U )
         for( i = 0; i < cn; i++ )
         {
+#if defined(__GNUC__) && (__GNUC__ == 12)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
             ld_buf.b[i] = saturate_cast<uchar>(cvFloor(loDiff[i]));
             ud_buf.b[i] = saturate_cast<uchar>(cvFloor(upDiff[i]));
+#if defined(__GNUC__) && (__GNUC__ == 12)
+#pragma GCC diagnostic pop
+#endif
         }
     else if( depth == CV_32S )
         for( i = 0; i < cn; i++ )
@@ -576,7 +583,7 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask,
             ud_buf.f[i] = (float)upDiff[i];
         }
     else
-        CV_Error( CV_StsUnsupportedFormat, "" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "" );
 
     uchar newMaskVal = (uchar)((flags & 0xff00) == 0 ? 1 : ((flags >> 8) & 255));
 
@@ -611,7 +618,7 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask,
                 Diff32fC3(ld_buf.f, ud_buf.f),
                 &comp, flags, &buffer);
     else
-        CV_Error(CV_StsUnsupportedFormat, "");
+        CV_Error(cv::Error::StsUnsupportedFormat, "");
 
     if( rect )
         *rect = comp.rect;
diff --git a/modules/imgproc/src/geometry.cpp b/modules/imgproc/src/geometry.cpp
index 701c3a647f03..ae582fcafc08 100644
--- a/modules/imgproc/src/geometry.cpp
+++ b/modules/imgproc/src/geometry.cpp
@@ -39,7 +39,9 @@
 //
 //M*/
 #include "precomp.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
+using namespace cv;
 
 CV_IMPL CvRect
 cvMaxRect( const CvRect* rect1, const CvRect* rect2 )
@@ -87,7 +89,7 @@ CV_IMPL void
 cvBoxPoints( CvBox2D box, CvPoint2D32f pt[4] )
 {
     if( !pt )
-        CV_Error( CV_StsNullPtr, "NULL vertex array pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL vertex array pointer" );
     cv::RotatedRect(box).points((cv::Point2f*)pt);
 }
 
@@ -328,17 +330,16 @@ static LineSegmentIntersection parallelInt( Point2f a, Point2f b, Point2f c, Poi
 static LineSegmentIntersection intersectLineSegments( Point2f a, Point2f b, Point2f c,
                                                       Point2f d, Point2f& p, Point2f& q )
 {
-    double denom = a.x * (double)(d.y - c.y) + b.x * (double)(c.y - d.y) +
-                   d.x * (double)(b.y - a.y) + c.x * (double)(a.y - b.y);
+    double denom = (a.x - b.x) * (double)(d.y - c.y) - (a.y - b.y) * (double)(d.x - c.x);
 
     // If denom is zero, then segments are parallel: handle separately.
     if( denom == 0. )
         return parallelInt(a, b, c, d, p, q);
 
-    double num = a.x * (double)(d.y - c.y) + c.x * (double)(a.y - d.y) + d.x * (double)(c.y - a.y);
+    double num = (d.y - a.y) * (double)(a.x - c.x) + (a.x - d.x) * (double)(a.y - c.y);
     double s = num / denom;
 
-    num = a.x * (double)(b.y - c.y) + b.x * (double)(c.y - a.y) + c.x * (double)(a.y - b.y);
+    num = (b.y - a.y) * (double)(a.x - c.x) + (c.y - a.y) * (double)(b.x - a.x);
     double t = num / denom;
 
     p.x = (float)(a.x + s*(b.x - a.x));
@@ -593,3 +594,336 @@ float cv::intersectConvexConvex( InputArray _p1, InputArray _p2, OutputArray _p1
     }
     return (float)fabs(area);
 }
+
+static Rect maskBoundingRect( const Mat& img )
+{
+    CV_Assert( img.depth() <= CV_8S && img.channels() == 1 );
+
+    Size size = img.size();
+    int xmin = size.width, ymin = -1, xmax = -1, ymax = -1, i, j, k;
+
+    for( i = 0; i < size.height; i++ )
+    {
+        const uchar* _ptr = img.ptr(i);
+        const uchar* ptr = (const uchar*)alignPtr(_ptr, 4);
+        int have_nz = 0, k_min, offset = (int)(ptr - _ptr);
+        j = 0;
+        offset = MIN(offset, size.width);
+        for( ; j < offset; j++ )
+            if( _ptr[j] )
+            {
+                if( j < xmin )
+                    xmin = j;
+                if( j > xmax )
+                    xmax = j;
+                have_nz = 1;
+            }
+        if( offset < size.width )
+        {
+            xmin -= offset;
+            xmax -= offset;
+            size.width -= offset;
+            j = 0;
+            for( ; j <= xmin - 4; j += 4 )
+                if( *((int*)(ptr+j)) )
+                    break;
+            for( ; j < xmin; j++ )
+                if( ptr[j] )
+                {
+                    xmin = j;
+                    if( j > xmax )
+                        xmax = j;
+                    have_nz = 1;
+                    break;
+                }
+            k_min = MAX(j-1, xmax);
+            k = size.width - 1;
+            for( ; k > k_min && (k&3) != 3; k-- )
+                if( ptr[k] )
+                    break;
+            if( k > k_min && (k&3) == 3 )
+            {
+                for( ; k > k_min+3; k -= 4 )
+                    if( *((int*)(ptr+k-3)) )
+                        break;
+            }
+            for( ; k > k_min; k-- )
+                if( ptr[k] )
+                {
+                    xmax = k;
+                    have_nz = 1;
+                    break;
+                }
+            if( !have_nz )
+            {
+                j &= ~3;
+                for( ; j <= k - 3; j += 4 )
+                    if( *((int*)(ptr+j)) )
+                        break;
+                for( ; j <= k; j++ )
+                    if( ptr[j] )
+                    {
+                        have_nz = 1;
+                        break;
+                    }
+            }
+            xmin += offset;
+            xmax += offset;
+            size.width += offset;
+        }
+        if( have_nz )
+        {
+            if( ymin < 0 )
+                ymin = i;
+            ymax = i;
+        }
+    }
+
+    if( xmin >= size.width )
+        xmin = ymin = 0;
+    return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
+}
+
+// Calculates bounding rectangle of a point set or retrieves already calculated
+static Rect pointSetBoundingRect( const Mat& points )
+{
+    int npoints = points.checkVector(2);
+    int depth = points.depth();
+    CV_Assert(npoints >= 0 && (depth == CV_32F || depth == CV_32S));
+
+    int  xmin = 0, ymin = 0, xmax = -1, ymax = -1, i;
+    bool is_float = depth == CV_32F;
+
+    if( npoints == 0 )
+        return Rect();
+
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, loop tail related.
+    if( !is_float )
+    {
+        const int32_t* pts = points.ptr<int32_t>();
+        int64_t firstval = 0;
+        std::memcpy(&firstval, pts, sizeof(pts[0]) * 2);
+        v_int32 minval, maxval;
+        minval = maxval = v_reinterpret_as_s32(vx_setall_s64(firstval)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
+        for( i = 1; i <= npoints - VTraits<v_int32>::vlanes()/2; i+= VTraits<v_int32>::vlanes()/2 )
+        {
+            v_int32 ptXY2 = vx_load(pts + 2 * i);
+            minval = v_min(ptXY2, minval);
+            maxval = v_max(ptXY2, maxval);
+        }
+        minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
+        maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
+        if( i <= npoints - VTraits<v_int32>::vlanes()/4 )
+        {
+            v_int32 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + 2 * i))));
+            minval = v_min(ptXY, minval);
+            maxval = v_max(ptXY, maxval);
+            i += VTraits<v_int64>::vlanes()/2;
+        }
+        for(int j = 16; j < VTraits<v_uint8>::vlanes(); j*=2)
+        {
+            minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
+            maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
+        }
+        xmin = v_get0(minval);
+        xmax = v_get0(maxval);
+        ymin = v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
+        ymax = v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
+#if CV_SIMD_WIDTH > 16
+        if( i < npoints )
+        {
+            v_int32x4 minval2, maxval2;
+            minval2 = maxval2 = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + 2 * i))));
+            for( i++; i < npoints; i++ )
+            {
+                v_int32x4 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + 2 * i))));
+                minval2 = v_min(ptXY, minval2);
+                maxval2 = v_max(ptXY, maxval2);
+            }
+            xmin = min(xmin, v_get0(minval2));
+            xmax = max(xmax, v_get0(maxval2));
+            ymin = min(ymin, v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2)))));
+            ymax = max(ymax, v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2)))));
+        }
+#endif // CV_SIMD
+    }
+    else
+    {
+        const float* pts = points.ptr<float>();
+        int64_t firstval = 0;
+        std::memcpy(&firstval, pts, sizeof(pts[0]) * 2);
+        v_float32 minval, maxval;
+        minval = maxval = v_reinterpret_as_f32(vx_setall_s64(firstval)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
+        for( i = 1; i <= npoints - VTraits<v_float32>::vlanes()/2; i+= VTraits<v_float32>::vlanes()/2 )
+        {
+            v_float32 ptXY2 = vx_load(pts + 2 * i);
+            minval = v_min(ptXY2, minval);
+            maxval = v_max(ptXY2, maxval);
+        }
+        minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
+        maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
+        if( i <= npoints - VTraits<v_float32>::vlanes()/4 )
+        {
+            v_float32 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + 2 * i))));
+            minval = v_min(ptXY, minval);
+            maxval = v_max(ptXY, maxval);
+            i += VTraits<v_float32>::vlanes()/4;
+        }
+        for(int j = 16; j < VTraits<v_uint8>::vlanes(); j*=2)
+        {
+            minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
+            maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
+        }
+        xmin = cvFloor(v_get0(minval));
+        xmax = cvFloor(v_get0(maxval));
+        ymin = cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval)))));
+        ymax = cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval)))));
+#if CV_SIMD_WIDTH > 16
+        if( i < npoints )
+        {
+            v_float32x4 minval2, maxval2;
+            minval2 = maxval2 = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + 2 * i))));
+            for( i++; i < npoints; i++ )
+            {
+                v_float32x4 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + 2 * i))));
+                minval2 = v_min(ptXY, minval2);
+                maxval2 = v_max(ptXY, maxval2);
+            }
+            xmin = min(xmin, cvFloor(v_get0(minval2)));
+            xmax = max(xmax, cvFloor(v_get0(maxval2)));
+            ymin = min(ymin, cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))))));
+            ymax = max(ymax, cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))))));
+        }
+#endif
+    }
+#else
+    const Point* pts = points.ptr<Point>();
+    Point pt = pts[0];
+
+    if( !is_float )
+    {
+        xmin = xmax = pt.x;
+        ymin = ymax = pt.y;
+
+        for( i = 1; i < npoints; i++ )
+        {
+            pt = pts[i];
+
+            if( xmin > pt.x )
+                xmin = pt.x;
+
+            if( xmax < pt.x )
+                xmax = pt.x;
+
+            if( ymin > pt.y )
+                ymin = pt.y;
+
+            if( ymax < pt.y )
+                ymax = pt.y;
+        }
+    }
+    else
+    {
+        Cv32suf v;
+        // init values
+        xmin = xmax = CV_TOGGLE_FLT(pt.x);
+        ymin = ymax = CV_TOGGLE_FLT(pt.y);
+
+        for( i = 1; i < npoints; i++ )
+        {
+            pt = pts[i];
+            pt.x = CV_TOGGLE_FLT(pt.x);
+            pt.y = CV_TOGGLE_FLT(pt.y);
+
+            if( xmin > pt.x )
+                xmin = pt.x;
+
+            if( xmax < pt.x )
+                xmax = pt.x;
+
+            if( ymin > pt.y )
+                ymin = pt.y;
+
+            if( ymax < pt.y )
+                ymax = pt.y;
+        }
+
+        v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f);
+        v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f);
+        // because right and bottom sides of the bounding rectangle are not inclusive
+        // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil
+        v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f);
+        v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f);
+    }
+#endif
+
+    return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
+}
+
+
+cv::Rect cv::boundingRect(InputArray array)
+{
+    CV_INSTRUMENT_REGION();
+
+    Mat m = array.getMat();
+    return m.depth() <= CV_8U ? maskBoundingRect(m) : pointSetBoundingRect(m);
+}
+
+
+/* Calculates bounding rectangle of a point set or retrieves already calculated */
+CV_IMPL  CvRect
+cvBoundingRect( CvArr* array, int update )
+{
+    cv::Rect rect;
+    CvContour contour_header;
+    CvSeq* ptseq = 0;
+    CvSeqBlock block;
+
+    CvMat stub, *mat = 0;
+    int calculate = update;
+
+    if( CV_IS_SEQ( array ))
+    {
+        ptseq = (CvSeq*)array;
+        if( !CV_IS_SEQ_POINT_SET( ptseq ))
+            CV_Error( cv::Error::StsBadArg, "Unsupported sequence type" );
+
+        if( ptseq->header_size < (int)sizeof(CvContour))
+        {
+            update = 0;
+            calculate = 1;
+        }
+    }
+    else
+    {
+        mat = cvGetMat( array, &stub );
+        if( CV_MAT_TYPE(mat->type) == CV_32SC2 ||
+            CV_MAT_TYPE(mat->type) == CV_32FC2 )
+        {
+            ptseq = cvPointSeqFromMat(CV_SEQ_KIND_GENERIC, mat, &contour_header, &block);
+            mat = 0;
+        }
+        else if( CV_MAT_TYPE(mat->type) != CV_8UC1 &&
+                CV_MAT_TYPE(mat->type) != CV_8SC1 )
+            CV_Error( cv::Error::StsUnsupportedFormat,
+                "The image/matrix format is not supported by the function" );
+        update = 0;
+        calculate = 1;
+    }
+
+    if( !calculate )
+        return ((CvContour*)ptseq)->rect;
+
+    if( mat )
+    {
+        rect = cvRect(maskBoundingRect(cv::cvarrToMat(mat)));
+    }
+    else if( ptseq->total )
+    {
+        cv::AutoBuffer<double> abuf;
+        rect = cvRect(pointSetBoundingRect(cv::cvarrToMat(ptseq, false, false, 0, &abuf)));
+    }
+    if( update )
+        ((CvContour*)ptseq)->rect = cvRect(rect);
+    return cvRect(rect);
+}
diff --git a/modules/imgproc/src/grabcut.cpp b/modules/imgproc/src/grabcut.cpp
index 5ec5af2be4ce..358747843ef0 100644
--- a/modules/imgproc/src/grabcut.cpp
+++ b/modules/imgproc/src/grabcut.cpp
@@ -96,7 +96,7 @@ GMM::GMM( Mat& _model )
         _model.setTo(Scalar(0));
     }
     else if( (_model.type() != CV_64FC1) || (_model.rows != 1) || (_model.cols != modelSize*componentsCount) )
-        CV_Error( CV_StsBadArg, "_model must have CV_64FC1 type, rows == 1 and cols == 13*componentsCount" );
+        CV_Error( cv::Error::StsBadArg, "_model must have CV_64FC1 type, rows == 1 and cols == 13*componentsCount" );
 
     model = _model;
 
@@ -329,18 +329,18 @@ static void calcNWeights( const Mat& img, Mat& leftW, Mat& upleftW, Mat& upW, Ma
 static void checkMask( const Mat& img, const Mat& mask )
 {
     if( mask.empty() )
-        CV_Error( CV_StsBadArg, "mask is empty" );
+        CV_Error( cv::Error::StsBadArg, "mask is empty" );
     if( mask.type() != CV_8UC1 )
-        CV_Error( CV_StsBadArg, "mask must have CV_8UC1 type" );
+        CV_Error( cv::Error::StsBadArg, "mask must have CV_8UC1 type" );
     if( mask.cols != img.cols || mask.rows != img.rows )
-        CV_Error( CV_StsBadArg, "mask must have as many rows and cols as img" );
+        CV_Error( cv::Error::StsBadArg, "mask must have as many rows and cols as img" );
     for( int y = 0; y < mask.rows; y++ )
     {
         for( int x = 0; x < mask.cols; x++ )
         {
             uchar val = mask.at<uchar>(y,x);
             if( val!=GC_BGD && val!=GC_FGD && val!=GC_PR_BGD && val!=GC_PR_FGD )
-                CV_Error( CV_StsBadArg, "mask element value must be equal "
+                CV_Error( cv::Error::StsBadArg, "mask element value must be equal "
                     "GC_BGD or GC_FGD or GC_PR_BGD or GC_PR_FGD" );
         }
     }
@@ -389,14 +389,14 @@ static void initGMMs( const Mat& img, const Mat& mask, GMM& bgdGMM, GMM& fgdGMM
         int num_clusters = GMM::componentsCount;
         num_clusters = std::min(num_clusters, (int)bgdSamples.size());
         kmeans( _bgdSamples, num_clusters, bgdLabels,
-                TermCriteria( CV_TERMCRIT_ITER, kMeansItCount, 0.0), 0, kMeansType );
+                TermCriteria( TermCriteria::MAX_ITER, kMeansItCount, 0.0), 0, kMeansType );
     }
     {
         Mat _fgdSamples( (int)fgdSamples.size(), 3, CV_32FC1, &fgdSamples[0][0] );
         int num_clusters = GMM::componentsCount;
         num_clusters = std::min(num_clusters, (int)fgdSamples.size());
         kmeans( _fgdSamples, num_clusters, fgdLabels,
-                TermCriteria( CV_TERMCRIT_ITER, kMeansItCount, 0.0), 0, kMeansType );
+                TermCriteria( TermCriteria::MAX_ITER, kMeansItCount, 0.0), 0, kMeansType );
     }
 
     bgdGMM.initLearning();
@@ -552,9 +552,9 @@ void cv::grabCut( InputArray _img, InputOutputArray _mask, Rect rect,
     Mat& fgdModel = _fgdModel.getMatRef();
 
     if( img.empty() )
-        CV_Error( CV_StsBadArg, "image is empty" );
+        CV_Error( cv::Error::StsBadArg, "image is empty" );
     if( img.type() != CV_8UC3 )
-        CV_Error( CV_StsBadArg, "image must have CV_8UC3 type" );
+        CV_Error( cv::Error::StsBadArg, "image must have CV_8UC3 type" );
 
     GMM bgdGMM( bgdModel ), fgdGMM( fgdModel );
     Mat compIdxs( img.size(), CV_32SC1 );
diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp
index 6eb956bfeeda..773fed9b482f 100644
--- a/modules/imgproc/src/hal_replacement.hpp
+++ b/modules/imgproc/src/hal_replacement.hpp
@@ -298,11 +298,44 @@ inline int hal_ni_warpPerspective(int src_type, const uchar *src_data, size_t sr
 #define cv_hal_warpPerspective hal_ni_warpPerspective
 //! @endcond
 
+/**
+   @brief hal_remap with floating point maps
+   @param src_type source and destination image type
+   @param src_data source image data
+   @param src_step source image step
+   @param src_width source image width
+   @param src_height source image height
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
+   @param mapx map for x values
+   @param mapx_step mapx matrix step
+   @param mapy map for y values
+   @param mapy_step mapy matrix step
+   @param interpolation interpolation mode (CV_HAL_INTER_NEAREST, ...)
+   @param border_type border processing mode (CV_HAL_BORDER_REFLECT, ...)
+   @param border_value values to use for CV_HAL_BORDER_CONSTANT mode
+   @sa cv::remap
+ */
+inline int hal_ni_remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
+                           uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
+                           float* mapx, size_t mapx_step, float* mapy, size_t mapy_step,
+                           int interpolation, int border_type, const double border_value[4])
+{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_remap32f hal_ni_remap32f
+//! @endcond
+
 /**
    @brief hal_cvtBGRtoBGR
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param depth image depth (one of CV_8U, CV_16U, CV_32F)
    @param scn source image channels (3 or 4)
    @param dcn destination image channels (3 or 4)
@@ -313,9 +346,12 @@ inline int hal_ni_cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * d
 
 /**
    @brief hal_cvtBGRtoBGR5x5
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param scn source image channels (3 or 4)
    @param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
    @param greenBits number of bits for green channel (5 or 6)
@@ -326,9 +362,12 @@ inline int hal_ni_cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar
 
 /**
    @brief hal_cvtBGR5x5toBGR
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param dcn destination image channels (3 or 4)
    @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
    @param greenBits number of bits for green channel (5 or 6)
@@ -339,9 +378,12 @@ inline int hal_ni_cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar
 
 /**
    @brief hal_cvtBGRtoGray
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param depth image depth (one of CV_8U, CV_16U or CV_32F)
    @param scn source image channels (3 or 4)
    @param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
@@ -351,9 +393,12 @@ inline int hal_ni_cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar *
 
 /**
    @brief hal_cvtGraytoBGR
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param depth image depth (one of CV_8U, CV_16U or CV_32F)
    @param dcn destination image channels (3 or 4)
    Convert from 1-channel gray to BGR, RGB, RGBA or BGRA.
@@ -362,9 +407,12 @@ inline int hal_ni_cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar *
 
 /**
    @brief hal_cvtBGR5x5toGray
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param greenBits number of bits for green channel (5 or 6)
    Convert from packed BGR (16 bits per pixel, 555 or 565) to 1-channel gray.
    Support only CV_8U images.
@@ -373,9 +421,12 @@ inline int hal_ni_cvtBGR5x5toGray(const uchar * src_data, size_t src_step, uchar
 
 /**
    @brief hal_cvtGraytoBGR5x5
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param greenBits number of bits for green channel (5 or 6)
    Convert from 1-channel gray to packed BGR (16 bits per pixel, 555 or 565).
    Support only CV_8U images.
@@ -384,9 +435,12 @@ inline int hal_ni_cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar
 
 /**
    @brief hal_cvtBGRtoYUV
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param depth image depth (one of CV_8U, CV_16U or CV_32F)
    @param scn source image channels (3 or 4)
    @param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
@@ -397,9 +451,12 @@ inline int hal_ni_cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * d
 
 /**
    @brief hal_cvtYUVtoBGR
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param depth image depth (one of CV_8U, CV_16U or CV_32F)
    @param dcn destination image channels (3 or 4)
    @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
@@ -410,9 +467,12 @@ inline int hal_ni_cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * d
 
 /**
    @brief hal_cvtBGRtoXYZ
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param depth image depth (one of CV_8U, CV_16U or CV_32F)
    @param scn source image channels (3 or 4)
    @param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
@@ -422,9 +482,12 @@ inline int hal_ni_cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * d
 
 /**
    @brief hal_cvtXYZtoBGR
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param depth image depth (one of CV_8U, CV_16U or CV_32F)
    @param dcn destination image channels (3 or 4)
    @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
@@ -434,9 +497,12 @@ inline int hal_ni_cvtXYZtoBGR(const uchar * src_data, size_t src_step, uchar * d
 
 /**
    @brief hal_cvtBGRtoHSV
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param depth image depth (one of CV_8U or CV_32F)
    @param scn source image channels (3 or 4)
    @param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
@@ -448,9 +514,12 @@ inline int hal_ni_cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * d
 
 /**
    @brief hal_cvtHSVtoBGR
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param depth image depth (one of CV_8U or CV_32F)
    @param dcn destination image channels (3 or 4)
    @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
@@ -462,9 +531,12 @@ inline int hal_ni_cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * d
 
 /**
    @brief hal_cvtBGRtoLab
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param depth image depth (one of CV_8U or CV_32F)
    @param scn source image channels (3 or 4)
    @param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
@@ -476,9 +548,12 @@ inline int hal_ni_cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * d
 
 /**
    @brief hal_cvtLabtoBGR
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param depth image depth (one of CV_8U or CV_32F)
    @param dcn destination image channels (3 or 4)
    @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
@@ -490,28 +565,37 @@ inline int hal_ni_cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * d
 
 /**
    @brief hal_cvtTwoPlaneYUVtoBGR
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param dst_width,dst_height destination image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
    @param dcn destination image channels (3 or 4)
    @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
    @param uIdx U-channel index in the interleaved U/V plane (0 or 1)
    Convert from YUV (YUV420sp (or NV12/NV21) - Y plane followed by interleaved U/V plane) to BGR, RGB, BGRA or RGBA.
    Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
  */
 inline int hal_ni_cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
 /**
    @brief Extended version of hal_cvtTwoPlaneYUVtoBGR.
-   @param y_data,y_step source image data and step (Y-plane)
-   @param uv_data,uv_step source image data and step (UV-plane)
-   @param dst_data,dst_step destination image data and step
-   @param dst_width,dst_height destination image size
+   @param y_data source image data (Y-plane)
+   @param y_step source image step (Y-plane)
+   @param uv_data source image data (UV-plane)
+   @param uv_step source image step (UV-plane)
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
    @param dcn destination image channels (3 or 4)
    @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
    @param uIdx U-channel index in the interleaved U/V plane (0 or 1)
    Convert from YUV (YUV420sp (or NV12/NV21) - Y plane followed by interleaved U/V plane) to BGR, RGB, BGRA or RGBA.
    Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
  */
 inline int hal_ni_cvtTwoPlaneYUVtoBGREx(const uchar * y_data, size_t y_step, const uchar * uv_data, size_t uv_step,
                                       uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
@@ -519,15 +603,20 @@ inline int hal_ni_cvtTwoPlaneYUVtoBGREx(const uchar * y_data, size_t y_step, con
 
 /**
    @brief hal_cvtBGRtoTwoPlaneYUV
-   @param src_data,src_step source image data and step
-   @param y_data,y_step destination image data and step (Y-plane)
-   @param uv_data,uv_step destination image data and step (UV-plane)
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param y_data destination image data (Y-plane)
+   @param y_step destination image step (Y-plane)
+   @param uv_data destination image data (UV-plane)
+   @param uv_step destination image step (UV-plane)
+   @param width image width
+   @param height image height
    @param scn source image channels (3 or 4)
    @param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
    @param uIdx U-channel plane index (0 or 1)
    Convert from BGR, RGB, BGRA or RGBA to YUV (YUV420sp (or NV12/NV21) - Y plane followed by interleaved U/V plane).
    Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
  */
 inline int hal_ni_cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
                                       uchar * y_data, size_t y_step, uchar * uv_data, size_t uv_step,
@@ -536,50 +625,79 @@ inline int hal_ni_cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
 
 /**
    @brief hal_cvtThreePlaneYUVtoBGR
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param dst_width,dst_height destination image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
    @param dcn destination image channels (3 or 4)
    @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
    @param uIdx U-channel plane index (0 or 1)
    Convert from YUV (YUV420p (or YV12/YV21) - Y plane followed by U and V planes) to BGR, RGB, BGRA or RGBA.
    Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
  */
 inline int hal_ni_cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
 /**
    @brief hal_cvtBGRtoThreePlaneYUV
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param scn source image channels (3 or 4)
    @param swapBlue if set to true B and R source channels will be swapped (treat as RGB)
    @param uIdx U-channel plane index (0 or 1)
    Convert from BGR, RGB, BGRA or RGBA to YUV (YUV420p (or YV12/YV21) - Y plane followed by U and V planes).
    Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
  */
 inline int hal_ni_cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
 /**
    @brief hal_cvtOnePlaneYUVtoBGR
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    @param dcn destination image channels (3 or 4)
    @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
    @param uIdx U-channel index (0 or 1)
    @param ycn Y-channel index (0 or 1)
-   Convert from UYVY, YUY2 or YVYU to BGR, RGB, BGRA or RGBA.
+   Convert from interleaved YUV 4:2:2 (UYVY, YUY2 or YVYU) to BGR, RGB, BGRA or RGBA.
    Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
  */
 inline int hal_ni_cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int uIdx, int ycn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
-
 /**
-   @brief hal_cvtRGBAtoMultipliedRGBA
+   @brief hal_cvtOnePlaneBGRtoYUV
    @param src_data,src_step source image data and step
    @param dst_data,dst_step destination image data and step
    @param width,height image size
+   @param scn source image channels (3 or 4)
+   @param swapBlue if set to true B and R destination channels will be swapped (write RGB)
+   @param uIdx U-channel index (0 or 1)
+   @param ycn Y-channel index (0 or 1)
+   Convert from BGR, RGB, BGRA or RGBA to interleaved YUV 4:2:2 (UYVY, YUY2 or YVYU).
+   Only for CV_8U.
+   Y : [16, 235]; Cb, Cr: [16, 240] centered at 128
+ */
+inline int hal_ni_cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int ycn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+/**
+   @brief hal_cvtRGBAtoMultipliedRGBA
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    Convert from BGRA or RGBA to format with multiplied alpha channel.
    Only for CV_8U.
  */
@@ -587,9 +705,12 @@ inline int hal_ni_cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_ste
 
 /**
    @brief hal_cvtMultipliedRGBAtoRGBA
-   @param src_data,src_step source image data and step
-   @param dst_data,dst_step destination image data and step
-   @param width,height image size
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width image width
+   @param height image height
    Convert from format with multiplied alpha channel to BGRA or RGBA.
    Only for CV_8U.
  */
@@ -617,18 +738,26 @@ inline int hal_ni_cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_ste
 #define cv_hal_cvtThreePlaneYUVtoBGR hal_ni_cvtThreePlaneYUVtoBGR
 #define cv_hal_cvtBGRtoThreePlaneYUV hal_ni_cvtBGRtoThreePlaneYUV
 #define cv_hal_cvtOnePlaneYUVtoBGR hal_ni_cvtOnePlaneYUVtoBGR
+#define cv_hal_cvtOnePlaneBGRtoYUV hal_ni_cvtOnePlaneBGRtoYUV
 #define cv_hal_cvtRGBAtoMultipliedRGBA hal_ni_cvtRGBAtoMultipliedRGBA
 #define cv_hal_cvtMultipliedRGBAtoRGBA hal_ni_cvtMultipliedRGBAtoRGBA
 //! @endcond
 
 /**
    @brief Calculate integral image
-   @param depth,sdepth,sqdepth Depths of source image, sum image and square sum image
-   @param src_data,src_step Source image
-   @param sum_data,sum_step Sum image
-   @param sqsum_data,sqsum_step Square sum image
-   @param tilted_data,tilted_step Tilted sum image
-   @param width,height Source image dimensions
+   @param depth Depth of source image
+   @param sdepth Depth of sum image
+   @param sqdepth Depth of square sum image
+   @param src_data Source image data
+   @param src_step Source image step
+   @param sum_data Sum image data
+   @param sum_step Sum image step
+   @param sqsum_data Square sum image data
+   @param sqsum_step Square sum image step
+   @param tilted_data Tilted sum image data
+   @param tilted_step Tilted sum image step
+   @param width Source image width
+   @param height Source image height
    @param cn Number of channels
    @note Following combinations of image depths are used:
    Source | Sum | Square sum
@@ -655,9 +784,12 @@ inline int hal_ni_integral(int depth, int sdepth, int sqdepth, const uchar * src
 
 /**
    @brief Calculate medianBlur filter
-   @param src_data,src_step Source image
-   @param dst_data,dst_step Destination image
-   @param width,height Source image dimensions
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param width Source image width
+   @param height Source image height
    @param depth Depths of source and destination image
    @param cn Number of channels
    @param ksize Size of kernel
@@ -668,11 +800,37 @@ inline int hal_ni_medianBlur(const uchar* src_data, size_t src_step, uchar* dst_
 #define cv_hal_medianBlur hal_ni_medianBlur
 //! @endcond
 
+/**
+   @brief Calculate bilateral filter. See https://homepages.inf.ed.ac.uk/rbf/CVonline/LOCAL_COPIES/MANDUCHI1/Bilateral_Filtering.html
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param width Source image width
+   @param height Source image height
+   @param depth Depths of source and destination image. Should support CV_8U and CV_32F
+   @param cn Number of channels
+   @param d Diameter of each pixel neighborhood that is used during filtering. If it is non-positive, it is computed from sigmaSpace
+   @param sigma_color Filter sigma in the color space
+   @param sigma_space Filter sigma in the coordinate space. When d>0, it specifies the neighborhood size regardless of sigmaSpace. Otherwise, d is proportional to sigmaSpace
+   @param border_type border mode used to extrapolate pixels outside of the image
+*/
+inline int hal_ni_bilateralFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
+                                  int width, int height, int depth, int cn, int d, double sigma_color, double sigma_space, int border_type)
+{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_bilateralFilter hal_ni_bilateralFilter
+//! @endcond
+
 /**
    @brief Calculates adaptive threshold
-   @param src_data,src_step Source image
-   @param dst_data,dst_step Destination image
-   @param width,height Source image dimensions
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param width Source image width
+   @param height Source image height
    @param maxValue Value assigned to the pixels for which the condition is satisfied
    @param adaptiveMethod Adaptive thresholding algorithm
    @param thresholdType Thresholding type
@@ -687,9 +845,12 @@ inline int hal_ni_adaptiveThreshold(const uchar* src_data, size_t src_step, ucha
 
 /**
    @brief Calculates fixed-level threshold to each array element
-   @param src_data,src_step Source image
-   @param dst_data,dst_step Destination image
-   @param width,height Source image dimensions
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param width Source image width
+   @param height Source image height
    @param depth Depths of source and destination image
    @param cn Number of channels
    @param thresh Threshold value
@@ -702,16 +863,44 @@ inline int hal_ni_threshold(const uchar* src_data, size_t src_step, uchar* dst_d
 #define cv_hal_threshold hal_ni_threshold
 //! @endcond
 
+/**
+   @brief Performs threshold filtering using threshold estimated by Otsu algorithm
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param width Source image width
+   @param height Source image height
+   @param depth Depths of source and destination image
+   @param maxValue Value assigned to the pixels for which the condition is satisfied
+   @param thresholdType Thresholding type
+   @param thresh Calculated threshold value
+*/
+inline int hal_ni_threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_threshold_otsu hal_ni_threshold_otsu
+//! @endcond
+
 /**
    @brief Calculate box filter
-   @param src_data,src_step Source image
-   @param dst_data,dst_step Destination image
-   @param width,height Source image dimensions
-   @param src_depth,dst_depth Depths of source and destination image
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param width Source image width
+   @param height Source image height
+   @param src_depth Depth of source image
+   @param dst_depth Depts of destination image
    @param cn Number of channels
-   @param margin_left,margin_top,margin_right,margin_bottom Margins for source image
-   @param ksize_width,ksize_height Size of kernel
-   @param anchor_x,anchor_y Anchor point
+   @param margin_left Left margins for source image
+   @param margin_top Top margins for source image
+   @param margin_right Right margins for source image
+   @param margin_bottom Bottom margins for source image
+   @param ksize_width Width of kernel
+   @param ksize_height Height of kernel
+   @param anchor_x Anchor point x coordinate
+   @param anchor_y Anchor point y coordinate
    @param normalize If true then result is normalized
    @param border_type Border type
 */
@@ -722,15 +911,38 @@ inline int hal_ni_boxFilter(const uchar* src_data, size_t src_step, uchar* dst_d
 //! @endcond
 
 /**
-   @brief Blurs an image using a Gaussian filter.
-   @param src_data,src_step Source image
-   @param dst_data,dst_step Destination image
-   @param width,height Source image dimensions
+   @brief Equalizes the histogram of a grayscale image
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param width Source image width
+   @param height Source image height
+*/
+inline int hal_ni_equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_equalize_hist hal_ni_equalize_hist
+//! @endcond
+
+/**
+   @brief Blurs an image using a generic Gaussian filter.
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param width Source image width
+   @param height Source image height
    @param depth Depth of source and destination image
    @param cn Number of channels
-   @param margin_left,margin_top,margin_right,margin_bottom Margins for source image
-   @param ksize_width,ksize_height Size of kernel
-   @param sigmaX,sigmaY Gaussian kernel standard deviation.
+   @param margin_left Left margins for source image
+   @param margin_top Top margins for source image
+   @param margin_right Right margins for source image
+   @param margin_bottom Bottom margins for source image
+   @param ksize_width Width of kernel
+   @param ksize_height Height of kernel
+   @param sigmaX Gaussian kernel standard deviation.
+   @param sigmaY Gaussian kernel standard deviation.
    @param border_type Border type
 */
 inline int hal_ni_gaussianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize_width, size_t ksize_height, double sigmaX, double sigmaY, int border_type) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
@@ -739,15 +951,46 @@ inline int hal_ni_gaussianBlur(const uchar* src_data, size_t src_step, uchar* ds
 #define cv_hal_gaussianBlur hal_ni_gaussianBlur
 //! @endcond
 
+/**
+   @brief Blurs an image using a symmetric Gaussian filter with square kernel and sigma=0.
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param width Source image width
+   @param height Source image height
+   @param depth Depth of source and destination image
+   @param cn Number of channels
+   @param margin_left Left margins for source image
+   @param margin_top Top margins for source image
+   @param margin_right Right margins for source image
+   @param margin_bottom Bottom margins for source image
+   @param ksize Width of kernel
+   @param border_type Border type
+*/
+inline int hal_ni_gaussianBlurBinomial(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize, int border_type) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_gaussianBlurBinomial hal_ni_gaussianBlurBinomial
+//! @endcond
+
 /**
    @brief Computes Sobel derivatives
-   @param src_depth,dst_depth Depths of source and destination image
-   @param src_data,src_step Source image
-   @param dst_data,dst_step Destination image
-   @param width,height Source image dimensions
+   @param src_depth Depth of source image
+   @param dst_depth Depts of destination image
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param width Source image width
+   @param height Source image height
    @param cn Number of channels
-   @param margin_left,margin_top,margin_right,margin_bottom Margins for source image
-   @param dx,dy orders of the derivative x and y respectively
+   @param margin_left Left margins for source image
+   @param margin_top Top margins for source image
+   @param margin_right Right margins for source image
+   @param margin_bottom Bottom margins for source image
+   @param dx orders of the derivative x
+   @param dy orders of the derivative y
    @param ksize Size of kernel
    @param scale Scale factor for the computed derivative values
    @param delta Delta value that is added to the results prior to storing them in dst
@@ -761,13 +1004,21 @@ inline int hal_ni_sobel(const uchar* src_data, size_t src_step, uchar* dst_data,
 
 /**
    @brief Computes Scharr filter
-   @param src_depth,dst_depth Depths of source and destination image
-   @param src_data,src_step Source image
-   @param dst_data,dst_step Destination image
-   @param width,height Source image dimensions
+   @param src_depth Depth of source image
+   @param dst_depth Depts of destination image
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param width Source image width
+   @param height Source image height
    @param cn Number of channels
-   @param margin_left,margin_top,margin_right,margin_bottom Margins for source image
-   @param dx,dy orders of the derivative x and y respectively
+   @param margin_left Left margins for source image
+   @param margin_top Top margins for source image
+   @param margin_right Right margins for source image
+   @param margin_bottom Bottom margins for source image
+   @param dx orders of the derivative x
+   @param dy orders of the derivative y
    @param scale Scale factor for the computed derivative values
    @param delta Delta value that is added to the results prior to storing them in dst
    @param border_type Border type
@@ -781,10 +1032,14 @@ inline int hal_ni_scharr(const uchar* src_data, size_t src_step, uchar* dst_data
 /**
    @brief Perform Gaussian Blur and downsampling for input tile.
    @param depth Depths of source and destination image
-   @param src_data,src_step Source image
-   @param dst_data,dst_step Destination image
-   @param src_width,src_height Source image dimensions
-   @param dst_width,dst_height Destination image dimensions
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param src_width Source image width
+   @param src_height Source image height
+   @param dst_width Destination image width
+   @param dst_height Destination image height
    @param cn Number of channels
    @param border_type Border type
 */
@@ -796,11 +1051,15 @@ inline int hal_ni_pyrdown(const uchar* src_data, size_t src_step, int src_width,
 
 /**
    @brief Canny edge detector
-   @param src_data,src_step Source image
-   @param dst_data,dst_step Destination image
-   @param width,height Source image dimensions
+   @param src_data Source image data
+   @param src_step Source image step
+   @param dst_data Destination image data
+   @param dst_step Destination image step
+   @param width Source image width
+   @param height Source image height
    @param cn Number of channels
-   @param lowThreshold, highThreshold Thresholds value
+   @param lowThreshold low hresholds value
+   @param highThreshold high thresholds value
    @param ksize Kernel size for Sobel operator.
    @param L2gradient Flag, indicating use L2 or L1 norma.
 */
@@ -810,6 +1069,38 @@ inline int hal_ni_canny(const uchar* src_data, size_t src_step, uchar* dst_data,
 #define cv_hal_canny hal_ni_canny
 //! @endcond
 
+/**
+   @brief Calculates all of the moments up to the third order of a polygon or rasterized shape for image
+   @param src_data Source image data
+   @param src_step Source image step
+   @param src_type source pints type
+   @param width Source image width
+   @param height Source image height
+   @param binary If it is true, all non-zero image pixels are treated as 1's
+   @param m Output array of moments (10 values) in the following order:
+    m00, m10, m01, m20, m11, m02, m30, m21, m12, m03.
+   @sa moments
+*/
+inline int hal_ni_imageMoments(const uchar* src_data, size_t src_step, int src_type, int width, int height, bool binary, double m[10])
+{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+/**
+   @brief Calculates all of the moments up to the third order of a polygon of 2d points
+   @param src_data Source points (Point 2x32f or 2x32s)
+   @param src_size Source points count
+   @param src_type source pints type
+   @param m Output array of moments (10 values) in the following order:
+    m00, m10, m01, m20, m11, m02, m30, m21, m12, m03.
+   @sa moments
+*/
+inline int hal_ni_polygonMoments(const uchar* src_data, size_t src_size, int src_type, double m[10])
+{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_imageMoments hal_ni_imageMoments
+#define cv_hal_polygonMoments hal_ni_polygonMoments
+//! @endcond
+
 //! @}
 
 #if defined(__clang__)
diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp
index 068dfd3a2713..02e97ba1296b 100644
--- a/modules/imgproc/src/histogram.cpp
+++ b/modules/imgproc/src/histogram.cpp
@@ -1005,7 +1005,7 @@ void cv::calcHist( const Mat* images, int nimages, const int* channels,
     else if( depth == CV_32F )
         calcHist_<float>(ptrs, deltas, imsize, ihist, dims, ranges, _uniranges, uniform );
     else
-        CV_Error(CV_StsUnsupportedFormat, "");
+        CV_Error(cv::Error::StsUnsupportedFormat, "");
 
     ihist.convertTo(hist, CV_32F);
 }
@@ -1182,7 +1182,7 @@ static void calcHist( const Mat* images, int nimages, const int* channels,
     else if( depth == CV_32F )
         calcSparseHist_<float>(ptrs, deltas, imsize, hist, dims, ranges, _uniranges, uniform );
     else
-        CV_Error(CV_StsUnsupportedFormat, "");
+        CV_Error(cv::Error::StsUnsupportedFormat, "");
 
     if( !keepInt )
     {
@@ -1637,7 +1637,7 @@ void cv::calcBackProject( const Mat* images, int nimages, const int* channels,
     else if( depth == CV_32F )
         calcBackProj_<float, float>(ptrs, deltas, imsize, hist, dims, ranges, _uniranges, (float)scale, uniform );
     else
-        CV_Error(CV_StsUnsupportedFormat, "");
+        CV_Error(cv::Error::StsUnsupportedFormat, "");
 }
 
 
@@ -1810,7 +1810,7 @@ void cv::calcBackProject( const Mat* images, int nimages, const int* channels,
         calcSparseBackProj_<float, float>(ptrs, deltas, imsize, hist, dims, ranges,
                                           _uniranges, (float)scale, uniform );
     else
-        CV_Error(CV_StsUnsupportedFormat, "");
+        CV_Error(cv::Error::StsUnsupportedFormat, "");
 }
 
 #ifdef HAVE_OPENCL
@@ -2053,13 +2053,13 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
         }
         else if( method == CV_COMP_CORREL )
         {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
             v_float64 v_s1 = vx_setzero_f64();
             v_float64 v_s2 = vx_setzero_f64();
             v_float64 v_s11 = vx_setzero_f64();
             v_float64 v_s12 = vx_setzero_f64();
             v_float64 v_s22 = vx_setzero_f64();
-            for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+            for ( ; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
             {
                 v_float32 v_a = vx_load(h1 + j);
                 v_float32 v_b = vx_load(h2 + j);
@@ -2070,8 +2070,8 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
                 v_s12 = v_muladd(v_ad, v_bd, v_s12);
                 v_s11 = v_muladd(v_ad, v_ad, v_s11);
                 v_s22 = v_muladd(v_bd, v_bd, v_s22);
-                v_s1 += v_ad;
-                v_s2 += v_bd;
+                v_s1 = v_add(v_s1, v_ad);
+                v_s2 = v_add(v_s2, v_bd);
 
                 // 2-3
                 v_ad = v_cvt_f64_high(v_a);
@@ -2079,8 +2079,8 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
                 v_s12 = v_muladd(v_ad, v_bd, v_s12);
                 v_s11 = v_muladd(v_ad, v_ad, v_s11);
                 v_s22 = v_muladd(v_bd, v_bd, v_s22);
-                v_s1 += v_ad;
-                v_s2 += v_bd;
+                v_s1 = v_add(v_s1, v_ad);
+                v_s2 = v_add(v_s2, v_bd);
             }
             s12 += v_reduce_sum(v_s12);
             s11 += v_reduce_sum(v_s11);
@@ -2093,7 +2093,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
             v_float32 v_s11 = vx_setzero_f32();
             v_float32 v_s12 = vx_setzero_f32();
             v_float32 v_s22 = vx_setzero_f32();
-            for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+            for (; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
             {
                 v_float32 v_a = vx_load(h1 + j);
                 v_float32 v_b = vx_load(h2 + j);
@@ -2124,20 +2124,20 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
         }
         else if( method == CV_COMP_INTERSECT )
         {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
             v_float64 v_result = vx_setzero_f64();
-            for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+            for ( ; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
             {
                 v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j));
-                v_result += v_cvt_f64(v_src) + v_cvt_f64_high(v_src);
+                v_result = v_add(v_result, v_add(v_cvt_f64(v_src), v_cvt_f64_high(v_src)));
             }
             result += v_reduce_sum(v_result);
 #elif CV_SIMD
             v_float32 v_result = vx_setzero_f32();
-            for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+            for (; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
             {
                 v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j));
-                v_result += v_src;
+                v_result = v_add(v_result, v_src);
             }
             result += v_reduce_sum(v_result);
 #endif
@@ -2146,26 +2146,26 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
         }
         else if( method == CV_COMP_BHATTACHARYYA )
         {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
             v_float64 v_s1 = vx_setzero_f64();
             v_float64 v_s2 = vx_setzero_f64();
             v_float64 v_result = vx_setzero_f64();
-            for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+            for ( ; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
             {
                 v_float32 v_a = vx_load(h1 + j);
                 v_float32 v_b = vx_load(h2 + j);
 
                 v_float64 v_ad = v_cvt_f64(v_a);
                 v_float64 v_bd = v_cvt_f64(v_b);
-                v_s1 += v_ad;
-                v_s2 += v_bd;
-                v_result += v_sqrt(v_ad * v_bd);
+                v_s1 = v_add(v_s1, v_ad);
+                v_s2 = v_add(v_s2, v_bd);
+                v_result = v_add(v_result, v_sqrt(v_mul(v_ad, v_bd)));
 
                 v_ad = v_cvt_f64_high(v_a);
                 v_bd = v_cvt_f64_high(v_b);
-                v_s1 += v_ad;
-                v_s2 += v_bd;
-                v_result += v_sqrt(v_ad * v_bd);
+                v_s1 = v_add(v_s1, v_ad);
+                v_s2 = v_add(v_s2, v_bd);
+                v_result = v_add(v_result, v_sqrt(v_mul(v_ad, v_bd)));
             }
             s1 += v_reduce_sum(v_s1);
             s2 += v_reduce_sum(v_s2);
@@ -2174,7 +2174,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
             v_float32 v_s1 = vx_setzero_f32();
             v_float32 v_s2 = vx_setzero_f32();
             v_float32 v_result = vx_setzero_f32();
-            for (; j <= len - v_float32::nlanes; j += v_float32::nlanes)
+            for (; j <= len - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
             {
                 v_float32 v_a = vx_load(h1 + j);
                 v_float32 v_b = vx_load(h2 + j);
@@ -2211,7 +2211,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
             }
         }
         else
-            CV_Error( CV_StsBadArg, "Unknown comparison method" );
+            CV_Error( cv::Error::StsBadArg, "Unknown comparison method" );
     }
 
     if( method == CV_COMP_CHISQR_ALT )
@@ -2350,7 +2350,7 @@ double cv::compareHist( const SparseMat& H1, const SparseMat& H2, int method )
         }
     }
     else
-        CV_Error( CV_StsBadArg, "Unknown comparison method" );
+        CV_Error( cv::Error::StsBadArg, "Unknown comparison method" );
 
     if( method == CV_COMP_CHISQR_ALT )
         result *= 2;
@@ -2387,7 +2387,7 @@ cvCreateHist( int dims, int *sizes, CvHistType type, float** ranges, int uniform
     else if( type == CV_HIST_SPARSE )
         hist->bins = cvCreateSparseMat( dims, sizes, CV_HIST_DEFAULT_TYPE );
     else
-        CV_Error( CV_StsBadArg, "Invalid histogram type" );
+        CV_Error( cv::Error::StsBadArg, "Invalid histogram type" );
 
     if( ranges )
         cvSetHistBinRanges( hist, ranges, uniform );
@@ -2402,10 +2402,10 @@ cvMakeHistHeaderForArray( int dims, int *sizes, CvHistogram *hist,
                           float *data, float **ranges, int uniform )
 {
     if( !hist )
-        CV_Error( CV_StsNullPtr, "Null histogram header pointer" );
+        CV_Error( cv::Error::StsNullPtr, "Null histogram header pointer" );
 
     if( !data )
-        CV_Error( CV_StsNullPtr, "Null data pointer" );
+        CV_Error( cv::Error::StsNullPtr, "Null data pointer" );
 
     hist->thresh2 = 0;
     hist->type = CV_HIST_MAGIC_VAL;
@@ -2414,7 +2414,7 @@ cvMakeHistHeaderForArray( int dims, int *sizes, CvHistogram *hist,
     if( ranges )
     {
         if( !uniform )
-            CV_Error( CV_StsBadArg, "Only uniform bin ranges can be used here "
+            CV_Error( cv::Error::StsBadArg, "Only uniform bin ranges can be used here "
                                     "(to avoid memory allocation)" );
         cvSetHistBinRanges( hist, ranges, uniform );
     }
@@ -2427,14 +2427,14 @@ CV_IMPL void
 cvReleaseHist( CvHistogram **hist )
 {
     if( !hist )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( *hist )
     {
         CvHistogram* temp = *hist;
 
         if( !CV_IS_HIST(temp))
-            CV_Error( CV_StsBadArg, "Invalid histogram header" );
+            CV_Error( cv::Error::StsBadArg, "Invalid histogram header" );
         *hist = 0;
 
         if( CV_IS_SPARSE_HIST( temp ))
@@ -2455,7 +2455,7 @@ CV_IMPL void
 cvClearHist( CvHistogram *hist )
 {
     if( !CV_IS_HIST(hist) )
-        CV_Error( CV_StsBadArg, "Invalid histogram header" );
+        CV_Error( cv::Error::StsBadArg, "Invalid histogram header" );
     cvZero( hist->bins );
 }
 
@@ -2465,13 +2465,13 @@ CV_IMPL void
 cvThreshHist( CvHistogram* hist, double thresh )
 {
     if( !CV_IS_HIST(hist) )
-        CV_Error( CV_StsBadArg, "Invalid histogram header" );
+        CV_Error( cv::Error::StsBadArg, "Invalid histogram header" );
 
     if( !CV_IS_SPARSE_MAT(hist->bins) )
     {
         CvMat mat;
         cvGetMat( hist->bins, &mat, 0, 1 );
-        cvThreshold( &mat, &mat, thresh, 0, CV_THRESH_TOZERO );
+        cvThreshold( &mat, &mat, thresh, 0, cv::THRESH_TOZERO );
     }
     else
     {
@@ -2497,7 +2497,7 @@ cvNormalizeHist( CvHistogram* hist, double factor )
     double sum = 0;
 
     if( !CV_IS_HIST(hist) )
-        CV_Error( CV_StsBadArg, "Invalid histogram header" );
+        CV_Error( cv::Error::StsBadArg, "Invalid histogram header" );
 
     if( !CV_IS_SPARSE_HIST(hist) )
     {
@@ -2544,7 +2544,7 @@ cvGetMinMaxHistValue( const CvHistogram* hist,
     int dims, size[CV_MAX_DIM];
 
     if( !CV_IS_HIST(hist) )
-        CV_Error( CV_StsBadArg, "Invalid histogram header" );
+        CV_Error( cv::Error::StsBadArg, "Invalid histogram header" );
 
     dims = cvGetDims( hist->bins, size );
 
@@ -2662,10 +2662,10 @@ cvCompareHist( const CvHistogram* hist1,
     int size1[CV_MAX_DIM], size2[CV_MAX_DIM], total = 1;
 
     if( !CV_IS_HIST(hist1) || !CV_IS_HIST(hist2) )
-        CV_Error( CV_StsBadArg, "Invalid histogram header[s]" );
+        CV_Error( cv::Error::StsBadArg, "Invalid histogram header[s]" );
 
     if( CV_IS_SPARSE_MAT(hist1->bins) != CV_IS_SPARSE_MAT(hist2->bins))
-        CV_Error(CV_StsUnmatchedFormats, "One of histograms is sparse and other is not");
+        CV_Error(cv::Error::StsUnmatchedFormats, "One of histograms is sparse and other is not");
 
     if( !CV_IS_SPARSE_MAT(hist1->bins) )
     {
@@ -2678,13 +2678,13 @@ cvCompareHist( const CvHistogram* hist1,
     int dims2 = cvGetDims( hist2->bins, size2 );
 
     if( dims1 != dims2 )
-        CV_Error( CV_StsUnmatchedSizes,
+        CV_Error( cv::Error::StsUnmatchedSizes,
                  "The histograms have different numbers of dimensions" );
 
     for( i = 0; i < dims1; i++ )
     {
         if( size1[i] != size2[i] )
-            CV_Error( CV_StsUnmatchedSizes, "The histograms have different sizes" );
+            CV_Error( cv::Error::StsUnmatchedSizes, "The histograms have different sizes" );
         total *= size1[i];
     }
 
@@ -2804,7 +2804,7 @@ cvCompareHist( const CvHistogram* hist1,
         result = cv::compareHist( sH1, sH2, CV_COMP_KL_DIV );
     }
     else
-        CV_Error( CV_StsBadArg, "Unknown comparison method" );
+        CV_Error( cv::Error::StsBadArg, "Unknown comparison method" );
 
     if( method == CV_COMP_CHISQR_ALT )
         result *= 2;
@@ -2817,12 +2817,12 @@ CV_IMPL void
 cvCopyHist( const CvHistogram* src, CvHistogram** _dst )
 {
     if( !_dst )
-        CV_Error( CV_StsNullPtr, "Destination double pointer is NULL" );
+        CV_Error( cv::Error::StsNullPtr, "Destination double pointer is NULL" );
 
     CvHistogram* dst = *_dst;
 
     if( !CV_IS_HIST(src) || (dst && !CV_IS_HIST(dst)) )
-        CV_Error( CV_StsBadArg, "Invalid histogram header[s]" );
+        CV_Error( cv::Error::StsBadArg, "Invalid histogram header[s]" );
 
     bool eq = false;
     int size1[CV_MAX_DIM];
@@ -2887,10 +2887,10 @@ cvSetHistBinRanges( CvHistogram* hist, float** ranges, int uniform )
     int i, j;
 
     if( !ranges )
-        CV_Error( CV_StsNullPtr, "NULL ranges pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL ranges pointer" );
 
     if( !CV_IS_HIST(hist) )
-        CV_Error( CV_StsBadArg, "Invalid histogram header" );
+        CV_Error( cv::Error::StsBadArg, "Invalid histogram header" );
 
     dims = cvGetDims( hist->bins, size );
     for( i = 0; i < dims; i++ )
@@ -2901,7 +2901,7 @@ cvSetHistBinRanges( CvHistogram* hist, float** ranges, int uniform )
         for( i = 0; i < dims; i++ )
         {
             if( !ranges[i] )
-                CV_Error( CV_StsNullPtr, "One of <ranges> elements is NULL" );
+                CV_Error( cv::Error::StsNullPtr, "One of <ranges> elements is NULL" );
             hist->thresh[i][0] = ranges[i][0];
             hist->thresh[i][1] = ranges[i][1];
         }
@@ -2925,13 +2925,13 @@ cvSetHistBinRanges( CvHistogram* hist, float** ranges, int uniform )
             float val0 = -FLT_MAX;
 
             if( !ranges[i] )
-                CV_Error( CV_StsNullPtr, "One of <ranges> elements is NULL" );
+                CV_Error( cv::Error::StsNullPtr, "One of <ranges> elements is NULL" );
 
             for( j = 0; j <= size[i]; j++ )
             {
                 float val = ranges[i][j];
                 if( val <= val0 )
-                    CV_Error(CV_StsOutOfRange, "Bin ranges should go in ascenting order");
+                    CV_Error(cv::Error::StsOutOfRange, "Bin ranges should go in ascenting order");
                 val0 = dim_ranges[j] = val;
             }
 
@@ -2949,10 +2949,10 @@ CV_IMPL void
 cvCalcArrHist( CvArr** img, CvHistogram* hist, int accumulate, const CvArr* mask )
 {
     if( !CV_IS_HIST(hist))
-        CV_Error( CV_StsBadArg, "Bad histogram pointer" );
+        CV_Error( cv::Error::StsBadArg, "Bad histogram pointer" );
 
     if( !img )
-        CV_Error( CV_StsNullPtr, "Null double array pointer" );
+        CV_Error( cv::Error::StsNullPtr, "Null double array pointer" );
 
     int size[CV_MAX_DIM];
     int i, dims = cvGetDims( hist->bins, size);
@@ -3015,10 +3015,10 @@ CV_IMPL void
 cvCalcArrBackProject( CvArr** img, CvArr* dst, const CvHistogram* hist )
 {
     if( !CV_IS_HIST(hist))
-        CV_Error( CV_StsBadArg, "Bad histogram pointer" );
+        CV_Error( cv::Error::StsBadArg, "Bad histogram pointer" );
 
     if( !img )
-        CV_Error( CV_StsNullPtr, "Null double array pointer" );
+        CV_Error( cv::Error::StsNullPtr, "Null double array pointer" );
 
     int size[CV_MAX_DIM];
     int i, dims = cvGetDims( hist->bins, size );
@@ -3078,21 +3078,21 @@ cvCalcArrBackProjectPatch( CvArr** arr, CvArr* dst, CvSize patch_size, CvHistogr
     cv::Size size;
 
     if( !CV_IS_HIST(hist))
-        CV_Error( CV_StsBadArg, "Bad histogram pointer" );
+        CV_Error( cv::Error::StsBadArg, "Bad histogram pointer" );
 
     if( !arr )
-        CV_Error( CV_StsNullPtr, "Null double array pointer" );
+        CV_Error( cv::Error::StsNullPtr, "Null double array pointer" );
 
     if( norm_factor <= 0 )
-        CV_Error( CV_StsOutOfRange,
+        CV_Error( cv::Error::StsOutOfRange,
                   "Bad normalization factor (set it to 1.0 if unsure)" );
 
     if( patch_size.width <= 0 || patch_size.height <= 0 )
-        CV_Error( CV_StsBadSize, "The patch width and height must be positive" );
+        CV_Error( cv::Error::StsBadSize, "The patch width and height must be positive" );
 
     dims = cvGetDims( hist->bins );
     if (dims < 1)
-        CV_Error( CV_StsOutOfRange, "Invalid number of dimensions");
+        CV_Error( cv::Error::StsOutOfRange, "Invalid number of dimensions");
     cvNormalizeHist( hist, norm_factor );
 
     for( i = 0; i < dims; i++ )
@@ -3105,11 +3105,11 @@ cvCalcArrBackProjectPatch( CvArr** arr, CvArr* dst, CvSize patch_size, CvHistogr
 
     dstmat = cvGetMat( dst, &dststub, 0, 0 );
     if( CV_MAT_TYPE( dstmat->type ) != CV_32FC1 )
-        CV_Error( CV_StsUnsupportedFormat, "Resultant image must have 32fC1 type" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "Resultant image must have 32fC1 type" );
 
     if( dstmat->cols != img[0]->width - patch_size.width + 1 ||
         dstmat->rows != img[0]->height - patch_size.height + 1 )
-        CV_Error( CV_StsUnmatchedSizes,
+        CV_Error( cv::Error::StsUnmatchedSizes,
             "The output map must be (W-w+1 x H-h+1), "
             "where the input images are (W x H) each and the patch is (w x h)" );
 
@@ -3146,18 +3146,18 @@ cvCalcBayesianProb( CvHistogram** src, int count, CvHistogram** dst )
     int i;
 
     if( !src || !dst )
-        CV_Error( CV_StsNullPtr, "NULL histogram array pointer" );
+        CV_Error( cv::Error::StsNullPtr, "NULL histogram array pointer" );
 
     if( count < 2 )
-        CV_Error( CV_StsOutOfRange, "Too small number of histograms" );
+        CV_Error( cv::Error::StsOutOfRange, "Too small number of histograms" );
 
     for( i = 0; i < count; i++ )
     {
         if( !CV_IS_HIST(src[i]) || !CV_IS_HIST(dst[i]) )
-            CV_Error( CV_StsBadArg, "Invalid histogram header" );
+            CV_Error( cv::Error::StsBadArg, "Invalid histogram header" );
 
         if( !CV_IS_MATND(src[i]->bins) || !CV_IS_MATND(dst[i]->bins) )
-            CV_Error( CV_StsBadArg, "The function supports dense histograms only" );
+            CV_Error( cv::Error::StsBadArg, "The function supports dense histograms only" );
     }
 
     cvZero( dst[0]->bins );
@@ -3178,10 +3178,10 @@ cvCalcProbDensity( const CvHistogram* hist, const CvHistogram* hist_mask,
                    CvHistogram* hist_dens, double scale )
 {
     if( scale <= 0 )
-        CV_Error( CV_StsOutOfRange, "scale must be positive" );
+        CV_Error( cv::Error::StsOutOfRange, "scale must be positive" );
 
     if( !CV_IS_HIST(hist) || !CV_IS_HIST(hist_mask) || !CV_IS_HIST(hist_dens) )
-        CV_Error( CV_StsBadArg, "Invalid histogram pointer[s]" );
+        CV_Error( cv::Error::StsBadArg, "Invalid histogram pointer[s]" );
 
     {
         CvArr* arrs[] = { hist->bins, hist_mask->bins, hist_dens->bins };
@@ -3191,7 +3191,7 @@ cvCalcProbDensity( const CvHistogram* hist, const CvHistogram* hist_mask,
         cvInitNArrayIterator( 3, arrs, 0, stubs, &iterator );
 
         if( CV_MAT_TYPE(iterator.hdr[0]->type) != CV_32FC1 )
-            CV_Error( CV_StsUnsupportedFormat, "All histograms must have 32fC1 type" );
+            CV_Error( cv::Error::StsUnsupportedFormat, "All histograms must have 32fC1 type" );
 
         do
         {
@@ -3452,6 +3452,8 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
     CV_OVX_RUN(!ovx::skipSmallImages<VX_KERNEL_EQUALIZE_HISTOGRAM>(src.cols, src.rows),
                openvx_equalize_hist(src, dst))
 
+    CALL_HAL(equalizeHist, cv_hal_equalize_hist, src.data, src.step, dst.data, dst.step, src.cols, src.rows);
+
     Mutex histogramLockInstance;
 
     const int hist_sz = EqualizeHistCalcHist_Invoker::HIST_SZ;
@@ -3533,7 +3535,7 @@ static void *icvReadHist( CvFileStorage * fs, CvFileNode * node )
         int i, sizes[CV_MAX_DIM];
 
         if(!CV_IS_MATND(mat))
-            CV_Error( CV_StsError, "Expected CvMatND");
+            CV_Error( cv::Error::StsError, "Expected CvMatND");
 
         for(i=0; i<mat->dims; i++)
             sizes[i] = mat->dim[i].size;
@@ -3554,7 +3556,7 @@ static void *icvReadHist( CvFileStorage * fs, CvFileNode * node )
     {
         h->bins = cvReadByName( fs, node, "bins" );
         if(!CV_IS_SPARSE_MAT(h->bins)){
-            CV_Error( CV_StsError, "Unknown Histogram type");
+            CV_Error( cv::Error::StsError, "Unknown Histogram type");
         }
     }
 
@@ -3571,7 +3573,7 @@ static void *icvReadHist( CvFileStorage * fs, CvFileNode * node )
 
         thresh_node = cvGetFileNodeByName( fs, node, "thresh" );
         if(!thresh_node)
-            CV_Error( CV_StsError, "'thresh' node is missing");
+            CV_Error( cv::Error::StsError, "'thresh' node is missing");
         cvStartReadRawData( fs, thresh_node, &reader );
 
         if(is_uniform)
diff --git a/modules/imgproc/src/hough.cpp b/modules/imgproc/src/hough.cpp
index 96ba338f30fb..523e4491c5f3 100644
--- a/modules/imgproc/src/hough.cpp
+++ b/modules/imgproc/src/hough.cpp
@@ -1156,13 +1156,13 @@ class HoughCirclesAccumInvoker : public ParallelLoopBody
 
             for(; x < numCols; ++x )
             {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 {
                     v_uint8 v_zero = vx_setzero_u8();
 
-                    for(; x <= numCols - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes) {
-                        v_uint8 v_edge1 = (vx_load(edgeData + x                  ) != v_zero);
-                        v_uint8 v_edge2 = (vx_load(edgeData + x + v_uint8::nlanes) != v_zero);
+                    for(; x <= numCols - 2*VTraits<v_uint8>::vlanes(); x += 2*VTraits<v_uint8>::vlanes()) {
+                        v_uint8 v_edge1 = (v_ne(vx_load(edgeData + x), v_zero));
+                        v_uint8 v_edge2 = (v_ne(vx_load(edgeData + x + VTraits<v_uint8>::vlanes()), v_zero));
 
                         if(v_check_any(v_edge1))
                         {
@@ -1172,7 +1172,7 @@ class HoughCirclesAccumInvoker : public ParallelLoopBody
 
                         if(v_check_any(v_edge2))
                         {
-                            x += v_uint8::nlanes + v_scan_forward(v_edge2);
+                            x += VTraits<v_uint8>::vlanes() + v_scan_forward(v_edge2);
                             goto _next_step;
                         }
                     }
@@ -1183,7 +1183,7 @@ class HoughCirclesAccumInvoker : public ParallelLoopBody
 
                 if(x == numCols)
                     continue;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 _next_step:
 #endif
                 float vx, vy;
@@ -1514,7 +1514,7 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointList>::filterCircles(const Po
     int nzCount = 0;
     const Point* nz_ = &nz[0];
     int j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     {
         const v_float32 v_minRadius2 = vx_setall_f32(minRadius2);
         const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2);
@@ -1522,9 +1522,9 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointList>::filterCircles(const Po
         v_float32 v_curCenterX = vx_setall_f32(curCenter.x);
         v_float32 v_curCenterY = vx_setall_f32(curCenter.y);
 
-        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes];
-        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes];
-        for(; j <= nzSz - v_float32::nlanes; j += v_float32::nlanes)
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[VTraits<v_float32>::max_nlanes];
+        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[VTraits<v_int32>::max_nlanes];
+        for(; j <= nzSz - VTraits<v_float32>::vlanes(); j += VTraits<v_float32>::vlanes())
         {
             v_float32 v_nzX, v_nzY;
             v_load_deinterleave((const float*)&nz_[j], v_nzX, v_nzY); // FIXIT use proper datatype
@@ -1532,16 +1532,16 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointList>::filterCircles(const Po
             v_float32 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX));
             v_float32 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY));
 
-            v_float32 v_dx = v_x - v_curCenterX;
-            v_float32 v_dy = v_y - v_curCenterY;
+            v_float32 v_dx = v_sub(v_x, v_curCenterX);
+            v_float32 v_dy = v_sub(v_y, v_curCenterY);
 
-            v_float32 v_r2 = (v_dx * v_dx) + (v_dy * v_dy);
-            v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2);
+            v_float32 v_r2 = v_add(v_mul(v_dx, v_dx), v_mul(v_dy, v_dy));
+            v_float32 vmask = v_and(v_le(v_minRadius2, v_r2), v_le(v_r2, v_maxRadius2));
             if (v_check_any(vmask))
             {
                 v_store_aligned(rmask, v_reinterpret_as_s32(vmask));
                 v_store_aligned(rbuf, v_r2);
-                for (int i = 0; i < v_int32::nlanes; ++i)
+                for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
                     if (rmask[i]) ddata[nzCount++] = rbuf[i];
             }
         }
@@ -1573,13 +1573,13 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi
     const Range xOuter = Range(std::max(int(curCenter.x - rOuter), 0), std::min(int(curCenter.x + rOuter), positions.cols));
     const Range yOuter = Range(std::max(int(curCenter.y - rOuter), 0), std::min(int(curCenter.y + rOuter), positions.rows));
 
-#if CV_SIMD
-    float v_seq[v_float32::nlanes];
-    for (int i = 0; i < v_float32::nlanes; ++i)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    float v_seq[VTraits<v_float32>::max_nlanes];
+    for (int i = 0; i < VTraits<v_float32>::vlanes(); ++i)
         v_seq[i] = (float)i;
     const v_float32 v_minRadius2 = vx_setall_f32(minRadius2);
     const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2);
-    const v_float32 v_curCenterX_0123 = vx_setall_f32(curCenter.x) - vx_load(v_seq);
+    const v_float32 v_curCenterX_0123 = v_sub(vx_setall_f32(curCenter.x), vx_load(v_seq));
 #endif
 
     for (int y = yOuter.start; y < yOuter.end; y++)
@@ -1589,27 +1589,27 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi
         float dy2 = dy * dy;
 
         int x = xOuter.start;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         {
             const v_float32 v_dy2 = vx_setall_f32(dy2);
             const v_uint32 v_zero_u32 = vx_setall_u32(0);
-            float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes];
-            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes];
-            for (; x <= xOuter.end - v_float32::nlanes; x += v_float32::nlanes)
+            float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[VTraits<v_float32>::max_nlanes];
+            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[VTraits<v_int32>::max_nlanes];
+            for (; x <= xOuter.end - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
             {
                 v_uint32 v_mask = vx_load_expand_q(ptr + x);
-                v_mask = v_mask != v_zero_u32;
+                v_mask = v_ne(v_mask, v_zero_u32);
 
                 v_float32 v_x = v_cvt_f32(vx_setall_s32(x));
-                v_float32 v_dx = v_x - v_curCenterX_0123;
+                v_float32 v_dx = v_sub(v_x, v_curCenterX_0123);
 
-                v_float32 v_r2 = (v_dx * v_dx) + v_dy2;
-                v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask);
+                v_float32 v_r2 = v_add(v_mul(v_dx, v_dx), v_dy2);
+                v_float32 vmask = v_and(v_and(v_le(v_minRadius2, v_r2), v_le(v_r2, v_maxRadius2)), v_reinterpret_as_f32(v_mask));
                 if (v_check_any(vmask))
                 {
                     v_store_aligned(rmask, v_reinterpret_as_s32(vmask));
                     v_store_aligned(rbuf, v_r2);
-                    for (int i = 0; i < v_int32::nlanes; ++i)
+                    for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
                         if (rmask[i]) ddata[nzCount++] = rbuf[i];
                 }
             }
@@ -2396,11 +2396,11 @@ cvHoughLines2( CvArr* src_image, void* lineStorage, int method,
         mat = (CvMat*)lineStorage;
 
         if( !CV_IS_MAT_CONT( mat->type ) || (mat->rows != 1 && mat->cols != 1) )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
             "The destination matrix should be continuous and have a single row or a single column" );
 
         if( CV_MAT_TYPE( mat->type ) != lineType )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
             "The destination matrix data type is inappropriate, see the manual" );
 
         lines = cvMakeSeqHeaderForArray( lineType, sizeof(CvSeq), elemSize, mat->data.ptr,
@@ -2427,7 +2427,7 @@ cvHoughLines2( CvArr* src_image, void* lineStorage, int method,
                 threshold, iparam1, iparam2, l4, linesMax );
         break;
     default:
-        CV_Error( CV_StsBadArg, "Unrecognized method id" );
+        CV_Error( cv::Error::StsBadArg, "Unrecognized method id" );
     }
 
     int nlines = (int)(l2.size() + l4.size());
@@ -2473,7 +2473,7 @@ cvHoughCircles( CvArr* src_image, void* circle_storage,
     cv::Mat src = cv::cvarrToMat(src_image), circles_mat;
 
     if( !circle_storage )
-        CV_Error( CV_StsNullPtr, "NULL destination" );
+        CV_Error( cv::Error::StsNullPtr, "NULL destination" );
 
     bool isStorage = isStorageOrMat(circle_storage);
 
@@ -2490,7 +2490,7 @@ cvHoughCircles( CvArr* src_image, void* circle_storage,
 
         if( !CV_IS_MAT_CONT( mat->type ) || (mat->rows != 1 && mat->cols != 1) ||
             CV_MAT_TYPE(mat->type) != CV_32FC3 )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
                       "The destination matrix should be continuous and have a single row or a single column" );
 
         circles = cvMakeSeqHeaderForArray( CV_32FC3, sizeof(CvSeq), sizeof(float)*3,
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 268555badac8..d7c9c64c3caf 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -206,7 +206,7 @@ static void initInterTab1D(int method, float* tab, int tabsz)
             interpolateLanczos4( i*scale, tab );
     }
     else
-        CV_Error( CV_StsBadArg, "Unknown interpolation method" );
+        CV_Error( cv::Error::StsBadArg, "Unknown interpolation method" );
 }
 
 
@@ -223,7 +223,7 @@ static const void* initInterTab2D( int method, bool fixpt )
     else if( method == INTER_LANCZOS4 )
         tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
     else
-        CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );
+        CV_Error( cv::Error::StsBadArg, "Unknown/unsupported interpolation type" );
 
     if( !inittab[method] )
     {
@@ -326,9 +326,9 @@ static inline int clip(int x, int a, int b)
 *                       General warping (affine, perspective, remap)                     *
 \****************************************************************************************/
 
-template<typename T>
+template<typename T, bool isRelative>
 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
-                          int borderType, const Scalar& _borderValue )
+                          int borderType, const Scalar& _borderValue, const Point& _offset )
 {
     Size ssize = _src.size(), dsize = _dst.size();
     const int cn = _src.channels();
@@ -341,7 +341,7 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
 
     unsigned width1 = ssize.width, height1 = ssize.height;
 
-    if( _dst.isContinuous() && _xy.isContinuous() )
+    if( _dst.isContinuous() && _xy.isContinuous() && !isRelative )
     {
         dsize.width *= dsize.height;
         dsize.height = 1;
@@ -351,12 +351,13 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
     {
         T* D = _dst.ptr<T>(dy);
         const short* XY = _xy.ptr<short>(dy);
-
+        const int off_y = isRelative ? (_offset.y+dy) : 0;
         if( cn == 1 )
         {
             for(int dx = 0; dx < dsize.width; dx++ )
             {
-                int sx = XY[dx*2], sy = XY[dx*2+1];
+                const int off_x = isRelative ? (_offset.x+dx) : 0;
+                int sx = XY[dx*2]+off_x, sy = XY[dx*2+1]+off_y;
                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
                     D[dx] = S0[sy*sstep + sx];
                 else
@@ -382,7 +383,8 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
         {
             for(int dx = 0; dx < dsize.width; dx++, D += cn )
             {
-                int sx = XY[dx*2], sy = XY[dx*2+1];
+                const int off_x = isRelative ? (_offset.x+dx) : 0;
+                int sx = XY[dx*2]+off_x, sy = XY[dx*2+1]+off_y;
                 const T *S;
                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
                 {
@@ -427,11 +429,11 @@ static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
     }
 }
 
-
+template<bool>
 struct RemapNoVec
 {
     int operator()( const Mat&, void*, const short*, const ushort*,
-                    const void*, int ) const { return 0; }
+                    const void*, int, cv::Point& ) const { return 0; }
 };
 
 #if CV_SIMD128
@@ -439,13 +441,13 @@ struct RemapNoVec
 typedef unsigned short CV_DECL_ALIGNED(1) unaligned_ushort;
 typedef int CV_DECL_ALIGNED(1) unaligned_int;
 
+template<bool isRelative>
 struct RemapVec_8u
 {
     int operator()( const Mat& _src, void* _dst, const short* XY,
-                    const ushort* FXY, const void* _wtab, int width ) const
+                    const ushort* FXY, const void* _wtab, int width, const Point& _offset ) const
     {
         int cn = _src.channels(), x = 0, sstep = (int)_src.step;
-
         if( (cn != 1 && cn != 3 && cn != 4) || sstep >= 0x8000 )
             return 0;
 
@@ -455,7 +457,7 @@ struct RemapVec_8u
         v_int32x4 delta = v_setall_s32(INTER_REMAP_COEF_SCALE / 2);
         v_int16x8 xy2ofs = v_reinterpret_as_s16(v_setall_s32(cn + (sstep << 16)));
         int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
-        const uchar* src_limit_8bytes = _src.datalimit - v_int16x8::nlanes;
+        const uchar* src_limit_8bytes = _src.datalimit - VTraits<v_int16x8>::vlanes();
 #define CV_PICK_AND_PACK_RGB(ptr, offset, result)  \
         {                                          \
             const uchar* const p = ((const uchar*)ptr) + (offset); \
@@ -483,7 +485,7 @@ struct RemapVec_8u
             v_uint8x16 rrggbbaa, dummy;            \
             v_uint16x8 rrggbbaa8, dummy8;          \
             v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \
-            v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + v_int32x4::nlanes), 0, 0, 0)); \
+            v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + VTraits<v_int32x4>::vlanes()), 0, 0, 0)); \
             v_zip(rgba0, rgba1, rrggbbaa, dummy);  \
             v_expand(rrggbbaa, rrggbbaa8, dummy8); \
             result = v_reinterpret_as_s16(rrggbbaa8); \
@@ -493,12 +495,25 @@ struct RemapVec_8u
                        *(unaligned_ushort*)(base + offset[2]), *(unaligned_ushort*)(base + offset[3]), \
                        0, 0, 0, 0)
 
+        const short _rel_offset_x = static_cast<short>(_offset.x);
+        const short _rel_offset_y = static_cast<short>(_offset.y);
+        v_int16x8 v_dxy0(_rel_offset_x, _rel_offset_y, _rel_offset_x, _rel_offset_y, _rel_offset_x, _rel_offset_y, _rel_offset_x, _rel_offset_y);
+        v_int16x8 v_dxy1 = v_dxy0;
+        v_dxy0 = v_add(v_dxy0, v_int16x8(0, 0, 1, 0, 2, 0, 3, 0));
+        v_dxy1 = v_add(v_dxy1, v_int16x8(4, 0, 5, 0, 6, 0, 7, 0));
         if( cn == 1 )
         {
             for( ; x <= width - 8; x += 8 )
             {
                 v_int16x8 _xy0 = v_load(XY + x*2);
                 v_int16x8 _xy1 = v_load(XY + x*2 + 8);
+                if (isRelative)
+                {
+                    const short x_s16 = static_cast<short>(x);
+                    v_int16x8 v_dxy01(x_s16, 0, x_s16, 0, x_s16, 0, x_s16, 0);
+                    _xy0 = v_add(_xy0, v_add(v_dxy01, v_dxy0));
+                    _xy1 = v_add(_xy1, v_add(v_dxy01, v_dxy1));
+                }
                 v_int32x4 v0, v1, v2, v3, a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2;
 
                 v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs );
@@ -534,8 +549,8 @@ struct RemapVec_8u
                 v3 = v_dotprod(v_reinterpret_as_s16(v3), v_reinterpret_as_s16(d2), delta);
                 v2 = v_dotprod(v_reinterpret_as_s16(v2), v_reinterpret_as_s16(c2), v3);
 
-                v0 = v0 >> INTER_REMAP_COEF_BITS;
-                v2 = v2 >> INTER_REMAP_COEF_BITS;
+                v0 = v_shr<INTER_REMAP_COEF_BITS>(v0);
+                v2 = v_shr<INTER_REMAP_COEF_BITS>(v2);
                 v_pack_u_store(D + x, v_pack(v0, v2));
             }
         }
@@ -545,6 +560,12 @@ struct RemapVec_8u
             {
                 v_int16x8 u0, v0, u1, v1;
                 v_int16x8 _xy0 = v_load(XY + x * 2);
+                if (isRelative)
+                {
+                    const short x_s16 = static_cast<short>(x);
+                    v_int16x8 v_dxy01(x_s16, 0, x_s16, 0, x_s16, 0, x_s16, 0);
+                    _xy0 = v_add(_xy0, v_add(v_dxy01, v_dxy0));
+                }
 
                 v_int32x4 xy0 = v_dotprod(_xy0, xy2ofs);
                 v_store(iofs0, xy0);
@@ -563,8 +584,8 @@ struct RemapVec_8u
                 CV_PICK_AND_PACK_RGB(S0, iofs0[1], u1);
                 CV_PICK_AND_PACK_RGB(S1, iofs0[1], v1);
 
-                v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
-                v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
+                v_int32x4 result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
+                v_int32x4 result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
 
                 result0 = v_rotate_left<1>(result0);
                 v_int16x8 result8 = v_pack(result0, result1);
@@ -581,8 +602,8 @@ struct RemapVec_8u
                 CV_PICK_AND_PACK_RGB(S0, iofs0[3], u1);
                 CV_PICK_AND_PACK_RGB(S1, iofs0[3], v1);
 
-                result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
-                result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
+                result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
+                result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
 
                 result0 = v_rotate_left<1>(result0);
                 result8 = v_pack(result0, result1);
@@ -595,10 +616,17 @@ struct RemapVec_8u
             for( ; x <= width - 4; x += 4, D += 16 )
             {
                 v_int16x8 _xy0 = v_load(XY + x * 2);
+                if (isRelative)
+                {
+                    const short x_s16 = static_cast<short>(x);
+                    v_int16x8 v_dxy01(x_s16, 0, x_s16, 0, x_s16, 0, x_s16, 0);
+                    _xy0 = v_add(_xy0, v_add(v_dxy01, v_dxy0));
+                }
                 v_int16x8 u0, v0, u1, v1;
 
                 v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs );
                 v_store(iofs0, xy0);
+
                 int offset0 = FXY[x] * 16;
                 int offset1 = FXY[x + 1] * 16;
                 int offset2 = FXY[x + 2] * 16;
@@ -613,8 +641,8 @@ struct RemapVec_8u
                 CV_PICK_AND_PACK_RGBA(S0, iofs0[1], u1);
                 CV_PICK_AND_PACK_RGBA(S1, iofs0[1], v1);
 
-                v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
-                v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
+                v_int32x4 result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
+                v_int32x4 result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
                 v_int16x8 result8 = v_pack(result0, result1);
                 v_pack_u_store(D, result8);
 
@@ -627,8 +655,8 @@ struct RemapVec_8u
                 CV_PICK_AND_PACK_RGBA(S0, iofs0[3], u1);
                 CV_PICK_AND_PACK_RGBA(S1, iofs0[3], v1);
 
-                result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
-                result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
+                result0 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u0, w00, v_dotprod(v0, w01, delta)));
+                result1 = v_shr<INTER_REMAP_COEF_BITS>(v_dotprod(u1, w10, v_dotprod(v1, w11, delta)));
                 result8 = v_pack(result0, result1);
                 v_pack_u_store(D + 8, result8);
             }
@@ -640,15 +668,14 @@ struct RemapVec_8u
 
 #else
 
-typedef RemapNoVec RemapVec_8u;
+template<bool isRelative> using RemapVec_8u = RemapNoVec<isRelative>;
 
 #endif
 
-
-template<class CastOp, class VecOp, typename AT>
+template<class CastOp, class VecOp, typename AT, bool isRelative>
 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                            const Mat& _fxy, const void* _wtab,
-                           int borderType, const Scalar& _borderValue )
+                           int borderType, const Scalar& _borderValue, const Point& _offset )
 {
     typedef typename CastOp::rtype T;
     typedef typename CastOp::type1 WT;
@@ -678,12 +705,12 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
         const ushort* FXY = _fxy.ptr<ushort>(dy);
         int X0 = 0;
         bool prevInlier = false;
-
+        const int off_y = (isRelative ? (_offset.y+dy) : 0);
         for(int dx = 0; dx <= dsize.width; dx++ )
         {
             bool curInlier = dx < dsize.width ?
-                (unsigned)XY[dx*2] < width1 &&
-                (unsigned)XY[dx*2+1] < height1 : !prevInlier;
+                (unsigned)XY[dx*2]+(isRelative ? (_offset.x+dx) : 0) < width1 &&
+                (unsigned)XY[dx*2+1]+off_y < height1 : !prevInlier;
             if( curInlier == prevInlier )
                 continue;
 
@@ -694,7 +721,8 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
 
             if( !curInlier )
             {
-                int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
+                Point subOffset(_offset.x+dx, _offset.y+dy);
+                int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx, subOffset );
                 D += len*cn;
                 dx += len;
 
@@ -702,7 +730,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                 {
                     for( ; dx < X1; dx++, D++ )
                     {
-                        int sx = XY[dx*2], sy = XY[dx*2+1];
+                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                         const AT* w = wtab + FXY[dx]*4;
                         const T* S = S0 + sy*sstep + sx;
                         *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
@@ -711,7 +739,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                 else if( cn == 2 )
                     for( ; dx < X1; dx++, D += 2 )
                     {
-                        int sx = XY[dx*2], sy = XY[dx*2+1];
+                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                         const AT* w = wtab + FXY[dx]*4;
                         const T* S = S0 + sy*sstep + sx*2;
                         WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
@@ -721,7 +749,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                 else if( cn == 3 )
                     for( ; dx < X1; dx++, D += 3 )
                     {
-                        int sx = XY[dx*2], sy = XY[dx*2+1];
+                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                         const AT* w = wtab + FXY[dx]*4;
                         const T* S = S0 + sy*sstep + sx*3;
                         WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
@@ -732,7 +760,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                 else if( cn == 4 )
                     for( ; dx < X1; dx++, D += 4 )
                     {
-                        int sx = XY[dx*2], sy = XY[dx*2+1];
+                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                         const AT* w = wtab + FXY[dx]*4;
                         const T* S = S0 + sy*sstep + sx*4;
                         WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
@@ -745,7 +773,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                 else
                     for( ; dx < X1; dx++, D += cn )
                     {
-                        int sx = XY[dx*2], sy = XY[dx*2+1];
+                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                         const AT* w = wtab + FXY[dx]*4;
                         const T* S = S0 + sy*sstep + sx*cn;
                         for(int k = 0; k < cn; k++ )
@@ -757,22 +785,47 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
             }
             else
             {
+                if (borderType == BORDER_TRANSPARENT) {
+                    for (; dx < X1; dx++, D += cn) {
+                        if (dx >= dsize.width) continue;
+                        const int sx = XY[dx * 2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx * 2 + 1]+off_y;
+                        // If the mapped point is still within bounds, it did not get computed
+                        // because it lacked 4 neighbors. Still, it can be computed with an
+                        // approximate formula. If it is outside, the point is left untouched.
+                        if (sx >= 0 && sx <= ssize.width - 1 && sy >= 0 && sy <= ssize.height - 1) {
+                            const AT* w = wtab + FXY[dx] * 4;
+                            WT w_tot = 0;
+                            if (sx >= 0 && sy >= 0) w_tot += w[0];
+                            if (sy >= 0 && sx < ssize.width - 1) w_tot += w[1];
+                            if (sx >= 0 && sy < ssize.height - 1) w_tot += w[2];
+                            if (sx < ssize.width - 1 && sy < ssize.height - 1) w_tot += w[3];
+                            if (w_tot == 0.f) continue;
+                            const WT w_tot_ini = (WT)w[0] + w[1] + w[2] + w[3];
+                            const T* S = S0 + sy * sstep + sx * cn;
+                            for (int k = 0; k < cn; k++) {
+                                WT t0 = 0;
+                                if (sx >= 0 && sy >= 0) t0 += S[k] * w[0];
+                                if (sy >= 0 && sx < ssize.width - 1) t0 += S[k + cn] * w[1];
+                                if (sx >= 0 && sy < ssize.height - 1) t0 += S[sstep + k] * w[2];
+                                if (sx < ssize.width - 1 && sy < ssize.height - 1) t0 += S[sstep + k + cn] * w[3];
+                                t0 = (WT)(t0 * (float)w_tot_ini / w_tot);
+                                D[k] = castOp(t0);
+                            }
+                        }
+                    }
+                    continue;
+                }
+
                 if( cn == 1 )
                     for( ; dx < X1; dx++, D++ )
                     {
-                        int sx = XY[dx*2], sy = XY[dx*2+1];
+                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                         if( borderType == BORDER_CONSTANT &&
                             (sx >= ssize.width || sx+1 < 0 ||
                              sy >= ssize.height || sy+1 < 0) )
                         {
                             D[0] = cval[0];
                         }
-                        else if (borderType == BORDER_TRANSPARENT)
-                        {
-                            if (sx < ssize.width && sx >= 0 &&
-                                sy < ssize.height && sy >= 0)
-                                D[0] = S0[sy*sstep + sx];
-                        }
                         else
                         {
                             int sx0, sx1, sy0, sy1;
@@ -806,7 +859,7 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                 else
                     for( ; dx < X1; dx++, D += cn )
                     {
-                        int sx = XY[dx*2], sy = XY[dx*2+1];
+                        int sx = XY[dx*2]+(isRelative ? (_offset.x+dx) : 0), sy = XY[dx*2+1]+off_y;
                         if( borderType == BORDER_CONSTANT &&
                             (sx >= ssize.width || sx+1 < 0 ||
                              sy >= ssize.height || sy+1 < 0) )
@@ -814,13 +867,6 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                             for(int k = 0; k < cn; k++ )
                                 D[k] = cval[k];
                         }
-                        else if (borderType == BORDER_TRANSPARENT)
-                        {
-                            if (sx < ssize.width && sx >= 0 &&
-                                sy < ssize.height && sy >= 0)
-                                for(int k = 0; k < cn; k++ )
-                                    D[k] = S0[sy*sstep + sx*cn + k];
-                        }
                         else
                         {
                             int sx0, sx1, sy0, sy1;
@@ -837,10 +883,6 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
                                 v2 = S0 + sy1*sstep + sx0*cn;
                                 v3 = S0 + sy1*sstep + sx1*cn;
                             }
-                            else if( borderType == BORDER_TRANSPARENT &&
-                                ((unsigned)sx >= (unsigned)(ssize.width-1) ||
-                                (unsigned)sy >= (unsigned)(ssize.height-1)))
-                                continue;
                             else
                             {
                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
@@ -862,10 +904,10 @@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
 }
 
 
-template<class CastOp, typename AT, int ONE>
+template<class CastOp, typename AT, int ONE, bool isRelative>
 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
                           const Mat& _fxy, const void* _wtab,
-                          int borderType, const Scalar& _borderValue )
+                          int borderType, const Scalar& _borderValue, const Point& _offset )
 {
     typedef typename CastOp::rtype T;
     typedef typename CastOp::type1 WT;
@@ -884,7 +926,7 @@ static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
 
     unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
 
-    if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
+    if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() && !isRelative )
     {
         dsize.width *= dsize.height;
         dsize.height = 1;
@@ -895,10 +937,11 @@ static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
         T* D = _dst.ptr<T>(dy);
         const short* XY = _xy.ptr<short>(dy);
         const ushort* FXY = _fxy.ptr<ushort>(dy);
-
+        const int off_y = isRelative ? (_offset.y+dy) : 0;
         for(int dx = 0; dx < dsize.width; dx++, D += cn )
         {
-            int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
+            const int off_x = isRelative ? (_offset.x+dx) : 0;
+            int sx = XY[dx*2]-1+off_x, sy = XY[dx*2+1]-1+off_y;
             const AT* w = wtab + FXY[dx]*16;
             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
             {
@@ -966,10 +1009,10 @@ static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
 }
 
 
-template<class CastOp, typename AT, int ONE>
+template<class CastOp, typename AT, int ONE, bool isRelative>
 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
                            const Mat& _fxy, const void* _wtab,
-                           int borderType, const Scalar& _borderValue )
+                           int borderType, const Scalar& _borderValue, const Point& _offset )
 {
     typedef typename CastOp::rtype T;
     typedef typename CastOp::type1 WT;
@@ -988,7 +1031,7 @@ static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
 
     unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
 
-    if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
+    if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() && !isRelative )
     {
         dsize.width *= dsize.height;
         dsize.height = 1;
@@ -999,10 +1042,11 @@ static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
         T* D = _dst.ptr<T>(dy);
         const short* XY = _xy.ptr<short>(dy);
         const ushort* FXY = _fxy.ptr<ushort>(dy);
-
+        const int off_y = isRelative ? (_offset.y+dy) : 0;
         for(int dx = 0; dx < dsize.width; dx++, D += cn )
         {
-            int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
+            const int off_x = isRelative ? (_offset.x+dx) : 0;
+            int sx = XY[dx*2]-3+off_x, sy = XY[dx*2+1]-3+off_y;
             const AT* w = wtab + FXY[dx]*64;
             const T* S = S0 + sy*sstep + sx*cn;
             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
@@ -1077,11 +1121,11 @@ static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
 
 
 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
-                            int borderType, const Scalar& _borderValue );
+                            int borderType, const Scalar& _borderValue, const Point& _offset);
 
 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
                           const Mat& _fxy, const void* _wtab,
-                          int borderType, const Scalar& _borderValue);
+                          int borderType, const Scalar& _borderValue, const Point& _offset);
 
 class RemapInvoker :
     public ParallelLoopBody
@@ -1150,7 +1194,7 @@ class RemapInvoker :
 
                             #if CV_SIMD128
                             {
-                                int span = v_float32x4::nlanes;
+                                int span = VTraits<v_float32x4>::vlanes();
                                 for( ; x1 <= bcols - span * 2; x1 += span * 2 )
                                 {
                                     v_int32x4 ix0 = v_round(v_load(sX + x1));
@@ -1172,7 +1216,7 @@ class RemapInvoker :
                             }
                         }
                     }
-                    nnfunc( *src, dpart, bufxy, borderType, borderValue );
+                    nnfunc( *src, dpart, bufxy, borderType, borderValue, Point(x, y) );
                     continue;
                 }
 
@@ -1192,9 +1236,9 @@ class RemapInvoker :
                         #if CV_SIMD128
                         {
                             v_uint16x8 v_scale = v_setall_u16(INTER_TAB_SIZE2 - 1);
-                            int span = v_uint16x8::nlanes;
+                            int span = VTraits<v_uint16x8>::vlanes();
                             for( ; x1 <= bcols - span; x1 += span )
-                                v_store((unsigned short*)(A + x1), v_load(sA + x1) & v_scale);
+                                v_store((unsigned short*)(A + x1), v_and(v_load(sA + x1), v_scale));
                         }
                         #endif
                         for( ; x1 < bcols; x1++ )
@@ -1210,16 +1254,16 @@ class RemapInvoker :
                         {
                             v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
                             v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1);
-                            int span = v_float32x4::nlanes;
+                            int span = VTraits<v_float32x4>::vlanes();
                             for( ; x1 <= bcols - span * 2; x1 += span * 2 )
                             {
-                                v_int32x4 v_sx0 = v_round(v_scale * v_load(sX + x1));
-                                v_int32x4 v_sy0 = v_round(v_scale * v_load(sY + x1));
-                                v_int32x4 v_sx1 = v_round(v_scale * v_load(sX + x1 + span));
-                                v_int32x4 v_sy1 = v_round(v_scale * v_load(sY + x1 + span));
-                                v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_sx0 & v_scale2, v_sx1 & v_scale2));
-                                v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_sy0 & v_scale2, v_sy1 & v_scale2));
-                                v_uint16x8 v_v = v_shl<INTER_BITS>(v_sy8) | (v_sx8);
+                                v_int32x4 v_sx0 = v_round(v_mul(v_scale, v_load(sX + x1)));
+                                v_int32x4 v_sy0 = v_round(v_mul(v_scale, v_load(sY + x1)));
+                                v_int32x4 v_sx1 = v_round(v_mul(v_scale, v_load(sX + x1 + span)));
+                                v_int32x4 v_sy1 = v_round(v_mul(v_scale, v_load(sY + x1 + span)));
+                                v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_and(v_sx0, v_scale2), v_and(v_sx1, v_scale2)));
+                                v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_and(v_sy0, v_scale2), v_and(v_sy1, v_scale2)));
+                                v_uint16x8 v_v = v_or(v_shl<INTER_BITS>(v_sy8), v_sx8);
                                 v_store(A + x1, v_v);
 
                                 v_int16x8 v_d0 = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
@@ -1247,18 +1291,18 @@ class RemapInvoker :
                         {
                             v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
                             v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1), v_scale3 = v_setall_s32(INTER_TAB_SIZE);
-                            int span = v_float32x4::nlanes;
+                            int span = VTraits<v_float32x4>::vlanes();
                             for( ; x1 <= bcols - span * 2; x1 += span * 2 )
                             {
                                 v_float32x4 v_fx, v_fy;
                                 v_load_deinterleave(sXY + (x1 << 1), v_fx, v_fy);
-                                v_int32x4 v_sx0 = v_round(v_fx * v_scale);
-                                v_int32x4 v_sy0 = v_round(v_fy * v_scale);
+                                v_int32x4 v_sx0 = v_round(v_mul(v_fx, v_scale));
+                                v_int32x4 v_sy0 = v_round(v_mul(v_fy, v_scale));
                                 v_load_deinterleave(sXY + ((x1 + span) << 1), v_fx, v_fy);
-                                v_int32x4 v_sx1 = v_round(v_fx * v_scale);
-                                v_int32x4 v_sy1 = v_round(v_fy * v_scale);
-                                v_int32x4 v_v0 = v_muladd(v_scale3, (v_sy0 & v_scale2), (v_sx0 & v_scale2));
-                                v_int32x4 v_v1 = v_muladd(v_scale3, (v_sy1 & v_scale2), (v_sx1 & v_scale2));
+                                v_int32x4 v_sx1 = v_round(v_mul(v_fx, v_scale));
+                                v_int32x4 v_sy1 = v_round(v_mul(v_fy, v_scale));
+                                v_int32x4 v_v0 = v_muladd(v_scale3, (v_and(v_sy0, v_scale2)), (v_and(v_sx0, v_scale2)));
+                                v_int32x4 v_v1 = v_muladd(v_scale3, (v_and(v_sy1, v_scale2)), (v_and(v_sx1, v_scale2)));
                                 v_uint16x8 v_v8 = v_reinterpret_as_u16(v_pack(v_v0, v_v1));
                                 v_store(A + x1, v_v8);
                                 v_int16x8 v_dx = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
@@ -1279,7 +1323,7 @@ class RemapInvoker :
                         }
                     }
                 }
-                ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
+                ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue, Point(x, y));
             }
         }
     }
@@ -1301,6 +1345,9 @@ class RemapInvoker :
 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
                       int interpolation, int borderType, const Scalar& borderValue)
 {
+    const bool hasRelativeFlag = ((interpolation & WARP_RELATIVE_MAP) != 0);
+    interpolation &= ~WARP_RELATIVE_MAP;
+
     const ocl::Device & dev = ocl::Device::getDefault();
     int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
             rowsPerWI = dev.isIntel() ? 4 : 1;
@@ -1340,17 +1387,18 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
     static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
     static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
                            "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
-    String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
+    String buildOptions = format("-D %s -D %s -D T=%s -D ROWS_PER_WI=%d -D WARP_RELATIVE=%d",
                                  interMap[interpolation], borderMap[borderType],
-                                 ocl::typeToStr(type), rowsPerWI);
+                                 ocl::typeToStr(type), rowsPerWI,
+                                 hasRelativeFlag ? 1 : 0);
 
     if (interpolation != INTER_NEAREST)
     {
         char cvt[3][50];
         int wdepth = std::max(CV_32F, depth);
         buildOptions = buildOptions
-                      + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
-                               " -D convertToWT2=%s -D WT2=%s",
+                      + format(" -D WT=%s -D CONVERT_TO_T=%s -D CONVERT_TO_WT=%s"
+                               " -D CONVERT_TO_WT2=%s -D WT2=%s",
                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
                                ocl::convertTypeStr(wdepth, depth, cn, cvt[0], sizeof(cvt[0])),
                                ocl::convertTypeStr(depth, wdepth, cn, cvt[1], sizeof(cvt[1])),
@@ -1359,7 +1407,7 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
     }
     int scalarcn = cn == 3 ? 4 : cn;
     int sctype = CV_MAKETYPE(depth, scalarcn);
-    buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
+    buildOptions += format(" -D T=%s -D T1=%s -D CN=%d -D ST=%s -D SRC_DEPTH=%d",
                            ocl::typeToStr(type), ocl::typeToStr(depth),
                            cn, ocl::typeToStr(sctype), depth);
 
@@ -1401,7 +1449,7 @@ static bool ocl_linearPolar(InputArray _src, OutputArray _dst,
     size_t h = dsize.height;
     String buildOptions;
     unsigned mem_size = 32;
-    if (flags & CV_WARP_INVERSE_MAP)
+    if (flags & cv::WARP_INVERSE_MAP)
     {
         buildOptions = "-D InverseMap";
     }
@@ -1416,7 +1464,7 @@ static bool ocl_linearPolar(InputArray _src, OutputArray _dst,
     ocl::KernelArg  ocl_cp_sp = ocl::KernelArg::PtrReadWrite(cp_sp);
     ocl::KernelArg ocl_r = ocl::KernelArg::PtrReadWrite(r);
 
-    if (!(flags & CV_WARP_INVERSE_MAP))
+    if (!(flags & cv::WARP_INVERSE_MAP))
     {
 
 
@@ -1447,14 +1495,14 @@ static bool ocl_linearPolar(InputArray _src, OutputArray _dst,
     size_t globalThreads[2] = { (size_t)dsize.width , (size_t)dsize.height };
     size_t localThreads[2] = { mem_size , mem_size };
     k.run(2, globalThreads, localThreads, false);
-    remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
+    remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
     return true;
 }
 static bool ocl_logPolar(InputArray _src, OutputArray _dst,
     Point2f center, double M, int flags)
 {
     if (M <= 0)
-        CV_Error(CV_StsOutOfRange, "M should be >0");
+        CV_Error(cv::Error::StsOutOfRange, "M should be >0");
     UMat src_with_border; // don't scope this variable (it holds image data)
 
     UMat mapx, mapy, r, cp_sp;
@@ -1470,7 +1518,7 @@ static bool ocl_logPolar(InputArray _src, OutputArray _dst,
     size_t h = dsize.height;
     String buildOptions;
     unsigned mem_size = 32;
-    if (flags & CV_WARP_INVERSE_MAP)
+    if (flags & cv::WARP_INVERSE_MAP)
     {
         buildOptions = "-D InverseMap";
     }
@@ -1487,7 +1535,7 @@ static bool ocl_logPolar(InputArray _src, OutputArray _dst,
     ocl::KernelArg  ocl_cp_sp = ocl::KernelArg::PtrReadWrite(cp_sp);
     ocl::KernelArg ocl_r = ocl::KernelArg::PtrReadWrite(r);
 
-    if (!(flags & CV_WARP_INVERSE_MAP))
+    if (!(flags & cv::WARP_INVERSE_MAP))
     {
 
 
@@ -1518,7 +1566,7 @@ static bool ocl_logPolar(InputArray _src, OutputArray _dst,
     size_t globalThreads[2] = { (size_t)dsize.width , (size_t)dsize.height };
     size_t localThreads[2] = { mem_size , mem_size };
     k.run(2, globalThreads, localThreads, false);
-    remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
+    remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
     return true;
 }
 #endif
@@ -1601,12 +1649,12 @@ static bool openvx_remap(Mat src, Mat dst, Mat map1, Mat map2, int interpolation
     }
     catch (const ivx::RuntimeError & e)
     {
-        CV_Error(CV_StsInternal, e.what());
+        CV_Error(cv::Error::StsInternal, e.what());
         return false;
     }
     catch (const ivx::WrapperError & e)
     {
-        CV_Error(CV_StsInternal, e.what());
+        CV_Error(cv::Error::StsInternal, e.what());
         return false;
     }
     return true;
@@ -1673,38 +1721,73 @@ void cv::remap( InputArray _src, OutputArray _dst,
 {
     CV_INSTRUMENT_REGION();
 
-    static RemapNNFunc nn_tab[] =
+    const bool hasRelativeFlag = ((interpolation & WARP_RELATIVE_MAP) != 0);
+
+    static RemapNNFunc nn_tab[2][8] =
     {
-        remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
-        remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
+        {
+            remapNearest<uchar, false>, remapNearest<schar, false>, remapNearest<ushort, false>, remapNearest<short, false>,
+            remapNearest<int, false>, remapNearest<float, false>, remapNearest<double, false>, 0
+        },
+        {
+            remapNearest<uchar, true>, remapNearest<schar, true>, remapNearest<ushort, true>, remapNearest<short, true>,
+            remapNearest<int, true>, remapNearest<float, true>, remapNearest<double, true>, 0
+        }
     };
 
-    static RemapFunc linear_tab[] =
+    static RemapFunc linear_tab[2][8] =
     {
-        remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
-        remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
-        remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
-        remapBilinear<Cast<float, float>, RemapNoVec, float>,
-        remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
+        {
+            remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u<false>, short, false>, 0,
+            remapBilinear<Cast<float, ushort>, RemapNoVec<false>, float, false>,
+            remapBilinear<Cast<float, short>, RemapNoVec<false>, float, false>, 0,
+            remapBilinear<Cast<float, float>, RemapNoVec<false>, float, false>,
+            remapBilinear<Cast<double, double>, RemapNoVec<false>, float, false>, 0
+        },
+        {
+            remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u<true>, short, true>, 0,
+            remapBilinear<Cast<float, ushort>, RemapNoVec<true>, float, true>,
+            remapBilinear<Cast<float, short>, RemapNoVec<true>, float, true>, 0,
+            remapBilinear<Cast<float, float>, RemapNoVec<true>, float, true>,
+            remapBilinear<Cast<double, double>, RemapNoVec<true>, float, true>, 0
+        }
     };
 
-    static RemapFunc cubic_tab[] =
+    static RemapFunc cubic_tab[2][8] =
     {
-        remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
-        remapBicubic<Cast<float, ushort>, float, 1>,
-        remapBicubic<Cast<float, short>, float, 1>, 0,
-        remapBicubic<Cast<float, float>, float, 1>,
-        remapBicubic<Cast<double, double>, float, 1>, 0
-    };
+        {
+            remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, false>, 0,
+            remapBicubic<Cast<float, ushort>, float, 1, false>,
+            remapBicubic<Cast<float, short>, float, 1, false>, 0,
+            remapBicubic<Cast<float, float>, float, 1, false>,
+            remapBicubic<Cast<double, double>, float, 1, false>, 0
+        },
+        {
+            remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, true>, 0,
+            remapBicubic<Cast<float, ushort>, float, 1, true>,
+            remapBicubic<Cast<float, short>, float, 1, true>, 0,
+            remapBicubic<Cast<float, float>, float, 1, true>,
+            remapBicubic<Cast<double, double>, float, 1, true>, 0
+        }
+};
 
-    static RemapFunc lanczos4_tab[] =
+    static RemapFunc lanczos4_tab[2][8] =
     {
-        remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
-        remapLanczos4<Cast<float, ushort>, float, 1>,
-        remapLanczos4<Cast<float, short>, float, 1>, 0,
-        remapLanczos4<Cast<float, float>, float, 1>,
-        remapLanczos4<Cast<double, double>, float, 1>, 0
-    };
+        {
+            remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, false>, 0,
+            remapLanczos4<Cast<float, ushort>, float, 1, false>,
+            remapLanczos4<Cast<float, short>, float, 1, false>, 0,
+            remapLanczos4<Cast<float, float>, float, 1, false>,
+            remapLanczos4<Cast<double, double>, float, 1, false>, 0
+        },
+        {
+            remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE, true>, 0,
+            remapLanczos4<Cast<float, ushort>, float, 1, true>,
+            remapLanczos4<Cast<float, short>, float, 1, true>, 0,
+            remapLanczos4<Cast<float, float>, float, 1, true>,
+            remapLanczos4<Cast<double, double>, float, 1, true>, 0
+        }
+};
 
     CV_Assert( !_map1.empty() );
     CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
@@ -1716,7 +1799,6 @@ void cv::remap( InputArray _src, OutputArray _dst,
     _dst.create( map1.size(), src.type() );
     Mat dst = _dst.getMat();
 
-
     CV_OVX_RUN(
         src.type() == CV_8UC1 && dst.type() == CV_8UC1 &&
         !ovx::skipSmallImages<VX_KERNEL_REMAP>(src.cols, src.rows) &&
@@ -1724,7 +1806,8 @@ void cv::remap( InputArray _src, OutputArray _dst,
         ((map1.type() == CV_32FC2 && map2.empty() && map1.size == dst.size) ||
          (map1.type() == CV_32FC1 && map2.type() == CV_32FC1 && map1.size == dst.size && map2.size == dst.size) ||
          (map1.empty() && map2.type() == CV_32FC2 && map2.size == dst.size)) &&
-        ((borderType & BORDER_ISOLATED) != 0 || !src.isSubmatrix()),
+        ((borderType & BORDER_ISOLATED) != 0 || !src.isSubmatrix()) &&
+        !hasRelativeFlag,
         openvx_remap(src, dst, map1, map2, interpolation, borderValue));
 
     CV_Assert( dst.cols < SHRT_MAX && dst.rows < SHRT_MAX && src.cols < SHRT_MAX && src.rows < SHRT_MAX );
@@ -1732,6 +1815,13 @@ void cv::remap( InputArray _src, OutputArray _dst,
     if( dst.data == src.data )
         src = src.clone();
 
+    if ((map1.type() == CV_32FC1) && (map2.type() == CV_32FC1))
+    {
+        CALL_HAL(remap32f, cv_hal_remap32f, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
+                 map1.ptr<float>(), map1.step, map2.ptr<float>(), map2.step, interpolation, borderType, borderValue.val);
+    }
+
+    interpolation &= ~WARP_RELATIVE_MAP;
     if( interpolation == INTER_AREA )
         interpolation = INTER_LINEAR;
 
@@ -1784,25 +1874,26 @@ void cv::remap( InputArray _src, OutputArray _dst,
     bool fixpt = depth == CV_8U;
     bool planar_input = false;
 
+    const int relativeOptionIndex = (hasRelativeFlag ? 1 : 0);
     if( interpolation == INTER_NEAREST )
     {
-        nnfunc = nn_tab[depth];
+        nnfunc = nn_tab[relativeOptionIndex][depth];
         CV_Assert( nnfunc != 0 );
     }
     else
     {
         if( interpolation == INTER_LINEAR )
-            ifunc = linear_tab[depth];
+            ifunc = linear_tab[relativeOptionIndex][depth];
         else if( interpolation == INTER_CUBIC ){
-            ifunc = cubic_tab[depth];
+            ifunc = cubic_tab[relativeOptionIndex][depth];
             CV_Assert( _src.channels() <= 4 );
         }
         else if( interpolation == INTER_LANCZOS4 ){
-            ifunc = lanczos4_tab[depth];
+            ifunc = lanczos4_tab[relativeOptionIndex][depth];
             CV_Assert( _src.channels() <= 4 );
         }
         else
-            CV_Error( CV_StsBadArg, "Unknown interpolation method" );
+            CV_Error( cv::Error::StsBadArg, "Unknown interpolation method" );
         CV_Assert( ifunc != 0 );
         ctab = initInterTab2D( interpolation, fixpt );
     }
@@ -1927,7 +2018,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 {
                     #if CV_SIMD128
                     {
-                        int span = v_int16x8::nlanes;
+                        int span = VTraits<v_int16x8>::vlanes();
                         for( ; x <= size.width - span; x += span )
                         {
                             v_int16x8 v_dst[2];
@@ -1959,21 +2050,21 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                         v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
                         v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
                         v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE);
-                        int span = v_float32x4::nlanes;
+                        int span = VTraits<v_float32x4>::vlanes();
                         for( ; x <= size.width - span * 2; x += span * 2 )
                         {
-                            v_int32x4 v_ix0 = v_round(v_scale * (v_load(src1f + x)));
-                            v_int32x4 v_ix1 = v_round(v_scale * (v_load(src1f + x + span)));
-                            v_int32x4 v_iy0 = v_round(v_scale * (v_load(src2f + x)));
-                            v_int32x4 v_iy1 = v_round(v_scale * (v_load(src2f + x + span)));
+                            v_int32x4 v_ix0 = v_round(v_mul(v_scale, v_load(src1f + x)));
+                            v_int32x4 v_ix1 = v_round(v_mul(v_scale, v_load(src1f + x + span)));
+                            v_int32x4 v_iy0 = v_round(v_mul(v_scale, v_load(src2f + x)));
+                            v_int32x4 v_iy1 = v_round(v_mul(v_scale, v_load(src2f + x + span)));
 
                             v_int16x8 v_dst[2];
                             v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1));
                             v_dst[1] = v_pack(v_shr<INTER_BITS>(v_iy0), v_shr<INTER_BITS>(v_iy1));
                             v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
 
-                            v_int32x4 v_dst0 = v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask));
-                            v_int32x4 v_dst1 = v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask));
+                            v_int32x4 v_dst0 = v_muladd(v_scale3, (v_and(v_iy0, v_mask)), (v_and(v_ix0, v_mask)));
+                            v_int32x4 v_dst1 = v_muladd(v_scale3, (v_and(v_iy1, v_mask)), (v_and(v_ix1, v_mask)));
                             v_store(dst2 + x, v_pack_u(v_dst0, v_dst1));
                         }
                     }
@@ -1994,7 +2085,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
             if( nninterpolate )
             {
                 #if CV_SIMD128
-                int span = v_float32x4::nlanes;
+                int span = VTraits<v_float32x4>::vlanes();
                 {
                     for( ; x <= (size.width << 1) - span * 2; x += span * 2 )
                         v_store(dst1 + x, v_pack(v_round(v_load(src1f + x)),
@@ -2020,16 +2111,16 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                         v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
                         v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
                         v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE);
-                        int span = v_uint16x8::nlanes;
+                        int span = VTraits<v_uint16x8>::vlanes();
                         for (; x <= size.width - span; x += span )
                         {
                             v_float32x4 v_src0[2], v_src1[2];
                             v_load_deinterleave(src1f + (x << 1), v_src0[0], v_src0[1]);
                             v_load_deinterleave(src1f + (x << 1) + span, v_src1[0], v_src1[1]);
-                            v_int32x4 v_ix0 = v_round(v_src0[0] * v_scale);
-                            v_int32x4 v_ix1 = v_round(v_src1[0] * v_scale);
-                            v_int32x4 v_iy0 = v_round(v_src0[1] * v_scale);
-                            v_int32x4 v_iy1 = v_round(v_src1[1] * v_scale);
+                            v_int32x4 v_ix0 = v_round(v_mul(v_src0[0], v_scale));
+                            v_int32x4 v_ix1 = v_round(v_mul(v_src1[0], v_scale));
+                            v_int32x4 v_iy0 = v_round(v_mul(v_src0[1], v_scale));
+                            v_int32x4 v_iy1 = v_round(v_mul(v_src1[1], v_scale));
 
                             v_int16x8 v_dst[2];
                             v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1));
@@ -2037,8 +2128,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                             v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]);
 
                             v_store(dst2 + x, v_pack_u(
-                                v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)),
-                                v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask))));
+                                v_muladd(v_scale3, (v_and(v_iy0, v_mask)), (v_and(v_ix0, v_mask))),
+                                v_muladd(v_scale3, (v_and(v_iy1, v_mask)), (v_and(v_ix1, v_mask)))));
                         }
                     }
                     #endif
@@ -2060,13 +2151,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 v_uint16x8 v_mask2 =  v_setall_u16(INTER_TAB_SIZE2-1);
                 v_uint32x4 v_zero =   v_setzero_u32(), v_mask = v_setall_u32(INTER_TAB_SIZE-1);
                 v_float32x4 v_scale = v_setall_f32(scale);
-                int span = v_float32x4::nlanes;
+                int span = VTraits<v_float32x4>::vlanes();
                 for( ; x <= size.width - span * 2; x += span * 2 )
                 {
                     v_uint32x4 v_fxy1, v_fxy2;
                     if ( src2 )
                     {
-                        v_uint16x8 v_src2 = v_load(src2 + x) & v_mask2;
+                        v_uint16x8 v_src2 = v_and(v_load(src2 + x), v_mask2);
                         v_expand(v_src2, v_fxy1, v_fxy2);
                     }
                     else
@@ -2077,9 +2168,9 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                     v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]);
                     v_expand(v_src[0], v_src0[0], v_src0[1]);
                     v_expand(v_src[1], v_src1[0], v_src1[1]);
-                    #define CV_COMPUTE_MAP_X(X, FXY)  v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) & v_mask)),\
+                    #define CV_COMPUTE_MAP_X(X, FXY)  v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32(v_and((FXY), v_mask))),\
                                                                         v_cvt_f32(v_reinterpret_as_s32(X)))
-                    #define CV_COMPUTE_MAP_Y(Y, FXY)  v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) >> INTER_BITS)),\
+                    #define CV_COMPUTE_MAP_Y(Y, FXY)  v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32(v_shr<INTER_BITS>((FXY)))),\
                                                                         v_cvt_f32(v_reinterpret_as_s32(Y)))
                     v_float32x4 v_dst1 = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1);
                     v_float32x4 v_dst2 = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1);
@@ -2109,13 +2200,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 v_int16x8 v_mask2 = v_setall_s16(INTER_TAB_SIZE2-1);
                 v_int32x4 v_zero = v_setzero_s32(), v_mask = v_setall_s32(INTER_TAB_SIZE-1);
                 v_float32x4 v_scale = v_setall_f32(scale);
-                int span = v_int16x8::nlanes;
+                int span = VTraits<v_int16x8>::vlanes();
                 for( ; x <= size.width - span; x += span )
                 {
                     v_int32x4 v_fxy1, v_fxy2;
                     if (src2)
                     {
-                        v_int16x8 v_src2 = v_load((short *)src2 + x) & v_mask2;
+                        v_int16x8 v_src2 = v_and(v_load((short *)src2 + x), v_mask2);
                         v_expand(v_src2, v_fxy1, v_fxy2);
                     }
                     else
@@ -2128,8 +2219,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                     v_expand(v_src[0], v_src0[0], v_src0[1]);
                     v_expand(v_src[1], v_src1[0], v_src1[1]);
 
-                    #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32((FXY) & v_mask), v_cvt_f32(X))
-                    #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32((FXY) >> INTER_BITS), v_cvt_f32(Y))
+                    #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_and((FXY), v_mask)), v_cvt_f32(X))
+                    #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_shr<INTER_BITS>((FXY))), v_cvt_f32(Y))
                     v_dst[0] = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1);
                     v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1);
                     v_store_interleave(dst1f + (x << 1), v_dst[0], v_dst[1]);
@@ -2150,7 +2241,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
             }
         }
         else
-            CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" );
+            CV_Error( cv::Error::StsNotImplemented, "Unsupported combination of input/output matrices" );
     }
 }
 
@@ -2220,12 +2311,12 @@ class WarpAffineInvoker :
                             #if CV_SIMD128
                             {
                                 v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
-                                int span = v_uint16x8::nlanes;
+                                int span = VTraits<v_uint16x8>::vlanes();
                                 for( ; x1 <= bw - span; x1 += span )
                                 {
                                     v_int16x8 v_dst[2];
-                                    #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(shift+v_load(ptr + offset)),\
-                                                                                    v_shr<AB_BITS>(shift+v_load(ptr + offset + 4)))
+                                    #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
+                                                                                    v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
                                     v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);
                                     v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);
                                     #undef CV_CONVERT_MAP
@@ -2258,21 +2349,21 @@ class WarpAffineInvoker :
                         {
                             v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
                             v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
-                            int span = v_float32x4::nlanes;
+                            int span = VTraits<v_float32x4>::vlanes();
                             for( ; x1 <= bw - span * 2; x1 += span * 2 )
                             {
-                                v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1));
-                                v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1));
-                                v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1 + span));
-                                v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1 + span));
+                                v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1)));
+                                v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1)));
+                                v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__X0, v_load(this->adelta + x + x1 + span)));
+                                v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v_add(v__Y0, v_load(this->bdelta + x + x1 + span)));
 
                                 v_int16x8 v_xy[2];
                                 v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
                                 v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
                                 v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
 
-                                v_int32x4 v_alpha0 = v_shl<INTER_BITS>(v_Y0 & v_mask) | (v_X0 & v_mask);
-                                v_int32x4 v_alpha1 = v_shl<INTER_BITS>(v_Y1 & v_mask) | (v_X1 & v_mask);
+                                v_int32x4 v_alpha0 = v_or(v_shl<INTER_BITS>(v_and(v_Y0, v_mask)), v_and(v_X0, v_mask));
+                                v_int32x4 v_alpha1 = v_or(v_shl<INTER_BITS>(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask));
                                 v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
                             }
                         }
@@ -2480,7 +2571,7 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
     String opts;
     if (interpolation == INTER_NEAREST)
     {
-        opts = format("-D INTER_NEAREST -D T=%s%s -D CT=%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d",
+        opts = format("-D INTER_NEAREST -D T=%s%s -D CT=%s -D T1=%s -D ST=%s -D CN=%d -D ROWS_PER_WI=%d",
                       ocl::typeToStr(type),
                       doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                       useDouble ? "double" : "float",
@@ -2490,8 +2581,8 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
     else
     {
         char cvt[2][50];
-        opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d"
-                      " -D convertToWT=%s -D convertToT=%s%s -D CT=%s -D cn=%d -D rowsPerWI=%d",
+        opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D SRC_DEPTH=%d"
+                      " -D CONVERT_TO_WT=%s -D CONVERT_TO_T=%s%s -D CT=%s -D CN=%d -D ROWS_PER_WI=%d",
                       interpolationMap[interpolation], ocl::typeToStr(type),
                       ocl::typeToStr(CV_MAT_DEPTH(type)),
                       ocl::typeToStr(sctype),
@@ -2665,8 +2756,13 @@ static bool ipp_warpAffine( InputArray _src, OutputArray _dst, int interpolation
     }
 
     return true;
+#else
+    CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(interpolation);
+    CV_UNUSED(borderType); CV_UNUSED(_M); CV_UNUSED(flags);
+    return false;
 #endif
 }
+
 #endif
 
 namespace hal {
@@ -2847,16 +2943,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0
         v_int32x4 v_X0, v_Y0;
         {
             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
-            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
+            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
-            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
+            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_X0 = v_round(v_fX0, v_fX1);
             v_Y0 = v_round(v_fY0, v_fY1);
@@ -2866,16 +2962,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0
         v_int32x4 v_X1, v_Y1;
         {
             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
-            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
+            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
-            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
+            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_X1 = v_round(v_fX0, v_fX1);
             v_Y1 = v_round(v_fY0, v_fY1);
@@ -2885,16 +2981,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0
         v_int32x4 v_X2, v_Y2;
         {
             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
-            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
+            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
-            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
+            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_X2 = v_round(v_fX0, v_fX1);
             v_Y2 = v_round(v_fY0, v_fY1);
@@ -2904,16 +3000,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0
         v_int32x4 v_X3, v_Y3;
         {
             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
-            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
+            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero);
-            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero);
+            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_X3 = v_round(v_fX0, v_fX1);
             v_Y3 = v_round(v_fY0, v_fY1);
@@ -2968,16 +3064,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph
         v_int32x4 v_X0, v_Y0;
         {
             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
-            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
+            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
-            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
+            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_X0 = v_round(v_fX0, v_fX1);
             v_Y0 = v_round(v_fY0, v_fY1);
@@ -2987,16 +3083,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph
         v_int32x4 v_X1, v_Y1;
         {
             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
-            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
+            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
-            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
+            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_X1 = v_round(v_fX0, v_fX1);
             v_Y1 = v_round(v_fY0, v_fY1);
@@ -3006,16 +3102,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph
         v_int32x4 v_X2, v_Y2;
         {
             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
-            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
+            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
-            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
+            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_X2 = v_round(v_fX0, v_fX1);
             v_Y2 = v_round(v_fY0, v_fY1);
@@ -3025,35 +3121,35 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph
         v_int32x4 v_X3, v_Y3;
         {
             v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
-            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
+            v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_W = v_muladd(v_M6, v_x1, v_W0);
-            v_W = v_select(v_W != v_zero, v_its / v_W, v_zero);
-            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W));
-            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W));
-            v_x1 += v_2;
+            v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero);
+            v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W)));
+            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
 
             v_X3 = v_round(v_fX0, v_fX1);
             v_Y3 = v_round(v_fY0, v_fY1);
         }
 
         // store alpha
-        v_int32x4 v_alpha0 = ((v_Y0 & v_itsi1) << INTER_BITS) + (v_X0 & v_itsi1);
-        v_int32x4 v_alpha1 = ((v_Y1 & v_itsi1) << INTER_BITS) + (v_X1 & v_itsi1);
+        v_int32x4 v_alpha0 = v_add(v_shl<INTER_BITS>(v_and(v_Y0, v_itsi1)), v_and(v_X0, v_itsi1));
+        v_int32x4 v_alpha1 = v_add(v_shl<INTER_BITS>(v_and(v_Y1, v_itsi1)), v_and(v_X1, v_itsi1));
         v_store((alpha + x1), v_pack(v_alpha0, v_alpha1));
 
-        v_alpha0 = ((v_Y2 & v_itsi1) << INTER_BITS) + (v_X2 & v_itsi1);
-        v_alpha1 = ((v_Y3 & v_itsi1) << INTER_BITS) + (v_X3 & v_itsi1);
+        v_alpha0 = v_add(v_shl<INTER_BITS>(v_and(v_Y2, v_itsi1)), v_and(v_X2, v_itsi1));
+        v_alpha1 = v_add(v_shl<INTER_BITS>(v_and(v_Y3, v_itsi1)), v_and(v_X3, v_itsi1));
         v_store((alpha + x1 + 8), v_pack(v_alpha0, v_alpha1));
 
         // convert to 16s
-        v_X0 = v_reinterpret_as_s32(v_pack(v_X0 >> INTER_BITS, v_X1 >> INTER_BITS));
-        v_X1 = v_reinterpret_as_s32(v_pack(v_X2 >> INTER_BITS, v_X3 >> INTER_BITS));
-        v_Y0 = v_reinterpret_as_s32(v_pack(v_Y0 >> INTER_BITS, v_Y1 >> INTER_BITS));
-        v_Y1 = v_reinterpret_as_s32(v_pack(v_Y2 >> INTER_BITS, v_Y3 >> INTER_BITS));
+        v_X0 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1)));
+        v_X1 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_X2), v_shr<INTER_BITS>(v_X3)));
+        v_Y0 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1)));
+        v_Y1 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_Y2), v_shr<INTER_BITS>(v_Y3)));
 
         v_store_interleave(xy + x1 * 2, (v_reinterpret_as_s16)(v_X0), (v_reinterpret_as_s16)(v_Y0));
         v_store_interleave(xy + x1 * 2 + 16, (v_reinterpret_as_s16)(v_X1), (v_reinterpret_as_s16)(v_Y1));
@@ -3519,7 +3615,7 @@ void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
         iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
     }
     else
-        CV_Error( CV_StsUnsupportedFormat, "" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "" );
 }
 
 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst, int solveMethod)
@@ -3544,7 +3640,7 @@ cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
     cv::Mat matrix = cv::cvarrToMat(marr);
     CV_Assert( src.type() == dst.type() );
     cv::warpAffine( src, dst, matrix, dst.size(), flags,
-        (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
+        (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
         fillval );
 }
 
@@ -3556,7 +3652,7 @@ cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
     cv::Mat matrix = cv::cvarrToMat(marr);
     CV_Assert( src.type() == dst.type() );
     cv::warpPerspective( src, dst, matrix, dst.size(), flags,
-        (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
+        (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
         fillval );
 }
 
@@ -3569,7 +3665,7 @@ cvRemap( const CvArr* srcarr, CvArr* dstarr,
     cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
     CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
     cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
-        (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
+        (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
         fillval );
     CV_Assert( dst0.data == dst.data );
 }
@@ -3653,7 +3749,7 @@ void cv::warpPolar(InputArray _src, OutputArray _dst, Size dsize,
     mapy.create(dsize, CV_32F);
     bool semiLog = (flags & WARP_POLAR_LOG) != 0;
 
-    if (!(flags & CV_WARP_INVERSE_MAP))
+    if (!(flags & cv::WARP_INVERSE_MAP))
     {
         CV_Assert(!dsize.empty());
         double Kangle = CV_2PI / dsize.height;
@@ -3693,7 +3789,7 @@ void cv::warpPolar(InputArray _src, OutputArray _dst, Size dsize,
                 my[rho] = (float)y;
             }
         }
-        remap(_src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
+        remap(_src, _dst, mapx, mapy, flags & cv::INTER_MAX, (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
     }
     else
     {
@@ -3746,7 +3842,7 @@ void cv::warpPolar(InputArray _src, OutputArray _dst, Size dsize,
             }
         }
         remap(src, _dst, mapx, mapy, flags & cv::INTER_MAX,
-              (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
+              (flags & cv::WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT);
     }
 }
 
diff --git a/modules/imgproc/src/intelligent_scissors.cpp b/modules/imgproc/src/intelligent_scissors.cpp
index 6e2dfc32885a..200757589120 100644
--- a/modules/imgproc/src/intelligent_scissors.cpp
+++ b/modules/imgproc/src/intelligent_scissors.cpp
@@ -90,9 +90,9 @@ struct IntelligentScissorsMB::Impl
     int laplacianKernelSize = 3;  // 1 or 3
 
     // image features
-    Mat_<Point2f> gradient_direction;  //< I: normalized laplacian x/y components
-    Mat_<float> gradient_magnitude;  //< Fg: gradient cost function
-    Mat_<uchar> non_edge_feature;  //< Fz: zero-crossing function
+    Mat_<Point2f> gradient_direction;  ///< I: normalized laplacian x/y components
+    Mat_<float> gradient_magnitude;  ///< Fg: gradient cost function
+    Mat_<uchar> non_edge_feature;  ///< Fz: zero-crossing function
 
     float weight_non_edge_compute = 0.0f;
 
diff --git a/modules/imgproc/src/linefit.cpp b/modules/imgproc/src/linefit.cpp
index cd205c60ab71..67a3affccd2f 100644
--- a/modules/imgproc/src/linefit.cpp
+++ b/modules/imgproc/src/linefit.cpp
@@ -358,7 +358,7 @@ static void fitLine2D( const Point2f * points, int count, int dist,
      calc_weights = (void ( * )(float *, int, float *)) _PFP.fp;
      break;*/
     default:
-        CV_Error(CV_StsBadArg, "Unknown distance type");
+        CV_Error(cv::Error::StsBadArg, "Unknown distance type");
     }
 
     AutoBuffer<float> wr(count*2);
@@ -499,7 +499,7 @@ static void fitLine3D( Point3f * points, int count, int dist,
         break;
 
     default:
-        CV_Error(CV_StsBadArg, "Unknown distance");
+        CV_Error(cv::Error::StsBadArg, "Unknown distance");
     }
 
     AutoBuffer<float> buf(count*2);
diff --git a/modules/imgproc/src/lsd.cpp b/modules/imgproc/src/lsd.cpp
index 8d26a016ab15..3b5b412e251b 100644
--- a/modules/imgproc/src/lsd.cpp
+++ b/modules/imgproc/src/lsd.cpp
@@ -214,7 +214,7 @@ class LineSegmentDetectorImpl CV_FINAL : public LineSegmentDetector
 /**
  * Draw lines on the given canvas.
  *
- * @param image     The image, where lines will be drawn.
+ * @param _image    The image, where lines will be drawn.
  *                  Should have the size of the image, where the lines were found
  * @param lines     The lines that need to be drawn
  */
@@ -226,7 +226,7 @@ class LineSegmentDetectorImpl CV_FINAL : public LineSegmentDetector
  * @param size      The size of the image, where lines1 and lines2 were found.
  * @param lines1    The first lines that need to be drawn. Color - Blue.
  * @param lines2    The second lines that need to be drawn. Color - Red.
- * @param image     An optional image, where lines will be drawn.
+ * @param _image    An optional image, where lines will be drawn.
  *                  Should have the size of the image, where the lines were found
  * @return          The number of mismatching pixels between lines1 and lines2.
  */
@@ -308,8 +308,6 @@ class LineSegmentDetectorImpl CV_FINAL : public LineSegmentDetector
  *
  * @param threshold      The minimum value of the angle that is considered defined, otherwise NOTDEF
  * @param n_bins         The number of bins with which gradients are ordered by, using bucket sort.
- * @param ordered_points Return: Vector of coordinate points that are pseudo ordered by magnitude.
- *                       Pixels would be ordered by norm value, up to a precision given by max_grad/n_bins.
  */
     void ll_angle(const double& threshold, const unsigned int& n_bins);
 
diff --git a/modules/imgproc/src/matchcontours.cpp b/modules/imgproc/src/matchcontours.cpp
index e676bf98049b..f278ddd3664e 100644
--- a/modules/imgproc/src/matchcontours.cpp
+++ b/modules/imgproc/src/matchcontours.cpp
@@ -158,7 +158,7 @@ double cv::matchShapes(InputArray contour1, InputArray contour2, int method, dou
         }
         break;
     default:
-        CV_Error( CV_StsBadArg, "Unknown comparison method" );
+        CV_Error( cv::Error::StsBadArg, "Unknown comparison method" );
     }
 
     //If anyA and anyB are both true, the result is correct.
diff --git a/modules/imgproc/src/median_blur.simd.hpp b/modules/imgproc/src/median_blur.simd.hpp
index 90f0b2033021..1069d8abab41 100644
--- a/modules/imgproc/src/median_blur.simd.hpp
+++ b/modules/imgproc/src/median_blur.simd.hpp
@@ -179,10 +179,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                 for (k = 0; k < 16; ++k)
                 {
 #if CV_SIMD256
-                    v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H.fine[k]));
+                    v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v_add(v256_setall_u16(2 * r + 1), v256_load(H.fine[k]))));
 #elif CV_SIMD128
-                    v_store(H.fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k]));
-                    v_store(H.fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k] + 8));
+                    v_store(H.fine[k], v_add(v_mul_wrap(v_load(h_fine + 16 * n * (16 * c + k)), v_setall_u16((ushort)(2 * r + 1))), v_load(H.fine[k])));
+                    v_store(H.fine[k] + 8, v_add(v_mul_wrap(v_load(h_fine + 16 * n * (16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))), v_load(H.fine[k] + 8)));
 #else
                     for (int ind = 0; ind < 16; ++ind)
                         H.fine[k][ind] = (HT)(H.fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]);
@@ -199,10 +199,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                 for( j = 0; j < 2*r; ++j, px += 16 )
                 {
 #if CV_SIMD256
-                    v_coarse += v256_load(px);
+                    v_coarse = v_add(v_coarse, v256_load(px));
 #elif CV_SIMD128
-                    v_coarsel += v_load(px);
-                    v_coarseh += v_load(px + 8);
+                    v_coarsel = v_add(v_coarsel, v_load(px));
+                    v_coarseh = v_add(v_coarseh, v_load(px + 8));
 #else
                     for (int ind = 0; ind < 16; ++ind)
                         H.coarse[ind] += px[ind];
@@ -216,11 +216,11 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
 
                     px = h_coarse + 16 * (n*c + std::min(j + r, n - 1));
 #if CV_SIMD256
-                    v_coarse += v256_load(px);
+                    v_coarse = v_add(v_coarse, v256_load(px));
                     v_store(H.coarse, v_coarse);
 #elif CV_SIMD128
-                    v_coarsel += v_load(px);
-                    v_coarseh += v_load(px + 8);
+                    v_coarsel = v_add(v_coarsel, v_load(px));
+                    v_coarseh = v_add(v_coarseh, v_load(px + 8));
                     v_store(H.coarse, v_coarsel);
                     v_store(H.coarse + 8, v_coarseh);
 #else
@@ -261,10 +261,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                         for (luc[k] = HT(j - r); luc[k] < MIN(j + r + 1, n); ++luc[k], px += 16)
                         {
 #if CV_SIMD256
-                            v_fine += v256_load(px);
+                            v_fine = v_add(v_fine, v256_load(px));
 #elif CV_SIMD128
-                            v_finel += v_load(px);
-                            v_fineh += v_load(px + 8);
+                            v_finel = v_add(v_finel, v_load(px));
+                            v_fineh = v_add(v_fineh, v_load(px + 8));
 #else
                             for (int ind = 0; ind < 16; ++ind)
                                 H.fine[k][ind] += px[ind];
@@ -275,10 +275,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                         {
                             px = h_fine + 16 * (n*(16 * c + k) + (n - 1));
 #if CV_SIMD256
-                            v_fine += v_mul_wrap(v256_load(px), v256_setall_u16(j + r + 1 - n));
+                            v_fine = v_add(v_fine, v_mul_wrap(v256_load(px), v256_setall_u16(j + r + 1 - n)));
 #elif CV_SIMD128
-                            v_finel += v_mul_wrap(v_load(px), v_setall_u16((ushort)(j + r + 1 - n)));
-                            v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n)));
+                            v_finel = v_add(v_finel, v_mul_wrap(v_load(px), v_setall_u16((ushort)(j + r + 1 - n))));
+                            v_fineh = v_add(v_fineh, v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n))));
 #else
                             for (int ind = 0; ind < 16; ++ind)
                                 H.fine[k][ind] = (HT)(H.fine[k][ind] + (j + r + 1 - n) * px[ind]);
@@ -298,10 +298,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                         for ( ; luc[k] < j+r+1; ++luc[k] )
                         {
 #if CV_SIMD256
-                            v_fine = v_fine + v256_load(px + 16 * MIN(luc[k], n - 1)) - v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
+                            v_fine = v_sub(v_add(v_fine, v256_load(px + 16 * MIN(luc[k], n - 1))), v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)));
 #elif CV_SIMD128
-                            v_finel = v_finel + v_load(px + 16 * MIN(luc[k], n - 1)    ) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
-                            v_fineh = v_fineh + v_load(px + 16 * MIN(luc[k], n - 1) + 8) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8);
+                            v_finel = v_sub(v_add(v_finel, v_load(px + 16 * MIN(luc[k], n - 1)    )), v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)));
+                            v_fineh = v_sub(v_add(v_fineh, v_load(px + 16 * MIN(luc[k], n - 1) + 8)), v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8));
 #else
                             for (int ind = 0; ind < 16; ++ind)
                                 H.fine[k][ind] += px[16 * MIN(luc[k], n - 1) + ind] - px[16 * MAX(luc[k] - 2 * r - 1, 0) + ind];
@@ -312,12 +312,12 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                     px = h_coarse + 16 * (n*c + MAX(j - r, 0));
 #if CV_SIMD256
                     v_store(H.fine[k], v_fine);
-                    v_coarse -= v256_load(px);
+                    v_coarse = v_sub(v_coarse, v256_load(px));
 #elif CV_SIMD128
                     v_store(H.fine[k], v_finel);
                     v_store(H.fine[k] + 8, v_fineh);
-                    v_coarsel -= v_load(px);
-                    v_coarseh -= v_load(px + 8);
+                    v_coarsel = v_sub(v_coarsel, v_load(px));
+                    v_coarseh = v_sub(v_coarseh, v_load(px + 8));
 #else
                     for (int ind = 0; ind < 16; ++ind)
                         H.coarse[ind] -= px[ind];
@@ -548,7 +548,7 @@ struct MinMax32f
     }
 };
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 struct MinMaxVec8u
 {
@@ -688,7 +688,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
                 if( limit == size.width )
                     break;
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
 #else
                 int nlanes = 1;
@@ -793,7 +793,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
                 if( limit == size.width )
                     break;
 
-#if CV_SIMD || CV_SIMD_SCALABLE
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 int nlanes = VTraits<typename VecOp::arg_type>::vlanes();
 #else
                 int nlanes = 1;
@@ -867,7 +867,7 @@ void medianBlur(const Mat& src0, /*const*/ Mat& dst, int ksize)
         else if( src.depth() == CV_32F )
             medianBlur_SortNet<MinMax32f, MinMaxVec32f>( src, dst, ksize );
         else
-            CV_Error(CV_StsUnsupportedFormat, "");
+            CV_Error(cv::Error::StsUnsupportedFormat, "");
 
         return;
     }
diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp
index 204c8654af7a..786573bbcb16 100644
--- a/modules/imgproc/src/moments.cpp
+++ b/modules/imgproc/src/moments.cpp
@@ -236,12 +236,12 @@ struct MomentsInTile_SIMD<uchar, int, int>
                 v_int16x8 p = v_reinterpret_as_s16(v_load_expand(ptr + x));
                 v_int16x8 sx = v_mul_wrap(qx, qx);
 
-                qx0 += v_reinterpret_as_u32(p);
+                qx0 = v_add(qx0, v_reinterpret_as_u32(p));
                 qx1 = v_reinterpret_as_u32(v_dotprod(p, qx, v_reinterpret_as_s32(qx1)));
                 qx2 = v_reinterpret_as_u32(v_dotprod(p, sx, v_reinterpret_as_s32(qx2)));
                 qx3 = v_reinterpret_as_u32(v_dotprod(v_mul_wrap(p, qx), sx, v_reinterpret_as_s32(qx3)));
 
-                qx += dx;
+                qx = v_add(qx, dx);
             }
 
             x0 = v_reduce_sum(qx0);
@@ -276,19 +276,19 @@ struct MomentsInTile_SIMD<ushort, int, int64>
             {
                 v_int32x4 v_src = v_reinterpret_as_s32(v_load_expand(ptr + x));
 
-                v_x0 += v_reinterpret_as_u32(v_src);
-                v_x1 += v_reinterpret_as_u32(v_src * v_ix0);
+                v_x0 = v_add(v_x0, v_reinterpret_as_u32(v_src));
+                v_x1 = v_add(v_x1, v_reinterpret_as_u32(v_mul(v_src, v_ix0)));
 
-                v_int32x4 v_ix1 = v_ix0 * v_ix0;
-                v_x2 += v_reinterpret_as_u32(v_src * v_ix1);
+                v_int32x4 v_ix1 = v_mul(v_ix0, v_ix0);
+                v_x2 = v_add(v_x2, v_reinterpret_as_u32(v_mul(v_src, v_ix1)));
 
-                v_ix1 = v_ix0 * v_ix1;
-                v_src = v_src * v_ix1;
+                v_ix1 = v_mul(v_ix0, v_ix1);
+                v_src = v_mul(v_src, v_ix1);
                 v_uint64x2 v_lo, v_hi;
                 v_expand(v_reinterpret_as_u32(v_src), v_lo, v_hi);
-                v_x3 += v_lo + v_hi;
+                v_x3 = v_add(v_x3, v_add(v_lo, v_hi));
 
-                v_ix0 += v_delta;
+                v_ix0 = v_add(v_ix0, v_delta);
             }
 
             x0 = v_reduce_sum(v_x0);
@@ -561,6 +561,37 @@ static bool ipp_moments(Mat &src, Moments &m )
 
 }
 
+namespace cv { namespace hal {
+
+static int moments(const cv::Mat& src, bool binary, cv::Moments& m)
+{
+    CV_INSTRUMENT_REGION();
+
+    double m_data[10];
+    int status = 0;
+    int type = src.type();
+    int depth = CV_MAT_DEPTH(type);
+
+    if( src.checkVector(2) >= 0 && (depth == CV_32F || depth == CV_32S))
+        status = cv_hal_polygonMoments(src.data, src.total()/2, src.type(), m_data);
+    else
+        status = cv_hal_imageMoments(src.data, src.step, src.type(), src.cols, src.rows, binary, m_data);
+
+    if (status == CV_HAL_ERROR_OK)
+    {
+        m = cv::Moments(m_data[0], m_data[1], m_data[2], m_data[3], m_data[4],
+                        m_data[5], m_data[6], m_data[7], m_data[8], m_data[9]);
+    }
+    else if (status != CV_HAL_ERROR_NOT_IMPLEMENTED)
+    {
+        CV_Error_(cv::Error::StsInternal,
+            ("HAL implementation moments ==> " CVAUX_STR(cv_hal_imageMoments) " returned %d (0x%08x)", status, status));
+    }
+
+    return status;
+}
+}}
+
 cv::Moments cv::moments( InputArray _src, bool binary )
 {
     CV_INSTRUMENT_REGION();
@@ -580,11 +611,15 @@ cv::Moments cv::moments( InputArray _src, bool binary )
 #endif
 
     Mat mat = _src.getMat();
+
+    if (hal::moments(mat, binary, m) == CV_HAL_ERROR_OK)
+        return m;
+
     if( mat.checkVector(2) >= 0 && (depth == CV_32F || depth == CV_32S))
         return contourMoments(mat);
 
     if( cn > 1 )
-        CV_Error( CV_StsBadArg, "Invalid image type (must be single-channel)" );
+        CV_Error( cv::Error::StsBadArg, "Invalid image type (must be single-channel)" );
 
     CV_IPP_RUN(!binary, ipp_moments(mat, m), m);
 
@@ -599,7 +634,7 @@ cv::Moments cv::moments( InputArray _src, bool binary )
     else if( depth == CV_64F )
         func = momentsInTile<double, double, double>;
     else
-        CV_Error( CV_StsUnsupportedFormat, "" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "" );
 
     Mat src0(mat);
 
@@ -616,7 +651,7 @@ cv::Moments cv::moments( InputArray _src, bool binary )
             if( binary )
             {
                 cv::Mat tmp(tileSize, CV_8U, nzbuf);
-                cv::compare( src, 0, tmp, CV_CMP_NE );
+                cv::compare( src, 0, tmp, cv::CMP_NE );
                 src = tmp;
             }
 
@@ -730,9 +765,9 @@ CV_IMPL double cvGetSpatialMoment( CvMoments * moments, int x_order, int y_order
     int order = x_order + y_order;
 
     if( !moments )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
     if( (x_order | y_order) < 0 || order > 3 )
-        CV_Error( CV_StsOutOfRange, "" );
+        CV_Error( cv::Error::StsOutOfRange, "" );
 
     return (&(moments->m00))[order + (order >> 1) + (order > 2) * 2 + y_order];
 }
@@ -743,9 +778,9 @@ CV_IMPL double cvGetCentralMoment( CvMoments * moments, int x_order, int y_order
     int order = x_order + y_order;
 
     if( !moments )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
     if( (x_order | y_order) < 0 || order > 3 )
-        CV_Error( CV_StsOutOfRange, "" );
+        CV_Error( cv::Error::StsOutOfRange, "" );
 
     return order >= 2 ? (&(moments->m00))[4 + order * 3 + y_order] :
     order == 0 ? moments->m00 : 0;
@@ -768,7 +803,7 @@ CV_IMPL double cvGetNormalizedCentralMoment( CvMoments * moments, int x_order, i
 CV_IMPL void cvGetHuMoments( CvMoments * mState, CvHuMoments * HuState )
 {
     if( !mState || !HuState )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     double m00s = mState->inv_sqrt_m00, m00 = m00s * m00s, s2 = m00 * m00, s3 = s2 * m00s;
 
diff --git a/modules/imgproc/src/morph.dispatch.cpp b/modules/imgproc/src/morph.dispatch.cpp
index a0857d1266c5..0cb50ec36895 100644
--- a/modules/imgproc/src/morph.dispatch.cpp
+++ b/modules/imgproc/src/morph.dispatch.cpp
@@ -1076,7 +1076,7 @@ static bool ocl_morphologyEx(InputArray _src, OutputArray _dst, int op,
             return false;
         break;
     default:
-        CV_Error( CV_StsBadArg, "unknown morphological operation" );
+        CV_Error( cv::Error::StsBadArg, "unknown morphological operation" );
     }
 
     return true;
@@ -1249,7 +1249,7 @@ void morphologyEx( InputArray _src, OutputArray _dst, int op,
         }
         break;
     default:
-        CV_Error( CV_StsBadArg, "unknown morphological operation" );
+        CV_Error( cv::Error::StsBadArg, "unknown morphological operation" );
     }
 }
 
@@ -1296,7 +1296,7 @@ CV_IMPL void
 cvReleaseStructuringElement( IplConvKernel ** element )
 {
     if( !element )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
     cvFree( element );
 }
 
diff --git a/modules/imgproc/src/morph.simd.hpp b/modules/imgproc/src/morph.simd.hpp
index 9b3023f8f022..e93f829daa41 100644
--- a/modules/imgproc/src/morph.simd.hpp
+++ b/modules/imgproc/src/morph.simd.hpp
@@ -106,12 +106,12 @@ struct MorphNoVec
     int operator()(uchar**, int, uchar*, int) const { return 0; }
 };
 
-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
 
 template<class VecUpdate> struct MorphRowVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar* src, uchar* dst, int width, int cn) const
     {
@@ -121,52 +121,52 @@ template<class VecUpdate> struct MorphRowVec
         width *= cn;
         VecUpdate updateOp;
 
-        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
+        for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes() )
         {
             vtype s0 = vx_load((const stype*)src + i);
-            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
-            vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes);
-            vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes);
+            vtype s1 = vx_load((const stype*)src + i + VTraits<vtype>::vlanes());
+            vtype s2 = vx_load((const stype*)src + i + 2*VTraits<vtype>::vlanes());
+            vtype s3 = vx_load((const stype*)src + i + 3*VTraits<vtype>::vlanes());
             for (k = cn; k < _ksize; k += cn)
             {
                 s0 = updateOp(s0, vx_load((const stype*)src + i + k));
-                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
-                s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes));
-                s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + VTraits<vtype>::vlanes()));
+                s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*VTraits<vtype>::vlanes()));
+                s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*VTraits<vtype>::vlanes()));
             }
             v_store((stype*)dst + i, s0);
-            v_store((stype*)dst + i + vtype::nlanes, s1);
-            v_store((stype*)dst + i + 2*vtype::nlanes, s2);
-            v_store((stype*)dst + i + 3*vtype::nlanes, s3);
+            v_store((stype*)dst + i + VTraits<vtype>::vlanes(), s1);
+            v_store((stype*)dst + i + 2*VTraits<vtype>::vlanes(), s2);
+            v_store((stype*)dst + i + 3*VTraits<vtype>::vlanes(), s3);
         }
-        if( i <= width - 2*vtype::nlanes )
+        if( i <= width - 2*VTraits<vtype>::vlanes() )
         {
             vtype s0 = vx_load((const stype*)src + i);
-            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
+            vtype s1 = vx_load((const stype*)src + i + VTraits<vtype>::vlanes());
             for( k = cn; k < _ksize; k += cn )
             {
                 s0 = updateOp(s0, vx_load((const stype*)src + i + k));
-                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + VTraits<vtype>::vlanes()));
             }
             v_store((stype*)dst + i, s0);
-            v_store((stype*)dst + i + vtype::nlanes, s1);
-            i += 2*vtype::nlanes;
+            v_store((stype*)dst + i + VTraits<vtype>::vlanes(), s1);
+            i += 2*VTraits<vtype>::vlanes();
         }
-        if( i <= width - vtype::nlanes )
+        if( i <= width - VTraits<vtype>::vlanes() )
         {
             vtype s = vx_load((const stype*)src + i);
             for( k = cn; k < _ksize; k += cn )
                 s = updateOp(s, vx_load((const stype*)src + i + k));
             v_store((stype*)dst + i, s);
-            i += vtype::nlanes;
+            i += VTraits<vtype>::vlanes();
         }
-        if( i <= width - vtype::nlanes/2 )
+        if( i <= width - VTraits<vtype>::vlanes()/2 )
         {
             vtype s = vx_load_low((const stype*)src + i);
             for( k = cn; k < _ksize; k += cn )
                 s = updateOp(s, vx_load_low((const stype*)src + i + k));
             v_store_low((stype*)dst + i, s);
-            i += vtype::nlanes/2;
+            i += VTraits<vtype>::vlanes()/2;
         }
 
         return i - i % cn;
@@ -179,7 +179,7 @@ template<class VecUpdate> struct MorphRowVec
 template<class VecUpdate> struct MorphColumnVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
     {
@@ -189,7 +189,7 @@ template<class VecUpdate> struct MorphColumnVec
         VecUpdate updateOp;
 
         for( i = 0; i < count + ksize - 1; i++ )
-            CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 );
+            CV_Assert( ((size_t)_src[i] & (VTraits<v_uint8>::vlanes()-1)) == 0 );
 
         const stype** src = (const stype**)_src;
         stype* dst = (stype*)_dst;
@@ -197,58 +197,58 @@ template<class VecUpdate> struct MorphColumnVec
 
         for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
         {
-            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
+            for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes())
             {
                 const stype* sptr = src[1] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
-                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
-                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
+                vtype s2 = vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes());
+                vtype s3 = vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes());
 
                 for( k = 2; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
-                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
-                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes()));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes()));
                 }
 
                 sptr = src[0] + i;
                 v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-                v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
-                v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
+                v_store(dst + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+                v_store(dst + i + 2*VTraits<vtype>::vlanes(), updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes())));
+                v_store(dst + i + 3*VTraits<vtype>::vlanes(), updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes())));
 
                 sptr = src[k] + i;
                 v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-                v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
-                v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
+                v_store(dst + dststep + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+                v_store(dst + dststep + i + 2*VTraits<vtype>::vlanes(), updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes())));
+                v_store(dst + dststep + i + 3*VTraits<vtype>::vlanes(), updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes())));
             }
-            if( i <= width - 2*vtype::nlanes )
+            if( i <= width - 2*VTraits<vtype>::vlanes() )
             {
                 const stype* sptr = src[1] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
 
                 for( k = 2; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
                 }
 
                 sptr = src[0] + i;
                 v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                v_store(dst + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
 
                 sptr = src[k] + i;
                 v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-                i += 2*vtype::nlanes;
+                v_store(dst + dststep + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+                i += 2*VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes )
+            if( i <= width - VTraits<vtype>::vlanes() )
             {
                 vtype s0 = vx_load_aligned(src[1] + i);
 
@@ -257,9 +257,9 @@ template<class VecUpdate> struct MorphColumnVec
 
                 v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i)));
                 v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i)));
-                i += vtype::nlanes;
+                i += VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes/2 )
+            if( i <= width - VTraits<vtype>::vlanes()/2 )
             {
                 vtype s0 = vx_load_low(src[1] + i);
 
@@ -268,66 +268,66 @@ template<class VecUpdate> struct MorphColumnVec
 
                 v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i)));
                 v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i)));
-                i += vtype::nlanes/2;
+                i += VTraits<vtype>::vlanes()/2;
             }
         }
 
         for( ; count > 0; count--, dst += dststep, src++ )
         {
-            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
+            for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes())
             {
                 const stype* sptr = src[0] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
-                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
-                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
+                vtype s2 = vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes());
+                vtype s3 = vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes());
 
                 for( k = 1; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
-                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
-                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes()));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes()));
                 }
                 v_store(dst + i, s0);
-                v_store(dst + i + vtype::nlanes, s1);
-                v_store(dst + i + 2*vtype::nlanes, s2);
-                v_store(dst + i + 3*vtype::nlanes, s3);
+                v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+                v_store(dst + i + 2*VTraits<vtype>::vlanes(), s2);
+                v_store(dst + i + 3*VTraits<vtype>::vlanes(), s3);
             }
-            if( i <= width - 2*vtype::nlanes )
+            if( i <= width - 2*VTraits<vtype>::vlanes() )
             {
                 const stype* sptr = src[0] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
 
                 for( k = 1; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
                 }
                 v_store(dst + i, s0);
-                v_store(dst + i + vtype::nlanes, s1);
-                i += 2*vtype::nlanes;
+                v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+                i += 2*VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes )
+            if( i <= width - VTraits<vtype>::vlanes() )
             {
                 vtype s0 = vx_load_aligned(src[0] + i);
 
                 for( k = 1; k < _ksize; k++ )
                     s0 = updateOp(s0, vx_load_aligned(src[k] + i));
                 v_store(dst + i, s0);
-                i += vtype::nlanes;
+                i += VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes/2 )
+            if( i <= width - VTraits<vtype>::vlanes()/2 )
             {
                 vtype s0 = vx_load_low(src[0] + i);
 
                 for( k = 1; k < _ksize; k++ )
                     s0 = updateOp(s0, vx_load_low(src[k] + i));
                 v_store_low(dst + i, s0);
-                i += vtype::nlanes/2;
+                i += VTraits<vtype>::vlanes()/2;
             }
         }
 
@@ -341,7 +341,7 @@ template<class VecUpdate> struct MorphColumnVec
 template<class VecUpdate> struct MorphVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     int operator()(uchar** _src, int nz, uchar* _dst, int width) const
     {
         CV_INSTRUMENT_REGION();
@@ -351,56 +351,56 @@ template<class VecUpdate> struct MorphVec
         int i, k;
         VecUpdate updateOp;
 
-        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
+        for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes() )
         {
             const stype* sptr = src[0] + i;
             vtype s0 = vx_load(sptr);
-            vtype s1 = vx_load(sptr + vtype::nlanes);
-            vtype s2 = vx_load(sptr + 2*vtype::nlanes);
-            vtype s3 = vx_load(sptr + 3*vtype::nlanes);
+            vtype s1 = vx_load(sptr + VTraits<vtype>::vlanes());
+            vtype s2 = vx_load(sptr + 2*VTraits<vtype>::vlanes());
+            vtype s3 = vx_load(sptr + 3*VTraits<vtype>::vlanes());
             for( k = 1; k < nz; k++ )
             {
                 sptr = src[k] + i;
                 s0 = updateOp(s0, vx_load(sptr));
-                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
-                s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes));
-                s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes));
+                s1 = updateOp(s1, vx_load(sptr + VTraits<vtype>::vlanes()));
+                s2 = updateOp(s2, vx_load(sptr + 2*VTraits<vtype>::vlanes()));
+                s3 = updateOp(s3, vx_load(sptr + 3*VTraits<vtype>::vlanes()));
             }
             v_store(dst + i, s0);
-            v_store(dst + i + vtype::nlanes, s1);
-            v_store(dst + i + 2*vtype::nlanes, s2);
-            v_store(dst + i + 3*vtype::nlanes, s3);
+            v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+            v_store(dst + i + 2*VTraits<vtype>::vlanes(), s2);
+            v_store(dst + i + 3*VTraits<vtype>::vlanes(), s3);
         }
-        if( i <= width - 2*vtype::nlanes )
+        if( i <= width - 2*VTraits<vtype>::vlanes() )
         {
             const stype* sptr = src[0] + i;
             vtype s0 = vx_load(sptr);
-            vtype s1 = vx_load(sptr + vtype::nlanes);
+            vtype s1 = vx_load(sptr + VTraits<vtype>::vlanes());
             for( k = 1; k < nz; k++ )
             {
                 sptr = src[k] + i;
                 s0 = updateOp(s0, vx_load(sptr));
-                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
+                s1 = updateOp(s1, vx_load(sptr + VTraits<vtype>::vlanes()));
             }
             v_store(dst + i, s0);
-            v_store(dst + i + vtype::nlanes, s1);
-            i += 2*vtype::nlanes;
+            v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+            i += 2*VTraits<vtype>::vlanes();
         }
-        if( i <= width - vtype::nlanes )
+        if( i <= width - VTraits<vtype>::vlanes() )
         {
             vtype s0 = vx_load(src[0] + i);
             for( k = 1; k < nz; k++ )
                 s0 = updateOp(s0, vx_load(src[k] + i));
             v_store(dst + i, s0);
-            i += vtype::nlanes;
+            i += VTraits<vtype>::vlanes();
         }
-        if( i <= width - vtype::nlanes/2 )
+        if( i <= width - VTraits<vtype>::vlanes()/2 )
         {
             vtype s0 = vx_load_low(src[0] + i);
             for( k = 1; k < nz; k++ )
                 s0 = updateOp(s0, vx_load_low(src[k] + i));
             v_store_low(dst + i, s0);
-            i += vtype::nlanes/2;
+            i += VTraits<vtype>::vlanes()/2;
         }
         return i;
     }
@@ -753,7 +753,7 @@ Ptr<BaseRowFilter> getMorphologyRowFilter(int op, int type, int ksize, int ancho
                                       DilateRowVec64f> >(ksize, anchor);
     }
 
-    CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type));
+    CV_Error_( cv::Error::StsNotImplemented, ("Unsupported data type (=%d)", type));
 }
 
 Ptr<BaseColumnFilter> getMorphologyColumnFilter(int op, int type, int ksize, int anchor)
@@ -801,7 +801,7 @@ Ptr<BaseColumnFilter> getMorphologyColumnFilter(int op, int type, int ksize, int
                                          DilateColumnVec64f> >(ksize, anchor);
     }
 
-    CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type));
+    CV_Error_( cv::Error::StsNotImplemented, ("Unsupported data type (=%d)", type));
 }
 
 Ptr<BaseFilter> getMorphologyFilter(int op, int type, const Mat& kernel, Point anchor)
@@ -838,7 +838,7 @@ Ptr<BaseFilter> getMorphologyFilter(int op, int type, const Mat& kernel, Point a
             return makePtr<MorphFilter<MaxOp<double>, DilateVec64f> >(kernel, anchor);
     }
 
-    CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type));
+    CV_Error_( cv::Error::StsNotImplemented, ("Unsupported data type (=%d)", type));
 }
 
 #endif
diff --git a/modules/imgproc/src/opencl/color_hsv.cl b/modules/imgproc/src/opencl/color_hsv.cl
index eb883bdc96b0..8eec8edd1c76 100644
--- a/modules/imgproc/src/opencl/color_hsv.cl
+++ b/modules/imgproc/src/opencl/color_hsv.cl
@@ -46,21 +46,21 @@
 
 /**************************************PUBLICFUNC*************************************/
 
-#if depth == 0
+#if SRC_DEPTH == 0
     #define DATA_TYPE uchar
     #define MAX_NUM  255
     #define HALF_MAX_NUM 128
     #define COEFF_TYPE int
     #define SAT_CAST(num) convert_uchar_sat(num)
     #define DEPTH_0
-#elif depth == 2
+#elif SRC_DEPTH == 2
     #define DATA_TYPE ushort
     #define MAX_NUM  65535
     #define HALF_MAX_NUM 32768
     #define COEFF_TYPE int
     #define SAT_CAST(num) convert_ushort_sat(num)
     #define DEPTH_2
-#elif depth == 5
+#elif SRC_DEPTH == 5
     #define DATA_TYPE float
     #define MAX_NUM  1.0f
     #define HALF_MAX_NUM 0.5f
@@ -78,18 +78,18 @@ enum
     hsv_shift  = 12
 };
 
-#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
-#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
+#define scnbytes ((int)sizeof(DATA_TYPE)*SCN)
+#define dcnbytes ((int)sizeof(DATA_TYPE)*DCN)
 
-#ifndef hscale
-#define hscale 0
+#ifndef HSCALE
+#define HSCALE 0
 #endif
 
-#ifndef hrange
-#define hrange 0
+#ifndef HRANGE
+#define HRANGE 0
 #endif
 
-#if bidx == 0
+#if BIDX == 0
 #define R_COMP z
 #define G_COMP y
 #define B_COMP x
@@ -148,7 +148,7 @@ __kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
                 h = (vr & (g - b)) +
                     (~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g))));
                 h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;
-                h += h < 0 ? hrange : 0;
+                h += h < 0 ? HRANGE : 0;
 
                 dst[dst_index] = convert_uchar_sat_rte(h);
                 dst[dst_index + 1] = (uchar)s;
@@ -188,7 +188,7 @@ __kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
                 {
                     float tab[4];
                     int sector;
-                    h *= hscale;
+                    h *= HSCALE;
                     if( h < 0 )
                         do h += 6; while( h < 0 );
                     else if( h >= 6 )
@@ -213,10 +213,10 @@ __kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
                 else
                     b = g = r = v;
 
-                dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
+                dst[dst_index + BIDX] = convert_uchar_sat_rte(b*255.f);
                 dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
-                dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
-#if dcn == 4
+                dst[dst_index + (BIDX^2)] = convert_uchar_sat_rte(r*255.f);
+#if DCN == 4
                 dst[dst_index + 3] = MAX_NUM;
 #endif
 
@@ -275,7 +275,7 @@ __kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset
                 if( h < 0 )
                     h += 360.f;
 
-                dst[0] = h*hscale;
+                dst[0] = h*HSCALE;
                 dst[1] = s;
                 dst[2] = v;
 
@@ -316,7 +316,7 @@ __kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset
                 {
                     float tab[4];
                     int sector;
-                    h *= hscale;
+                    h *= HSCALE;
                     if(h < 0)
                         do h += 6; while (h < 0);
                     else if (h >= 6)
@@ -341,10 +341,10 @@ __kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset
                 else
                     b = g = r = v;
 
-                dst[bidx] = b;
+                dst[BIDX] = b;
                 dst[1] = g;
-                dst[bidx^2] = r;
-#if dcn == 4
+                dst[BIDX^2] = r;
+#if DCN == 4
                 dst[3] = MAX_NUM;
 #endif
 
@@ -410,7 +410,7 @@ __kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
                         h += 360.f;
                 }
 
-                dst[dst_index] = convert_uchar_sat_rte(h*hscale);
+                dst[dst_index] = convert_uchar_sat_rte(h*HSCALE);
                 dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f);
                 dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f);
 
@@ -451,7 +451,7 @@ __kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
                     float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                     float p1 = 2*l - p2;
 
-                    h *= hscale;
+                    h *= HSCALE;
                     if( h < 0 )
                         do h += 6; while( h < 0 );
                     else if( h >= 6 )
@@ -472,10 +472,10 @@ __kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
                 else
                     b = g = r = l;
 
-                dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
+                dst[dst_index + BIDX] = convert_uchar_sat_rte(b*255.f);
                 dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
-                dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
-#if dcn == 4
+                dst[dst_index + (BIDX^2)] = convert_uchar_sat_rte(r*255.f);
+#if DCN == 4
                 dst[dst_index + 3] = MAX_NUM;
 #endif
 
@@ -538,7 +538,7 @@ __kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset
                     if( h < 0.f ) h += 360.f;
                 }
 
-                dst[0] = h*hscale;
+                dst[0] = h*HSCALE;
                 dst[1] = l;
                 dst[2] = s;
 
@@ -582,7 +582,7 @@ __kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset
                     float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                     float p1 = 2*l - p2;
 
-                    h *= hscale;
+                    h *= HSCALE;
                     if( h < 0 )
                         do h += 6; while( h < 0 );
                     else if( h >= 6 )
@@ -603,10 +603,10 @@ __kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset
                 else
                     b = g = r = l;
 
-                dst[bidx] = b;
+                dst[BIDX] = b;
                 dst[1] = g;
-                dst[bidx^2] = r;
-#if dcn == 4
+                dst[BIDX^2] = r;
+#if DCN == 4
                 dst[3] = MAX_NUM;
 #endif
 
diff --git a/modules/imgproc/src/opencl/color_lab.cl b/modules/imgproc/src/opencl/color_lab.cl
index 16a96d25e702..1be74a2466ba 100644
--- a/modules/imgproc/src/opencl/color_lab.cl
+++ b/modules/imgproc/src/opencl/color_lab.cl
@@ -44,21 +44,21 @@
 //
 //M*/
 
-#if depth == 0
+#if SRC_DEPTH == 0
     #define DATA_TYPE uchar
     #define MAX_NUM  255
     #define HALF_MAX_NUM 128
     #define COEFF_TYPE int
     #define SAT_CAST(num) convert_uchar_sat(num)
     #define DEPTH_0
-#elif depth == 2
+#elif SRC_DEPTH == 2
     #define DATA_TYPE ushort
     #define MAX_NUM  65535
     #define HALF_MAX_NUM 32768
     #define COEFF_TYPE int
     #define SAT_CAST(num) convert_ushort_sat(num)
     #define DEPTH_2
-#elif depth == 5
+#elif SRC_DEPTH == 5
     #define DATA_TYPE float
     #define MAX_NUM  1.0f
     #define HALF_MAX_NUM 0.5f
@@ -76,8 +76,8 @@ enum
     xyz_shift  = 12,
 };
 
-#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
-#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
+#define scnbytes ((int)sizeof(DATA_TYPE)*SCN)
+#define dcnbytes ((int)sizeof(DATA_TYPE)*DCN)
 
 #define __CAT(x, y) x##y
 #define CAT(x, y) __CAT(x, y)
@@ -167,11 +167,11 @@ __kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offse
                 DATA_TYPE dst0 = SAT_CAST(b);
                 DATA_TYPE dst1 = SAT_CAST(g);
                 DATA_TYPE dst2 = SAT_CAST(r);
-#if dcn == 3 || defined DEPTH_5
+#if DCN == 3 || defined DEPTH_5
                 dst[0] = dst0;
                 dst[1] = dst1;
                 dst[2] = dst2;
-#if dcn == 4
+#if DCN == 4
                 dst[3] = MAX_NUM;
 #endif
 #else
@@ -403,7 +403,7 @@ __kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
 #endif
                     coeffs, lThresh, fThresh);
 
-#if dcn == 3
+#if DCN == 3
                 dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
                 dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
                 dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
@@ -455,7 +455,7 @@ __kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offse
                     coeffs, lThresh, fThresh);
 
                 dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
-#if dcn == 4
+#if DCN == 4
                 dst[3] = MAX_NUM;
 #endif
                 ++y;
@@ -644,7 +644,7 @@ __kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offse
                 dst[0] = R;
                 dst[1] = G;
                 dst[2] = B;
-#if dcn == 4
+#if DCN == 4
                 dst[3] = MAX_NUM;
 #endif
                 ++y;
@@ -717,7 +717,7 @@ __kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,
                 uchar dst1 = SAT_CAST(G * 255.0f);
                 uchar dst2 = SAT_CAST(B * 255.0f);
 
-#if dcn == 4
+#if DCN == 4
                 *(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM);
 #else
                 dst[0] = dst0;
diff --git a/modules/imgproc/src/opencl/color_rgb.cl b/modules/imgproc/src/opencl/color_rgb.cl
index dd4563e111e6..d5cffe6dcdd6 100644
--- a/modules/imgproc/src/opencl/color_rgb.cl
+++ b/modules/imgproc/src/opencl/color_rgb.cl
@@ -46,21 +46,21 @@
 
 /**************************************PUBLICFUNC*************************************/
 
-#if depth == 0
+#if SRC_DEPTH == 0
     #define DATA_TYPE uchar
     #define MAX_NUM  255
     #define HALF_MAX_NUM 128
     #define COEFF_TYPE int
     #define SAT_CAST(num) convert_uchar_sat(num)
     #define DEPTH_0
-#elif depth == 2
+#elif SRC_DEPTH == 2
     #define DATA_TYPE ushort
     #define MAX_NUM  65535
     #define HALF_MAX_NUM 32768
     #define COEFF_TYPE int
     #define SAT_CAST(num) convert_ushort_sat(num)
     #define DEPTH_2
-#elif depth == 5
+#elif SRC_DEPTH == 5
     #define DATA_TYPE float
     #define MAX_NUM  1.0f
     #define HALF_MAX_NUM 0.5f
@@ -86,10 +86,10 @@ enum
 #define G2YF 0.587f
 #define R2YF 0.299f
 
-#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
-#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
+#define scnbytes ((int)sizeof(DATA_TYPE)*SCN)
+#define dcnbytes ((int)sizeof(DATA_TYPE)*DCN)
 
-#if bidx == 0
+#if BIDX == 0
 #define R_COMP z
 #define G_COMP y
 #define B_COMP x
@@ -160,9 +160,9 @@ __kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offs
                 __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
                 __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
                 DATA_TYPE val = src[0];
-#if dcn == 3 || defined DEPTH_5
+#if DCN == 3 || defined DEPTH_5
                 dst[0] = dst[1] = dst[2] = val;
-#if dcn == 4
+#if DCN == 4
                 dst[3] = MAX_NUM;
 #endif
 #else
@@ -197,7 +197,7 @@ __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
             {
                 __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
                 __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
-#if scn == 3
+#if SCN == 3
                 DATA_TYPE_3 src_pix = vload3(0, src);
 #else
                 DATA_TYPE_4 src_pix = vload4(0, src);
@@ -213,8 +213,8 @@ __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
                 dst[2] = src_pix.z;
 #endif
 
-#if dcn == 4
-#if scn == 3
+#if DCN == 4
+#if SCN == 3
                 dst[3] = MAX_NUM;
 #else
                 dst[3] = src[3];
@@ -250,18 +250,18 @@ __kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset
             {
                 ushort t = *((__global const ushort*)(src + src_index));
 
-#if greenbits == 6
-                dst[dst_index + bidx] = (uchar)(t << 3);
+#if GREENBITS == 6
+                dst[dst_index + BIDX] = (uchar)(t << 3);
                 dst[dst_index + 1] = (uchar)((t >> 3) & ~3);
-                dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);
+                dst[dst_index + (BIDX^2)] = (uchar)((t >> 8) & ~7);
 #else
-                dst[dst_index + bidx] = (uchar)(t << 3);
+                dst[dst_index + BIDX] = (uchar)(t << 3);
                 dst[dst_index + 1] = (uchar)((t >> 2) & ~7);
-                dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);
+                dst[dst_index + (BIDX^2)] = (uchar)((t >> 7) & ~7);
 #endif
 
-#if dcn == 4
-#if greenbits == 6
+#if DCN == 4
+#if GREENBITS == 6
                 dst[dst_index + 3] = 255;
 #else
                 dst[dst_index + 3] = t & 0x8000 ? 255 : 0;
@@ -295,9 +295,9 @@ __kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset
             {
                 uchar4 src_pix = vload4(0, src + src_index);
 
-#if greenbits == 6
+#if GREENBITS == 6
                     *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
-#elif scn == 3
+#elif SCN == 3
                     *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
 #else
                     *((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
@@ -333,7 +333,7 @@ __kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offse
             {
                 int t = *((__global const ushort*)(src + src_index));
 
-#if greenbits == 6
+#if GREENBITS == 6
                 dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, BY15, mad24((t >> 3) & 0xfc, GY15, ((t >> 8) & 0xf8) * RY15)), gray_shift);
 #else
                 dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, BY15, mad24((t >> 2) & 0xf8, GY15, ((t >> 7) & 0xf8) * RY15)), gray_shift);
@@ -365,7 +365,7 @@ __kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offse
             {
                 int t = src[src_index];
 
-#if greenbits == 6
+#if GREENBITS == 6
                 *((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
 #else
                 t >>= 3;
diff --git a/modules/imgproc/src/opencl/color_yuv.cl b/modules/imgproc/src/opencl/color_yuv.cl
index bf75a1c5b8fd..ebef91743f16 100644
--- a/modules/imgproc/src/opencl/color_yuv.cl
+++ b/modules/imgproc/src/opencl/color_yuv.cl
@@ -46,21 +46,21 @@
 
 /**************************************PUBLICFUNC*************************************/
 
-#if depth == 0
+#if SRC_DEPTH == 0
     #define DATA_TYPE uchar
     #define MAX_NUM  255
     #define HALF_MAX_NUM 128
     #define COEFF_TYPE int
     #define SAT_CAST(num) convert_uchar_sat(num)
     #define DEPTH_0
-#elif depth == 2
+#elif SRC_DEPTH == 2
     #define DATA_TYPE ushort
     #define MAX_NUM  65535
     #define HALF_MAX_NUM 32768
     #define COEFF_TYPE int
     #define SAT_CAST(num) convert_ushort_sat(num)
     #define DEPTH_2
-#elif depth == 5
+#elif SRC_DEPTH == 5
     #define DATA_TYPE float
     #define MAX_NUM  1.0f
     #define HALF_MAX_NUM 0.5f
@@ -114,10 +114,10 @@ enum
 #define CR2GI -11698
 #define CB2BI 29049
 
-#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
-#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
+#define scnbytes ((int)sizeof(DATA_TYPE)*SCN)
+#define dcnbytes ((int)sizeof(DATA_TYPE)*DCN)
 
-#if bidx == 0
+#if BIDX == 0
 #define R_COMP z
 #define G_COMP y
 #define B_COMP x
@@ -127,12 +127,12 @@ enum
 #define B_COMP z
 #endif
 
-#ifndef uidx
-#define uidx 0
+#ifndef UIDX
+#define UIDX 0
 #endif
 
-#ifndef yidx
-#define yidx 0
+#ifndef YIDX
+#define YIDX 0
 #endif
 
 #ifndef PIX_PER_WI_X
@@ -234,10 +234,10 @@ __kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset
                 const int b = Y + CV_DESCALE(mul24(U - HALF_MAX_NUM, coeffs[0]), yuv_shift);
 #endif
 
-                dst[bidx] = SAT_CAST( b );
+                dst[BIDX] = SAT_CAST( b );
                 dst[1] = SAT_CAST( g );
-                dst[bidx^2] = SAT_CAST( r );
-#if dcn == 4
+                dst[BIDX^2] = SAT_CAST( r );
+#if DCN == 4
                 dst[3] = MAX_NUM;
 #endif
                 ++y;
@@ -266,7 +266,7 @@ __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_of
             {
                 __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
                 __global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset);
-                __global uchar*       dst1 = dstptr + mad24(y << 1, dst_step, mad24(x, dcn<<1, dt_offset));
+                __global uchar*       dst1 = dstptr + mad24(y << 1, dst_step, mad24(x, DCN<<1, dt_offset));
                 __global uchar*       dst2 = dst1 + dst_step;
 
                 float Y1 = ysrc[0];
@@ -274,8 +274,8 @@ __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_of
                 float Y3 = ysrc[src_step];
                 float Y4 = ysrc[src_step + 1];
 
-                float U  = ((float)usrc[uidx]) - HALF_MAX_NUM;
-                float V  = ((float)usrc[1-uidx]) - HALF_MAX_NUM;
+                float U  = ((float)usrc[UIDX]) - HALF_MAX_NUM;
+                float V  = ((float)usrc[1-UIDX]) - HALF_MAX_NUM;
 
                 __constant float* coeffs = c_YUV2RGBCoeffs_420;
                 float ruv = fma(coeffs[4], V, 0.5f);
@@ -283,34 +283,34 @@ __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_of
                 float buv = fma(coeffs[1], U, 0.5f);
 
                 Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
-                dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
+                dst1[2 - BIDX] = convert_uchar_sat(Y1 + ruv);
                 dst1[1]        = convert_uchar_sat(Y1 + guv);
-                dst1[bidx]     = convert_uchar_sat(Y1 + buv);
-#if dcn == 4
+                dst1[BIDX]     = convert_uchar_sat(Y1 + buv);
+#if DCN == 4
                 dst1[3]        = 255;
 #endif
 
                 Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
-                dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
-                dst1[dcn + 1]        = convert_uchar_sat(Y2 + guv);
-                dst1[dcn + bidx]     = convert_uchar_sat(Y2 + buv);
-#if dcn == 4
+                dst1[DCN + 2 - BIDX] = convert_uchar_sat(Y2 + ruv);
+                dst1[DCN + 1]        = convert_uchar_sat(Y2 + guv);
+                dst1[DCN + BIDX]     = convert_uchar_sat(Y2 + buv);
+#if DCN == 4
                 dst1[7]        = 255;
 #endif
 
                 Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
-                dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
+                dst2[2 - BIDX] = convert_uchar_sat(Y3 + ruv);
                 dst2[1]        = convert_uchar_sat(Y3 + guv);
-                dst2[bidx]     = convert_uchar_sat(Y3 + buv);
-#if dcn == 4
+                dst2[BIDX]     = convert_uchar_sat(Y3 + buv);
+#if DCN == 4
                 dst2[3]        = 255;
 #endif
 
                 Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
-                dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
-                dst2[dcn + 1]        = convert_uchar_sat(Y4 + guv);
-                dst2[dcn + bidx]     = convert_uchar_sat(Y4 + buv);
-#if dcn == 4
+                dst2[DCN + 2 - BIDX] = convert_uchar_sat(Y4 + ruv);
+                dst2[DCN + 1]        = convert_uchar_sat(Y4 + guv);
+                dst2[DCN + BIDX]     = convert_uchar_sat(Y4 + buv);
+#if DCN == 4
                 dst2[7]        = 255;
 #endif
             }
@@ -319,7 +319,7 @@ __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_of
     }
 }
 
-#if uidx < 2
+#if UIDX < 2
 
 __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
                                 __global uchar* dstptr, int dst_step, int dt_offset,
@@ -336,7 +336,7 @@ __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int
             if (y < rows / 2 )
             {
                 __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
-                __global uchar*       dst1 = dstptr + mad24(y << 1, dst_step, x * (dcn<<1) + dt_offset);
+                __global uchar*       dst1 = dstptr + mad24(y << 1, dst_step, x * (DCN<<1) + dt_offset);
                 __global uchar*       dst2 = dst1 + dst_step;
 
                 float Y1 = ysrc[0];
@@ -354,8 +354,8 @@ __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int
                 __global const uchar* vsrc = usrc + mad24(rows >> 2, src_step, rows % 4 ? vsteps[y%2] : 0);
                 float uv[2] = { ((float)usrc[0]) - HALF_MAX_NUM, ((float)vsrc[0]) - HALF_MAX_NUM };
 #endif
-                float U = uv[uidx];
-                float V = uv[1-uidx];
+                float U = uv[UIDX];
+                float V = uv[1-UIDX];
 
                 __constant float* coeffs = c_YUV2RGBCoeffs_420;
                 float ruv = fma(coeffs[4], V, 0.5f);
@@ -363,34 +363,34 @@ __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int
                 float buv = fma(coeffs[1], U, 0.5f);
 
                 Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
-                dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
+                dst1[2 - BIDX] = convert_uchar_sat(Y1 + ruv);
                 dst1[1]        = convert_uchar_sat(Y1 + guv);
-                dst1[bidx]     = convert_uchar_sat(Y1 + buv);
-#if dcn == 4
+                dst1[BIDX]     = convert_uchar_sat(Y1 + buv);
+#if DCN == 4
                 dst1[3]        = 255;
 #endif
 
                 Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
-                dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
-                dst1[dcn + 1]        = convert_uchar_sat(Y2 + guv);
-                dst1[dcn + bidx]     = convert_uchar_sat(Y2 + buv);
-#if dcn == 4
+                dst1[DCN + 2 - BIDX] = convert_uchar_sat(Y2 + ruv);
+                dst1[DCN + 1]        = convert_uchar_sat(Y2 + guv);
+                dst1[DCN + BIDX]     = convert_uchar_sat(Y2 + buv);
+#if DCN == 4
                 dst1[7]        = 255;
 #endif
 
                 Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
-                dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
+                dst2[2 - BIDX] = convert_uchar_sat(Y3 + ruv);
                 dst2[1]        = convert_uchar_sat(Y3 + guv);
-                dst2[bidx]     = convert_uchar_sat(Y3 + buv);
-#if dcn == 4
+                dst2[BIDX]     = convert_uchar_sat(Y3 + buv);
+#if DCN == 4
                 dst2[3]        = 255;
 #endif
 
                 Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
-                dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
-                dst2[dcn + 1]        = convert_uchar_sat(Y4 + guv);
-                dst2[dcn + bidx]     = convert_uchar_sat(Y4 + buv);
-#if dcn == 4
+                dst2[DCN + 2 - BIDX] = convert_uchar_sat(Y4 + ruv);
+                dst2[DCN + 1]        = convert_uchar_sat(Y4 + guv);
+                dst2[DCN + BIDX]     = convert_uchar_sat(Y4 + buv);
+#if DCN == 4
                 dst2[7]        = 255;
 #endif
             }
@@ -401,7 +401,7 @@ __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int
 
 #endif
 
-#if uidx < 2
+#if UIDX < 2
 
 __constant float c_RGB2YUVCoeffs_420[8] = { 0.256999969f, 0.50399971f, 0.09799957f, -0.1479988098f, -0.2909994125f,
                                             0.438999176f, -0.3679990768f, -0.0709991455f };
@@ -415,7 +415,7 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int
 
     if (x < cols/2)
     {
-        int src_index  = mad24(y << 1, src_step, mad24(x << 1, scn, src_offset));
+        int src_index  = mad24(y << 1, src_step, mad24(x << 1, SCN, src_offset));
         int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset);
         int y_rows = rows / 3 * 2;
         int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)};
@@ -438,56 +438,56 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int
                 int s11 = *((__global const int*) src1);
                 int s12 = *((__global const int*) src1 + 1);
                 int s13 = *((__global const int*) src1 + 2);
-#if scn == 4
+#if SCN == 4
                 int s14 = *((__global const int*) src1 + 3);
 #endif
                 int s21 = *((__global const int*) src2);
                 int s22 = *((__global const int*) src2 + 1);
                 int s23 = *((__global const int*) src2 + 2);
-#if scn == 4
+#if SCN == 4
                 int s24 = *((__global const int*) src2 + 3);
 #endif
-                float src_pix1[scn * 4], src_pix2[scn * 4];
+                float src_pix1[SCN * 4], src_pix2[SCN * 4];
 
                 *((float4*) src_pix1)     = convert_float4(as_uchar4(s11));
                 *((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12));
                 *((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13));
-#if scn == 4
+#if SCN == 4
                 *((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14));
 #endif
                 *((float4*) src_pix2)     = convert_float4(as_uchar4(s21));
                 *((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22));
                 *((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23));
-#if scn == 4
+#if SCN == 4
                 *((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24));
 #endif
                 uchar4 y1, y2;
-                y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[      2-bidx], fma(coeffs[1], src_pix1[      1], fma(coeffs[2], src_pix1[      bidx], 16.5f))));
-                y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[  scn+2-bidx], fma(coeffs[1], src_pix1[  scn+1], fma(coeffs[2], src_pix1[  scn+bidx], 16.5f))));
-                y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f))));
-                y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f))));
-                y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[      2-bidx], fma(coeffs[1], src_pix2[      1], fma(coeffs[2], src_pix2[      bidx], 16.5f))));
-                y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[  scn+2-bidx], fma(coeffs[1], src_pix2[  scn+1], fma(coeffs[2], src_pix2[  scn+bidx], 16.5f))));
-                y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f))));
-                y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f))));
+                y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[      2-BIDX], fma(coeffs[1], src_pix1[      1], fma(coeffs[2], src_pix1[      BIDX], 16.5f))));
+                y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[  SCN+2-BIDX], fma(coeffs[1], src_pix1[  SCN+1], fma(coeffs[2], src_pix1[  SCN+BIDX], 16.5f))));
+                y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*SCN+2-BIDX], fma(coeffs[1], src_pix1[2*SCN+1], fma(coeffs[2], src_pix1[2*SCN+BIDX], 16.5f))));
+                y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*SCN+2-BIDX], fma(coeffs[1], src_pix1[3*SCN+1], fma(coeffs[2], src_pix1[3*SCN+BIDX], 16.5f))));
+                y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[      2-BIDX], fma(coeffs[1], src_pix2[      1], fma(coeffs[2], src_pix2[      BIDX], 16.5f))));
+                y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[  SCN+2-BIDX], fma(coeffs[1], src_pix2[  SCN+1], fma(coeffs[2], src_pix2[  SCN+BIDX], 16.5f))));
+                y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*SCN+2-BIDX], fma(coeffs[1], src_pix2[2*SCN+1], fma(coeffs[2], src_pix2[2*SCN+BIDX], 16.5f))));
+                y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*SCN+2-BIDX], fma(coeffs[1], src_pix2[3*SCN+1], fma(coeffs[2], src_pix2[3*SCN+BIDX], 16.5f))));
 
                 *((__global int*) ydst1) = as_int(y1);
                 *((__global int*) ydst2) = as_int(y2);
 
-                float uv[4] = { fma(coeffs[3], src_pix1[      2-bidx], fma(coeffs[4], src_pix1[      1], fma(coeffs[5], src_pix1[      bidx], 128.5f))),
-                                fma(coeffs[5], src_pix1[      2-bidx], fma(coeffs[6], src_pix1[      1], fma(coeffs[7], src_pix1[      bidx], 128.5f))),
-                                fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))),
-                                fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) };
+                float uv[4] = { fma(coeffs[3], src_pix1[      2-BIDX], fma(coeffs[4], src_pix1[      1], fma(coeffs[5], src_pix1[      BIDX], 128.5f))),
+                                fma(coeffs[5], src_pix1[      2-BIDX], fma(coeffs[6], src_pix1[      1], fma(coeffs[7], src_pix1[      BIDX], 128.5f))),
+                                fma(coeffs[3], src_pix1[2*SCN+2-BIDX], fma(coeffs[4], src_pix1[2*SCN+1], fma(coeffs[5], src_pix1[2*SCN+BIDX], 128.5f))),
+                                fma(coeffs[5], src_pix1[2*SCN+2-BIDX], fma(coeffs[6], src_pix1[2*SCN+1], fma(coeffs[7], src_pix1[2*SCN+BIDX], 128.5f))) };
 
-                udst[0] = convert_uchar_sat(uv[uidx]    );
-                vdst[0] = convert_uchar_sat(uv[1 - uidx]);
-                udst[1] = convert_uchar_sat(uv[2 + uidx]);
-                vdst[1] = convert_uchar_sat(uv[3 - uidx]);
+                udst[0] = convert_uchar_sat(uv[UIDX]    );
+                vdst[0] = convert_uchar_sat(uv[1 - UIDX]);
+                udst[1] = convert_uchar_sat(uv[2 + UIDX]);
+                vdst[1] = convert_uchar_sat(uv[3 - UIDX]);
 #else
                 float4 src_pix1 = convert_float4(vload4(0, src1));
-                float4 src_pix2 = convert_float4(vload4(0, src1+scn));
+                float4 src_pix2 = convert_float4(vload4(0, src1+SCN));
                 float4 src_pix3 = convert_float4(vload4(0, src2));
-                float4 src_pix4 = convert_float4(vload4(0, src2+scn));
+                float4 src_pix4 = convert_float4(vload4(0, src2+SCN));
 
                 ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f))));
                 ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f))));
@@ -497,8 +497,8 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int
                 float uv[2] = { fma(coeffs[3], src_pix1.R_COMP, fma(coeffs[4], src_pix1.G_COMP, fma(coeffs[5], src_pix1.B_COMP, 128.5f))),
                                 fma(coeffs[5], src_pix1.R_COMP, fma(coeffs[6], src_pix1.G_COMP, fma(coeffs[7], src_pix1.B_COMP, 128.5f))) };
 
-                udst[0] = convert_uchar_sat(uv[uidx]  );
-                vdst[0] = convert_uchar_sat(uv[1-uidx]);
+                udst[0] = convert_uchar_sat(uv[UIDX]  );
+                vdst[0] = convert_uchar_sat(uv[1-UIDX]);
 #endif
                 ++y;
                 src_index += 2*src_step;
@@ -520,7 +520,7 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of
     if (x < cols / 2)
     {
         __global const uchar* src = srcptr + mad24(y, src_step, (x << 2) + src_offset);
-        __global uchar*       dst = dstptr + mad24(y, dst_step, mad24(x << 1, dcn, dst_offset));
+        __global uchar*       dst = dstptr + mad24(y, dst_step, mad24(x << 1, DCN, dst_offset));
 
         #pragma unroll
         for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
@@ -530,34 +530,34 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of
                 __constant float* coeffs = c_YUV2RGBCoeffs_420;
 
 #ifndef USE_OPTIMIZED_LOAD
-                float U = ((float) src[uidx]) - HALF_MAX_NUM;
-                float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX_NUM;
-                float y00 = max(0.f, ((float) src[yidx]) - 16.f) * coeffs[0];
-                float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];
+                float U = ((float) src[UIDX]) - HALF_MAX_NUM;
+                float V = ((float) src[(2 + UIDX) % 4]) - HALF_MAX_NUM;
+                float y00 = max(0.f, ((float) src[YIDX]) - 16.f) * coeffs[0];
+                float y01 = max(0.f, ((float) src[YIDX + 2]) - 16.f) * coeffs[0];
 #else
                 int load_src = *((__global int*) src);
                 float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};
-                float U = vec_src[uidx] - HALF_MAX_NUM;
-                float V = vec_src[(2 + uidx) % 4] - HALF_MAX_NUM;
-                float y00 = max(0.f, vec_src[yidx] - 16.f) * coeffs[0];
-                float y01 = max(0.f, vec_src[yidx + 2] - 16.f) * coeffs[0];
+                float U = vec_src[UIDX] - HALF_MAX_NUM;
+                float V = vec_src[(2 + UIDX) % 4] - HALF_MAX_NUM;
+                float y00 = max(0.f, vec_src[YIDX] - 16.f) * coeffs[0];
+                float y01 = max(0.f, vec_src[YIDX + 2] - 16.f) * coeffs[0];
 #endif
 
                 float ruv = fma(coeffs[4], V, 0.5f);
                 float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
                 float buv = fma(coeffs[1], U, 0.5f);
 
-                dst[2 - bidx] = convert_uchar_sat(y00 + ruv);
+                dst[2 - BIDX] = convert_uchar_sat(y00 + ruv);
                 dst[1]        = convert_uchar_sat(y00 + guv);
-                dst[bidx]     = convert_uchar_sat(y00 + buv);
-#if dcn == 4
+                dst[BIDX]     = convert_uchar_sat(y00 + buv);
+#if DCN == 4
                 dst[3]        = 255;
 #endif
 
-                dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv);
-                dst[dcn + 1]        = convert_uchar_sat(y01 + guv);
-                dst[dcn + bidx]     = convert_uchar_sat(y01 + buv);
-#if dcn == 4
+                dst[DCN + 2 - BIDX] = convert_uchar_sat(y01 + ruv);
+                dst[DCN + 1]        = convert_uchar_sat(y01 + guv);
+                dst[DCN + BIDX]     = convert_uchar_sat(y01 + buv);
+#if DCN == 4
                 dst[7]        = 255;
 #endif
             }
@@ -568,6 +568,76 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of
     }
 }
 
+// Coefficients based on ITU.BT-601, ISBN 1-878707-09-4 (https://fourcc.org/fccyvrgb.php)
+// The conversion coefficients for RGB to YUV422 are based on the ones for RGB to YUV.
+// For both Y components, the coefficients are applied as given in the link to each input RGB pixel
+// separately. For U and V, they are reduced by half to account for two RGB pixels contributing
+// to the same U and V values. In other words, the U and V contributions from the two RGB pixels
+// are averaged. The integer versions are obtained by multiplying the float versions by 16384
+// and rounding to the nearest integer.
+
+__constant float c_RGB2YUV422Coeffs_f[10]  = {0.0625, 0.5, 0.257, 0.504, 0.098, -0.074 , -0.1455, 0.2195, -0.184 , -0.0355};
+__constant int   c_RGB2YUV422Coeffs_i[10]  = {1024 * HALF_MAX_NUM * 2, 8192 * HALF_MAX_NUM * 2, 4211,  8258,  1606, -1212, -2384,  3596, -3015,  -582};
+
+__kernel void RGB2YUV_422(__global const uchar* srcptr, int src_step, int src_offset,
+                          __global uchar* dstptr, int dst_step, int dst_offset,
+                          int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols/2)
+    {
+        int src_index = mad24(y, src_step, mad24(x << 1, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x << 1, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
+                __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
+                DATA_TYPE_3 src_pix1 = vload3(0, src);
+                DATA_TYPE b1 = src_pix1.B_COMP, g1 = src_pix1.G_COMP, r1 = src_pix1.R_COMP;
+                DATA_TYPE_3 src_pix2 = vload3(0, src+SCN);
+                DATA_TYPE b2 = src_pix2.B_COMP, g2 = src_pix2.G_COMP, r2 = src_pix2.R_COMP;
+
+
+#ifdef DEPTH_5
+                __constant float * coeffs = c_RGB2YUV422Coeffs_f;
+                #define MAC_fn fma
+                #define res_dtype DATA_TYPE
+                #define mul_fn(x,y) (x*y)
+                #define output_scale_fn(x) x
+#else
+                __constant int * coeffs = c_RGB2YUV422Coeffs_i;
+                #define MAC_fn mad24
+                #define res_dtype int
+                #define mul_fn mul24
+                #define output_scale_fn(x) SAT_CAST(CV_DESCALE(x, yuv_shift))
+#endif
+
+                const res_dtype Y1 = MAC_fn(coeffs[2], r1, coeffs[0] + MAC_fn(coeffs[3], g1, mul_fn(coeffs[4], b1)));
+                const res_dtype Y2 = MAC_fn(coeffs[2], r2, coeffs[0] + MAC_fn(coeffs[3], g2, mul_fn(coeffs[4], b2)));
+
+                const res_dtype sr = r1+r2, sg = g1+g2, sb = b1+b2;
+                const res_dtype U = MAC_fn(coeffs[5], sr, coeffs[1] + MAC_fn(coeffs[6], sg, mul_fn(coeffs[7], sb)));
+                const res_dtype V = MAC_fn(coeffs[7], sr, coeffs[1] + MAC_fn(coeffs[8], sg, mul_fn(coeffs[9], sb)));
+
+                dst[UIDX] = output_scale_fn(U);
+                dst[(2 + UIDX) % 4] = output_scale_fn(V);
+                dst[YIDX] = output_scale_fn(Y1);
+                dst[YIDX+2] = output_scale_fn(Y2);
+
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
 ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
 
 __constant float c_RGB2YCrCbCoeffs_f[5] = {R2YF, G2YF, B2YF, YCRF, YCBF};
@@ -658,10 +728,10 @@ __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
                 int b = yp + CV_DESCALE(coeff[3] * (cb - HALF_MAX_NUM), yuv_shift);
 #endif
 
-                dstptr[(bidx^2)] = SAT_CAST(r);
+                dstptr[(BIDX^2)] = SAT_CAST(r);
                 dstptr[1] = SAT_CAST(g);
-                dstptr[bidx] = SAT_CAST(b);
-#if dcn == 4
+                dstptr[BIDX] = SAT_CAST(b);
+#if DCN == 4
                 dstptr[3] = MAX_NUM;
 #endif
 
diff --git a/modules/imgproc/src/opencl/filter2DSmall.cl b/modules/imgproc/src/opencl/filter2DSmall.cl
index 564bbcfd7381..38f23d2fe27c 100644
--- a/modules/imgproc/src/opencl/filter2DSmall.cl
+++ b/modules/imgproc/src/opencl/filter2DSmall.cl
@@ -256,21 +256,21 @@ inline PX_LOAD_FLOAT_VEC_TYPE readSrcPixelGroup(int2 pos, __global const uchar*
 }
 
 // Macros to ensure unrolled loops
-#define LOOP1(VAR, STMT) (STMT); (VAR)++;
-#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
-#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
-#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
-#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
-#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
-#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
-#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
-#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
-#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
-#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
-#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
-#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
+#define LOOP1(VAR, STMT) STMT; (VAR)++;
+#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); STMT; (VAR)++;
+#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); STMT; (VAR)++;
+#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); STMT; (VAR)++;
+#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); STMT; (VAR)++;
+#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); STMT; (VAR)++;
+#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); STMT; (VAR)++;
+#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); STMT; (VAR)++;
+#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); STMT; (VAR)++;
+#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); STMT; (VAR)++;
+#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); STMT; (VAR)++;
+#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); STMT; (VAR)++;
+#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); STMT; (VAR)++;
 
-#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
+#define LOOP(N, VAR, STMT) CAT(LOOP, N)(VAR, STMT)
 
 #define DIG(a) a,
 __constant WT1 kernelData[] = { COEFF };
diff --git a/modules/imgproc/src/opencl/filterSmall.cl b/modules/imgproc/src/opencl/filterSmall.cl
index 222edc6068f2..2b5cabd3cb0b 100644
--- a/modules/imgproc/src/opencl/filterSmall.cl
+++ b/modules/imgproc/src/opencl/filterSmall.cl
@@ -177,21 +177,21 @@ inline PX_LOAD_FLOAT_VEC_TYPE readSrcPixelGroup(int2 pos, __global const uchar *
 }
 
 // Macros to ensure unrolled loops
-#define LOOP1(VAR, STMT) (STMT); (VAR)++;
-#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
-#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
-#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
-#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
-#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
-#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
-#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
-#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
-#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
-#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
-#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
-#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
-
-#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
+#define LOOP1(VAR, STMT) STMT; (VAR)++;
+#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); STMT; (VAR)++;
+#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); STMT; (VAR)++;
+#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); STMT; (VAR)++;
+#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); STMT; (VAR)++;
+#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); STMT; (VAR)++;
+#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); STMT; (VAR)++;
+#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); STMT; (VAR)++;
+#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); STMT; (VAR)++;
+#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); STMT; (VAR)++;
+#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); STMT; (VAR)++;
+#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); STMT; (VAR)++;
+#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); STMT; (VAR)++;
+
+#define LOOP(N, VAR, STMT) CAT(LOOP, N)(VAR, STMT)
 
 #ifdef OP_BOX_FILTER
 #define PROCESS_ELEM \
diff --git a/modules/imgproc/src/opencl/laplacian5.cl b/modules/imgproc/src/opencl/laplacian5.cl
index 1404a8c51efc..d6f7b9e66485 100644
--- a/modules/imgproc/src/opencl/laplacian5.cl
+++ b/modules/imgproc/src/opencl/laplacian5.cl
@@ -13,25 +13,25 @@
 __kernel void sumConvert(__global const uchar * src1ptr, int src1_step, int src1_offset,
                          __global const uchar * src2ptr, int src2_step, int src2_offset,
                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                         coeffT scale, coeffT delta)
+                         COEFF_T scale, COEFF_T delta)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (y < dst_rows && x < dst_cols)
     {
-        int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(srcT), src1_offset));
-        int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(srcT), src2_offset));
-        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));
+        int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(SRC_T), src1_offset));
+        int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(SRC_T), src2_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(DST_T), dst_offset));
 
-        __global const srcT * src1 = (__global const srcT *)(src1ptr + src1_index);
-        __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index);
-        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+        __global const SRC_T * src1 = (__global const SRC_T *)(src1ptr + src1_index);
+        __global const SRC_T * src2 = (__global const SRC_T *)(src2ptr + src2_index);
+        __global DST_T * dst = (__global DST_T *)(dstptr + dst_index);
 
-#if wdepth <= 4
-        dst[0] = convertToDT( mad24((WT)(scale), convertToWT(src1[0]) + convertToWT(src2[0]), (WT)(delta)) );
+#if WDEPTH <= 4
+        dst[0] = CONVERT_TO_DT( mad24((WT)(scale), CONVERT_TO_WT(src1[0]) + CONVERT_TO_WT(src2[0]), (WT)(delta)) );
 #else
-        dst[0] = convertToDT( mad((WT)(scale), convertToWT(src1[0]) + convertToWT(src2[0]), (WT)(delta)) );
+        dst[0] = CONVERT_TO_DT( mad((WT)(scale), CONVERT_TO_WT(src1[0]) + CONVERT_TO_WT(src2[0]), (WT)(delta)) );
 #endif
     }
 }
@@ -74,18 +74,18 @@ __kernel void sumConvert(__global const uchar * src1ptr, int src1_step, int src1
 #endif
 
 #if CN != 3
-#define loadpix(addr) *(__global const srcT *)(addr)
-#define storepix(val, addr)  *(__global dstT *)(addr) = val
-#define SRCSIZE (int)sizeof(srcT)
-#define DSTSIZE (int)sizeof(dstT)
+#define loadpix(addr) *(__global const SRC_T *)(addr)
+#define storepix(val, addr)  *(__global DST_T *)(addr) = val
+#define SRCSIZE (int)sizeof(SRC_T)
+#define DSTSIZE (int)sizeof(DST_T)
 #else
-#define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))
-#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
-#define SRCSIZE (int)sizeof(srcT1)*3
-#define DSTSIZE (int)sizeof(dstT1)*3
+#define loadpix(addr)  vload3(0, (__global const SRC_T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global DST_T1 *)(addr))
+#define SRCSIZE (int)sizeof(SRC_T1)*3
+#define DSTSIZE (int)sizeof(DST_T1)*3
 #endif
 
-#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))
+#define SRC(_x,_y) CONVERT_TO_WT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))
 
 #ifdef BORDER_CONSTANT
 // CCCCCC|abcdefgh|CCCCCCC
@@ -173,7 +173,7 @@ __kernel void laplacian(__global uchar* Src, int src_step, int srcOffsetX, int s
             }
 
             WT sum = mad(scale_v, (sum1 + sum2), delta_v);
-            storepix(convertToDT(sum), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
+            storepix(CONVERT_TO_DT(sum), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
         }
 
         for (int i = liy * BLK_X + lix; i < (RADIUS*2) * (BLK_X+(RADIUS*2)); i += BLK_X * BLK_Y)
@@ -203,4 +203,4 @@ __kernel void laplacian(__global uchar* Src, int src_step, int srcOffsetX, int s
     }
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/modules/imgproc/src/opencl/pyr_down.cl b/modules/imgproc/src/opencl/pyr_down.cl
index 5d2e7156eff1..22b1472b3e53 100644
--- a/modules/imgproc/src/opencl/pyr_down.cl
+++ b/modules/imgproc/src/opencl/pyr_down.cl
@@ -67,7 +67,7 @@
 #error No extrapolation method
 #endif
 
-#if cn != 3
+#if CN != 3
 #define loadpix(addr)  *(__global const T*)(addr)
 #define storepix(val, addr)  *(__global T*)(addr) = (val)
 #define PIXSIZE ((int)sizeof(T))
@@ -77,9 +77,9 @@
 #define PIXSIZE ((int)sizeof(T1)*3)
 #endif
 
-#define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x)))
+#define SRC(_x,_y) CONVERT_TO_FT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x)))
 
-#if kercn == 4
+#if KERCN == 4
 #define SRC4(_x,_y) convert_float4(vload4(0, srcData + mad24(_y, src_step, PIXSIZE * _x)))
 #endif
 
@@ -107,7 +107,7 @@
     smem[1][col_lcl] = sum1;
 
 
-#if kercn == 4
+#if KERCN == 4
 #define LOAD_LOCAL4(col_gl, col_lcl) \
     sum40 =     co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));           \
     sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40);   \
@@ -131,7 +131,7 @@
 __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                          __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
 {
-    const int x = get_global_id(0)*kercn;
+    const int x = get_global_id(0)*KERCN;
     const int y = 2*get_global_id(1);
 
     __local FT smem[2][LOCAL_SIZE + 4];
@@ -150,7 +150,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
     {
 #undef EXTRAPOLATE_
 #define EXTRAPOLATE_(val, maxVal)   val
-#if kercn == 1
+#if KERCN == 1
         col = EXTRAPOLATE(x, src_cols);
         LOAD_LOCAL(col, 2 + get_local_id(0))
 #else
@@ -183,7 +183,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
     {
 #undef EXTRAPOLATE_
 #define EXTRAPOLATE_(val, maxVal)   EXTRAPOLATE(val, maxVal)
-#if kercn == 1
+#if KERCN == 1
         col = EXTRAPOLATE(x, src_cols);
         LOAD_LOCAL(col, 2 + get_local_id(0))
 #else
@@ -215,7 +215,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-#if kercn == 1
+#if KERCN == 1
     if (get_local_id(0) < LOCAL_SIZE / 2)
     {
         const int tid2 = get_local_id(0) * 2;
@@ -226,8 +226,8 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
         {
             for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
             {
-#if cn == 1
-#if fdepth <= 5
+#if CN == 1
+#if FDEPTH <= 5
                 FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2));
 #else
                 FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2));
@@ -239,7 +239,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
                 sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum);
 #endif
                 sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum);
-                storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE);
+                storepix(CONVERT_TO_T(sum), dstData + yin * dst_step + dst_x * PIXSIZE);
             }
         }
     }
@@ -256,7 +256,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
             sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
             sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
             sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
-            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
+            storepix(CONVERT_TO_T(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
 
             dst_x ++;
             sum =     co3* smem[yin - y][2 + tid4 + 4];
@@ -264,7 +264,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
             sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
             sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum);
             sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum);
-            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
+            storepix(CONVERT_TO_T(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
             dst_x --;
         }
 
@@ -279,7 +279,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
             sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
             sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
 
-            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
+            storepix(CONVERT_TO_T(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
         }
     }
 #endif
diff --git a/modules/imgproc/src/opencl/pyr_up.cl b/modules/imgproc/src/opencl/pyr_up.cl
index d033d7ee4ec4..194be1c8308f 100644
--- a/modules/imgproc/src/opencl/pyr_up.cl
+++ b/modules/imgproc/src/opencl/pyr_up.cl
@@ -58,7 +58,7 @@
 #endif
 #endif
 
-#if cn != 3
+#if CN != 3
 #define loadpix(addr)  *(__global const T*)(addr)
 #define storepix(val, addr)  *(__global T*)(addr) = (val)
 #define PIXSIZE ((int)sizeof(T))
@@ -92,7 +92,7 @@ __kernel void pyrUp(__global const uchar * src, int src_step, int src_offset, in
         int srcx = EXTRAPOLATE(mad24((int)get_group_id(0), LOCAL_SIZE/2, tidx) - 1, src_cols);
         int srcy = EXTRAPOLATE(mad24((int)get_group_id(1), LOCAL_SIZE/2, tidy) - 1, src_rows);
 
-        s_srcPatch[tidy][tidx] = convertToFT(loadpix(srcData + srcy * src_step + srcx * PIXSIZE));
+        s_srcPatch[tidy][tidx] = CONVERT_TO_FT(loadpix(srcData + srcy * src_step + srcx * PIXSIZE));
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -124,7 +124,7 @@ __kernel void pyrUp(__global const uchar * src, int src_step, int src_offset, in
     sum = mad(coefy2, s_dstPatch[1 + ((tidy + 2) >> 1)][tidx], sum);
 
     if ((x < dst_cols) && (y < dst_rows))
-        storepix(convertToT(sum), dstData + y * dst_step + x * PIXSIZE);
+        storepix(CONVERT_TO_T(sum), dstData + y * dst_step + x * PIXSIZE);
 }
 
 
@@ -149,10 +149,10 @@ __kernel void pyrUp_unrolled(__global const uchar * src, int src_step, int src_o
         int srcx2 = EXTRAPOLATE(srcx+1, src_cols);
         int srcy1 = EXTRAPOLATE(srcy, src_rows);
         int srcy2 = EXTRAPOLATE(srcy+1, src_rows);
-        s_srcPatch[ly][lx] = convertToFT(loadpix(srcData + srcy1 * src_step + srcx1 * PIXSIZE));
-        s_srcPatch[ly+1][lx] = convertToFT(loadpix(srcData + srcy2 * src_step + srcx1 * PIXSIZE));
-        s_srcPatch[ly][lx+1] = convertToFT(loadpix(srcData + srcy1 * src_step + srcx2 * PIXSIZE));
-        s_srcPatch[ly+1][lx+1] = convertToFT(loadpix(srcData + srcy2 * src_step + srcx2 * PIXSIZE));
+        s_srcPatch[ly][lx] = CONVERT_TO_FT(loadpix(srcData + srcy1 * src_step + srcx1 * PIXSIZE));
+        s_srcPatch[ly+1][lx] = CONVERT_TO_FT(loadpix(srcData + srcy2 * src_step + srcx1 * PIXSIZE));
+        s_srcPatch[ly][lx+1] = CONVERT_TO_FT(loadpix(srcData + srcy1 * src_step + srcx2 * PIXSIZE));
+        s_srcPatch[ly+1][lx+1] = CONVERT_TO_FT(loadpix(srcData + srcy2 * src_step + srcx2 * PIXSIZE));
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -213,22 +213,22 @@ __kernel void pyrUp_unrolled(__global const uchar * src, int src_step, int src_o
         sum =       co3 * s_dstPatch[1 + get_local_id(1) - 1][lx];
         sum = mad(co1, s_dstPatch[1 + get_local_id(1)    ][lx], sum);
         sum = mad(co3, s_dstPatch[1 + get_local_id(1) + 1][lx], sum);
-        storepix(convertToT(sum), dstData + dst_y * dst_step + dst_x * PIXSIZE);
+        storepix(CONVERT_TO_T(sum), dstData + dst_y * dst_step + dst_x * PIXSIZE);
 
         // (x+1,y)
         sum =       co3 * s_dstPatch[1 + get_local_id(1) - 1][lx+1];
         sum = mad(co1, s_dstPatch[1 + get_local_id(1)    ][lx+1], sum);
         sum = mad(co3, s_dstPatch[1 + get_local_id(1) + 1][lx+1], sum);
-        storepix(convertToT(sum), dstData + dst_y * dst_step + (dst_x+1) * PIXSIZE);
+        storepix(CONVERT_TO_T(sum), dstData + dst_y * dst_step + (dst_x+1) * PIXSIZE);
 
         // (x,y+1)
         sum =       co2 * s_dstPatch[1 + get_local_id(1)    ][lx];
         sum = mad(co2, s_dstPatch[1 + get_local_id(1) + 1][lx], sum);
-        storepix(convertToT(sum), dstData + (dst_y+1) * dst_step + dst_x * PIXSIZE);
+        storepix(CONVERT_TO_T(sum), dstData + (dst_y+1) * dst_step + dst_x * PIXSIZE);
 
         // (x+1,y+1)
         sum =       co2 * s_dstPatch[1 + get_local_id(1)    ][lx+1];
         sum = mad(co2, s_dstPatch[1 + get_local_id(1) + 1][lx+1], sum);
-        storepix(convertToT(sum), dstData + (dst_y+1) * dst_step + (dst_x+1) * PIXSIZE);
+        storepix(CONVERT_TO_T(sum), dstData + (dst_y+1) * dst_step + (dst_x+1) * PIXSIZE);
     }
 }
diff --git a/modules/imgproc/src/opencl/remap.cl b/modules/imgproc/src/opencl/remap.cl
index 1a30c326b97c..9aadde021549 100644
--- a/modules/imgproc/src/opencl/remap.cl
+++ b/modules/imgproc/src/opencl/remap.cl
@@ -53,7 +53,7 @@
 
 #define noconvert
 
-#if cn != 3
+#if CN != 3
 #define loadpix(addr)  *(__global const T*)(addr)
 #define storepix(val, addr)  *(__global T*)(addr) = val
 #define TSIZE ((int)sizeof(T))
@@ -73,7 +73,7 @@ enum
 };
 
 #ifdef INTER_NEAREST
-#define convertToWT
+#define CONVERT_TO_WT
 #endif
 
 #ifdef BORDER_CONSTANT
@@ -82,7 +82,7 @@ enum
 #define EXTRAPOLATE(v2, v) \
     { \
         v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), (int2)(0)); \
-        v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
+        v = CONVERT_TO_WT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
     }
 #elif defined BORDER_WRAP
 #define EXTRAPOLATE(v2, v) \
@@ -96,7 +96,7 @@ enum
             v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \
         if( v2.y >= src_rows ) \
             v2.y %= src_rows; \
-        v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
+        v = CONVERT_TO_WT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
     }
 #elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
 #ifdef BORDER_REFLECT
@@ -130,7 +130,7 @@ enum
                     v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \
             } \
             while (v2.y >= src_rows || v2.y < 0); \
-        v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
+        v = CONVERT_TO_WT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
     }
 #else
 #error No extrapolation method
@@ -147,7 +147,7 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
                             ST nVal)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1) * rowsPerWI;
+    int y = get_global_id(1) * ROWS_PER_WI;
 
     if (x < dst_cols)
     {
@@ -158,7 +158,7 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
         int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
 
         #pragma unroll
-        for (int i = 0; i < rowsPerWI; ++i, ++y,
+        for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
             map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
             if (y < dst_rows)
             {
@@ -168,6 +168,10 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
 
                 int gx = convert_int_sat_rte(map1[0]);
                 int gy = convert_int_sat_rte(map2[0]);
+                #if WARP_RELATIVE
+                gx += x;
+                gy += y;
+                #endif
 
                 if (NEED_EXTRAPOLATION(gx, gy))
                 {
@@ -193,7 +197,7 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
                           ST nVal)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1) * rowsPerWI;
+    int y = get_global_id(1) * ROWS_PER_WI;
 
     if (x < dst_cols)
     {
@@ -202,7 +206,7 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
         int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));
 
         #pragma unroll
-        for (int i = 0; i < rowsPerWI; ++i, ++y,
+        for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
             map_index += map_step, dst_index += dst_step)
             if (y < dst_rows)
             {
@@ -210,6 +214,11 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
                 __global T * dst = (__global T *)(dstptr + dst_index);
 
                 int2 gxy = convert_int2_sat_rte(map[0]);
+                #if WARP_RELATIVE
+                gxy.x += x;
+                gxy.y += y;
+                #endif
+
                 int gx = gxy.x, gy = gxy.y;
 
                 if (NEED_EXTRAPOLATION(gx, gy))
@@ -233,7 +242,7 @@ __kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_o
                           ST nVal)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1) * rowsPerWI;
+    int y = get_global_id(1) * ROWS_PER_WI;
 
     if (x < dst_cols)
     {
@@ -242,7 +251,7 @@ __kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_o
         int map_index = mad24(y, map_step, mad24(x, (int)sizeof(short2), map_offset));
 
         #pragma unroll
-        for (int i = 0; i < rowsPerWI; ++i, ++y,
+        for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
             map_index += map_step, dst_index += dst_step)
             if (y < dst_rows)
             {
@@ -250,6 +259,11 @@ __kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_o
                 __global T * dst = (__global T *)(dstptr + dst_index);
 
                 int2 gxy = convert_int2(map[0]);
+                #if WARP_RELATIVE
+                gxy.x += x;
+                gxy.y += y;
+                #endif
+
                 int gx = gxy.x, gy = gxy.y;
 
                 if (NEED_EXTRAPOLATION(gx, gy))
@@ -274,7 +288,7 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
                                 ST nVal)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1) * rowsPerWI;
+    int y = get_global_id(1) * ROWS_PER_WI;
 
     if (x < dst_cols)
     {
@@ -284,7 +298,7 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
         int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));
 
         #pragma unroll
-        for (int i = 0; i < rowsPerWI; ++i, ++y,
+        for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
             map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
             if (y < dst_rows)
             {
@@ -296,6 +310,11 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
                 int dx = (map2Value & (INTER_TAB_SIZE - 1)) < (INTER_TAB_SIZE >> 1) ? 1 : 0;
                 int dy = (map2Value >> INTER_BITS) < (INTER_TAB_SIZE >> 1) ? 1 : 0;
                 int2 gxy = convert_int2(map1[0]) + (int2)(dx, dy);
+                #if WARP_RELATIVE
+                gxy.x += x;
+                gxy.y += y;
+                #endif
+
                 int gx = gxy.x, gy = gxy.y;
 
                 if (NEED_EXTRAPOLATION(gx, gy))
@@ -330,17 +349,17 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
                                 ST nVal)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1) * rowsPerWI;
+    int y = get_global_id(1) * ROWS_PER_WI;
 
     if (x < dst_cols)
     {
-        WT scalar = convertToWT(convertScalar(nVal));
+        WT scalar = CONVERT_TO_WT(convertScalar(nVal));
         int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
         int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));
         int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));
 
         #pragma unroll
-        for (int i = 0; i < rowsPerWI; ++i, ++y,
+        for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
             map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
             if (y < dst_rows)
             {
@@ -349,6 +368,10 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
                 __global T * dst = (__global T *)(dstptr + dst_index);
 
                 int2 map_dataA = convert_int2(map1[0]);
+                #if WARP_RELATIVE
+                map_dataA.x += x;
+                map_dataA.y += y;
+                #endif
                 int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
                 int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
                 int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
@@ -359,22 +382,22 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
                 WT a = scalar, b = scalar, c = scalar, d = scalar;
 
                 if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
-                    a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
+                    a = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
                 else
                     EXTRAPOLATE(map_dataA, a);
 
                 if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
-                    b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
+                    b = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
                 else
                     EXTRAPOLATE(map_dataB, b);
 
                 if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
-                    c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
+                    c = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
                 else
                     EXTRAPOLATE(map_dataC, c);
 
                 if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
-                    d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
+                    d = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
                 else
                     EXTRAPOLATE(map_dataD, d);
 
@@ -382,7 +405,7 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
                               b * (u.x)     * (1 - u.y) +
                               c * (1 - u.x) * (u.y) +
                               d * (u.x)     * (u.y);
-                storepix(convertToT(dst_data), dst);
+                storepix(CONVERT_TO_T(dst_data), dst);
             }
     }
 }
@@ -394,17 +417,17 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
                             ST nVal)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1) * rowsPerWI;
+    int y = get_global_id(1) * ROWS_PER_WI;
 
     if (x < dst_cols)
     {
-        WT scalar = convertToWT(convertScalar(nVal));
+        WT scalar = CONVERT_TO_WT(convertScalar(nVal));
         int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
         int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));
         int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));
 
         #pragma unroll
-        for (int i = 0; i < rowsPerWI; ++i, ++y,
+        for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
             map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
             if (y < dst_rows)
             {
@@ -414,8 +437,12 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
 
 #if defined BORDER_CONSTANT
                 float xf = map1[0], yf = map2[0];
-                int sx = convert_int_sat_rtz(mad(xf, (float)INTER_TAB_SIZE, 0.5f)) >> INTER_BITS;
-                int sy = convert_int_sat_rtz(mad(yf, (float)INTER_TAB_SIZE, 0.5f)) >> INTER_BITS;
+                int sx = (convert_int_sat_rtz(mad(xf, (float)INTER_TAB_SIZE, 0.5f)) >> INTER_BITS);
+                int sy = (convert_int_sat_rtz(mad(yf, (float)INTER_TAB_SIZE, 0.5f)) >> INTER_BITS);
+                #if WARP_RELATIVE
+                sx += x;
+                sy += y;
+                #endif
 
                 __constant float * coeffs_x = coeffs + ((convert_int_rte(xf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);
                 __constant float * coeffs_y = coeffs + ((convert_int_rte(yf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);
@@ -431,13 +458,13 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
                         xsum = (WT)(0);
                         if (sx >= 0 && sx + 2 < src_cols)
                         {
-#if depth == 0 && cn == 1
+#if SRC_DEPTH == 0 && CN == 1
                             uchar2 value = vload2(0, srcptr + src_index);
                             xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
 #else
                             #pragma unroll
                             for (int xp = 0; xp < 2; ++xp)
-                                xsum = fma(convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))), coeffs_x[xp], xsum);
+                                xsum = fma(CONVERT_TO_WT(loadpix(srcptr + mad24(xp, TSIZE, src_index))), coeffs_x[xp], xsum);
 #endif
                         }
                         else
@@ -445,7 +472,7 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
                             #pragma unroll
                             for (int xp = 0; xp < 2; ++xp)
                                 xsum = fma(sx + xp >= 0 && sx + xp < src_cols ?
-                                           convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))) : scalar, coeffs_x[xp], xsum);
+                                           CONVERT_TO_WT(loadpix(srcptr + mad24(xp, TSIZE, src_index))) : scalar, coeffs_x[xp], xsum);
                         }
                         sum = fma(xsum, coeffs_y[yp], sum);
                     }
@@ -453,9 +480,13 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
                         sum = fma(scalar, coeffs_y[yp], sum);
                 }
 
-                storepix(convertToT(sum), dst);
+                storepix(CONVERT_TO_T(sum), dst);
 #else
                 float2 map_data = (float2)(map1[0], map2[0]);
+                #if WARP_RELATIVE
+                map_data.x += x;
+                map_data.y += y;
+                #endif
 
                 int2 map_dataA = convert_int2_sat_rtn(map_data);
                 int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
@@ -463,27 +494,27 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
                 int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
 
                 float2 _u = map_data - convert_float2(map_dataA);
-                WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
-                WT scalar = convertToWT(convertScalar(nVal));
+                WT2 u = CONVERT_TO_WT2(convert_int2_rte(CONVERT_TO_WT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
+                WT scalar = CONVERT_TO_WT(convertScalar(nVal));
                 WT a = scalar, b = scalar, c = scalar, d = scalar;
 
                 if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
-                    a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
+                    a = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
                 else
                     EXTRAPOLATE(map_dataA, a);
 
                 if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
-                    b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
+                    b = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
                 else
                     EXTRAPOLATE(map_dataB, b);
 
                 if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
-                    c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
+                    c = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
                 else
                     EXTRAPOLATE(map_dataC, c);
 
                 if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
-                    d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
+                    d = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
                 else
                     EXTRAPOLATE(map_dataD, d);
 
@@ -491,7 +522,7 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
                               b * (u.x)     * (1 - u.y) +
                               c * (1 - u.x) * (u.y) +
                               d * (u.x)     * (u.y);
-                storepix(convertToT(dst_data), dst);
+                storepix(CONVERT_TO_T(dst_data), dst);
 #endif
             }
     }
@@ -503,16 +534,16 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
                           ST nVal)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1) * rowsPerWI;
+    int y = get_global_id(1) * ROWS_PER_WI;
 
     if (x < dst_cols)
     {
-        WT scalar = convertToWT(convertScalar(nVal));
+        WT scalar = CONVERT_TO_WT(convertScalar(nVal));
         int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
         int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));
 
         #pragma unroll
-        for (int i = 0; i < rowsPerWI; ++i, ++y,
+        for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
             map_index += map_step, dst_index += dst_step)
             if (y < dst_rows)
             {
@@ -520,32 +551,36 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
                 __global T * dst = (__global T *)(dstptr + dst_index);
 
                 float2 map_data = map[0];
+                #if WARP_RELATIVE
+                map_data.x += x;
+                map_data.y += y;
+                #endif
                 int2 map_dataA = convert_int2_sat_rtn(map_data);
                 int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
                 int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
                 int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
 
                 float2 _u = map_data - convert_float2(map_dataA);
-                WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
+                WT2 u = CONVERT_TO_WT2(convert_int2_rte(CONVERT_TO_WT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
                 WT a = scalar, b = scalar, c = scalar, d = scalar;
 
                 if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
-                    a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
+                    a = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
                 else
                     EXTRAPOLATE(map_dataA, a);
 
                 if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
-                    b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
+                    b = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
                 else
                     EXTRAPOLATE(map_dataB, b);
 
                 if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
-                    c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
+                    c = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
                 else
                     EXTRAPOLATE(map_dataC, c);
 
                 if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
-                    d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
+                    d = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
                 else
                     EXTRAPOLATE(map_dataD, d);
 
@@ -553,7 +588,7 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
                               b * (u.x)     * (1 - u.y) +
                               c * (1 - u.x) * (u.y) +
                               d * (u.x)     * (u.y);
-                storepix(convertToT(dst_data), dst);
+                storepix(CONVERT_TO_T(dst_data), dst);
             }
     }
 }
diff --git a/modules/imgproc/src/opencl/resize.cl b/modules/imgproc/src/opencl/resize.cl
index a28c59296e00..e31c80dc3bba 100644
--- a/modules/imgproc/src/opencl/resize.cl
+++ b/modules/imgproc/src/opencl/resize.cl
@@ -55,44 +55,44 @@
 
 #define noconvert
 
-#if cn != 3
+#if CN != 3
 #define loadpix(addr)  *(__global const T *)(addr)
 #define storepix(val, addr)  *(__global T *)(addr) = val
 #define TSIZE (int)sizeof(T)
 #else
 #define loadpix(addr)  vload3(0, (__global const T1 *)(addr))
 #define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
-#define TSIZE (int)sizeof(T1)*cn
+#define TSIZE (int)sizeof(T1)*CN
 #endif
 
 #if defined USE_SAMPLER
 
-#if cn == 1
+#if CN == 1
 #define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).x
 #define INTERMEDIATE_TYPE  float
-#elif cn == 2
+#elif CN == 2
 #define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).xy
 #define INTERMEDIATE_TYPE  float2
-#elif cn == 3
+#elif CN == 3
 #define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).xyz
 #define INTERMEDIATE_TYPE  float3
-#elif cn == 4
+#elif CN == 4
 #define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z)
 #define INTERMEDIATE_TYPE  float4
 #endif
 
 #define __CAT(x, y) x##y
 #define CAT(x, y) __CAT(x, y)
-//#define INTERMEDIATE_TYPE CAT(float, cn)
+//#define INTERMEDIATE_TYPE CAT(float, CN)
 #define float1 float
 
-#if depth == 0
+#if SRC_DEPTH == 0
 #define RESULT_SCALE    255.0f
-#elif depth == 1
+#elif SRC_DEPTH == 1
 #define RESULT_SCALE    127.0f
-#elif depth == 2
+#elif SRC_DEPTH == 2
 #define RESULT_SCALE    65535.0f
-#elif depth == 3
+#elif SRC_DEPTH == 3
 #define RESULT_SCALE    32767.0f
 #else
 #define RESULT_SCALE    1.0f
@@ -114,10 +114,10 @@ __kernel void resizeSampler(__read_only image2d_t srcImage,
 
     INTERMEDIATE_TYPE intermediate = READ_IMAGE(srcImage, sampler, (float2)(sx, sy));
 
-#if depth <= 4
-    T uval = convertToDT(round(intermediate * RESULT_SCALE));
+#if SRC_DEPTH <= 4
+    T uval = CONVERT_TO_DT(round(intermediate * RESULT_SCALE));
 #else
-    T uval = convertToDT(intermediate * RESULT_SCALE);
+    T uval = CONVERT_TO_DT(intermediate * RESULT_SCALE);
 #endif
 
     if(dx < dstcols && dy < dstrows)
@@ -149,15 +149,15 @@ __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offs
 
         int src_index0 = mad24(sy0, src_step, mad24(sx0, TSIZE, src_offset)),
         src_index1 = mad24(sy1, src_step, mad24(sx0, TSIZE, src_offset));
-        WT data0 = convertToWT(loadpix(srcptr + src_index0));
-        WT data1 = convertToWT(loadpix(srcptr + src_index0 + TSIZE));
-        WT data2 = convertToWT(loadpix(srcptr + src_index1));
-        WT data3 = convertToWT(loadpix(srcptr + src_index1 + TSIZE));
+        WT data0 = CONVERT_TO_WT(loadpix(srcptr + src_index0));
+        WT data1 = CONVERT_TO_WT(loadpix(srcptr + src_index0 + TSIZE));
+        WT data2 = CONVERT_TO_WT(loadpix(srcptr + src_index1));
+        WT data3 = CONVERT_TO_WT(loadpix(srcptr + src_index1 + TSIZE));
 
         WT val = ( (((data0 * a0 + data1 * a1) >> 4) * b0) >> 16) +
                  ( (((data2 * a0 + data3 * a1) >> 4) * b1) >> 16);
 
-        storepix(convertToDT((val + 2) >> 2),
+        storepix(CONVERT_TO_DT((val + 2) >> 2),
                  dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
     }
 }
@@ -186,7 +186,7 @@ __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offs
         int y_ = INC(y, src_rows);
         int x_ = INC(x, src_cols);
 
-#if depth <= 1  // 8U/8S only, 16U+ cause integer overflows
+#if SRC_DEPTH <= 1  // 8U/8S only, 16U+ cause integer overflows
 #define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
 #define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
         u = u * INTER_RESIZE_COEF_SCALE;
@@ -197,24 +197,24 @@ __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offs
         int U1 = rint(INTER_RESIZE_COEF_SCALE - u);
         int V1 = rint(INTER_RESIZE_COEF_SCALE - v);
 
-        WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));
-        WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));
-        WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));
-        WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));
+        WT data0 = CONVERT_TO_WT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));
+        WT data1 = CONVERT_TO_WT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));
+        WT data2 = CONVERT_TO_WT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));
+        WT data3 = CONVERT_TO_WT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));
 
         WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) +
                    mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3);
 
-        T uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
+        T uval = CONVERT_TO_DT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
 #else
         float u1 = 1.f - u;
         float v1 = 1.f - v;
-        WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));
-        WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));
-        WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));
-        WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));
+        WT data0 = CONVERT_TO_WT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));
+        WT data1 = CONVERT_TO_WT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));
+        WT data2 = CONVERT_TO_WT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));
+        WT data3 = CONVERT_TO_WT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));
 
-        T uval = convertToDT((u1 * v1) * data0 + (u * v1) * data1 + (u1 * v) * data2 + (u * v) * data3);
+        T uval = CONVERT_TO_DT((u1 * v1) * data0 + (u * v1) * data1 + (u1 * v) * data2 + (u * v) * data3);
 #endif
         storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
     }
@@ -268,11 +268,11 @@ __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_
             for (int px = 0; px < XSCALE; ++px)
             {
                 int x = min(sx + px, src_cols - 1);
-                sum += convertToWTV(loadpix(src + src_index + x*TSIZE));
+                sum += CONVERT_TO_WTV(loadpix(src + src_index + x*TSIZE));
             }
         }
 
-        storepix(convertToT(convertToWT2V(sum) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_index));
+        storepix(CONVERT_TO_T(CONVERT_TO_WT2V(sum) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_index));
     }
 }
 
@@ -314,12 +314,12 @@ __kernel void resizeAREA(__global const uchar * src, int src_step, int src_offse
             for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
             {
                 WTV alpha = (WTV)(xalpha_tab[xk]);
-                buf += convertToWTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;
+                buf += CONVERT_TO_WTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;
             }
             sum += buf * beta;
         }
 
-        storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_index));
+    storepix(CONVERT_TO_T(sum), dst + mad24(dx, TSIZE, dst_index));
     }
 }
 
diff --git a/modules/imgproc/src/opencl/warp_affine.cl b/modules/imgproc/src/opencl/warp_affine.cl
index bfbd0a6bfbcb..d937e237ff44 100644
--- a/modules/imgproc/src/opencl/warp_affine.cl
+++ b/modules/imgproc/src/opencl/warp_affine.cl
@@ -66,7 +66,7 @@
 #define ST T
 #endif
 
-#if cn != 3
+#if CN != 3
 #define loadpix(addr)  *(__global const T*)(addr)
 #define storepix(val, addr)  *(__global T*)(addr) = val
 #define scalar scalar_
@@ -89,7 +89,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
                          __constant CT * M, ST scalar_)
 {
     int dx = get_global_id(0);
-    int dy0 = get_global_id(1) * rowsPerWI;
+    int dy0 = get_global_id(1) * ROWS_PER_WI;
 
     if (dx < dst_cols)
     {
@@ -99,7 +99,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
         int Y0_ = rint(M[3] * dx * AB_SCALE);
         int dst_index = mad24(dy0, dst_step, mad24(dx, pixsize, dst_offset));
 
-        for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy, dst_index += dst_step)
+        for (int dy = dy0, dy1 = min(dst_rows, dy0 + ROWS_PER_WI); dy < dy1; ++dy, dst_index += dst_step)
         {
             int X0 = X0_ + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + round_delta;
             int Y0 = Y0_ + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + round_delta;
@@ -133,7 +133,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
                          __constant CT * M, ST scalar_)
 {
     int dx = get_global_id(0);
-    int dy0 = get_global_id(1) * rowsPerWI;
+    int dy0 = get_global_id(1) * ROWS_PER_WI;
 
     if (dx < dst_cols)
     {
@@ -141,7 +141,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
         int X0_ = rint(M[0] * tmp);
         int Y0_ = rint(M[3] * tmp);
 
-        for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy)
+        for (int dy = dy0, dy1 = min(dst_rows, dy0 + ROWS_PER_WI); dy < dy1; ++dy)
         {
             int X0 = X0_ + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + ROUND_DELTA;
             int Y0 = Y0_ + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + ROUND_DELTA;
@@ -151,21 +151,21 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
             short sx = convert_short_sat(X0 >> INTER_BITS), sy = convert_short_sat(Y0 >> INTER_BITS);
             short ax = convert_short(X0 & (INTER_TAB_SIZE-1)), ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
 
-#if defined AMD_DEVICE || depth > 4
+#if defined AMD_DEVICE || SRC_DEPTH > 4
             WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;
             if (sx >= 0 && sx < src_cols)
             {
                 if (sy >= 0 && sy < src_rows)
-                    v0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset))));
+                    v0 = CONVERT_TO_WT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset))));
                 if (sy+1 >= 0 && sy+1 < src_rows)
-                    v2 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx, pixsize, src_offset))));
+                    v2 = CONVERT_TO_WT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx, pixsize, src_offset))));
             }
             if (sx+1 >= 0 && sx+1 < src_cols)
             {
                 if (sy >= 0 && sy < src_rows)
-                    v1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx+1, pixsize, src_offset))));
+                    v1 = CONVERT_TO_WT(loadpix(srcptr + mad24(sy, src_step, mad24(sx+1, pixsize, src_offset))));
                 if (sy+1 >= 0 && sy+1 < src_rows)
-                    v3 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx+1, pixsize, src_offset))));
+                    v3 = CONVERT_TO_WT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx+1, pixsize, src_offset))));
             }
 
             float taby = 1.f/INTER_TAB_SIZE*ay;
@@ -173,18 +173,18 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
 
             int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
 
-#if depth <= 4
+#if SRC_DEPTH <= 4
             int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
             int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );
             int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
             int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
 
             WT val = mad24(v0, itab0, mad24(v1, itab1, mad24(v2, itab2, v3 * itab3)));
-            storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
+            storepix(CONVERT_TO_T((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
 #else
             float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
             WT val = fma(tabx2, fma(v0, taby2, v2 * taby), tabx * fma(v1, taby2, v3 * taby));
-            storepix(convertToT(val), dstptr + dst_index);
+            storepix(CONVERT_TO_T(val), dstptr + dst_index);
 #endif
 #else // INTEL_DEVICE
             __constant float * coeffs_y = coeffs + (ay << 1), * coeffs_x = coeffs + (ax << 1);
@@ -202,13 +202,13 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
                     xsum = (WT)(0);
                     if (sx >= 0 && sx + 2 < src_cols)
                     {
-#if depth == 0 && cn == 1
+#if SRC_DEPTH == 0 && CN == 1
                         uchar2 value = vload2(0, srcptr + src_index);
                         xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
 #else
                         #pragma unroll
                         for (int x = 0; x < 2; x++)
-                            xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
+                            xsum = fma(CONVERT_TO_WT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
 #endif
                     }
                     else
@@ -216,7 +216,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
                         #pragma unroll
                         for (int x = 0; x < 2; x++)
                             xsum = fma(sx + x >= 0 && sx + x < src_cols ?
-                                       convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
+                                       CONVERT_TO_WT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
                     }
                     sum = fma(xsum, coeffs_y[y], sum);
                 }
@@ -224,7 +224,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
                     sum = fma(scalar, coeffs_y[y], sum);
             }
 
-            storepix(convertToT(sum), dstptr + dst_index);
+            storepix(CONVERT_TO_T(sum), dstptr + dst_index);
 #endif
         }
     }
@@ -290,7 +290,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
                 #pragma unroll
                 for (int x = 0; x < 4; x++)
                     v[mad24(y, 4, x)] = sx+x >= 0 && sx+x < src_cols ?
-                        convertToWT(loadpix(srcptr + mad24(sy+y, src_step, mad24(sx+x, pixsize, src_offset)))) : scalar;
+                        CONVERT_TO_WT(loadpix(srcptr + mad24(sy+y, src_step, mad24(sx+x, pixsize, src_offset)))) : scalar;
             }
             else
             {
@@ -310,7 +310,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
         int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
 
         WT sum = (WT)(0);
-#if depth <= 4
+#if SRC_DEPTH <= 4
         int itab[16];
 
         #pragma unroll
@@ -320,12 +320,12 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
         #pragma unroll
         for (int i = 0; i < 16; i++)
             sum = mad24(v[i], itab[i], sum);
-        storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
+        storepix(CONVERT_TO_T( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
 #else
         #pragma unroll
         for (int i = 0; i < 16; i++)
             sum = fma(v[i], tab1y[(i>>2)] * tab1x[(i&3)], sum);
-        storepix(convertToT( sum ), dstptr + dst_index);
+        storepix(CONVERT_TO_T( sum ), dstptr + dst_index);
 #endif
 #else // INTEL_DEVICE
         __constant float * coeffs_y = coeffs + (ay << 2), * coeffs_x = coeffs + (ax << 2);
@@ -343,13 +343,13 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
                 xsum = (WT)(0);
                 if (sx >= 0 && sx + 4 < src_cols)
                 {
-#if depth == 0 && cn == 1
+#if SRC_DEPTH == 0 && CN == 1
                     uchar4 value = vload4(0, srcptr + src_index);
                     xsum = dot(convert_float4(value), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
 #else
                     #pragma unroll
                     for (int x = 0; x < 4; x++)
-                        xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
+                        xsum = fma(CONVERT_TO_WT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
 #endif
                 }
                 else
@@ -357,7 +357,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
                     #pragma unroll
                     for (int x = 0; x < 4; x++)
                         xsum = fma(sx + x >= 0 && sx + x < src_cols ?
-                                   convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
+                                   CONVERT_TO_WT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
                 }
                 sum = fma(xsum, coeffs_y[y], sum);
             }
@@ -365,7 +365,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
                 sum = fma(scalar, coeffs_y[y], sum);
         }
 
-        storepix(convertToT(sum), dstptr + dst_index);
+        storepix(CONVERT_TO_T(sum), dstptr + dst_index);
 #endif
     }
 }
diff --git a/modules/imgproc/src/opencl/warp_perspective.cl b/modules/imgproc/src/opencl/warp_perspective.cl
index 20e3a274040e..06bc2bd7fa19 100644
--- a/modules/imgproc/src/opencl/warp_perspective.cl
+++ b/modules/imgproc/src/opencl/warp_perspective.cl
@@ -65,7 +65,7 @@
 #define ST T
 #endif
 
-#if cn != 3
+#if CN != 3
 #define loadpix(addr)  *(__global const T*)(addr)
 #define storepix(val, addr)  *(__global T*)(addr) = val
 #define scalar scalar_
@@ -134,31 +134,31 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
         short ax = (short)(X & (INTER_TAB_SIZE - 1));
 
         WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ?
-            convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar;
+            CONVERT_TO_WT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar;
         WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ?
-            convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar;
+            CONVERT_TO_WT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar;
         WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
-            convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar;
+            CONVERT_TO_WT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar;
         WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
-            convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar;
+            CONVERT_TO_WT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar;
 
         float taby = 1.f/INTER_TAB_SIZE*ay;
         float tabx = 1.f/INTER_TAB_SIZE*ax;
 
         int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
 
-#if depth <= 4
+#if SRC_DEPTH <= 4
         int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
         int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );
         int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
         int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
 
         WT val = v0 * itab0 +  v1 * itab1 + v2 * itab2 + v3 * itab3;
-        storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
+        storepix(CONVERT_TO_T((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
 #else
         float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
         WT val = v0 * tabx2 * taby2 +  v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby;
-        storepix(convertToT(val), dstptr + dst_index);
+        storepix(CONVERT_TO_T(val), dstptr + dst_index);
 #endif
     }
 }
@@ -201,7 +201,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
             #pragma unroll
             for (int x = 0; x < 4; x++)
                 v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ?
-                    convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar;
+                    CONVERT_TO_WT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar;
 
         float tab1y[4], tab1x[4];
 
@@ -213,7 +213,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
         int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
 
         WT sum = (WT)(0);
-#if depth <= 4
+#if SRC_DEPTH <= 4
         int itab[16];
 
         #pragma unroll
@@ -223,12 +223,12 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
         #pragma unroll
         for (int i = 0; i < 16; i++)
             sum += v[i] * itab[i];
-        storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
+        storepix(CONVERT_TO_T( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
 #else
         #pragma unroll
         for (int i = 0; i < 16; i++)
             sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)];
-        storepix(convertToT( sum ), dstptr + dst_index);
+        storepix(CONVERT_TO_T( sum ), dstptr + dst_index);
 #endif
     }
 }
diff --git a/modules/imgproc/src/phasecorr.cpp b/modules/imgproc/src/phasecorr.cpp
index 9db436673ca5..dadc3a3da7f4 100644
--- a/modules/imgproc/src/phasecorr.cpp
+++ b/modules/imgproc/src/phasecorr.cpp
@@ -613,7 +613,7 @@ void cv::createHanningWindow(OutputArray _dst, cv::Size winSize, int type)
     AutoBuffer<double> _wc(cols);
     double* const wc = _wc.data();
 
-    double coeff0 = 2.0 * CV_PI / (double)(cols - 1), coeff1 = 2.0f * CV_PI / (double)(rows - 1);
+    double coeff0 = 2.0 * CV_PI / (double)(cols - 1), coeff1 = 2.0 * CV_PI / (double)(rows - 1);
     for(int j = 0; j < cols; j++)
         wc[j] = 0.5 * (1.0 - cos(coeff0 * j));
 
diff --git a/modules/imgproc/src/precomp.hpp b/modules/imgproc/src/precomp.hpp
index a72d2a4d2aa7..33a921c78a94 100644
--- a/modules/imgproc/src/precomp.hpp
+++ b/modules/imgproc/src/precomp.hpp
@@ -50,6 +50,8 @@
 #include "opencv2/core/private.hpp"
 #include "opencv2/core/ocl.hpp"
 #include "opencv2/core/hal/hal.hpp"
+#include "opencv2/core/check.hpp"
+#include "opencv2/core/utils/buffer_area.private.hpp"
 #include "opencv2/imgproc/hal/hal.hpp"
 #include "hal_replacement.hpp"
 
@@ -59,6 +61,7 @@
 #include <stdio.h>
 #include <limits.h>
 #include <float.h>
+#include <stack>
 
 #define GET_OPTIMIZED(func) (func)
 
@@ -112,7 +115,7 @@ inline bool isStorageOrMat(void * arr)
         return true;
     else if (CV_IS_MAT( arr ))
         return false;
-    CV_Error( CV_StsBadArg, "Destination is not CvMemStorage* nor CvMat*" );
+    CV_Error( cv::Error::StsBadArg, "Destination is not CvMemStorage* nor CvMat*" );
 }
 
 
diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index c13354406968..64667a5f974a 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -84,7 +84,7 @@ template<typename T1, typename T2> int PyrUpVecV(T1**, T2**, int) { return 0; }
 
 template<typename T1, typename T2> int PyrUpVecVOneRow(T1**, T2*, int) { return 0; }
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 template<> int PyrDownVecH<uchar, int, 1>(const uchar* src, int* row, int width)
 {
@@ -93,10 +93,8 @@ template<> int PyrDownVecH<uchar, int, 1>(const uchar* src, int* row, int width)
 
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_reinterpret_as_s16(vx_load_expand(src01)), v_1_4) +
-                     v_dotprod(v_reinterpret_as_s16(vx_load_expand(src23)), v_6_4) +
-                     (v_reinterpret_as_s32(vx_load_expand(src4)) >> 16));
+    for (; x <= width - VTraits<v_int32>::vlanes(); x += VTraits<v_int32>::vlanes(), src01 += VTraits<v_int16>::vlanes(), src23 += VTraits<v_int16>::vlanes(), src4 += VTraits<v_int16>::vlanes(), row += VTraits<v_int32>::vlanes())
+        v_store(row, v_add(v_add(v_dotprod(v_reinterpret_as_s16(vx_load_expand(src01)), v_1_4), v_dotprod(v_reinterpret_as_s16(vx_load_expand(src23)), v_6_4)), v_shr<16>(v_reinterpret_as_s32(vx_load_expand(src4)))));
     vx_cleanup();
 
     return x;
@@ -108,42 +106,40 @@ template<> int PyrDownVecH<uchar, int, 2>(const uchar* src, int* row, int width)
 
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4) +
-                     v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4) +
-                     (v_reinterpret_as_s32(v_interleave_pairs(vx_load_expand(src4))) >> 16));
+    for (; x <= width - VTraits<v_int32>::vlanes(); x += VTraits<v_int32>::vlanes(), src01 += VTraits<v_int16>::vlanes(), src23 += VTraits<v_int16>::vlanes(), src4 += VTraits<v_int16>::vlanes(), row += VTraits<v_int32>::vlanes())
+        v_store(row, v_add(v_add(v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4), v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4)), v_shr<16>(v_reinterpret_as_s32(v_interleave_pairs(vx_load_expand(src4))))));
     vx_cleanup();
 
     return x;
 }
 template<> int PyrDownVecH<uchar, int, 3>(const uchar* src, int* row, int width)
 {
-    int idx[v_int8::nlanes/2 + 4];
-    for (int i = 0; i < v_int8::nlanes/4 + 2; i++)
+    int idx[VTraits<v_int8>::max_nlanes/2 + 4];
+    for (int i = 0; i < VTraits<v_int8>::vlanes()/4 + 2; i++)
     {
         idx[i] = 6*i;
-        idx[i + v_int8::nlanes/4 + 2] = 6*i + 3;
+        idx[i + VTraits<v_int8>::vlanes()/4 + 2] = 6*i + 3;
     }
 
     int x = 0;
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int8::nlanes; x += 3*v_int8::nlanes/4, src += 6*v_int8::nlanes/4, row += 3*v_int8::nlanes/4)
+    for (; x <= width - VTraits<v_int8>::vlanes(); x += 3*VTraits<v_int8>::vlanes()/4, src += 6*VTraits<v_int8>::vlanes()/4, row += 3*VTraits<v_int8>::vlanes()/4)
     {
         v_uint16 r0l, r0h, r1l, r1h, r2l, r2h, r3l, r3h, r4l, r4h;
         v_expand(vx_lut_quads(src, idx                       ), r0l, r0h);
-        v_expand(vx_lut_quads(src, idx + v_int8::nlanes/4 + 2), r1l, r1h);
+        v_expand(vx_lut_quads(src, idx + VTraits<v_int8>::vlanes()/4 + 2), r1l, r1h);
         v_expand(vx_lut_quads(src, idx + 1                   ), r2l, r2h);
-        v_expand(vx_lut_quads(src, idx + v_int8::nlanes/4 + 3), r3l, r3h);
+        v_expand(vx_lut_quads(src, idx + VTraits<v_int8>::vlanes()/4 + 3), r3l, r3h);
         v_expand(vx_lut_quads(src, idx + 2                   ), r4l, r4h);
 
-        v_zip(r2l, r1l + r3l, r1l, r3l);
-        v_zip(r2h, r1h + r3h, r1h, r3h);
-        r0l += r4l; r0h += r4h;
+        v_zip(r2l, v_add(r1l, r3l), r1l, r3l);
+        v_zip(r2h, v_add(r1h, r3h), r1h, r3h);
+        r0l = v_add(r0l, r4l); r0h = v_add(r0h, r4h);
 
-        v_store(row                      , v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r1l), v_6_4) + v_reinterpret_as_s32(v_expand_low( r0l))));
-        v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r3l), v_6_4) + v_reinterpret_as_s32(v_expand_high(r0l))));
-        v_store(row + 6*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r1h), v_6_4) + v_reinterpret_as_s32(v_expand_low( r0h))));
-        v_store(row + 9*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(r3h), v_6_4) + v_reinterpret_as_s32(v_expand_high(r0h))));
+        v_store(row                      , v_pack_triplets(v_add(v_dotprod(v_reinterpret_as_s16(r1l), v_6_4), v_reinterpret_as_s32(v_expand_low(r0l)))));
+        v_store(row + 3*VTraits<v_int32>::vlanes()/4, v_pack_triplets(v_add(v_dotprod(v_reinterpret_as_s16(r3l), v_6_4), v_reinterpret_as_s32(v_expand_high(r0l)))));
+        v_store(row + 6*VTraits<v_int32>::vlanes()/4, v_pack_triplets(v_add(v_dotprod(v_reinterpret_as_s16(r1h), v_6_4), v_reinterpret_as_s32(v_expand_low(r0h)))));
+        v_store(row + 9*VTraits<v_int32>::vlanes()/4, v_pack_triplets(v_add(v_dotprod(v_reinterpret_as_s16(r3h), v_6_4), v_reinterpret_as_s32(v_expand_high(r0h)))));
     }
     vx_cleanup();
 
@@ -156,10 +152,8 @@ template<> int PyrDownVecH<uchar, int, 4>(const uchar* src, int* row, int width)
 
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4) +
-                     v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4) +
-                     (v_reinterpret_as_s32(v_interleave_quads(vx_load_expand(src4))) >> 16));
+    for (; x <= width - VTraits<v_int32>::vlanes(); x += VTraits<v_int32>::vlanes(), src01 += VTraits<v_int16>::vlanes(), src23 += VTraits<v_int16>::vlanes(), src4 += VTraits<v_int16>::vlanes(), row += VTraits<v_int32>::vlanes())
+        v_store(row, v_add(v_add(v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src01))), v_1_4), v_dotprod(v_interleave_quads(v_reinterpret_as_s16(vx_load_expand(src23))), v_6_4)), v_shr<16>(v_reinterpret_as_s32(v_interleave_quads(vx_load_expand(src4))))));
     vx_cleanup();
 
     return x;
@@ -172,10 +166,8 @@ template<> int PyrDownVecH<short, int, 1>(const short* src, int* row, int width)
 
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(vx_load(src01), v_1_4) +
-                     v_dotprod(vx_load(src23), v_6_4) +
-                     (v_reinterpret_as_s32(vx_load(src4)) >> 16));
+    for (; x <= width - VTraits<v_int32>::vlanes(); x += VTraits<v_int32>::vlanes(), src01 += VTraits<v_int16>::vlanes(), src23 += VTraits<v_int16>::vlanes(), src4 += VTraits<v_int16>::vlanes(), row += VTraits<v_int32>::vlanes())
+        v_store(row, v_add(v_add(v_dotprod(vx_load(src01), v_1_4), v_dotprod(vx_load(src23), v_6_4)), v_shr<16>(v_reinterpret_as_s32(vx_load(src4)))));
     vx_cleanup();
 
     return x;
@@ -187,34 +179,32 @@ template<> int PyrDownVecH<short, int, 2>(const short* src, int* row, int width)
 
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_interleave_pairs(vx_load(src01)), v_1_4) +
-                     v_dotprod(v_interleave_pairs(vx_load(src23)), v_6_4) +
-                     (v_reinterpret_as_s32(v_interleave_pairs(vx_load(src4))) >> 16));
+    for (; x <= width - VTraits<v_int32>::vlanes(); x += VTraits<v_int32>::vlanes(), src01 += VTraits<v_int16>::vlanes(), src23 += VTraits<v_int16>::vlanes(), src4 += VTraits<v_int16>::vlanes(), row += VTraits<v_int32>::vlanes())
+        v_store(row, v_add(v_add(v_dotprod(v_interleave_pairs(vx_load(src01)), v_1_4), v_dotprod(v_interleave_pairs(vx_load(src23)), v_6_4)), v_shr<16>(v_reinterpret_as_s32(v_interleave_pairs(vx_load(src4))))));
     vx_cleanup();
 
     return x;
 }
 template<> int PyrDownVecH<short, int, 3>(const short* src, int* row, int width)
 {
-    int idx[v_int16::nlanes/2 + 4];
-    for (int i = 0; i < v_int16::nlanes/4 + 2; i++)
+    int idx[VTraits<v_int16>::max_nlanes/2 + 4];
+    for (int i = 0; i < VTraits<v_int16>::vlanes()/4 + 2; i++)
     {
         idx[i] = 6*i;
-        idx[i + v_int16::nlanes/4 + 2] = 6*i + 3;
+        idx[i + VTraits<v_int16>::vlanes()/4 + 2] = 6*i + 3;
     }
 
     int x = 0;
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int16::nlanes; x += 3*v_int16::nlanes/4, src += 6*v_int16::nlanes/4, row += 3*v_int16::nlanes/4)
+    for (; x <= width - VTraits<v_int16>::vlanes(); x += 3*VTraits<v_int16>::vlanes()/4, src += 6*VTraits<v_int16>::vlanes()/4, row += 3*VTraits<v_int16>::vlanes()/4)
     {
         v_int16 r0, r1, r2, r3, r4;
-        v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1);
-        v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3);
+        v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + VTraits<v_int16>::vlanes()/4 + 2), r0, r1);
+        v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + VTraits<v_int16>::vlanes()/4 + 3), r2, r3);
         r4 = vx_lut_quads(src, idx + 2);
-        v_store(row, v_pack_triplets(v_dotprod(r0, v_1_4) + v_dotprod(r2, v_6_4) + v_expand_low(r4)));
-        v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(r1, v_1_4) + v_dotprod(r3, v_6_4) + v_expand_high(r4)));
+        v_store(row, v_pack_triplets(v_add(v_add(v_dotprod(r0, v_1_4), v_dotprod(r2, v_6_4)), v_expand_low(r4))));
+        v_store(row + 3*VTraits<v_int32>::vlanes()/4, v_pack_triplets(v_add(v_add(v_dotprod(r1, v_1_4), v_dotprod(r3, v_6_4)), v_expand_high(r4))));
     }
     vx_cleanup();
 
@@ -222,24 +212,24 @@ template<> int PyrDownVecH<short, int, 3>(const short* src, int* row, int width)
 }
 template<> int PyrDownVecH<short, int, 4>(const short* src, int* row, int width)
 {
-    int idx[v_int16::nlanes/2 + 4];
-    for (int i = 0; i < v_int16::nlanes/4 + 2; i++)
+    int idx[VTraits<v_int16>::max_nlanes/2 + 4];
+    for (int i = 0; i < VTraits<v_int16>::vlanes()/4 + 2; i++)
     {
         idx[i] = 8*i;
-        idx[i + v_int16::nlanes/4 + 2] = 8*i + 4;
+        idx[i + VTraits<v_int16>::vlanes()/4 + 2] = 8*i + 4;
     }
 
     int x = 0;
     v_int16 v_1_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040001));
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
-    for (; x <= width - v_int16::nlanes; x += v_int16::nlanes, src += 2*v_int16::nlanes, row += v_int16::nlanes)
+    for (; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes(), src += 2*VTraits<v_int16>::vlanes(), row += VTraits<v_int16>::vlanes())
     {
         v_int16 r0, r1, r2, r3, r4;
-        v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1);
-        v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3);
+        v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + VTraits<v_int16>::vlanes()/4 + 2), r0, r1);
+        v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + VTraits<v_int16>::vlanes()/4 + 3), r2, r3);
         r4 = vx_lut_quads(src, idx + 2);
-        v_store(row, v_dotprod(r0, v_1_4) + v_dotprod(r2, v_6_4) + v_expand_low(r4));
-        v_store(row + v_int32::nlanes, v_dotprod(r1, v_1_4) + v_dotprod(r3, v_6_4) + v_expand_high(r4));
+        v_store(row, v_add(v_add(v_dotprod(r0, v_1_4), v_dotprod(r2, v_6_4)), v_expand_low(r4)));
+        v_store(row + VTraits<v_int32>::vlanes(), v_add(v_add(v_dotprod(r1, v_1_4), v_dotprod(r3, v_6_4)), v_expand_high(r4)));
     }
     vx_cleanup();
 
@@ -255,10 +245,8 @@ template<> int PyrDownVecH<ushort, int, 1>(const ushort* src, int* row, int widt
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
     v_uint16 v_half = vx_setall_u16(0x8000);
     v_int32 v_half15 = vx_setall_s32(0x00078000);
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half)), v_1_4) +
-                     v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half)), v_6_4) +
-                     v_reinterpret_as_s32(v_reinterpret_as_u32(vx_load(src4)) >> 16) + v_half15);
+    for (; x <= width - VTraits<v_int32>::vlanes(); x += VTraits<v_int32>::vlanes(), src01 += VTraits<v_int16>::vlanes(), src23 += VTraits<v_int16>::vlanes(), src4 += VTraits<v_int16>::vlanes(), row += VTraits<v_int32>::vlanes())
+        v_store(row, v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half)), v_6_4)), v_reinterpret_as_s32(v_shr<16>(v_reinterpret_as_u32(vx_load(src4))))), v_half15));
     vx_cleanup();
 
     return x;
@@ -272,21 +260,19 @@ template<> int PyrDownVecH<ushort, int, 2>(const ushort* src, int* row, int widt
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
     v_uint16 v_half = vx_setall_u16(0x8000);
     v_int32 v_half15 = vx_setall_s32(0x00078000);
-    for (; x <= width - v_int32::nlanes; x += v_int32::nlanes, src01 += v_int16::nlanes, src23 += v_int16::nlanes, src4 += v_int16::nlanes, row += v_int32::nlanes)
-        v_store(row, v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half))), v_1_4) +
-                     v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half))), v_6_4) +
-                     v_reinterpret_as_s32(v_reinterpret_as_u32(v_interleave_pairs(vx_load(src4))) >> 16) + v_half15);
+    for (; x <= width - VTraits<v_int32>::vlanes(); x += VTraits<v_int32>::vlanes(), src01 += VTraits<v_int16>::vlanes(), src23 += VTraits<v_int16>::vlanes(), src4 += VTraits<v_int16>::vlanes(), row += VTraits<v_int32>::vlanes())
+        v_store(row, v_add(v_add(v_add(v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src01), v_half))), v_1_4), v_dotprod(v_interleave_pairs(v_reinterpret_as_s16(v_sub_wrap(vx_load(src23), v_half))), v_6_4)), v_reinterpret_as_s32(v_shr<16>(v_reinterpret_as_u32(v_interleave_pairs(vx_load(src4)))))), v_half15));
     vx_cleanup();
 
     return x;
 }
 template<> int PyrDownVecH<ushort, int, 3>(const ushort* src, int* row, int width)
 {
-    int idx[v_int16::nlanes/2 + 4];
-    for (int i = 0; i < v_int16::nlanes/4 + 2; i++)
+    int idx[VTraits<v_int16>::max_nlanes/2 + 4];
+    for (int i = 0; i < VTraits<v_int16>::vlanes()/4 + 2; i++)
     {
         idx[i] = 6*i;
-        idx[i + v_int16::nlanes/4 + 2] = 6*i + 3;
+        idx[i + VTraits<v_int16>::vlanes()/4 + 2] = 6*i + 3;
     }
 
     int x = 0;
@@ -294,18 +280,14 @@ template<> int PyrDownVecH<ushort, int, 3>(const ushort* src, int* row, int widt
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
     v_uint16 v_half = vx_setall_u16(0x8000);
     v_int32 v_half15 = vx_setall_s32(0x00078000);
-    for (; x <= width - v_int16::nlanes; x += 3*v_int16::nlanes/4, src += 6*v_int16::nlanes/4, row += 3*v_int16::nlanes/4)
+    for (; x <= width - VTraits<v_int16>::vlanes(); x += 3*VTraits<v_int16>::vlanes()/4, src += 6*VTraits<v_int16>::vlanes()/4, row += 3*VTraits<v_int16>::vlanes()/4)
     {
         v_uint16 r0, r1, r2, r3, r4;
-        v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1);
-        v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3);
+        v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + VTraits<v_int16>::vlanes()/4 + 2), r0, r1);
+        v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + VTraits<v_int16>::vlanes()/4 + 3), r2, r3);
         r4 = vx_lut_quads(src, idx + 2);
-        v_store(row                      , v_pack_triplets(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4) +
-                                                           v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4) +
-                                                           v_reinterpret_as_s32(v_expand_low(r4)) + v_half15));
-        v_store(row + 3*v_int32::nlanes/4, v_pack_triplets(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4) +
-                                                           v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4) +
-                                                           v_reinterpret_as_s32(v_expand_high(r4)) + v_half15));
+        v_store(row                      , v_pack_triplets(v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4)), v_reinterpret_as_s32(v_expand_low(r4))), v_half15)));
+        v_store(row + 3*VTraits<v_int32>::vlanes()/4, v_pack_triplets(v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4)), v_reinterpret_as_s32(v_expand_high(r4))), v_half15)));
     }
     vx_cleanup();
 
@@ -313,11 +295,11 @@ template<> int PyrDownVecH<ushort, int, 3>(const ushort* src, int* row, int widt
 }
 template<> int PyrDownVecH<ushort, int, 4>(const ushort* src, int* row, int width)
 {
-    int idx[v_int16::nlanes/2 + 4];
-    for (int i = 0; i < v_int16::nlanes/4 + 2; i++)
+    int idx[VTraits<v_int16>::max_nlanes/2 + 4];
+    for (int i = 0; i < VTraits<v_int16>::vlanes()/4 + 2; i++)
     {
         idx[i] = 8*i;
-        idx[i + v_int16::nlanes/4 + 2] = 8*i + 4;
+        idx[i + VTraits<v_int16>::vlanes()/4 + 2] = 8*i + 4;
     }
 
     int x = 0;
@@ -325,18 +307,14 @@ template<> int PyrDownVecH<ushort, int, 4>(const ushort* src, int* row, int widt
     v_int16 v_6_4 = v_reinterpret_as_s16(vx_setall_u32(0x00040006));
     v_uint16 v_half = vx_setall_u16(0x8000);
     v_int32 v_half15 = vx_setall_s32(0x00078000);
-    for (; x <= width - v_int16::nlanes; x += v_int16::nlanes, src += 2*v_int16::nlanes, row += v_int16::nlanes)
+    for (; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes(), src += 2*VTraits<v_int16>::vlanes(), row += VTraits<v_int16>::vlanes())
     {
         v_uint16 r0, r1, r2, r3, r4;
-        v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + v_int16::nlanes/4 + 2), r0, r1);
-        v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + v_int16::nlanes/4 + 3), r2, r3);
+        v_zip(vx_lut_quads(src, idx), vx_lut_quads(src, idx + VTraits<v_int16>::vlanes()/4 + 2), r0, r1);
+        v_zip(vx_lut_quads(src, idx + 1), vx_lut_quads(src, idx + VTraits<v_int16>::vlanes()/4 + 3), r2, r3);
         r4 = vx_lut_quads(src, idx + 2);
-        v_store(row                  , v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4) +
-                                       v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4) +
-                                       v_reinterpret_as_s32(v_expand_low(r4)) + v_half15);
-        v_store(row + v_int32::nlanes, v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4) +
-                                       v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4) +
-                                       v_reinterpret_as_s32(v_expand_high(r4)) + v_half15);
+        v_store(row                  , v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r0, v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r2, v_half)), v_6_4)), v_reinterpret_as_s32(v_expand_low(r4))), v_half15));
+        v_store(row + VTraits<v_int32>::vlanes(), v_add(v_add(v_add(v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r1, v_half)), v_1_4), v_dotprod(v_reinterpret_as_s16(v_sub_wrap(r3, v_half)), v_6_4)), v_reinterpret_as_s32(v_expand_high(r4))), v_half15));
     }
     vx_cleanup();
 
@@ -349,13 +327,13 @@ template<> int PyrDownVecH<float, float, 1>(const float* src, float* row, int wi
     const float *src01 = src, *src23 = src + 2, *src4 = src + 3;
 
     v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f);
-    for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src01 += 2*v_float32::nlanes, src23 += 2*v_float32::nlanes, src4 += 2*v_float32::nlanes, row+=v_float32::nlanes)
+    for (; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes(), src01 += 2*VTraits<v_float32>::vlanes(), src23 += 2*VTraits<v_float32>::vlanes(), src4 += 2*VTraits<v_float32>::vlanes(), row+=VTraits<v_float32>::vlanes())
     {
         v_float32 r0, r1, r2, r3, r4, rtmp;
         v_load_deinterleave(src01, r0, r1);
         v_load_deinterleave(src23, r2, r3);
         v_load_deinterleave(src4, rtmp, r4);
-        v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)));
+        v_store(row, v_muladd(r2, _6, v_muladd(v_add(r1, r3), _4, v_add(r0, r4))));
     }
     vx_cleanup();
 
@@ -367,13 +345,13 @@ template<> int PyrDownVecH<float, float, 2>(const float* src, float* row, int wi
     const float *src01 = src, *src23 = src + 4, *src4 = src + 6;
 
     v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f);
-    for (; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, src01 += 4*v_float32::nlanes, src23 += 4*v_float32::nlanes, src4 += 4*v_float32::nlanes, row += 2*v_float32::nlanes)
+    for (; x <= width - 2*VTraits<v_float32>::vlanes(); x += 2*VTraits<v_float32>::vlanes(), src01 += 4*VTraits<v_float32>::vlanes(), src23 += 4*VTraits<v_float32>::vlanes(), src4 += 4*VTraits<v_float32>::vlanes(), row += 2*VTraits<v_float32>::vlanes())
     {
         v_float32 r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b, r4a, r4b, rtmpa, rtmpb;
         v_load_deinterleave(src01, r0a, r0b, r1a, r1b);
         v_load_deinterleave(src23, r2a, r2b, r3a, r3b);
         v_load_deinterleave(src4, rtmpa, rtmpb, r4a, r4b);
-        v_store_interleave(row, v_muladd(r2a, _6, v_muladd(r1a + r3a, _4, r0a + r4a)), v_muladd(r2b, _6, v_muladd(r1b + r3b, _4, r0b + r4b)));
+        v_store_interleave(row, v_muladd(r2a, _6, v_muladd(v_add(r1a, r3a), _4, v_add(r0a, r4a))), v_muladd(r2b, _6, v_muladd(v_add(r1b, r3b), _4, v_add(r0b, r4b))));
     }
     vx_cleanup();
 
@@ -381,23 +359,23 @@ template<> int PyrDownVecH<float, float, 2>(const float* src, float* row, int wi
 }
 template<> int PyrDownVecH<float, float, 3>(const float* src, float* row, int width)
 {
-    int idx[v_float32::nlanes/2 + 4];
-    for (int i = 0; i < v_float32::nlanes/4 + 2; i++)
+    int idx[VTraits<v_float32>::max_nlanes/2 + 4];
+    for (int i = 0; i < VTraits<v_float32>::vlanes()/4 + 2; i++)
     {
         idx[i] = 6*i;
-        idx[i + v_float32::nlanes/4 + 2] = 6*i + 3;
+        idx[i + VTraits<v_float32>::vlanes()/4 + 2] = 6*i + 3;
     }
 
     int x = 0;
     v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f);
-    for (; x <= width - v_float32::nlanes; x += 3*v_float32::nlanes/4, src += 6*v_float32::nlanes/4, row += 3*v_float32::nlanes/4)
+    for (; x <= width - VTraits<v_float32>::vlanes(); x += 3*VTraits<v_float32>::vlanes()/4, src += 6*VTraits<v_float32>::vlanes()/4, row += 3*VTraits<v_float32>::vlanes()/4)
     {
         v_float32 r0 = vx_lut_quads(src, idx);
-        v_float32 r1 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 2);
+        v_float32 r1 = vx_lut_quads(src, idx + VTraits<v_float32>::vlanes()/4 + 2);
         v_float32 r2 = vx_lut_quads(src, idx + 1);
-        v_float32 r3 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 3);
+        v_float32 r3 = vx_lut_quads(src, idx + VTraits<v_float32>::vlanes()/4 + 3);
         v_float32 r4 = vx_lut_quads(src, idx + 2);
-        v_store(row, v_pack_triplets(v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4))));
+        v_store(row, v_pack_triplets(v_muladd(r2, _6, v_muladd(v_add(r1, r3), _4, v_add(r0, r4)))));
     }
     vx_cleanup();
 
@@ -405,43 +383,43 @@ template<> int PyrDownVecH<float, float, 3>(const float* src, float* row, int wi
 }
 template<> int PyrDownVecH<float, float, 4>(const float* src, float* row, int width)
 {
-    int idx[v_float32::nlanes/2 + 4];
-    for (int i = 0; i < v_float32::nlanes/4 + 2; i++)
+    int idx[VTraits<v_float32>::max_nlanes/2 + 4];
+    for (int i = 0; i < VTraits<v_float32>::vlanes()/4 + 2; i++)
     {
         idx[i] = 8*i;
-        idx[i + v_float32::nlanes/4 + 2] = 8*i + 4;
+        idx[i + VTraits<v_float32>::vlanes()/4 + 2] = 8*i + 4;
     }
 
     int x = 0;
     v_float32 _4 = vx_setall_f32(4.f), _6 = vx_setall_f32(6.f);
-    for (; x <= width - v_float32::nlanes; x += v_float32::nlanes, src += 2*v_float32::nlanes, row += v_float32::nlanes)
+    for (; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes(), src += 2*VTraits<v_float32>::vlanes(), row += VTraits<v_float32>::vlanes())
     {
         v_float32 r0 = vx_lut_quads(src, idx);
-        v_float32 r1 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 2);
+        v_float32 r1 = vx_lut_quads(src, idx + VTraits<v_float32>::vlanes()/4 + 2);
         v_float32 r2 = vx_lut_quads(src, idx + 1);
-        v_float32 r3 = vx_lut_quads(src, idx + v_float32::nlanes/4 + 3);
+        v_float32 r3 = vx_lut_quads(src, idx + VTraits<v_float32>::vlanes()/4 + 3);
         v_float32 r4 = vx_lut_quads(src, idx + 2);
-        v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)));
+        v_store(row, v_muladd(r2, _6, v_muladd(v_add(r1, r3), _4, v_add(r0, r4))));
     }
     vx_cleanup();
 
     return x;
 }
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template<> int PyrDownVecH<double, double, 1>(const double* src, double* row, int width)
 {
     int x = 0;
     const double *src01 = src, *src23 = src + 2, *src4 = src + 3;
 
     v_float64 _4 = vx_setall_f64(4.f), _6 = vx_setall_f64(6.f);
-    for (; x <= width - v_float64::nlanes; x += v_float64::nlanes, src01 += 2*v_float64::nlanes, src23 += 2*v_float64::nlanes, src4 += 2*v_float64::nlanes, row += v_float64::nlanes)
+    for (; x <= width - VTraits<v_float64>::vlanes(); x += VTraits<v_float64>::vlanes(), src01 += 2*VTraits<v_float64>::vlanes(), src23 += 2*VTraits<v_float64>::vlanes(), src4 += 2*VTraits<v_float64>::vlanes(), row += VTraits<v_float64>::vlanes())
     {
         v_float64 r0, r1, r2, r3, r4, rtmp;
         v_load_deinterleave(src01, r0, r1);
         v_load_deinterleave(src23, r2, r3);
         v_load_deinterleave(src4, rtmp, r4);
-        v_store(row, v_muladd(r2, _6, v_muladd(r1 + r3, _4, r0 + r4)));
+        v_store(row, v_muladd(r2, _6, v_muladd(v_add(r1, r3), _4, v_add(r0, r4))));
     }
     vx_cleanup();
 
@@ -454,37 +432,38 @@ template<> int PyrDownVecV<int, uchar>(int** src, uchar* dst, int width)
     int x = 0;
     const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
 
-    for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes )
+    for( ; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
     {
         v_uint16 r0, r1, r2, r3, r4, t0, t1;
-        r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)));
-        r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)));
-        r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)));
-        r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes)));
-        r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes)));
-        t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
-        r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x + 2*v_int32::nlanes), vx_load(row0 + x + 3*v_int32::nlanes)));
-        r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*v_int32::nlanes), vx_load(row1 + x + 3*v_int32::nlanes)));
-        r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*v_int32::nlanes), vx_load(row2 + x + 3*v_int32::nlanes)));
-        r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*v_int32::nlanes), vx_load(row3 + x + 3*v_int32::nlanes)));
-        r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*v_int32::nlanes), vx_load(row4 + x + 3*v_int32::nlanes)));
-        t1 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+        r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits<v_int32>::vlanes())));
+        r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits<v_int32>::vlanes())));
+        r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits<v_int32>::vlanes())));
+        r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + VTraits<v_int32>::vlanes())));
+        r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + VTraits<v_int32>::vlanes())));
+        t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2)));
+        r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x + 2*VTraits<v_int32>::vlanes()), vx_load(row0 + x + 3*VTraits<v_int32>::vlanes())));
+        r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*VTraits<v_int32>::vlanes()), vx_load(row1 + x + 3*VTraits<v_int32>::vlanes())));
+        r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*VTraits<v_int32>::vlanes()), vx_load(row2 + x + 3*VTraits<v_int32>::vlanes())));
+        r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*VTraits<v_int32>::vlanes()), vx_load(row3 + x + 3*VTraits<v_int32>::vlanes())));
+        r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*VTraits<v_int32>::vlanes()), vx_load(row4 + x + 3*VTraits<v_int32>::vlanes())));
+        t1 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2)));
         v_store(dst + x, v_rshr_pack<8>(t0, t1));
     }
-    if (x <= width - v_int16::nlanes)
+    if (x <= width - VTraits<v_int16>::vlanes())
     {
         v_uint16 r0, r1, r2, r3, r4, t0;
-        r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)));
-        r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)));
-        r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)));
-        r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes)));
-        r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes)));
-        t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+        r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits<v_int32>::vlanes())));
+        r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits<v_int32>::vlanes())));
+        r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits<v_int32>::vlanes())));
+        r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + VTraits<v_int32>::vlanes())));
+        r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + VTraits<v_int32>::vlanes())));
+        t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2)));
         v_rshr_pack_store<8>(dst + x, t0);
-        x += v_uint16::nlanes;
+        x += VTraits<v_uint16>::vlanes();
     }
+    #if CV_SIMD128
     typedef int CV_DECL_ALIGNED(1) unaligned_int;
-    for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+    for ( ; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
     {
         v_int32x4 r0, r1, r2, r3, r4, t0;
         r0 = v_load(row0 + x);
@@ -492,10 +471,23 @@ template<> int PyrDownVecV<int, uchar>(int** src, uchar* dst, int width)
         r2 = v_load(row2 + x);
         r3 = v_load(row3 + x);
         r4 = v_load(row4 + x);
-        t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+        t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2)));
 
-        *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0();
+        *((unaligned_int*) (dst + x)) = v_get0(v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())));
     }
+    #else
+    for (; x <= width - 1; x += 1)
+    {
+        int r0 = *(row0 + x);
+        int r1 = *(row1 + x);
+        int r2 = *(row2 + x);
+        int r3 = *(row3 + x);
+        int r4 = *(row4 + x);
+        int t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+        // Similar to v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16()).get0()
+        *(dst + x) = (int)((((unsigned int)t0) + ((1 << (8 - 1)))) >> 8);
+    }
+    #endif //CV_SIMD128
     vx_cleanup();
 
     return x;
@@ -508,7 +500,7 @@ int PyrDownVecV<float, float>(float** src, float* dst, int width)
     const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
 
     v_float32 _4 = vx_setall_f32(4.f), _scale = vx_setall_f32(1.f/256);
-    for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
+    for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
     {
         v_float32 r0, r1, r2, r3, r4;
         r0 = vx_load(row0 + x);
@@ -516,7 +508,7 @@ int PyrDownVecV<float, float>(float** src, float* dst, int width)
         r2 = vx_load(row2 + x);
         r3 = vx_load(row3 + x);
         r4 = vx_load(row4 + x);
-        v_store(dst + x, v_muladd(r1 + r3 + r2, _4, r0 + r4 + (r2 + r2)) * _scale);
+        v_store(dst + x, v_mul(v_muladd(v_add(v_add(r1, r3), r2), _4, v_add(v_add(r0, r4), v_add(r2, r2))), _scale));
     }
     vx_cleanup();
 
@@ -528,30 +520,30 @@ template <> int PyrDownVecV<int, ushort>(int** src, ushort* dst, int width)
     int x = 0;
     const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
 
-    for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
+    for( ; x <= width - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
     {
         v_int32 r00 = vx_load(row0 + x),
-                r01 = vx_load(row0 + x + v_int32::nlanes),
+                r01 = vx_load(row0 + x + VTraits<v_int32>::vlanes()),
                 r10 = vx_load(row1 + x),
-                r11 = vx_load(row1 + x + v_int32::nlanes),
+                r11 = vx_load(row1 + x + VTraits<v_int32>::vlanes()),
                 r20 = vx_load(row2 + x),
-                r21 = vx_load(row2 + x + v_int32::nlanes),
+                r21 = vx_load(row2 + x + VTraits<v_int32>::vlanes()),
                 r30 = vx_load(row3 + x),
-                r31 = vx_load(row3 + x + v_int32::nlanes),
+                r31 = vx_load(row3 + x + VTraits<v_int32>::vlanes()),
                 r40 = vx_load(row4 + x),
-                r41 = vx_load(row4 + x + v_int32::nlanes);
-        v_store(dst + x, v_rshr_pack_u<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2),
-                                            r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2)));
+                r41 = vx_load(row4 + x + VTraits<v_int32>::vlanes());
+        v_store(dst + x, v_rshr_pack_u<8>(v_add(v_add(v_add(r00, r40), v_add(r20, r20)), v_shl<2>(v_add(v_add(r10, r20), r30))),
+                                            v_add(v_add(v_add(r01, r41), v_add(r21, r21)), v_shl<2>(v_add(v_add(r11, r21), r31)))));
     }
-    if (x <= width - v_int32::nlanes)
+    if (x <= width - VTraits<v_int32>::vlanes())
     {
         v_int32 r00 = vx_load(row0 + x),
                 r10 = vx_load(row1 + x),
                 r20 = vx_load(row2 + x),
                 r30 = vx_load(row3 + x),
                 r40 = vx_load(row4 + x);
-        v_rshr_pack_u_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2));
-        x += v_int32::nlanes;
+        v_rshr_pack_u_store<8>(dst + x, v_add(v_add(v_add(r00, r40), v_add(r20, r20)), v_shl<2>(v_add(v_add(r10, r20), r30))));
+        x += VTraits<v_int32>::vlanes();
     }
     vx_cleanup();
 
@@ -563,30 +555,30 @@ template <> int PyrDownVecV<int, short>(int** src, short* dst, int width)
     int x = 0;
     const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
 
-    for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
+    for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
     {
         v_int32 r00 = vx_load(row0 + x),
-                r01 = vx_load(row0 + x + v_int32::nlanes),
+                r01 = vx_load(row0 + x + VTraits<v_int32>::vlanes()),
                 r10 = vx_load(row1 + x),
-                r11 = vx_load(row1 + x + v_int32::nlanes),
+                r11 = vx_load(row1 + x + VTraits<v_int32>::vlanes()),
                 r20 = vx_load(row2 + x),
-                r21 = vx_load(row2 + x + v_int32::nlanes),
+                r21 = vx_load(row2 + x + VTraits<v_int32>::vlanes()),
                 r30 = vx_load(row3 + x),
-                r31 = vx_load(row3 + x + v_int32::nlanes),
+                r31 = vx_load(row3 + x + VTraits<v_int32>::vlanes()),
                 r40 = vx_load(row4 + x),
-                r41 = vx_load(row4 + x + v_int32::nlanes);
-        v_store(dst + x, v_rshr_pack<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2),
-                                        r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2)));
+                r41 = vx_load(row4 + x + VTraits<v_int32>::vlanes());
+        v_store(dst + x, v_rshr_pack<8>(v_add(v_add(v_add(r00, r40), v_add(r20, r20)), v_shl<2>(v_add(v_add(r10, r20), r30))),
+                                        v_add(v_add(v_add(r01, r41), v_add(r21, r21)), v_shl<2>(v_add(v_add(r11, r21), r31)))));
     }
-    if (x <= width - v_int32::nlanes)
+    if (x <= width - VTraits<v_int32>::vlanes())
     {
         v_int32 r00 = vx_load(row0 + x),
             r10 = vx_load(row1 + x),
             r20 = vx_load(row2 + x),
             r30 = vx_load(row3 + x),
             r40 = vx_load(row4 + x);
-        v_rshr_pack_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2));
-        x += v_int32::nlanes;
+        v_rshr_pack_store<8>(dst + x, v_add(v_add(v_add(r00, r40), v_add(r20, r20)), v_shl<2>(v_add(v_add(r10, r20), r30))));
+        x += VTraits<v_int32>::vlanes();
     }
     vx_cleanup();
 
@@ -599,39 +591,55 @@ template <> int PyrUpVecV<int, uchar>(int** src, uchar** dst, int width)
     uchar *dst0 = dst[0], *dst1 = dst[1];
     const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
 
-    for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
+    for( ; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes())
     {
-        v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)),
-                v_r01 = v_pack(vx_load(row0 + x + 2 * v_int32::nlanes), vx_load(row0 + x + 3 * v_int32::nlanes)),
-                v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)),
-                v_r11 = v_pack(vx_load(row1 + x + 2 * v_int32::nlanes), vx_load(row1 + x + 3 * v_int32::nlanes)),
-                v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)),
-                v_r21 = v_pack(vx_load(row2 + x + 2 * v_int32::nlanes), vx_load(row2 + x + 3 * v_int32::nlanes));
-        v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11);
-        v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11)));
-        v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2));
+        v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits<v_int32>::vlanes())),
+                v_r01 = v_pack(vx_load(row0 + x + 2 * VTraits<v_int32>::vlanes()), vx_load(row0 + x + 3 * VTraits<v_int32>::vlanes())),
+                v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits<v_int32>::vlanes())),
+                v_r11 = v_pack(vx_load(row1 + x + 2 * VTraits<v_int32>::vlanes()), vx_load(row1 + x + 3 * VTraits<v_int32>::vlanes())),
+                v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits<v_int32>::vlanes())),
+                v_r21 = v_pack(vx_load(row2 + x + 2 * VTraits<v_int32>::vlanes()), vx_load(row2 + x + 3 * VTraits<v_int32>::vlanes()));
+        v_int16 v_2r10 = v_add(v_r10, v_r10), v_2r11 = (v_add(v_r11, v_r11));
+        v_store(dst0 + x, v_rshr_pack_u<6>(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_add(v_add(v_r01, v_r21), v_add(v_add(v_2r11, v_2r11), v_2r11))));
+        v_store(dst1 + x, v_rshr_pack_u<6>(v_shl<2>(v_add(v_r10, v_r20)), v_shl<2>(v_add(v_r11, v_r21))));
     }
-    if(x <= width - v_uint16::nlanes)
+    if(x <= width - VTraits<v_uint16>::vlanes())
     {
-        v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)),
-                v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)),
-                v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes));
-        v_int16 v_2r10 = v_r10 + v_r10;
-        v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10));
-        v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2);
-        x += v_uint16::nlanes;
+        v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits<v_int32>::vlanes())),
+                v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits<v_int32>::vlanes())),
+                v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits<v_int32>::vlanes()));
+        v_int16 v_2r10 = v_add(v_r10, v_r10);
+        v_rshr_pack_u_store<6>(dst0 + x, v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)));
+        v_rshr_pack_u_store<6>(dst1 + x, v_shl<2>(v_add(v_r10, v_r20)));
+        x += VTraits<v_uint16>::vlanes();
     }
+    #if CV_SIMD128
     typedef int CV_DECL_ALIGNED(1) unaligned_int;
-    for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+    for (; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x),
                 v_r10 = vx_load(row1 + x),
                 v_r20 = vx_load(row2 + x);
-        v_int32 v_2r10 = v_r10 + v_r10;
-        v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2);
-        *(unaligned_int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0();
-        *(unaligned_int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0();
+        v_int32 v_2r10 = v_add(v_r10, v_r10);
+        v_int16 d = v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20)));
+        *(unaligned_int*)(dst0 + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())));
+        *(unaligned_int*)(dst1 + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())));
+    }
+    #else
+    for (; x <= width - 1; x += 1)
+    {
+        int r00 = *(row0 + x),
+            r10 = *(row1 + x),
+            r20 = *(row2 + x);
+        int _2r10 = r10 + r10;
+        int d = r00 + r20 + (_2r10 + _2r10 + _2r10);
+        int d_shifted = (r10 + r20) << 2;
+        // Similar to v_rshr_pack_u<6>(d, vx_setzero_s16()).get0()
+        *(dst0 + x) = (int)((((unsigned int)d) + ((1 << (6 - 1)))) >> 6);
+        // Similar to v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16()).get0()
+        *(dst1 + x) = (int)((((unsigned int)d_shifted) + ((1 << (6 - 1)))) >> 6);
     }
+    #endif //CV_SIMD128
     vx_cleanup();
 
     return x;
@@ -643,25 +651,25 @@ template <> int PyrUpVecV<int, short>(int** src, short** dst, int width)
     short *dst0 = dst[0], *dst1 = dst[1];
     const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
 
-    for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
+    for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x),
-                v_r01 = vx_load(row0 + x + v_int32::nlanes),
+                v_r01 = vx_load(row0 + x + VTraits<v_int32>::vlanes()),
                 v_r10 = vx_load(row1 + x),
-                v_r11 = vx_load(row1 + x + v_int32::nlanes),
+                v_r11 = vx_load(row1 + x + VTraits<v_int32>::vlanes()),
                 v_r20 = vx_load(row2 + x),
-                v_r21 = vx_load(row2 + x + v_int32::nlanes);
-        v_store(dst0 + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2))));
-        v_store(dst1 + x, v_rshr_pack<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2));
+                v_r21 = vx_load(row2 + x + VTraits<v_int32>::vlanes());
+        v_store(dst0 + x, v_rshr_pack<6>(v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))), v_add(v_add(v_r01, v_r21), v_add(v_shl<1>(v_r11), v_shl<2>(v_r11)))));
+        v_store(dst1 + x, v_rshr_pack<6>(v_shl<2>(v_add(v_r10, v_r20)), v_shl<2>(v_add(v_r11, v_r21))));
     }
-    if(x <= width - v_int32::nlanes)
+    if(x <= width - VTraits<v_int32>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x),
                 v_r10 = vx_load(row1 + x),
                 v_r20 = vx_load(row2 + x);
-        v_rshr_pack_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)));
-        v_rshr_pack_store<6>(dst1 + x, (v_r10 + v_r20) << 2);
-        x += v_int32::nlanes;
+        v_rshr_pack_store<6>(dst0 + x, v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))));
+        v_rshr_pack_store<6>(dst1 + x, v_shl<2>(v_add(v_r10, v_r20)));
+        x += VTraits<v_int32>::vlanes();
     }
     vx_cleanup();
 
@@ -674,25 +682,25 @@ template <> int PyrUpVecV<int, ushort>(int** src, ushort** dst, int width)
     ushort *dst0 = dst[0], *dst1 = dst[1];
     const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
 
-    for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
+    for( ; x <= width - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x),
-                v_r01 = vx_load(row0 + x + v_int32::nlanes),
+                v_r01 = vx_load(row0 + x + VTraits<v_int32>::vlanes()),
                 v_r10 = vx_load(row1 + x),
-                v_r11 = vx_load(row1 + x + v_int32::nlanes),
+                v_r11 = vx_load(row1 + x + VTraits<v_int32>::vlanes()),
                 v_r20 = vx_load(row2 + x),
-                v_r21 = vx_load(row2 + x + v_int32::nlanes);
-        v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2))));
-        v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2));
+                v_r21 = vx_load(row2 + x + VTraits<v_int32>::vlanes());
+        v_store(dst0 + x, v_rshr_pack_u<6>(v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))), v_add(v_add(v_r01, v_r21), v_add(v_shl<1>(v_r11), v_shl<2>(v_r11)))));
+        v_store(dst1 + x, v_rshr_pack_u<6>(v_shl<2>(v_add(v_r10, v_r20)), v_shl<2>(v_add(v_r11, v_r21))));
     }
-    if(x <= width - v_int32::nlanes)
+    if(x <= width - VTraits<v_int32>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x),
                 v_r10 = vx_load(row1 + x),
                 v_r20 = vx_load(row2 + x);
-        v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)));
-        v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2);
-        x += v_int32::nlanes;
+        v_rshr_pack_u_store<6>(dst0 + x, v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))));
+        v_rshr_pack_u_store<6>(dst1 + x, v_shl<2>(v_add(v_r10, v_r20)));
+        x += VTraits<v_int32>::vlanes();
     }
     vx_cleanup();
 
@@ -706,13 +714,13 @@ template <> int PyrUpVecV<float, float>(float** src, float** dst, int width)
     float *dst0 = dst[0], *dst1 = dst[1];
 
     v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f), v_scale4 = vx_setall_f32(1.f/16.f);
-    for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
+    for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
     {
         v_float32 v_r0 = vx_load(row0 + x),
                   v_r1 = vx_load(row1 + x),
                   v_r2 = vx_load(row2 + x);
-        v_store(dst1 + x, v_scale4 * (v_r1 + v_r2));
-        v_store(dst0 + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2));
+        v_store(dst1 + x, v_mul(v_scale4, v_add(v_r1, v_r2)));
+        v_store(dst0 + x, v_mul(v_scale, v_add(v_muladd(v_6, v_r1, v_r0), v_r2)));
     }
     vx_cleanup();
 
@@ -724,36 +732,50 @@ template <> int PyrUpVecVOneRow<int, uchar>(int** src, uchar* dst, int width)
     int x = 0;
     const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
 
-    for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
+    for( ; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes())
     {
-        v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)),
-                v_r01 = v_pack(vx_load(row0 + x + 2 * v_int32::nlanes), vx_load(row0 + x + 3 * v_int32::nlanes)),
-                v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)),
-                v_r11 = v_pack(vx_load(row1 + x + 2 * v_int32::nlanes), vx_load(row1 + x + 3 * v_int32::nlanes)),
-                v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)),
-                v_r21 = v_pack(vx_load(row2 + x + 2 * v_int32::nlanes), vx_load(row2 + x + 3 * v_int32::nlanes));
-        v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11);
-        v_store(dst + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11)));
+        v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits<v_int32>::vlanes())),
+                v_r01 = v_pack(vx_load(row0 + x + 2 * VTraits<v_int32>::vlanes()), vx_load(row0 + x + 3 * VTraits<v_int32>::vlanes())),
+                v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits<v_int32>::vlanes())),
+                v_r11 = v_pack(vx_load(row1 + x + 2 * VTraits<v_int32>::vlanes()), vx_load(row1 + x + 3 * VTraits<v_int32>::vlanes())),
+                v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits<v_int32>::vlanes())),
+                v_r21 = v_pack(vx_load(row2 + x + 2 * VTraits<v_int32>::vlanes()), vx_load(row2 + x + 3 * VTraits<v_int32>::vlanes()));
+        v_int16 v_2r10 = v_add(v_r10, v_r10), v_2r11 = (v_add(v_r11, v_r11));
+        v_store(dst + x, v_rshr_pack_u<6>(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_add(v_add(v_r01, v_r21), v_add(v_add(v_2r11, v_2r11), v_2r11))));
     }
-    if(x <= width - v_uint16::nlanes)
+    if(x <= width - VTraits<v_uint16>::vlanes())
     {
-        v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)),
-                v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)),
-                v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes));
-        v_int16 v_2r10 = v_r10 + v_r10;
-        v_rshr_pack_u_store<6>(dst + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10));
-        x += v_uint16::nlanes;
+        v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + VTraits<v_int32>::vlanes())),
+                v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + VTraits<v_int32>::vlanes())),
+                v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + VTraits<v_int32>::vlanes()));
+        v_int16 v_2r10 = v_add(v_r10, v_r10);
+        v_rshr_pack_u_store<6>(dst + x, v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)));
+        x += VTraits<v_uint16>::vlanes();
     }
+    #if CV_SIMD128
     typedef int CV_DECL_ALIGNED(1) unaligned_int;
-    for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+    for (; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x),
                 v_r10 = vx_load(row1 + x),
                 v_r20 = vx_load(row2 + x);
-        v_int32 v_2r10 = v_r10 + v_r10;
-        v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2);
-        *(unaligned_int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0();
+        v_int32 v_2r10 = v_add(v_r10, v_r10);
+        v_int16 d = v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20)));
+        *(unaligned_int*)(dst + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())));
+    }
+    #else
+    for (; x <= width - 1; x += 1)
+    {
+        int r00 = *(row0 + x),
+            r10 = *(row1 + x),
+            r20 = *(row2 + x);
+        int _2r10 = r10 + r10;
+        int d = r00 + r20 + (_2r10 + _2r10 + _2r10);
+        int d_shifted = (r10 + r20) << 2;
+        // Similar to v_rshr_pack_u<6>(d, vx_setzero_s16()).get0()
+        *(dst + x) = (int)((((unsigned int)d) + ((1 << (6 - 1)))) >> 6);
     }
+    #endif //CV_SIMD128
     vx_cleanup();
 
     return x;
@@ -764,23 +786,23 @@ template <> int PyrUpVecVOneRow<int, short>(int** src, short* dst, int width)
     int x = 0;
     const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
 
-    for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
+    for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x),
-                v_r01 = vx_load(row0 + x + v_int32::nlanes),
+                v_r01 = vx_load(row0 + x + VTraits<v_int32>::vlanes()),
                 v_r10 = vx_load(row1 + x),
-                v_r11 = vx_load(row1 + x + v_int32::nlanes),
+                v_r11 = vx_load(row1 + x + VTraits<v_int32>::vlanes()),
                 v_r20 = vx_load(row2 + x),
-                v_r21 = vx_load(row2 + x + v_int32::nlanes);
-        v_store(dst + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2))));
+                v_r21 = vx_load(row2 + x + VTraits<v_int32>::vlanes());
+        v_store(dst + x, v_rshr_pack<6>(v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))), v_add(v_add(v_r01, v_r21), v_add(v_shl<1>(v_r11), v_shl<2>(v_r11)))));
     }
-    if(x <= width - v_int32::nlanes)
+    if(x <= width - VTraits<v_int32>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x),
                 v_r10 = vx_load(row1 + x),
                 v_r20 = vx_load(row2 + x);
-        v_rshr_pack_store<6>(dst + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)));
-        x += v_int32::nlanes;
+        v_rshr_pack_store<6>(dst + x, v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))));
+        x += VTraits<v_int32>::vlanes();
     }
     vx_cleanup();
 
@@ -792,23 +814,23 @@ template <> int PyrUpVecVOneRow<int, ushort>(int** src, ushort* dst, int width)
     int x = 0;
     const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
 
-    for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
+    for( ; x <= width - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x),
-                v_r01 = vx_load(row0 + x + v_int32::nlanes),
+                v_r01 = vx_load(row0 + x + VTraits<v_int32>::vlanes()),
                 v_r10 = vx_load(row1 + x),
-                v_r11 = vx_load(row1 + x + v_int32::nlanes),
+                v_r11 = vx_load(row1 + x + VTraits<v_int32>::vlanes()),
                 v_r20 = vx_load(row2 + x),
-                v_r21 = vx_load(row2 + x + v_int32::nlanes);
-        v_store(dst + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2))));
+                v_r21 = vx_load(row2 + x + VTraits<v_int32>::vlanes());
+        v_store(dst + x, v_rshr_pack_u<6>(v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))), v_add(v_add(v_r01, v_r21), v_add(v_shl<1>(v_r11), v_shl<2>(v_r11)))));
     }
-    if(x <= width - v_int32::nlanes)
+    if(x <= width - VTraits<v_int32>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x),
                 v_r10 = vx_load(row1 + x),
                 v_r20 = vx_load(row2 + x);
-        v_rshr_pack_u_store<6>(dst + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)));
-        x += v_int32::nlanes;
+        v_rshr_pack_u_store<6>(dst + x, v_add(v_add(v_r00, v_r20), v_add(v_shl<1>(v_r10), v_shl<2>(v_r10))));
+        x += VTraits<v_int32>::vlanes();
     }
     vx_cleanup();
 
@@ -821,12 +843,12 @@ template <> int PyrUpVecVOneRow<float, float>(float** src, float* dst, int width
     const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
 
     v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f);
-    for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
+    for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
     {
         v_float32 v_r0 = vx_load(row0 + x),
                   v_r1 = vx_load(row1 + x),
                   v_r2 = vx_load(row2 + x);
-        v_store(dst + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2));
+        v_store(dst + x, v_mul(v_scale, v_add(v_muladd(v_6, v_r1, v_r0), v_r2)));
     }
     vx_cleanup();
 
@@ -1167,8 +1189,8 @@ static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, in
                                        "BORDER_REFLECT_101" };
     char cvt[2][50];
     String buildOptions = format(
-            "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
-            "-D T1=%s -D cn=%d -D kercn=%d -D fdepth=%d -D %s -D LOCAL_SIZE=%d",
+            "-D T=%s -D FT=%s -D CONVERT_TO_T=%s -D CONVERT_TO_FT=%s%s "
+            "-D T1=%s -D CN=%d -D KERCN=%d -D FDEPTH=%d -D %s -D LOCAL_SIZE=%d",
             ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, cn)),
             ocl::convertTypeStr(float_depth, depth, cn, cvt[0], sizeof(cvt[0])),
             ocl::convertTypeStr(depth, float_depth, cn, cvt[1], sizeof(cvt[1])),
@@ -1210,8 +1232,8 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int
     const int local_size = channels == 1 ? 16 : 8;
     char cvt[2][50];
     String buildOptions = format(
-            "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
-            "-D T1=%s -D cn=%d -D LOCAL_SIZE=%d",
+            "-D T=%s -D FT=%s -D CONVERT_TO_T=%s -D CONVERT_TO_FT=%s%s "
+            "-D T1=%s -D CN=%d -D LOCAL_SIZE=%d",
             ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, channels)),
             ocl::convertTypeStr(float_depth, depth, channels, cvt[0], sizeof(cvt[0])),
             ocl::convertTypeStr(depth, float_depth, channels, cvt[1], sizeof(cvt[1])),
@@ -1426,7 +1448,7 @@ void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borde
     else if( depth == CV_64F )
         func = pyrDown_< FltCast<double, 8> >;
     else
-        CV_Error( CV_StsUnsupportedFormat, "" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "" );
 
     func( src, dst, borderType );
 }
@@ -1529,7 +1551,7 @@ void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderT
     else if( depth == CV_64F )
         func = pyrUp_< FltCast<double, 6> >;
     else
-        CV_Error( CV_StsUnsupportedFormat, "" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "" );
 
     func( src, dst, borderType );
 }
@@ -1700,7 +1722,7 @@ CV_IMPL void
 cvReleasePyramid( CvMat*** _pyramid, int extra_layers )
 {
     if( !_pyramid )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     if( *_pyramid )
         for( int i = 0; i <= extra_layers; i++ )
@@ -1721,7 +1743,7 @@ cvCreatePyramid( const CvArr* srcarr, int extra_layers, double rate,
     CvMat stub, *src = cvGetMat( srcarr, &stub );
 
     if( extra_layers < 0 )
-        CV_Error( CV_StsOutOfRange, "The number of extra layers must be non negative" );
+        CV_Error( cv::Error::StsOutOfRange, "The number of extra layers must be non negative" );
 
     int i, layer_step, elem_size = CV_ELEM_SIZE(src->type);
     cv::Size layer_size, size = cvGetMatSize(src);
@@ -1748,7 +1770,7 @@ cvCreatePyramid( const CvArr* srcarr, int extra_layers, double rate,
         }
 
         if( bufsize < 0 )
-            CV_Error( CV_StsOutOfRange, "The buffer is too small to fit the pyramid" );
+            CV_Error( cv::Error::StsOutOfRange, "The buffer is too small to fit the pyramid" );
         ptr = buf->data.ptr;
     }
 
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
index 456cfc4af916..7e45f1e0f499 100644
--- a/modules/imgproc/src/resize.cpp
+++ b/modules/imgproc/src/resize.cpp
@@ -346,8 +346,8 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 1>(uint8_t* src, int, int *o
 {
     int i = 0;
     ufixedpoint16 src_0(src[0]);
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     v_uint16 v_src_0 = vx_setall_u16(*((uint16_t*)&src_0));
     for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
     {
@@ -358,7 +358,7 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 1>(uint8_t* src, int, int *o
     {
         *(dst++) = src_0;
     }
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     for (; i <= dst_max - 2*VECSZ; i += 2*VECSZ, m += 4*VECSZ, dst += 2*VECSZ)
     {
         v_uint16 v_src0, v_src1;
@@ -384,7 +384,7 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 1>(uint8_t* src, int, int *o
         *(dst++) = m[0] * px[0] + m[1] * px[1];
     }
     src_0 = (src + ofst[dst_width - 1])[0];
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_src_0 = vx_setall_u16(*((uint16_t*)&src_0));
     for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
     {
@@ -406,8 +406,8 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 2>(uint8_t* src, int, int *o
     } srccn;
     ((ufixedpoint16*)(srccn.w))[0] = src[0];
     ((ufixedpoint16*)(srccn.w))[1] = src[1];
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d));
     for (; i <= dst_min - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
     {
@@ -419,7 +419,7 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 2>(uint8_t* src, int, int *o
         *(dst++) = ((ufixedpoint16*)(srccn.w))[0];
         *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
     }
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += VECSZ)
     {
         v_uint16 v_src0, v_src1;
@@ -440,7 +440,7 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 2>(uint8_t* src, int, int *o
         *(dst++) = m[0] * px[1] + m[1] * px[3];
     }
     ((ufixedpoint16*)(srccn.w))[0] = (src + 2 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 2 * ofst[dst_width - 1])[1];
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_srccn = v_reinterpret_as_u16(vx_setall_u32(srccn.d));
     for (; i <= dst_width - VECSZ/2; i += VECSZ/2, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
     {
@@ -465,8 +465,8 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 3>(uint8_t* src, int, int *o
     ((ufixedpoint16*)(srccn.w))[1] = src[1];
     ((ufixedpoint16*)(srccn.w))[2] = src[2];
     ((ufixedpoint16*)(srccn.w))[3] = 0;
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     v_uint16 v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q)));
     for (; i <= dst_min - (VECSZ+2)/3; i += VECSZ/4, m += VECSZ/2, dst += 3*VECSZ/4) // Points that fall left from src image so became equal to leftmost src point
     {
@@ -479,14 +479,14 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 3>(uint8_t* src, int, int *o
         *(dst++) = ((ufixedpoint16*)(srccn.w))[1];
         *(dst++) = ((ufixedpoint16*)(srccn.w))[2];
     }
-#if CV_SIMD
-    CV_DECL_ALIGNED(CV_SIMD_WIDTH) int ofst3[VECSZ/2];
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    CV_DECL_ALIGNED(CV_SIMD_WIDTH) int ofst3[VTraits<v_uint16>::max_nlanes/2];
     for (; i <= dst_max - (3*VECSZ/4 + (VECSZ+2)/3); i += VECSZ/2, m += VECSZ, dst += 3*VECSZ/2)
     {
-        v_store(ofst3, vx_load(ofst + i) * vx_setall_s32(3));
+        v_store(ofst3, v_mul(vx_load(ofst + i), vx_setall_s32(3)));
         v_uint8 v_src01, v_src23;
         v_uint16 v_src0, v_src1, v_src2, v_src3;
-        v_zip(vx_lut_quads(src, ofst3), v_reinterpret_as_u8(v_reinterpret_as_u32(vx_lut_quads(src+2, ofst3)) >> 8), v_src01, v_src23);
+        v_zip(vx_lut_quads(src, ofst3), v_reinterpret_as_u8(v_shr<8>(v_reinterpret_as_u32(vx_lut_quads(src+2, ofst3)))), v_src01, v_src23);
         v_expand(v_src01, v_src0, v_src1);
         v_expand(v_src23, v_src2, v_src3);
 
@@ -514,7 +514,7 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 3>(uint8_t* src, int, int *o
     ((ufixedpoint16*)(srccn.w))[0] = (src + 3*ofst[dst_width - 1])[0];
     ((ufixedpoint16*)(srccn.w))[1] = (src + 3*ofst[dst_width - 1])[1];
     ((ufixedpoint16*)(srccn.w))[2] = (src + 3*ofst[dst_width - 1])[2];
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_srccn = v_pack_triplets(v_reinterpret_as_u16(vx_setall_u64(srccn.q)));
     for (; i <= dst_width - (VECSZ+2)/3; i += VECSZ/4, dst += 3*VECSZ/4) // Points that fall right from src image so became equal to rightmost src point
     {
@@ -540,8 +540,8 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 4>(uint8_t* src, int, int *o
     ((ufixedpoint16*)(srccn.w))[1] = src[1];
     ((ufixedpoint16*)(srccn.w))[2] = src[2];
     ((ufixedpoint16*)(srccn.w))[3] = src[3];
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     v_uint16 v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q));
     for (; i <= dst_min - VECSZ/4; i += VECSZ/4, m += VECSZ/2, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
     {
@@ -555,7 +555,7 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 4>(uint8_t* src, int, int *o
         *(dst++) = ((ufixedpoint16*)(srccn.w))[2];
         *(dst++) = ((ufixedpoint16*)(srccn.w))[3];
     }
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     for (; i <= dst_max - VECSZ/2; i += VECSZ/2, m += VECSZ, dst += 2*VECSZ)
     {
         v_uint16 v_src0, v_src1, v_src2, v_src3;
@@ -586,7 +586,7 @@ void hlineResizeCn<uint8_t, ufixedpoint16, 2, true, 4>(uint8_t* src, int, int *o
     }
     ((ufixedpoint16*)(srccn.w))[0] = (src + 4 * ofst[dst_width - 1])[0]; ((ufixedpoint16*)(srccn.w))[1] = (src + 4 * ofst[dst_width - 1])[1];
     ((ufixedpoint16*)(srccn.w))[2] = (src + 4 * ofst[dst_width - 1])[2]; ((ufixedpoint16*)(srccn.w))[3] = (src + 4 * ofst[dst_width - 1])[3];
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_srccn = v_reinterpret_as_u16(vx_setall_u64(srccn.q));
     for (; i <= dst_width - VECSZ/4; i += VECSZ/4, dst += VECSZ) // Points that fall right from src image so became equal to rightmost src point
     {
@@ -606,8 +606,8 @@ void hlineResizeCn<uint16_t, ufixedpoint32, 2, true, 1>(uint16_t* src, int, int
 {
     int i = 0;
     ufixedpoint32 src_0(src[0]);
-#if CV_SIMD
-    const int VECSZ = v_uint32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint32>::vlanes();
     v_uint32 v_src_0 = vx_setall_u32(*((uint32_t*)&src_0));
     for (; i <= dst_min - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ) // Points that fall left from src image so became equal to leftmost src point
     {
@@ -618,16 +618,16 @@ void hlineResizeCn<uint16_t, ufixedpoint32, 2, true, 1>(uint16_t* src, int, int
     {
         *(dst++) = src_0;
     }
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     for (; i <= dst_max - VECSZ; i += VECSZ, m += 2*VECSZ, dst += VECSZ)
     {
         v_uint32 v_src0, v_src1;
         v_expand(vx_lut_pairs(src, ofst + i), v_src0, v_src1);
 
-        v_uint64 v_res0 = v_reinterpret_as_u64(v_src0 * vx_load((uint32_t*)m));
-        v_uint64 v_res1 = v_reinterpret_as_u64(v_src1 * vx_load((uint32_t*)m + VECSZ));
-        v_store((uint32_t*)dst, v_pack((v_res0 & vx_setall_u64(0xFFFFFFFF)) + (v_res0 >> 32),
-                                       (v_res1 & vx_setall_u64(0xFFFFFFFF)) + (v_res1 >> 32)));
+        v_uint64 v_res0 = v_reinterpret_as_u64(v_mul(v_src0, vx_load((uint32_t *)m)));
+        v_uint64 v_res1 = v_reinterpret_as_u64(v_mul(v_src1, vx_load((uint32_t *)m + VECSZ)));
+        v_store((uint32_t*)dst, v_pack(v_add(v_and(v_res0, vx_setall_u64(0xFFFFFFFF)), v_shr<32>(v_res0)),
+                                       v_add(v_and(v_res1, vx_setall_u64(0xFFFFFFFF)), v_shr<32>(v_res1))));
     }
 #endif
     for (; i < dst_max; i += 1, m += 2)
@@ -636,7 +636,7 @@ void hlineResizeCn<uint16_t, ufixedpoint32, 2, true, 1>(uint16_t* src, int, int
         *(dst++) = m[0] * px[0] + m[1] * px[1];
     }
     src_0 = (src + ofst[dst_width - 1])[0];
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_src_0 = vx_setall_u32(*((uint32_t*)&src_0));
     for (; i <= dst_width - VECSZ; i += VECSZ, dst += VECSZ)
     {
@@ -659,16 +659,16 @@ template <>
 void vlineSet<uint8_t, ufixedpoint16>(ufixedpoint16* src, uint8_t* dst, int dst_width)
 {
     int i = 0;
-#if CV_SIMD
-    const int VECSZ = v_uint8::nlanes;
-    static const v_uint16 v_fixedRound = vx_setall_u16((uint16_t)((1U << 8) >> 1));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint8>::vlanes();
+    const v_uint16 v_fixedRound = vx_setall_u16((uint16_t)((1U << 8) >> 1));
     for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
     {
         v_uint16 v_src0 = vx_load((uint16_t*)src);
         v_uint16 v_src1 = vx_load((uint16_t*)src + VECSZ/2);
 
-        v_uint16 v_res0 = (v_src0 + v_fixedRound) >> 8;
-        v_uint16 v_res1 = (v_src1 + v_fixedRound) >> 8;
+        v_uint16 v_res0 = v_shr<8>(v_add(v_src0, v_fixedRound));
+        v_uint16 v_res1 = v_shr<8>(v_add(v_src1, v_fixedRound));
 
         v_store(dst, v_pack(v_res0, v_res1));
     }
@@ -693,11 +693,11 @@ void vlineResize<uint8_t, ufixedpoint16, 2>(ufixedpoint16* src, size_t src_step,
 {
     int i = 0;
     ufixedpoint16* src1 = src + src_step;
-#if CV_SIMD
-    const int VECSZ = v_uint8::nlanes;
-    static const v_int32 v_fixedRound = vx_setall_s32((int32_t)((1 << 16) >> 1));
-    static const v_int16 v_128    = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1<<15));
-    static const v_int8  v_128_16 = v_reinterpret_as_s8 (vx_setall_u8 ((uint8_t) 1<<7));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint8>::vlanes();
+    const v_int32 v_fixedRound = vx_setall_s32((int32_t)((1 << 16) >> 1));
+    const v_int16 v_128    = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1<<15));
+    const v_int8  v_128_16 = v_reinterpret_as_s8 (vx_setall_u8 ((uint8_t) 1<<7));
 
     v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(((uint32_t*)m)[0]));
     for (; i <= dst_width - VECSZ; i += VECSZ, src += VECSZ, src1 += VECSZ, dst += VECSZ)
@@ -716,10 +716,10 @@ void vlineResize<uint8_t, ufixedpoint16, 2>(ufixedpoint16* src, size_t src_step,
         v_int32 v_res2 = v_dotprod(v_tmp0, v_mul);
         v_int32 v_res3 = v_dotprod(v_tmp1, v_mul);
 
-        v_int8 v_res = v_pack(v_pack((v_res0 + v_fixedRound) >> 16,
-                                     (v_res1 + v_fixedRound) >> 16),
-                              v_pack((v_res2 + v_fixedRound) >> 16,
-                                     (v_res3 + v_fixedRound) >> 16));
+        v_int8 v_res = v_pack(v_pack(v_shr<16>(v_add(v_res0, v_fixedRound)),
+                                     v_shr<16>(v_add(v_res1, v_fixedRound))),
+                              v_pack(v_shr<16>(v_add(v_res2, v_fixedRound)),
+                                     v_shr<16>(v_add(v_res3, v_fixedRound))));
 
         v_store(dst, v_reinterpret_as_u8(v_sub_wrap(v_res, v_128_16)));
     }
@@ -828,7 +828,7 @@ class resize_bitExactInvoker :
             hResize((ET*)(src + (src_height - 1) * src_step), cn, xoffsets, xcoeffs, endline, min_x, max_x, dst_width);
         for (; dy < range.end; dy++)
             vlineSet<ET, FT>(endline, (ET*)(dst + dst_step * dy), dst_width*cn);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         vx_cleanup();
 #endif
     }
@@ -1136,16 +1136,16 @@ class resizeNN_bitexactInvoker : public ParallelLoopBody
             switch( pix_size )
             {
             case 1:
-#if CV_SIMD
-                for( ; x <= dsize.width - v_uint8::nlanes; x += v_uint8::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( ; x <= dsize.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
                     v_store(D + x, vx_lut(S, x_ofse + x));
 #endif
                 for( ; x < dsize.width; x++ )
                     D[x] = S[x_ofse[x]];
                 break;
             case 2:
-#if CV_SIMD
-                for( ; x <= dsize.width - v_uint16::nlanes; x += v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( ; x <= dsize.width - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes() )
                     v_store((ushort*)D + x, vx_lut((ushort*)S, x_ofse + x));
 #endif
                 for( ; x < dsize.width; x++ )
@@ -1159,8 +1159,8 @@ class resizeNN_bitexactInvoker : public ParallelLoopBody
                 }
                 break;
             case 4:
-#if CV_SIMD
-                for( ; x <= dsize.width - v_uint32::nlanes; x += v_uint32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( ; x <= dsize.width - VTraits<v_uint32>::vlanes(); x += VTraits<v_uint32>::vlanes() )
                     v_store((uint32_t*)D + x, vx_lut((uint32_t*)S, x_ofse + x));
 #endif
                 for( ; x < dsize.width; x++ )
@@ -1175,8 +1175,8 @@ class resizeNN_bitexactInvoker : public ParallelLoopBody
                 }
                 break;
             case 8:
-#if CV_SIMD
-                for( ; x <= dsize.width - v_uint64::nlanes; x += v_uint64::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( ; x <= dsize.width - VTraits<v_uint64>::vlanes(); x += VTraits<v_uint64>::vlanes() )
                     v_store((uint64_t*)D + x, vx_lut((uint64_t*)S, x_ofse + x));
 #endif
                 for( ; x < dsize.width; x++ )
@@ -1250,7 +1250,7 @@ struct HResizeNoVec
     }
 };
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 struct VResizeLinearVec_32s8u
 {
@@ -1260,22 +1260,17 @@ struct VResizeLinearVec_32s8u
         int x = 0;
         v_int16 b0 = vx_setall_s16(beta[0]), b1 = vx_setall_s16(beta[1]);
 
-        if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
-            for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
-                v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load_aligned(S0 + x                      ) >> 4, vx_load_aligned(S0 + x +     v_int32::nlanes) >> 4), b0) +
-                                                  v_mul_hi(v_pack(vx_load_aligned(S1 + x                      ) >> 4, vx_load_aligned(S1 + x +     v_int32::nlanes) >> 4), b1),
-                                                  v_mul_hi(v_pack(vx_load_aligned(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S0 + x + 3 * v_int32::nlanes) >> 4), b0) +
-                                                  v_mul_hi(v_pack(vx_load_aligned(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load_aligned(S1 + x + 3 * v_int32::nlanes) >> 4), b1)));
+        if( (((size_t)S0|(size_t)S1)&(VTraits<v_uint8>::vlanes() - 1)) == 0 )
+            for( ; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes())
+                v_store(dst + x, v_rshr_pack_u<2>(v_add(v_mul_hi(v_pack(v_shr<4>(vx_load_aligned(S0 + x)), v_shr<4>(vx_load_aligned(S0 + x + VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load_aligned(S1 + x)), v_shr<4>(vx_load_aligned(S1 + x + VTraits<v_int32>::vlanes()))), b1)),
+                                                  v_add(v_mul_hi(v_pack(v_shr<4>(vx_load_aligned(S0 + x + 2 * VTraits<v_int32>::vlanes())), v_shr<4>(vx_load_aligned(S0 + x + 3 * VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load_aligned(S1 + x + 2 * VTraits<v_int32>::vlanes())), v_shr<4>(vx_load_aligned(S1 + x + 3 * VTraits<v_int32>::vlanes()))), b1))));
         else
-            for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
-                v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load(S0 + x                      ) >> 4, vx_load(S0 + x +     v_int32::nlanes) >> 4), b0) +
-                                                  v_mul_hi(v_pack(vx_load(S1 + x                      ) >> 4, vx_load(S1 + x +     v_int32::nlanes) >> 4), b1),
-                                                  v_mul_hi(v_pack(vx_load(S0 + x + 2 * v_int32::nlanes) >> 4, vx_load(S0 + x + 3 * v_int32::nlanes) >> 4), b0) +
-                                                  v_mul_hi(v_pack(vx_load(S1 + x + 2 * v_int32::nlanes) >> 4, vx_load(S1 + x + 3 * v_int32::nlanes) >> 4), b1)));
+            for( ; x <= width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes())
+                v_store(dst + x, v_rshr_pack_u<2>(v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x)), v_shr<4>(vx_load(S0 + x + VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x)), v_shr<4>(vx_load(S1 + x + VTraits<v_int32>::vlanes()))), b1)),
+                                                  v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x + 2 * VTraits<v_int32>::vlanes())), v_shr<4>(vx_load(S0 + x + 3 * VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x + 2 * VTraits<v_int32>::vlanes())), v_shr<4>(vx_load(S1 + x + 3 * VTraits<v_int32>::vlanes()))), b1))));
 
-        for( ; x < width - v_int16::nlanes; x += v_int16::nlanes)
-            v_rshr_pack_u_store<2>(dst + x, v_mul_hi(v_pack(vx_load(S0 + x) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) +
-                                            v_mul_hi(v_pack(vx_load(S1 + x) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1));
+        for( ; x < width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
+            v_rshr_pack_u_store<2>(dst + x, v_add(v_mul_hi(v_pack(v_shr<4>(vx_load(S0 + x)), v_shr<4>(vx_load(S0 + x + VTraits<v_int32>::vlanes()))), b0), v_mul_hi(v_pack(v_shr<4>(vx_load(S1 + x)), v_shr<4>(vx_load(S1 + x + VTraits<v_int32>::vlanes()))), b1)));
 
         return x;
     }
@@ -1290,17 +1285,17 @@ struct VResizeLinearVec_32f16u
 
         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
 
-        if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
-            for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
-                v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x                    ), b0, vx_load_aligned(S1 + x                    ) * b1)),
-                                          v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1))));
+        if( (((size_t)S0|(size_t)S1)&(VTraits<v_uint8>::vlanes() - 1)) == 0 )
+            for( ; x <= width - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
+                v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load_aligned(S0 + x                    ), b0, v_mul(vx_load_aligned(S1 + x), b1))),
+                                          v_round(v_muladd(vx_load_aligned(S0 + x + VTraits<v_float32>::vlanes()), b0, v_mul(vx_load_aligned(S1 + x + VTraits<v_float32>::vlanes()), b1)))));
         else
-            for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
-                v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x                    ), b0, vx_load(S1 + x                    ) * b1)),
-                                          v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1))));
-        for( ; x < width - v_float32::nlanes; x += v_float32::nlanes)
+            for (; x <= width - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
+                v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x                    ), b0, v_mul(vx_load(S1 + x), b1))),
+                                          v_round(v_muladd(vx_load(S0 + x + VTraits<v_float32>::vlanes()), b0, v_mul(vx_load(S1 + x + VTraits<v_float32>::vlanes()), b1)))));
+        for( ; x < width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
         {
-            v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
+            v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1)));
             v_store_low(dst + x, v_pack_u(t0, t0));
         }
 
@@ -1317,17 +1312,17 @@ struct VResizeLinearVec_32f16s
 
         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
 
-        if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
-            for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
-                v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x                    ), b0, vx_load_aligned(S1 + x                    ) * b1)),
-                                        v_round(v_muladd(vx_load_aligned(S0 + x + v_float32::nlanes), b0, vx_load_aligned(S1 + x + v_float32::nlanes) * b1))));
+        if( (((size_t)S0|(size_t)S1)&(VTraits<v_uint8>::vlanes() - 1)) == 0 )
+            for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
+                v_store(dst + x, v_pack(v_round(v_muladd(vx_load_aligned(S0 + x                    ), b0, v_mul(vx_load_aligned(S1 + x), b1))),
+                                        v_round(v_muladd(vx_load_aligned(S0 + x + VTraits<v_float32>::vlanes()), b0, v_mul(vx_load_aligned(S1 + x + VTraits<v_float32>::vlanes()), b1)))));
         else
-            for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
-                v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x                    ), b0, vx_load(S1 + x                    ) * b1)),
-                                        v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes), b0, vx_load(S1 + x + v_float32::nlanes) * b1))));
-        for( ; x < width - v_float32::nlanes; x += v_float32::nlanes)
+            for (; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
+                v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x                    ), b0, v_mul(vx_load(S1 + x), b1))),
+                                        v_round(v_muladd(vx_load(S0 + x + VTraits<v_float32>::vlanes()), b0, v_mul(vx_load(S1 + x + VTraits<v_float32>::vlanes()), b1)))));
+        for( ; x < width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
         {
-            v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
+            v_int32 t0 = v_round(v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1)));
             v_store_low(dst + x, v_pack(t0, t0));
         }
 
@@ -1344,12 +1339,12 @@ struct VResizeLinearVec_32f
 
         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]);
 
-        if( (((size_t)S0|(size_t)S1)&(CV_SIMD_WIDTH - 1)) == 0 )
-            for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
-                v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, vx_load_aligned(S1 + x) * b1));
+        if( (((size_t)S0|(size_t)S1)&(VTraits<v_uint8>::vlanes() - 1)) == 0 )
+            for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
+                v_store(dst + x, v_muladd(vx_load_aligned(S0 + x), b0, v_mul(vx_load_aligned(S1 + x), b1)));
         else
-            for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
-                v_store(dst + x, v_muladd(vx_load(S0 + x), b0, vx_load(S1 + x) * b1));
+            for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
+                v_store(dst + x, v_muladd(vx_load(S0 + x), b0, v_mul(vx_load(S1 + x), b1)));
 
         return x;
     }
@@ -1367,26 +1362,26 @@ struct VResizeCubicVec_32s8u
         v_float32 b0 = vx_setall_f32(beta[0] * scale), b1 = vx_setall_f32(beta[1] * scale),
                   b2 = vx_setall_f32(beta[2] * scale), b3 = vx_setall_f32(beta[3] * scale);
 
-        if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(CV_SIMD_WIDTH - 1)) == 0 )
-            for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
+        if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&(VTraits<v_uint8>::vlanes() - 1)) == 0 )
+            for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
                 v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x                    )),  b0,
                                                        v_muladd(v_cvt_f32(vx_load_aligned(S1 + x                    )),  b1,
                                                        v_muladd(v_cvt_f32(vx_load_aligned(S2 + x                    )),  b2,
-                                                                v_cvt_f32(vx_load_aligned(S3 + x                    )) * b3)))),
-                                               v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + v_float32::nlanes)),  b0,
-                                                       v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + v_float32::nlanes)),  b1,
-                                                       v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + v_float32::nlanes)),  b2,
-                                                                v_cvt_f32(vx_load_aligned(S3 + x + v_float32::nlanes)) * b3))))));
+                                                                v_mul(v_cvt_f32(vx_load_aligned(S3 + x)), b3))))),
+                                               v_round(v_muladd(v_cvt_f32(vx_load_aligned(S0 + x + VTraits<v_float32>::vlanes())),  b0,
+                                                       v_muladd(v_cvt_f32(vx_load_aligned(S1 + x + VTraits<v_float32>::vlanes())),  b1,
+                                                       v_muladd(v_cvt_f32(vx_load_aligned(S2 + x + VTraits<v_float32>::vlanes())),  b2,
+                                                                v_mul(v_cvt_f32(vx_load_aligned(S3 + x + VTraits<v_float32>::vlanes())), b3)))))));
         else
-            for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
+            for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
                 v_pack_u_store(dst + x, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + x                    )),  b0,
                                                        v_muladd(v_cvt_f32(vx_load(S1 + x                    )),  b1,
                                                        v_muladd(v_cvt_f32(vx_load(S2 + x                    )),  b2,
-                                                                v_cvt_f32(vx_load(S3 + x                    )) * b3)))),
-                                               v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + v_float32::nlanes)),  b0,
-                                                       v_muladd(v_cvt_f32(vx_load(S1 + x + v_float32::nlanes)),  b1,
-                                                       v_muladd(v_cvt_f32(vx_load(S2 + x + v_float32::nlanes)),  b2,
-                                                                v_cvt_f32(vx_load(S3 + x + v_float32::nlanes)) * b3))))));
+                                                                v_mul(v_cvt_f32(vx_load(S3 + x)), b3))))),
+                                               v_round(v_muladd(v_cvt_f32(vx_load(S0 + x + VTraits<v_float32>::vlanes())),  b0,
+                                                       v_muladd(v_cvt_f32(vx_load(S1 + x + VTraits<v_float32>::vlanes())),  b1,
+                                                       v_muladd(v_cvt_f32(vx_load(S2 + x + VTraits<v_float32>::vlanes())),  b2,
+                                                                v_mul(v_cvt_f32(vx_load(S3 + x + VTraits<v_float32>::vlanes())), b3)))))));
         return x;
     }
 };
@@ -1400,15 +1395,15 @@ struct VResizeCubicVec_32f16u
         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
                   b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);
 
-        for (; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
+        for (; x <= width - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
             v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x                    ),  b0,
                                               v_muladd(vx_load(S1 + x                    ),  b1,
                                               v_muladd(vx_load(S2 + x                    ),  b2,
-                                                       vx_load(S3 + x                    ) * b3)))),
-                                      v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes),  b0,
-                                              v_muladd(vx_load(S1 + x + v_float32::nlanes),  b1,
-                                              v_muladd(vx_load(S2 + x + v_float32::nlanes),  b2,
-                                                       vx_load(S3 + x + v_float32::nlanes) * b3))))));
+                                                       v_mul(vx_load(S3 + x), b3))))),
+                                      v_round(v_muladd(vx_load(S0 + x + VTraits<v_float32>::vlanes()),  b0,
+                                              v_muladd(vx_load(S1 + x + VTraits<v_float32>::vlanes()),  b1,
+                                              v_muladd(vx_load(S2 + x + VTraits<v_float32>::vlanes()),  b2,
+                                                       v_mul(vx_load(S3 + x + VTraits<v_float32>::vlanes()), b3)))))));
 
         return x;
     }
@@ -1423,15 +1418,15 @@ struct VResizeCubicVec_32f16s
         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
                   b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);
 
-        for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
+        for (; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
             v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x                    ),  b0,
                                             v_muladd(vx_load(S1 + x                    ),  b1,
                                             v_muladd(vx_load(S2 + x                    ),  b2,
-                                                     vx_load(S3 + x                    ) * b3)))),
-                                    v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes),  b0,
-                                            v_muladd(vx_load(S1 + x + v_float32::nlanes),  b1,
-                                            v_muladd(vx_load(S2 + x + v_float32::nlanes),  b2,
-                                                     vx_load(S3 + x + v_float32::nlanes) * b3))))));
+                                                     v_mul(vx_load(S3 + x), b3))))),
+                                    v_round(v_muladd(vx_load(S0 + x + VTraits<v_float32>::vlanes()),  b0,
+                                            v_muladd(vx_load(S1 + x + VTraits<v_float32>::vlanes()),  b1,
+                                            v_muladd(vx_load(S2 + x + VTraits<v_float32>::vlanes()),  b2,
+                                                     v_mul(vx_load(S3 + x + VTraits<v_float32>::vlanes()), b3)))))));
 
         return x;
     }
@@ -1446,11 +1441,11 @@ struct VResizeCubicVec_32f
         v_float32 b0 = vx_setall_f32(beta[0]), b1 = vx_setall_f32(beta[1]),
                   b2 = vx_setall_f32(beta[2]), b3 = vx_setall_f32(beta[3]);
 
-        for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
+        for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
             v_store(dst + x, v_muladd(vx_load(S0 + x),  b0,
                              v_muladd(vx_load(S1 + x),  b1,
                              v_muladd(vx_load(S2 + x),  b2,
-                                      vx_load(S3 + x) * b3))));
+                                      v_mul(vx_load(S3 + x), b3)))));
 
         return x;
     }
@@ -1484,7 +1479,7 @@ struct VResizeLanczos4Vec_32f16u
                   b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
                   b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);
 
-        for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes)
+        for( ; x <= width - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
             v_store(dst + x, v_pack_u(v_round(v_muladd(vx_load(S0 + x                    ),  b0,
                                               v_muladd(vx_load(S1 + x                    ),  b1,
                                               v_muladd(vx_load(S2 + x                    ),  b2,
@@ -1492,15 +1487,15 @@ struct VResizeLanczos4Vec_32f16u
                                               v_muladd(vx_load(S4 + x                    ),  b4,
                                               v_muladd(vx_load(S5 + x                    ),  b5,
                                               v_muladd(vx_load(S6 + x                    ),  b6,
-                                                       vx_load(S7 + x                    ) * b7)))))))),
-                                      v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes),  b0,
-                                              v_muladd(vx_load(S1 + x + v_float32::nlanes),  b1,
-                                              v_muladd(vx_load(S2 + x + v_float32::nlanes),  b2,
-                                              v_muladd(vx_load(S3 + x + v_float32::nlanes),  b3,
-                                              v_muladd(vx_load(S4 + x + v_float32::nlanes),  b4,
-                                              v_muladd(vx_load(S5 + x + v_float32::nlanes),  b5,
-                                              v_muladd(vx_load(S6 + x + v_float32::nlanes),  b6,
-                                                       vx_load(S7 + x + v_float32::nlanes) * b7))))))))));
+                                                       v_mul(vx_load(S7 + x                    ), b7))))))))),
+                                      v_round(v_muladd(vx_load(S0 + x + VTraits<v_float32>::vlanes()),  b0,
+                                              v_muladd(vx_load(S1 + x + VTraits<v_float32>::vlanes()),  b1,
+                                              v_muladd(vx_load(S2 + x + VTraits<v_float32>::vlanes()),  b2,
+                                              v_muladd(vx_load(S3 + x + VTraits<v_float32>::vlanes()),  b3,
+                                              v_muladd(vx_load(S4 + x + VTraits<v_float32>::vlanes()),  b4,
+                                              v_muladd(vx_load(S5 + x + VTraits<v_float32>::vlanes()),  b5,
+                                              v_muladd(vx_load(S6 + x + VTraits<v_float32>::vlanes()),  b6,
+                                                       v_mul(vx_load(S7 + x + VTraits<v_float32>::vlanes()), b7)))))))))));
 
         return x;
     }
@@ -1520,7 +1515,7 @@ struct VResizeLanczos4Vec_32f16s
                   b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
                   b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);
 
-        for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes)
+        for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
             v_store(dst + x, v_pack(v_round(v_muladd(vx_load(S0 + x                    ),  b0,
                                             v_muladd(vx_load(S1 + x                    ),  b1,
                                             v_muladd(vx_load(S2 + x                    ),  b2,
@@ -1528,15 +1523,15 @@ struct VResizeLanczos4Vec_32f16s
                                             v_muladd(vx_load(S4 + x                    ),  b4,
                                             v_muladd(vx_load(S5 + x                    ),  b5,
                                             v_muladd(vx_load(S6 + x                    ),  b6,
-                                                     vx_load(S7 + x                    ) * b7)))))))),
-                                    v_round(v_muladd(vx_load(S0 + x + v_float32::nlanes),  b0,
-                                            v_muladd(vx_load(S1 + x + v_float32::nlanes),  b1,
-                                            v_muladd(vx_load(S2 + x + v_float32::nlanes),  b2,
-                                            v_muladd(vx_load(S3 + x + v_float32::nlanes),  b3,
-                                            v_muladd(vx_load(S4 + x + v_float32::nlanes),  b4,
-                                            v_muladd(vx_load(S5 + x + v_float32::nlanes),  b5,
-                                            v_muladd(vx_load(S6 + x + v_float32::nlanes),  b6,
-                                                     vx_load(S7 + x + v_float32::nlanes) * b7))))))))));
+                                                     v_mul(vx_load(S7 + x), b7))))))))),
+                                    v_round(v_muladd(vx_load(S0 + x + VTraits<v_float32>::vlanes()),  b0,
+                                            v_muladd(vx_load(S1 + x + VTraits<v_float32>::vlanes()),  b1,
+                                            v_muladd(vx_load(S2 + x + VTraits<v_float32>::vlanes()),  b2,
+                                            v_muladd(vx_load(S3 + x + VTraits<v_float32>::vlanes()),  b3,
+                                            v_muladd(vx_load(S4 + x + VTraits<v_float32>::vlanes()),  b4,
+                                            v_muladd(vx_load(S5 + x + VTraits<v_float32>::vlanes()),  b5,
+                                            v_muladd(vx_load(S6 + x + VTraits<v_float32>::vlanes()),  b6,
+                                                     v_mul(vx_load(S7 + x + VTraits<v_float32>::vlanes()), b7)))))))))));
 
         return x;
     }
@@ -1555,7 +1550,7 @@ struct VResizeLanczos4Vec_32f
                   b4 = vx_setall_f32(beta[4]), b5 = vx_setall_f32(beta[5]),
                   b6 = vx_setall_f32(beta[6]), b7 = vx_setall_f32(beta[7]);
 
-        for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes)
+        for( ; x <= width - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes())
             v_store(dst + x, v_muladd(vx_load(S0 + x),  b0,
                              v_muladd(vx_load(S1 + x),  b1,
                              v_muladd(vx_load(S2 + x),  b2,
@@ -1563,7 +1558,7 @@ struct VResizeLanczos4Vec_32f
                              v_muladd(vx_load(S4 + x),  b4,
                              v_muladd(vx_load(S5 + x),  b5,
                              v_muladd(vx_load(S6 + x),  b6,
-                                      vx_load(S7 + x) * b7))))))));
+                                      v_mul(vx_load(S7 + x), b7)))))))));
 
         return x;
     }
@@ -1620,8 +1615,8 @@ struct HResizeLinearVec_X4
                 DVT s1(S0[sx0+cn], S0[sx1+cn], S0[sx2+cn], S0[sx3+cn]);
                 DVT s0_u(S1[sx0], S1[sx1], S1[sx2], S1[sx3]);
                 DVT s1_u(S1[sx0+cn], S1[sx1+cn], S1[sx2+cn], S1[sx3+cn]);
-                v_store(&D1[dx], s0_u * a_even + s1_u * a_odd);
-                v_store(&D0[dx], s0 * a_even + s1 * a_odd);
+                v_store(&D1[dx], v_add(v_mul(s0_u, a_even), v_mul(s1_u, a_odd)));
+                v_store(&D0[dx], v_add(v_mul(s0, a_even), v_mul(s1, a_odd)));
             }
         }
         for( ; k < count; k++ )
@@ -1640,7 +1635,7 @@ struct HResizeLinearVec_X4
                 v_load_deinterleave(&alpha[dx*2], a_even, a_odd);
                 DVT s0(S[sx0], S[sx1], S[sx2], S[sx3]);
                 DVT s1(S[sx0+cn], S[sx1+cn], S[sx2+cn], S[sx3+cn]);
-                v_store(&D[dx], s0 * a_even + s1 * a_odd);
+                v_store(&D[dx], v_add(v_mul(s0, a_even), v_mul(s1, a_odd)));
             }
         }
         return dx;
@@ -1752,8 +1747,8 @@ struct HResizeLinearVecU8_X4
                 for( dx = 0; (xofs[dx] + cn) < smax; dx += cn )
                 {
                     v_int16x8 a = v_load(alpha+dx*2);
-                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S0+xofs[dx]) | (v_load_expand_q(S0+xofs[dx]+cn)<<16)), a));
-                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S1+xofs[dx]) | (v_load_expand_q(S1+xofs[dx]+cn)<<16)), a));
+                    v_store(&D0[dx], v_dotprod(v_reinterpret_as_s16(v_or(v_load_expand_q(S0 + xofs[dx]), v_shl<16>(v_load_expand_q(S0 + xofs[dx] + cn)))), a));
+                    v_store(&D1[dx], v_dotprod(v_reinterpret_as_s16(v_or(v_load_expand_q(S1 + xofs[dx]), v_shl<16>(v_load_expand_q(S1 + xofs[dx] + cn)))), a));
                 }
             }
             for( ; k < count; k++ )
@@ -1763,7 +1758,7 @@ struct HResizeLinearVecU8_X4
                 for( dx = 0; (xofs[dx] + cn) < smax; dx += cn )
                 {
                     v_int16x8 a = v_load(alpha+dx*2);
-                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_load_expand_q(S+xofs[dx]) | (v_load_expand_q(S+xofs[dx]+cn)<<16)), a));
+                    v_store(&D[dx], v_dotprod(v_reinterpret_as_s16(v_or(v_load_expand_q(S + xofs[dx]), v_shl<16>(v_load_expand_q(S + xofs[dx] + cn)))), a));
                 }
             }
             /* Debug check to ensure truthiness that we never vector the final value. */
@@ -2452,56 +2447,56 @@ class ResizeAreaFastVec_SIMD_8u
         if (cn == 1)
         {
             v_uint16 masklow = vx_setall_u16(0x00ff);
-            for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += v_uint8::nlanes, S1 += v_uint8::nlanes, D += v_uint16::nlanes)
+            for ( ; dx <= w - VTraits<v_uint16>::vlanes(); dx += VTraits<v_uint16>::vlanes(), S0 += VTraits<v_uint8>::vlanes(), S1 += VTraits<v_uint8>::vlanes(), D += VTraits<v_uint16>::vlanes())
             {
                 v_uint16 r0 = v_reinterpret_as_u16(vx_load(S0));
                 v_uint16 r1 = v_reinterpret_as_u16(vx_load(S1));
-                v_rshr_pack_store<2>(D, (r0 >> 8) + (r0 & masklow) + (r1 >> 8) + (r1 & masklow));
+                v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_shr<8>(r0), v_and(r0, masklow)), v_shr<8>(r1)), v_and(r1, masklow)));
             }
         }
         else if (cn == 3)
         {
             if (CV_SIMD_WIDTH > 64)
                 return 0;
-            for ( ; dx <= w - 3*v_uint8::nlanes; dx += 3*v_uint8::nlanes, S0 += 6*v_uint8::nlanes, S1 += 6*v_uint8::nlanes, D += 3*v_uint8::nlanes)
+            for ( ; dx <= w - 3*VTraits<v_uint8>::vlanes(); dx += 3*VTraits<v_uint8>::vlanes(), S0 += 6*VTraits<v_uint8>::vlanes(), S1 += 6*VTraits<v_uint8>::vlanes(), D += 3*VTraits<v_uint8>::vlanes())
             {
                 v_uint16 t0, t1, t2, t3, t4, t5;
                 v_uint16 s0, s1, s2, s3, s4, s5;
-                s0 = vx_load_expand(S0                     ) + vx_load_expand(S1                     );
-                s1 = vx_load_expand(S0 +   v_uint16::nlanes) + vx_load_expand(S1 +   v_uint16::nlanes);
-                s2 = vx_load_expand(S0 + 2*v_uint16::nlanes) + vx_load_expand(S1 + 2*v_uint16::nlanes);
-                s3 = vx_load_expand(S0 + 3*v_uint16::nlanes) + vx_load_expand(S1 + 3*v_uint16::nlanes);
-                s4 = vx_load_expand(S0 + 4*v_uint16::nlanes) + vx_load_expand(S1 + 4*v_uint16::nlanes);
-                s5 = vx_load_expand(S0 + 5*v_uint16::nlanes) + vx_load_expand(S1 + 5*v_uint16::nlanes);
+                s0 = v_add(vx_load_expand(S0), vx_load_expand(S1));
+                s1 = v_add(vx_load_expand(S0 + VTraits<v_uint16>::vlanes()), vx_load_expand(S1 + VTraits<v_uint16>::vlanes()));
+                s2 = v_add(vx_load_expand(S0 + 2 * VTraits<v_uint16>::vlanes()), vx_load_expand(S1 + 2 * VTraits<v_uint16>::vlanes()));
+                s3 = v_add(vx_load_expand(S0 + 3 * VTraits<v_uint16>::vlanes()), vx_load_expand(S1 + 3 * VTraits<v_uint16>::vlanes()));
+                s4 = v_add(vx_load_expand(S0 + 4 * VTraits<v_uint16>::vlanes()), vx_load_expand(S1 + 4 * VTraits<v_uint16>::vlanes()));
+                s5 = v_add(vx_load_expand(S0 + 5 * VTraits<v_uint16>::vlanes()), vx_load_expand(S1 + 5 * VTraits<v_uint16>::vlanes()));
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                 v_uint16 bl, gl, rl;
 #if CV_SIMD_WIDTH == 16
-                bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
+                bl = v_add(t0, t3); gl = v_add(t1, t4); rl = v_add(t2, t5);
 #elif CV_SIMD_WIDTH == 32
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
-                bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
+                bl = v_add(s0, s3); gl = v_add(s1, s4); rl = v_add(s2, s5);
 #elif CV_SIMD_WIDTH == 64
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                 bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
 #endif
-                s0 = vx_load_expand(S0 + 6*v_uint16::nlanes) + vx_load_expand(S1 + 6*v_uint16::nlanes);
-                s1 = vx_load_expand(S0 + 7*v_uint16::nlanes) + vx_load_expand(S1 + 7*v_uint16::nlanes);
-                s2 = vx_load_expand(S0 + 8*v_uint16::nlanes) + vx_load_expand(S1 + 8*v_uint16::nlanes);
-                s3 = vx_load_expand(S0 + 9*v_uint16::nlanes) + vx_load_expand(S1 + 9*v_uint16::nlanes);
-                s4 = vx_load_expand(S0 +10*v_uint16::nlanes) + vx_load_expand(S1 +10*v_uint16::nlanes);
-                s5 = vx_load_expand(S0 +11*v_uint16::nlanes) + vx_load_expand(S1 +11*v_uint16::nlanes);
+                s0 = v_add(vx_load_expand(S0 + 6 * VTraits<v_uint16>::vlanes()), vx_load_expand(S1 + 6 * VTraits<v_uint16>::vlanes()));
+                s1 = v_add(vx_load_expand(S0 + 7 * VTraits<v_uint16>::vlanes()), vx_load_expand(S1 + 7 * VTraits<v_uint16>::vlanes()));
+                s2 = v_add(vx_load_expand(S0 + 8 * VTraits<v_uint16>::vlanes()), vx_load_expand(S1 + 8 * VTraits<v_uint16>::vlanes()));
+                s3 = v_add(vx_load_expand(S0 + 9 * VTraits<v_uint16>::vlanes()), vx_load_expand(S1 + 9 * VTraits<v_uint16>::vlanes()));
+                s4 = v_add(vx_load_expand(S0 + 10 * VTraits<v_uint16>::vlanes()), vx_load_expand(S1 + 10 * VTraits<v_uint16>::vlanes()));
+                s5 = v_add(vx_load_expand(S0 + 11 * VTraits<v_uint16>::vlanes()), vx_load_expand(S1 + 11 * VTraits<v_uint16>::vlanes()));
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                 v_uint16 bh, gh, rh;
 #if CV_SIMD_WIDTH == 16
-                bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
+                bh = v_add(t0, t3); gh = v_add(t1, t4); rh = v_add(t2, t5);
 #elif CV_SIMD_WIDTH == 32
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
-                bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
+                bh = v_add(s0, s3); gh = v_add(s1, s4); rh = v_add(s2, s5);
 #elif CV_SIMD_WIDTH == 64
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
@@ -2513,7 +2508,7 @@ class ResizeAreaFastVec_SIMD_8u
         else
         {
             CV_Assert(cn == 4);
-            for ( ; dx <= w - v_uint8::nlanes; dx += v_uint8::nlanes, S0 += 2*v_uint8::nlanes, S1 += 2*v_uint8::nlanes, D += v_uint8::nlanes)
+            for ( ; dx <= w - VTraits<v_uint8>::vlanes(); dx += VTraits<v_uint8>::vlanes(), S0 += 2*VTraits<v_uint8>::vlanes(), S1 += 2*VTraits<v_uint8>::vlanes(), D += VTraits<v_uint8>::vlanes())
             {
                 v_uint32 r00, r01, r10, r11;
                 v_load_deinterleave((uint32_t*)S0, r00, r01);
@@ -2524,7 +2519,7 @@ class ResizeAreaFastVec_SIMD_8u
                 v_expand(v_reinterpret_as_u8(r01), r01l, r01h);
                 v_expand(v_reinterpret_as_u8(r10), r10l, r10h);
                 v_expand(v_reinterpret_as_u8(r11), r11l, r11h);
-                v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
+                v_store(D, v_rshr_pack<2>(v_add(v_add(v_add(r00l, r01l), r10l), r11l), v_add(v_add(v_add(r00h, r01h), r10h), r11h)));
             }
         }
 
@@ -2551,11 +2546,11 @@ class ResizeAreaFastVec_SIMD_16u
         if (cn == 1)
         {
             v_uint32 masklow = vx_setall_u32(0x0000ffff);
-            for (; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes)
+            for (; dx <= w - VTraits<v_uint32>::vlanes(); dx += VTraits<v_uint32>::vlanes(), S0 += VTraits<v_uint16>::vlanes(), S1 += VTraits<v_uint16>::vlanes(), D += VTraits<v_uint32>::vlanes())
             {
                 v_uint32 r0 = v_reinterpret_as_u32(vx_load(S0));
                 v_uint32 r1 = v_reinterpret_as_u32(vx_load(S1));
-                v_rshr_pack_store<2>(D, (r0 >> 16) + (r0 & masklow) + (r1 >> 16) + (r1 & masklow));
+                v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_shr<16>(r0), v_and(r0, masklow)), v_shr<16>(r1)), v_and(r1, masklow)));
             }
         }
         else if (cn == 3)
@@ -2571,41 +2566,41 @@ class ResizeAreaFastVec_SIMD_16u
                 v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0));
             }
 #else
-                v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
+                v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3)));
 #endif
 #elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
-            for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes)
+            for ( ; dx <= w - 3*VTraits<v_uint16>::vlanes(); dx += 3*VTraits<v_uint16>::vlanes(), S0 += 6*VTraits<v_uint16>::vlanes(), S1 += 6*VTraits<v_uint16>::vlanes(), D += 3*VTraits<v_uint16>::vlanes())
             {
                 v_uint32 t0, t1, t2, t3, t4, t5;
                 v_uint32 s0, s1, s2, s3, s4, s5;
-                s0 = vx_load_expand(S0                     ) + vx_load_expand(S1                     );
-                s1 = vx_load_expand(S0 +   v_uint32::nlanes) + vx_load_expand(S1 +   v_uint32::nlanes);
-                s2 = vx_load_expand(S0 + 2*v_uint32::nlanes) + vx_load_expand(S1 + 2*v_uint32::nlanes);
-                s3 = vx_load_expand(S0 + 3*v_uint32::nlanes) + vx_load_expand(S1 + 3*v_uint32::nlanes);
-                s4 = vx_load_expand(S0 + 4*v_uint32::nlanes) + vx_load_expand(S1 + 4*v_uint32::nlanes);
-                s5 = vx_load_expand(S0 + 5*v_uint32::nlanes) + vx_load_expand(S1 + 5*v_uint32::nlanes);
+                s0 = v_add(vx_load_expand(S0), vx_load_expand(S1));
+                s1 = v_add(vx_load_expand(S0 + VTraits<v_uint32>::vlanes()), vx_load_expand(S1 + VTraits<v_uint32>::vlanes()));
+                s2 = v_add(vx_load_expand(S0 + 2 * VTraits<v_uint32>::vlanes()), vx_load_expand(S1 + 2 * VTraits<v_uint32>::vlanes()));
+                s3 = v_add(vx_load_expand(S0 + 3 * VTraits<v_uint32>::vlanes()), vx_load_expand(S1 + 3 * VTraits<v_uint32>::vlanes()));
+                s4 = v_add(vx_load_expand(S0 + 4 * VTraits<v_uint32>::vlanes()), vx_load_expand(S1 + 4 * VTraits<v_uint32>::vlanes()));
+                s5 = v_add(vx_load_expand(S0 + 5 * VTraits<v_uint32>::vlanes()), vx_load_expand(S1 + 5 * VTraits<v_uint32>::vlanes()));
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                 v_uint32 bl, gl, rl;
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
 #if CV_SIMD_WIDTH == 32
-                bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
+                bl = v_add(t0, t3); gl = v_add(t1, t4); rl = v_add(t2, t5);
 #else //CV_SIMD_WIDTH == 64
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                 bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
 #endif
-                s0 = vx_load_expand(S0 + 6*v_uint32::nlanes) + vx_load_expand(S1 + 6*v_uint32::nlanes);
-                s1 = vx_load_expand(S0 + 7*v_uint32::nlanes) + vx_load_expand(S1 + 7*v_uint32::nlanes);
-                s2 = vx_load_expand(S0 + 8*v_uint32::nlanes) + vx_load_expand(S1 + 8*v_uint32::nlanes);
-                s3 = vx_load_expand(S0 + 9*v_uint32::nlanes) + vx_load_expand(S1 + 9*v_uint32::nlanes);
-                s4 = vx_load_expand(S0 +10*v_uint32::nlanes) + vx_load_expand(S1 +10*v_uint32::nlanes);
-                s5 = vx_load_expand(S0 +11*v_uint32::nlanes) + vx_load_expand(S1 +11*v_uint32::nlanes);
+                s0 = v_add(vx_load_expand(S0 + 6 * VTraits<v_uint32>::vlanes()), vx_load_expand(S1 + 6 * VTraits<v_uint32>::vlanes()));
+                s1 = v_add(vx_load_expand(S0 + 7 * VTraits<v_uint32>::vlanes()), vx_load_expand(S1 + 7 * VTraits<v_uint32>::vlanes()));
+                s2 = v_add(vx_load_expand(S0 + 8 * VTraits<v_uint32>::vlanes()), vx_load_expand(S1 + 8 * VTraits<v_uint32>::vlanes()));
+                s3 = v_add(vx_load_expand(S0 + 9 * VTraits<v_uint32>::vlanes()), vx_load_expand(S1 + 9 * VTraits<v_uint32>::vlanes()));
+                s4 = v_add(vx_load_expand(S0 + 10 * VTraits<v_uint32>::vlanes()), vx_load_expand(S1 + 10 * VTraits<v_uint32>::vlanes()));
+                s5 = v_add(vx_load_expand(S0 + 11 * VTraits<v_uint32>::vlanes()), vx_load_expand(S1 + 11 * VTraits<v_uint32>::vlanes()));
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                 v_uint32 bh, gh, rh;
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
 #if CV_SIMD_WIDTH == 32
-                bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
+                bh = v_add(t0, t3); gh = v_add(t1, t4); rh = v_add(t2, t5);
 #else //CV_SIMD_WIDTH == 64
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                 bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
@@ -2614,7 +2609,7 @@ class ResizeAreaFastVec_SIMD_16u
             }
 #elif CV_SIMD_WIDTH >= 64
             v_uint32 masklow = vx_setall_u32(0x0000ffff);
-            for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes)
+            for ( ; dx <= w - 3*VTraits<v_uint16>::vlanes(); dx += 3*VTraits<v_uint16>::vlanes(), S0 += 6*VTraits<v_uint16>::vlanes(), S1 += 6*VTraits<v_uint16>::vlanes(), D += 3*VTraits<v_uint16>::vlanes())
             {
                 v_uint16 b0, g0, r0, b1, g1, r1;
                 v_load_deinterleave(S0, b0, g0, r0);
@@ -2622,8 +2617,8 @@ class ResizeAreaFastVec_SIMD_16u
                 v_uint32 bl = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
                 v_uint32 gl = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
                 v_uint32 rl = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
-                v_load_deinterleave(S0 + 3*v_uint16::nlanes, b0, g0, r0);
-                v_load_deinterleave(S1 + 3*v_uint16::nlanes, b1, g1, r1);
+                v_load_deinterleave(S0 + 3*VTraits<v_uint16>::vlanes(), b0, g0, r0);
+                v_load_deinterleave(S1 + 3*VTraits<v_uint16>::vlanes(), b1, g1, r1);
                 v_uint32 bh = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow);
                 v_uint32 gh = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow);
                 v_uint32 rh = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow);
@@ -2635,7 +2630,7 @@ class ResizeAreaFastVec_SIMD_16u
         {
             CV_Assert(cn == 4);
 #if CV_SIMD_WIDTH >= 64
-            for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes)
+            for ( ; dx <= w - VTraits<v_uint16>::vlanes(); dx += VTraits<v_uint16>::vlanes(), S0 += 2*VTraits<v_uint16>::vlanes(), S1 += 2*VTraits<v_uint16>::vlanes(), D += VTraits<v_uint16>::vlanes())
             {
                 v_uint64 r00, r01, r10, r11;
                 v_load_deinterleave((uint64_t*)S0, r00, r01);
@@ -2649,19 +2644,19 @@ class ResizeAreaFastVec_SIMD_16u
                 v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
             }
 #else
-            for ( ; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes)
+            for ( ; dx <= w - VTraits<v_uint32>::vlanes(); dx += VTraits<v_uint32>::vlanes(), S0 += VTraits<v_uint16>::vlanes(), S1 += VTraits<v_uint16>::vlanes(), D += VTraits<v_uint32>::vlanes())
             {
                 v_uint32 r0, r1, r2, r3;
                 v_expand(vx_load(S0), r0, r1);
                 v_expand(vx_load(S1), r2, r3);
-                r0 += r2; r1 += r3;
+                r0 = v_add(r0, r2); r1 = v_add(r1, r3);
                 v_uint32 v_d;
 #if CV_SIMD_WIDTH == 16
-                v_d = r0 + r1;
+                v_d = v_add(r0, r1);
 #elif CV_SIMD_WIDTH == 32
                 v_uint32 t0, t1;
                 v_recombine(r0, r1, t0, t1);
-                v_d = t0 + t1;
+                v_d = v_add(t0, t1);
 #endif
                 v_rshr_pack_store<2>(D, v_d);
             }
@@ -2691,51 +2686,51 @@ class ResizeAreaFastVec_SIMD_16s
         if (cn == 1)
         {
             v_int32 masklow = vx_setall_s32(0x0000ffff);
-            for (; dx <= w - v_int32::nlanes; dx += v_int32::nlanes, S0 += v_int16::nlanes, S1 += v_int16::nlanes, D += v_int32::nlanes)
+            for (; dx <= w - VTraits<v_int32>::vlanes(); dx += VTraits<v_int32>::vlanes(), S0 += VTraits<v_int16>::vlanes(), S1 += VTraits<v_int16>::vlanes(), D += VTraits<v_int32>::vlanes())
             {
                 v_int32 r0 = v_reinterpret_as_s32(vx_load(S0));
                 v_int32 r1 = v_reinterpret_as_s32(vx_load(S1));
-                v_rshr_pack_store<2>(D, (r0 >> 16) + (((r0 & masklow)<<16)>>16) + (r1 >> 16) + (((r1 & masklow)<<16)>>16));
+                v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_shr<16>(r0), v_shr<16>(v_shl<16>(v_and(r0, masklow)))), v_shr<16>(r1)), v_shr<16>(v_shl<16>(v_and(r1, masklow)))));
             }
         }
         else if (cn == 3)
         {
 #if CV_SIMD_WIDTH == 16
             for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
-                v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
+                v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3)));
 #elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
-            for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
+            for ( ; dx <= w - 3*VTraits<v_int16>::vlanes(); dx += 3*VTraits<v_int16>::vlanes(), S0 += 6*VTraits<v_int16>::vlanes(), S1 += 6*VTraits<v_int16>::vlanes(), D += 3*VTraits<v_int16>::vlanes())
             {
                 v_int32 t0, t1, t2, t3, t4, t5;
                 v_int32 s0, s1, s2, s3, s4, s5;
-                s0 = vx_load_expand(S0                    ) + vx_load_expand(S1                    );
-                s1 = vx_load_expand(S0 +   v_int32::nlanes) + vx_load_expand(S1 +   v_int32::nlanes);
-                s2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes);
-                s3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes);
-                s4 = vx_load_expand(S0 + 4*v_int32::nlanes) + vx_load_expand(S1 + 4*v_int32::nlanes);
-                s5 = vx_load_expand(S0 + 5*v_int32::nlanes) + vx_load_expand(S1 + 5*v_int32::nlanes);
+                s0 = v_add(vx_load_expand(S0), vx_load_expand(S1));
+                s1 = v_add(vx_load_expand(S0 + VTraits<v_int32>::vlanes()), vx_load_expand(S1 + VTraits<v_int32>::vlanes()));
+                s2 = v_add(vx_load_expand(S0 + 2 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 2 * VTraits<v_int32>::vlanes()));
+                s3 = v_add(vx_load_expand(S0 + 3 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 3 * VTraits<v_int32>::vlanes()));
+                s4 = v_add(vx_load_expand(S0 + 4 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 4 * VTraits<v_int32>::vlanes()));
+                s5 = v_add(vx_load_expand(S0 + 5 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 5 * VTraits<v_int32>::vlanes()));
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                 v_int32 bl, gl, rl;
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
 #if CV_SIMD_WIDTH == 32
-                bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
+                bl = v_add(t0, t3); gl = v_add(t1, t4); rl = v_add(t2, t5);
 #else //CV_SIMD_WIDTH == 64
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                 bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
 #endif
-                s0 = vx_load_expand(S0 + 6*v_int32::nlanes) + vx_load_expand(S1 + 6*v_int32::nlanes);
-                s1 = vx_load_expand(S0 + 7*v_int32::nlanes) + vx_load_expand(S1 + 7*v_int32::nlanes);
-                s2 = vx_load_expand(S0 + 8*v_int32::nlanes) + vx_load_expand(S1 + 8*v_int32::nlanes);
-                s3 = vx_load_expand(S0 + 9*v_int32::nlanes) + vx_load_expand(S1 + 9*v_int32::nlanes);
-                s4 = vx_load_expand(S0 +10*v_int32::nlanes) + vx_load_expand(S1 +10*v_int32::nlanes);
-                s5 = vx_load_expand(S0 +11*v_int32::nlanes) + vx_load_expand(S1 +11*v_int32::nlanes);
+                s0 = v_add(vx_load_expand(S0 + 6 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 6 * VTraits<v_int32>::vlanes()));
+                s1 = v_add(vx_load_expand(S0 + 7 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 7 * VTraits<v_int32>::vlanes()));
+                s2 = v_add(vx_load_expand(S0 + 8 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 8 * VTraits<v_int32>::vlanes()));
+                s3 = v_add(vx_load_expand(S0 + 9 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 9 * VTraits<v_int32>::vlanes()));
+                s4 = v_add(vx_load_expand(S0 + 10 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 10 * VTraits<v_int32>::vlanes()));
+                s5 = v_add(vx_load_expand(S0 + 11 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 11 * VTraits<v_int32>::vlanes()));
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                 v_int32 bh, gh, rh;
                 v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
 #if CV_SIMD_WIDTH == 32
-                bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
+                bh = v_add(t0, t3); gh = v_add(t1, t4); rh = v_add(t2, t5);
 #else //CV_SIMD_WIDTH == 64
                 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                 bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
@@ -2743,7 +2738,7 @@ class ResizeAreaFastVec_SIMD_16s
                 v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
             }
 #elif CV_SIMD_WIDTH >= 64
-            for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
+            for ( ; dx <= w - 3*VTraits<v_int16>::vlanes(); dx += 3*VTraits<v_int16>::vlanes(), S0 += 6*VTraits<v_int16>::vlanes(), S1 += 6*VTraits<v_int16>::vlanes(), D += 3*VTraits<v_int16>::vlanes())
             {
                 v_int16 b0, g0, r0, b1, g1, r1;
                 v_load_deinterleave(S0, b0, g0, r0);
@@ -2751,8 +2746,8 @@ class ResizeAreaFastVec_SIMD_16s
                 v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
                 v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
                 v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
-                v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0);
-                v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1);
+                v_load_deinterleave(S0 + 3*VTraits<v_int16>::vlanes(), b0, g0, r0);
+                v_load_deinterleave(S1 + 3*VTraits<v_int16>::vlanes(), b1, g1, r1);
                 v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
                 v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
                 v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
@@ -2763,7 +2758,7 @@ class ResizeAreaFastVec_SIMD_16s
         else
         {
             CV_Assert(cn == 4);
-            for (; dx <= w - v_int16::nlanes; dx += v_int16::nlanes, S0 += 2 * v_int16::nlanes, S1 += 2 * v_int16::nlanes, D += v_int16::nlanes)
+            for (; dx <= w - VTraits<v_int16>::vlanes(); dx += VTraits<v_int16>::vlanes(), S0 += 2 * VTraits<v_int16>::vlanes(), S1 += 2 * VTraits<v_int16>::vlanes(), D += VTraits<v_int16>::vlanes())
             {
 #if CV_SIMD_WIDTH >= 64
                 v_int64 r00, r01, r10, r11;
@@ -2778,17 +2773,17 @@ class ResizeAreaFastVec_SIMD_16s
                 v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
 #else
                 v_int32 r0, r1, r2, r3;
-                r0 = vx_load_expand(S0                    ) + vx_load_expand(S1                    );
-                r1 = vx_load_expand(S0 +   v_int32::nlanes) + vx_load_expand(S1 +   v_int32::nlanes);
-                r2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes);
-                r3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes);
+                r0 = v_add(vx_load_expand(S0), vx_load_expand(S1));
+                r1 = v_add(vx_load_expand(S0 + VTraits<v_int32>::vlanes()), vx_load_expand(S1 + VTraits<v_int32>::vlanes()));
+                r2 = v_add(vx_load_expand(S0 + 2 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 2 * VTraits<v_int32>::vlanes()));
+                r3 = v_add(vx_load_expand(S0 + 3 * VTraits<v_int32>::vlanes()), vx_load_expand(S1 + 3 * VTraits<v_int32>::vlanes()));
                 v_int32 dl, dh;
 #if CV_SIMD_WIDTH == 16
-                dl = r0 + r1; dh = r2 + r3;
+                dl = v_add(r0, r1); dh = v_add(r2, r3);
 #elif CV_SIMD_WIDTH == 32
                 v_int32 t0, t1, t2, t3;
                 v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3);
-                dl = t0 + t1; dh = t2 + t3;
+                dl = v_add(t0, t1); dh = v_add(t2, t3);
 #endif
                 v_store(D, v_rshr_pack<2>(dl, dh));
 #endif
@@ -2822,27 +2817,27 @@ struct ResizeAreaFastVec_SIMD_32f
         if (cn == 1)
         {
             v_float32 v_025 = vx_setall_f32(0.25f);
-            for ( ; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
+            for ( ; dx <= w - VTraits<v_float32>::vlanes(); dx += VTraits<v_float32>::vlanes(), S0 += 2*VTraits<v_float32>::vlanes(), S1 += 2*VTraits<v_float32>::vlanes(), D += VTraits<v_float32>::vlanes())
             {
                 v_float32 v_row00, v_row01, v_row10, v_row11;
                 v_load_deinterleave(S0, v_row00, v_row01);
                 v_load_deinterleave(S1, v_row10, v_row11);
-                v_store(D, ((v_row00 + v_row01) + (v_row10 + v_row11)) * v_025);
+                v_store(D, v_mul(v_add(v_add(v_row00, v_row01), v_add(v_row10, v_row11)), v_025));
             }
         }
         else if (cn == 4)
         {
 #if CV_SIMD_WIDTH == 16
             v_float32 v_025 = vx_setall_f32(0.25f);
-            for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
-                v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025);
+            for (; dx <= w - VTraits<v_float32>::vlanes(); dx += VTraits<v_float32>::vlanes(), S0 += 2*VTraits<v_float32>::vlanes(), S1 += 2*VTraits<v_float32>::vlanes(), D += VTraits<v_float32>::vlanes())
+                v_store(D, v_mul(v_add(v_add(vx_load(S0), vx_load(S0 + VTraits<v_float32>::vlanes())), v_add(vx_load(S1), vx_load(S1 + VTraits<v_float32>::vlanes()))), v_025));
 #elif CV_SIMD256
             v_float32x8 v_025 = v256_setall_f32(0.25f);
-            for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes)
+            for (; dx <= w - VTraits<v_float32x8>::vlanes(); dx += VTraits<v_float32x8>::vlanes(), S0 += 2*VTraits<v_float32x8>::vlanes(), S1 += 2*VTraits<v_float32x8>::vlanes(), D += VTraits<v_float32x8>::vlanes())
             {
                 v_float32x8 dst0, dst1;
-                v_recombine(v256_load(S0) + v256_load(S1), v256_load(S0 + v_float32x8::nlanes) + v256_load(S1 + v_float32x8::nlanes), dst0, dst1);
-                v_store(D, (dst0 + dst1) * v_025);
+                v_recombine(v_add(v256_load(S0), v256_load(S1)), v_add(v256_load(S0 + VTraits<v_float32x8>::vlanes()), v256_load(S1 + VTraits<v_float32x8>::vlanes())), dst0, dst1);
+                v_store(D, v_mul(v_add(dst0, dst1), v_025));
             }
 #endif
         }
@@ -3024,6 +3019,111 @@ struct DecimateAlpha
 };
 
 
+namespace inter_area {
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+inline void saturate_store(const float* src, uchar* dst) {
+    const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp2 = v_round(vx_load(src + 2 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp3 = v_round(vx_load(src + 3 * VTraits<v_float32>::vlanes()));
+    v_store(dst, v_pack(v_pack_u(tmp0, tmp1), v_pack_u(tmp2, tmp3)));
+}
+
+inline void saturate_store(const float* src, ushort* dst) {
+    const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
+    v_store(dst, v_pack_u(tmp0, tmp1));
+}
+
+inline void saturate_store(const float* src, short* dst) {
+    const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
+    v_store(dst, v_pack(tmp0, tmp1));
+}
+
+static inline v_float32 vx_setall(float coeff) { return vx_setall_f32(coeff); }
+
+template <typename T>
+struct VArea {};
+
+template <>
+struct VArea<float> {
+    typedef v_float32 vWT;
+};
+#endif
+
+#if (CV_SIMD128_64F || CV_SIMD_SCALABLE_64F)
+static inline v_float64 vx_setall(double coeff) { return vx_setall_f64(coeff); }
+
+template <>
+struct VArea<double> {
+    typedef v_float64 vWT;
+};
+
+#else
+inline void mul(const double* buf, int width, double beta, double* sum) {
+    for (int dx = 0; dx < width; ++dx) {
+        sum[dx] = beta * buf[dx];
+    }
+}
+
+inline void muladd(const double* buf, int width, double beta, double* sum) {
+    for (int dx = 0; dx < width; ++dx) {
+        sum[dx] += beta * buf[dx];
+    }
+}
+#endif
+
+template <typename T, typename WT>
+inline void saturate_store(const WT* sum, int width, T* D) {
+    int dx = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int step = VTraits<typename VArea<WT>::vWT>::vlanes() * sizeof(WT) / sizeof(T);
+    for (; dx + step < width; dx += step) {
+        saturate_store(sum + dx, D + dx);
+    }
+#endif
+    for (; dx < width; ++dx) {
+        D[dx] = saturate_cast<T>(sum[dx]);
+    }
+}
+
+// Optimization when T == WT.
+template <typename WT>
+inline void saturate_store(const WT* sum, int width, WT* D) {
+    std::copy(sum, sum + width, D);
+}
+
+template <typename WT>
+inline void mul(const WT* buf, int width, WT beta, WT* sum) {
+    int dx = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int step = VTraits<typename VArea<WT>::vWT>::vlanes();
+    for (; dx + step < width; dx += step) {
+        vx_store(sum + dx, v_mul(vx_setall(beta), vx_load(buf + dx)));
+    }
+#endif
+    for (; dx < width; ++dx) {
+        sum[dx] = beta * buf[dx];
+    }
+}
+
+template <typename WT>
+inline void muladd(const WT* buf, int width, WT beta, WT* sum) {
+    int dx = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int step = VTraits<typename VArea<WT>::vWT>::vlanes();
+    for (; dx + step < width; dx += step) {
+        vx_store(sum + dx, v_add(vx_load(sum + dx), v_mul(vx_setall(beta), vx_load(buf + dx))));
+    }
+#endif
+    for (; dx < width; ++dx) {
+        sum[dx] += beta * buf[dx];
+    }
+}
+
+}  // namespace inter_area
+
 template<typename T, typename WT> class ResizeArea_Invoker :
     public ParallelLoopBody
 {
@@ -3125,27 +3225,17 @@ template<typename T, typename WT> class ResizeArea_Invoker :
 
             if( dy != prev_dy )
             {
-                T* D = dst->template ptr<T>(prev_dy);
-
-                for( dx = 0; dx < dsize.width; dx++ )
-                {
-                    D[dx] = saturate_cast<T>(sum[dx]);
-                    sum[dx] = beta*buf[dx];
-                }
+                inter_area::saturate_store(sum, dsize.width, dst->template ptr<T>(prev_dy));
+                inter_area::mul(buf, dsize.width, beta, sum);
                 prev_dy = dy;
             }
             else
             {
-                for( dx = 0; dx < dsize.width; dx++ )
-                    sum[dx] += beta*buf[dx];
+                inter_area::muladd(buf, dsize.width, beta, sum);
             }
         }
 
-        {
-        T* D = dst->template ptr<T>(prev_dy);
-        for( dx = 0; dx < dsize.width; dx++ )
-            D[dx] = saturate_cast<T>(sum[dx]);
-        }
+        inter_area::saturate_store(sum, dsize.width, dst->template ptr<T>(prev_dy));
     }
 
 private:
@@ -3306,8 +3396,8 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
     {
         int wdepth = std::max(depth, CV_32S);
         char buf[2][50];
-        cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
-                        "-D convertToDT=%s -D cn=%d",
+        cv::String compileOpts = format("-D USE_SAMPLER -D SRC_DEPTH=%d -D T=%s -D T1=%s "
+                        "-D CONVERT_TO_DT=%s -D CN=%d",
                         depth, ocl::typeToStr(type), ocl::typeToStr(depth),
                         ocl::convertTypeStr(wdepth, depth, cn, buf[1], sizeof(buf[1])),
                         cn);
@@ -3371,8 +3461,8 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
             Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, _buffer.data()).copyTo(coeffs);
 
             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
-                     format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
-                            "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
+                     format("-D INTER_LINEAR_INTEGER -D SRC_DEPTH=%d -D T=%s -D T1=%s "
+                            "-D WT=%s -D CONVERT_TO_WT=%s -D CONVERT_TO_DT=%s -D CN=%d "
                             "-D INTER_RESIZE_COEF_BITS=%d",
                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
                             ocl::convertTypeStr(depth, wdepth, cn, buf[0], sizeof(buf[0])),
@@ -3389,8 +3479,8 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
             int wdepth = depth <= CV_8S ? CV_32S : std::max(depth, CV_32F);
             int wtype = CV_MAKETYPE(wdepth, cn);
             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
-                     format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
-                            "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
+                     format("-D INTER_LINEAR -D SRC_DEPTH=%d -D T=%s -D T1=%s "
+                            "-D WT=%s -D CONVERT_TO_WT=%s -D CONVERT_TO_DT=%s -D CN=%d "
                             "-D INTER_RESIZE_COEF_BITS=%d",
                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
                             ocl::convertTypeStr(depth, wdepth, cn, buf[0], sizeof(buf[0])),
@@ -3406,7 +3496,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
     else if (interpolation == INTER_NEAREST)
     {
         k.create("resizeNN", ocl::imgproc::resize_oclsrc,
-                 format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
+                 format("-D INTER_NEAREST -D T=%s -D T1=%s -D CN=%d",
                         ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
         if (k.empty())
             return false;
@@ -3420,7 +3510,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         int wtype = CV_MAKE_TYPE(wdepth, cn);
 
         char cvt[2][50];
-        String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
+        String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D CONVERT_TO_WTV=%s -D CN=%d",
                                     ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
                                     ocl::convertTypeStr(depth, wdepth, cn, cvt[0], sizeof(cvt[0])), cn);
 
@@ -3430,7 +3520,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         if (is_area_fast)
         {
             int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
-            buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
+            buildOption = buildOption + format(" -D CONVERT_TO_T=%s -D WT2V=%s -D CONVERT_TO_WT2V=%s -D INTER_AREA_FAST"
                                                 " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
                                                 ocl::convertTypeStr(wdepth2, depth, cn, cvt[0], sizeof(cvt[0])),
                                                 ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1], sizeof(cvt[1])),
@@ -3442,7 +3532,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         }
         else
         {
-            buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0], sizeof(cvt[0])));
+            buildOption = buildOption + format(" -D CONVERT_TO_T=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0], sizeof(cvt[0])));
             k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
             if (k.empty())
                 return false;
@@ -3934,7 +4024,7 @@ void resize(int src_type,
     else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
         ksize = 2, func = linear_tab[depth];
     else
-        CV_Error( CV_StsBadArg, "Unknown interpolation method" );
+        CV_Error( cv::Error::StsBadArg, "Unknown interpolation method" );
     ksize2 = ksize/2;
 
     CV_Assert( func != 0 );
diff --git a/modules/imgproc/src/rotcalipers.cpp b/modules/imgproc/src/rotcalipers.cpp
index e3d81c7e0c0f..3bec592c9be4 100644
--- a/modules/imgproc/src/rotcalipers.cpp
+++ b/modules/imgproc/src/rotcalipers.cpp
@@ -245,7 +245,7 @@ static void rotatingCalipers( const Point2f* points, int n, int mode, float* out
                 base_b = lead_x;
                 break;
             default:
-                CV_Error(CV_StsError, "main_element should be 0, 1, 2 or 3");
+                CV_Error(cv::Error::StsError, "main_element should be 0, 1, 2 or 3");
             }
         }
         /* change base point of main edge */
diff --git a/modules/imgproc/src/samplers.cpp b/modules/imgproc/src/samplers.cpp
index 287e78c6df8b..5fb30c943441 100644
--- a/modules/imgproc/src/samplers.cpp
+++ b/modules/imgproc/src/samplers.cpp
@@ -417,7 +417,7 @@ void cv::getRectSubPix( InputArray _image, Size patchSize, Point2f center,
         getRectSubPix_Cn_<float, float, float, nop<float>, nop<float> >
         (image.ptr<float>(), image.step, image.size(), patch.ptr<float>(), patch.step, patch.size(), center, cn);
     else
-        CV_Error( CV_StsUnsupportedFormat, "Unsupported combination of input and output formats");
+        CV_Error( cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output formats");
 }
 
 
@@ -473,7 +473,7 @@ cvSampleLine( const void* _img, CvPoint pt1, CvPoint pt2,
     size_t pixsize = img.elemSize();
 
     if( !buffer )
-        CV_Error( CV_StsNullPtr, "" );
+        CV_Error( cv::Error::StsNullPtr, "" );
 
     for( int i = 0; i < li.count; i++, ++li )
     {
diff --git a/modules/imgproc/src/segmentation.cpp b/modules/imgproc/src/segmentation.cpp
index f4145c6cfac6..c6dad4ba5749 100644
--- a/modules/imgproc/src/segmentation.cpp
+++ b/modules/imgproc/src/segmentation.cpp
@@ -348,7 +348,7 @@ void cv::pyrMeanShiftFiltering( InputArray _src, OutputArray _dst,
     const int MAX_LEVELS = 8;
 
     if( (unsigned)max_level > (unsigned)MAX_LEVELS )
-        CV_Error( CV_StsOutOfRange, "The number of pyramid levels is too large or negative" );
+        CV_Error( cv::Error::StsOutOfRange, "The number of pyramid levels is too large or negative" );
 
     std::vector<cv::Mat> src_pyramid(max_level+1);
     std::vector<cv::Mat> dst_pyramid(max_level+1);
@@ -365,19 +365,19 @@ void cv::pyrMeanShiftFiltering( InputArray _src, OutputArray _dst,
 
 
     if( src0.type() != CV_8UC3 )
-        CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 3-channel images are supported" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "Only 8-bit, 3-channel images are supported" );
 
     if( src0.type() != dst0.type() )
-        CV_Error( CV_StsUnmatchedFormats, "The input and output images must have the same type" );
+        CV_Error( cv::Error::StsUnmatchedFormats, "The input and output images must have the same type" );
 
     if( src0.size() != dst0.size() )
-        CV_Error( CV_StsUnmatchedSizes, "The input and output images must have the same size" );
+        CV_Error( cv::Error::StsUnmatchedSizes, "The input and output images must have the same size" );
 
-    if( !(termcrit.type & CV_TERMCRIT_ITER) )
+    if( !(termcrit.type & TermCriteria::MAX_ITER) )
         termcrit.maxCount = 5;
     termcrit.maxCount = MAX(termcrit.maxCount,1);
     termcrit.maxCount = MIN(termcrit.maxCount,100);
-    if( !(termcrit.type & CV_TERMCRIT_EPS) )
+    if( !(termcrit.type & TermCriteria::EPS) )
         termcrit.epsilon = 1.f;
     termcrit.epsilon = MAX(termcrit.epsilon, 0.f);
 
diff --git a/modules/imgproc/src/shapedescr.cpp b/modules/imgproc/src/shapedescr.cpp
index 4c73910e277a..007bf9ac62b1 100644
--- a/modules/imgproc/src/shapedescr.cpp
+++ b/modules/imgproc/src/shapedescr.cpp
@@ -357,7 +357,7 @@ static RotatedRect fitEllipseNoDirect( InputArray _points )
     RotatedRect box;
 
     if( n < 5 )
-        CV_Error( CV_StsBadSize, "There should be at least 5 points to fit the ellipse" );
+        CV_Error( cv::Error::StsBadSize, "There should be at least 5 points to fit the ellipse" );
 
     // New fitellipse algorithm, contributed by Dr. Daniel Weiss
     Point2f c(0,0);
@@ -520,7 +520,7 @@ cv::RotatedRect cv::fitEllipseAMS( InputArray _points )
     RotatedRect box;
 
     if( n < 5 )
-        CV_Error( CV_StsBadSize, "There should be at least 5 points to fit the ellipse" );
+        CV_Error( cv::Error::StsBadSize, "There should be at least 5 points to fit the ellipse" );
 
     Point2f c(0,0);
 
@@ -705,7 +705,7 @@ cv::RotatedRect cv::fitEllipseDirect( InputArray _points )
     RotatedRect box;
 
     if( n < 5 )
-        CV_Error( CV_StsBadSize, "There should be at least 5 points to fit the ellipse" );
+        CV_Error( cv::Error::StsBadSize, "There should be at least 5 points to fit the ellipse" );
 
     Point2d c(0., 0.);
 
@@ -862,286 +862,6 @@ cv::RotatedRect cv::fitEllipseDirect( InputArray _points )
     return box;
 }
 
-
-namespace cv
-{
-
-// Calculates bounding rectangle of a point set or retrieves already calculated
-static Rect pointSetBoundingRect( const Mat& points )
-{
-    int npoints = points.checkVector(2);
-    int depth = points.depth();
-    CV_Assert(npoints >= 0 && (depth == CV_32F || depth == CV_32S));
-
-    int  xmin = 0, ymin = 0, xmax = -1, ymax = -1, i;
-    bool is_float = depth == CV_32F;
-
-    if( npoints == 0 )
-        return Rect();
-
-#if CV_SIMD
-    const int64_t* pts = points.ptr<int64_t>();
-
-    if( !is_float )
-    {
-        v_int32 minval, maxval;
-        minval = maxval = v_reinterpret_as_s32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-        for( i = 1; i <= npoints - v_int32::nlanes/2; i+= v_int32::nlanes/2 )
-        {
-            v_int32 ptXY2 = v_reinterpret_as_s32(vx_load(pts + i));
-            minval = v_min(ptXY2, minval);
-            maxval = v_max(ptXY2, maxval);
-        }
-        minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
-        maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
-        if( i <= npoints - v_int32::nlanes/4 )
-        {
-            v_int32 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
-            minval = v_min(ptXY, minval);
-            maxval = v_max(ptXY, maxval);
-            i += v_int64::nlanes/2;
-        }
-        for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
-        {
-            minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
-            maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
-        }
-        xmin = minval.get0();
-        xmax = maxval.get0();
-        ymin = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))).get0();
-        ymax = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))).get0();
-#if CV_SIMD_WIDTH > 16
-        if( i < npoints )
-        {
-            v_int32x4 minval2, maxval2;
-            minval2 = maxval2 = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i))));
-            for( i++; i < npoints; i++ )
-            {
-                v_int32x4 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i))));
-                minval2 = v_min(ptXY, minval2);
-                maxval2 = v_max(ptXY, maxval2);
-            }
-            xmin = min(xmin, minval2.get0());
-            xmax = max(xmax, maxval2.get0());
-            ymin = min(ymin, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2))).get0());
-            ymax = max(ymax, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0());
-        }
-#endif
-    }
-    else
-    {
-        v_float32 minval, maxval;
-        minval = maxval = v_reinterpret_as_f32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-        for( i = 1; i <= npoints - v_float32::nlanes/2; i+= v_float32::nlanes/2 )
-        {
-            v_float32 ptXY2 = v_reinterpret_as_f32(vx_load(pts + i));
-            minval = v_min(ptXY2, minval);
-            maxval = v_max(ptXY2, maxval);
-        }
-        minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
-        maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
-        if( i <= npoints - v_float32::nlanes/4 )
-        {
-            v_float32 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
-            minval = v_min(ptXY, minval);
-            maxval = v_max(ptXY, maxval);
-            i += v_float32::nlanes/4;
-        }
-        for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
-        {
-            minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
-            maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
-        }
-        xmin = cvFloor(minval.get0());
-        xmax = cvFloor(maxval.get0());
-        ymin = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))).get0());
-        ymax = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))).get0());
-#if CV_SIMD_WIDTH > 16
-        if( i < npoints )
-        {
-            v_float32x4 minval2, maxval2;
-            minval2 = maxval2 = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i))));
-            for( i++; i < npoints; i++ )
-            {
-                v_float32x4 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i))));
-                minval2 = v_min(ptXY, minval2);
-                maxval2 = v_max(ptXY, maxval2);
-            }
-            xmin = min(xmin, cvFloor(minval2.get0()));
-            xmax = max(xmax, cvFloor(maxval2.get0()));
-            ymin = min(ymin, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))).get0()));
-            ymax = max(ymax, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0()));
-        }
-#endif
-    }
-#else
-    const Point* pts = points.ptr<Point>();
-    Point pt = pts[0];
-
-    if( !is_float )
-    {
-        xmin = xmax = pt.x;
-        ymin = ymax = pt.y;
-
-        for( i = 1; i < npoints; i++ )
-        {
-            pt = pts[i];
-
-            if( xmin > pt.x )
-                xmin = pt.x;
-
-            if( xmax < pt.x )
-                xmax = pt.x;
-
-            if( ymin > pt.y )
-                ymin = pt.y;
-
-            if( ymax < pt.y )
-                ymax = pt.y;
-        }
-    }
-    else
-    {
-        Cv32suf v;
-        // init values
-        xmin = xmax = CV_TOGGLE_FLT(pt.x);
-        ymin = ymax = CV_TOGGLE_FLT(pt.y);
-
-        for( i = 1; i < npoints; i++ )
-        {
-            pt = pts[i];
-            pt.x = CV_TOGGLE_FLT(pt.x);
-            pt.y = CV_TOGGLE_FLT(pt.y);
-
-            if( xmin > pt.x )
-                xmin = pt.x;
-
-            if( xmax < pt.x )
-                xmax = pt.x;
-
-            if( ymin > pt.y )
-                ymin = pt.y;
-
-            if( ymax < pt.y )
-                ymax = pt.y;
-        }
-
-        v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f);
-        v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f);
-        // because right and bottom sides of the bounding rectangle are not inclusive
-        // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil
-        v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f);
-        v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f);
-    }
-#endif
-
-    return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
-}
-
-
-static Rect maskBoundingRect( const Mat& img )
-{
-    CV_Assert( img.depth() <= CV_8S && img.channels() == 1 );
-
-    Size size = img.size();
-    int xmin = size.width, ymin = -1, xmax = -1, ymax = -1, i, j, k;
-
-    for( i = 0; i < size.height; i++ )
-    {
-        const uchar* _ptr = img.ptr(i);
-        const uchar* ptr = (const uchar*)alignPtr(_ptr, 4);
-        int have_nz = 0, k_min, offset = (int)(ptr - _ptr);
-        j = 0;
-        offset = MIN(offset, size.width);
-        for( ; j < offset; j++ )
-            if( _ptr[j] )
-            {
-                have_nz = 1;
-                break;
-            }
-        if( j < offset )
-        {
-            if( j < xmin )
-                xmin = j;
-            if( j > xmax )
-                xmax = j;
-        }
-        if( offset < size.width )
-        {
-            xmin -= offset;
-            xmax -= offset;
-            size.width -= offset;
-            j = 0;
-            for( ; j <= xmin - 4; j += 4 )
-                if( *((int*)(ptr+j)) )
-                    break;
-            for( ; j < xmin; j++ )
-                if( ptr[j] )
-                {
-                    xmin = j;
-                    if( j > xmax )
-                        xmax = j;
-                    have_nz = 1;
-                    break;
-                }
-            k_min = MAX(j-1, xmax);
-            k = size.width - 1;
-            for( ; k > k_min && (k&3) != 3; k-- )
-                if( ptr[k] )
-                    break;
-            if( k > k_min && (k&3) == 3 )
-            {
-                for( ; k > k_min+3; k -= 4 )
-                    if( *((int*)(ptr+k-3)) )
-                        break;
-            }
-            for( ; k > k_min; k-- )
-                if( ptr[k] )
-                {
-                    xmax = k;
-                    have_nz = 1;
-                    break;
-                }
-            if( !have_nz )
-            {
-                j &= ~3;
-                for( ; j <= k - 3; j += 4 )
-                    if( *((int*)(ptr+j)) )
-                        break;
-                for( ; j <= k; j++ )
-                    if( ptr[j] )
-                    {
-                        have_nz = 1;
-                        break;
-                    }
-            }
-            xmin += offset;
-            xmax += offset;
-            size.width += offset;
-        }
-        if( have_nz )
-        {
-            if( ymin < 0 )
-                ymin = i;
-            ymax = i;
-        }
-    }
-
-    if( xmin >= size.width )
-        xmin = ymin = 0;
-    return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
-}
-
-}
-
-cv::Rect cv::boundingRect(InputArray array)
-{
-    CV_INSTRUMENT_REGION();
-
-    Mat m = array.getMat();
-    return m.depth() <= CV_8U ? maskBoundingRect(m) : pointSetBoundingRect(m);
-}
-
 ////////////////////////////////////////////// C API ///////////////////////////////////////////
 
 CV_IMPL int
@@ -1364,7 +1084,7 @@ cvContourArea( const void *array, CvSlice slice, int oriented )
     {
         contour = (CvSeq*)array;
         if( !CV_IS_SEQ_POLYLINE( contour ))
-            CV_Error( CV_StsBadArg, "Unsupported sequence type" );
+            CV_Error( cv::Error::StsBadArg, "Unsupported sequence type" );
     }
     else
     {
@@ -1379,7 +1099,7 @@ cvContourArea( const void *array, CvSlice slice, int oriented )
     }
 
     if( CV_SEQ_ELTYPE( contour ) != CV_32SC2 )
-        CV_Error( CV_StsUnsupportedFormat,
+        CV_Error( cv::Error::StsUnsupportedFormat,
         "Only curves with integer coordinates are supported in case of contour slice" );
     area = icvContourSecArea( contour, slice );
     return oriented ? area : fabs(area);
@@ -1405,7 +1125,7 @@ cvArcLength( const void *array, CvSlice slice, int is_closed )
     {
         contour = (CvSeq*)array;
         if( !CV_IS_SEQ_POLYLINE( contour ))
-            CV_Error( CV_StsBadArg, "Unsupported sequence type" );
+            CV_Error( cv::Error::StsBadArg, "Unsupported sequence type" );
         if( is_closed < 0 )
             is_closed = CV_IS_SEQ_CLOSED( contour );
     }
@@ -1482,62 +1202,4 @@ cvFitEllipse2( const CvArr* array )
     return cvBox2D(cv::fitEllipse(points));
 }
 
-/* Calculates bounding rectangle of a point set or retrieves already calculated */
-CV_IMPL  CvRect
-cvBoundingRect( CvArr* array, int update )
-{
-    cv::Rect rect;
-    CvContour contour_header;
-    CvSeq* ptseq = 0;
-    CvSeqBlock block;
-
-    CvMat stub, *mat = 0;
-    int calculate = update;
-
-    if( CV_IS_SEQ( array ))
-    {
-        ptseq = (CvSeq*)array;
-        if( !CV_IS_SEQ_POINT_SET( ptseq ))
-            CV_Error( CV_StsBadArg, "Unsupported sequence type" );
-
-        if( ptseq->header_size < (int)sizeof(CvContour))
-        {
-            update = 0;
-            calculate = 1;
-        }
-    }
-    else
-    {
-        mat = cvGetMat( array, &stub );
-        if( CV_MAT_TYPE(mat->type) == CV_32SC2 ||
-            CV_MAT_TYPE(mat->type) == CV_32FC2 )
-        {
-            ptseq = cvPointSeqFromMat(CV_SEQ_KIND_GENERIC, mat, &contour_header, &block);
-            mat = 0;
-        }
-        else if( CV_MAT_TYPE(mat->type) != CV_8UC1 &&
-                CV_MAT_TYPE(mat->type) != CV_8SC1 )
-            CV_Error( CV_StsUnsupportedFormat,
-                "The image/matrix format is not supported by the function" );
-        update = 0;
-        calculate = 1;
-    }
-
-    if( !calculate )
-        return ((CvContour*)ptseq)->rect;
-
-    if( mat )
-    {
-        rect = cvRect(cv::maskBoundingRect(cv::cvarrToMat(mat)));
-    }
-    else if( ptseq->total )
-    {
-        cv::AutoBuffer<double> abuf;
-        rect = cvRect(cv::pointSetBoundingRect(cv::cvarrToMat(ptseq, false, false, 0, &abuf)));
-    }
-    if( update )
-        ((CvContour*)ptseq)->rect = cvRect(rect);
-    return cvRect(rect);
-}
-
 /* End of file. */
diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp
index 8a521d6df389..d0f50a73bb7e 100644
--- a/modules/imgproc/src/smooth.dispatch.cpp
+++ b/modules/imgproc/src/smooth.dispatch.cpp
@@ -683,8 +683,22 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
 
             if (src.data == dst.data)
                 src = src.clone();
+
+            if ((sigma1 == 0.0) && (sigma2 == 0.0) && (ksize.height == ksize.width))
+            {
+                Point ofs;
+                Size wsz(src.cols, src.rows);
+                Mat src2 = src;
+                if(!(borderType & BORDER_ISOLATED))
+                    src2.locateROI( wsz, ofs );
+
+                CALL_HAL(gaussianBlurBinomial, cv_hal_gaussianBlurBinomial, src2.ptr(), src2.step, dst.ptr(), dst.step, src2.cols, src2.rows, sdepth, cn,
+                         ofs.x, ofs.y, wsz.width - src2.cols - ofs.x,  wsz.height - src2.rows - ofs.y, ksize.width, borderType&~BORDER_ISOLATED);
+            }
+
             CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint16_t*)&fkx[0], (int)fkx.size(), (const uint16_t*)&fky[0], (int)fky.size(), borderType),
                 CV_CPU_DISPATCH_MODES_ALL);
+
             return;
         }
     }
@@ -720,8 +734,22 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
 
             if (src.data == dst.data)
                 src = src.clone();
+
+            if ((sigma1 == 0.0) && (sigma2 == 0.0) && (ksize.height == ksize.width))
+            {
+                Point ofs;
+                Size wsz(src.cols, src.rows);
+                Mat src2 = src;
+                if(!(borderType & BORDER_ISOLATED))
+                    src2.locateROI( wsz, ofs );
+
+                CALL_HAL(gaussianBlurBinomial, cv_hal_gaussianBlurBinomial, src2.ptr(), src2.step, dst.ptr(), dst.step, src2.cols, src2.rows, sdepth, cn,
+                         ofs.x, ofs.y, wsz.width - src2.cols - ofs.x,  wsz.height - src2.rows - ofs.y, ksize.width, borderType&~BORDER_ISOLATED);
+            }
+
             CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint32_t*)&fkx[0], (int)fkx.size(), (const uint32_t*)&fky[0], (int)fky.size(), borderType),
                 CV_CPU_DISPATCH_MODES_ALL);
+
             return;
         }
     }
@@ -784,7 +812,7 @@ cvSmooth( const void* srcarr, void* dstarr, int smooth_type,
         cv::bilateralFilter( src, dst, param1, param3, param4, cv::BORDER_REPLICATE );
 
     if( dst.data != dst0.data )
-        CV_Error( CV_StsUnmatchedFormats, "The destination image does not have the proper type" );
+        CV_Error( cv::Error::StsUnmatchedFormats, "The destination image does not have the proper type" );
 }
 
 /* End of file. */
diff --git a/modules/imgproc/src/smooth.simd.hpp b/modules/imgproc/src/smooth.simd.hpp
index 62ff31ac940c..7389cdbce947 100644
--- a/modules/imgproc/src/smooth.simd.hpp
+++ b/modules/imgproc/src/smooth.simd.hpp
@@ -81,11 +81,11 @@ void hlineSmooth1N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
 {
     int lencn = len*cn;
     int i = 0;
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
-    v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint16>::vlanes();
+    v_uint16 vmul = vx_setall_u16(*((uint16_t*)m));
     for (; i <= lencn - VECSZ; i += VECSZ)
-        v_store((uint16_t*)dst + i, v_mul_wrap(v_mul, vx_load_expand(src + i)));
+        v_store((uint16_t*)dst + i, v_mul(vmul, vx_load_expand(src + i)));
 #endif
     for (; i < lencn; i++)
         dst[i] = m[0] * src[i];
@@ -101,8 +101,8 @@ void hlineSmooth1N1<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const uf
 {
     int lencn = len*cn;
     int i = 0;
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     for (; i <= lencn - VECSZ; i += VECSZ)
         v_store((uint16_t*)dst + i, v_shl<8>(vx_load_expand(src + i)));
 #endif
@@ -168,16 +168,14 @@ void hlineSmooth3N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
 
         src += cn; dst += cn;
         int i = cn, lencn = (len - 1)*cn;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         const uint16_t* _m = (const uint16_t*)m;
-        const int VECSZ = v_uint16::nlanes;
+        const int VECSZ = VTraits<v_uint16>::vlanes();
         v_uint16 v_mul0 = vx_setall_u16(_m[0]);
         v_uint16 v_mul1 = vx_setall_u16(_m[1]);
         v_uint16 v_mul2 = vx_setall_u16(_m[2]);
         for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
-            v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn), v_mul0) +
-                                    v_mul_wrap(vx_load_expand(src), v_mul1) +
-                                    v_mul_wrap(vx_load_expand(src + cn), v_mul2));
+            v_store((uint16_t*)dst, v_add(v_add(v_mul(vx_load_expand(src - cn), v_mul0), v_mul(vx_load_expand(src), v_mul1)), v_mul(vx_load_expand(src + cn), v_mul2)));
 #endif
         for (; i < lencn; i++, src++, dst++)
             *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn];
@@ -220,10 +218,10 @@ void hlineSmooth3N121Impl(const ET* src, int cn, const FT*, int, FT* dst, int le
 
         src += cn; dst += cn;
         int i = cn, lencn = (len - 1)*cn;
-#if CV_SIMD
-        const int VECSZ = VFT::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int VECSZ = VTraits<VFT>::vlanes();
         for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
-            v_store((typename FT::raw_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn) + (vx_load_expand(src) << 1)) << (FT::fixedShift-2));
+            v_store((typename FT::raw_t*)dst, v_shl<(FT::fixedShift-2)>(v_add(vx_load_expand(src - cn), vx_load_expand(src + cn), v_shl<1>((vx_load_expand(src))))));
 #endif
         for (; i < lencn; i++, src++, dst++)
             *dst = (FT(src[-cn])>>2) + (FT(src[cn])>>2) + (FT(src[0])>>1);
@@ -320,14 +318,13 @@ void hlineSmooth3Naba<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const
 
         src += cn; dst += cn;
         int i = cn, lencn = (len - 1)*cn;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         const uint16_t* _m = (const uint16_t*)m;
-        const int VECSZ = v_uint16::nlanes;
+        const int VECSZ = VTraits<v_uint16>::vlanes();
         v_uint16 v_mul0 = vx_setall_u16(_m[0]);
         v_uint16 v_mul1 = vx_setall_u16(_m[1]);
         for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
-            v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul0) +
-                                    v_mul_wrap(vx_load_expand(src), v_mul1));
+            v_store((uint16_t*)dst, v_add(v_mul(v_add(  vx_load_expand(src - cn), vx_load_expand(src + cn)),  v_mul0), v_mul(vx_load_expand(src), v_mul1)));
 #endif
         for (; i < lencn; i++, src++, dst++)
             *((uint16_t*)dst) = saturate_cast<uint16_t>(((uint16_t*)m)[1] * (uint32_t)(src[0]) + ((uint16_t*)m)[0] * ((uint32_t)(src[-cn]) + (uint32_t)(src[cn])));
@@ -514,20 +511,16 @@ void hlineSmooth5N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
 
         src += 2 * cn; dst += 2 * cn;
         int i = 2*cn, lencn = (len - 2)*cn;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         const uint16_t* _m = (const uint16_t*)m;
-        const int VECSZ = v_uint16::nlanes;
+        const int VECSZ = VTraits<v_uint16>::vlanes();
         v_uint16 v_mul0 = vx_setall_u16(_m[0]);
         v_uint16 v_mul1 = vx_setall_u16(_m[1]);
         v_uint16 v_mul2 = vx_setall_u16(_m[2]);
         v_uint16 v_mul3 = vx_setall_u16(_m[3]);
         v_uint16 v_mul4 = vx_setall_u16(_m[4]);
         for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
-            v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn), v_mul0) +
-                                    v_mul_wrap(vx_load_expand(src - cn), v_mul1) +
-                                    v_mul_wrap(vx_load_expand(src), v_mul2) +
-                                    v_mul_wrap(vx_load_expand(src + cn), v_mul3) +
-                                    v_mul_wrap(vx_load_expand(src + 2 * cn), v_mul4));
+            v_store((uint16_t*)dst, v_add(v_add(v_add(v_add(v_mul(vx_load_expand(src - 2 * cn), v_mul0), v_mul(vx_load_expand(src - cn), v_mul1)), v_mul(vx_load_expand(src), v_mul2)), v_mul(vx_load_expand(src + cn), v_mul3)), v_mul(vx_load_expand(src + 2 * cn), v_mul4)));
 #endif
         for (; i < lencn; i++, src++, dst++)
             *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn];
@@ -726,11 +719,11 @@ void hlineSmooth5N14641<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, cons
 
         src += 2 * cn; dst += 2 * cn;
         int i = 2 * cn, lencn = (len - 2)*cn;
-#if CV_SIMD
-        const int VECSZ = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int VECSZ = VTraits<v_uint16>::vlanes();
         v_uint16 v_6 = vx_setall_u16(6);
         for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
-            v_store((uint16_t*)dst, (v_mul_wrap(vx_load_expand(src), v_6) + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4);
+            v_store((uint16_t*)dst, v_shl<4>(v_add(v_add(v_add(v_mul(vx_load_expand(src), v_6), v_shl<2>(v_add(vx_load_expand(src - cn), vx_load_expand(src + cn)))), vx_load_expand(src - 2 * cn)), vx_load_expand(src + 2 * cn))));
 #endif
         for (; i < lencn; i++, src++, dst++)
             *((uint16_t*)dst) = (uint16_t(src[0]) * 6 + ((uint16_t(src[-cn]) + uint16_t(src[cn])) << 2) + uint16_t(src[-2 * cn]) + uint16_t(src[2 * cn])) << 4;
@@ -924,16 +917,14 @@ void hlineSmooth5Nabcba<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, cons
 
         src += 2 * cn; dst += 2 * cn;
         int i = 2 * cn, lencn = (len - 2)*cn;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         const uint16_t* _m = (const uint16_t*)m;
-        const int VECSZ = v_uint16::nlanes;
+        const int VECSZ = VTraits<v_uint16>::vlanes();
         v_uint16 v_mul0 = vx_setall_u16(_m[0]);
         v_uint16 v_mul1 = vx_setall_u16(_m[1]);
         v_uint16 v_mul2 = vx_setall_u16(_m[2]);
         for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
-            v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn), v_mul0) +
-                                    v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul1) +
-                                    v_mul_wrap(vx_load_expand(src), v_mul2));
+            v_store((uint16_t*)dst, v_add(v_add(v_mul(v_add(vx_load_expand(src - 2 * cn), vx_load_expand(src + 2 * cn)), v_mul0), v_mul(v_add(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_mul1)), v_mul(vx_load_expand(src), v_mul2)));
 #endif
         for (; i < lencn; i++, src++, dst++)
             *((uint16_t*)dst) = saturate_cast<uint16_t>(((uint16_t*)m)[0] * ((uint32_t)(src[-2 * cn]) + (uint32_t)(src[2 * cn])) + ((uint16_t*)m)[1] * ((uint32_t)(src[-cn]) + (uint32_t)(src[cn])) + ((uint16_t*)m)[2] * (uint32_t)(src[0]));
@@ -1044,13 +1035,13 @@ void hlineSmooth<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufixe
     }
     i *= cn;
     int lencn = (len - post_shift + 1)*cn;
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     for (; i <= lencn - VECSZ; i+=VECSZ, src+=VECSZ, dst+=VECSZ)
     {
-        v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m)));
+        v_uint16 v_res0 = v_mul(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m)));
         for (int j = 1; j < n; j++)
-            v_res0 += v_mul_wrap(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t*)(m + j))));
+            v_res0 = v_add(v_res0, v_mul(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t *)(m + j)))));
         v_store((uint16_t*)dst, v_res0);
     }
 #endif
@@ -1163,13 +1154,13 @@ void hlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, co
     }
     i *= cn;
     int lencn = (len - post_shift + 1)*cn;
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
     {
-        v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift))));
+        v_uint16 v_res0 = v_mul(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift))));
         for (int j = 0; j < pre_shift; j ++)
-            v_res0 += v_mul_wrap(vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn), vx_setall_u16(*((uint16_t*)(m + j))));
+            v_res0 = v_add(v_res0, v_mul(v_add(vx_load_expand(src + j * cn), vx_load_expand(src + (n - 1 - j) * cn)), vx_setall_u16(*((uint16_t *)(m + j)))));
         v_store((uint16_t*)dst, v_res0);
     }
 #endif
@@ -1228,8 +1219,8 @@ void hlineSmoothONa_yzy_a<uint16_t, ufixedpoint32>(const uint16_t* src, int cn,
     }
     i *= cn;
     int lencn = (len - post_shift + 1)*cn;
-#if CV_SIMD
-    const int VECSZ = v_uint32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint32>::vlanes();
     for (; i <= lencn - VECSZ * 2; i += VECSZ * 2, src += VECSZ * 2, dst += VECSZ * 2)
     {
         v_uint32 v_res0, v_res1;
@@ -1239,11 +1230,11 @@ void hlineSmoothONa_yzy_a<uint16_t, ufixedpoint32>(const uint16_t* src, int cn,
             v_uint16 v_weight = vx_setall_u16((uint16_t) *((uint32_t*)(m + j)));
             v_uint32 v_add0, v_add1;
             v_mul_expand(vx_load(src + j * cn), v_weight, v_add0, v_add1);
-            v_res0 += v_add0;
-            v_res1 += v_add1;
+            v_res0 = v_add(v_res0, v_add0);
+            v_res1 = v_add(v_res1, v_add1);
             v_mul_expand(vx_load(src + (n - 1 - j)*cn), v_weight, v_add0, v_add1);
-            v_res0 += v_add0;
-            v_res1 += v_add1;
+            v_res0 = v_add(v_res0, v_add0);
+            v_res1 = v_add(v_res1, v_add1);
         }
         v_store((uint32_t*)dst, v_res0);
         v_store((uint32_t*)dst + VECSZ, v_res1);
@@ -1285,8 +1276,8 @@ void vlineSmooth1N<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, con
 {
     const ufixedpoint16* src0 = src[0];
     int i = 0;
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m)<<1);
     for (; i <= len - VECSZ; i += VECSZ)
         v_rshr_pack_store<1>(dst + i, v_mul_hi(vx_load((uint16_t*)src0 + i), v_mul));
@@ -1306,8 +1297,8 @@ void vlineSmooth1N1<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, co
 {
     const ufixedpoint16* src0 = src[0];
     int i = 0;
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     for (; i <= len - VECSZ; i += VECSZ)
         v_rshr_pack_store<8>(dst + i, vx_load((uint16_t*)(src0 + i)));
 #endif
@@ -1324,10 +1315,10 @@ template <>
 void vlineSmooth3N<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len)
 {
     int i = 0;
-#if CV_SIMD
-    static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15));
     v_int32 v_128_4 = vx_setall_s32(128 << 16);
-    const int VECSZ = v_uint16::nlanes;
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     if (len >= VECSZ)
     {
         ufixedpoint32 val[] = { (m[0] + m[1] + m[2]) * ufixedpoint16((uint8_t)128) };
@@ -1370,26 +1361,26 @@ void vlineSmooth3N<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, con
         v_src02 = vx_load(src2 + 2*VECSZ);
         v_src03 = vx_load(src2 + 3*VECSZ);
         v_mul_expand(v_add_wrap(v_src00, v_128), v_mul2, v_resj0, v_resj1);
-        v_res0 += v_resj0;
-        v_res1 += v_resj1;
+        v_res0 = v_add(v_res0, v_resj0);
+        v_res1 = v_add(v_res1, v_resj1);
         v_mul_expand(v_add_wrap(v_src01, v_128), v_mul2, v_resj0, v_resj1);
-        v_res2 += v_resj0;
-        v_res3 += v_resj1;
+        v_res2 = v_add(v_res2, v_resj0);
+        v_res3 = v_add(v_res3, v_resj1);
         v_mul_expand(v_add_wrap(v_src02, v_128), v_mul2, v_resj0, v_resj1);
-        v_res4 += v_resj0;
-        v_res5 += v_resj1;
+        v_res4 = v_add(v_res4, v_resj0);
+        v_res5 = v_add(v_res5, v_resj1);
         v_mul_expand(v_add_wrap(v_src03, v_128), v_mul2, v_resj0, v_resj1);
-        v_res6 += v_resj0;
-        v_res7 += v_resj1;
-
-        v_res0 += v_128_4;
-        v_res1 += v_128_4;
-        v_res2 += v_128_4;
-        v_res3 += v_128_4;
-        v_res4 += v_128_4;
-        v_res5 += v_128_4;
-        v_res6 += v_128_4;
-        v_res7 += v_128_4;
+        v_res6 = v_add(v_res6, v_resj0);
+        v_res7 = v_add(v_res7, v_resj1);
+
+        v_res0 = v_add(v_res0, v_128_4);
+        v_res1 = v_add(v_res1, v_128_4);
+        v_res2 = v_add(v_res2, v_128_4);
+        v_res3 = v_add(v_res3, v_128_4);
+        v_res4 = v_add(v_res4, v_128_4);
+        v_res5 = v_add(v_res5, v_128_4);
+        v_res6 = v_add(v_res6, v_128_4);
+        v_res7 = v_add(v_res7, v_128_4);
 
         v_store(dst + i          , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)),
                                           v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3))));
@@ -1410,8 +1401,8 @@ template <>
 void vlineSmooth3N121<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len)
 {
     int i = 0;
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     for (; i <= len - 2*VECSZ; i += 2*VECSZ)
     {
         v_uint32 v_src00, v_src01, v_src02, v_src03, v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
@@ -1421,8 +1412,8 @@ void vlineSmooth3N121<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src,
         v_expand(vx_load((uint16_t*)(src[1]) + i + VECSZ), v_src12, v_src13);
         v_expand(vx_load((uint16_t*)(src[2]) + i), v_src20, v_src21);
         v_expand(vx_load((uint16_t*)(src[2]) + i + VECSZ), v_src22, v_src23);
-        v_store(dst + i, v_pack(v_rshr_pack<10>(v_src00 + v_src20 + (v_src10 + v_src10), v_src01 + v_src21 + (v_src11 + v_src11)),
-                                v_rshr_pack<10>(v_src02 + v_src22 + (v_src12 + v_src12), v_src03 + v_src23 + (v_src13 + v_src13))));
+        v_store(dst + i, v_pack(v_rshr_pack<10>(v_add(v_add(v_src00, v_src20), v_add(v_src10, v_src10)), v_add(v_add(v_src01, v_src21), v_add(v_src11, v_src11))),
+                                v_rshr_pack<10>(v_add(v_add(v_src02, v_src22), v_add(v_src12, v_src12)), v_add(v_add(v_src03, v_src23), v_add(v_src13, v_src13)))));
     }
 #endif
     for (; i < len; i++)
@@ -1432,8 +1423,8 @@ template <>
 void vlineSmooth3N121<uint16_t, ufixedpoint32>(const ufixedpoint32* const * src, const ufixedpoint32*, int, uint16_t* dst, int len)
 {
     int i = 0;
-#if CV_SIMD
-    const int VECSZ = v_uint32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint32>::vlanes();
     for (; i <= len - 2*VECSZ; i += 2*VECSZ)
     {
         v_uint64 v_src00, v_src01, v_src02, v_src03, v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
@@ -1443,8 +1434,8 @@ void vlineSmooth3N121<uint16_t, ufixedpoint32>(const ufixedpoint32* const * src,
         v_expand(vx_load((uint32_t*)(src[1]) + i + VECSZ), v_src12, v_src13);
         v_expand(vx_load((uint32_t*)(src[2]) + i), v_src20, v_src21);
         v_expand(vx_load((uint32_t*)(src[2]) + i + VECSZ), v_src22, v_src23);
-        v_store(dst + i, v_pack(v_rshr_pack<18>(v_src00 + v_src20 + (v_src10 + v_src10), v_src01 + v_src21 + (v_src11 + v_src11)),
-                                v_rshr_pack<18>(v_src02 + v_src22 + (v_src12 + v_src12), v_src03 + v_src23 + (v_src13 + v_src13))));
+        v_store(dst + i, v_pack(v_rshr_pack<18>(v_add(v_add(v_src00, v_src20), v_add(v_src10, v_src10)), v_add(v_add(v_src01, v_src21), v_add(v_src11, v_src11))),
+                                v_rshr_pack<18>(v_add(v_add(v_src02, v_src22), v_add(v_src12, v_src12)), v_add(v_add(v_src03, v_src23), v_add(v_src13, v_src13)))));
     }
 #endif
     for (; i < len; i++)
@@ -1460,13 +1451,13 @@ template <>
 void vlineSmooth5N<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len)
 {
     int i = 0;
-#if CV_SIMD
-    const int VECSZ = v_uint16::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     if (len >= 4 * VECSZ)
     {
         ufixedpoint32 val[] = { (m[0] + m[1] + m[2] + m[3] + m[4]) * ufixedpoint16((uint8_t)128) };
         v_int32 v_128_4 = vx_setall_s32(*((int32_t*)val));
-        static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15));
+        const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15));
         v_int16 v_mul01 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m)));
         v_int16 v_mul23 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)(m + 2))));
         v_int16 v_mul4 = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + 4))));
@@ -1509,17 +1500,17 @@ void vlineSmooth5N<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, con
             v_src12 = vx_load(src3 + 2*VECSZ);
             v_src13 = vx_load(src3 + 3*VECSZ);
             v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1);
-            v_res0 += v_dotprod(v_tmp0, v_mul23);
-            v_res1 += v_dotprod(v_tmp1, v_mul23);
+            v_res0 = v_add(v_res0, v_dotprod(v_tmp0, v_mul23));
+            v_res1 = v_add(v_res1, v_dotprod(v_tmp1, v_mul23));
             v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1);
-            v_res2 += v_dotprod(v_tmp0, v_mul23);
-            v_res3 += v_dotprod(v_tmp1, v_mul23);
+            v_res2 = v_add(v_res2, v_dotprod(v_tmp0, v_mul23));
+            v_res3 = v_add(v_res3, v_dotprod(v_tmp1, v_mul23));
             v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1);
-            v_res4 += v_dotprod(v_tmp0, v_mul23);
-            v_res5 += v_dotprod(v_tmp1, v_mul23);
+            v_res4 = v_add(v_res4, v_dotprod(v_tmp0, v_mul23));
+            v_res5 = v_add(v_res5, v_dotprod(v_tmp1, v_mul23));
             v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1);
-            v_res6 += v_dotprod(v_tmp0, v_mul23);
-            v_res7 += v_dotprod(v_tmp1, v_mul23);
+            v_res6 = v_add(v_res6, v_dotprod(v_tmp0, v_mul23));
+            v_res7 = v_add(v_res7, v_dotprod(v_tmp1, v_mul23));
 
             v_int32 v_resj0, v_resj1;
             const int16_t* src4 = (const int16_t*)src[4] + i;
@@ -1528,26 +1519,26 @@ void vlineSmooth5N<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, con
             v_src02 = vx_load(src4 + 2*VECSZ);
             v_src03 = vx_load(src4 + 3*VECSZ);
             v_mul_expand(v_add_wrap(v_src00, v_128), v_mul4, v_resj0, v_resj1);
-            v_res0 += v_resj0;
-            v_res1 += v_resj1;
+            v_res0 = v_add(v_res0, v_resj0);
+            v_res1 = v_add(v_res1, v_resj1);
             v_mul_expand(v_add_wrap(v_src01, v_128), v_mul4, v_resj0, v_resj1);
-            v_res2 += v_resj0;
-            v_res3 += v_resj1;
+            v_res2 = v_add(v_res2, v_resj0);
+            v_res3 = v_add(v_res3, v_resj1);
             v_mul_expand(v_add_wrap(v_src02, v_128), v_mul4, v_resj0, v_resj1);
-            v_res4 += v_resj0;
-            v_res5 += v_resj1;
+            v_res4 = v_add(v_res4, v_resj0);
+            v_res5 = v_add(v_res5, v_resj1);
             v_mul_expand(v_add_wrap(v_src03, v_128), v_mul4, v_resj0, v_resj1);
-            v_res6 += v_resj0;
-            v_res7 += v_resj1;
-
-            v_res0 += v_128_4;
-            v_res1 += v_128_4;
-            v_res2 += v_128_4;
-            v_res3 += v_128_4;
-            v_res4 += v_128_4;
-            v_res5 += v_128_4;
-            v_res6 += v_128_4;
-            v_res7 += v_128_4;
+            v_res6 = v_add(v_res6, v_resj0);
+            v_res7 = v_add(v_res7, v_resj1);
+
+            v_res0 = v_add(v_res0, v_128_4);
+            v_res1 = v_add(v_res1, v_128_4);
+            v_res2 = v_add(v_res2, v_128_4);
+            v_res3 = v_add(v_res3, v_128_4);
+            v_res4 = v_add(v_res4, v_128_4);
+            v_res5 = v_add(v_res5, v_128_4);
+            v_res6 = v_add(v_res6, v_128_4);
+            v_res7 = v_add(v_res7, v_128_4);
 
             v_store(dst + i          , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)),
                                               v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3))));
@@ -1569,9 +1560,9 @@ template <>
 void vlineSmooth5N14641<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len)
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_uint32 v_6 = vx_setall_u32(6);
-    const int VECSZ = v_uint16::nlanes;
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     for (; i <= len - 2*VECSZ; i += 2*VECSZ)
     {
         v_uint32 v_src00, v_src10, v_src20, v_src30, v_src40;
@@ -1588,10 +1579,10 @@ void vlineSmooth5N14641<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src
         v_expand(vx_load((uint16_t*)(src[3]) + i + VECSZ), v_src32, v_src33);
         v_expand(vx_load((uint16_t*)(src[4]) + i), v_src40, v_src41);
         v_expand(vx_load((uint16_t*)(src[4]) + i + VECSZ), v_src42, v_src43);
-        v_store(dst + i, v_pack(v_rshr_pack<12>(v_src20*v_6 + ((v_src10 + v_src30) << 2) + v_src00 + v_src40,
-                                                v_src21*v_6 + ((v_src11 + v_src31) << 2) + v_src01 + v_src41),
-                                v_rshr_pack<12>(v_src22*v_6 + ((v_src12 + v_src32) << 2) + v_src02 + v_src42,
-                                                v_src23*v_6 + ((v_src13 + v_src33) << 2) + v_src03 + v_src43)));
+        v_store(dst + i, v_pack(v_rshr_pack<12>(v_add(v_add(v_add(v_mul(v_src20, v_6), v_shl<2>(v_add(v_src10, v_src30))), v_src00), v_src40),
+                                                v_add(v_add(v_add(v_mul(v_src21, v_6), v_shl<2>(v_add(v_src11, v_src31))), v_src01), v_src41)),
+                                v_rshr_pack<12>(v_add(v_add(v_add(v_mul(v_src22, v_6), v_shl<2>(v_add(v_src12, v_src32))), v_src02), v_src42),
+                                                v_add(v_add(v_add(v_mul(v_src23, v_6), v_shl<2>(v_add(v_src13, v_src33))), v_src03), v_src43))));
     }
 #endif
     for (; i < len; i++)
@@ -1603,8 +1594,8 @@ template <>
 void vlineSmooth5N14641<uint16_t, ufixedpoint32>(const ufixedpoint32* const * src, const ufixedpoint32*, int, uint16_t* dst, int len)
 {
     int i = 0;
-#if CV_SIMD
-    const int VECSZ = v_uint32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_uint32>::vlanes();
     for (; i <= len - 2*VECSZ; i += 2*VECSZ)
     {
         v_uint64 v_src00, v_src10, v_src20, v_src30, v_src40;
@@ -1621,10 +1612,10 @@ void vlineSmooth5N14641<uint16_t, ufixedpoint32>(const ufixedpoint32* const * sr
         v_expand(vx_load((uint32_t*)(src[3]) + i + VECSZ), v_src32, v_src33);
         v_expand(vx_load((uint32_t*)(src[4]) + i), v_src40, v_src41);
         v_expand(vx_load((uint32_t*)(src[4]) + i + VECSZ), v_src42, v_src43);
-        v_store(dst + i, v_pack(v_rshr_pack<20>((v_src20 << 2) + (v_src20 << 1) + ((v_src10 + v_src30) << 2) + v_src00 + v_src40,
-                                                (v_src21 << 2) + (v_src21 << 1) + ((v_src11 + v_src31) << 2) + v_src01 + v_src41),
-                                v_rshr_pack<20>((v_src22 << 2) + (v_src22 << 1) + ((v_src12 + v_src32) << 2) + v_src02 + v_src42,
-                                                (v_src23 << 2) + (v_src23 << 1) + ((v_src13 + v_src33) << 2) + v_src03 + v_src43)));
+        v_store(dst + i, v_pack(v_rshr_pack<20>(v_add(v_add(v_add(v_add(v_shl<2>(v_src20), v_shl<1>(v_src20)), v_shl<2>(v_add(v_src10, v_src30))), v_src00), v_src40),
+                                                v_add(v_add(v_add(v_add(v_shl<2>(v_src21), v_shl<1>(v_src21)), v_shl<2>(v_add(v_src11, v_src31))), v_src01), v_src41)),
+                                v_rshr_pack<20>(v_add(v_add(v_add(v_add(v_shl<2>(v_src22), v_shl<1>(v_src22)), v_shl<2>(v_add(v_src12, v_src32))), v_src02), v_src42),
+                                                v_add(v_add(v_add(v_add(v_shl<2>(v_src23), v_shl<1>(v_src23)), v_shl<2>(v_add(v_src13, v_src33))), v_src03), v_src43))));
     }
 #endif
     for (; i < len; i++)
@@ -1643,14 +1634,23 @@ void vlineSmooth(const FT* const * src, const FT* m, int n, ET* dst, int len)
         dst[i] = val;
     }
 }
+
+inline uint32_t read_pair_as_u32(const ufixedpoint16 * mem)
+{
+    union Cv32sufX2 { uint32_t v32; int16_t v16[2]; } res;
+    res.v16[0] = mem->raw();
+    res.v16[1] = (mem + 1)->raw();
+    return res.v32;
+}
+
 template <>
 void vlineSmooth<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len)
 {
     int i = 0;
-#if CV_SIMD
-    static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15));
     v_int32 v_128_4 = vx_setall_s32(128 << 16);
-    const int VECSZ = v_uint16::nlanes;
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     if (len >= VECSZ)
     {
         ufixedpoint16 msum = m[0] + m[1];
@@ -1664,7 +1664,7 @@ void vlineSmooth<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const
         v_int16 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13;
         v_int16 v_tmp0, v_tmp1;
 
-        v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m)));
+        v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(read_pair_as_u32(m)));
 
         const int16_t* src0 = (const int16_t*)src[0] + i;
         const int16_t* src1 = (const int16_t*)src[1] + i;
@@ -1692,7 +1692,7 @@ void vlineSmooth<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const
         int j = 2;
         for (; j < n - 1; j+=2)
         {
-            v_mul = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)(m+j))));
+            v_mul = v_reinterpret_as_s16(vx_setall_u32(read_pair_as_u32(m + j)));
 
             const int16_t* srcj0 = (const int16_t*)src[j] + i;
             const int16_t* srcj1 = (const int16_t*)src[j + 1] + i;
@@ -1705,17 +1705,17 @@ void vlineSmooth<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const
             v_src12 = vx_load(srcj1 + 2*VECSZ);
             v_src13 = vx_load(srcj1 + 3*VECSZ);
             v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1);
-            v_res0 += v_dotprod(v_tmp0, v_mul);
-            v_res1 += v_dotprod(v_tmp1, v_mul);
+            v_res0 = v_add(v_res0, v_dotprod(v_tmp0, v_mul));
+            v_res1 = v_add(v_res1, v_dotprod(v_tmp1, v_mul));
             v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1);
-            v_res2 += v_dotprod(v_tmp0, v_mul);
-            v_res3 += v_dotprod(v_tmp1, v_mul);
+            v_res2 = v_add(v_res2, v_dotprod(v_tmp0, v_mul));
+            v_res3 = v_add(v_res3, v_dotprod(v_tmp1, v_mul));
             v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1);
-            v_res4 += v_dotprod(v_tmp0, v_mul);
-            v_res5 += v_dotprod(v_tmp1, v_mul);
+            v_res4 = v_add(v_res4, v_dotprod(v_tmp0, v_mul));
+            v_res5 = v_add(v_res5, v_dotprod(v_tmp1, v_mul));
             v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1);
-            v_res6 += v_dotprod(v_tmp0, v_mul);
-            v_res7 += v_dotprod(v_tmp1, v_mul);
+            v_res6 = v_add(v_res6, v_dotprod(v_tmp0, v_mul));
+            v_res7 = v_add(v_res7, v_dotprod(v_tmp1, v_mul));
         }
         if(j < n)
         {
@@ -1727,26 +1727,26 @@ void vlineSmooth<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const
             v_src02 = vx_load(srcj + 2*VECSZ);
             v_src03 = vx_load(srcj + 3*VECSZ);
             v_mul_expand(v_add_wrap(v_src00, v_128), v_mul, v_resj0, v_resj1);
-            v_res0 += v_resj0;
-            v_res1 += v_resj1;
+            v_res0 = v_add(v_res0, v_resj0);
+            v_res1 = v_add(v_res1, v_resj1);
             v_mul_expand(v_add_wrap(v_src01, v_128), v_mul, v_resj0, v_resj1);
-            v_res2 += v_resj0;
-            v_res3 += v_resj1;
+            v_res2 = v_add(v_res2, v_resj0);
+            v_res3 = v_add(v_res3, v_resj1);
             v_mul_expand(v_add_wrap(v_src02, v_128), v_mul, v_resj0, v_resj1);
-            v_res4 += v_resj0;
-            v_res5 += v_resj1;
+            v_res4 = v_add(v_res4, v_resj0);
+            v_res5 = v_add(v_res5, v_resj1);
             v_mul_expand(v_add_wrap(v_src03, v_128), v_mul, v_resj0, v_resj1);
-            v_res6 += v_resj0;
-            v_res7 += v_resj1;
-        }
-        v_res0 += v_128_4;
-        v_res1 += v_128_4;
-        v_res2 += v_128_4;
-        v_res3 += v_128_4;
-        v_res4 += v_128_4;
-        v_res5 += v_128_4;
-        v_res6 += v_128_4;
-        v_res7 += v_128_4;
+            v_res6 = v_add(v_res6, v_resj0);
+            v_res7 = v_add(v_res7, v_resj1);
+        }
+        v_res0 = v_add(v_res0, v_128_4);
+        v_res1 = v_add(v_res1, v_128_4);
+        v_res2 = v_add(v_res2, v_128_4);
+        v_res3 = v_add(v_res3, v_128_4);
+        v_res4 = v_add(v_res4, v_128_4);
+        v_res5 = v_add(v_res5, v_128_4);
+        v_res6 = v_add(v_res6, v_128_4);
+        v_res7 = v_add(v_res7, v_128_4);
 
         v_store(dst + i          , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)),
                                           v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3))));
@@ -1780,11 +1780,11 @@ template <>
 void vlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len)
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int pre_shift = n / 2;
-    static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15));
+    const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15));
     v_int32 v_128_4 = vx_setall_s32(128 << 16);
-    const int VECSZ = v_uint16::nlanes;
+    const int VECSZ = VTraits<v_uint16>::vlanes();
     if (len >= VECSZ)
     {
         ufixedpoint16 msum = m[0] + m[pre_shift] + m[n - 1];
@@ -1826,27 +1826,27 @@ void vlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const ufixedpoint16* const * s
             v_src21 = vx_load(srcj1 + 2*VECSZ);
             v_src31 = vx_load(srcj1 + 3*VECSZ);
             v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src01, v_128), v_tmp0, v_tmp1);
-            v_res0 += v_dotprod(v_tmp0, v_mul);
-            v_res1 += v_dotprod(v_tmp1, v_mul);
+            v_res0 = v_add(v_res0, v_dotprod(v_tmp0, v_mul));
+            v_res1 = v_add(v_res1, v_dotprod(v_tmp1, v_mul));
             v_zip(v_add_wrap(v_src10, v_128), v_add_wrap(v_src11, v_128), v_tmp2, v_tmp3);
-            v_res2 += v_dotprod(v_tmp2, v_mul);
-            v_res3 += v_dotprod(v_tmp3, v_mul);
+            v_res2 = v_add(v_res2, v_dotprod(v_tmp2, v_mul));
+            v_res3 = v_add(v_res3, v_dotprod(v_tmp3, v_mul));
             v_zip(v_add_wrap(v_src20, v_128), v_add_wrap(v_src21, v_128), v_tmp4, v_tmp5);
-            v_res4 += v_dotprod(v_tmp4, v_mul);
-            v_res5 += v_dotprod(v_tmp5, v_mul);
+            v_res4 = v_add(v_res4, v_dotprod(v_tmp4, v_mul));
+            v_res5 = v_add(v_res5, v_dotprod(v_tmp5, v_mul));
             v_zip(v_add_wrap(v_src30, v_128), v_add_wrap(v_src31, v_128), v_tmp6, v_tmp7);
-            v_res6 += v_dotprod(v_tmp6, v_mul);
-            v_res7 += v_dotprod(v_tmp7, v_mul);
+            v_res6 = v_add(v_res6, v_dotprod(v_tmp6, v_mul));
+            v_res7 = v_add(v_res7, v_dotprod(v_tmp7, v_mul));
         }
 
-        v_res0 += v_128_4;
-        v_res1 += v_128_4;
-        v_res2 += v_128_4;
-        v_res3 += v_128_4;
-        v_res4 += v_128_4;
-        v_res5 += v_128_4;
-        v_res6 += v_128_4;
-        v_res7 += v_128_4;
+        v_res0 = v_add(v_res0, v_128_4);
+        v_res1 = v_add(v_res1, v_128_4);
+        v_res2 = v_add(v_res2, v_128_4);
+        v_res3 = v_add(v_res3, v_128_4);
+        v_res4 = v_add(v_res4, v_128_4);
+        v_res5 = v_add(v_res5, v_128_4);
+        v_res6 = v_add(v_res6, v_128_4);
+        v_res7 = v_add(v_res7, v_128_4);
 
         v_store(dst + i          , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)),
                                           v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3))));
@@ -1868,9 +1868,9 @@ template <>
 void vlineSmoothONa_yzy_a<uint16_t, ufixedpoint32>(const ufixedpoint32* const * src, const ufixedpoint32* m, int n, uint16_t* dst, int len)
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int pre_shift = n / 2;
-    const int VECSZ = v_uint32::nlanes;
+    const int VECSZ = VTraits<v_uint32>::vlanes();
     for (; i <= len - 2*VECSZ; i += 2*VECSZ)
     {
         v_uint32 v_src00, v_src10, v_src01, v_src11;
@@ -1895,15 +1895,15 @@ void vlineSmoothONa_yzy_a<uint16_t, ufixedpoint32>(const ufixedpoint32* const *
             v_src01 = vx_load(srcj1);
             v_mul_expand(v_src00, v_mul, v_tmp0, v_tmp1);
             v_mul_expand(v_src01, v_mul, v_tmp2, v_tmp3);
-            v_res0 += v_tmp0 + v_tmp2;
-            v_res1 += v_tmp1 + v_tmp3;
+            v_res0 = v_add(v_res0, v_add(v_tmp0, v_tmp2));
+            v_res1 = v_add(v_res1, v_add(v_tmp1, v_tmp3));
 
             v_src10 = vx_load(srcj0 + VECSZ);
             v_src11 = vx_load(srcj1 + VECSZ);
             v_mul_expand(v_src10, v_mul, v_tmp4, v_tmp5);
             v_mul_expand(v_src11, v_mul, v_tmp6, v_tmp7);
-            v_res2 += v_tmp4 + v_tmp6;
-            v_res3 += v_tmp5 + v_tmp7;
+            v_res2 = v_add(v_res2, v_add(v_tmp4, v_tmp6));
+            v_res3 = v_add(v_res3, v_add(v_tmp5, v_tmp7));
         }
 
         v_store(dst + i, v_pack(v_rshr_pack<32>(v_res0, v_res1),
diff --git a/modules/imgproc/src/spatialgradient.cpp b/modules/imgproc/src/spatialgradient.cpp
index 1aed1fa03166..f422609c40f6 100644
--- a/modules/imgproc/src/spatialgradient.cpp
+++ b/modules/imgproc/src/spatialgradient.cpp
@@ -57,15 +57,33 @@ namespace cv
  *           0  0  0
  *           1  2  1
  */
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 template <typename T>
-static inline void spatialGradientKernel( T& vx, T& vy,
+static inline void spatialGradientKernel_vec( T& vx, T& vy,
                                           const T& v00, const T& v01, const T& v02,
                                           const T& v10,               const T& v12,
                                           const T& v20, const T& v21, const T& v22 )
 {
     // vx = (v22 - v00) + (v02 - v20) + 2 * (v12 - v10)
     // vy = (v22 - v00) + (v20 - v02) + 2 * (v21 - v01)
+    T tmp_add = v_sub(v22, v00),
+      tmp_sub = v_sub(v02, v20),
+      tmp_x   = v_sub(v12, v10),
+      tmp_y   = v_sub(v21, v01);
+
+    vx = v_add(v_add(v_add(tmp_add, tmp_sub), tmp_x), tmp_x);
+    vy = v_add(v_add(v_sub(tmp_add, tmp_sub), tmp_y), tmp_y);
+}
+#endif
 
+template <typename T>
+static inline void spatialGradientKernel( T& vx, T& vy,
+                                          const T& v00, const T& v01, const T& v02,
+                                          const T& v10,               const T& v12,
+                                          const T& v20, const T& v21, const T& v22 )
+{
+    // vx = (v22 - v00) + (v02 - v20) + 2 * (v12 - v10)
+    // vy = (v22 - v00) + (v20 - v02) + 2 * (v21 - v01)
     T tmp_add = v22 - v00,
       tmp_sub = v02 - v20,
       tmp_x   = v12 - v10,
@@ -125,7 +143,7 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy,
 
     int i_start = 0;
     int j_start = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     // Characters in variable names have the following meanings:
     // u: unsigned char
     // s: signed int
@@ -148,7 +166,7 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy,
         short *n_dy = dy.ptr<short>(i+1);
 
         // Process rest of columns 16-column chunks at a time
-        for ( j = 1; j < W - v_uint8::nlanes; j += v_uint8::nlanes)
+        for ( j = 1; j < W - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
         {
             // Load top row for 3x3 Sobel filter
             v_uint8 v_um = vx_load(&p_src[j-1]);
@@ -195,22 +213,22 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy,
 
             // dx & dy for rows 1, 2, 3
             v_int16 v_sdx1, v_sdy1;
-            spatialGradientKernel<v_int16>( v_sdx1, v_sdy1,
+            spatialGradientKernel_vec<v_int16>( v_sdx1, v_sdy1,
                                               v_s1m1, v_s1n1, v_s1p1,
                                               v_s2m1,         v_s2p1,
                                               v_s3m1, v_s3n1, v_s3p1 );
 
             v_int16 v_sdx2, v_sdy2;
-            spatialGradientKernel<v_int16>( v_sdx2, v_sdy2,
+            spatialGradientKernel_vec<v_int16>( v_sdx2, v_sdy2,
                                               v_s1m2, v_s1n2, v_s1p2,
                                               v_s2m2,         v_s2p2,
                                               v_s3m2, v_s3n2, v_s3p2 );
 
             // Store
             v_store(&c_dx[j],                 v_sdx1);
-            v_store(&c_dx[j+v_int16::nlanes], v_sdx2);
+            v_store(&c_dx[j+VTraits<v_int16>::vlanes()], v_sdx2);
             v_store(&c_dy[j],                 v_sdy1);
-            v_store(&c_dy[j+v_int16::nlanes], v_sdy2);
+            v_store(&c_dy[j+VTraits<v_int16>::vlanes()], v_sdy2);
 
             // Load fourth row for 3x3 Sobel filter
             v_um = vx_load(&m_src[j-1]);
@@ -227,21 +245,21 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy,
             v_int16 v_s4p2 = v_reinterpret_as_s16(v_up2);
 
             // dx & dy for rows 2, 3, 4
-            spatialGradientKernel<v_int16>( v_sdx1, v_sdy1,
+            spatialGradientKernel_vec<v_int16>( v_sdx1, v_sdy1,
                                               v_s2m1, v_s2n1, v_s2p1,
                                               v_s3m1,         v_s3p1,
                                               v_s4m1, v_s4n1, v_s4p1 );
 
-            spatialGradientKernel<v_int16>( v_sdx2, v_sdy2,
+            spatialGradientKernel_vec<v_int16>( v_sdx2, v_sdy2,
                                               v_s2m2, v_s2n2, v_s2p2,
                                               v_s3m2,         v_s3p2,
                                               v_s4m2, v_s4n2, v_s4p2 );
 
             // Store
             v_store(&n_dx[j],                 v_sdx1);
-            v_store(&n_dx[j+v_int16::nlanes], v_sdx2);
+            v_store(&n_dx[j+VTraits<v_int16>::vlanes()], v_sdx2);
             v_store(&n_dy[j],                 v_sdy1);
-            v_store(&n_dy[j+v_int16::nlanes], v_sdy2);
+            v_store(&n_dy[j+VTraits<v_int16>::vlanes()], v_sdy2);
         }
     }
     i_start = i;
diff --git a/modules/imgproc/src/stackblur.cpp b/modules/imgproc/src/stackblur.cpp
index 5d60a1d36535..a69e4b41017e 100644
--- a/modules/imgproc/src/stackblur.cpp
+++ b/modules/imgproc/src/stackblur.cpp
@@ -88,7 +88,7 @@ static unsigned char const stackblurShr[255] =
 
 namespace cv{
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 template<typename T>
 inline int opRow(const T* , T* , const std::vector<ushort>& , const float , const int radius, const int CN, const int )
 {
@@ -107,7 +107,7 @@ inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<us
     const int mulValTab= stackblurMul[radius];
     const int shrValTab= stackblurShr[radius];
 
-    const int VEC_LINE = v_uint8::nlanes;
+    const int VEC_LINE = VTraits<v_uint8>::vlanes();
 
     if (kernelSize == 3)
     {
@@ -126,10 +126,10 @@ inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<us
             v_expand(x1l, y00, y01);
             v_expand(x1h, y10, y11);
 
-            y00 = (y00 * v_mulVal)>>shrValTab;
-            y01 = (y01 * v_mulVal)>>shrValTab;
-            y10 = (y10 * v_mulVal)>>shrValTab;
-            y11 = (y11 * v_mulVal)>>shrValTab;
+            y00 = v_shr(v_mul(y00, v_mulVal), shrValTab);
+            y01 = v_shr(v_mul(y01, v_mulVal), shrValTab);
+            y10 = v_shr(v_mul(y10, v_mulVal), shrValTab);
+            y11 = v_shr(v_mul(y11, v_mulVal), shrValTab);
 
             v_store(dstPtr + i, v_pack(v_pack(y00, y01), v_pack(y10, y11)));
         }
@@ -159,12 +159,12 @@ inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<us
                 v_uint8 v_src3 = vx_load(srcPtr + j + CN);
 
                 v_int16 xl, xh;
-                v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh);
-                s0 += v_dotprod(xl, k12);
-                s1 += v_dotprod(xh, k12);
-                v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh);
-                s2 += v_dotprod(xl, k12);
-                s3 += v_dotprod(xh, k12);
+                v_zip(v_reinterpret_as_s16(v_add(v_expand_low(v_src0), v_expand_low(v_src2))), v_reinterpret_as_s16(v_add(v_expand_low(v_src1), v_expand_low(v_src3))), xl, xh);
+                s0 = v_add(s0, v_dotprod(xl, k12));
+                s1 = v_add(s1, v_dotprod(xh, k12));
+                v_zip(v_reinterpret_as_s16(v_add(v_expand_high(v_src0), v_expand_high(v_src2))), v_reinterpret_as_s16(v_add(v_expand_high(v_src1), v_expand_high(v_src3))), xl, xh);
+                s2 = v_add(s2, v_dotprod(xl, k12));
+                s3 = v_add(s3, v_dotprod(xh, k12));
             }
             if( k < kernelSize / 2 + 1 )
             {
@@ -175,17 +175,17 @@ inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<us
 
                 v_int16 xl, xh;
                 v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh);
-                s0 += v_dotprod(xl, k1);
-                s1 += v_dotprod(xh, k1);
+                s0 = v_add(s0, v_dotprod(xl, k1));
+                s1 = v_add(s1, v_dotprod(xh, k1));
                 v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh);
-                s2 += v_dotprod(xl, k1);
-                s3 += v_dotprod(xh, k1);
+                s2 = v_add(s2, v_dotprod(xl, k1));
+                s3 = v_add(s3, v_dotprod(xh, k1));
             }
 
-            s0 = (s0 * v_mulVal)>>shrValTab;
-            s1 = (s1 * v_mulVal)>>shrValTab;
-            s2 = (s2 * v_mulVal)>>shrValTab;
-            s3 = (s3 * v_mulVal)>>shrValTab;
+            s0 = v_shr(v_mul(s0, v_mulVal), shrValTab);
+            s1 = v_shr(v_mul(s1, v_mulVal), shrValTab);
+            s2 = v_shr(v_mul(s2, v_mulVal), shrValTab);
+            s3 = v_shr(v_mul(s3, v_mulVal), shrValTab);
 
             v_store(dstPtr + i, v_pack(v_reinterpret_as_u16(v_pack(s0, s1)), v_reinterpret_as_u16(v_pack(s2, s3))));
         }
@@ -205,7 +205,7 @@ inline int opRow<ushort>(const ushort* srcPtr, ushort* dstPtr, const std::vector
     const int mulValTab= stackblurMul[radius];
     const int shrValTab= stackblurShr[radius];
 
-    const int VEC_LINE = v_uint16::nlanes;
+    const int VEC_LINE = VTraits<v_uint16>::vlanes();
 
     v_uint32 v_mulVal = vx_setall_u32(mulValTab);
     if (kernelSize == 3)
@@ -220,7 +220,7 @@ inline int opRow<ushort>(const ushort* srcPtr, ushort* dstPtr, const std::vector
             x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l));
             x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h));
 
-            v_store(dstPtr + i, v_pack((x1l * v_mulVal)>>shrValTab, (x1h * v_mulVal)>>shrValTab));
+            v_store(dstPtr + i, v_pack(v_shr(v_mul(x1l, v_mulVal), shrValTab), v_shr(v_mul(x1h, v_mulVal), shrValTab)));
         }
     }
     else
@@ -243,25 +243,25 @@ inline int opRow<ushort>(const ushort* srcPtr, ushort* dstPtr, const std::vector
                 v_uint16 k2 = vx_setall_u16(kx[k + 1]);
 
                 v_uint32 y0, y1;
-                v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1);
-                s0 += y0;
-                s1 += y1;
-                v_mul_expand(vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN), k2, y0, y1);
-                s0 += y0;
-                s1 += y1;
+                v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1);
+                s0 = v_add(s0, y0);
+                s1 = v_add(s1, y1);
+                v_mul_expand(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2, y0, y1);
+                s0 = v_add(s0, y0);
+                s1 = v_add(s1, y1);
             }
             if( k < kernelSize / 2 + 1 )
             {
                 v_uint16 k1 = vx_setall_u16(kx[k]);
 
                 v_uint32 y0, y1;
-                v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1);
-                s0 += y0;
-                s1 += y1;
+                v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1);
+                s0 = v_add(s0, y0);
+                s1 = v_add(s1, y1);
             }
 
-            s0 = (s0 * v_mulVal)>>shrValTab;
-            s1 = (s1 * v_mulVal)>>shrValTab;
+            s0 = v_shr(v_mul(s0, v_mulVal), shrValTab);
+            s1 = v_shr(v_mul(s1, v_mulVal), shrValTab);
 
             v_store(dstPtr + i, v_pack(s0, s1));
         }
@@ -282,7 +282,7 @@ inline int opRow<short>(const short* srcPtr, short* dstPtr, const std::vector<us
     const int mulValTab= stackblurMul[radius];
     const int shrValTab= stackblurShr[radius];
 
-    const int VEC_LINE = v_int16::nlanes;
+    const int VEC_LINE = VTraits<v_int16>::vlanes();
     v_int32 v_mulVal = vx_setall_s32(mulValTab);
 
     if (kernelSize == 3)
@@ -297,7 +297,7 @@ inline int opRow<short>(const short* srcPtr, short* dstPtr, const std::vector<us
             x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l));
             x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h));
 
-            v_store(dstPtr + i, v_pack((x1l * v_mulVal)>>shrValTab, (x1h * v_mulVal)>>shrValTab));
+            v_store(dstPtr + i, v_pack(v_shr(v_mul(x1l, v_mulVal), shrValTab), v_shr(v_mul(x1h, v_mulVal), shrValTab)));
         }
     }
     else
@@ -320,24 +320,24 @@ inline int opRow<short>(const short* srcPtr, short* dstPtr, const std::vector<us
 
                 v_int32 y0, y1;
 
-                v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1);
-                s0 += y0;
-                s1 += y1;
-                v_mul_expand(vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN), k2, y0, y1);
-                s0 += y0;
-                s1 += y1;
+                v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1);
+                s0 = v_add(s0, y0);
+                s1 = v_add(s1, y1);
+                v_mul_expand(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2, y0, y1);
+                s0 = v_add(s0, y0);
+                s1 = v_add(s1, y1);
             }
             if( k < kernelSize / 2 + 1 )
             {
                 v_int16 k1 = vx_setall_s16((short)kx[k]);
                 v_int32 y0, y1;
-                v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1);
-                s0 += y0;
-                s1 += y1;
+                v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1);
+                s0 = v_add(s0, y0);
+                s1 = v_add(s1, y1);
             }
 
-            s0 = (s0 * v_mulVal)>>shrValTab;
-            s1 = (s1 * v_mulVal)>>shrValTab;
+            s0 = v_shr(v_mul(s0, v_mulVal), shrValTab);
+            s1 = v_shr(v_mul(s1, v_mulVal), shrValTab);
 
             v_store(dstPtr + i, v_pack(s0, s1));
         }
@@ -352,7 +352,7 @@ inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<us
     int i = radius * CN;
 
     v_float32 v_mulVal = vx_setall_f32(mulVal);
-    const int VEC_LINE = v_float32::nlanes;
+    const int VEC_LINE = VTraits<v_float32>::vlanes();
     const int VEC_LINE4 = VEC_LINE * 4;
 
     if (kernelSize == 3)
@@ -364,22 +364,22 @@ inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<us
             v_float32 v_srcPtr2 = vx_load(srcPtr + VEC_LINE * 2 + i);
             v_float32 v_srcPtr3 = vx_load(srcPtr + VEC_LINE * 3 + i);
 
-            v_float32 v_sumVal0 =  v_srcPtr0 + v_srcPtr0 + vx_load(srcPtr + i - CN) + vx_load(srcPtr + i + CN);
-            v_float32 v_sumVal1 =  v_srcPtr1 + v_srcPtr1 + vx_load(srcPtr + VEC_LINE + i - CN) + vx_load(srcPtr + VEC_LINE + i + CN);
-            v_float32 v_sumVal2 =  v_srcPtr2 + v_srcPtr2 + vx_load(srcPtr + VEC_LINE * 2 + i - CN) + vx_load(srcPtr + VEC_LINE * 2 + i + CN);
-            v_float32 v_sumVal3 =  v_srcPtr3 + v_srcPtr3 + vx_load(srcPtr + VEC_LINE * 3 + i - CN) + vx_load(srcPtr + VEC_LINE * 3 + i + CN);
+            v_float32 v_sumVal0 =  v_add(v_add(v_add(v_srcPtr0, v_srcPtr0), vx_load(srcPtr + i - CN)), vx_load(srcPtr + i + CN));
+            v_float32 v_sumVal1 =  v_add(v_add(v_add(v_srcPtr1, v_srcPtr1), vx_load(srcPtr + VEC_LINE + i - CN)), vx_load(srcPtr + VEC_LINE + i + CN));
+            v_float32 v_sumVal2 =  v_add(v_add(v_add(v_srcPtr2, v_srcPtr2), vx_load(srcPtr + VEC_LINE * 2 + i - CN)), vx_load(srcPtr + VEC_LINE * 2 + i + CN));
+            v_float32 v_sumVal3 =  v_add(v_add(v_add(v_srcPtr3, v_srcPtr3), vx_load(srcPtr + VEC_LINE * 3 + i - CN)), vx_load(srcPtr + VEC_LINE * 3 + i + CN));
 
-            v_store(dstPtr + i, v_sumVal0 * v_mulVal);
-            v_store(dstPtr + i + VEC_LINE, v_sumVal1 * v_mulVal);
-            v_store(dstPtr + i + VEC_LINE * 2, v_sumVal2 * v_mulVal);
-            v_store(dstPtr + i + VEC_LINE * 3, v_sumVal3 * v_mulVal);
+            v_store(dstPtr + i, v_mul(v_sumVal0, v_mulVal));
+            v_store(dstPtr + i + VEC_LINE, v_mul(v_sumVal1, v_mulVal));
+            v_store(dstPtr + i + VEC_LINE * 2, v_mul(v_sumVal2, v_mulVal));
+            v_store(dstPtr + i + VEC_LINE * 3, v_mul(v_sumVal3, v_mulVal));
         }
 
         for (; i <= widthCN - VEC_LINE; i += VEC_LINE)
         {
             v_float32 v_srcPtr = vx_load(srcPtr + i);
-            v_float32 v_sumVal = v_srcPtr + v_srcPtr + vx_load(srcPtr + i - CN) + vx_load(srcPtr + i + CN);
-            v_store(dstPtr + i, v_sumVal * v_mulVal);
+            v_float32 v_sumVal = v_add(v_add(v_add(v_srcPtr, v_srcPtr), vx_load(srcPtr + i - CN)), vx_load(srcPtr + i + CN));
+            v_store(dstPtr + i, v_mul(v_sumVal, v_mulVal));
         }
     }
     else
@@ -392,7 +392,7 @@ inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<us
         {
             v_float32 v_src = vx_load(srcPtr);
             v_float32 s0;
-            s0 = v_src * k0;
+            s0 = v_mul(v_src, k0);
 
             int k = 1, j = CN;
             for (; k <= kernelSize / 2 - 1; k += 2, j += 2 * CN)
@@ -400,17 +400,17 @@ inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<us
                 v_float32 k1 = vx_setall_f32((float)kx[k]);
                 v_float32 k2 = vx_setall_f32((float)kx[k + 1]);
 
-                s0 += (vx_load(srcPtr - j) + vx_load(srcPtr + j)) * k1;
-                s0 += (vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN)) * k2;
+                s0 = v_add(s0, v_mul(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1));
+                s0 = v_add(s0, v_mul(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2));
             }
             if( k < kernelSize / 2 + 1 )
             {
                 v_float32 k1 = vx_setall_f32((float)kx[k]);
 
-                s0 += (vx_load(srcPtr - j) + vx_load(srcPtr + j)) * k1;
+                s0 = v_add(s0, v_mul(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1));
             }
 
-            v_store(dstPtr + i, s0 * v_mulVal);
+            v_store(dstPtr + i, v_mul(s0, v_mulVal));
         }
     }
     return i;
@@ -426,8 +426,8 @@ template<>
 inline int opComputeDiff<uchar, int>(const uchar*& srcPtr, int*& diff0, const int w, const int CNR1)
 {
     int index = 0;
-    const int VEC_LINE_8 = v_uint8::nlanes;
-    const int VEC_LINE_32 = v_int32::nlanes;
+    const int VEC_LINE_8 = VTraits<v_uint8>::vlanes();
+    const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
     for (; index <= w - VEC_LINE_8; index += VEC_LINE_8, diff0+=VEC_LINE_8, srcPtr+=VEC_LINE_8)
     {
         v_uint16 x0l, x0h, x1l, x1h;
@@ -435,8 +435,8 @@ inline int opComputeDiff<uchar, int>(const uchar*& srcPtr, int*& diff0, const in
         v_expand(vx_load(srcPtr), x1l, x1h);
 
         v_int32 y0, y1, y2, y3;
-        v_expand(v_reinterpret_as_s16(x0l) - v_reinterpret_as_s16(x1l), y0, y1);
-        v_expand(v_reinterpret_as_s16(x0h) - v_reinterpret_as_s16(x1h), y2, y3);
+        v_expand(v_sub(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x1l)), y0, y1);
+        v_expand(v_sub(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x1h)), y2, y3);
 
         v_store(diff0, y0);
         v_store(diff0 + VEC_LINE_32, y1);
@@ -517,7 +517,7 @@ class ParallelStackBlurRow : public ParallelLoopBody
 
                 // middle
                 int wc = radius * CN;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 wc = opRow<T>(srcPtr, dstPtr, kVec, mulVal, radius, CN, widthCN);
 #endif
                 for (; wc < widthCN; wc++)
@@ -547,7 +547,7 @@ class ParallelStackBlurRow : public ParallelLoopBody
         }
         else
         {
-            size_t bufSize = CN * (width + radius) * sizeof(TBuf) + 2 * CN * sizeof(TBuf);
+            size_t bufSize = CN * (width + kernelSize) * sizeof(TBuf) + 2 * CN * sizeof(TBuf);
             AutoBuffer<uchar> _buf(bufSize + 16);
             uchar* bufptr = alignPtr(_buf.data(), 16);
             TBuf* diffVal = (TBuf*)bufptr;
@@ -586,7 +586,7 @@ class ParallelStackBlurRow : public ParallelLoopBody
                 // middle
                 auto diff0 = diff + radius * CN;
                 int index = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 index = opComputeDiff(srcPtr, diff0, widthCN, CNR1);
 #endif
 
@@ -688,7 +688,7 @@ class ParallelStackBlurRow : public ParallelLoopBody
     float mulVal;
 };
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 template<typename T, typename TBuf>
 inline int opColumn(const T* , T* , T* , TBuf* , TBuf* , TBuf* , const float ,
                     const int , const int , const int , const int , const int )
@@ -703,7 +703,7 @@ inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* sta
 {
     int k = 0;
     v_float32 v_mulVal = vx_setall_f32(mulVal);
-    const int VEC_LINE = v_float32::nlanes;
+    const int VEC_LINE = VTraits<v_float32>::vlanes();
     const int VEC_LINE4 = 4 * VEC_LINE;
 
     auto stackStartPtr = stack + ss * widthLen;
@@ -726,20 +726,20 @@ inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* sta
         v_float32 v_sumIn2 = vx_load(sumIn + VEC_LINE * 2 + k);
         v_float32 v_sumIn3 = vx_load(sumIn + VEC_LINE * 3+ k);
 
-        v_store(dstPtr + k, v_sum0 * v_mulVal);
-        v_store(dstPtr + VEC_LINE + k, v_sum1 * v_mulVal);
-        v_store(dstPtr + VEC_LINE * 2 + k, v_sum2 * v_mulVal);
-        v_store(dstPtr + VEC_LINE * 3 + k, v_sum3 * v_mulVal);
+        v_store(dstPtr + k, v_mul(v_sum0, v_mulVal));
+        v_store(dstPtr + VEC_LINE + k, v_mul(v_sum1, v_mulVal));
+        v_store(dstPtr + VEC_LINE * 2 + k, v_mul(v_sum2, v_mulVal));
+        v_store(dstPtr + VEC_LINE * 3 + k, v_mul(v_sum3, v_mulVal));
 
-        v_sum0 -= v_sumOut0;
-        v_sum1 -= v_sumOut1;
-        v_sum2 -= v_sumOut2;
-        v_sum3 -= v_sumOut3;
+        v_sum0 = v_sub(v_sum0, v_sumOut0);
+        v_sum1 = v_sub(v_sum1, v_sumOut1);
+        v_sum2 = v_sub(v_sum2, v_sumOut2);
+        v_sum3 = v_sub(v_sum3, v_sumOut3);
 
-        v_sumOut0 -= vx_load(stackStartPtr + k);
-        v_sumOut1 -= vx_load(stackStartPtr + VEC_LINE + k);
-        v_sumOut2 -= vx_load(stackStartPtr + VEC_LINE * 2 + k);
-        v_sumOut3 -= vx_load(stackStartPtr + VEC_LINE * 3 + k);
+        v_sumOut0 = v_sub(v_sumOut0, vx_load(stackStartPtr + k));
+        v_sumOut1 = v_sub(v_sumOut1, vx_load(stackStartPtr + VEC_LINE + k));
+        v_sumOut2 = v_sub(v_sumOut2, vx_load(stackStartPtr + VEC_LINE * 2 + k));
+        v_sumOut3 = v_sub(v_sumOut3, vx_load(stackStartPtr + VEC_LINE * 3 + k));
 
         v_float32 v_srcPtr0 = vx_load(srcPtr + k);
         v_float32 v_srcPtr1 = vx_load(srcPtr + VEC_LINE + k);
@@ -751,35 +751,35 @@ inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* sta
         v_store(stackStartPtr + VEC_LINE * 2 + k, v_srcPtr2);
         v_store(stackStartPtr + VEC_LINE * 3 + k, v_srcPtr3);
 
-        v_sumIn0 += v_srcPtr0;
-        v_sumIn1 += v_srcPtr1;
-        v_sumIn2 += v_srcPtr2;
-        v_sumIn3 += v_srcPtr3;
+        v_sumIn0 = v_add(v_sumIn0, v_srcPtr0);
+        v_sumIn1 = v_add(v_sumIn1, v_srcPtr1);
+        v_sumIn2 = v_add(v_sumIn2, v_srcPtr2);
+        v_sumIn3 = v_add(v_sumIn3, v_srcPtr3);
 
-        v_store(sum + k, v_sum0 + v_sumIn0);
-        v_store(sum + VEC_LINE + k, v_sum1 + v_sumIn1);
-        v_store(sum + VEC_LINE * 2 + k, v_sum2 + v_sumIn2);
-        v_store(sum + VEC_LINE * 3 + k, v_sum3 + v_sumIn3);
+        v_store(sum + k, v_add(v_sum0, v_sumIn0));
+        v_store(sum + VEC_LINE + k, v_add(v_sum1, v_sumIn1));
+        v_store(sum + VEC_LINE * 2 + k, v_add(v_sum2, v_sumIn2));
+        v_store(sum + VEC_LINE * 3 + k, v_add(v_sum3, v_sumIn3));
 
         v_srcPtr0 = vx_load(stackSp1Ptr + k);
         v_srcPtr1 = vx_load(stackSp1Ptr + VEC_LINE + k);
         v_srcPtr2 = vx_load(stackSp1Ptr + VEC_LINE * 2 +  k);
         v_srcPtr3 = vx_load(stackSp1Ptr + VEC_LINE * 3 + k);
 
-        v_sumOut0 += v_srcPtr0;
-        v_sumOut1 += v_srcPtr1;
-        v_sumOut2 += v_srcPtr2;
-        v_sumOut3 += v_srcPtr3;
+        v_sumOut0 = v_add(v_sumOut0, v_srcPtr0);
+        v_sumOut1 = v_add(v_sumOut1, v_srcPtr1);
+        v_sumOut2 = v_add(v_sumOut2, v_srcPtr2);
+        v_sumOut3 = v_add(v_sumOut3, v_srcPtr3);
 
         v_store(sumOut + k, v_sumOut0);
         v_store(sumOut + VEC_LINE + k, v_sumOut1);
         v_store(sumOut + VEC_LINE * 2 + k, v_sumOut2);
         v_store(sumOut + VEC_LINE * 3 + k, v_sumOut3);
 
-        v_sumIn0 -= v_srcPtr0;
-        v_sumIn1 -= v_srcPtr1;
-        v_sumIn2 -= v_srcPtr2;
-        v_sumIn3 -= v_srcPtr3;
+        v_sumIn0 = v_sub(v_sumIn0, v_srcPtr0);
+        v_sumIn1 = v_sub(v_sumIn1, v_srcPtr1);
+        v_sumIn2 = v_sub(v_sumIn2, v_srcPtr2);
+        v_sumIn3 = v_sub(v_sumIn3, v_srcPtr3);
 
         v_store(sumIn + k, v_sumIn0);
         v_store(sumIn + VEC_LINE + k, v_sumIn1);
@@ -793,20 +793,20 @@ inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* sta
         v_float32 v_sumOut = vx_load(sumOut + k);
         v_float32 v_sumIn = vx_load(sumIn + k);
 
-        v_store(dstPtr + k, v_sum * v_mulVal);
-        v_sum -= v_sumOut;
-        v_sumOut -= vx_load(stackStartPtr + k);
+        v_store(dstPtr + k, v_mul(v_sum, v_mulVal));
+        v_sum = v_sub(v_sum, v_sumOut);
+        v_sumOut = v_sub(v_sumOut, vx_load(stackStartPtr + k));
 
         v_float32 v_srcPtr = vx_load(srcPtr + k);
         v_store(stackStartPtr + k, v_srcPtr);
 
-        v_sumIn += v_srcPtr;
-        v_store(sum + k, v_sum + v_sumIn);
+        v_sumIn = v_add(v_sumIn, v_srcPtr);
+        v_store(sum + k, v_add(v_sum, v_sumIn));
 
         v_srcPtr = vx_load(stackSp1Ptr + k);
-        v_sumOut += v_srcPtr;
+        v_sumOut = v_add(v_sumOut, v_srcPtr);
         v_store(sumOut + k, v_sumOut);
-        v_sumIn -= v_srcPtr;
+        v_sumIn = v_sub(v_sumIn, v_srcPtr);
         v_store(sumIn + k, v_sumIn);
     }
     return k;
@@ -820,8 +820,8 @@ inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack
     int k = 0;
     if (mulValTab != 0 && shrValTab != 0)
     {
-        const int VEC_LINE_8 = v_uint8::nlanes;
-        const int VEC_LINE_32 = v_int32::nlanes;
+        const int VEC_LINE_8 = VTraits<v_uint8>::vlanes();
+        const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
         v_int32 v_mulVal = vx_setall_s32(mulValTab);
 
         auto stackStartPtr = stack + ss * widthLen;
@@ -850,13 +850,13 @@ inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack
 
             v_store(dstPtr + k,
                     v_pack(
-                            v_reinterpret_as_u16(v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab)),
-                            v_reinterpret_as_u16(v_pack((v_sum2 * v_mulVal)>>shrValTab, (v_sum3 * v_mulVal)>>shrValTab))));
+                            v_reinterpret_as_u16(v_pack(v_shr(v_mul(v_sum0, v_mulVal), shrValTab), v_shr(v_mul(v_sum1, v_mulVal), shrValTab))),
+                            v_reinterpret_as_u16(v_pack(v_shr(v_mul(v_sum2, v_mulVal), shrValTab), v_shr(v_mul(v_sum3, v_mulVal), shrValTab)))));
 
-            v_sum0 -= v_sumOut0;
-            v_sum1 -= v_sumOut1;
-            v_sum2 -= v_sumOut2;
-            v_sum3 -= v_sumOut3;
+            v_sum0 = v_sub(v_sum0, v_sumOut0);
+            v_sum1 = v_sub(v_sum1, v_sumOut1);
+            v_sum2 = v_sub(v_sum2, v_sumOut2);
+            v_sum3 = v_sub(v_sum3, v_sumOut3);
 
             v_uint16 x0l, x0h;
             v_int32 v_ss0, v_ss1, v_ss2, v_ss3;
@@ -865,10 +865,10 @@ inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack
             v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1);
             v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3);
 
-            v_sumOut0 -= v_ss0;
-            v_sumOut1 -= v_ss1;
-            v_sumOut2 -= v_ss2;
-            v_sumOut3 -= v_ss3;
+            v_sumOut0 = v_sub(v_sumOut0, v_ss0);
+            v_sumOut1 = v_sub(v_sumOut1, v_ss1);
+            v_sumOut2 = v_sub(v_sumOut2, v_ss2);
+            v_sumOut3 = v_sub(v_sumOut3, v_ss3);
 
             v_expand(vx_load(srcPtr + k), x0l, x0h);
             v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1);
@@ -876,34 +876,34 @@ inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack
 
             memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_8 * sizeof (uchar));
 
-            v_sumIn0 += v_ss0;
-            v_sumIn1 += v_ss1;
-            v_sumIn2 += v_ss2;
-            v_sumIn3 += v_ss3;
+            v_sumIn0 = v_add(v_sumIn0, v_ss0);
+            v_sumIn1 = v_add(v_sumIn1, v_ss1);
+            v_sumIn2 = v_add(v_sumIn2, v_ss2);
+            v_sumIn3 = v_add(v_sumIn3, v_ss3);
 
-            v_store(sum + k, v_sum0 + v_sumIn0);
-            v_store(sum + VEC_LINE_32 + k, v_sum1 + v_sumIn1);
-            v_store(sum + VEC_LINE_32 * 2 + k, v_sum2 + v_sumIn2);
-            v_store(sum + VEC_LINE_32 * 3 + k, v_sum3 + v_sumIn3);
+            v_store(sum + k, v_add(v_sum0, v_sumIn0));
+            v_store(sum + VEC_LINE_32 + k, v_add(v_sum1, v_sumIn1));
+            v_store(sum + VEC_LINE_32 * 2 + k, v_add(v_sum2, v_sumIn2));
+            v_store(sum + VEC_LINE_32 * 3 + k, v_add(v_sum3, v_sumIn3));
 
             v_expand(vx_load(stackSp1Ptr + k), x0l, x0h);
             v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1);
             v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3);
 
-            v_sumOut0 += v_ss0;
-            v_sumOut1 += v_ss1;
-            v_sumOut2 += v_ss2;
-            v_sumOut3 += v_ss3;
+            v_sumOut0 = v_add(v_sumOut0, v_ss0);
+            v_sumOut1 = v_add(v_sumOut1, v_ss1);
+            v_sumOut2 = v_add(v_sumOut2, v_ss2);
+            v_sumOut3 = v_add(v_sumOut3, v_ss3);
 
             v_store(sumOut + k, v_sumOut0);
             v_store(sumOut + VEC_LINE_32 + k, v_sumOut1);
             v_store(sumOut + VEC_LINE_32 * 2 + k, v_sumOut2);
             v_store(sumOut + VEC_LINE_32 * 3 + k, v_sumOut3);
 
-            v_sumIn0 -= v_ss0;
-            v_sumIn1 -= v_ss1;
-            v_sumIn2 -= v_ss2;
-            v_sumIn3 -= v_ss3;
+            v_sumIn0 = v_sub(v_sumIn0, v_ss0);
+            v_sumIn1 = v_sub(v_sumIn1, v_ss1);
+            v_sumIn2 = v_sub(v_sumIn2, v_ss2);
+            v_sumIn3 = v_sub(v_sumIn3, v_ss3);
 
             v_store(sumIn + k, v_sumIn0);
             v_store(sumIn + VEC_LINE_32 + k, v_sumIn1);
@@ -922,8 +922,8 @@ inline int opColumn<short, int>(const short* srcPtr, short* dstPtr, short* stack
     int k = 0;
     if (mulValTab != 0 && shrValTab != 0)
     {
-        const int VEC_LINE_16 = v_int16::nlanes;
-        const int VEC_LINE_32 = v_int32::nlanes;
+        const int VEC_LINE_16 = VTraits<v_int16>::vlanes();
+        const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
         v_int32 v_mulVal = vx_setall_s32(mulValTab);
 
         auto stackStartPtr = stack + ss * widthLen;
@@ -943,39 +943,39 @@ inline int opColumn<short, int>(const short* srcPtr, short* dstPtr, short* stack
             v_sumOut0 = vx_load(sumOut + k);
             v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32);
 
-            v_store(dstPtr + k,v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab));
+            v_store(dstPtr + k,v_pack(v_shr(v_mul(v_sum0, v_mulVal), shrValTab), v_shr(v_mul(v_sum1, v_mulVal), shrValTab)));
 
-            v_sum0 -= v_sumOut0;
-            v_sum1 -= v_sumOut1;
+            v_sum0 = v_sub(v_sum0, v_sumOut0);
+            v_sum1 = v_sub(v_sum1, v_sumOut1);
 
             v_int32 v_ss0, v_ss1;
             v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1);
 
-            v_sumOut0 -= v_ss0;
-            v_sumOut1 -= v_ss1;
+            v_sumOut0 = v_sub(v_sumOut0, v_ss0);
+            v_sumOut1 = v_sub(v_sumOut1, v_ss1);
 
             v_expand(vx_load(srcPtr + k), v_ss0, v_ss1);
             memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (short));
 
-            v_sumIn0 += v_ss0;
-            v_sumIn1 += v_ss1;
+            v_sumIn0 = v_add(v_sumIn0, v_ss0);
+            v_sumIn1 = v_add(v_sumIn1, v_ss1);
 
-            v_sum0 += v_sumIn0;
-            v_sum1 += v_sumIn1;
+            v_sum0 = v_add(v_sum0, v_sumIn0);
+            v_sum1 = v_add(v_sum1, v_sumIn1);
 
             v_store(sum + k, v_sum0);
             v_store(sum + VEC_LINE_32 + k, v_sum1);
 
             v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1);
 
-            v_sumOut0 += v_ss0;
-            v_sumOut1 += v_ss1;
+            v_sumOut0 = v_add(v_sumOut0, v_ss0);
+            v_sumOut1 = v_add(v_sumOut1, v_ss1);
 
             v_store(sumOut + k, v_sumOut0);
             v_store(sumOut + VEC_LINE_32 + k, v_sumOut1);
 
-            v_sumIn0 -= v_ss0;
-            v_sumIn1 -= v_ss1;
+            v_sumIn0 = v_sub(v_sumIn0, v_ss0);
+            v_sumIn1 = v_sub(v_sumIn1, v_ss1);
 
             v_store(sumIn + k, v_sumIn0);
             v_store(sumIn + VEC_LINE_32 + k, v_sumIn1);
@@ -992,8 +992,8 @@ inline int opColumn<ushort, int>(const ushort* srcPtr, ushort* dstPtr, ushort* s
     int k = 0;
     if (mulValTab != 0 && shrValTab != 0)
     {
-        const int VEC_LINE_16 = v_uint16::nlanes;
-        const int VEC_LINE_32 = v_int32::nlanes;
+        const int VEC_LINE_16 = VTraits<v_uint16>::vlanes();
+        const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
         v_uint32 v_mulVal = vx_setall_u32((uint32_t)mulValTab);
 
         auto stackStartPtr = stack + ss * widthLen;
@@ -1013,40 +1013,40 @@ inline int opColumn<ushort, int>(const ushort* srcPtr, ushort* dstPtr, ushort* s
             v_sumOut0 = vx_load(sumOut + k);
             v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32);
 
-            v_store(dstPtr + k, v_pack((v_reinterpret_as_u32(v_sum0) * v_mulVal)>>shrValTab, (v_reinterpret_as_u32(v_sum1) * v_mulVal)>>shrValTab));
+            v_store(dstPtr + k, v_pack(v_shr(v_mul(v_reinterpret_as_u32(v_sum0), v_mulVal), shrValTab), v_shr(v_mul(v_reinterpret_as_u32(v_sum1), v_mulVal), shrValTab)));
 
-            v_sum0 -= v_sumOut0;
-            v_sum1 -= v_sumOut1;
+            v_sum0 = v_sub(v_sum0, v_sumOut0);
+            v_sum1 = v_sub(v_sum1, v_sumOut1);
 
             v_uint32 v_ss0, v_ss1;
             v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1);
 
-            v_sumOut0 -= v_reinterpret_as_s32(v_ss0);
-            v_sumOut1 -= v_reinterpret_as_s32(v_ss1);
+            v_sumOut0 = v_sub(v_sumOut0, v_reinterpret_as_s32(v_ss0));
+            v_sumOut1 = v_sub(v_sumOut1, v_reinterpret_as_s32(v_ss1));
 
             v_expand(vx_load(srcPtr + k), v_ss0, v_ss1);
 
             memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (ushort));
 
-            v_sumIn0 += v_reinterpret_as_s32(v_ss0);
-            v_sumIn1 += v_reinterpret_as_s32(v_ss1);
+            v_sumIn0 = v_add(v_sumIn0, v_reinterpret_as_s32(v_ss0));
+            v_sumIn1 = v_add(v_sumIn1, v_reinterpret_as_s32(v_ss1));
 
-            v_sum0 += v_sumIn0;
-            v_sum1 += v_sumIn1;
+            v_sum0 = v_add(v_sum0, v_sumIn0);
+            v_sum1 = v_add(v_sum1, v_sumIn1);
 
             v_store(sum + k, v_sum0);
             v_store(sum + VEC_LINE_32 + k, v_sum1);
 
             v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1);
 
-            v_sumOut0 += v_reinterpret_as_s32(v_ss0);
-            v_sumOut1 += v_reinterpret_as_s32(v_ss1);
+            v_sumOut0 = v_add(v_sumOut0, v_reinterpret_as_s32(v_ss0));
+            v_sumOut1 = v_add(v_sumOut1, v_reinterpret_as_s32(v_ss1));
 
             v_store(sumOut + k, v_sumOut0);
             v_store(sumOut + VEC_LINE_32 + k, v_sumOut1);
 
-            v_sumIn0 -= v_reinterpret_as_s32(v_ss0);
-            v_sumIn1 -= v_reinterpret_as_s32(v_ss1);
+            v_sumIn0 = v_sub(v_sumIn0, v_reinterpret_as_s32(v_ss0));
+            v_sumIn1 = v_sub(v_sumIn1, v_reinterpret_as_s32(v_ss1));
 
             v_store(sumIn + k, v_sumIn0);
             v_store(sumIn + VEC_LINE_32 + k, v_sumIn1);
@@ -1152,7 +1152,7 @@ class ParallelStackBlurColumn:
             }
 
             int k = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             k = opColumn<T, TBuf>(srcPtr, dstPtr, stack, sum, sumIn, sumOut, mulVal, mulValTab, shrValTab,
                                       widthLen, stackStart, sp1);
 #endif
diff --git a/modules/imgproc/src/subdivision2d.cpp b/modules/imgproc/src/subdivision2d.cpp
index 24075ac6ebb4..2a2ab1dfeb7b 100644
--- a/modules/imgproc/src/subdivision2d.cpp
+++ b/modules/imgproc/src/subdivision2d.cpp
@@ -282,10 +282,10 @@ int Subdiv2D::locate(Point2f pt, int& _edge, int& _vertex)
     int i, maxEdges = (int)(qedges.size() * 4);
 
     if( qedges.size() < (size_t)4 )
-        CV_Error( CV_StsError, "Subdivision is empty" );
+        CV_Error( cv::Error::StsError, "Subdivision is empty" );
 
     if( pt.x < topLeft.x || pt.y < topLeft.y || pt.x >= bottomRight.x || pt.y >= bottomRight.y )
-        CV_Error( CV_StsOutOfRange, "" );
+        CV_Error( cv::Error::StsOutOfRange, "" );
 
     int edge = recentEdge;
     CV_Assert(edge > 0);
@@ -417,10 +417,10 @@ int Subdiv2D::insert(Point2f pt)
     int location = locate( pt, curr_edge, curr_point );
 
     if( location == PTLOC_ERROR )
-        CV_Error( CV_StsBadSize, "" );
+        CV_Error( cv::Error::StsBadSize, "" );
 
     if( location == PTLOC_OUTSIDE_RECT )
-        CV_Error( CV_StsOutOfRange, "" );
+        CV_Error( cv::Error::StsOutOfRange, "" );
 
     if( location == PTLOC_VERTEX )
         return curr_point;
@@ -434,7 +434,7 @@ int Subdiv2D::insert(Point2f pt)
     else if( location == PTLOC_INSIDE )
         ;
     else
-        CV_Error_(CV_StsError, ("Subdiv2D::locate returned invalid location = %d", location) );
+        CV_Error_(cv::Error::StsError, ("Subdiv2D::locate returned invalid location = %d", location) );
 
     CV_Assert( curr_edge != 0 );
     validGeometry = false;
diff --git a/modules/imgproc/src/sumpixels.avx512_skx.hpp b/modules/imgproc/src/sumpixels.avx512_skx.hpp
index 81d9d1d846ef..09b777b2687a 100644
--- a/modules/imgproc/src/sumpixels.avx512_skx.hpp
+++ b/modules/imgproc/src/sumpixels.avx512_skx.hpp
@@ -26,7 +26,7 @@ template<size_t num_channels> class IntegralCalculator;
 template<size_t num_channels>
 class IntegralCalculator  {
 public:
-    IntegralCalculator() {};
+    IntegralCalculator() {}
 
 
     void calculate_integral_avx512(const uchar *src, size_t _srcstep,
diff --git a/modules/imgproc/src/sumpixels.simd.hpp b/modules/imgproc/src/sumpixels.simd.hpp
index f5f3a92d852b..208ffc123111 100644
--- a/modules/imgproc/src/sumpixels.simd.hpp
+++ b/modules/imgproc/src/sumpixels.simd.hpp
@@ -114,7 +114,7 @@ struct Integral_SIMD<uchar, int, double>
 
                 v_int32 prev = vx_setzero_s32();
                 int j = 0;
-                for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
+                for ( ; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
                 {
                     v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                     v_int32 el4l, el4h;
@@ -127,8 +127,8 @@ struct Integral_SIMD<uchar, int, double>
                     el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permutevar8x32_epi32(el4l.val, shmask));
                     prev.val = _mm256_permutevar8x32_epi32(el4h.val, shmask);
 #else
-                    el8 += v_rotate_left<1>(el8);
-                    el8 += v_rotate_left<2>(el8);
+                    el8 = v_add(el8, v_rotate_left<1>(el8));
+                    el8 = v_add(el8, v_rotate_left<2>(el8));
 #if CV_SIMD_WIDTH >= 32
                     el8 += v_rotate_left<4>(el8);
 #if CV_SIMD_WIDTH == 64
@@ -136,12 +136,12 @@ struct Integral_SIMD<uchar, int, double>
 #endif
 #endif
                     v_expand(el8, el4l, el4h);
-                    el4l += prev;
-                    el4h += el4l;
-                    prev = v_broadcast_element<v_int32::nlanes - 1>(el4h);
+                    el4l = v_add(el4l, prev);
+                    el4h = v_add(el4h, el4l);
+                    prev = v_broadcast_highest(el4h);
 #endif
-                    v_store(sum_row + j                  , el4l + vx_load(prev_sum_row + j                  ));
-                    v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
+                    v_store(sum_row + j                  , v_add(el4l, vx_load(prev_sum_row + j)));
+                    v_store(sum_row + j + VTraits<v_int32>::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes())));
                 }
 
                 for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
@@ -162,11 +162,11 @@ struct Integral_SIMD<uchar, int, double>
 
                 v_int32 prev_1 = vx_setzero_s32(), prev_2 = vx_setzero_s32();
                 int j = 0;
-                for ( ; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
+                for ( ; j + VTraits<v_uint16>::vlanes() * cn <= width; j += VTraits<v_uint16>::vlanes() * cn)
                 {
                     v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j));
-                    v_int16 el8_1 = v_src_row & mask;
-                    v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8);
+                    v_int16 el8_1 = v_and(v_src_row, mask);
+                    v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row)));
                     v_int32 el4l_1, el4h_1, el4l_2, el4h_2;
 #if CV_AVX2 && CV_SIMD_WIDTH == 32
                     __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
@@ -183,10 +183,10 @@ struct Integral_SIMD<uchar, int, double>
                     prev_1.val = _mm256_permutevar8x32_epi32(el4h_1.val, shmask);
                     prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask);
 #else
-                    el8_1 += v_rotate_left<1>(el8_1);
-                    el8_2 += v_rotate_left<1>(el8_2);
-                    el8_1 += v_rotate_left<2>(el8_1);
-                    el8_2 += v_rotate_left<2>(el8_2);
+                    el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
+                    el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
+                    el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
+                    el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
 #if CV_SIMD_WIDTH >= 32
                     el8_1 += v_rotate_left<4>(el8_1);
                     el8_2 += v_rotate_left<4>(el8_2);
@@ -197,20 +197,20 @@ struct Integral_SIMD<uchar, int, double>
 #endif
                     v_expand(el8_1, el4l_1, el4h_1);
                     v_expand(el8_2, el4l_2, el4h_2);
-                    el4l_1 += prev_1;
-                    el4l_2 += prev_2;
-                    el4h_1 += el4l_1;
-                    el4h_2 += el4l_2;
-                    prev_1 = v_broadcast_element<v_int32::nlanes - 1>(el4h_1);
-                    prev_2 = v_broadcast_element<v_int32::nlanes - 1>(el4h_2);
+                    el4l_1 = v_add(el4l_1, prev_1);
+                    el4l_2 = v_add(el4l_2, prev_2);
+                    el4h_1 = v_add(el4h_1, el4l_1);
+                    el4h_2 = v_add(el4h_2, el4l_2);
+                    prev_1 = v_broadcast_highest(el4h_1);
+                    prev_2 = v_broadcast_highest(el4h_2);
 #endif
                     v_int32 el4_1, el4_2, el4_3, el4_4;
                     v_zip(el4l_1, el4l_2, el4_1, el4_2);
                     v_zip(el4h_1, el4h_2, el4_3, el4_4);
-                    v_store(sum_row + j                      , el4_1 + vx_load(prev_sum_row + j                      ));
-                    v_store(sum_row + j + v_int32::nlanes    , el4_2 + vx_load(prev_sum_row + j + v_int32::nlanes    ));
-                    v_store(sum_row + j + v_int32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2));
-                    v_store(sum_row + j + v_int32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_int32::nlanes * 3));
+                    v_store(sum_row + j                      , v_add(el4_1, vx_load(prev_sum_row + j)));
+                    v_store(sum_row + j + VTraits<v_int32>::vlanes()    , v_add(el4_2, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes())));
+                    v_store(sum_row + j + VTraits<v_int32>::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 2)));
+                    v_store(sum_row + j + VTraits<v_int32>::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 3)));
                 }
 
                 for (int v2 = sum_row[j - 1] - prev_sum_row[j - 1],
@@ -230,7 +230,7 @@ struct Integral_SIMD<uchar, int, double>
                 const uchar * src_row = src + _srcstep * i;
                 int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + cn;
                 int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + cn;
-                int row_cache[v_int32::nlanes * 6];
+                int row_cache[VTraits<v_int32>::max_nlanes * 6];
 
                 sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
 
@@ -238,10 +238,10 @@ struct Integral_SIMD<uchar, int, double>
                         prev_3 = vx_setzero_s32();
                 int j = 0;
                 const int j_max =
-                        ((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height)
-                        ? width - v_uint8::nlanes * cn    // uint8 in v_load_deinterleave()
-                        : width - v_uint16::nlanes * cn;  // v_expand_low
-                for ( ; j <= j_max; j += v_uint16::nlanes * cn)
+                        ((_srcstep * i + (width - VTraits<v_uint16>::vlanes() * cn + VTraits<v_uint8>::vlanes() * cn)) >= _srcstep * height)
+                        ? width - VTraits<v_uint8>::vlanes() * cn    // uint8 in v_load_deinterleave()
+                        : width - VTraits<v_uint16>::vlanes() * cn;  // v_expand_low
+                for ( ; j <= j_max; j += VTraits<v_uint16>::vlanes() * cn)
                 {
                     v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
                     v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
@@ -270,49 +270,49 @@ struct Integral_SIMD<uchar, int, double>
                     prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask);
                     prev_3.val = _mm256_permutevar8x32_epi32(el4h_3.val, shmask);
 #else
-                    el8_1 += v_rotate_left<1>(el8_1);
-                    el8_2 += v_rotate_left<1>(el8_2);
-                    el8_3 += v_rotate_left<1>(el8_3);
-                    el8_1 += v_rotate_left<2>(el8_1);
-                    el8_2 += v_rotate_left<2>(el8_2);
-                    el8_3 += v_rotate_left<2>(el8_3);
+                    el8_1 = v_add(el8_1,v_rotate_left<1>(el8_1));
+                    el8_2 = v_add(el8_2,v_rotate_left<1>(el8_2));
+                    el8_3 = v_add(el8_3,v_rotate_left<1>(el8_3));
+                    el8_1 = v_add(el8_1,v_rotate_left<2>(el8_1));
+                    el8_2 = v_add(el8_2,v_rotate_left<2>(el8_2));
+                    el8_3 = v_add(el8_3,v_rotate_left<2>(el8_3));
 #if CV_SIMD_WIDTH >= 32
-                    el8_1 += v_rotate_left<4>(el8_1);
-                    el8_2 += v_rotate_left<4>(el8_2);
-                    el8_3 += v_rotate_left<4>(el8_3);
+                    el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1));
+                    el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2));
+                    el8_3 = v_add(el8_3, v_rotate_left<4>(el8_3));
 #if CV_SIMD_WIDTH == 64
-                    el8_1 += v_rotate_left<8>(el8_1);
-                    el8_2 += v_rotate_left<8>(el8_2);
-                    el8_3 += v_rotate_left<8>(el8_3);
+                    el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1));
+                    el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2));
+                    el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3));
 #endif
 #endif
                     v_expand(el8_1, el4l_1, el4h_1);
                     v_expand(el8_2, el4l_2, el4h_2);
                     v_expand(el8_3, el4l_3, el4h_3);
-                    el4l_1 += prev_1;
-                    el4l_2 += prev_2;
-                    el4l_3 += prev_3;
-                    el4h_1 += el4l_1;
-                    el4h_2 += el4l_2;
-                    el4h_3 += el4l_3;
-                    prev_1 = v_broadcast_element<v_int32::nlanes - 1>(el4h_1);
-                    prev_2 = v_broadcast_element<v_int32::nlanes - 1>(el4h_2);
-                    prev_3 = v_broadcast_element<v_int32::nlanes - 1>(el4h_3);
+                    el4l_1 = v_add(el4l_1, prev_1);
+                    el4l_2 = v_add(el4l_2, prev_2);
+                    el4l_3 = v_add(el4l_3, prev_3);
+                    el4h_1 = v_add(el4h_1, el4l_1);
+                    el4h_2 = v_add(el4h_2, el4l_2);
+                    el4h_3 = v_add(el4h_3, el4l_3);
+                    prev_1 = v_broadcast_highest(el4h_1);
+                    prev_2 = v_broadcast_highest(el4h_2);
+                    prev_3 = v_broadcast_highest(el4h_3);
 #endif
                     v_store_interleave(row_cache                      , el4l_1, el4l_2, el4l_3);
-                    v_store_interleave(row_cache + v_int32::nlanes * 3, el4h_1, el4h_2, el4h_3);
+                    v_store_interleave(row_cache + VTraits<v_int32>::vlanes() * 3, el4h_1, el4h_2, el4h_3);
                     el4l_1 = vx_load(row_cache                      );
-                    el4l_2 = vx_load(row_cache + v_int32::nlanes    );
-                    el4l_3 = vx_load(row_cache + v_int32::nlanes * 2);
-                    el4h_1 = vx_load(row_cache + v_int32::nlanes * 3);
-                    el4h_2 = vx_load(row_cache + v_int32::nlanes * 4);
-                    el4h_3 = vx_load(row_cache + v_int32::nlanes * 5);
-                    v_store(sum_row + j                      , el4l_1 + vx_load(prev_sum_row + j                      ));
-                    v_store(sum_row + j + v_int32::nlanes    , el4l_2 + vx_load(prev_sum_row + j + v_int32::nlanes    ));
-                    v_store(sum_row + j + v_int32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2));
-                    v_store(sum_row + j + v_int32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_int32::nlanes * 3));
-                    v_store(sum_row + j + v_int32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_int32::nlanes * 4));
-                    v_store(sum_row + j + v_int32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 5));
+                    el4l_2 = vx_load(row_cache + VTraits<v_int32>::vlanes()    );
+                    el4l_3 = vx_load(row_cache + VTraits<v_int32>::vlanes() * 2);
+                    el4h_1 = vx_load(row_cache + VTraits<v_int32>::vlanes() * 3);
+                    el4h_2 = vx_load(row_cache + VTraits<v_int32>::vlanes() * 4);
+                    el4h_3 = vx_load(row_cache + VTraits<v_int32>::vlanes() * 5);
+                    v_store(sum_row + j                      ,            v_add(el4l_1, vx_load(prev_sum_row + j                      )));
+                    v_store(sum_row + j + VTraits<v_int32>::vlanes()    , v_add(el4l_2, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes()    )));
+                    v_store(sum_row + j + VTraits<v_int32>::vlanes() * 2, v_add(el4l_3, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 2)));
+                    v_store(sum_row + j + VTraits<v_int32>::vlanes() * 3, v_add(el4h_1, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 3)));
+                    v_store(sum_row + j + VTraits<v_int32>::vlanes() * 4, v_add(el4h_2, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 4)));
+                    v_store(sum_row + j + VTraits<v_int32>::vlanes() * 5, v_add(el4h_3, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes() * 5)));
                 }
 
                 for (int v3 = sum_row[j - 1] - prev_sum_row[j - 1],
@@ -339,7 +339,7 @@ struct Integral_SIMD<uchar, int, double>
 
                 v_int32 prev = vx_setzero_s32();
                 int j = 0;
-                for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
+                for ( ; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
                 {
                     v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                     v_int32 el4l, el4h;
@@ -356,8 +356,8 @@ struct Integral_SIMD<uchar, int, double>
 #endif
 #endif
                     v_expand(el8, el4l, el4h);
-                    el4l += prev;
-                    el4h += el4l;
+                    el4l = v_add(el4l, prev);
+                    el4h = v_add(el4h, el4l);
 #if CV_SIMD_WIDTH == 16
                     prev = el4h;
 #elif CV_SIMD_WIDTH == 32
@@ -368,8 +368,8 @@ struct Integral_SIMD<uchar, int, double>
                     prev = v_combine_low(t, t);
 #endif
 #endif
-                    v_store(sum_row + j                  , el4l + vx_load(prev_sum_row + j                  ));
-                    v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
+                    v_store(sum_row + j                  , v_add(el4l, vx_load(prev_sum_row + j)));
+                    v_store(sum_row + j + VTraits<v_int32>::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits<v_int32>::vlanes())));
                 }
 
                 for (int v4 = sum_row[j - 1] - prev_sum_row[j - 1],
@@ -426,7 +426,7 @@ struct Integral_SIMD<uchar, float, double>
 
                 v_float32 prev = vx_setzero_f32();
                 int j = 0;
-                for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
+                for (; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
                 {
                     v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                     v_float32 el4l, el4h;
@@ -439,8 +439,8 @@ struct Integral_SIMD<uchar, float, double>
                     el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permutevar8x32_ps(el4l.val, shmask));
                     prev.val = _mm256_permutevar8x32_ps(el4h.val, shmask);
 #else
-                    el8 += v_rotate_left<1>(el8);
-                    el8 += v_rotate_left<2>(el8);
+                    el8 = v_add(el8, v_rotate_left<1>(el8));
+                    el8 = v_add(el8, v_rotate_left<2>(el8));
 #if CV_SIMD_WIDTH >= 32
                     el8 += v_rotate_left<4>(el8);
 #if CV_SIMD_WIDTH == 64
@@ -449,12 +449,12 @@ struct Integral_SIMD<uchar, float, double>
 #endif
                     v_int32 el4li, el4hi;
                     v_expand(el8, el4li, el4hi);
-                    el4l = v_cvt_f32(el4li) + prev;
-                    el4h = v_cvt_f32(el4hi) + el4l;
-                    prev = v_broadcast_element<v_float32::nlanes - 1>(el4h);
+                    el4l = v_add(v_cvt_f32(el4li), prev);
+                    el4h = v_add(v_cvt_f32(el4hi), el4l);
+                    prev = v_broadcast_highest(el4h);
 #endif
-                    v_store(sum_row + j                    , el4l + vx_load(prev_sum_row + j                    ));
-                    v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));
+                    v_store(sum_row + j                    , v_add(el4l, vx_load(prev_sum_row + j)));
+                    v_store(sum_row + j + VTraits<v_float32>::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes())));
                 }
 
                 for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
@@ -475,11 +475,11 @@ struct Integral_SIMD<uchar, float, double>
 
                 v_float32 prev_1 = vx_setzero_f32(), prev_2 = vx_setzero_f32();
                 int j = 0;
-                for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
+                for (; j + VTraits<v_uint16>::vlanes() * cn <= width; j += VTraits<v_uint16>::vlanes() * cn)
                 {
                     v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j));
-                    v_int16 el8_1 = v_src_row & mask;
-                    v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8);
+                    v_int16 el8_1 = v_and(v_src_row, mask);
+                    v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row)));
                     v_float32 el4l_1, el4h_1, el4l_2, el4h_2;
 #if CV_AVX2 && CV_SIMD_WIDTH == 32
                     __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
@@ -496,10 +496,10 @@ struct Integral_SIMD<uchar, float, double>
                     prev_1.val = _mm256_permutevar8x32_ps(el4h_1.val, shmask);
                     prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask);
 #else
-                    el8_1 += v_rotate_left<1>(el8_1);
-                    el8_2 += v_rotate_left<1>(el8_2);
-                    el8_1 += v_rotate_left<2>(el8_1);
-                    el8_2 += v_rotate_left<2>(el8_2);
+                    el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
+                    el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
+                    el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
+                    el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
 #if CV_SIMD_WIDTH >= 32
                     el8_1 += v_rotate_left<4>(el8_1);
                     el8_2 += v_rotate_left<4>(el8_2);
@@ -511,20 +511,20 @@ struct Integral_SIMD<uchar, float, double>
                     v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2;
                     v_expand(el8_1, el4li_1, el4hi_1);
                     v_expand(el8_2, el4li_2, el4hi_2);
-                    el4l_1 = v_cvt_f32(el4li_1) + prev_1;
-                    el4l_2 = v_cvt_f32(el4li_2) + prev_2;
-                    el4h_1 = v_cvt_f32(el4hi_1) + el4l_1;
-                    el4h_2 = v_cvt_f32(el4hi_2) + el4l_2;
-                    prev_1 = v_broadcast_element<v_float32::nlanes - 1>(el4h_1);
-                    prev_2 = v_broadcast_element<v_float32::nlanes - 1>(el4h_2);
+                    el4l_1 = v_add(v_cvt_f32(el4li_1), prev_1);
+                    el4l_2 = v_add(v_cvt_f32(el4li_2), prev_2);
+                    el4h_1 = v_add(v_cvt_f32(el4hi_1), el4l_1);
+                    el4h_2 = v_add(v_cvt_f32(el4hi_2), el4l_2);
+                    prev_1 = v_broadcast_highest(el4h_1);
+                    prev_2 = v_broadcast_highest(el4h_2);
 #endif
                     v_float32 el4_1, el4_2, el4_3, el4_4;
                     v_zip(el4l_1, el4l_2, el4_1, el4_2);
                     v_zip(el4h_1, el4h_2, el4_3, el4_4);
-                    v_store(sum_row + j                        , el4_1 + vx_load(prev_sum_row + j                        ));
-                    v_store(sum_row + j + v_float32::nlanes    , el4_2 + vx_load(prev_sum_row + j + v_float32::nlanes    ));
-                    v_store(sum_row + j + v_float32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2));
-                    v_store(sum_row + j + v_float32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float32::nlanes * 3));
+                    v_store(sum_row + j                        , v_add(el4_1, vx_load(prev_sum_row + j)));
+                    v_store(sum_row + j + VTraits<v_float32>::vlanes()    , v_add(el4_2, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes())));
+                    v_store(sum_row + j + VTraits<v_float32>::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 2)));
+                    v_store(sum_row + j + VTraits<v_float32>::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 3)));
                 }
 
                 for (float v2 = sum_row[j - 1] - prev_sum_row[j - 1],
@@ -543,7 +543,7 @@ struct Integral_SIMD<uchar, float, double>
                 const uchar * src_row = src + _srcstep * i;
                 float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + cn;
                 float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + cn;
-                float row_cache[v_float32::nlanes * 6];
+                float row_cache[VTraits<v_float32>::max_nlanes * 6];
 
                 sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
 
@@ -551,10 +551,10 @@ struct Integral_SIMD<uchar, float, double>
                           prev_3 = vx_setzero_f32();
                 int j = 0;
                 const int j_max =
-                        ((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height)
-                        ? width - v_uint8::nlanes * cn    // uint8 in v_load_deinterleave()
-                        : width - v_uint16::nlanes * cn;  // v_expand_low
-                for ( ; j <= j_max; j += v_uint16::nlanes * cn)
+                        ((_srcstep * i + (width - VTraits<v_uint16>::vlanes() * cn + VTraits<v_uint8>::vlanes() * cn)) >= _srcstep * height)
+                        ? width - VTraits<v_uint8>::vlanes() * cn    // uint8 in v_load_deinterleave()
+                        : width - VTraits<v_uint16>::vlanes() * cn;  // v_expand_low
+                for ( ; j <= j_max; j += VTraits<v_uint16>::vlanes() * cn)
                 {
                     v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
                     v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
@@ -583,12 +583,12 @@ struct Integral_SIMD<uchar, float, double>
                     prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask);
                     prev_3.val = _mm256_permutevar8x32_ps(el4h_3.val, shmask);
 #else
-                    el8_1 += v_rotate_left<1>(el8_1);
-                    el8_2 += v_rotate_left<1>(el8_2);
-                    el8_3 += v_rotate_left<1>(el8_3);
-                    el8_1 += v_rotate_left<2>(el8_1);
-                    el8_2 += v_rotate_left<2>(el8_2);
-                    el8_3 += v_rotate_left<2>(el8_3);
+                    el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
+                    el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
+                    el8_3 = v_add(el8_3, v_rotate_left<1>(el8_3));
+                    el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
+                    el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
+                    el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3));
 #if CV_SIMD_WIDTH >= 32
                     el8_1 += v_rotate_left<4>(el8_1);
                     el8_2 += v_rotate_left<4>(el8_2);
@@ -603,30 +603,30 @@ struct Integral_SIMD<uchar, float, double>
                     v_expand(el8_1, el4li_1, el4hi_1);
                     v_expand(el8_2, el4li_2, el4hi_2);
                     v_expand(el8_3, el4li_3, el4hi_3);
-                    el4l_1 = v_cvt_f32(el4li_1) + prev_1;
-                    el4l_2 = v_cvt_f32(el4li_2) + prev_2;
-                    el4l_3 = v_cvt_f32(el4li_3) + prev_3;
-                    el4h_1 = v_cvt_f32(el4hi_1) + el4l_1;
-                    el4h_2 = v_cvt_f32(el4hi_2) + el4l_2;
-                    el4h_3 = v_cvt_f32(el4hi_3) + el4l_3;
-                    prev_1 = v_broadcast_element<v_float32::nlanes - 1>(el4h_1);
-                    prev_2 = v_broadcast_element<v_float32::nlanes - 1>(el4h_2);
-                    prev_3 = v_broadcast_element<v_float32::nlanes - 1>(el4h_3);
+                    el4l_1 = v_add(v_cvt_f32(el4li_1), prev_1);
+                    el4l_2 = v_add(v_cvt_f32(el4li_2), prev_2);
+                    el4l_3 = v_add(v_cvt_f32(el4li_3), prev_3);
+                    el4h_1 = v_add(v_cvt_f32(el4hi_1), el4l_1);
+                    el4h_2 = v_add(v_cvt_f32(el4hi_2), el4l_2);
+                    el4h_3 = v_add(v_cvt_f32(el4hi_3), el4l_3);
+                    prev_1 = v_broadcast_highest(el4h_1);
+                    prev_2 = v_broadcast_highest(el4h_2);
+                    prev_3 = v_broadcast_highest(el4h_3);
 #endif
                     v_store_interleave(row_cache                        , el4l_1, el4l_2, el4l_3);
-                    v_store_interleave(row_cache + v_float32::nlanes * 3, el4h_1, el4h_2, el4h_3);
+                    v_store_interleave(row_cache + VTraits<v_float32>::vlanes() * 3, el4h_1, el4h_2, el4h_3);
                     el4l_1 = vx_load(row_cache                        );
-                    el4l_2 = vx_load(row_cache + v_float32::nlanes    );
-                    el4l_3 = vx_load(row_cache + v_float32::nlanes * 2);
-                    el4h_1 = vx_load(row_cache + v_float32::nlanes * 3);
-                    el4h_2 = vx_load(row_cache + v_float32::nlanes * 4);
-                    el4h_3 = vx_load(row_cache + v_float32::nlanes * 5);
-                    v_store(sum_row + j                        , el4l_1 + vx_load(prev_sum_row + j                        ));
-                    v_store(sum_row + j + v_float32::nlanes    , el4l_2 + vx_load(prev_sum_row + j + v_float32::nlanes    ));
-                    v_store(sum_row + j + v_float32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2));
-                    v_store(sum_row + j + v_float32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_float32::nlanes * 3));
-                    v_store(sum_row + j + v_float32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_float32::nlanes * 4));
-                    v_store(sum_row + j + v_float32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 5));
+                    el4l_2 = vx_load(row_cache + VTraits<v_float32>::vlanes()    );
+                    el4l_3 = vx_load(row_cache + VTraits<v_float32>::vlanes() * 2);
+                    el4h_1 = vx_load(row_cache + VTraits<v_float32>::vlanes() * 3);
+                    el4h_2 = vx_load(row_cache + VTraits<v_float32>::vlanes() * 4);
+                    el4h_3 = vx_load(row_cache + VTraits<v_float32>::vlanes() * 5);
+                    v_store(sum_row + j                        , v_add(el4l_1, vx_load(prev_sum_row + j)));
+                    v_store(sum_row + j + VTraits<v_float32>::vlanes()    , v_add(el4l_2, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes())));
+                    v_store(sum_row + j + VTraits<v_float32>::vlanes() * 2, v_add(el4l_3, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 2)));
+                    v_store(sum_row + j + VTraits<v_float32>::vlanes() * 3, v_add(el4h_1, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 3)));
+                    v_store(sum_row + j + VTraits<v_float32>::vlanes() * 4, v_add(el4h_2, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 4)));
+                    v_store(sum_row + j + VTraits<v_float32>::vlanes() * 5, v_add(el4h_3, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes() * 5)));
                 }
 
                 for (float v3 = sum_row[j - 1] - prev_sum_row[j - 1],
@@ -652,7 +652,7 @@ struct Integral_SIMD<uchar, float, double>
 
                 v_float32 prev = vx_setzero_f32();
                 int j = 0;
-                for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
+                for ( ; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
                 {
                     v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                     v_float32 el4l, el4h;
@@ -670,8 +670,8 @@ struct Integral_SIMD<uchar, float, double>
 #endif
                     v_int32 el4li, el4hi;
                     v_expand(el8, el4li, el4hi);
-                    el4l = v_cvt_f32(el4li) + prev;
-                    el4h = v_cvt_f32(el4hi) + el4l;
+                    el4l = v_add(v_cvt_f32(el4li), prev);
+                    el4h = v_add(v_cvt_f32(el4hi), el4l);
 #if CV_SIMD_WIDTH == 16
                     prev = el4h;
 #elif CV_SIMD_WIDTH == 32
@@ -682,8 +682,8 @@ struct Integral_SIMD<uchar, float, double>
                     prev = v_combine_low(t, t);
 #endif
 #endif
-                    v_store(sum_row + j                    , el4l + vx_load(prev_sum_row + j                    ));
-                    v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));
+                    v_store(sum_row + j                    , v_add(el4l, vx_load(prev_sum_row + j)));
+                    v_store(sum_row + j + VTraits<v_float32>::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits<v_float32>::vlanes())));
                 }
 
                 for (float v4 = sum_row[j - 1] - prev_sum_row[j - 1],
@@ -750,7 +750,7 @@ struct Integral_SIMD<uchar, double, double>
 
                 v_float64 prev = vx_setzero_f64();
                 int j = 0;
-                for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
+                for (; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
                 {
                     v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                     v_float64 el4ll, el4lh, el4hl, el4hh;
@@ -767,8 +767,8 @@ struct Integral_SIMD<uchar, double, double>
                     el4hh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h_32)), el4d);
                     prev.val = _mm256_permute4x64_pd(el4hh.val, 0xff);
 #else
-                    el8 += v_rotate_left<1>(el8);
-                    el8 += v_rotate_left<2>(el8);
+                    el8 = v_add(el8, v_rotate_left<1>(el8));
+                    el8 = v_add(el8, v_rotate_left<2>(el8));
 #if CV_SIMD_WIDTH >= 32
                     el8 += v_rotate_left<4>(el8);
 #if CV_SIMD_WIDTH == 64
@@ -777,17 +777,17 @@ struct Integral_SIMD<uchar, double, double>
 #endif
                     v_int32 el4li, el4hi;
                     v_expand(el8, el4li, el4hi);
-                    el4ll = v_cvt_f64(el4li) + prev;
-                    el4lh = v_cvt_f64_high(el4li) + prev;
-                    el4hl = v_cvt_f64(el4hi) + el4ll;
-                    el4hh = v_cvt_f64_high(el4hi) + el4lh;
-                    prev = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh));
-//                    prev = v_broadcast_element<v_float64::nlanes - 1>(el4hh);
+                    el4ll = v_add(v_cvt_f64(el4li), prev);
+                    el4lh = v_add(v_cvt_f64_high(el4li), prev);
+                    el4hl = v_add(v_cvt_f64(el4hi), el4ll);
+                    el4hh = v_add(v_cvt_f64_high(el4hi), el4lh);
+                    prev = vx_setall_f64(v_extract_highest(el4hh));
+//                    prev = v_broadcast_highest(el4hh);
 #endif
-                    v_store(sum_row + j                        , el4ll + vx_load(prev_sum_row + j                        ));
-                    v_store(sum_row + j + v_float64::nlanes    , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes    ));
-                    v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
-                    v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
+                    v_store(sum_row + j                        , v_add(el4ll, vx_load(prev_sum_row + j)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes()    , v_add(el4lh, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes())));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 2, v_add(el4hl, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 2)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 3, v_add(el4hh, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 3)));
                 }
 
                 for (double v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
@@ -808,11 +808,11 @@ struct Integral_SIMD<uchar, double, double>
 
                 v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64();
                 int j = 0;
-                for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn)
+                for (; j + VTraits<v_uint16>::vlanes() * cn <= width; j += VTraits<v_uint16>::vlanes() * cn)
                 {
                     v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j));
-                    v_int16 el8_1 = v_src_row & mask;
-                    v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8);
+                    v_int16 el8_1 = v_and(v_src_row, mask);
+                    v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row)));
                     v_float64 el4ll_1, el4lh_1, el4hl_1, el4hh_1, el4ll_2, el4lh_2, el4hl_2, el4hh_2;
 #if CV_AVX2 && CV_SIMD_WIDTH == 32
                     __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2));
@@ -838,10 +838,10 @@ struct Integral_SIMD<uchar, double, double>
                     prev_1.val = _mm256_permute4x64_pd(el4hh_1.val, 0xff);
                     prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff);
 #else
-                    el8_1 += v_rotate_left<1>(el8_1);
-                    el8_2 += v_rotate_left<1>(el8_2);
-                    el8_1 += v_rotate_left<2>(el8_1);
-                    el8_2 += v_rotate_left<2>(el8_2);
+                    el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
+                    el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
+                    el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
+                    el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
 #if CV_SIMD_WIDTH >= 32
                     el8_1 += v_rotate_left<4>(el8_1);
                     el8_2 += v_rotate_left<4>(el8_2);
@@ -853,32 +853,32 @@ struct Integral_SIMD<uchar, double, double>
                     v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2;
                     v_expand(el8_1, el4li_1, el4hi_1);
                     v_expand(el8_2, el4li_2, el4hi_2);
-                    el4ll_1 = v_cvt_f64(el4li_1) + prev_1;
-                    el4ll_2 = v_cvt_f64(el4li_2) + prev_2;
-                    el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1;
-                    el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2;
-                    el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1;
-                    el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2;
-                    el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1;
-                    el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2;
-                    prev_1 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_1));
-                    prev_2 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_2));
-//                    prev_1 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_1);
-//                    prev_2 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_2);
+                    el4ll_1 = v_add(v_cvt_f64(el4li_1), prev_1);
+                    el4ll_2 = v_add(v_cvt_f64(el4li_2), prev_2);
+                    el4lh_1 = v_add(v_cvt_f64_high(el4li_1), prev_1);
+                    el4lh_2 = v_add(v_cvt_f64_high(el4li_2), prev_2);
+                    el4hl_1 = v_add(v_cvt_f64(el4hi_1), el4ll_1);
+                    el4hl_2 = v_add(v_cvt_f64(el4hi_2), el4ll_2);
+                    el4hh_1 = v_add(v_cvt_f64_high(el4hi_1), el4lh_1);
+                    el4hh_2 = v_add(v_cvt_f64_high(el4hi_2), el4lh_2);
+                    prev_1 = vx_setall_f64(v_extract_highest(el4hh_1));
+                    prev_2 = vx_setall_f64(v_extract_highest(el4hh_2));
+//                    prev_1 = v_broadcast_highest(el4hh_1);
+//                    prev_2 = v_broadcast_highest(el4hh_2);
 #endif
                     v_float64 el4_1, el4_2, el4_3, el4_4, el4_5, el4_6, el4_7, el4_8;
                     v_zip(el4ll_1, el4ll_2, el4_1, el4_2);
                     v_zip(el4lh_1, el4lh_2, el4_3, el4_4);
                     v_zip(el4hl_1, el4hl_2, el4_5, el4_6);
                     v_zip(el4hh_1, el4hh_2, el4_7, el4_8);
-                    v_store(sum_row + j                        , el4_1 + vx_load(prev_sum_row + j                        ));
-                    v_store(sum_row + j + v_float64::nlanes    , el4_2 + vx_load(prev_sum_row + j + v_float64::nlanes    ));
-                    v_store(sum_row + j + v_float64::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
-                    v_store(sum_row + j + v_float64::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
-                    v_store(sum_row + j + v_float64::nlanes * 4, el4_5 + vx_load(prev_sum_row + j + v_float64::nlanes * 4));
-                    v_store(sum_row + j + v_float64::nlanes * 5, el4_6 + vx_load(prev_sum_row + j + v_float64::nlanes * 5));
-                    v_store(sum_row + j + v_float64::nlanes * 6, el4_7 + vx_load(prev_sum_row + j + v_float64::nlanes * 6));
-                    v_store(sum_row + j + v_float64::nlanes * 7, el4_8 + vx_load(prev_sum_row + j + v_float64::nlanes * 7));
+                    v_store(sum_row + j                        , v_add(el4_1, vx_load(prev_sum_row + j)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes()    , v_add(el4_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes())));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 2)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 3)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 4, v_add(el4_5, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 4)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 5, v_add(el4_6, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 5)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 6, v_add(el4_7, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 6)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 7, v_add(el4_8, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 7)));
                 }
 
                 for (double v2 = sum_row[j - 1] - prev_sum_row[j - 1],
@@ -897,7 +897,7 @@ struct Integral_SIMD<uchar, double, double>
                 const uchar * src_row = src + _srcstep * i;
                 double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + cn;
                 double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + cn;
-                double row_cache[v_float64::nlanes * 12];
+                double row_cache[VTraits<v_float64>::max_nlanes * 12];
 
                 sum_row[-1] = sum_row[-2] = sum_row[-3] = 0;
 
@@ -905,10 +905,10 @@ struct Integral_SIMD<uchar, double, double>
                           prev_3 = vx_setzero_f64();
                 int j = 0;
                 const int j_max =
-                        ((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height)
-                        ? width - v_uint8::nlanes * cn    // uint8 in v_load_deinterleave()
-                        : width - v_uint16::nlanes * cn;  // v_expand_low
-                for ( ; j <= j_max; j += v_uint16::nlanes * cn)
+                        ((_srcstep * i + (width - VTraits<v_uint16>::vlanes() * cn + VTraits<v_uint8>::vlanes() * cn)) >= _srcstep * height)
+                        ? width - VTraits<v_uint8>::vlanes() * cn    // uint8 in v_load_deinterleave()
+                        : width - VTraits<v_uint16>::vlanes() * cn;  // v_expand_low
+                for ( ; j <= j_max; j += VTraits<v_uint16>::vlanes() * cn)
                 {
                     v_uint8 v_src_row_1, v_src_row_2, v_src_row_3;
                     v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3);
@@ -951,12 +951,12 @@ struct Integral_SIMD<uchar, double, double>
                     prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff);
                     prev_3.val = _mm256_permute4x64_pd(el4hh_3.val, 0xff);
 #else
-                    el8_1 += v_rotate_left<1>(el8_1);
-                    el8_2 += v_rotate_left<1>(el8_2);
-                    el8_3 += v_rotate_left<1>(el8_3);
-                    el8_1 += v_rotate_left<2>(el8_1);
-                    el8_2 += v_rotate_left<2>(el8_2);
-                    el8_3 += v_rotate_left<2>(el8_3);
+                    el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1));
+                    el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2));
+                    el8_3 = v_add(el8_3, v_rotate_left<1>(el8_3));
+                    el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1));
+                    el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2));
+                    el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3));
 #if CV_SIMD_WIDTH >= 32
                     el8_1 += v_rotate_left<4>(el8_1);
                     el8_2 += v_rotate_left<4>(el8_2);
@@ -971,53 +971,53 @@ struct Integral_SIMD<uchar, double, double>
                     v_expand(el8_1, el4li_1, el4hi_1);
                     v_expand(el8_2, el4li_2, el4hi_2);
                     v_expand(el8_3, el4li_3, el4hi_3);
-                    el4ll_1 = v_cvt_f64(el4li_1) + prev_1;
-                    el4ll_2 = v_cvt_f64(el4li_2) + prev_2;
-                    el4ll_3 = v_cvt_f64(el4li_3) + prev_3;
-                    el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1;
-                    el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2;
-                    el4lh_3 = v_cvt_f64_high(el4li_3) + prev_3;
-                    el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1;
-                    el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2;
-                    el4hl_3 = v_cvt_f64(el4hi_3) + el4ll_3;
-                    el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1;
-                    el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2;
-                    el4hh_3 = v_cvt_f64_high(el4hi_3) + el4lh_3;
-                    prev_1 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_1));
-                    prev_2 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_2));
-                    prev_3 = vx_setall_f64(v_extract_n<v_float64::nlanes - 1>(el4hh_3));
-//                    prev_1 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_1);
-//                    prev_2 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_2);
-//                    prev_3 = v_broadcast_element<v_float64::nlanes - 1>(el4hh_3);
+                    el4ll_1 = v_add(v_cvt_f64(el4li_1), prev_1);
+                    el4ll_2 = v_add(v_cvt_f64(el4li_2), prev_2);
+                    el4ll_3 = v_add(v_cvt_f64(el4li_3), prev_3);
+                    el4lh_1 = v_add(v_cvt_f64_high(el4li_1), prev_1);
+                    el4lh_2 = v_add(v_cvt_f64_high(el4li_2), prev_2);
+                    el4lh_3 = v_add(v_cvt_f64_high(el4li_3), prev_3);
+                    el4hl_1 = v_add(v_cvt_f64(el4hi_1), el4ll_1);
+                    el4hl_2 = v_add(v_cvt_f64(el4hi_2), el4ll_2);
+                    el4hl_3 = v_add(v_cvt_f64(el4hi_3), el4ll_3);
+                    el4hh_1 = v_add(v_cvt_f64_high(el4hi_1), el4lh_1);
+                    el4hh_2 = v_add(v_cvt_f64_high(el4hi_2), el4lh_2);
+                    el4hh_3 = v_add(v_cvt_f64_high(el4hi_3), el4lh_3);
+                    prev_1 = vx_setall_f64(v_extract_highest(el4hh_1));
+                    prev_2 = vx_setall_f64(v_extract_highest(el4hh_2));
+                    prev_3 = vx_setall_f64(v_extract_highest(el4hh_3));
+//                    prev_1 = v_broadcast_highest(el4hh_1);
+//                    prev_2 = v_broadcast_highest(el4hh_2);
+//                    prev_3 = v_broadcast_highest(el4hh_3);
 #endif
                     v_store_interleave(row_cache                        , el4ll_1, el4ll_2, el4ll_3);
-                    v_store_interleave(row_cache + v_float64::nlanes * 3, el4lh_1, el4lh_2, el4lh_3);
-                    v_store_interleave(row_cache + v_float64::nlanes * 6, el4hl_1, el4hl_2, el4hl_3);
-                    v_store_interleave(row_cache + v_float64::nlanes * 9, el4hh_1, el4hh_2, el4hh_3);
+                    v_store_interleave(row_cache + VTraits<v_float64>::vlanes() * 3, el4lh_1, el4lh_2, el4lh_3);
+                    v_store_interleave(row_cache + VTraits<v_float64>::vlanes() * 6, el4hl_1, el4hl_2, el4hl_3);
+                    v_store_interleave(row_cache + VTraits<v_float64>::vlanes() * 9, el4hh_1, el4hh_2, el4hh_3);
                     el4ll_1 = vx_load(row_cache                         );
-                    el4ll_2 = vx_load(row_cache + v_float64::nlanes     );
-                    el4ll_3 = vx_load(row_cache + v_float64::nlanes * 2 );
-                    el4lh_1 = vx_load(row_cache + v_float64::nlanes * 3 );
-                    el4lh_2 = vx_load(row_cache + v_float64::nlanes * 4 );
-                    el4lh_3 = vx_load(row_cache + v_float64::nlanes * 5 );
-                    el4hl_1 = vx_load(row_cache + v_float64::nlanes * 6 );
-                    el4hl_2 = vx_load(row_cache + v_float64::nlanes * 7 );
-                    el4hl_3 = vx_load(row_cache + v_float64::nlanes * 8 );
-                    el4hh_1 = vx_load(row_cache + v_float64::nlanes * 9 );
-                    el4hh_2 = vx_load(row_cache + v_float64::nlanes * 10);
-                    el4hh_3 = vx_load(row_cache + v_float64::nlanes * 11);
-                    v_store(sum_row + j                         , el4ll_1 + vx_load(prev_sum_row + j                         ));
-                    v_store(sum_row + j + v_float64::nlanes     , el4ll_2 + vx_load(prev_sum_row + j + v_float64::nlanes     ));
-                    v_store(sum_row + j + v_float64::nlanes * 2 , el4ll_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2 ));
-                    v_store(sum_row + j + v_float64::nlanes * 3 , el4lh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 3 ));
-                    v_store(sum_row + j + v_float64::nlanes * 4 , el4lh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 4 ));
-                    v_store(sum_row + j + v_float64::nlanes * 5 , el4lh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 5 ));
-                    v_store(sum_row + j + v_float64::nlanes * 6 , el4hl_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 6 ));
-                    v_store(sum_row + j + v_float64::nlanes * 7 , el4hl_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 7 ));
-                    v_store(sum_row + j + v_float64::nlanes * 8 , el4hl_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 8 ));
-                    v_store(sum_row + j + v_float64::nlanes * 9 , el4hh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 9 ));
-                    v_store(sum_row + j + v_float64::nlanes * 10, el4hh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 10));
-                    v_store(sum_row + j + v_float64::nlanes * 11, el4hh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 11));
+                    el4ll_2 = vx_load(row_cache + VTraits<v_float64>::vlanes()     );
+                    el4ll_3 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 2 );
+                    el4lh_1 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 3 );
+                    el4lh_2 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 4 );
+                    el4lh_3 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 5 );
+                    el4hl_1 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 6 );
+                    el4hl_2 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 7 );
+                    el4hl_3 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 8 );
+                    el4hh_1 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 9 );
+                    el4hh_2 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 10);
+                    el4hh_3 = vx_load(row_cache + VTraits<v_float64>::vlanes() * 11);
+                    v_store(sum_row + j                         , v_add(el4ll_1, vx_load(prev_sum_row + j)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes()     , v_add(el4ll_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes())));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 2 , v_add(el4ll_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 2)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 3 , v_add(el4lh_1, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 3)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 4 , v_add(el4lh_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 4)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 5 , v_add(el4lh_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 5)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 6 , v_add(el4hl_1, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 6)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 7 , v_add(el4hl_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 7)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 8 , v_add(el4hl_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 8)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 9 , v_add(el4hh_1, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 9)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 10, v_add(el4hh_2, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 10)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 11, v_add(el4hh_3, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 11)));
                 }
 
                 for (double v3 = sum_row[j - 1] - prev_sum_row[j - 1],
@@ -1043,7 +1043,7 @@ struct Integral_SIMD<uchar, double, double>
 
                 v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64();
                 int j = 0;
-                for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
+                for ( ; j + VTraits<v_uint16>::vlanes() <= width; j += VTraits<v_uint16>::vlanes())
                 {
                     v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
                     v_float64 el4ll, el4lh, el4hl, el4hh;
@@ -1065,10 +1065,10 @@ struct Integral_SIMD<uchar, double, double>
 #endif
                     v_int32 el4li, el4hi;
                     v_expand(el8, el4li, el4hi);
-                    el4ll = v_cvt_f64(el4li) + prev_1;
-                    el4lh = v_cvt_f64_high(el4li) + prev_2;
-                    el4hl = v_cvt_f64(el4hi) + el4ll;
-                    el4hh = v_cvt_f64_high(el4hi) + el4lh;
+                    el4ll = v_add(v_cvt_f64(el4li), prev_1);
+                    el4lh = v_add(v_cvt_f64_high(el4li), prev_2);
+                    el4hl = v_add(v_cvt_f64(el4hi), el4ll);
+                    el4hh = v_add(v_cvt_f64_high(el4hi), el4lh);
 #if CV_SIMD_WIDTH == 16
                     prev_1 = el4hl;
                     prev_2 = el4hh;
@@ -1078,10 +1078,10 @@ struct Integral_SIMD<uchar, double, double>
                     prev_1 = prev_2 = v_combine_high(el4hh, el4hh);
 #endif
 #endif
-                    v_store(sum_row + j                        , el4ll + vx_load(prev_sum_row + j                       ));
-                    v_store(sum_row + j + v_float64::nlanes    , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes   ));
-                    v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2));
-                    v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3));
+                    v_store(sum_row + j                        , v_add(el4ll, vx_load(prev_sum_row + j)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes()    , v_add(el4lh, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes())));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 2, v_add(el4hl, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 2)));
+                    v_store(sum_row + j + VTraits<v_float64>::vlanes() * 3, v_add(el4hh, vx_load(prev_sum_row + j + VTraits<v_float64>::vlanes() * 3)));
                 }
 
                 for (double v4 = sum_row[j - 1] - prev_sum_row[j - 1],
diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index 67845e7429f0..5998bcbac7a7 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -145,7 +145,7 @@ void ConvolveBuf::create(Size image_size, Size templ_size)
     dft_size.width = std::max(getOptimalDFTSize(block_size.width + templ_size.width - 1), 2);
     dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);
     if( dft_size.width <= 0 || dft_size.height <= 0 )
-        CV_Error( CV_StsOutOfRange, "the input arrays are too big" );
+        CV_Error( cv::Error::StsOutOfRange, "the input arrays are too big" );
 
     // recompute block size
     block_size.width = dft_size.width - templ_size.width + 1;
@@ -602,7 +602,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
     dftsize.width = std::max(getOptimalDFTSize(blocksize.width + templ.cols - 1), 2);
     dftsize.height = getOptimalDFTSize(blocksize.height + templ.rows - 1);
     if( dftsize.width <= 0 || dftsize.height <= 0 )
-        CV_Error( CV_StsOutOfRange, "the input arrays are too big" );
+        CV_Error( cv::Error::StsOutOfRange, "the input arrays are too big" );
 
     // recompute block size
     blocksize.width = dftsize.width - templ.cols + 1;
diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp
index bed0d37f26f8..a8c89059a780 100644
--- a/modules/imgproc/src/thresh.cpp
+++ b/modules/imgproc/src/thresh.cpp
@@ -117,7 +117,7 @@ static void threshGeneric(Size roi, const T* src, size_t src_step, T* dst,
         return;
 
     default:
-        CV_Error( CV_StsBadArg, "" ); return;
+        CV_Error( cv::Error::StsBadArg, "" ); return;
     }
 }
 
@@ -190,7 +190,7 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     int j = 0;
     const uchar* src = _src.ptr();
     uchar* dst = _dst.ptr();
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_uint8 thresh_u = vx_setall_u8( thresh );
     v_uint8 maxval16 = vx_setall_u8( maxval );
 
@@ -199,12 +199,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     case THRESH_BINARY:
         for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
-            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
             {
                 v_uint8 v0;
                 v0 = vx_load( src + j );
-                v0 = thresh_u < v0;
-                v0 = v0 & maxval16;
+                v0 = v_lt(thresh_u, v0);
+                v0 = v_and(v0, maxval16);
                 v_store( dst + j, v0 );
             }
         }
@@ -213,12 +213,12 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     case THRESH_BINARY_INV:
         for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
-            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
             {
                 v_uint8 v0;
                 v0 = vx_load( src + j );
-                v0 = v0 <= thresh_u;
-                v0 = v0 & maxval16;
+                v0 = v_le(v0, thresh_u);
+                v0 = v_and(v0, maxval16);
                 v_store( dst + j, v0 );
             }
         }
@@ -227,11 +227,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     case THRESH_TRUNC:
         for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
-            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
             {
                 v_uint8 v0;
                 v0 = vx_load( src + j );
-                v0 = v0 - ( v0 - thresh_u );
+                v0 = v_sub(v0, v_sub(v0, thresh_u));
                 v_store( dst + j, v0 );
             }
         }
@@ -240,11 +240,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     case THRESH_TOZERO:
         for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
-            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
             {
                 v_uint8 v0;
                 v0 = vx_load( src + j );
-                v0 = ( thresh_u < v0 ) & v0;
+                v0 = v_and(v_lt(thresh_u, v0), v0);
                 v_store( dst + j, v0 );
             }
         }
@@ -253,11 +253,11 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
     case THRESH_TOZERO_INV:
         for( int i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
-            for( j = 0; j <= roi.width - v_uint8::nlanes; j += v_uint8::nlanes)
+            for( j = 0; j <= roi.width - VTraits<v_uint8>::vlanes(); j += VTraits<v_uint8>::vlanes())
             {
                 v_uint8 v0;
                 v0 = vx_load( src + j );
-                v0 = ( v0 <= thresh_u ) & v0;
+                v0 = v_and(v_le(v0, thresh_u), v0);
                 v_store( dst + j, v0 );
             }
         }
@@ -351,7 +351,7 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
 
     const ushort* src = _src.ptr<ushort>();
     ushort* dst = _dst.ptr<ushort>();
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int i, j;
     v_uint16 thresh_u = vx_setall_u16(thresh);
     v_uint16 maxval16 = vx_setall_u16(maxval);
@@ -361,25 +361,25 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
     case THRESH_BINARY:
         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
         {
-            for (j = 0; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            for (j = 0; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
             {
                 v_uint16 v0, v1;
                 v0 = vx_load(src + j);
-                v1 = vx_load(src + j + v_uint16::nlanes);
-                v0 = thresh_u < v0;
-                v1 = thresh_u < v1;
-                v0 = v0 & maxval16;
-                v1 = v1 & maxval16;
+                v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
+                v0 = v_lt(thresh_u, v0);
+                v1 = v_lt(thresh_u, v1);
+                v0 = v_and(v0, maxval16);
+                v1 = v_and(v1, maxval16);
                 v_store(dst + j, v0);
-                v_store(dst + j + v_uint16::nlanes, v1);
+                v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
             }
-            if (j <= roi.width - v_uint16::nlanes)
+            if (j <= roi.width - VTraits<v_uint16>::vlanes())
             {
                 v_uint16 v0 = vx_load(src + j);
-                v0 = thresh_u < v0;
-                v0 = v0 & maxval16;
+                v0 = v_lt(thresh_u, v0);
+                v0 = v_and(v0, maxval16);
                 v_store(dst + j, v0);
-                j += v_uint16::nlanes;
+                j += VTraits<v_uint16>::vlanes();
             }
 
             for (; j < roi.width; j++)
@@ -391,25 +391,25 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
         {
             j = 0;
-            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            for (; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
             {
                 v_uint16 v0, v1;
                 v0 = vx_load(src + j);
-                v1 = vx_load(src + j + v_uint16::nlanes);
-                v0 = v0 <= thresh_u;
-                v1 = v1 <= thresh_u;
-                v0 = v0 & maxval16;
-                v1 = v1 & maxval16;
+                v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
+                v0 = v_le(v0, thresh_u);
+                v1 = v_le(v1, thresh_u);
+                v0 = v_and(v0, maxval16);
+                v1 = v_and(v1, maxval16);
                 v_store(dst + j, v0);
-                v_store(dst + j + v_uint16::nlanes, v1);
+                v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
             }
-            if (j <= roi.width - v_uint16::nlanes)
+            if (j <= roi.width - VTraits<v_uint16>::vlanes())
             {
                 v_uint16 v0 = vx_load(src + j);
-                v0 = v0 <= thresh_u;
-                v0 = v0 & maxval16;
+                v0 = v_le(v0, thresh_u);
+                v0 = v_and(v0, maxval16);
                 v_store(dst + j, v0);
-                j += v_uint16::nlanes;
+                j += VTraits<v_uint16>::vlanes();
             }
 
             for (; j < roi.width; j++)
@@ -421,22 +421,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
         {
             j = 0;
-            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            for (; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
             {
                 v_uint16 v0, v1;
                 v0 = vx_load(src + j);
-                v1 = vx_load(src + j + v_uint16::nlanes);
+                v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
                 v0 = v_min(v0, thresh_u);
                 v1 = v_min(v1, thresh_u);
                 v_store(dst + j, v0);
-                v_store(dst + j + v_uint16::nlanes, v1);
+                v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
             }
-            if (j <= roi.width - v_uint16::nlanes)
+            if (j <= roi.width - VTraits<v_uint16>::vlanes())
             {
                 v_uint16 v0 = vx_load(src + j);
                 v0 = v_min(v0, thresh_u);
                 v_store(dst + j, v0);
-                j += v_uint16::nlanes;
+                j += VTraits<v_uint16>::vlanes();
             }
 
             for (; j < roi.width; j++)
@@ -448,22 +448,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
         {
             j = 0;
-            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            for (; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
             {
                 v_uint16 v0, v1;
                 v0 = vx_load(src + j);
-                v1 = vx_load(src + j + v_uint16::nlanes);
-                v0 = (thresh_u < v0) & v0;
-                v1 = (thresh_u < v1) & v1;
+                v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
+                v0 = v_and(v_lt(thresh_u, v0), v0);
+                v1 = v_and(v_lt(thresh_u, v1), v1);
                 v_store(dst + j, v0);
-                v_store(dst + j + v_uint16::nlanes, v1);
+                v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
             }
-            if (j <= roi.width - v_uint16::nlanes)
+            if (j <= roi.width - VTraits<v_uint16>::vlanes())
             {
                 v_uint16 v0 = vx_load(src + j);
-                v0 = (thresh_u < v0) & v0;
+                v0 = v_and(v_lt(thresh_u, v0), v0);
                 v_store(dst + j, v0);
-                j += v_uint16::nlanes;
+                j += VTraits<v_uint16>::vlanes();
             }
 
             for (; j < roi.width; j++)
@@ -475,22 +475,22 @@ thresh_16u(const Mat& _src, Mat& _dst, ushort thresh, ushort maxval, int type)
         for (i = 0; i < roi.height; i++, src += src_step, dst += dst_step)
         {
             j = 0;
-            for (; j <= roi.width - 2*v_uint16::nlanes; j += 2*v_uint16::nlanes)
+            for (; j <= roi.width - 2*VTraits<v_uint16>::vlanes(); j += 2*VTraits<v_uint16>::vlanes())
             {
                 v_uint16 v0, v1;
                 v0 = vx_load(src + j);
-                v1 = vx_load(src + j + v_uint16::nlanes);
-                v0 = (v0 <= thresh_u) & v0;
-                v1 = (v1 <= thresh_u) & v1;
+                v1 = vx_load(src + j + VTraits<v_uint16>::vlanes());
+                v0 = v_and(v_le(v0, thresh_u), v0);
+                v1 = v_and(v_le(v1, thresh_u), v1);
                 v_store(dst + j, v0);
-                v_store(dst + j + v_uint16::nlanes, v1);
+                v_store(dst + j + VTraits<v_uint16>::vlanes(), v1);
             }
-            if (j <= roi.width - v_uint16::nlanes)
+            if (j <= roi.width - VTraits<v_uint16>::vlanes())
             {
                 v_uint16 v0 = vx_load(src + j);
-                v0 = (v0 <= thresh_u) & v0;
+                v0 = v_and(v_le(v0, thresh_u), v0);
                 v_store(dst + j, v0);
-                j += v_uint16::nlanes;
+                j += VTraits<v_uint16>::vlanes();
             }
 
             for (; j < roi.width; j++)
@@ -571,7 +571,7 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
     }
 #endif
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int i, j;
     v_int16 thresh8 = vx_setall_s16( thresh );
     v_int16 maxval8 = vx_setall_s16( maxval );
@@ -582,25 +582,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
             {
                 v_int16 v0, v1;
                 v0 = vx_load( src + j );
-                v1 = vx_load( src + j + v_int16::nlanes );
-                v0 = thresh8 < v0;
-                v1 = thresh8 < v1;
-                v0 = v0 & maxval8;
-                v1 = v1 & maxval8;
+                v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
+                v0 = v_lt(thresh8, v0);
+                v1 = v_lt(thresh8, v1);
+                v0 = v_and(v0, maxval8);
+                v1 = v_and(v1, maxval8);
                 v_store( dst + j, v0 );
-                v_store( dst + j + v_int16::nlanes, v1 );
+                v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
             }
-            if( j <= roi.width - v_int16::nlanes )
+            if( j <= roi.width - VTraits<v_int16>::vlanes() )
             {
                 v_int16 v0 = vx_load( src + j );
-                v0 = thresh8 < v0;
-                v0 = v0 & maxval8;
+                v0 = v_lt(thresh8, v0);
+                v0 = v_and(v0, maxval8);
                 v_store( dst + j, v0 );
-                j += v_int16::nlanes;
+                j += VTraits<v_int16>::vlanes();
             }
 
             for( ; j < roi.width; j++ )
@@ -612,25 +612,25 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
             {
                 v_int16 v0, v1;
                 v0 = vx_load( src + j );
-                v1 = vx_load( src + j + v_int16::nlanes );
-                v0 = v0 <= thresh8;
-                v1 = v1 <= thresh8;
-                v0 = v0 & maxval8;
-                v1 = v1 & maxval8;
+                v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
+                v0 = v_le(v0, thresh8);
+                v1 = v_le(v1, thresh8);
+                v0 = v_and(v0, maxval8);
+                v1 = v_and(v1, maxval8);
                 v_store( dst + j, v0 );
-                v_store( dst + j + v_int16::nlanes, v1 );
+                v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
             }
-            if( j <= roi.width - v_int16::nlanes )
+            if( j <= roi.width - VTraits<v_int16>::vlanes() )
             {
                 v_int16 v0 = vx_load( src + j );
-                v0 = v0 <= thresh8;
-                v0 = v0 & maxval8;
+                v0 = v_le(v0, thresh8);
+                v0 = v_and(v0, maxval8);
                 v_store( dst + j, v0 );
-                j += v_int16::nlanes;
+                j += VTraits<v_int16>::vlanes();
             }
 
             for( ; j < roi.width; j++ )
@@ -642,22 +642,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
             {
                 v_int16 v0, v1;
                 v0 = vx_load( src + j );
-                v1 = vx_load( src + j + v_int16::nlanes );
+                v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
                 v0 = v_min( v0, thresh8 );
                 v1 = v_min( v1, thresh8 );
                 v_store( dst + j, v0 );
-                v_store( dst + j + v_int16::nlanes, v1 );
+                v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
             }
-            if( j <= roi.width - v_int16::nlanes )
+            if( j <= roi.width - VTraits<v_int16>::vlanes() )
             {
                 v_int16 v0 = vx_load( src + j );
                 v0 = v_min( v0, thresh8 );
                 v_store( dst + j, v0 );
-                j += v_int16::nlanes;
+                j += VTraits<v_int16>::vlanes();
             }
 
             for( ; j < roi.width; j++ )
@@ -669,22 +669,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
             {
                 v_int16 v0, v1;
                 v0 = vx_load( src + j );
-                v1 = vx_load( src + j + v_int16::nlanes );
-                v0 = ( thresh8 < v0 ) & v0;
-                v1 = ( thresh8 < v1 ) & v1;
+                v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
+                v0 = v_and(v_lt(thresh8, v0), v0);
+                v1 = v_and(v_lt(thresh8, v1), v1);
                 v_store( dst + j, v0 );
-                v_store( dst + j + v_int16::nlanes, v1 );
+                v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
             }
-            if( j <= roi.width - v_int16::nlanes )
+            if( j <= roi.width - VTraits<v_int16>::vlanes() )
             {
                 v_int16 v0 = vx_load( src + j );
-                v0 = ( thresh8 < v0 ) & v0;
+                v0 = v_and(v_lt(thresh8, v0), v0);
                 v_store( dst + j, v0 );
-                j += v_int16::nlanes;
+                j += VTraits<v_int16>::vlanes();
             }
 
             for( ; j < roi.width; j++ )
@@ -696,22 +696,22 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 2*v_int16::nlanes; j += 2*v_int16::nlanes )
+            for( ; j <= roi.width - 2*VTraits<v_int16>::vlanes(); j += 2*VTraits<v_int16>::vlanes() )
             {
                 v_int16 v0, v1;
                 v0 = vx_load( src + j );
-                v1 = vx_load( src + j + v_int16::nlanes );
-                v0 = ( v0 <= thresh8 ) & v0;
-                v1 = ( v1 <= thresh8 ) & v1;
+                v1 = vx_load( src + j + VTraits<v_int16>::vlanes() );
+                v0 = v_and(v_le(v0, thresh8), v0);
+                v1 = v_and(v_le(v1, thresh8), v1);
                 v_store( dst + j, v0 );
-                v_store( dst + j + v_int16::nlanes, v1 );
+                v_store( dst + j + VTraits<v_int16>::vlanes(), v1 );
             }
-            if( j <= roi.width - v_int16::nlanes )
+            if( j <= roi.width - VTraits<v_int16>::vlanes() )
             {
                 v_int16 v0 = vx_load( src + j );
-                v0 = ( v0 <= thresh8 ) & v0;
+                v0 = v_and(v_le(v0, thresh8), v0);
                 v_store( dst + j, v0 );
-                j += v_int16::nlanes;
+                j += VTraits<v_int16>::vlanes();
             }
 
             for( ; j < roi.width; j++ )
@@ -719,7 +719,7 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
         }
         break;
     default:
-        CV_Error( CV_StsBadArg, "" ); return;
+        CV_Error( cv::Error::StsBadArg, "" ); return;
     }
 #else
     threshGeneric<short>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
@@ -777,7 +777,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
     }
 #endif
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     int i, j;
     v_float32 thresh4 = vx_setall_f32( thresh );
     v_float32 maxval4 = vx_setall_f32( maxval );
@@ -788,25 +788,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
             {
                 j = 0;
-                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
+                for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
                 {
                     v_float32 v0, v1;
                     v0 = vx_load( src + j );
-                    v1 = vx_load( src + j + v_float32::nlanes );
-                    v0 = thresh4 < v0;
-                    v1 = thresh4 < v1;
-                    v0 = v0 & maxval4;
-                    v1 = v1 & maxval4;
+                    v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
+                    v0 = v_lt(thresh4, v0);
+                    v1 = v_lt(thresh4, v1);
+                    v0 = v_and(v0, maxval4);
+                    v1 = v_and(v1, maxval4);
                     v_store( dst + j, v0 );
-                    v_store( dst + j + v_float32::nlanes, v1 );
+                    v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
                 }
-                if( j <= roi.width - v_float32::nlanes )
+                if( j <= roi.width - VTraits<v_float32>::vlanes() )
                 {
                     v_float32 v0 = vx_load( src + j );
-                    v0 = thresh4 < v0;
-                    v0 = v0 & maxval4;
+                    v0 = v_lt(thresh4, v0);
+                    v0 = v_and(v0, maxval4);
                     v_store( dst + j, v0 );
-                    j += v_float32::nlanes;
+                    j += VTraits<v_float32>::vlanes();
                 }
 
                 for( ; j < roi.width; j++ )
@@ -818,25 +818,25 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
             {
                 j = 0;
-                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
+                for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
                 {
                     v_float32 v0, v1;
                     v0 = vx_load( src + j );
-                    v1 = vx_load( src + j + v_float32::nlanes );
-                    v0 = v0 <= thresh4;
-                    v1 = v1 <= thresh4;
-                    v0 = v0 & maxval4;
-                    v1 = v1 & maxval4;
+                    v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
+                    v0 = v_le(v0, thresh4);
+                    v1 = v_le(v1, thresh4);
+                    v0 = v_and(v0, maxval4);
+                    v1 = v_and(v1, maxval4);
                     v_store( dst + j, v0 );
-                    v_store( dst + j + v_float32::nlanes, v1 );
+                    v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
                 }
-                if( j <= roi.width - v_float32::nlanes )
+                if( j <= roi.width - VTraits<v_float32>::vlanes() )
                 {
                     v_float32 v0 = vx_load( src + j );
-                    v0 = v0 <= thresh4;
-                    v0 = v0 & maxval4;
+                    v0 = v_le(v0, thresh4);
+                    v0 = v_and(v0, maxval4);
                     v_store( dst + j, v0 );
-                    j += v_float32::nlanes;
+                    j += VTraits<v_float32>::vlanes();
                 }
 
                 for( ; j < roi.width; j++ )
@@ -848,22 +848,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
             {
                 j = 0;
-                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
+                for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
                 {
                     v_float32 v0, v1;
                     v0 = vx_load( src + j );
-                    v1 = vx_load( src + j + v_float32::nlanes );
+                    v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
                     v0 = v_min( v0, thresh4 );
                     v1 = v_min( v1, thresh4 );
                     v_store( dst + j, v0 );
-                    v_store( dst + j + v_float32::nlanes, v1 );
+                    v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
                 }
-                if( j <= roi.width - v_float32::nlanes )
+                if( j <= roi.width - VTraits<v_float32>::vlanes() )
                 {
                     v_float32 v0 = vx_load( src + j );
                     v0 = v_min( v0, thresh4 );
                     v_store( dst + j, v0 );
-                    j += v_float32::nlanes;
+                    j += VTraits<v_float32>::vlanes();
                 }
 
                 for( ; j < roi.width; j++ )
@@ -875,22 +875,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
             {
                 j = 0;
-                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
+                for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
                 {
                     v_float32 v0, v1;
                     v0 = vx_load( src + j );
-                    v1 = vx_load( src + j + v_float32::nlanes );
-                    v0 = ( thresh4 < v0 ) & v0;
-                    v1 = ( thresh4 < v1 ) & v1;
+                    v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
+                    v0 = v_and(v_lt(thresh4, v0), v0);
+                    v1 = v_and(v_lt(thresh4, v1), v1);
                     v_store( dst + j, v0 );
-                    v_store( dst + j + v_float32::nlanes, v1 );
+                    v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
                 }
-                if( j <= roi.width - v_float32::nlanes )
+                if( j <= roi.width - VTraits<v_float32>::vlanes() )
                 {
                     v_float32 v0 = vx_load( src + j );
-                    v0 = ( thresh4 < v0 ) & v0;
+                    v0 = v_and(v_lt(thresh4, v0), v0);
                     v_store( dst + j, v0 );
-                    j += v_float32::nlanes;
+                    j += VTraits<v_float32>::vlanes();
                 }
 
                 for( ; j < roi.width; j++ )
@@ -902,22 +902,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
             for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
             {
                 j = 0;
-                for( ; j <= roi.width - 2*v_float32::nlanes; j += 2*v_float32::nlanes )
+                for( ; j <= roi.width - 2*VTraits<v_float32>::vlanes(); j += 2*VTraits<v_float32>::vlanes() )
                 {
                     v_float32 v0, v1;
                     v0 = vx_load( src + j );
-                    v1 = vx_load( src + j + v_float32::nlanes );
-                    v0 = ( v0 <= thresh4 ) & v0;
-                    v1 = ( v1 <= thresh4 ) & v1;
+                    v1 = vx_load( src + j + VTraits<v_float32>::vlanes() );
+                    v0 = v_and(v_le(v0, thresh4), v0);
+                    v1 = v_and(v_le(v1, thresh4), v1);
                     v_store( dst + j, v0 );
-                    v_store( dst + j + v_float32::nlanes, v1 );
+                    v_store( dst + j + VTraits<v_float32>::vlanes(), v1 );
                 }
-                if( j <= roi.width - v_float32::nlanes )
+                if( j <= roi.width - VTraits<v_float32>::vlanes() )
                 {
                     v_float32 v0 = vx_load( src + j );
-                    v0 = ( v0 <= thresh4 ) & v0;
+                    v0 = v_and(v_le(v0, thresh4), v0);
                     v_store( dst + j, v0 );
-                    j += v_float32::nlanes;
+                    j += VTraits<v_float32>::vlanes();
                 }
 
                 for( ; j < roi.width; j++ )
@@ -925,7 +925,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
             }
             break;
         default:
-            CV_Error( CV_StsBadArg, "" ); return;
+            CV_Error( cv::Error::StsBadArg, "" ); return;
     }
 #else
     threshGeneric<float>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
@@ -948,7 +948,7 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         roi.height = 1;
     }
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     int i, j;
     v_float64 thresh2 = vx_setall_f64( thresh );
     v_float64 maxval2 = vx_setall_f64( maxval );
@@ -959,25 +959,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+            for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
             {
                 v_float64 v0, v1;
                 v0 = vx_load( src + j );
-                v1 = vx_load( src + j + v_float64::nlanes );
-                v0 = thresh2 < v0;
-                v1 = thresh2 < v1;
-                v0 = v0 & maxval2;
-                v1 = v1 & maxval2;
+                v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
+                v0 = v_lt(thresh2, v0);
+                v1 = v_lt(thresh2, v1);
+                v0 = v_and(v0, maxval2);
+                v1 = v_and(v1, maxval2);
                 v_store( dst + j, v0 );
-                v_store( dst + j + v_float64::nlanes, v1 );
+                v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
             }
-            if( j <= roi.width - v_float64::nlanes )
+            if( j <= roi.width - VTraits<v_float64>::vlanes() )
             {
                 v_float64 v0 = vx_load( src + j );
-                v0 = thresh2 < v0;
-                v0 = v0 & maxval2;
+                v0 = v_lt(thresh2, v0);
+                v0 = v_and(v0, maxval2);
                 v_store( dst + j, v0 );
-                j += v_float64::nlanes;
+                j += VTraits<v_float64>::vlanes();
             }
 
             for( ; j < roi.width; j++ )
@@ -989,25 +989,25 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+            for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
             {
                 v_float64 v0, v1;
                 v0 = vx_load( src + j );
-                v1 = vx_load( src + j + v_float64::nlanes );
-                v0 = v0 <= thresh2;
-                v1 = v1 <= thresh2;
-                v0 = v0 & maxval2;
-                v1 = v1 & maxval2;
+                v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
+                v0 = v_le(v0, thresh2);
+                v1 = v_le(v1, thresh2);
+                v0 = v_and(v0, maxval2);
+                v1 = v_and(v1, maxval2);
                 v_store( dst + j, v0 );
-                v_store( dst + j + v_float64::nlanes, v1 );
+                v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
             }
-            if( j <= roi.width - v_float64::nlanes )
+            if( j <= roi.width - VTraits<v_float64>::vlanes() )
             {
                 v_float64 v0 = vx_load( src + j );
-                v0 = v0 <= thresh2;
-                v0 = v0 & maxval2;
+                v0 = v_le(v0, thresh2);
+                v0 = v_and(v0, maxval2);
                 v_store( dst + j, v0 );
-                j += v_float64::nlanes;
+                j += VTraits<v_float64>::vlanes();
             }
 
             for( ; j < roi.width; j++ )
@@ -1019,22 +1019,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+            for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
             {
                 v_float64 v0, v1;
                 v0 = vx_load( src + j );
-                v1 = vx_load( src + j + v_float64::nlanes );
+                v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
                 v0 = v_min( v0, thresh2 );
                 v1 = v_min( v1, thresh2 );
                 v_store( dst + j, v0 );
-                v_store( dst + j + v_float64::nlanes, v1 );
+                v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
             }
-            if( j <= roi.width - v_float64::nlanes )
+            if( j <= roi.width - VTraits<v_float64>::vlanes() )
             {
                 v_float64 v0 = vx_load( src + j );
                 v0 = v_min( v0, thresh2 );
                 v_store( dst + j, v0 );
-                j += v_float64::nlanes;
+                j += VTraits<v_float64>::vlanes();
             }
 
             for( ; j < roi.width; j++ )
@@ -1046,22 +1046,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+            for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
             {
                 v_float64 v0, v1;
                 v0 = vx_load( src + j );
-                v1 = vx_load( src + j + v_float64::nlanes );
-                v0 = ( thresh2 < v0 ) & v0;
-                v1 = ( thresh2 < v1 ) & v1;
+                v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
+                v0 = v_and(v_lt(thresh2, v0), v0);
+                v1 = v_and(v_lt(thresh2, v1), v1);
                 v_store( dst + j, v0 );
-                v_store( dst + j + v_float64::nlanes, v1 );
+                v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
             }
-            if( j <= roi.width - v_float64::nlanes )
+            if( j <= roi.width - VTraits<v_float64>::vlanes() )
             {
                 v_float64 v0 = vx_load( src + j );
-                v0 = ( thresh2 < v0 ) & v0;
+                v0 = v_and(v_lt(thresh2, v0), v0);
                 v_store( dst + j, v0 );
-                j += v_float64::nlanes;
+                j += VTraits<v_float64>::vlanes();
             }
 
             for( ; j < roi.width; j++ )
@@ -1073,22 +1073,22 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
         {
             j = 0;
-            for( ; j <= roi.width - 2*v_float64::nlanes; j += 2*v_float64::nlanes )
+            for( ; j <= roi.width - 2*VTraits<v_float64>::vlanes(); j += 2*VTraits<v_float64>::vlanes() )
             {
                 v_float64 v0, v1;
                 v0 = vx_load( src + j );
-                v1 = vx_load( src + j + v_float64::nlanes );
-                v0 = ( v0 <= thresh2 ) & v0;
-                v1 = ( v1 <= thresh2 ) & v1;
+                v1 = vx_load( src + j + VTraits<v_float64>::vlanes() );
+                v0 = v_and(v_le(v0, thresh2), v0);
+                v1 = v_and(v_le(v1, thresh2), v1);
                 v_store( dst + j, v0 );
-                v_store( dst + j + v_float64::nlanes, v1 );
+                v_store( dst + j + VTraits<v_float64>::vlanes(), v1 );
             }
-            if( j <= roi.width - v_float64::nlanes )
+            if( j <= roi.width - VTraits<v_float64>::vlanes() )
             {
                 v_float64 v0 = vx_load( src + j );
-                v0 = ( v0 <= thresh2 ) & v0;
+                v0 = v_and(v_le(v0, thresh2), v0);
                 v_store( dst + j, v0 );
-                j += v_float64::nlanes;
+                j += VTraits<v_float64>::vlanes();
             }
 
             for( ; j < roi.width; j++ )
@@ -1096,7 +1096,7 @@ thresh_64f(const Mat& _src, Mat& _dst, double thresh, double maxval, int type)
         }
         break;
     default:
-        CV_Error(CV_StsBadArg, ""); return;
+        CV_Error(cv::Error::StsBadArg, ""); return;
     }
 #else
     threshGeneric<double>(roi, src, src_step, dst, dst_step, thresh, maxval, type);
@@ -1545,26 +1545,31 @@ double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double m
                 ocl_threshold(_src, _dst, thresh, maxval, type), thresh)
 
     Mat src = _src.getMat();
-    int automatic_thresh = (type & ~CV_THRESH_MASK);
+
+    _dst.create( src.size(), src.type() );
+    Mat dst = _dst.getMat();
+
+    int automatic_thresh = (type & ~cv::THRESH_MASK);
     type &= THRESH_MASK;
 
-    CV_Assert( automatic_thresh != (CV_THRESH_OTSU | CV_THRESH_TRIANGLE) );
-    if( automatic_thresh == CV_THRESH_OTSU )
+    CV_Assert( automatic_thresh != (cv::THRESH_OTSU | cv::THRESH_TRIANGLE) );
+    if( automatic_thresh == cv::THRESH_OTSU )
     {
         int src_type = src.type();
         CV_CheckType(src_type, src_type == CV_8UC1 || src_type == CV_16UC1, "THRESH_OTSU mode");
+
+        CALL_HAL_RET(thresholdOtsu, cv_hal_threshold_otsu, thresh, src.data, src.step, dst.data, dst.step,
+                     src.cols, src.rows, src_type, maxval, type);
+
         thresh = src.type() == CV_8UC1 ? getThreshVal_Otsu_8u( src )
                                        : getThreshVal_Otsu_16u( src );
     }
-    else if( automatic_thresh == CV_THRESH_TRIANGLE )
+    else if( automatic_thresh == cv::THRESH_TRIANGLE )
     {
         CV_Assert( src.type() == CV_8UC1 );
         thresh = getThreshVal_Triangle_8u( src );
     }
 
-    _dst.create( src.size(), src.type() );
-    Mat dst = _dst.getMat();
-
     if( src.depth() == CV_8U )
     {
         int ithresh = cvFloor(thresh);
@@ -1656,7 +1661,7 @@ double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double m
     else if( src.depth() == CV_64F )
         ;
     else
-        CV_Error( CV_StsUnsupportedFormat, "" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "" );
 
     parallel_for_(Range(0, dst.rows),
                   ThresholdRunner(src, dst, thresh, maxval, type),
@@ -1704,21 +1709,21 @@ void cv::adaptiveThreshold( InputArray _src, OutputArray _dst, double maxValue,
         meanfloat.convertTo(mean, src.type());
     }
     else
-        CV_Error( CV_StsBadFlag, "Unknown/unsupported adaptive threshold method" );
+        CV_Error( cv::Error::StsBadFlag, "Unknown/unsupported adaptive threshold method" );
 
     int i, j;
     uchar imaxval = saturate_cast<uchar>(maxValue);
     int idelta = type == THRESH_BINARY ? cvCeil(delta) : cvFloor(delta);
     uchar tab[768];
 
-    if( type == CV_THRESH_BINARY )
+    if( type == cv::THRESH_BINARY )
         for( i = 0; i < 768; i++ )
             tab[i] = (uchar)(i - 255 > -idelta ? imaxval : 0);
-    else if( type == CV_THRESH_BINARY_INV )
+    else if( type == cv::THRESH_BINARY_INV )
         for( i = 0; i < 768; i++ )
             tab[i] = (uchar)(i - 255 <= -idelta ? imaxval : 0);
     else
-        CV_Error( CV_StsBadFlag, "Unknown/unsupported threshold type" );
+        CV_Error( cv::Error::StsBadFlag, "Unknown/unsupported threshold type" );
 
     if( src.isContinuous() && mean.isContinuous() && dst.isContinuous() )
     {
diff --git a/modules/imgproc/src/utils.cpp b/modules/imgproc/src/utils.cpp
index 4f45cde5e0fd..518de58eab80 100644
--- a/modules/imgproc/src/utils.cpp
+++ b/modules/imgproc/src/utils.cpp
@@ -51,19 +51,19 @@ CV_IMPL CvSeq* cvPointSeqFromMat( int seq_kind, const CvArr* arr,
     CvMat* mat = (CvMat*)arr;
 
     if( !CV_IS_MAT( mat ))
-        CV_Error( CV_StsBadArg, "Input array is not a valid matrix" );
+        CV_Error( cv::Error::StsBadArg, "Input array is not a valid matrix" );
 
     if( CV_MAT_CN(mat->type) == 1 && mat->width == 2 )
         mat = cvReshape(mat, &hdr, 2);
 
     eltype = CV_MAT_TYPE( mat->type );
     if( eltype != CV_32SC2 && eltype != CV_32FC2 )
-        CV_Error( CV_StsUnsupportedFormat,
+        CV_Error( cv::Error::StsUnsupportedFormat,
         "The matrix can not be converted to point sequence because of "
         "inappropriate element type" );
 
     if( (mat->width != 1 && mat->height != 1) || !CV_IS_MAT_CONT(mat->type))
-        CV_Error( CV_StsBadArg,
+        CV_Error( cv::Error::StsBadArg,
         "The matrix converted to point sequence must be "
         "1-dimensional and continuous" );
 
diff --git a/modules/imgproc/test/ocl/test_color.cpp b/modules/imgproc/test/ocl/test_color.cpp
index bdbc6a90d0d8..55401ba73c79 100644
--- a/modules/imgproc/test/ocl/test_color.cpp
+++ b/modules/imgproc/test/ocl/test_color.cpp
@@ -458,6 +458,43 @@ OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2BGR_YVYU) { performTest(2, 3, CVTCODE(YUV2B
 OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2RGBA_YVYU) { performTest(2, 4, CVTCODE(YUV2RGBA_YVYU)); }
 OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2BGRA_YVYU) { performTest(2, 4, CVTCODE(YUV2BGRA_YVYU)); }
 
+// RGBA -> YUV422
+
+struct CvtColor_RGB2YUV_422 :
+        public CvtColor
+{
+    void generateTestData(int channelsIn, int channelsOut)
+    {
+        const int srcType = CV_MAKE_TYPE(depth, channelsIn);
+        const int dstType = CV_MAKE_TYPE(depth, channelsOut);
+
+        Size roiSize = randomSize(1, MAX_VALUE);
+        roiSize.width *= 2;
+        roiSize.height *= 2;
+
+        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, srcType, 2, 100);
+
+        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, dstType, 6, 16);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
+    }
+};
+
+OCL_TEST_P(CvtColor_RGB2YUV_422, RGB2YUV_UYVY)  { performTest(3, 2, CVTCODE(RGB2YUV_UYVY)); }
+OCL_TEST_P(CvtColor_RGB2YUV_422, BGR2YUV_UYVY)  { performTest(3, 2, CVTCODE(BGR2YUV_UYVY)); }
+OCL_TEST_P(CvtColor_RGB2YUV_422, RGBA2YUV_UYVY) { performTest(4, 2, CVTCODE(RGBA2YUV_UYVY)); }
+OCL_TEST_P(CvtColor_RGB2YUV_422, BGRA2YUV_UYVY) { performTest(4, 2, CVTCODE(BGRA2YUV_UYVY)); }
+OCL_TEST_P(CvtColor_RGB2YUV_422, RGB2YUV_YUY2)  { performTest(3, 2, CVTCODE(RGB2YUV_YUY2)); }
+OCL_TEST_P(CvtColor_RGB2YUV_422, BGR2YUV_YUY2)  { performTest(3, 2, CVTCODE(BGR2YUV_YUY2)); }
+OCL_TEST_P(CvtColor_RGB2YUV_422, RGBA2YUV_YUY2) { performTest(4, 2, CVTCODE(RGBA2YUV_YUY2)); }
+OCL_TEST_P(CvtColor_RGB2YUV_422, BGRA2YUV_YUY2) { performTest(4, 2, CVTCODE(BGRA2YUV_YUY2)); }
+OCL_TEST_P(CvtColor_RGB2YUV_422, RGB2YUV_YVYU)  { performTest(3, 2, CVTCODE(RGB2YUV_YVYU)); }
+OCL_TEST_P(CvtColor_RGB2YUV_422, BGR2YUV_YVYU)  { performTest(3, 2, CVTCODE(BGR2YUV_YVYU)); }
+OCL_TEST_P(CvtColor_RGB2YUV_422, RGBA2YUV_YVYU) { performTest(4, 2, CVTCODE(RGBA2YUV_YVYU)); }
+OCL_TEST_P(CvtColor_RGB2YUV_422, BGRA2YUV_YVYU) { performTest(4, 2, CVTCODE(BGRA2YUV_YVYU)); }
 
 OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor8u,
                             testing::Combine(testing::Values(MatDepth(CV_8U)), Bool()));
@@ -485,6 +522,11 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor_YUV2RGB_422,
                                 testing::Values(MatDepth(CV_8U)),
                                 Bool()));
 
+OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor_RGB2YUV_422,
+                            testing::Combine(
+                                testing::Values(MatDepth(CV_8U)),
+                                Bool()));
+
 } } // namespace opencv_test::ocl
 
 #endif
diff --git a/modules/imgproc/test/ocl/test_houghlines.cpp b/modules/imgproc/test/ocl/test_houghlines.cpp
index 4e7b8917ac9c..64d5b248a3fb 100644
--- a/modules/imgproc/test/ocl/test_houghlines.cpp
+++ b/modules/imgproc/test/ocl/test_houghlines.cpp
@@ -181,4 +181,4 @@ OCL_INSTANTIATE_TEST_CASE_P(Imgproc, HoughLinesP, Combine(Values(100, 150),
 
 } } // namespace opencv_test::ocl
 
-#endif // HAVE_OPENCL
\ No newline at end of file
+#endif // HAVE_OPENCL
diff --git a/modules/imgproc/test/ocl/test_warp.cpp b/modules/imgproc/test/ocl/test_warp.cpp
index b43c9b67320d..852dc465ab07 100644
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@@ -438,6 +438,101 @@ OCL_TEST_P(Remap_INTER_LINEAR, Mat)
     }
 }
 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// remap relative
+
+PARAM_TEST_CASE(RemapRelative, MatDepth, Channels, Interpolation, BorderType, bool)
+{
+    int srcType;
+    int interpolation;
+    int borderType;
+    bool useFixedPoint;
+
+    Scalar val;
+
+    TEST_DECLARE_INPUT_PARAMETER(map1);
+    TEST_DECLARE_INPUT_PARAMETER(map2);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
+
+    UMat uSrc;
+    UMat uMapRelativeX32F;
+    UMat uMapRelativeY32F;
+    UMat uMapAbsoluteX32F;
+    UMat uMapAbsoluteY32F;
+    UMat uMapRelativeX16S;
+    UMat uMapRelativeY16S;
+    UMat uMapAbsoluteX16S;
+    UMat uMapAbsoluteY16S;
+
+    virtual void SetUp()
+    {
+        srcType = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
+        interpolation = GET_PARAM(2);
+        borderType = GET_PARAM(3);
+        useFixedPoint = GET_PARAM(4);
+
+        const int nChannels = CV_MAT_CN(srcType);
+        const cv::Size size(127, 61);
+        cv::Mat data64FC1(1, size.area()*nChannels, CV_64FC1);
+        data64FC1.forEach<double>([&](double& pixel, const int* position) {pixel = static_cast<double>(position[1]);});
+
+        cv::Mat src;
+        data64FC1.reshape(nChannels, size.height).convertTo(src, srcType);
+
+        cv::Mat mapRelativeX32F(size, CV_32FC1);
+        mapRelativeX32F.setTo(cv::Scalar::all(-0.33));
+
+        cv::Mat mapRelativeY32F(size, CV_32FC1);
+        mapRelativeY32F.setTo(cv::Scalar::all(-0.33));
+
+        cv::Mat mapAbsoluteX32F = mapRelativeX32F.clone();
+        mapAbsoluteX32F.forEach<float>([&](float& pixel, const int* position) {
+            pixel += static_cast<float>(position[1]);
+            });
+
+        cv::Mat mapAbsoluteY32F = mapRelativeY32F.clone();
+        mapAbsoluteY32F.forEach<float>([&](float& pixel, const int* position) {
+            pixel += static_cast<float>(position[0]);
+            });
+
+        OCL_ON(src.copyTo(uSrc));
+        OCL_ON(mapRelativeX32F.copyTo(uMapRelativeX32F));
+        OCL_ON(mapRelativeY32F.copyTo(uMapRelativeY32F));
+        OCL_ON(mapAbsoluteX32F.copyTo(uMapAbsoluteX32F));
+        OCL_ON(mapAbsoluteY32F.copyTo(uMapAbsoluteY32F));
+
+        if (useFixedPoint)
+        {
+            const bool nninterpolation = (interpolation == cv::INTER_NEAREST) || (interpolation == cv::INTER_NEAREST_EXACT);
+            OCL_ON(cv::convertMaps(uMapAbsoluteX32F, uMapAbsoluteY32F, uMapAbsoluteX16S, uMapAbsoluteY16S, CV_16SC2, nninterpolation));
+            OCL_ON(cv::convertMaps(uMapRelativeX32F, uMapRelativeY32F, uMapRelativeX16S, uMapRelativeY16S, CV_16SC2, nninterpolation));
+        }
+    }
+};
+
+OCL_TEST_P(RemapRelative, Mat)
+{
+    cv::UMat uDstAbsolute;
+    cv::UMat uDstRelative;
+    if (useFixedPoint)
+    {
+        OCL_ON(cv::remap(uSrc, uDstAbsolute, uMapAbsoluteX16S, uMapAbsoluteY16S, interpolation, borderType));
+        OCL_ON(cv::remap(uSrc, uDstRelative, uMapRelativeX16S, uMapRelativeY16S, interpolation | WARP_RELATIVE_MAP, borderType));
+    }
+    else
+    {
+        OCL_ON(cv::remap(uSrc, uDstAbsolute, uMapAbsoluteX32F, uMapAbsoluteY32F, interpolation, borderType));
+        OCL_ON(cv::remap(uSrc, uDstRelative, uMapRelativeX32F, uMapRelativeY32F, interpolation | WARP_RELATIVE_MAP, borderType));
+    }
+
+    cv::Mat dstAbsolute;
+    OCL_ON(uDstAbsolute.copyTo(dstAbsolute));
+    cv::Mat dstRelative;
+    OCL_ON(uDstRelative.copyTo(dstRelative));
+
+    EXPECT_MAT_NEAR(dstAbsolute, dstRelative, dstAbsolute.depth() == CV_32F ? 1e-3 : 1.0);
+}
+
 /////////////////////////////////////////////////////////////////////////////////////
 
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, WarpAffine, Combine(
@@ -515,6 +610,20 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Remap_INTER_NEAREST, Combine(
                                    (BorderType)BORDER_REFLECT_101),
                             Bool()));
 
+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, RemapRelative, Combine(
+                            Values(CV_8U, CV_16U, CV_32F, CV_64F),
+                            Values(1, 3, 4),
+                            Values((Interpolation)INTER_NEAREST,
+                                   (Interpolation)INTER_LINEAR,
+                                   (Interpolation)INTER_CUBIC,
+                                   (Interpolation)INTER_LANCZOS4),
+                            Values((BorderType)BORDER_CONSTANT,
+                                   (BorderType)BORDER_REPLICATE,
+                                   (BorderType)BORDER_WRAP,
+                                   (BorderType)BORDER_REFLECT,
+                                   (BorderType)BORDER_REFLECT_101),
+                            Bool()));
+
 } } // namespace opencv_test::ocl
 
 #endif // HAVE_OPENCL
diff --git a/modules/imgproc/test/test_boundingrect.cpp b/modules/imgproc/test/test_boundingrect.cpp
index 52a84e5cfe94..cf70b66e04d2 100644
--- a/modules/imgproc/test/test_boundingrect.cpp
+++ b/modules/imgproc/test/test_boundingrect.cpp
@@ -1,143 +1,99 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
 
+#include "opencv2/core/types.hpp"
 #include "test_precomp.hpp"
 
-namespace opencv_test { namespace {
-
-#define IMGPROC_BOUNDINGRECT_ERROR_DIFF 1
-
-#define MESSAGE_ERROR_DIFF "Bounding rectangle found by boundingRect function is incorrect."
-
-class CV_BoundingRectTest: public cvtest::ArrayTest
-{
-public:
-    CV_BoundingRectTest();
-    ~CV_BoundingRectTest();
-
-protected:
-    void run (int);
+using namespace cv;
+using namespace std;
 
-private:
-    template <typename T> void generate_src_points(vector <Point_<T> >& src, int n);
-    template <typename T> cv::Rect get_bounding_rect(const vector <Point_<T> > src);
-    template <typename T> bool checking_function_work(vector <Point_<T> >& src, int type);
-};
+namespace opencv_test { namespace {
 
-CV_BoundingRectTest::CV_BoundingRectTest() {}
-CV_BoundingRectTest::~CV_BoundingRectTest() {}
 
-template <typename T> void CV_BoundingRectTest::generate_src_points(vector <Point_<T> >& src, int n)
+template <typename T>
+cv::Rect calcBoundingRect(Mat pts)
 {
-    src.clear();
-    for (int i = 0; i < n; ++i)
-        src.push_back(Point_<T>(cv::randu<T>(), cv::randu<T>()));
+    CV_Assert(pts.type() == CV_32FC2 || pts.type() == CV_32SC2);
+    CV_Assert(pts.size().width == 1 && pts.size().height > 0);
+    const int N = pts.size().height;
+    // NOTE: using ::lowest(), not ::min()
+    T min_w = std::numeric_limits<T>::max(), max_w = std::numeric_limits<T>::lowest();
+    T min_h = min_w, max_h = max_w;
+    for (int i = 0; i < N; ++i)
+    {
+        const Point_<T> & pt = pts.at<Point_<T>>(i, 0);
+        min_w = std::min<T>(pt.x, min_w);
+        max_w = std::max<T>(pt.x, max_w);
+        min_h = std::min<T>(pt.y, min_h);
+        max_h = std::max<T>(pt.y, max_h);
+    }
+    return Rect(cvFloor(min_w), cvFloor(min_h), cvFloor(max_w) - cvFloor(min_w) + 1, cvFloor(max_h) - cvFloor(min_h) + 1);
 }
 
-template <typename T> cv::Rect CV_BoundingRectTest::get_bounding_rect(const vector <Point_<T> > src)
-{
-    int n = (int)src.size();
-    T min_w = std::numeric_limits<T>::max(), max_w = std::numeric_limits<T>::min();
-    T min_h = min_w, max_h = max_w;
+typedef ::testing::TestWithParam<int> Imgproc_BoundingRect_Types;
 
-    for (int i = 0; i < n; ++i)
+TEST_P(Imgproc_BoundingRect_Types, accuracy)
+{
+    const int depth = GetParam();
+    RNG& rng = ::cvtest::TS::ptr()->get_rng();
+    for (int k = 0; k < 1000; ++k)
     {
-        min_w = std::min<T>(src.at(i).x, min_w);
-        max_w = std::max<T>(src.at(i).x, max_w);
-        min_h = std::min<T>(src.at(i).y, min_h);
-        max_h = std::max<T>(src.at(i).y, max_h);
+        SCOPED_TRACE(cv::format("k=%d", k));
+        const int sz = rng.uniform(1, 10000);
+        Mat src(sz, 1, CV_MAKETYPE(depth, 2));
+        rng.fill(src, RNG::UNIFORM, Scalar(-100000, -100000), Scalar(100000, 100000));
+        Rect reference;
+        if (depth == CV_32F)
+            reference = calcBoundingRect<float>(src);
+        else if (depth == CV_32S)
+            reference = calcBoundingRect<int>(src);
+        else
+            CV_Error(Error::StsError, "Test error");
+        Rect result = cv::boundingRect(src);
+        EXPECT_EQ(reference, result);
     }
-
-    return Rect((int)min_w, (int)min_h, (int)max_w-(int)min_w + 1, (int)max_h-(int)min_h + 1);
 }
 
-template <typename T> bool CV_BoundingRectTest::checking_function_work(vector <Point_<T> >& src, int type)
+TEST_P(Imgproc_BoundingRect_Types, alignment)
 {
-    const int MAX_COUNT_OF_POINTS = 1000;
-    const int N = 10000;
-
-    for (int k = 0; k < N; ++k)
+    const int depth = GetParam();
+    const int SZ = 100;
+    int idata[SZ];
+    float fdata[SZ];
+    for (int i = 0; i < SZ; ++i)
     {
+        idata[i] = i;
+        fdata[i] = (float)i;
+    }
+    for (int i = 0; i < 10; ++i)
+    {
+        for (int len = 1; len < 40; ++len)
+        {
+            SCOPED_TRACE(cv::format("i=%d, len=%d", i, len));
+            Mat sub(len, 1, CV_MAKETYPE(depth, 2), (depth == CV_32S) ? (void*)(idata + i) : (void*)(fdata + i));
+            EXPECT_NO_THROW(boundingRect(sub));
+        }
+    }
+}
 
-        RNG& rng = ts->get_rng();
-
-        int n = rng.next()%MAX_COUNT_OF_POINTS + 1;
+INSTANTIATE_TEST_CASE_P(, Imgproc_BoundingRect_Types, ::testing::Values(CV_32S, CV_32F));
 
-        generate_src_points <T> (src, n);
 
-        cv::Rect right = get_bounding_rect <T> (src);
+TEST(Imgproc_BoundingRect, bug_24217)
+{
+    for (int image_width = 3; image_width < 20; image_width++)
+    {
+        for (int image_height = 1; image_height < 15; image_height++)
+        {
+            cv::Rect rect(0, image_height - 1, 3, 1);
 
-        cv::Rect rect[2] = { boundingRect(src), boundingRect(Mat(src)) };
+            cv::Mat image(cv::Size(image_width, image_height), CV_8UC1, cv::Scalar(0));
+            image(rect) = 255;
 
-        for (int i = 0; i < 2; ++i) if (rect[i] != right)
-        {
-            cout << endl; cout << "Checking for the work of boundingRect function..." << endl;
-            cout << "Type of src points: ";
-            switch (type)
-            {
-            case 0: {cout << "INT"; break;}
-            case 1: {cout << "FLOAT"; break;}
-            default: break;
-            }
-            cout << endl;
-            cout << "Src points are stored as "; if (i == 0) cout << "VECTOR" << endl; else cout << "MAT" << endl;
-            cout << "Number of points: " << n << endl;
-            cout << "Right rect (x, y, w, h): [" << right.x << ", " << right.y << ", " << right.width << ", " << right.height << "]" << endl;
-            cout << "Result rect (x, y, w, h): [" << rect[i].x << ", " << rect[i].y << ", " << rect[i].width << ", " << rect[i].height << "]" << endl;
-            cout << endl;
-            CV_Error(IMGPROC_BOUNDINGRECT_ERROR_DIFF, MESSAGE_ERROR_DIFF);
+            ASSERT_EQ(boundingRect(image), rect);
         }
-
     }
-
-    return true;
 }
 
-void CV_BoundingRectTest::run(int)
-{
-    vector <Point> src_veci; if (!checking_function_work(src_veci, 0)) return;
-    vector <Point2f> src_vecf; checking_function_work(src_vecf, 1);
-}
-
-TEST (Imgproc_BoundingRect, accuracy) { CV_BoundingRectTest test; test.safe_run(); }
-
 }} // namespace
diff --git a/modules/imgproc/test/test_color.cpp b/modules/imgproc/test/test_color.cpp
index 2c89932adc1c..5a5898415c8c 100644
--- a/modules/imgproc/test/test_color.cpp
+++ b/modules/imgproc/test/test_color.cpp
@@ -1824,7 +1824,7 @@ void CV_ColorBayerTest::prepare_to_validation( int /*test_case_idx*/ )
     else if( depth == CV_16U )
         bayer2BGR_<ushort>(src, dst, fwd_code);
     else
-        CV_Error(CV_StsUnsupportedFormat, "");
+        CV_Error(cv::Error::StsUnsupportedFormat, "");
 }
 
 
diff --git a/modules/imgproc/test/test_contours.cpp b/modules/imgproc/test/test_contours.cpp
index 6f4315225e2b..fbd3cabff8f0 100644
--- a/modules/imgproc/test/test_contours.cpp
+++ b/modules/imgproc/test/test_contours.cpp
@@ -289,7 +289,7 @@ int CV_FindContourTest::validate_test_results( int /*test_case_idx*/ )
 {
     int code = cvtest::TS::OK;
 
-    cvCmpS( img[0], 0, img[0], CV_CMP_GT );
+    cvCmpS( img[0], 0, img[0], cv::CMP_GT );
 
     if( count != count2 )
     {
@@ -408,7 +408,12 @@ int CV_FindContourTest::validate_test_results( int /*test_case_idx*/ )
     return code;
 }
 
-TEST(Imgproc_FindContours, accuracy) { CV_FindContourTest test; test.safe_run(); }
+TEST(Imgproc_FindContours, accuracy)
+{
+    applyTestTag(CV_TEST_TAG_MEMORY_512MB);
+    CV_FindContourTest test;
+    test.safe_run();
+}
 
 //rotate/flip a quadrant appropriately
 static void rot(int n, int *x, int *y, int rx, int ry)
@@ -459,7 +464,6 @@ TEST(Imgproc_FindContours, hilbert)
     dilate(img, img, Mat());
     vector<vector<Point> > contours;
     findContours(img, contours, noArray(), RETR_LIST, CHAIN_APPROX_SIMPLE);
-    printf("ncontours = %d, contour[0].npoints=%d\n", (int)contours.size(), (int)contours[0].size());
     img.setTo(Scalar::all(0));
 
     drawContours(img, contours, 0, Scalar::all(255), 1);
@@ -530,10 +534,12 @@ TEST(Imgproc_FindContours, regression_4363_shared_nbd)
 
     if (found)
     {
+        ASSERT_EQ(contours.size(), hierarchy.size());
         EXPECT_LT(hierarchy[index][3], 0) << "Desired result: (7,9) has no parent - Actual result: parent of (7,9) is another contour. index = " << index;
     }
 }
 
+
 TEST(Imgproc_PointPolygonTest, regression_10222)
 {
     vector<Point> contour;
diff --git a/modules/imgproc/test/test_contours_new.cpp b/modules/imgproc/test/test_contours_new.cpp
new file mode 100644
index 000000000000..fbeba0658788
--- /dev/null
+++ b/modules/imgproc/test/test_contours_new.cpp
@@ -0,0 +1,608 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+#include "opencv2/imgproc/detail/legacy.hpp"
+
+#define CHECK_OLD 1
+
+namespace opencv_test { namespace {
+
+// debug function
+template <typename T>
+inline static void print_pts(const T& c)
+{
+    for (const auto& one_pt : c)
+    {
+        cout << one_pt << " ";
+    }
+    cout << endl;
+}
+
+// debug function
+template <typename T>
+inline static void print_pts_2(vector<T>& cs)
+{
+    int cnt = 0;
+    cout << "Contours:" << endl;
+    for (const auto& one_c : cs)
+    {
+        cout << cnt++ << " : ";
+        print_pts(one_c);
+    }
+};
+
+// draw 1-2 px blob with orientation defined by 'kind'
+template <typename T>
+inline static void drawSmallContour(Mat& img, Point pt, int kind, int color_)
+{
+    const T color = static_cast<T>(color_);
+    img.at<T>(pt) = color;
+    switch (kind)
+    {
+        case 1: img.at<T>(pt + Point(1, 0)) = color; break;
+        case 2: img.at<T>(pt + Point(1, -1)) = color; break;
+        case 3: img.at<T>(pt + Point(0, -1)) = color; break;
+        case 4: img.at<T>(pt + Point(-1, -1)) = color; break;
+        case 5: img.at<T>(pt + Point(-1, 0)) = color; break;
+        case 6: img.at<T>(pt + Point(-1, 1)) = color; break;
+        case 7: img.at<T>(pt + Point(0, 1)) = color; break;
+        case 8: img.at<T>(pt + Point(1, 1)) = color; break;
+        default: break;
+    }
+}
+
+inline static void drawContours(Mat& img,
+                                const vector<vector<Point>>& contours,
+                                const Scalar& color = Scalar::all(255))
+{
+    for (const auto& contour : contours)
+    {
+        for (size_t n = 0, end = contour.size(); n < end; ++n)
+        {
+            size_t m = n + 1;
+            if (n == end - 1)
+                m = 0;
+            line(img, contour[m], contour[n], color, 1, LINE_8);
+        }
+    }
+}
+
+//==================================================================================================
+
+// Test parameters - mode + method
+typedef testing::TestWithParam<tuple<int, int>> Imgproc_FindContours_Modes1;
+
+
+// Draw random rectangle and find contours
+//
+TEST_P(Imgproc_FindContours_Modes1, rectangle)
+{
+    const int mode = get<0>(GetParam());
+    const int method = get<1>(GetParam());
+
+    const size_t ITER = 100;
+    RNG rng = TS::ptr()->get_rng();
+
+    for (size_t i = 0; i < ITER; ++i)
+    {
+        SCOPED_TRACE(cv::format("i=%zu", i));
+        const Size sz(rng.uniform(640, 1920), rng.uniform(480, 1080));
+        Mat img(sz, CV_8UC1, Scalar::all(0));
+        Mat img32s(sz, CV_32SC1, Scalar::all(0));
+        const Rect r(Point(rng.uniform(1, sz.width / 2 - 1), rng.uniform(1, sz.height / 2)),
+                     Point(rng.uniform(sz.width / 2 - 1, sz.width - 1),
+                           rng.uniform(sz.height / 2 - 1, sz.height - 1)));
+        rectangle(img, r, Scalar::all(255));
+        rectangle(img32s, r, Scalar::all(255), FILLED);
+
+        const vector<Point> ext_ref {r.tl(),
+                                     r.tl() + Point(0, r.height - 1),
+                                     r.br() + Point(-1, -1),
+                                     r.tl() + Point(r.width - 1, 0)};
+        const vector<Point> int_ref {ext_ref[0] + Point(0, 1),
+                                     ext_ref[0] + Point(1, 0),
+                                     ext_ref[3] + Point(-1, 0),
+                                     ext_ref[3] + Point(0, 1),
+                                     ext_ref[2] + Point(0, -1),
+                                     ext_ref[2] + Point(-1, 0),
+                                     ext_ref[1] + Point(1, 0),
+                                     ext_ref[1] + Point(0, -1)};
+        const size_t ext_perimeter = r.width * 2 + r.height * 2;
+        const size_t int_perimeter = ext_perimeter - 4;
+
+        vector<vector<Point>> contours;
+        vector<vector<schar>> chains;
+        vector<Vec4i> hierarchy;
+
+        // run functionn
+        if (mode == RETR_FLOODFILL)
+            if (method == 0)
+                findContours(img32s, chains, hierarchy, mode, method);
+            else
+                findContours(img32s, contours, hierarchy, mode, method);
+        else if (method == 0)
+            findContours(img, chains, hierarchy, mode, method);
+        else
+            findContours(img, contours, hierarchy, mode, method);
+
+        // verify results
+        if (mode == RETR_EXTERNAL)
+        {
+            if (method == 0)
+            {
+                ASSERT_EQ(1U, chains.size());
+            }
+            else
+            {
+                ASSERT_EQ(1U, contours.size());
+                if (method == CHAIN_APPROX_NONE)
+                {
+                    EXPECT_EQ(int_perimeter, contours[0].size());
+                }
+                else if (method == CHAIN_APPROX_SIMPLE)
+                {
+                    EXPECT_MAT_NEAR(Mat(ext_ref), Mat(contours[0]), 0);
+                }
+            }
+        }
+        else
+        {
+            if (method == 0)
+            {
+                ASSERT_EQ(2U, chains.size());
+            }
+            else
+            {
+                ASSERT_EQ(2U, contours.size());
+                if (mode == RETR_LIST)
+                {
+                    if (method == CHAIN_APPROX_NONE)
+                    {
+                        EXPECT_EQ(int_perimeter - 4, contours[0].size());
+                        EXPECT_EQ(int_perimeter, contours[1].size());
+                    }
+                    else if (method == CHAIN_APPROX_SIMPLE)
+                    {
+                        EXPECT_MAT_NEAR(Mat(int_ref), Mat(contours[0]), 0);
+                        EXPECT_MAT_NEAR(Mat(ext_ref), Mat(contours[1]), 0);
+                    }
+                }
+                else if (mode == RETR_CCOMP || mode == RETR_TREE)
+                {
+                    if (method == CHAIN_APPROX_NONE)
+                    {
+                        EXPECT_EQ(int_perimeter, contours[0].size());
+                        EXPECT_EQ(int_perimeter - 4, contours[1].size());
+                    }
+                    else if (method == CHAIN_APPROX_SIMPLE)
+                    {
+                        EXPECT_MAT_NEAR(Mat(ext_ref), Mat(contours[0]), 0);
+                        EXPECT_MAT_NEAR(Mat(int_ref), Mat(contours[1]), 0);
+                    }
+                }
+                else if (mode == RETR_FLOODFILL)
+                {
+                    if (method == CHAIN_APPROX_NONE)
+                    {
+                        EXPECT_EQ(int_perimeter + 4, contours[0].size());
+                    }
+                    else if (method == CHAIN_APPROX_SIMPLE)
+                    {
+                        EXPECT_EQ(int_ref.size(), contours[0].size());
+                        EXPECT_MAT_NEAR(Mat(ext_ref), Mat(contours[1]), 0);
+                    }
+                }
+            }
+        }
+
+#if CHECK_OLD
+        if (method != 0)  // old doesn't support chain codes
+        {
+            if (mode != RETR_FLOODFILL)
+            {
+                vector<vector<Point>> contours_o;
+                vector<Vec4i> hierarchy_o;
+                findContours_legacy(img, contours_o, hierarchy_o, mode, method);
+                ASSERT_EQ(contours.size(), contours_o.size());
+                for (size_t j = 0; j < contours.size(); ++j)
+                {
+                    SCOPED_TRACE(format("contour %zu", j));
+                    EXPECT_MAT_NEAR(Mat(contours[j]), Mat(contours_o[j]), 0);
+                }
+                EXPECT_MAT_NEAR(Mat(hierarchy), Mat(hierarchy_o), 0);
+            }
+            else
+            {
+                vector<vector<Point>> contours_o;
+                vector<Vec4i> hierarchy_o;
+                findContours_legacy(img32s, contours_o, hierarchy_o, mode, method);
+                ASSERT_EQ(contours.size(), contours_o.size());
+                for (size_t j = 0; j < contours.size(); ++j)
+                {
+                    SCOPED_TRACE(format("contour %zu", j));
+                    EXPECT_MAT_NEAR(Mat(contours[j]), Mat(contours_o[j]), 0);
+                }
+                EXPECT_MAT_NEAR(Mat(hierarchy), Mat(hierarchy_o), 0);
+            }
+        }
+#endif
+    }
+}
+
+
+// Draw many small 1-2px blobs and find contours
+//
+TEST_P(Imgproc_FindContours_Modes1, small)
+{
+    const int mode = get<0>(GetParam());
+    const int method = get<1>(GetParam());
+
+    const size_t DIM = 1000;
+    const Size sz(DIM, DIM);
+    const int num = (DIM / 10) * (DIM / 10);  // number of 10x10 squares
+
+    Mat img(sz, CV_8UC1, Scalar::all(0));
+    Mat img32s(sz, CV_32SC1, Scalar::all(0));
+    vector<Point> pts;
+    int extra_contours_32s = 0;
+    for (int j = 0; j < num; ++j)
+    {
+        const int kind = j % 9;
+        Point pt {(j % 100) * 10 + 4, (j / 100) * 10 + 4};
+        drawSmallContour<uchar>(img, pt, kind, 255);
+        drawSmallContour<int>(img32s, pt, kind, j + 1);
+        pts.push_back(pt);
+        // NOTE: for some reason these small diagonal contours (NW, SE)
+        //       result in 2 external contours for FLOODFILL mode
+        if (kind == 8 || kind == 4)
+            ++extra_contours_32s;
+    }
+    {
+        vector<vector<Point>> contours;
+        vector<vector<schar>> chains;
+        vector<Vec4i> hierarchy;
+
+        if (mode == RETR_FLOODFILL)
+        {
+            if (method == 0)
+            {
+                findContours(img32s, chains, hierarchy, mode, method);
+                ASSERT_EQ(pts.size() * 2 + extra_contours_32s, chains.size());
+            }
+            else
+            {
+                findContours(img32s, contours, hierarchy, mode, method);
+                ASSERT_EQ(pts.size() * 2 + extra_contours_32s, contours.size());
+#if CHECK_OLD
+                vector<vector<Point>> contours_o;
+                vector<Vec4i> hierarchy_o;
+                findContours_legacy(img32s, contours_o, hierarchy_o, mode, method);
+                ASSERT_EQ(contours.size(), contours_o.size());
+                for (size_t i = 0; i < contours.size(); ++i)
+                {
+                    SCOPED_TRACE(format("contour %zu", i));
+                    EXPECT_MAT_NEAR(Mat(contours[i]), Mat(contours_o[i]), 0);
+                }
+                EXPECT_MAT_NEAR(Mat(hierarchy), Mat(hierarchy_o), 0);
+#endif
+            }
+        }
+        else
+        {
+            if (method == 0)
+            {
+                findContours(img, chains, hierarchy, mode, method);
+                ASSERT_EQ(pts.size(), chains.size());
+            }
+            else
+            {
+                findContours(img, contours, hierarchy, mode, method);
+                ASSERT_EQ(pts.size(), contours.size());
+#if CHECK_OLD
+                vector<vector<Point>> contours_o;
+                vector<Vec4i> hierarchy_o;
+                findContours_legacy(img, contours_o, hierarchy_o, mode, method);
+                ASSERT_EQ(contours.size(), contours_o.size());
+                for (size_t i = 0; i < contours.size(); ++i)
+                {
+                    SCOPED_TRACE(format("contour %zu", i));
+                    EXPECT_MAT_NEAR(Mat(contours[i]), Mat(contours_o[i]), 0);
+                }
+                EXPECT_MAT_NEAR(Mat(hierarchy), Mat(hierarchy_o), 0);
+#endif
+            }
+        }
+    }
+}
+
+
+// Draw many nested rectangles and find contours
+//
+TEST_P(Imgproc_FindContours_Modes1, deep)
+{
+    const int mode = get<0>(GetParam());
+    const int method = get<1>(GetParam());
+
+    const size_t DIM = 1000;
+    const Size sz(DIM, DIM);
+    const size_t NUM = 249U;
+    Mat img(sz, CV_8UC1, Scalar::all(0));
+    Mat img32s(sz, CV_32SC1, Scalar::all(0));
+    Rect rect(1, 1, 998, 998);
+    for (size_t i = 0; i < NUM; ++i)
+    {
+        rectangle(img, rect, Scalar::all(255));
+        rectangle(img32s, rect, Scalar::all((double)i + 1), FILLED);
+        rect.x += 2;
+        rect.y += 2;
+        rect.width -= 4;
+        rect.height -= 4;
+    }
+    {
+        vector<vector<Point>> contours {{{0, 0}, {1, 1}}};
+        vector<vector<schar>> chains {{1, 2, 3}};
+        vector<Vec4i> hierarchy;
+
+        if (mode == RETR_FLOODFILL)
+        {
+            if (method == 0)
+            {
+                findContours(img32s, chains, hierarchy, mode, method);
+                ASSERT_EQ(2 * NUM, chains.size());
+            }
+            else
+            {
+                findContours(img32s, contours, hierarchy, mode, method);
+                ASSERT_EQ(2 * NUM, contours.size());
+#if CHECK_OLD
+                vector<vector<Point>> contours_o;
+                vector<Vec4i> hierarchy_o;
+                findContours_legacy(img32s, contours_o, hierarchy_o, mode, method);
+                ASSERT_EQ(contours.size(), contours_o.size());
+                for (size_t i = 0; i < contours.size(); ++i)
+                {
+                    SCOPED_TRACE(format("contour %zu", i));
+                    EXPECT_MAT_NEAR(Mat(contours[i]), Mat(contours_o[i]), 0);
+                }
+                EXPECT_MAT_NEAR(Mat(hierarchy), Mat(hierarchy_o), 0);
+#endif
+            }
+        }
+        else
+        {
+            const size_t expected_count = (mode == RETR_EXTERNAL) ? 1U : 2 * NUM;
+            if (method == 0)
+            {
+                findContours(img, chains, hierarchy, mode, method);
+                ASSERT_EQ(expected_count, chains.size());
+            }
+            else
+            {
+                findContours(img, contours, hierarchy, mode, method);
+                ASSERT_EQ(expected_count, contours.size());
+#if CHECK_OLD
+                vector<vector<Point>> contours_o;
+                vector<Vec4i> hierarchy_o;
+                findContours_legacy(img, contours_o, hierarchy_o, mode, method);
+                ASSERT_EQ(contours.size(), contours_o.size());
+                for (size_t i = 0; i < contours.size(); ++i)
+                {
+                    SCOPED_TRACE(format("contour %zu", i));
+                    EXPECT_MAT_NEAR(Mat(contours[i]), Mat(contours_o[i]), 0);
+                }
+                EXPECT_MAT_NEAR(Mat(hierarchy), Mat(hierarchy_o), 0);
+#endif
+            }
+        }
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    ,
+    Imgproc_FindContours_Modes1,
+    testing::Combine(
+        testing::Values(RETR_EXTERNAL, RETR_LIST, RETR_CCOMP, RETR_TREE, RETR_FLOODFILL),
+        testing::Values(0,
+                        CHAIN_APPROX_NONE,
+                        CHAIN_APPROX_SIMPLE,
+                        CHAIN_APPROX_TC89_L1,
+                        CHAIN_APPROX_TC89_KCOS)));
+
+//==================================================================================================
+
+typedef testing::TestWithParam<tuple<int, int>> Imgproc_FindContours_Modes2;
+
+// Very approximate backport of an old accuracy test
+//
+TEST_P(Imgproc_FindContours_Modes2, new_accuracy)
+{
+    const int mode = get<0>(GetParam());
+    const int method = get<1>(GetParam());
+
+    RNG& rng = TS::ptr()->get_rng();
+    const int blob_count = rng.uniform(1, 10);
+    const Size sz(rng.uniform(640, 1920), rng.uniform(480, 1080));
+    const int blob_sz = 50;
+
+    // prepare image
+    Mat img(sz, CV_8UC1, Scalar::all(0));
+    vector<RotatedRect> rects;
+    for (int i = 0; i < blob_count; ++i)
+    {
+        const Point2f center((float)rng.uniform(blob_sz, sz.width - blob_sz),
+                             (float)rng.uniform(blob_sz, sz.height - blob_sz));
+        const Size2f rsize((float)rng.uniform(1, blob_sz), (float)rng.uniform(1, blob_sz));
+        RotatedRect rect(center, rsize, rng.uniform(0.f, 180.f));
+        rects.push_back(rect);
+        ellipse(img, rect, Scalar::all(100), FILLED);
+    }
+
+    // draw contours manually
+    Mat cont_img(sz, CV_8UC1, Scalar::all(0));
+    for (int y = 1; y < sz.height - 1; ++y)
+    {
+        for (int x = 1; x < sz.width - 1; ++x)
+        {
+            if (img.at<uchar>(y, x) != 0 &&
+                ((img.at<uchar>(y - 1, x) == 0) || (img.at<uchar>(y + 1, x) == 0) ||
+                 (img.at<uchar>(y, x + 1) == 0) || (img.at<uchar>(y, x - 1) == 0)))
+            {
+                cont_img.at<uchar>(y, x) = 255;
+            }
+        }
+    }
+
+    // find contours
+    vector<vector<Point>> contours;
+    vector<Vec4i> hierarchy;
+    findContours(img, contours, hierarchy, mode, method);
+
+    // 0 < contours <= rects
+    EXPECT_GT(contours.size(), 0U);
+    EXPECT_GE(rects.size(), contours.size());
+
+    // draw contours
+    Mat res_img(sz, CV_8UC1, Scalar::all(0));
+    drawContours(res_img, contours);
+
+    // compare resulting drawn contours with manually drawn contours
+    const double diff1 = cvtest::norm(cont_img, res_img, NORM_L1) / 255;
+
+    if (method == CHAIN_APPROX_NONE || method == CHAIN_APPROX_SIMPLE)
+    {
+        EXPECT_EQ(0., diff1);
+    }
+#if CHECK_OLD
+    vector<vector<Point>> contours_o;
+    vector<Vec4i> hierarchy_o;
+    findContours(img, contours_o, hierarchy_o, mode, method);
+    ASSERT_EQ(contours_o.size(), contours.size());
+    for (size_t i = 0; i < contours_o.size(); ++i)
+    {
+        SCOPED_TRACE(format("contour = %zu", i));
+        EXPECT_MAT_NEAR(Mat(contours_o[i]), Mat(contours[i]), 0);
+    }
+    EXPECT_MAT_NEAR(Mat(hierarchy_o), Mat(hierarchy), 0);
+#endif
+}
+
+TEST_P(Imgproc_FindContours_Modes2, approx)
+{
+    const int mode = get<0>(GetParam());
+    const int method = get<1>(GetParam());
+
+    const Size sz {500, 500};
+    Mat img = Mat::zeros(sz, CV_8UC1);
+
+    for (int c = 0; c < 4; ++c)
+    {
+        if (c != 0)
+        {
+            // noise + filter + threshold
+            RNG& rng = TS::ptr()->get_rng();
+            cvtest::randUni(rng, img, 0, 255);
+
+            Mat fimg;
+            boxFilter(img, fimg, CV_8U, Size(5, 5));
+
+            Mat timg;
+            const int level = 44 + c * 42;
+            // 'level' goes through:
+            // 86 - some black speckles on white
+            // 128 - 50/50 black/white
+            // 170 - some white speckles on black
+            cv::threshold(fimg, timg, level, 255, THRESH_BINARY);
+        }
+        else
+        {
+            // circle with cut
+            const Point center {250, 250};
+            const int r {20};
+            const Point cut {r, r};
+            circle(img, center, r, Scalar(255), FILLED);
+            rectangle(img, center, center + cut, Scalar(0), FILLED);
+        }
+
+        vector<vector<Point>> contours;
+        vector<Vec4i> hierarchy;
+        findContours(img, contours, hierarchy, mode, method);
+
+#if CHECK_OLD
+        // NOTE: old and new function results might not match when approximation mode is TC89.
+        // Currently this test passes, but might fail for other random data.
+        // See https://github.com/opencv/opencv/issues/25663 for details.
+        vector<vector<Point>> contours_o;
+        vector<Vec4i> hierarchy_o;
+        findContours_legacy(img, contours_o, hierarchy_o, mode, method);
+        ASSERT_EQ(contours_o.size(), contours.size());
+        for (size_t i = 0; i < contours_o.size(); ++i)
+        {
+            SCOPED_TRACE(format("c = %d, contour = %zu", c, i));
+            EXPECT_MAT_NEAR(Mat(contours_o[i]), Mat(contours[i]), 0);
+        }
+        EXPECT_MAT_NEAR(Mat(hierarchy_o), Mat(hierarchy), 0);
+#endif
+        // TODO: check something
+    }
+}
+
+// TODO: offset test
+
+// no RETR_FLOODFILL - no CV_32S input images
+INSTANTIATE_TEST_CASE_P(
+    ,
+    Imgproc_FindContours_Modes2,
+    testing::Combine(testing::Values(RETR_EXTERNAL, RETR_LIST, RETR_CCOMP, RETR_TREE),
+                     testing::Values(CHAIN_APPROX_NONE,
+                                     CHAIN_APPROX_SIMPLE,
+                                     CHAIN_APPROX_TC89_L1,
+                                     CHAIN_APPROX_TC89_KCOS)));
+
+TEST(Imgproc_FindContours, link_runs)
+{
+    const Size sz {500, 500};
+    Mat img = Mat::zeros(sz, CV_8UC1);
+
+    // noise + filter + threshold
+    RNG& rng = TS::ptr()->get_rng();
+    cvtest::randUni(rng, img, 0, 255);
+
+    Mat fimg;
+    boxFilter(img, fimg, CV_8U, Size(5, 5));
+
+    const int level = 135;
+    cv::threshold(fimg, img, level, 255, THRESH_BINARY);
+
+    vector<vector<Point>> contours;
+    vector<Vec4i> hierarchy;
+    findContoursLinkRuns(img, contours, hierarchy);
+
+    if (cvtest::debugLevel >= 10)
+    {
+        print_pts_2(contours);
+
+        Mat res = Mat::zeros(sz, CV_8UC1);
+        drawContours(res, contours);
+        imshow("res", res);
+        imshow("img", img);
+        waitKey(0);
+    }
+
+#if CHECK_OLD
+    vector<vector<Point>> contours_o;
+    vector<Vec4i> hierarchy_o;
+    findContours_legacy(img, contours_o, hierarchy_o, 0, 5);  // CV_LINK_RUNS method
+    ASSERT_EQ(contours_o.size(), contours.size());
+    for (size_t i = 0; i < contours_o.size(); ++i)
+    {
+        SCOPED_TRACE(format("contour = %zu", i));
+        EXPECT_MAT_NEAR(Mat(contours_o[i]), Mat(contours[i]), 0);
+    }
+    EXPECT_MAT_NEAR(Mat(hierarchy_o), Mat(hierarchy), 0);
+#endif
+}
+
+}}  // namespace opencv_test
diff --git a/modules/imgproc/test/test_convhull.cpp b/modules/imgproc/test/test_convhull.cpp
index 70251e3a2517..de45bf2c72cb 100644
--- a/modules/imgproc/test/test_convhull.cpp
+++ b/modules/imgproc/test/test_convhull.cpp
@@ -747,7 +747,7 @@ int CV_MinAreaRectTest::validate_test_results( int test_case_idx )
         cvCircle(img,cvPoint(cvRound(p[i].x*a+b),cvRound(p[i].y*c+d)), 3, CV_RGB(0,255,0), -1 );
     for( i = 0; i < n; i++ )
         bp[i] = cvPoint(cvRound(box_pt[i].x*a+b),cvRound(box_pt[i].y*c+d));
-    cvPolyLine( img, &bpp, &n, 1, 1, CV_RGB(255,255,0), 1, CV_AA, 0 );
+    cvPolyLine( img, &bpp, &n, 1, 1, CV_RGB(255,255,0), 1, cv::LINE_AA, 0 );
     cvShowImage( "test", img );
     cvWaitKey();
     cvReleaseImage(&img);
@@ -857,7 +857,7 @@ int CV_MinTriangleTest::validate_test_results( int test_case_idx )
         cvCircle(img,cvPoint(cvRound(p[i].x*a+b),cvRound(p[i].y*c+d)), 3, CV_RGB(0,255,0), -1 );
     for( i = 0; i < n; i++ )
         bp[i] = cvPoint(cvRound(triangle[i].x*a+b),cvRound(triangle[i].y*c+d));
-    cvPolyLine( img, &bpp, &n, 1, 1, CV_RGB(255,255,0), 1, CV_AA, 0 );
+    cvPolyLine( img, &bpp, &n, 1, 1, CV_RGB(255,255,0), 1, cv::LINE_AA, 0 );
     cvShowImage( "test", img );
     cvWaitKey();
     cvReleaseImage(&img);
@@ -1956,12 +1956,6 @@ int CV_ContourMomentsTest::validate_test_results( int test_case_idx )
 
     if( code < 0 )
     {
-#if 0
-        cvCmpS( img, 0, img, CV_CMP_GT );
-        cvNamedWindow( "test", 1 );
-        cvShowImage( "test", img );
-        cvWaitKey();
-#endif
         ts->set_failed_test_info( code );
     }
 
diff --git a/modules/imgproc/test/test_cornersubpix.cpp b/modules/imgproc/test/test_cornersubpix.cpp
new file mode 100644
index 000000000000..86484d248255
--- /dev/null
+++ b/modules/imgproc/test/test_cornersubpix.cpp
@@ -0,0 +1,68 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2015-2023, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+TEST(Imgproc_CornerSubPix, out_of_image_corners)
+{
+    const uint8_t image_pixels[] = {
+        0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 1,
+        0, 0, 0, 0, 0, 0, 2,
+        0, 0, 0, 0, 0, 0, 3};
+
+    cv::Mat image(cv::Size(7, 7), CV_8UC1, (void*)image_pixels, cv::Mat::AUTO_STEP);
+    std::vector<cv::Point2f> corners = {cv::Point2f(5.25, 6.5)};
+    cv::Size win(1, 1);
+    cv::Size zeroZone(-1, -1);
+    cv::TermCriteria criteria;
+    cv::cornerSubPix(image, corners, win, zeroZone, criteria);
+
+    ASSERT_EQ(corners.size(), 1u);
+    ASSERT_TRUE(Rect(0, 0, image.cols, image.rows).contains(corners.front()));
+}
+
+}} // namespace
diff --git a/modules/imgproc/test/test_cvtyuv.cpp b/modules/imgproc/test/test_cvtyuv.cpp
index cb49baab0a46..7114ef035d7c 100644
--- a/modules/imgproc/test/test_cvtyuv.cpp
+++ b/modules/imgproc/test/test_cvtyuv.cpp
@@ -159,6 +159,42 @@ class I420Writer: public YUV420pWriter
     }
 };
 
+class YUV422Writer: public YUVwriter
+{
+    int channels() { return 2; }
+    Size size(Size imgSize) { return Size(imgSize.width, imgSize.height); }
+};
+
+class UYVYWriter: public YUV422Writer
+{
+    virtual void write(Mat& yuv, int row, int col, const YUV& val)
+    {
+        yuv.ptr<Vec2b>(row)[col][1] = val[0];
+        yuv.ptr<Vec2b>(row)[(col/2)*2][0] = val[1];
+        yuv.ptr<Vec2b>(row)[(col/2)*2 + 1][0] = val[2];
+    }
+};
+
+class YUY2Writer: public YUV422Writer
+{
+    virtual void write(Mat& yuv, int row, int col, const YUV& val)
+    {
+        yuv.ptr<Vec2b>(row)[col][0] = val[0];
+        yuv.ptr<Vec2b>(row)[(col/2)*2][1] = val[1];
+        yuv.ptr<Vec2b>(row)[(col/2)*2 + 1][1] = val[2];
+    }
+};
+
+class YVYUWriter: public YUV422Writer
+{
+    virtual void write(Mat& yuv, int row, int col, const YUV& val)
+    {
+        yuv.ptr<Vec2b>(row)[col][0] = val[0];
+        yuv.ptr<Vec2b>(row)[(col/2)*2 + 1][1] = val[1];
+        yuv.ptr<Vec2b>(row)[(col/2)*2][1] = val[2];
+    }
+};
+
 class YUV420Reader: public YUVreader
 {
     int channels() { return 1; }
@@ -357,6 +393,36 @@ class RGB2YUV_Converter
     }
 };
 
+class RGB2YUV422_Converter
+{
+public:
+    YUV convert(RGB rgb1, RGB rgb2, int idx)
+    {
+        int r1 = rgb1[0];
+        int g1 = rgb1[1];
+        int b1 = rgb1[2];
+
+        int r2 = rgb2[0];
+        int g2 = rgb2[1];
+        int b2 = rgb2[2];
+
+        // Coefficients below based on ITU.BT-601, ISBN 1-878707-09-4 (https://fourcc.org/fccyvrgb.php)
+        // The conversion coefficients for RGB to YUV422 are based on the ones for RGB to YUV.
+        // For both Y components, the coefficients are applied as given in the link to each input RGB pixel
+        // separately. For U and V, they are reduced by half to account for two RGB pixels contributing
+        // to the same U and V values. In other words, the U and V contributions from the two RGB pixels
+        // are averaged. The integer versions are obtained by multiplying the float versions by 16384
+        // and rounding to the nearest integer.
+
+        uchar y1 = saturate_cast<uchar>((int)( 0.257f*r1 + 0.504f*g1 + 0.098f*b1 + 16));
+        uchar y2 = saturate_cast<uchar>((int)( 0.257f*r2 + 0.504f*g2 + 0.098f*b2 + 16));
+        uchar u = saturate_cast<uchar>((int)(-0.074f*(r1+r2) - 0.1455f*(g1+g2) + 0.2195f*(b1+b2) + 128));
+        uchar v = saturate_cast<uchar>((int)( 0.2195f*(r1+r2) - 0.184f*(g1+g2) - 0.0355f*(b1+b2) + 128));
+
+        return YUV((idx==0)?y1:y2, u, v);
+    }
+};
+
 YUVreader* YUVreader::getReader(int code)
 {
     switch(code)
@@ -421,15 +487,27 @@ RGBreader* RGBreader::getReader(int code)
     {
     case COLOR_RGB2YUV_YV12:
     case COLOR_RGB2YUV_I420:
+    case COLOR_RGB2YUV_UYVY:
+    case COLOR_RGB2YUV_YUY2:
+    case COLOR_RGB2YUV_YVYU:
         return new RGB888Reader();
     case COLOR_BGR2YUV_YV12:
     case COLOR_BGR2YUV_I420:
+    case COLOR_BGR2YUV_UYVY:
+    case COLOR_BGR2YUV_YUY2:
+    case COLOR_BGR2YUV_YVYU:
         return new BGR888Reader();
     case COLOR_RGBA2YUV_I420:
     case COLOR_RGBA2YUV_YV12:
+    case COLOR_RGBA2YUV_UYVY:
+    case COLOR_RGBA2YUV_YUY2:
+    case COLOR_RGBA2YUV_YVYU:
         return new RGBA8888Reader();
     case COLOR_BGRA2YUV_YV12:
     case COLOR_BGRA2YUV_I420:
+    case COLOR_BGRA2YUV_UYVY:
+    case COLOR_BGRA2YUV_YUY2:
+    case COLOR_BGRA2YUV_YVYU:
         return new BGRA8888Reader();
     default:
         return 0;
@@ -505,6 +583,21 @@ YUVwriter* YUVwriter::getWriter(int code)
     case COLOR_RGBA2YUV_YV12:
     case COLOR_BGRA2YUV_YV12:
         return new YV12Writer();
+    case COLOR_RGB2YUV_UYVY:
+    case COLOR_BGR2YUV_UYVY:
+    case COLOR_RGBA2YUV_UYVY:
+    case COLOR_BGRA2YUV_UYVY:
+        return new UYVYWriter();
+    case COLOR_RGB2YUV_YUY2:
+    case COLOR_BGR2YUV_YUY2:
+    case COLOR_RGBA2YUV_YUY2:
+    case COLOR_BGRA2YUV_YUY2:
+        return new YUY2Writer();
+    case COLOR_RGB2YUV_YVYU:
+    case COLOR_BGR2YUV_YVYU:
+    case COLOR_RGBA2YUV_YVYU:
+    case COLOR_BGRA2YUV_YVYU:
+        return new YVYUWriter();
     case COLOR_RGB2YUV_I420:
     case COLOR_BGR2YUV_I420:
     case COLOR_RGBA2YUV_I420:
@@ -545,6 +638,21 @@ void referenceRGB2YUV(const Mat& rgb, Mat& yuv, RGBreader* rgbReader, YUVwriter*
             yuvWriter->write(yuv, row, col, cvt.convert(rgbReader->read(rgb, row, col)));
 }
 
+template<class convertor>
+void referenceRGB2YUV422(const Mat& rgb, Mat& yuv, RGBreader* rgbReader, YUVwriter* yuvWriter)
+{
+    convertor cvt;
+
+    for(int row = 0; row < rgb.rows; ++row)
+    {
+            for(int col = 0; col < rgb.cols; col+=2)
+            {
+                yuvWriter->write(yuv, row, col, cvt.convert(rgbReader->read(rgb, row, col), rgbReader->read(rgb, row, col+1), 0));
+                yuvWriter->write(yuv, row, col+1, cvt.convert(rgbReader->read(rgb, row, col), rgbReader->read(rgb, row, col+1), 1));
+            }
+    }
+}
+
 struct ConversionYUV
 {
     explicit ConversionYUV( const int code )
@@ -611,6 +719,28 @@ struct ConversionYUV
     GRAYwriter* grayWriter_;
 };
 
+bool is_rgb2yuv422(int code)
+{
+    switch (code)
+    {
+        case COLOR_RGB2YUV_UYVY:
+        case COLOR_BGR2YUV_UYVY:
+        case COLOR_RGBA2YUV_UYVY:
+        case COLOR_BGRA2YUV_UYVY:
+        case COLOR_RGB2YUV_YUY2:
+        case COLOR_BGR2YUV_YUY2:
+        case COLOR_RGBA2YUV_YUY2:
+        case COLOR_BGRA2YUV_YUY2:
+        case COLOR_RGB2YUV_YVYU:
+        case COLOR_BGR2YUV_YVYU:
+        case COLOR_RGBA2YUV_YVYU:
+        case COLOR_BGRA2YUV_YVYU:
+            return true;
+        default:
+            return false;
+    }
+}
+
 CV_ENUM(YUVCVTS, COLOR_YUV2RGB_NV12, COLOR_YUV2BGR_NV12, COLOR_YUV2RGB_NV21, COLOR_YUV2BGR_NV21,
                  COLOR_YUV2RGBA_NV12, COLOR_YUV2BGRA_NV12, COLOR_YUV2RGBA_NV21, COLOR_YUV2BGRA_NV21,
                  COLOR_YUV2RGB_YV12, COLOR_YUV2BGR_YV12, COLOR_YUV2RGB_IYUV, COLOR_YUV2BGR_IYUV,
@@ -620,13 +750,18 @@ CV_ENUM(YUVCVTS, COLOR_YUV2RGB_NV12, COLOR_YUV2BGR_NV12, COLOR_YUV2RGB_NV21, COL
                  COLOR_YUV2RGBA_YUY2, COLOR_YUV2BGRA_YUY2, COLOR_YUV2RGBA_YVYU, COLOR_YUV2BGRA_YVYU,
                  COLOR_YUV2GRAY_420, COLOR_YUV2GRAY_UYVY, COLOR_YUV2GRAY_YUY2,
                  COLOR_YUV2BGR, COLOR_YUV2RGB, COLOR_RGB2YUV_YV12, COLOR_BGR2YUV_YV12, COLOR_RGBA2YUV_YV12,
-                 COLOR_BGRA2YUV_YV12, COLOR_RGB2YUV_I420, COLOR_BGR2YUV_I420, COLOR_RGBA2YUV_I420, COLOR_BGRA2YUV_I420)
+                 COLOR_BGRA2YUV_YV12, COLOR_RGB2YUV_I420, COLOR_BGR2YUV_I420, COLOR_RGBA2YUV_I420, COLOR_BGRA2YUV_I420,
+                 COLOR_RGB2YUV_UYVY,  COLOR_BGR2YUV_UYVY,  COLOR_RGBA2YUV_UYVY, COLOR_BGRA2YUV_UYVY,
+                 COLOR_RGB2YUV_YUY2,  COLOR_BGR2YUV_YUY2,  COLOR_RGB2YUV_YVYU,  COLOR_BGR2YUV_YVYU,
+                 COLOR_RGBA2YUV_YUY2, COLOR_BGRA2YUV_YUY2, COLOR_RGBA2YUV_YVYU, COLOR_BGRA2YUV_YVYU)
 
 typedef ::testing::TestWithParam<YUVCVTS> Imgproc_ColorYUV;
 
 TEST_P(Imgproc_ColorYUV, accuracy)
 {
     int code = GetParam();
+    bool yuv422 = is_rgb2yuv422(code);
+
     RNG& random = theRNG();
 
     ConversionYUV cvt(code);
@@ -654,7 +789,12 @@ TEST_P(Imgproc_ColorYUV, accuracy)
         else if(cvt.grayWriter_)
             referenceYUV2GRAY<YUV2GRAY_Converter>(src, gold, cvt.yuvReader_, cvt.grayWriter_);
         else if(cvt.yuvWriter_)
-            referenceRGB2YUV<RGB2YUV_Converter>  (src, gold, cvt.rgbReader_, cvt.yuvWriter_);
+        {
+            if(!yuv422)
+                referenceRGB2YUV<RGB2YUV_Converter>  (src, gold, cvt.rgbReader_, cvt.yuvWriter_);
+            else
+                referenceRGB2YUV422<RGB2YUV422_Converter>  (src, gold, cvt.rgbReader_, cvt.yuvWriter_);
+        }
 
         cv::cvtColor(src, dst, code, -1);
 
@@ -665,6 +805,8 @@ TEST_P(Imgproc_ColorYUV, accuracy)
 TEST_P(Imgproc_ColorYUV, roi_accuracy)
 {
     int code = GetParam();
+    bool yuv422 = is_rgb2yuv422(code);
+
     RNG& random = theRNG();
 
     ConversionYUV cvt(code);
@@ -701,7 +843,12 @@ TEST_P(Imgproc_ColorYUV, roi_accuracy)
         else if(cvt.grayWriter_)
             referenceYUV2GRAY<YUV2GRAY_Converter>(src, gold, cvt.yuvReader_, cvt.grayWriter_);
         else if(cvt.yuvWriter_)
-            referenceRGB2YUV<RGB2YUV_Converter>  (src, gold, cvt.rgbReader_, cvt.yuvWriter_);
+        {
+            if(!yuv422)
+                referenceRGB2YUV<RGB2YUV_Converter>  (src, gold, cvt.rgbReader_, cvt.yuvWriter_);
+            else
+                referenceRGB2YUV422<RGB2YUV422_Converter>  (src, gold, cvt.rgbReader_, cvt.yuvWriter_);
+        }
 
         cv::cvtColor(src, dst, code, -1);
 
@@ -722,7 +869,11 @@ INSTANTIATE_TEST_CASE_P(cvt422, Imgproc_ColorYUV,
     ::testing::Values((int)COLOR_YUV2RGB_UYVY, (int)COLOR_YUV2BGR_UYVY, (int)COLOR_YUV2RGBA_UYVY, (int)COLOR_YUV2BGRA_UYVY,
                       (int)COLOR_YUV2RGB_YUY2, (int)COLOR_YUV2BGR_YUY2, (int)COLOR_YUV2RGB_YVYU, (int)COLOR_YUV2BGR_YVYU,
                       (int)COLOR_YUV2RGBA_YUY2, (int)COLOR_YUV2BGRA_YUY2, (int)COLOR_YUV2RGBA_YVYU, (int)COLOR_YUV2BGRA_YVYU,
-                      (int)COLOR_YUV2GRAY_UYVY, (int)COLOR_YUV2GRAY_YUY2));
+                      (int)COLOR_YUV2GRAY_UYVY, (int)COLOR_YUV2GRAY_YUY2,
+                      (int)COLOR_RGB2YUV_UYVY,  (int)COLOR_BGR2YUV_UYVY,  (int)COLOR_RGBA2YUV_UYVY, (int)COLOR_BGRA2YUV_UYVY,
+                      (int)COLOR_RGB2YUV_YUY2,  (int)COLOR_BGR2YUV_YUY2,  (int)COLOR_RGB2YUV_YVYU,  (int)COLOR_BGR2YUV_YVYU,
+                      (int)COLOR_RGBA2YUV_YUY2, (int)COLOR_BGRA2YUV_YUY2, (int)COLOR_RGBA2YUV_YVYU, (int)COLOR_BGRA2YUV_YVYU,
+                      (int)COLOR_RGB2YUV_YUY2));
 
 }
 
diff --git a/modules/imgproc/test/test_distancetransform.cpp b/modules/imgproc/test/test_distancetransform.cpp
index 742595631abd..bf272cd224f6 100644
--- a/modules/imgproc/test/test_distancetransform.cpp
+++ b/modules/imgproc/test/test_distancetransform.cpp
@@ -40,6 +40,7 @@
 //M*/
 
 #include "test_precomp.hpp"
+#include <numeric>
 
 namespace opencv_test { namespace {
 
@@ -344,4 +345,75 @@ TEST(Imgproc_DistanceTransform, large_square_22732)
     EXPECT_EQ(0, nerrs) << "reference distance map is different from computed one at " << nerrs << " pixels\n";
 }
 
+BIGDATA_TEST(Imgproc_DistanceTransform, issue_23895_3x3)
+{
+    Mat src = Mat::zeros(50000, 50000, CV_8U), dist;
+    distanceTransform(src.col(0), dist, DIST_L2, DIST_MASK_3);
+    int nz = countNonZero(dist);
+    EXPECT_EQ(nz, 0);
+}
+
+BIGDATA_TEST(Imgproc_DistanceTransform, issue_23895_5x5)
+{
+    Mat src = Mat::zeros(50000, 50000, CV_8U), dist;
+    distanceTransform(src.col(0), dist, DIST_L2, DIST_MASK_5);
+    int nz = countNonZero(dist);
+    EXPECT_EQ(nz, 0);
+}
+
+BIGDATA_TEST(Imgproc_DistanceTransform, issue_23895_5x5_labels)
+{
+    Mat src = Mat::zeros(50000, 50000, CV_8U), dist, labels;
+    distanceTransform(src.col(0), dist, labels, DIST_L2, DIST_MASK_5);
+    int nz = countNonZero(dist);
+    EXPECT_EQ(nz, 0);
+}
+
+TEST(Imgproc_DistanceTransform, max_distance_3x3)
+{
+    Mat src = Mat::ones(1, 70000, CV_8U), dist;
+    src.at<uint8_t>(0, 0) = 0;
+    distanceTransform(src, dist, DIST_L2, DIST_MASK_3);
+
+    double minVal, maxVal;
+    minMaxLoc(dist, &minVal, &maxVal);
+    EXPECT_GE(maxVal, 65533);
+}
+
+TEST(Imgproc_DistanceTransform, max_distance_5x5)
+{
+    Mat src = Mat::ones(1, 70000, CV_8U), dist;
+    src.at<uint8_t>(0, 0) = 0;
+    distanceTransform(src, dist, DIST_L2, DIST_MASK_5);
+
+    double minVal, maxVal;
+    minMaxLoc(dist, &minVal, &maxVal);
+    EXPECT_GE(maxVal, 65533);
+}
+
+TEST(Imgproc_DistanceTransform, max_distance_5x5_labels)
+{
+    Mat src = Mat::ones(1, 70000, CV_8U), dist, labels;
+    src.at<uint8_t>(0, 0) = 0;
+    distanceTransform(src, dist, labels, DIST_L2, DIST_MASK_5);
+
+    double minVal, maxVal;
+    minMaxLoc(dist, &minVal, &maxVal);
+    EXPECT_GE(maxVal, 65533);
+}
+
+TEST(Imgproc_DistanceTransform, precise_long_dist)
+{
+    static const int maxDist = 1 << 16;
+    Mat src = Mat::ones(1, 70000, CV_8U), dist;
+    src.at<uint8_t>(0, 0) = 0;
+    distanceTransform(src, dist, DIST_L2, DIST_MASK_PRECISE, CV_32F);
+
+    Mat expected(src.size(), CV_32F);
+    std::iota(expected.begin<float>(), expected.end<float>(), 0.f);
+    expected.colRange(maxDist, expected.cols).setTo(maxDist);
+
+    EXPECT_EQ(cv::norm(expected, dist, NORM_INF), 0);
+}
+
 }} // namespace
diff --git a/modules/imgproc/test/test_drawing.cpp b/modules/imgproc/test/test_drawing.cpp
index 8afd0e00728b..a6b125bcd1b8 100644
--- a/modules/imgproc/test/test_drawing.cpp
+++ b/modules/imgproc/test/test_drawing.cpp
@@ -742,7 +742,7 @@ TEST(Drawing, fillpoly_fully)
             t1.x = (t1.x + offset.x) << (xy_shift - shift);
             t1.y = (t1.y + delta) >> shift;
 
-            if (lineType < CV_AA)
+            if (lineType < cv::LINE_AA)
             {
                 t0.x = (t0.x + (xy_one >> 1)) >> xy_shift;
                 t1.x = (t1.x + (xy_one >> 1)) >> xy_shift;
@@ -831,7 +831,7 @@ PARAM_TEST_CASE(FillPolyFully, unsigned, unsigned, int, int, Point, cv::LineType
             t1.x = (t1.x + offset.x) << (xy_shift - shift);
             t1.y = (t1.y + delta) >> shift;
 
-            if (lineType < CV_AA)
+            if (lineType < cv::LINE_AA)
             {
                 t0.x = (t0.x + (xy_one >> 1)) >> xy_shift;
                 t1.x = (t1.x + (xy_one >> 1)) >> xy_shift;
@@ -913,4 +913,126 @@ INSTANTIATE_TEST_CASE_P(
     )
 );
 
+TEST(Drawing, circle_overflow)
+{
+    applyTestTag(CV_TEST_TAG_VERYLONG);
+    cv::Mat1b matrix = cv::Mat1b::zeros(600, 600);
+    cv::Scalar kBlue = cv::Scalar(0, 0, 255);
+    cv::circle(matrix, cv::Point(275, -2147483318), 2147483647, kBlue, 1, 8, 0);
+}
+
+TEST(Drawing, circle_memory_access)
+{
+    cv::Mat1b matrix = cv::Mat1b::zeros(10, 10);
+    cv::Scalar kBlue = cv::Scalar(0, 0, 255);
+    cv::circle(matrix, cv::Point(-1, -1), 0, kBlue, 2, 8, 16);
+}
+
+inline static Mat mosaic2x2(Mat &img)
+{
+    const Size sz = img.size();
+    Mat res(sz * 2, img.type(), Scalar::all(0));
+    img.copyTo(res(Rect(Point(0, 0), sz)));
+    img.copyTo(res(Rect(Point(0, sz.height), sz)));
+    img.copyTo(res(Rect(Point(sz.width, 0), sz)));
+    img.copyTo(res(Rect(Point(sz.width, sz.height), sz)));
+    return res;
+}
+
+TEST(Drawing, contours_filled)
+{
+    const Scalar white(255);
+    const Scalar black(0);
+    const Size sz(100, 100);
+
+    Mat img(sz, CV_8UC1, black);
+    rectangle(img, Point(20, 20), Point(80, 80), white, -1);
+    rectangle(img, Point(30, 30), Point(70, 70), black, -1);
+    rectangle(img, Point(40, 40), Point(60, 60), white, -1);
+    img = mosaic2x2(img);
+
+    Mat img1(sz, CV_8UC1, black);
+    rectangle(img1, Point(20, 20), Point(80, 80), white, -1);
+    img1 = mosaic2x2(img1);
+
+    Mat img2(sz, CV_8UC1, black);
+    rectangle(img2, Point(20, 20), Point(80, 80), white, -1);
+    rectangle(img2, Point(30, 30), Point(70, 70), black, -1);
+    img2 = mosaic2x2(img2);
+
+    Mat img3(sz, CV_8UC1, black);
+    rectangle(img3, Point(40, 40), Point(60, 60), white, -1);
+    img3 = mosaic2x2(img3);
+
+    // inverted contours - corners and left edge adjusted
+    Mat imgi(sz, CV_8UC1, black);
+    rectangle(imgi, Point(29, 29), Point(71, 71), white, -1);
+    rectangle(imgi, Point(41, 41), Point(59, 59), black, -1);
+    imgi.at<uchar>(Point(29, 29)) = 0;
+    imgi.at<uchar>(Point(29, 71)) = 0;
+    imgi = mosaic2x2(imgi);
+
+    vector<vector<Point>> contours;
+    vector<Vec4i> hierarchy;
+    findContours(img, contours, hierarchy, RETR_TREE, CHAIN_APPROX_NONE);
+    ASSERT_EQ(12u, contours.size());
+
+    // NOTE:
+    // assuming contour tree has following structure (idx = 0, 1, ...):
+    //   idx (top level)
+    //     - idx + 1
+    //         - idx + 2
+    //   idx + 3 (top level)
+    //     - idx + 4
+    //         - idx + 5
+    //   ...
+    const vector<int> top_contours {0, 3, 6, 9};
+    {
+        // all contours
+        Mat res(img.size(), CV_8UC1, Scalar::all(0));
+        drawContours(res, contours, -1, white, -1, cv::LINE_8, hierarchy);
+        EXPECT_LT(cvtest::norm(img, res, NORM_INF), 1);
+    }
+    {
+        // all contours
+        Mat res(img.size(), CV_8UC1, Scalar::all(0));
+        drawContours(res, contours, -1, white, -1, cv::LINE_8, hierarchy, 3);
+        EXPECT_LT(cvtest::norm(img, res, NORM_INF), 1);
+    }
+    {
+        // all contours
+        Mat res(img.size(), CV_8UC1, Scalar::all(0));
+        drawContours(res, contours, -1, white, -1, cv::LINE_8, hierarchy, 0);
+        EXPECT_LT(cvtest::norm(img, res, NORM_INF), 1);
+    }
+    {
+        // all external contours one by one
+        Mat res(img.size(), CV_8UC1, Scalar::all(0));
+        for (int idx : top_contours)
+            drawContours(res, contours, idx, white, -1, cv::LINE_8, hierarchy, 0);
+        EXPECT_LT(cvtest::norm(img1, res, NORM_INF), 1);
+    }
+    {
+        // all external contours + 1-level deep hole (one by one)
+        Mat res(img.size(), CV_8UC1, Scalar::all(0));
+        for (int idx : top_contours)
+            drawContours(res, contours, idx, white, -1, cv::LINE_8, hierarchy, 1);
+        EXPECT_LT(cvtest::norm(img2, res, NORM_INF), 1);
+    }
+    {
+        // 2-level deep contours
+        Mat res(img.size(), CV_8UC1, Scalar::all(0));
+        for (int idx : top_contours)
+            drawContours(res, contours, idx + 2, white, -1, cv::LINE_8, hierarchy);
+        EXPECT_LT(cvtest::norm(img3, res, NORM_INF), 1);
+    }
+    {
+        // holes become inverted here, LINE_8 -> LINE_4
+        Mat res(img.size(), CV_8UC1, Scalar::all(0));
+        for (int idx : top_contours)
+            drawContours(res, contours, idx + 1, white, -1, cv::LINE_4, hierarchy);
+        EXPECT_LT(cvtest::norm(imgi, res, NORM_INF), 1);
+    }
+}
+
 }} // namespace
diff --git a/modules/imgproc/test/test_emd.cpp b/modules/imgproc/test/test_emd.cpp
index 51f05f374e97..59cd8381a135 100644
--- a/modules/imgproc/test/test_emd.cpp
+++ b/modules/imgproc/test/test_emd.cpp
@@ -1,93 +1,250 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
 
+#include "opencv2/imgproc.hpp"
 #include "test_precomp.hpp"
 
+using namespace cv;
+using namespace std;
+
 namespace opencv_test { namespace {
 
-class CV_EMDTest : public cvtest::BaseTest
-{
-public:
-    CV_EMDTest();
-protected:
-    void run(int);
-};
+//==============================================================================
+// Utility
 
+template <typename T>
+inline T sqr(T val)
+{
+    return val * val;
+}
 
-CV_EMDTest::CV_EMDTest()
+inline static float calcEMD(Mat w1, Mat w2, Mat& flow, int dist, int dims)
 {
+    float mass1 = 0.f, mass2 = 0.f, work = 0.f;
+    for (int i = 0; i < flow.rows; ++i)
+    {
+        mass1 += w1.at<float>(i, 0);
+        for (int j = 0; j < flow.cols; ++j)
+        {
+            if (i == 0)
+                mass2 += w2.at<float>(j, 0);
+            float dist_ = 0.f;
+            switch (dist)
+            {
+                case DIST_L1:
+                {
+                    for (int k = 1; k <= dims; ++k)
+                    {
+                        dist_ += abs(w1.at<float>(i, k) - w2.at<float>(j, k));
+                    }
+                    break;
+                }
+                case DIST_L2:
+                {
+                    for (int k = 1; k <= dims; ++k)
+                    {
+                        dist_ += sqr(w1.at<float>(i, k) - w2.at<float>(j, k));
+                    }
+                    dist_ = sqrt(dist_);
+                    break;
+                }
+                case DIST_C:
+                {
+                    for (int k = 1; k <= dims; ++k)
+                    {
+                        const float val = abs(w1.at<float>(i, k) - w2.at<float>(j, k));
+                        if (val > dist_)
+                            dist_ = val;
+                    }
+                    break;
+                }
+            }
+            const float weight = flow.at<float>(i, j);
+            work += dist_ * weight;
+        }
+    }
+    return work / max(mass1, mass2);
 }
 
-void CV_EMDTest::run( int )
+//==============================================================================
+
+TEST(Imgproc_EMD, regression)
 {
-    int code = cvtest::TS::OK;
-    const double success_error_level = 1e-6;
-    #define M 10000
-    double emd0 = 2460./210;
-    static float cost[] =
+    // input data
+    const float M = 10000;
+    Matx<float, 4, 1> w1 {50, 60, 50, 50};
+    Matx<float, 5, 1> w2 {30, 20, 70, 30, 60};
+    Matx<float, 4, 5> cost {16, 16, 13, 22, 17, 14, 14, 13, 19, 15,
+                            19, 19, 20, 23, M,  M,  0,  M,  0,  0};
+
+    // expected results
+    const double emd0 = 2460. / 210;
+    Matx<float, 4, 5> flow0 {0, 0, 50, 0, 0, 0, 0, 20, 0, 40, 30, 20, 0, 0, 0, 0, 0, 0, 30, 20};
+
+    // basic call with cost
+    {
+        float emd = 0.f;
+        ASSERT_NO_THROW(emd = EMD(w1, w2, DIST_USER, cost));
+        EXPECT_NEAR(emd, emd0, 1e-6 * emd0);
+    }
+
+    // basic call with cost and flow output
     {
-        16, 16, 13, 22, 17,
-        14, 14, 13, 19, 15,
-        19, 19, 20, 23,  M,
-        M ,  0,  M,  0,  0
-    };
-    static float  w1[] = { 50, 60, 50, 50 },
-                  w2[] = { 30, 20, 70, 30, 60 };
-    Mat _w1(4, 1, CV_32F, w1);
-    Mat _w2(5, 1, CV_32F, w2);
-    Mat _cost(_w1.rows, _w2.rows, CV_32F, cost);
-
-    float emd = EMD( _w1, _w2, -1, _cost );
-    if( fabs( emd - emd0 ) > success_error_level*emd0 )
+        Mat flow;
+        float emd = 0.f;
+        ASSERT_NO_THROW(emd = EMD(w1, w2, DIST_USER, cost, nullptr, flow));
+        EXPECT_NEAR(emd, emd0, 1e-6 * emd0);
+        EXPECT_MAT_NEAR(Mat(flow0), flow, 1e-6);
+    }
+    // no cost and DIST_USER - error
     {
-        ts->printf( cvtest::TS::LOG,
-            "The computed distance is %.2f, while it should be %.2f\n", emd, emd0 );
-        code = cvtest::TS::FAIL_BAD_ACCURACY;
+        Mat flow;
+        EXPECT_THROW(EMD(w1, w2, DIST_USER, noArray(), nullptr, flow), cv::Exception);
+        EXPECT_THROW(EMD(w1, w2, DIST_USER), cv::Exception);
     }
+}
+
+TEST(Imgproc_EMD, distance_types)
+{
+    // 1D (sum = 210)
+    Matx<float, 4, 2> w1 {50, 1, 60, 2, 50, 3, 50, 4};
+    Matx<float, 5, 2> w2 {30, 1, 20, 2, 70, 3, 30, 4, 60, 5};
+
+    // 2D (sum = 210)
+    Matx<float, 4, 3> w3 {50, 0, 0, 60, 0, 1, 50, 1, 0, 50, 1, 1};
+    Matx<float, 5, 3> w4 {20, 0, 1, 70, 1, 0, 30, 1, 1, 60, 2, 2, 30, 3, 3};
+
+    // basic call with all distance types
+    {
+        const vector<DistanceTypes> good_types {DIST_L1, DIST_L2, DIST_C};
+        for (const auto& dt : good_types)
+        {
+            SCOPED_TRACE(cv::format("dt=%d", dt));
+            float emd = 0.f;
+            Mat flow;
+            // 1D
+            {
+                ASSERT_NO_THROW(emd = EMD(w1, w2, dt, noArray(), nullptr, flow));
+                const float emd0 = calcEMD(Mat(w1), Mat(w2), flow, dt, 1);
+                EXPECT_NEAR(emd0, emd, 1e-6);
+            }
+            // 2D
+            {
+                ASSERT_NO_THROW(emd = EMD(w3, w4, dt, noArray(), nullptr, flow));
+                const float emd0 = calcEMD(Mat(w3), Mat(w4), flow, dt, 2);
+                EXPECT_NEAR(emd0, emd, 1e-6);
+            }
+        }
+    }
+}
+
+typedef testing::TestWithParam<int> Imgproc_EMD_dist;
+
+TEST_P(Imgproc_EMD_dist, random_flow_verify)
+{
+    const int dist = GetParam();
+    for (size_t iter = 0; iter < 100; ++iter)
+    {
+        SCOPED_TRACE(cv::format("iter=%zu", iter));
+        RNG& rng = TS::ptr()->get_rng();
+        const int dims = rng.uniform(1, 10);
+        Mat w1(rng.uniform(1, 10), dims + 1, CV_32FC1);
+        Mat w2(rng.uniform(1, 10), dims + 1, CV_32FC1);
 
-    if( code < 0 )
-        ts->set_failed_test_info( code );
+        // weights > 0
+        {
+            Mat w1_weights = w1.col(0);
+            Mat w2_weights = w2.col(0);
+            cvtest::randUni(rng, w1_weights, 0, 100);
+            cvtest::randUni(rng, w2_weights, 0, 100);
+        }
+
+        // coord
+        {
+            Mat w1_coord = w1.colRange(1, dims + 1);
+            Mat w2_coord = w2.colRange(1, dims + 1);
+            cvtest::randUni(rng, w1_coord, -10, +10);
+            cvtest::randUni(rng, w2_coord, -10, +10);
+        }
+
+        float emd1 = 0.f, emd2 = 0.f;
+        const float eps = 1e-5f;
+        Mat flow;
+        {
+            ASSERT_NO_THROW(emd1 = EMD(w1, w2, dist, noArray(), nullptr, flow));
+            const float emd0 = calcEMD(w1, w2, flow, dist, dims);
+            EXPECT_NEAR(emd0, emd1, eps);
+        }
+        {
+            ASSERT_NO_THROW(emd2 = EMD(w2, w1, dist, noArray(), nullptr, flow));
+            const float emd0 = calcEMD(w2, w1, flow, dist, dims);
+            EXPECT_NEAR(emd0, emd2, eps);
+        }
+        EXPECT_NEAR(emd1, emd2, eps);
+    }
 }
 
-TEST(Imgproc_EMD, regression) { CV_EMDTest test; test.safe_run(); }
+INSTANTIATE_TEST_CASE_P(, Imgproc_EMD_dist, testing::Values(DIST_L1, DIST_L2, DIST_C));
+
+
+TEST(Imgproc_EMD, invalid)
+{
+    Matx<float, 4, 2> w1 {50, 1, 60, 2, 50, 3, 50, 4};
+    Matx<float, 5, 2> w2 {30, 1, 20, 2, 70, 3, 30, 4, 60, 5};
+
+    // empty signature
+    {
+        Mat empty;
+        EXPECT_THROW(EMD(empty, w2, DIST_USER), cv::Exception);
+        EXPECT_THROW(EMD(w1, empty, DIST_USER), cv::Exception);
+    }
+
+    // zero total weight, negative weight
+    {
+        Matx<float, 3, 1> wz {0, 0, 0};
+        Matx<float, 3, 2> wz1 {0, 1, 0, 2, 0, 3};
+        Matx<float, 3, 1> wn {0, 3, -2};
+        Matx<float, 3, 2> wn1 {0, 1, 3, 2, -2, 3};
+        EXPECT_THROW(EMD(wz, w2, DIST_USER), cv::Exception);
+        EXPECT_THROW(EMD(wz1, w2, DIST_USER), cv::Exception);
+        EXPECT_THROW(EMD(wn, w2, DIST_USER), cv::Exception);
+        EXPECT_THROW(EMD(wn1, w2, DIST_USER), cv::Exception);
+    }
+
+    // user distance type, but no cost matrix provided or is wrong
+    {
+        Mat cost(3, 3, CV_32FC1, Scalar::all(0)), cost8u(4, 5, CV_8UC1, Scalar::all(0)), empty;
+        EXPECT_THROW(EMD(w1, w2, DIST_USER, noArray()), cv::Exception);
+        EXPECT_THROW(EMD(w1, w2, DIST_USER, empty), cv::Exception);
+        EXPECT_THROW(EMD(w1, w2, DIST_USER, cost8u), cv::Exception);
+        EXPECT_THROW(EMD(w1, w2, DIST_USER, cost), cv::Exception);
+    }
+
+    // lower_bound is set together with cost
+    {
+        Mat cost(4, 5, CV_32FC1, Scalar::all(0));
+        float bound = 0.f;
+        EXPECT_THROW(EMD(w1, w2, DIST_USER, cost, &bound), cv::Exception);
+    }
+
+    // zero dimensions with non-user distance type
+    const vector<DistanceTypes> good_types {DIST_L1, DIST_L2, DIST_C};
+    for (const auto& dt : good_types)
+    {
+        SCOPED_TRACE(cv::format("dt=%d", dt));
+        Matx<float, 4, 1> w01 {20, 30, 40, 50};
+        Matx<float, 5, 1> w02 {20, 30, 40, 50, 10};
+        EXPECT_THROW(EMD(w01, w02, dt), cv::Exception);
+    }
+
+    // wrong distance type
+    const vector<DistanceTypes> bad_types {DIST_L12, DIST_FAIR, DIST_WELSCH, DIST_HUBER};
+    for (const auto& dt : bad_types)
+    {
+        SCOPED_TRACE(cv::format("dt=%d", dt));
+        EXPECT_THROW(EMD(w1, w2, dt), cv::Exception);
+    }
+}
 
-}} // namespace
-/* End of file. */
+}}  // namespace opencv_test
diff --git a/modules/imgproc/test/test_filter.cpp b/modules/imgproc/test/test_filter.cpp
index 02d5e232a2dd..a6e45709f379 100644
--- a/modules/imgproc/test/test_filter.cpp
+++ b/modules/imgproc/test/test_filter.cpp
@@ -307,7 +307,7 @@ void CV_MorphologyBaseTest::prepare_to_validation( int /*test_case_idx*/ )
             cvtest::add( dst, 1, src, -1, Scalar::all(0), dst, dst.type() );
         }
         else
-            CV_Error( CV_StsBadArg, "Unknown operation" );
+            CV_Error( cv::Error::StsBadArg, "Unknown operation" );
     }
 
     cvReleaseStructuringElement( &element );
@@ -772,9 +772,8 @@ void CV_BlurTest::get_test_array_types_and_sizes( int test_case_idx,
 
 void CV_BlurTest::run_func()
 {
-    cvSmooth( inplace ? test_array[OUTPUT][0] : test_array[INPUT][0],
-              test_array[OUTPUT][0], normalize ? CV_BLUR : CV_BLUR_NO_SCALE,
-              aperture_size.width, aperture_size.height );
+    cv::boxFilter(inplace ? test_mat[OUTPUT][0] : test_mat[INPUT][0], test_mat[OUTPUT][0],
+                  test_mat[OUTPUT][0].type(), aperture_size, cv::Point(-1, -1), normalize, cv::BORDER_REPLICATE);
 }
 
 
@@ -2380,4 +2379,176 @@ TEST(Imgproc, morphologyEx_small_input_22893)
     ASSERT_EQ(0, cvtest::norm(result, gold, NORM_INF));
 }
 
+TEST(Imgproc_sepFilter2D, identity)
+{
+    std::vector<uint8_t> kernelX{0, 0, 0, 1, 0, 0, 0};
+    std::vector<uint8_t> kernelY{0, 0, 1, 0, 0};
+
+    const string input_path = cvtest::findDataFile("../cv/shared/baboon.png");
+    Mat input = imread(input_path, IMREAD_GRAYSCALE);
+    Mat result;
+
+    cv::sepFilter2D(input, result, input.depth(), kernelX, kernelY);
+
+    EXPECT_EQ(0, cv::norm(result, input, NORM_INF));
+}
+
+TEST(Imgproc_sepFilter2D, shift)
+{
+    std::vector<float> kernelX{1, 0, 0};
+    std::vector<float> kernelY{0, 0, 1};
+
+    const string input_path = cvtest::findDataFile("../cv/shared/baboon.png");
+    Mat input = imread(input_path, IMREAD_GRAYSCALE);
+    Mat result;
+
+    cv::sepFilter2D(input, result, input.depth(), kernelX, kernelY);
+
+    int W = input.cols;
+    int H = input.rows;
+    Mat inputCrop = input(Range(1, H), Range(0, W - 1));
+    Mat resultCrop = result(Range(0, H - 1), Range(1, W));
+    EXPECT_EQ(0, cv::norm(resultCrop, inputCrop, NORM_INF));
+
+    // Checking borders. Should be BORDER_REFLECT_101
+
+    inputCrop = input(Range(H - 2, H - 1), Range(0, W - 1));
+    resultCrop = result(Range(H - 1, H), Range(1, W));
+    EXPECT_EQ(0, cv::norm(resultCrop, inputCrop, NORM_INF));
+
+    inputCrop = input(Range(1, H), Range(1, 2));
+    resultCrop = result(Range(0, H - 1), Range(0, 1));
+    EXPECT_EQ(0, cv::norm(resultCrop, inputCrop, NORM_INF));
+
+    inputCrop = input(Range(H - 2, H - 1), Range(1, 2));
+    resultCrop = result(Range(H - 1, H), Range(0, 1));
+    EXPECT_EQ(0, cv::norm(resultCrop, inputCrop, NORM_INF));
+}
+
+TEST(Imgproc_sepFilter2D, zeroPadding)
+{
+    std::vector<int> kernelX{1, 0, 0};
+    std::vector<int> kernelY{0, 0, 1};
+    Point anchor(-1, -1);
+    double delta = 0;
+
+    const string input_path = cvtest::findDataFile("../cv/shared/baboon.png");
+    Mat input = imread(input_path, IMREAD_GRAYSCALE);
+    Mat result;
+
+    cv::sepFilter2D(input, result, input.depth(), kernelX, kernelY, anchor, delta, BORDER_CONSTANT);
+
+    int W = input.cols;
+    int H = input.rows;
+    Mat inputCrop = input(Range(1, H), Range(0, W - 1));
+    Mat resultCrop = result(Range(0, H - 1), Range(1, W));
+    EXPECT_EQ(0, cv::norm(resultCrop, inputCrop, NORM_INF));
+
+    // Checking borders
+
+    resultCrop = result(Range(H - 1, H), Range(0, W));
+    EXPECT_EQ(0, cv::norm(resultCrop, NORM_INF));
+
+    resultCrop = result(Range(0, H), Range(0, 1));
+    EXPECT_EQ(0, cv::norm(resultCrop, NORM_INF));
+}
+
+TEST(Imgproc_sepFilter2D, anchor)
+{
+    std::vector<float> kernelX{0, 1, 0};
+    std::vector<float> kernelY{0, 1, 0};
+    Point anchor(2, 0);
+
+    const string input_path = cvtest::findDataFile("../cv/shared/baboon.png");
+    Mat input = imread(input_path, IMREAD_GRAYSCALE);
+    Mat result;
+
+    cv::sepFilter2D(input, result, input.depth(), kernelX, kernelY, anchor);
+
+    int W = input.cols;
+    int H = input.rows;
+    Mat inputCrop = input(Range(1, H), Range(0, W - 1));
+    Mat resultCrop = result(Range(0, H - 1), Range(1, W));
+    EXPECT_EQ(0, cv::norm(resultCrop, inputCrop, NORM_INF));
+
+    // Checking borders. Should be BORDER_REFLECT_101
+
+    inputCrop = input(Range(H - 2, H - 1), Range(0, W - 1));
+    resultCrop = result(Range(H - 1, H), Range(1, W));
+    EXPECT_EQ(0, cv::norm(resultCrop, inputCrop, NORM_INF));
+
+    inputCrop = input(Range(1, H), Range(1, 2));
+    resultCrop = result(Range(0, H - 1), Range(0, 1));
+    EXPECT_EQ(0, cv::norm(resultCrop, inputCrop, NORM_INF));
+
+    inputCrop = input(Range(H - 2, H - 1), Range(1, 2));
+    resultCrop = result(Range(H - 1, H), Range(0, 1));
+    EXPECT_EQ(0, cv::norm(resultCrop, inputCrop, NORM_INF));
+}
+
+TEST(Imgproc_sepFilter2D, delta)
+{
+    std::vector<float> kernelX{0, 0.5, 0};
+    std::vector<float> kernelY{0, 1, 0};
+    Point anchor(1, 1);
+    double delta = 5;
+
+    const string input_path = cvtest::findDataFile("../cv/shared/baboon.png");
+    Mat input = imread(input_path, IMREAD_GRAYSCALE);
+    Mat result;
+
+    cv::sepFilter2D(input, result, input.depth(), kernelX, kernelY, anchor, delta);
+
+    Mat gt = input / 2 + delta;
+    EXPECT_EQ(0, cv::norm(result, gt, NORM_INF));
+}
+
+typedef testing::TestWithParam<int> Imgproc_sepFilter2D_outTypes;
+TEST_P(Imgproc_sepFilter2D_outTypes, simple)
+{
+    int outputType = GetParam();
+    std::vector<float> kernelX{0, 0.5, 0};
+    std::vector<float> kernelY{0, 0.5, 0};
+    Point anchor(1, 1);
+    double delta = 5;
+
+    const string input_path = cvtest::findDataFile("../cv/shared/baboon.png");
+    Mat input = imread(input_path, IMREAD_GRAYSCALE);
+    Mat result;
+
+    cv::sepFilter2D(input, result, outputType, kernelX, kernelY, anchor, delta);
+
+    input.convertTo(input, outputType);
+    Mat gt = input / 4 + delta;
+    EXPECT_EQ(0, cv::norm(result, gt, NORM_INF));
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Imgproc_sepFilter2D_outTypes,
+    testing::Values(CV_16S, CV_32F, CV_64F),
+);
+
+typedef testing::TestWithParam<int> Imgproc_sepFilter2D_types;
+TEST_P(Imgproc_sepFilter2D_types, simple)
+{
+    int outputType = GetParam();
+    std::vector<float> kernelX{0, 0.5, 0};
+    std::vector<float> kernelY{0, 0.5, 0};
+    Point anchor(1, 1);
+    double delta = 5;
+
+    const string input_path = cvtest::findDataFile("../cv/shared/baboon.png");
+    Mat input = imread(input_path, IMREAD_GRAYSCALE);
+    input.convertTo(input, outputType);
+    Mat result;
+
+    cv::sepFilter2D(input, result, outputType, kernelX, kernelY, anchor, delta);
+
+    Mat gt = input / 4 + delta;
+    EXPECT_EQ(0, cv::norm(result, gt, NORM_INF));
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Imgproc_sepFilter2D_types,
+    testing::Values(CV_16S, CV_32F, CV_64F),
+);
+
 }} // namespace
diff --git a/modules/imgproc/test/test_histograms.cpp b/modules/imgproc/test/test_histograms.cpp
index b57af774f2b4..efd045d31b61 100644
--- a/modules/imgproc/test/test_histograms.cpp
+++ b/modules/imgproc/test/test_histograms.cpp
@@ -1198,7 +1198,7 @@ void CV_CalcHistTest::run_func(void)
     }
 
     std::vector<cv::Mat> imagesv(cdims);
-    copy(images.begin(), images.begin() + cdims, imagesv.begin());
+    std::copy(images.begin(), images.begin() + cdims, imagesv.begin());
 
     Mat mask = images[CV_MAX_DIM];
     if( !CV_IS_SPARSE_HIST(hist[0]) )
@@ -1493,7 +1493,7 @@ void CV_CalcBackProjectTest::run_func(void)
     }
 
     std::vector<cv::Mat> imagesv(hdims);
-    copy(images.begin(), images.begin() + hdims, imagesv.begin());
+    std::copy(images.begin(), images.begin() + hdims, imagesv.begin());
 
     cv::Mat dst = images[CV_MAX_DIM+1];
 
diff --git a/modules/imgproc/test/test_houghlines.cpp b/modules/imgproc/test/test_houghlines.cpp
index 61b67d9873ba..2d784d7a7ac0 100644
--- a/modules/imgproc/test/test_houghlines.cpp
+++ b/modules/imgproc/test/test_houghlines.cpp
@@ -53,7 +53,7 @@ struct SimilarWith
     T value;
     float theta_eps;
     float rho_eps;
-    SimilarWith<T>(T val, float e, float r_e): value(val), theta_eps(e), rho_eps(r_e) { };
+    SimilarWith(T val, float e, float r_e): value(val), theta_eps(e), rho_eps(r_e) { }
     bool operator()(const T& other);
 };
 
diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp
index 68208caa041f..e8840d231b49 100644
--- a/modules/imgproc/test/test_imgwarp.cpp
+++ b/modules/imgproc/test/test_imgwarp.cpp
@@ -1311,6 +1311,73 @@ TEST(Imgproc_resize_area, regression_quarter_round)
     check_resize_area<uchar>(expected, actual, 0.5);
 }
 
+typedef tuple<int, int, int, int, bool> RemapRelativeParam;
+typedef testing::TestWithParam<RemapRelativeParam> Imgproc_RemapRelative;
+
+TEST_P(Imgproc_RemapRelative, validity)
+{
+    int srcType = CV_MAKE_TYPE(get<0>(GetParam()), get<1>(GetParam()));
+    int interpolation = get<2>(GetParam());
+    int borderType = get<3>(GetParam());
+    bool useFixedPoint = get<4>(GetParam());
+
+    const int nChannels = CV_MAT_CN(srcType);
+    const cv::Size size(127, 61);
+    cv::Mat data64FC1(1, size.area()*nChannels, CV_64FC1);
+    data64FC1.forEach<double>([&](double& pixel, const int* position) {pixel = static_cast<double>(position[1]);});
+
+    cv::Mat src;
+    data64FC1.reshape(nChannels, size.height).convertTo(src, srcType);
+
+    cv::Mat mapRelativeX32F(size, CV_32FC1);
+    mapRelativeX32F.setTo(cv::Scalar::all(-0.33));
+
+    cv::Mat mapRelativeY32F(size, CV_32FC1);
+    mapRelativeY32F.setTo(cv::Scalar::all(-0.33));
+
+    cv::Mat mapAbsoluteX32F = mapRelativeX32F.clone();
+    mapAbsoluteX32F.forEach<float>([&](float& pixel, const int* position) {
+        pixel += static_cast<float>(position[1]);
+        });
+
+    cv::Mat mapAbsoluteY32F = mapRelativeY32F.clone();
+    mapAbsoluteY32F.forEach<float>([&](float& pixel, const int* position) {
+        pixel += static_cast<float>(position[0]);
+        });
+
+    cv::Mat mapAbsoluteX16S;
+    cv::Mat mapAbsoluteY16S;
+    cv::Mat mapRelativeX16S;
+    cv::Mat mapRelativeY16S;
+    if (useFixedPoint)
+    {
+        const bool nninterpolation = (interpolation == cv::INTER_NEAREST) || (interpolation == cv::INTER_NEAREST_EXACT);
+        cv::convertMaps(mapAbsoluteX32F, mapAbsoluteY32F, mapAbsoluteX16S, mapAbsoluteY16S, CV_16SC2, nninterpolation);
+        cv::convertMaps(mapRelativeX32F, mapRelativeY32F, mapRelativeX16S, mapRelativeY16S, CV_16SC2, nninterpolation);
+    }
+
+    cv::Mat dstAbsolute;
+    cv::Mat dstRelative;
+    if (useFixedPoint)
+    {
+        cv::remap(src, dstAbsolute, mapAbsoluteX16S, mapAbsoluteY16S, interpolation, borderType);
+        cv::remap(src, dstRelative, mapRelativeX16S, mapRelativeY16S, interpolation | WARP_RELATIVE_MAP, borderType);
+    }
+    else
+    {
+        cv::remap(src, dstAbsolute, mapAbsoluteX32F, mapAbsoluteY32F, interpolation, borderType);
+        cv::remap(src, dstRelative, mapRelativeX32F, mapRelativeY32F, interpolation | WARP_RELATIVE_MAP, borderType);
+    }
+
+    EXPECT_EQ(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 0);
+};
+
+INSTANTIATE_TEST_CASE_P(ImgProc, Imgproc_RemapRelative, testing::Combine(
+    testing::Values(CV_8U, CV_16U, CV_32F, CV_64F),
+    testing::Values(1, 3, 4),
+    testing::Values((int)INTER_NEAREST, (int)INTER_LINEAR, (int)INTER_CUBIC, (int)INTER_LANCZOS4),
+    testing::Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_WRAP, (int)BORDER_REFLECT, (int)BORDER_REFLECT_101),
+    testing::Values(false, true)));
 
 //////////////////////////////////////////////////////////////////////////
 
@@ -1548,11 +1615,11 @@ TEST(Imgproc_linearPolar, identity)
     {
         linearPolar(src, dst,
             Point2f((N-1) * 0.5f, (N-1) * 0.5f), N * 0.5f,
-            CV_WARP_FILL_OUTLIERS | CV_INTER_LINEAR | CV_WARP_INVERSE_MAP);
+            cv::WARP_FILL_OUTLIERS | CV_INTER_LINEAR | cv::WARP_INVERSE_MAP);
 
         linearPolar(dst, src,
             Point2f((N-1) * 0.5f, (N-1) * 0.5f), N * 0.5f,
-            CV_WARP_FILL_OUTLIERS | CV_INTER_LINEAR);
+            cv::WARP_FILL_OUTLIERS | CV_INTER_LINEAR);
 
         double psnr = cvtest::PSNR(in(roi), src(roi));
         EXPECT_LE(25, psnr) << "iteration=" << i;
@@ -1589,11 +1656,11 @@ TEST(Imgproc_logPolar, identity)
     {
         logPolar(src, dst,
             Point2f((N-1) * 0.5f, (N-1) * 0.5f), M,
-            CV_WARP_FILL_OUTLIERS | CV_INTER_LINEAR | CV_WARP_INVERSE_MAP);
+            cv::WARP_FILL_OUTLIERS | CV_INTER_LINEAR | cv::WARP_INVERSE_MAP);
 
         logPolar(dst, src,
             Point2f((N-1) * 0.5f, (N-1) * 0.5f), M,
-            CV_WARP_FILL_OUTLIERS | CV_INTER_LINEAR);
+            cv::WARP_FILL_OUTLIERS | CV_INTER_LINEAR);
 
         double psnr = cvtest::PSNR(in(roi), src(roi));
         EXPECT_LE(25, psnr) << "iteration=" << i;
@@ -1625,11 +1692,11 @@ TEST(Imgproc_warpPolar, identity)
     Rect roi = Rect(0, 0, in.cols - ((N + 19) / 20), in.rows);
     Point2f center = Point2f((N - 1) * 0.5f, (N - 1) * 0.5f);
     double radius = N * 0.5;
-    int flags = CV_WARP_FILL_OUTLIERS | CV_INTER_LINEAR;
+    int flags = cv::WARP_FILL_OUTLIERS | CV_INTER_LINEAR;
     // test linearPolar
     for (int ki = 1; ki <= 5; ki++)
     {
-        warpPolar(src, dst, src.size(), center, radius, flags + WARP_POLAR_LINEAR + CV_WARP_INVERSE_MAP);
+        warpPolar(src, dst, src.size(), center, radius, flags + WARP_POLAR_LINEAR + cv::WARP_INVERSE_MAP);
         warpPolar(dst, src, src.size(), center, radius, flags + WARP_POLAR_LINEAR);
 
         double psnr = cv::PSNR(in(roi), src(roi));
@@ -1639,7 +1706,7 @@ TEST(Imgproc_warpPolar, identity)
     src = in.clone();
     for (int ki = 1; ki <= 5; ki++)
     {
-        warpPolar(src, dst, src.size(),center, radius, flags + WARP_POLAR_LOG + CV_WARP_INVERSE_MAP );
+        warpPolar(src, dst, src.size(),center, radius, flags + WARP_POLAR_LOG + cv::WARP_INVERSE_MAP );
         warpPolar(dst, src, src.size(),center, radius, flags + WARP_POLAR_LOG);
 
         double psnr = cv::PSNR(in(roi), src(roi));
@@ -1672,6 +1739,28 @@ TEST(Imgproc_Remap, issue_23562)
         remap(src, dst, mapx, mapy, INTER_LINEAR, BORDER_TRANSPARENT);
         ASSERT_EQ(0.0, cvtest::norm(ref, dst, NORM_INF)) << "channels=" << cn;
     }
+
+    mapx = Mat1f({3, 3}, {0, 1, 2, 0, 1, 2, 0, 1, 2});
+    mapy = Mat1f({3, 3}, {0, 0, 0, 1, 1, 1, 2, 2, 1.5});
+    for (int cn = 1; cn <= 4; ++cn) {
+        Mat src = cv::Mat(3, 3, CV_32FC(cn));
+        Mat dst = 10 * Mat::ones(3, 3, CV_32FC(cn));
+        for(int y = 0; y < 3; ++y) {
+            for(int x = 0; x < 3; ++x) {
+                for(int k = 0; k < cn; ++k) {
+                    src.ptr<float>(y,x)[k] = 10.f * y + x;
+                }
+            }
+        }
+
+        Mat ref = src.clone();
+        for(int k = 0; k < cn; ++k) {
+            ref.ptr<float>(2,2)[k] = (src.ptr<float>(1, 2)[k] + src.ptr<float>(2, 2)[k]) / 2.f;
+        }
+
+        remap(src, dst, mapx, mapy, INTER_LINEAR, BORDER_TRANSPARENT);
+        ASSERT_EQ(0.0, cvtest::norm(ref, dst, NORM_INF)) << "channels=" << cn;
+    }
 }
 
 }} // namespace
diff --git a/modules/imgproc/test/test_pc.cpp b/modules/imgproc/test/test_pc.cpp
index 7b06e3bd6572..87f0d804b2ea 100644
--- a/modules/imgproc/test/test_pc.cpp
+++ b/modules/imgproc/test/test_pc.cpp
@@ -301,7 +301,7 @@ static std::pair<double, double> divide_complex_numbers( const double nu_re, con
     const double result_re = nu_re * de_re + nu_im * de_im;
     const double result_im = nu_re * (-de_im) + nu_im * de_re;
     return std::pair<double, double>(result_re / result_de, result_im / result_de);
-};
+}
 
 /// Helper function to divide a DFT in src1 by a DFT in src2 with depths depth_t.  The DFTs are
 /// complex matrices.
diff --git a/modules/imgproc/test/test_precomp.hpp b/modules/imgproc/test/test_precomp.hpp
index ce5100914543..4df4a922b511 100644
--- a/modules/imgproc/test/test_precomp.hpp
+++ b/modules/imgproc/test/test_precomp.hpp
@@ -5,8 +5,11 @@
 #define __OPENCV_TEST_PRECOMP_HPP__
 
 #include "opencv2/ts.hpp"
+#include "opencv2/ts/ts_gtest.h"
+#include "opencv2/ts/ocl_test.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/imgproc/imgproc_c.h"
+#include "opencv2/core.hpp"
 
 #include "opencv2/core/private.hpp"
 
diff --git a/modules/imgproc/test/test_subdivision2d.cpp b/modules/imgproc/test/test_subdivision2d.cpp
index 0a366206b365..17549b6b1514 100644
--- a/modules/imgproc/test/test_subdivision2d.cpp
+++ b/modules/imgproc/test/test_subdivision2d.cpp
@@ -50,4 +50,4 @@ TEST(Imgproc_Subdiv2D_getTriangleList, regression_5788)
     EXPECT_EQ(trig_cnt, 105);
 }
 
-}};
+}}
diff --git a/modules/imgproc/test/test_thresh.cpp b/modules/imgproc/test/test_thresh.cpp
index 510560fd0553..f14f2e571646 100644
--- a/modules/imgproc/test/test_thresh.cpp
+++ b/modules/imgproc/test/test_thresh.cpp
@@ -63,7 +63,7 @@ class CV_ThreshTest : public cvtest::ArrayTest
 
 CV_ThreshTest::CV_ThreshTest(int test_type)
 {
-    CV_Assert( (test_type & CV_THRESH_MASK) == 0 );
+    CV_Assert( (test_type & cv::THRESH_MASK) == 0 );
     test_array[INPUT].push_back(NULL);
     test_array[OUTPUT].push_back(NULL);
     test_array[REF_OUTPUT].push_back(NULL);
@@ -84,7 +84,7 @@ void CV_ThreshTest::get_test_array_types_and_sizes( int test_case_idx,
     cvtest::ArrayTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
     depth = depth == 0 ? CV_8U : depth == 1 ? CV_16S : depth == 2 ? CV_16U : depth == 3 ? CV_32F : CV_64F;
 
-    if ( extra_type == CV_THRESH_OTSU )
+    if ( extra_type == cv::THRESH_OTSU )
     {
         depth = cvtest::randInt(rng) % 2 == 0 ? CV_8U : CV_16U;
         cn = 1;
@@ -197,7 +197,7 @@ static void test_threshold( const Mat& _src, Mat& _dst,
     int width_n = _src.cols*cn, height = _src.rows;
     int ithresh = cvFloor(thresh);
     int imaxval, ithresh2;
-    if (extra_type == CV_THRESH_OTSU)
+    if (extra_type == cv::THRESH_OTSU)
     {
         thresh = compute_otsu_thresh(_src);
         ithresh = cvFloor(thresh);
@@ -228,7 +228,7 @@ static void test_threshold( const Mat& _src, Mat& _dst,
 
     switch( thresh_type )
     {
-    case CV_THRESH_BINARY:
+    case cv::THRESH_BINARY:
         for( i = 0; i < height; i++ )
         {
             if( depth == CV_8U )
@@ -268,7 +268,7 @@ static void test_threshold( const Mat& _src, Mat& _dst,
             }
         }
         break;
-    case CV_THRESH_BINARY_INV:
+    case cv::THRESH_BINARY_INV:
         for( i = 0; i < height; i++ )
         {
             if( depth == CV_8U )
@@ -308,7 +308,7 @@ static void test_threshold( const Mat& _src, Mat& _dst,
             }
         }
         break;
-    case CV_THRESH_TRUNC:
+    case cv::THRESH_TRUNC:
         for( i = 0; i < height; i++ )
         {
             if( depth == CV_8U )
@@ -363,7 +363,7 @@ static void test_threshold( const Mat& _src, Mat& _dst,
             }
         }
         break;
-    case CV_THRESH_TOZERO:
+    case cv::THRESH_TOZERO:
         for( i = 0; i < height; i++ )
         {
             if( depth == CV_8U )
@@ -418,7 +418,7 @@ static void test_threshold( const Mat& _src, Mat& _dst,
             }
         }
         break;
-    case CV_THRESH_TOZERO_INV:
+    case cv::THRESH_TOZERO_INV:
         for( i = 0; i < height; i++ )
         {
             if( depth == CV_8U )
@@ -486,7 +486,7 @@ void CV_ThreshTest::prepare_to_validation( int /*test_case_idx*/ )
 }
 
 TEST(Imgproc_Threshold, accuracy) { CV_ThreshTest test; test.safe_run(); }
-TEST(Imgproc_Threshold, accuracyOtsu) { CV_ThreshTest test(CV_THRESH_OTSU); test.safe_run(); }
+TEST(Imgproc_Threshold, accuracyOtsu) { CV_ThreshTest test(cv::THRESH_OTSU); test.safe_run(); }
 
 BIGDATA_TEST(Imgproc_Threshold, huge)
 {
@@ -541,4 +541,58 @@ TEST(Imgproc_Threshold, regression_THRESH_TOZERO_IPP_21258_Max)
     EXPECT_EQ(0, cv::norm(result, NORM_INF));
 }
 
+TEST(Imgproc_AdaptiveThreshold, mean)
+{
+    const string input_path = cvtest::findDataFile("../cv/shared/baboon.png");
+    Mat input = imread(input_path, IMREAD_GRAYSCALE);
+    Mat result;
+
+    cv::adaptiveThreshold(input, result, 255, ADAPTIVE_THRESH_MEAN_C, THRESH_BINARY, 15, 8);
+
+    const string gt_path = cvtest::findDataFile("../cv/imgproc/adaptive_threshold1.png");
+    Mat gt = imread(gt_path, IMREAD_GRAYSCALE);
+    EXPECT_EQ(0, cv::norm(result, gt, NORM_INF));
+}
+
+TEST(Imgproc_AdaptiveThreshold, mean_inv)
+{
+    const string input_path = cvtest::findDataFile("../cv/shared/baboon.png");
+    Mat input = imread(input_path, IMREAD_GRAYSCALE);
+    Mat result;
+
+    cv::adaptiveThreshold(input, result, 255, ADAPTIVE_THRESH_MEAN_C, THRESH_BINARY_INV, 15, 8);
+
+    const string gt_path = cvtest::findDataFile("../cv/imgproc/adaptive_threshold1.png");
+    Mat gt = imread(gt_path, IMREAD_GRAYSCALE);
+    gt = Mat(gt.rows, gt.cols, CV_8UC1, cv::Scalar(255)) - gt;
+    EXPECT_EQ(0, cv::norm(result, gt, NORM_INF));
+}
+
+TEST(Imgproc_AdaptiveThreshold, gauss)
+{
+    const string input_path = cvtest::findDataFile("../cv/shared/baboon.png");
+    Mat input = imread(input_path, IMREAD_GRAYSCALE);
+    Mat result;
+
+    cv::adaptiveThreshold(input, result, 200, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, 21, -5);
+
+    const string gt_path = cvtest::findDataFile("../cv/imgproc/adaptive_threshold2.png");
+    Mat gt = imread(gt_path, IMREAD_GRAYSCALE);
+    EXPECT_EQ(0, cv::norm(result, gt, NORM_INF));
+}
+
+TEST(Imgproc_AdaptiveThreshold, gauss_inv)
+{
+    const string input_path = cvtest::findDataFile("../cv/shared/baboon.png");
+    Mat input = imread(input_path, IMREAD_GRAYSCALE);
+    Mat result;
+
+    cv::adaptiveThreshold(input, result, 200, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY_INV, 21, -5);
+
+    const string gt_path = cvtest::findDataFile("../cv/imgproc/adaptive_threshold2.png");
+    Mat gt = imread(gt_path, IMREAD_GRAYSCALE);
+    gt = Mat(gt.rows, gt.cols, CV_8UC1, cv::Scalar(200)) - gt;
+    EXPECT_EQ(0, cv::norm(result, gt, NORM_INF));
+}
+
 }} // namespace
diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt
index 7fe90a0cb369..7207997e1b36 100644
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -17,7 +17,7 @@ ocv_add_module(java BINDINGS opencv_core opencv_imgproc PRIVATE_REQUIRED opencv_
 
 include(${CMAKE_CURRENT_SOURCE_DIR}/common.cmake)
 
-# UTILITY: glob specific sources and append them to list (type is in H, CPP, JAVA, AIDL)
+# UTILITY: glob specific sources and append them to list (type is in H, CPP, JAVA)
 macro(glob_more_specific_sources _type _root _output)
   unset(_masks)
   if(${_type} STREQUAL "H")
@@ -26,8 +26,6 @@ macro(glob_more_specific_sources _type _root _output)
     set(_masks "${_root}/cpp/*.cpp")
   elseif(${_type} STREQUAL "JAVA")
     set(_masks "${_root}/java/*.java" "${_root}/java/*.java.in")
-  elseif(${_type} STREQUAL "AIDL")
-    set(_masks "${_root}/java/*.aidl")
   endif()
   if (_masks)
     file(GLOB _result ${_masks})
diff --git a/modules/java/android_sdk/CMakeLists.txt b/modules/java/android_sdk/CMakeLists.txt
index b3308c03f68c..b5fbc3d93dff 100644
--- a/modules/java/android_sdk/CMakeLists.txt
+++ b/modules/java/android_sdk/CMakeLists.txt
@@ -27,12 +27,19 @@ if(ANDROID_SDK_COMPATIBLE_TARGET)
   set(ANDROID_SDK_COMPATIBLE_TARGET "${ANDROID_SDK_COMPATIBLE_TARGET}" CACHE INTERNAL "")
 endif()
 string(REGEX REPLACE "android-" "" android_sdk_target_num ${ANDROID_SDK_COMPATIBLE_TARGET})
+
 if( (ANDROID_SDK_TARGET AND ANDROID_SDK_TARGET LESS 21) OR (android_sdk_target_num LESS 21) )
   message(STATUS "[OpenCV for Android SDK]: A new OpenGL Camera Bridge (CameraGLSurfaceView, CameraGLRendererBase, CameraRenderer, Camera2Renderer) is disabled, because ANDROID_SDK_TARGET (${android_sdk_target_num}) < 21")
 else()
   ocv_copyfiles_append_dir(JAVA_SRC_COPY "${OPENCV_JAVA_BINDINGS_DIR}/gen/android-21/java" "${java_src_dir}")
 endif()
 
+if( (ANDROID_SDK_TARGET AND ANDROID_SDK_TARGET LESS 24) OR (android_sdk_target_num LESS 24) )
+  message(STATUS "[OpenCV for Android SDK]: An experiemntal Native Camera is disabled, because ANDROID_SDK_TARGET (${android_sdk_target_num}) < 24")
+else()
+  ocv_copyfiles_append_dir(JAVA_SRC_COPY "${OPENCV_JAVA_BINDINGS_DIR}/gen/android-24/java" "${java_src_dir}")
+endif()
+
 # copy boilerplate
 file(GLOB_RECURSE seed_project_files_rel RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/android_lib/" "${CMAKE_CURRENT_SOURCE_DIR}/android_lib/*")
 list(REMOVE_ITEM seed_project_files_rel "${ANDROID_MANIFEST_FILE}")
@@ -113,6 +120,7 @@ else()  # gradle build
 #TODO: INSTALL ONLY
 ocv_copyfiles_append_dir(JAVA_SRC_COPY "${OPENCV_JAVA_BINDINGS_DIR}/gen/android/java" "${java_src_dir}")
 ocv_copyfiles_append_dir(JAVA_SRC_COPY "${OPENCV_JAVA_BINDINGS_DIR}/gen/android-21/java" "${java_src_dir}")
+ocv_copyfiles_append_dir(JAVA_SRC_COPY "${OPENCV_JAVA_BINDINGS_DIR}/gen/android-24/java" "${java_src_dir}")
 
 # copy boilerplate
 set(__base_dir "${CMAKE_CURRENT_SOURCE_DIR}/android_gradle_lib/")
@@ -145,7 +153,6 @@ set(depends ${the_module}_android_source_copy "${OPENCV_DEPHELPER}/${the_module}
 
 # build jar
 set(AAR_FILE "${OPENCV_JAVA_DIR}/build/outputs/aar/opencv-release.aar")
-ocv_update(OPENCV_GRADLE_VERBOSE_OPTIONS "-i")
 add_custom_command(
     OUTPUT "${AAR_FILE}" "${OPENCV_DEPHELPER}/${the_module}_android"
     COMMAND ./gradlew ${OPENCV_GRADLE_VERBOSE_OPTIONS} "opencv:assemble"
diff --git a/modules/java/android_sdk/android_gradle_lib/build.gradle b/modules/java/android_sdk/android_gradle_lib/build.gradle
index 41cdb9e5e10b..4394bd9a4e39 100644
--- a/modules/java/android_sdk/android_gradle_lib/build.gradle
+++ b/modules/java/android_sdk/android_gradle_lib/build.gradle
@@ -5,6 +5,7 @@ def openCVersionName = "@OPENCV_VERSION@"
 def openCVersionCode = ((@OPENCV_VERSION_MAJOR@ * 100 + @OPENCV_VERSION_MINOR@) * 100 + @OPENCV_VERSION_PATCH@) * 10 + 0
 
 android {
+    @OPENCV_ANDROID_NAMESPACE_DECLARATION@
     compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
 
     defaultConfig {
@@ -41,7 +42,6 @@ android {
         main {
             jniLibs.srcDirs = ['../../jni']
             java.srcDirs = ['src']  // TODO Use original files instead of copied into build directory
-            aidl.srcDirs = ['src']
             res.srcDirs = ['@OpenCV_SOURCE_DIR@/modules/java/android_sdk/android_gradle_lib/res']
             manifest.srcFile 'AndroidManifest.xml'
         }
diff --git a/modules/java/android_sdk/build.gradle.in b/modules/java/android_sdk/build.gradle.in
index 6b71d1fe1122..d3e37d03629b 100644
--- a/modules/java/android_sdk/build.gradle.in
+++ b/modules/java/android_sdk/build.gradle.in
@@ -89,6 +89,7 @@
 //
 
 apply plugin: 'com.android.library'
+apply plugin: 'maven-publish'
 @KOTLIN_PLUGIN_DECLARATION@
 
 def openCVersionName = "@OPENCV_VERSION@"
@@ -120,8 +121,6 @@ android {
         targetCompatibility JavaVersion.VERSION_@ANDROID_GRADLE_JAVA_VERSION_INIT@
     }
 
-    @ANDROID_GRADLE_BUILD_FEATURE_AIDL@
-
     buildTypes {
         debug {
             packagingOptions {
@@ -137,16 +136,32 @@ android {
         }
     }
 
+    buildFeatures {
+        prefabPublishing true
+        buildConfig true
+    }
+    prefab {
+        opencv_jni_shared {
+            headers "native/jni/include"
+        }
+    }
+
     sourceSets {
         main {
             jniLibs.srcDirs = ['native/libs']
             java.srcDirs = ['java/src']
-            aidl.srcDirs = ['java/src']
             res.srcDirs = ['java/res']
             manifest.srcFile 'java/AndroidManifest.xml'
         }
     }
 
+    publishing {
+        singleVariant('release') {
+            withSourcesJar()
+            withJavadocJar()
+        }
+    }
+
     externalNativeBuild {
         cmake {
             path (project.projectDir.toString() + '/libcxx_helper/CMakeLists.txt')
@@ -154,5 +169,25 @@ android {
     }
 }
 
+publishing {
+    publications {
+        release(MavenPublication) {
+            groupId = 'org.opencv'
+            artifactId = 'opencv'
+            version = '@OPENCV_VERSION_PLAIN@'
+
+            afterEvaluate {
+               from components.release
+           }
+        }
+    }
+    repositories {
+        maven {
+            name = 'myrepo'
+            url = "${project.buildDir}/repo"
+        }
+    }
+}
+
 dependencies {
 }
diff --git a/modules/java/android_sdk/libcxx_helper/CMakeLists.txt b/modules/java/android_sdk/libcxx_helper/CMakeLists.txt
index bc9146f45744..c2dcd6d4d8af 100644
--- a/modules/java/android_sdk/libcxx_helper/CMakeLists.txt
+++ b/modules/java/android_sdk/libcxx_helper/CMakeLists.txt
@@ -1,4 +1,6 @@
 cmake_minimum_required(VERSION 3.6)
 
+project(opencv_jni_shared)
+
 # dummy target to bring libc++_shared.so into packages
 add_library(opencv_jni_shared STATIC dummy.cpp)
diff --git a/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java b/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java
index 017de7f26af6..6447f07b8221 100644
--- a/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java
+++ b/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java
@@ -45,6 +45,8 @@ public class JavaCamera2View extends CameraBridgeViewBase {
 
     protected ImageReader mImageReader;
     protected int mPreviewFormat = ImageFormat.YUV_420_888;
+    protected int mRequestTemplate = CameraDevice.TEMPLATE_PREVIEW;
+    private int mFrameRotation;
 
     protected CameraDevice mCameraDevice;
     protected CameraCaptureSession mCaptureSession;
@@ -85,8 +87,8 @@ private void stopBackgroundThread() {
         }
     }
 
-    protected boolean initializeCamera() {
-        Log.i(LOGTAG, "initializeCamera");
+    protected boolean selectCamera() {
+        Log.i(LOGTAG, "selectCamera");
         CameraManager manager = (CameraManager) getContext().getSystemService(Context.CAMERA_SERVICE);
         try {
             String camList[] = manager.getCameraIdList();
@@ -109,14 +111,10 @@ protected boolean initializeCamera() {
                     }
                 }
             }
-            if (mCameraID != null) {
-                Log.i(LOGTAG, "Opening camera: " + mCameraID);
-                manager.openCamera(mCameraID, mStateCallback, mBackgroundHandler);
-            } else { // make JavaCamera2View behaves in the same way as JavaCameraView
-                Log.i(LOGTAG, "Trying to open camera with the value (" + mCameraIndex + ")");
+            if (mCameraID == null) { // make JavaCamera2View behaves in the same way as JavaCameraView
+                Log.i(LOGTAG, "Selecting camera by index (" + mCameraIndex + ")");
                 if (mCameraIndex < camList.length) {
                     mCameraID = camList[mCameraIndex];
-                    manager.openCamera(mCameraID, mStateCallback, mBackgroundHandler);
                 } else {
                     // CAMERA_DISCONNECTED is used when the camera id is no longer valid
                     throw new CameraAccessException(CameraAccessException.CAMERA_DISCONNECTED);
@@ -124,11 +122,11 @@ protected boolean initializeCamera() {
             }
             return true;
         } catch (CameraAccessException e) {
-            Log.e(LOGTAG, "OpenCamera - Camera Access Exception", e);
+            Log.e(LOGTAG, "selectCamera - Camera Access Exception", e);
         } catch (IllegalArgumentException e) {
-            Log.e(LOGTAG, "OpenCamera - Illegal Argument Exception", e);
+            Log.e(LOGTAG, "selectCamera - Illegal Argument Exception", e);
         } catch (SecurityException e) {
-            Log.e(LOGTAG, "OpenCamera - Security Exception", e);
+            Log.e(LOGTAG, "selectCamera - Security Exception", e);
         }
         return false;
     }
@@ -155,6 +153,35 @@ public void onError(CameraDevice cameraDevice, int error) {
 
     };
 
+    protected CameraCaptureSession.StateCallback allocateSessionStateCallback() {
+        return new CameraCaptureSession.StateCallback() {
+            @Override
+            public void onConfigured(CameraCaptureSession cameraCaptureSession) {
+                Log.i(LOGTAG, "createCaptureSession::onConfigured");
+                if (null == mCameraDevice) {
+                    return; // camera is already closed
+                }
+                mCaptureSession = cameraCaptureSession;
+                try {
+                    mPreviewRequestBuilder.set(CaptureRequest.CONTROL_AF_MODE,
+                            CaptureRequest.CONTROL_AF_MODE_CONTINUOUS_PICTURE);
+                    mPreviewRequestBuilder.set(CaptureRequest.CONTROL_AE_MODE,
+                            CaptureRequest.CONTROL_AE_MODE_ON_AUTO_FLASH);
+
+                    mCaptureSession.setRepeatingRequest(mPreviewRequestBuilder.build(), null, mBackgroundHandler);
+                    Log.i(LOGTAG, "CameraPreviewSession has been started");
+                } catch (Exception e) {
+                    Log.e(LOGTAG, "createCaptureSession failed", e);
+                }
+            }
+
+            @Override
+            public void onConfigureFailed(CameraCaptureSession cameraCaptureSession) {
+                Log.e(LOGTAG, "createCameraPreviewSession failed");
+            }
+        };
+    }
+
     private void createCameraPreviewSession() {
         final int w = mPreviewSize.getWidth(), h = mPreviewSize.getHeight();
         Log.i(LOGTAG, "createCameraPreviewSession(" + w + "x" + h + ")");
@@ -174,6 +201,7 @@ private void createCameraPreviewSession() {
             mImageReader.setOnImageAvailableListener(new ImageReader.OnImageAvailableListener() {
                 @Override
                 public void onImageAvailable(ImageReader reader) {
+
                     Image image = reader.acquireLatestImage();
                     if (image == null)
                         return;
@@ -183,46 +211,20 @@ public void onImageAvailable(ImageReader reader) {
                     assert (planes.length == 3);
                     assert (image.getFormat() == mPreviewFormat);
 
-                    JavaCamera2Frame tempFrame = new JavaCamera2Frame(image);
+                    RotatedCameraFrame tempFrame = new RotatedCameraFrame(new JavaCamera2Frame(image), mFrameRotation);
                     deliverAndDrawFrame(tempFrame);
+                    tempFrame.mFrame.release();
                     tempFrame.release();
                     image.close();
                 }
             }, mBackgroundHandler);
             Surface surface = mImageReader.getSurface();
 
-            mPreviewRequestBuilder = mCameraDevice.createCaptureRequest(CameraDevice.TEMPLATE_PREVIEW);
+            mPreviewRequestBuilder = mCameraDevice.createCaptureRequest(mRequestTemplate);
             mPreviewRequestBuilder.addTarget(surface);
 
             mCameraDevice.createCaptureSession(Arrays.asList(surface),
-                new CameraCaptureSession.StateCallback() {
-                    @Override
-                    public void onConfigured(CameraCaptureSession cameraCaptureSession) {
-                        Log.i(LOGTAG, "createCaptureSession::onConfigured");
-                        if (null == mCameraDevice) {
-                            return; // camera is already closed
-                        }
-                        mCaptureSession = cameraCaptureSession;
-                        try {
-                            mPreviewRequestBuilder.set(CaptureRequest.CONTROL_AF_MODE,
-                                    CaptureRequest.CONTROL_AF_MODE_CONTINUOUS_PICTURE);
-                            mPreviewRequestBuilder.set(CaptureRequest.CONTROL_AE_MODE,
-                                    CaptureRequest.CONTROL_AE_MODE_ON_AUTO_FLASH);
-
-                            mCaptureSession.setRepeatingRequest(mPreviewRequestBuilder.build(), null, mBackgroundHandler);
-                            Log.i(LOGTAG, "CameraPreviewSession has been started");
-                        } catch (Exception e) {
-                            Log.e(LOGTAG, "createCaptureSession failed", e);
-                        }
-                    }
-
-                    @Override
-                    public void onConfigureFailed(CameraCaptureSession cameraCaptureSession) {
-                        Log.e(LOGTAG, "createCameraPreviewSession failed");
-                    }
-                },
-                null
-            );
+                                               allocateSessionStateCallback(), null);
         } catch (CameraAccessException e) {
             Log.e(LOGTAG, "createCameraPreviewSession", e);
         }
@@ -300,11 +302,22 @@ boolean calcPreviewSize(final int width, final int height) {
     protected boolean connectCamera(int width, int height) {
         Log.i(LOGTAG, "setCameraPreviewSize(" + width + "x" + height + ")");
         startBackgroundThread();
-        initializeCamera();
+        selectCamera();
         try {
+            CameraManager manager = (CameraManager) getContext().getSystemService(Context.CAMERA_SERVICE);
+            CameraCharacteristics characteristics = manager.getCameraCharacteristics(mCameraID);
+            mFrameRotation = getFrameRotation(
+                    characteristics.get(CameraCharacteristics.LENS_FACING) == CameraCharacteristics.LENS_FACING_FRONT,
+                    characteristics.get(CameraCharacteristics.SENSOR_ORIENTATION));
+
             boolean needReconfig = calcPreviewSize(width, height);
-            mFrameWidth = mPreviewSize.getWidth();
-            mFrameHeight = mPreviewSize.getHeight();
+            if (mFrameRotation % 180 == 0) {
+                mFrameWidth = mPreviewSize.getWidth();
+                mFrameHeight = mPreviewSize.getHeight();
+            } else {
+                mFrameWidth = mPreviewSize.getHeight();
+                mFrameHeight = mPreviewSize.getWidth();
+            }
 
             if ((getLayoutParams().width == LayoutParams.MATCH_PARENT) && (getLayoutParams().height == LayoutParams.MATCH_PARENT))
                 mScale = Math.min(((float)height)/mFrameHeight, ((float)width)/mFrameWidth);
@@ -319,8 +332,16 @@ protected boolean connectCamera(int width, int height) {
                     mCaptureSession.close();
                     mCaptureSession = null;
                 }
-                createCameraPreviewSession();
             }
+
+            if (mFpsMeter != null) {
+                mFpsMeter.setResolution(mFrameWidth, mFrameHeight);
+            }
+
+            Log.i(LOGTAG, "Opening camera: " + mCameraID);
+            manager.openCamera(mCameraID, mStateCallback, mBackgroundHandler);
+        } catch (CameraAccessException e) {
+            Log.e(LOGTAG, "OpenCamera - Camera Access Exception", e);
         } catch (RuntimeException e) {
             throw new RuntimeException("Interrupted while setCameraPreviewSize.", e);
         }
@@ -435,6 +456,7 @@ public JavaCamera2Frame(Image image) {
             mGray = new Mat();
         }
 
+        @Override
         public void release() {
             mRgba.release();
             mGray.release();
diff --git a/modules/java/generator/android-24/java/org/opencv/android/NativeCameraView.java b/modules/java/generator/android-24/java/org/opencv/android/NativeCameraView.java
new file mode 100644
index 000000000000..b28c2121cd56
--- /dev/null
+++ b/modules/java/generator/android-24/java/org/opencv/android/NativeCameraView.java
@@ -0,0 +1,233 @@
+package org.opencv.android;
+
+import org.opencv.core.Mat;
+import org.opencv.core.Size;
+
+import org.opencv.imgproc.Imgproc;
+
+import org.opencv.videoio.Videoio;
+import org.opencv.videoio.VideoCapture;
+import org.opencv.videoio.VideoWriter;
+
+import android.content.Context;
+import android.hardware.Camera;
+import android.util.AttributeSet;
+import android.util.Log;
+import android.view.ViewGroup.LayoutParams;
+
+/**
+ * This class is an implementation of a bridge between SurfaceView and OpenCV VideoCapture.
+ * The class  is experimental implementation and not recoomended for production usage.
+ */
+public class NativeCameraView extends CameraBridgeViewBase {
+
+    public static final String TAG = "NativeCameraView";
+    private boolean mStopThread;
+    private Thread mThread;
+
+    protected VideoCapture mCamera;
+    protected RotatedCameraFrame mFrame;
+
+    public NativeCameraView(Context context, int cameraId) {
+        super(context, cameraId);
+    }
+
+    public NativeCameraView(Context context, AttributeSet attrs) {
+        super(context, attrs);
+    }
+
+    @Override
+    protected boolean connectCamera(int width, int height) {
+
+        /* 1. We need to instantiate camera
+         * 2. We need to start thread which will be getting frames
+         */
+        /* First step - initialize camera connection */
+        if (!initializeCamera(width, height))
+            return false;
+
+        /* now we can start update thread */
+        mThread = new Thread(new CameraWorker());
+        mThread.start();
+
+        return true;
+    }
+
+    @Override
+    protected void disconnectCamera() {
+        /* 1. We need to stop thread which updating the frames
+         * 2. Stop camera and release it
+         */
+        if (mThread != null) {
+            try {
+                mStopThread = true;
+                mThread.join();
+            } catch (InterruptedException e) {
+                e.printStackTrace();
+            } finally {
+                mThread =  null;
+                mStopThread = false;
+            }
+        }
+
+        /* Now release camera */
+        releaseCamera();
+    }
+
+    public static class OpenCvSizeAccessor implements ListItemAccessor {
+
+        public int getWidth(Object obj) {
+            Size size  = (Size)obj;
+            return (int)size.width;
+        }
+
+        public int getHeight(Object obj) {
+            Size size  = (Size)obj;
+            return (int)size.height;
+        }
+
+    }
+
+    private boolean initializeCamera(int width, int height) {
+        synchronized (this) {
+            Camera.CameraInfo cameraInfo = new Camera.CameraInfo();
+            int localCameraIndex = mCameraIndex;
+            if (mCameraIndex == CAMERA_ID_ANY) {
+                Log.d(TAG, "Try to open default camera");
+                localCameraIndex = 0;
+            } else if (mCameraIndex == CAMERA_ID_BACK) {
+                Log.i(TAG, "Trying to open back camera");
+                for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
+                    Camera.getCameraInfo( camIdx, cameraInfo );
+                    if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_BACK) {
+                        localCameraIndex = camIdx;
+                        break;
+                    }
+                }
+            } else if (mCameraIndex == CAMERA_ID_FRONT) {
+                Log.i(TAG, "Trying to open front camera");
+                for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
+                    Camera.getCameraInfo( camIdx, cameraInfo );
+                    if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_FRONT) {
+                        localCameraIndex = camIdx;
+                        break;
+                    }
+                }
+            }
+
+            if (localCameraIndex == CAMERA_ID_BACK) {
+                Log.e(TAG, "Back camera not found!");
+                return false;
+            } else if (localCameraIndex == CAMERA_ID_FRONT) {
+                Log.e(TAG, "Front camera not found!");
+                return false;
+            }
+
+            Log.d(TAG, "Try to open camera with index " + localCameraIndex);
+            mCamera = new VideoCapture(localCameraIndex, Videoio.CAP_ANDROID);
+
+            if (mCamera == null)
+                return false;
+            if (mCamera.isOpened() == false)
+                return false;
+
+            if (mCameraIndex != CAMERA_ID_BACK && mCameraIndex != CAMERA_ID_FRONT)
+                Camera.getCameraInfo(localCameraIndex, cameraInfo);
+            int frameRotation = getFrameRotation(
+                    cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_FRONT,
+                    cameraInfo.orientation);
+
+            mFrame = new RotatedCameraFrame(new NativeCameraFrame(mCamera), frameRotation);
+
+            mCamera.set(Videoio.CAP_PROP_FRAME_WIDTH, width);
+            mCamera.set(Videoio.CAP_PROP_FRAME_HEIGHT, height);
+
+            if (frameRotation % 180 == 0) {
+                mFrameWidth = (int) mCamera.get(Videoio.CAP_PROP_FRAME_WIDTH);
+                mFrameHeight = (int) mCamera.get(Videoio.CAP_PROP_FRAME_HEIGHT);
+            } else {
+                mFrameWidth = (int) mCamera.get(Videoio.CAP_PROP_FRAME_HEIGHT);
+                mFrameHeight = (int) mCamera.get(Videoio.CAP_PROP_FRAME_WIDTH);
+            }
+
+            if ((getLayoutParams().width == LayoutParams.MATCH_PARENT) && (getLayoutParams().height == LayoutParams.MATCH_PARENT))
+                mScale = Math.min(((float)height)/mFrameHeight, ((float)width)/mFrameWidth);
+            else
+                mScale = 0;
+
+            if (mFpsMeter != null) {
+                mFpsMeter.setResolution(mFrameWidth, mFrameHeight);
+            }
+
+            AllocateCache();
+        }
+
+        Log.i(TAG, "Selected camera frame size = (" + mFrameWidth + ", " + mFrameHeight + ")");
+
+        return true;
+    }
+
+    private void releaseCamera() {
+        synchronized (this) {
+            if (mFrame != null) {
+                mFrame.mFrame.release();
+                mFrame.release();
+            }
+            if (mCamera != null) mCamera.release();
+        }
+    }
+
+    private static class NativeCameraFrame implements CvCameraViewFrame {
+
+        @Override
+        public Mat rgba() {
+            mCapture.set(Videoio.CAP_PROP_FOURCC, VideoWriter.fourcc('R','G','B','3'));
+            mCapture.retrieve(mBgr);
+            Log.d(TAG, "Retrived frame with size " + mBgr.cols() + "x" + mBgr.rows() + " and channels: " + mBgr.channels());
+            Imgproc.cvtColor(mBgr, mRgba, Imgproc.COLOR_RGB2RGBA);
+            return mRgba;
+        }
+
+        @Override
+        public Mat gray() {
+            mCapture.set(Videoio.CAP_PROP_FOURCC, VideoWriter.fourcc('G','R','E','Y'));
+            mCapture.retrieve(mGray);
+            Log.d(TAG, "Retrived frame with size " + mGray.cols() + "x" + mGray.rows() + " and channels: " + mGray.channels());
+            return mGray;
+        }
+
+        public NativeCameraFrame(VideoCapture capture) {
+            mCapture = capture;
+            mGray = new Mat();
+            mRgba = new Mat();
+            mBgr = new Mat();
+        }
+
+        @Override
+        public void release() {
+            if (mGray != null) mGray.release();
+            if (mRgba != null) mRgba.release();
+            if (mBgr != null) mBgr.release();
+        }
+
+        private VideoCapture mCapture;
+        private Mat mRgba;
+        private Mat mGray;
+        private Mat mBgr;
+    };
+
+    private class CameraWorker implements Runnable {
+
+        public void run() {
+            do {
+                if (!mCamera.grab()) {
+                    Log.e(TAG, "Camera frame grab failed");
+                    break;
+                }
+
+                deliverAndDrawFrame(mFrame);
+            } while (!mStopThread);
+        }
+    }
+
+}
diff --git a/modules/java/generator/android/java/org/opencv/android/AsyncServiceHelper.java b/modules/java/generator/android/java/org/opencv/android/AsyncServiceHelper.java
deleted file mode 100644
index cb3c6428d188..000000000000
--- a/modules/java/generator/android/java/org/opencv/android/AsyncServiceHelper.java
+++ /dev/null
@@ -1,391 +0,0 @@
-package org.opencv.android;
-
-import java.io.File;
-import java.util.StringTokenizer;
-
-import org.opencv.core.Core;
-import org.opencv.engine.OpenCVEngineInterface;
-
-import android.content.ComponentName;
-import android.content.Context;
-import android.content.Intent;
-import android.content.ServiceConnection;
-import android.net.Uri;
-import android.os.IBinder;
-import android.os.RemoteException;
-import android.util.Log;
-
-class AsyncServiceHelper
-{
-    public static boolean initOpenCV(String Version, final Context AppContext,
-            final LoaderCallbackInterface Callback)
-    {
-        AsyncServiceHelper helper = new AsyncServiceHelper(Version, AppContext, Callback);
-        Intent intent = new Intent("org.opencv.engine.BIND");
-        intent.setPackage("org.opencv.engine");
-        if (AppContext.bindService(intent, helper.mServiceConnection, Context.BIND_AUTO_CREATE))
-        {
-            return true;
-        }
-        else
-        {
-            AppContext.unbindService(helper.mServiceConnection);
-            InstallService(AppContext, Callback);
-            return false;
-        }
-    }
-
-    protected AsyncServiceHelper(String Version, Context AppContext, LoaderCallbackInterface Callback)
-    {
-        mOpenCVersion = Version;
-        mUserAppCallback = Callback;
-        mAppContext = AppContext;
-    }
-
-    protected static final String TAG = "OpenCVManager/Helper";
-    protected static final int MINIMUM_ENGINE_VERSION = 2;
-    protected OpenCVEngineInterface mEngineService;
-    protected LoaderCallbackInterface mUserAppCallback;
-    protected String mOpenCVersion;
-    protected Context mAppContext;
-    protected static boolean mServiceInstallationProgress = false;
-    protected static boolean mLibraryInstallationProgress = false;
-
-    protected static boolean InstallServiceQuiet(Context context)
-    {
-        boolean result = true;
-        try
-        {
-            Intent intent = new Intent(Intent.ACTION_VIEW, Uri.parse(OPEN_CV_SERVICE_URL));
-            intent.addFlags(Intent.FLAG_ACTIVITY_NEW_TASK);
-            context.startActivity(intent);
-        }
-        catch(Exception e)
-        {
-            result = false;
-        }
-
-        return result;
-    }
-
-    protected static void InstallService(final Context AppContext, final LoaderCallbackInterface Callback)
-    {
-        if (!mServiceInstallationProgress)
-        {
-                Log.d(TAG, "Request new service installation");
-                InstallCallbackInterface InstallQuery = new InstallCallbackInterface() {
-                private LoaderCallbackInterface mUserAppCallback = Callback;
-                public String getPackageName()
-                {
-                    return "OpenCV Manager";
-                }
-                public void install() {
-                    Log.d(TAG, "Trying to install OpenCV Manager via Google Play");
-
-                    boolean result = InstallServiceQuiet(AppContext);
-                    if (result)
-                    {
-                        mServiceInstallationProgress = true;
-                        Log.d(TAG, "Package installation started");
-                    }
-                    else
-                    {
-                        Log.d(TAG, "OpenCV package was not installed!");
-                        int Status = LoaderCallbackInterface.MARKET_ERROR;
-                        Log.d(TAG, "Init finished with status " + Status);
-                        Log.d(TAG, "Unbind from service");
-                        Log.d(TAG, "Calling using callback");
-                        mUserAppCallback.onManagerConnected(Status);
-                    }
-                }
-
-                public void cancel()
-                {
-                    Log.d(TAG, "OpenCV library installation was canceled");
-                    int Status = LoaderCallbackInterface.INSTALL_CANCELED;
-                    Log.d(TAG, "Init finished with status " + Status);
-                    Log.d(TAG, "Calling using callback");
-                    mUserAppCallback.onManagerConnected(Status);
-                }
-
-                public void wait_install()
-                {
-                    Log.e(TAG, "Installation was not started! Nothing to wait!");
-                }
-            };
-
-            Callback.onPackageInstall(InstallCallbackInterface.NEW_INSTALLATION, InstallQuery);
-        }
-        else
-        {
-            Log.d(TAG, "Waiting current installation process");
-            InstallCallbackInterface WaitQuery = new InstallCallbackInterface() {
-                private LoaderCallbackInterface mUserAppCallback = Callback;
-                public String getPackageName()
-                {
-                    return "OpenCV Manager";
-                }
-                public void install()
-                {
-                    Log.e(TAG, "Nothing to install we just wait current installation");
-                }
-                public void cancel()
-                {
-                    Log.d(TAG, "Waiting for OpenCV canceled by user");
-                    mServiceInstallationProgress = false;
-                    int Status = LoaderCallbackInterface.INSTALL_CANCELED;
-                    Log.d(TAG, "Init finished with status " + Status);
-                    Log.d(TAG, "Calling using callback");
-                    mUserAppCallback.onManagerConnected(Status);
-                }
-                public void wait_install()
-                {
-                     InstallServiceQuiet(AppContext);
-                }
-            };
-
-            Callback.onPackageInstall(InstallCallbackInterface.INSTALLATION_PROGRESS, WaitQuery);
-        }
-    }
-
-    /**
-     *  URL of OpenCV Manager page on Google Play Market.
-     */
-    protected static final String OPEN_CV_SERVICE_URL = "market://details?id=org.opencv.engine";
-
-    protected ServiceConnection mServiceConnection = new ServiceConnection()
-    {
-        public void onServiceConnected(ComponentName className, IBinder service)
-        {
-            Log.d(TAG, "Service connection created");
-            mEngineService = OpenCVEngineInterface.Stub.asInterface(service);
-            if (null == mEngineService)
-            {
-                Log.d(TAG, "OpenCV Manager Service connection fails. May be service was not installed?");
-                InstallService(mAppContext, mUserAppCallback);
-            }
-            else
-            {
-                mServiceInstallationProgress = false;
-                try
-                {
-                    if (mEngineService.getEngineVersion() < MINIMUM_ENGINE_VERSION)
-                    {
-                        Log.d(TAG, "Init finished with status " + LoaderCallbackInterface.INCOMPATIBLE_MANAGER_VERSION);
-                        Log.d(TAG, "Unbind from service");
-                        mAppContext.unbindService(mServiceConnection);
-                        Log.d(TAG, "Calling using callback");
-                        mUserAppCallback.onManagerConnected(LoaderCallbackInterface.INCOMPATIBLE_MANAGER_VERSION);
-                        return;
-                    }
-
-                    Log.d(TAG, "Trying to get library path");
-                    String path = mEngineService.getLibPathByVersion(mOpenCVersion);
-                    if ((null == path) || (path.length() == 0))
-                    {
-                        if (!mLibraryInstallationProgress)
-                        {
-                            InstallCallbackInterface InstallQuery = new InstallCallbackInterface() {
-                                public String getPackageName()
-                                {
-                                    return "OpenCV library";
-                                }
-                                public void install() {
-                                    Log.d(TAG, "Trying to install OpenCV lib via Google Play");
-                                    try
-                                    {
-                                        if (mEngineService.installVersion(mOpenCVersion))
-                                        {
-                                            mLibraryInstallationProgress = true;
-                                            Log.d(TAG, "Package installation started");
-                                            Log.d(TAG, "Unbind from service");
-                                            mAppContext.unbindService(mServiceConnection);
-                                        }
-                                        else
-                                        {
-                                            Log.d(TAG, "OpenCV package was not installed!");
-                                            Log.d(TAG, "Init finished with status " + LoaderCallbackInterface.MARKET_ERROR);
-                                            Log.d(TAG, "Unbind from service");
-                                            mAppContext.unbindService(mServiceConnection);
-                                            Log.d(TAG, "Calling using callback");
-                                            mUserAppCallback.onManagerConnected(LoaderCallbackInterface.MARKET_ERROR);
-                                        }
-                                    } catch (RemoteException e) {
-                                        e.printStackTrace();;
-                                        Log.d(TAG, "Init finished with status " + LoaderCallbackInterface.INIT_FAILED);
-                                        Log.d(TAG, "Unbind from service");
-                                        mAppContext.unbindService(mServiceConnection);
-                                        Log.d(TAG, "Calling using callback");
-                                        mUserAppCallback.onManagerConnected(LoaderCallbackInterface.INIT_FAILED);
-                                    }
-                                }
-                                public void cancel() {
-                                    Log.d(TAG, "OpenCV library installation was canceled");
-                                    Log.d(TAG, "Init finished with status " + LoaderCallbackInterface.INSTALL_CANCELED);
-                                    Log.d(TAG, "Unbind from service");
-                                    mAppContext.unbindService(mServiceConnection);
-                                    Log.d(TAG, "Calling using callback");
-                                    mUserAppCallback.onManagerConnected(LoaderCallbackInterface.INSTALL_CANCELED);
-                                }
-                                public void wait_install() {
-                                    Log.e(TAG, "Installation was not started! Nothing to wait!");
-                                }
-                            };
-
-                            mUserAppCallback.onPackageInstall(InstallCallbackInterface.NEW_INSTALLATION, InstallQuery);
-                        }
-                        else
-                        {
-                            InstallCallbackInterface WaitQuery = new InstallCallbackInterface() {
-                                public String getPackageName()
-                                {
-                                    return "OpenCV library";
-                                }
-
-                                public void install() {
-                                    Log.e(TAG, "Nothing to install we just wait current installation");
-                                }
-                                public void cancel()
-                                {
-                                    Log.d(TAG, "OpenCV library installation was canceled");
-                                    mLibraryInstallationProgress = false;
-                                    Log.d(TAG, "Init finished with status " + LoaderCallbackInterface.INSTALL_CANCELED);
-                                    Log.d(TAG, "Unbind from service");
-                                    mAppContext.unbindService(mServiceConnection);
-                                    Log.d(TAG, "Calling using callback");
-                                        mUserAppCallback.onManagerConnected(LoaderCallbackInterface.INSTALL_CANCELED);
-                                }
-                                public void wait_install() {
-                                    Log.d(TAG, "Waiting for current installation");
-                                    try
-                                    {
-                                        if (!mEngineService.installVersion(mOpenCVersion))
-                                        {
-                                            Log.d(TAG, "OpenCV package was not installed!");
-                                            Log.d(TAG, "Init finished with status " + LoaderCallbackInterface.MARKET_ERROR);
-                                            Log.d(TAG, "Calling using callback");
-                                            mUserAppCallback.onManagerConnected(LoaderCallbackInterface.MARKET_ERROR);
-                                        }
-                                        else
-                                        {
-                                            Log.d(TAG, "Waiting for package installation");
-                                        }
-
-                                        Log.d(TAG, "Unbind from service");
-                                        mAppContext.unbindService(mServiceConnection);
-
-                                    } catch (RemoteException e) {
-                                        e.printStackTrace();
-                                        Log.d(TAG, "Init finished with status " + LoaderCallbackInterface.INIT_FAILED);
-                                        Log.d(TAG, "Unbind from service");
-                                        mAppContext.unbindService(mServiceConnection);
-                                        Log.d(TAG, "Calling using callback");
-                                        mUserAppCallback.onManagerConnected(LoaderCallbackInterface.INIT_FAILED);
-                                    }
-                               }
-                            };
-
-                            mUserAppCallback.onPackageInstall(InstallCallbackInterface.INSTALLATION_PROGRESS, WaitQuery);
-                        }
-                        return;
-                    }
-                    else
-                    {
-                        Log.d(TAG, "Trying to get library list");
-                        mLibraryInstallationProgress = false;
-                        String libs = mEngineService.getLibraryList(mOpenCVersion);
-                        Log.d(TAG, "Library list: \"" + libs + "\"");
-                        Log.d(TAG, "First attempt to load libs");
-                        int status;
-                        if (initOpenCVLibs(path, libs))
-                        {
-                            Log.d(TAG, "First attempt to load libs is OK");
-                            String eol = System.getProperty("line.separator");
-                            for (String str : Core.getBuildInformation().split(eol))
-                                Log.i(TAG, str);
-
-                            status = LoaderCallbackInterface.SUCCESS;
-                        }
-                        else
-                        {
-                            Log.d(TAG, "First attempt to load libs fails");
-                            status = LoaderCallbackInterface.INIT_FAILED;
-                        }
-
-                        Log.d(TAG, "Init finished with status " + status);
-                        Log.d(TAG, "Unbind from service");
-                        mAppContext.unbindService(mServiceConnection);
-                        Log.d(TAG, "Calling using callback");
-                        mUserAppCallback.onManagerConnected(status);
-                    }
-                }
-                catch (RemoteException e)
-                {
-                    e.printStackTrace();
-                    Log.d(TAG, "Init finished with status " + LoaderCallbackInterface.INIT_FAILED);
-                    Log.d(TAG, "Unbind from service");
-                    mAppContext.unbindService(mServiceConnection);
-                    Log.d(TAG, "Calling using callback");
-                    mUserAppCallback.onManagerConnected(LoaderCallbackInterface.INIT_FAILED);
-                }
-            }
-        }
-
-        public void onServiceDisconnected(ComponentName className)
-        {
-            mEngineService = null;
-        }
-    };
-
-    private boolean loadLibrary(String AbsPath)
-    {
-        boolean result = true;
-
-        Log.d(TAG, "Trying to load library " + AbsPath);
-        try
-        {
-            System.load(AbsPath);
-            Log.d(TAG, "OpenCV libs init was ok!");
-        }
-        catch(UnsatisfiedLinkError e)
-        {
-            Log.d(TAG, "Cannot load library \"" + AbsPath + "\"");
-            e.printStackTrace();
-            result = false;
-        }
-
-        return result;
-    }
-
-    private boolean initOpenCVLibs(String Path, String Libs)
-    {
-        Log.d(TAG, "Trying to init OpenCV libs");
-        if ((null != Path) && (Path.length() != 0))
-        {
-            boolean result = true;
-            if ((null != Libs) && (Libs.length() != 0))
-            {
-                Log.d(TAG, "Trying to load libs by dependency list");
-                StringTokenizer splitter = new StringTokenizer(Libs, ";");
-                while(splitter.hasMoreTokens())
-                {
-                    String AbsLibraryPath = Path + File.separator + splitter.nextToken();
-                    result &= loadLibrary(AbsLibraryPath);
-                }
-            }
-            else
-            {
-                // If the dependencies list is not defined or empty.
-                String AbsLibraryPath = Path + File.separator + "libopencv_java4.so";
-                result = loadLibrary(AbsLibraryPath);
-            }
-
-            return result;
-        }
-        else
-        {
-            Log.d(TAG, "Library path \"" + Path + "\" is empty");
-            return false;
-        }
-    }
-}
diff --git a/modules/java/generator/android/java/org/opencv/android/BaseLoaderCallback.java b/modules/java/generator/android/java/org/opencv/android/BaseLoaderCallback.java
deleted file mode 100644
index 8ece66251438..000000000000
--- a/modules/java/generator/android/java/org/opencv/android/BaseLoaderCallback.java
+++ /dev/null
@@ -1,141 +0,0 @@
-package org.opencv.android;
-
-import android.app.Activity;
-import android.app.AlertDialog;
-import android.content.Context;
-import android.content.DialogInterface;
-import android.content.DialogInterface.OnClickListener;
-import android.util.Log;
-
-/**
- * Basic implementation of LoaderCallbackInterface.
- */
-public abstract class BaseLoaderCallback implements LoaderCallbackInterface {
-
-    public BaseLoaderCallback(Context AppContext) {
-        mAppContext = AppContext;
-    }
-
-    public void onManagerConnected(int status)
-    {
-        switch (status)
-        {
-            /** OpenCV initialization was successful. **/
-            case LoaderCallbackInterface.SUCCESS:
-            {
-                /** Application must override this method to handle successful library initialization. **/
-            } break;
-            /** OpenCV loader can not start Google Play Market. **/
-            case LoaderCallbackInterface.MARKET_ERROR:
-            {
-                Log.e(TAG, "Package installation failed!");
-                AlertDialog MarketErrorMessage = new AlertDialog.Builder(mAppContext).create();
-                MarketErrorMessage.setTitle("OpenCV Manager");
-                MarketErrorMessage.setMessage("Package installation failed!");
-                MarketErrorMessage.setCancelable(false); // This blocks the 'BACK' button
-                MarketErrorMessage.setButton(AlertDialog.BUTTON_POSITIVE, "OK", new OnClickListener() {
-                    public void onClick(DialogInterface dialog, int which) {
-                        finish();
-                    }
-                });
-                MarketErrorMessage.show();
-            } break;
-            /** Package installation has been canceled. **/
-            case LoaderCallbackInterface.INSTALL_CANCELED:
-            {
-                Log.d(TAG, "OpenCV library installation was canceled by user");
-                finish();
-            } break;
-            /** Application is incompatible with this version of OpenCV Manager. Possibly, a service update is required. **/
-            case LoaderCallbackInterface.INCOMPATIBLE_MANAGER_VERSION:
-            {
-                Log.d(TAG, "OpenCV Manager Service is uncompatible with this app!");
-                AlertDialog IncomatibilityMessage = new AlertDialog.Builder(mAppContext).create();
-                IncomatibilityMessage.setTitle("OpenCV Manager");
-                IncomatibilityMessage.setMessage("OpenCV Manager service is incompatible with this app. Try to update it via Google Play.");
-                IncomatibilityMessage.setCancelable(false); // This blocks the 'BACK' button
-                IncomatibilityMessage.setButton(AlertDialog.BUTTON_POSITIVE, "OK", new OnClickListener() {
-                    public void onClick(DialogInterface dialog, int which) {
-                        finish();
-                    }
-                });
-                IncomatibilityMessage.show();
-            } break;
-            /** Other status, i.e. INIT_FAILED. **/
-            default:
-            {
-                Log.e(TAG, "OpenCV loading failed!");
-                AlertDialog InitFailedDialog = new AlertDialog.Builder(mAppContext).create();
-                InitFailedDialog.setTitle("OpenCV error");
-                InitFailedDialog.setMessage("OpenCV was not initialised correctly. Application will be shut down");
-                InitFailedDialog.setCancelable(false); // This blocks the 'BACK' button
-                InitFailedDialog.setButton(AlertDialog.BUTTON_POSITIVE, "OK", new OnClickListener() {
-
-                    public void onClick(DialogInterface dialog, int which) {
-                        finish();
-                    }
-                });
-
-                InitFailedDialog.show();
-            } break;
-        }
-    }
-
-    public void onPackageInstall(final int operation, final InstallCallbackInterface callback)
-    {
-        switch (operation)
-        {
-            case InstallCallbackInterface.NEW_INSTALLATION:
-            {
-                AlertDialog InstallMessage = new AlertDialog.Builder(mAppContext).create();
-                InstallMessage.setTitle("Package not found");
-                InstallMessage.setMessage(callback.getPackageName() + " package was not found! Try to install it?");
-                InstallMessage.setCancelable(false); // This blocks the 'BACK' button
-                InstallMessage.setButton(AlertDialog.BUTTON_POSITIVE, "Yes", new OnClickListener()
-                {
-                    public void onClick(DialogInterface dialog, int which)
-                    {
-                        callback.install();
-                    }
-                });
-
-                InstallMessage.setButton(AlertDialog.BUTTON_NEGATIVE, "No", new OnClickListener() {
-
-                    public void onClick(DialogInterface dialog, int which)
-                    {
-                        callback.cancel();
-                    }
-                });
-
-                InstallMessage.show();
-            } break;
-            case InstallCallbackInterface.INSTALLATION_PROGRESS:
-            {
-                AlertDialog WaitMessage = new AlertDialog.Builder(mAppContext).create();
-                WaitMessage.setTitle("OpenCV is not ready");
-                WaitMessage.setMessage("Installation is in progress. Wait or exit?");
-                WaitMessage.setCancelable(false); // This blocks the 'BACK' button
-                WaitMessage.setButton(AlertDialog.BUTTON_POSITIVE, "Wait", new OnClickListener() {
-                    public void onClick(DialogInterface dialog, int which) {
-                        callback.wait_install();
-                    }
-                });
-                WaitMessage.setButton(AlertDialog.BUTTON_NEGATIVE, "Exit", new OnClickListener() {
-                    public void onClick(DialogInterface dialog, int which) {
-                        callback.cancel();
-                    }
-                });
-
-                WaitMessage.show();
-            } break;
-        }
-    }
-
-    void finish()
-    {
-        ((Activity) mAppContext).finish();
-    }
-
-    protected Context mAppContext;
-    private final static String TAG = "OCV/BaseLoaderCallback";
-}
diff --git a/modules/java/generator/android/java/org/opencv/android/CameraBridgeViewBase.java b/modules/java/generator/android/java/org/opencv/android/CameraBridgeViewBase.java
index 1993cf1407a0..4aa6a350f8a5 100644
--- a/modules/java/generator/android/java/org/opencv/android/CameraBridgeViewBase.java
+++ b/modules/java/generator/android/java/org/opencv/android/CameraBridgeViewBase.java
@@ -4,6 +4,7 @@
 
 import org.opencv.BuildConfig;
 import org.opencv.R;
+import org.opencv.core.Core;
 import org.opencv.core.Mat;
 import org.opencv.core.Size;
 
@@ -17,8 +18,10 @@
 import android.graphics.Rect;
 import android.util.AttributeSet;
 import android.util.Log;
+import android.view.Surface;
 import android.view.SurfaceHolder;
 import android.view.SurfaceView;
+import android.view.WindowManager;
 
 /**
  * This is a basic class, implementing the interaction with Camera and OpenCV library.
@@ -189,8 +192,93 @@ public interface CvCameraViewFrame {
          * This method returns single channel gray scale Mat with frame
          */
         public Mat gray();
+
+        public void release();
+    };
+
+    public class RotatedCameraFrame implements CvCameraViewFrame {
+        @Override
+        public Mat gray() {
+            if (mRotation != 0) {
+                Core.rotate(mFrame.gray(), mGrayRotated, getCvRotationCode(mRotation));
+                return mGrayRotated;
+            } else {
+                return mFrame.gray();
+            }
+        }
+
+        @Override
+        public Mat rgba() {
+            if (mRotation != 0) {
+                Core.rotate(mFrame.rgba(), mRgbaRotated, getCvRotationCode(mRotation));
+                return mRgbaRotated;
+            } else {
+                return mFrame.rgba();
+            }
+        }
+
+        private int getCvRotationCode(int degrees) {
+            if  (degrees == 90) {
+                return Core.ROTATE_90_CLOCKWISE;
+            } else if (degrees == 180) {
+                return Core.ROTATE_180;
+            } else {
+                return Core.ROTATE_90_COUNTERCLOCKWISE;
+            }
+        }
+
+        public RotatedCameraFrame(CvCameraViewFrame frame, int rotation) {
+            super();
+            mFrame = frame;
+            mRgbaRotated = new Mat();
+            mGrayRotated = new Mat();
+            mRotation = rotation;
+        }
+
+        @Override
+        public void release() {
+            mRgbaRotated.release();
+            mGrayRotated.release();
+        }
+
+        public CvCameraViewFrame mFrame;
+        private Mat mRgbaRotated;
+        private Mat mGrayRotated;
+        private int mRotation;
     };
 
+    /**
+     * Calculates how to rotate camera frame to match current screen orientation
+     */
+    protected int getFrameRotation(boolean cameraFacingFront, int cameraSensorOrientation) {
+        WindowManager windowManager = (WindowManager) getContext().getSystemService(Context.WINDOW_SERVICE);
+        int screenOrientation = windowManager.getDefaultDisplay().getRotation();
+        int screenRotation = 0;
+        switch (screenOrientation) {
+            case Surface.ROTATION_0:
+                screenRotation = 0;
+                break;
+            case Surface.ROTATION_90:
+                screenRotation = 90;
+                break;
+            case Surface.ROTATION_180:
+                screenRotation = 180;
+                break;
+            case Surface.ROTATION_270:
+                screenRotation = 270;
+                break;
+        }
+
+        int frameRotation;
+        if (cameraFacingFront) {
+            frameRotation = (cameraSensorOrientation + screenRotation) % 360;
+        } else {
+            frameRotation = (cameraSensorOrientation - screenRotation + 360) % 360;
+        }
+
+        return frameRotation;
+    }
+
     public void surfaceChanged(SurfaceHolder arg0, int arg1, int arg2, int arg3) {
         Log.d(TAG, "call surfaceChanged event");
         synchronized(mSyncObject) {
diff --git a/modules/java/generator/android/java/org/opencv/android/FpsMeter.java b/modules/java/generator/android/java/org/opencv/android/FpsMeter.java
index d22c68e41539..5727ead3f8f8 100644
--- a/modules/java/generator/android/java/org/opencv/android/FpsMeter.java
+++ b/modules/java/generator/android/java/org/opencv/android/FpsMeter.java
@@ -54,6 +54,7 @@ public void measure() {
     }
 
     public void setResolution(int width, int height) {
+        Log.d(TAG, "FpsMeter.setResolution " + Integer.valueOf(mWidth) + "x" + Integer.valueOf(mHeight));
         mWidth = width;
         mHeight = height;
     }
diff --git a/modules/java/generator/android/java/org/opencv/android/InstallCallbackInterface.java b/modules/java/generator/android/java/org/opencv/android/InstallCallbackInterface.java
deleted file mode 100644
index f68027a7ba26..000000000000
--- a/modules/java/generator/android/java/org/opencv/android/InstallCallbackInterface.java
+++ /dev/null
@@ -1,34 +0,0 @@
-package org.opencv.android;
-
-/**
- * Installation callback interface.
- */
-public interface InstallCallbackInterface
-{
-    /**
-     * New package installation is required.
-     */
-    static final int NEW_INSTALLATION = 0;
-    /**
-     * Current package installation is in progress.
-     */
-    static final int INSTALLATION_PROGRESS = 1;
-
-    /**
-     * Target package name.
-     * @return Return target package name.
-     */
-    public String getPackageName();
-    /**
-     * Installation is approved.
-     */
-    public void install();
-    /**
-     * Installation is canceled.
-     */
-    public void cancel();
-    /**
-     * Wait for package installation.
-     */
-    public void wait_install();
-};
diff --git a/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java b/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java
index a7c72e43f00d..b76f186101b6 100644
--- a/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java
+++ b/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java
@@ -10,9 +10,12 @@
 import android.os.Build;
 import android.util.AttributeSet;
 import android.util.Log;
+import android.view.Surface;
 import android.view.ViewGroup.LayoutParams;
+import android.view.WindowManager;
 
 import org.opencv.BuildConfig;
+import org.opencv.core.Core;
 import org.opencv.core.CvType;
 import org.opencv.core.Mat;
 import org.opencv.core.Size;
@@ -39,7 +42,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
     private boolean mStopThread;
 
     protected Camera mCamera;
-    protected JavaCameraFrame[] mCameraFrame;
+    protected RotatedCameraFrame[] mCameraFrame;
     private SurfaceTexture mSurfaceTexture;
     private int mPreviewFormat = ImageFormat.NV21;
 
@@ -71,28 +74,20 @@ protected boolean initializeCamera(int width, int height) {
         boolean result = true;
         synchronized (this) {
             mCamera = null;
+            int cameraId = -1;
 
             if (mCameraIndex == CAMERA_ID_ANY) {
-                Log.d(TAG, "Trying to open camera with old open()");
-                try {
-                    mCamera = Camera.open();
-                }
-                catch (Exception e){
-                    Log.e(TAG, "Camera is not available (in use or does not exist): " + e.getLocalizedMessage());
-                }
-
-                if(mCamera == null && Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) {
-                    boolean connected = false;
-                    for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
-                        Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(camIdx) + ")");
-                        try {
-                            mCamera = Camera.open(camIdx);
-                            connected = true;
-                        } catch (RuntimeException e) {
-                            Log.e(TAG, "Camera #" + camIdx + "failed to open: " + e.getLocalizedMessage());
-                        }
-                        if (connected) break;
+                boolean connected = false;
+                for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
+                    Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(camIdx) + ")");
+                    try {
+                        mCamera = Camera.open(camIdx);
+                        connected = true;
+                        cameraId = camIdx;
+                    } catch (RuntimeException e) {
+                        Log.e(TAG, "Camera #" + camIdx + "failed to open: " + e.getLocalizedMessage());
                     }
+                    if (connected) break;
                 }
             } else {
                 if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) {
@@ -126,6 +121,7 @@ protected boolean initializeCamera(int width, int height) {
                         Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(localCameraIndex) + ")");
                         try {
                             mCamera = Camera.open(localCameraIndex);
+                            cameraId = localCameraIndex;
                         } catch (RuntimeException e) {
                             Log.e(TAG, "Camera #" + localCameraIndex + "failed to open: " + e.getLocalizedMessage());
                         }
@@ -136,6 +132,11 @@ protected boolean initializeCamera(int width, int height) {
             if (mCamera == null)
                 return false;
 
+            android.hardware.Camera.CameraInfo info = new android.hardware.Camera.CameraInfo();
+            android.hardware.Camera.getCameraInfo(cameraId, info);
+            int frameRotation = getFrameRotation(
+                    info.facing == Camera.CameraInfo.CAMERA_FACING_FRONT,
+                    info.orientation);
             /* Now set camera parameters */
             try {
                 Camera.Parameters params = mCamera.getParameters();
@@ -176,8 +177,16 @@ protected boolean initializeCamera(int width, int height) {
                     mCamera.setParameters(params);
                     params = mCamera.getParameters();
 
-                    mFrameWidth = params.getPreviewSize().width;
-                    mFrameHeight = params.getPreviewSize().height;
+                    int rawFrameWidth = params.getPreviewSize().width;
+                    int rawFrameHeight = params.getPreviewSize().height;
+
+                    if (frameRotation % 180 == 0) {
+                        mFrameWidth = params.getPreviewSize().width;
+                        mFrameHeight = params.getPreviewSize().height;
+                    } else {
+                        mFrameWidth = params.getPreviewSize().height;
+                        mFrameHeight = params.getPreviewSize().width;
+                    }
 
                     if ((getLayoutParams().width == LayoutParams.MATCH_PARENT) && (getLayoutParams().height == LayoutParams.MATCH_PARENT))
                         mScale = Math.min(((float)height)/mFrameHeight, ((float)width)/mFrameWidth);
@@ -196,14 +205,14 @@ protected boolean initializeCamera(int width, int height) {
                     mCamera.setPreviewCallbackWithBuffer(this);
 
                     mFrameChain = new Mat[2];
-                    mFrameChain[0] = new Mat(mFrameHeight + (mFrameHeight/2), mFrameWidth, CvType.CV_8UC1);
-                    mFrameChain[1] = new Mat(mFrameHeight + (mFrameHeight/2), mFrameWidth, CvType.CV_8UC1);
+                    mFrameChain[0] = new Mat(rawFrameHeight + (rawFrameHeight/2), rawFrameWidth, CvType.CV_8UC1);
+                    mFrameChain[1] = new Mat(rawFrameHeight + (rawFrameHeight/2), rawFrameWidth, CvType.CV_8UC1);
 
                     AllocateCache();
 
-                    mCameraFrame = new JavaCameraFrame[2];
-                    mCameraFrame[0] = new JavaCameraFrame(mFrameChain[0], mFrameWidth, mFrameHeight);
-                    mCameraFrame[1] = new JavaCameraFrame(mFrameChain[1], mFrameWidth, mFrameHeight);
+                    mCameraFrame = new RotatedCameraFrame[2];
+                    mCameraFrame[0] = new RotatedCameraFrame(new JavaCameraFrame(mFrameChain[0], rawFrameWidth, rawFrameHeight), frameRotation);
+                    mCameraFrame[1] = new RotatedCameraFrame(new JavaCameraFrame(mFrameChain[1], rawFrameWidth, rawFrameHeight), frameRotation);
 
                     if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.HONEYCOMB) {
                         mSurfaceTexture = new SurfaceTexture(MAGIC_TEXTURE_ID);
@@ -240,7 +249,9 @@ protected void releaseCamera() {
                 mFrameChain[1].release();
             }
             if (mCameraFrame != null) {
+                mCameraFrame[0].mFrame.release();
                 mCameraFrame[0].release();
+                mCameraFrame[1].mFrame.release();
                 mCameraFrame[1].release();
             }
         }
@@ -336,6 +347,7 @@ public JavaCameraFrame(Mat Yuv420sp, int width, int height) {
             mRgba = new Mat();
         }
 
+        @Override
         public void release() {
             mRgba.release();
         }
diff --git a/modules/java/generator/android/java/org/opencv/android/LoaderCallbackInterface.java b/modules/java/generator/android/java/org/opencv/android/LoaderCallbackInterface.java
deleted file mode 100644
index a941e8377b6b..000000000000
--- a/modules/java/generator/android/java/org/opencv/android/LoaderCallbackInterface.java
+++ /dev/null
@@ -1,40 +0,0 @@
-package org.opencv.android;
-
-/**
- * Interface for callback object in case of asynchronous initialization of OpenCV.
- */
-public interface LoaderCallbackInterface
-{
-    /**
-     * OpenCV initialization finished successfully.
-     */
-    static final int SUCCESS = 0;
-    /**
-     * Google Play Market cannot be invoked.
-     */
-    static final int MARKET_ERROR = 2;
-    /**
-     * OpenCV library installation has been canceled by the user.
-     */
-    static final int INSTALL_CANCELED = 3;
-    /**
-     * This version of OpenCV Manager Service is incompatible with the app. Possibly, a service update is required.
-     */
-    static final int INCOMPATIBLE_MANAGER_VERSION = 4;
-    /**
-     * OpenCV library initialization has failed.
-     */
-    static final int INIT_FAILED = 0xff;
-
-    /**
-     * Callback method, called after OpenCV library initialization.
-     * @param status status of initialization (see initialization status constants).
-     */
-    public void onManagerConnected(int status);
-
-    /**
-     * Callback method, called in case the package installation is needed.
-     * @param callback answer object with approve and cancel methods and the package description.
-     */
-    public void onPackageInstall(final int operation, InstallCallbackInterface callback);
-};
diff --git a/modules/java/generator/android/java/org/opencv/android/OpenCVLoader.java.in b/modules/java/generator/android/java/org/opencv/android/OpenCVLoader.java.in
index 625c3daf27cf..91cc534e84f4 100644
--- a/modules/java/generator/android/java/org/opencv/android/OpenCVLoader.java.in
+++ b/modules/java/generator/android/java/org/opencv/android/OpenCVLoader.java.in
@@ -7,91 +7,6 @@ import android.content.Context;
  */
 public class OpenCVLoader
 {
-    /**
-     * OpenCV Library version 2.4.2.
-     */
-    public static final String OPENCV_VERSION_2_4_2 = "2.4.2";
-
-    /**
-     * OpenCV Library version 2.4.3.
-     */
-    public static final String OPENCV_VERSION_2_4_3 = "2.4.3";
-
-    /**
-     * OpenCV Library version 2.4.4.
-     */
-    public static final String OPENCV_VERSION_2_4_4 = "2.4.4";
-
-    /**
-     * OpenCV Library version 2.4.5.
-     */
-    public static final String OPENCV_VERSION_2_4_5 = "2.4.5";
-
-    /**
-     * OpenCV Library version 2.4.6.
-     */
-    public static final String OPENCV_VERSION_2_4_6 = "2.4.6";
-
-    /**
-     * OpenCV Library version 2.4.7.
-     */
-    public static final String OPENCV_VERSION_2_4_7 = "2.4.7";
-
-    /**
-     * OpenCV Library version 2.4.8.
-     */
-    public static final String OPENCV_VERSION_2_4_8 = "2.4.8";
-
-    /**
-     * OpenCV Library version 2.4.9.
-     */
-    public static final String OPENCV_VERSION_2_4_9 = "2.4.9";
-
-    /**
-     * OpenCV Library version 2.4.10.
-     */
-    public static final String OPENCV_VERSION_2_4_10 = "2.4.10";
-
-    /**
-     * OpenCV Library version 2.4.11.
-     */
-    public static final String OPENCV_VERSION_2_4_11 = "2.4.11";
-
-    /**
-     * OpenCV Library version 2.4.12.
-     */
-    public static final String OPENCV_VERSION_2_4_12 = "2.4.12";
-
-    /**
-     * OpenCV Library version 2.4.13.
-     */
-    public static final String OPENCV_VERSION_2_4_13 = "2.4.13";
-
-    /**
-     * OpenCV Library version 3.0.0.
-     */
-    public static final String OPENCV_VERSION_3_0_0 = "3.0.0";
-
-    /**
-     * OpenCV Library version 3.1.0.
-     */
-    public static final String OPENCV_VERSION_3_1_0 = "3.1.0";
-
-    /**
-     * OpenCV Library version 3.2.0.
-     */
-    public static final String OPENCV_VERSION_3_2_0 = "3.2.0";
-
-    /**
-     * OpenCV Library version 3.3.0.
-     */
-    public static final String OPENCV_VERSION_3_3_0 = "3.3.0";
-
-    /**
-     * OpenCV Library version 3.4.0.
-     */
-    public static final String OPENCV_VERSION_3_4_0 = "3.4.0";
-
     /**
      * Current OpenCV Library version
      */
@@ -99,9 +14,9 @@ public class OpenCVLoader
 
 
     /**
-     * Loads and initializes OpenCV library from current application package. Roughly, it's an analog of system.loadLibrary("opencv_java").
-     * @return Returns true is initialization of OpenCV was successful.
+     * Synonym for initLocal. Deprecated.
      */
+    @Deprecated
     public static boolean initDebug()
     {
         return StaticHelper.initOpenCV(false);
@@ -109,24 +24,21 @@ public class OpenCVLoader
 
     /**
      * Loads and initializes OpenCV library from current application package. Roughly, it's an analog of system.loadLibrary("opencv_java").
-     * @param InitCuda load and initialize CUDA runtime libraries.
      * @return Returns true is initialization of OpenCV was successful.
      */
-    public static boolean initDebug(boolean InitCuda)
+    public static boolean initLocal()
     {
-        return StaticHelper.initOpenCV(InitCuda);
+        return StaticHelper.initOpenCV(false);
     }
 
     /**
-     * Loads and initializes OpenCV library using OpenCV Engine service.
-     * @param Version OpenCV library version.
-     * @param AppContext application context for connecting to the service.
-     * @param Callback object, that implements LoaderCallbackInterface for handling the connection status.
-     * @return Returns true if initialization of OpenCV is successful.
+     * Loads and initializes OpenCV library from current application package. Roughly, it's an analog of system.loadLibrary("opencv_java").
+     * @param InitCuda load and initialize CUDA runtime libraries.
+     * @return Returns true is initialization of OpenCV was successful.
      */
-    public static boolean initAsync(String Version, Context AppContext,
-            LoaderCallbackInterface Callback)
+    @Deprecated
+    public static boolean initDebug(boolean InitCuda)
     {
-        return AsyncServiceHelper.initOpenCV(Version, AppContext, Callback);
+        return StaticHelper.initOpenCV(InitCuda);
     }
 }
diff --git a/modules/java/generator/android/java/org/opencv/android/StaticHelper.java b/modules/java/generator/android/java/org/opencv/android/StaticHelper.java
index 934dd7570c17..a346cf9633e2 100644
--- a/modules/java/generator/android/java/org/opencv/android/StaticHelper.java
+++ b/modules/java/generator/android/java/org/opencv/android/StaticHelper.java
@@ -13,30 +13,10 @@ public static boolean initOpenCV(boolean InitCuda)
         String libs = "";
 
         if(InitCuda)
-        {
-            loadLibrary("cudart");
-            loadLibrary("nppc");
-            loadLibrary("nppi");
-            loadLibrary("npps");
-            loadLibrary("cufft");
-            loadLibrary("cublas");
-        }
-
-        Log.d(TAG, "Trying to get library list");
-
-        try
-        {
-            System.loadLibrary("opencv_info");
-            libs = getLibraryList();
-        }
-        catch(UnsatisfiedLinkError e)
-        {
-            Log.e(TAG, "OpenCV error: Cannot load info library for OpenCV");
-        }
+            Log.w(TAG, "CUDA support was removed!");
 
-        Log.d(TAG, "Library list: \"" + libs + "\"");
         Log.d(TAG, "First attempt to load libs");
-        if (initOpenCVLibs(libs))
+        if (loadLibrary("opencv_java4"))
         {
             Log.d(TAG, "First attempt to load libs is OK");
             String eol = System.getProperty("line.separator");
@@ -74,30 +54,6 @@ private static boolean loadLibrary(String Name)
         return result;
     }
 
-    private static boolean initOpenCVLibs(String Libs)
-    {
-        Log.d(TAG, "Trying to init OpenCV libs");
-
-        boolean result = true;
-
-        if ((null != Libs) && (Libs.length() != 0))
-        {
-            Log.d(TAG, "Trying to load libs by dependency list");
-            StringTokenizer splitter = new StringTokenizer(Libs, ";");
-            while(splitter.hasMoreTokens())
-            {
-                result &= loadLibrary(splitter.nextToken());
-            }
-        }
-        else
-        {
-            // If dependencies list is not defined or empty.
-            result = loadLibrary("opencv_java4");
-        }
-
-        return result;
-    }
-
     private static final String TAG = "OpenCV/StaticHelper";
 
     private static native String getLibraryList();
diff --git a/modules/java/generator/android/java/org/opencv/engine/OpenCVEngineInterface.aidl b/modules/java/generator/android/java/org/opencv/engine/OpenCVEngineInterface.aidl
deleted file mode 100644
index 21fe5f716ba2..000000000000
--- a/modules/java/generator/android/java/org/opencv/engine/OpenCVEngineInterface.aidl
+++ /dev/null
@@ -1,33 +0,0 @@
-package org.opencv.engine;
-
-/**
-* Class provides a Java interface for OpenCV Engine Service. It's synchronous with native OpenCVEngine class.
-*/
-interface OpenCVEngineInterface
-{
-    /**
-    * @return Returns service version.
-    */
-    int getEngineVersion();
-
-    /**
-    * Finds an installed OpenCV library.
-    * @param OpenCV version.
-    * @return Returns path to OpenCV native libs or an empty string if OpenCV can not be found.
-    */
-    String getLibPathByVersion(String version);
-
-    /**
-    * Tries to install defined version of OpenCV from Google Play Market.
-    * @param OpenCV version.
-    * @return Returns true if installation was successful or OpenCV package has been already installed.
-    */
-    boolean installVersion(String version);
-
-    /**
-    * Returns list of libraries in loading order, separated by semicolon.
-    * @param OpenCV version.
-    * @return Returns names of OpenCV libraries, separated by semicolon.
-    */
-    String getLibraryList(String version);
-}
diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py
index 6f07ad913dc3..88523b2c3158 100755
--- a/modules/java/generator/gen_java.py
+++ b/modules/java/generator/gen_java.py
@@ -686,7 +686,7 @@ def gen_func(self, ci, fi, prop_name=''):
             msg = "// Return type '%s' is not supported, skipping the function\n\n" % fi.ctype
             self.skipped_func_list.append(c_decl + "\n" + msg)
             j_code.write( " "*4 + msg )
-            logging.warning("SKIP:" + c_decl.strip() + "\t due to RET type " + fi.ctype)
+            logging.info("SKIP:" + c_decl.strip() + "\t due to RET type " + fi.ctype)
             return
         for a in fi.args:
             if a.ctype not in type_dict:
@@ -698,7 +698,7 @@ def gen_func(self, ci, fi, prop_name=''):
                 msg = "// Unknown type '%s' (%s), skipping the function\n\n" % (a.ctype, a.out or "I")
                 self.skipped_func_list.append(c_decl + "\n" + msg)
                 j_code.write( " "*4 + msg )
-                logging.warning("SKIP:" + c_decl.strip() + "\t due to ARG type " + a.ctype + "/" + (a.out or "I"))
+                logging.info("SKIP:" + c_decl.strip() + "\t due to ARG type " + a.ctype + "/" + (a.out or "I"))
                 return
 
         self.ported_func_list.append(c_decl)
@@ -1240,13 +1240,13 @@ def finalize(self, output_jni_path):
 def copy_java_files(java_files_dir, java_base_path, default_package_path='org/opencv/'):
     global total_files, updated_files
     java_files = []
-    re_filter = re.compile(r'^.+\.(java|aidl|kt)(.in)?$')
+    re_filter = re.compile(r'^.+\.(java|kt)(.in)?$')
     for root, dirnames, filenames in os.walk(java_files_dir):
        java_files += [os.path.join(root, filename) for filename in filenames if re_filter.match(filename)]
     java_files = [f.replace('\\', '/') for f in java_files]
 
     re_package = re.compile(r'^package +(.+);')
-    re_prefix = re.compile(r'^.+[\+/]([^\+]+).(java|aidl|kt)(.in)?$')
+    re_prefix = re.compile(r'^.+[\+/]([^\+]+).(java|kt)(.in)?$')
     for java_file in java_files:
         src = checkFileRemap(java_file)
         with open(src, 'r') as f:
@@ -1419,7 +1419,8 @@ def sanitize_java_documentation_string(doc, type):
     java_base_path = os.path.join(dstdir, 'java'); mkdir_p(java_base_path)
     java_test_base_path = os.path.join(dstdir, 'test'); mkdir_p(java_test_base_path)
 
-    for (subdir, target_subdir) in [('src/java', 'java'), ('android/java', None), ('android-21/java', None)]:
+    for (subdir, target_subdir) in [('src/java', 'java'), ('android/java', None),
+                                    ('android-21/java', None), ('android-24/java', None)]:
         if target_subdir is None:
             target_subdir = subdir
         java_files_dir = os.path.join(SCRIPT_DIR, subdir)
diff --git a/modules/java/jar/build.xml.in b/modules/java/jar/build.xml.in
index 732b39857663..41ef1d55bae7 100644
--- a/modules/java/jar/build.xml.in
+++ b/modules/java/jar/build.xml.in
@@ -45,18 +45,23 @@
       @CMAKE_CONFIG_OPENCV_JAVADOC_LINK@
       additionalparam="--allow-script-in-comments"
       >
-      <Header>
+      <bottom>
          <![CDATA[
             <script>
               var url = window.location.href;
               var pos = url.lastIndexOf('/javadoc/');
               url = pos >= 0 ? (url.substring(0, pos) + '/javadoc/mymath.js') : (window.location.origin + '/mymath.js');
               var script = document.createElement('script');
-              script.src = '@OPENCV_MATHJAX_RELPATH@/MathJax.js?config=TeX-AMS-MML_HTMLorMML,' + url;
-              document.getElementsByTagName('head')[0].appendChild(script);
+              script.setAttribute("src", url);
+              script.setAttribute("defer", "")
+              document.head.appendChild(script);
+              script = document.createElement('script');
+              script.setAttribute("src", '@OPENCV_MATHJAX_RELPATH@/es5/tex-chtml.js');
+              script.setAttribute("defer", "")
+              document.head.appendChild(script);
             </script>
          ]]>
-      </Header>
+      </bottom>
     </javadoc>
   </target>
 
diff --git a/modules/java/test/android_test/.classpath b/modules/java/test/android_test/.classpath
deleted file mode 100644
index 9e004b007224..000000000000
--- a/modules/java/test/android_test/.classpath
+++ /dev/null
@@ -1,9 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<classpath>
-	<classpathentry kind="src" path="src"/>
-	<classpathentry kind="src" path="gen"/>
-	<classpathentry kind="con" path="com.android.ide.eclipse.adt.ANDROID_FRAMEWORK"/>
-	<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
-	<classpathentry kind="con" path="com.android.ide.eclipse.adt.LIBRARIES"/>
-	<classpathentry kind="output" path="bin/classes"/>
-</classpath>
diff --git a/modules/java/test/android_test/.project b/modules/java/test/android_test/.project
deleted file mode 100644
index 44ab01462bbb..000000000000
--- a/modules/java/test/android_test/.project
+++ /dev/null
@@ -1,33 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
-	<name>OpenCV_JavaAPI_Tests</name>
-	<comment></comment>
-	<projects>
-	</projects>
-	<buildSpec>
-		<buildCommand>
-			<name>com.android.ide.eclipse.adt.ResourceManagerBuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
-		<buildCommand>
-			<name>com.android.ide.eclipse.adt.PreCompilerBuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
-		<buildCommand>
-			<name>org.eclipse.jdt.core.javabuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
-		<buildCommand>
-			<name>com.android.ide.eclipse.adt.ApkBuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
-	</buildSpec>
-	<natures>
-		<nature>com.android.ide.eclipse.adt.AndroidNature</nature>
-		<nature>org.eclipse.jdt.core.javanature</nature>
-	</natures>
-</projectDescription>
diff --git a/modules/java/test/android_test/AndroidManifest.xml b/modules/java/test/android_test/AndroidManifest.xml
deleted file mode 100644
index 74508ad99106..000000000000
--- a/modules/java/test/android_test/AndroidManifest.xml
+++ /dev/null
@@ -1,28 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-      package="org.opencv.test"
-      android:versionCode="1"
-      android:versionName="1.0">
-
-    <uses-sdk android:minSdkVersion="8" />
-
-    <!-- We add an application tag here just so that we can indicate that
-         this package needs to link against the android.test library,
-         which is needed when building test cases. -->
-    <application>
-        <uses-library android:name="android.test.runner" />
-    </application>
-    <!--
-    This declares that this application uses the instrumentation test runner targeting
-    the package of org.opencv.  To run the tests use the command:
-    "adb shell am instrument -w org.opencv.test/android.test.InstrumentationTestRunner"
-    -->
-    <instrumentation android:name="org.opencv.test.OpenCVTestRunner"
-                     android:targetPackage="org.opencv.test"
-                     android:label="Tests for org.opencv"/>
-
-    <uses-permission android:name="android.permission.CAMERA"/>
-    <uses-feature android:name="android.hardware.camera" />
-    <uses-feature android:name="android.hardware.camera.autofocus" />
-    <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE"/>
-</manifest>
diff --git a/modules/java/test/android_test/CMakeLists.txt b/modules/java/test/android_test/CMakeLists.txt
index b6e727f50ffd..c7f254e8501c 100644
--- a/modules/java/test/android_test/CMakeLists.txt
+++ b/modules/java/test/android_test/CMakeLists.txt
@@ -1,77 +1,25 @@
-if(NOT ANT_EXECUTABLE OR NOT ANDROID_EXECUTABLE OR NOT ANDROID_TOOLS_Pkg_Revision GREATER 13)
-  return()
-endif()
+project(opencv_test_android)
 
-project(opencv_test_java)
+set(OPENCV_ANDROID_TEST_DIR "${OpenCV_BINARY_DIR}/android_test" CACHE INTERNAL "")
+file(REMOVE_RECURSE "${OPENCV_ANDROID_TEST_DIR}")
 
-set(OPENCV_JAVA_TEST_DIR "${OpenCV_BINARY_DIR}/android_test" CACHE INTERNAL "")
-file(REMOVE_RECURSE "${OPENCV_JAVA_TEST_DIR}")
-file(REMOVE "${OPENCV_DEPHELPER}/${the_module}_test_source_copy")
+set(ANDROID_TESTS_SRC_DIRS
+"'${CMAKE_CURRENT_SOURCE_DIR}/src', \
+'${OpenCV_SOURCE_DIR}/modules/java/test/common_test/src', \
+'${CMAKE_BINARY_DIR}/modules/java_bindings_generator/gen/test'" CACHE INTERNAL "")
 
-set(test_dir "${CMAKE_CURRENT_SOURCE_DIR}")
+set(ANDROID_TESTS_RES_DIR "'${OpenCV_SOURCE_DIR}/modules/java/test/common_test/res'" CACHE INTERNAL "")
 
-set(depends "")
-
-# 1. gather and copy common test files (resources, utils, etc.)
-copy_common_tests("${CMAKE_CURRENT_SOURCE_DIR}/../common_test" "${OPENCV_JAVA_TEST_DIR}" depends)
-
-# 2. gather and copy tests from each module
-ocv_copyfiles_append_dir(JAVA_TEST_SRC_COPY "${OPENCV_JAVA_BINDINGS_DIR}/gen/test" "${OPENCV_JAVA_TEST_DIR}/src")
-
-list(APPEND depends gen_opencv_java_source "${OPENCV_DEPHELPER}/gen_opencv_java_source")
-ocv_copyfiles_add_target(${the_module}_test_source_copy JAVA_TEST_SRC_COPY "Copy Java(Android test) source files" ${depends})
-set(depends ${the_module}_test_source_copy "${OPENCV_DEPHELPER}/${the_module}_test_source_copy")
-
-# 3. gather and copy specific files for Android
-file(GLOB_RECURSE test_files RELATIVE "${test_dir}" "${test_dir}/res/*" "${test_dir}/src/*")
-foreach(f ${test_files} ${ANDROID_MANIFEST_FILE} ".classpath" ".project")
-  add_custom_command(
-      OUTPUT "${OPENCV_JAVA_TEST_DIR}/${f}"
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different "${test_dir}/${f}" "${OPENCV_JAVA_TEST_DIR}/${f}"
-      MAIN_DEPENDENCY "${test_dir}/${f}"
-      COMMENT "Copying ${f}")
-  list(APPEND depends "${test_dir}/${f}" "${OPENCV_JAVA_TEST_DIR}/${f}")
+list(APPEND TEST_PROJECT_FILES "CMakeLists.txt" "gradle.properties" "settings.gradle")
+foreach(TEST_PROJECT_FILE ${TEST_PROJECT_FILES})
+    file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/${TEST_PROJECT_FILE}" DESTINATION "${OPENCV_ANDROID_TEST_DIR}")
 endforeach()
+file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/tests_module/AndroidManifest.xml" DESTINATION "${OPENCV_ANDROID_TEST_DIR}/tests_module")
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/tests_module/build.gradle.in" "${OPENCV_ANDROID_TEST_DIR}/tests_module/build.gradle" @ONLY)
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/build.gradle.in" "${OPENCV_ANDROID_TEST_DIR}/build.gradle" @ONLY)
 
-# fix Android project
-set(android_proj_target_files ${ANDROID_PROJECT_FILES})
-ocv_list_add_prefix(android_proj_target_files "${OPENCV_JAVA_TEST_DIR}/")
-file(RELATIVE_PATH __dep "${OPENCV_JAVA_TEST_DIR}" "${OpenCV_BINARY_DIR}/android_sdk")
-
-add_custom_command(
-    OUTPUT ${android_proj_target_files}
-    COMMAND ${CMAKE_COMMAND} -E remove ${android_proj_target_files}
-    COMMAND ${ANDROID_EXECUTABLE} --silent update test-project --path "${OPENCV_JAVA_TEST_DIR}" --main "${OpenCV_BINARY_DIR}/android_sdk"
-    COMMAND ${ANDROID_EXECUTABLE} --silent update project --path "${OPENCV_JAVA_TEST_DIR}" --library "${__dep}"
-    MAIN_DEPENDENCY "${OPENCV_JAVA_TEST_DIR}/${ANDROID_MANIFEST_FILE}"
-    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${ANDROID_MANIFEST_FILE}"
-    COMMENT "Updating Android Java API test project")
-
-list(APPEND depends ${android_proj_target_files})
-
-# build java part
-add_custom_command(
-    OUTPUT "${OPENCV_JAVA_TEST_DIR}/bin/OpenCVTest-debug.apk"
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different "$<TARGET_FILE:opencv_java>" "${OPENCV_JAVA_TEST_DIR}/libs/${ANDROID_NDK_ABI_NAME}/$<TARGET_FILE_NAME:opencv_java>"
-    COMMAND ${ANT_EXECUTABLE} -q -noinput -k debug -Djava.target=1.6 -Djava.source=1.6
-    COMMAND ${CMAKE_COMMAND} -E touch "${OPENCV_JAVA_TEST_DIR}/bin/OpenCVTest-debug.apk" # needed because ant does not update the timestamp of updated apk
-    WORKING_DIRECTORY "${OPENCV_JAVA_TEST_DIR}"
-    MAIN_DEPENDENCY "${OPENCV_JAVA_TEST_DIR}/${ANDROID_MANIFEST_FILE}"
-    DEPENDS opencv_java_android opencv_java
-    DEPENDS ${depends})
-
-add_custom_target(${PROJECT_NAME} ALL SOURCES "${OPENCV_JAVA_TEST_DIR}/bin/OpenCVTest-debug.apk" "${CMAKE_CURRENT_SOURCE_DIR}/${ANDROID_MANIFEST_FILE}")
-add_dependencies(${PROJECT_NAME} opencv_java ${__android_project_chain})
-set(__android_project_chain ${PROJECT_NAME} CACHE INTERNAL "auxiliary variable used for Android progects chaining" FORCE)
-
-# put the final .apk to the OpenCV's bin folder
-add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different "${OPENCV_JAVA_TEST_DIR}/bin/OpenCVTest-debug.apk" "${OpenCV_BINARY_DIR}/bin/${PROJECT_NAME}.apk")
-
-add_dependencies(opencv_tests ${PROJECT_NAME})
+file(COPY "${OpenCV_SOURCE_DIR}/platforms/android/gradle-wrapper/gradlew" DESTINATION "${OPENCV_ANDROID_TEST_DIR}")
+file(COPY "${OpenCV_SOURCE_DIR}/platforms/android/gradle-wrapper/gradlew.bat" DESTINATION "${OPENCV_ANDROID_TEST_DIR}")
+file(COPY "${OpenCV_SOURCE_DIR}/platforms/android/gradle-wrapper/gradle/wrapper/gradle-wrapper.jar" DESTINATION "${OPENCV_ANDROID_TEST_DIR}/gradle/wrapper")
 
-if(PYTHON_DEFAULT_AVAILABLE)
-  set(CHECK_TEST_COVERAGE "${OPENCV_MODULE_opencv_java_LOCATION}/check-tests.py")
-  add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
-      COMMAND ${PYTHON_DEFAULT_EXECUTABLE} ${CHECK_TEST_COVERAGE} "${OPENCV_JAVA_TEST_DIR}/src" "${OPENCV_ANDROID_LIB_DIR}/src" > "${CMAKE_CURRENT_BINARY_DIR}/tests_coverage.log"
-      )
-endif()
+configure_file("${OpenCV_SOURCE_DIR}/platforms/android/gradle-wrapper/gradle/wrapper/gradle-wrapper.properties.in" "${OPENCV_ANDROID_TEST_DIR}/gradle/wrapper/gradle-wrapper.properties" @ONLY)
diff --git a/modules/java/test/android_test/build.gradle.in b/modules/java/test/android_test/build.gradle.in
new file mode 100644
index 000000000000..812268f0ab73
--- /dev/null
+++ b/modules/java/test/android_test/build.gradle.in
@@ -0,0 +1,27 @@
+// Top-level build file where you can add configuration options common to all sub-projects/modules.
+
+buildscript {
+
+    repositories {
+        google()
+        jcenter()
+    }
+    dependencies {
+        classpath 'com.android.tools.build:gradle:@ANDROID_GRADLE_PLUGIN_VERSION@'
+        classpath 'org.jetbrains.kotlin:kotlin-gradle-plugin:@KOTLIN_PLUGIN_VERSION@'
+
+        // NOTE: Do not place your application dependencies here; they belong
+        // in the individual module build.gradle files
+    }
+}
+
+allprojects {
+    repositories {
+        google()
+        jcenter()
+    }
+}
+
+task clean(type: Delete) {
+    delete rootProject.buildDir
+}
diff --git a/modules/java/test/android_test/gradle.properties b/modules/java/test/android_test/gradle.properties
new file mode 100644
index 000000000000..70c848ea94f7
--- /dev/null
+++ b/modules/java/test/android_test/gradle.properties
@@ -0,0 +1,18 @@
+# Project-wide Gradle settings.
+
+# IDE (e.g. Android Studio) users:
+# Gradle settings configured through the IDE *will override*
+# any settings specified in this file.
+
+# For more details on how to configure your build environment visit
+# http://www.gradle.org/docs/current/userguide/build_environment.html
+
+# Specifies the JVM arguments used for the daemon process.
+# The setting is particularly useful for tweaking memory settings.
+org.gradle.jvmargs=-Xmx2g
+
+android.useAndroidX=true
+# When configured, Gradle will run in incubating parallel mode.
+# This option should only be used with decoupled projects. More details, visit
+# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
+# org.gradle.parallel=true
diff --git a/modules/java/test/android_test/settings.gradle b/modules/java/test/android_test/settings.gradle
new file mode 100644
index 000000000000..213d386b486c
--- /dev/null
+++ b/modules/java/test/android_test/settings.gradle
@@ -0,0 +1,6 @@
+rootProject.name = 'android_test'
+
+include ':opencv'
+project(':opencv').projectDir = new File('../opencv_android/opencv')
+
+include ':tests_module'
diff --git a/modules/java/test/android_test/src/org/opencv/test/OpenCVTestCase.java b/modules/java/test/android_test/src/org/opencv/test/OpenCVTestCase.java
index 0ebd0db5380e..dae2f5307b74 100644
--- a/modules/java/test/android_test/src/org/opencv/test/OpenCVTestCase.java
+++ b/modules/java/test/android_test/src/org/opencv/test/OpenCVTestCase.java
@@ -294,6 +294,13 @@ public static void assertArrayEquals(short[] ar1, short[] ar2) {
             //assertTrue(Math.abs(ar1[i].doubleValue() - ar2[i].doubleValue()) <= epsilon);
     }
 
+    public static void assertArrayEquals(byte[] ar1, byte[] ar2) {
+        assertEquals(ar1.length, ar2.length);
+
+        for (int i = 0; i < ar1.length; i++)
+            assertEquals(ar1[i], ar2[i]);
+    }
+
     public static void assertArrayEquals(double[] ar1, double[] ar2, double epsilon) {
         assertEquals(ar1.length, ar2.length);
 
diff --git a/modules/java/test/android_test/src/org/opencv/test/OpenCVTestRunner.java b/modules/java/test/android_test/src/org/opencv/test/OpenCVTestRunner.java
index c924cabe3f57..d9c1868b3853 100644
--- a/modules/java/test/android_test/src/org/opencv/test/OpenCVTestRunner.java
+++ b/modules/java/test/android_test/src/org/opencv/test/OpenCVTestRunner.java
@@ -4,52 +4,29 @@
 import java.io.IOException;
 import junit.framework.Assert;
 
-import org.opencv.android.BaseLoaderCallback;
-import org.opencv.android.LoaderCallbackInterface;
 import org.opencv.android.OpenCVLoader;
 import org.opencv.android.Utils;
 import org.opencv.core.Mat;
 
 import android.content.Context;
-import android.test.AndroidTestRunner;
-import android.test.InstrumentationTestRunner;
 import android.util.Log;
 
+import androidx.test.runner.AndroidJUnitRunner;
+
+
 /**
  * This only class is Android specific.
  */
 
-public class OpenCVTestRunner extends InstrumentationTestRunner {
+public class OpenCVTestRunner extends AndroidJUnitRunner {
 
     private static final long MANAGER_TIMEOUT = 3000;
     public static String LENA_PATH;
     public static String CHESS_PATH;
     public static String LBPCASCADE_FRONTALFACE_PATH;
     public static Context context;
-
-    private AndroidTestRunner androidTestRunner;
     private static String TAG = "opencv_test_java";
 
-    private BaseLoaderCallback mLoaderCallback = new BaseLoaderCallback(getContext()) {
-
-        @Override
-        public void onManagerConnected(int status) {
-            switch (status) {
-                case LoaderCallbackInterface.SUCCESS:
-                {
-                    Log("OpenCV loaded successfully");
-                    synchronized (this) {
-                        notify();
-                    }
-                } break;
-                default:
-                {
-                    super.onManagerConnected(status);
-                } break;
-            }
-        }
-    };
-
     public static String getTempFileName(String extension)
     {
         File cache = context.getCacheDir();
@@ -76,30 +53,13 @@ static public void Log(Mat m) {
 
     @Override
     public void onStart() {
-        // try to load internal libs
-        if (!OpenCVLoader.initDebug()) {
-            // There is no internal OpenCV libs
-            // Using OpenCV Manager for initialization;
-
-            Log("Internal OpenCV library not found. Using OpenCV Manager for initialization");
-            OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION, getContext(), mLoaderCallback);
-
-            synchronized (this) {
-                try {
-                    wait(MANAGER_TIMEOUT);
-                } catch (InterruptedException e) {
-                    e.printStackTrace();
-                }
-            }
-        } else {
-            Log("OpenCV library found inside test package. Using it!");
-        }
+        Assert.assertTrue(OpenCVLoader.initLocal());
 
-        context = getContext();
+        context = getTargetContext();
         Assert.assertNotNull("Context can't be 'null'", context);
-        LENA_PATH = Utils.exportResource(context, R.drawable.lena);
-        CHESS_PATH = Utils.exportResource(context, R.drawable.chessboard);
-        LBPCASCADE_FRONTALFACE_PATH = Utils.exportResource(context, R.raw.lbpcascade_frontalface);
+        LENA_PATH = Utils.exportResource(context, context.getResources().getIdentifier("lena", "drawable", context.getPackageName()));
+        CHESS_PATH = Utils.exportResource(context, context.getResources().getIdentifier("chessboard", "drawable", context.getPackageName()));
+        //LBPCASCADE_FRONTALFACE_PATH = Utils.exportResource(context, R.raw.lbpcascade_frontalface);
 
         /*
          * The original idea about test order randomization is from
@@ -111,12 +71,6 @@ public void onStart() {
         super.onStart();
     }
 
-    @Override
-    protected AndroidTestRunner getAndroidTestRunner() {
-        androidTestRunner = super.getAndroidTestRunner();
-        return androidTestRunner;
-    }
-
     public static String getOutputFileName(String name)
     {
         return context.getExternalFilesDir(null).getAbsolutePath() + File.separatorChar + name;
diff --git a/modules/java/test/android_test/src/org/opencv/test/android/KotlinTest.kt b/modules/java/test/android_test/src/org/opencv/test/android/KotlinTest.kt
new file mode 100644
index 000000000000..8803cf9ee75a
--- /dev/null
+++ b/modules/java/test/android_test/src/org/opencv/test/android/KotlinTest.kt
@@ -0,0 +1,25 @@
+package org.opencv.test.android
+
+import org.opencv.core.CvType
+import org.opencv.core.Mat
+import org.opencv.core.times
+import org.opencv.test.OpenCVTestCase
+import kotlin.math.abs
+
+class KotlinTest : OpenCVTestCase() {
+    fun testMatrixMultiplication() {
+        val m1 = Mat.ones(2, 3, CvType.CV_32F)
+        val m2 = Mat.ones(3, 2, CvType.CV_32F)
+
+        val m3 = m1.matMul(m2)
+        val m4 = m1 * m2
+
+        val value1 = floatArrayOf(3f)
+        m3.get(0, 1, value1)
+
+        val value2 = floatArrayOf(5f)
+        m4[0, 1, value2]
+
+        assertGE(0.001, abs(value1[0] - value2[0]).toDouble())
+    }
+}
diff --git a/modules/java/test/android_test/src/org/opencv/test/android/UtilsTest.java b/modules/java/test/android_test/src/org/opencv/test/android/UtilsTest.java
index 6e6acf8193ca..09e3a2792591 100644
--- a/modules/java/test/android_test/src/org/opencv/test/android/UtilsTest.java
+++ b/modules/java/test/android_test/src/org/opencv/test/android/UtilsTest.java
@@ -17,6 +17,59 @@
 
 public class UtilsTest extends OpenCVTestCase {
 
+    private int[] testImgWH = new int[]{64, 48};
+    private byte[] testImgBgColor = new byte[]{1, 2, 3};
+    private int[] testImgRect = new int[] {15, 17, 25, 37};
+    private byte[] testImgRectColor = new byte[]{45, 15, 67};
+
+    private Mat createTestBGRImg() {
+        Mat img = new Mat(testImgWH[1], testImgWH[0], CvType.CV_8UC3,
+                          new Scalar(testImgBgColor[2], testImgBgColor[1], testImgBgColor[0]));
+        byte[] color = new byte[]{testImgRectColor[2], testImgRectColor[1], testImgRectColor[0]};
+
+        for (int i = testImgRect[1]; i < testImgRect[3]; i++) {
+            for (int j = testImgRect[0]; j < testImgRect[2]; j++) {
+                img.put(i, j, color);
+            }
+        }
+        return img;
+    }
+
+    private Bitmap createTestBitmap() {
+        Bitmap img = Bitmap.createBitmap(testImgWH[0], testImgWH[1], Bitmap.Config.ARGB_8888);
+        img.eraseColor(Color.argb(255, testImgBgColor[0], testImgBgColor[1] ,testImgBgColor[2]));
+
+        for (int i = testImgRect[1]; i < testImgRect[3]; i++) {
+            for (int j = testImgRect[0]; j < testImgRect[2]; j++) {
+                img.setPixel(j, i, Color.argb(
+                        255, testImgRectColor[0], testImgRectColor[1], testImgRectColor[2]));
+            }
+        }
+        return img;
+    }
+
+    public void testMatBitmapConversion() {
+        Mat mat = new Mat();
+        Imgproc.cvtColor(createTestBGRImg(), mat, Imgproc.COLOR_BGR2RGBA);
+        Bitmap bmp = createTestBitmap();
+
+        Bitmap convertedBmp = Bitmap.createBitmap(
+                Bitmap.createBitmap(testImgWH[0], testImgWH[1], Bitmap.Config.ARGB_8888));
+        Utils.matToBitmap(mat, convertedBmp);
+        assertTrue(bmp.sameAs(convertedBmp));
+
+        Mat convertedMat = new Mat();
+        Utils.bitmapToMat(bmp, convertedMat);
+        Mat diff = new Mat();
+        Core.absdiff(mat, convertedMat, diff);
+        Scalar channelsDiff = Core.sumElems(diff);
+        assertEquals(0.0, channelsDiff.val[0]);
+        assertEquals(0.0, channelsDiff.val[1]);
+        assertEquals(0.0, channelsDiff.val[2]);
+        assertEquals(0.0, channelsDiff.val[3]);
+    }
+
+
     public void testBitmapToMat() {
         BitmapFactory.Options opt16 = new BitmapFactory.Options();
         opt16.inPreferredConfig = Bitmap.Config.RGB_565;
diff --git a/modules/java/test/android_test/tests_module/AndroidManifest.xml b/modules/java/test/android_test/tests_module/AndroidManifest.xml
new file mode 100644
index 000000000000..324f41a651e4
--- /dev/null
+++ b/modules/java/test/android_test/tests_module/AndroidManifest.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+          package="org.opencv.samples.tutorial1"
+>
+
+    <application
+        android:theme="@android:style/Theme.NoTitleBar.Fullscreen" >
+    </application>
+
+</manifest>
diff --git a/modules/java/test/android_test/tests_module/build.gradle.in b/modules/java/test/android_test/tests_module/build.gradle.in
new file mode 100644
index 000000000000..049e6546bd3c
--- /dev/null
+++ b/modules/java/test/android_test/tests_module/build.gradle.in
@@ -0,0 +1,36 @@
+apply plugin: 'com.android.application'
+apply plugin: 'kotlin-android'
+
+android {
+    namespace 'org.opencv.tests'
+    compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
+    defaultConfig {
+        applicationId "org.opencv.tests"
+        minSdkVersion @ANDROID_MIN_SDK_VERSION@
+        targetSdkVersion @ANDROID_TARGET_SDK_VERSION@
+        versionCode 301
+        versionName "3.01"
+
+        testInstrumentationRunner "org.opencv.test.OpenCVTestRunner"
+    }
+    buildTypes {
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+        }
+    }
+    sourceSets {
+        androidTest {
+            java.srcDirs = [@ANDROID_TESTS_SRC_DIRS@]
+        }
+        main {
+            manifest.srcFile 'AndroidManifest.xml'
+            res.srcDirs = [@ANDROID_TESTS_RES_DIR@]
+        }
+    }
+}
+
+dependencies {
+    androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.1'
+    implementation project(':opencv')
+}
diff --git a/modules/java/test/pure_test/build.xml b/modules/java/test/pure_test/build.xml
index e596c82e9dc2..fa1e7c3ab927 100644
--- a/modules/java/test/pure_test/build.xml
+++ b/modules/java/test/pure_test/build.xml
@@ -42,7 +42,7 @@
 
   <target name="test" depends="jar">
     <mkdir dir="${test.dir}"/>
-    <junit printsummary="withOutAndErr" haltonfailure="false" haltonerror="false" showoutput="true" logfailedtests="true" maxmemory="256m">
+    <junit printsummary="withOutAndErr" failureproperty="junit_test.failed" haltonfailure="false" haltonerror="false" showoutput="true" logfailedtests="true" maxmemory="256m">
       <sysproperty key="java.library.path" path="${opencv.lib.path}"/>
       <env key="PATH" path="${opencv.lib.path}:${env.PATH}:${env.Path}"/>
       <env key="DYLD_LIBRARY_PATH" path="${env.OPENCV_SAVED_DYLD_LIBRARY_PATH}"/>  <!-- https://github.com/opencv/opencv/issues/14353 -->
@@ -65,6 +65,7 @@
       </fileset>
       <report format="noframes" todir="${test.dir}"/>
     </junitreport>
+    <fail message="JUnit test execution failed" if="junit_test.failed"/>
   </target>
 
   <target name="build" depends="jar">
diff --git a/modules/js/CMakeLists.txt b/modules/js/CMakeLists.txt
index 19f0b197904a..47cba260e165 100644
--- a/modules/js/CMakeLists.txt
+++ b/modules/js/CMakeLists.txt
@@ -69,7 +69,7 @@ if(COMPILE_FLAGS)
     set_target_properties(${the_module} PROPERTIES COMPILE_FLAGS ${COMPILE_FLAGS})
 endif()
 
-set(EMSCRIPTEN_LINK_FLAGS "${EMSCRIPTEN_LINK_FLAGS} --memory-init-file 0 -s TOTAL_MEMORY=128MB -s WASM_MEM_MAX=1GB -s ALLOW_MEMORY_GROWTH=1")
+set(EMSCRIPTEN_LINK_FLAGS "${EMSCRIPTEN_LINK_FLAGS} -s TOTAL_MEMORY=128MB -s WASM_MEM_MAX=1GB -s ALLOW_MEMORY_GROWTH=1")
 set(EMSCRIPTEN_LINK_FLAGS "${EMSCRIPTEN_LINK_FLAGS} -s MODULARIZE=1")
 set(EMSCRIPTEN_LINK_FLAGS "${EMSCRIPTEN_LINK_FLAGS} -s EXPORT_NAME=\"'cv'\" -s DEMANGLE_SUPPORT=1")
 set(EMSCRIPTEN_LINK_FLAGS "${EMSCRIPTEN_LINK_FLAGS} -s FORCE_FILESYSTEM=1 --use-preload-plugins --bind --post-js ${JS_HELPER} ${COMPILE_FLAGS}")
diff --git a/modules/js/generator/embindgen.py b/modules/js/generator/embindgen.py
index 8a16f92f5ec1..005ac9f175a4 100644
--- a/modules/js/generator/embindgen.py
+++ b/modules/js/generator/embindgen.py
@@ -482,7 +482,7 @@ def gen_function_binding_with_wrapper(self, func, ns_name, class_info):
                     ret_type = type_dict[ptr_type]
             for key in type_dict:
                 if key in ret_type:
-                    ret_type = re.sub('(^|[^\w])' + key + '($|[^\w])', type_dict[key], ret_type)
+                    ret_type = re.sub(r"\b" + key + r"\b", type_dict[key], ret_type)
             arg_types = []
             unwrapped_arg_types = []
             for arg in variant.args:
@@ -670,7 +670,7 @@ def gen_function_binding(self, func, class_info):
                     # Replace types. Instead of ret_type.replace we use regular
                     # expression to exclude false matches.
                     # See https://github.com/opencv/opencv/issues/15514
-                    ret_type = re.sub('(^|[^\w])' + key + '($|[^\w])', type_dict[key], ret_type)
+                    ret_type = re.sub(r"\b" + key + r"\b", type_dict[key], ret_type)
             if variant.constret and ret_type.startswith('const') == False:
                 ret_type = 'const ' + ret_type
             if variant.refret and ret_type.endswith('&') == False:
diff --git a/modules/js/src/core_bindings.cpp b/modules/js/src/core_bindings.cpp
index 60fe496ce37a..d5bf9b076c6c 100644
--- a/modules/js/src/core_bindings.cpp
+++ b/modules/js/src/core_bindings.cpp
@@ -89,14 +89,20 @@ using namespace cv;
 
 using namespace cv::segmentation;  // FIXIT
 
+#ifdef HAVE_OPENCV_OBJDETECT
 using namespace cv::aruco;
 typedef aruco::DetectorParameters aruco_DetectorParameters;
 typedef QRCodeDetectorAruco::Params QRCodeDetectorAruco_Params;
+#endif
 
 #ifdef HAVE_OPENCV_DNN
 using namespace cv::dnn;
 #endif
 
+#ifdef HAVE_OPENCV_FEATURES2D
+typedef SimpleBlobDetector::Params SimpleBlobDetector_Params;
+#endif
+
 #ifdef HAVE_OPENCV_VIDEO
 typedef TrackerMIL::Params TrackerMIL_Params;
 #endif
@@ -459,6 +465,7 @@ EMSCRIPTEN_BINDINGS(binding_utils)
     register_vector<char>("CharVector");
     register_vector<float>("FloatVector");
     register_vector<double>("DoubleVector");
+    register_vector<std::string>("StringVector");
     register_vector<cv::Point>("PointVector");
     register_vector<cv::Mat>("MatVector");
     register_vector<cv::Rect>("RectVector");
diff --git a/modules/js/src/helpers.js b/modules/js/src/helpers.js
index 9f1934c279ca..962a0b4d90a6 100644
--- a/modules/js/src/helpers.js
+++ b/modules/js/src/helpers.js
@@ -42,6 +42,10 @@ if (typeof Module.FS === 'undefined' && typeof FS !== 'undefined') {
     Module.FS = FS;
 }
 
+if (typeof cv === 'undefined') {
+    var cv = Module;
+}
+
 Module['imread'] = function(imageSource) {
     var img = null;
     if (typeof imageSource === 'string') {
diff --git a/modules/js/test/test_core.js b/modules/js/test/test_core.js
new file mode 100644
index 000000000000..14d4ffe72b4c
--- /dev/null
+++ b/modules/js/test/test_core.js
@@ -0,0 +1,41 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+if (typeof module !== 'undefined' && module.exports) {
+    // The environment is Node.js
+    var cv = require('./opencv.js'); // eslint-disable-line no-var
+}
+
+QUnit.module('Core', {});
+
+QUnit.test('test_LUT', function(assert) {
+    // test LUT
+    {
+        let src = cv.matFromArray(3, 3, cv.CV_8UC1, [255, 128, 0, 0, 128, 255, 1, 2, 254]);
+        let lutTable = [];
+        for (let i = 0; i < 256; i++)
+        {
+           lutTable[i] = 255 - i;
+        }
+        let lut = cv.matFromArray(1, 256, cv.CV_8UC1, lutTable);
+        let dst = new cv.Mat();
+
+        cv.LUT(src, lut, dst);
+
+        //console.log(dst.data);
+        assert.equal(dst.ucharAt(0), 0);
+        assert.equal(dst.ucharAt(1), 127);
+        assert.equal(dst.ucharAt(2), 255);
+        assert.equal(dst.ucharAt(3), 255);
+        assert.equal(dst.ucharAt(4), 127);
+        assert.equal(dst.ucharAt(5), 0);
+        assert.equal(dst.ucharAt(6), 254);
+        assert.equal(dst.ucharAt(7), 253);
+        assert.equal(dst.ucharAt(8), 1);
+
+        src.delete();
+        lut.delete();
+        dst.delete();
+    }
+});
diff --git a/modules/js/test/test_features2d.js b/modules/js/test/test_features2d.js
index 45e3d4d715e8..c5eb73a123be 100644
--- a/modules/js/test/test_features2d.js
+++ b/modules/js/test/test_features2d.js
@@ -62,6 +62,15 @@ QUnit.test('Detectors', function(assert) {
   assert.equal(kp.size(), 53, 'AKAZE');
 });
 
+QUnit.test('SimpleBlobDetector', function(assert) {
+  let image = generateTestFrame();
+
+  let kp = new cv.KeyPointVector();
+  let sbd = new cv.SimpleBlobDetector();
+  sbd.detect(image, kp);
+  assert.equal(kp.size(), 0);
+});
+
 QUnit.test('BFMatcher', function(assert) {
   // Generate key points.
   let image = generateTestFrame();
diff --git a/modules/js/test/test_mat.js b/modules/js/test/test_mat.js
index 409ed1b12307..fd3611cd2c4e 100644
--- a/modules/js/test/test_mat.js
+++ b/modules/js/test/test_mat.js
@@ -73,7 +73,7 @@ if (typeof module !== 'undefined' && module.exports) {
     var cv = require('./opencv.js'); // eslint-disable-line no-var
 }
 
-QUnit.module('Core', {});
+QUnit.module('CoreMat', {});
 
 QUnit.test('test_mat_creation', function(assert) {
     // Mat constructors.
diff --git a/modules/js/test/tests.html b/modules/js/test/tests.html
index de64ca7a294d..b20013ec638c 100644
--- a/modules/js/test/tests.html
+++ b/modules/js/test/tests.html
@@ -52,12 +52,12 @@
               if (window.cv instanceof Promise) {
                 window.cv.then((target) => {
                    window.cv = target;
-                   //console.log(cv.getBuildInformation());
+                   console.log(cv.getBuildInformation());
                    QUnit.start();
                 })
               } else {
                 // for backward compatible
-                // console.log(cv.getBuildInformation());
+                console.log(cv.getBuildInformation());
                 QUnit.start();
               }
             },
@@ -108,6 +108,7 @@
         <script type="application/javascript" async src="opencv.js" onerror="opencvjs_LoadError()"></script>
         <script type="application/javascript" src="test_mat.js"></script>
         <script type="application/javascript" src="test_utils.js"></script>
+        <script type="application/javascript" src="test_core.js"></script>
         <script type="application/javascript" src="test_imgproc.js"></script>
         <script type="application/javascript" src="test_objdetect.js"></script>
         <script type="application/javascript" src="test_video.js"></script>
diff --git a/modules/js/test/tests.js b/modules/js/test/tests.js
index f3156f6ea080..74a4b87e4599 100644
--- a/modules/js/test/tests.js
+++ b/modules/js/test/tests.js
@@ -44,10 +44,15 @@ testrunner.options.maxBlockDuration = 20000; // cause opencv_js.js need time to
 testrunner.run(
     {
         code: 'opencv.js',
-        tests: ['test_mat.js', 'test_utils.js', 'test_imgproc.js',
-                'test_objdetect.js', 'test_video.js', 'test_features2d.js',
+        tests: ['test_mat.js',
+                'test_utils.js',
+                'test_core.js',
+                'test_imgproc.js',
+                'test_objdetect.js',
+                'test_video.js',
+                'test_features2d.js',
                 'test_photo.js',
-                'test_calib3d.js'
+                'test_calib3d.js',
         ],
     },
     function(err, report) {
diff --git a/modules/ml/src/ann_mlp.cpp b/modules/ml/src/ann_mlp.cpp
index 3843564e1546..5f15c173a1ad 100644
--- a/modules/ml/src/ann_mlp.cpp
+++ b/modules/ml/src/ann_mlp.cpp
@@ -223,7 +223,7 @@ class ANN_MLPImpl CV_FINAL : public ANN_MLP
     void setActivationFunction(int _activ_func, double _f_param1, double _f_param2) CV_OVERRIDE
     {
         if( _activ_func < 0 || _activ_func > LEAKYRELU)
-            CV_Error( CV_StsOutOfRange, "Unknown activation function" );
+            CV_Error( cv::Error::StsOutOfRange, "Unknown activation function" );
 
         activ_func = _activ_func;
 
@@ -322,7 +322,7 @@ class ANN_MLPImpl CV_FINAL : public ANN_MLP
             {
                 int n = layer_sizes[i];
                 if( n < 1 + (0 < i && i < l_count-1))
-                    CV_Error( CV_StsOutOfRange,
+                    CV_Error( cv::Error::StsOutOfRange,
                              "there should be at least one input and one output "
                              "and every hidden layer must have more than 1 neuron" );
                 max_lsize = std::max( max_lsize, n );
@@ -341,7 +341,7 @@ class ANN_MLPImpl CV_FINAL : public ANN_MLP
     float predict( InputArray _inputs, OutputArray _outputs, int ) const CV_OVERRIDE
     {
         if( !trained )
-            CV_Error( CV_StsError, "The network has not been trained or loaded" );
+            CV_Error( cv::Error::StsError, "The network has not been trained or loaded" );
 
         Mat inputs = _inputs.getMat();
         int type = inputs.type(), l_count = layer_count();
@@ -790,7 +790,7 @@ class ANN_MLPImpl CV_FINAL : public ANN_MLP
                 {
                     t = t*inv_scale[j*2] + inv_scale[2*j+1];
                     if( t < m1 || t > M1 )
-                        CV_Error( CV_StsOutOfRange,
+                        CV_Error( cv::Error::StsOutOfRange,
                                  "Some of new output training vector components run exceed the original range too much" );
                 }
             }
@@ -817,25 +817,25 @@ class ANN_MLPImpl CV_FINAL : public ANN_MLP
                            Mat& sample_weights, int flags )
     {
         if( layer_sizes.empty() )
-            CV_Error( CV_StsError,
+            CV_Error( cv::Error::StsError,
                      "The network has not been created. Use method create or the appropriate constructor" );
 
         if( (inputs.type() != CV_32F && inputs.type() != CV_64F) ||
             inputs.cols != layer_sizes[0] )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
                      "input training data should be a floating-point matrix with "
                      "the number of rows equal to the number of training samples and "
                      "the number of columns equal to the size of 0-th (input) layer" );
 
         if( (outputs.type() != CV_32F && outputs.type() != CV_64F) ||
             outputs.cols != layer_sizes.back() )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
                      "output training data should be a floating-point matrix with "
                      "the number of rows equal to the number of training samples and "
                      "the number of columns equal to the size of last (output) layer" );
 
         if( inputs.rows != outputs.rows )
-            CV_Error( CV_StsUnmatchedSizes, "The numbers of input and output samples do not match" );
+            CV_Error( cv::Error::StsUnmatchedSizes, "The numbers of input and output samples do not match" );
 
         Mat temp;
         double s = sum(sample_weights)[0];
@@ -1323,7 +1323,7 @@ class ANN_MLPImpl CV_FINAL : public ANN_MLP
             fs << "itePerStep" << params.itePerStep;
         }
         else
-            CV_Error(CV_StsError, "Unknown training method");
+            CV_Error(cv::Error::StsError, "Unknown training method");
 
         fs << "term_criteria" << "{";
         if( params.termCrit.type & TermCriteria::EPS )
@@ -1421,7 +1421,7 @@ class ANN_MLPImpl CV_FINAL : public ANN_MLP
                 params.itePerStep = tpn["itePerStep"];
             }
             else
-                CV_Error(CV_StsParseError, "Unknown training method (should be BACKPROP or RPROP)");
+                CV_Error(cv::Error::StsParseError, "Unknown training method (should be BACKPROP or RPROP)");
 
             FileNode tcn = tpn["term_criteria"];
             if( !tcn.empty() )
diff --git a/modules/ml/src/boost.cpp b/modules/ml/src/boost.cpp
index be9c9a7b4673..87e1fe4131bf 100644
--- a/modules/ml/src/boost.cpp
+++ b/modules/ml/src/boost.cpp
@@ -308,7 +308,7 @@ class DTreesImplForBoost CV_FINAL : public DTreesImpl
             }
         }
         else
-            CV_Error(CV_StsNotImplemented, "Unknown boosting type");
+            CV_Error(cv::Error::StsNotImplemented, "Unknown boosting type");
 
         /*if( bparams.boostType != Boost::LOGIT )
         {
@@ -387,7 +387,7 @@ class DTreesImplForBoost CV_FINAL : public DTreesImpl
     void write( FileStorage& fs ) const CV_OVERRIDE
     {
         if( roots.empty() )
-            CV_Error( CV_StsBadArg, "RTrees have not been trained" );
+            CV_Error( cv::Error::StsBadArg, "RTrees have not been trained" );
 
         writeFormat(fs);
         writeParams(fs);
diff --git a/modules/ml/src/data.cpp b/modules/ml/src/data.cpp
index fd7c8d101600..6f8035947b17 100644
--- a/modules/ml/src/data.cpp
+++ b/modules/ml/src/data.cpp
@@ -574,7 +574,7 @@ class TrainDataImpl CV_FINAL : public TrainData
             if( nvars == 0 )
             {
                 if( rowvals.empty() )
-                    CV_Error(CV_StsBadArg, "invalid CSV format; no data found");
+                    CV_Error(cv::Error::StsBadArg, "invalid CSV format; no data found");
                 nvars = (int)rowvals.size();
                 if( !varTypeSpec.empty() && varTypeSpec.size() > 0 )
                 {
@@ -637,7 +637,7 @@ class TrainDataImpl CV_FINAL : public TrainData
             {
                 for( i = ninputvars; i < nvars; i++ )
                     if( vtypes[i] == VAR_CATEGORICAL )
-                        CV_Error(CV_StsBadArg,
+                        CV_Error(cv::Error::StsBadArg,
                                  "If responses are vector values, not scalars, they must be marked as ordered responses");
             }
         }
@@ -724,14 +724,14 @@ class TrainDataImpl CV_FINAL : public TrainData
                 }
 
                 if ( ptr[3] != '[')
-                    CV_Error( CV_StsBadArg, errmsg );
+                    CV_Error( cv::Error::StsBadArg, errmsg );
 
                 ptr += 4; // pass "ord["
                 do
                 {
                     int b1 = (int)strtod( ptr, &stopstring );
                     if( *stopstring == 0 || (*stopstring != ',' && *stopstring != ']' && *stopstring != '-') )
-                        CV_Error( CV_StsBadArg, errmsg );
+                        CV_Error( cv::Error::StsBadArg, errmsg );
                     ptr = stopstring + 1;
                     if( (stopstring[0] == ',') || (stopstring[0] == ']'))
                     {
@@ -745,7 +745,7 @@ class TrainDataImpl CV_FINAL : public TrainData
                         {
                             int b2 = (int)strtod( ptr, &stopstring);
                             if ( (*stopstring == 0) || (*stopstring != ',' && *stopstring != ']') )
-                                CV_Error( CV_StsBadArg, errmsg );
+                                CV_Error( cv::Error::StsBadArg, errmsg );
                             ptr = stopstring + 1;
                             CV_Assert( 0 <= b1 && b1 <= b2 && b2 < nvars );
                             for (int i = b1; i <= b2; i++)
@@ -753,7 +753,7 @@ class TrainDataImpl CV_FINAL : public TrainData
                             specCounter += b2 - b1 + 1;
                         }
                         else
-                            CV_Error( CV_StsBadArg, errmsg );
+                            CV_Error( cv::Error::StsBadArg, errmsg );
 
                     }
                 }
@@ -762,7 +762,7 @@ class TrainDataImpl CV_FINAL : public TrainData
         }
 
         if( specCounter != nvars )
-            CV_Error( CV_StsBadArg, "type of some variables is not specified" );
+            CV_Error( cv::Error::StsBadArg, "type of some variables is not specified" );
     }
 
     void setTrainTestSplitRatio(double ratio, bool shuffle) CV_OVERRIDE
diff --git a/modules/ml/src/gbt.cpp b/modules/ml/src/gbt.cpp
index 57f2eb176b9f..d1d6f2c8ea02 100644
--- a/modules/ml/src/gbt.cpp
+++ b/modules/ml/src/gbt.cpp
@@ -218,7 +218,7 @@ CvGBTrees::train( const CvMat* _train_data, int _tflag,
                 orig_response->data.fl[i] = (float) _responses->data.i[i*step];
         }; break;
         default:
-            CV_Error(CV_StsUnmatchedFormats, "Response should be a 32fC1 or 32sC1 vector.");
+            CV_Error(cv::Error::StsUnmatchedFormats, "Response should be a 32fC1 or 32sC1 vector.");
     }
 
     if (!is_regression)
@@ -283,7 +283,7 @@ CvGBTrees::train( const CvMat* _train_data, int _tflag,
                         sample_idx->data.i[active_samples_count++] = i;
 
             } break;
-            default: CV_Error(CV_StsUnmatchedFormats, "_sample_idx should be a 32sC1, 8sC1 or 8uC1 vector.");
+            default: CV_Error(cv::Error::StsUnmatchedFormats, "_sample_idx should be a 32sC1, 8sC1 or 8uC1 vector.");
         }
     }
     else
@@ -1072,7 +1072,7 @@ void CvGBTrees::read_params( CvFileStorage* fs, CvFileNode* fnode )
 
 
     if( params.loss_function_type < SQUARED_LOSS || params.loss_function_type > DEVIANCE_LOSS ||  params.loss_function_type == 2)
-        CV_ERROR( CV_StsBadArg, "Unknown loss function" );
+        CV_ERROR( cv::Error::StsBadArg, "Unknown loss function" );
 
     params.weak_count = cvReadIntByName( fs, fnode, "ensemble_length" );
     params.shrinkage = (float)cvReadRealByName( fs, fnode, "shrinkage", 0.1 );
@@ -1082,7 +1082,7 @@ void CvGBTrees::read_params( CvFileStorage* fs, CvFileNode* fnode )
     {
         class_labels = (CvMat*)cvReadByName( fs, fnode, "class_labels" );
         if( class_labels && !CV_IS_MAT(class_labels))
-            CV_ERROR( CV_StsParseError, "class_labels must stored as a matrix");
+            CV_ERROR( cv::Error::StsParseError, "class_labels must stored as a matrix");
     }
     data->is_classifier = 0;
 
@@ -1105,7 +1105,7 @@ void CvGBTrees::write( CvFileStorage* fs, const char* name ) const
     cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_GBT );
 
     if( !weak )
-        CV_ERROR( CV_StsBadArg, "The model has not been trained yet" );
+        CV_ERROR( cv::Error::StsBadArg, "The model has not been trained yet" );
 
     write_params( fs );
     cvWriteReal( fs, "base_value", base_value);
@@ -1170,13 +1170,13 @@ void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )
 
         trees_fnode = cvGetFileNodeByName( fs, node, s.c_str() );
         if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
-            CV_ERROR( CV_StsParseError, "<trees_x> tag is missing" );
+            CV_ERROR( cv::Error::StsParseError, "<trees_x> tag is missing" );
 
         cvStartReadSeq( trees_fnode->data.seq, &reader );
         ntrees = trees_fnode->data.seq->total;
 
         if( ntrees != params.weak_count )
-            CV_ERROR( CV_StsUnmatchedSizes,
+            CV_ERROR( cv::Error::StsUnmatchedSizes,
             "The number of trees stored does not match <ntrees> tag value" );
 
         CV_CALL( storage = cvCreateMemStorage() );
diff --git a/modules/ml/src/inner_functions.cpp b/modules/ml/src/inner_functions.cpp
index 6b3affcebc8e..c0e5f45dd1ec 100644
--- a/modules/ml/src/inner_functions.cpp
+++ b/modules/ml/src/inner_functions.cpp
@@ -63,7 +63,7 @@ bool StatModel::train(const Ptr<TrainData>& trainData, int )
 {
     CV_TRACE_FUNCTION();
     CV_Assert(!trainData.empty());
-    CV_Error(CV_StsNotImplemented, "");
+    CV_Error(cv::Error::StsNotImplemented, "");
     return false;
 }
 
diff --git a/modules/ml/src/lr.cpp b/modules/ml/src/lr.cpp
index b43e1040454d..6f462e14c342 100644
--- a/modules/ml/src/lr.cpp
+++ b/modules/ml/src/lr.cpp
@@ -109,15 +109,15 @@ bool LogisticRegressionImpl::train(const Ptr<TrainData>& trainData, int)
     CV_Assert( !_labels_i.empty() && !_data_i.empty());
     if(_labels_i.cols != 1)
     {
-        CV_Error( CV_StsBadArg, "labels should be a column matrix" );
+        CV_Error( cv::Error::StsBadArg, "labels should be a column matrix" );
     }
     if(_data_i.type() != CV_32FC1 || _labels_i.type() != CV_32FC1)
     {
-        CV_Error( CV_StsBadArg, "data and labels must be a floating point matrix" );
+        CV_Error( cv::Error::StsBadArg, "data and labels must be a floating point matrix" );
     }
     if(_labels_i.rows != _data_i.rows)
     {
-        CV_Error( CV_StsBadArg, "number of rows in data and labels should be equal" );
+        CV_Error( cv::Error::StsBadArg, "number of rows in data and labels should be equal" );
     }
 
     // class labels
@@ -126,7 +126,7 @@ bool LogisticRegressionImpl::train(const Ptr<TrainData>& trainData, int)
     int num_classes = (int) this->forward_mapper.size();
     if(num_classes < 2)
     {
-        CV_Error( CV_StsBadArg, "data should have at least 2 classes" );
+        CV_Error( cv::Error::StsBadArg, "data should have at least 2 classes" );
     }
 
     // add a column of ones to the data (bias/intercept term)
@@ -174,7 +174,7 @@ bool LogisticRegressionImpl::train(const Ptr<TrainData>& trainData, int)
     this->learnt_thetas = thetas.clone();
     if( cvIsNaN( (double)sum(this->learnt_thetas)[0] ) )
     {
-        CV_Error( CV_StsBadArg, "check training parameters. Invalid training classifier" );
+        CV_Error( cv::Error::StsBadArg, "check training parameters. Invalid training classifier" );
     }
 
     // success
@@ -187,7 +187,7 @@ float LogisticRegressionImpl::predict(InputArray samples, OutputArray results, i
     // check if learnt_mats array is populated
     if(!this->isTrained())
     {
-        CV_Error( CV_StsBadArg, "classifier should be trained first" );
+        CV_Error( cv::Error::StsBadArg, "classifier should be trained first" );
     }
 
     // coefficient matrix
@@ -206,7 +206,7 @@ float LogisticRegressionImpl::predict(InputArray samples, OutputArray results, i
     Mat data = samples.getMat();
     if(data.type() != CV_32F)
     {
-        CV_Error( CV_StsBadArg, "data must be of floating type" );
+        CV_Error( cv::Error::StsBadArg, "data must be of floating type" );
     }
 
     // add a column of ones to the data (bias/intercept term)
@@ -327,7 +327,7 @@ double LogisticRegressionImpl::compute_cost(const Mat& _data, const Mat& _labels
 
     if(cvIsNaN( cost ) == 1)
     {
-        CV_Error( CV_StsBadArg, "check training parameters. Invalid training classifier" );
+        CV_Error( cv::Error::StsBadArg, "check training parameters. Invalid training classifier" );
     }
 
     return cost;
@@ -398,12 +398,12 @@ Mat LogisticRegressionImpl::batch_gradient_descent(const Mat& _data, const Mat&
     // implements batch gradient descent
     if(this->params.alpha<=0)
     {
-        CV_Error( CV_StsBadArg, "check training parameters (learning rate) for the classifier" );
+        CV_Error( cv::Error::StsBadArg, "check training parameters (learning rate) for the classifier" );
     }
 
     if(this->params.num_iters <= 0)
     {
-        CV_Error( CV_StsBadArg, "number of iterations cannot be zero or a negative number" );
+        CV_Error( cv::Error::StsBadArg, "number of iterations cannot be zero or a negative number" );
     }
 
     int llambda = 0;
@@ -439,12 +439,12 @@ Mat LogisticRegressionImpl::mini_batch_gradient_descent(const Mat& _data, const
 
     if(this->params.mini_batch_size <= 0 || this->params.alpha == 0)
     {
-        CV_Error( CV_StsBadArg, "check training parameters for the classifier" );
+        CV_Error( cv::Error::StsBadArg, "check training parameters for the classifier" );
     }
 
     if(this->params.num_iters <= 0)
     {
-        CV_Error( CV_StsBadArg, "number of iterations cannot be zero or a negative number" );
+        CV_Error( cv::Error::StsBadArg, "number of iterations cannot be zero or a negative number" );
     }
 
     Mat theta_p = _init_theta.clone();
@@ -551,7 +551,7 @@ void LogisticRegressionImpl::write(FileStorage& fs) const
     // check if open
     if(fs.isOpened() == 0)
     {
-        CV_Error(CV_StsBadArg,"file can't open. Check file path");
+        CV_Error(cv::Error::StsBadArg,"file can't open. Check file path");
     }
     writeFormat(fs);
     string desc = "Logistic Regression Classifier";
@@ -574,7 +574,7 @@ void LogisticRegressionImpl::read(const FileNode& fn)
     // check if empty
     if(fn.empty())
     {
-        CV_Error( CV_StsBadArg, "empty FileNode object" );
+        CV_Error( cv::Error::StsBadArg, "empty FileNode object" );
     }
 
     this->params.alpha = (double)fn["alpha"];
diff --git a/modules/ml/src/nbayes.cpp b/modules/ml/src/nbayes.cpp
index 60dda0c7d47c..910a1e1c37c3 100644
--- a/modules/ml/src/nbayes.cpp
+++ b/modules/ml/src/nbayes.cpp
@@ -101,7 +101,7 @@ class NormalBayesClassifierImpl : public NormalBayesClassifier
                 norm(var_idx, __var_idx, NORM_INF) != 0 ||
                 cls_labels.size() != __cls_labels.size() ||
                 norm(cls_labels, __cls_labels, NORM_INF) != 0 )
-                CV_Error( CV_StsBadArg,
+                CV_Error( cv::Error::StsBadArg,
                 "The new training data is inconsistent with the original training data; varIdx and the class labels should be the same" );
         }
 
@@ -312,11 +312,11 @@ class NormalBayesClassifierImpl : public NormalBayesClassifier
         bool rawOutput = (flags & RAW_OUTPUT) != 0;
 
         if( samples.type() != CV_32F || samples.cols != nallvars )
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
                      "The input samples must be 32f matrix with the number of columns = nallvars" );
 
         if( (samples.rows > 1) && (! _results.needed()) )
-            CV_Error( CV_StsNullPtr,
+            CV_Error( cv::Error::StsNullPtr,
                      "When the number of input samples is >1, the output vector of results must be passed" );
 
         if( _results.needed() )
@@ -388,7 +388,7 @@ class NormalBayesClassifierImpl : public NormalBayesClassifier
         fn["var_all"] >> nallvars;
 
         if( nallvars <= 0 )
-            CV_Error( CV_StsParseError,
+            CV_Error( cv::Error::StsParseError,
                      "The field \"var_count\" of NBayes classifier is missing or non-positive" );
 
         fn["var_idx"] >> var_idx;
@@ -397,7 +397,7 @@ class NormalBayesClassifierImpl : public NormalBayesClassifier
         int nclasses = (int)cls_labels.total(), i;
 
         if( cls_labels.empty() || nclasses < 1 )
-            CV_Error( CV_StsParseError, "No or invalid \"cls_labels\" in NBayes classifier" );
+            CV_Error( cv::Error::StsParseError, "No or invalid \"cls_labels\" in NBayes classifier" );
 
         FileNodeIterator
             count_it = fn["count"].begin(),
diff --git a/modules/ml/src/precomp.hpp b/modules/ml/src/precomp.hpp
index 328cc4732a64..6a91ef98f073 100644
--- a/modules/ml/src/precomp.hpp
+++ b/modules/ml/src/precomp.hpp
@@ -131,13 +131,13 @@ namespace ml
         inline void setMaxCategories(int val)
         {
             if( val < 2 )
-                CV_Error( CV_StsOutOfRange, "max_categories should be >= 2" );
+                CV_Error( cv::Error::StsOutOfRange, "max_categories should be >= 2" );
             maxCategories = std::min(val, 15 );
         }
         inline void setMaxDepth(int val)
         {
             if( val < 0 )
-                CV_Error( CV_StsOutOfRange, "max_depth should be >= 0" );
+                CV_Error( cv::Error::StsOutOfRange, "max_depth should be >= 0" );
             maxDepth = std::min( val, 25 );
         }
         inline void setMinSampleCount(int val)
@@ -147,11 +147,11 @@ namespace ml
         inline void setCVFolds(int val)
         {
             if( val < 0 )
-                CV_Error( CV_StsOutOfRange,
+                CV_Error( cv::Error::StsOutOfRange,
                           "params.CVFolds should be =0 (the tree is not pruned) "
                           "or n>0 (tree is pruned using n-fold cross-validation)" );
             if(val > 1)
-                CV_Error( CV_StsNotImplemented,
+                CV_Error( cv::Error::StsNotImplemented,
                           "tree pruning using cross-validation is not implemented."
                           "Set CVFolds to 1");
 
@@ -162,7 +162,7 @@ namespace ml
         inline void setRegressionAccuracy(float val)
         {
             if( val < 0 )
-                CV_Error( CV_StsOutOfRange, "params.regression_accuracy should be >= 0" );
+                CV_Error( cv::Error::StsOutOfRange, "params.regression_accuracy should be >= 0" );
             regressionAccuracy = val;
         }
 
diff --git a/modules/ml/src/rtrees.cpp b/modules/ml/src/rtrees.cpp
index 2cad961f99fa..41ffa03a5e98 100644
--- a/modules/ml/src/rtrees.cpp
+++ b/modules/ml/src/rtrees.cpp
@@ -309,7 +309,7 @@ class DTreesImplForRTrees CV_FINAL : public DTreesImpl
     {
         CV_TRACE_FUNCTION();
         if( roots.empty() )
-            CV_Error( CV_StsBadArg, "RTrees have not been trained" );
+            CV_Error( cv::Error::StsBadArg, "RTrees have not been trained" );
 
         writeFormat(fs);
         writeParams(fs);
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 6c3db22b723a..0d62f7858a23 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -95,11 +95,11 @@ const int QFLOAT_TYPE = DataDepth<Qfloat>::value;
 static void checkParamGrid(const ParamGrid& pg)
 {
     if( pg.minVal > pg.maxVal )
-        CV_Error( CV_StsBadArg, "Lower bound of the grid must be less then the upper one" );
+        CV_Error( cv::Error::StsBadArg, "Lower bound of the grid must be less then the upper one" );
     if( pg.minVal < DBL_EPSILON )
-        CV_Error( CV_StsBadArg, "Lower bound of the grid must be positive" );
+        CV_Error( cv::Error::StsBadArg, "Lower bound of the grid must be positive" );
     if( pg.logStep < 1. + FLT_EPSILON )
-        CV_Error( CV_StsBadArg, "Grid step must greater than 1" );
+        CV_Error( cv::Error::StsBadArg, "Grid step must greater than 1" );
 }
 
 // SVM training parameters
@@ -325,7 +325,7 @@ class SVMKernelImpl CV_FINAL : public SVM::Kernel
             calc_intersec(vcount, var_count, vecs, another, results);
             break;
         default:
-            CV_Error(CV_StsBadArg, "Unknown kernel type");
+            CV_Error(cv::Error::StsBadArg, "Unknown kernel type");
         }
         const Qfloat max_val = (Qfloat)(FLT_MAX*1e-3);
         for( int j = 0; j < vcount; j++ )
@@ -410,7 +410,7 @@ ParamGrid SVM::getDefaultGrid( int param_id )
         grid.logStep = 7; // total iterations = 3
     }
     else
-        cvError( CV_StsBadArg, "SVM::getDefaultGrid", "Invalid type of parameter "
+        cvError( cv::Error::StsBadArg, "SVM::getDefaultGrid", "Invalid type of parameter "
                 "(use one of SVM::C, SVM::GAMMA et al.)", __FILE__, __LINE__ );
     return grid;
 }
@@ -638,9 +638,6 @@ class SVMImpl CV_FINAL : public SVM
         #undef is_lower_bound
         #define is_lower_bound(i) (alpha_status[i] < 0)
 
-        #undef is_free
-        #define is_free(i) (alpha_status[i] == 0)
-
         #undef get_C
         #define get_C(i) (C[y[i]>0])
 
@@ -648,9 +645,6 @@ class SVMImpl CV_FINAL : public SVM
         #define update_alpha_status(i) \
             alpha_status[i] = (schar)(alpha[i] >= get_C(i) ? 1 : alpha[i] <= 0 ? -1 : 0)
 
-        #undef reconstruct_gradient
-        #define reconstruct_gradient() /* empty for now */
-
         bool solve_generic( SolutionInfo& si )
         {
             const schar* y = &y_vec[0];
@@ -1303,12 +1297,12 @@ class SVMImpl CV_FINAL : public SVM
             if( kernelType != LINEAR && kernelType != POLY &&
                 kernelType != SIGMOID && kernelType != RBF &&
                 kernelType != INTER && kernelType != CHI2)
-                CV_Error( CV_StsBadArg, "Unknown/unsupported kernel type" );
+                CV_Error( cv::Error::StsBadArg, "Unknown/unsupported kernel type" );
 
             if( kernelType == LINEAR )
                 params.gamma = 1;
             else if( params.gamma <= 0 )
-                CV_Error( CV_StsOutOfRange, "gamma parameter of the kernel must be positive" );
+                CV_Error( cv::Error::StsOutOfRange, "gamma parameter of the kernel must be positive" );
 
             if( kernelType != SIGMOID && kernelType != POLY )
                 params.coef0 = 0;
@@ -1316,14 +1310,14 @@ class SVMImpl CV_FINAL : public SVM
             if( kernelType != POLY )
                 params.degree = 0;
             else if( params.degree <= 0 )
-                CV_Error( CV_StsOutOfRange, "The kernel parameter <degree> must be positive" );
+                CV_Error( cv::Error::StsOutOfRange, "The kernel parameter <degree> must be positive" );
 
             kernel = makePtr<SVMKernelImpl>(params);
         }
         else
         {
             if (!kernel)
-                CV_Error( CV_StsBadArg, "Custom kernel is not set" );
+                CV_Error( cv::Error::StsBadArg, "Custom kernel is not set" );
         }
 
         int svmType = params.svmType;
@@ -1331,22 +1325,22 @@ class SVMImpl CV_FINAL : public SVM
         if( svmType != C_SVC && svmType != NU_SVC &&
             svmType != ONE_CLASS && svmType != EPS_SVR &&
             svmType != NU_SVR )
-            CV_Error( CV_StsBadArg, "Unknown/unsupported SVM type" );
+            CV_Error( cv::Error::StsBadArg, "Unknown/unsupported SVM type" );
 
         if( svmType == ONE_CLASS || svmType == NU_SVC )
             params.C = 0;
         else if( params.C <= 0 )
-            CV_Error( CV_StsOutOfRange, "The parameter C must be positive" );
+            CV_Error( cv::Error::StsOutOfRange, "The parameter C must be positive" );
 
         if( svmType == C_SVC || svmType == EPS_SVR )
             params.nu = 0;
         else if( params.nu <= 0 || params.nu >= 1 )
-            CV_Error( CV_StsOutOfRange, "The parameter nu must be between 0 and 1" );
+            CV_Error( cv::Error::StsOutOfRange, "The parameter nu must be between 0 and 1" );
 
         if( svmType != EPS_SVR )
             params.p = 0;
         else if( params.p <= 0 )
-            CV_Error( CV_StsOutOfRange, "The parameter p must be positive" );
+            CV_Error( cv::Error::StsOutOfRange, "The parameter p must be positive" );
 
         if( svmType != C_SVC )
             params.classWeights.release();
@@ -1437,7 +1431,7 @@ class SVMImpl CV_FINAL : public SVM
                 if( (cw.cols != 1 && cw.rows != 1) ||
                     (int)cw.total() != class_count ||
                     (cw.type() != CV_32F && cw.type() != CV_64F) )
-                    CV_Error( CV_StsBadArg, "params.class_weights must be 1d floating-point vector "
+                    CV_Error( cv::Error::StsBadArg, "params.class_weights must be 1d floating-point vector "
                         "containing as many elements as the number of classes" );
 
                 cw.convertTo(class_weights, CV_64F, params.C);
@@ -1452,7 +1446,7 @@ class SVMImpl CV_FINAL : public SVM
 
             //check that while cross-validation there were the samples from all the classes
             if ((int)class_ranges.size() < class_count + 1)
-                CV_Error( CV_StsBadArg, "While cross-validation one or more of the classes have "
+                CV_Error( cv::Error::StsBadArg, "While cross-validation one or more of the classes have "
                 "been fell out of the sample. Try to reduce <Params::k_fold>" );
 
             if( svmType == NU_SVC )
@@ -1626,7 +1620,7 @@ class SVMImpl CV_FINAL : public SVM
         {
             responses = data->getTrainNormCatResponses();
             if( responses.empty() )
-                CV_Error(CV_StsBadArg, "in the case of classification problem the responses must be categorical; "
+                CV_Error(cv::Error::StsBadArg, "in the case of classification problem the responses must be categorical; "
                                        "either specify varType when creating TrainData, or pass integer responses");
             class_labels = data->getClassLabels();
         }
@@ -1975,7 +1969,7 @@ class SVMImpl CV_FINAL : public SVM
                 }
             }
             else
-                CV_Error( CV_StsBadArg, "INTERNAL ERROR: Unknown SVM type, "
+                CV_Error( cv::Error::StsBadArg, "INTERNAL ERROR: Unknown SVM type, "
                          "the SVM structure is probably corrupted" );
         }
 
@@ -2118,7 +2112,7 @@ class SVMImpl CV_FINAL : public SVM
         int class_count = !class_labels.empty() ? (int)class_labels.total() :
                           params.svmType == ONE_CLASS ? 1 : 0;
         if( !isTrained() )
-            CV_Error( CV_StsParseError, "SVM model data is invalid, check sv_count, var_* and class_count tags" );
+            CV_Error( cv::Error::StsParseError, "SVM model data is invalid, check sv_count, var_* and class_count tags" );
 
         writeFormat(fs);
         write_params( fs );
@@ -2203,11 +2197,11 @@ class SVMImpl CV_FINAL : public SVM
             svm_type_str == "NU_SVR" ? NU_SVR : -1;
 
         if( svmType < 0 )
-            CV_Error( CV_StsParseError, "Missing or invalid SVM type" );
+            CV_Error( cv::Error::StsParseError, "Missing or invalid SVM type" );
 
         FileNode kernel_node = fn["kernel"];
         if( kernel_node.empty() )
-            CV_Error( CV_StsParseError, "SVM kernel tag is not found" );
+            CV_Error( cv::Error::StsParseError, "SVM kernel tag is not found" );
 
         String kernel_type_str = (String)kernel_node["type"];
         int kernelType =
@@ -2219,7 +2213,7 @@ class SVMImpl CV_FINAL : public SVM
             kernel_type_str == "INTER" ? INTER : CUSTOM;
 
         if( kernelType == CUSTOM )
-            CV_Error( CV_StsParseError, "Invalid SVM kernel type (or custom kernel)" );
+            CV_Error( cv::Error::StsParseError, "Invalid SVM kernel type (or custom kernel)" );
 
         _params.svmType = svmType;
         _params.kernelType = kernelType;
@@ -2259,7 +2253,7 @@ class SVMImpl CV_FINAL : public SVM
         int class_count = (int)fn["class_count"];
 
         if( sv_total <= 0 || var_count <= 0 )
-            CV_Error( CV_StsParseError, "SVM model data is invalid, check sv_count, var_* and class_count tags" );
+            CV_Error( cv::Error::StsParseError, "SVM model data is invalid, check sv_count, var_* and class_count tags" );
 
         FileNode m = fn["class_labels"];
         if( !m.empty() )
@@ -2269,7 +2263,7 @@ class SVMImpl CV_FINAL : public SVM
             m >> params.classWeights;
 
         if( class_count > 1 && (class_labels.empty() || (int)class_labels.total() != class_count))
-            CV_Error( CV_StsParseError, "Array of class labels is missing or invalid" );
+            CV_Error( cv::Error::StsParseError, "Array of class labels is missing or invalid" );
 
         // read support vectors
         FileNode sv_node = fn["support_vectors"];
diff --git a/modules/ml/src/svmsgd.cpp b/modules/ml/src/svmsgd.cpp
index 266c7cf300e0..ccb5ca3a85c3 100644
--- a/modules/ml/src/svmsgd.cpp
+++ b/modules/ml/src/svmsgd.cpp
@@ -375,7 +375,7 @@ bool SVMSGDImpl::isTrained() const
 void SVMSGDImpl::write(FileStorage& fs) const
 {
     if( !isTrained() )
-        CV_Error( CV_StsParseError, "SVMSGD model data is invalid, it hasn't been trained" );
+        CV_Error( cv::Error::StsParseError, "SVMSGD model data is invalid, it hasn't been trained" );
 
     writeFormat(fs);
     writeParams( fs );
@@ -437,7 +437,7 @@ void SVMSGDImpl::readParams( const FileNode& fn )
                                      svmsgdTypeStr == "ASGD" ? ASGD : -1;
 
     if( svmsgdType < 0 )
-        CV_Error( CV_StsParseError, "Missing or invalid SVMSGD type" );
+        CV_Error( cv::Error::StsParseError, "Missing or invalid SVMSGD type" );
 
     params.svmsgdType = svmsgdType;
 
@@ -447,7 +447,7 @@ void SVMSGDImpl::readParams( const FileNode& fn )
                                              marginTypeStr == "HARD_MARGIN" ? HARD_MARGIN : -1;
 
     if( marginType < 0 )
-        CV_Error( CV_StsParseError, "Missing or invalid margin type" );
+        CV_Error( cv::Error::StsParseError, "Missing or invalid margin type" );
 
     params.marginType = marginType;
 
@@ -517,7 +517,7 @@ void SVMSGDImpl::setOptimalParameters(int svmsgdType, int marginType)
         break;
 
     default:
-        CV_Error( CV_StsParseError, "SVMSGD model data is invalid" );
+        CV_Error( cv::Error::StsParseError, "SVMSGD model data is invalid" );
     }
 }
 }   //ml
diff --git a/modules/ml/src/testset.cpp b/modules/ml/src/testset.cpp
index 48cd1341543d..b20115d1994a 100644
--- a/modules/ml/src/testset.cpp
+++ b/modules/ml/src/testset.cpp
@@ -60,13 +60,13 @@ void createConcentricSpheresTestSet( int num_samples, int num_features, int num_
                                      OutputArray _samples, OutputArray _responses)
 {
     if( num_samples < 1 )
-        CV_Error( CV_StsBadArg, "num_samples parameter must be positive" );
+        CV_Error( cv::Error::StsBadArg, "num_samples parameter must be positive" );
 
     if( num_features < 1 )
-        CV_Error( CV_StsBadArg, "num_features parameter must be positive" );
+        CV_Error( cv::Error::StsBadArg, "num_features parameter must be positive" );
 
     if( num_classes < 1 )
-        CV_Error( CV_StsBadArg, "num_classes parameter must be positive" );
+        CV_Error( cv::Error::StsBadArg, "num_classes parameter must be positive" );
 
     int i, cur_class;
 
diff --git a/modules/ml/src/tree.cpp b/modules/ml/src/tree.cpp
index b69ddaece2df..3d8941344d99 100644
--- a/modules/ml/src/tree.cpp
+++ b/modules/ml/src/tree.cpp
@@ -404,7 +404,7 @@ int DTreesImpl::addNodeAndTrySplit( int parent, const vector<int>& sidx )
     {
         node.defaultDir = calcDir( node.split, sidx, sleft, sright );
         if( params.useSurrogates )
-            CV_Error( CV_StsNotImplemented, "surrogate splits are not implemented yet");
+            CV_Error( cv::Error::StsNotImplemented, "surrogate splits are not implemented yet");
 
         int left = addNodeAndTrySplit( nidx, sleft );
         int right = addNodeAndTrySplit( nidx, sright );
@@ -1445,7 +1445,7 @@ float DTreesImpl::predictTrees( const Range& range, const Mat& sample, int flags
 
                         int ival = cvRound(val);
                         if( ival != val )
-                            CV_Error( CV_StsBadArg,
+                            CV_Error( cv::Error::StsBadArg,
                                      "one of input categorical variable is not an integer" );
 
                         CV_Assert(cmap != NULL);
diff --git a/modules/ml/test/test_mltests.cpp b/modules/ml/test/test_mltests.cpp
index c7353057d3db..a67f6b0bf273 100644
--- a/modules/ml/test/test_mltests.cpp
+++ b/modules/ml/test/test_mltests.cpp
@@ -55,7 +55,7 @@ static DatasetDesc & getDataset(const string & name)
 
 // interfaces and templates
 
-template <typename T> string modelName() { return "Unknown"; };
+template <typename T> string modelName() { return "Unknown"; }
 template <typename T> Ptr<T> tuneModel(const DatasetDesc &, Ptr<T> m) { return m; }
 
 struct IModelFactory
diff --git a/modules/objc/generator/CMakeLists.txt b/modules/objc/generator/CMakeLists.txt
index b3cbbd3f5f71..bd8f8325b3e7 100644
--- a/modules/objc/generator/CMakeLists.txt
+++ b/modules/objc/generator/CMakeLists.txt
@@ -9,6 +9,8 @@ file(REMOVE_RECURSE "${OPENCV_OBJC_BINDINGS_DIR}/osx")
 file(REMOVE "${OPENCV_DEPHELPER}/gen_opencv_objc_source_osx")  # force re-run after CMake
 file(REMOVE_RECURSE "${OPENCV_OBJC_BINDINGS_DIR}/ios")
 file(REMOVE "${OPENCV_DEPHELPER}/gen_opencv_objc_source_ios")  # force re-run after CMake
+file(REMOVE_RECURSE "${OPENCV_OBJC_BINDINGS_DIR}/visionos")
+file(REMOVE "${OPENCV_DEPHELPER}/gen_opencv_objc_source_visionos")  # force re-run after CMake
 
 # This file is included from a subdirectory
 set(OBJC_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/..")
@@ -67,6 +69,8 @@ string(REPLACE "opencv_" "" MODULES "${OPENCV_OBJC_MODULES}")
 if(NOT DEFINED OPENCV_OBJC_TARGET AND APPLE_FRAMEWORK)
   if(IOS)
     set(OPENCV_OBJC_TARGET "ios")
+  elseif(XROS)
+    set(OPENCV_OBJC_TARGET "visionos")
   else()
     set(OPENCV_OBJC_TARGET "osx")
   endif()
@@ -117,6 +121,7 @@ if(OPENCV_OBJC_TARGET)
 else()
   ocv_add_objc_generated_target(osx)
   ocv_add_objc_generated_target(ios)
+  ocv_add_objc_generated_target(visionos)
 endif()
 
 add_custom_target(gen_opencv_objc_source
diff --git a/modules/objc/generator/gen_objc.py b/modules/objc/generator/gen_objc.py
index 3105e7eefd0c..58b7dd555c0c 100755
--- a/modules/objc/generator/gen_objc.py
+++ b/modules/objc/generator/gen_objc.py
@@ -1600,7 +1600,7 @@ def sanitize_documentation_string(doc, type):
     arg_parser = argparse.ArgumentParser(description='OpenCV Objective-C Wrapper Generator')
     arg_parser.add_argument('-p', '--parser', required=True, help='OpenCV header parser')
     arg_parser.add_argument('-c', '--config', required=True, help='OpenCV modules config')
-    arg_parser.add_argument('-t', '--target', required=True, help='Target (either ios or osx)')
+    arg_parser.add_argument('-t', '--target', required=True, help='Target (either ios or osx or visionos)')
     arg_parser.add_argument('-f', '--framework', required=True, help='Framework name')
 
     args=arg_parser.parse_args()
@@ -1671,6 +1671,7 @@ def sanitize_documentation_string(doc, type):
         logging.info("\nCommon headers (%d):\n%s", len(common_headers), pformat(common_headers))
 
         gendict_fname = os.path.join(misc_location, 'gen_dict.json')
+        module_source_map = {}
         if os.path.exists(gendict_fname):
             with open(gendict_fname) as f:
                 gen_type_dict = json.load(f)
@@ -1687,6 +1688,7 @@ def sanitize_documentation_string(doc, type):
             header_fix.update(gen_type_dict.get("header_fix", {}))
             enum_fix.update(gen_type_dict.get("enum_fix", {}))
             const_fix.update(gen_type_dict.get("const_fix", {}))
+            module_source_map = gen_type_dict.get("SourceMap", {})
             namespaces_dict.update(gen_type_dict.get("namespaces_dict", {}))
             module_imports += gen_type_dict.get("module_imports", [])
 
@@ -1695,15 +1697,10 @@ def sanitize_documentation_string(doc, type):
         if os.path.exists(objc_files_dir):
             copied_files += copy_objc_files(objc_files_dir, objc_base_path, module, True)
 
-        if args.target == 'ios':
-            ios_files_dir = os.path.join(misc_location, 'ios')
-            if os.path.exists(ios_files_dir):
-                copied_files += copy_objc_files(ios_files_dir, objc_base_path, module, True)
-
-        if args.target == 'osx':
-            osx_files_dir = os.path.join(misc_location, 'macosx')
-            if os.path.exists(osx_files_dir):
-                copied_files += copy_objc_files(osx_files_dir, objc_base_path, module, True)
+        target_path = 'macosx' if args.target == 'osx' else module_source_map.get(args.target, args.target)
+        target_files_dir = os.path.join(misc_location, target_path)
+        if os.path.exists(target_files_dir):
+            copied_files += copy_objc_files(target_files_dir, objc_base_path, module, True)
 
         objc_test_files_dir = os.path.join(misc_location, 'test')
         if os.path.exists(objc_test_files_dir):
diff --git a/modules/objdetect/include/opencv2/objdetect.hpp b/modules/objdetect/include/opencv2/objdetect.hpp
index 8b3cd7c97b94..7f1189060806 100644
--- a/modules/objdetect/include/opencv2/objdetect.hpp
+++ b/modules/objdetect/include/opencv2/objdetect.hpp
@@ -54,59 +54,61 @@
 @{
     @defgroup objdetect_cascade_classifier Cascade Classifier for Object Detection
 
-The object detector described below has been initially proposed by Paul Viola @cite Viola01 and
-improved by Rainer Lienhart @cite Lienhart02 .
-
-First, a classifier (namely a *cascade of boosted classifiers working with haar-like features*) is
-trained with a few hundred sample views of a particular object (i.e., a face or a car), called
-positive examples, that are scaled to the same size (say, 20x20), and negative examples - arbitrary
-images of the same size.
-
-After a classifier is trained, it can be applied to a region of interest (of the same size as used
-during the training) in an input image. The classifier outputs a "1" if the region is likely to show
-the object (i.e., face/car), and "0" otherwise. To search for the object in the whole image one can
-move the search window across the image and check every location using the classifier. The
-classifier is designed so that it can be easily "resized" in order to be able to find the objects of
-interest at different sizes, which is more efficient than resizing the image itself. So, to find an
-object of an unknown size in the image the scan procedure should be done several times at different
-scales.
-
-The word "cascade" in the classifier name means that the resultant classifier consists of several
-simpler classifiers (*stages*) that are applied subsequently to a region of interest until at some
-stage the candidate is rejected or all the stages are passed. The word "boosted" means that the
-classifiers at every stage of the cascade are complex themselves and they are built out of basic
-classifiers using one of four different boosting techniques (weighted voting). Currently Discrete
-Adaboost, Real Adaboost, Gentle Adaboost and Logitboost are supported. The basic classifiers are
-decision-tree classifiers with at least 2 leaves. Haar-like features are the input to the basic
-classifiers, and are calculated as described below. The current algorithm uses the following
-Haar-like features:
-
-![image](pics/haarfeatures.png)
-
-The feature used in a particular classifier is specified by its shape (1a, 2b etc.), position within
-the region of interest and the scale (this scale is not the same as the scale used at the detection
-stage, though these two scales are multiplied). For example, in the case of the third line feature
-(2c) the response is calculated as the difference between the sum of image pixels under the
-rectangle covering the whole feature (including the two white stripes and the black stripe in the
-middle) and the sum of the image pixels under the black stripe multiplied by 3 in order to
-compensate for the differences in the size of areas. The sums of pixel values over a rectangular
-regions are calculated rapidly using integral images (see below and the integral description).
-
-Check @ref tutorial_cascade_classifier "the corresponding tutorial" for more details.
-
-The following reference is for the detection part only. There is a separate application called
-opencv_traincascade that can train a cascade of boosted classifiers from a set of samples.
-
-@note In the new C++ interface it is also possible to use LBP (local binary pattern) features in
-addition to Haar-like features. .. [Viola01] Paul Viola and Michael J. Jones. Rapid Object Detection
-using a Boosted Cascade of Simple Features. IEEE CVPR, 2001. The paper is available online at
-<https://github.com/SvHey/thesis/blob/master/Literature/ObjectDetection/violaJones_CVPR2001.pdf>
+    The object detector described below has been initially proposed by Paul Viola @cite Viola01 and
+    improved by Rainer Lienhart @cite Lienhart02 .
+
+    First, a classifier (namely a *cascade of boosted classifiers working with haar-like features*) is
+    trained with a few hundred sample views of a particular object (i.e., a face or a car), called
+    positive examples, that are scaled to the same size (say, 20x20), and negative examples - arbitrary
+    images of the same size.
+
+    After a classifier is trained, it can be applied to a region of interest (of the same size as used
+    during the training) in an input image. The classifier outputs a "1" if the region is likely to show
+    the object (i.e., face/car), and "0" otherwise. To search for the object in the whole image one can
+    move the search window across the image and check every location using the classifier. The
+    classifier is designed so that it can be easily "resized" in order to be able to find the objects of
+    interest at different sizes, which is more efficient than resizing the image itself. So, to find an
+    object of an unknown size in the image the scan procedure should be done several times at different
+    scales.
+
+    The word "cascade" in the classifier name means that the resultant classifier consists of several
+    simpler classifiers (*stages*) that are applied subsequently to a region of interest until at some
+    stage the candidate is rejected or all the stages are passed. The word "boosted" means that the
+    classifiers at every stage of the cascade are complex themselves and they are built out of basic
+    classifiers using one of four different boosting techniques (weighted voting). Currently Discrete
+    Adaboost, Real Adaboost, Gentle Adaboost and Logitboost are supported. The basic classifiers are
+    decision-tree classifiers with at least 2 leaves. Haar-like features are the input to the basic
+    classifiers, and are calculated as described below. The current algorithm uses the following
+    Haar-like features:
+
+    ![image](pics/haarfeatures.png)
+
+    The feature used in a particular classifier is specified by its shape (1a, 2b etc.), position within
+    the region of interest and the scale (this scale is not the same as the scale used at the detection
+    stage, though these two scales are multiplied). For example, in the case of the third line feature
+    (2c) the response is calculated as the difference between the sum of image pixels under the
+    rectangle covering the whole feature (including the two white stripes and the black stripe in the
+    middle) and the sum of the image pixels under the black stripe multiplied by 3 in order to
+    compensate for the differences in the size of areas. The sums of pixel values over a rectangular
+    regions are calculated rapidly using integral images (see below and the integral description).
+
+    Check @ref tutorial_cascade_classifier "the corresponding tutorial" for more details.
+
+    The following reference is for the detection part only. There is a separate application called
+    opencv_traincascade that can train a cascade of boosted classifiers from a set of samples.
+
+    @note In the new C++ interface it is also possible to use LBP (local binary pattern) features in
+    addition to Haar-like features. .. [Viola01] Paul Viola and Michael J. Jones. Rapid Object Detection
+    using a Boosted Cascade of Simple Features. IEEE CVPR, 2001. The paper is available online at
+    <https://github.com/SvHey/thesis/blob/master/Literature/ObjectDetection/violaJones_CVPR2001.pdf>
 
     @defgroup objdetect_hog HOG (Histogram of Oriented Gradients) descriptor and object detector
     @defgroup objdetect_barcode Barcode detection and decoding
     @defgroup objdetect_qrcode QRCode detection and encoding
     @defgroup objdetect_dnn_face DNN-based face detection and recognition
-Check @ref tutorial_dnn_face "the corresponding tutorial" for more details.
+
+    Check @ref tutorial_dnn_face "the corresponding tutorial" for more details.
+
     @defgroup objdetect_common Common functions and classes
     @defgroup objdetect_aruco ArUco markers and boards detection for robust camera pose estimation
     @{
@@ -730,19 +732,21 @@ class CV_EXPORTS_W QRCodeEncoder {
         ECI_UTF8 = 26
     };
 
-    /** @brief QR code encoder parameters.
-     @param version The optional version of QR code (by default - maximum possible depending on
-                    the length of the string).
-     @param correction_level The optional level of error correction (by default - the lowest).
-     @param mode The optional encoding mode - Numeric, Alphanumeric, Byte, Kanji, ECI or Structured Append.
-     @param structure_number The optional number of QR codes to generate in Structured Append mode.
-    */
+    /** @brief QR code encoder parameters. */
     struct CV_EXPORTS_W_SIMPLE Params
     {
         CV_WRAP Params();
+
+        //! The optional version of QR code (by default - maximum possible depending on the length of the string).
         CV_PROP_RW int version;
+
+        //! The optional level of error correction (by default - the lowest).
         CV_PROP_RW CorrectionLevel correction_level;
+
+        //! The optional encoding mode - Numeric, Alphanumeric, Byte, Kanji, ECI or Structured Append.
         CV_PROP_RW EncodeMode mode;
+
+        //! The optional number of QR codes to generate in Structured Append mode.
         CV_PROP_RW int structure_number;
     };
 
@@ -852,7 +856,7 @@ class CV_EXPORTS_W_SIMPLE QRCodeDetectorAruco : public GraphicalCodeDetector {
     CV_WRAP QRCodeDetectorAruco& setDetectorParameters(const QRCodeDetectorAruco::Params& params);
 
     /** @brief Aruco detector parameters are used to search for the finder patterns. */
-    CV_WRAP aruco::DetectorParameters getArucoParameters();
+    CV_WRAP const aruco::DetectorParameters& getArucoParameters() const;
 
     /** @brief Aruco detector parameters are used to search for the finder patterns. */
     CV_WRAP void setArucoParameters(const aruco::DetectorParameters& params);
diff --git a/modules/objdetect/include/opencv2/objdetect/aruco_board.hpp b/modules/objdetect/include/opencv2/objdetect/aruco_board.hpp
index 1f4147440546..e8300c82bf00 100644
--- a/modules/objdetect/include/opencv2/objdetect/aruco_board.hpp
+++ b/modules/objdetect/include/opencv2/objdetect/aruco_board.hpp
@@ -166,11 +166,11 @@ class CV_EXPORTS_W_SIMPLE CharucoBoard : public Board {
      */
     CV_WRAP std::vector<Point3f> getChessboardCorners() const;
 
-    /** @brief get CharucoBoard::nearestMarkerIdx
+    /** @brief get CharucoBoard::nearestMarkerIdx, for each charuco corner, nearest marker index in ids array
      */
     CV_PROP std::vector<std::vector<int> > getNearestMarkerIdx() const;
 
-    /** @brief get CharucoBoard::nearestMarkerCorners
+    /** @brief get CharucoBoard::nearestMarkerCorners, for each charuco corner, nearest marker corner id of each marker
      */
     CV_PROP std::vector<std::vector<int> > getNearestMarkerCorners() const;
 
diff --git a/modules/objdetect/include/opencv2/objdetect/aruco_detector.hpp b/modules/objdetect/include/opencv2/objdetect/aruco_detector.hpp
index f885a2af87a2..9d30d55d176e 100644
--- a/modules/objdetect/include/opencv2/objdetect/aruco_detector.hpp
+++ b/modules/objdetect/include/opencv2/objdetect/aruco_detector.hpp
@@ -33,9 +33,10 @@ struct CV_EXPORTS_W_SIMPLE DetectorParameters {
         polygonalApproxAccuracyRate = 0.03;
         minCornerDistanceRate = 0.05;
         minDistanceToBorder = 3;
-        minMarkerDistanceRate = 0.05;
+        minMarkerDistanceRate = 0.125;
         cornerRefinementMethod = (int)CORNER_REFINE_NONE;
         cornerRefinementWinSize = 5;
+        relativeCornerRefinmentWinSize = 0.3f;
         cornerRefinementMaxIterations = 30;
         cornerRefinementMinAccuracy = 0.1;
         markerBorderBits = 1;
@@ -56,7 +57,7 @@ struct CV_EXPORTS_W_SIMPLE DetectorParameters {
         useAruco3Detection = false;
         minSideLengthCanonicalImg = 32;
         minMarkerLengthRatioOriginalImg = 0.0;
-    };
+    }
 
     /** @brief Read a new set of DetectorParameters from FileNode (use FileStorage.root()).
      */
@@ -99,18 +100,50 @@ struct CV_EXPORTS_W_SIMPLE DetectorParameters {
     /// minimum distance of any corner to the image border for detected markers (in pixels) (default 3)
     CV_PROP_RW int minDistanceToBorder;
 
-    /** @brief minimum mean distance beetween two marker corners to be considered imilar, so that the smaller one is removed.
+    /** @brief minimum average distance between the corners of the two markers to be grouped (default 0.125).
      *
-     * The rate is relative to the smaller perimeter of the two markers (default 0.05).
+     * The rate is relative to the smaller perimeter of the two markers.
+     * Two markers are grouped if average distance between the corners of the two markers is less than
+     * min(MarkerPerimeter1, MarkerPerimeter2)*minMarkerDistanceRate.
+     *
+     * default value is 0.125 because 0.125*MarkerPerimeter = (MarkerPerimeter / 4) * 0.5 = half the side of the marker.
+     *
+     * @note default value was changed from 0.05 after 4.8.1 release, because the filtering algorithm has been changed.
+     * Now a few candidates from the same group can be added to the list of candidates if they are far from each other.
+     * @sa minGroupDistance.
      */
     CV_PROP_RW double minMarkerDistanceRate;
 
+    /** @brief minimum average distance between the corners of the two markers in group to add them to the list of candidates
+     *
+     * The average distance between the corners of the two markers is calculated relative to its module size (default 0.21).
+     */
+    CV_PROP_RW float minGroupDistance = 0.21f;
+
     /** @brief default value CORNER_REFINE_NONE */
     CV_PROP_RW int cornerRefinementMethod;
 
-    /// window size for the corner refinement process (in pixels) (default 5).
+    /** @brief maximum window size for the corner refinement process (in pixels) (default 5).
+     *
+     * The window size may decrease if the ArUco marker is too small, check relativeCornerRefinmentWinSize.
+     * The final window size is calculated as:
+     * min(cornerRefinementWinSize, averageArucoModuleSize*relativeCornerRefinmentWinSize),
+     * where averageArucoModuleSize is average module size of ArUco marker in pixels.
+     * (ArUco marker is composed of black and white modules)
+     */
     CV_PROP_RW int cornerRefinementWinSize;
 
+    /** @brief Dynamic window size for corner refinement relative to Aruco module size (default 0.3).
+     *
+     * The final window size is calculated as:
+     * min(cornerRefinementWinSize, averageArucoModuleSize*relativeCornerRefinmentWinSize),
+     * where averageArucoModuleSize is average module size of ArUco marker in pixels.
+     * (ArUco marker is composed of black and white modules)
+     * In the case of markers located far from each other, it may be useful to increase the value of the parameter to 0.4-0.5.
+     * In the case of markers located close to each other, it may be useful to decrease the parameter value to 0.1-0.2.
+     */
+    CV_PROP_RW float relativeCornerRefinmentWinSize;
+
     /// maximum number of iterations for stop criteria of the corner refinement process (default 30).
     CV_PROP_RW int cornerRefinementMaxIterations;
 
@@ -219,7 +252,7 @@ struct CV_EXPORTS_W_SIMPLE RefineParameters {
      */
     CV_PROP_RW float minRepDistance;
 
-    /** @brief minRepDistance rate of allowed erroneous bits respect to the error correction capability of the used dictionary.
+    /** @brief errorCorrectionRate rate of allowed erroneous bits respect to the error correction capability of the used dictionary.
      *
      * -1 ignores the error correction step.
      */
diff --git a/modules/objdetect/include/opencv2/objdetect/aruco_dictionary.hpp b/modules/objdetect/include/opencv2/objdetect/aruco_dictionary.hpp
index c46b5fbfb531..bc7b934b2a60 100644
--- a/modules/objdetect/include/opencv2/objdetect/aruco_dictionary.hpp
+++ b/modules/objdetect/include/opencv2/objdetect/aruco_dictionary.hpp
@@ -13,32 +13,39 @@ namespace aruco {
 //! @{
 
 
-/** @brief Dictionary/Set of markers, it contains the inner codification
+/** @brief Dictionary is a set of unique ArUco markers of the same size
  *
- * BytesList contains the marker codewords where:
+ * `bytesList` storing as 2-dimensions Mat with 4-th channels (CV_8UC4 type was used) and contains the marker codewords where:
  * - bytesList.rows is the dictionary size
- * - each marker is encoded using `nbytes = ceil(markerSize*markerSize/8.)`
+ * - each marker is encoded using `nbytes = ceil(markerSize*markerSize/8.)` bytes
  * - each row contains all 4 rotations of the marker, so its length is `4*nbytes`
- *
- * `bytesList.ptr(i)[k*nbytes + j]` is then the j-th byte of i-th marker, in its k-th rotation.
+ * - the byte order in the bytesList[i] row:
+ * `//bytes without rotation/bytes with rotation 1/bytes with rotation 2/bytes with rotation 3//`
+ * So `bytesList.ptr(i)[k*nbytes + j]` is the j-th byte of i-th marker, in its k-th rotation.
+ * @note Python bindings generate matrix with shape of bytesList `dictionary_size x nbytes x 4`,
+ * but it should be indexed like C++ version. Python example for j-th byte of i-th marker, in its k-th rotation:
+ * `aruco_dict.bytesList[id].ravel()[k*nbytes + j]`
  */
 class CV_EXPORTS_W_SIMPLE Dictionary {
 
     public:
-    CV_PROP_RW Mat bytesList;         // marker code information
-    CV_PROP_RW int markerSize;        // number of bits per dimension
-    CV_PROP_RW int maxCorrectionBits; // maximum number of bits that can be corrected
-
+    CV_PROP_RW Mat bytesList;         ///< marker code information. See class description for more details
+    CV_PROP_RW int markerSize;        ///< number of bits per dimension
+    CV_PROP_RW int maxCorrectionBits; ///< maximum number of bits that can be corrected
 
     CV_WRAP Dictionary();
 
+    /** @brief Basic ArUco dictionary constructor
+     *
+     * @param bytesList bits for all ArUco markers in dictionary see memory layout in the class description
+     * @param _markerSize ArUco marker size in units
+     * @param maxcorr maximum number of bits that can be corrected
+     */
     CV_WRAP Dictionary(const Mat &bytesList, int _markerSize, int maxcorr = 0);
 
-
-
     /** @brief Read a new dictionary from FileNode.
      *
-     * Dictionary format:\n
+     * Dictionary example in YAML format:\n
      * nmarkers: 35\n
      * markersize: 6\n
      * maxCorrectionBits: 5\n
@@ -54,13 +61,13 @@ class CV_EXPORTS_W_SIMPLE Dictionary {
 
     /** @brief Given a matrix of bits. Returns whether if marker is identified or not.
      *
-     * It returns by reference the correct id (if any) and the correct rotation
+     * Returns reference to the marker id in the dictionary (if any) and its rotation.
      */
     CV_WRAP bool identify(const Mat &onlyBits, CV_OUT int &idx, CV_OUT int &rotation, double maxCorrectionRate) const;
 
-    /** @brief Returns the distance of the input bits to the specific id.
+    /** @brief Returns Hamming distance of the input bits to the specific id.
      *
-     * If allRotations is true, the four posible bits rotation are considered
+     * If `allRotations` flag is set, the four posible marker rotations are considered
      */
     CV_WRAP int getDistanceToId(InputArray bits, int id, bool allRotations = true) const;
 
@@ -70,7 +77,7 @@ class CV_EXPORTS_W_SIMPLE Dictionary {
     CV_WRAP void generateImageMarker(int id, int sidePixels, OutputArray _img, int borderBits = 1) const;
 
 
-    /** @brief Transform matrix of bits to list of bytes in the 4 rotations
+    /** @brief Transform matrix of bits to list of bytes with 4 marker rotations
       */
     CV_WRAP static Mat getByteListFromBits(const Mat &bits);
 
diff --git a/modules/objdetect/include/opencv2/objdetect/barcode.hpp b/modules/objdetect/include/opencv2/objdetect/barcode.hpp
index 958490a4222e..c20b67c0b29e 100644
--- a/modules/objdetect/include/opencv2/objdetect/barcode.hpp
+++ b/modules/objdetect/include/opencv2/objdetect/barcode.hpp
@@ -27,7 +27,7 @@ class CV_EXPORTS_W_SIMPLE BarcodeDetector : public cv::GraphicalCodeDetector
      * @param prototxt_path prototxt file path for the super resolution model
      * @param model_path model file path for the super resolution model
      */
-    CV_WRAP BarcodeDetector(const std::string &prototxt_path, const std::string &model_path);
+    CV_WRAP BarcodeDetector(CV_WRAP_FILE_PATH const std::string &prototxt_path, CV_WRAP_FILE_PATH const std::string &model_path);
     ~BarcodeDetector();
 
     /** @brief Decodes barcode in image once it's found by the detect() method.
@@ -57,6 +57,52 @@ class CV_EXPORTS_W_SIMPLE BarcodeDetector : public cv::GraphicalCodeDetector
                                       CV_OUT std::vector<std::string> &decoded_info,
                                       CV_OUT std::vector<std::string> &decoded_type,
                                       OutputArray points = noArray()) const;
+
+    /** @brief Get detector downsampling threshold.
+     *
+     * @return detector downsampling threshold
+     */
+    CV_WRAP double getDownsamplingThreshold() const;
+
+    /** @brief Set detector downsampling threshold.
+     *
+     * By default, the detect method resizes the input image to this limit if the smallest image size is is greater than the threshold.
+     * Increasing this value can improve detection accuracy and the number of results at the expense of performance.
+     * Correlates with detector scales. Setting this to a large value will disable downsampling.
+     * @param thresh downsampling limit to apply (default 512)
+     * @see setDetectorScales
+     */
+    CV_WRAP BarcodeDetector& setDownsamplingThreshold(double thresh);
+
+    /** @brief Returns detector box filter sizes.
+     *
+     * @param sizes output parameter for returning the sizes.
+     */
+    CV_WRAP void getDetectorScales(CV_OUT std::vector<float>& sizes) const;
+
+    /** @brief Set detector box filter sizes.
+     *
+     * Adjusts the value and the number of box filters used in the detect step.
+     * The filter sizes directly correlate with the expected line widths for a barcode. Corresponds to expected barcode distance.
+     * If the downsampling limit is increased, filter sizes need to be adjusted in an inversely proportional way.
+     * @param sizes box filter sizes, relative to minimum dimension of the image (default [0.01, 0.03, 0.06, 0.08])
+     */
+    CV_WRAP BarcodeDetector& setDetectorScales(const std::vector<float>& sizes);
+
+    /** @brief Get detector gradient magnitude threshold.
+     *
+     * @return detector gradient magnitude threshold.
+     */
+    CV_WRAP double getGradientThreshold() const;
+
+    /** @brief Set detector gradient magnitude threshold.
+     *
+     * Sets the coherence threshold for detected bounding boxes.
+     * Increasing this value will generate a closer fitted bounding box width and can reduce false-positives.
+     * Values between 16 and 1024 generally work, while too high of a value will remove valid detections.
+     * @param thresh gradient magnitude threshold (default 64).
+     */
+    CV_WRAP BarcodeDetector& setGradientThreshold(double thresh);
 };
 //! @}
 
diff --git a/modules/objdetect/include/opencv2/objdetect/charuco_detector.hpp b/modules/objdetect/include/opencv2/objdetect/charuco_detector.hpp
index a23960d55713..e10cb3f02542 100644
--- a/modules/objdetect/include/opencv2/objdetect/charuco_detector.hpp
+++ b/modules/objdetect/include/opencv2/objdetect/charuco_detector.hpp
@@ -77,6 +77,9 @@ class CV_EXPORTS_W CharucoDetector : public Algorithm {
      * If camera parameters are provided, the process is based in an approximated pose estimation, else it is based on local homography.
      * Only visible corners are returned. For each corner, its corresponding identifier is also returned in charucoIds.
      * @sa findChessboardCorners
+     * @note After OpenCV 4.6.0, there was an incompatible change in the ChArUco pattern generation algorithm for even row counts.
+     * Use cv::aruco::CharucoBoard::setLegacyPattern() to ensure compatibility with patterns created using OpenCV versions prior to 4.6.0.
+     * For more information, see the issue: https://github.com/opencv/opencv/issues/23152
      */
     CV_WRAP void detectBoard(InputArray image, OutputArray charucoCorners, OutputArray charucoIds,
                              InputOutputArrayOfArrays markerCorners = noArray(),
diff --git a/modules/objdetect/include/opencv2/objdetect/detection_based_tracker.hpp b/modules/objdetect/include/opencv2/objdetect/detection_based_tracker.hpp
index fb96c668a5a4..8050278b4232 100644
--- a/modules/objdetect/include/opencv2/objdetect/detection_based_tracker.hpp
+++ b/modules/objdetect/include/opencv2/objdetect/detection_based_tracker.hpp
@@ -192,7 +192,7 @@ class CV_EXPORTS DetectionBasedTracker
             {
                 lastPositions.push_back(rect);
                 id=getNextId();
-            };
+            }
 
             static int getNextId()
             {
diff --git a/modules/objdetect/include/opencv2/objdetect/face.hpp b/modules/objdetect/include/opencv2/objdetect/face.hpp
index a8e98c4012f9..bfa04cbd16eb 100644
--- a/modules/objdetect/include/opencv2/objdetect/face.hpp
+++ b/modules/objdetect/include/opencv2/objdetect/face.hpp
@@ -20,7 +20,7 @@ model download link: https://github.com/opencv/opencv_zoo/tree/master/models/fac
 class CV_EXPORTS_W FaceDetectorYN
 {
 public:
-    virtual ~FaceDetectorYN() {};
+    virtual ~FaceDetectorYN() {}
 
     /** @brief Set the size for the network input, which overwrites the input size of creating model. Call this method when the size of input image does not match the input size when creating model
      *
@@ -71,7 +71,7 @@ class CV_EXPORTS_W FaceDetectorYN
      */
     CV_WRAP virtual int detect(InputArray image, OutputArray faces) = 0;
 
-    /** @brief Creates an instance of this class with given parameters
+    /** @brief Creates an instance of face detector class with given parameters
      *
      *  @param model the path to the requested model
      *  @param config the path to the config file for compability, which is not requested for ONNX models
@@ -82,14 +82,37 @@ class CV_EXPORTS_W FaceDetectorYN
      *  @param backend_id the id of backend
      *  @param target_id the id of target device
      */
-    CV_WRAP static Ptr<FaceDetectorYN> create(const String& model,
-                                              const String& config,
+    CV_WRAP static Ptr<FaceDetectorYN> create(CV_WRAP_FILE_PATH const String& model,
+                                              CV_WRAP_FILE_PATH const String& config,
                                               const Size& input_size,
                                               float score_threshold = 0.9f,
                                               float nms_threshold = 0.3f,
                                               int top_k = 5000,
                                               int backend_id = 0,
                                               int target_id = 0);
+
+    /** @overload
+     *
+     *  @param framework Name of origin framework
+     *  @param bufferModel A buffer with a content of binary file with weights
+     *  @param bufferConfig A buffer with a content of text file contains network configuration
+     *  @param input_size the size of the input image
+     *  @param score_threshold the threshold to filter out bounding boxes of score smaller than the given value
+     *  @param nms_threshold the threshold to suppress bounding boxes of IoU bigger than the given value
+     *  @param top_k keep top K bboxes before NMS
+     *  @param backend_id the id of backend
+     *  @param target_id the id of target device
+     */
+    CV_WRAP static Ptr<FaceDetectorYN> create(const String& framework,
+                                              const std::vector<uchar>& bufferModel,
+                                              const std::vector<uchar>& bufferConfig,
+                                              const Size& input_size,
+                                              float score_threshold = 0.9f,
+                                              float nms_threshold = 0.3f,
+                                              int top_k = 5000,
+                                              int backend_id = 0,
+                                              int target_id = 0);
+
 };
 
 /** @brief DNN-based face recognizer
@@ -99,7 +122,7 @@ model download link: https://github.com/opencv/opencv_zoo/tree/master/models/fac
 class CV_EXPORTS_W FaceRecognizerSF
 {
 public:
-    virtual ~FaceRecognizerSF() {};
+    virtual ~FaceRecognizerSF() {}
 
     /** @brief Definition of distance used for calculating the distance between two face features
      */
@@ -131,7 +154,7 @@ class CV_EXPORTS_W FaceRecognizerSF
      *  @param backend_id the id of backend
      *  @param target_id the id of target device
      */
-    CV_WRAP static Ptr<FaceRecognizerSF> create(const String& model, const String& config, int backend_id = 0, int target_id = 0);
+    CV_WRAP static Ptr<FaceRecognizerSF> create(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config, int backend_id = 0, int target_id = 0);
 };
 
 //! @}
diff --git a/modules/objdetect/include/opencv2/objdetect/graphical_code_detector.hpp b/modules/objdetect/include/opencv2/objdetect/graphical_code_detector.hpp
index 3535a8da1cc3..ed697c50c055 100644
--- a/modules/objdetect/include/opencv2/objdetect/graphical_code_detector.hpp
+++ b/modules/objdetect/include/opencv2/objdetect/graphical_code_detector.hpp
@@ -66,6 +66,10 @@ class CV_EXPORTS_W_SIMPLE GraphicalCodeDetector {
     @param decoded_info UTF8-encoded output vector of string or empty vector of string if the codes cannot be decoded.
     @param points optional output vector of vertices of the found graphical code quadrangles. Will be empty if not found.
     @param straight_code The optional vector of images containing binarized codes
+
+    - If there are QR codes encoded with a Structured Append mode on the image and all of them detected and decoded correctly,
+    method writes a full message to position corresponds to 0-th code in a sequence. The rest of QR codes from the same sequence
+    have empty string.
     */
     CV_WRAP bool detectAndDecodeMulti(InputArray img, CV_OUT std::vector<std::string>& decoded_info, OutputArray points = noArray(),
                                       OutputArrayOfArrays straight_code = noArray()) const;
@@ -78,4 +82,4 @@ class CV_EXPORTS_W_SIMPLE GraphicalCodeDetector {
 
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/modules/objdetect/misc/java/test/BarcodeDetectorTest.java b/modules/objdetect/misc/java/test/BarcodeDetectorTest.java
index 92dfef667a9d..6ed149ca4ba4 100644
--- a/modules/objdetect/misc/java/test/BarcodeDetectorTest.java
+++ b/modules/objdetect/misc/java/test/BarcodeDetectorTest.java
@@ -16,9 +16,14 @@ public class BarcodeDetectorTest extends OpenCVTestCase {
     protected void setUp() throws Exception {
         super.setUp();
 
-        testDataPath = System.getenv(ENV_OPENCV_TEST_DATA_PATH);
-        if (testDataPath == null)
-            throw new Exception(ENV_OPENCV_TEST_DATA_PATH + " has to be defined!");
+        // relys on https://developer.android.com/reference/java/lang/System
+        isTestCaseEnabled = System.getProperties().getProperty("java.vm.name") != "Dalvik";
+
+        if (isTestCaseEnabled) {
+            testDataPath = System.getenv(ENV_OPENCV_TEST_DATA_PATH);
+            if (testDataPath == null)
+                throw new Exception(ENV_OPENCV_TEST_DATA_PATH + " has to be defined!");
+        }
     }
 
     public void testDetectAndDecode() {
diff --git a/modules/objdetect/misc/java/test/QRCodeDetectorTest.java b/modules/objdetect/misc/java/test/QRCodeDetectorTest.java
index 369e7b8cc339..af567cbc04c8 100644
--- a/modules/objdetect/misc/java/test/QRCodeDetectorTest.java
+++ b/modules/objdetect/misc/java/test/QRCodeDetectorTest.java
@@ -19,9 +19,14 @@ public class QRCodeDetectorTest extends OpenCVTestCase {
     protected void setUp() throws Exception {
         super.setUp();
 
-        testDataPath = System.getenv(ENV_OPENCV_TEST_DATA_PATH);
-        if (testDataPath == null)
-            throw new Exception(ENV_OPENCV_TEST_DATA_PATH + " has to be defined!");
+        // relys on https://developer.android.com/reference/java/lang/System
+        isTestCaseEnabled = System.getProperties().getProperty("java.vm.name") != "Dalvik";
+
+        if (isTestCaseEnabled) {
+            testDataPath = System.getenv(ENV_OPENCV_TEST_DATA_PATH);
+            if (testDataPath == null)
+                throw new Exception(ENV_OPENCV_TEST_DATA_PATH + " has to be defined!");
+        }
     }
 
     public void testDetectAndDecode() {
diff --git a/modules/objdetect/misc/python/test/test_objdetect_aruco.py b/modules/objdetect/misc/python/test/test_objdetect_aruco.py
index d63a19cd2fcd..8dd407d32f5a 100644
--- a/modules/objdetect/misc/python/test/test_objdetect_aruco.py
+++ b/modules/objdetect/misc/python/test/test_objdetect_aruco.py
@@ -186,6 +186,39 @@ def test_aruco_detector_refine(self):
         self.assertEqual((1, 4, 2), refine_corners[0].shape)
         np.testing.assert_array_equal(corners, refine_corners)
 
+    def test_charuco_refine(self):
+        aruco_dict = cv.aruco.getPredefinedDictionary(cv.aruco.DICT_6X6_50)
+        board_size = (3, 4)
+        board = cv.aruco.CharucoBoard(board_size, 1., .7, aruco_dict)
+        aruco_detector = cv.aruco.ArucoDetector(aruco_dict)
+        charuco_detector = cv.aruco.CharucoDetector(board)
+        cell_size = 100
+        image = board.generateImage((cell_size*board_size[0], cell_size*board_size[1]))
+        camera = np.array([[1, 0, 0.5],
+                           [0, 1, 0.5],
+                           [0, 0, 1]])
+        dist = np.array([0, 0, 0, 0, 0], dtype=np.float32).reshape(1, -1)
+
+        # generate gold corners of the ArUco markers for the test
+        gold_corners = np.array(board.getObjPoints())[:, :, 0:2]*cell_size
+
+        # detect corners
+        markerCorners, markerIds, _ = aruco_detector.detectMarkers(image)
+
+        # test refine
+        rejected = [markerCorners[-1]]
+        markerCorners, markerIds = markerCorners[:-1], markerIds[:-1]
+        markerCorners, markerIds, _, _ = aruco_detector.refineDetectedMarkers(image, board, markerCorners, markerIds,
+                                                                              rejected, cameraMatrix=camera, distCoeffs=dist)
+
+        charucoCorners, charucoIds, _, _ = charuco_detector.detectBoard(image, markerCorners=markerCorners,
+                                                                        markerIds=markerIds)
+        self.assertEqual(len(charucoIds), 6)
+        self.assertEqual(len(markerIds), 6)
+
+        for i, id in enumerate(markerIds.reshape(-1)):
+            np.testing.assert_allclose(gold_corners[id], markerCorners[i].reshape(4, 2), 0.01, 1.)
+
     def test_write_read_dictionary(self):
         try:
             aruco_dict = cv.aruco.getPredefinedDictionary(cv.aruco.DICT_5X5_50)
@@ -361,5 +394,69 @@ def test_charuco_match_image_points(self):
         self.assertEqual(2, img_points.shape[2])
         np.testing.assert_array_equal(chessboard_corners, obj_points[:, :, :2].reshape(-1, 2))
 
+    def test_draw_detected_markers(self):
+        detected_points = [[[10, 10], [50, 10], [50, 50], [10, 50]]]
+        img = np.zeros((60, 60), dtype=np.uint8)
+
+        # add extra dimension in Python to create Nx4 Mat with 2 channels
+        points1 = np.array(detected_points).reshape(-1, 4, 1, 2)
+        img = cv.aruco.drawDetectedMarkers(img, points1, borderColor=255)
+
+        # check that the marker borders are painted
+        contours, _ = cv.findContours(img, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
+        self.assertEqual(len(contours), 1)
+        self.assertEqual(img[10, 10], 255)
+        self.assertEqual(img[50, 10], 255)
+        self.assertEqual(img[50, 50], 255)
+        self.assertEqual(img[10, 50], 255)
+
+        # must throw Exception without extra dimension
+        points2 = np.array(detected_points)
+        with self.assertRaises(Exception):
+            img = cv.aruco.drawDetectedMarkers(img, points2, borderColor=255)
+
+    def test_draw_detected_charuco(self):
+        detected_points = [[[10, 10], [50, 10], [50, 50], [10, 50]]]
+        img = np.zeros((60, 60), dtype=np.uint8)
+
+        # add extra dimension in Python to create Nx1 Mat with 2 channels
+        points = np.array(detected_points).reshape(-1, 1, 2)
+        img = cv.aruco.drawDetectedCornersCharuco(img, points, cornerColor=255)
+
+        # check that the 4 charuco corners are painted
+        contours, _ = cv.findContours(img, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
+        self.assertEqual(len(contours), 4)
+        for contour in contours:
+            center_x = round(np.average(contour[:, 0, 0]))
+            center_y = round(np.average(contour[:, 0, 1]))
+            center = [center_x, center_y]
+            self.assertTrue(center in detected_points[0])
+
+        # must throw Exception without extra dimension
+        points2 = np.array(detected_points)
+        with self.assertRaises(Exception):
+            img = cv.aruco.drawDetectedCornersCharuco(img, points2, borderColor=255)
+
+    def test_draw_detected_diamonds(self):
+        detected_points = [[[10, 10], [50, 10], [50, 50], [10, 50]]]
+        img = np.zeros((60, 60), dtype=np.uint8)
+
+        # add extra dimension in Python to create Nx4 Mat with 2 channels
+        points = np.array(detected_points).reshape(-1, 4, 1, 2)
+        img = cv.aruco.drawDetectedDiamonds(img, points, borderColor=255)
+
+        # check that the diamonds borders are painted
+        contours, _ = cv.findContours(img, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
+        self.assertEqual(len(contours), 1)
+        self.assertEqual(img[10, 10], 255)
+        self.assertEqual(img[50, 10], 255)
+        self.assertEqual(img[50, 50], 255)
+        self.assertEqual(img[10, 50], 255)
+
+        # must throw Exception without extra dimension
+        points2 = np.array(detected_points)
+        with self.assertRaises(Exception):
+            img = cv.aruco.drawDetectedDiamonds(img, points2, borderColor=255)
+
 if __name__ == '__main__':
     NewOpenCVTests.bootstrap()
diff --git a/modules/objdetect/perf/perf_barcode.cpp b/modules/objdetect/perf/perf_barcode.cpp
index b960518a1ebe..9c464bbd11b1 100644
--- a/modules/objdetect/perf/perf_barcode.cpp
+++ b/modules/objdetect/perf/perf_barcode.cpp
@@ -30,6 +30,7 @@ PERF_TEST_P_(Perf_Barcode_multi, detect)
     }
     SANITY_CHECK_NOTHING();
     ASSERT_TRUE(res);
+    ASSERT_EQ(16ull, corners.size());
 }
 
 PERF_TEST_P_(Perf_Barcode_multi, detect_decode)
@@ -54,6 +55,8 @@ PERF_TEST_P_(Perf_Barcode_multi, detect_decode)
     }
     SANITY_CHECK_NOTHING();
     ASSERT_TRUE(res);
+    ASSERT_EQ(16ull, corners.size());
+    ASSERT_EQ(4ull, decoded_info.size());
 }
 
 PERF_TEST_P_(Perf_Barcode_single, detect)
@@ -76,6 +79,7 @@ PERF_TEST_P_(Perf_Barcode_single, detect)
     }
     SANITY_CHECK_NOTHING();
     ASSERT_TRUE(res);
+    ASSERT_EQ(4ull, corners.size());
 }
 
 PERF_TEST_P_(Perf_Barcode_single, detect_decode)
@@ -100,6 +104,8 @@ PERF_TEST_P_(Perf_Barcode_single, detect_decode)
     }
     SANITY_CHECK_NOTHING();
     ASSERT_TRUE(res);
+    ASSERT_EQ(4ull, corners.size());
+    ASSERT_EQ(1ull, decoded_info.size());
 }
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Perf_Barcode_multi,
diff --git a/modules/objdetect/perf/perf_qrcode_pipeline.cpp b/modules/objdetect/perf/perf_qrcode_pipeline.cpp
index 150ed8cbbee3..3d6360ca48e8 100644
--- a/modules/objdetect/perf/perf_qrcode_pipeline.cpp
+++ b/modules/objdetect/perf/perf_qrcode_pipeline.cpp
@@ -29,7 +29,6 @@ PERF_TEST_P_(Perf_Objdetect_QRCode, detect)
     SANITY_CHECK_NOTHING();
 }
 
-#ifdef HAVE_QUIRC
 PERF_TEST_P_(Perf_Objdetect_QRCode, decode)
 {
     const std::string name_current_image = GetParam();
@@ -52,7 +51,6 @@ PERF_TEST_P_(Perf_Objdetect_QRCode, decode)
     check_qr(root, name_current_image, "test_images", corners, {decoded_info}, pixels_error);
     SANITY_CHECK_NOTHING();
 }
-#endif
 
 typedef ::perf::TestBaseWithParam<std::tuple<std::string, std::string>> Perf_Objdetect_QRCode_Multi;
 
@@ -78,7 +76,6 @@ PERF_TEST_P_(Perf_Objdetect_QRCode_Multi, detectMulti)
     SANITY_CHECK_NOTHING();
 }
 
-#ifdef HAVE_QUIRC
 PERF_TEST_P_(Perf_Objdetect_QRCode_Multi, decodeMulti)
 {
     const std::string name_current_image = get<0>(GetParam());
@@ -116,7 +113,6 @@ PERF_TEST_P_(Perf_Objdetect_QRCode_Multi, decodeMulti)
     check_qr(root, name_current_image, "multiple_images", corners_result, decoded_info, pixels_error, true);
     SANITY_CHECK_NOTHING();
 }
-#endif
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Perf_Objdetect_QRCode,
     ::testing::Values(
@@ -163,7 +159,6 @@ PERF_TEST_P_(Perf_Objdetect_Not_QRCode, detect)
     SANITY_CHECK_NOTHING();
 }
 
-#ifdef HAVE_QUIRC
 PERF_TEST_P_(Perf_Objdetect_Not_QRCode, decode)
 {
     Mat straight_barcode;
@@ -195,7 +190,6 @@ PERF_TEST_P_(Perf_Objdetect_Not_QRCode, decode)
     TEST_CYCLE() ASSERT_TRUE(qrcode.decode(not_qr_code, corners, straight_barcode).empty());
     SANITY_CHECK_NOTHING();
 }
-#endif
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Perf_Objdetect_Not_QRCode,
       ::testing::Combine(
diff --git a/modules/objdetect/src/aruco/apriltag/apriltag_quad_thresh.cpp b/modules/objdetect/src/aruco/apriltag/apriltag_quad_thresh.cpp
index 38d1b2ffb810..96b7fe517e97 100644
--- a/modules/objdetect/src/aruco/apriltag/apriltag_quad_thresh.cpp
+++ b/modules/objdetect/src/aruco/apriltag/apriltag_quad_thresh.cpp
@@ -349,7 +349,7 @@ int quad_segment_maxima(const DetectorParameters &td, int sz, struct line_fit_pt
             }
             y[iy] = acc;
         }
-        copy(y.begin(), y.end(), errs.begin());
+        std::copy(y.begin(), y.end(), errs.begin());
     }
 
     std::vector<int> maxima(sz);
diff --git a/modules/objdetect/src/aruco/aruco_board.cpp b/modules/objdetect/src/aruco/aruco_board.cpp
index 2ad6c5757b0b..f5f274ec0cf1 100644
--- a/modules/objdetect/src/aruco/aruco_board.cpp
+++ b/modules/objdetect/src/aruco/aruco_board.cpp
@@ -3,6 +3,7 @@
 // of this distribution and at http://opencv.org/license.html
 
 #include "../precomp.hpp"
+#include <opencv2/core/utils/logger.hpp>
 #include "opencv2/objdetect/aruco_board.hpp"
 
 #include <opencv2/objdetect/aruco_dictionary.hpp>
@@ -98,21 +99,6 @@ void Board::Impl::generateImage(Size outSize, OutputArray img, int marginSize, i
     float sizeX = maxX - minX;
     float sizeY = maxY - minY;
 
-    // proportion transformations
-    float xReduction = sizeX / float(out.cols);
-    float yReduction = sizeY / float(out.rows);
-
-    // determine the zone where the markers are placed
-    if(xReduction > yReduction) {
-        int nRows = int(sizeY / xReduction);
-        int rowsMargins = (out.rows - nRows) / 2;
-        out.adjustROI(-rowsMargins, -rowsMargins, 0, 0);
-    } else {
-        int nCols = int(sizeX / yReduction);
-        int colsMargins = (out.cols - nCols) / 2;
-        out.adjustROI(0, 0, -colsMargins, -colsMargins);
-    }
-
     // now paint each marker
     Mat marker;
     Point2f outCorners[3];
@@ -250,7 +236,11 @@ GridBoard::GridBoard() {}
 GridBoard::GridBoard(const Size& size, float markerLength, float markerSeparation,
                      const Dictionary &dictionary, InputArray ids):
     Board(new GridBoardImpl(dictionary, size, markerLength, markerSeparation)) {
-
+    float onePin = markerLength / ((float)(dictionary.markerSize+2));
+    if (markerSeparation < onePin*.7f) {
+        CV_LOG_WARNING(NULL, "Marker border " << markerSeparation << " is less than 70% of ArUco pin size "
+            << onePin << ". Please increase markerSeparation or decrease markerLength for stable board detection");
+    }
     size_t totalMarkers = (size_t) size.width*size.height;
     CV_Assert(ids.empty() || totalMarkers == ids.total());
     vector<vector<Point3f> > objPoints;
@@ -319,8 +309,9 @@ struct CharucoBoardImpl : Board::Impl {
     // vector of chessboard 3D corners precalculated
     std::vector<Point3f> chessboardCorners;
 
-    // for each charuco corner, nearest marker id and nearest marker corner id of each marker
+    // for each charuco corner, nearest marker index in ids array
     std::vector<std::vector<int> > nearestMarkerIdx;
+    // for each charuco corner, nearest marker corner id of each marker
     std::vector<std::vector<int> > nearestMarkerCorners;
 
     void createCharucoBoard();
@@ -477,39 +468,44 @@ void CharucoBoardImpl::generateImage(Size outSize, OutputArray img, int marginSi
     Mat noMarginsImg =
         out.colRange(marginSize, out.cols - marginSize).rowRange(marginSize, out.rows - marginSize);
 
-    double totalLengthX, totalLengthY;
-    totalLengthX = squareLength * size.width;
-    totalLengthY = squareLength * size.height;
-
-    // proportional transformation
-    double xReduction = totalLengthX / double(noMarginsImg.cols);
-    double yReduction = totalLengthY / double(noMarginsImg.rows);
+    // the size of the chessboard square depends on the location of the chessboard
+    float pixInSquare = 0.f;
+    // the size of the chessboard in pixels
+    Size pixInChessboard(noMarginsImg.cols, noMarginsImg.rows);
 
     // determine the zone where the chessboard is placed
-    Mat chessboardZoneImg;
-    if(xReduction > yReduction) {
-        int nRows = int(totalLengthY / xReduction);
-        int rowsMargins = (noMarginsImg.rows - nRows) / 2;
-        chessboardZoneImg = noMarginsImg.rowRange(rowsMargins, noMarginsImg.rows - rowsMargins);
-    } else {
-        int nCols = int(totalLengthX / yReduction);
-        int colsMargins = (noMarginsImg.cols - nCols) / 2;
-        chessboardZoneImg = noMarginsImg.colRange(colsMargins, noMarginsImg.cols - colsMargins);
+    float pixInSquareX = (float)noMarginsImg.cols / (float)size.width;
+    float pixInSquareY = (float)noMarginsImg.rows / (float)size.height;
+    Point startChessboard(0, 0);
+    if (pixInSquareX <= pixInSquareY) {
+        // the width of "noMarginsImg" image determines the dimensions of the chessboard
+        pixInSquare = pixInSquareX;
+        pixInChessboard.height = cvRound(pixInSquare*size.height);
+        int rowsMargin = (noMarginsImg.rows - pixInChessboard.height) / 2;
+        startChessboard.y = rowsMargin;
+    }
+    else {
+        // the height of "noMarginsImg" image determines the dimensions of the chessboard
+        pixInSquare = pixInSquareY;
+        pixInChessboard.width = cvRound(pixInSquare*size.width);
+        int colsMargin = (noMarginsImg.cols - pixInChessboard.width) / 2;
+        startChessboard.x = colsMargin;
     }
+    // determine the zone where the chessboard is located
+    Mat chessboardZoneImg = noMarginsImg(Rect(startChessboard, pixInChessboard));
 
-    // determine the margins to draw only the markers
-    // take the minimum just to be sure
-    double squareSizePixels = min(double(chessboardZoneImg.cols) / double(size.width),
-                                  double(chessboardZoneImg.rows) / double(size.height));
+    // marker size in pixels
+    const float pixInMarker = markerLength/squareLength*pixInSquare;
+    // the size of the marker margin in pixels
+    const float pixInMarginMarker = 0.5f*(pixInSquare - pixInMarker);
 
-    double diffSquareMarkerLength = (squareLength - markerLength) / 2;
-    int diffSquareMarkerLengthPixels =
-        int(diffSquareMarkerLength * squareSizePixels / squareLength);
+    // determine the zone where the aruco markers are located
+    int endArucoX = cvRound(pixInSquare*(size.width-1)+pixInMarginMarker+pixInMarker);
+    int endArucoY = cvRound(pixInSquare*(size.height-1)+pixInMarginMarker+pixInMarker);
+    Mat arucoZone = chessboardZoneImg(Range(cvRound(pixInMarginMarker), endArucoY), Range(cvRound(pixInMarginMarker), endArucoX));
 
     // draw markers
-    Mat markersImg;
-    Board::Impl::generateImage(chessboardZoneImg.size(), markersImg, diffSquareMarkerLengthPixels, borderBits);
-    markersImg.copyTo(chessboardZoneImg);
+    Board::Impl::generateImage(arucoZone.size(), arucoZone, 0, borderBits);
 
     // now draw black squares
     for(int y = 0; y < size.height; y++) {
@@ -521,12 +517,11 @@ void CharucoBoardImpl::generateImage(Size outSize, OutputArray img, int marginSi
                 if(y % 2 != x % 2) continue; // white corner, dont do anything
             }
 
-            double startX, startY;
-            startX = squareSizePixels * double(x);
-            startY = squareSizePixels * double(y);
+            float startX = pixInSquare * float(x);
+            float startY = pixInSquare * float(y);
 
-            Mat squareZone = chessboardZoneImg.rowRange(int(startY), int(startY + squareSizePixels))
-                                 .colRange(int(startX), int(startX + squareSizePixels));
+            Mat squareZone = chessboardZoneImg(Range(cvRound(startY), cvRound(startY + pixInSquare)),
+                                               Range(cvRound(startX), cvRound(startX + pixInSquare)));
 
             squareZone.setTo(0);
         }
@@ -540,7 +535,12 @@ CharucoBoard::CharucoBoard(const Size& size, float squareLength, float markerLen
     Board(new CharucoBoardImpl(dictionary, size, squareLength, markerLength)) {
 
     CV_Assert(size.width > 1 && size.height > 1 && markerLength > 0 && squareLength > markerLength);
-
+    float onePin = markerLength / ((float)(dictionary.markerSize+2));
+    float markerSeparation = (squareLength - markerLength)/2.f;
+    if (markerSeparation < onePin*.7f) {
+        CV_LOG_WARNING(NULL, "Marker border " << markerSeparation << " is less than 70% of ArUco pin size "
+            << onePin <<". Please increase markerSeparation or decrease markerLength for stable board detection");
+    }
     ids.copyTo(impl->ids);
 
     static_pointer_cast<CharucoBoardImpl>(impl)->createCharucoBoard();
diff --git a/modules/objdetect/src/aruco/aruco_detector.cpp b/modules/objdetect/src/aruco/aruco_detector.cpp
index 395bb493388c..a1b04c81810e 100644
--- a/modules/objdetect/src/aruco/aruco_detector.cpp
+++ b/modules/objdetect/src/aruco/aruco_detector.cpp
@@ -35,6 +35,8 @@ static inline bool readWrite(DetectorParameters &params, const FileNode* readNod
     check |= readWriteParameter("minMarkerDistanceRate", params.minMarkerDistanceRate, readNode, writeStorage);
     check |= readWriteParameter("cornerRefinementMethod", params.cornerRefinementMethod, readNode, writeStorage);
     check |= readWriteParameter("cornerRefinementWinSize", params.cornerRefinementWinSize, readNode, writeStorage);
+    check |= readWriteParameter("relativeCornerRefinmentWinSize", params.relativeCornerRefinmentWinSize, readNode,
+                                writeStorage);
     check |= readWriteParameter("cornerRefinementMaxIterations", params.cornerRefinementMaxIterations,
                                 readNode, writeStorage);
     check |= readWriteParameter("cornerRefinementMinAccuracy", params.cornerRefinementMinAccuracy,
@@ -48,6 +50,7 @@ static inline bool readWrite(DetectorParameters &params, const FileNode* readNod
                                 readNode, writeStorage);
     check |= readWriteParameter("minOtsuStdDev", params.minOtsuStdDev, readNode, writeStorage);
     check |= readWriteParameter("errorCorrectionRate", params.errorCorrectionRate, readNode, writeStorage);
+    check |= readWriteParameter("minGroupDistance", params.minGroupDistance, readNode, writeStorage);
     // new aruco 3 functionality
     check |= readWriteParameter("useAruco3Detection", params.useAruco3Detection, readNode, writeStorage);
     check |= readWriteParameter("minSideLengthCanonicalImg", params.minSideLengthCanonicalImg, readNode, writeStorage);
@@ -142,10 +145,8 @@ static void _findMarkerContours(const Mat &in, vector<vector<Point2f> > &candida
         minPerimeterPixels = 4*minSize;
     }
 
-    Mat contoursImg;
-    in.copyTo(contoursImg);
     vector<vector<Point> > contours;
-    findContours(contoursImg, contours, RETR_LIST, CHAIN_APPROX_NONE);
+    findContours(in, contours, RETR_LIST, CHAIN_APPROX_NONE);
     // now filter list of contours
     for(unsigned int i = 0; i < contours.size(); i++) {
         // check perimeter
@@ -158,8 +159,7 @@ static void _findMarkerContours(const Mat &in, vector<vector<Point2f> > &candida
         if(approxCurve.size() != 4 || !isContourConvex(approxCurve)) continue;
 
         // check min distance between corners
-        double minDistSq =
-            max(contoursImg.cols, contoursImg.rows) * max(contoursImg.cols, contoursImg.rows);
+        double minDistSq = max(in.cols, in.rows) * max(in.cols, in.rows);
         for(int j = 0; j < 4; j++) {
             double d = (double)(approxCurve[j].x - approxCurve[(j + 1) % 4].x) *
                            (double)(approxCurve[j].x - approxCurve[(j + 1) % 4].x) +
@@ -174,9 +174,9 @@ static void _findMarkerContours(const Mat &in, vector<vector<Point2f> > &candida
         bool tooNearBorder = false;
         for(int j = 0; j < 4; j++) {
             if(approxCurve[j].x < minDistanceToBorder || approxCurve[j].y < minDistanceToBorder ||
-               approxCurve[j].x > contoursImg.cols - 1 - minDistanceToBorder ||
-               approxCurve[j].y > contoursImg.rows - 1 - minDistanceToBorder)
-                tooNearBorder = true;
+               approxCurve[j].x > in.cols - 1 - minDistanceToBorder ||
+               approxCurve[j].y > in.rows - 1 - minDistanceToBorder)
+               tooNearBorder = true;
         }
         if(tooNearBorder) continue;
 
@@ -210,149 +210,65 @@ static void _reorderCandidatesCorners(vector<vector<Point2f> > &candidates) {
     }
 }
 
-/**
-  * @brief to make sure that the corner's order of both candidates (default/white) is the same
-  */
-static vector<Point2f> alignContourOrder(Point2f corner, vector<Point2f> candidate) {
-    uint8_t r=0;
-    double min = norm( Vec2f( corner - candidate[0] ), NORM_L2SQR);
-    for(uint8_t pos=1; pos < 4; pos++) {
-        double nDiff = norm( Vec2f( corner - candidate[pos] ), NORM_L2SQR);
-        if(nDiff < min){
-            r = pos;
-            min =nDiff;
-        }
+static float getAverageModuleSize(const vector<Point2f>& markerCorners, int markerSize, int markerBorderBits) {
+    float averageArucoModuleSize = 0.f;
+    for (size_t i = 0ull; i < 4ull; i++) {
+        averageArucoModuleSize += sqrt(normL2Sqr<float>(Point2f(markerCorners[i] - markerCorners[(i+1ull) % 4ull])));
     }
-    std::rotate(candidate.begin(), candidate.begin() + r, candidate.end());
-    return candidate;
+    int numModules = markerSize + markerBorderBits * 2;
+    averageArucoModuleSize /= ((float)markerCorners.size()*numModules);
+    return averageArucoModuleSize;
 }
 
-/**
-  * @brief Check candidates that are too close to each other, save the potential candidates
-  *        (i.e. biggest/smallest contour) and remove the rest
-  */
-static void _filterTooCloseCandidates(const vector<vector<Point2f> > &candidatesIn,
-                                      vector<vector<vector<Point2f> > > &candidatesSetOut,
-                                      const vector<vector<Point> > &contoursIn,
-                                      vector<vector<vector<Point> > > &contoursSetOut,
-                                      double minMarkerDistanceRate, bool detectInvertedMarker) {
-
-    CV_Assert(minMarkerDistanceRate >= 0);
-    vector<int> candGroup;
-    candGroup.resize(candidatesIn.size(), -1);
-    vector<vector<unsigned int> > groupedCandidates;
-    for(unsigned int i = 0; i < candidatesIn.size(); i++) {
-        bool isSingleContour = true;
-        for(unsigned int j = i + 1; j < candidatesIn.size(); j++) {
-
-            int minimumPerimeter = min((int)contoursIn[i].size(), (int)contoursIn[j].size() );
-
-            // fc is the first corner considered on one of the markers, 4 combinations are possible
-            for(int fc = 0; fc < 4; fc++) {
-                double distSq = 0;
-                for(int c = 0; c < 4; c++) {
-                    // modC is the corner considering first corner is fc
-                    int modC = (c + fc) % 4;
-                    distSq += (candidatesIn[i][modC].x - candidatesIn[j][c].x) *
-                                  (candidatesIn[i][modC].x - candidatesIn[j][c].x) +
-                              (candidatesIn[i][modC].y - candidatesIn[j][c].y) *
-                                  (candidatesIn[i][modC].y - candidatesIn[j][c].y);
-                }
-                distSq /= 4.;
+static bool checkMarker1InMarker2(const vector<Point2f>& marker1, const vector<Point2f>& marker2) {
+    return pointPolygonTest(marker2, marker1[0], false) >= 0 && pointPolygonTest(marker2, marker1[1], false) >= 0 &&
+           pointPolygonTest(marker2, marker1[2], false) >= 0 && pointPolygonTest(marker2, marker1[3], false) >= 0;
+}
 
-                // if mean square distance is too low, remove the smaller one of the two markers
-                double minMarkerDistancePixels = double(minimumPerimeter) * minMarkerDistanceRate;
-                if(distSq < minMarkerDistancePixels * minMarkerDistancePixels) {
-                    isSingleContour = false;
-                    // i and j are not related to a group
-                    if(candGroup[i]<0 && candGroup[j]<0){
-                        // mark candidates with their corresponding group number
-                        candGroup[i] = candGroup[j] = (int)groupedCandidates.size();
+struct MarkerCandidate {
+    vector<Point2f> corners;
+    vector<Point> contour;
+    float perimeter = 0.f;
+};
 
-                        // create group
-                        vector<unsigned int> grouped;
-                        grouped.push_back(i);
-                        grouped.push_back(j);
-                        groupedCandidates.push_back( grouped );
-                    }
-                    // i is related to a group
-                    else if(candGroup[i] > -1 && candGroup[j] == -1){
-                        int group = candGroup[i];
-                        candGroup[j] = group;
+struct MarkerCandidateTree : MarkerCandidate{
+    int parent = -1;
+    int depth = 0;
+    vector<MarkerCandidate> closeContours;
 
-                        // add to group
-                        groupedCandidates[group].push_back( j );
-                    }
-                    // j is related to a group
-                    else if(candGroup[j] > -1 && candGroup[i] == -1){
-                        int group = candGroup[j];
-                        candGroup[i] = group;
+    MarkerCandidateTree() {}
 
-                        // add to group
-                        groupedCandidates[group].push_back( i );
-                    }
-                }
-            }
-        }
-        if (isSingleContour && candGroup[i] < 0)
-        {
-            candGroup[i] = (int)groupedCandidates.size();
-            vector<unsigned int> grouped;
-            grouped.push_back(i);
-            grouped.push_back(i); // step "save possible candidates" require minimum 2 elements
-            groupedCandidates.push_back(grouped);
+    MarkerCandidateTree(vector<Point2f>&& corners_, vector<Point>&& contour_) {
+        corners = std::move(corners_);
+        contour = std::move(contour_);
+        perimeter = 0.f;
+        for (size_t i = 0ull; i < 4ull; i++) {
+            perimeter += sqrt(normL2Sqr<float>(Point2f(corners[i] - corners[(i+1ull) % 4ull])));
         }
     }
 
-    // save possible candidates
-    candidatesSetOut.clear();
-    contoursSetOut.clear();
-
-    vector<vector<Point2f> > biggerCandidates;
-    vector<vector<Point> > biggerContours;
-    vector<vector<Point2f> > smallerCandidates;
-    vector<vector<Point> > smallerContours;
-
-    // save possible candidates
-    for(unsigned int i = 0; i < groupedCandidates.size(); i++) {
-        unsigned int smallerIdx = groupedCandidates[i][0];
-        unsigned int biggerIdx = smallerIdx;
-        double smallerArea = contourArea(candidatesIn[smallerIdx]);
-        double biggerArea = smallerArea;
-
-        // evaluate group elements
-        for(unsigned int j = 1; j < groupedCandidates[i].size(); j++) {
-            unsigned int currIdx = groupedCandidates[i][j];
-            double currArea = contourArea(candidatesIn[currIdx]);
-
-            // check if current contour is bigger
-            if(currArea >= biggerArea) {
-                biggerIdx = currIdx;
-                biggerArea = currArea;
-            }
+    bool operator<(const MarkerCandidateTree& m) const {
+        // sorting the contors in descending order
+        return perimeter > m.perimeter;
+    }
+};
 
-            // check if current contour is smaller
-            if(currArea < smallerArea && detectInvertedMarker) {
-                smallerIdx = currIdx;
-                smallerArea = currArea;
-            }
-        }
 
-        // add contours and candidates
-        biggerCandidates.push_back(candidatesIn[biggerIdx]);
-        biggerContours.push_back(contoursIn[biggerIdx]);
-        if(detectInvertedMarker) {
-            smallerCandidates.push_back(alignContourOrder(candidatesIn[biggerIdx][0], candidatesIn[smallerIdx]));
-            smallerContours.push_back(contoursIn[smallerIdx]);
+// returns the average distance between the marker points
+float static inline getAverageDistance(const std::vector<Point2f>& marker1, const std::vector<Point2f>& marker2) {
+    float minDistSq = std::numeric_limits<float>::max();
+    // fc is the first corner considered on one of the markers, 4 combinations are possible
+    for(int fc = 0; fc < 4; fc++) {
+        float distSq = 0;
+        for(int c = 0; c < 4; c++) {
+            // modC is the corner considering first corner is fc
+            int modC = (c + fc) % 4;
+            distSq += normL2Sqr<float>(marker1[modC] - marker2[c]);
         }
+        distSq /= 4.f;
+        minDistSq = min(minDistSq, distSq);
     }
-    // to preserve the structure :: candidateSet< defaultCandidates, whiteCandidates >
-    // default candidates
-    candidatesSetOut.push_back(biggerCandidates);
-    contoursSetOut.push_back(biggerContours);
-    // white candidates
-    candidatesSetOut.push_back(smallerCandidates);
-    contoursSetOut.push_back(smallerContours);
+    return sqrt(minDistSq);
 }
 
 /**
@@ -401,29 +317,6 @@ static void _detectInitialCandidates(const Mat &grey, vector<vector<Point2f> > &
 }
 
 
-/**
- * @brief Detect square candidates in the input image
- */
-static void _detectCandidates(InputArray _grayImage, vector<vector<vector<Point2f> > >& candidatesSetOut,
-                              vector<vector<vector<Point> > >& contoursSetOut, const DetectorParameters &_params) {
-    Mat grey = _grayImage.getMat();
-    CV_DbgAssert(grey.total() != 0);
-    CV_DbgAssert(grey.type() == CV_8UC1);
-
-    /// 1. DETECT FIRST SET OF CANDIDATES
-    vector<vector<Point2f> > candidates;
-    vector<vector<Point> > contours;
-    _detectInitialCandidates(grey, candidates, contours, _params);
-    /// 2. SORT CORNERS
-    _reorderCandidatesCorners(candidates);
-
-    /// 3. FILTER OUT NEAR CANDIDATE PAIRS
-    // save the outter/inner border (i.e. potential candidates)
-    _filterTooCloseCandidates(candidates, candidatesSetOut, contours, contoursSetOut,
-                              _params.minMarkerDistanceRate, _params.detectInvertedMarker);
-}
-
-
 /**
   * @brief Given an input image and candidate corners, extract the bits of the candidate, including
   * the border bits
@@ -525,12 +418,10 @@ static int _getBorderErrors(const Mat &bits, int markerSize, int borderSize) {
  *                           1 if the candidate is a black candidate (default candidate)
  *                           2 if the candidate is a white candidate
  */
-static uint8_t _identifyOneCandidate(const Dictionary& dictionary, InputArray _image,
+static uint8_t _identifyOneCandidate(const Dictionary& dictionary, const Mat& _image,
                                      const vector<Point2f>& _corners, int& idx,
                                      const DetectorParameters& params, int& rotation,
                                      const float scale = 1.f) {
-    CV_DbgAssert(_corners.size() == 4);
-    CV_DbgAssert(_image.getMat().total() != 0);
     CV_DbgAssert(params.markerBorderBits > 0);
     uint8_t typ=1;
     // get bits
@@ -608,91 +499,10 @@ static size_t _findOptPyrImageForCanonicalImg(
     return optLevel;
 }
 
-/**
- * @brief Identify square candidates according to a marker dictionary
- */
-
-static void _identifyCandidates(InputArray grey,
-                                const vector<Mat>& image_pyr,
-                                vector<vector<vector<Point2f> > >& _candidatesSet,
-                                vector<vector<vector<Point> > >& _contoursSet, const Dictionary &_dictionary,
-                                vector<vector<Point2f> >& _accepted, vector<vector<Point> >& _contours, vector<int>& ids,
-                                const DetectorParameters &params,
-                                OutputArrayOfArrays _rejected = noArray()) {
-    CV_DbgAssert(grey.getMat().total() != 0);
-    CV_DbgAssert(grey.getMat().type() == CV_8UC1);
-    int ncandidates = (int)_candidatesSet[0].size();
-    vector<vector<Point2f> > accepted;
-    vector<vector<Point2f> > rejected;
-    vector<vector<Point> > contours;
-
-    vector<int> idsTmp(ncandidates, -1);
-    vector<int> rotated(ncandidates, 0);
-    vector<uint8_t> validCandidates(ncandidates, 0);
-
-    //// Analyze each of the candidates
-    parallel_for_(Range(0, ncandidates), [&](const Range &range) {
-        const int begin = range.start;
-        const int end = range.end;
-
-        vector<vector<Point2f> >& candidates = params.detectInvertedMarker ? _candidatesSet[1] : _candidatesSet[0];
-        vector<vector<Point> >& contourS = params.detectInvertedMarker ? _contoursSet[1] : _contoursSet[0];
-
-        for(int i = begin; i < end; i++) {
-            int currId = -1;
-            // implements equation (4)
-            if (params.useAruco3Detection) {
-                const int perimeterOfContour = static_cast<int>(contourS[i].size());
-                const int min_perimeter = params.minSideLengthCanonicalImg * 4;
-                const size_t nearestImgId = _findOptPyrImageForCanonicalImg(image_pyr, grey.cols(), perimeterOfContour, min_perimeter);
-                const float scale = image_pyr[nearestImgId].cols / static_cast<float>(grey.cols());
-
-                validCandidates[i] = _identifyOneCandidate(_dictionary, image_pyr[nearestImgId], candidates[i], currId, params, rotated[i], scale);
-            }
-            else {
-                validCandidates[i] = _identifyOneCandidate(_dictionary, grey, candidates[i], currId, params, rotated[i]);
-            }
-
-            if(validCandidates[i] > 0)
-                idsTmp[i] = currId;
-        }
-    });
-
-    for(int i = 0; i < ncandidates; i++) {
-        if(validCandidates[i] > 0) {
-            // to choose the right set of candidates :: 0 for default, 1 for white markers
-            uint8_t set = validCandidates[i]-1;
-
-            // shift corner positions to the correct rotation
-            correctCornerPosition(_candidatesSet[set][i], rotated[i]);
-
-            if( !params.detectInvertedMarker && validCandidates[i] == 2 )
-                continue;
-
-            // add valid candidate
-            accepted.push_back(_candidatesSet[set][i]);
-            ids.push_back(idsTmp[i]);
-
-            contours.push_back(_contoursSet[set][i]);
-
-        } else {
-            rejected.push_back(_candidatesSet[0][i]);
-        }
-    }
-
-    // parse output
-    _accepted = accepted;
-
-    _contours= contours;
-
-    if(_rejected.needed()) {
-        _copyVector2Output(rejected, _rejected);
-    }
-}
 
 /**
  * Line fitting  A * B = C :: Called from function refineCandidateLines
- * @param nContours, contour-container
+ * @param nContours contour-container
  */
 static Point3f _interpolate2Dline(const vector<Point2f>& nContours){
     CV_Assert(nContours.size() >= 2);
@@ -748,10 +558,8 @@ static Point2f _getCrossPoint(Point3f nLine1, Point3f nLine2){
 
 /**
  * Refine Corners using the contour vector :: Called from function detectMarkers
- * @param nContours, contour-container
- * @param nCorners, candidate Corners
- * @param camMatrix, cameraMatrix input 3x3 floating-point camera matrix
- * @param distCoeff, distCoeffs vector of distortion coefficient
+ * @param nContours contour-container
+ * @param nCorners candidate Corners
  */
 static void _refineCandidateLines(vector<Point>& nContours, vector<Point2f>& nCorners){
     vector<Point2f> contour2f(nContours.begin(), nContours.end());
@@ -846,6 +654,210 @@ struct ArucoDetector::ArucoDetectorImpl {
     ArucoDetectorImpl(const Dictionary &_dictionary, const DetectorParameters &_detectorParams,
                       const RefineParameters& _refineParams): dictionary(_dictionary),
                       detectorParams(_detectorParams), refineParams(_refineParams) {}
+    /**
+     * @brief Detect square candidates in the input image
+     */
+    void detectCandidates(const Mat& grey, vector<vector<Point2f> >& candidates, vector<vector<Point> >& contours) {
+        /// 1. DETECT FIRST SET OF CANDIDATES
+        _detectInitialCandidates(grey, candidates, contours, detectorParams);
+        /// 2. SORT CORNERS
+        _reorderCandidatesCorners(candidates);
+    }
+
+    /**
+     * @brief  FILTER OUT NEAR CANDIDATE PAIRS
+     *
+     * save the outter/inner border (i.e. potential candidates) to vector<MarkerCandidateTree>,
+     * clear candidates and contours
+     */
+    vector<MarkerCandidateTree>
+    filterTooCloseCandidates(vector<vector<Point2f> > &candidates, vector<vector<Point> > &contours) {
+        CV_Assert(detectorParams.minMarkerDistanceRate >= 0.);
+        vector<MarkerCandidateTree> candidateTree(candidates.size());
+        for(size_t i = 0ull; i < candidates.size(); i++) {
+            candidateTree[i] = MarkerCandidateTree(std::move(candidates[i]), std::move(contours[i]));
+        }
+        candidates.clear();
+        contours.clear();
+
+        // sort candidates from big to small
+        std::stable_sort(candidateTree.begin(), candidateTree.end());
+        // group index for each candidate
+        vector<int> groupId(candidateTree.size(), -1);
+        vector<vector<size_t> > groupedCandidates;
+        vector<bool> isSelectedContours(candidateTree.size(), true);
+
+        size_t countSelectedContours = 0ull;
+        for (size_t i = 0ull; i < candidateTree.size(); i++) {
+            for (size_t j = i + 1ull; j < candidateTree.size(); j++) {
+                float minDist = getAverageDistance(candidateTree[i].corners, candidateTree[j].corners);
+                // if mean distance is too low, group markers
+                // the distance between the points of two independent markers should be more than half the side of the marker
+                // half the side of the marker = (perimeter / 4) * 0.5 = perimeter * 0.125
+                if(minDist < candidateTree[j].perimeter*(float)detectorParams.minMarkerDistanceRate) {
+                    isSelectedContours[i] = false;
+                    isSelectedContours[j] = false;
+                    // i and j are not related to a group
+                    if(groupId[i] < 0 && groupId[j] < 0){
+                        // mark candidates with their corresponding group number
+                        groupId[i] = groupId[j] = (int)groupedCandidates.size();
+                        // create group
+                        groupedCandidates.push_back({i, j});
+                    }
+                    // i is related to a group
+                    else if(groupId[i] > -1 && groupId[j] == -1) {
+                        int group = groupId[i];
+                        groupId[j] = group;
+                        // add to group
+                        groupedCandidates[group].push_back(j);
+                    }
+                    // j is related to a group
+                    else if(groupId[j] > -1 && groupId[i] == -1) {
+                        int group = groupId[j];
+                        groupId[i] = group;
+                        // add to group
+                        groupedCandidates[group].push_back(i);
+                    }
+                }
+            }
+            countSelectedContours += isSelectedContours[i];
+        }
+
+        for (vector<size_t>& grouped : groupedCandidates) {
+            if (detectorParams.detectInvertedMarker) // if detectInvertedMarker choose smallest contours
+                std::stable_sort(grouped.begin(), grouped.end(), [](const size_t &a, const size_t &b) {
+                    return a > b;
+                });
+            else // if detectInvertedMarker==false choose largest contours
+                std::stable_sort(grouped.begin(), grouped.end());
+            size_t currId = grouped[0];
+            isSelectedContours[currId] = true;
+            for (size_t i = 1ull; i < grouped.size(); i++) {
+                size_t id = grouped[i];
+                float dist = getAverageDistance(candidateTree[id].corners, candidateTree[currId].corners);
+                float moduleSize = getAverageModuleSize(candidateTree[id].corners, dictionary.markerSize, detectorParams.markerBorderBits);
+                if (dist > detectorParams.minGroupDistance*moduleSize) {
+                    currId = id;
+                    candidateTree[grouped[0]].closeContours.push_back(candidateTree[id]);
+                }
+            }
+        }
+
+        vector<MarkerCandidateTree> selectedCandidates(countSelectedContours + groupedCandidates.size());
+        countSelectedContours = 0ull;
+        for (size_t i = 0ull; i < candidateTree.size(); i++) {
+            if (isSelectedContours[i]) {
+                selectedCandidates[countSelectedContours] = std::move(candidateTree[i]);
+                countSelectedContours++;
+            }
+        }
+
+        // find hierarchy in the candidate tree
+        for (int i = (int)selectedCandidates.size()-1; i >= 0; i--) {
+            for (int j = i - 1; j >= 0; j--) {
+                if (checkMarker1InMarker2(selectedCandidates[i].corners, selectedCandidates[j].corners)) {
+                    selectedCandidates[i].parent = j;
+                    selectedCandidates[j].depth = max(selectedCandidates[j].depth, selectedCandidates[i].depth + 1);
+                    break;
+                }
+            }
+        }
+        return selectedCandidates;
+    }
+
+    /**
+     * @brief Identify square candidates according to a marker dictionary
+     */
+    void identifyCandidates(const Mat& grey, const vector<Mat>& image_pyr, vector<MarkerCandidateTree>& selectedContours,
+                            vector<vector<Point2f> >& accepted, vector<vector<Point> >& contours,
+                            vector<int>& ids, OutputArrayOfArrays _rejected = noArray()) {
+        size_t ncandidates = selectedContours.size();
+        vector<vector<Point2f> > rejected;
+
+        vector<int> idsTmp(ncandidates, -1);
+        vector<int> rotated(ncandidates, 0);
+        vector<uint8_t> validCandidates(ncandidates, 0);
+        vector<uint8_t> was(ncandidates, false);
+        bool checkCloseContours = true;
+
+        int maxDepth = 0;
+        for (size_t i = 0ull; i < selectedContours.size(); i++)
+            maxDepth = max(selectedContours[i].depth, maxDepth);
+        vector<vector<size_t>> depths(maxDepth+1);
+        for (size_t i = 0ull; i < selectedContours.size(); i++) {
+            depths[selectedContours[i].depth].push_back(i);
+        }
+
+        //// Analyze each of the candidates
+        int depth = 0;
+        size_t counter = 0;
+        while (counter < ncandidates) {
+            parallel_for_(Range(0, (int)depths[depth].size()), [&](const Range& range) {
+                const int begin = range.start;
+                const int end = range.end;
+                for (int i = begin; i < end; i++) {
+                    size_t v = depths[depth][i];
+                    was[v] = true;
+                    Mat img = grey;
+                    // implements equation (4)
+                    if (detectorParams.useAruco3Detection) {
+                        const int minPerimeter = detectorParams.minSideLengthCanonicalImg * 4;
+                        const size_t nearestImgId = _findOptPyrImageForCanonicalImg(image_pyr, grey.cols, static_cast<int>(selectedContours[v].contour.size()), minPerimeter);
+                        img = image_pyr[nearestImgId];
+                    }
+                    const float scale = detectorParams.useAruco3Detection ? img.cols / static_cast<float>(grey.cols) : 1.f;
+
+                    validCandidates[v] = _identifyOneCandidate(dictionary, img, selectedContours[v].corners, idsTmp[v], detectorParams, rotated[v], scale);
+
+                    if (validCandidates[v] == 0 && checkCloseContours) {
+                        for (const MarkerCandidate& closeMarkerCandidate: selectedContours[v].closeContours) {
+                            validCandidates[v] = _identifyOneCandidate(dictionary, img, closeMarkerCandidate.corners, idsTmp[v], detectorParams, rotated[v], scale);
+                            if (validCandidates[v] > 0) {
+                                selectedContours[v].corners = closeMarkerCandidate.corners;
+                                selectedContours[v].contour = closeMarkerCandidate.contour;
+                                break;
+                            }
+                        }
+                    }
+                }
+            });
+
+            // visit the parent vertices of the detected markers to skip identify parent contours
+            for(size_t v : depths[depth]) {
+                if(validCandidates[v] > 0) {
+                    int parent = selectedContours[v].parent;
+                    while (parent != -1) {
+                        if (!was[parent]) {
+                            was[parent] = true;
+                            counter++;
+                        }
+                        parent = selectedContours[parent].parent;
+                    }
+                }
+                counter++;
+            }
+            depth++;
+        }
+
+        for (size_t i = 0ull; i < selectedContours.size(); i++) {
+            if (validCandidates[i] > 0) {
+                    // shift corner positions to the correct rotation
+                    correctCornerPosition(selectedContours[i].corners, rotated[i]);
+
+                    accepted.push_back(selectedContours[i].corners);
+                    contours.push_back(selectedContours[i].contour);
+                    ids.push_back(idsTmp[i]);
+            }
+            else {
+                rejected.push_back(selectedContours[i].corners);
+            }
+        }
+
+        // parse output
+        if(_rejected.needed()) {
+            _copyVector2Output(rejected, _rejected);
+        }
+    }
 
 };
 
@@ -868,7 +880,7 @@ void ArucoDetector::detectMarkers(InputArray _image, OutputArrayOfArrays _corner
                 detectorParams.minMarkerLengthRatioOriginalImg == 0.0));
 
     Mat grey;
-    _convertToGrey(_image.getMat(), grey);
+    _convertToGrey(_image, grey);
 
     // Aruco3 functionality is the extension of Aruco.
     // The description can be found in:
@@ -919,23 +931,21 @@ void ArucoDetector::detectMarkers(InputArray _image, OutputArrayOfArrays _corner
     vector<vector<Point> > contours;
     vector<int> ids;
 
-    vector<vector<vector<Point2f> > > candidatesSet;
-    vector<vector<vector<Point> > > contoursSet;
-
     /// STEP 2.a Detect marker candidates :: using AprilTag
     if(detectorParams.cornerRefinementMethod == (int)CORNER_REFINE_APRILTAG){
         _apriltag(grey, detectorParams, candidates, contours);
-
-        candidatesSet.push_back(candidates);
-        contoursSet.push_back(contours);
     }
     /// STEP 2.b Detect marker candidates :: traditional way
-    else
-        _detectCandidates(grey, candidatesSet, contoursSet, detectorParams);
+    else {
+        arucoDetectorImpl->detectCandidates(grey, candidates, contours);
+    }
+
+     /// STEP 2.c FILTER OUT NEAR CANDIDATE PAIRS
+    auto selectedCandidates = arucoDetectorImpl->filterTooCloseCandidates(candidates, contours);
 
     /// STEP 2: Check candidate codification (identify markers)
-    _identifyCandidates(grey, grey_pyramid, candidatesSet, contoursSet, dictionary,
-                        candidates, contours, ids, detectorParams, _rejectedImgPoints);
+    arucoDetectorImpl->identifyCandidates(grey, grey_pyramid, selectedCandidates, candidates, contours,
+                                          ids, _rejectedImgPoints);
 
     /// STEP 3: Corner refinement :: use corner subpix
     if (detectorParams.cornerRefinementMethod == (int)CORNER_REFINE_SUBPIX) {
@@ -951,13 +961,15 @@ void ArucoDetector::detectMarkers(InputArray _image, OutputArrayOfArrays _corner
                     const float scale_init = (float) grey_pyramid[closest_pyr_image_idx].cols / grey.cols;
                     findCornerInPyrImage(scale_init, closest_pyr_image_idx, grey_pyramid, Mat(candidates[i]), detectorParams);
                 }
-                else
-                cornerSubPix(grey, Mat(candidates[i]),
-                             Size(detectorParams.cornerRefinementWinSize, detectorParams.cornerRefinementWinSize),
-                             Size(-1, -1),
-                             TermCriteria(TermCriteria::MAX_ITER | TermCriteria::EPS,
-                                          detectorParams.cornerRefinementMaxIterations,
-                                          detectorParams.cornerRefinementMinAccuracy));
+                else {
+                    int cornerRefinementWinSize = std::max(1, cvRound(detectorParams.relativeCornerRefinmentWinSize*
+                                                  getAverageModuleSize(candidates[i], dictionary.markerSize, detectorParams.markerBorderBits)));
+                    cornerRefinementWinSize = min(cornerRefinementWinSize, detectorParams.cornerRefinementWinSize);
+                    cornerSubPix(grey, Mat(candidates[i]), Size(cornerRefinementWinSize, cornerRefinementWinSize), Size(-1, -1),
+                                 TermCriteria(TermCriteria::MAX_ITER | TermCriteria::EPS,
+                                              detectorParams.cornerRefinementMaxIterations,
+                                              detectorParams.cornerRefinementMinAccuracy));
+                }
             }
         });
     }
@@ -1000,7 +1012,13 @@ static inline void _projectUndetectedMarkers(const Board &board, InputOutputArra
                                              OutputArray undetectedMarkersIds) {
     Mat rvec, tvec; // first estimate board pose with the current avaible markers
     Mat objPoints, imgPoints; // object and image points for the solvePnP function
-    board.matchImagePoints(detectedCorners, detectedIds, objPoints, imgPoints);
+    // To refine corners of ArUco markers the function refineDetectedMarkers() find an aruco markers pose from 3D-2D point correspondences.
+    // To find 3D-2D point correspondences uses matchImagePoints().
+    // The method matchImagePoints() works with ArUco corners (in Board/GridBoard cases) or with ChArUco corners (in CharucoBoard case).
+    // To refine corners of ArUco markers we need work with ArUco corners only in all boards.
+    // To call matchImagePoints() with ArUco corners for all boards we need to call matchImagePoints() from base class Board.
+    // The method matchImagePoints() implemented in Pimpl and we need to create temp Board object to call the base method.
+    Board(board.getObjPoints(), board.getDictionary(), board.getIds()).matchImagePoints(detectedCorners, detectedIds, objPoints, imgPoints);
     if (objPoints.total() < 4ull) // at least one marker from board so rvec and tvec are valid
         return;
     solvePnP(objPoints, imgPoints, cameraMatrix, distCoeffs, rvec, tvec);
@@ -1217,8 +1235,13 @@ void ArucoDetector::refineDetectedMarkers(InputArray _image, const Board& _board
                 CV_Assert(detectorParams.cornerRefinementWinSize > 0 &&
                           detectorParams.cornerRefinementMaxIterations > 0 &&
                           detectorParams.cornerRefinementMinAccuracy > 0);
+
+                std::vector<Point2f> marker(closestRotatedMarker.begin<Point2f>(), closestRotatedMarker.end<Point2f>());
+                int cornerRefinementWinSize = std::max(1, cvRound(detectorParams.relativeCornerRefinmentWinSize*
+                                              getAverageModuleSize(marker, dictionary.markerSize, detectorParams.markerBorderBits)));
+                cornerRefinementWinSize = min(cornerRefinementWinSize, detectorParams.cornerRefinementWinSize);
                 cornerSubPix(grey, closestRotatedMarker,
-                             Size(detectorParams.cornerRefinementWinSize, detectorParams.cornerRefinementWinSize),
+                             Size(cornerRefinementWinSize, cornerRefinementWinSize),
                              Size(-1, -1), TermCriteria(TermCriteria::MAX_ITER | TermCriteria::EPS,
                                                         detectorParams.cornerRefinementMaxIterations,
                                                         detectorParams.cornerRefinementMinAccuracy));
@@ -1309,24 +1332,26 @@ void drawDetectedMarkers(InputOutputArray _image, InputArrayOfArrays _corners,
     int nMarkers = (int)_corners.total();
     for(int i = 0; i < nMarkers; i++) {
         Mat currentMarker = _corners.getMat(i);
-        CV_Assert(currentMarker.total() == 4 && currentMarker.type() == CV_32FC2);
+        CV_Assert(currentMarker.total() == 4 && currentMarker.channels() == 2);
+        if (currentMarker.type() != CV_32SC2)
+            currentMarker.convertTo(currentMarker, CV_32SC2);
 
         // draw marker sides
         for(int j = 0; j < 4; j++) {
-            Point2f p0, p1;
-            p0 = currentMarker.ptr<Point2f>(0)[j];
-            p1 = currentMarker.ptr<Point2f>(0)[(j + 1) % 4];
+            Point p0, p1;
+            p0 = currentMarker.ptr<Point>(0)[j];
+            p1 = currentMarker.ptr<Point>(0)[(j + 1) % 4];
             line(_image, p0, p1, borderColor, 1);
         }
         // draw first corner mark
-        rectangle(_image, currentMarker.ptr<Point2f>(0)[0] - Point2f(3, 3),
-                  currentMarker.ptr<Point2f>(0)[0] + Point2f(3, 3), cornerColor, 1, LINE_AA);
+        rectangle(_image, currentMarker.ptr<Point>(0)[0] - Point(3, 3),
+                  currentMarker.ptr<Point>(0)[0] + Point(3, 3), cornerColor, 1, LINE_AA);
 
         // draw ID
         if(_ids.total() != 0) {
-            Point2f cent(0, 0);
+            Point cent(0, 0);
             for(int p = 0; p < 4; p++)
-                cent += currentMarker.ptr<Point2f>(0)[p];
+                cent += currentMarker.ptr<Point>(0)[p];
             cent = cent / 4.;
             stringstream s;
             s << "id=" << _ids.getMat().ptr<int>(0)[i];
diff --git a/modules/objdetect/src/aruco/aruco_dictionary.cpp b/modules/objdetect/src/aruco/aruco_dictionary.cpp
index f73cea335703..3d5f9b1bfd76 100644
--- a/modules/objdetect/src/aruco/aruco_dictionary.cpp
+++ b/modules/objdetect/src/aruco/aruco_dictionary.cpp
@@ -355,6 +355,7 @@ static int _getSelfDistance(const Mat &marker) {
 
 
 Dictionary extendDictionary(int nMarkers, int markerSize, const Dictionary &baseDictionary, int randomSeed) {
+    CV_Assert(nMarkers > 0);
     RNG rng((uint64)(randomSeed));
 
     Dictionary out = Dictionary(Mat(), markerSize);
@@ -370,7 +371,7 @@ Dictionary extendDictionary(int nMarkers, int markerSize, const Dictionary &base
     // if baseDictionary is provided, calculate its intermarker distance
     if(baseDictionary.bytesList.rows > 0) {
         CV_Assert(baseDictionary.markerSize == markerSize);
-        out.bytesList = baseDictionary.bytesList.clone();
+        out.bytesList = baseDictionary.bytesList.rowRange(0, min(nMarkers, baseDictionary.bytesList.rows)).clone();
 
         int minDistance = markerSize * markerSize + 1;
         for(int i = 0; i < out.bytesList.rows; i++) {
diff --git a/modules/objdetect/src/aruco/aruco_utils.cpp b/modules/objdetect/src/aruco/aruco_utils.cpp
index 4c0d2dc3f717..954ba1194194 100644
--- a/modules/objdetect/src/aruco/aruco_utils.cpp
+++ b/modules/objdetect/src/aruco/aruco_utils.cpp
@@ -38,12 +38,12 @@ void _copyVector2Output(vector<vector<Point2f> > &vec, OutputArrayOfArrays out,
     }
 }
 
-void _convertToGrey(InputArray _in, OutputArray _out) {
-    CV_Assert(_in.type() == CV_8UC1 || _in.type() == CV_8UC3);
-    if(_in.type() == CV_8UC3)
+void _convertToGrey(InputArray _in, Mat& _out) {
+    CV_Assert(_in.type() == CV_8UC1 || _in.type() == CV_8UC3 || _in.type() == CV_8UC4);
+    if(_in.type() != CV_8UC1)
         cvtColor(_in, _out, COLOR_BGR2GRAY);
     else
-        _in.copyTo(_out);
+        _out = _in.getMat();
 }
 
 }
diff --git a/modules/objdetect/src/aruco/aruco_utils.hpp b/modules/objdetect/src/aruco/aruco_utils.hpp
index d7d29a7d1810..009645b18bb3 100644
--- a/modules/objdetect/src/aruco/aruco_utils.hpp
+++ b/modules/objdetect/src/aruco/aruco_utils.hpp
@@ -16,9 +16,9 @@ namespace aruco {
 void _copyVector2Output(std::vector<std::vector<Point2f> > &vec, OutputArrayOfArrays out, const float scale = 1.f);
 
 /**
-  * @brief Convert input image to gray if it is a 3-channels image
+  * @brief Convert input image to gray if it is a BGR or BGRA image
   */
-void _convertToGrey(InputArray _in, OutputArray _out);
+void _convertToGrey(InputArray _in, Mat& _out);
 
 template<typename T>
 inline bool readParameter(const std::string& name, T& parameter, const FileNode& node)
diff --git a/modules/objdetect/src/aruco/charuco_detector.cpp b/modules/objdetect/src/aruco/charuco_detector.cpp
index 55a7f8d88845..4f08747289ad 100644
--- a/modules/objdetect/src/aruco/charuco_detector.cpp
+++ b/modules/objdetect/src/aruco/charuco_detector.cpp
@@ -5,6 +5,7 @@
 #include "../precomp.hpp"
 
 #include <opencv2/calib3d.hpp>
+#include <opencv2/core/utils/logger.hpp>
 #include "opencv2/objdetect/charuco_detector.hpp"
 #include "aruco_utils.hpp"
 
@@ -26,12 +27,12 @@ struct CharucoDetector::CharucoDetectorImpl {
     bool checkBoard(InputArrayOfArrays markerCorners, InputArray markerIds, InputArray charucoCorners, InputArray charucoIds) {
         vector<Mat> mCorners;
         markerCorners.getMatVector(mCorners);
-        Mat mIds = markerIds.getMat();
+        const Mat mIds = markerIds.getMat();
+        const Mat chCorners = charucoCorners.getMat();
+        const Mat chIds = charucoIds.getMat();
+        const vector<int>& boardIds = board.getIds();
 
-        Mat chCorners = charucoCorners.getMat();
-        Mat chIds = charucoIds.getMat();
-
-        vector<vector<int> > nearestMarkerIdx = board.getNearestMarkerIdx();
+        const vector<vector<int> > nearestMarkerIdx = board.getNearestMarkerIdx();
         vector<Point2f> distance(board.getNearestMarkerIdx().size(), Point2f(0.f, std::numeric_limits<float>::max()));
         // distance[i].x: max distance from the i-th charuco corner to charuco corner-forming markers.
         // The two charuco corner-forming markers of i-th charuco corner are defined in getNearestMarkerIdx()[i]
@@ -41,13 +42,19 @@ struct CharucoDetector::CharucoDetectorImpl {
             Point2f charucoCorner(chCorners.ptr<Point2f>(0)[i]);
             for (size_t j = 0ull; j < mIds.total(); j++) {
                 int idMaker = mIds.ptr<int>(0)[j];
+                // skip the check if the marker is not in the current board.
+                if (find(boardIds.begin(), boardIds.end(), idMaker) == boardIds.end())
+                    continue;
                 Point2f centerMarker((mCorners[j].ptr<Point2f>(0)[0] + mCorners[j].ptr<Point2f>(0)[1] +
                                       mCorners[j].ptr<Point2f>(0)[2] + mCorners[j].ptr<Point2f>(0)[3]) / 4.f);
                 float dist = sqrt(normL2Sqr<float>(centerMarker - charucoCorner));
-                // check distance from the charuco corner to charuco corner-forming markers
-                if (nearestMarkerIdx[chId][0] == idMaker || nearestMarkerIdx[chId][1] == idMaker) {
-                    int nearestCornerId =  nearestMarkerIdx[chId][0] == idMaker ? board.getNearestMarkerCorners()[chId][0] : board.getNearestMarkerCorners()[chId][1];
+                // nearestMarkerIdx contains for each charuco corner, nearest marker index in ids array
+                const int nearestMarkerId1 = boardIds[nearestMarkerIdx[chId][0]];
+                const int nearestMarkerId2 = boardIds[nearestMarkerIdx[chId][1]];
+                if (nearestMarkerId1 == idMaker || nearestMarkerId2 == idMaker) {
+                    int nearestCornerId = nearestMarkerId1 == idMaker ? board.getNearestMarkerCorners()[chId][0] : board.getNearestMarkerCorners()[chId][1];
                     Point2f nearestCorner = mCorners[j].ptr<Point2f>(0)[nearestCornerId];
+                    // distToNearest: distance from the charuco corner to charuco corner-forming markers
                     float distToNearest = sqrt(normL2Sqr<float>(nearestCorner - charucoCorner));
                     distance[chId].x = max(distance[chId].x, distToNearest);
                     // check that nearestCorner is nearest point
@@ -307,7 +314,9 @@ struct CharucoDetector::CharucoDetectorImpl {
             vector<vector<Point2f> > rejectedMarkers;
             arucoDetector.detectMarkers(image, _markerCorners, _markerIds, rejectedMarkers);
             if (charucoParameters.tryRefineMarkers)
-                arucoDetector.refineDetectedMarkers(image, board, _markerCorners,  _markerIds, rejectedMarkers);
+                arucoDetector.refineDetectedMarkers(image, board, _markerCorners, _markerIds, rejectedMarkers);
+            if (_markerCorners.empty() && _markerIds.empty())
+                return;
         }
         // if camera parameters are avaible, use approximated calibration
         if(!charucoParameters.cameraMatrix.empty())
@@ -363,6 +372,7 @@ void CharucoDetector::detectBoard(InputArray image, OutputArray charucoCorners,
                                   InputOutputArrayOfArrays markerCorners, InputOutputArray markerIds) const {
     charucoDetectorImpl->detectBoard(image, charucoCorners, charucoIds, markerCorners, markerIds);
     if (charucoDetectorImpl->checkBoard(markerCorners, markerIds, charucoCorners, charucoIds) == false) {
+        CV_LOG_DEBUG(NULL, "ChArUco board is built incorrectly");
         charucoCorners.release();
         charucoIds.release();
     }
@@ -507,20 +517,27 @@ void drawDetectedCornersCharuco(InputOutputArray _image, InputArray _charucoCorn
                                 InputArray _charucoIds, Scalar cornerColor) {
     CV_Assert(!_image.getMat().empty() &&
               (_image.getMat().channels() == 1 || _image.getMat().channels() == 3));
-    CV_Assert((_charucoCorners.getMat().total() == _charucoIds.getMat().total()) ||
-              _charucoIds.getMat().total() == 0);
-
-    size_t nCorners = _charucoCorners.getMat().total();
+    CV_Assert((_charucoCorners.total() == _charucoIds.total()) ||
+              _charucoIds.total() == 0);
+    CV_Assert(_charucoCorners.channels() == 2);
+
+    Mat charucoCorners = _charucoCorners.getMat();
+    if (charucoCorners.type() != CV_32SC2)
+        charucoCorners.convertTo(charucoCorners, CV_32SC2);
+    Mat charucoIds;
+    if (!_charucoIds.empty())
+        charucoIds = _charucoIds.getMat();
+    size_t nCorners = charucoCorners.total();
     for(size_t i = 0; i < nCorners; i++) {
-        Point2f corner = _charucoCorners.getMat().at<Point2f>((int)i);
+        Point corner = charucoCorners.at<Point>((int)i);
         // draw first corner mark
-        rectangle(_image, corner - Point2f(3, 3), corner + Point2f(3, 3), cornerColor, 1, LINE_AA);
+        rectangle(_image, corner - Point(3, 3), corner + Point(3, 3), cornerColor, 1, LINE_AA);
         // draw ID
         if(!_charucoIds.empty()) {
-            int id = _charucoIds.getMat().at<int>((int)i);
+            int id = charucoIds.at<int>((int)i);
             stringstream s;
             s << "id=" << id;
-            putText(_image, s.str(), corner + Point2f(5, -5), FONT_HERSHEY_SIMPLEX, 0.5,
+            putText(_image, s.str(), corner + Point(5, -5), FONT_HERSHEY_SIMPLEX, 0.5,
                     cornerColor, 2);
         }
     }
@@ -540,25 +557,27 @@ void drawDetectedDiamonds(InputOutputArray _image, InputArrayOfArrays _corners,
     int nMarkers = (int)_corners.total();
     for(int i = 0; i < nMarkers; i++) {
         Mat currentMarker = _corners.getMat(i);
-        CV_Assert(currentMarker.total() == 4 && currentMarker.type() == CV_32FC2);
+        CV_Assert(currentMarker.total() == 4 && currentMarker.channels() == 2);
+        if (currentMarker.type() != CV_32SC2)
+            currentMarker.convertTo(currentMarker, CV_32SC2);
 
         // draw marker sides
         for(int j = 0; j < 4; j++) {
-            Point2f p0, p1;
-            p0 = currentMarker.at< Point2f >(j);
-            p1 = currentMarker.at< Point2f >((j + 1) % 4);
+            Point p0, p1;
+            p0 = currentMarker.at<Point>(j);
+            p1 = currentMarker.at<Point>((j + 1) % 4);
             line(_image, p0, p1, borderColor, 1);
         }
 
         // draw first corner mark
-        rectangle(_image, currentMarker.at< Point2f >(0) - Point2f(3, 3),
-                  currentMarker.at< Point2f >(0) + Point2f(3, 3), cornerColor, 1, LINE_AA);
+        rectangle(_image, currentMarker.at<Point>(0) - Point(3, 3),
+                  currentMarker.at<Point>(0) + Point(3, 3), cornerColor, 1, LINE_AA);
 
         // draw id composed by four numbers
         if(_ids.total() != 0) {
-            Point2f cent(0, 0);
+            Point cent(0, 0);
             for(int p = 0; p < 4; p++)
-                cent += currentMarker.at< Point2f >(p);
+                cent += currentMarker.at<Point>(p);
             cent = cent / 4.;
             stringstream s;
             s << "id=" << _ids.getMat().at< Vec4i >(i);
diff --git a/modules/objdetect/src/barcode.cpp b/modules/objdetect/src/barcode.cpp
index 549ea84a0aca..1b963e424217 100644
--- a/modules/objdetect/src/barcode.cpp
+++ b/modules/objdetect/src/barcode.cpp
@@ -142,11 +142,15 @@ struct BarcodeImpl : public GraphicalCodeDetector::Impl
 public:
     shared_ptr<SuperScale> sr;
     bool use_nn_sr = false;
+    double detectorThrDownSample = 512.f;
+    vector<float> detectorWindowSizes = {0.01f, 0.03f, 0.06f, 0.08f};
+    double detectorThrGradMagnitude = 64.f;
 
 public:
     //=================
     // own methods
-    BarcodeImpl() = default;
+    BarcodeImpl() {}
+
     vector<Mat> initDecode(const Mat &src, const vector<vector<Point2f>> &points) const;
     bool decodeWithType(InputArray img,
                      InputArray points,
@@ -268,8 +272,8 @@ bool BarcodeImpl::detect(InputArray img, OutputArray points) const
     }
 
     Detect bardet;
-    bardet.init(inarr);
-    bardet.localization();
+    bardet.init(inarr, detectorThrDownSample);
+    bardet.localization(detectorWindowSizes, detectorThrGradMagnitude);
     if (!bardet.computeTransformationPoints())
     { return false; }
     vector<vector<Point2f>> pnts2f = bardet.getTransformationPoints();
@@ -302,13 +306,13 @@ string BarcodeImpl::detectAndDecode(InputArray img, OutputArray points, OutputAr
     CV_UNUSED(straight_code);
     vector<string> decoded_info;
     vector<string> decoded_type;
-    vector<Point> points_;
+    vector<Point2f> points_;
     if (!detectAndDecodeWithType(img, decoded_info, decoded_type, points_))
         return string();
     if (points_.size() < 4 || decoded_info.size() < 1)
         return string();
     points_.resize(4);
-    points.setTo(points_);
+    updatePointsResult(points, points_);
     return decoded_info[0];
 }
 
@@ -343,11 +347,11 @@ BarcodeDetector::BarcodeDetector(const string &prototxt_path, const string &mode
 {
     Ptr<BarcodeImpl> p_ = new BarcodeImpl();
     p = p_;
+    p_->sr = make_shared<SuperScale>();
     if (!prototxt_path.empty() && !model_path.empty())
     {
         CV_Assert(utils::fs::exists(prototxt_path));
         CV_Assert(utils::fs::exists(model_path));
-        p_->sr = make_shared<SuperScale>();
         int res = p_->sr->init(prototxt_path, model_path);
         CV_Assert(res == 0);
         p_->use_nn_sr = true;
@@ -370,5 +374,64 @@ bool BarcodeDetector::detectAndDecodeWithType(InputArray img, vector<string> &de
     return p_->detectAndDecodeWithType(img, decoded_info, decoded_type, points_);
 }
 
+double BarcodeDetector::getDownsamplingThreshold() const
+{
+    Ptr<BarcodeImpl> p_ = dynamic_pointer_cast<BarcodeImpl>(p);
+    CV_Assert(p_);
+
+    return p_->detectorThrDownSample;
+}
+
+BarcodeDetector& BarcodeDetector::setDownsamplingThreshold(double thresh)
+{
+    Ptr<BarcodeImpl> p_ = dynamic_pointer_cast<BarcodeImpl>(p);
+    CV_Assert(p_);
+    CV_Assert(thresh >= 64);
+
+    p_->detectorThrDownSample = thresh;
+    return *this;
+}
+
+void BarcodeDetector::getDetectorScales(CV_OUT std::vector<float>& sizes) const
+{
+    Ptr<BarcodeImpl> p_ = dynamic_pointer_cast<BarcodeImpl>(p);
+    CV_Assert(p_);
+
+    sizes = p_->detectorWindowSizes;
+}
+
+BarcodeDetector& BarcodeDetector::setDetectorScales(const std::vector<float>& sizes)
+{
+    Ptr<BarcodeImpl> p_ = dynamic_pointer_cast<BarcodeImpl>(p);
+    CV_Assert(p_);
+    CV_Assert(sizes.size() > 0 && sizes.size() <= 16);
+
+    for (const float &size : sizes) {
+        CV_Assert(size > 0 && size < 1);
+    }
+
+    p_->detectorWindowSizes = sizes;
+
+    return *this;
+}
+
+double BarcodeDetector::getGradientThreshold() const
+{
+    Ptr<BarcodeImpl> p_ = dynamic_pointer_cast<BarcodeImpl>(p);
+    CV_Assert(p_);
+
+    return p_->detectorThrGradMagnitude;
+}
+
+BarcodeDetector& BarcodeDetector::setGradientThreshold(double thresh)
+{
+    Ptr<BarcodeImpl> p_ = dynamic_pointer_cast<BarcodeImpl>(p);
+    CV_Assert(p_);
+    CV_Assert(thresh >= 0 && thresh < 1e4);
+
+    p_->detectorThrGradMagnitude = thresh;
+    return *this;
+}
+
 }// namespace barcode
 } // namespace cv
diff --git a/modules/objdetect/src/barcode_detector/bardetect.cpp b/modules/objdetect/src/barcode_detector/bardetect.cpp
index b156d1b25d9a..abb30bf5472f 100644
--- a/modules/objdetect/src/barcode_detector/bardetect.cpp
+++ b/modules/objdetect/src/barcode_detector/bardetect.cpp
@@ -136,13 +136,13 @@ static void NMSBoxes(const std::vector<RotatedRect>& bboxes, const std::vector<f
 
 //==============================================================================
 
-void Detect::init(const Mat &src)
+void Detect::init(const Mat &src, double detectorThreshDownSamplingLimit)
 {
     const double min_side = std::min(src.size().width, src.size().height);
-    if (min_side > 512.0)
+    if (min_side > detectorThreshDownSamplingLimit)
     {
         purpose = SHRINKING;
-        coeff_expansion = min_side / 512.0;
+        coeff_expansion = min_side / detectorThreshDownSamplingLimit;
         width = cvRound(src.size().width / coeff_expansion);
         height = cvRound(src.size().height / coeff_expansion);
         Size new_size(width, height);
@@ -171,19 +171,19 @@ void Detect::init(const Mat &src)
 }
 
 
-void Detect::localization()
+void Detect::localization(const std::vector<float>& detectorWindowSizes, double detectorThreshGradientMagnitude)
 {
 
     localization_bbox.clear();
     bbox_scores.clear();
 
     // get integral image
-    preprocess();
+    preprocess(detectorThreshGradientMagnitude);
     // empirical setting
-    static constexpr float SCALE_LIST[] = {0.01f, 0.03f, 0.06f, 0.08f};
+    //static constexpr float SCALE_LIST[] = {0.01f, 0.03f, 0.06f, 0.08f};
     const auto min_side = static_cast<float>(std::min(width, height));
     int window_size;
-    for (const float scale:SCALE_LIST)
+    for (const float scale: detectorWindowSizes)
     {
         window_size = cvRound(min_side * scale);
         if(window_size == 0) {
@@ -205,7 +205,20 @@ bool Detect::computeTransformationPoints()
     transformation_points.reserve(bbox_indices.size());
     RotatedRect rect;
     Point2f temp[4];
-    const float THRESHOLD_SCORE = float(width * height) / 300.f;
+
+    /**
+     * #24902 resolution invariant barcode detector
+     *
+     * refactor of THRESHOLD_SCORE = float(width * height) / 300.f
+     * wrt to rescaled input size - 300 value needs factorization
+     * only one factor pair matches a common aspect ratio of 4:3 ~ 20x15
+     * decomposing this yields THRESHOLD_SCORE = (width / 20) * (height / 15)
+     * therefore each factor was rescaled based by purpose (refsize was 512)
+     */
+    const float THRESHOLD_WSCALE = (purpose != UNCHANGED) ? 20 : (20 * width / 512.f);
+    const float THRESHOLD_HSCALE = (purpose != UNCHANGED) ? 15 : (15 * height / 512.f);
+    const float THRESHOLD_SCORE = (width / THRESHOLD_WSCALE) * (height / THRESHOLD_HSCALE);
+
     NMSBoxes(localization_bbox, bbox_scores, THRESHOLD_SCORE, 0.1f, bbox_indices);
 
     for (const auto &bbox_index : bbox_indices)
@@ -231,15 +244,14 @@ bool Detect::computeTransformationPoints()
 }
 
 
-void Detect::preprocess()
+void Detect::preprocess(double detectorGradientMagnitudeThresh)
 {
     Mat scharr_x, scharr_y, temp;
-    static constexpr double THRESHOLD_MAGNITUDE = 64.;
     Scharr(resized_barcode, scharr_x, CV_32F, 1, 0);
     Scharr(resized_barcode, scharr_y, CV_32F, 0, 1);
     // calculate magnitude of gradient and truncate
     magnitude(scharr_x, scharr_y, temp);
-    threshold(temp, temp, THRESHOLD_MAGNITUDE, 1, THRESH_BINARY);
+    threshold(temp, temp, detectorGradientMagnitudeThresh, 1, THRESH_BINARY);
     temp.convertTo(gradient_magnitude, CV_8U);
     integral(gradient_magnitude, integral_edges, CV_32F);
 
diff --git a/modules/objdetect/src/barcode_detector/bardetect.hpp b/modules/objdetect/src/barcode_detector/bardetect.hpp
index 9f084d20aa13..178fded36f5c 100644
--- a/modules/objdetect/src/barcode_detector/bardetect.hpp
+++ b/modules/objdetect/src/barcode_detector/bardetect.hpp
@@ -24,9 +24,9 @@ class Detect
 
 
 public:
-    void init(const Mat &src);
+    void init(const Mat &src, double detectorThreshDownSamplingLimit);
 
-    void localization();
+    void localization(const vector<float>& detectorWindowSizes, double detectorGradientMagnitudeThresh);
 
     vector<vector<Point2f>> getTransformationPoints()
     { return transformation_points; }
@@ -44,7 +44,7 @@ class Detect
     int height, width;
     Mat resized_barcode, gradient_magnitude, coherence, orientation, edge_nums, integral_x_sq, integral_y_sq, integral_xy, integral_edges;
 
-    void preprocess();
+    void preprocess(double detectorThreshGradientMagnitude);
 
     void calCoherence(int window_size);
 
diff --git a/modules/objdetect/src/face_detect.cpp b/modules/objdetect/src/face_detect.cpp
index 17c982d92f9a..441c55e78803 100644
--- a/modules/objdetect/src/face_detect.cpp
+++ b/modules/objdetect/src/face_detect.cpp
@@ -48,6 +48,35 @@ class FaceDetectorYNImpl : public FaceDetectorYN
         topK = top_k;
     }
 
+    FaceDetectorYNImpl(const String& framework,
+                       const std::vector<uchar>& bufferModel,
+                       const std::vector<uchar>& bufferConfig,
+                       const Size& input_size,
+                       float score_threshold,
+                       float nms_threshold,
+                       int top_k,
+                       int backend_id,
+                       int target_id)
+                       :divisor(32),
+                       strides({8, 16, 32})
+    {
+        net = dnn::readNet(framework, bufferModel, bufferConfig);
+        CV_Assert(!net.empty());
+
+        net.setPreferableBackend(backend_id);
+        net.setPreferableTarget(target_id);
+
+        inputW = input_size.width;
+        inputH = input_size.height;
+
+        padW = (int((inputW - 1) / divisor) + 1) * divisor;
+        padH = (int((inputH - 1) / divisor) + 1) * divisor;
+
+        scoreThreshold = score_threshold;
+        nmsThreshold = nms_threshold;
+        topK = top_k;
+    }
+
     void setInputSize(const Size& input_size) override
     {
         inputW = input_size.width;
@@ -102,12 +131,21 @@ class FaceDetectorYNImpl : public FaceDetectorYN
             return 0;
         }
         CV_CheckEQ(input_image.size(), Size(inputW, inputH), "Size does not match. Call setInputSize(size) if input size does not match the preset size");
-        // Pad input_image with divisor 32
-        Mat pad_image = padWithDivisor(input_image);
-
-        // Build blob from input image
-        Mat input_blob = dnn::blobFromImage(pad_image);
 
+        Mat input_blob;
+        if(input_image.kind() == _InputArray::UMAT) {
+            // Pad input_image with divisor 32
+            UMat pad_image;
+            padWithDivisor(input_image, pad_image);
+            // Build blob from input image
+            input_blob = dnn::blobFromImage(pad_image);
+        } else {
+            // Pad input_image with divisor 32
+            Mat pad_image;
+            padWithDivisor(input_image, pad_image);
+            // Build blob from input image
+            input_blob = dnn::blobFromImage(pad_image);
+        }
         // Forward
         std::vector<String> output_names = { "cls_8", "cls_16", "cls_32", "obj_8", "obj_16", "obj_32", "bbox_8", "bbox_16", "bbox_32", "kps_8", "kps_16", "kps_32" };
         std::vector<Mat> output_blobs;
@@ -162,6 +200,9 @@ class FaceDetectorYNImpl : public FaceDetectorYN
                     float score = std::sqrt(cls_score * obj_score);
                     face.at<float>(0, 14) = score;
 
+                    // Checking if the score meets the threshold before adding the face
+                    if (score < scoreThreshold)
+                        continue;
                     // Get bounding box
                     float cx = ((c + bbox_v[idx * 4 + 0]) * strides[i]);
                     float cy = ((r + bbox_v[idx * 4 + 1]) * strides[i]);
@@ -217,13 +258,11 @@ class FaceDetectorYNImpl : public FaceDetectorYN
         }
     }
 
-    Mat padWithDivisor(InputArray& input_image)
+    void padWithDivisor(InputArray input_image, OutputArray pad_image)
     {
         int bottom = padH - inputH;
         int right = padW - inputW;
-        Mat pad_image;
         copyMakeBorder(input_image, pad_image, 0, bottom, 0, right, BORDER_CONSTANT, 0);
-        return pad_image;
     }
 private:
     dnn::Net net;
@@ -257,4 +296,22 @@ Ptr<FaceDetectorYN> FaceDetectorYN::create(const String& model,
 #endif
 }
 
+Ptr<FaceDetectorYN> FaceDetectorYN::create(const String& framework,
+                                           const std::vector<uchar>& bufferModel,
+                                           const std::vector<uchar>& bufferConfig,
+                                           const Size& input_size,
+                                           const float score_threshold,
+                                           const float nms_threshold,
+                                           const int top_k,
+                                           const int backend_id,
+                                           const int target_id)
+{
+#ifdef HAVE_OPENCV_DNN
+    return makePtr<FaceDetectorYNImpl>(framework, bufferModel, bufferConfig, input_size, score_threshold, nms_threshold, top_k, backend_id, target_id);
+#else
+    CV_UNUSED(bufferModel); CV_UNUSED(bufferConfig); CV_UNUSED(input_size); CV_UNUSED(score_threshold); CV_UNUSED(nms_threshold); CV_UNUSED(top_k); CV_UNUSED(backend_id); CV_UNUSED(target_id);
+    CV_Error(cv::Error::StsNotImplemented, "cv::FaceDetectorYN requires enabled 'dnn' module.");
+#endif
+}
+
 } // namespace cv
diff --git a/modules/objdetect/src/face_recognize.cpp b/modules/objdetect/src/face_recognize.cpp
index 497303e42b05..8183573ce982 100644
--- a/modules/objdetect/src/face_recognize.cpp
+++ b/modules/objdetect/src/face_recognize.cpp
@@ -25,7 +25,7 @@ class FaceRecognizerSFImpl : public FaceRecognizerSF
 
         net.setPreferableBackend(backend_id);
         net.setPreferableTarget(target_id);
-    };
+    }
     void alignCrop(InputArray _src_img, InputArray _face_mat, OutputArray _aligned_img) const override
     {
         Mat face_mat = _face_mat.getMat();
@@ -39,13 +39,13 @@ class FaceRecognizerSFImpl : public FaceRecognizerSF
         }
         Mat warp_mat = getSimilarityTransformMatrix(src_point);
         warpAffine(_src_img, _aligned_img, warp_mat, Size(112, 112), INTER_LINEAR);
-    };
+    }
     void feature(InputArray _aligned_img, OutputArray _face_feature) override
     {
         Mat inputBolb = dnn::blobFromImage(_aligned_img, 1, Size(112, 112), Scalar(0, 0, 0), true, false);
         net.setInput(inputBolb);
         net.forward(_face_feature);
-    };
+    }
     double match(InputArray _face_feature1, InputArray _face_feature2, int dis_type) const override
     {
         Mat face_feature1 = _face_feature1.getMat(), face_feature2 = _face_feature2.getMat();
@@ -60,7 +60,7 @@ class FaceRecognizerSFImpl : public FaceRecognizerSF
             throw std::invalid_argument("invalid parameter " + std::to_string(dis_type));
         }
 
-    };
+    }
 
 private:
     Mat getSimilarityTransformMatrix(float src[5][2]) const {
diff --git a/modules/objdetect/src/graphical_code_detector_impl.hpp b/modules/objdetect/src/graphical_code_detector_impl.hpp
index 76429222ffa9..987ccaab54f6 100644
--- a/modules/objdetect/src/graphical_code_detector_impl.hpp
+++ b/modules/objdetect/src/graphical_code_detector_impl.hpp
@@ -20,6 +20,21 @@ struct GraphicalCodeDetector::Impl {
                                       OutputArray points, OutputArrayOfArrays straight_code) const = 0;
 };
 
+class QRCodeDecoder {
+public:
+    virtual ~QRCodeDecoder();
+
+    static Ptr<QRCodeDecoder> create();
+
+    virtual bool decode(const Mat& straight, String& decoded_info) = 0;
+
+    QRCodeEncoder::EncodeMode mode;
+    QRCodeEncoder::ECIEncodings eci;
+    uint8_t parity = 0;
+    uint8_t sequence_num = 0;
+    uint8_t total_num = 1;
+};
+
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp
index b57e92ff9a30..b83263304f67 100644
--- a/modules/objdetect/src/hog.cpp
+++ b/modules/objdetect/src/hog.cpp
@@ -268,13 +268,13 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
         for ( i = 0; i < 256; i += 4)
         {
             v_store(_data + i, v_sqrt(idx));
-            idx += ifour;
+            idx = v_add(idx, ifour);
         }
     else
         for ( i = 0; i < 256; i += 4)
         {
             v_store(_data + i, idx);
-            idx += ifour;
+            idx = v_add(idx, ifour);
         }
 #else
     if( gammaCorrection )
@@ -320,7 +320,7 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
         for ( ; x <= end - 4; x += 4)
         {
             v_int32x4 mul_res = v_load(xmap + x);
-            mul_res += mul_res + mul_res;
+            mul_res = v_add(mul_res, v_add(mul_res, mul_res));
             v_store(xmap + x, mul_res);
         }
 #endif
@@ -444,34 +444,34 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
             {
                 int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
 
-                v_float32x4 _dx0 = v_load(lutCurr+x+widthP2*0+2) - v_load(lutCurr+x+widthP2*0);
-                v_float32x4 _dx1 = v_load(lutCurr+x+widthP2*1+2) - v_load(lutCurr+x+widthP2*1);
-                v_float32x4 _dx2 = v_load(lutCurr+x+widthP2*2+2) - v_load(lutCurr+x+widthP2*2);
+                v_float32x4 _dx0 = v_sub(v_load(lutCurr + x + widthP2 * 0 + 2), v_load(lutCurr + x + widthP2 * 0));
+                v_float32x4 _dx1 = v_sub(v_load(lutCurr + x + widthP2 * 1 + 2), v_load(lutCurr + x + widthP2 * 1));
+                v_float32x4 _dx2 = v_sub(v_load(lutCurr + x + widthP2 * 2 + 2), v_load(lutCurr + x + widthP2 * 2));
 
                 v_float32x4 _dy00 = v_float32x4(lut[nextPtr[x0+0]], lut[nextPtr[x1+0]], lut[nextPtr[x2+0]], lut[nextPtr[x3+0]]);
-                v_float32x4 _dy0 = _dy00 - v_load(lutPrev+x+widthP2*0+1);
+                v_float32x4 _dy0 = v_sub(_dy00, v_load(lutPrev + x + widthP2 * 0 + 1));
 
                 v_store(lutNext+x+widthP2*0+1, _dy00);
 
                 v_float32x4 _dy10 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]);
-                v_float32x4 _dy1 = _dy10 - v_load(lutPrev+x+widthP2*1+1);
+                v_float32x4 _dy1 = v_sub(_dy10, v_load(lutPrev + x + widthP2 * 1 + 1));
 
                 v_store(lutNext+x+widthP2*1+1, _dy10);
 
                 v_float32x4 _dy20 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]);
-                v_float32x4 _dy2 = _dy20 - v_load(lutPrev+x+widthP2*2+1);
+                v_float32x4 _dy2 = v_sub(_dy20, v_load(lutPrev + x + widthP2 * 2 + 1));
 
                 v_store(lutNext+x+widthP2*2+1, _dy20);
 
-                v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0);
-                v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1);
-                v_float32x4 _mag2 = (_dx2 * _dx2) + (_dy2 * _dy2);
+                v_float32x4 _mag0 = v_add(v_mul(_dx0, _dx0), v_mul(_dy0, _dy0));
+                v_float32x4 _mag1 = v_add(v_mul(_dx1, _dx1), v_mul(_dy1, _dy1));
+                v_float32x4 _mag2 = v_add(v_mul(_dx2, _dx2), v_mul(_dy2, _dy2));
 
-                v_float32x4 mask = v_reinterpret_as_f32(_mag2 > _mag1);
+                v_float32x4 mask = v_reinterpret_as_f32(v_gt(_mag2, _mag1));
                 _dx2 = v_select(mask, _dx2, _dx1);
                 _dy2 = v_select(mask, _dy2, _dy1);
 
-                mask = v_reinterpret_as_f32(v_max(_mag2, _mag1) > _mag0);
+                mask = v_reinterpret_as_f32(v_gt(v_max(_mag2, _mag1), _mag0));
                 _dx2 = v_select(mask, _dx2, _dx0);
                 _dy2 = v_select(mask, _dy2, _dy0);
 
@@ -537,25 +537,25 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
             int x2 = x << 1;
             v_float32x4 _mag = v_load(dbuf + x + (width << 1));
             v_float32x4 _angle = v_load(dbuf + x + width * 3);
-            _angle = (_angleScale * _angle) - fhalf;
+            _angle = v_sub(v_mul(_angleScale, _angle), fhalf);
 
             v_int32x4 _hidx = v_floor(_angle);
-            _angle -= v_cvt_f32(_hidx);
+            _angle = v_sub(_angle, v_cvt_f32(_hidx));
 
-            v_float32x4 ft0 = _mag * (fone - _angle);
-            v_float32x4 ft1 = _mag * _angle;
+            v_float32x4 ft0 = v_mul(_mag, v_sub(fone, _angle));
+            v_float32x4 ft1 = v_mul(_mag, _angle);
 
             v_store_interleave(gradPtr + x2, ft0, ft1);
 
-            v_int32x4 mask0 = _hidx >> 31;
-            v_int32x4 it0 = mask0 & _nbins;
-            mask0 = (_hidx >= _nbins);
-            v_int32x4 it1 = mask0 & _nbins;
-            _hidx += (it0 - it1);
+            v_int32x4 mask0 = v_shr<31>(_hidx);
+            v_int32x4 it0 = v_and(mask0, _nbins);
+            mask0 = (v_ge(_hidx, _nbins));
+            v_int32x4 it1 = v_and(mask0, _nbins);
+            _hidx = v_add(_hidx, v_sub(it0, it1));
 
             it0 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
-            _hidx += ione;
-            _hidx &= (_hidx < _nbins);
+            _hidx = v_add(_hidx, ione);
+            _hidx = v_and(_hidx, v_lt(_hidx, _nbins));
             it1 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
             v_uint8x16 it2, it3;
             v_zip(v_reinterpret_as_u8(it0), v_reinterpret_as_u8(it1), it2, it3);
@@ -707,9 +707,9 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
 
         for (; i <= blockSize.height - 4; i += 4)
         {
-            v_float32x4 t = idx - _bh;
-            t *= t;
-            idx += ifour;
+            v_float32x4 t = v_sub(idx, _bh);
+            t = v_mul(t, t);
+            idx = v_add(idx, ifour);
             v_store(_di + i, t);
         }
     #endif
@@ -725,9 +725,9 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
 
         for (; j <= blockSize.height - 4; j += 4)
         {
-            v_float32x4 t = idx - _bw;
-            t *= t;
-            idx += ifour;
+            v_float32x4 t = v_sub(idx, _bw);
+            t = v_mul(t, t);
+            idx = v_add(idx, ifour);
             v_store(_dj + j, t);
         }
     #endif
@@ -936,8 +936,8 @@ const float* HOGCache::getBlock(Point pt, float* buf)
         int h0 = h[0], h1 = h[1];
 
         v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]);
-        v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights);
-        v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w;
+        v_float32x4 w = v_mul(v_setall_f32(pk.gradWeight), v_load(pk.histWeights));
+        v_float32x4 _t0 = v_mul(_a0, w), _t1 = v_mul(_a1, w);
 
         v_store(hist0, _t0);
         v_store(hist1, _t1);
@@ -984,8 +984,8 @@ const float* HOGCache::getBlock(Point pt, float* buf)
         int h0 = h[0], h1 = h[1];
 
         v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]);
-        v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights);
-        v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w;
+        v_float32x4 w = v_mul(v_setall_f32(pk.gradWeight), v_load(pk.histWeights));
+        v_float32x4 _t0 = v_mul(_a0, w), _t1 = v_mul(_a1, w);
 
         v_store(hist0, _t0);
         v_store(hist1, _t1);
@@ -1057,12 +1057,12 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
 
 #if CV_SIMD128
     v_float32x4 p0 = v_load(hist);
-    v_float32x4 s = p0 * p0;
+    v_float32x4 s = v_mul(p0, p0);
 
     for (i = 4; i <= sz - 4; i += 4)
     {
         p0 = v_load(hist + i);
-        s += p0 * p0;
+        s = v_add(s, v_mul(p0, p0));
     }
     v_store(partSum, s);
 #else
@@ -1091,17 +1091,17 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
     v_float32x4 _scale = v_setall_f32(scale);
     static v_float32x4 _threshold = v_setall_f32(thresh);
 
-    v_float32x4 p = _scale * v_load(hist);
+    v_float32x4 p = v_mul(_scale, v_load(hist));
     p = v_min(p, _threshold);
-    s = p * p;
+    s = v_mul(p, p);
     v_store(hist, p);
 
     for(i = 4 ; i <= sz - 4; i += 4)
     {
         p = v_load(hist + i);
-        p *= _scale;
+        p = v_mul(p, _scale);
         p = v_min(p, _threshold);
-        s += p * p;
+        s = v_add(s, v_mul(p, p));
         v_store(hist + i, p);
     }
 
@@ -1137,7 +1137,7 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
     v_float32x4 _scale2 = v_setall_f32(scale);
     for ( ; i <= sz - 4; i += 4)
     {
-        v_float32x4 t = _scale2 * v_load(hist + i);
+        v_float32x4 t = v_mul(_scale2, v_load(hist + i));
         v_store(hist + i, t);
     }
 #endif
@@ -1593,14 +1593,14 @@ void HOGDescriptor::detect(InputArray _img,
 #if CV_SIMD128
             v_float32x4 _vec = v_load(vec);
             v_float32x4 _svmVec = v_load(svmVec);
-            v_float32x4 sum = _svmVec * _vec;
+            v_float32x4 sum = v_mul(_svmVec, _vec);
 
             for( k = 4; k <= blockHistogramSize - 4; k += 4 )
             {
                 _vec = v_load(vec + k);
                 _svmVec = v_load(svmVec + k);
 
-                sum += _vec * _svmVec;
+                sum = v_add(sum, v_mul(_vec, _svmVec));
             }
 
             v_store(partSum, sum);
@@ -3392,14 +3392,14 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector<cv::Point> &loc
 #if CV_SIMD128
             v_float32x4 _vec = v_load(vec);
             v_float32x4 _svmVec = v_load(svmVec);
-            v_float32x4 sum = _svmVec * _vec;
+            v_float32x4 sum = v_mul(_svmVec, _vec);
 
             for( k = 4; k <= blockHistogramSize - 4; k += 4 )
             {
                 _vec = v_load(vec + k);
                 _svmVec = v_load(svmVec + k);
 
-                sum += _vec * _svmVec;
+                sum = v_add(sum, v_mul(_vec, _svmVec));
             }
 
             v_store(partSum, sum);
diff --git a/modules/objdetect/src/precomp.hpp b/modules/objdetect/src/precomp.hpp
index 790a98069764..63ca44007691 100644
--- a/modules/objdetect/src/precomp.hpp
+++ b/modules/objdetect/src/precomp.hpp
@@ -52,5 +52,7 @@
 #include "opencv2/core/private.hpp"
 
 #include <numeric>
+#include <array>
+#include <vector>
 
 #endif
diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp
index 9f64c6446259..b6350b83e69d 100644
--- a/modules/objdetect/src/qrcode.cpp
+++ b/modules/objdetect/src/qrcode.cpp
@@ -15,6 +15,7 @@
 #include "quirc.h"
 #endif
 
+#include <array>
 #include <limits>
 #include <cmath>
 #include <queue>
@@ -68,19 +69,14 @@ static void updatePointsResult(OutputArray points_, const vector<Point2f>& point
 
 static Point2f intersectionLines(Point2f a1, Point2f a2, Point2f b1, Point2f b2)
 {
+    // Try to solve a two lines intersection (a1, a2) and (b1, b2) as a system of equations:
+    // a2 + u * (a1 - a2) = b2 + v * (b1 - b2)
     const float divisor = (a1.x - a2.x) * (b1.y - b2.y) - (a1.y - a2.y) * (b1.x - b2.x);
     const float eps = 0.001f;
     if (abs(divisor) < eps)
         return a2;
-    Point2f result_square_angle(
-                              ((a1.x * a2.y  -  a1.y * a2.x) * (b1.x - b2.x) -
-                               (b1.x * b2.y  -  b1.y * b2.x) * (a1.x - a2.x)) /
-                               divisor,
-                              ((a1.x * a2.y  -  a1.y * a2.x) * (b1.y - b2.y) -
-                               (b1.x * b2.y  -  b1.y * b2.x) * (a1.y - a2.y)) /
-                               divisor
-                              );
-    return result_square_angle;
+    const float u = ((b2.x - a2.x) * (b1.y - b2.y) + (b1.x - b2.x) * (a2.y - b2.y)) / divisor;
+    return a2 + u * (a1 - a2);
 }
 
 //      / | b
@@ -470,16 +466,25 @@ bool QRDetect::localization()
     CV_TRACE_FUNCTION();
     Point2f begin, end;
     vector<Vec3d> list_lines_x = searchHorizontalLines();
-    if( list_lines_x.empty() ) { return false; }
-    vector<Point2f> list_lines_y = separateVerticalLines(list_lines_x);
-    if( list_lines_y.empty() ) { return false; }
-
+    vector<Point2f> list_lines_y;
     Mat labels;
-    kmeans(list_lines_y, 3, labels,
-           TermCriteria( TermCriteria::EPS + TermCriteria::COUNT, 10, 0.1),
-           3, KMEANS_PP_CENTERS, localization_points);
+    if (!list_lines_x.empty())
+    {
+        list_lines_y = separateVerticalLines(list_lines_x);
+        if (!list_lines_y.empty())
+        {
+            kmeans(list_lines_y, 3, labels,
+                TermCriteria( TermCriteria::EPS + TermCriteria::COUNT, 10, 0.1),
+                3, KMEANS_PP_CENTERS, localization_points);
 
-    fixationPoints(localization_points);
+            fixationPoints(localization_points);
+        }
+    }
+
+    if (labels.empty())
+    {
+        localization_points.clear();
+    }
 
     bool square_flag = false, local_points_flag = false;
     double triangle_sides[3];
@@ -1020,6 +1025,17 @@ class QRDecode
     float coeff_expansion = 1.f;
     vector<Point2f> getOriginalPoints() {return original_points;}
     bool useAlignmentMarkers;
+
+    // Structured Append mode generates a sequence of QR codes.
+    // Final message is restored according to the index of the code in sequence.
+    // Different QR codes are grouped by a parity value.
+    bool isStructured() { return mode == QRCodeEncoder::EncodeMode::MODE_STRUCTURED_APPEND; }
+    struct {
+        uint8_t parity = 0;
+        uint8_t sequence_num = 0;
+        uint8_t total_num = 1;
+    } structure_info;
+
 protected:
     double getNumModules();
     Mat getHomography() {
@@ -1072,6 +1088,8 @@ class QRDecode
     std::string result_info;
     uint8_t version, version_size;
     float test_perspective_size;
+    QRCodeEncoder::EncodeMode mode;
+
     struct sortPairAsc
     {
         bool operator()(const std::pair<size_t, double> &a,
@@ -1254,14 +1272,14 @@ bool QRDecode::computeSidesPoints(const vector<Point> &result_integer_hull)
         {
             if (points.front().x > points.back().x)
             {
-                reverse(points.begin(), points.end());
+                std::reverse(points.begin(), points.end());
             }
         }
         else
         {
             if (points.front().y > points.back().y)
             {
-                reverse(points.begin(), points.end());
+                std::reverse(points.begin(), points.end());
             }
         }
         if (points.empty())
@@ -1554,9 +1572,9 @@ Point QRDecode::findClosestZeroPoint(Point2f original_point)
     Point zero_point;
 
     const int step = 2;
-    for (int i = orig_x - step; i >= 0 && i <= orig_x + step; i++)
+    for (int i = std::max(orig_x - step, 0); i >= 0 && i <= std::min(orig_x + step, bin_barcode.cols - 1); i++)
     {
-        for (int j = orig_y - step; j >= 0 && j <= orig_y + step; j++)
+        for (int j = std::max(orig_y - step, 0); j >= 0 && j <= std::min(orig_y + step, bin_barcode.rows - 1); j++)
         {
             Point p(i, j);
             value = bin_barcode.at<uint8_t>(p);
@@ -1637,7 +1655,7 @@ bool QRDecode::findPatternsVerticesPoints(vector<vector<Point> > &patterns_verti
             }
             if ((int)min_angle_pnts_indexes.size() == num_vertices) { break; }
         }
-        sort(min_angle_pnts_indexes.begin(), min_angle_pnts_indexes.end());
+        std::sort(min_angle_pnts_indexes.begin(), min_angle_pnts_indexes.end());
 
         vector<Point> contour_vertices_points;
 
@@ -1766,11 +1784,11 @@ bool QRDecode::findTempPatternsAddingPoints(vector<std::pair<int, vector<Point>
             }
             if (abs(p1.x - p2.x) > abs(p1.y - p2.y))
             {
-                sort(points.begin(), points.end(), sortPointsByX());
+                std::sort(points.begin(), points.end(), sortPointsByX());
             }
             else
             {
-                sort(points.begin(), points.end(), sortPointsByY());
+                std::sort(points.begin(), points.end(), sortPointsByY());
             }
 
             temp_patterns_add_points.push_back(std::pair<int, vector<Point> >(idx_curved_side,points));
@@ -1914,11 +1932,11 @@ void QRDecode::completeAndSortSides()
         Point p2 = it->second.back();
         if (abs(p1.x - p2.x) > abs(p1.y - p2.y))
         {
-            sort(it->second.begin(), it->second.end(), sortPointsByX());
+            std::sort(it->second.begin(), it->second.end(), sortPointsByX());
         }
         else
         {
-            sort(it->second.begin(), it->second.end(), sortPointsByY());
+            std::sort(it->second.begin(), it->second.end(), sortPointsByY());
         }
     }
 }
@@ -1934,7 +1952,7 @@ vector<vector<float> > QRDecode::computeSpline(const vector<int> &x_arr, const v
     }
     for (int i = 0; i < n - 1; i++)
     {
-        h[i] = static_cast<float>(y_arr[i + 1] - y_arr[i]);
+        h[i] = static_cast<float>(y_arr[i + 1] - y_arr[i]) + std::numeric_limits<float>::epsilon();
     }
     for (int i = 1; i < n - 1; i++)
     {
@@ -2080,8 +2098,8 @@ bool QRDecode::divideIntoEvenSegments(vector<vector<Point2f> > &segments_points)
                 Point2f segment_start = segments_points[i][j];
                 Point2f segment_end   = segments_points[i][j + 1];
                 vector<Point2f>::iterator it_start, it_end, it;
-                it_start = find(spline_lines[i].begin(), spline_lines[i].end(), segment_start);
-                it_end   = find(spline_lines[i].begin(), spline_lines[i].end(), segment_end);
+                it_start = std::find(spline_lines[i].begin(), spline_lines[i].end(), segment_start);
+                it_end   = std::find(spline_lines[i].begin(), spline_lines[i].end(), segment_end);
                 float max_dist_to_line = 0.0;
                 for (it = it_start; it != it_end; it++)
                 {
@@ -2732,8 +2750,62 @@ bool QRDecode::samplingForVersion()
     return true;
 }
 
+static bool checkASCIIcompatible(const uint8_t* str, const size_t size) {
+    for (size_t i = 0; i < size; ++i) {
+        uint8_t byte = str[i];
+        if (byte >= 0x80)
+            return false;
+    }
+    return true;
+}
+
+static bool checkUTF8(const uint8_t* str, const size_t size) {
+    for (size_t i = 0; i < size; ++i) {
+        uint8_t byte = str[i];
+        if (byte >= 0x80) {
+            // Check that symbol is encoded correctly.
+
+            // Count number of bytes per symbol as a number of leading non-zero bits
+            uint8_t numBytesPerSymbol;
+            if ((byte & 0xe0) == 0xc0)
+                numBytesPerSymbol = 2;
+            else if ((byte & 0xf0) == 0xe0)
+                numBytesPerSymbol = 3;
+            else if ((byte & 0xf8) == 0xf0)
+                numBytesPerSymbol = 4;
+            else
+                return false;
+
+            for (size_t j = 1; j < numBytesPerSymbol; ++j) {
+                if (i + j >= size || (str[i + j] & 0xc0) != 0x80) {
+                    return false;
+                }
+            }
+            i += numBytesPerSymbol - 1;
+        }
+    }
+    return true;
+}
+
+static std::string encodeUTF8_bytesarray(const uint8_t* str, const size_t size) {
+    std::ostringstream res;
+    for (size_t i = 0; i < size; ++i) {
+        uint8_t byte = str[i];
+        if (byte >= 0x80) {
+            res << (char)(0xc0 | (byte >> 6));
+            res << (char)(0x80 | (byte & 0x3f));
+        } else {
+            res << (char)byte;
+        }
+    }
+    return res.str();
+}
+
 bool QRDecode::decodingProcess()
 {
+    QRCodeEncoder::ECIEncodings eci;
+    const uint8_t* payload;
+    size_t payload_len;
 #ifdef HAVE_QUIRC
     if (straight.empty()) { return false; }
 
@@ -2761,20 +2833,87 @@ bool QRDecode::decodingProcess()
 
     if (errorCode != 0) { return false; }
 
-    for (int i = 0; i < qr_code_data.payload_len; i++)
-    {
-        result_info += qr_code_data.payload[i];
-    }
-    return true;
+    CV_LOG_INFO(NULL, "QR: decoded with .version=" << qr_code_data.version << " .data_type=" << qr_code_data.data_type << " .eci=" << qr_code_data.eci << " .payload_len=" << qr_code_data.payload_len)
+
+    mode = static_cast<QRCodeEncoder::EncodeMode>(qr_code_data.data_type);
+    eci = static_cast<QRCodeEncoder::ECIEncodings>(qr_code_data.eci);
+    payload = qr_code_data.payload;
+    payload_len = qr_code_data.payload_len;
 #else
-    return false;
+    auto decoder = QRCodeDecoder::create();
+    if (!decoder->decode(straight, result_info))
+        return false;
+    mode = decoder->mode;
+    eci = decoder->eci;
+    payload = reinterpret_cast<const uint8_t*>(result_info.c_str());
+    payload_len = result_info.size();
+    structure_info.parity = decoder->parity;
+    structure_info.sequence_num = decoder->sequence_num;
+    structure_info.total_num = decoder->total_num;
 #endif
 
+    // Check output string format
+    switch (mode)
+    {
+        case QRCodeEncoder::EncodeMode::MODE_NUMERIC:
+            if (!checkASCIIcompatible(payload, payload_len)) {
+                CV_LOG_INFO(NULL, "QR: DATA_TYPE_NUMERIC payload must be ACSII compatible string");
+                return false;
+            }
+            result_info.assign((const char*)payload, payload_len);
+            return true;
+        case QRCodeEncoder::EncodeMode::MODE_ALPHANUMERIC:
+            if (!checkASCIIcompatible(payload, payload_len)) {
+                CV_LOG_INFO(NULL, "QR: DATA_TYPE_ALPHA payload must be ASCII compatible string");
+                return false;
+            }
+            result_info.assign((const char*)payload, payload_len);
+            return true;
+        case QRCodeEncoder::EncodeMode::MODE_BYTE:
+            // https://en.wikipedia.org/wiki/Extended_Channel_Interpretation
+            if (eci == QRCodeEncoder::ECIEncodings::ECI_UTF8) {
+                CV_LOG_INFO(NULL, "QR: payload ECI is UTF-8");
+                if (!checkUTF8(payload, payload_len)) {
+                    CV_LOG_INFO(NULL, "QUIRC_DATA_TYPE_BYTE with UTF-8 ECI must be UTF-8 compatible string");
+                    return false;
+                }
+                result_info.assign((const char*)payload, payload_len);
+            } else if (eci == 25/*ECI_UTF_16BE*/) {
+                CV_LOG_INFO(NULL, "QR: UTF-16BE ECI is not supported");
+                return false;
+            } else if (checkASCIIcompatible(payload, payload_len)) {
+                CV_LOG_INFO(NULL, "QR: payload is ASCII compatible (special handling for symbols encoding is not needed)");
+                result_info.assign((const char*)payload, payload_len);
+            } else {
+                if (checkUTF8(payload, payload_len)) {
+                    CV_LOG_INFO(NULL, "QR: payload QUIRC_DATA_TYPE_BYTE is UTF-8 compatible, return as-is");
+                    result_info.assign((const char*)payload, payload_len);
+                } else {
+                    CV_LOG_INFO(NULL, "QR: assume 1-byte per symbol encoding");
+                    result_info = encodeUTF8_bytesarray(payload, payload_len);
+                }
+            }
+            return true;
+        case QRCodeEncoder::EncodeMode::MODE_KANJI:
+            // FIXIT BUG: we must return UTF-8 compatible string
+            CV_LOG_WARNING(NULL, "QR: Kanji is not supported properly");
+            result_info.assign((const char*)payload, payload_len);
+            return true;
+        case QRCodeEncoder::EncodeMode::MODE_ECI:
+            CV_LOG_WARNING(NULL, "QR: ECI is not supported properly");
+            result_info.assign((const char*)payload, payload_len);
+            return true;
+        case QRCodeEncoder::EncodeMode::MODE_STRUCTURED_APPEND:
+            result_info.assign((const char*)payload, payload_len);
+            return true;
+        default:
+            CV_LOG_WARNING(NULL, "QR: unsupported QR data type");
+            return false;
+    }
 }
 
 bool QRDecode::straightDecodingProcess()
 {
-#ifdef HAVE_QUIRC
     if (!updatePerspective(getHomography()))  { return false; }
     if (!versionDefinition())  { return false; }
     if (useAlignmentMarkers)
@@ -2782,31 +2921,23 @@ bool QRDecode::straightDecodingProcess()
     if (!samplingForVersion()) { return false; }
     if (!decodingProcess())    { return false; }
     return true;
-#else
-    std::cout << "Library QUIRC is not linked. No decoding is performed. Take it to the OpenCV repository." << std::endl;
-    return false;
-#endif
 }
 
 bool QRDecode::curvedDecodingProcess()
 {
-#ifdef HAVE_QUIRC
     if (!preparingCurvedQRCodes()) { return false; }
     if (!versionDefinition())  { return false; }
     if (!samplingForVersion()) { return false; }
     if (!decodingProcess())    { return false; }
     return true;
-#else
-    std::cout << "Library QUIRC is not linked. No decoding is performed. Take it to the OpenCV repository." << std::endl;
-    return false;
-#endif
 }
 
 QRDecode::QRDecode(bool _useAlignmentMarkers):
     useAlignmentMarkers(_useAlignmentMarkers),
     version(0),
     version_size(0),
-    test_perspective_size(0.f)
+    test_perspective_size(0.f),
+    mode(QRCodeEncoder::EncodeMode::MODE_AUTO)
     {}
 
 std::string ImplContour::decode(InputArray in, InputArray points, OutputArray straight_qrcode) const {
@@ -2949,7 +3080,10 @@ class QRDetectMulti : public QRDetect
     {
         bool operator()(const Point2f& a, const Point2f& b) const
         {
-            return a.y < b.y;
+            if (a.y != b.y)
+                return a.y < b.y;
+            else
+                return a.x < b.x;
         }
     };
     struct compareSquare
@@ -3973,11 +4107,44 @@ bool ImplContour::decodeMulti(
         }
         straight_qrcode.assign(tmp_straight_qrcodes);
     }
+
     decoded_info.clear();
     for (size_t i = 0; i < info.size(); i++)
     {
-       decoded_info.push_back(info[i]);
+        auto& decoder = qrdec[i];
+        if (!decoder.isStructured())
+        {
+            decoded_info.push_back(info[i]);
+            continue;
+        }
+
+        // Store final message corresponding to 0-th code in a sequence.
+        if (decoder.structure_info.sequence_num != 0)
+        {
+            decoded_info.push_back("");
+            continue;
+        }
+
+        cv::String decoded = info[i];
+        for (size_t idx = 1; idx < decoder.structure_info.total_num; ++idx)
+        {
+            auto it = std::find_if(qrdec.begin(), qrdec.end(), [&](QRDecode& dec) {
+                return dec.structure_info.parity == decoder.structure_info.parity &&
+                       dec.structure_info.sequence_num == idx;
+            });
+            if (it != qrdec.end())
+            {
+                decoded += info[it - qrdec.begin()];
+            }
+            else
+            {
+                decoded = "";
+                break;
+            }
+        }
+        decoded_info.push_back(decoded);
     }
+
     alignmentMarkers.resize(src_points.size());
     updateQrCorners.resize(src_points.size()*4ull);
     for (size_t i = 0ull; i < src_points.size(); i++) {
@@ -4382,25 +4549,14 @@ static
 vector<QRCode> analyzeFinderPatterns(const vector<vector<Point2f> > &corners, const Mat& img,
                                      const QRCodeDetectorAruco::Params& qrDetectorParameters) {
     vector<QRCode> qrCodes;
-    vector<FinderPatternInfo> patterns;
+    vector<FinderPatternInfo> patterns(corners.size());
     if (img.empty())
         return qrCodes;
     float maxModuleSize = 0.f;
     for (size_t i = 0ull; i < corners.size(); i++) {
         FinderPatternInfo pattern = FinderPatternInfo(corners[i]);
-        // TODO: improve thinning Aruco markers
-        bool isUniq = true;
-        for (const FinderPatternInfo& tmp : patterns) {
-            Point2f dist = pattern.center - tmp.center;
-            if (max(abs(dist.x), abs(dist.y)) < 3.f * tmp.moduleSize) {
-                isUniq = false;
-                break;
-            }
-        }
-        if (isUniq) {
-            patterns.push_back(pattern);
-            maxModuleSize = max(maxModuleSize, patterns.back().moduleSize);
-        }
+        patterns[i] = pattern;
+        maxModuleSize = max(maxModuleSize, pattern.moduleSize);
     }
     const int threshold = cvRound(qrDetectorParameters.minModuleSizeInPyramid * 12.5f) +
                           (cvRound(qrDetectorParameters.minModuleSizeInPyramid * 12.5f) % 2 ? 0 : 1);
@@ -4460,13 +4616,13 @@ vector<QRCode> analyzeFinderPatterns(const vector<vector<Point2f> > &corners, co
 struct PimplQRAruco : public ImplContour {
     QRCodeDetectorAruco::Params qrParams;
     aruco::ArucoDetector arucoDetector;
-    aruco::DetectorParameters arucoParams;
 
     PimplQRAruco() {
         Mat bits = Mat::ones(Size(5, 5), CV_8UC1);
         Mat(bits, Rect(1, 1, 3, 3)).setTo(Scalar(0));
         Mat byteList = aruco::Dictionary::getByteListFromBits(bits);
         aruco::Dictionary dictionary = aruco::Dictionary(byteList, 5, 4);
+        aruco::DetectorParameters arucoParams;
         arucoParams.minMarkerPerimeterRate = 0.02;
         arucoDetector = aruco::ArucoDetector(dictionary, arucoParams);
     }
@@ -4540,12 +4696,12 @@ QRCodeDetectorAruco& QRCodeDetectorAruco::setDetectorParameters(const QRCodeDete
     return *this;
 }
 
-aruco::DetectorParameters QRCodeDetectorAruco::getArucoParameters() {
-    return std::dynamic_pointer_cast<PimplQRAruco>(p)->arucoParams;
+const aruco::DetectorParameters& QRCodeDetectorAruco::getArucoParameters() const {
+    return std::dynamic_pointer_cast<PimplQRAruco>(p)->arucoDetector.getDetectorParameters();
 }
 
 void QRCodeDetectorAruco::setArucoParameters(const aruco::DetectorParameters& params) {
-    std::dynamic_pointer_cast<PimplQRAruco>(p)->arucoParams = params;
+    std::dynamic_pointer_cast<PimplQRAruco>(p)->arucoDetector.setDetectorParameters(params);
 }
 
 }  // namespace
diff --git a/modules/objdetect/src/qrcode_encoder.cpp b/modules/objdetect/src/qrcode_encoder.cpp
index 24a954889917..41b69ebe46b5 100644
--- a/modules/objdetect/src/qrcode_encoder.cpp
+++ b/modules/objdetect/src/qrcode_encoder.cpp
@@ -6,6 +6,8 @@
 
 #include "precomp.hpp"
 #include "qrcode_encoder_table.inl.hpp"
+#include "graphical_code_detector_impl.hpp"
+
 namespace cv
 {
 using std::vector;
@@ -19,6 +21,7 @@ const uint8_t INVALID_REGION_VALUE = 110;
 static void decToBin(const int dec_number, const int total_bits, std::vector<uint8_t> &bin_number);
 static uint8_t gfPow(uint8_t x, int power);
 static uint8_t gfMul(const uint8_t x, const uint8_t y);
+static uint8_t gfDiv(const uint8_t x, const uint8_t y);
 static void gfPolyMul(const vector<uint8_t> &p, const vector<uint8_t> &q, vector<uint8_t> &product);
 static void gfPolyDiv(const vector<uint8_t> &dividend, const vector<uint8_t> &divisor, const int ecc_num, vector<uint8_t> &quotient);
 static void polyGenerator(const int n, vector<uint8_t> &result);
@@ -51,6 +54,13 @@ static uint8_t gfMul(const uint8_t x, const uint8_t y)
     return gf_exp[(gf_log[x] + gf_log[y]) % 255];
 }
 
+static uint8_t gfDiv(const uint8_t x, const uint8_t y)
+{
+    if (x == 0 || y == 0)
+        return 0;
+    return gf_exp[(gf_log[x] + 255 - gf_log[y]) % 255];
+}
+
 static void gfPolyMul(const vector<uint8_t> &p, const vector<uint8_t> &q, vector<uint8_t> &product)
 {
     int len_p = (int)p.size();
@@ -141,6 +151,8 @@ static int mapSymbol(char c)
     return -1;
 }
 
+static void maskData(const Mat& original, const int mask_type_num, Mat &masked);
+
 QRCodeEncoder::QRCodeEncoder()
 {
     // nothing
@@ -196,17 +208,18 @@ class QRCodeEncoderImpl : public QRCodeEncoder
     uint8_t total_num;
     vector<Mat> final_qrcodes;
 
-    Ptr<VersionInfo> version_info;
-    Ptr<BlockParams> cur_ecc_params;
+    const VersionInfo* version_info;
+    const BlockParams* cur_ecc_params;
 
-    bool isNumeric(const std::string& input);
-    bool isAlphaNumeric(const std::string& input);
+    bool isNumeric(const std::string& input) const;
+    bool isAlphaNumeric(const std::string& input) const;
+    EncodeMode autoEncodeMode(const std::string &input) const ;
     bool encodeByte(const std::string& input, vector<uint8_t> &output);
     bool encodeAlpha(const std::string& input, vector<uint8_t> &output);
     bool encodeNumeric(const std::string& input, vector<uint8_t> &output);
     bool encodeECI(const std::string& input, vector<uint8_t> &output);
     bool encodeKanji(const std::string& input, vector<uint8_t> &output);
-    bool encodeAuto(const std::string& input, vector<uint8_t> &output);
+    bool encodeAuto(const std::string& input, vector<uint8_t> &output, EncodeMode *mode = nullptr);
     bool encodeStructure(const std::string& input, vector<uint8_t> &output);
     int eccLevelToCode(CorrectionLevel level);
     void padBitStream();
@@ -220,11 +233,10 @@ class QRCodeEncoderImpl : public QRCodeEncoder
     void formatGenerate(const int mask_type_num, vector<uint8_t> &format_array);
     void versionInfoGenerate(const int version_level_num, vector<uint8_t> &version_array);
     void fillReserved(const vector<uint8_t> &format_array, Mat &masked);
-    void maskData(const int mask_type_num, Mat &masked);
     void findAutoMaskType();
-    bool estimateVersion(const int input_length, vector<int> &possible_version);
+    bool estimateVersion(const int input_length, EncodeMode mode, vector<int> &possible_version);
     int versionAuto(const std::string &input_str);
-    int findVersionCapacity(const int input_length, const int ecc, const int version_begin, const int version_end);
+    int findVersionCapacity(const int input_length, const int ecc, const std::vector<int>& possible_versions);
     void generatingProcess(const std::string& input, Mat &qrcode);
     void generateQR(const std::string& input);
 };
@@ -247,17 +259,17 @@ int QRCodeEncoderImpl::eccLevelToCode(CorrectionLevel level)
         "CORRECT_LEVEL_L, CORRECT_LEVEL_M, CORRECT_LEVEL_Q, CORRECT_LEVEL_H." );
 }
 
-int QRCodeEncoderImpl::findVersionCapacity(const int input_length, const int ecc, const int version_begin, const int version_end)
+int QRCodeEncoderImpl::findVersionCapacity(const int input_length, const int ecc, const std::vector<int>& possible_versions)
 {
     int data_codewords, version_index = -1;
     const int byte_len = 8;
     version_index = -1;
 
-    for (int i = version_begin; i < version_end; i++)
+    for (int i : possible_versions)
     {
-        Ptr<BlockParams> tmp_ecc_params = makePtr<BlockParams>(version_info_database[i].ecc[ecc]);
-        data_codewords = tmp_ecc_params->data_codewords_in_G1 * tmp_ecc_params->num_blocks_in_G1 +
-                         tmp_ecc_params->data_codewords_in_G2 * tmp_ecc_params->num_blocks_in_G2;
+        auto& tmp_ecc_params = version_info_database[i].ecc[ecc];
+        data_codewords = tmp_ecc_params.data_codewords_in_G1 * tmp_ecc_params.num_blocks_in_G1 +
+                         tmp_ecc_params.data_codewords_in_G2 * tmp_ecc_params.num_blocks_in_G2;
 
         if (data_codewords * byte_len >= input_length)
         {
@@ -268,53 +280,76 @@ int QRCodeEncoderImpl::findVersionCapacity(const int input_length, const int ecc
     return version_index;
 }
 
-bool QRCodeEncoderImpl::estimateVersion(const int input_length, vector<int>& possible_version)
+static inline int getCapacity(int version, QRCodeEncoder::CorrectionLevel ecc_level, QRCodeEncoder::EncodeMode mode) {
+    const int* capacity = version_capacity_database[version].ec_level[ecc_level].encoding_modes;
+    switch (mode) {
+        case QRCodeEncoder::EncodeMode::MODE_NUMERIC:
+            return capacity[0];
+        case QRCodeEncoder::EncodeMode::MODE_ALPHANUMERIC:
+            return capacity[1];
+        case QRCodeEncoder::EncodeMode::MODE_BYTE:
+            return capacity[2];
+        case QRCodeEncoder::EncodeMode::MODE_KANJI:
+            return capacity[3];
+        default:
+            CV_Error(Error::StsNotImplemented, format("Unexpected mode %d", mode));
+    }
+}
+
+bool QRCodeEncoderImpl::estimateVersion(const int input_length, EncodeMode mode, vector<int>& possible_version)
 {
     possible_version.clear();
-    if (input_length > version_capacity_database[40].ec_level[ecc_level].encoding_modes[1])
-        return false;
-    if (input_length <= version_capacity_database[9].ec_level[ecc_level].encoding_modes[3])
-    {
-        possible_version.push_back(1);
-    }
-    else if (input_length <= version_capacity_database[9].ec_level[ecc_level].encoding_modes[1])
+
+    CV_Assert(mode != EncodeMode::MODE_AUTO);
+
+    if (input_length > getCapacity(MAX_VERSION, ecc_level, mode))
     {
-        possible_version.push_back(1);
-        possible_version.push_back(2);
+        return false;
     }
-    else if (input_length <= version_capacity_database[26].ec_level[ecc_level].encoding_modes[3])
+
+    int version = MAX_VERSION;
+
+    for (; version > 0; --version)
     {
-        possible_version.push_back(2);
+        if (input_length > getCapacity(version, ecc_level, mode)) {
+            break;
+        }
     }
-    else if (input_length <= version_capacity_database[26].ec_level[ecc_level].encoding_modes[1])
+
+    if (version < MAX_VERSION)
     {
-        possible_version.push_back(2);
-        possible_version.push_back(3);
+        version += 1;
     }
-    else
+
+    possible_version.push_back(version);
+
+    if (version < MAX_VERSION)
     {
-        possible_version.push_back(3);
+        possible_version.push_back(version + 1);
     }
+
     return true;
 }
 
 int QRCodeEncoderImpl::versionAuto(const std::string& input_str)
 {
-    vector<int> possible_version;
-    estimateVersion((int)input_str.length(), possible_version);
-    int tmp_version = 0;
     vector<uint8_t> payload_tmp;
-    int version_range[5] = {0, 1, 10, 27, 41};
-    for(size_t i = 0; i < possible_version.size(); i++)
-    {
-        int version_range_index = possible_version[i];
+    EncodeMode mode;
+    encodeAuto(input_str, payload_tmp, &mode);
 
-        encodeAuto(input_str, payload_tmp);
-        tmp_version = findVersionCapacity((int)payload_tmp.size(), ecc_level,
-                                version_range[version_range_index], version_range[version_range_index + 1]);
-        if(tmp_version != -1)
-            break;
+    vector<int> possible_version;
+    if (!estimateVersion((int)input_str.length(), mode, possible_version)) {
+        return -1;
     }
+
+    int nbits = static_cast<int>(payload_tmp.size());
+
+    // Extra info for structure's position, total and parity + mode of final message
+    if (mode_type == MODE_STRUCTURED_APPEND)
+        nbits += 4 + 4 + 8 + 4;
+
+    const auto tmp_version = findVersionCapacity(nbits, ecc_level, possible_version);
+
     return tmp_version;
 }
 
@@ -336,26 +371,29 @@ void QRCodeEncoderImpl::generateQR(const std::string &input)
     auto string_itr = input.begin();
     for (int i = struct_num; i > 0; --i)
     {
-        sequence_num = (uint8_t) i;
+        sequence_num = (uint8_t) (struct_num - i);
         size_t segment_begin = string_itr - input.begin();
         size_t segment_end = (input.end() - string_itr) / i;
 
         std::string input_info = input.substr(segment_begin, segment_end);
         string_itr += segment_end;
+
         int detected_version = versionAuto(input_info);
-        CV_Assert(detected_version != -1);
-        if (version_level == 0)
-            version_level = detected_version;
-        else if (version_level < detected_version)
+        int tmp_version_level = version_level;
+        if (detected_version == -1)
+            CV_Error(Error::StsBadArg, "The given input exceeds the maximum capacity of a QR code with the selected encoding mode and error correction level " );
+        else if (tmp_version_level == 0)
+            tmp_version_level = detected_version;
+        else if (tmp_version_level < detected_version)
             CV_Error(Error::StsBadArg, "The given version is not suitable for the given input string length ");
 
         payload.clear();
         payload.reserve(MAX_PAYLOAD_LEN);
         format = vector<uint8_t> (15, 255);
         version_reserved = vector<uint8_t> (18, 255);
-        version_size = (21 + (version_level - 1) * 4);
-        version_info = makePtr<VersionInfo>(version_info_database[version_level]);
-        cur_ecc_params = makePtr<BlockParams>(version_info->ecc[ecc_level]);
+        version_size = (21 + (tmp_version_level - 1) * 4);
+        version_info = &version_info_database[tmp_version_level];
+        cur_ecc_params = &version_info->ecc[ecc_level];
         original = Mat(Size(version_size, version_size), CV_8UC1, Scalar(255));
         masked_data = original.clone();
         Mat qrcode = masked_data.clone();
@@ -366,36 +404,10 @@ void QRCodeEncoderImpl::generateQR(const std::string &input)
 
 void QRCodeEncoderImpl::formatGenerate(const int mask_type_num, vector<uint8_t> &format_array)
 {
-    const int mask_bits_num = 3;
-    const int level_bits_num = 2;
-
-    std::vector<uint8_t> mask_type_bin(mask_bits_num);
-    std::vector<uint8_t> ec_level_bin(level_bits_num);
-    decToBin(mask_type_num, mask_bits_num, mask_type_bin);
-    decToBin(eccLevelToCode(ecc_level), level_bits_num, ec_level_bin);
-
-    std::vector<uint8_t> format_bits;
-    hconcat(ec_level_bin, mask_type_bin, format_bits);
-    std::reverse(format_bits.begin(), format_bits.end());
-
-    const int ecc_info_bits = 10;
-
-    std::vector<uint8_t> shift(ecc_info_bits, 0);
-    std::vector<uint8_t> polynomial;
-    hconcat(shift, format_bits, polynomial);
-
-    const int generator_len = 11;
-    const uint8_t generator_arr[generator_len] = {1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1};
-    std::vector<uint8_t> format_generator (generator_arr, generator_arr + sizeof(generator_arr) / sizeof(generator_arr[0]));
-    vector<uint8_t> ecc_code;
-    gfPolyDiv(polynomial, format_generator, ecc_info_bits, ecc_code);
-    hconcat(ecc_code, format_bits, format_array);
-
-    const uint8_t mask_arr[MAX_FORMAT_LENGTH] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1};
-    std::vector<uint8_t> system_mask (mask_arr, mask_arr + sizeof(mask_arr) / sizeof(mask_arr[0]));
-    for(int i = 0; i < MAX_FORMAT_LENGTH; i++)
-    {
-        format_array[i] ^= system_mask[i];
+    int idx = (eccLevelToCode(ecc_level) << 3) | mask_type_num;
+    format_array.resize(MAX_FORMAT_LENGTH);
+    for (int i = 0; i < MAX_FORMAT_LENGTH; ++i) {
+        format_array[i] = (formatInfoLUT[idx] >> i) & 1;
     }
 }
 
@@ -613,7 +625,7 @@ bool QRCodeEncoderImpl::encodeStructure(const std::string& input, vector<uint8_t
     return encodeAuto(input, output);
 }
 
-bool QRCodeEncoderImpl::isNumeric(const std::string& input)
+bool QRCodeEncoderImpl::isNumeric(const std::string& input) const
 {
     for (size_t i = 0; i < input.length(); i++)
     {
@@ -623,7 +635,7 @@ bool QRCodeEncoderImpl::isNumeric(const std::string& input)
     return true;
 }
 
-bool QRCodeEncoderImpl::isAlphaNumeric(const std::string& input)
+bool QRCodeEncoderImpl::isAlphaNumeric(const std::string& input) const
 {
     for (size_t i = 0; i < input.length(); i++)
     {
@@ -633,14 +645,56 @@ bool QRCodeEncoderImpl::isAlphaNumeric(const std::string& input)
     return true;
 }
 
-bool QRCodeEncoderImpl::encodeAuto(const std::string& input, vector<uint8_t>& output)
+QRCodeEncoder::EncodeMode QRCodeEncoderImpl::autoEncodeMode(const std::string &input) const
 {
     if (isNumeric(input))
-        encodeNumeric(input, output);
-    else if (isAlphaNumeric(input))
-        encodeAlpha(input, output);
-    else
-        encodeByte(input, output);
+    {
+        return EncodeMode::MODE_NUMERIC;
+    }
+
+    if (isAlphaNumeric(input))
+    {
+        return EncodeMode::MODE_ALPHANUMERIC;
+    }
+
+    return EncodeMode::MODE_BYTE;
+}
+
+bool QRCodeEncoderImpl::encodeAuto(const std::string& input, vector<uint8_t>& output, EncodeMode *mode)
+{
+    const auto selected_mode = autoEncodeMode(input);
+
+    CV_Assert(selected_mode != EncodeMode::MODE_AUTO);
+
+    switch (selected_mode)
+    {
+        case EncodeMode::MODE_NUMERIC:
+            encodeNumeric(input, output);
+            break;
+        case EncodeMode::MODE_ALPHANUMERIC:
+            encodeAlpha(input, output);
+            break;
+        case EncodeMode::MODE_STRUCTURED_APPEND:
+            encodeByte(input, output);
+            break;
+        case EncodeMode::MODE_BYTE:
+            encodeByte(input, output);
+            break;
+        case EncodeMode::MODE_KANJI:
+            encodeKanji(input, output);
+            break;
+        case EncodeMode::MODE_ECI:
+            encodeECI(input, output);
+            break;
+        default:
+            break;
+    }
+
+    if (mode != nullptr)
+    {
+        *mode = selected_mode;
+    }
+
     return true;
 }
 
@@ -703,7 +757,7 @@ bool QRCodeEncoderImpl::stringToBits(const std::string& input_info)
         default:
             return encodeAuto(input_info, payload);
     }
-};
+}
 
 void QRCodeEncoderImpl::eccGenerate(vector<vector<uint8_t> > &data_blocks, vector<vector<uint8_t> > &ecc_blocks)
 {
@@ -787,7 +841,7 @@ void QRCodeEncoderImpl::findAutoMaskType()
     {
         Mat test_result = masked_data.clone();
         vector<uint8_t> test_format = format;
-        maskData(cur_type, test_result);
+        maskData(original, cur_type, test_result);
         formatGenerate(cur_type, test_format);
         fillReserved(test_format, test_result);
         int continued_num = 0;
@@ -899,8 +953,9 @@ void QRCodeEncoderImpl::findAutoMaskType()
     mask_type = best_index;
 }
 
-void QRCodeEncoderImpl::maskData(const int mask_type_num, Mat& masked)
+void maskData(const Mat& original, const int mask_type_num, Mat& masked)
 {
+    int version_size = original.rows;
     for (int i = 0; i < version_size; i++)
     {
         for (int j = 0; j < version_size; j++)
@@ -1204,7 +1259,7 @@ void QRCodeEncoderImpl::structureFinalMessage()
     writeReservedArea();
     writeData();
     findAutoMaskType();
-    maskData(mask_type, masked_data);
+    maskData(original, mask_type, masked_data);
     formatGenerate(mask_type, format);
     versionInfoGenerate(version_level, version_reserved);
     fillReserved(format, masked_data);
@@ -1260,4 +1315,527 @@ Ptr<QRCodeEncoder> QRCodeEncoder::create(const QRCodeEncoder::Params& parameters
     return makePtr<QRCodeEncoderImpl>(parameters);
 }
 
+class QRCodeDecoderImpl : public QRCodeDecoder {
+public:
+    bool decode(const Mat& straight, String& decoded_info) CV_OVERRIDE;
+
+private:
+    QRCodeEncoder::CorrectionLevel level;
+    int version;
+
+    struct Bitstream {
+        int next(int bits) {
+            CV_Assert(idx < data.size());
+
+            int val = 0;
+            while (bits >= actualBits) {
+                val |= data[idx++] << (bits - actualBits);
+                bits -= actualBits;
+                actualBits = 8;
+            }
+            if (bits) {
+                val |= data[idx] >> (actualBits - bits);
+                actualBits -= bits;
+                data[idx] &= 255 >> (8 - actualBits);
+            }
+            return val;
+        }
+
+        bool empty() {
+            return idx >= data.size();
+        }
+
+        std::vector<uint8_t> data;
+        int actualBits = 8;
+        size_t idx = 0;
+    } bitstream;
+
+    bool run(const Mat& straight, String& decoded_info);
+    bool decodeFormatInfo(const Mat& straight, int& mask);
+    bool correctFormatInfo(uint16_t& format_info);
+    void extractCodewords(Mat& source, std::vector<uint8_t>& codewords);
+    bool errorCorrection(std::vector<uint8_t>& codewords);
+    bool errorCorrectionBlock(std::vector<uint8_t>& codewords);
+    void decodeSymbols(String& result);
+    void decodeNumeric(String& result);
+    void decodeAlpha(String& result);
+    void decodeByte(String& result);
+    void decodeECI(String& result);
+    void decodeKanji(String& result);
+    void decodeStructuredAppend(String& result);
+};
+
+QRCodeDecoder::~QRCodeDecoder()
+{
+    // nothing
+}
+
+Ptr<QRCodeDecoder> QRCodeDecoder::create() {
+    return makePtr<QRCodeDecoderImpl>();
+}
+
+bool QRCodeDecoderImpl::decode(const Mat& _straight, String& decoded_info) {
+    Mat straight = ~_straight;  // Invert modules
+    bool decoded = run(straight, decoded_info);
+    if (!decoded) {
+        cv::transpose(straight, straight);
+        decoded = run(straight, decoded_info);
+    }
+    return decoded;
+}
+
+// Unmask format info bits and apply error correction
+bool QRCodeDecoderImpl::correctFormatInfo(uint16_t& format_info) {
+    static const uint16_t mask_pattern = 0b101010000010010;
+
+    cv::Hamming hd;
+    for (int i = 0; i < 32; ++i) {
+        // Compute Hamming distance
+        int distance = hd(reinterpret_cast<const unsigned char*>(&formatInfoLUT[i]),
+                          reinterpret_cast<const unsigned char*>(&format_info), 2);
+        // Up to 3 bit errors might be corrected.
+        // So if distance is less or equal than 3 - we found a correct format info.
+        if (distance <= 3) {
+            format_info = formatInfoLUT[i] ^ mask_pattern;
+            return true;
+        }
+    }
+    return false;
+}
+
+bool QRCodeDecoderImpl::decodeFormatInfo(const Mat& straight, int& mask) {
+    // Read left-top format info
+    uint16_t format_info = 0;
+    for (int i = 0; i < 6; ++i)
+        format_info |= (straight.at<uint8_t>(i, 8) & 1) << i;
+
+    format_info |= (straight.at<uint8_t>(7, 8) & 1) << 6;
+    format_info |= (straight.at<uint8_t>(8, 8) & 1) << 7;
+    format_info |= (straight.at<uint8_t>(8, 7) & 1) << 8;
+
+    for (int i = 9; i < 15; ++i)
+        format_info |= (straight.at<uint8_t>(8, 14 - i) & 1) << i;
+
+    bool correct = correctFormatInfo(format_info);
+
+    // Format information 15bit sequence appears twice.
+    // Try extract format info from different position.
+    uint16_t format_info_dup = 0;
+    for (int i = 0; i < 8; ++i)
+        format_info_dup |= (straight.at<uint8_t>(8, straight.cols - 1 - i) & 1) << i;
+    for (int i = 0; i < 7; ++i)
+        format_info_dup |= (straight.at<uint8_t>(straight.rows - 7 + i, 8) & 1) << (i + 8);
+
+    if (correctFormatInfo(format_info_dup)) {
+        // Both strings must be the same
+        if (correct && format_info != format_info_dup)
+            return false;
+        format_info = format_info_dup;
+    } else {
+        if (!correct)
+            return false;
+    }
+
+    switch((format_info >> 13) & 0b11) {
+        case 0: level = QRCodeEncoder::CorrectionLevel::CORRECT_LEVEL_M; break;
+        case 1: level = QRCodeEncoder::CorrectionLevel::CORRECT_LEVEL_L; break;
+        case 2: level = QRCodeEncoder::CorrectionLevel::CORRECT_LEVEL_H; break;
+        case 3: level = QRCodeEncoder::CorrectionLevel::CORRECT_LEVEL_Q; break;
+    };
+    mask = (format_info >> 10) & 0b111;
+    return true;
+}
+
+bool QRCodeDecoderImpl::run(const Mat& straight, String& decoded_info) {
+    CV_Assert(straight.rows == straight.cols);
+    version = (straight.rows - 21) / 4 + 1;
+
+    decoded_info = "";
+    mode = static_cast<QRCodeEncoder::EncodeMode>(0);
+    eci = static_cast<QRCodeEncoder::ECIEncodings>(0);
+
+    // Decode format info
+    int maskPattern;
+    bool decoded = decodeFormatInfo(straight, maskPattern);
+    if (!decoded) {
+        return false;
+    }
+
+    // Generate data mask
+    Mat masked = straight.clone();
+    maskData(straight, maskPattern, masked);
+
+    extractCodewords(masked, bitstream.data);
+    if (!errorCorrection(bitstream.data)) {
+        return false;
+    }
+    decodeSymbols(decoded_info);
+    return true;
+}
+
+bool QRCodeDecoderImpl::errorCorrection(std::vector<uint8_t>& codewords) {
+    CV_CheckEQ((int)codewords.size(), version_info_database[version].total_codewords,
+               "Number of codewords");
+
+    int numBlocks = version_info_database[version].ecc[level].num_blocks_in_G1 +
+                    version_info_database[version].ecc[level].num_blocks_in_G2;
+    if (numBlocks == 1) {
+        return errorCorrectionBlock(codewords);
+    }
+
+    size_t numData = 0;
+    std::vector<int> blockSizes;
+    blockSizes.reserve(numBlocks);
+    for (int i = 0; i < version_info_database[version].ecc[level].num_blocks_in_G1; ++i) {
+        blockSizes.push_back(version_info_database[version].ecc[level].data_codewords_in_G1);
+        numData += blockSizes.back();
+    }
+    for (int i = 0; i < version_info_database[version].ecc[level].num_blocks_in_G2; ++i) {
+        blockSizes.push_back(version_info_database[version].ecc[level].data_codewords_in_G2);
+        numData += blockSizes.back();
+    }
+
+    // TODO: parallel_for
+    std::vector<std::vector<uint8_t>> blocks(numBlocks);
+    int minBlockSize = *std::min_element(blockSizes.begin(), blockSizes.end());
+    size_t offset = 0;
+    for (int i = 0; i < minBlockSize; ++i) {
+        for (int j = 0; j < numBlocks; ++j) {
+            blocks[j].push_back(codewords[offset++]);
+        }
+    }
+    // Put remaining data codewords
+    for (int j = 0; j < numBlocks; ++j) {
+        CV_Assert(blockSizes[j] == minBlockSize || blockSizes[j] == minBlockSize + 1);
+        if (blockSizes[j] > minBlockSize)
+            blocks[j].push_back(codewords[offset++]);
+    }
+    // Copy error correction codewords
+    int numEcc = version_info_database[version].ecc[level].ecc_codewords;
+    for (int i = 0; i < numEcc; ++i) {
+        for (int j = 0; j < numBlocks; ++j) {
+            blocks[j].push_back(codewords[offset++]);
+        }
+    }
+
+    parallel_for_(Range(0, numBlocks), [&](const Range& r) {
+        for (int i = r.start; i < r.end; ++i) {
+            if (!errorCorrectionBlock(blocks[i])) {
+                blocks[i].clear();
+                return;
+            }
+        }
+    });
+
+    // Collect blocks back after error correction. Trim error correction codewords.
+    codewords.resize(numData);
+    offset = 0;
+    for (size_t i = 0; i < blocks.size(); ++i) {
+        if (blocks[i].empty())
+            return false;
+        std::copy(blocks[i].begin(), blocks[i].end(), codewords.begin() + offset);
+        offset += blocks[i].size();
+    }
+
+    return true;
+}
+
+bool QRCodeDecoderImpl::errorCorrectionBlock(std::vector<uint8_t>& codewords) {
+    size_t numEcc = version_info_database[version].ecc[level].ecc_codewords;
+    size_t numSyndromes = numEcc;
+
+    // According to the ISO there is a formula for a number of the syndromes.
+    // However several tests don't pass the error correction step because of less number of syndromes:
+    // 1M: qrcodes/detection/lots/image001.jpg from BoofCV (8 syndromes by formula, 10 needed)
+    // 1L: Objdetect_QRCode_Multi.regression/13 (4 syndromes by formula, 6 needed)
+    // 2L: qrcodes/detection/brightness/image011.jpg from BoofCV (8 syndromes by formula, 10 needed)
+    if (numSyndromes % 2 == 1)
+        numSyndromes -= 1;
+
+    // Compute syndromes
+    bool hasError = false;
+    std::vector<uint8_t> syndromes(numSyndromes, codewords[0]);
+    for (size_t i = 0; i < syndromes.size(); ++i) {
+        for (size_t j = 1; j < codewords.size(); ++j) {
+            syndromes[i] = gfMul(syndromes[i], gfPow(2, static_cast<int>(i))) ^ codewords[j];
+        }
+        hasError |= syndromes[i] != 0;
+    }
+    if (!hasError) {
+        // Trim error correction codewords
+        codewords.resize(codewords.size() - numEcc);
+        return true;
+    }
+
+    // Run Berlekamp–Massey algorithm to find error positions (coefficients of locator poly)
+    size_t L = 0;   // number of assumed errors
+    size_t m = 1;   // shift value (between C and B)
+    uint8_t b = 1;  // discrepancy from last L update
+
+    std::vector<uint8_t> C(numSyndromes, 0);  // Error locator polynomial
+    std::vector<uint8_t> B(numSyndromes, 0);  // A copy of error locator from previos L update
+    C[0] = B[0] = 1;
+    for (size_t i = 0; i < numSyndromes; ++i) {
+        CV_Assert(m + L - 1 < C.size());  // m >= 1 on any iteration
+        uint8_t discrepancy = syndromes[i];
+        for (size_t j = 1; j <= L; ++j) {
+            discrepancy ^= gfMul(C[j], syndromes[i - j]);
+        }
+
+        if (discrepancy == 0) {
+            m += 1;
+        } else {
+            std::vector<uint8_t> C_copy = C;
+            uint8_t inv_b = gfDiv(1, b);
+            uint8_t tmp = gfMul(discrepancy, inv_b);
+
+            for (size_t j = 0; j < L; ++j) {
+                C[m + j] ^= gfMul(tmp, B[j]);
+            }
+
+            if (2 * L <= i) {
+                L = i + 1 - L;
+                B = C_copy;
+                b = discrepancy;
+                m = 1;
+            } else {
+                m += 1;
+            }
+        }
+    }
+
+    // There is an error at i-th position if i is a root of locator poly
+    std::vector<size_t> errLocs;
+    errLocs.reserve(L);
+    for (size_t i = 0; i < codewords.size(); ++i) {
+        uint8_t val = 1;
+        uint8_t pos = gfPow(2, static_cast<int>(i));
+        for (size_t j = 1; j <= L; ++j) {
+            val = gfMul(val, pos) ^ C[j];
+        }
+        if (val == 0) {
+            errLocs.push_back(static_cast<int>(codewords.size() - 1 - i));
+        }
+    }
+
+    // Number of assumed errors does not match number of error locations
+    if (errLocs.size() != L)
+        return false;
+
+    // Forney algorithm for error correction using syndromes and known error locations
+    std::vector<uint8_t> errEval;
+    gfPolyMul(C, syndromes, errEval);
+
+    for (size_t i = 0; i < errLocs.size(); ++i) {
+        uint8_t numenator = 0, denominator = 0;
+        uint8_t X = gfPow(2, static_cast<int>(codewords.size() - 1 - errLocs[i]));
+        uint8_t inv_X = gfDiv(1, X);
+
+        for (size_t j = 0; j < L; ++j) {
+            numenator = gfMul(numenator, inv_X) ^ errEval[L - 1 - j];
+        }
+
+        // Compute demoninator as a product of (1-X_i * X_k) for i != k
+        // TODO: optimize, there is a dubplicated compute
+        denominator = 1;
+        for (size_t j = 0; j < errLocs.size(); ++j) {
+            if (i == j)
+                continue;
+            uint8_t Xj = gfPow(2, static_cast<int>(codewords.size() - 1 - errLocs[j]));
+            denominator = gfMul(denominator, 1 ^ gfMul(inv_X, Xj));
+        }
+
+        uint8_t errValue = gfDiv(numenator, denominator);
+        codewords[errLocs[i]] ^= errValue;
+    }
+
+    // Trim error correction codewords
+    codewords.resize(codewords.size() - numEcc);
+    return true;
+}
+
+void QRCodeDecoderImpl::extractCodewords(Mat& source, std::vector<uint8_t>& codewords) {
+    const VersionInfo& version_info = version_info_database[version];
+
+    // Mask alignment markers
+    std::vector<int> alignCenters;
+    alignCenters.reserve(MAX_ALIGNMENT);
+    for (int i = 0; i < MAX_ALIGNMENT && version_info.alignment_pattern[i]; i++)
+        alignCenters.push_back(version_info.alignment_pattern[i]);
+
+    for (size_t i = 0; i < alignCenters.size(); i++)
+    {
+        for (size_t j = 0; j < alignCenters.size(); j++)
+        {
+            if ((i == alignCenters.size() - 1 && j == 0) || (i == 0 && j == 0) ||
+                (j == alignCenters.size() - 1 && i == 0))
+                continue;
+            int x = alignCenters[i];
+            int y = alignCenters[j];
+            Mat area = source({x - 2, x + 3}, {y - 2, y + 3});
+            area.setTo(INVALID_REGION_VALUE);
+        }
+    }
+
+    // Mask detection markers
+    source.rowRange(0, 9).colRange(source.cols - 8, source.cols).setTo(INVALID_REGION_VALUE);
+    source.rowRange(0, 9).colRange(0, 9).setTo(INVALID_REGION_VALUE);
+    source.colRange(0, 9).rowRange(source.rows - 8, source.rows).setTo(INVALID_REGION_VALUE);
+
+    // Mask Version Information blocks
+    if (version >= 7) {
+        source.rowRange(0, 6).colRange(source.cols - 12, source.cols - 9).setTo(INVALID_REGION_VALUE);
+        source.colRange(0, 6).rowRange(source.rows - 12, source.rows - 9).setTo(INVALID_REGION_VALUE);
+    }
+
+    // Mask timing pattern
+    source.row(6) = INVALID_REGION_VALUE;
+
+    std::vector<uint8_t> bits;
+    bits.reserve(source.total() - source.cols);
+    bool moveUpwards = true;
+    for (auto& data : {source.colRange(7, source.cols), source.colRange(0, 6)}) {
+        for (int i = data.cols / 2 - 1; i >= 0; --i) {
+            Mat col0 = data.col(i * 2);
+            Mat col1 = data.col(i * 2 + 1);
+            for (int j = 0; j < data.rows; ++j) {
+                if (moveUpwards) {
+                    bits.push_back(col1.at<uint8_t>(data.rows - 1 - j));
+                    bits.push_back(col0.at<uint8_t>(data.rows - 1 - j));
+                } else {
+                    bits.push_back(col1.at<uint8_t>(j));
+                    bits.push_back(col0.at<uint8_t>(j));
+                }
+            }
+            moveUpwards = !moveUpwards;
+        }
+    }
+
+    // Combine bits to codewords
+    size_t numCodewords = version_info.total_codewords;
+    codewords.resize(numCodewords);
+
+    size_t offset = 0;
+    for (size_t i = 0; i < numCodewords; ++i) {
+        codewords[i] = 0;
+        for (size_t j = 0; j < 8; ++j) {
+            while (bits[offset] == INVALID_REGION_VALUE) {
+                offset += 1;
+                CV_Assert(offset < bits.size());
+            }
+            codewords[i] |= (bits[offset] & 1) << (7 - j);
+            offset += 1;
+        }
+    }
+}
+
+void QRCodeDecoderImpl::decodeSymbols(String& result) {
+    CV_Assert(!bitstream.empty());
+
+    // Decode depends on the mode
+    result = "";
+    while (!bitstream.empty()) {
+        // Determine mode
+        auto currMode = static_cast<QRCodeEncoder::EncodeMode>(bitstream.next(4));
+        if (this->mode == 0) {
+            mode = currMode;
+        }
+
+        if (currMode == 0 || bitstream.empty())
+            return;
+        if (currMode == QRCodeEncoder::EncodeMode::MODE_NUMERIC)
+            decodeNumeric(result);
+        else if (currMode == QRCodeEncoder::EncodeMode::MODE_ALPHANUMERIC)
+            decodeAlpha(result);
+        else if (currMode == QRCodeEncoder::EncodeMode::MODE_BYTE)
+            decodeByte(result);
+        else if (currMode == QRCodeEncoder::EncodeMode::MODE_ECI)
+            decodeECI(result);
+        else if (currMode == QRCodeEncoder::EncodeMode::MODE_KANJI)
+            decodeKanji(result);
+        else if (currMode == QRCodeEncoder::EncodeMode::MODE_STRUCTURED_APPEND) {
+            sequence_num = static_cast<uint8_t>(bitstream.next(4));
+            total_num = static_cast<uint8_t>(1 + bitstream.next(4));
+            parity = static_cast<uint8_t>(bitstream.next(8));
+        }
+        else
+            CV_Error(Error::StsNotImplemented, format("mode %d", currMode));
+    }
+}
+
+void QRCodeDecoderImpl::decodeNumeric(String& result) {
+    int numDigits = bitstream.next(version <= 9 ? 10 : (version <= 26 ? 12 : 14));
+    for (int i = 0; i < numDigits / 3; ++i) {
+        int triple = bitstream.next(10);
+        result += static_cast<char>('0' + triple / 100);
+        result += static_cast<char>('0' + (triple / 10) % 10);
+        result += static_cast<char>('0' + triple % 10);
+    }
+    int remainingDigits = numDigits % 3;
+    if (remainingDigits) {
+        int triple = bitstream.next(remainingDigits == 1 ? 4 : 7);
+        if (remainingDigits == 2)
+            result += '0' + (triple / 10) % 10;
+        result += '0' + triple % 10;
+    }
+}
+
+void QRCodeDecoderImpl::decodeAlpha(String& result) {
+    static const char map[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+                               'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
+                               'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
+                               'U', 'V', 'W', 'X', 'Y', 'Z', ' ', '$', '%', '*',
+                               '+', '-', '.', '/', ':'};
+
+    int num = bitstream.next(version <= 9 ? 9 : (version <= 26 ? 11 : 13));
+    for (int i = 0; i < num / 2; ++i) {
+        int tuple = bitstream.next(11);
+        result += map[tuple / 45];
+        result += map[tuple % 45];
+    }
+    if (num % 2) {
+        int value = bitstream.next(6);
+        result += map[value];
+    }
+}
+
+void QRCodeDecoderImpl::decodeByte(String& result) {
+    int num = bitstream.next(version <= 9 ? 8 : 16);
+    for (int i = 0; i < num; ++i) {
+        result += static_cast<char>(bitstream.next(8));
+    }
+}
+
+void QRCodeDecoderImpl::decodeECI(String& result) {
+    int eciAssignValue = bitstream.next(8);
+    for (int i = 0; i < 8; ++i) {
+        if (eciAssignValue & 1 << (7 - i))
+            eciAssignValue |= bitstream.next(8) << (i + 1) * 8;
+        else
+            break;
+    }
+    if (this->eci == 0) {
+        this->eci = static_cast<QRCodeEncoder::ECIEncodings>(eciAssignValue);
+    }
+    decodeSymbols(result);
+
+}
+
+void QRCodeDecoderImpl::decodeKanji(String& result) {
+    int num = bitstream.next(version <= 9 ? 8 : (version <= 26 ? 10 : 12));
+    for (int i = 0; i < num; ++i) {
+        int data = bitstream.next(13);
+        int high_byte = data / 0xC0;
+        int low_byte = data - high_byte * 0xC0;
+        int symbol = (high_byte << 8) + low_byte;
+        if (0 <= symbol && symbol <= 0x9FFC - 0x8140) {
+            symbol += 0x8140;
+        } else if (0xE040 - 0xC140 <= symbol && symbol <= 0xEBBF - 0xC140) {
+            symbol += 0xC140;
+        }
+        result += (symbol >> 8) & 0xff;
+        result += symbol & 0xff;
+    }
+}
+
 }
diff --git a/modules/objdetect/src/qrcode_encoder_table.inl.hpp b/modules/objdetect/src/qrcode_encoder_table.inl.hpp
index fc2ec370381c..5a9c071aff7c 100644
--- a/modules/objdetect/src/qrcode_encoder_table.inl.hpp
+++ b/modules/objdetect/src/qrcode_encoder_table.inl.hpp
@@ -857,4 +857,13 @@ static const uint8_t gf_log[256] = {
         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf
 };
+
+// There are only 32 combinations of format info sequences.
+static const uint16_t formatInfoLUT[32] = {
+        0x5412, 0x5125, 0x5e7c, 0x5b4b, 0x45f9, 0x40ce, 0x4f97, 0x4aa0,
+        0x77c4, 0x72f3, 0x7daa, 0x789d, 0x662f, 0x6318, 0x6c41, 0x6976,
+        0x1689, 0x13be, 0x1ce7, 0x19d0, 0x0762, 0x0255, 0x0d0c, 0x083b,
+        0x355f, 0x3068, 0x3f31, 0x3a06, 0x24b4, 0x2183, 0x2eda, 0x2bed
+};
+
 }
diff --git a/modules/objdetect/test/test_aruco_tutorial.cpp b/modules/objdetect/test/test_aruco_tutorial.cpp
new file mode 100644
index 000000000000..1af91bc63784
--- /dev/null
+++ b/modules/objdetect/test/test_aruco_tutorial.cpp
@@ -0,0 +1,246 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+#include "opencv2/objdetect/aruco_detector.hpp"
+
+namespace opencv_test { namespace {
+
+
+TEST(CV_ArucoTutorial, can_find_singlemarkersoriginal)
+{
+    string img_path = cvtest::findDataFile("aruco/singlemarkersoriginal.jpg");
+    Mat image = imread(img_path);
+    aruco::ArucoDetector detector(aruco::getPredefinedDictionary(aruco::DICT_6X6_250));
+
+    vector<int> ids;
+    vector<vector<Point2f> > corners, rejected;
+    const size_t N = 6ull;
+    // corners of ArUco markers with indices goldCornersIds
+    const int goldCorners[N][8] = { {359,310, 404,310, 410,350, 362,350}, {427,255, 469,256, 477,289, 434,288},
+                                    {233,273, 190,273, 196,241, 237,241}, {298,185, 334,186, 335,212, 297,211},
+                                    {425,163, 430,186, 394,186, 390,162}, {195,155, 230,155, 227,178, 190,178} };
+    const int goldCornersIds[N] = { 40, 98, 62, 23, 124, 203};
+    map<int, const int*> mapGoldCorners;
+    for (size_t i = 0; i < N; i++)
+        mapGoldCorners[goldCornersIds[i]] = goldCorners[i];
+
+    detector.detectMarkers(image, corners, ids, rejected);
+
+    ASSERT_EQ(N, ids.size());
+    for (size_t i = 0; i < N; i++)
+    {
+        int arucoId = ids[i];
+        ASSERT_EQ(4ull, corners[i].size());
+        ASSERT_TRUE(mapGoldCorners.find(arucoId) != mapGoldCorners.end());
+        for (int j = 0; j < 4; j++)
+        {
+            EXPECT_NEAR(static_cast<float>(mapGoldCorners[arucoId][j * 2]), corners[i][j].x, 1.f);
+            EXPECT_NEAR(static_cast<float>(mapGoldCorners[arucoId][j * 2 + 1]), corners[i][j].y, 1.f);
+        }
+    }
+}
+
+TEST(CV_ArucoTutorial, can_find_gboriginal)
+{
+    string imgPath = cvtest::findDataFile("aruco/gboriginal.jpg");
+    Mat image = imread(imgPath);
+    string dictPath = cvtest::findDataFile("aruco/tutorial_dict.yml");
+    aruco::Dictionary dictionary;
+
+    FileStorage fs(dictPath, FileStorage::READ);
+    dictionary.aruco::Dictionary::readDictionary(fs.root()); // set marker from tutorial_dict.yml
+    aruco::DetectorParameters detectorParams;
+
+    aruco::ArucoDetector detector(dictionary, detectorParams);
+
+    vector<int> ids;
+    vector<vector<Point2f> > corners, rejected;
+    const size_t N = 35ull;
+    // corners of ArUco markers with indices 0, 1, ..., 34
+    const int goldCorners[N][8] = { {252,74, 286,81, 274,102, 238,95},    {295,82, 330,89, 319,111, 282,104},
+                                    {338,91, 375,99, 365,121, 327,113},   {383,100, 421,107, 412,130, 374,123},
+                                    {429,109, 468,116, 461,139, 421,132}, {235,100, 270,108, 257,130, 220,122},
+                                    {279,109, 316,117, 304,140, 266,133}, {324,119, 362,126, 352,150, 313,143},
+                                    {371,128, 410,136, 400,161, 360,152}, {418,139, 459,145, 451,170, 410,163},
+                                    {216,128, 253,136, 239,161, 200,152}, {262,138, 300,146, 287,172, 248,164},
+                                    {309,148, 349,156, 337,183, 296,174}, {358,158, 398,167, 388,194, 346,185},
+                                    {407,169, 449,176, 440,205, 397,196}, {196,158, 235,168, 218,195, 179,185},
+                                    {243,170, 283,178, 269,206, 228,197}, {293,180, 334,190, 321,218, 279,209},
+                                    {343,192, 385,200, 374,230, 330,220}, {395,203, 438,211, 429,241, 384,233},
+                                    {174,192, 215,201, 197,231, 156,221}, {223,204, 265,213, 249,244, 207,234},
+                                    {275,215, 317,225, 303,257, 259,246}, {327,227, 371,238, 359,270, 313,259},
+                                    {381,240, 426,249, 416,282, 369,273}, {151,228, 193,238, 173,271, 130,260},
+                                    {202,241, 245,251, 228,285, 183,274}, {255,254, 300,264, 284,299, 238,288},
+                                    {310,267, 355,278, 342,314, 295,302}, {366,281, 413,290, 402,327, 353,317},
+                                    {125,267, 168,278, 147,314, 102,303}, {178,281, 223,293, 204,330, 157,317},
+                                    {233,296, 280,307, 263,346, 214,333}, {291,310, 338,322, 323,363, 274,349},
+                                    {349,325, 399,336, 386,378, 335,366} };
+    map<int, const int*> mapGoldCorners;
+    for (int i = 0; i < static_cast<int>(N); i++)
+        mapGoldCorners[i] = goldCorners[i];
+
+    detector.detectMarkers(image, corners, ids, rejected);
+
+    ASSERT_EQ(N, ids.size());
+    for (size_t i = 0; i < N; i++)
+    {
+        int arucoId = ids[i];
+        ASSERT_EQ(4ull, corners[i].size());
+        ASSERT_TRUE(mapGoldCorners.find(arucoId) != mapGoldCorners.end());
+        for (int j = 0; j < 4; j++)
+        {
+            EXPECT_NEAR(static_cast<float>(mapGoldCorners[arucoId][j*2]), corners[i][j].x, 1.f);
+            EXPECT_NEAR(static_cast<float>(mapGoldCorners[arucoId][j*2+1]), corners[i][j].y, 1.f);
+        }
+    }
+}
+
+TEST(CV_ArucoTutorial, can_find_choriginal)
+{
+    string imgPath = cvtest::findDataFile("aruco/choriginal.jpg");
+    Mat image = imread(imgPath);
+    aruco::ArucoDetector detector(aruco::getPredefinedDictionary(aruco::DICT_6X6_250));
+
+    vector< int > ids;
+    vector< vector< Point2f > > corners, rejected;
+    const size_t N = 17ull;
+    // corners of aruco markers with indices goldCornersIds
+    const int goldCorners[N][8] = { {268,77,  290,80,  286,97,  263,94},  {360,90,  382,93,  379,111, 357,108},
+                                    {211,106, 233,109, 228,127, 205,123}, {306,120, 328,124, 325,142, 302,138},
+                                    {402,135, 425,139, 423,157, 400,154}, {247,152, 271,155, 267,174, 242,171},
+                                    {347,167, 371,171, 369,191, 344,187}, {185,185, 209,189, 203,210, 178,206},
+                                    {288,201, 313,206, 309,227, 284,223}, {393,218, 418,222, 416,245, 391,241},
+                                    {223,240, 250,244, 244,268, 217,263}, {333,258, 359,262, 356,286, 329,282},
+                                    {152,281, 179,285, 171,312, 143,307}, {267,300, 294,305, 289,331, 261,327},
+                                    {383,319, 410,324, 408,351, 380,347}, {194,347, 223,352, 216,382, 186,377},
+                                    {315,368, 345,373, 341,403, 310,398} };
+    map<int, const int*> mapGoldCorners;
+    for (int i = 0; i < static_cast<int>(N); i++)
+        mapGoldCorners[i] = goldCorners[i];
+
+    detector.detectMarkers(image, corners, ids, rejected);
+
+    ASSERT_EQ(N, ids.size());
+    for (size_t i = 0; i < N; i++)
+    {
+        int arucoId = ids[i];
+        ASSERT_EQ(4ull, corners[i].size());
+        ASSERT_TRUE(mapGoldCorners.find(arucoId) != mapGoldCorners.end());
+        for (int j = 0; j < 4; j++)
+        {
+            EXPECT_NEAR(static_cast<float>(mapGoldCorners[arucoId][j * 2]), corners[i][j].x, 1.f);
+            EXPECT_NEAR(static_cast<float>(mapGoldCorners[arucoId][j * 2 + 1]), corners[i][j].y, 1.f);
+        }
+    }
+}
+
+TEST(CV_ArucoTutorial, can_find_chocclusion)
+{
+    string imgPath = cvtest::findDataFile("aruco/chocclusion_original.jpg");
+    Mat image = imread(imgPath);
+    aruco::ArucoDetector detector(aruco::getPredefinedDictionary(aruco::DICT_6X6_250));
+
+    vector< int > ids;
+    vector< vector< Point2f > > corners, rejected;
+    const size_t N = 13ull;
+    // corners of aruco markers with indices goldCornersIds
+    const int goldCorners[N][8] = { {301,57, 322,62, 317,79, 295,73}, {391,80, 413,85, 408,103, 386,97},
+                                    {242,79, 264,85, 256,102, 234,96}, {334,103, 357,109, 352,126, 329,121},
+                                    {428,129, 451,134, 448,152, 425,146}, {274,128, 296,134, 290,153, 266,147},
+                                    {371,154, 394,160, 390,180, 366,174}, {208,155, 232,161, 223,181, 199,175},
+                                    {309,182, 333,188, 327,209, 302,203}, {411,210, 436,216, 432,238, 407,231},
+                                    {241,212, 267,219, 258,242, 232,235}, {167,244, 194,252, 183,277, 156,269},
+                                    {202,314, 230,322, 220,349, 191,341} };
+    map<int, const int*> mapGoldCorners;
+    const int goldCornersIds[N] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15};
+    for (int i = 0; i < static_cast<int>(N); i++)
+        mapGoldCorners[goldCornersIds[i]] = goldCorners[i];
+
+    detector.detectMarkers(image, corners, ids, rejected);
+
+    ASSERT_EQ(N, ids.size());
+    for (size_t i = 0; i < N; i++)
+    {
+        int arucoId = ids[i];
+        ASSERT_EQ(4ull, corners[i].size());
+        ASSERT_TRUE(mapGoldCorners.find(arucoId) != mapGoldCorners.end());
+        for (int j = 0; j < 4; j++)
+        {
+            EXPECT_NEAR(static_cast<float>(mapGoldCorners[arucoId][j * 2]), corners[i][j].x, 1.f);
+            EXPECT_NEAR(static_cast<float>(mapGoldCorners[arucoId][j * 2 + 1]), corners[i][j].y, 1.f);
+        }
+    }
+}
+
+TEST(CV_ArucoTutorial, can_find_diamondmarkers)
+{
+    string imgPath = cvtest::findDataFile("aruco/diamondmarkers.jpg");
+    Mat image = imread(imgPath);
+
+    string dictPath = cvtest::findDataFile("aruco/tutorial_dict.yml");
+    aruco::Dictionary dictionary;
+    FileStorage fs(dictPath, FileStorage::READ);
+    dictionary.aruco::Dictionary::readDictionary(fs.root()); // set marker from tutorial_dict.yml
+
+    string detectorPath = cvtest::findDataFile("aruco/detector_params.yml");
+    fs = FileStorage(detectorPath, FileStorage::READ);
+    aruco::DetectorParameters detectorParams;
+    detectorParams.readDetectorParameters(fs.root());
+    detectorParams.cornerRefinementMethod = aruco::CORNER_REFINE_APRILTAG;
+
+    aruco::CharucoBoard charucoBoard(Size(3, 3), 0.4f, 0.25f, dictionary);
+    aruco::CharucoDetector detector(charucoBoard, aruco::CharucoParameters(), detectorParams);
+
+    vector<int> ids;
+    vector<vector<Point2f> > corners, diamondCorners;
+    vector<Vec4i> diamondIds;
+    const size_t N = 12ull;
+    // corner indices of ArUco markers
+    const int goldCornersIds[N] = { 4, 12, 11, 3, 12, 10, 12, 10, 10, 11, 2, 11 };
+    map<int, int> counterGoldCornersIds;
+    for (int i = 0; i < static_cast<int>(N); i++)
+        counterGoldCornersIds[goldCornersIds[i]]++;
+
+    const size_t diamondsN = 3;
+    // corners of diamonds with Vec4i indices
+    const float goldDiamondCorners[diamondsN][8] = {{195.6f,150.9f, 213.5f,201.2f, 136.4f,215.3f, 122.4f,163.5f},
+                                            {501.1f,171.3f, 501.9f,208.5f, 446.2f,199.8f, 447.8f,163.3f},
+                                            {343.4f,361.2f, 359.7f,328.7f, 400.8f,344.6f, 385.7f,378.4f}};
+    auto comp = [](const Vec4i& a, const Vec4i& b) {
+        for (int i = 0; i < 3; i++)
+            if (a[i] != b[i]) return a[i] < b[i];
+        return a[3] < b[3];
+    };
+    map<Vec4i, const float*, decltype(comp)> goldDiamonds(comp);
+    goldDiamonds[Vec4i(10, 4, 11, 12)] = goldDiamondCorners[0];
+    goldDiamonds[Vec4i(10, 3, 11, 12)] = goldDiamondCorners[1];
+    goldDiamonds[Vec4i(10, 2, 11, 12)] = goldDiamondCorners[2];
+
+    detector.detectDiamonds(image, diamondCorners, diamondIds, corners, ids);
+    map<int, int> counterRes;
+
+    ASSERT_EQ(N, ids.size());
+    for (size_t i = 0; i < N; i++)
+    {
+        int arucoId = ids[i];
+        counterRes[arucoId]++;
+    }
+
+    ASSERT_EQ(counterGoldCornersIds, counterRes); // check the number of ArUco markers
+    ASSERT_EQ(goldDiamonds.size(), diamondIds.size()); // check the number of diamonds
+
+    for (size_t i = 0; i < goldDiamonds.size(); i++)
+    {
+        Vec4i diamondId = diamondIds[i];
+        ASSERT_TRUE(goldDiamonds.find(diamondId) != goldDiamonds.end());
+        for (int j = 0; j < 4; j++)
+        {
+            EXPECT_NEAR(goldDiamonds[diamondId][j * 2], diamondCorners[i][j].x, 0.5f);
+            EXPECT_NEAR(goldDiamonds[diamondId][j * 2 + 1], diamondCorners[i][j].y, 0.5f);
+        }
+    }
+}
+
+}} // namespace
diff --git a/modules/objdetect/test/test_arucodetection.cpp b/modules/objdetect/test/test_arucodetection.cpp
index 9cb6f07b334c..7145b5d6636c 100644
--- a/modules/objdetect/test/test_arucodetection.cpp
+++ b/modules/objdetect/test/test_arucodetection.cpp
@@ -613,6 +613,32 @@ TEST(CV_ArucoDetectMarkers, regression_2492)
     }
 }
 
+
+TEST(CV_ArucoDetectMarkers, regression_contour_24220)
+{
+    aruco::ArucoDetector detector;
+    vector<int> markerIds;
+    vector<vector<Point2f> > markerCorners;
+    string imgPath = cvtest::findDataFile("aruco/failmask9.png");
+    Mat image = imread(imgPath);
+
+    const size_t N = 1ull;
+    const int goldCorners[8] = {392,175, 99,257, 117,109, 365,44};
+    const int goldCornersId = 0;
+
+    detector.detectMarkers(image, markerCorners, markerIds);
+
+    ASSERT_EQ(N, markerIds.size());
+    ASSERT_EQ(4ull, markerCorners[0].size());
+    ASSERT_EQ(goldCornersId, markerIds[0]);
+    for (int j = 0; j < 4; j++)
+    {
+        EXPECT_NEAR(static_cast<float>(goldCorners[j * 2]), markerCorners[0][j].x, 1.f);
+        EXPECT_NEAR(static_cast<float>(goldCorners[j * 2 + 1]), markerCorners[0][j].y, 1.f);
+    }
+}
+
+
 struct ArucoThreading: public testing::TestWithParam<aruco::CornerRefineMethod>
 {
     struct NumThreadsSetter {
diff --git a/modules/objdetect/test/test_barcode.cpp b/modules/objdetect/test/test_barcode.cpp
index d8e2002f2313..94542ca39ba5 100644
--- a/modules/objdetect/test/test_barcode.cpp
+++ b/modules/objdetect/test/test_barcode.cpp
@@ -60,7 +60,7 @@ map<string, BarcodeResult> testResults {
     { "single/book.jpg", {"EAN_13", "9787115279460"} },
     { "single/bottle_1.jpg", {"EAN_13", "6922255451427"} },
     { "single/bottle_2.jpg", {"EAN_13", "6921168509256"} },
-    { "multiple/4_barcodes.jpg", {"EAN_13;EAN_13;EAN_13;EAN_13", "9787564350840;9783319200064;9787118081473;9787122276124"} }
+    { "multiple/4_barcodes.jpg", {"EAN_13;EAN_13;EAN_13;EAN_13", "9787564350840;9783319200064;9787118081473;9787122276124"} },
 };
 
 typedef testing::TestWithParam< string > BarcodeDetector_main;
@@ -95,6 +95,13 @@ TEST_P(BarcodeDetector_main, interface)
         EXPECT_EQ(1u, expected_lines.count(res));
     }
 
+    {
+        string res = det.detectAndDecode(img, points);
+        ASSERT_FALSE(res.empty());
+        EXPECT_EQ(1u, expected_lines.count(res));
+        EXPECT_EQ(4u, points.size());
+    }
+
     // common interface (multi)
     {
         bool res = det.detectMulti(img, points);
@@ -137,4 +144,87 @@ TEST(BarcodeDetector_base, invalid)
     EXPECT_ANY_THROW(bardet.decodeMulti(zero_image, corners, decoded_info));
 }
 
+struct ParamStruct
+{
+    double down_thresh;
+    vector<float> scales;
+    double grad_thresh;
+    unsigned res_count;
+};
+
+inline static std::ostream &operator<<(std::ostream &out, const ParamStruct &p)
+{
+    out << "(" << p.down_thresh << ", ";
+    for(float val : p.scales)
+        out << val << ", ";
+    out << p.grad_thresh << ")";
+    return out;
+}
+
+ParamStruct param_list[] = {
+    { 512, {0.01f, 0.03f, 0.06f, 0.08f}, 64, 4 }, // default values -> 4 codes
+    { 512, {0.01f, 0.03f, 0.06f, 0.08f}, 1024, 2 },
+    { 512, {0.01f, 0.03f, 0.06f, 0.08f}, 2048, 0 },
+    { 128, {0.01f, 0.03f, 0.06f, 0.08f}, 64, 3 },
+    { 64, {0.01f, 0.03f, 0.06f, 0.08f}, 64, 2 },
+    { 128, {0.0000001f}, 64, 1 },
+    { 128, {0.0000001f, 0.0001f}, 64, 1 },
+    { 128, {0.0000001f, 0.1f}, 64, 1 },
+    { 512, {0.1f}, 64, 0 },
+};
+
+typedef testing::TestWithParam<ParamStruct> BarcodeDetector_parameters_tune;
+
+TEST_P(BarcodeDetector_parameters_tune, accuracy)
+{
+    const ParamStruct param = GetParam();
+
+    const string fname = "multiple/4_barcodes.jpg";
+    const string image_path = findDataFile(string("barcode/") + fname);
+
+    const Mat img = imread(image_path);
+    ASSERT_FALSE(img.empty()) << "Can't read image: " << image_path;
+
+    auto bardet = barcode::BarcodeDetector();
+    bardet.setDownsamplingThreshold(param.down_thresh);
+    bardet.setDetectorScales(param.scales);
+    bardet.setGradientThreshold(param.grad_thresh);
+    vector<Point2f> points;
+    bardet.detectMulti(img, points);
+    EXPECT_EQ(points.size() / 4, param.res_count);
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, BarcodeDetector_parameters_tune, testing::ValuesIn(param_list));
+
+TEST(BarcodeDetector_parameters, regression)
+{
+    const double expected_dt = 1024, expected_gt = 256;
+    const vector<float> expected_ds = {0.1f};
+    vector<float> ds_value = {0.0f};
+
+    auto bardet = barcode::BarcodeDetector();
+
+    bardet.setDownsamplingThreshold(expected_dt).setDetectorScales(expected_ds).setGradientThreshold(expected_gt);
+
+    double dt_value = bardet.getDownsamplingThreshold();
+    bardet.getDetectorScales(ds_value);
+    double gt_value = bardet.getGradientThreshold();
+
+    EXPECT_EQ(expected_dt, dt_value);
+    EXPECT_EQ(expected_ds, ds_value);
+    EXPECT_EQ(expected_gt, gt_value);
+}
+
+TEST(BarcodeDetector_parameters, invalid)
+{
+    auto bardet = barcode::BarcodeDetector();
+
+    EXPECT_ANY_THROW(bardet.setDownsamplingThreshold(-1));
+    EXPECT_ANY_THROW(bardet.setDetectorScales(vector<float> {}));
+    EXPECT_ANY_THROW(bardet.setDetectorScales(vector<float> {-1}));
+    EXPECT_ANY_THROW(bardet.setDetectorScales(vector<float> {1.5}));
+    EXPECT_ANY_THROW(bardet.setDetectorScales(vector<float> (17, 0.5)));
+    EXPECT_ANY_THROW(bardet.setGradientThreshold(-0.1));
+}
+
 }} // opencv_test::<anonymous>::
diff --git a/modules/objdetect/test/test_boarddetection.cpp b/modules/objdetect/test/test_boarddetection.cpp
index e47e6c3cb660..0c99e6de61f7 100644
--- a/modules/objdetect/test/test_boarddetection.cpp
+++ b/modules/objdetect/test/test_boarddetection.cpp
@@ -318,4 +318,12 @@ TEST(CV_ArucoGenerateBoard, regression_1226) {
     });
 }
 
+TEST(CV_ArucoDictionary, extendDictionary) {
+    aruco::Dictionary base_dictionary = aruco::getPredefinedDictionary(aruco::DICT_4X4_250);
+    aruco::Dictionary custom_dictionary = aruco::extendDictionary(150, 4, base_dictionary);
+
+    ASSERT_EQ(custom_dictionary.bytesList.rows, 150);
+    ASSERT_EQ(cv::norm(custom_dictionary.bytesList, base_dictionary.bytesList.rowRange(0, 150)), 0.);
+}
+
 }} // namespace
diff --git a/modules/objdetect/test/test_cascadeandhog.cpp b/modules/objdetect/test/test_cascadeandhog.cpp
index 4151b899e369..0a68bd9bb371 100644
--- a/modules/objdetect/test/test_cascadeandhog.cpp
+++ b/modules/objdetect/test/test_cascadeandhog.cpp
@@ -355,7 +355,7 @@ int CV_DetectorTest::validate( int detectorIdx, vector<vector<Rect> >& objects )
                     map[minIdx] = 1;
             }
         }
-        noPair += (int)count_if( map.begin(), map.end(), isZero );
+        noPair += (int)std::count_if( map.begin(), map.end(), isZero );
         totalNoPair += noPair;
 
         /*if( noPair > cvRound(valRects.size()*eps.noPair)+1 )
diff --git a/modules/objdetect/test/test_charucodetection.cpp b/modules/objdetect/test/test_charucodetection.cpp
index 9e561bc40a54..c0f6c93d506b 100644
--- a/modules/objdetect/test/test_charucodetection.cpp
+++ b/modules/objdetect/test/test_charucodetection.cpp
@@ -81,6 +81,18 @@ static Mat projectCharucoBoard(aruco::CharucoBoard& board, Mat cameraMatrix, dou
     return img;
 }
 
+static bool borderPixelsHaveSameColor(const Mat& image, uint8_t color) {
+    for (int j = 0; j < image.cols; j++) {
+        if (image.at<uint8_t>(0, j) != color || image.at<uint8_t>(image.rows-1, j) != color)
+            return false;
+    }
+    for (int i = 0; i < image.rows; i++) {
+        if (image.at<uint8_t>(i, 0) != color || image.at<uint8_t>(i, image.cols-1) != color)
+            return false;
+    }
+    return true;
+}
+
 /**
  * @brief Check Charuco detection
  */
@@ -650,7 +662,7 @@ TEST(Charuco, issue_14014)
     EXPECT_EQ(Size(4, 1), corners[0].size()); // check dimension of detected corners
 
     size_t numRejPoints = rejectedPoints.size();
-    ASSERT_EQ(rejectedPoints.size(), 26ull); // optional check to track regressions
+    ASSERT_EQ(rejectedPoints.size(), 24ull); // optional check to track regressions
     EXPECT_EQ(Size(4, 1), rejectedPoints[0].size()); // check dimension of detected corners
 
     detector.refineDetectedMarkers(img, board, corners, ids, rejectedPoints);
@@ -689,6 +701,58 @@ TEST(Charuco, testmatchImagePoints)
     }
 }
 
+typedef testing::TestWithParam<int> CharucoDraw;
+INSTANTIATE_TEST_CASE_P(/**/, CharucoDraw, testing::Values(CV_8UC2, CV_8SC2, CV_16UC2, CV_16SC2, CV_32SC2, CV_32FC2, CV_64FC2));
+TEST_P(CharucoDraw, testDrawDetected) {
+    vector<vector<Point>> detected_golds = {{Point(20, 20), Point(80, 20), Point(80, 80), Point2f(20, 80)}};
+    Point center_gold = (detected_golds[0][0] + detected_golds[0][1] + detected_golds[0][2] + detected_golds[0][3]) / 4;
+    int type = GetParam();
+    vector<Mat> detected(detected_golds[0].size(), Mat(4, 1, type));
+    // copy detected_golds to detected with any 2 channels type
+    for (size_t i = 0ull; i < detected_golds[0].size(); i++) {
+        detected[0].row((int)i) = Scalar(detected_golds[0][i].x, detected_golds[0][i].y);
+    }
+    vector<vector<Point>> contours;
+    Point detectedCenter;
+    Moments m;
+    Mat img;
+
+    // check drawDetectedMarkers
+    img = Mat::zeros(100, 100, CV_8UC1);
+    ASSERT_NO_THROW(aruco::drawDetectedMarkers(img, detected, noArray(), Scalar(255, 255, 255)));
+    // check that the marker borders are painted
+    findContours(img, contours, RETR_EXTERNAL, CHAIN_APPROX_SIMPLE);
+    ASSERT_EQ(contours.size(), 1ull);
+    m = moments(contours[0]);
+    detectedCenter = Point(cvRound(m.m10/m.m00), cvRound(m.m01/m.m00));
+    ASSERT_EQ(detectedCenter, center_gold);
+
+
+    // check drawDetectedCornersCharuco
+    img = Mat::zeros(100, 100, CV_8UC1);
+    ASSERT_NO_THROW(aruco::drawDetectedCornersCharuco(img, detected[0], noArray(), Scalar(255, 255, 255)));
+    // check that the 4 charuco corners are painted
+    findContours(img, contours, RETR_EXTERNAL, CHAIN_APPROX_SIMPLE);
+    ASSERT_EQ(contours.size(), 4ull);
+    for (size_t i = 0ull; i < 4ull; i++) {
+        m = moments(contours[i]);
+        detectedCenter = Point(cvRound(m.m10/m.m00), cvRound(m.m01/m.m00));
+        // detectedCenter must be in detected_golds
+        ASSERT_TRUE(find(detected_golds[0].begin(), detected_golds[0].end(), detectedCenter) != detected_golds[0].end());
+    }
+
+
+    // check drawDetectedDiamonds
+    img = Mat::zeros(100, 100, CV_8UC1);
+    ASSERT_NO_THROW(aruco::drawDetectedDiamonds(img, detected, noArray(), Scalar(255, 255, 255)));
+    // check that the diamonds borders are painted
+    findContours(img, contours, RETR_EXTERNAL, CHAIN_APPROX_SIMPLE);
+    ASSERT_EQ(contours.size(), 1ull);
+    m = moments(contours[0]);
+    detectedCenter = Point(cvRound(m.m10/m.m00), cvRound(m.m01/m.m00));
+    ASSERT_EQ(detectedCenter, center_gold);
+}
+
 typedef testing::TestWithParam<cv::Size> CharucoBoard;
 INSTANTIATE_TEST_CASE_P(/**/, CharucoBoard, testing::Values(Size(3, 2), Size(3, 2), Size(6, 2), Size(2, 6),
                                                             Size(3, 4), Size(4, 3), Size(7, 3), Size(3, 7)));
@@ -717,4 +781,163 @@ TEST_P(CharucoBoard, testWrongSizeDetection)
     ASSERT_TRUE(detectedCharucoIds.empty());
 }
 
+
+typedef testing::TestWithParam<std::tuple<cv::Size, float, cv::Size, int>> CharucoBoardGenerate;
+INSTANTIATE_TEST_CASE_P(/**/, CharucoBoardGenerate, testing::Values(make_tuple(Size(7, 4), 13.f, Size(400, 300), 24),
+                                                                    make_tuple(Size(12, 2), 13.f, Size(200, 150), 1),
+                                                                    make_tuple(Size(12, 2), 13.1f, Size(400, 300), 1)));
+TEST_P(CharucoBoardGenerate, issue_24806)
+{
+    aruco::Dictionary dict = aruco::getPredefinedDictionary(aruco::DICT_4X4_1000);
+    auto params = GetParam();
+    const Size boardSize = std::get<0>(params);
+    const float squareLength = std::get<1>(params), markerLength = 10.f;
+    Size imgSize = std::get<2>(params);
+    const aruco::CharucoBoard board(boardSize, squareLength, markerLength, dict);
+    const int marginSize = std::get<3>(params);
+    Mat boardImg;
+
+    // generate chessboard image
+    board.generateImage(imgSize, boardImg, marginSize);
+    // This condition checks that the width of the image determines the dimensions of the chessboard in this test
+    CV_Assert((float)(boardImg.cols) / (float)boardSize.width <=
+              (float)(boardImg.rows) / (float)boardSize.height);
+
+    // prepare data for chessboard image test
+    Mat noMarginsImg = boardImg(Range(marginSize, boardImg.rows - marginSize),
+                                Range(marginSize, boardImg.cols - marginSize));
+    const float pixInSquare = (float)(noMarginsImg.cols) / (float)boardSize.width;
+
+    Size pixInChessboard(cvRound(pixInSquare*boardSize.width), cvRound(pixInSquare*boardSize.height));
+    const Point startChessboard((noMarginsImg.cols - pixInChessboard.width) / 2,
+                                (noMarginsImg.rows - pixInChessboard.height) / 2);
+    Mat chessboardZoneImg = noMarginsImg(Rect(startChessboard, pixInChessboard));
+
+    // B - black pixel, W - white pixel
+    // chessboard corner 1:
+    // B W
+    // W B
+    Mat goldCorner1 = (Mat_<uint8_t>(2, 2) <<
+        0, 255,
+        255, 0);
+    // B - black pixel, W - white pixel
+    // chessboard corner 2:
+    // W B
+    // B W
+    Mat goldCorner2 = (Mat_<uint8_t>(2, 2) <<
+        255, 0,
+        0, 255);
+
+    // test chessboard corners in generated image
+    for (const Point3f& p: board.getChessboardCorners()) {
+        Point2f chessCorner(pixInSquare*(p.x/squareLength),
+                            pixInSquare*(p.y/squareLength));
+        Mat winCorner = chessboardZoneImg(Rect(Point(cvRound(chessCorner.x) - 1, cvRound(chessCorner.y) - 1), Size(2, 2)));
+        bool eq = (cv::countNonZero(goldCorner1 != winCorner) == 0) || (cv::countNonZero(goldCorner2 != winCorner) == 0);
+        ASSERT_TRUE(eq);
+    }
+
+    // marker size in pixels
+    const float pixInMarker = markerLength/squareLength*pixInSquare;
+    // the size of the marker margin in pixels
+    const float pixInMarginMarker = 0.5f*(pixInSquare - pixInMarker);
+
+    // determine the zone where the aruco markers are located
+    int endArucoX = cvRound(pixInSquare*(boardSize.width-1)+pixInMarginMarker+pixInMarker);
+    int endArucoY = cvRound(pixInSquare*(boardSize.height-1)+pixInMarginMarker+pixInMarker);
+    Mat arucoZone = chessboardZoneImg(Range(cvRound(pixInMarginMarker), endArucoY), Range(cvRound(pixInMarginMarker), endArucoX));
+
+    const auto& markerCorners = board.getObjPoints();
+    float minX, maxX, minY, maxY;
+    minX = maxX = markerCorners[0][0].x;
+    minY = maxY = markerCorners[0][0].y;
+    for (const auto& marker : markerCorners) {
+        for (const Point3f& objCorner : marker) {
+            minX = min(minX, objCorner.x);
+            maxX = max(maxX, objCorner.x);
+            minY = min(minY, objCorner.y);
+            maxY = max(maxY, objCorner.y);
+        }
+    }
+
+    Point2f outCorners[3];
+    for (const auto& marker : markerCorners) {
+        for (int i = 0; i < 3; i++) {
+            outCorners[i] = Point2f(marker[i].x, marker[i].y) - Point2f(minX, minY);
+            outCorners[i].x = outCorners[i].x / (maxX - minX) * float(arucoZone.cols);
+            outCorners[i].y = outCorners[i].y / (maxY - minY) * float(arucoZone.rows);
+        }
+        Size dst_sz(outCorners[2] - outCorners[0]); // assuming CCW order
+        dst_sz.width = dst_sz.height = std::min(dst_sz.width, dst_sz.height);
+        Rect borderRect = Rect(outCorners[0], dst_sz);
+
+        //The test checks the inner and outer borders of the Aruco markers.
+        //In the inner border of Aruco marker, all pixels should be black.
+        //In the outer border of Aruco marker, all pixels should be white.
+
+        Mat markerImg = arucoZone(borderRect);
+        bool markerBorderIsBlack = borderPixelsHaveSameColor(markerImg, 0);
+        ASSERT_EQ(markerBorderIsBlack, true);
+
+        Mat markerOuterBorder = markerImg;
+        markerOuterBorder.adjustROI(1, 1, 1, 1);
+        bool markerOuterBorderIsWhite = borderPixelsHaveSameColor(markerOuterBorder, 255);
+        ASSERT_EQ(markerOuterBorderIsWhite, true);
+    }
+}
+
+TEST(Charuco, testSeveralBoardsWithCustomIds)
+{
+    Size res{500, 500};
+    Mat K = (Mat_<double>(3,3) <<
+        0.5*res.width, 0, 0.5*res.width,
+        0, 0.5*res.height, 0.5*res.height,
+        0, 0, 1);
+
+    Mat expected_corners = (Mat_<float>(9,2) <<
+        200, 200,
+        250, 200,
+        300, 200,
+        200, 250,
+        250, 250,
+        300, 250,
+        200, 300,
+        250, 300,
+        300, 300
+    );
+
+
+    aruco::Dictionary dict = cv::aruco::getPredefinedDictionary(aruco::DICT_4X4_50);
+    vector<int> ids1 = {0, 1, 33, 3, 4, 5, 6, 8}, ids2 = {7, 9, 44, 11, 12, 13, 14, 15};
+    aruco::CharucoBoard board1(Size(4, 4), 1.f, .8f, dict, ids1), board2(Size(4, 4), 1.f, .8f, dict, ids2);
+
+    // generate ChArUco board
+    Mat gray;
+    {
+        Mat gray1, gray2;
+        board1.generateImage(Size(res.width, res.height), gray1, 150);
+        board2.generateImage(Size(res.width, res.height), gray2, 150);
+        hconcat(gray1, gray2, gray);
+    }
+
+    aruco::CharucoParameters charucoParameters;
+    charucoParameters.cameraMatrix = K;
+    aruco::CharucoDetector detector1(board1, charucoParameters), detector2(board2, charucoParameters);
+
+    vector<int> ids;
+    vector<Mat> corners;
+    Mat c_ids1, c_ids2, c_corners1, c_corners2;
+
+    detector1.detectBoard(gray, c_corners1, c_ids1, corners, ids);
+    detector2.detectBoard(gray, c_corners2, c_ids2, corners, ids);
+
+    ASSERT_EQ(ids.size(), size_t(16));
+    ASSERT_EQ(c_corners1.rows, expected_corners.rows);
+    EXPECT_NEAR(0, cvtest::norm(expected_corners, c_corners1.reshape(1), NORM_INF), 3e-1);
+
+    ASSERT_EQ(c_corners2.rows, expected_corners.rows);
+    expected_corners.col(0) += 500;
+    EXPECT_NEAR(0, cvtest::norm(expected_corners, c_corners2.reshape(1), NORM_INF), 3e-1);
+}
+
 }} // namespace
diff --git a/modules/objdetect/test/test_precomp.hpp b/modules/objdetect/test/test_precomp.hpp
index 88b8e9a4f569..452a0d78d67f 100644
--- a/modules/objdetect/test/test_precomp.hpp
+++ b/modules/objdetect/test/test_precomp.hpp
@@ -7,10 +7,6 @@
 #include "opencv2/ts.hpp"
 #include "opencv2/objdetect.hpp"
 
-#if defined CV_CXX11
-  #include <random>
-#else
-  #include <cstdlib>
-#endif
+#include <random>
 
 #endif
diff --git a/modules/objdetect/test/test_qr_utils.hpp b/modules/objdetect/test/test_qr_utils.hpp
index cfbe1a5078b9..5186acc7adfc 100644
--- a/modules/objdetect/test/test_qr_utils.hpp
+++ b/modules/objdetect/test/test_qr_utils.hpp
@@ -47,7 +47,7 @@ void check_qr(const string& root, const string& name_current_image, const string
                     EXPECT_NEAR(y, corners[i].y, max_pixel_error);
                 }
             }
-#ifdef HAVE_QUIRC
+
             if (decoded_info.size() == 0ull)
                 return;
             if (isMulti) {
@@ -67,7 +67,7 @@ void check_qr(const string& root, const string& name_current_image, const string
                 std::string original_info = config["info"];
                 EXPECT_EQ(decoded_info[0], original_info);
             }
-#endif
+
             return; // done
         }
     }
diff --git a/modules/objdetect/test/test_qrcode.cpp b/modules/objdetect/test/test_qrcode.cpp
index 5e6ec6faf5db..9f1ea7a8009a 100644
--- a/modules/objdetect/test/test_qrcode.cpp
+++ b/modules/objdetect/test/test_qrcode.cpp
@@ -56,9 +56,8 @@ TEST(Objdetect_QRCode, generate_test_data)
         std::string decoded_info;
         ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path;
         EXPECT_TRUE(detectQRCode(src, corners));
-#ifdef HAVE_QUIRC
         EXPECT_TRUE(decodeQRCode(src, corners, decoded_info, straight_barcode));
-#endif
+
         file_config << "x" << "[:";
         for (size_t j = 0; j < corners.size(); j++) { file_config << corners[j].x; }
         file_config << "]";
@@ -95,9 +94,8 @@ TEST(Objdetect_QRCode_Close, generate_test_data)
         Size new_size(width, height);
         resize(src, barcode, new_size, 0, 0, INTER_LINEAR_EXACT);
         EXPECT_TRUE(detectQRCode(barcode, corners));
-#ifdef HAVE_QUIRC
         EXPECT_TRUE(decodeQRCode(barcode, corners, decoded_info, straight_barcode));
-#endif
+
         file_config << "x" << "[:";
         for (size_t j = 0; j < corners.size(); j++) { file_config << corners[j].x; }
         file_config << "]";
@@ -133,9 +131,8 @@ TEST(Objdetect_QRCode_Monitor, generate_test_data)
         Size new_size(width, height);
         resize(src, barcode, new_size, 0, 0, INTER_LINEAR_EXACT);
         EXPECT_TRUE(detectQRCode(barcode, corners));
-#ifdef HAVE_QUIRC
         EXPECT_TRUE(decodeQRCode(barcode, corners, decoded_info, straight_barcode));
-#endif
+
         file_config << "x" << "[:";
         for (size_t j = 0; j < corners.size(); j++) { file_config << corners[j].x; }
         file_config << "]";
@@ -165,9 +162,8 @@ TEST(Objdetect_QRCode_Curved, generate_test_data)
         std::string decoded_info;
         ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path;
         EXPECT_TRUE(detectQRCode(src, corners));
-#ifdef HAVE_QUIRC
         EXPECT_TRUE(decodeCurvedQRCode(src, corners, decoded_info, straight_barcode));
-#endif
+
         file_config << "x" << "[:";
         for (size_t j = 0; j < corners.size(); j++) { file_config << corners[j].x; }
         file_config << "]";
@@ -198,11 +194,10 @@ TEST(Objdetect_QRCode_Multi, generate_test_data)
         std::vector<Point> corners;
         QRCodeDetector qrcode;
         EXPECT_TRUE(qrcode.detectMulti(src, corners));
-#ifdef HAVE_QUIRC
         std::vector<cv::String> decoded_info;
         std::vector<Mat> straight_barcode;
         EXPECT_TRUE(qrcode.decodeMulti(src, corners, decoded_info, straight_barcode));
-#endif
+
         file_config << "x" << "[:";
         for(size_t j = 0; j < corners.size(); j += 4)
         {
@@ -256,15 +251,11 @@ TEST_P(Objdetect_QRCode, regression)
     std::vector<Point> corners;
     std::string decoded_info;
     QRCodeDetector qrcode;
-#ifdef HAVE_QUIRC
     decoded_info = qrcode.detectAndDecode(src, corners, straight_barcode);
     ASSERT_FALSE(corners.empty());
     ASSERT_FALSE(decoded_info.empty());
     int expected_barcode_type = CV_8UC1;
     EXPECT_EQ(expected_barcode_type, straight_barcode.type());
-#else
-    ASSERT_TRUE(qrcode.detect(src, corners));
-#endif
     check_qr(root, name_current_image, "test_images", corners, {decoded_info}, pixels_error);
 }
 
@@ -287,15 +278,11 @@ TEST_P(Objdetect_QRCode_Close, regression)
     std::vector<Point> corners;
     std::string decoded_info;
     QRCodeDetector qrcode;
-#ifdef HAVE_QUIRC
     decoded_info = qrcode.detectAndDecode(barcode, corners, straight_barcode);
     ASSERT_FALSE(corners.empty());
     ASSERT_FALSE(decoded_info.empty());
     int expected_barcode_type = CV_8UC1;
     EXPECT_EQ(expected_barcode_type, straight_barcode.type());
-#else
-    ASSERT_TRUE(qrcode.detect(barcode, corners));
-#endif
     check_qr(root, name_current_image, "close_images", corners, {decoded_info}, pixels_error);
 }
 
@@ -318,15 +305,11 @@ TEST_P(Objdetect_QRCode_Monitor, regression)
     std::vector<Point> corners;
     std::string decoded_info;
     QRCodeDetector qrcode;
-#ifdef HAVE_QUIRC
     decoded_info = qrcode.detectAndDecode(barcode, corners, straight_barcode);
     ASSERT_FALSE(corners.empty());
     ASSERT_FALSE(decoded_info.empty());
     int expected_barcode_type = CV_8UC1;
     EXPECT_EQ(expected_barcode_type, straight_barcode.type());
-#else
-    ASSERT_TRUE(qrcode.detect(barcode, corners));
-#endif
     check_qr(root, name_current_image, "monitor_images", corners, {decoded_info}, pixels_error);
 }
 
@@ -344,15 +327,11 @@ TEST_P(Objdetect_QRCode_Curved, regression)
     std::vector<Point> corners;
     std::string decoded_info;
     QRCodeDetector qrcode;
-#ifdef HAVE_QUIRC
     decoded_info = qrcode.detectAndDecodeCurved(src, corners, straight_barcode);
     ASSERT_FALSE(corners.empty());
     ASSERT_FALSE(decoded_info.empty());
     int expected_barcode_type = CV_8UC1;
     EXPECT_EQ(expected_barcode_type, straight_barcode.type());
-#else
-    ASSERT_TRUE(qrcode.detect(src, corners));
-#endif
     check_qr(root, name_current_image, "test_images", corners, {decoded_info}, pixels_error);
 }
 
@@ -374,7 +353,6 @@ TEST_P(Objdetect_QRCode_Multi, regression)
         qrcode = QRCodeDetectorAruco();
     }
     std::vector<Point> corners;
-#ifdef HAVE_QUIRC
     std::vector<cv::String> decoded_info;
     std::vector<Mat> straight_barcode;
     EXPECT_TRUE(qrcode.detectAndDecodeMulti(src, decoded_info, corners, straight_barcode));
@@ -383,9 +361,6 @@ TEST_P(Objdetect_QRCode_Multi, regression)
     int expected_barcode_type = CV_8UC1;
     for(size_t i = 0; i < straight_barcode.size(); i++)
         EXPECT_EQ(expected_barcode_type, straight_barcode[i].type());
-#else
-    ASSERT_TRUE(qrcode.detectMulti(src, corners));
-#endif
     check_qr(root, name_current_image, "multiple_images", corners, decoded_info, pixels_error, true);
 }
 
@@ -398,7 +373,6 @@ INSTANTIATE_TEST_CASE_P(/**/, Objdetect_QRCode_Multi, testing::Combine(testing::
 
 TEST(Objdetect_QRCode_decodeMulti, decode_regression_16491)
 {
-#ifdef HAVE_QUIRC
     Mat zero_image = Mat::zeros(256, 256, CV_8UC1);
     Point corners_[] = {Point(16, 16), Point(128, 16), Point(128, 128), Point(16, 128),
                         Point(16, 16), Point(128, 16), Point(128, 128), Point(16, 128)};
@@ -413,7 +387,6 @@ TEST(Objdetect_QRCode_decodeMulti, decode_regression_16491)
     Mat mat_corners(2, 4, CV_32SC2, (void*)&vec_corners[0]);
     QRCodeDetector mat_qrcode;
     EXPECT_NO_THROW(mat_qrcode.decodeMulti(zero_image, mat_corners, decoded_info, straight_barcode));
-#endif
 }
 
 typedef testing::TestWithParam<std::string> Objdetect_QRCode_detectMulti;
@@ -449,7 +422,6 @@ TEST_P(Objdetect_QRCode_detectAndDecodeMulti, check_output_parameters_type_19363
     std::string image_path = findDataFile(root + name_current_image);
     Mat src = imread(image_path);
     ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path;
-#ifdef HAVE_QUIRC
     GraphicalCodeDetector qrcode = QRCodeDetector();
     if (method == "aruco_based") {
         qrcode = QRCodeDetectorAruco();
@@ -467,7 +439,6 @@ TEST_P(Objdetect_QRCode_detectAndDecodeMulti, check_output_parameters_type_19363
     ASSERT_FALSE(corners.empty());
     for(size_t i = 0; i < straight_barcode.size(); i++)
         EXPECT_EQ(expected_barcode_type, straight_barcode[i].type());
-#endif
 }
 INSTANTIATE_TEST_CASE_P(/**/, Objdetect_QRCode_detectAndDecodeMulti, testing::Values("contours_based", "aruco_based"));
 
@@ -487,9 +458,7 @@ TEST(Objdetect_QRCode_detect, detect_regression_20882)
     cv::String decoded_info;
     EXPECT_TRUE(qrcode.detect(src, corners));
     EXPECT_TRUE(!corners.empty());
-#ifdef HAVE_QUIRC
     EXPECT_NO_THROW(qrcode.decode(src, corners, straight_barcode));
-#endif
 }
 
 TEST(Objdetect_QRCode_basic, not_found_qrcode)
@@ -500,10 +469,8 @@ TEST(Objdetect_QRCode_basic, not_found_qrcode)
     Mat zero_image = Mat::zeros(256, 256, CV_8UC1);
     QRCodeDetector qrcode;
     EXPECT_FALSE(qrcode.detect(zero_image, corners));
-#ifdef HAVE_QUIRC
     corners = std::vector<Point>(4);
     EXPECT_ANY_THROW(qrcode.decode(zero_image, corners, straight_barcode));
-#endif
 }
 
 TEST(Objdetect_QRCode_detect, detect_regression_21287)
@@ -521,9 +488,7 @@ TEST(Objdetect_QRCode_detect, detect_regression_21287)
     cv::String decoded_info;
     EXPECT_TRUE(qrcode.detect(src, corners));
     EXPECT_TRUE(!corners.empty());
-#ifdef HAVE_QUIRC
     EXPECT_NO_THROW(qrcode.decode(src, corners, straight_barcode));
-#endif
 }
 
 TEST(Objdetect_QRCode_detect_flipped, regression_23249)
@@ -538,7 +503,6 @@ TEST(Objdetect_QRCode_detect_flipped, regression_23249)
 
     for(const auto &flipped_image : flipped_images){
         const std::string &image_name = flipped_image.first;
-        const std::string &expect_msg = flipped_image.second;
 
         std::string image_path = findDataFile(root + image_name);
         Mat src = imread(image_path);
@@ -550,11 +514,10 @@ TEST(Objdetect_QRCode_detect_flipped, regression_23249)
         EXPECT_TRUE(qrcode.detect(src, corners));
         EXPECT_TRUE(!corners.empty());
         std::string decoded_msg;
-        #ifdef HAVE_QUIRC
-            EXPECT_NO_THROW(decoded_msg = qrcode.decode(src, corners, straight_barcode));
-            ASSERT_FALSE(straight_barcode.empty()) << "Can't decode qrimage.";
-            EXPECT_EQ(expect_msg, decoded_msg);
-        #endif
+        const std::string &expect_msg = flipped_image.second;
+        EXPECT_NO_THROW(decoded_msg = qrcode.decode(src, corners, straight_barcode));
+        ASSERT_FALSE(straight_barcode.empty()) << "Can't decode qrimage.";
+        EXPECT_EQ(expect_msg, decoded_msg);
     }
 }
 
@@ -577,12 +540,10 @@ TEST(Objdetect_QRCode_decode, decode_regression_21929)
 
     EXPECT_TRUE(qrcode.detect(src, corners));
     EXPECT_TRUE(!corners.empty());
-#ifdef HAVE_QUIRC
     cv::String decoded_msg;
     EXPECT_NO_THROW(decoded_msg = qrcode.decode(src, corners, straight_barcode));
     ASSERT_FALSE(straight_barcode.empty()) << "Can't decode qrimage.";
     EXPECT_EQ(expect_msg, decoded_msg);
-#endif
 }
 
 TEST(Objdetect_QRCode_decode, decode_regression_version_25)
@@ -603,12 +564,11 @@ TEST(Objdetect_QRCode_decode, decode_regression_version_25)
 
     EXPECT_TRUE(qrcode.detect(src, corners));
     EXPECT_TRUE(!corners.empty());
-#ifdef HAVE_QUIRC
+
     cv::String decoded_msg;
     EXPECT_NO_THROW(decoded_msg = qrcode.decode(src, corners, straight_barcode));
     ASSERT_FALSE(straight_barcode.empty()) << "Can't decode qrimage.";
     EXPECT_EQ(expect_msg, decoded_msg);
-#endif
 }
 
 TEST_P(Objdetect_QRCode_detectAndDecodeMulti, decode_9_qrcodes_version7)
@@ -637,4 +597,86 @@ TEST_P(Objdetect_QRCode_detectAndDecodeMulti, decode_9_qrcodes_version7)
 
 #endif // UPDATE_QRCODE_TEST_DATA
 
+TEST(Objdetect_QRCode_detectAndDecode, utf8_output)
+{
+    const std::string name_current_image = "umlaut.png";
+    const std::string root = "qrcode/";
+
+    std::string image_path = findDataFile(root + name_current_image);
+    Mat src = imread(image_path);
+    ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path;
+
+    QRCodeDetector qrcode;
+    std::vector<Point> corners;
+    Mat straight;
+    std::string decoded_info = qrcode.detectAndDecode(src, corners, straight);
+    EXPECT_FALSE(decoded_info.empty());
+    EXPECT_NE(decoded_info.find("M\xc3\xbcllheimstrasse"), std::string::npos);
+}
+
+TEST_P(Objdetect_QRCode_detectAndDecodeMulti, detect_regression_24679)
+{
+    const std::string name_current_image = "issue_24679.png";
+    const std::string root = "qrcode/";
+
+    std::string image_path = findDataFile(root + name_current_image);
+    Mat img = imread(image_path);
+    const std::string method = GetParam();
+    GraphicalCodeDetector qrcode = QRCodeDetector();
+    if (method == "aruco_based") {
+        qrcode = QRCodeDetectorAruco();
+    }
+    std::vector<cv::String> decoded_info;
+    ASSERT_TRUE(qrcode.detectAndDecodeMulti(img, decoded_info));
+    EXPECT_EQ(decoded_info.size(), 4U);
+}
+
+TEST_P(Objdetect_QRCode_detectAndDecodeMulti, detect_regression_24011)
+{
+    const std::string name_current_image = "issue_24011.jpg";
+    const std::string root = "qrcode/";
+
+    std::string image_path = findDataFile(root + name_current_image);
+    Mat img = imread(image_path);
+    const std::string method = GetParam();
+    GraphicalCodeDetector qrcode = QRCodeDetector();
+    if (method == "aruco_based") {
+        qrcode = QRCodeDetectorAruco();
+    }
+    std::vector<cv::String> decoded_info;
+    ASSERT_TRUE(qrcode.detectAndDecodeMulti(img, decoded_info));
+    EXPECT_EQ(decoded_info.size(), 2U);
+}
+
+TEST(Objdetect_QRCode_detect, detect_regression_24450)
+{
+    const std::string name_current_image = "issue_24450.png";
+    const std::string root = "qrcode/";
+
+    std::string image_path = findDataFile(root + name_current_image);
+    Mat img = imread(image_path);
+    GraphicalCodeDetector qrcode = QRCodeDetector();
+    std::vector<Point2f> points;
+    ASSERT_TRUE(qrcode.detect(img, points));
+    EXPECT_EQ(points.size(), 4U);
+    img.at<Vec3b>(img.rows - 1, 296) = {};
+    ASSERT_TRUE(qrcode.detect(img, points));
+    EXPECT_EQ(points.size(), 4U);
+}
+
+TEST(Objdetect_QRCode_detect, detect_regression_22892)
+{
+    const std::string name_current_image = "issue_22892.png";
+    const std::string root = "qrcode/";
+
+    std::string image_path = findDataFile(root + name_current_image);
+    Mat img = imread(image_path);
+
+    QRCodeDetector qrcode;
+    std::vector<Point> corners;
+    Mat straight_code;
+    qrcode.detectAndDecodeCurved(img, corners, straight_code);
+    EXPECT_EQ(corners.size(), 4U);
+}
+
 }} // namespace
diff --git a/modules/objdetect/test/test_qrcode_encode.cpp b/modules/objdetect/test/test_qrcode_encode.cpp
index 14900c3078b2..87142e4690a9 100644
--- a/modules/objdetect/test/test_qrcode_encode.cpp
+++ b/modules/objdetect/test/test_qrcode_encode.cpp
@@ -5,16 +5,6 @@
 #include "test_precomp.hpp"
 namespace opencv_test { namespace {
 
-#if !defined CV_CXX11
-// Wrapper for generating seeded random number via std::rand.
-template<unsigned Seed>
-class SeededRandFunctor {
-public:
-    SeededRandFunctor() { std::srand(Seed); }
-    int operator()(int i) { return std::rand() % (i + 1); }
-};
-#endif
-
 std::string encode_qrcode_images_name[] = {
         "version1_mode1.png", "version1_mode2.png", "version1_mode4.png",
         "version2_mode1.png", "version2_mode2.png", "version2_mode4.png",
@@ -118,9 +108,7 @@ TEST(Objdetect_QRCode_Encode, generate_test_data)
         }
 
         std::string decoded_info = "";
-#ifdef HAVE_QUIRC
         EXPECT_TRUE(decodeQRCode(resized_src, corners, decoded_info, straight_barcode)) << "The QR code cannot be decoded: " << image_path;
-#endif
         file_config << "info" << decoded_info;
         file_config << "}";
     }
@@ -264,7 +252,8 @@ TEST(Objdetect_QRCode_Encode_Decode, regression)
                 int true_capacity = establishCapacity(mode, version, cur_capacity);
 
                 std::string input_info = symbol_set;
-                std::random_shuffle(input_info.begin(),input_info.end());
+                std::mt19937 rand_gen {1};
+                std::shuffle(input_info.begin(), input_info.end(), rand_gen);
                 int count = 0;
                 if((int)input_info.length() > true_capacity)
                 {
@@ -305,7 +294,6 @@ TEST(Objdetect_QRCode_Encode_Decode, regression)
                     corners[k].y = corners[k].y * height_ratio;
                 }
 
-#ifdef HAVE_QUIRC
                 Mat straight_barcode;
                 std::string output_info = QRCodeDetector().decode(resized_src, corners, straight_barcode);
                 EXPECT_FALSE(output_info.empty())
@@ -313,7 +301,6 @@ TEST(Objdetect_QRCode_Encode_Decode, regression)
                     << " version: " << version << " error correction level: " << (int)level;
                 EXPECT_EQ(input_info, output_info) << "The generated QRcode is not same as test data." << " Mode: " << (int)mode <<
                                                         " version: " << version << " error correction level: " << (int)level;
-#endif
             }
         }
     }
@@ -355,16 +342,14 @@ TEST(Objdetect_QRCode_Encode_Kanji, regression)
             corners[j].y = corners[j].y * height_ratio;
         }
 
-#ifdef HAVE_QUIRC
         Mat straight_barcode;
         std::string decoded_info = QRCodeDetector().decode(resized_src, corners, straight_barcode);
         EXPECT_FALSE(decoded_info.empty()) << "The generated QRcode cannot be decoded.";
         EXPECT_EQ(input_info, decoded_info);
-#endif
     }
 }
 
-TEST(Objdetect_QRCode_Encode_Decode_Structured_Append, DISABLED_regression)
+TEST(Objdetect_QRCode_Encode_Decode_Structured_Append, regression)
 {
     // disabled since QR decoder probably doesn't support structured append mode qr codes
     const std::string root = "qrcode/decode_encode";
@@ -390,15 +375,8 @@ TEST(Objdetect_QRCode_Encode_Decode_Structured_Append, DISABLED_regression)
         std::string symbol_set = config["symbols_set"];
 
         std::string input_info = symbol_set;
-#if defined CV_CXX11
-        // std::random_shuffle is deprecated since C++11 and removed in C++17.
-        // Use manually constructed RNG with a fixed seed and std::shuffle instead.
         std::mt19937 rand_gen {1};
         std::shuffle(input_info.begin(), input_info.end(), rand_gen);
-#else
-        SeededRandFunctor<1> rand_gen;
-        std::random_shuffle(input_info.begin(), input_info.end(), rand_gen);
-#endif
         for (int j = min_stuctures_num; j < max_stuctures_num; j++)
         {
             QRCodeEncoder::Params params;
@@ -407,43 +385,45 @@ TEST(Objdetect_QRCode_Encode_Decode_Structured_Append, DISABLED_regression)
             vector<Mat> qrcodes;
             encoder->encodeStructuredAppend(input_info, qrcodes);
             EXPECT_TRUE(!qrcodes.empty()) << "Can't generate this QR images";
+            CV_CheckEQ(qrcodes.size(), (size_t)j, "Number of QR codes");
 
-            std::string output_info = "";
+            std::vector<Point2f> corners(4 * qrcodes.size());
             for (size_t k = 0; k < qrcodes.size(); k++)
             {
                 Mat qrcode = qrcodes[k];
+                corners[4 * k] = Point2f(border_width, border_width);
+                corners[4 * k + 1] = Point2f(qrcode.cols * 1.0f - border_width, border_width);
+                corners[4 * k + 2] = Point2f(qrcode.cols * 1.0f - border_width, qrcode.rows * 1.0f - border_width);
+                corners[4 * k + 3] = Point2f(border_width, qrcode.rows * 1.0f - border_width);
 
-                std::vector<Point2f> corners(4);
-                corners[0] = Point2f(border_width, border_width);
-                corners[1] = Point2f(qrcode.cols * 1.0f - border_width, border_width);
-                corners[2] = Point2f(qrcode.cols * 1.0f - border_width, qrcode.rows * 1.0f - border_width);
-                corners[3] = Point2f(border_width, qrcode.rows * 1.0f - border_width);
+                float width_ratio = fixed_size.width * 1.0f / qrcode.cols;
+                float height_ratio = fixed_size.height * 1.0f / qrcode.rows;
+                resize(qrcode, qrcodes[k], fixed_size, 0, 0, INTER_AREA);
 
-                Mat resized_src;
-                resize(qrcode, resized_src, fixed_size, 0, 0, INTER_AREA);
-                float width_ratio =  resized_src.cols * 1.0f / qrcode.cols;
-                float height_ratio = resized_src.rows * 1.0f / qrcode.rows;
-                for(size_t m = 0; m < corners.size(); m++)
+                for (size_t ki = 0; ki < 4; ki++)
                 {
-                    corners[m].x = corners[m].x * width_ratio;
-                    corners[m].y = corners[m].y * height_ratio;
+                    corners[4 * k + ki].x = corners[4 * k + ki].x * width_ratio + fixed_size.width * k;
+                    corners[4 * k + ki].y = corners[4 * k + ki].y * height_ratio;
                 }
+            }
 
-#ifdef HAVE_QUIRC
-                Mat straight_barcode;
-                std::string decoded_info = QRCodeDetector().decode(resized_src, corners, straight_barcode);
-                EXPECT_FALSE(decoded_info.empty())
-                    << "The generated QRcode cannot be decoded." << " Mode: " << modes[i]
-                    << " structures number: " << k << "/" << j;
-                output_info += decoded_info;
-#endif
+            Mat resized_src;
+            hconcat(qrcodes, resized_src);
+
+            std::vector<cv::String> decoded_info;
+            cv::String output_info;
+            EXPECT_TRUE(QRCodeDetector().decodeMulti(resized_src, corners, decoded_info));
+            for (size_t k = 0; k < decoded_info.size(); ++k)
+            {
+                if (!decoded_info[k].empty())
+                    output_info = decoded_info[k];
             }
-#ifdef HAVE_QUIRC
+            EXPECT_FALSE(output_info.empty())
+                << "The generated QRcode cannot be decoded." << " Mode: " << modes[i]
+                << " structures number: " << j;
+
             EXPECT_EQ(input_info, output_info) << "The generated QRcode is not same as test data." << " Mode: " << mode <<
                                                   " structures number: " << j;
-#else
-            std::cout << "Mode=" << mode << ": Unable to verify generated QR codes - QUIRC is disabled" << std::endl;
-#endif
         }
     }
 }
@@ -554,4 +534,60 @@ TEST(Objdetect_QRCode_Encode_Decode, regression_issue22029)
     }
 }
 
+// This test reproduces issue https://github.com/opencv/opencv/issues/24366 only in a loop
+TEST(Objdetect_QRCode_Encode_Decode, auto_version_pick)
+{
+    cv::QRCodeEncoder::Params params;
+    params.correction_level = cv::QRCodeEncoder::CORRECT_LEVEL_L;
+    params.mode = cv::QRCodeEncoder::EncodeMode::MODE_AUTO;
+
+    cv::Ptr<cv::QRCodeEncoder> encoder = cv::QRCodeEncoder::create(params);
+
+    for (int len = 1; len < 19; len++) {
+        std::string input;
+        input.resize(len);
+        cv::randu(Mat(1, len, CV_8U, &input[0]), 'a', 'z' + 1);
+        cv::Mat qrcode;
+        encoder->encode(input, qrcode);
+    }
+}
+
+// Test two QR codes which error correction procedure requires more number of
+// syndroms that described in the ISO/IEC 18004
+typedef testing::TestWithParam<std::pair<std::string, std::string>> Objdetect_QRCode_decoding;
+TEST_P(Objdetect_QRCode_decoding, error_correction)
+{
+    const std::string filename = get<0>(GetParam());
+    const std::string expected = get<1>(GetParam());
+
+    QRCodeDetector qrcode;
+    cv::String decoded_msg;
+    Mat src = cv::imread(findDataFile("qrcode/" + filename), IMREAD_GRAYSCALE);
+
+    std::vector<Point2f> corners(4);
+    corners[0] = Point2f(0, 0);
+    corners[1] = Point2f(src.cols * 1.0f, 0);
+    corners[2] = Point2f(src.cols * 1.0f, src.rows * 1.0f);
+    corners[3] = Point2f(0, src.rows * 1.0f);
+
+    Mat resized_src;
+    resize(src, resized_src, fixed_size, 0, 0, INTER_AREA);
+    float width_ratio =  resized_src.cols * 1.0f / src.cols;
+    float height_ratio = resized_src.rows * 1.0f / src.rows;
+    for(size_t m = 0; m < corners.size(); m++)
+    {
+        corners[m].x = corners[m].x * width_ratio;
+        corners[m].y = corners[m].y * height_ratio;
+    }
+
+    Mat straight_barcode;
+    EXPECT_NO_THROW(decoded_msg = qrcode.decode(resized_src, corners, straight_barcode));
+    ASSERT_FALSE(straight_barcode.empty()) << "Can't decode qrimage " << filename;
+    EXPECT_EQ(expected, decoded_msg);
+}
+INSTANTIATE_TEST_CASE_P(/**/, Objdetect_QRCode_decoding, testing::ValuesIn(std::vector<std::pair<std::string, std::string>>{
+    {"err_correct_1M.png", "New"},
+    {"err_correct_2L.png", "Version 2 QR Code Test Image"},
+}));
+
 }} // namespace
diff --git a/modules/objdetect/tutorials/images/singlemarkersaxes2.jpg b/modules/objdetect/tutorials/images/singlemarkersaxes2.jpg
deleted file mode 100644
index dc8edee15d95..000000000000
Binary files a/modules/objdetect/tutorials/images/singlemarkersaxes2.jpg and /dev/null differ
diff --git a/modules/photo/include/opencv2/photo.hpp b/modules/photo/include/opencv2/photo.hpp
index cef4e4da5918..392232851a45 100644
--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -55,30 +55,29 @@ This module includes photo processing algorithms
     @defgroup photo_denoise Denoising
     @defgroup photo_hdr HDR imaging
 
-This section describes high dynamic range imaging algorithms namely tonemapping, exposure alignment,
-camera calibration with multiple exposures and exposure fusion.
+    This section describes high dynamic range imaging algorithms namely tonemapping, exposure alignment,
+    camera calibration with multiple exposures and exposure fusion.
 
     @defgroup photo_decolor Contrast Preserving Decolorization
 
-Useful links:
+    Useful links:
 
-http://www.cse.cuhk.edu.hk/leojia/projects/color2gray/index.html
+    http://www.cse.cuhk.edu.hk/leojia/projects/color2gray/index.html
 
     @defgroup photo_clone Seamless Cloning
 
-Useful links:
+    Useful links:
 
-https://www.learnopencv.com/seamless-cloning-using-opencv-python-cpp
+    https://www.learnopencv.com/seamless-cloning-using-opencv-python-cpp
 
     @defgroup photo_render Non-Photorealistic Rendering
 
-Useful links:
+    Useful links:
 
-http://www.inf.ufrgs.br/~eslgastal/DomainTransform
+    http://www.inf.ufrgs.br/~eslgastal/DomainTransform
 
-https://www.learnopencv.com/non-photorealistic-rendering-using-opencv-python-c/
+    https://www.learnopencv.com/non-photorealistic-rendering-using-opencv-python-c/
 
-    @defgroup photo_c C API
 @}
   */
 
diff --git a/modules/photo/include/opencv2/photo/cuda.hpp b/modules/photo/include/opencv2/photo/cuda.hpp
index b6ab40a764b4..709ad2d26f9d 100644
--- a/modules/photo/include/opencv2/photo/cuda.hpp
+++ b/modules/photo/include/opencv2/photo/cuda.hpp
@@ -78,7 +78,7 @@ CV_WRAP inline void nonLocalMeans(const GpuMat& src, CV_OUT GpuMat& dst,
                             Stream& stream = Stream::Null())
 {
     nonLocalMeans(InputArray(src), OutputArray(dst), h, search_window, block_size, borderMode, stream);
-};
+}
 
 /** @brief Perform image denoising using Non-local Means Denoising algorithm
 <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising> with several computational
diff --git a/modules/photo/perf/perf_inpaint.cpp b/modules/photo/perf/perf_inpaint.cpp
index 4ebf86d582a2..972770234f54 100644
--- a/modules/photo/perf/perf_inpaint.cpp
+++ b/modules/photo/perf/perf_inpaint.cpp
@@ -6,6 +6,7 @@ namespace opencv_test
 CV_ENUM(InpaintingMethod, INPAINT_NS, INPAINT_TELEA)
 typedef tuple<Size, InpaintingMethod> InpaintArea_InpaintingMethod_t;
 typedef perf::TestBaseWithParam<InpaintArea_InpaintingMethod_t> InpaintArea_InpaintingMethod;
+typedef perf::TestBaseWithParam<InpaintingMethod> Perf_InpaintingMethod;
 
 
 PERF_TEST_P(InpaintArea_InpaintingMethod, inpaint,
@@ -34,4 +35,26 @@ PERF_TEST_P(InpaintArea_InpaintingMethod, inpaint,
     SANITY_CHECK(inpaintedArea);
 }
 
+PERF_TEST_P(Perf_InpaintingMethod, inpaintDots, InpaintingMethod::all())
+{
+    Mat src = imread(getDataPath("gpu/hog/road.png"));
+
+    int inpaintingMethod = GetParam();
+
+    Mat mask(src.size(), CV_8UC1, Scalar(0));
+    Mat result(src.size(), src.type());
+
+    for (int i = 0; i < src.size().height; i += 16) {
+        for (int j = 0; j < src.size().width; j += 16) {
+            mask.at<unsigned char>(i, j) = 255;
+        }
+    }
+
+    declare.in(src, mask).out(result).time(120);
+
+    TEST_CYCLE() inpaint(src, mask, result, 10.0, inpaintingMethod);
+
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace
diff --git a/modules/photo/src/denoise_tvl1.cpp b/modules/photo/src/denoise_tvl1.cpp
index df756c4c85b9..5f49de142170 100644
--- a/modules/photo/src/denoise_tvl1.cpp
+++ b/modules/photo/src/denoise_tvl1.cpp
@@ -42,8 +42,6 @@
 #include <vector>
 #include <algorithm>
 
-#define ABSCLIP(val,threshold) MIN(MAX((val),-(threshold)),(threshold))
-
 namespace cv{
 
     class AddFloatToCharScaled{
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index d36c85a84063..9da5c0cf8c7c 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -174,7 +174,7 @@ class DistAbs
     static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
     {
         return calcDist<T>(a_down, b_down) - calcDist<T>(a_up, b_up);
-    };
+    }
 
     template <typename T, typename WT>
     static inline WT calcWeight(double dist, const float *h,
@@ -296,7 +296,7 @@ class DistSquared
     static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
     {
         return calcUpDownDist_<T>::f(a_up, a_down, b_up, b_down);
-    };
+    }
 
     template <typename T, typename WT>
     static inline WT calcWeight(double dist, const float *h,
diff --git a/modules/photo/src/inpaint.cpp b/modules/photo/src/inpaint.cpp
index b436168212ea..d9a7dbc58b8c 100644
--- a/modules/photo/src/inpaint.cpp
+++ b/modules/photo/src/inpaint.cpp
@@ -45,6 +45,9 @@
 //
 // */
 
+#include <queue>
+#include <type_traits>
+
 #include "precomp.hpp"
 #include "opencv2/imgproc/imgproc_c.h"
 #include "opencv2/photo/legacy/constants_c.h"
@@ -53,6 +56,16 @@
 #define CV_MAT_ELEM_PTR_FAST( mat, row, col, pix_size )  \
      ((mat).data.ptr + (size_t)(mat).step*(row) + (pix_size)*(col))
 
+template<typename T>
+typename std::enable_if<std::is_floating_point<T>::value, T>::type round_cast(float val) {
+   return cv::saturate_cast<T>(val);
+}
+
+template<typename T>
+typename std::enable_if<!std::is_floating_point<T>::value, T>::type round_cast(float val) {
+   return cv::saturate_cast<T>(val + 0.5);
+}
+
 inline float
 min4( float a, float b, float c, float d )
 {
@@ -71,8 +84,16 @@ typedef struct CvHeapElem
 {
     float T;
     int i,j;
-    struct CvHeapElem* prev;
-    struct CvHeapElem* next;
+    int order;  // to keep insertion order
+
+    bool operator > (const CvHeapElem& rhs) const {
+        if (T > rhs.T) {
+            return true;
+        } else if (T < rhs.T) {
+            return false;
+        }
+        return order > rhs.order;
+    }
 }
 CvHeapElem;
 
@@ -84,42 +105,10 @@ class CvPriorityQueueFloat
     CvPriorityQueueFloat& operator=(const CvPriorityQueueFloat &); // assign disabled
 
 protected:
-    CvHeapElem *mem,*empty,*head,*tail;
-    int num,in;
+    std::priority_queue<CvHeapElem, std::vector<CvHeapElem>,std::greater<CvHeapElem> > queue;
+    int next_order;
 
 public:
-    bool Init( const CvMat* f )
-    {
-        int i,j;
-        for( i = num = 0; i < f->rows; i++ )
-        {
-            for( j = 0; j < f->cols; j++ )
-                num += CV_MAT_ELEM(*f,uchar,i,j)!=0;
-        }
-        if (num<=0) return false;
-        mem = (CvHeapElem*)cvAlloc((num+2)*sizeof(CvHeapElem));
-        if (mem==NULL) return false;
-
-        head       = mem;
-        head->i    = head->j = -1;
-        head->prev = NULL;
-        head->next = mem+1;
-        head->T    = -FLT_MAX;
-        empty      = mem+1;
-        for (i=1; i<=num; i++) {
-            mem[i].prev   = mem+i-1;
-            mem[i].next   = mem+i+1;
-            mem[i].i      = -1;
-            mem[i].T      = FLT_MAX;
-        }
-        tail       = mem+i;
-        tail->i    = tail->j = -1;
-        tail->prev = mem+i-1;
-        tail->next = NULL;
-        tail->T    = FLT_MAX;
-        return true;
-    }
-
     bool Add(const CvMat* f) {
         int i,j;
         for (i=0; i<f->rows; i++) {
@@ -133,71 +122,33 @@ class CvPriorityQueueFloat
     }
 
     bool Push(int i, int j, float T) {
-        CvHeapElem *tmp=empty,*add=empty;
-        if (empty==tail) return false;
-        while (tmp->prev->T>T) tmp = tmp->prev;
-        if (tmp!=empty) {
-            add->prev->next = add->next;
-            add->next->prev = add->prev;
-            empty = add->next;
-            add->prev = tmp->prev;
-            add->next = tmp;
-            add->prev->next = add;
-            add->next->prev = add;
-        } else {
-            empty = empty->next;
-        }
-        add->i = i;
-        add->j = j;
-        add->T = T;
-        in++;
-        //      printf("push i %3d  j %3d  T %12.4e  in %4d\n",i,j,T,in);
+        queue.push({T, i, j, next_order});
+        ++next_order;
         return true;
     }
 
     bool Pop(int *i, int *j) {
-        CvHeapElem *tmp=head->next;
-        if (empty==tmp) return false;
-        *i = tmp->i;
-        *j = tmp->j;
-        tmp->prev->next = tmp->next;
-        tmp->next->prev = tmp->prev;
-        tmp->prev = empty->prev;
-        tmp->next = empty;
-        tmp->prev->next = tmp;
-        tmp->next->prev = tmp;
-        empty = tmp;
-        in--;
-        //      printf("pop  i %3d  j %3d  T %12.4e  in %4d\n",tmp->i,tmp->j,tmp->T,in);
+        if (queue.empty()) {
+            return false;
+        }
+        *i = queue.top().i;
+        *j = queue.top().j;
+        queue.pop();
         return true;
     }
 
     bool Pop(int *i, int *j, float *T) {
-        CvHeapElem *tmp=head->next;
-        if (empty==tmp) return false;
-        *i = tmp->i;
-        *j = tmp->j;
-        *T = tmp->T;
-        tmp->prev->next = tmp->next;
-        tmp->next->prev = tmp->prev;
-        tmp->prev = empty->prev;
-        tmp->next = empty;
-        tmp->prev->next = tmp;
-        tmp->next->prev = tmp;
-        empty = tmp;
-        in--;
-        //      printf("pop  i %3d  j %3d  T %12.4e  in %4d\n",tmp->i,tmp->j,tmp->T,in);
+        if (queue.empty()) {
+            return false;
+        }
+        *i = queue.top().i;
+        *j = queue.top().j;
+        *T = queue.top().T;
+        queue.pop();
         return true;
     }
 
-    CvPriorityQueueFloat(void) {
-        num=in=0;
-        mem=empty=head=tail=NULL;
-    }
-
-    ~CvPriorityQueueFloat(void)
-    {
-        cvFree( &mem );
+    CvPriorityQueueFloat(void) : queue(), next_order() {
     }
 };
 
@@ -389,7 +340,7 @@ icvTeleaInpaintFMM(const CvMat *f, CvMat *t, CvMat *out, int range, CvPriorityQu
                                     gradI.y=0;
                                  }
                               }
-                              Ia[color] += (float)w * (float)(CV_MAT_3COLOR_ELEM(*out,uchar,km,lm,color));
+                              Ia[color] += (float)w * (float)(CV_MAT_3COLOR_ELEM(*out,uchar,k-1,l-1,color));
                               Jx[color] -= (float)w * (float)(gradI.x*r.x);
                               Jy[color] -= (float)w * (float)(gradI.y*r.y);
                               s[color]  += w;
@@ -399,8 +350,8 @@ icvTeleaInpaintFMM(const CvMat *f, CvMat *t, CvMat *out, int range, CvPriorityQu
                   }
                }
                for (color=0; color<=2; color++) {
-                  sat = (float)((Ia[color]/s[color]+(Jx[color]+Jy[color])/(sqrt(Jx[color]*Jx[color]+Jy[color]*Jy[color])+1.0e-20f)+0.5f));
-                  CV_MAT_3COLOR_ELEM(*out,uchar,i-1,j-1,color) = cv::saturate_cast<uchar>(sat);
+                  sat = (float)(Ia[color]/s[color]+(Jx[color]+Jy[color])/(sqrt(Jx[color]*Jx[color]+Jy[color]*Jy[color])+1.0e-20f));
+                  CV_MAT_3COLOR_ELEM(*out,uchar,i-1,j-1,color) = round_cast<uchar>(sat);
                }
 
                CV_MAT_ELEM(*f,uchar,i,j) = BAND;
@@ -501,7 +452,7 @@ icvTeleaInpaintFMM(const CvMat *f, CvMat *t, CvMat *out, int range, CvPriorityQu
                                     gradI.y=0;
                                  }
                               }
-                              Ia += (float)w * (float)(CV_MAT_ELEM(*out,data_type,km,lm));
+                              Ia += (float)w * (float)(CV_MAT_ELEM(*out,data_type,k-1,l-1));
                               Jx -= (float)w * (float)(gradI.x*r.x);
                               Jy -= (float)w * (float)(gradI.y*r.y);
                               s  += w;
@@ -509,9 +460,9 @@ icvTeleaInpaintFMM(const CvMat *f, CvMat *t, CvMat *out, int range, CvPriorityQu
                         }
                      }
                   }
-                  sat = (float)((Ia/s+(Jx+Jy)/(sqrt(Jx*Jx+Jy*Jy)+1.0e-20f)+0.5f));
+                  sat = (float)(Ia/s+(Jx+Jy)/(sqrt(Jx*Jx+Jy*Jy)+1.0e-20f));
                   {
-                  CV_MAT_ELEM(*out,data_type,i-1,j-1) = cv::saturate_cast<data_type>(sat);
+                  CV_MAT_ELEM(*out,data_type,i-1,j-1) = round_cast<data_type>(sat);
                   }
                }
 
@@ -604,7 +555,7 @@ icvNSInpaintFMM(const CvMat *f, CvMat *t, CvMat *out, int range, CvPriorityQueue
                                  dir = (float)fabs(VectorScalMult(r,gradI)/sqrt(VectorLength(r)*VectorLength(gradI)));
                               }
                               w = dst*dir;
-                              Ia[color] += (float)w * (float)(CV_MAT_3COLOR_ELEM(*out,uchar,km,lm,color));
+                              Ia[color] += (float)w * (float)(CV_MAT_3COLOR_ELEM(*out,uchar,k-1,l-1,color));
                               s[color]  += w;
                            }
                         }
@@ -694,7 +645,7 @@ icvNSInpaintFMM(const CvMat *f, CvMat *t, CvMat *out, int range, CvPriorityQueue
                                  dir = (float)fabs(VectorScalMult(r,gradI)/sqrt(VectorLength(r)*VectorLength(gradI)));
                               }
                               w = dst*dir;
-                              Ia += (float)w * (float)(CV_MAT_ELEM(*out,data_type,km,lm));
+                              Ia += (float)w * (float)(CV_MAT_ELEM(*out,data_type,k-1,l-1));
                               s  += w;
                            }
                         }
@@ -753,18 +704,18 @@ icvInpaint( const CvArr* _input_img, const CvArr* _inpaint_mask, CvArr* _output_
     output_img = cvGetMat( _output_img, &output_hdr );
 
     if( !CV_ARE_SIZES_EQ(input_img,output_img) || !CV_ARE_SIZES_EQ(input_img,inpaint_mask))
-        CV_Error( CV_StsUnmatchedSizes, "All the input and output images must have the same size" );
+        CV_Error( cv::Error::StsUnmatchedSizes, "All the input and output images must have the same size" );
 
     if( (CV_MAT_TYPE(input_img->type) != CV_8U &&
          CV_MAT_TYPE(input_img->type) != CV_16U &&
          CV_MAT_TYPE(input_img->type) != CV_32F &&
         CV_MAT_TYPE(input_img->type) != CV_8UC3) ||
         !CV_ARE_TYPES_EQ(input_img,output_img) )
-        CV_Error( CV_StsUnsupportedFormat,
+        CV_Error( cv::Error::StsUnsupportedFormat,
         "8-bit, 16-bit unsigned or 32-bit float 1-channel and 8-bit 3-channel input/output images are supported" );
 
     if( CV_MAT_TYPE(inpaint_mask->type) != CV_8UC1 )
-        CV_Error( CV_StsUnsupportedFormat, "The mask must be 8-bit 1-channel image" );
+        CV_Error( cv::Error::StsUnsupportedFormat, "The mask must be 8-bit 1-channel image" );
 
     range = MAX(range,1);
     range = MIN(range,100);
@@ -786,8 +737,6 @@ icvInpaint( const CvArr* _input_img, const CvArr* _inpaint_mask, CvArr* _output_
     cvSet(t,cvScalar(1.0e6f,0,0,0));
     cv::dilate(cv::cvarrToMat(mask), cv::cvarrToMat(band), el_cross, cv::Point(1, 1));
     Heap=cv::makePtr<CvPriorityQueueFloat>();
-    if (!Heap->Init(band))
-        return;
     cvSub(band,mask,band,NULL);
     SET_BORDER1_C1(band,uchar,0);
     if (!Heap->Add(band))
@@ -803,8 +752,6 @@ icvInpaint( const CvArr* _input_img, const CvArr* _inpaint_mask, CvArr* _output_
         cv::dilate(cv::cvarrToMat(mask), cv::cvarrToMat(out), el_range);
         cvSub(out,mask,out,NULL);
         Out=cv::makePtr<CvPriorityQueueFloat>();
-        if (!Out->Init(out))
-            return;
         if (!Out->Add(band))
             return;
         cvSub(out,band,out,NULL);
diff --git a/modules/photo/test/test_inpaint.cpp b/modules/photo/test/test_inpaint.cpp
index 58806cbbc2f5..75bab57737df 100644
--- a/modules/photo/test/test_inpaint.cpp
+++ b/modules/photo/test/test_inpaint.cpp
@@ -116,9 +116,9 @@ void CV_InpaintTest::run( int )
 
 TEST(Photo_Inpaint, regression) { CV_InpaintTest test; test.safe_run(); }
 
-typedef testing::TestWithParam<tuple<int> > formats;
+typedef testing::TestWithParam<tuple<perf::MatType> > formats;
 
-TEST_P(formats, 1c)
+TEST_P(formats, basic)
 {
     const int type = get<0>(GetParam());
     Mat src(100, 100, type);
@@ -126,18 +126,18 @@ TEST_P(formats, 1c)
     Mat ref = src.clone();
     Mat dst, mask = Mat::zeros(src.size(), CV_8U);
 
-    circle(src, Point(50, 50), 5, Scalar(200), 6);
-    circle(mask, Point(50, 50), 5, Scalar(200), 6);
+    circle(src, Point(50, 50), 5, Scalar::all(200), 6);
+    circle(mask, Point(50, 50), 5, Scalar::all(200), 6);
     inpaint(src, mask, dst, 10, INPAINT_NS);
 
     Mat dst2;
     inpaint(src, mask, dst2, 10, INPAINT_TELEA);
 
-    ASSERT_LE(cv::norm(dst, ref, NORM_INF), 3.);
-    ASSERT_LE(cv::norm(dst2, ref, NORM_INF), 3.);
+    ASSERT_EQ(cv::norm(dst, ref, NORM_INF), 0.);
+    ASSERT_EQ(cv::norm(dst2, ref, NORM_INF), 0.);
 }
 
-INSTANTIATE_TEST_CASE_P(Photo_Inpaint, formats, testing::Values(CV_32F, CV_16U, CV_8U));
+INSTANTIATE_TEST_CASE_P(Photo_Inpaint, formats, testing::Values(CV_32FC1, CV_16UC1, CV_8UC1, CV_8UC3));
 
 TEST(Photo_InpaintBorders, regression)
 {
@@ -157,4 +157,30 @@ TEST(Photo_InpaintBorders, regression)
     ASSERT_TRUE(countNonZero(diff) == 0);
 }
 
+typedef testing::TestWithParam<tuple<perf::MatType>> Photo_InpaintSmallBorders;
+
+TEST_P(Photo_InpaintSmallBorders, regression)
+{
+    int type = get<0>(GetParam());
+    Mat img(5, 5, type, Scalar::all(128));
+    Mat expected = img.clone();
+
+    Mat mask = Mat::zeros(5, 5, CV_8U);
+    mask(Rect(1, 1, 3, 3)) = 255;
+
+    img.setTo(Scalar::all(0), mask);
+
+    Mat inpainted, diff;
+
+    inpaint(img, mask, inpainted, 1, INPAINT_TELEA);
+    cv::absdiff(inpainted, expected, diff);
+    ASSERT_EQ(countNonZero(diff.reshape(1)), 0);
+
+    inpaint(img, mask, inpainted, 1, INPAINT_NS);
+    cv::absdiff(inpainted, expected, diff);
+    ASSERT_EQ(countNonZero(diff.reshape(1)), 0);
+}
+
+INSTANTIATE_TEST_CASE_P(/*nothing*/, Photo_InpaintSmallBorders,  Values(CV_8UC1, CV_8UC3));
+
 }} // namespace
diff --git a/modules/python/common.cmake b/modules/python/common.cmake
index a233fe023245..cd6c27984a0e 100644
--- a/modules/python/common.cmake
+++ b/modules/python/common.cmake
@@ -46,6 +46,7 @@ if(${PYTHON}_LIMITED_API)
   # support only python3.3+
   ocv_assert(${PYTHON}_VERSION_MAJOR EQUAL 3 AND ${PYTHON}_VERSION_MINOR GREATER 2)
   target_compile_definitions(${the_module} PRIVATE CVPY_DYNAMIC_INIT)
+  target_compile_definitions(${the_module} PRIVATE PYTHON3_LIMITED_API_VERSION=${PYTHON3_LIMITED_API_VERSION})
   if(WIN32)
     string(REPLACE
       "python${${PYTHON}_VERSION_MAJOR}${${PYTHON}_VERSION_MINOR}.lib"
diff --git a/modules/python/package/cv2/__init__.py b/modules/python/package/cv2/__init__.py
index 550482bd17db..7e148fc9f2b9 100644
--- a/modules/python/package/cv2/__init__.py
+++ b/modules/python/package/cv2/__init__.py
@@ -33,7 +33,7 @@ def __load_extra_py_code_for_module(base, name, enable_debug_print=False):
         # Extension doesn't contain extra py code
         return False
 
-    if not hasattr(base, name):
+    if base in sys.modules and not hasattr(sys.modules[base], name):
         setattr(sys.modules[base], name, py_module)
     sys.modules[export_module_name] = py_module
     # If it is C extension module it is already loaded by cv2 package
diff --git a/modules/python/package/setup.py b/modules/python/package/setup.py
index 11b204d603b4..068f6180cb1f 100644
--- a/modules/python/package/setup.py
+++ b/modules/python/package/setup.py
@@ -19,7 +19,7 @@ def main():
     os.chdir(SCRIPT_DIR)
 
     package_name = 'opencv'
-    package_version = os.environ.get('OPENCV_VERSION', '4.8.0')  # TODO
+    package_version = os.environ.get('OPENCV_VERSION', '4.10.0')  # TODO
 
     long_description = 'Open Source Computer Vision Library Python bindings'  # TODO
 
diff --git a/modules/python/python3/CMakeLists.txt b/modules/python/python3/CMakeLists.txt
index d95af21e04b3..da86ba5c5eae 100644
--- a/modules/python/python3/CMakeLists.txt
+++ b/modules/python/python3/CMakeLists.txt
@@ -2,15 +2,6 @@ if(NOT PYTHON3_INCLUDE_PATH OR NOT PYTHON3_NUMPY_INCLUDE_DIRS)
   ocv_module_disable(python3)
 endif()
 
-# Problem in numpy >=1.15 <1.17
-if(PYTHON3_LIMITED_API
-    AND NOT PYTHON3_NUMPY_VERSION VERSION_LESS "1.15"
-    AND PYTHON3_NUMPY_VERSION VERSION_LESS "1.17"
-  )
-  message(WARNING "Current NUMPY version (${PYTHON3_NUMPY_VERSION}) is not compatible with LIMITED_API.")
-  set(PYTHON3_LIMITED_API OFF)
-endif()
-
 set(the_description "The python3 bindings")
 set(MODULE_NAME python3)
 set(MODULE_INSTALL_SUBDIR python3)
diff --git a/modules/python/src2/cv2.hpp b/modules/python/src2/cv2.hpp
index 9293a593f24d..06080f1aa1ef 100644
--- a/modules/python/src2/cv2.hpp
+++ b/modules/python/src2/cv2.hpp
@@ -13,7 +13,10 @@
 // #define Py_DEBUG
 
 #if defined(CVPY_DYNAMIC_INIT) && !defined(Py_DEBUG)
-#   define Py_LIMITED_API 0x03030000
+#   ifndef PYTHON3_LIMITED_API_VERSION
+#       define PYTHON3_LIMITED_API_VERSION 0x03060000
+#   endif
+#   define Py_LIMITED_API PYTHON3_LIMITED_API_VERSION
 #endif
 
 #include <cmath>
@@ -39,12 +42,23 @@
 
 class ArgInfo
 {
+private:
+    static const uint32_t arg_outputarg_flag     = 0x1;
+    static const uint32_t arg_arithm_op_src_flag = 0x2;
+    static const uint32_t arg_pathlike_flag      = 0x4;
+
 public:
     const char* name;
     bool outputarg;
+    bool arithm_op_src;
+    bool pathlike;
     // more fields may be added if necessary
 
-    ArgInfo(const char* name_, bool outputarg_) : name(name_), outputarg(outputarg_) {}
+    ArgInfo(const char* name_, uint32_t arg_) :
+        name(name_),
+        outputarg((arg_ & arg_outputarg_flag) != 0),
+        arithm_op_src((arg_ & arg_arithm_op_src_flag) != 0),
+        pathlike((arg_ & arg_pathlike_flag) != 0) {}
 
 private:
     ArgInfo(const ArgInfo&) = delete;
diff --git a/modules/python/src2/cv2_convert.cpp b/modules/python/src2/cv2_convert.cpp
index 2e69586f47e7..35766b47c916 100644
--- a/modules/python/src2/cv2_convert.cpp
+++ b/modules/python/src2/cv2_convert.cpp
@@ -5,6 +5,7 @@
 
 #include "cv2_convert.hpp"
 #include "cv2_numpy.hpp"
+#include "cv2_util.hpp"
 #include "opencv2/core/utils/logger.hpp"
 
 PyTypeObject* pyopencv_Mat_TypePtr = nullptr;
@@ -24,6 +25,26 @@ static std::string pycv_dumpArray(const T* arr, int n)
     return out.str();
 }
 
+static inline std::string getArrayTypeName(PyArrayObject* arr)
+{
+    PyArray_Descr* dtype = PyArray_DESCR(arr);
+    PySafeObject dtype_str(PyObject_Str(reinterpret_cast<PyObject*>(dtype)));
+    if (!dtype_str)
+    {
+        // Fallback to typenum value
+        return cv::format("%d", PyArray_TYPE(arr));
+    }
+    std::string type_name;
+    if (!getUnicodeString(dtype_str, type_name))
+    {
+        // Failed to get string from bytes object - clear set TypeError and
+        // fallback to typenum value
+        PyErr_Clear();
+        return cv::format("%d", PyArray_TYPE(arr));
+    }
+    return type_name;
+}
+
 //======================================================================================================================
 
 // --- Mat
@@ -35,27 +56,46 @@ bool pyopencv_to(PyObject* o, Mat& m, const ArgInfo& info)
     if(!o || o == Py_None)
     {
         if( !m.data )
-            m.allocator = &g_numpyAllocator;
+            m.allocator = &GetNumpyAllocator();
         return true;
     }
 
     if( PyInt_Check(o) )
     {
         double v[] = {static_cast<double>(PyInt_AsLong((PyObject*)o)), 0., 0., 0.};
+        if ( info.arithm_op_src )
+        {
+            // Normally cv.XXX(x) means cv.XXX( (x, 0., 0., 0.) );
+            // However  cv.add(mat,x) means cv::add(mat, (x,x,x,x) ).
+            v[1] = v[0];
+            v[2] = v[0];
+            v[3] = v[0];
+        }
         m = Mat(4, 1, CV_64F, v).clone();
         return true;
     }
     if( PyFloat_Check(o) )
     {
         double v[] = {PyFloat_AsDouble((PyObject*)o), 0., 0., 0.};
+
+       if ( info.arithm_op_src )
+        {
+            // Normally cv.XXX(x) means cv.XXX( (x, 0., 0., 0.) );
+            // However  cv.add(mat,x) means cv::add(mat, (x,x,x,x) ).
+            v[1] = v[0];
+            v[2] = v[0];
+            v[3] = v[0];
+        }
         m = Mat(4, 1, CV_64F, v).clone();
         return true;
     }
     if( PyTuple_Check(o) )
     {
-        int i, sz = (int)PyTuple_Size((PyObject*)o);
-        m = Mat(sz, 1, CV_64F);
-        for( i = 0; i < sz; i++ )
+        // see https://github.com/opencv/opencv/issues/24057
+        const int sz  = (int)PyTuple_Size((PyObject*)o);
+        const int sz2 = info.arithm_op_src ? std::max(4, sz) : sz; // Scalar has 4 elements.
+        m = Mat::zeros(sz2, 1, CV_64F);
+        for( int i = 0; i < sz; i++ )
         {
             PyObject* oi = PyTuple_GetItem(o, i);
             if( PyInt_Check(oi) )
@@ -80,6 +120,13 @@ bool pyopencv_to(PyObject* o, Mat& m, const ArgInfo& info)
 
     PyArrayObject* oarr = (PyArrayObject*) o;
 
+    if (info.outputarg && !PyArray_ISWRITEABLE(oarr))
+    {
+        failmsg("%s marked as output argument, but provided NumPy array "
+                "marked as readonly", info.name);
+        return false;
+    }
+
     bool needcopy = false, needcast = false;
     int typenum = PyArray_TYPE(oarr), new_typenum = typenum;
     int type = typenum == NPY_UBYTE ? CV_8U :
@@ -102,7 +149,9 @@ bool pyopencv_to(PyObject* o, Mat& m, const ArgInfo& info)
         }
         else
         {
-            failmsg("%s data type = %d is not supported", info.name, typenum);
+            const std::string dtype_name = getArrayTypeName(oarr);
+            failmsg("%s data type = %s is not supported", info.name,
+                    dtype_name.c_str());
             return false;
         }
     }
@@ -211,6 +260,31 @@ bool pyopencv_to(PyObject* o, Mat& m, const ArgInfo& info)
         }
     }
 
+    // see https://github.com/opencv/opencv/issues/24057
+    if ( ( info.arithm_op_src ) && ( ndims == 1 ) && ( size[0] <= 4 ) )
+    {
+        const int sz  = size[0]; // Real Data Length(1, 2, 3 or 4)
+        const int sz2 = 4;       // Scalar has 4 elements.
+        m = Mat::zeros(sz2, 1, CV_64F);
+
+        const char *base_ptr = PyArray_BYTES(oarr);
+        for(int i = 0; i < sz; i++ )
+        {
+            PyObject* oi = PyArray_GETITEM(oarr, base_ptr + step[0] * i);
+            if( PyInt_Check(oi) )
+                m.at<double>(i) = (double)PyInt_AsLong(oi);
+            else if( PyFloat_Check(oi) )
+                m.at<double>(i) = (double)PyFloat_AsDouble(oi);
+            else
+            {
+                failmsg("%s has some non-numerical elements", info.name);
+                m.release();
+                return false;
+            }
+        }
+        return true;
+    }
+
     // handle degenerate case
     // FIXIT: Don't force 1D for Scalars
     if( ndims == 0) {
@@ -224,14 +298,14 @@ bool pyopencv_to(PyObject* o, Mat& m, const ArgInfo& info)
 #endif
 
     m = Mat(ndims, size, type, PyArray_DATA(oarr), step);
-    m.u = g_numpyAllocator.allocate(o, ndims, size, type, step);
+    m.u = GetNumpyAllocator().allocate(o, ndims, size, type, step);
     m.addref();
 
     if( !needcopy )
     {
         Py_INCREF(o);
     }
-    m.allocator = &g_numpyAllocator;
+    m.allocator = &GetNumpyAllocator();
 
     return true;
 }
@@ -242,9 +316,9 @@ PyObject* pyopencv_from(const cv::Mat& m)
     if( !m.data )
         Py_RETURN_NONE;
     cv::Mat temp, *p = (cv::Mat*)&m;
-    if(!p->u || p->allocator != &g_numpyAllocator)
+    if(!p->u || p->allocator != &GetNumpyAllocator())
     {
-        temp.allocator = &g_numpyAllocator;
+        temp.allocator = &GetNumpyAllocator();
         ERRWRAP2(m.copyTo(temp));
         p = &temp;
     }
@@ -627,6 +701,18 @@ bool pyopencv_to(PyObject* obj, String &value, const ArgInfo& info)
         return true;
     }
     std::string str;
+
+#if ((PY_VERSION_HEX >= 0x03060000) && !defined(Py_LIMITED_API)) || (Py_LIMITED_API >= 0x03060000)
+    if (info.pathlike)
+    {
+        obj = PyOS_FSPath(obj);
+        if (PyErr_Occurred())
+        {
+            failmsg("Expected '%s' to be a str or path-like object", info.name);
+            return false;
+        }
+    }
+#endif
     if (getUnicodeString(obj, str))
     {
         value = str;
@@ -711,6 +797,21 @@ PyObject* pyopencv_from(const Rect& r)
     return Py_BuildValue("(iiii)", r.x, r.y, r.width, r.height);
 }
 
+template<>
+bool pyopencv_to(PyObject* obj, Rect2f& r, const ArgInfo& info)
+{
+    RefWrapper<float> values[] = {
+        RefWrapper<float>(r.x), RefWrapper<float>(r.y),
+        RefWrapper<float>(r.width), RefWrapper<float>(r.height)};
+    return parseSequence(obj, values, info);
+}
+
+template<>
+PyObject* pyopencv_from(const Rect2f& r)
+{
+    return Py_BuildValue("(ffff)", r.x, r.y, r.width, r.height);
+}
+
 template<>
 bool pyopencv_to(PyObject* obj, Rect2d& r, const ArgInfo& info)
 {
@@ -777,7 +878,7 @@ bool pyopencv_to(PyObject* obj, RotatedRect& dst, const ArgInfo& info)
     }
     {
         const String centerItemName = format("'%s' center point", info.name);
-        const ArgInfo centerItemInfo(centerItemName.c_str(), false);
+        const ArgInfo centerItemInfo(centerItemName.c_str(), 0);
         SafeSeqItem centerItem(obj, 0);
         if (!pyopencv_to(centerItem.item, dst.center, centerItemInfo))
         {
@@ -786,7 +887,7 @@ bool pyopencv_to(PyObject* obj, RotatedRect& dst, const ArgInfo& info)
     }
     {
         const String sizeItemName = format("'%s' size", info.name);
-        const ArgInfo sizeItemInfo(sizeItemName.c_str(), false);
+        const ArgInfo sizeItemInfo(sizeItemName.c_str(), 0);
         SafeSeqItem sizeItem(obj, 1);
         if (!pyopencv_to(sizeItem.item, dst.size, sizeItemInfo))
         {
@@ -795,7 +896,7 @@ bool pyopencv_to(PyObject* obj, RotatedRect& dst, const ArgInfo& info)
     }
     {
         const String angleItemName = format("'%s' angle", info.name);
-        const ArgInfo angleItemInfo(angleItemName.c_str(), false);
+        const ArgInfo angleItemInfo(angleItemName.c_str(), 0);
         SafeSeqItem angleItem(obj, 2);
         if (!pyopencv_to(angleItem.item, dst.angle, angleItemInfo))
         {
@@ -878,6 +979,21 @@ PyObject* pyopencv_from(const Point2d& p)
     return Py_BuildValue("(dd)", p.x, p.y);
 }
 
+template<>
+bool pyopencv_to(PyObject* obj, Point3i& p, const ArgInfo& info)
+{
+    RefWrapper<int> values[] = {RefWrapper<int>(p.x),
+                                RefWrapper<int>(p.y),
+                                RefWrapper<int>(p.z)};
+    return parseSequence(obj, values, info);
+}
+
+template<>
+PyObject* pyopencv_from(const Point3i& p)
+{
+    return Py_BuildValue("(iii)", p.x, p.y, p.z);
+}
+
 template<>
 bool pyopencv_to(PyObject* obj, Point3f& p, const ArgInfo& info)
 {
@@ -1045,7 +1161,7 @@ bool pyopencv_to(PyObject* obj, TermCriteria& dst, const ArgInfo& info)
     }
     {
         const String typeItemName = format("'%s' criteria type", info.name);
-        const ArgInfo typeItemInfo(typeItemName.c_str(), false);
+        const ArgInfo typeItemInfo(typeItemName.c_str(), 0);
         SafeSeqItem typeItem(obj, 0);
         if (!pyopencv_to(typeItem.item, dst.type, typeItemInfo))
         {
@@ -1054,7 +1170,7 @@ bool pyopencv_to(PyObject* obj, TermCriteria& dst, const ArgInfo& info)
     }
     {
         const String maxCountItemName = format("'%s' max count", info.name);
-        const ArgInfo maxCountItemInfo(maxCountItemName.c_str(), false);
+        const ArgInfo maxCountItemInfo(maxCountItemName.c_str(), 0);
         SafeSeqItem maxCountItem(obj, 1);
         if (!pyopencv_to(maxCountItem.item, dst.maxCount, maxCountItemInfo))
         {
@@ -1063,7 +1179,7 @@ bool pyopencv_to(PyObject* obj, TermCriteria& dst, const ArgInfo& info)
     }
     {
         const String epsilonItemName = format("'%s' epsilon", info.name);
-        const ArgInfo epsilonItemInfo(epsilonItemName.c_str(), false);
+        const ArgInfo epsilonItemInfo(epsilonItemName.c_str(), 0);
         SafeSeqItem epsilonItem(obj, 2);
         if (!pyopencv_to(epsilonItem.item, dst.epsilon, epsilonItemInfo))
         {
diff --git a/modules/python/src2/cv2_convert.hpp b/modules/python/src2/cv2_convert.hpp
index 43ef7b230280..0c0fbd7b96c4 100644
--- a/modules/python/src2/cv2_convert.hpp
+++ b/modules/python/src2/cv2_convert.hpp
@@ -156,6 +156,33 @@ struct PyOpenCV_Converter
     }
 };
 
+// There is conflict between "uint64_t" and "size_t".
+// They are the same type on some 32-bit platforms.
+template<typename T>
+struct PyOpenCV_Converter
+    < T, typename std::enable_if< std::is_same<uint64_t, T>::value && !std::is_same<uint64_t, size_t>::value >::type >
+{
+    static inline PyObject* from(const uint64_t& value)
+    {
+        return PyLong_FromUnsignedLongLong(value);
+    }
+
+    static inline bool to(PyObject* obj, uint64_t& value, const ArgInfo& info)
+    {
+        CV_UNUSED(info);
+        if(!obj || obj == Py_None)
+            return true;
+        if(PyInt_Check(obj))
+            value = (uint64_t)PyInt_AsUnsignedLongLongMask(obj);
+        else if(PyLong_Check(obj))
+            value = (uint64_t)PyLong_AsUnsignedLongLong(obj);
+        else
+            return false;
+        return value != (uint64_t)-1 || !PyErr_Occurred();
+    }
+};
+
+
 // --- uchar
 template<> bool pyopencv_to(PyObject* obj, uchar& value, const ArgInfo& info);
 template<> PyObject* pyopencv_from(const uchar& value);
@@ -187,6 +214,8 @@ template<> PyObject* pyopencv_from(const cv::Size_<float>& sz);
 // --- Rect
 template<> bool pyopencv_to(PyObject* obj, cv::Rect& r, const ArgInfo& info);
 template<> PyObject* pyopencv_from(const cv::Rect& r);
+template<> bool pyopencv_to(PyObject* obj, cv::Rect2f& r, const ArgInfo& info);
+template<> PyObject* pyopencv_from(const cv::Rect2f& r);
 template<> bool pyopencv_to(PyObject* obj, cv::Rect2d& r, const ArgInfo& info);
 template<> PyObject* pyopencv_from(const cv::Rect2d& r);
 
@@ -205,6 +234,8 @@ template<> bool pyopencv_to(PyObject* obj, cv::Point2f& p, const ArgInfo& info);
 template<> PyObject* pyopencv_from(const cv::Point2f& p);
 template<> bool pyopencv_to(PyObject* obj, cv::Point2d& p, const ArgInfo& info);
 template<> PyObject* pyopencv_from(const cv::Point2d& p);
+template<> bool pyopencv_to(PyObject* obj, cv::Point3i& p, const ArgInfo& info);
+template<> PyObject* pyopencv_from(const cv::Point3i& p);
 template<> bool pyopencv_to(PyObject* obj, cv::Point3f& p, const ArgInfo& info);
 template<> PyObject* pyopencv_from(const cv::Point3f& p);
 template<> bool pyopencv_to(PyObject* obj, cv::Point3d& p, const ArgInfo& info);
@@ -286,13 +317,13 @@ bool pyopencv_to(PyObject *obj, std::map<K,V> &map, const ArgInfo& info)
     while(PyDict_Next(obj, &pos, &py_key, &py_value))
     {
         K cpp_key;
-        if (!pyopencv_to(py_key, cpp_key, ArgInfo("key", false))) {
+        if (!pyopencv_to(py_key, cpp_key, ArgInfo("key", 0))) {
             failmsg("Can't parse dict key. Key on position %lu has a wrong type", pos);
             return false;
         }
 
         V cpp_value;
-        if (!pyopencv_to(py_value, cpp_value, ArgInfo("value", false))) {
+        if (!pyopencv_to(py_value, cpp_value, ArgInfo("value", 0))) {
             failmsg("Can't parse dict value. Value on position %lu has a wrong type", pos);
             return false;
         }
diff --git a/modules/python/src2/cv2_numpy.cpp b/modules/python/src2/cv2_numpy.cpp
index 63010b60269b..25922d6c6142 100644
--- a/modules/python/src2/cv2_numpy.cpp
+++ b/modules/python/src2/cv2_numpy.cpp
@@ -6,8 +6,6 @@
 #include "cv2_numpy.hpp"
 #include "cv2_util.hpp"
 
-NumpyAllocator g_numpyAllocator;
-
 using namespace cv;
 
 UMatData* NumpyAllocator::allocate(PyObject* o, int dims, const int* sizes, int type, size_t* step) const
diff --git a/modules/python/src2/cv2_numpy.hpp b/modules/python/src2/cv2_numpy.hpp
index 934333921d78..b37a7a878e6c 100644
--- a/modules/python/src2/cv2_numpy.hpp
+++ b/modules/python/src2/cv2_numpy.hpp
@@ -18,7 +18,7 @@ class NumpyAllocator : public cv::MatAllocator
     const cv::MatAllocator* stdAllocator;
 };
 
-extern NumpyAllocator g_numpyAllocator;
+inline NumpyAllocator& GetNumpyAllocator() {static NumpyAllocator gNumpyAllocator;return gNumpyAllocator;}
 
 //======================================================================================================================
 
diff --git a/modules/python/src2/cv2_util.cpp b/modules/python/src2/cv2_util.cpp
index d3691d3a59c9..817a4a8effc7 100644
--- a/modules/python/src2/cv2_util.cpp
+++ b/modules/python/src2/cv2_util.cpp
@@ -128,11 +128,7 @@ void pyPopulateArgumentConversionErrors()
         PySafeObject exception_message(PyObject_Str(exception_value));
         std::string message;
         getUnicodeString(exception_message, message);
-#ifdef CV_CXX11
         conversionErrorsTLS.getRef().push_back(std::move(message));
-#else
-        conversionErrorsTLS.getRef().push_back(message);
-#endif
     }
 }
 
diff --git a/modules/python/src2/cv2_util.hpp b/modules/python/src2/cv2_util.hpp
index 0d27e98825d5..a7deb5b57544 100644
--- a/modules/python/src2/cv2_util.hpp
+++ b/modules/python/src2/cv2_util.hpp
@@ -42,7 +42,7 @@ class PyEnsureGIL
 
 /**
  * Light weight RAII wrapper for `PyObject*` owning references.
- * In comparisson to C++11 `std::unique_ptr` with custom deleter, it provides
+ * In comparison to C++11 `std::unique_ptr` with custom deleter, it provides
  * implicit conversion functions that might be useful to initialize it with
  * Python functions those returns owning references through the `PyObject**`
  * e.g. `PyErr_Fetch` or directly pass it to functions those want to borrow
@@ -70,6 +70,10 @@ class PySafeObject
         return &obj_;
     }
 
+    operator bool() {
+        return obj_ != nullptr;
+    }
+
     PyObject* release()
     {
         PyObject* obj = obj_;
diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py
index 0ba643f12b47..29a91958ee90 100755
--- a/modules/python/src2/gen2.py
+++ b/modules/python/src2/gen2.py
@@ -109,7 +109,7 @@ def parse_symbol_name(cls, full_symbol_name, known_namespaces):
     if( PyMapping_HasKeyString(src, (char*)"$propname") )
     {
         tmp = PyMapping_GetItemString(src, (char*)"$propname");
-        ok = tmp && pyopencv_to_safe(tmp, dst.$propname, ArgInfo("$propname", false));
+        ok = tmp && pyopencv_to_safe(tmp, dst.$propname, ArgInfo("$propname", 0));
         Py_DECREF(tmp);
         if(!ok) return false;
     }""")
@@ -163,7 +163,7 @@ def parse_symbol_name(cls, full_symbol_name, known_namespaces):
         PyErr_SetString(PyExc_TypeError, "Cannot delete the ${member} attribute");
         return -1;
     }
-    return pyopencv_to_safe(value, p->v${access}${member}, ArgInfo("value", false)) ? 0 : -1;
+    return pyopencv_to_safe(value, p->v${access}${member}, ArgInfo("value", 0)) ? 0 : -1;
 }
 """)
 
@@ -181,7 +181,7 @@ def parse_symbol_name(cls, full_symbol_name, known_namespaces):
         failmsgp("Incorrect type of object (must be '${name}' or its derivative)");
         return -1;
     }
-    return pyopencv_to_safe(value, _self_${access}${member}, ArgInfo("value", false)) ? 0 : -1;
+    return pyopencv_to_safe(value, _self_${access}${member}, ArgInfo("value", 0)) ? 0 : -1;
 }
 """)
 
@@ -231,6 +231,8 @@ class FormatStrings:
     "c_string": ArgTypeInfo("char*", FormatStrings.string, '(char*)""'),
     "string": ArgTypeInfo("std::string", FormatStrings.object, None, True),
     "Stream": ArgTypeInfo("Stream", FormatStrings.object, 'Stream::Null()', True),
+    "cuda_Stream": ArgTypeInfo("cuda::Stream", FormatStrings.object, "cuda::Stream::Null()", True),
+    "cuda_GpuMat": ArgTypeInfo("cuda::GpuMat", FormatStrings.object, "cuda::GpuMat()", True),
     "UMat": ArgTypeInfo("UMat", FormatStrings.object, 'UMat()', True),  # FIXIT: switch to CV_EXPORTS_W_SIMPLE as UMat is already a some kind of smart pointer
 }
 
@@ -430,7 +432,7 @@ def gen_def(self, codegen):
         if self.constructor is not None:
             constructor_name = self.constructor.get_wrapper_name()
 
-        return 'CVPY_TYPE({}, {}, {}, {}, {}, {}, "{}");\n'.format(
+        return 'CVPY_TYPE({}, {}, {}, {}, {}, {}, "{}")\n'.format(
             self.export_name,
             self.class_id,
             self.cname if self.issimple else "Ptr<{}>".format(self.cname),
@@ -490,10 +492,18 @@ def export_name(self):
     def inputarg(self):
         return '/O' not in self._modifiers
 
+    @property
+    def arithm_op_src_arg(self):
+        return '/AOS' in self._modifiers
+
     @property
     def outputarg(self):
         return '/O' in self._modifiers or '/IO' in self._modifiers
 
+    @property
+    def pathlike(self):
+        return '/PATH' in self._modifiers
+
     @property
     def returnarg(self):
         return self.outputarg
@@ -509,14 +519,20 @@ def full_name(self):
         return self.enclosing_arg.name + '.' + self.name
 
     def isbig(self):
-        return self.tp in ["Mat", "vector_Mat", "cuda::GpuMat", "GpuMat", "vector_GpuMat", "UMat", "vector_UMat"] # or self.tp.startswith("vector")
+        return self.tp in ["Mat", "vector_Mat",
+                           "cuda::GpuMat", "cuda_GpuMat", "GpuMat",
+                           "vector_GpuMat", "vector_cuda_GpuMat",
+                           "UMat", "vector_UMat"] # or self.tp.startswith("vector")
 
     def crepr(self):
-        return "ArgInfo(\"%s\", %d)" % (self.name, self.outputarg)
+        arg  = 0x01 if self.outputarg else 0x0
+        arg += 0x02 if self.arithm_op_src_arg else 0x0
+        arg += 0x04 if self.pathlike else 0x0
+        return "ArgInfo(\"%s\", %d)" % (self.name, arg)
 
 
 def find_argument_class_info(argument_type, function_namespace,
-                            function_class_name, known_classes):
+                             function_class_name, known_classes):
     # type: (str, str, str, dict[str, ClassInfo]) -> ClassInfo | None
     """Tries to find corresponding class info for the provided argument type
 
@@ -1048,7 +1064,7 @@ def gen_code(self, codegen):
             else:
                 py_name = classinfo.full_export_name + "." + self.variants[0].wname
 
-            if not self.is_static:
+            if not self.is_static and not self.isconstructor:
                 cname = classinfo.cname + '::' + cname
         else:
             py_name = '.'.join([self.namespace, self.variants[0].wname])
@@ -1280,7 +1296,7 @@ def gen_enum_reg(self, enum_name):
         code = ""
         if re.sub(r"^cv\.", "", enum_name) != wname:
             code += "typedef {0} {1};\n".format(cname, wname)
-        code += "CV_PY_FROM_ENUM({0});\nCV_PY_TO_ENUM({0});\n\n".format(wname)
+        code += "CV_PY_FROM_ENUM({0})\nCV_PY_TO_ENUM({0})\n\n".format(wname)
         self.code_enums.write(code)
 
     def save(self, path, name, buf):
diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py
index 710c7921794d..fa2d0077d959 100755
--- a/modules/python/src2/hdr_parser.py
+++ b/modules/python/src2/hdr_parser.py
@@ -90,6 +90,10 @@ def parse_arg(self, arg_str, argno):
             modlist.append("/IO")
             arg_str = arg_str.replace("CV_IN_OUT", "")
 
+        if "CV_WRAP_FILE_PATH" in arg_str:
+            modlist.append("/PATH")
+            arg_str = arg_str.replace("CV_WRAP_FILE_PATH", "")
+
         isarray = False
         npos = arg_str.find("CV_CARRAY")
         if npos >= 0:
@@ -451,8 +455,7 @@ def parse_func_decl(self, decl_str, mat="Mat", docstring=""):
                                                  ("CV_INLINE", ""),
                                                  ("CV_DEPRECATED", ""),
                                                  ("CV_DEPRECATED_EXTERNAL", ""),
-                                                 ("CV_NODISCARD_STD", ""),
-                                                 ("CV_NODISCARD", "")]).strip()
+                                                 ("CV_NODISCARD_STD", "")]).strip()
 
         if decl_str.strip().startswith('virtual'):
             virtual_method = True
@@ -510,9 +513,9 @@ def parse_func_decl(self, decl_str, mat="Mat", docstring=""):
             if rettype == classname or rettype == "~" + classname:
                 rettype, funcname = "", rettype
             else:
-                if bool(re.match('\w+\s+\(\*\w+\)\s*\(.*\)', decl_str)):
+                if bool(re.match(r'\w+\s+\(\*\w+\)\s*\(.*\)', decl_str)):
                     return [] # function typedef
-                elif bool(re.match('\w+\s+\(\w+::\*\w+\)\s*\(.*\)', decl_str)):
+                elif bool(re.match(r'\w+\s+\(\w+::\*\w+\)\s*\(.*\)', decl_str)):
                     return [] # class method typedef
                 elif bool(re.match('[A-Z_]+', decl_start)):
                     return [] # it seems to be a macro instantiation
@@ -535,6 +538,13 @@ def parse_func_decl(self, decl_str, mat="Mat", docstring=""):
 
         funcname = self.get_dotted_name(funcname)
 
+        # see https://github.com/opencv/opencv/issues/24057
+        is_arithm_op_func = funcname in {"cv.add",
+                                         "cv.subtract",
+                                         "cv.absdiff",
+                                         "cv.multiply",
+                                         "cv.divide"}
+
         if not self.wrap_mode:
             decl = self.parse_func_decl_no_wrap(decl_str, static_method, docstring)
             decl[0] = funcname
@@ -595,6 +605,8 @@ def parse_func_decl(self, decl_str, mat="Mat", docstring=""):
 
                         if arg_type == "InputArray":
                             arg_type = mat
+                            if is_arithm_op_func:
+                                modlist.append("/AOS") # Arithm Ope Source
                         elif arg_type == "InputOutputArray":
                             arg_type = mat
                             modlist.append("/IO")
@@ -618,6 +630,8 @@ def parse_func_decl(self, decl_str, mat="Mat", docstring=""):
                                                              ("noArray", arg_type)]).strip()
                     if '/IO' in modlist and '/O' in modlist:
                         modlist.remove('/O')
+                    if (arg_name.lower() == 'filename' or arg_name.lower() == 'filepath') and '/PATH' not in modlist:
+                        modlist.append('/PATH')
                     args.append([arg_type, arg_name, defval, modlist])
                 npos = arg_start-1
 
diff --git a/modules/python/src2/pycompat.hpp b/modules/python/src2/pycompat.hpp
index c8806dc8120c..05a390956268 100644
--- a/modules/python/src2/pycompat.hpp
+++ b/modules/python/src2/pycompat.hpp
@@ -53,6 +53,7 @@
 #define PyInt_CheckExact PyLong_CheckExact
 #define PyInt_AsLong PyLong_AsLong
 #define PyInt_AS_LONG PyLong_AS_LONG
+#define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask
 #define PyInt_FromLong PyLong_FromLong
 #define PyNumber_Int PyNumber_Long
 
diff --git a/modules/python/src2/typing_stubs_generation/api_refinement.py b/modules/python/src2/typing_stubs_generation/api_refinement.py
index f23167f9147f..b04bb4a19681 100644
--- a/modules/python/src2/typing_stubs_generation/api_refinement.py
+++ b/modules/python/src2/typing_stubs_generation/api_refinement.py
@@ -2,18 +2,81 @@
     "apply_manual_api_refinement"
 ]
 
-from typing import Sequence, Callable
-from .nodes import NamespaceNode, FunctionNode, OptionalTypeNode
-from .ast_utils import find_function_node, SymbolName
+from typing import cast, Sequence, Callable, Iterable
+
+from .nodes import (NamespaceNode, FunctionNode, OptionalTypeNode, TypeNode,
+                    ClassProperty, PrimitiveTypeNode, ASTNodeTypeNode,
+                    AggregatedTypeNode, CallableTypeNode, AnyTypeNode,
+                    TupleTypeNode, UnionTypeNode, ProtocolClassNode,
+                    DictTypeNode, ClassTypeNode)
+from .ast_utils import (find_function_node, SymbolName,
+                        for_each_function_overload)
+from .types_conversion import create_type_node
 
 
 def apply_manual_api_refinement(root: NamespaceNode) -> None:
+    refine_highgui_module(root)
+    refine_cuda_module(root)
+    export_matrix_type_constants(root)
+    refine_dnn_module(root)
     # Export OpenCV exception class
     builtin_exception = root.add_class("Exception")
     builtin_exception.is_exported = False
-    root.add_class("error", (builtin_exception, ))
+    root.add_class("error", (builtin_exception, ), ERROR_CLASS_PROPERTIES)
     for symbol_name, refine_symbol in NODES_TO_REFINE.items():
         refine_symbol(root, symbol_name)
+    version_constant = root.add_constant("__version__", "<unused>")
+    version_constant._value_type = "str"
+
+    """
+    def redirectError(
+        onError: Callable[[int, str, str, str, int], None] | None
+    ) -> None: ...
+    """
+    root.add_function("redirectError", [
+        FunctionNode.Arg(
+            "onError",
+            OptionalTypeNode(
+                CallableTypeNode(
+                    "ErrorCallback",
+                    [
+                        PrimitiveTypeNode.int_(),
+                        PrimitiveTypeNode.str_(),
+                        PrimitiveTypeNode.str_(),
+                        PrimitiveTypeNode.str_(),
+                        PrimitiveTypeNode.int_()
+                    ]
+                )
+            )
+        )
+    ])
+
+
+def export_matrix_type_constants(root: NamespaceNode) -> None:
+    MAX_PREDEFINED_CHANNELS = 4
+
+    depth_names = ("CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S",
+                   "CV_32F", "CV_64F", "CV_16F")
+    for depth_value, depth_name in enumerate(depth_names):
+        # Export depth constants
+        root.add_constant(depth_name, str(depth_value))
+        # Export predefined types
+        for c in range(MAX_PREDEFINED_CHANNELS):
+            root.add_constant(f"{depth_name}C{c + 1}",
+                              f"{depth_value + 8 * c}")
+        # Export type creation function
+        root.add_function(
+            f"{depth_name}C",
+            (FunctionNode.Arg("channels", PrimitiveTypeNode.int_()), ),
+            FunctionNode.RetType(PrimitiveTypeNode.int_())
+        )
+    # Export CV_MAKETYPE
+    root.add_function(
+        "CV_MAKETYPE",
+        (FunctionNode.Arg("depth", PrimitiveTypeNode.int_()),
+         FunctionNode.Arg("channels", PrimitiveTypeNode.int_())),
+        FunctionNode.RetType(PrimitiveTypeNode.int_())
+    )
 
 
 def make_optional_arg(arg_name: str) -> Callable[[NamespaceNode, SymbolName], None]:
@@ -27,13 +90,230 @@ def _make_optional_arg(root_node: NamespaceNode,
                 continue
 
             overload.arguments[arg_idx].type_node = OptionalTypeNode(
-                overload.arguments[arg_idx].type_node
+                cast(TypeNode, overload.arguments[arg_idx].type_node)
             )
 
     return _make_optional_arg
 
 
-def _find_argument_index(arguments: Sequence[FunctionNode.Arg], name: str) -> int:
+def refine_cuda_module(root: NamespaceNode) -> None:
+    def fix_cudaoptflow_enums_names() -> None:
+        for class_name in ("NvidiaOpticalFlow_1_0", "NvidiaOpticalFlow_2_0"):
+            if class_name not in cuda_root.classes:
+                continue
+            opt_flow_class = cuda_root.classes[class_name]
+            _trim_class_name_from_argument_types(
+                for_each_function_overload(opt_flow_class), class_name
+            )
+
+    def fix_namespace_usage_scope(cuda_ns: NamespaceNode) -> None:
+        USED_TYPES = ("GpuMat", "Stream")
+
+        def fix_type_usage(type_node: TypeNode) -> None:
+            if isinstance(type_node, AggregatedTypeNode):
+                for item in type_node.items:
+                    fix_type_usage(item)
+            if isinstance(type_node, ASTNodeTypeNode):
+                if type_node._typename in USED_TYPES:
+                    type_node._typename = f"cuda_{type_node._typename}"
+
+        for overload in for_each_function_overload(cuda_ns):
+            if overload.return_type is not None:
+                fix_type_usage(overload.return_type.type_node)
+            for type_node in [arg.type_node for arg in overload.arguments
+                              if arg.type_node is not None]:
+                fix_type_usage(type_node)
+
+    if "cuda" not in root.namespaces:
+        return
+    cuda_root = root.namespaces["cuda"]
+    fix_cudaoptflow_enums_names()
+    for ns in [ns for ns_name, ns in root.namespaces.items()
+               if ns_name.startswith("cuda")]:
+        fix_namespace_usage_scope(ns)
+
+
+def refine_highgui_module(root: NamespaceNode) -> None:
+    # Check if library is built with enabled highgui module
+    if "destroyAllWindows" not in root.functions:
+        return
+    """
+    def createTrackbar(trackbarName: str,
+                       windowName: str,
+                       value: int,
+                       count: int,
+                       onChange: Callable[[int], None]) -> None: ...
+    """
+    root.add_function(
+        "createTrackbar",
+        [
+            FunctionNode.Arg("trackbarName", PrimitiveTypeNode.str_()),
+            FunctionNode.Arg("windowName", PrimitiveTypeNode.str_()),
+            FunctionNode.Arg("value", PrimitiveTypeNode.int_()),
+            FunctionNode.Arg("count", PrimitiveTypeNode.int_()),
+            FunctionNode.Arg("onChange",
+                             CallableTypeNode("TrackbarCallback",
+                                              PrimitiveTypeNode.int_("int"))),
+        ]
+    )
+    """
+    def createButton(buttonName: str,
+                     onChange: Callable[[tuple[int] | tuple[int, Any]], None],
+                     userData: Any | None = ...,
+                     buttonType: int = ...,
+                     initialButtonState: int = ...) -> None: ...
+    """
+    root.add_function(
+        "createButton",
+        [
+            FunctionNode.Arg("buttonName", PrimitiveTypeNode.str_()),
+            FunctionNode.Arg(
+                "onChange",
+                CallableTypeNode(
+                    "ButtonCallback",
+                    UnionTypeNode(
+                        "onButtonChangeCallbackData",
+                        [
+                            TupleTypeNode("onButtonChangeCallbackData",
+                                          [PrimitiveTypeNode.int_(), ]),
+                            TupleTypeNode("onButtonChangeCallbackData",
+                                          [PrimitiveTypeNode.int_(),
+                                           AnyTypeNode("void*")])
+                        ]
+                    )
+                )),
+            FunctionNode.Arg("userData",
+                             OptionalTypeNode(AnyTypeNode("void*")),
+                             default_value="None"),
+            FunctionNode.Arg("buttonType", PrimitiveTypeNode.int_(),
+                             default_value="0"),
+            FunctionNode.Arg("initialButtonState", PrimitiveTypeNode.int_(),
+                             default_value="0")
+        ]
+    )
+    """
+    def setMouseCallback(
+        windowName: str,
+        onMouse: Callback[[int, int, int, int, Any | None], None],
+        param: Any | None = ...
+    ) -> None: ...
+    """
+    root.add_function(
+        "setMouseCallback",
+        [
+            FunctionNode.Arg("windowName", PrimitiveTypeNode.str_()),
+            FunctionNode.Arg(
+                "onMouse",
+                CallableTypeNode("MouseCallback", [
+                    PrimitiveTypeNode.int_(),
+                    PrimitiveTypeNode.int_(),
+                    PrimitiveTypeNode.int_(),
+                    PrimitiveTypeNode.int_(),
+                    OptionalTypeNode(AnyTypeNode("void*"))
+                ])
+            ),
+            FunctionNode.Arg("param", OptionalTypeNode(AnyTypeNode("void*")),
+                             default_value="None")
+        ]
+    )
+
+
+def refine_dnn_module(root: NamespaceNode) -> None:
+    if "dnn" not in root.namespaces:
+        return
+    dnn_module = root.namespaces["dnn"]
+
+    """
+    class LayerProtocol(Protocol):
+        def __init__(
+            self, params: dict[str, DictValue],
+            blobs: typing.Sequence[cv2.typing.MatLike]
+        ) -> None: ...
+
+        def getMemoryShapes(
+            self, inputs: typing.Sequence[typing.Sequence[int]]
+        ) -> typing.Sequence[typing.Sequence[int]]: ...
+
+        def forward(
+            self, inputs: typing.Sequence[cv2.typing.MatLike]
+        ) -> typing.Sequence[cv2.typing.MatLike]: ...
+    """
+    layer_proto = ProtocolClassNode("LayerProtocol", dnn_module)
+    layer_proto.add_function(
+        "__init__",
+        arguments=[
+            FunctionNode.Arg(
+                "params",
+                DictTypeNode(
+                    "LayerParams", PrimitiveTypeNode.str_(),
+                    create_type_node("cv::dnn::DictValue")
+                )
+            ),
+            FunctionNode.Arg("blobs", create_type_node("vector<cv::Mat>"))
+        ]
+    )
+    layer_proto.add_function(
+        "getMemoryShapes",
+        arguments=[
+            FunctionNode.Arg("inputs",
+                             create_type_node("vector<vector<int>>"))
+        ],
+        return_type=FunctionNode.RetType(
+            create_type_node("vector<vector<int>>")
+        )
+    )
+    layer_proto.add_function(
+        "forward",
+        arguments=[
+            FunctionNode.Arg("inputs", create_type_node("vector<cv::Mat>"))
+        ],
+        return_type=FunctionNode.RetType(create_type_node("vector<cv::Mat>"))
+    )
+
+    """
+    def dnn_registerLayer(layerTypeName: str,
+                          layerClass: typing.Type[LayerProtocol]) -> None: ...
+    """
+    root.add_function(
+        "dnn_registerLayer",
+        arguments=[
+            FunctionNode.Arg("layerTypeName", PrimitiveTypeNode.str_()),
+            FunctionNode.Arg(
+                "layerClass",
+                ClassTypeNode(ASTNodeTypeNode(
+                    layer_proto.export_name, f"dnn.{layer_proto.export_name}"
+                ))
+            )
+        ]
+    )
+
+    """
+    def dnn_unregisterLayer(layerTypeName: str) -> None: ...
+    """
+    root.add_function(
+        "dnn_unregisterLayer",
+        arguments=[
+            FunctionNode.Arg("layerTypeName", PrimitiveTypeNode.str_())
+        ]
+    )
+
+
+def _trim_class_name_from_argument_types(
+    overloads: Iterable[FunctionNode.Overload],
+    class_name: str
+) -> None:
+    separator = f"{class_name}_"
+    for overload in overloads:
+        for arg in [arg for arg in overload.arguments
+                    if arg.type_node is not None]:
+            ast_node = cast(ASTNodeTypeNode, arg.type_node)
+            if class_name in ast_node.ctype_name:
+                fixed_name = ast_node._typename.split(separator)[-1]
+                ast_node._typename = fixed_name
+
+
+def _find_argument_index(arguments: Sequence[FunctionNode.Arg],
+                         name: str) -> int:
     for i, arg in enumerate(arguments):
         if arg.name == name:
             return i
@@ -46,3 +326,12 @@ def _find_argument_index(arguments: Sequence[FunctionNode.Arg], name: str) -> in
     SymbolName(("cv", ), (), "resize"): make_optional_arg("dsize"),
     SymbolName(("cv", ), (), "calcHist"): make_optional_arg("mask"),
 }
+
+ERROR_CLASS_PROPERTIES = (
+    ClassProperty("code", PrimitiveTypeNode.int_(), False),
+    ClassProperty("err", PrimitiveTypeNode.str_(), False),
+    ClassProperty("file", PrimitiveTypeNode.str_(), False),
+    ClassProperty("func", PrimitiveTypeNode.str_(), False),
+    ClassProperty("line", PrimitiveTypeNode.int_(), False),
+    ClassProperty("msg", PrimitiveTypeNode.str_(), False),
+)
diff --git a/modules/python/src2/typing_stubs_generation/ast_utils.py b/modules/python/src2/typing_stubs_generation/ast_utils.py
index 4cdf80726022..e8ef52d19ae0 100644
--- a/modules/python/src2/typing_stubs_generation/ast_utils.py
+++ b/modules/python/src2/typing_stubs_generation/ast_utils.py
@@ -1,5 +1,5 @@
 from typing import (NamedTuple, Sequence, Tuple, Union, List,
-                    Dict, Callable, Optional)
+                    Dict, Callable, Optional, Generator, cast)
 import keyword
 
 from .nodes import (ASTNode, NamespaceNode, ClassNode, FunctionNode,
@@ -204,9 +204,7 @@ def prepare_overload_arguments_and_return_type(variant):
                 outlist = variant.py_outlist
             for _, argno in outlist:
                 assert argno >= 0, \
-                    "Logic Error! Outlist contains function return type: {}".format(
-                        outlist
-                    )
+                    f"Logic Error! Outlist contains function return type: {outlist}"
 
                 ret_types.append(create_type_node(variant.args[argno].tp))
 
@@ -379,7 +377,7 @@ def get_enclosing_namespace(
                 node.full_export_name, node.native_name
             )
         if class_node_callback:
-            class_node_callback(parent_node)
+            class_node_callback(cast(ClassNode, parent_node))
         parent_node = parent_node.parent
     return parent_node
 
@@ -395,15 +393,44 @@ def get_enum_module_and_export_name(enum_node: EnumerationNode) -> Tuple[str, st
     Returns:
         Tuple[str, str]: a pair of enum export name and its full module name.
     """
+    enum_export_name = enum_node.export_name
+
     def update_full_export_name(class_node: ClassNode) -> None:
         nonlocal enum_export_name
         enum_export_name = class_node.export_name + "_" + enum_export_name
 
-    enum_export_name = enum_node.export_name
-    namespace_node = get_enclosing_namespace(enum_node, update_full_export_name)
+    namespace_node = get_enclosing_namespace(enum_node,
+                                             update_full_export_name)
     return enum_export_name, namespace_node.full_export_name
 
 
+def for_each_class(
+    node: Union[NamespaceNode, ClassNode]
+) -> Generator[ClassNode, None, None]:
+    for cls in node.classes.values():
+        yield cls
+        if len(cls.classes):
+            yield from for_each_class(cls)
+
+
+def for_each_function(
+    node: Union[NamespaceNode, ClassNode],
+    traverse_class_nodes: bool = True
+) -> Generator[FunctionNode, None, None]:
+    yield from node.functions.values()
+    if traverse_class_nodes:
+        for cls in for_each_class(node):
+            yield from for_each_function(cls)
+
+
+def for_each_function_overload(
+    node: Union[NamespaceNode, ClassNode],
+    traverse_class_nodes: bool = True
+) -> Generator[FunctionNode.Overload, None, None]:
+    for func in for_each_function(node, traverse_class_nodes):
+        yield from func.overloads
+
+
 if __name__ == '__main__':
     import doctest
     doctest.testmod()
diff --git a/modules/python/src2/typing_stubs_generation/generation.py b/modules/python/src2/typing_stubs_generation/generation.py
index f89200e0ed28..563c09cc87ae 100644
--- a/modules/python/src2/typing_stubs_generation/generation.py
+++ b/modules/python/src2/typing_stubs_generation/generation.py
@@ -3,17 +3,21 @@
 from io import StringIO
 from pathlib import Path
 import re
-from typing import (Generator, Type, Callable, NamedTuple, Union, Set, Dict,
+from typing import (Callable, NamedTuple, Union, Set, Dict,
                     Collection, Tuple, List)
 import warnings
 
-from .ast_utils import get_enclosing_namespace, get_enum_module_and_export_name
+from .ast_utils import (get_enclosing_namespace,
+                        get_enum_module_and_export_name,
+                        for_each_function_overload,
+                        for_each_class)
 
 from .predefined_types import PREDEFINED_TYPES
 from .api_refinement import apply_manual_api_refinement
 
-from .nodes import (ASTNode, ASTNodeType, NamespaceNode, ClassNode, FunctionNode,
-                    EnumerationNode, ConstantNode)
+from .nodes import (ASTNode, ASTNodeType, NamespaceNode, ClassNode,
+                    FunctionNode, EnumerationNode, ConstantNode,
+                    ProtocolClassNode)
 
 from .nodes.type_node import (TypeNode, AliasTypeNode, AliasRefTypeNode,
                               AggregatedTypeNode, ASTNodeTypeNode,
@@ -98,21 +102,24 @@ def _generate_typing_stubs(root: NamespaceNode, output_path: Path) -> None:
 
     output_stream = StringIO()
 
+    # Add empty __all__ dunder on top of the module
+    output_stream.write("__all__: list[str] = []\n\n")
+
     # Write required imports at the top of file
     _write_required_imports(required_imports, output_stream)
 
     _write_reexported_symbols_section(root, output_stream)
 
-    # Write constants section, because constants don't impose any dependencies
-    _generate_section_stub(StubSection("# Constants", ConstantNode), root,
-                           output_stream, 0)
     # NOTE: Enumerations require special handling, because all enumeration
     # constants are exposed as module attributes
-    has_enums = _generate_section_stub(StubSection("# Enumerations", EnumerationNode),
-                                       root, output_stream, 0)
+    has_enums = _generate_section_stub(
+        StubSection("# Enumerations", ASTNodeType.Enumeration), root,
+        output_stream, 0
+    )
     # Collect all enums from class level and export them to module level
     for class_node in root.classes.values():
-        if _generate_enums_from_classes_tree(class_node, output_stream, indent=0):
+        if _generate_enums_from_classes_tree(class_node, output_stream,
+                                             indent=0):
             has_enums = True
     # 2 empty lines between enum and classes definitions
     if has_enums:
@@ -130,14 +137,15 @@ def _generate_typing_stubs(root: NamespaceNode, output_path: Path) -> None:
 
 class StubSection(NamedTuple):
     name: str
-    node_type: Type[ASTNode]
+    node_type: ASTNodeType
 
 
 STUB_SECTIONS = (
-    StubSection("# Constants", ConstantNode),
-    # StubSection("# Enumerations", EnumerationNode), # Skipped for now (special rules)
-    StubSection("# Classes", ClassNode),
-    StubSection("# Functions", FunctionNode)
+    StubSection("# Constants", ASTNodeType.Constant),
+    # Enumerations are skipped due to special handling rules
+    # StubSection("# Enumerations", ASTNodeType.Enumeration),
+    StubSection("# Classes", ASTNodeType.Class),
+    StubSection("# Functions", ASTNodeType.Function)
 )
 
 
@@ -246,9 +254,9 @@ def create(cls) -> Object: ...
             else:
                 bases.append(base.export_name)
 
-        inheritance_str = "({})".format(
-            ', '.join(bases)
-        )
+        inheritance_str = f"({', '.join(bases)})"
+    elif isinstance(class_node, ProtocolClassNode):
+        inheritance_str = "(Protocol)"
     else:
         inheritance_str = ""
 
@@ -317,6 +325,10 @@ def write_constant_to_stream(export_name: str) -> None:
     export_name = extra_export_prefix + constant_node.export_name
     write_constant_to_stream(export_name)
     if generate_uppercase_version:
+        # Handle Python "magic" constants like __version__
+        if re.match(r"^__.*__$", export_name) is not None:
+            return export_name,
+
         uppercase_name = re.sub(r"([a-z])([A-Z])", r"\1_\2", export_name).upper()
         if export_name != uppercase_name:
             write_constant_to_stream(uppercase_name)
@@ -432,7 +444,7 @@ def _generate_function_stub(function_node: FunctionNode,
     elif function_node.is_static:
         decorators.append(" " * indent + "@staticmethod")
     if len(function_node.overloads) > 1:
-        decorators.append(" " * indent + "@typing.overload")
+        decorators.append(" " * indent + "@_typing.overload")
 
     function_module = get_enclosing_namespace(function_node)
     function_module_name = function_module.full_export_name
@@ -535,35 +547,12 @@ def check_overload_presence(node: Union[NamespaceNode, ClassNode]) -> bool:
             otherwise.
     """
     for func_node in node.functions.values():
-        if len(func_node.overloads):
+        if len(func_node.overloads) > 1:
             return True
     return False
 
 
-def _for_each_class(node: Union[NamespaceNode, ClassNode]) \
-        -> Generator[ClassNode, None, None]:
-    for cls in node.classes.values():
-        yield cls
-        if len(cls.classes):
-            yield from _for_each_class(cls)
-
-
-def _for_each_function(node: Union[NamespaceNode, ClassNode]) \
-        -> Generator[FunctionNode, None, None]:
-    for func in node.functions.values():
-        yield func
-    for cls in node.classes.values():
-        yield from _for_each_function(cls)
-
-
-def _for_each_function_overload(node: Union[NamespaceNode, ClassNode]) \
-        -> Generator[FunctionNode.Overload, None, None]:
-    for func in _for_each_function(node):
-        for overload in func.overloads:
-            yield overload
-
-
-def _collect_required_imports(root: NamespaceNode) -> Set[str]:
+def _collect_required_imports(root: NamespaceNode) -> Collection[str]:
     """Collects all imports required for classes and functions typing stubs
     declarations.
 
@@ -571,8 +560,8 @@ def _collect_required_imports(root: NamespaceNode) -> Set[str]:
         root (NamespaceNode): Namespace node to collect imports for
 
     Returns:
-        Set[str]: Collection of unique `import smth` statements required for
-        classes and function declarations of `root` node.
+        Collection[str]: Collection of unique `import smth` statements required
+        for classes and function declarations of `root` node.
     """
 
     def _add_required_usage_imports(type_node: TypeNode, imports: Set[str]):
@@ -585,10 +574,11 @@ def _add_required_usage_imports(type_node: TypeNode, imports: Set[str]):
     has_overload = check_overload_presence(root)
     # if there is no module-level functions with overload, check its presence
     # during class traversing, including their inner-classes
-    for cls in _for_each_class(root):
+    has_protocol = False
+    for cls in for_each_class(root):
         if not has_overload and check_overload_presence(cls):
             has_overload = True
-            required_imports.add("import typing")
+            required_imports.add("import typing as _typing")
         # Add required imports for class properties
         for prop in cls.properties:
             _add_required_usage_imports(prop.type_node, required_imports)
@@ -599,12 +589,15 @@ def _add_required_usage_imports(type_node: TypeNode, imports: Set[str]):
                 required_imports.add(
                     "import " + base_namespace.full_export_name
                 )
+        if isinstance(cls, ProtocolClassNode):
+            has_protocol = True
 
     if has_overload:
-        required_imports.add("import typing")
+        required_imports.add("import typing as _typing")
     # Importing modules required to resolve functions arguments
-    for overload in _for_each_function_overload(root):
-        for arg in filter(lambda a: a.type_node is not None, overload.arguments):
+    for overload in for_each_function_overload(root):
+        for arg in filter(lambda a: a.type_node is not None,
+                          overload.arguments):
             _add_required_usage_imports(arg.type_node, required_imports)  # type: ignore
         if overload.return_type is not None:
             _add_required_usage_imports(overload.return_type.type_node,
@@ -614,20 +607,42 @@ def _add_required_usage_imports(type_node: TypeNode, imports: Set[str]):
     if root_import in required_imports:
         required_imports.remove(root_import)
 
-    return required_imports
+    if has_protocol:
+        required_imports.add("import sys")
+    ordered_required_imports = sorted(required_imports)
+
+    # Protocol import always goes as last import statement
+    if has_protocol:
+        ordered_required_imports.append(
+            """if sys.version_info >= (3, 8):
+    from typing import Protocol
+else:
+    from typing_extensions import Protocol"""
+        )
+
+    return ordered_required_imports
 
 
 def _populate_reexported_symbols(root: NamespaceNode) -> None:
     # Re-export all submodules to allow referencing symbols in submodules
     # without submodule import. Example:
     # `cv2.aruco.ArucoDetector` should be accessible without `import cv2.aruco`
-    for submodule in root.namespaces.values():
-        root.reexported_submodules.append(submodule.export_name)
+    def _reexport_submodule(ns: NamespaceNode) -> None:
+        for submodule in ns.namespaces.values():
+            ns.reexported_submodules.append(submodule.export_name)
+            _reexport_submodule(submodule)
 
-    # Special cases, symbols defined in possible pure Python submodules should be
+    _reexport_submodule(root)
+
+    root.reexported_submodules.append("typing")
+
+    # Special cases, symbols defined in possible pure Python submodules
+    # should be
     root.reexported_submodules_symbols["mat_wrapper"].append("Mat")
 
-def _write_reexported_symbols_section(module: NamespaceNode, output_stream: StringIO) -> None:
+
+def _write_reexported_symbols_section(module: NamespaceNode,
+                                      output_stream: StringIO) -> None:
     """Write re-export section for the given module.
 
     Re-export statements have from `from module_name import smth as smth`.
@@ -674,7 +689,7 @@ def _write_required_imports(required_imports: Collection[str],
         output_stream (StringIO): Output stream for import statements.
     """
 
-    for required_import in sorted(required_imports):
+    for required_import in required_imports:
         output_stream.write(required_import)
         output_stream.write("\n")
     if len(required_imports):
@@ -722,10 +737,10 @@ def create_alias_for_enum_node(enum_node_alias: AliasTypeNode) -> ConditionalAli
         )
         return ConditionalAliasTypeNode(
             enum_export_name,
-            "typing.TYPE_CHECKING",
+            "_typing.TYPE_CHECKING",
             positive_branch_type=enum_node_alias,
             negative_branch_type=PrimitiveTypeNode.int_(enum_export_name),
-            condition_required_imports=("import typing", )
+            condition_required_imports=("import typing as _typing", )
         )
 
     def register_alias(alias_node: AliasTypeNode) -> None:
@@ -811,8 +826,8 @@ def register_alias(alias_node: AliasTypeNode) -> None:
 
 
 NODE_TYPE_TO_STUB_GENERATOR = {
-    ClassNode: _generate_class_stub,
-    ConstantNode: _generate_constant_stub,
-    EnumerationNode: _generate_enumeration_stub,
-    FunctionNode: _generate_function_stub
+    ASTNodeType.Class: _generate_class_stub,
+    ASTNodeType.Constant: _generate_constant_stub,
+    ASTNodeType.Enumeration: _generate_enumeration_stub,
+    ASTNodeType.Function: _generate_function_stub
 }
diff --git a/modules/python/src2/typing_stubs_generation/nodes/__init__.py b/modules/python/src2/typing_stubs_generation/nodes/__init__.py
index 0ee1df93d901..a2dbc499645a 100644
--- a/modules/python/src2/typing_stubs_generation/nodes/__init__.py
+++ b/modules/python/src2/typing_stubs_generation/nodes/__init__.py
@@ -1,11 +1,12 @@
 from .node import ASTNode, ASTNodeType
 from .namespace_node import NamespaceNode
-from .class_node import ClassNode, ClassProperty
+from .class_node import ClassNode, ClassProperty, ProtocolClassNode
 from .function_node import FunctionNode
 from .enumeration_node import EnumerationNode
 from .constant_node import ConstantNode
 from .type_node import (
     TypeNode, OptionalTypeNode, UnionTypeNode, NoneTypeNode, TupleTypeNode,
     ASTNodeTypeNode, AliasTypeNode, SequenceTypeNode, AnyTypeNode,
-    AggregatedTypeNode, NDArrayTypeNode, AliasRefTypeNode,
+    AggregatedTypeNode, NDArrayTypeNode, AliasRefTypeNode, PrimitiveTypeNode,
+    CallableTypeNode, DictTypeNode, ClassTypeNode
 )
diff --git a/modules/python/src2/typing_stubs_generation/nodes/class_node.py b/modules/python/src2/typing_stubs_generation/nodes/class_node.py
index b3d394786f9f..78bd01d61fec 100644
--- a/modules/python/src2/typing_stubs_generation/nodes/class_node.py
+++ b/modules/python/src2/typing_stubs_generation/nodes/class_node.py
@@ -63,8 +63,9 @@ def weight(self) -> int:
         return 1 + sum(base.weight for base in self.bases)
 
     @property
-    def children_types(self) -> Tuple[Type[ASTNode], ...]:
-        return (ClassNode, FunctionNode, EnumerationNode, ConstantNode)
+    def children_types(self) -> Tuple[ASTNodeType, ...]:
+        return (ASTNodeType.Class, ASTNodeType.Function,
+                ASTNodeType.Enumeration, ASTNodeType.Constant)
 
     @property
     def node_type(self) -> ASTNodeType:
@@ -72,19 +73,19 @@ def node_type(self) -> ASTNodeType:
 
     @property
     def classes(self) -> Dict[str, "ClassNode"]:
-        return self._children[ClassNode]
+        return self._children[ASTNodeType.Class]
 
     @property
     def functions(self) -> Dict[str, FunctionNode]:
-        return self._children[FunctionNode]
+        return self._children[ASTNodeType.Function]
 
     @property
     def enumerations(self) -> Dict[str, EnumerationNode]:
-        return self._children[EnumerationNode]
+        return self._children[ASTNodeType.Enumeration]
 
     @property
     def constants(self) -> Dict[str, ConstantNode]:
-        return self._children[ConstantNode]
+        return self._children[ASTNodeType.Constant]
 
     def add_class(self, name: str,
                   bases: Sequence["weakref.ProxyType[ClassNode]"] = (),
@@ -179,3 +180,11 @@ def resolve_type_nodes(self, root: ASTNode) -> None:
                     self.full_export_name, root.full_export_name, errors
                 )
             )
+
+
+class ProtocolClassNode(ClassNode):
+    def __init__(self, name: str, parent: Optional[ASTNode] = None,
+                 export_name: Optional[str] = None,
+                 properties: Sequence[ClassProperty] = ()) -> None:
+        super().__init__(name, parent, export_name, bases=(),
+                         properties=properties)
diff --git a/modules/python/src2/typing_stubs_generation/nodes/constant_node.py b/modules/python/src2/typing_stubs_generation/nodes/constant_node.py
index 63abd8bfb4ec..ad18b80b50f0 100644
--- a/modules/python/src2/typing_stubs_generation/nodes/constant_node.py
+++ b/modules/python/src2/typing_stubs_generation/nodes/constant_node.py
@@ -1,4 +1,4 @@
-from typing import Type, Optional, Tuple
+from typing import Optional, Tuple
 
 from .node import ASTNode, ASTNodeType
 
@@ -11,9 +11,10 @@ def __init__(self, name: str, value: str,
                  export_name: Optional[str] = None) -> None:
         super().__init__(name, parent, export_name)
         self.value = value
+        self._value_type = "int"
 
     @property
-    def children_types(self) -> Tuple[Type[ASTNode], ...]:
+    def children_types(self) -> Tuple[ASTNodeType, ...]:
         return ()
 
     @property
@@ -22,7 +23,7 @@ def node_type(self) -> ASTNodeType:
 
     @property
     def value_type(self) -> str:
-        return 'int'
+        return self._value_type
 
     def __str__(self) -> str:
         return "Constant('{}' exported as '{}': {})".format(
diff --git a/modules/python/src2/typing_stubs_generation/nodes/enumeration_node.py b/modules/python/src2/typing_stubs_generation/nodes/enumeration_node.py
index 249f29db1c3b..12b996d1a100 100644
--- a/modules/python/src2/typing_stubs_generation/nodes/enumeration_node.py
+++ b/modules/python/src2/typing_stubs_generation/nodes/enumeration_node.py
@@ -18,8 +18,8 @@ def __init__(self, name: str, is_scoped: bool = False,
         self.is_scoped = is_scoped
 
     @property
-    def children_types(self) -> Tuple[Type[ASTNode], ...]:
-        return (ConstantNode, )
+    def children_types(self) -> Tuple[ASTNodeType, ...]:
+        return (ASTNodeType.Constant, )
 
     @property
     def node_type(self) -> ASTNodeType:
@@ -27,7 +27,7 @@ def node_type(self) -> ASTNodeType:
 
     @property
     def constants(self) -> Dict[str, ConstantNode]:
-        return self._children[ConstantNode]
+        return self._children[ASTNodeType.Constant]
 
     def add_constant(self, name: str, value: str) -> ConstantNode:
         return self._add_child(ConstantNode, name, value=value)
diff --git a/modules/python/src2/typing_stubs_generation/nodes/function_node.py b/modules/python/src2/typing_stubs_generation/nodes/function_node.py
index 4ebb90633e01..568676f69a7c 100644
--- a/modules/python/src2/typing_stubs_generation/nodes/function_node.py
+++ b/modules/python/src2/typing_stubs_generation/nodes/function_node.py
@@ -1,4 +1,4 @@
-from typing import NamedTuple, Sequence, Type, Optional, Tuple, List
+from typing import NamedTuple, Sequence, Optional, Tuple, List
 
 from .node import ASTNode, ASTNodeType
 from .type_node import TypeNode, NoneTypeNode, TypeResolutionError
@@ -98,7 +98,7 @@ def node_type(self) -> ASTNodeType:
         return ASTNodeType.Function
 
     @property
-    def children_types(self) -> Tuple[Type[ASTNode], ...]:
+    def children_types(self) -> Tuple[ASTNodeType, ...]:
         return ()
 
     def add_overload(self, arguments: Sequence["FunctionNode.Arg"] = (),
diff --git a/modules/python/src2/typing_stubs_generation/nodes/namespace_node.py b/modules/python/src2/typing_stubs_generation/nodes/namespace_node.py
index 8d0d04b5a587..445ef77b7d93 100644
--- a/modules/python/src2/typing_stubs_generation/nodes/namespace_node.py
+++ b/modules/python/src2/typing_stubs_generation/nodes/namespace_node.py
@@ -1,7 +1,7 @@
 import itertools
 import weakref
 from collections import defaultdict
-from typing import Dict, List, Optional, Sequence, Tuple, Type
+from typing import Dict, List, Optional, Sequence, Tuple
 
 from .class_node import ClassNode, ClassProperty
 from .constant_node import ConstantNode
@@ -33,29 +33,29 @@ def node_type(self) -> ASTNodeType:
         return ASTNodeType.Namespace
 
     @property
-    def children_types(self) -> Tuple[Type[ASTNode], ...]:
-        return (NamespaceNode, ClassNode, FunctionNode,
-                EnumerationNode, ConstantNode)
+    def children_types(self) -> Tuple[ASTNodeType, ...]:
+        return (ASTNodeType.Namespace, ASTNodeType.Class, ASTNodeType.Function,
+                ASTNodeType.Enumeration, ASTNodeType.Constant)
 
     @property
     def namespaces(self) -> Dict[str, "NamespaceNode"]:
-        return self._children[NamespaceNode]
+        return self._children[ASTNodeType.Namespace]
 
     @property
     def classes(self) -> Dict[str, ClassNode]:
-        return self._children[ClassNode]
+        return self._children[ASTNodeType.Class]
 
     @property
     def functions(self) -> Dict[str, FunctionNode]:
-        return self._children[FunctionNode]
+        return self._children[ASTNodeType.Function]
 
     @property
     def enumerations(self) -> Dict[str, EnumerationNode]:
-        return self._children[EnumerationNode]
+        return self._children[ASTNodeType.Enumeration]
 
     @property
     def constants(self) -> Dict[str, ConstantNode]:
-        return self._children[ConstantNode]
+        return self._children[ASTNodeType.Constant]
 
     def add_namespace(self, name: str) -> "NamespaceNode":
         return self._add_child(NamespaceNode, name)
diff --git a/modules/python/src2/typing_stubs_generation/nodes/node.py b/modules/python/src2/typing_stubs_generation/nodes/node.py
index 126f66801812..20efb4714526 100644
--- a/modules/python/src2/typing_stubs_generation/nodes/node.py
+++ b/modules/python/src2/typing_stubs_generation/nodes/node.py
@@ -70,22 +70,22 @@ def __init__(self, name: str, parent: Optional["ASTNode"] = None,
         self._parent: Optional["ASTNode"] = None
         self.parent = parent
         self.is_exported = True
-        self._children: DefaultDict[NodeType, NameToNode] = defaultdict(dict)
+        self._children: DefaultDict[ASTNodeType, NameToNode] = defaultdict(dict)
 
     def __str__(self) -> str:
         return "{}('{}' exported as '{}')".format(
-            type(self).__name__.replace("Node", ""), self.name, self.export_name
+            self.node_type.name, self.name, self.export_name
         )
 
     def __repr__(self) -> str:
         return str(self)
 
     @abc.abstractproperty
-    def children_types(self) -> Tuple[Type["ASTNode"], ...]:
+    def children_types(self) -> Tuple[ASTNodeType, ...]:
         """Set of ASTNode types that are allowed to be children of this node
 
         Returns:
-            Tuple[Type[ASTNode], ...]: Types of children nodes
+            Tuple[ASTNodeType, ...]: Types of children nodes
         """
         pass
 
@@ -99,6 +99,9 @@ def node_type(self) -> ASTNodeType:
         """
         pass
 
+    def node_type_name(self) -> str:
+        return f"{self.node_type.name}::{self.name}"
+
     @property
     def name(self) -> str:
         return self.__name
@@ -126,11 +129,11 @@ def parent(self, value: Optional["ASTNode"]) -> None:
             "but got: {}".format(type(value))
 
         if value is not None:
-            value.__check_child_before_add(type(self), self.name)
+            value.__check_child_before_add(self, self.name)
 
         # Detach from previous parent
         if self._parent is not None:
-            self._parent._children[type(self)].pop(self.name)
+            self._parent._children[self.node_type].pop(self.name)
 
         if value is None:
             self._parent = None
@@ -138,28 +141,26 @@ def parent(self, value: Optional["ASTNode"]) -> None:
 
         # Set a weak reference to a new parent and add self to its children
         self._parent = weakref.proxy(value)
-        value._children[type(self)][self.name] = self
+        value._children[self.node_type][self.name] = self
 
-    def __check_child_before_add(self, child_type: Type[ASTNodeSubtype],
+    def __check_child_before_add(self, child: ASTNodeSubtype,
                                  name: str) -> None:
-        assert len(self.children_types) > 0, \
-            "Trying to add child node '{}::{}' to node '{}::{}' " \
-            "that can't have children nodes".format(child_type.__name__, name,
-                                                    type(self).__name__,
-                                                    self.name)
-
-        assert child_type in self.children_types, \
-            "Trying to add child node '{}::{}' to node '{}::{}' " \
+        assert len(self.children_types) > 0, (
+            f"Trying to add child node '{child.node_type_name}' to node "
+            f"'{self.node_type_name}' that can't have children nodes"
+        )
+
+        assert child.node_type in self.children_types, \
+            "Trying to add child node '{}' to node '{}' " \
             "that supports only ({}) as its children types".format(
-                child_type.__name__, name, type(self).__name__, self.name,
-                ",".join(t.__name__ for t in self.children_types)
+                child.node_type_name, self.node_type_name,
+                ",".join(t.name for t in self.children_types)
             )
 
-        if self._find_child(child_type, name) is not None:
+        if self._find_child(child.node_type, name) is not None:
             raise ValueError(
-                "Node '{}::{}' already has a child '{}::{}'".format(
-                    type(self).__name__, self.name, child_type.__name__, name
-                )
+                f"Node '{self.node_type_name}' already has a "
+                f"child '{child.node_type_name}'"
             )
 
     def _add_child(self, child_type: Type[ASTNodeSubtype], name: str,
@@ -180,15 +181,14 @@ def _add_child(self, child_type: Type[ASTNodeSubtype], name: str,
         Returns:
             ASTNodeSubtype: Created ASTNode
         """
-        self.__check_child_before_add(child_type, name)
         return child_type(name, parent=self, **kwargs)
 
-    def _find_child(self, child_type: Type[ASTNodeSubtype],
+    def _find_child(self, child_type: ASTNodeType,
                     name: str) -> Optional[ASTNodeSubtype]:
         """Looks for child node with the given type and name.
 
         Args:
-            child_type (Type[ASTNodeSubtype]): Type of the child node.
+            child_type (ASTNodeType): Type of the child node.
             name (str): Name of the child node.
 
         Returns:
diff --git a/modules/python/src2/typing_stubs_generation/nodes/type_node.py b/modules/python/src2/typing_stubs_generation/nodes/type_node.py
index 912adc6954ea..3f242e730e50 100644
--- a/modules/python/src2/typing_stubs_generation/nodes/type_node.py
+++ b/modules/python/src2/typing_stubs_generation/nodes/type_node.py
@@ -163,11 +163,11 @@ class AnyTypeNode(TypeNode):
     """
     @property
     def typename(self) -> str:
-        return "typing.Any"
+        return "_typing.Any"
 
     @property
     def required_usage_imports(self) -> Generator[str, None, None]:
-        yield "import typing"
+        yield "import typing as _typing"
 
 
 class PrimitiveTypeNode(TypeNode):
@@ -395,9 +395,12 @@ def dict_(cls, ctype_name: str, key_type: TypeNode, value_type: TypeNode,
 
 class ConditionalAliasTypeNode(TypeNode):
     """Type node representing an alias protected by condition checked in runtime.
+    For typing-related conditions, prefer using typing.TYPE_CHECKING. For a full explanation, see:
+    https://github.com/opencv/opencv/pull/23927#discussion_r1256326835
+
     Example:
     ```python
-    if numpy.lib.NumpyVersion(numpy.__version__) > "1.20.0" and sys.version_info >= (3, 9)
+    if typing.TYPE_CHECKING
         NumPyArray = numpy.ndarray[typing.Any, numpy.dtype[numpy.generic]]
     else:
         NumPyArray = numpy.ndarray
@@ -407,10 +410,10 @@ class ConditionalAliasTypeNode(TypeNode):
 
     ConditionalAliasTypeNode(
         "NumPyArray",
-        'numpy.lib.NumpyVersion(numpy.__version__) > "1.20.0" and sys.version_info >= (3, 9)',
+        'typing.TYPE_CHECKING',
         NDArrayTypeNode("NumPyArray"),
         NDArrayTypeNode("NumPyArray", use_numpy_generics=False),
-        condition_required_imports=("import numpy", "import sys")
+        condition_required_imports=("import typing",)
     )
     ```
     """
@@ -468,14 +471,14 @@ def resolve(self, root: ASTNode):
     def numpy_array_(cls, ctype_name: str, export_name: Optional[str] = None,
                      shape: Optional[Tuple[int, ...]] = None,
                      dtype: Optional[str] = None):
+        """Type subscription is not possible in python 3.8 and older numpy versions."""
         return cls(
             ctype_name,
-            ('numpy.lib.NumpyVersion(numpy.__version__) > "1.20.0" '
-             'and sys.version_info >= (3, 9)'),
+            "_typing.TYPE_CHECKING",
             NDArrayTypeNode(ctype_name, shape, dtype),
             NDArrayTypeNode(ctype_name, shape, dtype,
                             use_numpy_generics=False),
-            condition_required_imports=("import numpy", "import sys")
+            condition_required_imports=("import typing as _typing",)
         )
 
 
@@ -496,14 +499,14 @@ def typename(self) -> str:
         if self._use_numpy_generics:
             # NOTE: Shape is not fully supported yet
             dtype = self.dtype if self.dtype is not None else "numpy.generic"
-            return f"numpy.ndarray[typing.Any, numpy.dtype[{dtype}]]"
+            return f"numpy.ndarray[_typing.Any, numpy.dtype[{dtype}]]"
         return "numpy.ndarray"
 
     @property
     def required_usage_imports(self) -> Generator[str, None, None]:
         yield "import numpy"
         # if self.shape is None:
-        yield "import typing"
+        yield "import typing as _typing"
 
 
 class ASTNodeTypeNode(TypeNode):
@@ -665,13 +668,13 @@ def relative_typename(self, module: str) -> str:
 
     @property
     def required_definition_imports(self) -> Generator[str, None, None]:
-        yield "import typing"
+        yield "import typing as _typing"
         yield from super().required_definition_imports
 
     @property
     def required_usage_imports(self) -> Generator[str, None, None]:
         if TypeNode.compatible_to_runtime_usage:
-            yield "import typing"
+            yield "import typing as _typing"
         yield from super().required_usage_imports
 
     @abc.abstractproperty
@@ -692,7 +695,7 @@ def __init__(self, ctype_name: str, item: TypeNode) -> None:
 
     @property
     def type_format(self) -> str:
-        return "typing.Sequence[{}]"
+        return "_typing.Sequence[{}]"
 
     @property
     def types_separator(self) -> str:
@@ -706,7 +709,7 @@ class TupleTypeNode(ContainerTypeNode):
     @property
     def type_format(self) -> str:
         if TypeNode.compatible_to_runtime_usage:
-            return "typing.Tuple[{}]"
+            return "_typing.Tuple[{}]"
         return "tuple[{}]"
 
     @property
@@ -720,7 +723,7 @@ class UnionTypeNode(ContainerTypeNode):
     @property
     def type_format(self) -> str:
         if TypeNode.compatible_to_runtime_usage:
-            return "typing.Union[{}]"
+            return "_typing.Union[{}]"
         return "{}"
 
     @property
@@ -740,7 +743,7 @@ def __init__(self, value: TypeNode) -> None:
     @property
     def type_format(self) -> str:
         if TypeNode.compatible_to_runtime_usage:
-            return "typing.Optional[{}]"
+            return "_typing.Optional[{}]"
         return "{} | None"
 
     @property
@@ -766,7 +769,7 @@ def value_type(self) -> TypeNode:
     @property
     def type_format(self) -> str:
         if TypeNode.compatible_to_runtime_usage:
-            return "typing.Dict[{}]"
+            return "_typing.Dict[{}]"
         return "dict[{}]"
 
     @property
@@ -807,35 +810,50 @@ def ret_type(self) -> TypeNode:
 
     @property
     def typename(self) -> str:
-        return 'typing.Callable[[{}], {}]'.format(
+        return '_typing.Callable[[{}], {}]'.format(
             ', '.join(arg.typename for arg in self.arg_types),
             self.ret_type.typename
         )
 
     @property
     def full_typename(self) -> str:
-        return 'typing.Callable[[{}], {}]'.format(
+        return '_typing.Callable[[{}], {}]'.format(
             ', '.join(arg.full_typename for arg in self.arg_types),
             self.ret_type.full_typename
         )
 
     def relative_typename(self, module: str) -> str:
-        return 'typing.Callable[[{}], {}]'.format(
+        return '_typing.Callable[[{}], {}]'.format(
             ', '.join(arg.relative_typename(module) for arg in self.arg_types),
             self.ret_type.relative_typename(module)
         )
 
     @property
     def required_definition_imports(self) -> Generator[str, None, None]:
-        yield "import typing"
+        yield "import typing as _typing"
         yield from super().required_definition_imports
 
     @property
     def required_usage_imports(self) -> Generator[str, None, None]:
-        yield "import typing"
+        yield "import typing as _typing"
         yield from super().required_usage_imports
 
 
+class ClassTypeNode(ContainerTypeNode):
+    """Type node representing types themselves (refer to typing.Type)
+    """
+    def __init__(self, value: TypeNode) -> None:
+        super().__init__(value.ctype_name, (value,))
+
+    @property
+    def type_format(self) -> str:
+        return "_typing.Type[{}]"
+
+    @property
+    def types_separator(self) -> str:
+        return ", "
+
+
 def _resolve_symbol(root: Optional[ASTNode], full_symbol_name: str) -> Optional[ASTNode]:
     """Searches for a symbol with the given full export name in the AST
     starting from the `root`.
diff --git a/modules/python/src2/typing_stubs_generation/predefined_types.py b/modules/python/src2/typing_stubs_generation/predefined_types.py
index fe9a37a45efb..f5ba0bc29ec2 100644
--- a/modules/python/src2/typing_stubs_generation/predefined_types.py
+++ b/modules/python/src2/typing_stubs_generation/predefined_types.py
@@ -22,6 +22,10 @@
     PrimitiveTypeNode.int_("uchar"),
     PrimitiveTypeNode.int_("unsigned"),
     PrimitiveTypeNode.int_("int64"),
+    PrimitiveTypeNode.int_("uint8_t"),
+    PrimitiveTypeNode.int_("int8_t"),
+    PrimitiveTypeNode.int_("int32_t"),
+    PrimitiveTypeNode.int_("uint32_t"),
     PrimitiveTypeNode.int_("size_t"),
     PrimitiveTypeNode.float_("float"),
     PrimitiveTypeNode.float_("double"),
@@ -30,7 +34,10 @@
     PrimitiveTypeNode.str_("char"),
     PrimitiveTypeNode.str_("String"),
     PrimitiveTypeNode.str_("c_string"),
-    ConditionalAliasTypeNode.numpy_array_("NumPyArrayGeneric"),
+    ConditionalAliasTypeNode.numpy_array_(
+        "NumPyArrayNumeric",
+        dtype="numpy.integer[_typing.Any] | numpy.floating[_typing.Any]"
+    ),
     ConditionalAliasTypeNode.numpy_array_("NumPyArrayFloat32", dtype="numpy.float32"),
     ConditionalAliasTypeNode.numpy_array_("NumPyArrayFloat64", dtype="numpy.float64"),
     NoneTypeNode("void"),
@@ -38,7 +45,7 @@
     AliasTypeNode.union_(
         "Mat",
         items=(ASTNodeTypeNode("Mat", module_name="cv2.mat_wrapper"),
-               AliasRefTypeNode("NumPyArrayGeneric")),
+               AliasRefTypeNode("NumPyArrayNumeric")),
         export_name="MatLike"
     ),
     AliasTypeNode.sequence_("MatShape", PrimitiveTypeNode.int_()),
@@ -67,13 +74,15 @@
                             doc="Required length is 4"),
     AliasTypeNode.sequence_("Rect2i", PrimitiveTypeNode.int_(),
                             doc="Required length is 4"),
+    AliasTypeNode.sequence_("Rect2f", PrimitiveTypeNode.float_(),
+                            doc="Required length is 4"),
     AliasTypeNode.sequence_("Rect2d", PrimitiveTypeNode.float_(),
                             doc="Required length is 4"),
     AliasTypeNode.dict_("Moments", PrimitiveTypeNode.str_("Moments::key"),
                         PrimitiveTypeNode.float_("Moments::value")),
     AliasTypeNode.tuple_("RotatedRect",
                          items=(AliasRefTypeNode("Point2f"),
-                                AliasRefTypeNode("Size"),
+                                AliasRefTypeNode("Size2f"),
                                 PrimitiveTypeNode.float_()),
                          doc="Any type providing sequence protocol is supported"),
     AliasTypeNode.tuple_("TermCriteria",
diff --git a/modules/python/test/test_cuda.py b/modules/python/test/test_cuda.py
index 851a23e88021..c886342832f5 100644
--- a/modules/python/test/test_cuda.py
+++ b/modules/python/test/test_cuda.py
@@ -70,6 +70,74 @@ def test_cuda_release(self):
         self.assertTrue(cuMat.step == 0)
         self.assertTrue(cuMat.size() == (0, 0))
 
+    def test_cuda_convertTo(self):
+        # setup
+        npMat_8UC4 = (np.random.random((128, 128, 4)) * 255).astype(np.uint8)
+        npMat_32FC4 = npMat_8UC4.astype(np.single)
+        new_type = cv.CV_32FC4
+
+        # sync
+        # in/out
+        cuMat_8UC4 = cv.cuda_GpuMat(npMat_8UC4)
+        cuMat_32FC4 = cv.cuda_GpuMat(cuMat_8UC4.size(), new_type)
+        cuMat_32FC4_out = cuMat_8UC4.convertTo(new_type, cuMat_32FC4)
+        self.assertTrue(cuMat_32FC4.cudaPtr() == cuMat_32FC4_out.cudaPtr())
+        npMat_32FC4_out = cuMat_32FC4.download()
+        self.assertTrue(np.array_equal(npMat_32FC4, npMat_32FC4_out))
+        # out
+        cuMat_32FC4_out = cuMat_8UC4.convertTo(new_type)
+        npMat_32FC4_out = cuMat_32FC4.download()
+        self.assertTrue(np.array_equal(npMat_32FC4, npMat_32FC4_out))
+
+        # async
+        stream = cv.cuda.Stream()
+        cuMat_32FC4 = cv.cuda_GpuMat(cuMat_8UC4.size(), new_type)
+        cuMat_32FC4_out = cuMat_8UC4.convertTo(new_type, cuMat_32FC4)
+        # in/out
+        cuMat_32FC4_out = cuMat_8UC4.convertTo(new_type, 1, 0, stream, cuMat_32FC4)
+        self.assertTrue(cuMat_32FC4.cudaPtr() == cuMat_32FC4_out.cudaPtr())
+        npMat_32FC4_out = cuMat_32FC4.download(stream)
+        stream.waitForCompletion()
+        self.assertTrue(np.array_equal(npMat_32FC4, npMat_32FC4_out))
+        # out
+        cuMat_32FC4_out = cuMat_8UC4.convertTo(new_type, 1, 0, stream)
+        npMat_32FC4_out = cuMat_32FC4.download(stream)
+        stream.waitForCompletion()
+        self.assertTrue(np.array_equal(npMat_32FC4, npMat_32FC4_out))
+
+    def test_cuda_copyTo(self):
+        # setup
+        npMat_8UC4 = (np.random.random((128, 128, 4)) * 255).astype(np.uint8)
+
+        # sync
+        # in/out
+        cuMat_8UC4 = cv.cuda_GpuMat(npMat_8UC4)
+        cuMat_8UC4_dst = cv.cuda_GpuMat(cuMat_8UC4.size(), cuMat_8UC4.type())
+        cuMat_8UC4_out = cuMat_8UC4.copyTo(cuMat_8UC4_dst)
+        self.assertTrue(cuMat_8UC4_out.cudaPtr() == cuMat_8UC4_dst.cudaPtr())
+        npMat_8UC4_out = cuMat_8UC4_out.download()
+        self.assertTrue(np.array_equal(npMat_8UC4, npMat_8UC4_out))
+        # out
+        cuMat_8UC4_out =  cuMat_8UC4.copyTo()
+        npMat_8UC4_out = cuMat_8UC4_out.download()
+        self.assertTrue(np.array_equal(npMat_8UC4, npMat_8UC4_out))
+
+        # async
+        stream = cv.cuda.Stream()
+        # in/out
+        cuMat_8UC4 = cv.cuda_GpuMat(npMat_8UC4)
+        cuMat_8UC4_dst = cv.cuda_GpuMat(cuMat_8UC4.size(), cuMat_8UC4.type())
+        cuMat_8UC4_out = cuMat_8UC4.copyTo(cuMat_8UC4_dst, stream)
+        self.assertTrue(cuMat_8UC4_out.cudaPtr() == cuMat_8UC4_out.cudaPtr())
+        npMat_8UC4_out = cuMat_8UC4_dst.download(stream)
+        stream.waitForCompletion()
+        self.assertTrue(np.array_equal(npMat_8UC4, npMat_8UC4_out))
+        # out
+        cuMat_8UC4_out = cuMat_8UC4.copyTo(stream)
+        npMat_8UC4_out = cuMat_8UC4_out.download(stream)
+        stream.waitForCompletion()
+        self.assertTrue(np.array_equal(npMat_8UC4, npMat_8UC4_out))
+
     def test_cuda_denoising(self):
         self.assertEqual(True, hasattr(cv.cuda, 'fastNlMeansDenoising'))
         self.assertEqual(True, hasattr(cv.cuda, 'fastNlMeansDenoisingColored'))
diff --git a/modules/python/test/test_imread.py b/modules/python/test/test_imread.py
new file mode 100644
index 000000000000..b5f286d42696
--- /dev/null
+++ b/modules/python/test/test_imread.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+
+'''
+Test for imread
+'''
+
+# Python 2/3 compatibility
+from __future__ import print_function
+
+import cv2 as cv
+import numpy as np
+import sys
+
+from tests_common import NewOpenCVTests
+
+class imread_test(NewOpenCVTests):
+    def test_imread_to_buffer(self):
+        path = self.extraTestDataPath + '/cv/shared/lena.png'
+        ref = cv.imread(path)
+
+        img = np.zeros_like(ref)
+        cv.imread(path, img)
+        self.assertEqual(cv.norm(ref, img, cv.NORM_INF), 0.0)
+
+
+if __name__ == '__main__':
+    NewOpenCVTests.bootstrap()
diff --git a/modules/python/test/test_misc.py b/modules/python/test/test_misc.py
index ec7f44de0f4a..08ab04d53d9e 100644
--- a/modules/python/test/test_misc.py
+++ b/modules/python/test/test_misc.py
@@ -42,6 +42,93 @@ def get_conversion_error_msg(value, expected, actual):
 def get_no_exception_msg(value):
     return 'Exception is not risen for {} of type {}'.format(value, type(value).__name__)
 
+
+def rpad(src, dst_size, pad_value=0):
+    """Extend `src` up to `dst_size` with given value.
+
+    Args:
+        src (np.ndarray | tuple | list): 1d array like object to pad.
+        dst_size (_type_): Desired `src` size after padding.
+        pad_value (int, optional): Padding value. Defaults to 0.
+
+    Returns:
+        np.ndarray: 1d array with len == `dst_size`.
+    """
+    src = np.asarray(src)
+    if len(src.shape) != 1:
+        raise ValueError("Only 1d arrays are supported")
+
+    # Considering the meaning, it is desirable to use np.pad().
+    # However, the old numpy doesn't include the following fixes and cannot work as expected.
+    # So an alternative fix that combines np.append() and np.fill() is used.
+    # https://docs.scipy.org/doc/numpy-1.13.0/release.html#support-for-returning-arrays-of-arbitrary-dimensions-in-apply-along-axis
+
+    return np.append(src, np.full( dst_size - len(src), pad_value, dtype=src.dtype) )
+
+def get_ocv_arithm_op_table(apply_saturation=False):
+    def saturate(func):
+        def wrapped_func(x, y):
+            dst_dtype = x.dtype
+            if apply_saturation:
+                if np.issubdtype(x.dtype, np.integer):
+                    x = x.astype(np.int64)
+            # Apply padding or truncation for array-like `y` inputs
+            if not isinstance(y, (float, int)):
+                if len(y) > x.shape[-1]:
+                    y = y[:x.shape[-1]]
+                else:
+                    y = rpad(y, x.shape[-1], pad_value=0)
+
+            dst = func(x, y)
+            if apply_saturation:
+                min_val, max_val = get_limits(dst_dtype)
+                dst = np.clip(dst, min_val, max_val)
+            return dst.astype(dst_dtype)
+        return wrapped_func
+
+    @saturate
+    def subtract(x, y):
+        return x - y
+
+    @saturate
+    def add(x, y):
+        return x + y
+
+    @saturate
+    def divide(x, y):
+        if not isinstance(y, (int, float)):
+            dst_dtype = np.result_type(x, y)
+            y = np.array(y).astype(dst_dtype)
+            _, max_value = get_limits(dst_dtype)
+            y[y == 0] = max_value
+
+        # to compatible between python2 and python3, it calicurates with float.
+        # python2: int / int = int
+        # python3: int / int = float
+        dst = 1.0 * x / y
+
+        if np.issubdtype(x.dtype, np.integer):
+            dst = np.rint(dst)
+        return dst
+
+    @saturate
+    def multiply(x, y):
+        return x * y
+
+    @saturate
+    def absdiff(x, y):
+        res = np.abs(x - y)
+        return res
+
+    return {
+        cv.subtract: subtract,
+        cv.add: add,
+        cv.multiply: multiply,
+        cv.divide: divide,
+        cv.absdiff: absdiff
+    }
+
+
 class Bindings(NewOpenCVTests):
 
     def test_inheritance(self):
@@ -219,6 +306,25 @@ def test_InputArrayOfArrays(self):
         #res6 = cv.utils.dumpInputArray([a, b])
         #self.assertEqual(res6, "InputArrayOfArrays: empty()=false kind=0x00050000 flags=0x01050000 total(-1)=2 dims(-1)=1 size(-1)=2x1 type(0)=CV_32FC1 dims(0)=4 size(0)=[2 3 4 5]")
 
+    def test_unsupported_numpy_data_types_string_description(self):
+        for dtype in (object, str, np.complex128):
+            test_array = np.zeros((4, 4, 3), dtype=dtype)
+            msg = ".*type = {} is not supported".format(test_array.dtype)
+            if sys.version_info[0] < 3:
+                self.assertRaisesRegexp(
+                    Exception, msg, cv.utils.dumpInputArray, test_array
+                )
+            else:
+                self.assertRaisesRegex(
+                    Exception, msg, cv.utils.dumpInputArray, test_array
+                )
+
+    def test_numpy_writeable_flag_is_preserved(self):
+        array = np.zeros((10, 10, 1), dtype=np.uint8)
+        array.setflags(write=False)
+        with self.assertRaises(Exception):
+            cv.rectangle(array, (0, 0), (5, 5), (255), 2)
+
     def test_20968(self):
         pixel = np.uint8([[[40, 50, 200]]])
         _ = cv.cvtColor(pixel, cv.COLOR_RGB2BGR)  # should not raise exception
@@ -345,7 +451,7 @@ def test_parse_to_float_convertible(self):
         try_to_convert = partial(self._try_to_convert, cv.utils.dumpFloat)
         min_float, max_float = get_limits(ctypes.c_float)
         for convertible in (2, -13, 1.24, np.float32(32.45), float(32), np.double(12.23),
-                            np.float32(-12.3), np.float64(3.22), np.float_(-1.5), min_float,
+                            np.float32(-12.3), np.float64(3.22), min_float,
                             max_float, np.inf, -np.inf, float('Inf'), -float('Inf'),
                             np.double(np.inf), np.double(-np.inf), np.double(float('Inf')),
                             np.double(-float('Inf'))):
@@ -389,7 +495,7 @@ def test_parse_to_double_convertible(self):
         min_float, max_float = get_limits(ctypes.c_float)
         min_double, max_double = get_limits(ctypes.c_double)
         for convertible in (2, -13, 1.24, np.float32(32.45), float(2), np.double(12.23),
-                            np.float32(-12.3), np.float64(3.22), np.float_(-1.5), min_float,
+                            np.float32(-12.3), np.float64(3.22), min_float,
                             max_float, min_double, max_double, np.inf, -np.inf, float('Inf'),
                             -float('Inf'), np.double(np.inf), np.double(-np.inf),
                             np.double(float('Inf')), np.double(-float('Inf'))):
@@ -502,6 +608,14 @@ def test_wrap_rotated_rect(self):
         _, inter_pts = cv.rotatedRectangleIntersection(rect1, rect2)
         self.assertLess(np.max(np.abs(inter_pts.reshape(-1, 2) - pts)), 1e-4)
 
+    def test_result_rotated_rect_boundingRect2f(self):
+        center = (0, 0)
+        size = (10, 10)
+        angle = 0
+        gold_box = (-5.0, -5.0, 10.0, 10.0)
+        rect1 = cv.RotatedRect(center, size, angle)
+        bbox = rect1.boundingRect2f()
+        self.assertEqual(gold_box, bbox)
 
     def test_parse_to_rotated_rect_not_convertible(self):
         for not_convertible in ([], (), np.array([]), (123, (45, 34), 1), {1: 2, 3: 4}, 123,
@@ -797,6 +911,32 @@ def test_named_arguments_with_output_argument(self):
         np.testing.assert_equal(dst, src_copy)
         self.assertEqual(arguments_dump, 'lambda=25, sigma=5.5')
 
+    def test_arithm_op_without_saturation(self):
+        np.random.seed(4231568)
+        src = np.random.randint(20, 40, 8 * 4 * 3).astype(np.uint8).reshape(8, 4, 3)
+        operations = get_ocv_arithm_op_table(apply_saturation=False)
+        for ocv_op, numpy_op in operations.items():
+            for val in (2, 4, (5, ), (6, 4), (2., 4., 1.),
+                        np.uint8([1, 2, 2]), np.float64([5, 2, 6, 3]),):
+                dst = ocv_op(src, val)
+                expected = numpy_op(src, val)
+                # Temporarily allows a difference of 1 for arm64 workaround.
+                self.assertLess(np.max(np.abs(dst - expected)), 2,
+                  msg="Operation '{}' is failed for {}".format(ocv_op.__name__, val ) )
+
+    def test_arithm_op_with_saturation(self):
+        np.random.seed(4231568)
+        src = np.random.randint(20, 40, 4 * 8 * 4).astype(np.uint8).reshape(4, 8, 4)
+        operations = get_ocv_arithm_op_table(apply_saturation=True)
+
+        for ocv_op, numpy_op in operations.items():
+            for val in (10, 4, (40, ), (15, 12), (25., 41., 15.),
+                        np.uint8([1, 2, 20]), np.float64([50, 21, 64, 30]),):
+                dst = ocv_op(src, val)
+                expected = numpy_op(src, val)
+                # Temporarily allows a difference of 1 for arm64 workaround.
+                self.assertLess(np.max(np.abs(dst - expected)), 2,
+                  msg="Saturated Operation '{}' is failed for {}".format(ocv_op.__name__, val ) )
 
 class CanUsePurePythonModuleFunction(NewOpenCVTests):
     def test_can_get_ocv_version(self):
diff --git a/modules/python/test/test_pathlike.py b/modules/python/test/test_pathlike.py
new file mode 100644
index 000000000000..9122a757ceea
--- /dev/null
+++ b/modules/python/test/test_pathlike.py
@@ -0,0 +1,38 @@
+from tests_common import NewOpenCVTests, unittest
+import cv2 as cv
+import os
+
+
+def import_path():
+    import sys
+    if sys.version_info[0] < 3 or sys.version_info[1] < 6:
+        raise unittest.SkipTest('Python 3.6+ required')
+
+    from pathlib import Path
+    return Path
+
+
+class CanPassPathLike(NewOpenCVTests):
+    def test_pathlib_path(self):
+        Path = import_path()
+
+        img_path = self.find_file('cv/imgproc/stuff.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+
+        image_from_str = cv.imread(img_path)
+        self.assertIsNotNone(image_from_str)
+
+        image_from_path = cv.imread(Path(img_path))
+        self.assertIsNotNone(image_from_path)
+
+
+    def test_type_mismatch(self):
+        import_path() # checks python version
+
+        with self.assertRaises(cv.error) as context:
+            cv.imread(123)
+
+        self.assertTrue('str or path-like' in str(context.exception))
+
+
+if __name__ == '__main__':
+    NewOpenCVTests.bootstrap()
diff --git a/modules/python/test/tests_common.py b/modules/python/test/tests_common.py
index ec49f46d0df9..d673dd7b789f 100644
--- a/modules/python/test/tests_common.py
+++ b/modules/python/test/tests_common.py
@@ -36,6 +36,8 @@ def find_file(self, filename, searchPaths=[], required=True):
                     return candidate
         if required:
             self.fail('File ' + filename + ' not found')
+        else:
+            self.skipTest('File ' + filename + ' not found')
         return None
 
 
diff --git a/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp b/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp
index 074c9b6dfb8b..dea76c957bd7 100644
--- a/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp
@@ -85,10 +85,10 @@ class CV_EXPORTS_W ExposureCompensator
     @param mask Image mask
         */
     CV_WRAP virtual void apply(int index, Point corner, InputOutputArray image, InputArray mask) = 0;
-    CV_WRAP virtual void getMatGains(CV_OUT std::vector<Mat>& ) {CV_Error(Error::StsInternal, "");};
-    CV_WRAP virtual void setMatGains(std::vector<Mat>& ) { CV_Error(Error::StsInternal, ""); };
-    CV_WRAP void setUpdateGain(bool b) { updateGain = b; };
-    CV_WRAP bool getUpdateGain() { return updateGain; };
+    CV_WRAP virtual void getMatGains(CV_OUT std::vector<Mat>& ) {CV_Error(Error::StsInternal, "");}
+    CV_WRAP virtual void setMatGains(std::vector<Mat>& ) { CV_Error(Error::StsInternal, ""); }
+    CV_WRAP void setUpdateGain(bool b) { updateGain = b; }
+    CV_WRAP bool getUpdateGain() { return updateGain; }
 protected :
     bool updateGain;
 };
@@ -101,8 +101,8 @@ class CV_EXPORTS_W NoExposureCompensator : public ExposureCompensator
     void feed(const std::vector<Point> &/*corners*/, const std::vector<UMat> &/*images*/,
               const std::vector<std::pair<UMat,uchar> > &/*masks*/) CV_OVERRIDE { }
     CV_WRAP void apply(int /*index*/, Point /*corner*/, InputOutputArray /*image*/, InputArray /*mask*/) CV_OVERRIDE { }
-    CV_WRAP void getMatGains(CV_OUT std::vector<Mat>& umv) CV_OVERRIDE { umv.clear(); return; };
-    CV_WRAP void setMatGains(std::vector<Mat>& umv) CV_OVERRIDE { umv.clear(); return; };
+    CV_WRAP void getMatGains(CV_OUT std::vector<Mat>& umv) CV_OVERRIDE { umv.clear(); return; }
+    CV_WRAP void setMatGains(std::vector<Mat>& umv) CV_OVERRIDE { umv.clear(); return; }
 };
 
 /** @brief Exposure compensator which tries to remove exposure related artifacts by adjusting image
diff --git a/modules/stitching/include/opencv2/stitching/detail/matchers.hpp b/modules/stitching/include/opencv2/stitching/detail/matchers.hpp
index b0ad1847cf95..e25668308ed8 100644
--- a/modules/stitching/include/opencv2/stitching/detail/matchers.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/matchers.hpp
@@ -61,7 +61,7 @@ struct CV_EXPORTS_W_SIMPLE ImageFeatures
     CV_PROP_RW Size img_size;
     CV_PROP_RW std::vector<KeyPoint> keypoints;
     CV_PROP_RW UMat descriptors;
-    CV_WRAP std::vector<KeyPoint> getKeypoints() { return keypoints; };
+    CV_WRAP std::vector<KeyPoint> getKeypoints() { return keypoints; }
 };
 /** @brief
 
@@ -109,8 +109,8 @@ struct CV_EXPORTS_W_SIMPLE MatchesInfo
     CV_PROP_RW int num_inliers;                    //!< Number of geometrically consistent matches
     CV_PROP_RW Mat H;                              //!< Estimated transformation
     CV_PROP_RW double confidence;                  //!< Confidence two images are from the same panorama
-    CV_WRAP std::vector<DMatch> getMatches() { return matches; };
-    CV_WRAP std::vector<uchar> getInliers() { return inliers_mask; };
+    CV_WRAP std::vector<DMatch> getMatches() { return matches; }
+    CV_WRAP std::vector<uchar> getInliers() { return inliers_mask; }
 };
 
 /** @brief Feature matchers base class. */
@@ -138,7 +138,7 @@ class CV_EXPORTS_W FeaturesMatcher
     @sa detail::MatchesInfo
     */
     CV_WRAP_AS(apply2) void operator ()(const std::vector<ImageFeatures> &features, CV_OUT std::vector<MatchesInfo> &pairwise_matches,
-                                        const cv::UMat &mask = cv::UMat()) { match(features, pairwise_matches, mask); };
+                                        const cv::UMat &mask = cv::UMat()) { match(features, pairwise_matches, mask); }
 
     /** @return True, if it's possible to use the same matcher instance in parallel, false otherwise
     */
diff --git a/modules/stitching/include/opencv2/stitching/warpers.hpp b/modules/stitching/include/opencv2/stitching/warpers.hpp
index aa1ce5a6a732..0a5bf63de216 100644
--- a/modules/stitching/include/opencv2/stitching/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/warpers.hpp
@@ -53,7 +53,7 @@ namespace cv {
 
     public:
         CV_WRAP PyRotationWarper(String type, float scale);
-        CV_WRAP PyRotationWarper() {};
+        CV_WRAP PyRotationWarper() {}
         ~PyRotationWarper() {}
 
         /** @brief Projects the image point.
diff --git a/modules/stitching/src/camera.cpp b/modules/stitching/src/camera.cpp
index 149ba7476017..2f7ee97e062a 100644
--- a/modules/stitching/src/camera.cpp
+++ b/modules/stitching/src/camera.cpp
@@ -66,7 +66,7 @@ Mat CameraParams::K() const
     Mat_<double> k = Mat::eye(3, 3, CV_64F);
     k(0,0) = focal; k(0,2) = ppx;
     k(1,1) = focal * aspect; k(1,2) = ppy;
-    return std::move(k);
+    return Mat(k);
 }
 
 } // namespace detail
diff --git a/modules/stitching/src/motion_estimators.cpp b/modules/stitching/src/motion_estimators.cpp
index dfe929f9ba62..8d227409caaa 100644
--- a/modules/stitching/src/motion_estimators.cpp
+++ b/modules/stitching/src/motion_estimators.cpp
@@ -962,7 +962,7 @@ void waveCorrect(std::vector<Mat> &rmats, WaveCorrectKind kind)
     else if (kind == WAVE_CORRECT_VERT)
         rg1 = eigen_vecs.row(0).t();
     else
-        CV_Error(CV_StsBadArg, "unsupported kind of wave correction");
+        CV_Error(cv::Error::StsBadArg, "unsupported kind of wave correction");
 
     Mat img_k = Mat::zeros(3, 1, CV_32F);
     for (size_t i = 0; i < rmats.size(); ++i)
diff --git a/modules/stitching/src/precomp.hpp b/modules/stitching/src/precomp.hpp
index debc0d2088bd..2a1177496afc 100644
--- a/modules/stitching/src/precomp.hpp
+++ b/modules/stitching/src/precomp.hpp
@@ -45,6 +45,7 @@
 
 #include "opencv2/opencv_modules.hpp"
 
+#include <array>
 #include <vector>
 #include <algorithm>
 #include <utility>
diff --git a/modules/ts/CMakeLists.txt b/modules/ts/CMakeLists.txt
index c1d249ea149a..63edae1e677a 100644
--- a/modules/ts/CMakeLists.txt
+++ b/modules/ts/CMakeLists.txt
@@ -47,3 +47,7 @@ if(OPENCV_DISABLE_THREAD_SUPPORT)
   # described in `ts_gtest.h`.
   ocv_target_compile_definitions(${the_module} PUBLIC GTEST_HAS_PTHREAD=0)
 endif()
+
+if(CMAKE_SYSTEM_NAME STREQUAL "QNX")
+  ocv_target_link_libraries(${the_module} PUBLIC regex)
+endif()
\ No newline at end of file
diff --git a/modules/ts/include/opencv2/ts.hpp b/modules/ts/include/opencv2/ts.hpp
index 86f2d077617e..83ef6fc7da6a 100644
--- a/modules/ts/include/opencv2/ts.hpp
+++ b/modules/ts/include/opencv2/ts.hpp
@@ -300,8 +300,8 @@ Mat randomMat(RNG& rng, Size size, int type, double minVal, double maxVal, bool
 Mat randomMat(RNG& rng, const vector<int>& size, int type, double minVal, double maxVal, bool useRoi);
 void add(const Mat& a, double alpha, const Mat& b, double beta,
                       Scalar gamma, Mat& c, int ctype, bool calcAbs=false);
-void multiply(const Mat& a, const Mat& b, Mat& c, double alpha=1);
-void divide(const Mat& a, const Mat& b, Mat& c, double alpha=1);
+void multiply(const Mat& a, const Mat& b, Mat& c, double alpha=1, int ctype=-1);
+void divide(const Mat& a, const Mat& b, Mat& c, double alpha=1, int ctype=-1);
 
 void convert(const Mat& src, cv::OutputArray dst, int dtype, double alpha=1, double beta=0);
 void copy(const Mat& src, Mat& dst, const Mat& mask=Mat(), bool invertMask=false);
@@ -941,13 +941,9 @@ namespace opencv_test {
 using namespace cvtest;
 using namespace cv;
 
-#ifdef CV_CXX11
 #define CVTEST_GUARD_SYMBOL(name) \
     class required_namespace_specificatin_here_for_symbol_ ## name {}; \
     using name = required_namespace_specificatin_here_for_symbol_ ## name;
-#else
-#define CVTEST_GUARD_SYMBOL(name) /* nothing */
-#endif
 
 CVTEST_GUARD_SYMBOL(norm)
 CVTEST_GUARD_SYMBOL(add)
diff --git a/modules/ts/include/opencv2/ts/cuda_test.hpp b/modules/ts/include/opencv2/ts/cuda_test.hpp
index 87b217fc13d2..73e8f8ba8412 100644
--- a/modules/ts/include/opencv2/ts/cuda_test.hpp
+++ b/modules/ts/include/opencv2/ts/cuda_test.hpp
@@ -202,6 +202,11 @@ namespace cvtest
         { \
           UnsafeTestBody(); \
         } \
+        catch (const cvtest::details::SkipTestExceptionBase& e) \
+        { \
+            printf("[     SKIP ] %s\n", e.what()); \
+            cv::cuda::resetDevice(); \
+        } \
         catch (...) \
         { \
           cv::cuda::resetDevice(); \
diff --git a/modules/ts/include/opencv2/ts/ts_ext.hpp b/modules/ts/include/opencv2/ts/ts_ext.hpp
index efa48605108a..eebf4c594b35 100644
--- a/modules/ts/include/opencv2/ts/ts_ext.hpp
+++ b/modules/ts/include/opencv2/ts/ts_ext.hpp
@@ -13,7 +13,7 @@ void checkIppStatus();
 extern bool skipUnstableTests;
 extern bool runBigDataTests;
 extern int testThreads;
-extern int debugLevel;  //< 0 - no debug, 1 - basic test debug information, >1 - extra debug information
+extern int debugLevel;  ///< 0 - no debug, 1 - basic test debug information, >1 - extra debug information
 
 void testSetUp();
 void testTearDown();
@@ -31,6 +31,8 @@ bool checkBigDataTests();
 
 #define CV__TEST_INIT \
     CV__TEST_NAMESPACE_CHECK \
+    if (setUpSkipped) \
+        return; \
     ::cvtest::testSetUp();
 #define CV__TEST_CLEANUP ::cvtest::testTearDown();
 #define CV__TEST_BODY_IMPL(name) \
@@ -47,6 +49,25 @@ bool checkBigDataTests();
        } \
     } \
 
+#define CV__TEST_SETUP_IMPL(parent_class) { \
+  setUpSkipped = false; \
+  try { \
+    parent_class::SetUp(); \
+  } catch (const cvtest::details::SkipTestExceptionBase& e) { \
+    setUpSkipped = true; \
+    printf("[     SKIP ] %s\n", e.what()); \
+  } \
+} \
+
+struct SkipThisTest : public ::testing::Test {
+  SkipThisTest(const std::string& msg_) : msg(msg_) {}
+
+  virtual void TestBody() CV_OVERRIDE {
+      printf("[     SKIP ] %s\n", msg.c_str());
+  }
+
+  std::string msg;
+};
 
 #undef TEST
 #define TEST_(test_case_name, test_name, parent_class, bodyMethodName, BODY_ATTR, BODY_IMPL) \
@@ -54,12 +75,24 @@ bool checkBigDataTests();
      public:\
       GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
      private:\
+      bool setUpSkipped = false; \
       virtual void TestBody() CV_OVERRIDE;\
       virtual void bodyMethodName() BODY_ATTR;\
+      virtual void SetUp() CV_OVERRIDE; \
       static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\
       GTEST_DISALLOW_COPY_AND_ASSIGN_(\
           GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\
     };\
+    class test_case_name##test_name##_factory : public ::testing::internal::TestFactoryBase { \
+     public:\
+      virtual ::testing::Test* CreateTest() { \
+        try { \
+          return new GTEST_TEST_CLASS_NAME_(test_case_name, test_name); \
+        } catch (const cvtest::details::SkipTestExceptionBase& e) { \
+          return new SkipThisTest(e.what()); \
+        } \
+      } \
+    };\
     \
     ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\
       ::test_info_ =\
@@ -69,9 +102,9 @@ bool checkBigDataTests();
             (::testing::internal::GetTestTypeId()), \
             parent_class::SetUpTestCase, \
             parent_class::TearDownTestCase, \
-            new ::testing::internal::TestFactoryImpl<\
-                GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
+            new test_case_name##test_name##_factory);\
     void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() BODY_IMPL( #test_case_name "_" #test_name ) \
+    void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::SetUp() CV__TEST_SETUP_IMPL(parent_class) \
     void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::bodyMethodName()
 
 #define TEST(test_case_name, test_name) TEST_(test_case_name, test_name, ::testing::Test, Body,, CV__TEST_BODY_IMPL)
@@ -107,12 +140,24 @@ bool checkBigDataTests();
      public:\
       GTEST_TEST_CLASS_NAME_(test_fixture, test_name)() {}\
      private:\
+      bool setUpSkipped = false; \
       virtual void TestBody() CV_OVERRIDE;\
       virtual void Body(); \
+      virtual void SetUp() CV_OVERRIDE; \
       static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\
       GTEST_DISALLOW_COPY_AND_ASSIGN_(\
           GTEST_TEST_CLASS_NAME_(test_fixture, test_name));\
     };\
+    class test_fixture##test_name##_factory : public ::testing::internal::TestFactoryBase { \
+     public:\
+      virtual ::testing::Test* CreateTest() { \
+        try { \
+          return new GTEST_TEST_CLASS_NAME_(test_fixture, test_name); \
+        } catch (const cvtest::details::SkipTestExceptionBase& e) { \
+          return new SkipThisTest(e.what()); \
+        } \
+      } \
+    };\
     \
     ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_fixture, test_name)\
       ::test_info_ =\
@@ -122,9 +167,9 @@ bool checkBigDataTests();
             (::testing::internal::GetTypeId<test_fixture>()), \
             test_fixture::SetUpTestCase, \
             test_fixture::TearDownTestCase, \
-            new ::testing::internal::TestFactoryImpl<\
-                GTEST_TEST_CLASS_NAME_(test_fixture, test_name)>);\
+            new test_fixture##test_name##_factory);\
     void GTEST_TEST_CLASS_NAME_(test_fixture, test_name)::TestBody() CV__TEST_BODY_IMPL( #test_fixture "_" #test_name ) \
+    void GTEST_TEST_CLASS_NAME_(test_fixture, test_name)::SetUp() CV__TEST_SETUP_IMPL(test_fixture) \
     void GTEST_TEST_CLASS_NAME_(test_fixture, test_name)::Body()
 
 // Don't use directly
@@ -134,8 +179,10 @@ bool checkBigDataTests();
    public: \
     GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
    private: \
+    bool setUpSkipped = false; \
     virtual void bodyMethodName() BODY_ATTR; \
     virtual void TestBody() CV_OVERRIDE; \
+    virtual void SetUp() CV_OVERRIDE; \
     static int AddToRegistry() { \
       ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
           GetTestCasePatternHolder<test_case_name>(\
@@ -157,6 +204,7 @@ bool checkBigDataTests();
                              test_name)::gtest_registering_dummy_ = \
       GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
     void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() BODY_IMPL( #test_case_name "_" #test_name ) \
+    void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::SetUp() CV__TEST_SETUP_IMPL(test_case_name) \
     void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::bodyMethodName()
 
 #undef TEST_P
diff --git a/modules/ts/include/opencv2/ts/ts_gtest.h b/modules/ts/include/opencv2/ts/ts_gtest.h
index b1c6c12152db..49eb3a5ec79b 100644
--- a/modules/ts/include/opencv2/ts/ts_gtest.h
+++ b/modules/ts/include/opencv2/ts/ts_gtest.h
@@ -17501,6 +17501,7 @@ CartesianProductHolder2(const Generator1& g1, const Generator2& g2)
         static_cast<ParamGenerator<T2> >(g2_)));
   }
 
+  CartesianProductHolder2(const CartesianProductHolder2 & other) = default;
  private:
   // No implementation - assignment is unsupported.
   void operator=(const CartesianProductHolder2& other) = delete;
@@ -17523,7 +17524,7 @@ CartesianProductHolder3(const Generator1& g1, const Generator2& g2,
         static_cast<ParamGenerator<T2> >(g2_),
         static_cast<ParamGenerator<T3> >(g3_)));
   }
-
+  CartesianProductHolder3(const CartesianProductHolder3 &) = default;
  private:
   // No implementation - assignment is unsupported.
   void operator=(const CartesianProductHolder3& other) = delete;
@@ -17549,7 +17550,7 @@ CartesianProductHolder4(const Generator1& g1, const Generator2& g2,
         static_cast<ParamGenerator<T3> >(g3_),
         static_cast<ParamGenerator<T4> >(g4_)));
   }
-
+  CartesianProductHolder4(const CartesianProductHolder4 &) = default;
  private:
   // No implementation - assignment is unsupported.
   void operator=(const CartesianProductHolder4& other) = delete;
@@ -17577,7 +17578,7 @@ CartesianProductHolder5(const Generator1& g1, const Generator2& g2,
         static_cast<ParamGenerator<T4> >(g4_),
         static_cast<ParamGenerator<T5> >(g5_)));
   }
-
+  CartesianProductHolder5(const CartesianProductHolder5 &) = default;
  private:
   // No implementation - assignment is unsupported.
   void operator=(const CartesianProductHolder5& other) = delete;
@@ -17609,7 +17610,7 @@ CartesianProductHolder6(const Generator1& g1, const Generator2& g2,
         static_cast<ParamGenerator<T5> >(g5_),
         static_cast<ParamGenerator<T6> >(g6_)));
   }
-
+  CartesianProductHolder6(const CartesianProductHolder6 &) = default;
  private:
   // No implementation - assignment is unsupported.
   void operator=(const CartesianProductHolder6& other) = delete;
@@ -17644,7 +17645,7 @@ CartesianProductHolder7(const Generator1& g1, const Generator2& g2,
         static_cast<ParamGenerator<T6> >(g6_),
         static_cast<ParamGenerator<T7> >(g7_)));
   }
-
+  CartesianProductHolder7(const CartesianProductHolder7 &) = default;
  private:
   // No implementation - assignment is unsupported.
   void operator=(const CartesianProductHolder7& other) = delete;
@@ -17683,7 +17684,7 @@ CartesianProductHolder8(const Generator1& g1, const Generator2& g2,
         static_cast<ParamGenerator<T7> >(g7_),
         static_cast<ParamGenerator<T8> >(g8_)));
   }
-
+  CartesianProductHolder8(const CartesianProductHolder8 &) = default;
  private:
   // No implementation - assignment is unsupported.
   void operator=(const CartesianProductHolder8& other) = delete;
@@ -17726,7 +17727,7 @@ CartesianProductHolder9(const Generator1& g1, const Generator2& g2,
         static_cast<ParamGenerator<T8> >(g8_),
         static_cast<ParamGenerator<T9> >(g9_)));
   }
-
+  CartesianProductHolder9(const CartesianProductHolder9 &) = default;
  private:
   // No implementation - assignment is unsupported.
   void operator=(const CartesianProductHolder9& other) = delete;
@@ -21316,6 +21317,13 @@ AssertionResult CmpHelperEQFailure(const char* lhs_expression,
                    false);
 }
 
+// See https://github.com/opencv/opencv/issues/25674
+// Disable optimization for workaround to mis-branch for GCC14.
+#if defined(__GNUC__) && (__GNUC__ == 14)
+#pragma GCC push_options
+#pragma GCC optimize ("O0")
+#endif
+
 // The helper function for {ASSERT|EXPECT}_EQ.
 template <typename T1, typename T2>
 AssertionResult CmpHelperEQ(const char* lhs_expression,
@@ -21329,6 +21337,10 @@ AssertionResult CmpHelperEQ(const char* lhs_expression,
   return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs);
 }
 
+#if defined(__GNUC__) && (__GNUC__ == 14)
+#pragma GCC pop_options
+#endif
+
 // With this overloaded version, we allow anonymous enums to be used
 // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
 // can be implicitly cast to BiggestInt.
diff --git a/modules/ts/src/ts.cpp b/modules/ts/src/ts.cpp
index e5d36b435b2c..2d795cd3a2c1 100644
--- a/modules/ts/src/ts.cpp
+++ b/modules/ts/src/ts.cpp
@@ -643,7 +643,7 @@ void TS::update_context( BaseTest* test, int test_case_idx, bool update_ts_conte
     current_test_info.test = test;
     current_test_info.test_case_idx = test_case_idx;
     current_test_info.code = 0;
-    cvSetErrStatus( CV_StsOk );
+    cvSetErrStatus( cv::Error::StsOk );
 }
 
 
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index cd027661488a..a4713290b1a8 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -2551,30 +2551,31 @@ void max(const Mat& src1, double val, Mat& dst)
 }
 
 
-template<typename _Tp> static void
-muldiv_(const _Tp* src1, const _Tp* src2, _Tp* dst, size_t total, double scale, char op)
+template<typename SrcType, typename DstType> static void
+muldiv_(const SrcType* src1, const SrcType* src2, DstType* dst, size_t total, double scale, char op)
 {
-    if( op == '*' )
-        for( size_t i = 0; i < total; i++ )
-            dst[i] = saturate_cast<_Tp>((scale*src1[i])*src2[i]);
-    else if( src1 )
-        for( size_t i = 0; i < total; i++ )
-            dst[i] = src2[i] ? saturate_cast<_Tp>((scale*src1[i])/src2[i]) : 0;
-    else
-        for( size_t i = 0; i < total; i++ )
-            dst[i] = src2[i] ? saturate_cast<_Tp>(scale/src2[i]) : 0;
+    for( size_t i = 0; i < total; i++ )
+    {
+        double m1 = src1 ? (double)src1[i] : 1.0;
+        double m2 = src2 ? (double)src2[i] : 1.0;
+        if (op == '/')
+        {
+            m2 = abs(m2) > FLT_EPSILON ? (1.0 / m2) : 0;
+        }
+        dst[i] = saturate_cast<DstType>(scale * m1 * m2);
+    }
 }
 
-static void muldiv(const Mat& src1, const Mat& src2, Mat& dst, double scale, char op)
+static void muldiv(const Mat& src1, const Mat& src2, Mat& dst, int ctype, double scale, char op)
 {
-    dst.create(src2.dims, src2.size, src2.type());
+    dst.create(src2.dims, src2.size, (ctype >= 0 ? ctype : src2.type()));
     CV_Assert( src1.empty() || (src1.type() == src2.type() && src1.size == src2.size) );
     const Mat *arrays[]={&src1, &src2, &dst, 0};
     Mat planes[3];
 
     NAryMatIterator it(arrays, planes);
     size_t total = planes[1].total()*planes[1].channels();
-    size_t i, nplanes = it.nplanes, depth = src2.depth();
+    size_t i, nplanes = it.nplanes, srcDepth = src2.depth(), dstDepth = dst.depth();
 
     for( i = 0; i < nplanes; i++, ++it )
     {
@@ -2582,44 +2583,70 @@ static void muldiv(const Mat& src1, const Mat& src2, Mat& dst, double scale, cha
         const uchar* sptr2 = planes[1].ptr();
         uchar* dptr = planes[2].ptr();
 
-        switch( depth )
+        if (srcDepth == dstDepth)
         {
-        case CV_8U:
-            muldiv_((const uchar*)sptr1, (const uchar*)sptr2, (uchar*)dptr, total, scale, op);
-            break;
-        case CV_8S:
-            muldiv_((const schar*)sptr1, (const schar*)sptr2, (schar*)dptr, total, scale, op);
-            break;
-        case CV_16U:
-            muldiv_((const ushort*)sptr1, (const ushort*)sptr2, (ushort*)dptr, total, scale, op);
-            break;
-        case CV_16S:
-            muldiv_((const short*)sptr1, (const short*)sptr2, (short*)dptr, total, scale, op);
-            break;
-        case CV_32S:
-            muldiv_((const int*)sptr1, (const int*)sptr2, (int*)dptr, total, scale, op);
-            break;
-        case CV_32F:
-            muldiv_((const float*)sptr1, (const float*)sptr2, (float*)dptr, total, scale, op);
-            break;
-        case CV_64F:
-            muldiv_((const double*)sptr1, (const double*)sptr2, (double*)dptr, total, scale, op);
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "");
+            switch( srcDepth )
+            {
+            case CV_8U:
+                muldiv_((const uchar*)sptr1, (const uchar*)sptr2, (uchar*)dptr, total, scale, op);
+                break;
+            case CV_8S:
+                muldiv_((const schar*)sptr1, (const schar*)sptr2, (schar*)dptr, total, scale, op);
+                break;
+            case CV_16U:
+                muldiv_((const ushort*)sptr1, (const ushort*)sptr2, (ushort*)dptr, total, scale, op);
+                break;
+            case CV_16S:
+                muldiv_((const short*)sptr1, (const short*)sptr2, (short*)dptr, total, scale, op);
+                break;
+            case CV_32S:
+                muldiv_((const int*)sptr1, (const int*)sptr2, (int*)dptr, total, scale, op);
+                break;
+            case CV_32F:
+                muldiv_((const float*)sptr1, (const float*)sptr2, (float*)dptr, total, scale, op);
+                break;
+            case CV_64F:
+                muldiv_((const double*)sptr1, (const double*)sptr2, (double*)dptr, total, scale, op);
+                break;
+            default:
+                CV_Error(Error::StsUnsupportedFormat, "");
+            }
+        }
+        else
+        {
+            if (srcDepth == CV_8U && dstDepth == CV_16U)
+            {
+                muldiv_((const uchar*)sptr1, (const uchar*)sptr2, (ushort*)dptr, total, scale, op);
+            }
+            else if (srcDepth == CV_8S && dstDepth == CV_16S)
+            {
+                muldiv_((const schar*)sptr1, (const schar*)sptr2, (short*)dptr, total, scale, op);
+            }
+            else if (srcDepth == CV_8U && dstDepth == CV_32F)
+            {
+                muldiv_((const uchar*)sptr1, (const uchar*)sptr2, (float*)dptr, total, scale, op);
+            }
+            else if (srcDepth == CV_8S && dstDepth == CV_32F)
+            {
+                muldiv_((const schar*)sptr1, (const schar*)sptr2, (float*)dptr, total, scale, op);
+            }
+            else
+            {
+                CV_Error(Error::StsUnsupportedFormat, "This format combination is not supported yet");
+            }
         }
     }
 }
 
 
-void multiply(const Mat& src1, const Mat& src2, Mat& dst, double scale)
+void multiply(const Mat& src1, const Mat& src2, Mat& dst, double scale, int ctype)
 {
-    muldiv( src1, src2, dst, scale, '*' );
+    muldiv( src1, src2, dst, ctype, scale, '*' );
 }
 
-void divide(const Mat& src1, const Mat& src2, Mat& dst, double scale)
+void divide(const Mat& src1, const Mat& src2, Mat& dst, double scale, int ctype)
 {
-    muldiv( src1, src2, dst, scale, '/' );
+    muldiv( src1, src2, dst, ctype, scale, '/' );
 }
 
 
diff --git a/modules/video/include/opencv2/video.hpp b/modules/video/include/opencv2/video.hpp
index a3dde603992c..b1c19196d499 100644
--- a/modules/video/include/opencv2/video.hpp
+++ b/modules/video/include/opencv2/video.hpp
@@ -49,7 +49,6 @@
   @{
     @defgroup video_motion Motion Analysis
     @defgroup video_track Object Tracking
-    @defgroup video_c C API
   @}
 */
 
diff --git a/modules/video/include/opencv2/video/detail/tracking.detail.hpp b/modules/video/include/opencv2/video/detail/tracking.detail.hpp
index 1e6107900d91..3c7823b7dccc 100644
--- a/modules/video/include/opencv2/video/detail/tracking.detail.hpp
+++ b/modules/video/include/opencv2/video/detail/tracking.detail.hpp
@@ -171,7 +171,7 @@ width, height, orientation, etc.
 class CV_EXPORTS TrackerTargetState
 {
 public:
-    virtual ~TrackerTargetState() {};
+    virtual ~TrackerTargetState() {}
     /** @brief Get the position
     * @return The position
     */
diff --git a/modules/video/include/opencv2/video/tracking.hpp b/modules/video/include/opencv2/video/tracking.hpp
index eb5a6c703070..df34a9f97cb6 100644
--- a/modules/video/include/opencv2/video/tracking.hpp
+++ b/modules/video/include/opencv2/video/tracking.hpp
@@ -166,7 +166,7 @@ performance boost.
 The function implements a sparse iterative version of the Lucas-Kanade optical flow in pyramids. See
 @cite Bouguet00 . The function is parallelized with the TBB library.
 
-@note
+@note Some examples:
 
 -   An example using the Lucas-Kanade optical flow algorithm can be found at
     opencv_source_code/samples/cpp/lkdemo.cpp
@@ -213,7 +213,7 @@ The function finds an optical flow for each prev pixel using the @cite Farneback
 
 \f[\texttt{prev} (y,x)  \sim \texttt{next} ( y + \texttt{flow} (y,x)[1],  x + \texttt{flow} (y,x)[0])\f]
 
-@note
+@note Some examples:
 
 -   An example using the optical flow algorithm described by Gunnar Farneback can be found at
     opencv_source_code/samples/cpp/fback.cpp
@@ -564,6 +564,12 @@ class CV_EXPORTS_W VariationalRefinement : public DenseOpticalFlow
     /** @copybrief getGamma @see getGamma */
     CV_WRAP virtual void setGamma(float val) = 0;
 
+    /** @brief Norm value shift for robust penalizer
+    @see setEpsilon */
+    CV_WRAP virtual float getEpsilon() const = 0;
+    /** @copybrief getEpsilon @see getEpsilon */
+    CV_WRAP virtual void setEpsilon(float val) = 0;
+
     /** @brief Creates an instance of VariationalRefinement
     */
     CV_WRAP static Ptr<VariationalRefinement> create();
@@ -645,6 +651,12 @@ class CV_EXPORTS_W DISOpticalFlow : public DenseOpticalFlow
     /** @copybrief getVariationalRefinementGamma @see getVariationalRefinementGamma */
     CV_WRAP virtual void setVariationalRefinementGamma(float val) = 0;
 
+    /** @brief Norm value shift for robust penalizer
+    @see setVariationalRefinementEpsilon */
+    CV_WRAP virtual float getVariationalRefinementEpsilon() const = 0;
+    /** @copybrief getVariationalRefinementEpsilon @see getVariationalRefinementEpsilon */
+    CV_WRAP virtual void setVariationalRefinementEpsilon(float val) = 0;
+
 
     /** @brief Whether to use mean-normalization of patches when computing patch distance. It is turned on
         by default as it typically provides a noticeable quality boost because of increased robustness to
@@ -887,6 +899,43 @@ class CV_EXPORTS_W TrackerNano : public Tracker
     //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
 };
 
+/** @brief the VIT tracker is a super lightweight dnn-based general object tracking.
+ *
+ *  VIT tracker is much faster and extremely lightweight due to special model structure, the model file is about 767KB.
+ *  Model download link: https://github.com/opencv/opencv_zoo/tree/main/models/object_tracking_vittrack
+ *  Author: PengyuLiu, 1872918507@qq.com
+ */
+class CV_EXPORTS_W TrackerVit : public Tracker
+{
+protected:
+    TrackerVit();  // use ::create()
+public:
+    virtual ~TrackerVit() CV_OVERRIDE;
+
+    struct CV_EXPORTS_W_SIMPLE Params
+    {
+        CV_WRAP Params();
+        CV_PROP_RW std::string net;
+        CV_PROP_RW int backend;
+        CV_PROP_RW int target;
+        CV_PROP_RW Scalar meanvalue;
+        CV_PROP_RW Scalar stdvalue;
+    };
+
+    /** @brief Constructor
+    @param parameters vit tracker parameters TrackerVit::Params
+    */
+    static CV_WRAP
+    Ptr<TrackerVit> create(const TrackerVit::Params& parameters = TrackerVit::Params());
+
+    /** @brief Return tracking score
+    */
+    CV_WRAP virtual float getTrackingScore() = 0;
+
+    // void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
+    // bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
+};
+
 //! @} video_track
 
 } // cv
diff --git a/modules/video/src/dis_flow.cpp b/modules/video/src/dis_flow.cpp
index a260b8726bb9..75090d093dda 100644
--- a/modules/video/src/dis_flow.cpp
+++ b/modules/video/src/dis_flow.cpp
@@ -67,6 +67,7 @@ class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow
     float variational_refinement_alpha;
     float variational_refinement_gamma;
     float variational_refinement_delta;
+    float variational_refinement_epsilon;
     bool use_mean_normalization;
     bool use_spatial_propagation;
 
@@ -92,6 +93,8 @@ class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow
     void setVariationalRefinementDelta(float val) CV_OVERRIDE { variational_refinement_delta = val; }
     float getVariationalRefinementGamma() const CV_OVERRIDE { return variational_refinement_gamma; }
     void setVariationalRefinementGamma(float val) CV_OVERRIDE { variational_refinement_gamma = val; }
+    float getVariationalRefinementEpsilon() const CV_OVERRIDE { return variational_refinement_epsilon; }
+    void setVariationalRefinementEpsilon(float val) CV_OVERRIDE { variational_refinement_epsilon = val; }
 
     bool getUseMeanNormalization() const CV_OVERRIDE { return use_mean_normalization; }
     void setUseMeanNormalization(bool val) CV_OVERRIDE { use_mean_normalization = val; }
@@ -219,6 +222,7 @@ DISOpticalFlowImpl::DISOpticalFlowImpl()
     variational_refinement_alpha = 20.f;
     variational_refinement_gamma = 10.f;
     variational_refinement_delta = 5.f;
+    variational_refinement_epsilon = 0.01f;
 
     border_size = 16;
     use_mean_normalization = true;
@@ -306,6 +310,7 @@ void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_fl
             variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
             variational_refinement_processors[i]->setDelta(variational_refinement_delta);
             variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
+            variational_refinement_processors[i]->setEpsilon(variational_refinement_epsilon);
             variational_refinement_processors[i]->setSorIterations(5);
             variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);
 
@@ -520,16 +525,16 @@ DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOp
     v_expand(I0_row_8, I0_row_4_left, I0_row_4_right);                                                                 \
                                                                                                                        \
     /* Compute diffs between I0 and bilinearly interpolated I1: */                                                     \
-    I_diff_left = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left)) +                                              \
-                  w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left)) +                                      \
-                  w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left)) +                                         \
-                  w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)) -                                 \
-                  v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left));                                                      \
-    I_diff_right = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right)) +                                            \
-                   w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right)) +                                    \
-                   w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right)) +                                       \
-                   w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)) -                               \
-                   v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right));
+    I_diff_left = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left))),                             \
+                  v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left))),                                 \
+                  v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left))),                                    \
+                  v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)))),                           \
+                  v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left)));                                                     \
+    I_diff_right = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right))),                           \
+                   v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right))),                               \
+                   v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right))),                                  \
+                   v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)))),                         \
+                   v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right)));
 
 #define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW                                                                     \
     I0_ptr += I0_stride;                                                                                               \
@@ -572,9 +577,9 @@ inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *
             v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
 
             /* Update the sums: */
-            Ux_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
-            Uy_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
-            SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
+            Ux_vec = v_add(Ux_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right))));
+            Uy_vec = v_add(Uy_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right))));
+            SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
 
             I0x_ptr += I0_stride;
             I0y_ptr += I0_stride;
@@ -640,10 +645,10 @@ inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr,
             v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
 
             /* Update the sums: */
-            sum_I0x_mul_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
-            sum_I0y_mul_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
-            sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
-            sum_diff_vec += I_diff_left + I_diff_right;
+            sum_I0x_mul_vec = v_add(sum_I0x_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right))));
+            sum_I0y_mul_vec = v_add(sum_I0y_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right))));
+            sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
+            sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right));
 
             I0x_ptr += I0_stride;
             I0y_ptr += I0_stride;
@@ -692,7 +697,7 @@ inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stri
         for (int row = 0; row < 8; row++)
         {
             HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
-            SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
+            SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
             HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
         }
         SSD = v_reduce_sum(SSD_vec);
@@ -728,8 +733,8 @@ inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int
         for (int row = 0; row < 8; row++)
         {
             HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
-            sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
-            sum_diff_vec += I_diff_left + I_diff_right;
+            sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right)));
+            sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right));
             HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
         }
         sum_diff = v_reduce_sum(sum_diff_vec);
@@ -1274,6 +1279,7 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow,
             variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
             variational_refinement_processors[i]->setDelta(variational_refinement_delta);
             variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
+            variational_refinement_processors[i]->setEpsilon(variational_refinement_epsilon);
             variational_refinement_processors[i]->setSorIterations(5);
             variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);
 
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 8467035dbf43..6d51c0cf1acf 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -97,8 +97,8 @@ void cv::detail::ScharrDerivInvoker::operator()(const Range& range) const
                 v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x));
                 v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
 
-                v_int16x8 t1 = s2 - s0;
-                v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10);
+                v_int16x8 t1 = v_sub(s2, s0);
+                v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10));
 
                 v_store(trow0 + x, t0);
                 v_store(trow1 + x, t1);
@@ -134,8 +134,8 @@ void cv::detail::ScharrDerivInvoker::operator()(const Range& range) const
                 v_int16x8 s3 = v_load(trow1 + x);
                 v_int16x8 s4 = v_load(trow1 + x + cn);
 
-                v_int16x8 t0 = s1 - s0;
-                v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10);
+                v_int16x8 t0 = v_sub(s1, s0);
+                v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10));
 
                 v_store_interleave((drow + x*2), t0, t1);
             }
@@ -293,10 +293,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                 v_zip(v00, v01, t00, t01);
                 v_zip(v10, v11, t10, t11);
 
-                t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
-                t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
-                t0 = t0 >> (W_BITS1-5);
-                t1 = t1 >> (W_BITS1-5);
+                t0 = v_add(v_dotprod(t00, qw0, qdelta), v_dotprod(t10, qw1));
+                t1 = v_add(v_dotprod(t01, qw0, qdelta), v_dotprod(t11, qw1));
+                t0 = v_shr<W_BITS1 - 5>(t0);
+                t1 = v_shr<W_BITS1 - 5>(t1);
                 v_store(Iptr + x, v_pack(t0, t1));
 
                 v00 = v_reinterpret_as_s16(v_load(dsrc));
@@ -307,10 +307,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                 v_zip(v00, v01, t00, t01);
                 v_zip(v10, v11, t10, t11);
 
-                t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
-                t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
-                t0 = t0 >> W_BITS1;
-                t1 = t1 >> W_BITS1;
+                t0 = v_add(v_dotprod(t00, qw0, qdelta_d), v_dotprod(t10, qw1));
+                t1 = v_add(v_dotprod(t01, qw0, qdelta_d), v_dotprod(t11, qw1));
+                t0 = v_shr<W_BITS1>(t0);
+                t1 = v_shr<W_BITS1>(t1);
                 v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
                 v_store(dIptr, v00);
 
@@ -332,10 +332,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                 v_zip(v00, v01, t00, t01);
                 v_zip(v10, v11, t10, t11);
 
-                t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
-                t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
-                t0 = t0 >> W_BITS1;
-                t1 = t1 >> W_BITS1;
+                t0 = v_add(v_dotprod(t00, qw0, qdelta_d), v_dotprod(t10, qw1));
+                t1 = v_add(v_dotprod(t01, qw0, qdelta_d), v_dotprod(t11, qw1));
+                t0 = v_shr<W_BITS1>(t0);
+                t1 = v_shr<W_BITS1>(t1);
                 v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
                 v_store(dIptr + 4*2, v00);
 
@@ -548,18 +548,18 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                     v_zip(v00, v01, t00, t01);
                     v_zip(v10, v11, t10, t11);
 
-                    t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
-                    t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
-                    t0 = t0 >> (W_BITS1-5);
-                    t1 = t1 >> (W_BITS1-5);
-                    diff0 = v_pack(t0, t1) - diff0;
+                    t0 = v_add(v_dotprod(t00, qw0, qdelta), v_dotprod(t10, qw1));
+                    t1 = v_add(v_dotprod(t01, qw0, qdelta), v_dotprod(t11, qw1));
+                    t0 = v_shr<W_BITS1 - 5>(t0);
+                    t1 = v_shr<W_BITS1 - 5>(t1);
+                    diff0 = v_sub(v_pack(t0, t1), diff0);
                     v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
                     v00 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
                     v01 = v_reinterpret_as_s16(v_load(dIptr + 8));
                     v_zip(v00, v01, v10, v11);
                     v_zip(diff2, diff1, v00, v01);
-                    qb0 += v_cvt_f32(v_dotprod(v00, v10));
-                    qb1 += v_cvt_f32(v_dotprod(v01, v11));
+                    qb0 = v_add(qb0, v_cvt_f32(v_dotprod(v00, v10)));
+                    qb1 = v_add(qb1, v_cvt_f32(v_dotprod(v01, v11)));
                 }
 #endif
 
@@ -647,7 +647,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
 
 #if CV_SIMD128 && !CV_NEON
             v_float32x4 qf0, qf1;
-            v_recombine(v_interleave_pairs(qb0 + qb1), v_setzero_f32(), qf0, qf1);
+            v_recombine(v_interleave_pairs(v_add(qb0, qb1)), v_setzero_f32(), qf0, qf1);
             ib1 += v_reduce_sum(qf0);
             ib2 += v_reduce_sum(qf1);
 #endif
diff --git a/modules/video/src/optflowgf.cpp b/modules/video/src/optflowgf.cpp
index 2b164b62d32d..02e878a57799 100644
--- a/modules/video/src/optflowgf.cpp
+++ b/modules/video/src/optflowgf.cpp
@@ -463,22 +463,22 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
                 const float *sptr0 = srow[m], *sptr1;
                 v_float32x4 g4 = v_load(simd_kernel);
                 v_float32x4 s0, s1, s2, s3;
-                s0 = v_load(sptr0 + x) * g4;
-                s1 = v_load(sptr0 + x + 4) * g4;
-                s2 = v_load(sptr0 + x + 8) * g4;
-                s3 = v_load(sptr0 + x + 12) * g4;
+                s0 = v_mul(v_load(sptr0 + x), g4);
+                s1 = v_mul(v_load(sptr0 + x + 4), g4);
+                s2 = v_mul(v_load(sptr0 + x + 8), g4);
+                s3 = v_mul(v_load(sptr0 + x + 12), g4);
 
                 for( i = 1; i <= m; i++ )
                 {
                     v_float32x4 x0, x1;
                     sptr0 = srow[m+i], sptr1 = srow[m-i];
                     g4 = v_load(simd_kernel + i*4);
-                    x0 = v_load(sptr0 + x) + v_load(sptr1 + x);
-                    x1 = v_load(sptr0 + x + 4) + v_load(sptr1 + x + 4);
+                    x0 = v_add(v_load(sptr0 + x), v_load(sptr1 + x));
+                    x1 = v_add(v_load(sptr0 + x + 4), v_load(sptr1 + x + 4));
                     s0 = v_muladd(x0, g4, s0);
                     s1 = v_muladd(x1, g4, s1);
-                    x0 = v_load(sptr0 + x + 8) + v_load(sptr1 + x + 8);
-                    x1 = v_load(sptr0 + x + 12) + v_load(sptr1 + x + 12);
+                    x0 = v_add(v_load(sptr0 + x + 8), v_load(sptr1 + x + 8));
+                    x1 = v_add(v_load(sptr0 + x + 12), v_load(sptr1 + x + 12));
                     s2 = v_muladd(x0, g4, s2);
                     s3 = v_muladd(x1, g4, s3);
                 }
@@ -493,13 +493,13 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
             {
                 const float *sptr0 = srow[m], *sptr1;
                 v_float32x4 g4 = v_load(simd_kernel);
-                v_float32x4 s0 = v_load(sptr0 + x) * g4;
+                v_float32x4 s0 = v_mul(v_load(sptr0 + x), g4);
 
                 for( i = 1; i <= m; i++ )
                 {
                     sptr0 = srow[m+i], sptr1 = srow[m-i];
                     g4 = v_load(simd_kernel + i*4);
-                    v_float32x4 x0 = v_load(sptr0 + x) + v_load(sptr1 + x);
+                    v_float32x4 x0 = v_add(v_load(sptr0 + x), v_load(sptr1 + x));
                     s0 = v_muladd(x0, g4, s0);
                 }
                 v_store(vsum + x, s0);
@@ -528,14 +528,14 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
             for( ; x <= width*5 - 8; x += 8 )
             {
                 v_float32x4 g4 = v_load(simd_kernel);
-                v_float32x4 s0 = v_load(vsum + x) * g4;
-                v_float32x4 s1 = v_load(vsum + x + 4) * g4;
+                v_float32x4 s0 = v_mul(v_load(vsum + x), g4);
+                v_float32x4 s1 = v_mul(v_load(vsum + x + 4), g4);
 
                 for( i = 1; i <= m; i++ )
                 {
                     g4 = v_load(simd_kernel + i*4);
-                    v_float32x4 x0 = v_load(vsum + x - i*5) + v_load(vsum + x+ i*5);
-                    v_float32x4 x1 = v_load(vsum + x - i*5 + 4) + v_load(vsum + x+ i*5 + 4);
+                    v_float32x4 x0 = v_add(v_load(vsum + x - i * 5), v_load(vsum + x + i * 5));
+                    v_float32x4 x1 = v_add(v_load(vsum + x - i * 5 + 4), v_load(vsum + x + i * 5 + 4));
                     s0 = v_muladd(x0, g4, s0);
                     s1 = v_muladd(x1, g4, s1);
                 }
diff --git a/modules/video/src/optical_flow_io.cpp b/modules/video/src/optical_flow_io.cpp
index 8f9efedc3322..ff34f250973c 100644
--- a/modules/video/src/optical_flow_io.cpp
+++ b/modules/video/src/optical_flow_io.cpp
@@ -52,12 +52,12 @@ CV_EXPORTS_W Mat readOpticalFlow( const String& path )
     Mat_<Point2f> flow;
     std::ifstream file(path.c_str(), std::ios_base::binary);
     if ( !file.good() )
-        return std::move(flow); // no file - return empty matrix
+        return Mat(); // no file - return empty matrix
 
     float tag;
     file.read((char*) &tag, sizeof(float));
     if ( tag != FLOW_TAG_FLOAT )
-        return std::move(flow);
+        return Mat();
 
     int width, height;
 
@@ -76,14 +76,14 @@ CV_EXPORTS_W Mat readOpticalFlow( const String& path )
             if ( !file.good() )
             {
                 flow.release();
-                return std::move(flow);
+                return Mat();
             }
 
             flow(i, j) = u;
         }
     }
     file.close();
-    return std::move(flow);
+    return Mat(flow);
 }
 
 CV_EXPORTS_W bool writeOpticalFlow( const String& path, InputArray flow )
diff --git a/modules/video/src/tracking/detail/tracker_mil_model.hpp b/modules/video/src/tracking/detail/tracker_mil_model.hpp
index dddfae5536e9..027ddd0679ee 100644
--- a/modules/video/src/tracking/detail/tracker_mil_model.hpp
+++ b/modules/video/src/tracking/detail/tracker_mil_model.hpp
@@ -36,7 +36,7 @@ class TrackerMILModel : public detail::TrackerModel
     /**
    * \brief Destructor
    */
-    ~TrackerMILModel() {};
+    ~TrackerMILModel() {}
 
     /**
    * \brief Set the mode
diff --git a/modules/video/src/tracking/detail/tracker_mil_state.hpp b/modules/video/src/tracking/detail/tracker_mil_state.hpp
index 12af1c33dfea..f4eeee979675 100644
--- a/modules/video/src/tracking/detail/tracker_mil_state.hpp
+++ b/modules/video/src/tracking/detail/tracker_mil_state.hpp
@@ -34,7 +34,7 @@ class CV_EXPORTS TrackerStateEstimatorMILBoosting : public TrackerStateEstimator
         */
         TrackerMILTargetState(const Point2f& position, int width, int height, bool foreground, const Mat& features);
 
-        ~TrackerMILTargetState() {};
+        ~TrackerMILTargetState() {}
 
         /** @brief Set label: true for target foreground, false for background
         @param foreground Label for background/foreground
diff --git a/modules/video/src/tracking/detail/tracking_online_mil.cpp b/modules/video/src/tracking/detail/tracking_online_mil.cpp
index c9472aa947c0..b1d74916f75c 100644
--- a/modules/video/src/tracking/detail/tracking_online_mil.cpp
+++ b/modules/video/src/tracking/detail/tracking_online_mil.cpp
@@ -29,7 +29,7 @@ class SortableElementRev
     bool operator<(SortableElementRev<T>& b)
     {
         return (_val < b._val);
-    };
+    }
 };
 
 static bool CompareSortableElementRev(const SortableElementRev<float>& i, const SortableElementRev<float>& j)
@@ -57,7 +57,7 @@ void sort_order_des(std::vector<T>& v, std::vector<int>& order)
         order[i] = v2[i]._ind;
         v[i] = v2[i]._val;
     }
-};
+}
 
 //implementations for strong classifier
 
diff --git a/modules/video/src/tracking/tracker_vit.cpp b/modules/video/src/tracking/tracker_vit.cpp
new file mode 100644
index 000000000000..bef42dbb4d7e
--- /dev/null
+++ b/modules/video/src/tracking/tracker_vit.cpp
@@ -0,0 +1,222 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Author, PengyuLiu, 1872918507@qq.com
+
+#include "../precomp.hpp"
+#ifdef HAVE_OPENCV_DNN
+#include "opencv2/dnn.hpp"
+#endif
+
+namespace cv {
+
+TrackerVit::TrackerVit()
+{
+    // nothing
+}
+
+TrackerVit::~TrackerVit()
+{
+    // nothing
+}
+
+TrackerVit::Params::Params()
+{
+    net = "vitTracker.onnx";
+    meanvalue = Scalar{0.485, 0.456, 0.406};
+    stdvalue = Scalar{0.229, 0.224, 0.225};
+#ifdef HAVE_OPENCV_DNN
+    backend = dnn::DNN_BACKEND_DEFAULT;
+    target = dnn::DNN_TARGET_CPU;
+#else
+    backend = -1;  // invalid value
+    target = -1;  // invalid value
+#endif
+}
+
+#ifdef HAVE_OPENCV_DNN
+
+class TrackerVitImpl : public TrackerVit
+{
+public:
+    TrackerVitImpl(const TrackerVit::Params& parameters)
+        : params(parameters)
+    {
+        net = dnn::readNet(params.net);
+        CV_Assert(!net.empty());
+
+        net.setPreferableBackend(params.backend);
+        net.setPreferableTarget(params.target);
+    }
+
+    void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
+    bool update(InputArray image, Rect& boundingBox) CV_OVERRIDE;
+    float getTrackingScore() CV_OVERRIDE;
+
+    Rect rect_last;
+    float tracking_score;
+
+    TrackerVit::Params params;
+
+
+protected:
+    void preprocess(const Mat& src, Mat& dst, Size size);
+
+    const Size searchSize{256, 256};
+    const Size templateSize{128, 128};
+
+    Mat hanningWindow;
+
+    dnn::Net net;
+    Mat image;
+};
+
+static void crop_image(const Mat& src, Mat& dst, Rect box, int factor)
+{
+    int x = box.x, y = box.y, w = box.width, h = box.height;
+    int crop_sz = cvCeil(sqrt(w * h) * factor);
+
+    int x1 = x + (w - crop_sz) / 2;
+    int x2 = x1 + crop_sz;
+    int y1 = y + (h - crop_sz) / 2;
+    int y2 = y1 + crop_sz;
+
+    int x1_pad = std::max(0, -x1);
+    int y1_pad = std::max(0, -y1);
+    int x2_pad = std::max(x2 - src.size[1] + 1, 0);
+    int y2_pad = std::max(y2 - src.size[0] + 1, 0);
+
+    Rect roi(x1 + x1_pad, y1 + y1_pad, x2 - x2_pad - x1 - x1_pad, y2 - y2_pad - y1 - y1_pad);
+    Mat im_crop = src(roi);
+    copyMakeBorder(im_crop, dst, y1_pad, y2_pad, x1_pad, x2_pad, BORDER_CONSTANT);
+}
+
+void TrackerVitImpl::preprocess(const Mat& src, Mat& dst, Size size)
+{
+    Mat mean = Mat(size, CV_32FC3, params.meanvalue);
+    Mat std = Mat(size, CV_32FC3, params.stdvalue);
+    mean = dnn::blobFromImage(mean, 1.0, Size(), Scalar(), false);
+    std = dnn::blobFromImage(std, 1.0, Size(), Scalar(), false);
+
+    Mat img;
+    resize(src, img, size);
+
+    dst = dnn::blobFromImage(img, 1.0, Size(), Scalar(), false);
+    dst /= 255;
+    dst = (dst - mean) / std;
+}
+
+static Mat hann1d(int sz, bool centered = true) {
+    Mat hanningWindow(sz, 1, CV_32FC1);
+    float* data = hanningWindow.ptr<float>(0);
+
+    if(centered) {
+        for(int i = 0; i < sz; i++) {
+            float val = 0.5f * (1.f - std::cos(static_cast<float>(2 * M_PI / (sz + 1)) * (i + 1)));
+            data[i] = val;
+        }
+    }
+    else {
+        int half_sz = sz / 2;
+        for(int i = 0; i <= half_sz; i++) {
+            float val = 0.5f * (1.f + std::cos(static_cast<float>(2 * M_PI / (sz + 2)) * i));
+            data[i] = val;
+            data[sz - 1 - i] = val;
+        }
+    }
+
+    return hanningWindow;
+}
+
+static Mat hann2d(Size size, bool centered = true) {
+    int rows = size.height;
+    int cols = size.width;
+
+    Mat hanningWindowRows = hann1d(rows, centered);
+    Mat hanningWindowCols = hann1d(cols, centered);
+
+    Mat hanningWindow = hanningWindowRows * hanningWindowCols.t();
+
+    return hanningWindow;
+}
+
+static Rect returnfromcrop(float x, float y, float w, float h, Rect res_Last)
+{
+    int cropwindowwh = 4 * cvFloor(sqrt(res_Last.width * res_Last.height));
+    int x0 = res_Last.x + (res_Last.width - cropwindowwh) / 2;
+    int y0 = res_Last.y + (res_Last.height - cropwindowwh) / 2;
+    Rect finalres;
+    finalres.x = cvFloor(x * cropwindowwh + x0);
+    finalres.y = cvFloor(y * cropwindowwh + y0);
+    finalres.width = cvFloor(w * cropwindowwh);
+    finalres.height = cvFloor(h * cropwindowwh);
+    return finalres;
+}
+
+void TrackerVitImpl::init(InputArray image_, const Rect &boundingBox_)
+{
+    image = image_.getMat().clone();
+    Mat crop;
+    crop_image(image, crop, boundingBox_, 2);
+    Mat blob;
+    preprocess(crop, blob, templateSize);
+    net.setInput(blob, "template");
+    Size size(16, 16);
+    hanningWindow = hann2d(size, true);
+    rect_last = boundingBox_;
+}
+
+bool TrackerVitImpl::update(InputArray image_, Rect &boundingBoxRes)
+{
+    image = image_.getMat().clone();
+    Mat crop;
+    crop_image(image, crop, rect_last, 4);
+    Mat blob;
+    preprocess(crop, blob, searchSize);
+    net.setInput(blob, "search");
+    std::vector<String> outputName = {"output1", "output2", "output3"};
+    std::vector<Mat> outs;
+    net.forward(outs, outputName);
+    CV_Assert(outs.size() == 3);
+
+    Mat conf_map = outs[0].reshape(0, {16, 16});
+    Mat size_map = outs[1].reshape(0, {2, 16, 16});
+    Mat offset_map = outs[2].reshape(0, {2, 16, 16});
+
+    multiply(conf_map, hanningWindow, conf_map);
+
+    double maxVal;
+    Point maxLoc;
+    minMaxLoc(conf_map, nullptr, &maxVal, nullptr, &maxLoc);
+    tracking_score = static_cast<float>(maxVal);
+
+    float cx = (maxLoc.x + offset_map.at<float>(0, maxLoc.y, maxLoc.x)) / 16;
+    float cy = (maxLoc.y + offset_map.at<float>(1, maxLoc.y, maxLoc.x)) / 16;
+    float w = size_map.at<float>(0, maxLoc.y, maxLoc.x);
+    float h = size_map.at<float>(1, maxLoc.y, maxLoc.x);
+
+    Rect finalres = returnfromcrop(cx - w / 2, cy - h / 2, w, h, rect_last);
+    rect_last = finalres;
+    boundingBoxRes = finalres;
+    return true;
+}
+
+float TrackerVitImpl::getTrackingScore()
+{
+    return tracking_score;
+}
+
+Ptr<TrackerVit> TrackerVit::create(const TrackerVit::Params& parameters)
+{
+    return makePtr<TrackerVitImpl>(parameters);
+}
+
+#else  // OPENCV_HAVE_DNN
+Ptr<TrackerVit> TrackerVit::create(const TrackerVit::Params& parameters)
+{
+    CV_UNUSED(parameters);
+    CV_Error(Error::StsNotImplemented, "to use vittrack, the tracking module needs to be built with opencv_dnn !");
+}
+#endif  // OPENCV_HAVE_DNN
+}
diff --git a/modules/video/src/variational_refinement.cpp b/modules/video/src/variational_refinement.cpp
index cca30f1ce75a..b1891c60df0c 100644
--- a/modules/video/src/variational_refinement.cpp
+++ b/modules/video/src/variational_refinement.cpp
@@ -76,6 +76,8 @@ class VariationalRefinementImpl CV_FINAL : public VariationalRefinement
     void setDelta(float val) CV_OVERRIDE { delta = val; }
     float getGamma() const CV_OVERRIDE { return gamma; }
     void setGamma(float val) CV_OVERRIDE { gamma = val; }
+    float getEpsilon() const CV_OVERRIDE { return epsilon; }
+    void setEpsilon(float val) CV_OVERRIDE { epsilon = val; }
 
   protected: //!< internal buffers
     /* This struct defines a special data layout for Mat_<float>. Original buffer is split into two: one for "red"
@@ -651,15 +653,15 @@ void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range
             pdU_vec = v_load(pdU + j);
             pdV_vec = v_load(pdV + j);
 
-            derivNorm_vec = pIx_vec * pIx_vec + pIy_vec * pIy_vec + zeta_vec;
-            Ik1z_vec = pIz_vec + pIx_vec * pdU_vec + pIy_vec * pdV_vec;
-            weight_vec = (delta_vec / v_sqrt(Ik1z_vec * Ik1z_vec / derivNorm_vec + eps_vec)) / derivNorm_vec;
+            derivNorm_vec = v_add(v_add(v_mul(pIx_vec, pIx_vec), v_mul(pIy_vec, pIy_vec)), zeta_vec);
+            Ik1z_vec = v_add(v_add(pIz_vec, v_mul(pIx_vec, pdU_vec)), v_mul(pIy_vec, pdV_vec));
+            weight_vec = v_div(v_div(delta_vec, v_sqrt(v_add(v_div(v_mul(Ik1z_vec, Ik1z_vec), derivNorm_vec), eps_vec))), derivNorm_vec);
 
-            pa11_vec = weight_vec * (pIx_vec * pIx_vec) + zeta_vec;
-            pa12_vec = weight_vec * (pIx_vec * pIy_vec);
-            pa22_vec = weight_vec * (pIy_vec * pIy_vec) + zeta_vec;
-            pb1_vec = zero_vec - weight_vec * (pIz_vec * pIx_vec);
-            pb2_vec = zero_vec - weight_vec * (pIz_vec * pIy_vec);
+            pa11_vec = v_add(v_mul(weight_vec, v_mul(pIx_vec, pIx_vec)), zeta_vec);
+            pa12_vec = v_mul(weight_vec, v_mul(pIx_vec, pIy_vec));
+            pa22_vec = v_add(v_mul(weight_vec, v_mul(pIy_vec, pIy_vec)), zeta_vec);
+            pb1_vec = v_sub(zero_vec, v_mul(weight_vec, v_mul(pIz_vec, pIx_vec)));
+            pb2_vec = v_sub(zero_vec, v_mul(weight_vec, v_mul(pIz_vec, pIy_vec)));
 
             pIxx_vec = v_load(pIxx + j);
             pIxy_vec = v_load(pIxy + j);
@@ -667,18 +669,17 @@ void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range
             pIxz_vec = v_load(pIxz + j);
             pIyz_vec = v_load(pIyz + j);
 
-            derivNorm_vec = pIxx_vec * pIxx_vec + pIxy_vec * pIxy_vec + zeta_vec;
-            derivNorm2_vec = pIyy_vec * pIyy_vec + pIxy_vec * pIxy_vec + zeta_vec;
-            Ik1zx_vec = pIxz_vec + pIxx_vec * pdU_vec + pIxy_vec * pdV_vec;
-            Ik1zy_vec = pIyz_vec + pIxy_vec * pdU_vec + pIyy_vec * pdV_vec;
-            weight_vec = gamma_vec / v_sqrt(Ik1zx_vec * Ik1zx_vec / derivNorm_vec +
-                                            Ik1zy_vec * Ik1zy_vec / derivNorm2_vec + eps_vec);
+            derivNorm_vec = v_add(v_add(v_mul(pIxx_vec, pIxx_vec), v_mul(pIxy_vec, pIxy_vec)), zeta_vec);
+            derivNorm2_vec = v_add(v_add(v_mul(pIyy_vec, pIyy_vec), v_mul(pIxy_vec, pIxy_vec)), zeta_vec);
+            Ik1zx_vec = v_add(v_add(pIxz_vec, v_mul(pIxx_vec, pdU_vec)), v_mul(pIxy_vec, pdV_vec));
+            Ik1zy_vec = v_add(v_add(pIyz_vec, v_mul(pIxy_vec, pdU_vec)), v_mul(pIyy_vec, pdV_vec));
+            weight_vec = v_div(gamma_vec, v_sqrt(v_add(v_add(v_div(v_mul(Ik1zx_vec, Ik1zx_vec), derivNorm_vec), v_div(v_mul(Ik1zy_vec, Ik1zy_vec), derivNorm2_vec)), eps_vec)));
 
-            pa11_vec += weight_vec * (pIxx_vec * pIxx_vec / derivNorm_vec + pIxy_vec * pIxy_vec / derivNorm2_vec);
-            pa12_vec += weight_vec * (pIxx_vec * pIxy_vec / derivNorm_vec + pIxy_vec * pIyy_vec / derivNorm2_vec);
-            pa22_vec += weight_vec * (pIxy_vec * pIxy_vec / derivNorm_vec + pIyy_vec * pIyy_vec / derivNorm2_vec);
-            pb1_vec -= weight_vec * (pIxx_vec * pIxz_vec / derivNorm_vec + pIxy_vec * pIyz_vec / derivNorm2_vec);
-            pb2_vec -= weight_vec * (pIxy_vec * pIxz_vec / derivNorm_vec + pIyy_vec * pIyz_vec / derivNorm2_vec);
+            pa11_vec = v_add(pa11_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxx_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIxy_vec), derivNorm2_vec))));
+            pa12_vec = v_add(pa12_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxy_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIyy_vec), derivNorm2_vec))));
+            pa22_vec = v_add(pa22_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxy_vec, pIxy_vec), derivNorm_vec), v_div(v_mul(pIyy_vec, pIyy_vec), derivNorm2_vec))));
+            pb1_vec = v_sub(pb1_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxz_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIyz_vec), derivNorm2_vec))));
+            pb2_vec = v_sub(pb2_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxy_vec, pIxz_vec), derivNorm_vec), v_div(v_mul(pIyy_vec, pIyz_vec), derivNorm2_vec))));
 
             v_store(pa11 + j, pa11_vec);
             v_store(pa12 + j, pa12_vec);
@@ -850,26 +851,26 @@ void VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::operator()
             cW_u_vec = v_load(cW_u + j);
             cW_v_vec = v_load(cW_v + j);
 
-            ux_vec = v_load(cW_u_next + j) - cW_u_vec;
-            vx_vec = v_load(cW_v_next + j) - cW_v_vec;
-            uy_vec = v_load(cW_u_next_row + j) - cW_u_vec;
-            vy_vec = v_load(cW_v_next_row + j) - cW_v_vec;
+            ux_vec = v_sub(v_load(cW_u_next + j), cW_u_vec);
+            vx_vec = v_sub(v_load(cW_v_next + j), cW_v_vec);
+            uy_vec = v_sub(v_load(cW_u_next_row + j), cW_u_vec);
+            vy_vec = v_sub(v_load(cW_v_next_row + j), cW_v_vec);
             pWeight_vec =
-              alpha2_vec / v_sqrt(ux_vec * ux_vec + vx_vec * vx_vec + uy_vec * uy_vec + vy_vec * vy_vec + eps_vec);
+              v_div(alpha2_vec, v_sqrt(v_add(v_add(v_add(v_add(v_mul(ux_vec, ux_vec), v_mul(vx_vec, vx_vec)), v_mul(uy_vec, uy_vec)), v_mul(vy_vec, vy_vec)), eps_vec)));
             v_store(pWeight + j, pWeight_vec);
 
-            ux_vec = pWeight_vec * (v_load(pW_u_next + j) - v_load(pW_u + j));
-            vx_vec = pWeight_vec * (v_load(pW_v_next + j) - v_load(pW_v + j));
+            ux_vec = v_mul(pWeight_vec, v_sub(v_load(pW_u_next + j), v_load(pW_u + j)));
+            vx_vec = v_mul(pWeight_vec, v_sub(v_load(pW_v_next + j), v_load(pW_v + j)));
 
-            v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec);
-            v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec);
-            v_store(pB_u + j, v_load(pB_u + j) + ux_vec);
-            v_store(pB_v + j, v_load(pB_v + j) + vx_vec);
+            v_store(pA_u + j, v_add(v_load(pA_u + j), pWeight_vec));
+            v_store(pA_v + j, v_add(v_load(pA_v + j), pWeight_vec));
+            v_store(pB_u + j, v_add(v_load(pB_u + j), ux_vec));
+            v_store(pB_v + j, v_add(v_load(pB_v + j), vx_vec));
 
-            v_store(pA_u_next + j, v_load(pA_u_next + j) + pWeight_vec);
-            v_store(pA_v_next + j, v_load(pA_v_next + j) + pWeight_vec);
-            v_store(pB_u_next + j, v_load(pB_u_next + j) - ux_vec);
-            v_store(pB_v_next + j, v_load(pB_v_next + j) - vx_vec);
+            v_store(pA_u_next + j, v_add(v_load(pA_u_next + j), pWeight_vec));
+            v_store(pA_v_next + j, v_add(v_load(pA_v_next + j), pWeight_vec));
+            v_store(pB_u_next + j, v_sub(v_load(pB_u_next + j), ux_vec));
+            v_store(pB_v_next + j, v_sub(v_load(pB_v_next + j), vx_vec));
         }
 #endif
         for (; j < len - 1; j++)
@@ -956,18 +957,18 @@ void VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::operator(
         for (; j < len - 3; j += 4)
         {
             pWeight_vec = v_load(pWeight + j);
-            uy_vec = pWeight_vec * (v_load(pW_u_next_row + j) - v_load(pW_u + j));
-            vy_vec = pWeight_vec * (v_load(pW_v_next_row + j) - v_load(pW_v + j));
-
-            v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec);
-            v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec);
-            v_store(pB_u + j, v_load(pB_u + j) + uy_vec);
-            v_store(pB_v + j, v_load(pB_v + j) + vy_vec);
-
-            v_store(pA_u_next_row + j, v_load(pA_u_next_row + j) + pWeight_vec);
-            v_store(pA_v_next_row + j, v_load(pA_v_next_row + j) + pWeight_vec);
-            v_store(pB_u_next_row + j, v_load(pB_u_next_row + j) - uy_vec);
-            v_store(pB_v_next_row + j, v_load(pB_v_next_row + j) - vy_vec);
+            uy_vec = v_mul(pWeight_vec, v_sub(v_load(pW_u_next_row + j), v_load(pW_u + j)));
+            vy_vec = v_mul(pWeight_vec, v_sub(v_load(pW_v_next_row + j), v_load(pW_v + j)));
+
+            v_store(pA_u + j, v_add(v_load(pA_u + j), pWeight_vec));
+            v_store(pA_v + j, v_add(v_load(pA_v + j), pWeight_vec));
+            v_store(pB_u + j, v_add(v_load(pB_u + j), uy_vec));
+            v_store(pB_v + j, v_add(v_load(pB_v + j), vy_vec));
+
+            v_store(pA_u_next_row + j, v_add(v_load(pA_u_next_row + j), pWeight_vec));
+            v_store(pA_v_next_row + j, v_add(v_load(pA_v_next_row + j), pWeight_vec));
+            v_store(pB_u_next_row + j, v_sub(v_load(pB_u_next_row + j), uy_vec));
+            v_store(pB_v_next_row + j, v_sub(v_load(pB_v_next_row + j), vy_vec));
         }
 #endif
         for (; j < len; j++)
@@ -1084,15 +1085,13 @@ void VariationalRefinementImpl::RedBlackSOR_ParBody::operator()(const Range &ran
             pdv_shifted_vec = v_reinterpret_as_f32(
               v_extract<3>(v_reinterpret_as_s32(pdv_prev_vec), v_reinterpret_as_s32(pdv_next_vec)));
 
-            sigmaU_vec = pW_shifted_vec * pdu_shifted_vec + pW_vec * pdu_next_vec + pW_prev_row_vec * pdu_prev_row_vec +
-                         pW_vec * pdu_next_row_vec;
-            sigmaV_vec = pW_shifted_vec * pdv_shifted_vec + pW_vec * pdv_next_vec + pW_prev_row_vec * pdv_prev_row_vec +
-                         pW_vec * pdv_next_row_vec;
+            sigmaU_vec = v_add(v_add(v_add(v_mul(pW_shifted_vec, pdu_shifted_vec), v_mul(pW_vec, pdu_next_vec)), v_mul(pW_prev_row_vec, pdu_prev_row_vec)), v_mul(pW_vec, pdu_next_row_vec));
+            sigmaV_vec = v_add(v_add(v_add(v_mul(pW_shifted_vec, pdv_shifted_vec), v_mul(pW_vec, pdv_next_vec)), v_mul(pW_prev_row_vec, pdv_prev_row_vec)), v_mul(pW_vec, pdv_next_row_vec));
 
             pdu_vec = v_load(pdu + j);
             pdv_vec = v_load(pdv + j);
-            pdu_vec += omega_vec * ((sigmaU_vec + v_load(pb1 + j) - pdv_vec * pa12_vec) / v_load(pa11 + j) - pdu_vec);
-            pdv_vec += omega_vec * ((sigmaV_vec + v_load(pb2 + j) - pdu_vec * pa12_vec) / v_load(pa22 + j) - pdv_vec);
+            pdu_vec = v_add(pdu_vec, v_mul(omega_vec, v_sub(v_div(v_sub(v_add(sigmaU_vec, v_load(pb1 + j)), v_mul(pdv_vec, pa12_vec)), v_load(pa11 + j)), pdu_vec)));
+            pdv_vec = v_add(pdv_vec, v_mul(omega_vec, v_sub(v_div(v_sub(v_add(sigmaV_vec, v_load(pb2 + j)), v_mul(pdu_vec, pa12_vec)), v_load(pa22 + j)), pdv_vec)));
             v_store(pdu + j, pdu_vec);
             v_store(pdv + j, pdv_vec);
 
diff --git a/modules/video/test/ocl/test_dis.cpp b/modules/video/test/ocl/test_dis.cpp
index 4df7f9a1970f..0a49452f13bf 100644
--- a/modules/video/test/ocl/test_dis.cpp
+++ b/modules/video/test/ocl/test_dis.cpp
@@ -46,7 +46,7 @@
 
 namespace opencv_test { namespace {
 
-CV_ENUM(DIS_TestPresets, DISOpticalFlow::PRESET_ULTRAFAST, DISOpticalFlow::PRESET_FAST, DISOpticalFlow::PRESET_MEDIUM);
+CV_ENUM(DIS_TestPresets, DISOpticalFlow::PRESET_ULTRAFAST, DISOpticalFlow::PRESET_FAST, DISOpticalFlow::PRESET_MEDIUM)
 
 typedef ocl::TSTestWithParam<DIS_TestPresets> OCL_DenseOpticalFlow_DIS;
 
diff --git a/modules/video/test/test_trackers.cpp b/modules/video/test/test_trackers.cpp
index 6ede40896cb9..aae4492bd724 100644
--- a/modules/video/test/test_trackers.cpp
+++ b/modules/video/test/test_trackers.cpp
@@ -160,4 +160,15 @@ TEST(NanoTrack, accuracy_NanoTrack_V2)
     checkTrackingAccuracy(tracker, 0.69);
 }
 
+TEST(vittrack, accuracy_vittrack)
+{
+    std::string model = cvtest::findDataFile("dnn/onnx/models/vitTracker.onnx");
+    cv::TrackerVit::Params params;
+    params.net = model;
+    cv::Ptr<Tracker> tracker = TrackerVit::create(params);
+    // NOTE: Test threshold was reduced from 0.67 (libjpeg-turbo) to 0.66 (libjpeg 9f),
+    // becase libjpeg and libjpeg-turbo produce slightly different images
+    checkTrackingAccuracy(tracker, 0.66);
+}
+
 }}  // namespace opencv_test::
diff --git a/modules/videoio/CMakeLists.txt b/modules/videoio/CMakeLists.txt
index eee706d306af..776c1bb84d74 100644
--- a/modules/videoio/CMakeLists.txt
+++ b/modules/videoio/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(VIDEOIO_ENABLE_PLUGINS_DEFAULT ON)
-if(EMSCRIPTEN OR IOS OR WINRT)
+if(EMSCRIPTEN OR IOS OR XROS OR WINRT)
   set(VIDEOIO_ENABLE_PLUGINS_DEFAULT OFF)
 endif()
 
@@ -197,7 +197,7 @@ if(TARGET ocv.3rdparty.aravis)
 endif()
 
 if(TARGET ocv.3rdparty.avfoundation)
-  if(IOS)
+  if(IOS OR XROS)
     list(APPEND videoio_srcs ${CMAKE_CURRENT_LIST_DIR}/src/cap_avfoundation.mm)
   else()
     list(APPEND videoio_srcs ${CMAKE_CURRENT_LIST_DIR}/src/cap_avfoundation_mac.mm)
@@ -236,25 +236,31 @@ if(TARGET ocv.3rdparty.android_native_camera)
 endif()
 
 if(TARGET ocv.3rdparty.obsensor)
-  list(APPEND videoio_srcs
-    ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor_capture.cpp)
-  list(APPEND videoio_hdrs
-    ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor_capture.hpp
-    ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_interface.hpp)
-  if(HAVE_OBSENSOR_MSMF)
+if(OBSENSOR_USE_ORBBEC_SDK)
     list(APPEND videoio_srcs
-      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_uvc_stream_channel.cpp
-      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_msmf.cpp)
-    list(APPEND videoio_hdrs
-      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_uvc_stream_channel.hpp
-      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_msmf.hpp)
-  elseif(HAVE_OBSENSOR_V4L2)
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor_liborbbec.hpp
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor_liborbbec.cpp)
+  else()
     list(APPEND videoio_srcs
-      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_uvc_stream_channel.cpp
-      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp)
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor_capture.cpp)
     list(APPEND videoio_hdrs
-      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_uvc_stream_channel.hpp
-      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_v4l2.hpp)
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor_capture.hpp
+      ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_interface.hpp)
+    if(HAVE_OBSENSOR_MSMF)
+      list(APPEND videoio_srcs
+        ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_uvc_stream_channel.cpp
+        ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_msmf.cpp)
+      list(APPEND videoio_hdrs
+        ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_uvc_stream_channel.hpp
+        ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_msmf.hpp)
+    elseif(HAVE_OBSENSOR_V4L2)
+      list(APPEND videoio_srcs
+        ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_uvc_stream_channel.cpp
+        ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_v4l2.cpp)
+      list(APPEND videoio_hdrs
+        ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_uvc_stream_channel.hpp
+        ${CMAKE_CURRENT_LIST_DIR}/src/cap_obsensor/obsensor_stream_channel_v4l2.hpp)
+    endif()
   endif()
   list(APPEND tgts ocv.3rdparty.obsensor)
 endif()
diff --git a/modules/videoio/cmake/detect_obsensor.cmake b/modules/videoio/cmake/detect_obsensor.cmake
index fe3f893b4896..cf5a9063a911 100644
--- a/modules/videoio/cmake/detect_obsensor.cmake
+++ b/modules/videoio/cmake/detect_obsensor.cmake
@@ -1,28 +1,54 @@
 # --- obsensor ---
 if(NOT HAVE_OBSENSOR)
-  if(WIN32)
-    check_include_file(mfapi.h HAVE_MFAPI)
-    check_include_file(vidcap.h HAVE_VIDCAP)
-    if(HAVE_MFAPI AND HAVE_VIDCAP)
-      set(HAVE_OBSENSOR TRUE)
-      set(HAVE_OBSENSOR_MSMF TRUE)
-      ocv_add_external_target(obsensor "" "" "HAVE_OBSENSOR;HAVE_OBSENSOR_MSMF")
-    else()
-      set(HAVE_OBSENSOR OFF)
-      set(HAVE_OBSENSOR_MSMF OFF)
-      if(NOT HAVE_MFAPI)
-        MESSAGE(STATUS "Could not find mfapi.h. Turning HAVE_OBSENSOR OFF")
-      endif()
-      if(NOT HAVE_VIDCAP)
-        MESSAGE(STATUS "Could not find vidcap.h. Turning HAVE_OBSENSOR OFF")
+  if(APPLE)
+    # force to use orbbec sdk on mac
+    set(OBSENSOR_USE_ORBBEC_SDK ON)
+  endif()
+
+  if(OBSENSOR_USE_ORBBEC_SDK)
+    include(${CMAKE_SOURCE_DIR}/3rdparty/orbbecsdk/orbbecsdk.cmake)
+    download_orbbec_sdk(ORBBEC_SDK_ROOT_DIR)
+    message(STATUS "ORBBEC_SDK_ROOT_DIR: ${ORBBEC_SDK_ROOT_DIR}")
+    if(ORBBEC_SDK_ROOT_DIR)
+      set(OrbbecSDK_DIR "${ORBBEC_SDK_ROOT_DIR}")
+      find_package(OrbbecSDK REQUIRED)
+      message(STATUS "OrbbecSDK_FOUND: ${OrbbecSDK_FOUND}")
+      message(STATUS "OrbbecSDK_INCLUDE_DIRS: ${OrbbecSDK_INCLUDE_DIRS}")
+      if(OrbbecSDK_FOUND)
+        set(HAVE_OBSENSOR TRUE)
+        set(HAVE_OBSENSOR_ORBBEC_SDK TRUE)
+        ocv_add_external_target(obsensor "${OrbbecSDK_INCLUDE_DIRS}" "${OrbbecSDK_LIBRARY}" "HAVE_OBSENSOR;HAVE_OBSENSOR_ORBBEC_SDK")
+        file(COPY ${OrbbecSDK_DLL_FILES} DESTINATION ${CMAKE_BINARY_DIR}/bin)
+        file(COPY ${OrbbecSDK_DLL_FILES} DESTINATION ${CMAKE_BINARY_DIR}/lib)
+        install(FILES ${OrbbecSDK_DLL_FILES} DESTINATION ${OPENCV_LIB_INSTALL_PATH})
+        ocv_install_3rdparty_licenses(OrbbecSDK ${OrbbecSDK_DIR}/LICENSE.txt)
       endif()
     endif()
-  elseif(UNIX)
-    check_include_file(linux/videodev2.h HAVE_CAMV4L2_OBSENSOR)
-    if(HAVE_CAMV4L2_OBSENSOR)
-      set(HAVE_OBSENSOR TRUE)
-      set(HAVE_OBSENSOR_V4L2 TRUE)
-      ocv_add_external_target(obsensor "" "" "HAVE_OBSENSOR;HAVE_OBSENSOR_V4L2")
+  else()
+    if(WIN32)
+      check_include_file(mfapi.h HAVE_MFAPI)
+      check_include_file(vidcap.h HAVE_VIDCAP)
+      if(HAVE_MFAPI AND HAVE_VIDCAP)
+        set(HAVE_OBSENSOR TRUE)
+        set(HAVE_OBSENSOR_MSMF TRUE)
+        ocv_add_external_target(obsensor "" "" "HAVE_OBSENSOR;HAVE_OBSENSOR_MSMF")
+      else()
+        set(HAVE_OBSENSOR OFF)
+        set(HAVE_OBSENSOR_MSMF OFF)
+        if(NOT HAVE_MFAPI)
+          MESSAGE(STATUS "Could not find mfapi.h. Turning HAVE_OBSENSOR OFF")
+        endif()
+        if(NOT HAVE_VIDCAP)
+          MESSAGE(STATUS "Could not find vidcap.h. Turning HAVE_OBSENSOR OFF")
+        endif()
+      endif()
+    elseif(UNIX)
+      check_include_file(linux/videodev2.h HAVE_CAMV4L2_OBSENSOR)
+      if(HAVE_CAMV4L2_OBSENSOR)
+        set(HAVE_OBSENSOR TRUE)
+        set(HAVE_OBSENSOR_V4L2 TRUE)
+        ocv_add_external_target(obsensor "" "" "HAVE_OBSENSOR;HAVE_OBSENSOR_V4L2")
+      endif()
     endif()
   endif()
 endif()
diff --git a/modules/videoio/doc/videoio_overview.markdown b/modules/videoio/doc/videoio_overview.markdown
index c61c36c5d8b7..041f2949a51e 100644
--- a/modules/videoio/doc/videoio_overview.markdown
+++ b/modules/videoio/doc/videoio_overview.markdown
@@ -1,12 +1,14 @@
 Video I/O with OpenCV Overview {#videoio_overview}
-===================================
+==============================
 
-### See also:
+@tableofcontents
+
+@sa
   - @ref videoio "Video I/O Code Reference"
   - Tutorials: @ref tutorial_table_of_content_app
 
 General Information
-===================
+-------------------
 
 The OpenCV @ref videoio module is a set of classes and functions to read and write video or images sequence.
 
@@ -53,10 +55,11 @@ cv::VideoCapture cap(filename, cv::CAP_MSMF);
 //or specify the apiPreference with open
 cap.open(filename, cv::CAP_MSMF);
 ```
-
 @sa cv::VideoCapture::open() , cv::VideoCapture::VideoCapture()
 
-#### How to enable backends
+
+How to enable backends
+----------------------
 
 There are two kinds of videoio backends: built-in backends and plugins which will be loaded at runtime (since OpenCV 4.1.0). Use functions cv::videoio_registry::getBackends, cv::videoio_registry::hasBackend and cv::videoio_registry::getBackendName to check actual presence of backend during runtime.
 
@@ -71,7 +74,9 @@ To enable dynamically-loaded videoio backend (currently supported: GStreamer and
 
 @note Don't forget to clean CMake cache when switching between these two modes
 
-#### Use 3rd party drivers or cameras
+
+Use 3rd party drivers or cameras
+--------------------------------
 
 Many industrial cameras or some video I/O devices don't provide standard driver interfaces
 for the operating system. Thus you can't use  VideoCapture or VideoWriter with these devices.
@@ -83,6 +88,7 @@ It is a common case that these libraries read/write images from/to a memory buff
 possible to make a `Mat` header for memory buffer (user-allocated data) and process it
 in-place using OpenCV functions. See cv::Mat::Mat() for more details.
 
+
 The FFmpeg library
 ------------------
 
diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp
index 1c3f0f5eb0ab..fb47036bbf56 100644
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@@ -108,7 +108,7 @@ enum VideoCaptureAPIs {
        CAP_PVAPI        = 800,          //!< PvAPI, Prosilica GigE SDK
        CAP_OPENNI       = 900,          //!< OpenNI (for Kinect)
        CAP_OPENNI_ASUS  = 910,          //!< OpenNI (for Asus Xtion)
-       CAP_ANDROID      = 1000,         //!< Android - not used
+       CAP_ANDROID      = 1000,         //!< MediaNDK (API Level 21+) and NDK Camera (API level 24+) for Android
        CAP_XIAPI        = 1100,         //!< XIMEA Camera API
        CAP_AVFOUNDATION = 1200,         //!< AVFoundation framework for iOS (OS X Lion will have the same API)
        CAP_GIGANETIX    = 1300,         //!< Smartek Giganetix GigEVisionSDK
@@ -128,7 +128,7 @@ enum VideoCaptureAPIs {
        CAP_INTEL_MFX    = 2300,         //!< Intel MediaSDK
        CAP_XINE         = 2400,         //!< XINE engine (Linux)
        CAP_UEYE         = 2500,         //!< uEye Camera API
-       CAP_OBSENSOR     = 2600,         //!< For Orbbec 3D-Sensor device/module (Astra+, Femto)
+       CAP_OBSENSOR     = 2600,         //!< For Orbbec 3D-Sensor device/module (Astra+, Femto, Astra2, Gemini2, Gemini2L, Gemini2XL, Femto Mega) attention: Astra2 cameras currently only support Windows and Linux kernel versions no higher than 4.15, and higher versions of Linux kernel may have exceptions.
      };
 
 
@@ -140,7 +140,7 @@ enum VideoCaptureAPIs {
 */
 enum VideoCaptureProperties {
        CAP_PROP_POS_MSEC       =0, //!< Current position of the video file in milliseconds.
-       CAP_PROP_POS_FRAMES     =1, //!< 0-based index of the frame to be decoded/captured next.
+       CAP_PROP_POS_FRAMES     =1, //!< 0-based index of the frame to be decoded/captured next. When the index i is set in RAW mode (CAP_PROP_FORMAT == -1) this will seek to the key frame k, where k <= i.
        CAP_PROP_POS_AVI_RATIO  =2, //!< Relative position of the video file: 0=start of the film, 1=end of the film.
        CAP_PROP_FRAME_WIDTH    =3, //!< Width of the frames in the video stream.
        CAP_PROP_FRAME_HEIGHT   =4, //!< Height of the frames in the video stream.
@@ -195,7 +195,7 @@ enum VideoCaptureProperties {
        CAP_PROP_HW_ACCELERATION_USE_OPENCL=52, //!< (**open-only**) If non-zero, create new OpenCL context and bind it to current thread. The OpenCL context created with Video Acceleration context attached it (if not attached yet) for optimized GPU data copy between HW accelerated decoder and cv::UMat.
        CAP_PROP_OPEN_TIMEOUT_MSEC=53, //!< (**open-only**) timeout in milliseconds for opening a video capture (applicable for FFmpeg and GStreamer back-ends only)
        CAP_PROP_READ_TIMEOUT_MSEC=54, //!< (**open-only**) timeout in milliseconds for reading from a video capture (applicable for FFmpeg and GStreamer back-ends only)
-       CAP_PROP_STREAM_OPEN_TIME_USEC =55, //<! (read-only) time in microseconds since Jan 1 1970 when stream was opened. Applicable for FFmpeg backend only. Useful for RTSP and other live streams
+       CAP_PROP_STREAM_OPEN_TIME_USEC =55, //!< (read-only) time in microseconds since Jan 1 1970 when stream was opened. Applicable for FFmpeg backend only. Useful for RTSP and other live streams
        CAP_PROP_VIDEO_TOTAL_CHANNELS = 56, //!< (read-only) Number of video channels
        CAP_PROP_VIDEO_STREAM = 57, //!< (**open-only**) Specify video stream, 0-based index. Use -1 to disable video stream from file or IP cameras. Default value is 0.
        CAP_PROP_AUDIO_STREAM = 58, //!< (**open-only**) Specify stream in multi-language media files, -1 - disable audio processing or microphone. Default value is -1.
@@ -225,10 +225,13 @@ enum VideoWriterProperties {
   VIDEOWRITER_PROP_NSTRIPES = 3,   //!< Number of stripes for parallel encoding. -1 for auto detection.
   VIDEOWRITER_PROP_IS_COLOR = 4,   //!< If it is not zero, the encoder will expect and encode color frames, otherwise it
                                    //!< will work with grayscale frames.
-  VIDEOWRITER_PROP_DEPTH = 5,      //!< Defaults to CV_8U.
+  VIDEOWRITER_PROP_DEPTH = 5,      //!< Defaults to \ref CV_8U.
   VIDEOWRITER_PROP_HW_ACCELERATION = 6, //!< (**open-only**) Hardware acceleration type (see #VideoAccelerationType). Setting supported only via `params` parameter in VideoWriter constructor / .open() method. Default value is backend-specific.
   VIDEOWRITER_PROP_HW_DEVICE       = 7, //!< (**open-only**) Hardware device index (select GPU if multiple available). Device enumeration is acceleration type specific.
   VIDEOWRITER_PROP_HW_ACCELERATION_USE_OPENCL= 8, //!< (**open-only**) If non-zero, create new OpenCL context and bind it to current thread. The OpenCL context created with Video Acceleration context attached it (if not attached yet) for optimized GPU data copy between cv::UMat and HW accelerated encoder.
+  VIDEOWRITER_PROP_RAW_VIDEO = 9, //!< (**open-only**) Set to non-zero to enable encapsulation of an encoded raw video stream. Each raw encoded video frame should be passed to VideoWriter::write() as single row or column of a \ref CV_8UC1 Mat. \note If the key frame interval is not 1 then it must be manually specified by the user. This can either be performed during initialization passing \ref VIDEOWRITER_PROP_KEY_INTERVAL as one of the extra encoder params  to \ref VideoWriter::VideoWriter(const String &, int, double, const Size &, const std::vector< int > &params) or afterwards by setting the \ref VIDEOWRITER_PROP_KEY_FLAG with \ref VideoWriter::set() before writing each frame. FFMpeg backend only.
+  VIDEOWRITER_PROP_KEY_INTERVAL = 10, //!< (**open-only**) Set the key frame interval using raw video encapsulation (\ref VIDEOWRITER_PROP_RAW_VIDEO != 0). Defaults to 1 when not set. FFMpeg backend only.
+  VIDEOWRITER_PROP_KEY_FLAG = 11, //!< Set to non-zero to signal that the following frames are key frames or zero if not, when encapsulating raw video (\ref VIDEOWRITER_PROP_RAW_VIDEO != 0). FFMpeg backend only.
 #ifndef CV_DOXYGEN
   CV__VIDEOWRITER_PROP_LATEST
 #endif
@@ -311,6 +314,10 @@ enum { CAP_PROP_OPENNI_OUTPUT_MODE       = 100,
        CAP_PROP_OPENNI2_MIRROR           = 111
      };
 
+#ifdef _MSC_VER
+#pragma warning( push )
+#pragma warning( disable: 5054 )
+#endif
 //! OpenNI shortcuts
 enum { CAP_OPENNI_IMAGE_GENERATOR_PRESENT         = CAP_OPENNI_IMAGE_GENERATOR + CAP_PROP_OPENNI_GENERATOR_PRESENT,
        CAP_OPENNI_IMAGE_GENERATOR_OUTPUT_MODE     = CAP_OPENNI_IMAGE_GENERATOR + CAP_PROP_OPENNI_OUTPUT_MODE,
@@ -321,6 +328,9 @@ enum { CAP_OPENNI_IMAGE_GENERATOR_PRESENT         = CAP_OPENNI_IMAGE_GENERATOR +
        CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION_ON = CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION,
        CAP_OPENNI_IR_GENERATOR_PRESENT            = CAP_OPENNI_IR_GENERATOR + CAP_PROP_OPENNI_GENERATOR_PRESENT,
      };
+#ifdef _MSC_VER
+#pragma warning( pop )
+#endif
 
 //! OpenNI data given from depth generator
 enum { CAP_OPENNI_DEPTH_MAP         = 0, //!< Depth values in mm (CV_16UC1)
@@ -950,7 +960,7 @@ class CV_EXPORTS_W VideoCapture
     CV_WRAP void setExceptionMode(bool enable) { throwOnFail = enable; }
 
     /// query if exception mode is active
-    CV_WRAP bool getExceptionMode() { return throwOnFail; }
+    CV_WRAP bool getExceptionMode() const { return throwOnFail; }
 
 
     /** @brief Wait for ready frames from VideoCapture.
@@ -1013,7 +1023,7 @@ class CV_EXPORTS_W VideoWriter
     VideoWriter::fourcc('P','I','M','1') is a MPEG-1 codec, VideoWriter::fourcc('M','J','P','G')
     is a motion-jpeg codec etc. List of codes can be obtained at
     [MSDN](https://docs.microsoft.com/en-us/windows/win32/medfound/video-fourccs) page
-    or with this [archived page](https://web.archive.org/web/20220316062600/http://www.fourcc.org/codecs.php)
+    or with this [page](https://fourcc.org/codecs.php)
     of the fourcc site for a more complete list). FFMPEG backend with MP4 container natively uses
     other values as fourcc code: see [ObjectType](http://mp4ra.org/#/codecs),
     so you may receive a warning message from OpenCV about fourcc code conversion.
@@ -1029,6 +1039,9 @@ class CV_EXPORTS_W VideoWriter
     - Most codecs are lossy. If you want lossless video file you need to use a lossless codecs
       (eg. FFMPEG FFV1, Huffman HFYU, Lagarith LAGS, etc...)
     - If FFMPEG is enabled, using `codec=0; fps=0;` you can create an uncompressed (raw) video file.
+    - If FFMPEG is used, we allow frames of odd width or height, but in this case we truncate
+      the rightmost column/the bottom row. Probably, this should be handled more elegantly,
+      but some internal functions inside FFMPEG swscale require even width/height.
     */
     CV_WRAP VideoWriter(const String& filename, int fourcc, double fps,
                 Size frameSize, bool isColor = true);
diff --git a/modules/videoio/include/opencv2/videoio/legacy/constants_c.h b/modules/videoio/include/opencv2/videoio/legacy/constants_c.h
index 91f85f87b8e1..f9831e358ae4 100644
--- a/modules/videoio/include/opencv2/videoio/legacy/constants_c.h
+++ b/modules/videoio/include/opencv2/videoio/legacy/constants_c.h
@@ -417,7 +417,7 @@ enum
 
 Simply call it with 4 chars fourcc code like `CV_FOURCC('I', 'Y', 'U', 'V')`
 
-List of codes can be obtained at [Video Codecs by FOURCC](http://www.fourcc.org/codecs.php) page.
+List of codes can be obtained at [Video Codecs by FOURCC](https://fourcc.org/codecs.php) page.
 FFMPEG backend with MP4 container natively uses other values as fourcc code:
 see [ObjectType](http://mp4ra.org/#/codecs).
 */
diff --git a/modules/videoio/include/opencv2/videoio/utils.private.hpp b/modules/videoio/include/opencv2/videoio/utils.private.hpp
new file mode 100644
index 000000000000..e331aaf2acb2
--- /dev/null
+++ b/modules/videoio/include/opencv2/videoio/utils.private.hpp
@@ -0,0 +1,15 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_VIDEOIO_UTILS_PRIVATE_HPP
+#define OPENCV_VIDEOIO_UTILS_PRIVATE_HPP
+
+#include "opencv2/core/cvdef.h"
+#include <string>
+
+namespace cv {
+CV_EXPORTS std::string icvExtractPattern(const std::string& filename, unsigned *offset);
+}
+
+#endif // OPENCV_VIDEOIO_UTILS_PRIVATE_HPP
diff --git a/modules/videoio/misc/objc/ios/CvPhotoCamera2.m b/modules/videoio/misc/objc/ios/CvPhotoCamera2.m
index 460cce6d32e5..281929c558ff 100644
--- a/modules/videoio/misc/objc/ios/CvPhotoCamera2.m
+++ b/modules/videoio/misc/objc/ios/CvPhotoCamera2.m
@@ -105,7 +105,7 @@ - (void)createStillImageOutput;
 {
     // setup still image output with jpeg codec
     self.stillImageOutput = [[AVCaptureStillImageOutput alloc] init];
-    NSDictionary *outputSettings = [NSDictionary dictionaryWithObjectsAndKeys:AVVideoCodecJPEG, AVVideoCodecKey, nil];
+    NSDictionary *outputSettings = [NSDictionary dictionaryWithObjectsAndKeys:AVVideoCodecTypeJPEG, AVVideoCodecKey, nil];
     [self.stillImageOutput setOutputSettings:outputSettings];
     [self.captureSession addOutput:self.stillImageOutput];
 
diff --git a/modules/videoio/misc/objc/ios/CvVideoCamera2.mm b/modules/videoio/misc/objc/ios/CvVideoCamera2.mm
index 7f4abdb5789c..188d6c5ec793 100644
--- a/modules/videoio/misc/objc/ios/CvVideoCamera2.mm
+++ b/modules/videoio/misc/objc/ios/CvVideoCamera2.mm
@@ -315,7 +315,7 @@ - (void)createVideoFileOutput {
     NSDictionary *outputSettings
      = [NSDictionary dictionaryWithObjectsAndKeys:[NSNumber numberWithInt:self.imageWidth], AVVideoWidthKey,
                                                   [NSNumber numberWithInt:self.imageHeight], AVVideoHeightKey,
-                                                  AVVideoCodecH264, AVVideoCodecKey,
+                                                  AVVideoCodecTypeH264, AVVideoCodecKey,
                                                   nil
      ];
 
diff --git a/modules/videoio/src/cap.cpp b/modules/videoio/src/cap.cpp
index be159062e3f2..f8572aad08f6 100644
--- a/modules/videoio/src/cap.cpp
+++ b/modules/videoio/src/cap.cpp
@@ -161,9 +161,9 @@ bool VideoCapture::open(const String& filename, int apiPreference, const std::ve
                     {
                         throw;
                     }
-                    CV_LOG_ERROR(NULL,
-                                 cv::format("VIDEOIO(%s): raised OpenCV exception:\n\n%s\n",
-                                            info.name, e.what()));
+                    CV_LOG_WARNING(NULL,
+                                   cv::format("VIDEOIO(%s): raised OpenCV exception:\n\n%s\n",
+                                              info.name, e.what()));
                 }
                 catch (const std::exception& e)
                 {
@@ -171,8 +171,8 @@ bool VideoCapture::open(const String& filename, int apiPreference, const std::ve
                     {
                         throw;
                     }
-                    CV_LOG_ERROR(NULL, cv::format("VIDEOIO(%s): raised C++ exception:\n\n%s\n",
-                                                  info.name, e.what()));
+                    CV_LOG_WARNING(NULL, cv::format("VIDEOIO(%s): raised C++ exception:\n\n%s\n",
+                                                    info.name, e.what()));
                 }
                 catch (...)
                 {
@@ -180,9 +180,9 @@ bool VideoCapture::open(const String& filename, int apiPreference, const std::ve
                     {
                         throw;
                     }
-                    CV_LOG_ERROR(NULL,
-                                 cv::format("VIDEOIO(%s): raised unknown C++ exception!\n\n",
-                                            info.name));
+                    CV_LOG_WARNING(NULL,
+                                   cv::format("VIDEOIO(%s): raised unknown C++ exception!\n\n",
+                                              info.name));
                 }
             }
             else
@@ -299,9 +299,9 @@ bool VideoCapture::open(int cameraNum, int apiPreference, const std::vector<int>
                     {
                         throw;
                     }
-                    CV_LOG_ERROR(NULL,
-                                 cv::format("VIDEOIO(%s): raised OpenCV exception:\n\n%s\n",
-                                            info.name, e.what()));
+                    CV_LOG_WARNING(NULL,
+                                   cv::format("VIDEOIO(%s): raised OpenCV exception:\n\n%s\n",
+                                              info.name, e.what()));
                 }
                 catch (const std::exception& e)
                 {
@@ -309,8 +309,8 @@ bool VideoCapture::open(int cameraNum, int apiPreference, const std::vector<int>
                     {
                         throw;
                     }
-                    CV_LOG_ERROR(NULL, cv::format("VIDEOIO(%s): raised C++ exception:\n\n%s\n",
-                                                  info.name, e.what()));
+                    CV_LOG_WARNING(NULL, cv::format("VIDEOIO(%s): raised C++ exception:\n\n%s\n",
+                                                    info.name, e.what()));
                 }
                 catch (...)
                 {
@@ -318,9 +318,9 @@ bool VideoCapture::open(int cameraNum, int apiPreference, const std::vector<int>
                     {
                         throw;
                     }
-                    CV_LOG_ERROR(NULL,
-                                 cv::format("VIDEOIO(%s): raised unknown C++ exception!\n\n",
-                                            info.name));
+                    CV_LOG_WARNING(NULL,
+                                   cv::format("VIDEOIO(%s): raised unknown C++ exception!\n\n",
+                                              info.name));
                 }
             }
             else
@@ -640,20 +640,20 @@ bool VideoWriter::open(const String& filename, int apiPreference, int fourcc, do
                 }
                 catch (const cv::Exception& e)
                 {
-                    CV_LOG_ERROR(NULL,
-                                 cv::format("VIDEOIO(%s): raised OpenCV exception:\n\n%s\n",
-                                            info.name, e.what()));
+                    CV_LOG_WARNING(NULL,
+                                   cv::format("VIDEOIO(%s): raised OpenCV exception:\n\n%s\n",
+                                              info.name, e.what()));
                 }
                 catch (const std::exception& e)
                 {
-                    CV_LOG_ERROR(NULL, cv::format("VIDEOIO(%s): raised C++ exception:\n\n%s\n",
-                                                  info.name, e.what()));
+                    CV_LOG_WARNING(NULL, cv::format("VIDEOIO(%s): raised C++ exception:\n\n%s\n",
+                                                    info.name, e.what()));
                 }
                 catch (...)
                 {
-                    CV_LOG_ERROR(NULL,
-                                 cv::format("VIDEOIO(%s): raised unknown C++ exception!\n\n",
-                                            info.name));
+                    CV_LOG_WARNING(NULL,
+                                   cv::format("VIDEOIO(%s): raised unknown C++ exception!\n\n",
+                                              info.name));
                 }
             }
             else
diff --git a/modules/videoio/src/cap_android_camera.cpp b/modules/videoio/src/cap_android_camera.cpp
index 84034e6208eb..5569b74144e3 100644
--- a/modules/videoio/src/cap_android_camera.cpp
+++ b/modules/videoio/src/cap_android_camera.cpp
@@ -117,10 +117,10 @@ static void OnDeviceError(void* /* ctx */, ACameraDevice* dev, int err) {
             LOGI("Camera in use");
             break;
         case ERROR_CAMERA_SERVICE:
-            LOGI("Fatal Error occured in Camera Service");
+            LOGI("Fatal Error occurred in Camera Service");
             break;
         case ERROR_CAMERA_DEVICE:
-            LOGI("Fatal Error occured in Camera Device");
+            LOGI("Fatal Error occurred in Camera Device");
             break;
         case ERROR_CAMERA_DISABLED:
             LOGI("Camera disabled");
@@ -269,7 +269,7 @@ class AndroidCameraCapture : public IVideoCapture
             if (mStatus != AMEDIA_OK) {
                 if (mStatus == AMEDIA_IMGREADER_NO_BUFFER_AVAILABLE) {
                     // this error is not fatal - we just need to wait for a buffer to become available
-                    LOGW("No Buffer Available error occured - waiting for callback");
+                    LOGW("No Buffer Available error occurred - waiting for callback");
                     waitingCapture = true;
                     captureSuccess = false;
                     auto start = std::chrono::system_clock::now();
@@ -533,6 +533,7 @@ class AndroidCameraCapture : public IVideoCapture
         cachedIndex = index;
         cameraManager = std::shared_ptr<ACameraManager>(ACameraManager_create(), deleter_ACameraManager);
         if (!cameraManager) {
+            LOGE("Cannot create camera manager!");
             return false;
         }
         ACameraIdList* cameraIds = nullptr;
@@ -591,6 +592,7 @@ class AndroidCameraCapture : public IVideoCapture
                 }
             }
         }
+        LOGI("Best resolution match: %dx%d", bestMatchWidth, bestMatchHeight);
 
         ACameraMetadata_const_entry val = { 0, };
         camera_status_t status = ACameraMetadata_getConstEntry(cameraMetadata.get(), ACAMERA_SENSOR_INFO_EXPOSURE_TIME_RANGE, &val);
@@ -654,7 +656,11 @@ class AndroidCameraCapture : public IVideoCapture
             return false;
         }
         sessionOutput = std::shared_ptr<ACaptureSessionOutput>(output, deleter_ACaptureSessionOutput);
-        ACaptureSessionOutputContainer_add(outputContainer.get(), sessionOutput.get());
+        cStatus = ACaptureSessionOutputContainer_add(outputContainer.get(), sessionOutput.get());
+        if (cStatus != ACAMERA_OK) {
+            LOGE("CaptureSessionOutput Container add failed with error code: %d", cStatus);
+            return false;
+        }
         sessionOutputAdded = true;
 
         ACameraOutputTarget* target;
diff --git a/modules/videoio/src/cap_avfoundation.mm b/modules/videoio/src/cap_avfoundation.mm
index 12fa42f5b52d..2df990392fa1 100644
--- a/modules/videoio/src/cap_avfoundation.mm
+++ b/modules/videoio/src/cap_avfoundation.mm
@@ -61,6 +61,8 @@
 
 #define DISABLE_AUTO_RESTART 999
 
+#if !TARGET_OS_VISION
+
 @interface CaptureDelegate : NSObject <AVCaptureVideoDataOutputSampleBufferDelegate>
 {
     int newFrame;
@@ -125,6 +127,7 @@ - (IplImage*)getOutput;
         int disableAutoRestart;
 };
 
+#endif
 
 /*****************************************************************************
  *
@@ -160,6 +163,7 @@ - (IplImage*)getOutput;
     uint32_t  mMode;
     int       mFormat;
 
+    void handleTracks(NSArray<AVAssetTrack *>* tracks, const char* filename);
     bool setupReadingAt(CMTime position);
     IplImage* retrieveFramePixelBuffer();
     int getPreferredOrientationDegrees() const;
@@ -217,15 +221,19 @@ - (IplImage*)getOutput;
 
 }
 
+
 cv::Ptr<cv::IVideoCapture> cv::create_AVFoundation_capture_cam(int index)
 {
+#if !TARGET_OS_VISION
     CvCaptureCAM* retval = new CvCaptureCAM(index);
     if (retval->didStart())
         return cv::makePtr<cv::LegacyCapture>(retval);
     delete retval;
+#endif
     return 0;
 }
 
+
 cv::Ptr<cv::IVideoWriter> cv::create_AVFoundation_writer(const std::string& filename, int fourcc,
                                                          double fps, const cv::Size &frameSize,
                                                          const cv::VideoWriterParameters& params)
@@ -245,6 +253,8 @@ - (IplImage*)getOutput;
  *
  *****************************************************************************/
 
+#if !TARGET_OS_VISION
+
 CvCaptureCAM::CvCaptureCAM(int cameraNum) {
     mCaptureSession = nil;
     mCaptureDeviceInput = nil;
@@ -773,6 +783,7 @@ -(int) updateImage {
 
 @end
 
+#endif
 
 /*****************************************************************************
  *
@@ -811,24 +822,26 @@ -(int) updateImage {
         return;
     }
 
-    NSArray *tracks = [mAsset tracksWithMediaType:AVMediaTypeVideo];
-    if ([tracks count] == 0) {
-        fprintf(stderr, "OpenCV: Couldn't read video stream from file \"%s\"\n", filename);
-        [localpool drain];
-        started = 0;
-        return;
-    }
-
-    mAssetTrack = [tracks[0] retain];
-
-    if ( ! setupReadingAt(kCMTimeZero) ) {
-        fprintf(stderr, "OpenCV: Couldn't read movie file \"%s\"\n", filename);
-        [localpool drain];
-        started = 0;
+// Available since iOS 15
+#if TARGET_OS_VISION || (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED >= 150000)
+    if (@available(iOS 15, *)) {
+        [mAsset loadTracksWithMediaType:AVMediaTypeVideo completionHandler:^(NSArray<AVAssetTrack *>* tracks, NSError* err) {
+            if (err != nil) {
+                handleTracks(tracks, filename);
+            }
+            [localpool drain];
+        }];
         return;
+    } else {
+#if !TARGET_OS_VISION
+        NSArray *tracks = [mAsset tracksWithMediaType:AVMediaTypeVideo];
+        handleTracks(tracks, filename);
+#endif
     }
-
-    started = 1;
+#else
+    NSArray *tracks = [mAsset tracksWithMediaType:AVMediaTypeVideo];
+    handleTracks(tracks, filename);
+#endif
     [localpool drain];
 }
 
@@ -850,6 +863,24 @@ -(int) updateImage {
     [localpool drain];
 }
 
+void CvCaptureFile::handleTracks(NSArray<AVAssetTrack *>* tracks, const char* filename) {
+    if ([tracks count] == 0) {
+        fprintf(stderr, "OpenCV: Couldn't read video stream from file \"%s\"\n", filename);
+        started = 0;
+        return;
+    }
+
+    mAssetTrack = [tracks[0] retain];
+
+    if ( ! setupReadingAt(kCMTimeZero) ) {
+        fprintf(stderr, "OpenCV: Couldn't read movie file \"%s\"\n", filename);
+        started = 0;
+        return;
+    }
+
+    started = 1;
+}
+
 bool CvCaptureFile::setupReadingAt(CMTime position) {
     if (mAssetReader) {
         if (mAssetReader.status == AVAssetReaderStatusReading) {
@@ -1269,25 +1300,25 @@ -(int) updateImage {
         //exception;
     }
 
-    // Three codec supported AVVideoCodecH264 AVVideoCodecJPEG AVVideoCodecTypeHEVC
+    // Three codec supported AVVideoCodecTypeH264 AVVideoCodecTypeJPEG AVVideoCodecTypeHEVC
     // On iPhone 3G H264 is not supported.
     if (fourcc == CV_FOURCC('J','P','E','G') || fourcc == CV_FOURCC('j','p','e','g') ||
             fourcc == CV_FOURCC('M','J','P','G') || fourcc == CV_FOURCC('m','j','p','g')){
-        codec = [AVVideoCodecJPEG copy]; // Use JPEG codec if specified, otherwise H264
+        codec = [AVVideoCodecTypeJPEG copy]; // Use JPEG codec if specified, otherwise H264
     }else if(fourcc == CV_FOURCC('H','2','6','4') || fourcc == CV_FOURCC('a','v','c','1')){
-            codec = [AVVideoCodecH264 copy];
+            codec = [AVVideoCodecTypeH264 copy];
 // Available since iOS 11
-#if defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED >= 110000
+#if TARGET_OS_VISION || (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED >= 110000)
     }else if(fourcc == CV_FOURCC('H','2','6','5') || fourcc == CV_FOURCC('h','v','c','1') ||
             fourcc == CV_FOURCC('H','E','V','C') || fourcc == CV_FOURCC('h','e','v','c')){
         if (@available(iOS 11, *)) {
             codec = [AVVideoCodecTypeHEVC copy];
         } else {
-            codec = [AVVideoCodecH264 copy];
+            codec = [AVVideoCodecTypeH264 copy];
         }
 #endif
     }else{
-        codec = [AVVideoCodecH264 copy]; // default canonical H264.
+        codec = [AVVideoCodecTypeH264 copy]; // default canonical H264.
     }
 
     //NSLog(@"Path: %@", path);
@@ -1349,17 +1380,17 @@ -(int) updateImage {
     NSAutoreleasePool* localpool = [[NSAutoreleasePool alloc] init];
 
     [mMovieWriterInput markAsFinished];
-    [mMovieWriter finishWriting];
-    [mMovieWriter release];
-    [mMovieWriterInput release];
-    [mMovieWriterAdaptor release];
-    [path release];
-    [codec release];
-    [fileType release];
-    cvReleaseImage(&argbimage);
-
-    [localpool drain];
+    [mMovieWriter finishWritingWithCompletionHandler:^() {
+        [mMovieWriter release];
+        [mMovieWriterInput release];
+        [mMovieWriterAdaptor release];
+        [path release];
+        [codec release];
+        [fileType release];
+        cvReleaseImage(&argbimage);
 
+        [localpool drain];
+    }];
 }
 
 bool CvVideoWriter_AVFoundation::writeFrame(const IplImage* iplimage) {
diff --git a/modules/videoio/src/cap_avfoundation_mac.mm b/modules/videoio/src/cap_avfoundation_mac.mm
index c0ad4810d4b5..98df630c746e 100644
--- a/modules/videoio/src/cap_avfoundation_mac.mm
+++ b/modules/videoio/src/cap_avfoundation_mac.mm
@@ -1220,13 +1220,13 @@ -(int) updateImage {
         is_good = false;
     }
 
-    // Three codec supported AVVideoCodecH264 AVVideoCodecJPEG AVVideoCodecTypeHEVC
+    // Three codec supported AVVideoCodecTypeH264 AVVideoCodecTypeJPEG AVVideoCodecTypeHEVC
     // On iPhone 3G H264 is not supported.
     if (fourcc == CV_FOURCC('J','P','E','G') || fourcc == CV_FOURCC('j','p','e','g') ||
             fourcc == CV_FOURCC('M','J','P','G') || fourcc == CV_FOURCC('m','j','p','g')){
-        codec = [AVVideoCodecJPEG copy]; // Use JPEG codec if specified, otherwise H264
+        codec = [AVVideoCodecTypeJPEG copy]; // Use JPEG codec if specified, otherwise H264
     }else if(fourcc == CV_FOURCC('H','2','6','4') || fourcc == CV_FOURCC('a','v','c','1')){
-            codec = [AVVideoCodecH264 copy];
+            codec = [AVVideoCodecTypeH264 copy];
     // Available since macOS 10.13
 #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED >= 101300
     }else if(fourcc == CV_FOURCC('H','2','6','5') || fourcc == CV_FOURCC('h','v','c','1') ||
diff --git a/modules/videoio/src/cap_dshow.cpp b/modules/videoio/src/cap_dshow.cpp
index d6b2b9554561..21af06a147d4 100644
--- a/modules/videoio/src/cap_dshow.cpp
+++ b/modules/videoio/src/cap_dshow.cpp
@@ -2771,7 +2771,7 @@ int videoInput::start(int deviceID, videoDevice *VD){
     if(customSize){
         DebugPrintOut("SETUP: Default Format is set to %ix%i\n", currentWidth, currentHeight);
 
-        if (strcmp("OBS Virtual Camera", VD->nDeviceName) == 0)
+        if (strcmp("OBS Virtual Camera", VD->nDeviceName) == 0 || strcmp("Streamlabs Desktop Virtual Webcam", VD->nDeviceName) == 0)
         {
             // OBS Virtual Camera always returns S_OK on SetFormat(), even if it doesn't support
             // the actual format. So we have to choose a format that it supports manually, e.g. NV12.
diff --git a/modules/videoio/src/cap_ffmpeg.cpp b/modules/videoio/src/cap_ffmpeg.cpp
index 764bc3386468..ba509a3c4250 100644
--- a/modules/videoio/src/cap_ffmpeg.cpp
+++ b/modules/videoio/src/cap_ffmpeg.cpp
@@ -198,7 +198,11 @@ class CvVideoWriter_FFMPEG_proxy CV_FINAL :
         return ffmpegWriter->getProperty(propId);
     }
 
-    virtual bool setProperty(int, double) CV_OVERRIDE { return false; }
+    virtual bool setProperty(int propId, double value) CV_OVERRIDE {
+        if (!ffmpegWriter)
+            return 0;
+        return ffmpegWriter->setProperty(propId, value);
+    }
     virtual bool isOpened() const CV_OVERRIDE { return ffmpegWriter != 0; }
 
 protected:
diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index 982bc5c87df0..0546fbdfbedc 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -428,11 +428,15 @@ inline const char* _opencv_avcodec_get_name(CV_CODEC_ID id)
 }
 
 
-static
-inline int _opencv_ffmpeg_interrupt_callback(void *ptr)
+static int _opencv_ffmpeg_interrupt_callback(void *ptr)
 {
     AVInterruptCallbackMetadata* metadata = (AVInterruptCallbackMetadata*)ptr;
-    CV_Assert(metadata);
+
+    if(!metadata)
+    {
+        CV_LOG_WARNING(NULL, "Stream timeout without metadata passed");
+        return 0;
+    }
 
     if (metadata->timeout_after_ms == 0)
     {
@@ -442,9 +446,15 @@ inline int _opencv_ffmpeg_interrupt_callback(void *ptr)
     timespec now;
     get_monotonic_time(&now);
 
-    metadata->timeout = get_monotonic_time_diff_ms(metadata->value, now) > metadata->timeout_after_ms;
+    double timeout = get_monotonic_time_diff_ms(metadata->value, now);
+    metadata->timeout = timeout > metadata->timeout_after_ms;
+    if (metadata->timeout)
+    {
+        CV_LOG_WARNING(NULL, cv::format("Stream timeout triggered after %lf ms", timeout));
+        return -1;
+    }
 
-    return metadata->timeout ? -1 : 0;
+    return 0;
 }
 #endif
 
@@ -580,6 +590,7 @@ struct CvCapture_FFMPEG
     bool processRawPacket();
     bool rawMode;
     bool rawModeInitialized;
+    bool rawSeek;
     bool convertRGB;
     AVPacket packet_filtered;
 #if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(58, 20, 100)
@@ -633,6 +644,7 @@ void CvCapture_FFMPEG::init()
 
     rawMode = false;
     rawModeInitialized = false;
+    rawSeek = false;
     convertRGB = true;
     memset(&packet_filtered, 0, sizeof(packet_filtered));
     av_init_packet(&packet_filtered);
@@ -915,7 +927,6 @@ class InternalFFMpegRegister
         if(!threadSafe)
             lock.lock();
         static InternalFFMpegRegister instance;
-        initLogger_();  // update logger setup unconditionally (GStreamer's libav plugin may override these settings)
     }
     static void initLogger_()
     {
@@ -953,6 +964,7 @@ class InternalFFMpegRegister
         /* register a callback function for synchronization */
         av_lockmgr_register(&LockCallBack);
 #endif
+        initLogger_();
     }
     ~InternalFFMpegRegister()
     {
@@ -1051,33 +1063,35 @@ bool CvCapture_FFMPEG::open(const char* _filename, const VideoCaptureParameters&
                 return false;
             }
         }
-        if (params.has(CAP_PROP_HW_ACCELERATION))
-        {
-            va_type = params.get<VideoAccelerationType>(CAP_PROP_HW_ACCELERATION);
-#if !USE_AV_HW_CODECS
-            if (va_type != VIDEO_ACCELERATION_NONE && va_type != VIDEO_ACCELERATION_ANY)
+        if(!rawMode) {
+            if (params.has(CAP_PROP_HW_ACCELERATION))
             {
-                CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: FFmpeg backend is build without acceleration support. Can't handle CAP_PROP_HW_ACCELERATION parameter. Bailout");
-                return false;
-            }
+                va_type = params.get<VideoAccelerationType>(CAP_PROP_HW_ACCELERATION);
+#if !USE_AV_HW_CODECS
+                if (va_type != VIDEO_ACCELERATION_NONE && va_type != VIDEO_ACCELERATION_ANY)
+                {
+                    CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: FFmpeg backend is build without acceleration support. Can't handle CAP_PROP_HW_ACCELERATION parameter. Bailout");
+                    return false;
+                }
 #endif
-        }
-        if (params.has(CAP_PROP_HW_DEVICE))
-        {
-            hw_device = params.get<int>(CAP_PROP_HW_DEVICE);
-            if (va_type == VIDEO_ACCELERATION_NONE && hw_device != -1)
-            {
-                CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: Invalid usage of CAP_PROP_HW_DEVICE without requested H/W acceleration. Bailout");
-                return false;
             }
-            if (va_type == VIDEO_ACCELERATION_ANY && hw_device != -1)
+            if (params.has(CAP_PROP_HW_DEVICE))
             {
-                CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: Invalid usage of CAP_PROP_HW_DEVICE with 'ANY' H/W acceleration. Bailout");
-                return false;
+                hw_device = params.get<int>(CAP_PROP_HW_DEVICE);
+                if (va_type == VIDEO_ACCELERATION_NONE && hw_device != -1)
+                {
+                    CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: Invalid usage of CAP_PROP_HW_DEVICE without requested H/W acceleration. Bailout");
+                    return false;
+                }
+                if (va_type == VIDEO_ACCELERATION_ANY && hw_device != -1)
+                {
+                    CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: Invalid usage of CAP_PROP_HW_DEVICE with 'ANY' H/W acceleration. Bailout");
+                    return false;
+                }
+            }
+            if (params.has(CAP_PROP_HW_ACCELERATION_USE_OPENCL)) {
+                use_opencl = params.get<int>(CAP_PROP_HW_ACCELERATION_USE_OPENCL);
             }
-        }
-        if (params.has(CAP_PROP_HW_ACCELERATION_USE_OPENCL)) {
-            use_opencl = params.get<int>(CAP_PROP_HW_ACCELERATION_USE_OPENCL);
         }
 #if USE_AV_INTERRUPT_CALLBACK
         if (params.has(CAP_PROP_OPEN_TIMEOUT_MSEC))
@@ -1153,6 +1167,23 @@ bool CvCapture_FFMPEG::open(const char* _filename, const VideoCaptureParameters&
         CV_LOG_WARNING(NULL, "Unable to read codec parameters from stream (" << _opencv_ffmpeg_get_error_string(err) << ")");
         goto exit_func;
     }
+
+    if (rawMode) {
+        video_stream = av_find_best_stream(ic, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0);
+        if (video_stream < 0) {
+            close();
+            return false;
+        }
+        video_st = ic->streams[video_stream];
+#ifndef CV_FFMPEG_CODECPAR
+        frame.height = video_st->codec->height;
+        frame.width = video_st->codec->width;
+#else
+        frame.height = video_st->codecpar->height;
+        frame.width = video_st->codecpar->width;
+#endif
+        return true;
+    }
     for(i = 0; i < ic->nb_streams; i++)
     {
 #ifndef CV_FFMPEG_CODECPAR
@@ -1426,7 +1457,8 @@ bool CvCapture_FFMPEG::processRawPacket()
 #else
         AVCodecContext* ctx = ic->streams[video_stream]->codec;
         int err = av_bitstream_filter_filter(bsfc, ctx, NULL, &packet_filtered.data,
-            &packet_filtered.size, packet.data, packet.size, packet_filtered.flags & AV_PKT_FLAG_KEY);
+            &packet_filtered.size, packet.data, packet.size, packet.flags & AV_PKT_FLAG_KEY);
+        if (packet.flags & AV_PKT_FLAG_KEY) packet_filtered.flags |= AV_PKT_FLAG_KEY;
         if (err < 0)
         {
             CV_WARN("Packet filtering failed");
@@ -1440,6 +1472,10 @@ bool CvCapture_FFMPEG::processRawPacket()
 
 bool CvCapture_FFMPEG::grabFrame()
 {
+    if (rawSeek) {
+        rawSeek = false;
+        return true;
+    }
     bool valid = false;
 
     static const size_t max_read_attempts = cv::utils::getConfigurationParameterSizeT("OPENCV_FFMPEG_READ_ATTEMPTS", 4096);
@@ -1447,7 +1483,7 @@ bool CvCapture_FFMPEG::grabFrame()
     size_t cur_read_attempts = 0;
     size_t cur_decode_attempts = 0;
 
-    if( !ic || !video_st || !context )  return false;
+    if( !ic || !video_st || (!rawMode && !context) )  return false;
 
     if( ic->streams[video_stream]->nb_frames > 0 &&
         frame_number > ic->streams[video_stream]->nb_frames )
@@ -1464,7 +1500,7 @@ bool CvCapture_FFMPEG::grabFrame()
 
 #if USE_AV_SEND_FRAME_API
     // check if we can receive frame from previously decoded packet
-    valid = avcodec_receive_frame(context, picture) >= 0;
+    valid = rawMode ? false : avcodec_receive_frame(context, picture) >= 0;
 #endif
 
     // get the next frame
@@ -1548,12 +1584,19 @@ bool CvCapture_FFMPEG::grabFrame()
     }
 
     if (valid) {
-        if( picture_pts == AV_NOPTS_VALUE_ )
-            picture_pts = picture->CV_FFMPEG_PTS_FIELD != AV_NOPTS_VALUE_ && picture->CV_FFMPEG_PTS_FIELD != 0 ? picture->CV_FFMPEG_PTS_FIELD : picture->pkt_dts;
-        frame_number++;
+        if (picture_pts == AV_NOPTS_VALUE_) {
+            if (!rawMode)
+                picture_pts = picture->CV_FFMPEG_PTS_FIELD != AV_NOPTS_VALUE_ && picture->CV_FFMPEG_PTS_FIELD != 0 ? picture->CV_FFMPEG_PTS_FIELD : picture->pkt_dts;
+            else {
+                const AVPacket& packet_raw = packet.data != 0 ? packet : packet_filtered;
+                picture_pts = packet_raw.pts != AV_NOPTS_VALUE_ && packet_raw.pts != 0 ? packet_raw.pts : packet_raw.dts;
+                if (picture_pts < 0) picture_pts = 0;
+            }
+            frame_number++;
+        }
     }
 
-    if (!rawMode && valid && first_frame_number < 0)
+    if (valid && first_frame_number < 0)
         first_frame_number = dts_to_frame_number(picture_pts);
 
 #if USE_AV_INTERRUPT_CALLBACK
@@ -1567,7 +1610,7 @@ bool CvCapture_FFMPEG::grabFrame()
 
 bool CvCapture_FFMPEG::retrieveFrame(int flag, unsigned char** data, int* step, int* width, int* height, int* cn, int* depth)
 {
-    if (!video_st || !context)
+    if (!video_st || (!rawMode && !context))
         return false;
 
     if (rawMode || flag == extraDataIdx)
@@ -1735,7 +1778,7 @@ static inline double getCodecIdFourcc(const AVCodecID codec_id)
 
 double CvCapture_FFMPEG::getProperty( int property_id ) const
 {
-    if( !video_st || !context ) return 0;
+    if( !video_st || (!rawMode && !context) ) return 0;
 
     switch( property_id )
     {
@@ -1814,7 +1857,8 @@ double CvCapture_FFMPEG::getProperty( int property_id ) const
         //ic->start_time_realtime is in microseconds
         return ((double)ic->start_time_realtime);
     case CAP_PROP_N_THREADS:
-        return static_cast<double>(context->thread_count);
+        if (!rawMode)
+            return static_cast<double>(context->thread_count);
     default:
         break;
     }
@@ -1846,15 +1890,16 @@ int64_t CvCapture_FFMPEG::get_bitrate() const
 
 double CvCapture_FFMPEG::get_fps() const
 {
-#if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(55, 1, 100) && LIBAVFORMAT_VERSION_MICRO >= 100
-    double fps = r2d(av_guess_frame_rate(ic, ic->streams[video_stream], NULL));
-#else
+#if LIBAVCODEC_BUILD >= CALC_FFMPEG_VERSION(54, 1, 0) || LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(52, 111, 0)
     double fps = r2d(ic->streams[video_stream]->avg_frame_rate);
+#else
+    double fps = r2d(ic->streams[video_stream]->r_frame_rate);
+#endif
 
-#if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(52, 111, 0)
+#if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(55, 1, 100) && LIBAVFORMAT_VERSION_MICRO >= 100
     if (fps < eps_zero)
     {
-        fps = r2d(ic->streams[video_stream]->avg_frame_rate);
+        fps = r2d(av_guess_frame_rate(ic, ic->streams[video_stream], NULL));
     }
 #endif
 
@@ -1862,7 +1907,7 @@ double CvCapture_FFMPEG::get_fps() const
     {
         fps = 1.0 / r2d(ic->streams[video_stream]->time_base);
     }
-#endif
+
     return fps;
 }
 
@@ -1910,9 +1955,11 @@ void CvCapture_FFMPEG::get_rotation_angle()
 
 void CvCapture_FFMPEG::seek(int64_t _frame_number)
 {
-    CV_Assert(context);
+    if (!rawMode) {
+        CV_Assert(context);
+    }
     _frame_number = std::min(_frame_number, get_total_frames());
-    int delta = 16;
+    int delta = !rawMode ? 16 : 0;
 
     // if we have not grabbed a single frame before first seek, let's read the first frame
     // and get some valuable information during the process
@@ -1927,7 +1974,8 @@ void CvCapture_FFMPEG::seek(int64_t _frame_number)
         double  time_base  = r2d(ic->streams[video_stream]->time_base);
         time_stamp += (int64_t)(sec / time_base + 0.5);
         if (get_total_frames() > 1) av_seek_frame(ic, video_stream, time_stamp, AVSEEK_FLAG_BACKWARD);
-        avcodec_flush_buffers(context);
+        if(!rawMode)
+            avcodec_flush_buffers(context);
         if( _frame_number > 0 )
         {
             grabFrame();
@@ -1935,6 +1983,10 @@ void CvCapture_FFMPEG::seek(int64_t _frame_number)
             if( _frame_number > 1 )
             {
                 frame_number = dts_to_frame_number(picture_pts) - first_frame_number;
+                if (rawMode) {
+                    rawSeek = true;
+                    break;
+                }
                 //printf("_frame_number = %d, frame_number = %d, delta = %d\n",
                 //       (int)_frame_number, (int)frame_number, delta);
 
@@ -2032,6 +2084,7 @@ struct CvVideoWriter_FFMPEG
     bool writeFrame( const unsigned char* data, int step, int width, int height, int cn, int origin );
     bool writeHWFrame(cv::InputArray input);
     double getProperty(int propId) const;
+    bool setProperty(int, double);
 
     void init();
 
@@ -2055,6 +2108,9 @@ struct CvVideoWriter_FFMPEG
     VideoAccelerationType va_type;
     int               hw_device;
     int               use_opencl;
+    bool              encode_video;
+    int               idr_period;
+    bool              key_frame;
 };
 
 static const char * icvFFMPEGErrStr(int err)
@@ -2120,6 +2176,9 @@ void CvVideoWriter_FFMPEG::init()
     hw_device = -1;
     use_opencl = 0;
     ok = false;
+    encode_video = true;
+    idr_period = 0;
+    key_frame = false;
 }
 
 /**
@@ -2165,7 +2224,7 @@ static AVCodecContext * icv_configure_video_stream_FFMPEG(AVFormatContext *oc,
                                                    AVStream *st,
                                                    const AVCodec* codec,
                                                    int w, int h, int bitrate,
-                                                   double fps, AVPixelFormat pixel_format, int fourcc)
+                                                   double fps, AVPixelFormat pixel_format, int fourcc, AVCodecID codec_id)
 {
 #ifdef CV_FFMPEG_CODECPAR
     AVCodecContext *c = avcodec_alloc_context3(codec);
@@ -2176,9 +2235,7 @@ static AVCodecContext * icv_configure_video_stream_FFMPEG(AVFormatContext *oc,
 
     int frame_rate, frame_rate_base;
 
-    c->codec_id = codec->id;
-    c->codec_type = AVMEDIA_TYPE_VIDEO;
-    c->codec_tag = fourcc;
+    c->codec_id = codec ? codec->id : codec_id;
 
 #ifndef CV_FFMPEG_CODECPAR
     // Set per-codec defaults
@@ -2188,6 +2245,9 @@ static AVCodecContext * icv_configure_video_stream_FFMPEG(AVFormatContext *oc,
     c->codec_id = c_id;
 #endif
 
+    c->codec_type = AVMEDIA_TYPE_VIDEO;
+    c->codec_tag = fourcc;
+
     /* put sample parameters */
     int64_t lbit_rate = (int64_t)bitrate;
     lbit_rate += (bitrate / 2);
@@ -2286,6 +2346,29 @@ static AVCodecContext * icv_configure_video_stream_FFMPEG(AVFormatContext *oc,
 
 static const int OPENCV_NO_FRAMES_WRITTEN_CODE = 1000;
 
+static int icv_av_encapsulate_video_FFMPEG(AVFormatContext* oc, AVStream* video_st, AVCodecContext* c,
+    uint8_t* data, int sz, const int frame_idx, const bool key_frame)
+{
+#if LIBAVFORMAT_BUILD < CALC_FFMPEG_VERSION(57, 0, 0)
+    AVPacket pkt_;
+    av_init_packet(&pkt_);
+    AVPacket* pkt = &pkt_;
+#else
+    AVPacket* pkt = av_packet_alloc();
+#endif
+    if(key_frame)
+        pkt->flags |= PKT_FLAG_KEY;
+    pkt->pts = frame_idx;
+    pkt->size = sz;
+    pkt->data = data;
+    av_packet_rescale_ts(pkt, c->time_base, video_st->time_base);
+    int ret = av_write_frame(oc, pkt);
+#if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(57, 0, 0)
+    av_packet_free(&pkt);
+#endif
+    return ret;
+}
+
 static int icv_av_write_frame_FFMPEG( AVFormatContext * oc, AVStream * video_st, AVCodecContext * c,
                                       uint8_t *, uint32_t,
                                       AVFrame * picture, int frame_idx)
@@ -2367,6 +2450,14 @@ static int icv_av_write_frame_FFMPEG( AVFormatContext * oc, AVStream * video_st,
 /// write a frame with FFMPEG
 bool CvVideoWriter_FFMPEG::writeFrame( const unsigned char* data, int step, int width, int height, int cn, int origin )
 {
+    if (!encode_video) {
+        CV_Assert(cn == 1 && ((width > 0 && height == 1) || (width == 1 && height > 0 && step == 1)));
+        const bool set_key_frame = key_frame ? key_frame : idr_period ? frame_idx % idr_period == 0 : 1;
+        bool ret = icv_av_encapsulate_video_FFMPEG(oc, video_st, context, (uint8_t*)data, width, frame_idx, set_key_frame);
+        frame_idx++;
+        return ret;
+    }
+
     // check parameters
     if (input_pix_fmt == AV_PIX_FMT_BGR24) {
         if (cn != 3) {
@@ -2555,6 +2646,21 @@ double CvVideoWriter_FFMPEG::getProperty(int propId) const
     return 0;
 }
 
+bool CvVideoWriter_FFMPEG::setProperty(int property_id, double value)
+{
+    if (!video_st) return false;
+
+    switch (property_id)
+    {
+    case VIDEOWRITER_PROP_KEY_FLAG:
+        key_frame = static_cast<bool>(value);
+        break;
+    default:
+        return false;
+    }
+    return true;
+}
+
 /// close video output stream and free associated memory
 void CvVideoWriter_FFMPEG::close()
 {
@@ -2564,17 +2670,19 @@ void CvVideoWriter_FFMPEG::close()
     // TODO -- do we need to account for latency here?
 
     /* write the trailer, if any */
-    if (picture && ok && oc)
+    if ((!encode_video || picture) && ok && oc)
     {
 #if LIBAVFORMAT_BUILD < CALC_FFMPEG_VERSION(57, 0, 0)
         if (!(oc->oformat->flags & AVFMT_RAWPICTURE))
 #endif
         {
-            for(;;)
-            {
-                int ret = icv_av_write_frame_FFMPEG( oc, video_st, context, outbuf, outbuf_size, NULL, frame_idx);
-                if( ret == OPENCV_NO_FRAMES_WRITTEN_CODE || ret < 0 )
-                    break;
+            if (encode_video) {
+                for (;;)
+                {
+                    int ret = icv_av_write_frame_FFMPEG(oc, video_st, context, outbuf, outbuf_size, NULL, frame_idx);
+                    if (ret == OPENCV_NO_FRAMES_WRITTEN_CODE || ret < 0)
+                        break;
+                }
             }
         }
         av_write_trailer(oc);
@@ -2683,6 +2791,8 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
 
     close();
 
+    encode_video = !params.get(VIDEOWRITER_PROP_RAW_VIDEO, false);
+    idr_period = params.get(VIDEOWRITER_PROP_KEY_INTERVAL, 0);
     const bool is_color = params.get(VIDEOWRITER_PROP_IS_COLOR, true);
     const int depth = params.get(VIDEOWRITER_PROP_DEPTH, CV_8U);
     const bool is_supported = depth == CV_8U || (depth == CV_16U && !is_color);
@@ -2733,13 +2843,15 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
     if(fps <= 0)
         return false;
 
-    // we allow frames of odd width or height, but in this case we truncate
-    // the rightmost column/the bottom row. Probably, this should be handled more elegantly,
-    // but some internal functions inside FFMPEG swscale require even width/height.
-    width &= -2;
-    height &= -2;
-    if( width <= 0 || height <= 0 )
-        return false;
+    if (encode_video) {
+        // we allow frames of odd width or height, but in this case we truncate
+        // the rightmost column/the bottom row. Probably, this should be handled more elegantly,
+        // but some internal functions inside FFMPEG swscale require even width/height.
+        width &= -2;
+        height &= -2;
+        if (width <= 0 || height <= 0)
+            return false;
+    }
 
     /* auto detect the output format from the name and fourcc code. */
 
@@ -2990,41 +3102,46 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
     HWAccelIterator accel_iter(va_type, true/*isEncoder*/, dict);
     while (accel_iter.good())
     {
+        AVPixelFormat hw_format = AV_PIX_FMT_NONE;
+        AVHWDeviceType hw_type = AV_HWDEVICE_TYPE_NONE;
 #else
     do {
 #endif
+        if (encode_video) {
 #if USE_AV_HW_CODECS
-        accel_iter.parse_next();
-        AVHWDeviceType hw_type = accel_iter.hw_type();
-        codec = NULL;
-        AVPixelFormat hw_format = AV_PIX_FMT_NONE;
-        if (hw_device_ctx)
-            av_buffer_unref(&hw_device_ctx);
-        if (hw_type != AV_HWDEVICE_TYPE_NONE)
-        {
-            codec = hw_find_codec(codec_id, hw_type, av_codec_is_encoder, accel_iter.disabled_codecs().c_str(), &hw_format);
-            if (!codec)
-                continue;
+            accel_iter.parse_next();
+            hw_type = accel_iter.hw_type();
+            codec = NULL;
+            hw_format = AV_PIX_FMT_NONE;
+            if (hw_device_ctx)
+                av_buffer_unref(&hw_device_ctx);
+            if (hw_type != AV_HWDEVICE_TYPE_NONE)
+            {
+                codec = hw_find_codec(codec_id, hw_type, av_codec_is_encoder, accel_iter.disabled_codecs().c_str(), &hw_format);
+                if (!codec)
+                    continue;
 
-            hw_device_ctx = hw_create_device(hw_type, hw_device, accel_iter.device_subname(), use_opencl != 0);
-            if (!hw_device_ctx)
-                continue;
-        }
-        else if (hw_type == AV_HWDEVICE_TYPE_NONE)
+                hw_device_ctx = hw_create_device(hw_type, hw_device, accel_iter.device_subname(), use_opencl != 0);
+                if (!hw_device_ctx)
+                    continue;
+            }
+            else if (hw_type == AV_HWDEVICE_TYPE_NONE)
 #endif
-        {
-            codec = avcodec_find_encoder(codec_id);
-            if (!codec) {
-                CV_LOG_ERROR(NULL, "Could not find encoder for codec_id=" << (int)codec_id << ", error: "
+            {
+                codec = avcodec_find_encoder(codec_id);
+                if (!codec) {
+                    CV_LOG_ERROR(NULL, "Could not find encoder for codec_id=" << (int)codec_id << ", error: "
                         << icvFFMPEGErrStr(AVERROR_ENCODER_NOT_FOUND));
+                }
             }
+            if (!codec)
+                continue;
         }
-        if (!codec)
-            continue;
+
 #if USE_AV_HW_CODECS
-        AVPixelFormat format = (hw_format != AV_PIX_FMT_NONE) ? hw_format : codec_pix_fmt;
+            AVPixelFormat format = (hw_format != AV_PIX_FMT_NONE) ? hw_format : codec_pix_fmt;
 #else
-        AVPixelFormat format = codec_pix_fmt;
+            AVPixelFormat format = codec_pix_fmt;
 #endif
 
 #ifdef CV_FFMPEG_CODECPAR
@@ -3032,7 +3149,7 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
 #endif
         context = icv_configure_video_stream_FFMPEG(oc, video_st, codec,
                                               width, height, (int) (bitrate + 0.5),
-                                              fps, format, fourcc);
+                                              fps, format, fourcc, codec_id);
         if (!context)
         {
             continue;
@@ -3045,17 +3162,18 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
         av_dump_format(oc, 0, filename, 1);
 #endif
 #endif
-
+        if (encode_video) {
 #if USE_AV_HW_CODECS
-        if (hw_device_ctx) {
-            context->hw_device_ctx = av_buffer_ref(hw_device_ctx);
-            if (hw_format != AV_PIX_FMT_NONE) {
-                context->hw_frames_ctx = hw_create_frames(NULL, hw_device_ctx, width, height, hw_format);
-                if (!context->hw_frames_ctx)
-                    continue;
+            if (hw_device_ctx) {
+                context->hw_device_ctx = av_buffer_ref(hw_device_ctx);
+                if (hw_format != AV_PIX_FMT_NONE) {
+                    context->hw_frames_ctx = hw_create_frames(NULL, hw_device_ctx, width, height, hw_format);
+                    if (!context->hw_frames_ctx)
+                        continue;
+                }
             }
-        }
 #endif
+        }
 
         int64_t lbit_rate = (int64_t) context->bit_rate;
         lbit_rate += (int64_t)(bitrate / 2);
@@ -3064,7 +3182,7 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
         context->bit_rate = (int) lbit_rate;
 
         /* open the codec */
-        err = avcodec_open2(context, codec, NULL);
+        err = !encode_video ? 0 : avcodec_open2(context, codec, NULL);
         if (err >= 0) {
 #if USE_AV_HW_CODECS
             va_type = hw_type_to_va_type(hw_type);
@@ -3100,43 +3218,43 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
     avcodec_parameters_from_context(video_st->codecpar, context);
 #endif
 
-    outbuf = NULL;
-
-
+    if (encode_video) {
+        outbuf = NULL;
 #if LIBAVFORMAT_BUILD < CALC_FFMPEG_VERSION(57, 0, 0)
-    if (!(oc->oformat->flags & AVFMT_RAWPICTURE))
+        if (!(oc->oformat->flags & AVFMT_RAWPICTURE))
 #endif
-    {
-        /* allocate output buffer */
-        /* assume we will never get codec output with more than 4 bytes per pixel... */
-        outbuf_size = width*height*4;
-        outbuf = (uint8_t *) av_malloc(outbuf_size);
-    }
+        {
+            /* allocate output buffer */
+            /* assume we will never get codec output with more than 4 bytes per pixel... */
+            outbuf_size = width * height * 4;
+            outbuf = (uint8_t*)av_malloc(outbuf_size);
+        }
 
-    bool need_color_convert;
-    AVPixelFormat sw_pix_fmt = context->pix_fmt;
+        bool need_color_convert;
+        AVPixelFormat sw_pix_fmt = context->pix_fmt;
 #if USE_AV_HW_CODECS
-    if (context->hw_frames_ctx)
-        sw_pix_fmt = ((AVHWFramesContext*)context->hw_frames_ctx->data)->sw_format;
+        if (context->hw_frames_ctx)
+            sw_pix_fmt = ((AVHWFramesContext*)context->hw_frames_ctx->data)->sw_format;
 #endif
 
-    need_color_convert = (sw_pix_fmt != input_pix_fmt);
+        need_color_convert = (sw_pix_fmt != input_pix_fmt);
 
-    /* allocate the encoded raw picture */
-    picture = icv_alloc_picture_FFMPEG(sw_pix_fmt, context->width, context->height, need_color_convert);
-    if (!picture) {
-        return false;
-    }
-
-    /* if the output format is not our input format, then a temporary
-   picture of the input format is needed too. It is then converted
-   to the required output format */
-    input_picture = NULL;
-    if ( need_color_convert ) {
-        input_picture = icv_alloc_picture_FFMPEG(input_pix_fmt, context->width, context->height, false);
-        if (!input_picture) {
+        /* allocate the encoded raw picture */
+        picture = icv_alloc_picture_FFMPEG(sw_pix_fmt, context->width, context->height, need_color_convert);
+        if (!picture) {
             return false;
         }
+
+        /* if the output format is not our input format, then a temporary
+       picture of the input format is needed too. It is then converted
+       to the required output format */
+        input_picture = NULL;
+        if (need_color_convert) {
+            input_picture = icv_alloc_picture_FFMPEG(input_pix_fmt, context->width, context->height, false);
+            if (!input_picture) {
+                return false;
+            }
+        }
     }
 
     /* open the output file, if needed */
diff --git a/modules/videoio/src/cap_gstreamer.cpp b/modules/videoio/src/cap_gstreamer.cpp
index fc031d2b5fdb..41e98794b96a 100644
--- a/modules/videoio/src/cap_gstreamer.cpp
+++ b/modules/videoio/src/cap_gstreamer.cpp
@@ -114,6 +114,7 @@ template<> inline void GSafePtr_release<GstBuffer>(GstBuffer** pPtr) { if (pPtr)
 template<> inline void GSafePtr_release<GstSample>(GstSample** pPtr) { if (pPtr) { gst_sample_unref(*pPtr); *pPtr = NULL; } }
 template<> inline void GSafePtr_release<GstBus>(GstBus** pPtr) { if (pPtr) { gst_object_unref(G_OBJECT(*pPtr)); *pPtr = NULL; } }
 template<> inline void GSafePtr_release<GstMessage>(GstMessage** pPtr) { if (pPtr) { gst_message_unref(*pPtr); *pPtr = NULL; } }
+template<> inline void GSafePtr_release<GstQuery>(GstQuery** pPtr) { if (pPtr) { gst_query_unref(*pPtr); *pPtr = NULL; } }
 template<> inline void GSafePtr_release<GMainLoop>(GMainLoop** pPtr) { if (pPtr) { g_main_loop_unref(*pPtr); *pPtr = NULL; } }
 
 template<> inline void GSafePtr_release<GstEncodingVideoProfile>(GstEncodingVideoProfile** pPtr) { if (pPtr) { gst_encoding_profile_unref(*pPtr); *pPtr = NULL; } }
@@ -367,6 +368,7 @@ class GStreamerCapture CV_FINAL : public IVideoCapture
     gint          audioBitPerFrame;
     gint          audioSampleSize;
     std::string   audioFormat;
+    guint64       timestamp;
 
     Mat audioFrame;
     std::deque<uint8_t> bufferAudioData;
@@ -433,7 +435,8 @@ GStreamerCapture::GStreamerCapture() :
     audioSamplesPerSecond(44100),
     audioBitPerFrame(0),
     audioSampleSize(0),
-    audioFormat("S16LE")
+    audioFormat("S16LE"),
+    timestamp(0)
     , va_type(VIDEO_ACCELERATION_NONE)
     , hw_device(-1)
 {}
@@ -680,6 +683,11 @@ bool GStreamerCapture::grabVideoFrame()
                 stopFlag = true;
                 emulatedFrameNumber++;
             }
+            if (usedVideoSample)
+            {
+                auto *buffer = gst_sample_get_buffer((GstSample*)usedVideoSample);
+                timestamp = GST_BUFFER_PTS(buffer);
+            }
             returnFlag = true;
         }
     }
@@ -792,6 +800,7 @@ bool GStreamerCapture::grabAudioFrame()
                 CV_LOG_ERROR(NULL, "GStreamer: Failed. Buffer is empty");
                 return false;
             }
+            timestamp = GST_BUFFER_PTS(buf);
             if (!gst_buffer_map(buf, &map_info, GST_MAP_READ))
             {
                 CV_LOG_ERROR(NULL, "GStreamer: Failed to map GStreamer buffer to system memory");
@@ -1389,6 +1398,7 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
     GSafePtr<char> uri;
     GSafePtr<GstBus> bus;
 
+    GSafePtr<GstElement> queue;
     GSafePtr<GstElement> uridecodebin;
     GSafePtr<GstElement> color;
     GSafePtr<GstElement> convert;
@@ -1493,6 +1503,7 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
                     if (strstr(name, "opencvsink") != NULL || strstr(name, "appsink") != NULL)
                     {
                         sink.attach(GST_ELEMENT(gst_object_ref(element)));
+                        audiosink.attach(GST_ELEMENT(gst_object_ref(element)));
                     }
                     else if (strstr(name, COLOR_ELEM_NAME) != NULL)
                     {
@@ -1534,6 +1545,8 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
 
         if (videoStream >= 0)
         {
+            queue.reset(gst_element_factory_make("queue", NULL));
+            CV_Assert(queue);
             sink.reset(gst_element_factory_make("appsink", NULL));
             CV_Assert(sink);
             // videoconvert (in 0.10: ffmpegcolorspace, in 1.x autovideoconvert)
@@ -1541,7 +1554,7 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
             color.reset(gst_element_factory_make(COLOR_ELEM, NULL));
             CV_Assert(color);
 
-            gst_bin_add_many(GST_BIN(pipeline.get()), uridecodebin.get(), color.get(), sink.get(), NULL);
+            gst_bin_add_many(GST_BIN(pipeline.get()), queue.get(), uridecodebin.get(), color.get(), sink.get(), NULL);
 
             if (element_from_uri)
             {
@@ -1566,6 +1579,8 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
         }
         if (audioStream >= 0)
         {
+            queue.reset(gst_element_factory_make("queue", NULL));
+            CV_Assert(queue);
             convert.reset(gst_element_factory_make("audioconvert", NULL));
             resample.reset(gst_element_factory_make("audioresample", NULL));
             audiosink.reset(gst_element_factory_make("appsink", NULL));
@@ -1573,7 +1588,7 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
             CV_Assert(resample);
             CV_Assert(audiosink);
 
-            gst_bin_add_many (GST_BIN (pipeline.get()), uridecodebin.get(), convert.get(), resample.get(), audiosink.get(), NULL);
+            gst_bin_add_many (GST_BIN (pipeline.get()), queue.get(), uridecodebin.get(), convert.get(), resample.get(), audiosink.get(), NULL);
             if (!gst_element_link_many (convert.get(), resample.get(), audiosink.get(), NULL))
             {
                 CV_WARN("GStreamer(audio): cannot link convert -> resample -> sink");
@@ -1646,14 +1661,17 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
     }
     if (manualpipeline)
     {
-        GSafePtr<GstCaps> peer_caps;
-        GSafePtr<GstPad> sink_pad;
-        sink_pad.attach(gst_element_get_static_pad(sink, "sink"));
-        peer_caps.attach(gst_pad_peer_query_caps(sink_pad, NULL));
-        if (!gst_caps_can_intersect(caps, peer_caps))
+        if (videoStream >= 0)
         {
-            caps.attach(gst_caps_from_string("video/x-raw, format=(string){UYVY,YUY2,YVYU,NV12,NV21,YV12,I420,BGRA,RGBA,BGRx,RGBx,GRAY16_LE,GRAY16_BE}"));
-            CV_Assert(caps);
+            GSafePtr<GstCaps> peer_caps;
+            GSafePtr<GstPad> sink_pad;
+            sink_pad.attach(gst_element_get_static_pad(sink, "sink"));
+            peer_caps.attach(gst_pad_peer_query_caps(sink_pad, NULL));
+            if (!gst_caps_can_intersect(caps, peer_caps))
+            {
+                caps.attach(gst_caps_from_string("video/x-raw, format=(string){UYVY,YUY2,YVYU,NV12,NV21,YV12,I420,BGRA,RGBA,BGRx,RGBx,GRAY16_LE,GRAY16_BE}"));
+                CV_Assert(caps);
+            }
         }
     }
     if (videoStream >= 0)
@@ -1661,6 +1679,7 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
         gst_app_sink_set_caps(GST_APP_SINK(sink.get()), caps);
         caps.release();
     }
+
     {
         GST_DEBUG_BIN_TO_DOT_FILE(GST_BIN(pipeline.get()), GST_DEBUG_GRAPH_SHOW_ALL, "pipeline-init");
 
@@ -1688,18 +1707,6 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
             GSafePtr<GstCaps> buffer_caps;
             buffer_caps.attach(gst_pad_get_current_caps(pad));
 
-            GstFormat format;
-
-            format = GST_FORMAT_DEFAULT;
-            if(!gst_element_query_duration(sink, format, &duration))
-            {
-                handleMessage(pipeline);
-                CV_WARN("unable to query duration of stream");
-                duration = -1;
-            }
-
-            handleMessage(pipeline);
-
             const GstStructure *structure = gst_caps_get_structure(buffer_caps, 0);  // no lifetime transfer
             if (!gst_structure_get_int (structure, "width", &width) ||
                 !gst_structure_get_int (structure, "height", &height))
@@ -1708,13 +1715,55 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
             }
 
             gint num = 0, denom=1;
+            bool fps_query_success = true;
+
             if (!gst_structure_get_fraction(structure, "framerate", &num, &denom))
             {
                 CV_WARN("cannot query video fps");
+                fps_query_success = false;
             }
 
             fps = (double)num/(double)denom;
 
+            // If num == 0 and denom == 1 -> variable frame rate video.
+            if (fps_query_success && !(num == 0 && denom == 1))
+            {
+                GSafePtr<GstQuery> query;
+                query.attach(gst_query_new_duration(GST_FORMAT_DEFAULT));
+
+                gboolean res = gst_element_query(pipeline.get(), query);
+
+                if (res)
+                {
+                    gst_query_parse_duration(query, NULL, &duration);
+                }
+                else if (fps != 0)
+                {
+                    GSafePtr<GstQuery> query2;
+                    query2.attach(gst_query_new_duration(GST_FORMAT_TIME));
+                    gboolean res2 = gst_element_query(pipeline.get(), query2);
+
+                    if (res2)
+                    {
+                        gst_query_parse_duration(query2, NULL, &duration);
+                        duration = static_cast<gint64>((float)duration / GST_SECOND * fps);
+                        CV_WARN("frame count is estimated by duration and fps");
+                    }
+                    else
+                    {
+                        CV_WARN("unable to query duration of stream");
+                        duration = -1;
+                    }
+                }
+                else
+                {
+                    CV_WARN("unable to query frame count of stream and fps are not available to estimate it");
+                    duration = -1;
+                }
+            }
+
+            handleMessage(pipeline);
+
             {
                 GstFormat format_;
                 gint64 value_ = -1;
@@ -1814,20 +1863,7 @@ double GStreamerCapture::getProperty(int propId) const
     switch(propId)
     {
     case CV_CAP_PROP_POS_MSEC:
-        CV_LOG_ONCE_WARNING(NULL, "OpenCV | GStreamer: CAP_PROP_POS_MSEC property result may be unrealiable: "
-                                  "https://github.com/opencv/opencv/issues/19025");
-        if (audioStream != -1)
-        {
-            return usedVideoSampleTimeNS * 1e-6;
-        }
-        format = GST_FORMAT_TIME;
-        status = gst_element_query_position(sink.get(), CV_GST_FORMAT(format), &value);
-        if(!status) {
-            handleMessage(pipeline);
-            CV_WARN("GStreamer: unable to query position of stream");
-            return 0;
-        }
-        return value * 1e-6; // nano seconds to milli seconds
+        return double(timestamp) / GST_MSECOND;
     case CV_CAP_PROP_POS_FRAMES:
         if (!isPosFramesSupported)
         {
@@ -1859,7 +1895,7 @@ double GStreamerCapture::getProperty(int propId) const
     case CV_CAP_PROP_FPS:
         return fps;
     case CV_CAP_PROP_FRAME_COUNT:
-        return duration;
+        return (double)duration;
     case CV_CAP_PROP_BRIGHTNESS:
     case CV_CAP_PROP_CONTRAST:
     case CV_CAP_PROP_SATURATION:
@@ -1936,13 +1972,15 @@ bool GStreamerCapture::setProperty(int propId, double value)
         return false;
     }
 
-    bool wasPlaying = this->isPipelinePlaying();
-    if (wasPlaying)
+    bool needRestart = this->isPipelinePlaying() && (propId == CV_CAP_PROP_FRAME_WIDTH || propId == CV_CAP_PROP_FRAME_HEIGHT || propId == CV_CAP_PROP_FPS);
+    if (needRestart) {
         this->stopPipeline();
+    }
 
     switch(propId)
     {
     case CV_CAP_PROP_POS_MSEC:
+    {
         if(!gst_element_seek_simple(GST_ELEMENT(pipeline.get()), GST_FORMAT_TIME,
                                     flags, (gint64) (value * GST_MSECOND))) {
             handleMessage(pipeline);
@@ -1950,6 +1988,9 @@ bool GStreamerCapture::setProperty(int propId, double value)
         }
         else
         {
+            // Optimistically caching the target timestamp before reading the first frame from the new position since
+            // the timestamp in GStreamer can be reliable extracted from the read frames.
+            timestamp = (gint64)value;
             if (isPosFramesEmulated)
             {
                 if (value == 0)
@@ -1963,7 +2004,8 @@ bool GStreamerCapture::setProperty(int propId, double value)
                 }
             }
         }
-        break;
+        return true;
+    }
     case CV_CAP_PROP_POS_FRAMES:
     {
         if (!isPosFramesSupported)
@@ -1977,24 +2019,34 @@ bool GStreamerCapture::setProperty(int propId, double value)
                     return true;
                 }
             }
-            return false;
             CV_WARN("unable to seek");
+            return false;
+        }
+        // Certain mov and mp4 files seek incorrectly if the pipeline is not stopped before.
+        if (this->isPipelinePlaying()) {
+            this->stopPipeline();
         }
+
         if(!gst_element_seek_simple(GST_ELEMENT(pipeline.get()), GST_FORMAT_DEFAULT,
                                     flags, (gint64) value)) {
             handleMessage(pipeline);
             CV_WARN("GStreamer: unable to seek");
-            break;
+            return false;
         }
         // wait for status update
         gst_element_get_state(pipeline, NULL, NULL, GST_CLOCK_TIME_NONE);
         return true;
     }
     case CV_CAP_PROP_POS_AVI_RATIO:
+    {
+        // https://stackoverflow.com/questions/31290315
+        // GStreamer docs: GST_FORMAT_PERCENT (5) – percentage of stream (few, if any, elements implement this as of May 2009)
+        CV_WARN("GStreamer: seeking by file percent are not supported by most GStreamer elements");
         if(!gst_element_seek_simple(GST_ELEMENT(pipeline.get()), GST_FORMAT_PERCENT,
                                     flags, (gint64) (value * GST_FORMAT_PERCENT_MAX))) {
             handleMessage(pipeline);
             CV_WARN("GStreamer: unable to seek");
+            return false;
         }
         else
         {
@@ -2011,7 +2063,8 @@ bool GStreamerCapture::setProperty(int propId, double value)
                 }
             }
         }
-        break;
+        return true;
+    }
     case CV_CAP_PROP_FRAME_WIDTH:
         if(value > 0)
             setFilter("width", G_TYPE_INT, (int) value, 0);
@@ -2099,8 +2152,9 @@ bool GStreamerCapture::setProperty(int propId, double value)
         CV_WARN("GStreamer: unhandled property");
     }
 
-    if (wasPlaying)
+    if (needRestart) {
         this->startPipeline();
+    }
 
     return false;
 }
@@ -2572,7 +2626,7 @@ bool CvVideoWriter_GStreamer::open( const std::string &filename, int fourcc,
     if (stateret == GST_STATE_CHANGE_FAILURE)
     {
         handleMessage(pipeline);
-        CV_WARN("GStreamer: cannot put pipeline to play\n");
+        CV_WARN("GStreamer: cannot put pipeline to play");
         pipeline.release();
         return false;
     }
@@ -2766,7 +2820,8 @@ void handleMessage(GstElement * pipeline)
 
         if (gst_is_missing_plugin_message(msg))
         {
-            CV_WARN("your GStreamer installation is missing a required plugin");
+            CV_WARN("your GStreamer installation is missing a required plugin: " <<
+                    gst_missing_plugin_message_get_description(msg));
         }
         else
         {
@@ -2825,8 +2880,6 @@ CvResult CV_API_CALL cv_capture_open_with_params(
     if (!handle)
         return CV_ERROR_FAIL;
     *handle = NULL;
-    if (!filename)
-        return CV_ERROR_FAIL;
     GStreamerCapture *cap = 0;
     try
     {
diff --git a/modules/videoio/src/cap_images.cpp b/modules/videoio/src/cap_images.cpp
index b506dd1c0698..32b180b4e392 100644
--- a/modules/videoio/src/cap_images.cpp
+++ b/modules/videoio/src/cap_images.cpp
@@ -51,8 +51,8 @@
 
 #include "precomp.hpp"
 #include "opencv2/imgcodecs.hpp"
-
 #include "opencv2/core/utils/filesystem.hpp"
+#include "opencv2/videoio/utils.private.hpp"
 
 #if 0
 #define CV_WARN(message)
@@ -113,7 +113,16 @@ void CvCapture_Images::close()
 
 bool CvCapture_Images::grabFrame()
 {
-    cv::String filename = cv::format(filename_pattern.c_str(), (int)(firstframe + currentframe));
+    cv::String filename;
+    if (length == 1)
+        if (currentframe < length)
+            filename = filename_pattern;
+        else
+        {
+            return false;
+        }
+    else
+        filename = cv::format(filename_pattern.c_str(), (int)(firstframe + currentframe));
     CV_Assert(!filename.empty());
 
     if (grabbedInOpen)
@@ -200,7 +209,7 @@ bool CvCapture_Images::setProperty(int id, double value)
     return false;
 }
 
-static
+// static
 std::string icvExtractPattern(const std::string& filename, unsigned *offset)
 {
     size_t len = filename.size();
@@ -249,9 +258,7 @@ std::string icvExtractPattern(const std::string& filename, unsigned *offset)
         while (pos < len && !isdigit(filename[pos])) pos++;
 
         if (pos == len)
-        {
-            CV_Error_(Error::StsBadArg, ("CAP_IMAGES: can't find starting number (in the name of file): %s", filename.c_str()));
-        }
+            return "";
 
         std::string::size_type pos0 = pos;
 
@@ -292,44 +299,61 @@ bool CvCapture_Images::open(const std::string& _filename)
 
     CV_Assert(!_filename.empty());
     filename_pattern = icvExtractPattern(_filename, &offset);
-    CV_Assert(!filename_pattern.empty());
-
-    // determine the length of the sequence
-    for (length = 0; ;)
+    if (filename_pattern.empty())
+    {
+        filename_pattern = _filename;
+        if (!utils::fs::exists(filename_pattern))
+        {
+            CV_LOG_INFO(NULL, "CAP_IMAGES: File does not exist: " << filename_pattern);
+            close();
+            return false;
+        }
+        if (!haveImageReader(filename_pattern))
+        {
+            CV_LOG_INFO(NULL, "CAP_IMAGES: File is not an image: " << filename_pattern);
+            close();
+            return false;
+        }
+        length = 1;
+    }
+    else
     {
-        cv::String filename = cv::format(filename_pattern.c_str(), (int)(offset + length));
-        if (!utils::fs::exists(filename))
+        // determine the length of the sequence
+        for (length = 0; ;)
         {
-            if (length == 0 && offset == 0) // allow starting with 0 or 1
+            cv::String filename = cv::format(filename_pattern.c_str(), (int)(offset + length));
+            if (!utils::fs::exists(filename))
+            {
+                if (length == 0 && offset == 0) // allow starting with 0 or 1
+                {
+                    offset++;
+                    continue;
+                }
+                CV_LOG_INFO(NULL, "CAP_IMAGES: File does not exist: " << filename);
+                break;
+            }
+
+            if(!haveImageReader(filename))
             {
-                offset++;
-                continue;
+                CV_LOG_INFO(NULL, "CAP_IMAGES: File is not an image: " << filename);
+                break;
             }
-            break;
+
+            length++;
         }
 
-        if(!haveImageReader(filename))
+        if (length == 0)
         {
-            CV_LOG_INFO(NULL, "CAP_IMAGES: Stop scanning. Can't read image file: " << filename);
-            break;
+            close();
+            return false;
         }
 
-        length++;
+        firstframe = offset;
     }
-
-    if (length == 0)
-    {
-        close();
-        return false;
-    }
-
-    firstframe = offset;
-
     // grab frame to enable properties retrieval
-    bool grabRes = grabFrame();
+    bool grabRes = CvCapture_Images::grabFrame();
     grabbedInOpen = true;
     currentframe = 0;
-
     return grabRes;
 }
 
diff --git a/modules/videoio/src/cap_ios_photo_camera.mm b/modules/videoio/src/cap_ios_photo_camera.mm
index 9b44156efccc..c210f17f06a7 100644
--- a/modules/videoio/src/cap_ios_photo_camera.mm
+++ b/modules/videoio/src/cap_ios_photo_camera.mm
@@ -139,7 +139,7 @@ - (void)createStillImageOutput;
 {
     // setup still image output with jpeg codec
     self.stillImageOutput = [[AVCaptureStillImageOutput alloc] init];
-    NSDictionary *outputSettings = [NSDictionary dictionaryWithObjectsAndKeys:AVVideoCodecJPEG, AVVideoCodecKey, nil];
+    NSDictionary *outputSettings = [NSDictionary dictionaryWithObjectsAndKeys:AVVideoCodecTypeJPEG, AVVideoCodecKey, nil];
     [self.stillImageOutput setOutputSettings:outputSettings];
     [self.captureSession addOutput:self.stillImageOutput];
 
diff --git a/modules/videoio/src/cap_ios_video_camera.mm b/modules/videoio/src/cap_ios_video_camera.mm
index c1282ecbd898..82e81806a07e 100644
--- a/modules/videoio/src/cap_ios_video_camera.mm
+++ b/modules/videoio/src/cap_ios_video_camera.mm
@@ -376,7 +376,7 @@ - (void)createVideoFileOutput;
     NSDictionary *outputSettings
      = [NSDictionary dictionaryWithObjectsAndKeys:[NSNumber numberWithInt:self.imageWidth], AVVideoWidthKey,
                                                   [NSNumber numberWithInt:self.imageHeight], AVVideoHeightKey,
-                                                  AVVideoCodecH264, AVVideoCodecKey,
+                                                  AVVideoCodecTypeH264, AVVideoCodecKey,
                                                   nil
      ];
 
diff --git a/modules/videoio/src/cap_mfx_common.hpp b/modules/videoio/src/cap_mfx_common.hpp
index 9824e89dc5fa..b10d7115ba15 100644
--- a/modules/videoio/src/cap_mfx_common.hpp
+++ b/modules/videoio/src/cap_mfx_common.hpp
@@ -334,26 +334,11 @@ class DeviceHandler {
 
 
 // TODO: move to core::util?
-#ifdef CV_CXX11
 #include <thread>
 static void sleep_ms(int64 ms)
 {
     std::this_thread::sleep_for(std::chrono::milliseconds(ms));
 }
-#elif defined(__linux__)
-#include <time.h>
-static void sleep_ms(int64 ms)
-{
-    nanosleep(ms * 1000 * 1000);
-}
-#elif defined _WIN32
-static void sleep_ms(int64 ms)
-{
-    Sleep(ms);
-}
-#else
-#error "Can not detect sleep_ms() implementation"
-#endif
 
 
 // Linux specific
diff --git a/modules/videoio/src/cap_mjpeg_encoder.cpp b/modules/videoio/src/cap_mjpeg_encoder.cpp
index efac4093ae58..2bb01f4a81a7 100644
--- a/modules/videoio/src/cap_mjpeg_encoder.cpp
+++ b/modules/videoio/src/cap_mjpeg_encoder.cpp
@@ -95,7 +95,7 @@ static bool createEncodeHuffmanTable( const int* src, unsigned* table, int max_s
 
     if( size > max_size )
     {
-        CV_Error(CV_StsOutOfRange, "too big maximum Huffman code size");
+        CV_Error(cv::Error::StsOutOfRange, "too big maximum Huffman code size");
     }
 
     memset( table, 0, size*sizeof(table[0]));
@@ -268,7 +268,7 @@ class mjpeg_buffer_keeper
             m_buffer_list[0].finish();
 
             m_data_len = m_buffer_list[0].get_len();
-            m_last_bit_len = m_buffer_list[0].get_bits_free() ? 32 - m_buffer_list[0].get_bits_free() : 0;
+            m_last_bit_len = 32 - m_buffer_list[0].get_bits_free();
 
             return m_buffer_list[0].get_data();
         }
@@ -331,9 +331,14 @@ class mjpeg_buffer_keeper
         }
 
         //bits == 0 means that last element shouldn't be used.
-        m_output_buffer[m_data_len++] = currval;
-
-        m_last_bit_len = -bits;
+        if (bits != 0) {
+            m_output_buffer[m_data_len++] = currval;
+            m_last_bit_len = -bits;
+        }
+        else
+        {
+            m_last_bit_len = 32;
+        }
 
         return &m_output_buffer[0];
     }
@@ -482,7 +487,7 @@ class MotionJpegWriter : public IVideoWriter
             colorspace = COLORSPACE_YUV444P;
         }
         else
-            CV_Error(CV_StsBadArg, "Invalid combination of specified video colorspace and the input image colorspace");
+            CV_Error(cv::Error::StsBadArg, "Invalid combination of specified video colorspace and the input image colorspace");
 
         if( !rawstream ) {
             int avi_index = container.getAVIIndex(0, dc);
@@ -1167,8 +1172,6 @@ class MjpegEncoder : public ParallelLoopBody
         fdct_qtab(_fdct_qtab),
         cat_table(_cat_table)
     {
-#if 0  // disable parallel processing due to buffer overrun bug: https://github.com/opencv/opencv/issues/19634
-
         //empirically found value. if number of pixels is less than that value there is no sense to parallelize it.
         const int min_pixels_count = 96*96;
 
@@ -1194,12 +1197,6 @@ class MjpegEncoder : public ParallelLoopBody
 
         stripes_count = std::min(stripes_count, max_stripes);
 
-#else
-        if (nstripes > 1)
-            CV_LOG_ONCE_WARNING(NULL, "VIDEOIO/MJPEG: parallel processing is disabled: https://github.com/opencv/opencv/issues/19634");
-        stripes_count = 1;
-#endif
-
         m_buffer_list.allocate_buffers(stripes_count, (height*width*2)/stripes_count);
     }
 
diff --git a/modules/videoio/src/cap_msmf.cpp b/modules/videoio/src/cap_msmf.cpp
index 78eefc34a392..d4afa2344036 100644
--- a/modules/videoio/src/cap_msmf.cpp
+++ b/modules/videoio/src/cap_msmf.cpp
@@ -18,6 +18,7 @@
 
 #include <windows.h>
 #include <guiddef.h>
+#include <initguid.h>
 #include <mfidl.h>
 #include <mfapi.h>
 #include <mfplay.h>
@@ -38,6 +39,7 @@
 #include <string>
 #include <algorithm>
 #include <deque>
+#include <iterator>
 #include <stdio.h>
 #include <stdarg.h>
 #include <string.h>
@@ -159,6 +161,11 @@ class ComPtr
 
 template <typename T> inline T absDiff(T a, T b) { return a >= b ? a - b : b - a; }
 
+// synonym for system MFVideoFormat_D16. D3DFMT_D16 = 80
+// added to fix builds with old MSVS and platform SDK
+// see https://learn.microsoft.com/en-us/windows/win32/medfound/video-subtype-guids#luminance-and-depth-formats
+DEFINE_MEDIATYPE_GUID( OCV_MFVideoFormat_D16, 80 );
+
 //==================================================================================================
 
 // Structure for collecting info about types of video which are supported by current video device
@@ -350,9 +357,7 @@ struct MediaType
     }
     bool VideoIsAvailable() const
     {
-        return ((subType == MFVideoFormat_RGB32) ||
-            (subType == MFVideoFormat_RGB24) ||
-            (subType == MFVideoFormat_YUY2));
+        return (subType != OCV_MFVideoFormat_D16);
     }
 };
 
@@ -702,7 +707,7 @@ class DeviceList
         if (FAILED(MFCreateAttributes(&attr, 1)) ||
             FAILED(attr->SetGUID(MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE, sourceType)))
         {
-            CV_Error(CV_StsError, "Failed to create attributes");
+            CV_Error(cv::Error::StsError, "Failed to create attributes");
         }
         if (FAILED(MFEnumDeviceSources(attr.Get(), &devices, &count)))
         {
@@ -956,14 +961,14 @@ _ComPtr<IMFAttributes> CvCapture_MSMF::getDefaultSourceConfig(UINT32 num)
         FAILED(res->SetUINT32(MF_SOURCE_READER_ENABLE_ADVANCED_VIDEO_PROCESSING, true))
         )
     {
-        CV_Error(CV_StsError, "Failed to create attributes");
+        CV_Error(cv::Error::StsError, "Failed to create attributes");
     }
 #ifdef HAVE_MSMF_DXVA
     if (D3DMgr)
     {
         if (FAILED(res->SetUnknown(MF_SOURCE_READER_D3D_MANAGER, D3DMgr.Get())))
         {
-            CV_Error(CV_StsError, "Failed to create attributes");
+            CV_Error(cv::Error::StsError, "Failed to create attributes");
         }
     }
 #endif
@@ -1159,7 +1164,12 @@ bool CvCapture_MSMF::configureVideoOutput(MediaType newType, cv::uint32_t outFor
     {
         initStream(dwVideoStreamIndex, nativeFormat);
     }
-    return initStream(dwVideoStreamIndex, newFormat);
+    if (!initStream(dwVideoStreamIndex, newFormat))
+    {
+        return false;
+    }
+    outputVideoFormat = outFormat;
+    return true;
 }
 
 bool CvCapture_MSMF::configureOutput()
@@ -2719,8 +2729,6 @@ CvResult CV_API_CALL cv_capture_open_with_params(
     if (!handle)
         return CV_ERROR_FAIL;
     *handle = NULL;
-    if (!filename)
-        return CV_ERROR_FAIL;
     CaptureT* cap = 0;
     try
     {
diff --git a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_interface.hpp b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_interface.hpp
index ff78c5a69646..7337452359a5 100644
--- a/modules/videoio/src/cap_obsensor/obsensor_stream_channel_interface.hpp
+++ b/modules/videoio/src/cap_obsensor/obsensor_stream_channel_interface.hpp
@@ -37,6 +37,8 @@ namespace obsensor {
 #define OBSENSOR_ASTRA2_PID 0x0660 // pid of Orbbec Astra 2 Camera
 #define OBSENSOR_GEMINI2_PID 0x0670 // pid of Orbbec Gemini 2 Camera
 #define OBSENSOR_FEMTO_MEGA_PID 0x0669 // pid of Orbbec Femto Mega Camera
+#define OBSENSOR_GEMINI2L_PID 0x0673 // pid of Orbbec Gemini 2 L Camera
+#define OBSENSOR_GEMINI2XL_PID 0x0671 // pid of Orbbec Gemini 2 XL Camera
 
 enum StreamType
 {
diff --git a/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.cpp b/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.cpp
index 6e8a4f653c1f..76a963748bae 100644
--- a/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.cpp
+++ b/modules/videoio/src/cap_obsensor/obsensor_uvc_stream_channel.cpp
@@ -45,7 +45,9 @@ const uint8_t OB_EXT_CMD6[16] = { 0x47, 0x4d, 0x04, 0x00, 0x02, 0x00, 0x7c, 0x00
 const uint8_t OB_EXT_CMD7[16] = { 0x47, 0x4d, 0x04, 0x00, 0x02, 0x00, 0xfe, 0x12, 0x55, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00 };
 const uint8_t OB_EXT_CMD8[16] = { 0x47, 0x4d, 0x04, 0x00, 0x02, 0x00, 0xfe, 0x13, 0x3f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00 };
 const uint8_t OB_EXT_CMD9[16] = { 0x47, 0x4d, 0x04, 0x00, 0x02, 0x00, 0xfa, 0x13, 0x4b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00 };
-const uint8_t OB_EXT_CMD10[16] = { 0x47, 0x4d, 0x04, 0x00, 0x02, 0x00, 0xfa, 0x13, 0x3f, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00 };
+const uint8_t OB_EXT_CMD11[16] = { 0x47, 0x4d, 0x04, 0x00, 0x02, 0x00, 0xfe, 0x13, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+const uint8_t OB_EXT_CMD12[16] = { 0x47, 0x4d, 0x04, 0x00, 0x02, 0x00, 0xfe, 0x13, 0x3f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00 };
+const uint8_t OB_EXT_CMD13[16] = { 0x47, 0x4d, 0x04, 0x00, 0x02, 0x00, 0xfa, 0x13, 0x4b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
 
 #if defined(HAVE_OBSENSOR_V4L2)
 #define fourCc2Int(a, b, c, d) \
@@ -269,14 +271,22 @@ bool IUvcStreamChannel::setProperty(int propId, const uint8_t* /*data*/, uint32_
             rst &= getXu(2, &rcvData, &rcvLen);
             rst &= setXu(2, OB_EXT_CMD6, sizeof(OB_EXT_CMD6));
             rst &= getXu(2, &rcvData, &rcvLen);
-        }
-        else if(OBSENSOR_ASTRA2_PID == devInfo_.pid ){
-            rst &= setXu(2, OB_EXT_CMD10, sizeof(OB_EXT_CMD8));
+        }else if(OBSENSOR_ASTRA2_PID == devInfo_.pid ){
+            rst &= setXu(2, OB_EXT_CMD12, sizeof(OB_EXT_CMD12));
             rst &= getXu(2, &rcvData, &rcvLen);
             rst &= setXu(2, OB_EXT_CMD6, sizeof(OB_EXT_CMD6));
             rst &= getXu(2, &rcvData, &rcvLen);
-        }
-        else{
+        }else if(OBSENSOR_GEMINI2L_PID == devInfo_.pid){
+            rst &= setXu(2, OB_EXT_CMD11, sizeof(OB_EXT_CMD11));
+            rst &= getXu(2, &rcvData, &rcvLen);
+            rst &= setXu(2, OB_EXT_CMD6, sizeof(OB_EXT_CMD6));
+            rst &= getXu(2, &rcvData, &rcvLen);
+        }else if(OBSENSOR_GEMINI2XL_PID == devInfo_.pid){
+            rst &= setXu(2, OB_EXT_CMD11, sizeof(OB_EXT_CMD11));
+            rst &= getXu(2, &rcvData, &rcvLen);
+            rst &= setXu(2, OB_EXT_CMD6, sizeof(OB_EXT_CMD6));
+            rst &= getXu(2, &rcvData, &rcvLen);
+        }else{
             rst &= setXu(2, OB_EXT_CMD0, sizeof(OB_EXT_CMD0));
             rst &= getXu(2, &rcvData, &rcvLen);
             rst &= setXu(2, OB_EXT_CMD1, sizeof(OB_EXT_CMD1));
@@ -305,14 +315,48 @@ bool IUvcStreamChannel::getProperty(int propId, uint8_t* recvData, uint32_t* rec
         if(OBSENSOR_GEMINI2_PID == devInfo_.pid){
             // return default param
             CameraParam param;
-            param.p0[0] = 516.652f;
-            param.p0[1] = 516.692f;
-            param.p0[2] = 322.988f;
-            param.p0[3] = 235.787f;
-            param.p1[0] = 516.652f;
-            param.p1[1] = 516.692f;
-            param.p1[2] = 322.988f;
-            param.p1[3] = 235.787f;
+            param.p0[0] = 519.342f;
+            param.p0[1] = 519.043f;
+            param.p0[2] = 319.41f;
+            param.p0[3] = 240.839f;
+            param.p1[0] = 519.342f;
+            param.p1[1] = 519.043f;
+            param.p1[2] = 319.41f;
+            param.p1[3] = 240.839f;
+            param.p6[0] = 640;
+            param.p6[1] = 480;
+            param.p7[0] = 640;
+            param.p7[1] = 480;
+            *recvDataSize = sizeof(CameraParam);
+            memcpy(recvData, &param, *recvDataSize);
+        }else if(OBSENSOR_GEMINI2L_PID == devInfo_.pid){
+            // return default param
+            CameraParam param;
+            param.p0[0] = 688.87f;
+            param.p0[1] = 688.922f;
+            param.p0[2] = 644.317f;
+            param.p0[3] = 354.382f;
+            param.p1[0] = 688.87f;
+            param.p1[1] = 688.922f;
+            param.p1[2] = 644.317f;
+            param.p1[3] = 354.382f;
+            param.p6[0] = 1280;
+            param.p6[1] = 720;
+            param.p7[0] = 1280;
+            param.p7[1] = 720;
+            *recvDataSize = sizeof(CameraParam);
+            memcpy(recvData, &param, *recvDataSize);
+        }else if(OBSENSOR_GEMINI2XL_PID == devInfo_.pid){
+            // return default param
+            CameraParam param;
+            param.p0[0] = 610.847f;
+            param.p0[1] = 610.829f;
+            param.p0[2] = 640.647f;
+            param.p0[3] = 401.817f;
+            param.p1[0] = 610.847f;
+            param.p1[1] = 610.829f;
+            param.p1[2] = 640.647f;
+            param.p1[3] = 401.817f;
             param.p6[0] = 640;
             param.p6[1] = 480;
             param.p7[0] = 640;
@@ -376,7 +420,7 @@ bool IUvcStreamChannel::getProperty(int propId, uint8_t* recvData, uint32_t* rec
 
 bool IUvcStreamChannel::initDepthFrameProcessor()
 {
-    if(OBSENSOR_GEMINI2_PID == devInfo_.pid || OBSENSOR_ASTRA2_PID == devInfo_.pid){
+    if( OBSENSOR_ASTRA2_PID == devInfo_.pid){
         uint8_t* rcvData;
         uint32_t rcvLen;
 
@@ -389,7 +433,30 @@ bool IUvcStreamChannel::initDepthFrameProcessor()
         depthFrameProcessor_ = makePtr<DepthFrameUnpacker>();
         return true;
     }
-    else if (streamType_ == OBSENSOR_STREAM_DEPTH && setXu(2, OB_EXT_CMD4, sizeof(OB_EXT_CMD4)))
+    else if(OBSENSOR_GEMINI2_PID == devInfo_.pid || OBSENSOR_GEMINI2L_PID == devInfo_.pid){
+        uint8_t* rcvData;
+        uint32_t rcvLen;
+
+        setXu(2, OB_EXT_CMD7, sizeof(OB_EXT_CMD7));
+        getXu(2, &rcvData, &rcvLen);
+
+        setXu(2, OB_EXT_CMD9, sizeof(OB_EXT_CMD9));
+        getXu(2, &rcvData, &rcvLen);
+        return true;
+    }
+    else if(OBSENSOR_GEMINI2XL_PID == devInfo_.pid){
+        uint8_t* rcvData;
+        uint32_t rcvLen;
+
+        setXu(2, OB_EXT_CMD7, sizeof(OB_EXT_CMD7));
+        getXu(2, &rcvData, &rcvLen);
+
+        setXu(2, OB_EXT_CMD13, sizeof(OB_EXT_CMD13));
+        getXu(2, &rcvData, &rcvLen);
+
+        return true;
+    }
+    else if(streamType_ == OBSENSOR_STREAM_DEPTH && setXu(2, OB_EXT_CMD4, sizeof(OB_EXT_CMD4)))
     {
         uint8_t* rcvData;
         uint32_t rcvLen;
diff --git a/modules/videoio/src/cap_obsensor_capture.cpp b/modules/videoio/src/cap_obsensor_capture.cpp
index 8138f0933310..4c64faee1140 100644
--- a/modules/videoio/src/cap_obsensor_capture.cpp
+++ b/modules/videoio/src/cap_obsensor_capture.cpp
@@ -23,7 +23,8 @@
 
 #include "cap_obsensor_capture.hpp"
 #include "cap_obsensor/obsensor_stream_channel_interface.hpp"
-#ifdef HAVE_OBSENSOR
+
+#if defined(HAVE_OBSENSOR) && !defined(HAVE_OBSENSOR_ORBBEC_SDK)
 namespace cv {
 Ptr<IVideoCapture> create_obsensor_capture(int index)
 {
@@ -34,10 +35,15 @@ VideoCapture_obsensor::VideoCapture_obsensor(int index) : isOpened_(false)
 {
     static const obsensor::StreamProfile colorProfile = { 640, 480, 30, obsensor::FRAME_FORMAT_MJPG };
     static const obsensor::StreamProfile depthProfile = {640, 480, 30, obsensor::FRAME_FORMAT_Y16};
-    static const obsensor::StreamProfile gemini2DepthProfile = {1280, 800, 30, obsensor::FRAME_FORMAT_Y14};
-    static const obsensor::StreamProfile astra2DepthProfile = {640, 480, 30, obsensor::FRAME_FORMAT_Y14};
+    static const obsensor::StreamProfile gemini2DepthProfile = {1280, 800, 30, obsensor::FRAME_FORMAT_Y16};
+    static const obsensor::StreamProfile astra2ColorProfile = {800, 600, 30, obsensor::FRAME_FORMAT_MJPG};
+    static const obsensor::StreamProfile astra2DepthProfile = {800, 600, 30, obsensor::FRAME_FORMAT_Y14};
     static const obsensor::StreamProfile megaColorProfile = {1280, 720, 30, obsensor::FRAME_FORMAT_MJPG};
     static const obsensor::StreamProfile megaDepthProfile = {640, 576, 30, obsensor::FRAME_FORMAT_Y16};
+    static const obsensor::StreamProfile gemini2lColorProfile = { 1280, 720, 30, obsensor::FRAME_FORMAT_MJPG};
+    static const obsensor::StreamProfile gemini2lDepthProfile = {1280, 800, 30, obsensor::FRAME_FORMAT_Y16};
+    static const obsensor::StreamProfile gemini2XlColorProfile = { 1280, 800, 10, obsensor::FRAME_FORMAT_MJPG};
+    static const obsensor::StreamProfile gemini2XlDepthProfile = {1280, 800, 10, obsensor::FRAME_FORMAT_Y16};
 
     streamChannelGroup_ = obsensor::getStreamChannelGroup(index);
     if (!streamChannelGroup_.empty())
@@ -52,6 +58,12 @@ VideoCapture_obsensor::VideoCapture_obsensor(int index) : isOpened_(false)
                 auto profile = colorProfile;
                 if(OBSENSOR_FEMTO_MEGA_PID == channel->getPid()){
                     profile = megaColorProfile;
+                }else if(OBSENSOR_GEMINI2L_PID == channel->getPid()){
+                    profile = gemini2lColorProfile;
+                }else if(OBSENSOR_ASTRA2_PID == channel->getPid()){
+                    profile = astra2ColorProfile;
+                }else if(OBSENSOR_GEMINI2XL_PID == channel->getPid()){
+                    profile = gemini2XlColorProfile;
                 }
                 channel->start(profile, [&](obsensor::Frame* frame) {
                     std::unique_lock<std::mutex> lk(frameMutex_);
@@ -74,8 +86,11 @@ VideoCapture_obsensor::VideoCapture_obsensor(int index) : isOpened_(false)
                 }
                 else if(OBSENSOR_FEMTO_MEGA_PID == channel->getPid()){
                     profile = megaDepthProfile;
+                }else if(OBSENSOR_GEMINI2L_PID == channel->getPid()){
+                    profile = gemini2lDepthProfile;
+                }else if(OBSENSOR_GEMINI2XL_PID == channel->getPid()){
+                    profile = gemini2XlDepthProfile;
                 }
-
                 channel->start(profile, [&](obsensor::Frame* frame) {
                     std::unique_lock<std::mutex> lk(frameMutex_);
                     depthFrame_ = Mat(frame->height, frame->width, CV_16UC1, frame->data, frame->width * 2).clone();
@@ -129,19 +144,27 @@ bool VideoCapture_obsensor::retrieveFrame(int outputType, OutputArray frame)
         if (!grabbedDepthFrame_.empty())
         {
             if(OBSENSOR_GEMINI2_PID == streamChannelGroup_.front()->getPid()){
-                grabbedDepthFrame_ = grabbedDepthFrame_*0.8;
+                const double DepthValueScaleGemini2 = 0.2;
+                grabbedDepthFrame_ = grabbedDepthFrame_*DepthValueScaleGemini2;
                 Rect rect(320, 160, 640, 480);
                 grabbedDepthFrame_(rect).copyTo(frame);
             }
             else if(OBSENSOR_ASTRA2_PID == streamChannelGroup_.front()->getPid()){
-                grabbedDepthFrame_ = grabbedDepthFrame_*0.8;
+                const double DepthValueScaleAstra2 = 0.8;
+                grabbedDepthFrame_ = grabbedDepthFrame_*DepthValueScaleAstra2;
                 grabbedDepthFrame_.copyTo(frame);
             }
             else if(OBSENSOR_FEMTO_MEGA_PID == streamChannelGroup_.front()->getPid()){
                 Rect rect(0, 0, 640, 360);
                 grabbedDepthFrame_(rect).copyTo(frame);
-            }
-            else{
+            }else if(OBSENSOR_GEMINI2L_PID == streamChannelGroup_.front()->getPid()){
+                const double DepthValueScaleGemini2L = 0.2;
+                grabbedDepthFrame_ = grabbedDepthFrame_*DepthValueScaleGemini2L;
+                Rect rect(0, 40, 1280, 720);
+                grabbedDepthFrame_(rect).copyTo(frame);
+            }else if(OBSENSOR_GEMINI2XL_PID == streamChannelGroup_.front()->getPid()){
+                grabbedDepthFrame_.copyTo(frame);
+            }else{
                 grabbedDepthFrame_.copyTo(frame);
             }
             grabbedDepthFrame_.release();
diff --git a/modules/videoio/src/cap_obsensor_capture.hpp b/modules/videoio/src/cap_obsensor_capture.hpp
index 821e6193a021..89ab403bb2e3 100644
--- a/modules/videoio/src/cap_obsensor_capture.hpp
+++ b/modules/videoio/src/cap_obsensor_capture.hpp
@@ -28,7 +28,8 @@
 
 #include "cap_obsensor/obsensor_stream_channel_interface.hpp"
 
-#ifdef HAVE_OBSENSOR
+#if defined(HAVE_OBSENSOR) && !defined(HAVE_OBSENSOR_ORBBEC_SDK)
+
 namespace cv {
 class VideoCapture_obsensor : public IVideoCapture
 {
diff --git a/modules/videoio/src/cap_obsensor_liborbbec.cpp b/modules/videoio/src/cap_obsensor_liborbbec.cpp
new file mode 100644
index 000000000000..ac581d81dce3
--- /dev/null
+++ b/modules/videoio/src/cap_obsensor_liborbbec.cpp
@@ -0,0 +1,154 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+/*
+* Copyright(C) 2024 by ORBBEC Technology., Inc.
+* Authors:
+*   Huang Zhenchang <yufeng@orbbec.com>
+*   Yu Shuai <daiyin@orbbec.com>
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "precomp.hpp"
+
+#if defined(HAVE_OBSENSOR) && defined(HAVE_OBSENSOR_ORBBEC_SDK)
+#include "libobsensor/ObSensor.hpp"
+#include "cap_obsensor_liborbbec.hpp"
+
+namespace cv
+{
+Ptr<IVideoCapture> create_obsensor_capture(int index)
+{
+    return makePtr<VideoCapture_obsensor>(index);
+}
+
+VideoCapture_obsensor::VideoCapture_obsensor(int)
+{
+    ob::Context::setLoggerToFile(OB_LOG_SEVERITY_OFF, "");
+    config = std::make_shared<ob::Config>();
+    pipe = std::make_shared<ob::Pipeline>();
+    auto colorProfiles = pipe->getStreamProfileList(OB_SENSOR_COLOR);
+    auto colorProfile = colorProfiles->getProfile(OB_PROFILE_DEFAULT);
+    config->enableStream(colorProfile->as<ob::VideoStreamProfile>());
+
+    auto depthProfiles = pipe->getStreamProfileList(OB_SENSOR_DEPTH);
+    auto depthProfile = depthProfiles->getProfile(OB_PROFILE_DEFAULT);
+    config->enableStream(depthProfile->as<ob::VideoStreamProfile>());
+
+    config->setAlignMode(ALIGN_D2C_SW_MODE);
+
+    pipe->start(config, [&](std::shared_ptr<ob::FrameSet> frameset) {
+        std::unique_lock<std::mutex> lk(videoFrameMutex);
+        colorFrame = frameset->colorFrame();
+        depthFrame = frameset->depthFrame();
+    });
+
+    auto param = pipe->getCameraParam();
+    camParam.p1[0] = param.rgbIntrinsic.fx;
+    camParam.p1[1] = param.rgbIntrinsic.fy;
+    camParam.p1[2] = param.rgbIntrinsic.cx;
+    camParam.p1[3] = param.rgbIntrinsic.cy;
+}
+
+VideoCapture_obsensor::~VideoCapture_obsensor(){
+    pipe->stop();
+}
+
+double VideoCapture_obsensor::getProperty(int propIdx) const
+{
+    double rst = 0.0;
+    propIdx = propIdx & (~CAP_OBSENSOR_GENERATORS_MASK);
+    switch (propIdx)
+    {
+    case CAP_PROP_OBSENSOR_INTRINSIC_FX:
+        rst = camParam.p1[0];
+        break;
+    case CAP_PROP_OBSENSOR_INTRINSIC_FY:
+        rst = camParam.p1[1];
+        break;
+    case CAP_PROP_OBSENSOR_INTRINSIC_CX:
+        rst = camParam.p1[2];
+        break;
+    case CAP_PROP_OBSENSOR_INTRINSIC_CY:
+        rst = camParam.p1[3];
+        break;
+    }
+    return rst;
+}
+
+bool VideoCapture_obsensor::setProperty(int, double)
+{
+    return false;
+}
+
+bool VideoCapture_obsensor::grabFrame()
+{
+    std::unique_lock<std::mutex> lk(videoFrameMutex);
+    grabbedColorFrame = colorFrame;
+    grabbedDepthFrame = depthFrame;
+
+    return grabbedColorFrame || grabbedDepthFrame;
+}
+
+bool VideoCapture_obsensor::retrieveFrame(int outputType, cv::OutputArray frame)
+{
+    switch (outputType)
+    {
+    case CAP_OBSENSOR_BGR_IMAGE:
+        if(grabbedColorFrame != nullptr){
+            auto format = grabbedColorFrame->format();
+            if(format != OB_FORMAT_MJPEG){
+                CV_LOG_WARNING(NULL, "Unsupported color frame format");
+                return false;
+            }
+            auto mjpgMat = Mat(1, grabbedColorFrame->dataSize() , CV_8UC1, grabbedColorFrame->data()).clone();
+            auto bgrMat = imdecode(mjpgMat, IMREAD_COLOR);
+            if(bgrMat.empty()){
+                CV_LOG_WARNING(NULL, "Failed to decode color frame");
+                return false;
+            }
+            bgrMat.copyTo(frame);
+            return true;
+        }
+        break;
+    case CAP_OBSENSOR_DEPTH_MAP:
+        if(grabbedDepthFrame != nullptr){
+            auto format = grabbedDepthFrame->format();
+            if(format != OB_FORMAT_Y16){
+                CV_LOG_WARNING(NULL, "Unsupported depth frame format");
+                return false;
+            }
+            Mat(grabbedDepthFrame->height(), grabbedDepthFrame->width(), CV_16UC1, grabbedDepthFrame->data()).copyTo(frame);
+            return true;
+        }
+        break;
+    default:
+        return false;
+    }
+
+    return false;
+}
+
+int VideoCapture_obsensor::getCaptureDomain()
+{
+    return CAP_OBSENSOR;
+}
+
+bool VideoCapture_obsensor::isOpened() const
+{
+    return true;
+}
+
+}
+#endif
diff --git a/modules/videoio/src/cap_obsensor_liborbbec.hpp b/modules/videoio/src/cap_obsensor_liborbbec.hpp
new file mode 100644
index 000000000000..13dbf413cad7
--- /dev/null
+++ b/modules/videoio/src/cap_obsensor_liborbbec.hpp
@@ -0,0 +1,74 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+/*
+* Copyright(C) 2024 by ORBBEC Technology., Inc.
+* Authors:
+*   Huang Zhenchang <yufeng@orbbec.com>
+*   Yu Shuai <daiyin@orbbec.com>
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef _CAP_LIBORBBEC_HPP_
+#define _CAP_LIBORBBEC_HPP_
+
+#if defined(HAVE_OBSENSOR) && defined(HAVE_OBSENSOR_ORBBEC_SDK)
+
+#include <libobsensor/ObSensor.hpp>
+#include <mutex>
+
+namespace cv
+{
+
+struct CameraParam
+{
+    float    p0[4];
+    float    p1[4];
+    float    p2[9];
+    float    p3[3];
+    float    p4[5];
+    float    p5[5];
+    uint32_t p6[2];
+    uint32_t p7[2];
+};
+
+class VideoCapture_obsensor : public IVideoCapture
+{
+public:
+    VideoCapture_obsensor(int index);
+    virtual ~VideoCapture_obsensor();
+
+    virtual double getProperty(int propIdx) const CV_OVERRIDE;
+    virtual bool setProperty(int propIdx, double propVal) CV_OVERRIDE;
+
+    virtual bool grabFrame() CV_OVERRIDE;
+    virtual bool retrieveFrame(int outputType, OutputArray frame) CV_OVERRIDE;
+    virtual int getCaptureDomain() CV_OVERRIDE;
+    virtual bool isOpened() const CV_OVERRIDE;
+
+protected:
+    std::mutex                 videoFrameMutex;
+    std::shared_ptr<ob::VideoFrame> colorFrame;
+    std::shared_ptr<ob::VideoFrame> depthFrame;
+    std::shared_ptr<ob::VideoFrame> grabbedColorFrame;
+    std::shared_ptr<ob::VideoFrame> grabbedDepthFrame;
+    std::shared_ptr<ob::Pipeline> pipe;
+    std::shared_ptr<ob::Config> config;
+    CameraParam camParam;
+};
+
+}
+
+#endif
+#endif
diff --git a/modules/videoio/src/cap_openni2.cpp b/modules/videoio/src/cap_openni2.cpp
index d4b3ae415310..214cc1a55904 100644
--- a/modules/videoio/src/cap_openni2.cpp
+++ b/modules/videoio/src/cap_openni2.cpp
@@ -90,7 +90,7 @@ struct OpenNI2Initializer
         openni::Status status = openni::OpenNI::initialize();
         if (status != openni::STATUS_OK)
         {
-            CV_Error(CV_StsError, std::string("Failed to initialize:") + openni::OpenNI::getExtendedError());
+            CV_Error(cv::Error::StsError, std::string("Failed to initialize:") + openni::OpenNI::getExtendedError());
         }
     }
 
@@ -150,7 +150,7 @@ class CvCapture_OpenNI2 : public CvCapture
     IplImage* retrieveIrImage();
 
     void toggleStream(int stream, bool toggle);
-    void readCamerasParams();
+    bool readCamerasParams();
 
     double getDepthGeneratorProperty(int propIdx) const;
     bool setDepthGeneratorProperty(int propIdx, double propVal);
@@ -274,7 +274,7 @@ CvCapture_OpenNI2::CvCapture_OpenNI2(int index, const char * filename) :
                 deviceURI = ldevs[index].getUri();
             else
             {
-                CV_Error(CV_StsError, "OpenCVKinect2: Device index exceeds the number of available OpenNI devices");
+                CV_Error(cv::Error::StsError, "OpenCVKinect2: Device index exceeds the number of available OpenNI devices");
             }
         }
     }
@@ -287,7 +287,7 @@ CvCapture_OpenNI2::CvCapture_OpenNI2(int index, const char * filename) :
     status = device.open(deviceURI);
     if (status != openni::STATUS_OK)
     {
-        CV_Error(CV_StsError, std::string("OpenCVKinect2: Failed to open device: ") + openni::OpenNI::getExtendedError());
+        CV_Error(cv::Error::StsError, std::string("OpenCVKinect2: Failed to open device: ") + openni::OpenNI::getExtendedError());
     }
 
     toggleStream(CV_DEPTH_STREAM, true);
@@ -361,7 +361,7 @@ void CvCapture_OpenNI2::toggleStream(int stream, bool toggle)
                     if (status != openni::STATUS_OK)
                     {
                         streams[stream].destroy();
-                        CV_Error(CV_StsError, std::string("OpenCVKinect2 : Couldn't set ") +
+                        CV_Error(cv::Error::StsError, std::string("OpenCVKinect2 : Couldn't set ") +
                                  stream_names[stream] + std::string(" stream output mode: ") +
                                  std::string(openni::OpenNI::getExtendedError()));
                     }
@@ -373,14 +373,14 @@ void CvCapture_OpenNI2::toggleStream(int stream, bool toggle)
             if (status != openni::STATUS_OK)
             {
                 streams[stream].destroy();
-                CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't start ") +
+                CV_Error(cv::Error::StsError, std::string("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't start ") +
                          stream_names[stream] + std::string(" stream: ") +
                          std::string(openni::OpenNI::getExtendedError()));
             }
         }
         else
         {
-            CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't find ") +
+            CV_Error(cv::Error::StsError, std::string("CvCapture_OpenNI2::CvCapture_OpenNI2 : Couldn't find ") +
                      stream_names[stream] + " stream: " +
                      std::string(openni::OpenNI::getExtendedError()));
         }
@@ -396,13 +396,14 @@ void CvCapture_OpenNI2::toggleStream(int stream, bool toggle)
 }
 
 
-void CvCapture_OpenNI2::readCamerasParams()
+bool CvCapture_OpenNI2::readCamerasParams()
 {
     double pixelSize = 0;
     if (streams[CV_DEPTH_STREAM].getProperty<double>(XN_STREAM_PROPERTY_ZERO_PLANE_PIXEL_SIZE, &pixelSize) != openni::STATUS_OK)
     {
-        CV_Error(CV_StsError, "CvCapture_OpenNI2::readCamerasParams : Could not read pixel size!" +
-                              std::string(openni::OpenNI::getExtendedError()));
+        CV_LOG_ERROR(NULL, "CvCapture_OpenNI2::readCamerasParams : Could not read pixel size!" +
+                           std::string(openni::OpenNI::getExtendedError()));
+        return false;
     }
 
     // pixel size @ VGA = pixel size @ SXGA x 2
@@ -412,14 +413,16 @@ void CvCapture_OpenNI2::readCamerasParams()
     unsigned long long zeroPlaneDistance; // in mm
     if (streams[CV_DEPTH_STREAM].getProperty(XN_STREAM_PROPERTY_ZERO_PLANE_DISTANCE, &zeroPlaneDistance) != openni::STATUS_OK)
     {
-        CV_Error(CV_StsError, "CvCapture_OpenNI2::readCamerasParams : Could not read virtual plane distance!" +
-                              std::string(openni::OpenNI::getExtendedError()));
+        CV_LOG_ERROR(NULL, "CvCapture_OpenNI2::readCamerasParams : Could not read virtual plane distance!" +
+                           std::string(openni::OpenNI::getExtendedError()));
+        return false;
     }
 
     if (streams[CV_DEPTH_STREAM].getProperty<double>(XN_STREAM_PROPERTY_EMITTER_DCMOS_DISTANCE, &baseline) != openni::STATUS_OK)
     {
-        CV_Error(CV_StsError, "CvCapture_OpenNI2::readCamerasParams : Could not read base line!" +
-                              std::string(openni::OpenNI::getExtendedError()));
+        CV_LOG_ERROR(NULL, "CvCapture_OpenNI2::readCamerasParams : Could not read base line!" +
+                           std::string(openni::OpenNI::getExtendedError()));
+        return false;
     }
 
     // baseline from cm -> mm
@@ -427,6 +430,8 @@ void CvCapture_OpenNI2::readCamerasParams()
 
     // focal length from mm -> pixels (valid for 640x480)
     depthFocalLength_VGA = (int)((double)zeroPlaneDistance / (double)pixelSize);
+
+    return true;
 }
 
 double CvCapture_OpenNI2::getProperty( int propIdx ) const
@@ -513,7 +518,7 @@ double CvCapture_OpenNI2::getCommonProperty( int propIdx ) const
         break;
     }
     default :
-        CV_Error( CV_StsBadArg, cv::format("Such parameter (propIdx=%d) isn't supported for getting.", propIdx) );
+        CV_LOG_WARNING( NULL, cv::format("Such parameter (propIdx=%d) isn't supported for getting.", propIdx) );
     }
 
     return propValue;
@@ -551,7 +556,7 @@ bool CvCapture_OpenNI2::setCommonProperty( int propIdx, double propValue )
         break;
 
     default:
-        CV_Error(CV_StsBadArg, cv::format("Such parameter (propIdx=%d) isn't supported for setting.", propIdx));
+        CV_LOG_WARNING(NULL, cv::format("Such parameter (propIdx=%d) isn't supported for setting.", propIdx));
     }
 
     return isSet;
@@ -585,12 +590,14 @@ double CvCapture_OpenNI2::getDepthGeneratorProperty( int propIdx ) const
         break;
     case CV_CAP_PROP_OPENNI_BASELINE :
         if(baseline <= 0)
-            const_cast<CvCapture_OpenNI2*>(this)->readCamerasParams();
+            if (!const_cast<CvCapture_OpenNI2*>(this)->readCamerasParams())
+                return 0;
         propValue = baseline;
         break;
     case CV_CAP_PROP_OPENNI_FOCAL_LENGTH :
         if(depthFocalLength_VGA <= 0)
-            const_cast<CvCapture_OpenNI2*>(this)->readCamerasParams();
+            if (!const_cast<CvCapture_OpenNI2*>(this)->readCamerasParams())
+                return 0;
         propValue = (double)depthFocalLength_VGA;
         break;
     case CV_CAP_PROP_OPENNI_REGISTRATION :
@@ -603,7 +610,7 @@ double CvCapture_OpenNI2::getDepthGeneratorProperty( int propIdx ) const
         propValue = streamFrames[CV_DEPTH_STREAM].getFrameIndex();
         break;
     default :
-        CV_Error( CV_StsBadArg, cv::format("Depth generator does not support such parameter (propIdx=%d) for getting.", propIdx) );
+        CV_LOG_WARNING( NULL, cv::format("Depth generator does not support such parameter (propIdx=%d) for getting.", propIdx) );
     }
 
     return propValue;
@@ -638,13 +645,17 @@ bool CvCapture_OpenNI2::setDepthGeneratorProperty( int propIdx, double propValue
                         {
                             openni::Status status = device.setImageRegistrationMode(mode);
                             if( status != openni::STATUS_OK )
-                                CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::setDepthGeneratorProperty: ") +
-                                         std::string(openni::OpenNI::getExtendedError()));
+                            {
+                                CV_LOG_ERROR(NULL, std::string("CvCapture_OpenNI2::setDepthGeneratorProperty: ") +
+                                                   std::string(openni::OpenNI::getExtendedError()));
+                            }
                             else
                                 isSet = true;
                         }
                         else
-                            CV_Error(CV_StsError, "CvCapture_OpenNI2::setDepthGeneratorProperty: Unsupported viewpoint.");
+                        {
+                            CV_LOG_ERROR(NULL, "CvCapture_OpenNI2::setDepthGeneratorProperty: Unsupported viewpoint.");
+                        }
                     }
                     else
                         isSet = true;
@@ -654,15 +665,17 @@ bool CvCapture_OpenNI2::setDepthGeneratorProperty( int propIdx, double propValue
             {
                 openni::Status status = device.setImageRegistrationMode(openni::IMAGE_REGISTRATION_OFF);
                 if( status != openni::STATUS_OK )
-                    CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::setDepthGeneratorProperty: ") +
-                             std::string(openni::OpenNI::getExtendedError()));
+                {
+                    CV_LOG_ERROR(NULL, std::string("CvCapture_OpenNI2::setDepthGeneratorProperty: ") +
+                                       std::string(openni::OpenNI::getExtendedError()));
+                }
                 else
                     isSet = true;
             }
         }
         break;
     default:
-        CV_Error( CV_StsBadArg, cv::format("Depth generator does not support such parameter (propIdx=%d) for setting.", propIdx) );
+        CV_LOG_WARNING( NULL, cv::format("OpenNI2: Depth generator does not support such parameter (propIdx=%d) for setting.", propIdx) );
     }
 
     return isSet;
@@ -696,7 +709,7 @@ double CvCapture_OpenNI2::getImageGeneratorProperty( int propIdx ) const
         propValue = (double)streamFrames[CV_COLOR_STREAM].getFrameIndex();
         break;
     default :
-        CV_Error( CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for getting.", propIdx) );
+        CV_LOG_WARNING( NULL, cv::format("OpenNI2: Image generator does not support such parameter (propIdx=%d) for getting.", propIdx) );
     }
 
     return propValue;
@@ -744,19 +757,22 @@ bool CvCapture_OpenNI2::setImageGeneratorProperty(int propIdx, double propValue)
                 mode.setFps(60);
                  break;
             default :
-                CV_Error( CV_StsBadArg, "Unsupported image generator output mode.");
+                CV_LOG_WARNING( NULL, "Unsupported image generator output mode.");
+                return false;
             }
 
             openni::Status status = streams[CV_COLOR_STREAM].setVideoMode( mode );
             if( status != openni::STATUS_OK )
-                CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::setImageGeneratorProperty: ") +
-                         std::string(openni::OpenNI::getExtendedError()));
+            {
+                CV_LOG_ERROR(NULL, std::string("CvCapture_OpenNI2::setImageGeneratorProperty: ") +
+                                     std::string(openni::OpenNI::getExtendedError()));
+            }
             else
                 isSet = true;
             break;
         }
         default:
-            CV_Error( CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for setting.", propIdx) );
+            CV_LOG_WARNING( NULL, cv::format("Image generator does not support such parameter (propIdx=%d) for setting.", propIdx) );
         }
 
     return isSet;
@@ -790,7 +806,7 @@ double CvCapture_OpenNI2::getIrGeneratorProperty(int propIdx) const
         propValue = (double)streamFrames[CV_IR_STREAM].getFrameIndex();
         break;
     default:
-        CV_Error(CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for getting.", propIdx));
+        CV_LOG_WARNING(NULL, cv::format("Image generator does not support such parameter (propIdx=%d) for getting.", propIdx));
     }
 
     return propValue;
@@ -838,19 +854,21 @@ bool CvCapture_OpenNI2::setIrGeneratorProperty(int propIdx, double propValue)
             mode.setFps(60);
             break;
         default:
-            CV_Error(CV_StsBadArg, "Unsupported image generator output mode.");
+            CV_LOG_WARNING(NULL, "Unsupported image generator output mode.");
         }
 
         openni::Status status = streams[CV_IR_STREAM].setVideoMode(mode);
         if (status != openni::STATUS_OK)
-            CV_Error(CV_StsError, std::string("CvCapture_OpenNI2::setImageGeneratorProperty: ") +
-                     std::string(openni::OpenNI::getExtendedError()));
+        {
+            CV_LOG_ERROR(NULL, std::string("CvCapture_OpenNI2::setImageGeneratorProperty: ") +
+                               std::string(openni::OpenNI::getExtendedError()));
+        }
         else
             isSet = true;
         break;
     }
     default:
-        CV_Error(CV_StsBadArg, cv::format("Image generator does not support such parameter (propIdx=%d) for setting.", propIdx));
+        CV_LOG_WARNING(NULL, cv::format("Image generator does not support such parameter (propIdx=%d) for setting.", propIdx));
     }
 
     return isSet;
@@ -965,9 +983,10 @@ static void computeDisparity_32F( const openni::VideoFrameRef& depthMetaData, cv
 IplImage* CvCapture_OpenNI2::retrieveDisparityMap()
 {
     if (!streamFrames[CV_DEPTH_STREAM].isValid())
-        return 0;
+        return nullptr;
 
-    readCamerasParams();
+    if (!readCamerasParams())
+        return nullptr;
 
     cv::Mat disp32;
     computeDisparity_32F(streamFrames[CV_DEPTH_STREAM], disp32, baseline, depthFocalLength_VGA, noSampleValue, shadowValue);
@@ -980,9 +999,10 @@ IplImage* CvCapture_OpenNI2::retrieveDisparityMap()
 IplImage* CvCapture_OpenNI2::retrieveDisparityMap_32F()
 {
     if (!streamFrames[CV_DEPTH_STREAM].isValid())
-        return 0;
+        return nullptr;
 
-    readCamerasParams();
+    if (!readCamerasParams())
+        return nullptr;
 
     computeDisparity_32F(streamFrames[CV_DEPTH_STREAM], outputMaps[CV_CAP_OPENNI_DISPARITY_MAP_32F].mat, baseline, depthFocalLength_VGA, noSampleValue, shadowValue);
 
@@ -992,7 +1012,7 @@ IplImage* CvCapture_OpenNI2::retrieveDisparityMap_32F()
 IplImage* CvCapture_OpenNI2::retrieveValidDepthMask()
 {
     if (!streamFrames[CV_DEPTH_STREAM].isValid())
-        return 0;
+        return nullptr;
 
     cv::Mat d;
     getDepthMapFromMetaData(streamFrames[CV_DEPTH_STREAM], d, noSampleValue, shadowValue);
@@ -1006,7 +1026,7 @@ inline void getBGRImageFromMetaData( const openni::VideoFrameRef& imageMetaData,
 {
    cv::Mat bufferImage;
    if( imageMetaData.getVideoMode().getPixelFormat() != openni::PIXEL_FORMAT_RGB888 )
-        CV_Error( CV_StsUnsupportedFormat, "Unsupported format of grabbed image." );
+        CV_Error( cv::Error::StsUnsupportedFormat, "Unsupported format of grabbed image." );
 
    bgrImage.create(imageMetaData.getHeight(), imageMetaData.getWidth(), CV_8UC3);
    bufferImage.create(imageMetaData.getHeight(), imageMetaData.getWidth(), CV_8UC3);
@@ -1029,7 +1049,7 @@ inline void getGrayImageFromMetaData(const openni::VideoFrameRef& imageMetaData,
     }
     else
     {
-        CV_Error(CV_StsUnsupportedFormat, "Unsupported format of grabbed image.");
+        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported format of grabbed image.");
     }
 }
 
diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp
index a5d8561c6b00..531af03d1aaf 100644
--- a/modules/videoio/src/cap_v4l.cpp
+++ b/modules/videoio/src/cap_v4l.cpp
@@ -260,6 +260,10 @@ typedef uint32_t __u32;
 #define V4L2_CID_IRIS_ABSOLUTE (V4L2_CID_CAMERA_CLASS_BASE+17)
 #endif
 
+#ifndef v4l2_fourcc_be
+#define v4l2_fourcc_be(a, b, c, d) (v4l2_fourcc(a, b, c, d) | (1U << 31))
+#endif
+
 #ifndef V4L2_PIX_FMT_Y10
 #define V4L2_PIX_FMT_Y10 v4l2_fourcc('Y', '1', '0', ' ')
 #endif
@@ -268,6 +272,14 @@ typedef uint32_t __u32;
 #define V4L2_PIX_FMT_Y12 v4l2_fourcc('Y', '1', '2', ' ')
 #endif
 
+#ifndef V4L2_PIX_FMT_Y16
+#define V4L2_PIX_FMT_Y16 v4l2_fourcc('Y', '1', '6', ' ')
+#endif
+
+#ifndef V4L2_PIX_FMT_Y16_BE
+#define V4L2_PIX_FMT_Y16_BE v4l2_fourcc_be('Y', '1', '6', ' ')
+#endif
+
 #ifndef V4L2_PIX_FMT_ABGR32
 #define V4L2_PIX_FMT_ABGR32  v4l2_fourcc('A', 'R', '2', '4')
 #endif
@@ -601,6 +613,7 @@ bool CvCaptureCAM_V4L::autosetup_capture_mode_v4l2()
             V4L2_PIX_FMT_NV21,
             V4L2_PIX_FMT_SBGGR8,
             V4L2_PIX_FMT_SGBRG8,
+            V4L2_PIX_FMT_SGRBG8,
             V4L2_PIX_FMT_XBGR32,
             V4L2_PIX_FMT_ABGR32,
             V4L2_PIX_FMT_SN9C10X,
@@ -609,6 +622,7 @@ bool CvCaptureCAM_V4L::autosetup_capture_mode_v4l2()
             V4L2_PIX_FMT_JPEG,
 #endif
             V4L2_PIX_FMT_Y16,
+            V4L2_PIX_FMT_Y16_BE,
             V4L2_PIX_FMT_Y12,
             V4L2_PIX_FMT_Y10,
             V4L2_PIX_FMT_GREY,
@@ -666,8 +680,10 @@ bool CvCaptureCAM_V4L::convertableToRgb() const
     case V4L2_PIX_FMT_SBGGR8:
     case V4L2_PIX_FMT_SN9C10X:
     case V4L2_PIX_FMT_SGBRG8:
+    case V4L2_PIX_FMT_SGRBG8:
     case V4L2_PIX_FMT_RGB24:
     case V4L2_PIX_FMT_Y16:
+    case V4L2_PIX_FMT_Y16_BE:
     case V4L2_PIX_FMT_Y10:
     case V4L2_PIX_FMT_GREY:
     case V4L2_PIX_FMT_BGR24:
@@ -716,6 +732,7 @@ void CvCaptureCAM_V4L::v4l2_create_frame()
             size.height = size.height * 3 / 2; // "1.5" channels
             break;
         case V4L2_PIX_FMT_Y16:
+        case V4L2_PIX_FMT_Y16_BE:
         case V4L2_PIX_FMT_Y12:
         case V4L2_PIX_FMT_Y10:
             depth = IPL_DEPTH_16U;
@@ -1304,174 +1321,6 @@ yuv411p_to_rgb24(int width, int height,
     }
 }
 
-/*
- * BAYER2RGB24 ROUTINE TAKEN FROM:
- *
- * Sonix SN9C10x based webcam basic I/F routines
- * Takafumi Mizuno <taka-qce@ls-a.jp>
- *
- */
-static void bayer2rgb24(long int WIDTH, long int HEIGHT, unsigned char *src, unsigned char *dst)
-{
-    long int i;
-    unsigned char *rawpt, *scanpt;
-    long int size;
-
-    rawpt = src;
-    scanpt = dst;
-    size = WIDTH*HEIGHT;
-
-    for ( i = 0; i < size; i++ ) {
-        if ( (i/WIDTH) % 2 == 0 ) {
-            if ( (i % 2) == 0 ) {
-                /* B */
-                if ( (i > WIDTH) && ((i % WIDTH) > 0) ) {
-                    *scanpt++ = (*(rawpt-WIDTH-1)+*(rawpt-WIDTH+1)+
-                            *(rawpt+WIDTH-1)+*(rawpt+WIDTH+1))/4;  /* R */
-                    *scanpt++ = (*(rawpt-1)+*(rawpt+1)+
-                            *(rawpt+WIDTH)+*(rawpt-WIDTH))/4;      /* G */
-                    *scanpt++ = *rawpt;                                     /* B */
-                } else {
-                    /* first line or left column */
-                    *scanpt++ = *(rawpt+WIDTH+1);           /* R */
-                    *scanpt++ = (*(rawpt+1)+*(rawpt+WIDTH))/2;      /* G */
-                    *scanpt++ = *rawpt;                             /* B */
-                }
-            } else {
-                /* (B)G */
-                if ( (i > WIDTH) && ((i % WIDTH) < (WIDTH-1)) ) {
-                    *scanpt++ = (*(rawpt+WIDTH)+*(rawpt-WIDTH))/2;  /* R */
-                    *scanpt++ = *rawpt;                                     /* G */
-                    *scanpt++ = (*(rawpt-1)+*(rawpt+1))/2;          /* B */
-                } else {
-                    /* first line or right column */
-                    *scanpt++ = *(rawpt+WIDTH);     /* R */
-                    *scanpt++ = *rawpt;             /* G */
-                    *scanpt++ = *(rawpt-1); /* B */
-                }
-            }
-        } else {
-            if ( (i % 2) == 0 ) {
-                /* G(R) */
-                if ( (i < (WIDTH*(HEIGHT-1))) && ((i % WIDTH) > 0) ) {
-                    *scanpt++ = (*(rawpt-1)+*(rawpt+1))/2;          /* R */
-                    *scanpt++ = *rawpt;                                     /* G */
-                    *scanpt++ = (*(rawpt+WIDTH)+*(rawpt-WIDTH))/2;  /* B */
-                } else {
-                    /* bottom line or left column */
-                    *scanpt++ = *(rawpt+1);         /* R */
-                    *scanpt++ = *rawpt;                     /* G */
-                    *scanpt++ = *(rawpt-WIDTH);             /* B */
-                }
-            } else {
-                /* R */
-                if ( i < (WIDTH*(HEIGHT-1)) && ((i % WIDTH) < (WIDTH-1)) ) {
-                    *scanpt++ = *rawpt;                                     /* R */
-                    *scanpt++ = (*(rawpt-1)+*(rawpt+1)+
-                            *(rawpt-WIDTH)+*(rawpt+WIDTH))/4;      /* G */
-                    *scanpt++ = (*(rawpt-WIDTH-1)+*(rawpt-WIDTH+1)+
-                            *(rawpt+WIDTH-1)+*(rawpt+WIDTH+1))/4;  /* B */
-                } else {
-                    /* bottom line or right column */
-                    *scanpt++ = *rawpt;                             /* R */
-                    *scanpt++ = (*(rawpt-1)+*(rawpt-WIDTH))/2;      /* G */
-                    *scanpt++ = *(rawpt-WIDTH-1);           /* B */
-                }
-            }
-        }
-        rawpt++;
-    }
-
-}
-
-// SGBRG to RGB24
-// for some reason, red and blue needs to be swapped
-// at least for  046d:092f Logitech, Inc. QuickCam Express Plus to work
-//see: http://www.siliconimaging.com/RGB%20Bayer.htm
-//and 4.6 at http://tldp.org/HOWTO/html_single/libdc1394-HOWTO/
-static void sgbrg2rgb24(long int WIDTH, long int HEIGHT, unsigned char *src, unsigned char *dst)
-{
-    long int i;
-    unsigned char *rawpt, *scanpt;
-    long int size;
-
-    rawpt = src;
-    scanpt = dst;
-    size = WIDTH*HEIGHT;
-
-    for ( i = 0; i < size; i++ )
-    {
-        if ( (i/WIDTH) % 2 == 0 ) //even row
-        {
-            if ( (i % 2) == 0 ) //even pixel
-            {
-                if ( (i > WIDTH) && ((i % WIDTH) > 0) )
-                {
-                    *scanpt++ = (*(rawpt-1)+*(rawpt+1))/2;       /* R */
-                    *scanpt++ = *(rawpt);                        /* G */
-                    *scanpt++ = (*(rawpt-WIDTH) + *(rawpt+WIDTH))/2;      /* B */
-                } else
-                {
-                    /* first line or left column */
-
-                    *scanpt++ = *(rawpt+1);           /* R */
-                    *scanpt++ = *(rawpt);             /* G */
-                    *scanpt++ =  *(rawpt+WIDTH);      /* B */
-                }
-            } else //odd pixel
-            {
-                if ( (i > WIDTH) && ((i % WIDTH) < (WIDTH-1)) )
-                {
-                    *scanpt++ = *(rawpt);       /* R */
-                    *scanpt++ = (*(rawpt-1)+*(rawpt+1)+*(rawpt-WIDTH)+*(rawpt+WIDTH))/4; /* G */
-                    *scanpt++ = (*(rawpt-WIDTH-1) + *(rawpt-WIDTH+1) + *(rawpt+WIDTH-1) + *(rawpt+WIDTH+1))/4;      /* B */
-                } else
-                {
-                    /* first line or right column */
-
-                    *scanpt++ = *(rawpt);       /* R */
-                    *scanpt++ = (*(rawpt-1)+*(rawpt+WIDTH))/2; /* G */
-                    *scanpt++ = *(rawpt+WIDTH-1);      /* B */
-                }
-            }
-        } else
-        { //odd row
-            if ( (i % 2) == 0 ) //even pixel
-            {
-                if ( (i < (WIDTH*(HEIGHT-1))) && ((i % WIDTH) > 0) )
-                {
-                    *scanpt++ =  (*(rawpt-WIDTH-1)+*(rawpt-WIDTH+1)+*(rawpt+WIDTH-1)+*(rawpt+WIDTH+1))/4;          /* R */
-                    *scanpt++ =  (*(rawpt-1)+*(rawpt+1)+*(rawpt-WIDTH)+*(rawpt+WIDTH))/4;      /* G */
-                    *scanpt++ =  *(rawpt); /* B */
-                } else
-                {
-                    /* bottom line or left column */
-
-                    *scanpt++ =  *(rawpt-WIDTH+1);          /* R */
-                    *scanpt++ =  (*(rawpt+1)+*(rawpt-WIDTH))/2;      /* G */
-                    *scanpt++ =  *(rawpt); /* B */
-                }
-            } else
-            { //odd pixel
-                if ( i < (WIDTH*(HEIGHT-1)) && ((i % WIDTH) < (WIDTH-1)) )
-                {
-                    *scanpt++ = (*(rawpt-WIDTH)+*(rawpt+WIDTH))/2;  /* R */
-                    *scanpt++ = *(rawpt);      /* G */
-                    *scanpt++ = (*(rawpt-1)+*(rawpt+1))/2; /* B */
-                } else
-                {
-                    /* bottom line or right column */
-
-                    *scanpt++ = (*(rawpt-WIDTH));  /* R */
-                    *scanpt++ = *(rawpt);      /* G */
-                    *scanpt++ = (*(rawpt-1)); /* B */
-                }
-            }
-        }
-        rawpt++;
-    }
-}
-
 #define CLAMP(x)        ((x)<0?0:((x)>255)?255:(x))
 
 typedef struct {
@@ -1673,24 +1522,6 @@ void CvCaptureCAM_V4L::convertToRgb(const Buffer &currentBuffer)
         yuv411p_to_rgb24(imageSize.width, imageSize.height,
                 start, (unsigned char*)frame.imageData);
         return;
-    case V4L2_PIX_FMT_SBGGR8:
-        bayer2rgb24(imageSize.width, imageSize.height,
-                start, (unsigned char*)frame.imageData);
-        return;
-
-    case V4L2_PIX_FMT_SN9C10X:
-        sonix_decompress_init();
-        sonix_decompress(imageSize.width, imageSize.height,
-                start, (unsigned char*)buffers[MAX_V4L_BUFFERS].memories[MEMORY_RGB].start);
-
-        bayer2rgb24(imageSize.width, imageSize.height,
-                (unsigned char*)buffers[MAX_V4L_BUFFERS].memories[MEMORY_RGB].start,
-                (unsigned char*)frame.imageData);
-        return;
-    case V4L2_PIX_FMT_SGBRG8:
-        sgbrg2rgb24(imageSize.width, imageSize.height,
-                start, (unsigned char*)frame.imageData);
-        return;
     default:
         break;
     }
@@ -1731,8 +1562,21 @@ void CvCaptureCAM_V4L::convertToRgb(const Buffer &currentBuffer)
         return;
     case V4L2_PIX_FMT_Y16:
     {
+        // https://www.kernel.org/doc/html/v4.10/media/uapi/v4l/pixfmt-y16.html
+        // This is a grey-scale image with a depth of 16 bits per pixel. The least significant byte is stored at lower memory addresses (little-endian).
+        // Note: 10-bits precision is not supported
+        cv::Mat temp(imageSize, CV_8UC1, buffers[MAX_V4L_BUFFERS].memories[MEMORY_RGB].start);
+        cv::extractChannel(cv::Mat(imageSize, CV_8UC2, start), temp, 1);  // 1 - second channel
+        cv::cvtColor(temp, destination, COLOR_GRAY2BGR);
+        return;
+    }
+    case V4L2_PIX_FMT_Y16_BE:
+    {
+        // https://www.kernel.org/doc/html/v4.10/media/uapi/v4l/pixfmt-y16-be.html
+        // This is a grey-scale image with a depth of 16 bits per pixel. The most significant byte is stored at lower memory addresses (big-endian).
+        // Note: 10-bits precision is not supported
         cv::Mat temp(imageSize, CV_8UC1, buffers[MAX_V4L_BUFFERS].memories[MEMORY_RGB].start);
-        cv::Mat(imageSize, CV_16UC1, start).convertTo(temp, CV_8U, 1.0 / 256);
+        cv::extractChannel(cv::Mat(imageSize, CV_8UC2, start), temp, 0);  // 0 - first channel
         cv::cvtColor(temp, destination, COLOR_GRAY2BGR);
         return;
     }
@@ -1750,6 +1594,36 @@ void CvCaptureCAM_V4L::convertToRgb(const Buffer &currentBuffer)
         cv::cvtColor(temp, destination, COLOR_GRAY2BGR);
         return;
     }
+    case V4L2_PIX_FMT_SN9C10X:
+    {
+        sonix_decompress_init();
+        sonix_decompress(imageSize.width, imageSize.height,
+                start, (unsigned char*)buffers[MAX_V4L_BUFFERS].memories[MEMORY_RGB].start);
+
+        cv::Mat cv_buf(imageSize, CV_8UC1, buffers[MAX_V4L_BUFFERS].memories[MEMORY_RGB].start);
+        cv::cvtColor(cv_buf, destination, COLOR_BayerRG2BGR);
+        return;
+    }
+    case V4L2_PIX_FMT_SRGGB8:
+    {
+        cv::cvtColor(cv::Mat(imageSize, CV_8UC1, start), destination, COLOR_BayerBG2BGR);
+        return;
+    }
+    case V4L2_PIX_FMT_SBGGR8:
+    {
+        cv::cvtColor(cv::Mat(imageSize, CV_8UC1, start), destination, COLOR_BayerRG2BGR);
+        return;
+    }
+    case V4L2_PIX_FMT_SGBRG8:
+    {
+        cv::cvtColor(cv::Mat(imageSize, CV_8UC1, start), destination, COLOR_BayerGR2BGR);
+        return;
+    }
+    case V4L2_PIX_FMT_SGRBG8:
+    {
+        cv::cvtColor(cv::Mat(imageSize, CV_8UC1, start), destination, COLOR_BayerGB2BGR);
+        return;
+    }
     case V4L2_PIX_FMT_GREY:
         cv::cvtColor(cv::Mat(imageSize, CV_8UC1, start), destination, COLOR_GRAY2BGR);
         break;
@@ -1854,8 +1728,12 @@ static inline cv::String capPropertyName(int prop)
         return "auto wb";
     case CAP_PROP_WB_TEMPERATURE:
         return "wb temperature";
+    case CAP_PROP_ORIENTATION_META:
+        return "orientation meta";
+    case CAP_PROP_ORIENTATION_AUTO:
+        return "orientation auto";
     default:
-        return "unknown";
+        return cv::format("unknown (%d)", prop);
     }
 }
 
@@ -1970,7 +1848,7 @@ bool CvCaptureCAM_V4L::controlInfo(int property_id, __u32 &_v4l2id, cv::Range &r
     v4l2_queryctrl queryctrl = v4l2_queryctrl();
     queryctrl.id = __u32(v4l2id);
     if (v4l2id == -1 || !tryIoctl(VIDIOC_QUERYCTRL, &queryctrl)) {
-        CV_LOG_INFO(NULL, "VIDEOIO(V4L2:" << deviceName << "): property " << capPropertyName(property_id) << " is not supported");
+        CV_LOG_INFO(NULL, "VIDEOIO(V4L2:" << deviceName << "): property '" << capPropertyName(property_id) << "' is not supported");
         return false;
     }
     _v4l2id = __u32(v4l2id);
@@ -2123,6 +2001,7 @@ bool CvCaptureCAM_V4L::setProperty( int property_id, double _value )
         }else{
             convert_rgb = false;
             releaseFrame();
+            v4l2_create_frame();
             return true;
         }
     case cv::CAP_PROP_FOURCC:
diff --git a/modules/videoio/src/cap_winrt_bridge.hpp b/modules/videoio/src/cap_winrt_bridge.hpp
index a1e134e6abed..b78f8544bb34 100644
--- a/modules/videoio/src/cap_winrt_bridge.hpp
+++ b/modules/videoio/src/cap_winrt_bridge.hpp
@@ -33,7 +33,7 @@
 #include <ppltasks.h>
 #include <concrt.h>
 #include <agile.h>
-#include <opencv2\core.hpp>
+#include <opencv2/core.hpp>
 
 #include <mutex>
 #include <memory>
@@ -114,4 +114,4 @@ class VideoioBridge
     cv::Mat backInputMat;
 
     int deviceIndex, width, height;
-};
\ No newline at end of file
+};
diff --git a/modules/videoio/src/container_avi.cpp b/modules/videoio/src/container_avi.cpp
index 3223e7709096..4aed7c888c61 100644
--- a/modules/videoio/src/container_avi.cpp
+++ b/modules/videoio/src/container_avi.cpp
@@ -3,6 +3,7 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include "opencv2/videoio/container_avi.private.hpp"
+#include <opencv2/core/utils/logger.hpp>
 #include <fstream>
 #include <limits>
 #include <typeinfo>
@@ -645,6 +646,11 @@ bool BitStream::open(const String& filename)
 {
     close();
     output.open(filename.c_str(), std::ios_base::binary);
+    if (!output.is_open())
+    {
+        CV_LOG_DEBUG(NULL, cv::format("Failed to open stream for writing to  \"%s\"", filename.c_str()));
+        return false;
+    }
     m_current = m_start;
     m_pos = 0;
     return true;
diff --git a/modules/videoio/test/test_audio.cpp b/modules/videoio/test/test_audio.cpp
index 961d8d809f8a..bf2b4bad680c 100644
--- a/modules/videoio/test/test_audio.cpp
+++ b/modules/videoio/test/test_audio.cpp
@@ -14,7 +14,7 @@ typedef std::tuple<std::string, int, int, double, VideoCaptureAPIs> param;
 class AudioBaseTest
 {
 protected:
-    AudioBaseTest(){};
+    AudioBaseTest(){}
     void getValidAudioData()
     {
         const double step = 3.14/22050;
@@ -157,7 +157,7 @@ class MediaTestFixture : public AudioBaseTest, public testing::TestWithParam <pa
             params = {  CAP_PROP_AUDIO_STREAM, 0,
                         CAP_PROP_VIDEO_STREAM, 0,
                         CAP_PROP_AUDIO_DATA_DEPTH, CV_16S };
-        };
+        }
 
     void doTest()
     {
@@ -186,6 +186,7 @@ class MediaTestFixture : public AudioBaseTest, public testing::TestWithParam <pa
                 double audio_shift = cap.get(CAP_PROP_AUDIO_SHIFT_NSEC);
                 double video0_timestamp = cap.get(CAP_PROP_POS_MSEC) * 1e-3;
                 audio0_timestamp = video0_timestamp + audio_shift * 1e-9;
+
                 std::cout << "video0 timestamp: " << video0_timestamp << "  audio0 timestamp: " << audio0_timestamp << " (audio shift nanoseconds: " << audio_shift << " , seconds: " << audio_shift * 1e-9 << ")" << std::endl;
             }
             ASSERT_TRUE(cap.retrieve(videoFrame));
@@ -228,7 +229,7 @@ class MediaTestFixture : public AudioBaseTest, public testing::TestWithParam <pa
                 EXPECT_NEAR(
                         cap.get(CAP_PROP_AUDIO_POS) / samplePerSecond + audio0_timestamp,
                         cap.get(CAP_PROP_POS_MSEC) * 1e-3,
-                        (1.0 / fps) * 0.3)
+                        (1.0 / fps) * 0.6)
                     << "CAP_PROP_AUDIO_POS=" << cap.get(CAP_PROP_AUDIO_POS) << " CAP_PROP_POS_MSEC=" << cap.get(CAP_PROP_POS_MSEC);
             }
             if (frame != 0 && frame != numberOfFrames-1 && audioData[0].size() != (size_t)numberOfSamples)
diff --git a/modules/videoio/test/test_camera.cpp b/modules/videoio/test/test_camera.cpp
index fc269959c393..f11fa3f25115 100644
--- a/modules/videoio/test/test_camera.cpp
+++ b/modules/videoio/test/test_camera.cpp
@@ -119,6 +119,26 @@ TEST(DISABLED_videoio_camera, v4l_read_mjpg)
     capture.release();
 }
 
+TEST(DISABLED_videoio_camera, msmf_read_yuyv)
+{
+    VideoCapture capture(CAP_MSMF);
+    ASSERT_TRUE(capture.isOpened());
+    ASSERT_TRUE(capture.set(CAP_PROP_FOURCC, VideoWriter::fourcc('Y', 'U', 'Y', 'V')));
+    std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
+    std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+    std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+    std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+    int fourcc = (int)capture.get(CAP_PROP_FOURCC);
+    std::cout << "FOURCC code: " << cv::format("0x%8x", fourcc) << std::endl;
+    cv::Mat frame;
+    for (int i = 0; i < 10; i++)
+    {
+        capture >> frame;
+        EXPECT_EQ(2, frame.channels());
+    }
+    capture.release();
+}
+
 TEST(DISABLED_videoio_camera, v4l_open_mjpg)
 {
     VideoCapture capture;
@@ -203,6 +223,35 @@ TEST(DISABLED_videoio_camera, v4l_read_framesize)
     capture.release();
 }
 
+TEST(DISABLED_videoio_camera, v4l_rgb_convert)
+{
+    VideoCapture capture(CAP_V4L2);
+    ASSERT_TRUE(capture.isOpened());
+    std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
+    std::cout << " Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+    std::cout << "      height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+    std::cout << "Pixel format: " << capture.get(cv::CAP_PROP_FORMAT) << std::endl;
+    if (capture.get(CAP_PROP_FOURCC) != VideoWriter::fourcc('Y', 'U', 'Y', 'V'))
+    {
+        throw SkipTestException("Camera does not support YUYV format");
+    }
+    capture.set(cv::CAP_PROP_CONVERT_RGB, 0);
+    std::cout << "New pixel format: " << capture.get(cv::CAP_PROP_FORMAT) << std::endl;
+
+    cv::Mat frame;
+    for (int i = 0; i < 10; i++)
+    {
+        int pixel_type  = (int)capture.get(cv::CAP_PROP_FORMAT);
+        int channels    = CV_MAT_CN(pixel_type);
+        int pixel_bytes = CV_ELEM_SIZE(pixel_type);
+
+        // YUYV is expected for most of popular USB cam (COLOR_YUV2BGR_YUYV conversion)
+        EXPECT_EQ(2, channels);
+        EXPECT_EQ(2, pixel_bytes);
+
+        capture >> frame;
+    }
+}
 
 static
 utils::Paths getTestCameras()
diff --git a/modules/videoio/test/test_ffmpeg.cpp b/modules/videoio/test/test_ffmpeg.cpp
index 35d425d5c15d..f4920e75c2f4 100644
--- a/modules/videoio/test/test_ffmpeg.cpp
+++ b/modules/videoio/test/test_ffmpeg.cpp
@@ -235,6 +235,256 @@ const videoio_container_params_t videoio_container_params[] =
 
 INSTANTIATE_TEST_CASE_P(/**/, videoio_container, testing::ValuesIn(videoio_container_params));
 
+typedef tuple<VideoCaptureAPIs, string, int, int, int, int, int> videoio_container_get_params_t;
+typedef testing::TestWithParam<videoio_container_get_params_t > videoio_container_get;
+
+TEST_P(videoio_container_get, read)
+{
+    const VideoCaptureAPIs api = get<0>(GetParam());
+
+    if (!videoio_registry::hasBackend(api))
+        throw SkipTestException("Backend was not found");
+
+    const string fileName = get<1>(GetParam());
+    const int height = get<2>(GetParam());
+    const int width = get<3>(GetParam());
+    const int nFrames = get<4>(GetParam());
+    const int bitrate = get<5>(GetParam());
+    const int fps = get<6>(GetParam());
+
+    VideoCapture container(findDataFile(fileName), api, { CAP_PROP_FORMAT, -1 });
+    if (!container.isOpened())
+        throw SkipTestException("Video stream is not supported");
+
+    const int heightProp = static_cast<int>(container.get(CAP_PROP_FRAME_HEIGHT));
+    ASSERT_EQ(height, heightProp);
+    const int widthProp = static_cast<int>(container.get(CAP_PROP_FRAME_WIDTH));
+    ASSERT_EQ(width, widthProp);
+    const int nFramesProp = static_cast<int>(container.get(CAP_PROP_FRAME_COUNT));
+    ASSERT_EQ(nFrames, nFramesProp);
+    const int bitrateProp = static_cast<int>(container.get(CAP_PROP_BITRATE));
+    ASSERT_EQ(bitrate, bitrateProp);
+    const double fpsProp = container.get(CAP_PROP_FPS);
+    ASSERT_EQ(fps, fpsProp);
+
+    vector<int> displayTimeMs;
+    int iFrame = 1;
+    while (container.grab()) {
+        displayTimeMs.push_back(static_cast<int>(container.get(CAP_PROP_POS_MSEC)));
+        const int iFrameProp = static_cast<int>(container.get(CAP_PROP_POS_FRAMES));
+        ASSERT_EQ(iFrame++, iFrameProp);
+    }
+    sort(displayTimeMs.begin(), displayTimeMs.end());
+    vector<int> displayTimeDiffMs(displayTimeMs.size());
+    std::adjacent_difference(displayTimeMs.begin(), displayTimeMs.end(), displayTimeDiffMs.begin());
+    auto minTimeMsIt = min_element(displayTimeDiffMs.begin() + 1, displayTimeDiffMs.end());
+    auto maxTimeMsIt = max_element(displayTimeDiffMs.begin() + 1, displayTimeDiffMs.end());
+    const int frameTimeMs = static_cast<int>(1000.0 / fps);
+    ASSERT_NEAR(frameTimeMs, *minTimeMsIt, 1);
+    ASSERT_NEAR(frameTimeMs, *maxTimeMsIt, 1);
+}
+
+const videoio_container_get_params_t videoio_container_get_params[] =
+{
+    videoio_container_get_params_t(CAP_FFMPEG, "video/big_buck_bunny.mp4", 384, 672, 125, 483, 24),
+    videoio_container_get_params_t(CAP_FFMPEG, "video/big_buck_bunny.mjpg.avi", 384, 672, 125, 2713, 24),
+    videoio_container_get_params_t(CAP_FFMPEG, "video/sample_322x242_15frames.yuv420p.libx264.mp4", 242, 322, 15, 542, 25)
+};
+
+INSTANTIATE_TEST_CASE_P(/**/, videoio_container_get, testing::ValuesIn(videoio_container_get_params));
+
+typedef tuple<string, string, int, int> videoio_encapsulate_params_t;
+typedef testing::TestWithParam< videoio_encapsulate_params_t > videoio_encapsulate;
+
+TEST_P(videoio_encapsulate, write)
+{
+    const VideoCaptureAPIs api = CAP_FFMPEG;
+    if (!videoio_registry::hasBackend(api))
+        throw SkipTestException("FFmpeg backend was not found");
+
+    const string fileName = findDataFile(get<0>(GetParam()));
+    const string ext = get<1>(GetParam());
+    const int idrPeriod = get<2>(GetParam());
+    const int nFrames = get<3>(GetParam());
+    const string fileNameOut = tempfile(cv::format("test_encapsulated_stream.%s", ext.c_str()).c_str());
+
+    // Use VideoWriter to encapsulate encoded video read with VideoReader
+    {
+        VideoCapture capRaw(fileName, api, { CAP_PROP_FORMAT, -1 });
+        ASSERT_TRUE(capRaw.isOpened());
+        const int width = static_cast<int>(capRaw.get(CAP_PROP_FRAME_WIDTH));
+        const int height = static_cast<int>(capRaw.get(CAP_PROP_FRAME_HEIGHT));
+        const double fps = capRaw.get(CAP_PROP_FPS);
+        const int codecExtradataIndex = static_cast<int>(capRaw.get(CAP_PROP_CODEC_EXTRADATA_INDEX));
+        Mat extraData;
+        capRaw.retrieve(extraData, codecExtradataIndex);
+        const int fourcc = static_cast<int>(capRaw.get(CAP_PROP_FOURCC));
+        const bool mpeg4 = (fourcc == fourccFromString("FMP4"));
+
+        VideoWriter container(fileNameOut, api, fourcc, fps, { width, height }, { VideoWriterProperties::VIDEOWRITER_PROP_RAW_VIDEO, 1, VideoWriterProperties::VIDEOWRITER_PROP_KEY_INTERVAL, idrPeriod });
+        ASSERT_TRUE(container.isOpened());
+        Mat rawFrame;
+        for (int i = 0; i < nFrames; i++) {
+            ASSERT_TRUE(capRaw.read(rawFrame));
+            ASSERT_FALSE(rawFrame.empty());
+            if (i == 0 && mpeg4) {
+                Mat tmp = rawFrame.clone();
+                const size_t newSzt = tmp.total() + extraData.total();
+                const int newSz = static_cast<int>(newSzt);
+                ASSERT_TRUE(newSzt == static_cast<size_t>(newSz));
+                rawFrame = Mat(1, newSz, CV_8UC1);
+                memcpy(rawFrame.data, extraData.data, extraData.total());
+                memcpy(rawFrame.data + extraData.total(), tmp.data, tmp.total());
+            }
+            container.write(rawFrame);
+        }
+        container.release();
+    }
+
+    std::cout << "Checking encapsulated video container: " << fileNameOut << std::endl;
+
+    // Check encapsulated video container is "identical" to the original
+    {
+        VideoCapture capReference(fileName), capActual(fileNameOut), capActualRaw(fileNameOut, api, { CAP_PROP_FORMAT, -1 });
+        ASSERT_TRUE(capReference.isOpened());
+        ASSERT_TRUE(capActual.isOpened());
+        ASSERT_TRUE(capActualRaw.isOpened());
+        const double fpsReference = capReference.get(CAP_PROP_FPS);
+        const double fpsActual = capActual.get(CAP_PROP_FPS);
+        ASSERT_NEAR(fpsReference, fpsActual, 1e-2);
+        const int nFramesActual = static_cast<int>(capActual.get(CAP_PROP_FRAME_COUNT));
+        ASSERT_EQ(nFrames, nFramesActual);
+
+        Mat reference, actual;
+        for (int i = 0; i < nFrames; i++) {
+            ASSERT_TRUE(capReference.read(reference));
+            ASSERT_FALSE(reference.empty());
+            ASSERT_TRUE(capActual.read(actual));
+            ASSERT_FALSE(actual.empty());
+            ASSERT_EQ(0, cvtest::norm(reference, actual, NORM_INF));
+
+            ASSERT_TRUE(capActualRaw.grab());
+            const bool keyFrameActual = capActualRaw.get(CAP_PROP_LRF_HAS_KEY_FRAME) == 1.;
+            const bool keyFrameReference = idrPeriod ? i % idrPeriod == 0 : 1;
+            ASSERT_EQ(keyFrameReference, keyFrameActual);
+        }
+    }
+
+    ASSERT_EQ(0, remove(fileNameOut.c_str()));
+}
+
+const videoio_encapsulate_params_t videoio_encapsulate_params[] =
+{
+    videoio_encapsulate_params_t("video/big_buck_bunny.h264", "avi", 125, 125),
+    videoio_encapsulate_params_t("video/big_buck_bunny.h265", "mp4", 125, 125),
+    videoio_encapsulate_params_t("video/big_buck_bunny.wmv", "wmv", 12, 13),
+    videoio_encapsulate_params_t("video/big_buck_bunny.mp4", "mp4", 12, 13),
+    videoio_encapsulate_params_t("video/big_buck_bunny.mjpg.avi", "mp4", 0, 4),
+    videoio_encapsulate_params_t("video/big_buck_bunny.mov", "mp4", 12, 13),
+    videoio_encapsulate_params_t("video/big_buck_bunny.avi", "mp4", 125, 125),
+    videoio_encapsulate_params_t("video/big_buck_bunny.mpg", "mp4", 12, 13),
+    videoio_encapsulate_params_t("video/VID00003-20100701-2204.wmv", "wmv", 12, 13),
+    videoio_encapsulate_params_t("video/VID00003-20100701-2204.mpg", "mp4", 12,13),
+    videoio_encapsulate_params_t("video/VID00003-20100701-2204.avi", "mp4", 12, 13),
+    videoio_encapsulate_params_t("video/VID00003-20100701-2204.3GP", "mp4", 51, 52),
+    videoio_encapsulate_params_t("video/sample_sorenson.avi", "mp4", 12, 13),
+    videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.libxvid.mp4", "mp4", 3, 4),
+    videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.mpeg2video.mp4", "mp4", 12, 13),
+    videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.mjpeg.mp4", "mp4", 0, 5),
+    videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.libx264.mp4", "avi", 15, 15),
+    videoio_encapsulate_params_t("../cv/tracking/faceocc2/data/faceocc2.webm", "webm", 128, 129),
+    videoio_encapsulate_params_t("../cv/video/1920x1080.avi", "mp4", 12, 13),
+    videoio_encapsulate_params_t("../cv/video/768x576.avi", "avi", 15, 16)
+    // Not supported by with FFmpeg:
+    //videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.libx265.mp4", "mp4", 15, 15),
+    //videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4", "mp4", 15, 15),
+
+};
+
+INSTANTIATE_TEST_CASE_P(/**/, videoio_encapsulate, testing::ValuesIn(videoio_encapsulate_params));
+
+TEST(videoio_encapsulate_set_idr, write)
+{
+    const VideoCaptureAPIs api = CAP_FFMPEG;
+    if (!videoio_registry::hasBackend(api))
+        throw SkipTestException("FFmpeg backend was not found");
+
+    const string fileName = findDataFile("video/big_buck_bunny.mp4");
+    const string ext = "mp4";
+    const string fileNameOut = tempfile(cv::format("test_encapsulated_stream_set_idr.%s", ext.c_str()).c_str());
+
+    // Use VideoWriter to encapsulate encoded video read with VideoReader
+    {
+        VideoCapture capRaw(fileName, api, { CAP_PROP_FORMAT, -1 });
+        ASSERT_TRUE(capRaw.isOpened());
+        const int width = static_cast<int>(capRaw.get(CAP_PROP_FRAME_WIDTH));
+        const int height = static_cast<int>(capRaw.get(CAP_PROP_FRAME_HEIGHT));
+        const double fps = capRaw.get(CAP_PROP_FPS);
+        const int codecExtradataIndex = static_cast<int>(capRaw.get(CAP_PROP_CODEC_EXTRADATA_INDEX));
+        Mat extraData;
+        capRaw.retrieve(extraData, codecExtradataIndex);
+        const int fourcc = static_cast<int>(capRaw.get(CAP_PROP_FOURCC));
+        const bool mpeg4 = (fourcc == fourccFromString("FMP4"));
+
+        VideoWriter container(fileNameOut, api, fourcc, fps, { width, height }, { VideoWriterProperties::VIDEOWRITER_PROP_RAW_VIDEO, 1 });
+        ASSERT_TRUE(container.isOpened());
+        Mat rawFrame;
+        int i = 0;
+        while (capRaw.read(rawFrame)) {
+            ASSERT_FALSE(rawFrame.empty());
+            if (i == 0 && mpeg4) {
+                Mat tmp = rawFrame.clone();
+                const size_t newSzt = tmp.total() + extraData.total();
+                const int newSz = static_cast<int>(newSzt);
+                ASSERT_TRUE(newSzt == static_cast<size_t>(newSz));
+                rawFrame = Mat(1, newSz, CV_8UC1);
+                memcpy(rawFrame.data, extraData.data, extraData.total());
+                memcpy(rawFrame.data + extraData.total(), tmp.data, tmp.total());
+            }
+            if (capRaw.get(CAP_PROP_LRF_HAS_KEY_FRAME) != 0)
+                container.set(VideoWriterProperties::VIDEOWRITER_PROP_KEY_FLAG, 1);
+            else
+                container.set(VideoWriterProperties::VIDEOWRITER_PROP_KEY_FLAG, 0);
+            container.write(rawFrame);
+            i++;
+        }
+        container.release();
+    }
+
+    std::cout << "Checking encapsulated video container: " << fileNameOut << std::endl;
+
+    // Check encapsulated video container is "identical" to the original
+    {
+        VideoCapture capReference(fileName), capReferenceRaw(fileName, api, { CAP_PROP_FORMAT, -1 }), capActual(fileNameOut), capActualRaw(fileNameOut, api, { CAP_PROP_FORMAT, -1 });
+        ASSERT_TRUE(capReference.isOpened());
+        ASSERT_TRUE(capActual.isOpened());
+        ASSERT_TRUE(capReferenceRaw.isOpened());
+        ASSERT_TRUE(capActualRaw.isOpened());
+        const double fpsReference = capReference.get(CAP_PROP_FPS);
+        const double fpsActual = capActual.get(CAP_PROP_FPS);
+        ASSERT_EQ(fpsReference, fpsActual);
+        const int nFramesReference = static_cast<int>(capReference.get(CAP_PROP_FRAME_COUNT));
+        const int nFramesActual = static_cast<int>(capActual.get(CAP_PROP_FRAME_COUNT));
+        ASSERT_EQ(nFramesReference, nFramesActual);
+
+        Mat reference, actual;
+        for (int i = 0; i < nFramesReference; i++) {
+            ASSERT_TRUE(capReference.read(reference));
+            ASSERT_FALSE(reference.empty());
+            ASSERT_TRUE(capActual.read(actual));
+            ASSERT_FALSE(actual.empty());
+            ASSERT_EQ(0, cvtest::norm(reference, actual, NORM_INF));
+            ASSERT_TRUE(capReferenceRaw.grab());
+            ASSERT_TRUE(capActualRaw.grab());
+            const bool keyFrameReference = capActualRaw.get(CAP_PROP_LRF_HAS_KEY_FRAME) == 1.;
+            const bool keyFrameActual = capActualRaw.get(CAP_PROP_LRF_HAS_KEY_FRAME) == 1.;
+            ASSERT_EQ(keyFrameReference, keyFrameActual);
+        }
+    }
+
+    ASSERT_EQ(0, remove(fileNameOut.c_str()));
+}
+
 typedef tuple<string, string, int> videoio_skip_params_t;
 typedef testing::TestWithParam< videoio_skip_params_t > videoio_skip;
 
@@ -476,6 +726,14 @@ static void ffmpeg_check_read_raw(VideoCapture& cap)
     EXPECT_EQ(CV_8UC1, data.type()) << "CV_8UC1 != " << typeToString(data.type());
     EXPECT_TRUE(data.rows == 1 || data.cols == 1) << data.size;
     EXPECT_EQ((size_t)37118, data.total());
+
+    // 12 is the nearset key frame to frame 18
+    EXPECT_TRUE(cap.set(CAP_PROP_POS_FRAMES, 18.));
+    EXPECT_EQ(cap.get(CAP_PROP_POS_FRAMES), 12.);
+    cap >> data;
+    EXPECT_EQ(CV_8UC1, data.type()) << "CV_8UC1 != " << typeToString(data.type());
+    EXPECT_TRUE(data.rows == 1 || data.cols == 1) << data.size;
+    EXPECT_EQ((size_t)8726, data.total());
 }
 
 TEST(videoio_ffmpeg, ffmpeg_check_extra_data)
@@ -506,6 +764,14 @@ TEST(videoio_ffmpeg, open_with_property)
         CAP_PROP_FORMAT, -1  // demux only
     }));
 
+    // confirm properties are returned without initializing AVCodecContext
+    EXPECT_EQ(cap.get(CAP_PROP_FORMAT), -1);
+    EXPECT_EQ(static_cast<int>(cap.get(CAP_PROP_FOURCC)), fourccFromString("FMP4"));
+    EXPECT_EQ(cap.get(CAP_PROP_N_THREADS), 0.0);
+    EXPECT_EQ(cap.get(CAP_PROP_FRAME_HEIGHT), 384.0);
+    EXPECT_EQ(cap.get(CAP_PROP_FRAME_WIDTH), 672.0);
+    EXPECT_EQ(cap.get(CAP_PROP_FRAME_COUNT), 125);
+    EXPECT_EQ(cap.get(CAP_PROP_FPS), 24.0);
     ffmpeg_check_read_raw(cap);
 }
 
@@ -519,6 +785,14 @@ TEST(videoio_ffmpeg, create_with_property)
         CAP_PROP_FORMAT, -1  // demux only
     });
 
+    // confirm properties are returned without initializing AVCodecContext
+    EXPECT_TRUE(cap.get(CAP_PROP_FORMAT) == -1);
+    EXPECT_EQ(static_cast<int>(cap.get(CAP_PROP_FOURCC)), fourccFromString("FMP4"));
+    EXPECT_EQ(cap.get(CAP_PROP_N_THREADS), 0.0);
+    EXPECT_EQ(cap.get(CAP_PROP_FRAME_HEIGHT), 384.0);
+    EXPECT_EQ(cap.get(CAP_PROP_FRAME_WIDTH), 672.0);
+    EXPECT_EQ(cap.get(CAP_PROP_FRAME_COUNT), 125);
+    EXPECT_EQ(cap.get(CAP_PROP_FPS), 24.0);
     ffmpeg_check_read_raw(cap);
 }
 
diff --git a/modules/videoio/test/test_gstreamer.cpp b/modules/videoio/test/test_gstreamer.cpp
index a8c24be438a3..ef9a6765a84b 100644
--- a/modules/videoio/test/test_gstreamer.cpp
+++ b/modules/videoio/test/test_gstreamer.cpp
@@ -178,4 +178,47 @@ TEST(videoio_gstreamer, timeout_property)
     }
 }
 
+//==============================================================================
+// Seeking test with manual GStreamer pipeline
+typedef testing::TestWithParam<string> gstreamer_bunny;
+
+TEST_P(gstreamer_bunny, manual_seek)
+{
+    if (!videoio_registry::hasBackend(CAP_GSTREAMER))
+        throw SkipTestException("GStreamer backend was not found");
+
+    const string video_file = BunnyParameters::getFilename("." + GetParam());
+    const string pipeline = "filesrc location=" + video_file + " ! decodebin ! videoconvert ! video/x-raw, format=BGR ! appsink drop=1";
+    const double target_pos = 3000.0;
+    const double ms_per_frame = 1000.0 / BunnyParameters::getFps();
+    VideoCapture cap;
+    cap.open(pipeline, CAP_GSTREAMER);
+    ASSERT_TRUE(cap.isOpened());
+    Mat img;
+    for (int i = 0; i < 10; i++)
+    {
+        cap >> img;
+    }
+    EXPECT_FALSE(img.empty());
+    cap.set(CAP_PROP_POS_MSEC, target_pos);
+    cap >> img;
+    EXPECT_FALSE(img.empty());
+    double actual_pos = cap.get(CAP_PROP_POS_MSEC);
+    EXPECT_NEAR(actual_pos, target_pos, ms_per_frame);
+}
+
+static const string bunny_params[] = {
+    // string("wmv"),
+    string("mov"),
+    string("mp4"),
+    // string("mpg"),
+    string("avi"),
+    // string("h264"),
+    // string("h265"),
+    string("mjpg.avi")
+};
+
+INSTANTIATE_TEST_CASE_P(videoio, gstreamer_bunny, testing::ValuesIn(bunny_params));
+
+
 }} // namespace
diff --git a/modules/videoio/test/test_images.cpp b/modules/videoio/test/test_images.cpp
new file mode 100644
index 000000000000..ccf507a50d5a
--- /dev/null
+++ b/modules/videoio/test/test_images.cpp
@@ -0,0 +1,294 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+#include "opencv2/core/utils/filesystem.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/videoio/utils.private.hpp"
+
+using namespace std;
+
+namespace opencv_test { namespace {
+
+struct ImageCollection
+{
+    string dirname;
+    string base;
+    string ext;
+    size_t first_idx;
+    size_t last_idx;
+    size_t width;
+public:
+    ImageCollection(const char *dirname_template = "opencv_test_images")
+        : first_idx(0), last_idx(0), width(0)
+    {
+        dirname = cv::tempfile(dirname_template);
+        cv::utils::fs::createDirectory(dirname);
+    }
+    ~ImageCollection()
+    {
+        cleanup();
+    }
+    void cleanup()
+    {
+        cv::utils::fs::remove_all(dirname);
+    }
+    void generate(size_t count, size_t first = 0, size_t width_ = 4, const string & base_ = "test", const string & ext_ = "png")
+    {
+        base = base_;
+        ext = ext_;
+        first_idx = first;
+        last_idx = first + count - 1;
+        width = width_;
+        for (size_t idx = first_idx; idx <= last_idx; ++idx)
+        {
+            const string filename = getFilename(idx);
+            imwrite(filename, getFrame(idx));
+        }
+    }
+    string getFilename(size_t idx = 0) const
+    {
+        ostringstream buf;
+        buf << dirname << "/" << base << setw(width) << setfill('0') << idx << "." << ext;
+        return buf.str();
+    }
+    string getPatternFilename() const
+    {
+        ostringstream buf;
+        buf << dirname << "/" << base << "%0" << width << "d" << "." << ext;
+        return buf.str();
+    }
+    string getFirstFilename() const
+    {
+        return getFilename(first_idx);
+    }
+    Mat getFirstFrame() const
+    {
+        return getFrame(first_idx);
+    }
+    size_t getCount() const
+    {
+        return last_idx - first_idx + 1;
+    }
+    string getDirname() const
+    {
+        return dirname;
+    }
+    static Mat getFrame(size_t idx)
+    {
+        const int sz = 100; // 100x100 or bigger
+        Mat res(sz, sz, CV_8UC3, Scalar::all(0));
+        circle(res, Point(idx % 100), idx % 50, Scalar::all(255), 2, LINE_8);
+        return res;
+    }
+};
+
+//==================================================================================================
+
+TEST(videoio_images, basic_read)
+{
+    ImageCollection col;
+    col.generate(20);
+    VideoCapture cap(col.getFirstFilename(), CAP_IMAGES);
+    ASSERT_TRUE(cap.isOpened());
+    size_t idx = 0;
+    while (cap.isOpened()) // TODO: isOpened is always true, even if there are no more images
+    {
+        Mat img;
+        const bool read_res = cap.read(img);
+        if (!read_res)
+            break;
+        EXPECT_MAT_N_DIFF(img, col.getFrame(idx), 0);
+        ++idx;
+    }
+    EXPECT_EQ(col.getCount(), idx);
+}
+
+TEST(videoio_images, basic_write)
+{
+    // writer should create files: test0000.png, ... test0019.png
+    ImageCollection col;
+    col.generate(1);
+    VideoWriter wri(col.getFirstFilename(), CAP_IMAGES, 0, 0, col.getFrame(0).size());
+    ASSERT_TRUE(wri.isOpened());
+    size_t idx = 0;
+    while (wri.isOpened())
+    {
+        wri << col.getFrame(idx);
+        Mat actual = imread(col.getFilename(idx));
+        EXPECT_MAT_N_DIFF(col.getFrame(idx), actual, 0);
+        if (++idx >= 20)
+            break;
+    }
+    wri.release();
+    ASSERT_FALSE(wri.isOpened());
+}
+
+TEST(videoio_images, bad)
+{
+    ImageCollection col;
+    {
+        ostringstream buf; buf << col.getDirname() << "/missing0000.png";
+        VideoCapture cap(buf.str(), CAP_IMAGES);
+        EXPECT_FALSE(cap.isOpened());
+        Mat img;
+        EXPECT_FALSE(cap.read(img));
+    }
+}
+
+TEST(videoio_images, seek)
+{
+    // check files: test0005.png, ..., test0024.png
+    // seek to valid and invalid frame numbers
+    // position is zero-based: valid frame numbers are 0, ..., 19
+    const int count = 20;
+    ImageCollection col;
+    col.generate(count, 5);
+    VideoCapture cap(col.getFirstFilename(), CAP_IMAGES);
+    ASSERT_TRUE(cap.isOpened());
+    EXPECT_EQ((size_t)count, (size_t)cap.get(CAP_PROP_FRAME_COUNT));
+    vector<int> positions { count / 2, 0, 1, count - 1, count, count + 100, -1, -100 };
+    for (const auto &pos : positions)
+    {
+        Mat img;
+        const bool res = cap.set(CAP_PROP_POS_FRAMES, pos);
+        if (pos >= count || pos < 0) // invalid position
+        {
+//            EXPECT_FALSE(res); // TODO: backend clamps invalid value to valid range, actual result is 'true'
+        }
+        else
+        {
+            EXPECT_TRUE(res);
+            EXPECT_GE(1., cap.get(CAP_PROP_POS_AVI_RATIO));
+            EXPECT_NEAR((double)pos / (count - 1),  cap.get(CAP_PROP_POS_AVI_RATIO), 1e-2);
+            EXPECT_EQ(pos, static_cast<decltype(pos)>(cap.get(CAP_PROP_POS_FRAMES)));
+            EXPECT_TRUE(cap.read(img));
+            EXPECT_MAT_N_DIFF(img, col.getFrame(col.first_idx + pos), 0);
+        }
+    }
+}
+
+TEST(videoio_images, pattern_overflow)
+{
+    // check files: test0.png, ..., test11.png
+    ImageCollection col;
+    col.generate(12, 0, 1);
+
+    {
+        VideoCapture cap(col.getFirstFilename(), CAP_IMAGES);
+        ASSERT_TRUE(cap.isOpened());
+        for (size_t idx = col.first_idx; idx <= col.last_idx; ++idx)
+        {
+            Mat img;
+            EXPECT_TRUE(cap.read(img));
+            EXPECT_MAT_N_DIFF(img, col.getFrame(idx), 0);
+        }
+    }
+    {
+        VideoCapture cap(col.getPatternFilename(), CAP_IMAGES);
+        ASSERT_TRUE(cap.isOpened());
+        for (size_t idx = col.first_idx; idx <= col.last_idx; ++idx)
+        {
+            Mat img;
+            EXPECT_TRUE(cap.read(img));
+            EXPECT_MAT_N_DIFF(img, col.getFrame(idx), 0);
+        }
+    }
+}
+
+TEST(videoio_images, pattern_max)
+{
+    // max supported number width for starting image is 9 digits
+    // but following images can be read as well
+    // test999999999.png ; test1000000000.png
+    ImageCollection col;
+    col.generate(2, 1000000000 - 1);
+    {
+        VideoCapture cap(col.getFirstFilename(), CAP_IMAGES);
+        ASSERT_TRUE(cap.isOpened());
+        Mat img;
+        EXPECT_TRUE(cap.read(img));
+        EXPECT_MAT_N_DIFF(img, col.getFrame(col.first_idx), 0);
+        EXPECT_TRUE(cap.read(img));
+        EXPECT_MAT_N_DIFF(img, col.getFrame(col.first_idx + 1), 0);
+    }
+    {
+        VideoWriter wri(col.getFirstFilename(), CAP_IMAGES, 0, 0, col.getFirstFrame().size());
+        ASSERT_TRUE(wri.isOpened());
+        Mat img = col.getFrame(0);
+        wri.write(img);
+        wri.write(img);
+        Mat actual;
+        actual = imread(col.getFilename(col.first_idx));
+        EXPECT_MAT_N_DIFF(actual, img, 0);
+        actual = imread(col.getFilename(col.first_idx));
+        EXPECT_MAT_N_DIFF(actual, img, 0);
+    }
+}
+
+TEST(videoio_images, extract_pattern)
+{
+    unsigned offset = 0;
+
+    // Min and max values
+    EXPECT_EQ("%01d.png", cv::icvExtractPattern("0.png", &offset));
+    EXPECT_EQ(0u, offset);
+    EXPECT_EQ("%09d.png", cv::icvExtractPattern("999999999.png", &offset));
+    EXPECT_EQ(999999999u, offset);
+
+    // Regular usage - start, end, middle
+    EXPECT_EQ("abc%04ddef.png", cv::icvExtractPattern("abc0048def.png", &offset));
+    EXPECT_EQ(48u, offset);
+    EXPECT_EQ("%05dabcdef.png", cv::icvExtractPattern("00049abcdef.png", &offset));
+    EXPECT_EQ(49u, offset);
+    EXPECT_EQ("abcdef%06d.png", cv::icvExtractPattern("abcdef000050.png", &offset));
+    EXPECT_EQ(50u, offset);
+
+    // Minus handling (should not handle)
+    EXPECT_EQ("abcdef-%01d.png", cv::icvExtractPattern("abcdef-8.png", &offset));
+    EXPECT_EQ(8u, offset);
+
+    // Two numbers (should select first)
+    // TODO: shouldn't it be last number?
+    EXPECT_EQ("%01d-abcdef-8.png", cv::icvExtractPattern("7-abcdef-8.png", &offset));
+    EXPECT_EQ(7u, offset);
+
+    // Paths (should select filename)
+    EXPECT_EQ("images005/abcdef%03d.png", cv::icvExtractPattern("images005/abcdef006.png", &offset));
+    EXPECT_EQ(6u, offset);
+    // TODO: fix
+    // EXPECT_EQ("images03\\abcdef%02d.png", cv::icvExtractPattern("images03\\abcdef04.png", &offset));
+    // EXPECT_EQ(4, offset);
+    EXPECT_EQ("/home/user/test/0/3348/../../3442/./0/1/3/4/5/14304324234/%01d.png",
+              cv::icvExtractPattern("/home/user/test/0/3348/../../3442/./0/1/3/4/5/14304324234/2.png", &offset));
+    EXPECT_EQ(2u, offset);
+
+    // Patterns '%0?[0-9][du]'
+    EXPECT_EQ("test%d.png", cv::icvExtractPattern("test%d.png", &offset));
+    EXPECT_EQ(0u, offset);
+    EXPECT_EQ("test%0d.png", cv::icvExtractPattern("test%0d.png", &offset));
+    EXPECT_EQ(0u, offset);
+    EXPECT_EQ("test%09d.png", cv::icvExtractPattern("test%09d.png", &offset));
+    EXPECT_EQ(0u, offset);
+    EXPECT_EQ("test%5u.png", cv::icvExtractPattern("test%5u.png", &offset));
+    EXPECT_EQ(0u, offset);
+
+    // Invalid arguments
+    EXPECT_THROW(cv::icvExtractPattern(string(), &offset), cv::Exception);
+    // TODO: fix?
+    // EXPECT_EQ(0u, offset);
+    EXPECT_THROW(cv::icvExtractPattern("test%010d.png", &offset), cv::Exception);
+    EXPECT_EQ(0u, offset);
+    EXPECT_THROW(cv::icvExtractPattern("1000000000.png", &offset), cv::Exception);
+    EXPECT_EQ(0u, offset);
+    EXPECT_THROW(cv::icvExtractPattern("1.png", NULL), cv::Exception);
+}
+
+// TODO: should writer overwrite files?
+// TODO: is clamping good for seeking?
+// TODO: missing files? E.g. 3, 4, 6, 7, 8 (should it finish OR jump over OR return empty frame?)
+// TODO: non-numbered files (https://github.com/opencv/opencv/pull/23815)
+// TODO: when opening with pattern (e.g. test%01d.png), first frame can be only 0 (test0.png)
+
+}} // opencv_test::<anonymous>::
diff --git a/modules/videoio/test/test_precomp.hpp b/modules/videoio/test/test_precomp.hpp
index cffdf2bef45a..835177729b77 100644
--- a/modules/videoio/test/test_precomp.hpp
+++ b/modules/videoio/test/test_precomp.hpp
@@ -5,8 +5,11 @@
 #define __OPENCV_TEST_PRECOMP_HPP__
 
 #include <sstream>
+#include <algorithm>
+#include <numeric>
 
 #include "opencv2/ts.hpp"
+#include "opencv2/ts/ocl_test.hpp"
 #include "opencv2/videoio.hpp"
 #include "opencv2/videoio/registry.hpp"
 #include "opencv2/core/private.hpp"
@@ -55,6 +58,14 @@ inline std::string fourccToString(int fourcc)
     return cv::format("%c%c%c%c", fourcc & 255, (fourcc >> 8) & 255, (fourcc >> 16) & 255, (fourcc >> 24) & 255);
 }
 
+inline std::string fourccToStringSafe(int fourcc)
+{
+    std::string res = fourccToString(fourcc);
+    // TODO: return hex values for invalid characters
+    std::transform(res.begin(), res.end(), res.begin(), [](char c) -> char { return (c >= '0' && c <= 'z') ? c : (c == ' ' ? '_' : 'x'); });
+    return res;
+}
+
 inline int fourccFromString(const std::string &fourcc)
 {
     if (fourcc.size() != 4) return 0;
@@ -84,11 +95,11 @@ inline void generateFrame(int i, int FRAME_COUNT, cv::Mat & frame)
 class BunnyParameters
 {
 public:
-    inline static int    getWidth()  { return 672; };
-    inline static int    getHeight() { return 384; };
-    inline static int    getFps()    { return 24; };
-    inline static double getTime()   { return 5.21; };
-    inline static int    getCount()  { return cvRound(getFps() * getTime()); };
+    inline static int    getWidth()  { return 672; }
+    inline static int    getHeight() { return 384; }
+    inline static int    getFps()    { return 24; }
+    inline static double getTime()   { return 5.21; }
+    inline static int    getCount()  { return cvRound(getFps() * getTime()); }
     inline static std::string getFilename(const std::string &ext)
     {
         return cvtest::TS::ptr()->get_data_path() + "video/big_buck_bunny" + ext;
diff --git a/modules/videoio/test/test_v4l2.cpp b/modules/videoio/test/test_v4l2.cpp
new file mode 100644
index 000000000000..b336a6fd8a57
--- /dev/null
+++ b/modules/videoio/test/test_v4l2.cpp
@@ -0,0 +1,163 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Reference: https://www.kernel.org/doc/html/v4.8/media/v4l-drivers/vivid.html
+
+// create 1 virtual device of type CAP (0x1) at /dev/video10
+//   sudo modprobe vivid ndevs=1 node_types=0x1 vid_cap_nr=10
+// make sure user have read/write access (e.g. via group 'video')
+//   $ ls -l /dev/video10
+//   crw-rw----+ 1 root video ... /dev/video10
+// set environment variable:
+//   export OPENCV_TEST_V4L2_VIVID_DEVICE=/dev/video10
+// run v4l2 tests:
+//   opencv_test_videoio --gtest_filter=*videoio_v4l2*
+
+
+#ifdef HAVE_CAMV4L2
+
+// #define DUMP_CAMERA_FRAME
+
+#include "test_precomp.hpp"
+#include <opencv2/core/utils/configuration.private.hpp>
+#include <linux/videodev2.h>
+
+// workarounds for older versions
+#ifndef v4l2_fourcc_be
+#define v4l2_fourcc_be(a, b, c, d) (v4l2_fourcc(a, b, c, d) | (1U << 31))
+#endif
+#ifndef V4L2_PIX_FMT_Y10
+#define V4L2_PIX_FMT_Y10 v4l2_fourcc('Y', '1', '0', ' ')
+#endif
+#ifndef V4L2_PIX_FMT_Y12
+#define V4L2_PIX_FMT_Y12 v4l2_fourcc('Y', '1', '2', ' ')
+#endif
+#ifndef V4L2_PIX_FMT_ABGR32
+#define V4L2_PIX_FMT_ABGR32  v4l2_fourcc('A', 'R', '2', '4')
+#endif
+#ifndef V4L2_PIX_FMT_XBGR32
+#define V4L2_PIX_FMT_XBGR32  v4l2_fourcc('X', 'R', '2', '4')
+#endif
+#ifndef V4L2_PIX_FMT_Y16
+#define V4L2_PIX_FMT_Y16 v4l2_fourcc('Y', '1', '6', ' ')
+#endif
+#ifndef V4L2_PIX_FMT_Y16_BE
+#define V4L2_PIX_FMT_Y16_BE v4l2_fourcc_be('Y', '1', '6', ' ')
+#endif
+
+
+using namespace cv;
+
+namespace opencv_test { namespace {
+
+struct Format_Channels_Depth
+{
+    uint32_t pixel_format;
+    uint8_t channels;
+    uint8_t depth;
+    float mul_width;
+    float mul_height;
+};
+
+typedef testing::TestWithParam<Format_Channels_Depth> videoio_v4l2;
+
+TEST_P(videoio_v4l2, formats)
+{
+    utils::Paths devs = utils::getConfigurationParameterPaths("OPENCV_TEST_V4L2_VIVID_DEVICE");
+    if (devs.size() != 1)
+    {
+        throw SkipTestException("OPENCV_TEST_V4L2_VIVID_DEVICE is not set");
+    }
+    const string device = devs[0];
+    const Size sz(640, 480);
+    const Format_Channels_Depth params = GetParam();
+    const Size esz(sz.width * params.mul_width, sz.height * params.mul_height);
+
+    {
+        // Case with RAW output
+        VideoCapture cap;
+        ASSERT_TRUE(cap.open(device, CAP_V4L2));
+        // VideoCapture will set device's format automatically, vivid device will accept it
+        ASSERT_TRUE(cap.set(CAP_PROP_FOURCC, params.pixel_format));
+        ASSERT_TRUE(cap.set(CAP_PROP_CONVERT_RGB, false));
+        for (size_t idx = 0; idx < 3; ++idx)
+        {
+            Mat img;
+            EXPECT_TRUE(cap.grab());
+            EXPECT_TRUE(cap.retrieve(img));
+            if (params.pixel_format == V4L2_PIX_FMT_SRGGB8 ||
+                params.pixel_format == V4L2_PIX_FMT_SBGGR8 ||
+                params.pixel_format == V4L2_PIX_FMT_SGBRG8 ||
+                params.pixel_format == V4L2_PIX_FMT_SGRBG8)
+            {
+                EXPECT_EQ((size_t)esz.area(), img.total());
+            }
+            else
+            {
+                EXPECT_EQ(esz, img.size());
+            }
+            EXPECT_EQ(params.channels, img.channels());
+            EXPECT_EQ(params.depth, img.depth());
+        }
+    }
+    {
+        // case with BGR output
+        VideoCapture cap;
+        ASSERT_TRUE(cap.open(device, CAP_V4L2));
+        // VideoCapture will set device's format automatically, vivid device will accept it
+        ASSERT_TRUE(cap.set(CAP_PROP_FOURCC, params.pixel_format));
+        for (size_t idx = 0; idx < 3; ++idx)
+        {
+            Mat img;
+            EXPECT_TRUE(cap.grab());
+            EXPECT_TRUE(cap.retrieve(img));
+            EXPECT_EQ(sz, img.size());
+            EXPECT_EQ(3, img.channels());
+            EXPECT_EQ(CV_8U, img.depth());
+#ifdef DUMP_CAMERA_FRAME
+            std::string img_name = "frame_" + fourccToString(params.pixel_format);
+            // V4L2 flag for big-endian formats
+            if(params.pixel_format & (1 << 31))
+                img_name += "-BE";
+            cv::imwrite(img_name + ".png", img);
+#endif
+        }
+    }
+}
+
+vector<Format_Channels_Depth> all_params = {
+    { V4L2_PIX_FMT_YVU420, 1, CV_8U, 1.f, 1.5f },
+    { V4L2_PIX_FMT_YUV420, 1, CV_8U, 1.f, 1.5f },
+    { V4L2_PIX_FMT_NV12, 1, CV_8U, 1.f, 1.5f },
+    { V4L2_PIX_FMT_NV21, 1, CV_8U, 1.f, 1.5f },
+    { V4L2_PIX_FMT_YUV411P, 3, CV_8U, 1.f, 1.f },
+//    { V4L2_PIX_FMT_MJPEG, 1, CV_8U, 1.f, 1.f },
+//    { V4L2_PIX_FMT_JPEG, 1, CV_8U, 1.f, 1.f },
+    { V4L2_PIX_FMT_YUYV, 2, CV_8U, 1.f, 1.f },
+    { V4L2_PIX_FMT_UYVY, 2, CV_8U, 1.f, 1.f },
+    { V4L2_PIX_FMT_SN9C10X, 3, CV_8U, 1.f, 1.f },
+    { V4L2_PIX_FMT_SRGGB8, 1, CV_8U, 1.f, 1.f },
+    { V4L2_PIX_FMT_SBGGR8, 1, CV_8U, 1.f, 1.f },
+    { V4L2_PIX_FMT_SGBRG8, 1, CV_8U, 1.f, 1.f },
+    { V4L2_PIX_FMT_SGRBG8, 1, CV_8U, 1.f, 1.f },
+    { V4L2_PIX_FMT_RGB24, 3, CV_8U, 1.f, 1.f },
+    { V4L2_PIX_FMT_Y16, 1, CV_16U, 1.f, 1.f },
+    { V4L2_PIX_FMT_Y16_BE, 1, CV_16U, 1.f, 1.f },
+    { V4L2_PIX_FMT_Y10, 1, CV_16U, 1.f, 1.f },
+    { V4L2_PIX_FMT_GREY, 1, CV_8U, 1.f, 1.f },
+    { V4L2_PIX_FMT_BGR24, 3, CV_8U, 1.f, 1.f },
+    { V4L2_PIX_FMT_XBGR32, 3, CV_8U, 1.f, 1.f },
+    { V4L2_PIX_FMT_ABGR32, 3, CV_8U, 1.f, 1.f },
+};
+
+inline static std::string param_printer(const testing::TestParamInfo<videoio_v4l2::ParamType>& info)
+{
+    return fourccToStringSafe(info.param.pixel_format);
+}
+
+INSTANTIATE_TEST_CASE_P(/*videoio_v4l2*/, videoio_v4l2, ValuesIn(all_params), param_printer);
+
+}} // opencv_test::<anonymous>::
+
+#endif // HAVE_CAMV4L2
diff --git a/modules/videoio/test/test_video_io.cpp b/modules/videoio/test/test_video_io.cpp
index 7d7944f5eb0b..9c9cdddbff24 100644
--- a/modules/videoio/test/test_video_io.cpp
+++ b/modules/videoio/test/test_video_io.cpp
@@ -67,6 +67,11 @@ class Videoio_Test_Base
             std::cout << "CAP_PROP_FRAME_COUNT is not supported by backend. Assume 50 frames." << std::endl;
             n_frames = 50;
         }
+        // GStreamer can't read frame count of big_buck_bunny.wmv
+        if (apiPref == CAP_GSTREAMER && ext == "wmv")
+        {
+            n_frames = 125;
+        }
 
         {
             SCOPED_TRACE("consecutive read");
@@ -166,7 +171,7 @@ class videoio_bunny : public Videoio_Test_Base, public testing::TestWithParam<Ba
         EXPECT_NO_THROW(count_prop = (int)cap.get(CAP_PROP_FRAME_COUNT));
         // mpg file reports 5.08 sec * 24 fps => property returns 122 frames
         // but actual number of frames returned is 125
-        if (ext != "mpg")
+        if (ext != "mpg" && !(apiPref == CAP_GSTREAMER && ext == "wmv"))
         {
             if (count_prop > 0)
             {
@@ -200,12 +205,11 @@ class videoio_bunny : public Videoio_Test_Base, public testing::TestWithParam<Ba
         if (!isBackendAvailable(apiPref, cv::videoio_registry::getStreamBackends()))
             throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref));
 
-        // GStreamer: https://github.com/opencv/opencv/issues/19025
-        if (apiPref == CAP_GSTREAMER)
+        if (((apiPref == CAP_GSTREAMER) && (ext == "avi")))
             throw SkipTestException(cv::String("Backend ") +  cv::videoio_registry::getBackendName(apiPref) +
-                    cv::String(" does not return reliable values for CAP_PROP_POS_MSEC property"));
+                    cv::String(" does not support CAP_PROP_POS_MSEC option"));
 
-        if (((apiPref == CAP_FFMPEG) && ((ext == "h264") || (ext == "h265"))))
+        if (((apiPref == CAP_FFMPEG || apiPref == CAP_GSTREAMER) && ((ext == "h264") || (ext == "h265"))))
             throw SkipTestException(cv::String("Backend ") +  cv::videoio_registry::getBackendName(apiPref) +
                     cv::String(" does not support CAP_PROP_POS_MSEC option"));
 
@@ -221,8 +225,8 @@ class videoio_bunny : public Videoio_Test_Base, public testing::TestWithParam<Ba
         // mpg file reports 5.08 sec * 24 fps => property returns 122 frames,but actual number of frames returned is 125
         // HACK: CAP_PROP_FRAME_COUNT is not supported for vmw + MSMF. Just force check for all 125 frames
         if (ext == "mpg")
-            EXPECT_GT(frame_count, 121);
-        else if ((ext == "wmv") && (apiPref == CAP_MSMF))
+            EXPECT_GE(frame_count, 114);
+        else if ((ext == "wmv") && (apiPref == CAP_MSMF || apiPref == CAP_GSTREAMER))
             frame_count = 125;
         else
             EXPECT_EQ(frame_count, 125);
@@ -240,6 +244,9 @@ class videoio_bunny : public Videoio_Test_Base, public testing::TestWithParam<Ba
             if (cvtest::debugLevel > 0)
                 std::cout << "i = " << i << ": timestamp = " << timestamp << std::endl;
             const double frame_period = 1000.f/bunny_param.getFps();
+            // big_buck_bunny.mpg starts at 0.500 msec
+            if ((ext == "mpg") && (apiPref == CAP_GSTREAMER))
+                timestamp -= 500.0;
             // NOTE: eps == frame_period, because videoCapture returns frame beginning timestamp or frame end
             // timestamp depending on codec and back-end. So the first frame has timestamp 0 or frame_period.
             EXPECT_NEAR(timestamp, i*frame_period, frame_period) << "i=" << i;
diff --git a/modules/world/CMakeLists.txt b/modules/world/CMakeLists.txt
index b14378599e86..b6a0fedde142 100644
--- a/modules/world/CMakeLists.txt
+++ b/modules/world/CMakeLists.txt
@@ -22,6 +22,24 @@ if(NOT OPENCV_INITIAL_PASS)
   set(ENABLE_PRECOMPILED_HEADERS OFF CACHE INTERNAL "" FORCE)
   project(opencv_world)
 
+  # MSVS 2014 (vc14): LINK : fatal error LNK1210: exceeded internal ILK size limit; link with /INCREMENTAL:NO
+  if(MSVC AND MSVC_VERSION EQUAL 1900)
+    foreach(flag_var
+            CMAKE_EXE_LINKER_FLAGS_DEBUG
+            CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
+            CMAKE_MODULE_LINKER_FLAGS_DEBUG
+            CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
+            CMAKE_SHARED_LINKER_FLAGS_DEBUG
+            CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
+    )
+      if(${flag_var} MATCHES "/INCREMENTAL")
+        string(REGEX REPLACE "/INCREMENTAL[^ ]*" "/INCREMENTAL:NO" ${flag_var} "${${flag_var}}")
+      else()
+        set(${flag_var} "${${flag_var}} /INCREMENTAL:NO*")
+      endif()
+    endforeach(flag_var)
+  endif()
+
   message(STATUS "Processing WORLD modules...")
   foreach(m ${OPENCV_MODULES_BUILD})
     set(the_module ${m})
diff --git a/platforms/android/aar-template/OpenCV/build.gradle.template b/platforms/android/aar-template/OpenCV/build.gradle.template
new file mode 100644
index 000000000000..b12802063b03
--- /dev/null
+++ b/platforms/android/aar-template/OpenCV/build.gradle.template
@@ -0,0 +1,116 @@
+plugins {
+    id 'com.android.library'
+    id 'maven-publish'
+    id 'kotlin-android'
+}
+
+android {
+    namespace 'org.opencv'
+    compileSdk ${COMPILE_SDK}
+
+    defaultConfig {
+        minSdk ${MIN_SDK}
+        targetSdk ${TARGET_SDK}
+
+        testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
+        externalNativeBuild {
+            cmake {
+                cppFlags ""
+                arguments "-DANDROID_STL=${LIB_TYPE}"
+            }
+        }
+        ndk {
+            abiFilters ${ABI_FILTERS}
+        }
+    }
+
+    buildTypes {
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
+        }
+    }
+    compileOptions {
+        sourceCompatibility JavaVersion.VERSION_${JAVA_VERSION}
+        targetCompatibility JavaVersion.VERSION_${JAVA_VERSION}
+    }
+    externalNativeBuild {
+        cmake {
+            path file('src/main/cpp/CMakeLists.txt')
+        }
+    }
+    buildFeatures {
+        prefabPublishing true
+        buildConfig true
+    }
+    prefab {
+        ${LIB_NAME} {
+            headers "src/main/cpp/include"
+        }
+    }
+    sourceSets {
+        main {
+            java.srcDirs = ['src/main/java']
+            //jniLibs.srcDirs = ['libs']
+        }
+    }
+
+    publishing {
+        singleVariant('release') {
+            withSourcesJar()
+            withJavadocJar()
+        }
+    }
+}
+
+publishing {
+    publications {
+        release(MavenPublication) {
+            // Builds aar, sources jar and javadoc jar from project sources and creates maven
+            groupId = 'org.opencv'
+            artifactId = '${PACKAGE_NAME}'
+            version = '${OPENCV_VERSION}'
+            afterEvaluate {
+                from components.release
+            }
+        }
+        modified(MavenPublication) {
+            // Creates maven from opencv-release.aar
+            groupId = 'org.opencv'
+            artifactId = '${PACKAGE_NAME}'
+            version = '${OPENCV_VERSION}'
+            artifact("opencv-release.aar")
+            pom {
+                name = "OpenCV"
+                description = "Open Source Computer Vision Library"
+                url = "https://opencv.org/"
+                licenses {
+                    license {
+                        name = "The Apache License, Version 2.0"
+                        url = "https://github.com/opencv/opencv/blob/master/LICENSE"
+                    }
+                }
+                developers {
+                    developer {
+                        id = "admin"
+                        name = "OpenCV Team"
+                        email = "admin@opencv.org"
+                    }
+                }
+                scm {
+                    connection = "scm:git:https://github.com/opencv/opencv.git"
+                    url = "https://github.com/opencv/opencv"
+                }
+            }
+        }
+    }
+    repositories {
+        maven {
+            name = 'myrepo'
+            url = "${project.buildDir}/repo"
+        }
+    }
+}
+
+dependencies {
+}
diff --git a/platforms/android/aar-template/OpenCV/proguard-rules.pro b/platforms/android/aar-template/OpenCV/proguard-rules.pro
new file mode 100644
index 000000000000..481bb4348141
--- /dev/null
+++ b/platforms/android/aar-template/OpenCV/proguard-rules.pro
@@ -0,0 +1,21 @@
+# Add project specific ProGuard rules here.
+# You can control the set of applied configuration files using the
+# proguardFiles setting in build.gradle.
+#
+# For more details, see
+#   http://developer.android.com/guide/developing/tools/proguard.html
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+#   public *;
+#}
+
+# Uncomment this to preserve the line number information for
+# debugging stack traces.
+#-keepattributes SourceFile,LineNumberTable
+
+# If you keep the line number information, uncomment this to
+# hide the original source file name.
+#-renamesourcefileattribute SourceFile
\ No newline at end of file
diff --git a/platforms/android/aar-template/OpenCV/src/main/AndroidManifest.xml b/platforms/android/aar-template/OpenCV/src/main/AndroidManifest.xml
new file mode 100644
index 000000000000..cce937ee783a
--- /dev/null
+++ b/platforms/android/aar-template/OpenCV/src/main/AndroidManifest.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android">
+
+</manifest>
\ No newline at end of file
diff --git a/platforms/android/aar-template/OpenCV/src/main/cpp/CMakeLists.txt.template b/platforms/android/aar-template/OpenCV/src/main/cpp/CMakeLists.txt.template
new file mode 100644
index 000000000000..02ef035e0a0f
--- /dev/null
+++ b/platforms/android/aar-template/OpenCV/src/main/cpp/CMakeLists.txt.template
@@ -0,0 +1,5 @@
+cmake_minimum_required(VERSION 3.6)
+
+project("opencv")
+
+add_library(${LIB_NAME} ${LIB_TYPE} native-lib.cpp)
diff --git a/platforms/android/aar-template/OpenCV/src/main/cpp/include/temp.h b/platforms/android/aar-template/OpenCV/src/main/cpp/include/temp.h
new file mode 100644
index 000000000000..4974d8ca96fd
--- /dev/null
+++ b/platforms/android/aar-template/OpenCV/src/main/cpp/include/temp.h
@@ -0,0 +1 @@
+// This empty .h file is used for creating an AAR with empty C++ lib that will be replaced with OpenCV C++ lib
\ No newline at end of file
diff --git a/platforms/android/aar-template/OpenCV/src/main/cpp/native-lib.cpp b/platforms/android/aar-template/OpenCV/src/main/cpp/native-lib.cpp
new file mode 100644
index 000000000000..73ac04eb7abd
--- /dev/null
+++ b/platforms/android/aar-template/OpenCV/src/main/cpp/native-lib.cpp
@@ -0,0 +1 @@
+// This empty .cpp file is used for creating an AAR with empty C++ lib that will be replaced with OpenCV C++ lib
\ No newline at end of file
diff --git a/platforms/android/aar-template/README.md b/platforms/android/aar-template/README.md
new file mode 100644
index 000000000000..a851364dd493
--- /dev/null
+++ b/platforms/android/aar-template/README.md
@@ -0,0 +1,29 @@
+## Scripts for creating an AAR package and a local Maven repository with OpenCV libraries for Android
+
+### How to run the scripts
+1. Set JAVA_HOME and ANDROID_HOME enviroment variables. For example:
+```
+export JAVA_HOME=~/Android Studio/jbr
+export ANDROID_HOME=~/Android/SDK
+```
+2. Download OpenCV SDK for Android
+3. Run build script for version with Java and a shared C++ library:
+```
+python build_java_shared_aar.py "~/opencv-4.7.0-android-sdk/OpenCV-android-sdk"
+```
+4. Run build script for version with static C++ libraries:
+```
+python build_static_aar.py "~/opencv-4.7.0-android-sdk/OpenCV-android-sdk"
+```
+The AAR libraries and the local Maven repository will be created in the **outputs** directory
+### Technical details
+The scripts consist of 5 steps:
+1. Preparing Android AAR library project template
+2. Adding Java code to the project. Adding C++ public headers for shared version to the project.
+3. Compiling the project to build an AAR package
+4. Adding C++ binary libraries to the AAR package. Adding C++ public headers for static version to the AAR package.
+5. Creating Maven repository with the AAR package
+
+There are a few minor limitations:
+1. Due to the AAR design the Java + shared C++ AAR package contains duplicates of C++ binary libraries, but the final user's Android application contains only one library instance.
+2. The compile definitions from cmake configs are skipped, but it shouldn't affect the library because the script uses precompiled C++ binaries from SDK.
diff --git a/platforms/android/aar-template/build.gradle b/platforms/android/aar-template/build.gradle
new file mode 100644
index 000000000000..280dab5da7a8
--- /dev/null
+++ b/platforms/android/aar-template/build.gradle
@@ -0,0 +1,5 @@
+// Top-level build file where you can add configuration options common to all sub-projects/modules.
+plugins {
+    id 'com.android.library' version '7.3.0' apply false
+    id 'org.jetbrains.kotlin.android' version '1.8.20' apply false
+}
diff --git a/platforms/android/aar-template/gradle.properties b/platforms/android/aar-template/gradle.properties
new file mode 100644
index 000000000000..3e927b11efbf
--- /dev/null
+++ b/platforms/android/aar-template/gradle.properties
@@ -0,0 +1,21 @@
+# Project-wide Gradle settings.
+# IDE (e.g. Android Studio) users:
+# Gradle settings configured through the IDE *will override*
+# any settings specified in this file.
+# For more details on how to configure your build environment visit
+# http://www.gradle.org/docs/current/userguide/build_environment.html
+# Specifies the JVM arguments used for the daemon process.
+# The setting is particularly useful for tweaking memory settings.
+org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
+# When configured, Gradle will run in incubating parallel mode.
+# This option should only be used with decoupled projects. More details, visit
+# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
+# org.gradle.parallel=true
+# AndroidX package structure to make it clearer which packages are bundled with the
+# Android operating system, and which are packaged with your app's APK
+# https://developer.android.com/topic/libraries/support-library/androidx-rn
+android.useAndroidX=true
+# Enables namespacing of each library's R class so that its R class includes only the
+# resources declared in the library itself and none from the library's dependencies,
+# thereby reducing the size of the R class for that library
+android.nonTransitiveRClass=true
\ No newline at end of file
diff --git a/platforms/android/aar-template/gradle/wrapper/gradle-wrapper.jar b/platforms/android/aar-template/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 000000000000..e708b1c023ec
Binary files /dev/null and b/platforms/android/aar-template/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/platforms/android/aar-template/gradle/wrapper/gradle-wrapper.properties b/platforms/android/aar-template/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 000000000000..c4f47b9a2e08
--- /dev/null
+++ b/platforms/android/aar-template/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,6 @@
+#Mon Jul 10 11:57:38 SGT 2023
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-7.6.3-bin.zip
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
diff --git a/platforms/android/aar-template/gradlew b/platforms/android/aar-template/gradlew
new file mode 100755
index 000000000000..4f906e0c811f
--- /dev/null
+++ b/platforms/android/aar-template/gradlew
@@ -0,0 +1,185 @@
+#!/usr/bin/env sh
+
+#
+# Copyright 2015 the original author or authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+##############################################################################
+##
+##  Gradle start up script for UN*X
+##
+##############################################################################
+
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
+
+APP_NAME="Gradle"
+APP_BASE_NAME=`basename "$0"`
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD="maximum"
+
+warn () {
+    echo "$*"
+}
+
+die () {
+    echo
+    echo "$*"
+    echo
+    exit 1
+}
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+nonstop=false
+case "`uname`" in
+  CYGWIN* )
+    cygwin=true
+    ;;
+  Darwin* )
+    darwin=true
+    ;;
+  MINGW* )
+    msys=true
+    ;;
+  NONSTOP* )
+    nonstop=true
+    ;;
+esac
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+        # IBM's JDK on AIX uses strange locations for the executables
+        JAVACMD="$JAVA_HOME/jre/sh/java"
+    else
+        JAVACMD="$JAVA_HOME/bin/java"
+    fi
+    if [ ! -x "$JAVACMD" ] ; then
+        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+    fi
+else
+    JAVACMD="java"
+    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
+    MAX_FD_LIMIT=`ulimit -H -n`
+    if [ $? -eq 0 ] ; then
+        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
+            MAX_FD="$MAX_FD_LIMIT"
+        fi
+        ulimit -n $MAX_FD
+        if [ $? -ne 0 ] ; then
+            warn "Could not set maximum file descriptor limit: $MAX_FD"
+        fi
+    else
+        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
+    fi
+fi
+
+# For Darwin, add options to specify how the application appears in the dock
+if $darwin; then
+    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
+fi
+
+# For Cygwin or MSYS, switch paths to Windows format before running java
+if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
+    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
+    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+
+    JAVACMD=`cygpath --unix "$JAVACMD"`
+
+    # We build the pattern for arguments to be converted via cygpath
+    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
+    SEP=""
+    for dir in $ROOTDIRSRAW ; do
+        ROOTDIRS="$ROOTDIRS$SEP$dir"
+        SEP="|"
+    done
+    OURCYGPATTERN="(^($ROOTDIRS))"
+    # Add a user-defined pattern to the cygpath arguments
+    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
+        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
+    fi
+    # Now convert the arguments - kludge to limit ourselves to /bin/sh
+    i=0
+    for arg in "$@" ; do
+        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
+        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
+
+        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
+            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
+        else
+            eval `echo args$i`="\"$arg\""
+        fi
+        i=`expr $i + 1`
+    done
+    case $i in
+        0) set -- ;;
+        1) set -- "$args0" ;;
+        2) set -- "$args0" "$args1" ;;
+        3) set -- "$args0" "$args1" "$args2" ;;
+        4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+    esac
+fi
+
+# Escape application args
+save () {
+    for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
+    echo " "
+}
+APP_ARGS=`save "$@"`
+
+# Collect all arguments for the java command, following the shell quoting and substitution rules
+eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
+
+exec "$JAVACMD" "$@"
diff --git a/platforms/android/aar-template/gradlew.bat b/platforms/android/aar-template/gradlew.bat
new file mode 100644
index 000000000000..107acd32c4e6
--- /dev/null
+++ b/platforms/android/aar-template/gradlew.bat
@@ -0,0 +1,89 @@
+@rem
+@rem Copyright 2015 the original author or authors.
+@rem
+@rem Licensed under the Apache License, Version 2.0 (the "License");
+@rem you may not use this file except in compliance with the License.
+@rem You may obtain a copy of the License at
+@rem
+@rem      https://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing, software
+@rem distributed under the License is distributed on an "AS IS" BASIS,
+@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@rem See the License for the specific language governing permissions and
+@rem limitations under the License.
+@rem
+
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem  Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Resolve any "." and ".." in APP_HOME to make it shorter.
+for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto execute
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto execute
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/platforms/android/aar-template/settings.gradle b/platforms/android/aar-template/settings.gradle
new file mode 100644
index 000000000000..fb1a50602c6f
--- /dev/null
+++ b/platforms/android/aar-template/settings.gradle
@@ -0,0 +1,16 @@
+pluginManagement {
+    repositories {
+        google()
+        mavenCentral()
+        gradlePluginPortal()
+    }
+}
+dependencyResolutionManagement {
+    repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
+    repositories {
+        google()
+        mavenCentral()
+    }
+}
+rootProject.name = "OpenCV"
+include ':OpenCV'
diff --git a/platforms/android/android.toolchain.cmake b/platforms/android/android.toolchain.cmake
deleted file mode 100644
index 69c5598a97c3..000000000000
--- a/platforms/android/android.toolchain.cmake
+++ /dev/null
@@ -1,1795 +0,0 @@
-# Copyright (c) 2010-2011, Ethan Rublee
-# Copyright (c) 2011-2014, Andrey Kamaev
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1.  Redistributions of source code must retain the above copyright notice,
-#     this list of conditions and the following disclaimer.
-#
-# 2.  Redistributions in binary form must reproduce the above copyright notice,
-#     this list of conditions and the following disclaimer in the documentation
-#     and/or other materials provided with the distribution.
-#
-# 3.  Neither the name of the copyright holder nor the names of its
-#     contributors may be used to endorse or promote products derived from this
-#     software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-
-# ------------------------------------------------------------------------------
-#  Android CMake toolchain file, for use with the Android NDK r5-r10d
-#  Requires cmake 2.6.3 or newer (2.8.9 or newer is recommended).
-#  See home page: https://github.com/taka-no-me/android-cmake
-#
-#  Usage Linux:
-#   $ export ANDROID_NDK=/absolute/path/to/the/android-ndk
-#   $ mkdir build && cd build
-#   $ cmake -DCMAKE_TOOLCHAIN_FILE=path/to/the/android.toolchain.cmake ..
-#   $ make -j8
-#
-#  Usage Windows:
-#     You need native port of make to build your project.
-#     Android NDK r7 (and newer) already has make.exe on board.
-#     For older NDK you have to install it separately.
-#     For example, this one: http://gnuwin32.sourceforge.net/packages/make.htm
-#
-#   $ SET ANDROID_NDK=C:\absolute\path\to\the\android-ndk
-#   $ mkdir build && cd build
-#   $ cmake.exe -G"MinGW Makefiles"
-#       -DCMAKE_TOOLCHAIN_FILE=path\to\the\android.toolchain.cmake
-#       -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%\prebuilt\windows\bin\make.exe" ..
-#   $ cmake.exe --build .
-#
-#
-#  Options (can be set as cmake parameters: -D<option_name>=<value>):
-#    ANDROID_NDK=/opt/android-ndk - path to the NDK root.
-#      Can be set as environment variable. Can be set only at first cmake run.
-#
-#    ANDROID_ABI=armeabi-v7a - specifies the target Application Binary
-#      Interface (ABI). This option nearly matches to the APP_ABI variable
-#      used by ndk-build tool from Android NDK.
-#
-#      Possible targets are:
-#        "armeabi" - ARMv5TE based CPU with software floating point operations
-#        "armeabi-v7a" - ARMv7 based devices with hardware FPU instructions
-#            this ABI target is used by default
-#        "armeabi-v7a-hard with NEON" - ARMv7 based devices with hardware FPU instructions and hardfp
-#        "armeabi-v7a with NEON" - same as armeabi-v7a, but
-#            sets NEON as floating-point unit
-#        "armeabi-v7a with VFPV3" - same as armeabi-v7a, but
-#            sets VFPV3 as floating-point unit (has 32 registers instead of 16)
-#        "armeabi-v6 with VFP" - tuned for ARMv6 processors having VFP
-#        "x86" - IA-32 instruction set
-#        "mips" - MIPS32 instruction set
-#
-#      64-bit ABIs for NDK r10 and newer:
-#        "arm64-v8a" - ARMv8 AArch64 instruction set
-#        "x86_64" - Intel64 instruction set (r1)
-#        "mips64" - MIPS64 instruction set (r6)
-#
-#    ANDROID_NATIVE_API_LEVEL=android-8 - level of Android API compile for.
-#      Option is read-only when standalone toolchain is used.
-#      Note: building for "android-L" requires explicit configuration.
-#
-#    ANDROID_TOOLCHAIN_NAME=arm-linux-androideabi-4.9 - the name of compiler
-#      toolchain to be used. The list of possible values depends on the NDK
-#      version. For NDK r10c the possible values are:
-#
-#        * aarch64-linux-android-4.9
-#        * aarch64-linux-android-clang3.4
-#        * aarch64-linux-android-clang3.5
-#        * arm-linux-androideabi-4.6
-#        * arm-linux-androideabi-4.8
-#        * arm-linux-androideabi-4.9 (default)
-#        * arm-linux-androideabi-clang3.4
-#        * arm-linux-androideabi-clang3.5
-#        * mips64el-linux-android-4.9
-#        * mips64el-linux-android-clang3.4
-#        * mips64el-linux-android-clang3.5
-#        * mipsel-linux-android-4.6
-#        * mipsel-linux-android-4.8
-#        * mipsel-linux-android-4.9
-#        * mipsel-linux-android-clang3.4
-#        * mipsel-linux-android-clang3.5
-#        * x86-4.6
-#        * x86-4.8
-#        * x86-4.9
-#        * x86-clang3.4
-#        * x86-clang3.5
-#        * x86_64-4.9
-#        * x86_64-clang3.4
-#        * x86_64-clang3.5
-#
-#    ANDROID_FORCE_ARM_BUILD=OFF - set ON to generate 32-bit ARM instructions
-#      instead of Thumb. Is not available for "armeabi-v6 with VFP"
-#      (is forced to be ON) ABI.
-#
-#    ANDROID_NO_UNDEFINED=ON - set ON to show all undefined symbols as linker
-#      errors even if they are not used.
-#
-#    ANDROID_SO_UNDEFINED=OFF - set ON to allow undefined symbols in shared
-#      libraries. Automatically turned for NDK r5x and r6x due to GLESv2
-#      problems.
-#
-#    ANDROID_STL=gnustl_static - specify the runtime to use.
-#
-#      Possible values are:
-#        none           -> Do not configure the runtime.
-#        system         -> Use the default minimal system C++ runtime library.
-#                          Implies -fno-rtti -fno-exceptions.
-#                          Is not available for standalone toolchain.
-#        system_re      -> Use the default minimal system C++ runtime library.
-#                          Implies -frtti -fexceptions.
-#                          Is not available for standalone toolchain.
-#        gabi++_static  -> Use the GAbi++ runtime as a static library.
-#                          Implies -frtti -fno-exceptions.
-#                          Available for NDK r7 and newer.
-#                          Is not available for standalone toolchain.
-#        gabi++_shared  -> Use the GAbi++ runtime as a shared library.
-#                          Implies -frtti -fno-exceptions.
-#                          Available for NDK r7 and newer.
-#                          Is not available for standalone toolchain.
-#        stlport_static -> Use the STLport runtime as a static library.
-#                          Implies -fno-rtti -fno-exceptions for NDK before r7.
-#                          Implies -frtti -fno-exceptions for NDK r7 and newer.
-#                          Is not available for standalone toolchain.
-#        stlport_shared -> Use the STLport runtime as a shared library.
-#                          Implies -fno-rtti -fno-exceptions for NDK before r7.
-#                          Implies -frtti -fno-exceptions for NDK r7 and newer.
-#                          Is not available for standalone toolchain.
-#        gnustl_static  -> Use the GNU STL as a static library.
-#                          Implies -frtti -fexceptions.
-#        gnustl_shared  -> Use the GNU STL as a shared library.
-#                          Implies -frtti -fno-exceptions.
-#                          Available for NDK r7b and newer.
-#                          Silently degrades to gnustl_static if not available.
-#        c++_static     -> Use the LLVM libc++ runtime as a static library.
-#                          Implies -frtti -fexceptions.
-#        c++_shared     -> Use the LLVM libc++ runtime as a shared library.
-#                          Implies -frtti -fno-exceptions.
-#
-#    ANDROID_STL_FORCE_FEATURES=ON - turn rtti and exceptions support based on
-#      chosen runtime. If disabled, then the user is responsible for settings
-#      these options.
-#
-#  What?:
-#    android-cmake toolchain searches for NDK/toolchain in the following order:
-#      ANDROID_NDK - cmake parameter
-#      ANDROID_NDK - environment variable
-#      ANDROID_STANDALONE_TOOLCHAIN - cmake parameter
-#      ANDROID_STANDALONE_TOOLCHAIN - environment variable
-#      ANDROID_NDK - default locations
-#      ANDROID_STANDALONE_TOOLCHAIN - default locations
-#
-#    Make sure to do the following in your scripts:
-#      SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${my_cxx_flags}" )
-#      SET( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${my_cxx_flags}" )
-#    The flags will be prepopulated with critical flags, so don't loose them.
-#    Also be aware that toolchain also sets configuration-specific compiler
-#    flags and linker flags.
-#
-#    ANDROID and BUILD_ANDROID will be set to true, you may test any of these
-#    variables to make necessary Android-specific configuration changes.
-#
-#    Also ARMEABI or ARMEABI_V7A or ARMEABI_V7A_HARD or X86 or MIPS or ARM64_V8A or X86_64 or MIPS64
-#    will be set true, mutually exclusive. NEON option will be set true
-#    if VFP is set to NEON.
-#
-# ------------------------------------------------------------------------------
-
-cmake_minimum_required( VERSION 2.8.12.2 )
-
-if( DEFINED CMAKE_CROSSCOMPILING )
- # subsequent toolchain loading is not really needed
- return()
-endif()
-
-if( CMAKE_TOOLCHAIN_FILE )
- # touch toolchain variable to suppress "unused variable" warning
-endif()
-
-# inherit settings in recursive loads
-get_property( _CMAKE_IN_TRY_COMPILE GLOBAL PROPERTY IN_TRY_COMPILE )
-if( _CMAKE_IN_TRY_COMPILE )
- include( "${CMAKE_CURRENT_SOURCE_DIR}/../android.toolchain.config.cmake" OPTIONAL )
-endif()
-
-# this one is important
-if( CMAKE_VERSION VERSION_GREATER "3.0.99" )
- set( CMAKE_SYSTEM_NAME Android )
-else()
- set( CMAKE_SYSTEM_NAME Linux )
-endif()
-
-# this one not so much
-set( CMAKE_SYSTEM_VERSION 1 )
-
-# rpath makes low sense for Android
-set( CMAKE_SHARED_LIBRARY_RUNTIME_C_FLAG "" )
-set( CMAKE_SKIP_RPATH TRUE CACHE BOOL "If set, runtime paths are not added when using shared libraries." )
-
-# NDK search paths
-set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r10d -r10c -r10b -r10 -r9d -r9c -r9b -r9 -r8e -r8d -r8c -r8b -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" )
-if( NOT DEFINED ANDROID_NDK_SEARCH_PATHS )
- if( CMAKE_HOST_WIN32 )
-  file( TO_CMAKE_PATH "$ENV{PROGRAMFILES}" ANDROID_NDK_SEARCH_PATHS )
-  set( ANDROID_NDK_SEARCH_PATHS "${ANDROID_NDK_SEARCH_PATHS}" "$ENV{SystemDrive}/NVPACK" )
- else()
-  file( TO_CMAKE_PATH "$ENV{HOME}" ANDROID_NDK_SEARCH_PATHS )
-  set( ANDROID_NDK_SEARCH_PATHS /opt "${ANDROID_NDK_SEARCH_PATHS}/NVPACK" )
- endif()
-endif()
-if( NOT DEFINED ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH )
- set( ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH /opt/android-toolchain )
-endif()
-
-# known ABIs
-set( ANDROID_SUPPORTED_ABIS_arm "armeabi-v7a;armeabi;armeabi-v7a with NEON;armeabi-v7a-hard with NEON;armeabi-v7a with VFPV3;armeabi-v6 with VFP" )
-set( ANDROID_SUPPORTED_ABIS_arm64 "arm64-v8a" )
-set( ANDROID_SUPPORTED_ABIS_x86 "x86" )
-set( ANDROID_SUPPORTED_ABIS_x86_64 "x86_64" )
-set( ANDROID_SUPPORTED_ABIS_mips "mips" )
-set( ANDROID_SUPPORTED_ABIS_mips64 "mips64" )
-
-# API level defaults
-set( ANDROID_DEFAULT_NDK_API_LEVEL 9 )
-set( ANDROID_DEFAULT_NDK_API_LEVEL_arm64 21 )
-set( ANDROID_DEFAULT_NDK_API_LEVEL_x86 9 )
-set( ANDROID_DEFAULT_NDK_API_LEVEL_x86_64 21 )
-set( ANDROID_DEFAULT_NDK_API_LEVEL_mips 9 )
-set( ANDROID_DEFAULT_NDK_API_LEVEL_mips64 21 )
-
-
-macro( __LIST_FILTER listvar regex )
-  if( ${listvar} )
-    foreach( __val ${${listvar}} )
-      if( __val MATCHES "${regex}" )
-        list( REMOVE_ITEM ${listvar} "${__val}" )
-      endif()
-    endforeach()
-  endif()
-endmacro()
-
-macro( __INIT_VARIABLE var_name )
-  set( __test_path 0 )
-  foreach( __var ${ARGN} )
-    if( __var STREQUAL "PATH" )
-      set( __test_path 1 )
-      break()
-    endif()
-  endforeach()
-
-  if( __test_path AND NOT EXISTS "${${var_name}}" )
-    unset( ${var_name} CACHE )
-  endif()
-
-  if( " ${${var_name}}" STREQUAL " " )
-    set( __values 0 )
-    foreach( __var ${ARGN} )
-      if( __var STREQUAL "VALUES" )
-        set( __values 1 )
-      elseif( NOT __var STREQUAL "PATH" )
-        if( __var MATCHES "^ENV_.*$" )
-          string( REPLACE "ENV_" "" __var "${__var}" )
-          set( __value "$ENV{${__var}}" )
-        elseif( DEFINED ${__var} )
-          set( __value "${${__var}}" )
-        elseif( __values )
-          set( __value "${__var}" )
-        else()
-          set( __value "" )
-        endif()
-
-        if( NOT " ${__value}" STREQUAL " " AND (NOT __test_path OR EXISTS "${__value}") )
-          set( ${var_name} "${__value}" )
-          break()
-        endif()
-      endif()
-    endforeach()
-    unset( __value )
-    unset( __values )
-  endif()
-
-  if( __test_path )
-    file( TO_CMAKE_PATH "${${var_name}}" ${var_name} )
-  endif()
-  unset( __test_path )
-endmacro()
-
-macro( __DETECT_NATIVE_API_LEVEL _var _path )
-  set( __ndkApiLevelRegex "^[\t ]*#define[\t ]+__ANDROID_API__[\t ]+([0-9]+)[\t ]*.*$" )
-  file( STRINGS ${_path} __apiFileContent REGEX "${__ndkApiLevelRegex}" )
-  if( NOT __apiFileContent )
-    set( __ndkApiLevelRegex "^[\t ]*#define[\t ]+__ANDROID_API__[\t ]+__ANDROID_API_FUTURE__[\t ]*$" )
-    file( STRINGS ${_path} __apiFileContent REGEX "${__ndkApiLevelRegex}" )
-    if( __apiFileContent )
-      set(${_var} 10000)
-    else()
-      message( SEND_ERROR "Could not get Android native API level. Probably you have specified invalid level value, or your copy of NDK/toolchain is broken." )
-    endif()
-  else()
-    string( REGEX REPLACE "${__ndkApiLevelRegex}" "\\1" ${_var} "${__apiFileContent}" )
-  endif()
-  unset( __apiFileContent )
-  unset( __ndkApiLevelRegex )
-endmacro()
-
-macro( __DETECT_TOOLCHAIN_MACHINE_NAME _var _root )
- if( EXISTS "${_root}" )
-    file( GLOB __gccExePath RELATIVE "${_root}/bin/" "${_root}/bin/*-gcc${TOOL_OS_SUFFIX}" )
-    __LIST_FILTER( __gccExePath "^[.].*" )
-    list( LENGTH __gccExePath __gccExePathsCount )
-    if( NOT __gccExePathsCount EQUAL 1  AND NOT _CMAKE_IN_TRY_COMPILE )
-      message( WARNING "Could not determine machine name for compiler from ${_root}" )
-      set( ${_var} "" )
-    else()
-      get_filename_component( __gccExeName "${__gccExePath}" NAME_WE )
-      string( REPLACE "-gcc" "" ${_var} "${__gccExeName}" )
-    endif()
-    unset( __gccExePath )
-    unset( __gccExePathsCount )
-    unset( __gccExeName )
-  else()
-    set( ${_var} "" )
-  endif()
-endmacro()
-
-
-# fight against cygwin
-set( ANDROID_FORBID_SYGWIN TRUE CACHE BOOL "Prevent cmake from working under cygwin and using cygwin tools")
-mark_as_advanced( ANDROID_FORBID_SYGWIN )
-if( ANDROID_FORBID_SYGWIN )
- if( CYGWIN )
-  message( FATAL_ERROR "Android NDK and android-cmake toolchain are not welcome Cygwin. It is unlikely that this cmake toolchain will work under cygwin. But if you want to try then you can set cmake variable ANDROID_FORBID_SYGWIN to FALSE and rerun cmake." )
- endif()
-
- if( CMAKE_HOST_WIN32 )
-  # remove cygwin from PATH
-  set( __new_path "$ENV{PATH}")
-  __LIST_FILTER( __new_path "cygwin" )
-  set(ENV{PATH} "${__new_path}")
-  unset(__new_path)
- endif()
-endif()
-
-
-# detect current host platform
-if( NOT DEFINED ANDROID_NDK_HOST_X64 AND (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64" OR CMAKE_HOST_APPLE) )
- set( ANDROID_NDK_HOST_X64 1 CACHE BOOL "Try to use 64-bit compiler toolchain" )
- mark_as_advanced( ANDROID_NDK_HOST_X64 )
-endif()
-
-set( TOOL_OS_SUFFIX "" )
-if( CMAKE_HOST_APPLE )
- set( ANDROID_NDK_HOST_SYSTEM_NAME "darwin-x86_64" )
- set( ANDROID_NDK_HOST_SYSTEM_NAME2 "darwin-x86" )
-elseif( CMAKE_HOST_WIN32 )
- set( ANDROID_NDK_HOST_SYSTEM_NAME "windows-x86_64" )
- set( ANDROID_NDK_HOST_SYSTEM_NAME2 "windows" )
- set( TOOL_OS_SUFFIX ".exe" )
-elseif( CMAKE_HOST_UNIX )
- set( ANDROID_NDK_HOST_SYSTEM_NAME "linux-x86_64" )
- set( ANDROID_NDK_HOST_SYSTEM_NAME2 "linux-x86" )
-else()
- message( FATAL_ERROR "Cross-compilation on your platform is not supported by this cmake toolchain" )
-endif()
-
-if( NOT ANDROID_NDK_HOST_X64 )
- set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
-endif()
-
-# see if we have path to Android NDK
-if( NOT ANDROID_NDK AND NOT ANDROID_STANDALONE_TOOLCHAIN )
-  __INIT_VARIABLE( ANDROID_NDK PATH ENV_ANDROID_NDK )
-endif()
-if( NOT ANDROID_NDK )
- # see if we have path to Android standalone toolchain
- __INIT_VARIABLE( ANDROID_STANDALONE_TOOLCHAIN PATH ENV_ANDROID_STANDALONE_TOOLCHAIN )
-
- if( NOT ANDROID_STANDALONE_TOOLCHAIN )
-  #try to find Android NDK in one of the default locations
-  set( __ndkSearchPaths )
-  foreach( __ndkSearchPath ${ANDROID_NDK_SEARCH_PATHS} )
-   foreach( suffix ${ANDROID_SUPPORTED_NDK_VERSIONS} )
-    list( APPEND __ndkSearchPaths "${__ndkSearchPath}/android-ndk${suffix}" )
-   endforeach()
-  endforeach()
-  __INIT_VARIABLE( ANDROID_NDK PATH VALUES ${__ndkSearchPaths} )
-  unset( __ndkSearchPaths )
-
-  if( ANDROID_NDK )
-   message( STATUS "Using default path for Android NDK: ${ANDROID_NDK}" )
-   message( STATUS "  If you prefer to use a different location, please define a cmake or environment variable: ANDROID_NDK" )
-  else()
-   #try to find Android standalone toolchain in one of the default locations
-   __INIT_VARIABLE( ANDROID_STANDALONE_TOOLCHAIN PATH ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH )
-
-   if( ANDROID_STANDALONE_TOOLCHAIN )
-    message( STATUS "Using default path for standalone toolchain ${ANDROID_STANDALONE_TOOLCHAIN}" )
-    message( STATUS "  If you prefer to use a different location, please define the variable: ANDROID_STANDALONE_TOOLCHAIN" )
-   endif( ANDROID_STANDALONE_TOOLCHAIN )
-  endif( ANDROID_NDK )
- endif( NOT ANDROID_STANDALONE_TOOLCHAIN )
-endif( NOT ANDROID_NDK )
-
-# remember found paths
-if( ANDROID_NDK )
- get_filename_component( ANDROID_NDK "${ANDROID_NDK}" ABSOLUTE )
- set( ANDROID_NDK "${ANDROID_NDK}" CACHE INTERNAL "Path of the Android NDK" FORCE )
- set( BUILD_WITH_ANDROID_NDK True )
- if( EXISTS "${ANDROID_NDK}/RELEASE.TXT" )
-  file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE_FULL LIMIT_COUNT 1 REGEX "r[0-9]+[a-z]?" )
-  string( REGEX MATCH "r([0-9]+)([a-z]?)" ANDROID_NDK_RELEASE "${ANDROID_NDK_RELEASE_FULL}" )
- else()
-  set( ANDROID_NDK_RELEASE "r1x" )
-  set( ANDROID_NDK_RELEASE_FULL "unreleased" )
- endif()
- string( REGEX REPLACE "r([0-9]+)([a-z]?)" "\\1*1000" ANDROID_NDK_RELEASE_NUM "${ANDROID_NDK_RELEASE}" )
- string( FIND " abcdefghijklmnopqastuvwxyz" "${CMAKE_MATCH_2}" __ndkReleaseLetterNum )
- math( EXPR ANDROID_NDK_RELEASE_NUM "${ANDROID_NDK_RELEASE_NUM}+${__ndkReleaseLetterNum}" )
-elseif( ANDROID_STANDALONE_TOOLCHAIN )
- get_filename_component( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" ABSOLUTE )
- # try to detect change
- if( CMAKE_AR )
-  string( LENGTH "${ANDROID_STANDALONE_TOOLCHAIN}" __length )
-  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidStandaloneToolchainPreviousPath )
-  if( NOT __androidStandaloneToolchainPreviousPath STREQUAL ANDROID_STANDALONE_TOOLCHAIN )
-   message( FATAL_ERROR "It is not possible to change path to the Android standalone toolchain on subsequent run." )
-  endif()
-  unset( __androidStandaloneToolchainPreviousPath )
-  unset( __length )
- endif()
- set( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" CACHE INTERNAL "Path of the Android standalone toolchain" FORCE )
- set( BUILD_WITH_STANDALONE_TOOLCHAIN True )
-else()
- list(GET ANDROID_NDK_SEARCH_PATHS 0 ANDROID_NDK_SEARCH_PATH)
- message( FATAL_ERROR "Could not find neither Android NDK nor Android standalone toolchain.
-    You should either set an environment variable:
-      export ANDROID_NDK=~/my-android-ndk
-    or
-      export ANDROID_STANDALONE_TOOLCHAIN=~/my-android-toolchain
-    or put the toolchain or NDK in the default path:
-      sudo ln -s ~/my-android-ndk ${ANDROID_NDK_SEARCH_PATH}/android-ndk
-      sudo ln -s ~/my-android-toolchain ${ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH}" )
-endif()
-
-# android NDK layout
-if( BUILD_WITH_ANDROID_NDK )
- if( NOT DEFINED ANDROID_NDK_LAYOUT )
-  # try to automatically detect the layout
-  if( EXISTS "${ANDROID_NDK}/RELEASE.TXT")
-   set( ANDROID_NDK_LAYOUT "RELEASE" )
-  elseif( EXISTS "${ANDROID_NDK}/../../linux-x86/toolchain/" )
-   set( ANDROID_NDK_LAYOUT "LINARO" )
-  elseif( EXISTS "${ANDROID_NDK}/../../gcc/" )
-   set( ANDROID_NDK_LAYOUT "ANDROID" )
-  endif()
- endif()
- set( ANDROID_NDK_LAYOUT "${ANDROID_NDK_LAYOUT}" CACHE STRING "The inner layout of NDK" )
- mark_as_advanced( ANDROID_NDK_LAYOUT )
- if( ANDROID_NDK_LAYOUT STREQUAL "LINARO" )
-  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} ) # only 32-bit at the moment
-  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../${ANDROID_NDK_HOST_SYSTEM_NAME}/toolchain" )
-  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "" )
-  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "" )
- elseif( ANDROID_NDK_LAYOUT STREQUAL "ANDROID" )
-  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} ) # only 32-bit at the moment
-  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../gcc/${ANDROID_NDK_HOST_SYSTEM_NAME}/arm" )
-  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "" )
-  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "" )
- else() # ANDROID_NDK_LAYOUT STREQUAL "RELEASE"
-  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/toolchains" )
-  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
-  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME2}" )
- endif()
- get_filename_component( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK_TOOLCHAINS_PATH}" ABSOLUTE )
-
- # try to detect change of NDK
- if( CMAKE_AR )
-  string( LENGTH "${ANDROID_NDK_TOOLCHAINS_PATH}" __length )
-  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath )
-  if( NOT __androidNdkPreviousPath STREQUAL ANDROID_NDK_TOOLCHAINS_PATH )
-   message( FATAL_ERROR "It is not possible to change the path to the NDK on subsequent CMake run. You must remove all generated files from your build folder first.
-   " )
-  endif()
-  unset( __androidNdkPreviousPath )
-  unset( __length )
- endif()
-endif()
-
-
-# get all the details about standalone toolchain
-if( BUILD_WITH_STANDALONE_TOOLCHAIN )
- __DETECT_NATIVE_API_LEVEL( ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" )
- set( ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
- set( __availableToolchains "standalone" )
- __DETECT_TOOLCHAIN_MACHINE_NAME( __availableToolchainMachines "${ANDROID_STANDALONE_TOOLCHAIN}" )
- if( NOT __availableToolchainMachines )
-  message( FATAL_ERROR "Could not determine machine name of your toolchain. Probably your Android standalone toolchain is broken." )
- endif()
- if( __availableToolchainMachines MATCHES x86_64 )
-  set( __availableToolchainArchs "x86_64" )
- elseif( __availableToolchainMachines MATCHES i686 )
-  set( __availableToolchainArchs "x86" )
- elseif( __availableToolchainMachines MATCHES aarch64 )
-  set( __availableToolchainArchs "arm64" )
- elseif( __availableToolchainMachines MATCHES arm )
-  set( __availableToolchainArchs "arm" )
- elseif( __availableToolchainMachines MATCHES mips64el )
-  set( __availableToolchainArchs "mips64" )
- elseif( __availableToolchainMachines MATCHES mipsel )
-  set( __availableToolchainArchs "mips" )
- endif()
- execute_process( COMMAND "${ANDROID_STANDALONE_TOOLCHAIN}/bin/${__availableToolchainMachines}-gcc${TOOL_OS_SUFFIX}" -dumpversion
-                  OUTPUT_VARIABLE __availableToolchainCompilerVersions OUTPUT_STRIP_TRAILING_WHITESPACE )
- string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9]+)?" __availableToolchainCompilerVersions "${__availableToolchainCompilerVersions}" )
- if( EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/bin/clang${TOOL_OS_SUFFIX}" )
-  list( APPEND __availableToolchains "standalone-clang" )
-  list( APPEND __availableToolchainMachines ${__availableToolchainMachines} )
-  list( APPEND __availableToolchainArchs ${__availableToolchainArchs} )
-  list( APPEND __availableToolchainCompilerVersions ${__availableToolchainCompilerVersions} )
- endif()
-endif()
-
-macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __toolchain_subpath )
- foreach( __toolchain ${${__availableToolchainsLst}} )
-  if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${__toolchain}${__toolchain_subpath}" )
-   SET( __toolchainVersionRegex "^TOOLCHAIN_VERSION[\t ]+:=[\t ]+(.*)$" )
-   FILE( STRINGS "${ANDROID_NDK_TOOLCHAINS_PATH}/${__toolchain}/setup.mk" __toolchainVersionStr REGEX "${__toolchainVersionRegex}" )
-   if( __toolchainVersionStr )
-    string( REGEX REPLACE "${__toolchainVersionRegex}" "\\1" __toolchainVersionStr "${__toolchainVersionStr}" )
-    string( REGEX REPLACE "-clang3[.][0-9]$" "-${__toolchainVersionStr}" __gcc_toolchain "${__toolchain}" )
-   else()
-    string( REGEX REPLACE "-clang3[.][0-9]$" "-4.6" __gcc_toolchain "${__toolchain}" )
-   endif()
-   unset( __toolchainVersionStr )
-   unset( __toolchainVersionRegex )
-  else()
-   set( __gcc_toolchain "${__toolchain}" )
-  endif()
-  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK_TOOLCHAINS_PATH}/${__gcc_toolchain}${__toolchain_subpath}" )
-  if( __machine )
-   string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9x]+)?$" __version "${__gcc_toolchain}" )
-   if( __machine MATCHES x86_64 )
-    set( __arch "x86_64" )
-   elseif( __machine MATCHES i686 )
-    set( __arch "x86" )
-   elseif( __machine MATCHES aarch64 )
-    set( __arch "arm64" )
-   elseif( __machine MATCHES arm )
-    set( __arch "arm" )
-   elseif( __machine MATCHES mips64el )
-    set( __arch "mips64" )
-   elseif( __machine MATCHES mipsel )
-    set( __arch "mips" )
-   else()
-    set( __arch "" )
-   endif()
-   #message("machine: !${__machine}!\narch: !${__arch}!\nversion: !${__version}!\ntoolchain: !${__toolchain}!\n")
-   if (__arch)
-    list( APPEND __availableToolchainMachines "${__machine}" )
-    list( APPEND __availableToolchainArchs "${__arch}" )
-    list( APPEND __availableToolchainCompilerVersions "${__version}" )
-    list( APPEND ${__availableToolchainsVar} "${__toolchain}" )
-   endif()
-  endif()
-  unset( __gcc_toolchain )
- endforeach()
-endmacro()
-
-# get all the details about NDK
-if( BUILD_WITH_ANDROID_NDK )
- file( GLOB ANDROID_SUPPORTED_NATIVE_API_LEVELS RELATIVE "${ANDROID_NDK}/platforms" "${ANDROID_NDK}/platforms/android-*" )
- string( REPLACE "android-" "" ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_SUPPORTED_NATIVE_API_LEVELS}" )
- set( __availableToolchains "" )
- set( __availableToolchainMachines "" )
- set( __availableToolchainArchs "" )
- set( __availableToolchainCompilerVersions "" )
- if( ANDROID_TOOLCHAIN_NAME AND EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_TOOLCHAIN_NAME}/" )
-  # do not go through all toolchains if we know the name
-  set( __availableToolchainsLst "${ANDROID_TOOLCHAIN_NAME}" )
-  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
-  if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 )
-   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
-   if( __availableToolchains )
-    set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
-   endif()
-  endif()
- endif()
- if( NOT __availableToolchains )
-  file( GLOB __availableToolchainsLst RELATIVE "${ANDROID_NDK_TOOLCHAINS_PATH}" "${ANDROID_NDK_TOOLCHAINS_PATH}/*" )
-  if( __availableToolchainsLst )
-   list(SORT __availableToolchainsLst) # we need clang to go after gcc
-  endif()
-  __LIST_FILTER( __availableToolchainsLst "^[.]" )
-  __LIST_FILTER( __availableToolchainsLst "llvm" )
-  __LIST_FILTER( __availableToolchainsLst "renderscript" )
-  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
-  if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 )
-   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
-   if( __availableToolchains )
-    set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
-   endif()
-  endif()
- endif()
- if( NOT __availableToolchains )
-  message( FATAL_ERROR "Could not find any working toolchain in the NDK. Probably your Android NDK is broken." )
- endif()
-endif()
-
-# build list of available ABIs
-set( ANDROID_SUPPORTED_ABIS "" )
-set( __uniqToolchainArchNames ${__availableToolchainArchs} )
-list( REMOVE_DUPLICATES __uniqToolchainArchNames )
-list( SORT __uniqToolchainArchNames )
-foreach( __arch ${__uniqToolchainArchNames} )
- list( APPEND ANDROID_SUPPORTED_ABIS ${ANDROID_SUPPORTED_ABIS_${__arch}} )
-endforeach()
-unset( __uniqToolchainArchNames )
-if( NOT ANDROID_SUPPORTED_ABIS )
- message( FATAL_ERROR "No one of known Android ABIs is supported by this cmake toolchain." )
-endif()
-
-# choose target ABI
-__INIT_VARIABLE( ANDROID_ABI VALUES ${ANDROID_SUPPORTED_ABIS} )
-# verify that target ABI is supported
-list( FIND ANDROID_SUPPORTED_ABIS "${ANDROID_ABI}" __androidAbiIdx )
-if( __androidAbiIdx EQUAL -1 )
- string( REPLACE ";" "\", \"" PRINTABLE_ANDROID_SUPPORTED_ABIS  "${ANDROID_SUPPORTED_ABIS}" )
- message( FATAL_ERROR "Specified ANDROID_ABI = \"${ANDROID_ABI}\" is not supported by this cmake toolchain or your NDK/toolchain.
-   Supported values are: \"${PRINTABLE_ANDROID_SUPPORTED_ABIS}\"
-   " )
-endif()
-unset( __androidAbiIdx )
-
-# set target ABI options
-if( ANDROID_ABI STREQUAL "x86" )
- set( X86 true )
- set( ANDROID_NDK_ABI_NAME "x86" )
- set( ANDROID_ARCH_NAME "x86" )
- set( ANDROID_LLVM_TRIPLE "i686-none-linux-android" )
- set( CMAKE_SYSTEM_PROCESSOR "i686" )
-elseif( ANDROID_ABI STREQUAL "x86_64" )
- set( X86 true )
- set( X86_64 true )
- set( ANDROID_NDK_ABI_NAME "x86_64" )
- set( ANDROID_ARCH_NAME "x86_64" )
- set( CMAKE_SYSTEM_PROCESSOR "x86_64" )
- set( ANDROID_LLVM_TRIPLE "x86_64-none-linux-android" )
-elseif( ANDROID_ABI STREQUAL "mips64" )
- set( MIPS64 true )
- set( ANDROID_NDK_ABI_NAME "mips64" )
- set( ANDROID_ARCH_NAME "mips64" )
- set( ANDROID_LLVM_TRIPLE "mips64el-none-linux-android" )
- set( CMAKE_SYSTEM_PROCESSOR "mips64" )
-elseif( ANDROID_ABI STREQUAL "mips" )
- set( MIPS true )
- set( ANDROID_NDK_ABI_NAME "mips" )
- set( ANDROID_ARCH_NAME "mips" )
- set( ANDROID_LLVM_TRIPLE "mipsel-none-linux-android" )
- set( CMAKE_SYSTEM_PROCESSOR "mips" )
-elseif( ANDROID_ABI STREQUAL "arm64-v8a" )
- set( ARM64_V8A true )
- set( ANDROID_NDK_ABI_NAME "arm64-v8a" )
- set( ANDROID_ARCH_NAME "arm64" )
- set( ANDROID_LLVM_TRIPLE "aarch64-none-linux-android" )
- set( CMAKE_SYSTEM_PROCESSOR "aarch64" )
- set( VFPV3 true )
- set( NEON true )
-elseif( ANDROID_ABI STREQUAL "armeabi" )
- set( ARMEABI true )
- set( ANDROID_NDK_ABI_NAME "armeabi" )
- set( ANDROID_ARCH_NAME "arm" )
- set( ANDROID_LLVM_TRIPLE "armv5te-none-linux-androideabi" )
- set( CMAKE_SYSTEM_PROCESSOR "armv5te" )
-elseif( ANDROID_ABI STREQUAL "armeabi-v6 with VFP" )
- set( ARMEABI_V6 true )
- set( ANDROID_NDK_ABI_NAME "armeabi" )
- set( ANDROID_ARCH_NAME "arm" )
- set( ANDROID_LLVM_TRIPLE "armv5te-none-linux-androideabi" )
- set( CMAKE_SYSTEM_PROCESSOR "armv6" )
- # need always fallback to older platform
- set( ARMEABI true )
-elseif( ANDROID_ABI STREQUAL "armeabi-v7a")
- set( ARMEABI_V7A true )
- set( ANDROID_NDK_ABI_NAME "armeabi-v7a" )
- set( ANDROID_ARCH_NAME "arm" )
- set( ANDROID_LLVM_TRIPLE "armv7-none-linux-androideabi" )
- set( CMAKE_SYSTEM_PROCESSOR "armv7-a" )
-elseif( ANDROID_ABI STREQUAL "armeabi-v7a with VFPV3" )
- set( ARMEABI_V7A true )
- set( ANDROID_NDK_ABI_NAME "armeabi-v7a" )
- set( ANDROID_ARCH_NAME "arm" )
- set( ANDROID_LLVM_TRIPLE "armv7-none-linux-androideabi" )
- set( CMAKE_SYSTEM_PROCESSOR "armv7-a" )
- set( VFPV3 true )
-elseif( ANDROID_ABI STREQUAL "armeabi-v7a with NEON" )
- set( ARMEABI_V7A true )
- set( ANDROID_NDK_ABI_NAME "armeabi-v7a" )
- set( ANDROID_ARCH_NAME "arm" )
- set( ANDROID_LLVM_TRIPLE "armv7-none-linux-androideabi" )
- set( CMAKE_SYSTEM_PROCESSOR "armv7-a" )
- set( VFPV3 true )
- set( NEON true )
-elseif( ANDROID_ABI STREQUAL "armeabi-v7a-hard with NEON" )
- set( ARMEABI_V7A_HARD true )
- set( ANDROID_NDK_ABI_NAME "armeabi-v7a-hard" )
- set( ANDROID_ARCH_NAME "arm" )
- set( ANDROID_LLVM_TRIPLE "armv7-none-linux-androideabi" )
- set( CMAKE_SYSTEM_PROCESSOR "armv7-a" )
- set( VFPV3 true )
- set( NEON true )
-else()
- message( SEND_ERROR "Unknown ANDROID_ABI=\"${ANDROID_ABI}\" is specified." )
-endif()
-
-if( CMAKE_BINARY_DIR AND EXISTS "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeSystem.cmake" )
- # really dirty hack
- # it is not possible to change CMAKE_SYSTEM_PROCESSOR after the first run...
- file( APPEND "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeSystem.cmake" "SET(CMAKE_SYSTEM_PROCESSOR \"${CMAKE_SYSTEM_PROCESSOR}\")\n" )
-endif()
-
-if( ANDROID_ARCH_NAME STREQUAL "arm" AND NOT ARMEABI_V6 )
- __INIT_VARIABLE( ANDROID_FORCE_ARM_BUILD VALUES OFF )
- set( ANDROID_FORCE_ARM_BUILD ${ANDROID_FORCE_ARM_BUILD} CACHE BOOL "Use 32-bit ARM instructions instead of Thumb-1" FORCE )
- mark_as_advanced( ANDROID_FORCE_ARM_BUILD )
-else()
- unset( ANDROID_FORCE_ARM_BUILD CACHE )
-endif()
-
-# choose toolchain
-if( ANDROID_TOOLCHAIN_NAME )
- list( FIND __availableToolchains "${ANDROID_TOOLCHAIN_NAME}" __toolchainIdx )
- if( __toolchainIdx EQUAL -1 )
-  list( SORT __availableToolchains )
-  string( REPLACE ";" "\n  * " toolchains_list "${__availableToolchains}" )
-  set( toolchains_list "  * ${toolchains_list}")
-  message( FATAL_ERROR "Specified toolchain \"${ANDROID_TOOLCHAIN_NAME}\" is missing in your NDK or broken. Please verify that your NDK is working or select another compiler toolchain.
-To configure the toolchain set CMake variable ANDROID_TOOLCHAIN_NAME to one of the following values:\n${toolchains_list}\n" )
- endif()
- list( GET __availableToolchainArchs ${__toolchainIdx} __toolchainArch )
- if( NOT __toolchainArch STREQUAL ANDROID_ARCH_NAME )
-  message( SEND_ERROR "Selected toolchain \"${ANDROID_TOOLCHAIN_NAME}\" is not able to compile binaries for the \"${ANDROID_ARCH_NAME}\" platform." )
- endif()
-else()
- set( __toolchainIdx -1 )
- set( __applicableToolchains "" )
- set( __toolchainMaxVersion "0.0.0" )
- list( LENGTH __availableToolchains __availableToolchainsCount )
- math( EXPR __availableToolchainsCount "${__availableToolchainsCount}-1" )
- foreach( __idx RANGE ${__availableToolchainsCount} )
-  list( GET __availableToolchainArchs ${__idx} __toolchainArch )
-  if( __toolchainArch STREQUAL ANDROID_ARCH_NAME )
-   list( GET __availableToolchainCompilerVersions ${__idx} __toolchainVersion )
-   string( REPLACE "x" "99" __toolchainVersion "${__toolchainVersion}")
-   if( __toolchainVersion VERSION_GREATER __toolchainMaxVersion )
-    set( __toolchainMaxVersion "${__toolchainVersion}" )
-    set( __toolchainIdx ${__idx} )
-   endif()
-  endif()
- endforeach()
- unset( __availableToolchainsCount )
- unset( __toolchainMaxVersion )
- unset( __toolchainVersion )
-endif()
-unset( __toolchainArch )
-if( __toolchainIdx EQUAL -1 )
- message( FATAL_ERROR "No one of available compiler toolchains is able to compile for ${ANDROID_ARCH_NAME} platform." )
-endif()
-list( GET __availableToolchains ${__toolchainIdx} ANDROID_TOOLCHAIN_NAME )
-list( GET __availableToolchainMachines ${__toolchainIdx} ANDROID_TOOLCHAIN_MACHINE_NAME )
-list( GET __availableToolchainCompilerVersions ${__toolchainIdx} ANDROID_COMPILER_VERSION )
-
-unset( __toolchainIdx )
-unset( __availableToolchains )
-unset( __availableToolchainMachines )
-unset( __availableToolchainArchs )
-unset( __availableToolchainCompilerVersions )
-
-# choose native API level
-__INIT_VARIABLE( ANDROID_NATIVE_API_LEVEL ENV_ANDROID_NATIVE_API_LEVEL ANDROID_API_LEVEL ENV_ANDROID_API_LEVEL ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ANDROID_DEFAULT_NDK_API_LEVEL_${ANDROID_ARCH_NAME} ANDROID_DEFAULT_NDK_API_LEVEL )
-string( REPLACE "android-" "" ANDROID_NATIVE_API_LEVEL "${ANDROID_NATIVE_API_LEVEL}" )
-string( STRIP "${ANDROID_NATIVE_API_LEVEL}" ANDROID_NATIVE_API_LEVEL )
-# adjust API level
-set( __real_api_level ${ANDROID_DEFAULT_NDK_API_LEVEL_${ANDROID_ARCH_NAME}} )
-foreach( __level ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
- if( (__level LESS ANDROID_NATIVE_API_LEVEL OR __level STREQUAL ANDROID_NATIVE_API_LEVEL) AND NOT __level LESS __real_api_level )
-  set( __real_api_level ${__level} )
- endif()
-endforeach()
-if( __real_api_level AND NOT ANDROID_NATIVE_API_LEVEL STREQUAL __real_api_level )
- message( STATUS "Adjusting Android API level 'android-${ANDROID_NATIVE_API_LEVEL}' to 'android-${__real_api_level}'")
- set( ANDROID_NATIVE_API_LEVEL ${__real_api_level} )
-endif()
-unset(__real_api_level)
-# validate
-list( FIND ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_NATIVE_API_LEVEL}" __levelIdx )
-if( __levelIdx EQUAL -1 )
- message( SEND_ERROR "Specified Android native API level 'android-${ANDROID_NATIVE_API_LEVEL}' is not supported by your NDK/toolchain.\nSupported values of ANDROID_NATIVE_API_LEVEL: ${ANDROID_SUPPORTED_NATIVE_API_LEVELS}" )
-else()
- if( BUILD_WITH_ANDROID_NDK )
-  if(EXISTS "${ANDROID_NDK}/platforms/android-${ANDROID_NATIVE_API_LEVEL}/arch-${ANDROID_ARCH_NAME}/usr/include/android/api-level.h")
-    __DETECT_NATIVE_API_LEVEL( __realApiLevel "${ANDROID_NDK}/platforms/android-${ANDROID_NATIVE_API_LEVEL}/arch-${ANDROID_ARCH_NAME}/usr/include/android/api-level.h" )
-  else()
-    __DETECT_NATIVE_API_LEVEL( __realApiLevel "${ANDROID_NDK}/sysroot/usr/include/android/api-level.h")
-  endif()
-
-  if( NOT __realApiLevel EQUAL ANDROID_NATIVE_API_LEVEL AND NOT __realApiLevel GREATER 9000 )
-   message( SEND_ERROR "Specified Android API level (${ANDROID_NATIVE_API_LEVEL}) does not match to the level found (${__realApiLevel}). Probably your copy of NDK is broken." )
-  endif()
-  unset( __realApiLevel )
- endif()
- set( ANDROID_NATIVE_API_LEVEL "${ANDROID_NATIVE_API_LEVEL}" CACHE STRING "Android API level for native code" FORCE )
- set( CMAKE_ANDROID_API ${ANDROID_NATIVE_API_LEVEL} )
- if( CMAKE_VERSION VERSION_GREATER "2.8" )
-  list( SORT ANDROID_SUPPORTED_NATIVE_API_LEVELS )
-  set_property( CACHE ANDROID_NATIVE_API_LEVEL PROPERTY STRINGS ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
- endif()
-endif()
-unset( __levelIdx )
-
-
-# remember target ABI
-set( ANDROID_ABI "${ANDROID_ABI}" CACHE STRING "The target ABI for Android. If arm, then armeabi-v7a is recommended for hardware floating point." FORCE )
-if( CMAKE_VERSION VERSION_GREATER "2.8" )
- list( SORT ANDROID_SUPPORTED_ABIS_${ANDROID_ARCH_NAME} )
- set_property( CACHE ANDROID_ABI PROPERTY STRINGS ${ANDROID_SUPPORTED_ABIS_${ANDROID_ARCH_NAME}} )
-endif()
-
-
-# runtime choice (STL, rtti, exceptions)
-if( NOT ANDROID_STL )
-  set( ANDROID_STL gnustl_static )
-endif()
-set( ANDROID_STL "${ANDROID_STL}" CACHE STRING "C++ runtime" )
-set( ANDROID_STL_FORCE_FEATURES ON CACHE BOOL "automatically configure rtti and exceptions support based on C++ runtime" )
-mark_as_advanced( ANDROID_STL ANDROID_STL_FORCE_FEATURES )
-
-if( BUILD_WITH_ANDROID_NDK )
- if( NOT "${ANDROID_STL}" MATCHES "^(none|system|system_re|gabi\\+\\+_static|gabi\\+\\+_shared|stlport_static|stlport_shared|gnustl_static|gnustl_shared|c\\+\\+_static|c\\+\\+_shared)$")
-  message( FATAL_ERROR "ANDROID_STL is set to invalid value \"${ANDROID_STL}\".
-The possible values are:
-  none           -> Do not configure the runtime.
-  system         -> Use the default minimal system C++ runtime library.
-  system_re      -> Same as system but with rtti and exceptions.
-  gabi++_static  -> Use the GAbi++ runtime as a static library.
-  gabi++_shared  -> Use the GAbi++ runtime as a shared library.
-  stlport_static -> Use the STLport runtime as a static library.
-  stlport_shared -> Use the STLport runtime as a shared library.
-  gnustl_static  -> (default) Use the GNU STL as a static library.
-  gnustl_shared  -> Use the GNU STL as a shared library.
-  c++_shared     -> Use the LLVM libc++ runtime as a shared library.
-  c++_static     -> Use the LLVM libc++ runtime as a static library.
-" )
- endif()
-elseif( BUILD_WITH_STANDALONE_TOOLCHAIN )
- if( NOT "${ANDROID_STL}" MATCHES "^(none|gnustl_static|gnustl_shared|c\\+\\+_static|c\\+\\+_shared)$")
-  message( FATAL_ERROR "ANDROID_STL is set to invalid value \"${ANDROID_STL}\".
-The possible values are:
-  none           -> Do not configure the runtime.
-  gnustl_static  -> (default) Use the GNU STL as a static library.
-  gnustl_shared  -> Use the GNU STL as a shared library.
-  c++_shared     -> Use the LLVM libc++ runtime as a shared library.
-  c++_static     -> Use the LLVM libc++ runtime as a static library.
-" )
- endif()
-endif()
-
-unset( ANDROID_RTTI )
-unset( ANDROID_EXCEPTIONS )
-unset( ANDROID_STL_INCLUDE_DIRS )
-unset( __libstl )
-unset( __libsupcxx )
-
-if( NOT _CMAKE_IN_TRY_COMPILE AND ANDROID_NDK_RELEASE STREQUAL "r7b" AND ARMEABI_V7A AND NOT VFPV3 AND ANDROID_STL MATCHES "gnustl" )
- message( WARNING  "The GNU STL armeabi-v7a binaries from NDK r7b can crash non-NEON devices. The files provided with NDK r7b were not configured properly, resulting in crashes on Tegra2-based devices and others when trying to use certain floating-point functions (e.g., cosf, sinf, expf).
-You are strongly recommended to switch to another NDK release.
-" )
-endif()
-
-if( NOT _CMAKE_IN_TRY_COMPILE AND X86 AND ANDROID_STL MATCHES "gnustl" AND ANDROID_NDK_RELEASE STREQUAL "r6" )
-  message( WARNING  "The x86 system header file from NDK r6 has incorrect definition for ptrdiff_t. You are recommended to upgrade to a newer NDK release or manually patch the header:
-See https://android.googlesource.com/platform/development.git f907f4f9d4e56ccc8093df6fee54454b8bcab6c2
-  diff --git a/ndk/platforms/android-9/arch-x86/include/machine/_types.h b/ndk/platforms/android-9/arch-x86/include/machine/_types.h
-  index 5e28c64..65892a1 100644
-  --- a/ndk/platforms/android-9/arch-x86/include/machine/_types.h
-  +++ b/ndk/platforms/android-9/arch-x86/include/machine/_types.h
-  @@ -51,7 +51,11 @@ typedef long int       ssize_t;
-   #endif
-   #ifndef _PTRDIFF_T
-   #define _PTRDIFF_T
-  -typedef long           ptrdiff_t;
-  +#  ifdef __ANDROID__
-  +     typedef int            ptrdiff_t;
-  +#  else
-  +     typedef long           ptrdiff_t;
-  +#  endif
-   #endif
-" )
-endif()
-
-
-# setup paths and STL for standalone toolchain
-if( BUILD_WITH_STANDALONE_TOOLCHAIN )
- set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_STANDALONE_TOOLCHAIN}" )
- set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_STANDALONE_TOOLCHAIN}" )
- set( ANDROID_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot" )
- set( ANDROID_SYSROOT_INCLUDE "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include" )
-
- if( NOT ANDROID_STL STREQUAL "none" )
-  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_STANDALONE_TOOLCHAIN}/include/c++/${ANDROID_COMPILER_VERSION}" )
-  if( NOT EXISTS "${ANDROID_STL_INCLUDE_DIRS}" )
-   # old location ( pre r8c )
-   set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/include/c++/${ANDROID_COMPILER_VERSION}" )
-  endif()
-  if( (ARMEABI_V7A OR ARMEABI_V7A_HARD) AND EXISTS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/${CMAKE_SYSTEM_PROCESSOR}/bits" )
-   list( APPEND ANDROID_STL_INCLUDE_DIRS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/${CMAKE_SYSTEM_PROCESSOR}" )
-  elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/thumb/bits" )
-   list( APPEND ANDROID_STL_INCLUDE_DIRS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/thumb" )
-  else()
-   list( APPEND ANDROID_STL_INCLUDE_DIRS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}" )
-  endif()
-  # always search static GNU STL to get the location of libsupc++.a
-  if( (ARMEABI_V7A OR ARMEABI_V7A_HARD) AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libstdc++.a" )
-   set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb" )
-  elseif( (ARMEABI_V7A OR ARMEABI_V7A_HARD) AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libstdc++.a" )
-   set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}" )
-  elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libstdc++.a" )
-   set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb" )
-  elseif( EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libstdc++.a" )
-   set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib" )
-  endif()
-  if( __libstl )
-   set( __libsupcxx "${__libstl}/libsupc++.a" )
-   set( __libstl    "${__libstl}/libstdc++.a" )
-  endif()
-  if( NOT EXISTS "${__libsupcxx}" )
-   message( FATAL_ERROR "The required libstdsupc++.a is missing in your standalone toolchain.
- Usually it happens because of bug in make-standalone-toolchain.sh script from NDK r7, r7b and r7c.
- You need to either upgrade to newer NDK or manually copy
-     $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a
- to
-     ${__libsupcxx}
-   " )
-  endif()
-  if( ANDROID_STL STREQUAL "gnustl_shared" )
-   if( (ARMEABI_V7A OR ARMEABI_V7A_HARD) AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libgnustl_shared.so" )
-    set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libgnustl_shared.so" )
-   elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libgnustl_shared.so" )
-    set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libgnustl_shared.so" )
-   elseif( EXISTS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libgnustl_shared.so" )
-    set( __libstl "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libgnustl_shared.so" )
-   endif()
-  endif()
- endif()
-endif()
-
-# clang
-if( "${ANDROID_TOOLCHAIN_NAME}" STREQUAL "standalone-clang" )
- set( ANDROID_COMPILER_IS_CLANG 1 )
- execute_process( COMMAND "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/clang${TOOL_OS_SUFFIX}" --version OUTPUT_VARIABLE ANDROID_CLANG_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE )
- string( REGEX MATCH "[0-9]+[.][0-9]+" ANDROID_CLANG_VERSION "${ANDROID_CLANG_VERSION}")
-elseif( "${ANDROID_TOOLCHAIN_NAME}" MATCHES "-clang3[.][0-9]?$" )
- string( REGEX MATCH "3[.][0-9]$" ANDROID_CLANG_VERSION "${ANDROID_TOOLCHAIN_NAME}")
- string( REGEX REPLACE "-clang${ANDROID_CLANG_VERSION}$" "-${ANDROID_COMPILER_VERSION}" ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" )
- if( NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}/bin/clang${TOOL_OS_SUFFIX}" )
-  message( FATAL_ERROR "Could not find the Clang compiler driver" )
- endif()
- set( ANDROID_COMPILER_IS_CLANG 1 )
- set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
-else()
- set( ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" )
- unset( ANDROID_COMPILER_IS_CLANG CACHE )
-endif()
-
-string( REPLACE "." "" _clang_name "clang${ANDROID_CLANG_VERSION}" )
-if( NOT EXISTS "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}${TOOL_OS_SUFFIX}" )
- set( _clang_name "clang" )
-endif()
-
-
-# setup paths and STL for NDK
-if( BUILD_WITH_ANDROID_NDK )
- set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
- set( ANDROID_SYSROOT "${ANDROID_NDK}/platforms/android-${ANDROID_NATIVE_API_LEVEL}/arch-${ANDROID_ARCH_NAME}" )
- if( EXISTS "${ANDROID_SYSROOT}/usr/include" )
-   set( ANDROID_SYSROOT_INCLUDE "${ANDROID_SYSROOT}/usr/include" )
- else()
-   set( ANDROID_SYSROOT_INCLUDE "${ANDROID_NDK}/sysroot/usr/include" "${ANDROID_NDK}/sysroot/usr/include/${ANDROID_TOOLCHAIN_MACHINE_NAME}" )
- endif()
-
- if( ANDROID_STL STREQUAL "none" )
-  # do nothing
- elseif( ANDROID_STL STREQUAL "system" )
-  set( ANDROID_RTTI             OFF )
-  set( ANDROID_EXCEPTIONS       OFF )
-  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/system/include" )
- elseif( ANDROID_STL STREQUAL "system_re" )
-  set( ANDROID_RTTI             ON )
-  set( ANDROID_EXCEPTIONS       ON )
-  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/system/include" )
- elseif( ANDROID_STL MATCHES "gabi" )
-  if( ANDROID_NDK_RELEASE_NUM LESS 7000 ) # before r7
-   message( FATAL_ERROR "gabi++ is not available in your NDK. You have to upgrade to NDK r7 or newer to use gabi++.")
-  endif()
-  set( ANDROID_RTTI             ON )
-  set( ANDROID_EXCEPTIONS       OFF )
-  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/gabi++/include" )
-  set( __libstl                 "${ANDROID_NDK}/sources/cxx-stl/gabi++/libs/${ANDROID_NDK_ABI_NAME}/libgabi++_static.a" )
- elseif( ANDROID_STL MATCHES "stlport" )
-  if( NOT ANDROID_NDK_RELEASE_NUM LESS 8004 ) # before r8d
-   set( ANDROID_EXCEPTIONS       ON )
-  else()
-   set( ANDROID_EXCEPTIONS       OFF )
-  endif()
-  if( ANDROID_NDK_RELEASE_NUM LESS 7000 ) # before r7
-   set( ANDROID_RTTI            OFF )
-  else()
-   set( ANDROID_RTTI            ON )
-  endif()
-  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/stlport/stlport" )
-  set( __libstl                 "${ANDROID_NDK}/sources/cxx-stl/stlport/libs/${ANDROID_NDK_ABI_NAME}/libstlport_static.a" )
- elseif( ANDROID_STL MATCHES "gnustl" )
-  set( ANDROID_EXCEPTIONS       ON )
-  set( ANDROID_RTTI             ON )
-  if( EXISTS "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" )
-   if( ARMEABI_V7A AND ANDROID_COMPILER_VERSION VERSION_EQUAL "4.7" AND ANDROID_NDK_RELEASE STREQUAL "r8d" )
-    # gnustl binary for 4.7 compiler is buggy :(
-    # TODO: look for right fix
-    set( __libstl                "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/4.6" )
-   else()
-    set( __libstl                "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" )
-   endif()
-  else()
-   set( __libstl                "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++" )
-  endif()
-  set( ANDROID_STL_INCLUDE_DIRS "${__libstl}/include" "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/include" "${__libstl}/include/backward" )
-  if( EXISTS "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/libgnustl_static.a" )
-   set( __libstl                "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/libgnustl_static.a" )
-  else()
-   set( __libstl                "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/libstdc++.a" )
-  endif()
- elseif( ANDROID_STL MATCHES "c\\+\\+" )
-  set( ANDROID_EXCEPTIONS       ON )
-  set( ANDROID_RTTI             ON )
-  set( __libstl                "${ANDROID_NDK}/sources/cxx-stl/llvm-libc++" )
-  set( __libstl                "${__libstl}/libs/${ANDROID_NDK_ABI_NAME}/libc++_static.a" )
-  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/android/support/include" "${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libcxx/include" "${ANDROID_NDK}/sources/cxx-stl/llvm-libc++abi/libcxxabi/include" )
- else()
-  message( FATAL_ERROR "Unknown runtime: ${ANDROID_STL}" )
- endif()
-
- # find libsupc++.a - rtti & exceptions
- if( ANDROID_STL STREQUAL "system_re" OR ANDROID_STL MATCHES "gnustl" )
-  set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r8b or newer
-  if( NOT EXISTS "${__libsupcxx}" )
-   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r7-r8
-  endif()
-  if( NOT EXISTS "${__libsupcxx}" ) # before r7
-   if( ARMEABI_V7A )
-    if( ANDROID_FORCE_ARM_BUILD )
-     set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" )
-    else()
-     set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libsupc++.a" )
-    endif()
-   elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD )
-    set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/thumb/libsupc++.a" )
-   else()
-    set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" )
-   endif()
-  endif()
-  if( NOT EXISTS "${__libsupcxx}")
-   message( ERROR "Could not find libsupc++.a for a chosen platform. Either your NDK is not supported or is broken.")
-  endif()
- endif()
-endif()
-
-
-# case of shared STL linkage
-if( ANDROID_STL MATCHES "shared" AND DEFINED __libstl )
- string( REPLACE "_static.a" "_shared.so" __libstl "${__libstl}" )
- if( NOT EXISTS "${__libstl}" )
-   message( FATAL_ERROR "Unable to find shared library ${__libstl}" )
- endif()
-endif()
-
-
-# ccache support
-__INIT_VARIABLE( _ndk_ccache NDK_CCACHE ENV_NDK_CCACHE )
-if( _ndk_ccache )
- if( DEFINED NDK_CCACHE AND NOT EXISTS NDK_CCACHE )
-  unset( NDK_CCACHE CACHE )
- endif()
- find_program( NDK_CCACHE "${_ndk_ccache}" DOC "The path to ccache binary")
-else()
- unset( NDK_CCACHE CACHE )
-endif()
-unset( _ndk_ccache )
-
-
-# setup the cross-compiler
-if( NOT CMAKE_C_COMPILER )
- if( NDK_CCACHE AND NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
-  set( CMAKE_C_COMPILER   "${NDK_CCACHE}" CACHE PATH "ccache as C compiler" )
-  set( CMAKE_CXX_COMPILER "${NDK_CCACHE}" CACHE PATH "ccache as C++ compiler" )
-  if( ANDROID_COMPILER_IS_CLANG )
-   set( CMAKE_C_COMPILER_ARG1   "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}${TOOL_OS_SUFFIX}"   CACHE PATH "C compiler")
-   set( CMAKE_CXX_COMPILER_ARG1 "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}++${TOOL_OS_SUFFIX}" CACHE PATH "C++ compiler")
-  else()
-   set( CMAKE_C_COMPILER_ARG1   "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-gcc${TOOL_OS_SUFFIX}" CACHE PATH "C compiler")
-   set( CMAKE_CXX_COMPILER_ARG1 "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-g++${TOOL_OS_SUFFIX}" CACHE PATH "C++ compiler")
-  endif()
- else()
-  if( ANDROID_COMPILER_IS_CLANG )
-   set( CMAKE_C_COMPILER   "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}${TOOL_OS_SUFFIX}"   CACHE PATH "C compiler")
-   set( CMAKE_CXX_COMPILER "${ANDROID_CLANG_TOOLCHAIN_ROOT}/bin/${_clang_name}++${TOOL_OS_SUFFIX}" CACHE PATH "C++ compiler")
-  else()
-   set( CMAKE_C_COMPILER   "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-gcc${TOOL_OS_SUFFIX}"    CACHE PATH "C compiler" )
-   set( CMAKE_CXX_COMPILER "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-g++${TOOL_OS_SUFFIX}"    CACHE PATH "C++ compiler" )
-  endif()
- endif()
- set( CMAKE_ASM_COMPILER "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-gcc${TOOL_OS_SUFFIX}"     CACHE PATH "assembler" )
- set( CMAKE_STRIP        "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-strip${TOOL_OS_SUFFIX}"   CACHE PATH "strip" )
- if( EXISTS "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-gcc-ar${TOOL_OS_SUFFIX}" )
-  # Use gcc-ar if we have it for better LTO support.
-  set( CMAKE_AR           "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-gcc-ar${TOOL_OS_SUFFIX}"      CACHE PATH "archive" )
- else()
-  set( CMAKE_AR           "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-ar${TOOL_OS_SUFFIX}"      CACHE PATH "archive" )
- endif()
- set( CMAKE_LINKER       "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-ld${TOOL_OS_SUFFIX}"      CACHE PATH "linker" )
- set( CMAKE_NM           "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-nm${TOOL_OS_SUFFIX}"      CACHE PATH "nm" )
- set( CMAKE_OBJCOPY      "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-objcopy${TOOL_OS_SUFFIX}" CACHE PATH "objcopy" )
- set( CMAKE_OBJDUMP      "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-objdump${TOOL_OS_SUFFIX}" CACHE PATH "objdump" )
- set( CMAKE_RANLIB       "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-ranlib${TOOL_OS_SUFFIX}"  CACHE PATH "ranlib" )
-endif()
-
-set( _CMAKE_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_MACHINE_NAME}-" )
-if( CMAKE_VERSION VERSION_LESS 2.8.5 )
- set( CMAKE_ASM_COMPILER_ARG1 "-c" )
-endif()
-if( APPLE )
- find_program( CMAKE_INSTALL_NAME_TOOL NAMES install_name_tool )
- if( NOT CMAKE_INSTALL_NAME_TOOL )
-  message( FATAL_ERROR "Could not find install_name_tool, please check your installation." )
- endif()
- mark_as_advanced( CMAKE_INSTALL_NAME_TOOL )
-endif()
-
-# Force set compilers because standard identification works badly for us
-if( CMAKE_VERSION VERSION_LESS 3.5.0 )
-  include( CMakeForceCompiler )
-  CMAKE_FORCE_C_COMPILER( "${CMAKE_C_COMPILER}" GNU )
-endif()
-if( ANDROID_COMPILER_IS_CLANG )
- set( CMAKE_C_COMPILER_ID Clang )
-endif()
-set( CMAKE_C_PLATFORM_ID Linux )
-if( X86_64 OR MIPS64 OR ARM64_V8A )
- set( CMAKE_C_SIZEOF_DATA_PTR 8 )
-else()
- set( CMAKE_C_SIZEOF_DATA_PTR 4 )
-endif()
-set( CMAKE_C_HAS_ISYSROOT 1 )
-set( CMAKE_C_COMPILER_ABI ELF )
-if( CMAKE_VERSION VERSION_LESS 3.5.0 )
-  CMAKE_FORCE_CXX_COMPILER( "${CMAKE_CXX_COMPILER}" GNU )
-endif()
-if( ANDROID_COMPILER_IS_CLANG )
- set( CMAKE_CXX_COMPILER_ID Clang)
-endif()
-set( CMAKE_CXX_PLATFORM_ID Linux )
-set( CMAKE_CXX_SIZEOF_DATA_PTR ${CMAKE_C_SIZEOF_DATA_PTR} )
-set( CMAKE_CXX_HAS_ISYSROOT 1 )
-set( CMAKE_CXX_COMPILER_ABI ELF )
-set( CMAKE_CXX_SOURCE_FILE_EXTENSIONS cc cp cxx cpp CPP c++ C )
-# force ASM compiler (required for CMake < 2.8.5)
-set( CMAKE_ASM_COMPILER_ID_RUN TRUE )
-set( CMAKE_ASM_COMPILER_ID GNU )
-set( CMAKE_ASM_COMPILER_WORKS TRUE )
-set( CMAKE_ASM_COMPILER_FORCED TRUE )
-set( CMAKE_COMPILER_IS_GNUASM 1)
-set( CMAKE_ASM_SOURCE_FILE_EXTENSIONS s S asm )
-
-foreach( lang C CXX ASM )
- if( ANDROID_COMPILER_IS_CLANG )
-  set( CMAKE_${lang}_COMPILER_VERSION ${ANDROID_CLANG_VERSION} )
- else()
-  set( CMAKE_${lang}_COMPILER_VERSION ${ANDROID_COMPILER_VERSION} )
- endif()
-endforeach()
-
-# flags and definitions
-remove_definitions( -DANDROID )
-add_definitions( -DANDROID )
-
-if( ANDROID_SYSROOT MATCHES "[ ;\"]" )
- if( CMAKE_HOST_WIN32 )
-  # try to convert path to 8.3 form
-  file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "@echo %~s1" )
-  execute_process( COMMAND "$ENV{ComSpec}" /c "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "${ANDROID_SYSROOT}"
-                   OUTPUT_VARIABLE __path OUTPUT_STRIP_TRAILING_WHITESPACE
-                   RESULT_VARIABLE __result ERROR_QUIET )
-  if( __result EQUAL 0 )
-   file( TO_CMAKE_PATH "${__path}" ANDROID_SYSROOT )
-   set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
-  else()
-   set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
-  endif()
- else()
-  set( ANDROID_CXX_FLAGS "'--sysroot=${ANDROID_SYSROOT}'" )
- endif()
- if( NOT _CMAKE_IN_TRY_COMPILE )
-  # quotes can break try_compile and compiler identification
-  message(WARNING "Path to your Android NDK (or toolchain) has non-alphanumeric symbols.\nThe build might be broken.\n")
- endif()
-else()
- set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
-endif()
-
-# NDK flags
-if (ARM64_V8A )
- set( ANDROID_CXX_FLAGS         "${ANDROID_CXX_FLAGS} -funwind-tables" )
- set( ANDROID_CXX_FLAGS_RELEASE "-fomit-frame-pointer -fstrict-aliasing" )
- set( ANDROID_CXX_FLAGS_DEBUG   "-fno-omit-frame-pointer -fno-strict-aliasing" )
- if( NOT ANDROID_COMPILER_IS_CLANG )
-  set( ANDROID_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE} -funswitch-loops -finline-limit=300" )
- endif()
-elseif( ARMEABI OR ARMEABI_V7A OR ARMEABI_V7A_HARD)
- set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funwind-tables" )
- if( NOT ANDROID_FORCE_ARM_BUILD AND NOT ARMEABI_V6 )
-  set( ANDROID_CXX_FLAGS_RELEASE "-mthumb -fomit-frame-pointer -fno-strict-aliasing" )
-  set( ANDROID_CXX_FLAGS_DEBUG   "-marm -fno-omit-frame-pointer -fno-strict-aliasing" )
-  if( NOT ANDROID_COMPILER_IS_CLANG )
-   set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -finline-limit=64" )
-  endif()
- else()
-  # always compile ARMEABI_V6 in arm mode; otherwise there is no difference from ARMEABI
-  set( ANDROID_CXX_FLAGS_RELEASE "-marm -fomit-frame-pointer -fstrict-aliasing" )
-  set( ANDROID_CXX_FLAGS_DEBUG   "-marm -fno-omit-frame-pointer -fno-strict-aliasing" )
-  if( NOT ANDROID_COMPILER_IS_CLANG )
-   set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funswitch-loops -finline-limit=300" )
-  endif()
- endif()
-elseif( X86 OR X86_64 )
- set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funwind-tables" )
- if( NOT ANDROID_COMPILER_IS_CLANG )
-  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -funswitch-loops -finline-limit=300" )
- endif()
- set( ANDROID_CXX_FLAGS_RELEASE "-fomit-frame-pointer -fstrict-aliasing" )
- set( ANDROID_CXX_FLAGS_DEBUG   "-fno-omit-frame-pointer -fno-strict-aliasing" )
-elseif( MIPS OR MIPS64 )
- set( ANDROID_CXX_FLAGS         "${ANDROID_CXX_FLAGS} -fno-strict-aliasing -finline-functions -funwind-tables -fmessage-length=0" )
- set( ANDROID_CXX_FLAGS_RELEASE "-fomit-frame-pointer" )
- set( ANDROID_CXX_FLAGS_DEBUG   "-fno-omit-frame-pointer" )
- if( NOT ANDROID_COMPILER_IS_CLANG )
-  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fno-inline-functions-called-once -fgcse-after-reload -frerun-cse-after-loop -frename-registers" )
-  set( ANDROID_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE} -funswitch-loops -finline-limit=300" )
- endif()
-elseif()
- set( ANDROID_CXX_FLAGS_RELEASE "" )
- set( ANDROID_CXX_FLAGS_DEBUG   "" )
-endif()
-
-set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -fsigned-char" ) # good/necessary when porting desktop libraries
-
-if( NOT X86 AND NOT ANDROID_COMPILER_IS_CLANG )
- set( ANDROID_CXX_FLAGS "-Wno-psabi ${ANDROID_CXX_FLAGS}" )
-endif()
-
-if( NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.6" )
- set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -no-canonical-prefixes" ) # see https://android-review.googlesource.com/#/c/47564/
-endif()
-
-# ABI-specific flags
-if( ARMEABI_V7A_HARD )
- set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv7-a -mfloat-abi=hard -mhard-float -D_NDK_MATH_NO_SOFTFP=1" )
- if( NEON )
-  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -mfpu=neon" )
- elseif( VFPV3 )
-  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -mfpu=vfpv3" )
- else()
-  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -mfpu=vfpv3-d16" )
- endif()
-elseif( ARMEABI_V7A )
- set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv7-a -mfloat-abi=softfp" )
- if( NEON )
-  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -mfpu=neon" )
- elseif( VFPV3 )
-  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -mfpu=vfpv3" )
- else()
-  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -mfpu=vfpv3-d16" )
- endif()
-
-elseif( ARMEABI_V6 )
- set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv6 -mfloat-abi=softfp -mfpu=vfp" ) # vfp == vfpv2
-elseif( ARMEABI )
- set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv5te -mtune=xscale -msoft-float" )
-endif()
-
-if( ANDROID_STL MATCHES "gnustl" AND (EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}") )
- set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
- set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
- set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_C_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
-else()
- set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
- set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
- set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
-endif()
-
-# STL
-if( EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}" )
- if( EXISTS "${__libstl}" )
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${__libstl}\"" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${__libstl}\"" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} \"${__libstl}\"" )
- endif()
- if( EXISTS "${__libsupcxx}" )
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${__libsupcxx}\"" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${__libsupcxx}\"" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} \"${__libsupcxx}\"" )
-  # C objects:
-  set( CMAKE_C_CREATE_SHARED_LIBRARY "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_C_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_C_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_C_CREATE_SHARED_MODULE  "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_C_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_C_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_C_LINK_EXECUTABLE       "<CMAKE_C_COMPILER> <FLAGS> <CMAKE_C_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
-  set( CMAKE_C_CREATE_SHARED_LIBRARY "${CMAKE_C_CREATE_SHARED_LIBRARY} \"${__libsupcxx}\"" )
-  set( CMAKE_C_CREATE_SHARED_MODULE  "${CMAKE_C_CREATE_SHARED_MODULE} \"${__libsupcxx}\"" )
-  set( CMAKE_C_LINK_EXECUTABLE       "${CMAKE_C_LINK_EXECUTABLE} \"${__libsupcxx}\"" )
- endif()
- if( ANDROID_STL MATCHES "gnustl" )
-  if( NOT EXISTS "${ANDROID_LIBM_PATH}" )
-   set( ANDROID_LIBM_PATH -lm )
-  endif()
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} ${ANDROID_LIBM_PATH}" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} ${ANDROID_LIBM_PATH}" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} ${ANDROID_LIBM_PATH}" )
- endif()
-endif()
-
-# variables controlling optional build flags
-if( ANDROID_NDK_RELEASE_NUM LESS 7000 ) # before r7
- # libGLESv2.so in NDK's prior to r7 refers to missing external symbols.
- # So this flag option is required for all projects using OpenGL from native.
- __INIT_VARIABLE( ANDROID_SO_UNDEFINED                      VALUES ON )
-else()
- __INIT_VARIABLE( ANDROID_SO_UNDEFINED                      VALUES OFF )
-endif()
-__INIT_VARIABLE( ANDROID_NO_UNDEFINED                       VALUES ON )
-__INIT_VARIABLE( ANDROID_FUNCTION_LEVEL_LINKING             VALUES ON )
-__INIT_VARIABLE( ANDROID_GOLD_LINKER                        VALUES ON )
-__INIT_VARIABLE( ANDROID_NOEXECSTACK                        VALUES ON )
-__INIT_VARIABLE( ANDROID_RELRO                              VALUES ON )
-
-set( ANDROID_NO_UNDEFINED           ${ANDROID_NO_UNDEFINED}           CACHE BOOL "Show all undefined symbols as linker errors" )
-set( ANDROID_SO_UNDEFINED           ${ANDROID_SO_UNDEFINED}           CACHE BOOL "Allows or disallows undefined symbols in shared libraries" )
-set( ANDROID_FUNCTION_LEVEL_LINKING ${ANDROID_FUNCTION_LEVEL_LINKING} CACHE BOOL "Put each function in separate section and enable garbage collection of unused input sections at link time" )
-set( ANDROID_GOLD_LINKER            ${ANDROID_GOLD_LINKER}            CACHE BOOL "Enables gold linker" )
-set( ANDROID_NOEXECSTACK            ${ANDROID_NOEXECSTACK}            CACHE BOOL "Allows or disallows undefined symbols in shared libraries" )
-set( ANDROID_RELRO                  ${ANDROID_RELRO}                  CACHE BOOL "Enables RELRO - a memory corruption mitigation technique" )
-mark_as_advanced( ANDROID_NO_UNDEFINED ANDROID_SO_UNDEFINED ANDROID_FUNCTION_LEVEL_LINKING ANDROID_GOLD_LINKER ANDROID_NOEXECSTACK ANDROID_RELRO )
-
-# linker flags
-set( ANDROID_LINKER_FLAGS "" )
-
-if( ARMEABI_V7A )
- # this is *required* to use the following linker flags that routes around
- # a CPU bug in some Cortex-A8 implementations:
- set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--fix-cortex-a8" )
-endif()
-
-if( ARMEABI_V7A_HARD )
- set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-warn-mismatch -lm_hard" )
-endif()
-
-if( ANDROID_NO_UNDEFINED )
- if( MIPS )
-  # there is some sysroot-related problem in mips linker...
-  if( NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
-   set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined -Wl,-rpath-link,${ANDROID_SYSROOT}/usr/lib" )
-  endif()
- else()
-  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined" )
- endif()
-endif()
-
-if( ANDROID_SO_UNDEFINED )
- set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,-allow-shlib-undefined" )
-endif()
-
-if( ANDROID_FUNCTION_LEVEL_LINKING )
- set( ANDROID_CXX_FLAGS    "${ANDROID_CXX_FLAGS} -fdata-sections -ffunction-sections" )
- set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--gc-sections" )
-endif()
-
-if( ANDROID_COMPILER_VERSION VERSION_EQUAL "4.6" )
- if( ANDROID_GOLD_LINKER AND (CMAKE_HOST_UNIX OR ANDROID_NDK_RELEASE_NUM GREATER 8002) AND (ARMEABI OR ARMEABI_V7A OR ARMEABI_V7A_HARD OR X86) )
-  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -fuse-ld=gold" )
- elseif( ANDROID_NDK_RELEASE_NUM GREATER 8002 ) # after r8b
-  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -fuse-ld=bfd" )
- elseif( ANDROID_NDK_RELEASE STREQUAL "r8b" AND ARMEABI AND NOT _CMAKE_IN_TRY_COMPILE )
-  message( WARNING "The default bfd linker from arm GCC 4.6 toolchain can fail with 'unresolvable R_ARM_THM_CALL relocation' error message. See https://code.google.com/p/android/issues/detail?id=35342
-  On Linux and OS X host platform you can workaround this problem using gold linker (default).
-  Rerun cmake with -DANDROID_GOLD_LINKER=ON option in case of problems.
-" )
- endif()
-endif() # version 4.6
-
-if( ANDROID_NOEXECSTACK )
- if( ANDROID_COMPILER_IS_CLANG )
-  set( ANDROID_CXX_FLAGS    "${ANDROID_CXX_FLAGS} -Xclang -mnoexecstack" )
- else()
-  set( ANDROID_CXX_FLAGS    "${ANDROID_CXX_FLAGS} -Wa,--noexecstack" )
- endif()
- set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,-z,noexecstack" )
-endif()
-
-if( ANDROID_RELRO )
- set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,-z,relro -Wl,-z,now" )
-endif()
-
-if( ANDROID_COMPILER_IS_CLANG )
- set( ANDROID_CXX_FLAGS "-target ${ANDROID_LLVM_TRIPLE} -Qunused-arguments ${ANDROID_CXX_FLAGS}" )
- if( BUILD_WITH_ANDROID_NDK )
-  set( ANDROID_CXX_FLAGS "-gcc-toolchain ${ANDROID_TOOLCHAIN_ROOT} ${ANDROID_CXX_FLAGS}" )
- endif()
-endif()
-
-# cache flags
-set( CMAKE_CXX_FLAGS           ""                        CACHE STRING "c++ flags" )
-set( CMAKE_C_FLAGS             ""                        CACHE STRING "c flags" )
-set( CMAKE_CXX_FLAGS_RELEASE   "-O3 -DNDEBUG"            CACHE STRING "c++ Release flags" )
-set( CMAKE_C_FLAGS_RELEASE     "-O3 -DNDEBUG"            CACHE STRING "c Release flags" )
-set( CMAKE_CXX_FLAGS_DEBUG     "-O0 -g -DDEBUG -D_DEBUG" CACHE STRING "c++ Debug flags" )
-set( CMAKE_C_FLAGS_DEBUG       "-O0 -g -DDEBUG -D_DEBUG" CACHE STRING "c Debug flags" )
-set( CMAKE_SHARED_LINKER_FLAGS ""                        CACHE STRING "shared linker flags" )
-set( CMAKE_MODULE_LINKER_FLAGS ""                        CACHE STRING "module linker flags" )
-set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-z,nocopyreloc"      CACHE STRING "executable linker flags" )
-
-# put flags to cache (for debug purpose only)
-set( ANDROID_CXX_FLAGS         "${ANDROID_CXX_FLAGS}"         CACHE INTERNAL "Android specific c/c++ flags" )
-set( ANDROID_CXX_FLAGS_RELEASE "${ANDROID_CXX_FLAGS_RELEASE}" CACHE INTERNAL "Android specific c/c++ Release flags" )
-set( ANDROID_CXX_FLAGS_DEBUG   "${ANDROID_CXX_FLAGS_DEBUG}"   CACHE INTERNAL "Android specific c/c++ Debug flags" )
-set( ANDROID_LINKER_FLAGS      "${ANDROID_LINKER_FLAGS}"      CACHE INTERNAL "Android specific c/c++ linker flags" )
-
-# finish flags
-set( CMAKE_CXX_FLAGS           "${ANDROID_CXX_FLAGS} ${CMAKE_CXX_FLAGS}" )
-set( CMAKE_C_FLAGS             "${ANDROID_CXX_FLAGS} ${CMAKE_C_FLAGS}" )
-set( CMAKE_CXX_FLAGS_RELEASE   "${ANDROID_CXX_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}" )
-set( CMAKE_C_FLAGS_RELEASE     "${ANDROID_CXX_FLAGS_RELEASE} ${CMAKE_C_FLAGS_RELEASE}" )
-set( CMAKE_CXX_FLAGS_DEBUG     "${ANDROID_CXX_FLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}" )
-set( CMAKE_C_FLAGS_DEBUG       "${ANDROID_CXX_FLAGS_DEBUG} ${CMAKE_C_FLAGS_DEBUG}" )
-set( CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}" )
-set( CMAKE_MODULE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}" )
-set( CMAKE_EXE_LINKER_FLAGS    "${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" )
-
-if( MIPS AND BUILD_WITH_ANDROID_NDK AND ANDROID_NDK_RELEASE STREQUAL "r8" )
- set( CMAKE_SHARED_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_SHARED_LINKER_FLAGS}" )
- set( CMAKE_MODULE_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_MODULE_LINKER_FLAGS}" )
- set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.x ${CMAKE_EXE_LINKER_FLAGS}" )
-endif()
-
-# pie/pic
-if( NOT (ANDROID_NATIVE_API_LEVEL LESS 16) AND (NOT DEFINED ANDROID_APP_PIE OR ANDROID_APP_PIE) AND (CMAKE_VERSION VERSION_GREATER 2.8.8) )
- set( CMAKE_POSITION_INDEPENDENT_CODE TRUE )
- set( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fPIE -pie")
-else()
- set( CMAKE_POSITION_INDEPENDENT_CODE FALSE )
- set( CMAKE_CXX_FLAGS "-fpic ${CMAKE_CXX_FLAGS}" )
- set( CMAKE_C_FLAGS   "-fpic ${CMAKE_C_FLAGS}" )
-endif()
-
-# configure rtti
-if( DEFINED ANDROID_RTTI AND ANDROID_STL_FORCE_FEATURES )
- if( ANDROID_RTTI )
-  set( CMAKE_CXX_FLAGS "-frtti ${CMAKE_CXX_FLAGS}" )
- else()
-  set( CMAKE_CXX_FLAGS "-fno-rtti ${CMAKE_CXX_FLAGS}" )
- endif()
-endif()
-
-# configure exceptions
-if( DEFINED ANDROID_EXCEPTIONS AND ANDROID_STL_FORCE_FEATURES )
- if( ANDROID_EXCEPTIONS )
-  set( CMAKE_CXX_FLAGS "-fexceptions ${CMAKE_CXX_FLAGS}" )
-  set( CMAKE_C_FLAGS "-fexceptions ${CMAKE_C_FLAGS}" )
- else()
-  set( CMAKE_CXX_FLAGS "-fno-exceptions ${CMAKE_CXX_FLAGS}" )
-  set( CMAKE_C_FLAGS "-fno-exceptions ${CMAKE_C_FLAGS}" )
- endif()
-endif()
-
-# global includes and link directories
-include_directories( SYSTEM "${ANDROID_SYSROOT_INCLUDE}" ${ANDROID_STL_INCLUDE_DIRS} )
-get_filename_component(__android_install_path "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" ABSOLUTE) # avoid CMP0015 policy warning
-link_directories( "${__android_install_path}" )
-set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DANDROID -D__ANDROID_API__=${ANDROID_NATIVE_API_LEVEL}" )
-set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DANDROID -D__ANDROID_API__=${ANDROID_NATIVE_API_LEVEL}" )
-
-# detect if need link crtbegin_so.o explicitly
-if( NOT DEFINED ANDROID_EXPLICIT_CRT_LINK )
- set( __cmd "${CMAKE_CXX_CREATE_SHARED_LIBRARY}" )
- string( REPLACE "<CMAKE_CXX_COMPILER>" "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1}" __cmd "${__cmd}" )
- string( REPLACE "<CMAKE_C_COMPILER>"   "${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ARG1}"   __cmd "${__cmd}" )
- string( REPLACE "<CMAKE_SHARED_LIBRARY_CXX_FLAGS>" "${CMAKE_CXX_FLAGS}" __cmd "${__cmd}" )
- string( REPLACE "<LANGUAGE_COMPILE_FLAGS>" "" __cmd "${__cmd}" )
- string( REPLACE "<LINK_FLAGS>" "${CMAKE_SHARED_LINKER_FLAGS}" __cmd "${__cmd}" )
- string( REPLACE "<CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS>" "-shared" __cmd "${__cmd}" )
- string( REPLACE "<CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG>" "" __cmd "${__cmd}" )
- string( REPLACE "<TARGET_SONAME>" "" __cmd "${__cmd}" )
- string( REPLACE "<TARGET>" "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/toolchain_crtlink_test.so" __cmd "${__cmd}" )
- string( REPLACE "<OBJECTS>" "\"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" __cmd "${__cmd}" )
- string( REPLACE "<LINK_LIBRARIES>" "" __cmd "${__cmd}" )
- separate_arguments( __cmd )
- foreach( __var ANDROID_NDK ANDROID_NDK_TOOLCHAINS_PATH ANDROID_STANDALONE_TOOLCHAIN )
-  if( ${__var} )
-   set( __tmp "${${__var}}" )
-   separate_arguments( __tmp )
-   string( REPLACE "${__tmp}" "${${__var}}" __cmd "${__cmd}")
-  endif()
- endforeach()
- string( REPLACE "'" "" __cmd "${__cmd}" )
- string( REPLACE "\"" "" __cmd "${__cmd}" )
- execute_process( COMMAND ${__cmd} RESULT_VARIABLE __cmd_result OUTPUT_QUIET ERROR_QUIET )
- if( __cmd_result EQUAL 0 )
-  set( ANDROID_EXPLICIT_CRT_LINK ON )
- else()
-  set( ANDROID_EXPLICIT_CRT_LINK OFF )
- endif()
-endif()
-
-if( ANDROID_EXPLICIT_CRT_LINK )
- set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
- set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
-endif()
-
-# setup output directories
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT 1)
-endif()
-set( CMAKE_INSTALL_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/user" CACHE STRING "path for installing" )
-
-if( DEFINED LIBRARY_OUTPUT_PATH_ROOT
-      OR EXISTS "${CMAKE_SOURCE_DIR}/AndroidManifest.xml"
-      OR (EXISTS "${CMAKE_SOURCE_DIR}/../AndroidManifest.xml" AND EXISTS "${CMAKE_SOURCE_DIR}/../jni/") )
-  set( LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_SOURCE_DIR} CACHE PATH "Root for binaries output, set this to change where Android libs are installed to" )
-  if( NOT _CMAKE_IN_TRY_COMPILE )
-    if( EXISTS "${CMAKE_SOURCE_DIR}/jni/CMakeLists.txt" )
-      set( EXECUTABLE_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/bin/${ANDROID_NDK_ABI_NAME}" CACHE PATH "Output directory for applications" )
-    else()
-      set( EXECUTABLE_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/bin" CACHE PATH "Output directory for applications" )
-    endif()
-    set( LIBRARY_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/libs/${ANDROID_NDK_ABI_NAME}" CACHE PATH "Output directory for Android libs" )
-  endif()
-endif()
-
-# copy shaed stl library to build directory
-if( NOT _CMAKE_IN_TRY_COMPILE AND __libstl MATCHES "[.]so$" AND DEFINED LIBRARY_OUTPUT_PATH )
-  get_filename_component( __libstlname "${__libstl}" NAME )
-  execute_process( COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${__libstl}" "${LIBRARY_OUTPUT_PATH}/${__libstlname}" RESULT_VARIABLE __fileCopyProcess )
-  if( NOT __fileCopyProcess EQUAL 0 OR NOT EXISTS "${LIBRARY_OUTPUT_PATH}/${__libstlname}")
-    message( SEND_ERROR "Failed copying of ${__libstl} to the ${LIBRARY_OUTPUT_PATH}/${__libstlname}" )
-  endif()
-  unset( __fileCopyProcess )
-  unset( __libstlname )
-endif()
-
-
-# set these global flags for cmake client scripts to change behavior
-set( ANDROID True )
-set( BUILD_ANDROID True )
-
-# where is the target environment
-set( CMAKE_FIND_ROOT_PATH
-    "${ANDROID_TOOLCHAIN_ROOT}/bin"
-    "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}"
-    "${ANDROID_SYSROOT}"
-    "${ANDROID_NDK}/sysroot"  # NDK16+
-    "${CMAKE_INSTALL_PREFIX}"
-    "${CMAKE_INSTALL_PREFIX}/share" )
-
-# only search for libraries and includes in the ndk toolchain
-if(NOT CMAKE_FIND_ROOT_PATH_MODE_LIBRARY)
-  set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY )
-endif()
-
-if(NOT CMAKE_FIND_ROOT_PATH_MODE_INCLUDE)
-  set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY )
-endif()
-
-if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE)
-  set( CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY )
-endif()
-
-if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM)
-  set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
-endif()
-
-macro(__cmake_find_root_save_and_reset)
-  foreach(v
-      CMAKE_FIND_ROOT_PATH_MODE_LIBRARY
-      CMAKE_FIND_ROOT_PATH_MODE_INCLUDE
-      CMAKE_FIND_ROOT_PATH_MODE_PACKAGE
-      CMAKE_FIND_ROOT_PATH_MODE_PROGRAM
-  )
-    set(__save_${v} ${${v}})
-    set(${v} NEVER)
-  endforeach()
-endmacro()
-
-macro(__cmake_find_root_restore)
-  foreach(v
-      CMAKE_FIND_ROOT_PATH_MODE_LIBRARY
-      CMAKE_FIND_ROOT_PATH_MODE_INCLUDE
-      CMAKE_FIND_ROOT_PATH_MODE_PACKAGE
-      CMAKE_FIND_ROOT_PATH_MODE_PROGRAM
-  )
-    set(${v} ${__save_${v}})
-    unset(__save_${v})
-  endforeach()
-endmacro()
-
-# macro to find packages on the host OS
-macro( find_host_package )
- __cmake_find_root_save_and_reset()
- if( CMAKE_HOST_WIN32 )
-  SET( WIN32 1 )
-  SET( UNIX )
- elseif( CMAKE_HOST_APPLE )
-  SET( APPLE 1 )
-  SET( UNIX )
- endif()
- find_package( ${ARGN} )
- SET( WIN32 )
- SET( APPLE )
- SET( UNIX 1 )
- __cmake_find_root_restore()
-endmacro()
-
-
-# macro to find programs on the host OS
-macro( find_host_program )
- __cmake_find_root_save_and_reset()
- if( CMAKE_HOST_WIN32 )
-  SET( WIN32 1 )
-  SET( UNIX )
- elseif( CMAKE_HOST_APPLE )
-  SET( APPLE 1 )
-  SET( UNIX )
- endif()
- find_program( ${ARGN} )
- SET( WIN32 )
- SET( APPLE )
- SET( UNIX 1 )
- __cmake_find_root_restore()
-endmacro()
-
-
-# export toolchain settings for the try_compile() command
-if( NOT _CMAKE_IN_TRY_COMPILE )
- set( __toolchain_config "")
- foreach( __var NDK_CCACHE  LIBRARY_OUTPUT_PATH_ROOT  ANDROID_FORBID_SYGWIN
-                ANDROID_NDK_HOST_X64
-                ANDROID_NDK
-                ANDROID_NDK_LAYOUT
-                ANDROID_STANDALONE_TOOLCHAIN
-                ANDROID_TOOLCHAIN_NAME
-                ANDROID_ABI
-                ANDROID_NATIVE_API_LEVEL
-                ANDROID_STL
-                ANDROID_STL_FORCE_FEATURES
-                ANDROID_FORCE_ARM_BUILD
-                ANDROID_NO_UNDEFINED
-                ANDROID_SO_UNDEFINED
-                ANDROID_FUNCTION_LEVEL_LINKING
-                ANDROID_GOLD_LINKER
-                ANDROID_NOEXECSTACK
-                ANDROID_RELRO
-                ANDROID_LIBM_PATH
-                ANDROID_EXPLICIT_CRT_LINK
-                ANDROID_APP_PIE
-                )
-  if( DEFINED ${__var} )
-   if( ${__var} MATCHES " ")
-    set( __toolchain_config "${__toolchain_config}set( ${__var} \"${${__var}}\" CACHE INTERNAL \"\" )\n" )
-   else()
-    set( __toolchain_config "${__toolchain_config}set( ${__var} ${${__var}} CACHE INTERNAL \"\" )\n" )
-   endif()
-  endif()
- endforeach()
- file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/android.toolchain.config.cmake" "${__toolchain_config}" )
- unset( __toolchain_config )
-endif()
-
-
-# force cmake to produce / instead of \ in build commands for Ninja generator
-if( CMAKE_GENERATOR MATCHES "Ninja" AND CMAKE_HOST_WIN32 )
- # it is a bad hack after all
- # CMake generates Ninja makefiles with UNIX paths only if it thinks that we are going to build with MinGW
- set( CMAKE_COMPILER_IS_MINGW TRUE ) # tell CMake that we are MinGW
- set( CMAKE_CROSSCOMPILING TRUE )    # stop recursion
- enable_language( C )
- enable_language( CXX )
- # unset( CMAKE_COMPILER_IS_MINGW ) # can't unset because CMake does not convert back-slashes in response files without it
- unset( MINGW )
-endif()
-
-
-# Variables controlling behavior or set by cmake toolchain:
-#   ANDROID_ABI : "armeabi-v7a" (default), "armeabi", "armeabi-v7a with NEON", "armeabi-v7a-hard with NEON", "armeabi-v7a with VFPV3", "armeabi-v6 with VFP", "x86", "mips", "arm64-v8a", "x86_64", "mips64"
-#   ANDROID_NATIVE_API_LEVEL : 3,4,5,8,9,14,15,16,17,18,19,21 (depends on NDK version)
-#   ANDROID_STL : gnustl_static/gnustl_shared/stlport_static/stlport_shared/gabi++_static/gabi++_shared/system_re/system/none
-#   ANDROID_FORBID_SYGWIN : ON/OFF
-#   ANDROID_NO_UNDEFINED : ON/OFF
-#   ANDROID_SO_UNDEFINED : OFF/ON  (default depends on NDK version)
-#   ANDROID_FUNCTION_LEVEL_LINKING : ON/OFF
-#   ANDROID_GOLD_LINKER : ON/OFF
-#   ANDROID_NOEXECSTACK : ON/OFF
-#   ANDROID_RELRO : ON/OFF
-#   ANDROID_FORCE_ARM_BUILD : ON/OFF
-#   ANDROID_STL_FORCE_FEATURES : ON/OFF
-#   ANDROID_LIBM_PATH : path to libm.so (set to something like $(TOP)/out/target/product/<product_name>/obj/lib/libm.so) to workaround unresolved `sincos`
-# Can be set only at the first run:
-#   ANDROID_NDK : path to your NDK install
-#   NDK_CCACHE : path to your ccache executable
-#   ANDROID_TOOLCHAIN_NAME : the NDK name of compiler toolchain
-#   ANDROID_NDK_HOST_X64 : try to use x86_64 toolchain (default for x64 host systems)
-#   ANDROID_NDK_LAYOUT : the inner NDK structure (RELEASE, LINARO, ANDROID)
-#   LIBRARY_OUTPUT_PATH_ROOT : <any valid path>
-#   ANDROID_STANDALONE_TOOLCHAIN
-#
-# Primary read-only variables:
-#   ANDROID : always TRUE
-#   ARMEABI : TRUE for arm v6 and older devices
-#   ARMEABI_V6 : TRUE for arm v6
-#   ARMEABI_V7A : TRUE for arm v7a
-#   ARMEABI_V7A_HARD : TRUE for arm v7a with hardfp
-#   ARM64_V8A : TRUE for arm64-v8a
-#   NEON : TRUE if NEON unit is enabled
-#   VFPV3 : TRUE if VFP version 3 is enabled
-#   X86 : TRUE if configured for x86
-#   X86_64 : TRUE if configured for x86_64
-#   MIPS : TRUE if configured for mips
-#   MIPS64 : TRUE if configured for mips64
-#   BUILD_WITH_ANDROID_NDK : TRUE if NDK is used
-#   BUILD_WITH_STANDALONE_TOOLCHAIN : TRUE if standalone toolchain is used
-#   ANDROID_NDK_HOST_SYSTEM_NAME : "windows", "linux-x86" or "darwin-x86" depending on host platform
-#   ANDROID_NDK_ABI_NAME : "armeabi", "armeabi-v7a", "armeabi-v7a-hard", "x86", "mips", "arm64-v8a", "x86_64", "mips64" depending on ANDROID_ABI
-#   ANDROID_NDK_RELEASE : from r5 to r10d; set only for NDK
-#   ANDROID_NDK_RELEASE_NUM : numeric ANDROID_NDK_RELEASE version (1000*major+minor)
-#   ANDROID_ARCH_NAME : "arm", "x86", "mips", "arm64", "x86_64", "mips64" depending on ANDROID_ABI
-#   ANDROID_SYSROOT : path to the compiler sysroot
-#   ANDROID_SYSROOT_INCLUDE : paths to system include paths
-#   TOOL_OS_SUFFIX : "" or ".exe" depending on host platform
-#   ANDROID_COMPILER_IS_CLANG : TRUE if clang compiler is used
-#
-# Secondary (less stable) read-only variables:
-#   ANDROID_COMPILER_VERSION : GCC version used (not Clang version)
-#   ANDROID_CLANG_VERSION : version of clang compiler if clang is used
-#   ANDROID_CXX_FLAGS : C/C++ compiler flags required by Android platform
-#   ANDROID_SUPPORTED_ABIS : list of currently allowed values for ANDROID_ABI
-#   ANDROID_TOOLCHAIN_MACHINE_NAME : "arm-linux-androideabi", "arm-eabi" or "i686-android-linux"
-#   ANDROID_TOOLCHAIN_ROOT : path to the top level of toolchain (standalone or placed inside NDK)
-#   ANDROID_CLANG_TOOLCHAIN_ROOT : path to clang tools
-#   ANDROID_SUPPORTED_NATIVE_API_LEVELS : list of native API levels found inside NDK
-#   ANDROID_STL_INCLUDE_DIRS : stl include paths
-#   ANDROID_RTTI : if rtti is enabled by the runtime
-#   ANDROID_EXCEPTIONS : if exceptions are enabled by the runtime
-#   ANDROID_GCC_TOOLCHAIN_NAME : read-only, differs from ANDROID_TOOLCHAIN_NAME only if clang is used
-#
-# Defaults:
-#   ANDROID_DEFAULT_NDK_API_LEVEL
-#   ANDROID_DEFAULT_NDK_API_LEVEL_${ARCH}
-#   ANDROID_NDK_SEARCH_PATHS
-#   ANDROID_SUPPORTED_ABIS_${ARCH}
-#   ANDROID_SUPPORTED_NDK_VERSIONS
diff --git a/platforms/android/build-tests/test_gradle.sh b/platforms/android/build-tests/test_gradle.sh
index 9f1b233ff26c..2145786e2349 100755
--- a/platforms/android/build-tests/test_gradle.sh
+++ b/platforms/android/build-tests/test_gradle.sh
@@ -1,9 +1,11 @@
 #!/bin/bash -e
 SDK_DIR=$1
+
 echo "OpenCV Android SDK path: ${SDK_DIR}"
 
 ANDROID_HOME=${ANDROID_HOME:-${ANDROID_SDK_ROOT:-${ANDROID_SDK?Required ANDROID_HOME/ANDROID_SDK/ANDROID_SDK_ROOT}}}
 ANDROID_NDK=${ANDROID_NDK_HOME-${ANDROID_NDK:-${NDKROOT?Required ANDROID_NDK_HOME/ANDROID_NDK/NDKROOT}}}
+OPENCV_GRADLE_VERBOSE_OPTIONS=${OPENCV_GRADLE_VERBOSE_OPTIONS:-'-i'}
 
 echo "Android SDK: ${ANDROID_HOME}"
 echo "Android NDK: ${ANDROID_NDK}"
@@ -29,8 +31,12 @@ rm -rf "test-gradle"
 cp -rp "${SDK_DIR}" "test-gradle"
 echo "Cloning OpenCV Android SDK ... Done!"
 
+# drop cmake bin name and "bin" folder from path
+echo "ndk.dir=${ANDROID_NDK}" > "test-gradle/samples/local.properties"
+echo "cmake.dir=$(dirname $(dirname $(which cmake)))" >> "test-gradle/samples/local.properties"
+
 echo "Run gradle ..."
-(cd "test-gradle/samples"; ./gradlew -i assemble)
+(cd "test-gradle/samples"; ./gradlew ${OPENCV_GRADLE_VERBOSE_OPTIONS} assemble)
 
 echo "#"
 echo "# Done!"
diff --git a/platforms/android/build-tests/test_gradle_aar.sh b/platforms/android/build-tests/test_gradle_aar.sh
new file mode 100755
index 000000000000..9aa37107f8f9
--- /dev/null
+++ b/platforms/android/build-tests/test_gradle_aar.sh
@@ -0,0 +1,48 @@
+#!/bin/bash -e
+SDK_DIR=$1
+LOCAL_MAVEN_REPO=$2
+echo "OpenCV Android SDK path: ${SDK_DIR}"
+echo "Use local maven repo from $LOCAL_MAVEN_REPO"
+
+ANDROID_HOME=${ANDROID_HOME:-${ANDROID_SDK_ROOT:-${ANDROID_SDK?Required ANDROID_HOME/ANDROID_SDK/ANDROID_SDK_ROOT}}}
+ANDROID_NDK=${ANDROID_NDK_HOME-${ANDROID_NDK:-${NDKROOT?Required ANDROID_NDK_HOME/ANDROID_NDK/NDKROOT}}}
+OPENCV_GRADLE_VERBOSE_OPTIONS=${OPENCV_GRADLE_VERBOSE_OPTIONS:-'-i'}
+
+echo "Android SDK: ${ANDROID_HOME}"
+echo "Android NDK: ${ANDROID_NDK}"
+
+if [ ! -d "${ANDROID_HOME}" ]; then
+  echo "FATAL: Missing Android SDK directory"
+  exit 1
+fi
+if [ ! -d "${ANDROID_NDK}" ]; then
+  echo "FATAL: Missing Android NDK directory"
+  exit 1
+fi
+
+export ANDROID_HOME=${ANDROID_HOME}
+export ANDROID_SDK=${ANDROID_HOME}
+export ANDROID_SDK_ROOT=${ANDROID_HOME}
+
+export ANDROID_NDK=${ANDROID_NDK}
+export ANDROID_NDK_HOME=${ANDROID_NDK}
+
+echo "Cloning OpenCV Android SDK ..."
+rm -rf "test-gradle-aar"
+mkdir test-gradle-aar
+cp -rp ${SDK_DIR}/samples/* test-gradle-aar/
+echo "Cloning OpenCV Android SDK ... Done!"
+
+# drop cmake bin name and "bin" folder from path
+echo "ndk.dir=${ANDROID_NDK}" > "test-gradle-aar/local.properties"
+echo "cmake.dir=$(dirname $(dirname $(which cmake)))" >> "test-gradle-aar/local.properties"
+
+sed -i "s/opencv_source = 'sdk_path'/opencv_source = 'maven_local'/g" test-gradle-aar/settings.gradle
+sed -i "s+opencv_maven_path = '<path_to_maven_repo>'+opencv_maven_path = 'file\\://$LOCAL_MAVEN_REPO'+g" test-gradle-aar/settings.gradle
+
+echo "Run gradle ..."
+(cd "test-gradle-aar"; ./gradlew ${OPENCV_GRADLE_VERBOSE_OPTIONS} assemble)
+
+echo "#"
+echo "# Done!"
+echo "#"
diff --git a/platforms/android/build_aar.sh b/platforms/android/build_aar.sh
new file mode 100755
index 000000000000..79be25658a56
--- /dev/null
+++ b/platforms/android/build_aar.sh
@@ -0,0 +1,46 @@
+#!/bin/bash -e
+SDK_DIR=$1
+
+echo "OpenCV Android SDK path: ${SDK_DIR}"
+
+ANDROID_HOME=${ANDROID_HOME:-${ANDROID_SDK_ROOT:-${ANDROID_SDK?Required ANDROID_HOME/ANDROID_SDK/ANDROID_SDK_ROOT}}}
+ANDROID_NDK=${ANDROID_NDK_HOME-${ANDROID_NDK:-${NDKROOT?Required ANDROID_NDK_HOME/ANDROID_NDK/NDKROOT}}}
+OPENCV_GRADLE_VERBOSE_OPTIONS=${OPENCV_GRADLE_VERBOSE_OPTIONS:-'-i'}
+
+echo "Android SDK: ${ANDROID_HOME}"
+echo "Android NDK: ${ANDROID_NDK}"
+
+if [ ! -d "${ANDROID_HOME}" ]; then
+  echo "FATAL: Missing Android SDK directory"
+  exit 1
+fi
+if [ ! -d "${ANDROID_NDK}" ]; then
+  echo "FATAL: Missing Android NDK directory"
+  exit 1
+fi
+
+export ANDROID_HOME=${ANDROID_HOME}
+export ANDROID_SDK=${ANDROID_HOME}
+export ANDROID_SDK_ROOT=${ANDROID_HOME}
+
+export ANDROID_NDK=${ANDROID_NDK}
+export ANDROID_NDK_HOME=${ANDROID_NDK}
+
+echo "Cloning OpenCV Android SDK ..."
+rm -rf "aar-build"
+cp -rp "${SDK_DIR}" "aar-build"
+echo "Cloning OpenCV Android SDK ... Done!"
+
+# drop cmake bin name and "bin" folder from path
+echo "ndk.dir=${ANDROID_NDK}" > "aar-build/samples/local.properties"
+echo "cmake.dir=$(dirname $(dirname $(which cmake)))" >> "aar-build/samples/local.properties"
+
+echo "Run gradle ..."
+(cd "aar-build/samples"; ./gradlew ${OPENCV_GRADLE_VERBOSE_OPTIONS} opencv:publishReleasePublicationToMyrepoRepository)
+
+mkdir "maven_repo"
+cp -r aar-build/sdk/build/repo/* ./maven_repo/
+
+echo "#"
+echo "# Done!"
+echo "#"
diff --git a/platforms/android/build_java_shared_aar.py b/platforms/android/build_java_shared_aar.py
new file mode 100755
index 000000000000..8e17a7a4d6a8
--- /dev/null
+++ b/platforms/android/build_java_shared_aar.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+
+import argparse
+from os import path
+import os
+import re
+import shutil
+import string
+import subprocess
+
+
+COPY_FROM_SDK_TO_ANDROID_PROJECT = [
+    ["sdk/native/jni/include", "OpenCV/src/main/cpp/include"],
+    ["sdk/java/src/org", "OpenCV/src/main/java/org"],
+    ["sdk/java/res", "OpenCV/src/main/res"]
+]
+
+COPY_FROM_SDK_TO_APK = [
+    ["sdk/native/libs/<ABI>/lib<LIB_NAME>.so", "jni/<ABI>/lib<LIB_NAME>.so"],
+    ["sdk/native/libs/<ABI>/lib<LIB_NAME>.so", "prefab/modules/<LIB_NAME>/libs/android.<ABI>/lib<LIB_NAME>.so"],
+]
+
+ANDROID_PROJECT_TEMPLATE_DIR = path.join(path.dirname(__file__), "aar-template")
+TEMP_DIR = "build_java_shared"
+ANDROID_PROJECT_DIR = path.join(TEMP_DIR, "AndroidProject")
+COMPILED_AAR_PATH_1 = path.join(ANDROID_PROJECT_DIR, "OpenCV/build/outputs/aar/OpenCV-release.aar") # original package name
+COMPILED_AAR_PATH_2 = path.join(ANDROID_PROJECT_DIR, "OpenCV/build/outputs/aar/opencv-release.aar") # lower case package name
+AAR_UNZIPPED_DIR = path.join(TEMP_DIR, "aar_unzipped")
+FINAL_AAR_PATH_TEMPLATE = "outputs/opencv_java_shared_<OPENCV_VERSION>.aar"
+FINAL_REPO_PATH = "outputs/maven_repo"
+MAVEN_PACKAGE_NAME = "opencv"
+
+def fill_template(src_path, dst_path, args_dict):
+    with open(src_path, "r") as f:
+        template_text = f.read()
+    template = string.Template(template_text)
+    text = template.safe_substitute(args_dict)
+    with open(dst_path, "w") as f:
+        f.write(text)
+
+def get_opencv_version(opencv_sdk_path):
+    version_hpp_path = path.join(opencv_sdk_path, "sdk/native/jni/include/opencv2/core/version.hpp")
+    with open(version_hpp_path, "rt") as f:
+        data = f.read()
+        major = re.search(r'^#define\W+CV_VERSION_MAJOR\W+(\d+)$', data, re.MULTILINE).group(1)
+        minor = re.search(r'^#define\W+CV_VERSION_MINOR\W+(\d+)$', data, re.MULTILINE).group(1)
+        revision = re.search(r'^#define\W+CV_VERSION_REVISION\W+(\d+)$', data, re.MULTILINE).group(1)
+        return "%(major)s.%(minor)s.%(revision)s" % locals()
+
+def get_compiled_aar_path(path1, path2):
+    if path.exists(path1):
+        return path1
+    elif path.exists(path2):
+        return path2
+    else:
+        raise Exception("Can't find compiled AAR path in [" + path1 + ", " + path2 + "]")
+
+def cleanup(paths_to_remove):
+    exists = False
+    for p in paths_to_remove:
+        if path.exists(p):
+            exists = True
+            if path.isdir(p):
+                shutil.rmtree(p)
+            else:
+                os.remove(p)
+            print("Removed", p)
+    if not exists:
+        print("Nothing to remove")
+
+def main(args):
+    opencv_version = get_opencv_version(args.opencv_sdk_path)
+    abis = os.listdir(path.join(args.opencv_sdk_path, "sdk/native/libs"))
+    lib_name = "opencv_java" + opencv_version.split(".")[0]
+    final_aar_path = FINAL_AAR_PATH_TEMPLATE.replace("<OPENCV_VERSION>", opencv_version)
+
+    print("Removing data from previous runs...")
+    cleanup([TEMP_DIR, final_aar_path, path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME)])
+
+    print("Preparing Android project...")
+    # ANDROID_PROJECT_TEMPLATE_DIR contains an Android project template that creates AAR
+    shutil.copytree(ANDROID_PROJECT_TEMPLATE_DIR, ANDROID_PROJECT_DIR)
+
+    # Configuring the Android project to Java + shared C++ lib version
+    shutil.rmtree(path.join(ANDROID_PROJECT_DIR, "OpenCV/src/main/cpp/include"))
+
+    fill_template(path.join(ANDROID_PROJECT_DIR, "OpenCV/build.gradle.template"),
+                  path.join(ANDROID_PROJECT_DIR, "OpenCV/build.gradle"),
+                  {"LIB_NAME": lib_name,
+                   "LIB_TYPE": "c++_shared",
+                   "PACKAGE_NAME": MAVEN_PACKAGE_NAME,
+                   "OPENCV_VERSION": opencv_version,
+                   "COMPILE_SDK": args.android_compile_sdk,
+                   "MIN_SDK": args.android_min_sdk,
+                   "TARGET_SDK": args.android_target_sdk,
+                   "ABI_FILTERS": ", ".join(['"' + x + '"' for x in abis]),
+                   "JAVA_VERSION": args.java_version,
+                   })
+    fill_template(path.join(ANDROID_PROJECT_DIR, "OpenCV/src/main/cpp/CMakeLists.txt.template"),
+                  path.join(ANDROID_PROJECT_DIR, "OpenCV/src/main/cpp/CMakeLists.txt"),
+                  {"LIB_NAME": lib_name, "LIB_TYPE": "SHARED"})
+
+    local_props = ""
+    if args.ndk_location:
+        local_props += "ndk.dir=" + args.ndk_location + "\n"
+    if args.cmake_location:
+        local_props += "cmake.dir=" + args.cmake_location + "\n"
+
+    if local_props:
+        with open(path.join(ANDROID_PROJECT_DIR, "local.properties"), "wt") as f:
+            f.write(local_props)
+
+    # Copying Java code and C++ public headers from SDK to the Android project
+    for src, dst in COPY_FROM_SDK_TO_ANDROID_PROJECT:
+        shutil.copytree(path.join(args.opencv_sdk_path, src),
+                        path.join(ANDROID_PROJECT_DIR, dst))
+
+    print("Running gradle assembleRelease...")
+    # Running gradle to build the Android project
+    cmd = ["./gradlew", "assembleRelease"]
+    if args.offline:
+        cmd = cmd + ["--offline"]
+    subprocess.run(cmd, shell=False, cwd=ANDROID_PROJECT_DIR, check=True)
+
+    print("Adding libs to AAR...")
+    # The created AAR package doesn't contain C++ shared libs.
+    # We need to add them manually.
+    # AAR package is just a zip archive.
+    complied_aar_path = get_compiled_aar_path(COMPILED_AAR_PATH_1, COMPILED_AAR_PATH_2) # two possible paths
+    shutil.unpack_archive(complied_aar_path, AAR_UNZIPPED_DIR, "zip")
+
+    for abi in abis:
+        for src, dst in COPY_FROM_SDK_TO_APK:
+            src = src.replace("<ABI>", abi).replace("<LIB_NAME>", lib_name)
+            dst = dst.replace("<ABI>", abi).replace("<LIB_NAME>", lib_name)
+            shutil.copy(path.join(args.opencv_sdk_path, src),
+                path.join(AAR_UNZIPPED_DIR, dst))
+
+    # Creating final AAR zip archive
+    os.makedirs("outputs", exist_ok=True)
+    shutil.make_archive(final_aar_path, "zip", AAR_UNZIPPED_DIR, ".")
+    os.rename(final_aar_path + ".zip", final_aar_path)
+
+    print("Creating local maven repo...")
+
+    shutil.copy(final_aar_path, path.join(ANDROID_PROJECT_DIR, "OpenCV/opencv-release.aar"))
+
+    print("Creating a maven repo from project sources (with sources jar and javadoc jar)...")
+    cmd = ["./gradlew", "publishReleasePublicationToMyrepoRepository"]
+    if args.offline:
+        cmd = cmd + ["--offline"]
+    subprocess.run(cmd, shell=False, cwd=ANDROID_PROJECT_DIR, check=True)
+
+    os.makedirs(path.join(FINAL_REPO_PATH, "org/opencv"), exist_ok=True)
+    shutil.move(path.join(ANDROID_PROJECT_DIR, "OpenCV/build/repo/org/opencv", MAVEN_PACKAGE_NAME),
+                path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME))
+
+    print("Creating a maven repo from modified AAR (with cpp libraries)...")
+    cmd = ["./gradlew", "publishModifiedPublicationToMyrepoRepository"]
+    if args.offline:
+        cmd = cmd + ["--offline"]
+    subprocess.run(cmd, shell=False, cwd=ANDROID_PROJECT_DIR, check=True)
+
+    # Replacing AAR from the first maven repo with modified AAR from the second maven repo
+    shutil.copytree(path.join(ANDROID_PROJECT_DIR, "OpenCV/build/repo/org/opencv", MAVEN_PACKAGE_NAME),
+                    path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME),
+                    dirs_exist_ok=True)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Builds AAR with Java and shared C++ libs from OpenCV SDK")
+    parser.add_argument('opencv_sdk_path')
+    parser.add_argument('--android_compile_sdk', default="31")
+    parser.add_argument('--android_min_sdk', default="21")
+    parser.add_argument('--android_target_sdk', default="31")
+    parser.add_argument('--java_version', default="1_8")
+    parser.add_argument('--ndk_location', default="")
+    parser.add_argument('--cmake_location', default="")
+    parser.add_argument('--offline', action="store_true", help="Force Gradle use offline mode")
+    args = parser.parse_args()
+
+    main(args)
diff --git a/platforms/android/build_sdk.py b/platforms/android/build_sdk.py
index f445be10f659..6e03698a5eb0 100755
--- a/platforms/android/build_sdk.py
+++ b/platforms/android/build_sdk.py
@@ -161,6 +161,7 @@ def __init__(self, workdir, opencvdir, config):
         self.opencl = True if config.opencl else False
         self.no_kotlin = True if config.no_kotlin else False
         self.shared = True if config.shared else False
+        self.disable = args.disable
 
     def get_cmake(self):
         if not self.config.use_android_buildtools and check_executable(['cmake', '--version']):
@@ -215,7 +216,7 @@ def clean_library_build_dir(self):
         for d in ["CMakeCache.txt", "CMakeFiles/", "bin/", "libs/", "lib/", "package/", "install/samples/"]:
             rm_one(d)
 
-    def build_library(self, abi, do_install):
+    def build_library(self, abi, do_install, no_media_ndk):
         cmd = [self.cmake_path, "-GNinja"]
         cmake_vars = dict(
             CMAKE_TOOLCHAIN_FILE=self.get_toolchain_file(),
@@ -250,17 +251,25 @@ def build_library(self, abi, do_install):
             cmake_vars['BUILD_SHARED_LIBS'] = "ON"
 
         if self.config.modules_list is not None:
-            cmd.append("-DBUILD_LIST='%s'" % self.config.modules_list)
+            cmake_vars['BUILD_LIST'] = '%s' % self.config.modules_list
 
         if self.config.extra_modules_path is not None:
-            cmd.append("-DOPENCV_EXTRA_MODULES_PATH='%s'" % self.config.extra_modules_path)
+            cmake_vars['OPENCV_EXTRA_MODULES_PATH'] = '%s' % self.config.extra_modules_path
 
         if self.use_ccache == True:
-            cmd.append("-DNDK_CCACHE=ccache")
+            cmake_vars['NDK_CCACHE'] = 'ccache'
         if do_install:
-            cmd.extend(["-DBUILD_TESTS=ON", "-DINSTALL_TESTS=ON"])
+            cmake_vars['BUILD_TESTS'] = "ON"
+            cmake_vars['INSTALL_TESTS'] = "ON"
+
+        if no_media_ndk:
+            cmake_vars['WITH_ANDROID_MEDIANDK'] = "OFF"
 
         cmake_vars.update(abi.cmake_vars)
+
+        if len(self.disable) > 0:
+            cmake_vars.update({'WITH_%s' % f : "OFF" for f in self.disable})
+
         cmd += [ "-D%s='%s'" % (k, v) for (k, v) in cmake_vars.items() if v is not None]
         cmd.append(self.opencvdir)
         execute(cmd)
@@ -270,7 +279,7 @@ def build_library(self, abi, do_install):
         if self.no_samples_build:
             execute([self.ninja_path, "install" if (self.debug_info or self.debug) else "install/strip"])
         else:
-            execute([self.ninja_path, "-j1" if (self.debug_info or self.debug) else "-j3", "install" if (self.debug_info or self.debug) else "install/strip"])
+            execute([self.ninja_path, "-j1", "install" if (self.debug_info or self.debug) else "install/strip"])
 
     def build_javadoc(self):
         classpaths = []
@@ -370,6 +379,8 @@ def get_ndk_dir():
     parser.add_argument('--opencl', action="store_true", help="Enable OpenCL support")
     parser.add_argument('--no_kotlin', action="store_true", help="Disable Kotlin extensions")
     parser.add_argument('--shared', action="store_true", help="Build shared libraries")
+    parser.add_argument('--no_media_ndk', action="store_true", help="Do not link Media NDK (required for video I/O support)")
+    parser.add_argument('--disable', metavar='FEATURE', default=[], action='append', help='OpenCV features to disable (add WITH_*=OFF). To disable multiple, specify this flag again, e.g. "--disable TBB --disable OPENMP"')
     args = parser.parse_args()
 
     log.basicConfig(format='%(message)s', level=log.DEBUG)
@@ -447,7 +458,7 @@ def get_ndk_dir():
 
         os.chdir(builder.libdest)
         builder.clean_library_build_dir()
-        builder.build_library(abi, do_install)
+        builder.build_library(abi, do_install, args.no_media_ndk)
 
     builder.gather_results()
 
diff --git a/platforms/android/build_static_aar.py b/platforms/android/build_static_aar.py
new file mode 100755
index 000000000000..56cfbcbc1e0c
--- /dev/null
+++ b/platforms/android/build_static_aar.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+from os import path
+import os
+import shutil
+import subprocess
+
+from build_java_shared_aar import cleanup, fill_template, get_compiled_aar_path, get_opencv_version
+
+
+ANDROID_PROJECT_TEMPLATE_DIR = path.join(path.dirname(__file__), "aar-template")
+TEMP_DIR = "build_static"
+ANDROID_PROJECT_DIR = path.join(TEMP_DIR, "AndroidProject")
+COMPILED_AAR_PATH_1 = path.join(ANDROID_PROJECT_DIR, "OpenCV/build/outputs/aar/OpenCV-release.aar") # original package name
+COMPILED_AAR_PATH_2 = path.join(ANDROID_PROJECT_DIR, "OpenCV/build/outputs/aar/opencv-release.aar") # lower case package name
+AAR_UNZIPPED_DIR = path.join(TEMP_DIR, "aar_unzipped")
+FINAL_AAR_PATH_TEMPLATE = "outputs/opencv_static_<OPENCV_VERSION>.aar"
+FINAL_REPO_PATH = "outputs/maven_repo"
+MAVEN_PACKAGE_NAME = "opencv-static"
+
+
+def get_list_of_opencv_libs(sdk_dir):
+    files = os.listdir(path.join(sdk_dir, "sdk/native/staticlibs/arm64-v8a"))
+    libs = [f[3:-2] for f in files if f[:3] == "lib" and f[-2:] == ".a"]
+    return libs
+
+def get_list_of_3rdparty_libs(sdk_dir, abis):
+    libs = []
+    for abi in abis:
+        files = os.listdir(path.join(sdk_dir, "sdk/native/3rdparty/libs/" + abi))
+        cur_libs = [f[3:-2] for f in files if f[:3] == "lib" and f[-2:] == ".a"]
+        for lib in cur_libs:
+            if lib not in libs:
+                libs.append(lib)
+    return libs
+
+def add_printing_linked_libs(sdk_dir, opencv_libs):
+    """
+    Modifies CMakeLists.txt file in Android project, so it prints linked libraries for each OpenCV library"
+    """
+    sdk_jni_dir = sdk_dir + "/sdk/native/jni"
+    with open(path.join(ANDROID_PROJECT_DIR, "OpenCV/src/main/cpp/CMakeLists.txt"), "a") as f:
+        f.write('\nset(OpenCV_DIR "' + sdk_jni_dir + '")\n')
+        f.write('find_package(OpenCV REQUIRED)\n')
+        for lib_name in opencv_libs:
+            output_filename_prefix = "linkedlibs." + lib_name + "."
+            f.write('get_target_property(OUT "' + lib_name + '" INTERFACE_LINK_LIBRARIES)\n')
+            f.write('file(WRITE "' + output_filename_prefix + '${ANDROID_ABI}.txt" "${OUT}")\n')
+
+def read_linked_libs(lib_name, abis):
+    """
+    Reads linked libs for each OpenCV library from files, that was generated by gradle. See add_printing_linked_libs()
+    """
+    deps_lists = []
+    for abi in abis:
+         with open(path.join(ANDROID_PROJECT_DIR, "OpenCV/src/main/cpp", f"linkedlibs.{lib_name}.{abi}.txt")) as f:
+            text = f.read()
+            linked_libs = text.split(";")
+            linked_libs = [x.replace("$<LINK_ONLY:", "").replace(">", "") for x in linked_libs]
+            deps_lists.append(linked_libs)
+
+    return merge_dependencies_lists(deps_lists)
+
+def merge_dependencies_lists(deps_lists):
+    """
+    One library may have different dependencies for different ABIS.
+    We need to merge them into one list with all the dependencies preserving the order.
+    """
+    result = []
+    for d_list in deps_lists:
+        for i in range(len(d_list)):
+            if d_list[i] not in result:
+                if i == 0:
+                    result.append(d_list[i])
+                else:
+                    index = result.index(d_list[i-1])
+                    result = result[:index + 1] + [d_list[i]] + result[index + 1:]
+
+    return result
+
+def convert_deps_list_to_prefab(linked_libs, opencv_libs, external_libs):
+    """
+    Converting list of dependencies into prefab format.
+    """
+    prefab_linked_libs = []
+    for lib in linked_libs:
+        if (lib in opencv_libs) or (lib in external_libs):
+            prefab_linked_libs.append(":" + lib)
+        elif (lib[:3] == "lib" and lib[3:] in external_libs):
+            prefab_linked_libs.append(":" + lib[3:])
+        elif lib == "ocv.3rdparty.android_mediandk":
+            prefab_linked_libs += ["-landroid", "-llog", "-lmediandk"]
+            print("Warning: manualy handled ocv.3rdparty.android_mediandk dependency")
+        elif lib == "ocv.3rdparty.flatbuffers":
+            print("Warning: manualy handled ocv.3rdparty.flatbuffers dependency")
+        elif lib.startswith("ocv.3rdparty"):
+            raise Exception("Unknown lib " + lib)
+        else:
+            prefab_linked_libs.append("-l" + lib)
+    return prefab_linked_libs
+
+def main(args):
+    opencv_version = get_opencv_version(args.opencv_sdk_path)
+    abis = os.listdir(path.join(args.opencv_sdk_path, "sdk/native/libs"))
+    final_aar_path = FINAL_AAR_PATH_TEMPLATE.replace("<OPENCV_VERSION>", opencv_version)
+    sdk_dir = args.opencv_sdk_path
+
+    print("Removing data from previous runs...")
+    cleanup([TEMP_DIR, final_aar_path, path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME)])
+
+    print("Preparing Android project...")
+    # ANDROID_PROJECT_TEMPLATE_DIR contains an Android project template that creates AAR
+    shutil.copytree(ANDROID_PROJECT_TEMPLATE_DIR, ANDROID_PROJECT_DIR)
+
+    # Configuring the Android project to static C++ libs version
+    fill_template(path.join(ANDROID_PROJECT_DIR, "OpenCV/build.gradle.template"),
+                  path.join(ANDROID_PROJECT_DIR, "OpenCV/build.gradle"),
+                  {"LIB_NAME": "templib",
+                   "LIB_TYPE": "c++_static",
+                   "PACKAGE_NAME": MAVEN_PACKAGE_NAME,
+                   "OPENCV_VERSION": opencv_version,
+                   "COMPILE_SDK": args.android_compile_sdk,
+                   "MIN_SDK": args.android_min_sdk,
+                   "TARGET_SDK": args.android_target_sdk,
+                   "ABI_FILTERS": ", ".join(['"' + x + '"' for x in abis]),
+                   "JAVA_VERSION": args.java_version,
+                   })
+    fill_template(path.join(ANDROID_PROJECT_DIR, "OpenCV/src/main/cpp/CMakeLists.txt.template"),
+                  path.join(ANDROID_PROJECT_DIR, "OpenCV/src/main/cpp/CMakeLists.txt"),
+                  {"LIB_NAME": "templib", "LIB_TYPE": "STATIC"})
+
+    local_props = ""
+    if args.ndk_location:
+        local_props += "ndk.dir=" + args.ndk_location + "\n"
+    if args.cmake_location:
+        local_props += "cmake.dir=" + args.cmake_location + "\n"
+
+    if local_props:
+        with open(path.join(ANDROID_PROJECT_DIR, "local.properties"), "wt") as f:
+            f.write(local_props)
+
+    opencv_libs = get_list_of_opencv_libs(sdk_dir)
+    external_libs = get_list_of_3rdparty_libs(sdk_dir, abis)
+
+    add_printing_linked_libs(sdk_dir, opencv_libs)
+
+    print("Running gradle assembleRelease...")
+    cmd = ["./gradlew", "assembleRelease"]
+    if args.offline:
+        cmd = cmd + ["--offline"]
+    # Running gradle to build the Android project
+    subprocess.run(cmd, shell=False, cwd=ANDROID_PROJECT_DIR, check=True)
+
+    # The created AAR package contains only one empty libtemplib.a library.
+    # We need to add OpenCV libraries manually.
+    # AAR package is just a zip archive
+    complied_aar_path = get_compiled_aar_path(COMPILED_AAR_PATH_1, COMPILED_AAR_PATH_2) # two possible paths
+    shutil.unpack_archive(complied_aar_path, AAR_UNZIPPED_DIR, "zip")
+
+    print("Adding libs to AAR...")
+
+    # Copying 3rdparty libs from SDK into the AAR
+    for lib in external_libs:
+        for abi in abis:
+            os.makedirs(path.join(AAR_UNZIPPED_DIR, "prefab/modules/" + lib + "/libs/android." + abi))
+            if path.exists(path.join(sdk_dir, "sdk/native/3rdparty/libs/" + abi, "lib" + lib + ".a")):
+                shutil.copy(path.join(sdk_dir, "sdk/native/3rdparty/libs/" + abi, "lib" + lib + ".a"),
+                            path.join(AAR_UNZIPPED_DIR, "prefab/modules/" + lib + "/libs/android." + abi, "lib" + lib + ".a"))
+            else:
+                # One OpenCV library may have different dependency lists for different ABIs, but we can write only one
+                # full dependency list for all ABIs. So we just add empty .a library if this ABI doesn't have this dependency.
+                shutil.copy(path.join(AAR_UNZIPPED_DIR, "prefab/modules/templib/libs/android." + abi, "libtemplib.a"),
+                            path.join(AAR_UNZIPPED_DIR, "prefab/modules/" + lib + "/libs/android." + abi, "lib" + lib + ".a"))
+            shutil.copy(path.join(AAR_UNZIPPED_DIR, "prefab/modules/templib/libs/android." + abi + "/abi.json"),
+                        path.join(AAR_UNZIPPED_DIR, "prefab/modules/" + lib + "/libs/android." + abi + "/abi.json"))
+        shutil.copy(path.join(AAR_UNZIPPED_DIR, "prefab/modules/templib/module.json"),
+                    path.join(AAR_UNZIPPED_DIR, "prefab/modules/" + lib + "/module.json"))
+
+    # Copying OpenV libs from SDK into the AAR
+    for lib in opencv_libs:
+        for abi in abis:
+            os.makedirs(path.join(AAR_UNZIPPED_DIR, "prefab/modules/" + lib + "/libs/android." + abi))
+            shutil.copy(path.join(sdk_dir, "sdk/native/staticlibs/" + abi, "lib" + lib + ".a"),
+                        path.join(AAR_UNZIPPED_DIR, "prefab/modules/" + lib + "/libs/android." + abi, "lib" + lib + ".a"))
+            shutil.copy(path.join(AAR_UNZIPPED_DIR, "prefab/modules/templib/libs/android." + abi + "/abi.json"),
+                        path.join(AAR_UNZIPPED_DIR, "prefab/modules/" + lib + "/libs/android." + abi + "/abi.json"))
+        os.makedirs(path.join(AAR_UNZIPPED_DIR, "prefab/modules/" + lib + "/include/opencv2"))
+        shutil.copy(path.join(sdk_dir, "sdk/native/jni/include/opencv2/" + lib.replace("opencv_", "") + ".hpp"),
+                    path.join(AAR_UNZIPPED_DIR, "prefab/modules/" + lib + "/include/opencv2/" + lib.replace("opencv_", "") + ".hpp"))
+        shutil.copytree(path.join(sdk_dir, "sdk/native/jni/include/opencv2/" + lib.replace("opencv_", "")),
+                        path.join(AAR_UNZIPPED_DIR, "prefab/modules/" + lib + "/include/opencv2/" + lib.replace("opencv_", "")))
+
+        # Adding dependencies list
+        module_json_text = {
+            "export_libraries": convert_deps_list_to_prefab(read_linked_libs(lib, abis), opencv_libs, external_libs),
+            "android": {},
+        }
+        with open(path.join(AAR_UNZIPPED_DIR, "prefab/modules/" + lib + "/module.json"), "w") as f:
+            json.dump(module_json_text, f)
+
+    for h_file in ("cvconfig.h", "opencv.hpp", "opencv_modules.hpp"):
+        shutil.copy(path.join(sdk_dir, "sdk/native/jni/include/opencv2/" + h_file),
+                    path.join(AAR_UNZIPPED_DIR, "prefab/modules/opencv_core/include/opencv2/" + h_file))
+
+
+    shutil.rmtree(path.join(AAR_UNZIPPED_DIR, "prefab/modules/templib"))
+
+    # Creating final AAR zip archive
+    os.makedirs("outputs", exist_ok=True)
+    shutil.make_archive(final_aar_path, "zip", AAR_UNZIPPED_DIR, ".")
+    os.rename(final_aar_path + ".zip", final_aar_path)
+
+    print("Creating local maven repo...")
+
+    shutil.copy(final_aar_path, path.join(ANDROID_PROJECT_DIR, "OpenCV/opencv-release.aar"))
+
+    print("Creating a maven repo from project sources (with sources jar and javadoc jar)...")
+    cmd = ["./gradlew", "publishReleasePublicationToMyrepoRepository"]
+    if args.offline:
+        cmd = cmd + ["--offline"]
+    subprocess.run(cmd, shell=False, cwd=ANDROID_PROJECT_DIR, check=True)
+
+    os.makedirs(path.join(FINAL_REPO_PATH, "org/opencv"), exist_ok=True)
+    shutil.move(path.join(ANDROID_PROJECT_DIR, "OpenCV/build/repo/org/opencv", MAVEN_PACKAGE_NAME),
+                path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME))
+
+    print("Creating a maven repo from modified AAR (with cpp libraries)...")
+    cmd = ["./gradlew", "publishModifiedPublicationToMyrepoRepository"]
+    if args.offline:
+        cmd = cmd + ["--offline"]
+    subprocess.run(cmd, shell=False, cwd=ANDROID_PROJECT_DIR, check=True)
+
+    # Replacing AAR from the first maven repo with modified AAR from the second maven repo
+    shutil.copytree(path.join(ANDROID_PROJECT_DIR, "OpenCV/build/repo/org/opencv", MAVEN_PACKAGE_NAME),
+                    path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME),
+                    dirs_exist_ok=True)
+
+    print("Done")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Builds AAR with static C++ libs from OpenCV SDK")
+    parser.add_argument('opencv_sdk_path')
+    parser.add_argument('--android_compile_sdk', default="31")
+    parser.add_argument('--android_min_sdk', default="21")
+    parser.add_argument('--android_target_sdk', default="31")
+    parser.add_argument('--java_version', default="1_8")
+    parser.add_argument('--ndk_location', default="")
+    parser.add_argument('--cmake_location', default="")
+    parser.add_argument('--offline', action="store_true", help="Force Gradle use offline mode")
+    args = parser.parse_args()
+
+    main(args)
diff --git a/platforms/android/gradle-wrapper/gradlew b/platforms/android/gradle-wrapper/gradlew
old mode 100644
new mode 100755
diff --git a/platforms/android/ndk-25.config.py b/platforms/android/ndk-25.config.py
index b6b12126eb77..5ca8525d5c6c 100644
--- a/platforms/android/ndk-25.config.py
+++ b/platforms/android/ndk-25.config.py
@@ -9,7 +9,7 @@
     # Docs: https://developer.android.com/studio/releases/gradle-plugin
     'ANDROID_GRADLE_PLUGIN_VERSION': '7.3.1',
     'GRADLE_VERSION': '7.5.1',
-    'KOTLIN_PLUGIN_VERSION': '1.5.20',
+    'KOTLIN_PLUGIN_VERSION': '1.8.20',
 }
 ABIs = [
     ABI("2", "armeabi-v7a", None, ndk_api_level=ANDROID_NATIVE_API_LEVEL, cmake_vars=cmake_common_vars),
diff --git a/platforms/android/service/CMakeLists.txt b/platforms/android/service/CMakeLists.txt
deleted file mode 100644
index 66e0c468a9d1..000000000000
--- a/platforms/android/service/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-if(NOT ANDROID_PROJECTS_BUILD_TYPE STREQUAL "ANT")
-  message(STATUS "Android OpenCV Manager is ignored")
-  return()
-endif()
-
-if(BUILD_ANDROID_SERVICE)
-  add_subdirectory(engine)
-endif()
-
-install(FILES "readme.txt" DESTINATION "apk/" COMPONENT libs)
diff --git a/platforms/android/service/engine/AndroidManifest.xml b/platforms/android/service/engine/AndroidManifest.xml
deleted file mode 100644
index 660152ed293d..000000000000
--- a/platforms/android/service/engine/AndroidManifest.xml
+++ /dev/null
@@ -1,30 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="org.opencv.engine"
-    android:versionCode="345@ANDROID_PLATFORM_ID@"
-    android:versionName="3.45">
-
-    <uses-sdk android:minSdkVersion="@ANDROID_NATIVE_API_LEVEL@" android:targetSdkVersion="22"/>
-    <uses-feature android:name="android.hardware.touchscreen" android:required="false"/>
-
-    <application
-        android:icon="@drawable/icon"
-        android:label="@string/app_name" android:allowBackup="true">
-
-    <service android:exported="true" android:name="OpenCVEngineService" android:process=":OpenCVEngineProcess">
-        <intent-filter>
-            <action android:name="org.opencv.engine.BIND"></action>
-        </intent-filter>
-    </service>
-
-    <activity
-        android:name="org.opencv.engine.manager.ManagerActivity"
-        android:label="@string/app_name"
-        android:screenOrientation="portrait">
-        <intent-filter>
-            <action android:name="android.intent.action.MAIN" />
-            <category android:name="android.intent.category.LAUNCHER" />
-        </intent-filter>
-    </activity>
-    </application>
-</manifest>
diff --git a/platforms/android/service/engine/CMakeLists.txt b/platforms/android/service/engine/CMakeLists.txt
deleted file mode 100644
index a31790a90316..000000000000
--- a/platforms/android/service/engine/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${ANDROID_MANIFEST_FILE}" "${OpenCV_BINARY_DIR}/platforms/android/service/engine/.build/${ANDROID_MANIFEST_FILE}"  @ONLY)
-unset(__android_project_chain CACHE)
-add_android_project(opencv_engine "${CMAKE_CURRENT_SOURCE_DIR}" SDK_TARGET 9 ${ANDROID_SDK_TARGET} IGNORE_JAVA ON IGNORE_MANIFEST ON COPY_LIBS ON)
diff --git a/platforms/android/service/engine/res/layout/main.xml b/platforms/android/service/engine/res/layout/main.xml
deleted file mode 100644
index a4717c8bf1fb..000000000000
--- a/platforms/android/service/engine/res/layout/main.xml
+++ /dev/null
@@ -1,49 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<ScrollView xmlns:android="http://schemas.android.com/apk/res/android"
-    android:id="@+id/ScrollBox"
-    android:layout_width="fill_parent"
-    android:layout_height="fill_parent" >
-
-    <LinearLayout
-        android:layout_width="fill_parent"
-        android:layout_height="wrap_content"
-        android:gravity="center_vertical|center_horizontal"
-        android:orientation="vertical"
-        android:scrollbarStyle="insideInset" >
-
-        <TextView
-            android:id="@+id/textView4"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:text="@string/about"
-            android:textAppearance="?android:attr/textAppearanceLarge" />
-
-        <TextView
-            android:id="@+id/textViewIntro"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:autoLink="web"
-            android:text="@string/intro"
-            android:textAppearance="?android:attr/textAppearanceSmall" />
-
-        <TextView
-            android:id="@+id/textView5"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:textAppearance="?android:attr/textAppearanceSmall" />
-
-        <TextView
-            android:id="@+id/textView6"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:textAppearance="?android:attr/textAppearanceSmall" />
-
-        <Button
-            android:id="@+id/CheckEngineUpdate"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_gravity="center_vertical|center_horizontal"
-            android:text="@string/checkUpdate" />
-    </LinearLayout>
-
-</ScrollView>
diff --git a/platforms/android/service/engine/res/values/strings.xml b/platforms/android/service/engine/res/values/strings.xml
deleted file mode 100644
index 2cd62ed2db0e..000000000000
--- a/platforms/android/service/engine/res/values/strings.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<resources>
-    <string name="app_name">OpenCV Manager</string>
-    <string name="about">About</string>
-    <string name="checkUpdate">Check for update</string>
-    <string name="intro">OpenCV library is used by other applications for image enhancement, panorama stitching, object detection, recognition and tracking and so on. OpenCV Manager provides the best version of the OpenCV for your hardware. See opencv.org for details.</string>
-</resources>
diff --git a/platforms/android/service/engine/src/org/opencv/engine/HardwareDetector.java b/platforms/android/service/engine/src/org/opencv/engine/HardwareDetector.java
deleted file mode 100644
index 60a7cec289b9..000000000000
--- a/platforms/android/service/engine/src/org/opencv/engine/HardwareDetector.java
+++ /dev/null
@@ -1,118 +0,0 @@
-package org.opencv.engine;
-
-import java.io.File;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Scanner;
-import java.util.regex.Pattern;
-import java.util.regex.Matcher;
-import android.os.Build;
-import android.text.TextUtils;
-import android.util.Log;
-
-public class HardwareDetector {
-    private static String TAG = "OpenCVEngine/HardwareDetector";
-
-    public static final int ARCH_UNKNOWN = -1;
-
-    public static final int ARCH_X86 = 0x01000000;
-    public static final int ARCH_X86_64 = 0x02000000;
-    public static final int ARCH_ARM = 0x04000000;
-    public static final int ARCH_ARMv7 = 0x10000000;
-    public static final int ARCH_ARMv8 = 0x20000000;
-    public static final int ARCH_MIPS = 0x40000000;
-    public static final int ARCH_MIPS_64 = 0x80000000;
-
-    // Return CPU flags list
-    public static List<String> getFlags() {
-        Map<String, String> raw = getRawCpuInfo();
-        String f = raw.get("flags");
-        if (f == null)
-            f = raw.get("Features");
-        if (f == null)
-            return Arrays.asList();
-        return Arrays.asList(TextUtils.split(f, " "));
-    }
-
-    // Return CPU arch
-    public static int getAbi() {
-        List<String> abis = Arrays.asList(Build.CPU_ABI, Build.CPU_ABI2);
-        Log.i(TAG, "ABIs: " + abis.toString());
-        if (abis.contains("x86_64")) {
-            return ARCH_X86_64;
-        } else if (abis.contains("x86")) {
-            return ARCH_X86;
-        } else if (abis.contains("arm64-v8a")) {
-            return ARCH_ARMv8;
-        } else if (abis.contains("armeabi-v7a")
-                || abis.contains("armeabi-v7a-hard")) {
-            return ARCH_ARMv7;
-        } else if (abis.contains("armeabi")) {
-            return ARCH_ARM;
-        } else if (abis.contains("mips64")) {
-            return ARCH_MIPS_64;
-        } else if (abis.contains("mips")) {
-            return ARCH_MIPS;
-        }
-        return ARCH_UNKNOWN;
-    }
-
-    // Return hardware platform name
-    public static String getHardware() {
-        Map<String, String> raw = getRawCpuInfo();
-        return raw.get("Hardware");
-    }
-
-    // Return processor count
-    public static int getProcessorCount() {
-        int result = 0;
-        try {
-            Pattern pattern = Pattern.compile("(\\d)+(-(\\d+))?");
-            Scanner s = new Scanner(
-                    new File("/sys/devices/system/cpu/possible"));
-            if (s.hasNextLine()) {
-                String line = s.nextLine();
-                Log.d(TAG, "Got CPUs: " + line);
-                Matcher m = pattern.matcher(line);
-                while (m.find()) {
-                    int start = Integer.parseInt(m.group(1));
-                    int finish = start;
-                    if (m.group(3) != null) {
-                        finish = Integer.parseInt(m.group(3));
-                    }
-                    result += finish - start + 1;
-                    Log.d(TAG, "Got CPU range " + start + " ~ " + finish);
-                }
-            }
-        } catch (Exception e) {
-            Log.e(TAG, "Failed to read cpu count");
-            e.printStackTrace();
-        }
-        return result;
-
-    }
-
-    // Return parsed cpuinfo contents
-    public static Map<String, String> getRawCpuInfo() {
-        Map<String, String> map = new HashMap<String, String>();
-        try {
-            Scanner s = new Scanner(new File("/proc/cpuinfo"));
-            while (s.hasNextLine()) {
-                String line = s.nextLine();
-                String[] vals = line.split(": ");
-                if (vals.length > 1) {
-                    map.put(vals[0].trim(), vals[1].trim());
-                } else {
-                    Log.d(TAG, "Failed to parse cpuinfo: " + line);
-                }
-            }
-        } catch (Exception e) {
-            Log.e(TAG, "Failed to read cpuinfo");
-            e.printStackTrace();
-        }
-        return map;
-    }
-
-}
diff --git a/platforms/android/service/engine/src/org/opencv/engine/MarketConnector.java b/platforms/android/service/engine/src/org/opencv/engine/MarketConnector.java
deleted file mode 100644
index e4c936dcadba..000000000000
--- a/platforms/android/service/engine/src/org/opencv/engine/MarketConnector.java
+++ /dev/null
@@ -1,31 +0,0 @@
-package org.opencv.engine;
-
-import android.content.Context;
-import android.content.Intent;
-import android.net.Uri;
-import android.util.Log;
-
-public class MarketConnector {
-    protected static final String OpenCVPackageNamePreffix = "org.opencv.lib";
-    private static final String TAG = "OpenCVEngine/MarketConnector";
-    protected Context mContext;
-
-    public MarketConnector(Context context) {
-        mContext = context;
-    }
-
-    public boolean InstallAppFromMarket(String AppID) {
-        Log.d(TAG, "Installing app: " + AppID);
-        boolean result = true;
-        try {
-            Uri uri = Uri.parse("market://details?id=" + AppID);
-            Intent intent = new Intent(Intent.ACTION_VIEW, uri);
-            intent.addFlags(Intent.FLAG_ACTIVITY_NEW_TASK);
-            mContext.startActivity(intent);
-        } catch (Exception e) {
-            Log.e(TAG, "Installation failed");
-            result = false;
-        }
-        return result;
-    }
-}
diff --git a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl b/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl
deleted file mode 100644
index 1f16aac2ac70..000000000000
--- a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl
+++ /dev/null
@@ -1,33 +0,0 @@
-package org.opencv.engine;
-
-/**
-* Class provides Java interface to OpenCV Engine Service. Is synchronous with native OpenCVEngine class.
-*/
-interface OpenCVEngineInterface
-{
-    /**
-    * @return Return service version
-    */
-    int getEngineVersion();
-
-    /**
-    * Find installed OpenCV library
-    * @param OpenCV version
-    * @return Returns path to OpenCV native libs or empty string if OpenCV was not found
-    */
-    String getLibPathByVersion(String version);
-
-    /**
-    * Try to install defined version of OpenCV from Google Play (Android Market).
-    * @param OpenCV version
-    * @return Returns true if installation was successful or OpenCV package has been already installed
-    */
-    boolean installVersion(String version);
-
-    /**
-    * Return list of libraries in loading order separated by ";" symbol
-    * @param OpenCV version
-    * @return Returns OpenCV libraries names separated by symbol ";" in loading order
-    */
-    String getLibraryList(String version);
-}
diff --git a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java b/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java
deleted file mode 100644
index 850bad5349ad..000000000000
--- a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java
+++ /dev/null
@@ -1,165 +0,0 @@
-package org.opencv.engine;
-
-import android.app.Service;
-import android.content.Intent;
-import android.content.pm.PackageManager.NameNotFoundException;
-import android.content.res.XmlResourceParser;
-import android.os.IBinder;
-import android.os.RemoteException;
-import android.util.Log;
-import android.text.TextUtils;
-import java.io.File;
-import java.lang.reflect.Field;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import org.xmlpull.v1.XmlPullParser;
-
-public class OpenCVEngineService extends Service {
-    private static final String TAG = "OpenCVEngine/Service";
-    private IBinder mEngineInterface = null;
-    private List<LibVariant> variants = new ArrayList<LibVariant>();
-
-    private class LibVariant {
-        public String version;
-        public List<String> files;
-
-        public void parseFile(XmlResourceParser p) {
-            try {
-                int eventType = p.getEventType();
-                while (eventType != XmlPullParser.END_DOCUMENT) {
-                    if (eventType == XmlPullParser.START_TAG) {
-                        if (p.getName().equals("library")) {
-                            parseLibraryTag(p);
-                        } else if (p.getName().equals("file")) {
-                            parseFileTag(p);
-                        }
-                    }
-                    eventType = p.next();
-                }
-            } catch (Exception e) {
-                Log.e(TAG, "Failed to parse xml library descriptor");
-            }
-        }
-
-        private void parseLibraryTag(XmlResourceParser p) {
-            version = p.getAttributeValue(null, "version");
-            files = new ArrayList<String>();
-        }
-
-        private void parseFileTag(XmlResourceParser p) {
-            files.add(p.getAttributeValue(null, "name"));
-        }
-
-        public boolean hasAllFiles(String path) {
-            boolean result = true;
-            List<String> actualFiles = Arrays.asList((new File(path)).list());
-            for (String f : files)
-                result &= actualFiles.contains(f);
-            return result;
-        }
-
-        public boolean isCompatible(String v) {
-            String[] expected = v.split("\\.");
-            String[] actual = version.split("\\.");
-            int i = 0;
-            for (; i < Math.min(expected.length, actual.length); ++i) {
-                int diff = Integer.valueOf(expected[i])
-                        - Integer.valueOf(actual[i]);
-                if (diff > 0 || (diff != 0 && i == 0)) {
-                    // requested version is greater than actual OR major version differs
-                    return false;
-                } else if (diff < 0) {
-                    // version is compatible
-                    return true;
-                }
-            }
-            if (expected.length > i) {
-                // requested version is longer than actual - 2.4.11.2 and 2.4.11
-                return false;
-            }
-            return true;
-        }
-
-        public String getFileList() {
-            return TextUtils.join(";", files);
-        }
-    }
-
-    public void onCreate() {
-        Log.d(TAG, "Service starting");
-        for (Field field : R.xml.class.getDeclaredFields()) {  // Build error here means that all config.xml files are missing (configuration problem)
-            Log.d(TAG, "Found config: " + field.getName());
-            final LibVariant lib = new LibVariant();
-            try {
-                final int id = field.getInt(R.xml.class);
-                final XmlResourceParser p = getResources().getXml(id);
-                lib.parseFile(p);
-            } catch (IllegalArgumentException e) {
-                e.printStackTrace();
-            } catch (IllegalAccessException e) {
-                e.printStackTrace();
-            }
-            if (lib.version != null
-                    && lib.files.size() != 0
-                    && lib.hasAllFiles(getApplication().getApplicationInfo().nativeLibraryDir)) {
-                variants.add(lib);
-            Log.d(TAG, "Added config: " + lib.version);
-            }
-        }
-        super.onCreate();
-        mEngineInterface = new OpenCVEngineInterface.Stub() {
-
-            @Override
-            public boolean installVersion(String version)
-                    throws RemoteException {
-                // DO NOTHING
-                return false;
-            }
-
-            @Override
-            public String getLibraryList(String version) throws RemoteException {
-                Log.i(TAG, "getLibraryList(" + version + ")");
-                for (LibVariant lib : variants) {
-                    Log.i(TAG, "checking " + lib.version + " ...");
-                    if (lib.isCompatible(version))
-                        return lib.getFileList();
-                }
-                return null;
-            }
-
-            @Override
-            public String getLibPathByVersion(String version)
-                    throws RemoteException {
-                // TODO: support API 8
-                return getApplication().getApplicationInfo().nativeLibraryDir;
-            }
-
-            @Override
-            public int getEngineVersion() throws RemoteException {
-                int version = 3450;
-                try {
-                    version = getPackageManager().getPackageInfo(getPackageName(), 0).versionCode;
-                } catch (NameNotFoundException e) {
-                    e.printStackTrace();
-                }
-                return version / 1000;
-            }
-        };
-    }
-
-    public IBinder onBind(Intent intent) {
-        Log.i(TAG, "Service onBind called for intent " + intent.toString());
-        return mEngineInterface;
-    }
-
-    public boolean onUnbind(Intent intent) {
-        Log.i(TAG, "Service onUnbind called for intent " + intent.toString());
-        return true;
-    }
-
-    public void OnDestroy() {
-        Log.i(TAG, "OpenCV Engine service destruction");
-    }
-
-}
diff --git a/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java b/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java
deleted file mode 100644
index 0f97a836d3f6..000000000000
--- a/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java
+++ /dev/null
@@ -1,113 +0,0 @@
-package org.opencv.engine.manager;
-
-import org.opencv.engine.MarketConnector;
-import org.opencv.engine.HardwareDetector;
-import org.opencv.engine.OpenCVEngineInterface;
-import org.opencv.engine.OpenCVEngineService;
-import org.opencv.engine.R;
-import android.app.Activity;
-import android.content.ComponentName;
-import android.content.Context;
-import android.content.Intent;
-import android.content.ServiceConnection;
-import android.os.Bundle;
-import android.os.IBinder;
-import android.os.RemoteException;
-import android.text.TextUtils;
-import android.util.Log;
-import android.view.View;
-import android.view.View.OnClickListener;
-import android.widget.Button;
-import android.widget.TextView;
-import android.widget.Toast;
-
-public class ManagerActivity extends Activity {
-    protected static final String TAG = "OpenCVEngine/Activity";
-    protected MarketConnector mMarket;
-    protected TextView mVersionText;
-    protected boolean mExtraInfo = false;
-
-    /** Called when the activity is first created. */
-    @Override
-    public void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-
-        setContentView(R.layout.main);
-
-        final Class<OpenCVEngineService> c = OpenCVEngineService.class;
-        final String packageName = c.getPackage().getName();
-
-        mMarket = new MarketConnector(this);
-
-        Button updateButton = (Button) findViewById(R.id.CheckEngineUpdate);
-        updateButton.setOnClickListener(new OnClickListener() {
-            public void onClick(View v) {
-                if (!mMarket.InstallAppFromMarket(packageName)) {
-                    Toast toast = Toast.makeText(getApplicationContext(),
-                            "Google Play is not available", Toast.LENGTH_SHORT);
-                    toast.show();
-                }
-            }
-        });
-
-        TextView aboutText = (TextView) findViewById(R.id.textView4);
-        aboutText.setText("About (" + packageName + ")");
-
-        if (mExtraInfo) {
-            TextView extraText = (TextView) findViewById(R.id.textView6);
-            extraText.setText(
-                    "CPU count: "
-                    + HardwareDetector.getProcessorCount()
-                    + "\nABI: 0x"
-                    + Integer.toHexString(HardwareDetector.getAbi())
-                    + "\nFlags: "
-                    + TextUtils.join(";", HardwareDetector.getFlags())
-                    + "\nHardware: "
-                    + HardwareDetector.getHardware());
-        }
-
-        mVersionText = (TextView) findViewById(R.id.textView5);
-        if (!bindService(new Intent(this, c),
-                new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE)) {
-            Log.e(TAG, "Failed to bind to service:" + c.getName());
-            mVersionText.setText("not available");
-        } else {
-            Log.d(TAG, "Successfully bound to service:" + c.getName());
-            mVersionText.setText("available");
-        }
-
-    }
-
-    protected class OpenCVEngineServiceConnection implements ServiceConnection {
-        public void onServiceDisconnected(ComponentName name) {
-            Log.d(TAG, "Handle: service disconnected");
-        }
-
-        public void onServiceConnected(ComponentName name, IBinder service) {
-            Log.d(TAG, "Handle: service connected");
-            OpenCVEngineInterface engine = OpenCVEngineInterface.Stub
-                    .asInterface(service);
-            if (engine == null) {
-                Log.e(TAG, "Cannot connect to OpenCV Manager Service!");
-                unbindService(this);
-                return;
-            }
-            Log.d(TAG, "Successful connection");
-            try {
-                String[] vars = { "2.4", "3.0" };
-                String res = new String();
-                for (String piece : vars) {
-                    res += "\n\t" + piece + " -> "
-                            + engine.getLibraryList(piece);
-                }
-                mVersionText.setText("Path: "
-                        + engine.getLibPathByVersion(null) + res);
-            } catch (RemoteException e) {
-                e.printStackTrace();
-                Log.e(TAG, "Call failed");
-            }
-            unbindService(this);
-        }
-    };
-
-}
diff --git a/platforms/android/service/readme.txt b/platforms/android/service/readme.txt
deleted file mode 100644
index b77adbca7984..000000000000
--- a/platforms/android/service/readme.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-How to select the proper version of OpenCV Manager
---------------------------------------------------
-
-Since version 1.7 several packages of OpenCV Manager are built. Every package is targeted for some
-specific hardware platform and includes corresponding OpenCV binaries. So, in all cases OpenCV
-Manager uses built-in version of OpenCV. The new package selection logic in most cases simplifies
-OpenCV installation on end user devices. In most cases OpenCV Manager may be installed automatically
-from Google Play.
-
-If Google Play is not available (i.e. on emulator, developer board, etc), you can install it
-manually using adb tool:
-
-    adb install <path-to-OpenCV-sdk>/apk/OpenCV_<version>_Manager_<app_version>_<platform>.apk
-
-Example: OpenCV_3.4.5-dev_Manager_3.45_armeabi-v7a.apk
-
-Use the list of platforms below to determine proper OpenCV Manager package for your device:
-
-- armeabi (ARMv5, ARMv6)
-- armeabi-v7a (ARMv7-A + NEON)
-- arm64-v8a
-- mips
-- mips64
-- x86
-- x86_64
diff --git a/platforms/apple/build_xcframework.py b/platforms/apple/build_xcframework.py
index 49878435d089..3916c135f947 100755
--- a/platforms/apple/build_xcframework.py
+++ b/platforms/apple/build_xcframework.py
@@ -31,6 +31,8 @@
     parser.add_argument('--framework_name', default='opencv2', help='Name of OpenCV xcframework (default: opencv2, will change to OpenCV in future version)')
     parser.add_argument('--iphoneos_archs', default=None, help='select iPhoneOS target ARCHS. Default is "armv7,arm64"')
     parser.add_argument('--iphonesimulator_archs', default=None, help='select iPhoneSimulator target ARCHS. Default is "x86_64,arm64"')
+    parser.add_argument('--visionos_archs', default=None, help='select visionOS target ARCHS. Default is "arm64"')
+    parser.add_argument('--visionsimulator_archs', default=None, help='select visionSimulator target ARCHS. Default is "arm64"')
     parser.add_argument('--macos_archs', default=None, help='Select MacOS ARCHS. Default is "x86_64,arm64"')
     parser.add_argument('--catalyst_archs', default=None, help='Select Catalyst ARCHS. Default is "x86_64,arm64"')
     parser.add_argument('--build_only_specified_archs', default=False, action='store_true', help='if enabled, only directly specified archs are built and defaults are ignored')
@@ -52,6 +54,13 @@
         iphonesimulator_archs = "x86_64,arm64"
     print('Using iPhoneSimulator ARCHS={}'.format(iphonesimulator_archs))
 
+    # Parse architectures from args
+    visionos_archs = args.visionos_archs
+    print('Using visionOS ARCHS={}'.format(visionos_archs))
+
+    visionsimulator_archs = args.visionsimulator_archs
+    print('Using visionSimulator ARCHS={}'.format(visionsimulator_archs))
+
     macos_archs = args.macos_archs
     if not macos_archs and not args.build_only_specified_archs:
         # Supply defaults
@@ -70,6 +79,7 @@
         # Phase 1: build .frameworks for each platform
         osx_script_path = os.path.abspath(os.path.abspath(os.path.dirname(__file__))+'/../osx/build_framework.py')
         ios_script_path = os.path.abspath(os.path.abspath(os.path.dirname(__file__))+'/../ios/build_framework.py')
+        visionos_script_path = os.path.abspath(os.path.abspath(os.path.dirname(__file__))+'/../ios/build_visionos_framework.py')
 
         build_folders = []
 
@@ -91,6 +101,19 @@ def get_or_create_build_folder(base_dir, platform):
             command = ["python3", ios_script_path, build_folder, "--iphonesimulator_archs", iphonesimulator_archs, "--framework_name", args.framework_name, "--build_only_specified_archs"] + unknown_args
             print_header("Building iPhoneSimulator frameworks")
             execute(command, cwd=os.getcwd())
+        if visionos_archs:
+            build_folder = get_or_create_build_folder(args.out, "visionos")
+            build_folders.append(build_folder)
+            command = ["python3", visionos_script_path, build_folder, "--visionos_archs", visionos_archs, "--framework_name", args.framework_name, "--build_only_specified_archs"] + unknown_args
+            print_header("Building visionOS frameworks")
+            print(command)
+            execute(command, cwd=os.getcwd())
+        if visionsimulator_archs:
+            build_folder = get_or_create_build_folder(args.out, "visionsimulator")
+            build_folders.append(build_folder)
+            command = ["python3", visionos_script_path, build_folder, "--visionsimulator_archs", visionsimulator_archs, "--framework_name", args.framework_name, "--build_only_specified_archs"] + unknown_args
+            print_header("Building visionSimulator frameworks")
+            execute(command, cwd=os.getcwd())
         if macos_archs:
             build_folder = get_or_create_build_folder(args.out, "macos")
             build_folders.append(build_folder)
diff --git a/platforms/ios/Info.Dynamic.plist.in b/platforms/ios/Info.Dynamic.plist.in
index e48fffdb9d62..ce41a52d5b54 100644
--- a/platforms/ios/Info.Dynamic.plist.in
+++ b/platforms/ios/Info.Dynamic.plist.in
@@ -23,7 +23,7 @@
         <string>iPhoneOS</string>
     </array>
     <key>MinimumOSVersion</key>
-    <string>8.0</string>
+    <string>${IPHONEOS_DEPLOYMENT_TARGET}</string>
     <key>UIDeviceFamily</key>
     <array>
         <integer>1</integer>
diff --git a/platforms/ios/PrivacyInfo.xcprivacy b/platforms/ios/PrivacyInfo.xcprivacy
new file mode 100644
index 000000000000..f369de381311
--- /dev/null
+++ b/platforms/ios/PrivacyInfo.xcprivacy
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>NSPrivacyTracking</key>
+    <false/>
+    <key>NSPrivacyCollectedDataTypes</key>
+    <array/>
+    <key>NSPrivacyTrackingDomains</key>
+    <array/>
+    <key>NSPrivacyAccessedAPITypes</key>
+    <array>
+        <dict>
+            <key>NSPrivacyAccessedAPIType</key>
+            <string>NSPrivacyAccessedAPICategoryFileTimestamp</string>
+            <key>NSPrivacyAccessedAPITypeReasons</key>
+            <array>
+                <string>0A2A.1</string>
+            </array>
+        </dict>
+        <dict>
+            <key>NSPrivacyAccessedAPIType</key>
+            <string>NSPrivacyAccessedAPICategorySystemBootTime</string>
+            <key>NSPrivacyAccessedAPITypeReasons</key>
+            <array>
+                <string>35F9.1</string>
+            </array>
+        </dict>
+    </array>
+</dict>
+</plist>
diff --git a/platforms/ios/build_framework.py b/platforms/ios/build_framework.py
index 77878280f7f5..1904a67ae7c9 100755
--- a/platforms/ios/build_framework.py
+++ b/platforms/ios/build_framework.py
@@ -46,6 +46,9 @@ def copy_tree(src, dst):
 
 IPHONEOS_DEPLOYMENT_TARGET='9.0'  # default, can be changed via command line options or environment variable
 
+CURRENT_FILE_DIR = os.path.dirname(__file__)
+
+
 class Builder:
     def __init__(self, opencv, contrib, dynamic, bitcodedisabled, exclude, disable, enablenonfree, targets, debug, debug_info, framework_name, run_tests, build_docs, swiftdisabled):
         self.opencv = os.path.abspath(opencv)
@@ -254,9 +257,9 @@ def makeCMakeCmd(self, arch, target, dir, cmakeargs = []):
         toolchain = self.getToolchain(arch, target)
         cmakecmd = self.getCMakeArgs(arch, target) + \
             (["-DCMAKE_TOOLCHAIN_FILE=%s" % toolchain] if toolchain is not None else [])
-        if target.lower().startswith("iphoneos"):
+        if target.lower().startswith("iphoneos") or target.lower().startswith("xros"):
             cmakecmd.append("-DCPU_BASELINE=DETECT")
-        if target.lower().startswith("iphonesimulator"):
+        if target.lower().startswith("iphonesimulator") or target.lower().startswith("xrsimulator"):
             build_arch = check_output(["uname", "-m"]).decode('utf-8').rstrip()
             if build_arch != arch:
                 print("build_arch (%s) != arch (%s)" % (build_arch, arch))
@@ -341,7 +344,7 @@ def mergeLibs(self, builddir):
     def makeDynamicLib(self, builddir):
         target = builddir[(builddir.rfind("build-") + 6):]
         target_platform = target[(target.rfind("-") + 1):]
-        is_device = target_platform == "iphoneos" or target_platform == "catalyst"
+        is_device = target_platform == "iphoneos" or target_platform == "visionos" or target_platform == "catalyst"
         framework_dir = os.path.join(builddir, "install", "lib", self.framework_name + ".framework")
         if not os.path.exists(framework_dir):
             os.makedirs(framework_dir)
@@ -379,7 +382,7 @@ def makeDynamicLib(self, builddir):
                 "-framework", "CoreImage", "-framework", "CoreMedia", "-framework", "QuartzCore",
                 "-framework", "Accelerate", "-framework", "OpenCL",
             ]
-        elif target_platform == "iphoneos" or target_platform == "iphonesimulator":
+        elif target_platform == "iphoneos" or target_platform == "iphonesimulator" or  target_platform == "xros" or target_platform == "xrsimulator":
             framework_options = [
                 "-iframework", "%s/System/iOSSupport/System/Library/Frameworks" % sdk_dir,
                 "-framework", "AVFoundation", "-framework", "CoreGraphics",
@@ -477,6 +480,9 @@ def makeFramework(self, outdir, builddirs):
                 s = os.path.join(*l[0])
                 d = os.path.join(framework_dir, *l[1])
                 os.symlink(s, d)
+        # Copy Apple privacy manifest
+        shutil.copyfile(os.path.join(CURRENT_FILE_DIR, "PrivacyInfo.xcprivacy"),
+                        os.path.join(resdir, "PrivacyInfo.xcprivacy"))
 
     def copy_samples(self, outdir):
         return
diff --git a/platforms/ios/build_visionos_framework.py b/platforms/ios/build_visionos_framework.py
new file mode 100755
index 000000000000..96364c7b8b4a
--- /dev/null
+++ b/platforms/ios/build_visionos_framework.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+"""
+The script builds OpenCV.framework for visionOS.
+"""
+
+from __future__ import print_function
+import os, os.path, sys, argparse, traceback, multiprocessing
+
+# import common code
+# sys.path.insert(0, os.path.abspath(os.path.abspath(os.path.dirname(__file__))+'/../ios'))
+from build_framework import Builder
+sys.path.insert(0, os.path.abspath(os.path.abspath(os.path.dirname(__file__))+'/../apple'))
+from cv_build_utils import print_error, get_cmake_version
+
+XROS_DEPLOYMENT_TARGET='1.0'  # default, can be changed via command line options or environment variable
+
+class visionOSBuilder(Builder):
+
+    def checkCMakeVersion(self):
+        assert get_cmake_version() >= (3, 17), "CMake 3.17 or later is required. Current version is {}".format(get_cmake_version())
+
+    def getObjcTarget(self, target):
+        return 'visionos'
+
+    def getToolchain(self, arch, target):
+        toolchain = os.path.join(self.opencv, "platforms", "ios", "cmake", "Toolchains", "Toolchain-%s_Xcode.cmake" % target)
+        return toolchain
+
+    def getCMakeArgs(self, arch, target):
+        args = Builder.getCMakeArgs(self, arch, target)
+        args = args + [
+            '-DVISIONOS_ARCH=%s' % arch
+        ]
+        return args
+
+    def getBuildCommand(self, arch, target):
+        buildcmd = [
+            "xcodebuild",
+            "XROS_DEPLOYMENT_TARGET=" + os.environ['XROS_DEPLOYMENT_TARGET'],
+            "ARCHS=%s" % arch,
+            "-sdk", target.lower(),
+            "-configuration", "Debug" if self.debug else "Release",
+            "-parallelizeTargets",
+            "-jobs", str(multiprocessing.cpu_count())
+        ]
+
+        return buildcmd
+
+    def getInfoPlist(self, builddirs):
+        return os.path.join(builddirs[0], "visionos", "Info.plist")
+
+
+if __name__ == "__main__":
+    folder = os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), "../.."))
+    parser = argparse.ArgumentParser(description='The script builds OpenCV.framework for visionOS.')
+    # TODO: When we can make breaking changes, we should make the out argument explicit and required like in build_xcframework.py.
+    parser.add_argument('out', metavar='OUTDIR', help='folder to put built framework')
+    parser.add_argument('--opencv', metavar='DIR', default=folder, help='folder with opencv repository (default is "../.." relative to script location)')
+    parser.add_argument('--contrib', metavar='DIR', default=None, help='folder with opencv_contrib repository (default is "None" - build only main framework)')
+    parser.add_argument('--without', metavar='MODULE', default=[], action='append', help='OpenCV modules to exclude from the framework. To exclude multiple, specify this flag again, e.g. "--without video --without objc"')
+    parser.add_argument('--disable', metavar='FEATURE', default=[], action='append', help='OpenCV features to disable (add WITH_*=OFF). To disable multiple, specify this flag again, e.g. "--disable tbb --disable openmp"')
+    parser.add_argument('--dynamic', default=False, action='store_true', help='build dynamic framework (default is "False" - builds static framework)')
+    parser.add_argument('--enable_nonfree', default=False, dest='enablenonfree', action='store_true', help='enable non-free modules (disabled by default)')
+    parser.add_argument('--visionos_deployment_target', default=os.environ.get('XROS_DEPLOYMENT_TARGET', XROS_DEPLOYMENT_TARGET), help='specify XROS_DEPLOYMENT_TARGET')
+    parser.add_argument('--visionos_archs', default=None, help='select visionOS target ARCHS. Default is none')
+    parser.add_argument('--visionsimulator_archs', default=None, help='select visionSimulator target ARCHS. Default is none')
+    parser.add_argument('--debug', action='store_true', help='Build "Debug" binaries (CMAKE_BUILD_TYPE=Debug)')
+    parser.add_argument('--debug_info', action='store_true', help='Build with debug information (useful for Release mode: BUILD_WITH_DEBUG_INFO=ON)')
+    parser.add_argument('--framework_name', default='opencv2', dest='framework_name', help='Name of OpenCV framework (default: opencv2, will change to OpenCV in future version)')
+    parser.add_argument('--legacy_build', default=False, dest='legacy_build', action='store_true', help='Build legacy framework (default: False, equivalent to "--framework_name=opencv2 --without=objc")')
+    parser.add_argument('--run_tests', default=False, dest='run_tests', action='store_true', help='Run tests')
+    parser.add_argument('--build_docs', default=False, dest='build_docs', action='store_true', help='Build docs')
+    parser.add_argument('--disable-swift', default=False, dest='swiftdisabled', action='store_true', help='Disable building of Swift extensions')
+
+    args, unknown_args = parser.parse_known_args()
+    if unknown_args:
+        print("The following args are not recognized and will not be used: %s" % unknown_args)
+
+    os.environ['XROS_DEPLOYMENT_TARGET'] = args.visionos_deployment_target
+    print('Using XROS_DEPLOYMENT_TARGET=' + os.environ['XROS_DEPLOYMENT_TARGET'])
+
+    visionos_archs = None
+    if args.visionos_archs:
+        visionos_archs = args.visionos_archs.split(',')
+    print('Using visionOS ARCHS=' + str(visionos_archs))
+
+    visionsimulator_archs = None
+    if args.visionsimulator_archs:
+        visionsimulator_archs = args.visionsimulator_archs.split(',')
+    print('Using visionOS ARCHS=' + str(visionsimulator_archs))
+
+    # Prevent the build from happening if the same architecture is specified for multiple platforms.
+    # When `lipo` is run to stitch the frameworks together into a fat framework, it'll fail, so it's
+    # better to stop here while we're ahead.
+    if visionos_archs and visionsimulator_archs:
+        duplicate_archs = set(visionos_archs).intersection(visionsimulator_archs)
+        if duplicate_archs:
+            print_error("Cannot have the same architecture for multiple platforms in a fat framework! Consider using build_xcframework.py in the apple platform folder instead. Duplicate archs are %s" % duplicate_archs)
+            exit(1)
+
+    if args.legacy_build:
+        args.framework_name = "opencv2"
+        if not "objc" in args.without:
+            args.without.append("objc")
+
+    targets = []
+    if not visionos_archs and not visionsimulator_archs:
+        print_error("--visionos_archs and --visionsimulator_archs are undefined; nothing will be built.")
+        sys.exit(1)
+    if visionos_archs:
+        targets.append((visionos_archs, "XROS"))
+    if visionsimulator_archs:
+        targets.append((visionsimulator_archs, "XRSimulator")),
+
+    b = visionOSBuilder(args.opencv, args.contrib, args.dynamic, True, args.without, args.disable, args.enablenonfree, targets, args.debug, args.debug_info, args.framework_name, args.run_tests, args.build_docs, args.swiftdisabled)
+    b.build(args.out)
diff --git a/platforms/ios/cmake/Toolchains/Toolchain-XROS_Xcode.cmake b/platforms/ios/cmake/Toolchains/Toolchain-XROS_Xcode.cmake
new file mode 100644
index 000000000000..0f22bd73a9ca
--- /dev/null
+++ b/platforms/ios/cmake/Toolchains/Toolchain-XROS_Xcode.cmake
@@ -0,0 +1,5 @@
+message(STATUS "Setting up visionOS toolchain for VISIONOS_ARCH='${VISIONOS_ARCH}'")
+set(VISIONOS TRUE)
+set(XROS 1)
+include(${CMAKE_CURRENT_LIST_DIR}/common-ios-toolchain.cmake)
+message(STATUS "visionOS toolchain loaded")
diff --git a/platforms/ios/cmake/Toolchains/Toolchain-XRSimulator_Xcode.cmake b/platforms/ios/cmake/Toolchains/Toolchain-XRSimulator_Xcode.cmake
new file mode 100644
index 000000000000..aee12f4c5d34
--- /dev/null
+++ b/platforms/ios/cmake/Toolchains/Toolchain-XRSimulator_Xcode.cmake
@@ -0,0 +1,5 @@
+message(STATUS "Setting up visionSimulator toolchain for VISIONOS_ARCH='${VISIONOS_ARCH}'")
+set(VISIONSIMULATOR TRUE)
+set(XROS 1)
+include(${CMAKE_CURRENT_LIST_DIR}/common-ios-toolchain.cmake)
+message(STATUS "visionSimulator toolchain loaded")
diff --git a/platforms/ios/cmake/Toolchains/common-ios-toolchain.cmake b/platforms/ios/cmake/Toolchains/common-ios-toolchain.cmake
index 9918c468b45c..7a14529c2d73 100644
--- a/platforms/ios/cmake/Toolchains/common-ios-toolchain.cmake
+++ b/platforms/ios/cmake/Toolchains/common-ios-toolchain.cmake
@@ -61,26 +61,45 @@ else()
   endmacro()
 endif() # IN_TRY_COMPILE
 
-if(NOT DEFINED IOS_ARCH)
+if((IPHONEOS OR IPHONESIMULATOR) AND NOT DEFINED IOS_ARCH)
   message(FATAL_ERROR "iOS toolchain requires ARCH option for proper configuration of compiler flags")
 endif()
-if(IOS_ARCH MATCHES "^arm64")
+if((VISIONOS OR VISIONSIMULATOR) AND NOT DEFINED VISIONOS_ARCH)
+  message(FATAL_ERROR "visionOS toolchain requires ARCH option for proper configuration of compiler flags")
+endif()
+if((IOS_ARCH MATCHES "^arm64") OR (VISIONOS_ARCH MATCHES "^arm64"))
   set(AARCH64 1)
 elseif(IOS_ARCH MATCHES "^armv")
   set(ARM 1)
-elseif(IOS_ARCH MATCHES "^x86_64")
+elseif((IOS_ARCH MATCHES "^x86_64") OR (VISIONOS_ARCH MATCHES "^x86_64"))
   set(X86_64 1)
 elseif(IOS_ARCH MATCHES "^i386")
   set(X86 1)
 else()
-  message(FATAL_ERROR "iOS toolchain doesn't recognize ARCH='${IOS_ARCH}' value")
+  if(IPHONEOS OR IPHONESIMULATOR)
+    message(FATAL_ERROR "invalid value of IOS_ARCH='${IOS_ARCH}'")
+  elseif(VISIONOS OR VISIONSIMULATOR)
+    message(FATAL_ERROR "invalid value of VISIONOS_ARCH='${VISIONOS_ARCH}'")
+  endif()
 endif()
 
 if(NOT DEFINED CMAKE_OSX_SYSROOT)
   if(IPHONEOS)
     set(CMAKE_OSX_SYSROOT "iphoneos")
+    set(SYSTEM "iOS")
+    set(ARCH "${IOS_ARCH}")
   elseif(IPHONESIMULATOR)
     set(CMAKE_OSX_SYSROOT "iphonesimulator")
+    set(SYSTEM "iOS")
+    set(ARCH "${IOS_ARCH}")
+  elseif(VISIONOS)
+    set(CMAKE_OSX_SYSROOT "xros")
+    set(SYSTEM "visionOS")
+    set(ARCH "${VISIONOS_ARCH}")
+  elseif(VISIONSIMULATOR)
+    set(CMAKE_OSX_SYSROOT "xrsimulator")
+    set(SYSTEM "visionOS")
+    set(ARCH "${VISIONOS_ARCH}")
   elseif(MAC_CATALYST)
     # Use MacOS SDK for Catalyst builds
     set(CMAKE_OSX_SYSROOT "macosx")
@@ -90,14 +109,25 @@ set(CMAKE_MACOSX_BUNDLE YES)
 set(CMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED "NO")
 
 if(APPLE_FRAMEWORK AND NOT BUILD_SHARED_LIBS)
-  set(CMAKE_OSX_ARCHITECTURES "${IOS_ARCH}" CACHE INTERNAL "Build architecture for iOS" FORCE)
+  set(CMAKE_OSX_ARCHITECTURES "${ARCH}" CACHE INTERNAL "Build architecture for iOS/visionOS" FORCE)
 endif()
 
-if(NOT DEFINED IPHONEOS_DEPLOYMENT_TARGET AND NOT MAC_CATALYST)
+if((IPHONEOS OR IPHONESIMULATOR) AND NOT DEFINED IPHONEOS_DEPLOYMENT_TARGET)
   if(NOT DEFINED ENV{IPHONEOS_DEPLOYMENT_TARGET})
     message(FATAL_ERROR "IPHONEOS_DEPLOYMENT_TARGET is not specified")
   endif()
   set(IPHONEOS_DEPLOYMENT_TARGET "$ENV{IPHONEOS_DEPLOYMENT_TARGET}")
+  set(DEPLOYMENT_TARGET "${IPHONEOS_DEPLOYMENT_TARGET}")
+  set(DEPLOYMENT_TARGET_CMDLINE "IPHONEOS_DEPLOYMENT_TARGET=${IPHONEOS_DEPLOYMENT_TARGET}")
+endif()
+
+if((VISIONOS OR VISIONSIMULATOR) AND NOT DEFINED XROS_DEPLOYMENT_TARGET)
+  if(NOT DEFINED ENV{XROS_DEPLOYMENT_TARGET})
+    message(FATAL_ERROR "XROS_DEPLOYMENT_TARGET is not specified")
+  endif()
+  set(XROS_DEPLOYMENT_TARGET "$ENV{XROS_DEPLOYMENT_TARGET}")
+  set(DEPLOYMENT_TARGET "${XROS_DEPLOYMENT_TARGET}")
+  set(DEPLOYMENT_TARGET_CMDLINE "XROS_DEPLOYMENT_TARGET=${XROS_DEPLOYMENT_TARGET}")
 endif()
 
 if(NOT __IN_TRY_COMPILE)
@@ -124,9 +154,9 @@ if(NOT __IN_TRY_COMPILE)
       message(FATAL_ERROR "Can't prepare xcodebuild_wrapper")
     endif()
     if(APPLE_FRAMEWORK AND BUILD_SHARED_LIBS)
-      set(XCODEBUILD_EXTRA_ARGS "${XCODEBUILD_EXTRA_ARGS} IPHONEOS_DEPLOYMENT_TARGET=${IPHONEOS_DEPLOYMENT_TARGET} CODE_SIGN_IDENTITY='' CODE_SIGNING_REQUIRED=NO -sdk ${CMAKE_OSX_SYSROOT}")
+      set(XCODEBUILD_EXTRA_ARGS "${XCODEBUILD_EXTRA_ARGS} ${DEPLOYMENT_TARGET_CMDLINE} CODE_SIGN_IDENTITY='' CODE_SIGNING_REQUIRED=NO -sdk ${CMAKE_OSX_SYSROOT}")
     else()
-      set(XCODEBUILD_EXTRA_ARGS "${XCODEBUILD_EXTRA_ARGS} IPHONEOS_DEPLOYMENT_TARGET=${IPHONEOS_DEPLOYMENT_TARGET} CODE_SIGN_IDENTITY='' CODE_SIGNING_REQUIRED=NO ARCHS=${IOS_ARCH} -sdk ${CMAKE_OSX_SYSROOT}")
+      set(XCODEBUILD_EXTRA_ARGS "${XCODEBUILD_EXTRA_ARGS} ${DEPLOYMENT_TARGET_CMDLINE} CODE_SIGN_IDENTITY='' CODE_SIGNING_REQUIRED=NO ARCHS=${ARCH} -sdk ${CMAKE_OSX_SYSROOT}")
     endif()
     configure_file("${CMAKE_CURRENT_LIST_DIR}/xcodebuild_wrapper.in" "${_xcodebuild_wrapper_tmp}" @ONLY)
     file(COPY "${_xcodebuild_wrapper_tmp}" DESTINATION ${CMAKE_BINARY_DIR} FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
@@ -137,16 +167,16 @@ if(NOT __IN_TRY_COMPILE)
 endif()
 
 # Standard settings
-set(CMAKE_SYSTEM_NAME iOS)
+set(CMAKE_SYSTEM_NAME "${SYSTEM}")
 
 # Apple Framework settings
 if(APPLE_FRAMEWORK AND BUILD_SHARED_LIBS)
-  set(CMAKE_SYSTEM_VERSION "${IPHONEOS_DEPLOYMENT_TARGET}")
+  set(CMAKE_SYSTEM_VERSION "${DEPLOYMENT_TARGET}")
   set(CMAKE_C_SIZEOF_DATA_PTR 4)
   set(CMAKE_CXX_SIZEOF_DATA_PTR 4)
 else()
-  set(CMAKE_SYSTEM_VERSION "${IPHONEOS_DEPLOYMENT_TARGET}")
-  set(CMAKE_SYSTEM_PROCESSOR "${IOS_ARCH}")
+  set(CMAKE_SYSTEM_VERSION "${DEPLOYMENT_TARGET}")
+  set(CMAKE_SYSTEM_PROCESSOR "${ARCH}")
 
   if(AARCH64 OR X86_64)
     set(CMAKE_C_SIZEOF_DATA_PTR 8)
@@ -190,4 +220,4 @@ if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM)
   set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 endif()
 
-toolchain_save_config(IOS_ARCH IPHONEOS_DEPLOYMENT_TARGET)
+toolchain_save_config(IOS_ARCH VISIONOS_ARCH ARCH DEPLOYMENT_TARGET)
diff --git a/platforms/js/build_js.py b/platforms/js/build_js.py
index 3e8edfe4adb4..f0701212af3c 100644
--- a/platforms/js/build_js.py
+++ b/platforms/js/build_js.py
@@ -84,7 +84,6 @@ def get_cmake_cmd(self):
             "-DPYTHON_DEFAULT_EXECUTABLE=%s" % sys.executable,
                "-DENABLE_PIC=FALSE", # To workaround emscripten upstream backend issue https://github.com/emscripten-core/emscripten/issues/8761
                "-DCMAKE_BUILD_TYPE=Release",
-               "-DCMAKE_TOOLCHAIN_FILE='%s'" % self.get_toolchain_file(),
                "-DCPU_BASELINE=''",
                "-DCMAKE_INSTALL_PREFIX=/usr/local",
                "-DCPU_DISPATCH=''",
@@ -118,7 +117,7 @@ def get_cmake_cmd(self):
                "-DWITH_GPHOTO2=OFF",
                "-DWITH_LAPACK=OFF",
                "-DWITH_ITT=OFF",
-               "-DWITH_QUIRC=ON",
+               "-DWITH_QUIRC=OFF",
                "-DBUILD_ZLIB=ON",
                "-DBUILD_opencv_apps=OFF",
                "-DBUILD_opencv_calib3d=ON",
@@ -145,6 +144,8 @@ def get_cmake_cmd(self):
                "-DBUILD_PERF_TESTS=ON"]
         if self.options.cmake_option:
             cmd += self.options.cmake_option
+        if not self.options.cmake_option or all(["-DCMAKE_TOOLCHAIN_FILE" not in opt for opt in self.options.cmake_option]):
+            cmd.append("-DCMAKE_TOOLCHAIN_FILE='%s'" % self.get_toolchain_file())
         if self.options.build_doc:
             cmd.append("-DBUILD_DOCS=ON")
         else:
@@ -194,6 +195,7 @@ def get_build_flags(self):
             flags += self.options.build_flags
         if self.options.webnn:
             flags += "-s USE_WEBNN=1 "
+        flags += "-s EXPORTED_FUNCTIONS=\"['_malloc', '_free']\""
         return flags
 
     def config(self):
@@ -224,10 +226,12 @@ def build_loader(self):
 
     opencv_dir = os.path.abspath(os.path.join(SCRIPT_DIR, '../..'))
     emscripten_dir = None
-    if "EMSCRIPTEN" in os.environ:
+    if "EMSDK" in os.environ:
+        emscripten_dir = os.path.join(os.environ["EMSDK"], "upstream", "emscripten")
+    elif "EMSCRIPTEN" in os.environ:
         emscripten_dir = os.environ["EMSCRIPTEN"]
     else:
-        log.warning("EMSCRIPTEN environment variable is not available. Please properly activate Emscripten SDK and consider using 'emcmake' launcher")
+        log.warning("EMSCRIPTEN/EMSDK environment variable is not available. Please properly activate Emscripten SDK and consider using 'emcmake' launcher")
 
     parser = argparse.ArgumentParser(description='Build OpenCV.js by Emscripten')
     parser.add_argument("build_dir", help="Building directory (and output)")
@@ -256,7 +260,8 @@ def build_loader(self):
                         help="Specify configuration file with own list of exported into JS functions")
     parser.add_argument('--webnn', action="store_true", help="Enable WebNN Backend")
 
-    args = parser.parse_args()
+    transformed_args = ["--cmake_option=%s".format(arg) if arg[:2] == "-D" else arg for arg in sys.argv[1:]]
+    args = parser.parse_args(transformed_args)
 
     log.debug("Args: %s", args)
 
@@ -266,7 +271,7 @@ def build_loader(self):
         del os.environ['EMMAKEN_JUST_CONFIGURE']  # avoid linker errors with NODERAWFS message then using 'emcmake' launcher
 
     if args.emscripten_dir is None:
-        log.error("Cannot get Emscripten path, please use 'emcmake' launcher or specify it either by EMSCRIPTEN environment variable or --emscripten_dir option.")
+        log.error("Cannot get Emscripten path, please use 'emcmake' launcher or specify it either by EMSCRIPTEN/EMSDK environment variable or --emscripten_dir option.")
         sys.exit(-1)
 
     builder = Builder(args)
diff --git a/platforms/js/opencv_js.config.py b/platforms/js/opencv_js.config.py
index 69891ea71a07..1251062e4c0f 100644
--- a/platforms/js/opencv_js.config.py
+++ b/platforms/js/opencv_js.config.py
@@ -9,6 +9,7 @@
         'perspectiveTransform', 'polarToCart', 'pow', 'randn', 'randu', 'reduce', 'repeat', 'rotate', 'setIdentity', 'setRNGSeed',
         'solve', 'solvePoly', 'split', 'sqrt', 'subtract', 'trace', 'transform', 'transpose', 'vconcat',
         'setLogLevel', 'getLogLevel',
+        'LUT',
     ],
     'Algorithm': [],
 }
@@ -127,7 +128,9 @@
              'aruco_CharucoDetector': ['CharucoDetector', 'setBoard', 'setCharucoParameters', 'setDetectorParameters', 'setRefineParameters', 'detectBoard', 'detectDiamonds'],
              'QRCodeDetectorAruco_Params': ['Params'],
              'QRCodeDetectorAruco': ['QRCodeDetectorAruco', 'decode', 'detect', 'detectAndDecode', 'detectMulti', 'decodeMulti', 'detectAndDecodeMulti', 'setDetectorParameters', 'setArucoParameters'],
-             'barcode_BarcodeDetector': ['BarcodeDetector', 'decode', 'detect', 'detectAndDecode', 'detectMulti', 'decodeMulti', 'detectAndDecodeMulti', 'decodeWithType', 'detectAndDecodeWithType']
+             'barcode_BarcodeDetector': ['BarcodeDetector', 'decode', 'detect', 'detectAndDecode', 'detectMulti', 'decodeMulti', 'detectAndDecodeMulti', 'decodeWithType', 'detectAndDecodeWithType'],
+             'FaceDetectorYN': ['setInputSize', 'getInputSize', 'setScoreThreshold', 'getScoreThreshold', 'setNMSThreshold', 'getNMSThreshold',
+                                'setTopK', 'getTopK', 'detect', 'create'],
 }
 
 video = {
@@ -146,7 +149,7 @@
     'TrackerMIL_Params': [],
 }
 
-dnn = {'dnn_Net': ['setInput', 'forward', 'setPreferableBackend'],
+dnn = {'dnn_Net': ['setInput', 'forward', 'setPreferableBackend','getUnconnectedOutLayersNames'],
        '': ['readNetFromCaffe', 'readNetFromTensorflow', 'readNetFromTorch', 'readNetFromDarknet',
             'readNetFromONNX', 'readNetFromTFLite', 'readNet', 'blobFromImage']}
 
@@ -157,7 +160,8 @@
               'FastFeatureDetector': ['create', 'setThreshold', 'getThreshold', 'setNonmaxSuppression', 'getNonmaxSuppression', 'setType', 'getType', 'getDefaultName'],
               'AgastFeatureDetector': ['create', 'setThreshold', 'getThreshold', 'setNonmaxSuppression', 'getNonmaxSuppression', 'setType', 'getType', 'getDefaultName'],
               'GFTTDetector': ['create', 'setMaxFeatures', 'getMaxFeatures', 'setQualityLevel', 'getQualityLevel', 'setMinDistance', 'getMinDistance', 'setBlockSize', 'getBlockSize', 'setHarrisDetector', 'getHarrisDetector', 'setK', 'getK', 'getDefaultName'],
-              # 'SimpleBlobDetector': ['create'],
+              'SimpleBlobDetector': ['create', 'setParams', 'getParams', 'getDefaultName'],
+              'SimpleBlobDetector_Params': [],
               'KAZE': ['create', 'setExtended', 'getExtended', 'setUpright', 'getUpright', 'setThreshold', 'getThreshold', 'setNOctaves', 'getNOctaves', 'setNOctaveLayers', 'getNOctaveLayers', 'setDiffusivity', 'getDiffusivity', 'getDefaultName'],
               'AKAZE': ['create', 'setDescriptorType', 'getDescriptorType', 'setDescriptorSize', 'getDescriptorSize', 'setDescriptorChannels', 'getDescriptorChannels', 'setThreshold', 'getThreshold', 'setNOctaves', 'getNOctaves', 'setNOctaveLayers', 'getNOctaveLayers', 'setDiffusivity', 'getDiffusivity', 'getDefaultName'],
               'DescriptorMatcher': ['add', 'clear', 'empty', 'isMaskSupported', 'train', 'match', 'knnMatch', 'radiusMatch', 'clone', 'create'],
diff --git a/platforms/linux/riscv-gnu.toolchain.cmake b/platforms/linux/riscv-gnu.toolchain.cmake
index 662fb6bddb82..1657bd16813d 100644
--- a/platforms/linux/riscv-gnu.toolchain.cmake
+++ b/platforms/linux/riscv-gnu.toolchain.cmake
@@ -25,22 +25,22 @@ if(NOT DEFINED TOOLCHAIN_COMPILER_LOCATION_HINT)
 endif()
 
 if(NOT DEFINED CMAKE_C_COMPILER)
-  find_program(CMAKE_C_COMPILER NAMES ${GNU_MACHINE}-gcc${__GCC_VER_SUFFIX} ${TOOLCHAIN_COMPILER_LOCATION_HINT})
+  find_program(CMAKE_C_COMPILER NAMES ${GNU_MACHINE}-gcc${__GCC_VER_SUFFIX} PATHS ${TOOLCHAIN_COMPILER_LOCATION_HINT})
 else()
   #message(WARNING "CMAKE_C_COMPILER=${CMAKE_C_COMPILER} is defined")
 endif()
 if(NOT DEFINED CMAKE_CXX_COMPILER)
-  find_program(CMAKE_CXX_COMPILER NAMES ${GNU_MACHINE}-g++${__GCC_VER_SUFFIX} ${TOOLCHAIN_COMPILER_LOCATION_HINT})
+  find_program(CMAKE_CXX_COMPILER NAMES ${GNU_MACHINE}-g++${__GCC_VER_SUFFIX} PATHS ${TOOLCHAIN_COMPILER_LOCATION_HINT})
 else()
   #message(WARNING "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} is defined")
 endif()
 if(NOT DEFINED CMAKE_LINKER)
-  find_program(CMAKE_LINKER NAMES ${GNU_MACHINE}-ld${__GCC_VER_SUFFIX} ${GNU_MACHINE}-ld ${TOOLCHAIN_COMPILER_LOCATION_HINT})
+  find_program(CMAKE_LINKER NAMES ${GNU_MACHINE}-ld${__GCC_VER_SUFFIX} ${GNU_MACHINE}-ld PATHS ${TOOLCHAIN_COMPILER_LOCATION_HINT})
 else()
   #message(WARNING "CMAKE_LINKER=${CMAKE_LINKER} is defined")
 endif()
 if(NOT DEFINED CMAKE_AR)
-  find_program(CMAKE_AR NAMES ${GNU_MACHINE}-ar${__GCC_VER_SUFFIX} ${GNU_MACHINE}-ar ${TOOLCHAIN_COMPILER_LOCATION_HINT})
+  find_program(CMAKE_AR NAMES ${GNU_MACHINE}-ar${__GCC_VER_SUFFIX} ${GNU_MACHINE}-ar PATHS ${TOOLCHAIN_COMPILER_LOCATION_HINT})
 else()
   #message(WARNING "CMAKE_AR=${CMAKE_AR} is defined")
 endif()
diff --git a/platforms/linux/riscv64-071-gcc.toolchain.cmake b/platforms/linux/riscv64-071-gcc.toolchain.cmake
index 53e4a7fced25..05420065701b 100644
--- a/platforms/linux/riscv64-071-gcc.toolchain.cmake
+++ b/platforms/linux/riscv64-071-gcc.toolchain.cmake
@@ -4,5 +4,54 @@ set(CMAKE_SYSTEM_PROCESSOR riscv64)
 set(CMAKE_CXX_COMPILER riscv64-unknown-linux-gnu-g++)
 set(CMAKE_C_COMPILER  riscv64-unknown-linux-gnu-gcc)
 
-set(CMAKE_CXX_FLAGS_INIT "-march=rv64gcv -mabi=lp64d -D__riscv_vector_071")
-set(CMAKE_C_FLAGS_INIT "-march=rv64gcv -mabi=lp64d -D__riscv_vector_071")
+# MangoPi MQ Pro - C906FD, C906FDV
+# Lichee Pi 4A - C910, C910V (?)
+# CanMV K230 - C908, C908V
+
+# See https://github.com/T-head-Semi/gcc/blob/xuantie-gcc-10.4.0/gcc/config/riscv/riscv-cores.def
+
+set(_enable_vector OFF)
+if(CORE STREQUAL "C906FD")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d  -mtune=c906fd")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d  -mtune=c906fd")
+elseif(CORE STREQUAL "C906FDV")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d  -mtune=c906fd")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d  -mtune=c906fd")
+  # Disabled due to limited 64-bit SEW support
+  # set(_enable_vector ON)
+elseif(CORE STREQUAL "C908")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c908 -mabi=lp64d  -mtune=c908")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c908 -mabi=lp64d  -mtune=c908")
+elseif(CORE STREQUAL "C908V")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c908v -mabi=lp64d  -mtune=c908")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c908v -mabi=lp64d  -mtune=c908")
+  set(_enable_vector ON) # RVV 1.0
+elseif(CORE STREQUAL "C910")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c910 -mabi=lp64d -mtune=c910")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c910 -mabi=lp64d -mtune=c910")
+elseif(CORE STREQUAL "C910V")
+  set(CMAKE_C_FLAGS_INIT "-march=rv64imafdcv0p7xthead -mabi=lp64d")
+  set(CMAKE_CXX_FLAGS_INIT "-march=rv64imafdcv0p7xthead -mabi=lp64d")
+  set(_enable_vector ON) # RVV 0.7.1
+elseif(CORE STREQUAL "C920")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c920 -mabi=lp64d  -mtune=c920")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c920 -mabi=lp64d  -mtune=c920")
+  set(_enable_vector ON) # RVV 0.7.1
+elseif(CORE STREQUAL "C920V2")
+  set(CMAKE_C_FLAGS_INIT "-mcpu=c920v2 -mabi=lp64d  -mtune=c920v2")
+  set(CMAKE_CXX_FLAGS_INIT "-mcpu=c920v2 -mabi=lp64d  -mtune=c920v2")
+  set(_enable_vector ON) # RVV 1.0
+else()
+  set(CMAKE_C_FLAGS_INIT "-march=rv64imafdc_zihintpause_zfh_zba_zbb_zbc_zbs_xtheadc -mabi=lp64d")
+  set(CMAKE_CXX_FLAGS_INIT "-march=rv64imafdc_zihintpause_zfh_zba_zbb_zbc_zbs_xtheadc -mabi=lp64d")
+endif()
+
+if(_enable_vector)
+  set(CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS_INIT} -D__riscv_vector_071 -mrvv-vector-bits=128")
+  set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -D__riscv_vector_071 -mrvv-vector-bits=128")
+endif()
+
+if(ENABLE_GCOV)
+  set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fprofile-arcs -ftest-coverage")
+  set(CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS_INIT} -fprofile-arcs -ftest-coverage")
+endif()
diff --git a/platforms/linux/riscv64-andes-gcc.toolchain.cmake b/platforms/linux/riscv64-andes-gcc.toolchain.cmake
new file mode 100755
index 000000000000..9b9c0b524678
--- /dev/null
+++ b/platforms/linux/riscv64-andes-gcc.toolchain.cmake
@@ -0,0 +1,25 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
+
+message(STATUS "RISCV: $ENV{RISCV}")
+message(STATUS "RISCV_GCC_INSTALL_ROOT: $ENV{RISCV_GCC_INSTALL_ROOT}")
+
+set(RISCV_GCC_INSTALL_ROOT $ENV{RISCV} CACHE PATH "Path to GCC for RISC-V cross compiler installation directory")
+
+set(CMAKE_C_COMPILER  ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-gcc)
+set(CMAKE_CXX_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-g++)
+
+# fix toolchain macro
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ANDES=1")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ANDES=1")
+
+# enable rvp
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gc -mext-dsp")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc -mext-dsp")
+
+# fix segment address
+
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-Ttext-segment=0x50000")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Ttext-segment=0x50000")
diff --git a/platforms/maven/opencv-it/pom.xml b/platforms/maven/opencv-it/pom.xml
index 8e987002d8b8..6b0e0f54d3c8 100644
--- a/platforms/maven/opencv-it/pom.xml
+++ b/platforms/maven/opencv-it/pom.xml
@@ -4,7 +4,7 @@
     <parent>
         <groupId>org.opencv</groupId>
         <artifactId>opencv-parent</artifactId>
-        <version>4.8.0</version>
+        <version>4.10.0</version>
     </parent>
     <groupId>org.opencv</groupId>
     <artifactId>opencv-it</artifactId>
diff --git a/platforms/maven/opencv/pom.xml b/platforms/maven/opencv/pom.xml
index 2aba1156f379..8e5221f0a987 100644
--- a/platforms/maven/opencv/pom.xml
+++ b/platforms/maven/opencv/pom.xml
@@ -4,7 +4,7 @@
     <parent>
         <groupId>org.opencv</groupId>
         <artifactId>opencv-parent</artifactId>
-        <version>4.8.0</version>
+        <version>4.10.0</version>
     </parent>
     <groupId>org.opencv</groupId>
     <artifactId>opencv</artifactId>
diff --git a/platforms/maven/pom.xml b/platforms/maven/pom.xml
index b6b57ed9168e..c0b4f776973e 100644
--- a/platforms/maven/pom.xml
+++ b/platforms/maven/pom.xml
@@ -3,7 +3,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>org.opencv</groupId>
     <artifactId>opencv-parent</artifactId>
-    <version>4.8.0</version>
+    <version>4.10.0</version>
     <packaging>pom</packaging>
     <name>OpenCV Parent POM</name>
     <licenses>
diff --git a/samples/_winpack_run_python_sample.cmd b/samples/_winpack_run_python_sample.cmd
index 1ee8bb9a134c..bfb405dc0af3 100644
--- a/samples/_winpack_run_python_sample.cmd
+++ b/samples/_winpack_run_python_sample.cmd
@@ -23,6 +23,8 @@ IF %ERRORLEVEL% EQU 0 (
   GOTO :PYTHON_FOUND
 )
 
+CALL :QUERY_PYTHON 3.12
+IF %ERRORLEVEL% EQU 0 GOTO :PYTHON_FOUND
 CALL :QUERY_PYTHON 3.11
 IF %ERRORLEVEL% EQU 0 GOTO :PYTHON_FOUND
 CALL :QUERY_PYTHON 3.10
diff --git a/samples/android/15-puzzle/build.gradle.in b/samples/android/15-puzzle/build.gradle.in
index ad5c9f93f9ef..e7f6b4af5653 100644
--- a/samples/android/15-puzzle/build.gradle.in
+++ b/samples/android/15-puzzle/build.gradle.in
@@ -1,6 +1,7 @@
 apply plugin: 'com.android.application'
 
 android {
+    namespace 'org.opencv.samples.puzzle15'
     compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
     defaultConfig {
         applicationId "org.opencv.samples.puzzle15"
@@ -18,7 +19,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -27,5 +27,11 @@ android {
 
 dependencies {
     //implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation project(':opencv')
+    if (gradle.opencv_source == 'sdk_path') {
+        println 'Using OpenCV from SDK'
+        implementation project(':opencv')
+    } else if (gradle.opencv_source == 'maven_local' || gradle.opencv_source == 'maven_central') {
+        println 'Using OpenCV from Maven repo'
+        implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
+    }
 }
diff --git a/samples/android/15-puzzle/src/org/opencv/samples/puzzle15/Puzzle15Activity.java b/samples/android/15-puzzle/src/org/opencv/samples/puzzle15/Puzzle15Activity.java
index db888b9c7807..9a075cf91fbf 100644
--- a/samples/android/15-puzzle/src/org/opencv/samples/puzzle15/Puzzle15Activity.java
+++ b/samples/android/15-puzzle/src/org/opencv/samples/puzzle15/Puzzle15Activity.java
@@ -1,8 +1,6 @@
 package org.opencv.samples.puzzle15;
 
-import org.opencv.android.BaseLoaderCallback;
 import org.opencv.android.CameraActivity;
-import org.opencv.android.LoaderCallbackInterface;
 import org.opencv.android.OpenCVLoader;
 import org.opencv.core.Mat;
 import org.opencv.android.CameraBridgeViewBase;
@@ -16,6 +14,7 @@
 import android.view.MotionEvent;
 import android.view.View;
 import android.view.WindowManager;
+import android.widget.Toast;
 
 import java.util.Collections;
 import java.util.List;
@@ -29,36 +28,22 @@ public class Puzzle15Activity extends CameraActivity implements CvCameraViewList
     private MenuItem             mItemHideNumbers;
     private MenuItem             mItemStartNewGame;
 
-
     private int                  mGameWidth;
     private int                  mGameHeight;
 
-    private BaseLoaderCallback mLoaderCallback = new BaseLoaderCallback(this) {
-
-        @Override
-        public void onManagerConnected(int status) {
-            switch (status) {
-                case LoaderCallbackInterface.SUCCESS:
-                {
-                    Log.i(TAG, "OpenCV loaded successfully");
-
-                    /* Now enable camera view to start receiving frames */
-                    mOpenCvCameraView.setOnTouchListener(Puzzle15Activity.this);
-                    mOpenCvCameraView.enableView();
-                } break;
-                default:
-                {
-                    super.onManagerConnected(status);
-                } break;
-            }
-        }
-    };
-
     @Override
     public void onCreate(Bundle savedInstanceState) {
         super.onCreate(savedInstanceState);
         getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
 
+        if (OpenCVLoader.initLocal()) {
+            Log.i(TAG, "OpenCV loaded successfully");
+        } else {
+            Log.e(TAG, "OpenCV initialization failed!");
+            (Toast.makeText(this, "OpenCV initialization failed!", Toast.LENGTH_LONG)).show();
+            return;
+        }
+
         Log.d(TAG, "Creating and setting view");
         mOpenCvCameraView = (CameraBridgeViewBase) new JavaCameraView(this, -1);
         setContentView(mOpenCvCameraView);
@@ -80,12 +65,9 @@ public void onPause()
     public void onResume()
     {
         super.onResume();
-        if (!OpenCVLoader.initDebug()) {
-            Log.d(TAG, "Internal OpenCV library not found. Using OpenCV Manager for initialization");
-            OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_3_0_0, this, mLoaderCallback);
-        } else {
-            Log.d(TAG, "OpenCV library found inside package. Using it!");
-            mLoaderCallback.onManagerConnected(LoaderCallbackInterface.SUCCESS);
+        if (mOpenCvCameraView != null) {
+            mOpenCvCameraView.setOnTouchListener(Puzzle15Activity.this);
+            mOpenCvCameraView.enableView();
         }
     }
 
diff --git a/samples/android/CMakeLists.txt b/samples/android/CMakeLists.txt
index 360f1aa2cb0a..fe6318f48eff 100644
--- a/samples/android/CMakeLists.txt
+++ b/samples/android/CMakeLists.txt
@@ -8,9 +8,12 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-declarations)
 
 add_subdirectory(15-puzzle)
 add_subdirectory(face-detection)
+add_subdirectory(qr-detection)
 add_subdirectory(image-manipulations)
 add_subdirectory(camera-calibration)
 add_subdirectory(color-blob-detection)
+add_subdirectory(mobilenet-objdetect)
+add_subdirectory(video-recorder)
 add_subdirectory(tutorial-1-camerapreview)
 add_subdirectory(tutorial-2-mixedprocessing)
 add_subdirectory(tutorial-3-cameracontrol)
diff --git a/samples/android/build.gradle.in b/samples/android/build.gradle.in
index f36fe216f5a9..d1598991321f 100644
--- a/samples/android/build.gradle.in
+++ b/samples/android/build.gradle.in
@@ -19,6 +19,11 @@ allprojects {
     repositories {
         google()
         jcenter()
+        if (gradle.opencv_source == "maven_local") {
+            maven {
+                url gradle.opencv_maven_path
+            }
+        }
     }
 }
 
diff --git a/samples/android/camera-calibration/build.gradle.in b/samples/android/camera-calibration/build.gradle.in
index d62b15186789..8c97fb22ab9b 100644
--- a/samples/android/camera-calibration/build.gradle.in
+++ b/samples/android/camera-calibration/build.gradle.in
@@ -1,6 +1,7 @@
 apply plugin: 'com.android.application'
 
 android {
+    namespace 'org.opencv.samples.cameracalibration'
     compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
     defaultConfig {
         applicationId "org.opencv.samples.cameracalibration"
@@ -18,7 +19,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -27,5 +27,11 @@ android {
 
 dependencies {
     //implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation project(':opencv')
+    if (gradle.opencv_source == "sdk_path") {
+        println 'Using OpenCV from SDK'
+        implementation project(':opencv')
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+        println 'Using OpenCV from Maven repo'
+        implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
+    }
 }
diff --git a/samples/android/camera-calibration/src/org/opencv/samples/cameracalibration/CameraCalibrationActivity.java b/samples/android/camera-calibration/src/org/opencv/samples/cameracalibration/CameraCalibrationActivity.java
index 30a6ad9c1d48..eb2855c2db10 100644
--- a/samples/android/camera-calibration/src/org/opencv/samples/cameracalibration/CameraCalibrationActivity.java
+++ b/samples/android/camera-calibration/src/org/opencv/samples/cameracalibration/CameraCalibrationActivity.java
@@ -13,12 +13,10 @@
 
 package org.opencv.samples.cameracalibration;
 
-import org.opencv.android.BaseLoaderCallback;
 import org.opencv.android.CameraActivity;
 import org.opencv.android.CameraBridgeViewBase;
 import org.opencv.android.CameraBridgeViewBase.CvCameraViewFrame;
 import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener2;
-import org.opencv.android.LoaderCallbackInterface;
 import org.opencv.android.OpenCVLoader;
 import org.opencv.core.Mat;
 
@@ -49,24 +47,6 @@ public class CameraCalibrationActivity extends CameraActivity implements CvCamer
     private int mWidth;
     private int mHeight;
 
-    private BaseLoaderCallback mLoaderCallback = new BaseLoaderCallback(this) {
-        @Override
-        public void onManagerConnected(int status) {
-            switch (status) {
-            case LoaderCallbackInterface.SUCCESS:
-            {
-                Log.i(TAG, "OpenCV loaded successfully");
-                mOpenCvCameraView.enableView();
-                mOpenCvCameraView.setOnTouchListener(CameraCalibrationActivity.this);
-            } break;
-            default:
-            {
-                super.onManagerConnected(status);
-            } break;
-            }
-        }
-    };
-
     public CameraCalibrationActivity() {
         Log.i(TAG, "Instantiated new " + this.getClass());
     }
@@ -75,6 +55,15 @@ public CameraCalibrationActivity() {
     public void onCreate(Bundle savedInstanceState) {
         Log.i(TAG, "called onCreate");
         super.onCreate(savedInstanceState);
+
+        if (OpenCVLoader.initLocal()) {
+            Log.i(TAG, "OpenCV loaded successfully");
+        } else {
+            Log.e(TAG, "OpenCV initialization failed!");
+            (Toast.makeText(this, "OpenCV initialization failed!", Toast.LENGTH_LONG)).show();
+            return;
+        }
+
         getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
 
         setContentView(R.layout.camera_calibration_surface_view);
@@ -96,12 +85,9 @@ public void onPause()
     public void onResume()
     {
         super.onResume();
-        if (!OpenCVLoader.initDebug()) {
-            Log.d(TAG, "Internal OpenCV library not found. Using OpenCV Manager for initialization");
-            OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_3_0_0, this, mLoaderCallback);
-        } else {
-            Log.d(TAG, "OpenCV library found inside package. Using it!");
-            mLoaderCallback.onManagerConnected(LoaderCallbackInterface.SUCCESS);
+        if (mOpenCvCameraView != null) {
+            mOpenCvCameraView.enableView();
+            mOpenCvCameraView.setOnTouchListener(CameraCalibrationActivity.this);
         }
     }
 
@@ -136,23 +122,22 @@ public boolean onPrepareOptionsMenu (Menu menu) {
 
     @Override
     public boolean onOptionsItemSelected(MenuItem item) {
-        switch (item.getItemId()) {
-        case R.id.calibration:
+        if (item.getItemId() == R.id.calibration) {
             mOnCameraFrameRender =
                 new OnCameraFrameRender(new CalibrationFrameRender(mCalibrator));
             item.setChecked(true);
             return true;
-        case R.id.undistortion:
+        } else if (item.getItemId() == R.id.undistortion) {
             mOnCameraFrameRender =
                 new OnCameraFrameRender(new UndistortionFrameRender(mCalibrator));
             item.setChecked(true);
             return true;
-        case R.id.comparison:
+        } else if (item.getItemId() == R.id.comparison) {
             mOnCameraFrameRender =
                 new OnCameraFrameRender(new ComparisonFrameRender(mCalibrator, mWidth, mHeight, getResources()));
             item.setChecked(true);
             return true;
-        case R.id.calibrate:
+        } else if (item.getItemId() == R.id.calibrate) {
             final Resources res = getResources();
             if (mCalibrator.getCornersBufferSize() < 2) {
                 (Toast.makeText(this, res.getString(R.string.more_samples), Toast.LENGTH_SHORT)).show();
@@ -196,7 +181,7 @@ protected void onPostExecute(Void result) {
                 }
             }.execute();
             return true;
-        default:
+        } else {
             return super.onOptionsItemSelected(item);
         }
     }
diff --git a/samples/android/color-blob-detection/build.gradle.in b/samples/android/color-blob-detection/build.gradle.in
index 8900e25c1652..bd2933897026 100644
--- a/samples/android/color-blob-detection/build.gradle.in
+++ b/samples/android/color-blob-detection/build.gradle.in
@@ -1,6 +1,7 @@
 apply plugin: 'com.android.application'
 
 android {
+    namespace 'org.opencv.samples.colorblobdetect'
     compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
     defaultConfig {
         applicationId "org.opencv.samples.colorblobdetect"
@@ -18,7 +19,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -27,5 +27,11 @@ android {
 
 dependencies {
     //implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation project(':opencv')
+    if (gradle.opencv_source == "sdk_path") {
+        println 'Using OpenCV from from SDK'
+        implementation project(':opencv')
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+        println 'Using OpenCV from Maven repo'
+        implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
+    }
 }
diff --git a/samples/android/color-blob-detection/src/org/opencv/samples/colorblobdetect/ColorBlobDetectionActivity.java b/samples/android/color-blob-detection/src/org/opencv/samples/colorblobdetect/ColorBlobDetectionActivity.java
index a656dc501985..5b3d2971c37c 100644
--- a/samples/android/color-blob-detection/src/org/opencv/samples/colorblobdetect/ColorBlobDetectionActivity.java
+++ b/samples/android/color-blob-detection/src/org/opencv/samples/colorblobdetect/ColorBlobDetectionActivity.java
@@ -3,10 +3,8 @@
 import java.util.Collections;
 import java.util.List;
 
-import org.opencv.android.BaseLoaderCallback;
 import org.opencv.android.CameraActivity;
 import org.opencv.android.CameraBridgeViewBase.CvCameraViewFrame;
-import org.opencv.android.LoaderCallbackInterface;
 import org.opencv.android.OpenCVLoader;
 import org.opencv.core.Core;
 import org.opencv.core.CvType;
@@ -28,6 +26,7 @@
 import android.view.WindowManager;
 import android.view.View.OnTouchListener;
 import android.view.SurfaceView;
+import android.widget.Toast;
 
 public class ColorBlobDetectionActivity extends CameraActivity implements OnTouchListener, CvCameraViewListener2 {
     private static final String  TAG              = "OCVSample::Activity";
@@ -43,24 +42,6 @@ public class ColorBlobDetectionActivity extends CameraActivity implements OnTouc
 
     private CameraBridgeViewBase mOpenCvCameraView;
 
-    private BaseLoaderCallback  mLoaderCallback = new BaseLoaderCallback(this) {
-        @Override
-        public void onManagerConnected(int status) {
-            switch (status) {
-                case LoaderCallbackInterface.SUCCESS:
-                {
-                    Log.i(TAG, "OpenCV loaded successfully");
-                    mOpenCvCameraView.enableView();
-                    mOpenCvCameraView.setOnTouchListener(ColorBlobDetectionActivity.this);
-                } break;
-                default:
-                {
-                    super.onManagerConnected(status);
-                } break;
-            }
-        }
-    };
-
     public ColorBlobDetectionActivity() {
         Log.i(TAG, "Instantiated new " + this.getClass());
     }
@@ -70,6 +51,15 @@ public ColorBlobDetectionActivity() {
     public void onCreate(Bundle savedInstanceState) {
         Log.i(TAG, "called onCreate");
         super.onCreate(savedInstanceState);
+
+        if (OpenCVLoader.initLocal()) {
+            Log.i(TAG, "OpenCV loaded successfully");
+        } else {
+            Log.e(TAG, "OpenCV initialization failed!");
+            (Toast.makeText(this, "OpenCV initialization failed!", Toast.LENGTH_LONG)).show();
+            return;
+        }
+
         requestWindowFeature(Window.FEATURE_NO_TITLE);
         getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
 
@@ -92,12 +82,9 @@ public void onPause()
     public void onResume()
     {
         super.onResume();
-        if (!OpenCVLoader.initDebug()) {
-            Log.d(TAG, "Internal OpenCV library not found. Using OpenCV Manager for initialization");
-            OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_3_0_0, this, mLoaderCallback);
-        } else {
-            Log.d(TAG, "OpenCV library found inside package. Using it!");
-            mLoaderCallback.onManagerConnected(LoaderCallbackInterface.SUCCESS);
+        if (mOpenCvCameraView != null) {
+            mOpenCvCameraView.enableView();
+            mOpenCvCameraView.setOnTouchListener(ColorBlobDetectionActivity.this);
         }
     }
 
diff --git a/samples/android/face-detection/CMakeLists.txt b/samples/android/face-detection/CMakeLists.txt
index bbcfa32f0830..3e246f699823 100644
--- a/samples/android/face-detection/CMakeLists.txt
+++ b/samples/android/face-detection/CMakeLists.txt
@@ -1,12 +1,16 @@
 set(sample example-face-detection)
 
-if(BUILD_FAT_JAVA_LIB)
-  set(native_deps opencv_java)
-else()
-  set(native_deps opencv_objdetect)
-endif()
+ocv_download(FILENAME "face_detection_yunet_2023mar.onnx"
+             HASH "4ae92eeb150c82ce15ac80738b3b8167"
+             URL
+               "${OPENCV_FACE_DETECT_YN_URL}"
+               "$ENV{OPENCV_FACE_DETECT_YN_URL}"
+               "https://media.githubusercontent.com/media/opencv/opencv_zoo/main/models/face_detection_yunet/face_detection_yunet_2023mar.onnx"
+             DESTINATION_DIR "${CMAKE_CURRENT_LIST_DIR}/res/raw"
+             ID OPENCV_FACE_DETECT_YN
+             STATUS res)
 
-add_android_project(${sample} "${CMAKE_CURRENT_SOURCE_DIR}" LIBRARY_DEPS "${OPENCV_ANDROID_LIB_DIR}" SDK_TARGET 11 "${ANDROID_SDK_TARGET}" NATIVE_DEPS ${native_deps})
+add_android_project(${sample} "${CMAKE_CURRENT_SOURCE_DIR}" LIBRARY_DEPS "${OPENCV_ANDROID_LIB_DIR}" SDK_TARGET 11 "${ANDROID_SDK_TARGET}")
 if(TARGET ${sample})
   add_dependencies(opencv_android_examples ${sample})
 endif()
diff --git a/samples/android/face-detection/build.gradle.in b/samples/android/face-detection/build.gradle.in
index 9e3b187a8453..72dabd664dc4 100644
--- a/samples/android/face-detection/build.gradle.in
+++ b/samples/android/face-detection/build.gradle.in
@@ -1,6 +1,7 @@
 apply plugin: 'com.android.application'
 
 android {
+    namespace 'org.opencv.samples.facedetect'
     compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
     defaultConfig {
         applicationId "org.opencv.samples.facedetect"
@@ -8,13 +9,6 @@ android {
         targetSdkVersion @ANDROID_TARGET_SDK_VERSION@
         versionCode 301
         versionName "3.01"
-
-        externalNativeBuild {
-            cmake {
-                arguments "-DOpenCV_DIR=" + project(':opencv').projectDir + "/@ANDROID_PROJECT_JNI_PATH@"@OPENCV_ANDROID_CMAKE_EXTRA_ARGS@
-                targets "detection_based_tracker"
-            }
-        }
     }
     buildTypes {
         release {
@@ -25,19 +19,19 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
     }
-    externalNativeBuild {
-        cmake {
-             path '@ANDROID_SAMPLE_JNI_PATH@/CMakeLists.txt'
-        }
-    }
 }
 
 dependencies {
     //implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation project(':opencv')
+    if (gradle.opencv_source == "sdk_path") {
+        println 'Using OpenCV from from SDK'
+        implementation project(':opencv')
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+        println 'Using OpenCV from Maven repo'
+        implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
+    }
 }
diff --git a/samples/android/face-detection/gradle/AndroidManifest.xml b/samples/android/face-detection/gradle/AndroidManifest.xml
index 157b318d3b6f..f018df2eecf6 100644
--- a/samples/android/face-detection/gradle/AndroidManifest.xml
+++ b/samples/android/face-detection/gradle/AndroidManifest.xml
@@ -9,9 +9,8 @@
 
         <activity
                   android:exported="true"
-                  android:name="FdActivity"
+                  android:name="FaceDetectActivity"
                   android:label="@string/app_name"
-                  android:screenOrientation="landscape"
                   android:configChanges="keyboardHidden|orientation">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
diff --git a/samples/android/face-detection/jni/Android.mk b/samples/android/face-detection/jni/Android.mk
deleted file mode 100644
index e882cac0c1ee..000000000000
--- a/samples/android/face-detection/jni/Android.mk
+++ /dev/null
@@ -1,23 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-
-include $(CLEAR_VARS)
-
-#OPENCV_INSTALL_MODULES:=off
-#OPENCV_LIB_TYPE:=SHARED
-ifdef OPENCV_ANDROID_SDK
-  ifneq ("","$(wildcard $(OPENCV_ANDROID_SDK)/OpenCV.mk)")
-    include ${OPENCV_ANDROID_SDK}/OpenCV.mk
-  else
-    include ${OPENCV_ANDROID_SDK}/sdk/native/jni/OpenCV.mk
-  endif
-else
-  include ../../sdk/native/jni/OpenCV.mk
-endif
-
-LOCAL_SRC_FILES  := DetectionBasedTracker_jni.cpp
-LOCAL_C_INCLUDES += $(LOCAL_PATH)
-LOCAL_LDLIBS     += -llog -ldl
-
-LOCAL_MODULE     := detection_based_tracker
-
-include $(BUILD_SHARED_LIBRARY)
diff --git a/samples/android/face-detection/jni/Application.mk b/samples/android/face-detection/jni/Application.mk
deleted file mode 100644
index 4fffcb2838f5..000000000000
--- a/samples/android/face-detection/jni/Application.mk
+++ /dev/null
@@ -1,4 +0,0 @@
-APP_STL := gnustl_static
-APP_CPPFLAGS := -frtti -fexceptions
-APP_ABI := armeabi-v7a
-APP_PLATFORM := android-8
diff --git a/samples/android/face-detection/jni/CMakeLists.txt b/samples/android/face-detection/jni/CMakeLists.txt
deleted file mode 100644
index 15f8f4db385d..000000000000
--- a/samples/android/face-detection/jni/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-cmake_minimum_required(VERSION 3.6)
-
-set(target detection_based_tracker)
-project(${target} CXX)
-
-set(ANDROID_OPENCV_COMPONENTS "opencv_java" CACHE STRING "")
-message(STATUS "ANDROID_ABI=${ANDROID_ABI}")
-find_package(OpenCV REQUIRED COMPONENTS ${ANDROID_OPENCV_COMPONENTS})
-
-file(GLOB srcs *.cpp *.c)
-file(GLOB hdrs *.hpp *.h)
-
-include_directories("${CMAKE_CURRENT_LIST_DIR}")
-add_library(${target} SHARED ${srcs} ${hdrs})
-target_link_libraries(${target} ${ANDROID_OPENCV_COMPONENTS})
diff --git a/samples/android/face-detection/jni/DetectionBasedTracker_jni.cpp b/samples/android/face-detection/jni/DetectionBasedTracker_jni.cpp
deleted file mode 100644
index 6ce36bc527a1..000000000000
--- a/samples/android/face-detection/jni/DetectionBasedTracker_jni.cpp
+++ /dev/null
@@ -1,251 +0,0 @@
-#include <DetectionBasedTracker_jni.h>
-#include <opencv2/core.hpp>
-#include <opencv2/objdetect.hpp>
-
-#include <string>
-#include <vector>
-
-#include <android/log.h>
-
-#define LOG_TAG "FaceDetection/DetectionBasedTracker"
-#define LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, __VA_ARGS__))
-
-using namespace std;
-using namespace cv;
-
-inline void vector_Rect_to_Mat(vector<Rect>& v_rect, Mat& mat)
-{
-    mat = Mat(v_rect, true);
-}
-
-class CascadeDetectorAdapter: public DetectionBasedTracker::IDetector
-{
-public:
-    CascadeDetectorAdapter(cv::Ptr<cv::CascadeClassifier> detector):
-            IDetector(),
-            Detector(detector)
-    {
-        LOGD("CascadeDetectorAdapter::Detect::Detect");
-        CV_Assert(detector);
-    }
-
-    void detect(const cv::Mat &Image, std::vector<cv::Rect> &objects)
-    {
-        LOGD("CascadeDetectorAdapter::Detect: begin");
-        LOGD("CascadeDetectorAdapter::Detect: scaleFactor=%.2f, minNeighbours=%d, minObjSize=(%dx%d), maxObjSize=(%dx%d)", scaleFactor, minNeighbours, minObjSize.width, minObjSize.height, maxObjSize.width, maxObjSize.height);
-        Detector->detectMultiScale(Image, objects, scaleFactor, minNeighbours, 0, minObjSize, maxObjSize);
-        LOGD("CascadeDetectorAdapter::Detect: end");
-    }
-
-    virtual ~CascadeDetectorAdapter()
-    {
-        LOGD("CascadeDetectorAdapter::Detect::~Detect");
-    }
-
-private:
-    CascadeDetectorAdapter();
-    cv::Ptr<cv::CascadeClassifier> Detector;
-};
-
-struct DetectorAgregator
-{
-    cv::Ptr<CascadeDetectorAdapter> mainDetector;
-    cv::Ptr<CascadeDetectorAdapter> trackingDetector;
-
-    cv::Ptr<DetectionBasedTracker> tracker;
-    DetectorAgregator(cv::Ptr<CascadeDetectorAdapter>& _mainDetector, cv::Ptr<CascadeDetectorAdapter>& _trackingDetector):
-            mainDetector(_mainDetector),
-            trackingDetector(_trackingDetector)
-    {
-        CV_Assert(_mainDetector);
-        CV_Assert(_trackingDetector);
-
-        DetectionBasedTracker::Parameters DetectorParams;
-        tracker = makePtr<DetectionBasedTracker>(mainDetector, trackingDetector, DetectorParams);
-    }
-};
-
-JNIEXPORT jlong JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeCreateObject
-(JNIEnv * jenv, jclass, jstring jFileName, jint faceSize)
-{
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeCreateObject enter");
-    const char* jnamestr = jenv->GetStringUTFChars(jFileName, NULL);
-    string stdFileName(jnamestr);
-    jlong result = 0;
-
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeCreateObject");
-
-    try
-    {
-        cv::Ptr<CascadeDetectorAdapter> mainDetector = makePtr<CascadeDetectorAdapter>(
-            makePtr<CascadeClassifier>(stdFileName));
-        cv::Ptr<CascadeDetectorAdapter> trackingDetector = makePtr<CascadeDetectorAdapter>(
-            makePtr<CascadeClassifier>(stdFileName));
-        result = (jlong)new DetectorAgregator(mainDetector, trackingDetector);
-        if (faceSize > 0)
-        {
-            mainDetector->setMinObjectSize(Size(faceSize, faceSize));
-            //trackingDetector->setMinObjectSize(Size(faceSize, faceSize));
-        }
-    }
-    catch(const cv::Exception& e)
-    {
-        LOGD("nativeCreateObject caught cv::Exception: %s", e.what());
-        jclass je = jenv->FindClass("org/opencv/core/CvException");
-        if(!je)
-            je = jenv->FindClass("java/lang/Exception");
-        jenv->ThrowNew(je, e.what());
-    }
-    catch (...)
-    {
-        LOGD("nativeCreateObject caught unknown exception");
-        jclass je = jenv->FindClass("java/lang/Exception");
-        jenv->ThrowNew(je, "Unknown exception in JNI code of DetectionBasedTracker.nativeCreateObject()");
-        return 0;
-    }
-
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeCreateObject exit");
-    return result;
-}
-
-JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeDestroyObject
-(JNIEnv * jenv, jclass, jlong thiz)
-{
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeDestroyObject");
-
-    try
-    {
-        if(thiz != 0)
-        {
-            ((DetectorAgregator*)thiz)->tracker->stop();
-            delete (DetectorAgregator*)thiz;
-        }
-    }
-    catch(const cv::Exception& e)
-    {
-        LOGD("nativeestroyObject caught cv::Exception: %s", e.what());
-        jclass je = jenv->FindClass("org/opencv/core/CvException");
-        if(!je)
-            je = jenv->FindClass("java/lang/Exception");
-        jenv->ThrowNew(je, e.what());
-    }
-    catch (...)
-    {
-        LOGD("nativeDestroyObject caught unknown exception");
-        jclass je = jenv->FindClass("java/lang/Exception");
-        jenv->ThrowNew(je, "Unknown exception in JNI code of DetectionBasedTracker.nativeDestroyObject()");
-    }
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeDestroyObject exit");
-}
-
-JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeStart
-(JNIEnv * jenv, jclass, jlong thiz)
-{
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeStart");
-
-    try
-    {
-        ((DetectorAgregator*)thiz)->tracker->run();
-    }
-    catch(const cv::Exception& e)
-    {
-        LOGD("nativeStart caught cv::Exception: %s", e.what());
-        jclass je = jenv->FindClass("org/opencv/core/CvException");
-        if(!je)
-            je = jenv->FindClass("java/lang/Exception");
-        jenv->ThrowNew(je, e.what());
-    }
-    catch (...)
-    {
-        LOGD("nativeStart caught unknown exception");
-        jclass je = jenv->FindClass("java/lang/Exception");
-        jenv->ThrowNew(je, "Unknown exception in JNI code of DetectionBasedTracker.nativeStart()");
-    }
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeStart exit");
-}
-
-JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeStop
-(JNIEnv * jenv, jclass, jlong thiz)
-{
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeStop");
-
-    try
-    {
-        ((DetectorAgregator*)thiz)->tracker->stop();
-    }
-    catch(const cv::Exception& e)
-    {
-        LOGD("nativeStop caught cv::Exception: %s", e.what());
-        jclass je = jenv->FindClass("org/opencv/core/CvException");
-        if(!je)
-            je = jenv->FindClass("java/lang/Exception");
-        jenv->ThrowNew(je, e.what());
-    }
-    catch (...)
-    {
-        LOGD("nativeStop caught unknown exception");
-        jclass je = jenv->FindClass("java/lang/Exception");
-        jenv->ThrowNew(je, "Unknown exception in JNI code of DetectionBasedTracker.nativeStop()");
-    }
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeStop exit");
-}
-
-JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeSetFaceSize
-(JNIEnv * jenv, jclass, jlong thiz, jint faceSize)
-{
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeSetFaceSize -- BEGIN");
-
-    try
-    {
-        if (faceSize > 0)
-        {
-            ((DetectorAgregator*)thiz)->mainDetector->setMinObjectSize(Size(faceSize, faceSize));
-            //((DetectorAgregator*)thiz)->trackingDetector->setMinObjectSize(Size(faceSize, faceSize));
-        }
-    }
-    catch(const cv::Exception& e)
-    {
-        LOGD("nativeStop caught cv::Exception: %s", e.what());
-        jclass je = jenv->FindClass("org/opencv/core/CvException");
-        if(!je)
-            je = jenv->FindClass("java/lang/Exception");
-        jenv->ThrowNew(je, e.what());
-    }
-    catch (...)
-    {
-        LOGD("nativeSetFaceSize caught unknown exception");
-        jclass je = jenv->FindClass("java/lang/Exception");
-        jenv->ThrowNew(je, "Unknown exception in JNI code of DetectionBasedTracker.nativeSetFaceSize()");
-    }
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeSetFaceSize -- END");
-}
-
-
-JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeDetect
-(JNIEnv * jenv, jclass, jlong thiz, jlong imageGray, jlong faces)
-{
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeDetect");
-
-    try
-    {
-        vector<Rect> RectFaces;
-        ((DetectorAgregator*)thiz)->tracker->process(*((Mat*)imageGray));
-        ((DetectorAgregator*)thiz)->tracker->getObjects(RectFaces);
-        *((Mat*)faces) = Mat(RectFaces, true);
-    }
-    catch(const cv::Exception& e)
-    {
-        LOGD("nativeCreateObject caught cv::Exception: %s", e.what());
-        jclass je = jenv->FindClass("org/opencv/core/CvException");
-        if(!je)
-            je = jenv->FindClass("java/lang/Exception");
-        jenv->ThrowNew(je, e.what());
-    }
-    catch (...)
-    {
-        LOGD("nativeDetect caught unknown exception");
-        jclass je = jenv->FindClass("java/lang/Exception");
-        jenv->ThrowNew(je, "Unknown exception in JNI code DetectionBasedTracker.nativeDetect()");
-    }
-    LOGD("Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeDetect END");
-}
diff --git a/samples/android/face-detection/jni/DetectionBasedTracker_jni.h b/samples/android/face-detection/jni/DetectionBasedTracker_jni.h
deleted file mode 100644
index 7e0541d81021..000000000000
--- a/samples/android/face-detection/jni/DetectionBasedTracker_jni.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* DO NOT EDIT THIS FILE - it is machine generated */
-#include <jni.h>
-/* Header for class org_opencv_samples_fd_DetectionBasedTracker */
-
-#ifndef _Included_org_opencv_samples_fd_DetectionBasedTracker
-#define _Included_org_opencv_samples_fd_DetectionBasedTracker
-#ifdef __cplusplus
-extern "C" {
-#endif
-/*
- * Class:     org_opencv_samples_fd_DetectionBasedTracker
- * Method:    nativeCreateObject
- * Signature: (Ljava/lang/String;F)J
- */
-JNIEXPORT jlong JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeCreateObject
-  (JNIEnv *, jclass, jstring, jint);
-
-/*
- * Class:     org_opencv_samples_fd_DetectionBasedTracker
- * Method:    nativeDestroyObject
- * Signature: (J)V
- */
-JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeDestroyObject
-  (JNIEnv *, jclass, jlong);
-
-/*
- * Class:     org_opencv_samples_fd_DetectionBasedTracker
- * Method:    nativeStart
- * Signature: (J)V
- */
-JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeStart
-  (JNIEnv *, jclass, jlong);
-
-/*
- * Class:     org_opencv_samples_fd_DetectionBasedTracker
- * Method:    nativeStop
- * Signature: (J)V
- */
-JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeStop
-  (JNIEnv *, jclass, jlong);
-
-  /*
-   * Class:     org_opencv_samples_fd_DetectionBasedTracker
-   * Method:    nativeSetFaceSize
-   * Signature: (JI)V
-   */
-  JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeSetFaceSize
-  (JNIEnv *, jclass, jlong, jint);
-
-/*
- * Class:     org_opencv_samples_fd_DetectionBasedTracker
- * Method:    nativeDetect
- * Signature: (JJJ)V
- */
-JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_nativeDetect
-  (JNIEnv *, jclass, jlong, jlong, jlong);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/samples/android/face-detection/res/raw/lbpcascade_frontalface.xml b/samples/android/face-detection/res/raw/lbpcascade_frontalface.xml
deleted file mode 100644
index fc7648ef5138..000000000000
--- a/samples/android/face-detection/res/raw/lbpcascade_frontalface.xml
+++ /dev/null
@@ -1,1505 +0,0 @@
-<?xml version="1.0"?>
-<!--
-number of positive samples 3000
-number of negative samples 1500
--->
-<opencv_storage>
-<cascade type_id="opencv-cascade-classifier">
-  <stageType>BOOST</stageType>
-  <featureType>LBP</featureType>
-  <height>24</height>
-  <width>24</width>
-  <stageParams>
-    <boostType>GAB</boostType>
-    <minHitRate>0.9950000047683716</minHitRate>
-    <maxFalseAlarm>0.5000000000000000</maxFalseAlarm>
-    <weightTrimRate>0.9500000000000000</weightTrimRate>
-    <maxDepth>1</maxDepth>
-    <maxWeakCount>100</maxWeakCount></stageParams>
-  <featureParams>
-    <maxCatCount>256</maxCatCount></featureParams>
-  <stageNum>20</stageNum>
-  <stages>
-    <!-- stage 0 -->
-    <_>
-      <maxWeakCount>3</maxWeakCount>
-      <stageThreshold>-0.7520892024040222</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 46 -67130709 -21569 -1426120013 -1275125205 -21585
-            -16385 587145899 -24005</internalNodes>
-          <leafValues>
-            -0.6543210148811340 0.8888888955116272</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 13 -163512766 -769593758 -10027009 -262145 -514457854
-            -193593353 -524289 -1</internalNodes>
-          <leafValues>
-            -0.7739216089248657 0.7278633713722229</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 2 -363936790 -893203669 -1337948010 -136907894
-            1088782736 -134217726 -741544961 -1590337</internalNodes>
-          <leafValues>
-            -0.7068563103675842 0.6761534214019775</leafValues></_></weakClassifiers></_>
-    <!-- stage 1 -->
-    <_>
-      <maxWeakCount>4</maxWeakCount>
-      <stageThreshold>-0.4872078299522400</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 84 2147483647 1946124287 -536870913 2147450879
-            738132490 1061101567 243204619 2147446655</internalNodes>
-          <leafValues>
-            -0.8083735704421997 0.7685696482658386</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 21 2147483647 263176079 1879048191 254749487 1879048191
-            -134252545 -268435457 801111999</internalNodes>
-          <leafValues>
-            -0.7698410153388977 0.6592915654182434</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 106 -98110272 1610939566 -285484400 -850010381
-            -189334372 -1671954433 -571026695 -262145</internalNodes>
-          <leafValues>
-            -0.7506558895111084 0.5444605946540833</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 48 -798690576 -131075 1095771153 -237144073 -65569 -1
-            -216727745 -69206049</internalNodes>
-          <leafValues>
-            -0.7775990366935730 0.5465461611747742</leafValues></_></weakClassifiers></_>
-    <!-- stage 2 -->
-    <_>
-      <maxWeakCount>4</maxWeakCount>
-      <stageThreshold>-1.1592328548431396</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 47 -21585 -20549 -100818262 -738254174 -20561 -36865
-            -151016790 -134238549</internalNodes>
-          <leafValues>
-            -0.5601882934570313 0.7743113040924072</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 12 -286003217 183435247 -268994614 -421330945
-            -402686081 1090387966 -286785545 -402653185</internalNodes>
-          <leafValues>
-            -0.6124526262283325 0.6978127956390381</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 26 -50347012 970882927 -50463492 -1253377 -134218251
-            -50364513 -33619992 -172490753</internalNodes>
-          <leafValues>
-            -0.6114496588706970 0.6537628173828125</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 8 -273 -135266321 1877977738 -2088243418 -134217987
-            2146926575 -18910642 1095231247</internalNodes>
-          <leafValues>
-            -0.6854077577590942 0.5403239130973816</leafValues></_></weakClassifiers></_>
-    <!-- stage 3 -->
-    <_>
-      <maxWeakCount>5</maxWeakCount>
-      <stageThreshold>-0.7562355995178223</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 96 -1273 1870659519 -20971602 -67633153 -134250731
-            2004875127 -250 -150995969</internalNodes>
-          <leafValues>
-            -0.4051094949245453 0.7584033608436585</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 33 -868162224 -76810262 -4262145 -257 1465211989
-            -268959873 -2656269 -524289</internalNodes>
-          <leafValues>
-            -0.7388162612915039 0.5340843200683594</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 57 -12817 -49 -541103378 -152950 -38993 -20481 -1153876
-            -72478976</internalNodes>
-          <leafValues>
-            -0.6582943797111511 0.5339496731758118</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 125 -269484161 -452984961 -319816180 -1594032130 -2111
-            -990117891 -488975296 -520947741</internalNodes>
-          <leafValues>
-            -0.5981323719024658 0.5323504805564880</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 53 557787431 670265215 -1342193665 -1075892225
-            1998528318 1056964607 -33570977 -1</internalNodes>
-          <leafValues>
-            -0.6498787999153137 0.4913350641727448</leafValues></_></weakClassifiers></_>
-    <!-- stage 4 -->
-    <_>
-      <maxWeakCount>5</maxWeakCount>
-      <stageThreshold>-0.8085358142852783</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 60 -536873708 880195381 -16842788 -20971521 -176687276
-            -168427659 -16777260 -33554626</internalNodes>
-          <leafValues>
-            -0.5278195738792419 0.6946372389793396</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 7 -1 -62981529 -1090591130 805330978 -8388827 -41945787
-            -39577 -531118985</internalNodes>
-          <leafValues>
-            -0.5206505060195923 0.6329920291900635</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 98 -725287348 1347747543 -852489 -16809993 1489881036
-            -167903241 -1 -1</internalNodes>
-          <leafValues>
-            -0.7516061067581177 0.4232024252414703</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 44 -32777 1006582562 -65 935312171 -8388609 -1078198273
-            -1 733886267</internalNodes>
-          <leafValues>
-            -0.7639313936233521 0.4123568832874298</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 24 -85474705 2138828511 -1036436754 817625855
-            1123369029 -58796809 -1013468481 -194513409</internalNodes>
-          <leafValues>
-            -0.5123769044876099 0.5791834592819214</leafValues></_></weakClassifiers></_>
-    <!-- stage 5 -->
-    <_>
-      <maxWeakCount>5</maxWeakCount>
-      <stageThreshold>-0.5549971461296082</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 42 -17409 -20481 -268457797 -134239493 -17473 -1 -21829
-            -21846</internalNodes>
-          <leafValues>
-            -0.3763174116611481 0.7298233509063721</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 6 -805310737 -2098262358 -269504725 682502698
-            2147483519 1740574719 -1090519233 -268472385</internalNodes>
-          <leafValues>
-            -0.5352765917778015 0.5659480094909668</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 61 -67109678 -6145 -8 -87884584 -20481 -1073762305
-            -50856216 -16849696</internalNodes>
-          <leafValues>
-            -0.5678374171257019 0.4961479902267456</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 123 -138428633 1002418167 -1359008245 -1908670465
-            -1346685918 910098423 -1359010520 -1346371657</internalNodes>
-          <leafValues>
-            -0.5706262588500977 0.4572288393974304</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 9 -89138513 -4196353 1256531674 -1330665426 1216308261
-            -36190633 33498198 -151796633</internalNodes>
-          <leafValues>
-            -0.5344601869583130 0.4672054052352905</leafValues></_></weakClassifiers></_>
-    <!-- stage 6 -->
-    <_>
-      <maxWeakCount>5</maxWeakCount>
-      <stageThreshold>-0.8776460289955139</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 105 1073769576 206601725 -34013449 -33554433 -789514004
-            -101384321 -690225153 -264193</internalNodes>
-          <leafValues>
-            -0.7700348496437073 0.5943940877914429</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 30 -1432340997 -823623681 -49153 -34291724 -269484035
-            -1342767105 -1078198273 -1277955</internalNodes>
-          <leafValues>
-            -0.5043668746948242 0.6151274442672730</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 35 -1067385040 -195758209 -436748425 -134217731
-            -50855988 -129 -1 -1</internalNodes>
-          <leafValues>
-            -0.6808040738105774 0.4667325913906097</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 119 832534325 -34111555 -26050561 -423659521 -268468364
-            2105014143 -2114244 -17367185</internalNodes>
-          <leafValues>
-            -0.4927591383457184 0.5401885509490967</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 82 -1089439888 -1080524865 2143059967 -1114121
-            -1140949004 -3 -2361356 -739516</internalNodes>
-          <leafValues>
-            -0.6445107460021973 0.4227822124958038</leafValues></_></weakClassifiers></_>
-    <!-- stage 7 -->
-    <_>
-      <maxWeakCount>6</maxWeakCount>
-      <stageThreshold>-1.1139287948608398</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 52 -1074071553 -1074003969 -1 -1280135430 -5324817 -1
-            -335548482 582134442</internalNodes>
-          <leafValues>
-            -0.5307556986808777 0.6258179545402527</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 99 -706937396 -705364068 -540016724 -570495027
-            -570630659 -587857963 -33628164 -35848193</internalNodes>
-          <leafValues>
-            -0.5227634310722351 0.5049746036529541</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 18 -2035630093 42119158 -268503053 -1671444 261017599
-            1325432815 1954394111 -805306449</internalNodes>
-          <leafValues>
-            -0.4983572661876679 0.5106441378593445</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 111 -282529488 -1558073088 1426018736 -170526448
-            -546832487 -5113037 -34243375 -570427929</internalNodes>
-          <leafValues>
-            -0.4990860521793366 0.5060507059097290</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 92 1016332500 -606301707 915094269 -1080086049
-            -1837027144 -1361600280 2147318747 1067975613</internalNodes>
-          <leafValues>
-            -0.5695009231567383 0.4460467398166657</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 51 -656420166 -15413034 -141599534 -603435836
-            1505950458 -787556946 -79823438 -1326199134</internalNodes>
-          <leafValues>
-            -0.6590405106544495 0.3616424500942230</leafValues></_></weakClassifiers></_>
-    <!-- stage 8 -->
-    <_>
-      <maxWeakCount>7</maxWeakCount>
-      <stageThreshold>-0.8243625760078430</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 28 -901591776 -201916417 -262 -67371009 -143312112
-            -524289 -41943178 -1</internalNodes>
-          <leafValues>
-            -0.4972776770591736 0.6027074456214905</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 112 -4507851 -411340929 -268437513 -67502145 -17350859
-            -32901 -71344315 -29377</internalNodes>
-          <leafValues>
-            -0.4383158981800079 0.5966237187385559</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 69 -75894785 -117379438 -239063587 -12538500 1485072126
-            2076233213 2123118847 801906927</internalNodes>
-          <leafValues>
-            -0.6386105418205261 0.3977999985218048</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 19 -823480413 786628589 -16876049 -1364262914 242165211
-            1315930109 -696268833 -455082829</internalNodes>
-          <leafValues>
-            -0.5512794256210327 0.4282079637050629</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 73 -521411968 6746762 -1396236286 -2038436114
-            -185612509 57669627 -143132877 -1041235973</internalNodes>
-          <leafValues>
-            -0.6418755054473877 0.3549866080284119</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 126 -478153869 1076028979 -1645895615 1365298272
-            -557859073 -339771473 1442574528 -1058802061</internalNodes>
-          <leafValues>
-            -0.4841901361942291 0.4668019413948059</leafValues></_>
-        <!-- tree 6 -->
-        <_>
-          <internalNodes>
-            0 -1 45 -246350404 -1650402048 -1610612745 -788400696
-            1467604861 -2787397 1476263935 -4481349</internalNodes>
-          <leafValues>
-            -0.5855734348297119 0.3879135847091675</leafValues></_></weakClassifiers></_>
-    <!-- stage 9 -->
-    <_>
-      <maxWeakCount>7</maxWeakCount>
-      <stageThreshold>-1.2237116098403931</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 114 -24819 1572863935 -16809993 -67108865 2146778388
-            1433927541 -268608444 -34865205</internalNodes>
-          <leafValues>
-            -0.2518476545810700 0.7088654041290283</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 97 -1841359 -134271049 -32769 -5767369 -1116675 -2185
-            -8231 -33603327</internalNodes>
-          <leafValues>
-            -0.4303432404994965 0.5283288359642029</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 25 -1359507589 -1360593090 -1073778729 -269553812
-            -809512977 1744707583 -41959433 -134758978</internalNodes>
-          <leafValues>
-            -0.4259553551673889 0.5440809130668640</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 34 729753407 -134270989 -1140907329 -235200777
-            658456383 2147467263 -1140900929 -16385</internalNodes>
-          <leafValues>
-            -0.5605589151382446 0.4220733344554901</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 134 -310380553 -420675595 -193005472 -353568129
-            1205338070 -990380036 887604324 -420544526</internalNodes>
-          <leafValues>
-            -0.5192656517028809 0.4399855434894562</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 16 -1427119361 1978920959 -287119734 -487068946
-            114759245 -540578051 -707510259 -671660453</internalNodes>
-          <leafValues>
-            -0.5013077259063721 0.4570254683494568</leafValues></_>
-        <!-- tree 6 -->
-        <_>
-          <internalNodes>
-            0 -1 74 -738463762 -889949281 -328301948 -121832450
-            -1142658284 -1863576559 2146417353 -263185</internalNodes>
-          <leafValues>
-            -0.4631414115428925 0.4790246188640595</leafValues></_></weakClassifiers></_>
-    <!-- stage 10 -->
-    <_>
-      <maxWeakCount>7</maxWeakCount>
-      <stageThreshold>-0.5544230937957764</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 113 -76228780 -65538 -1 -67174401 -148007 -33 -221796
-            -272842924</internalNodes>
-          <leafValues>
-            -0.3949716091156006 0.6082032322883606</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 110 369147696 -1625232112 2138570036 -1189900 790708019
-            -1212613127 799948719 -4456483</internalNodes>
-          <leafValues>
-            -0.4855885505676270 0.4785369932651520</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 37 784215839 -290015241 536832799 -402984963
-            -1342414991 -838864897 -176769 -268456129</internalNodes>
-          <leafValues>
-            -0.4620285332202911 0.4989669024944305</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 41 -486418688 -171915327 -340294900 -21938 -519766032
-            -772751172 -73096060 -585322623</internalNodes>
-          <leafValues>
-            -0.6420643329620361 0.3624351918697357</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 117 -33554953 -475332625 -1423463824 -2077230421
-            -4849669 -2080505925 -219032928 -1071915349</internalNodes>
-          <leafValues>
-            -0.4820112884044647 0.4632140696048737</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 65 -834130468 -134217476 -1349314083 -1073803559
-            -619913764 -1449131844 -1386890321 -1979118423</internalNodes>
-          <leafValues>
-            -0.4465552568435669 0.5061788558959961</leafValues></_>
-        <!-- tree 6 -->
-        <_>
-          <internalNodes>
-            0 -1 56 -285249779 1912569855 -16530 -1731022870 -1161904146
-            -1342177297 -268439634 -1464078708</internalNodes>
-          <leafValues>
-            -0.5190586447715759 0.4441480338573456</leafValues></_></weakClassifiers></_>
-    <!-- stage 11 -->
-    <_>
-      <maxWeakCount>7</maxWeakCount>
-      <stageThreshold>-0.7161560654640198</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 20 1246232575 1078001186 -10027057 60102 -277348353
-            -43646987 -1210581153 1195769615</internalNodes>
-          <leafValues>
-            -0.4323809444904327 0.5663768053054810</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 15 -778583572 -612921106 -578775890 -4036478
-            -1946580497 -1164766570 -1986687009 -12103599</internalNodes>
-          <leafValues>
-            -0.4588732719421387 0.4547033011913300</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 129 -1073759445 2013231743 -1363169553 -1082459201
-            -1414286549 868185983 -1356133589 -1077936257</internalNodes>
-          <leafValues>
-            -0.5218553543090820 0.4111092388629913</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 102 -84148365 -2093417722 -1204850272 564290299
-            -67121221 -1342177350 -1309195902 -776734797</internalNodes>
-          <leafValues>
-            -0.4920000731945038 0.4326725304126740</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 88 -25694458 67104495 -290216278 -168563037 2083877442
-            1702788383 -144191964 -234882162</internalNodes>
-          <leafValues>
-            -0.4494568109512329 0.4448510706424713</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 59 -857980836 904682741 -1612267521 232279415
-            1550862252 -574825221 -357380888 -4579409</internalNodes>
-          <leafValues>
-            -0.5180826783180237 0.3888972699642181</leafValues></_>
-        <!-- tree 6 -->
-        <_>
-          <internalNodes>
-            0 -1 27 -98549440 -137838400 494928389 -246013630 939541351
-            -1196072350 -620603549 2137216273</internalNodes>
-          <leafValues>
-            -0.6081240773200989 0.3333222270011902</leafValues></_></weakClassifiers></_>
-    <!-- stage 12 -->
-    <_>
-      <maxWeakCount>8</maxWeakCount>
-      <stageThreshold>-0.6743940711021423</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 29 -150995201 2071191945 -1302151626 536934335
-            -1059008937 914128709 1147328110 -268369925</internalNodes>
-          <leafValues>
-            -0.1790193915367127 0.6605972051620483</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 128 -134509479 1610575703 -1342177289 1861484541
-            -1107833788 1577058173 -333558568 -136319041</internalNodes>
-          <leafValues>
-            -0.3681024610996246 0.5139749646186829</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 70 -1 1060154476 -1090984524 -630918524 -539492875
-            779616255 -839568424 -321</internalNodes>
-          <leafValues>
-            -0.3217232525348663 0.6171553134918213</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 4 -269562385 -285029906 -791084350 -17923776 235286671
-            1275504943 1344390399 -966276889</internalNodes>
-          <leafValues>
-            -0.4373284578323364 0.4358185231685638</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 76 17825984 -747628419 595427229 1474759671 575672208
-            -1684005538 872217086 -1155858277</internalNodes>
-          <leafValues>
-            -0.4404836893081665 0.4601220190525055</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 124 -336593039 1873735591 -822231622 -355795238
-            -470820869 -1997537409 -1057132384 -1015285005</internalNodes>
-          <leafValues>
-            -0.4294152259826660 0.4452161788940430</leafValues></_>
-        <!-- tree 6 -->
-        <_>
-          <internalNodes>
-            0 -1 54 -834212130 -593694721 -322142257 -364892500
-            -951029539 -302125121 -1615106053 -79249765</internalNodes>
-          <leafValues>
-            -0.3973052501678467 0.4854526817798615</leafValues></_>
-        <!-- tree 7 -->
-        <_>
-          <internalNodes>
-            0 -1 95 1342144479 2147431935 -33554561 -47873 -855685912 -1
-            1988052447 536827383</internalNodes>
-          <leafValues>
-            -0.7054683566093445 0.2697997391223908</leafValues></_></weakClassifiers></_>
-    <!-- stage 13 -->
-    <_>
-      <maxWeakCount>9</maxWeakCount>
-      <stageThreshold>-1.2042298316955566</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 39 1431368960 -183437936 -537002499 -137497097
-            1560590321 -84611081 -2097193 -513</internalNodes>
-          <leafValues>
-            -0.5905947685241699 0.5101932883262634</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 120 -1645259691 2105491231 2130706431 1458995007
-            -8567536 -42483883 -33780003 -21004417</internalNodes>
-          <leafValues>
-            -0.4449204802513123 0.4490709304809570</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 89 -612381022 -505806938 -362027516 -452985106
-            275854917 1920431639 -12600561 -134221825</internalNodes>
-          <leafValues>
-            -0.4693818688392639 0.4061094820499420</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 14 -805573153 -161 -554172679 -530519488 -16779441
-            2000682871 -33604275 -150997129</internalNodes>
-          <leafValues>
-            -0.3600351214408875 0.5056326985359192</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 67 6192 435166195 1467449341 2046691505 -1608493775
-            -4755729 -1083162625 -71365637</internalNodes>
-          <leafValues>
-            -0.4459891915321350 0.4132415652275085</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 86 -41689215 -3281034 1853357967 -420712635 -415924289
-            -270209208 -1088293113 -825311232</internalNodes>
-          <leafValues>
-            -0.4466069042682648 0.4135067760944367</leafValues></_>
-        <!-- tree 6 -->
-        <_>
-          <internalNodes>
-            0 -1 80 -117391116 -42203396 2080374461 -188709 -542008165
-            -356831940 -1091125345 -1073796897</internalNodes>
-          <leafValues>
-            -0.3394956290721893 0.5658645033836365</leafValues></_>
-        <!-- tree 7 -->
-        <_>
-          <internalNodes>
-            0 -1 75 -276830049 1378714472 -1342181951 757272098
-            1073740607 -282199241 -415761549 170896931</internalNodes>
-          <leafValues>
-            -0.5346512198448181 0.3584479391574860</leafValues></_>
-        <!-- tree 8 -->
-        <_>
-          <internalNodes>
-            0 -1 55 -796075825 -123166849 2113667055 -217530421
-            -1107432194 -16385 -806359809 -391188771</internalNodes>
-          <leafValues>
-            -0.4379335641860962 0.4123645126819611</leafValues></_></weakClassifiers></_>
-    <!-- stage 14 -->
-    <_>
-      <maxWeakCount>10</maxWeakCount>
-      <stageThreshold>-0.8402050137519836</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 71 -890246622 15525883 -487690486 47116238 -1212319899
-            -1291847681 -68159890 -469829921</internalNodes>
-          <leafValues>
-            -0.2670986354351044 0.6014143228530884</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 31 -1361180685 -1898008841 -1090588811 -285410071
-            -1074016265 -840443905 2147221487 -262145</internalNodes>
-          <leafValues>
-            -0.4149844348430634 0.4670888185501099</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 40 1426190596 1899364271 2142731795 -142607505
-            -508232452 -21563393 -41960001 -65</internalNodes>
-          <leafValues>
-            -0.4985891580581665 0.3719584941864014</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 109 -201337965 10543906 -236498096 -746195597
-            1974565825 -15204415 921907633 -190058309</internalNodes>
-          <leafValues>
-            -0.4568729996681213 0.3965812027454376</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 130 -595026732 -656401928 -268649235 -571490699
-            -440600392 -133131 -358810952 -2004088646</internalNodes>
-          <leafValues>
-            -0.4770836830139160 0.3862601518630981</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 66 941674740 -1107882114 1332789109 -67691015
-            -1360463693 -1556612430 -609108546 733546933</internalNodes>
-          <leafValues>
-            -0.4877715110778809 0.3778986334800720</leafValues></_>
-        <!-- tree 6 -->
-        <_>
-          <internalNodes>
-            0 -1 49 -17114945 -240061474 1552871558 -82775604 -932393844
-            -1308544889 -532635478 -99042357</internalNodes>
-          <leafValues>
-            -0.3721654713153839 0.4994400143623352</leafValues></_>
-        <!-- tree 7 -->
-        <_>
-          <internalNodes>
-            0 -1 133 -655906006 1405502603 -939205164 1884929228
-            -498859222 559417357 -1928559445 -286264385</internalNodes>
-          <leafValues>
-            -0.3934195041656494 0.4769641458988190</leafValues></_>
-        <!-- tree 8 -->
-        <_>
-          <internalNodes>
-            0 -1 0 -335837777 1860677295 -90 -1946186226 931096183
-            251612987 2013265917 -671232197</internalNodes>
-          <leafValues>
-            -0.4323300719261169 0.4342164099216461</leafValues></_>
-        <!-- tree 9 -->
-        <_>
-          <internalNodes>
-            0 -1 103 37769424 -137772680 374692301 2002666345 -536176194
-            -1644484728 807009019 1069089930</internalNodes>
-          <leafValues>
-            -0.4993278682231903 0.3665378093719482</leafValues></_></weakClassifiers></_>
-    <!-- stage 15 -->
-    <_>
-      <maxWeakCount>9</maxWeakCount>
-      <stageThreshold>-1.1974394321441650</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 43 -5505 2147462911 2143265466 -4511070 -16450 -257
-            -201348440 -71333206</internalNodes>
-          <leafValues>
-            -0.3310225307941437 0.5624626278877258</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 90 -136842268 -499330741 2015250980 -87107126
-            -641665744 -788524639 -1147864792 -134892563</internalNodes>
-          <leafValues>
-            -0.5266560912132263 0.3704403042793274</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 104 -146800880 -1780368555 2111170033 -140904684
-            -16777551 -1946681885 -1646463595 -839131947</internalNodes>
-          <leafValues>
-            -0.4171888828277588 0.4540435671806335</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 85 -832054034 -981663763 -301990281 -578814081
-            -932319000 -1997406723 -33555201 -69206017</internalNodes>
-          <leafValues>
-            -0.4556705355644226 0.3704262077808380</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 24 -118492417 -1209026825 1119023838 -1334313353
-            1112948738 -297319313 1378887291 -139469193</internalNodes>
-          <leafValues>
-            -0.4182529747486115 0.4267231225967407</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 78 -1714382628 -2353704 -112094959 -549613092
-            -1567058760 -1718550464 -342315012 -1074972227</internalNodes>
-          <leafValues>
-            -0.3625369668006897 0.4684656262397766</leafValues></_>
-        <!-- tree 6 -->
-        <_>
-          <internalNodes>
-            0 -1 5 -85219702 316836394 -33279 1904970288 2117267315
-            -260901769 -621461759 -88607770</internalNodes>
-          <leafValues>
-            -0.4742925167083740 0.3689507246017456</leafValues></_>
-        <!-- tree 7 -->
-        <_>
-          <internalNodes>
-            0 -1 11 -294654041 -353603585 -1641159686 -50331921
-            -2080899877 1145569279 -143132713 -152044037</internalNodes>
-          <leafValues>
-            -0.3666271567344666 0.4580127298831940</leafValues></_>
-        <!-- tree 8 -->
-        <_>
-          <internalNodes>
-            0 -1 32 1887453658 -638545712 -1877976819 -34320972
-            -1071067983 -661345416 -583338277 1060190561</internalNodes>
-          <leafValues>
-            -0.4567637443542481 0.3894708156585693</leafValues></_></weakClassifiers></_>
-    <!-- stage 16 -->
-    <_>
-      <maxWeakCount>9</maxWeakCount>
-      <stageThreshold>-0.5733128190040588</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 122 -994063296 1088745462 -318837116 -319881377
-            1102566613 1165490103 -121679694 -134744129</internalNodes>
-          <leafValues>
-            -0.4055117964744568 0.5487945079803467</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 68 -285233233 -538992907 1811935199 -369234005 -529
-            -20593 -20505 -1561401854</internalNodes>
-          <leafValues>
-            -0.3787897229194641 0.4532003402709961</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 58 -1335245632 1968917183 1940861695 536816369
-            -1226071367 -570908176 457026619 1000020667</internalNodes>
-          <leafValues>
-            -0.4258328974246979 0.4202791750431061</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 94 -1360318719 -1979797897 -50435249 -18646473
-            -608879292 -805306691 -269304244 -17840167</internalNodes>
-          <leafValues>
-            -0.4561023116111755 0.4002747833728790</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 87 2062765935 -16449 -1275080721 -16406 45764335
-            -1090552065 -772846337 -570464322</internalNodes>
-          <leafValues>
-            -0.4314672648906708 0.4086346626281738</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 127 -536896021 1080817663 -738234288 -965478709
-            -2082767969 1290855887 1993822934 -990381609</internalNodes>
-          <leafValues>
-            -0.4174543321132660 0.4249868988990784</leafValues></_>
-        <!-- tree 6 -->
-        <_>
-          <internalNodes>
-            0 -1 3 -818943025 168730891 -293610428 -79249354 669224671
-            621166734 1086506807 1473768907</internalNodes>
-          <leafValues>
-            -0.4321364760398865 0.4090838730335236</leafValues></_>
-        <!-- tree 7 -->
-        <_>
-          <internalNodes>
-            0 -1 79 -68895696 -67107736 -1414315879 -841676168
-            -619843344 -1180610531 -1081990469 1043203389</internalNodes>
-          <leafValues>
-            -0.5018386244773865 0.3702533841133118</leafValues></_>
-        <!-- tree 8 -->
-        <_>
-          <internalNodes>
-            0 -1 116 -54002134 -543485719 -2124882422 -1437445858
-            -115617074 -1195787391 -1096024366 -2140472445</internalNodes>
-          <leafValues>
-            -0.5037505626678467 0.3564981222152710</leafValues></_></weakClassifiers></_>
-    <!-- stage 17 -->
-    <_>
-      <maxWeakCount>9</maxWeakCount>
-      <stageThreshold>-0.4892596900463104</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 132 -67113211 2003808111 1862135111 846461923 -2752
-            2002237273 -273154752 1937223539</internalNodes>
-          <leafValues>
-            -0.2448196411132813 0.5689709186553955</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 62 1179423888 -78064940 -611839555 -539167899
-            -1289358360 -1650810108 -892540499 -1432827684</internalNodes>
-          <leafValues>
-            -0.4633283913135529 0.3587929606437683</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 23 -285212705 -78450761 -656212031 -264050110 -27787425
-            -1334349961 -547662981 -135796924</internalNodes>
-          <leafValues>
-            -0.3731099069118500 0.4290455579757690</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 77 341863476 403702016 -550588417 1600194541
-            -1080690735 951127993 -1388580949 -1153717473</internalNodes>
-          <leafValues>
-            -0.3658909499645233 0.4556473195552826</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 22 -586880702 -204831512 -100644596 -39319550
-            -1191150794 705692513 457203315 -75806957</internalNodes>
-          <leafValues>
-            -0.5214384198188782 0.3221037387847900</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 72 -416546870 545911370 -673716192 -775559454
-            -264113598 139424 -183369982 -204474641</internalNodes>
-          <leafValues>
-            -0.4289036989212036 0.4004956185817719</leafValues></_>
-        <!-- tree 6 -->
-        <_>
-          <internalNodes>
-            0 -1 50 -1026505020 -589692154 -1740499937 -1563770497
-            1348491006 -60710713 -1109853489 -633909413</internalNodes>
-          <leafValues>
-            -0.4621542394161224 0.3832748532295227</leafValues></_>
-        <!-- tree 7 -->
-        <_>
-          <internalNodes>
-            0 -1 108 -1448872304 -477895040 -1778390608 -772418127
-            -1789923416 -1612057181 -805306693 -1415842113</internalNodes>
-          <leafValues>
-            -0.3711548447608948 0.4612701535224915</leafValues></_>
-        <!-- tree 8 -->
-        <_>
-          <internalNodes>
-            0 -1 92 407905424 -582449988 52654751 -1294472 -285103725
-            -74633006 1871559083 1057955850</internalNodes>
-          <leafValues>
-            -0.5180652141571045 0.3205870389938355</leafValues></_></weakClassifiers></_>
-    <!-- stage 18 -->
-    <_>
-      <maxWeakCount>10</maxWeakCount>
-      <stageThreshold>-0.5911940932273865</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 81 4112 -1259563825 -846671428 -100902460 1838164148
-            -74153752 -90653988 -1074263896</internalNodes>
-          <leafValues>
-            -0.2592592537403107 0.5873016119003296</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 1 -285216785 -823206977 -1085589 -1081346 1207959293
-            1157103471 2097133565 -2097169</internalNodes>
-          <leafValues>
-            -0.3801195919513702 0.4718827307224274</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 121 -12465 -536875169 2147478367 2130706303 -37765492
-            -866124467 -318782328 -1392509185</internalNodes>
-          <leafValues>
-            -0.3509117066860199 0.5094807147979736</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 38 2147449663 -20741 -16794757 1945873146 -16710 -1
-            -8406341 -67663041</internalNodes>
-          <leafValues>
-            -0.4068757295608521 0.4130136370658875</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 17 -155191713 866117231 1651407483 548272812 -479201468
-            -447742449 1354229504 -261884429</internalNodes>
-          <leafValues>
-            -0.4557141065597534 0.3539792001247406</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 100 -225319378 -251682065 -492783986 -792341777
-            -1287261695 1393643841 -11274182 -213909521</internalNodes>
-          <leafValues>
-            -0.4117803275585175 0.4118592441082001</leafValues></_>
-        <!-- tree 6 -->
-        <_>
-          <internalNodes>
-            0 -1 63 -382220122 -2002072729 -51404800 -371201558
-            -923011069 -2135301457 -2066104743 -1042557441</internalNodes>
-          <leafValues>
-            -0.4008397758007050 0.4034757018089294</leafValues></_>
-        <!-- tree 7 -->
-        <_>
-          <internalNodes>
-            0 -1 101 -627353764 -48295149 1581203952 -436258614
-            -105268268 -1435893445 -638126888 -1061107126</internalNodes>
-          <leafValues>
-            -0.5694189667701721 0.2964762747287750</leafValues></_>
-        <!-- tree 8 -->
-        <_>
-          <internalNodes>
-            0 -1 118 -8399181 1058107691 -621022752 -251003468 -12582915
-            -574619739 -994397789 -1648362021</internalNodes>
-          <leafValues>
-            -0.3195341229438782 0.5294018983840942</leafValues></_>
-        <!-- tree 9 -->
-        <_>
-          <internalNodes>
-            0 -1 92 -348343812 -1078389516 1717960437 364735981
-            -1783841602 -4883137 -457572354 -1076950384</internalNodes>
-          <leafValues>
-            -0.3365339040756226 0.5067458748817444</leafValues></_></weakClassifiers></_>
-    <!-- stage 19 -->
-    <_>
-      <maxWeakCount>10</maxWeakCount>
-      <stageThreshold>-0.7612916231155396</stageThreshold>
-      <weakClassifiers>
-        <!-- tree 0 -->
-        <_>
-          <internalNodes>
-            0 -1 10 -1976661318 -287957604 -1659497122 -782068 43591089
-            -453637880 1435470000 -1077438561</internalNodes>
-          <leafValues>
-            -0.4204545319080353 0.5165745615959168</leafValues></_>
-        <!-- tree 1 -->
-        <_>
-          <internalNodes>
-            0 -1 131 -67110925 14874979 -142633168 -1338923040
-            2046713291 -2067933195 1473503712 -789579837</internalNodes>
-          <leafValues>
-            -0.3762553930282593 0.4075302779674530</leafValues></_>
-        <!-- tree 2 -->
-        <_>
-          <internalNodes>
-            0 -1 83 -272814301 -1577073 -1118685 -305156120 -1052289
-            -1073813756 -538971154 -355523038</internalNodes>
-          <leafValues>
-            -0.4253497421741486 0.3728055357933044</leafValues></_>
-        <!-- tree 3 -->
-        <_>
-          <internalNodes>
-            0 -1 135 -2233 -214486242 -538514758 573747007 -159390971
-            1994225489 -973738098 -203424005</internalNodes>
-          <leafValues>
-            -0.3601998090744019 0.4563256204128265</leafValues></_>
-        <!-- tree 4 -->
-        <_>
-          <internalNodes>
-            0 -1 115 -261031688 -1330369299 -641860609 1029570301
-            -1306461192 -1196149518 -1529767778 683139823</internalNodes>
-          <leafValues>
-            -0.4034293889999390 0.4160816967487335</leafValues></_>
-        <!-- tree 5 -->
-        <_>
-          <internalNodes>
-            0 -1 64 -572993608 -34042628 -417865 -111109 -1433365268
-            -19869715 -1920939864 -1279457063</internalNodes>
-          <leafValues>
-            -0.3620899617671967 0.4594142735004425</leafValues></_>
-        <!-- tree 6 -->
-        <_>
-          <internalNodes>
-            0 -1 36 -626275097 -615256993 1651946018 805366393
-            2016559730 -430780849 -799868165 -16580645</internalNodes>
-          <leafValues>
-            -0.3903816640377045 0.4381459355354309</leafValues></_>
-        <!-- tree 7 -->
-        <_>
-          <internalNodes>
-            0 -1 93 1354797300 -1090957603 1976418270 -1342502178
-            -1851873892 -1194637077 -1153521668 -1108399474</internalNodes>
-          <leafValues>
-            -0.3591445386409760 0.4624078869819641</leafValues></_>
-        <!-- tree 8 -->
-        <_>
-          <internalNodes>
-            0 -1 91 68157712 1211368313 -304759523 1063017136 798797750
-            -275513546 648167355 -1145357350</internalNodes>
-          <leafValues>
-            -0.4297670423984528 0.4023293554782867</leafValues></_>
-        <!-- tree 9 -->
-        <_>
-          <internalNodes>
-            0 -1 107 -546318240 -1628569602 -163577944 -537002306
-            -545456389 -1325465645 -380446736 -1058473386</internalNodes>
-          <leafValues>
-            -0.5727006793022156 0.2995934784412384</leafValues></_></weakClassifiers></_></stages>
-  <features>
-    <_>
-      <rect>
-        0 0 3 5</rect></_>
-    <_>
-      <rect>
-        0 0 4 2</rect></_>
-    <_>
-      <rect>
-        0 0 6 3</rect></_>
-    <_>
-      <rect>
-        0 1 2 3</rect></_>
-    <_>
-      <rect>
-        0 1 3 3</rect></_>
-    <_>
-      <rect>
-        0 1 3 7</rect></_>
-    <_>
-      <rect>
-        0 4 3 3</rect></_>
-    <_>
-      <rect>
-        0 11 3 4</rect></_>
-    <_>
-      <rect>
-        0 12 8 4</rect></_>
-    <_>
-      <rect>
-        0 14 4 3</rect></_>
-    <_>
-      <rect>
-        1 0 5 3</rect></_>
-    <_>
-      <rect>
-        1 1 2 2</rect></_>
-    <_>
-      <rect>
-        1 3 3 1</rect></_>
-    <_>
-      <rect>
-        1 7 4 4</rect></_>
-    <_>
-      <rect>
-        1 12 2 2</rect></_>
-    <_>
-      <rect>
-        1 13 4 1</rect></_>
-    <_>
-      <rect>
-        1 14 4 3</rect></_>
-    <_>
-      <rect>
-        1 17 3 2</rect></_>
-    <_>
-      <rect>
-        2 0 2 3</rect></_>
-    <_>
-      <rect>
-        2 1 2 2</rect></_>
-    <_>
-      <rect>
-        2 2 4 6</rect></_>
-    <_>
-      <rect>
-        2 3 4 4</rect></_>
-    <_>
-      <rect>
-        2 7 2 1</rect></_>
-    <_>
-      <rect>
-        2 11 2 3</rect></_>
-    <_>
-      <rect>
-        2 17 3 2</rect></_>
-    <_>
-      <rect>
-        3 0 2 2</rect></_>
-    <_>
-      <rect>
-        3 1 7 3</rect></_>
-    <_>
-      <rect>
-        3 7 2 1</rect></_>
-    <_>
-      <rect>
-        3 7 2 4</rect></_>
-    <_>
-      <rect>
-        3 18 2 2</rect></_>
-    <_>
-      <rect>
-        4 0 2 3</rect></_>
-    <_>
-      <rect>
-        4 3 2 1</rect></_>
-    <_>
-      <rect>
-        4 6 2 1</rect></_>
-    <_>
-      <rect>
-        4 6 2 5</rect></_>
-    <_>
-      <rect>
-        4 7 5 2</rect></_>
-    <_>
-      <rect>
-        4 8 4 3</rect></_>
-    <_>
-      <rect>
-        4 18 2 2</rect></_>
-    <_>
-      <rect>
-        5 0 2 2</rect></_>
-    <_>
-      <rect>
-        5 3 4 4</rect></_>
-    <_>
-      <rect>
-        5 6 2 5</rect></_>
-    <_>
-      <rect>
-        5 9 2 2</rect></_>
-    <_>
-      <rect>
-        5 10 2 2</rect></_>
-    <_>
-      <rect>
-        6 3 4 4</rect></_>
-    <_>
-      <rect>
-        6 4 4 3</rect></_>
-    <_>
-      <rect>
-        6 5 2 3</rect></_>
-    <_>
-      <rect>
-        6 5 2 5</rect></_>
-    <_>
-      <rect>
-        6 5 4 3</rect></_>
-    <_>
-      <rect>
-        6 6 4 2</rect></_>
-    <_>
-      <rect>
-        6 6 4 4</rect></_>
-    <_>
-      <rect>
-        6 18 1 2</rect></_>
-    <_>
-      <rect>
-        6 21 2 1</rect></_>
-    <_>
-      <rect>
-        7 0 3 7</rect></_>
-    <_>
-      <rect>
-        7 4 2 3</rect></_>
-    <_>
-      <rect>
-        7 9 5 1</rect></_>
-    <_>
-      <rect>
-        7 21 2 1</rect></_>
-    <_>
-      <rect>
-        8 0 1 4</rect></_>
-    <_>
-      <rect>
-        8 5 2 2</rect></_>
-    <_>
-      <rect>
-        8 5 3 2</rect></_>
-    <_>
-      <rect>
-        8 17 3 1</rect></_>
-    <_>
-      <rect>
-        8 18 1 2</rect></_>
-    <_>
-      <rect>
-        9 0 5 3</rect></_>
-    <_>
-      <rect>
-        9 2 2 6</rect></_>
-    <_>
-      <rect>
-        9 5 1 1</rect></_>
-    <_>
-      <rect>
-        9 11 1 1</rect></_>
-    <_>
-      <rect>
-        9 16 1 1</rect></_>
-    <_>
-      <rect>
-        9 16 2 1</rect></_>
-    <_>
-      <rect>
-        9 17 1 1</rect></_>
-    <_>
-      <rect>
-        9 18 1 1</rect></_>
-    <_>
-      <rect>
-        10 5 1 2</rect></_>
-    <_>
-      <rect>
-        10 5 3 3</rect></_>
-    <_>
-      <rect>
-        10 7 1 5</rect></_>
-    <_>
-      <rect>
-        10 8 1 1</rect></_>
-    <_>
-      <rect>
-        10 9 1 1</rect></_>
-    <_>
-      <rect>
-        10 10 1 1</rect></_>
-    <_>
-      <rect>
-        10 10 1 2</rect></_>
-    <_>
-      <rect>
-        10 14 3 3</rect></_>
-    <_>
-      <rect>
-        10 15 1 1</rect></_>
-    <_>
-      <rect>
-        10 15 2 1</rect></_>
-    <_>
-      <rect>
-        10 16 1 1</rect></_>
-    <_>
-      <rect>
-        10 16 2 1</rect></_>
-    <_>
-      <rect>
-        10 17 1 1</rect></_>
-    <_>
-      <rect>
-        10 21 1 1</rect></_>
-    <_>
-      <rect>
-        11 3 2 2</rect></_>
-    <_>
-      <rect>
-        11 5 1 2</rect></_>
-    <_>
-      <rect>
-        11 5 3 3</rect></_>
-    <_>
-      <rect>
-        11 5 4 6</rect></_>
-    <_>
-      <rect>
-        11 6 1 1</rect></_>
-    <_>
-      <rect>
-        11 7 2 2</rect></_>
-    <_>
-      <rect>
-        11 8 1 2</rect></_>
-    <_>
-      <rect>
-        11 10 1 1</rect></_>
-    <_>
-      <rect>
-        11 10 1 2</rect></_>
-    <_>
-      <rect>
-        11 15 1 1</rect></_>
-    <_>
-      <rect>
-        11 17 1 1</rect></_>
-    <_>
-      <rect>
-        11 18 1 1</rect></_>
-    <_>
-      <rect>
-        12 0 2 2</rect></_>
-    <_>
-      <rect>
-        12 1 2 5</rect></_>
-    <_>
-      <rect>
-        12 2 4 1</rect></_>
-    <_>
-      <rect>
-        12 3 1 3</rect></_>
-    <_>
-      <rect>
-        12 7 3 4</rect></_>
-    <_>
-      <rect>
-        12 10 3 2</rect></_>
-    <_>
-      <rect>
-        12 11 1 1</rect></_>
-    <_>
-      <rect>
-        12 12 3 2</rect></_>
-    <_>
-      <rect>
-        12 14 4 3</rect></_>
-    <_>
-      <rect>
-        12 17 1 1</rect></_>
-    <_>
-      <rect>
-        12 21 2 1</rect></_>
-    <_>
-      <rect>
-        13 6 2 5</rect></_>
-    <_>
-      <rect>
-        13 7 3 5</rect></_>
-    <_>
-      <rect>
-        13 11 3 2</rect></_>
-    <_>
-      <rect>
-        13 17 2 2</rect></_>
-    <_>
-      <rect>
-        13 17 3 2</rect></_>
-    <_>
-      <rect>
-        13 18 1 2</rect></_>
-    <_>
-      <rect>
-        13 18 2 2</rect></_>
-    <_>
-      <rect>
-        14 0 2 2</rect></_>
-    <_>
-      <rect>
-        14 1 1 3</rect></_>
-    <_>
-      <rect>
-        14 2 3 2</rect></_>
-    <_>
-      <rect>
-        14 7 2 1</rect></_>
-    <_>
-      <rect>
-        14 13 2 1</rect></_>
-    <_>
-      <rect>
-        14 13 3 3</rect></_>
-    <_>
-      <rect>
-        14 17 2 2</rect></_>
-    <_>
-      <rect>
-        15 0 2 2</rect></_>
-    <_>
-      <rect>
-        15 0 2 3</rect></_>
-    <_>
-      <rect>
-        15 4 3 2</rect></_>
-    <_>
-      <rect>
-        15 4 3 6</rect></_>
-    <_>
-      <rect>
-        15 6 3 2</rect></_>
-    <_>
-      <rect>
-        15 11 3 4</rect></_>
-    <_>
-      <rect>
-        15 13 3 2</rect></_>
-    <_>
-      <rect>
-        15 17 2 2</rect></_>
-    <_>
-      <rect>
-        15 17 3 2</rect></_>
-    <_>
-      <rect>
-        16 1 2 3</rect></_>
-    <_>
-      <rect>
-        16 3 2 4</rect></_>
-    <_>
-      <rect>
-        16 6 1 1</rect></_>
-    <_>
-      <rect>
-        16 16 2 2</rect></_>
-    <_>
-      <rect>
-        17 1 2 2</rect></_>
-    <_>
-      <rect>
-        17 1 2 5</rect></_>
-    <_>
-      <rect>
-        17 12 2 2</rect></_>
-    <_>
-      <rect>
-        18 0 2 2</rect></_></features></cascade>
-</opencv_storage>
diff --git a/samples/android/face-detection/src/org/opencv/samples/facedetect/DetectionBasedTracker.java b/samples/android/face-detection/src/org/opencv/samples/facedetect/DetectionBasedTracker.java
deleted file mode 100644
index 6179f1bdc1a9..000000000000
--- a/samples/android/face-detection/src/org/opencv/samples/facedetect/DetectionBasedTracker.java
+++ /dev/null
@@ -1,41 +0,0 @@
-package org.opencv.samples.facedetect;
-
-import org.opencv.core.Mat;
-import org.opencv.core.MatOfRect;
-
-public class DetectionBasedTracker
-{
-    public DetectionBasedTracker(String cascadeName, int minFaceSize) {
-        mNativeObj = nativeCreateObject(cascadeName, minFaceSize);
-    }
-
-    public void start() {
-        nativeStart(mNativeObj);
-    }
-
-    public void stop() {
-        nativeStop(mNativeObj);
-    }
-
-    public void setMinFaceSize(int size) {
-        nativeSetFaceSize(mNativeObj, size);
-    }
-
-    public void detect(Mat imageGray, MatOfRect faces) {
-        nativeDetect(mNativeObj, imageGray.getNativeObjAddr(), faces.getNativeObjAddr());
-    }
-
-    public void release() {
-        nativeDestroyObject(mNativeObj);
-        mNativeObj = 0;
-    }
-
-    private long mNativeObj = 0;
-
-    private static native long nativeCreateObject(String cascadeName, int minFaceSize);
-    private static native void nativeDestroyObject(long thiz);
-    private static native void nativeStart(long thiz);
-    private static native void nativeStop(long thiz);
-    private static native void nativeSetFaceSize(long thiz, int size);
-    private static native void nativeDetect(long thiz, long inputImage, long faces);
-}
diff --git a/samples/android/face-detection/src/org/opencv/samples/facedetect/FaceDetectActivity.java b/samples/android/face-detection/src/org/opencv/samples/facedetect/FaceDetectActivity.java
new file mode 100644
index 000000000000..1ba50aec871a
--- /dev/null
+++ b/samples/android/face-detection/src/org/opencv/samples/facedetect/FaceDetectActivity.java
@@ -0,0 +1,204 @@
+package org.opencv.samples.facedetect;
+
+import java.lang.Math;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.List;
+
+import org.opencv.android.CameraActivity;
+import org.opencv.android.CameraBridgeViewBase.CvCameraViewFrame;
+import org.opencv.android.OpenCVLoader;
+import org.opencv.core.Core;
+import org.opencv.core.Mat;
+import org.opencv.core.MatOfByte;
+import org.opencv.core.Point;
+import org.opencv.core.Rect;
+import org.opencv.core.Scalar;
+import org.opencv.core.Size;
+import org.opencv.android.CameraBridgeViewBase;
+import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener2;
+import org.opencv.objdetect.FaceDetectorYN;
+import org.opencv.imgproc.Imgproc;
+
+import android.app.Activity;
+import android.content.Context;
+import android.os.Bundle;
+import android.util.Log;
+import android.view.Menu;
+import android.view.MenuItem;
+import android.view.WindowManager;
+import android.widget.Toast;
+
+public class FaceDetectActivity extends CameraActivity implements CvCameraViewListener2 {
+
+    private static final String    TAG  = "OCVSample::Activity";
+
+    private static final Scalar    BOX_COLOR         = new Scalar(0, 255, 0);
+    private static final Scalar    RIGHT_EYE_COLOR   = new Scalar(255, 0, 0);
+    private static final Scalar    LEFT_EYE_COLOR    = new Scalar(0, 0, 255);
+    private static final Scalar    NOSE_TIP_COLOR    = new Scalar(0, 255, 0);
+    private static final Scalar    MOUTH_RIGHT_COLOR = new Scalar(255, 0, 255);
+    private static final Scalar    MOUTH_LEFT_COLOR  = new Scalar(0, 255, 255);
+
+    private Mat                    mRgba;
+    private Mat                    mBgr;
+    private Mat                    mBgrScaled;
+    private Size                   mInputSize = null;
+    private float                  mScale = 2.f;
+    private MatOfByte              mModelBuffer;
+    private MatOfByte              mConfigBuffer;
+    private FaceDetectorYN         mFaceDetector;
+    private Mat                    mFaces;
+
+    private CameraBridgeViewBase   mOpenCvCameraView;
+
+    public FaceDetectActivity() {
+        Log.i(TAG, "Instantiated new " + this.getClass());
+    }
+
+    /** Called when the activity is first created. */
+    @Override
+    public void onCreate(Bundle savedInstanceState) {
+        Log.i(TAG, "called onCreate");
+        super.onCreate(savedInstanceState);
+
+        if (OpenCVLoader.initLocal()) {
+            Log.i(TAG, "OpenCV loaded successfully");
+        } else {
+            Log.e(TAG, "OpenCV initialization failed!");
+            (Toast.makeText(this, "OpenCV initialization failed!", Toast.LENGTH_LONG)).show();
+            return;
+        }
+
+        byte[] buffer;
+        try {
+            // load cascade file from application resources
+            InputStream is = getResources().openRawResource(R.raw.face_detection_yunet_2023mar);
+
+            int size = is.available();
+            buffer = new byte[size];
+            int bytesRead = is.read(buffer);
+            is.close();
+        } catch (IOException e) {
+            e.printStackTrace();
+            Log.e(TAG, "Failed to ONNX model from resources! Exception thrown: " + e);
+            (Toast.makeText(this, "Failed to ONNX model from resources!", Toast.LENGTH_LONG)).show();
+            return;
+        }
+
+        mModelBuffer = new MatOfByte(buffer);
+        mConfigBuffer = new MatOfByte();
+
+        mFaceDetector = FaceDetectorYN.create("onnx", mModelBuffer, mConfigBuffer, new Size(320, 320));
+        if (mFaceDetector == null) {
+            Log.e(TAG, "Failed to create FaceDetectorYN!");
+            (Toast.makeText(this, "Failed to create FaceDetectorYN!", Toast.LENGTH_LONG)).show();
+            return;
+        } else
+            Log.i(TAG, "FaceDetectorYN initialized successfully!");
+
+
+        getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
+
+        setContentView(R.layout.face_detect_surface_view);
+
+        mOpenCvCameraView = (CameraBridgeViewBase) findViewById(R.id.fd_activity_surface_view);
+        mOpenCvCameraView.setVisibility(CameraBridgeViewBase.VISIBLE);
+        mOpenCvCameraView.setCvCameraViewListener(this);
+    }
+
+    @Override
+    public void onPause()
+    {
+        super.onPause();
+        if (mOpenCvCameraView != null)
+            mOpenCvCameraView.disableView();
+    }
+
+    @Override
+    public void onResume()
+    {
+        super.onResume();
+        if (mOpenCvCameraView != null)
+            mOpenCvCameraView.enableView();
+    }
+
+    @Override
+    protected List<? extends CameraBridgeViewBase> getCameraViewList() {
+        return Collections.singletonList(mOpenCvCameraView);
+    }
+
+    public void onDestroy() {
+        super.onDestroy();
+        mOpenCvCameraView.disableView();
+    }
+
+    public void onCameraViewStarted(int width, int height) {
+        mRgba = new Mat();
+        mBgr = new Mat();
+        mBgrScaled = new Mat();
+        mFaces = new Mat();
+    }
+
+    public void onCameraViewStopped() {
+        mRgba.release();
+        mBgr.release();
+        mBgrScaled.release();
+        mFaces.release();
+    }
+
+    public void visualize(Mat rgba, Mat faces) {
+
+        int thickness = 2;
+        float[] faceData = new float[faces.cols() * faces.channels()];
+
+        for (int i = 0; i < faces.rows(); i++)
+        {
+            faces.get(i, 0, faceData);
+
+            Log.d(TAG, "Detected face (" + faceData[0] + ", " + faceData[1] + ", " +
+                                           faceData[2] + ", " + faceData[3] + ")");
+
+            // Draw bounding box
+            Imgproc.rectangle(rgba, new Rect(Math.round(mScale*faceData[0]), Math.round(mScale*faceData[1]),
+                                             Math.round(mScale*faceData[2]), Math.round(mScale*faceData[3])),
+                              BOX_COLOR, thickness);
+            // Draw landmarks
+            Imgproc.circle(rgba, new Point(Math.round(mScale*faceData[4]), Math.round(mScale*faceData[5])),
+                           2, RIGHT_EYE_COLOR, thickness);
+            Imgproc.circle(rgba, new Point(Math.round(mScale*faceData[6]), Math.round(mScale*faceData[7])),
+                           2, LEFT_EYE_COLOR, thickness);
+            Imgproc.circle(rgba, new Point(Math.round(mScale*faceData[8]), Math.round(mScale*faceData[9])),
+                           2, NOSE_TIP_COLOR, thickness);
+            Imgproc.circle(rgba, new Point(Math.round(mScale*faceData[10]), Math.round(mScale*faceData[11])),
+                           2, MOUTH_RIGHT_COLOR, thickness);
+            Imgproc.circle(rgba, new Point(Math.round(mScale*faceData[12]), Math.round(mScale*faceData[13])),
+                           2, MOUTH_LEFT_COLOR, thickness);
+        }
+    }
+
+    public Mat onCameraFrame(CvCameraViewFrame inputFrame) {
+
+        mRgba = inputFrame.rgba();
+
+        Size inputSize = new Size(Math.round(mRgba.cols()/mScale), Math.round(mRgba.rows()/mScale));
+        if (mInputSize == null || !mInputSize.equals(inputSize)) {
+            mInputSize = inputSize;
+            mFaceDetector.setInputSize(mInputSize);
+        }
+
+        Imgproc.cvtColor(mRgba, mBgr, Imgproc.COLOR_RGBA2BGR);
+        Imgproc.resize(mBgr, mBgrScaled, mInputSize);
+
+        if (mFaceDetector != null) {
+            int status = mFaceDetector.detect(mBgrScaled, mFaces);
+            Log.d(TAG, "Detector returned status " + status);
+            visualize(mRgba, mFaces);
+        }
+
+        return mRgba;
+    }
+}
diff --git a/samples/android/face-detection/src/org/opencv/samples/facedetect/FdActivity.java b/samples/android/face-detection/src/org/opencv/samples/facedetect/FdActivity.java
deleted file mode 100644
index 979c3da785fd..000000000000
--- a/samples/android/face-detection/src/org/opencv/samples/facedetect/FdActivity.java
+++ /dev/null
@@ -1,259 +0,0 @@
-package org.opencv.samples.facedetect;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.List;
-
-import org.opencv.android.BaseLoaderCallback;
-import org.opencv.android.CameraActivity;
-import org.opencv.android.CameraBridgeViewBase.CvCameraViewFrame;
-import org.opencv.android.LoaderCallbackInterface;
-import org.opencv.android.OpenCVLoader;
-import org.opencv.core.Core;
-import org.opencv.core.Mat;
-import org.opencv.core.MatOfRect;
-import org.opencv.core.Rect;
-import org.opencv.core.Scalar;
-import org.opencv.core.Size;
-import org.opencv.android.CameraBridgeViewBase;
-import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener2;
-import org.opencv.objdetect.CascadeClassifier;
-import org.opencv.imgproc.Imgproc;
-
-import android.app.Activity;
-import android.content.Context;
-import android.os.Bundle;
-import android.util.Log;
-import android.view.Menu;
-import android.view.MenuItem;
-import android.view.WindowManager;
-
-public class FdActivity extends CameraActivity implements CvCameraViewListener2 {
-
-    private static final String    TAG                 = "OCVSample::Activity";
-    private static final Scalar    FACE_RECT_COLOR     = new Scalar(0, 255, 0, 255);
-    public static final int        JAVA_DETECTOR       = 0;
-    public static final int        NATIVE_DETECTOR     = 1;
-
-    private MenuItem               mItemFace50;
-    private MenuItem               mItemFace40;
-    private MenuItem               mItemFace30;
-    private MenuItem               mItemFace20;
-    private MenuItem               mItemType;
-
-    private Mat                    mRgba;
-    private Mat                    mGray;
-    private File                   mCascadeFile;
-    private CascadeClassifier      mJavaDetector;
-    private DetectionBasedTracker  mNativeDetector;
-
-    private int                    mDetectorType       = JAVA_DETECTOR;
-    private String[]               mDetectorName;
-
-    private float                  mRelativeFaceSize   = 0.2f;
-    private int                    mAbsoluteFaceSize   = 0;
-
-    private CameraBridgeViewBase   mOpenCvCameraView;
-
-    private BaseLoaderCallback  mLoaderCallback = new BaseLoaderCallback(this) {
-        @Override
-        public void onManagerConnected(int status) {
-            switch (status) {
-                case LoaderCallbackInterface.SUCCESS:
-                {
-                    Log.i(TAG, "OpenCV loaded successfully");
-
-                    // Load native library after(!) OpenCV initialization
-                    System.loadLibrary("detection_based_tracker");
-
-                    try {
-                        // load cascade file from application resources
-                        InputStream is = getResources().openRawResource(R.raw.lbpcascade_frontalface);
-                        File cascadeDir = getDir("cascade", Context.MODE_PRIVATE);
-                        mCascadeFile = new File(cascadeDir, "lbpcascade_frontalface.xml");
-                        FileOutputStream os = new FileOutputStream(mCascadeFile);
-
-                        byte[] buffer = new byte[4096];
-                        int bytesRead;
-                        while ((bytesRead = is.read(buffer)) != -1) {
-                            os.write(buffer, 0, bytesRead);
-                        }
-                        is.close();
-                        os.close();
-
-                        mJavaDetector = new CascadeClassifier(mCascadeFile.getAbsolutePath());
-                        if (mJavaDetector.empty()) {
-                            Log.e(TAG, "Failed to load cascade classifier");
-                            mJavaDetector = null;
-                        } else
-                            Log.i(TAG, "Loaded cascade classifier from " + mCascadeFile.getAbsolutePath());
-
-                        mNativeDetector = new DetectionBasedTracker(mCascadeFile.getAbsolutePath(), 0);
-
-                        cascadeDir.delete();
-
-                    } catch (IOException e) {
-                        e.printStackTrace();
-                        Log.e(TAG, "Failed to load cascade. Exception thrown: " + e);
-                    }
-
-                    mOpenCvCameraView.enableView();
-                } break;
-                default:
-                {
-                    super.onManagerConnected(status);
-                } break;
-            }
-        }
-    };
-
-    public FdActivity() {
-        mDetectorName = new String[2];
-        mDetectorName[JAVA_DETECTOR] = "Java";
-        mDetectorName[NATIVE_DETECTOR] = "Native (tracking)";
-
-        Log.i(TAG, "Instantiated new " + this.getClass());
-    }
-
-    /** Called when the activity is first created. */
-    @Override
-    public void onCreate(Bundle savedInstanceState) {
-        Log.i(TAG, "called onCreate");
-        super.onCreate(savedInstanceState);
-        getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
-
-        setContentView(R.layout.face_detect_surface_view);
-
-        mOpenCvCameraView = (CameraBridgeViewBase) findViewById(R.id.fd_activity_surface_view);
-        mOpenCvCameraView.setVisibility(CameraBridgeViewBase.VISIBLE);
-        mOpenCvCameraView.setCvCameraViewListener(this);
-    }
-
-    @Override
-    public void onPause()
-    {
-        super.onPause();
-        if (mOpenCvCameraView != null)
-            mOpenCvCameraView.disableView();
-    }
-
-    @Override
-    public void onResume()
-    {
-        super.onResume();
-        if (!OpenCVLoader.initDebug()) {
-            Log.d(TAG, "Internal OpenCV library not found. Using OpenCV Manager for initialization");
-            OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_3_0_0, this, mLoaderCallback);
-        } else {
-            Log.d(TAG, "OpenCV library found inside package. Using it!");
-            mLoaderCallback.onManagerConnected(LoaderCallbackInterface.SUCCESS);
-        }
-    }
-
-    @Override
-    protected List<? extends CameraBridgeViewBase> getCameraViewList() {
-        return Collections.singletonList(mOpenCvCameraView);
-    }
-
-    public void onDestroy() {
-        super.onDestroy();
-        mOpenCvCameraView.disableView();
-    }
-
-    public void onCameraViewStarted(int width, int height) {
-        mGray = new Mat();
-        mRgba = new Mat();
-    }
-
-    public void onCameraViewStopped() {
-        mGray.release();
-        mRgba.release();
-    }
-
-    public Mat onCameraFrame(CvCameraViewFrame inputFrame) {
-
-        mRgba = inputFrame.rgba();
-        mGray = inputFrame.gray();
-
-        if (mAbsoluteFaceSize == 0) {
-            int height = mGray.rows();
-            if (Math.round(height * mRelativeFaceSize) > 0) {
-                mAbsoluteFaceSize = Math.round(height * mRelativeFaceSize);
-            }
-            mNativeDetector.setMinFaceSize(mAbsoluteFaceSize);
-        }
-
-        MatOfRect faces = new MatOfRect();
-
-        if (mDetectorType == JAVA_DETECTOR) {
-            if (mJavaDetector != null)
-                mJavaDetector.detectMultiScale(mGray, faces, 1.1, 2, 2, // TODO: objdetect.CV_HAAR_SCALE_IMAGE
-                        new Size(mAbsoluteFaceSize, mAbsoluteFaceSize), new Size());
-        }
-        else if (mDetectorType == NATIVE_DETECTOR) {
-            if (mNativeDetector != null)
-                mNativeDetector.detect(mGray, faces);
-        }
-        else {
-            Log.e(TAG, "Detection method is not selected!");
-        }
-
-        Rect[] facesArray = faces.toArray();
-        for (int i = 0; i < facesArray.length; i++)
-            Imgproc.rectangle(mRgba, facesArray[i].tl(), facesArray[i].br(), FACE_RECT_COLOR, 3);
-
-        return mRgba;
-    }
-
-    @Override
-    public boolean onCreateOptionsMenu(Menu menu) {
-        Log.i(TAG, "called onCreateOptionsMenu");
-        mItemFace50 = menu.add("Face size 50%");
-        mItemFace40 = menu.add("Face size 40%");
-        mItemFace30 = menu.add("Face size 30%");
-        mItemFace20 = menu.add("Face size 20%");
-        mItemType   = menu.add(mDetectorName[mDetectorType]);
-        return true;
-    }
-
-    @Override
-    public boolean onOptionsItemSelected(MenuItem item) {
-        Log.i(TAG, "called onOptionsItemSelected; selected item: " + item);
-        if (item == mItemFace50)
-            setMinFaceSize(0.5f);
-        else if (item == mItemFace40)
-            setMinFaceSize(0.4f);
-        else if (item == mItemFace30)
-            setMinFaceSize(0.3f);
-        else if (item == mItemFace20)
-            setMinFaceSize(0.2f);
-        else if (item == mItemType) {
-            int tmpDetectorType = (mDetectorType + 1) % mDetectorName.length;
-            item.setTitle(mDetectorName[tmpDetectorType]);
-            setDetectorType(tmpDetectorType);
-        }
-        return true;
-    }
-
-    private void setMinFaceSize(float faceSize) {
-        mRelativeFaceSize = faceSize;
-        mAbsoluteFaceSize = 0;
-    }
-
-    private void setDetectorType(int type) {
-        if (mDetectorType != type) {
-            mDetectorType = type;
-
-            if (type == NATIVE_DETECTOR) {
-                Log.i(TAG, "Detection Based Tracker enabled");
-                mNativeDetector.start();
-            } else {
-                Log.i(TAG, "Cascade detector enabled");
-                mNativeDetector.stop();
-            }
-        }
-    }
-}
diff --git a/samples/android/image-manipulations/build.gradle.in b/samples/android/image-manipulations/build.gradle.in
index 0d685ed41014..3c5034ea9b75 100644
--- a/samples/android/image-manipulations/build.gradle.in
+++ b/samples/android/image-manipulations/build.gradle.in
@@ -1,6 +1,7 @@
 apply plugin: 'com.android.application'
 
 android {
+    namespace 'org.opencv.samples.imagemanipulations'
     compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
     defaultConfig {
         applicationId "org.opencv.samples.imagemanipulations"
@@ -18,7 +19,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -27,5 +27,11 @@ android {
 
 dependencies {
     //implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation project(':opencv')
+    if (gradle.opencv_source == "sdk_path") {
+        println 'Using OpenCV from from SDK'
+        implementation project(':opencv')
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+        println 'Using OpenCV from Maven repo'
+        implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
+    }
 }
diff --git a/samples/android/image-manipulations/src/org/opencv/samples/imagemanipulations/ImageManipulationsActivity.java b/samples/android/image-manipulations/src/org/opencv/samples/imagemanipulations/ImageManipulationsActivity.java
index 5ab5e5f06d3d..1bea019640d5 100644
--- a/samples/android/image-manipulations/src/org/opencv/samples/imagemanipulations/ImageManipulationsActivity.java
+++ b/samples/android/image-manipulations/src/org/opencv/samples/imagemanipulations/ImageManipulationsActivity.java
@@ -4,10 +4,8 @@
 import java.util.Collections;
 import java.util.List;
 
-import org.opencv.android.BaseLoaderCallback;
 import org.opencv.android.CameraActivity;
 import org.opencv.android.CameraBridgeViewBase.CvCameraViewFrame;
-import org.opencv.android.LoaderCallbackInterface;
 import org.opencv.android.OpenCVLoader;
 import org.opencv.core.Core;
 import org.opencv.core.CvType;
@@ -26,6 +24,7 @@
 import android.view.Menu;
 import android.view.MenuItem;
 import android.view.WindowManager;
+import android.widget.Toast;
 
 public class ImageManipulationsActivity extends CameraActivity implements CvCameraViewListener2 {
     private static final String  TAG                 = "OCVSample::Activity";
@@ -67,23 +66,6 @@ public class ImageManipulationsActivity extends CameraActivity implements CvCame
 
     public static int           viewMode = VIEW_MODE_RGBA;
 
-    private BaseLoaderCallback  mLoaderCallback = new BaseLoaderCallback(this) {
-        @Override
-        public void onManagerConnected(int status) {
-            switch (status) {
-                case LoaderCallbackInterface.SUCCESS:
-                {
-                    Log.i(TAG, "OpenCV loaded successfully");
-                    mOpenCvCameraView.enableView();
-                } break;
-                default:
-                {
-                    super.onManagerConnected(status);
-                } break;
-            }
-        }
-    };
-
     public ImageManipulationsActivity() {
         Log.i(TAG, "Instantiated new " + this.getClass());
     }
@@ -93,6 +75,15 @@ public ImageManipulationsActivity() {
     public void onCreate(Bundle savedInstanceState) {
         Log.i(TAG, "called onCreate");
         super.onCreate(savedInstanceState);
+
+        if (OpenCVLoader.initLocal()) {
+            Log.i(TAG, "OpenCV loaded successfully");
+        } else {
+            Log.e(TAG, "OpenCV initialization failed!");
+            (Toast.makeText(this, "OpenCV initialization failed!", Toast.LENGTH_LONG)).show();
+            return;
+        }
+
         getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
 
         setContentView(R.layout.image_manipulations_surface_view);
@@ -114,13 +105,8 @@ public void onPause()
     public void onResume()
     {
         super.onResume();
-        if (!OpenCVLoader.initDebug()) {
-            Log.d(TAG, "Internal OpenCV library not found. Using OpenCV Manager for initialization");
-            OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_3_0_0, this, mLoaderCallback);
-        } else {
-            Log.d(TAG, "OpenCV library found inside package. Using it!");
-            mLoaderCallback.onManagerConnected(LoaderCallbackInterface.SUCCESS);
-        }
+        if (mOpenCvCameraView != null)
+            mOpenCvCameraView.enableView();
     }
 
     @Override
diff --git a/samples/android/mobilenet-objdetect/CMakeLists.txt b/samples/android/mobilenet-objdetect/CMakeLists.txt
index 4e41ebfb28f4..85abfae6a7c6 100644
--- a/samples/android/mobilenet-objdetect/CMakeLists.txt
+++ b/samples/android/mobilenet-objdetect/CMakeLists.txt
@@ -1,5 +1,25 @@
 set(sample example-mobilenet-objdetect)
 
+ocv_download(FILENAME "mobilenet_iter_73000.caffemodel"
+             HASH "bbcb3b6a0afe1ec89e1288096b5b8c66"
+             URL
+               "${OPENCV_MOBILENET_SSD_WEIGHTS_URL}"
+               "$ENV{OPENCV_MOBILENET_SSD_WEIGHTS_URL}"
+               "https://raw.githubusercontent.com/chuanqi305/MobileNet-SSD/97406996b1eee2d40eb0a00ae567cf41e23369f9/mobilenet_iter_73000.caffemodel"
+             DESTINATION_DIR "${CMAKE_CURRENT_LIST_DIR}/res/raw"
+             ID OPENCV_MOBILENET_SSD_WEIGHTS
+             STATUS res)
+
+ocv_download(FILENAME "deploy.prototxt"
+             HASH "f1978dc4fe20c680e850ce99830c5945"
+             URL
+               "${OPENCV_MOBILENET_SSD_CONFIG_URL}"
+               "$ENV{OPENCV_MOBILENET_SSD_CONFIG_URL}"
+               "https://raw.githubusercontent.com/chuanqi305/MobileNet-SSD/97406996b1eee2d40eb0a00ae567cf41e23369f9/deploy.prototxt"
+             DESTINATION_DIR "${CMAKE_CURRENT_LIST_DIR}/res/raw"
+             ID OPENCV_MOBILENET_SSD_CONFIG
+             STATUS res)
+
 add_android_project(${sample} "${CMAKE_CURRENT_SOURCE_DIR}" LIBRARY_DEPS "${OPENCV_ANDROID_LIB_DIR}" SDK_TARGET 11 "${ANDROID_SDK_TARGET}")
 if(TARGET ${sample})
   add_dependencies(opencv_android_examples ${sample})
diff --git a/samples/android/mobilenet-objdetect/build.gradle.in b/samples/android/mobilenet-objdetect/build.gradle.in
index e8238f7324d8..9e8e49b6682b 100644
--- a/samples/android/mobilenet-objdetect/build.gradle.in
+++ b/samples/android/mobilenet-objdetect/build.gradle.in
@@ -1,6 +1,7 @@
 apply plugin: 'com.android.application'
 
 android {
+    namespace 'org.opencv.samples.opencv_mobilenet'
     compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
     defaultConfig {
         applicationId "org.opencv.samples.opencv_mobilenet"
@@ -18,7 +19,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -27,5 +27,11 @@ android {
 
 dependencies {
     //implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation project(':opencv')
+    if (gradle.opencv_source == "sdk_path") {
+        println 'Using OpenCV from SDK'
+        implementation project(':opencv')
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+        println 'Using OpenCV from Maven repo'
+        implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
+    }
 }
diff --git a/samples/android/mobilenet-objdetect/gradle/AndroidManifest.xml b/samples/android/mobilenet-objdetect/gradle/AndroidManifest.xml
index 473a5db3d051..9cc994cb3254 100644
--- a/samples/android/mobilenet-objdetect/gradle/AndroidManifest.xml
+++ b/samples/android/mobilenet-objdetect/gradle/AndroidManifest.xml
@@ -1,15 +1,11 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-          package="org.opencv.samples.opencv_mobilenet"
->
+          package="org.opencv.samples.opencv_mobilenet">
 
     <application
-        android:allowBackup="true"
-        android:icon="@mipmap/ic_launcher"
         android:label="@string/app_name"
-        android:roundIcon="@mipmap/ic_launcher_round"
-        android:supportsRtl="true"
-        android:theme="@style/Theme.AppCompat.NoActionBar">  <!--Full screen mode-->
+        android:icon="@drawable/icon">
+        <!-- //! [mobilenet_tutorial] -->
         <activity
                   android:exported="true"
                   android:name=".MainActivity"
@@ -29,3 +25,4 @@
     <uses-feature android:name="android.hardware.camera.front.autofocus" android:required="false"/>
 
 </manifest>
+<!-- //! [mobilenet_tutorial] -->
diff --git a/platforms/android/service/engine/res/drawable/icon.png b/samples/android/mobilenet-objdetect/res/drawable/icon.png
similarity index 100%
rename from platforms/android/service/engine/res/drawable/icon.png
rename to samples/android/mobilenet-objdetect/res/drawable/icon.png
diff --git a/samples/android/mobilenet-objdetect/res/values/strings.xml b/samples/android/mobilenet-objdetect/res/values/strings.xml
index a7adc522df76..615813efc43a 100644
--- a/samples/android/mobilenet-objdetect/res/values/strings.xml
+++ b/samples/android/mobilenet-objdetect/res/values/strings.xml
@@ -1,3 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
 <resources>
-    <string name="app_name">opencv_mobilenet</string>
+    <string name="app_name">OCV Mobilenet Objdetect</string>
 </resources>
diff --git a/samples/android/mobilenet-objdetect/src/org/opencv/samples/opencv_mobilenet/MainActivity.java b/samples/android/mobilenet-objdetect/src/org/opencv/samples/opencv_mobilenet/MainActivity.java
index 44b4ba3d6ef1..6d417a5214fe 100644
--- a/samples/android/mobilenet-objdetect/src/org/opencv/samples/opencv_mobilenet/MainActivity.java
+++ b/samples/android/mobilenet-objdetect/src/org/opencv/samples/opencv_mobilenet/MainActivity.java
@@ -1,19 +1,25 @@
 package org.opencv.samples.opencv_mobilenet;
-
+/*
+// snippet was added for Android tutorial
+//! [mobilenet_tutorial_package]
+package com.example.myapplication;
+//! [mobilenet_tutorial_package]
+*/
+//! [mobilenet_tutorial]
 import android.content.Context;
 import android.content.res.AssetManager;
 import android.os.Bundle;
-import android.support.v7.app.AppCompatActivity;
 import android.util.Log;
+import android.widget.Toast;
 
-import org.opencv.android.BaseLoaderCallback;
+import org.opencv.android.CameraActivity;
 import org.opencv.android.CameraBridgeViewBase;
 import org.opencv.android.CameraBridgeViewBase.CvCameraViewFrame;
 import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener2;
-import org.opencv.android.LoaderCallbackInterface;
 import org.opencv.android.OpenCVLoader;
 import org.opencv.core.Core;
 import org.opencv.core.Mat;
+import org.opencv.core.MatOfByte;
 import org.opencv.core.Point;
 import org.opencv.core.Scalar;
 import org.opencv.core.Size;
@@ -21,40 +27,44 @@
 import org.opencv.dnn.Dnn;
 import org.opencv.imgproc.Imgproc;
 
-import java.io.BufferedInputStream;
-import java.io.File;
-import java.io.FileOutputStream;
+import java.io.InputStream;
 import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
 
-public class MainActivity extends AppCompatActivity implements CvCameraViewListener2 {
-
-    // Initialize OpenCV manager.
-    private BaseLoaderCallback mLoaderCallback = new BaseLoaderCallback(this) {
-        @Override
-        public void onManagerConnected(int status) {
-            switch (status) {
-                case LoaderCallbackInterface.SUCCESS: {
-                    Log.i(TAG, "OpenCV loaded successfully");
-                    mOpenCvCameraView.enableView();
-                    break;
-                }
-                default: {
-                    super.onManagerConnected(status);
-                    break;
-                }
-            }
-        }
-    };
+public class MainActivity extends CameraActivity implements CvCameraViewListener2 {
 
     @Override
     public void onResume() {
         super.onResume();
-        OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION, this, mLoaderCallback);
+        if (mOpenCvCameraView != null)
+            mOpenCvCameraView.enableView();
     }
 
     @Override
     protected void onCreate(Bundle savedInstanceState) {
         super.onCreate(savedInstanceState);
+
+        if (OpenCVLoader.initLocal()) {
+            Log.i(TAG, "OpenCV loaded successfully");
+        } else {
+            Log.e(TAG, "OpenCV initialization failed!");
+            (Toast.makeText(this, "OpenCV initialization failed!", Toast.LENGTH_LONG)).show();
+            return;
+        }
+
+        //! [init_model_from_memory]
+        mModelBuffer = loadFileFromResource(R.raw.mobilenet_iter_73000);
+        mConfigBuffer = loadFileFromResource(R.raw.deploy);
+        if (mModelBuffer == null || mConfigBuffer == null) {
+            Log.e(TAG, "Failed to load model from resources");
+        } else
+            Log.i(TAG, "Model files loaded successfully");
+
+        net = Dnn.readNet("caffe", mModelBuffer, mConfigBuffer);
+        Log.i(TAG, "Network loaded successfully");
+        //! [init_model_from_memory]
+
         setContentView(R.layout.activity_main);
 
         // Set up camera listener.
@@ -63,12 +73,30 @@ protected void onCreate(Bundle savedInstanceState) {
         mOpenCvCameraView.setCvCameraViewListener(this);
     }
 
+    @Override
+    public void onPause()
+    {
+        super.onPause();
+        if (mOpenCvCameraView != null)
+            mOpenCvCameraView.disableView();
+    }
+
+    @Override
+    protected List<? extends CameraBridgeViewBase> getCameraViewList() {
+        return Collections.singletonList(mOpenCvCameraView);
+    }
+
+    public void onDestroy() {
+        super.onDestroy();
+        if (mOpenCvCameraView != null)
+            mOpenCvCameraView.disableView();
+
+        mModelBuffer.release();
+        mConfigBuffer.release();
+    }
+
     // Load a network.
     public void onCameraViewStarted(int width, int height) {
-        String proto = getPath("MobileNetSSD_deploy.prototxt", this);
-        String weights = getPath("MobileNetSSD_deploy.caffemodel", this);
-        net = Dnn.readNetFromCaffe(proto, weights);
-        Log.i(TAG, "Network loaded successfully");
     }
 
     public Mat onCameraFrame(CvCameraViewFrame inputFrame) {
@@ -80,10 +108,12 @@ public Mat onCameraFrame(CvCameraViewFrame inputFrame) {
         final double THRESHOLD = 0.2;
 
         // Get a new frame
+        Log.d(TAG, "handle new frame!");
         Mat frame = inputFrame.rgba();
         Imgproc.cvtColor(frame, frame, Imgproc.COLOR_RGBA2RGB);
 
         // Forward image through network.
+        //! [mobilenet_handle_frame]
         Mat blob = Dnn.blobFromImage(frame, IN_SCALE_FACTOR,
                 new Size(IN_WIDTH, IN_HEIGHT),
                 new Scalar(MEAN_VAL, MEAN_VAL, MEAN_VAL), /*swapRB*/false, /*crop*/false);
@@ -121,37 +151,36 @@ public Mat onCameraFrame(CvCameraViewFrame inputFrame) {
                         Imgproc.FONT_HERSHEY_SIMPLEX, 0.5, new Scalar(0, 0, 0));
             }
         }
+        //! [mobilenet_handle_frame]
+
         return frame;
     }
 
     public void onCameraViewStopped() {}
 
-    // Upload file to storage and return a path.
-    private static String getPath(String file, Context context) {
-        AssetManager assetManager = context.getAssets();
-
-        BufferedInputStream inputStream = null;
+    //! [mobilenet_tutorial_resource]
+    private MatOfByte loadFileFromResource(int id) {
+       byte[] buffer;
         try {
-            // Read data from assets.
-            inputStream = new BufferedInputStream(assetManager.open(file));
-            byte[] data = new byte[inputStream.available()];
-            inputStream.read(data);
-            inputStream.close();
-
-            // Create copy file in storage.
-            File outFile = new File(context.getFilesDir(), file);
-            FileOutputStream os = new FileOutputStream(outFile);
-            os.write(data);
-            os.close();
-            // Return a path to file which may be read in common way.
-            return outFile.getAbsolutePath();
-        } catch (IOException ex) {
-            Log.i(TAG, "Failed to upload a file");
+            // load cascade file from application resources
+            InputStream is = getResources().openRawResource(id);
+
+            int size = is.available();
+            buffer = new byte[size];
+            int bytesRead = is.read(buffer);
+            is.close();
+        } catch (IOException e) {
+            e.printStackTrace();
+            Log.e(TAG, "Failed to ONNX model from resources! Exception thrown: " + e);
+            (Toast.makeText(this, "Failed to ONNX model from resources!", Toast.LENGTH_LONG)).show();
+            return null;
         }
-        return "";
+
+        return new MatOfByte(buffer);
     }
+    //! [mobilenet_tutorial_resource]
 
-    private static final String TAG = "OpenCV/Sample/MobileNet";
+    private static final String TAG = "OpenCV-MobileNet";
     private static final String[] classNames = {"background",
             "aeroplane", "bicycle", "bird", "boat",
             "bottle", "bus", "car", "cat", "chair",
@@ -159,6 +188,9 @@ private static String getPath(String file, Context context) {
             "motorbike", "person", "pottedplant",
             "sheep", "sofa", "train", "tvmonitor"};
 
-    private Net net;
+    private MatOfByte            mConfigBuffer;
+    private MatOfByte            mModelBuffer;
+    private Net                  net;
     private CameraBridgeViewBase mOpenCvCameraView;
 }
+//! [mobilenet_tutorial]
diff --git a/samples/android/qr-detection/AndroidManifest.xml b/samples/android/qr-detection/AndroidManifest.xml
new file mode 100644
index 000000000000..deb4c6ff122a
--- /dev/null
+++ b/samples/android/qr-detection/AndroidManifest.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="org.opencv.samples.qrdetection"
+    android:versionCode="301"
+    android:versionName="3.01" >
+
+    <uses-sdk android:minSdkVersion="8"/>
+
+    <application
+        android:icon="@drawable/icon"
+        android:label="@string/app_name" >
+
+        <activity
+            android:name=".QRdetectionActivity"
+            android:label="@string/app_name"
+            android:screenOrientation="landscape"
+            android:configChanges="keyboardHidden|orientation" >
+
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+
+    <uses-permission android:name="android.permission.CAMERA"/>
+
+    <uses-feature android:name="android.hardware.camera" android:required="false"/>
+    <uses-feature android:name="android.hardware.camera.autofocus" android:required="false"/>
+    <uses-feature android:name="android.hardware.camera.front" android:required="false"/>
+    <uses-feature android:name="android.hardware.camera.front.autofocus" android:required="false"/>
+
+</manifest>
diff --git a/samples/android/qr-detection/CMakeLists.txt b/samples/android/qr-detection/CMakeLists.txt
new file mode 100644
index 000000000000..00c76dc1d6df
--- /dev/null
+++ b/samples/android/qr-detection/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(sample example-qr-detection)
+
+add_android_project(${sample} "${CMAKE_CURRENT_SOURCE_DIR}" LIBRARY_DEPS "${OPENCV_ANDROID_LIB_DIR}" SDK_TARGET 11 "${ANDROID_SDK_TARGET}")
+if(TARGET ${sample})
+  add_dependencies(opencv_android_examples ${sample})
+endif()
diff --git a/samples/android/qr-detection/build.gradle.in b/samples/android/qr-detection/build.gradle.in
new file mode 100644
index 000000000000..0951b70cdb42
--- /dev/null
+++ b/samples/android/qr-detection/build.gradle.in
@@ -0,0 +1,35 @@
+apply plugin: 'com.android.application'
+
+android {
+    namespace 'org.opencv.samples.qrdetection'
+    compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
+    defaultConfig {
+        applicationId "org.opencv.samples.qrdetection"
+        minSdkVersion @ANDROID_MIN_SDK_VERSION@
+        targetSdkVersion @ANDROID_TARGET_SDK_VERSION@
+        versionCode 301
+        versionName "3.01"
+    }
+    buildTypes {
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+        }
+    }
+    sourceSets {
+        main {
+            java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
+            res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
+            manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
+        }
+    }
+}
+
+dependencies {
+    //implementation fileTree(dir: 'libs', include: ['*.jar'])
+    if (gradle.opencv_source == "sdk_path") {
+        implementation project(':opencv')
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+        implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
+    }
+}
diff --git a/samples/android/qr-detection/gradle/AndroidManifest.xml b/samples/android/qr-detection/gradle/AndroidManifest.xml
new file mode 100644
index 000000000000..83d4aafc8c95
--- /dev/null
+++ b/samples/android/qr-detection/gradle/AndroidManifest.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="org.opencv.samples.qrdetection">
+
+    <application
+        android:icon="@drawable/icon"
+        android:label="@string/app_name">
+
+        <activity
+            android:exported="true"
+            android:name=".QRdetectionActivity"
+            android:label="@string/app_name"
+            android:screenOrientation="landscape"
+            android:configChanges="keyboardHidden|orientation" >
+
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+
+    <uses-permission android:name="android.permission.CAMERA"/>
+
+    <uses-feature android:name="android.hardware.camera" android:required="false"/>
+    <uses-feature android:name="android.hardware.camera.autofocus" android:required="false"/>
+    <uses-feature android:name="android.hardware.camera.front" android:required="false"/>
+    <uses-feature android:name="android.hardware.camera.front.autofocus" android:required="false"/>
+
+</manifest>
diff --git a/samples/android/qr-detection/res/drawable/icon.png b/samples/android/qr-detection/res/drawable/icon.png
new file mode 100644
index 000000000000..630454927b59
Binary files /dev/null and b/samples/android/qr-detection/res/drawable/icon.png differ
diff --git a/samples/android/qr-detection/res/values/strings.xml b/samples/android/qr-detection/res/values/strings.xml
new file mode 100644
index 000000000000..824161eee5fd
--- /dev/null
+++ b/samples/android/qr-detection/res/values/strings.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+    <string name="app_name">OpenCV QR Detector</string>
+</resources>
diff --git a/samples/android/qr-detection/src/org/opencv/samples/qrdetection/QRProcessor.java b/samples/android/qr-detection/src/org/opencv/samples/qrdetection/QRProcessor.java
new file mode 100644
index 000000000000..f3217d7ffe2b
--- /dev/null
+++ b/samples/android/qr-detection/src/org/opencv/samples/qrdetection/QRProcessor.java
@@ -0,0 +1,87 @@
+package org.opencv.samples.qrdetection;
+
+import org.opencv.core.Core;
+import org.opencv.core.Mat;
+import org.opencv.core.MatOfPoint;
+import org.opencv.core.Scalar;
+import org.opencv.core.Size;
+import org.opencv.core.Point;
+import org.opencv.imgproc.Imgproc;
+import org.opencv.objdetect.GraphicalCodeDetector;
+import org.opencv.objdetect.QRCodeDetector;
+import org.opencv.objdetect.QRCodeDetectorAruco;
+
+import android.util.Log;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class QRProcessor {
+    private GraphicalCodeDetector detector;
+    private static final String TAG = "QRProcessor";
+    private Scalar LineColor = new Scalar(255, 0, 0);
+    private Scalar FontColor = new Scalar(0, 0, 255);
+
+    public QRProcessor(boolean useArucoDetector) {
+        if (useArucoDetector)
+            detector = new QRCodeDetectorAruco();
+        else
+            detector = new QRCodeDetector();
+    }
+
+    private boolean findQRs(Mat inputFrame, List<String> decodedInfo, MatOfPoint points,
+                           boolean tryDecode, boolean multiDetect) {
+        boolean result = false;
+        if (multiDetect) {
+            if (tryDecode)
+                result = detector.detectAndDecodeMulti(inputFrame, decodedInfo, points);
+            else
+                result = detector.detectMulti(inputFrame, points);
+        }
+        else {
+            if(tryDecode) {
+                String s = detector.detectAndDecode(inputFrame, points);
+                result = !points.empty();
+                if (result)
+                    decodedInfo.add(s);
+            }
+            else {
+                result = detector.detect(inputFrame, points);
+            }
+        }
+        return result;
+    }
+
+    private void renderQRs(Mat inputFrame, List<String> decodedInfo, MatOfPoint points) {
+        for (int i = 0; i < points.rows(); i++) {
+            for (int j = 0; j < points.cols(); j++) {
+                Point pt1 = new Point(points.get(i, j));
+                Point pt2 = new Point(points.get(i, (j + 1) % 4));
+                Imgproc.line(inputFrame, pt1, pt2, LineColor, 3);
+            }
+            if (!decodedInfo.isEmpty()) {
+                String decode = decodedInfo.get(i);
+                if (decode.length() > 15) {
+                    decode = decode.substring(0, 12) + "...";
+                }
+                int baseline[] = {0};
+                Size textSize = Imgproc.getTextSize(decode, Imgproc.FONT_HERSHEY_COMPLEX, .95, 3, baseline);
+                Scalar sum = Core.sumElems(points.row(i));
+                Point start = new Point(sum.val[0] / 4. - textSize.width / 2., sum.val[1] / 4. - textSize.height / 2.);
+                Imgproc.putText(inputFrame, decode, start, Imgproc.FONT_HERSHEY_COMPLEX, .95, FontColor, 3);
+            }
+        }
+    }
+
+    /* this method to be called from the outside. It processes the frame to find QR codes. */
+    public synchronized Mat handleFrame(Mat inputFrame, boolean tryDecode, boolean multiDetect) {
+        List<String> decodedInfo = new ArrayList<String>();
+        MatOfPoint points = new MatOfPoint();
+        boolean result = findQRs(inputFrame, decodedInfo, points, tryDecode, multiDetect);
+        if (result) {
+            renderQRs(inputFrame, decodedInfo, points);
+        }
+        points.release();
+        return inputFrame;
+    }
+}
diff --git a/samples/android/qr-detection/src/org/opencv/samples/qrdetection/QRdetectionActivity.java b/samples/android/qr-detection/src/org/opencv/samples/qrdetection/QRdetectionActivity.java
new file mode 100644
index 000000000000..39361e2d98a9
--- /dev/null
+++ b/samples/android/qr-detection/src/org/opencv/samples/qrdetection/QRdetectionActivity.java
@@ -0,0 +1,130 @@
+package org.opencv.samples.qrdetection;
+
+import org.opencv.android.CameraActivity;
+import org.opencv.android.OpenCVLoader;
+import org.opencv.core.Mat;
+import org.opencv.android.CameraBridgeViewBase;
+import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener;
+import org.opencv.android.JavaCameraView;
+
+import android.os.Bundle;
+import android.util.Log;
+import android.view.Menu;
+import android.view.MenuItem;
+import android.view.WindowManager;
+import android.widget.Toast;
+
+import java.util.Collections;
+import java.util.List;
+
+public class QRdetectionActivity extends CameraActivity implements CvCameraViewListener {
+
+    private static final String  TAG = "QRdetection::Activity";
+
+    private CameraBridgeViewBase mOpenCvCameraView;
+    private QRProcessor    mQRDetector;
+    private MenuItem             mItemQRCodeDetectorAruco;
+    private MenuItem             mItemQRCodeDetector;
+    private MenuItem             mItemTryDecode;
+    private MenuItem             mItemMulti;
+
+    @Override
+    public void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
+
+        if (OpenCVLoader.initLocal()) {
+            Log.i(TAG, "OpenCV loaded successfully");
+        } else {
+            Log.e(TAG, "OpenCV initialization failed!");
+            (Toast.makeText(this, "OpenCV initialization failed!", Toast.LENGTH_LONG)).show();
+            return;
+        }
+
+        Log.d(TAG, "Creating and setting view");
+        mOpenCvCameraView = new JavaCameraView(this, -1);
+        setContentView(mOpenCvCameraView);
+        mOpenCvCameraView.setVisibility(CameraBridgeViewBase.VISIBLE);
+        mOpenCvCameraView.setCvCameraViewListener(this);
+        mQRDetector = new QRProcessor(true);
+    }
+
+    @Override
+    public void onPause()
+    {
+        super.onPause();
+        if (mOpenCvCameraView != null)
+            mOpenCvCameraView.disableView();
+    }
+
+    @Override
+    public void onResume()
+    {
+        super.onResume();
+        if (mOpenCvCameraView != null) {
+            mOpenCvCameraView.enableView();
+        }
+    }
+
+    @Override
+    protected List<? extends CameraBridgeViewBase> getCameraViewList() {
+        return Collections.singletonList(mOpenCvCameraView);
+    }
+
+    public void onDestroy() {
+        super.onDestroy();
+        if (mOpenCvCameraView != null)
+            mOpenCvCameraView.disableView();
+    }
+
+    @Override
+    public boolean onCreateOptionsMenu(Menu menu) {
+        Log.i(TAG, "called onCreateOptionsMenu");
+        mItemQRCodeDetectorAruco = menu.add("Aruco-based QR code detector");
+        mItemQRCodeDetectorAruco.setCheckable(true);
+        mItemQRCodeDetectorAruco.setChecked(true);
+
+        mItemQRCodeDetector = menu.add("Legacy QR code detector");
+        mItemQRCodeDetector.setCheckable(true);
+        mItemQRCodeDetector.setChecked(false);
+
+        mItemTryDecode = menu.add("Try to decode QR codes");
+        mItemTryDecode.setCheckable(true);
+        mItemTryDecode.setChecked(true);
+
+        mItemMulti = menu.add("Use multi detect/decode");
+        mItemMulti.setCheckable(true);
+        mItemMulti.setChecked(true);
+
+        return true;
+    }
+
+    @Override
+    public boolean onOptionsItemSelected(MenuItem item) {
+        Log.i(TAG, "Menu Item selected " + item);
+        if (item == mItemQRCodeDetector && !mItemQRCodeDetector.isChecked()) {
+            mQRDetector = new QRProcessor(false);
+            mItemQRCodeDetector.setChecked(true);
+            mItemQRCodeDetectorAruco.setChecked(false);
+        } else if (item == mItemQRCodeDetectorAruco && !mItemQRCodeDetectorAruco.isChecked()) {
+            mQRDetector = new QRProcessor(true);
+            mItemQRCodeDetector.setChecked(false);
+            mItemQRCodeDetectorAruco.setChecked(true);
+        } else if (item == mItemTryDecode) {
+            mItemTryDecode.setChecked(!mItemTryDecode.isChecked());
+        } else if (item == mItemMulti) {
+            mItemMulti.setChecked(!mItemMulti.isChecked());
+        }
+        return true;
+    }
+
+    public void onCameraViewStarted(int width, int height) {
+    }
+
+    public void onCameraViewStopped() {
+    }
+
+    public Mat onCameraFrame(Mat inputFrame) {
+        return mQRDetector.handleFrame(inputFrame, mItemTryDecode.isChecked(), mItemMulti.isChecked());
+    }
+}
diff --git a/samples/android/tutorial-1-camerapreview/build.gradle.in b/samples/android/tutorial-1-camerapreview/build.gradle.in
index 5a649175dcb2..7b308b2abbca 100644
--- a/samples/android/tutorial-1-camerapreview/build.gradle.in
+++ b/samples/android/tutorial-1-camerapreview/build.gradle.in
@@ -1,6 +1,7 @@
 apply plugin: 'com.android.application'
 
 android {
+    namespace 'org.opencv.samples.tutorial1'
     compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
     defaultConfig {
         applicationId "org.opencv.samples.tutorial1"
@@ -18,7 +19,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -27,5 +27,11 @@ android {
 
 dependencies {
     //implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation project(':opencv')
+    if (gradle.opencv_source == "sdk_path") {
+        println 'Using OpenCV from SDK'
+        implementation project(':opencv')
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+        println 'Using OpenCV from Maven repo'
+        implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
+    }
 }
diff --git a/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml b/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml
index 79aaf3f5d48e..56c9ad32d61b 100644
--- a/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml
+++ b/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml
@@ -12,7 +12,6 @@
                   android:exported="true"
                   android:name="Tutorial1Activity"
                   android:label="@string/app_name"
-                  android:screenOrientation="landscape"
                   android:configChanges="keyboardHidden|orientation">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
@@ -27,11 +26,13 @@
                       android:largeScreens="true"
                       android:anyDensity="true" />
 
+    <!--[camera_permissions]-->
     <uses-permission android:name="android.permission.CAMERA"/>
 
     <uses-feature android:name="android.hardware.camera" android:required="false"/>
     <uses-feature android:name="android.hardware.camera.autofocus" android:required="false"/>
     <uses-feature android:name="android.hardware.camera.front" android:required="false"/>
     <uses-feature android:name="android.hardware.camera.front.autofocus" android:required="false"/>
+    <!--[camera_permissions]-->
 
 </manifest>
diff --git a/samples/android/tutorial-1-camerapreview/res/layout/tutorial1_surface_view.xml b/samples/android/tutorial-1-camerapreview/res/layout/tutorial1_surface_view.xml
index 77f1c5fba8a7..398efca0bbf5 100644
--- a/samples/android/tutorial-1-camerapreview/res/layout/tutorial1_surface_view.xml
+++ b/samples/android/tutorial-1-camerapreview/res/layout/tutorial1_surface_view.xml
@@ -4,6 +4,7 @@
     android:layout_width="match_parent"
     android:layout_height="match_parent" >
 
+    <!-- [camera_view] -->
     <org.opencv.android.JavaCameraView
         android:layout_width="fill_parent"
         android:layout_height="fill_parent"
@@ -11,5 +12,6 @@
         android:id="@+id/tutorial1_activity_java_surface_view"
         opencv:show_fps="true"
         opencv:camera_id="any" />
+    <!-- [camera_view] -->
 
 </FrameLayout>
diff --git a/samples/android/tutorial-1-camerapreview/src/org/opencv/samples/tutorial1/Tutorial1Activity.java b/samples/android/tutorial-1-camerapreview/src/org/opencv/samples/tutorial1/Tutorial1Activity.java
index 4c9a39b3f78f..e8bc5327cd18 100644
--- a/samples/android/tutorial-1-camerapreview/src/org/opencv/samples/tutorial1/Tutorial1Activity.java
+++ b/samples/android/tutorial-1-camerapreview/src/org/opencv/samples/tutorial1/Tutorial1Activity.java
@@ -1,9 +1,7 @@
 package org.opencv.samples.tutorial1;
 
-import org.opencv.android.BaseLoaderCallback;
 import org.opencv.android.CameraActivity;
 import org.opencv.android.CameraBridgeViewBase.CvCameraViewFrame;
-import org.opencv.android.LoaderCallbackInterface;
 import org.opencv.android.OpenCVLoader;
 import org.opencv.core.Mat;
 import org.opencv.android.CameraBridgeViewBase;
@@ -11,9 +9,9 @@
 
 import android.os.Bundle;
 import android.util.Log;
-import android.view.MenuItem;
 import android.view.SurfaceView;
 import android.view.WindowManager;
+import android.widget.Toast;
 
 import java.util.Collections;
 import java.util.List;
@@ -22,25 +20,6 @@ public class Tutorial1Activity extends CameraActivity implements CvCameraViewLis
     private static final String TAG = "OCVSample::Activity";
 
     private CameraBridgeViewBase mOpenCvCameraView;
-    private boolean              mIsJavaCamera = true;
-    private MenuItem             mItemSwitchCamera = null;
-
-    private BaseLoaderCallback mLoaderCallback = new BaseLoaderCallback(this) {
-        @Override
-        public void onManagerConnected(int status) {
-            switch (status) {
-                case LoaderCallbackInterface.SUCCESS:
-                {
-                    Log.i(TAG, "OpenCV loaded successfully");
-                    mOpenCvCameraView.enableView();
-                } break;
-                default:
-                {
-                    super.onManagerConnected(status);
-                } break;
-            }
-        }
-    };
 
     public Tutorial1Activity() {
         Log.i(TAG, "Instantiated new " + this.getClass());
@@ -51,7 +30,20 @@ public Tutorial1Activity() {
     public void onCreate(Bundle savedInstanceState) {
         Log.i(TAG, "called onCreate");
         super.onCreate(savedInstanceState);
+
+        //! [ocv_loader_init]
+        if (OpenCVLoader.initLocal()) {
+            Log.i(TAG, "OpenCV loaded successfully");
+        } else {
+            Log.e(TAG, "OpenCV initialization failed!");
+            (Toast.makeText(this, "OpenCV initialization failed!", Toast.LENGTH_LONG)).show();
+            return;
+        }
+        //! [ocv_loader_init]
+
+        //! [keep_screen]
         getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
+        //! [keep_screen]
 
         setContentView(R.layout.tutorial1_surface_view);
 
@@ -74,13 +66,8 @@ public void onPause()
     public void onResume()
     {
         super.onResume();
-        if (!OpenCVLoader.initDebug()) {
-            Log.d(TAG, "Internal OpenCV library not found. Using OpenCV Manager for initialization");
-            OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_3_0_0, this, mLoaderCallback);
-        } else {
-            Log.d(TAG, "OpenCV library found inside package. Using it!");
-            mLoaderCallback.onManagerConnected(LoaderCallbackInterface.SUCCESS);
-        }
+        if (mOpenCvCameraView != null)
+            mOpenCvCameraView.enableView();
     }
 
     @Override
@@ -88,18 +75,22 @@ protected List<? extends CameraBridgeViewBase> getCameraViewList() {
         return Collections.singletonList(mOpenCvCameraView);
     }
 
+    @Override
     public void onDestroy() {
         super.onDestroy();
         if (mOpenCvCameraView != null)
             mOpenCvCameraView.disableView();
     }
 
+    @Override
     public void onCameraViewStarted(int width, int height) {
     }
 
+    @Override
     public void onCameraViewStopped() {
     }
 
+    @Override
     public Mat onCameraFrame(CvCameraViewFrame inputFrame) {
         return inputFrame.rgba();
     }
diff --git a/samples/android/tutorial-2-mixedprocessing/build.gradle.in b/samples/android/tutorial-2-mixedprocessing/build.gradle.in
index 7eca49b1b17f..a156f4224090 100644
--- a/samples/android/tutorial-2-mixedprocessing/build.gradle.in
+++ b/samples/android/tutorial-2-mixedprocessing/build.gradle.in
@@ -1,6 +1,7 @@
 apply plugin: 'com.android.application'
 
 android {
+    namespace 'org.opencv.samples.tutorial2'
     compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
     defaultConfig {
         applicationId "org.opencv.samples.tutorial2"
@@ -11,7 +12,14 @@ android {
 
         externalNativeBuild {
             cmake {
-                arguments "-DOpenCV_DIR=" + project(':opencv').projectDir + "/@ANDROID_PROJECT_JNI_PATH@"@OPENCV_ANDROID_CMAKE_EXTRA_ARGS@
+                if (gradle.opencv_source == "sdk_path") {
+                    arguments "-DOpenCV_DIR=" + project(':opencv').projectDir + "/@ANDROID_PROJECT_JNI_PATH@",
+                              "-DOPENCV_FROM_SDK=TRUE"@OPENCV_ANDROID_CMAKE_EXTRA_ARGS@
+
+                } else {
+                    arguments "-DOPENCV_VERSION_MAJOR=@OPENCV_VERSION_MAJOR@",
+                              "-DOPENCV_FROM_SDK=FALSE"@OPENCV_ANDROID_CMAKE_EXTRA_ARGS@
+                }
                 targets "mixed_sample"
             }
         }
@@ -25,7 +33,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -35,9 +42,20 @@ android {
              path '@ANDROID_SAMPLE_JNI_PATH@/CMakeLists.txt'
         }
     }
+    buildFeatures {
+        if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+            prefab true
+        }
+    }
 }
 
 dependencies {
     //implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation project(':opencv')
+    if (gradle.opencv_source == "sdk_path") {
+        println 'Using OpenCV from SDK'
+        implementation project(':opencv')
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+        println 'Using OpenCV from Maven repo'
+        implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
+    }
 }
diff --git a/samples/android/tutorial-2-mixedprocessing/jni/Android.mk b/samples/android/tutorial-2-mixedprocessing/jni/Android.mk
deleted file mode 100644
index 6cd88bd7cfac..000000000000
--- a/samples/android/tutorial-2-mixedprocessing/jni/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-
-include $(CLEAR_VARS)
-
-ifdef OPENCV_ANDROID_SDK
-  ifneq ("","$(wildcard $(OPENCV_ANDROID_SDK)/OpenCV.mk)")
-    include ${OPENCV_ANDROID_SDK}/OpenCV.mk
-  else
-    include ${OPENCV_ANDROID_SDK}/sdk/native/jni/OpenCV.mk
-  endif
-else
-  include ../../sdk/native/jni/OpenCV.mk
-endif
-
-LOCAL_MODULE    := mixed_sample
-LOCAL_SRC_FILES := jni_part.cpp
-LOCAL_LDLIBS +=  -llog -ldl
-
-include $(BUILD_SHARED_LIBRARY)
diff --git a/samples/android/tutorial-2-mixedprocessing/jni/Application.mk b/samples/android/tutorial-2-mixedprocessing/jni/Application.mk
deleted file mode 100644
index 4fffcb2838f5..000000000000
--- a/samples/android/tutorial-2-mixedprocessing/jni/Application.mk
+++ /dev/null
@@ -1,4 +0,0 @@
-APP_STL := gnustl_static
-APP_CPPFLAGS := -frtti -fexceptions
-APP_ABI := armeabi-v7a
-APP_PLATFORM := android-8
diff --git a/samples/android/tutorial-2-mixedprocessing/jni/CMakeLists.txt b/samples/android/tutorial-2-mixedprocessing/jni/CMakeLists.txt
index 5b34f8b9480e..37e9a698bbea 100644
--- a/samples/android/tutorial-2-mixedprocessing/jni/CMakeLists.txt
+++ b/samples/android/tutorial-2-mixedprocessing/jni/CMakeLists.txt
@@ -3,7 +3,14 @@ cmake_minimum_required(VERSION 3.6)
 set(target mixed_sample)
 project(${target} CXX)
 
-set(ANDROID_OPENCV_COMPONENTS "opencv_java" CACHE STRING "")
+if (OPENCV_FROM_SDK)
+  message(STATUS "Using OpenCV from local SDK")
+  set(ANDROID_OPENCV_COMPONENTS "opencv_java" CACHE STRING "")
+else()
+  message(STATUS "Using OpenCV from AAR (Maven repo)")
+  set(ANDROID_OPENCV_COMPONENTS "OpenCV::opencv_java${OPENCV_VERSION_MAJOR}" CACHE STRING "")
+endif()
+
 message(STATUS "ANDROID_ABI=${ANDROID_ABI}")
 find_package(OpenCV REQUIRED COMPONENTS ${ANDROID_OPENCV_COMPONENTS})
 
diff --git a/samples/android/tutorial-2-mixedprocessing/src/org/opencv/samples/tutorial2/Tutorial2Activity.java b/samples/android/tutorial-2-mixedprocessing/src/org/opencv/samples/tutorial2/Tutorial2Activity.java
index 617247afa9fe..a5419c61da86 100644
--- a/samples/android/tutorial-2-mixedprocessing/src/org/opencv/samples/tutorial2/Tutorial2Activity.java
+++ b/samples/android/tutorial-2-mixedprocessing/src/org/opencv/samples/tutorial2/Tutorial2Activity.java
@@ -1,9 +1,7 @@
 package org.opencv.samples.tutorial2;
 
-import org.opencv.android.BaseLoaderCallback;
 import org.opencv.android.CameraActivity;
 import org.opencv.android.CameraBridgeViewBase.CvCameraViewFrame;
-import org.opencv.android.LoaderCallbackInterface;
 import org.opencv.android.OpenCVLoader;
 import org.opencv.core.CvType;
 import org.opencv.core.Mat;
@@ -16,6 +14,7 @@
 import android.view.Menu;
 import android.view.MenuItem;
 import android.view.WindowManager;
+import android.widget.Toast;
 
 import java.util.Collections;
 import java.util.List;
@@ -40,27 +39,6 @@ public class Tutorial2Activity extends CameraActivity implements CvCameraViewLis
 
     private CameraBridgeViewBase   mOpenCvCameraView;
 
-    private BaseLoaderCallback  mLoaderCallback = new BaseLoaderCallback(this) {
-        @Override
-        public void onManagerConnected(int status) {
-            switch (status) {
-                case LoaderCallbackInterface.SUCCESS:
-                {
-                    Log.i(TAG, "OpenCV loaded successfully");
-
-                    // Load native library after(!) OpenCV initialization
-                    System.loadLibrary("mixed_sample");
-
-                    mOpenCvCameraView.enableView();
-                } break;
-                default:
-                {
-                    super.onManagerConnected(status);
-                } break;
-            }
-        }
-    };
-
     public Tutorial2Activity() {
         Log.i(TAG, "Instantiated new " + this.getClass());
     }
@@ -70,6 +48,18 @@ public Tutorial2Activity() {
     public void onCreate(Bundle savedInstanceState) {
         Log.i(TAG, "called onCreate");
         super.onCreate(savedInstanceState);
+
+        if (OpenCVLoader.initLocal()) {
+            Log.i(TAG, "OpenCV loaded successfully");
+        } else {
+            Log.e(TAG, "OpenCV initialization failed!");
+            (Toast.makeText(this, "OpenCV initialization failed!", Toast.LENGTH_LONG)).show();
+            return;
+        }
+
+        // Load native library after(!) OpenCV initialization
+        System.loadLibrary("mixed_sample");
+
         getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
 
         setContentView(R.layout.tutorial2_surface_view);
@@ -101,13 +91,8 @@ public void onPause()
     public void onResume()
     {
         super.onResume();
-        if (!OpenCVLoader.initDebug()) {
-            Log.d(TAG, "Internal OpenCV library not found. Using OpenCV Manager for initialization");
-            OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_3_0_0, this, mLoaderCallback);
-        } else {
-            Log.d(TAG, "OpenCV library found inside package. Using it!");
-            mLoaderCallback.onManagerConnected(LoaderCallbackInterface.SUCCESS);
-        }
+        if (mOpenCvCameraView != null)
+            mOpenCvCameraView.enableView();
     }
 
     @Override
diff --git a/samples/android/tutorial-3-cameracontrol/build.gradle.in b/samples/android/tutorial-3-cameracontrol/build.gradle.in
index 0ba304f5e51f..d9c7f29ac3d3 100644
--- a/samples/android/tutorial-3-cameracontrol/build.gradle.in
+++ b/samples/android/tutorial-3-cameracontrol/build.gradle.in
@@ -1,6 +1,7 @@
 apply plugin: 'com.android.application'
 
 android {
+    namespace 'org.opencv.samples.tutorial3'
     compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
     defaultConfig {
         applicationId "org.opencv.samples.tutorial3"
@@ -18,7 +19,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -27,5 +27,11 @@ android {
 
 dependencies {
     //implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation project(':opencv')
+    if (gradle.opencv_source == "sdk_path") {
+        println 'Using OpenCV from SDK'
+        implementation project(':opencv')
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+        println 'Using OpenCV from Maven repo'
+        implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
+    }
 }
diff --git a/samples/android/tutorial-3-cameracontrol/src/org/opencv/samples/tutorial3/Tutorial3Activity.java b/samples/android/tutorial-3-cameracontrol/src/org/opencv/samples/tutorial3/Tutorial3Activity.java
index c487f11e8d70..9a4a9ea6197c 100644
--- a/samples/android/tutorial-3-cameracontrol/src/org/opencv/samples/tutorial3/Tutorial3Activity.java
+++ b/samples/android/tutorial-3-cameracontrol/src/org/opencv/samples/tutorial3/Tutorial3Activity.java
@@ -6,17 +6,18 @@
 import java.util.List;
 import java.util.ListIterator;
 
-import org.opencv.android.BaseLoaderCallback;
 import org.opencv.android.CameraActivity;
 import org.opencv.android.CameraBridgeViewBase;
 import org.opencv.android.CameraBridgeViewBase.CvCameraViewFrame;
-import org.opencv.android.LoaderCallbackInterface;
 import org.opencv.android.OpenCVLoader;
 import org.opencv.core.Mat;
 import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener2;
 
+import android.Manifest;
 import android.annotation.SuppressLint;
+import android.content.pm.PackageManager;
 import android.hardware.Camera.Size;
+import android.os.Build;
 import android.os.Bundle;
 import android.os.Environment;
 import android.util.Log;
@@ -43,24 +44,6 @@ public class Tutorial3Activity extends CameraActivity implements CvCameraViewLis
     private MenuItem[] mResolutionMenuItems;
     private SubMenu mResolutionMenu;
 
-    private BaseLoaderCallback mLoaderCallback = new BaseLoaderCallback(this) {
-        @Override
-        public void onManagerConnected(int status) {
-            switch (status) {
-                case LoaderCallbackInterface.SUCCESS:
-                {
-                    Log.i(TAG, "OpenCV loaded successfully");
-                    mOpenCvCameraView.enableView();
-                    mOpenCvCameraView.setOnTouchListener(Tutorial3Activity.this);
-                } break;
-                default:
-                {
-                    super.onManagerConnected(status);
-                } break;
-            }
-        }
-    };
-
     public Tutorial3Activity() {
         Log.i(TAG, "Instantiated new " + this.getClass());
     }
@@ -70,6 +53,15 @@ public Tutorial3Activity() {
     public void onCreate(Bundle savedInstanceState) {
         Log.i(TAG, "called onCreate");
         super.onCreate(savedInstanceState);
+
+        if (OpenCVLoader.initLocal()) {
+            Log.i(TAG, "OpenCV loaded successfully");
+        } else {
+            Log.e(TAG, "OpenCV initialization failed!");
+            (Toast.makeText(this, "OpenCV initialization failed!", Toast.LENGTH_LONG)).show();
+            return;
+        }
+
         getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
 
         setContentView(R.layout.tutorial3_surface_view);
@@ -93,12 +85,9 @@ public void onPause()
     public void onResume()
     {
         super.onResume();
-        if (!OpenCVLoader.initDebug()) {
-            Log.d(TAG, "Internal OpenCV library not found. Using OpenCV Manager for initialization");
-            OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_3_0_0, this, mLoaderCallback);
-        } else {
-            Log.d(TAG, "OpenCV library found inside package. Using it!");
-            mLoaderCallback.onManagerConnected(LoaderCallbackInterface.SUCCESS);
+        if (mOpenCvCameraView != null) {
+            mOpenCvCameraView.enableView();
+            mOpenCvCameraView.setOnTouchListener(Tutorial3Activity.this);
         }
     }
 
@@ -191,10 +180,17 @@ else if (item.getGroupId() == 2)
     @Override
     public boolean onTouch(View v, MotionEvent event) {
         Log.i(TAG,"onTouch event");
+        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+            if (checkSelfPermission(Manifest.permission.WRITE_EXTERNAL_STORAGE)
+                != PackageManager.PERMISSION_GRANTED) {
+                String[] permissions = {Manifest.permission.WRITE_EXTERNAL_STORAGE};
+                requestPermissions(permissions, 1);
+                return false;
+            }
+        }
         SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss");
         String currentDateandTime = sdf.format(new Date());
-        String fileName = Environment.getExternalStorageDirectory().getPath() +
-                               "/sample_picture_" + currentDateandTime + ".jpg";
+        String fileName = "sample_picture_" + currentDateandTime + ".jpg";
         mOpenCvCameraView.takePicture(fileName);
         Toast.makeText(this, fileName + " saved", Toast.LENGTH_SHORT).show();
         return false;
diff --git a/samples/android/tutorial-3-cameracontrol/src/org/opencv/samples/tutorial3/Tutorial3View.java b/samples/android/tutorial-3-cameracontrol/src/org/opencv/samples/tutorial3/Tutorial3View.java
index d102cbcdef08..fbc2949cec22 100644
--- a/samples/android/tutorial-3-cameracontrol/src/org/opencv/samples/tutorial3/Tutorial3View.java
+++ b/samples/android/tutorial-3-cameracontrol/src/org/opencv/samples/tutorial3/Tutorial3View.java
@@ -1,14 +1,24 @@
 package org.opencv.samples.tutorial3;
 
 import java.io.FileOutputStream;
+import java.io.OutputStream;
 import java.util.List;
+import java.util.Objects;
 
 import org.opencv.android.JavaCameraView;
 
+import android.content.ContentResolver;
+import android.content.ContentValues;
 import android.content.Context;
+import android.graphics.Bitmap;
+import android.graphics.BitmapFactory;
 import android.hardware.Camera;
 import android.hardware.Camera.PictureCallback;
 import android.hardware.Camera.Size;
+import android.net.Uri;
+import android.os.Build;
+import android.os.Environment;
+import android.provider.MediaStore;
 import android.util.AttributeSet;
 import android.util.Log;
 
@@ -73,15 +83,36 @@ public void onPictureTaken(byte[] data, Camera camera) {
         mCamera.setPreviewCallback(this);
 
         // Write the image in a file (in jpeg format)
-        try {
-            FileOutputStream fos = new FileOutputStream(mPictureFileName);
-
-            fos.write(data);
-            fos.close();
-
-        } catch (java.io.IOException e) {
-            Log.e("PictureDemo", "Exception in photoCallback", e);
+        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) {
+            Bitmap bitmap = BitmapFactory.decodeByteArray(data, 0, data.length);
+            new Thread(new Runnable() {
+                @Override
+                public void run() {
+                    ContentResolver resolver = getContext().getContentResolver();
+                    ContentValues contentValues = new ContentValues();
+                    contentValues.put(MediaStore.MediaColumns.DISPLAY_NAME, mPictureFileName);
+                    contentValues.put(MediaStore.MediaColumns.MIME_TYPE, "image/jpg");
+                    contentValues.put(MediaStore.MediaColumns.RELATIVE_PATH, Environment.DIRECTORY_PICTURES);
+                    Uri imageUri = resolver.insert(MediaStore.Images.Media.EXTERNAL_CONTENT_URI, contentValues);
+                    try {
+                        OutputStream fos = resolver.openOutputStream(Objects.requireNonNull(imageUri));
+                        bitmap.compress(Bitmap.CompressFormat.JPEG, 100, fos);
+                        Objects.requireNonNull(fos).close();
+                    } catch (java.io.IOException e) {
+                        Log.e("PictureDemo", "Exception in photoCallback", e);
+                    }
+                }
+            }).start();
+        } else {
+            mPictureFileName = Environment.getExternalStoragePublicDirectory(Environment.DIRECTORY_PICTURES).getPath()
+                               + "/" + mPictureFileName;
+            try {
+                FileOutputStream fos = new FileOutputStream(mPictureFileName);
+                fos.write(data);
+                fos.close();
+            } catch (java.io.IOException e) {
+                Log.e("PictureDemo", "Exception in photoCallback", e);
+            }
         }
-
     }
 }
diff --git a/samples/android/tutorial-4-opencl/CMakeLists.txt b/samples/android/tutorial-4-opencl/CMakeLists.txt
index 94e955fe362a..92d24050ffd8 100644
--- a/samples/android/tutorial-4-opencl/CMakeLists.txt
+++ b/samples/android/tutorial-4-opencl/CMakeLists.txt
@@ -1,12 +1,4 @@
 set(sample example-tutorial-4-opencl)
-if(NOT DEFINED ANDROID_OPENCL_SDK)
-  message(STATUS "Sample ${sample} is disabled, because ANDROID_OPENCL_SDK is not specified")
-  return()
-endif()
-if(ANDROID_NATIVE_API_LEVEL LESS 14)
-  message(STATUS "Sample ${sample} is disabled, because ANDROID_NATIVE_API_LEVEL < 14")
-  return()
-endif()
 
 if(BUILD_FAT_JAVA_LIB)
   set(native_deps opencv_java)
@@ -14,12 +6,10 @@ else()
   set(native_deps opencv_imgproc)
 endif()
 
-include_directories(${ANDROID_OPENCL_SDK}/include)
-link_directories(${ANDROID_OPENCL_SDK}/lib/${ANDROID_NDK_ABI_NAME})
 add_android_project(${sample} "${CMAKE_CURRENT_SOURCE_DIR}"
     LIBRARY_DEPS "${OPENCV_ANDROID_LIB_DIR}"
     SDK_TARGET 21 "${ANDROID_SDK_TARGET}"
-    NATIVE_DEPS ${native_deps} -lGLESv2 -lEGL -lOpenCL
+    NATIVE_DEPS ${native_deps} -lGLESv2 -lEGL
     COPY_LIBS YES
 )
 if(TARGET ${sample})
diff --git a/samples/android/tutorial-4-opencl/build.gradle.in b/samples/android/tutorial-4-opencl/build.gradle.in
index 2cb9a7bcb961..8eeb12b17d46 100644
--- a/samples/android/tutorial-4-opencl/build.gradle.in
+++ b/samples/android/tutorial-4-opencl/build.gradle.in
@@ -1,6 +1,7 @@
 apply plugin: 'com.android.application'
 
 android {
+    namespace 'org.opencv.samples.tutorial4'
     compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
     defaultConfig {
         applicationId "org.opencv.samples.tutorial4"
@@ -11,7 +12,16 @@ android {
 
         externalNativeBuild {
             cmake {
-                arguments "-DOpenCV_DIR=" + project(':opencv').projectDir + "/@ANDROID_PROJECT_JNI_PATH@"@OPENCV_ANDROID_CMAKE_EXTRA_ARGS@
+                if (gradle.opencv_source == "sdk_path") {
+                    arguments "-DOpenCV_DIR=" + project(':opencv').projectDir + "/@ANDROID_PROJECT_JNI_PATH@",
+                              "-DOPENCV_FROM_SDK=TRUE",
+                              "-DANDROID_OPENCL_SDK=@ANDROID_OPENCL_SDK@" @OPENCV_ANDROID_CMAKE_EXTRA_ARGS@
+
+                } else {
+                    arguments "-DOPENCV_VERSION_MAJOR=@OPENCV_VERSION_MAJOR@",
+                              "-DOPENCV_FROM_SDK=FALSE",
+                              "-DANDROID_OPENCL_SDK=@ANDROID_OPENCL_SDK@" @OPENCV_ANDROID_CMAKE_EXTRA_ARGS@
+                }
                 targets "JNIpart"
             }
         }
@@ -25,7 +35,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -35,9 +44,20 @@ android {
              path '@ANDROID_SAMPLE_JNI_PATH@/CMakeLists.txt'
         }
     }
+    buildFeatures {
+        if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+            prefab true
+        }
+    }
 }
 
 dependencies {
     //implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation project(':opencv')
+    if (gradle.opencv_source == "sdk_path") {
+        println 'Using OpenCV from SDK'
+        implementation project(':opencv')
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+        println 'Using OpenCV from Maven repo'
+        implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
+    }
 }
diff --git a/samples/android/tutorial-4-opencl/gradle/AndroidManifest.xml b/samples/android/tutorial-4-opencl/gradle/AndroidManifest.xml
index 7cef5cc67584..245daa859ee6 100644
--- a/samples/android/tutorial-4-opencl/gradle/AndroidManifest.xml
+++ b/samples/android/tutorial-4-opencl/gradle/AndroidManifest.xml
@@ -4,10 +4,6 @@
     android:versionCode="1"
     android:versionName="1.0" >
 
-    <uses-sdk
-        android:minSdkVersion="14"
-        android:targetSdkVersion="21" />
-
     <uses-feature android:glEsVersion="0x00020000" android:required="true"/>
     <uses-feature android:name="android.hardware.camera"/>
     <uses-feature android:name="android.hardware.camera2" android:required="false"/>
diff --git a/samples/android/tutorial-4-opencl/jni/Android.mk b/samples/android/tutorial-4-opencl/jni/Android.mk
deleted file mode 100644
index dacd0f665c71..000000000000
--- a/samples/android/tutorial-4-opencl/jni/Android.mk
+++ /dev/null
@@ -1,27 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-
-# add OpenCV
-include $(CLEAR_VARS)
-OPENCV_INSTALL_MODULES:=on
-ifdef OPENCV_ANDROID_SDK
-  ifneq ("","$(wildcard $(OPENCV_ANDROID_SDK)/OpenCV.mk)")
-    include ${OPENCV_ANDROID_SDK}/OpenCV.mk
-  else
-    include ${OPENCV_ANDROID_SDK}/sdk/native/jni/OpenCV.mk
-  endif
-else
-  include ../../sdk/native/jni/OpenCV.mk
-endif
-
-ifndef OPENCL_SDK
-  $(error Specify OPENCL_SDK to Android OpenCL SDK location)
-endif
-
-# add OpenCL
-LOCAL_C_INCLUDES += $(OPENCL_SDK)/include
-LOCAL_LDLIBS += -L$(OPENCL_SDK)/lib/$(TARGET_ARCH_ABI) -lOpenCL
-
-LOCAL_MODULE    := JNIpart
-LOCAL_SRC_FILES := jni.c CLprocessor.cpp
-LOCAL_LDLIBS    += -llog -lGLESv2 -lEGL
-include $(BUILD_SHARED_LIBRARY)
\ No newline at end of file
diff --git a/samples/android/tutorial-4-opencl/jni/Application.mk b/samples/android/tutorial-4-opencl/jni/Application.mk
deleted file mode 100644
index 06db65762a8d..000000000000
--- a/samples/android/tutorial-4-opencl/jni/Application.mk
+++ /dev/null
@@ -1,4 +0,0 @@
-APP_STL := gnustl_static
-APP_GNUSTL_FORCE_CPP_FEATURES := exceptions rtti
-APP_ABI := armeabi-v7a
-APP_PLATFORM := android-14
diff --git a/samples/android/tutorial-4-opencl/jni/CLprocessor.cpp b/samples/android/tutorial-4-opencl/jni/CLprocessor.cpp
index 27b878b3fad1..91be84f9b4dc 100644
--- a/samples/android/tutorial-4-opencl/jni/CLprocessor.cpp
+++ b/samples/android/tutorial-4-opencl/jni/CLprocessor.cpp
@@ -1,6 +1,8 @@
+#ifdef OPENCL_FOUND
 #define __CL_ENABLE_EXCEPTIONS
 #define CL_USE_DEPRECATED_OPENCL_1_1_APIS /*let's give a chance for OpenCL 1.1 devices*/
-#include <CL/cl.hpp>
+#include <CL/opencl.hpp>
+#endif
 
 #include <GLES2/gl2.h>
 #include <EGL/egl.h>
@@ -10,7 +12,9 @@
 #include <opencv2/core/ocl.hpp>
 
 #include "common.hpp"
+#include "CLprocessor.hpp"
 
+#ifdef OPENCL_FOUND
 const char oclProgB2B[] = "// clBuffer to clBuffer";
 const char oclProgI2B[] = "// clImage to clBuffer";
 const char oclProgI2I[] = \
@@ -33,7 +37,7 @@ const char oclProgI2I[] = \
     "    write_imagef(imgOut, pos, sum*10); \n" \
     "} \n";
 
-void dumpCLinfo()
+static void dumpCLinfo()
 {
     LOGD("*** OpenCL info ***");
     try
@@ -83,10 +87,11 @@ cl::CommandQueue theQueue;
 cl::Program theProgB2B, theProgI2B, theProgI2I;
 bool haveOpenCL = false;
 
-extern "C" void initCL()
+//![init_opencl]
+int initCL()
 {
     dumpCLinfo();
-
+    LOGE("initCL: start initCL");
     EGLDisplay mEglDisplay = eglGetCurrentDisplay();
     if (mEglDisplay == EGL_NO_DISPLAY)
         LOGE("initCL: eglGetCurrentDisplay() returned 'EGL_NO_DISPLAY', error = %x", eglGetError());
@@ -133,21 +138,26 @@ extern "C" void initCL()
     catch(const cl::Error& e)
     {
         LOGE("cl::Error: %s (%d)", e.what(), e.err());
+        return 1;
     }
     catch(const std::exception& e)
     {
         LOGE("std::exception: %s", e.what());
+        return 2;
     }
     catch(...)
     {
         LOGE( "OpenCL info: unknown error while initializing OpenCL stuff" );
+        return 3;
     }
     LOGD("initCL completed");
-}
 
-extern "C" void closeCL()
-{
+    if (haveOpenCL)
+        return 0;
+    else
+        return 4;
 }
+//![init_opencl]
 
 #define GL_TEXTURE_2D 0x0DE1
 void procOCL_I2I(int texIn, int texOut, int w, int h)
@@ -160,6 +170,7 @@ void procOCL_I2I(int texIn, int texOut, int w, int h)
     }
 
     LOGD("procOCL_I2I(%d, %d, %d, %d)", texIn, texOut, w, h);
+//![process_pure_opencl]
     cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY,  GL_TEXTURE_2D, 0, texIn);
     cl::ImageGL imgOut(theContext, CL_MEM_WRITE_ONLY, GL_TEXTURE_2D, 0, texOut);
     std::vector < cl::Memory > images;
@@ -187,6 +198,7 @@ void procOCL_I2I(int texIn, int texOut, int w, int h)
     theQueue.enqueueReleaseGLObjects(&images);
     theQueue.finish();
     LOGD("enqueueReleaseGLObjects() costs %d ms", getTimeInterval(t));
+//![process_pure_opencl]
 }
 
 void procOCL_OCV(int texIn, int texOut, int w, int h)
@@ -198,6 +210,7 @@ void procOCL_OCV(int texIn, int texOut, int w, int h)
         return;
     }
 
+//![process_tapi]
     int64_t t = getTimeMs();
     cl::ImageGL imgIn (theContext, CL_MEM_READ_ONLY,  GL_TEXTURE_2D, 0, texIn);
     std::vector < cl::Memory > images(1, imgIn);
@@ -224,11 +237,22 @@ void procOCL_OCV(int texIn, int texOut, int w, int h)
     cl_command_queue q = (cl_command_queue)cv::ocl::Queue::getDefault().ptr();
     size_t offset = 0;
     size_t origin[3] = { 0, 0, 0 };
-    size_t region[3] = { w, h, 1 };
+    size_t region[3] = { (size_t)w, (size_t)h, 1 };
     CV_Assert(clEnqueueCopyBufferToImage (q, clBuffer, imgOut(), offset, origin, region, 0, NULL, NULL) == CL_SUCCESS);
     theQueue.enqueueReleaseGLObjects(&images);
     cv::ocl::finish();
     LOGD("uploading results to texture costs %d ms", getTimeInterval(t));
+//![process_tapi]
+}
+#else
+int initCL()
+{
+    return 5;
+}
+#endif
+
+void closeCL()
+{
 }
 
 void drawFrameProcCPU(int w, int h, int texOut)
@@ -263,7 +287,7 @@ void drawFrameProcCPU(int w, int h, int texOut)
 
 enum ProcMode {PROC_MODE_NO_PROC=0, PROC_MODE_CPU=1, PROC_MODE_OCL_DIRECT=2, PROC_MODE_OCL_OCV=3};
 
-extern "C" void processFrame(int tex1, int tex2, int w, int h, int mode)
+void processFrame(int tex1, int tex2, int w, int h, int mode)
 {
     switch(mode)
     {
@@ -271,12 +295,14 @@ extern "C" void processFrame(int tex1, int tex2, int w, int h, int mode)
     case PROC_MODE_CPU:
         drawFrameProcCPU(w, h, tex2);
         break;
+#ifdef OPENCL_FOUND
     case PROC_MODE_OCL_DIRECT:
         procOCL_I2I(tex1, tex2, w, h);
         break;
     case PROC_MODE_OCL_OCV:
         procOCL_OCV(tex1, tex2, w, h);
         break;
+#endif
     default:
         LOGE("Unexpected processing mode: %d", mode);
     }
diff --git a/samples/android/tutorial-4-opencl/jni/CLprocessor.hpp b/samples/android/tutorial-4-opencl/jni/CLprocessor.hpp
new file mode 100644
index 000000000000..c293f253b58d
--- /dev/null
+++ b/samples/android/tutorial-4-opencl/jni/CLprocessor.hpp
@@ -0,0 +1,8 @@
+#ifndef __CL_PROCESSOR_HPP__
+#define __CL_PROCESSOR_HPP__
+
+int initCL();
+void closeCL();
+void processFrame(int tex1, int tex2, int w, int h, int mode);
+
+#endif
diff --git a/samples/android/tutorial-4-opencl/jni/CMakeLists.txt b/samples/android/tutorial-4-opencl/jni/CMakeLists.txt
index 4fdea1356ed6..d1a1dcf234d3 100644
--- a/samples/android/tutorial-4-opencl/jni/CMakeLists.txt
+++ b/samples/android/tutorial-4-opencl/jni/CMakeLists.txt
@@ -1,15 +1,42 @@
 cmake_minimum_required(VERSION 3.6)
 
-set(target mixed_sample)
+set(target JNIpart)
 project(${target} CXX)
 
-set(ANDROID_OPENCV_COMPONENTS "opencv_java" CACHE STRING "")
+if (OPENCV_FROM_SDK)
+  message(STATUS "Using OpenCV from local SDK")
+  set(ANDROID_OPENCV_COMPONENTS "opencv_java" CACHE STRING "")
+else()
+  message(STATUS "Using OpenCV from AAR (Maven repo)")
+  set(ANDROID_OPENCV_COMPONENTS "OpenCV::opencv_java${OPENCV_VERSION_MAJOR}" CACHE STRING "")
+endif()
+
 message(STATUS "ANDROID_ABI=${ANDROID_ABI}")
 find_package(OpenCV REQUIRED COMPONENTS ${ANDROID_OPENCV_COMPONENTS})
+find_package(OpenCL QUIET)
 
 file(GLOB srcs *.cpp *.c)
 file(GLOB hdrs *.hpp *.h)
 
 include_directories("${CMAKE_CURRENT_LIST_DIR}")
 add_library(${target} SHARED ${srcs} ${hdrs})
-target_link_libraries(${target} ${ANDROID_OPENCV_COMPONENTS} -lGLESv2 -lEGL -lOpenCL)
+
+target_link_libraries(${target} ${ANDROID_OPENCV_COMPONENTS} -lGLESv2 -lEGL -llog)
+
+if(OpenCL_FOUND)
+  include_directories(${OpenCL_INCLUDE_DIRS})
+  target_link_libraries(${target} ${OpenCL_LIBRARIES})
+  add_definitions("-DOPENCL_FOUND")
+elseif(NOT ("${ANDROID_OPENCL_SDK}" STREQUAL ""))
+  include_directories(${ANDROID_OPENCL_SDK}/include)
+  link_directories(${ANDROID_OPENCL_SDK}/lib)
+  target_link_directories(${target} PRIVATE ${ANDROID_OPENCL_SDK}/lib)
+
+  set_target_properties(${target} PROPERTIES LINK_FLAGS "-Wl,--allow-shlib-undefined")
+  target_link_libraries(${target} -lOpenCL)
+
+  add_definitions("-DOPENCL_FOUND")
+  add_definitions("-DCL_HPP_MINIMUM_OPENCL_VERSION=120")
+  add_definitions("-DCL_HPP_TARGET_OPENCL_VERSION=120")
+  add_definitions("-DCL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY")
+endif()
diff --git a/samples/android/tutorial-4-opencl/jni/jni.c b/samples/android/tutorial-4-opencl/jni/jni.c
deleted file mode 100644
index 0c48ab6063a1..000000000000
--- a/samples/android/tutorial-4-opencl/jni/jni.c
+++ /dev/null
@@ -1,20 +0,0 @@
-#include <jni.h>
-
-int initCL();
-void closeCL();
-void processFrame(int tex1, int tex2, int w, int h, int mode);
-
-JNIEXPORT jint JNICALL Java_org_opencv_samples_tutorial4_NativePart_initCL(JNIEnv * env, jclass cls)
-{
-    return initCL();
-}
-
-JNIEXPORT void JNICALL Java_org_opencv_samples_tutorial4_NativePart_closeCL(JNIEnv * env, jclass cls)
-{
-    closeCL();
-}
-
-JNIEXPORT void JNICALL Java_org_opencv_samples_tutorial4_NativePart_processFrame(JNIEnv * env, jclass cls, jint tex1, jint tex2, jint w, jint h, jint mode)
-{
-    processFrame(tex1, tex2, w, h, mode);
-}
diff --git a/samples/android/tutorial-4-opencl/jni/jni.cpp b/samples/android/tutorial-4-opencl/jni/jni.cpp
new file mode 100644
index 000000000000..354fcb30b0f8
--- /dev/null
+++ b/samples/android/tutorial-4-opencl/jni/jni.cpp
@@ -0,0 +1,36 @@
+#include <jni.h>
+#include "CLprocessor.hpp"
+
+extern "C" {
+JNIEXPORT jboolean JNICALL Java_org_opencv_samples_tutorial4_NativePart_builtWithOpenCL(JNIEnv * env, jclass cls);
+
+JNIEXPORT jint JNICALL Java_org_opencv_samples_tutorial4_NativePart_initCL(JNIEnv * env, jclass cls);
+
+JNIEXPORT void JNICALL Java_org_opencv_samples_tutorial4_NativePart_closeCL(JNIEnv * env, jclass cls);
+
+JNIEXPORT void JNICALL Java_org_opencv_samples_tutorial4_NativePart_processFrame(JNIEnv * env, jclass cls, jint tex1, jint tex2, jint w, jint h, jint mode);
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_samples_tutorial4_NativePart_builtWithOpenCL(JNIEnv * env, jclass cls)
+{
+#ifdef OPENCL_FOUND
+    return JNI_TRUE;
+#else
+    return JNI_FALSE;
+#endif
+}
+
+JNIEXPORT jint JNICALL Java_org_opencv_samples_tutorial4_NativePart_initCL(JNIEnv * env, jclass cls)
+{
+    return initCL();
+}
+
+JNIEXPORT void JNICALL Java_org_opencv_samples_tutorial4_NativePart_closeCL(JNIEnv * env, jclass cls)
+{
+    closeCL();
+}
+
+JNIEXPORT void JNICALL Java_org_opencv_samples_tutorial4_NativePart_processFrame(JNIEnv * env, jclass cls, jint tex1, jint tex2, jint w, jint h, jint mode)
+{
+    processFrame(tex1, tex2, w, h, mode);
+}
+} // extern "C"
diff --git a/samples/android/tutorial-4-opencl/res/menu/menu.xml b/samples/android/tutorial-4-opencl/res/menu/menu.xml
deleted file mode 100644
index a737e39d2c33..000000000000
--- a/samples/android/tutorial-4-opencl/res/menu/menu.xml
+++ /dev/null
@@ -1,9 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<menu xmlns:android="http://schemas.android.com/apk/res/android" >
-    <group android:checkableBehavior="single">
-        <item android:id="@+id/no_proc" android:title="No processing" />
-        <item android:id="@+id/cpu" android:title="Use CPU code" />
-        <item android:id="@+id/ocl_direct" android:title="Use OpenCL direct" />
-        <item android:id="@+id/ocl_ocv" android:title="Use OpenCL via OpenCV" />
-    </group>
-</menu>
diff --git a/samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/MyGLSurfaceView.java b/samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/MyGLSurfaceView.java
index edaf34631ca2..baa63e6a583b 100644
--- a/samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/MyGLSurfaceView.java
+++ b/samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/MyGLSurfaceView.java
@@ -13,6 +13,7 @@
 import android.widget.TextView;
 import android.widget.Toast;
 
+//![minimal_surface_view]
 public class MyGLSurfaceView extends CameraGLSurfaceView implements CameraGLSurfaceView.CameraTextureListener {
 
     static final String LOGTAG = "MyGLSurfaceView";
@@ -65,7 +66,8 @@ public void run() {
                 Toast.makeText(getContext(), "onCameraViewStarted", Toast.LENGTH_SHORT).show();
             }
         });
-        NativePart.initCL();
+        if (NativePart.builtWithOpenCL())
+            NativePart.initCL();
         frameCounter = 0;
         lastNanoTime = System.nanoTime();
     }
@@ -110,3 +112,4 @@ public void run() {
         return true;
     }
 }
+//![minimal_surface_view]
diff --git a/samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/NativePart.java b/samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/NativePart.java
index e3d11709a9ed..bed42d2f14b9 100644
--- a/samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/NativePart.java
+++ b/samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/NativePart.java
@@ -1,5 +1,6 @@
 package org.opencv.samples.tutorial4;
 
+//![native_part]
 public class NativePart {
     static
     {
@@ -12,7 +13,9 @@ public class NativePart {
     public static final int PROCESSING_MODE_OCL_DIRECT = 2;
     public static final int PROCESSING_MODE_OCL_OCV = 3;
 
+    public static native boolean builtWithOpenCL();
     public static native int initCL();
     public static native void closeCL();
     public static native void processFrame(int tex1, int tex2, int w, int h, int mode);
 }
+//![native_part]
diff --git a/samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/Tutorial4Activity.java b/samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/Tutorial4Activity.java
index 0be55df65e32..8738501ede56 100644
--- a/samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/Tutorial4Activity.java
+++ b/samples/android/tutorial-4-opencl/src/org/opencv/samples/tutorial4/Tutorial4Activity.java
@@ -1,6 +1,5 @@
 package org.opencv.samples.tutorial4;
 
-import android.app.Activity;
 import android.content.pm.ActivityInfo;
 import android.os.Bundle;
 import android.view.Menu;
@@ -10,11 +9,20 @@
 import android.view.WindowManager;
 import android.widget.TextView;
 
-public class Tutorial4Activity extends Activity {
+import org.opencv.android.CameraActivity;
+
+public class Tutorial4Activity extends CameraActivity {
 
     private MyGLSurfaceView mView;
     private TextView mProcMode;
 
+    private boolean builtWithOpenCL = false;
+
+    private MenuItem mItemNoProc;
+    private MenuItem mItemCpu;
+    private MenuItem mItemOclDirect;
+    private MenuItem mItemOclOpenCV;
+
     @Override
     public void onCreate(Bundle savedInstanceState) {
         super.onCreate(savedInstanceState);
@@ -38,6 +46,7 @@ public void run() {
             }
         });
 
+        builtWithOpenCL = NativePart.builtWithOpenCL();
         mView.setProcessingMode(NativePart.PROCESSING_MODE_NO_PROCESSING);
     }
 
@@ -55,48 +64,37 @@ protected void onResume() {
 
     @Override
     public boolean onCreateOptionsMenu(Menu menu) {
-        MenuInflater inflater = getMenuInflater();
-        inflater.inflate(R.menu.menu, menu);
-        return super.onCreateOptionsMenu(menu);
+        mItemNoProc = menu.add("No processing");
+        mItemCpu = menu.add("Use CPU code");
+        if (builtWithOpenCL) {
+            mItemOclOpenCV = menu.add("Use OpenCL via OpenCV");
+            mItemOclDirect = menu.add("Use OpenCL direct");
+        }
+        return true;
     }
 
     @Override
     public boolean onOptionsItemSelected(MenuItem item) {
-        switch (item.getItemId()) {
-        case R.id.no_proc:
-            runOnUiThread(new Runnable() {
-                public void run() {
-                    mProcMode.setText("Processing mode: No Processing");
-                }
-            });
-            mView.setProcessingMode(NativePart.PROCESSING_MODE_NO_PROCESSING);
-            return true;
-        case R.id.cpu:
-            runOnUiThread(new Runnable() {
-                public void run() {
-                    mProcMode.setText("Processing mode: CPU");
-                }
-            });
-            mView.setProcessingMode(NativePart.PROCESSING_MODE_CPU);
-            return true;
-        case R.id.ocl_direct:
-            runOnUiThread(new Runnable() {
-                public void run() {
-                    mProcMode.setText("Processing mode: OpenCL direct");
-                }
-            });
-            mView.setProcessingMode(NativePart.PROCESSING_MODE_OCL_DIRECT);
-            return true;
-        case R.id.ocl_ocv:
-            runOnUiThread(new Runnable() {
-                public void run() {
-                    mProcMode.setText("Processing mode: OpenCL via OpenCV (TAPI)");
-                }
-            });
-            mView.setProcessingMode(NativePart.PROCESSING_MODE_OCL_OCV);
-            return true;
-        default:
-            return false;
+        String procName = "Not selected";
+        int procMode = NativePart.PROCESSING_MODE_NO_PROCESSING;
+
+        if (item == mItemNoProc) {
+            procMode = NativePart.PROCESSING_MODE_NO_PROCESSING;
+            procName = "Processing mode: No Processing";
+        } else if (item == mItemCpu) {
+            procMode = NativePart.PROCESSING_MODE_CPU;
+            procName = "Processing mode: CPU";
+        } else if (item == mItemOclOpenCV && builtWithOpenCL) {
+            procMode = NativePart.PROCESSING_MODE_OCL_OCV;
+            procName = "Processing mode: OpenCL via OpenCV (TAPI)";
+        } else if (item == mItemOclDirect && builtWithOpenCL) {
+            procMode = NativePart.PROCESSING_MODE_OCL_DIRECT;
+            procName = "Processing mode: OpenCL direct";
         }
+
+        mView.setProcessingMode(procMode);
+        mProcMode.setText(procName);
+
+        return true;
     }
-}
\ No newline at end of file
+}
diff --git a/samples/android/video-recorder/CMakeLists.txt b/samples/android/video-recorder/CMakeLists.txt
new file mode 100644
index 000000000000..e833d8fd6f5e
--- /dev/null
+++ b/samples/android/video-recorder/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(sample example-video-recorder)
+
+if(BUILD_FAT_JAVA_LIB)
+  set(native_deps opencv_java)
+else()
+  set(native_deps videoio)
+endif()
+
+add_android_project(${sample} "${CMAKE_CURRENT_SOURCE_DIR}" LIBRARY_DEPS "${OPENCV_ANDROID_LIB_DIR}" SDK_TARGET 11 "${ANDROID_SDK_TARGET}" NATIVE_DEPS ${native_deps})
+if(TARGET ${sample})
+  add_dependencies(opencv_android_examples ${sample})
+endif()
diff --git a/samples/android/video-recorder/build.gradle.in b/samples/android/video-recorder/build.gradle.in
new file mode 100644
index 000000000000..d096f3190a2c
--- /dev/null
+++ b/samples/android/video-recorder/build.gradle.in
@@ -0,0 +1,35 @@
+apply plugin: 'com.android.application'
+
+android {
+    namespace 'org.opencv.samples.recorder'
+    compileSdkVersion @ANDROID_COMPILE_SDK_VERSION@
+    defaultConfig {
+        applicationId "org.opencv.samples.recorder"
+        minSdkVersion @ANDROID_MIN_SDK_VERSION@
+        targetSdkVersion @ANDROID_TARGET_SDK_VERSION@
+        versionCode 301
+        versionName "3.01"
+    }
+    buildTypes {
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+        }
+    }
+    sourceSets {
+        main {
+            java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
+            res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
+            manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
+        }
+    }
+}
+
+dependencies {
+    //implementation fileTree(dir: 'libs', include: ['*.jar'])
+    if (gradle.opencv_source == "sdk_path") {
+        implementation project(':opencv')
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
+        implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
+    }
+}
diff --git a/samples/android/video-recorder/gradle/AndroidManifest.xml b/samples/android/video-recorder/gradle/AndroidManifest.xml
new file mode 100644
index 000000000000..fc4129c0f202
--- /dev/null
+++ b/samples/android/video-recorder/gradle/AndroidManifest.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+          package="org.opencv.samples.recorder"
+>
+
+    <application
+        android:label="@string/app_name"
+        android:icon="@drawable/icon"
+        android:theme="@android:style/Theme.NoTitleBar.Fullscreen" >
+
+        <activity
+                  android:exported="true"
+                  android:name="RecorderActivity"
+                  android:label="@string/app_name"
+                  android:screenOrientation="landscape"
+                  android:configChanges="keyboardHidden|orientation">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+
+    <supports-screens android:resizeable="true"
+                      android:smallScreens="true"
+                      android:normalScreens="true"
+                      android:largeScreens="true"
+                      android:anyDensity="true" />
+
+    <uses-permission android:name="android.permission.CAMERA"/>
+    <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE" />
+
+    <uses-feature android:name="android.hardware.camera" android:required="false"/>
+    <uses-feature android:name="android.hardware.camera.autofocus" android:required="false"/>
+    <uses-feature android:name="android.hardware.camera.front" android:required="false"/>
+    <uses-feature android:name="android.hardware.camera.front.autofocus" android:required="false"/>
+
+</manifest>
diff --git a/samples/android/video-recorder/res/drawable/icon.png b/samples/android/video-recorder/res/drawable/icon.png
new file mode 100644
index 000000000000..630454927b59
Binary files /dev/null and b/samples/android/video-recorder/res/drawable/icon.png differ
diff --git a/samples/android/video-recorder/res/layout/recorder_surface_view.xml b/samples/android/video-recorder/res/layout/recorder_surface_view.xml
new file mode 100644
index 000000000000..9df6114140ee
--- /dev/null
+++ b/samples/android/video-recorder/res/layout/recorder_surface_view.xml
@@ -0,0 +1,38 @@
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    xmlns:opencv="http://schemas.android.com/apk/res-auto"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent" >
+
+    <Button
+        android:id="@+id/btn1"
+        android:layout_width="wrap_content"
+        android:layout_height="50dp"
+        android:layout_margin="10dp"
+        android:text="Start Camera" />
+
+    <TextView
+        android:id="@+id/textview1"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:layout_gravity="right"
+        android:layout_margin="10dp"
+        android:text="Status: Initialized"
+        android:textColor="#FF0000" />
+
+    <org.opencv.android.JavaCameraView
+        android:layout_width="fill_parent"
+        android:layout_height="fill_parent"
+        android:visibility="gone"
+        android:id="@+id/recorder_activity_java_surface_view"
+        opencv:show_fps="true"
+        opencv:camera_id="any" />
+
+    <ImageView
+        android:id="@+id/image_view"
+        android:layout_width="fill_parent"
+        android:layout_height="fill_parent"
+        android:visibility="gone"
+        />
+
+</FrameLayout>
diff --git a/samples/android/video-recorder/res/values/strings.xml b/samples/android/video-recorder/res/values/strings.xml
new file mode 100644
index 000000000000..f53bbc5a4d8e
--- /dev/null
+++ b/samples/android/video-recorder/res/values/strings.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+    <string name="app_name">OpenCV Video Recorder</string>
+</resources>
diff --git a/samples/android/video-recorder/src/org/opencv/samples/recorder/RecorderActivity.java b/samples/android/video-recorder/src/org/opencv/samples/recorder/RecorderActivity.java
new file mode 100644
index 000000000000..d4faea51b24c
--- /dev/null
+++ b/samples/android/video-recorder/src/org/opencv/samples/recorder/RecorderActivity.java
@@ -0,0 +1,340 @@
+package org.opencv.samples.recorder;
+
+import org.opencv.android.CameraActivity;
+import org.opencv.android.CameraBridgeViewBase.CvCameraViewFrame;
+import org.opencv.android.OpenCVLoader;
+import org.opencv.android.Utils;
+import org.opencv.core.Mat;
+import org.opencv.core.Size;
+import org.opencv.imgproc.Imgproc;
+import org.opencv.videoio.VideoCapture;
+import org.opencv.videoio.VideoWriter;
+import org.opencv.android.CameraBridgeViewBase;
+import org.opencv.android.CameraBridgeViewBase.CvCameraViewListener2;
+import org.opencv.videoio.Videoio;
+
+import android.graphics.Bitmap;
+import android.os.Bundle;
+import android.os.Handler;
+import android.util.Log;
+import android.view.MenuItem;
+import android.view.SurfaceView;
+import android.view.WindowManager;
+import android.view.View;
+import android.widget.Button;
+import android.widget.ImageView;
+import android.widget.TextView;
+import android.widget.Toast;
+
+import java.io.File;
+import java.util.Collections;
+import java.util.List;
+
+public class RecorderActivity extends CameraActivity implements CvCameraViewListener2, View.OnClickListener {
+    private static final String TAG = "OCVSample::Activity";
+    private static final String FILENAME_MP4 = "sample_video1.mp4";
+    private static final String FILENAME_AVI = "sample_video1.avi";
+
+    private static final int STATUS_FINISHED_PLAYBACK = 0;
+    private static final int STATUS_PREVIEW = 1;
+    private static final int STATUS_RECORDING = 2;
+    private static final int STATUS_PLAYING = 3;
+    private static final int STATUS_ERROR = 4;
+
+    private String mVideoFilename;
+    private boolean mUseBuiltInMJPG = false;
+
+    private int mStatus = STATUS_FINISHED_PLAYBACK;
+    private int mFPS = 30;
+    private int mWidth = 0, mHeight = 0;
+
+    private CameraBridgeViewBase mOpenCvCameraView;
+    private ImageView mImageView;
+    private Button mTriggerButton;
+    private TextView mStatusTextView;
+    Runnable mPlayerThread;
+
+    private VideoWriter mVideoWriter = null;
+    private VideoCapture mVideoCapture = null;
+    private Mat mVideoFrame;
+    private Mat mRenderFrame;
+
+    public RecorderActivity() {
+        Log.i(TAG, "Instantiated new " + this.getClass());
+    }
+
+    /** Called when the activity is first created. */
+    @Override
+    public void onCreate(Bundle savedInstanceState) {
+        Log.i(TAG, "called onCreate");
+        super.onCreate(savedInstanceState);
+        getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
+
+        setContentView(R.layout.recorder_surface_view);
+
+        mStatusTextView = (TextView) findViewById(R.id.textview1);
+        mStatusTextView.bringToFront();
+
+        if (OpenCVLoader.initLocal()) {
+            Log.i(TAG, "OpenCV loaded successfully");
+        } else {
+            Log.e(TAG, "OpenCV initialization failed!");
+            mStatus = STATUS_ERROR;
+            mStatusTextView.setText("Error: Can't initialize OpenCV");
+            return;
+        }
+
+        mOpenCvCameraView = (CameraBridgeViewBase) findViewById(R.id.recorder_activity_java_surface_view);
+        mOpenCvCameraView.setVisibility(SurfaceView.GONE);
+        mOpenCvCameraView.setCvCameraViewListener(this);
+        mOpenCvCameraView.disableView();
+
+        mImageView = (ImageView) findViewById(R.id.image_view);
+
+        mTriggerButton = (Button) findViewById(R.id.btn1);
+        mTriggerButton.setOnClickListener(this);
+        mTriggerButton.bringToFront();
+
+        if (mUseBuiltInMJPG)
+            mVideoFilename = getFilesDir() + "/" + FILENAME_AVI;
+        else
+            mVideoFilename = getFilesDir() + "/" + FILENAME_MP4;
+    }
+
+    @Override
+    public void onPause()
+    {
+        Log.d(TAG, "Pause");
+        super.onPause();
+        if (mOpenCvCameraView != null)
+            mOpenCvCameraView.disableView();
+        mImageView.setVisibility(SurfaceView.GONE);
+        if (mVideoWriter != null) {
+            mVideoWriter.release();
+            mVideoWriter = null;
+        }
+        if (mVideoCapture != null) {
+            mVideoCapture.release();
+            mVideoCapture = null;
+        }
+        mStatus = STATUS_FINISHED_PLAYBACK;
+        mStatusTextView.setText("Status: Finished playback");
+        mTriggerButton.setText("Start Camera");
+
+        mVideoFrame.release();
+        mRenderFrame.release();
+    }
+
+    @Override
+    public void onResume()
+    {
+        Log.d(TAG, "onResume");
+        super.onResume();
+
+        mVideoFrame = new Mat();
+        mRenderFrame = new Mat();
+
+        changeStatus();
+    }
+
+    @Override
+    protected List<? extends CameraBridgeViewBase> getCameraViewList() {
+        return Collections.singletonList(mOpenCvCameraView);
+    }
+
+    public void onDestroy() {
+        Log.d(TAG, "called onDestroy");
+        super.onDestroy();
+        if (mOpenCvCameraView != null)
+            mOpenCvCameraView.disableView();
+        if (mVideoWriter != null)
+            mVideoWriter.release();
+        if (mVideoCapture != null)
+            mVideoCapture.release();
+    }
+
+    public void onCameraViewStarted(int width, int height) {
+        Log.d(TAG, "Camera view started " + String.valueOf(width) + "x" + String.valueOf(height));
+        mWidth = width;
+        mHeight = height;
+    }
+
+    public void onCameraViewStopped() {
+        Log.d(TAG, "Camera view stopped");
+    }
+
+    public Mat onCameraFrame(CvCameraViewFrame inputFrame)
+    {
+        Log.d(TAG, "Camera frame arrived");
+
+        Mat rgbMat = inputFrame.rgba();
+
+        Log.d(TAG, "Size: " + rgbMat.width() + "x" + rgbMat.height());
+
+        if (mVideoWriter != null && mVideoWriter.isOpened()) {
+            Imgproc.cvtColor(rgbMat, mVideoFrame, Imgproc.COLOR_RGBA2BGR);
+            mVideoWriter.write(mVideoFrame);
+        }
+
+        return rgbMat;
+    }
+
+    @Override
+    public void onClick(View view) {
+        Log.i(TAG,"onClick event");
+        changeStatus();
+    }
+
+    public void changeStatus() {
+        switch(mStatus) {
+            case STATUS_ERROR:
+                Toast.makeText(this, "Error", Toast.LENGTH_LONG).show();
+                break;
+            case STATUS_FINISHED_PLAYBACK:
+                if (!startPreview()) {
+                    setErrorStatus();
+                    break;
+                }
+                mStatus = STATUS_PREVIEW;
+                mStatusTextView.setText("Status: Camera preview");
+                mTriggerButton.setText("Start recording");
+                break;
+            case STATUS_PREVIEW:
+                if (!startRecording()) {
+                    setErrorStatus();
+                    break;
+                }
+                mStatus = STATUS_RECORDING;
+                mStatusTextView.setText("Status: recording video");
+                mTriggerButton.setText(" Stop and play video");
+                break;
+            case STATUS_RECORDING:
+                if (!stopRecording()) {
+                    setErrorStatus();
+                    break;
+                }
+                if (!startPlayback()) {
+                    setErrorStatus();
+                    break;
+                }
+                mStatus = STATUS_PLAYING;
+                mStatusTextView.setText("Status: Playing video");
+                mTriggerButton.setText("Stop playback");
+                break;
+            case STATUS_PLAYING:
+                if (!stopPlayback()) {
+                    setErrorStatus();
+                    break;
+                }
+                mStatus = STATUS_FINISHED_PLAYBACK;
+                mStatusTextView.setText("Status: Finished playback");
+                mTriggerButton.setText("Start Camera");
+                break;
+        }
+    }
+
+    public void setErrorStatus() {
+        mStatus = STATUS_ERROR;
+        mStatusTextView.setText("Status: Error");
+    }
+
+    public boolean startPreview() {
+        mOpenCvCameraView.enableView();
+        mOpenCvCameraView.setVisibility(View.VISIBLE);
+        return true;
+    }
+
+    public boolean startRecording() {
+        Log.i(TAG,"Starting recording");
+
+        File file = new File(mVideoFilename);
+        file.delete();
+
+        mVideoWriter = new VideoWriter();
+        if (!mUseBuiltInMJPG) {
+            mVideoWriter.open(mVideoFilename, Videoio.CAP_ANDROID, VideoWriter.fourcc('H', '2', '6', '4'), mFPS, new Size(mWidth, mHeight));
+            if (!mVideoWriter.isOpened()) {
+                Log.i(TAG,"Can't record H264. Switching to MJPG");
+                mUseBuiltInMJPG = true;
+                mVideoFilename = getFilesDir() + "/" + FILENAME_AVI;
+            }
+        }
+
+        if (mUseBuiltInMJPG) {
+            mVideoWriter.open(mVideoFilename, VideoWriter.fourcc('M', 'J', 'P', 'G'), mFPS, new Size(mWidth, mHeight));
+        }
+
+        Log.d(TAG, "Size: " + String.valueOf(mWidth) + "x" + String.valueOf(mHeight));
+        Log.d(TAG, "File: " + mVideoFilename);
+
+        if (mVideoWriter.isOpened()) {
+            Toast.makeText(this, "Record started to file " + mVideoFilename, Toast.LENGTH_LONG).show();
+            return true;
+        } else {
+            Toast.makeText(this, "Failed to start a record", Toast.LENGTH_LONG).show();
+            return false;
+        }
+    }
+
+    public boolean stopRecording() {
+        Log.i(TAG, "Finishing recording");
+        mOpenCvCameraView.disableView();
+        mOpenCvCameraView.setVisibility(SurfaceView.GONE);
+        mVideoWriter.release();
+        mVideoWriter = null;
+        return true;
+    }
+
+    public boolean startPlayback() {
+        mImageView.setVisibility(SurfaceView.VISIBLE);
+
+        if (!mUseBuiltInMJPG){
+            mVideoCapture = new VideoCapture(mVideoFilename, Videoio.CAP_ANDROID);
+        } else {
+            mVideoCapture = new VideoCapture(mVideoFilename, Videoio.CAP_OPENCV_MJPEG);
+        }
+
+        if (!mVideoCapture.isOpened()) {
+            Log.e(TAG, "Can't open video");
+            Toast.makeText(this, "Can't open file " + mVideoFilename, Toast.LENGTH_SHORT).show();
+            return false;
+        }
+
+        Toast.makeText(this, "Starting playback from file " + mVideoFilename, Toast.LENGTH_SHORT).show();
+
+        mPlayerThread = new Runnable() {
+            @Override
+            public void run() {
+                if (mVideoCapture == null || !mVideoCapture.isOpened()) {
+                    return;
+                }
+                mVideoCapture.read(mVideoFrame);
+                if (mVideoFrame.empty()) {
+                    if (mStatus == STATUS_PLAYING) {
+                        changeStatus();
+                    }
+                    return;
+                }
+                // VideoCapture with CAP_ANDROID generates RGB frames instead of BGR
+                // https://github.com/opencv/opencv/issues/24687
+                Imgproc.cvtColor(mVideoFrame, mRenderFrame, mUseBuiltInMJPG ? Imgproc.COLOR_BGR2RGBA: Imgproc.COLOR_RGB2RGBA);
+                Bitmap bmp = Bitmap.createBitmap(mRenderFrame.cols(), mRenderFrame.rows(), Bitmap.Config.ARGB_8888);
+                Utils.matToBitmap(mRenderFrame, bmp);
+                mImageView.setImageBitmap(bmp);
+                Handler h = new Handler();
+                h.postDelayed(this, 33);
+            }
+        };
+
+        mPlayerThread.run();
+        return true;
+    }
+
+    public boolean stopPlayback() {
+        mVideoCapture.release();
+        mVideoCapture = null;
+        mImageView.setVisibility(SurfaceView.GONE);
+        return true;
+    }
+
+}
diff --git a/samples/cpp/aruco_dict_utils.cpp b/samples/cpp/aruco_dict_utils.cpp
index 4a33f15bbfee..20fecd82e9b9 100644
--- a/samples/cpp/aruco_dict_utils.cpp
+++ b/samples/cpp/aruco_dict_utils.cpp
@@ -283,7 +283,7 @@ int main(int argc, char *argv[])
     int markerSize = parser.get<int>("markerSize");
     bool checkFlippedMarkers = parser.get<bool>("r");
 
-    aruco::Dictionary dictionary = aruco::getPredefinedDictionary(0);
+    aruco::Dictionary dictionary = aruco::getPredefinedDictionary(cv::aruco::DICT_4X4_50);
 
     if (parser.has("d")) {
         string arucoDictName = parser.get<string>("d");
diff --git a/samples/cpp/simd_basic.cpp b/samples/cpp/simd_basic.cpp
index 9af4d91cef03..ef78c39a4566 100644
--- a/samples/cpp/simd_basic.cpp
+++ b/samples/cpp/simd_basic.cpp
@@ -38,8 +38,8 @@ int main(int /*argc*/, char** /*argv*/)
 
     printf("==================  arithm check  =================\n");
     v_uint8 a = vx_setall_u8(10);
-    v_uint8 c = a + vx_setall_u8(45);
-    printf("(vx_setall_u8(10) + vx_setall_u8(45)).get0() => %d\n", (int)c.get0());
+    v_uint8 c = v_add(a, vx_setall_u8(45));
+    printf("v_get0(vx_setall_u8(10) + vx_setall_u8(45)) => %d\n", (int)v_get0(c));
 #else
     printf("\nSIMD intrinsics are not available. Check compilation target and passed build options.\n");
 #endif
diff --git a/samples/cpp/train_HOG.cpp b/samples/cpp/train_HOG.cpp
index 4a160fe4ebc6..c8355ee591b4 100644
--- a/samples/cpp/train_HOG.cpp
+++ b/samples/cpp/train_HOG.cpp
@@ -47,7 +47,7 @@ void convert_to_ml( const vector< Mat > & train_samples, Mat& trainData )
     //--Convert data
     const int rows = (int)train_samples.size();
     const int cols = (int)std::max( train_samples[0].cols, train_samples[0].rows );
-    Mat tmp( 1, cols, CV_32FC1 ); //< used for transposition if needed
+    Mat tmp( 1, cols, CV_32FC1 ); ///< used for transposition if needed
     trainData = Mat( rows, cols, CV_32FC1 );
 
     for( size_t i = 0 ; i < train_samples.size(); ++i )
diff --git a/samples/cpp/tutorial_code/ImgTrans/copyMakeBorder_demo.cpp b/samples/cpp/tutorial_code/ImgTrans/copyMakeBorder_demo.cpp
index 2eeff1cf0d17..b9736da065d9 100644
--- a/samples/cpp/tutorial_code/ImgTrans/copyMakeBorder_demo.cpp
+++ b/samples/cpp/tutorial_code/ImgTrans/copyMakeBorder_demo.cpp
@@ -45,9 +45,9 @@ int main( int argc, char** argv )
     printf( " ** Press 'r' to set the border to be replicated \n");
     printf( " ** Press 'ESC' to exit the program \n");
 
-    //![create_window]
+//![create_window]
     namedWindow( window_name, WINDOW_AUTOSIZE );
-    //![create_window]
+//![create_window]
 
     //![init_arguments]
     // Initialize arguments for the filter
diff --git a/samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp b/samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp
index e46af18dcd1e..a80962b4c354 100644
--- a/samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp
+++ b/samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp
@@ -354,7 +354,7 @@ int main(int argc, char* argv[])
     }
     else {
         // default dictionary
-        dictionary = cv::aruco::getPredefinedDictionary(0);
+        dictionary = cv::aruco::getPredefinedDictionary(cv::aruco::DICT_4X4_50);
     }
     cv::aruco::CharucoBoard ch_board({s.boardSize.width, s.boardSize.height}, s.squareSize, s.markerSize, dictionary);
     cv::aruco::CharucoDetector ch_detector(ch_board);
diff --git a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp
index 2dcc1ff107c6..5fbe81cd1dcb 100644
--- a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp
+++ b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp
@@ -2,6 +2,8 @@
 #include <opencv2/core.hpp>
 #include <opencv2/imgcodecs.hpp>
 
+#define PARALLEL_FOR_LAMBDA
+
 using namespace std;
 using namespace cv;
 
@@ -33,6 +35,8 @@ int mandelbrotFormula(const complex<float> &z0, const int maxIter=500) {
 }
 //! [mandelbrot-grayscale-value]
 
+#ifndef PARALLEL_FOR_LAMBDA
+
 //! [mandelbrot-parallel]
 class ParallelMandelbrot : public ParallelLoopBody
 {
@@ -71,6 +75,8 @@ class ParallelMandelbrot : public ParallelLoopBody
 };
 //! [mandelbrot-parallel]
 
+#endif // !PARALLEL_FOR_LAMBDA
+
 //! [mandelbrot-sequential]
 void sequentialMandelbrot(Mat &img, const float x1, const float y1, const float scaleX, const float scaleY)
 {
@@ -102,7 +108,7 @@ int main()
 
     double t1 = (double) getTickCount();
 
-    #ifdef CV_CXX11
+#ifdef PARALLEL_FOR_LAMBDA
 
     //! [mandelbrot-parallel-call-cxx11]
     parallel_for_(Range(0, mandelbrotImg.rows*mandelbrotImg.cols), [&](const Range& range){
@@ -121,14 +127,14 @@ int main()
     });
     //! [mandelbrot-parallel-call-cxx11]
 
-    #else
+#else // PARALLEL_FOR_LAMBDA
 
     //! [mandelbrot-parallel-call]
     ParallelMandelbrot parallelMandelbrot(mandelbrotImg, x1, y1, scaleX, scaleY);
     parallel_for_(Range(0, mandelbrotImg.rows*mandelbrotImg.cols), parallelMandelbrot);
     //! [mandelbrot-parallel-call]
 
-    #endif
+#endif // PARALLEL_FOR_LAMBDA
 
     t1 = ((double) getTickCount() - t1) / getTickFrequency();
     cout << "Parallel Mandelbrot: " << t1 << " s" << endl;
diff --git a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_new.cpp b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_new.cpp
index cfa9d22b0d05..cab73874a41d 100644
--- a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_new.cpp
+++ b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_new.cpp
@@ -4,6 +4,8 @@
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 
+#define PARALLEL_FOR_LAMBDA
+
 using namespace std;
 using namespace cv;
 
@@ -47,7 +49,8 @@ void conv_seq(Mat src, Mat &dst, Mat kernel)
 }
 //! [convolution-sequential]
 
-#ifdef CV_CXX11
+#ifdef PARALLEL_FOR_LAMBDA
+
 void conv_parallel(Mat src, Mat &dst, Mat kernel)
 {
     int rows = src.rows, cols = src.cols;
@@ -118,7 +121,8 @@ void conv_parallel_row_split(Mat src, Mat &dst, Mat kernel)
                     });
     //! [convolution-parallel-cxx11-row-split]
 }
-#else
+
+#else // PARALLEL_FOR_LAMBDA
 
 //! [convolution-parallel]
 class parallelConvolution : public ParallelLoopBody
@@ -235,7 +239,7 @@ void conv_parallel_row_split(Mat src, Mat &dst, Mat kernel)
     //! [convolution-parallel-function-row]
 }
 
-#endif
+#endif // PARALLEL_FOR_LAMBDA
 
 static void help(char *progName)
 {
@@ -329,4 +333,4 @@ int main(int argc, char *argv[])
     // imwrite("dst.png", dst);
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/samples/cpp/tutorial_code/core/mat_the_basic_image_container/mat_the_basic_image_container.cpp b/samples/cpp/tutorial_code/core/mat_the_basic_image_container/mat_the_basic_image_container.cpp
index ac1c205258e3..d9e0d1f94db2 100644
--- a/samples/cpp/tutorial_code/core/mat_the_basic_image_container/mat_the_basic_image_container.cpp
+++ b/samples/cpp/tutorial_code/core/mat_the_basic_image_container/mat_the_basic_image_container.cpp
@@ -59,12 +59,12 @@ int main(int,char**)
     cout << "C = " << endl << " " << C << endl << endl;
     //! [comma]
     // do the same with initializer_list
-#ifdef CV_CXX11
+
     //! [list]
     C = (Mat_<double>({0, -1, 0, -1, 5, -1, 0, -1, 0})).reshape(3);
     cout << "C = " << endl << " " << C << endl << endl;
     //! [list]
-#endif
+
     //! [clone]
     Mat RowClone = C.row(1).clone();
     cout << "RowClone = " << endl << " " << RowClone << endl << endl;
diff --git a/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp b/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp
index 9be4170d7b5f..52018461c300 100644
--- a/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp
+++ b/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp
@@ -85,7 +85,7 @@ void conv1dsimd(Mat src, Mat kernel, float *ans, int row = 0, int rowk = 0, int
 
     //! [convolution-1D-main]
     //! [convolution-1D-main-h1]
-    int step = v_float32().nlanes;
+    int step = VTraits<v_float32x4>::vlanes();
     float *sptr = src_32.ptr<float>(row), *kptr = kernel.ptr<float>(rowk);
     for (int k = 0; k < ksize; k++)
     {
@@ -96,7 +96,7 @@ void conv1dsimd(Mat src, Mat kernel, float *ans, int row = 0, int rowk = 0, int
         for (i = 0; i + step < len; i += step)
         {
             v_float32 window = vx_load(sptr + i + k);
-            v_float32 sum = vx_load(ans + i) + kernel_wide * window;
+            v_float32 sum = v_add(vx_load(ans + i), v_mul(kernel_wide, window));
             v_store(ans + i, sum);
         }
     //! [convolution-1D-main-h2]
@@ -122,7 +122,7 @@ void convolute_simd(Mat src, Mat &dst, Mat kernel)
 
     copyMakeBorder(src, src, sz, sz, 0, 0, BORDER_REPLICATE);
 
-    int step = v_float32().nlanes;
+    int step = VTraits<v_float32x4>::vlanes();
     //! [convolution-2D-init]
 
     //! [convolution-2D-main]
@@ -135,7 +135,7 @@ void convolute_simd(Mat src, Mat &dst, Mat kernel)
             int j;
             for (j = 0; j + step < cols; j += step)
             {
-                v_float32 sum = vx_load(&dst.ptr<float>(i)[j]) + vx_load(&ans[j]);
+                v_float32 sum = v_add(vx_load(&dst.ptr<float>(i)[j]), vx_load(&ans[j]));
                 v_store(&dst.ptr<float>(i)[j], sum);
             }
 
diff --git a/samples/cpp/tutorial_code/objectDetection/aruco_samples_utility.hpp b/samples/cpp/tutorial_code/objectDetection/aruco_samples_utility.hpp
new file mode 100644
index 000000000000..05c52e1133bb
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/aruco_samples_utility.hpp
@@ -0,0 +1,91 @@
+#include <opencv2/highgui.hpp>
+#include <opencv2/objdetect/aruco_detector.hpp>
+#include <opencv2/calib3d.hpp>
+#include <ctime>
+
+namespace {
+inline static bool readCameraParameters(const std::string& filename, cv::Mat &camMatrix, cv::Mat &distCoeffs) {
+    cv::FileStorage fs(filename, cv::FileStorage::READ);
+    if (!fs.isOpened())
+        return false;
+    fs["camera_matrix"] >> camMatrix;
+    fs["distortion_coefficients"] >> distCoeffs;
+    return true;
+}
+
+inline static bool saveCameraParams(const std::string &filename, cv::Size imageSize, float aspectRatio, int flags,
+                                    const cv::Mat &cameraMatrix, const cv::Mat &distCoeffs, double totalAvgErr) {
+    cv::FileStorage fs(filename, cv::FileStorage::WRITE);
+    if (!fs.isOpened())
+        return false;
+
+    time_t tt;
+    time(&tt);
+    struct tm *t2 = localtime(&tt);
+    char buf[1024];
+    strftime(buf, sizeof(buf) - 1, "%c", t2);
+
+    fs << "calibration_time" << buf;
+    fs << "image_width" << imageSize.width;
+    fs << "image_height" << imageSize.height;
+
+    if (flags & cv::CALIB_FIX_ASPECT_RATIO) fs << "aspectRatio" << aspectRatio;
+
+    if (flags != 0) {
+        sprintf(buf, "flags: %s%s%s%s",
+                flags & cv::CALIB_USE_INTRINSIC_GUESS ? "+use_intrinsic_guess" : "",
+                flags & cv::CALIB_FIX_ASPECT_RATIO ? "+fix_aspectRatio" : "",
+                flags & cv::CALIB_FIX_PRINCIPAL_POINT ? "+fix_principal_point" : "",
+                flags & cv::CALIB_ZERO_TANGENT_DIST ? "+zero_tangent_dist" : "");
+    }
+    fs << "flags" << flags;
+    fs << "camera_matrix" << cameraMatrix;
+    fs << "distortion_coefficients" << distCoeffs;
+    fs << "avg_reprojection_error" << totalAvgErr;
+    return true;
+}
+
+inline static cv::aruco::DetectorParameters readDetectorParamsFromCommandLine(cv::CommandLineParser &parser) {
+    cv::aruco::DetectorParameters detectorParams;
+    if (parser.has("dp")) {
+        cv::FileStorage fs(parser.get<std::string>("dp"), cv::FileStorage::READ);
+        bool readOk = detectorParams.readDetectorParameters(fs.root());
+        if(!readOk) {
+            throw std::runtime_error("Invalid detector parameters file\n");
+        }
+    }
+    return detectorParams;
+}
+
+inline static void readCameraParamsFromCommandLine(cv::CommandLineParser &parser, cv::Mat& camMatrix, cv::Mat& distCoeffs) {
+    //! [camDistCoeffs]
+    if(parser.has("c")) {
+        bool readOk = readCameraParameters(parser.get<std::string>("c"), camMatrix, distCoeffs);
+        if(!readOk) {
+            throw std::runtime_error("Invalid camera file\n");
+        }
+    }
+    //! [camDistCoeffs]
+}
+
+inline static cv::aruco::Dictionary readDictionatyFromCommandLine(cv::CommandLineParser &parser) {
+    cv::aruco::Dictionary dictionary;
+    if (parser.has("cd")) {
+        cv::FileStorage fs(parser.get<std::string>("cd"), cv::FileStorage::READ);
+        bool readOk = dictionary.readDictionary(fs.root());
+        if(!readOk) {
+            throw std::runtime_error("Invalid dictionary file\n");
+        }
+    }
+    else {
+        int dictionaryId = parser.has("d") ? parser.get<int>("d"): cv::aruco::DICT_4X4_50;
+        if (!parser.has("d")) {
+            std::cout << "The default DICT_4X4_50 dictionary has been selected, you could "
+                         "select the specific dictionary using flags -d or -cd." << std::endl;
+        }
+        dictionary = cv::aruco::getPredefinedDictionary(dictionaryId);
+    }
+    return dictionary;
+}
+
+}
diff --git a/samples/cpp/tutorial_code/objectDetection/calibrate_camera.cpp b/samples/cpp/tutorial_code/objectDetection/calibrate_camera.cpp
new file mode 100644
index 000000000000..b415477019c5
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/calibrate_camera.cpp
@@ -0,0 +1,188 @@
+#include <ctime>
+#include <iostream>
+#include <vector>
+#include <opencv2/calib3d.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/objdetect/aruco_detector.hpp>
+#include "aruco_samples_utility.hpp"
+
+using namespace std;
+using namespace cv;
+
+
+namespace {
+const char* about =
+        "Calibration using a ArUco Planar Grid board\n"
+        "  To capture a frame for calibration, press 'c',\n"
+        "  If input comes from video, press any key for next frame\n"
+        "  To finish capturing, press 'ESC' key and calibration starts.\n";
+const char* keys  =
+        "{w        |       | Number of squares in X direction }"
+        "{h        |       | Number of squares in Y direction }"
+        "{l        |       | Marker side length (in meters) }"
+        "{s        |       | Separation between two consecutive markers in the grid (in meters) }"
+        "{d        |       | dictionary: DICT_4X4_50=0, DICT_4X4_100=1, DICT_4X4_250=2,"
+        "DICT_4X4_1000=3, DICT_5X5_50=4, DICT_5X5_100=5, DICT_5X5_250=6, DICT_5X5_1000=7, "
+        "DICT_6X6_50=8, DICT_6X6_100=9, DICT_6X6_250=10, DICT_6X6_1000=11, DICT_7X7_50=12,"
+        "DICT_7X7_100=13, DICT_7X7_250=14, DICT_7X7_1000=15, DICT_ARUCO_ORIGINAL = 16}"
+        "{cd       |       | Input file with custom dictionary }"
+        "{@outfile |cam.yml| Output file with calibrated camera parameters }"
+        "{v        |       | Input from video file, if ommited, input comes from camera }"
+        "{ci       | 0     | Camera id if input doesnt come from video (-v) }"
+        "{dp       |       | File of marker detector parameters }"
+        "{rs       | false | Apply refind strategy }"
+        "{zt       | false | Assume zero tangential distortion }"
+        "{a        |       | Fix aspect ratio (fx/fy) to this value }"
+        "{pc       | false | Fix the principal point at the center }";
+}
+
+
+int main(int argc, char *argv[]) {
+    CommandLineParser parser(argc, argv, keys);
+    parser.about(about);
+
+    if(argc < 6) {
+        parser.printMessage();
+        return 0;
+    }
+
+    int markersX = parser.get<int>("w");
+    int markersY = parser.get<int>("h");
+    float markerLength = parser.get<float>("l");
+    float markerSeparation = parser.get<float>("s");
+    string outputFile = parser.get<string>(0);
+
+    int calibrationFlags = 0;
+    float aspectRatio = 1;
+    if(parser.has("a")) {
+        calibrationFlags |= CALIB_FIX_ASPECT_RATIO;
+        aspectRatio = parser.get<float>("a");
+    }
+    if(parser.get<bool>("zt")) calibrationFlags |= CALIB_ZERO_TANGENT_DIST;
+    if(parser.get<bool>("pc")) calibrationFlags |= CALIB_FIX_PRINCIPAL_POINT;
+
+    aruco::Dictionary dictionary = readDictionatyFromCommandLine(parser);
+    aruco::DetectorParameters detectorParams = readDetectorParamsFromCommandLine(parser);
+
+    bool refindStrategy = parser.get<bool>("rs");
+    int camId = parser.get<int>("ci");
+    String video;
+
+    if(parser.has("v")) {
+        video = parser.get<String>("v");
+    }
+
+    if(!parser.check()) {
+        parser.printErrors();
+        return 0;
+    }
+
+    VideoCapture inputVideo;
+    int waitTime;
+    if(!video.empty()) {
+        inputVideo.open(video);
+        waitTime = 0;
+    } else {
+        inputVideo.open(camId);
+        waitTime = 10;
+    }
+
+    //! [CalibrationWithArucoBoard1]
+    // Create board object and ArucoDetector
+    aruco::GridBoard gridboard(Size(markersX, markersY), markerLength, markerSeparation, dictionary);
+    aruco::ArucoDetector detector(dictionary, detectorParams);
+
+    // Collected frames for calibration
+    vector<vector<vector<Point2f>>> allMarkerCorners;
+    vector<vector<int>> allMarkerIds;
+    Size imageSize;
+
+    while(inputVideo.grab()) {
+        Mat image, imageCopy;
+        inputVideo.retrieve(image);
+
+        vector<int> markerIds;
+        vector<vector<Point2f>> markerCorners, rejectedMarkers;
+
+        // Detect markers
+        detector.detectMarkers(image, markerCorners, markerIds, rejectedMarkers);
+
+        // Refind strategy to detect more markers
+        if(refindStrategy) {
+            detector.refineDetectedMarkers(image, gridboard, markerCorners, markerIds, rejectedMarkers);
+        }
+        //! [CalibrationWithArucoBoard1]
+
+        // Draw results
+        image.copyTo(imageCopy);
+
+        if(!markerIds.empty()) {
+            aruco::drawDetectedMarkers(imageCopy, markerCorners, markerIds);
+        }
+
+        putText(imageCopy, "Press 'c' to add current frame. 'ESC' to finish and calibrate",
+                Point(10, 20), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255, 0, 0), 2);
+        imshow("out", imageCopy);
+
+        // Wait for key pressed
+        char key = (char)waitKey(waitTime);
+
+        if(key == 27) {
+             break;
+        }
+
+        //! [CalibrationWithArucoBoard2]
+        if(key == 'c' && !markerIds.empty()) {
+            cout << "Frame captured" << endl;
+            allMarkerCorners.push_back(markerCorners);
+            allMarkerIds.push_back(markerIds);
+            imageSize = image.size();
+        }
+    }
+    //! [CalibrationWithArucoBoard2]
+
+    if(allMarkerIds.empty()) {
+        throw std::runtime_error("Not enough captures for calibration\n");
+    }
+
+    //! [CalibrationWithArucoBoard3]
+    Mat cameraMatrix, distCoeffs;
+
+    if(calibrationFlags & CALIB_FIX_ASPECT_RATIO) {
+        cameraMatrix = Mat::eye(3, 3, CV_64F);
+        cameraMatrix.at<double>(0, 0) = aspectRatio;
+    }
+
+    // Prepare data for calibration
+    vector<Point3f> objectPoints;
+    vector<Point2f> imagePoints;
+    vector<Mat> processedObjectPoints, processedImagePoints;
+    size_t nFrames = allMarkerCorners.size();
+
+    for(size_t frame = 0; frame < nFrames; frame++) {
+        Mat currentImgPoints, currentObjPoints;
+
+        gridboard.matchImagePoints(allMarkerCorners[frame], allMarkerIds[frame], currentObjPoints, currentImgPoints);
+
+        if(currentImgPoints.total() > 0 && currentObjPoints.total() > 0) {
+            processedImagePoints.push_back(currentImgPoints);
+            processedObjectPoints.push_back(currentObjPoints);
+        }
+    }
+
+    // Calibrate camera
+    double repError = calibrateCamera(processedObjectPoints, processedImagePoints, imageSize, cameraMatrix, distCoeffs,
+                                      noArray(), noArray(), noArray(), noArray(), noArray(), calibrationFlags);
+    //! [CalibrationWithArucoBoard3]
+    bool saveOk = saveCameraParams(outputFile, imageSize, aspectRatio, calibrationFlags,
+                                   cameraMatrix, distCoeffs, repError);
+
+    if(!saveOk) {
+        throw std::runtime_error("Cannot save output file\n");
+    }
+
+    cout << "Rep Error: " << repError << endl;
+    cout << "Calibration saved to " << outputFile << endl;
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/objectDetection/calibrate_camera_charuco.cpp b/samples/cpp/tutorial_code/objectDetection/calibrate_camera_charuco.cpp
new file mode 100644
index 000000000000..5bea807db9df
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/calibrate_camera_charuco.cpp
@@ -0,0 +1,216 @@
+#include <iostream>
+#include <vector>
+#include <opencv2/calib3d.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/objdetect/charuco_detector.hpp>
+#include "aruco_samples_utility.hpp"
+
+using namespace std;
+using namespace cv;
+
+namespace {
+const char* about =
+        "Calibration using a ChArUco board\n"
+        "  To capture a frame for calibration, press 'c',\n"
+        "  If input comes from video, press any key for next frame\n"
+        "  To finish capturing, press 'ESC' key and calibration starts.\n";
+const char* keys  =
+        "{w        |       | Number of squares in X direction }"
+        "{h        |       | Number of squares in Y direction }"
+        "{sl       |       | Square side length (in meters) }"
+        "{ml       |       | Marker side length (in meters) }"
+        "{d        |       | dictionary: DICT_4X4_50=0, DICT_4X4_100=1, DICT_4X4_250=2,"
+        "DICT_4X4_1000=3, DICT_5X5_50=4, DICT_5X5_100=5, DICT_5X5_250=6, DICT_5X5_1000=7, "
+        "DICT_6X6_50=8, DICT_6X6_100=9, DICT_6X6_250=10, DICT_6X6_1000=11, DICT_7X7_50=12,"
+        "DICT_7X7_100=13, DICT_7X7_250=14, DICT_7X7_1000=15, DICT_ARUCO_ORIGINAL = 16}"
+        "{cd       |       | Input file with custom dictionary }"
+        "{@outfile |cam.yml| Output file with calibrated camera parameters }"
+        "{v        |       | Input from video file, if ommited, input comes from camera }"
+        "{ci       | 0     | Camera id if input doesnt come from video (-v) }"
+        "{dp       |       | File of marker detector parameters }"
+        "{rs       | false | Apply refind strategy }"
+        "{zt       | false | Assume zero tangential distortion }"
+        "{a        |       | Fix aspect ratio (fx/fy) to this value }"
+        "{pc       | false | Fix the principal point at the center }"
+        "{sc       | false | Show detected chessboard corners after calibration }";
+}
+
+
+int main(int argc, char *argv[]) {
+    CommandLineParser parser(argc, argv, keys);
+    parser.about(about);
+
+    if(argc < 7) {
+        parser.printMessage();
+        return 0;
+    }
+
+    int squaresX = parser.get<int>("w");
+    int squaresY = parser.get<int>("h");
+    float squareLength = parser.get<float>("sl");
+    float markerLength = parser.get<float>("ml");
+    string outputFile = parser.get<string>(0);
+
+    bool showChessboardCorners = parser.get<bool>("sc");
+
+    int calibrationFlags = 0;
+    float aspectRatio = 1;
+    if(parser.has("a")) {
+        calibrationFlags |= CALIB_FIX_ASPECT_RATIO;
+        aspectRatio = parser.get<float>("a");
+    }
+    if(parser.get<bool>("zt")) calibrationFlags |= CALIB_ZERO_TANGENT_DIST;
+    if(parser.get<bool>("pc")) calibrationFlags |= CALIB_FIX_PRINCIPAL_POINT;
+
+    aruco::DetectorParameters detectorParams = readDetectorParamsFromCommandLine(parser);
+    aruco::Dictionary dictionary = readDictionatyFromCommandLine(parser);
+
+    bool refindStrategy = parser.get<bool>("rs");
+    int camId = parser.get<int>("ci");
+    String video;
+
+    if(parser.has("v")) {
+        video = parser.get<String>("v");
+    }
+
+    if(!parser.check()) {
+        parser.printErrors();
+        return 0;
+    }
+
+    VideoCapture inputVideo;
+    int waitTime;
+    if(!video.empty()) {
+        inputVideo.open(video);
+        waitTime = 0;
+    } else {
+        inputVideo.open(camId);
+        waitTime = 10;
+    }
+
+    aruco::CharucoParameters charucoParams;
+    if(refindStrategy) {
+        charucoParams.tryRefineMarkers = true;
+    }
+
+    //! [CalibrationWithCharucoBoard1]
+    // Create charuco board object and CharucoDetector
+    aruco::CharucoBoard board(Size(squaresX, squaresY), squareLength, markerLength, dictionary);
+    aruco::CharucoDetector detector(board, charucoParams, detectorParams);
+
+    // Collect data from each frame
+    vector<Mat> allCharucoCorners, allCharucoIds;
+
+    vector<vector<Point2f>> allImagePoints;
+    vector<vector<Point3f>> allObjectPoints;
+
+    vector<Mat> allImages;
+    Size imageSize;
+
+    while(inputVideo.grab()) {
+        Mat image, imageCopy;
+        inputVideo.retrieve(image);
+
+        vector<int> markerIds;
+        vector<vector<Point2f>> markerCorners;
+        Mat currentCharucoCorners, currentCharucoIds;
+        vector<Point3f> currentObjectPoints;
+        vector<Point2f> currentImagePoints;
+
+        // Detect ChArUco board
+        detector.detectBoard(image, currentCharucoCorners, currentCharucoIds);
+        //! [CalibrationWithCharucoBoard1]
+
+        // Draw results
+        image.copyTo(imageCopy);
+        if(!markerIds.empty()) {
+            aruco::drawDetectedMarkers(imageCopy, markerCorners);
+        }
+
+        if(currentCharucoCorners.total() > 3) {
+            aruco::drawDetectedCornersCharuco(imageCopy, currentCharucoCorners, currentCharucoIds);
+        }
+
+        putText(imageCopy, "Press 'c' to add current frame. 'ESC' to finish and calibrate",
+                Point(10, 20), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255, 0, 0), 2);
+
+        imshow("out", imageCopy);
+
+        // Wait for key pressed
+        char key = (char)waitKey(waitTime);
+
+        if(key == 27) {
+            break;
+        }
+
+        //! [CalibrationWithCharucoBoard2]
+        if(key == 'c' && currentCharucoCorners.total() > 3) {
+            // Match image points
+            board.matchImagePoints(currentCharucoCorners, currentCharucoIds, currentObjectPoints, currentImagePoints);
+
+            if(currentImagePoints.empty() || currentObjectPoints.empty()) {
+                cout << "Point matching failed, try again." << endl;
+                continue;
+            }
+
+            cout << "Frame captured" << endl;
+
+            allCharucoCorners.push_back(currentCharucoCorners);
+            allCharucoIds.push_back(currentCharucoIds);
+            allImagePoints.push_back(currentImagePoints);
+            allObjectPoints.push_back(currentObjectPoints);
+            allImages.push_back(image);
+
+            imageSize = image.size();
+        }
+    }
+    //! [CalibrationWithCharucoBoard2]
+
+    if(allCharucoCorners.size() < 4) {
+        cerr << "Not enough corners for calibration" << endl;
+        return 0;
+    }
+
+    //! [CalibrationWithCharucoBoard3]
+    Mat cameraMatrix, distCoeffs;
+
+    if(calibrationFlags & CALIB_FIX_ASPECT_RATIO) {
+        cameraMatrix = Mat::eye(3, 3, CV_64F);
+        cameraMatrix.at<double>(0, 0) = aspectRatio;
+    }
+
+    // Calibrate camera using ChArUco
+    double repError = calibrateCamera(allObjectPoints, allImagePoints, imageSize, cameraMatrix, distCoeffs,
+                                      noArray(), noArray(), noArray(), noArray(), noArray(), calibrationFlags);
+    //! [CalibrationWithCharucoBoard3]
+
+    bool saveOk =  saveCameraParams(outputFile, imageSize, aspectRatio, calibrationFlags,
+                                    cameraMatrix, distCoeffs, repError);
+
+    if(!saveOk) {
+        cerr << "Cannot save output file" << endl;
+        return 0;
+    }
+
+    cout << "Rep Error: " << repError << endl;
+    cout << "Calibration saved to " << outputFile << endl;
+
+    // Show interpolated charuco corners for debugging
+    if(showChessboardCorners) {
+        for(size_t frame = 0; frame < allImages.size(); frame++) {
+            Mat imageCopy = allImages[frame].clone();
+
+            if(allCharucoCorners[frame].total() > 0) {
+                aruco::drawDetectedCornersCharuco(imageCopy, allCharucoCorners[frame], allCharucoIds[frame]);
+            }
+
+            imshow("out", imageCopy);
+            char key = (char)waitKey(0);
+            if(key == 27) {
+                break;
+            }
+        }
+    }
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/objectDetection/create_board.cpp b/samples/cpp/tutorial_code/objectDetection/create_board.cpp
new file mode 100644
index 000000000000..b1864ffc1919
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/create_board.cpp
@@ -0,0 +1,75 @@
+#include <opencv2/highgui.hpp>
+#include <opencv2/objdetect/aruco_detector.hpp>
+#include <iostream>
+#include "aruco_samples_utility.hpp"
+
+using namespace cv;
+
+namespace {
+const char* about = "Create an ArUco grid board image";
+const char* keys  =
+        "{@outfile |<none> | Output image }"
+        "{w        |       | Number of markers in X direction }"
+        "{h        |       | Number of markers in Y direction }"
+        "{l        |       | Marker side length (in pixels) }"
+        "{s        |       | Separation between two consecutive markers in the grid (in pixels)}"
+        "{d        |       | dictionary: DICT_4X4_50=0, DICT_4X4_100=1, DICT_4X4_250=2,"
+        "DICT_4X4_1000=3, DICT_5X5_50=4, DICT_5X5_100=5, DICT_5X5_250=6, DICT_5X5_1000=7, "
+        "DICT_6X6_50=8, DICT_6X6_100=9, DICT_6X6_250=10, DICT_6X6_1000=11, DICT_7X7_50=12,"
+        "DICT_7X7_100=13, DICT_7X7_250=14, DICT_7X7_1000=15, DICT_ARUCO_ORIGINAL = 16}"
+        "{cd       |       | Input file with custom dictionary }"
+        "{m        |       | Margins size (in pixels). Default is marker separation (-s) }"
+        "{bb       | 1     | Number of bits in marker borders }"
+        "{si       | false | show generated image }";
+}
+
+int main(int argc, char *argv[]) {
+    CommandLineParser parser(argc, argv, keys);
+    parser.about(about);
+
+    if(argc < 7) {
+        parser.printMessage();
+        return 0;
+    }
+
+    int markersX = parser.get<int>("w");
+    int markersY = parser.get<int>("h");
+    int markerLength = parser.get<int>("l");
+    int markerSeparation = parser.get<int>("s");
+    int margins = markerSeparation;
+    if(parser.has("m")) {
+        margins = parser.get<int>("m");
+    }
+
+    int borderBits = parser.get<int>("bb");
+    bool showImage = parser.get<bool>("si");
+
+    String out = parser.get<String>(0);
+
+    if(!parser.check()) {
+        parser.printErrors();
+        return 0;
+    }
+
+    Size imageSize;
+    imageSize.width = markersX * (markerLength + markerSeparation) - markerSeparation + 2 * margins;
+    imageSize.height =
+        markersY * (markerLength + markerSeparation) - markerSeparation + 2 * margins;
+
+    aruco::Dictionary dictionary = readDictionatyFromCommandLine(parser);
+    aruco::GridBoard board(Size(markersX, markersY), float(markerLength), float(markerSeparation), dictionary);
+
+    // show created board
+    //! [aruco_generate_board_image]
+    Mat boardImage;
+    board.generateImage(imageSize, boardImage, margins, borderBits);
+    //! [aruco_generate_board_image]
+
+    if(showImage) {
+        imshow("board", boardImage);
+        waitKey(0);
+    }
+
+    imwrite(out, boardImage);
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/objectDetection/create_board_charuco.cpp b/samples/cpp/tutorial_code/objectDetection/create_board_charuco.cpp
new file mode 100644
index 000000000000..b76708817aee
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/create_board_charuco.cpp
@@ -0,0 +1,77 @@
+#include <opencv2/highgui.hpp>
+#include <opencv2/objdetect/charuco_detector.hpp>
+#include <iostream>
+#include "aruco_samples_utility.hpp"
+
+using namespace cv;
+
+namespace {
+const char* about = "Create a ChArUco board image";
+//! [charuco_detect_board_keys]
+const char* keys  =
+        "{@outfile |res.png| Output image }"
+        "{w        |  5    | Number of squares in X direction }"
+        "{h        |  7    | Number of squares in Y direction }"
+        "{sl       |  100  | Square side length (in pixels) }"
+        "{ml       |  60   | Marker side length (in pixels) }"
+        "{d        |       | dictionary: DICT_4X4_50=0, DICT_4X4_100=1, DICT_4X4_250=2,"
+        "DICT_4X4_1000=3, DICT_5X5_50=4, DICT_5X5_100=5, DICT_5X5_250=6, DICT_5X5_1000=7, "
+        "DICT_6X6_50=8, DICT_6X6_100=9, DICT_6X6_250=10, DICT_6X6_1000=11, DICT_7X7_50=12,"
+        "DICT_7X7_100=13, DICT_7X7_250=14, DICT_7X7_1000=15, DICT_ARUCO_ORIGINAL = 16}"
+        "{cd       |       | Input file with custom dictionary }"
+        "{m        |       | Margins size (in pixels). Default is (squareLength-markerLength) }"
+        "{bb       | 1     | Number of bits in marker borders }"
+        "{si       | false | show generated image }";
+}
+//! [charuco_detect_board_keys]
+
+
+int main(int argc, char *argv[]) {
+    CommandLineParser parser(argc, argv, keys);
+    parser.about(about);
+    if (argc == 1) {
+        parser.printMessage();
+    }
+
+    int squaresX = parser.get<int>("w");
+    int squaresY = parser.get<int>("h");
+    int squareLength = parser.get<int>("sl");
+    int markerLength = parser.get<int>("ml");
+    int margins = squareLength - markerLength;
+    if(parser.has("m")) {
+        margins = parser.get<int>("m");
+    }
+
+    int borderBits = parser.get<int>("bb");
+    bool showImage = parser.get<bool>("si");
+
+    std::string pathOutImg = parser.get<std::string>(0);
+
+    if(!parser.check()) {
+        parser.printErrors();
+        return 0;
+    }
+
+    //! [create_charucoBoard]
+    aruco::Dictionary dictionary = readDictionatyFromCommandLine(parser);
+    cv::aruco::CharucoBoard board(Size(squaresX, squaresY), (float)squareLength, (float)markerLength, dictionary);
+    //! [create_charucoBoard]
+
+    // show created board
+    //! [generate_charucoBoard]
+    Mat boardImage;
+    Size imageSize;
+    imageSize.width = squaresX * squareLength + 2 * margins;
+    imageSize.height = squaresY * squareLength + 2 * margins;
+    board.generateImage(imageSize, boardImage, margins, borderBits);
+    //! [generate_charucoBoard]
+
+    if(showImage) {
+        imshow("board", boardImage);
+        waitKey(0);
+    }
+
+    if (pathOutImg != "")
+        imwrite(pathOutImg, boardImage);
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/objectDetection/create_diamond.cpp b/samples/cpp/tutorial_code/objectDetection/create_diamond.cpp
new file mode 100644
index 000000000000..0db00e7e02fb
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/create_diamond.cpp
@@ -0,0 +1,72 @@
+#include <opencv2/highgui.hpp>
+#include <opencv2/objdetect/charuco_detector.hpp>
+#include <vector>
+#include <iostream>
+#include "aruco_samples_utility.hpp"
+
+using namespace std;
+using namespace cv;
+
+namespace {
+const char* about = "Create a ChArUco marker image";
+const char* keys  =
+        "{@outfile |   res.png | Output image }"
+        "{sl       |   100     | Square side length (in pixels) }"
+        "{ml       |   60      | Marker side length (in pixels) }"
+        "{cd       |           | Input file with custom dictionary }"
+        "{d        |   10      | dictionary: DICT_4X4_50=0, DICT_4X4_100=1, DICT_4X4_250=2,"
+        "DICT_4X4_1000=3, DICT_5X5_50=4, DICT_5X5_100=5, DICT_5X5_250=6, DICT_5X5_1000=7, "
+        "DICT_6X6_50=8, DICT_6X6_100=9, DICT_6X6_250=10, DICT_6X6_1000=11, DICT_7X7_50=12,"
+        "DICT_7X7_100=13, DICT_7X7_250=14, DICT_7X7_1000=15, DICT_ARUCO_ORIGINAL = 16}"
+        "{ids      |0, 1, 2, 3 | Four ids for the ChArUco marker: id1,id2,id3,id4 }"
+        "{m        |   0       | Margins size (in pixels) }"
+        "{bb       |   1       | Number of bits in marker borders }"
+        "{si       |   false   | show generated image }";
+}
+
+int main(int argc, char *argv[]) {
+    CommandLineParser parser(argc, argv, keys);
+    parser.about(about);
+
+    int squareLength = parser.get<int>("sl");
+    int markerLength = parser.get<int>("ml");
+    string idsString = parser.get<string>("ids");
+    int margins = parser.get<int>("m");
+    int borderBits = parser.get<int>("bb");
+    bool showImage = parser.get<bool>("si");
+    string out = parser.get<string>(0);
+    aruco::Dictionary dictionary = readDictionatyFromCommandLine(parser);
+
+    if(!parser.check()) {
+        parser.printErrors();
+        return 0;
+    }
+
+    istringstream ss(idsString);
+    vector<string> splittedIds;
+    string token;
+    while(getline(ss, token, ','))
+        splittedIds.push_back(token);
+    if(splittedIds.size() < 4) {
+        throw std::runtime_error("Incorrect ids format\n");
+    }
+    Vec4i ids;
+    for(int i = 0; i < 4; i++)
+        ids[i] = atoi(splittedIds[i].c_str());
+
+    //! [generate_diamond]
+    vector<int> diamondIds = {ids[0], ids[1], ids[2], ids[3]};
+    aruco::CharucoBoard charucoBoard(Size(3, 3), (float)squareLength, (float)markerLength, dictionary, diamondIds);
+    Mat markerImg;
+    charucoBoard.generateImage(Size(3*squareLength + 2*margins, 3*squareLength + 2*margins), markerImg, margins, borderBits);
+    //! [generate_diamond]
+
+    if(showImage) {
+        imshow("board", markerImg);
+        waitKey(0);
+    }
+
+    if (out != "")
+        imwrite(out, markerImg);
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/objectDetection/create_marker.cpp b/samples/cpp/tutorial_code/objectDetection/create_marker.cpp
new file mode 100644
index 000000000000..560e51c9745c
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/create_marker.cpp
@@ -0,0 +1,56 @@
+#include <opencv2/highgui.hpp>
+#include <opencv2/objdetect/aruco_detector.hpp>
+#include <iostream>
+#include "aruco_samples_utility.hpp"
+
+using namespace cv;
+
+namespace {
+const char* about = "Create an ArUco marker image";
+
+//! [aruco_create_markers_keys]
+const char* keys  =
+        "{@outfile |res.png| Output image }"
+        "{d        | 0     | dictionary: DICT_4X4_50=0, DICT_4X4_100=1, DICT_4X4_250=2,"
+        "DICT_4X4_1000=3, DICT_5X5_50=4, DICT_5X5_100=5, DICT_5X5_250=6, DICT_5X5_1000=7, "
+        "DICT_6X6_50=8, DICT_6X6_100=9, DICT_6X6_250=10, DICT_6X6_1000=11, DICT_7X7_50=12,"
+        "DICT_7X7_100=13, DICT_7X7_250=14, DICT_7X7_1000=15, DICT_ARUCO_ORIGINAL = 16}"
+        "{cd       |       | Input file with custom dictionary }"
+        "{id       | 0     | Marker id in the dictionary }"
+        "{ms       | 200   | Marker size in pixels }"
+        "{bb       | 1     | Number of bits in marker borders }"
+        "{si       | false | show generated image }";
+}
+//! [aruco_create_markers_keys]
+
+
+int main(int argc, char *argv[]) {
+    CommandLineParser parser(argc, argv, keys);
+    parser.about(about);
+
+    int markerId = parser.get<int>("id");
+    int borderBits = parser.get<int>("bb");
+    int markerSize = parser.get<int>("ms");
+    bool showImage = parser.get<bool>("si");
+
+    String out = parser.get<String>(0);
+
+    if(!parser.check()) {
+        parser.printErrors();
+        return 0;
+    }
+
+    aruco::Dictionary dictionary = readDictionatyFromCommandLine(parser);
+
+    Mat markerImg;
+    aruco::generateImageMarker(dictionary, markerId, markerSize, markerImg, borderBits);
+
+    if(showImage) {
+        imshow("marker", markerImg);
+        waitKey(0);
+    }
+
+    imwrite(out, markerImg);
+
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/objectDetection/detect_board.cpp b/samples/cpp/tutorial_code/objectDetection/detect_board.cpp
new file mode 100644
index 000000000000..a9c74f2fdc84
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/detect_board.cpp
@@ -0,0 +1,155 @@
+#include <iostream>
+#include <vector>
+#include <opencv2/highgui.hpp>
+#include <opencv2/objdetect/aruco_detector.hpp>
+#include "aruco_samples_utility.hpp"
+
+using namespace std;
+using namespace cv;
+
+namespace {
+const char* about = "Pose estimation using a ArUco Planar Grid board";
+
+//! [aruco_detect_board_keys]
+const char* keys  =
+        "{w        |       | Number of squares in X direction }"
+        "{h        |       | Number of squares in Y direction }"
+        "{l        |       | Marker side length (in pixels) }"
+        "{s        |       | Separation between two consecutive markers in the grid (in pixels)}"
+        "{d        |       | dictionary: DICT_4X4_50=0, DICT_4X4_100=1, DICT_4X4_250=2,"
+        "DICT_4X4_1000=3, DICT_5X5_50=4, DICT_5X5_100=5, DICT_5X5_250=6, DICT_5X5_1000=7, "
+        "DICT_6X6_50=8, DICT_6X6_100=9, DICT_6X6_250=10, DICT_6X6_1000=11, DICT_7X7_50=12,"
+        "DICT_7X7_100=13, DICT_7X7_250=14, DICT_7X7_1000=15, DICT_ARUCO_ORIGINAL = 16}"
+        "{cd       |       | Input file with custom dictionary }"
+        "{c        |       | Output file with calibrated camera parameters }"
+        "{v        |       | Input from video or image file, if omitted, input comes from camera }"
+        "{ci       | 0     | Camera id if input doesnt come from video (-v) }"
+        "{dp       |       | File of marker detector parameters }"
+        "{rs       |       | Apply refind strategy }"
+        "{r        |       | show rejected candidates too }";
+}
+//! [aruco_detect_board_keys]
+
+int main(int argc, char *argv[]) {
+    CommandLineParser parser(argc, argv, keys);
+    parser.about(about);
+
+    if(argc < 7) {
+        parser.printMessage();
+        return 0;
+    }
+
+    //! [aruco_detect_board_full_sample]
+    int markersX = parser.get<int>("w");
+    int markersY = parser.get<int>("h");
+    float markerLength = parser.get<float>("l");
+    float markerSeparation = parser.get<float>("s");
+    bool showRejected = parser.has("r");
+    bool refindStrategy = parser.has("rs");
+    int camId = parser.get<int>("ci");
+
+
+    Mat camMatrix, distCoeffs;
+    readCameraParamsFromCommandLine(parser, camMatrix, distCoeffs);
+    aruco::Dictionary dictionary = readDictionatyFromCommandLine(parser);
+    aruco::DetectorParameters detectorParams = readDetectorParamsFromCommandLine(parser);
+
+    String video;
+    if(parser.has("v")) {
+        video = parser.get<String>("v");
+    }
+
+    if(!parser.check()) {
+        parser.printErrors();
+        return 0;
+    }
+
+    aruco::ArucoDetector detector(dictionary, detectorParams);
+    VideoCapture inputVideo;
+    int waitTime;
+    if(!video.empty()) {
+        inputVideo.open(video);
+        waitTime = 0;
+    } else {
+        inputVideo.open(camId);
+        waitTime = 10;
+    }
+
+    float axisLength = 0.5f * ((float)min(markersX, markersY) * (markerLength + markerSeparation) +
+                               markerSeparation);
+
+    // Create GridBoard object
+    //! [aruco_create_board]
+    aruco::GridBoard board(Size(markersX, markersY), markerLength, markerSeparation, dictionary);
+    //! [aruco_create_board]
+
+    // Also you could create Board object
+    //vector<vector<Point3f> > objPoints; // array of object points of all the marker corners in the board
+    //vector<int> ids; // vector of the identifiers of the markers in the board
+    //aruco::Board board(objPoints, dictionary, ids);
+
+    double totalTime = 0;
+    int totalIterations = 0;
+
+    while(inputVideo.grab()) {
+        Mat image, imageCopy;
+        inputVideo.retrieve(image);
+
+        double tick = (double)getTickCount();
+
+        vector<int> ids;
+        vector<vector<Point2f>> corners, rejected;
+        Vec3d rvec, tvec;
+
+        //! [aruco_detect_and_refine]
+
+        // Detect markers
+        detector.detectMarkers(image, corners, ids, rejected);
+
+        // Refind strategy to detect more markers
+        if(refindStrategy)
+            detector.refineDetectedMarkers(image, board, corners, ids, rejected, camMatrix,
+                                           distCoeffs);
+
+        //! [aruco_detect_and_refine]
+
+        // Estimate board pose
+        int markersOfBoardDetected = 0;
+        if(!ids.empty()) {
+            // Get object and image points for the solvePnP function
+            cv::Mat objPoints, imgPoints;
+            board.matchImagePoints(corners, ids, objPoints, imgPoints);
+
+            // Find pose
+            cv::solvePnP(objPoints, imgPoints, camMatrix, distCoeffs, rvec, tvec);
+
+            markersOfBoardDetected = (int)objPoints.total() / 4;
+        }
+
+        double currentTime = ((double)getTickCount() - tick) / getTickFrequency();
+        totalTime += currentTime;
+        totalIterations++;
+        if(totalIterations % 30 == 0) {
+            cout << "Detection Time = " << currentTime * 1000 << " ms "
+                 << "(Mean = " << 1000 * totalTime / double(totalIterations) << " ms)" << endl;
+        }
+
+        // Draw results
+        image.copyTo(imageCopy);
+        if(!ids.empty())
+            aruco::drawDetectedMarkers(imageCopy, corners, ids);
+
+        if(showRejected && !rejected.empty())
+            aruco::drawDetectedMarkers(imageCopy, rejected, noArray(), Scalar(100, 0, 255));
+
+        if(markersOfBoardDetected > 0)
+            cv::drawFrameAxes(imageCopy, camMatrix, distCoeffs, rvec, tvec, axisLength);
+
+        imshow("out", imageCopy);
+        char key = (char)waitKey(waitTime);
+        if(key == 27) break;
+    //! [aruco_detect_board_full_sample]
+    }
+
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/objectDetection/detect_board_charuco.cpp b/samples/cpp/tutorial_code/objectDetection/detect_board_charuco.cpp
new file mode 100644
index 000000000000..c02318d6eb3f
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/detect_board_charuco.cpp
@@ -0,0 +1,144 @@
+#include <opencv2/highgui.hpp>
+//! [charucohdr]
+#include <opencv2/objdetect/charuco_detector.hpp>
+//! [charucohdr]
+#include <vector>
+#include <iostream>
+#include "aruco_samples_utility.hpp"
+
+using namespace std;
+using namespace cv;
+
+namespace {
+const char* about = "Pose estimation using a ChArUco board";
+const char* keys  =
+        "{w        |       | Number of squares in X direction }"
+        "{h        |       | Number of squares in Y direction }"
+        "{sl       |       | Square side length (in meters) }"
+        "{ml       |       | Marker side length (in meters) }"
+        "{d        |       | dictionary: DICT_4X4_50=0, DICT_4X4_100=1, DICT_4X4_250=2,"
+        "DICT_4X4_1000=3, DICT_5X5_50=4, DICT_5X5_100=5, DICT_5X5_250=6, DICT_5X5_1000=7, "
+        "DICT_6X6_50=8, DICT_6X6_100=9, DICT_6X6_250=10, DICT_6X6_1000=11, DICT_7X7_50=12,"
+        "DICT_7X7_100=13, DICT_7X7_250=14, DICT_7X7_1000=15, DICT_ARUCO_ORIGINAL = 16}"
+        "{cd       |       | Input file with custom dictionary }"
+        "{c        |       | Output file with calibrated camera parameters }"
+        "{v        |       | Input from video or image file, if ommited, input comes from camera }"
+        "{ci       | 0     | Camera id if input doesnt come from video (-v) }"
+        "{dp       |       | File of marker detector parameters }"
+        "{rs       |       | Apply refind strategy }";
+}
+
+
+int main(int argc, char *argv[]) {
+    CommandLineParser parser(argc, argv, keys);
+    parser.about(about);
+
+    if(argc < 6) {
+        parser.printMessage();
+        return 0;
+    }
+
+    //! [charuco_detect_board_full_sample]
+    int squaresX = parser.get<int>("w");
+    int squaresY = parser.get<int>("h");
+    float squareLength = parser.get<float>("sl");
+    float markerLength = parser.get<float>("ml");
+    bool refine = parser.has("rs");
+    int camId = parser.get<int>("ci");
+
+    string video;
+    if(parser.has("v")) {
+        video = parser.get<string>("v");
+    }
+
+    Mat camMatrix, distCoeffs;
+    readCameraParamsFromCommandLine(parser, camMatrix, distCoeffs);
+    aruco::DetectorParameters detectorParams = readDetectorParamsFromCommandLine(parser);
+    aruco::Dictionary dictionary = readDictionatyFromCommandLine(parser);
+
+    if(!parser.check()) {
+        parser.printErrors();
+        return 0;
+    }
+
+    VideoCapture inputVideo;
+    int waitTime = 0;
+    if(!video.empty()) {
+        inputVideo.open(video);
+    } else {
+        inputVideo.open(camId);
+        waitTime = 10;
+    }
+
+    float axisLength = 0.5f * ((float)min(squaresX, squaresY) * (squareLength));
+
+    // create charuco board object
+    aruco::CharucoBoard charucoBoard(Size(squaresX, squaresY), squareLength, markerLength, dictionary);
+
+    // create charuco detector
+    aruco::CharucoParameters charucoParams;
+    charucoParams.tryRefineMarkers = refine; // if tryRefineMarkers, refineDetectedMarkers() will be used in detectBoard()
+    charucoParams.cameraMatrix = camMatrix; // cameraMatrix can be used in detectBoard()
+    charucoParams.distCoeffs = distCoeffs; // distCoeffs can be used in detectBoard()
+    aruco::CharucoDetector charucoDetector(charucoBoard, charucoParams, detectorParams);
+
+    double totalTime = 0;
+    int totalIterations = 0;
+
+    while(inputVideo.grab()) {
+        //! [inputImg]
+        Mat image, imageCopy;
+        inputVideo.retrieve(image);
+        //! [inputImg]
+
+        double tick = (double)getTickCount();
+
+        vector<int> markerIds, charucoIds;
+        vector<vector<Point2f> > markerCorners;
+        vector<Point2f> charucoCorners;
+        Vec3d rvec, tvec;
+
+        //! [interpolateCornersCharuco]
+        // detect markers and charuco corners
+        charucoDetector.detectBoard(image, charucoCorners, charucoIds, markerCorners, markerIds);
+        //! [interpolateCornersCharuco]
+
+        //! [poseCharuco]
+        // estimate charuco board pose
+        bool validPose = false;
+        if(camMatrix.total() != 0 && distCoeffs.total() != 0 && charucoIds.size() >= 4) {
+            Mat objPoints, imgPoints;
+            charucoBoard.matchImagePoints(charucoCorners, charucoIds, objPoints, imgPoints);
+            validPose = solvePnP(objPoints, imgPoints, camMatrix, distCoeffs, rvec, tvec);
+        }
+        //! [poseCharuco]
+
+        double currentTime = ((double)getTickCount() - tick) / getTickFrequency();
+        totalTime += currentTime;
+        totalIterations++;
+        if(totalIterations % 30 == 0) {
+            cout << "Detection Time = " << currentTime * 1000 << " ms "
+                 << "(Mean = " << 1000 * totalTime / double(totalIterations) << " ms)" << endl;
+        }
+
+        // draw results
+        image.copyTo(imageCopy);
+        if(markerIds.size() > 0) {
+            aruco::drawDetectedMarkers(imageCopy, markerCorners);
+        }
+
+        if(charucoIds.size() > 0) {
+            //! [drawDetectedCornersCharuco]
+            aruco::drawDetectedCornersCharuco(imageCopy, charucoCorners, charucoIds, cv::Scalar(255, 0, 0));
+            //! [drawDetectedCornersCharuco]
+        }
+
+        if(validPose)
+            cv::drawFrameAxes(imageCopy, camMatrix, distCoeffs, rvec, tvec, axisLength);
+
+        imshow("out", imageCopy);
+        if(waitKey(waitTime) == 27) break;
+    }
+    //! [charuco_detect_board_full_sample]
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/objectDetection/detect_diamonds.cpp b/samples/cpp/tutorial_code/objectDetection/detect_diamonds.cpp
new file mode 100644
index 000000000000..f6a6236b2ab0
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/detect_diamonds.cpp
@@ -0,0 +1,187 @@
+#include <opencv2/highgui.hpp>
+#include <vector>
+#include <iostream>
+#include <opencv2/objdetect/charuco_detector.hpp>
+#include "aruco_samples_utility.hpp"
+
+using namespace std;
+using namespace cv;
+
+
+namespace {
+const char* about = "Detect ChArUco markers";
+const char* keys  =
+        "{sl       | 100   | Square side length (in meters) }"
+        "{ml       | 60    | Marker side length (in meters) }"
+        "{d        | 10    | dictionary: DICT_4X4_50=0, DICT_4X4_100=1, DICT_4X4_250=2,"
+        "DICT_4X4_1000=3, DICT_5X5_50=4, DICT_5X5_100=5, DICT_5X5_250=6, DICT_5X5_1000=7, "
+        "DICT_6X6_50=8, DICT_6X6_100=9, DICT_6X6_250=10, DICT_6X6_1000=11, DICT_7X7_50=12,"
+        "DICT_7X7_100=13, DICT_7X7_250=14, DICT_7X7_1000=15, DICT_ARUCO_ORIGINAL = 16}"
+        "{cd       |       | Input file with custom dictionary }"
+        "{c        |       | Output file with calibrated camera parameters }"
+        "{as       |       | Automatic scale. The provided number is multiplied by the last"
+        "diamond id becoming an indicator of the square length. In this case, the -sl and "
+        "-ml are only used to know the relative length relation between squares and markers }"
+        "{v        |       | Input from video file, if ommited, input comes from camera }"
+        "{ci       | 0     | Camera id if input doesnt come from video (-v) }"
+        "{dp       |       | File of marker detector parameters }"
+        "{refine   |       | Corner refinement: CORNER_REFINE_NONE=0, CORNER_REFINE_SUBPIX=1,"
+        "CORNER_REFINE_CONTOUR=2, CORNER_REFINE_APRILTAG=3}";
+
+const string refineMethods[4] = {
+    "None",
+    "Subpixel",
+    "Contour",
+    "AprilTag"
+};
+
+}
+
+int main(int argc, char *argv[]) {
+    CommandLineParser parser(argc, argv, keys);
+    parser.about(about);
+
+    float squareLength = parser.get<float>("sl");
+    float markerLength = parser.get<float>("ml");
+    bool estimatePose = parser.has("c");
+    bool autoScale = parser.has("as");
+    float autoScaleFactor = autoScale ? parser.get<float>("as") : 1.f;
+
+    aruco::Dictionary dictionary = readDictionatyFromCommandLine(parser);
+    Mat camMatrix, distCoeffs;
+    readCameraParamsFromCommandLine(parser, camMatrix, distCoeffs);
+
+    aruco::DetectorParameters detectorParams = readDetectorParamsFromCommandLine(parser);
+    if (parser.has("refine")) {
+        // override cornerRefinementMethod read from config file
+        int user_method = parser.get<aruco::CornerRefineMethod>("refine");
+        if (user_method < 0 || user_method >= 4)
+        {
+            std::cout << "Corner refinement method should be in range 0..3" << std::endl;
+            return 0;
+        }
+        detectorParams.cornerRefinementMethod = user_method;
+    }
+    std::cout << "Corner refinement method: " << refineMethods[detectorParams.cornerRefinementMethod] << std::endl;
+
+    int camId = parser.get<int>("ci");
+    String video;
+
+    if(parser.has("v")) {
+        video = parser.get<String>("v");
+    }
+
+    if(!parser.check()) {
+        parser.printErrors();
+        return 0;
+    }
+
+    VideoCapture inputVideo;
+    int waitTime;
+    if(!video.empty()) {
+        inputVideo.open(video);
+        waitTime = 0;
+    } else {
+        inputVideo.open(camId);
+        waitTime = 10;
+    }
+
+    double totalTime = 0;
+    int totalIterations = 0;
+
+    aruco::CharucoBoard charucoBoard(Size(3, 3), squareLength, markerLength, dictionary);
+    aruco::CharucoDetector detector(charucoBoard, aruco::CharucoParameters(), detectorParams);
+
+    while(inputVideo.grab()) {
+        Mat image, imageCopy;
+        inputVideo.retrieve(image);
+
+        double tick = (double)getTickCount();
+
+        //! [detect_diamonds]
+        vector<int> markerIds;
+        vector<Vec4i> diamondIds;
+        vector<vector<Point2f> > markerCorners, diamondCorners;
+        vector<Vec3d> rvecs, tvecs;
+
+        detector.detectDiamonds(image, diamondCorners, diamondIds, markerCorners, markerIds);
+        //! [detect_diamonds]
+
+        //! [diamond_pose_estimation]
+        // estimate diamond pose
+        size_t N = diamondIds.size();
+        if(estimatePose && N > 0) {
+            cv::Mat objPoints(4, 1, CV_32FC3);
+            rvecs.resize(N);
+            tvecs.resize(N);
+            if(!autoScale) {
+                // set coordinate system
+                objPoints.ptr<Vec3f>(0)[0] = Vec3f(-squareLength/2.f, squareLength/2.f, 0);
+                objPoints.ptr<Vec3f>(0)[1] = Vec3f(squareLength/2.f, squareLength/2.f, 0);
+                objPoints.ptr<Vec3f>(0)[2] = Vec3f(squareLength/2.f, -squareLength/2.f, 0);
+                objPoints.ptr<Vec3f>(0)[3] = Vec3f(-squareLength/2.f, -squareLength/2.f, 0);
+                // Calculate pose for each marker
+                for (size_t i = 0ull; i < N; i++)
+                    solvePnP(objPoints, diamondCorners.at(i), camMatrix, distCoeffs, rvecs.at(i), tvecs.at(i));
+                //! [diamond_pose_estimation]
+                /* //! [diamond_pose_estimation_as_charuco]
+                for (size_t i = 0ull; i < N; i++) { // estimate diamond pose as Charuco board
+                    Mat objPoints_b, imgPoints;
+                    // The coordinate system of the diamond is placed in the board plane centered in the bottom left corner
+                    vector<int> charucoIds = {0, 1, 3, 2}; // if CCW order, Z axis pointing in the plane
+                    // vector<int> charucoIds = {0, 2, 3, 1}; // if CW order, Z axis pointing out the plane
+                    charucoBoard.matchImagePoints(diamondCorners[i], charucoIds, objPoints_b, imgPoints);
+                    solvePnP(objPoints_b, imgPoints, camMatrix, distCoeffs, rvecs[i], tvecs[i]);
+                }
+                //! [diamond_pose_estimation_as_charuco] */
+            }
+            else {
+                // if autoscale, extract square size from last diamond id
+                for(size_t i = 0; i < N; i++) {
+                    float sqLenScale = autoScaleFactor * float(diamondIds[i].val[3]);
+                    vector<vector<Point2f> > currentCorners;
+                    vector<Vec3d> currentRvec, currentTvec;
+                    currentCorners.push_back(diamondCorners[i]);
+                    // set coordinate system
+                    objPoints.ptr<Vec3f>(0)[0] = Vec3f(-sqLenScale/2.f, sqLenScale/2.f, 0);
+                    objPoints.ptr<Vec3f>(0)[1] = Vec3f(sqLenScale/2.f, sqLenScale/2.f, 0);
+                    objPoints.ptr<Vec3f>(0)[2] = Vec3f(sqLenScale/2.f, -sqLenScale/2.f, 0);
+                    objPoints.ptr<Vec3f>(0)[3] = Vec3f(-sqLenScale/2.f, -sqLenScale/2.f, 0);
+                    solvePnP(objPoints, diamondCorners.at(i), camMatrix, distCoeffs, rvecs.at(i), tvecs.at(i));
+                }
+            }
+        }
+
+
+        double currentTime = ((double)getTickCount() - tick) / getTickFrequency();
+        totalTime += currentTime;
+        totalIterations++;
+        if(totalIterations % 30 == 0) {
+            cout << "Detection Time = " << currentTime * 1000 << " ms "
+                 << "(Mean = " << 1000 * totalTime / double(totalIterations) << " ms)" << endl;
+        }
+
+
+        // draw results
+        image.copyTo(imageCopy);
+        if(markerIds.size() > 0)
+            aruco::drawDetectedMarkers(imageCopy, markerCorners);
+
+        //! [draw_diamonds]
+        if(diamondIds.size() > 0) {
+            aruco::drawDetectedDiamonds(imageCopy, diamondCorners, diamondIds);
+        //! [draw_diamonds]
+
+            //! [draw_diamond_pose_estimation]
+            if(estimatePose) {
+                for(size_t i = 0u; i < diamondIds.size(); i++)
+                    cv::drawFrameAxes(imageCopy, camMatrix, distCoeffs, rvecs[i], tvecs[i], squareLength*1.1f);
+            }
+            //! [draw_diamond_pose_estimation]
+        }
+        imshow("out", imageCopy);
+        char key = (char)waitKey(waitTime);
+        if(key == 27) break;
+    }
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/objectDetection/detect_markers.cpp b/samples/cpp/tutorial_code/objectDetection/detect_markers.cpp
new file mode 100644
index 000000000000..f220b84565ba
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/detect_markers.cpp
@@ -0,0 +1,159 @@
+#include <opencv2/highgui.hpp>
+#include <opencv2/objdetect/aruco_detector.hpp>
+#include <iostream>
+#include "aruco_samples_utility.hpp"
+
+using namespace std;
+using namespace cv;
+
+namespace {
+const char* about = "Basic marker detection";
+
+//! [aruco_detect_markers_keys]
+const char* keys  =
+        "{d        | 0     | dictionary: DICT_4X4_50=0, DICT_4X4_100=1, DICT_4X4_250=2,"
+        "DICT_4X4_1000=3, DICT_5X5_50=4, DICT_5X5_100=5, DICT_5X5_250=6, DICT_5X5_1000=7, "
+        "DICT_6X6_50=8, DICT_6X6_100=9, DICT_6X6_250=10, DICT_6X6_1000=11, DICT_7X7_50=12,"
+        "DICT_7X7_100=13, DICT_7X7_250=14, DICT_7X7_1000=15, DICT_ARUCO_ORIGINAL = 16,"
+        "DICT_APRILTAG_16h5=17, DICT_APRILTAG_25h9=18, DICT_APRILTAG_36h10=19, DICT_APRILTAG_36h11=20}"
+        "{cd       |       | Input file with custom dictionary }"
+        "{v        |       | Input from video or image file, if ommited, input comes from camera }"
+        "{ci       | 0     | Camera id if input doesnt come from video (-v) }"
+        "{c        |       | Camera intrinsic parameters. Needed for camera pose }"
+        "{l        | 0.1   | Marker side length (in meters). Needed for correct scale in camera pose }"
+        "{dp       |       | File of marker detector parameters }"
+        "{r        |       | show rejected candidates too }"
+        "{refine   |       | Corner refinement: CORNER_REFINE_NONE=0, CORNER_REFINE_SUBPIX=1,"
+        "CORNER_REFINE_CONTOUR=2, CORNER_REFINE_APRILTAG=3}";
+
+//! [aruco_detect_markers_keys]
+
+const string refineMethods[4] = {
+    "None",
+    "Subpixel",
+    "Contour",
+    "AprilTag"
+};
+
+}
+
+int main(int argc, char *argv[]) {
+    CommandLineParser parser(argc, argv, keys);
+    parser.about(about);
+
+    bool showRejected = parser.has("r");
+    bool estimatePose = parser.has("c");
+    float markerLength = parser.get<float>("l");
+
+    aruco::DetectorParameters detectorParams = readDetectorParamsFromCommandLine(parser);
+    aruco::Dictionary dictionary = readDictionatyFromCommandLine(parser);
+
+    if (parser.has("refine")) {
+        // override cornerRefinementMethod read from config file
+        int user_method = parser.get<aruco::CornerRefineMethod>("refine");
+        if (user_method < 0 || user_method >= 4)
+        {
+            std::cout << "Corner refinement method should be in range 0..3" << std::endl;
+            return 0;
+        }
+        detectorParams.cornerRefinementMethod = user_method;
+    }
+
+    std::cout << "Corner refinement method: " << refineMethods[detectorParams.cornerRefinementMethod] << std::endl;
+
+    int camId = parser.get<int>("ci");
+
+    String video;
+    if(parser.has("v")) {
+        video = parser.get<String>("v");
+    }
+
+    if(!parser.check()) {
+        parser.printErrors();
+        return 0;
+    }
+
+    //! [aruco_pose_estimation1]
+    Mat camMatrix, distCoeffs;
+    if(estimatePose) {
+        // You can read camera parameters from tutorial_camera_params.yml
+        readCameraParamsFromCommandLine(parser, camMatrix, distCoeffs);
+    }
+    //! [aruco_pose_estimation1]
+    //! [aruco_detect_markers]
+    cv::aruco::ArucoDetector detector(dictionary, detectorParams);
+    cv::VideoCapture inputVideo;
+    int waitTime;
+    if(!video.empty()) {
+        inputVideo.open(video);
+        waitTime = 0;
+    } else {
+        inputVideo.open(camId);
+        waitTime = 10;
+    }
+
+    double totalTime = 0;
+    int totalIterations = 0;
+
+    //! [aruco_pose_estimation2]
+    // set coordinate system
+    cv::Mat objPoints(4, 1, CV_32FC3);
+    objPoints.ptr<Vec3f>(0)[0] = Vec3f(-markerLength/2.f, markerLength/2.f, 0);
+    objPoints.ptr<Vec3f>(0)[1] = Vec3f(markerLength/2.f, markerLength/2.f, 0);
+    objPoints.ptr<Vec3f>(0)[2] = Vec3f(markerLength/2.f, -markerLength/2.f, 0);
+    objPoints.ptr<Vec3f>(0)[3] = Vec3f(-markerLength/2.f, -markerLength/2.f, 0);
+    //! [aruco_pose_estimation2]
+
+    while(inputVideo.grab()) {
+        cv::Mat image, imageCopy;
+        inputVideo.retrieve(image);
+
+        double tick = (double)getTickCount();
+
+        //! [aruco_pose_estimation3]
+        vector<int> ids;
+        vector<vector<Point2f> > corners, rejected;
+
+        // detect markers and estimate pose
+        detector.detectMarkers(image, corners, ids, rejected);
+
+        size_t nMarkers = corners.size();
+        vector<Vec3d> rvecs(nMarkers), tvecs(nMarkers);
+
+        if(estimatePose && !ids.empty()) {
+            // Calculate pose for each marker
+            for (size_t i = 0; i < nMarkers; i++) {
+                solvePnP(objPoints, corners.at(i), camMatrix, distCoeffs, rvecs.at(i), tvecs.at(i));
+            }
+        }
+        //! [aruco_pose_estimation3]
+        double currentTime = ((double)getTickCount() - tick) / getTickFrequency();
+        totalTime += currentTime;
+        totalIterations++;
+        if(totalIterations % 30 == 0) {
+            cout << "Detection Time = " << currentTime * 1000 << " ms "
+                 << "(Mean = " << 1000 * totalTime / double(totalIterations) << " ms)" << endl;
+        }
+        //! [aruco_draw_pose_estimation]
+        // draw results
+        image.copyTo(imageCopy);
+        if(!ids.empty()) {
+            cv::aruco::drawDetectedMarkers(imageCopy, corners, ids);
+
+            if(estimatePose) {
+                for(unsigned int i = 0; i < ids.size(); i++)
+                    cv::drawFrameAxes(imageCopy, camMatrix, distCoeffs, rvecs[i], tvecs[i], markerLength * 1.5f, 2);
+            }
+        }
+        //! [aruco_draw_pose_estimation]
+
+        if(showRejected && !rejected.empty())
+            cv::aruco::drawDetectedMarkers(imageCopy, rejected, noArray(), Scalar(100, 0, 255));
+
+        imshow("out", imageCopy);
+        char key = (char)waitKey(waitTime);
+        if(key == 27) break;
+    }
+    //! [aruco_detect_markers]
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/objectDetection/detector_params.yml b/samples/cpp/tutorial_code/objectDetection/detector_params.yml
new file mode 100644
index 000000000000..1155c25126df
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/detector_params.yml
@@ -0,0 +1,30 @@
+%YAML:1.0
+adaptiveThreshWinSizeMin: 3
+adaptiveThreshWinSizeMax: 23
+adaptiveThreshWinSizeStep: 10
+adaptiveThreshWinSize: 21
+adaptiveThreshConstant: 7
+minMarkerPerimeterRate: 0.03
+maxMarkerPerimeterRate: 4.0
+polygonalApproxAccuracyRate: 0.05
+minCornerDistanceRate: 0.05
+minDistanceToBorder: 3
+minMarkerDistance: 10.0
+minMarkerDistanceRate: 0.05
+cornerRefinementMethod: 0
+cornerRefinementWinSize: 5
+cornerRefinementMaxIterations: 30
+cornerRefinementMinAccuracy: 0.1
+markerBorderBits: 1
+perspectiveRemovePixelPerCell: 8
+perspectiveRemoveIgnoredMarginPerCell: 0.13
+maxErroneousBitsInBorderRate: 0.04
+minOtsuStdDev: 5.0
+errorCorrectionRate: 0.6
+
+# new aruco 3 functionality
+useAruco3Detection: 0
+minSideLengthCanonicalImg: 32 # 16, 32, 64 --> tau_c from the paper
+minMarkerLengthRatioOriginalImg: 0.02 # range [0,0.2] --> tau_i from the paper
+cameraMotionSpeed: 0.1 # range [0,1) --> tau_s from the paper
+useGlobalThreshold: 0
diff --git a/samples/cpp/tutorial_code/objectDetection/tutorial_camera_charuco.yml b/samples/cpp/tutorial_code/objectDetection/tutorial_camera_charuco.yml
new file mode 100644
index 000000000000..f1ded5993ad6
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/tutorial_camera_charuco.yml
@@ -0,0 +1,21 @@
+%YAML:1.0
+---
+calibration_time: "Wed 08 Dec 2021 05:13:09 PM MSK"
+image_width: 640
+image_height: 480
+flags: 0
+camera_matrix: !!opencv-matrix
+   rows: 3
+   cols: 3
+   dt: d
+   data: [ 4.5251072219637672e+02, 0., 3.1770297317353277e+02, 0.,
+       4.5676707935146891e+02, 2.7775155919135995e+02, 0., 0., 1. ]
+distortion_coefficients: !!opencv-matrix
+   rows: 1
+   cols: 5
+   dt: d
+   data: [ 1.2136925618707872e-01, -1.0854664722560681e+00,
+       1.1786843796668460e-04, -4.6240686046485508e-04,
+       2.9542589406810080e+00 ]
+avg_reprojection_error: 1.8234905535936044e-01
+info: "The camera calibration parameters were obtained by img_00.jpg-img_03.jpg from aruco/tutorials/aruco_calibration/images"
diff --git a/samples/cpp/tutorial_code/objectDetection/tutorial_camera_params.yml b/samples/cpp/tutorial_code/objectDetection/tutorial_camera_params.yml
new file mode 100644
index 000000000000..69d2d6d22ffc
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/tutorial_camera_params.yml
@@ -0,0 +1,14 @@
+%YAML:1.0
+camera_matrix: !!opencv-matrix
+   rows: 3
+   cols: 3
+   dt: d
+   data: [ 628.158, 0., 324.099,
+       0., 628.156, 260.908,
+       0., 0., 1. ]
+distortion_coefficients: !!opencv-matrix
+   rows: 5
+   cols: 1
+   dt: d
+   data: [ 0.0995485, -0.206384,
+       0.00754589, 0.00336531, 0 ]
diff --git a/samples/cpp/tutorial_code/objectDetection/tutorial_dict.yml b/samples/cpp/tutorial_code/objectDetection/tutorial_dict.yml
new file mode 100644
index 000000000000..af5c87e4e0d9
--- /dev/null
+++ b/samples/cpp/tutorial_code/objectDetection/tutorial_dict.yml
@@ -0,0 +1,38 @@
+%YAML:1.0
+nmarkers: 35
+markersize: 6
+marker_0: "101011111011111001001001101100000000"
+marker_1: "000000000010011001010011111010111000"
+marker_2: "011001100000001010000101111101001101"
+marker_3: "001000111111000111011001110000011111"
+marker_4: "100110110100101111000000111101110011"
+marker_5: "010101101110111000111010111100010111"
+marker_6: "101001000110011110101001010100110100"
+marker_7: "011010100100110000011101110110100010"
+marker_8: "111110001000101000110001010010111101"
+marker_9: "011101101100110111001100100001010100"
+marker_10: "100001100001010001110001011000000111"
+marker_11: "110010010010011100101111111000001111"
+marker_12: "110101001001010110011111010110001101"
+marker_13: "001111000001000100010001101001010001"
+marker_14: "000000010010101010111110110011010011"
+marker_15: "110001110111100101110011111100111010"
+marker_16: "101011001110001010110011111011001110"
+marker_17: "101110111101110100101101011001010111"
+marker_18: "000100111000111101010011010101000101"
+marker_19: "001110001110001101100101110100000011"
+marker_20: "100101101100010110110110110001100011"
+marker_21: "010110001001011010000100111000110110"
+marker_22: "001000000000100100000000010100010010"
+marker_23: "101001110010100110000111111010010000"
+marker_24: "111001101010001100011010010001011100"
+marker_25: "101000010001010000110100111101101001"
+marker_26: "101010000001010011001010110110000001"
+marker_27: "100101001000010101001000111101111110"
+marker_28: "010010100110010011110001110101011100"
+marker_29: "011001000101100001101111010001001111"
+marker_30: "000111011100011110001101111011011001"
+marker_31: "010100001011000100111101110001101010"
+marker_32: "100101101001101010111111101101110100"
+marker_33: "101101001010111000000100110111010101"
+marker_34: "011111010000111011111110110101100101"
diff --git a/samples/cpp/videocapture_obsensor.cpp b/samples/cpp/videocapture_obsensor.cpp
index 98eb9f479fbd..ce71a1808b7f 100644
--- a/samples/cpp/videocapture_obsensor.cpp
+++ b/samples/cpp/videocapture_obsensor.cpp
@@ -1,3 +1,7 @@
+/**
+ * attention: Astra2 cameras currently only support Windows and Linux kernel versions no higher than 4.15, and higher versions of Linux kernel may have exceptions.
+*/
+
 #include <opencv2/videoio.hpp>
 #include <opencv2/highgui.hpp>
 #include <opencv2/imgproc.hpp>
@@ -21,6 +25,11 @@ int main()
     Mat image;
     Mat depthMap;
     Mat adjDepthMap;
+
+    // Minimum depth value
+    const double minVal = 300;
+    // Maximum depth value
+    const double maxVal = 5000;
     while (true)
     {
         // Grab depth map like this:
@@ -36,7 +45,7 @@ int main()
 
             if (obsensorCapture.retrieve(depthMap, CAP_OBSENSOR_DEPTH_MAP))
             {
-                normalize(depthMap, adjDepthMap, 0, 255, NORM_MINMAX, CV_8UC1);
+                depthMap.convertTo(adjDepthMap, CV_8U, 255.0 / (maxVal - minVal), -minVal * 255.0 / (maxVal - minVal));
                 applyColorMap(adjDepthMap, adjDepthMap, COLORMAP_JET);
                 imshow("DEPTH", adjDepthMap);
             }
@@ -45,7 +54,7 @@ int main()
             static const float alpha = 0.6f;
             if (!image.empty() && !depthMap.empty())
             {
-                normalize(depthMap, adjDepthMap, 0, 255, NORM_MINMAX, CV_8UC1);
+                depthMap.convertTo(adjDepthMap, CV_8U, 255.0 / (maxVal - minVal), -minVal * 255.0 / (maxVal - minVal));
                 cv::resize(adjDepthMap, adjDepthMap, cv::Size(image.cols, image.rows));
                 for (int i = 0; i < image.rows; i++)
                 {
@@ -71,4 +80,4 @@ int main()
             break;
     }
     return 0;
-}
\ No newline at end of file
+}
diff --git a/samples/data/dnn/object_detection_classes_yolov4.txt b/samples/data/dnn/object_detection_classes_yolo.txt
similarity index 100%
rename from samples/data/dnn/object_detection_classes_yolov4.txt
rename to samples/data/dnn/object_detection_classes_yolo.txt
diff --git a/samples/dnn/README.md b/samples/dnn/README.md
index b31da10b228c..9724dbce869b 100644
--- a/samples/dnn/README.md
+++ b/samples/dnn/README.md
@@ -78,7 +78,7 @@ AR @[ IoU=0.50:0.95 | area= large | maxDets=100 ] | 0.528     | 0.528          |
 ```
 
 ## References
-* [Models downloading script](https://github.com/opencv/opencv/samples/dnn/download_models.py)
+* [Models downloading script](https://github.com/opencv/opencv/blob/4.x/samples/dnn/download_models.py)
 * [Configuration files adopted for OpenCV](https://github.com/opencv/opencv_extra/tree/4.x/testdata/dnn)
 * [How to import models from TensorFlow Object Detection API](https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API)
 * [Names of classes from different datasets](https://github.com/opencv/opencv/tree/4.x/samples/data/dnn)
diff --git a/samples/dnn/common.py b/samples/dnn/common.py
index db9283b5d86d..4765506eace8 100644
--- a/samples/dnn/common.py
+++ b/samples/dnn/common.py
@@ -79,6 +79,10 @@ def add_preproc_args(zoo, parser, sample):
                  help='Indicate that model works with RGB input images instead BGR ones.')
     add_argument(zoo, parser, 'classes',
                  help='Optional path to a text file with names of classes to label detected objects.')
+    add_argument(zoo, parser, 'postprocessing', type=str,
+                 help='Post-processing kind depends on model topology.')
+    add_argument(zoo, parser, 'background_label_id', type=int, default=-1,
+                 help='An index of background class in predictions. If not negative, exclude such class from list of classes.')
 
 
 def findFile(filename):
diff --git a/samples/dnn/fast_neural_style.py b/samples/dnn/fast_neural_style.py
index 912c2f083228..43b8b121d65f 100644
--- a/samples/dnn/fast_neural_style.py
+++ b/samples/dnn/fast_neural_style.py
@@ -5,15 +5,15 @@
 
 parser = argparse.ArgumentParser(
         description='This script is used to run style transfer models from '
-                    'https://github.com/jcjohnson/fast-neural-style using OpenCV')
+                    'https://github.com/onnx/models/tree/main/vision/style_transfer/fast_neural_style using OpenCV')
 parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
-parser.add_argument('--model', help='Path to .t7 model')
+parser.add_argument('--model', help='Path to .onnx model')
 parser.add_argument('--width', default=-1, type=int, help='Resize input to specific width.')
 parser.add_argument('--height', default=-1, type=int, help='Resize input to specific height.')
 parser.add_argument('--median_filter', default=0, type=int, help='Kernel size of postprocessing blurring.')
 args = parser.parse_args()
 
-net = cv.dnn.readNetFromTorch(cv.samples.findFile(args.model))
+net = cv.dnn.readNet(cv.samples.findFile(args.model))
 net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
 
 if args.input:
@@ -31,16 +31,12 @@
     inWidth = args.width if args.width != -1 else frame.shape[1]
     inHeight = args.height if args.height != -1 else frame.shape[0]
     inp = cv.dnn.blobFromImage(frame, 1.0, (inWidth, inHeight),
-                              (103.939, 116.779, 123.68), swapRB=False, crop=False)
+                               swapRB=True, crop=False)
 
     net.setInput(inp)
     out = net.forward()
 
     out = out.reshape(3, out.shape[2], out.shape[3])
-    out[0] += 103.939
-    out[1] += 116.779
-    out[2] += 123.68
-    out /= 255
     out = out.transpose(1, 2, 0)
 
     t, _ = net.getPerfProfile()
@@ -50,4 +46,7 @@
     if args.median_filter:
         out = cv.medianBlur(out, args.median_filter)
 
+    out = np.clip(out, 0, 255)
+    out = out.astype(np.uint8)
+
     cv.imshow('Styled image', out)
diff --git a/samples/dnn/js_face_recognition.html b/samples/dnn/js_face_recognition.html
index d94ead1e58df..3b4ed390d7b4 100644
--- a/samples/dnn/js_face_recognition.html
+++ b/samples/dnn/js_face_recognition.html
@@ -12,27 +12,40 @@
 
 //! [Run face detection model]
 function detectFaces(img) {
-  var blob = cv.blobFromImage(img, 1, {width: 192, height: 144}, [104, 117, 123, 0], false, false);
-  netDet.setInput(blob);
-  var out = netDet.forward();
-
+  netDet.setInputSize(new cv.Size(img.cols, img.rows));
+  var out = new cv.Mat();
+  netDet.detect(img, out);
   var faces = [];
-  for (var i = 0, n = out.data32F.length; i < n; i += 7) {
-    var confidence = out.data32F[i + 2];
-    var left = out.data32F[i + 3] * img.cols;
-    var top = out.data32F[i + 4] * img.rows;
-    var right = out.data32F[i + 5] * img.cols;
-    var bottom = out.data32F[i + 6] * img.rows;
+  for (var i = 0, n = out.data32F.length; i < n; i += 15) {
+    var left = out.data32F[i];
+    var top = out.data32F[i + 1];
+    var right = (out.data32F[i] + out.data32F[i + 2]);
+    var bottom = (out.data32F[i + 1] + out.data32F[i + 3]);
     left = Math.min(Math.max(0, left), img.cols - 1);
+    top = Math.min(Math.max(0, top), img.rows - 1);
     right = Math.min(Math.max(0, right), img.cols - 1);
     bottom = Math.min(Math.max(0, bottom), img.rows - 1);
-    top = Math.min(Math.max(0, top), img.rows - 1);
 
-    if (confidence > 0.5 && left < right && top < bottom) {
-      faces.push({x: left, y: top, width: right - left, height: bottom - top})
+    if (left < right && top < bottom) {
+      faces.push({
+        x: left,
+        y: top,
+        width: right - left,
+        height: bottom - top,
+        x1: out.data32F[i + 4] < 0 || out.data32F[i + 4] > img.cols - 1 ? -1 : out.data32F[i + 4],
+        y1: out.data32F[i + 5] < 0 || out.data32F[i + 5] > img.rows - 1 ? -1 : out.data32F[i + 5],
+        x2: out.data32F[i + 6] < 0 || out.data32F[i + 6] > img.cols - 1 ? -1 : out.data32F[i + 6],
+        y2: out.data32F[i + 7] < 0 || out.data32F[i + 7] > img.rows - 1 ? -1 : out.data32F[i + 7],
+        x3: out.data32F[i + 8] < 0 || out.data32F[i + 8] > img.cols - 1 ? -1 : out.data32F[i + 8],
+        y3: out.data32F[i + 9] < 0 || out.data32F[i + 9] > img.rows - 1 ? -1 : out.data32F[i + 9],
+        x4: out.data32F[i + 10] < 0 || out.data32F[i + 10] > img.cols - 1 ? -1 : out.data32F[i + 10],
+        y4: out.data32F[i + 11] < 0 || out.data32F[i + 11] > img.rows - 1 ? -1 : out.data32F[i + 11],
+        x5: out.data32F[i + 12] < 0 || out.data32F[i + 12] > img.cols - 1 ? -1 : out.data32F[i + 12],
+        y5: out.data32F[i + 13] < 0 || out.data32F[i + 13] > img.rows - 1 ? -1 : out.data32F[i + 13],
+        confidence: out.data32F[i + 14]
+      })
     }
   }
-  blob.delete();
   out.delete();
   return faces;
 };
@@ -40,7 +53,7 @@
 
 //! [Get 128 floating points feature vector]
 function face2vec(face) {
-  var blob = cv.blobFromImage(face, 1.0 / 255, {width: 96, height: 96}, [0, 0, 0, 0], true, false)
+  var blob = cv.blobFromImage(face, 1.0, {width: 112, height: 112}, [0, 0, 0, 0], true, false)
   netRecogn.setInput(blob);
   var vec = netRecogn.forward();
   blob.delete();
@@ -53,7 +66,7 @@
   var vec = face2vec(face);
 
   var bestMatchName = 'unknown';
-  var bestMatchScore = 0.5;  // Actually, the minimum is -1 but we use it as a threshold.
+  var bestMatchScore = 30;  // Threshold for face recognition.
   for (name in persons) {
     var personVec = persons[name];
     var score = vec.dot(personVec);
@@ -69,24 +82,25 @@
 
 function loadModels(callback) {
   var utils = new Utils('');
-  var proto = 'https://raw.githubusercontent.com/opencv/opencv/4.x/samples/dnn/face_detector/deploy_lowres.prototxt';
-  var weights = 'https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20180205_fp16/res10_300x300_ssd_iter_140000_fp16.caffemodel';
-  var recognModel = 'https://raw.githubusercontent.com/pyannote/pyannote-data/master/openface.nn4.small2.v1.t7';
-  utils.createFileFromUrl('face_detector.prototxt', proto, () => {
-    document.getElementById('status').innerHTML = 'Downloading face_detector.caffemodel';
-    utils.createFileFromUrl('face_detector.caffemodel', weights, () => {
-      document.getElementById('status').innerHTML = 'Downloading OpenFace model';
-      utils.createFileFromUrl('face_recognition.t7', recognModel, () => {
-        document.getElementById('status').innerHTML = '';
-        netDet = cv.readNetFromCaffe('face_detector.prototxt', 'face_detector.caffemodel');
-        netRecogn = cv.readNetFromTorch('face_recognition.t7');
-        callback();
-      });
+  var detectModel = 'https://media.githubusercontent.com/media/opencv/opencv_zoo/main/models/face_detection_yunet/face_detection_yunet_2023mar.onnx';
+  var recognModel =  'https://media.githubusercontent.com/media/opencv/opencv_zoo/main/models/face_recognition_sface/face_recognition_sface_2021dec.onnx';
+  document.getElementById('status').innerHTML = 'Downloading YuNet model';
+  utils.createFileFromUrl('face_detection_yunet_2023mar.onnx', detectModel, () => {
+    document.getElementById('status').innerHTML = 'Downloading OpenFace model';
+    utils.createFileFromUrl('face_recognition_sface_2021dec.onnx', recognModel, () => {
+      document.getElementById('status').innerHTML = '';
+      netDet = new cv.FaceDetectorYN("face_detection_yunet_2023mar.onnx", "", new cv.Size(320, 320), 0.9, 0.3, 5000);
+      netRecogn = cv.readNet('face_recognition_sface_2021dec.onnx');
+      callback();
     });
   });
 };
 
 function main() {
+  if(!cv.FaceDetectorYN){
+    alert(`Error: This sample require OpenCV.js built with FaceDetectorYN. Please rebuild it with FaceDetectorYN or use the latest version of OpenCV.js.`);
+    return;
+  }
   // Create a camera object.
   var output = document.getElementById('output');
   var camera = document.createElement("video");
@@ -121,8 +135,8 @@
       persons[name] = face2vec(face).clone();
 
       var canvas = document.createElement("canvas");
-      canvas.setAttribute("width", 96);
-      canvas.setAttribute("height", 96);
+      canvas.setAttribute("width", 112);
+      canvas.setAttribute("height", 112);
       var cell = document.getElementById("targetImgs").insertCell(0);
       cell.appendChild(canvas);
 
@@ -146,6 +160,16 @@
     var faces = detectFaces(frameBGR);
     faces.forEach(function(rect) {
       cv.rectangle(frame, {x: rect.x, y: rect.y}, {x: rect.x + rect.width, y: rect.y + rect.height}, [0, 255, 0, 255]);
+      if(rect.x1>0 && rect.y1>0)
+        cv.circle(frame, {x: rect.x1, y: rect.y1}, 2, [255, 0, 0, 255], 2)
+      if(rect.x2>0 && rect.y2>0)
+        cv.circle(frame, {x: rect.x2, y: rect.y2}, 2, [0, 0, 255, 255], 2)
+      if(rect.x3>0 && rect.y3>0)
+        cv.circle(frame, {x: rect.x3, y: rect.y3}, 2, [0, 255, 0, 255], 2)
+      if(rect.x4>0 && rect.y4>0)
+        cv.circle(frame, {x: rect.x4, y: rect.y4}, 2, [255, 0, 255, 255], 2)
+      if(rect.x5>0 && rect.y5>0)
+        cv.circle(frame, {x: rect.x5, y: rect.y5}, 2, [0, 255, 255, 255], 2)
 
       var face = frameBGR.roi(rect);
       var name = recognize(face);
diff --git a/samples/dnn/models.yml b/samples/dnn/models.yml
index e6452796057d..b14dae327c7b 100644
--- a/samples/dnn/models.yml
+++ b/samples/dnn/models.yml
@@ -18,10 +18,68 @@ opencv_fd:
   rgb: false
   sample: "object_detection"
 
+# YOLOv8 object detection family from ultralytics (https://github.com/ultralytics/ultralytics)
+# Might be used for all YOLOv8n YOLOv8s YOLOv8m YOLOv8l and YOLOv8x
+yolov8x:
+  load_info:
+    url: "https://huggingface.co/cabelo/yolov8/resolve/main/yolov8x.onnx?download=true"
+    sha1: "462f15d668c046d38e27d3df01fe8142dd004cb4"
+  model: "yolov8x.onnx"
+  mean: 0.0
+  scale: 0.00392
+  width: 640
+  height: 640
+  rgb: true
+  classes: "object_detection_classes_yolo.txt"
+  background_label_id: 0
+  sample: "yolo_detector"
+
+yolov8s:
+  load_info:
+    url: "https://github.com/CVHub520/X-AnyLabeling/releases/download/v0.1.0/yolov8s.onnx"
+    sha1: "82cd83984396fe929909ecb58212b0e86d0904b1"
+  model: "yolov8s.onnx"
+  mean: 0.0
+  scale: 0.00392
+  width: 640
+  height: 640
+  rgb: true
+  classes: "object_detection_classes_yolo.txt"
+  background_label_id: 0
+  sample: "yolo_detector"
+
+yolov8n:
+  load_info:
+    url: "https://github.com/CVHub520/X-AnyLabeling/releases/download/v0.1.0/yolov8n.onnx"
+    sha1: "68f864475d06e2ec4037181052739f268eeac38d"
+  model: "yolov8n.onnx"
+  mean: 0.0
+  scale: 0.00392
+  width: 640
+  height: 640
+  rgb: true
+  classes: "object_detection_classes_yolo.txt"
+  background_label_id: 0
+  sample: "yolo_detector"
+
+yolov8m:
+  load_info:
+    url: "https://github.com/CVHub520/X-AnyLabeling/releases/download/v0.1.0/yolov8m.onnx"
+    sha1: "656ffeb4f3b067bc30df956728b5f9c61a4cb090"
+  model: "yolov8m.onnx"
+  mean: 0.0
+  scale: 0.00392
+  width: 640
+  height: 640
+  rgb: true
+  classes: "object_detection_classes_yolo.txt"
+  background_label_id: 0
+  sample: "yolo_detector"
+
 # YOLO4 object detection family from Darknet (https://github.com/AlexeyAB/darknet)
 # YOLO object detection family from Darknet (https://pjreddie.com/darknet/yolo/)
 # Might be used for all YOLOv2, TinyYolov2, YOLOv3, YOLOv4 and TinyYolov4
-yolo:
+yolov4:
   load_info:
     url: "https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights"
     sha1: "0143deb6c46fcc7f74dd35bf3c14edc3784e99ee"
@@ -32,7 +90,8 @@ yolo:
   width: 416
   height: 416
   rgb: true
-  classes: "object_detection_classes_yolov4.txt"
+  classes: "object_detection_classes_yolo.txt"
+  background_label_id: 0
   sample: "object_detection"
 
 yolov4-tiny:
@@ -46,7 +105,23 @@ yolov4-tiny:
   width: 416
   height: 416
   rgb: true
-  classes: "object_detection_classes_yolov4.txt"
+  classes: "object_detection_classes_yolo.txt"
+  background_label_id: 0
+  sample: "object_detection"
+
+yolov3:
+  load_info:
+    url: "https://pjreddie.com/media/files/yolov3.weights"
+    sha1: "520878f12e97cf820529daea502acca380f1cb8e"
+  model: "yolov3.weights"
+  config: "yolov3.cfg"
+  mean: [0, 0, 0]
+  scale: 0.00392
+  width: 416
+  height: 416
+  rgb: true
+  classes: "object_detection_classes_yolo.txt"
+  background_label_id: 0
   sample: "object_detection"
 
 tiny-yolo-voc:
@@ -61,6 +136,21 @@ tiny-yolo-voc:
   height: 416
   rgb: true
   classes: "object_detection_classes_pascal_voc.txt"
+  background_label_id: 0
+  sample: "object_detection"
+
+yolov8:
+  load_info:
+    url: "https://github.com/CVHub520/X-AnyLabeling/releases/download/v0.1.0/yolov8n.onnx"
+    sha1: "68f864475d06e2ec4037181052739f268eeac38d"
+  model: "yolov8n.onnx"
+  mean: [0, 0, 0]
+  scale: 0.00392
+  width: 640
+  height: 640
+  rgb: true
+  postprocessing: "yolov8"
+  classes: "object_detection_classes_yolo.txt"
   sample: "object_detection"
 
 # Caffe implementation of SSD model from https://github.com/chuanqi305/MobileNet-SSD
@@ -178,3 +268,15 @@ fcn8s:
   height: 500
   rgb: false
   sample: "segmentation"
+
+fcnresnet101:
+  load_info:
+    url: "https://github.com/onnx/models/raw/fb8271d5d5d9b90dbb1eb5e8e40f8f580fb248b3/vision/object_detection_segmentation/fcn/model/fcn-resnet101-11.onnx"
+    sha1: "e7e76474bf6b73334ab32c4be1374c9e605f5aed"
+  model: "fcn-resnet101-11.onnx"
+  mean: [103.5, 116.2, 123.6]
+  scale: 0.019
+  width: 500
+  height: 500
+  rgb: false
+  sample: "segmentation"
diff --git a/samples/dnn/object_detection.cpp b/samples/dnn/object_detection.cpp
index 6fc8b2ab6164..a0c255fd4eee 100644
--- a/samples/dnn/object_detection.cpp
+++ b/samples/dnn/object_detection.cpp
@@ -5,7 +5,7 @@
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 
-#if defined(CV_CXX11) && defined(HAVE_THREADS)
+#if defined(HAVE_THREADS)
 #define USE_THREADS 1
 #endif
 
diff --git a/samples/dnn/object_detection.py b/samples/dnn/object_detection.py
index 0ca55861596d..875ed3929f67 100644
--- a/samples/dnn/object_detection.py
+++ b/samples/dnn/object_detection.py
@@ -2,6 +2,7 @@
 import argparse
 import numpy as np
 import sys
+import copy
 import time
 from threading import Thread
 if sys.version_info[0] == 2:
@@ -27,7 +28,7 @@
                     help='For models from TensorFlow Object Detection API, you may '
                          'pass a .config file which was used for training through --config '
                          'argument. This way an additional .pbtxt file with TensorFlow graph will be created.')
-parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet', 'dldt'],
+parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet', 'dldt', 'onnx'],
                     help='Optional name of an origin framework of the model. '
                          'Detect it automatically if it does not set.')
 parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
@@ -86,7 +87,7 @@
         classes = f.read().rstrip('\n').split('\n')
 
 # Load a network
-net = cv.dnn.readNet(cv.samples.findFile(args.model), cv.samples.findFile(args.config), args.framework)
+net = cv.dnn.readNet(args.model, args.config, args.framework)
 net.setPreferableBackend(args.backend)
 net.setPreferableTarget(args.target)
 outNames = net.getUnconnectedOutLayersNames()
@@ -145,20 +146,32 @@ def drawPred(classId, conf, left, top, right, bottom):
                     classIds.append(int(detection[1]) - 1)  # Skip background label
                     confidences.append(float(confidence))
                     boxes.append([left, top, width, height])
-    elif lastLayer.type == 'Region':
+    elif lastLayer.type == 'Region' or args.postprocessing == 'yolov8':
         # Network produces output blob with a shape NxC where N is a number of
         # detected objects and C is a number of classes + 4 where the first 4
         # numbers are [center_x, center_y, width, height]
+        if args.postprocessing == 'yolov8':
+            box_scale_w = frameWidth / args.width
+            box_scale_h = frameHeight / args.height
+        else:
+            box_scale_w = frameWidth
+            box_scale_h = frameHeight
+
         for out in outs:
+            if args.postprocessing == 'yolov8':
+                out = out[0].transpose(1, 0)
+
             for detection in out:
-                scores = detection[5:]
+                scores = detection[4:]
+                if args.background_label_id >= 0:
+                    scores = np.delete(scores, args.background_label_id)
                 classId = np.argmax(scores)
                 confidence = scores[classId]
                 if confidence > confThreshold:
-                    center_x = int(detection[0] * frameWidth)
-                    center_y = int(detection[1] * frameHeight)
-                    width = int(detection[2] * frameWidth)
-                    height = int(detection[3] * frameHeight)
+                    center_x = int(detection[0] * box_scale_w)
+                    center_y = int(detection[1] * box_scale_h)
+                    width = int(detection[2] * box_scale_w)
+                    height = int(detection[3] * box_scale_h)
                     left = int(center_x - width / 2)
                     top = int(center_y - height / 2)
                     classIds.append(classId)
@@ -170,7 +183,7 @@ def drawPred(classId, conf, left, top, right, bottom):
 
     # NMS is used inside Region layer only on DNN_BACKEND_OPENCV for another backends we need NMS in sample
     # or NMS is required if number of outputs > 1
-    if len(outNames) > 1 or lastLayer.type == 'Region' and args.backend != cv.dnn.DNN_BACKEND_OPENCV:
+    if len(outNames) > 1 or (lastLayer.type == 'Region' or args.postprocessing == 'yolov8') and args.backend != cv.dnn.DNN_BACKEND_OPENCV:
         indices = []
         classIds = np.array(classIds)
         boxes = np.array(boxes)
@@ -181,7 +194,6 @@ def drawPred(classId, conf, left, top, right, bottom):
             conf = confidences[class_indices]
             box  = boxes[class_indices].tolist()
             nms_indices = cv.dnn.NMSBoxes(box, conf, confThreshold, nmsThreshold)
-            nms_indices = nms_indices[:, 0] if len(nms_indices) else []
             indices.extend(class_indices[nms_indices])
     else:
         indices = np.arange(0, len(classIds))
@@ -282,11 +294,11 @@ def processingThreadBody():
                 futureOutputs.append(net.forwardAsync())
             else:
                 outs = net.forward(outNames)
-                predictionsQueue.put(np.copy(outs))
+                predictionsQueue.put(copy.deepcopy(outs))
 
         while futureOutputs and futureOutputs[0].wait_for(0):
             out = futureOutputs[0].get()
-            predictionsQueue.put(np.copy([out]))
+            predictionsQueue.put(copy.deepcopy([out]))
 
             del futureOutputs[0]
 
diff --git a/samples/dnn/optical_flow.py b/samples/dnn/optical_flow.py
index da2a5808f2d5..efff6b70684c 100644
--- a/samples/dnn/optical_flow.py
+++ b/samples/dnn/optical_flow.py
@@ -1,13 +1,19 @@
 #!/usr/bin/env python
 '''
-This sample using FlowNet v2 model to calculate optical flow.
-Original paper: https://arxiv.org/abs/1612.01925.
-Original repo:  https://github.com/lmb-freiburg/flownet2.
+This sample using FlowNet v2 and RAFT model to calculate optical flow.
+
+FlowNet v2 Original Paper: https://arxiv.org/abs/1612.01925.
+FlowNet v2 Repo:  https://github.com/lmb-freiburg/flownet2.
 
 Download the converted .caffemodel model from https://drive.google.com/open?id=16qvE9VNmU39NttpZwZs81Ga8VYQJDaWZ
 and .prototxt from https://drive.google.com/file/d/1RyNIUsan1ZOh2hpYIH36A-jofAvJlT6a/view?usp=sharing.
 Otherwise download original model from https://lmb.informatik.uni-freiburg.de/resources/binaries/flownet2/flownet2-models.tar.gz,
 convert .h5 model to .caffemodel and modify original .prototxt using .prototxt from link above.
+
+RAFT Original Paper: https://arxiv.org/pdf/2003.12039.pdf
+RAFT Repo: https://github.com/princeton-vl/RAFT
+
+Download the .onnx model from here https://github.com/opencv/opencv_zoo/raw/281d232cd99cd920853106d853c440edd35eb442/models/optical_flow_estimation_raft/optical_flow_estimation_raft_2023aug.onnx.
 '''
 
 import argparse
@@ -17,8 +23,11 @@
 
 
 class OpticalFlow(object):
-    def __init__(self, proto, model, height, width):
-        self.net = cv.dnn.readNetFromCaffe(proto, model)
+    def __init__(self, model, height, width, proto=""):
+        if proto:
+            self.net = cv.dnn.readNetFromCaffe(proto, model)
+        else:
+            self.net = cv.dnn.readNet(model)
         self.net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
         self.height = height
         self.width = width
@@ -26,8 +35,10 @@ def __init__(self, proto, model, height, width):
     def compute_flow(self, first_img, second_img):
         inp0 = cv.dnn.blobFromImage(first_img, size=(self.width, self.height))
         inp1 = cv.dnn.blobFromImage(second_img, size=(self.width, self.height))
+        self.net.setInputsNames(["img0", "img1"])
         self.net.setInput(inp0, "img0")
         self.net.setInput(inp1, "img1")
+
         flow = self.net.forward()
         output = self.motion_to_color(flow)
         return output
@@ -46,7 +57,7 @@ def motion_to_color(self, flow):
         rad = rad[..., np.newaxis] / maxrad
         a = np.arctan2(-fy / maxrad, -fx / maxrad) / np.pi
         fk = (a + 1) / 2.0 * (ncols - 1)
-        k0 = fk.astype(np.int)
+        k0 = fk.astype(np.int32)
         k1 = (k0 + 1) % ncols
         f = fk[..., np.newaxis] - k0[..., np.newaxis]
 
@@ -59,41 +70,47 @@ def motion_to_color(self, flow):
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Use this script to calculate optical flow using FlowNetv2',
+    parser = argparse.ArgumentParser(description='Use this script to calculate optical flow',
                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('-input', '-i', required=True, help='Path to input video file. Skip this argument to capture frames from a camera.')
     parser.add_argument('--height', default=320, type=int, help='Input height')
     parser.add_argument('--width', default=448, type=int, help='Input width')
-    parser.add_argument('--proto', '-p', default='FlowNet2_deploy_anysize.prototxt', help='Path to prototxt.')
-    parser.add_argument('--model', '-m', default='FlowNet2_weights.caffemodel', help='Path to caffemodel.')
+    parser.add_argument('--proto', '-p', default='', help='Path to prototxt.')
+    parser.add_argument('--model', '-m', required=True, help='Path to model.')
     args, _ = parser.parse_known_args()
 
-    if not os.path.isfile(args.model) or not os.path.isfile(args.proto):
-        raise OSError("Prototxt or caffemodel not exist")
+    if not os.path.isfile(args.model):
+        raise OSError("Model does not exist")
+    if args.proto and not os.path.isfile(args.proto):
+        raise OSError("Prototxt does not exist")
 
     winName = 'Calculation optical flow in OpenCV'
     cv.namedWindow(winName, cv.WINDOW_NORMAL)
     cap = cv.VideoCapture(args.input if args.input else 0)
     hasFrame, first_frame = cap.read()
 
-    divisor = 64.
-    var = {}
-    var['ADAPTED_WIDTH'] = int(np.ceil(args.width/divisor) * divisor)
-    var['ADAPTED_HEIGHT'] = int(np.ceil(args.height/divisor) * divisor)
-    var['SCALE_WIDTH'] = args.width / float(var['ADAPTED_WIDTH'])
-    var['SCALE_HEIGHT'] = args.height / float(var['ADAPTED_HEIGHT'])
+    if args.proto:
+        divisor = 64.
+        var = {}
+        var['ADAPTED_WIDTH'] = int(np.ceil(args.width/divisor) * divisor)
+        var['ADAPTED_HEIGHT'] = int(np.ceil(args.height/divisor) * divisor)
+        var['SCALE_WIDTH'] = args.width / float(var['ADAPTED_WIDTH'])
+        var['SCALE_HEIGHT'] = args.height / float(var['ADAPTED_HEIGHT'])
+
+        config = ''
+        proto = open(args.proto).readlines()
+        for line in proto:
+            for key, value in var.items():
+                tag = "$%s$" % key
+                line = line.replace(tag, str(value))
+            config += line
 
-    config = ''
-    proto = open(args.proto).readlines()
-    for line in proto:
-        for key, value in var.items():
-            tag = "$%s$" % key
-            line = line.replace(tag, str(value))
-        config += line
+        caffemodel = open(args.model, 'rb').read()
 
-    caffemodel = open(args.model, 'rb').read()
+        opt_flow = OpticalFlow(caffemodel, var['ADAPTED_HEIGHT'], var['ADAPTED_WIDTH'], bytearray(config.encode()))
+    else:
+        opt_flow = OpticalFlow(args.model, 360, 480)
 
-    opt_flow = OpticalFlow(bytearray(config.encode()), caffemodel, var['ADAPTED_HEIGHT'], var['ADAPTED_WIDTH'])
     while cv.waitKey(1) < 0:
         hasFrame, second_frame = cap.read()
         if not hasFrame:
diff --git a/samples/dnn/segmentation.py b/samples/dnn/segmentation.py
index 09f3f8dd1159..8e4e43522534 100644
--- a/samples/dnn/segmentation.py
+++ b/samples/dnn/segmentation.py
@@ -14,7 +14,7 @@
 parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
                     help='An optional path to file with preprocessing parameters.')
 parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
-parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet'],
+parser.add_argument('--framework', choices=['caffe', 'tensorflow', 'torch', 'darknet', 'onnx'],
                     help='Optional name of an origin framework of the model. '
                          'Detect it automatically if it does not set.')
 parser.add_argument('--colors', help='Optional path to a text file with colors for an every class. '
diff --git a/samples/dnn/text_detection.cpp b/samples/dnn/text_detection.cpp
index 24902abfd726..d97ec7bc9ba7 100644
--- a/samples/dnn/text_detection.cpp
+++ b/samples/dnn/text_detection.cpp
@@ -31,8 +31,8 @@ const char* keys =
     "{ help  h              | | Print help message. }"
     "{ input i              | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
     "{ detModel dmp         | | Path to a binary .pb file contains trained detector network.}"
-    "{ width                | 320 | Preprocess input image by resizing to a specific width. It should be multiple by 32. }"
-    "{ height               | 320 | Preprocess input image by resizing to a specific height. It should be multiple by 32. }"
+    "{ width                | 320 | Preprocess input image by resizing to a specific width. It should be a multiple of 32. }"
+    "{ height               | 320 | Preprocess input image by resizing to a specific height. It should be a multiple of 32. }"
     "{ thr                  | 0.5 | Confidence threshold. }"
     "{ nms                  | 0.4 | Non-maximum suppression threshold. }"
     "{ recModel rmp         | | Path to a binary .onnx file contains trained CRNN text recognition model. "
diff --git a/samples/dnn/vit_tracker.cpp b/samples/dnn/vit_tracker.cpp
new file mode 100644
index 000000000000..02e5cea83fd5
--- /dev/null
+++ b/samples/dnn/vit_tracker.cpp
@@ -0,0 +1,176 @@
+// VitTracker
+// model: https://github.com/opencv/opencv_zoo/tree/main/models/object_tracking_vittrack
+
+#include <iostream>
+#include <cmath>
+
+#include <opencv2/dnn.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/video.hpp>
+
+using namespace cv;
+using namespace cv::dnn;
+
+const char *keys =
+        "{ help     h  |   | Print help message }"
+        "{ input    i  |   | Full path to input video folder, the specific camera index. (empty for camera 0) }"
+        "{ net    | vitTracker.onnx | Path to onnx model of vitTracker.onnx}"
+        "{ backend     | 0 | Choose one of computation backends: "
+                            "0: automatically (by default), "
+                            "1: Halide language (http://halide-lang.org/), "
+                            "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
+                            "3: OpenCV implementation, "
+                            "4: VKCOM, "
+                            "5: CUDA },"
+        "{ target      | 0 | Choose one of target computation devices: "
+                            "0: CPU target (by default), "
+                            "1: OpenCL, "
+                            "2: OpenCL fp16 (half-float precision), "
+                            "3: VPU, "
+                            "4: Vulkan, "
+                            "6: CUDA, "
+                            "7: CUDA fp16 (half-float preprocess) }"
+;
+
+static
+int run(int argc, char** argv)
+{
+    // Parse command line arguments.
+    CommandLineParser parser(argc, argv, keys);
+
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    std::string inputName = parser.get<String>("input");
+    std::string net = parser.get<String>("net");
+    int backend = parser.get<int>("backend");
+    int target = parser.get<int>("target");
+
+    Ptr<TrackerVit> tracker;
+    try
+    {
+        TrackerVit::Params params;
+        params.net = samples::findFile(net);
+        params.backend = backend;
+        params.target = target;
+        tracker = TrackerVit::create(params);
+    }
+    catch (const cv::Exception& ee)
+    {
+        std::cerr << "Exception: " << ee.what() << std::endl;
+        std::cout << "Can't load the network by using the following files:" << std::endl;
+        std::cout << "net : " << net << std::endl;
+        return 2;
+    }
+
+    const std::string winName = "vitTracker";
+    namedWindow(winName, WINDOW_AUTOSIZE);
+
+    // Open a video file or an image file or a camera stream.
+    VideoCapture cap;
+
+    if (inputName.empty() || (isdigit(inputName[0]) && inputName.size() == 1))
+    {
+        int c = inputName.empty() ? 0 : inputName[0] - '0';
+        std::cout << "Trying to open camera #" << c << " ..." << std::endl;
+        if (!cap.open(c))
+        {
+            std::cout << "Capture from camera #" << c << " didn't work. Specify -i=<video> parameter to read from video file" << std::endl;
+            return 2;
+        }
+    }
+    else if (inputName.size())
+    {
+        inputName = samples::findFileOrKeep(inputName);
+        if (!cap.open(inputName))
+        {
+            std::cout << "Could not open: " << inputName << std::endl;
+            return 2;
+        }
+    }
+
+    // Read the first image.
+    Mat image;
+    cap >> image;
+    if (image.empty())
+    {
+        std::cerr << "Can't capture frame!" << std::endl;
+        return 2;
+    }
+
+    Mat image_select = image.clone();
+    putText(image_select, "Select initial bounding box you want to track.", Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
+    putText(image_select, "And Press the ENTER key.", Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
+
+    Rect selectRect = selectROI(winName, image_select);
+    std::cout << "ROI=" << selectRect << std::endl;
+
+    tracker->init(image, selectRect);
+
+    TickMeter tickMeter;
+
+    for (int count = 0; ; ++count)
+    {
+        cap >> image;
+        if (image.empty())
+        {
+            std::cerr << "Can't capture frame " << count << ". End of video stream?" << std::endl;
+            break;
+        }
+
+        Rect rect;
+
+        tickMeter.start();
+        bool ok = tracker->update(image, rect);
+        tickMeter.stop();
+
+        float score = tracker->getTrackingScore();
+
+        std::cout << "frame " << count <<
+            ": predicted score=" << score <<
+            "  rect=" << rect <<
+            "  time=" << tickMeter.getTimeMilli() << "ms" <<
+            std::endl;
+
+        Mat render_image = image.clone();
+
+        if (ok)
+        {
+            rectangle(render_image, rect, Scalar(0, 255, 0), 2);
+
+            std::string timeLabel = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
+            std::string scoreLabel = format("Score: %f", score);
+            putText(render_image, timeLabel, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
+            putText(render_image, scoreLabel, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
+        }
+
+        imshow(winName, render_image);
+
+        tickMeter.reset();
+
+        int c = waitKey(1);
+        if (c == 27 /*ESC*/)
+            break;
+    }
+
+    std::cout << "Exit" << std::endl;
+    return 0;
+}
+
+
+int main(int argc, char **argv)
+{
+    try
+    {
+        return run(argc, argv);
+    }
+    catch (const std::exception& e)
+    {
+        std::cerr << "FATAL: C++ exception: " << e.what() << std::endl;
+        return 1;
+    }
+}
diff --git a/samples/dnn/yolo_detector.cpp b/samples/dnn/yolo_detector.cpp
new file mode 100644
index 000000000000..b439b0d4bc45
--- /dev/null
+++ b/samples/dnn/yolo_detector.cpp
@@ -0,0 +1,370 @@
+/**
+ * @file yolo_detector.cpp
+ * @brief Yolo Object Detection Sample
+ * @author OpenCV team
+ */
+
+//![includes]
+#include <opencv2/dnn.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <fstream>
+#include <sstream>
+#include "iostream"
+#include "common.hpp"
+#include <opencv2/highgui.hpp>
+//![includes]
+
+using namespace cv;
+using namespace cv::dnn;
+
+void getClasses(std::string classesFile);
+void drawPrediction(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);
+void yoloPostProcessing(
+    std::vector<Mat>& outs,
+    std::vector<int>& keep_classIds,
+    std::vector<float>& keep_confidences,
+    std::vector<Rect2d>& keep_boxes,
+    float conf_threshold,
+    float iou_threshold,
+    const std::string& test_name
+);
+
+std::vector<std::string> classes;
+
+
+std::string keys =
+    "{ help  h     |   | Print help message. }"
+    "{ device      | 0 | camera device number. }"
+    "{ model       | onnx/models/yolox_s_inf_decoder.onnx | Default model. }"
+    "{ yolo        | yolox | yolo model version. }"
+    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
+    "{ classes     | | Optional path to a text file with names of classes to label detected objects. }"
+    "{ thr         | .5 | Confidence threshold. }"
+    "{ nms         | .4 | Non-maximum suppression threshold. }"
+    "{ mean        | 0.0 | Normalization constant. }"
+    "{ scale       | 1.0 | Preprocess input image by multiplying on a scale factor. }"
+    "{ width       | 640 | Preprocess input image by resizing to a specific width. }"
+    "{ height      | 640 | Preprocess input image by resizing to a specific height. }"
+    "{ rgb         | 1 | Indicate that model works with RGB input images instead BGR ones. }"
+    "{ padvalue    | 114.0 | padding value. }"
+    "{ paddingmode | 2 | Choose one of computation backends: "
+                         "0: resize to required input size without extra processing, "
+                         "1: Image will be cropped after resize, "
+                         "2: Resize image to the desired size while preserving the aspect ratio of original image }"
+    "{ backend     |  0 | Choose one of computation backends: "
+                         "0: automatically (by default), "
+                         "1: Halide language (http://halide-lang.org/), "
+                         "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
+                         "3: OpenCV implementation, "
+                         "4: VKCOM, "
+                         "5: CUDA }"
+    "{ target      | 0 | Choose one of target computation devices: "
+                         "0: CPU target (by default), "
+                         "1: OpenCL, "
+                         "2: OpenCL fp16 (half-float precision), "
+                         "3: VPU, "
+                         "4: Vulkan, "
+                         "6: CUDA, "
+                         "7: CUDA fp16 (half-float preprocess) }"
+    "{ async       | 0 | Number of asynchronous forwards at the same time. "
+                        "Choose 0 for synchronous mode }";
+
+void getClasses(std::string classesFile)
+{
+    std::ifstream ifs(classesFile.c_str());
+    if (!ifs.is_open())
+        CV_Error(Error::StsError, "File " + classesFile  + " not found");
+    std::string line;
+    while (std::getline(ifs, line))
+        classes.push_back(line);
+}
+
+void drawPrediction(int classId, float conf, int left, int top, int right, int bottom, Mat& frame)
+{
+    rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));
+
+    std::string label = format("%.2f", conf);
+    if (!classes.empty())
+    {
+        CV_Assert(classId < (int)classes.size());
+        label = classes[classId] + ": " + label;
+    }
+
+    int baseLine;
+    Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+    top = max(top, labelSize.height);
+    rectangle(frame, Point(left, top - labelSize.height),
+              Point(left + labelSize.width, top + baseLine), Scalar::all(255), FILLED);
+    putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar());
+}
+
+void yoloPostProcessing(
+    std::vector<Mat>& outs,
+    std::vector<int>& keep_classIds,
+    std::vector<float>& keep_confidences,
+    std::vector<Rect2d>& keep_boxes,
+    float conf_threshold,
+    float iou_threshold,
+    const std::string& test_name)
+{
+    // Retrieve
+    std::vector<int> classIds;
+    std::vector<float> confidences;
+    std::vector<Rect2d> boxes;
+
+    if (test_name == "yolov8")
+    {
+        cv::transposeND(outs[0], {0, 2, 1}, outs[0]);
+    }
+
+    if (test_name == "yolonas")
+    {
+        // outs contains 2 elemets of shape [1, 8400, 80] and [1, 8400, 4]. Concat them to get [1, 8400, 84]
+        Mat concat_out;
+        // squeeze the first dimension
+        outs[0] = outs[0].reshape(1, outs[0].size[1]);
+        outs[1] = outs[1].reshape(1, outs[1].size[1]);
+        cv::hconcat(outs[1], outs[0], concat_out);
+        outs[0] = concat_out;
+        // remove the second element
+        outs.pop_back();
+        // unsqueeze the first dimension
+        outs[0] = outs[0].reshape(0, std::vector<int>{1, 8400, 84});
+    }
+
+    for (auto preds : outs)
+    {
+        preds = preds.reshape(1, preds.size[1]); // [1, 8400, 85] -> [8400, 85]
+        for (int i = 0; i < preds.rows; ++i)
+        {
+            // filter out non object
+            float obj_conf = (test_name == "yolov8" || test_name == "yolonas") ? 1.0f : preds.at<float>(i, 4) ;
+            if (obj_conf < conf_threshold)
+                continue;
+
+            Mat scores = preds.row(i).colRange((test_name == "yolov8" || test_name == "yolonas") ? 4 : 5, preds.cols);
+            double conf;
+            Point maxLoc;
+            minMaxLoc(scores, 0, &conf, 0, &maxLoc);
+
+            conf = (test_name == "yolov8" || test_name == "yolonas") ? conf : conf * obj_conf;
+            if (conf < conf_threshold)
+                continue;
+
+            // get bbox coords
+            float* det = preds.ptr<float>(i);
+            double cx = det[0];
+            double cy = det[1];
+            double w = det[2];
+            double h = det[3];
+
+            // [x1, y1, x2, y2]
+            if (test_name == "yolonas"){
+                boxes.push_back(Rect2d(cx, cy, w, h));
+            } else {
+                boxes.push_back(Rect2d(cx - 0.5 * w, cy - 0.5 * h,
+                                        cx + 0.5 * w, cy + 0.5 * h));
+            }
+            classIds.push_back(maxLoc.x);
+            confidences.push_back(static_cast<float>(conf));
+        }
+    }
+
+    // NMS
+    std::vector<int> keep_idx;
+    NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, keep_idx);
+
+    for (auto i : keep_idx)
+    {
+        keep_classIds.push_back(classIds[i]);
+        keep_confidences.push_back(confidences[i]);
+        keep_boxes.push_back(boxes[i]);
+    }
+}
+
+/**
+ * @function main
+ * @brief Main function
+ */
+int main(int argc, char** argv)
+{
+    CommandLineParser parser(argc, argv, keys);
+    parser.about("Use this script to run object detection deep learning networks using OpenCV.");
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    CV_Assert(parser.has("model"));
+    CV_Assert(parser.has("yolo"));
+    // if model is default, use findFile to get the full path otherwise use the given path
+    std::string weightPath = findFile(parser.get<String>("model"));
+    std::string yolo_model = parser.get<String>("yolo");
+
+    float confThreshold = parser.get<float>("thr");
+    float nmsThreshold = parser.get<float>("nms");
+    //![preprocess_params]
+    float paddingValue = parser.get<float>("padvalue");
+    bool swapRB = parser.get<bool>("rgb");
+    int inpWidth = parser.get<int>("width");
+    int inpHeight = parser.get<int>("height");
+    Scalar scale = parser.get<float>("scale");
+    Scalar mean = parser.get<Scalar>("mean");
+    ImagePaddingMode paddingMode = static_cast<ImagePaddingMode>(parser.get<int>("paddingmode"));
+    //![preprocess_params]
+
+    // check if yolo model is valid
+    if (yolo_model != "yolov5" && yolo_model != "yolov6"
+        && yolo_model != "yolov7" && yolo_model != "yolov8"
+        && yolo_model != "yolox" && yolo_model != "yolonas")
+        CV_Error(Error::StsError, "Invalid yolo model: " + yolo_model);
+
+    // get classes
+    if (parser.has("classes"))
+    {
+        getClasses(findFile(parser.get<String>("classes")));
+    }
+
+    // load model
+    //![read_net]
+    Net net = readNet(weightPath);
+    int backend = parser.get<int>("backend");
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(parser.get<int>("target"));
+    //![read_net]
+
+    VideoCapture cap;
+    Mat img;
+    bool isImage = false;
+    bool isCamera = false;
+
+    // Check if input is given
+    if (parser.has("input"))
+    {
+        String input = parser.get<String>("input");
+        // Check if the input is an image
+        if (input.find(".jpg") != String::npos || input.find(".png") != String::npos)
+        {
+            img = imread(findFile(input));
+            if (img.empty())
+            {
+                CV_Error(Error::StsError, "Cannot read image file: " + input);
+            }
+            isImage = true;
+        }
+        else
+        {
+            cap.open(input);
+            if (!cap.isOpened())
+            {
+                CV_Error(Error::StsError, "Cannot open video " + input);
+            }
+            isCamera = true;
+        }
+    }
+    else
+    {
+        int cameraIndex = parser.get<int>("device");
+        cap.open(cameraIndex);
+        if (!cap.isOpened())
+        {
+            CV_Error(Error::StsError, cv::format("Cannot open camera #%d", cameraIndex));
+        }
+        isCamera = true;
+    }
+
+    // image pre-processing
+    //![preprocess_call]
+    Size size(inpWidth, inpHeight);
+    Image2BlobParams imgParams(
+        scale,
+        size,
+        mean,
+        swapRB,
+        CV_32F,
+        DNN_LAYOUT_NCHW,
+        paddingMode,
+        paddingValue);
+
+    // rescale boxes back to original image
+    Image2BlobParams paramNet;
+            paramNet.scalefactor = scale;
+            paramNet.size = size;
+            paramNet.mean = mean;
+            paramNet.swapRB = swapRB;
+            paramNet.paddingmode = paddingMode;
+    //![preprocess_call]
+
+    //![forward_buffers]
+    std::vector<Mat> outs;
+    std::vector<int> keep_classIds;
+    std::vector<float> keep_confidences;
+    std::vector<Rect2d> keep_boxes;
+    std::vector<Rect> boxes;
+    //![forward_buffers]
+
+    Mat inp;
+    while (waitKey(1) < 0)
+    {
+
+        if (isCamera)
+            cap >> img;
+        if (img.empty())
+        {
+            std::cout << "Empty frame" << std::endl;
+            waitKey();
+            break;
+        }
+        //![preprocess_call_func]
+        inp = blobFromImageWithParams(img, imgParams);
+        //![preprocess_call_func]
+
+        //![forward]
+        net.setInput(inp);
+        net.forward(outs, net.getUnconnectedOutLayersNames());
+        //![forward]
+
+        //![postprocess]
+        yoloPostProcessing(
+            outs, keep_classIds, keep_confidences, keep_boxes,
+            confThreshold, nmsThreshold,
+            yolo_model);
+        //![postprocess]
+
+        // covert Rect2d to Rect
+        //![draw_boxes]
+        for (auto box : keep_boxes)
+        {
+            boxes.push_back(Rect(cvFloor(box.x), cvFloor(box.y), cvFloor(box.width - box.x), cvFloor(box.height - box.y)));
+        }
+
+        paramNet.blobRectsToImageRects(boxes, boxes, img.size());
+
+        for (size_t idx = 0; idx < boxes.size(); ++idx)
+        {
+            Rect box = boxes[idx];
+            drawPrediction(keep_classIds[idx], keep_confidences[idx], box.x, box.y,
+                    box.width + box.x, box.height + box.y, img);
+        }
+
+        const std::string kWinName = "Yolo Object Detector";
+        namedWindow(kWinName, WINDOW_NORMAL);
+        imshow(kWinName, img);
+        //![draw_boxes]
+
+        outs.clear();
+        keep_classIds.clear();
+        keep_confidences.clear();
+        keep_boxes.clear();
+        boxes.clear();
+
+        if (isImage)
+        {
+            waitKey();
+            break;
+        }
+    }
+}
diff --git a/samples/python/tracker.py b/samples/python/tracker.py
index 42b17018c955..f520454ba54e 100644
--- a/samples/python/tracker.py
+++ b/samples/python/tracker.py
@@ -22,6 +22,7 @@
                     [--dasiamrpn_backend DASIAMRPN_BACKEND]
                     [--dasiamrpn_target DASIAMRPN_TARGET]
                     [--nanotrack_backbone NANOTRACK_BACKEND] [--nanotrack_headneck NANOTRACK_TARGET]
+                    [--vittrack_net VITTRACK_MODEL]
 '''
 
 # Python 2/3 compatibility
@@ -61,6 +62,10 @@ def createTracker(self):
             params.backbone = args.nanotrack_backbone
             params.neckhead = args.nanotrack_headneck
             tracker = cv.TrackerNano_create(params)
+        elif self.trackerAlgorithm == 'vittrack':
+            params = cv.TrackerVit_Params()
+            params.net = args.vittrack_net
+            tracker = cv.TrackerVit_create(params)
         else:
             sys.exit("Tracker {} is not recognized. Please use one of three available: mil, goturn, dasiamrpn, nanotrack.".format(self.trackerAlgorithm))
         return tracker
@@ -126,7 +131,7 @@ def run(self):
     print(__doc__)
     parser = argparse.ArgumentParser(description="Run tracker")
     parser.add_argument("--input", type=str, default="vtest.avi", help="Path to video source")
-    parser.add_argument("--tracker_algo", type=str, default="nanotrack", help="One of available tracking algorithms: mil, goturn, dasiamrpn, nanotrack")
+    parser.add_argument("--tracker_algo", type=str, default="nanotrack", help="One of available tracking algorithms: mil, goturn, dasiamrpn, nanotrack, vittrack")
     parser.add_argument("--goturn", type=str, default="goturn.prototxt", help="Path to GOTURN architecture")
     parser.add_argument("--goturn_model", type=str, default="goturn.caffemodel", help="Path to GOTERN model")
     parser.add_argument("--dasiamrpn_net", type=str, default="dasiamrpn_model.onnx", help="Path to onnx model of DaSiamRPN net")
@@ -134,6 +139,7 @@ def run(self):
     parser.add_argument("--dasiamrpn_kernel_cls1", type=str, default="dasiamrpn_kernel_cls1.onnx", help="Path to onnx model of DaSiamRPN kernel_cls1")
     parser.add_argument("--nanotrack_backbone", type=str, default="nanotrack_backbone_sim.onnx", help="Path to onnx model of NanoTrack backBone")
     parser.add_argument("--nanotrack_headneck", type=str, default="nanotrack_head_sim.onnx", help="Path to onnx model of NanoTrack headNeck")
+    parser.add_argument("--vittrack_net", type=str, default="vitTracker.onnx", help="Path to onnx model of  vittrack")
 
     args = parser.parse_args()
     App(args).run()
diff --git a/samples/python/tst_scene_render.py b/samples/python/tst_scene_render.py
index 9d09ea7b9e32..c3eb69ef9caf 100644
--- a/samples/python/tst_scene_render.py
+++ b/samples/python/tst_scene_render.py
@@ -25,7 +25,7 @@ def __init__(self, bgImg = None, fgImg = None,
         if bgImg is not None:
             self.sceneBg = bgImg.copy()
         else:
-            self.sceneBg = np.zeros(defaultSize, defaultSize, np.uint8)
+            self.sceneBg = np.zeros((defaultSize, defaultSize,3), np.uint8)
 
         self.w = self.sceneBg.shape[0]
         self.h = self.sceneBg.shape[1]
@@ -85,7 +85,7 @@ def getNextFrame(self):
             img[self.currentCenter[0]:self.currentCenter[0]+self.foreground.shape[0],
              self.currentCenter[1]:self.currentCenter[1]+self.foreground.shape[1]] = self.foreground
         else:
-            self.currentRect = self.initialRect + np.int( 30*cos(self.time*self.speed) + 50*sin(self.time*self.speed))
+            self.currentRect = self.initialRect + int( 30*cos(self.time*self.speed) + 50*sin(self.time*self.speed))
             if self.deformation:
                 self.currentRect[1:3] += int(self.h/20*cos(self.time))
             cv.fillConvexPoly(img, self.currentRect, (0, 0, 255))